2113 files changed, 379723 insertions, 77026 deletions
diff --git a/libavcodec/.gitignore b/libavcodec/.gitignore
index 77a2ab1..28814f7 100644
--- a/libavcodec/.gitignore
+++ b/libavcodec/.gitignore
@@ -2,3 +2,5 @@
 /*_tables.c
 /*_tables.h
 /bsf_list.c
+/codec_list.c
+/parser_list.c
diff --git a/libavcodec/012v.c b/libavcodec/012v.c
new file mode 100644
index 0000000..b5a4066
--- /dev/null
+++ b/libavcodec/012v.c
@@ -0,0 +1,155 @@
+/*
+ * 012v decoder
+ *
+ * Copyright (C) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int zero12v_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt             = AV_PIX_FMT_YUV422P16;
+    avctx->bits_per_raw_sample = 10;
+
+    if (avctx->codec_tag == MKTAG('a', '1', '2', 'v'))
+        avpriv_request_sample(avctx, "transparency");
+
+    return 0;
+}
+
+static int zero12v_decode_frame(AVCodecContext *avctx, void *data,
+                                int *got_frame, AVPacket *avpkt)
+{
+    int line, ret;
+    const int width = avctx->width;
+    AVFrame *pic = data;
+    uint16_t *y, *u, *v;
+    const uint8_t *line_end, *src = avpkt->data;
+    int stride = avctx->width * 8 / 3;
+
+    if (width <= 1 || avctx->height <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions %dx%d not supported.\n", width, avctx->height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (   avctx->codec_tag == MKTAG('0', '1', '2', 'v')
+        && avpkt->size % avctx->height == 0
+        && avpkt->size / avctx->height * 3 >= width * 8)
+        stride = avpkt->size / avctx->height;
+
+    if (avpkt->size < avctx->height * stride) {
+        av_log(avctx, AV_LOG_ERROR, "Packet too small: %d instead of %d\n",
+               avpkt->size, avctx->height * stride);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    pic->key_frame = 1;
+
+    line_end = avpkt->data + stride;
+    for (line = 0; line < avctx->height; line++) {
+        uint16_t y_temp[6] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+        uint16_t u_temp[3] = {0x8000, 0x8000, 0x8000};
+        uint16_t v_temp[3] = {0x8000, 0x8000, 0x8000};
+        int x;
+        y = (uint16_t *)(pic->data[0] + line * pic->linesize[0]);
+        u = (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
+        v = (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
+
+        for (x = 0; x < width; x += 6) {
+            uint32_t t;
+
+            if (width - x < 6 || line_end - src < 16) {
+                y = y_temp;
+                u = u_temp;
+                v = v_temp;
+            }
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *u++ = t <<  6 & 0xFFC0;
+            *y++ = t >>  4 & 0xFFC0;
+            *v++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *y++ = t <<  6 & 0xFFC0;
+            *u++ = t >>  4 & 0xFFC0;
+            *y++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *v++ = t <<  6 & 0xFFC0;
+            *y++ = t >>  4 & 0xFFC0;
+            *u++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *y++ = t <<  6 & 0xFFC0;
+            *v++ = t >>  4 & 0xFFC0;
+            *y++ = t >> 14 & 0xFFC0;
+
+            if (width - x < 6)
+                break;
+        }
+
+        if (x < width) {
+            y = x   + (uint16_t *)(pic->data[0] + line * pic->linesize[0]);
+            u = x/2 + (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
+            v = x/2 + (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
+            memcpy(y, y_temp, sizeof(*y) * (width - x));
+            memcpy(u, u_temp, sizeof(*u) * (width - x + 1) / 2);
+            memcpy(v, v_temp, sizeof(*v) * (width - x + 1) / 2);
+        }
+
+        line_end += stride;
+        src = line_end - stride;
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_zero12v_decoder = {
+    .name           = "012v",
+    .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_012V,
+    .init           = zero12v_decode_init,
+    .decode         = zero12v_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c
index 4a9a14a..2b88c89 100644
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -2,20 +2,20 @@
  * 4XM codec
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,17 +26,17 @@
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/frame.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
 #include "bswapdsp.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
-#include "vlc.h"
+
 
 #define BLOCK_TYPE_VLC_BITS 5
 #define ACDC_VLC_BITS 9
@@ -138,14 +138,14 @@ typedef struct FourXContext {
     BswapDSPContext bbdsp;
     uint16_t *frame_buffer;
     uint16_t *last_frame_buffer;
-    BitstreamContext pre_bc;    // ac/dc prefix
-    BitstreamContext bc;
+    GetBitContext pre_gb;          ///< ac/dc prefix
+    GetBitContext gb;
     GetByteContext g;
     GetByteContext g2;
     int mv[256];
     VLC pre_vlc;
     int last_dc;
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(32, int16_t, block)[6][64];
     void *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     int version;
@@ -291,7 +291,7 @@ static void init_mv(FourXContext *f, int linesize)
     }
 #endif
 
-static inline void mcdc(uint16_t *dst, uint16_t *src, int log2w,
+static inline void mcdc(uint16_t *dst, const uint16_t *src, int log2w,
                         int h, int stride, int scale, unsigned dc)
 {
     int i;
@@ -335,36 +335,32 @@ static inline void mcdc(uint16_t *dst, uint16_t *src, int log2w,
         }
         break;
     default:
-        break;
+        av_assert0(0);
     }
 }
 
-static int decode_p_block(FourXContext *f, uint16_t *dst, uint16_t *src,
+static int decode_p_block(FourXContext *f, uint16_t *dst, const uint16_t *src,
                           int log2w, int log2h, int stride)
 {
     int index, h, code, ret, scale = 1;
     uint16_t *start, *end;
     unsigned dc = 0;
 
-    if (log2h < 0 || log2w < 0)
-        return AVERROR_INVALIDDATA;
+    av_assert0(log2w >= 0 && log2h >= 0);
 
     index = size2index[log2h][log2w];
-    if (index < 0)
-        return AVERROR_INVALIDDATA;
+    av_assert0(index >= 0);
 
     h     = 1 << log2h;
-    code  = bitstream_read_vlc(&f->bc, block_type_vlc[1 - (f->version > 1)][index].table,
-                               BLOCK_TYPE_VLC_BITS, 1);
-    if (code < 0 || code > 6)
-        return AVERROR_INVALIDDATA;
+    code  = get_vlc2(&f->gb, block_type_vlc[1 - (f->version > 1)][index].table,
+                     BLOCK_TYPE_VLC_BITS, 1);
+    av_assert0(code >= 0 && code <= 6);
 
     start = f->last_frame_buffer;
     end   = start + stride * (f->avctx->height - h + 1) - (1 << log2w);
 
     if (code == 1) {
-        if (--log2h < 0)
-            return AVERROR_INVALIDDATA;
+        log2h--;
         if ((ret = decode_p_block(f, dst, src, log2w, log2h, stride)) < 0)
             return ret;
         return decode_p_block(f, dst + (stride << log2h),
@@ -378,24 +374,42 @@ static int decode_p_block(FourXContext *f, uint16_t *dst, uint16_t *src,
                               src + (1 << log2w),
                               log2w, log2h, stride);
     } else if (code == 6) {
+        if (bytestream2_get_bytes_left(&f->g2) < 4) {
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         if (log2w) {
-            dst[0]      = bytestream2_get_le16(&f->g2);
-            dst[1]      = bytestream2_get_le16(&f->g2);
+            dst[0]      = bytestream2_get_le16u(&f->g2);
+            dst[1]      = bytestream2_get_le16u(&f->g2);
         } else {
-            dst[0]      = bytestream2_get_le16(&f->g2);
-            dst[stride] = bytestream2_get_le16(&f->g2);
+            dst[0]      = bytestream2_get_le16u(&f->g2);
+            dst[stride] = bytestream2_get_le16u(&f->g2);
         }
         return 0;
     }
 
+    if ((code&3)==0 && bytestream2_get_bytes_left(&f->g) < 1) {
+        av_log(f->avctx, AV_LOG_ERROR, "bytestream overread\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     if (code == 0) {
         src  += f->mv[bytestream2_get_byte(&f->g)];
     } else if (code == 3 && f->version >= 2) {
         return 0;
     } else if (code == 4) {
         src  += f->mv[bytestream2_get_byte(&f->g)];
+        if (bytestream2_get_bytes_left(&f->g2) < 2){
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         dc    = bytestream2_get_le16(&f->g2);
     } else if (code == 5) {
+        if (bytestream2_get_bytes_left(&f->g2) < 2){
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert0(start <= src && src <= end);
         scale = 0;
         dc    = bytestream2_get_le16(&f->g2);
     }
@@ -424,9 +438,9 @@ static int decode_p_frame(FourXContext *f, const uint8_t *buf, int length)
     src = f->last_frame_buffer;
 
     if (f->version > 1) {
-        if (length < 20)
-            return AVERROR_INVALIDDATA;
         extra           = 20;
+        if (length < extra)
+            return AVERROR_INVALIDDATA;
         bitstream_size  = AV_RL32(buf + 8);
         wordstream_size = AV_RL32(buf + 12);
         bytestream_size = AV_RL32(buf + 16);
@@ -437,25 +451,22 @@ static int decode_p_frame(FourXContext *f, const uint8_t *buf, int length)
         bytestream_size = FFMAX(length - bitstream_size - wordstream_size, 0);
     }
 
-    if (bitstream_size + bytestream_size + wordstream_size + extra != length
-        || bitstream_size  > (1 << 26)
-        || bytestream_size > (1 << 26)
-        || wordstream_size > (1 << 26)) {
-        av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n",
-               bitstream_size, bytestream_size, wordstream_size,
-               bitstream_size + bytestream_size + wordstream_size - length);
+    if (bitstream_size > length || bitstream_size >= INT_MAX/8 ||
+        bytestream_size > length - bitstream_size ||
+        wordstream_size > length - bytestream_size - bitstream_size ||
+        extra > length - bytestream_size - bitstream_size - wordstream_size) {
+        av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n", bitstream_size, bytestream_size, wordstream_size,
+        bitstream_size+ bytestream_size+ wordstream_size - length);
         return AVERROR_INVALIDDATA;
     }
 
-    av_fast_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
-                   bitstream_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
+                          bitstream_size);
     if (!f->bitstream_buffer)
         return AVERROR(ENOMEM);
     f->bbdsp.bswap_buf(f->bitstream_buffer, (const uint32_t *) (buf + extra),
                        bitstream_size / 4);
-    memset((uint8_t*)f->bitstream_buffer + bitstream_size,
-           0, AV_INPUT_BUFFER_PADDING_SIZE);
-    bitstream_init8(&f->bc, f->bitstream_buffer, bitstream_size);
+    init_get_bits(&f->gb, f->bitstream_buffer, 8 * bitstream_size);
 
     wordstream_offset = extra + bitstream_size;
     bytestream_offset = extra + bitstream_size + wordstream_size;
@@ -485,20 +496,27 @@ static int decode_i_block(FourXContext *f, int16_t *block)
 {
     int code, i, j, level, val;
 
+    if (get_bits_left(&f->gb) < 2){
+        av_log(f->avctx, AV_LOG_ERROR, "%d bits left before decode_i_block()\n", get_bits_left(&f->gb));
+        return AVERROR_INVALIDDATA;
+    }
+
     /* DC coef */
-    val = bitstream_read_vlc(&f->pre_bc, f->pre_vlc.table, ACDC_VLC_BITS, 3);
-    if (val >> 4)
+    val = get_vlc2(&f->pre_gb, f->pre_vlc.table, ACDC_VLC_BITS, 3);
+    if (val >> 4) {
         av_log(f->avctx, AV_LOG_ERROR, "error dc run != 0\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     if (val)
-        val = bitstream_read_xbits(&f->bc, val);
+        val = get_xbits(&f->gb, val);
 
     val        = val * dequant_table[0] + f->last_dc;
     f->last_dc = block[0] = val;
     /* AC coefs */
     i = 1;
     for (;;) {
-        code = bitstream_read_vlc(&f->pre_bc, f->pre_vlc.table, ACDC_VLC_BITS, 3);
+        code = get_vlc2(&f->pre_gb, f->pre_vlc.table, ACDC_VLC_BITS, 3);
 
         /* EOB */
         if (code == 0)
@@ -506,10 +524,15 @@ static int decode_i_block(FourXContext *f, int16_t *block)
         if (code == 0xf0) {
             i += 16;
         } else {
-            level = bitstream_read_xbits(&f->bc, code & 0xf);
+            if (code & 0xf) {
+                level = get_xbits(&f->gb, code & 0xf);
+            } else {
+                av_log(f->avctx, AV_LOG_ERROR, "0 coeff\n");
+                return AVERROR_INVALIDDATA;
+            }
             i    += code >> 4;
             if (i >= 64) {
-                av_log(f->avctx, AV_LOG_ERROR, "run %d oveflow\n", i);
+                av_log(f->avctx, AV_LOG_ERROR, "run %d overflow\n", i);
                 return 0;
             }
 
@@ -586,7 +609,7 @@ static int decode_i_mb(FourXContext *f)
 
 static const uint8_t *read_huffman_tables(FourXContext *f,
                                           const uint8_t * const buf,
-                                          int len)
+                                          int buf_size)
 {
     int frequency[512] = { 0 };
     uint8_t flag[512];
@@ -595,6 +618,7 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     int bits_tab[257];
     int start, end;
     const uint8_t *ptr = buf;
+    const uint8_t *ptr_end = buf + buf_size;
     int j;
 
     memset(up, -1, sizeof(up));
@@ -604,10 +628,10 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     for (;;) {
         int i;
 
-        len -= end - start + 1;
-
-        if (end < start || len < 0)
+        if (ptr_end - ptr < FFMAX(end - start + 1, 0) + 1) {
+            av_log(f->avctx, AV_LOG_ERROR, "invalid data in read_huffman_tables\n");
             return NULL;
+        }
 
         for (i = start; i <= end; i++)
             frequency[i] = *ptr++;
@@ -615,9 +639,6 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
         if (start == 0)
             break;
 
-        if (--len < 0)
-            return NULL;
-
         end = *ptr++;
     }
     frequency[256] = 1;
@@ -625,6 +646,11 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     while ((ptr - buf) & 3)
         ptr++; // 4byte align
 
+    if (ptr > ptr_end) {
+        av_log(f->avctx, AV_LOG_ERROR, "ptr overflow in read_huffman_tables\n");
+        return NULL;
+    }
+
     for (j = 257; j < 512; j++) {
         int min_freq[2] = { 256 * 256, 256 * 256 };
         int smallest[2] = { 0, 0 };
@@ -693,6 +719,7 @@ static int decode_i2_frame(FourXContext *f, const uint8_t *buf, int length)
     const int height = f->avctx->height;
     const int mbs    = (FFALIGN(width, 16) >> 4) * (FFALIGN(height, 16) >> 4);
     uint16_t *dst    = f->frame_buffer;
+    const uint8_t *buf_end = buf + length;
     GetByteContext g3;
 
     if (length < mbs * 8) {
@@ -704,6 +731,8 @@ static int decode_i2_frame(FourXContext *f, const uint8_t *buf, int length)
     for (y = 0; y < height; y += 16) {
         for (x = 0; x < width; x += 16) {
             unsigned int color[4] = { 0 }, bits;
+            if (buf_end - buf < 8)
+                return AVERROR_INVALIDDATA;
             // warning following is purely guessed ...
             color[0] = bytestream2_get_le16u(&g3);
             color[1] = bytestream2_get_le16u(&g3);
@@ -737,7 +766,6 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
     const int width  = f->avctx->width;
     const int height = f->avctx->height;
     const unsigned int bitstream_size = AV_RL32(buf);
-    int token_count av_unused;
     unsigned int prestream_size;
     const uint8_t *prestream;
 
@@ -749,7 +777,6 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         return AVERROR_INVALIDDATA;
     }
 
-    token_count    =     AV_RL32(buf + bitstream_size + 8);
     prestream_size = 4 * AV_RL32(buf + bitstream_size + 4);
     prestream      =             buf + bitstream_size + 12;
 
@@ -766,19 +793,19 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         return AVERROR_INVALIDDATA;
     }
 
-    bitstream_init8(&f->bc, buf + 4, bitstream_size);
+    av_assert0(prestream <= buf + length);
+
+    init_get_bits(&f->gb, buf + 4, 8 * bitstream_size);
 
     prestream_size = length + buf - prestream;
 
-    av_fast_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
-                   prestream_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
+                          prestream_size);
     if (!f->bitstream_buffer)
         return AVERROR(ENOMEM);
     f->bbdsp.bswap_buf(f->bitstream_buffer, (const uint32_t *) prestream,
                        prestream_size / 4);
-    memset((uint8_t*)f->bitstream_buffer + prestream_size,
-           0, AV_INPUT_BUFFER_PADDING_SIZE);
-    bitstream_init8(&f->pre_bc, f->bitstream_buffer, prestream_size);
+    init_get_bits(&f->pre_gb, f->bitstream_buffer, 8 * prestream_size);
 
     f->last_dc = 0 * 128 * 8 * 8;
 
@@ -791,7 +818,7 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         }
     }
 
-    if (bitstream_read_vlc(&f->pre_bc, f->pre_vlc.table, ACDC_VLC_BITS, 3) != 256)
+    if (get_vlc2(&f->pre_gb, f->pre_vlc.table, ACDC_VLC_BITS, 3) != 256)
         av_log(f->avctx, AV_LOG_ERROR, "end mismatch\n");
 
     return 0;
@@ -809,11 +836,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (buf_size < 20)
         return AVERROR_INVALIDDATA;
 
-    if (avctx->width % 16 || avctx->height % 16) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Dimensions non-multiple of 16 are invalid.\n");
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(avctx->width % 16 == 0 && avctx->height % 16 == 0);
 
     if (buf_size < AV_RL32(buf + 4) + 8) {
         av_log(f->avctx, AV_LOG_ERROR, "size mismatch %d %"PRIu32"\n",
@@ -829,9 +852,19 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         const int data_size  = buf_size - 20;
         CFrameBuffer *cfrm;
 
+        if (f->version <= 1) {
+            av_log(f->avctx, AV_LOG_ERROR, "cfrm in version %d\n", f->version);
+            return AVERROR_INVALIDDATA;
+        }
+
         id         = AV_RL32(buf + 12);
         whole_size = AV_RL32(buf + 16);
 
+        if (data_size < 0 || whole_size < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "sizes invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < CFRAME_BUFFER_COUNT; i++)
             if (f->cfrm[i].id && f->cfrm[i].id < avctx->frame_number)
                 av_log(f->avctx, AV_LOG_ERROR, "lost c frame %d\n",
@@ -850,11 +883,14 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         }
         cfrm = &f->cfrm[i];
 
+        if (data_size > UINT_MAX -  cfrm->size - AV_INPUT_BUFFER_PADDING_SIZE)
+            return AVERROR_INVALIDDATA;
+
         cfrm->data = av_fast_realloc(cfrm->data, &cfrm->allocated_size,
                                      cfrm->size + data_size + AV_INPUT_BUFFER_PADDING_SIZE);
         // explicit check needed as memcpy below might not catch a NULL
         if (!cfrm->data) {
-            av_log(f->avctx, AV_LOG_ERROR, "realloc failure");
+            av_log(f->avctx, AV_LOG_ERROR, "realloc failure\n");
             return AVERROR(ENOMEM);
         }
 
@@ -881,24 +917,27 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         frame_size = buf_size - 12;
     }
 
-
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
         return ret;
-    }
 
     if (frame_4cc == AV_RL32("ifr2")) {
         picture->pict_type = AV_PICTURE_TYPE_I;
-        if ((ret = decode_i2_frame(f, buf - 4, frame_size + 4)) < 0)
+        if ((ret = decode_i2_frame(f, buf - 4, frame_size + 4)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode i2 frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("ifrm")) {
         picture->pict_type = AV_PICTURE_TYPE_I;
-        if ((ret = decode_i_frame(f, buf, frame_size)) < 0)
+        if ((ret = decode_i_frame(f, buf, frame_size)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode i frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("pfrm") || frame_4cc == AV_RL32("pfr2")) {
         picture->pict_type = AV_PICTURE_TYPE_P;
-        if ((ret = decode_p_frame(f, buf, frame_size)) < 0)
+        if ((ret = decode_p_frame(f, buf, frame_size)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode p frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("snd_")) {
         av_log(avctx, AV_LOG_ERROR, "ignoring snd_ chunk length:%d\n",
                buf_size);
@@ -948,6 +987,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "extradata wrong or missing\n");
         return AVERROR_INVALIDDATA;
     }
+    if((avctx->width % 16) || (avctx->height % 16)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported width/height\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     ret = av_image_check_size(avctx->width, avctx->height, 0, avctx);
     if (ret < 0)
@@ -961,7 +1004,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     f->version = AV_RL32(avctx->extradata) >> 16;
-    ff_blockdsp_init(&f->bdsp);
+    ff_blockdsp_init(&f->bdsp, avctx);
     ff_bswapdsp_init(&f->bbdsp);
     f->avctx = avctx;
     init_vlcs(f);
diff --git a/libavcodec/8bps.c b/libavcodec/8bps.c
index 7ba2b31..aa2318f 100644
--- a/libavcodec/8bps.c
+++ b/libavcodec/8bps.c
@@ -2,20 +2,20 @@
  * Quicktime Planar RGB (8BPS) Video Decoder
  * Copyright (C) 2003 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
  *
  * Supports: PAL8 (RGB 8bpp, paletted)
  *         : BGR24 (RGB 24bpp) (can also output it as RGB32)
- *         : RGB32 (RGB 32bpp, 4th plane is probably alpha and it's ignored)
+ *         : RGB32 (RGB 32bpp, 4th plane is alpha)
  */
 
 #include <stdio.h>
@@ -41,7 +41,7 @@
 
 
 static const enum AVPixelFormat pixfmt_rgb24[] = {
-    AV_PIX_FMT_BGR24, AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE };
+    AV_PIX_FMT_BGR24, AV_PIX_FMT_0RGB32, AV_PIX_FMT_NONE };
 
 typedef struct EightBpsContext {
     AVCodecContext *avctx;
@@ -70,21 +70,15 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     unsigned char *planemap = c->planemap;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     ep = encoded + buf_size;
 
     /* Set data pointer after line lengths */
     dp = encoded + planes * (height << 1);
 
-    /* Ignore alpha plane, don't know what to do with it */
-    if (planes == 4)
-        planes--;
-
-    px_inc = planes + (avctx->pix_fmt == AV_PIX_FMT_RGB32);
+    px_inc = planes + (avctx->pix_fmt == AV_PIX_FMT_0RGB32);
 
     for (p = 0; p < planes; p++) {
         /* Lines length pointer for this plane */
@@ -128,12 +122,15 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (avctx->bits_per_coded_sample <= 8) {
+        int size;
         const uint8_t *pal = av_packet_get_side_data(avpkt,
                                                      AV_PKT_DATA_PALETTE,
-                                                     NULL);
-        if (pal) {
+                                                     &size);
+        if (pal && size == AVPALETTE_SIZE) {
             frame->palette_has_changed = 1;
             memcpy(c->pal, pal, AVPALETTE_SIZE);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
 
         memcpy (frame->data[1], c->pal, AVPALETTE_SIZE);
@@ -179,7 +176,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         c->planemap[0] = HAVE_BIGENDIAN ? 1 : 2; // 1st plane is red
         c->planemap[1] = HAVE_BIGENDIAN ? 2 : 1; // 2nd plane is green
         c->planemap[2] = HAVE_BIGENDIAN ? 3 : 0; // 3rd plane is blue
-        c->planemap[3] = HAVE_BIGENDIAN ? 0 : 3; // 4th plane is alpha???
+        c->planemap[3] = HAVE_BIGENDIAN ? 0 : 3; // 4th plane is alpha
     }
     return 0;
 }
diff --git a/libavcodec/8svx.c b/libavcodec/8svx.c
index fe90b16..edc945c 100644
--- a/libavcodec/8svx.c
+++ b/libavcodec/8svx.c
@@ -1,21 +1,21 @@
 /*
- * 8SVX audio decoder
  * Copyright (C) 2008 Jaikrishnan Menon
+ * Copyright (C) 2011 Stefano Sabatini
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,18 @@
  *
  * supports: fibonacci delta encoding
  *         : exponential encoding
+ *
+ * For more information about the 8SVX format:
+ * http://netghost.narod.ru/gff/vendspec/iff/iff.txt
+ * http://sox.sourceforge.net/AudioFormats-11.html
+ * http://aminet.net/package/mus/misc/wavepak
+ * http://amigan.1emu.net/reg/8SVX.txt
+ *
+ * Samples can be found here:
+ * http://aminet.net/mods/smpl/
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "libavutil/common.h"
@@ -44,18 +54,17 @@ typedef struct EightSvxContext {
     int data_idx;
 } EightSvxContext;
 
-static const int8_t fibonacci[16]   = { -34, -21, -13,  -8, -5, -3, -2, -1,
-                                          0,   1,   2,   3,  5,  8, 13, 21 };
-static const int8_t exponential[16] = { -128, -64, -32, -16, -8, -4, -2, -1,
-                                           0,   1,   2,   4,  8, 16, 32, 64 };
+static const int8_t fibonacci[16]   = { -34,  -21, -13,  -8, -5, -3, -2, -1, 0, 1, 2, 3, 5, 8,  13, 21 };
+static const int8_t exponential[16] = { -128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64 };
 
-#define MAX_FRAME_SIZE 32768
+#define MAX_FRAME_SIZE 2048
 
 /**
  * Delta decode the compressed values in src, and put the resulting
  * decoded samples in dst.
  *
  * @param[in,out] state starting value. it is saved for use in the next call.
+ * @param table delta sequence table
  */
 static void delta_decode(uint8_t *dst, const uint8_t *src, int src_size,
                          uint8_t *state, const int8_t *table)
@@ -73,12 +82,6 @@ static void delta_decode(uint8_t *dst, const uint8_t *src, int src_size,
     *state = val;
 }
 
-static void raw_decode(uint8_t *dst, const int8_t *src, int src_size)
-{
-    while (src_size--)
-        *dst++ = *src++ + 128;
-}
-
 /** decode a frame */
 static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
                                  int *got_frame_ptr, AVPacket *avpkt)
@@ -87,27 +90,23 @@ static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame       = data;
     int buf_size;
     int ch, ret;
-    int is_compr = (avctx->codec_id != AV_CODEC_ID_PCM_S8_PLANAR);
+    int hdr_size = 2;
 
-    /* for the first packet, copy data to buffer */
-    if (avpkt->data) {
-        int hdr_size  = is_compr ? 2 : 0;
-        int chan_size = (avpkt->size - hdr_size * avctx->channels) / avctx->channels;
+    /* decode and interleave the first packet */
+    if (!esc->data[0] && avpkt) {
+        int chan_size = avpkt->size / avctx->channels - hdr_size;
 
-        if (avpkt->size < hdr_size * avctx->channels) {
-            av_log(avctx, AV_LOG_ERROR, "packet size is too small\n");
-            return AVERROR_INVALIDDATA;
+        if (avpkt->size % avctx->channels) {
+            av_log(avctx, AV_LOG_WARNING, "Packet with odd size, ignoring last byte\n");
         }
-        if (esc->data[0]) {
-            av_log(avctx, AV_LOG_ERROR, "unexpected data after first packet\n");
+        if (avpkt->size < (hdr_size + 1) * avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "packet size is too small\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if (is_compr) {
         esc->fib_acc[0] = avpkt->data[1] + 128;
         if (avctx->channels == 2)
             esc->fib_acc[1] = avpkt->data[2+chan_size+1] + 128;
-        }
 
         esc->data_idx  = 0;
         esc->data_size = chan_size;
@@ -136,30 +135,22 @@ static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     /* get output buffer */
-    frame->nb_samples = buf_size * (is_compr + 1);
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = buf_size * 2;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (ch = 0; ch < avctx->channels; ch++) {
-        if (is_compr) {
-            delta_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
-                         buf_size, &esc->fib_acc[ch], esc->table);
-        } else {
-            raw_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
-                       buf_size);
-        }
+        delta_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
+                     buf_size, &esc->fib_acc[ch], esc->table);
     }
 
     esc->data_idx += buf_size;
 
     *got_frame_ptr = 1;
 
-    return avpkt->size;
+    return ((avctx->frame_number == 0)*hdr_size + buf_size)*avctx->channels;
 }
 
-/** initialize 8svx decoder */
 static av_cold int eightsvx_decode_init(AVCodecContext *avctx)
 {
     EightSvxContext *esc = avctx->priv_data;
@@ -169,17 +160,12 @@ static av_cold int eightsvx_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    switch(avctx->codec->id) {
-        case AV_CODEC_ID_8SVX_FIB:
-          esc->table = fibonacci;
-          break;
-        case AV_CODEC_ID_8SVX_EXP:
-          esc->table = exponential;
-          break;
-        case AV_CODEC_ID_PCM_S8_PLANAR:
-            break;
-        default:
-          return AVERROR_INVALIDDATA;
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_8SVX_FIB: esc->table = fibonacci;    break;
+    case AV_CODEC_ID_8SVX_EXP: esc->table = exponential;  break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid codec id %d.\n", avctx->codec->id);
+        return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
 
@@ -192,10 +178,13 @@ static av_cold int eightsvx_decode_close(AVCodecContext *avctx)
 
     av_freep(&esc->data[0]);
     av_freep(&esc->data[1]);
+    esc->data_size = 0;
+    esc->data_idx = 0;
 
     return 0;
 }
 
+#if CONFIG_EIGHTSVX_FIB_DECODER
 AVCodec ff_eightsvx_fib_decoder = {
   .name           = "8svx_fib",
   .long_name      = NULL_IF_CONFIG_SMALL("8SVX fibonacci"),
@@ -203,13 +192,14 @@ AVCodec ff_eightsvx_fib_decoder = {
   .id             = AV_CODEC_ID_8SVX_FIB,
   .priv_data_size = sizeof (EightSvxContext),
   .init           = eightsvx_decode_init,
-  .close          = eightsvx_decode_close,
   .decode         = eightsvx_decode_frame,
-  .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+  .close          = eightsvx_decode_close,
+  .capabilities   = AV_CODEC_CAP_DR1,
   .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                     AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_EIGHTSVX_EXP_DECODER
 AVCodec ff_eightsvx_exp_decoder = {
   .name           = "8svx_exp",
   .long_name      = NULL_IF_CONFIG_SMALL("8SVX exponential"),
@@ -217,23 +207,10 @@ AVCodec ff_eightsvx_exp_decoder = {
   .id             = AV_CODEC_ID_8SVX_EXP,
   .priv_data_size = sizeof (EightSvxContext),
   .init           = eightsvx_decode_init,
-  .close          = eightsvx_decode_close,
   .decode         = eightsvx_decode_frame,
-  .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+  .close          = eightsvx_decode_close,
+  .capabilities   = AV_CODEC_CAP_DR1,
   .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                     AV_SAMPLE_FMT_NONE },
 };
-
-AVCodec ff_pcm_s8_planar_decoder = {
-    .name           = "pcm_s8_planar",
-    .long_name      = NULL_IF_CONFIG_SMALL("PCM signed 8-bit planar"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_PCM_S8_PLANAR,
-    .priv_data_size = sizeof(EightSvxContext),
-    .init           = eightsvx_decode_init,
-    .close          = eightsvx_decode_close,
-    .decode         = eightsvx_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
-                                                      AV_SAMPLE_FMT_NONE },
-};
+#endif
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index ae35628..15c43a8 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1,24 +1,29 @@
 NAME = avcodec
-DESC = Libav codec library
+DESC = FFmpeg codec library
 
 HEADERS = ac3_parser.h                                                  \
           adts_parser.h                                                 \
           avcodec.h                                                     \
+          avdct.h                                                       \
           avfft.h                                                       \
           d3d11va.h                                                     \
           dirac.h                                                       \
           dv_profile.h                                                  \
           dxva2.h                                                       \
+          jni.h                                                         \
+          mediacodec.h                                                  \
           qsv.h                                                         \
           vaapi.h                                                       \
-          vda.h                                                         \
           vdpau.h                                                       \
           version.h                                                     \
+          videotoolbox.h                                                \
           vorbis_parser.h                                               \
+          xvmc.h                                                        \
 
 OBJS = ac3_parser.o                                                     \
        adts_parser.o                                                    \
        allcodecs.o                                                      \
+       avdct.o                                                          \
        avpacket.o                                                       \
        avpicture.o                                                      \
        bitstream.o                                                      \
@@ -32,11 +37,14 @@ OBJS = ac3_parser.o                                                     \
        dv_profile.o                                                     \
        encode.o                                                         \
        imgconvert.o                                                     \
-       log2_tab.o                                                       \
+       jni.o                                                            \
        mathtables.o                                                     \
+       mediacodec.o                                                     \
        mpeg12framerate.o                                                \
        options.o                                                        \
+       mjpegenc_huffman.o                                               \
        parser.o                                                         \
+       parsers.o                                                        \
        profiles.o                                                       \
        qsv_api.o                                                        \
        raw.o                                                            \
@@ -46,7 +54,7 @@ OBJS = ac3_parser.o                                                     \
 
 # subsystems
 OBJS-$(CONFIG_AANDCTTABLES)            += aandcttab.o
-OBJS-$(CONFIG_AC3DSP)                  += ac3dsp.o
+OBJS-$(CONFIG_AC3DSP)                  += ac3dsp.o ac3.o ac3tab.o
 OBJS-$(CONFIG_ADTS_HEADER)             += adts_header.o mpeg4audio.o
 OBJS-$(CONFIG_AMF)                     += amfenc.o
 OBJS-$(CONFIG_AUDIO_FRAME_QUEUE)       += audio_frame_queue.o
@@ -55,16 +63,22 @@ OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_CBS)                     += cbs.o
+OBJS-$(CONFIG_CBS_AV1)                 += cbs_av1.o
 OBJS-$(CONFIG_CBS_H264)                += cbs_h2645.o h2645_parse.o
 OBJS-$(CONFIG_CBS_H265)                += cbs_h2645.o h2645_parse.o
+OBJS-$(CONFIG_CBS_JPEG)                += cbs_jpeg.o
 OBJS-$(CONFIG_CBS_MPEG2)               += cbs_mpeg2.o
+OBJS-$(CONFIG_CBS_VP9)                 += cbs_vp9.o
+OBJS-$(CONFIG_CRYSTALHD)               += crystalhd.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
+OBJS-$(CONFIG_EXIF)                    += exif.o tiff_common.o
 OBJS-$(CONFIG_FAANDCT)                 += faandct.o
 OBJS-$(CONFIG_FAANIDCT)                += faanidct.o
 OBJS-$(CONFIG_FDCTDSP)                 += fdctdsp.o jfdctfst.o jfdctint.o
 FFT-OBJS-$(CONFIG_HARDCODED_TABLES)    += cos_tables.o cos_fixed_tables.o
 OBJS-$(CONFIG_FFT)                     += avfft.o fft_fixed.o fft_float.o \
+                                          fft_fixed_32.o fft_init_table.o \
                                           $(FFT-OBJS-yes)
 OBJS-$(CONFIG_FLACDSP)                 += flacdsp.o
 OBJS-$(CONFIG_FMTCONVERT)              += fmtconvert.o
@@ -75,22 +89,27 @@ OBJS-$(CONFIG_H264DSP)                 += h264dsp.o h264idct.o
 OBJS-$(CONFIG_H264PARSE)               += h264_parse.o h2645_parse.o h264_ps.o
 OBJS-$(CONFIG_H264PRED)                += h264pred.o
 OBJS-$(CONFIG_H264QPEL)                += h264qpel.o
-OBJS-$(CONFIG_HEVCPARSE)               += h2645_parse.o hevc_ps.o
+OBJS-$(CONFIG_HEVCPARSE)               += hevc_parse.o h2645_parse.o hevc_ps.o hevc_sei.o hevc_data.o
 OBJS-$(CONFIG_HPELDSP)                 += hpeldsp.o
 OBJS-$(CONFIG_HUFFMAN)                 += huffman.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += huffyuvdsp.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += huffyuvencdsp.o
 OBJS-$(CONFIG_IDCTDSP)                 += idctdsp.o simple_idct.o jrevdct.o
 OBJS-$(CONFIG_IIRFILTER)               += iirfilter.o
-OBJS-$(CONFIG_IMDCT15)                 += imdct15.o
-OBJS-$(CONFIG_INTRAX8)                 += intrax8.o intrax8dsp.o
+OBJS-$(CONFIG_MDCT15)                  += mdct15.o
+OBJS-$(CONFIG_INTRAX8)                 += intrax8.o intrax8dsp.o msmpeg4data.o
 OBJS-$(CONFIG_IVIDSP)                  += ivi_dsp.o
+OBJS-$(CONFIG_JNI)                     += ffjni.o jni.o
 OBJS-$(CONFIG_JPEGTABLES)              += jpegtables.o
+OBJS-$(CONFIG_LLAUDDSP)                += lossless_audiodsp.o
+OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
+OBJS-$(CONFIG_LLVIDENCDSP)             += lossless_videoencdsp.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_LZF)                     += lzf.o
-OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o
+OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o mdct_fixed_32.o
 OBJS-$(CONFIG_ME_CMP)                  += me_cmp.o
+OBJS-$(CONFIG_MEDIACODEC)              += mediacodecdec_common.o mediacodec_surface.o mediacodec_wrapper.o mediacodec_sw_buffer.o
 OBJS-$(CONFIG_MPEG_ER)                 += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO)               += mpegaudio.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += mpegaudiodsp.o                \
@@ -112,10 +131,10 @@ OBJS-$(CONFIG_QSV)                     += qsv.o
 OBJS-$(CONFIG_QSVDEC)                  += qsvdec.o
 OBJS-$(CONFIG_QSVENC)                  += qsvenc.o
 OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
-RDFT-OBJS-$(CONFIG_HARDCODED_TABLES)   += sin_tables.o
-OBJS-$(CONFIG_RDFT)                    += rdft.o $(RDFT-OBJS-yes)
+OBJS-$(CONFIG_RDFT)                    += rdft.o
 OBJS-$(CONFIG_RV34DSP)                 += rv34dsp.o
-OBJS-$(CONFIG_SINEWIN)                 += sinewin.o
+OBJS-$(CONFIG_SHARED)                  += log2_tab.o reverse.o
+OBJS-$(CONFIG_SINEWIN)                 += sinewin.o sinewin_fixed.o
 OBJS-$(CONFIG_SNAPPY)                  += snappy.o
 OBJS-$(CONFIG_STARTCODE)               += startcode.o
 OBJS-$(CONFIG_TEXTUREDSP)              += texturedsp.o
@@ -127,29 +146,39 @@ OBJS-$(CONFIG_VIDEODSP)                += videodsp.o
 OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
 OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
 OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
+OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
 OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
 OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
 
 # decoders/encoders
+OBJS-$(CONFIG_ZERO12V_DECODER)         += 012v.o
 OBJS-$(CONFIG_A64MULTI_ENCODER)        += a64multienc.o elbg.o
 OBJS-$(CONFIG_A64MULTI5_ENCODER)       += a64multienc.o elbg.o
-OBJS-$(CONFIG_AAC_DECODER)             += aacdec.o aactab.o aacsbr.o aacps.o \
+OBJS-$(CONFIG_AAC_DECODER)             += aacdec.o aactab.o aacsbr.o aacps_float.o \
                                           mpeg4audio.o kbdwin.o \
-                                          sbrdsp.o aacpsdsp.o
-OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o    \
+                                          sbrdsp.o aacpsdsp_float.o cbrt_data.o
+OBJS-$(CONFIG_AAC_FIXED_DECODER)       += aacdec_fixed.o aactab.o aacsbr_fixed.o aacps_fixed.o \
+                                          mpeg4audio.o kbdwin.o \
+                                          sbrdsp_fixed.o aacpsdsp_fixed.o cbrt_data_fixed.o
+OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o aacenctab.o    \
                                           aacpsy.o aactab.o      \
-                                          psymodel.o mpeg4audio.o kbdwin.o
+                                          aacenc_is.o \
+                                          aacenc_tns.o \
+                                          aacenc_ltp.o \
+                                          aacenc_pred.o \
+                                          psymodel.o mpeg4audio.o kbdwin.o cbrt_data.o
 OBJS-$(CONFIG_AASC_DECODER)            += aasc.o msrledec.o
-OBJS-$(CONFIG_AC3_DECODER)             += ac3dec.o ac3dec_data.o ac3.o kbdwin.o
+OBJS-$(CONFIG_AC3_DECODER)             += ac3dec_float.o ac3dec_data.o ac3.o kbdwin.o ac3tab.o
+OBJS-$(CONFIG_AC3_FIXED_DECODER)       += ac3dec_fixed.o ac3dec_data.o ac3.o kbdwin.o ac3tab.o
 OBJS-$(CONFIG_AC3_ENCODER)             += ac3enc_float.o ac3enc.o ac3tab.o \
                                           ac3.o kbdwin.o
 OBJS-$(CONFIG_AC3_FIXED_ENCODER)       += ac3enc_fixed.o ac3enc.o ac3tab.o ac3.o
 OBJS-$(CONFIG_AIC_DECODER)             += aic.o
-OBJS-$(CONFIG_ALAC_DECODER)            += alac.o alac_data.o
+OBJS-$(CONFIG_ALAC_DECODER)            += alac.o alac_data.o alacdsp.o
 OBJS-$(CONFIG_ALAC_ENCODER)            += alacenc.o alac_data.o
 OBJS-$(CONFIG_ALIAS_PIX_DECODER)       += aliaspixdec.o
 OBJS-$(CONFIG_ALIAS_PIX_ENCODER)       += aliaspixenc.o
-OBJS-$(CONFIG_ALS_DECODER)             += alsdec.o bgmc.o mpeg4audio.o
+OBJS-$(CONFIG_ALS_DECODER)             += alsdec.o bgmc.o mlz.o mpeg4audio.o
 OBJS-$(CONFIG_AMRNB_DECODER)           += amrnbdec.o celp_filters.o   \
                                           celp_math.o acelp_filters.o \
                                           acelp_vectors.o             \
@@ -158,9 +187,20 @@ OBJS-$(CONFIG_AMRWB_DECODER)           += amrwbdec.o celp_filters.o   \
                                           celp_math.o acelp_filters.o \
                                           acelp_vectors.o             \
                                           acelp_pitch_delay.o
+OBJS-$(CONFIG_AMV_ENCODER)             += mjpegenc.o mjpegenc_common.o \
+                                          mjpegenc_huffman.o
 OBJS-$(CONFIG_ANM_DECODER)             += anm.o
 OBJS-$(CONFIG_ANSI_DECODER)            += ansi.o cga_data.o
 OBJS-$(CONFIG_APE_DECODER)             += apedec.o
+OBJS-$(CONFIG_APTX_DECODER)            += aptx.o
+OBJS-$(CONFIG_APTX_ENCODER)            += aptx.o
+OBJS-$(CONFIG_APTX_HD_DECODER)         += aptx.o
+OBJS-$(CONFIG_APTX_HD_ENCODER)         += aptx.o
+OBJS-$(CONFIG_APNG_DECODER)            += png.o pngdec.o pngdsp.o
+OBJS-$(CONFIG_APNG_ENCODER)            += png.o pngenc.o
+OBJS-$(CONFIG_ARBC_DECODER)            += arbc.o
+OBJS-$(CONFIG_SSA_DECODER)             += assdec.o ass.o
+OBJS-$(CONFIG_SSA_ENCODER)             += assenc.o ass.o
 OBJS-$(CONFIG_ASS_DECODER)             += assdec.o ass.o
 OBJS-$(CONFIG_ASS_ENCODER)             += assenc.o ass.o
 OBJS-$(CONFIG_ASV1_DECODER)            += asvdec.o asv.o mpeg12data.o
@@ -169,16 +209,29 @@ OBJS-$(CONFIG_ASV2_DECODER)            += asvdec.o asv.o mpeg12data.o
 OBJS-$(CONFIG_ASV2_ENCODER)            += asvenc.o asv.o mpeg12data.o
 OBJS-$(CONFIG_ATRAC1_DECODER)          += atrac1.o atrac.o
 OBJS-$(CONFIG_ATRAC3_DECODER)          += atrac3.o atrac.o
+OBJS-$(CONFIG_ATRAC3AL_DECODER)        += atrac3.o atrac.o
 OBJS-$(CONFIG_ATRAC3P_DECODER)         += atrac3plusdec.o atrac3plus.o \
                                           atrac3plusdsp.o atrac.o
+OBJS-$(CONFIG_ATRAC3PAL_DECODER)       += atrac3plusdec.o atrac3plus.o \
+                                          atrac3plusdsp.o atrac.o
+OBJS-$(CONFIG_ATRAC9_DECODER)          += atrac9dec.o
 OBJS-$(CONFIG_AURA_DECODER)            += cyuv.o
 OBJS-$(CONFIG_AURA2_DECODER)           += aura.o
+OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o mjpegdec.o
+OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
+OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_AVS_DECODER)             += avs.o
+OBJS-$(CONFIG_AVUI_DECODER)            += avuidec.o
+OBJS-$(CONFIG_AVUI_ENCODER)            += avuienc.o
+OBJS-$(CONFIG_AYUV_DECODER)            += v408dec.o
+OBJS-$(CONFIG_AYUV_ENCODER)            += v408enc.o
 OBJS-$(CONFIG_BETHSOFTVID_DECODER)     += bethsoftvideo.o
 OBJS-$(CONFIG_BFI_DECODER)             += bfi.o
 OBJS-$(CONFIG_BINK_DECODER)            += bink.o binkdsp.o
 OBJS-$(CONFIG_BINKAUDIO_DCT_DECODER)   += binkaudio.o
 OBJS-$(CONFIG_BINKAUDIO_RDFT_DECODER)  += binkaudio.o
+OBJS-$(CONFIG_BINTEXT_DECODER)         += bintext.o cga_data.o
+OBJS-$(CONFIG_BITPACKED_DECODER)       += bitpacked.o
 OBJS-$(CONFIG_BMP_DECODER)             += bmp.o msrledec.o
 OBJS-$(CONFIG_BMP_ENCODER)             += bmpenc.o
 OBJS-$(CONFIG_BMV_AUDIO_DECODER)       += bmvaudio.o
@@ -186,12 +239,13 @@ OBJS-$(CONFIG_BMV_VIDEO_DECODER)       += bmvvideo.o
 OBJS-$(CONFIG_BRENDER_PIX_DECODER)     += brenderpix.o
 OBJS-$(CONFIG_C93_DECODER)             += c93.o
 OBJS-$(CONFIG_CAVS_DECODER)            += cavs.o cavsdec.o cavsdsp.o \
-                                          cavsdata.o mpeg12data.o
+                                          cavsdata.o
+OBJS-$(CONFIG_CCAPTION_DECODER)        += ccaption_dec.o
 OBJS-$(CONFIG_CDGRAPHICS_DECODER)      += cdgraphics.o
 OBJS-$(CONFIG_CDXL_DECODER)            += cdxl.o
 OBJS-$(CONFIG_CFHD_DECODER)            += cfhd.o cfhddata.o
 OBJS-$(CONFIG_CINEPAK_DECODER)         += cinepak.o
-OBJS-$(CONFIG_CINEPAK_ENCODER)         += cinepakenc.o
+OBJS-$(CONFIG_CINEPAK_ENCODER)         += cinepakenc.o elbg.o
 OBJS-$(CONFIG_CLEARVIDEO_DECODER)      += clearvideo.o
 OBJS-$(CONFIG_CLJR_DECODER)            += cljrdec.o
 OBJS-$(CONFIG_CLJR_ENCODER)            += cljrenc.o
@@ -199,30 +253,42 @@ OBJS-$(CONFIG_CLLC_DECODER)            += cllc.o canopus.o
 OBJS-$(CONFIG_COMFORTNOISE_DECODER)    += cngdec.o celp_filters.o
 OBJS-$(CONFIG_COMFORTNOISE_ENCODER)    += cngenc.o
 OBJS-$(CONFIG_COOK_DECODER)            += cook.o
+OBJS-$(CONFIG_CPIA_DECODER)            += cpia.o
 OBJS-$(CONFIG_CSCD_DECODER)            += cscd.o
 OBJS-$(CONFIG_CYUV_DECODER)            += cyuv.o
-OBJS-$(CONFIG_DCA_DECODER)             += dcadec.o dca.o dcadsp.o      \
-                                          dcadata.o dca_exss.o         \
-                                          dca_xll.o synth_filter.o
+OBJS-$(CONFIG_DCA_DECODER)             += dcadec.o dca.o dcadata.o dcahuff.o \
+                                          dca_core.o dca_exss.o dca_xll.o dca_lbr.o \
+                                          dcadsp.o dcadct.o synth_filter.o
+OBJS-$(CONFIG_DCA_ENCODER)             += dcaenc.o dca.o dcadata.o dcahuff.o \
+                                          dcaadpcm.o
 OBJS-$(CONFIG_DDS_DECODER)             += dds.o
+OBJS-$(CONFIG_DIRAC_DECODER)           += diracdec.o dirac.o diracdsp.o diractab.o \
+                                          dirac_arith.o dirac_dwt.o dirac_vlc.o
 OBJS-$(CONFIG_DFA_DECODER)             += dfa.o
 OBJS-$(CONFIG_DNXHD_DECODER)           += dnxhddec.o dnxhddata.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += dnxhdenc.o dnxhddata.o
+OBJS-$(CONFIG_DOLBY_E_DECODER)         += dolby_e.o kbdwin.o
 OBJS-$(CONFIG_DPX_DECODER)             += dpx.o
 OBJS-$(CONFIG_DPX_ENCODER)             += dpxenc.o
+OBJS-$(CONFIG_DSD_LSBF_DECODER)        += dsddec.o dsd.o
+OBJS-$(CONFIG_DSD_MSBF_DECODER)        += dsddec.o dsd.o
+OBJS-$(CONFIG_DSD_LSBF_PLANAR_DECODER) += dsddec.o dsd.o
+OBJS-$(CONFIG_DSD_MSBF_PLANAR_DECODER) += dsddec.o dsd.o
 OBJS-$(CONFIG_DSICINAUDIO_DECODER)     += dsicinaudio.o
 OBJS-$(CONFIG_DSICINVIDEO_DECODER)     += dsicinvideo.o
 OBJS-$(CONFIG_DSS_SP_DECODER)          += dss_sp.o
+OBJS-$(CONFIG_DST_DECODER)             += dstdec.o dsd.o
 OBJS-$(CONFIG_DVBSUB_DECODER)          += dvbsubdec.o
 OBJS-$(CONFIG_DVBSUB_ENCODER)          += dvbsub.o
 OBJS-$(CONFIG_DVDSUB_DECODER)          += dvdsubdec.o
 OBJS-$(CONFIG_DVDSUB_ENCODER)          += dvdsubenc.o
+OBJS-$(CONFIG_DVAUDIO_DECODER)         += dvaudiodec.o
 OBJS-$(CONFIG_DVVIDEO_DECODER)         += dvdec.o dv.o dvdata.o
 OBJS-$(CONFIG_DVVIDEO_ENCODER)         += dvenc.o dv.o dvdata.o
 OBJS-$(CONFIG_DXA_DECODER)             += dxa.o
 OBJS-$(CONFIG_DXTORY_DECODER)          += dxtory.o
 OBJS-$(CONFIG_DXV_DECODER)             += dxv.o
-OBJS-$(CONFIG_EAC3_DECODER)            += eac3dec.o eac3_data.o
+OBJS-$(CONFIG_EAC3_DECODER)            += eac3_data.o
 OBJS-$(CONFIG_EAC3_ENCODER)            += eac3enc.o eac3_data.o
 OBJS-$(CONFIG_EACMV_DECODER)           += eacmv.o
 OBJS-$(CONFIG_EAMAD_DECODER)           += eamad.o eaidct.o mpeg12.o \
@@ -235,14 +301,19 @@ OBJS-$(CONFIG_EIGHTSVX_EXP_DECODER)    += 8svx.o
 OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
-OBJS-$(CONFIG_EXR_DECODER)             += exr.o
+OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
+OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
+OBJS-$(CONFIG_FFWAVESYNTH_DECODER)     += ffwavesynth.o
 OBJS-$(CONFIG_FIC_DECODER)             += fic.o
+OBJS-$(CONFIG_FITS_DECODER)            += fitsdec.o fits.o
+OBJS-$(CONFIG_FITS_ENCODER)            += fitsenc.o
 OBJS-$(CONFIG_FLAC_DECODER)            += flacdec.o flacdata.o flac.o
-OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o vorbis_data.o
 OBJS-$(CONFIG_FLASHSV_DECODER)         += flashsv.o
 OBJS-$(CONFIG_FLASHSV_ENCODER)         += flashsvenc.o
+OBJS-$(CONFIG_FLASHSV2_ENCODER)        += flashsv2enc.o
 OBJS-$(CONFIG_FLASHSV2_DECODER)        += flashsv.o
 OBJS-$(CONFIG_FLIC_DECODER)            += flicvideo.o
 OBJS-$(CONFIG_FMVC_DECODER)            += fmvc.o
@@ -251,13 +322,14 @@ OBJS-$(CONFIG_FRAPS_DECODER)           += fraps.o
 OBJS-$(CONFIG_FRWU_DECODER)            += frwu.o
 OBJS-$(CONFIG_G2M_DECODER)             += g2meet.o elsdec.o
 OBJS-$(CONFIG_G723_1_DECODER)          += g723_1dec.o g723_1.o \
-                                          acelp_vectors.o celp_filters.o \
-                                          celp_math.o
+                                          acelp_vectors.o celp_filters.o celp_math.o
 OBJS-$(CONFIG_G723_1_ENCODER)          += g723_1enc.o g723_1.o \
-                                          acelp_vectors.o celp_filters.o \
-                                          celp_math.o
+                                          acelp_vectors.o celp_filters.o celp_math.o
+OBJS-$(CONFIG_G729_DECODER)            += g729dec.o lsp.o celp_math.o celp_filters.o acelp_filters.o acelp_pitch_delay.o acelp_vectors.o g729postfilter.o
+OBJS-$(CONFIG_GDV_DECODER)             += gdv.o
 OBJS-$(CONFIG_GIF_DECODER)             += gifdec.o lzw.o
 OBJS-$(CONFIG_GIF_ENCODER)             += gif.o lzwenc.o
+OBJS-$(CONFIG_GREMLIN_DPCM_DECODER)    += dpcm.o
 OBJS-$(CONFIG_GSM_DECODER)             += gsmdec.o gsmdec_data.o msgsmdec.o
 OBJS-$(CONFIG_GSM_MS_DECODER)          += gsmdec.o gsmdec_data.o msgsmdec.o
 OBJS-$(CONFIG_H261_DECODER)            += h261dec.o h261data.o h261.o
@@ -267,45 +339,69 @@ OBJS-$(CONFIG_H263_DECODER)            += h263dec.o h263.o ituh263dec.o        \
                                           intelh263dec.o h263data.o
 OBJS-$(CONFIG_H263_ENCODER)            += mpeg4videoenc.o mpeg4video.o  \
                                           h263.o ituh263enc.o flvenc.o h263data.o
+OBJS-$(CONFIG_H263_V4L2M2M_DECODER)    += v4l2_m2m_dec.o
+OBJS-$(CONFIG_H263_V4L2M2M_ENCODER)    += v4l2_m2m_enc.o
 OBJS-$(CONFIG_H264_DECODER)            += h264dec.o h264_cabac.o h264_cavlc.o \
                                           h264_direct.o h264_loopfilter.o  \
                                           h264_mb.o h264_picture.o \
                                           h264_refs.o h264_sei.o \
                                           h264_slice.o h264data.o
 OBJS-$(CONFIG_H264_AMF_ENCODER)        += amfenc_h264.o
+OBJS-$(CONFIG_H264_CUVID_DECODER)      += cuviddec.o
+OBJS-$(CONFIG_H264_MEDIACODEC_DECODER) += mediacodecdec.o
 OBJS-$(CONFIG_H264_MMAL_DECODER)       += mmaldec.o
 OBJS-$(CONFIG_H264_NVENC_ENCODER)      += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_ENCODER)           += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_H264_ENCODER)      += nvenc_h264.o
 OBJS-$(CONFIG_H264_OMX_ENCODER)        += omx.o
 OBJS-$(CONFIG_H264_QSV_DECODER)        += qsvdec_h2645.o
 OBJS-$(CONFIG_H264_QSV_ENCODER)        += qsvenc_h264.o
-OBJS-$(CONFIG_H264_VAAPI_ENCODER)      += vaapi_encode_h264.o
+OBJS-$(CONFIG_H264_RKMPP_DECODER)      += rkmppdec.o
+OBJS-$(CONFIG_H264_VAAPI_ENCODER)      += vaapi_encode_h264.o h264_levels.o
+OBJS-$(CONFIG_H264_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
+OBJS-$(CONFIG_H264_V4L2M2M_DECODER)    += v4l2_m2m_dec.o
+OBJS-$(CONFIG_H264_V4L2M2M_ENCODER)    += v4l2_m2m_enc.o
 OBJS-$(CONFIG_HAP_DECODER)             += hapdec.o hap.o
 OBJS-$(CONFIG_HAP_ENCODER)             += hapenc.o hap.o
-OBJS-$(CONFIG_HEVC_DECODER)            += hevcdec.o hevc_mvs.o hevc_sei.o \
+OBJS-$(CONFIG_HCOM_DECODER)            += hcom.o
+OBJS-$(CONFIG_HEVC_DECODER)            += hevcdec.o hevc_mvs.o \
                                           hevc_cabac.o hevc_refs.o hevcpred.o    \
                                           hevcdsp.o hevc_filter.o hevc_data.o
 OBJS-$(CONFIG_HEVC_AMF_ENCODER)        += amfenc_hevc.o
+OBJS-$(CONFIG_HEVC_CUVID_DECODER)      += cuviddec.o
+OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o
 OBJS-$(CONFIG_HEVC_NVENC_ENCODER)      += nvenc_hevc.o
+OBJS-$(CONFIG_NVENC_HEVC_ENCODER)      += nvenc_hevc.o
 OBJS-$(CONFIG_HEVC_QSV_DECODER)        += qsvdec_h2645.o
 OBJS-$(CONFIG_HEVC_QSV_ENCODER)        += qsvenc_hevc.o hevc_ps_enc.o       \
                                           hevc_data.o
-OBJS-$(CONFIG_HEVC_VAAPI_ENCODER)      += vaapi_encode_h265.o
+OBJS-$(CONFIG_HEVC_RKMPP_DECODER)      += rkmppdec.o
+OBJS-$(CONFIG_HEVC_VAAPI_ENCODER)      += vaapi_encode_h265.o h265_profile_level.o
+OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER)    += v4l2_m2m_dec.o
+OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER)    += v4l2_m2m_enc.o
 OBJS-$(CONFIG_HNM4_VIDEO_DECODER)      += hnm4video.o
 OBJS-$(CONFIG_HQ_HQA_DECODER)          += hq_hqa.o hq_hqadata.o hq_hqadsp.o \
                                           canopus.o
 OBJS-$(CONFIG_HQX_DECODER)             += hqx.o hqxvlc.o hqxdsp.o canopus.o
 OBJS-$(CONFIG_HUFFYUV_DECODER)         += huffyuv.o huffyuvdec.o
 OBJS-$(CONFIG_HUFFYUV_ENCODER)         += huffyuv.o huffyuvenc.o
+OBJS-$(CONFIG_HYMT_DECODER)            += huffyuv.o huffyuvdec.o
 OBJS-$(CONFIG_IDCIN_DECODER)           += idcinvideo.o
-OBJS-$(CONFIG_IFF_BYTERUN1_DECODER)    += iff.o
+OBJS-$(CONFIG_IDF_DECODER)             += bintext.o cga_data.o
 OBJS-$(CONFIG_IFF_ILBM_DECODER)        += iff.o
+OBJS-$(CONFIG_ILBC_DECODER)            += ilbcdec.o
 OBJS-$(CONFIG_IMC_DECODER)             += imc.o
+OBJS-$(CONFIG_IMM4_DECODER)            += imm4.o
 OBJS-$(CONFIG_INDEO2_DECODER)          += indeo2.o
 OBJS-$(CONFIG_INDEO3_DECODER)          += indeo3.o
 OBJS-$(CONFIG_INDEO4_DECODER)          += indeo4.o ivi.o
 OBJS-$(CONFIG_INDEO5_DECODER)          += indeo5.o ivi.o
+OBJS-$(CONFIG_INTERPLAY_ACM_DECODER)   += interplayacm.o
 OBJS-$(CONFIG_INTERPLAY_DPCM_DECODER)  += dpcm.o
 OBJS-$(CONFIG_INTERPLAY_VIDEO_DECODER) += interplayvideo.o
+OBJS-$(CONFIG_JACOSUB_DECODER)         += jacosubdec.o ass.o
+OBJS-$(CONFIG_JPEG2000_ENCODER)        += j2kenc.o mqcenc.o mqc.o jpeg2000.o \
+                                          jpeg2000dwt.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += jpeg2000dec.o jpeg2000.o jpeg2000dsp.o \
                                           jpeg2000dwt.o mqcdec.o mqc.o
 OBJS-$(CONFIG_JPEGLS_DECODER)          += jpeglsdec.o jpegls.o
@@ -316,25 +412,35 @@ OBJS-$(CONFIG_KMVC_DECODER)            += kmvc.o
 OBJS-$(CONFIG_LAGARITH_DECODER)        += lagarith.o lagarithrac.o
 OBJS-$(CONFIG_LJPEG_ENCODER)           += ljpegenc.o mjpegenc_common.o
 OBJS-$(CONFIG_LOCO_DECODER)            += loco.o
+OBJS-$(CONFIG_M101_DECODER)            += m101.o
 OBJS-$(CONFIG_MACE3_DECODER)           += mace.o
 OBJS-$(CONFIG_MACE6_DECODER)           += mace.o
 OBJS-$(CONFIG_MAGICYUV_DECODER)        += magicyuv.o
+OBJS-$(CONFIG_MAGICYUV_ENCODER)        += magicyuvenc.o
 OBJS-$(CONFIG_MDEC_DECODER)            += mdec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_METASOUND_DECODER)       += metasound.o metasound_data.o \
                                           twinvq.o
+OBJS-$(CONFIG_MICRODVD_DECODER)        += microdvddec.o ass.o
 OBJS-$(CONFIG_MIMIC_DECODER)           += mimic.o
 OBJS-$(CONFIG_MJPEG_DECODER)           += mjpegdec.o
-OBJS-$(CONFIG_MJPEG_ENCODER)           += mjpegenc.o mjpegenc_common.o
+OBJS-$(CONFIG_MJPEG_ENCODER)           += mjpegenc.o mjpegenc_common.o \
+                                          mjpegenc_huffman.o
 OBJS-$(CONFIG_MJPEGB_DECODER)          += mjpegbdec.o
+OBJS-$(CONFIG_MJPEG_CUVID_DECODER)     += cuviddec.o
 OBJS-$(CONFIG_MJPEG_QSV_ENCODER)       += qsvenc_jpeg.o
 OBJS-$(CONFIG_MJPEG_VAAPI_ENCODER)     += vaapi_encode_mjpeg.o
 OBJS-$(CONFIG_MLP_DECODER)             += mlpdec.o mlpdsp.o
+OBJS-$(CONFIG_MLP_ENCODER)             += mlpenc.o mlp.o
 OBJS-$(CONFIG_MMVIDEO_DECODER)         += mmvideo.o
 OBJS-$(CONFIG_MOTIONPIXELS_DECODER)    += motionpixels.o
+OBJS-$(CONFIG_MOVTEXT_DECODER)         += movtextdec.o ass.o
+OBJS-$(CONFIG_MOVTEXT_ENCODER)         += movtextenc.o ass_split.o
 OBJS-$(CONFIG_MP1_DECODER)             += mpegaudiodec_fixed.o
 OBJS-$(CONFIG_MP1FLOAT_DECODER)        += mpegaudiodec_float.o
 OBJS-$(CONFIG_MP2_DECODER)             += mpegaudiodec_fixed.o
-OBJS-$(CONFIG_MP2_ENCODER)             += mpegaudioenc.o mpegaudio.o \
+OBJS-$(CONFIG_MP2_ENCODER)             += mpegaudioenc_float.o mpegaudio.o \
+                                          mpegaudiodata.o mpegaudiodsp_data.o
+OBJS-$(CONFIG_MP2FIXED_ENCODER)        += mpegaudioenc_fixed.o mpegaudio.o \
                                           mpegaudiodata.o mpegaudiodsp_data.o
 OBJS-$(CONFIG_MP2FLOAT_DECODER)        += mpegaudiodec_float.o
 OBJS-$(CONFIG_MP3_DECODER)             += mpegaudiodec_fixed.o
@@ -345,17 +451,29 @@ OBJS-$(CONFIG_MP3ON4_DECODER)          += mpegaudiodec_fixed.o mpeg4audio.o
 OBJS-$(CONFIG_MP3ON4FLOAT_DECODER)     += mpegaudiodec_float.o mpeg4audio.o
 OBJS-$(CONFIG_MPC7_DECODER)            += mpc7.o mpc.o
 OBJS-$(CONFIG_MPC8_DECODER)            += mpc8.o mpc.o
+OBJS-$(CONFIG_MPEGVIDEO_DECODER)       += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG1VIDEO_DECODER)      += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG1VIDEO_ENCODER)      += mpeg12enc.o mpeg12.o
+OBJS-$(CONFIG_MPEG1_CUVID_DECODER)     += cuviddec.o
+OBJS-$(CONFIG_MPEG1_V4L2M2M_DECODER)   += v4l2_m2m_dec.o
+OBJS-$(CONFIG_MPEG2_MMAL_DECODER)      += mmaldec.o
 OBJS-$(CONFIG_MPEG2_QSV_DECODER)       += qsvdec_other.o
 OBJS-$(CONFIG_MPEG2_QSV_ENCODER)       += qsvenc_mpeg2.o
 OBJS-$(CONFIG_MPEG2VIDEO_DECODER)      += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG2VIDEO_ENCODER)      += mpeg12enc.o mpeg12.o
-OBJS-$(CONFIG_MPEG2_MMAL_DECODER)      += mmaldec.o
+OBJS-$(CONFIG_MPEG2_CUVID_DECODER)     += cuviddec.o
+OBJS-$(CONFIG_MPEG2_MEDIACODEC_DECODER) += mediacodecdec.o
 OBJS-$(CONFIG_MPEG2_VAAPI_ENCODER)     += vaapi_encode_mpeg2.o
+OBJS-$(CONFIG_MPEG2_V4L2M2M_DECODER)   += v4l2_m2m_dec.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += xvididct.o
+OBJS-$(CONFIG_MPEG4_CUVID_DECODER)     += cuviddec.o
+OBJS-$(CONFIG_MPEG4_MEDIACODEC_DECODER) += mediacodecdec.o
 OBJS-$(CONFIG_MPEG4_OMX_ENCODER)       += omx.o
+OBJS-$(CONFIG_MPEG4_V4L2M2M_DECODER)   += v4l2_m2m_dec.o
+OBJS-$(CONFIG_MPEG4_V4L2M2M_ENCODER)   += v4l2_m2m_enc.o
+OBJS-$(CONFIG_MPL2_DECODER)            += mpl2dec.o ass.o
 OBJS-$(CONFIG_MSA1_DECODER)            += mss3.o
+OBJS-$(CONFIG_MSCC_DECODER)            += mscc.o
 OBJS-$(CONFIG_MSMPEG4V1_DECODER)       += msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_MSMPEG4V2_DECODER)       += msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_MSMPEG4V2_ENCODER)       += msmpeg4enc.o msmpeg4.o msmpeg4data.o
@@ -365,17 +483,21 @@ OBJS-$(CONFIG_MSRLE_DECODER)           += msrle.o msrledec.o
 OBJS-$(CONFIG_MSS1_DECODER)            += mss1.o mss12.o
 OBJS-$(CONFIG_MSS2_DECODER)            += mss2.o mss12.o mss2dsp.o wmv2data.o
 OBJS-$(CONFIG_MSVIDEO1_DECODER)        += msvideo1.o
+OBJS-$(CONFIG_MSVIDEO1_ENCODER)        += msvideo1enc.o elbg.o
 OBJS-$(CONFIG_MSZH_DECODER)            += lcldec.o
 OBJS-$(CONFIG_MTS2_DECODER)            += mss4.o
 OBJS-$(CONFIG_MVC1_DECODER)            += mvcdec.o
 OBJS-$(CONFIG_MVC2_DECODER)            += mvcdec.o
+OBJS-$(CONFIG_MWSC_DECODER)            += mwsc.o
 OBJS-$(CONFIG_MXPEG_DECODER)           += mxpegdec.o
 OBJS-$(CONFIG_NELLYMOSER_DECODER)      += nellymoserdec.o nellymoser.o
 OBJS-$(CONFIG_NELLYMOSER_ENCODER)      += nellymoserenc.o nellymoser.o
 OBJS-$(CONFIG_NUV_DECODER)             += nuv.o rtjpeg.o
 OBJS-$(CONFIG_ON2AVC_DECODER)          += on2avc.o on2avcdata.o
-OBJS-$(CONFIG_OPUS_DECODER)            += opusdec.o opus.o opus_celt.o \
-                                          opus_silk.o vorbis_data.o
+OBJS-$(CONFIG_OPUS_DECODER)            += opusdec.o opus.o opus_celt.o opus_rc.o \
+                                          opus_pvq.o opus_silk.o opustab.o vorbis_data.o
+OBJS-$(CONFIG_OPUS_ENCODER)            += opusenc.o opus.o opus_rc.o opustab.o opus_pvq.o \
+                                          opusenc_psy.o
 OBJS-$(CONFIG_PAF_AUDIO_DECODER)       += pafaudio.o
 OBJS-$(CONFIG_PAF_VIDEO_DECODER)       += pafvideo.o
 OBJS-$(CONFIG_PAM_DECODER)             += pnmdec.o pnm.o
@@ -391,29 +513,39 @@ OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
 OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
 OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
 OBJS-$(CONFIG_PIXLET_DECODER)          += pixlet.o
+OBJS-$(CONFIG_PJS_DECODER)             += textdec.o ass.o
 OBJS-$(CONFIG_PNG_DECODER)             += png.o pngdec.o pngdsp.o
 OBJS-$(CONFIG_PNG_ENCODER)             += png.o pngenc.o
 OBJS-$(CONFIG_PPM_DECODER)             += pnmdec.o pnm.o
 OBJS-$(CONFIG_PPM_ENCODER)             += pnmenc.o
-OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdata.o proresdsp.o
-OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc.o proresdata.o
+OBJS-$(CONFIG_PRORES_DECODER)          += proresdec2.o proresdsp.o proresdata.o
+OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o proresdata.o
+OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o proresdata.o
+OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o
+OBJS-$(CONFIG_PROSUMER_DECODER)        += prosumer.o
+OBJS-$(CONFIG_PSD_DECODER)             += psd.o
 OBJS-$(CONFIG_PTX_DECODER)             += ptx.o
 OBJS-$(CONFIG_QCELP_DECODER)           += qcelpdec.o                     \
                                           celp_filters.o acelp_vectors.o \
                                           acelp_filters.o
 OBJS-$(CONFIG_QDM2_DECODER)            += qdm2.o
+OBJS-$(CONFIG_QDMC_DECODER)            += qdmc.o
 OBJS-$(CONFIG_QDRAW_DECODER)           += qdrw.o
 OBJS-$(CONFIG_QPEG_DECODER)            += qpeg.o
 OBJS-$(CONFIG_QTRLE_DECODER)           += qtrle.o
 OBJS-$(CONFIG_QTRLE_ENCODER)           += qtrleenc.o
 OBJS-$(CONFIG_R10K_DECODER)            += r210dec.o
+OBJS-$(CONFIG_R10K_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_R210_DECODER)            += r210dec.o
+OBJS-$(CONFIG_R210_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_RA_144_DECODER)          += ra144dec.o ra144.o celp_filters.o
 OBJS-$(CONFIG_RA_144_ENCODER)          += ra144enc.o ra144.o celp_filters.o
 OBJS-$(CONFIG_RA_288_DECODER)          += ra288.o celp_filters.o
 OBJS-$(CONFIG_RALF_DECODER)            += ralf.o
+OBJS-$(CONFIG_RASC_DECODER)            += rasc.o
 OBJS-$(CONFIG_RAWVIDEO_DECODER)        += rawdec.o
 OBJS-$(CONFIG_RAWVIDEO_ENCODER)        += rawenc.o
+OBJS-$(CONFIG_REALTEXT_DECODER)        += realtextdec.o ass.o
 OBJS-$(CONFIG_RL2_DECODER)             += rl2.o
 OBJS-$(CONFIG_ROQ_DECODER)             += roqvideodec.o roqvideo.o
 OBJS-$(CONFIG_ROQ_ENCODER)             += roqvideoenc.o roqvideo.o elbg.o
@@ -427,12 +559,17 @@ OBJS-$(CONFIG_RV20_DECODER)            += rv10.o
 OBJS-$(CONFIG_RV20_ENCODER)            += rv20enc.o
 OBJS-$(CONFIG_RV30_DECODER)            += rv30.o rv34.o rv30dsp.o
 OBJS-$(CONFIG_RV40_DECODER)            += rv40.o rv34.o rv40dsp.o
+OBJS-$(CONFIG_SAMI_DECODER)            += samidec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_S302M_DECODER)           += s302m.o
+OBJS-$(CONFIG_S302M_ENCODER)           += s302menc.o
 OBJS-$(CONFIG_SANM_DECODER)            += sanm.o
+OBJS-$(CONFIG_SCPR_DECODER)            += scpr.o
 OBJS-$(CONFIG_SCREENPRESSO_DECODER)    += screenpresso.o
+OBJS-$(CONFIG_SDX2_DPCM_DECODER)       += dpcm.o
 OBJS-$(CONFIG_SGI_DECODER)             += sgidec.o
 OBJS-$(CONFIG_SGI_ENCODER)             += sgienc.o rle.o
 OBJS-$(CONFIG_SGIRLE_DECODER)          += sgirledec.o
+OBJS-$(CONFIG_SHEERVIDEO_DECODER)      += sheervideo.o
 OBJS-$(CONFIG_SHORTEN_DECODER)         += shorten.o
 OBJS-$(CONFIG_SIPR_DECODER)            += sipr.o acelp_pitch_delay.o \
                                           celp_math.o acelp_vectors.o \
@@ -441,38 +578,66 @@ OBJS-$(CONFIG_SIPR_DECODER)            += sipr.o acelp_pitch_delay.o \
 OBJS-$(CONFIG_SMACKAUD_DECODER)        += smacker.o
 OBJS-$(CONFIG_SMACKER_DECODER)         += smacker.o
 OBJS-$(CONFIG_SMC_DECODER)             += smc.o
+OBJS-$(CONFIG_SMVJPEG_DECODER)         += smvjpegdec.o
+OBJS-$(CONFIG_SNOW_DECODER)            += snowdec.o snow.o snow_dwt.o
+OBJS-$(CONFIG_SNOW_ENCODER)            += snowenc.o snow.o snow_dwt.o             \
+                                          h263.o ituh263enc.o
 OBJS-$(CONFIG_SOL_DPCM_DECODER)        += dpcm.o
+OBJS-$(CONFIG_SONIC_DECODER)           += sonic.o
+OBJS-$(CONFIG_SONIC_ENCODER)           += sonic.o
+OBJS-$(CONFIG_SONIC_LS_ENCODER)        += sonic.o
+OBJS-$(CONFIG_SPEEDHQ_DECODER)         += speedhq.o mpeg12.o mpeg12data.o simple_idct.o
 OBJS-$(CONFIG_SP5X_DECODER)            += sp5xdec.o
-OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o
+OBJS-$(CONFIG_SRGC_DECODER)            += mscc.o
+OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o htmlsubtitles.o
+OBJS-$(CONFIG_SRT_ENCODER)             += srtenc.o ass_split.o
+OBJS-$(CONFIG_STL_DECODER)             += textdec.o ass.o
+OBJS-$(CONFIG_SUBRIP_DECODER)          += srtdec.o ass.o htmlsubtitles.o
+OBJS-$(CONFIG_SUBRIP_ENCODER)          += srtenc.o ass_split.o
+OBJS-$(CONFIG_SUBVIEWER1_DECODER)      += textdec.o ass.o
+OBJS-$(CONFIG_SUBVIEWER_DECODER)       += subviewerdec.o ass.o
 OBJS-$(CONFIG_SUNRAST_DECODER)         += sunrast.o
 OBJS-$(CONFIG_SUNRAST_ENCODER)         += sunrastenc.o
+OBJS-$(CONFIG_LIBRSVG_DECODER)         += librsvgdec.o
+OBJS-$(CONFIG_SBC_DECODER)             += sbcdec.o sbcdec_data.o sbc.o
+OBJS-$(CONFIG_SBC_ENCODER)             += sbcenc.o sbc.o sbcdsp.o sbcdsp_data.o
 OBJS-$(CONFIG_SVQ1_DECODER)            += svq1dec.o svq1.o svq13.o h263data.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += svq1enc.o svq1.o  h263data.o  \
                                           h263.o ituh263enc.o
 OBJS-$(CONFIG_SVQ3_DECODER)            += svq3.o svq13.o mpegutils.o h264data.o
-OBJS-$(CONFIG_TAK_DECODER)             += takdec.o tak.o
+OBJS-$(CONFIG_TEXT_DECODER)            += textdec.o ass.o
+OBJS-$(CONFIG_TEXT_ENCODER)            += srtenc.o ass_split.o
+OBJS-$(CONFIG_TAK_DECODER)             += takdec.o tak.o takdsp.o
 OBJS-$(CONFIG_TARGA_DECODER)           += targa.o
 OBJS-$(CONFIG_TARGA_ENCODER)           += targaenc.o rle.o
+OBJS-$(CONFIG_TARGA_Y216_DECODER)      += targa_y216dec.o
 OBJS-$(CONFIG_TDSC_DECODER)            += tdsc.o
 OBJS-$(CONFIG_TIERTEXSEQVIDEO_DECODER) += tiertexseqv.o
-OBJS-$(CONFIG_TIFF_DECODER)            += tiff.o lzw.o faxcompr.o
-OBJS-$(CONFIG_TIFF_ENCODER)            += tiffenc.o rle.o lzwenc.o
+OBJS-$(CONFIG_TIFF_DECODER)            += tiff.o lzw.o faxcompr.o tiff_data.o tiff_common.o
+OBJS-$(CONFIG_TIFF_ENCODER)            += tiffenc.o rle.o lzwenc.o tiff_data.o
 OBJS-$(CONFIG_TMV_DECODER)             += tmv.o cga_data.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += mlpdec.o mlpdsp.o
+OBJS-$(CONFIG_TRUEHD_ENCODER)          += mlpenc.o mlp.o
 OBJS-$(CONFIG_TRUEMOTION1_DECODER)     += truemotion1.o
 OBJS-$(CONFIG_TRUEMOTION2_DECODER)     += truemotion2.o
 OBJS-$(CONFIG_TRUEMOTION2RT_DECODER)   += truemotion2rt.o
 OBJS-$(CONFIG_TRUESPEECH_DECODER)      += truespeech.o
 OBJS-$(CONFIG_TSCC_DECODER)            += tscc.o msrledec.o
 OBJS-$(CONFIG_TSCC2_DECODER)           += tscc2.o
-OBJS-$(CONFIG_TTA_DECODER)             += tta.o
+OBJS-$(CONFIG_TTA_DECODER)             += tta.o ttadata.o ttadsp.o
+OBJS-$(CONFIG_TTA_ENCODER)             += ttaenc.o ttaencdsp.o ttadata.o
 OBJS-$(CONFIG_TWINVQ_DECODER)          += twinvqdec.o twinvq.o
 OBJS-$(CONFIG_TXD_DECODER)             += txd.o
 OBJS-$(CONFIG_ULTI_DECODER)            += ulti.o
-OBJS-$(CONFIG_UTVIDEO_DECODER)         += utvideodec.o utvideo.o
+OBJS-$(CONFIG_UTVIDEO_DECODER)         += utvideodec.o utvideo.o utvideodsp.o
 OBJS-$(CONFIG_UTVIDEO_ENCODER)         += utvideoenc.o utvideo.o
 OBJS-$(CONFIG_V210_DECODER)            += v210dec.o
 OBJS-$(CONFIG_V210_ENCODER)            += v210enc.o
 OBJS-$(CONFIG_V210X_DECODER)           += v210x.o
+OBJS-$(CONFIG_V308_DECODER)            += v308dec.o
+OBJS-$(CONFIG_V308_ENCODER)            += v308enc.o
+OBJS-$(CONFIG_V408_DECODER)            += v408dec.o
+OBJS-$(CONFIG_V408_ENCODER)            += v408enc.o
 OBJS-$(CONFIG_V410_DECODER)            += v410dec.o
 OBJS-$(CONFIG_V410_ENCODER)            += v410enc.o
 OBJS-$(CONFIG_VB_DECODER)              += vb.o
@@ -480,9 +645,12 @@ OBJS-$(CONFIG_VBLE_DECODER)            += vble.o
 OBJS-$(CONFIG_VC1_DECODER)             += vc1dec.o vc1_block.o vc1_loopfilter.o \
                                           vc1_mc.o vc1_pred.o vc1.o vc1data.o \
                                           msmpeg4dec.o msmpeg4.o msmpeg4data.o \
-                                          wmv2data.o
+                                          wmv2dsp.o wmv2data.o
+OBJS-$(CONFIG_VC1_CUVID_DECODER)       += cuviddec.o
 OBJS-$(CONFIG_VC1_MMAL_DECODER)        += mmaldec.o
 OBJS-$(CONFIG_VC1_QSV_DECODER)         += qsvdec_other.o
+OBJS-$(CONFIG_VC1_V4L2M2M_DECODER)     += v4l2_m2m_dec.o
+OBJS-$(CONFIG_VC2_ENCODER)             += vc2enc.o vc2enc_dwt.o diractab.o
 OBJS-$(CONFIG_VCR1_DECODER)            += vcr1.o
 OBJS-$(CONFIG_VMDAUDIO_DECODER)        += vmdaudio.o
 OBJS-$(CONFIG_VMDVIDEO_DECODER)        += vmdvideo.o
@@ -497,14 +665,29 @@ OBJS-$(CONFIG_VP6_DECODER)             += vp6.o vp56.o vp56data.o \
                                           vp6dsp.o vp56rac.o
 OBJS-$(CONFIG_VP7_DECODER)             += vp8.o vp56rac.o
 OBJS-$(CONFIG_VP8_DECODER)             += vp8.o vp56rac.o
+OBJS-$(CONFIG_VP8_CUVID_DECODER)       += cuviddec.o
+OBJS-$(CONFIG_VP8_MEDIACODEC_DECODER)  += mediacodecdec.o
 OBJS-$(CONFIG_VP8_QSV_DECODER)         += qsvdec_other.o
+OBJS-$(CONFIG_VP8_RKMPP_DECODER)       += rkmppdec.o
 OBJS-$(CONFIG_VP8_VAAPI_ENCODER)       += vaapi_encode_vp8.o
-OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o \
-                                          vp9block.o vp9prob.o vp9mvs.o vp56rac.o
+OBJS-$(CONFIG_VP8_V4L2M2M_DECODER)     += v4l2_m2m_dec.o
+OBJS-$(CONFIG_VP8_V4L2M2M_ENCODER)     += v4l2_m2m_enc.o
+OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o vp9lpf.o vp9recon.o \
+                                          vp9block.o vp9prob.o vp9mvs.o vp56rac.o \
+                                          vp9dsp_8bpp.o vp9dsp_10bpp.o vp9dsp_12bpp.o
+OBJS-$(CONFIG_VP9_CUVID_DECODER)       += cuviddec.o
+OBJS-$(CONFIG_VP9_MEDIACODEC_DECODER)  += mediacodecdec.o
+OBJS-$(CONFIG_VP9_RKMPP_DECODER)       += rkmppdec.o
 OBJS-$(CONFIG_VP9_VAAPI_ENCODER)       += vaapi_encode_vp9.o
+OBJS-$(CONFIG_VPLAYER_DECODER)         += textdec.o ass.o
+OBJS-$(CONFIG_VP9_V4L2M2M_DECODER)     += v4l2_m2m_dec.o
 OBJS-$(CONFIG_VQA_DECODER)             += vqavideo.o
 OBJS-$(CONFIG_WAVPACK_DECODER)         += wavpack.o
+OBJS-$(CONFIG_WAVPACK_ENCODER)         += wavpackenc.o
+OBJS-$(CONFIG_WCMV_DECODER)            += wcmv.o
 OBJS-$(CONFIG_WEBP_DECODER)            += webp.o
+OBJS-$(CONFIG_WEBVTT_DECODER)          += webvttdec.o ass.o
+OBJS-$(CONFIG_WEBVTT_ENCODER)          += webvttenc.o ass_split.o
 OBJS-$(CONFIG_WMALOSSLESS_DECODER)     += wmalosslessdec.o wma_common.o
 OBJS-$(CONFIG_WMAPRO_DECODER)          += wmaprodec.o wma.o wma_common.o
 OBJS-$(CONFIG_WMAV1_DECODER)           += wmadec.o wma.o wma_common.o aactab.o
@@ -515,24 +698,37 @@ OBJS-$(CONFIG_WMAVOICE_DECODER)        += wmavoice.o \
                                           celp_filters.o \
                                           acelp_vectors.o acelp_filters.o
 OBJS-$(CONFIG_WMV1_DECODER)            += msmpeg4dec.o msmpeg4.o msmpeg4data.o
+OBJS-$(CONFIG_WMV1_ENCODER)            += msmpeg4enc.o
 OBJS-$(CONFIG_WMV2_DECODER)            += wmv2dec.o wmv2.o wmv2data.o \
                                           msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_WMV2_ENCODER)            += wmv2enc.o wmv2.o wmv2data.o \
                                           msmpeg4.o msmpeg4enc.o msmpeg4data.o
 OBJS-$(CONFIG_WNV1_DECODER)            += wnv1.o
+OBJS-$(CONFIG_WRAPPED_AVFRAME_DECODER) += wrapped_avframe.o
 OBJS-$(CONFIG_WRAPPED_AVFRAME_ENCODER) += wrapped_avframe.o
 OBJS-$(CONFIG_WS_SND1_DECODER)         += ws-snd1.o
 OBJS-$(CONFIG_XAN_DPCM_DECODER)        += dpcm.o
 OBJS-$(CONFIG_XAN_WC3_DECODER)         += xan.o
 OBJS-$(CONFIG_XAN_WC4_DECODER)         += xxan.o
+OBJS-$(CONFIG_XBIN_DECODER)            += bintext.o cga_data.o
 OBJS-$(CONFIG_XBM_DECODER)             += xbmdec.o
 OBJS-$(CONFIG_XBM_ENCODER)             += xbmenc.o
+OBJS-$(CONFIG_XFACE_DECODER)           += xfacedec.o xface.o
+OBJS-$(CONFIG_XFACE_ENCODER)           += xfaceenc.o xface.o
 OBJS-$(CONFIG_XL_DECODER)              += xl.o
+OBJS-$(CONFIG_XMA1_DECODER)            += wmaprodec.o wma.o wma_common.o
+OBJS-$(CONFIG_XMA2_DECODER)            += wmaprodec.o wma.o wma_common.o
+OBJS-$(CONFIG_XPM_DECODER)             += xpmdec.o
 OBJS-$(CONFIG_XSUB_DECODER)            += xsubdec.o
 OBJS-$(CONFIG_XSUB_ENCODER)            += xsubenc.o
 OBJS-$(CONFIG_XWD_DECODER)             += xwddec.o
 OBJS-$(CONFIG_XWD_ENCODER)             += xwdenc.o
+OBJS-$(CONFIG_Y41P_DECODER)            += y41pdec.o
+OBJS-$(CONFIG_Y41P_ENCODER)            += y41penc.o
+OBJS-$(CONFIG_YLC_DECODER)             += ylc.o
 OBJS-$(CONFIG_YOP_DECODER)             += yop.o
+OBJS-$(CONFIG_YUV4_DECODER)            += yuv4dec.o
+OBJS-$(CONFIG_YUV4_ENCODER)            += yuv4enc.o
 OBJS-$(CONFIG_ZEROCODEC_DECODER)       += zerocodec.o
 OBJS-$(CONFIG_ZLIB_DECODER)            += lcldec.o
 OBJS-$(CONFIG_ZLIB_ENCODER)            += lclenc.o
@@ -544,6 +740,9 @@ OBJS-$(CONFIG_PCM_ALAW_DECODER)           += pcm.o
 OBJS-$(CONFIG_PCM_ALAW_ENCODER)           += pcm.o
 OBJS-$(CONFIG_PCM_BLURAY_DECODER)         += pcm-bluray.o
 OBJS-$(CONFIG_PCM_DVD_DECODER)            += pcm-dvd.o
+OBJS-$(CONFIG_PCM_DVD_ENCODER)            += pcm-dvdenc.o
+OBJS-$(CONFIG_PCM_F16LE_DECODER)          += pcm.o
+OBJS-$(CONFIG_PCM_F24LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_F32BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_F32BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_F32LE_DECODER)          += pcm.o
@@ -557,13 +756,16 @@ OBJS-$(CONFIG_PCM_MULAW_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_MULAW_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S8_DECODER)             += pcm.o
 OBJS-$(CONFIG_PCM_S8_ENCODER)             += pcm.o
-OBJS-$(CONFIG_PCM_S8_PLANAR_DECODER)      += 8svx.o
+OBJS-$(CONFIG_PCM_S8_PLANAR_DECODER)      += pcm.o
+OBJS-$(CONFIG_PCM_S8_PLANAR_ENCODER)      += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S16BE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S16LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S24BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24DAUD_DECODER)        += pcm.o
@@ -571,11 +773,17 @@ OBJS-$(CONFIG_PCM_S24DAUD_ENCODER)        += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S24LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S32BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S32LE_PLANAR_ENCODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S64BE_DECODER)          += pcm.o
+OBJS-$(CONFIG_PCM_S64BE_ENCODER)          += pcm.o
+OBJS-$(CONFIG_PCM_S64LE_DECODER)          += pcm.o
+OBJS-$(CONFIG_PCM_S64LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_U8_DECODER)             += pcm.o
 OBJS-$(CONFIG_PCM_U8_ENCODER)             += pcm.o
 OBJS-$(CONFIG_PCM_U16BE_DECODER)          += pcm.o
@@ -590,12 +798,17 @@ OBJS-$(CONFIG_PCM_U32BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_U32BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_U32LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_U32LE_ENCODER)          += pcm.o
+OBJS-$(CONFIG_PCM_VIDC_DECODER)           += pcm.o
+OBJS-$(CONFIG_PCM_VIDC_ENCODER)           += pcm.o
 OBJS-$(CONFIG_PCM_ZORK_DECODER)           += pcm.o
 
 OBJS-$(CONFIG_ADPCM_4XM_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_ADX_DECODER)          += adxdec.o adx.o
 OBJS-$(CONFIG_ADPCM_ADX_ENCODER)          += adxenc.o adx.o
+OBJS-$(CONFIG_ADPCM_AFC_DECODER)          += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_AICA_DECODER)         += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_CT_DECODER)           += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_DTK_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_DECODER)           += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_MAXIS_XA_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_R1_DECODER)        += adpcm.o adpcm_data.o
@@ -606,68 +819,93 @@ OBJS-$(CONFIG_ADPCM_G722_DECODER)         += g722.o g722dsp.o g722dec.o
 OBJS-$(CONFIG_ADPCM_G722_ENCODER)         += g722.o g722dsp.o g722enc.o
 OBJS-$(CONFIG_ADPCM_G726_DECODER)         += g726.o
 OBJS-$(CONFIG_ADPCM_G726_ENCODER)         += g726.o
+OBJS-$(CONFIG_ADPCM_G726LE_DECODER)       += g726.o
+OBJS-$(CONFIG_ADPCM_G726LE_ENCODER)       += g726.o
 OBJS-$(CONFIG_ADPCM_IMA_AMV_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_APC_DECODER)      += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_DAT4_DECODER)     += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_DK3_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_DK4_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_EA_EACS_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_EA_SEAD_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_ISS_DECODER)      += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_OKI_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_QT_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_QT_ENCODER)       += adpcmenc.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_RAD_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_SMJPEG_DECODER)   += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WAV_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WAV_ENCODER)      += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WS_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_MS_DECODER)           += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_MS_ENCODER)           += adpcmenc.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_MTAF_DECODER)         += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_PSX_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SBPRO_2_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SBPRO_3_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SBPRO_4_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SWF_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SWF_ENCODER)          += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_THP_DECODER)          += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_THP_LE_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_VIMA_DECODER)         += vima.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_XA_DECODER)           += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_YAMAHA_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_YAMAHA_ENCODER)       += adpcmenc.o adpcm_data.o
 
 # hardware accelerators
-OBJS-$(CONFIG_CUVID)                      += cuvid.o
 OBJS-$(CONFIG_D3D11VA)                    += dxva2.o
 OBJS-$(CONFIG_DXVA2)                      += dxva2.o
+OBJS-$(CONFIG_NVDEC)                      += nvdec.o
 OBJS-$(CONFIG_VAAPI)                      += vaapi_decode.o
-OBJS-$(CONFIG_VDA)                        += vda.o
+OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
 OBJS-$(CONFIG_VDPAU)                      += vdpau.o
 
 OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
-OBJS-$(CONFIG_H264_CUVID_HWACCEL)         += cuvid_h264.o
+OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_H264_D3D11VA_HWACCEL)       += dxva2_h264.o
 OBJS-$(CONFIG_H264_DXVA2_HWACCEL)         += dxva2_h264.o
+OBJS-$(CONFIG_H264_NVDEC_HWACCEL)         += nvdec_h264.o
 OBJS-$(CONFIG_H264_QSV_HWACCEL)           += qsvdec_h2645.o
 OBJS-$(CONFIG_H264_VAAPI_HWACCEL)         += vaapi_h264.o
-OBJS-$(CONFIG_H264_VDA_HWACCEL)           += vda_h264.o
 OBJS-$(CONFIG_H264_VDPAU_HWACCEL)         += vdpau_h264.o
-OBJS-$(CONFIG_HEVC_CUVID_HWACCEL)         += cuvid_hevc.o
+OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)       += dxva2_hevc.o
 OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
+OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
 OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec_h2645.o
 OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o
 OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o
+OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
+OBJS-$(CONFIG_MJPEG_VAAPI_HWACCEL)        += vaapi_mjpeg.o
+OBJS-$(CONFIG_MPEG1_NVDEC_HWACCEL)        += nvdec_mpeg12.o
 OBJS-$(CONFIG_MPEG1_VDPAU_HWACCEL)        += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
+OBJS-$(CONFIG_MPEG1_XVMC_HWACCEL)         += mpegvideo_xvmc.o
 OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL)      += dxva2_mpeg2.o
 OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL)        += dxva2_mpeg2.o
+OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL)        += nvdec_mpeg12.o
 OBJS-$(CONFIG_MPEG2_QSV_HWACCEL)          += qsvdec_other.o
 OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL)        += vaapi_mpeg2.o
 OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL)        += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
+OBJS-$(CONFIG_MPEG2_XVMC_HWACCEL)         += mpegvideo_xvmc.o
+OBJS-$(CONFIG_MPEG4_NVDEC_HWACCEL)        += nvdec_mpeg4.o
 OBJS-$(CONFIG_MPEG4_VAAPI_HWACCEL)        += vaapi_mpeg4.o
 OBJS-$(CONFIG_MPEG4_VDPAU_HWACCEL)        += vdpau_mpeg4.o
+OBJS-$(CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
 OBJS-$(CONFIG_VC1_D3D11VA_HWACCEL)        += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_DXVA2_HWACCEL)          += dxva2_vc1.o
+OBJS-$(CONFIG_VC1_NVDEC_HWACCEL)          += nvdec_vc1.o
 OBJS-$(CONFIG_VC1_QSV_HWACCEL)            += qsvdec_other.o
 OBJS-$(CONFIG_VC1_VAAPI_HWACCEL)          += vaapi_vc1.o
 OBJS-$(CONFIG_VC1_VDPAU_HWACCEL)          += vdpau_vc1.o
+OBJS-$(CONFIG_VP8_NVDEC_HWACCEL)          += nvdec_vp8.o
 OBJS-$(CONFIG_VP8_VAAPI_HWACCEL)          += vaapi_vp8.o
+OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL)        += dxva2_vp9.o
+OBJS-$(CONFIG_VP9_DXVA2_HWACCEL)          += dxva2_vp9.o
+OBJS-$(CONFIG_VP9_NVDEC_HWACCEL)          += nvdec_vp9.o
+OBJS-$(CONFIG_VP9_VAAPI_HWACCEL)          += vaapi_vp9.o
 OBJS-$(CONFIG_VP8_QSV_HWACCEL)            += qsvdec_other.o
 
 # libavformat dependencies
@@ -675,23 +913,56 @@ OBJS-$(CONFIG_ISO_MEDIA)               += mpeg4audio.o mpegaudiodata.o
 
 OBJS-$(CONFIG_ADTS_MUXER)              += mpeg4audio.o
 OBJS-$(CONFIG_CAF_DEMUXER)             += ac3tab.o
+OBJS-$(CONFIG_CODEC2_DEMUXER)          += codec2utils.o
+OBJS-$(CONFIG_CODEC2_MUXER)            += codec2utils.o
+OBJS-$(CONFIG_CODEC2RAW_DEMUXER)       += codec2utils.o
+OBJS-$(CONFIG_DNXHD_DEMUXER)           += dnxhddata.o
+OBJS-$(CONFIG_FITS_DEMUXER)            += fits.o
 OBJS-$(CONFIG_FLV_DEMUXER)             += mpeg4audio.o
 OBJS-$(CONFIG_LATM_MUXER)              += mpeg4audio.o
 OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER)    += mpeg4audio.o
 OBJS-$(CONFIG_MATROSKA_MUXER)          += mpeg4audio.o
 OBJS-$(CONFIG_MOV_DEMUXER)             += ac3tab.o
+OBJS-$(CONFIG_MXF_MUXER)               += dnxhddata.o
 OBJS-$(CONFIG_NUT_MUXER)               += mpegaudiodata.o
+OBJS-$(CONFIG_NUT_DEMUXER)             += mpegaudiodata.o mpeg4audio.o
 OBJS-$(CONFIG_RTP_MUXER)               += mpeg4audio.o
 OBJS-$(CONFIG_SPDIF_MUXER)             += dca.o
 OBJS-$(CONFIG_TAK_DEMUXER)             += tak.o
 OBJS-$(CONFIG_WEBM_MUXER)              += mpeg4audio.o
 
+# libavfilter dependencies
+OBJS-$(CONFIG_ELBG_FILTER)             += elbg.o
+
 # external codec libraries
-OBJS-$(CONFIG_LIBAOM_AV1_DECODER)         += libaomdec.o libaom.o
-OBJS-$(CONFIG_LIBAOM_AV1_ENCODER)         += libaomenc.o libaom.o
+OBJS-$(CONFIG_AAC_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_AC3_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_ADPCM_IMA_QT_AT_DECODER)    += audiotoolboxdec.o
+OBJS-$(CONFIG_ALAC_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_AMR_NB_AT_DECODER)          += audiotoolboxdec.o
+OBJS-$(CONFIG_EAC3_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_GSM_MS_AT_DECODER)          += audiotoolboxdec.o
+OBJS-$(CONFIG_ILBC_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_MP1_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_MP2_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_MP3_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_PCM_MULAW_AT_DECODER)       += audiotoolboxdec.o
+OBJS-$(CONFIG_PCM_ALAW_AT_DECODER)        += audiotoolboxdec.o
+OBJS-$(CONFIG_QDMC_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_QDM2_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_AAC_AT_ENCODER)             += audiotoolboxenc.o
+OBJS-$(CONFIG_ALAC_AT_ENCODER)            += audiotoolboxenc.o
+OBJS-$(CONFIG_ILBC_AT_ENCODER)            += audiotoolboxenc.o
+OBJS-$(CONFIG_PCM_ALAW_AT_ENCODER)        += audiotoolboxenc.o
+OBJS-$(CONFIG_PCM_MULAW_AT_ENCODER)       += audiotoolboxenc.o
+OBJS-$(CONFIG_LIBAOM_AV1_DECODER)         += libaomdec.o
+OBJS-$(CONFIG_LIBAOM_AV1_ENCODER)         += libaomenc.o
+OBJS-$(CONFIG_LIBARIBB24_DECODER)         += libaribb24.o ass.o
+OBJS-$(CONFIG_LIBCELT_DECODER)            += libcelt_dec.o
+OBJS-$(CONFIG_LIBCODEC2_DECODER)          += libcodec2.o codec2utils.o
+OBJS-$(CONFIG_LIBCODEC2_ENCODER)          += libcodec2.o codec2utils.o
 OBJS-$(CONFIG_LIBDAV1D_DECODER)           += libdav1d.o
-OBJS-$(CONFIG_LIBDCADEC_DECODER)          += libdcadec.o dca.o
-OBJS-$(CONFIG_LIBFAAC_ENCODER)            += libfaac.o
+OBJS-$(CONFIG_LIBDAVS2_DECODER)           += libdavs2.o
 OBJS-$(CONFIG_LIBFDK_AAC_DECODER)         += libfdk-aacdec.o
 OBJS-$(CONFIG_LIBFDK_AAC_ENCODER)         += libfdk-aacenc.o
 OBJS-$(CONFIG_LIBGSM_DECODER)             += libgsmdec.o
@@ -713,29 +984,29 @@ OBJS-$(CONFIG_LIBOPUS_DECODER)            += libopusdec.o libopus.o     \
                                              vorbis_data.o
 OBJS-$(CONFIG_LIBOPUS_ENCODER)            += libopusenc.o libopus.o     \
                                              vorbis_data.o
-OBJS-$(CONFIG_LIBSCHROEDINGER_DECODER)    += libschroedingerdec.o \
-                                             libschroedinger.o
-OBJS-$(CONFIG_LIBSCHROEDINGER_ENCODER)    += libschroedingerenc.o \
-                                             libschroedinger.o
+OBJS-$(CONFIG_LIBSHINE_ENCODER)           += libshine.o
 OBJS-$(CONFIG_LIBSPEEX_DECODER)           += libspeexdec.o
 OBJS-$(CONFIG_LIBSPEEX_ENCODER)           += libspeexenc.o
 OBJS-$(CONFIG_LIBTHEORA_ENCODER)          += libtheoraenc.o
 OBJS-$(CONFIG_LIBTWOLAME_ENCODER)         += libtwolame.o
-OBJS-$(CONFIG_LIBVO_AACENC_ENCODER)       += libvo-aacenc.o mpeg4audio.o
 OBJS-$(CONFIG_LIBVO_AMRWBENC_ENCODER)     += libvo-amrwbenc.o
-OBJS-$(CONFIG_LIBVORBIS_ENCODER)          += libvorbis.o \
+OBJS-$(CONFIG_LIBVORBIS_DECODER)          += libvorbisdec.o
+OBJS-$(CONFIG_LIBVORBIS_ENCODER)          += libvorbisenc.o \
                                              vorbis_data.o
-OBJS-$(CONFIG_LIBVPX_VP8_DECODER)         += libvpxdec.o libvpx.o
-OBJS-$(CONFIG_LIBVPX_VP8_ENCODER)         += libvpxenc.o libvpx.o
+OBJS-$(CONFIG_LIBVPX_VP8_DECODER)         += libvpxdec.o
+OBJS-$(CONFIG_LIBVPX_VP8_ENCODER)         += libvpxenc.o
 OBJS-$(CONFIG_LIBVPX_VP9_DECODER)         += libvpxdec.o libvpx.o
 OBJS-$(CONFIG_LIBVPX_VP9_ENCODER)         += libvpxenc.o libvpx.o
 OBJS-$(CONFIG_LIBWAVPACK_ENCODER)         += libwavpackenc.o
-OBJS-$(CONFIG_LIBWEBP_ENCODER)            += libwebpenc.o
+OBJS-$(CONFIG_LIBWEBP_ENCODER)            += libwebpenc_common.o libwebpenc.o
+OBJS-$(CONFIG_LIBWEBP_ANIM_ENCODER)       += libwebpenc_common.o libwebpenc_animencoder.o
 OBJS-$(CONFIG_LIBX262_ENCODER)            += libx264.o
 OBJS-$(CONFIG_LIBX264_ENCODER)            += libx264.o
 OBJS-$(CONFIG_LIBX265_ENCODER)            += libx265.o
 OBJS-$(CONFIG_LIBXAVS_ENCODER)            += libxavs.o
+OBJS-$(CONFIG_LIBXAVS2_ENCODER)           += libxavs2.o
 OBJS-$(CONFIG_LIBXVID_ENCODER)            += libxvid.o
+OBJS-$(CONFIG_LIBZVBI_TELETEXT_DECODER)   += libzvbi-teletextdec.o ass.o
 
 # parsers
 OBJS-$(CONFIG_AAC_LATM_PARSER)         += latm_parser.o
@@ -743,16 +1014,24 @@ OBJS-$(CONFIG_AAC_PARSER)              += aac_parser.o aac_ac3_parser.o \
                                           mpeg4audio.o
 OBJS-$(CONFIG_AC3_PARSER)              += ac3tab.o aac_ac3_parser.o
 OBJS-$(CONFIG_ADX_PARSER)              += adx_parser.o adx.o
+OBJS-$(CONFIG_AV1_PARSER)              += av1_parser.o av1_parse.o
+OBJS-$(CONFIG_AVS2_PARSER)             += avs2_parser.o
 OBJS-$(CONFIG_BMP_PARSER)              += bmp_parser.o
 OBJS-$(CONFIG_CAVSVIDEO_PARSER)        += cavs_parser.o
 OBJS-$(CONFIG_COOK_PARSER)             += cook_parser.o
-OBJS-$(CONFIG_DCA_PARSER)              += dca_parser.o dca.o
+OBJS-$(CONFIG_DCA_PARSER)              += dca_parser.o dca_exss.o dca.o
 OBJS-$(CONFIG_DIRAC_PARSER)            += dirac_parser.o
-OBJS-$(CONFIG_DNXHD_PARSER)            += dnxhd_parser.o
+OBJS-$(CONFIG_DNXHD_PARSER)            += dnxhd_parser.o dnxhddata.o
 OBJS-$(CONFIG_DPX_PARSER)              += dpx_parser.o
+OBJS-$(CONFIG_DVAUDIO_PARSER)          += dvaudio_parser.o
 OBJS-$(CONFIG_DVBSUB_PARSER)           += dvbsub_parser.o
+OBJS-$(CONFIG_DVD_NAV_PARSER)          += dvd_nav_parser.o
 OBJS-$(CONFIG_DVDSUB_PARSER)           += dvdsub_parser.o
-OBJS-$(CONFIG_FLAC_PARSER)             += flac_parser.o flacdata.o flac.o
+OBJS-$(CONFIG_FLAC_PARSER)             += flac_parser.o flacdata.o flac.o \
+                                          vorbis_data.o
+OBJS-$(CONFIG_G723_1_PARSER)           += g723_1_parser.o
+OBJS-$(CONFIG_G729_PARSER)             += g729_parser.o
+OBJS-$(CONFIG_GIF_PARSER)              += gif_parser.o
 OBJS-$(CONFIG_GSM_PARSER)              += gsm_parser.o
 OBJS-$(CONFIG_H261_PARSER)             += h261_parser.o
 OBJS-$(CONFIG_H263_PARSER)             += h263_parser.o
@@ -763,41 +1042,58 @@ OBJS-$(CONFIG_MLP_PARSER)              += mlp_parser.o mlp.o
 OBJS-$(CONFIG_MPEG4VIDEO_PARSER)       += mpeg4video_parser.o h263.o \
                                           mpeg4videodec.o mpeg4video.o \
                                           ituh263dec.o h263dec.o h263data.o
+OBJS-$(CONFIG_PNG_PARSER)              += png_parser.o
 OBJS-$(CONFIG_MPEGAUDIO_PARSER)        += mpegaudio_parser.o
 OBJS-$(CONFIG_MPEGVIDEO_PARSER)        += mpegvideo_parser.o    \
                                           mpeg12.o mpeg12data.o
-OBJS-$(CONFIG_OPUS_PARSER)             += opus_parser.o opus.o vorbis_data.o
+OBJS-$(CONFIG_OPUS_PARSER)             += opus_parser.o opus.o opustab.o \
+                                          opus_rc.o vorbis_data.o
 OBJS-$(CONFIG_PNG_PARSER)              += png_parser.o
 OBJS-$(CONFIG_PNM_PARSER)              += pnm_parser.o pnm.o
 OBJS-$(CONFIG_RV30_PARSER)             += rv34_parser.o
 OBJS-$(CONFIG_RV40_PARSER)             += rv34_parser.o
+OBJS-$(CONFIG_SBC_PARSER)              += sbc_parser.o
+OBJS-$(CONFIG_SIPR_PARSER)             += sipr_parser.o
 OBJS-$(CONFIG_TAK_PARSER)              += tak_parser.o tak.o
 OBJS-$(CONFIG_VC1_PARSER)              += vc1_parser.o vc1.o vc1data.o  \
                                           simple_idct.o wmv2data.o
 OBJS-$(CONFIG_VP3_PARSER)              += vp3_parser.o
 OBJS-$(CONFIG_VP8_PARSER)              += vp8_parser.o
+OBJS-$(CONFIG_VP9_PARSER)              += vp9_parser.o
+OBJS-$(CONFIG_XMA_PARSER)              += xma_parser.o
 
 # bitstream filters
 OBJS-$(CONFIG_AAC_ADTSTOASC_BSF)          += aac_adtstoasc_bsf.o mpeg4audio.o
+OBJS-$(CONFIG_AV1_METADATA_BSF)           += av1_metadata_bsf.o
 OBJS-$(CONFIG_CHOMP_BSF)                  += chomp_bsf.o
 OBJS-$(CONFIG_DUMP_EXTRADATA_BSF)         += dump_extradata_bsf.o
+OBJS-$(CONFIG_DCA_CORE_BSF)               += dca_core_bsf.o
+OBJS-$(CONFIG_EAC3_CORE_BSF)              += eac3_core_bsf.o
 OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF)      += extract_extradata_bsf.o    \
-                                             h2645_parse.o
-OBJS-$(CONFIG_H264_METADATA_BSF)          += h264_metadata_bsf.o
+                                             av1_parse.o h2645_parse.o
+OBJS-$(CONFIG_FILTER_UNITS_BSF)           += filter_units_bsf.o
+OBJS-$(CONFIG_H264_METADATA_BSF)          += h264_metadata_bsf.o h264_levels.o
 OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF)       += h264_mp4toannexb_bsf.o
 OBJS-$(CONFIG_H264_REDUNDANT_PPS_BSF)     += h264_redundant_pps_bsf.o
+OBJS-$(CONFIG_HAPQA_EXTRACT_BSF)          += hapqa_extract_bsf.o hap.o
 OBJS-$(CONFIG_HEVC_METADATA_BSF)          += h265_metadata_bsf.o
 OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF)       += hevc_mp4toannexb_bsf.o
 OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF)        += imx_dump_header_bsf.o
 OBJS-$(CONFIG_MJPEG2JPEG_BSF)             += mjpeg2jpeg_bsf.o
 OBJS-$(CONFIG_MJPEGA_DUMP_HEADER_BSF)     += mjpega_dump_header_bsf.o
+OBJS-$(CONFIG_MPEG4_UNPACK_BFRAMES_BSF)   += mpeg4_unpack_bframes_bsf.o
 OBJS-$(CONFIG_MOV2TEXTSUB_BSF)            += movsub_bsf.o
+OBJS-$(CONFIG_MP3_HEADER_DECOMPRESS_BSF)  += mp3_header_decompress_bsf.o \
+                                             mpegaudiodata.o
 OBJS-$(CONFIG_MPEG2_METADATA_BSF)         += mpeg2_metadata_bsf.o
 OBJS-$(CONFIG_NOISE_BSF)                  += noise_bsf.o
 OBJS-$(CONFIG_NULL_BSF)                   += null_bsf.o
+OBJS-$(CONFIG_PRORES_METADATA_BSF)        += prores_metadata_bsf.o
 OBJS-$(CONFIG_REMOVE_EXTRADATA_BSF)       += remove_extradata_bsf.o
 OBJS-$(CONFIG_TEXT2MOVSUB_BSF)            += movsub_bsf.o
 OBJS-$(CONFIG_TRACE_HEADERS_BSF)          += trace_headers_bsf.o
+OBJS-$(CONFIG_TRUEHD_CORE_BSF)            += truehd_core_bsf.o mlp_parser.o mlp.o
+OBJS-$(CONFIG_VP9_METADATA_BSF)           += vp9_metadata_bsf.o
 OBJS-$(CONFIG_VP9_RAW_REORDER_BSF)        += vp9_raw_reorder_bsf.o
 OBJS-$(CONFIG_VP9_SUPERFRAME_BSF)         += vp9_superframe_bsf.o
 OBJS-$(CONFIG_VP9_SUPERFRAME_SPLIT_BSF)   += vp9_superframe_split_bsf.o
@@ -806,40 +1102,71 @@ OBJS-$(CONFIG_VP9_SUPERFRAME_SPLIT_BSF)   += vp9_superframe_split_bsf.o
 OBJS-$(HAVE_LIBC_MSVCRT)               += file_open.o
 OBJS-$(HAVE_THREADS)                   += pthread.o pthread_slice.o pthread_frame.o
 
+OBJS-$(CONFIG_FRAME_THREAD_ENCODER)    += frame_thread_encoder.o
+
+# Windows resource file
+SLIBOBJS-$(HAVE_GNU_WINDRES)           += avcodecres.o
+
 SKIPHEADERS                            += %_tablegen.h                  \
                                           %_tables.h                    \
-                                          aac_tablegen_decl.h           \
                                           fft-internal.h                \
                                           tableprint.h                  \
+                                          tableprint_vlc.h              \
+                                          aaccoder_twoloop.h            \
+                                          aaccoder_trellis.h            \
+                                          aacenc_quantization.h         \
+                                          aacenc_quantization_misc.h    \
                                           $(ARCH)/vp56_arith.h          \
 
-SKIPHEADERS-$(CONFIG_CUVID)            += cuvid.h
 SKIPHEADERS-$(CONFIG_AMF)              += amfenc.h
 SKIPHEADERS-$(CONFIG_D3D11VA)          += d3d11va.h dxva2_internal.h
 SKIPHEADERS-$(CONFIG_DXVA2)            += dxva2.h dxva2_internal.h
-SKIPHEADERS-$(CONFIG_LIBAOM)           += libaom.h
-SKIPHEADERS-$(CONFIG_LIBSCHROEDINGER)  += libschroedinger.h
+SKIPHEADERS-$(CONFIG_JNI)              += ffjni.h
 SKIPHEADERS-$(CONFIG_LIBVPX)           += libvpx.h
+SKIPHEADERS-$(CONFIG_LIBWEBP_ENCODER)  += libwebpenc_common.h
+SKIPHEADERS-$(CONFIG_MEDIACODEC)       += mediacodecdec_common.h mediacodec_surface.h mediacodec_wrapper.h mediacodec_sw_buffer.h
+SKIPHEADERS-$(CONFIG_NVDEC)            += nvdec.h
 SKIPHEADERS-$(CONFIG_NVENC)            += nvenc.h
 SKIPHEADERS-$(CONFIG_QSV)              += qsv.h qsv_internal.h
 SKIPHEADERS-$(CONFIG_QSVDEC)           += qsvdec.h
 SKIPHEADERS-$(CONFIG_QSVENC)           += qsvenc.h
+SKIPHEADERS-$(CONFIG_XVMC)             += xvmc.h
 SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_decode.h vaapi_encode.h
-SKIPHEADERS-$(CONFIG_VDA)              += vda.h vda_internal.h
 SKIPHEADERS-$(CONFIG_VDPAU)            += vdpau.h vdpau_internal.h
+SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += videotoolbox.h vt_internal.h
+SKIPHEADERS-$(CONFIG_V4L2_M2M)         += v4l2_buffers.h v4l2_context.h v4l2_m2m.h
+
+TESTPROGS = avpacket                                                    \
+            celp_math                                                   \
+            codec_desc                                                  \
+            htmlsubtitles                                               \
+            imgconvert                                                  \
+            jpeg2000dwt                                                 \
+            mathops                                                    \
+            options                                                     \
+            mjpegenc_huffman                                            \
+            utils                                                       \
 
-TESTPROGS-$(CONFIG_FFT)                   += fft fft-fixed
+TESTPROGS-$(CONFIG_CABAC)                 += cabac
+TESTPROGS-$(CONFIG_DCT)                   += avfft
+TESTPROGS-$(CONFIG_FFT)                   += fft fft-fixed fft-fixed32
 TESTPROGS-$(CONFIG_GOLOMB)                += golomb
 TESTPROGS-$(CONFIG_IDCTDSP)               += dct
 TESTPROGS-$(CONFIG_IIRFILTER)             += iirfilter
+TESTPROGS-$(HAVE_MMX)                     += motion
 TESTPROGS-$(CONFIG_MPEGVIDEO)             += mpeg12framerate
+TESTPROGS-$(CONFIG_H264_METADATA_BSF)     += h264_levels
 TESTPROGS-$(CONFIG_RANGECODER)            += rangecoder
+TESTPROGS-$(CONFIG_SNOW_ENCODER)          += snowenc
 
 TESTOBJS = dctref.o
 
-HOSTPROGS = aac_tablegen                                                \
-            aacps_tablegen                                              \
+TOOLS = fourcc2pixfmt
+
+HOSTPROGS = aacps_tablegen                                              \
+            aacps_fixed_tablegen                                        \
             cbrt_tablegen                                               \
+            cbrt_fixed_tablegen                                         \
             cos_tablegen                                                \
             dv_tablegen                                                 \
             motionpixels_tablegen                                       \
@@ -847,6 +1174,7 @@ HOSTPROGS = aac_tablegen                                                \
             pcm_tablegen                                                \
             qdm2_tablegen                                               \
             sinewin_tablegen                                            \
+            sinewin_fixed_tablegen                                      \
 
 CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF)
 
@@ -865,8 +1193,9 @@ else
 $(SUBDIR)%_tablegen$(HOSTEXESUF): HOSTCFLAGS += -DCONFIG_SMALL=0
 endif
 
-GEN_HEADERS = cbrt_tables.h aacps_tables.h aac_tables.h dv_tables.h     \
-              sinewin_tables.h mpegaudio_tables.h motionpixels_tables.h \
+GEN_HEADERS = cbrt_tables.h cbrt_fixed_tables.h aacps_tables.h aacps_fixed_tables.h \
+              dv_tables.h     \
+              sinewin_tables.h sinewin_fixed_tables.h mpegaudio_tables.h motionpixels_tables.h \
               pcm_tables.h qdm2_tables.h
 GEN_HEADERS := $(addprefix $(SUBDIR), $(GEN_HEADERS))
 
@@ -874,9 +1203,11 @@ $(GEN_HEADERS): $(SUBDIR)%_tables.h: $(SUBDIR)%_tablegen$(HOSTEXESUF)
 	$(M)./$< > $@
 
 ifdef CONFIG_HARDCODED_TABLES
-$(SUBDIR)aacdec.o: $(SUBDIR)cbrt_tables.h
-$(SUBDIR)aacps.o: $(SUBDIR)aacps_tables.h
-$(SUBDIR)aactab.o: $(SUBDIR)aac_tables.h
+$(SUBDIR)cbrt_data.o: $(SUBDIR)cbrt_tables.h
+$(SUBDIR)cbrt_data_fixed.o: $(SUBDIR)cbrt_fixed_tables.h
+$(SUBDIR)aacps_float.o: $(SUBDIR)aacps_tables.h
+$(SUBDIR)aacps_fixed.o: $(SUBDIR)aacps_fixed_tables.h
+$(SUBDIR)aactab_fixed.o: $(SUBDIR)aac_fixed_tables.h
 $(SUBDIR)dvenc.o: $(SUBDIR)dv_tables.h
 $(SUBDIR)motionpixels.o: $(SUBDIR)motionpixels_tables.h
 $(SUBDIR)mpegaudiodec_fixed.o: $(SUBDIR)mpegaudio_tables.h
@@ -884,4 +1215,5 @@ $(SUBDIR)mpegaudiodec_float.o: $(SUBDIR)mpegaudio_tables.h
 $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h
 $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
 $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
 endif
diff --git a/libavcodec/a64colors.h b/libavcodec/a64colors.h
index d977426..a9cdb6f 100644
--- a/libavcodec/a64colors.h
+++ b/libavcodec/a64colors.h
@@ -2,20 +2,20 @@
  * a64 video encoder - c64 colors in rgb (Pepto)
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/a64enc.h b/libavcodec/a64enc.h
deleted file mode 100644
index 65c1d30..0000000
--- a/libavcodec/a64enc.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * a64 video encoder - basic headers
- * Copyright (c) 2009 Tobias Bindhammer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * a64 video encoder - basic headers
- */
-
-#ifndef AVCODEC_A64ENC_H
-#define AVCODEC_A64ENC_H
-
-#include "libavutil/lfg.h"
-#include "avcodec.h"
-
-#define C64XRES 320
-#define C64YRES 200
-
-typedef struct A64Context {
-    /* variables for multicolor modes */
-    AVLFG randctx;
-    int mc_lifetime;
-    int mc_use_5col;
-    unsigned mc_frame_counter;
-    int *mc_meta_charset;
-    int *mc_charmap;
-    int *mc_best_cb;
-    int mc_luma_vals[5];
-    uint8_t *mc_charset;
-    uint8_t *mc_colram;
-    uint8_t *mc_palette;
-    int mc_pal_size;
-
-    /* pts of the next packet that will be output */
-    int64_t next_pts;
-} A64Context;
-
-#endif /* AVCODEC_A64ENC_H */
diff --git a/libavcodec/a64multienc.c b/libavcodec/a64multienc.c
index 5d8d162..91aac09 100644
--- a/libavcodec/a64multienc.c
+++ b/libavcodec/a64multienc.c
@@ -2,20 +2,20 @@
  * a64 video encoder - multicolor modes
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,11 +24,11 @@
  * a64 video encoder - multicolor modes
  */
 
-#include "a64enc.h"
 #include "a64colors.h"
 #include "a64tables.h"
 #include "elbg.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
@@ -37,6 +37,28 @@
 #define INTERLACED    1
 #define CROP_SCREENS  1
 
+#define C64XRES 320
+#define C64YRES 200
+
+typedef struct A64Context {
+    /* variables for multicolor modes */
+    AVLFG randctx;
+    int mc_lifetime;
+    int mc_use_5col;
+    unsigned mc_frame_counter;
+    int *mc_meta_charset;
+    int *mc_charmap;
+    int *mc_best_cb;
+    int mc_luma_vals[5];
+    uint8_t *mc_charset;
+    uint8_t *mc_colram;
+    uint8_t *mc_palette;
+    int mc_pal_size;
+
+    /* pts of the next packet that will be output */
+    int64_t next_pts;
+} A64Context;
+
 /* gray gradient */
 static const int mc_colors[5]={0x0,0xb,0xc,0xf,0x1};
 
@@ -58,9 +80,13 @@ static void to_meta_with_crop(AVCodecContext *avctx,
             for (y = blocky; y < blocky + 8 && y < C64YRES; y++) {
                 for (x = blockx; x < blockx + 8 && x < C64XRES; x += 2) {
                     if(x < width && y < height) {
-                        /* build average over 2 pixels */
-                        luma = (src[(x + 0 + y * p->linesize[0])] +
-                                src[(x + 1 + y * p->linesize[0])]) / 2;
+                        if (x + 1 < width) {
+                            /* build average over 2 pixels */
+                            luma = (src[(x + 0 + y * p->linesize[0])] +
+                                    src[(x + 1 + y * p->linesize[0])]) / 2;
+                        } else {
+                            luma = src[(x + y * p->linesize[0])];
+                        }
                         /* write blocks as linear data now so they are suitable for elbg */
                         dest[0] = luma;
                     }
@@ -166,11 +192,11 @@ static void render_charset(AVCodecContext *avctx, uint8_t *charset,
 static av_cold int a64multi_close_encoder(AVCodecContext *avctx)
 {
     A64Context *c = avctx->priv_data;
-    av_free(c->mc_meta_charset);
-    av_free(c->mc_best_cb);
-    av_free(c->mc_charset);
-    av_free(c->mc_charmap);
-    av_free(c->mc_colram);
+    av_freep(&c->mc_meta_charset);
+    av_freep(&c->mc_best_cb);
+    av_freep(&c->mc_charset);
+    av_freep(&c->mc_charmap);
+    av_freep(&c->mc_colram);
     return 0;
 }
 
@@ -199,9 +225,9 @@ static av_cold int a64multi_encode_init(AVCodecContext *avctx)
                            a64_palette[mc_colors[a]][2] * 0.11;
     }
 
-    if (!(c->mc_meta_charset = av_malloc(32000 * c->mc_lifetime * sizeof(int))) ||
+    if (!(c->mc_meta_charset = av_mallocz_array(c->mc_lifetime, 32000 * sizeof(int))) ||
        !(c->mc_best_cb       = av_malloc(CHARSET_CHARS * 32 * sizeof(int)))     ||
-       !(c->mc_charmap       = av_mallocz(1000 * c->mc_lifetime * sizeof(int))) ||
+       !(c->mc_charmap       = av_mallocz_array(c->mc_lifetime, 1000 * sizeof(int))) ||
        !(c->mc_colram        = av_mallocz(CHARSET_CHARS * sizeof(uint8_t)))     ||
        !(c->mc_charset       = av_malloc(0x800 * (INTERLACED+1) * sizeof(uint8_t)))) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate buffer memory.\n");
@@ -217,12 +243,6 @@ static av_cold int a64multi_encode_init(AVCodecContext *avctx)
     AV_WB32(avctx->extradata, c->mc_lifetime);
     AV_WB32(avctx->extradata + 16, INTERLACED);
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
     if (!avctx->codec_tag)
          avctx->codec_tag = AV_RL32("a64m");
 
@@ -247,7 +267,7 @@ static void a64_compress_colram(unsigned char *buf, int *charmap, uint8_t *colra
 }
 
 static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                                 const AVFrame *pict, int *got_packet)
+                                 const AVFrame *p, int *got_packet)
 {
     A64Context *c = avctx->priv_data;
 
@@ -257,7 +277,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int b_width;
 
     int req_size, ret;
-    uint8_t *buf;
+    uint8_t *buf = NULL;
 
     int *charmap     = c->mc_charmap;
     uint8_t *colram  = c->mc_colram;
@@ -280,7 +300,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     /* no data, means end encoding asap */
-    if (!pict) {
+    if (!p) {
         /* all done, end encoding */
         if (!c->mc_lifetime) return 0;
         /* no more frames in queue, prepare to flush remaining frames */
@@ -293,16 +313,10 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     } else {
         /* fill up mc_meta_charset with data until lifetime exceeds */
         if (c->mc_frame_counter < c->mc_lifetime) {
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-            avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-            to_meta_with_crop(avctx, pict, meta + 32000 * c->mc_frame_counter);
+            to_meta_with_crop(avctx, p, meta + 32000 * c->mc_frame_counter);
             c->mc_frame_counter++;
             if (c->next_pts == AV_NOPTS_VALUE)
-                c->next_pts = pict->pts;
+                c->next_pts = p->pts;
             /* lifetime is not reached so wait for next frame first */
             return 0;
         }
@@ -313,19 +327,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
         req_size = 0;
         /* any frames to encode? */
         if (c->mc_lifetime) {
-            req_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
-            if ((ret = ff_alloc_packet(pkt, req_size)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", req_size);
+            int alloc_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
+            if ((ret = ff_alloc_packet2(avctx, pkt, alloc_size, 0)) < 0)
                 return ret;
-            }
             buf = pkt->data;
 
             /* calc optimal new charset + charmaps */
-            ret = ff_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
+            ret = avpriv_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
                                CHARSET_CHARS, 50, charmap, &c->randctx);
             if (ret < 0)
                 return ret;
-            ret = ff_do_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
+            ret = avpriv_do_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
                              CHARSET_CHARS, 50, charmap, &c->randctx);
             if (ret < 0)
                 return ret;
@@ -338,7 +350,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             /* advance pointers */
             buf      += charset_size;
-            charset  += charset_size;
+            req_size += charset_size;
         }
 
         /* write x frames to buf */
@@ -375,6 +387,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         pkt->pts = pkt->dts = c->next_pts;
         c->next_pts         = AV_NOPTS_VALUE;
 
+        av_assert0(pkt->size >= req_size);
         pkt->size   = req_size;
         pkt->flags |= AV_PKT_FLAG_KEY;
         *got_packet = !!req_size;
@@ -382,6 +395,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#if CONFIG_A64MULTI_ENCODER
 AVCodec ff_a64multi_encoder = {
     .name           = "a64multi",
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64"),
@@ -394,7 +408,8 @@ AVCodec ff_a64multi_encoder = {
     .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
     .capabilities   = AV_CODEC_CAP_DELAY,
 };
-
+#endif
+#if CONFIG_A64MULTI5_ENCODER
 AVCodec ff_a64multi5_encoder = {
     .name           = "a64multi5",
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64, extended with 5th color (colram)"),
@@ -407,3 +422,4 @@ AVCodec ff_a64multi5_encoder = {
     .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
     .capabilities   = AV_CODEC_CAP_DELAY,
 };
+#endif
diff --git a/libavcodec/a64tables.h b/libavcodec/a64tables.h
index b95c5ce..a955ef4 100644
--- a/libavcodec/a64tables.h
+++ b/libavcodec/a64tables.h
@@ -2,20 +2,20 @@
  * a64 video encoder - tables used by a64 encoders
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index fed6bf4..c2b9c98 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,14 @@
 #ifndef AVCODEC_AAC_H
 #define AVCODEC_AAC_H
 
+
+#include "aac_defines.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
 #include "avcodec.h"
-#include "imdct15.h"
+#if !USE_FIXED
+#include "mdct15.h"
+#endif
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
@@ -45,6 +50,8 @@
 #define TNS_MAX_ORDER 20
 #define MAX_LTP_LONG_SFB 40
 
+#define CLIP_AVOIDANCE_FACTOR 0.95f
+
 enum RawDataBlockType {
     TYPE_SCE,
     TYPE_CPE,
@@ -76,12 +83,13 @@ enum BandType {
     ZERO_BT        = 0,     ///< Scalefactors and spectral data are all zero.
     FIRST_PAIR_BT  = 5,     ///< This and later band types encode two values (rather than four) with one code word.
     ESC_BT         = 11,    ///< Spectral data are coded with an escape sequence.
+    RESERVED_BT    = 12,    ///< Band types following are encoded differently from others.
     NOISE_BT       = 13,    ///< Spectral data are scaled white noise not coded in the bitstream.
-    INTENSITY_BT2  = 14,    ///< Scalefactor data are intensity stereo positions.
-    INTENSITY_BT   = 15,    ///< Scalefactor data are intensity stereo positions.
+    INTENSITY_BT2  = 14,    ///< Scalefactor data are intensity stereo positions (out of phase).
+    INTENSITY_BT   = 15,    ///< Scalefactor data are intensity stereo positions (in phase).
 };
 
-#define IS_CODEBOOK_UNSIGNED(x) ((x - 1) & 10)
+#define IS_CODEBOOK_UNSIGNED(x) (((x) - 1) & 10)
 
 enum ChannelPosition {
     AAC_CHANNEL_OFF   = 0,
@@ -125,12 +133,14 @@ typedef struct OutputConfiguration {
  * Predictor State
  */
 typedef struct PredictorState {
-    float cor0;
-    float cor1;
-    float var0;
-    float var1;
-    float r0;
-    float r1;
+    AAC_FLOAT cor0;
+    AAC_FLOAT cor1;
+    AAC_FLOAT var0;
+    AAC_FLOAT var1;
+    AAC_FLOAT r0;
+    AAC_FLOAT r1;
+    AAC_FLOAT k1;
+    AAC_FLOAT x_est;
 } PredictorState;
 
 #define MAX_PREDICTORS 672
@@ -141,13 +151,20 @@ typedef struct PredictorState {
 #define SCALE_MAX_DIFF   60    ///< maximum scalefactor difference allowed by standard
 #define SCALE_DIFF_ZERO  60    ///< codebook index corresponding to zero scalefactor indices difference
 
+#define POW_SF2_ZERO    200    ///< ff_aac_pow2sf_tab index corresponding to pow(2, 0);
+
+#define NOISE_PRE       256    ///< preamble for NOISE_BT, put in bitstream with the first noise band
+#define NOISE_PRE_BITS    9    ///< length of preamble
+#define NOISE_OFFSET     90    ///< subtracted from global gain, used as offset for the preamble
+
 /**
  * Long Term Prediction
  */
 typedef struct LongTermPrediction {
     int8_t present;
     int16_t lag;
-    float coef;
+    int coef_idx;
+    INTFLOAT coef;
     int8_t used[MAX_LTP_LONG_SFB];
 } LongTermPrediction;
 
@@ -169,7 +186,10 @@ typedef struct IndividualChannelStream {
     int predictor_present;
     int predictor_initialized;
     int predictor_reset_group;
+    int predictor_reset_count[31];  ///< used by encoder to count prediction resets
     uint8_t prediction_used[41];
+    uint8_t window_clipping[8]; ///< set if a certain window is near clipping
+    float clip_avoidance_factor; ///< set if any window is near clipping to the necessary atennuation factor to avoid it
 } IndividualChannelStream;
 
 /**
@@ -181,7 +201,8 @@ typedef struct TemporalNoiseShaping {
     int length[8][4];
     int direction[8][4];
     int order[8][4];
-    float coef[8][4][TNS_MAX_ORDER];
+    int coef_idx[8][4][TNS_MAX_ORDER];
+    INTFLOAT coef[8][4][TNS_MAX_ORDER];
 } TemporalNoiseShaping;
 
 /**
@@ -218,7 +239,7 @@ typedef struct ChannelCoupling {
     int ch_select[8];      /**< [0] shared list of gains; [1] list of gains for right channel;
                             *   [2] list of gains for left channel; [3] lists of gains for both channels
                             */
-    float gain[16][120];
+    INTFLOAT gain[16][120];
 } ChannelCoupling;
 
 /**
@@ -229,26 +250,36 @@ typedef struct SingleChannelElement {
     TemporalNoiseShaping tns;
     Pulse pulse;
     enum BandType band_type[128];                   ///< band types
+    enum BandType band_alt[128];                    ///< alternative band type (used by encoder)
     int band_type_run_end[120];                     ///< band type run end points
-    float sf[120];                                  ///< scalefactors
+    INTFLOAT sf[120];                               ///< scalefactors
     int sf_idx[128];                                ///< scalefactor indices (used by encoder)
     uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
-    DECLARE_ALIGNED(32, float,   coeffs)[1024];     ///< coefficients for IMDCT
-    DECLARE_ALIGNED(32, float,   saved)[1536];      ///< overlap
-    DECLARE_ALIGNED(32, float,   ret_buf)[2048];    ///< PCM output buffer
-    DECLARE_ALIGNED(16, float,   ltp_state)[3072];  ///< time signal for LTP
+    uint8_t can_pns[128];                           ///< band is allowed to PNS (informative)
+    float  is_ener[128];                            ///< Intensity stereo pos (used by encoder)
+    float pns_ener[128];                            ///< Noise energy values (used by encoder)
+    DECLARE_ALIGNED(32, INTFLOAT, pcoeffs)[1024];   ///< coefficients for IMDCT, pristine
+    DECLARE_ALIGNED(32, INTFLOAT, coeffs)[1024];    ///< coefficients for IMDCT, maybe processed
+    DECLARE_ALIGNED(32, INTFLOAT, saved)[1536];     ///< overlap
+    DECLARE_ALIGNED(32, INTFLOAT, ret_buf)[2048];   ///< PCM output buffer
+    DECLARE_ALIGNED(16, INTFLOAT, ltp_state)[3072]; ///< time signal for LTP
+    DECLARE_ALIGNED(32, AAC_FLOAT, lcoeffs)[1024];  ///< MDCT of LTP coefficients (used by encoder)
+    DECLARE_ALIGNED(32, AAC_FLOAT, prcoeffs)[1024]; ///< Main prediction coefs (used by encoder)
     PredictorState predictor_state[MAX_PREDICTORS];
-    float *ret;                                     ///< PCM output
+    INTFLOAT *ret;                                  ///< PCM output
 } SingleChannelElement;
 
 /**
  * channel element - generic struct for SCE/CPE/CCE/LFE
  */
 typedef struct ChannelElement {
+    int present;
     // CPE specific
     int common_window;        ///< Set if channels share a common 'IndividualChannelStream' in bitstream.
     int     ms_mode;          ///< Signals mid/side stereo flags coding mode (used by encoder)
+    uint8_t is_mode;          ///< Set if any bands have been encoded using intensity stereo (used by encoder)
     uint8_t ms_mask[128];     ///< Set if mid/side stereo is used for each scalefactor window band
+    uint8_t is_mask[128];     ///< Set if intensity stereo is used (used by encoder)
     // shared
     SingleChannelElement ch[2];
     // CCE specific
@@ -259,7 +290,8 @@ typedef struct ChannelElement {
 /**
  * main AAC context
  */
-typedef struct AACContext {
+struct AACContext {
+    AVClass        *class;
     AVCodecContext *avctx;
     AVFrame *frame;
 
@@ -273,6 +305,7 @@ typedef struct AACContext {
     ChannelElement          *che[4][MAX_ELEM_ID];
     ChannelElement  *tag_che_map[4][MAX_ELEM_ID];
     int tags_mapped;
+    int warned_remapping_once;
     /** @} */
 
     /**
@@ -280,7 +313,7 @@ typedef struct AACContext {
      * (We do not want to have these on the stack.)
      * @{
      */
-    DECLARE_ALIGNED(32, float, buf_mdct)[1024];
+    DECLARE_ALIGNED(32, INTFLOAT, buf_mdct)[1024];
     /** @} */
 
     /**
@@ -291,8 +324,14 @@ typedef struct AACContext {
     FFTContext mdct_small;
     FFTContext mdct_ld;
     FFTContext mdct_ltp;
-    IMDCT15Context *mdct480;
-    AVFloatDSPContext fdsp;
+#if USE_FIXED
+    AVFixedDSPContext *fdsp;
+#else
+    MDCT15Context *mdct120;
+    MDCT15Context *mdct480;
+    MDCT15Context *mdct960;
+    AVFloatDSPContext *fdsp;
+#endif /* USE_FIXED */
     int random_state;
     /** @} */
 
@@ -303,9 +342,36 @@ typedef struct AACContext {
     SingleChannelElement *output_element[MAX_CHANNELS]; ///< Points to each SingleChannelElement
     /** @} */
 
-    DECLARE_ALIGNED(32, float, temp)[128];
+
+    /**
+     * @name Japanese DTV specific extension
+     * @{
+     */
+    int force_dmono_mode;///< 0->not dmono, 1->use first channel, 2->use second channel
+    int dmono_mode;      ///< 0->not dmono, 1->use first channel, 2->use second channel
+    /** @} */
+
+    DECLARE_ALIGNED(32, INTFLOAT, temp)[128];
 
     OutputConfiguration oc[2];
-} AACContext;
+    int warned_num_aac_frames;
+    int warned_960_sbr;
+
+    int warned_gain_control;
+
+    /* aacdec functions pointers */
+    void (*imdct_and_windowing)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_tns)(INTFLOAT coef[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode);
+    void (*windowing_and_mdct_ltp)(AACContext *ac, INTFLOAT *out,
+                                   INTFLOAT *in, IndividualChannelStream *ics);
+    void (*update_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*vector_pow43)(int *coefs, int len);
+    void (*subband_scale)(int *dst, int *src, int scale, int offset, int len, void *log_context);
+
+};
+
+void ff_aacdec_init_mips(AACContext *c);
 
 #endif /* AVCODEC_AAC_H */
diff --git a/libavcodec/aac_ac3_parser.c b/libavcodec/aac_ac3_parser.c
index a754f4a..54e4598 100644
--- a/libavcodec/aac_ac3_parser.c
+++ b/libavcodec/aac_ac3_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ int ff_aac_ac3_parse(AVCodecParserContext *s1,
     ParseContext *pc = &s->pc;
     int len, i;
     int new_frame_start;
+    int got_frame = 0;
 
 get_next:
     i=END_NOT_FOUND;
@@ -51,6 +52,7 @@ get_next:
             if(len<=0){
                 i=END_NOT_FOUND;
             }else{
+                got_frame = 1;
                 s->state=0;
                 i-= s->header_size -1;
                 s->remaining_size = len;
@@ -58,6 +60,9 @@ get_next:
                     s->remaining_size += i;
                     goto get_next;
                 }
+                else if (i < 0) {
+                    s->remaining_size += i;
+                }
             }
         }
     }
@@ -76,19 +81,25 @@ get_next:
     if(s->codec_id)
         avctx->codec_id = s->codec_id;
 
-    /* Due to backwards compatible HE-AAC the sample rate, channel count,
-       and total number of samples found in an AAC ADTS header are not
-       reliable. Bit rate is still accurate because the total frame duration in
-       seconds is still correct (as is the number of bits in the frame). */
-    if (avctx->codec_id != AV_CODEC_ID_AAC) {
-        avctx->sample_rate = s->sample_rate;
-        avctx->channels = s->channels;
-        avctx->channel_layout = s->channel_layout;
-        s1->duration = s->samples;
-        avctx->audio_service_type = s->service_type;
-    }
+    if (got_frame) {
+        /* Due to backwards compatible HE-AAC the sample rate, channel count,
+           and total number of samples found in an AAC ADTS header are not
+           reliable. Bit rate is still accurate because the total frame
+           duration in seconds is still correct (as is the number of bits in
+           the frame). */
+        if (avctx->codec_id != AV_CODEC_ID_AAC) {
+            avctx->sample_rate = s->sample_rate;
+            if (avctx->codec_id != AV_CODEC_ID_EAC3) {
+                avctx->channels = s->channels;
+                avctx->channel_layout = s->channel_layout;
+            }
+            s1->duration = s->samples;
+            avctx->audio_service_type = s->service_type;
+        }
 
-    avctx->bit_rate = s->bit_rate;
+        if (avctx->codec_id != AV_CODEC_ID_EAC3)
+            avctx->bit_rate = s->bit_rate;
+    }
 
     return i;
 }
diff --git a/libavcodec/aac_ac3_parser.h b/libavcodec/aac_ac3_parser.h
index 99286f0..c2506a5 100644
--- a/libavcodec/aac_ac3_parser.h
+++ b/libavcodec/aac_ac3_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_adtstoasc_bsf.c b/libavcodec/aac_adtstoasc_bsf.c
index 778387c..6541b11 100644
--- a/libavcodec/aac_adtstoasc_bsf.c
+++ b/libavcodec/aac_adtstoasc_bsf.c
@@ -2,20 +2,20 @@
  * MPEG-2/4 AAC ADTS to MPEG-4 Audio Specific Configuration bitstream filter
  * Copyright (c) 2009 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,27 +36,26 @@ typedef struct AACBSFContext {
  * This filter creates an MPEG-4 AudioSpecificConfig from an MPEG-2/4
  * ADTS header and removes the ADTS header.
  */
-static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
+static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *pkt)
 {
     AACBSFContext *ctx = bsfc->priv_data;
 
     GetBitContext gb;
     PutBitContext pb;
     AACADTSHeaderInfo hdr;
-    AVPacket *in;
     int ret;
 
-    ret = ff_bsf_get_packet(bsfc, &in);
+    ret = ff_bsf_get_packet_ref(bsfc, pkt);
     if (ret < 0)
         return ret;
 
-    if (in->size < AV_AAC_ADTS_HEADER_SIZE)
-        goto packet_too_small;
+    if (bsfc->par_in->extradata && pkt->size >= 2 && (AV_RB16(pkt->data) >> 4) != 0xfff)
+        return 0;
 
-    init_get_bits(&gb, in->data, AV_AAC_ADTS_HEADER_SIZE * 8);
+    if (pkt->size < AV_AAC_ADTS_HEADER_SIZE)
+        goto packet_too_small;
 
-    if (bsfc->par_in->extradata && show_bits(&gb, 12) != 0xfff)
-        goto finish;
+    init_get_bits(&gb, pkt->data, AV_AAC_ADTS_HEADER_SIZE * 8);
 
     if (ff_adts_header_parse(&gb, &hdr) < 0) {
         av_log(bsfc, AV_LOG_ERROR, "Error parsing ADTS frame header!\n");
@@ -71,10 +70,10 @@ static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
         goto fail;
     }
 
-    in->size -= AV_AAC_ADTS_HEADER_SIZE + 2 * !hdr.crc_absent;
-    if (in->size <= 0)
+    pkt->size -= AV_AAC_ADTS_HEADER_SIZE + 2 * !hdr.crc_absent;
+    if (pkt->size <= 0)
         goto packet_too_small;
-    in->data += AV_AAC_ADTS_HEADER_SIZE + 2 * !hdr.crc_absent;
+    pkt->data += AV_AAC_ADTS_HEADER_SIZE + 2 * !hdr.crc_absent;
 
     if (!ctx->first_frame_done) {
         int            pce_size = 0;
@@ -82,7 +81,7 @@ static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
         uint8_t       *extradata;
 
         if (!hdr.chan_config) {
-            init_get_bits(&gb, in->data, in->size * 8);
+            init_get_bits(&gb, pkt->data, pkt->size * 8);
             if (get_bits(&gb, 3) != 5) {
                 avpriv_report_missing_feature(bsfc,
                                               "PCE-based channel configuration "
@@ -94,11 +93,11 @@ static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
             init_put_bits(&pb, pce_data, MAX_PCE_SIZE);
             pce_size = ff_copy_pce_data(&pb, &gb) / 8;
             flush_put_bits(&pb);
-            in->size -= get_bits_count(&gb)/8;
-            in->data += get_bits_count(&gb)/8;
+            pkt->size -= get_bits_count(&gb)/8;
+            pkt->data += get_bits_count(&gb)/8;
         }
 
-        extradata = av_packet_new_side_data(in, AV_PKT_DATA_NEW_EXTRADATA,
+        extradata = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
                                             2 + pce_size);
         if (!extradata) {
             ret = AVERROR(ENOMEM);
@@ -120,17 +119,13 @@ static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
         ctx->first_frame_done = 1;
     }
 
-finish:
-    av_packet_move_ref(out, in);
-    av_packet_free(&in);
-
     return 0;
 
 packet_too_small:
     av_log(bsfc, AV_LOG_ERROR, "Input packet too small\n");
     ret = AVERROR_INVALIDDATA;
 fail:
-    av_packet_free(&in);
+    av_packet_unref(pkt);
     return ret;
 }
 
diff --git a/libavcodec/aac_defines.h b/libavcodec/aac_defines.h
new file mode 100644
index 0000000..438d78a
--- /dev/null
+++ b/libavcodec/aac_defines.h
@@ -0,0 +1,116 @@
+/*
+ * AAC defines
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AAC_DEFINES_H
+#define AVCODEC_AAC_DEFINES_H
+
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
+#if USE_FIXED
+
+#include "libavutil/softfloat.h"
+
+#define FFT_FLOAT    0
+#define FFT_FIXED_32 1
+
+#define AAC_RENAME(x)       x ## _fixed
+#define AAC_RENAME_32(x)    x ## _fixed_32
+typedef int                 INTFLOAT;
+typedef unsigned            UINTFLOAT;  ///< Equivalent to INTFLOAT, Used as temporal cast to avoid undefined sign overflow operations.
+typedef int64_t             INT64FLOAT;
+typedef int16_t             SHORTFLOAT;
+typedef SoftFloat           AAC_FLOAT;
+typedef int                 AAC_SIGNE;
+#define FIXR(a)             ((int)((a) * 1 + 0.5))
+#define FIXR10(a)           ((int)((a) * 1024.0 + 0.5))
+#define Q23(a)              (int)((a) * 8388608.0 + 0.5)
+#define Q30(x)              (int)((x)*1073741824.0 + 0.5)
+#define Q31(x)              (int)((x)*2147483648.0 + 0.5)
+#define RANGE15(x)          x
+#define GET_GAIN(x, y)      (-(y) * (1 << (x))) + 1024
+#define AAC_MUL16(x, y)     (int)(((int64_t)(x) * (y) + 0x8000) >> 16)
+#define AAC_MUL26(x, y)     (int)(((int64_t)(x) * (y) + 0x2000000) >> 26)
+#define AAC_MUL30(x, y)     (int)(((int64_t)(x) * (y) + 0x20000000) >> 30)
+#define AAC_MUL31(x, y)     (int)(((int64_t)(x) * (y) + 0x40000000) >> 31)
+#define AAC_MADD28(x, y, a, b) (int)((((int64_t)(x) * (y)) + \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x8000000) >> 28)
+#define AAC_MADD30(x, y, a, b) (int)((((int64_t)(x) * (y)) + \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x20000000) >> 30)
+#define AAC_MADD30_V8(x, y, a, b, c, d, e, f) (int)((((int64_t)(x) * (y)) + \
+                                                     ((int64_t)(a) * (b)) + \
+                                                     ((int64_t)(c) * (d)) + \
+                                                     ((int64_t)(e) * (f)) + \
+                                                       0x20000000) >> 30)
+#define AAC_MSUB30(x, y, a, b) (int)((((int64_t)(x) * (y)) - \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x20000000) >> 30)
+#define AAC_MSUB30_V8(x, y, a, b, c, d, e, f) (int)((((int64_t)(x) * (y)) + \
+                                                     ((int64_t)(a) * (b)) - \
+                                                     ((int64_t)(c) * (d)) - \
+                                                     ((int64_t)(e) * (f)) + \
+                                                       0x20000000) >> 30)
+#define AAC_MSUB31_V3(x, y, z)    (int)((((int64_t)(x) * (z)) - \
+                                      ((int64_t)(y) * (z)) + \
+                                        0x40000000) >> 31)
+#define AAC_HALF_SUM(x, y)  (((x) >> 1) + ((y) >> 1))
+#define AAC_SRA_R(x, y)     (int)(((x) + (1 << ((y) - 1))) >> (y))
+
+#else
+
+#define FFT_FLOAT    1
+#define FFT_FIXED_32 0
+
+#define AAC_RENAME(x)       x
+#define AAC_RENAME_32(x)    x
+typedef float               INTFLOAT;
+typedef float               UINTFLOAT;
+typedef float               INT64FLOAT;
+typedef float               SHORTFLOAT;
+typedef float               AAC_FLOAT;
+typedef unsigned            AAC_SIGNE;
+#define FIXR(x)             ((float)(x))
+#define FIXR10(x)           ((float)(x))
+#define Q23(x)              ((float)(x))
+#define Q30(x)              ((float)(x))
+#define Q31(x)              ((float)(x))
+#define RANGE15(x)          (32768.0 * (x))
+#define GET_GAIN(x, y)      powf((x), -(y))
+#define AAC_MUL16(x, y)     ((x) * (y))
+#define AAC_MUL26(x, y)     ((x) * (y))
+#define AAC_MUL30(x, y)     ((x) * (y))
+#define AAC_MUL31(x, y)     ((x) * (y))
+#define AAC_MADD28(x, y, a, b) ((x) * (y) + (a) * (b))
+#define AAC_MADD30(x, y, a, b) ((x) * (y) + (a) * (b))
+#define AAC_MADD30_V8(x, y, a, b, c, d, e, f) ((x) * (y) + (a) * (b) + \
+                                               (c) * (d) + (e) * (f))
+#define AAC_MSUB30(x, y, a, b) ((x) * (y) - (a) * (b))
+#define AAC_MSUB30_V8(x, y, a, b, c, d, e, f) ((x) * (y) + (a) * (b) - \
+                                               (c) * (d) - (e) * (f))
+#define AAC_MSUB31_V3(x, y, z)    ((x) - (y)) * (z)
+#define AAC_HALF_SUM(x, y)  ((x) + (y)) * 0.5f
+#define AAC_SRA_R(x, y)     (x)
+
+#endif /* USE_FIXED */
+
+#endif /* AVCODEC_AAC_DEFINES_H */
diff --git a/libavcodec/aac_parser.c b/libavcodec/aac_parser.c
index 41b301c..b869262 100644
--- a/libavcodec/aac_parser.c
+++ b/libavcodec/aac_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_tablegen.h b/libavcodec/aac_tablegen.h
deleted file mode 100644
index 8a05ec5..0000000
--- a/libavcodec/aac_tablegen.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Header file for hardcoded AAC tables
- *
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_AAC_TABLEGEN_H
-#define AVCODEC_AAC_TABLEGEN_H
-
-#include "aac_tablegen_decl.h"
-
-#if CONFIG_HARDCODED_TABLES
-#include "libavcodec/aac_tables.h"
-#else
-#include "libavutil/mathematics.h"
-float ff_aac_pow2sf_tab[428];
-
-void ff_aac_tableinit(void)
-{
-    int i;
-    for (i = 0; i < 428; i++)
-        ff_aac_pow2sf_tab[i] = pow(2, (i - POW_SF2_ZERO) / 4.0);
-}
-#endif /* CONFIG_HARDCODED_TABLES */
-
-#endif /* AVCODEC_AAC_TABLEGEN_H */
diff --git a/libavcodec/aac_tablegen_decl.h b/libavcodec/aac_tablegen_decl.h
deleted file mode 100644
index a5fd1cf..0000000
--- a/libavcodec/aac_tablegen_decl.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Header file for hardcoded AAC tables
- *
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_AAC_TABLEGEN_DECL_H
-#define AVCODEC_AAC_TABLEGEN_DECL_H
-
-#define POW_SF2_ZERO    200    ///< ff_aac_pow2sf_tab index corresponding to pow(2, 0);
-
-#if CONFIG_HARDCODED_TABLES
-#define ff_aac_tableinit()
-extern const float ff_aac_pow2sf_tab[428];
-#else
-void ff_aac_tableinit(void);
-extern       float ff_aac_pow2sf_tab[428];
-#endif /* CONFIG_HARDCODED_TABLES */
-
-#endif /* AVCODEC_AAC_TABLEGEN_DECL_H */
diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index a654844..baa8248 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -2,20 +2,20 @@
  * AAC coefficients encoder
  * Copyright (C) 2008-2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,269 +33,34 @@
 #include "libavutil/libm.h" // brought forward to work around cygwin header breakage
 
 #include <float.h>
+
 #include "libavutil/mathematics.h"
+#include "mathops.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
 #include "aacenc.h"
 #include "aactab.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
+#include "aacenc_quantization.h"
 
-/** bits needed to code codebook run value for long windows */
-static const uint8_t run_value_bits_long[64] = {
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
-};
-
-/** bits needed to code codebook run value for short windows */
-static const uint8_t run_value_bits_short[16] = {
-    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
-};
-
-static const uint8_t * const run_value_bits[2] = {
-    run_value_bits_long, run_value_bits_short
-};
-
-
-/**
- * Quantize one coefficient.
- * @return absolute value of the quantized coefficient
- * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
- */
-static av_always_inline int quant(float coef, const float Q)
-{
-    float a = coef * Q;
-    return sqrtf(a * sqrtf(a)) + 0.4054;
-}
-
-static void quantize_bands(int *out, const float *in, const float *scaled,
-                           int size, float Q34, int is_signed, int maxval)
-{
-    int i;
-    double qc;
-    for (i = 0; i < size; i++) {
-        qc = scaled[i] * Q34;
-        out[i] = (int)FFMIN(qc + 0.4054, (double)maxval);
-        if (is_signed && in[i] < 0.0f) {
-            out[i] = -out[i];
-        }
-    }
-}
-
-static void abs_pow34_v(float *out, const float *in, const int size)
-{
-#ifndef USE_REALLY_FULL_SEARCH
-    int i;
-    for (i = 0; i < size; i++) {
-        float a = fabsf(in[i]);
-        out[i] = sqrtf(a * sqrtf(a));
-    }
-#endif /* USE_REALLY_FULL_SEARCH */
-}
-
-static const uint8_t aac_cb_range [12] = {0, 3, 3, 3, 3, 9, 9, 8, 8, 13, 13, 17};
-static const uint8_t aac_cb_maxval[12] = {0, 1, 1, 2, 2, 4, 4, 7, 7, 12, 12, 16};
-
-/**
- * Calculate rate distortion cost for quantizing with given codebook
- *
- * @return quantization distortion
- */
-static av_always_inline float quantize_and_encode_band_cost_template(
-                                struct AACEncContext *s,
-                                PutBitContext *pb, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits, int BT_ZERO, int BT_UNSIGNED,
-                                int BT_PAIR, int BT_ESC)
-{
-    const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
-    const float Q   = ff_aac_pow2sf_tab [q_idx];
-    const float Q34 = ff_aac_pow34sf_tab[q_idx];
-    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
-    const float CLIPPED_ESCAPE = 165140.0f*IQ;
-    int i, j;
-    float cost = 0;
-    const int dim = BT_PAIR ? 2 : 4;
-    int resbits = 0;
-    const int range  = aac_cb_range[cb];
-    const int maxval = aac_cb_maxval[cb];
-    int off;
-
-    if (BT_ZERO) {
-        for (i = 0; i < size; i++)
-            cost += in[i]*in[i];
-        if (bits)
-            *bits = 0;
-        return cost * lambda;
-    }
-    if (!scaled) {
-        abs_pow34_v(s->scoefs, in, size);
-        scaled = s->scoefs;
-    }
-    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, maxval);
-    if (BT_UNSIGNED) {
-        off = 0;
-    } else {
-        off = maxval;
-    }
-    for (i = 0; i < size; i += dim) {
-        const float *vec;
-        int *quants = s->qcoefs + i;
-        int curidx = 0;
-        int curbits;
-        float rd = 0.0f;
-        for (j = 0; j < dim; j++) {
-            curidx *= range;
-            curidx += quants[j] + off;
-        }
-        curbits =  ff_aac_spectral_bits[cb-1][curidx];
-        vec     = &ff_aac_codebook_vectors[cb-1][curidx*dim];
-        if (BT_UNSIGNED) {
-            for (j = 0; j < dim; j++) {
-                float t = fabsf(in[i+j]);
-                float di;
-                if (BT_ESC && vec[j] == 64.0f) { //FIXME: slow
-                    if (t >= CLIPPED_ESCAPE) {
-                        di = t - CLIPPED_ESCAPE;
-                        curbits += 21;
-                    } else {
-                        int c = av_clip_uintp2(quant(t, Q), 13);
-                        di = t - c*cbrtf(c)*IQ;
-                        curbits += av_log2(c)*2 - 4 + 1;
-                    }
-                } else {
-                    di = t - vec[j]*IQ;
-                }
-                if (vec[j] != 0.0f)
-                    curbits++;
-                rd += di*di;
-            }
-        } else {
-            for (j = 0; j < dim; j++) {
-                float di = in[i+j] - vec[j]*IQ;
-                rd += di*di;
-            }
-        }
-        cost    += rd * lambda + curbits;
-        resbits += curbits;
-        if (cost >= uplim)
-            return uplim;
-        if (pb) {
-            put_bits(pb, ff_aac_spectral_bits[cb-1][curidx], ff_aac_spectral_codes[cb-1][curidx]);
-            if (BT_UNSIGNED)
-                for (j = 0; j < dim; j++)
-                    if (ff_aac_codebook_vectors[cb-1][curidx*dim+j] != 0.0f)
-                        put_bits(pb, 1, in[i+j] < 0.0f);
-            if (BT_ESC) {
-                for (j = 0; j < 2; j++) {
-                    if (ff_aac_codebook_vectors[cb-1][curidx*2+j] == 64.0f) {
-                        int coef = av_clip_uintp2(quant(fabsf(in[i+j]), Q), 13);
-                        int len = av_log2(coef);
-
-                        put_bits(pb, len - 4 + 1, (1 << (len - 4 + 1)) - 2);
-                        put_bits(pb, len, coef & ((1 << len) - 1));
-                    }
-                }
-            }
-        }
-    }
-
-    if (bits)
-        *bits = resbits;
-    return cost;
-}
+#include "aacenc_is.h"
+#include "aacenc_tns.h"
+#include "aacenc_ltp.h"
+#include "aacenc_pred.h"
 
-#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC) \
-static float quantize_and_encode_band_cost_ ## NAME(                                        \
-                                struct AACEncContext *s,                                \
-                                PutBitContext *pb, const float *in,                     \
-                                const float *scaled, int size, int scale_idx,           \
-                                int cb, const float lambda, const float uplim,          \
-                                int *bits) {                                            \
-    return quantize_and_encode_band_cost_template(                                      \
-                                s, pb, in, scaled, size, scale_idx,                     \
-                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits,              \
-                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC);                 \
-}
+#include "libavcodec/aaccoder_twoloop.h"
 
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1)
-
-static float (*const quantize_and_encode_band_cost_arr[])(
-                                struct AACEncContext *s,
-                                PutBitContext *pb, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits) = {
-    quantize_and_encode_band_cost_ZERO,
-    quantize_and_encode_band_cost_SQUAD,
-    quantize_and_encode_band_cost_SQUAD,
-    quantize_and_encode_band_cost_UQUAD,
-    quantize_and_encode_band_cost_UQUAD,
-    quantize_and_encode_band_cost_SPAIR,
-    quantize_and_encode_band_cost_SPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_ESC,
-};
+/* Parameter of f(x) = a*(lambda/100), defines the maximum fourier spread
+ * beyond which no PNS is used (since the SFBs contain tone rather than noise) */
+#define NOISE_SPREAD_THRESHOLD 0.9f
 
-#define quantize_and_encode_band_cost(                                  \
-                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)                    \
-    quantize_and_encode_band_cost_arr[cb](                              \
-                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
-
-static float quantize_band_cost(struct AACEncContext *s, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits)
-{
-    return quantize_and_encode_band_cost(s, NULL, in, scaled, size, scale_idx,
-                                         cb, lambda, uplim, bits);
-}
+/* Parameter of f(x) = a*(100/lambda), defines how much PNS is allowed to
+ * replace low energy non zero bands */
+#define NOISE_LAMBDA_REPLACE 1.948f
 
-static void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
-                                     const float *in, int size, int scale_idx,
-                                     int cb, const float lambda)
-{
-    quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL);
-}
-
-static float find_max_val(int group_len, int swb_size, const float *scaled) {
-    float maxval = 0.0f;
-    int w2, i;
-    for (w2 = 0; w2 < group_len; w2++) {
-        for (i = 0; i < swb_size; i++) {
-            maxval = FFMAX(maxval, scaled[w2*128+i]);
-        }
-    }
-    return maxval;
-}
-
-static int find_min_book(float maxval, int sf) {
-    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
-    float Q34 = sqrtf(Q * sqrtf(Q));
-    int qmaxval, cb;
-    qmaxval = maxval * Q34 + 0.4054f;
-    if      (qmaxval ==  0) cb = 0;
-    else if (qmaxval ==  1) cb = 1;
-    else if (qmaxval ==  2) cb = 3;
-    else if (qmaxval <=  4) cb = 5;
-    else if (qmaxval <=  7) cb = 7;
-    else if (qmaxval <= 12) cb = 9;
-    else                    cb = 11;
-    return cb;
-}
+#include "libavcodec/aaccoder_trellis.h"
 
 /**
  * structure used in optimal codebook search
@@ -312,7 +77,7 @@ typedef struct BandCodingPath {
 static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda)
 {
-    BandCodingPath path[120][12];
+    BandCodingPath path[120][CB_TOT_ALL];
     int w, swb, cb, start, size;
     int i, j;
     const int max_sfb  = sce->ics.max_sfb;
@@ -323,9 +88,9 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     float next_minrd = INFINITY;
     int next_mincb = 0;
 
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    s->abs_pow34(s->scoefs, sce->coeffs, 1024);
     start = win*128;
-    for (cb = 0; cb < 12; cb++) {
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
         path[0][cb].cost     = 0.0f;
         path[0][cb].prev_idx = -1;
         path[0][cb].run      = 0;
@@ -333,7 +98,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     for (swb = 0; swb < max_sfb; swb++) {
         size = sce->ics.swb_sizes[swb];
         if (sce->zeroes[win*16 + swb]) {
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT_ALL; cb++) {
                 path[swb+1][cb].prev_idx = cb;
                 path[swb+1][cb].cost     = path[swb][cb].cost;
                 path[swb+1][cb].run      = path[swb][cb].run + 1;
@@ -343,15 +108,22 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
             int mincb = next_mincb;
             next_minrd = INFINITY;
             next_mincb = 0;
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT_ALL; cb++) {
                 float cost_stay_here, cost_get_here;
                 float rd = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] < aac_cb_out_map[cb] ||
+                    cb  < aac_cb_in_map[sce->band_type[win*16+swb]] && sce->band_type[win*16+swb] > aac_cb_out_map[cb]) {
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].cost     = INFINITY;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                    continue;
+                }
                 for (w = 0; w < group_len; w++) {
                     FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(win+w)*16+swb];
-                    rd += quantize_band_cost(s, sce->coeffs + start + w*128,
-                                             s->scoefs + start + w*128, size,
-                                             sce->sf_idx[(win+w)*16+swb], cb,
-                                             lambda / band->threshold, INFINITY, NULL);
+                    rd += quantize_band_cost(s, &sce->coeffs[start + w*128],
+                                             &s->scoefs[start + w*128], size,
+                                             sce->sf_idx[(win+w)*16+swb], aac_cb_out_map[cb],
+                                             lambda / band->threshold, INFINITY, NULL, NULL, 0);
                 }
                 cost_stay_here = path[swb][cb].cost + rd;
                 cost_get_here  = minrd              + rd + run_bits + 4;
@@ -379,11 +151,12 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //convert resulting path from backward-linked list
     stack_len = 0;
     idx       = 0;
-    for (cb = 1; cb < 12; cb++)
+    for (cb = 1; cb < CB_TOT_ALL; cb++)
         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
             idx = cb;
     ppos = max_sfb;
     while (ppos > 0) {
+        av_assert1(idx >= 0);
         cb = idx;
         stackrun[stack_len] = path[ppos][cb].run;
         stackcb [stack_len] = cb;
@@ -394,12 +167,13 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //perform actual band info encoding
     start = 0;
     for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
         count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
+        memset(sce->zeroes + win*16 + start, !cb, count);
         //XXX: memset when band_type is also uint8_t
         for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
+            sce->band_type[win*16 + start] = cb;
             start++;
         }
         while (count >= run_esc) {
@@ -410,147 +184,54 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     }
 }
 
-static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
-                                  int win, int group_len, const float lambda)
+
+typedef struct TrellisPath {
+    float cost;
+    int prev;
+} TrellisPath;
+
+#define TRELLIS_STAGES 121
+#define TRELLIS_STATES (SCALE_MAX_DIFF+1)
+
+static void set_special_band_scalefactors(AACEncContext *s, SingleChannelElement *sce)
 {
-    BandCodingPath path[120][12];
-    int w, swb, cb, start, size;
-    int i, j;
-    const int max_sfb  = sce->ics.max_sfb;
-    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
-    const int run_esc  = (1 << run_bits) - 1;
-    int idx, ppos, count;
-    int stackrun[120], stackcb[120], stack_len;
-    float next_minbits = INFINITY;
-    int next_mincb = 0;
+    int w, g;
+    int prevscaler_n = -255, prevscaler_i = 0;
+    int bands = 0;
 
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    start = win*128;
-    for (cb = 0; cb < 12; cb++) {
-        path[0][cb].cost     = run_bits+4;
-        path[0][cb].prev_idx = -1;
-        path[0][cb].run      = 0;
-    }
-    for (swb = 0; swb < max_sfb; swb++) {
-        size = sce->ics.swb_sizes[swb];
-        if (sce->zeroes[win*16 + swb]) {
-            float cost_stay_here = path[swb][0].cost;
-            float cost_get_here  = next_minbits + run_bits + 4;
-            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
-                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
-                cost_stay_here += run_bits;
-            if (cost_get_here < cost_stay_here) {
-                path[swb+1][0].prev_idx = next_mincb;
-                path[swb+1][0].cost     = cost_get_here;
-                path[swb+1][0].run      = 1;
-            } else {
-                path[swb+1][0].prev_idx = 0;
-                path[swb+1][0].cost     = cost_stay_here;
-                path[swb+1][0].run      = path[swb][0].run + 1;
-            }
-            next_minbits = path[swb+1][0].cost;
-            next_mincb = 0;
-            for (cb = 1; cb < 12; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-        } else {
-            float minbits = next_minbits;
-            int mincb = next_mincb;
-            int startcb = sce->band_type[win*16+swb];
-            next_minbits = INFINITY;
-            next_mincb = 0;
-            for (cb = 0; cb < startcb; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-            for (cb = startcb; cb < 12; cb++) {
-                float cost_stay_here, cost_get_here;
-                float bits = 0.0f;
-                for (w = 0; w < group_len; w++) {
-                    bits += quantize_band_cost(s, sce->coeffs + start + w*128,
-                                               s->scoefs + start + w*128, size,
-                                               sce->sf_idx[(win+w)*16+swb], cb,
-                                               0, INFINITY, NULL);
-                }
-                cost_stay_here = path[swb][cb].cost + bits;
-                cost_get_here  = minbits            + bits + run_bits + 4;
-                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
-                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
-                    cost_stay_here += run_bits;
-                if (cost_get_here < cost_stay_here) {
-                    path[swb+1][cb].prev_idx = mincb;
-                    path[swb+1][cb].cost     = cost_get_here;
-                    path[swb+1][cb].run      = 1;
-                } else {
-                    path[swb+1][cb].prev_idx = cb;
-                    path[swb+1][cb].cost     = cost_stay_here;
-                    path[swb+1][cb].run      = path[swb][cb].run + 1;
-                }
-                if (path[swb+1][cb].cost < next_minbits) {
-                    next_minbits = path[swb+1][cb].cost;
-                    next_mincb = cb;
-                }
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0; g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g])
+                continue;
+            if (sce->band_type[w*16+g] == INTENSITY_BT || sce->band_type[w*16+g] == INTENSITY_BT2) {
+                sce->sf_idx[w*16+g] = av_clip(roundf(log2f(sce->is_ener[w*16+g])*2), -155, 100);
+                bands++;
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                sce->sf_idx[w*16+g] = av_clip(3+ceilf(log2f(sce->pns_ener[w*16+g])*2), -100, 155);
+                if (prevscaler_n == -255)
+                    prevscaler_n = sce->sf_idx[w*16+g];
+                bands++;
             }
         }
-        start += sce->ics.swb_sizes[swb];
     }
 
-    //convert resulting path from backward-linked list
-    stack_len = 0;
-    idx       = 0;
-    for (cb = 1; cb < 12; cb++)
-        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
-            idx = cb;
-    ppos = max_sfb;
-    while (ppos > 0) {
-        assert(idx >= 0);
-        cb = idx;
-        stackrun[stack_len] = path[ppos][cb].run;
-        stackcb [stack_len] = cb;
-        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
-        ppos -= path[ppos][cb].run;
-        stack_len++;
-    }
-    //perform actual band info encoding
-    start = 0;
-    for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
-        count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
-        //XXX: memset when band_type is also uint8_t
-        for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
-            start++;
-        }
-        while (count >= run_esc) {
-            put_bits(&s->pb, run_bits, run_esc);
-            count -= run_esc;
+    if (!bands)
+        return;
+
+    /* Clip the scalefactor indices */
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0; g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g])
+                continue;
+            if (sce->band_type[w*16+g] == INTENSITY_BT || sce->band_type[w*16+g] == INTENSITY_BT2) {
+                sce->sf_idx[w*16+g] = prevscaler_i = av_clip(sce->sf_idx[w*16+g], prevscaler_i - SCALE_MAX_DIFF, prevscaler_i + SCALE_MAX_DIFF);
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                sce->sf_idx[w*16+g] = prevscaler_n = av_clip(sce->sf_idx[w*16+g], prevscaler_n - SCALE_MAX_DIFF, prevscaler_n + SCALE_MAX_DIFF);
+            }
         }
-        put_bits(&s->pb, run_bits, count);
     }
 }
 
-/** Return the minimum scalefactor where the quantized coef does not clip. */
-static av_always_inline uint8_t coef2minsf(float coef) {
-    return av_clip_uint8(log2f(coef)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
-}
-
-/** Return the maximum scalefactor where the quantized coef is not zero. */
-static av_always_inline uint8_t coef2maxsf(float coef) {
-    return av_clip_uint8(log2f(coef)*4 +  6 + SCALE_ONE_POS - SCALE_DIV_512);
-}
-
-typedef struct TrellisPath {
-    float cost;
-    int prev;
-} TrellisPath;
-
-#define TRELLIS_STAGES 121
-#define TRELLIS_STATES (SCALE_MAX_DIFF+1)
-
 static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                                        SingleChannelElement *sce,
                                        const float lambda)
@@ -582,9 +263,9 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
     }
 
     //minimum scalefactor index is when minimum nonzero coefficient after quantizing is not clipped
-    q0 = coef2minsf(q0f);
+    q0 = av_clip(coef2minsf(q0f), 0, SCALE_MAX_POS-1);
     //maximum scalefactor index is when maximum coefficient after quantizing is still not zero
-    q1 = coef2maxsf(q1f);
+    q1 = av_clip(coef2maxsf(q1f), 1, SCALE_MAX_POS);
     if (q1 - q0 > 60) {
         int q0low  = q0;
         int q1high = q1;
@@ -600,6 +281,12 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
             q1  = q1high;
         }
     }
+    // q0 == q1 isn't really a legal situation
+    if (q0 == q1) {
+        // the following is indirect but guarantees q1 != q0 && q1 near q0
+        q1 = av_clip(q0+1, 1, SCALE_MAX_POS);
+        q0 = av_clip(q1-1, 0, SCALE_MAX_POS - 1);
+    }
 
     for (i = 0; i < TRELLIS_STATES; i++) {
         paths[0][i].cost    = 0.0f;
@@ -612,11 +299,11 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
         }
     }
     idx = 1;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    s->abs_pow34(s->scoefs, sce->coeffs, 1024);
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         start = w*128;
         for (g = 0; g < sce->ics.num_swb; g++) {
-            const float *coefs = sce->coeffs + start;
+            const float *coefs = &sce->coeffs[start];
             float qmin, qmax;
             int nz = 0;
 
@@ -648,6 +335,10 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                 maxscale = coef2maxsf(qmax);
                 minscale = av_clip(minscale - q0, 0, TRELLIS_STATES - 1);
                 maxscale = av_clip(maxscale - q0, 0, TRELLIS_STATES);
+                if (minscale == maxscale) {
+                    maxscale = av_clip(minscale+1, 1, TRELLIS_STATES);
+                    minscale = av_clip(maxscale-1, 0, TRELLIS_STATES - 1);
+                }
                 maxval = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], s->scoefs+start);
                 for (q = minscale; q < maxscale; q++) {
                     float dist = 0;
@@ -655,7 +346,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                     for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                         FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                         dist += quantize_band_cost(s, coefs + w2*128, s->scoefs + start + w2*128, sce->ics.swb_sizes[g],
-                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL);
+                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, NULL, 0);
                     }
                     minrd = FFMIN(minrd, dist);
 
@@ -691,27 +382,23 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
     }
     while (idx) {
         sce->sf_idx[bandaddr[idx]] = minq + q0;
-        minq = paths[idx][minq].prev;
+        minq = FFMAX(paths[idx][minq].prev, 0);
         idx--;
     }
     //set the same quantizers inside window groups
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
-        for (g = 0;  g < sce->ics.num_swb; g++)
+        for (g = 0; g < sce->ics.num_swb; g++)
             for (w2 = 1; w2 < sce->ics.group_len[w]; w2++)
                 sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-/**
- * two-loop quantizers search taken from ISO 13818-7 Appendix C
- */
-static void search_for_quantizers_twoloop(AVCodecContext *avctx,
-                                          AACEncContext *s,
-                                          SingleChannelElement *sce,
-                                          const float lambda)
+static void search_for_quantizers_fast(AVCodecContext *avctx, AACEncContext *s,
+                                       SingleChannelElement *sce,
+                                       const float lambda)
 {
     int start = 0, i, w, w2, g;
     int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128];
+    float dists[128] = { 0 }, uplims[128] = { 0 };
     float maxvals[128];
     int fflag, minscaler;
     int its  = 0;
@@ -721,15 +408,17 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
     // for values above this the decoder might end up in an endless loop
     // due to always having more bits than what can be encoded.
     destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
+    //some heuristic to determine initial quantizers will reduce search time
     //determine zero bands and upper limits
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
+        start = 0;
+        for (g = 0; g < sce->ics.num_swb; g++) {
             int nz = 0;
-            float uplim = 0.0f;
+            float uplim = 0.0f, energy = 0.0f;
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                 uplim += band->threshold;
+                energy += band->energy;
                 if (band->energy <= band->threshold || band->threshold == 0.0f) {
                     sce->zeroes[(w+w2)*16+g] = 1;
                     continue;
@@ -737,14 +426,16 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                 nz = 1;
             }
             uplims[w*16+g] = uplim *512;
+            sce->band_type[w*16+g] = 0;
             sce->zeroes[w*16+g] = !nz;
             if (nz)
                 minthr = FFMIN(minthr, uplim);
             allz |= nz;
+            start += sce->ics.swb_sizes[g];
         }
     }
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
+        for (g = 0; g < sce->ics.num_swb; g++) {
             if (sce->zeroes[w*16+g]) {
                 sce->sf_idx[w*16+g] = SCALE_ONE_POS;
                 continue;
@@ -755,11 +446,12 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
 
     if (!allz)
         return;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    s->abs_pow34(s->scoefs, sce->coeffs, 1024);
+    ff_quantize_band_cost_cache_init(s);
 
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
+        for (g = 0; g < sce->ics.num_swb; g++) {
             const float *scaled = s->scoefs + start;
             maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
             start += sce->ics.swb_sizes[g];
@@ -776,10 +468,9 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
         do {
             int prev = -1;
             tbits = 0;
-            fflag = 0;
             for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                 start = w*128;
-                for (g = 0;  g < sce->ics.num_swb; g++) {
+                for (g = 0; g < sce->ics.num_swb; g++) {
                     const float *coefs = sce->coeffs + start;
                     const float *scaled = s->scoefs + start;
                     int bits = 0;
@@ -794,14 +485,13 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                     cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
                     for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                         int b;
-                        dist += quantize_band_cost(s, coefs + w2*128,
-                                                   scaled + w2*128,
-                                                   sce->ics.swb_sizes[g],
-                                                   sce->sf_idx[w*16+g],
-                                                   cb,
-                                                   1.0f,
-                                                   INFINITY,
-                                                   &b);
+                        dist += quantize_band_cost_cached(s, w + w2, g,
+                                                          coefs + w2*128,
+                                                          scaled + w2*128,
+                                                          sce->ics.swb_sizes[g],
+                                                          sce->sf_idx[w*16+g],
+                                                          cb, 1.0f, INFINITY,
+                                                          &b, NULL, 0);
                         bits += b;
                     }
                     dists[w*16+g] = dist - bits;
@@ -829,6 +519,7 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
 
         fflag = 0;
         minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
+
         for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
             for (g = 0; g < sce->ics.num_swb; g++) {
                 int prevsc = sce->sf_idx[w*16+g];
@@ -849,292 +540,425 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
     } while (fflag && its < 10);
 }
 
-static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
-                                       SingleChannelElement *sce,
-                                       const float lambda)
+static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce)
 {
-    int start = 0, i, w, w2, g;
-    float uplim[128], maxq[128];
-    int minq, maxsf;
-    float distfact = ((sce->ics.num_windows > 1) ? 85.80 : 147.84) / lambda;
-    int last = 0, lastband = 0, curband = 0;
-    float avg_energy = 0.0;
-    if (sce->ics.num_windows == 1) {
-        start = 0;
-        for (i = 0; i < 1024; i++) {
-            if (i - start >= sce->ics.swb_sizes[curband]) {
-                start += sce->ics.swb_sizes[curband];
-                curband++;
-            }
-            if (sce->coeffs[i]) {
-                avg_energy += sce->coeffs[i] * sce->coeffs[i];
-                last = i;
-                lastband = curband;
-            }
-        }
+    FFPsyBand *band;
+    int w, g, w2, i;
+    int wlen = 1024 / sce->ics.num_windows;
+    int bandwidth, cutoff;
+    float *PNS = &s->scoefs[0*128], *PNS34 = &s->scoefs[1*128];
+    float *NOR34 = &s->scoefs[3*128];
+    uint8_t nextband[128];
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
+    const float thr_mult = NOISE_LAMBDA_REPLACE*(100.0f/lambda);
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float dist_bias = av_clipf(4.f * 120 / lambda, 0.25f, 4.0f);
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & AV_CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int prev = -1000, prev_sf = -1;
+    int frame_bit_rate = (avctx->flags & AV_CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
     } else {
-        for (w = 0; w < 8; w++) {
-            const float *coeffs = sce->coeffs + w*128;
-            start = 0;
-            for (i = 0; i < 128; i++) {
-                if (i - start >= sce->ics.swb_sizes[curband]) {
-                    start += sce->ics.swb_sizes[curband];
-                    curband++;
-                }
-                if (coeffs[i]) {
-                    avg_energy += coeffs[i] * coeffs[i];
-                    last = FFMAX(last, i);
-                    lastband = FFMAX(lastband, curband);
-                }
-            }
-        }
-    }
-    last++;
-    avg_energy /= last;
-    if (avg_energy == 0.0f) {
-        for (i = 0; i < FF_ARRAY_ELEMS(sce->sf_idx); i++)
-            sce->sf_idx[i] = SCALE_ONE_POS;
-        return;
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
     }
+
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
+    ff_init_nextband_map(sce, nextband);
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
+        int wstart = w*128;
         for (g = 0; g < sce->ics.num_swb; g++) {
-            float *coefs   = sce->coeffs + start;
-            const int size = sce->ics.swb_sizes[g];
-            int start2 = start, end2 = start + size, peakpos = start;
-            float maxval = -1, thr = 0.0f, t;
-            maxq[w*16+g] = 0.0f;
-            if (g > lastband) {
-                maxq[w*16+g] = 0.0f;
-                start += size;
-                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++)
-                    memset(coefs + w2*128, 0, sizeof(coefs[0])*size);
+            int noise_sfi;
+            float dist1 = 0.0f, dist2 = 0.0f, noise_amp;
+            float pns_energy = 0.0f, pns_tgt_energy, energy_ratio, dist_thresh;
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
+            const int start = wstart+sce->ics.swb_offset[g];
+            const float freq = (start-wstart)*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || (start-wstart) >= cutoff) {
+                if (!sce->zeroes[w*16+g])
+                    prev_sf = sce->sf_idx[w*16+g];
                 continue;
             }
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                for (i = 0; i < size; i++) {
-                    float t = coefs[w2*128+i]*coefs[w2*128+i];
-                    maxq[w*16+g] = FFMAX(maxq[w*16+g], fabsf(coefs[w2*128 + i]));
-                    thr += t;
-                    if (sce->ics.num_windows == 1 && maxval < t) {
-                        maxval  = t;
-                        peakpos = start+i;
-                    }
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     = FFMIN(spread, band->spread);
+                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
+                } else {
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
                 }
             }
-            if (sce->ics.num_windows == 1) {
-                start2 = FFMAX(peakpos - 2, start2);
-                end2   = FFMIN(peakpos + 3, end2);
-            } else {
-                start2 -= start;
-                end2   -= start;
-            }
-            start += size;
-            thr = pow(thr / (avg_energy * (end2 - start2)), 0.3 + 0.1*(lastband - g) / lastband);
-            t   = 1.0 - (1.0 * start2 / last);
-            uplim[w*16+g] = distfact / (1.4 * thr + t*t*t + 0.075);
-        }
-    }
-    memset(sce->sf_idx, 0, sizeof(sce->sf_idx));
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *coefs  = sce->coeffs + start;
-            const float *scaled = s->scoefs   + start;
-            const int size      = sce->ics.swb_sizes[g];
-            int scf, prev_scf, step;
-            int min_scf = -1, max_scf = 256;
-            float curdiff;
-            if (maxq[w*16+g] < 21.544) {
-                sce->zeroes[w*16+g] = 1;
-                start += size;
+
+            /* Ramps down at ~8000Hz and loosens the dist threshold */
+            dist_thresh = av_clipf(2.5f*NOISE_LOW_LIMIT/freq, 0.5f, 2.5f) * dist_bias;
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             *
+             * At this stage, point 2 is relaxed for zeroed bands near the noise threshold (hole avoidance is more important)
+             */
+            if ((!sce->zeroes[w*16+g] && !ff_sfdelta_can_remove_band(sce, nextband, prev_sf, w*16+g)) ||
+                ((sce->zeroes[w*16+g] || !sce->band_alt[w*16+g]) && sfb_energy < threshold*sqrtf(1.0f/freq_boost)) || spread < spread_threshold ||
+                (!sce->zeroes[w*16+g] && sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost) ||
+                min_energy < pns_transient_energy_r * max_energy ) {
+                sce->pns_ener[w*16+g] = sfb_energy;
+                if (!sce->zeroes[w*16+g])
+                    prev_sf = sce->sf_idx[w*16+g];
                 continue;
             }
-            sce->zeroes[w*16+g] = 0;
-            scf  = prev_scf = av_clip(SCALE_ONE_POS - SCALE_DIV_512 - log2f(1/maxq[w*16+g])*16/3, 60, 218);
-            step = 16;
-            for (;;) {
-                float dist = 0.0f;
-                int quant_max;
-
-                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                    int b;
-                    dist += quantize_band_cost(s, coefs + w2*128,
-                                               scaled + w2*128,
-                                               sce->ics.swb_sizes[g],
-                                               scf,
-                                               ESC_BT,
-                                               lambda,
-                                               INFINITY,
-                                               &b);
-                    dist -= b;
-                }
-                dist *= 1.0f / 512.0f / lambda;
-                quant_max = quant(maxq[w*16+g], ff_aac_pow2sf_tab[POW_SF2_ZERO - scf + SCALE_ONE_POS - SCALE_DIV_512]);
-                if (quant_max >= 8191) { // too much, return to the previous quantizer
-                    sce->sf_idx[w*16+g] = prev_scf;
-                    break;
+
+            pns_tgt_energy = sfb_energy*FFMIN(1.0f, spread*spread);
+            noise_sfi = av_clip(roundf(log2f(pns_tgt_energy)*2), -100, 155); /* Quantize */
+            noise_amp = -ff_aac_pow2sf_tab[noise_sfi + POW_SF2_ZERO];    /* Dequantize */
+            if (prev != -1000) {
+                int noise_sfdiff = noise_sfi - prev + SCALE_DIFF_ZERO;
+                if (noise_sfdiff < 0 || noise_sfdiff > 2*SCALE_MAX_DIFF) {
+                    if (!sce->zeroes[w*16+g])
+                        prev_sf = sce->sf_idx[w*16+g];
+                    continue;
                 }
-                prev_scf = scf;
-                curdiff = fabsf(dist - uplim[w*16+g]);
-                if (curdiff <= 1.0f)
-                    step = 0;
-                else
-                    step = log2f(curdiff);
-                if (dist > uplim[w*16+g])
-                    step = -step;
-                scf += step;
-                scf = av_clip_uint8(scf);
-                step = scf - prev_scf;
-                if (FFABS(step) <= 1 || (step > 0 && scf >= max_scf) || (step < 0 && scf <= min_scf)) {
-                    sce->sf_idx[w*16+g] = av_clip(scf, min_scf, max_scf);
-                    break;
+            }
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                float band_energy, scale, pns_senergy;
+                const int start_c = (w+w2)*128+sce->ics.swb_offset[g];
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                for (i = 0; i < sce->ics.swb_sizes[g]; i++) {
+                    s->random_state  = lcg_random(s->random_state);
+                    PNS[i] = s->random_state;
                 }
-                if (step > 0)
-                    min_scf = prev_scf;
-                else
-                    max_scf = prev_scf;
+                band_energy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                scale = noise_amp/sqrtf(band_energy);
+                s->fdsp->vector_fmul_scalar(PNS, PNS, scale, sce->ics.swb_sizes[g]);
+                pns_senergy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                pns_energy += pns_senergy;
+                s->abs_pow34(NOR34, &sce->coeffs[start_c], sce->ics.swb_sizes[g]);
+                s->abs_pow34(PNS34, PNS, sce->ics.swb_sizes[g]);
+                dist1 += quantize_band_cost(s, &sce->coeffs[start_c],
+                                            NOR34,
+                                            sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g],
+                                            sce->band_alt[(w+w2)*16+g],
+                                            lambda/band->threshold, INFINITY, NULL, NULL, 0);
+                /* Estimate rd on average as 5 bits for SF, 4 for the CB, plus spread energy * lambda/thr */
+                dist2 += band->energy/(band->spread*band->spread)*lambda*dist_thresh/band->threshold;
+            }
+            if (g && sce->band_type[w*16+g-1] == NOISE_BT) {
+                dist2 += 5;
+            } else {
+                dist2 += 9;
+            }
+            energy_ratio = pns_tgt_energy/pns_energy; /* Compensates for quantization error */
+            sce->pns_ener[w*16+g] = energy_ratio*pns_tgt_energy;
+            if (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || (energy_ratio > 0.85f && energy_ratio < 1.25f && dist2 < dist1)) {
+                sce->band_type[w*16+g] = NOISE_BT;
+                sce->zeroes[w*16+g] = 0;
+                prev = noise_sfi;
+            } else {
+                if (!sce->zeroes[w*16+g])
+                    prev_sf = sce->sf_idx[w*16+g];
             }
-            start += size;
         }
     }
-    minq = sce->sf_idx[0] ? sce->sf_idx[0] : INT_MAX;
-    for (i = 1; i < 128; i++) {
-        if (!sce->sf_idx[i])
-            sce->sf_idx[i] = sce->sf_idx[i-1];
-        else
-            minq = FFMIN(minq, sce->sf_idx[i]);
-    }
-    if (minq == INT_MAX)
-        minq = 0;
-    minq = FFMIN(minq, SCALE_MAX_POS);
-    maxsf = FFMIN(minq + SCALE_MAX_DIFF, SCALE_MAX_POS);
-    for (i = 126; i >= 0; i--) {
-        if (!sce->sf_idx[i])
-            sce->sf_idx[i] = sce->sf_idx[i+1];
-        sce->sf_idx[i] = av_clip(sce->sf_idx[i], minq, maxsf);
-    }
 }
 
-static void search_for_quantizers_fast(AVCodecContext *avctx, AACEncContext *s,
-                                       SingleChannelElement *sce,
-                                       const float lambda)
+static void mark_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce)
 {
-    int i, w, w2, g;
-    int minq = 255;
+    FFPsyBand *band;
+    int w, g, w2;
+    int wlen = 1024 / sce->ics.num_windows;
+    int bandwidth, cutoff;
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & AV_CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int frame_bit_rate = (avctx->flags & AV_CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
+    } else {
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+    }
 
-    memset(sce->sf_idx, 0, sizeof(sce->sf_idx));
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         for (g = 0; g < sce->ics.num_swb; g++) {
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
+            const int start = sce->ics.swb_offset[g];
+            const float freq = start*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || start >= cutoff) {
+                sce->can_pns[w*16+g] = 0;
+                continue;
+            }
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                if (band->energy <= band->threshold) {
-                    sce->sf_idx[(w+w2)*16+g] = 218;
-                    sce->zeroes[(w+w2)*16+g] = 1;
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     = FFMIN(spread, band->spread);
+                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
                 } else {
-                    sce->sf_idx[(w+w2)*16+g] = av_clip(SCALE_ONE_POS - SCALE_DIV_512 + log2f(band->threshold), 80, 218);
-                    sce->zeroes[(w+w2)*16+g] = 0;
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
                 }
-                minq = FFMIN(minq, sce->sf_idx[(w+w2)*16+g]);
+            }
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             */
+            sce->pns_ener[w*16+g] = sfb_energy;
+            if (sfb_energy < threshold*sqrtf(1.5f/freq_boost) || spread < spread_threshold || min_energy < pns_transient_energy_r * max_energy) {
+                sce->can_pns[w*16+g] = 0;
+            } else {
+                sce->can_pns[w*16+g] = 1;
             }
         }
     }
-    for (i = 0; i < 128; i++) {
-        sce->sf_idx[i] = 140;
-        //av_clip(sce->sf_idx[i], minq, minq + SCALE_MAX_DIFF - 1);
-    }
-    //set the same quantizers inside window groups
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
-        for (g = 0;  g < sce->ics.num_swb; g++)
-            for (w2 = 1; w2 < sce->ics.group_len[w]; w2++)
-                sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-static void search_for_ms(AACEncContext *s, ChannelElement *cpe,
-                          const float lambda)
+static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
 {
-    int start = 0, i, w, w2, g;
-    float M[128], S[128];
-    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
+    uint8_t nextband0[128], nextband1[128];
+    float *M   = s->scoefs + 128*0, *S   = s->scoefs + 128*1;
+    float *L34 = s->scoefs + 128*2, *R34 = s->scoefs + 128*3;
+    float *M34 = s->scoefs + 128*4, *S34 = s->scoefs + 128*5;
+    const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
     SingleChannelElement *sce0 = &cpe->ch[0];
     SingleChannelElement *sce1 = &cpe->ch[1];
     if (!cpe->common_window)
         return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce0, nextband0);
+    ff_init_nextband_map(sce1, nextband1);
+
+    prev_mid = sce0->sf_idx[0];
+    prev_side = sce1->sf_idx[0];
     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
-        for (g = 0;  g < sce0->ics.num_swb; g++) {
-            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
-                float dist1 = 0.0f, dist2 = 0.0f;
+        start = 0;
+        for (g = 0; g < sce0->ics.num_swb; g++) {
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            if (!cpe->is_mask[w*16+g])
+                cpe->ms_mask[w*16+g] = 0;
+            if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
-                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
-                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
-                    float minthr = FFMIN(band0->threshold, band1->threshold);
-                    float maxthr = FFMAX(band0->threshold, band1->threshold);
                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
-                        M[i] = (sce0->coeffs[start+w2*128+i]
-                              + sce1->coeffs[start+w2*128+i]) * 0.5;
+                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
                         S[i] =  M[i]
-                              - sce1->coeffs[start+w2*128+i];
+                              - sce1->coeffs[start+(w+w2)*128+i];
+                    }
+                    s->abs_pow34(M34, M, sce0->ics.swb_sizes[g]);
+                    s->abs_pow34(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
+
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
+                        && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
+                            || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+
+                        s->abs_pow34(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        s->abs_pow34(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        s->abs_pow34(M34, M,                         sce0->ics.swb_sizes[g]);
+                        s->abs_pow34(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[w*16+g],
+                                                    sce0->band_type[w*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[w*16+g],
+                                                    sce1->band_type[w*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    mididx,
+                                                    midcb,
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sididx,
+                                                    sidcb,
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= b1+b2;
+                        dist2 -= b3+b4;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
+                    if (cpe->ms_mask[w*16+g]) {
+                        if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
+                            /* ms_mask unneeded, and it confuses some decoders */
+                            cpe->ms_mask[w*16+g] = 0;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
                     }
-                    abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
-                    dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
-                                                L34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL);
-                    dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
-                                                R34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL);
-                    dist2 += quantize_band_cost(s, M,
-                                                M34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL);
-                    dist2 += quantize_band_cost(s, S,
-                                                S34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL);
                 }
-                cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                prev_mid = sce0->sf_idx[w*16+g];
+            if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                prev_side = sce1->sf_idx[w*16+g];
             start += sce0->ics.swb_sizes[g];
         }
     }
 }
 
-const AACCoefficientsEncoder ff_aac_coders[] = {
-    {
-        search_for_quantizers_faac,
-        encode_window_bands_info,
-        quantize_and_encode_band,
-        search_for_ms,
-    },
-    {
+const AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
+    [AAC_CODER_ANMR] = {
         search_for_quantizers_anmr,
         encode_window_bands_info,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_ltp_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_pred,
+        ff_aac_adjust_common_ltp,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        ff_aac_update_ltp,
+        ff_aac_ltp_insert_new_frame,
+        set_special_band_scalefactors,
+        search_for_pns,
+        mark_pns,
+        ff_aac_search_for_tns,
+        ff_aac_search_for_ltp,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_TWOLOOP] = {
         search_for_quantizers_twoloop,
         codebook_trellis_rate,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_ltp_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_pred,
+        ff_aac_adjust_common_ltp,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        ff_aac_update_ltp,
+        ff_aac_ltp_insert_new_frame,
+        set_special_band_scalefactors,
+        search_for_pns,
+        mark_pns,
+        ff_aac_search_for_tns,
+        ff_aac_search_for_ltp,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_FAST] = {
         search_for_quantizers_fast,
-        encode_window_bands_info,
+        codebook_trellis_rate,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_ltp_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_pred,
+        ff_aac_adjust_common_ltp,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        ff_aac_update_ltp,
+        ff_aac_ltp_insert_new_frame,
+        set_special_band_scalefactors,
+        search_for_pns,
+        mark_pns,
+        ff_aac_search_for_tns,
+        ff_aac_search_for_ltp,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
 };
diff --git a/libavcodec/aaccoder_trellis.h b/libavcodec/aaccoder_trellis.h
new file mode 100644
index 0000000..940ebf0
--- /dev/null
+++ b/libavcodec/aaccoder_trellis.h
@@ -0,0 +1,192 @@
+/*
+ * AAC encoder trellis codebook selector
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder trellis codebook selector
+ * @author Konstantin Shishkov
+ */
+
+/**
+ * This file contains a template for the codebook_trellis_rate selector function.
+ * It needs to be provided, externally, as an already included declaration,
+ * the following functions from aacenc_quantization/util.h. They're not included
+ * explicitly here to make it possible to provide alternative implementations:
+ *  - quantize_band_cost_bits
+ *  - abs_pow34_v
+ */
+
+#ifndef AVCODEC_AACCODER_TRELLIS_H
+#define AVCODEC_AACCODER_TRELLIS_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+
+/**
+ * structure used in optimal codebook search
+ */
+typedef struct TrellisBandCodingPath {
+    int prev_idx; ///< pointer to the previous path point
+    float cost;   ///< path cost
+    int run;
+} TrellisBandCodingPath;
+
+
+static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
+                                  int win, int group_len, const float lambda)
+{
+    TrellisBandCodingPath path[120][CB_TOT_ALL];
+    int w, swb, cb, start, size;
+    int i, j;
+    const int max_sfb  = sce->ics.max_sfb;
+    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
+    const int run_esc  = (1 << run_bits) - 1;
+    int idx, ppos, count;
+    int stackrun[120], stackcb[120], stack_len;
+    float next_minbits = INFINITY;
+    int next_mincb = 0;
+
+    s->abs_pow34(s->scoefs, sce->coeffs, 1024);
+    start = win*128;
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
+        path[0][cb].cost     = run_bits+4;
+        path[0][cb].prev_idx = -1;
+        path[0][cb].run      = 0;
+    }
+    for (swb = 0; swb < max_sfb; swb++) {
+        size = sce->ics.swb_sizes[swb];
+        if (sce->zeroes[win*16 + swb]) {
+            float cost_stay_here = path[swb][0].cost;
+            float cost_get_here  = next_minbits + run_bits + 4;
+            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
+                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
+                cost_stay_here += run_bits;
+            if (cost_get_here < cost_stay_here) {
+                path[swb+1][0].prev_idx = next_mincb;
+                path[swb+1][0].cost     = cost_get_here;
+                path[swb+1][0].run      = 1;
+            } else {
+                path[swb+1][0].prev_idx = 0;
+                path[swb+1][0].cost     = cost_stay_here;
+                path[swb+1][0].run      = path[swb][0].run + 1;
+            }
+            next_minbits = path[swb+1][0].cost;
+            next_mincb = 0;
+            for (cb = 1; cb < CB_TOT_ALL; cb++) {
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+        } else {
+            float minbits = next_minbits;
+            int mincb = next_mincb;
+            int startcb = sce->band_type[win*16+swb];
+            startcb = aac_cb_in_map[startcb];
+            next_minbits = INFINITY;
+            next_mincb = 0;
+            for (cb = 0; cb < startcb; cb++) {
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
+                float cost_stay_here, cost_get_here;
+                float bits = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) {
+                    path[swb+1][cb].cost = 61450;
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].run = 0;
+                    continue;
+                }
+                for (w = 0; w < group_len; w++) {
+                    bits += quantize_band_cost_bits(s, &sce->coeffs[start + w*128],
+                                               &s->scoefs[start + w*128], size,
+                                               sce->sf_idx[win*16+swb],
+                                               aac_cb_out_map[cb],
+                                               0, INFINITY, NULL, NULL, 0);
+                }
+                cost_stay_here = path[swb][cb].cost + bits;
+                cost_get_here  = minbits            + bits + run_bits + 4;
+                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
+                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
+                    cost_stay_here += run_bits;
+                if (cost_get_here < cost_stay_here) {
+                    path[swb+1][cb].prev_idx = mincb;
+                    path[swb+1][cb].cost     = cost_get_here;
+                    path[swb+1][cb].run      = 1;
+                } else {
+                    path[swb+1][cb].prev_idx = cb;
+                    path[swb+1][cb].cost     = cost_stay_here;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                }
+                if (path[swb+1][cb].cost < next_minbits) {
+                    next_minbits = path[swb+1][cb].cost;
+                    next_mincb = cb;
+                }
+            }
+        }
+        start += sce->ics.swb_sizes[swb];
+    }
+
+    //convert resulting path from backward-linked list
+    stack_len = 0;
+    idx       = 0;
+    for (cb = 1; cb < CB_TOT_ALL; cb++)
+        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
+            idx = cb;
+    ppos = max_sfb;
+    while (ppos > 0) {
+        av_assert1(idx >= 0);
+        cb = idx;
+        stackrun[stack_len] = path[ppos][cb].run;
+        stackcb [stack_len] = cb;
+        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
+        ppos -= path[ppos][cb].run;
+        stack_len++;
+    }
+    //perform actual band info encoding
+    start = 0;
+    for (i = stack_len - 1; i >= 0; i--) {
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
+        count = stackrun[i];
+        memset(sce->zeroes + win*16 + start, !cb, count);
+        //XXX: memset when band_type is also uint8_t
+        for (j = 0; j < count; j++) {
+            sce->band_type[win*16 + start] = cb;
+            start++;
+        }
+        while (count >= run_esc) {
+            put_bits(&s->pb, run_bits, run_esc);
+            count -= run_esc;
+        }
+        put_bits(&s->pb, run_bits, count);
+    }
+}
+
+
+#endif /* AVCODEC_AACCODER_TRELLIS_H */
diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h
new file mode 100644
index 0000000..8e1bc88
--- /dev/null
+++ b/libavcodec/aaccoder_twoloop.h
@@ -0,0 +1,763 @@
+/*
+ * AAC encoder twoloop coder
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder twoloop coder
+ * @author Konstantin Shishkov, Claudio Freire
+ */
+
+/**
+ * This file contains a template for the twoloop coder function.
+ * It needs to be provided, externally, as an already included declaration,
+ * the following functions from aacenc_quantization/util.h. They're not included
+ * explicitly here to make it possible to provide alternative implementations:
+ *  - quantize_band_cost
+ *  - abs_pow34_v
+ *  - find_max_val
+ *  - find_min_book
+ *  - find_form_factor
+ */
+
+#ifndef AVCODEC_AACCODER_TWOLOOP_H
+#define AVCODEC_AACCODER_TWOLOOP_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "mathops.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+
+/** Frequency in Hz for lower limit of noise substitution **/
+#define NOISE_LOW_LIMIT 4000
+
+#define sclip(x) av_clip(x,60,218)
+
+/* Reflects the cost to change codebooks */
+static inline int ff_pns_bits(SingleChannelElement *sce, int w, int g)
+{
+    return (!g || !sce->zeroes[w*16+g-1] || !sce->can_pns[w*16+g-1]) ? 9 : 5;
+}
+
+/**
+ * two-loop quantizers search taken from ISO 13818-7 Appendix C
+ */
+static void search_for_quantizers_twoloop(AVCodecContext *avctx,
+                                          AACEncContext *s,
+                                          SingleChannelElement *sce,
+                                          const float lambda)
+{
+    int start = 0, i, w, w2, g, recomprd;
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & AV_CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+    int refbits = destbits;
+    int toomanybits, toofewbits;
+    char nzs[128];
+    uint8_t nextband[128];
+    int maxsf[128], minsf[128];
+    float dists[128] = { 0 }, qenergies[128] = { 0 }, uplims[128], euplims[128], energies[128];
+    float maxvals[128], spread_thr_r[128];
+    float min_spread_thr_r, max_spread_thr_r;
+
+    /**
+     * rdlambda controls the maximum tolerated distortion. Twoloop
+     * will keep iterating until it fails to lower it or it reaches
+     * ulimit * rdlambda. Keeping it low increases quality on difficult
+     * signals, but lower it too much, and bits will be taken from weak
+     * signals, creating "holes". A balance is necessary.
+     * rdmax and rdmin specify the relative deviation from rdlambda
+     * allowed for tonality compensation
+     */
+    float rdlambda = av_clipf(2.0f * 120.f / lambda, 0.0625f, 16.0f);
+    const float nzslope = 1.5f;
+    float rdmin = 0.03125f;
+    float rdmax = 1.0f;
+
+    /**
+     * sfoffs controls an offset of optmium allocation that will be
+     * applied based on lambda. Keep it real and modest, the loop
+     * will take care of the rest, this just accelerates convergence
+     */
+    float sfoffs = av_clipf(log2f(120.0f / lambda) * 4.0f, -5, 10);
+
+    int fflag, minscaler, maxscaler, nminscaler;
+    int its  = 0;
+    int maxits = 30;
+    int allz = 0;
+    int tbits;
+    int cutoff = 1024;
+    int pns_start_pos;
+    int prev;
+
+    /**
+     * zeroscale controls a multiplier of the threshold, if band energy
+     * is below this, a zero is forced. Keep it lower than 1, unless
+     * low lambda is used, because energy < threshold doesn't mean there's
+     * no audible signal outright, it's just energy. Also make it rise
+     * slower than rdlambda, as rdscale has due compensation with
+     * noisy band depriorization below, whereas zeroing logic is rather dumb
+     */
+    float zeroscale;
+    if (lambda > 120.f) {
+        zeroscale = av_clipf(powf(120.f / lambda, 0.25f), 0.0625f, 1.0f);
+    } else {
+        zeroscale = 1.f;
+    }
+
+    if (s->psy.bitres.alloc >= 0) {
+        /**
+         * Psy granted us extra bits to use, from the reservoire
+         * adjust for lambda except what psy already did
+         */
+        destbits = s->psy.bitres.alloc
+            * (lambda / (avctx->global_quality ? avctx->global_quality : 120));
+    }
+
+    if (avctx->flags & AV_CODEC_FLAG_QSCALE) {
+        /**
+         * Constant Q-scale doesn't compensate MS coding on its own
+         * No need to be overly precise, this only controls RD
+         * adjustment CB limits when going overboard
+         */
+        if (s->options.mid_side && s->cur_type == TYPE_CPE)
+            destbits *= 2;
+
+        /**
+         * When using a constant Q-scale, don't adjust bits, just use RD
+         * Don't let it go overboard, though... 8x psy target is enough
+         */
+        toomanybits = 5800;
+        toofewbits = destbits / 16;
+
+        /** Don't offset scalers, just RD */
+        sfoffs = sce->ics.num_windows - 1;
+        rdlambda = sqrtf(rdlambda);
+
+        /** search further */
+        maxits *= 2;
+    } else {
+        /* When using ABR, be strict, but a reasonable leeway is
+         * critical to allow RC to smoothly track desired bitrate
+         * without sudden quality drops that cause audible artifacts.
+         * Symmetry is also desirable, to avoid systematic bias.
+         */
+        toomanybits = destbits + destbits/8;
+        toofewbits = destbits - destbits/8;
+
+        sfoffs = 0;
+        rdlambda = sqrtf(rdlambda);
+    }
+
+    /** and zero out above cutoff frequency */
+    {
+        int wlen = 1024 / sce->ics.num_windows;
+        int bandwidth;
+
+        /**
+         * Scale, psy gives us constant quality, this LP only scales
+         * bitrate by lambda, so we save bits on subjectively unimportant HF
+         * rather than increase quantization noise. Adjust nominal bitrate
+         * to effective bitrate according to encoding parameters,
+         * AAC_CUTOFF_FROM_BITRATE is calibrated for effective bitrate.
+         */
+        float rate_bandwidth_multiplier = 1.5f;
+        int frame_bit_rate = (avctx->flags & AV_CODEC_FLAG_QSCALE)
+            ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+            : (avctx->bit_rate / avctx->channels);
+
+        /** Compensate for extensions that increase efficiency */
+        if (s->options.pns || s->options.intensity_stereo)
+            frame_bit_rate *= 1.15f;
+
+        if (avctx->cutoff > 0) {
+            bandwidth = avctx->cutoff;
+        } else {
+            bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+            s->psy.cutoff = bandwidth;
+        }
+
+        cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+        pns_start_pos = NOISE_LOW_LIMIT * 2 * wlen / avctx->sample_rate;
+    }
+
+    /**
+     * for values above this the decoder might end up in an endless loop
+     * due to always having more bits than what can be encoded.
+     */
+    destbits = FFMIN(destbits, 5800);
+    toomanybits = FFMIN(toomanybits, 5800);
+    toofewbits = FFMIN(toofewbits, 5800);
+    /**
+     * XXX: some heuristic to determine initial quantizers will reduce search time
+     * determine zero bands and upper distortion limits
+     */
+    min_spread_thr_r = -1;
+    max_spread_thr_r = -1;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+            int nz = 0;
+            float uplim = 0.0f, energy = 0.0f, spread = 0.0f;
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                if (start >= cutoff || band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f) {
+                    sce->zeroes[(w+w2)*16+g] = 1;
+                    continue;
+                }
+                nz = 1;
+            }
+            if (!nz) {
+                uplim = 0.0f;
+            } else {
+                nz = 0;
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                    FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                    if (band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f)
+                        continue;
+                    uplim += band->threshold;
+                    energy += band->energy;
+                    spread += band->spread;
+                    nz++;
+                }
+            }
+            uplims[w*16+g] = uplim;
+            energies[w*16+g] = energy;
+            nzs[w*16+g] = nz;
+            sce->zeroes[w*16+g] = !nz;
+            allz |= nz;
+            if (nz && sce->can_pns[w*16+g]) {
+                spread_thr_r[w*16+g] = energy * nz / (uplim * spread);
+                if (min_spread_thr_r < 0) {
+                    min_spread_thr_r = max_spread_thr_r = spread_thr_r[w*16+g];
+                } else {
+                    min_spread_thr_r = FFMIN(min_spread_thr_r, spread_thr_r[w*16+g]);
+                    max_spread_thr_r = FFMAX(max_spread_thr_r, spread_thr_r[w*16+g]);
+                }
+            }
+        }
+    }
+
+    /** Compute initial scalers */
+    minscaler = 65535;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g]) {
+                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
+                continue;
+            }
+            /**
+             * log2f-to-distortion ratio is, technically, 2 (1.5db = 4, but it's power vs level so it's 2).
+             * But, as offsets are applied, low-frequency signals are too sensitive to the induced distortion,
+             * so we make scaling more conservative by choosing a lower log2f-to-distortion ratio, and thus
+             * more robust.
+             */
+            sce->sf_idx[w*16+g] = av_clip(
+                SCALE_ONE_POS
+                    + 1.75*log2f(FFMAX(0.00125f,uplims[w*16+g]) / sce->ics.swb_sizes[g])
+                    + sfoffs,
+                60, SCALE_MAX_POS);
+            minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+        }
+    }
+
+    /** Clip */
+    minscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
+        for (g = 0;  g < sce->ics.num_swb; g++)
+            if (!sce->zeroes[w*16+g])
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF - 1);
+
+    if (!allz)
+        return;
+    s->abs_pow34(s->scoefs, sce->coeffs, 1024);
+    ff_quantize_band_cost_cache_init(s);
+
+    for (i = 0; i < sizeof(minsf) / sizeof(minsf[0]); ++i)
+        minsf[i] = 0;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            const float *scaled = s->scoefs + start;
+            int minsfidx;
+            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
+            if (maxvals[w*16+g] > 0) {
+                minsfidx = coef2minsf(maxvals[w*16+g]);
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++)
+                    minsf[(w+w2)*16+g] = minsfidx;
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    /**
+     * Scale uplims to match rate distortion to quality
+     * bu applying noisy band depriorization and tonal band priorization.
+     * Maxval-energy ratio gives us an idea of how noisy/tonal the band is.
+     * If maxval^2 ~ energy, then that band is mostly noise, and we can relax
+     * rate distortion requirements.
+     */
+    memcpy(euplims, uplims, sizeof(euplims));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** psy already priorizes transients to some extent */
+        float de_psy_factor = (sce->ics.num_windows > 1) ? 8.0f / sce->ics.group_len[w] : 1.0f;
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (nzs[g] > 0) {
+                float cleanup_factor = ff_sqrf(av_clipf(start / (cutoff * 0.75f), 1.0f, 2.0f));
+                float energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    nzslope * cleanup_factor);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to priorize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                uplims[w*16+g] *= av_clipf(rdlambda * energy2uplim, rdmin, rdmax)
+                                  * sce->ics.group_len[w];
+
+                energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    2.0f);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to priorize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                euplims[w*16+g] *= av_clipf(rdlambda * energy2uplim * sce->ics.group_len[w],
+                    0.5f, 1.0f);
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    for (i = 0; i < sizeof(maxsf) / sizeof(maxsf[0]); ++i)
+        maxsf[i] = SCALE_MAX_POS;
+
+    //perform two-loop search
+    //outer loop - improve quality
+    do {
+        //inner loop - quantize spectrum to fit into given number of bits
+        int overdist;
+        int qstep = its ? 1 : 32;
+        do {
+            int changed = 0;
+            prev = -1;
+            recomprd = 0;
+            tbits = 0;
+            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                start = w*128;
+                for (g = 0;  g < sce->ics.num_swb; g++) {
+                    const float *coefs = &sce->coeffs[start];
+                    const float *scaled = &s->scoefs[start];
+                    int bits = 0;
+                    int cb;
+                    float dist = 0.0f;
+                    float qenergy = 0.0f;
+
+                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                        start += sce->ics.swb_sizes[g];
+                        if (sce->can_pns[w*16+g]) {
+                            /** PNS isn't free */
+                            tbits += ff_pns_bits(sce, w, g);
+                        }
+                        continue;
+                    }
+                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                        int b;
+                        float sqenergy;
+                        dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                   scaled + w2*128,
+                                                   sce->ics.swb_sizes[g],
+                                                   sce->sf_idx[w*16+g],
+                                                   cb,
+                                                   1.0f,
+                                                   INFINITY,
+                                                   &b, &sqenergy,
+                                                   0);
+                        bits += b;
+                        qenergy += sqenergy;
+                    }
+                    dists[w*16+g] = dist - bits;
+                    qenergies[w*16+g] = qenergy;
+                    if (prev != -1) {
+                        int sfdiff = av_clip(sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO, 0, 2*SCALE_MAX_DIFF);
+                        bits += ff_aac_scalefactor_bits[sfdiff];
+                    }
+                    tbits += bits;
+                    start += sce->ics.swb_sizes[g];
+                    prev = sce->sf_idx[w*16+g];
+                }
+            }
+            if (tbits > toomanybits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] < (SCALE_MAX_POS - SCALE_DIV_512)) {
+                        int maxsf_i = (tbits > 5800) ? SCALE_MAX_POS : maxsf[i];
+                        int new_sf = FFMIN(maxsf_i, sce->sf_idx[i] + qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
+            } else if (tbits < toofewbits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] > SCALE_ONE_POS) {
+                        int new_sf = FFMAX3(minsf[i], SCALE_ONE_POS, sce->sf_idx[i] - qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
+            }
+            qstep >>= 1;
+            if (!qstep && tbits > toomanybits && sce->sf_idx[0] < 217 && changed)
+                qstep = 1;
+        } while (qstep);
+
+        overdist = 1;
+        fflag = tbits < toofewbits;
+        for (i = 0; i < 2 && (overdist || recomprd); ++i) {
+            if (recomprd) {
+                /** Must recompute distortion */
+                prev = -1;
+                tbits = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    start = w*128;
+                    for (g = 0;  g < sce->ics.num_swb; g++) {
+                        const float *coefs = sce->coeffs + start;
+                        const float *scaled = s->scoefs + start;
+                        int bits = 0;
+                        int cb;
+                        float dist = 0.0f;
+                        float qenergy = 0.0f;
+
+                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                            start += sce->ics.swb_sizes[g];
+                            if (sce->can_pns[w*16+g]) {
+                                /** PNS isn't free */
+                                tbits += ff_pns_bits(sce, w, g);
+                            }
+                            continue;
+                        }
+                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                            int b;
+                            float sqenergy;
+                            dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                    scaled + w2*128,
+                                                    sce->ics.swb_sizes[g],
+                                                    sce->sf_idx[w*16+g],
+                                                    cb,
+                                                    1.0f,
+                                                    INFINITY,
+                                                    &b, &sqenergy,
+                                                    0);
+                            bits += b;
+                            qenergy += sqenergy;
+                        }
+                        dists[w*16+g] = dist - bits;
+                        qenergies[w*16+g] = qenergy;
+                        if (prev != -1) {
+                            int sfdiff = av_clip(sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO, 0, 2*SCALE_MAX_DIFF);
+                            bits += ff_aac_scalefactor_bits[sfdiff];
+                        }
+                        tbits += bits;
+                        start += sce->ics.swb_sizes[g];
+                        prev = sce->sf_idx[w*16+g];
+                    }
+                }
+            }
+            if (!i && s->options.pns && its > maxits/2 && tbits > toofewbits) {
+                float maxoverdist = 0.0f;
+                float ovrfactor = 1.f+(maxits-its)*16.f/maxits;
+                overdist = recomprd = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                        if (!sce->zeroes[w*16+g] && sce->sf_idx[w*16+g] > SCALE_ONE_POS && dists[w*16+g] > uplims[w*16+g]*ovrfactor) {
+                            float ovrdist = dists[w*16+g] / FFMAX(uplims[w*16+g],euplims[w*16+g]);
+                            maxoverdist = FFMAX(maxoverdist, ovrdist);
+                            overdist++;
+                        }
+                    }
+                }
+                if (overdist) {
+                    /* We have overdistorted bands, trade for zeroes (that can be noise)
+                     * Zero the bands in the lowest 1.25% spread-energy-threshold ranking
+                     */
+                    float minspread = max_spread_thr_r;
+                    float maxspread = min_spread_thr_r;
+                    float zspread;
+                    int zeroable = 0;
+                    int zeroed = 0;
+                    int maxzeroed, zloop;
+                    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                            if (start >= pns_start_pos && !sce->zeroes[w*16+g] && sce->can_pns[w*16+g]) {
+                                minspread = FFMIN(minspread, spread_thr_r[w*16+g]);
+                                maxspread = FFMAX(maxspread, spread_thr_r[w*16+g]);
+                                zeroable++;
+                            }
+                        }
+                    }
+                    zspread = (maxspread-minspread) * 0.0125f + minspread;
+                    /* Don't PNS everything even if allowed. It suppresses bit starvation signals from RC,
+                     * and forced the hand of the later search_for_pns step.
+                     * Instead, PNS a fraction of the spread_thr_r range depending on how starved for bits we are,
+                     * and leave further PNSing to search_for_pns if worthwhile.
+                     */
+                    zspread = FFMIN3(min_spread_thr_r * 8.f, zspread,
+                        ((toomanybits - tbits) * min_spread_thr_r + (tbits - toofewbits) * max_spread_thr_r) / (toomanybits - toofewbits + 1));
+                    maxzeroed = FFMIN(zeroable, FFMAX(1, (zeroable * its + maxits - 1) / (2 * maxits)));
+                    for (zloop = 0; zloop < 2; zloop++) {
+                        /* Two passes: first distorted stuff - two birds in one shot and all that,
+                         * then anything viable. Viable means not zero, but either CB=zero-able
+                         * (too high SF), not SF <= 1 (that means we'd be operating at very high
+                         * quality, we don't want PNS when doing VHQ), PNS allowed, and within
+                         * the lowest ranking percentile.
+                         */
+                        float loopovrfactor = (zloop) ? 1.0f : ovrfactor;
+                        int loopminsf = (zloop) ? (SCALE_ONE_POS - SCALE_DIV_512) : SCALE_ONE_POS;
+                        int mcb;
+                        for (g = sce->ics.num_swb-1; g > 0 && zeroed < maxzeroed; g--) {
+                            if (sce->ics.swb_offset[g] < pns_start_pos)
+                                continue;
+                            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                                if (!sce->zeroes[w*16+g] && sce->can_pns[w*16+g] && spread_thr_r[w*16+g] <= zspread
+                                    && sce->sf_idx[w*16+g] > loopminsf
+                                    && (dists[w*16+g] > loopovrfactor*uplims[w*16+g] || !(mcb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]))
+                                        || (mcb <= 1 && dists[w*16+g] > FFMIN(uplims[w*16+g], euplims[w*16+g]))) ) {
+                                    sce->zeroes[w*16+g] = 1;
+                                    sce->band_type[w*16+g] = 0;
+                                    zeroed++;
+                                }
+                            }
+                        }
+                    }
+                    if (zeroed)
+                        recomprd = fflag = 1;
+                } else {
+                    overdist = 0;
+                }
+            }
+        }
+
+        minscaler = SCALE_MAX_POS;
+        maxscaler = 0;
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0;  g < sce->ics.num_swb; g++) {
+                if (!sce->zeroes[w*16+g]) {
+                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+                    maxscaler = FFMAX(maxscaler, sce->sf_idx[w*16+g]);
+                }
+            }
+        }
+
+        minscaler = nminscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+        prev = -1;
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            /** Start with big steps, end up fine-tunning */
+            int depth = (its > maxits/2) ? ((its > maxits*2/3) ? 1 : 3) : 10;
+            int edepth = depth+2;
+            float uplmax = its / (maxits*0.25f) + 1.0f;
+            uplmax *= (tbits > destbits) ? FFMIN(2.0f, tbits / (float)FFMAX(1,destbits)) : 1.0f;
+            start = w * 128;
+            for (g = 0; g < sce->ics.num_swb; g++) {
+                int prevsc = sce->sf_idx[w*16+g];
+                if (prev < 0 && !sce->zeroes[w*16+g])
+                    prev = sce->sf_idx[0];
+                if (!sce->zeroes[w*16+g]) {
+                    const float *coefs = sce->coeffs + start;
+                    const float *scaled = s->scoefs + start;
+                    int cmb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    int mindeltasf = FFMAX(0, prev - SCALE_MAX_DIFF);
+                    int maxdeltasf = FFMIN(SCALE_MAX_POS - SCALE_DIV_512, prev + SCALE_MAX_DIFF);
+                    if ((!cmb || dists[w*16+g] > uplims[w*16+g]) && sce->sf_idx[w*16+g] > FFMAX(mindeltasf, minsf[w*16+g])) {
+                        /* Try to make sure there is some energy in every nonzero band
+                         * NOTE: This algorithm must be forcibly imbalanced, pushing harder
+                         *  on holes or more distorted bands at first, otherwise there's
+                         *  no net gain (since the next iteration will offset all bands
+                         *  on the opposite direction to compensate for extra bits)
+                         */
+                        for (i = 0; i < edepth && sce->sf_idx[w*16+g] > mindeltasf; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            int mb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1);
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                            dist = qenergy = 0.f;
+                            bits = 0;
+                            if (!cb) {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g]-1, maxsf[w*16+g]);
+                            } else if (i >= depth && dists[w*16+g] < euplims[w*16+g]) {
+                                break;
+                            }
+                            /* !g is the DC band, it's important, since quantization error here
+                             * applies to less than a cycle, it creates horrible intermodulation
+                             * distortion if it doesn't stick to what psy requests
+                             */
+                            if (!g && sce->ics.num_windows > 1 && dists[w*16+g] >= euplims[w*16+g])
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g], maxsf[w*16+g]);
+                            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                int b;
+                                float sqenergy;
+                                dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                        scaled + w2*128,
+                                                        sce->ics.swb_sizes[g],
+                                                        sce->sf_idx[w*16+g]-1,
+                                                        cb,
+                                                        1.0f,
+                                                        INFINITY,
+                                                        &b, &sqenergy,
+                                                        0);
+                                bits += b;
+                                qenergy += sqenergy;
+                            }
+                            sce->sf_idx[w*16+g]--;
+                            dists[w*16+g] = dist - bits;
+                            qenergies[w*16+g] = qenergy;
+                            if (mb && (sce->sf_idx[w*16+g] < mindeltasf || (
+                                    (dists[w*16+g] < FFMIN(uplmax*uplims[w*16+g], euplims[w*16+g]))
+                                    && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                                ) )) {
+                                break;
+                            }
+                        }
+                    } else if (tbits > toofewbits && sce->sf_idx[w*16+g] < FFMIN(maxdeltasf, maxsf[w*16+g])
+                            && (dists[w*16+g] < FFMIN(euplims[w*16+g], uplims[w*16+g]))
+                            && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                        ) {
+                        /** Um... over target. Save bits for more important stuff. */
+                        for (i = 0; i < depth && sce->sf_idx[w*16+g] < maxdeltasf; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]+1);
+                            if (cb > 0) {
+                                dist = qenergy = 0.f;
+                                bits = 0;
+                                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                    int b;
+                                    float sqenergy;
+                                    dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                            scaled + w2*128,
+                                                            sce->ics.swb_sizes[g],
+                                                            sce->sf_idx[w*16+g]+1,
+                                                            cb,
+                                                            1.0f,
+                                                            INFINITY,
+                                                            &b, &sqenergy,
+                                                            0);
+                                    bits += b;
+                                    qenergy += sqenergy;
+                                }
+                                dist -= bits;
+                                if (dist < FFMIN(euplims[w*16+g], uplims[w*16+g])) {
+                                    sce->sf_idx[w*16+g]++;
+                                    dists[w*16+g] = dist;
+                                    qenergies[w*16+g] = qenergy;
+                                } else {
+                                    break;
+                                }
+                            } else {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g], maxsf[w*16+g]);
+                                break;
+                            }
+                        }
+                    }
+                    prev = sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], mindeltasf, maxdeltasf);
+                    if (sce->sf_idx[w*16+g] != prevsc)
+                        fflag = 1;
+                    nminscaler = FFMIN(nminscaler, sce->sf_idx[w*16+g]);
+                    sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                }
+                start += sce->ics.swb_sizes[g];
+            }
+        }
+
+        /** SF difference limit violation risk. Must re-clamp. */
+        prev = -1;
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0; g < sce->ics.num_swb; g++) {
+                if (!sce->zeroes[w*16+g]) {
+                    int prevsf = sce->sf_idx[w*16+g];
+                    if (prev < 0)
+                        prev = prevsf;
+                    sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], prev - SCALE_MAX_DIFF, prev + SCALE_MAX_DIFF);
+                    sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    prev = sce->sf_idx[w*16+g];
+                    if (!fflag && prevsf != sce->sf_idx[w*16+g])
+                        fflag = 1;
+                }
+            }
+        }
+
+        its++;
+    } while (fflag && its < maxits);
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce, nextband);
+
+    prev = -1;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** Make sure proper codebooks are set */
+        for (g = 0; g < sce->ics.num_swb; g++) {
+            if (!sce->zeroes[w*16+g]) {
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                if (sce->band_type[w*16+g] <= 0) {
+                    if (!ff_sfdelta_can_remove_band(sce, nextband, prev, w*16+g)) {
+                        /** Cannot zero out, make sure it's not attempted */
+                        sce->band_type[w*16+g] = 1;
+                    } else {
+                        sce->zeroes[w*16+g] = 1;
+                        sce->band_type[w*16+g] = 0;
+                    }
+                }
+            } else {
+                sce->band_type[w*16+g] = 0;
+            }
+            /** Check that there's no SF delta range violations */
+            if (!sce->zeroes[w*16+g]) {
+                if (prev != -1) {
+                    av_unused int sfdiff = sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO;
+                    av_assert1(sfdiff >= 0 && sfdiff <= 2*SCALE_MAX_DIFF);
+                } else if (sce->zeroes[0]) {
+                    /** Set global gain to something useful */
+                    sce->sf_idx[0] = sce->sf_idx[w*16+g];
+                }
+                prev = sce->sf_idx[w*16+g];
+            }
+        }
+    }
+}
+
+#endif /* AVCODEC_AACCODER_TWOLOOP_H */
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index e436b4f..d394700 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -8,20 +8,20 @@
  * Copyright (c) 2008-2010 Paul Kendall <paul@kcbbs.gen.nz>
  * Copyright (c) 2010      Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,60 +32,17 @@
  * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
  */
 
-/*
- * supported tools
- *
- * Support?             Name
- * N (code in SoC repo) gain control
- * Y                    block switching
- * Y                    window shapes - standard
- * N                    window shapes - Low Delay
- * Y                    filterbank - standard
- * N (code in SoC repo) filterbank - Scalable Sample Rate
- * Y                    Temporal Noise Shaping
- * Y                    Long Term Prediction
- * Y                    intensity stereo
- * Y                    channel coupling
- * Y                    frequency domain prediction
- * Y                    Perceptual Noise Substitution
- * Y                    Mid/Side stereo
- * N                    Scalable Inverse AAC Quantization
- * N                    Frequency Selective Switch
- * N                    upsampling filter
- * Y                    quantization & coding - AAC
- * N                    quantization & coding - TwinVQ
- * N                    quantization & coding - BSAC
- * N                    AAC Error Resilience tools
- * N                    Error Resilience payload syntax
- * N                    Error Protection tool
- * N                    CELP
- * N                    Silence Compression
- * N                    HVXC
- * N                    HVXC 4kbits/s VR
- * N                    Structured Audio tools
- * N                    Structured Audio Sample Bank Format
- * N                    MIDI
- * N                    Harmonic and Individual Lines plus Noise
- * N                    Text-To-Speech Interface
- * Y                    Spectral Band Replication
- * Y (not in this code) Layer-1
- * Y (not in this code) Layer-2
- * Y (not in this code) Layer-3
- * N                    SinuSoidal Coding (Transient, Sinusoid, Noise)
- * Y                    Parametric Stereo
- * N                    Direct Stream Transfer
- *
- * Note: - HE AAC v1 comprises LC AAC with Spectral Band Replication.
- *       - HE AAC v2 comprises LC AAC with Spectral Band Replication and
-           Parametric Stereo.
- */
+#define FFT_FLOAT 1
+#define FFT_FIXED_32 0
+#define USE_FIXED 0
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
 #include "fft.h"
-#include "imdct15.h"
+#include "mdct15.h"
 #include "lpc.h"
 #include "kbdwin.h"
 #include "sinewin.h"
@@ -94,13 +51,13 @@
 #include "aactab.h"
 #include "aacdectab.h"
 #include "adts_header.h"
-#include "cbrt_tablegen.h"
+#include "cbrt_data.h"
 #include "sbr.h"
 #include "aacsbr.h"
 #include "mpeg4audio.h"
+#include "profiles.h"
 #include "libavutil/intfloat.h"
 
-#include <assert.h>
 #include <errno.h>
 #include <math.h>
 #include <stdint.h>
@@ -108,855 +65,10 @@
 
 #if ARCH_ARM
 #   include "arm/aac.h"
+#elif ARCH_MIPS
+#   include "mips/aacdec_mips.h"
 #endif
 
-#include "libavutil/thread.h"
-
-static VLC vlc_scalefactors;
-static VLC vlc_spectral[11];
-
-static const char overread_err[] = "Input buffer exhausted before END element found\n";
-
-static int count_channels(uint8_t (*layout)[3], int tags)
-{
-    int i, sum = 0;
-    for (i = 0; i < tags; i++) {
-        int syn_ele = layout[i][0];
-        int pos     = layout[i][2];
-        sum += (1 + (syn_ele == TYPE_CPE)) *
-               (pos != AAC_CHANNEL_OFF && pos != AAC_CHANNEL_CC);
-    }
-    return sum;
-}
-
-/**
- * Check for the channel element in the current channel position configuration.
- * If it exists, make sure the appropriate element is allocated and map the
- * channel order to match the internal Libav channel layout.
- *
- * @param   che_pos current channel position configuration
- * @param   type channel element type
- * @param   id channel element id
- * @param   channels count of the number of channels in the configuration
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static av_cold int che_configure(AACContext *ac,
-                                 enum ChannelPosition che_pos,
-                                 int type, int id, int *channels)
-{
-    if (che_pos) {
-        if (!ac->che[type][id]) {
-            if (!(ac->che[type][id] = av_mallocz(sizeof(ChannelElement))))
-                return AVERROR(ENOMEM);
-            ff_aac_sbr_ctx_init(ac, &ac->che[type][id]->sbr);
-        }
-        if (type != TYPE_CCE) {
-            if (*channels >= MAX_CHANNELS - 2)
-                return AVERROR_INVALIDDATA;
-            ac->output_element[(*channels)++] = &ac->che[type][id]->ch[0];
-            if (type == TYPE_CPE ||
-                (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1)) {
-                ac->output_element[(*channels)++] = &ac->che[type][id]->ch[1];
-            }
-        }
-    } else {
-        if (ac->che[type][id])
-            ff_aac_sbr_ctx_close(&ac->che[type][id]->sbr);
-        av_freep(&ac->che[type][id]);
-    }
-    return 0;
-}
-
-static int frame_configure_elements(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int type, id, ch, ret;
-
-    /* set channel pointers to internal buffers by default */
-    for (type = 0; type < 4; type++) {
-        for (id = 0; id < MAX_ELEM_ID; id++) {
-            ChannelElement *che = ac->che[type][id];
-            if (che) {
-                che->ch[0].ret = che->ch[0].ret_buf;
-                che->ch[1].ret = che->ch[1].ret_buf;
-            }
-        }
-    }
-
-    /* get output buffer */
-    av_frame_unref(ac->frame);
-    ac->frame->nb_samples = 2048;
-    if ((ret = ff_get_buffer(avctx, ac->frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
-
-    /* map output channel pointers to AVFrame data */
-    for (ch = 0; ch < avctx->channels; ch++) {
-        if (ac->output_element[ch])
-            ac->output_element[ch]->ret = (float *)ac->frame->extended_data[ch];
-    }
-
-    return 0;
-}
-
-struct elem_to_channel {
-    uint64_t av_position;
-    uint8_t syn_ele;
-    uint8_t elem_id;
-    uint8_t aac_position;
-};
-
-static int assign_pair(struct elem_to_channel e2c_vec[MAX_ELEM_ID],
-                       uint8_t (*layout_map)[3], int offset, uint64_t left,
-                       uint64_t right, int pos)
-{
-    if (layout_map[offset][0] == TYPE_CPE) {
-        e2c_vec[offset] = (struct elem_to_channel) {
-            .av_position  = left | right,
-            .syn_ele      = TYPE_CPE,
-            .elem_id      = layout_map[offset][1],
-            .aac_position = pos
-        };
-        return 1;
-    } else {
-        e2c_vec[offset] = (struct elem_to_channel) {
-            .av_position  = left,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[offset][1],
-            .aac_position = pos
-        };
-        e2c_vec[offset + 1] = (struct elem_to_channel) {
-            .av_position  = right,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[offset + 1][1],
-            .aac_position = pos
-        };
-        return 2;
-    }
-}
-
-static int count_paired_channels(uint8_t (*layout_map)[3], int tags, int pos,
-                                 int *current)
-{
-    int num_pos_channels = 0;
-    int first_cpe        = 0;
-    int sce_parity       = 0;
-    int i;
-    for (i = *current; i < tags; i++) {
-        if (layout_map[i][2] != pos)
-            break;
-        if (layout_map[i][0] == TYPE_CPE) {
-            if (sce_parity) {
-                if (pos == AAC_CHANNEL_FRONT && !first_cpe) {
-                    sce_parity = 0;
-                } else {
-                    return -1;
-                }
-            }
-            num_pos_channels += 2;
-            first_cpe         = 1;
-        } else {
-            num_pos_channels++;
-            sce_parity ^= 1;
-        }
-    }
-    if (sce_parity &&
-        ((pos == AAC_CHANNEL_FRONT && first_cpe) || pos == AAC_CHANNEL_SIDE))
-        return -1;
-    *current = i;
-    return num_pos_channels;
-}
-
-static uint64_t sniff_channel_order(uint8_t (*layout_map)[3], int tags)
-{
-    int i, n, total_non_cc_elements;
-    struct elem_to_channel e2c_vec[4 * MAX_ELEM_ID] = { { 0 } };
-    int num_front_channels, num_side_channels, num_back_channels;
-    uint64_t layout;
-
-    if (FF_ARRAY_ELEMS(e2c_vec) < tags)
-        return 0;
-
-    i = 0;
-    num_front_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_FRONT, &i);
-    if (num_front_channels < 0)
-        return 0;
-    num_side_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_SIDE, &i);
-    if (num_side_channels < 0)
-        return 0;
-    num_back_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_BACK, &i);
-    if (num_back_channels < 0)
-        return 0;
-
-    if (num_side_channels == 0 && num_back_channels >= 4) {
-        num_side_channels = 2;
-        num_back_channels -= 2;
-    }
-
-    i = 0;
-    if (num_front_channels & 1) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_FRONT_CENTER,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_FRONT
-        };
-        i++;
-        num_front_channels--;
-    }
-    if (num_front_channels >= 4) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_FRONT_LEFT_OF_CENTER,
-                         AV_CH_FRONT_RIGHT_OF_CENTER,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-    if (num_front_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_FRONT_LEFT,
-                         AV_CH_FRONT_RIGHT,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-    while (num_front_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-
-    if (num_side_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_SIDE_LEFT,
-                         AV_CH_SIDE_RIGHT,
-                         AAC_CHANNEL_FRONT);
-        num_side_channels -= 2;
-    }
-    while (num_side_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_SIDE);
-        num_side_channels -= 2;
-    }
-
-    while (num_back_channels >= 4) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_BACK);
-        num_back_channels -= 2;
-    }
-    if (num_back_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_BACK_LEFT,
-                         AV_CH_BACK_RIGHT,
-                         AAC_CHANNEL_BACK);
-        num_back_channels -= 2;
-    }
-    if (num_back_channels) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_BACK_CENTER,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_BACK
-        };
-        i++;
-        num_back_channels--;
-    }
-
-    if (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_LOW_FREQUENCY,
-            .syn_ele      = TYPE_LFE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_LFE
-        };
-        i++;
-    }
-    while (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = UINT64_MAX,
-            .syn_ele      = TYPE_LFE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_LFE
-        };
-        i++;
-    }
-
-    // Must choose a stable sort
-    total_non_cc_elements = n = i;
-    do {
-        int next_n = 0;
-        for (i = 1; i < n; i++)
-            if (e2c_vec[i - 1].av_position > e2c_vec[i].av_position) {
-                FFSWAP(struct elem_to_channel, e2c_vec[i - 1], e2c_vec[i]);
-                next_n = i;
-            }
-        n = next_n;
-    } while (n > 0);
-
-    layout = 0;
-    for (i = 0; i < total_non_cc_elements; i++) {
-        layout_map[i][0] = e2c_vec[i].syn_ele;
-        layout_map[i][1] = e2c_vec[i].elem_id;
-        layout_map[i][2] = e2c_vec[i].aac_position;
-        if (e2c_vec[i].av_position != UINT64_MAX) {
-            layout |= e2c_vec[i].av_position;
-        }
-    }
-
-    return layout;
-}
-
-/**
- * Save current output configuration if and only if it has been locked.
- */
-static void push_output_configuration(AACContext *ac) {
-    if (ac->oc[1].status == OC_LOCKED) {
-        ac->oc[0] = ac->oc[1];
-    }
-    ac->oc[1].status = OC_NONE;
-}
-
-/**
- * Restore the previous output configuration if and only if the current
- * configuration is unlocked.
- */
-static void pop_output_configuration(AACContext *ac) {
-    if (ac->oc[1].status != OC_LOCKED && ac->oc[0].status != OC_NONE) {
-        ac->oc[1] = ac->oc[0];
-        ac->avctx->channels = ac->oc[1].channels;
-        ac->avctx->channel_layout = ac->oc[1].channel_layout;
-    }
-}
-
-/**
- * Configure output channel order based on the current program
- * configuration element.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int output_configure(AACContext *ac,
-                            uint8_t layout_map[MAX_ELEM_ID * 4][3], int tags,
-                            enum OCStatus oc_type, int get_new_frame)
-{
-    AVCodecContext *avctx = ac->avctx;
-    int i, channels = 0, ret;
-    uint64_t layout = 0;
-    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
-    uint8_t type_counts[TYPE_END] = { 0 };
-
-    if (ac->oc[1].layout_map != layout_map) {
-        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
-        ac->oc[1].layout_map_tags = tags;
-    }
-    for (i = 0; i < tags; i++) {
-        int type =         layout_map[i][0];
-        int id =           layout_map[i][1];
-        id_map[type][id] = type_counts[type]++;
-    }
-    // Try to sniff a reasonable channel order, otherwise output the
-    // channels in the order the PCE declared them.
-    if (avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE)
-        layout = sniff_channel_order(layout_map, tags);
-    for (i = 0; i < tags; i++) {
-        int type =     layout_map[i][0];
-        int id =       layout_map[i][1];
-        int iid =      id_map[type][id];
-        int position = layout_map[i][2];
-        // Allocate or free elements depending on if they are in the
-        // current program configuration.
-        ret = che_configure(ac, position, type, iid, &channels);
-        if (ret < 0)
-            return ret;
-        ac->tag_che_map[type][id] = ac->che[type][iid];
-    }
-    if (ac->oc[1].m4ac.ps == 1 && channels == 2) {
-        if (layout == AV_CH_FRONT_CENTER) {
-            layout = AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT;
-        } else {
-            layout = 0;
-        }
-    }
-
-    avctx->channel_layout = ac->oc[1].channel_layout = layout;
-    avctx->channels       = ac->oc[1].channels       = channels;
-    ac->oc[1].status = oc_type;
-
-    if (get_new_frame) {
-        if ((ret = frame_configure_elements(ac->avctx)) < 0)
-            return ret;
-    }
-
-    return 0;
-}
-
-/**
- * Set up channel positions based on a default channel configuration
- * as specified in table 1.17.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int set_default_channel_config(AVCodecContext *avctx,
-                                      uint8_t (*layout_map)[3],
-                                      int *tags,
-                                      int channel_config)
-{
-    if (channel_config < 1 || (channel_config > 7 && channel_config < 11) ||
-        channel_config > 12) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid default channel configuration (%d)\n",
-               channel_config);
-        return AVERROR_INVALIDDATA;
-    }
-    *tags = tags_per_config[channel_config];
-    memcpy(layout_map, aac_channel_layout_map[channel_config - 1],
-           *tags * sizeof(*layout_map));
-    return 0;
-}
-
-static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
-{
-    /* For PCE based channel configurations map the channels solely based
-     * on tags. */
-    if (!ac->oc[1].m4ac.chan_config) {
-        return ac->tag_che_map[type][elem_id];
-    }
-    // Allow single CPE stereo files to be signalled with mono configuration.
-    if (!ac->tags_mapped && type == TYPE_CPE &&
-        ac->oc[1].m4ac.chan_config == 1) {
-        uint8_t layout_map[MAX_ELEM_ID*4][3];
-        int layout_map_tags;
-        push_output_configuration(ac);
-
-        if (set_default_channel_config(ac->avctx, layout_map,
-                                       &layout_map_tags, 2) < 0)
-            return NULL;
-        if (output_configure(ac, layout_map, layout_map_tags,
-                             OC_TRIAL_FRAME, 1) < 0)
-            return NULL;
-
-        ac->oc[1].m4ac.chan_config = 2;
-        ac->oc[1].m4ac.ps = 0;
-    }
-    // And vice-versa
-    if (!ac->tags_mapped && type == TYPE_SCE &&
-        ac->oc[1].m4ac.chan_config == 2) {
-        uint8_t layout_map[MAX_ELEM_ID * 4][3];
-        int layout_map_tags;
-        push_output_configuration(ac);
-
-        if (set_default_channel_config(ac->avctx, layout_map,
-                                       &layout_map_tags, 1) < 0)
-            return NULL;
-        if (output_configure(ac, layout_map, layout_map_tags,
-                             OC_TRIAL_FRAME, 1) < 0)
-            return NULL;
-
-        ac->oc[1].m4ac.chan_config = 1;
-        if (ac->oc[1].m4ac.sbr)
-            ac->oc[1].m4ac.ps = -1;
-    }
-    /* For indexed channel configurations map the channels solely based
-     * on position. */
-    switch (ac->oc[1].m4ac.chan_config) {
-    case 12:
-    case 7:
-        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
-        }
-    case 11:
-        if (ac->tags_mapped == 2 &&
-            ac->oc[1].m4ac.chan_config == 11 &&
-            type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
-        }
-    case 6:
-        /* Some streams incorrectly code 5.1 audio as
-         * SCE[0] CPE[0] CPE[1] SCE[1]
-         * instead of
-         * SCE[0] CPE[0] CPE[1] LFE[0].
-         * If we seem to have encountered such a stream, transfer
-         * the LFE[0] element to the SCE[1]'s mapping */
-        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
-        }
-    case 5:
-        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
-        }
-    case 4:
-        if (ac->tags_mapped == 2 &&
-            ac->oc[1].m4ac.chan_config == 4 &&
-            type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
-        }
-    case 3:
-    case 2:
-        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) &&
-            type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][0];
-        } else if (ac->oc[1].m4ac.chan_config == 2) {
-            return NULL;
-        }
-    case 1:
-        if (!ac->tags_mapped && type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
-        }
-    default:
-        return NULL;
-    }
-}
-
-/**
- * Decode an array of 4 bit element IDs, optionally interleaved with a
- * stereo/mono switching bit.
- *
- * @param type speaker type/position for these channels
- */
-static void decode_channel_map(uint8_t layout_map[][3],
-                               enum ChannelPosition type,
-                               GetBitContext *gb, int n)
-{
-    while (n--) {
-        enum RawDataBlockType syn_ele;
-        switch (type) {
-        case AAC_CHANNEL_FRONT:
-        case AAC_CHANNEL_BACK:
-        case AAC_CHANNEL_SIDE:
-            syn_ele = get_bits1(gb);
-            break;
-        case AAC_CHANNEL_CC:
-            skip_bits1(gb);
-            syn_ele = TYPE_CCE;
-            break;
-        case AAC_CHANNEL_LFE:
-            syn_ele = TYPE_LFE;
-            break;
-        default:
-            // AAC_CHANNEL_OFF has no channel map
-            return;
-        }
-        layout_map[0][0] = syn_ele;
-        layout_map[0][1] = get_bits(gb, 4);
-        layout_map[0][2] = type;
-        layout_map++;
-    }
-}
-
-/**
- * Decode program configuration element; reference: table 4.2.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_pce(AVCodecContext *avctx, MPEG4AudioConfig *m4ac,
-                      uint8_t (*layout_map)[3],
-                      GetBitContext *gb)
-{
-    int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
-    int sampling_index;
-    int comment_len;
-    int tags;
-
-    skip_bits(gb, 2);  // object_type
-
-    sampling_index = get_bits(gb, 4);
-    if (m4ac->sampling_index != sampling_index)
-        av_log(avctx, AV_LOG_WARNING,
-               "Sample rate index in program config element does not "
-               "match the sample rate index configured by the container.\n");
-
-    num_front       = get_bits(gb, 4);
-    num_side        = get_bits(gb, 4);
-    num_back        = get_bits(gb, 4);
-    num_lfe         = get_bits(gb, 2);
-    num_assoc_data  = get_bits(gb, 3);
-    num_cc          = get_bits(gb, 4);
-
-    if (get_bits1(gb))
-        skip_bits(gb, 4); // mono_mixdown_tag
-    if (get_bits1(gb))
-        skip_bits(gb, 4); // stereo_mixdown_tag
-
-    if (get_bits1(gb))
-        skip_bits(gb, 3); // mixdown_coeff_index and pseudo_surround
-
-    decode_channel_map(layout_map       , AAC_CHANNEL_FRONT, gb, num_front);
-    tags = num_front;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_SIDE,  gb, num_side);
-    tags += num_side;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_BACK,  gb, num_back);
-    tags += num_back;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_LFE,   gb, num_lfe);
-    tags += num_lfe;
-
-    skip_bits_long(gb, 4 * num_assoc_data);
-
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_CC,    gb, num_cc);
-    tags += num_cc;
-
-    align_get_bits(gb);
-
-    /* comment field, first byte is length */
-    comment_len = get_bits(gb, 8) * 8;
-    if (get_bits_left(gb) < comment_len) {
-        av_log(avctx, AV_LOG_ERROR, overread_err);
-        return AVERROR_INVALIDDATA;
-    }
-    skip_bits_long(gb, comment_len);
-    return tags;
-}
-
-/**
- * Decode GA "General Audio" specific configuration; reference: table 4.1.
- *
- * @param   ac          pointer to AACContext, may be null
- * @param   avctx       pointer to AVCCodecContext, used for logging
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx,
-                                     GetBitContext *gb,
-                                     MPEG4AudioConfig *m4ac,
-                                     int channel_config)
-{
-    int extension_flag, ret, ep_config, res_flags;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int tags = 0;
-
-    if (get_bits1(gb)) { // frameLengthFlag
-        avpriv_request_sample(avctx, "960/120 MDCT window");
-        return AVERROR_PATCHWELCOME;
-    }
-    m4ac->frame_length_short = 0;
-
-    if (get_bits1(gb))       // dependsOnCoreCoder
-        skip_bits(gb, 14);   // coreCoderDelay
-    extension_flag = get_bits1(gb);
-
-    if (m4ac->object_type == AOT_AAC_SCALABLE ||
-        m4ac->object_type == AOT_ER_AAC_SCALABLE)
-        skip_bits(gb, 3);     // layerNr
-
-    if (channel_config == 0) {
-        skip_bits(gb, 4);  // element_instance_tag
-        tags = decode_pce(avctx, m4ac, layout_map, gb);
-        if (tags < 0)
-            return tags;
-    } else {
-        if ((ret = set_default_channel_config(avctx, layout_map,
-                                              &tags, channel_config)))
-            return ret;
-    }
-
-    if (count_channels(layout_map, tags) > 1) {
-        m4ac->ps = 0;
-    } else if (m4ac->sbr == 1 && m4ac->ps == -1)
-        m4ac->ps = 1;
-
-    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
-        return ret;
-
-    if (extension_flag) {
-        switch (m4ac->object_type) {
-        case AOT_ER_BSAC:
-            skip_bits(gb, 5);    // numOfSubFrame
-            skip_bits(gb, 11);   // layer_length
-            break;
-        case AOT_ER_AAC_LC:
-        case AOT_ER_AAC_LTP:
-        case AOT_ER_AAC_SCALABLE:
-        case AOT_ER_AAC_LD:
-            res_flags = get_bits(gb, 3);
-            if (res_flags) {
-                avpriv_report_missing_feature(avctx,
-                                              "AAC data resilience (flags %x)",
-                                              res_flags);
-                return AVERROR_PATCHWELCOME;
-            }
-            break;
-        }
-        skip_bits1(gb);    // extensionFlag3 (TBD in version 3)
-    }
-    switch (m4ac->object_type) {
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LTP:
-    case AOT_ER_AAC_SCALABLE:
-    case AOT_ER_AAC_LD:
-        ep_config = get_bits(gb, 2);
-        if (ep_config) {
-            avpriv_report_missing_feature(avctx,
-                                          "epConfig %d", ep_config);
-            return AVERROR_PATCHWELCOME;
-        }
-    }
-    return 0;
-}
-
-static int decode_eld_specific_config(AACContext *ac, AVCodecContext *avctx,
-                                     GetBitContext *gb,
-                                     MPEG4AudioConfig *m4ac,
-                                     int channel_config)
-{
-    int ret, ep_config, res_flags;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int tags = 0;
-    const int ELDEXT_TERM = 0;
-
-    m4ac->ps  = 0;
-    m4ac->sbr = 0;
-
-    m4ac->frame_length_short = get_bits1(gb);
-    res_flags = get_bits(gb, 3);
-    if (res_flags) {
-        avpriv_report_missing_feature(avctx,
-                                      "AAC data resilience (flags %x)",
-                                      res_flags);
-        return AVERROR_PATCHWELCOME;
-    }
-
-    if (get_bits1(gb)) { // ldSbrPresentFlag
-        avpriv_report_missing_feature(avctx,
-                                      "Low Delay SBR");
-        return AVERROR_PATCHWELCOME;
-    }
-
-    while (get_bits(gb, 4) != ELDEXT_TERM) {
-        int len = get_bits(gb, 4);
-        if (len == 15)
-            len += get_bits(gb, 8);
-        if (len == 15 + 255)
-            len += get_bits(gb, 16);
-        if (get_bits_left(gb) < len * 8 + 4) {
-            av_log(avctx, AV_LOG_ERROR, overread_err);
-            return AVERROR_INVALIDDATA;
-        }
-        skip_bits_long(gb, 8 * len);
-    }
-
-    if ((ret = set_default_channel_config(avctx, layout_map,
-                                          &tags, channel_config)))
-        return ret;
-
-    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
-        return ret;
-
-    ep_config = get_bits(gb, 2);
-    if (ep_config) {
-        avpriv_report_missing_feature(avctx,
-                                      "epConfig %d", ep_config);
-        return AVERROR_PATCHWELCOME;
-    }
-    return 0;
-}
-
-/**
- * Decode audio specific configuration; reference: table 1.13.
- *
- * @param   ac          pointer to AACContext, may be null
- * @param   avctx       pointer to AVCCodecContext, used for logging
- * @param   m4ac        pointer to MPEG4AudioConfig, used for parsing
- * @param   data        pointer to buffer holding an audio specific config
- * @param   bit_size    size of audio specific config or data in bits
- * @param   sync_extension look for an appended sync extension
- *
- * @return  Returns error status or number of consumed bits. <0 - error
- */
-static int decode_audio_specific_config(AACContext *ac,
-                                        AVCodecContext *avctx,
-                                        MPEG4AudioConfig *m4ac,
-                                        const uint8_t *data, int bit_size,
-                                        int sync_extension)
-{
-    GetBitContext gb;
-    int i, ret;
-
-    ff_dlog(avctx, "extradata size %d\n", avctx->extradata_size);
-    for (i = 0; i < avctx->extradata_size; i++)
-        ff_dlog(avctx, "%02x ", avctx->extradata[i]);
-    ff_dlog(avctx, "\n");
-
-    if ((ret = init_get_bits(&gb, data, bit_size)) < 0)
-        return ret;
-
-    if ((i = avpriv_mpeg4audio_get_config(m4ac, data, bit_size,
-                                          sync_extension)) < 0)
-        return AVERROR_INVALIDDATA;
-    if (m4ac->sampling_index > 12) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid sampling rate index %d\n",
-               m4ac->sampling_index);
-        return AVERROR_INVALIDDATA;
-    }
-    if (m4ac->object_type == AOT_ER_AAC_LD &&
-        (m4ac->sampling_index < 3 || m4ac->sampling_index > 7)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid low delay sampling rate index %d\n",
-               m4ac->sampling_index);
-        return AVERROR_INVALIDDATA;
-    }
-
-    skip_bits_long(&gb, i);
-
-    switch (m4ac->object_type) {
-    case AOT_AAC_MAIN:
-    case AOT_AAC_LC:
-    case AOT_AAC_LTP:
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LD:
-        if ((ret = decode_ga_specific_config(ac, avctx, &gb,
-                                            m4ac, m4ac->chan_config)) < 0)
-            return ret;
-        break;
-    case AOT_ER_AAC_ELD:
-        if ((ret = decode_eld_specific_config(ac, avctx, &gb,
-                                              m4ac, m4ac->chan_config)) < 0)
-            return ret;
-        break;
-    default:
-        avpriv_report_missing_feature(avctx,
-                                      "Audio object type %s%d",
-                                      m4ac->sbr == 1 ? "SBR+" : "",
-                                      m4ac->object_type);
-        return AVERROR(ENOSYS);
-    }
-
-    ff_dlog(avctx,
-            "AOT %d chan config %d sampling index %d (%d) SBR %d PS %d\n",
-            m4ac->object_type, m4ac->chan_config, m4ac->sampling_index,
-            m4ac->sample_rate, m4ac->sbr,
-            m4ac->ps);
-
-    return get_bits_count(&gb);
-}
-
-/**
- * linear congruential pseudorandom number generator
- *
- * @param   previous_val    pointer to the current state of the generator
- *
- * @return  Returns a 32-bit pseudorandom integer
- */
-static av_always_inline int lcg_random(int previous_val)
-{
-    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
-    return v.s;
-}
-
 static av_always_inline void reset_predict_state(PredictorState *ps)
 {
     ps->r0   = 0.0f;
@@ -967,508 +79,6 @@ static av_always_inline void reset_predict_state(PredictorState *ps)
     ps->var1 = 1.0f;
 }
 
-static void reset_all_predictors(PredictorState *ps)
-{
-    int i;
-    for (i = 0; i < MAX_PREDICTORS; i++)
-        reset_predict_state(&ps[i]);
-}
-
-static int sample_rate_idx (int rate)
-{
-         if (92017 <= rate) return 0;
-    else if (75132 <= rate) return 1;
-    else if (55426 <= rate) return 2;
-    else if (46009 <= rate) return 3;
-    else if (37566 <= rate) return 4;
-    else if (27713 <= rate) return 5;
-    else if (23004 <= rate) return 6;
-    else if (18783 <= rate) return 7;
-    else if (13856 <= rate) return 8;
-    else if (11502 <= rate) return 9;
-    else if (9391  <= rate) return 10;
-    else                    return 11;
-}
-
-static void reset_predictor_group(PredictorState *ps, int group_num)
-{
-    int i;
-    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
-        reset_predict_state(&ps[i]);
-}
-
-#define AAC_INIT_VLC_STATIC(num, size)                                     \
-    INIT_VLC_STATIC(&vlc_spectral[num], 8, ff_aac_spectral_sizes[num],     \
-         ff_aac_spectral_bits[num], sizeof(ff_aac_spectral_bits[num][0]),  \
-                                    sizeof(ff_aac_spectral_bits[num][0]),  \
-        ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), \
-                                    sizeof(ff_aac_spectral_codes[num][0]), \
-        size);
-
-static av_cold void aac_static_table_init(void)
-{
-    AAC_INIT_VLC_STATIC( 0, 304);
-    AAC_INIT_VLC_STATIC( 1, 270);
-    AAC_INIT_VLC_STATIC( 2, 550);
-    AAC_INIT_VLC_STATIC( 3, 300);
-    AAC_INIT_VLC_STATIC( 4, 328);
-    AAC_INIT_VLC_STATIC( 5, 294);
-    AAC_INIT_VLC_STATIC( 6, 306);
-    AAC_INIT_VLC_STATIC( 7, 268);
-    AAC_INIT_VLC_STATIC( 8, 510);
-    AAC_INIT_VLC_STATIC( 9, 366);
-    AAC_INIT_VLC_STATIC(10, 462);
-
-    ff_aac_sbr_init();
-
-    ff_aac_tableinit();
-
-    INIT_VLC_STATIC(&vlc_scalefactors, 7,
-                    FF_ARRAY_ELEMS(ff_aac_scalefactor_code),
-                    ff_aac_scalefactor_bits,
-                    sizeof(ff_aac_scalefactor_bits[0]),
-                    sizeof(ff_aac_scalefactor_bits[0]),
-                    ff_aac_scalefactor_code,
-                    sizeof(ff_aac_scalefactor_code[0]),
-                    sizeof(ff_aac_scalefactor_code[0]),
-                    352);
-
-
-    // window initialization
-    ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
-    ff_kbd_window_init(ff_aac_kbd_short_128, 6.0, 128);
-    ff_init_ff_sine_windows(10);
-    ff_init_ff_sine_windows( 9);
-    ff_init_ff_sine_windows( 7);
-
-    cbrt_tableinit();
-}
-
-static AVOnce aac_init = AV_ONCE_INIT;
-
-static av_cold int aac_decode_init(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int ret;
-
-    ret = ff_thread_once(&aac_init, &aac_static_table_init);
-    if (ret != 0)
-        return AVERROR_UNKNOWN;
-
-    ac->avctx = avctx;
-    ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
-
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
-
-    if (avctx->extradata_size > 0) {
-        if ((ret = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
-                                                avctx->extradata,
-                                                avctx->extradata_size * 8,
-                                                1)) < 0)
-            return ret;
-    } else {
-        int sr, i;
-        uint8_t layout_map[MAX_ELEM_ID*4][3];
-        int layout_map_tags;
-
-        sr = sample_rate_idx(avctx->sample_rate);
-        ac->oc[1].m4ac.sampling_index = sr;
-        ac->oc[1].m4ac.channels = avctx->channels;
-        ac->oc[1].m4ac.sbr = -1;
-        ac->oc[1].m4ac.ps = -1;
-
-        for (i = 0; i < FF_ARRAY_ELEMS(ff_mpeg4audio_channels); i++)
-            if (ff_mpeg4audio_channels[i] == avctx->channels)
-                break;
-        if (i == FF_ARRAY_ELEMS(ff_mpeg4audio_channels)) {
-            i = 0;
-        }
-        ac->oc[1].m4ac.chan_config = i;
-
-        if (ac->oc[1].m4ac.chan_config) {
-            int ret = set_default_channel_config(avctx, layout_map,
-                &layout_map_tags, ac->oc[1].m4ac.chan_config);
-            if (!ret)
-                output_configure(ac, layout_map, layout_map_tags,
-                                 OC_GLOBAL_HDR, 0);
-            else if (avctx->err_recognition & AV_EF_EXPLODE)
-                return AVERROR_INVALIDDATA;
-        }
-    }
-
-    avpriv_float_dsp_init(&ac->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-
-    ac->random_state = 0x1f2e3d4c;
-
-    ff_mdct_init(&ac->mdct,       11, 1, 1.0 / (32768.0 * 1024.0));
-    ff_mdct_init(&ac->mdct_ld,    10, 1, 1.0 / (32768.0 * 512.0));
-    ff_mdct_init(&ac->mdct_small,  8, 1, 1.0 / (32768.0 * 128.0));
-    ff_mdct_init(&ac->mdct_ltp,   11, 0, -2.0 * 32768.0);
-    ret = ff_imdct15_init(&ac->mdct480, 5);
-    if (ret < 0)
-        return ret;
-
-    return 0;
-}
-
-/**
- * Skip data_stream_element; reference: table 4.10.
- */
-static int skip_data_stream_element(AACContext *ac, GetBitContext *gb)
-{
-    int byte_align = get_bits1(gb);
-    int count = get_bits(gb, 8);
-    if (count == 255)
-        count += get_bits(gb, 8);
-    if (byte_align)
-        align_get_bits(gb);
-
-    if (get_bits_left(gb) < 8 * count) {
-        av_log(ac->avctx, AV_LOG_ERROR, overread_err);
-        return AVERROR_INVALIDDATA;
-    }
-    skip_bits_long(gb, 8 * count);
-    return 0;
-}
-
-static int decode_prediction(AACContext *ac, IndividualChannelStream *ics,
-                             GetBitContext *gb)
-{
-    int sfb;
-    if (get_bits1(gb)) {
-        ics->predictor_reset_group = get_bits(gb, 5);
-        if (ics->predictor_reset_group == 0 ||
-            ics->predictor_reset_group > 30) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid Predictor Reset Group.\n");
-            return AVERROR_INVALIDDATA;
-        }
-    }
-    for (sfb = 0; sfb < FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index]); sfb++) {
-        ics->prediction_used[sfb] = get_bits1(gb);
-    }
-    return 0;
-}
-
-/**
- * Decode Long Term Prediction data; reference: table 4.xx.
- */
-static void decode_ltp(LongTermPrediction *ltp,
-                       GetBitContext *gb, uint8_t max_sfb)
-{
-    int sfb;
-
-    ltp->lag  = get_bits(gb, 11);
-    ltp->coef = ltp_coef[get_bits(gb, 3)];
-    for (sfb = 0; sfb < FFMIN(max_sfb, MAX_LTP_LONG_SFB); sfb++)
-        ltp->used[sfb] = get_bits1(gb);
-}
-
-/**
- * Decode Individual Channel Stream info; reference: table 4.6.
- */
-static int decode_ics_info(AACContext *ac, IndividualChannelStream *ics,
-                           GetBitContext *gb)
-{
-    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
-    const int aot = m4ac->object_type;
-    const int sampling_index = m4ac->sampling_index;
-    if (aot != AOT_ER_AAC_ELD) {
-        if (get_bits1(gb)) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Reserved bit set.\n");
-            if (ac->avctx->err_recognition & AV_EF_BITSTREAM)
-                return AVERROR_INVALIDDATA;
-        }
-        ics->window_sequence[1] = ics->window_sequence[0];
-        ics->window_sequence[0] = get_bits(gb, 2);
-        if (aot == AOT_ER_AAC_LD &&
-            ics->window_sequence[0] != ONLY_LONG_SEQUENCE) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "AAC LD is only defined for ONLY_LONG_SEQUENCE but "
-                   "window sequence %d found.\n", ics->window_sequence[0]);
-            ics->window_sequence[0] = ONLY_LONG_SEQUENCE;
-            return AVERROR_INVALIDDATA;
-        }
-        ics->use_kb_window[1]   = ics->use_kb_window[0];
-        ics->use_kb_window[0]   = get_bits1(gb);
-    }
-    ics->num_window_groups  = 1;
-    ics->group_len[0]       = 1;
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        int i;
-        ics->max_sfb = get_bits(gb, 4);
-        for (i = 0; i < 7; i++) {
-            if (get_bits1(gb)) {
-                ics->group_len[ics->num_window_groups - 1]++;
-            } else {
-                ics->num_window_groups++;
-                ics->group_len[ics->num_window_groups - 1] = 1;
-            }
-        }
-        ics->num_windows       = 8;
-        ics->swb_offset        =    ff_swb_offset_128[sampling_index];
-        ics->num_swb           =   ff_aac_num_swb_128[sampling_index];
-        ics->tns_max_bands     = ff_tns_max_bands_128[sampling_index];
-        ics->predictor_present = 0;
-    } else {
-        ics->max_sfb           = get_bits(gb, 6);
-        ics->num_windows       = 1;
-        if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD) {
-            if (m4ac->frame_length_short) {
-                ics->swb_offset    =     ff_swb_offset_480[sampling_index];
-                ics->num_swb       =    ff_aac_num_swb_480[sampling_index];
-                ics->tns_max_bands =  ff_tns_max_bands_480[sampling_index];
-            } else {
-                ics->swb_offset    =     ff_swb_offset_512[sampling_index];
-                ics->num_swb       =    ff_aac_num_swb_512[sampling_index];
-                ics->tns_max_bands =  ff_tns_max_bands_512[sampling_index];
-            }
-            if (!ics->num_swb || !ics->swb_offset)
-                return AVERROR_BUG;
-        } else {
-            ics->swb_offset    =    ff_swb_offset_1024[sampling_index];
-            ics->num_swb       =   ff_aac_num_swb_1024[sampling_index];
-            ics->tns_max_bands = ff_tns_max_bands_1024[sampling_index];
-        }
-        if (aot != AOT_ER_AAC_ELD) {
-            ics->predictor_present     = get_bits1(gb);
-            ics->predictor_reset_group = 0;
-        }
-        if (ics->predictor_present) {
-            if (aot == AOT_AAC_MAIN) {
-                if (decode_prediction(ac, ics, gb)) {
-                    return AVERROR_INVALIDDATA;
-                }
-            } else if (aot == AOT_AAC_LC ||
-                       aot == AOT_ER_AAC_LC) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Prediction is not allowed in AAC-LC.\n");
-                return AVERROR_INVALIDDATA;
-            } else {
-                if (aot == AOT_ER_AAC_LD) {
-                    avpriv_report_missing_feature(ac->avctx, "LTP in ER AAC LD");
-                    return AVERROR_PATCHWELCOME;
-                }
-                if ((ics->ltp.present = get_bits(gb, 1)))
-                    decode_ltp(&ics->ltp, gb, ics->max_sfb);
-            }
-        }
-    }
-
-    if (ics->max_sfb > ics->num_swb) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Number of scalefactor bands in group (%d) "
-               "exceeds limit (%d).\n",
-               ics->max_sfb, ics->num_swb);
-        return AVERROR_INVALIDDATA;
-    }
-
-    return 0;
-}
-
-/**
- * Decode band types (section_data payload); reference: table 4.46.
- *
- * @param   band_type           array of the used band type
- * @param   band_type_run_end   array of the last scalefactor band of a band type run
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_band_types(AACContext *ac, enum BandType band_type[120],
-                             int band_type_run_end[120], GetBitContext *gb,
-                             IndividualChannelStream *ics)
-{
-    int g, idx = 0;
-    const int bits = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) ? 3 : 5;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        int k = 0;
-        while (k < ics->max_sfb) {
-            uint8_t sect_end = k;
-            int sect_len_incr;
-            int sect_band_type = get_bits(gb, 4);
-            if (sect_band_type == 12) {
-                av_log(ac->avctx, AV_LOG_ERROR, "invalid band type\n");
-                return AVERROR_INVALIDDATA;
-            }
-            do {
-                sect_len_incr = get_bits(gb, bits);
-                sect_end += sect_len_incr;
-                if (get_bits_left(gb) < 0) {
-                    av_log(ac->avctx, AV_LOG_ERROR, overread_err);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (sect_end > ics->max_sfb) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "Number of bands (%d) exceeds limit (%d).\n",
-                           sect_end, ics->max_sfb);
-                    return AVERROR_INVALIDDATA;
-                }
-            } while (sect_len_incr == (1 << bits) - 1);
-            for (; k < sect_end; k++) {
-                band_type        [idx]   = sect_band_type;
-                band_type_run_end[idx++] = sect_end;
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode scalefactors; reference: table 4.47.
- *
- * @param   global_gain         first scalefactor value as scalefactors are differentially coded
- * @param   band_type           array of the used band type
- * @param   band_type_run_end   array of the last scalefactor band of a band type run
- * @param   sf                  array of scalefactors or intensity stereo positions
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_scalefactors(AACContext *ac, float sf[120], GetBitContext *gb,
-                               unsigned int global_gain,
-                               IndividualChannelStream *ics,
-                               enum BandType band_type[120],
-                               int band_type_run_end[120])
-{
-    int g, i, idx = 0;
-    int offset[3] = { global_gain, global_gain - 90, 0 };
-    int clipped_offset;
-    int noise_flag = 1;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb;) {
-            int run_end = band_type_run_end[idx];
-            if (band_type[idx] == ZERO_BT) {
-                for (; i < run_end; i++, idx++)
-                    sf[idx] = 0.0;
-            } else if ((band_type[idx] == INTENSITY_BT) ||
-                       (band_type[idx] == INTENSITY_BT2)) {
-                for (; i < run_end; i++, idx++) {
-                    offset[2] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    clipped_offset = av_clip(offset[2], -155, 100);
-                    if (offset[2] != clipped_offset) {
-                        avpriv_request_sample(ac->avctx,
-                                              "If you heard an audible artifact, there may be a bug in the decoder. "
-                                              "Clipped intensity stereo position (%d -> %d)",
-                                              offset[2], clipped_offset);
-                    }
-                    sf[idx] = ff_aac_pow2sf_tab[-clipped_offset + POW_SF2_ZERO];
-                }
-            } else if (band_type[idx] == NOISE_BT) {
-                for (; i < run_end; i++, idx++) {
-                    if (noise_flag-- > 0)
-                        offset[1] += get_bits(gb, 9) - 256;
-                    else
-                        offset[1] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    clipped_offset = av_clip(offset[1], -100, 155);
-                    if (offset[1] != clipped_offset) {
-                        avpriv_request_sample(ac->avctx,
-                                              "If you heard an audible artifact, there may be a bug in the decoder. "
-                                              "Clipped noise gain (%d -> %d)",
-                                              offset[1], clipped_offset);
-                    }
-                    sf[idx] = -ff_aac_pow2sf_tab[clipped_offset + POW_SF2_ZERO];
-                }
-            } else {
-                for (; i < run_end; i++, idx++) {
-                    offset[0] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    if (offset[0] > 255U) {
-                        av_log(ac->avctx, AV_LOG_ERROR,
-                               "Scalefactor (%d) out of range.\n", offset[0]);
-                        return AVERROR_INVALIDDATA;
-                    }
-                    sf[idx] = -ff_aac_pow2sf_tab[offset[0] - 100 + POW_SF2_ZERO];
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode pulse data; reference: table 4.7.
- */
-static int decode_pulses(Pulse *pulse, GetBitContext *gb,
-                         const uint16_t *swb_offset, int num_swb)
-{
-    int i, pulse_swb;
-    pulse->num_pulse = get_bits(gb, 2) + 1;
-    pulse_swb        = get_bits(gb, 6);
-    if (pulse_swb >= num_swb)
-        return -1;
-    pulse->pos[0]    = swb_offset[pulse_swb];
-    pulse->pos[0]   += get_bits(gb, 5);
-    if (pulse->pos[0] > 1023)
-        return -1;
-    pulse->amp[0]    = get_bits(gb, 4);
-    for (i = 1; i < pulse->num_pulse; i++) {
-        pulse->pos[i] = get_bits(gb, 5) + pulse->pos[i - 1];
-        if (pulse->pos[i] > 1023)
-            return -1;
-        pulse->amp[i] = get_bits(gb, 4);
-    }
-    return 0;
-}
-
-/**
- * Decode Temporal Noise Shaping data; reference: table 4.48.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_tns(AACContext *ac, TemporalNoiseShaping *tns,
-                      GetBitContext *gb, const IndividualChannelStream *ics)
-{
-    int w, filt, i, coef_len, coef_res, coef_compress;
-    const int is8 = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
-    const int tns_max_order = is8 ? 7 : ac->oc[1].m4ac.object_type == AOT_AAC_MAIN ? 20 : 12;
-    for (w = 0; w < ics->num_windows; w++) {
-        if ((tns->n_filt[w] = get_bits(gb, 2 - is8))) {
-            coef_res = get_bits1(gb);
-
-            for (filt = 0; filt < tns->n_filt[w]; filt++) {
-                int tmp2_idx;
-                tns->length[w][filt] = get_bits(gb, 6 - 2 * is8);
-
-                if ((tns->order[w][filt] = get_bits(gb, 5 - 2 * is8)) > tns_max_order) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "TNS filter order %d is greater than maximum %d.\n",
-                           tns->order[w][filt], tns_max_order);
-                    tns->order[w][filt] = 0;
-                    return AVERROR_INVALIDDATA;
-                }
-                if (tns->order[w][filt]) {
-                    tns->direction[w][filt] = get_bits1(gb);
-                    coef_compress = get_bits1(gb);
-                    coef_len = coef_res + 3 - coef_compress;
-                    tmp2_idx = 2 * coef_compress + coef_res;
-
-                    for (i = 0; i < tns->order[w][filt]; i++)
-                        tns->coef[w][filt][i] = tns_tmp2_map[tmp2_idx][get_bits(gb, coef_len)];
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode Mid/Side data; reference: table 4.54.
- *
- * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
- *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
- *                      [3] reserved for scalable AAC
- */
-static void decode_mid_side_stereo(ChannelElement *cpe, GetBitContext *gb,
-                                   int ms_present)
-{
-    int idx;
-    int max_idx = cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb;
-    if (ms_present == 1) {
-        for (idx = 0; idx < max_idx; idx++)
-            cpe->ms_mask[idx] = get_bits1(gb);
-    } else if (ms_present == 2) {
-        memset(cpe->ms_mask, 1, max_idx * sizeof(cpe->ms_mask[0]));
-    }
-}
-
 #ifndef VMUL2
 static inline float *VMUL2(float *dst, const float *v, unsigned idx,
                            const float *scale)
@@ -1537,233 +147,6 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
 }
 #endif
 
-/**
- * Decode spectral data; reference: table 4.50.
- * Dequantize and scale spectral data; reference: 4.6.3.3.
- *
- * @param   coef            array of dequantized, scaled spectral data
- * @param   sf              array of scalefactors or intensity stereo positions
- * @param   pulse_present   set if pulses are present
- * @param   pulse           pointer to pulse data struct
- * @param   band_type       array of the used band type
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_spectrum_and_dequant(AACContext *ac, float coef[1024],
-                                       GetBitContext *gb, const float sf[120],
-                                       int pulse_present, const Pulse *pulse,
-                                       const IndividualChannelStream *ics,
-                                       enum BandType band_type[120])
-{
-    int i, k, g, idx = 0;
-    const int c = 1024 / ics->num_windows;
-    const uint16_t *offsets = ics->swb_offset;
-    float *coef_base = coef;
-
-    for (g = 0; g < ics->num_windows; g++)
-        memset(coef + g * 128 + offsets[ics->max_sfb], 0,
-               sizeof(float) * (c - offsets[ics->max_sfb]));
-
-    for (g = 0; g < ics->num_window_groups; g++) {
-        unsigned g_len = ics->group_len[g];
-
-        for (i = 0; i < ics->max_sfb; i++, idx++) {
-            const unsigned cbt_m1 = band_type[idx] - 1;
-            float *cfo = coef + offsets[i];
-            int off_len = offsets[i + 1] - offsets[i];
-            int group;
-
-            if (cbt_m1 >= INTENSITY_BT2 - 1) {
-                for (group = 0; group < g_len; group++, cfo+=128) {
-                    memset(cfo, 0, off_len * sizeof(float));
-                }
-            } else if (cbt_m1 == NOISE_BT - 1) {
-                for (group = 0; group < g_len; group++, cfo+=128) {
-                    float scale;
-                    float band_energy;
-
-                    for (k = 0; k < off_len; k++) {
-                        ac->random_state  = lcg_random(ac->random_state);
-                        cfo[k] = ac->random_state;
-                    }
-
-                    band_energy = ac->fdsp.scalarproduct_float(cfo, cfo, off_len);
-                    scale = sf[idx] / sqrtf(band_energy);
-                    ac->fdsp.vector_fmul_scalar(cfo, cfo, scale, off_len);
-                }
-            } else {
-                const float *vq = ff_aac_codebook_vector_vals[cbt_m1];
-                const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cbt_m1];
-                VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cbt_m1].table;
-                OPEN_READER(re, gb);
-
-                switch (cbt_m1 >> 1) {
-                case 0:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned cb_idx;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            cf = VMUL4(cf, vq, cb_idx, sf + idx);
-                        } while (len -= 4);
-                    }
-                    break;
-
-                case 1:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nnz;
-                            unsigned cb_idx;
-                            uint32_t bits;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 8 & 15;
-                            bits = nnz ? GET_CACHE(re, gb) : 0;
-                            LAST_SKIP_BITS(re, gb, nnz);
-                            cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
-                        } while (len -= 4);
-                    }
-                    break;
-
-                case 2:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned cb_idx;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            cf = VMUL2(cf, vq, cb_idx, sf + idx);
-                        } while (len -= 2);
-                    }
-                    break;
-
-                case 3:
-                case 4:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nnz;
-                            unsigned cb_idx;
-                            unsigned sign;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 8 & 15;
-                            sign = nnz ? SHOW_UBITS(re, gb, nnz) << (cb_idx >> 12) : 0;
-                            LAST_SKIP_BITS(re, gb, nnz);
-                            cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
-                        } while (len -= 2);
-                    }
-                    break;
-
-                default:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        uint32_t *icf = (uint32_t *) cf;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nzt, nnz;
-                            unsigned cb_idx;
-                            uint32_t bits;
-                            int j;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-
-                            if (!code) {
-                                *icf++ = 0;
-                                *icf++ = 0;
-                                continue;
-                            }
-
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 12;
-                            nzt = cb_idx >> 8;
-                            bits = SHOW_UBITS(re, gb, nnz) << (32-nnz);
-                            LAST_SKIP_BITS(re, gb, nnz);
-
-                            for (j = 0; j < 2; j++) {
-                                if (nzt & 1<<j) {
-                                    uint32_t b;
-                                    int n;
-                                    /* The total length of escape_sequence must be < 22 bits according
-                                       to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
-                                    UPDATE_CACHE(re, gb);
-                                    b = GET_CACHE(re, gb);
-                                    b = 31 - av_log2(~b);
-
-                                    if (b > 8) {
-                                        av_log(ac->avctx, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
-                                        return AVERROR_INVALIDDATA;
-                                    }
-
-                                    SKIP_BITS(re, gb, b + 1);
-                                    b += 4;
-                                    n = (1 << b) + SHOW_UBITS(re, gb, b);
-                                    LAST_SKIP_BITS(re, gb, b);
-                                    *icf++ = cbrt_tab[n] | (bits & 1U<<31);
-                                    bits <<= 1;
-                                } else {
-                                    unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
-                                    *icf++ = (bits & 1U<<31) | v;
-                                    bits <<= !!v;
-                                }
-                                cb_idx >>= 4;
-                            }
-                        } while (len -= 2);
-
-                        ac->fdsp.vector_fmul_scalar(cfo, cfo, sf[idx], off_len);
-                    }
-                }
-
-                CLOSE_READER(re, gb);
-            }
-        }
-        coef += g_len << 7;
-    }
-
-    if (pulse_present) {
-        idx = 0;
-        for (i = 0; i < pulse->num_pulse; i++) {
-            float co = coef_base[ pulse->pos[i] ];
-            while (offsets[idx + 1] <= pulse->pos[i])
-                idx++;
-            if (band_type[idx] != NOISE_BT && sf[idx]) {
-                float ico = -pulse->amp[i];
-                if (co) {
-                    co /= sf[idx];
-                    ico = co / sqrtf(sqrtf(fabsf(co))) + (co > 0 ? -ico : ico);
-                }
-                coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
-            }
-        }
-    }
-    return 0;
-}
-
 static av_always_inline float flt16_round(float pf)
 {
     union av_intfloat32 tmp;
@@ -1820,738 +203,6 @@ static av_always_inline void predict(PredictorState *ps, float *coef,
 }
 
 /**
- * Apply AAC-Main style frequency domain prediction.
- */
-static void apply_prediction(AACContext *ac, SingleChannelElement *sce)
-{
-    int sfb, k;
-
-    if (!sce->ics.predictor_initialized) {
-        reset_all_predictors(sce->predictor_state);
-        sce->ics.predictor_initialized = 1;
-    }
-
-    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
-        for (sfb = 0;
-             sfb < ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index];
-             sfb++) {
-            for (k = sce->ics.swb_offset[sfb];
-                 k < sce->ics.swb_offset[sfb + 1];
-                 k++) {
-                predict(&sce->predictor_state[k], &sce->coeffs[k],
-                        sce->ics.predictor_present &&
-                        sce->ics.prediction_used[sfb]);
-            }
-        }
-        if (sce->ics.predictor_reset_group)
-            reset_predictor_group(sce->predictor_state,
-                                  sce->ics.predictor_reset_group);
-    } else
-        reset_all_predictors(sce->predictor_state);
-}
-
-/**
- * Decode an individual_channel_stream payload; reference: table 4.44.
- *
- * @param   common_window   Channels have independent [0], or shared [1], Individual Channel Stream information.
- * @param   scale_flag      scalable [1] or non-scalable [0] AAC (Unused until scalable AAC is implemented.)
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_ics(AACContext *ac, SingleChannelElement *sce,
-                      GetBitContext *gb, int common_window, int scale_flag)
-{
-    Pulse pulse;
-    TemporalNoiseShaping    *tns = &sce->tns;
-    IndividualChannelStream *ics = &sce->ics;
-    float *out = sce->coeffs;
-    int global_gain, eld_syntax, er_syntax, pulse_present = 0;
-    int ret;
-
-    eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-    er_syntax  = ac->oc[1].m4ac.object_type == AOT_ER_AAC_LC ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LTP ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LD ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-
-    /* This assignment is to silence a GCC warning about the variable being used
-     * uninitialized when in fact it always is.
-     */
-    pulse.num_pulse = 0;
-
-    global_gain = get_bits(gb, 8);
-
-    if (!common_window && !scale_flag) {
-        if (decode_ics_info(ac, ics, gb) < 0)
-            return AVERROR_INVALIDDATA;
-    }
-
-    if ((ret = decode_band_types(ac, sce->band_type,
-                                 sce->band_type_run_end, gb, ics)) < 0)
-        return ret;
-    if ((ret = decode_scalefactors(ac, sce->sf, gb, global_gain, ics,
-                                  sce->band_type, sce->band_type_run_end)) < 0)
-        return ret;
-
-    pulse_present = 0;
-    if (!scale_flag) {
-        if (!eld_syntax && (pulse_present = get_bits1(gb))) {
-            if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Pulse tool not allowed in eight short sequence.\n");
-                return AVERROR_INVALIDDATA;
-            }
-            if (decode_pulses(&pulse, gb, ics->swb_offset, ics->num_swb)) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Pulse data corrupt or invalid.\n");
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        tns->present = get_bits1(gb);
-        if (tns->present && !er_syntax)
-            if (decode_tns(ac, tns, gb, ics) < 0)
-                return AVERROR_INVALIDDATA;
-        if (!eld_syntax && get_bits1(gb)) {
-            avpriv_request_sample(ac->avctx, "SSR");
-            return AVERROR_PATCHWELCOME;
-        }
-        // I see no textual basis in the spec for this occurring after SSR gain
-        // control, but this is what both reference and real implementations do
-        if (tns->present && er_syntax)
-            if (decode_tns(ac, tns, gb, ics) < 0)
-                return AVERROR_INVALIDDATA;
-    }
-
-    if (decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present,
-                                    &pulse, ics, sce->band_type) < 0)
-        return AVERROR_INVALIDDATA;
-
-    if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN && !common_window)
-        apply_prediction(ac, sce);
-
-    return 0;
-}
-
-/**
- * Mid/Side stereo decoding; reference: 4.6.8.1.3.
- */
-static void apply_mid_side_stereo(AACContext *ac, ChannelElement *cpe)
-{
-    const IndividualChannelStream *ics = &cpe->ch[0].ics;
-    float *ch0 = cpe->ch[0].coeffs;
-    float *ch1 = cpe->ch[1].coeffs;
-    int g, i, group, idx = 0;
-    const uint16_t *offsets = ics->swb_offset;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb; i++, idx++) {
-            if (cpe->ms_mask[idx] &&
-                cpe->ch[0].band_type[idx] < NOISE_BT &&
-                cpe->ch[1].band_type[idx] < NOISE_BT) {
-                for (group = 0; group < ics->group_len[g]; group++) {
-                    ac->fdsp.butterflies_float(ch0 + group * 128 + offsets[i],
-                                               ch1 + group * 128 + offsets[i],
-                                               offsets[i+1] - offsets[i]);
-                }
-            }
-        }
-        ch0 += ics->group_len[g] * 128;
-        ch1 += ics->group_len[g] * 128;
-    }
-}
-
-/**
- * intensity stereo decoding; reference: 4.6.8.2.3
- *
- * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
- *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
- *                      [3] reserved for scalable AAC
- */
-static void apply_intensity_stereo(AACContext *ac,
-                                   ChannelElement *cpe, int ms_present)
-{
-    const IndividualChannelStream *ics = &cpe->ch[1].ics;
-    SingleChannelElement         *sce1 = &cpe->ch[1];
-    float *coef0 = cpe->ch[0].coeffs, *coef1 = cpe->ch[1].coeffs;
-    const uint16_t *offsets = ics->swb_offset;
-    int g, group, i, idx = 0;
-    int c;
-    float scale;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb;) {
-            if (sce1->band_type[idx] == INTENSITY_BT ||
-                sce1->band_type[idx] == INTENSITY_BT2) {
-                const int bt_run_end = sce1->band_type_run_end[idx];
-                for (; i < bt_run_end; i++, idx++) {
-                    c = -1 + 2 * (sce1->band_type[idx] - 14);
-                    if (ms_present)
-                        c *= 1 - 2 * cpe->ms_mask[idx];
-                    scale = c * sce1->sf[idx];
-                    for (group = 0; group < ics->group_len[g]; group++)
-                        ac->fdsp.vector_fmul_scalar(coef1 + group * 128 + offsets[i],
-                                                    coef0 + group * 128 + offsets[i],
-                                                    scale,
-                                                    offsets[i + 1] - offsets[i]);
-                }
-            } else {
-                int bt_run_end = sce1->band_type_run_end[idx];
-                idx += bt_run_end - i;
-                i    = bt_run_end;
-            }
-        }
-        coef0 += ics->group_len[g] * 128;
-        coef1 += ics->group_len[g] * 128;
-    }
-}
-
-/**
- * Decode a channel_pair_element; reference: table 4.4.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_cpe(AACContext *ac, GetBitContext *gb, ChannelElement *cpe)
-{
-    int i, ret, common_window, ms_present = 0;
-    int eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-
-    common_window = eld_syntax || get_bits1(gb);
-    if (common_window) {
-        if (decode_ics_info(ac, &cpe->ch[0].ics, gb))
-            return AVERROR_INVALIDDATA;
-        i = cpe->ch[1].ics.use_kb_window[0];
-        cpe->ch[1].ics = cpe->ch[0].ics;
-        cpe->ch[1].ics.use_kb_window[1] = i;
-        if (cpe->ch[1].ics.predictor_present &&
-            (ac->oc[1].m4ac.object_type != AOT_AAC_MAIN))
-            if ((cpe->ch[1].ics.ltp.present = get_bits(gb, 1)))
-                decode_ltp(&cpe->ch[1].ics.ltp, gb, cpe->ch[1].ics.max_sfb);
-        ms_present = get_bits(gb, 2);
-        if (ms_present == 3) {
-            av_log(ac->avctx, AV_LOG_ERROR, "ms_present = 3 is reserved.\n");
-            return AVERROR_INVALIDDATA;
-        } else if (ms_present)
-            decode_mid_side_stereo(cpe, gb, ms_present);
-    }
-    if ((ret = decode_ics(ac, &cpe->ch[0], gb, common_window, 0)))
-        return ret;
-    if ((ret = decode_ics(ac, &cpe->ch[1], gb, common_window, 0)))
-        return ret;
-
-    if (common_window) {
-        if (ms_present)
-            apply_mid_side_stereo(ac, cpe);
-        if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN) {
-            apply_prediction(ac, &cpe->ch[0]);
-            apply_prediction(ac, &cpe->ch[1]);
-        }
-    }
-
-    apply_intensity_stereo(ac, cpe, ms_present);
-    return 0;
-}
-
-static const float cce_scale[] = {
-    1.09050773266525765921, //2^(1/8)
-    1.18920711500272106672, //2^(1/4)
-    M_SQRT2,
-    2,
-};
-
-/**
- * Decode coupling_channel_element; reference: table 4.8.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_cce(AACContext *ac, GetBitContext *gb, ChannelElement *che)
-{
-    int num_gain = 0;
-    int c, g, sfb, ret;
-    int sign;
-    float scale;
-    SingleChannelElement *sce = &che->ch[0];
-    ChannelCoupling     *coup = &che->coup;
-
-    coup->coupling_point = 2 * get_bits1(gb);
-    coup->num_coupled = get_bits(gb, 3);
-    for (c = 0; c <= coup->num_coupled; c++) {
-        num_gain++;
-        coup->type[c] = get_bits1(gb) ? TYPE_CPE : TYPE_SCE;
-        coup->id_select[c] = get_bits(gb, 4);
-        if (coup->type[c] == TYPE_CPE) {
-            coup->ch_select[c] = get_bits(gb, 2);
-            if (coup->ch_select[c] == 3)
-                num_gain++;
-        } else
-            coup->ch_select[c] = 2;
-    }
-    coup->coupling_point += get_bits1(gb) || (coup->coupling_point >> 1);
-
-    sign  = get_bits(gb, 1);
-    scale = cce_scale[get_bits(gb, 2)];
-
-    if ((ret = decode_ics(ac, sce, gb, 0, 0)))
-        return ret;
-
-    for (c = 0; c < num_gain; c++) {
-        int idx  = 0;
-        int cge  = 1;
-        int gain = 0;
-        float gain_cache = 1.0;
-        if (c) {
-            cge = coup->coupling_point == AFTER_IMDCT ? 1 : get_bits1(gb);
-            gain = cge ? get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60: 0;
-            gain_cache = powf(scale, -gain);
-        }
-        if (coup->coupling_point == AFTER_IMDCT) {
-            coup->gain[c][0] = gain_cache;
-        } else {
-            for (g = 0; g < sce->ics.num_window_groups; g++) {
-                for (sfb = 0; sfb < sce->ics.max_sfb; sfb++, idx++) {
-                    if (sce->band_type[idx] != ZERO_BT) {
-                        if (!cge) {
-                            int t = get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                            if (t) {
-                                int s = 1;
-                                t = gain += t;
-                                if (sign) {
-                                    s  -= 2 * (t & 0x1);
-                                    t >>= 1;
-                                }
-                                gain_cache = powf(scale, -t) * s;
-                            }
-                        }
-                        coup->gain[c][idx] = gain_cache;
-                    }
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Parse whether channels are to be excluded from Dynamic Range Compression; reference: table 4.53.
- *
- * @return  Returns number of bytes consumed.
- */
-static int decode_drc_channel_exclusions(DynamicRangeControl *che_drc,
-                                         GetBitContext *gb)
-{
-    int i;
-    int num_excl_chan = 0;
-
-    do {
-        for (i = 0; i < 7; i++)
-            che_drc->exclude_mask[num_excl_chan++] = get_bits1(gb);
-    } while (num_excl_chan < MAX_CHANNELS - 7 && get_bits1(gb));
-
-    return num_excl_chan / 7;
-}
-
-/**
- * Decode dynamic range information; reference: table 4.52.
- *
- * @return  Returns number of bytes consumed.
- */
-static int decode_dynamic_range(DynamicRangeControl *che_drc,
-                                GetBitContext *gb)
-{
-    int n             = 1;
-    int drc_num_bands = 1;
-    int i;
-
-    /* pce_tag_present? */
-    if (get_bits1(gb)) {
-        che_drc->pce_instance_tag  = get_bits(gb, 4);
-        skip_bits(gb, 4); // tag_reserved_bits
-        n++;
-    }
-
-    /* excluded_chns_present? */
-    if (get_bits1(gb)) {
-        n += decode_drc_channel_exclusions(che_drc, gb);
-    }
-
-    /* drc_bands_present? */
-    if (get_bits1(gb)) {
-        che_drc->band_incr            = get_bits(gb, 4);
-        che_drc->interpolation_scheme = get_bits(gb, 4);
-        n++;
-        drc_num_bands += che_drc->band_incr;
-        for (i = 0; i < drc_num_bands; i++) {
-            che_drc->band_top[i] = get_bits(gb, 8);
-            n++;
-        }
-    }
-
-    /* prog_ref_level_present? */
-    if (get_bits1(gb)) {
-        che_drc->prog_ref_level = get_bits(gb, 7);
-        skip_bits1(gb); // prog_ref_level_reserved_bits
-        n++;
-    }
-
-    for (i = 0; i < drc_num_bands; i++) {
-        che_drc->dyn_rng_sgn[i] = get_bits1(gb);
-        che_drc->dyn_rng_ctl[i] = get_bits(gb, 7);
-        n++;
-    }
-
-    return n;
-}
-
-/**
- * Decode extension data (incomplete); reference: table 4.51.
- *
- * @param   cnt length of TYPE_FIL syntactic element in bytes
- *
- * @return Returns number of bytes consumed
- */
-static int decode_extension_payload(AACContext *ac, GetBitContext *gb, int cnt,
-                                    ChannelElement *che, enum RawDataBlockType elem_type)
-{
-    int crc_flag = 0;
-    int res = cnt;
-    switch (get_bits(gb, 4)) { // extension type
-    case EXT_SBR_DATA_CRC:
-        crc_flag++;
-    case EXT_SBR_DATA:
-        if (!che) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
-            return res;
-        } else if (!ac->oc[1].m4ac.sbr) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
-        } else if (ac->oc[1].m4ac.sbr == -1 && ac->oc[1].status == OC_LOCKED) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Implicit SBR was found with a first occurrence after the first frame.\n");
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
-        } else if (ac->oc[1].m4ac.ps == -1 && ac->oc[1].status < OC_LOCKED && ac->avctx->channels == 1) {
-            ac->oc[1].m4ac.sbr = 1;
-            ac->oc[1].m4ac.ps = 1;
-            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
-            output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
-                             ac->oc[1].status, 1);
-        } else {
-            ac->oc[1].m4ac.sbr = 1;
-            ac->avctx->profile = FF_PROFILE_AAC_HE;
-        }
-        res = ff_decode_sbr_extension(ac, &che->sbr, gb, crc_flag, cnt, elem_type);
-        break;
-    case EXT_DYNAMIC_RANGE:
-        res = decode_dynamic_range(&ac->che_drc, gb);
-        break;
-    case EXT_FILL:
-    case EXT_FILL_DATA:
-    case EXT_DATA_ELEMENT:
-    default:
-        skip_bits_long(gb, 8 * cnt - 4);
-        break;
-    };
-    return res;
-}
-
-/**
- * Decode Temporal Noise Shaping filter coefficients and apply all-pole filters; reference: 4.6.9.3.
- *
- * @param   decode  1 if tool is used normally, 0 if tool is used in LTP.
- * @param   coef    spectral coefficients
- */
-static void apply_tns(float coef[1024], TemporalNoiseShaping *tns,
-                      IndividualChannelStream *ics, int decode)
-{
-    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
-    int w, filt, m, i;
-    int bottom, top, order, start, end, size, inc;
-    float lpc[TNS_MAX_ORDER];
-    float tmp[TNS_MAX_ORDER + 1];
-
-    for (w = 0; w < ics->num_windows; w++) {
-        bottom = ics->num_swb;
-        for (filt = 0; filt < tns->n_filt[w]; filt++) {
-            top    = bottom;
-            bottom = FFMAX(0, top - tns->length[w][filt]);
-            order  = tns->order[w][filt];
-            if (order == 0)
-                continue;
-
-            // tns_decode_coef
-            compute_lpc_coefs(tns->coef[w][filt], order, lpc, 0, 0, 0);
-
-            start = ics->swb_offset[FFMIN(bottom, mmm)];
-            end   = ics->swb_offset[FFMIN(   top, mmm)];
-            if ((size = end - start) <= 0)
-                continue;
-            if (tns->direction[w][filt]) {
-                inc = -1;
-                start = end - 1;
-            } else {
-                inc = 1;
-            }
-            start += w * 128;
-
-            if (decode) {
-                // ar filter
-                for (m = 0; m < size; m++, start += inc)
-                    for (i = 1; i <= FFMIN(m, order); i++)
-                        coef[start] -= coef[start - i * inc] * lpc[i - 1];
-            } else {
-                // ma filter
-                for (m = 0; m < size; m++, start += inc) {
-                    tmp[0] = coef[start];
-                    for (i = 1; i <= FFMIN(m, order); i++)
-                        coef[start] += tmp[i] * lpc[i - 1];
-                    for (i = order; i > 0; i--)
-                        tmp[i] = tmp[i - 1];
-                }
-            }
-        }
-    }
-}
-
-/**
- *  Apply windowing and MDCT to obtain the spectral
- *  coefficient from the predicted sample by LTP.
- */
-static void windowing_and_mdct_ltp(AACContext *ac, float *out,
-                                   float *in, IndividualChannelStream *ics)
-{
-    const float *lwindow      = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
-
-    if (ics->window_sequence[0] != LONG_STOP_SEQUENCE) {
-        ac->fdsp.vector_fmul(in, in, lwindow_prev, 1024);
-    } else {
-        memset(in, 0, 448 * sizeof(float));
-        ac->fdsp.vector_fmul(in + 448, in + 448, swindow_prev, 128);
-    }
-    if (ics->window_sequence[0] != LONG_START_SEQUENCE) {
-        ac->fdsp.vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024);
-    } else {
-        ac->fdsp.vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128);
-        memset(in + 1024 + 576, 0, 448 * sizeof(float));
-    }
-    ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in);
-}
-
-/**
- * Apply the long term prediction
- */
-static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
-{
-    const LongTermPrediction *ltp = &sce->ics.ltp;
-    const uint16_t *offsets = sce->ics.swb_offset;
-    int i, sfb;
-
-    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
-        float *predTime = sce->ret;
-        float *predFreq = ac->buf_mdct;
-        int16_t num_samples = 2048;
-
-        if (ltp->lag < 1024)
-            num_samples = ltp->lag + 1024;
-        for (i = 0; i < num_samples; i++)
-            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
-        memset(&predTime[i], 0, (2048 - i) * sizeof(float));
-
-        windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
-
-        if (sce->tns.present)
-            apply_tns(predFreq, &sce->tns, &sce->ics, 0);
-
-        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
-            if (ltp->used[sfb])
-                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
-                    sce->coeffs[i] += predFreq[i];
-    }
-}
-
-/**
- * Update the LTP buffer for next frame
- */
-static void update_ltp(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *saved     = sce->saved;
-    float *saved_ltp = sce->coeffs;
-    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    int i;
-
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(saved_ltp,       saved, 512 * sizeof(float));
-        memset(saved_ltp + 576, 0,     448 * sizeof(float));
-        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
-        for (i = 0; i < 64; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i];
-    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(saved_ltp,       ac->buf_mdct + 512, 448 * sizeof(float));
-        memset(saved_ltp + 576, 0,                  448 * sizeof(float));
-        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
-        for (i = 0; i < 64; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i];
-    } else { // LONG_STOP or ONLY_LONG
-        ac->fdsp.vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
-        for (i = 0; i < 512; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * lwindow[511 - i];
-    }
-
-    memcpy(sce->ltp_state,      sce->ltp_state+1024, 1024 * sizeof(*sce->ltp_state));
-    memcpy(sce->ltp_state+1024, sce->ret,            1024 * sizeof(*sce->ltp_state));
-    memcpy(sce->ltp_state+2048, saved_ltp,           1024 * sizeof(*sce->ltp_state));
-}
-
-/**
- * Conduct IMDCT and windowing.
- */
-static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
-    float *buf  = ac->buf_mdct;
-    float *temp = ac->temp;
-    int i;
-
-    // imdct
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        for (i = 0; i < 1024; i += 128)
-            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
-    } else
-        ac->mdct.imdct_half(&ac->mdct, buf, in);
-
-    /* window overlapping
-     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
-     * and long to short transitions are considered to be short to short
-     * transitions. This leaves just two cases (long to long and short to short)
-     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
-     */
-    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
-            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
-        ac->fdsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
-    } else {
-        memcpy(                         out,               saved,            448 * sizeof(float));
-
-        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-            ac->fdsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
-            ac->fdsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
-            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(float));
-        } else {
-            ac->fdsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
-            memcpy(                     out + 576,         buf + 64,         448 * sizeof(float));
-        }
-    }
-
-    // buffer update
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(                     saved,       temp + 64,         64 * sizeof(float));
-        ac->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
-        ac->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
-        ac->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
-        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
-    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(                     saved,       buf + 512,        448 * sizeof(float));
-        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
-    } else { // LONG_STOP or ONLY_LONG
-        memcpy(                     saved,       buf + 512,        512 * sizeof(float));
-    }
-}
-
-static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    float *buf  = ac->buf_mdct;
-
-    // imdct
-    ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
-
-    // window overlapping
-    if (ics->use_kb_window[1]) {
-        // AAC LD uses a low overlap sine window instead of a KBD window
-        memcpy(out, saved, 192 * sizeof(float));
-        ac->fdsp.vector_fmul_window(out + 192, saved + 192, buf, ff_sine_128, 64);
-        memcpy(                     out + 320, buf + 64, 192 * sizeof(float));
-    } else {
-        ac->fdsp.vector_fmul_window(out, saved, buf, ff_sine_512, 256);
-    }
-
-    // buffer update
-    memcpy(saved, buf + 256, 256 * sizeof(float));
-}
-
-static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
-{
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    float *buf  = ac->buf_mdct;
-    int i;
-    const int n  = ac->oc[1].m4ac.frame_length_short ? 480 : 512;
-    const int n2 = n >> 1;
-    const int n4 = n >> 2;
-    const float *const window = n == 480 ? ff_aac_eld_window_480 :
-                                           ff_aac_eld_window_512;
-
-    // Inverse transform, mapped to the conventional IMDCT by
-    // Chivukula, R.K.; Reznik, Y.A.; Devarajan, V.,
-    // "Efficient algorithms for MPEG-4 AAC-ELD, AAC-LD and AAC-LC filterbanks,"
-    // Audio, Language and Image Processing, 2008. ICALIP 2008. International Conference on
-    // URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4590245&isnumber=4589950
-    for (i = 0; i < n2; i+=2) {
-        float temp;
-        temp =  in[i    ]; in[i    ] = -in[n - 1 - i]; in[n - 1 - i] = temp;
-        temp = -in[i + 1]; in[i + 1] =  in[n - 2 - i]; in[n - 2 - i] = temp;
-    }
-    if (n == 480)
-        ac->mdct480->imdct_half(ac->mdct480, buf, in, 1, -1.f/(16*1024*960));
-    else
-        ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
-    for (i = 0; i < n; i+=2) {
-        buf[i] = -buf[i];
-    }
-    // Like with the regular IMDCT at this point we still have the middle half
-    // of a transform but with even symmetry on the left and odd symmetry on
-    // the right
-
-    // window overlapping
-    // The spec says to use samples [0..511] but the reference decoder uses
-    // samples [128..639].
-    for (i = n4; i < n2; i ++) {
-        out[i - n4] =    buf[n2 - 1 - i]       * window[i       - n4] +
-                       saved[      i + n2]     * window[i +   n - n4] +
-                      -saved[  n + n2 - 1 - i] * window[i + 2*n - n4] +
-                      -saved[2*n + n2 + i]     * window[i + 3*n - n4];
-    }
-    for (i = 0; i < n2; i ++) {
-        out[n4 + i] =    buf[i]               * window[i + n2       - n4] +
-                      -saved[      n - 1 - i] * window[i + n2 +   n - n4] +
-                      -saved[  n + i]         * window[i + n2 + 2*n - n4] +
-                       saved[2*n + n - 1 - i] * window[i + n2 + 3*n - n4];
-    }
-    for (i = 0; i < n4; i ++) {
-        out[n2 + n4 + i] =    buf[      i + n2]     * window[i +   n - n4] +
-                           -saved[      n2 - 1 - i] * window[i + 2*n - n4] +
-                           -saved[  n + n2 + i]     * window[i + 3*n - n4];
-    }
-
-    // buffer update
-    memmove(saved + n, saved, 2 * n * sizeof(float));
-    memcpy( saved,       buf,     n * sizeof(float));
-}
-
-/**
  * Apply dependent channel coupling (applied before IMDCT).
  *
  * @param   index   index into coupling gain array
@@ -2606,441 +257,7 @@ static void apply_independent_coupling(AACContext *ac,
         dest[i] += gain * src[i];
 }
 
-/**
- * channel coupling transformation interface
- *
- * @param   apply_coupling_method   pointer to (in)dependent coupling function
- */
-static void apply_channel_coupling(AACContext *ac, ChannelElement *cc,
-                                   enum RawDataBlockType type, int elem_id,
-                                   enum CouplingPoint coupling_point,
-                                   void (*apply_coupling_method)(AACContext *ac, SingleChannelElement *target, ChannelElement *cce, int index))
-{
-    int i, c;
-
-    for (i = 0; i < MAX_ELEM_ID; i++) {
-        ChannelElement *cce = ac->che[TYPE_CCE][i];
-        int index = 0;
-
-        if (cce && cce->coup.coupling_point == coupling_point) {
-            ChannelCoupling *coup = &cce->coup;
-
-            for (c = 0; c <= coup->num_coupled; c++) {
-                if (coup->type[c] == type && coup->id_select[c] == elem_id) {
-                    if (coup->ch_select[c] != 1) {
-                        apply_coupling_method(ac, &cc->ch[0], cce, index);
-                        if (coup->ch_select[c] != 0)
-                            index++;
-                    }
-                    if (coup->ch_select[c] != 2)
-                        apply_coupling_method(ac, &cc->ch[1], cce, index++);
-                } else
-                    index += 1 + (coup->ch_select[c] == 3);
-            }
-        }
-    }
-}
-
-/**
- * Convert spectral data to float samples, applying all supported tools as appropriate.
- */
-static void spectral_to_sample(AACContext *ac)
-{
-    int i, type;
-    void (*imdct_and_window)(AACContext *ac, SingleChannelElement *sce);
-    switch (ac->oc[1].m4ac.object_type) {
-    case AOT_ER_AAC_LD:
-        imdct_and_window = imdct_and_windowing_ld;
-        break;
-    case AOT_ER_AAC_ELD:
-        imdct_and_window = imdct_and_windowing_eld;
-        break;
-    default:
-        imdct_and_window = imdct_and_windowing;
-    }
-    for (type = 3; type >= 0; type--) {
-        for (i = 0; i < MAX_ELEM_ID; i++) {
-            ChannelElement *che = ac->che[type][i];
-            if (che) {
-                if (type <= TYPE_CPE)
-                    apply_channel_coupling(ac, che, type, i, BEFORE_TNS, apply_dependent_coupling);
-                if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
-                    if (che->ch[0].ics.predictor_present) {
-                        if (che->ch[0].ics.ltp.present)
-                            apply_ltp(ac, &che->ch[0]);
-                        if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
-                            apply_ltp(ac, &che->ch[1]);
-                    }
-                }
-                if (che->ch[0].tns.present)
-                    apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
-                if (che->ch[1].tns.present)
-                    apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
-                if (type <= TYPE_CPE)
-                    apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, apply_dependent_coupling);
-                if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
-                    imdct_and_window(ac, &che->ch[0]);
-                    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                        update_ltp(ac, &che->ch[0]);
-                    if (type == TYPE_CPE) {
-                        imdct_and_window(ac, &che->ch[1]);
-                        if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                            update_ltp(ac, &che->ch[1]);
-                    }
-                    if (ac->oc[1].m4ac.sbr > 0) {
-                        ff_sbr_apply(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
-                    }
-                }
-                if (type <= TYPE_CCE)
-                    apply_channel_coupling(ac, che, type, i, AFTER_IMDCT, apply_independent_coupling);
-            }
-        }
-    }
-}
-
-static int parse_adts_frame_header(AACContext *ac, GetBitContext *gb)
-{
-    int size;
-    AACADTSHeaderInfo hdr_info;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int layout_map_tags, ret;
-
-    size = ff_adts_header_parse(gb, &hdr_info);
-    if (size > 0) {
-        if (hdr_info.num_aac_frames != 1) {
-            avpriv_report_missing_feature(ac->avctx,
-                                          "More than one AAC RDB per ADTS frame");
-            return AVERROR_PATCHWELCOME;
-        }
-        push_output_configuration(ac);
-        if (hdr_info.chan_config) {
-            ac->oc[1].m4ac.chan_config = hdr_info.chan_config;
-            if ((ret = set_default_channel_config(ac->avctx,
-                                                  layout_map,
-                                                  &layout_map_tags,
-                                                  hdr_info.chan_config)) < 0)
-                return ret;
-            if ((ret = output_configure(ac, layout_map, layout_map_tags,
-                                        FFMAX(ac->oc[1].status,
-                                              OC_TRIAL_FRAME), 0)) < 0)
-                return ret;
-        } else {
-            ac->oc[1].m4ac.chan_config = 0;
-        }
-        ac->oc[1].m4ac.sample_rate     = hdr_info.sample_rate;
-        ac->oc[1].m4ac.sampling_index  = hdr_info.sampling_index;
-        ac->oc[1].m4ac.object_type     = hdr_info.object_type;
-        ac->oc[1].m4ac.frame_length_short = 0;
-        if (ac->oc[0].status != OC_LOCKED ||
-            ac->oc[0].m4ac.chan_config != hdr_info.chan_config ||
-            ac->oc[0].m4ac.sample_rate != hdr_info.sample_rate) {
-            ac->oc[1].m4ac.sbr = -1;
-            ac->oc[1].m4ac.ps  = -1;
-        }
-        if (!hdr_info.crc_absent)
-            skip_bits(gb, 16);
-    }
-    return size;
-}
-
-static int aac_decode_er_frame(AVCodecContext *avctx, void *data,
-                               int *got_frame_ptr, GetBitContext *gb)
-{
-    AACContext *ac = avctx->priv_data;
-    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
-    ChannelElement *che;
-    int err, i;
-    int samples = m4ac->frame_length_short ? 960 : 1024;
-    int chan_config = m4ac->chan_config;
-    int aot = m4ac->object_type;
-
-    if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
-        samples >>= 1;
-
-    ac->frame = data;
-
-    if ((err = frame_configure_elements(avctx)) < 0)
-        return err;
-
-    // The FF_PROFILE_AAC_* defines are all object_type - 1
-    // This may lead to an undefined profile being signaled
-    ac->avctx->profile = aot - 1;
-
-    ac->tags_mapped = 0;
-
-    if (chan_config < 0 || (chan_config >= 8 && chan_config < 11) || chan_config >= 13) {
-        avpriv_request_sample(avctx, "Unknown ER channel configuration %d",
-                              chan_config);
-        return AVERROR_INVALIDDATA;
-    }
-    for (i = 0; i < tags_per_config[chan_config]; i++) {
-        const int elem_type = aac_channel_layout_map[chan_config-1][i][0];
-        const int elem_id   = aac_channel_layout_map[chan_config-1][i][1];
-        if (!(che=get_che(ac, elem_type, elem_id))) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "channel element %d.%d is not allocated\n",
-                   elem_type, elem_id);
-            return AVERROR_INVALIDDATA;
-        }
-        if (aot != AOT_ER_AAC_ELD)
-            skip_bits(gb, 4);
-        switch (elem_type) {
-        case TYPE_SCE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            break;
-        case TYPE_CPE:
-            err = decode_cpe(ac, gb, che);
-            break;
-        case TYPE_LFE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            break;
-        }
-        if (err < 0)
-            return err;
-    }
-
-    spectral_to_sample(ac);
-
-    ac->frame->nb_samples = samples;
-    ac->frame->sample_rate = avctx->sample_rate;
-    *got_frame_ptr = 1;
-
-    skip_bits_long(gb, get_bits_left(gb));
-    return 0;
-}
-
-static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
-                                int *got_frame_ptr, GetBitContext *gb)
-{
-    AACContext *ac = avctx->priv_data;
-    ChannelElement *che = NULL, *che_prev = NULL;
-    enum RawDataBlockType elem_type, elem_type_prev = TYPE_END;
-    int err, elem_id;
-    int samples = 0, multiplier, audio_found = 0, pce_found = 0;
-
-    ac->frame = data;
-
-    if (show_bits(gb, 12) == 0xfff) {
-        if ((err = parse_adts_frame_header(ac, gb)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
-            goto fail;
-        }
-        if (ac->oc[1].m4ac.sampling_index > 12) {
-            av_log(ac->avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->oc[1].m4ac.sampling_index);
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-    }
-
-    if (avctx->channels)
-        if ((err = frame_configure_elements(avctx)) < 0)
-            goto fail;
-
-    // The FF_PROFILE_AAC_* defines are all object_type - 1
-    // This may lead to an undefined profile being signaled
-    ac->avctx->profile = ac->oc[1].m4ac.object_type - 1;
-
-    ac->tags_mapped = 0;
-    // parse
-    while ((elem_type = get_bits(gb, 3)) != TYPE_END) {
-        elem_id = get_bits(gb, 4);
-
-        if (!avctx->channels && elem_type != TYPE_PCE) {
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-
-        if (elem_type < TYPE_DSE) {
-            if (!(che=get_che(ac, elem_type, elem_id))) {
-                av_log(ac->avctx, AV_LOG_ERROR, "channel element %d.%d is not allocated\n",
-                       elem_type, elem_id);
-                err = AVERROR_INVALIDDATA;
-                goto fail;
-            }
-            samples = 1024;
-        }
-
-        switch (elem_type) {
-
-        case TYPE_SCE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            audio_found = 1;
-            break;
-
-        case TYPE_CPE:
-            err = decode_cpe(ac, gb, che);
-            audio_found = 1;
-            break;
-
-        case TYPE_CCE:
-            err = decode_cce(ac, gb, che);
-            break;
-
-        case TYPE_LFE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            audio_found = 1;
-            break;
-
-        case TYPE_DSE:
-            err = skip_data_stream_element(ac, gb);
-            break;
-
-        case TYPE_PCE: {
-            uint8_t layout_map[MAX_ELEM_ID*4][3];
-            int tags;
-            push_output_configuration(ac);
-            tags = decode_pce(avctx, &ac->oc[1].m4ac, layout_map, gb);
-            if (tags < 0) {
-                err = tags;
-                break;
-            }
-            if (pce_found) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Not evaluating a further program_config_element as this construct is dubious at best.\n");
-                pop_output_configuration(ac);
-            } else {
-                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
-                pce_found = 1;
-            }
-            break;
-        }
-
-        case TYPE_FIL:
-            if (elem_id == 15)
-                elem_id += get_bits(gb, 8) - 1;
-            if (get_bits_left(gb) < 8 * elem_id) {
-                    av_log(avctx, AV_LOG_ERROR, overread_err);
-                    err = AVERROR_INVALIDDATA;
-                    goto fail;
-            }
-            while (elem_id > 0)
-                elem_id -= decode_extension_payload(ac, gb, elem_id, che_prev, elem_type_prev);
-            err = 0; /* FIXME */
-            break;
-
-        default:
-            err = AVERROR_BUG; /* should not happen, but keeps compiler happy */
-            break;
-        }
-
-        che_prev       = che;
-        elem_type_prev = elem_type;
-
-        if (err)
-            goto fail;
-
-        if (get_bits_left(gb) < 3) {
-            av_log(avctx, AV_LOG_ERROR, overread_err);
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-    }
-
-    if (!avctx->channels) {
-        *got_frame_ptr = 0;
-        return 0;
-    }
-
-    spectral_to_sample(ac);
-
-    multiplier = (ac->oc[1].m4ac.sbr == 1) ? ac->oc[1].m4ac.ext_sample_rate > ac->oc[1].m4ac.sample_rate : 0;
-    samples <<= multiplier;
-
-    if (ac->oc[1].status && audio_found) {
-        avctx->sample_rate = ac->oc[1].m4ac.sample_rate << multiplier;
-        avctx->frame_size = samples;
-        ac->oc[1].status = OC_LOCKED;
-    }
-
-    if (samples) {
-        ac->frame->nb_samples = samples;
-        ac->frame->sample_rate = avctx->sample_rate;
-    }
-    *got_frame_ptr = !!samples;
-
-    return 0;
-fail:
-    pop_output_configuration(ac);
-    return err;
-}
-
-static int aac_decode_frame(AVCodecContext *avctx, void *data,
-                            int *got_frame_ptr, AVPacket *avpkt)
-{
-    AACContext *ac = avctx->priv_data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
-    GetBitContext gb;
-    int buf_consumed;
-    int buf_offset;
-    int err;
-    int new_extradata_size;
-    const uint8_t *new_extradata = av_packet_get_side_data(avpkt,
-                                       AV_PKT_DATA_NEW_EXTRADATA,
-                                       &new_extradata_size);
-
-    if (new_extradata) {
-        av_free(avctx->extradata);
-        avctx->extradata = av_mallocz(new_extradata_size +
-                                      AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata)
-            return AVERROR(ENOMEM);
-        avctx->extradata_size = new_extradata_size;
-        memcpy(avctx->extradata, new_extradata, new_extradata_size);
-        push_output_configuration(ac);
-        if (decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
-                                         avctx->extradata,
-                                         avctx->extradata_size*8, 1) < 0) {
-            pop_output_configuration(ac);
-            return AVERROR_INVALIDDATA;
-        }
-    }
-
-    if ((err = init_get_bits(&gb, buf, buf_size * 8)) < 0)
-        return err;
-
-    switch (ac->oc[1].m4ac.object_type) {
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LTP:
-    case AOT_ER_AAC_LD:
-    case AOT_ER_AAC_ELD:
-        err = aac_decode_er_frame(avctx, data, got_frame_ptr, &gb);
-        break;
-    default:
-        err = aac_decode_frame_int(avctx, data, got_frame_ptr, &gb);
-    }
-    if (err < 0)
-        return err;
-
-    buf_consumed = (get_bits_count(&gb) + 7) >> 3;
-    for (buf_offset = buf_consumed; buf_offset < buf_size; buf_offset++)
-        if (buf[buf_offset])
-            break;
-
-    return buf_size > buf_offset ? buf_consumed : buf_size;
-}
-
-static av_cold int aac_decode_close(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int i, type;
-
-    for (i = 0; i < MAX_ELEM_ID; i++) {
-        for (type = 0; type < 4; type++) {
-            if (ac->che[type][i])
-                ff_aac_sbr_ctx_close(&ac->che[type][i]->sbr);
-            av_freep(&ac->che[type][i]);
-        }
-    }
-
-    ff_mdct_end(&ac->mdct);
-    ff_mdct_end(&ac->mdct_small);
-    ff_mdct_end(&ac->mdct_ld);
-    ff_mdct_end(&ac->mdct_ltp);
-    ff_imdct15_uninit(&ac->mdct480);
-    return 0;
-}
-
+#include "aacdec_template.c"
 
 #define LOAS_SYNC_WORD   0x2b7       ///< 11 bits LOAS sync word
 
@@ -3067,38 +284,48 @@ static int latm_decode_audio_specific_config(struct LATMContext *latmctx,
     AACContext *ac        = &latmctx->aac_ctx;
     AVCodecContext *avctx = ac->avctx;
     MPEG4AudioConfig m4ac = { 0 };
+    GetBitContext gbc;
     int config_start_bit  = get_bits_count(gb);
     int sync_extension    = 0;
-    int bits_consumed, esize;
+    int bits_consumed, esize, i;
 
-    if (asclen) {
+    if (asclen > 0) {
         sync_extension = 1;
         asclen         = FFMIN(asclen, get_bits_left(gb));
-    } else
-        asclen         = get_bits_left(gb);
-
-    if (config_start_bit % 8) {
-        avpriv_request_sample(latmctx->aac_ctx.avctx,
-                              "Non-byte-aligned audio-specific config");
-        return AVERROR_PATCHWELCOME;
+        init_get_bits(&gbc, gb->buffer, config_start_bit + asclen);
+        skip_bits_long(&gbc, config_start_bit);
+    } else if (asclen == 0) {
+        gbc = *gb;
+    } else {
+        return AVERROR_INVALIDDATA;
     }
-    if (asclen <= 0)
+
+    if (get_bits_left(gb) <= 0)
         return AVERROR_INVALIDDATA;
-    bits_consumed = decode_audio_specific_config(NULL, avctx, &m4ac,
-                                         gb->buffer + (config_start_bit / 8),
-                                         asclen, sync_extension);
 
-    if (bits_consumed < 0)
+    bits_consumed = decode_audio_specific_config_gb(NULL, avctx, &m4ac,
+                                                    &gbc, config_start_bit,
+                                                    sync_extension);
+
+    if (bits_consumed < config_start_bit)
         return AVERROR_INVALIDDATA;
+    bits_consumed -= config_start_bit;
+
+    if (asclen == 0)
+      asclen = bits_consumed;
 
     if (!latmctx->initialized ||
         ac->oc[1].m4ac.sample_rate != m4ac.sample_rate ||
         ac->oc[1].m4ac.chan_config != m4ac.chan_config) {
 
-        av_log(avctx, AV_LOG_INFO, "audio config changed\n");
+        if (latmctx->initialized) {
+            av_log(avctx, AV_LOG_INFO, "audio config changed (sample_rate=%d, chan_config=%d)\n", m4ac.sample_rate, m4ac.chan_config);
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "initializing latmctx\n");
+        }
         latmctx->initialized = 0;
 
-        esize = (bits_consumed+7) / 8;
+        esize = (asclen + 7) / 8;
 
         if (avctx->extradata_size < esize) {
             av_free(avctx->extradata);
@@ -3108,12 +335,15 @@ static int latm_decode_audio_specific_config(struct LATMContext *latmctx,
         }
 
         avctx->extradata_size = esize;
-        memcpy(avctx->extradata, gb->buffer + (config_start_bit/8), esize);
+        gbc = *gb;
+        for (i = 0; i < esize; i++) {
+          avctx->extradata[i] = get_bits(&gbc, 8);
+        }
         memset(avctx->extradata+esize, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     }
-    skip_bits_long(gb, bits_consumed);
+    skip_bits_long(gb, asclen);
 
-    return bits_consumed;
+    return 0;
 }
 
 static int read_stream_mux_config(struct LATMContext *latmctx,
@@ -3138,9 +368,9 @@ static int read_stream_mux_config(struct LATMContext *latmctx,
             return AVERROR_PATCHWELCOME;
         }
 
-        // for each program (which there is only on in DVB)
+        // for each program (which there is only one in DVB)
 
-        // for each layer (which there is only on in DVB)
+        // for each layer (which there is only one in DVB)
         if (get_bits(gb, 3)) {                   // numLayer
             avpriv_request_sample(latmctx->aac_ctx.avctx, "Multiple layers");
             return AVERROR_PATCHWELCOME;
@@ -3154,8 +384,6 @@ static int read_stream_mux_config(struct LATMContext *latmctx,
             int ascLen = latm_get_value(gb);
             if ((ret = latm_decode_audio_specific_config(latmctx, gb, ascLen)) < 0)
                 return ret;
-            ascLen -= ret;
-            skip_bits_long(gb, ascLen);
         }
 
         latmctx->frame_length_type = get_bits(gb, 3);
@@ -3203,6 +431,8 @@ static int read_payload_length_info(struct LATMContext *ctx, GetBitContext *gb)
     if (ctx->frame_length_type == 0) {
         int mux_slot_length = 0;
         do {
+            if (get_bits_left(gb) < 8)
+                return AVERROR_INVALIDDATA;
             tmp = get_bits(gb, 8);
             mux_slot_length += tmp;
         } while (tmp == 255);
@@ -3232,7 +462,7 @@ static int read_audio_mux_element(struct LATMContext *latmctx,
     }
     if (latmctx->audio_mux_version_A == 0) {
         int mux_slot_length_bytes = read_payload_length_info(latmctx, gb);
-        if (mux_slot_length_bytes * 8 > get_bits_left(gb)) {
+        if (mux_slot_length_bytes < 0 || mux_slot_length_bytes * 8LL > get_bits_left(gb)) {
             av_log(latmctx->aac_ctx.avctx, AV_LOG_ERROR, "incomplete frame\n");
             return AVERROR_INVALIDDATA;
         } else if (mux_slot_length_bytes * 8 + 256 < get_bits_left(gb)) {
@@ -3253,7 +483,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
     int                 muxlength, err;
     GetBitContext       gb;
 
-    if ((err = init_get_bits(&gb, avpkt->data, avpkt->size * 8)) < 0)
+    if ((err = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
         return err;
 
     // check for LOAS sync word
@@ -3261,7 +491,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
         return AVERROR_INVALIDDATA;
 
     muxlength = get_bits(&gb, 13) + 3;
-    // not enough data, the parser should have sorted this
+    // not enough data, the parser should have sorted this out
     if (muxlength > avpkt->size)
         return AVERROR_INVALIDDATA;
 
@@ -3276,7 +506,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
             push_output_configuration(&latmctx->aac_ctx);
             if ((err = decode_audio_specific_config(
                     &latmctx->aac_ctx, avctx, &latmctx->aac_ctx.oc[1].m4ac,
-                    avctx->extradata, avctx->extradata_size*8, 1)) < 0) {
+                    avctx->extradata, avctx->extradata_size*8LL, 1)) < 0) {
                 pop_output_configuration(&latmctx->aac_ctx);
                 return err;
             }
@@ -3299,7 +529,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
         err = aac_decode_er_frame(avctx, out, got_frame_ptr, &gb);
         break;
     default:
-        err = aac_decode_frame_int(avctx, out, got_frame_ptr, &gb);
+        err = aac_decode_frame_int(avctx, out, got_frame_ptr, &gb, avpkt);
     }
     if (err < 0)
         return err;
@@ -3318,7 +548,6 @@ static av_cold int latm_decode_init(AVCodecContext *avctx)
     return ret;
 }
 
-
 AVCodec ff_aac_decoder = {
     .name            = "aac",
     .long_name       = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
@@ -3334,6 +563,9 @@ AVCodec ff_aac_decoder = {
     .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
     .caps_internal   = FF_CODEC_CAP_INIT_THREADSAFE,
     .channel_layouts = aac_channel_layout,
+    .flush = flush,
+    .priv_class      = &aac_decoder_class,
+    .profiles        = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
 };
 
 /*
@@ -3356,4 +588,6 @@ AVCodec ff_aac_latm_decoder = {
     .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
     .caps_internal   = FF_CODEC_CAP_INIT_THREADSAFE,
     .channel_layouts = aac_channel_layout,
+    .flush = flush,
+    .profiles        = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
 };
diff --git a/libavcodec/aacdec_fixed.c b/libavcodec/aacdec_fixed.c
new file mode 100644
index 0000000..2c594c6
--- /dev/null
+++ b/libavcodec/aacdec_fixed.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * AAC decoder fixed-point implementation
+ *
+ * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
+ * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC decoder
+ * @author Oded Shimon  ( ods15 ods15 dyndns org )
+ * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * Fixed point implementation
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#define USE_FIXED 1
+
+#include "libavutil/fixed_dsp.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "fft.h"
+#include "lpc.h"
+#include "kbdwin.h"
+#include "sinewin.h"
+
+#include "aac.h"
+#include "aactab.h"
+#include "aacdectab.h"
+#include "adts_header.h"
+#include "cbrt_data.h"
+#include "sbr.h"
+#include "aacsbr.h"
+#include "mpeg4audio.h"
+#include "profiles.h"
+#include "libavutil/intfloat.h"
+
+#include <math.h>
+#include <string.h>
+
+static av_always_inline void reset_predict_state(PredictorState *ps)
+{
+    ps->r0.mant   = 0;
+    ps->r0.exp   = 0;
+    ps->r1.mant   = 0;
+    ps->r1.exp   = 0;
+    ps->cor0.mant = 0;
+    ps->cor0.exp = 0;
+    ps->cor1.mant = 0;
+    ps->cor1.exp = 0;
+    ps->var0.mant = 0x20000000;
+    ps->var0.exp = 1;
+    ps->var1.mant = 0x20000000;
+    ps->var1.exp = 1;
+}
+
+static const int exp2tab[4] = { Q31(1.0000000000/2), Q31(1.1892071150/2), Q31(1.4142135624/2), Q31(1.6817928305/2) };  // 2^0, 2^0.25, 2^0.5, 2^0.75
+
+static inline int *DEC_SPAIR(int *dst, unsigned idx)
+{
+    dst[0] = (idx & 15) - 4;
+    dst[1] = (idx >> 4 & 15) - 4;
+
+    return dst + 2;
+}
+
+static inline int *DEC_SQUAD(int *dst, unsigned idx)
+{
+    dst[0] = (idx & 3) - 1;
+    dst[1] = (idx >> 2 & 3) - 1;
+    dst[2] = (idx >> 4 & 3) - 1;
+    dst[3] = (idx >> 6 & 3) - 1;
+
+    return dst + 4;
+}
+
+static inline int *DEC_UPAIR(int *dst, unsigned idx, unsigned sign)
+{
+    dst[0] = (idx & 15) * (1 - (sign & 0xFFFFFFFE));
+    dst[1] = (idx >> 4 & 15) * (1 - ((sign & 1) * 2));
+
+    return dst + 2;
+}
+
+static inline int *DEC_UQUAD(int *dst, unsigned idx, unsigned sign)
+{
+    unsigned nz = idx >> 12;
+
+    dst[0] = (idx & 3) * (1 + (((int)sign >> 31) * 2));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[1] = (idx >> 2 & 3) * (1 + (((int)sign >> 31) * 2));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[2] = (idx >> 4 & 3) * (1 + (((int)sign >> 31) * 2));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[3] = (idx >> 6 & 3) * (1 + (((int)sign >> 31) * 2));
+
+    return dst + 4;
+}
+
+static void vector_pow43(int *coefs, int len)
+{
+    int i, coef;
+
+    for (i=0; i<len; i++) {
+        coef = coefs[i];
+        if (coef < 0)
+            coef = -(int)ff_cbrt_tab_fixed[-coef];
+        else
+            coef = (int)ff_cbrt_tab_fixed[coef];
+        coefs[i] = coef;
+    }
+}
+
+static void subband_scale(int *dst, int *src, int scale, int offset, int len, void *log_context)
+{
+    int ssign = scale < 0 ? -1 : 1;
+    int s = FFABS(scale);
+    unsigned int round;
+    int i, out, c = exp2tab[s & 3];
+
+    s = offset - (s >> 2);
+
+    if (s > 31) {
+        for (i=0; i<len; i++) {
+            dst[i] = 0;
+        }
+    } else if (s > 0) {
+        round = 1 << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)(((int64_t)src[i] * c) >> 32);
+            dst[i] = ((int)(out+round) >> s) * ssign;
+        }
+    } else if (s > -32) {
+        s = s + 32;
+        round = 1U << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)((int64_t)((int64_t)src[i] * c + round) >> s);
+            dst[i] = out * (unsigned)ssign;
+        }
+    } else {
+        av_log(log_context, AV_LOG_ERROR, "Overflow in subband_scale()\n");
+    }
+}
+
+static void noise_scale(int *coefs, int scale, int band_energy, int len)
+{
+    int ssign = scale < 0 ? -1 : 1;
+    int s = FFABS(scale);
+    unsigned int round;
+    int i, out, c = exp2tab[s & 3];
+    int nlz = 0;
+
+    while (band_energy > 0x7fff) {
+        band_energy >>= 1;
+        nlz++;
+    }
+    c /= band_energy;
+    s = 21 + nlz - (s >> 2);
+
+    if (s > 31) {
+        for (i=0; i<len; i++) {
+            coefs[i] = 0;
+        }
+    } else if (s >= 0) {
+        round = s ? 1 << (s-1) : 0;
+        for (i=0; i<len; i++) {
+            out = (int)(((int64_t)coefs[i] * c) >> 32);
+            coefs[i] = ((int)(out+round) >> s) * ssign;
+        }
+    }
+    else {
+        s = s + 32;
+        round = 1 << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)((int64_t)((int64_t)coefs[i] * c + round) >> s);
+            coefs[i] = out * ssign;
+        }
+    }
+}
+
+static av_always_inline SoftFloat flt16_round(SoftFloat pf)
+{
+    SoftFloat tmp;
+    int s;
+
+    tmp.exp = pf.exp;
+    s = pf.mant >> 31;
+    tmp.mant = (pf.mant ^ s) - s;
+    tmp.mant = (tmp.mant + 0x00200000U) & 0xFFC00000U;
+    tmp.mant = (tmp.mant ^ s) - s;
+
+    return tmp;
+}
+
+static av_always_inline SoftFloat flt16_even(SoftFloat pf)
+{
+    SoftFloat tmp;
+    int s;
+
+    tmp.exp = pf.exp;
+    s = pf.mant >> 31;
+    tmp.mant = (pf.mant ^ s) - s;
+    tmp.mant = (tmp.mant + 0x001FFFFFU + (tmp.mant & 0x00400000U >> 16)) & 0xFFC00000U;
+    tmp.mant = (tmp.mant ^ s) - s;
+
+    return tmp;
+}
+
+static av_always_inline SoftFloat flt16_trunc(SoftFloat pf)
+{
+    SoftFloat pun;
+    int s;
+
+    pun.exp = pf.exp;
+    s = pf.mant >> 31;
+    pun.mant = (pf.mant ^ s) - s;
+    pun.mant = pun.mant & 0xFFC00000U;
+    pun.mant = (pun.mant ^ s) - s;
+
+    return pun;
+}
+
+static av_always_inline void predict(PredictorState *ps, int *coef,
+                                     int output_enable)
+{
+    const SoftFloat a     = { 1023410176, 0 };  // 61.0 / 64
+    const SoftFloat alpha = {  973078528, 0 };  // 29.0 / 32
+    SoftFloat e0, e1;
+    SoftFloat pv;
+    SoftFloat k1, k2;
+    SoftFloat   r0 = ps->r0,     r1 = ps->r1;
+    SoftFloat cor0 = ps->cor0, cor1 = ps->cor1;
+    SoftFloat var0 = ps->var0, var1 = ps->var1;
+    SoftFloat tmp;
+
+    if (var0.exp > 1 || (var0.exp == 1 && var0.mant > 0x20000000)) {
+        k1 = av_mul_sf(cor0, flt16_even(av_div_sf(a, var0)));
+    }
+    else {
+        k1.mant = 0;
+        k1.exp = 0;
+    }
+
+    if (var1.exp > 1 || (var1.exp == 1 && var1.mant > 0x20000000)) {
+        k2 = av_mul_sf(cor1, flt16_even(av_div_sf(a, var1)));
+    }
+    else {
+        k2.mant = 0;
+        k2.exp = 0;
+    }
+
+    tmp = av_mul_sf(k1, r0);
+    pv = flt16_round(av_add_sf(tmp, av_mul_sf(k2, r1)));
+    if (output_enable) {
+        int shift = 28 - pv.exp;
+
+        if (shift < 31) {
+            if (shift > 0) {
+                *coef += (unsigned)((pv.mant + (1 << (shift - 1))) >> shift);
+            } else
+                *coef += (unsigned)pv.mant << -shift;
+        }
+    }
+
+    e0 = av_int2sf(*coef, 2);
+    e1 = av_sub_sf(e0, tmp);
+
+    ps->cor1 = flt16_trunc(av_add_sf(av_mul_sf(alpha, cor1), av_mul_sf(r1, e1)));
+    tmp = av_add_sf(av_mul_sf(r1, r1), av_mul_sf(e1, e1));
+    tmp.exp--;
+    ps->var1 = flt16_trunc(av_add_sf(av_mul_sf(alpha, var1), tmp));
+    ps->cor0 = flt16_trunc(av_add_sf(av_mul_sf(alpha, cor0), av_mul_sf(r0, e0)));
+    tmp = av_add_sf(av_mul_sf(r0, r0), av_mul_sf(e0, e0));
+    tmp.exp--;
+    ps->var0 = flt16_trunc(av_add_sf(av_mul_sf(alpha, var0), tmp));
+
+    ps->r1 = flt16_trunc(av_mul_sf(a, av_sub_sf(r0, av_mul_sf(k1, e0))));
+    ps->r0 = flt16_trunc(av_mul_sf(a, e0));
+}
+
+
+static const int cce_scale_fixed[8] = {
+    Q30(1.0),          //2^(0/8)
+    Q30(1.0905077327), //2^(1/8)
+    Q30(1.1892071150), //2^(2/8)
+    Q30(1.2968395547), //2^(3/8)
+    Q30(1.4142135624), //2^(4/8)
+    Q30(1.5422108254), //2^(5/8)
+    Q30(1.6817928305), //2^(6/8)
+    Q30(1.8340080864), //2^(7/8)
+};
+
+/**
+ * Apply dependent channel coupling (applied before IMDCT).
+ *
+ * @param   index   index into coupling gain array
+ */
+static void apply_dependent_coupling_fixed(AACContext *ac,
+                                     SingleChannelElement *target,
+                                     ChannelElement *cce, int index)
+{
+    IndividualChannelStream *ics = &cce->ch[0].ics;
+    const uint16_t *offsets = ics->swb_offset;
+    int *dest = target->coeffs;
+    const int *src = cce->ch[0].coeffs;
+    int g, i, group, k, idx = 0;
+    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Dependent coupling is not supported together with LTP\n");
+        return;
+    }
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            if (cce->ch[0].band_type[idx] != ZERO_BT) {
+                const int gain = cce->coup.gain[index][idx];
+                int shift, round, c, tmp;
+
+                if (gain < 0) {
+                    c = -cce_scale_fixed[-gain & 7];
+                    shift = (-gain-1024) >> 3;
+                }
+                else {
+                    c = cce_scale_fixed[gain & 7];
+                    shift = (gain-1024) >> 3;
+                }
+
+                if (shift < -31) {
+                    // Nothing to do
+                } else if (shift < 0) {
+                    shift = -shift;
+                    round = 1 << (shift - 1);
+
+                    for (group = 0; group < ics->group_len[g]; group++) {
+                        for (k = offsets[i]; k < offsets[i + 1]; k++) {
+                            tmp = (int)(((int64_t)src[group * 128 + k] * c + \
+                                       (int64_t)0x1000000000) >> 37);
+                            dest[group * 128 + k] += (tmp + (int64_t)round) >> shift;
+                        }
+                    }
+                }
+                else {
+                    for (group = 0; group < ics->group_len[g]; group++) {
+                        for (k = offsets[i]; k < offsets[i + 1]; k++) {
+                            tmp = (int)(((int64_t)src[group * 128 + k] * c + \
+                                        (int64_t)0x1000000000) >> 37);
+                            dest[group * 128 + k] += tmp * (1U << shift);
+                        }
+                    }
+                }
+            }
+        }
+        dest += ics->group_len[g] * 128;
+        src  += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * Apply independent channel coupling (applied after IMDCT).
+ *
+ * @param   index   index into coupling gain array
+ */
+static void apply_independent_coupling_fixed(AACContext *ac,
+                                       SingleChannelElement *target,
+                                       ChannelElement *cce, int index)
+{
+    int i, c, shift, round, tmp;
+    const int gain = cce->coup.gain[index][0];
+    const int *src = cce->ch[0].ret;
+    unsigned int *dest = target->ret;
+    const int len = 1024 << (ac->oc[1].m4ac.sbr == 1);
+
+    c = cce_scale_fixed[gain & 7];
+    shift = (gain-1024) >> 3;
+    if (shift < -31) {
+        return;
+    } else if (shift < 0) {
+        shift = -shift;
+        round = 1 << (shift - 1);
+
+        for (i = 0; i < len; i++) {
+            tmp = (int)(((int64_t)src[i] * c + (int64_t)0x1000000000) >> 37);
+            dest[i] += (tmp + round) >> shift;
+        }
+    }
+    else {
+      for (i = 0; i < len; i++) {
+          tmp = (int)(((int64_t)src[i] * c + (int64_t)0x1000000000) >> 37);
+          dest[i] += tmp * (1U << shift);
+      }
+    }
+}
+
+#include "aacdec_template.c"
+
+AVCodec ff_aac_fixed_decoder = {
+    .name            = "aac_fixed",
+    .long_name       = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
+    .type            = AVMEDIA_TYPE_AUDIO,
+    .id              = AV_CODEC_ID_AAC,
+    .priv_data_size  = sizeof(AACContext),
+    .init            = aac_decode_init,
+    .close           = aac_decode_close,
+    .decode          = aac_decode_frame,
+    .sample_fmts     = (const enum AVSampleFormat[]) {
+        AV_SAMPLE_FMT_S32P, AV_SAMPLE_FMT_NONE
+    },
+    .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
+    .caps_internal   = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts = aac_channel_layout,
+    .profiles        = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
+    .flush = flush,
+};
diff --git a/libavcodec/aacdec_template.c b/libavcodec/aacdec_template.c
new file mode 100644
index 0000000..721511c
--- /dev/null
+++ b/libavcodec/aacdec_template.c
@@ -0,0 +1,3448 @@
+/*
+ * AAC decoder
+ * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
+ * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
+ * Copyright (c) 2008-2013 Alex Converse <alex.converse@gmail.com>
+ *
+ * AAC LATM decoder
+ * Copyright (c) 2008-2010 Paul Kendall <paul@kcbbs.gen.nz>
+ * Copyright (c) 2010      Janne Grunau <janne-libav@jannau.net>
+ *
+ * AAC decoder fixed-point implementation
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC decoder
+ * @author Oded Shimon  ( ods15 ods15 dyndns org )
+ * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * AAC decoder fixed-point implementation
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ * @author Nedeljko Babic ( nedeljko.babic imgtec com )
+ */
+
+/*
+ * supported tools
+ *
+ * Support?                     Name
+ * N (code in SoC repo)         gain control
+ * Y                            block switching
+ * Y                            window shapes - standard
+ * N                            window shapes - Low Delay
+ * Y                            filterbank - standard
+ * N (code in SoC repo)         filterbank - Scalable Sample Rate
+ * Y                            Temporal Noise Shaping
+ * Y                            Long Term Prediction
+ * Y                            intensity stereo
+ * Y                            channel coupling
+ * Y                            frequency domain prediction
+ * Y                            Perceptual Noise Substitution
+ * Y                            Mid/Side stereo
+ * N                            Scalable Inverse AAC Quantization
+ * N                            Frequency Selective Switch
+ * N                            upsampling filter
+ * Y                            quantization & coding - AAC
+ * N                            quantization & coding - TwinVQ
+ * N                            quantization & coding - BSAC
+ * N                            AAC Error Resilience tools
+ * N                            Error Resilience payload syntax
+ * N                            Error Protection tool
+ * N                            CELP
+ * N                            Silence Compression
+ * N                            HVXC
+ * N                            HVXC 4kbits/s VR
+ * N                            Structured Audio tools
+ * N                            Structured Audio Sample Bank Format
+ * N                            MIDI
+ * N                            Harmonic and Individual Lines plus Noise
+ * N                            Text-To-Speech Interface
+ * Y                            Spectral Band Replication
+ * Y (not in this code)         Layer-1
+ * Y (not in this code)         Layer-2
+ * Y (not in this code)         Layer-3
+ * N                            SinuSoidal Coding (Transient, Sinusoid, Noise)
+ * Y                            Parametric Stereo
+ * N                            Direct Stream Transfer
+ * Y  (not in fixed point code) Enhanced AAC Low Delay (ER AAC ELD)
+ *
+ * Note: - HE AAC v1 comprises LC AAC with Spectral Band Replication.
+ *       - HE AAC v2 comprises LC AAC with Spectral Band Replication and
+           Parametric Stereo.
+ */
+
+#include "libavutil/thread.h"
+
+static VLC vlc_scalefactors;
+static VLC vlc_spectral[11];
+
+static int output_configure(AACContext *ac,
+                            uint8_t layout_map[MAX_ELEM_ID*4][3], int tags,
+                            enum OCStatus oc_type, int get_new_frame);
+
+#define overread_err "Input buffer exhausted before END element found\n"
+
+static int count_channels(uint8_t (*layout)[3], int tags)
+{
+    int i, sum = 0;
+    for (i = 0; i < tags; i++) {
+        int syn_ele = layout[i][0];
+        int pos     = layout[i][2];
+        sum += (1 + (syn_ele == TYPE_CPE)) *
+               (pos != AAC_CHANNEL_OFF && pos != AAC_CHANNEL_CC);
+    }
+    return sum;
+}
+
+/**
+ * Check for the channel element in the current channel position configuration.
+ * If it exists, make sure the appropriate element is allocated and map the
+ * channel order to match the internal FFmpeg channel layout.
+ *
+ * @param   che_pos current channel position configuration
+ * @param   type channel element type
+ * @param   id channel element id
+ * @param   channels count of the number of channels in the configuration
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static av_cold int che_configure(AACContext *ac,
+                                 enum ChannelPosition che_pos,
+                                 int type, int id, int *channels)
+{
+    if (*channels >= MAX_CHANNELS)
+        return AVERROR_INVALIDDATA;
+    if (che_pos) {
+        if (!ac->che[type][id]) {
+            if (!(ac->che[type][id] = av_mallocz(sizeof(ChannelElement))))
+                return AVERROR(ENOMEM);
+            AAC_RENAME(ff_aac_sbr_ctx_init)(ac, &ac->che[type][id]->sbr, type);
+        }
+        if (type != TYPE_CCE) {
+            if (*channels >= MAX_CHANNELS - (type == TYPE_CPE || (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1))) {
+                av_log(ac->avctx, AV_LOG_ERROR, "Too many channels\n");
+                return AVERROR_INVALIDDATA;
+            }
+            ac->output_element[(*channels)++] = &ac->che[type][id]->ch[0];
+            if (type == TYPE_CPE ||
+                (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1)) {
+                ac->output_element[(*channels)++] = &ac->che[type][id]->ch[1];
+            }
+        }
+    } else {
+        if (ac->che[type][id])
+            AAC_RENAME(ff_aac_sbr_ctx_close)(&ac->che[type][id]->sbr);
+        av_freep(&ac->che[type][id]);
+    }
+    return 0;
+}
+
+static int frame_configure_elements(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int type, id, ch, ret;
+
+    /* set channel pointers to internal buffers by default */
+    for (type = 0; type < 4; type++) {
+        for (id = 0; id < MAX_ELEM_ID; id++) {
+            ChannelElement *che = ac->che[type][id];
+            if (che) {
+                che->ch[0].ret = che->ch[0].ret_buf;
+                che->ch[1].ret = che->ch[1].ret_buf;
+            }
+        }
+    }
+
+    /* get output buffer */
+    av_frame_unref(ac->frame);
+    if (!avctx->channels)
+        return 1;
+
+    ac->frame->nb_samples = 2048;
+    if ((ret = ff_get_buffer(avctx, ac->frame, 0)) < 0)
+        return ret;
+
+    /* map output channel pointers to AVFrame data */
+    for (ch = 0; ch < avctx->channels; ch++) {
+        if (ac->output_element[ch])
+            ac->output_element[ch]->ret = (INTFLOAT *)ac->frame->extended_data[ch];
+    }
+
+    return 0;
+}
+
+struct elem_to_channel {
+    uint64_t av_position;
+    uint8_t syn_ele;
+    uint8_t elem_id;
+    uint8_t aac_position;
+};
+
+static int assign_pair(struct elem_to_channel e2c_vec[MAX_ELEM_ID],
+                       uint8_t (*layout_map)[3], int offset, uint64_t left,
+                       uint64_t right, int pos)
+{
+    if (layout_map[offset][0] == TYPE_CPE) {
+        e2c_vec[offset] = (struct elem_to_channel) {
+            .av_position  = left | right,
+            .syn_ele      = TYPE_CPE,
+            .elem_id      = layout_map[offset][1],
+            .aac_position = pos
+        };
+        return 1;
+    } else {
+        e2c_vec[offset] = (struct elem_to_channel) {
+            .av_position  = left,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[offset][1],
+            .aac_position = pos
+        };
+        e2c_vec[offset + 1] = (struct elem_to_channel) {
+            .av_position  = right,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[offset + 1][1],
+            .aac_position = pos
+        };
+        return 2;
+    }
+}
+
+static int count_paired_channels(uint8_t (*layout_map)[3], int tags, int pos,
+                                 int *current)
+{
+    int num_pos_channels = 0;
+    int first_cpe        = 0;
+    int sce_parity       = 0;
+    int i;
+    for (i = *current; i < tags; i++) {
+        if (layout_map[i][2] != pos)
+            break;
+        if (layout_map[i][0] == TYPE_CPE) {
+            if (sce_parity) {
+                if (pos == AAC_CHANNEL_FRONT && !first_cpe) {
+                    sce_parity = 0;
+                } else {
+                    return -1;
+                }
+            }
+            num_pos_channels += 2;
+            first_cpe         = 1;
+        } else {
+            num_pos_channels++;
+            sce_parity ^= 1;
+        }
+    }
+    if (sce_parity &&
+        ((pos == AAC_CHANNEL_FRONT && first_cpe) || pos == AAC_CHANNEL_SIDE))
+        return -1;
+    *current = i;
+    return num_pos_channels;
+}
+
+static uint64_t sniff_channel_order(uint8_t (*layout_map)[3], int tags)
+{
+    int i, n, total_non_cc_elements;
+    struct elem_to_channel e2c_vec[4 * MAX_ELEM_ID] = { { 0 } };
+    int num_front_channels, num_side_channels, num_back_channels;
+    uint64_t layout;
+
+    if (FF_ARRAY_ELEMS(e2c_vec) < tags)
+        return 0;
+
+    i = 0;
+    num_front_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_FRONT, &i);
+    if (num_front_channels < 0)
+        return 0;
+    num_side_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_SIDE, &i);
+    if (num_side_channels < 0)
+        return 0;
+    num_back_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_BACK, &i);
+    if (num_back_channels < 0)
+        return 0;
+
+    if (num_side_channels == 0 && num_back_channels >= 4) {
+        num_side_channels = 2;
+        num_back_channels -= 2;
+    }
+
+    i = 0;
+    if (num_front_channels & 1) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_FRONT_CENTER,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_FRONT
+        };
+        i++;
+        num_front_channels--;
+    }
+    if (num_front_channels >= 4) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_FRONT_LEFT_OF_CENTER,
+                         AV_CH_FRONT_RIGHT_OF_CENTER,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+    if (num_front_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_FRONT_LEFT,
+                         AV_CH_FRONT_RIGHT,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+    while (num_front_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+
+    if (num_side_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_SIDE_LEFT,
+                         AV_CH_SIDE_RIGHT,
+                         AAC_CHANNEL_FRONT);
+        num_side_channels -= 2;
+    }
+    while (num_side_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_SIDE);
+        num_side_channels -= 2;
+    }
+
+    while (num_back_channels >= 4) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_BACK);
+        num_back_channels -= 2;
+    }
+    if (num_back_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_BACK_LEFT,
+                         AV_CH_BACK_RIGHT,
+                         AAC_CHANNEL_BACK);
+        num_back_channels -= 2;
+    }
+    if (num_back_channels) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_BACK_CENTER,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_BACK
+        };
+        i++;
+        num_back_channels--;
+    }
+
+    if (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_LOW_FREQUENCY,
+            .syn_ele      = TYPE_LFE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_LFE
+        };
+        i++;
+    }
+    while (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = UINT64_MAX,
+            .syn_ele      = TYPE_LFE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_LFE
+        };
+        i++;
+    }
+
+    // Must choose a stable sort
+    total_non_cc_elements = n = i;
+    do {
+        int next_n = 0;
+        for (i = 1; i < n; i++)
+            if (e2c_vec[i - 1].av_position > e2c_vec[i].av_position) {
+                FFSWAP(struct elem_to_channel, e2c_vec[i - 1], e2c_vec[i]);
+                next_n = i;
+            }
+        n = next_n;
+    } while (n > 0);
+
+    layout = 0;
+    for (i = 0; i < total_non_cc_elements; i++) {
+        layout_map[i][0] = e2c_vec[i].syn_ele;
+        layout_map[i][1] = e2c_vec[i].elem_id;
+        layout_map[i][2] = e2c_vec[i].aac_position;
+        if (e2c_vec[i].av_position != UINT64_MAX) {
+            layout |= e2c_vec[i].av_position;
+        }
+    }
+
+    return layout;
+}
+
+/**
+ * Save current output configuration if and only if it has been locked.
+ */
+static int push_output_configuration(AACContext *ac) {
+    int pushed = 0;
+
+    if (ac->oc[1].status == OC_LOCKED || ac->oc[0].status == OC_NONE) {
+        ac->oc[0] = ac->oc[1];
+        pushed = 1;
+    }
+    ac->oc[1].status = OC_NONE;
+    return pushed;
+}
+
+/**
+ * Restore the previous output configuration if and only if the current
+ * configuration is unlocked.
+ */
+static void pop_output_configuration(AACContext *ac) {
+    if (ac->oc[1].status != OC_LOCKED && ac->oc[0].status != OC_NONE) {
+        ac->oc[1] = ac->oc[0];
+        ac->avctx->channels = ac->oc[1].channels;
+        ac->avctx->channel_layout = ac->oc[1].channel_layout;
+        output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
+                         ac->oc[1].status, 0);
+    }
+}
+
+/**
+ * Configure output channel order based on the current program
+ * configuration element.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int output_configure(AACContext *ac,
+                            uint8_t layout_map[MAX_ELEM_ID * 4][3], int tags,
+                            enum OCStatus oc_type, int get_new_frame)
+{
+    AVCodecContext *avctx = ac->avctx;
+    int i, channels = 0, ret;
+    uint64_t layout = 0;
+    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
+    uint8_t type_counts[TYPE_END] = { 0 };
+
+    if (ac->oc[1].layout_map != layout_map) {
+        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
+        ac->oc[1].layout_map_tags = tags;
+    }
+    for (i = 0; i < tags; i++) {
+        int type =         layout_map[i][0];
+        int id =           layout_map[i][1];
+        id_map[type][id] = type_counts[type]++;
+        if (id_map[type][id] >= MAX_ELEM_ID) {
+            avpriv_request_sample(ac->avctx, "Too large remapped id");
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+    // Try to sniff a reasonable channel order, otherwise output the
+    // channels in the order the PCE declared them.
+    if (avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE)
+        layout = sniff_channel_order(layout_map, tags);
+    for (i = 0; i < tags; i++) {
+        int type =     layout_map[i][0];
+        int id =       layout_map[i][1];
+        int iid =      id_map[type][id];
+        int position = layout_map[i][2];
+        // Allocate or free elements depending on if they are in the
+        // current program configuration.
+        ret = che_configure(ac, position, type, iid, &channels);
+        if (ret < 0)
+            return ret;
+        ac->tag_che_map[type][id] = ac->che[type][iid];
+    }
+    if (ac->oc[1].m4ac.ps == 1 && channels == 2) {
+        if (layout == AV_CH_FRONT_CENTER) {
+            layout = AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT;
+        } else {
+            layout = 0;
+        }
+    }
+
+    if (layout) avctx->channel_layout = layout;
+                            ac->oc[1].channel_layout = layout;
+    avctx->channels       = ac->oc[1].channels       = channels;
+    ac->oc[1].status = oc_type;
+
+    if (get_new_frame) {
+        if ((ret = frame_configure_elements(ac->avctx)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+static void flush(AVCodecContext *avctx)
+{
+    AACContext *ac= avctx->priv_data;
+    int type, i, j;
+
+    for (type = 3; type >= 0; type--) {
+        for (i = 0; i < MAX_ELEM_ID; i++) {
+            ChannelElement *che = ac->che[type][i];
+            if (che) {
+                for (j = 0; j <= 1; j++) {
+                    memset(che->ch[j].saved, 0, sizeof(che->ch[j].saved));
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Set up channel positions based on a default channel configuration
+ * as specified in table 1.17.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int set_default_channel_config(AVCodecContext *avctx,
+                                      uint8_t (*layout_map)[3],
+                                      int *tags,
+                                      int channel_config)
+{
+    if (channel_config < 1 || (channel_config > 7 && channel_config < 11) ||
+        channel_config > 12) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid default channel configuration (%d)\n",
+               channel_config);
+        return AVERROR_INVALIDDATA;
+    }
+    *tags = tags_per_config[channel_config];
+    memcpy(layout_map, aac_channel_layout_map[channel_config - 1],
+           *tags * sizeof(*layout_map));
+
+    /*
+     * AAC specification has 7.1(wide) as a default layout for 8-channel streams.
+     * However, at least Nero AAC encoder encodes 7.1 streams using the default
+     * channel config 7, mapping the side channels of the original audio stream
+     * to the second AAC_CHANNEL_FRONT pair in the AAC stream. Similarly, e.g. FAAD
+     * decodes the second AAC_CHANNEL_FRONT pair as side channels, therefore decoding
+     * the incorrect streams as if they were correct (and as the encoder intended).
+     *
+     * As actual intended 7.1(wide) streams are very rare, default to assuming a
+     * 7.1 layout was intended.
+     */
+    if (channel_config == 7 && avctx->strict_std_compliance < FF_COMPLIANCE_STRICT) {
+        av_log(avctx, AV_LOG_INFO, "Assuming an incorrectly encoded 7.1 channel layout"
+               " instead of a spec-compliant 7.1(wide) layout, use -strict %d to decode"
+               " according to the specification instead.\n", FF_COMPLIANCE_STRICT);
+        layout_map[2][2] = AAC_CHANNEL_SIDE;
+    }
+
+    return 0;
+}
+
+static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
+{
+    /* For PCE based channel configurations map the channels solely based
+     * on tags. */
+    if (!ac->oc[1].m4ac.chan_config) {
+        return ac->tag_che_map[type][elem_id];
+    }
+    // Allow single CPE stereo files to be signalled with mono configuration.
+    if (!ac->tags_mapped && type == TYPE_CPE &&
+        ac->oc[1].m4ac.chan_config == 1) {
+        uint8_t layout_map[MAX_ELEM_ID*4][3];
+        int layout_map_tags;
+        push_output_configuration(ac);
+
+        av_log(ac->avctx, AV_LOG_DEBUG, "mono with CPE\n");
+
+        if (set_default_channel_config(ac->avctx, layout_map,
+                                       &layout_map_tags, 2) < 0)
+            return NULL;
+        if (output_configure(ac, layout_map, layout_map_tags,
+                             OC_TRIAL_FRAME, 1) < 0)
+            return NULL;
+
+        ac->oc[1].m4ac.chan_config = 2;
+        ac->oc[1].m4ac.ps = 0;
+    }
+    // And vice-versa
+    if (!ac->tags_mapped && type == TYPE_SCE &&
+        ac->oc[1].m4ac.chan_config == 2) {
+        uint8_t layout_map[MAX_ELEM_ID * 4][3];
+        int layout_map_tags;
+        push_output_configuration(ac);
+
+        av_log(ac->avctx, AV_LOG_DEBUG, "stereo with SCE\n");
+
+        if (set_default_channel_config(ac->avctx, layout_map,
+                                       &layout_map_tags, 1) < 0)
+            return NULL;
+        if (output_configure(ac, layout_map, layout_map_tags,
+                             OC_TRIAL_FRAME, 1) < 0)
+            return NULL;
+
+        ac->oc[1].m4ac.chan_config = 1;
+        if (ac->oc[1].m4ac.sbr)
+            ac->oc[1].m4ac.ps = -1;
+    }
+    /* For indexed channel configurations map the channels solely based
+     * on position. */
+    switch (ac->oc[1].m4ac.chan_config) {
+    case 12:
+    case 7:
+        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
+        }
+    case 11:
+        if (ac->tags_mapped == 2 &&
+            ac->oc[1].m4ac.chan_config == 11 &&
+            type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
+        }
+    case 6:
+        /* Some streams incorrectly code 5.1 audio as
+         * SCE[0] CPE[0] CPE[1] SCE[1]
+         * instead of
+         * SCE[0] CPE[0] CPE[1] LFE[0].
+         * If we seem to have encountered such a stream, transfer
+         * the LFE[0] element to the SCE[1]'s mapping */
+        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
+            if (!ac->warned_remapping_once && (type != TYPE_LFE || elem_id != 0)) {
+                av_log(ac->avctx, AV_LOG_WARNING,
+                   "This stream seems to incorrectly report its last channel as %s[%d], mapping to LFE[0]\n",
+                   type == TYPE_SCE ? "SCE" : "LFE", elem_id);
+                ac->warned_remapping_once++;
+            }
+            ac->tags_mapped++;
+            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
+        }
+    case 5:
+        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
+        }
+    case 4:
+        /* Some streams incorrectly code 4.0 audio as
+         * SCE[0] CPE[0] LFE[0]
+         * instead of
+         * SCE[0] CPE[0] SCE[1].
+         * If we seem to have encountered such a stream, transfer
+         * the SCE[1] element to the LFE[0]'s mapping */
+        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
+            if (!ac->warned_remapping_once && (type != TYPE_SCE || elem_id != 1)) {
+                av_log(ac->avctx, AV_LOG_WARNING,
+                   "This stream seems to incorrectly report its last channel as %s[%d], mapping to SCE[1]\n",
+                   type == TYPE_SCE ? "SCE" : "LFE", elem_id);
+                ac->warned_remapping_once++;
+            }
+            ac->tags_mapped++;
+            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_SCE][1];
+        }
+        if (ac->tags_mapped == 2 &&
+            ac->oc[1].m4ac.chan_config == 4 &&
+            type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
+        }
+    case 3:
+    case 2:
+        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) &&
+            type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][0];
+        } else if (ac->oc[1].m4ac.chan_config == 2) {
+            return NULL;
+        }
+    case 1:
+        if (!ac->tags_mapped && type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
+        }
+    default:
+        return NULL;
+    }
+}
+
+/**
+ * Decode an array of 4 bit element IDs, optionally interleaved with a
+ * stereo/mono switching bit.
+ *
+ * @param type speaker type/position for these channels
+ */
+static void decode_channel_map(uint8_t layout_map[][3],
+                               enum ChannelPosition type,
+                               GetBitContext *gb, int n)
+{
+    while (n--) {
+        enum RawDataBlockType syn_ele;
+        switch (type) {
+        case AAC_CHANNEL_FRONT:
+        case AAC_CHANNEL_BACK:
+        case AAC_CHANNEL_SIDE:
+            syn_ele = get_bits1(gb);
+            break;
+        case AAC_CHANNEL_CC:
+            skip_bits1(gb);
+            syn_ele = TYPE_CCE;
+            break;
+        case AAC_CHANNEL_LFE:
+            syn_ele = TYPE_LFE;
+            break;
+        default:
+            // AAC_CHANNEL_OFF has no channel map
+            av_assert0(0);
+        }
+        layout_map[0][0] = syn_ele;
+        layout_map[0][1] = get_bits(gb, 4);
+        layout_map[0][2] = type;
+        layout_map++;
+    }
+}
+
+static inline void relative_align_get_bits(GetBitContext *gb,
+                                           int reference_position) {
+    int n = (reference_position - get_bits_count(gb) & 7);
+    if (n)
+        skip_bits(gb, n);
+}
+
+/**
+ * Decode program configuration element; reference: table 4.2.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_pce(AVCodecContext *avctx, MPEG4AudioConfig *m4ac,
+                      uint8_t (*layout_map)[3],
+                      GetBitContext *gb, int byte_align_ref)
+{
+    int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
+    int sampling_index;
+    int comment_len;
+    int tags;
+
+    skip_bits(gb, 2);  // object_type
+
+    sampling_index = get_bits(gb, 4);
+    if (m4ac->sampling_index != sampling_index)
+        av_log(avctx, AV_LOG_WARNING,
+               "Sample rate index in program config element does not "
+               "match the sample rate index configured by the container.\n");
+
+    num_front       = get_bits(gb, 4);
+    num_side        = get_bits(gb, 4);
+    num_back        = get_bits(gb, 4);
+    num_lfe         = get_bits(gb, 2);
+    num_assoc_data  = get_bits(gb, 3);
+    num_cc          = get_bits(gb, 4);
+
+    if (get_bits1(gb))
+        skip_bits(gb, 4); // mono_mixdown_tag
+    if (get_bits1(gb))
+        skip_bits(gb, 4); // stereo_mixdown_tag
+
+    if (get_bits1(gb))
+        skip_bits(gb, 3); // mixdown_coeff_index and pseudo_surround
+
+    if (get_bits_left(gb) < 5 * (num_front + num_side + num_back + num_cc) + 4 *(num_lfe + num_assoc_data + num_cc)) {
+        av_log(avctx, AV_LOG_ERROR, "decode_pce: " overread_err);
+        return -1;
+    }
+    decode_channel_map(layout_map       , AAC_CHANNEL_FRONT, gb, num_front);
+    tags = num_front;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_SIDE,  gb, num_side);
+    tags += num_side;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_BACK,  gb, num_back);
+    tags += num_back;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_LFE,   gb, num_lfe);
+    tags += num_lfe;
+
+    skip_bits_long(gb, 4 * num_assoc_data);
+
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_CC,    gb, num_cc);
+    tags += num_cc;
+
+    relative_align_get_bits(gb, byte_align_ref);
+
+    /* comment field, first byte is length */
+    comment_len = get_bits(gb, 8) * 8;
+    if (get_bits_left(gb) < comment_len) {
+        av_log(avctx, AV_LOG_ERROR, "decode_pce: " overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+    skip_bits_long(gb, comment_len);
+    return tags;
+}
+
+/**
+ * Decode GA "General Audio" specific configuration; reference: table 4.1.
+ *
+ * @param   ac          pointer to AACContext, may be null
+ * @param   avctx       pointer to AVCCodecContext, used for logging
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx,
+                                     GetBitContext *gb,
+                                     int get_bit_alignment,
+                                     MPEG4AudioConfig *m4ac,
+                                     int channel_config)
+{
+    int extension_flag, ret, ep_config, res_flags;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int tags = 0;
+
+#if USE_FIXED
+    if (get_bits1(gb)) { // frameLengthFlag
+        avpriv_report_missing_feature(avctx, "Fixed point 960/120 MDCT window");
+        return AVERROR_PATCHWELCOME;
+    }
+    m4ac->frame_length_short = 0;
+#else
+    m4ac->frame_length_short = get_bits1(gb);
+    if (m4ac->frame_length_short && m4ac->sbr == 1) {
+      avpriv_report_missing_feature(avctx, "SBR with 960 frame length");
+      if (ac) ac->warned_960_sbr = 1;
+      m4ac->sbr = 0;
+      m4ac->ps = 0;
+    }
+#endif
+
+    if (get_bits1(gb))       // dependsOnCoreCoder
+        skip_bits(gb, 14);   // coreCoderDelay
+    extension_flag = get_bits1(gb);
+
+    if (m4ac->object_type == AOT_AAC_SCALABLE ||
+        m4ac->object_type == AOT_ER_AAC_SCALABLE)
+        skip_bits(gb, 3);     // layerNr
+
+    if (channel_config == 0) {
+        skip_bits(gb, 4);  // element_instance_tag
+        tags = decode_pce(avctx, m4ac, layout_map, gb, get_bit_alignment);
+        if (tags < 0)
+            return tags;
+    } else {
+        if ((ret = set_default_channel_config(avctx, layout_map,
+                                              &tags, channel_config)))
+            return ret;
+    }
+
+    if (count_channels(layout_map, tags) > 1) {
+        m4ac->ps = 0;
+    } else if (m4ac->sbr == 1 && m4ac->ps == -1)
+        m4ac->ps = 1;
+
+    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
+        return ret;
+
+    if (extension_flag) {
+        switch (m4ac->object_type) {
+        case AOT_ER_BSAC:
+            skip_bits(gb, 5);    // numOfSubFrame
+            skip_bits(gb, 11);   // layer_length
+            break;
+        case AOT_ER_AAC_LC:
+        case AOT_ER_AAC_LTP:
+        case AOT_ER_AAC_SCALABLE:
+        case AOT_ER_AAC_LD:
+            res_flags = get_bits(gb, 3);
+            if (res_flags) {
+                avpriv_report_missing_feature(avctx,
+                                              "AAC data resilience (flags %x)",
+                                              res_flags);
+                return AVERROR_PATCHWELCOME;
+            }
+            break;
+        }
+        skip_bits1(gb);    // extensionFlag3 (TBD in version 3)
+    }
+    switch (m4ac->object_type) {
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LTP:
+    case AOT_ER_AAC_SCALABLE:
+    case AOT_ER_AAC_LD:
+        ep_config = get_bits(gb, 2);
+        if (ep_config) {
+            avpriv_report_missing_feature(avctx,
+                                          "epConfig %d", ep_config);
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+    return 0;
+}
+
+static int decode_eld_specific_config(AACContext *ac, AVCodecContext *avctx,
+                                     GetBitContext *gb,
+                                     MPEG4AudioConfig *m4ac,
+                                     int channel_config)
+{
+    int ret, ep_config, res_flags;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int tags = 0;
+    const int ELDEXT_TERM = 0;
+
+    m4ac->ps  = 0;
+    m4ac->sbr = 0;
+#if USE_FIXED
+    if (get_bits1(gb)) { // frameLengthFlag
+        avpriv_request_sample(avctx, "960/120 MDCT window");
+        return AVERROR_PATCHWELCOME;
+    }
+#else
+    m4ac->frame_length_short = get_bits1(gb);
+#endif
+    res_flags = get_bits(gb, 3);
+    if (res_flags) {
+        avpriv_report_missing_feature(avctx,
+                                      "AAC data resilience (flags %x)",
+                                      res_flags);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (get_bits1(gb)) { // ldSbrPresentFlag
+        avpriv_report_missing_feature(avctx,
+                                      "Low Delay SBR");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    while (get_bits(gb, 4) != ELDEXT_TERM) {
+        int len = get_bits(gb, 4);
+        if (len == 15)
+            len += get_bits(gb, 8);
+        if (len == 15 + 255)
+            len += get_bits(gb, 16);
+        if (get_bits_left(gb) < len * 8 + 4) {
+            av_log(avctx, AV_LOG_ERROR, overread_err);
+            return AVERROR_INVALIDDATA;
+        }
+        skip_bits_long(gb, 8 * len);
+    }
+
+    if ((ret = set_default_channel_config(avctx, layout_map,
+                                          &tags, channel_config)))
+        return ret;
+
+    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
+        return ret;
+
+    ep_config = get_bits(gb, 2);
+    if (ep_config) {
+        avpriv_report_missing_feature(avctx,
+                                      "epConfig %d", ep_config);
+        return AVERROR_PATCHWELCOME;
+    }
+    return 0;
+}
+
+/**
+ * Decode audio specific configuration; reference: table 1.13.
+ *
+ * @param   ac          pointer to AACContext, may be null
+ * @param   avctx       pointer to AVCCodecContext, used for logging
+ * @param   m4ac        pointer to MPEG4AudioConfig, used for parsing
+ * @param   gb          buffer holding an audio specific config
+ * @param   get_bit_alignment relative alignment for byte align operations
+ * @param   sync_extension look for an appended sync extension
+ *
+ * @return  Returns error status or number of consumed bits. <0 - error
+ */
+static int decode_audio_specific_config_gb(AACContext *ac,
+                                           AVCodecContext *avctx,
+                                           MPEG4AudioConfig *m4ac,
+                                           GetBitContext *gb,
+                                           int get_bit_alignment,
+                                           int sync_extension)
+{
+    int i, ret;
+    GetBitContext gbc = *gb;
+
+    if ((i = ff_mpeg4audio_get_config_gb(m4ac, &gbc, sync_extension)) < 0)
+        return AVERROR_INVALIDDATA;
+
+    if (m4ac->sampling_index > 12) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid sampling rate index %d\n",
+               m4ac->sampling_index);
+        return AVERROR_INVALIDDATA;
+    }
+    if (m4ac->object_type == AOT_ER_AAC_LD &&
+        (m4ac->sampling_index < 3 || m4ac->sampling_index > 7)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid low delay sampling rate index %d\n",
+               m4ac->sampling_index);
+        return AVERROR_INVALIDDATA;
+    }
+
+    skip_bits_long(gb, i);
+
+    switch (m4ac->object_type) {
+    case AOT_AAC_MAIN:
+    case AOT_AAC_LC:
+    case AOT_AAC_SSR:
+    case AOT_AAC_LTP:
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LD:
+        if ((ret = decode_ga_specific_config(ac, avctx, gb, get_bit_alignment,
+                                            m4ac, m4ac->chan_config)) < 0)
+            return ret;
+        break;
+    case AOT_ER_AAC_ELD:
+        if ((ret = decode_eld_specific_config(ac, avctx, gb,
+                                              m4ac, m4ac->chan_config)) < 0)
+            return ret;
+        break;
+    default:
+        avpriv_report_missing_feature(avctx,
+                                      "Audio object type %s%d",
+                                      m4ac->sbr == 1 ? "SBR+" : "",
+                                      m4ac->object_type);
+        return AVERROR(ENOSYS);
+    }
+
+    ff_dlog(avctx,
+            "AOT %d chan config %d sampling index %d (%d) SBR %d PS %d\n",
+            m4ac->object_type, m4ac->chan_config, m4ac->sampling_index,
+            m4ac->sample_rate, m4ac->sbr,
+            m4ac->ps);
+
+    return get_bits_count(gb);
+}
+
+static int decode_audio_specific_config(AACContext *ac,
+                                        AVCodecContext *avctx,
+                                        MPEG4AudioConfig *m4ac,
+                                        const uint8_t *data, int64_t bit_size,
+                                        int sync_extension)
+{
+    int i, ret;
+    GetBitContext gb;
+
+    if (bit_size < 0 || bit_size > INT_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "Audio specific config size is invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ff_dlog(avctx, "audio specific config size %d\n", (int)bit_size >> 3);
+    for (i = 0; i < bit_size >> 3; i++)
+        ff_dlog(avctx, "%02x ", data[i]);
+    ff_dlog(avctx, "\n");
+
+    if ((ret = init_get_bits(&gb, data, bit_size)) < 0)
+        return ret;
+
+    return decode_audio_specific_config_gb(ac, avctx, m4ac, &gb, 0,
+                                           sync_extension);
+}
+
+/**
+ * linear congruential pseudorandom number generator
+ *
+ * @param   previous_val    pointer to the current state of the generator
+ *
+ * @return  Returns a 32-bit pseudorandom integer
+ */
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+static void reset_all_predictors(PredictorState *ps)
+{
+    int i;
+    for (i = 0; i < MAX_PREDICTORS; i++)
+        reset_predict_state(&ps[i]);
+}
+
+static int sample_rate_idx (int rate)
+{
+         if (92017 <= rate) return 0;
+    else if (75132 <= rate) return 1;
+    else if (55426 <= rate) return 2;
+    else if (46009 <= rate) return 3;
+    else if (37566 <= rate) return 4;
+    else if (27713 <= rate) return 5;
+    else if (23004 <= rate) return 6;
+    else if (18783 <= rate) return 7;
+    else if (13856 <= rate) return 8;
+    else if (11502 <= rate) return 9;
+    else if (9391  <= rate) return 10;
+    else                    return 11;
+}
+
+static void reset_predictor_group(PredictorState *ps, int group_num)
+{
+    int i;
+    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
+        reset_predict_state(&ps[i]);
+}
+
+#define AAC_INIT_VLC_STATIC(num, size)                                     \
+    INIT_VLC_STATIC(&vlc_spectral[num], 8, ff_aac_spectral_sizes[num],     \
+         ff_aac_spectral_bits[num], sizeof(ff_aac_spectral_bits[num][0]),  \
+                                    sizeof(ff_aac_spectral_bits[num][0]),  \
+        ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), \
+                                    sizeof(ff_aac_spectral_codes[num][0]), \
+        size);
+
+static void aacdec_init(AACContext *ac);
+
+static av_cold void aac_static_table_init(void)
+{
+    AAC_INIT_VLC_STATIC( 0, 304);
+    AAC_INIT_VLC_STATIC( 1, 270);
+    AAC_INIT_VLC_STATIC( 2, 550);
+    AAC_INIT_VLC_STATIC( 3, 300);
+    AAC_INIT_VLC_STATIC( 4, 328);
+    AAC_INIT_VLC_STATIC( 5, 294);
+    AAC_INIT_VLC_STATIC( 6, 306);
+    AAC_INIT_VLC_STATIC( 7, 268);
+    AAC_INIT_VLC_STATIC( 8, 510);
+    AAC_INIT_VLC_STATIC( 9, 366);
+    AAC_INIT_VLC_STATIC(10, 462);
+
+    AAC_RENAME(ff_aac_sbr_init)();
+
+    ff_aac_tableinit();
+
+    INIT_VLC_STATIC(&vlc_scalefactors, 7,
+                    FF_ARRAY_ELEMS(ff_aac_scalefactor_code),
+                    ff_aac_scalefactor_bits,
+                    sizeof(ff_aac_scalefactor_bits[0]),
+                    sizeof(ff_aac_scalefactor_bits[0]),
+                    ff_aac_scalefactor_code,
+                    sizeof(ff_aac_scalefactor_code[0]),
+                    sizeof(ff_aac_scalefactor_code[0]),
+                    352);
+
+    // window initialization
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_long_1024), 4.0, 1024);
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_short_128), 6.0, 128);
+#if !USE_FIXED
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_long_960), 4.0, 960);
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_short_120), 6.0, 120);
+    AAC_RENAME(ff_sine_window_init)(AAC_RENAME(ff_sine_960), 960);
+    AAC_RENAME(ff_sine_window_init)(AAC_RENAME(ff_sine_120), 120);
+#endif
+    AAC_RENAME(ff_init_ff_sine_windows)(10);
+    AAC_RENAME(ff_init_ff_sine_windows)( 9);
+    AAC_RENAME(ff_init_ff_sine_windows)( 7);
+
+    AAC_RENAME(ff_cbrt_tableinit)();
+}
+
+static AVOnce aac_table_init = AV_ONCE_INIT;
+
+static av_cold int aac_decode_init(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int ret;
+
+    ret = ff_thread_once(&aac_table_init, &aac_static_table_init);
+    if (ret != 0)
+        return AVERROR_UNKNOWN;
+
+    ac->avctx = avctx;
+    ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
+
+    aacdec_init(ac);
+#if USE_FIXED
+    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+#else
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+#endif /* USE_FIXED */
+
+    if (avctx->extradata_size > 0) {
+        if ((ret = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
+                                                avctx->extradata,
+                                                avctx->extradata_size * 8LL,
+                                                1)) < 0)
+            return ret;
+    } else {
+        int sr, i;
+        uint8_t layout_map[MAX_ELEM_ID*4][3];
+        int layout_map_tags;
+
+        sr = sample_rate_idx(avctx->sample_rate);
+        ac->oc[1].m4ac.sampling_index = sr;
+        ac->oc[1].m4ac.channels = avctx->channels;
+        ac->oc[1].m4ac.sbr = -1;
+        ac->oc[1].m4ac.ps = -1;
+
+        for (i = 0; i < FF_ARRAY_ELEMS(ff_mpeg4audio_channels); i++)
+            if (ff_mpeg4audio_channels[i] == avctx->channels)
+                break;
+        if (i == FF_ARRAY_ELEMS(ff_mpeg4audio_channels)) {
+            i = 0;
+        }
+        ac->oc[1].m4ac.chan_config = i;
+
+        if (ac->oc[1].m4ac.chan_config) {
+            int ret = set_default_channel_config(avctx, layout_map,
+                &layout_map_tags, ac->oc[1].m4ac.chan_config);
+            if (!ret)
+                output_configure(ac, layout_map, layout_map_tags,
+                                 OC_GLOBAL_HDR, 0);
+            else if (avctx->err_recognition & AV_EF_EXPLODE)
+                return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Too many channels\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+#if USE_FIXED
+    ac->fdsp = avpriv_alloc_fixed_dsp(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#else
+    ac->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#endif /* USE_FIXED */
+    if (!ac->fdsp) {
+        return AVERROR(ENOMEM);
+    }
+
+    ac->random_state = 0x1f2e3d4c;
+
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct,       11, 1, 1.0 / RANGE15(1024.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_ld,    10, 1, 1.0 / RANGE15(512.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_small,  8, 1, 1.0 / RANGE15(128.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_ltp,   11, 0, RANGE15(-2.0));
+#if !USE_FIXED
+    ret = ff_mdct15_init(&ac->mdct120, 1, 3, 1.0f/(16*1024*120*2));
+    if (ret < 0)
+        return ret;
+    ret = ff_mdct15_init(&ac->mdct480, 1, 5, 1.0f/(16*1024*960));
+    if (ret < 0)
+        return ret;
+    ret = ff_mdct15_init(&ac->mdct960, 1, 6, 1.0f/(16*1024*960*2));
+    if (ret < 0)
+        return ret;
+#endif
+
+    return 0;
+}
+
+/**
+ * Skip data_stream_element; reference: table 4.10.
+ */
+static int skip_data_stream_element(AACContext *ac, GetBitContext *gb)
+{
+    int byte_align = get_bits1(gb);
+    int count = get_bits(gb, 8);
+    if (count == 255)
+        count += get_bits(gb, 8);
+    if (byte_align)
+        align_get_bits(gb);
+
+    if (get_bits_left(gb) < 8 * count) {
+        av_log(ac->avctx, AV_LOG_ERROR, "skip_data_stream_element: "overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+    skip_bits_long(gb, 8 * count);
+    return 0;
+}
+
+static int decode_prediction(AACContext *ac, IndividualChannelStream *ics,
+                             GetBitContext *gb)
+{
+    int sfb;
+    if (get_bits1(gb)) {
+        ics->predictor_reset_group = get_bits(gb, 5);
+        if (ics->predictor_reset_group == 0 ||
+            ics->predictor_reset_group > 30) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid Predictor Reset Group.\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    for (sfb = 0; sfb < FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index]); sfb++) {
+        ics->prediction_used[sfb] = get_bits1(gb);
+    }
+    return 0;
+}
+
+/**
+ * Decode Long Term Prediction data; reference: table 4.xx.
+ */
+static void decode_ltp(LongTermPrediction *ltp,
+                       GetBitContext *gb, uint8_t max_sfb)
+{
+    int sfb;
+
+    ltp->lag  = get_bits(gb, 11);
+    ltp->coef = ltp_coef[get_bits(gb, 3)];
+    for (sfb = 0; sfb < FFMIN(max_sfb, MAX_LTP_LONG_SFB); sfb++)
+        ltp->used[sfb] = get_bits1(gb);
+}
+
+/**
+ * Decode Individual Channel Stream info; reference: table 4.6.
+ */
+static int decode_ics_info(AACContext *ac, IndividualChannelStream *ics,
+                           GetBitContext *gb)
+{
+    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
+    const int aot = m4ac->object_type;
+    const int sampling_index = m4ac->sampling_index;
+    int ret_fail = AVERROR_INVALIDDATA;
+
+    if (aot != AOT_ER_AAC_ELD) {
+        if (get_bits1(gb)) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Reserved bit set.\n");
+            if (ac->avctx->err_recognition & AV_EF_BITSTREAM)
+                return AVERROR_INVALIDDATA;
+        }
+        ics->window_sequence[1] = ics->window_sequence[0];
+        ics->window_sequence[0] = get_bits(gb, 2);
+        if (aot == AOT_ER_AAC_LD &&
+            ics->window_sequence[0] != ONLY_LONG_SEQUENCE) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "AAC LD is only defined for ONLY_LONG_SEQUENCE but "
+                   "window sequence %d found.\n", ics->window_sequence[0]);
+            ics->window_sequence[0] = ONLY_LONG_SEQUENCE;
+            return AVERROR_INVALIDDATA;
+        }
+        ics->use_kb_window[1]   = ics->use_kb_window[0];
+        ics->use_kb_window[0]   = get_bits1(gb);
+    }
+    ics->num_window_groups  = 1;
+    ics->group_len[0]       = 1;
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        int i;
+        ics->max_sfb = get_bits(gb, 4);
+        for (i = 0; i < 7; i++) {
+            if (get_bits1(gb)) {
+                ics->group_len[ics->num_window_groups - 1]++;
+            } else {
+                ics->num_window_groups++;
+                ics->group_len[ics->num_window_groups - 1] = 1;
+            }
+        }
+        ics->num_windows       = 8;
+        if (m4ac->frame_length_short) {
+            ics->swb_offset    =  ff_swb_offset_120[sampling_index];
+            ics->num_swb       = ff_aac_num_swb_120[sampling_index];
+        } else {
+            ics->swb_offset    =  ff_swb_offset_128[sampling_index];
+            ics->num_swb       = ff_aac_num_swb_128[sampling_index];
+        }
+        ics->tns_max_bands     = ff_tns_max_bands_128[sampling_index];
+        ics->predictor_present = 0;
+    } else {
+        ics->max_sfb           = get_bits(gb, 6);
+        ics->num_windows       = 1;
+        if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD) {
+            if (m4ac->frame_length_short) {
+                ics->swb_offset    =     ff_swb_offset_480[sampling_index];
+                ics->num_swb       =    ff_aac_num_swb_480[sampling_index];
+                ics->tns_max_bands =  ff_tns_max_bands_480[sampling_index];
+            } else {
+                ics->swb_offset    =     ff_swb_offset_512[sampling_index];
+                ics->num_swb       =    ff_aac_num_swb_512[sampling_index];
+                ics->tns_max_bands =  ff_tns_max_bands_512[sampling_index];
+            }
+            if (!ics->num_swb || !ics->swb_offset) {
+                ret_fail = AVERROR_BUG;
+                goto fail;
+            }
+        } else {
+            if (m4ac->frame_length_short) {
+                ics->num_swb    = ff_aac_num_swb_960[sampling_index];
+                ics->swb_offset = ff_swb_offset_960[sampling_index];
+            } else {
+                ics->num_swb    = ff_aac_num_swb_1024[sampling_index];
+                ics->swb_offset = ff_swb_offset_1024[sampling_index];
+            }
+            ics->tns_max_bands = ff_tns_max_bands_1024[sampling_index];
+        }
+        if (aot != AOT_ER_AAC_ELD) {
+            ics->predictor_present     = get_bits1(gb);
+            ics->predictor_reset_group = 0;
+        }
+        if (ics->predictor_present) {
+            if (aot == AOT_AAC_MAIN) {
+                if (decode_prediction(ac, ics, gb)) {
+                    goto fail;
+                }
+            } else if (aot == AOT_AAC_LC ||
+                       aot == AOT_ER_AAC_LC) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Prediction is not allowed in AAC-LC.\n");
+                goto fail;
+            } else {
+                if (aot == AOT_ER_AAC_LD) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "LTP in ER AAC LD not yet implemented.\n");
+                    ret_fail = AVERROR_PATCHWELCOME;
+                    goto fail;
+                }
+                if ((ics->ltp.present = get_bits(gb, 1)))
+                    decode_ltp(&ics->ltp, gb, ics->max_sfb);
+            }
+        }
+    }
+
+    if (ics->max_sfb > ics->num_swb) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Number of scalefactor bands in group (%d) "
+               "exceeds limit (%d).\n",
+               ics->max_sfb, ics->num_swb);
+        goto fail;
+    }
+
+    return 0;
+fail:
+    ics->max_sfb = 0;
+    return ret_fail;
+}
+
+/**
+ * Decode band types (section_data payload); reference: table 4.46.
+ *
+ * @param   band_type           array of the used band type
+ * @param   band_type_run_end   array of the last scalefactor band of a band type run
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_band_types(AACContext *ac, enum BandType band_type[120],
+                             int band_type_run_end[120], GetBitContext *gb,
+                             IndividualChannelStream *ics)
+{
+    int g, idx = 0;
+    const int bits = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) ? 3 : 5;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        int k = 0;
+        while (k < ics->max_sfb) {
+            uint8_t sect_end = k;
+            int sect_len_incr;
+            int sect_band_type = get_bits(gb, 4);
+            if (sect_band_type == 12) {
+                av_log(ac->avctx, AV_LOG_ERROR, "invalid band type\n");
+                return AVERROR_INVALIDDATA;
+            }
+            do {
+                sect_len_incr = get_bits(gb, bits);
+                sect_end += sect_len_incr;
+                if (get_bits_left(gb) < 0) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "decode_band_types: "overread_err);
+                    return AVERROR_INVALIDDATA;
+                }
+                if (sect_end > ics->max_sfb) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "Number of bands (%d) exceeds limit (%d).\n",
+                           sect_end, ics->max_sfb);
+                    return AVERROR_INVALIDDATA;
+                }
+            } while (sect_len_incr == (1 << bits) - 1);
+            for (; k < sect_end; k++) {
+                band_type        [idx]   = sect_band_type;
+                band_type_run_end[idx++] = sect_end;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode scalefactors; reference: table 4.47.
+ *
+ * @param   global_gain         first scalefactor value as scalefactors are differentially coded
+ * @param   band_type           array of the used band type
+ * @param   band_type_run_end   array of the last scalefactor band of a band type run
+ * @param   sf                  array of scalefactors or intensity stereo positions
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_scalefactors(AACContext *ac, INTFLOAT sf[120], GetBitContext *gb,
+                               unsigned int global_gain,
+                               IndividualChannelStream *ics,
+                               enum BandType band_type[120],
+                               int band_type_run_end[120])
+{
+    int g, i, idx = 0;
+    int offset[3] = { global_gain, global_gain - NOISE_OFFSET, 0 };
+    int clipped_offset;
+    int noise_flag = 1;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb;) {
+            int run_end = band_type_run_end[idx];
+            if (band_type[idx] == ZERO_BT) {
+                for (; i < run_end; i++, idx++)
+                    sf[idx] = FIXR(0.);
+            } else if ((band_type[idx] == INTENSITY_BT) ||
+                       (band_type[idx] == INTENSITY_BT2)) {
+                for (; i < run_end; i++, idx++) {
+                    offset[2] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    clipped_offset = av_clip(offset[2], -155, 100);
+                    if (offset[2] != clipped_offset) {
+                        avpriv_request_sample(ac->avctx,
+                                              "If you heard an audible artifact, there may be a bug in the decoder. "
+                                              "Clipped intensity stereo position (%d -> %d)",
+                                              offset[2], clipped_offset);
+                    }
+#if USE_FIXED
+                    sf[idx] = 100 - clipped_offset;
+#else
+                    sf[idx] = ff_aac_pow2sf_tab[-clipped_offset + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                }
+            } else if (band_type[idx] == NOISE_BT) {
+                for (; i < run_end; i++, idx++) {
+                    if (noise_flag-- > 0)
+                        offset[1] += get_bits(gb, NOISE_PRE_BITS) - NOISE_PRE;
+                    else
+                        offset[1] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    clipped_offset = av_clip(offset[1], -100, 155);
+                    if (offset[1] != clipped_offset) {
+                        avpriv_request_sample(ac->avctx,
+                                              "If you heard an audible artifact, there may be a bug in the decoder. "
+                                              "Clipped noise gain (%d -> %d)",
+                                              offset[1], clipped_offset);
+                    }
+#if USE_FIXED
+                    sf[idx] = -(100 + clipped_offset);
+#else
+                    sf[idx] = -ff_aac_pow2sf_tab[clipped_offset + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                }
+            } else {
+                for (; i < run_end; i++, idx++) {
+                    offset[0] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    if (offset[0] > 255U) {
+                        av_log(ac->avctx, AV_LOG_ERROR,
+                               "Scalefactor (%d) out of range.\n", offset[0]);
+                        return AVERROR_INVALIDDATA;
+                    }
+#if USE_FIXED
+                    sf[idx] = -offset[0];
+#else
+                    sf[idx] = -ff_aac_pow2sf_tab[offset[0] - 100 + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode pulse data; reference: table 4.7.
+ */
+static int decode_pulses(Pulse *pulse, GetBitContext *gb,
+                         const uint16_t *swb_offset, int num_swb)
+{
+    int i, pulse_swb;
+    pulse->num_pulse = get_bits(gb, 2) + 1;
+    pulse_swb        = get_bits(gb, 6);
+    if (pulse_swb >= num_swb)
+        return -1;
+    pulse->pos[0]    = swb_offset[pulse_swb];
+    pulse->pos[0]   += get_bits(gb, 5);
+    if (pulse->pos[0] >= swb_offset[num_swb])
+        return -1;
+    pulse->amp[0]    = get_bits(gb, 4);
+    for (i = 1; i < pulse->num_pulse; i++) {
+        pulse->pos[i] = get_bits(gb, 5) + pulse->pos[i - 1];
+        if (pulse->pos[i] >= swb_offset[num_swb])
+            return -1;
+        pulse->amp[i] = get_bits(gb, 4);
+    }
+    return 0;
+}
+
+/**
+ * Decode Temporal Noise Shaping data; reference: table 4.48.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_tns(AACContext *ac, TemporalNoiseShaping *tns,
+                      GetBitContext *gb, const IndividualChannelStream *ics)
+{
+    int w, filt, i, coef_len, coef_res, coef_compress;
+    const int is8 = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int tns_max_order = is8 ? 7 : ac->oc[1].m4ac.object_type == AOT_AAC_MAIN ? 20 : 12;
+    for (w = 0; w < ics->num_windows; w++) {
+        if ((tns->n_filt[w] = get_bits(gb, 2 - is8))) {
+            coef_res = get_bits1(gb);
+
+            for (filt = 0; filt < tns->n_filt[w]; filt++) {
+                int tmp2_idx;
+                tns->length[w][filt] = get_bits(gb, 6 - 2 * is8);
+
+                if ((tns->order[w][filt] = get_bits(gb, 5 - 2 * is8)) > tns_max_order) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "TNS filter order %d is greater than maximum %d.\n",
+                           tns->order[w][filt], tns_max_order);
+                    tns->order[w][filt] = 0;
+                    return AVERROR_INVALIDDATA;
+                }
+                if (tns->order[w][filt]) {
+                    tns->direction[w][filt] = get_bits1(gb);
+                    coef_compress = get_bits1(gb);
+                    coef_len = coef_res + 3 - coef_compress;
+                    tmp2_idx = 2 * coef_compress + coef_res;
+
+                    for (i = 0; i < tns->order[w][filt]; i++)
+                        tns->coef[w][filt][i] = tns_tmp2_map[tmp2_idx][get_bits(gb, coef_len)];
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode Mid/Side data; reference: table 4.54.
+ *
+ * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
+ *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
+ *                      [3] reserved for scalable AAC
+ */
+static void decode_mid_side_stereo(ChannelElement *cpe, GetBitContext *gb,
+                                   int ms_present)
+{
+    int idx;
+    int max_idx = cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb;
+    if (ms_present == 1) {
+        for (idx = 0; idx < max_idx; idx++)
+            cpe->ms_mask[idx] = get_bits1(gb);
+    } else if (ms_present == 2) {
+        memset(cpe->ms_mask, 1, max_idx * sizeof(cpe->ms_mask[0]));
+    }
+}
+
+/**
+ * Decode spectral data; reference: table 4.50.
+ * Dequantize and scale spectral data; reference: 4.6.3.3.
+ *
+ * @param   coef            array of dequantized, scaled spectral data
+ * @param   sf              array of scalefactors or intensity stereo positions
+ * @param   pulse_present   set if pulses are present
+ * @param   pulse           pointer to pulse data struct
+ * @param   band_type       array of the used band type
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_spectrum_and_dequant(AACContext *ac, INTFLOAT coef[1024],
+                                       GetBitContext *gb, const INTFLOAT sf[120],
+                                       int pulse_present, const Pulse *pulse,
+                                       const IndividualChannelStream *ics,
+                                       enum BandType band_type[120])
+{
+    int i, k, g, idx = 0;
+    const int c = 1024 / ics->num_windows;
+    const uint16_t *offsets = ics->swb_offset;
+    INTFLOAT *coef_base = coef;
+
+    for (g = 0; g < ics->num_windows; g++)
+        memset(coef + g * 128 + offsets[ics->max_sfb], 0,
+               sizeof(INTFLOAT) * (c - offsets[ics->max_sfb]));
+
+    for (g = 0; g < ics->num_window_groups; g++) {
+        unsigned g_len = ics->group_len[g];
+
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            const unsigned cbt_m1 = band_type[idx] - 1;
+            INTFLOAT *cfo = coef + offsets[i];
+            int off_len = offsets[i + 1] - offsets[i];
+            int group;
+
+            if (cbt_m1 >= INTENSITY_BT2 - 1) {
+                for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                    memset(cfo, 0, off_len * sizeof(*cfo));
+                }
+            } else if (cbt_m1 == NOISE_BT - 1) {
+                for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+#if !USE_FIXED
+                    float scale;
+#endif /* !USE_FIXED */
+                    INTFLOAT band_energy;
+
+                    for (k = 0; k < off_len; k++) {
+                        ac->random_state  = lcg_random(ac->random_state);
+#if USE_FIXED
+                        cfo[k] = ac->random_state >> 3;
+#else
+                        cfo[k] = ac->random_state;
+#endif /* USE_FIXED */
+                    }
+
+#if USE_FIXED
+                    band_energy = ac->fdsp->scalarproduct_fixed(cfo, cfo, off_len);
+                    band_energy = fixed_sqrt(band_energy, 31);
+                    noise_scale(cfo, sf[idx], band_energy, off_len);
+#else
+                    band_energy = ac->fdsp->scalarproduct_float(cfo, cfo, off_len);
+                    scale = sf[idx] / sqrtf(band_energy);
+                    ac->fdsp->vector_fmul_scalar(cfo, cfo, scale, off_len);
+#endif /* USE_FIXED */
+                }
+            } else {
+#if !USE_FIXED
+                const float *vq = ff_aac_codebook_vector_vals[cbt_m1];
+#endif /* !USE_FIXED */
+                const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cbt_m1];
+                VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cbt_m1].table;
+                OPEN_READER(re, gb);
+
+                switch (cbt_m1 >> 1) {
+                case 0:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned cb_idx;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+#if USE_FIXED
+                            cf = DEC_SQUAD(cf, cb_idx);
+#else
+                            cf = VMUL4(cf, vq, cb_idx, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 4);
+                    }
+                    break;
+
+                case 1:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nnz;
+                            unsigned cb_idx;
+                            uint32_t bits;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 8 & 15;
+                            bits = nnz ? GET_CACHE(re, gb) : 0;
+                            LAST_SKIP_BITS(re, gb, nnz);
+#if USE_FIXED
+                            cf = DEC_UQUAD(cf, cb_idx, bits);
+#else
+                            cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 4);
+                    }
+                    break;
+
+                case 2:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned cb_idx;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+#if USE_FIXED
+                            cf = DEC_SPAIR(cf, cb_idx);
+#else
+                            cf = VMUL2(cf, vq, cb_idx, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 2);
+                    }
+                    break;
+
+                case 3:
+                case 4:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nnz;
+                            unsigned cb_idx;
+                            unsigned sign;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 8 & 15;
+                            sign = nnz ? SHOW_UBITS(re, gb, nnz) << (cb_idx >> 12) : 0;
+                            LAST_SKIP_BITS(re, gb, nnz);
+#if USE_FIXED
+                            cf = DEC_UPAIR(cf, cb_idx, sign);
+#else
+                            cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 2);
+                    }
+                    break;
+
+                default:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+#if USE_FIXED
+                        int *icf = cfo;
+                        int v;
+#else
+                        float *cf = cfo;
+                        uint32_t *icf = (uint32_t *) cf;
+#endif /* USE_FIXED */
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nzt, nnz;
+                            unsigned cb_idx;
+                            uint32_t bits;
+                            int j;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+
+                            if (!code) {
+                                *icf++ = 0;
+                                *icf++ = 0;
+                                continue;
+                            }
+
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 12;
+                            nzt = cb_idx >> 8;
+                            bits = SHOW_UBITS(re, gb, nnz) << (32-nnz);
+                            LAST_SKIP_BITS(re, gb, nnz);
+
+                            for (j = 0; j < 2; j++) {
+                                if (nzt & 1<<j) {
+                                    uint32_t b;
+                                    int n;
+                                    /* The total length of escape_sequence must be < 22 bits according
+                                       to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
+                                    UPDATE_CACHE(re, gb);
+                                    b = GET_CACHE(re, gb);
+                                    b = 31 - av_log2(~b);
+
+                                    if (b > 8) {
+                                        av_log(ac->avctx, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
+
+                                    SKIP_BITS(re, gb, b + 1);
+                                    b += 4;
+                                    n = (1 << b) + SHOW_UBITS(re, gb, b);
+                                    LAST_SKIP_BITS(re, gb, b);
+#if USE_FIXED
+                                    v = n;
+                                    if (bits & 1U<<31)
+                                        v = -v;
+                                    *icf++ = v;
+#else
+                                    *icf++ = ff_cbrt_tab[n] | (bits & 1U<<31);
+#endif /* USE_FIXED */
+                                    bits <<= 1;
+                                } else {
+#if USE_FIXED
+                                    v = cb_idx & 15;
+                                    if (bits & 1U<<31)
+                                        v = -v;
+                                    *icf++ = v;
+#else
+                                    unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
+                                    *icf++ = (bits & 1U<<31) | v;
+#endif /* USE_FIXED */
+                                    bits <<= !!v;
+                                }
+                                cb_idx >>= 4;
+                            }
+                        } while (len -= 2);
+#if !USE_FIXED
+                        ac->fdsp->vector_fmul_scalar(cfo, cfo, sf[idx], off_len);
+#endif /* !USE_FIXED */
+                    }
+                }
+
+                CLOSE_READER(re, gb);
+            }
+        }
+        coef += g_len << 7;
+    }
+
+    if (pulse_present) {
+        idx = 0;
+        for (i = 0; i < pulse->num_pulse; i++) {
+            INTFLOAT co = coef_base[ pulse->pos[i] ];
+            while (offsets[idx + 1] <= pulse->pos[i])
+                idx++;
+            if (band_type[idx] != NOISE_BT && sf[idx]) {
+                INTFLOAT ico = -pulse->amp[i];
+#if USE_FIXED
+                if (co) {
+                    ico = co + (co > 0 ? -ico : ico);
+                }
+                coef_base[ pulse->pos[i] ] = ico;
+#else
+                if (co) {
+                    co /= sf[idx];
+                    ico = co / sqrtf(sqrtf(fabsf(co))) + (co > 0 ? -ico : ico);
+                }
+                coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
+#endif /* USE_FIXED */
+            }
+        }
+    }
+#if USE_FIXED
+    coef = coef_base;
+    idx = 0;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        unsigned g_len = ics->group_len[g];
+
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            const unsigned cbt_m1 = band_type[idx] - 1;
+            int *cfo = coef + offsets[i];
+            int off_len = offsets[i + 1] - offsets[i];
+            int group;
+
+            if (cbt_m1 < NOISE_BT - 1) {
+                for (group = 0; group < (int)g_len; group++, cfo+=128) {
+                    ac->vector_pow43(cfo, off_len);
+                    ac->subband_scale(cfo, cfo, sf[idx], 34, off_len, ac->avctx);
+                }
+            }
+        }
+        coef += g_len << 7;
+    }
+#endif /* USE_FIXED */
+    return 0;
+}
+
+/**
+ * Apply AAC-Main style frequency domain prediction.
+ */
+static void apply_prediction(AACContext *ac, SingleChannelElement *sce)
+{
+    int sfb, k;
+
+    if (!sce->ics.predictor_initialized) {
+        reset_all_predictors(sce->predictor_state);
+        sce->ics.predictor_initialized = 1;
+    }
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        for (sfb = 0;
+             sfb < ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index];
+             sfb++) {
+            for (k = sce->ics.swb_offset[sfb];
+                 k < sce->ics.swb_offset[sfb + 1];
+                 k++) {
+                predict(&sce->predictor_state[k], &sce->coeffs[k],
+                        sce->ics.predictor_present &&
+                        sce->ics.prediction_used[sfb]);
+            }
+        }
+        if (sce->ics.predictor_reset_group)
+            reset_predictor_group(sce->predictor_state,
+                                  sce->ics.predictor_reset_group);
+    } else
+        reset_all_predictors(sce->predictor_state);
+}
+
+static void decode_gain_control(SingleChannelElement * sce, GetBitContext * gb)
+{
+    // wd_num, wd_test, aloc_size
+    static const uint8_t gain_mode[4][3] = {
+        {1, 0, 5},  // ONLY_LONG_SEQUENCE = 0,
+        {2, 1, 2},  // LONG_START_SEQUENCE,
+        {8, 0, 2},  // EIGHT_SHORT_SEQUENCE,
+        {2, 1, 5},  // LONG_STOP_SEQUENCE
+    };
+
+    const int mode = sce->ics.window_sequence[0];
+    uint8_t bd, wd, ad;
+
+    // FIXME: Store the gain control data on |sce| and do something with it.
+    uint8_t max_band = get_bits(gb, 2);
+    for (bd = 0; bd < max_band; bd++) {
+        for (wd = 0; wd < gain_mode[mode][0]; wd++) {
+            uint8_t adjust_num = get_bits(gb, 3);
+            for (ad = 0; ad < adjust_num; ad++) {
+                skip_bits(gb, 4 + ((wd == 0 && gain_mode[mode][1])
+                                     ? 4
+                                     : gain_mode[mode][2]));
+            }
+        }
+    }
+}
+
+/**
+ * Decode an individual_channel_stream payload; reference: table 4.44.
+ *
+ * @param   common_window   Channels have independent [0], or shared [1], Individual Channel Stream information.
+ * @param   scale_flag      scalable [1] or non-scalable [0] AAC (Unused until scalable AAC is implemented.)
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_ics(AACContext *ac, SingleChannelElement *sce,
+                      GetBitContext *gb, int common_window, int scale_flag)
+{
+    Pulse pulse;
+    TemporalNoiseShaping    *tns = &sce->tns;
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *out = sce->coeffs;
+    int global_gain, eld_syntax, er_syntax, pulse_present = 0;
+    int ret;
+
+    eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+    er_syntax  = ac->oc[1].m4ac.object_type == AOT_ER_AAC_LC ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LTP ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LD ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+
+    /* This assignment is to silence a GCC warning about the variable being used
+     * uninitialized when in fact it always is.
+     */
+    pulse.num_pulse = 0;
+
+    global_gain = get_bits(gb, 8);
+
+    if (!common_window && !scale_flag) {
+        ret = decode_ics_info(ac, ics, gb);
+        if (ret < 0)
+            goto fail;
+    }
+
+    if ((ret = decode_band_types(ac, sce->band_type,
+                                 sce->band_type_run_end, gb, ics)) < 0)
+        goto fail;
+    if ((ret = decode_scalefactors(ac, sce->sf, gb, global_gain, ics,
+                                  sce->band_type, sce->band_type_run_end)) < 0)
+        goto fail;
+
+    pulse_present = 0;
+    if (!scale_flag) {
+        if (!eld_syntax && (pulse_present = get_bits1(gb))) {
+            if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Pulse tool not allowed in eight short sequence.\n");
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            if (decode_pulses(&pulse, gb, ics->swb_offset, ics->num_swb)) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Pulse data corrupt or invalid.\n");
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+        }
+        tns->present = get_bits1(gb);
+        if (tns->present && !er_syntax) {
+            ret = decode_tns(ac, tns, gb, ics);
+            if (ret < 0)
+                goto fail;
+        }
+        if (!eld_syntax && get_bits1(gb)) {
+            decode_gain_control(sce, gb);
+            if (!ac->warned_gain_control) {
+                avpriv_report_missing_feature(ac->avctx, "Gain control");
+                ac->warned_gain_control = 1;
+            }
+        }
+        // I see no textual basis in the spec for this occurring after SSR gain
+        // control, but this is what both reference and real implmentations do
+        if (tns->present && er_syntax) {
+            ret = decode_tns(ac, tns, gb, ics);
+            if (ret < 0)
+                goto fail;
+        }
+    }
+
+    ret = decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present,
+                                    &pulse, ics, sce->band_type);
+    if (ret < 0)
+        goto fail;
+
+    if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN && !common_window)
+        apply_prediction(ac, sce);
+
+    return 0;
+fail:
+    tns->present = 0;
+    return ret;
+}
+
+/**
+ * Mid/Side stereo decoding; reference: 4.6.8.1.3.
+ */
+static void apply_mid_side_stereo(AACContext *ac, ChannelElement *cpe)
+{
+    const IndividualChannelStream *ics = &cpe->ch[0].ics;
+    INTFLOAT *ch0 = cpe->ch[0].coeffs;
+    INTFLOAT *ch1 = cpe->ch[1].coeffs;
+    int g, i, group, idx = 0;
+    const uint16_t *offsets = ics->swb_offset;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            if (cpe->ms_mask[idx] &&
+                cpe->ch[0].band_type[idx] < NOISE_BT &&
+                cpe->ch[1].band_type[idx] < NOISE_BT) {
+#if USE_FIXED
+                for (group = 0; group < ics->group_len[g]; group++) {
+                    ac->fdsp->butterflies_fixed(ch0 + group * 128 + offsets[i],
+                                                ch1 + group * 128 + offsets[i],
+                                                offsets[i+1] - offsets[i]);
+#else
+                for (group = 0; group < ics->group_len[g]; group++) {
+                    ac->fdsp->butterflies_float(ch0 + group * 128 + offsets[i],
+                                               ch1 + group * 128 + offsets[i],
+                                               offsets[i+1] - offsets[i]);
+#endif /* USE_FIXED */
+                }
+            }
+        }
+        ch0 += ics->group_len[g] * 128;
+        ch1 += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * intensity stereo decoding; reference: 4.6.8.2.3
+ *
+ * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
+ *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
+ *                      [3] reserved for scalable AAC
+ */
+static void apply_intensity_stereo(AACContext *ac,
+                                   ChannelElement *cpe, int ms_present)
+{
+    const IndividualChannelStream *ics = &cpe->ch[1].ics;
+    SingleChannelElement         *sce1 = &cpe->ch[1];
+    INTFLOAT *coef0 = cpe->ch[0].coeffs, *coef1 = cpe->ch[1].coeffs;
+    const uint16_t *offsets = ics->swb_offset;
+    int g, group, i, idx = 0;
+    int c;
+    INTFLOAT scale;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb;) {
+            if (sce1->band_type[idx] == INTENSITY_BT ||
+                sce1->band_type[idx] == INTENSITY_BT2) {
+                const int bt_run_end = sce1->band_type_run_end[idx];
+                for (; i < bt_run_end; i++, idx++) {
+                    c = -1 + 2 * (sce1->band_type[idx] - 14);
+                    if (ms_present)
+                        c *= 1 - 2 * cpe->ms_mask[idx];
+                    scale = c * sce1->sf[idx];
+                    for (group = 0; group < ics->group_len[g]; group++)
+#if USE_FIXED
+                        ac->subband_scale(coef1 + group * 128 + offsets[i],
+                                      coef0 + group * 128 + offsets[i],
+                                      scale,
+                                      23,
+                                      offsets[i + 1] - offsets[i] ,ac->avctx);
+#else
+                        ac->fdsp->vector_fmul_scalar(coef1 + group * 128 + offsets[i],
+                                                    coef0 + group * 128 + offsets[i],
+                                                    scale,
+                                                    offsets[i + 1] - offsets[i]);
+#endif /* USE_FIXED */
+                }
+            } else {
+                int bt_run_end = sce1->band_type_run_end[idx];
+                idx += bt_run_end - i;
+                i    = bt_run_end;
+            }
+        }
+        coef0 += ics->group_len[g] * 128;
+        coef1 += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * Decode a channel_pair_element; reference: table 4.4.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_cpe(AACContext *ac, GetBitContext *gb, ChannelElement *cpe)
+{
+    int i, ret, common_window, ms_present = 0;
+    int eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+
+    common_window = eld_syntax || get_bits1(gb);
+    if (common_window) {
+        if (decode_ics_info(ac, &cpe->ch[0].ics, gb))
+            return AVERROR_INVALIDDATA;
+        i = cpe->ch[1].ics.use_kb_window[0];
+        cpe->ch[1].ics = cpe->ch[0].ics;
+        cpe->ch[1].ics.use_kb_window[1] = i;
+        if (cpe->ch[1].ics.predictor_present &&
+            (ac->oc[1].m4ac.object_type != AOT_AAC_MAIN))
+            if ((cpe->ch[1].ics.ltp.present = get_bits(gb, 1)))
+                decode_ltp(&cpe->ch[1].ics.ltp, gb, cpe->ch[1].ics.max_sfb);
+        ms_present = get_bits(gb, 2);
+        if (ms_present == 3) {
+            av_log(ac->avctx, AV_LOG_ERROR, "ms_present = 3 is reserved.\n");
+            return AVERROR_INVALIDDATA;
+        } else if (ms_present)
+            decode_mid_side_stereo(cpe, gb, ms_present);
+    }
+    if ((ret = decode_ics(ac, &cpe->ch[0], gb, common_window, 0)))
+        return ret;
+    if ((ret = decode_ics(ac, &cpe->ch[1], gb, common_window, 0)))
+        return ret;
+
+    if (common_window) {
+        if (ms_present)
+            apply_mid_side_stereo(ac, cpe);
+        if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN) {
+            apply_prediction(ac, &cpe->ch[0]);
+            apply_prediction(ac, &cpe->ch[1]);
+        }
+    }
+
+    apply_intensity_stereo(ac, cpe, ms_present);
+    return 0;
+}
+
+static const float cce_scale[] = {
+    1.09050773266525765921, //2^(1/8)
+    1.18920711500272106672, //2^(1/4)
+    M_SQRT2,
+    2,
+};
+
+/**
+ * Decode coupling_channel_element; reference: table 4.8.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_cce(AACContext *ac, GetBitContext *gb, ChannelElement *che)
+{
+    int num_gain = 0;
+    int c, g, sfb, ret;
+    int sign;
+    INTFLOAT scale;
+    SingleChannelElement *sce = &che->ch[0];
+    ChannelCoupling     *coup = &che->coup;
+
+    coup->coupling_point = 2 * get_bits1(gb);
+    coup->num_coupled = get_bits(gb, 3);
+    for (c = 0; c <= coup->num_coupled; c++) {
+        num_gain++;
+        coup->type[c] = get_bits1(gb) ? TYPE_CPE : TYPE_SCE;
+        coup->id_select[c] = get_bits(gb, 4);
+        if (coup->type[c] == TYPE_CPE) {
+            coup->ch_select[c] = get_bits(gb, 2);
+            if (coup->ch_select[c] == 3)
+                num_gain++;
+        } else
+            coup->ch_select[c] = 2;
+    }
+    coup->coupling_point += get_bits1(gb) || (coup->coupling_point >> 1);
+
+    sign  = get_bits(gb, 1);
+#if USE_FIXED
+    scale = get_bits(gb, 2);
+#else
+    scale = cce_scale[get_bits(gb, 2)];
+#endif
+
+    if ((ret = decode_ics(ac, sce, gb, 0, 0)))
+        return ret;
+
+    for (c = 0; c < num_gain; c++) {
+        int idx  = 0;
+        int cge  = 1;
+        int gain = 0;
+        INTFLOAT gain_cache = FIXR10(1.);
+        if (c) {
+            cge = coup->coupling_point == AFTER_IMDCT ? 1 : get_bits1(gb);
+            gain = cge ? get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60: 0;
+            gain_cache = GET_GAIN(scale, gain);
+#if USE_FIXED
+            if ((abs(gain_cache)-1024) >> 3 > 30)
+                return AVERROR(ERANGE);
+#endif
+        }
+        if (coup->coupling_point == AFTER_IMDCT) {
+            coup->gain[c][0] = gain_cache;
+        } else {
+            for (g = 0; g < sce->ics.num_window_groups; g++) {
+                for (sfb = 0; sfb < sce->ics.max_sfb; sfb++, idx++) {
+                    if (sce->band_type[idx] != ZERO_BT) {
+                        if (!cge) {
+                            int t = get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
+                            if (t) {
+                                int s = 1;
+                                t = gain += t;
+                                if (sign) {
+                                    s  -= 2 * (t & 0x1);
+                                    t >>= 1;
+                                }
+                                gain_cache = GET_GAIN(scale, t) * s;
+#if USE_FIXED
+                                if ((abs(gain_cache)-1024) >> 3 > 30)
+                                    return AVERROR(ERANGE);
+#endif
+                            }
+                        }
+                        coup->gain[c][idx] = gain_cache;
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Parse whether channels are to be excluded from Dynamic Range Compression; reference: table 4.53.
+ *
+ * @return  Returns number of bytes consumed.
+ */
+static int decode_drc_channel_exclusions(DynamicRangeControl *che_drc,
+                                         GetBitContext *gb)
+{
+    int i;
+    int num_excl_chan = 0;
+
+    do {
+        for (i = 0; i < 7; i++)
+            che_drc->exclude_mask[num_excl_chan++] = get_bits1(gb);
+    } while (num_excl_chan < MAX_CHANNELS - 7 && get_bits1(gb));
+
+    return num_excl_chan / 7;
+}
+
+/**
+ * Decode dynamic range information; reference: table 4.52.
+ *
+ * @return  Returns number of bytes consumed.
+ */
+static int decode_dynamic_range(DynamicRangeControl *che_drc,
+                                GetBitContext *gb)
+{
+    int n             = 1;
+    int drc_num_bands = 1;
+    int i;
+
+    /* pce_tag_present? */
+    if (get_bits1(gb)) {
+        che_drc->pce_instance_tag  = get_bits(gb, 4);
+        skip_bits(gb, 4); // tag_reserved_bits
+        n++;
+    }
+
+    /* excluded_chns_present? */
+    if (get_bits1(gb)) {
+        n += decode_drc_channel_exclusions(che_drc, gb);
+    }
+
+    /* drc_bands_present? */
+    if (get_bits1(gb)) {
+        che_drc->band_incr            = get_bits(gb, 4);
+        che_drc->interpolation_scheme = get_bits(gb, 4);
+        n++;
+        drc_num_bands += che_drc->band_incr;
+        for (i = 0; i < drc_num_bands; i++) {
+            che_drc->band_top[i] = get_bits(gb, 8);
+            n++;
+        }
+    }
+
+    /* prog_ref_level_present? */
+    if (get_bits1(gb)) {
+        che_drc->prog_ref_level = get_bits(gb, 7);
+        skip_bits1(gb); // prog_ref_level_reserved_bits
+        n++;
+    }
+
+    for (i = 0; i < drc_num_bands; i++) {
+        che_drc->dyn_rng_sgn[i] = get_bits1(gb);
+        che_drc->dyn_rng_ctl[i] = get_bits(gb, 7);
+        n++;
+    }
+
+    return n;
+}
+
+static int decode_fill(AACContext *ac, GetBitContext *gb, int len) {
+    uint8_t buf[256];
+    int i, major, minor;
+
+    if (len < 13+7*8)
+        goto unknown;
+
+    get_bits(gb, 13); len -= 13;
+
+    for(i=0; i+1<sizeof(buf) && len>=8; i++, len-=8)
+        buf[i] = get_bits(gb, 8);
+
+    buf[i] = 0;
+    if (ac->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(ac->avctx, AV_LOG_DEBUG, "FILL:%s\n", buf);
+
+    if (sscanf(buf, "libfaac %d.%d", &major, &minor) == 2){
+        ac->avctx->internal->skip_samples = 1024;
+    }
+
+unknown:
+    skip_bits_long(gb, len);
+
+    return 0;
+}
+
+/**
+ * Decode extension data (incomplete); reference: table 4.51.
+ *
+ * @param   cnt length of TYPE_FIL syntactic element in bytes
+ *
+ * @return Returns number of bytes consumed
+ */
+static int decode_extension_payload(AACContext *ac, GetBitContext *gb, int cnt,
+                                    ChannelElement *che, enum RawDataBlockType elem_type)
+{
+    int crc_flag = 0;
+    int res = cnt;
+    int type = get_bits(gb, 4);
+
+    if (ac->avctx->debug & FF_DEBUG_STARTCODE)
+        av_log(ac->avctx, AV_LOG_DEBUG, "extension type: %d len:%d\n", type, cnt);
+
+    switch (type) { // extension type
+    case EXT_SBR_DATA_CRC:
+        crc_flag++;
+    case EXT_SBR_DATA:
+        if (!che) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
+            return res;
+        } else if (ac->oc[1].m4ac.frame_length_short) {
+            if (!ac->warned_960_sbr)
+              avpriv_report_missing_feature(ac->avctx,
+                                            "SBR with 960 frame length");
+            ac->warned_960_sbr = 1;
+            skip_bits_long(gb, 8 * cnt - 4);
+            return res;
+        } else if (!ac->oc[1].m4ac.sbr) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
+            skip_bits_long(gb, 8 * cnt - 4);
+            return res;
+        } else if (ac->oc[1].m4ac.sbr == -1 && ac->oc[1].status == OC_LOCKED) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Implicit SBR was found with a first occurrence after the first frame.\n");
+            skip_bits_long(gb, 8 * cnt - 4);
+            return res;
+        } else if (ac->oc[1].m4ac.ps == -1 && ac->oc[1].status < OC_LOCKED && ac->avctx->channels == 1) {
+            ac->oc[1].m4ac.sbr = 1;
+            ac->oc[1].m4ac.ps = 1;
+            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
+            output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
+                             ac->oc[1].status, 1);
+        } else {
+            ac->oc[1].m4ac.sbr = 1;
+            ac->avctx->profile = FF_PROFILE_AAC_HE;
+        }
+        res = AAC_RENAME(ff_decode_sbr_extension)(ac, &che->sbr, gb, crc_flag, cnt, elem_type);
+        break;
+    case EXT_DYNAMIC_RANGE:
+        res = decode_dynamic_range(&ac->che_drc, gb);
+        break;
+    case EXT_FILL:
+        decode_fill(ac, gb, 8 * cnt - 4);
+        break;
+    case EXT_FILL_DATA:
+    case EXT_DATA_ELEMENT:
+    default:
+        skip_bits_long(gb, 8 * cnt - 4);
+        break;
+    };
+    return res;
+}
+
+/**
+ * Decode Temporal Noise Shaping filter coefficients and apply all-pole filters; reference: 4.6.9.3.
+ *
+ * @param   decode  1 if tool is used normally, 0 if tool is used in LTP.
+ * @param   coef    spectral coefficients
+ */
+static void apply_tns(INTFLOAT coef_param[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode)
+{
+    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
+    int w, filt, m, i;
+    int bottom, top, order, start, end, size, inc;
+    INTFLOAT lpc[TNS_MAX_ORDER];
+    INTFLOAT tmp[TNS_MAX_ORDER+1];
+    UINTFLOAT *coef = coef_param;
+
+    for (w = 0; w < ics->num_windows; w++) {
+        bottom = ics->num_swb;
+        for (filt = 0; filt < tns->n_filt[w]; filt++) {
+            top    = bottom;
+            bottom = FFMAX(0, top - tns->length[w][filt]);
+            order  = tns->order[w][filt];
+            if (order == 0)
+                continue;
+
+            // tns_decode_coef
+            AAC_RENAME(compute_lpc_coefs)(tns->coef[w][filt], order, lpc, 0, 0, 0);
+
+            start = ics->swb_offset[FFMIN(bottom, mmm)];
+            end   = ics->swb_offset[FFMIN(   top, mmm)];
+            if ((size = end - start) <= 0)
+                continue;
+            if (tns->direction[w][filt]) {
+                inc = -1;
+                start = end - 1;
+            } else {
+                inc = 1;
+            }
+            start += w * 128;
+
+            if (decode) {
+                // ar filter
+                for (m = 0; m < size; m++, start += inc)
+                    for (i = 1; i <= FFMIN(m, order); i++)
+                        coef[start] -= AAC_MUL26((INTFLOAT)coef[start - i * inc], lpc[i - 1]);
+            } else {
+                // ma filter
+                for (m = 0; m < size; m++, start += inc) {
+                    tmp[0] = coef[start];
+                    for (i = 1; i <= FFMIN(m, order); i++)
+                        coef[start] += AAC_MUL26(tmp[i], lpc[i - 1]);
+                    for (i = order; i > 0; i--)
+                        tmp[i] = tmp[i - 1];
+                }
+            }
+        }
+    }
+}
+
+/**
+ *  Apply windowing and MDCT to obtain the spectral
+ *  coefficient from the predicted sample by LTP.
+ */
+static void windowing_and_mdct_ltp(AACContext *ac, INTFLOAT *out,
+                                   INTFLOAT *in, IndividualChannelStream *ics)
+{
+    const INTFLOAT *lwindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    const INTFLOAT *lwindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+
+    if (ics->window_sequence[0] != LONG_STOP_SEQUENCE) {
+        ac->fdsp->vector_fmul(in, in, lwindow_prev, 1024);
+    } else {
+        memset(in, 0, 448 * sizeof(*in));
+        ac->fdsp->vector_fmul(in + 448, in + 448, swindow_prev, 128);
+    }
+    if (ics->window_sequence[0] != LONG_START_SEQUENCE) {
+        ac->fdsp->vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024);
+    } else {
+        ac->fdsp->vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128);
+        memset(in + 1024 + 576, 0, 448 * sizeof(*in));
+    }
+    ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in);
+}
+
+/**
+ * Apply the long term prediction
+ */
+static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, sfb;
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        INTFLOAT *predTime = sce->ret;
+        INTFLOAT *predFreq = ac->buf_mdct;
+        int16_t num_samples = 2048;
+
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024;
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = AAC_MUL30(sce->ltp_state[i + 2048 - ltp->lag], ltp->coef);
+        memset(&predTime[i], 0, (2048 - i) * sizeof(*predTime));
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += (UINTFLOAT)predFreq[i];
+    }
+}
+
+/**
+ * Update the LTP buffer for next frame
+ */
+static void update_ltp(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *saved     = sce->saved;
+    INTFLOAT *saved_ltp = sce->coeffs;
+    const INTFLOAT *lwindow = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    int i;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(saved_ltp,       saved, 512 * sizeof(*saved_ltp));
+        memset(saved_ltp + 576, 0,     448 * sizeof(*saved_ltp));
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+
+        for (i = 0; i < 64; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], swindow[63 - i]);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        memcpy(saved_ltp,       ac->buf_mdct + 512, 448 * sizeof(*saved_ltp));
+        memset(saved_ltp + 576, 0,                  448 * sizeof(*saved_ltp));
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+
+        for (i = 0; i < 64; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], swindow[63 - i]);
+    } else { // LONG_STOP or ONLY_LONG
+        ac->fdsp->vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+
+        for (i = 0; i < 512; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], lwindow[511 - i]);
+    }
+
+    memcpy(sce->ltp_state,      sce->ltp_state+1024, 1024 * sizeof(*sce->ltp_state));
+    memcpy(sce->ltp_state+1024, sce->ret,            1024 * sizeof(*sce->ltp_state));
+    memcpy(sce->ltp_state+2048, saved_ltp,           1024 * sizeof(*sce->ltp_state));
+}
+
+/**
+ * Conduct IMDCT and windowing.
+ */
+static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    const INTFLOAT *swindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    const INTFLOAT *lwindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    INTFLOAT *buf  = ac->buf_mdct;
+    INTFLOAT *temp = ac->temp;
+    int i;
+
+    // imdct
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else {
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+#if USE_FIXED
+        for (i=0; i<1024; i++)
+          buf[i] = (buf[i] + 4) >> 3;
+#endif /* USE_FIXED */
+    }
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        memcpy(                         out,               saved,            448 * sizeof(*out));
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            ac->fdsp->vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
+            ac->fdsp->vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
+            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(*out));
+        } else {
+            ac->fdsp->vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            memcpy(                     out + 576,         buf + 64,         448 * sizeof(*out));
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(                     saved,       temp + 64,         64 * sizeof(*saved));
+        ac->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(*saved));
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        memcpy(                     saved,       buf + 512,        448 * sizeof(*saved));
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(*saved));
+    } else { // LONG_STOP or ONLY_LONG
+        memcpy(                     saved,       buf + 512,        512 * sizeof(*saved));
+    }
+}
+
+/**
+ * Conduct IMDCT and windowing.
+ */
+static void imdct_and_windowing_960(AACContext *ac, SingleChannelElement *sce)
+{
+#if !USE_FIXED
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    const INTFLOAT *swindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_120) : AAC_RENAME(ff_sine_120);
+    const INTFLOAT *lwindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_960) : AAC_RENAME(ff_sine_960);
+    const INTFLOAT *swindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_120) : AAC_RENAME(ff_sine_120);
+    INTFLOAT *buf  = ac->buf_mdct;
+    INTFLOAT *temp = ac->temp;
+    int i;
+
+    // imdct
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 8; i++)
+            ac->mdct120->imdct_half(ac->mdct120, buf + i * 120, in + i * 128, 1);
+    } else {
+        ac->mdct960->imdct_half(ac->mdct960, buf, in, 1);
+    }
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+        (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 480);
+    } else {
+        memcpy(                          out,               saved,            420 * sizeof(*out));
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            ac->fdsp->vector_fmul_window(out + 420 + 0*120, saved + 420,      buf + 0*120, swindow_prev, 60);
+            ac->fdsp->vector_fmul_window(out + 420 + 1*120, buf + 0*120 + 60, buf + 1*120, swindow,      60);
+            ac->fdsp->vector_fmul_window(out + 420 + 2*120, buf + 1*120 + 60, buf + 2*120, swindow,      60);
+            ac->fdsp->vector_fmul_window(out + 420 + 3*120, buf + 2*120 + 60, buf + 3*120, swindow,      60);
+            ac->fdsp->vector_fmul_window(temp,              buf + 3*120 + 60, buf + 4*120, swindow,      60);
+            memcpy(                      out + 420 + 4*120, temp, 60 * sizeof(*out));
+        } else {
+            ac->fdsp->vector_fmul_window(out + 420,         saved + 420,      buf,         swindow_prev, 60);
+            memcpy(                      out + 540,         buf + 60,         420 * sizeof(*out));
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(                      saved,       temp + 60,         60 * sizeof(*saved));
+        ac->fdsp->vector_fmul_window(saved + 60,  buf + 4*120 + 60, buf + 5*120, swindow, 60);
+        ac->fdsp->vector_fmul_window(saved + 180, buf + 5*120 + 60, buf + 6*120, swindow, 60);
+        ac->fdsp->vector_fmul_window(saved + 300, buf + 6*120 + 60, buf + 7*120, swindow, 60);
+        memcpy(                      saved + 420, buf + 7*120 + 60,  60 * sizeof(*saved));
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        memcpy(                      saved,       buf + 480,        420 * sizeof(*saved));
+        memcpy(                      saved + 420, buf + 7*120 + 60,  60 * sizeof(*saved));
+    } else { // LONG_STOP or ONLY_LONG
+        memcpy(                      saved,       buf + 480,        480 * sizeof(*saved));
+    }
+#endif
+}
+static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    INTFLOAT *buf  = ac->buf_mdct;
+#if USE_FIXED
+    int i;
+#endif /* USE_FIXED */
+
+    // imdct
+    ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
+
+#if USE_FIXED
+    for (i = 0; i < 1024; i++)
+        buf[i] = (buf[i] + 2) >> 2;
+#endif /* USE_FIXED */
+
+    // window overlapping
+    if (ics->use_kb_window[1]) {
+        // AAC LD uses a low overlap sine window instead of a KBD window
+        memcpy(out, saved, 192 * sizeof(*out));
+        ac->fdsp->vector_fmul_window(out + 192, saved + 192, buf, AAC_RENAME(ff_sine_128), 64);
+        memcpy(                     out + 320, buf + 64, 192 * sizeof(*out));
+    } else {
+        ac->fdsp->vector_fmul_window(out, saved, buf, AAC_RENAME(ff_sine_512), 256);
+    }
+
+    // buffer update
+    memcpy(saved, buf + 256, 256 * sizeof(*saved));
+}
+
+static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
+{
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    INTFLOAT *buf  = ac->buf_mdct;
+    int i;
+    const int n  = ac->oc[1].m4ac.frame_length_short ? 480 : 512;
+    const int n2 = n >> 1;
+    const int n4 = n >> 2;
+    const INTFLOAT *const window = n == 480 ? AAC_RENAME(ff_aac_eld_window_480) :
+                                           AAC_RENAME(ff_aac_eld_window_512);
+
+    // Inverse transform, mapped to the conventional IMDCT by
+    // Chivukula, R.K.; Reznik, Y.A.; Devarajan, V.,
+    // "Efficient algorithms for MPEG-4 AAC-ELD, AAC-LD and AAC-LC filterbanks,"
+    // International Conference on Audio, Language and Image Processing, ICALIP 2008.
+    // URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4590245&isnumber=4589950
+    for (i = 0; i < n2; i+=2) {
+        INTFLOAT temp;
+        temp =  in[i    ]; in[i    ] = -in[n - 1 - i]; in[n - 1 - i] = temp;
+        temp = -in[i + 1]; in[i + 1] =  in[n - 2 - i]; in[n - 2 - i] = temp;
+    }
+#if !USE_FIXED
+    if (n == 480)
+        ac->mdct480->imdct_half(ac->mdct480, buf, in, 1);
+    else
+#endif
+        ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
+
+#if USE_FIXED
+    for (i = 0; i < 1024; i++)
+      buf[i] = (buf[i] + 1) >> 1;
+#endif /* USE_FIXED */
+
+    for (i = 0; i < n; i+=2) {
+        buf[i] = -buf[i];
+    }
+    // Like with the regular IMDCT at this point we still have the middle half
+    // of a transform but with even symmetry on the left and odd symmetry on
+    // the right
+
+    // window overlapping
+    // The spec says to use samples [0..511] but the reference decoder uses
+    // samples [128..639].
+    for (i = n4; i < n2; i ++) {
+        out[i - n4] = AAC_MUL31(   buf[    n2 - 1 - i] , window[i       - n4]) +
+                      AAC_MUL31( saved[        i + n2] , window[i +   n - n4]) +
+                      AAC_MUL31(-saved[n + n2 - 1 - i] , window[i + 2*n - n4]) +
+                      AAC_MUL31(-saved[  2*n + n2 + i] , window[i + 3*n - n4]);
+    }
+    for (i = 0; i < n2; i ++) {
+        out[n4 + i] = AAC_MUL31(   buf[              i] , window[i + n2       - n4]) +
+                      AAC_MUL31(-saved[      n - 1 - i] , window[i + n2 +   n - n4]) +
+                      AAC_MUL31(-saved[          n + i] , window[i + n2 + 2*n - n4]) +
+                      AAC_MUL31( saved[2*n + n - 1 - i] , window[i + n2 + 3*n - n4]);
+    }
+    for (i = 0; i < n4; i ++) {
+        out[n2 + n4 + i] = AAC_MUL31(   buf[    i + n2] , window[i +   n - n4]) +
+                           AAC_MUL31(-saved[n2 - 1 - i] , window[i + 2*n - n4]) +
+                           AAC_MUL31(-saved[n + n2 + i] , window[i + 3*n - n4]);
+    }
+
+    // buffer update
+    memmove(saved + n, saved, 2 * n * sizeof(*saved));
+    memcpy( saved,       buf,     n * sizeof(*saved));
+}
+
+/**
+ * channel coupling transformation interface
+ *
+ * @param   apply_coupling_method   pointer to (in)dependent coupling function
+ */
+static void apply_channel_coupling(AACContext *ac, ChannelElement *cc,
+                                   enum RawDataBlockType type, int elem_id,
+                                   enum CouplingPoint coupling_point,
+                                   void (*apply_coupling_method)(AACContext *ac, SingleChannelElement *target, ChannelElement *cce, int index))
+{
+    int i, c;
+
+    for (i = 0; i < MAX_ELEM_ID; i++) {
+        ChannelElement *cce = ac->che[TYPE_CCE][i];
+        int index = 0;
+
+        if (cce && cce->coup.coupling_point == coupling_point) {
+            ChannelCoupling *coup = &cce->coup;
+
+            for (c = 0; c <= coup->num_coupled; c++) {
+                if (coup->type[c] == type && coup->id_select[c] == elem_id) {
+                    if (coup->ch_select[c] != 1) {
+                        apply_coupling_method(ac, &cc->ch[0], cce, index);
+                        if (coup->ch_select[c] != 0)
+                            index++;
+                    }
+                    if (coup->ch_select[c] != 2)
+                        apply_coupling_method(ac, &cc->ch[1], cce, index++);
+                } else
+                    index += 1 + (coup->ch_select[c] == 3);
+            }
+        }
+    }
+}
+
+/**
+ * Convert spectral data to samples, applying all supported tools as appropriate.
+ */
+static void spectral_to_sample(AACContext *ac, int samples)
+{
+    int i, type;
+    void (*imdct_and_window)(AACContext *ac, SingleChannelElement *sce);
+    switch (ac->oc[1].m4ac.object_type) {
+    case AOT_ER_AAC_LD:
+        imdct_and_window = imdct_and_windowing_ld;
+        break;
+    case AOT_ER_AAC_ELD:
+        imdct_and_window = imdct_and_windowing_eld;
+        break;
+    default:
+        if (ac->oc[1].m4ac.frame_length_short)
+            imdct_and_window = imdct_and_windowing_960;
+        else
+            imdct_and_window = ac->imdct_and_windowing;
+    }
+    for (type = 3; type >= 0; type--) {
+        for (i = 0; i < MAX_ELEM_ID; i++) {
+            ChannelElement *che = ac->che[type][i];
+            if (che && che->present) {
+                if (type <= TYPE_CPE)
+                    apply_channel_coupling(ac, che, type, i, BEFORE_TNS, AAC_RENAME(apply_dependent_coupling));
+                if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
+                    if (che->ch[0].ics.predictor_present) {
+                        if (che->ch[0].ics.ltp.present)
+                            ac->apply_ltp(ac, &che->ch[0]);
+                        if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
+                            ac->apply_ltp(ac, &che->ch[1]);
+                    }
+                }
+                if (che->ch[0].tns.present)
+                    ac->apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
+                if (che->ch[1].tns.present)
+                    ac->apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
+                if (type <= TYPE_CPE)
+                    apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, AAC_RENAME(apply_dependent_coupling));
+                if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
+                    imdct_and_window(ac, &che->ch[0]);
+                    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
+                        ac->update_ltp(ac, &che->ch[0]);
+                    if (type == TYPE_CPE) {
+                        imdct_and_window(ac, &che->ch[1]);
+                        if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
+                            ac->update_ltp(ac, &che->ch[1]);
+                    }
+                    if (ac->oc[1].m4ac.sbr > 0) {
+                        AAC_RENAME(ff_sbr_apply)(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
+                    }
+                }
+                if (type <= TYPE_CCE)
+                    apply_channel_coupling(ac, che, type, i, AFTER_IMDCT, AAC_RENAME(apply_independent_coupling));
+
+#if USE_FIXED
+                {
+                    int j;
+                    /* preparation for resampler */
+                    for(j = 0; j<samples; j++){
+                        che->ch[0].ret[j] = (int32_t)av_clip64((int64_t)che->ch[0].ret[j]*128, INT32_MIN, INT32_MAX-0x8000)+0x8000;
+                        if(type == TYPE_CPE)
+                            che->ch[1].ret[j] = (int32_t)av_clip64((int64_t)che->ch[1].ret[j]*128, INT32_MIN, INT32_MAX-0x8000)+0x8000;
+                    }
+                }
+#endif /* USE_FIXED */
+                che->present = 0;
+            } else if (che) {
+                av_log(ac->avctx, AV_LOG_VERBOSE, "ChannelElement %d.%d missing \n", type, i);
+            }
+        }
+    }
+}
+
+static int parse_adts_frame_header(AACContext *ac, GetBitContext *gb)
+{
+    int size;
+    AACADTSHeaderInfo hdr_info;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int layout_map_tags, ret;
+
+    size = ff_adts_header_parse(gb, &hdr_info);
+    if (size > 0) {
+        if (!ac->warned_num_aac_frames && hdr_info.num_aac_frames != 1) {
+            // This is 2 for "VLB " audio in NSV files.
+            // See samples/nsv/vlb_audio.
+            avpriv_report_missing_feature(ac->avctx,
+                                          "More than one AAC RDB per ADTS frame");
+            ac->warned_num_aac_frames = 1;
+        }
+        push_output_configuration(ac);
+        if (hdr_info.chan_config) {
+            ac->oc[1].m4ac.chan_config = hdr_info.chan_config;
+            if ((ret = set_default_channel_config(ac->avctx,
+                                                  layout_map,
+                                                  &layout_map_tags,
+                                                  hdr_info.chan_config)) < 0)
+                return ret;
+            if ((ret = output_configure(ac, layout_map, layout_map_tags,
+                                        FFMAX(ac->oc[1].status,
+                                              OC_TRIAL_FRAME), 0)) < 0)
+                return ret;
+        } else {
+            ac->oc[1].m4ac.chan_config = 0;
+            /**
+             * dual mono frames in Japanese DTV can have chan_config 0
+             * WITHOUT specifying PCE.
+             *  thus, set dual mono as default.
+             */
+            if (ac->dmono_mode && ac->oc[0].status == OC_NONE) {
+                layout_map_tags = 2;
+                layout_map[0][0] = layout_map[1][0] = TYPE_SCE;
+                layout_map[0][2] = layout_map[1][2] = AAC_CHANNEL_FRONT;
+                layout_map[0][1] = 0;
+                layout_map[1][1] = 1;
+                if (output_configure(ac, layout_map, layout_map_tags,
+                                     OC_TRIAL_FRAME, 0))
+                    return -7;
+            }
+        }
+        ac->oc[1].m4ac.sample_rate     = hdr_info.sample_rate;
+        ac->oc[1].m4ac.sampling_index  = hdr_info.sampling_index;
+        ac->oc[1].m4ac.object_type     = hdr_info.object_type;
+        ac->oc[1].m4ac.frame_length_short = 0;
+        if (ac->oc[0].status != OC_LOCKED ||
+            ac->oc[0].m4ac.chan_config != hdr_info.chan_config ||
+            ac->oc[0].m4ac.sample_rate != hdr_info.sample_rate) {
+            ac->oc[1].m4ac.sbr = -1;
+            ac->oc[1].m4ac.ps  = -1;
+        }
+        if (!hdr_info.crc_absent)
+            skip_bits(gb, 16);
+    }
+    return size;
+}
+
+static int aac_decode_er_frame(AVCodecContext *avctx, void *data,
+                               int *got_frame_ptr, GetBitContext *gb)
+{
+    AACContext *ac = avctx->priv_data;
+    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
+    ChannelElement *che;
+    int err, i;
+    int samples = m4ac->frame_length_short ? 960 : 1024;
+    int chan_config = m4ac->chan_config;
+    int aot = m4ac->object_type;
+
+    if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
+        samples >>= 1;
+
+    ac->frame = data;
+
+    if ((err = frame_configure_elements(avctx)) < 0)
+        return err;
+
+    // The FF_PROFILE_AAC_* defines are all object_type - 1
+    // This may lead to an undefined profile being signaled
+    ac->avctx->profile = aot - 1;
+
+    ac->tags_mapped = 0;
+
+    if (chan_config < 0 || (chan_config >= 8 && chan_config < 11) || chan_config >= 13) {
+        avpriv_request_sample(avctx, "Unknown ER channel configuration %d",
+                              chan_config);
+        return AVERROR_INVALIDDATA;
+    }
+    for (i = 0; i < tags_per_config[chan_config]; i++) {
+        const int elem_type = aac_channel_layout_map[chan_config-1][i][0];
+        const int elem_id   = aac_channel_layout_map[chan_config-1][i][1];
+        if (!(che=get_che(ac, elem_type, elem_id))) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "channel element %d.%d is not allocated\n",
+                   elem_type, elem_id);
+            return AVERROR_INVALIDDATA;
+        }
+        che->present = 1;
+        if (aot != AOT_ER_AAC_ELD)
+            skip_bits(gb, 4);
+        switch (elem_type) {
+        case TYPE_SCE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            break;
+        case TYPE_CPE:
+            err = decode_cpe(ac, gb, che);
+            break;
+        case TYPE_LFE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            break;
+        }
+        if (err < 0)
+            return err;
+    }
+
+    spectral_to_sample(ac, samples);
+
+    if (!ac->frame->data[0] && samples) {
+        av_log(avctx, AV_LOG_ERROR, "no frame data found\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ac->frame->nb_samples = samples;
+    ac->frame->sample_rate = avctx->sample_rate;
+    *got_frame_ptr = 1;
+
+    skip_bits_long(gb, get_bits_left(gb));
+    return 0;
+}
+
+static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
+                                int *got_frame_ptr, GetBitContext *gb, AVPacket *avpkt)
+{
+    AACContext *ac = avctx->priv_data;
+    ChannelElement *che = NULL, *che_prev = NULL;
+    enum RawDataBlockType elem_type, che_prev_type = TYPE_END;
+    int err, elem_id;
+    int samples = 0, multiplier, audio_found = 0, pce_found = 0;
+    int is_dmono, sce_count = 0;
+    int payload_alignment;
+    uint8_t che_presence[4][MAX_ELEM_ID] = {{0}};
+
+    ac->frame = data;
+
+    if (show_bits(gb, 12) == 0xfff) {
+        if ((err = parse_adts_frame_header(ac, gb)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
+            goto fail;
+        }
+        if (ac->oc[1].m4ac.sampling_index > 12) {
+            av_log(ac->avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->oc[1].m4ac.sampling_index);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    if ((err = frame_configure_elements(avctx)) < 0)
+        goto fail;
+
+    // The FF_PROFILE_AAC_* defines are all object_type - 1
+    // This may lead to an undefined profile being signaled
+    ac->avctx->profile = ac->oc[1].m4ac.object_type - 1;
+
+    payload_alignment = get_bits_count(gb);
+    ac->tags_mapped = 0;
+    // parse
+    while ((elem_type = get_bits(gb, 3)) != TYPE_END) {
+        elem_id = get_bits(gb, 4);
+
+        if (avctx->debug & FF_DEBUG_STARTCODE)
+            av_log(avctx, AV_LOG_DEBUG, "Elem type:%x id:%x\n", elem_type, elem_id);
+
+        if (!avctx->channels && elem_type != TYPE_PCE) {
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        if (elem_type < TYPE_DSE) {
+            if (che_presence[elem_type][elem_id]) {
+                int error = che_presence[elem_type][elem_id] > 1;
+                av_log(ac->avctx, error ? AV_LOG_ERROR : AV_LOG_DEBUG, "channel element %d.%d duplicate\n",
+                       elem_type, elem_id);
+                if (error) {
+                    err = AVERROR_INVALIDDATA;
+                    goto fail;
+                }
+            }
+            che_presence[elem_type][elem_id]++;
+
+            if (!(che=get_che(ac, elem_type, elem_id))) {
+                av_log(ac->avctx, AV_LOG_ERROR, "channel element %d.%d is not allocated\n",
+                       elem_type, elem_id);
+                err = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            samples = ac->oc[1].m4ac.frame_length_short ? 960 : 1024;
+            che->present = 1;
+        }
+
+        switch (elem_type) {
+
+        case TYPE_SCE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            audio_found = 1;
+            sce_count++;
+            break;
+
+        case TYPE_CPE:
+            err = decode_cpe(ac, gb, che);
+            audio_found = 1;
+            break;
+
+        case TYPE_CCE:
+            err = decode_cce(ac, gb, che);
+            break;
+
+        case TYPE_LFE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            audio_found = 1;
+            break;
+
+        case TYPE_DSE:
+            err = skip_data_stream_element(ac, gb);
+            break;
+
+        case TYPE_PCE: {
+            uint8_t layout_map[MAX_ELEM_ID*4][3];
+            int tags;
+
+            int pushed = push_output_configuration(ac);
+            if (pce_found && !pushed) {
+                err = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+
+            tags = decode_pce(avctx, &ac->oc[1].m4ac, layout_map, gb,
+                              payload_alignment);
+            if (tags < 0) {
+                err = tags;
+                break;
+            }
+            if (pce_found) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Not evaluating a further program_config_element as this construct is dubious at best.\n");
+                pop_output_configuration(ac);
+            } else {
+                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
+                if (!err)
+                    ac->oc[1].m4ac.chan_config = 0;
+                pce_found = 1;
+            }
+            break;
+        }
+
+        case TYPE_FIL:
+            if (elem_id == 15)
+                elem_id += get_bits(gb, 8) - 1;
+            if (get_bits_left(gb) < 8 * elem_id) {
+                    av_log(avctx, AV_LOG_ERROR, "TYPE_FIL: "overread_err);
+                    err = AVERROR_INVALIDDATA;
+                    goto fail;
+            }
+            while (elem_id > 0)
+                elem_id -= decode_extension_payload(ac, gb, elem_id, che_prev, che_prev_type);
+            err = 0; /* FIXME */
+            break;
+
+        default:
+            err = AVERROR_BUG; /* should not happen, but keeps compiler happy */
+            break;
+        }
+
+        if (elem_type < TYPE_DSE) {
+            che_prev      = che;
+            che_prev_type = elem_type;
+        }
+
+        if (err)
+            goto fail;
+
+        if (get_bits_left(gb) < 3) {
+            av_log(avctx, AV_LOG_ERROR, overread_err);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    if (!avctx->channels) {
+        *got_frame_ptr = 0;
+        return 0;
+    }
+
+    multiplier = (ac->oc[1].m4ac.sbr == 1) ? ac->oc[1].m4ac.ext_sample_rate > ac->oc[1].m4ac.sample_rate : 0;
+    samples <<= multiplier;
+
+    spectral_to_sample(ac, samples);
+
+    if (ac->oc[1].status && audio_found) {
+        avctx->sample_rate = ac->oc[1].m4ac.sample_rate << multiplier;
+        avctx->frame_size = samples;
+        ac->oc[1].status = OC_LOCKED;
+    }
+
+    if (multiplier)
+        avctx->internal->skip_samples_multiplier = 2;
+
+    if (!ac->frame->data[0] && samples) {
+        av_log(avctx, AV_LOG_ERROR, "no frame data found\n");
+        err = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (samples) {
+        ac->frame->nb_samples = samples;
+        ac->frame->sample_rate = avctx->sample_rate;
+    } else
+        av_frame_unref(ac->frame);
+    *got_frame_ptr = !!samples;
+
+    /* for dual-mono audio (SCE + SCE) */
+    is_dmono = ac->dmono_mode && sce_count == 2 &&
+               ac->oc[1].channel_layout == (AV_CH_FRONT_LEFT | AV_CH_FRONT_RIGHT);
+    if (is_dmono) {
+        if (ac->dmono_mode == 1)
+            ((AVFrame *)data)->data[1] =((AVFrame *)data)->data[0];
+        else if (ac->dmono_mode == 2)
+            ((AVFrame *)data)->data[0] =((AVFrame *)data)->data[1];
+    }
+
+    return 0;
+fail:
+    pop_output_configuration(ac);
+    return err;
+}
+
+static int aac_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame_ptr, AVPacket *avpkt)
+{
+    AACContext *ac = avctx->priv_data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    GetBitContext gb;
+    int buf_consumed;
+    int buf_offset;
+    int err;
+    int new_extradata_size;
+    const uint8_t *new_extradata = av_packet_get_side_data(avpkt,
+                                       AV_PKT_DATA_NEW_EXTRADATA,
+                                       &new_extradata_size);
+    int jp_dualmono_size;
+    const uint8_t *jp_dualmono   = av_packet_get_side_data(avpkt,
+                                       AV_PKT_DATA_JP_DUALMONO,
+                                       &jp_dualmono_size);
+
+    if (new_extradata) {
+        /* discard previous configuration */
+        ac->oc[1].status = OC_NONE;
+        err = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
+                                           new_extradata,
+                                           new_extradata_size * 8LL, 1);
+        if (err < 0) {
+            return err;
+        }
+    }
+
+    ac->dmono_mode = 0;
+    if (jp_dualmono && jp_dualmono_size > 0)
+        ac->dmono_mode =  1 + *jp_dualmono;
+    if (ac->force_dmono_mode >= 0)
+        ac->dmono_mode = ac->force_dmono_mode;
+
+    if (INT_MAX / 8 <= buf_size)
+        return AVERROR_INVALIDDATA;
+
+    if ((err = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return err;
+
+    switch (ac->oc[1].m4ac.object_type) {
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LTP:
+    case AOT_ER_AAC_LD:
+    case AOT_ER_AAC_ELD:
+        err = aac_decode_er_frame(avctx, data, got_frame_ptr, &gb);
+        break;
+    default:
+        err = aac_decode_frame_int(avctx, data, got_frame_ptr, &gb, avpkt);
+    }
+    if (err < 0)
+        return err;
+
+    buf_consumed = (get_bits_count(&gb) + 7) >> 3;
+    for (buf_offset = buf_consumed; buf_offset < buf_size; buf_offset++)
+        if (buf[buf_offset])
+            break;
+
+    return buf_size > buf_offset ? buf_consumed : buf_size;
+}
+
+static av_cold int aac_decode_close(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int i, type;
+
+    for (i = 0; i < MAX_ELEM_ID; i++) {
+        for (type = 0; type < 4; type++) {
+            if (ac->che[type][i])
+                AAC_RENAME(ff_aac_sbr_ctx_close)(&ac->che[type][i]->sbr);
+            av_freep(&ac->che[type][i]);
+        }
+    }
+
+    ff_mdct_end(&ac->mdct);
+    ff_mdct_end(&ac->mdct_small);
+    ff_mdct_end(&ac->mdct_ld);
+    ff_mdct_end(&ac->mdct_ltp);
+#if !USE_FIXED
+    ff_mdct15_uninit(&ac->mdct120);
+    ff_mdct15_uninit(&ac->mdct480);
+    ff_mdct15_uninit(&ac->mdct960);
+#endif
+    av_freep(&ac->fdsp);
+    return 0;
+}
+
+static void aacdec_init(AACContext *c)
+{
+    c->imdct_and_windowing                      = imdct_and_windowing;
+    c->apply_ltp                                = apply_ltp;
+    c->apply_tns                                = apply_tns;
+    c->windowing_and_mdct_ltp                   = windowing_and_mdct_ltp;
+    c->update_ltp                               = update_ltp;
+#if USE_FIXED
+    c->vector_pow43                             = vector_pow43;
+    c->subband_scale                            = subband_scale;
+#endif
+
+#if !USE_FIXED
+    if(ARCH_MIPS)
+        ff_aacdec_init_mips(c);
+#endif /* !USE_FIXED */
+}
+/**
+ * AVOptions for Japanese DTV specific extensions (ADTS only)
+ */
+#define AACDEC_FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+static const AVOption options[] = {
+    {"dual_mono_mode", "Select the channel to decode for dual mono",
+     offsetof(AACContext, force_dmono_mode), AV_OPT_TYPE_INT, {.i64=-1}, -1, 2,
+     AACDEC_FLAGS, "dual_mono_mode"},
+
+    {"auto", "autoselection",            0, AV_OPT_TYPE_CONST, {.i64=-1}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"main", "Select Main/Left channel", 0, AV_OPT_TYPE_CONST, {.i64= 1}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"sub" , "Select Sub/Right channel", 0, AV_OPT_TYPE_CONST, {.i64= 2}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"both", "Select both channels",     0, AV_OPT_TYPE_CONST, {.i64= 0}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+
+    {NULL},
+};
+
+static const AVClass aac_decoder_class = {
+    .class_name = "AAC decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
diff --git a/libavcodec/aacdectab.h b/libavcodec/aacdectab.h
index b7c5f7e..baf51a7 100644
--- a/libavcodec/aacdectab.h
+++ b/libavcodec/aacdectab.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,49 +35,6 @@
 
 #include <stdint.h>
 
-/* @name ltp_coef
- * Table of the LTP coefficients
- */
-static const float ltp_coef[8] = {
-    0.570829, 0.696616, 0.813004, 0.911304,
-    0.984900, 1.067894, 1.194601, 1.369533,
-};
-
-/* @name tns_tmp2_map
- * Tables of the tmp2[] arrays of LPC coefficients used for TNS.
- * The suffix _M_N[] indicate the values of coef_compress and coef_res
- * respectively.
- * @{
- */
-static const float tns_tmp2_map_1_3[4] = {
-     0.00000000, -0.43388373,  0.64278758,  0.34202015,
-};
-
-static const float tns_tmp2_map_0_3[8] = {
-     0.00000000, -0.43388373, -0.78183150, -0.97492790,
-     0.98480773,  0.86602539,  0.64278758,  0.34202015,
-};
-
-static const float tns_tmp2_map_1_4[8] = {
-     0.00000000, -0.20791170, -0.40673664, -0.58778524,
-     0.67369562,  0.52643216,  0.36124167,  0.18374951,
-};
-
-static const float tns_tmp2_map_0_4[16] = {
-     0.00000000, -0.20791170, -0.40673664, -0.58778524,
-    -0.74314481, -0.86602539, -0.95105654, -0.99452192,
-     0.99573416,  0.96182561,  0.89516330,  0.79801720,
-     0.67369562,  0.52643216,  0.36124167,  0.18374951,
-};
-
-static const float * const tns_tmp2_map[4] = {
-    tns_tmp2_map_0_3,
-    tns_tmp2_map_0_4,
-    tns_tmp2_map_1_3,
-    tns_tmp2_map_1_4
-};
-// @}
-
 static const int8_t tags_per_config[16] = { 0, 1, 1, 2, 3, 3, 4, 5, 0, 0, 0, 4, 5, 0, 5, 0 };
 
 static const uint8_t aac_channel_layout_map[16][5][3] = {
diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index b7f60fb..4d0abb1 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -2,20 +2,20 @@
  * AAC encoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,10 @@
 /***********************************
  *              TODOs:
  * add sane pulse detection
- * add temporal noise shaping
  ***********************************/
 
+#include "libavutil/libm.h"
+#include "libavutil/thread.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
@@ -42,143 +43,93 @@
 #include "aac.h"
 #include "aactab.h"
 #include "aacenc.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
 
 #include "psymodel.h"
 
-#define AAC_MAX_CHANNELS 6
+static AVOnce aac_table_init = AV_ONCE_INIT;
 
-#define ERROR_IF(cond, ...) \
-    if (cond) { \
-        av_log(avctx, AV_LOG_ERROR, __VA_ARGS__); \
-        return AVERROR(EINVAL); \
+static void put_pce(PutBitContext *pb, AVCodecContext *avctx)
+{
+    int i, j;
+    AACEncContext *s = avctx->priv_data;
+    AACPCEInfo *pce = &s->pce;
+    const int bitexact = avctx->flags & AV_CODEC_FLAG_BITEXACT;
+    const char *aux_data = bitexact ? "Lavc" : LIBAVCODEC_IDENT;
+
+    put_bits(pb, 4, 0);
+
+    put_bits(pb, 2, avctx->profile);
+    put_bits(pb, 4, s->samplerate_index);
+
+    put_bits(pb, 4, pce->num_ele[0]); /* Front */
+    put_bits(pb, 4, pce->num_ele[1]); /* Side */
+    put_bits(pb, 4, pce->num_ele[2]); /* Back */
+    put_bits(pb, 2, pce->num_ele[3]); /* LFE */
+    put_bits(pb, 3, 0); /* Assoc data */
+    put_bits(pb, 4, 0); /* CCs */
+
+    put_bits(pb, 1, 0); /* Stereo mixdown */
+    put_bits(pb, 1, 0); /* Mono mixdown */
+    put_bits(pb, 1, 0); /* Something else */
+
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < pce->num_ele[i]; j++) {
+            if (i < 3)
+                put_bits(pb, 1, pce->pairing[i][j]);
+            put_bits(pb, 4, pce->index[i][j]);
+        }
     }
 
-float ff_aac_pow34sf_tab[428];
-
-static const uint8_t swb_size_1024_96[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
-    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_64[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
-    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
-    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
-};
-
-static const uint8_t swb_size_1024_48[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-    96
-};
-
-static const uint8_t swb_size_1024_32[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-};
-
-static const uint8_t swb_size_1024_24[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_16[] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
-    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_8[] = {
-    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
-    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
-    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
-};
-
-static const uint8_t * const swb_size_1024[] = {
-    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
-    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32,
-    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
-    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8
-};
-
-static const uint8_t swb_size_128_96[] = {
-    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
-};
-
-static const uint8_t swb_size_128_48[] = {
-    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
-};
-
-static const uint8_t swb_size_128_24[] = {
-    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
-};
-
-static const uint8_t swb_size_128_16[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
-};
-
-static const uint8_t swb_size_128_8[] = {
-    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
-};
-
-static const uint8_t * const swb_size_128[] = {
-    /* the last entry on the following row is swb_size_128_64 but is a
-       duplicate of swb_size_128_96 */
-    swb_size_128_96, swb_size_128_96, swb_size_128_96,
-    swb_size_128_48, swb_size_128_48, swb_size_128_48,
-    swb_size_128_24, swb_size_128_24, swb_size_128_16,
-    swb_size_128_16, swb_size_128_16, swb_size_128_8
-};
-
-/** default channel configurations */
-static const uint8_t aac_chan_configs[6][5] = {
- {1, TYPE_SCE},                               // 1 channel  - single channel element
- {1, TYPE_CPE},                               // 2 channels - channel pair
- {2, TYPE_SCE, TYPE_CPE},                     // 3 channels - center + stereo
- {3, TYPE_SCE, TYPE_CPE, TYPE_SCE},           // 4 channels - front center + stereo + back center
- {3, TYPE_SCE, TYPE_CPE, TYPE_CPE},           // 5 channels - front center + stereo + back stereo
- {4, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_LFE}, // 6 channels - front center + stereo + back stereo + LFE
-};
-
-/**
- * Table to remap channels from Libav's default order to AAC order.
- */
-static const uint8_t aac_chan_maps[AAC_MAX_CHANNELS][AAC_MAX_CHANNELS] = {
-    { 0 },
-    { 0, 1 },
-    { 2, 0, 1 },
-    { 2, 0, 1, 3 },
-    { 2, 0, 1, 3, 4 },
-    { 2, 0, 1, 4, 5, 3 },
-};
+    avpriv_align_put_bits(pb);
+    put_bits(pb, 8, strlen(aux_data));
+    avpriv_put_string(pb, aux_data, 0);
+}
 
 /**
  * Make AAC audio config object.
  * @see 1.6.2.1 "Syntax - AudioSpecificConfig"
  */
-static void put_audio_specific_config(AVCodecContext *avctx)
+static int put_audio_specific_config(AVCodecContext *avctx)
 {
     PutBitContext pb;
     AACEncContext *s = avctx->priv_data;
+    int channels = (!s->needs_pce)*(s->channels - (s->channels == 8 ? 1 : 0));
+    const int max_size = 32;
+
+    avctx->extradata = av_mallocz(max_size);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
 
-    init_put_bits(&pb, avctx->extradata, avctx->extradata_size*8);
-    put_bits(&pb, 5, 2); //object type - AAC-LC
+    init_put_bits(&pb, avctx->extradata, max_size);
+    put_bits(&pb, 5, s->profile+1); //profile
     put_bits(&pb, 4, s->samplerate_index); //sample rate index
-    put_bits(&pb, 4, s->channels);
+    put_bits(&pb, 4, channels);
     //GASpecificConfig
     put_bits(&pb, 1, 0); //frame length - 1024 samples
     put_bits(&pb, 1, 0); //does not depend on core coder
     put_bits(&pb, 1, 0); //is not extension
+    if (s->needs_pce)
+        put_pce(&pb, avctx);
 
     //Explicitly Mark SBR absent
     put_bits(&pb, 11, 0x2b7); //sync extension
     put_bits(&pb, 5,  AOT_SBR);
     put_bits(&pb, 1,  0);
     flush_put_bits(&pb);
+    avctx->extradata_size = put_bits_count(&pb) >> 3;
+
+    return 0;
+}
+
+void ff_quantize_band_cost_cache_init(struct AACEncContext *s)
+{
+    ++s->quantize_band_cost_cache_generation;
+    if (s->quantize_band_cost_cache_generation == 0) {
+        memset(s->quantize_band_cost_cache, 0, sizeof(s->quantize_band_cost_cache));
+        s->quantize_band_cost_cache_generation = 1;
+    }
 }
 
 #define WINDOW_FUNC(type) \
@@ -250,16 +201,17 @@ static void apply_window_and_mdct(AACEncContext *s, SingleChannelElement *sce,
                                   float *audio)
 {
     int i;
-    float *output = sce->ret_buf;
+    const float *output = sce->ret_buf;
 
-    apply_window[sce->ics.window_sequence[0]](&s->fdsp, sce, audio);
+    apply_window[sce->ics.window_sequence[0]](s->fdsp, sce, audio);
 
     if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE)
         s->mdct1024.mdct_calc(&s->mdct1024, sce->coeffs, output);
     else
         for (i = 0; i < 1024; i += 128)
-            s->mdct128.mdct_calc(&s->mdct128, sce->coeffs + i, output + i*2);
+            s->mdct128.mdct_calc(&s->mdct128, &sce->coeffs[i], output + i*2);
     memcpy(audio, audio + 1024, sizeof(audio[0]) * 1024);
+    memcpy(sce->pcoeffs, sce->coeffs, sizeof(sce->pcoeffs));
 }
 
 /**
@@ -275,7 +227,7 @@ static void put_ics_info(AACEncContext *s, IndividualChannelStream *info)
     put_bits(&s->pb, 1, info->use_kb_window[0]);
     if (info->window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
         put_bits(&s->pb, 6, info->max_sfb);
-        put_bits(&s->pb, 1, 0);            // no prediction
+        put_bits(&s->pb, 1, !!info->predictor_present);
     } else {
         put_bits(&s->pb, 4, info->max_sfb);
         for (w = 1; w < 8; w++)
@@ -304,27 +256,18 @@ static void encode_ms_info(PutBitContext *pb, ChannelElement *cpe)
 static void adjust_frame_information(ChannelElement *cpe, int chans)
 {
     int i, w, w2, g, ch;
-    int start, maxsfb, cmaxsfb;
+    int maxsfb, cmaxsfb;
 
     for (ch = 0; ch < chans; ch++) {
         IndividualChannelStream *ics = &cpe->ch[ch].ics;
-        start = 0;
         maxsfb = 0;
         cpe->ch[ch].pulse.num_pulse = 0;
-        for (w = 0; w < ics->num_windows*16; w += 16) {
-            for (g = 0; g < ics->num_swb; g++) {
-                //apply M/S
-                if (cpe->common_window && !ch && cpe->ms_mask[w + g]) {
-                    for (i = 0; i < ics->swb_sizes[g]; i++) {
-                        cpe->ch[0].coeffs[start+i] = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
-                        cpe->ch[1].coeffs[start+i] =  cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
-                    }
-                }
-                start += ics->swb_sizes[g];
+        for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+            for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+                for (cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w*16+cmaxsfb-1]; cmaxsfb--)
+                    ;
+                maxsfb = FFMAX(maxsfb, cmaxsfb);
             }
-            for (cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w+cmaxsfb-1]; cmaxsfb--)
-                ;
-            maxsfb = FFMAX(maxsfb, cmaxsfb);
         }
         ics->max_sfb = maxsfb;
 
@@ -360,6 +303,67 @@ static void adjust_frame_information(ChannelElement *cpe, int chans)
     }
 }
 
+static void apply_intensity_stereo(ChannelElement *cpe)
+{
+    int w, w2, g, i;
+    IndividualChannelStream *ics = &cpe->ch[0].ics;
+    if (!cpe->common_window)
+        return;
+    for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+        for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+            int start = (w+w2) * 128;
+            for (g = 0; g < ics->num_swb; g++) {
+                int p  = -1 + 2 * (cpe->ch[1].band_type[w*16+g] - 14);
+                float scale = cpe->ch[0].is_ener[w*16+g];
+                if (!cpe->is_mask[w*16 + g]) {
+                    start += ics->swb_sizes[g];
+                    continue;
+                }
+                if (cpe->ms_mask[w*16 + g])
+                    p *= -1;
+                for (i = 0; i < ics->swb_sizes[g]; i++) {
+                    float sum = (cpe->ch[0].coeffs[start+i] + p*cpe->ch[1].coeffs[start+i])*scale;
+                    cpe->ch[0].coeffs[start+i] = sum;
+                    cpe->ch[1].coeffs[start+i] = 0.0f;
+                }
+                start += ics->swb_sizes[g];
+            }
+        }
+    }
+}
+
+static void apply_mid_side_stereo(ChannelElement *cpe)
+{
+    int w, w2, g, i;
+    IndividualChannelStream *ics = &cpe->ch[0].ics;
+    if (!cpe->common_window)
+        return;
+    for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+        for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+            int start = (w+w2) * 128;
+            for (g = 0; g < ics->num_swb; g++) {
+                /* ms_mask can be used for other purposes in PNS and I/S,
+                 * so must not apply M/S if any band uses either, even if
+                 * ms_mask is set.
+                 */
+                if (!cpe->ms_mask[w*16 + g] || cpe->is_mask[w*16 + g]
+                    || cpe->ch[0].band_type[w*16 + g] >= NOISE_BT
+                    || cpe->ch[1].band_type[w*16 + g] >= NOISE_BT) {
+                    start += ics->swb_sizes[g];
+                    continue;
+                }
+                for (i = 0; i < ics->swb_sizes[g]; i++) {
+                    float L = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) * 0.5f;
+                    float R = L - cpe->ch[1].coeffs[start+i];
+                    cpe->ch[0].coeffs[start+i] = L;
+                    cpe->ch[1].coeffs[start+i] = R;
+                }
+                start += ics->swb_sizes[g];
+            }
+        }
+    }
+}
+
 /**
  * Encode scalefactor band coding type.
  */
@@ -367,6 +371,9 @@ static void encode_band_info(AACEncContext *s, SingleChannelElement *sce)
 {
     int w;
 
+    if (s->coder->set_special_band_scalefactors)
+        s->coder->set_special_band_scalefactors(s, sce);
+
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
         s->coder->encode_window_bands_info(s, sce, w, sce->ics.group_len[w], s->lambda);
 }
@@ -377,16 +384,30 @@ static void encode_band_info(AACEncContext *s, SingleChannelElement *sce)
 static void encode_scale_factors(AVCodecContext *avctx, AACEncContext *s,
                                  SingleChannelElement *sce)
 {
-    int off = sce->sf_idx[0], diff;
+    int diff, off_sf = sce->sf_idx[0], off_pns = sce->sf_idx[0] - NOISE_OFFSET;
+    int off_is = 0, noise_flag = 1;
     int i, w;
 
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         for (i = 0; i < sce->ics.max_sfb; i++) {
             if (!sce->zeroes[w*16 + i]) {
-                diff = sce->sf_idx[w*16 + i] - off + SCALE_DIFF_ZERO;
-                if (diff < 0 || diff > 120)
-                    av_log(avctx, AV_LOG_ERROR, "Scalefactor difference is too big to be coded\n");
-                off = sce->sf_idx[w*16 + i];
+                if (sce->band_type[w*16 + i] == NOISE_BT) {
+                    diff = sce->sf_idx[w*16 + i] - off_pns;
+                    off_pns = sce->sf_idx[w*16 + i];
+                    if (noise_flag-- > 0) {
+                        put_bits(&s->pb, NOISE_PRE_BITS, diff + NOISE_PRE);
+                        continue;
+                    }
+                } else if (sce->band_type[w*16 + i] == INTENSITY_BT  ||
+                           sce->band_type[w*16 + i] == INTENSITY_BT2) {
+                    diff = sce->sf_idx[w*16 + i] - off_is;
+                    off_is = sce->sf_idx[w*16 + i];
+                } else {
+                    diff = sce->sf_idx[w*16 + i] - off_sf;
+                    off_sf = sce->sf_idx[w*16 + i];
+                }
+                diff += SCALE_DIFF_ZERO;
+                av_assert0(diff >= 0 && diff <= 120);
                 put_bits(&s->pb, ff_aac_scalefactor_bits[diff], ff_aac_scalefactor_code[diff]);
             }
         }
@@ -426,18 +447,41 @@ static void encode_spectral_coeffs(AACEncContext *s, SingleChannelElement *sce)
                 start += sce->ics.swb_sizes[i];
                 continue;
             }
-            for (w2 = w; w2 < w + sce->ics.group_len[w]; w2++)
-                s->coder->quantize_and_encode_band(s, &s->pb, sce->coeffs + start + w2*128,
-                                                   sce->ics.swb_sizes[i],
+            for (w2 = w; w2 < w + sce->ics.group_len[w]; w2++) {
+                s->coder->quantize_and_encode_band(s, &s->pb,
+                                                   &sce->coeffs[start + w2*128],
+                                                   NULL, sce->ics.swb_sizes[i],
                                                    sce->sf_idx[w*16 + i],
                                                    sce->band_type[w*16 + i],
-                                                   s->lambda);
+                                                   s->lambda,
+                                                   sce->ics.window_clipping[w]);
+            }
             start += sce->ics.swb_sizes[i];
         }
     }
 }
 
 /**
+ * Downscale spectral coefficients for near-clipping windows to avoid artifacts
+ */
+static void avoid_clipping(AACEncContext *s, SingleChannelElement *sce)
+{
+    int start, i, j, w;
+
+    if (sce->ics.clip_avoidance_factor < 1.0f) {
+        for (w = 0; w < sce->ics.num_windows; w++) {
+            start = 0;
+            for (i = 0; i < sce->ics.max_sfb; i++) {
+                float *swb_coeffs = &sce->coeffs[start + w*128];
+                for (j = 0; j < sce->ics.swb_sizes[i]; j++)
+                    swb_coeffs[j] *= sce->ics.clip_avoidance_factor;
+                start += sce->ics.swb_sizes[i];
+            }
+        }
+    }
+}
+
+/**
  * Encode one channel of audio data.
  */
 static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s,
@@ -445,12 +489,19 @@ static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s,
                                      int common_window)
 {
     put_bits(&s->pb, 8, sce->sf_idx[0]);
-    if (!common_window)
+    if (!common_window) {
         put_ics_info(s, &sce->ics);
+        if (s->coder->encode_main_pred)
+            s->coder->encode_main_pred(s, sce);
+        if (s->coder->encode_ltp_info)
+            s->coder->encode_ltp_info(s, sce, 0);
+    }
     encode_band_info(s, sce);
     encode_scale_factors(avctx, s, sce);
     encode_pulses(s, &sce->pulse);
-    put_bits(&s->pb, 1, 0); //tns
+    put_bits(&s->pb, 1, !!sce->tns.present);
+    if (s->coder->encode_tns_info)
+        s->coder->encode_tns_info(s, sce);
     put_bits(&s->pb, 1, 0); //ssr
     encode_spectral_coeffs(s, sce);
     return 0;
@@ -478,13 +529,13 @@ static void put_bitstream_info(AACEncContext *s, const char *name)
 
 /*
  * Copy input samples.
- * Channels are reordered from Libav's default order to AAC order.
+ * Channels are reordered from libavcodec's default order to AAC order.
  */
 static void copy_input_samples(AACEncContext *s, const AVFrame *frame)
 {
     int ch;
     int end = 2048 + (frame ? frame->nb_samples : 0);
-    const uint8_t *channel_map = aac_chan_maps[s->channels - 1];
+    const uint8_t *channel_map = s->reorder_map;
 
     /* copy and remap input samples */
     for (ch = 0; ch < s->channels; ch++) {
@@ -508,18 +559,21 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     AACEncContext *s = avctx->priv_data;
     float **samples = s->planar_samples, *samples2, *la, *overlap;
     ChannelElement *cpe;
-    int i, ch, w, g, chans, tag, start_ch, ret;
+    SingleChannelElement *sce;
+    IndividualChannelStream *ics;
+    int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
+    int target_bits, rate_bits, too_many_bits, too_few_bits;
+    int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
     int chan_el_counter[4];
-    int frame_bits;
     FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
 
-    if (s->last_frame == 2)
-        return 0;
-
     /* add current frame to queue */
     if (frame) {
         if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
             return ret;
+    } else {
+        if (!s->afq.remaining_samples || (!s->afq.frame_alloc && !s->afq.frame_count))
+            return 0;
     }
 
     copy_input_samples(s, frame);
@@ -536,18 +590,22 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         chans    = tag == TYPE_CPE ? 2 : 1;
         cpe      = &s->cpe[i];
         for (ch = 0; ch < chans; ch++) {
-            IndividualChannelStream *ics = &cpe->ch[ch].ics;
-            int cur_channel = start_ch + ch;
-            overlap  = &samples[cur_channel][0];
+            int k;
+            float clip_avoidance_factor;
+            sce = &cpe->ch[ch];
+            ics = &sce->ics;
+            s->cur_channel = start_ch + ch;
+            overlap  = &samples[s->cur_channel][0];
             samples2 = overlap + 1024;
             la       = samples2 + (448+64);
             if (!frame)
                 la = NULL;
             if (tag == TYPE_LFE) {
-                wi[ch].window_type[0] = ONLY_LONG_SEQUENCE;
+                wi[ch].window_type[0] = wi[ch].window_type[1] = ONLY_LONG_SEQUENCE;
                 wi[ch].window_shape   = 0;
                 wi[ch].num_windows    = 1;
                 wi[ch].grouping[0]    = 1;
+                wi[ch].clipping[0]    = 0;
 
                 /* Only the lowest 12 coefficients are used in a LFE channel.
                  * The expression below results in only the bottom 8 coefficients
@@ -555,7 +613,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                  */
                 ics->num_swb = s->samplerate_index >= 8 ? 1 : 3;
             } else {
-                wi[ch] = s->psy.model->window(&s->psy, samples2, la, cur_channel,
+                wi[ch] = s->psy.model->window(&s->psy, samples2, la, s->cur_channel,
                                               ics->window_sequence[0]);
             }
             ics->window_sequence[1] = ics->window_sequence[0];
@@ -565,24 +623,71 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             ics->num_windows        = wi[ch].num_windows;
             ics->swb_sizes          = s->psy.bands    [ics->num_windows == 8];
             ics->num_swb            = tag == TYPE_LFE ? ics->num_swb : s->psy.num_bands[ics->num_windows == 8];
+            ics->max_sfb            = FFMIN(ics->max_sfb, ics->num_swb);
+            ics->swb_offset         = wi[ch].window_type[0] == EIGHT_SHORT_SEQUENCE ?
+                                        ff_swb_offset_128 [s->samplerate_index]:
+                                        ff_swb_offset_1024[s->samplerate_index];
+            ics->tns_max_bands      = wi[ch].window_type[0] == EIGHT_SHORT_SEQUENCE ?
+                                        ff_tns_max_bands_128 [s->samplerate_index]:
+                                        ff_tns_max_bands_1024[s->samplerate_index];
+
             for (w = 0; w < ics->num_windows; w++)
                 ics->group_len[w] = wi[ch].grouping[w];
 
-            apply_window_and_mdct(s, &cpe->ch[ch], overlap);
+            /* Calculate input sample maximums and evaluate clipping risk */
+            clip_avoidance_factor = 0.0f;
+            for (w = 0; w < ics->num_windows; w++) {
+                const float *wbuf = overlap + w * 128;
+                const int wlen = 2048 / ics->num_windows;
+                float max = 0;
+                int j;
+                /* mdct input is 2 * output */
+                for (j = 0; j < wlen; j++)
+                    max = FFMAX(max, fabsf(wbuf[j]));
+                wi[ch].clipping[w] = max;
+            }
+            for (w = 0; w < ics->num_windows; w++) {
+                if (wi[ch].clipping[w] > CLIP_AVOIDANCE_FACTOR) {
+                    ics->window_clipping[w] = 1;
+                    clip_avoidance_factor = FFMAX(clip_avoidance_factor, wi[ch].clipping[w]);
+                } else {
+                    ics->window_clipping[w] = 0;
+                }
+            }
+            if (clip_avoidance_factor > CLIP_AVOIDANCE_FACTOR) {
+                ics->clip_avoidance_factor = CLIP_AVOIDANCE_FACTOR / clip_avoidance_factor;
+            } else {
+                ics->clip_avoidance_factor = 1.0f;
+            }
+
+            apply_window_and_mdct(s, sce, overlap);
+
+            if (s->options.ltp && s->coder->update_ltp) {
+                s->coder->update_ltp(s, sce);
+                apply_window[sce->ics.window_sequence[0]](s->fdsp, sce, &sce->ltp_state[0]);
+                s->mdct1024.mdct_calc(&s->mdct1024, sce->lcoeffs, sce->ret_buf);
+            }
+
+            for (k = 0; k < 1024; k++) {
+                if (!(fabs(cpe->ch[ch].coeffs[k]) < 1E16)) { // Ensure headroom for energy calculation
+                    av_log(avctx, AV_LOG_ERROR, "Input contains (near) NaN/+-Inf\n");
+                    return AVERROR(EINVAL);
+                }
+            }
+            avoid_clipping(s, sce);
         }
         start_ch += chans;
     }
-    if ((ret = ff_alloc_packet(avpkt, 768 * s->channels))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 8192 * s->channels, 0)) < 0)
         return ret;
-    }
-
+    frame_bits = its = 0;
     do {
         init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
         if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT))
             put_bitstream_info(s, LIBAVCODEC_IDENT);
         start_ch = 0;
+        target_bits = 0;
         memset(chan_el_counter, 0, sizeof(chan_el_counter));
         for (i = 0; i < s->chan_map[0]; i++) {
             FFPsyWindowInfo* wi = windows + start_ch;
@@ -590,16 +695,39 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             tag      = s->chan_map[i+1];
             chans    = tag == TYPE_CPE ? 2 : 1;
             cpe      = &s->cpe[i];
+            cpe->common_window = 0;
+            memset(cpe->is_mask, 0, sizeof(cpe->is_mask));
+            memset(cpe->ms_mask, 0, sizeof(cpe->ms_mask));
             put_bits(&s->pb, 3, tag);
             put_bits(&s->pb, 4, chan_el_counter[tag]++);
-            for (ch = 0; ch < chans; ch++)
-                coeffs[ch] = cpe->ch[ch].coeffs;
+            for (ch = 0; ch < chans; ch++) {
+                sce = &cpe->ch[ch];
+                coeffs[ch] = sce->coeffs;
+                sce->ics.predictor_present = 0;
+                sce->ics.ltp.present = 0;
+                memset(sce->ics.ltp.used, 0, sizeof(sce->ics.ltp.used));
+                memset(sce->ics.prediction_used, 0, sizeof(sce->ics.prediction_used));
+                memset(&sce->tns, 0, sizeof(TemporalNoiseShaping));
+                for (w = 0; w < 128; w++)
+                    if (sce->band_type[w] > RESERVED_BT)
+                        sce->band_type[w] = 0;
+            }
+            s->psy.bitres.alloc = -1;
+            s->psy.bitres.bits = s->last_frame_pb_count / s->channels;
             s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
+            if (s->psy.bitres.alloc > 0) {
+                /* Lambda unused here on purpose, we need to take psy's unscaled allocation */
+                target_bits += s->psy.bitres.alloc
+                    * (s->lambda / (avctx->global_quality ? avctx->global_quality : 120));
+                s->psy.bitres.alloc /= chans;
+            }
+            s->cur_type = tag;
             for (ch = 0; ch < chans; ch++) {
                 s->cur_channel = start_ch + ch;
+                if (s->options.pns && s->coder->mark_pns)
+                    s->coder->mark_pns(s, avctx, &cpe->ch[ch]);
                 s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda);
             }
-            cpe->common_window = 0;
             if (chans > 1
                 && wi[0].window_type[0] == wi[1].window_type[0]
                 && wi[0].window_shape   == wi[1].window_shape) {
@@ -612,23 +740,73 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                     }
                 }
             }
+            for (ch = 0; ch < chans; ch++) { /* TNS and PNS */
+                sce = &cpe->ch[ch];
+                s->cur_channel = start_ch + ch;
+                if (s->options.tns && s->coder->search_for_tns)
+                    s->coder->search_for_tns(s, sce);
+                if (s->options.tns && s->coder->apply_tns_filt)
+                    s->coder->apply_tns_filt(s, sce);
+                if (sce->tns.present)
+                    tns_mode = 1;
+                if (s->options.pns && s->coder->search_for_pns)
+                    s->coder->search_for_pns(s, avctx, sce);
+            }
             s->cur_channel = start_ch;
-            if (s->options.stereo_mode && cpe->common_window) {
-                if (s->options.stereo_mode > 0) {
-                    IndividualChannelStream *ics = &cpe->ch[0].ics;
-                    for (w = 0; w < ics->num_windows; w += ics->group_len[w])
-                        for (g = 0;  g < ics->num_swb; g++)
-                            cpe->ms_mask[w*16+g] = 1;
-                } else if (s->coder->search_for_ms) {
-                    s->coder->search_for_ms(s, cpe, s->lambda);
+            if (s->options.intensity_stereo) { /* Intensity Stereo */
+                if (s->coder->search_for_is)
+                    s->coder->search_for_is(s, avctx, cpe);
+                if (cpe->is_mode) is_mode = 1;
+                apply_intensity_stereo(cpe);
+            }
+            if (s->options.pred) { /* Prediction */
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->options.pred && s->coder->search_for_pred)
+                        s->coder->search_for_pred(s, sce);
+                    if (cpe->ch[ch].ics.predictor_present) pred_mode = 1;
+                }
+                if (s->coder->adjust_common_pred)
+                    s->coder->adjust_common_pred(s, cpe);
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->options.pred && s->coder->apply_main_pred)
+                        s->coder->apply_main_pred(s, sce);
                 }
+                s->cur_channel = start_ch;
+            }
+            if (s->options.mid_side) { /* Mid/Side stereo */
+                if (s->options.mid_side == -1 && s->coder->search_for_ms)
+                    s->coder->search_for_ms(s, cpe);
+                else if (cpe->common_window)
+                    memset(cpe->ms_mask, 1, sizeof(cpe->ms_mask));
+                apply_mid_side_stereo(cpe);
             }
             adjust_frame_information(cpe, chans);
+            if (s->options.ltp) { /* LTP */
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->coder->search_for_ltp)
+                        s->coder->search_for_ltp(s, sce, cpe->common_window);
+                    if (sce->ics.ltp.present) pred_mode = 1;
+                }
+                s->cur_channel = start_ch;
+                if (s->coder->adjust_common_ltp)
+                    s->coder->adjust_common_ltp(s, cpe);
+            }
             if (chans == 2) {
                 put_bits(&s->pb, 1, cpe->common_window);
                 if (cpe->common_window) {
                     put_ics_info(s, &cpe->ch[0].ics);
+                    if (s->coder->encode_main_pred)
+                        s->coder->encode_main_pred(s, &cpe->ch[0]);
+                    if (s->coder->encode_ltp_info)
+                        s->coder->encode_ltp_info(s, &cpe->ch[0], 1);
                     encode_ms_info(&s->pb, cpe);
+                    if (cpe->ms_mode) ms_mode = 1;
                 }
             }
             for (ch = 0; ch < chans; ch++) {
@@ -638,34 +816,77 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             start_ch += chans;
         }
 
-        frame_bits = put_bits_count(&s->pb);
-        if (frame_bits <= 6144 * s->channels - 3) {
-            s->psy.bitres.bits = frame_bits / s->channels;
+        if (avctx->flags & AV_CODEC_FLAG_QSCALE) {
+            /* When using a constant Q-scale, don't mess with lambda */
             break;
         }
 
-        s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate / frame_bits;
+        /* rate control stuff
+         * allow between the nominal bitrate, and what psy's bit reservoir says to target
+         * but drift towards the nominal bitrate always
+         */
+        frame_bits = put_bits_count(&s->pb);
+        rate_bits = avctx->bit_rate * 1024 / avctx->sample_rate;
+        rate_bits = FFMIN(rate_bits, 6144 * s->channels - 3);
+        too_many_bits = FFMAX(target_bits, rate_bits);
+        too_many_bits = FFMIN(too_many_bits, 6144 * s->channels - 3);
+        too_few_bits = FFMIN(FFMAX(rate_bits - rate_bits/4, target_bits), too_many_bits);
+
+        /* When using ABR, be strict (but only for increasing) */
+        too_few_bits = too_few_bits - too_few_bits/8;
+        too_many_bits = too_many_bits + too_many_bits/2;
+
+        if (   its == 0 /* for steady-state Q-scale tracking */
+            || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
+            || frame_bits >= 6144 * s->channels - 3  )
+        {
+            float ratio = ((float)rate_bits) / frame_bits;
+
+            if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
+                /*
+                 * This path is for steady-state Q-scale tracking
+                 * When frame bits fall within the stable range, we still need to adjust
+                 * lambda to maintain it like so in a stable fashion (large jumps in lambda
+                 * create artifacts and should be avoided), but slowly
+                 */
+                ratio = sqrtf(sqrtf(ratio));
+                ratio = av_clipf(ratio, 0.9f, 1.1f);
+            } else {
+                /* Not so fast though */
+                ratio = sqrtf(ratio);
+            }
+            s->lambda = FFMIN(s->lambda * ratio, 65536.f);
 
+            /* Keep iterating if we must reduce and lambda is in the sky */
+            if (ratio > 0.9f && ratio < 1.1f) {
+                break;
+            } else {
+                if (is_mode || ms_mode || tns_mode || pred_mode) {
+                    for (i = 0; i < s->chan_map[0]; i++) {
+                        // Must restore coeffs
+                        chans = tag == TYPE_CPE ? 2 : 1;
+                        cpe = &s->cpe[i];
+                        for (ch = 0; ch < chans; ch++)
+                            memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, sizeof(cpe->ch[ch].coeffs));
+                    }
+                }
+                its++;
+            }
+        } else {
+            break;
+        }
     } while (1);
 
+    if (s->options.ltp && s->coder->ltp_insert_new_frame)
+        s->coder->ltp_insert_new_frame(s);
+
     put_bits(&s->pb, 3, TYPE_END);
     flush_put_bits(&s->pb);
-    frame_bits = put_bits_count(&s->pb);
-#if FF_API_STAT_BITS
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->frame_bits = frame_bits;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    // rate control stuff
-    if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
-        float ratio = avctx->bit_rate * 1024.0f / avctx->sample_rate / frame_bits;
-        s->lambda *= ratio;
-        s->lambda = FFMIN(s->lambda, 65536.f);
-    }
 
-    if (!frame)
-        s->last_frame++;
+    s->last_frame_pb_count = put_bits_count(&s->pb);
+
+    s->lambda_sum += s->lambda;
+    s->lambda_count++;
 
     ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
                        &avpkt->duration);
@@ -679,13 +900,17 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
 {
     AACEncContext *s = avctx->priv_data;
 
+    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_sum / s->lambda_count);
+
     ff_mdct_end(&s->mdct1024);
     ff_mdct_end(&s->mdct128);
     ff_psy_end(&s->psy);
+    ff_lpc_end(&s->lpc);
     if (s->psypp)
         ff_psy_preprocess_end(s->psypp);
     av_freep(&s->buffer.samples);
     av_freep(&s->cpe);
+    av_freep(&s->fdsp);
     ff_af_queue_close(&s->afq);
     return 0;
 }
@@ -694,7 +919,9 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
 {
     int ret = 0;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     // window init
     ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
@@ -702,9 +929,9 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
     ff_init_ff_sine_windows(10);
     ff_init_ff_sine_windows(7);
 
-    if (ret = ff_mdct_init(&s->mdct1024, 11, 0, 32768.0))
+    if ((ret = ff_mdct_init(&s->mdct1024, 11, 0, 32768.0)) < 0)
         return ret;
-    if (ret = ff_mdct_init(&s->mdct128,   8, 0, 32768.0))
+    if ((ret = ff_mdct_init(&s->mdct128,   8, 0, 32768.0)) < 0)
         return ret;
 
     return 0;
@@ -713,9 +940,8 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
 static av_cold int alloc_buffers(AVCodecContext *avctx, AACEncContext *s)
 {
     int ch;
-    FF_ALLOCZ_OR_GOTO(avctx, s->buffer.samples, 3 * 1024 * s->channels * sizeof(s->buffer.samples[0]), alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, s->cpe, sizeof(ChannelElement) * s->chan_map[0], alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, avctx->extradata, 5 + AV_INPUT_BUFFER_PADDING_SIZE, alloc_fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->buffer.samples, s->channels, 3 * 1024 * sizeof(s->buffer.samples[0]), alloc_fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->cpe, s->chan_map[0], sizeof(ChannelElement), alloc_fail);
 
     for(ch = 0; ch < s->channels; ch++)
         s->planar_samples[ch] = s->buffer.samples + 3 * 1024 * ch;
@@ -725,6 +951,11 @@ alloc_fail:
     return AVERROR(ENOMEM);
 }
 
+static av_cold void aac_encode_init_tables(void)
+{
+    ff_aac_tableinit();
+}
+
 static av_cold int aac_encode_init(AVCodecContext *avctx)
 {
     AACEncContext *s = avctx->priv_data;
@@ -733,28 +964,117 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
     uint8_t grouping[AAC_MAX_CHANNELS];
     int lengths[2];
 
+    /* Constants */
+    s->last_frame_pb_count = 0;
     avctx->frame_size = 1024;
+    avctx->initial_padding = 1024;
+    s->lambda = avctx->global_quality > 0 ? avctx->global_quality : 120;
 
-    for (i = 0; i < 16; i++)
-        if (avctx->sample_rate == avpriv_mpeg4audio_sample_rates[i])
+    /* Channel map and unspecified bitrate guessing */
+    s->channels = avctx->channels;
+
+    s->needs_pce = 1;
+    for (i = 0; i < FF_ARRAY_ELEMS(aac_normal_chan_layouts); i++) {
+        if (avctx->channel_layout == aac_normal_chan_layouts[i]) {
+            s->needs_pce = s->options.pce;
             break;
+        }
+    }
 
-    s->channels = avctx->channels;
+    if (s->needs_pce) {
+        char buf[64];
+        for (i = 0; i < FF_ARRAY_ELEMS(aac_pce_configs); i++)
+            if (avctx->channel_layout == aac_pce_configs[i].layout)
+                break;
+        av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+        ERROR_IF(i == FF_ARRAY_ELEMS(aac_pce_configs), "Unsupported channel layout \"%s\"\n", buf);
+        av_log(avctx, AV_LOG_INFO, "Using a PCE to encode channel layout \"%s\"\n", buf);
+        s->pce = aac_pce_configs[i];
+        s->reorder_map = s->pce.reorder_map;
+        s->chan_map = s->pce.config_map;
+    } else {
+        s->reorder_map = aac_chan_maps[s->channels - 1];
+        s->chan_map = aac_chan_configs[s->channels - 1];
+    }
 
-    ERROR_IF(i == 16,
+    if (!avctx->bit_rate) {
+        for (i = 1; i <= s->chan_map[0]; i++) {
+            avctx->bit_rate += s->chan_map[i] == TYPE_CPE ? 128000 : /* Pair */
+                               s->chan_map[i] == TYPE_LFE ? 16000  : /* LFE  */
+                                                            69000  ; /* SCE  */
+        }
+    }
+
+    /* Samplerate */
+    for (i = 0; i < 16; i++)
+        if (avctx->sample_rate == avpriv_mpeg4audio_sample_rates[i])
+            break;
+    s->samplerate_index = i;
+    ERROR_IF(s->samplerate_index == 16 ||
+             s->samplerate_index >= ff_aac_swb_size_1024_len ||
+             s->samplerate_index >= ff_aac_swb_size_128_len,
              "Unsupported sample rate %d\n", avctx->sample_rate);
-    ERROR_IF(s->channels > AAC_MAX_CHANNELS,
-             "Unsupported number of channels: %d\n", s->channels);
-    ERROR_IF(avctx->profile != FF_PROFILE_UNKNOWN && avctx->profile != FF_PROFILE_AAC_LOW,
-             "Unsupported profile %d\n", avctx->profile);
-    ERROR_IF(1024.0 * avctx->bit_rate / avctx->sample_rate > 6144 * s->channels,
-             "Too many bits %f > %d per frame requested\n",
+
+    /* Bitrate limiting */
+    WARN_IF(1024.0 * avctx->bit_rate / avctx->sample_rate > 6144 * s->channels,
+             "Too many bits %f > %d per frame requested, clamping to max\n",
              1024.0 * avctx->bit_rate / avctx->sample_rate,
              6144 * s->channels);
+    avctx->bit_rate = (int64_t)FFMIN(6144 * s->channels / 1024.0 * avctx->sample_rate,
+                                     avctx->bit_rate);
+
+    /* Profile and option setting */
+    avctx->profile = avctx->profile == FF_PROFILE_UNKNOWN ? FF_PROFILE_AAC_LOW :
+                     avctx->profile;
+    for (i = 0; i < FF_ARRAY_ELEMS(aacenc_profiles); i++)
+        if (avctx->profile == aacenc_profiles[i])
+            break;
+    if (avctx->profile == FF_PROFILE_MPEG2_AAC_LOW) {
+        avctx->profile = FF_PROFILE_AAC_LOW;
+        ERROR_IF(s->options.pred,
+                 "Main prediction unavailable in the \"mpeg2_aac_low\" profile\n");
+        ERROR_IF(s->options.ltp,
+                 "LTP prediction unavailable in the \"mpeg2_aac_low\" profile\n");
+        WARN_IF(s->options.pns,
+                "PNS unavailable in the \"mpeg2_aac_low\" profile, turning off\n");
+        s->options.pns = 0;
+    } else if (avctx->profile == FF_PROFILE_AAC_LTP) {
+        s->options.ltp = 1;
+        ERROR_IF(s->options.pred,
+                 "Main prediction unavailable in the \"aac_ltp\" profile\n");
+    } else if (avctx->profile == FF_PROFILE_AAC_MAIN) {
+        s->options.pred = 1;
+        ERROR_IF(s->options.ltp,
+                 "LTP prediction unavailable in the \"aac_main\" profile\n");
+    } else if (s->options.ltp) {
+        avctx->profile = FF_PROFILE_AAC_LTP;
+        WARN_IF(1,
+                "Chainging profile to \"aac_ltp\"\n");
+        ERROR_IF(s->options.pred,
+                 "Main prediction unavailable in the \"aac_ltp\" profile\n");
+    } else if (s->options.pred) {
+        avctx->profile = FF_PROFILE_AAC_MAIN;
+        WARN_IF(1,
+                "Chainging profile to \"aac_main\"\n");
+        ERROR_IF(s->options.ltp,
+                 "LTP prediction unavailable in the \"aac_main\" profile\n");
+    }
+    s->profile = avctx->profile;
+
+    /* Coder limitations */
+    s->coder = &ff_aac_coders[s->options.coder];
+    if (s->options.coder == AAC_CODER_ANMR) {
+        ERROR_IF(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL,
+                 "The ANMR coder is considered experimental, add -strict -2 to enable!\n");
+        s->options.intensity_stereo = 0;
+        s->options.pns = 0;
+    }
+    ERROR_IF(s->options.ltp && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL,
+             "The LPT profile requires experimental compliance, add -strict -2 to enable!\n");
 
-    s->samplerate_index = i;
-
-    s->chan_map = aac_chan_configs[s->channels-1];
+    /* M/S introduces horrible artifacts with multichannel files, this is temporary */
+    if (s->channels > 3)
+        s->options.mid_side = 0;
 
     if ((ret = dsp_init(avctx, s)) < 0)
         goto fail;
@@ -762,29 +1082,34 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
     if ((ret = alloc_buffers(avctx, s)) < 0)
         goto fail;
 
-    avctx->extradata_size = 5;
-    put_audio_specific_config(avctx);
+    if ((ret = put_audio_specific_config(avctx)))
+        goto fail;
 
-    sizes[0]   = swb_size_1024[i];
-    sizes[1]   = swb_size_128[i];
-    lengths[0] = ff_aac_num_swb_1024[i];
-    lengths[1] = ff_aac_num_swb_128[i];
+    sizes[0]   = ff_aac_swb_size_1024[s->samplerate_index];
+    sizes[1]   = ff_aac_swb_size_128[s->samplerate_index];
+    lengths[0] = ff_aac_num_swb_1024[s->samplerate_index];
+    lengths[1] = ff_aac_num_swb_128[s->samplerate_index];
     for (i = 0; i < s->chan_map[0]; i++)
         grouping[i] = s->chan_map[i + 1] == TYPE_CPE;
     if ((ret = ff_psy_init(&s->psy, avctx, 2, sizes, lengths,
                            s->chan_map[0], grouping)) < 0)
         goto fail;
     s->psypp = ff_psy_preprocess_init(avctx);
-    s->coder = &ff_aac_coders[2];
+    ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON);
+    s->random_state = 0x1f2e3d4c;
 
-    s->lambda = avctx->global_quality ? avctx->global_quality : 120;
+    s->abs_pow34   = abs_pow34_v;
+    s->quant_bands = quantize_bands;
 
-    ff_aac_tableinit();
+    if (ARCH_X86)
+        ff_aac_dsp_init_x86(s);
 
-    for (i = 0; i < 428; i++)
-        ff_aac_pow34sf_tab[i] = sqrt(ff_aac_pow2sf_tab[i] * sqrt(ff_aac_pow2sf_tab[i]));
+    if (HAVE_MIPSDSP)
+        ff_aac_coder_init_mips(s);
+
+    if ((ret = ff_thread_once(&aac_table_init, &aac_encode_init_tables)) != 0)
+        return AVERROR_UNKNOWN;
 
-    avctx->initial_padding = 1024;
     ff_af_queue_init(avctx, &s->afq);
 
     return 0;
@@ -795,10 +1120,17 @@ fail:
 
 #define AACENC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
 static const AVOption aacenc_options[] = {
-    {"stereo_mode", "Stereo coding method", offsetof(AACEncContext, options.stereo_mode), AV_OPT_TYPE_INT, {.i64 = 0}, -1, 1, AACENC_FLAGS, "stereo_mode"},
-        {"auto",     "Selected by the Encoder", 0, AV_OPT_TYPE_CONST, {.i64 = -1 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
-        {"ms_off",   "Disable Mid/Side coding", 0, AV_OPT_TYPE_CONST, {.i64 =  0 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
-        {"ms_force", "Force Mid/Side for the whole frame if possible", 0, AV_OPT_TYPE_CONST, {.i64 =  1 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
+    {"aac_coder", "Coding algorithm", offsetof(AACEncContext, options.coder), AV_OPT_TYPE_INT, {.i64 = AAC_CODER_FAST}, 0, AAC_CODER_NB-1, AACENC_FLAGS, "coder"},
+        {"anmr",     "ANMR method",               0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_ANMR},    INT_MIN, INT_MAX, AACENC_FLAGS, "coder"},
+        {"twoloop",  "Two loop searching method", 0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_TWOLOOP}, INT_MIN, INT_MAX, AACENC_FLAGS, "coder"},
+        {"fast",     "Default fast search",       0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_FAST},    INT_MIN, INT_MAX, AACENC_FLAGS, "coder"},
+    {"aac_ms", "Force M/S stereo coding", offsetof(AACEncContext, options.mid_side), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, AACENC_FLAGS},
+    {"aac_is", "Intensity stereo coding", offsetof(AACEncContext, options.intensity_stereo), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS},
+    {"aac_pns", "Perceptual noise substitution", offsetof(AACEncContext, options.pns), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS},
+    {"aac_tns", "Temporal noise shaping", offsetof(AACEncContext, options.tns), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS},
+    {"aac_ltp", "Long term prediction", offsetof(AACEncContext, options.ltp), AV_OPT_TYPE_BOOL, {.i64 = 0}, -1, 1, AACENC_FLAGS},
+    {"aac_pred", "AAC-Main prediction", offsetof(AACEncContext, options.pred), AV_OPT_TYPE_BOOL, {.i64 = 0}, -1, 1, AACENC_FLAGS},
+    {"aac_pce", "Forces the use of PCEs", offsetof(AACEncContext, options.pce), AV_OPT_TYPE_BOOL, {.i64 = 0}, -1, 1, AACENC_FLAGS},
     {NULL}
 };
 
@@ -809,6 +1141,11 @@ static const AVClass aacenc_class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
+static const AVCodecDefault aac_encode_defaults[] = {
+    { "b", "0" },
+    { NULL }
+};
+
 AVCodec ff_aac_encoder = {
     .name           = "aac",
     .long_name      = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
@@ -818,8 +1155,10 @@ AVCodec ff_aac_encoder = {
     .init           = aac_encode_init,
     .encode2        = aac_encode_frame,
     .close          = aac_encode_end,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY |
-                      AV_CODEC_CAP_EXPERIMENTAL,
+    .defaults       = aac_encode_defaults,
+    .supported_samplerates = mpeg4audio_sample_rates,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP,
                                                      AV_SAMPLE_FMT_NONE },
     .priv_class     = &aacenc_class,
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index f77b200..5a015ca 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -2,20 +2,20 @@
  * AAC encoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,25 @@
 #include "audio_frame_queue.h"
 #include "psymodel.h"
 
+#include "lpc.h"
+
+typedef enum AACCoder {
+    AAC_CODER_ANMR = 0,
+    AAC_CODER_TWOLOOP,
+    AAC_CODER_FAST,
+
+    AAC_CODER_NB,
+}AACCoder;
+
 typedef struct AACEncOptions {
-    int stereo_mode;
+    int coder;
+    int pns;
+    int tns;
+    int ltp;
+    int pce;
+    int pred;
+    int mid_side;
+    int intensity_stereo;
 } AACEncOptions;
 
 struct AACEncContext;
@@ -41,13 +58,318 @@ typedef struct AACCoefficientsEncoder {
                                   SingleChannelElement *sce, const float lambda);
     void (*encode_window_bands_info)(struct AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda);
-    void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, int size,
-                                     int scale_idx, int cb, const float lambda);
-    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe, const float lambda);
+    void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, float *out, int size,
+                                     int scale_idx, int cb, const float lambda, int rtz);
+    void (*encode_tns_info)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*encode_ltp_info)(struct AACEncContext *s, SingleChannelElement *sce, int common_window);
+    void (*encode_main_pred)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*adjust_common_pred)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*adjust_common_ltp)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*apply_main_pred)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*apply_tns_filt)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*update_ltp)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*ltp_insert_new_frame)(struct AACEncContext *s);
+    void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*mark_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*search_for_tns)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*search_for_ltp)(struct AACEncContext *s, SingleChannelElement *sce, int common_window);
+    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*search_for_is)(struct AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
+    void (*search_for_pred)(struct AACEncContext *s, SingleChannelElement *sce);
 } AACCoefficientsEncoder;
 
 extern const AACCoefficientsEncoder ff_aac_coders[];
 
+typedef struct AACQuantizeBandCostCacheEntry {
+    float rd;
+    float energy;
+    int bits;
+    char cb;
+    char rtz;
+    uint16_t generation;
+} AACQuantizeBandCostCacheEntry;
+
+typedef struct AACPCEInfo {
+    int64_t layout;
+    int num_ele[4];                              ///< front, side, back, lfe
+    int pairing[3][8];                           ///< front, side, back
+    int index[4][8];                             ///< front, side, back, lfe
+    uint8_t config_map[16];                      ///< configs the encoder's channel specific settings
+    uint8_t reorder_map[16];                     ///< maps channels from lavc to aac order
+} AACPCEInfo;
+
+/**
+ * List of PCE (Program Configuration Element) for the channel layouts listed
+ * in channel_layout.h
+ *
+ * For those wishing in the future to add other layouts:
+ *
+ * - num_ele: number of elements in each group of front, side, back, lfe channels
+ *            (an element is of type SCE (single channel), CPE (channel pair) for
+ *            the first 3 groups; and is LFE for LFE group).
+ *
+ * - pairing: 0 for an SCE element or 1 for a CPE; does not apply to LFE group
+ *
+ * - index: there are three independent indices for SCE, CPE and LFE;
+ *     they are incremented irrespective of the group to which the element belongs;
+ *     they are not reset when going from one group to another
+ *
+ *     Example: for 7.0 channel layout,
+ *        .pairing = { { 1, 0 }, { 1 }, { 1 }, }, (3 CPE and 1 SCE in front group)
+ *        .index = { { 0, 0 }, { 1 }, { 2 }, },
+ *               (index is 0 for the single SCE but goes from 0 to 2 for the CPEs)
+ *
+ *     The index order impacts the channel ordering. But is otherwise arbitrary
+ *     (the sequence could have been 2, 0, 1 instead of 0, 1, 2).
+ *
+ *     Spec allows for discontinuous indices, e.g. if one has a total of two SCE,
+ *     SCE.0 SCE.15 is OK per spec; BUT it won't be decoded by our AAC decoder
+ *     which at this time requires that indices fully cover some range starting
+ *     from 0 (SCE.1 SCE.0 is OK but not SCE.0 SCE.15).
+ *
+ * - config_map: total number of elements and their types. Beware, the way the
+ *               types are ordered impacts the final channel ordering.
+ *
+ * - reorder_map: reorders the channels.
+ *
+ */
+static const AACPCEInfo aac_pce_configs[] = {
+    {
+        .layout = AV_CH_LAYOUT_MONO,
+        .num_ele = { 1, 0, 0, 0 },
+        .pairing = { { 0 }, },
+        .index = { { 0 }, },
+        .config_map = { 1, TYPE_SCE, },
+        .reorder_map = { 0 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_STEREO,
+        .num_ele = { 1, 0, 0, 0 },
+        .pairing = { { 1 }, },
+        .index = { { 0 }, },
+        .config_map = { 1, TYPE_CPE, },
+        .reorder_map = { 0, 1 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_2POINT1,
+        .num_ele = { 1, 0, 0, 1 },
+        .pairing = { { 1 }, },
+        .index = { { 0 },{ 0 },{ 0 },{ 0 } },
+        .config_map = { 2, TYPE_CPE, TYPE_LFE },
+        .reorder_map = { 0, 1, 2 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_2_1,
+        .num_ele = { 1, 0, 1, 0 },
+        .pairing = { { 1 },{ 0 },{ 0 } },
+        .index = { { 0 },{ 0 },{ 0 }, },
+        .config_map = { 2, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_SURROUND,
+        .num_ele = { 2, 0, 0, 0 },
+        .pairing = { { 1, 0 }, },
+        .index = { { 0, 0 }, },
+        .config_map = { 2, TYPE_CPE, TYPE_SCE, },
+        .reorder_map = { 0, 1, 2 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_3POINT1,
+        .num_ele = { 2, 0, 0, 1 },
+        .pairing = { { 1, 0 }, },
+        .index = { { 0, 0 }, { 0 }, { 0 }, { 0 }, },
+        .config_map = { 3, TYPE_CPE, TYPE_SCE, TYPE_LFE },
+        .reorder_map = { 0, 1, 2, 3 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_4POINT0,
+        .num_ele = { 2, 0, 1, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 0 }, },
+        .index = { { 0, 0 }, { 0 }, { 1 } },
+        .config_map = { 3, TYPE_CPE, TYPE_SCE, TYPE_SCE },
+        .reorder_map = {  0, 1, 2, 3 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_4POINT1,
+        .num_ele = { 2, 1, 1, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 0 }, },
+        .index = { { 0, 0 }, { 1 }, { 2 }, { 0 } },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_2_2,
+        .num_ele = { 1, 1, 0, 0 },
+        .pairing = { { 1 }, { 1 }, },
+        .index = { { 0 }, { 1 }, },
+        .config_map = { 2, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_QUAD,
+        .num_ele = { 1, 0, 1, 0 },
+        .pairing = { { 1 }, { 0 }, { 1 }, },
+        .index = { { 0 }, { 0 }, { 1 } },
+        .config_map = { 2, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_5POINT0,
+        .num_ele = { 2, 1, 0, 0 },
+        .pairing = { { 1, 0 }, { 1 }, },
+        .index = { { 0, 0 }, { 1 } },
+        .config_map = { 3, TYPE_CPE, TYPE_SCE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_5POINT1,
+        .num_ele = { 2, 1, 1, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 1 } },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_5POINT0_BACK,
+        .num_ele = { 2, 0, 1, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1 } },
+        .index = { { 0, 0 }, { 0 }, { 1 } },
+        .config_map = { 3, TYPE_CPE, TYPE_SCE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_5POINT1_BACK,
+        .num_ele = { 2, 1, 1, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 1 } },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_6POINT0,
+        .num_ele = { 2, 1, 1, 0 },
+        .pairing = { { 1, 0 }, { 1 }, { 0 }, },
+        .index = { { 0, 0 }, { 1 }, { 1 } },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_6POINT0_FRONT,
+        .num_ele = { 2, 1, 0, 0 },
+        .pairing = { { 1, 1 }, { 1 } },
+        .index = { { 1, 0 }, { 2 }, },
+        .config_map = { 3, TYPE_CPE, TYPE_CPE, TYPE_CPE, },
+        .reorder_map = { 0, 1, 2, 3, 4, 5 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_HEXAGONAL,
+        .num_ele = { 2, 0, 2, 0 },
+        .pairing = { { 1, 0 },{ 0 },{ 1, 0 }, },
+        .index = { { 0, 0 },{ 0 },{ 1, 1 } },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE, },
+        .reorder_map = { 0, 1, 2, 3, 4, 5 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_6POINT1,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 },{ 0 },{ 1, 0 }, },
+        .index = { { 0, 0 },{ 1 },{ 1, 2 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_6POINT1_BACK,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1, 0 }, },
+        .index = { { 0, 0 }, { 1 }, { 1, 2 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_6POINT1_FRONT,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1, 0 }, },
+        .index = { { 0, 0 }, { 1 }, { 1, 2 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_7POINT0,
+        .num_ele = { 2, 1, 1, 0 },
+        .pairing = { { 1, 0 }, { 1 }, { 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 2 }, },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_7POINT0_FRONT,
+        .num_ele = { 2, 1, 1, 0 },
+        .pairing = { { 1, 0 }, { 1 }, { 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 2 }, },
+        .config_map = { 4, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_7POINT1,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1, 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 1, 2 }, { 0 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE,  TYPE_SCE, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_7POINT1_WIDE,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 }, { 0 },{  1, 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 1, 2 }, { 0 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_7POINT1_WIDE_BACK,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 }, { 0 }, { 1, 1 }, },
+        .index = { { 0, 0 }, { 1 }, { 1, 2 }, { 0 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE, TYPE_SCE, TYPE_CPE, TYPE_CPE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_OCTAGONAL,
+        .num_ele = { 2, 1, 2, 0 },
+        .pairing = { { 1, 0 }, { 1 }, { 1, 0 }, },
+        .index = { { 0, 0 }, { 1 }, { 2, 1 } },
+        .config_map = { 5, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7 },
+    },
+    {   /* Meant for order 2/mixed ambisonics */
+        .layout = AV_CH_LAYOUT_OCTAGONAL | AV_CH_TOP_CENTER,
+        .num_ele = { 2, 2, 2, 0 },
+        .pairing = { { 1, 0 }, { 1, 0 }, { 1, 0 }, },
+        .index = { { 0, 0 }, { 1, 1 }, { 2, 2 } },
+        .config_map = { 6, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+    },
+    {   /* Meant for order 2/mixed ambisonics */
+        .layout = AV_CH_LAYOUT_6POINT0_FRONT | AV_CH_BACK_CENTER |
+                  AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT | AV_CH_TOP_CENTER,
+        .num_ele = { 2, 2, 2, 0 },
+        .pairing = { { 1, 1 }, { 1, 0 }, { 1, 0 }, },
+        .index = { { 0, 1 }, { 2, 0 }, { 3, 1 } },
+        .config_map = { 6, TYPE_CPE, TYPE_CPE, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+    },
+    {
+        .layout = AV_CH_LAYOUT_HEXADECAGONAL,
+        .num_ele = { 4, 2, 4, 0 },
+        .pairing = { { 1, 0, 1, 0 }, { 1, 1 }, { 1, 0, 1, 0 }, },
+        .index = { { 0, 0, 1, 1 }, { 2, 3 }, { 4, 2, 5, 3 } },
+        .config_map = { 10, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_CPE, TYPE_SCE, TYPE_CPE, TYPE_SCE },
+        .reorder_map = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+    },
+};
+
 /**
  * AAC encoder context
  */
@@ -57,29 +379,50 @@ typedef struct AACEncContext {
     PutBitContext pb;
     FFTContext mdct1024;                         ///< long (1024 samples) frame transform context
     FFTContext mdct128;                          ///< short (128 samples) frame transform context
-    AVFloatDSPContext fdsp;
-    float *planar_samples[6];                    ///< saved preprocessed input
+    AVFloatDSPContext *fdsp;
+    AACPCEInfo pce;                              ///< PCE data, if needed
+    float *planar_samples[16];                   ///< saved preprocessed input
 
+    int profile;                                 ///< copied from avctx
+    int needs_pce;                               ///< flag for non-standard layout
+    LPCContext lpc;                              ///< used by TNS
     int samplerate_index;                        ///< MPEG-4 samplerate index
     int channels;                                ///< channel count
+    const uint8_t *reorder_map;                  ///< lavc to aac reorder map
     const uint8_t *chan_map;                     ///< channel configuration map
 
     ChannelElement *cpe;                         ///< channel elements
     FFPsyContext psy;
     struct FFPsyPreprocessContext* psypp;
     const AACCoefficientsEncoder *coder;
-    int cur_channel;
-    int last_frame;
+    int cur_channel;                             ///< current channel for coder context
+    int random_state;
     float lambda;
+    int last_frame_pb_count;                     ///< number of bits for the previous frame
+    float lambda_sum;                            ///< sum(lambda), for Qvg reporting
+    int lambda_count;                            ///< count(lambda), for Qvg reporting
+    enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
+
     AudioFrameQueue afq;
     DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
     DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
 
+    uint16_t quantize_band_cost_cache_generation;
+    AACQuantizeBandCostCacheEntry quantize_band_cost_cache[256][128]; ///< memoization area for quantize_band_cost
+
+    void (*abs_pow34)(float *out, const float *in, const int size);
+    void (*quant_bands)(int *out, const float *in, const float *scaled,
+                        int size, int is_signed, int maxval, const float Q34,
+                        const float rounding);
+
     struct {
         float *samples;
     } buffer;
 } AACEncContext;
 
-extern float ff_aac_pow34sf_tab[428];
+void ff_aac_dsp_init_x86(AACEncContext *s);
+void ff_aac_coder_init_mips(AACEncContext *c);
+void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
+
 
 #endif /* AVCODEC_AACENC_H */
diff --git a/libavcodec/aacenc_is.c b/libavcodec/aacenc_is.c
new file mode 100644
index 0000000..2f5b7eb
--- /dev/null
+++ b/libavcodec/aacenc_is.c
@@ -0,0 +1,158 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aacenc.h"
+#include "aacenc_utils.h"
+#include "aacenc_is.h"
+#include "aacenc_quantization.h"
+
+struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
+                                         int start, int w, int g, float ener0,
+                                         float ener1, float ener01,
+                                         int use_pcoeffs, int phase)
+{
+    int i, w2;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    float *L = use_pcoeffs ? sce0->pcoeffs : sce0->coeffs;
+    float *R = use_pcoeffs ? sce1->pcoeffs : sce1->coeffs;
+    float *L34 = &s->scoefs[256*0], *R34 = &s->scoefs[256*1];
+    float *IS  = &s->scoefs[256*2], *I34 = &s->scoefs[256*3];
+    float dist1 = 0.0f, dist2 = 0.0f;
+    struct AACISError is_error = {0};
+
+    if (ener01 <= 0 || ener0 <= 0) {
+        is_error.pass = 0;
+        return is_error;
+    }
+
+    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+        int is_band_type, is_sf_idx = FFMAX(1, sce0->sf_idx[w*16+g]-4);
+        float e01_34 = phase*pos_pow34(ener1/ener0);
+        float maxval, dist_spec_err = 0.0f;
+        float minthr = FFMIN(band0->threshold, band1->threshold);
+        for (i = 0; i < sce0->ics.swb_sizes[g]; i++)
+            IS[i] = (L[start+(w+w2)*128+i] + phase*R[start+(w+w2)*128+i])*sqrt(ener0/ener01);
+        s->abs_pow34(L34, &L[start+(w+w2)*128], sce0->ics.swb_sizes[g]);
+        s->abs_pow34(R34, &R[start+(w+w2)*128], sce0->ics.swb_sizes[g]);
+        s->abs_pow34(I34, IS,                   sce0->ics.swb_sizes[g]);
+        maxval = find_max_val(1, sce0->ics.swb_sizes[g], I34);
+        is_band_type = find_min_book(maxval, is_sf_idx);
+        dist1 += quantize_band_cost(s, &L[start + (w+w2)*128], L34,
+                                    sce0->ics.swb_sizes[g],
+                                    sce0->sf_idx[w*16+g],
+                                    sce0->band_type[w*16+g],
+                                    s->lambda / band0->threshold, INFINITY, NULL, NULL, 0);
+        dist1 += quantize_band_cost(s, &R[start + (w+w2)*128], R34,
+                                    sce1->ics.swb_sizes[g],
+                                    sce1->sf_idx[w*16+g],
+                                    sce1->band_type[w*16+g],
+                                    s->lambda / band1->threshold, INFINITY, NULL, NULL, 0);
+        dist2 += quantize_band_cost(s, IS, I34, sce0->ics.swb_sizes[g],
+                                    is_sf_idx, is_band_type,
+                                    s->lambda / minthr, INFINITY, NULL, NULL, 0);
+        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+            dist_spec_err += (L34[i] - I34[i])*(L34[i] - I34[i]);
+            dist_spec_err += (R34[i] - I34[i]*e01_34)*(R34[i] - I34[i]*e01_34);
+        }
+        dist_spec_err *= s->lambda / minthr;
+        dist2 += dist_spec_err;
+    }
+
+    is_error.pass = dist2 <= dist1;
+    is_error.phase = phase;
+    is_error.error = dist2 - dist1;
+    is_error.dist1 = dist1;
+    is_error.dist2 = dist2;
+    is_error.ener01 = ener01;
+
+    return is_error;
+}
+
+void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe)
+{
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    int start = 0, count = 0, w, w2, g, i, prev_sf1 = -1, prev_bt = -1, prev_is = 0;
+    const float freq_mult = avctx->sample_rate/(1024.0f/sce0->ics.num_windows)/2.0f;
+    uint8_t nextband1[128];
+
+    if (!cpe->common_window)
+        return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce1, nextband1);
+
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            if (start*freq_mult > INT_STEREO_LOW_LIMIT*(s->lambda/170.0f) &&
+                cpe->ch[0].band_type[w*16+g] != NOISE_BT && !cpe->ch[0].zeroes[w*16+g] &&
+                cpe->ch[1].band_type[w*16+g] != NOISE_BT && !cpe->ch[1].zeroes[w*16+g] &&
+                ff_sfdelta_can_remove_band(sce1, nextband1, prev_sf1, w*16+g)) {
+                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f, ener01p = 0.0f;
+                struct AACISError ph_err1, ph_err2, *best;
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                        float coef0 = sce0->coeffs[start+(w+w2)*128+i];
+                        float coef1 = sce1->coeffs[start+(w+w2)*128+i];
+                        ener0  += coef0*coef0;
+                        ener1  += coef1*coef1;
+                        ener01 += (coef0 + coef1)*(coef0 + coef1);
+                        ener01p += (coef0 - coef1)*(coef0 - coef1);
+                    }
+                }
+                ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                                 ener0, ener1, ener01p, 0, -1);
+                ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                                 ener0, ener1, ener01, 0, +1);
+                best = (ph_err1.pass && ph_err1.error < ph_err2.error) ? &ph_err1 : &ph_err2;
+                if (best->pass) {
+                    cpe->is_mask[w*16+g] = 1;
+                    cpe->ms_mask[w*16+g] = 0;
+                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0 / best->ener01);
+                    cpe->ch[1].is_ener[w*16+g] = ener0/ener1;
+                    cpe->ch[1].band_type[w*16+g] = (best->phase > 0) ? INTENSITY_BT : INTENSITY_BT2;
+                    if (prev_is && prev_bt != cpe->ch[1].band_type[w*16+g]) {
+                        /** Flip M/S mask and pick the other CB, since it encodes more efficiently */
+                        cpe->ms_mask[w*16+g] = 1;
+                        cpe->ch[1].band_type[w*16+g] = (best->phase > 0) ? INTENSITY_BT2 : INTENSITY_BT;
+                    }
+                    prev_bt = cpe->ch[1].band_type[w*16+g];
+                    count++;
+                }
+            }
+            if (!sce1->zeroes[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                prev_sf1 = sce1->sf_idx[w*16+g];
+            prev_is = cpe->is_mask[w*16+g];
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+    cpe->is_mode = !!count;
+}
diff --git a/libavcodec/aacenc_is.h b/libavcodec/aacenc_is.h
new file mode 100644
index 0000000..269fd1a
--- /dev/null
+++ b/libavcodec/aacenc_is.h
@@ -0,0 +1,51 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_IS_H
+#define AVCODEC_AACENC_IS_H
+
+#include "aacenc.h"
+
+/** Frequency in Hz for lower limit of intensity stereo **/
+#define INT_STEREO_LOW_LIMIT 6100
+
+struct AACISError {
+    int pass;    /* 1 if dist2 <= dist1  */
+    int phase;   /* -1 or +1             */
+    float error; /* fabs(dist1 - dist2)  */
+    float dist1; /* From original coeffs */
+    float dist2; /* From IS'd coeffs     */
+    float ener01;
+};
+
+struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
+                                         int start, int w, int g, float ener0,
+                                         float ener1, float ener01,
+                                         int use_pcoeffs, int phase);
+void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
+
+#endif /* AVCODEC_AACENC_IS_H */
diff --git a/libavcodec/aacenc_ltp.c b/libavcodec/aacenc_ltp.c
new file mode 100644
index 0000000..674a2a0
--- /dev/null
+++ b/libavcodec/aacenc_ltp.c
@@ -0,0 +1,236 @@
+/*
+ * AAC encoder long term prediction extension
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder long term prediction extension
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aacenc_ltp.h"
+#include "aacenc_quantization.h"
+#include "aacenc_utils.h"
+
+/**
+ * Encode LTP data.
+ */
+void ff_aac_encode_ltp_info(AACEncContext *s, SingleChannelElement *sce,
+                            int common_window)
+{
+    int i;
+    IndividualChannelStream *ics = &sce->ics;
+    if (s->profile != FF_PROFILE_AAC_LTP || !ics->predictor_present)
+        return;
+    if (common_window)
+        put_bits(&s->pb, 1, 0);
+    put_bits(&s->pb, 1, ics->ltp.present);
+    if (!ics->ltp.present)
+        return;
+    put_bits(&s->pb, 11, ics->ltp.lag);
+    put_bits(&s->pb, 3,  ics->ltp.coef_idx);
+    for (i = 0; i < FFMIN(ics->max_sfb, MAX_LTP_LONG_SFB); i++)
+        put_bits(&s->pb, 1, ics->ltp.used[i]);
+}
+
+void ff_aac_ltp_insert_new_frame(AACEncContext *s)
+{
+    int i, ch, tag, chans, cur_channel, start_ch = 0;
+    ChannelElement *cpe;
+    SingleChannelElement *sce;
+    for (i = 0; i < s->chan_map[0]; i++) {
+        cpe = &s->cpe[i];
+        tag      = s->chan_map[i+1];
+        chans    = tag == TYPE_CPE ? 2 : 1;
+        for (ch = 0; ch < chans; ch++) {
+            sce = &cpe->ch[ch];
+            cur_channel = start_ch + ch;
+            /* New sample + overlap */
+            memcpy(&sce->ltp_state[0],    &sce->ltp_state[1024], 1024*sizeof(sce->ltp_state[0]));
+            memcpy(&sce->ltp_state[1024], &s->planar_samples[cur_channel][2048], 1024*sizeof(sce->ltp_state[0]));
+            memcpy(&sce->ltp_state[2048], &sce->ret_buf[0], 1024*sizeof(sce->ltp_state[0]));
+            sce->ics.ltp.lag = 0;
+        }
+        start_ch += chans;
+    }
+}
+
+static void get_lag(float *buf, const float *new, LongTermPrediction *ltp)
+{
+    int i, j, lag = 0, max_corr = 0;
+    float max_ratio = 0.0f;
+    for (i = 0; i < 2048; i++) {
+        float corr, s0 = 0.0f, s1 = 0.0f;
+        const int start = FFMAX(0, i - 1024);
+        for (j = start; j < 2048; j++) {
+            const int idx = j - i + 1024;
+            s0 += new[j]*buf[idx];
+            s1 += buf[idx]*buf[idx];
+        }
+        corr = s1 > 0.0f ? s0/sqrt(s1) : 0.0f;
+        if (corr > max_corr) {
+            max_corr = corr;
+            lag = i;
+            max_ratio = corr/(2048-start);
+        }
+    }
+    ltp->lag = FFMAX(av_clip_uintp2(lag, 11), 0);
+    ltp->coef_idx = quant_array_idx(max_ratio, ltp_coef, 8);
+    ltp->coef = ltp_coef[ltp->coef_idx];
+}
+
+static void generate_samples(float *buf, LongTermPrediction *ltp)
+{
+    int i, samples_num = 2048;
+    if (!ltp->lag) {
+        ltp->present = 0;
+        return;
+    } else if (ltp->lag < 1024) {
+        samples_num = ltp->lag + 1024;
+    }
+    for (i = 0; i < samples_num; i++)
+        buf[i] = ltp->coef*buf[i + 2048 - ltp->lag];
+    memset(&buf[i], 0, (2048 - i)*sizeof(float));
+}
+
+/**
+ * Process LTP parameters
+ * @see Patent WO2006070265A1
+ */
+void ff_aac_update_ltp(AACEncContext *s, SingleChannelElement *sce)
+{
+    float *pred_signal = &sce->ltp_state[0];
+    const float *samples = &s->planar_samples[s->cur_channel][1024];
+
+    if (s->profile != FF_PROFILE_AAC_LTP)
+        return;
+
+    /* Calculate lag */
+    get_lag(pred_signal, samples, &sce->ics.ltp);
+    generate_samples(pred_signal, &sce->ics.ltp);
+}
+
+void ff_aac_adjust_common_ltp(AACEncContext *s, ChannelElement *cpe)
+{
+    int sfb, count = 0;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+
+    if (!cpe->common_window ||
+        sce0->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE ||
+        sce1->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        sce0->ics.ltp.present = 0;
+        return;
+    }
+
+    for (sfb = 0; sfb < FFMIN(sce0->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++) {
+        int sum = sce0->ics.ltp.used[sfb] + sce1->ics.ltp.used[sfb];
+        if (sum != 2) {
+            sce0->ics.ltp.used[sfb] = 0;
+        } else if (sum == 2) {
+            count++;
+        }
+    }
+
+    sce0->ics.ltp.present = !!count;
+    sce0->ics.predictor_present = !!count;
+}
+
+/**
+ * Mark LTP sfb's
+ */
+void ff_aac_search_for_ltp(AACEncContext *s, SingleChannelElement *sce,
+                           int common_window)
+{
+    int w, g, w2, i, start = 0, count = 0;
+    int saved_bits = -(15 + FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB));
+    float *C34 = &s->scoefs[128*0], *PCD = &s->scoefs[128*1];
+    float *PCD34 = &s->scoefs[128*2];
+    const int max_ltp = FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB);
+
+    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        if (sce->ics.ltp.lag) {
+            memset(&sce->ltp_state[0], 0, 3072*sizeof(sce->ltp_state[0]));
+            memset(&sce->ics.ltp, 0, sizeof(LongTermPrediction));
+        }
+        return;
+    }
+
+    if (!sce->ics.ltp.lag || s->lambda > 120.0f)
+        return;
+
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            int bits1 = 0, bits2 = 0;
+            float dist1 = 0.0f, dist2 = 0.0f;
+            if (w*16+g > max_ltp) {
+                start += sce->ics.swb_sizes[g];
+                continue;
+            }
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                int bits_tmp1, bits_tmp2;
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                for (i = 0; i < sce->ics.swb_sizes[g]; i++)
+                    PCD[i] = sce->coeffs[start+(w+w2)*128+i] - sce->lcoeffs[start+(w+w2)*128+i];
+                s->abs_pow34(C34,  &sce->coeffs[start+(w+w2)*128],  sce->ics.swb_sizes[g]);
+                s->abs_pow34(PCD34, PCD, sce->ics.swb_sizes[g]);
+                dist1 += quantize_band_cost(s, &sce->coeffs[start+(w+w2)*128], C34, sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g], sce->band_type[(w+w2)*16+g],
+                                            s->lambda/band->threshold, INFINITY, &bits_tmp1, NULL, 0);
+                dist2 += quantize_band_cost(s, PCD, PCD34, sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g],
+                                            sce->band_type[(w+w2)*16+g],
+                                            s->lambda/band->threshold, INFINITY, &bits_tmp2, NULL, 0);
+                bits1 += bits_tmp1;
+                bits2 += bits_tmp2;
+            }
+            if (dist2 < dist1 && bits2 < bits1) {
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++)
+                    for (i = 0; i < sce->ics.swb_sizes[g]; i++)
+                        sce->coeffs[start+(w+w2)*128+i] -= sce->lcoeffs[start+(w+w2)*128+i];
+                sce->ics.ltp.used[w*16+g] = 1;
+                saved_bits += bits1 - bits2;
+                count++;
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    sce->ics.ltp.present = !!count && (saved_bits >= 0);
+    sce->ics.predictor_present = !!sce->ics.ltp.present;
+
+    /* Reset any marked sfbs */
+    if (!sce->ics.ltp.present && !!count) {
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            start = 0;
+            for (g = 0;  g < sce->ics.num_swb; g++) {
+                if (sce->ics.ltp.used[w*16+g]) {
+                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                        for (i = 0; i < sce->ics.swb_sizes[g]; i++) {
+                            sce->coeffs[start+(w+w2)*128+i] += sce->lcoeffs[start+(w+w2)*128+i];
+                        }
+                    }
+                }
+                start += sce->ics.swb_sizes[g];
+            }
+        }
+    }
+}
diff --git a/libavcodec/aacenc_ltp.h b/libavcodec/aacenc_ltp.h
new file mode 100644
index 0000000..7276878
--- /dev/null
+++ b/libavcodec/aacenc_ltp.h
@@ -0,0 +1,41 @@
+/*
+ * AAC encoder long term prediction extension
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder long term prediction extension
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_LTP_H
+#define AVCODEC_AACENC_LTP_H
+
+#include "aacenc.h"
+
+void ff_aac_encode_ltp_info(AACEncContext *s, SingleChannelElement *sce,
+                            int common_window);
+void ff_aac_update_ltp(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_adjust_common_ltp(AACEncContext *s, ChannelElement *cpe);
+void ff_aac_ltp_insert_new_frame(AACEncContext *s);
+void ff_aac_search_for_ltp(AACEncContext *s, SingleChannelElement *sce,
+                           int common_window);
+
+#endif /* AVCODEC_AACENC_LTP_H */
diff --git a/libavcodec/aacenc_pred.c b/libavcodec/aacenc_pred.c
new file mode 100644
index 0000000..d111192
--- /dev/null
+++ b/libavcodec/aacenc_pred.c
@@ -0,0 +1,347 @@
+/*
+ * AAC encoder main-type prediction
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder main-type prediction
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aactab.h"
+#include "aacenc_pred.h"
+#include "aacenc_utils.h"
+#include "aacenc_is.h"            /* <- Needed for common window distortions */
+#include "aacenc_quantization.h"
+
+#define RESTORE_PRED(sce, sfb) \
+        if (sce->ics.prediction_used[sfb]) {\
+            sce->ics.prediction_used[sfb] = 0;\
+            sce->band_type[sfb] = sce->band_alt[sfb];\
+        }
+
+static inline float flt16_round(float pf)
+{
+    union av_intfloat32 tmp;
+    tmp.f = pf;
+    tmp.i = (tmp.i + 0x00008000U) & 0xFFFF0000U;
+    return tmp.f;
+}
+
+static inline float flt16_even(float pf)
+{
+    union av_intfloat32 tmp;
+    tmp.f = pf;
+    tmp.i = (tmp.i + 0x00007FFFU + (tmp.i & 0x00010000U >> 16)) & 0xFFFF0000U;
+    return tmp.f;
+}
+
+static inline float flt16_trunc(float pf)
+{
+    union av_intfloat32 pun;
+    pun.f = pf;
+    pun.i &= 0xFFFF0000U;
+    return pun.f;
+}
+
+static inline void predict(PredictorState *ps, float *coef, float *rcoef, int set)
+{
+    float k2;
+    const float a     = 0.953125; // 61.0 / 64
+    const float alpha = 0.90625;  // 29.0 / 32
+    const float   k1 = ps->k1;
+    const float   r0 = ps->r0,     r1 = ps->r1;
+    const float cor0 = ps->cor0, cor1 = ps->cor1;
+    const float var0 = ps->var0, var1 = ps->var1;
+    const float e0 = *coef - ps->x_est;
+    const float e1 = e0 - k1 * r0;
+
+    if (set)
+        *coef = e0;
+
+    ps->cor1 = flt16_trunc(alpha * cor1 + r1 * e1);
+    ps->var1 = flt16_trunc(alpha * var1 + 0.5f * (r1 * r1 + e1 * e1));
+    ps->cor0 = flt16_trunc(alpha * cor0 + r0 * e0);
+    ps->var0 = flt16_trunc(alpha * var0 + 0.5f * (r0 * r0 + e0 * e0));
+    ps->r1   = flt16_trunc(a * (r0 - k1 * e0));
+    ps->r0   = flt16_trunc(a * e0);
+
+    /* Prediction for next frame */
+    ps->k1   = ps->var0 > 1 ? ps->cor0 * flt16_even(a / ps->var0) : 0;
+    k2       = ps->var1 > 1 ? ps->cor1 * flt16_even(a / ps->var1) : 0;
+    *rcoef   = ps->x_est = flt16_round(ps->k1*ps->r0 + k2*ps->r1);
+}
+
+static inline void reset_predict_state(PredictorState *ps)
+{
+    ps->r0    = 0.0f;
+    ps->r1    = 0.0f;
+    ps->k1    = 0.0f;
+    ps->cor0  = 0.0f;
+    ps->cor1  = 0.0f;
+    ps->var0  = 1.0f;
+    ps->var1  = 1.0f;
+    ps->x_est = 0.0f;
+}
+
+static inline void reset_all_predictors(PredictorState *ps)
+{
+    int i;
+    for (i = 0; i < MAX_PREDICTORS; i++)
+        reset_predict_state(&ps[i]);
+}
+
+static inline void reset_predictor_group(SingleChannelElement *sce, int group_num)
+{
+    int i;
+    PredictorState *ps = sce->predictor_state;
+    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
+        reset_predict_state(&ps[i]);
+}
+
+void ff_aac_apply_main_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    int sfb, k;
+    const int pmax = FFMIN(sce->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        for (sfb = 0; sfb < pmax; sfb++) {
+            for (k = sce->ics.swb_offset[sfb]; k < sce->ics.swb_offset[sfb + 1]; k++) {
+                predict(&sce->predictor_state[k], &sce->coeffs[k], &sce->prcoeffs[k],
+                        sce->ics.predictor_present && sce->ics.prediction_used[sfb]);
+            }
+        }
+        if (sce->ics.predictor_reset_group) {
+            reset_predictor_group(sce, sce->ics.predictor_reset_group);
+        }
+    } else {
+        reset_all_predictors(sce->predictor_state);
+    }
+}
+
+/* If inc = 0 you can check if this returns 0 to see if you can reset freely */
+static inline int update_counters(IndividualChannelStream *ics, int inc)
+{
+    int i;
+    for (i = 1; i < 31; i++) {
+        ics->predictor_reset_count[i] += inc;
+        if (ics->predictor_reset_count[i] > PRED_RESET_FRAME_MIN)
+            return i; /* Reset this immediately */
+    }
+    return 0;
+}
+
+void ff_aac_adjust_common_pred(AACEncContext *s, ChannelElement *cpe)
+{
+    int start, w, w2, g, i, count = 0;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    const int pmax0 = FFMIN(sce0->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int pmax1 = FFMIN(sce1->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int pmax  = FFMIN(pmax0, pmax1);
+
+    if (!cpe->common_window ||
+        sce0->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE ||
+        sce1->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE)
+        return;
+
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0; g < sce0->ics.num_swb; g++) {
+            int sfb = w*16+g;
+            int sum = sce0->ics.prediction_used[sfb] + sce1->ics.prediction_used[sfb];
+            float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
+            struct AACISError ph_err1, ph_err2, *erf;
+            if (sfb < PRED_SFB_START || sfb > pmax || sum != 2) {
+                RESTORE_PRED(sce0, sfb);
+                RESTORE_PRED(sce1, sfb);
+                start += sce0->ics.swb_sizes[g];
+                continue;
+            }
+            for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                    float coef0 = sce0->pcoeffs[start+(w+w2)*128+i];
+                    float coef1 = sce1->pcoeffs[start+(w+w2)*128+i];
+                    ener0  += coef0*coef0;
+                    ener1  += coef1*coef1;
+                    ener01 += (coef0 + coef1)*(coef0 + coef1);
+                }
+            }
+            ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                             ener0, ener1, ener01, 1, -1);
+            ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                             ener0, ener1, ener01, 1, +1);
+            erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2;
+            if (erf->pass) {
+                sce0->ics.prediction_used[sfb] = 1;
+                sce1->ics.prediction_used[sfb] = 1;
+                count++;
+            } else {
+                RESTORE_PRED(sce0, sfb);
+                RESTORE_PRED(sce1, sfb);
+            }
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+
+    sce1->ics.predictor_present = sce0->ics.predictor_present = !!count;
+}
+
+static void update_pred_resets(SingleChannelElement *sce)
+{
+    int i, max_group_id_c, max_frame = 0;
+    float avg_frame = 0.0f;
+    IndividualChannelStream *ics = &sce->ics;
+
+    /* Update the counters and immediately update any frame behind schedule */
+    if ((ics->predictor_reset_group = update_counters(&sce->ics, 1)))
+        return;
+
+    for (i = 1; i < 31; i++) {
+        /* Count-based */
+        if (ics->predictor_reset_count[i] > max_frame) {
+            max_group_id_c = i;
+            max_frame = ics->predictor_reset_count[i];
+        }
+        avg_frame = (ics->predictor_reset_count[i] + avg_frame)/2;
+    }
+
+    if (max_frame > PRED_RESET_MIN) {
+        ics->predictor_reset_group = max_group_id_c;
+    } else {
+        ics->predictor_reset_group = 0;
+    }
+}
+
+void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    int sfb, i, count = 0, cost_coeffs = 0, cost_pred = 0;
+    const int pmax = FFMIN(sce->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    float *O34  = &s->scoefs[128*0], *P34 = &s->scoefs[128*1];
+    float *SENT = &s->scoefs[128*2], *S34 = &s->scoefs[128*3];
+    float *QERR = &s->scoefs[128*4];
+
+    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        sce->ics.predictor_present = 0;
+        return;
+    }
+
+    if (!sce->ics.predictor_initialized) {
+        reset_all_predictors(sce->predictor_state);
+        sce->ics.predictor_initialized = 1;
+        memcpy(sce->prcoeffs, sce->coeffs, 1024*sizeof(float));
+        for (i = 1; i < 31; i++)
+            sce->ics.predictor_reset_count[i] = i;
+    }
+
+    update_pred_resets(sce);
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
+
+    for (sfb = PRED_SFB_START; sfb < pmax; sfb++) {
+        int cost1, cost2, cb_p;
+        float dist1, dist2, dist_spec_err = 0.0f;
+        const int cb_n = sce->zeroes[sfb] ? 0 : sce->band_type[sfb];
+        const int cb_min = sce->zeroes[sfb] ? 0 : 1;
+        const int cb_max = sce->zeroes[sfb] ? 0 : RESERVED_BT;
+        const int start_coef = sce->ics.swb_offset[sfb];
+        const int num_coeffs = sce->ics.swb_offset[sfb + 1] - start_coef;
+        const FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[sfb];
+
+        if (start_coef + num_coeffs > MAX_PREDICTORS ||
+            (s->cur_channel && sce->band_type[sfb] >= INTENSITY_BT2) ||
+            sce->band_type[sfb] == NOISE_BT)
+            continue;
+
+        /* Normal coefficients */
+        s->abs_pow34(O34, &sce->coeffs[start_coef], num_coeffs);
+        dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL,
+                                              O34, num_coeffs, sce->sf_idx[sfb],
+                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, NULL, 0);
+        cost_coeffs += cost1;
+
+        /* Encoded coefficients - needed for #bits, band type and quant. error */
+        for (i = 0; i < num_coeffs; i++)
+            SENT[i] = sce->coeffs[start_coef + i] - sce->prcoeffs[start_coef + i];
+        s->abs_pow34(S34, SENT, num_coeffs);
+        if (cb_n < RESERVED_BT)
+            cb_p = av_clip(find_min_book(find_max_val(1, num_coeffs, S34), sce->sf_idx[sfb]), cb_min, cb_max);
+        else
+            cb_p = cb_n;
+        quantize_and_encode_band_cost(s, NULL, SENT, QERR, S34, num_coeffs,
+                                      sce->sf_idx[sfb], cb_p, s->lambda / band->threshold, INFINITY,
+                                      &cost2, NULL, 0);
+
+        /* Reconstructed coefficients - needed for distortion measurements */
+        for (i = 0; i < num_coeffs; i++)
+            sce->prcoeffs[start_coef + i] += QERR[i] != 0.0f ? (sce->prcoeffs[start_coef + i] - QERR[i]) : 0.0f;
+        s->abs_pow34(P34, &sce->prcoeffs[start_coef], num_coeffs);
+        if (cb_n < RESERVED_BT)
+            cb_p = av_clip(find_min_book(find_max_val(1, num_coeffs, P34), sce->sf_idx[sfb]), cb_min, cb_max);
+        else
+            cb_p = cb_n;
+        dist2 = quantize_and_encode_band_cost(s, NULL, &sce->prcoeffs[start_coef], NULL,
+                                              P34, num_coeffs, sce->sf_idx[sfb],
+                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, NULL, 0);
+        for (i = 0; i < num_coeffs; i++)
+            dist_spec_err += (O34[i] - P34[i])*(O34[i] - P34[i]);
+        dist_spec_err *= s->lambda / band->threshold;
+        dist2 += dist_spec_err;
+
+        if (dist2 <= dist1 && cb_p <= cb_n) {
+            cost_pred += cost2;
+            sce->ics.prediction_used[sfb] = 1;
+            sce->band_alt[sfb]  = cb_n;
+            sce->band_type[sfb] = cb_p;
+            count++;
+        } else {
+            cost_pred += cost1;
+            sce->band_alt[sfb] = cb_p;
+        }
+    }
+
+    if (count && cost_coeffs < cost_pred) {
+        count = 0;
+        for (sfb = PRED_SFB_START; sfb < pmax; sfb++)
+            RESTORE_PRED(sce, sfb);
+        memset(&sce->ics.prediction_used, 0, sizeof(sce->ics.prediction_used));
+    }
+
+    sce->ics.predictor_present = !!count;
+}
+
+/**
+ * Encoder predictors data.
+ */
+void ff_aac_encode_main_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    int sfb;
+    IndividualChannelStream *ics = &sce->ics;
+    const int pmax = FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+
+    if (s->profile != FF_PROFILE_AAC_MAIN ||
+        !ics->predictor_present)
+        return;
+
+    put_bits(&s->pb, 1, !!ics->predictor_reset_group);
+    if (ics->predictor_reset_group)
+        put_bits(&s->pb, 5, ics->predictor_reset_group);
+    for (sfb = 0; sfb < pmax; sfb++)
+        put_bits(&s->pb, 1, ics->prediction_used[sfb]);
+}
diff --git a/libavcodec/aacenc_pred.h b/libavcodec/aacenc_pred.h
new file mode 100644
index 0000000..aa305f4
--- /dev/null
+++ b/libavcodec/aacenc_pred.h
@@ -0,0 +1,47 @@
+/*
+ * AAC encoder main-type prediction
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder main-type prediction
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_PRED_H
+#define AVCODEC_AACENC_PRED_H
+
+#include "aacenc.h"
+
+/* Every predictor group needs to get reset at least once in this many frames */
+#define PRED_RESET_FRAME_MIN 240
+
+/* Any frame with less than this amount of frames since last reset is ok */
+#define PRED_RESET_MIN 64
+
+/* Raise to filter any low frequency artifacts due to prediction */
+#define PRED_SFB_START 10
+
+void ff_aac_apply_main_pred(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_adjust_common_pred(AACEncContext *s, ChannelElement *cpe);
+void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_encode_main_pred(AACEncContext *s, SingleChannelElement *sce);
+
+#endif /* AVCODEC_AACENC_PRED_H */
diff --git a/libavcodec/aacenc_quantization.h b/libavcodec/aacenc_quantization.h
new file mode 100644
index 0000000..fc5a46b
--- /dev/null
+++ b/libavcodec/aacenc_quantization.h
@@ -0,0 +1,283 @@
+/*
+ * AAC encoder quantizer
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder quantizer
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_QUANTIZATION_H
+#define AVCODEC_AACENC_QUANTIZATION_H
+
+#include "aactab.h"
+#include "aacenc.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
+
+/**
+ * Calculate rate distortion cost for quantizing with given codebook
+ *
+ * @return quantization distortion
+ */
+static av_always_inline float quantize_and_encode_band_cost_template(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *out,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int BT_ZERO, int BT_UNSIGNED,
+                                int BT_PAIR, int BT_ESC, int BT_NOISE, int BT_STEREO,
+                                const float ROUNDING)
+{
+    const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
+    const float Q   = ff_aac_pow2sf_tab [q_idx];
+    const float Q34 = ff_aac_pow34sf_tab[q_idx];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    const float CLIPPED_ESCAPE = 165140.0f*IQ;
+    int i, j;
+    float cost = 0;
+    float qenergy = 0;
+    const int dim = BT_PAIR ? 2 : 4;
+    int resbits = 0;
+    int off;
+
+    if (BT_ZERO || BT_NOISE || BT_STEREO) {
+        for (i = 0; i < size; i++)
+            cost += in[i]*in[i];
+        if (bits)
+            *bits = 0;
+        if (energy)
+            *energy = qenergy;
+        if (out) {
+            for (i = 0; i < size; i += dim)
+                for (j = 0; j < dim; j++)
+                    out[i+j] = 0.0f;
+        }
+        return cost * lambda;
+    }
+    if (!scaled) {
+        s->abs_pow34(s->scoefs, in, size);
+        scaled = s->scoefs;
+    }
+    s->quant_bands(s->qcoefs, in, scaled, size, !BT_UNSIGNED, aac_cb_maxval[cb], Q34, ROUNDING);
+    if (BT_UNSIGNED) {
+        off = 0;
+    } else {
+        off = aac_cb_maxval[cb];
+    }
+    for (i = 0; i < size; i += dim) {
+        const float *vec;
+        int *quants = s->qcoefs + i;
+        int curidx = 0;
+        int curbits;
+        float quantized, rd = 0.0f;
+        for (j = 0; j < dim; j++) {
+            curidx *= aac_cb_range[cb];
+            curidx += quants[j] + off;
+        }
+        curbits =  ff_aac_spectral_bits[cb-1][curidx];
+        vec     = &ff_aac_codebook_vectors[cb-1][curidx*dim];
+        if (BT_UNSIGNED) {
+            for (j = 0; j < dim; j++) {
+                float t = fabsf(in[i+j]);
+                float di;
+                if (BT_ESC && vec[j] == 64.0f) { //FIXME: slow
+                    if (t >= CLIPPED_ESCAPE) {
+                        quantized = CLIPPED_ESCAPE;
+                        curbits += 21;
+                    } else {
+                        int c = av_clip_uintp2(quant(t, Q, ROUNDING), 13);
+                        quantized = c*cbrtf(c)*IQ;
+                        curbits += av_log2(c)*2 - 4 + 1;
+                    }
+                } else {
+                    quantized = vec[j]*IQ;
+                }
+                di = t - quantized;
+                if (out)
+                    out[i+j] = in[i+j] >= 0 ? quantized : -quantized;
+                if (vec[j] != 0.0f)
+                    curbits++;
+                qenergy += quantized*quantized;
+                rd += di*di;
+            }
+        } else {
+            for (j = 0; j < dim; j++) {
+                quantized = vec[j]*IQ;
+                qenergy += quantized*quantized;
+                if (out)
+                    out[i+j] = quantized;
+                rd += (in[i+j] - quantized)*(in[i+j] - quantized);
+            }
+        }
+        cost    += rd * lambda + curbits;
+        resbits += curbits;
+        if (cost >= uplim)
+            return uplim;
+        if (pb) {
+            put_bits(pb, ff_aac_spectral_bits[cb-1][curidx], ff_aac_spectral_codes[cb-1][curidx]);
+            if (BT_UNSIGNED)
+                for (j = 0; j < dim; j++)
+                    if (ff_aac_codebook_vectors[cb-1][curidx*dim+j] != 0.0f)
+                        put_bits(pb, 1, in[i+j] < 0.0f);
+            if (BT_ESC) {
+                for (j = 0; j < 2; j++) {
+                    if (ff_aac_codebook_vectors[cb-1][curidx*2+j] == 64.0f) {
+                        int coef = av_clip_uintp2(quant(fabsf(in[i+j]), Q, ROUNDING), 13);
+                        int len = av_log2(coef);
+
+                        put_bits(pb, len - 4 + 1, (1 << (len - 4 + 1)) - 2);
+                        put_sbits(pb, len, coef);
+                    }
+                }
+            }
+        }
+    }
+
+    if (bits)
+        *bits = resbits;
+    if (energy)
+        *energy = qenergy;
+    return cost;
+}
+
+static inline float quantize_and_encode_band_cost_NONE(struct AACEncContext *s, PutBitContext *pb,
+                                                const float *in, float *quant, const float *scaled,
+                                                int size, int scale_idx, int cb,
+                                                const float lambda, const float uplim,
+                                                int *bits, float *energy) {
+    av_assert0(0);
+    return 0.0f;
+}
+
+#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO, ROUNDING) \
+static float quantize_and_encode_band_cost_ ## NAME(                                         \
+                                struct AACEncContext *s,                                     \
+                                PutBitContext *pb, const float *in, float *quant,            \
+                                const float *scaled, int size, int scale_idx,                \
+                                int cb, const float lambda, const float uplim,               \
+                                int *bits, float *energy) {                                  \
+    return quantize_and_encode_band_cost_template(                                           \
+                                s, pb, in, quant, scaled, size, scale_idx,                   \
+                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits, energy,           \
+                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO,  \
+                                ROUNDING);                                                   \
+}
+
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC_RTZ, 0, 1, 1, 1, 0, 0, ROUND_TO_ZERO)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NOISE, 0, 0, 0, 0, 1, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(STEREO,0, 0, 0, 0, 0, 1, ROUND_STANDARD)
+
+static float (*const quantize_and_encode_band_cost_arr[])(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *quant,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy) = {
+    quantize_and_encode_band_cost_ZERO,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_ESC,
+    quantize_and_encode_band_cost_NONE,     /* CB 12 doesn't exist */
+    quantize_and_encode_band_cost_NOISE,
+    quantize_and_encode_band_cost_STEREO,
+    quantize_and_encode_band_cost_STEREO,
+};
+
+static float (*const quantize_and_encode_band_cost_rtz_arr[])(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *quant,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy) = {
+    quantize_and_encode_band_cost_ZERO,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_ESC_RTZ,
+    quantize_and_encode_band_cost_NONE,     /* CB 12 doesn't exist */
+    quantize_and_encode_band_cost_NOISE,
+    quantize_and_encode_band_cost_STEREO,
+    quantize_and_encode_band_cost_STEREO,
+};
+
+#define quantize_and_encode_band_cost(                                  \
+                                s, pb, in, quant, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy, rtz)               \
+    ((rtz) ? quantize_and_encode_band_cost_rtz_arr : quantize_and_encode_band_cost_arr)[cb]( \
+                                s, pb, in, quant, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy)
+
+static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{
+    return quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, lambda, uplim, bits, energy, rtz);
+}
+
+static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{
+    int auxbits;
+    quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, 0.0f, uplim, &auxbits, energy, rtz);
+    if (bits) {
+        *bits = auxbits;
+    }
+    return auxbits;
+}
+
+static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
+                                            const float *in, float *out, int size, int scale_idx,
+                                            int cb, const float lambda, int rtz)
+{
+    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
+                                  INFINITY, NULL, NULL, rtz);
+}
+
+#include "aacenc_quantization_misc.h"
+
+#endif /* AVCODEC_AACENC_QUANTIZATION_H */
diff --git a/libavcodec/aacenc_quantization_misc.h b/libavcodec/aacenc_quantization_misc.h
new file mode 100644
index 0000000..28676ca
--- /dev/null
+++ b/libavcodec/aacenc_quantization_misc.h
@@ -0,0 +1,53 @@
+/*
+ * AAC encoder quantization
+ * Copyright (C) 2015 Claudio Freire
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder quantization misc reusable function templates
+ * @author Claudio Freire ( klaussfreire gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_QUANTIZATION_MISC_H
+#define AVCODEC_AACENC_QUANTIZATION_MISC_H
+
+static inline float quantize_band_cost_cached(struct AACEncContext *s, int w, int g, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{
+    AACQuantizeBandCostCacheEntry *entry;
+    av_assert1(scale_idx >= 0 && scale_idx < 256);
+    entry = &s->quantize_band_cost_cache[scale_idx][w*16+g];
+    if (entry->generation != s->quantize_band_cost_cache_generation || entry->cb != cb || entry->rtz != rtz) {
+        entry->rd = quantize_band_cost(s, in, scaled, size, scale_idx,
+                                       cb, lambda, uplim, &entry->bits, &entry->energy, rtz);
+        entry->cb = cb;
+        entry->rtz = rtz;
+        entry->generation = s->quantize_band_cost_cache_generation;
+    }
+    if (bits)
+        *bits = entry->bits;
+    if (energy)
+        *energy = entry->energy;
+    return entry->rd;
+}
+
+#endif /* AVCODEC_AACENC_QUANTIZATION_MISC_H */
diff --git a/libavcodec/aacenc_tns.c b/libavcodec/aacenc_tns.c
new file mode 100644
index 0000000..2ffe1f8
--- /dev/null
+++ b/libavcodec/aacenc_tns.c
@@ -0,0 +1,215 @@
+/*
+ * AAC encoder TNS
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder temporal noise shaping
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "libavutil/libm.h"
+#include "aacenc.h"
+#include "aacenc_tns.h"
+#include "aactab.h"
+#include "aacenc_utils.h"
+#include "aacenc_quantization.h"
+
+/* Could be set to 3 to save an additional bit at the cost of little quality */
+#define TNS_Q_BITS 4
+
+/* Coefficient resolution in short windows */
+#define TNS_Q_BITS_IS8 4
+
+/* We really need the bits we save here elsewhere */
+#define TNS_ENABLE_COEF_COMPRESSION
+
+/* TNS will only be used if the LPC gain is within these margins */
+#define TNS_GAIN_THRESHOLD_LOW      1.4f
+#define TNS_GAIN_THRESHOLD_HIGH     1.16f*TNS_GAIN_THRESHOLD_LOW
+
+static inline int compress_coeffs(int *coef, int order, int c_bits)
+{
+    int i;
+    const int low_idx   = c_bits ?  4 : 2;
+    const int shift_val = c_bits ?  8 : 4;
+    const int high_idx  = c_bits ? 11 : 5;
+#ifndef TNS_ENABLE_COEF_COMPRESSION
+    return 0;
+#endif /* TNS_ENABLE_COEF_COMPRESSION */
+    for (i = 0; i < order; i++)
+        if (coef[i] >= low_idx && coef[i] <= high_idx)
+            return 0;
+    for (i = 0; i < order; i++)
+        coef[i] -= (coef[i] > high_idx) ? shift_val : 0;
+    return 1;
+}
+
+/**
+ * Encode TNS data.
+ * Coefficient compression is simply not lossless as it should be
+ * on any decoder tested and as such is not active.
+ */
+void ff_aac_encode_tns_info(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    int i, w, filt, coef_compress = 0, coef_len;
+    const int is8 = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int c_bits = is8 ? TNS_Q_BITS_IS8 == 4 : TNS_Q_BITS == 4;
+
+    if (!sce->tns.present)
+        return;
+
+    for (i = 0; i < sce->ics.num_windows; i++) {
+        put_bits(&s->pb, 2 - is8, sce->tns.n_filt[i]);
+        if (!tns->n_filt[i])
+            continue;
+        put_bits(&s->pb, 1, c_bits);
+        for (filt = 0; filt < tns->n_filt[i]; filt++) {
+            put_bits(&s->pb, 6 - 2 * is8, tns->length[i][filt]);
+            put_bits(&s->pb, 5 - 2 * is8, tns->order[i][filt]);
+            if (!tns->order[i][filt])
+                continue;
+            put_bits(&s->pb, 1, tns->direction[i][filt]);
+            coef_compress = compress_coeffs(tns->coef_idx[i][filt],
+                                            tns->order[i][filt], c_bits);
+            put_bits(&s->pb, 1, coef_compress);
+            coef_len = c_bits + 3 - coef_compress;
+            for (w = 0; w < tns->order[i][filt]; w++)
+                put_bits(&s->pb, coef_len, tns->coef_idx[i][filt][w]);
+        }
+    }
+}
+
+/* Apply TNS filter */
+void ff_aac_apply_tns(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    IndividualChannelStream *ics = &sce->ics;
+    int w, filt, m, i, top, order, bottom, start, end, size, inc;
+    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
+    float lpc[TNS_MAX_ORDER];
+
+    for (w = 0; w < ics->num_windows; w++) {
+        bottom = ics->num_swb;
+        for (filt = 0; filt < tns->n_filt[w]; filt++) {
+            top    = bottom;
+            bottom = FFMAX(0, top - tns->length[w][filt]);
+            order  = tns->order[w][filt];
+            if (order == 0)
+                continue;
+
+            // tns_decode_coef
+            compute_lpc_coefs(tns->coef[w][filt], order, lpc, 0, 0, 0);
+
+            start = ics->swb_offset[FFMIN(bottom, mmm)];
+            end   = ics->swb_offset[FFMIN(   top, mmm)];
+            if ((size = end - start) <= 0)
+                continue;
+            if (tns->direction[w][filt]) {
+                inc = -1;
+                start = end - 1;
+            } else {
+                inc = 1;
+            }
+            start += w * 128;
+
+            /* AR filter */
+            for (m = 0; m < size; m++, start += inc) {
+                for (i = 1; i <= FFMIN(m, order); i++) {
+                    sce->coeffs[start] += lpc[i-1]*sce->pcoeffs[start - i*inc];
+                }
+            }
+        }
+    }
+}
+
+/*
+ * c_bits - 1 if 4 bit coefficients, 0 if 3 bit coefficients
+ */
+static inline void quantize_coefs(double *coef, int *idx, float *lpc, int order,
+                                  int c_bits)
+{
+    int i;
+    const float *quant_arr = tns_tmp2_map[c_bits];
+    for (i = 0; i < order; i++) {
+        idx[i] = quant_array_idx(coef[i], quant_arr, c_bits ? 16 : 8);
+        lpc[i] = quant_arr[idx[i]];
+    }
+}
+
+/*
+ * 3 bits per coefficient with 8 short windows
+ */
+void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    int w, g, count = 0;
+    double gain, coefs[MAX_LPC_ORDER];
+    const int mmm = FFMIN(sce->ics.tns_max_bands, sce->ics.max_sfb);
+    const int is8 = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int c_bits = is8 ? TNS_Q_BITS_IS8 == 4 : TNS_Q_BITS == 4;
+    const int sfb_start = av_clip(tns_min_sfb[is8][s->samplerate_index], 0, mmm);
+    const int sfb_end   = av_clip(sce->ics.num_swb, 0, mmm);
+    const int order = is8 ? 7 : s->profile == FF_PROFILE_AAC_LOW ? 12 : TNS_MAX_ORDER;
+    const int slant = sce->ics.window_sequence[0] == LONG_STOP_SEQUENCE  ? 1 :
+                      sce->ics.window_sequence[0] == LONG_START_SEQUENCE ? 0 : 2;
+    const int sfb_len = sfb_end - sfb_start;
+    const int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
+
+    if (coef_len <= 0 || sfb_len <= 0) {
+        sce->tns.present = 0;
+        return;
+    }
+
+    for (w = 0; w < sce->ics.num_windows; w++) {
+        float en[2] = {0.0f, 0.0f};
+        int oc_start = 0, os_start = 0;
+        int coef_start = sce->ics.swb_offset[sfb_start];
+
+        for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
+            FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
+            if (g > sfb_start + (sfb_len/2))
+                en[1] += band->energy;
+            else
+                en[0] += band->energy;
+        }
+
+        /* LPC */
+        gain = ff_lpc_calc_ref_coefs_f(&s->lpc, &sce->coeffs[w*128 + coef_start],
+                                       coef_len, order, coefs);
+
+        if (!order || !isfinite(gain) || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
+            continue;
+
+        tns->n_filt[w] = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
+        for (g = 0; g < tns->n_filt[w]; g++) {
+            tns->direction[w][g] = slant != 2 ? slant : en[g] < en[!g];
+            tns->order[w][g] = g < tns->n_filt[w] ? order/tns->n_filt[w] : order - oc_start;
+            tns->length[w][g] = g < tns->n_filt[w] ? sfb_len/tns->n_filt[w] : sfb_len - os_start;
+            quantize_coefs(&coefs[oc_start], tns->coef_idx[w][g], tns->coef[w][g],
+                            tns->order[w][g], c_bits);
+            oc_start += tns->order[w][g];
+            os_start += tns->length[w][g];
+        }
+        count++;
+    }
+    sce->tns.present = !!count;
+}
diff --git a/libavcodec/aacenc_tns.h b/libavcodec/aacenc_tns.h
new file mode 100644
index 0000000..466738d
--- /dev/null
+++ b/libavcodec/aacenc_tns.h
@@ -0,0 +1,37 @@
+/*
+ * AAC encoder TNS
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder temporal noise shaping
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_TNS_H
+#define AVCODEC_AACENC_TNS_H
+
+#include "aacenc.h"
+
+void ff_aac_encode_tns_info(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_apply_tns(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce);
+
+#endif /* AVCODEC_AACENC_TNS_H */
diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h
new file mode 100644
index 0000000..bef4c10
--- /dev/null
+++ b/libavcodec/aacenc_utils.h
@@ -0,0 +1,279 @@
+/*
+ * AAC encoder utilities
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder utilities
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_UTILS_H
+#define AVCODEC_AACENC_UTILS_H
+
+#include "libavutil/ffmath.h"
+#include "aac.h"
+#include "aacenctab.h"
+#include "aactab.h"
+
+#define ROUND_STANDARD 0.4054f
+#define ROUND_TO_ZERO 0.1054f
+#define C_QUANT 0.4054f
+
+static inline void abs_pow34_v(float *out, const float *in, const int size)
+{
+    int i;
+    for (i = 0; i < size; i++) {
+        float a = fabsf(in[i]);
+        out[i] = sqrtf(a * sqrtf(a));
+    }
+}
+
+static inline float pos_pow34(float a)
+{
+    return sqrtf(a * sqrtf(a));
+}
+
+/**
+ * Quantize one coefficient.
+ * @return absolute value of the quantized coefficient
+ * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
+ */
+static inline int quant(float coef, const float Q, const float rounding)
+{
+    float a = coef * Q;
+    return sqrtf(a * sqrtf(a)) + rounding;
+}
+
+static inline void quantize_bands(int *out, const float *in, const float *scaled,
+                                  int size, int is_signed, int maxval, const float Q34,
+                                  const float rounding)
+{
+    int i;
+    for (i = 0; i < size; i++) {
+        float qc = scaled[i] * Q34;
+        int tmp = (int)FFMIN(qc + rounding, (float)maxval);
+        if (is_signed && in[i] < 0.0f) {
+            tmp = -tmp;
+        }
+        out[i] = tmp;
+    }
+}
+
+static inline float find_max_val(int group_len, int swb_size, const float *scaled)
+{
+    float maxval = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        for (i = 0; i < swb_size; i++) {
+            maxval = FFMAX(maxval, scaled[w2*128+i]);
+        }
+    }
+    return maxval;
+}
+
+static inline int find_min_book(float maxval, int sf)
+{
+    float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
+    int qmaxval, cb;
+    qmaxval = maxval * Q34 + C_QUANT;
+    if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
+        cb = 11;
+    else
+        cb = aac_maxval_cb[qmaxval];
+    return cb;
+}
+
+static inline float find_form_factor(int group_len, int swb_size, float thresh,
+                                     const float *scaled, float nzslope) {
+    const float iswb_size = 1.0f / swb_size;
+    const float iswb_sizem1 = 1.0f / (swb_size - 1);
+    const float ethresh = thresh;
+    float form = 0.0f, weight = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
+        float nzl = 0;
+        for (i = 0; i < swb_size; i++) {
+            float s = fabsf(scaled[w2*128+i]);
+            maxval = FFMAX(maxval, s);
+            e += s;
+            e2 += s *= s;
+            /* We really don't want a hard non-zero-line count, since
+             * even below-threshold lines do add up towards band spectral power.
+             * So, fall steeply towards zero, but smoothly
+             */
+            if (s >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                if (nzslope == 2.f)
+                    nzl += (s / ethresh) * (s / ethresh);
+                else
+                    nzl += ff_fast_powf(s / ethresh, nzslope);
+            }
+        }
+        if (e2 > thresh) {
+            float frm;
+            e *= iswb_size;
+
+            /** compute variance */
+            for (i = 0; i < swb_size; i++) {
+                float d = fabsf(scaled[w2*128+i]) - e;
+                var += d*d;
+            }
+            var = sqrtf(var * iswb_sizem1);
+
+            e2 *= iswb_size;
+            frm = e / FFMIN(e+4*var,maxval);
+            form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
+            weight += e2;
+        }
+    }
+    if (weight > 0) {
+        return form / weight;
+    } else {
+        return 1.0f;
+    }
+}
+
+/** Return the minimum scalefactor where the quantized coef does not clip. */
+static inline uint8_t coef2minsf(float coef)
+{
+    return av_clip_uint8(log2f(coef)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
+}
+
+/** Return the maximum scalefactor where the quantized coef is not zero. */
+static inline uint8_t coef2maxsf(float coef)
+{
+    return av_clip_uint8(log2f(coef)*4 +  6 + SCALE_ONE_POS - SCALE_DIV_512);
+}
+
+/*
+ * Returns the closest possible index to an array of float values, given a value.
+ */
+static inline int quant_array_idx(const float val, const float *arr, const int num)
+{
+    int i, index = 0;
+    float quant_min_err = INFINITY;
+    for (i = 0; i < num; i++) {
+        float error = (val - arr[i])*(val - arr[i]);
+        if (error < quant_min_err) {
+            quant_min_err = error;
+            index = i;
+        }
+    }
+    return index;
+}
+
+/**
+ * approximates exp10f(-3.0f*(0.5f + 0.5f * cosf(FFMIN(b,15.5f) / 15.5f)))
+ */
+static av_always_inline float bval2bmax(float b)
+{
+    return 0.001f + 0.0035f * (b*b*b) / (15.5f*15.5f*15.5f);
+}
+
+/*
+ * Compute a nextband map to be used with SF delta constraint utilities.
+ * The nextband array should contain 128 elements, and positions that don't
+ * map to valid, nonzero bands of the form w*16+g (with w being the initial
+ * window of the window group, only) are left indetermined.
+ */
+static inline void ff_init_nextband_map(const SingleChannelElement *sce, uint8_t *nextband)
+{
+    unsigned char prevband = 0;
+    int w, g;
+    /** Just a safe default */
+    for (g = 0; g < 128; g++)
+        nextband[g] = g;
+
+    /** Now really navigate the nonzero band chain */
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0; g < sce->ics.num_swb; g++) {
+            if (!sce->zeroes[w*16+g] && sce->band_type[w*16+g] < RESERVED_BT)
+                prevband = nextband[prevband] = w*16+g;
+        }
+    }
+    nextband[prevband] = prevband; /* terminate */
+}
+
+/*
+ * Updates nextband to reflect a removed band (equivalent to
+ * calling ff_init_nextband_map after marking a band as zero)
+ */
+static inline void ff_nextband_remove(uint8_t *nextband, int prevband, int band)
+{
+    nextband[prevband] = nextband[band];
+}
+
+/*
+ * Checks whether the specified band could be removed without inducing
+ * scalefactor delta that violates SF delta encoding constraints.
+ * prev_sf has to be the scalefactor of the previous nonzero, nonspecial
+ * band, in encoding order, or negative if there was no such band.
+ */
+static inline int ff_sfdelta_can_remove_band(const SingleChannelElement *sce,
+    const uint8_t *nextband, int prev_sf, int band)
+{
+    return prev_sf >= 0
+        && sce->sf_idx[nextband[band]] >= (prev_sf - SCALE_MAX_DIFF)
+        && sce->sf_idx[nextband[band]] <= (prev_sf + SCALE_MAX_DIFF);
+}
+
+/*
+ * Checks whether the specified band's scalefactor could be replaced
+ * with another one without violating SF delta encoding constraints.
+ * prev_sf has to be the scalefactor of the previous nonzero, nonsepcial
+ * band, in encoding order, or negative if there was no such band.
+ */
+static inline int ff_sfdelta_can_replace(const SingleChannelElement *sce,
+    const uint8_t *nextband, int prev_sf, int new_sf, int band)
+{
+    return new_sf >= (prev_sf - SCALE_MAX_DIFF)
+        && new_sf <= (prev_sf + SCALE_MAX_DIFF)
+        && sce->sf_idx[nextband[band]] >= (new_sf - SCALE_MAX_DIFF)
+        && sce->sf_idx[nextband[band]] <= (new_sf + SCALE_MAX_DIFF);
+}
+
+/**
+ * linear congruential pseudorandom number generator
+ *
+ * @param   previous_val    pointer to the current state of the generator
+ *
+ * @return  Returns a 32-bit pseudorandom integer
+ */
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+#define ERROR_IF(cond, ...) \
+    if (cond) { \
+        av_log(avctx, AV_LOG_ERROR, __VA_ARGS__); \
+        return AVERROR(EINVAL); \
+    }
+
+#define WARN_IF(cond, ...) \
+    if (cond) { \
+        av_log(avctx, AV_LOG_WARNING, __VA_ARGS__); \
+    }
+
+#endif /* AVCODEC_AACENC_UTILS_H */
diff --git a/libavcodec/aacenctab.c b/libavcodec/aacenctab.c
new file mode 100644
index 0000000..f3d70fb
--- /dev/null
+++ b/libavcodec/aacenctab.c
@@ -0,0 +1,108 @@
+/*
+ * AAC encoder data
+ * Copyright (c) 2015 Rostislav Pehlivanov ( atomnuker gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "aacenctab.h"
+
+static const uint8_t swb_size_128_96[] = {
+    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
+};
+
+static const uint8_t swb_size_128_64[] = {
+    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
+};
+
+static const uint8_t swb_size_128_48[] = {
+    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
+};
+
+static const uint8_t swb_size_128_24[] = {
+    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
+};
+
+static const uint8_t swb_size_128_16[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
+};
+
+static const uint8_t swb_size_128_8[] = {
+    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
+};
+
+static const uint8_t swb_size_1024_96[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_64[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
+    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
+    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
+};
+
+static const uint8_t swb_size_1024_48[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    96
+};
+
+static const uint8_t swb_size_1024_32[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+};
+
+static const uint8_t swb_size_1024_24[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_16[] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
+    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_8[] = {
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
+    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
+};
+
+const uint8_t *ff_aac_swb_size_128[] = {
+    swb_size_128_96, swb_size_128_96, swb_size_128_64,
+    swb_size_128_48, swb_size_128_48, swb_size_128_48,
+    swb_size_128_24, swb_size_128_24, swb_size_128_16,
+    swb_size_128_16, swb_size_128_16, swb_size_128_8,
+    swb_size_128_8
+};
+
+const uint8_t *ff_aac_swb_size_1024[] = {
+    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
+    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32,
+    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
+    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8,
+    swb_size_1024_8
+};
+
+const int ff_aac_swb_size_128_len  = FF_ARRAY_ELEMS(ff_aac_swb_size_128);
+const int ff_aac_swb_size_1024_len = FF_ARRAY_ELEMS(ff_aac_swb_size_1024);
diff --git a/libavcodec/aacenctab.h b/libavcodec/aacenctab.h
new file mode 100644
index 0000000..64932d7
--- /dev/null
+++ b/libavcodec/aacenctab.h
@@ -0,0 +1,139 @@
+/*
+ * AAC encoder data
+ * Copyright (c) 2015 Rostislav Pehlivanov ( atomnuker gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder data
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENCTAB_H
+#define AVCODEC_AACENCTAB_H
+
+#include "aac.h"
+
+/** Total number of usable codebooks **/
+#define CB_TOT 12
+
+/** Total number of codebooks, including special ones **/
+#define CB_TOT_ALL 15
+
+#define AAC_MAX_CHANNELS 16
+
+extern const uint8_t *ff_aac_swb_size_1024[];
+extern const int      ff_aac_swb_size_1024_len;
+extern const uint8_t *ff_aac_swb_size_128[];
+extern const int      ff_aac_swb_size_128_len;
+
+/* Supported layouts without using a PCE */
+static const int64_t aac_normal_chan_layouts[7] = {
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_4POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_7POINT1,
+};
+
+/** default channel configurations */
+static const uint8_t aac_chan_configs[AAC_MAX_CHANNELS][6] = {
+    {1, TYPE_SCE},                                         // 1 channel  - single channel element
+    {1, TYPE_CPE},                                         // 2 channels - channel pair
+    {2, TYPE_SCE, TYPE_CPE},                               // 3 channels - center + stereo
+    {3, TYPE_SCE, TYPE_CPE, TYPE_SCE},                     // 4 channels - front center + stereo + back center
+    {3, TYPE_SCE, TYPE_CPE, TYPE_CPE},                     // 5 channels - front center + stereo + back stereo
+    {4, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_LFE},           // 6 channels - front center + stereo + back stereo + LFE
+    {0},                                                   // 7 channels - invalid without PCE
+    {5, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_CPE, TYPE_LFE}, // 8 channels - front center + front stereo + side stereo + back stereo + LFE
+};
+
+/**
+ * Table to remap channels from libavcodec's default order to AAC order.
+ */
+static const uint8_t aac_chan_maps[AAC_MAX_CHANNELS][AAC_MAX_CHANNELS] = {
+    { 0 },
+    { 0, 1 },
+    { 2, 0, 1 },
+    { 2, 0, 1, 3 },
+    { 2, 0, 1, 3, 4 },
+    { 2, 0, 1, 4, 5, 3 },
+    { 0 },
+    { 2, 0, 1, 6, 7, 4, 5, 3 },
+};
+
+/* duplicated from avpriv_mpeg4audio_sample_rates to avoid shared build
+ * failures */
+static const int mpeg4audio_sample_rates[16] = {
+    96000, 88200, 64000, 48000, 44100, 32000,
+    24000, 22050, 16000, 12000, 11025, 8000, 7350
+};
+
+/** bits needed to code codebook run value for long windows */
+static const uint8_t run_value_bits_long[64] = {
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
+};
+
+/** bits needed to code codebook run value for short windows */
+static const uint8_t run_value_bits_short[16] = {
+    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
+};
+
+/* TNS starting SFBs for long and short windows */
+static const uint8_t tns_min_sfb_short[16] = {
+    2, 2, 2, 3, 3, 4, 6, 6, 8, 10, 10, 12, 12, 12, 12, 12
+};
+
+static const uint8_t tns_min_sfb_long[16] = {
+    12, 13, 15, 16, 17, 20, 25, 26, 24, 28, 30, 31, 31, 31, 31, 31
+};
+
+static const uint8_t * const tns_min_sfb[2] = {
+    tns_min_sfb_long, tns_min_sfb_short
+};
+
+static const uint8_t * const run_value_bits[2] = {
+    run_value_bits_long, run_value_bits_short
+};
+
+/** Map to convert values from BandCodingPath index to a codebook index **/
+static const uint8_t aac_cb_out_map[CB_TOT_ALL]  = {0,1,2,3,4,5,6,7,8,9,10,11,13,14,15};
+/** Inverse map to convert from codebooks to BandCodingPath indices **/
+static const uint8_t aac_cb_in_map[CB_TOT_ALL+1] = {0,1,2,3,4,5,6,7,8,9,10,11,0,12,13,14};
+
+static const uint8_t aac_cb_range [12] = {0, 3, 3, 3, 3, 9, 9, 8, 8, 13, 13, 17};
+static const uint8_t aac_cb_maxval[12] = {0, 1, 1, 2, 2, 4, 4, 7, 7, 12, 12, 16};
+
+static const unsigned char aac_maxval_cb[] = {
+    0, 1, 3, 5, 5, 7, 7, 7, 9, 9, 9, 9, 9, 11
+};
+
+static const int aacenc_profiles[] = {
+    FF_PROFILE_AAC_MAIN,
+    FF_PROFILE_AAC_LOW,
+    FF_PROFILE_AAC_LTP,
+    FF_PROFILE_MPEG2_AAC_LOW,
+};
+
+#endif /* AVCODEC_AACENCTAB_H */
diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c
index df069c3..d5dca64 100644
--- a/libavcodec/aacps.c
+++ b/libavcodec/aacps.c
@@ -2,31 +2,38 @@
  * MPEG-4 Parametric Stereo decoding functions
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
  */
 
 #include <stdint.h>
 #include "libavutil/common.h"
-#include "libavutil/internal.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "aacps.h"
+#if USE_FIXED
+#include "aacps_fixed_tablegen.h"
+#else
+#include "libavutil/internal.h"
 #include "aacps_tablegen.h"
+#endif /* USE_FIXED */
 #include "aacpsdata.c"
 
 #define PS_BASELINE 0  ///< Operate in Baseline PS mode
@@ -111,7 +118,7 @@ static int read_ ## PAR ## _data(AVCodecContext *avctx, GetBitContext *gb, PSCon
     return 0; \
 err: \
     av_log(avctx, AV_LOG_ERROR, "illegal "#PAR"\n"); \
-    return -1; \
+    return AVERROR_INVALIDDATA; \
 }
 
 READ_PAR_DATA(iid,    huff_offset[table_idx],    0, FFABS(ps->iid_par[e][b]) > 7 + 8 * ps->iid_quant)
@@ -148,7 +155,7 @@ static void ipdopd_reset(int8_t *ipd_hist, int8_t *opd_hist)
     }
 }
 
-int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
+int AAC_RENAME(ff_ps_read_data)(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
 {
     int e;
     int bit_count_start = get_bits_count(gb_host);
@@ -189,8 +196,13 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
 
     ps->border_position[0] = -1;
     if (ps->frame_class) {
-        for (e = 1; e <= ps->num_env; e++)
+        for (e = 1; e <= ps->num_env; e++) {
             ps->border_position[e] = get_bits(gb, 5);
+            if (ps->border_position[e] < ps->border_position[e-1]) {
+                av_log(avctx, AV_LOG_ERROR, "border_position non monotone.\n");
+                goto err;
+            }
+        }
     } else
         for (e = 1; e <= ps->num_env; e++)
             ps->border_position[e] = (e * numQMFSlots >> ff_log2_tab[ps->num_env]) - 1;
@@ -236,6 +248,7 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
     if (!ps->num_env || ps->border_position[ps->num_env] < numQMFSlots - 1) {
         //Create a fake envelope
         int source = ps->num_env ? ps->num_env - 1 : ps->num_env_old - 1;
+        int b;
         if (source >= 0 && source != ps->num_env) {
             if (ps->enable_iid) {
                 memcpy(ps->iid_par+ps->num_env, ps->iid_par+source, sizeof(ps->iid_par[0]));
@@ -248,6 +261,22 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
                 memcpy(ps->opd_par+ps->num_env, ps->opd_par+source, sizeof(ps->opd_par[0]));
             }
         }
+        if (ps->enable_iid){
+            for (b = 0; b < ps->nr_iid_par; b++) {
+                if (FFABS(ps->iid_par[ps->num_env][b]) > 7 + 8 * ps->iid_quant) {
+                    av_log(avctx, AV_LOG_ERROR, "iid_par invalid\n");
+                    goto err;
+                }
+            }
+        }
+        if (ps->enable_icc){
+            for (b = 0; b < ps->nr_iid_par; b++) {
+                if (ps->icc_par[ps->num_env][b] > 7U) {
+                    av_log(avctx, AV_LOG_ERROR, "icc_par invalid\n");
+                    goto err;
+                }
+            }
+        }
         ps->num_env++;
         ps->border_position[ps->num_env] = numQMFSlots - 1;
     }
@@ -285,35 +314,41 @@ err:
 
 /** Split one subband into 2 subsubbands with a symmetric real filter.
  * The filter must have its non-center even coefficients equal to zero. */
-static void hybrid2_re(float (*in)[2], float (*out)[32][2], const float filter[8], int len, int reverse)
+static void hybrid2_re(INTFLOAT (*in)[2], INTFLOAT (*out)[32][2], const INTFLOAT filter[8], int len, int reverse)
 {
     int i, j;
     for (i = 0; i < len; i++, in++) {
-        float re_in = filter[6] * in[6][0];          //real inphase
-        float re_op = 0.0f;                          //real out of phase
-        float im_in = filter[6] * in[6][1];          //imag inphase
-        float im_op = 0.0f;                          //imag out of phase
+        INT64FLOAT re_in = AAC_MUL31(filter[6], in[6][0]); //real inphase
+        INT64FLOAT re_op = 0.0f;                          //real out of phase
+        INT64FLOAT im_in = AAC_MUL31(filter[6], in[6][1]); //imag inphase
+        INT64FLOAT im_op = 0.0f;                          //imag out of phase
         for (j = 0; j < 6; j += 2) {
-            re_op += filter[j+1] * (in[j+1][0] + in[12-j-1][0]);
-            im_op += filter[j+1] * (in[j+1][1] + in[12-j-1][1]);
+            re_op += (INT64FLOAT)filter[j+1] * (in[j+1][0] + in[12-j-1][0]);
+            im_op += (INT64FLOAT)filter[j+1] * (in[j+1][1] + in[12-j-1][1]);
         }
-        out[ reverse][i][0] = re_in + re_op;
-        out[ reverse][i][1] = im_in + im_op;
-        out[!reverse][i][0] = re_in - re_op;
-        out[!reverse][i][1] = im_in - im_op;
+
+#if USE_FIXED
+        re_op = (re_op + 0x40000000) >> 31;
+        im_op = (im_op + 0x40000000) >> 31;
+#endif /* USE_FIXED */
+
+        out[ reverse][i][0] = (INTFLOAT)(re_in + re_op);
+        out[ reverse][i][1] = (INTFLOAT)(im_in + im_op);
+        out[!reverse][i][0] = (INTFLOAT)(re_in - re_op);
+        out[!reverse][i][1] = (INTFLOAT)(im_in - im_op);
     }
 }
 
 /** Split one subband into 6 subsubbands with a complex filter */
-static void hybrid6_cx(PSDSPContext *dsp, float (*in)[2], float (*out)[32][2],
-                       TABLE_CONST float (*filter)[8][2], int len)
+static void hybrid6_cx(PSDSPContext *dsp, INTFLOAT (*in)[2], INTFLOAT (*out)[32][2],
+                       TABLE_CONST INTFLOAT (*filter)[8][2], int len)
 {
     int i;
     int N = 8;
-    LOCAL_ALIGNED_16(float, temp, [8], [2]);
+    LOCAL_ALIGNED_16(INTFLOAT, temp, [8], [2]);
 
     for (i = 0; i < len; i++, in++) {
-        dsp->hybrid_analysis(temp, in, (const float (*)[8][2]) filter, 1, N);
+        dsp->hybrid_analysis(temp, in, (const INTFLOAT (*)[8][2]) filter, 1, N);
         out[0][i][0] = temp[6][0];
         out[0][i][1] = temp[6][1];
         out[1][i][0] = temp[7][0];
@@ -330,18 +365,18 @@ static void hybrid6_cx(PSDSPContext *dsp, float (*in)[2], float (*out)[32][2],
 }
 
 static void hybrid4_8_12_cx(PSDSPContext *dsp,
-                            float (*in)[2], float (*out)[32][2],
-                            TABLE_CONST float (*filter)[8][2], int N, int len)
+                            INTFLOAT (*in)[2], INTFLOAT (*out)[32][2],
+                            TABLE_CONST INTFLOAT (*filter)[8][2], int N, int len)
 {
     int i;
 
     for (i = 0; i < len; i++, in++) {
-        dsp->hybrid_analysis(out[0] + i, in, (const float (*)[8][2]) filter, 32, N);
+        dsp->hybrid_analysis(out[0] + i, in, (const INTFLOAT (*)[8][2]) filter, 32, N);
     }
 }
 
-static void hybrid_analysis(PSDSPContext *dsp, float out[91][32][2],
-                            float in[5][44][2], float L[2][38][64],
+static void hybrid_analysis(PSDSPContext *dsp, INTFLOAT out[91][32][2],
+                            INTFLOAT in[5][44][2], INTFLOAT L[2][38][64],
                             int is34, int len)
 {
     int i, j;
@@ -370,8 +405,8 @@ static void hybrid_analysis(PSDSPContext *dsp, float out[91][32][2],
     }
 }
 
-static void hybrid_synthesis(PSDSPContext *dsp, float out[2][38][64],
-                             float in[91][32][2], int is34, int len)
+static void hybrid_synthesis(PSDSPContext *dsp, INTFLOAT out[2][38][64],
+                             INTFLOAT in[91][32][2], int is34, int len)
 {
     int i, n;
     if (is34) {
@@ -412,9 +447,10 @@ static void hybrid_synthesis(PSDSPContext *dsp, float out[2][38][64],
 }
 
 /// All-pass filter decay slope
-#define DECAY_SLOPE      0.05f
+#define DECAY_SLOPE      Q30(0.05f)
 /// Number of frequency bands that can be addressed by the parameter index, b(k)
 static const int   NR_PAR_BANDS[]      = { 20, 34 };
+static const int   NR_IPDOPD_BANDS[]   = { 11, 17 };
 /// Number of frequency bands that can be addressed by the sub subband index, k
 static const int   NR_BANDS[]          = { 71, 91 };
 /// Start frequency band for the all-pass filter decay slope
@@ -465,28 +501,43 @@ static void map_idx_34_to_20(int8_t *par_mapped, const int8_t *par, int full)
     }
 }
 
-static void map_val_34_to_20(float par[PS_MAX_NR_IIDICC])
+static void map_val_34_to_20(INTFLOAT par[PS_MAX_NR_IIDICC])
 {
+#if USE_FIXED
+    par[ 0] = (int)(((int64_t)(par[ 0] + (unsigned)(par[ 1]>>1)) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 1] = (int)(((int64_t)((par[ 1]>>1) + (unsigned)par[ 2]) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 2] = (int)(((int64_t)(par[ 3] + (unsigned)(par[ 4]>>1)) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 3] = (int)(((int64_t)((par[ 4]>>1) + (unsigned)par[ 5]) * 1431655765 + \
+                      0x40000000) >> 31);
+#else
     par[ 0] = (2*par[ 0] +   par[ 1]) * 0.33333333f;
     par[ 1] = (  par[ 1] + 2*par[ 2]) * 0.33333333f;
     par[ 2] = (2*par[ 3] +   par[ 4]) * 0.33333333f;
     par[ 3] = (  par[ 4] + 2*par[ 5]) * 0.33333333f;
-    par[ 4] = (  par[ 6] +   par[ 7]) * 0.5f;
-    par[ 5] = (  par[ 8] +   par[ 9]) * 0.5f;
+#endif /* USE_FIXED */
+    par[ 4] = AAC_HALF_SUM(par[ 6], par[ 7]);
+    par[ 5] = AAC_HALF_SUM(par[ 8], par[ 9]);
     par[ 6] =    par[10];
     par[ 7] =    par[11];
-    par[ 8] = (  par[12] +   par[13]) * 0.5f;
-    par[ 9] = (  par[14] +   par[15]) * 0.5f;
+    par[ 8] = AAC_HALF_SUM(par[12], par[13]);
+    par[ 9] = AAC_HALF_SUM(par[14], par[15]);
     par[10] =    par[16];
     par[11] =    par[17];
     par[12] =    par[18];
     par[13] =    par[19];
-    par[14] = (  par[20] +   par[21]) * 0.5f;
-    par[15] = (  par[22] +   par[23]) * 0.5f;
-    par[16] = (  par[24] +   par[25]) * 0.5f;
-    par[17] = (  par[26] +   par[27]) * 0.5f;
+    par[14] = AAC_HALF_SUM(par[20], par[21]);
+    par[15] = AAC_HALF_SUM(par[22], par[23]);
+    par[16] = AAC_HALF_SUM(par[24], par[25]);
+    par[17] = AAC_HALF_SUM(par[26], par[27]);
+#if USE_FIXED
+    par[18] = (((par[28]+2)>>2) + ((par[29]+2)>>2) + ((par[30]+2)>>2) + ((par[31]+2)>>2));
+#else
     par[18] = (  par[28] +   par[29] +   par[30] +   par[31]) * 0.25f;
-    par[19] = (  par[32] +   par[33]) * 0.5f;
+#endif /* USE_FIXED */
+    par[19] = AAC_HALF_SUM(par[32], par[33]);
 }
 
 static void map_idx_10_to_34(int8_t *par_mapped, const int8_t *par, int full)
@@ -571,7 +622,7 @@ static void map_idx_20_to_34(int8_t *par_mapped, const int8_t *par, int full)
     par_mapped[ 0] =  par[ 0];
 }
 
-static void map_val_20_to_34(float par[PS_MAX_NR_IIDICC])
+static void map_val_20_to_34(INTFLOAT par[PS_MAX_NR_IIDICC])
 {
     par[33] =  par[19];
     par[32] =  par[19];
@@ -602,28 +653,29 @@ static void map_val_20_to_34(float par[PS_MAX_NR_IIDICC])
     par[ 7] =  par[ 4];
     par[ 6] =  par[ 4];
     par[ 5] =  par[ 3];
-    par[ 4] = (par[ 2] + par[ 3]) * 0.5f;
+    par[ 4] = AAC_HALF_SUM(par[ 2], par[ 3]);
     par[ 3] =  par[ 2];
     par[ 2] =  par[ 1];
-    par[ 1] = (par[ 0] + par[ 1]) * 0.5f;
-    par[ 0] =  par[ 0];
+    par[ 1] = AAC_HALF_SUM(par[ 0], par[ 1]);
 }
 
-static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[32][2], int is34)
+static void decorrelation(PSContext *ps, INTFLOAT (*out)[32][2], const INTFLOAT (*s)[32][2], int is34)
 {
-    LOCAL_ALIGNED_16(float, power, [34], [PS_QMF_TIME_SLOTS]);
-    LOCAL_ALIGNED_16(float, transient_gain, [34], [PS_QMF_TIME_SLOTS]);
-    float *peak_decay_nrg = ps->peak_decay_nrg;
-    float *power_smooth = ps->power_smooth;
-    float *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
-    float (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
-    float (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
-    const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
-    const float peak_decay_factor = 0.76592833836465f;
+    LOCAL_ALIGNED_16(INTFLOAT, power, [34], [PS_QMF_TIME_SLOTS]);
+    LOCAL_ALIGNED_16(INTFLOAT, transient_gain, [34], [PS_QMF_TIME_SLOTS]);
+    INTFLOAT *peak_decay_nrg = ps->peak_decay_nrg;
+    INTFLOAT *power_smooth = ps->power_smooth;
+    INTFLOAT *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
+    INTFLOAT (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
+    INTFLOAT (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
+#if !USE_FIXED
     const float transient_impact  = 1.5f;
     const float a_smooth          = 0.25f; ///< Smoothing coefficient
+#endif /* USE_FIXED */
+    const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
     int i, k, m, n;
     int n0 = 0, nL = 32;
+    const INTFLOAT peak_decay_factor = Q31(0.76592833836465f);
 
     memset(power, 0, 34 * sizeof(*power));
 
@@ -641,6 +693,24 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
     }
 
     //Transient detection
+#if USE_FIXED
+    for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
+        for (n = n0; n < nL; n++) {
+            int decayed_peak;
+            decayed_peak = (int)(((int64_t)peak_decay_factor * \
+                                           peak_decay_nrg[i] + 0x40000000) >> 31);
+            peak_decay_nrg[i] = FFMAX(decayed_peak, power[i][n]);
+            power_smooth[i] += (power[i][n] + 2LL - power_smooth[i]) >> 2;
+            peak_decay_diff_smooth[i] += (peak_decay_nrg[i] + 2LL - power[i][n] - \
+                                          peak_decay_diff_smooth[i]) >> 2;
+
+            if (peak_decay_diff_smooth[i]) {
+                transient_gain[i][n] = FFMIN(power_smooth[i]*43691LL / peak_decay_diff_smooth[i], 1<<16);
+            } else
+                transient_gain[i][n] = 1 << 16;
+        }
+    }
+#else
     for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
         for (n = n0; n < nL; n++) {
             float decayed_peak = peak_decay_factor * peak_decay_nrg[i];
@@ -654,6 +724,7 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
         }
     }
 
+#endif /* USE_FIXED */
     //Decorrelation and transient reduction
     //                         PS_AP_LINKS - 1
     //                               -----
@@ -664,8 +735,22 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
     //d[k][z] (out) = transient_gain_mapped[k][z] * H[k][z] * s[k][z]
     for (k = 0; k < NR_ALLPASS_BANDS[is34]; k++) {
         int b = k_to_i[k];
+#if USE_FIXED
+        int g_decay_slope;
+
+        if (k - DECAY_CUTOFF[is34] <= 0) {
+          g_decay_slope = 1 << 30;
+        }
+        else if (k - DECAY_CUTOFF[is34] >= 20) {
+          g_decay_slope = 0;
+        }
+        else {
+          g_decay_slope = (1 << 30) - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
+        }
+#else
         float g_decay_slope = 1.f - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
         g_decay_slope = av_clipf(g_decay_slope, 0.f, 1.f);
+#endif /* USE_FIXED */
         memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
         memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
         for (m = 0; m < PS_AP_LINKS; m++) {
@@ -673,7 +758,7 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
         }
         ps->dsp.decorrelate(out[k], delay[k] + PS_MAX_DELAY - 2, ap_delay[k],
                             phi_fract[is34][k],
-                            (const float (*)[2]) Q_fract_allpass[is34][k],
+                            (const INTFLOAT (*)[2]) Q_fract_allpass[is34][k],
                             transient_gain[b], g_decay_slope, nL - n0);
     }
     for (; k < SHORT_DELAY_BAND[is34]; k++) {
@@ -732,14 +817,14 @@ static void remap20(int8_t (**p_par_mapped)[PS_MAX_NR_IIDICC],
     }
 }
 
-static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2], int is34)
+static void stereo_processing(PSContext *ps, INTFLOAT (*l)[32][2], INTFLOAT (*r)[32][2], int is34)
 {
     int e, b, k;
 
-    float (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
-    float (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
-    float (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
-    float (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
+    INTFLOAT (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
+    INTFLOAT (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
+    INTFLOAT (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
+    INTFLOAT (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
     int8_t *opd_hist = ps->opd_hist;
     int8_t *ipd_hist = ps->ipd_hist;
     int8_t iid_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
@@ -751,7 +836,7 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     int8_t (*ipd_mapped)[PS_MAX_NR_IIDICC] = ipd_mapped_buf;
     int8_t (*opd_mapped)[PS_MAX_NR_IIDICC] = opd_mapped_buf;
     const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
-    TABLE_CONST float (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
+    TABLE_CONST INTFLOAT (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
 
     //Remapping
     if (ps->num_env_old) {
@@ -806,35 +891,36 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     //Mixing
     for (e = 0; e < ps->num_env; e++) {
         for (b = 0; b < NR_PAR_BANDS[is34]; b++) {
-            float h11, h12, h21, h22;
+            INTFLOAT h11, h12, h21, h22;
             h11 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][0];
             h12 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][1];
             h21 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][2];
             h22 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][3];
-            if (!PS_BASELINE && ps->enable_ipdopd && b < ps->nr_ipdopd_par) {
+
+            if (!PS_BASELINE && ps->enable_ipdopd && b < NR_IPDOPD_BANDS[is34]) {
                 //The spec say says to only run this smoother when enable_ipdopd
                 //is set but the reference decoder appears to run it constantly
-                float h11i, h12i, h21i, h22i;
-                float ipd_adj_re, ipd_adj_im;
+                INTFLOAT h11i, h12i, h21i, h22i;
+                INTFLOAT ipd_adj_re, ipd_adj_im;
                 int opd_idx = opd_hist[b] * 8 + opd_mapped[e][b];
                 int ipd_idx = ipd_hist[b] * 8 + ipd_mapped[e][b];
-                float opd_re = pd_re_smooth[opd_idx];
-                float opd_im = pd_im_smooth[opd_idx];
-                float ipd_re = pd_re_smooth[ipd_idx];
-                float ipd_im = pd_im_smooth[ipd_idx];
+                INTFLOAT opd_re = pd_re_smooth[opd_idx];
+                INTFLOAT opd_im = pd_im_smooth[opd_idx];
+                INTFLOAT ipd_re = pd_re_smooth[ipd_idx];
+                INTFLOAT ipd_im = pd_im_smooth[ipd_idx];
                 opd_hist[b] = opd_idx & 0x3F;
                 ipd_hist[b] = ipd_idx & 0x3F;
 
-                ipd_adj_re = opd_re*ipd_re + opd_im*ipd_im;
-                ipd_adj_im = opd_im*ipd_re - opd_re*ipd_im;
-                h11i = h11 * opd_im;
-                h11  = h11 * opd_re;
-                h12i = h12 * ipd_adj_im;
-                h12  = h12 * ipd_adj_re;
-                h21i = h21 * opd_im;
-                h21  = h21 * opd_re;
-                h22i = h22 * ipd_adj_im;
-                h22  = h22 * ipd_adj_re;
+                ipd_adj_re = AAC_MADD30(opd_re, ipd_re, opd_im, ipd_im);
+                ipd_adj_im = AAC_MSUB30(opd_im, ipd_re, opd_re, ipd_im);
+                h11i = AAC_MUL30(h11,  opd_im);
+                h11  = AAC_MUL30(h11,  opd_re);
+                h12i = AAC_MUL30(h12,  ipd_adj_im);
+                h12  = AAC_MUL30(h12,  ipd_adj_re);
+                h21i = AAC_MUL30(h21,  opd_im);
+                h21  = AAC_MUL30(h21,  opd_re);
+                h22i = AAC_MUL30(h22,  ipd_adj_im);
+                h22  = AAC_MUL30(h22,  ipd_adj_re);
                 H11[1][e+1][b] = h11i;
                 H12[1][e+1][b] = h12i;
                 H21[1][e+1][b] = h21i;
@@ -846,11 +932,14 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
             H22[0][e+1][b] = h22;
         }
         for (k = 0; k < NR_BANDS[is34]; k++) {
-            float h[2][4];
-            float h_step[2][4];
+            LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]);
+            LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]);
             int start = ps->border_position[e];
             int stop  = ps->border_position[e+1];
-            float width = 1.f / (stop - start);
+            INTFLOAT width = Q30(1.f) / ((stop - start) ? (stop - start) : 1);
+#if USE_FIXED
+            width = FFMIN(2U*width, INT_MAX);
+#endif
             b = k_to_i[k];
             h[0][0] = H11[0][e][b];
             h[0][1] = H12[0][e][b];
@@ -871,27 +960,28 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
             }
             }
             //Interpolation
-            h_step[0][0] = (H11[0][e+1][b] - h[0][0]) * width;
-            h_step[0][1] = (H12[0][e+1][b] - h[0][1]) * width;
-            h_step[0][2] = (H21[0][e+1][b] - h[0][2]) * width;
-            h_step[0][3] = (H22[0][e+1][b] - h[0][3]) * width;
+            h_step[0][0] = AAC_MSUB31_V3(H11[0][e+1][b], h[0][0], width);
+            h_step[0][1] = AAC_MSUB31_V3(H12[0][e+1][b], h[0][1], width);
+            h_step[0][2] = AAC_MSUB31_V3(H21[0][e+1][b], h[0][2], width);
+            h_step[0][3] = AAC_MSUB31_V3(H22[0][e+1][b], h[0][3], width);
             if (!PS_BASELINE && ps->enable_ipdopd) {
-                h_step[1][0] = (H11[1][e+1][b] - h[1][0]) * width;
-                h_step[1][1] = (H12[1][e+1][b] - h[1][1]) * width;
-                h_step[1][2] = (H21[1][e+1][b] - h[1][2]) * width;
-                h_step[1][3] = (H22[1][e+1][b] - h[1][3]) * width;
+                h_step[1][0] = AAC_MSUB31_V3(H11[1][e+1][b], h[1][0], width);
+                h_step[1][1] = AAC_MSUB31_V3(H12[1][e+1][b], h[1][1], width);
+                h_step[1][2] = AAC_MSUB31_V3(H21[1][e+1][b], h[1][2], width);
+                h_step[1][3] = AAC_MSUB31_V3(H22[1][e+1][b], h[1][3], width);
             }
-            ps->dsp.stereo_interpolate[!PS_BASELINE && ps->enable_ipdopd](
-                l[k] + start + 1, r[k] + start + 1,
-                h, h_step, stop - start);
+            if (stop - start)
+                ps->dsp.stereo_interpolate[!PS_BASELINE && ps->enable_ipdopd](
+                    l[k] + 1 + start, r[k] + 1 + start,
+                    h, h_step, stop - start);
         }
     }
 }
 
-int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float R[2][38][64], int top)
+int AAC_RENAME(ff_ps_apply)(AVCodecContext *avctx, PSContext *ps, INTFLOAT L[2][38][64], INTFLOAT R[2][38][64], int top)
 {
-    LOCAL_ALIGNED_16(float, Lbuf, [91], [32][2]);
-    LOCAL_ALIGNED_16(float, Rbuf, [91], [32][2]);
+    INTFLOAT (*Lbuf)[32][2] = ps->Lbuf;
+    INTFLOAT (*Rbuf)[32][2] = ps->Rbuf;
     const int len = 32;
     int is34 = ps->is34bands;
 
@@ -901,7 +991,7 @@ int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float
         memset(ps->ap_delay + top, 0, (NR_ALLPASS_BANDS[is34] - top)*sizeof(ps->ap_delay[0]));
 
     hybrid_analysis(&ps->dsp, Lbuf, ps->in_buf, L, is34, len);
-    decorrelation(ps, Rbuf, (const float (*)[32][2]) Lbuf, is34);
+    decorrelation(ps, Rbuf, (const INTFLOAT (*)[32][2]) Lbuf, is34);
     stereo_processing(ps, Lbuf, Rbuf, is34);
     hybrid_synthesis(&ps->dsp, L, Lbuf, is34, len);
     hybrid_synthesis(&ps->dsp, R, Rbuf, is34, len);
@@ -918,7 +1008,7 @@ int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float
 #define PS_VLC_ROW(name) \
     { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
 
-av_cold void ff_ps_init(void) {
+av_cold void AAC_RENAME(ff_ps_init)(void) {
     // Syntax initialization
     static const struct {
         const void *ps_codes, *ps_bits;
@@ -950,7 +1040,7 @@ av_cold void ff_ps_init(void) {
     ps_tableinit();
 }
 
-av_cold void ff_ps_ctx_init(PSContext *ps)
+av_cold void AAC_RENAME(ff_ps_ctx_init)(PSContext *ps)
 {
-    ff_psdsp_init(&ps->dsp);
+    AAC_RENAME(ff_psdsp_init)(&ps->dsp);
 }
diff --git a/libavcodec/aacps.h b/libavcodec/aacps.h
index e8a195a..61edce3 100644
--- a/libavcodec/aacps.h
+++ b/libavcodec/aacps.h
@@ -2,25 +2,25 @@
  * MPEG-4 Parametric Stereo definitions and declarations
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_PS_H
-#define AVCODEC_PS_H
+#ifndef AVCODEC_AACPS_H
+#define AVCODEC_AACPS_H
 
 #include <stdint.h>
 
@@ -61,24 +61,26 @@ typedef struct PSContext {
     int    is34bands;
     int    is34bands_old;
 
-    DECLARE_ALIGNED(16, float, in_buf)[5][44][2];
-    DECLARE_ALIGNED(16, float, delay)[PS_MAX_SSB][PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2];
-    DECLARE_ALIGNED(16, float, ap_delay)[PS_MAX_AP_BANDS][PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2];
-    DECLARE_ALIGNED(16, float, peak_decay_nrg)[34];
-    DECLARE_ALIGNED(16, float, power_smooth)[34];
-    DECLARE_ALIGNED(16, float, peak_decay_diff_smooth)[34];
-    DECLARE_ALIGNED(16, float, H11)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H12)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H21)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H22)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, in_buf)[5][44][2];
+    DECLARE_ALIGNED(16, INTFLOAT, delay)[PS_MAX_SSB][PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2];
+    DECLARE_ALIGNED(16, INTFLOAT, ap_delay)[PS_MAX_AP_BANDS][PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2];
+    DECLARE_ALIGNED(16, INTFLOAT, peak_decay_nrg)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, power_smooth)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, peak_decay_diff_smooth)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, H11)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H12)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H21)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H22)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, Lbuf)[91][32][2];
+    DECLARE_ALIGNED(16, INTFLOAT, Rbuf)[91][32][2];
     int8_t opd_hist[PS_MAX_NR_IIDICC];
     int8_t ipd_hist[PS_MAX_NR_IIDICC];
     PSDSPContext dsp;
 } PSContext;
 
-void ff_ps_init(void);
-void ff_ps_ctx_init(PSContext *ps);
-int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int bits_left);
-int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float R[2][38][64], int top);
+void AAC_RENAME(ff_ps_init)(void);
+void AAC_RENAME(ff_ps_ctx_init)(PSContext *ps);
+int AAC_RENAME(ff_ps_read_data)(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int bits_left);
+int AAC_RENAME(ff_ps_apply)(AVCodecContext *avctx, PSContext *ps, INTFLOAT L[2][38][64], INTFLOAT R[2][38][64], int top);
 
-#endif /* AVCODEC_PS_H */
+#endif /* AVCODEC_AACPS_H */
diff --git a/libavcodec/aacps_fixed.c b/libavcodec/aacps_fixed.c
new file mode 100644
index 0000000..46af213
--- /dev/null
+++ b/libavcodec/aacps_fixed.c
@@ -0,0 +1,24 @@
+/*
+ * MPEG-4 Parametric Stereo decoding functions
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+
+#include "aacps.c"
diff --git a/libavcodec/aac_tablegen.c b/libavcodec/aacps_fixed_tablegen.c
index b2c6c95..9e30699 100644
--- a/libavcodec/aac_tablegen.c
+++ b/libavcodec/aacps_fixed_tablegen.c
@@ -1,37 +1,24 @@
 /*
- * Generate a header file for hardcoded AAC tables
+ * Generate a header file for hardcoded Parametric Stereo tables
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "aac_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    ff_aac_tableinit();
-
-    write_fileheader();
-
-    WRITE_ARRAY("const", float, ff_aac_pow2sf_tab);
-
-    return 0;
-}
+#define USE_FIXED 1
+#include "aacps_tablegen_template.c"
diff --git a/libavcodec/aacps_fixed_tablegen.h b/libavcodec/aacps_fixed_tablegen.h
new file mode 100644
index 0000000..8b82deb
--- /dev/null
+++ b/libavcodec/aacps_fixed_tablegen.h
@@ -0,0 +1,403 @@
+/*
+ * Header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+
+#ifndef AVCODEC_AACPS_FIXED_TABLEGEN_H
+#define AVCODEC_AACPS_FIXED_TABLEGEN_H
+
+#include <math.h>
+#include <stdint.h>
+
+#if CONFIG_HARDCODED_TABLES
+#define ps_tableinit()
+#define TABLE_CONST const
+#include "libavcodec/aacps_fixed_tables.h"
+#else
+#include "libavutil/common.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem.h"
+
+#include "aac_defines.h"
+#include "libavutil/softfloat.h"
+#define NR_ALLPASS_BANDS20 30
+#define NR_ALLPASS_BANDS34 50
+#define PS_AP_LINKS 3
+#define TABLE_CONST
+static int pd_re_smooth[8*8*8];
+static int pd_im_smooth[8*8*8];
+static int HA[46][8][4];
+static int HB[46][8][4];
+static DECLARE_ALIGNED(16, int, f20_0_8) [ 8][8][2];
+static DECLARE_ALIGNED(16, int, f34_0_12)[12][8][2];
+static DECLARE_ALIGNED(16, int, f34_1_8) [ 8][8][2];
+static DECLARE_ALIGNED(16, int, f34_2_4) [ 4][8][2];
+static TABLE_CONST DECLARE_ALIGNED(16, int, Q_fract_allpass)[2][50][3][2];
+static DECLARE_ALIGNED(16, int, phi_fract)[2][50][2];
+
+static const int g0_Q8[] = {
+    Q31(0.00746082949812f), Q31(0.02270420949825f), Q31(0.04546865930473f), Q31(0.07266113929591f),
+    Q31(0.09885108575264f), Q31(0.11793710567217f), Q31(0.125f)
+};
+
+static const int g0_Q12[] = {
+    Q31(0.04081179924692f), Q31(0.03812810994926f), Q31(0.05144908135699f), Q31(0.06399831151592f),
+    Q31(0.07428313801106f), Q31(0.08100347892914f), Q31(0.08333333333333f)
+};
+
+static const int g1_Q8[] = {
+    Q31(0.01565675600122f), Q31(0.03752716391991f), Q31(0.05417891378782f), Q31(0.08417044116767f),
+    Q31(0.10307344158036f), Q31(0.12222452249753f), Q31(0.125f)
+};
+
+static const int g2_Q4[] = {
+    Q31(-0.05908211155639f), Q31(-0.04871498374946f), Q31(0.0f),   Q31(0.07778723915851f),
+    Q31( 0.16486303567403f), Q31( 0.23279856662996f), Q31(0.25f)
+};
+
+static const int sintbl_4[4]   = {           0,  1073741824,           0, -1073741824 };
+static const int costbl_4[4]   = {  1073741824,           0, -1073741824,           0 };
+static const int sintbl_8[8]   = {           0,   759250125,  1073741824,   759250125,
+                                             0,  -759250125, -1073741824,  -759250125 };
+static const int costbl_8[8]   = {  1073741824,   759250125,           0,  -759250125,
+                                   -1073741824,  -759250125,           0,   759250125 };
+static const int sintbl_12[12] = {           0,   536870912,   929887697,  1073741824,
+                                     929887697,   536870912,           0,  -536870912,
+                                    -929887697, -1073741824,  -929887697,  -536870912 };
+static const int costbl_12[12] = {  1073741824,   929887697,   536870912,           0,
+                                    -536870912,  -929887697, -1073741824,  -929887697,
+                                    -536870912,           0,   536870912,   929887697 };
+
+static void make_filters_from_proto(int (*filter)[8][2], const int *proto, int bands)
+{
+
+    const int *sinptr, *cosptr;
+    int s, c, sinhalf, coshalf;
+    int q, n;
+
+    if (bands == 4) {
+        sinptr = sintbl_4;
+        cosptr = costbl_4;
+        sinhalf = 759250125;
+        coshalf = 759250125;
+    } else if (bands == 8) {
+        sinptr = sintbl_8;
+        cosptr = costbl_8;
+        sinhalf = 410903207;
+        coshalf = 992008094;
+    } else {
+        sinptr = sintbl_12;
+        cosptr = costbl_12;
+        sinhalf = 277904834;
+        coshalf = 1037154959;
+    }
+
+    for (q = 0; q < bands; q++) {
+        for (n = 0; n < 7; n++) {
+            int theta = (q*(n-6) + (n>>1) - 3) % bands;
+
+            if (theta < 0)
+                theta += bands;
+            s = sinptr[theta];
+            c = cosptr[theta];
+
+            if (n & 1) {
+                theta = (int)(((int64_t)c * coshalf - (int64_t)s * sinhalf + 0x20000000) >> 30);
+                s = (int)(((int64_t)s * coshalf + (int64_t)c * sinhalf + 0x20000000) >> 30);
+                c = theta;
+            }
+            filter[q][n][0] = (int)(((int64_t)proto[n] * c + 0x20000000) >> 30);
+            filter[q][n][1] = -(int)(((int64_t)proto[n] * s + 0x20000000) >> 30);
+        }
+    }
+}
+
+static void ps_tableinit(void)
+{
+    static const int ipdopd_sin[] = { Q30(0), Q30(M_SQRT1_2), Q30(1), Q30( M_SQRT1_2), Q30( 0), Q30(-M_SQRT1_2), Q30(-1), Q30(-M_SQRT1_2) };
+    static const int ipdopd_cos[] = { Q30(1), Q30(M_SQRT1_2), Q30(0), Q30(-M_SQRT1_2), Q30(-1), Q30(-M_SQRT1_2), Q30( 0), Q30( M_SQRT1_2) };
+    int pd0, pd1, pd2;
+    int idx;
+
+    static const int alpha_tab[] =
+    {
+      Q30(1.5146213770f/M_PI), Q30(1.5181334019f/M_PI), Q30(1.5234849453f/M_PI), Q30(1.5369486809f/M_PI), Q30(1.5500687361f/M_PI), Q30(1.5679757595f/M_PI),
+      Q30(1.4455626011f/M_PI), Q30(1.4531552792f/M_PI), Q30(1.4648091793f/M_PI), Q30(1.4945238829f/M_PI), Q30(1.5239057541f/M_PI), Q30(1.5644006729f/M_PI),
+      Q30(1.3738563061f/M_PI), Q30(1.3851221800f/M_PI), Q30(1.4026404619f/M_PI), Q30(1.4484288692f/M_PI), Q30(1.4949874878f/M_PI), Q30(1.5604078770f/M_PI),
+      Q30(1.2645189762f/M_PI), Q30(1.2796478271f/M_PI), Q30(1.3038636446f/M_PI), Q30(1.3710125685f/M_PI), Q30(1.4443849325f/M_PI), Q30(1.5532352924f/M_PI),
+      Q30(1.1507037878f/M_PI), Q30(1.1669205427f/M_PI), Q30(1.1938756704f/M_PI), Q30(1.2754167318f/M_PI), Q30(1.3761177063f/M_PI), Q30(1.5429240465f/M_PI),
+      Q30(1.0079245567f/M_PI), Q30(1.0208238363f/M_PI), Q30(1.0433073044f/M_PI), Q30(1.1208510399f/M_PI), Q30(1.2424604893f/M_PI), Q30(1.5185726881f/M_PI),
+      Q30(0.8995233774f/M_PI), Q30(0.9069069624f/M_PI), Q30(0.9201194048f/M_PI), Q30(0.9698365927f/M_PI), Q30(1.0671583414f/M_PI), Q30(1.4647934437f/M_PI),
+      Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI),
+      Q30(0.6712729335f/M_PI), Q30(0.6638893485f/M_PI), Q30(0.6506769061f/M_PI), Q30(0.6009597182f/M_PI), Q30(0.5036380291f/M_PI), Q30(0.1060028747f/M_PI),
+      Q30(0.5628717542f/M_PI), Q30(0.5499725342f/M_PI), Q30(0.5274890065f/M_PI), Q30(0.4499453008f/M_PI), Q30(0.3283358216f/M_PI), Q30(0.0522236861f/M_PI),
+      Q30(0.4200925827f/M_PI), Q30(0.4038758278f/M_PI), Q30(0.3769206405f/M_PI), Q30(0.2953795493f/M_PI), Q30(0.1946786791f/M_PI), Q30(0.0278722942f/M_PI),
+      Q30(0.3062773645f/M_PI), Q30(0.2911485136f/M_PI), Q30(0.2669326365f/M_PI), Q30(0.1997837722f/M_PI), Q30(0.1264114529f/M_PI), Q30(0.0175609849f/M_PI),
+      Q30(0.1969399750f/M_PI), Q30(0.1856741160f/M_PI), Q30(0.1681558639f/M_PI), Q30(0.1223674342f/M_PI), Q30(0.0758088827f/M_PI), Q30(0.0103884479f/M_PI),
+      Q30(0.1252337098f/M_PI), Q30(0.1176410317f/M_PI), Q30(0.1059871912f/M_PI), Q30(0.0762724727f/M_PI), Q30(0.0468905345f/M_PI), Q30(0.0063956482f/M_PI),
+      Q30(0.0561749674f/M_PI), Q30(0.0526629239f/M_PI), Q30(0.0473113805f/M_PI), Q30(0.0338476151f/M_PI), Q30(0.0207276177f/M_PI), Q30(0.0028205961f/M_PI),
+      Q30(1.5676341057f/M_PI), Q30(1.5678333044f/M_PI), Q30(1.5681363344f/M_PI), Q30(1.5688960552f/M_PI), Q30(1.5696337223f/M_PI), Q30(1.5706381798f/M_PI),
+      Q30(1.5651730299f/M_PI), Q30(1.5655272007f/M_PI), Q30(1.5660660267f/M_PI), Q30(1.5674170256f/M_PI), Q30(1.5687289238f/M_PI), Q30(1.5705151558f/M_PI),
+      Q30(1.5607966185f/M_PI), Q30(1.5614265203f/M_PI), Q30(1.5623844862f/M_PI), Q30(1.5647867918f/M_PI), Q30(1.5671195984f/M_PI), Q30(1.5702962875f/M_PI),
+      Q30(1.5530153513f/M_PI), Q30(1.5541347265f/M_PI), Q30(1.5558375120f/M_PI), Q30(1.5601085424f/M_PI), Q30(1.5642569065f/M_PI), Q30(1.5699069500f/M_PI),
+      Q30(1.5391840935f/M_PI), Q30(1.5411708355f/M_PI), Q30(1.5441943407f/M_PI), Q30(1.5517836809f/M_PI), Q30(1.5591609478f/M_PI), Q30(1.5692136288f/M_PI),
+      Q30(1.5146213770f/M_PI), Q30(1.5181334019f/M_PI), Q30(1.5234849453f/M_PI), Q30(1.5369486809f/M_PI), Q30(1.5500687361f/M_PI), Q30(1.5679757595f/M_PI),
+      Q30(1.4915299416f/M_PI), Q30(1.4964480400f/M_PI), Q30(1.5039558411f/M_PI), Q30(1.5229074955f/M_PI), Q30(1.5414420366f/M_PI), Q30(1.5667995214f/M_PI),
+      Q30(1.4590617418f/M_PI), Q30(1.4658898115f/M_PI), Q30(1.4763505459f/M_PI), Q30(1.5029321909f/M_PI), Q30(1.5291173458f/M_PI), Q30(1.5651149750f/M_PI),
+      Q30(1.4136143923f/M_PI), Q30(1.4229322672f/M_PI), Q30(1.4373078346f/M_PI), Q30(1.4743183851f/M_PI), Q30(1.5113102198f/M_PI), Q30(1.5626684427f/M_PI),
+      Q30(1.3505556583f/M_PI), Q30(1.3628427982f/M_PI), Q30(1.3820509911f/M_PI), Q30(1.4327841997f/M_PI), Q30(1.4850014448f/M_PI), Q30(1.5590143204f/M_PI),
+      Q30(1.2645189762f/M_PI), Q30(1.2796478271f/M_PI), Q30(1.3038636446f/M_PI), Q30(1.3710125685f/M_PI), Q30(1.4443849325f/M_PI), Q30(1.5532352924f/M_PI),
+      Q30(1.1919227839f/M_PI), Q30(1.2081253529f/M_PI), Q30(1.2346779108f/M_PI), Q30(1.3123005629f/M_PI), Q30(1.4034168720f/M_PI), Q30(1.5471596718f/M_PI),
+      Q30(1.1061993837f/M_PI), Q30(1.1219338179f/M_PI), Q30(1.1484941244f/M_PI), Q30(1.2320860624f/M_PI), Q30(1.3421301842f/M_PI), Q30(1.5373806953f/M_PI),
+      Q30(1.0079245567f/M_PI), Q30(1.0208238363f/M_PI), Q30(1.0433073044f/M_PI), Q30(1.1208510399f/M_PI), Q30(1.2424604893f/M_PI), Q30(1.5185726881f/M_PI),
+      Q30(0.8995233774f/M_PI), Q30(0.9069069624f/M_PI), Q30(0.9201194048f/M_PI), Q30(0.9698365927f/M_PI), Q30(1.0671583414f/M_PI), Q30(1.4647934437f/M_PI),
+      Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI),
+      Q30(0.6712729335f/M_PI), Q30(0.6638893485f/M_PI), Q30(0.6506769061f/M_PI), Q30(0.6009597182f/M_PI), Q30(0.5036380291f/M_PI), Q30(0.1060028747f/M_PI),
+      Q30(0.5628717542f/M_PI), Q30(0.5499725342f/M_PI), Q30(0.5274890065f/M_PI), Q30(0.4499453008f/M_PI), Q30(0.3283358216f/M_PI), Q30(0.0522236861f/M_PI),
+      Q30(0.4645969570f/M_PI), Q30(0.4488625824f/M_PI), Q30(0.4223022461f/M_PI), Q30(0.3387103081f/M_PI), Q30(0.2286661267f/M_PI), Q30(0.0334156826f/M_PI),
+      Q30(0.3788735867f/M_PI), Q30(0.3626709878f/M_PI), Q30(0.3361184299f/M_PI), Q30(0.2584958076f/M_PI), Q30(0.1673794836f/M_PI), Q30(0.0236366931f/M_PI),
+      Q30(0.3062773645f/M_PI), Q30(0.2911485136f/M_PI), Q30(0.2669326365f/M_PI), Q30(0.1997837722f/M_PI), Q30(0.1264114529f/M_PI), Q30(0.0175609849f/M_PI),
+      Q30(0.2202406377f/M_PI), Q30(0.2079535723f/M_PI), Q30(0.1887452900f/M_PI), Q30(0.1380121708f/M_PI), Q30(0.0857949182f/M_PI), Q30(0.0117820343f/M_PI),
+      Q30(0.1571819335f/M_PI), Q30(0.1478640437f/M_PI), Q30(0.1334884763f/M_PI), Q30(0.0964778885f/M_PI), Q30(0.0594860613f/M_PI), Q30(0.0081279324f/M_PI),
+      Q30(0.1117345318f/M_PI), Q30(0.1049065739f/M_PI), Q30(0.0944457650f/M_PI), Q30(0.0678641573f/M_PI), Q30(0.0416790098f/M_PI), Q30(0.0056813755f/M_PI),
+      Q30(0.0792663917f/M_PI), Q30(0.0743482932f/M_PI), Q30(0.0668405443f/M_PI), Q30(0.0478888862f/M_PI), Q30(0.0293543357f/M_PI), Q30(0.0039967746f/M_PI),
+      Q30(0.0561749674f/M_PI), Q30(0.0526629239f/M_PI), Q30(0.0473113805f/M_PI), Q30(0.0338476151f/M_PI), Q30(0.0207276177f/M_PI), Q30(0.0028205961f/M_PI),
+      Q30(0.0316122435f/M_PI), Q30(0.0296254847f/M_PI), Q30(0.0266019460f/M_PI), Q30(0.0190126132f/M_PI), Q30(0.0116353342f/M_PI), Q30(0.0015827164f/M_PI),
+      Q30(0.0177809205f/M_PI), Q30(0.0166615788f/M_PI), Q30(0.0149587989f/M_PI), Q30(0.0106877899f/M_PI), Q30(0.0065393616f/M_PI), Q30(0.0008894200f/M_PI),
+      Q30(0.0099996664f/M_PI), Q30(0.0093698399f/M_PI), Q30(0.0084118480f/M_PI), Q30(0.0060095116f/M_PI), Q30(0.0036767013f/M_PI), Q30(0.0005000498f/M_PI),
+      Q30(0.0056233541f/M_PI), Q30(0.0052691097f/M_PI), Q30(0.0047303112f/M_PI), Q30(0.0033792770f/M_PI), Q30(0.0020674451f/M_PI), Q30(0.0002811795f/M_PI),
+      Q30(0.0031622672f/M_PI), Q30(0.0029630491f/M_PI), Q30(0.0026600463f/M_PI), Q30(0.0019002859f/M_PI), Q30(0.0011625893f/M_PI), Q30(0.0001581155f/M_PI)
+    };
+
+    static const int gamma_tab[] =
+    {
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0433459543f/M_PI), Q30(0.0672172382f/M_PI), Q30(0.0997167900f/M_PI), Q30(0.1162951663f/M_PI), Q30(0.1250736862f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0672341362f/M_PI), Q30(0.1045235619f/M_PI), Q30(0.1558904350f/M_PI), Q30(0.1824723780f/M_PI), Q30(0.1966800541f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1315985769f/M_PI), Q30(0.2072522491f/M_PI), Q30(0.3188187480f/M_PI), Q30(0.3825501204f/M_PI), Q30(0.4193951190f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1784276664f/M_PI), Q30(0.2856673002f/M_PI), Q30(0.4630723596f/M_PI), Q30(0.5971632004f/M_PI), Q30(0.7603877187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1315985769f/M_PI), Q30(0.2072522491f/M_PI), Q30(0.3188187480f/M_PI), Q30(0.3825501204f/M_PI), Q30(0.4193951190f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0672341362f/M_PI), Q30(0.1045235619f/M_PI), Q30(0.1558904350f/M_PI), Q30(0.1824723780f/M_PI), Q30(0.1966800541f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0433459543f/M_PI), Q30(0.0672172382f/M_PI), Q30(0.0997167900f/M_PI), Q30(0.1162951663f/M_PI), Q30(0.1250736862f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0011053939f/M_PI), Q30(0.0017089852f/M_PI), Q30(0.0025254129f/M_PI), Q30(0.0029398468f/M_PI), Q30(0.0031597170f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0019607407f/M_PI), Q30(0.0030395309f/M_PI), Q30(0.0044951206f/M_PI), Q30(0.0052305623f/M_PI), Q30(0.0056152637f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0034913034f/M_PI), Q30(0.0054070661f/M_PI), Q30(0.0079917293f/M_PI), Q30(0.0092999367f/M_PI), Q30(0.0099875759f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0062100487f/M_PI), Q30(0.0096135242f/M_PI), Q30(0.0142110568f/M_PI), Q30(0.0165348612f/M_PI), Q30(0.0177587029f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0110366223f/M_PI), Q30(0.0170863140f/M_PI), Q30(0.0252620988f/M_PI), Q30(0.0293955617f/M_PI), Q30(0.0315726399f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0275881495f/M_PI), Q30(0.0427365713f/M_PI), Q30(0.0632618815f/M_PI), Q30(0.0736731067f/M_PI), Q30(0.0791663304f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0387469754f/M_PI), Q30(0.0600636788f/M_PI), Q30(0.0890387669f/M_PI), Q30(0.1037906483f/M_PI), Q30(0.1115923747f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0541138873f/M_PI), Q30(0.0839984417f/M_PI), Q30(0.1248718798f/M_PI), Q30(0.1458375156f/M_PI), Q30(0.1569785923f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0747506917f/M_PI), Q30(0.1163287833f/M_PI), Q30(0.1738867164f/M_PI), Q30(0.2038587779f/M_PI), Q30(0.2199459076f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1212290376f/M_PI), Q30(0.1903949380f/M_PI), Q30(0.2907958031f/M_PI), Q30(0.3466993868f/M_PI), Q30(0.3782821596f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1418247074f/M_PI), Q30(0.2240308374f/M_PI), Q30(0.3474813402f/M_PI), Q30(0.4202919006f/M_PI), Q30(0.4637607038f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1784276664f/M_PI), Q30(0.2856673002f/M_PI), Q30(0.4630723596f/M_PI), Q30(0.5971632004f/M_PI), Q30(0.7603877187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1418247074f/M_PI), Q30(0.2240308374f/M_PI), Q30(0.3474813402f/M_PI), Q30(0.4202919006f/M_PI), Q30(0.4637607038f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1212290376f/M_PI), Q30(0.1903949380f/M_PI), Q30(0.2907958031f/M_PI), Q30(0.3466993868f/M_PI), Q30(0.3782821596f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0747506917f/M_PI), Q30(0.1163287833f/M_PI), Q30(0.1738867164f/M_PI), Q30(0.2038587779f/M_PI), Q30(0.2199459076f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0541138873f/M_PI), Q30(0.0839984417f/M_PI), Q30(0.1248718798f/M_PI), Q30(0.1458375156f/M_PI), Q30(0.1569785923f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0387469754f/M_PI), Q30(0.0600636788f/M_PI), Q30(0.0890387669f/M_PI), Q30(0.1037906483f/M_PI), Q30(0.1115923747f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0275881495f/M_PI), Q30(0.0427365713f/M_PI), Q30(0.0632618815f/M_PI), Q30(0.0736731067f/M_PI), Q30(0.0791663304f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0110366223f/M_PI), Q30(0.0170863140f/M_PI), Q30(0.0252620988f/M_PI), Q30(0.0293955617f/M_PI), Q30(0.0315726399f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0062100487f/M_PI), Q30(0.0096135242f/M_PI), Q30(0.0142110568f/M_PI), Q30(0.0165348612f/M_PI), Q30(0.0177587029f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0034913034f/M_PI), Q30(0.0054070661f/M_PI), Q30(0.0079917293f/M_PI), Q30(0.0092999367f/M_PI), Q30(0.0099875759f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0019607407f/M_PI), Q30(0.0030395309f/M_PI), Q30(0.0044951206f/M_PI), Q30(0.0052305623f/M_PI), Q30(0.0056152637f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0011053939f/M_PI), Q30(0.0017089852f/M_PI), Q30(0.0025254129f/M_PI), Q30(0.0029398468f/M_PI), Q30(0.0031597170f/M_PI)
+    };
+
+    static const int iid_par_dequant_c1[] = {
+        //iid_par_dequant_default
+        Q30(1.41198278375959f), Q30(1.40313815268360f), Q30(1.38687670404960f), Q30(1.34839972492648f),
+        Q30(1.29124937110028f), Q30(1.19603741667993f), Q30(1.10737240362323f), Q30(1),
+        Q30(0.87961716655242f), Q30(0.75464859232732f), Q30(0.57677990744575f), Q30(0.42640143271122f),
+        Q30(0.27671828230984f), Q30(0.17664462766713f), Q30(0.07940162697653f),
+        //iid_par_dequant_fine
+        Q30(1.41420649135832f), Q30(1.41419120222364f), Q30(1.41414285699784f), Q30(1.41399000859438f),
+        Q30(1.41350698548044f), Q30(1.41198278375959f), Q30(1.40977302262355f), Q30(1.40539479488545f),
+        Q30(1.39677960498402f), Q30(1.38005309967827f), Q30(1.34839972492648f), Q30(1.31392017367631f),
+        Q30(1.26431008149654f), Q30(1.19603741667993f), Q30(1.10737240362323f), Q30(1),
+        Q30(0.87961716655242f), Q30(0.75464859232732f), Q30(0.63365607219232f), Q30(0.52308104267543f),
+        Q30(0.42640143271122f), Q30(0.30895540465965f), Q30(0.22137464873077f), Q30(0.15768788954414f),
+        Q30(0.11198225164225f), Q30(0.07940162697653f), Q30(0.04469901562677f), Q30(0.02514469318284f),
+        Q30(0.01414142856998f), Q30(0.00795258154731f), Q30(0.00447211359449f),
+    };
+
+    static const int acos_icc_invq[] = {
+        Q31(0), Q31(0.178427635f/M_PI), Q31(0.28566733f/M_PI), Q31(0.46307236f/M_PI), Q31(0.59716315f/M_PI), Q31(0.78539816f/M_PI), Q31(1.10030855f/M_PI), Q31(1.57079633f/M_PI)
+    };
+    int iid, icc;
+
+    int k, m;
+    static const int8_t f_center_20[] = {
+        -3, -1, 1, 3, 5, 7, 10, 14, 18, 22,
+    };
+    static const int32_t f_center_34[] = {
+      Q31(  2/768.0),Q31(  6/768.0),Q31(10/768.0),Q31(14/768.0),Q31( 18/768.0),Q31( 22/768.0),Q31( 26/768.0),Q31(30/768.0),
+      Q31( 34/768.0),Q31(-10/768.0),Q31(-6/768.0),Q31(-2/768.0),Q31( 51/768.0),Q31( 57/768.0),Q31( 15/768.0),Q31(21/768.0),
+      Q31( 27/768.0),Q31( 33/768.0),Q31(39/768.0),Q31(45/768.0),Q31( 54/768.0),Q31( 66/768.0),Q31( 78/768.0),Q31(42/768.0),
+      Q31(102/768.0),Q31( 66/768.0),Q31(78/768.0),Q31(90/768.0),Q31(102/768.0),Q31(114/768.0),Q31(126/768.0),Q31(90/768.0)
+    };
+    static const int fractional_delay_links[] = { Q31(0.43f), Q31(0.75f), Q31(0.347f) };
+    const int fractional_delay_gain = Q31(0.39f);
+
+    for (pd0 = 0; pd0 < 8; pd0++) {
+        int pd0_re = (ipdopd_cos[pd0]+2)>>2;
+        int pd0_im = (ipdopd_sin[pd0]+2)>>2;
+        for (pd1 = 0; pd1 < 8; pd1++) {
+            int pd1_re = ipdopd_cos[pd1] >> 1;
+            int pd1_im = ipdopd_sin[pd1] >> 1;
+            for (pd2 = 0; pd2 < 8; pd2++) {
+                int shift, round;
+                int pd2_re = ipdopd_cos[pd2];
+                int pd2_im = ipdopd_sin[pd2];
+                int re_smooth = pd0_re + pd1_re + pd2_re;
+                int im_smooth = pd0_im + pd1_im + pd2_im;
+
+                SoftFloat pd_mag = av_int2sf(((ipdopd_cos[(pd0-pd1)&7]+8)>>4) + ((ipdopd_cos[(pd0-pd2)&7]+4)>>3) +
+                                               ((ipdopd_cos[(pd1-pd2)&7]+2)>>2) + 0x15000000, 28);
+                pd_mag = av_div_sf(FLOAT_1, av_sqrt_sf(pd_mag));
+                shift = 30 - pd_mag.exp;
+                round = 1 << (shift-1);
+                pd_re_smooth[pd0*64+pd1*8+pd2] = (int)(((int64_t)re_smooth * pd_mag.mant + round) >> shift);
+                pd_im_smooth[pd0*64+pd1*8+pd2] = (int)(((int64_t)im_smooth * pd_mag.mant + round) >> shift);
+            }
+        }
+    }
+
+    idx = 0;
+    for (iid = 0; iid < 46; iid++) {
+        int c1, c2;
+
+        c1 = iid_par_dequant_c1[iid];
+        if (iid < 15)
+          c2 = iid_par_dequant_c1[14-iid];
+        else
+          c2 = iid_par_dequant_c1[60-iid];
+
+        for (icc = 0; icc < 8; icc++) {
+            /*if (PS_BASELINE || ps->icc_mode < 3)*/{
+                int alpha, beta;
+                int ca, sa, cb, sb;
+
+                alpha = acos_icc_invq[icc];
+                beta = (int)(((int64_t)alpha * 1518500250 + 0x40000000) >> 31);
+                alpha >>= 1;
+                beta = (int)(((int64_t)beta * (c1 - c2) + 0x40000000) >> 31);
+                av_sincos_sf(beta + alpha, &sa, &ca);
+                av_sincos_sf(beta - alpha, &sb, &cb);
+
+                HA[iid][icc][0] = (int)(((int64_t)c2 * ca + 0x20000000) >> 30);
+                HA[iid][icc][1] = (int)(((int64_t)c1 * cb + 0x20000000) >> 30);
+                HA[iid][icc][2] = (int)(((int64_t)c2 * sa + 0x20000000) >> 30);
+                HA[iid][icc][3] = (int)(((int64_t)c1 * sb + 0x20000000) >> 30);
+            } /* else */ {
+                int alpha_int, gamma_int;
+                int alpha_c_int, alpha_s_int, gamma_c_int, gamma_s_int;
+
+                alpha_int = alpha_tab[idx];
+                gamma_int = gamma_tab[idx];
+
+                av_sincos_sf(alpha_int, &alpha_s_int, &alpha_c_int);
+                av_sincos_sf(gamma_int, &gamma_s_int, &gamma_c_int);
+
+                alpha_c_int = (int)(((int64_t)alpha_c_int * 1518500250 + 0x20000000) >> 30);
+                alpha_s_int = (int)(((int64_t)alpha_s_int * 1518500250 + 0x20000000) >> 30);
+
+                HB[iid][icc][0] = (int)(((int64_t)alpha_c_int * gamma_c_int + 0x20000000) >> 30);
+                HB[iid][icc][1] = (int)(((int64_t)alpha_s_int * gamma_c_int + 0x20000000) >> 30);
+                HB[iid][icc][2] = -(int)(((int64_t)alpha_s_int * gamma_s_int + 0x20000000) >> 30);
+                HB[iid][icc][3] = (int)(((int64_t)alpha_c_int * gamma_s_int + 0x20000000) >> 30);
+            }
+
+            if (icc < 5 || icc > 6)
+              idx++;
+        }
+    }
+
+    for (k = 0; k < NR_ALLPASS_BANDS20; k++) {
+        int theta;
+        int64_t f_center;
+        int c, s;
+
+        if (k < FF_ARRAY_ELEMS(f_center_20))
+          f_center = f_center_20[k];
+        else
+          f_center = (k << 3) - 52;
+
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            theta = (int)(((int64_t)fractional_delay_links[m] * f_center + 8) >> 4);
+            av_sincos_sf(-theta, &s, &c);
+            Q_fract_allpass[0][k][m][0] = c;
+            Q_fract_allpass[0][k][m][1] = s;
+        }
+
+        theta = (int)(((int64_t)fractional_delay_gain * f_center + 8) >> 4);
+        av_sincos_sf(-theta, &s, &c);
+        phi_fract[0][k][0] = c;
+        phi_fract[0][k][1] = s;
+    }
+
+    for (k = 0; k < NR_ALLPASS_BANDS34; k++) {
+        int theta, f_center;
+        int c, s;
+
+        if (k < FF_ARRAY_ELEMS(f_center_34))
+            f_center = f_center_34[k];
+        else
+            f_center = ((int64_t)k << 26) - (53 << 25);
+
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            theta = (int)(((int64_t)fractional_delay_links[m] * f_center + 0x10000000) >> 27);
+            av_sincos_sf(-theta, &s, &c);
+            Q_fract_allpass[1][k][m][0] = c;
+            Q_fract_allpass[1][k][m][1] = s;
+        }
+
+        theta = (int)(((int64_t)fractional_delay_gain * f_center + 0x10000000) >> 27);
+        av_sincos_sf(-theta, &s, &c);
+        phi_fract[1][k][0] = c;
+        phi_fract[1][k][1] = s;
+    }
+
+    make_filters_from_proto(f20_0_8,  g0_Q8,   8);
+    make_filters_from_proto(f34_0_12, g0_Q12, 12);
+    make_filters_from_proto(f34_1_8,  g1_Q8,   8);
+    make_filters_from_proto(f34_2_4,  g2_Q4,   4);
+}
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_AACPS_FIXED_TABLEGEN_H */
diff --git a/libavcodec/aacps_float.c b/libavcodec/aacps_float.c
new file mode 100644
index 0000000..73259c1
--- /dev/null
+++ b/libavcodec/aacps_float.c
@@ -0,0 +1,24 @@
+/*
+ * MPEG-4 Parametric Stereo decoding functions
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 0
+
+#include "aacps.c"
diff --git a/libavcodec/aacps_tablegen.c b/libavcodec/aacps_tablegen.c
index 537b6ba..26a6752 100644
--- a/libavcodec/aacps_tablegen.c
+++ b/libavcodec/aacps_tablegen.c
@@ -3,91 +3,22 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "aacps_tablegen.h"
-#include "tableprint.h"
-
-void write_float_3d_array (const void *p, int b, int c, int d)
-{
-    int i;
-    const float *f = p;
-    for (i = 0; i < b; i++) {
-        printf("{\n");
-        write_float_2d_array(f, c, d);
-        printf("},\n");
-        f += c * d;
-    }
-}
-
-void write_float_4d_array (const void *p, int a, int b, int c, int d)
-{
-    int i;
-    const float *f = p;
-    for (i = 0; i < a; i++) {
-        printf("{\n");
-        write_float_3d_array(f, b, c, d);
-        printf("},\n");
-        f += b * c * d;
-    }
-}
-
-int main(void)
-{
-    ps_tableinit();
-
-    write_fileheader();
-
-    printf("static const float pd_re_smooth[8*8*8] = {\n");
-    write_float_array(pd_re_smooth, 8*8*8);
-    printf("};\n");
-    printf("static const float pd_im_smooth[8*8*8] = {\n");
-    write_float_array(pd_im_smooth, 8*8*8);
-    printf("};\n");
-
-    printf("static const float HA[46][8][4] = {\n");
-    write_float_3d_array(HA, 46, 8, 4);
-    printf("};\n");
-    printf("static const float HB[46][8][4] = {\n");
-    write_float_3d_array(HB, 46, 8, 4);
-    printf("};\n");
-
-    printf("static const DECLARE_ALIGNED(16, float, f20_0_8)[8][8][2] = {\n");
-    write_float_3d_array(f20_0_8, 8, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_0_12)[12][8][2] = {\n");
-    write_float_3d_array(f34_0_12, 12, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_1_8)[8][8][2] = {\n");
-    write_float_3d_array(f34_1_8, 8, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_2_4)[4][8][2] = {\n");
-    write_float_3d_array(f34_2_4, 4, 8, 2);
-    printf("};\n");
-
-    printf("static TABLE_CONST DECLARE_ALIGNED(16, float, Q_fract_allpass)[2][50][3][2] = {\n");
-    write_float_4d_array(Q_fract_allpass, 2, 50, 3, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, phi_fract)[2][50][2] = {\n");
-    write_float_3d_array(phi_fract, 2, 50, 2);
-    printf("};\n");
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "aacps_tablegen_template.c"
diff --git a/libavcodec/aacps_tablegen.h b/libavcodec/aacps_tablegen.h
index a53f9fa..0ac4f68 100644
--- a/libavcodec/aacps_tablegen.h
+++ b/libavcodec/aacps_tablegen.h
@@ -3,25 +3,25 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AACPS_TABLEGEN_H
-#define AACPS_TABLEGEN_H
+#ifndef AVCODEC_AACPS_TABLEGEN_H
+#define AVCODEC_AACPS_TABLEGEN_H
 
 #include <math.h>
 #include <stdint.h>
@@ -70,7 +70,7 @@ static const float g2_Q4[] = {
      0.16486303567403f,  0.23279856662996f, 0.25f
 };
 
-static void make_filters_from_proto(float (*filter)[8][2], const float *proto, int bands)
+static av_cold void make_filters_from_proto(float (*filter)[8][2], const float *proto, int bands)
 {
     int q, n;
     for (q = 0; q < bands; q++) {
@@ -82,7 +82,7 @@ static void make_filters_from_proto(float (*filter)[8][2], const float *proto, i
     }
 }
 
-static void ps_tableinit(void)
+static av_cold void ps_tableinit(void)
 {
     static const float ipdopd_sin[] = { 0, M_SQRT1_2, 1,  M_SQRT1_2,  0, -M_SQRT1_2, -1, -M_SQRT1_2 };
     static const float ipdopd_cos[] = { 1, M_SQRT1_2, 0, -M_SQRT1_2, -1, -M_SQRT1_2,  0,  M_SQRT1_2 };
@@ -136,7 +136,7 @@ static void ps_tableinit(void)
                 float pd2_im = ipdopd_sin[pd2];
                 float re_smooth = 0.25f * pd0_re + 0.5f * pd1_re + pd2_re;
                 float im_smooth = 0.25f * pd0_im + 0.5f * pd1_im + pd2_im;
-                float pd_mag = 1 / sqrt(im_smooth * im_smooth + re_smooth * re_smooth);
+                float pd_mag = 1 / hypot(im_smooth, re_smooth);
                 pd_re_smooth[pd0*64+pd1*8+pd2] = re_smooth * pd_mag;
                 pd_im_smooth[pd0*64+pd1*8+pd2] = im_smooth * pd_mag;
             }
@@ -214,4 +214,4 @@ static void ps_tableinit(void)
 }
 #endif /* CONFIG_HARDCODED_TABLES */
 
-#endif /* AACPS_TABLEGEN_H */
+#endif /* AVCODEC_AACPS_TABLEGEN_H */
diff --git a/libavcodec/aacps_tablegen_template.c b/libavcodec/aacps_tablegen_template.c
new file mode 100644
index 0000000..341bd44
--- /dev/null
+++ b/libavcodec/aacps_tablegen_template.c
@@ -0,0 +1,107 @@
+/*
+ * Generate a header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "aac_defines.h"
+
+#if USE_FIXED
+#define TYPE_NAME "int32_t"
+typedef int32_t INT32FLOAT;
+#define ARRAY_RENAME(x) write_int32_t_ ## x
+#define ARRAY_URENAME(x) write_uint32_t_ ## x
+#include "aacps_fixed_tablegen.h"
+#else
+#define TYPE_NAME "float"
+typedef float INT32FLOAT;
+#define ARRAY_RENAME(x) write_float_ ## x
+#define ARRAY_URENAME(x) write_float_ ## x
+#include "aacps_tablegen.h"
+#endif /* USE_FIXED */
+#include "tableprint.h"
+
+void ARRAY_RENAME(3d_array) (const void *p, int b, int c, int d)
+{
+    int i;
+    const INT32FLOAT *f = p;
+    for (i = 0; i < b; i++) {
+        printf("{\n");
+        ARRAY_URENAME(2d_array)(f, c, d);
+        printf("},\n");
+        f += c * d;
+    }
+}
+
+void ARRAY_RENAME(4d_array) (const void *p, int a, int b, int c, int d)
+{
+    int i;
+    const INT32FLOAT *f = p;
+    for (i = 0; i < a; i++) {
+        printf("{\n");
+        ARRAY_RENAME(3d_array)(f, b, c, d);
+        printf("},\n");
+        f += b * c * d;
+    }
+}
+
+int main(void)
+{
+    ps_tableinit();
+
+    write_fileheader();
+
+    printf("static const %s pd_re_smooth[8*8*8] = {\n", TYPE_NAME);
+    ARRAY_RENAME(array)(pd_re_smooth, 8*8*8);
+    printf("};\n");
+    printf("static const %s pd_im_smooth[8*8*8] = {\n", TYPE_NAME);
+    ARRAY_RENAME(array)(pd_im_smooth, 8*8*8);
+    printf("};\n");
+
+    printf("static const %s HA[46][8][4] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(HA, 46, 8, 4);
+    printf("};\n");
+    printf("static const %s HB[46][8][4] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(HB, 46, 8, 4);
+    printf("};\n");
+
+    printf("static const DECLARE_ALIGNED(16, %s, f20_0_8)[8][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f20_0_8, 8, 8, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, f34_0_12)[12][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f34_0_12, 12, 8, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, f34_1_8)[8][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f34_1_8, 8, 8, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, f34_2_4)[4][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f34_2_4, 4, 8, 2);
+    printf("};\n");
+
+    printf("static const DECLARE_ALIGNED(16, %s, Q_fract_allpass)[2][50][3][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(4d_array)(Q_fract_allpass, 2, 50, 3, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, phi_fract)[2][50][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(phi_fract, 2, 50, 2);
+    printf("};\n");
+
+    return 0;
+}
diff --git a/libavcodec/aacpsdata.c b/libavcodec/aacpsdata.c
index 675bd8e..5c1a1b0 100644
--- a/libavcodec/aacpsdata.c
+++ b/libavcodec/aacpsdata.c
@@ -2,20 +2,20 @@
  * MPEG-4 Parametric Stereo data tables
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -157,7 +157,7 @@ static const int8_t k_to_i_34[] = {
     33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33
 };
 
-static const float g1_Q2[] = {
-    0.0f,  0.01899487526049f, 0.0f, -0.07293139167538f,
-    0.0f,  0.30596630545168f, 0.5f
+static const INTFLOAT g1_Q2[] = {
+    Q31(0.0f),  Q31(0.01899487526049f), Q31(0.0f), Q31(-0.07293139167538f),
+    Q31(0.0f),  Q31(0.30596630545168f), Q31(0.5f)
 };
diff --git a/libavcodec/aacpsdsp.c b/libavcodec/aacpsdsp.c
deleted file mode 100644
index 88e731f..0000000
--- a/libavcodec/aacpsdsp.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "aacpsdsp.h"
-
-static void ps_add_squares_c(float *dst, const float (*src)[2], int n)
-{
-    int i;
-    for (i = 0; i < n; i++)
-        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
-}
-
-static void ps_mul_pair_single_c(float (*dst)[2], float (*src0)[2], float *src1,
-                                 int n)
-{
-    int i;
-    for (i = 0; i < n; i++) {
-        dst[i][0] = src0[i][0] * src1[i];
-        dst[i][1] = src0[i][1] * src1[i];
-    }
-}
-
-static void ps_hybrid_analysis_c(float (*out)[2], float (*in)[2],
-                                 const float (*filter)[8][2],
-                                 int stride, int n)
-{
-    int i, j;
-
-    for (i = 0; i < n; i++) {
-        float sum_re = filter[i][6][0] * in[6][0];
-        float sum_im = filter[i][6][0] * in[6][1];
-
-        for (j = 0; j < 6; j++) {
-            float in0_re = in[j][0];
-            float in0_im = in[j][1];
-            float in1_re = in[12-j][0];
-            float in1_im = in[12-j][1];
-            sum_re += filter[i][j][0] * (in0_re + in1_re) -
-                      filter[i][j][1] * (in0_im - in1_im);
-            sum_im += filter[i][j][0] * (in0_im + in1_im) +
-                      filter[i][j][1] * (in0_re - in1_re);
-        }
-        out[i * stride][0] = sum_re;
-        out[i * stride][1] = sum_im;
-    }
-}
-
-static void ps_hybrid_analysis_ileave_c(float (*out)[32][2], float L[2][38][64],
-                                        int i, int len)
-{
-    int j;
-
-    for (; i < 64; i++) {
-        for (j = 0; j < len; j++) {
-            out[i][j][0] = L[0][j][i];
-            out[i][j][1] = L[1][j][i];
-        }
-    }
-}
-
-static void ps_hybrid_synthesis_deint_c(float out[2][38][64],
-                                        float (*in)[32][2],
-                                        int i, int len)
-{
-    int n;
-
-    for (; i < 64; i++) {
-        for (n = 0; n < len; n++) {
-            out[0][n][i] = in[i][n][0];
-            out[1][n][i] = in[i][n][1];
-        }
-    }
-}
-
-static void ps_decorrelate_c(float (*out)[2], float (*delay)[2],
-                             float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
-                             const float phi_fract[2], const float (*Q_fract)[2],
-                             const float *transient_gain,
-                             float g_decay_slope,
-                             int len)
-{
-    static const float a[] = { 0.65143905753106f,
-                               0.56471812200776f,
-                               0.48954165955695f };
-    float ag[PS_AP_LINKS];
-    int m, n;
-
-    for (m = 0; m < PS_AP_LINKS; m++)
-        ag[m] = a[m] * g_decay_slope;
-
-    for (n = 0; n < len; n++) {
-        float in_re = delay[n][0] * phi_fract[0] - delay[n][1] * phi_fract[1];
-        float in_im = delay[n][0] * phi_fract[1] + delay[n][1] * phi_fract[0];
-        for (m = 0; m < PS_AP_LINKS; m++) {
-            float a_re                = ag[m] * in_re;
-            float a_im                = ag[m] * in_im;
-            float link_delay_re       = ap_delay[m][n+2-m][0];
-            float link_delay_im       = ap_delay[m][n+2-m][1];
-            float fractional_delay_re = Q_fract[m][0];
-            float fractional_delay_im = Q_fract[m][1];
-            float apd_re = in_re;
-            float apd_im = in_im;
-            in_re = link_delay_re * fractional_delay_re -
-                    link_delay_im * fractional_delay_im - a_re;
-            in_im = link_delay_re * fractional_delay_im +
-                    link_delay_im * fractional_delay_re - a_im;
-            ap_delay[m][n+5][0] = apd_re + ag[m] * in_re;
-            ap_delay[m][n+5][1] = apd_im + ag[m] * in_im;
-        }
-        out[n][0] = transient_gain[n] * in_re;
-        out[n][1] = transient_gain[n] * in_im;
-    }
-}
-
-static void ps_stereo_interpolate_c(float (*l)[2], float (*r)[2],
-                                    float h[2][4], float h_step[2][4],
-                                    int len)
-{
-    float h0 = h[0][0];
-    float h1 = h[0][1];
-    float h2 = h[0][2];
-    float h3 = h[0][3];
-    float hs0 = h_step[0][0];
-    float hs1 = h_step[0][1];
-    float hs2 = h_step[0][2];
-    float hs3 = h_step[0][3];
-    int n;
-
-    for (n = 0; n < len; n++) {
-        //l is s, r is d
-        float l_re = l[n][0];
-        float l_im = l[n][1];
-        float r_re = r[n][0];
-        float r_im = r[n][1];
-        h0 += hs0;
-        h1 += hs1;
-        h2 += hs2;
-        h3 += hs3;
-        l[n][0] = h0 * l_re + h2 * r_re;
-        l[n][1] = h0 * l_im + h2 * r_im;
-        r[n][0] = h1 * l_re + h3 * r_re;
-        r[n][1] = h1 * l_im + h3 * r_im;
-    }
-}
-
-static void ps_stereo_interpolate_ipdopd_c(float (*l)[2], float (*r)[2],
-                                           float h[2][4], float h_step[2][4],
-                                           int len)
-{
-    float h00  = h[0][0],      h10  = h[1][0];
-    float h01  = h[0][1],      h11  = h[1][1];
-    float h02  = h[0][2],      h12  = h[1][2];
-    float h03  = h[0][3],      h13  = h[1][3];
-    float hs00 = h_step[0][0], hs10 = h_step[1][0];
-    float hs01 = h_step[0][1], hs11 = h_step[1][1];
-    float hs02 = h_step[0][2], hs12 = h_step[1][2];
-    float hs03 = h_step[0][3], hs13 = h_step[1][3];
-    int n;
-
-    for (n = 0; n < len; n++) {
-        //l is s, r is d
-        float l_re = l[n][0];
-        float l_im = l[n][1];
-        float r_re = r[n][0];
-        float r_im = r[n][1];
-        h00 += hs00;
-        h01 += hs01;
-        h02 += hs02;
-        h03 += hs03;
-        h10 += hs10;
-        h11 += hs11;
-        h12 += hs12;
-        h13 += hs13;
-
-        l[n][0] = h00 * l_re + h02 * r_re - h10 * l_im - h12 * r_im;
-        l[n][1] = h00 * l_im + h02 * r_im + h10 * l_re + h12 * r_re;
-        r[n][0] = h01 * l_re + h03 * r_re - h11 * l_im - h13 * r_im;
-        r[n][1] = h01 * l_im + h03 * r_im + h11 * l_re + h13 * r_re;
-    }
-}
-
-av_cold void ff_psdsp_init(PSDSPContext *s)
-{
-    s->add_squares            = ps_add_squares_c;
-    s->mul_pair_single        = ps_mul_pair_single_c;
-    s->hybrid_analysis        = ps_hybrid_analysis_c;
-    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_c;
-    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_c;
-    s->decorrelate            = ps_decorrelate_c;
-    s->stereo_interpolate[0]  = ps_stereo_interpolate_c;
-    s->stereo_interpolate[1]  = ps_stereo_interpolate_ipdopd_c;
-
-    if (ARCH_ARM)
-        ff_psdsp_init_arm(s);
-}
diff --git a/libavcodec/aacpsdsp.h b/libavcodec/aacpsdsp.h
index dc380b1..917ac53 100644
--- a/libavcodec/aacpsdsp.h
+++ b/libavcodec/aacpsdsp.h
@@ -1,53 +1,60 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef LIBAVCODEC_AACPSDSP_H
-#define LIBAVCODEC_AACPSDSP_H
+#ifndef AVCODEC_AACPSDSP_H
+#define AVCODEC_AACPSDSP_H
+
+#include <stddef.h>
+
+#include "aac_defines.h"
 
 #define PS_QMF_TIME_SLOTS 32
 #define PS_AP_LINKS 3
 #define PS_MAX_AP_DELAY 5
 
 typedef struct PSDSPContext {
-    void (*add_squares)(float *dst, const float (*src)[2], int n);
-    void (*mul_pair_single)(float (*dst)[2], float (*src0)[2], float *src1,
+    void (*add_squares)(INTFLOAT *dst, const INTFLOAT (*src)[2], int n);
+    void (*mul_pair_single)(INTFLOAT (*dst)[2], INTFLOAT (*src0)[2], INTFLOAT *src1,
                             int n);
-    void (*hybrid_analysis)(float (*out)[2], float (*in)[2],
-                            const float (*filter)[8][2],
-                            int stride, int n);
-    void (*hybrid_analysis_ileave)(float (*out)[32][2], float L[2][38][64],
+    void (*hybrid_analysis)(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
+                            const INTFLOAT (*filter)[8][2],
+                            ptrdiff_t stride, int n);
+    void (*hybrid_analysis_ileave)(INTFLOAT (*out)[32][2], INTFLOAT L[2][38][64],
                                    int i, int len);
-    void (*hybrid_synthesis_deint)(float out[2][38][64], float (*in)[32][2],
+    void (*hybrid_synthesis_deint)(INTFLOAT out[2][38][64], INTFLOAT (*in)[32][2],
                                    int i, int len);
-    void (*decorrelate)(float (*out)[2], float (*delay)[2],
-                        float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
-                        const float phi_fract[2], const float (*Q_fract)[2],
-                        const float *transient_gain,
-                        float g_decay_slope,
+    void (*decorrelate)(INTFLOAT (*out)[2], INTFLOAT (*delay)[2],
+                        INTFLOAT (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
+                        const INTFLOAT phi_fract[2], const INTFLOAT (*Q_fract)[2],
+                        const INTFLOAT *transient_gain,
+                        INTFLOAT g_decay_slope,
                         int len);
-    void (*stereo_interpolate[2])(float (*l)[2], float (*r)[2],
-                                  float h[2][4], float h_step[2][4],
+    void (*stereo_interpolate[2])(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                  INTFLOAT h[2][4], INTFLOAT h_step[2][4],
                                   int len);
 } PSDSPContext;
 
-void ff_psdsp_init(PSDSPContext *s);
+void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s);
 void ff_psdsp_init_arm(PSDSPContext *s);
+void ff_psdsp_init_aarch64(PSDSPContext *s);
+void ff_psdsp_init_mips(PSDSPContext *s);
+void ff_psdsp_init_x86(PSDSPContext *s);
 
-#endif /* LIBAVCODEC_AACPSDSP_H */
+#endif /* AVCODEC_AACPSDSP_H */
diff --git a/libavcodec/aacpsdsp_fixed.c b/libavcodec/aacpsdsp_fixed.c
new file mode 100644
index 0000000..2413295
--- /dev/null
+++ b/libavcodec/aacpsdsp_fixed.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+
+#include "aacpsdsp_template.c"
diff --git a/libavcodec/aacpsdsp_float.c b/libavcodec/aacpsdsp_float.c
new file mode 100644
index 0000000..99aa650
--- /dev/null
+++ b/libavcodec/aacpsdsp_float.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 0
+
+#include "aacpsdsp_template.c"
diff --git a/libavcodec/aacpsdsp_template.c b/libavcodec/aacpsdsp_template.c
new file mode 100644
index 0000000..5f4be01
--- /dev/null
+++ b/libavcodec/aacpsdsp_template.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "aacpsdsp.h"
+
+static void ps_add_squares_c(INTFLOAT *dst, const INTFLOAT (*src)[2], int n)
+{
+    int i;
+    for (i = 0; i < n; i++)
+        dst[i] += (UINTFLOAT)AAC_MADD28(src[i][0], src[i][0], src[i][1], src[i][1]);
+}
+
+static void ps_mul_pair_single_c(INTFLOAT (*dst)[2], INTFLOAT (*src0)[2], INTFLOAT *src1,
+                                 int n)
+{
+    int i;
+    for (i = 0; i < n; i++) {
+        dst[i][0] = AAC_MUL16(src0[i][0], src1[i]);
+        dst[i][1] = AAC_MUL16(src0[i][1], src1[i]);
+    }
+}
+
+static void ps_hybrid_analysis_c(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
+                                 const INTFLOAT (*filter)[8][2],
+                                 ptrdiff_t stride, int n)
+{
+    int i, j;
+
+    for (i = 0; i < n; i++) {
+        INT64FLOAT sum_re = (INT64FLOAT)filter[i][6][0] * in[6][0];
+        INT64FLOAT sum_im = (INT64FLOAT)filter[i][6][0] * in[6][1];
+
+        for (j = 0; j < 6; j++) {
+            INTFLOAT in0_re = in[j][0];
+            INTFLOAT in0_im = in[j][1];
+            INTFLOAT in1_re = in[12-j][0];
+            INTFLOAT in1_im = in[12-j][1];
+            sum_re += (INT64FLOAT)filter[i][j][0] * (in0_re + in1_re) -
+                      (INT64FLOAT)filter[i][j][1] * (in0_im - in1_im);
+            sum_im += (INT64FLOAT)filter[i][j][0] * (in0_im + in1_im) +
+                      (INT64FLOAT)filter[i][j][1] * (in0_re - in1_re);
+        }
+#if USE_FIXED
+        out[i * stride][0] = (int)((sum_re + 0x40000000) >> 31);
+        out[i * stride][1] = (int)((sum_im + 0x40000000) >> 31);
+#else
+        out[i * stride][0] = sum_re;
+        out[i * stride][1] = sum_im;
+#endif /* USE_FIXED */
+    }
+}
+
+static void ps_hybrid_analysis_ileave_c(INTFLOAT (*out)[32][2], INTFLOAT L[2][38][64],
+                                      int i, int len)
+{
+    int j;
+
+    for (; i < 64; i++) {
+        for (j = 0; j < len; j++) {
+            out[i][j][0] = L[0][j][i];
+            out[i][j][1] = L[1][j][i];
+        }
+    }
+}
+
+static void ps_hybrid_synthesis_deint_c(INTFLOAT out[2][38][64],
+                                      INTFLOAT (*in)[32][2],
+                                      int i, int len)
+{
+    int n;
+
+    for (; i < 64; i++) {
+        for (n = 0; n < len; n++) {
+            out[0][n][i] = in[i][n][0];
+            out[1][n][i] = in[i][n][1];
+        }
+    }
+}
+
+static void ps_decorrelate_c(INTFLOAT (*out)[2], INTFLOAT (*delay)[2],
+                             INTFLOAT (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
+                             const INTFLOAT phi_fract[2], const INTFLOAT (*Q_fract)[2],
+                             const INTFLOAT *transient_gain,
+                             INTFLOAT g_decay_slope,
+                             int len)
+{
+    static const INTFLOAT a[] = { Q31(0.65143905753106f),
+                               Q31(0.56471812200776f),
+                               Q31(0.48954165955695f) };
+    INTFLOAT ag[PS_AP_LINKS];
+    int m, n;
+
+    for (m = 0; m < PS_AP_LINKS; m++)
+        ag[m] = AAC_MUL30(a[m], g_decay_slope);
+
+    for (n = 0; n < len; n++) {
+        INTFLOAT in_re = AAC_MSUB30(delay[n][0], phi_fract[0], delay[n][1], phi_fract[1]);
+        INTFLOAT in_im = AAC_MADD30(delay[n][0], phi_fract[1], delay[n][1], phi_fract[0]);
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            INTFLOAT a_re                = AAC_MUL31(ag[m], in_re);
+            INTFLOAT a_im                = AAC_MUL31(ag[m], in_im);
+            INTFLOAT link_delay_re       = ap_delay[m][n+2-m][0];
+            INTFLOAT link_delay_im       = ap_delay[m][n+2-m][1];
+            INTFLOAT fractional_delay_re = Q_fract[m][0];
+            INTFLOAT fractional_delay_im = Q_fract[m][1];
+            INTFLOAT apd_re = in_re;
+            INTFLOAT apd_im = in_im;
+            in_re = AAC_MSUB30(link_delay_re, fractional_delay_re,
+                    link_delay_im, fractional_delay_im);
+            in_re -= (UINTFLOAT)a_re;
+            in_im = AAC_MADD30(link_delay_re, fractional_delay_im,
+                    link_delay_im, fractional_delay_re);
+            in_im -= (UINTFLOAT)a_im;
+            ap_delay[m][n+5][0] = apd_re + (UINTFLOAT)AAC_MUL31(ag[m], in_re);
+            ap_delay[m][n+5][1] = apd_im + (UINTFLOAT)AAC_MUL31(ag[m], in_im);
+        }
+        out[n][0] = AAC_MUL16(transient_gain[n], in_re);
+        out[n][1] = AAC_MUL16(transient_gain[n], in_im);
+    }
+}
+
+static void ps_stereo_interpolate_c(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                    INTFLOAT h[2][4], INTFLOAT h_step[2][4],
+                                    int len)
+{
+    INTFLOAT h0 = h[0][0];
+    INTFLOAT h1 = h[0][1];
+    INTFLOAT h2 = h[0][2];
+    INTFLOAT h3 = h[0][3];
+    UINTFLOAT hs0 = h_step[0][0];
+    UINTFLOAT hs1 = h_step[0][1];
+    UINTFLOAT hs2 = h_step[0][2];
+    UINTFLOAT hs3 = h_step[0][3];
+    int n;
+
+    for (n = 0; n < len; n++) {
+        //l is s, r is d
+        INTFLOAT l_re = l[n][0];
+        INTFLOAT l_im = l[n][1];
+        INTFLOAT r_re = r[n][0];
+        INTFLOAT r_im = r[n][1];
+        h0 += hs0;
+        h1 += hs1;
+        h2 += hs2;
+        h3 += hs3;
+        l[n][0] = AAC_MADD30(h0, l_re, h2, r_re);
+        l[n][1] = AAC_MADD30(h0, l_im, h2, r_im);
+        r[n][0] = AAC_MADD30(h1, l_re, h3, r_re);
+        r[n][1] = AAC_MADD30(h1, l_im, h3, r_im);
+    }
+}
+
+static void ps_stereo_interpolate_ipdopd_c(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                           INTFLOAT h[2][4], INTFLOAT h_step[2][4],
+                                           int len)
+{
+    INTFLOAT h00  = h[0][0],      h10  = h[1][0];
+    INTFLOAT h01  = h[0][1],      h11  = h[1][1];
+    INTFLOAT h02  = h[0][2],      h12  = h[1][2];
+    INTFLOAT h03  = h[0][3],      h13  = h[1][3];
+    UINTFLOAT hs00 = h_step[0][0], hs10 = h_step[1][0];
+    UINTFLOAT hs01 = h_step[0][1], hs11 = h_step[1][1];
+    UINTFLOAT hs02 = h_step[0][2], hs12 = h_step[1][2];
+    UINTFLOAT hs03 = h_step[0][3], hs13 = h_step[1][3];
+    int n;
+
+    for (n = 0; n < len; n++) {
+        //l is s, r is d
+        INTFLOAT l_re = l[n][0];
+        INTFLOAT l_im = l[n][1];
+        INTFLOAT r_re = r[n][0];
+        INTFLOAT r_im = r[n][1];
+        h00 += hs00;
+        h01 += hs01;
+        h02 += hs02;
+        h03 += hs03;
+        h10 += hs10;
+        h11 += hs11;
+        h12 += hs12;
+        h13 += hs13;
+
+        l[n][0] = AAC_MSUB30_V8(h00, l_re, h02, r_re, h10, l_im, h12, r_im);
+        l[n][1] = AAC_MADD30_V8(h00, l_im, h02, r_im, h10, l_re, h12, r_re);
+        r[n][0] = AAC_MSUB30_V8(h01, l_re, h03, r_re, h11, l_im, h13, r_im);
+        r[n][1] = AAC_MADD30_V8(h01, l_im, h03, r_im, h11, l_re, h13, r_re);
+    }
+}
+
+av_cold void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s)
+{
+    s->add_squares            = ps_add_squares_c;
+    s->mul_pair_single        = ps_mul_pair_single_c;
+    s->hybrid_analysis        = ps_hybrid_analysis_c;
+    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_c;
+    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_c;
+    s->decorrelate            = ps_decorrelate_c;
+    s->stereo_interpolate[0]  = ps_stereo_interpolate_c;
+    s->stereo_interpolate[1]  = ps_stereo_interpolate_ipdopd_c;
+
+#if !USE_FIXED
+    if (ARCH_ARM)
+        ff_psdsp_init_arm(s);
+    if (ARCH_AARCH64)
+        ff_psdsp_init_aarch64(s);
+    if (ARCH_MIPS)
+        ff_psdsp_init_mips(s);
+    if (ARCH_X86)
+        ff_psdsp_init_x86(s);
+#endif /* !USE_FIXED */
+}
diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
index 272be9f..fca692c 100644
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@@ -2,20 +2,20 @@
  * AAC encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
+
 #include "avcodec.h"
 #include "aactab.h"
 #include "psymodel.h"
@@ -78,6 +80,8 @@
 #define PSY_3GPP_AH_THR_LONG    0.5f
 #define PSY_3GPP_AH_THR_SHORT   0.63f
 
+#define PSY_PE_FORGET_SLOPE  511
+
 enum {
     PSY_3GPP_AH_NONE,
     PSY_3GPP_AH_INACTIVE,
@@ -85,6 +89,7 @@ enum {
 };
 
 #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
+#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
 
 /* LAME psy model constants */
 #define PSY_LAME_FIR_LEN 21         ///< LAME psy model FIR order
@@ -155,6 +160,7 @@ typedef struct AacPsyContext{
     } pe;
     AacPsyCoeffs psy_coef[2][64];
     AacPsyChannel *ch;
+    float global_quality; ///< normalized global quality taken from avctx
 }AacPsyContext;
 
 /**
@@ -216,6 +222,10 @@ static const float psy_fir_coeffs[] = {
     -5.52212e-17 * 2, -0.313819 * 2
 };
 
+#if ARCH_MIPS
+#   include "mips/aacpsy_mips.h"
+#endif /* ARCH_MIPS */
+
 /**
  * Calculate the ABR attack threshold from the above LAME psymodel table.
  */
@@ -293,17 +303,24 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
     float bark;
     int i, j, g, start;
     float prev, minscale, minath, minsnr, pe_min;
-    const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
-    const int bandwidth    = ctx->avctx->cutoff ? ctx->avctx->cutoff : ctx->avctx->sample_rate / 2;
+    int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & AV_CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
+
+    const int bandwidth    = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
     const float num_bark   = calc_bark((float)bandwidth);
 
     ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
     if (!ctx->model_priv_data)
         return AVERROR(ENOMEM);
     pctx = ctx->model_priv_data;
+    pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
+
+    if (ctx->avctx->flags & AV_CODEC_FLAG_QSCALE) {
+        /* Use the target average bitrate to compute spread parameters */
+        chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
+    }
 
     pctx->chan_bitrate = chan_bitrate;
-    pctx->frame_bits   = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
+    pctx->frame_bits   = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
     pctx->pe.min       =  8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
     pctx->pe.max       = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
     ctx->bitres.size   = 6144 - pctx->frame_bits;
@@ -332,12 +349,12 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
         for (g = 0; g < ctx->num_bands[j] - 1; g++) {
             AacPsyCoeffs *coeff = &coeffs[g];
             float bark_width = coeffs[g+1].barks - coeffs->barks;
-            coeff->spread_low[0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_LOW);
-            coeff->spread_hi [0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_HI);
-            coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
-            coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
+            coeff->spread_low[0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_LOW);
+            coeff->spread_hi [0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_HI);
+            coeff->spread_low[1] = ff_exp10(-bark_width * en_spread_low);
+            coeff->spread_hi [1] = ff_exp10(-bark_width * en_spread_hi);
             pe_min = bark_pe * bark_width;
-            minsnr = pow(2.0f, pe_min / band_sizes[g]) - 1.5f;
+            minsnr = exp2(pe_min / band_sizes[g]) - 1.5f;
             coeff->min_snr = av_clipf(1.0f / minsnr, PSY_SNR_25DB, PSY_SNR_1DB);
         }
         start = 0;
@@ -350,9 +367,9 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
         }
     }
 
-    pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels);
+    pctx->ch = av_mallocz_array(ctx->avctx->channels, sizeof(AacPsyChannel));
     if (!pctx->ch) {
-        av_freep(&pctx);
+        av_freep(&ctx->model_priv_data);
         return AVERROR(ENOMEM);
     }
 
@@ -391,7 +408,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
                                                  int channel, int prev_type)
 {
     int i, j;
-    int br               = ctx->avctx->bit_rate / ctx->avctx->channels;
+    int br               = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
     int attack_ratio     = br <= 16000 ? 18 : 10;
     AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
     AacPsyChannel *pch  = &pctx->ch[channel];
@@ -480,7 +497,7 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
     const float bitspend_add   = short_window ? PSY_3GPP_SPEND_ADD_S   : PSY_3GPP_SPEND_ADD_L;
     const float clip_low       = short_window ? PSY_3GPP_CLIP_LO_S     : PSY_3GPP_CLIP_LO_L;
     const float clip_high      = short_window ? PSY_3GPP_CLIP_HI_S     : PSY_3GPP_CLIP_HI_L;
-    float clipped_pe, bit_save, bit_spend, bit_factor, fill_level;
+    float clipped_pe, bit_save, bit_spend, bit_factor, fill_level, forgetful_min_pe;
 
     ctx->fill_level += ctx->frame_bits - bits;
     ctx->fill_level  = av_clip(ctx->fill_level, 0, size);
@@ -497,11 +514,21 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
      * Hopefully below is correct.
      */
     bit_factor = 1.0f - bit_save + ((bit_spend - bit_save) / (ctx->pe.max - ctx->pe.min)) * (clipped_pe - ctx->pe.min);
-    /* NOTE: The reference encoder attempts to center pe max/min around the current pe. */
+    /* NOTE: The reference encoder attempts to center pe max/min around the current pe.
+     * Here we do that by slowly forgetting pe.min when pe stays in a range that makes
+     * it unlikely (ie: above the mean)
+     */
     ctx->pe.max = FFMAX(pe, ctx->pe.max);
-    ctx->pe.min = FFMIN(pe, ctx->pe.min);
+    forgetful_min_pe = ((ctx->pe.min * PSY_PE_FORGET_SLOPE)
+        + FFMAX(ctx->pe.min, pe * (pe / ctx->pe.max))) / (PSY_PE_FORGET_SLOPE + 1);
+    ctx->pe.min = FFMIN(pe, forgetful_min_pe);
 
-    return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
+    /* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
+     *   reservoir starvation from producing zero-bit frames
+     */
+    return FFMIN(
+        ctx->frame_bits * bit_factor,
+        FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
 }
 
 static float calc_pe_3gpp(AacPsyBand *band)
@@ -532,8 +559,11 @@ static float calc_reduction_3gpp(float a, float desired_pe, float pe,
 {
     float thr_avg, reduction;
 
-    thr_avg   = powf(2.0f, (a - pe) / (4.0f * active_lines));
-    reduction = powf(2.0f, (a - desired_pe) / (4.0f * active_lines)) - thr_avg;
+    if(active_lines == 0.0)
+        return 0;
+
+    thr_avg   = exp2f((a - pe) / (4.0f * active_lines));
+    reduction = exp2f((a - desired_pe) / (4.0f * active_lines)) - thr_avg;
 
     return FFMAX(reduction, 0.0f);
 }
@@ -544,8 +574,10 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
     float thr = band->thr;
 
     if (band->energy > thr) {
-        thr = powf(thr, 0.25f) + reduction;
-        thr = powf(thr, 4.0f);
+        thr = sqrtf(thr);
+        thr = sqrtf(thr) + reduction;
+        thr *= thr;
+        thr *= thr;
 
         /* This deviates from the 3GPP spec to match the reference encoder.
          * It performs min(thr_reduced, max(thr, energy/min_snr)) only for bands
@@ -561,6 +593,56 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
     return thr;
 }
 
+#ifndef calc_thr_3gpp
+static void calc_thr_3gpp(const FFPsyWindowInfo *wi, const int num_bands, AacPsyChannel *pch,
+                          const uint8_t *band_sizes, const float *coefs, const int cutoff)
+{
+    int i, w, g;
+    int start = 0, wstart = 0;
+    for (w = 0; w < wi->num_windows*16; w += 16) {
+        wstart = 0;
+        for (g = 0; g < num_bands; g++) {
+            AacPsyBand *band = &pch->band[w+g];
+
+            float form_factor = 0.0f;
+            float Temp;
+            band->energy = 0.0f;
+            if (wstart < cutoff) {
+                for (i = 0; i < band_sizes[g]; i++) {
+                    band->energy += coefs[start+i] * coefs[start+i];
+                    form_factor  += sqrtf(fabs(coefs[start+i]));
+                }
+            }
+            Temp = band->energy > 0 ? sqrtf((float)band_sizes[g] / band->energy) : 0;
+            band->thr      = band->energy * 0.001258925f;
+            band->nz_lines = form_factor * sqrtf(Temp);
+
+            start += band_sizes[g];
+            wstart += band_sizes[g];
+        }
+    }
+}
+#endif /* calc_thr_3gpp */
+
+#ifndef psy_hp_filter
+static void psy_hp_filter(const float *firbuf, float *hpfsmpl, const float *psy_fir_coeffs)
+{
+    int i, j;
+    for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
+        float sum1, sum2;
+        sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2];
+        sum2 = 0.0;
+        for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
+            sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]);
+            sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
+        }
+        /* NOTE: The LAME psymodel expects it's input in the range -32768 to 32768.
+         *       Tuning this for normalized floats would be difficult. */
+        hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
+    }
+}
+#endif /* psy_hp_filter */
+
 /**
  * Calculate band thresholds as suggested in 3GPP TS26.403
  */
@@ -569,33 +651,20 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 {
     AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
     AacPsyChannel *pch  = &pctx->ch[channel];
-    int start = 0;
     int i, w, g;
-    float desired_bits, desired_pe, delta_pe, reduction, spread_en[128] = {0};
+    float desired_bits, desired_pe, delta_pe, reduction= NAN, spread_en[128] = {0};
     float a = 0.0f, active_lines = 0.0f, norm_fac = 0.0f;
     float pe = pctx->chan_bitrate > 32000 ? 0.0f : FFMAX(50.0f, 100.0f - pctx->chan_bitrate * 100.0f / 32000.0f);
     const int      num_bands   = ctx->num_bands[wi->num_windows == 8];
     const uint8_t *band_sizes  = ctx->bands[wi->num_windows == 8];
     AacPsyCoeffs  *coeffs      = pctx->psy_coef[wi->num_windows == 8];
     const float avoid_hole_thr = wi->num_windows == 8 ? PSY_3GPP_AH_THR_SHORT : PSY_3GPP_AH_THR_LONG;
+    const int bandwidth        = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
+    const int cutoff           = bandwidth * 2048 / wi->num_windows / ctx->avctx->sample_rate;
 
     //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
-    for (w = 0; w < wi->num_windows*16; w += 16) {
-        for (g = 0; g < num_bands; g++) {
-            AacPsyBand *band = &pch->band[w+g];
+    calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs, cutoff);
 
-            float form_factor = 0.0f;
-            band->energy = 0.0f;
-            for (i = 0; i < band_sizes[g]; i++) {
-                band->energy += coefs[start+i] * coefs[start+i];
-                form_factor  += sqrtf(fabs(coefs[start+i]));
-            }
-            band->thr      = band->energy * 0.001258925f;
-            band->nz_lines = form_factor / powf(band->energy / band_sizes[g], 0.25f);
-
-            start += band_sizes[g];
-        }
-    }
     //modify thresholds and energies - spread, threshold in quiet, pre-echo control
     for (w = 0; w < wi->num_windows*16; w += 16) {
         AacPsyBand *bands = &pch->band[w];
@@ -616,7 +685,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
             band->thr_quiet = band->thr = FFMAX(band->thr, coeffs[g].ath);
             //5.4.2.5 "Pre-echo control"
-            if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (wi->window_type[1] == LONG_START_SEQUENCE && !w)))
+            if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (!w && wi->window_type[1] == LONG_START_SEQUENCE)))
                 band->thr = FFMAX(PSY_3GPP_RPEMIN*band->thr, FFMIN(band->thr,
                                   PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
 
@@ -635,16 +704,36 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
     /* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
     ctx->ch[channel].entropy = pe;
-    desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
-    desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
-    /* NOTE: PE correction is kept simple. During initial testing it had very
-     *       little effect on the final bitrate. Probably a good idea to come
-     *       back and do more testing later.
-     */
-    if (ctx->bitres.bits > 0)
-        desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
-                               0.85f, 1.15f);
+    if (ctx->avctx->flags & AV_CODEC_FLAG_QSCALE) {
+        /* (2.5 * 120) achieves almost transparent rate, and we want to give
+         * ample room downwards, so we make that equivalent to QSCALE=2.4
+         */
+        desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
+        desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+        desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+
+        /* PE slope smoothing */
+        if (ctx->bitres.bits > 0) {
+            desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+            desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+        }
+
+        pctx->pe.max = FFMAX(pe, pctx->pe.max);
+        pctx->pe.min = FFMIN(pe, pctx->pe.min);
+    } else {
+        desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
+        desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
+
+        /* NOTE: PE correction is kept simple. During initial testing it had very
+         *       little effect on the final bitrate. Probably a good idea to come
+         *       back and do more testing later.
+         */
+        if (ctx->bitres.bits > 0)
+            desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
+                                   0.85f, 1.15f);
+    }
     pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
+    ctx->bitres.alloc = desired_bits;
 
     if (desired_pe < pe) {
         /* 5.6.1.3.4 "First Estimation of the reduction value" */
@@ -681,7 +770,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
             }
             desired_pe_no_ah = FFMAX(desired_pe - (pe - pe_no_ah), 0.0f);
             if (active_lines > 0.0f)
-                reduction += calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
+                reduction = calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
 
             pe = 0.0f;
             for (w = 0; w < wi->num_windows*16; w += 16) {
@@ -691,7 +780,10 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
                     if (active_lines > 0.0f)
                         band->thr = calc_reduced_thr_3gpp(band, coeffs[g].min_snr, reduction);
                     pe += calc_pe_3gpp(band);
-                    band->norm_fac = band->active_lines / band->thr;
+                    if (band->thr > 0.0f)
+                        band->norm_fac = band->active_lines / band->thr;
+                    else
+                        band->norm_fac = 0.0f;
                     norm_fac += band->norm_fac;
                 }
             }
@@ -711,7 +803,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
                         float delta_sfb_pe = band->norm_fac * norm_fac * delta_pe;
                         float thr = band->thr;
 
-                        thr *= powf(2.0f, delta_sfb_pe / band->active_lines);
+                        thr *= exp2f(delta_sfb_pe / band->active_lines);
                         if (thr > coeffs[g].min_snr * band->energy && band->avoid_holes == PSY_3GPP_AH_INACTIVE)
                             thr = FFMAX(band->thr, coeffs[g].min_snr * band->energy);
                         band->thr = thr;
@@ -742,6 +834,8 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
             psy_band->threshold = band->thr;
             psy_band->energy    = band->energy;
+            psy_band->spread    = band->active_lines * 2.0f / band_sizes[g];
+            psy_band->bits      = PSY_3GPP_PE_TO_BITS(band->pe);
         }
     }
 
@@ -801,21 +895,10 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
         float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS];
         float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
         const float *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN);
-        int j, att_sum = 0;
+        int att_sum = 0;
 
         /* LAME comment: apply high pass filter of fs/4 */
-        for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
-            float sum1, sum2;
-            sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2];
-            sum2 = 0.0;
-            for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
-                sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]);
-                sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
-            }
-            /* NOTE: The LAME psymodel expects its input in the range -32768 to
-             * 32768. Tuning this for normalized floats would be difficult. */
-            hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
-        }
+        psy_hp_filter(firbuf, hpfsmpl, psy_fir_coeffs);
 
         /* Calculate the energies of each sub-shortblock */
         for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) {
@@ -893,12 +976,14 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
 
     wi.window_type[1] = prev_type;
     if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
+
         wi.num_windows  = 1;
         wi.grouping[0]  = 1;
         if (wi.window_type[0] == LONG_START_SEQUENCE)
             wi.window_shape = 0;
         else
             wi.window_shape = 1;
+
     } else {
         int lastgrp = 0;
 
diff --git a/libavcodec/aacsbr.c b/libavcodec/aacsbr.c
index d9bbe5e..1d2a8d4 100644
--- a/libavcodec/aacsbr.c
+++ b/libavcodec/aacsbr.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,263 +25,31 @@
  * AAC Spectral Band Replication decoding functions
  * @author Robert Swain ( rob opendot cl )
  */
+#define USE_FIXED 0
 
 #include "aac.h"
 #include "sbr.h"
 #include "aacsbr.h"
 #include "aacsbrdata.h"
+#include "aacsbr_tablegen.h"
 #include "fft.h"
+#include "internal.h"
 #include "aacps.h"
 #include "sbrdsp.h"
 #include "libavutil/internal.h"
 #include "libavutil/libm.h"
+#include "libavutil/avassert.h"
 
 #include <stdint.h>
 #include <float.h>
+#include <math.h>
 
-#define ENVELOPE_ADJUSTMENT_OFFSET 2
-#define NOISE_FLOOR_OFFSET 6.0f
-
-/**
- * SBR VLC tables
- */
-enum {
-    T_HUFFMAN_ENV_1_5DB,
-    F_HUFFMAN_ENV_1_5DB,
-    T_HUFFMAN_ENV_BAL_1_5DB,
-    F_HUFFMAN_ENV_BAL_1_5DB,
-    T_HUFFMAN_ENV_3_0DB,
-    F_HUFFMAN_ENV_3_0DB,
-    T_HUFFMAN_ENV_BAL_3_0DB,
-    F_HUFFMAN_ENV_BAL_3_0DB,
-    T_HUFFMAN_NOISE_3_0DB,
-    T_HUFFMAN_NOISE_BAL_3_0DB,
-};
-
-/**
- * bs_frame_class - frame class of current SBR frame (14496-3 sp04 p98)
- */
-enum {
-    FIXFIX,
-    FIXVAR,
-    VARFIX,
-    VARVAR,
-};
-
-enum {
-    EXTENSION_ID_PS = 2,
-};
+#if ARCH_MIPS
+#include "mips/aacsbr_mips.h"
+#endif /* ARCH_MIPS */
 
 static VLC vlc_sbr[10];
-static const int8_t vlc_sbr_lav[10] =
-    { 60, 60, 24, 24, 31, 31, 12, 12, 31, 12 };
-
-#define SBR_INIT_VLC_STATIC(num, size) \
-    INIT_VLC_STATIC(&vlc_sbr[num], 9, sbr_tmp[num].table_size / sbr_tmp[num].elem_size,     \
-                    sbr_tmp[num].sbr_bits ,                      1,                      1, \
-                    sbr_tmp[num].sbr_codes, sbr_tmp[num].elem_size, sbr_tmp[num].elem_size, \
-                    size)
-
-#define SBR_VLC_ROW(name) \
-    { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
-
-av_cold void ff_aac_sbr_init(void)
-{
-    int n;
-    static const struct {
-        const void *sbr_codes, *sbr_bits;
-        const unsigned int table_size, elem_size;
-    } sbr_tmp[] = {
-        SBR_VLC_ROW(t_huffman_env_1_5dB),
-        SBR_VLC_ROW(f_huffman_env_1_5dB),
-        SBR_VLC_ROW(t_huffman_env_bal_1_5dB),
-        SBR_VLC_ROW(f_huffman_env_bal_1_5dB),
-        SBR_VLC_ROW(t_huffman_env_3_0dB),
-        SBR_VLC_ROW(f_huffman_env_3_0dB),
-        SBR_VLC_ROW(t_huffman_env_bal_3_0dB),
-        SBR_VLC_ROW(f_huffman_env_bal_3_0dB),
-        SBR_VLC_ROW(t_huffman_noise_3_0dB),
-        SBR_VLC_ROW(t_huffman_noise_bal_3_0dB),
-    };
-
-    // SBR VLC table initialization
-    SBR_INIT_VLC_STATIC(0, 1098);
-    SBR_INIT_VLC_STATIC(1, 1092);
-    SBR_INIT_VLC_STATIC(2, 768);
-    SBR_INIT_VLC_STATIC(3, 1026);
-    SBR_INIT_VLC_STATIC(4, 1058);
-    SBR_INIT_VLC_STATIC(5, 1052);
-    SBR_INIT_VLC_STATIC(6, 544);
-    SBR_INIT_VLC_STATIC(7, 544);
-    SBR_INIT_VLC_STATIC(8, 592);
-    SBR_INIT_VLC_STATIC(9, 512);
-
-    for (n = 1; n < 320; n++)
-        sbr_qmf_window_us[320 + n] = sbr_qmf_window_us[320 - n];
-    sbr_qmf_window_us[384] = -sbr_qmf_window_us[384];
-    sbr_qmf_window_us[512] = -sbr_qmf_window_us[512];
-
-    for (n = 0; n < 320; n++)
-        sbr_qmf_window_ds[n] = sbr_qmf_window_us[2*n];
-
-    ff_ps_init();
-}
-
-/** Places SBR in pure upsampling mode. */
-static void sbr_turnoff(SpectralBandReplication *sbr) {
-    sbr->start = 0;
-    // Init defaults used in pure upsampling mode
-    sbr->kx[1] = 32; //Typo in spec, kx' inits to 32
-    sbr->m[1] = 0;
-    // Reset values for first SBR header
-    sbr->data[0].e_a[1] = sbr->data[1].e_a[1] = -1;
-    memset(&sbr->spectrum_params, -1, sizeof(SpectrumParameters));
-}
-
-av_cold void ff_aac_sbr_ctx_init(AACContext *ac, SpectralBandReplication *sbr)
-{
-    sbr->kx[0] = sbr->kx[1];
-    sbr_turnoff(sbr);
-    sbr->data[0].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
-    sbr->data[1].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
-    /* SBR requires samples to be scaled to +/-32768.0 to work correctly.
-     * mdct scale factors are adjusted to scale up from +/-1.0 at analysis
-     * and scale back down at synthesis. */
-    ff_mdct_init(&sbr->mdct,     7, 1, 1.0 / (64 * 32768.0));
-    ff_mdct_init(&sbr->mdct_ana, 7, 1, -2.0 * 32768.0);
-    ff_ps_ctx_init(&sbr->ps);
-    ff_sbrdsp_init(&sbr->dsp);
-}
-
-av_cold void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr)
-{
-    ff_mdct_end(&sbr->mdct);
-    ff_mdct_end(&sbr->mdct_ana);
-}
-
-static int qsort_comparison_function_int16(const void *a, const void *b)
-{
-    return *(const int16_t *)a - *(const int16_t *)b;
-}
-
-static inline int in_table_int16(const int16_t *table, int last_el, int16_t needle)
-{
-    int i;
-    for (i = 0; i <= last_el; i++)
-        if (table[i] == needle)
-            return 1;
-    return 0;
-}
-
-/// Limiter Frequency Band Table (14496-3 sp04 p198)
-static void sbr_make_f_tablelim(SpectralBandReplication *sbr)
-{
-    int k;
-    if (sbr->bs_limiter_bands > 0) {
-        static const float bands_warped[3] = { 1.32715174233856803909f,   //2^(0.49/1.2)
-                                               1.18509277094158210129f,   //2^(0.49/2)
-                                               1.11987160404675912501f }; //2^(0.49/3)
-        const float lim_bands_per_octave_warped = bands_warped[sbr->bs_limiter_bands - 1];
-        int16_t patch_borders[7];
-        uint16_t *in = sbr->f_tablelim + 1, *out = sbr->f_tablelim;
-
-        patch_borders[0] = sbr->kx[1];
-        for (k = 1; k <= sbr->num_patches; k++)
-            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
-
-        memcpy(sbr->f_tablelim, sbr->f_tablelow,
-               (sbr->n[0] + 1) * sizeof(sbr->f_tablelow[0]));
-        if (sbr->num_patches > 1)
-            memcpy(sbr->f_tablelim + sbr->n[0] + 1, patch_borders + 1,
-                   (sbr->num_patches - 1) * sizeof(patch_borders[0]));
-
-        qsort(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
-              sizeof(sbr->f_tablelim[0]),
-              qsort_comparison_function_int16);
-
-        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1;
-        while (out < sbr->f_tablelim + sbr->n_lim) {
-            if (*in >= *out * lim_bands_per_octave_warped) {
-                *++out = *in++;
-            } else if (*in == *out ||
-                !in_table_int16(patch_borders, sbr->num_patches, *in)) {
-                in++;
-                sbr->n_lim--;
-            } else if (!in_table_int16(patch_borders, sbr->num_patches, *out)) {
-                *out = *in++;
-                sbr->n_lim--;
-            } else {
-                *++out = *in++;
-            }
-        }
-    } else {
-        sbr->f_tablelim[0] = sbr->f_tablelow[0];
-        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
-        sbr->n_lim = 1;
-    }
-}
-
-static unsigned int read_sbr_header(SpectralBandReplication *sbr, GetBitContext *gb)
-{
-    unsigned int cnt = get_bits_count(gb);
-    uint8_t bs_header_extra_1;
-    uint8_t bs_header_extra_2;
-    int old_bs_limiter_bands = sbr->bs_limiter_bands;
-    SpectrumParameters old_spectrum_params;
-
-    sbr->start = 1;
-
-    // Save last spectrum parameters variables to compare to new ones
-    memcpy(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters));
-
-    sbr->bs_amp_res_header              = get_bits1(gb);
-    sbr->spectrum_params.bs_start_freq  = get_bits(gb, 4);
-    sbr->spectrum_params.bs_stop_freq   = get_bits(gb, 4);
-    sbr->spectrum_params.bs_xover_band  = get_bits(gb, 3);
-                                          skip_bits(gb, 2); // bs_reserved
-
-    bs_header_extra_1 = get_bits1(gb);
-    bs_header_extra_2 = get_bits1(gb);
-
-    if (bs_header_extra_1) {
-        sbr->spectrum_params.bs_freq_scale  = get_bits(gb, 2);
-        sbr->spectrum_params.bs_alter_scale = get_bits1(gb);
-        sbr->spectrum_params.bs_noise_bands = get_bits(gb, 2);
-    } else {
-        sbr->spectrum_params.bs_freq_scale  = 2;
-        sbr->spectrum_params.bs_alter_scale = 1;
-        sbr->spectrum_params.bs_noise_bands = 2;
-    }
-
-    // Check if spectrum parameters changed
-    if (memcmp(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters)))
-        sbr->reset = 1;
-
-    if (bs_header_extra_2) {
-        sbr->bs_limiter_bands  = get_bits(gb, 2);
-        sbr->bs_limiter_gains  = get_bits(gb, 2);
-        sbr->bs_interpol_freq  = get_bits1(gb);
-        sbr->bs_smoothing_mode = get_bits1(gb);
-    } else {
-        sbr->bs_limiter_bands  = 2;
-        sbr->bs_limiter_gains  = 2;
-        sbr->bs_interpol_freq  = 1;
-        sbr->bs_smoothing_mode = 1;
-    }
-
-    if (sbr->bs_limiter_bands != old_bs_limiter_bands && !sbr->reset)
-        sbr_make_f_tablelim(sbr);
-
-    return get_bits_count(gb) - cnt;
-}
-
-static int array_min_int16(const int16_t *array, int nel)
-{
-    int i, min = array[0];
-    for (i = 1; i < nel; i++)
-        min = FFMIN(array[i], min);
-    return min;
-}
+static void aacsbr_func_ptr_init(AACSBRContext *c);
 
 static void make_bands(int16_t* bands, int start, int stop, int num_bands)
 {
@@ -301,923 +69,70 @@ static void make_bands(int16_t* bands, int start, int stop, int num_bands)
     bands[num_bands-1] = stop - previous;
 }
 
-static int check_n_master(AVCodecContext *avctx, int n_master, int bs_xover_band)
-{
-    // Requirements (14496-3 sp04 p205)
-    if (n_master <= 0) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid n_master: %d\n", n_master);
-        return -1;
-    }
-    if (bs_xover_band >= n_master) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid bitstream, crossover band index beyond array bounds: %d\n",
-               bs_xover_band);
-        return -1;
-    }
-    return 0;
-}
-
-/// Master Frequency Band Table (14496-3 sp04 p194)
-static int sbr_make_f_master(AACContext *ac, SpectralBandReplication *sbr,
-                             SpectrumParameters *spectrum)
-{
-    unsigned int temp, max_qmf_subbands = 0;
-    unsigned int start_min, stop_min;
-    int k;
-    const int8_t *sbr_offset_ptr;
-    int16_t stop_dk[13];
-
-    switch (sbr->sample_rate) {
-    case 16000:
-        sbr_offset_ptr = sbr_offset[0];
-        break;
-    case 22050:
-        sbr_offset_ptr = sbr_offset[1];
-        break;
-    case 24000:
-        sbr_offset_ptr = sbr_offset[2];
-        break;
-    case 32000:
-        sbr_offset_ptr = sbr_offset[3];
-        break;
-    case 44100: case 48000: case 64000:
-        sbr_offset_ptr = sbr_offset[4];
-        break;
-    case 88200: case 96000: case 128000: case 176400: case 192000:
-        sbr_offset_ptr = sbr_offset[5];
-        break;
-    default:
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Unsupported sample rate for SBR: %d\n", sbr->sample_rate);
-        return -1;
-    }
-
-    if (sbr->sample_rate < 32000) {
-        temp = 3000;
-    } else if (sbr->sample_rate < 64000) {
-        temp = 4000;
-    } else
-        temp = 5000;
-
-    start_min = ((temp << 7) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-    stop_min  = ((temp << 8) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-
-    sbr->k[0] = start_min + sbr_offset_ptr[spectrum->bs_start_freq];
-
-    if (spectrum->bs_stop_freq < 14) {
-        sbr->k[2] = stop_min;
-        make_bands(stop_dk, stop_min, 64, 13);
-        qsort(stop_dk, 13, sizeof(stop_dk[0]), qsort_comparison_function_int16);
-        for (k = 0; k < spectrum->bs_stop_freq; k++)
-            sbr->k[2] += stop_dk[k];
-    } else if (spectrum->bs_stop_freq == 14) {
-        sbr->k[2] = 2*sbr->k[0];
-    } else if (spectrum->bs_stop_freq == 15) {
-        sbr->k[2] = 3*sbr->k[0];
-    } else {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bs_stop_freq: %d\n", spectrum->bs_stop_freq);
-        return -1;
-    }
-    sbr->k[2] = FFMIN(64, sbr->k[2]);
-
-    // Requirements (14496-3 sp04 p205)
-    if (sbr->sample_rate <= 32000) {
-        max_qmf_subbands = 48;
-    } else if (sbr->sample_rate == 44100) {
-        max_qmf_subbands = 35;
-    } else if (sbr->sample_rate >= 48000)
-        max_qmf_subbands = 32;
-
-    if (sbr->k[2] - sbr->k[0] > max_qmf_subbands) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bitstream, too many QMF subbands: %d\n", sbr->k[2] - sbr->k[0]);
-        return -1;
-    }
-
-    if (!spectrum->bs_freq_scale) {
-        int dk, k2diff;
-
-        dk = spectrum->bs_alter_scale + 1;
-        sbr->n_master = ((sbr->k[2] - sbr->k[0] + (dk&2)) >> dk) << 1;
-        if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-            return -1;
-
-        for (k = 1; k <= sbr->n_master; k++)
-            sbr->f_master[k] = dk;
-
-        k2diff = sbr->k[2] - sbr->k[0] - sbr->n_master * dk;
-        if (k2diff < 0) {
-            sbr->f_master[1]--;
-            sbr->f_master[2]-= (k2diff < -1);
-        } else if (k2diff) {
-            sbr->f_master[sbr->n_master]++;
-        }
-
-        sbr->f_master[0] = sbr->k[0];
-        for (k = 1; k <= sbr->n_master; k++)
-            sbr->f_master[k] += sbr->f_master[k - 1];
-
-    } else {
-        int half_bands = 7 - spectrum->bs_freq_scale;      // bs_freq_scale  = {1,2,3}
-        int two_regions, num_bands_0;
-        int vdk0_max, vdk1_min;
-        int16_t vk0[49];
-
-        if (49 * sbr->k[2] > 110 * sbr->k[0]) {
-            two_regions = 1;
-            sbr->k[1] = 2 * sbr->k[0];
-        } else {
-            two_regions = 0;
-            sbr->k[1] = sbr->k[2];
-        }
-
-        num_bands_0 = lrintf(half_bands * log2f(sbr->k[1] / (float)sbr->k[0])) * 2;
-
-        if (num_bands_0 <= 0) { // Requirements (14496-3 sp04 p205)
-            av_log(ac->avctx, AV_LOG_ERROR, "Invalid num_bands_0: %d\n", num_bands_0);
-            return -1;
-        }
-
-        vk0[0] = 0;
-
-        make_bands(vk0+1, sbr->k[0], sbr->k[1], num_bands_0);
-
-        qsort(vk0 + 1, num_bands_0, sizeof(vk0[1]), qsort_comparison_function_int16);
-        vdk0_max = vk0[num_bands_0];
-
-        vk0[0] = sbr->k[0];
-        for (k = 1; k <= num_bands_0; k++) {
-            if (vk0[k] <= 0) { // Requirements (14496-3 sp04 p205)
-                av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk0[%d]: %d\n", k, vk0[k]);
-                return -1;
-            }
-            vk0[k] += vk0[k-1];
-        }
-
-        if (two_regions) {
-            int16_t vk1[49];
-            float invwarp = spectrum->bs_alter_scale ? 0.76923076923076923077f
-                                                     : 1.0f; // bs_alter_scale = {0,1}
-            int num_bands_1 = lrintf(half_bands * invwarp *
-                                     log2f(sbr->k[2] / (float)sbr->k[1])) * 2;
-
-            make_bands(vk1+1, sbr->k[1], sbr->k[2], num_bands_1);
-
-            vdk1_min = array_min_int16(vk1 + 1, num_bands_1);
-
-            if (vdk1_min < vdk0_max) {
-                int change;
-                qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
-                change = FFMIN(vdk0_max - vk1[1], (vk1[num_bands_1] - vk1[1]) >> 1);
-                vk1[1]           += change;
-                vk1[num_bands_1] -= change;
-            }
-
-            qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
-
-            vk1[0] = sbr->k[1];
-            for (k = 1; k <= num_bands_1; k++) {
-                if (vk1[k] <= 0) { // Requirements (14496-3 sp04 p205)
-                    av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk1[%d]: %d\n", k, vk1[k]);
-                    return -1;
-                }
-                vk1[k] += vk1[k-1];
-            }
-
-            sbr->n_master = num_bands_0 + num_bands_1;
-            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-                return -1;
-            memcpy(&sbr->f_master[0],               vk0,
-                   (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
-            memcpy(&sbr->f_master[num_bands_0 + 1], vk1 + 1,
-                    num_bands_1      * sizeof(sbr->f_master[0]));
-
-        } else {
-            sbr->n_master = num_bands_0;
-            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-                return -1;
-            memcpy(sbr->f_master, vk0, (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
-        }
-    }
-
-    return 0;
-}
-
-/// High Frequency Generation - Patch Construction (14496-3 sp04 p216 fig. 4.46)
-static int sbr_hf_calc_npatches(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int i, k, sb = 0;
-    int msb = sbr->k[0];
-    int usb = sbr->kx[1];
-    int goal_sb = ((1000 << 11) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-
-    sbr->num_patches = 0;
-
-    if (goal_sb < sbr->kx[1] + sbr->m[1]) {
-        for (k = 0; sbr->f_master[k] < goal_sb; k++) ;
-    } else
-        k = sbr->n_master;
-
-    do {
-        int odd = 0;
-        for (i = k; i == k || sb > (sbr->k[0] - 1 + msb - odd); i--) {
-            sb = sbr->f_master[i];
-            odd = (sb + sbr->k[0]) & 1;
-        }
-
-        // Requirements (14496-3 sp04 p205) sets the maximum number of patches to 5.
-        // After this check the final number of patches can still be six which is
-        // illegal however the Coding Technologies decoder check stream has a final
-        // count of 6 patches
-        if (sbr->num_patches > 5) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Too many patches: %d\n", sbr->num_patches);
-            return -1;
-        }
-
-        sbr->patch_num_subbands[sbr->num_patches]  = FFMAX(sb - usb, 0);
-        sbr->patch_start_subband[sbr->num_patches] = sbr->k[0] - odd - sbr->patch_num_subbands[sbr->num_patches];
-
-        if (sbr->patch_num_subbands[sbr->num_patches] > 0) {
-            usb = sb;
-            msb = sb;
-            sbr->num_patches++;
-        } else
-            msb = sbr->kx[1];
-
-        if (sbr->f_master[k] - sb < 3)
-            k = sbr->n_master;
-    } while (sb != sbr->kx[1] + sbr->m[1]);
-
-    if (sbr->num_patches > 1 &&
-        sbr->patch_num_subbands[sbr->num_patches - 1] < 3)
-        sbr->num_patches--;
-
-    return 0;
-}
-
-/// Derived Frequency Band Tables (14496-3 sp04 p197)
-static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int k, temp;
-
-    sbr->n[1] = sbr->n_master - sbr->spectrum_params.bs_xover_band;
-    sbr->n[0] = (sbr->n[1] + 1) >> 1;
-
-    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params.bs_xover_band],
-           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
-    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0];
-    sbr->kx[1] = sbr->f_tablehigh[0];
-
-    // Requirements (14496-3 sp04 p205)
-    if (sbr->kx[1] + sbr->m[1] > 64) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Stop frequency border too high: %d\n", sbr->kx[1] + sbr->m[1]);
-        return -1;
-    }
-    if (sbr->kx[1] > 32) {
-        av_log(ac->avctx, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->kx[1]);
-        return -1;
-    }
-
-    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
-    temp = sbr->n[1] & 1;
-    for (k = 1; k <= sbr->n[0]; k++)
-        sbr->f_tablelow[k] = sbr->f_tablehigh[2 * k - temp];
-
-    sbr->n_q = FFMAX(1, lrintf(sbr->spectrum_params.bs_noise_bands *
-                               log2f(sbr->k[2] / (float)sbr->kx[1]))); // 0 <= bs_noise_bands <= 3
-    if (sbr->n_q > 5) {
-        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
-        return -1;
-    }
-
-    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
-    temp = 0;
-    for (k = 1; k <= sbr->n_q; k++) {
-        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
-        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
-    }
-
-    if (sbr_hf_calc_npatches(ac, sbr) < 0)
-        return -1;
-
-    sbr_make_f_tablelim(sbr);
-
-    sbr->data[0].f_indexnoise = 0;
-    sbr->data[1].f_indexnoise = 0;
-
-    return 0;
-}
-
-static av_always_inline void get_bits1_vector(GetBitContext *gb, uint8_t *vec,
-                                              int elements)
-{
-    int i;
-    for (i = 0; i < elements; i++) {
-        vec[i] = get_bits1(gb);
-    }
-}
-
-/** ceil(log2(index+1)) */
-static const int8_t ceil_log2[] = {
-    0, 1, 2, 2, 3, 3,
-};
-
-static int read_sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
-                         GetBitContext *gb, SBRData *ch_data)
-{
-    int i;
-    int bs_pointer = 0;
-    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
-    int abs_bord_trail = 16;
-    int num_rel_lead, num_rel_trail;
-    unsigned bs_num_env_old = ch_data->bs_num_env;
-
-    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env];
-    ch_data->bs_amp_res = sbr->bs_amp_res_header;
-    ch_data->t_env_num_env_old = ch_data->t_env[bs_num_env_old];
-
-    switch (ch_data->bs_frame_class = get_bits(gb, 2)) {
-    case FIXFIX:
-        ch_data->bs_num_env                 = 1 << get_bits(gb, 2);
-        num_rel_lead                        = ch_data->bs_num_env - 1;
-        if (ch_data->bs_num_env == 1)
-            ch_data->bs_amp_res = 0;
-
-        if (ch_data->bs_num_env > 4) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n",
-                   ch_data->bs_num_env);
-            return -1;
-        }
-
-        ch_data->t_env[0]                   = 0;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        abs_bord_trail = (abs_bord_trail + (ch_data->bs_num_env >> 1)) /
-                   ch_data->bs_num_env;
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + abs_bord_trail;
-
-        ch_data->bs_freq_res[1] = get_bits1(gb);
-        for (i = 1; i < ch_data->bs_num_env; i++)
-            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
-        break;
-    case FIXVAR:
-        abs_bord_trail                     += get_bits(gb, 2);
-        num_rel_trail                       = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_trail + 1;
-        ch_data->t_env[0]                   = 0;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_trail; i++)
-            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
-                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        for (i = 0; i < ch_data->bs_num_env; i++)
-            ch_data->bs_freq_res[ch_data->bs_num_env - i] = get_bits1(gb);
-        break;
-    case VARFIX:
-        ch_data->t_env[0]                   = get_bits(gb, 2);
-        num_rel_lead                        = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_lead + 1;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
-        break;
-    case VARVAR:
-        ch_data->t_env[0]                   = get_bits(gb, 2);
-        abs_bord_trail                     += get_bits(gb, 2);
-        num_rel_lead                        = get_bits(gb, 2);
-        num_rel_trail                       = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_lead + num_rel_trail + 1;
-
-        if (ch_data->bs_num_env > 5) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n",
-                   ch_data->bs_num_env);
-            return -1;
-        }
-
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
-        for (i = 0; i < num_rel_trail; i++)
-            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
-                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
-        break;
-    }
-
-    if (bs_pointer < 0 || bs_pointer > ch_data->bs_num_env + 1) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bitstream, bs_pointer points to a middle noise border outside the time borders table: %d\n",
-               bs_pointer);
-        return -1;
-    }
-
-    for (i = 1; i <= ch_data->bs_num_env; i++) {
-        if (ch_data->t_env[i-1] > ch_data->t_env[i]) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Non monotone time borders\n");
-            return -1;
-        }
-    }
-
-    ch_data->bs_num_noise = (ch_data->bs_num_env > 1) + 1;
-
-    ch_data->t_q[0]                     = ch_data->t_env[0];
-    ch_data->t_q[ch_data->bs_num_noise] = ch_data->t_env[ch_data->bs_num_env];
-    if (ch_data->bs_num_noise > 1) {
-        int idx;
-        if (ch_data->bs_frame_class == FIXFIX) {
-            idx = ch_data->bs_num_env >> 1;
-        } else if (ch_data->bs_frame_class & 1) { // FIXVAR or VARVAR
-            idx = ch_data->bs_num_env - FFMAX(bs_pointer - 1, 1);
-        } else { // VARFIX
-            if (!bs_pointer)
-                idx = 1;
-            else if (bs_pointer == 1)
-                idx = ch_data->bs_num_env - 1;
-            else // bs_pointer > 1
-                idx = bs_pointer - 1;
-        }
-        ch_data->t_q[1] = ch_data->t_env[idx];
-    }
-
-    ch_data->e_a[0] = -(ch_data->e_a[1] != bs_num_env_old); // l_APrev
-    ch_data->e_a[1] = -1;
-    if ((ch_data->bs_frame_class & 1) && bs_pointer) { // FIXVAR or VARVAR and bs_pointer != 0
-        ch_data->e_a[1] = ch_data->bs_num_env + 1 - bs_pointer;
-    } else if ((ch_data->bs_frame_class == 2) && (bs_pointer > 1)) // VARFIX and bs_pointer > 1
-        ch_data->e_a[1] = bs_pointer - 1;
-
-    return 0;
-}
-
-static void copy_sbr_grid(SBRData *dst, const SBRData *src) {
-    //These variables are saved from the previous frame rather than copied
-    dst->bs_freq_res[0]    = dst->bs_freq_res[dst->bs_num_env];
-    dst->t_env_num_env_old = dst->t_env[dst->bs_num_env];
-    dst->e_a[0]            = -(dst->e_a[1] != dst->bs_num_env);
-
-    //These variables are read from the bitstream and therefore copied
-    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-sizeof(*dst->bs_freq_res));
-    memcpy(dst->t_env,         src->t_env,         sizeof(dst->t_env));
-    memcpy(dst->t_q,           src->t_q,           sizeof(dst->t_q));
-    dst->bs_num_env        = src->bs_num_env;
-    dst->bs_amp_res        = src->bs_amp_res;
-    dst->bs_num_noise      = src->bs_num_noise;
-    dst->bs_frame_class    = src->bs_frame_class;
-    dst->e_a[1]            = src->e_a[1];
-}
-
-/// Read how the envelope and noise floor data is delta coded
-static void read_sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
-                          SBRData *ch_data)
-{
-    get_bits1_vector(gb, ch_data->bs_df_env,   ch_data->bs_num_env);
-    get_bits1_vector(gb, ch_data->bs_df_noise, ch_data->bs_num_noise);
-}
-
-/// Read inverse filtering data
-static void read_sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
-                          SBRData *ch_data)
-{
-    int i;
-
-    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t));
-    for (i = 0; i < sbr->n_q; i++)
-        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
-}
-
-static void read_sbr_envelope(SpectralBandReplication *sbr, GetBitContext *gb,
-                              SBRData *ch_data, int ch)
-{
-    int bits;
-    int i, j, k;
-    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
-    int t_lav, f_lav;
-    const int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
-    const int odd = sbr->n[1] & 1;
-
-    if (sbr->bs_coupling && ch) {
-        if (ch_data->bs_amp_res) {
-            bits   = 5;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
-        } else {
-            bits   = 6;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
-        }
-    } else {
-        if (ch_data->bs_amp_res) {
-            bits   = 6;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
-        } else {
-            bits   = 7;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
-        }
-    }
-
-    for (i = 0; i < ch_data->bs_num_env; i++) {
-        if (ch_data->bs_df_env[i]) {
-            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
-            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-            } else if (ch_data->bs_freq_res[i + 1]) {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
-                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-                }
-            } else {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
-                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-                }
-            }
-        } else {
-            ch_data->env_facs[i + 1][0] = delta * get_bits(gb, bits); // bs_env_start_value_balance
-            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
-                ch_data->env_facs[i + 1][j] = ch_data->env_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
-        }
-    }
-
-    //assign 0th elements of env_facs from last elements
-    memcpy(ch_data->env_facs[0], ch_data->env_facs[ch_data->bs_num_env],
-           sizeof(ch_data->env_facs[0]));
-}
-
-static void read_sbr_noise(SpectralBandReplication *sbr, GetBitContext *gb,
-                           SBRData *ch_data, int ch)
-{
-    int i, j;
-    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
-    int t_lav, f_lav;
-    int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
-
-    if (sbr->bs_coupling && ch) {
-        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
-        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
-        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
-        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
-    } else {
-        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
-        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
-        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
-        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
-    }
-
-    for (i = 0; i < ch_data->bs_num_noise; i++) {
-        if (ch_data->bs_df_noise[i]) {
-            for (j = 0; j < sbr->n_q; j++)
-                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
-        } else {
-            ch_data->noise_facs[i + 1][0] = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
-            for (j = 1; j < sbr->n_q; j++)
-                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
-        }
-    }
-
-    //assign 0th elements of noise_facs from last elements
-    memcpy(ch_data->noise_facs[0], ch_data->noise_facs[ch_data->bs_num_noise],
-           sizeof(ch_data->noise_facs[0]));
-}
-
-static void read_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
-                               GetBitContext *gb,
-                               int bs_extension_id, int *num_bits_left)
-{
-    switch (bs_extension_id) {
-    case EXTENSION_ID_PS:
-        if (!ac->oc[1].m4ac.ps) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Parametric Stereo signaled to be not-present but was found in the bitstream.\n");
-            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-            *num_bits_left = 0;
-        } else {
-            *num_bits_left -= ff_ps_read_data(ac->avctx, gb, &sbr->ps, *num_bits_left);
-            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
-        }
-        break;
-    default:
-        // some files contain 0-padding
-        if (bs_extension_id || *num_bits_left > 16 || show_bits(gb, *num_bits_left))
-            avpriv_request_sample(ac->avctx, "Reserved SBR extensions");
-        skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-        *num_bits_left = 0;
-        break;
-    }
-}
-
-static int read_sbr_single_channel_element(AACContext *ac,
-                                            SpectralBandReplication *sbr,
-                                            GetBitContext *gb)
-{
-    if (get_bits1(gb)) // bs_data_extra
-        skip_bits(gb, 4); // bs_reserved
-
-    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
-        return -1;
-    read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-    read_sbr_invf(sbr, gb, &sbr->data[0]);
-    read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-    read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-
-    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
-
-    return 0;
-}
-
-static int read_sbr_channel_pair_element(AACContext *ac,
-                                          SpectralBandReplication *sbr,
-                                          GetBitContext *gb)
-{
-    if (get_bits1(gb))    // bs_data_extra
-        skip_bits(gb, 8); // bs_reserved
-
-    if ((sbr->bs_coupling = get_bits1(gb))) {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
-            return -1;
-        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
-        read_sbr_invf(sbr, gb, &sbr->data[0]);
-        memcpy(sbr->data[1].bs_invf_mode[1], sbr->data[1].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
-        memcpy(sbr->data[1].bs_invf_mode[0], sbr->data[0].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
-        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
-        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
-    } else {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
-            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
-            return -1;
-        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
-        read_sbr_invf(sbr, gb, &sbr->data[0]);
-        read_sbr_invf(sbr, gb, &sbr->data[1]);
-        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
-        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
-    }
-
-    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
-    if ((sbr->data[1].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[1].bs_add_harmonic, sbr->n[1]);
-
-    return 0;
-}
-
-static unsigned int read_sbr_data(AACContext *ac, SpectralBandReplication *sbr,
-                                  GetBitContext *gb, int id_aac)
-{
-    unsigned int cnt = get_bits_count(gb);
-
-    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
-        if (read_sbr_single_channel_element(ac, sbr, gb)) {
-            sbr_turnoff(sbr);
-            return get_bits_count(gb) - cnt;
-        }
-    } else if (id_aac == TYPE_CPE) {
-        if (read_sbr_channel_pair_element(ac, sbr, gb)) {
-            sbr_turnoff(sbr);
-            return get_bits_count(gb) - cnt;
-        }
-    } else {
-        av_log(ac->avctx, AV_LOG_ERROR,
-            "Invalid bitstream - cannot apply SBR to element type %d\n", id_aac);
-        sbr_turnoff(sbr);
-        return get_bits_count(gb) - cnt;
-    }
-    if (get_bits1(gb)) { // bs_extended_data
-        int num_bits_left = get_bits(gb, 4); // bs_extension_size
-        if (num_bits_left == 15)
-            num_bits_left += get_bits(gb, 8); // bs_esc_count
-
-        num_bits_left <<= 3;
-        while (num_bits_left > 7) {
-            num_bits_left -= 2;
-            read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
-        }
-        if (num_bits_left < 0) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
-        }
-        if (num_bits_left > 0)
-            skip_bits(gb, num_bits_left);
-    }
-
-    return get_bits_count(gb) - cnt;
-}
-
-static void sbr_reset(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int err;
-    err = sbr_make_f_master(ac, sbr, &sbr->spectrum_params);
-    if (err >= 0)
-        err = sbr_make_f_derived(ac, sbr);
-    if (err < 0) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "SBR reset failed. Switching SBR to pure upsampling mode.\n");
-        sbr_turnoff(sbr);
-    }
-}
-
-/**
- * Decode Spectral Band Replication extension data; reference: table 4.55.
- *
- * @param   crc flag indicating the presence of CRC checksum
- * @param   cnt length of TYPE_FIL syntactic element in bytes
- *
- * @return  Returns number of bytes consumed from the TYPE_FIL element.
- */
-int ff_decode_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
-                            GetBitContext *gb_host, int crc, int cnt, int id_aac)
-{
-    unsigned int num_sbr_bits = 0, num_align_bits;
-    unsigned bytes_read;
-    GetBitContext gbc = *gb_host, *gb = &gbc;
-    skip_bits_long(gb_host, cnt*8 - 4);
-
-    sbr->reset = 0;
-
-    if (!sbr->sample_rate)
-        sbr->sample_rate = 2 * ac->oc[1].m4ac.sample_rate; //TODO use the nominal sample rate for arbitrary sample rate support
-    if (!ac->oc[1].m4ac.ext_sample_rate)
-        ac->oc[1].m4ac.ext_sample_rate = 2 * ac->oc[1].m4ac.sample_rate;
-
-    if (crc) {
-        skip_bits(gb, 10); // bs_sbr_crc_bits; TODO - implement CRC check
-        num_sbr_bits += 10;
-    }
-
-    //Save some state from the previous frame.
-    sbr->kx[0] = sbr->kx[1];
-    sbr->m[0] = sbr->m[1];
-    sbr->kx_and_m_pushed = 1;
-
-    num_sbr_bits++;
-    if (get_bits1(gb)) // bs_header_flag
-        num_sbr_bits += read_sbr_header(sbr, gb);
-
-    if (sbr->reset)
-        sbr_reset(ac, sbr);
-
-    if (sbr->start)
-        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
-
-    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7;
-    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3);
-
-    if (bytes_read > cnt) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Expected to read %d SBR bytes actually read %d.\n", cnt, bytes_read);
-        sbr_turnoff(sbr);
-    }
-    return cnt;
-}
-
 /// Dequantization and stereo decoding (14496-3 sp04 p203)
 static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
 {
     int k, e;
     int ch;
-
+    static const double exp2_tab[2] = {1, M_SQRT2};
     if (id_aac == TYPE_CPE && sbr->bs_coupling) {
-        float alpha      = sbr->data[0].bs_amp_res ?  1.0f :  0.5f;
-        float pan_offset = sbr->data[0].bs_amp_res ? 12.0f : 24.0f;
+        int pan_offset = sbr->data[0].bs_amp_res ? 12 : 24;
         for (e = 1; e <= sbr->data[0].bs_num_env; e++) {
             for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[e]]; k++) {
-                float temp1 = exp2f(sbr->data[0].env_facs[e][k] * alpha + 7.0f);
-                float temp2 = exp2f((pan_offset - sbr->data[1].env_facs[e][k]) * alpha);
-                float fac   = temp1 / (1.0f + temp2);
+                float temp1, temp2, fac;
+                if (sbr->data[0].bs_amp_res) {
+                    temp1 = ff_exp2fi(sbr->data[0].env_facs_q[e][k] + 7);
+                    temp2 = ff_exp2fi(pan_offset - sbr->data[1].env_facs_q[e][k]);
+                }
+                else {
+                    temp1 = ff_exp2fi((sbr->data[0].env_facs_q[e][k]>>1) + 7) *
+                            exp2_tab[sbr->data[0].env_facs_q[e][k] & 1];
+                    temp2 = ff_exp2fi((pan_offset - sbr->data[1].env_facs_q[e][k])>>1) *
+                            exp2_tab[(pan_offset - sbr->data[1].env_facs_q[e][k]) & 1];
+                }
+                if (temp1 > 1E20) {
+                    av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                    temp1 = 1;
+                }
+                fac   = temp1 / (1.0f + temp2);
                 sbr->data[0].env_facs[e][k] = fac;
                 sbr->data[1].env_facs[e][k] = fac * temp2;
             }
         }
         for (e = 1; e <= sbr->data[0].bs_num_noise; e++) {
             for (k = 0; k < sbr->n_q; k++) {
-                float temp1 = exp2f(NOISE_FLOOR_OFFSET - sbr->data[0].noise_facs[e][k] + 1);
-                float temp2 = exp2f(12 - sbr->data[1].noise_facs[e][k]);
-                float fac   = temp1 / (1.0f + temp2);
+                float temp1 = ff_exp2fi(NOISE_FLOOR_OFFSET - sbr->data[0].noise_facs_q[e][k] + 1);
+                float temp2 = ff_exp2fi(12 - sbr->data[1].noise_facs_q[e][k]);
+                float fac;
+                av_assert0(temp1 <= 1E20);
+                fac = temp1 / (1.0f + temp2);
                 sbr->data[0].noise_facs[e][k] = fac;
                 sbr->data[1].noise_facs[e][k] = fac * temp2;
             }
         }
     } else { // SCE or one non-coupled CPE
         for (ch = 0; ch < (id_aac == TYPE_CPE) + 1; ch++) {
-            float alpha = sbr->data[ch].bs_amp_res ? 1.0f : 0.5f;
             for (e = 1; e <= sbr->data[ch].bs_num_env; e++)
-                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++)
-                    sbr->data[ch].env_facs[e][k] =
-                        exp2f(alpha * sbr->data[ch].env_facs[e][k] + 6.0f);
+                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++){
+                    if (sbr->data[ch].bs_amp_res)
+                        sbr->data[ch].env_facs[e][k] = ff_exp2fi(sbr->data[ch].env_facs_q[e][k] + 6);
+                    else
+                        sbr->data[ch].env_facs[e][k] = ff_exp2fi((sbr->data[ch].env_facs_q[e][k]>>1) + 6)
+                                                       * exp2_tab[sbr->data[ch].env_facs_q[e][k] & 1];
+                    if (sbr->data[ch].env_facs[e][k] > 1E20) {
+                        av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                        sbr->data[ch].env_facs[e][k] = 1;
+                    }
+                }
+
             for (e = 1; e <= sbr->data[ch].bs_num_noise; e++)
                 for (k = 0; k < sbr->n_q; k++)
                     sbr->data[ch].noise_facs[e][k] =
-                        exp2f(NOISE_FLOOR_OFFSET - sbr->data[ch].noise_facs[e][k]);
+                        ff_exp2fi(NOISE_FLOOR_OFFSET - sbr->data[ch].noise_facs_q[e][k]);
         }
     }
 }
 
-/**
- * Analysis QMF Bank (14496-3 sp04 p206)
- *
- * @param   x       pointer to the beginning of the first sample window
- * @param   W       array of complex-valued samples split into subbands
- */
-static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct,
-                             SBRDSPContext *sbrdsp, const float *in, float *x,
-                             float z[320], float W[2][32][32][2], int buf_idx)
-{
-    int i;
-    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
-    memcpy(x+288, in,         1024*sizeof(x[0]));
-    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
-                               // are not supported
-        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
-        sbrdsp->sum64x5(z);
-        sbrdsp->qmf_pre_shuffle(z);
-        mdct->imdct_half(mdct, z, z+64);
-        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
-        x += 32;
-    }
-}
-
-/**
- * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
- * (14496-3 sp04 p206)
- */
-static void sbr_qmf_synthesis(FFTContext *mdct,
-                              SBRDSPContext *sbrdsp, AVFloatDSPContext *dsp,
-                              float *out, float X[2][38][64],
-                              float mdct_buf[2][64],
-                              float *v0, int *v_off, const unsigned int div)
-{
-    int i, n;
-    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
-    const int step = 128 >> div;
-    float *v;
-    for (i = 0; i < 32; i++) {
-        if (*v_off < step) {
-            int saved_samples = (1280 - 128) >> div;
-            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float));
-            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
-        } else {
-            *v_off -= step;
-        }
-        v = v0 + *v_off;
-        if (div) {
-            for (n = 0; n < 32; n++) {
-                X[0][i][   n] = -X[0][i][n];
-                X[0][i][32+n] =  X[1][i][31-n];
-            }
-            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
-            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
-        } else {
-            sbrdsp->neg_odd_64(X[1][i]);
-            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
-            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
-            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
-        }
-        dsp->vector_fmul    (out, v                , sbr_qmf_window                       , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
-        out += 64 >> div;
-    }
-}
-
 /** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
  * (14496-3 sp04 p214)
  * Warning: This routine does not seem numerically stable.
@@ -1297,203 +212,6 @@ static void sbr_chirp(SpectralBandReplication *sbr, SBRData *ch_data)
     }
 }
 
-/// Generate the subband filtered lowband
-static int sbr_lf_gen(AACContext *ac, SpectralBandReplication *sbr,
-                      float X_low[32][40][2], const float W[2][32][32][2],
-                      int buf_idx)
-{
-    int i, k;
-    const int t_HFGen = 8;
-    const int i_f = 32;
-    memset(X_low, 0, 32*sizeof(*X_low));
-    for (k = 0; k < sbr->kx[1]; k++) {
-        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
-            X_low[k][i][0] = W[buf_idx][i - t_HFGen][k][0];
-            X_low[k][i][1] = W[buf_idx][i - t_HFGen][k][1];
-        }
-    }
-    buf_idx = 1-buf_idx;
-    for (k = 0; k < sbr->kx[0]; k++) {
-        for (i = 0; i < t_HFGen; i++) {
-            X_low[k][i][0] = W[buf_idx][i + i_f - t_HFGen][k][0];
-            X_low[k][i][1] = W[buf_idx][i + i_f - t_HFGen][k][1];
-        }
-    }
-    return 0;
-}
-
-/// High Frequency Generator (14496-3 sp04 p215)
-static int sbr_hf_gen(AACContext *ac, SpectralBandReplication *sbr,
-                      float X_high[64][40][2], const float X_low[32][40][2],
-                      const float (*alpha0)[2], const float (*alpha1)[2],
-                      const float bw_array[5], const uint8_t *t_env,
-                      int bs_num_env)
-{
-    int j, x;
-    int g = 0;
-    int k = sbr->kx[1];
-    for (j = 0; j < sbr->num_patches; j++) {
-        for (x = 0; x < sbr->patch_num_subbands[j]; x++, k++) {
-            const int p = sbr->patch_start_subband[j] + x;
-            while (g <= sbr->n_q && k >= sbr->f_tablenoise[g])
-                g++;
-            g--;
-
-            if (g < 0) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "ERROR : no subband found for frequency %d\n", k);
-                return -1;
-            }
-
-            sbr->dsp.hf_gen(X_high[k] + ENVELOPE_ADJUSTMENT_OFFSET,
-                            X_low[p]  + ENVELOPE_ADJUSTMENT_OFFSET,
-                            alpha0[p], alpha1[p], bw_array[g],
-                            2 * t_env[0], 2 * t_env[bs_num_env]);
-        }
-    }
-    if (k < sbr->m[1] + sbr->kx[1])
-        memset(X_high + k, 0, (sbr->m[1] + sbr->kx[1] - k) * sizeof(*X_high));
-
-    return 0;
-}
-
-/// Generate the subband filtered lowband
-static int sbr_x_gen(SpectralBandReplication *sbr, float X[2][38][64],
-                     const float Y0[38][64][2], const float Y1[38][64][2],
-                     const float X_low[32][40][2], int ch)
-{
-    int k, i;
-    const int i_f = 32;
-    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
-    memset(X, 0, 2*sizeof(*X));
-    for (k = 0; k < sbr->kx[0]; k++) {
-        for (i = 0; i < i_Temp; i++) {
-            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
-            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
-        }
-    }
-    for (; k < sbr->kx[0] + sbr->m[0]; k++) {
-        for (i = 0; i < i_Temp; i++) {
-            X[0][i][k] = Y0[i + i_f][k][0];
-            X[1][i][k] = Y0[i + i_f][k][1];
-        }
-    }
-
-    for (k = 0; k < sbr->kx[1]; k++) {
-        for (i = i_Temp; i < 38; i++) {
-            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
-            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
-        }
-    }
-    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
-        for (i = i_Temp; i < i_f; i++) {
-            X[0][i][k] = Y1[i][k][0];
-            X[1][i][k] = Y1[i][k][1];
-        }
-    }
-    return 0;
-}
-
-/** High Frequency Adjustment (14496-3 sp04 p217) and Mapping
- * (14496-3 sp04 p217)
- */
-static int sbr_mapping(AACContext *ac, SpectralBandReplication *sbr,
-                        SBRData *ch_data, int e_a[2])
-{
-    int e, i, m;
-
-    memset(ch_data->s_indexmapped[1], 0, 7*sizeof(ch_data->s_indexmapped[1]));
-    for (e = 0; e < ch_data->bs_num_env; e++) {
-        const unsigned int ilim = sbr->n[ch_data->bs_freq_res[e + 1]];
-        uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
-        int k;
-
-        if (sbr->kx[1] != table[0]) {
-            av_log(ac->avctx, AV_LOG_ERROR, "kx != f_table{high,low}[0]. "
-                   "Derived frequency tables were not regenerated.\n");
-            sbr_turnoff(sbr);
-            return AVERROR_BUG;
-        }
-        for (i = 0; i < ilim; i++)
-            for (m = table[i]; m < table[i + 1]; m++)
-                sbr->e_origmapped[e][m - sbr->kx[1]] = ch_data->env_facs[e+1][i];
-
-        // ch_data->bs_num_noise > 1 => 2 noise floors
-        k = (ch_data->bs_num_noise > 1) && (ch_data->t_env[e] >= ch_data->t_q[1]);
-        for (i = 0; i < sbr->n_q; i++)
-            for (m = sbr->f_tablenoise[i]; m < sbr->f_tablenoise[i + 1]; m++)
-                sbr->q_mapped[e][m - sbr->kx[1]] = ch_data->noise_facs[k+1][i];
-
-        for (i = 0; i < sbr->n[1]; i++) {
-            if (ch_data->bs_add_harmonic_flag) {
-                const unsigned int m_midpoint =
-                    (sbr->f_tablehigh[i] + sbr->f_tablehigh[i + 1]) >> 1;
-
-                ch_data->s_indexmapped[e + 1][m_midpoint - sbr->kx[1]] = ch_data->bs_add_harmonic[i] *
-                    (e >= e_a[1] || (ch_data->s_indexmapped[0][m_midpoint - sbr->kx[1]] == 1));
-            }
-        }
-
-        for (i = 0; i < ilim; i++) {
-            int additional_sinusoid_present = 0;
-            for (m = table[i]; m < table[i + 1]; m++) {
-                if (ch_data->s_indexmapped[e + 1][m - sbr->kx[1]]) {
-                    additional_sinusoid_present = 1;
-                    break;
-                }
-            }
-            memset(&sbr->s_mapped[e][table[i] - sbr->kx[1]], additional_sinusoid_present,
-                   (table[i + 1] - table[i]) * sizeof(sbr->s_mapped[e][0]));
-        }
-    }
-
-    memcpy(ch_data->s_indexmapped[0], ch_data->s_indexmapped[ch_data->bs_num_env], sizeof(ch_data->s_indexmapped[0]));
-    return 0;
-}
-
-/// Estimation of current envelope (14496-3 sp04 p218)
-static void sbr_env_estimate(float (*e_curr)[48], float X_high[64][40][2],
-                             SpectralBandReplication *sbr, SBRData *ch_data)
-{
-    int e, m;
-    int kx1 = sbr->kx[1];
-
-    if (sbr->bs_interpol_freq) {
-        for (e = 0; e < ch_data->bs_num_env; e++) {
-            const float recip_env_size = 0.5f / (ch_data->t_env[e + 1] - ch_data->t_env[e]);
-            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-
-            for (m = 0; m < sbr->m[1]; m++) {
-                float sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
-                e_curr[e][m] = sum * recip_env_size;
-            }
-        }
-    } else {
-        int k, p;
-
-        for (e = 0; e < ch_data->bs_num_env; e++) {
-            const int env_size = 2 * (ch_data->t_env[e + 1] - ch_data->t_env[e]);
-            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
-
-            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
-                float sum = 0.0f;
-                const int den = env_size * (table[p + 1] - table[p]);
-
-                for (k = table[p]; k < table[p + 1]; k++) {
-                    sum += sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb);
-                }
-                sum /= den;
-                for (k = table[p]; k < table[p + 1]; k++) {
-                    e_curr[e][k - kx1] = sum;
-                }
-            }
-        }
-    }
-}
-
 /**
  * Calculation of levels of additional HF signal components (14496-3 sp04 p219)
  * and Calculation of gain (14496-3 sp04 p219)
@@ -1523,6 +241,7 @@ static void sbr_gain_calc(AACContext *ac, SpectralBandReplication *sbr,
                                             ((1.0f + sbr->e_curr[e][m]) *
                                              (1.0f + sbr->q_mapped[e][m])));
                 }
+                sbr->gain[e][m] += FLT_MIN;
             }
             for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
                 sum[0] += sbr->e_origmapped[e][m];
@@ -1570,10 +289,6 @@ static void sbr_hf_assemble(float Y1[38][64][2],
         0.11516383427084,
         0.03183050093751,
     };
-    static const int8_t phi[2][4] = {
-        {  1,  0, -1,  0}, // real
-        {  0,  1,  0, -1}, // imaginary
-    };
     float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
     int indexnoise = ch_data->f_indexnoise;
     int indexsine  = ch_data->f_indexsine;
@@ -1603,7 +318,6 @@ static void sbr_hf_assemble(float Y1[38][64][2],
 
     for (e = 0; e < ch_data->bs_num_env; e++) {
         for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
-            int phi_sign = (1 - 2*(kx & 1));
             LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
             LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
             float *g_filt, *q_filt;
@@ -1633,13 +347,17 @@ static void sbr_hf_assemble(float Y1[38][64][2],
                                                    q_filt, indexnoise,
                                                    kx, m_max);
             } else {
-                for (m = 0; m < m_max; m++) {
-                    Y1[i][m + kx][0] +=
-                        sbr->s_m[e][m] * phi[0][indexsine];
-                    Y1[i][m + kx][1] +=
-                        sbr->s_m[e][m] * (phi[1][indexsine] * phi_sign);
-                    phi_sign = -phi_sign;
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                float *out = &Y1[i][kx][idx];
+                float *in  = sbr->s_m[e];
+                for (m = 0; m+1 < m_max; m+=2) {
+                    out[2*m  ] += in[m  ] * A;
+                    out[2*m+2] += in[m+1] * B;
                 }
+                if(m_max&1)
+                    out[2*m  ] += in[m  ] * A;
             }
             indexnoise = (indexnoise + m_max) & 0x1ff;
             indexsine = (indexsine + 1) & 3;
@@ -1649,81 +367,4 @@ static void sbr_hf_assemble(float Y1[38][64][2],
     ch_data->f_indexsine  = indexsine;
 }
 
-void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
-                  float* L, float* R)
-{
-    int downsampled = ac->oc[1].m4ac.ext_sample_rate < sbr->sample_rate;
-    int ch;
-    int nch = (id_aac == TYPE_CPE) ? 2 : 1;
-    int err;
-
-    if (!sbr->kx_and_m_pushed) {
-        sbr->kx[0] = sbr->kx[1];
-        sbr->m[0] = sbr->m[1];
-    } else {
-        sbr->kx_and_m_pushed = 0;
-    }
-
-    if (sbr->start) {
-        sbr_dequant(sbr, id_aac);
-    }
-    for (ch = 0; ch < nch; ch++) {
-        /* decode channel */
-        sbr_qmf_analysis(&ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
-                         (float*)sbr->qmf_filter_scratch,
-                         sbr->data[ch].W, sbr->data[ch].Ypos);
-        sbr_lf_gen(ac, sbr, sbr->X_low,
-                   (const float (*)[32][32][2]) sbr->data[ch].W,
-                   sbr->data[ch].Ypos);
-        sbr->data[ch].Ypos ^= 1;
-        if (sbr->start) {
-            sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
-                                  (const float (*)[40][2]) sbr->X_low, sbr->k[0]);
-            sbr_chirp(sbr, &sbr->data[ch]);
-            sbr_hf_gen(ac, sbr, sbr->X_high,
-                       (const float (*)[40][2]) sbr->X_low,
-                       (const float (*)[2]) sbr->alpha0,
-                       (const float (*)[2]) sbr->alpha1,
-                       sbr->data[ch].bw_array, sbr->data[ch].t_env,
-                       sbr->data[ch].bs_num_env);
-
-            // hf_adj
-            err = sbr_mapping(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
-            if (!err) {
-                sbr_env_estimate(sbr->e_curr, sbr->X_high, sbr, &sbr->data[ch]);
-                sbr_gain_calc(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
-                sbr_hf_assemble(sbr->data[ch].Y[sbr->data[ch].Ypos],
-                                (const float (*)[40][2]) sbr->X_high,
-                                sbr, &sbr->data[ch],
-                                sbr->data[ch].e_a);
-            }
-        }
-
-        /* synthesis */
-        sbr_x_gen(sbr, sbr->X[ch],
-                  (const float (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
-                  (const float (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
-                  (const float (*)[40][2]) sbr->X_low, ch);
-    }
-
-    if (ac->oc[1].m4ac.ps == 1) {
-        if (sbr->ps.start) {
-            ff_ps_apply(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
-        } else {
-            memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
-        }
-        nch = 2;
-    }
-
-    sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, &ac->fdsp,
-                      L, sbr->X[0], sbr->qmf_filter_scratch,
-                      sbr->data[0].synthesis_filterbank_samples,
-                      &sbr->data[0].synthesis_filterbank_samples_offset,
-                      downsampled);
-    if (nch == 2)
-        sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, &ac->fdsp,
-                          R, sbr->X[1], sbr->qmf_filter_scratch,
-                          sbr->data[1].synthesis_filterbank_samples,
-                          &sbr->data[1].synthesis_filterbank_samples_offset,
-                          downsampled);
-}
+#include "aacsbr_template.c"
diff --git a/libavcodec/aacsbr.h b/libavcodec/aacsbr.h
index 9bc5e29..dd8b66c 100644
--- a/libavcodec/aacsbr.h
+++ b/libavcodec/aacsbr.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2010      Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,17 +33,64 @@
 #include "aac.h"
 #include "sbr.h"
 
+#define ENVELOPE_ADJUSTMENT_OFFSET 2
+#define NOISE_FLOOR_OFFSET 6
+
+/**
+ * SBR VLC tables
+ */
+enum {
+    T_HUFFMAN_ENV_1_5DB,
+    F_HUFFMAN_ENV_1_5DB,
+    T_HUFFMAN_ENV_BAL_1_5DB,
+    F_HUFFMAN_ENV_BAL_1_5DB,
+    T_HUFFMAN_ENV_3_0DB,
+    F_HUFFMAN_ENV_3_0DB,
+    T_HUFFMAN_ENV_BAL_3_0DB,
+    F_HUFFMAN_ENV_BAL_3_0DB,
+    T_HUFFMAN_NOISE_3_0DB,
+    T_HUFFMAN_NOISE_BAL_3_0DB,
+};
+
+/**
+ * bs_frame_class - frame class of current SBR frame (14496-3 sp04 p98)
+ */
+enum {
+    FIXFIX,
+    FIXVAR,
+    VARFIX,
+    VARVAR,
+};
+
+enum {
+    EXTENSION_ID_PS = 2,
+};
+
+static const int8_t vlc_sbr_lav[10] =
+    { 60, 60, 24, 24, 31, 31, 12, 12, 31, 12 };
+
+#define SBR_INIT_VLC_STATIC(num, size) \
+    INIT_VLC_STATIC(&vlc_sbr[num], 9, sbr_tmp[num].table_size / sbr_tmp[num].elem_size,     \
+                    sbr_tmp[num].sbr_bits ,                      1,                      1, \
+                    sbr_tmp[num].sbr_codes, sbr_tmp[num].elem_size, sbr_tmp[num].elem_size, \
+                    size)
+
+#define SBR_VLC_ROW(name) \
+    { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
+
 /** Initialize SBR. */
-void ff_aac_sbr_init(void);
+void AAC_RENAME(ff_aac_sbr_init)(void);
 /** Initialize one SBR context. */
-void ff_aac_sbr_ctx_init(AACContext *ac, SpectralBandReplication *sbr);
+void AAC_RENAME(ff_aac_sbr_ctx_init)(AACContext *ac, SpectralBandReplication *sbr, int id_aac);
 /** Close one SBR context. */
-void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr);
+void AAC_RENAME(ff_aac_sbr_ctx_close)(SpectralBandReplication *sbr);
 /** Decode one SBR element. */
-int ff_decode_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
+int AAC_RENAME(ff_decode_sbr_extension)(AACContext *ac, SpectralBandReplication *sbr,
                             GetBitContext *gb, int crc, int cnt, int id_aac);
 /** Apply one SBR element to one AAC element. */
-void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
-                  float* L, float *R);
+void AAC_RENAME(ff_sbr_apply)(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
+                  INTFLOAT* L, INTFLOAT *R);
+
+void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c);
 
 #endif /* AVCODEC_AACSBR_H */
diff --git a/libavcodec/aacsbr_fixed.c b/libavcodec/aacsbr_fixed.c
new file mode 100644
index 0000000..59cbba1
--- /dev/null
+++ b/libavcodec/aacsbr_fixed.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * AAC Spectral Band Replication decoding functions (fixed-point)
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC Spectral Band Replication decoding functions (fixed-point)
+ * Note: Rounding-to-nearest used unless otherwise stated
+ * @author Robert Swain ( rob opendot cl )
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ */
+#define USE_FIXED 1
+
+#include "aac.h"
+#include "sbr.h"
+#include "aacsbr.h"
+#include "aacsbrdata.h"
+#include "aacsbr_fixed_tablegen.h"
+#include "fft.h"
+#include "aacps.h"
+#include "sbrdsp.h"
+#include "libavutil/internal.h"
+#include "libavutil/libm.h"
+#include "libavutil/avassert.h"
+
+#include <stdint.h>
+#include <float.h>
+#include <math.h>
+
+static VLC vlc_sbr[10];
+static void aacsbr_func_ptr_init(AACSBRContext *c);
+static const int CONST_LN2       = Q31(0.6931471806/256);  // ln(2)/256
+static const int CONST_RECIP_LN2 = Q31(0.7213475204);      // 0.5/ln(2)
+static const int CONST_076923    = Q31(0.76923076923076923077f);
+
+static const int fixed_log_table[10] =
+{
+    Q31(1.0/2), Q31(1.0/3), Q31(1.0/4), Q31(1.0/5), Q31(1.0/6),
+    Q31(1.0/7), Q31(1.0/8), Q31(1.0/9), Q31(1.0/10), Q31(1.0/11)
+};
+
+static int fixed_log(int x)
+{
+    int i, ret, xpow, tmp;
+
+    ret = x;
+    xpow = x;
+    for (i=0; i<10; i+=2){
+        xpow = (int)(((int64_t)xpow * x + 0x40000000) >> 31);
+        tmp = (int)(((int64_t)xpow * fixed_log_table[i] + 0x40000000) >> 31);
+        ret -= tmp;
+
+        xpow = (int)(((int64_t)xpow * x + 0x40000000) >> 31);
+        tmp = (int)(((int64_t)xpow * fixed_log_table[i+1] + 0x40000000) >> 31);
+        ret += tmp;
+    }
+
+    return ret;
+}
+
+static const int fixed_exp_table[7] =
+{
+    Q31(1.0/2), Q31(1.0/6), Q31(1.0/24), Q31(1.0/120),
+    Q31(1.0/720), Q31(1.0/5040), Q31(1.0/40320)
+};
+
+static int fixed_exp(int x)
+{
+    int i, ret, xpow, tmp;
+
+    ret = 0x800000 + x;
+    xpow = x;
+    for (i=0; i<7; i++){
+        xpow = (int)(((int64_t)xpow * x + 0x400000) >> 23);
+        tmp = (int)(((int64_t)xpow * fixed_exp_table[i] + 0x40000000) >> 31);
+        ret += tmp;
+    }
+
+    return ret;
+}
+
+static void make_bands(int16_t* bands, int start, int stop, int num_bands)
+{
+    int k, previous, present;
+    int base, prod, nz = 0;
+
+    base = (stop << 23) / start;
+    while (base < 0x40000000){
+        base <<= 1;
+        nz++;
+    }
+    base = fixed_log(base - 0x80000000);
+    base = (((base + 0x80) >> 8) + (8-nz)*CONST_LN2) / num_bands;
+    base = fixed_exp(base);
+
+    previous = start;
+    prod = start << 23;
+
+    for (k = 0; k < num_bands-1; k++) {
+        prod = (int)(((int64_t)prod * base + 0x400000) >> 23);
+        present = (prod + 0x400000) >> 23;
+        bands[k] = present - previous;
+        previous = present;
+    }
+    bands[num_bands-1] = stop - previous;
+}
+
+/// Dequantization and stereo decoding (14496-3 sp04 p203)
+static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
+{
+    int k, e;
+    int ch;
+
+    if (id_aac == TYPE_CPE && sbr->bs_coupling) {
+        int alpha      = sbr->data[0].bs_amp_res ?  2 :  1;
+        int pan_offset = sbr->data[0].bs_amp_res ? 12 : 24;
+        for (e = 1; e <= sbr->data[0].bs_num_env; e++) {
+            for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[e]]; k++) {
+                SoftFloat temp1, temp2, fac;
+
+                temp1.exp = sbr->data[0].env_facs_q[e][k] * alpha + 14;
+                if (temp1.exp & 1)
+                  temp1.mant = 759250125;
+                else
+                  temp1.mant = 0x20000000;
+                temp1.exp = (temp1.exp >> 1) + 1;
+                if (temp1.exp > 66) { // temp1 > 1E20
+                    av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                    temp1 = FLOAT_1;
+                }
+
+                temp2.exp = (pan_offset - sbr->data[1].env_facs_q[e][k]) * alpha;
+                if (temp2.exp & 1)
+                  temp2.mant = 759250125;
+                else
+                  temp2.mant = 0x20000000;
+                temp2.exp = (temp2.exp >> 1) + 1;
+                fac   = av_div_sf(temp1, av_add_sf(FLOAT_1, temp2));
+                sbr->data[0].env_facs[e][k] = fac;
+                sbr->data[1].env_facs[e][k] = av_mul_sf(fac, temp2);
+            }
+        }
+        for (e = 1; e <= sbr->data[0].bs_num_noise; e++) {
+            for (k = 0; k < sbr->n_q; k++) {
+                SoftFloat temp1, temp2, fac;
+
+                temp1.exp = NOISE_FLOOR_OFFSET - \
+                    sbr->data[0].noise_facs_q[e][k] + 2;
+                temp1.mant = 0x20000000;
+                av_assert0(temp1.exp <= 66);
+                temp2.exp = 12 - sbr->data[1].noise_facs_q[e][k] + 1;
+                temp2.mant = 0x20000000;
+                fac   = av_div_sf(temp1, av_add_sf(FLOAT_1, temp2));
+                sbr->data[0].noise_facs[e][k] = fac;
+                sbr->data[1].noise_facs[e][k] = av_mul_sf(fac, temp2);
+            }
+        }
+    } else { // SCE or one non-coupled CPE
+        for (ch = 0; ch < (id_aac == TYPE_CPE) + 1; ch++) {
+            int alpha = sbr->data[ch].bs_amp_res ? 2 : 1;
+            for (e = 1; e <= sbr->data[ch].bs_num_env; e++)
+                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++){
+                    SoftFloat temp1;
+
+                    temp1.exp = alpha * sbr->data[ch].env_facs_q[e][k] + 12;
+                    if (temp1.exp & 1)
+                        temp1.mant = 759250125;
+                    else
+                        temp1.mant = 0x20000000;
+                    temp1.exp = (temp1.exp >> 1) + 1;
+                    if (temp1.exp > 66) { // temp1 > 1E20
+                        av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                        temp1 = FLOAT_1;
+                    }
+                    sbr->data[ch].env_facs[e][k] = temp1;
+                }
+            for (e = 1; e <= sbr->data[ch].bs_num_noise; e++)
+                for (k = 0; k < sbr->n_q; k++){
+                    sbr->data[ch].noise_facs[e][k].exp = NOISE_FLOOR_OFFSET - \
+                        sbr->data[ch].noise_facs_q[e][k] + 1;
+                    sbr->data[ch].noise_facs[e][k].mant = 0x20000000;
+                }
+        }
+    }
+}
+
+/** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
+ * (14496-3 sp04 p214)
+ * Warning: This routine does not seem numerically stable.
+ */
+static void sbr_hf_inverse_filter(SBRDSPContext *dsp,
+                                  int (*alpha0)[2], int (*alpha1)[2],
+                                  const int X_low[32][40][2], int k0)
+{
+    int k;
+    int shift, round;
+
+    for (k = 0; k < k0; k++) {
+        SoftFloat phi[3][2][2];
+        SoftFloat a00, a01, a10, a11;
+        SoftFloat dk;
+
+        dsp->autocorrelate(X_low[k], phi);
+
+        dk = av_sub_sf(av_mul_sf(phi[2][1][0], phi[1][0][0]),
+             av_mul_sf(av_add_sf(av_mul_sf(phi[1][1][0], phi[1][1][0]),
+             av_mul_sf(phi[1][1][1], phi[1][1][1])), FLOAT_0999999));
+
+        if (!dk.mant) {
+            a10 = FLOAT_0;
+            a11 = FLOAT_0;
+        } else {
+            SoftFloat temp_real, temp_im;
+            temp_real = av_sub_sf(av_sub_sf(av_mul_sf(phi[0][0][0], phi[1][1][0]),
+                                            av_mul_sf(phi[0][0][1], phi[1][1][1])),
+                                  av_mul_sf(phi[0][1][0], phi[1][0][0]));
+            temp_im   = av_sub_sf(av_add_sf(av_mul_sf(phi[0][0][0], phi[1][1][1]),
+                                            av_mul_sf(phi[0][0][1], phi[1][1][0])),
+                                  av_mul_sf(phi[0][1][1], phi[1][0][0]));
+
+            a10 = av_div_sf(temp_real, dk);
+            a11 = av_div_sf(temp_im,   dk);
+        }
+
+        if (!phi[1][0][0].mant) {
+            a00 = FLOAT_0;
+            a01 = FLOAT_0;
+        } else {
+            SoftFloat temp_real, temp_im;
+            temp_real = av_add_sf(phi[0][0][0],
+                                  av_add_sf(av_mul_sf(a10, phi[1][1][0]),
+                                            av_mul_sf(a11, phi[1][1][1])));
+            temp_im   = av_add_sf(phi[0][0][1],
+                                  av_sub_sf(av_mul_sf(a11, phi[1][1][0]),
+                                            av_mul_sf(a10, phi[1][1][1])));
+
+            temp_real.mant = -temp_real.mant;
+            temp_im.mant   = -temp_im.mant;
+            a00 = av_div_sf(temp_real, phi[1][0][0]);
+            a01 = av_div_sf(temp_im,   phi[1][0][0]);
+        }
+
+        shift = a00.exp;
+        if (shift >= 3)
+            alpha0[k][0] = 0x7fffffff;
+        else if (shift <= -30)
+            alpha0[k][0] = 0;
+        else {
+            shift = 1-shift;
+            if (shift <= 0)
+                alpha0[k][0] = a00.mant * (1<<-shift);
+            else {
+                round = 1 << (shift-1);
+                alpha0[k][0] = (a00.mant + round) >> shift;
+            }
+        }
+
+        shift = a01.exp;
+        if (shift >= 3)
+            alpha0[k][1] = 0x7fffffff;
+        else if (shift <= -30)
+            alpha0[k][1] = 0;
+        else {
+            shift = 1-shift;
+            if (shift <= 0)
+                alpha0[k][1] = a01.mant * (1<<-shift);
+            else {
+                round = 1 << (shift-1);
+                alpha0[k][1] = (a01.mant + round) >> shift;
+            }
+        }
+        shift = a10.exp;
+        if (shift >= 3)
+            alpha1[k][0] = 0x7fffffff;
+        else if (shift <= -30)
+            alpha1[k][0] = 0;
+        else {
+            shift = 1-shift;
+            if (shift <= 0)
+                alpha1[k][0] = a10.mant * (1<<-shift);
+            else {
+                round = 1 << (shift-1);
+                alpha1[k][0] = (a10.mant + round) >> shift;
+            }
+        }
+
+        shift = a11.exp;
+        if (shift >= 3)
+            alpha1[k][1] = 0x7fffffff;
+        else if (shift <= -30)
+            alpha1[k][1] = 0;
+        else {
+            shift = 1-shift;
+            if (shift <= 0)
+                alpha1[k][1] = a11.mant * (1<<-shift);
+            else {
+                round = 1 << (shift-1);
+                alpha1[k][1] = (a11.mant + round) >> shift;
+            }
+        }
+
+        shift = (int)(((int64_t)(alpha1[k][0]>>1) * (alpha1[k][0]>>1) + \
+                       (int64_t)(alpha1[k][1]>>1) * (alpha1[k][1]>>1) + \
+                       0x40000000) >> 31);
+        if (shift >= 0x20000000){
+            alpha1[k][0] = 0;
+            alpha1[k][1] = 0;
+            alpha0[k][0] = 0;
+            alpha0[k][1] = 0;
+        }
+
+        shift = (int)(((int64_t)(alpha0[k][0]>>1) * (alpha0[k][0]>>1) + \
+                       (int64_t)(alpha0[k][1]>>1) * (alpha0[k][1]>>1) + \
+                       0x40000000) >> 31);
+        if (shift >= 0x20000000){
+            alpha1[k][0] = 0;
+            alpha1[k][1] = 0;
+            alpha0[k][0] = 0;
+            alpha0[k][1] = 0;
+        }
+    }
+}
+
+/// Chirp Factors (14496-3 sp04 p214)
+static void sbr_chirp(SpectralBandReplication *sbr, SBRData *ch_data)
+{
+    int i;
+    int new_bw;
+    static const int bw_tab[] = { 0, 1610612736, 1932735283, 2104533975 };
+    int64_t accu;
+
+    for (i = 0; i < sbr->n_q; i++) {
+        if (ch_data->bs_invf_mode[0][i] + ch_data->bs_invf_mode[1][i] == 1)
+            new_bw = 1288490189;
+        else
+            new_bw = bw_tab[ch_data->bs_invf_mode[0][i]];
+
+        if (new_bw < ch_data->bw_array[i]){
+            accu  = (int64_t)new_bw * 1610612736;
+            accu += (int64_t)ch_data->bw_array[i] * 0x20000000;
+            new_bw = (int)((accu + 0x40000000) >> 31);
+        } else {
+            accu  = (int64_t)new_bw * 1946157056;
+            accu += (int64_t)ch_data->bw_array[i] * 201326592;
+            new_bw = (int)((accu + 0x40000000) >> 31);
+        }
+        ch_data->bw_array[i] = new_bw < 0x2000000 ? 0 : new_bw;
+    }
+}
+
+/**
+ * Calculation of levels of additional HF signal components (14496-3 sp04 p219)
+ * and Calculation of gain (14496-3 sp04 p219)
+ */
+static void sbr_gain_calc(AACContext *ac, SpectralBandReplication *sbr,
+                          SBRData *ch_data, const int e_a[2])
+{
+    int e, k, m;
+    // max gain limits : -3dB, 0dB, 3dB, inf dB (limiter off)
+    static const SoftFloat limgain[4] = { { 760155524,  0 }, { 0x20000000,  1 },
+                                            { 758351638,  1 }, { 625000000, 34 } };
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        int delta = !((e == e_a[1]) || (e == e_a[0]));
+        for (k = 0; k < sbr->n_lim; k++) {
+            SoftFloat gain_boost, gain_max;
+            SoftFloat sum[2];
+            sum[0] = sum[1] = FLOAT_0;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                const SoftFloat temp = av_div_sf(sbr->e_origmapped[e][m],
+                                            av_add_sf(FLOAT_1, sbr->q_mapped[e][m]));
+                sbr->q_m[e][m] = av_sqrt_sf(av_mul_sf(temp, sbr->q_mapped[e][m]));
+                sbr->s_m[e][m] = av_sqrt_sf(av_mul_sf(temp, av_int2sf(ch_data->s_indexmapped[e + 1][m], 0)));
+                if (!sbr->s_mapped[e][m]) {
+                    if (delta) {
+                      sbr->gain[e][m] = av_sqrt_sf(av_div_sf(sbr->e_origmapped[e][m],
+                                            av_mul_sf(av_add_sf(FLOAT_1, sbr->e_curr[e][m]),
+                                            av_add_sf(FLOAT_1, sbr->q_mapped[e][m]))));
+                    } else {
+                      sbr->gain[e][m] = av_sqrt_sf(av_div_sf(sbr->e_origmapped[e][m],
+                                            av_add_sf(FLOAT_1, sbr->e_curr[e][m])));
+                    }
+                } else {
+                    sbr->gain[e][m] = av_sqrt_sf(
+                                        av_div_sf(
+                                            av_mul_sf(sbr->e_origmapped[e][m], sbr->q_mapped[e][m]),
+                                            av_mul_sf(
+                                                av_add_sf(FLOAT_1, sbr->e_curr[e][m]),
+                                                av_add_sf(FLOAT_1, sbr->q_mapped[e][m]))));
+                }
+                sbr->gain[e][m] = av_add_sf(sbr->gain[e][m], FLOAT_MIN);
+            }
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                sum[0] = av_add_sf(sum[0], sbr->e_origmapped[e][m]);
+                sum[1] = av_add_sf(sum[1], sbr->e_curr[e][m]);
+            }
+            gain_max = av_mul_sf(limgain[sbr->bs_limiter_gains],
+                            av_sqrt_sf(
+                                av_div_sf(
+                                    av_add_sf(FLOAT_EPSILON, sum[0]),
+                                    av_add_sf(FLOAT_EPSILON, sum[1]))));
+            if (av_gt_sf(gain_max, FLOAT_100000))
+              gain_max = FLOAT_100000;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                SoftFloat q_m_max = av_div_sf(
+                                        av_mul_sf(sbr->q_m[e][m], gain_max),
+                                        sbr->gain[e][m]);
+                if (av_gt_sf(sbr->q_m[e][m], q_m_max))
+                  sbr->q_m[e][m] = q_m_max;
+                if (av_gt_sf(sbr->gain[e][m], gain_max))
+                  sbr->gain[e][m] = gain_max;
+            }
+            sum[0] = sum[1] = FLOAT_0;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                sum[0] = av_add_sf(sum[0], sbr->e_origmapped[e][m]);
+                sum[1] = av_add_sf(sum[1],
+                            av_mul_sf(
+                                av_mul_sf(sbr->e_curr[e][m],
+                                          sbr->gain[e][m]),
+                                sbr->gain[e][m]));
+                sum[1] = av_add_sf(sum[1],
+                            av_mul_sf(sbr->s_m[e][m], sbr->s_m[e][m]));
+                if (delta && !sbr->s_m[e][m].mant)
+                  sum[1] = av_add_sf(sum[1],
+                                av_mul_sf(sbr->q_m[e][m], sbr->q_m[e][m]));
+            }
+            gain_boost = av_sqrt_sf(
+                            av_div_sf(
+                                av_add_sf(FLOAT_EPSILON, sum[0]),
+                                av_add_sf(FLOAT_EPSILON, sum[1])));
+            if (av_gt_sf(gain_boost, FLOAT_1584893192))
+              gain_boost = FLOAT_1584893192;
+
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                sbr->gain[e][m] = av_mul_sf(sbr->gain[e][m], gain_boost);
+                sbr->q_m[e][m]  = av_mul_sf(sbr->q_m[e][m], gain_boost);
+                sbr->s_m[e][m]  = av_mul_sf(sbr->s_m[e][m], gain_boost);
+            }
+        }
+    }
+}
+
+/// Assembling HF Signals (14496-3 sp04 p220)
+static void sbr_hf_assemble(int Y1[38][64][2],
+                            const int X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2])
+{
+    int e, i, j, m;
+    const int h_SL = 4 * !sbr->bs_smoothing_mode;
+    const int kx = sbr->kx[1];
+    const int m_max = sbr->m[1];
+    static const SoftFloat h_smooth[5] = {
+      { 715827883, -1 },
+      { 647472402, -1 },
+      { 937030863, -2 },
+      { 989249804, -3 },
+      { 546843842, -4 },
+    };
+    SoftFloat (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
+    int indexnoise = ch_data->f_indexnoise;
+    int indexsine  = ch_data->f_indexsine;
+
+    if (sbr->reset) {
+        for (i = 0; i < h_SL; i++) {
+            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    } else if (h_SL) {
+        for (i = 0; i < 4; i++) {
+            memcpy(g_temp[i + 2 * ch_data->t_env[0]],
+                   g_temp[i + 2 * ch_data->t_env_num_env_old],
+                   sizeof(g_temp[0]));
+            memcpy(q_temp[i + 2 * ch_data->t_env[0]],
+                   q_temp[i + 2 * ch_data->t_env_num_env_old],
+                   sizeof(q_temp[0]));
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            memcpy(g_temp[h_SL + i], sbr->gain[e], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[h_SL + i], sbr->q_m[e],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            SoftFloat g_filt_tab[48];
+            SoftFloat q_filt_tab[48];
+            SoftFloat *g_filt, *q_filt;
+
+            if (h_SL && e != e_a[0] && e != e_a[1]) {
+                g_filt = g_filt_tab;
+                q_filt = q_filt_tab;
+                for (m = 0; m < m_max; m++) {
+                    const int idx1 = i + h_SL;
+                    g_filt[m].mant = g_filt[m].exp = 0;
+                    q_filt[m].mant = q_filt[m].exp = 0;
+                    for (j = 0; j <= h_SL; j++) {
+                        g_filt[m] = av_add_sf(g_filt[m],
+                                        av_mul_sf(g_temp[idx1 - j][m],
+                                            h_smooth[j]));
+                        q_filt[m] = av_add_sf(q_filt[m],
+                                        av_mul_sf(q_temp[idx1 - j][m],
+                                            h_smooth[j]));
+                    }
+                }
+            } else {
+                g_filt = g_temp[i + h_SL];
+                q_filt = q_temp[i];
+            }
+
+            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
+                               i + ENVELOPE_ADJUSTMENT_OFFSET);
+
+            if (e != e_a[0] && e != e_a[1]) {
+                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
+                                                   q_filt, indexnoise,
+                                                   kx, m_max);
+            } else {
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                unsigned *out = &Y1[i][kx][idx];
+                int shift;
+                unsigned round;
+
+                SoftFloat *in  = sbr->s_m[e];
+                for (m = 0; m+1 < m_max; m+=2) {
+                    int shift2;
+                    shift = 22 - in[m  ].exp;
+                    shift2= 22 - in[m+1].exp;
+                    if (shift < 1 || shift2 < 1) {
+                        av_log(NULL, AV_LOG_ERROR, "Overflow in sbr_hf_assemble, shift=%d,%d\n", shift, shift2);
+                        return;
+                    }
+                    if (shift < 32) {
+                        round = 1 << (shift-1);
+                        out[2*m  ] += (int)(in[m  ].mant * A + round) >> shift;
+                    }
+
+                    if (shift2 < 32) {
+                        round = 1 << (shift2-1);
+                        out[2*m+2] += (int)(in[m+1].mant * B + round) >> shift2;
+                    }
+                }
+                if(m_max&1)
+                {
+                    shift = 22 - in[m  ].exp;
+                    if (shift < 1) {
+                        av_log(NULL, AV_LOG_ERROR, "Overflow in sbr_hf_assemble, shift=%d\n", shift);
+                        return;
+                    } else if (shift < 32) {
+                        round = 1 << (shift-1);
+                        out[2*m  ] += (int)(in[m  ].mant * A + round) >> shift;
+                    }
+                }
+            }
+            indexnoise = (indexnoise + m_max) & 0x1ff;
+            indexsine = (indexsine + 1) & 3;
+        }
+    }
+    ch_data->f_indexnoise = indexnoise;
+    ch_data->f_indexsine  = indexsine;
+}
+
+#include "aacsbr_template.c"
diff --git a/libavcodec/aacsbr_fixed_tablegen.h b/libavcodec/aacsbr_fixed_tablegen.h
new file mode 100644
index 0000000..3fcf020
--- /dev/null
+++ b/libavcodec/aacsbr_fixed_tablegen.h
@@ -0,0 +1,28 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_FIXED_TABLEGEN_H
+#define AVCODEC_AACSBR_FIXED_TABLEGEN_H
+
+#include "aacsbr_tablegen_common.h"
+
+#endif /* AVCODEC_AACSBR_FIXED_TABLEGEN_H */
diff --git a/libavcodec/aacsbr_tablegen.h b/libavcodec/aacsbr_tablegen.h
new file mode 100644
index 0000000..242a963
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen.h
@@ -0,0 +1,28 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_TABLEGEN_H
+#define AVCODEC_AACSBR_TABLEGEN_H
+
+#include "aacsbr_tablegen_common.h"
+
+#endif /* AVCODEC_AACSBR_TABLEGEN_H */
diff --git a/libavcodec/aacsbr_tablegen_common.h b/libavcodec/aacsbr_tablegen_common.h
new file mode 100644
index 0000000..8e0dd9e
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen_common.h
@@ -0,0 +1,114 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_TABLEGEN_COMMON_H
+#define AVCODEC_AACSBR_TABLEGEN_COMMON_H
+#include "aac_defines.h"
+#include "libavutil/mem.h"
+
+///< window coefficients for analysis/synthesis QMF banks
+static DECLARE_ALIGNED(32, INTFLOAT, sbr_qmf_window_ds)[320];
+static DECLARE_ALIGNED(32, INTFLOAT, sbr_qmf_window_us)[640] = {
+    Q31( 0.0000000000f), Q31(-0.0005525286f), Q31(-0.0005617692f), Q31(-0.0004947518f),
+    Q31(-0.0004875227f), Q31(-0.0004893791f), Q31(-0.0005040714f), Q31(-0.0005226564f),
+    Q31(-0.0005466565f), Q31(-0.0005677802f), Q31(-0.0005870930f), Q31(-0.0006132747f),
+    Q31(-0.0006312493f), Q31(-0.0006540333f), Q31(-0.0006777690f), Q31(-0.0006941614f),
+    Q31(-0.0007157736f), Q31(-0.0007255043f), Q31(-0.0007440941f), Q31(-0.0007490598f),
+    Q31(-0.0007681371f), Q31(-0.0007724848f), Q31(-0.0007834332f), Q31(-0.0007779869f),
+    Q31(-0.0007803664f), Q31(-0.0007801449f), Q31(-0.0007757977f), Q31(-0.0007630793f),
+    Q31(-0.0007530001f), Q31(-0.0007319357f), Q31(-0.0007215391f), Q31(-0.0006917937f),
+    Q31(-0.0006650415f), Q31(-0.0006341594f), Q31(-0.0005946118f), Q31(-0.0005564576f),
+    Q31(-0.0005145572f), Q31(-0.0004606325f), Q31(-0.0004095121f), Q31(-0.0003501175f),
+    Q31(-0.0002896981f), Q31(-0.0002098337f), Q31(-0.0001446380f), Q31(-0.0000617334f),
+    Q31( 0.0000134949f), Q31( 0.0001094383f), Q31( 0.0002043017f), Q31( 0.0002949531f),
+    Q31( 0.0004026540f), Q31( 0.0005107388f), Q31( 0.0006239376f), Q31( 0.0007458025f),
+    Q31( 0.0008608443f), Q31( 0.0009885988f), Q31( 0.0011250155f), Q31( 0.0012577884f),
+    Q31( 0.0013902494f), Q31( 0.0015443219f), Q31( 0.0016868083f), Q31( 0.0018348265f),
+    Q31( 0.0019841140f), Q31( 0.0021461583f), Q31( 0.0023017254f), Q31( 0.0024625616f),
+    Q31( 0.0026201758f), Q31( 0.0027870464f), Q31( 0.0029469447f), Q31( 0.0031125420f),
+    Q31( 0.0032739613f), Q31( 0.0034418874f), Q31( 0.0036008268f), Q31( 0.0037603922f),
+    Q31( 0.0039207432f), Q31( 0.0040819753f), Q31( 0.0042264269f), Q31( 0.0043730719f),
+    Q31( 0.0045209852f), Q31( 0.0046606460f), Q31( 0.0047932560f), Q31( 0.0049137603f),
+    Q31( 0.0050393022f), Q31( 0.0051407353f), Q31( 0.0052461166f), Q31( 0.0053471681f),
+    Q31( 0.0054196775f), Q31( 0.0054876040f), Q31( 0.0055475714f), Q31( 0.0055938023f),
+    Q31( 0.0056220643f), Q31( 0.0056455196f), Q31( 0.0056389199f), Q31( 0.0056266114f),
+    Q31( 0.0055917128f), Q31( 0.0055404363f), Q31( 0.0054753783f), Q31( 0.0053838975f),
+    Q31( 0.0052715758f), Q31( 0.0051382275f), Q31( 0.0049839687f), Q31( 0.0048109469f),
+    Q31( 0.0046039530f), Q31( 0.0043801861f), Q31( 0.0041251642f), Q31( 0.0038456408f),
+    Q31( 0.0035401246f), Q31( 0.0032091885f), Q31( 0.0028446757f), Q31( 0.0024508540f),
+    Q31( 0.0020274176f), Q31( 0.0015784682f), Q31( 0.0010902329f), Q31( 0.0005832264f),
+    Q31( 0.0000276045f), Q31(-0.0005464280f), Q31(-0.0011568135f), Q31(-0.0018039472f),
+    Q31(-0.0024826723f), Q31(-0.0031933778f), Q31(-0.0039401124f), Q31(-0.0047222596f),
+    Q31(-0.0055337211f), Q31(-0.0063792293f), Q31(-0.0072615816f), Q31(-0.0081798233f),
+    Q31(-0.0091325329f), Q31(-0.0101150215f), Q31(-0.0111315548f), Q31(-0.0121849995f),
+    Q31( 0.0132718220f), Q31( 0.0143904666f), Q31( 0.0155405553f), Q31( 0.0167324712f),
+    Q31( 0.0179433381f), Q31( 0.0191872431f), Q31( 0.0204531793f), Q31( 0.0217467550f),
+    Q31( 0.0230680169f), Q31( 0.0244160992f), Q31( 0.0257875847f), Q31( 0.0271859429f),
+    Q31( 0.0286072173f), Q31( 0.0300502657f), Q31( 0.0315017608f), Q31( 0.0329754081f),
+    Q31( 0.0344620948f), Q31( 0.0359697560f), Q31( 0.0374812850f), Q31( 0.0390053679f),
+    Q31( 0.0405349170f), Q31( 0.0420649094f), Q31( 0.0436097542f), Q31( 0.0451488405f),
+    Q31( 0.0466843027f), Q31( 0.0482165720f), Q31( 0.0497385755f), Q31( 0.0512556155f),
+    Q31( 0.0527630746f), Q31( 0.0542452768f), Q31( 0.0557173648f), Q31( 0.0571616450f),
+    Q31( 0.0585915683f), Q31( 0.0599837480f), Q31( 0.0613455171f), Q31( 0.0626857808f),
+    Q31( 0.0639715898f), Q31( 0.0652247106f), Q31( 0.0664367512f), Q31( 0.0676075985f),
+    Q31( 0.0687043828f), Q31( 0.0697630244f), Q31( 0.0707628710f), Q31( 0.0717002673f),
+    Q31( 0.0725682583f), Q31( 0.0733620255f), Q31( 0.0741003642f), Q31( 0.0747452558f),
+    Q31( 0.0753137336f), Q31( 0.0758008358f), Q31( 0.0761992479f), Q31( 0.0764992170f),
+    Q31( 0.0767093490f), Q31( 0.0768173975f), Q31( 0.0768230011f), Q31( 0.0767204924f),
+    Q31( 0.0765050718f), Q31( 0.0761748321f), Q31( 0.0757305756f), Q31( 0.0751576255f),
+    Q31( 0.0744664394f), Q31( 0.0736406005f), Q31( 0.0726774642f), Q31( 0.0715826364f),
+    Q31( 0.0703533073f), Q31( 0.0689664013f), Q31( 0.0674525021f), Q31( 0.0657690668f),
+    Q31( 0.0639444805f), Q31( 0.0619602779f), Q31( 0.0598166570f), Q31( 0.0575152691f),
+    Q31( 0.0550460034f), Q31( 0.0524093821f), Q31( 0.0495978676f), Q31( 0.0466303305f),
+    Q31( 0.0434768782f), Q31( 0.0401458278f), Q31( 0.0366418116f), Q31( 0.0329583930f),
+    Q31( 0.0290824006f), Q31( 0.0250307561f), Q31( 0.0207997072f), Q31( 0.0163701258f),
+    Q31( 0.0117623832f), Q31( 0.0069636862f), Q31( 0.0019765601f), Q31(-0.0032086896f),
+    Q31(-0.0085711749f), Q31(-0.0141288827f), Q31(-0.0198834129f), Q31(-0.0258227288f),
+    Q31(-0.0319531274f), Q31(-0.0382776572f), Q31(-0.0447806821f), Q31(-0.0514804176f),
+    Q31(-0.0583705326f), Q31(-0.0654409853f), Q31(-0.0726943300f), Q31(-0.0801372934f),
+    Q31(-0.0877547536f), Q31(-0.0955533352f), Q31(-0.1035329531f), Q31(-0.1116826931f),
+    Q31(-0.1200077984f), Q31(-0.1285002850f), Q31(-0.1371551761f), Q31(-0.1459766491f),
+    Q31(-0.1549607071f), Q31(-0.1640958855f), Q31(-0.1733808172f), Q31(-0.1828172548f),
+    Q31(-0.1923966745f), Q31(-0.2021250176f), Q31(-0.2119735853f), Q31(-0.2219652696f),
+    Q31(-0.2320690870f), Q31(-0.2423016884f), Q31(-0.2526480309f), Q31(-0.2631053299f),
+    Q31(-0.2736634040f), Q31(-0.2843214189f), Q31(-0.2950716717f), Q31(-0.3059098575f),
+    Q31(-0.3168278913f), Q31(-0.3278113727f), Q31(-0.3388722693f), Q31(-0.3499914122f),
+    Q31( 0.3611589903f), Q31( 0.3723795546f), Q31( 0.3836350013f), Q31( 0.3949211761f),
+    Q31( 0.4062317676f), Q31( 0.4175696896f), Q31( 0.4289119920f), Q31( 0.4402553754f),
+    Q31( 0.4515996535f), Q31( 0.4629308085f), Q31( 0.4742453214f), Q31( 0.4855253091f),
+    Q31( 0.4967708254f), Q31( 0.5079817500f), Q31( 0.5191234970f), Q31( 0.5302240895f),
+    Q31( 0.5412553448f), Q31( 0.5522051258f), Q31( 0.5630789140f), Q31( 0.5738524131f),
+    Q31( 0.5845403235f), Q31( 0.5951123086f), Q31( 0.6055783538f), Q31( 0.6159109932f),
+    Q31( 0.6261242695f), Q31( 0.6361980107f), Q31( 0.6461269695f), Q31( 0.6559016302f),
+    Q31( 0.6655139880f), Q31( 0.6749663190f), Q31( 0.6842353293f), Q31( 0.6933282376f),
+    Q31( 0.7022388719f), Q31( 0.7109410426f), Q31( 0.7194462634f), Q31( 0.7277448900f),
+    Q31( 0.7358211758f), Q31( 0.7436827863f), Q31( 0.7513137456f), Q31( 0.7587080760f),
+    Q31( 0.7658674865f), Q31( 0.7727780881f), Q31( 0.7794287519f), Q31( 0.7858353120f),
+    Q31( 0.7919735841f), Q31( 0.7978466413f), Q31( 0.8034485751f), Q31( 0.8087695004f),
+    Q31( 0.8138191270f), Q31( 0.8185776004f), Q31( 0.8230419890f), Q31( 0.8272275347f),
+    Q31( 0.8311038457f), Q31( 0.8346937361f), Q31( 0.8379717337f), Q31( 0.8409541392f),
+    Q31( 0.8436238281f), Q31( 0.8459818469f), Q31( 0.8480315777f), Q31( 0.8497805198f),
+    Q31( 0.8511971524f), Q31( 0.8523047035f), Q31( 0.8531020949f), Q31( 0.8535720573f),
+    Q31( 0.8537385600f),
+};
+
+#endif /* AVCODEC_AACSBR_TABLEGEN_COMMON_H */
diff --git a/libavcodec/aacsbr_template.c b/libavcodec/aacsbr_template.c
new file mode 100644
index 0000000..821615f
--- /dev/null
+++ b/libavcodec/aacsbr_template.c
@@ -0,0 +1,1583 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * Fixed point code
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC Spectral Band Replication decoding functions
+ * @author Robert Swain ( rob opendot cl )
+ * @author Stanislav Ocovaj ( stanislav.ocovaj@imgtec.com )
+ * @author Zoran Basaric ( zoran.basaric@imgtec.com )
+ */
+
+#include "libavutil/qsort.h"
+
+static av_cold void aacsbr_tableinit(void)
+{
+    int n;
+    for (n = 1; n < 320; n++)
+        sbr_qmf_window_us[320 + n] = sbr_qmf_window_us[320 - n];
+    sbr_qmf_window_us[384] = -sbr_qmf_window_us[384];
+    sbr_qmf_window_us[512] = -sbr_qmf_window_us[512];
+
+    for (n = 0; n < 320; n++)
+        sbr_qmf_window_ds[n] = sbr_qmf_window_us[2*n];
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_init)(void)
+{
+    static const struct {
+        const void *sbr_codes, *sbr_bits;
+        const unsigned int table_size, elem_size;
+    } sbr_tmp[] = {
+        SBR_VLC_ROW(t_huffman_env_1_5dB),
+        SBR_VLC_ROW(f_huffman_env_1_5dB),
+        SBR_VLC_ROW(t_huffman_env_bal_1_5dB),
+        SBR_VLC_ROW(f_huffman_env_bal_1_5dB),
+        SBR_VLC_ROW(t_huffman_env_3_0dB),
+        SBR_VLC_ROW(f_huffman_env_3_0dB),
+        SBR_VLC_ROW(t_huffman_env_bal_3_0dB),
+        SBR_VLC_ROW(f_huffman_env_bal_3_0dB),
+        SBR_VLC_ROW(t_huffman_noise_3_0dB),
+        SBR_VLC_ROW(t_huffman_noise_bal_3_0dB),
+    };
+
+    // SBR VLC table initialization
+    SBR_INIT_VLC_STATIC(0, 1098);
+    SBR_INIT_VLC_STATIC(1, 1092);
+    SBR_INIT_VLC_STATIC(2, 768);
+    SBR_INIT_VLC_STATIC(3, 1026);
+    SBR_INIT_VLC_STATIC(4, 1058);
+    SBR_INIT_VLC_STATIC(5, 1052);
+    SBR_INIT_VLC_STATIC(6, 544);
+    SBR_INIT_VLC_STATIC(7, 544);
+    SBR_INIT_VLC_STATIC(8, 592);
+    SBR_INIT_VLC_STATIC(9, 512);
+
+    aacsbr_tableinit();
+
+    AAC_RENAME(ff_ps_init)();
+}
+
+/** Places SBR in pure upsampling mode. */
+static void sbr_turnoff(SpectralBandReplication *sbr) {
+    sbr->start = 0;
+    sbr->ready_for_dequant = 0;
+    // Init defults used in pure upsampling mode
+    sbr->kx[1] = 32; //Typo in spec, kx' inits to 32
+    sbr->m[1] = 0;
+    // Reset values for first SBR header
+    sbr->data[0].e_a[1] = sbr->data[1].e_a[1] = -1;
+    memset(&sbr->spectrum_params, -1, sizeof(SpectrumParameters));
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_ctx_init)(AACContext *ac, SpectralBandReplication *sbr, int id_aac)
+{
+    if(sbr->mdct.mdct_bits)
+        return;
+    sbr->kx[0] = sbr->kx[1];
+    sbr->id_aac = id_aac;
+    sbr_turnoff(sbr);
+    sbr->data[0].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
+    sbr->data[1].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
+    /* SBR requires samples to be scaled to +/-32768.0 to work correctly.
+     * mdct scale factors are adjusted to scale up from +/-1.0 at analysis
+     * and scale back down at synthesis. */
+    AAC_RENAME_32(ff_mdct_init)(&sbr->mdct,     7, 1, 1.0 / (64 * 32768.0));
+    AAC_RENAME_32(ff_mdct_init)(&sbr->mdct_ana, 7, 1, -2.0 * 32768.0);
+    AAC_RENAME(ff_ps_ctx_init)(&sbr->ps);
+    AAC_RENAME(ff_sbrdsp_init)(&sbr->dsp);
+    aacsbr_func_ptr_init(&sbr->c);
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_ctx_close)(SpectralBandReplication *sbr)
+{
+    AAC_RENAME_32(ff_mdct_end)(&sbr->mdct);
+    AAC_RENAME_32(ff_mdct_end)(&sbr->mdct_ana);
+}
+
+static int qsort_comparison_function_int16(const void *a, const void *b)
+{
+    return *(const int16_t *)a - *(const int16_t *)b;
+}
+
+static inline int in_table_int16(const int16_t *table, int last_el, int16_t needle)
+{
+    int i;
+    for (i = 0; i <= last_el; i++)
+        if (table[i] == needle)
+            return 1;
+    return 0;
+}
+
+/// Limiter Frequency Band Table (14496-3 sp04 p198)
+static void sbr_make_f_tablelim(SpectralBandReplication *sbr)
+{
+    int k;
+    if (sbr->bs_limiter_bands > 0) {
+        static const INTFLOAT bands_warped[3] = { Q23(1.32715174233856803909f),   //2^(0.49/1.2)
+                                               Q23(1.18509277094158210129f),   //2^(0.49/2)
+                                               Q23(1.11987160404675912501f) }; //2^(0.49/3)
+        const INTFLOAT lim_bands_per_octave_warped = bands_warped[sbr->bs_limiter_bands - 1];
+        int16_t patch_borders[7];
+        uint16_t *in = sbr->f_tablelim + 1, *out = sbr->f_tablelim;
+
+        patch_borders[0] = sbr->kx[1];
+        for (k = 1; k <= sbr->num_patches; k++)
+            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
+
+        memcpy(sbr->f_tablelim, sbr->f_tablelow,
+               (sbr->n[0] + 1) * sizeof(sbr->f_tablelow[0]));
+        if (sbr->num_patches > 1)
+            memcpy(sbr->f_tablelim + sbr->n[0] + 1, patch_borders + 1,
+                   (sbr->num_patches - 1) * sizeof(patch_borders[0]));
+
+        AV_QSORT(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
+              uint16_t,
+              qsort_comparison_function_int16);
+
+        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1;
+        while (out < sbr->f_tablelim + sbr->n_lim) {
+#if USE_FIXED
+            if ((*in << 23) >= *out * lim_bands_per_octave_warped) {
+#else
+            if (*in >= *out * lim_bands_per_octave_warped) {
+#endif /* USE_FIXED */
+                *++out = *in++;
+            } else if (*in == *out ||
+                !in_table_int16(patch_borders, sbr->num_patches, *in)) {
+                in++;
+                sbr->n_lim--;
+            } else if (!in_table_int16(patch_borders, sbr->num_patches, *out)) {
+                *out = *in++;
+                sbr->n_lim--;
+            } else {
+                *++out = *in++;
+            }
+        }
+    } else {
+        sbr->f_tablelim[0] = sbr->f_tablelow[0];
+        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
+        sbr->n_lim = 1;
+    }
+}
+
+static unsigned int read_sbr_header(SpectralBandReplication *sbr, GetBitContext *gb)
+{
+    unsigned int cnt = get_bits_count(gb);
+    uint8_t bs_header_extra_1;
+    uint8_t bs_header_extra_2;
+    int old_bs_limiter_bands = sbr->bs_limiter_bands;
+    SpectrumParameters old_spectrum_params;
+
+    sbr->start = 1;
+    sbr->ready_for_dequant = 0;
+
+    // Save last spectrum parameters variables to compare to new ones
+    memcpy(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters));
+
+    sbr->bs_amp_res_header              = get_bits1(gb);
+    sbr->spectrum_params.bs_start_freq  = get_bits(gb, 4);
+    sbr->spectrum_params.bs_stop_freq   = get_bits(gb, 4);
+    sbr->spectrum_params.bs_xover_band  = get_bits(gb, 3);
+                                          skip_bits(gb, 2); // bs_reserved
+
+    bs_header_extra_1 = get_bits1(gb);
+    bs_header_extra_2 = get_bits1(gb);
+
+    if (bs_header_extra_1) {
+        sbr->spectrum_params.bs_freq_scale  = get_bits(gb, 2);
+        sbr->spectrum_params.bs_alter_scale = get_bits1(gb);
+        sbr->spectrum_params.bs_noise_bands = get_bits(gb, 2);
+    } else {
+        sbr->spectrum_params.bs_freq_scale  = 2;
+        sbr->spectrum_params.bs_alter_scale = 1;
+        sbr->spectrum_params.bs_noise_bands = 2;
+    }
+
+    // Check if spectrum parameters changed
+    if (memcmp(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters)))
+        sbr->reset = 1;
+
+    if (bs_header_extra_2) {
+        sbr->bs_limiter_bands  = get_bits(gb, 2);
+        sbr->bs_limiter_gains  = get_bits(gb, 2);
+        sbr->bs_interpol_freq  = get_bits1(gb);
+        sbr->bs_smoothing_mode = get_bits1(gb);
+    } else {
+        sbr->bs_limiter_bands  = 2;
+        sbr->bs_limiter_gains  = 2;
+        sbr->bs_interpol_freq  = 1;
+        sbr->bs_smoothing_mode = 1;
+    }
+
+    if (sbr->bs_limiter_bands != old_bs_limiter_bands && !sbr->reset)
+        sbr_make_f_tablelim(sbr);
+
+    return get_bits_count(gb) - cnt;
+}
+
+static int array_min_int16(const int16_t *array, int nel)
+{
+    int i, min = array[0];
+    for (i = 1; i < nel; i++)
+        min = FFMIN(array[i], min);
+    return min;
+}
+
+static int check_n_master(AVCodecContext *avctx, int n_master, int bs_xover_band)
+{
+    // Requirements (14496-3 sp04 p205)
+    if (n_master <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid n_master: %d\n", n_master);
+        return -1;
+    }
+    if (bs_xover_band >= n_master) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid bitstream, crossover band index beyond array bounds: %d\n",
+               bs_xover_band);
+        return -1;
+    }
+    return 0;
+}
+
+/// Master Frequency Band Table (14496-3 sp04 p194)
+static int sbr_make_f_master(AACContext *ac, SpectralBandReplication *sbr,
+                             SpectrumParameters *spectrum)
+{
+    unsigned int temp, max_qmf_subbands = 0;
+    unsigned int start_min, stop_min;
+    int k;
+    const int8_t *sbr_offset_ptr;
+    int16_t stop_dk[13];
+
+    switch (sbr->sample_rate) {
+    case 16000:
+        sbr_offset_ptr = sbr_offset[0];
+        break;
+    case 22050:
+        sbr_offset_ptr = sbr_offset[1];
+        break;
+    case 24000:
+        sbr_offset_ptr = sbr_offset[2];
+        break;
+    case 32000:
+        sbr_offset_ptr = sbr_offset[3];
+        break;
+    case 44100: case 48000: case 64000:
+        sbr_offset_ptr = sbr_offset[4];
+        break;
+    case 88200: case 96000: case 128000: case 176400: case 192000:
+        sbr_offset_ptr = sbr_offset[5];
+        break;
+    default:
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Unsupported sample rate for SBR: %d\n", sbr->sample_rate);
+        return -1;
+    }
+
+    if (sbr->sample_rate < 32000) {
+        temp = 3000;
+    } else if (sbr->sample_rate < 64000) {
+        temp = 4000;
+    } else
+        temp = 5000;
+
+    start_min = ((temp << 7) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
+    stop_min  = ((temp << 8) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
+
+    sbr->k[0] = start_min + sbr_offset_ptr[spectrum->bs_start_freq];
+
+    if (spectrum->bs_stop_freq < 14) {
+        sbr->k[2] = stop_min;
+        make_bands(stop_dk, stop_min, 64, 13);
+        AV_QSORT(stop_dk, 13, int16_t, qsort_comparison_function_int16);
+        for (k = 0; k < spectrum->bs_stop_freq; k++)
+            sbr->k[2] += stop_dk[k];
+    } else if (spectrum->bs_stop_freq == 14) {
+        sbr->k[2] = 2*sbr->k[0];
+    } else if (spectrum->bs_stop_freq == 15) {
+        sbr->k[2] = 3*sbr->k[0];
+    } else {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bs_stop_freq: %d\n", spectrum->bs_stop_freq);
+        return -1;
+    }
+    sbr->k[2] = FFMIN(64, sbr->k[2]);
+
+    // Requirements (14496-3 sp04 p205)
+    if (sbr->sample_rate <= 32000) {
+        max_qmf_subbands = 48;
+    } else if (sbr->sample_rate == 44100) {
+        max_qmf_subbands = 35;
+    } else if (sbr->sample_rate >= 48000)
+        max_qmf_subbands = 32;
+    else
+        av_assert0(0);
+
+    if (sbr->k[2] - sbr->k[0] > max_qmf_subbands) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bitstream, too many QMF subbands: %d\n", sbr->k[2] - sbr->k[0]);
+        return -1;
+    }
+
+    if (!spectrum->bs_freq_scale) {
+        int dk, k2diff;
+
+        dk = spectrum->bs_alter_scale + 1;
+        sbr->n_master = ((sbr->k[2] - sbr->k[0] + (dk&2)) >> dk) << 1;
+        if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+            return -1;
+
+        for (k = 1; k <= sbr->n_master; k++)
+            sbr->f_master[k] = dk;
+
+        k2diff = sbr->k[2] - sbr->k[0] - sbr->n_master * dk;
+        if (k2diff < 0) {
+            sbr->f_master[1]--;
+            sbr->f_master[2]-= (k2diff < -1);
+        } else if (k2diff) {
+            sbr->f_master[sbr->n_master]++;
+        }
+
+        sbr->f_master[0] = sbr->k[0];
+        for (k = 1; k <= sbr->n_master; k++)
+            sbr->f_master[k] += sbr->f_master[k - 1];
+
+    } else {
+        int half_bands = 7 - spectrum->bs_freq_scale;      // bs_freq_scale  = {1,2,3}
+        int two_regions, num_bands_0;
+        int vdk0_max, vdk1_min;
+        int16_t vk0[49];
+#if USE_FIXED
+        int tmp, nz = 0;
+#endif /* USE_FIXED */
+
+        if (49 * sbr->k[2] > 110 * sbr->k[0]) {
+            two_regions = 1;
+            sbr->k[1] = 2 * sbr->k[0];
+        } else {
+            two_regions = 0;
+            sbr->k[1] = sbr->k[2];
+        }
+
+#if USE_FIXED
+        tmp = (sbr->k[1] << 23) / sbr->k[0];
+        while (tmp < 0x40000000) {
+          tmp <<= 1;
+          nz++;
+        }
+        tmp = fixed_log(tmp - 0x80000000);
+        tmp = (int)(((int64_t)tmp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+        tmp = (((tmp + 0x80) >> 8) + ((8 - nz) << 23)) * half_bands;
+        num_bands_0 = ((tmp + 0x400000) >> 23) * 2;
+#else
+        num_bands_0 = lrintf(half_bands * log2f(sbr->k[1] / (float)sbr->k[0])) * 2;
+#endif /* USE_FIXED */
+
+        if (num_bands_0 <= 0) { // Requirements (14496-3 sp04 p205)
+            av_log(ac->avctx, AV_LOG_ERROR, "Invalid num_bands_0: %d\n", num_bands_0);
+            return -1;
+        }
+
+        vk0[0] = 0;
+
+        make_bands(vk0+1, sbr->k[0], sbr->k[1], num_bands_0);
+
+        AV_QSORT(vk0 + 1, num_bands_0, int16_t, qsort_comparison_function_int16);
+        vdk0_max = vk0[num_bands_0];
+
+        vk0[0] = sbr->k[0];
+        for (k = 1; k <= num_bands_0; k++) {
+            if (vk0[k] <= 0) { // Requirements (14496-3 sp04 p205)
+                av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk0[%d]: %d\n", k, vk0[k]);
+                return -1;
+            }
+            vk0[k] += vk0[k-1];
+        }
+
+        if (two_regions) {
+            int16_t vk1[49];
+#if USE_FIXED
+            int num_bands_1;
+
+            tmp = (sbr->k[2] << 23) / sbr->k[1];
+            nz = 0;
+            while (tmp < 0x40000000) {
+              tmp <<= 1;
+              nz++;
+            }
+            tmp = fixed_log(tmp - 0x80000000);
+            tmp = (int)(((int64_t)tmp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+            tmp = (((tmp + 0x80) >> 8) + ((8 - nz) << 23)) * half_bands;
+            if (spectrum->bs_alter_scale)
+                tmp = (int)(((int64_t)tmp * CONST_076923 + 0x40000000) >> 31);
+            num_bands_1 = ((tmp + 0x400000) >> 23) * 2;
+#else
+            float invwarp = spectrum->bs_alter_scale ? 0.76923076923076923077f
+                                                     : 1.0f; // bs_alter_scale = {0,1}
+            int num_bands_1 = lrintf(half_bands * invwarp *
+                                     log2f(sbr->k[2] / (float)sbr->k[1])) * 2;
+#endif /* USE_FIXED */
+            make_bands(vk1+1, sbr->k[1], sbr->k[2], num_bands_1);
+
+            vdk1_min = array_min_int16(vk1 + 1, num_bands_1);
+
+            if (vdk1_min < vdk0_max) {
+                int change;
+                AV_QSORT(vk1 + 1, num_bands_1, int16_t, qsort_comparison_function_int16);
+                change = FFMIN(vdk0_max - vk1[1], (vk1[num_bands_1] - vk1[1]) >> 1);
+                vk1[1]           += change;
+                vk1[num_bands_1] -= change;
+            }
+
+            AV_QSORT(vk1 + 1, num_bands_1, int16_t, qsort_comparison_function_int16);
+
+            vk1[0] = sbr->k[1];
+            for (k = 1; k <= num_bands_1; k++) {
+                if (vk1[k] <= 0) { // Requirements (14496-3 sp04 p205)
+                    av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk1[%d]: %d\n", k, vk1[k]);
+                    return -1;
+                }
+                vk1[k] += vk1[k-1];
+            }
+
+            sbr->n_master = num_bands_0 + num_bands_1;
+            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+                return -1;
+            memcpy(&sbr->f_master[0],               vk0,
+                   (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
+            memcpy(&sbr->f_master[num_bands_0 + 1], vk1 + 1,
+                    num_bands_1      * sizeof(sbr->f_master[0]));
+
+        } else {
+            sbr->n_master = num_bands_0;
+            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+                return -1;
+            memcpy(sbr->f_master, vk0, (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
+        }
+    }
+
+    return 0;
+}
+
+/// High Frequency Generation - Patch Construction (14496-3 sp04 p216 fig. 4.46)
+static int sbr_hf_calc_npatches(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int i, k, last_k = -1, last_msb = -1, sb = 0;
+    int msb = sbr->k[0];
+    int usb = sbr->kx[1];
+    int goal_sb = ((1000 << 11) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
+
+    sbr->num_patches = 0;
+
+    if (goal_sb < sbr->kx[1] + sbr->m[1]) {
+        for (k = 0; sbr->f_master[k] < goal_sb; k++) ;
+    } else
+        k = sbr->n_master;
+
+    do {
+        int odd = 0;
+        if (k == last_k && msb == last_msb) {
+            av_log(ac->avctx, AV_LOG_ERROR, "patch construction failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+        last_k = k;
+        last_msb = msb;
+        for (i = k; i == k || sb > (sbr->k[0] - 1 + msb - odd); i--) {
+            sb = sbr->f_master[i];
+            odd = (sb + sbr->k[0]) & 1;
+        }
+
+        // Requirements (14496-3 sp04 p205) sets the maximum number of patches to 5.
+        // After this check the final number of patches can still be six which is
+        // illegal however the Coding Technologies decoder check stream has a final
+        // count of 6 patches
+        if (sbr->num_patches > 5) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Too many patches: %d\n", sbr->num_patches);
+            return -1;
+        }
+
+        sbr->patch_num_subbands[sbr->num_patches]  = FFMAX(sb - usb, 0);
+        sbr->patch_start_subband[sbr->num_patches] = sbr->k[0] - odd - sbr->patch_num_subbands[sbr->num_patches];
+
+        if (sbr->patch_num_subbands[sbr->num_patches] > 0) {
+            usb = sb;
+            msb = sb;
+            sbr->num_patches++;
+        } else
+            msb = sbr->kx[1];
+
+        if (sbr->f_master[k] - sb < 3)
+            k = sbr->n_master;
+    } while (sb != sbr->kx[1] + sbr->m[1]);
+
+    if (sbr->num_patches > 1 &&
+        sbr->patch_num_subbands[sbr->num_patches - 1] < 3)
+        sbr->num_patches--;
+
+    return 0;
+}
+
+/// Derived Frequency Band Tables (14496-3 sp04 p197)
+static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int k, temp;
+#if USE_FIXED
+    int nz = 0;
+#endif /* USE_FIXED */
+
+    sbr->n[1] = sbr->n_master - sbr->spectrum_params.bs_xover_band;
+    sbr->n[0] = (sbr->n[1] + 1) >> 1;
+
+    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params.bs_xover_band],
+           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
+    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0];
+    sbr->kx[1] = sbr->f_tablehigh[0];
+
+    // Requirements (14496-3 sp04 p205)
+    if (sbr->kx[1] + sbr->m[1] > 64) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Stop frequency border too high: %d\n", sbr->kx[1] + sbr->m[1]);
+        return -1;
+    }
+    if (sbr->kx[1] > 32) {
+        av_log(ac->avctx, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->kx[1]);
+        return -1;
+    }
+
+    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
+    temp = sbr->n[1] & 1;
+    for (k = 1; k <= sbr->n[0]; k++)
+        sbr->f_tablelow[k] = sbr->f_tablehigh[2 * k - temp];
+#if USE_FIXED
+    temp = (sbr->k[2] << 23) / sbr->kx[1];
+    while (temp < 0x40000000) {
+        temp <<= 1;
+        nz++;
+    }
+    temp = fixed_log(temp - 0x80000000);
+    temp = (int)(((int64_t)temp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+    temp = (((temp + 0x80) >> 8) + ((8 - nz) << 23)) * sbr->spectrum_params.bs_noise_bands;
+
+    sbr->n_q = (temp + 0x400000) >> 23;
+    if (sbr->n_q < 1)
+        sbr->n_q = 1;
+#else
+    sbr->n_q = FFMAX(1, lrintf(sbr->spectrum_params.bs_noise_bands *
+                               log2f(sbr->k[2] / (float)sbr->kx[1]))); // 0 <= bs_noise_bands <= 3
+#endif /* USE_FIXED */
+
+    if (sbr->n_q > 5) {
+        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
+        return -1;
+    }
+
+    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
+    temp = 0;
+    for (k = 1; k <= sbr->n_q; k++) {
+        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
+        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
+    }
+
+    if (sbr_hf_calc_npatches(ac, sbr) < 0)
+        return -1;
+
+    sbr_make_f_tablelim(sbr);
+
+    sbr->data[0].f_indexnoise = 0;
+    sbr->data[1].f_indexnoise = 0;
+
+    return 0;
+}
+
+static av_always_inline void get_bits1_vector(GetBitContext *gb, uint8_t *vec,
+                                              int elements)
+{
+    int i;
+    for (i = 0; i < elements; i++) {
+        vec[i] = get_bits1(gb);
+    }
+}
+
+/** ceil(log2(index+1)) */
+static const int8_t ceil_log2[] = {
+    0, 1, 2, 2, 3, 3,
+};
+
+static int read_sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
+                         GetBitContext *gb, SBRData *ch_data)
+{
+    int i;
+    int bs_pointer = 0;
+    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
+    int abs_bord_trail = 16;
+    int num_rel_lead, num_rel_trail;
+    unsigned bs_num_env_old = ch_data->bs_num_env;
+    int bs_frame_class, bs_num_env;
+
+    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env];
+    ch_data->bs_amp_res = sbr->bs_amp_res_header;
+    ch_data->t_env_num_env_old = ch_data->t_env[bs_num_env_old];
+
+    switch (bs_frame_class = get_bits(gb, 2)) {
+    case FIXFIX:
+        bs_num_env = 1 << get_bits(gb, 2);
+        if (bs_num_env > 4) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n",
+                   bs_num_env);
+            return -1;
+        }
+        ch_data->bs_num_env = bs_num_env;
+        num_rel_lead                        = ch_data->bs_num_env - 1;
+        if (ch_data->bs_num_env == 1)
+            ch_data->bs_amp_res = 0;
+
+
+        ch_data->t_env[0]                   = 0;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        abs_bord_trail = (abs_bord_trail + (ch_data->bs_num_env >> 1)) /
+                   ch_data->bs_num_env;
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + abs_bord_trail;
+
+        ch_data->bs_freq_res[1] = get_bits1(gb);
+        for (i = 1; i < ch_data->bs_num_env; i++)
+            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
+        break;
+    case FIXVAR:
+        abs_bord_trail                     += get_bits(gb, 2);
+        num_rel_trail                       = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_trail + 1;
+        ch_data->t_env[0]                   = 0;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_trail; i++)
+            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
+                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        for (i = 0; i < ch_data->bs_num_env; i++)
+            ch_data->bs_freq_res[ch_data->bs_num_env - i] = get_bits1(gb);
+        break;
+    case VARFIX:
+        ch_data->t_env[0]                   = get_bits(gb, 2);
+        num_rel_lead                        = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_lead + 1;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
+        break;
+    case VARVAR:
+        ch_data->t_env[0]                   = get_bits(gb, 2);
+        abs_bord_trail                     += get_bits(gb, 2);
+        num_rel_lead                        = get_bits(gb, 2);
+        num_rel_trail                       = get_bits(gb, 2);
+        bs_num_env                          = num_rel_lead + num_rel_trail + 1;
+
+        if (bs_num_env > 5) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n",
+                   bs_num_env);
+            return -1;
+        }
+        ch_data->bs_num_env = bs_num_env;
+
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
+        for (i = 0; i < num_rel_trail; i++)
+            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
+                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
+        break;
+    }
+    ch_data->bs_frame_class = bs_frame_class;
+
+    av_assert0(bs_pointer >= 0);
+    if (bs_pointer > ch_data->bs_num_env + 1) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bitstream, bs_pointer points to a middle noise border outside the time borders table: %d\n",
+               bs_pointer);
+        return -1;
+    }
+
+    for (i = 1; i <= ch_data->bs_num_env; i++) {
+        if (ch_data->t_env[i-1] >= ch_data->t_env[i]) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Not strictly monotone time borders\n");
+            return -1;
+        }
+    }
+
+    ch_data->bs_num_noise = (ch_data->bs_num_env > 1) + 1;
+
+    ch_data->t_q[0]                     = ch_data->t_env[0];
+    ch_data->t_q[ch_data->bs_num_noise] = ch_data->t_env[ch_data->bs_num_env];
+    if (ch_data->bs_num_noise > 1) {
+        int idx;
+        if (ch_data->bs_frame_class == FIXFIX) {
+            idx = ch_data->bs_num_env >> 1;
+        } else if (ch_data->bs_frame_class & 1) { // FIXVAR or VARVAR
+            idx = ch_data->bs_num_env - FFMAX(bs_pointer - 1, 1);
+        } else { // VARFIX
+            if (!bs_pointer)
+                idx = 1;
+            else if (bs_pointer == 1)
+                idx = ch_data->bs_num_env - 1;
+            else // bs_pointer > 1
+                idx = bs_pointer - 1;
+        }
+        ch_data->t_q[1] = ch_data->t_env[idx];
+    }
+
+    ch_data->e_a[0] = -(ch_data->e_a[1] != bs_num_env_old); // l_APrev
+    ch_data->e_a[1] = -1;
+    if ((ch_data->bs_frame_class & 1) && bs_pointer) { // FIXVAR or VARVAR and bs_pointer != 0
+        ch_data->e_a[1] = ch_data->bs_num_env + 1 - bs_pointer;
+    } else if ((ch_data->bs_frame_class == 2) && (bs_pointer > 1)) // VARFIX and bs_pointer > 1
+        ch_data->e_a[1] = bs_pointer - 1;
+
+    return 0;
+}
+
+static void copy_sbr_grid(SBRData *dst, const SBRData *src) {
+    //These variables are saved from the previous frame rather than copied
+    dst->bs_freq_res[0]    = dst->bs_freq_res[dst->bs_num_env];
+    dst->t_env_num_env_old = dst->t_env[dst->bs_num_env];
+    dst->e_a[0]            = -(dst->e_a[1] != dst->bs_num_env);
+
+    //These variables are read from the bitstream and therefore copied
+    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-sizeof(*dst->bs_freq_res));
+    memcpy(dst->t_env,         src->t_env,         sizeof(dst->t_env));
+    memcpy(dst->t_q,           src->t_q,           sizeof(dst->t_q));
+    dst->bs_num_env        = src->bs_num_env;
+    dst->bs_amp_res        = src->bs_amp_res;
+    dst->bs_num_noise      = src->bs_num_noise;
+    dst->bs_frame_class    = src->bs_frame_class;
+    dst->e_a[1]            = src->e_a[1];
+}
+
+/// Read how the envelope and noise floor data is delta coded
+static void read_sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
+                          SBRData *ch_data)
+{
+    get_bits1_vector(gb, ch_data->bs_df_env,   ch_data->bs_num_env);
+    get_bits1_vector(gb, ch_data->bs_df_noise, ch_data->bs_num_noise);
+}
+
+/// Read inverse filtering data
+static void read_sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
+                          SBRData *ch_data)
+{
+    int i;
+
+    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t));
+    for (i = 0; i < sbr->n_q; i++)
+        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
+}
+
+static int read_sbr_envelope(AACContext *ac, SpectralBandReplication *sbr, GetBitContext *gb,
+                              SBRData *ch_data, int ch)
+{
+    int bits;
+    int i, j, k;
+    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
+    int t_lav, f_lav;
+    const int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
+    const int odd = sbr->n[1] & 1;
+
+    if (sbr->bs_coupling && ch) {
+        if (ch_data->bs_amp_res) {
+            bits   = 5;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
+        } else {
+            bits   = 6;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
+        }
+    } else {
+        if (ch_data->bs_amp_res) {
+            bits   = 6;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
+        } else {
+            bits   = 7;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
+        }
+    }
+
+    for (i = 0; i < ch_data->bs_num_env; i++) {
+        if (ch_data->bs_df_env[i]) {
+            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
+            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i][j] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                    if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                        av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            } else if (ch_data->bs_freq_res[i + 1]) {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
+                    ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                    if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                        av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            } else {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
+                    ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                    if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                        av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            }
+        } else {
+            ch_data->env_facs_q[i + 1][0] = delta * get_bits(gb, bits); // bs_env_start_value_balance
+            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+                if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        }
+    }
+
+    //assign 0th elements of env_facs_q from last elements
+    memcpy(ch_data->env_facs_q[0], ch_data->env_facs_q[ch_data->bs_num_env],
+           sizeof(ch_data->env_facs_q[0]));
+
+    return 0;
+}
+
+static int read_sbr_noise(AACContext *ac, SpectralBandReplication *sbr, GetBitContext *gb,
+                           SBRData *ch_data, int ch)
+{
+    int i, j;
+    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
+    int t_lav, f_lav;
+    int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
+
+    if (sbr->bs_coupling && ch) {
+        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
+        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
+        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
+        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
+    } else {
+        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
+        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
+        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
+        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
+    }
+
+    for (i = 0; i < ch_data->bs_num_noise; i++) {
+        if (ch_data->bs_df_noise[i]) {
+            for (j = 0; j < sbr->n_q; j++) {
+                ch_data->noise_facs_q[i + 1][j] = ch_data->noise_facs_q[i][j] + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
+                if (ch_data->noise_facs_q[i + 1][j] > 30U) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "noise_facs_q %d is invalid\n", ch_data->noise_facs_q[i + 1][j]);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        } else {
+            ch_data->noise_facs_q[i + 1][0] = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
+            for (j = 1; j < sbr->n_q; j++) {
+                ch_data->noise_facs_q[i + 1][j] = ch_data->noise_facs_q[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+                if (ch_data->noise_facs_q[i + 1][j] > 30U) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "noise_facs_q %d is invalid\n", ch_data->noise_facs_q[i + 1][j]);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        }
+    }
+
+    //assign 0th elements of noise_facs_q from last elements
+    memcpy(ch_data->noise_facs_q[0], ch_data->noise_facs_q[ch_data->bs_num_noise],
+           sizeof(ch_data->noise_facs_q[0]));
+    return 0;
+}
+
+static void read_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
+                               GetBitContext *gb,
+                               int bs_extension_id, int *num_bits_left)
+{
+    switch (bs_extension_id) {
+    case EXTENSION_ID_PS:
+        if (!ac->oc[1].m4ac.ps) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Parametric Stereo signaled to be not-present but was found in the bitstream.\n");
+            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+            *num_bits_left = 0;
+        } else {
+            *num_bits_left -= AAC_RENAME(ff_ps_read_data)(ac->avctx, gb, &sbr->ps, *num_bits_left);
+            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
+        }
+        break;
+    default:
+        // some files contain 0-padding
+        if (bs_extension_id || *num_bits_left > 16 || show_bits(gb, *num_bits_left))
+            avpriv_request_sample(ac->avctx, "Reserved SBR extensions");
+        skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+        *num_bits_left = 0;
+        break;
+    }
+}
+
+static int read_sbr_single_channel_element(AACContext *ac,
+                                            SpectralBandReplication *sbr,
+                                            GetBitContext *gb)
+{
+    int ret;
+
+    if (get_bits1(gb)) // bs_data_extra
+        skip_bits(gb, 4); // bs_reserved
+
+    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+        return -1;
+    read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+    read_sbr_invf(sbr, gb, &sbr->data[0]);
+    if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+        return ret;
+    if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+        return ret;
+
+    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
+
+    return 0;
+}
+
+static int read_sbr_channel_pair_element(AACContext *ac,
+                                          SpectralBandReplication *sbr,
+                                          GetBitContext *gb)
+{
+    int ret;
+
+    if (get_bits1(gb))    // bs_data_extra
+        skip_bits(gb, 8); // bs_reserved
+
+    if ((sbr->bs_coupling = get_bits1(gb))) {
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+            return -1;
+        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
+        read_sbr_invf(sbr, gb, &sbr->data[0]);
+        memcpy(sbr->data[1].bs_invf_mode[1], sbr->data[1].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
+        memcpy(sbr->data[1].bs_invf_mode[0], sbr->data[0].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+            return ret;
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+    } else {
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
+            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
+            return -1;
+        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
+        read_sbr_invf(sbr, gb, &sbr->data[0]);
+        read_sbr_invf(sbr, gb, &sbr->data[1]);
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+            return ret;
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+    }
+
+    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
+    if ((sbr->data[1].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[1].bs_add_harmonic, sbr->n[1]);
+
+    return 0;
+}
+
+static unsigned int read_sbr_data(AACContext *ac, SpectralBandReplication *sbr,
+                                  GetBitContext *gb, int id_aac)
+{
+    unsigned int cnt = get_bits_count(gb);
+
+    sbr->id_aac = id_aac;
+    sbr->ready_for_dequant = 1;
+
+    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
+        if (read_sbr_single_channel_element(ac, sbr, gb)) {
+            sbr_turnoff(sbr);
+            return get_bits_count(gb) - cnt;
+        }
+    } else if (id_aac == TYPE_CPE) {
+        if (read_sbr_channel_pair_element(ac, sbr, gb)) {
+            sbr_turnoff(sbr);
+            return get_bits_count(gb) - cnt;
+        }
+    } else {
+        av_log(ac->avctx, AV_LOG_ERROR,
+            "Invalid bitstream - cannot apply SBR to element type %d\n", id_aac);
+        sbr_turnoff(sbr);
+        return get_bits_count(gb) - cnt;
+    }
+    if (get_bits1(gb)) { // bs_extended_data
+        int num_bits_left = get_bits(gb, 4); // bs_extension_size
+        if (num_bits_left == 15)
+            num_bits_left += get_bits(gb, 8); // bs_esc_count
+
+        num_bits_left <<= 3;
+        while (num_bits_left > 7) {
+            num_bits_left -= 2;
+            read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
+        }
+        if (num_bits_left < 0) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
+        }
+        if (num_bits_left > 0)
+            skip_bits(gb, num_bits_left);
+    }
+
+    return get_bits_count(gb) - cnt;
+}
+
+static void sbr_reset(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int err;
+    err = sbr_make_f_master(ac, sbr, &sbr->spectrum_params);
+    if (err >= 0)
+        err = sbr_make_f_derived(ac, sbr);
+    if (err < 0) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "SBR reset failed. Switching SBR to pure upsampling mode.\n");
+        sbr_turnoff(sbr);
+    }
+}
+
+/**
+ * Decode Spectral Band Replication extension data; reference: table 4.55.
+ *
+ * @param   crc flag indicating the presence of CRC checksum
+ * @param   cnt length of TYPE_FIL syntactic element in bytes
+ *
+ * @return  Returns number of bytes consumed from the TYPE_FIL element.
+ */
+int AAC_RENAME(ff_decode_sbr_extension)(AACContext *ac, SpectralBandReplication *sbr,
+                            GetBitContext *gb_host, int crc, int cnt, int id_aac)
+{
+    unsigned int num_sbr_bits = 0, num_align_bits;
+    unsigned bytes_read;
+    GetBitContext gbc = *gb_host, *gb = &gbc;
+    skip_bits_long(gb_host, cnt*8 - 4);
+
+    sbr->reset = 0;
+
+    if (!sbr->sample_rate)
+        sbr->sample_rate = 2 * ac->oc[1].m4ac.sample_rate; //TODO use the nominal sample rate for arbitrary sample rate support
+    if (!ac->oc[1].m4ac.ext_sample_rate)
+        ac->oc[1].m4ac.ext_sample_rate = 2 * ac->oc[1].m4ac.sample_rate;
+
+    if (crc) {
+        skip_bits(gb, 10); // bs_sbr_crc_bits; TODO - implement CRC check
+        num_sbr_bits += 10;
+    }
+
+    //Save some state from the previous frame.
+    sbr->kx[0] = sbr->kx[1];
+    sbr->m[0] = sbr->m[1];
+    sbr->kx_and_m_pushed = 1;
+
+    num_sbr_bits++;
+    if (get_bits1(gb)) // bs_header_flag
+        num_sbr_bits += read_sbr_header(sbr, gb);
+
+    if (sbr->reset)
+        sbr_reset(ac, sbr);
+
+    if (sbr->start)
+        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
+
+    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7;
+    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3);
+
+    if (bytes_read > cnt) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Expected to read %d SBR bytes actually read %d.\n", cnt, bytes_read);
+        sbr_turnoff(sbr);
+    }
+    return cnt;
+}
+
+/**
+ * Analysis QMF Bank (14496-3 sp04 p206)
+ *
+ * @param   x       pointer to the beginning of the first sample window
+ * @param   W       array of complex-valued samples split into subbands
+ */
+#ifndef sbr_qmf_analysis
+#if USE_FIXED
+static void sbr_qmf_analysis(AVFixedDSPContext *dsp, FFTContext *mdct,
+#else
+static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct,
+#endif /* USE_FIXED */
+                             SBRDSPContext *sbrdsp, const INTFLOAT *in, INTFLOAT *x,
+                             INTFLOAT z[320], INTFLOAT W[2][32][32][2], int buf_idx)
+{
+    int i;
+#if USE_FIXED
+    int j;
+#endif
+    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
+    memcpy(x+288, in,         1024*sizeof(x[0]));
+    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
+                               // are not supported
+        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
+        sbrdsp->sum64x5(z);
+        sbrdsp->qmf_pre_shuffle(z);
+#if USE_FIXED
+        for (j = 64; j < 128; j++) {
+            if (z[j] > 1<<24) {
+                av_log(NULL, AV_LOG_WARNING,
+                       "sbr_qmf_analysis: value %09d too large, setting to %09d\n",
+                       z[j], 1<<24);
+                z[j] = 1<<24;
+            } else if (z[j] < -(1<<24)) {
+                av_log(NULL, AV_LOG_WARNING,
+                       "sbr_qmf_analysis: value %09d too small, setting to %09d\n",
+                       z[j], -(1<<24));
+                z[j] = -(1<<24);
+            }
+        }
+#endif
+        mdct->imdct_half(mdct, z, z+64);
+        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
+        x += 32;
+    }
+}
+#endif
+
+/**
+ * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
+ * (14496-3 sp04 p206)
+ */
+#ifndef sbr_qmf_synthesis
+static void sbr_qmf_synthesis(FFTContext *mdct,
+#if USE_FIXED
+                              SBRDSPContext *sbrdsp, AVFixedDSPContext *dsp,
+#else
+                              SBRDSPContext *sbrdsp, AVFloatDSPContext *dsp,
+#endif /* USE_FIXED */
+                              INTFLOAT *out, INTFLOAT X[2][38][64],
+                              INTFLOAT mdct_buf[2][64],
+                              INTFLOAT *v0, int *v_off, const unsigned int div)
+{
+    int i, n;
+    const INTFLOAT *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
+    const int step = 128 >> div;
+    INTFLOAT *v;
+    for (i = 0; i < 32; i++) {
+        if (*v_off < step) {
+            int saved_samples = (1280 - 128) >> div;
+            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(INTFLOAT));
+            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
+        } else {
+            *v_off -= step;
+        }
+        v = v0 + *v_off;
+        if (div) {
+            for (n = 0; n < 32; n++) {
+                X[0][i][   n] = -X[0][i][n];
+                X[0][i][32+n] =  X[1][i][31-n];
+            }
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
+        } else {
+            sbrdsp->neg_odd_64(X[1][i]);
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
+            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
+        }
+        dsp->vector_fmul    (out, v                , sbr_qmf_window                       , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
+        out += 64 >> div;
+    }
+}
+#endif
+
+/// Generate the subband filtered lowband
+static int sbr_lf_gen(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
+                      int buf_idx)
+{
+    int i, k;
+    const int t_HFGen = 8;
+    const int i_f = 32;
+    memset(X_low, 0, 32*sizeof(*X_low));
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
+            X_low[k][i][0] = W[buf_idx][i - t_HFGen][k][0];
+            X_low[k][i][1] = W[buf_idx][i - t_HFGen][k][1];
+        }
+    }
+    buf_idx = 1-buf_idx;
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < t_HFGen; i++) {
+            X_low[k][i][0] = W[buf_idx][i + i_f - t_HFGen][k][0];
+            X_low[k][i][1] = W[buf_idx][i + i_f - t_HFGen][k][1];
+        }
+    }
+    return 0;
+}
+
+/// High Frequency Generator (14496-3 sp04 p215)
+static int sbr_hf_gen(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_high[64][40][2], const INTFLOAT X_low[32][40][2],
+                      const INTFLOAT (*alpha0)[2], const INTFLOAT (*alpha1)[2],
+                      const INTFLOAT bw_array[5], const uint8_t *t_env,
+                      int bs_num_env)
+{
+    int j, x;
+    int g = 0;
+    int k = sbr->kx[1];
+    for (j = 0; j < sbr->num_patches; j++) {
+        for (x = 0; x < sbr->patch_num_subbands[j]; x++, k++) {
+            const int p = sbr->patch_start_subband[j] + x;
+            while (g <= sbr->n_q && k >= sbr->f_tablenoise[g])
+                g++;
+            g--;
+
+            if (g < 0) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "ERROR : no subband found for frequency %d\n", k);
+                return -1;
+            }
+
+            sbr->dsp.hf_gen(X_high[k] + ENVELOPE_ADJUSTMENT_OFFSET,
+                            X_low[p]  + ENVELOPE_ADJUSTMENT_OFFSET,
+                            alpha0[p], alpha1[p], bw_array[g],
+                            2 * t_env[0], 2 * t_env[bs_num_env]);
+        }
+    }
+    if (k < sbr->m[1] + sbr->kx[1])
+        memset(X_high + k, 0, (sbr->m[1] + sbr->kx[1] - k) * sizeof(*X_high));
+
+    return 0;
+}
+
+/// Generate the subband filtered lowband
+static int sbr_x_gen(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
+                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
+                     const INTFLOAT X_low[32][40][2], int ch)
+{
+    int k, i;
+    const int i_f = 32;
+    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
+    memset(X, 0, 2*sizeof(*X));
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < i_Temp; i++) {
+            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
+            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
+        }
+    }
+    for (; k < sbr->kx[0] + sbr->m[0]; k++) {
+        for (i = 0; i < i_Temp; i++) {
+            X[0][i][k] = Y0[i + i_f][k][0];
+            X[1][i][k] = Y0[i + i_f][k][1];
+        }
+    }
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = i_Temp; i < 38; i++) {
+            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
+            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
+        }
+    }
+    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
+        for (i = i_Temp; i < i_f; i++) {
+            X[0][i][k] = Y1[i][k][0];
+            X[1][i][k] = Y1[i][k][1];
+        }
+    }
+    return 0;
+}
+
+/** High Frequency Adjustment (14496-3 sp04 p217) and Mapping
+ * (14496-3 sp04 p217)
+ */
+static int sbr_mapping(AACContext *ac, SpectralBandReplication *sbr,
+                        SBRData *ch_data, int e_a[2])
+{
+    int e, i, m;
+
+    memset(ch_data->s_indexmapped[1], 0, 7*sizeof(ch_data->s_indexmapped[1]));
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        const unsigned int ilim = sbr->n[ch_data->bs_freq_res[e + 1]];
+        uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
+        int k;
+
+        if (sbr->kx[1] != table[0]) {
+            av_log(ac->avctx, AV_LOG_ERROR, "kx != f_table{high,low}[0]. "
+                   "Derived frequency tables were not regenerated.\n");
+            sbr_turnoff(sbr);
+            return AVERROR_BUG;
+        }
+        for (i = 0; i < ilim; i++)
+            for (m = table[i]; m < table[i + 1]; m++)
+                sbr->e_origmapped[e][m - sbr->kx[1]] = ch_data->env_facs[e+1][i];
+
+        // ch_data->bs_num_noise > 1 => 2 noise floors
+        k = (ch_data->bs_num_noise > 1) && (ch_data->t_env[e] >= ch_data->t_q[1]);
+        for (i = 0; i < sbr->n_q; i++)
+            for (m = sbr->f_tablenoise[i]; m < sbr->f_tablenoise[i + 1]; m++)
+                sbr->q_mapped[e][m - sbr->kx[1]] = ch_data->noise_facs[k+1][i];
+
+        for (i = 0; i < sbr->n[1]; i++) {
+            if (ch_data->bs_add_harmonic_flag) {
+                const unsigned int m_midpoint =
+                    (sbr->f_tablehigh[i] + sbr->f_tablehigh[i + 1]) >> 1;
+
+                ch_data->s_indexmapped[e + 1][m_midpoint - sbr->kx[1]] = ch_data->bs_add_harmonic[i] *
+                    (e >= e_a[1] || (ch_data->s_indexmapped[0][m_midpoint - sbr->kx[1]] == 1));
+            }
+        }
+
+        for (i = 0; i < ilim; i++) {
+            int additional_sinusoid_present = 0;
+            for (m = table[i]; m < table[i + 1]; m++) {
+                if (ch_data->s_indexmapped[e + 1][m - sbr->kx[1]]) {
+                    additional_sinusoid_present = 1;
+                    break;
+                }
+            }
+            memset(&sbr->s_mapped[e][table[i] - sbr->kx[1]], additional_sinusoid_present,
+                   (table[i + 1] - table[i]) * sizeof(sbr->s_mapped[e][0]));
+        }
+    }
+
+    memcpy(ch_data->s_indexmapped[0], ch_data->s_indexmapped[ch_data->bs_num_env], sizeof(ch_data->s_indexmapped[0]));
+    return 0;
+}
+
+/// Estimation of current envelope (14496-3 sp04 p218)
+static void sbr_env_estimate(AAC_FLOAT (*e_curr)[48], INTFLOAT X_high[64][40][2],
+                             SpectralBandReplication *sbr, SBRData *ch_data)
+{
+    int e, m;
+    int kx1 = sbr->kx[1];
+
+    if (sbr->bs_interpol_freq) {
+        for (e = 0; e < ch_data->bs_num_env; e++) {
+#if USE_FIXED
+            const SoftFloat recip_env_size = av_int2sf(0x20000000 / (ch_data->t_env[e + 1] - ch_data->t_env[e]), 30);
+#else
+            const float recip_env_size = 0.5f / (ch_data->t_env[e + 1] - ch_data->t_env[e]);
+#endif /* USE_FIXED */
+            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+
+            for (m = 0; m < sbr->m[1]; m++) {
+                AAC_FLOAT sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
+#if USE_FIXED
+                e_curr[e][m] = av_mul_sf(sum, recip_env_size);
+#else
+                e_curr[e][m] = sum * recip_env_size;
+#endif /* USE_FIXED */
+            }
+        }
+    } else {
+        int k, p;
+
+        for (e = 0; e < ch_data->bs_num_env; e++) {
+            const int env_size = 2 * (ch_data->t_env[e + 1] - ch_data->t_env[e]);
+            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
+
+            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
+#if USE_FIXED
+                SoftFloat sum = FLOAT_0;
+                const SoftFloat den = av_int2sf(0x20000000 / (env_size * (table[p + 1] - table[p])), 29);
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    sum = av_add_sf(sum, sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb));
+                }
+                sum = av_mul_sf(sum, den);
+#else
+                float sum = 0.0f;
+                const int den = env_size * (table[p + 1] - table[p]);
+
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    sum += sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb);
+                }
+                sum /= den;
+#endif /* USE_FIXED */
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    e_curr[e][k - kx1] = sum;
+                }
+            }
+        }
+    }
+}
+
+void AAC_RENAME(ff_sbr_apply)(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
+                  INTFLOAT* L, INTFLOAT* R)
+{
+    int downsampled = ac->oc[1].m4ac.ext_sample_rate < sbr->sample_rate;
+    int ch;
+    int nch = (id_aac == TYPE_CPE) ? 2 : 1;
+    int err;
+
+    if (id_aac != sbr->id_aac) {
+        av_log(ac->avctx, id_aac == TYPE_LFE ? AV_LOG_VERBOSE : AV_LOG_WARNING,
+            "element type mismatch %d != %d\n", id_aac, sbr->id_aac);
+        sbr_turnoff(sbr);
+    }
+
+    if (sbr->start && !sbr->ready_for_dequant) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "No quantized data read for sbr_dequant.\n");
+        sbr_turnoff(sbr);
+    }
+
+    if (!sbr->kx_and_m_pushed) {
+        sbr->kx[0] = sbr->kx[1];
+        sbr->m[0] = sbr->m[1];
+    } else {
+        sbr->kx_and_m_pushed = 0;
+    }
+
+    if (sbr->start) {
+        sbr_dequant(sbr, id_aac);
+        sbr->ready_for_dequant = 0;
+    }
+    for (ch = 0; ch < nch; ch++) {
+        /* decode channel */
+        sbr_qmf_analysis(ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
+                         (INTFLOAT*)sbr->qmf_filter_scratch,
+                         sbr->data[ch].W, sbr->data[ch].Ypos);
+        sbr->c.sbr_lf_gen(ac, sbr, sbr->X_low,
+                          (const INTFLOAT (*)[32][32][2]) sbr->data[ch].W,
+                          sbr->data[ch].Ypos);
+        sbr->data[ch].Ypos ^= 1;
+        if (sbr->start) {
+            sbr->c.sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
+                                         (const INTFLOAT (*)[40][2]) sbr->X_low, sbr->k[0]);
+            sbr_chirp(sbr, &sbr->data[ch]);
+            av_assert0(sbr->data[ch].bs_num_env > 0);
+            sbr_hf_gen(ac, sbr, sbr->X_high,
+                       (const INTFLOAT (*)[40][2]) sbr->X_low,
+                       (const INTFLOAT (*)[2]) sbr->alpha0,
+                       (const INTFLOAT (*)[2]) sbr->alpha1,
+                       sbr->data[ch].bw_array, sbr->data[ch].t_env,
+                       sbr->data[ch].bs_num_env);
+
+            // hf_adj
+            err = sbr_mapping(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
+            if (!err) {
+                sbr_env_estimate(sbr->e_curr, sbr->X_high, sbr, &sbr->data[ch]);
+                sbr_gain_calc(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
+                sbr->c.sbr_hf_assemble(sbr->data[ch].Y[sbr->data[ch].Ypos],
+                                (const INTFLOAT (*)[40][2]) sbr->X_high,
+                                sbr, &sbr->data[ch],
+                                sbr->data[ch].e_a);
+            }
+        }
+
+        /* synthesis */
+        sbr->c.sbr_x_gen(sbr, sbr->X[ch],
+                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
+                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
+                  (const INTFLOAT (*)[40][2]) sbr->X_low, ch);
+    }
+
+    if (ac->oc[1].m4ac.ps == 1) {
+        if (sbr->ps.start) {
+            AAC_RENAME(ff_ps_apply)(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
+        } else {
+            memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
+        }
+        nch = 2;
+    }
+
+    sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, ac->fdsp,
+                      L, sbr->X[0], sbr->qmf_filter_scratch,
+                      sbr->data[0].synthesis_filterbank_samples,
+                      &sbr->data[0].synthesis_filterbank_samples_offset,
+                      downsampled);
+    if (nch == 2)
+        sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, ac->fdsp,
+                          R, sbr->X[1], sbr->qmf_filter_scratch,
+                          sbr->data[1].synthesis_filterbank_samples,
+                          &sbr->data[1].synthesis_filterbank_samples_offset,
+                          downsampled);
+}
+
+static void aacsbr_func_ptr_init(AACSBRContext *c)
+{
+    c->sbr_lf_gen            = sbr_lf_gen;
+    c->sbr_hf_assemble       = sbr_hf_assemble;
+    c->sbr_x_gen             = sbr_x_gen;
+    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter;
+
+#if !USE_FIXED
+    if(ARCH_MIPS)
+        ff_aacsbr_func_ptr_init_mips(c);
+#endif
+}
diff --git a/libavcodec/aacsbrdata.h b/libavcodec/aacsbrdata.h
index f309059..4ff8fae 100644
--- a/libavcodec/aacsbrdata.h
+++ b/libavcodec/aacsbrdata.h
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding data
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 
 #include <stdint.h>
 #include "libavutil/mem.h"
+#include "aac_defines.h"
 
 ///< Huffman tables for SBR
 
@@ -266,351 +267,269 @@ static const int8_t sbr_offset[6][16] = {
     {-2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  9, 11, 13, 16, 20, 24}, // 64000 Hz <  fs_sbr
 };
 
-///< window coefficients for analysis/synthesis QMF banks
-static DECLARE_ALIGNED(32, float, sbr_qmf_window_ds)[320];
-static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
-     0.0000000000, -0.0005525286, -0.0005617692, -0.0004947518,
-    -0.0004875227, -0.0004893791, -0.0005040714, -0.0005226564,
-    -0.0005466565, -0.0005677802, -0.0005870930, -0.0006132747,
-    -0.0006312493, -0.0006540333, -0.0006777690, -0.0006941614,
-    -0.0007157736, -0.0007255043, -0.0007440941, -0.0007490598,
-    -0.0007681371, -0.0007724848, -0.0007834332, -0.0007779869,
-    -0.0007803664, -0.0007801449, -0.0007757977, -0.0007630793,
-    -0.0007530001, -0.0007319357, -0.0007215391, -0.0006917937,
-    -0.0006650415, -0.0006341594, -0.0005946118, -0.0005564576,
-    -0.0005145572, -0.0004606325, -0.0004095121, -0.0003501175,
-    -0.0002896981, -0.0002098337, -0.0001446380, -0.0000617334,
-     0.0000134949,  0.0001094383,  0.0002043017,  0.0002949531,
-     0.0004026540,  0.0005107388,  0.0006239376,  0.0007458025,
-     0.0008608443,  0.0009885988,  0.0011250155,  0.0012577884,
-     0.0013902494,  0.0015443219,  0.0016868083,  0.0018348265,
-     0.0019841140,  0.0021461583,  0.0023017254,  0.0024625616,
-     0.0026201758,  0.0027870464,  0.0029469447,  0.0031125420,
-     0.0032739613,  0.0034418874,  0.0036008268,  0.0037603922,
-     0.0039207432,  0.0040819753,  0.0042264269,  0.0043730719,
-     0.0045209852,  0.0046606460,  0.0047932560,  0.0049137603,
-     0.0050393022,  0.0051407353,  0.0052461166,  0.0053471681,
-     0.0054196775,  0.0054876040,  0.0055475714,  0.0055938023,
-     0.0056220643,  0.0056455196,  0.0056389199,  0.0056266114,
-     0.0055917128,  0.0055404363,  0.0054753783,  0.0053838975,
-     0.0052715758,  0.0051382275,  0.0049839687,  0.0048109469,
-     0.0046039530,  0.0043801861,  0.0041251642,  0.0038456408,
-     0.0035401246,  0.0032091885,  0.0028446757,  0.0024508540,
-     0.0020274176,  0.0015784682,  0.0010902329,  0.0005832264,
-     0.0000276045, -0.0005464280, -0.0011568135, -0.0018039472,
-    -0.0024826723, -0.0031933778, -0.0039401124, -0.0047222596,
-    -0.0055337211, -0.0063792293, -0.0072615816, -0.0081798233,
-    -0.0091325329, -0.0101150215, -0.0111315548, -0.0121849995,
-     0.0132718220,  0.0143904666,  0.0155405553,  0.0167324712,
-     0.0179433381,  0.0191872431,  0.0204531793,  0.0217467550,
-     0.0230680169,  0.0244160992,  0.0257875847,  0.0271859429,
-     0.0286072173,  0.0300502657,  0.0315017608,  0.0329754081,
-     0.0344620948,  0.0359697560,  0.0374812850,  0.0390053679,
-     0.0405349170,  0.0420649094,  0.0436097542,  0.0451488405,
-     0.0466843027,  0.0482165720,  0.0497385755,  0.0512556155,
-     0.0527630746,  0.0542452768,  0.0557173648,  0.0571616450,
-     0.0585915683,  0.0599837480,  0.0613455171,  0.0626857808,
-     0.0639715898,  0.0652247106,  0.0664367512,  0.0676075985,
-     0.0687043828,  0.0697630244,  0.0707628710,  0.0717002673,
-     0.0725682583,  0.0733620255,  0.0741003642,  0.0747452558,
-     0.0753137336,  0.0758008358,  0.0761992479,  0.0764992170,
-     0.0767093490,  0.0768173975,  0.0768230011,  0.0767204924,
-     0.0765050718,  0.0761748321,  0.0757305756,  0.0751576255,
-     0.0744664394,  0.0736406005,  0.0726774642,  0.0715826364,
-     0.0703533073,  0.0689664013,  0.0674525021,  0.0657690668,
-     0.0639444805,  0.0619602779,  0.0598166570,  0.0575152691,
-     0.0550460034,  0.0524093821,  0.0495978676,  0.0466303305,
-     0.0434768782,  0.0401458278,  0.0366418116,  0.0329583930,
-     0.0290824006,  0.0250307561,  0.0207997072,  0.0163701258,
-     0.0117623832,  0.0069636862,  0.0019765601, -0.0032086896,
-    -0.0085711749, -0.0141288827, -0.0198834129, -0.0258227288,
-    -0.0319531274, -0.0382776572, -0.0447806821, -0.0514804176,
-    -0.0583705326, -0.0654409853, -0.0726943300, -0.0801372934,
-    -0.0877547536, -0.0955533352, -0.1035329531, -0.1116826931,
-    -0.1200077984, -0.1285002850, -0.1371551761, -0.1459766491,
-    -0.1549607071, -0.1640958855, -0.1733808172, -0.1828172548,
-    -0.1923966745, -0.2021250176, -0.2119735853, -0.2219652696,
-    -0.2320690870, -0.2423016884, -0.2526480309, -0.2631053299,
-    -0.2736634040, -0.2843214189, -0.2950716717, -0.3059098575,
-    -0.3168278913, -0.3278113727, -0.3388722693, -0.3499914122,
-     0.3611589903,  0.3723795546,  0.3836350013,  0.3949211761,
-     0.4062317676,  0.4175696896,  0.4289119920,  0.4402553754,
-     0.4515996535,  0.4629308085,  0.4742453214,  0.4855253091,
-     0.4967708254,  0.5079817500,  0.5191234970,  0.5302240895,
-     0.5412553448,  0.5522051258,  0.5630789140,  0.5738524131,
-     0.5845403235,  0.5951123086,  0.6055783538,  0.6159109932,
-     0.6261242695,  0.6361980107,  0.6461269695,  0.6559016302,
-     0.6655139880,  0.6749663190,  0.6842353293,  0.6933282376,
-     0.7022388719,  0.7109410426,  0.7194462634,  0.7277448900,
-     0.7358211758,  0.7436827863,  0.7513137456,  0.7587080760,
-     0.7658674865,  0.7727780881,  0.7794287519,  0.7858353120,
-     0.7919735841,  0.7978466413,  0.8034485751,  0.8087695004,
-     0.8138191270,  0.8185776004,  0.8230419890,  0.8272275347,
-     0.8311038457,  0.8346937361,  0.8379717337,  0.8409541392,
-     0.8436238281,  0.8459818469,  0.8480315777,  0.8497805198,
-     0.8511971524,  0.8523047035,  0.8531020949,  0.8535720573,
-     0.8537385600,
-};
-
-/* First two entries repeated at end to simplify SIMD implementations. */
-const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
-{-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
-{ 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
-{ 0.80705063769351,  0.29653668284408}, {-0.38981478896926,  0.89572605717087},
-{-0.01053049862020, -0.66959058036166}, {-0.91266367957293, -0.11522938140034},
-{ 0.54840422910309,  0.75221367176302}, { 0.40009252867955, -0.98929400334421},
-{-0.99867974711855, -0.88147068645358}, {-0.95531076805040,  0.90908757154593},
-{-0.45725933317144, -0.56716323646760}, {-0.72929675029275, -0.98008272727324},
-{ 0.75622801399036,  0.20950329995549}, { 0.07069442601050, -0.78247898470706},
-{ 0.74496252926055, -0.91169004445807}, {-0.96440182703856, -0.94739918296622},
-{ 0.30424629369539, -0.49438267012479}, { 0.66565033746925,  0.64652935542491},
-{ 0.91697008020594,  0.17514097332009}, {-0.70774918760427,  0.52548653416543},
-{-0.70051415345560, -0.45340028808763}, {-0.99496513054797, -0.90071908066973},
-{ 0.98164490790123, -0.77463155528697}, {-0.54671580548181, -0.02570928536004},
-{-0.01689629065389,  0.00287506445732}, {-0.86110349531986,  0.42548583726477},
-{-0.98892980586032, -0.87881132267556}, { 0.51756627678691,  0.66926784710139},
-{-0.99635026409640, -0.58107730574765}, {-0.99969370862163,  0.98369989360250},
-{ 0.55266258627194,  0.59449057465591}, { 0.34581177741673,  0.94879421061866},
-{ 0.62664209577999, -0.74402970906471}, {-0.77149701404973, -0.33883658042801},
-{-0.91592244254432,  0.03687901376713}, {-0.76285492357887, -0.91371867919124},
-{ 0.79788337195331, -0.93180971199849}, { 0.54473080610200, -0.11919206037186},
-{-0.85639281671058,  0.42429854760451}, {-0.92882402971423,  0.27871809078609},
-{-0.11708371046774, -0.99800843444966}, { 0.21356749817493, -0.90716295627033},
-{-0.76191692573909,  0.99768118356265}, { 0.98111043100884, -0.95854459734407},
-{-0.85913269895572,  0.95766566168880}, {-0.93307242253692,  0.49431757696466},
-{ 0.30485754879632, -0.70540034357529}, { 0.85289650925190,  0.46766131791044},
-{ 0.91328082618125, -0.99839597361769}, {-0.05890199924154,  0.70741827819497},
-{ 0.28398686150148,  0.34633555702188}, { 0.95258164539612, -0.54893416026939},
-{-0.78566324168507, -0.75568541079691}, {-0.95789495447877, -0.20423194696966},
-{ 0.82411158711197,  0.96654618432562}, {-0.65185446735885, -0.88734990773289},
-{-0.93643603134666,  0.99870790442385}, { 0.91427159529618, -0.98290505544444},
-{-0.70395684036886,  0.58796798221039}, { 0.00563771969365,  0.61768196727244},
-{ 0.89065051931895,  0.52783352697585}, {-0.68683707712762,  0.80806944710339},
-{ 0.72165342518718, -0.69259857349564}, {-0.62928247730667,  0.13627037407335},
-{ 0.29938434065514, -0.46051329682246}, {-0.91781958879280, -0.74012716684186},
-{ 0.99298717043688,  0.40816610075661}, { 0.82368298622748, -0.74036047190173},
-{-0.98512833386833, -0.99972330709594}, {-0.95915368242257, -0.99237800466040},
-{-0.21411126572790, -0.93424819052545}, {-0.68821476106884, -0.26892306315457},
-{ 0.91851997982317,  0.09358228901785}, {-0.96062769559127,  0.36099095133739},
-{ 0.51646184922287, -0.71373332873917}, { 0.61130721139669,  0.46950141175917},
-{ 0.47336129371299, -0.27333178296162}, { 0.90998308703519,  0.96715662938132},
-{ 0.44844799194357,  0.99211574628306}, { 0.66614891079092,  0.96590176169121},
-{ 0.74922239129237, -0.89879858826087}, {-0.99571588506485,  0.52785521494349},
-{ 0.97401082477563, -0.16855870075190}, { 0.72683747733879, -0.48060774432251},
-{ 0.95432193457128,  0.68849603408441}, {-0.72962208425191, -0.76608443420917},
-{-0.85359479233537,  0.88738125901579}, {-0.81412430338535, -0.97480768049637},
-{-0.87930772356786,  0.74748307690436}, {-0.71573331064977, -0.98570608178923},
-{ 0.83524300028228,  0.83702537075163}, {-0.48086065601423, -0.98848504923531},
-{ 0.97139128574778,  0.80093621198236}, { 0.51992825347895,  0.80247631400510},
-{-0.00848591195325, -0.76670128000486}, {-0.70294374303036,  0.55359910445577},
-{-0.95894428168140, -0.43265504344783}, { 0.97079252950321,  0.09325857238682},
-{-0.92404293670797,  0.85507704027855}, {-0.69506469500450,  0.98633412625459},
-{ 0.26559203620024,  0.73314307966524}, { 0.28038443336943,  0.14537913654427},
-{-0.74138124825523,  0.99310339807762}, {-0.01752795995444, -0.82616635284178},
-{-0.55126773094930, -0.98898543862153}, { 0.97960898850996, -0.94021446752851},
-{-0.99196309146936,  0.67019017358456}, {-0.67684928085260,  0.12631491649378},
-{ 0.09140039465500, -0.20537731453108}, {-0.71658965751996, -0.97788200391224},
-{ 0.81014640078925,  0.53722648362443}, { 0.40616991671205, -0.26469008598449},
-{-0.67680188682972,  0.94502052337695}, { 0.86849774348749, -0.18333598647899},
-{-0.99500381284851, -0.02634122068550}, { 0.84329189340667,  0.10406957462213},
-{-0.09215968531446,  0.69540012101253}, { 0.99956173327206, -0.12358542001404},
-{-0.79732779473535, -0.91582524736159}, { 0.96349973642406,  0.96640458041000},
-{-0.79942778496547,  0.64323902822857}, {-0.11566039853896,  0.28587846253726},
-{-0.39922954514662,  0.94129601616966}, { 0.99089197565987, -0.92062625581587},
-{ 0.28631285179909, -0.91035047143603}, {-0.83302725605608, -0.67330410892084},
-{ 0.95404443402072,  0.49162765398743}, {-0.06449863579434,  0.03250560813135},
-{-0.99575054486311,  0.42389784469507}, {-0.65501142790847,  0.82546114655624},
-{-0.81254441908887, -0.51627234660629}, {-0.99646369485481,  0.84490533520752},
-{ 0.00287840603348,  0.64768261158166}, { 0.70176989408455, -0.20453028573322},
-{ 0.96361882270190,  0.40706967140989}, {-0.68883758192426,  0.91338958840772},
-{-0.34875585502238,  0.71472290693300}, { 0.91980081243087,  0.66507455644919},
-{-0.99009048343881,  0.85868021604848}, { 0.68865791458395,  0.55660316809678},
-{-0.99484402129368, -0.20052559254934}, { 0.94214511408023, -0.99696425367461},
-{-0.67414626793544,  0.49548221180078}, {-0.47339353684664, -0.85904328834047},
-{ 0.14323651387360, -0.94145598222488}, {-0.29268293575672,  0.05759224927952},
-{ 0.43793861458754, -0.78904969892724}, {-0.36345126374441,  0.64874435357162},
-{-0.08750604656825,  0.97686944362527}, {-0.96495267812511, -0.53960305946511},
-{ 0.55526940659947,  0.78891523734774}, { 0.73538215752630,  0.96452072373404},
-{-0.30889773919437, -0.80664389776860}, { 0.03574995626194, -0.97325616900959},
-{ 0.98720684660488,  0.48409133691962}, {-0.81689296271203, -0.90827703628298},
-{ 0.67866860118215,  0.81284503870856}, {-0.15808569732583,  0.85279555024382},
-{ 0.80723395114371, -0.24717418514605}, { 0.47788757329038, -0.46333147839295},
-{ 0.96367554763201,  0.38486749303242}, {-0.99143875716818, -0.24945277239809},
-{ 0.83081876925833, -0.94780851414763}, {-0.58753191905341,  0.01290772389163},
-{ 0.95538108220960, -0.85557052096538}, {-0.96490920476211, -0.64020970923102},
-{-0.97327101028521,  0.12378128133110}, { 0.91400366022124,  0.57972471346930},
-{-0.99925837363824,  0.71084847864067}, {-0.86875903507313, -0.20291699203564},
-{-0.26240034795124, -0.68264554369108}, {-0.24664412953388, -0.87642273115183},
-{ 0.02416275806869,  0.27192914288905}, { 0.82068619590515, -0.85087787994476},
-{ 0.88547373760759, -0.89636802901469}, {-0.18173078152226, -0.26152145156800},
-{ 0.09355476558534,  0.54845123045604}, {-0.54668414224090,  0.95980774020221},
-{ 0.37050990604091, -0.59910140383171}, {-0.70373594262891,  0.91227665827081},
-{-0.34600785879594, -0.99441426144200}, {-0.68774481731008, -0.30238837956299},
-{-0.26843291251234,  0.83115668004362}, { 0.49072334613242, -0.45359708737775},
-{ 0.38975993093975,  0.95515358099121}, {-0.97757125224150,  0.05305894580606},
-{-0.17325552859616, -0.92770672250494}, { 0.99948035025744,  0.58285545563426},
-{-0.64946246527458,  0.68645507104960}, {-0.12016920576437, -0.57147322153312},
-{-0.58947456517751, -0.34847132454388}, {-0.41815140454465,  0.16276422358861},
-{ 0.99885650204884,  0.11136095490444}, {-0.56649614128386, -0.90494866361587},
-{ 0.94138021032330,  0.35281916733018}, {-0.75725076534641,  0.53650549640587},
-{ 0.20541973692630, -0.94435144369918}, { 0.99980371023351,  0.79835913565599},
-{ 0.29078277605775,  0.35393777921520}, {-0.62858772103030,  0.38765693387102},
-{ 0.43440904467688, -0.98546330463232}, {-0.98298583762390,  0.21021524625209},
-{ 0.19513029146934, -0.94239832251867}, {-0.95476662400101,  0.98364554179143},
-{ 0.93379635304810, -0.70881994583682}, {-0.85235410573336, -0.08342347966410},
-{-0.86425093011245, -0.45795025029466}, { 0.38879779059045,  0.97274429344593},
-{ 0.92045124735495, -0.62433652524220}, { 0.89162532251878,  0.54950955570563},
-{-0.36834336949252,  0.96458298020975}, { 0.93891760988045, -0.89968353740388},
-{ 0.99267657565094, -0.03757034316958}, {-0.94063471614176,  0.41332338538963},
-{ 0.99740224117019, -0.16830494996370}, {-0.35899413170555, -0.46633226649613},
-{ 0.05237237274947, -0.25640361602661}, { 0.36703583957424, -0.38653265641875},
-{ 0.91653180367913, -0.30587628726597}, { 0.69000803499316,  0.90952171386132},
-{-0.38658751133527,  0.99501571208985}, {-0.29250814029851,  0.37444994344615},
-{-0.60182204677608,  0.86779651036123}, {-0.97418588163217,  0.96468523666475},
-{ 0.88461574003963,  0.57508405276414}, { 0.05198933055162,  0.21269661669964},
-{-0.53499621979720,  0.97241553731237}, {-0.49429560226497,  0.98183865291903},
-{-0.98935142339139, -0.40249159006933}, {-0.98081380091130, -0.72856895534041},
-{-0.27338148835532,  0.99950922447209}, { 0.06310802338302, -0.54539587529618},
-{-0.20461677199539, -0.14209977628489}, { 0.66223843141647,  0.72528579940326},
-{-0.84764345483665,  0.02372316801261}, {-0.89039863483811,  0.88866581484602},
-{ 0.95903308477986,  0.76744927173873}, { 0.73504123909879, -0.03747203173192},
-{-0.31744434966056, -0.36834111883652}, {-0.34110827591623,  0.40211222807691},
-{ 0.47803883714199, -0.39423219786288}, { 0.98299195879514,  0.01989791390047},
-{-0.30963073129751, -0.18076720599336}, { 0.99992588229018, -0.26281872094289},
-{-0.93149731080767, -0.98313162570490}, { 0.99923472302773, -0.80142993767554},
-{-0.26024169633417, -0.75999759855752}, {-0.35712514743563,  0.19298963768574},
-{-0.99899084509530,  0.74645156992493}, { 0.86557171579452,  0.55593866696299},
-{ 0.33408042438752,  0.86185953874709}, { 0.99010736374716,  0.04602397576623},
-{-0.66694269691195, -0.91643611810148}, { 0.64016792079480,  0.15649530836856},
-{ 0.99570534804836,  0.45844586038111}, {-0.63431466947340,  0.21079116459234},
-{-0.07706847005931, -0.89581437101329}, { 0.98590090577724,  0.88241721133981},
-{ 0.80099335254678, -0.36851896710853}, { 0.78368131392666,  0.45506999802597},
-{ 0.08707806671691,  0.80938994918745}, {-0.86811883080712,  0.39347308654705},
-{-0.39466529740375, -0.66809432114456}, { 0.97875325649683, -0.72467840967746},
-{-0.95038560288864,  0.89563219587625}, { 0.17005239424212,  0.54683053962658},
-{-0.76910792026848, -0.96226617549298}, { 0.99743281016846,  0.42697157037567},
-{ 0.95437383549973,  0.97002324109952}, { 0.99578905365569, -0.54106826257356},
-{ 0.28058259829990, -0.85361420634036}, { 0.85256524470573, -0.64567607735589},
-{-0.50608540105128, -0.65846015480300}, {-0.97210735183243, -0.23095213067791},
-{ 0.95424048234441, -0.99240147091219}, {-0.96926570524023,  0.73775654896574},
-{ 0.30872163214726,  0.41514960556126}, {-0.24523839572639,  0.63206633394807},
-{-0.33813265086024, -0.38661779441897}, {-0.05826828420146, -0.06940774188029},
-{-0.22898461455054,  0.97054853316316}, {-0.18509915019881,  0.47565762892084},
-{-0.10488238045009, -0.87769947402394}, {-0.71886586182037,  0.78030982480538},
-{ 0.99793873738654,  0.90041310491497}, { 0.57563307626120, -0.91034337352097},
-{ 0.28909646383717,  0.96307783970534}, { 0.42188998312520,  0.48148651230437},
-{ 0.93335049681047, -0.43537023883588}, {-0.97087374418267,  0.86636445711364},
-{ 0.36722871286923,  0.65291654172961}, {-0.81093025665696,  0.08778370229363},
-{-0.26240603062237, -0.92774095379098}, { 0.83996497984604,  0.55839849139647},
-{-0.99909615720225, -0.96024605713970}, { 0.74649464155061,  0.12144893606462},
-{-0.74774595569805, -0.26898062008959}, { 0.95781667469567, -0.79047927052628},
-{ 0.95472308713099, -0.08588776019550}, { 0.48708332746299,  0.99999041579432},
-{ 0.46332038247497,  0.10964126185063}, {-0.76497004940162,  0.89210929242238},
-{ 0.57397389364339,  0.35289703373760}, { 0.75374316974495,  0.96705214651335},
-{-0.59174397685714, -0.89405370422752}, { 0.75087906691890, -0.29612672982396},
-{-0.98607857336230,  0.25034911730023}, {-0.40761056640505, -0.90045573444695},
-{ 0.66929266740477,  0.98629493401748}, {-0.97463695257310, -0.00190223301301},
-{ 0.90145509409859,  0.99781390365446}, {-0.87259289048043,  0.99233587353666},
-{-0.91529461447692, -0.15698707534206}, {-0.03305738840705, -0.37205262859764},
-{ 0.07223051368337, -0.88805001733626}, { 0.99498012188353,  0.97094358113387},
-{-0.74904939500519,  0.99985483641521}, { 0.04585228574211,  0.99812337444082},
-{-0.89054954257993, -0.31791913188064}, {-0.83782144651251,  0.97637632547466},
-{ 0.33454804933804, -0.86231516800408}, {-0.99707579362824,  0.93237990079441},
-{-0.22827527843994,  0.18874759397997}, { 0.67248046289143, -0.03646211390569},
-{-0.05146538187944, -0.92599700120679}, { 0.99947295749905,  0.93625229707912},
-{ 0.66951124390363,  0.98905825623893}, {-0.99602956559179, -0.44654715757688},
-{ 0.82104905483590,  0.99540741724928}, { 0.99186510988782,  0.72023001312947},
-{-0.65284592392918,  0.52186723253637}, { 0.93885443798188, -0.74895312615259},
-{ 0.96735248738388,  0.90891816978629}, {-0.22225968841114,  0.57124029781228},
-{-0.44132783753414, -0.92688840659280}, {-0.85694974219574,  0.88844532719844},
-{ 0.91783042091762, -0.46356892383970}, { 0.72556974415690, -0.99899555770747},
-{-0.99711581834508,  0.58211560180426}, { 0.77638976371966,  0.94321834873819},
-{ 0.07717324253925,  0.58638399856595}, {-0.56049829194163,  0.82522301569036},
-{ 0.98398893639988,  0.39467440420569}, { 0.47546946844938,  0.68613044836811},
-{ 0.65675089314631,  0.18331637134880}, { 0.03273375457980, -0.74933109564108},
-{-0.38684144784738,  0.51337349030406}, {-0.97346267944545, -0.96549364384098},
-{-0.53282156061942, -0.91423265091354}, { 0.99817310731176,  0.61133572482148},
-{-0.50254500772635, -0.88829338134294}, { 0.01995873238855,  0.85223515096765},
-{ 0.99930381973804,  0.94578896296649}, { 0.82907767600783, -0.06323442598128},
-{-0.58660709669728,  0.96840773806582}, {-0.17573736667267, -0.48166920859485},
-{ 0.83434292401346, -0.13023450646997}, { 0.05946491307025,  0.20511047074866},
-{ 0.81505484574602, -0.94685947861369}, {-0.44976380954860,  0.40894572671545},
-{-0.89746474625671,  0.99846578838537}, { 0.39677256130792, -0.74854668609359},
-{-0.07588948563079,  0.74096214084170}, { 0.76343198951445,  0.41746629422634},
-{-0.74490104699626,  0.94725911744610}, { 0.64880119792759,  0.41336660830571},
-{ 0.62319537462542, -0.93098313552599}, { 0.42215817594807, -0.07712787385208},
-{ 0.02704554141885, -0.05417518053666}, { 0.80001773566818,  0.91542195141039},
-{-0.79351832348816, -0.36208897989136}, { 0.63872359151636,  0.08128252493444},
-{ 0.52890520960295,  0.60048872455592}, { 0.74238552914587,  0.04491915291044},
-{ 0.99096131449250, -0.19451182854402}, {-0.80412329643109, -0.88513818199457},
-{-0.64612616129736,  0.72198674804544}, { 0.11657770663191, -0.83662833815041},
-{-0.95053182488101, -0.96939905138082}, {-0.62228872928622,  0.82767262846661},
-{ 0.03004475787316, -0.99738896333384}, {-0.97987214341034,  0.36526129686425},
-{-0.99986980746200, -0.36021610299715}, { 0.89110648599879, -0.97894250343044},
-{ 0.10407960510582,  0.77357793811619}, { 0.95964737821728, -0.35435818285502},
-{ 0.50843233159162,  0.96107691266205}, { 0.17006334670615, -0.76854025314829},
-{ 0.25872675063360,  0.99893303933816}, {-0.01115998681937,  0.98496019742444},
-{-0.79598702973261,  0.97138411318894}, {-0.99264708948101, -0.99542822402536},
-{-0.99829663752818,  0.01877138824311}, {-0.70801016548184,  0.33680685948117},
-{-0.70467057786826,  0.93272777501857}, { 0.99846021905254, -0.98725746254433},
-{-0.63364968534650, -0.16473594423746}, {-0.16258217500792, -0.95939125400802},
-{-0.43645594360633, -0.94805030113284}, {-0.99848471702976,  0.96245166923809},
-{-0.16796458968998, -0.98987511890470}, {-0.87979225745213, -0.71725725041680},
-{ 0.44183099021786, -0.93568974498761}, { 0.93310180125532, -0.99913308068246},
-{-0.93941931782002, -0.56409379640356}, {-0.88590003188677,  0.47624600491382},
-{ 0.99971463703691, -0.83889954253462}, {-0.75376385639978,  0.00814643438625},
-{ 0.93887685615875, -0.11284528204636}, { 0.85126435782309,  0.52349251543547},
-{ 0.39701421446381,  0.81779634174316}, {-0.37024464187437, -0.87071656222959},
-{-0.36024828242896,  0.34655735648287}, {-0.93388812549209, -0.84476541096429},
-{-0.65298804552119, -0.18439575450921}, { 0.11960319006843,  0.99899346780168},
-{ 0.94292565553160,  0.83163906518293}, { 0.75081145286948, -0.35533223142265},
-{ 0.56721979748394, -0.24076836414499}, { 0.46857766746029, -0.30140233457198},
-{ 0.97312313923635, -0.99548191630031}, {-0.38299976567017,  0.98516909715427},
-{ 0.41025800019463,  0.02116736935734}, { 0.09638062008048,  0.04411984381457},
-{-0.85283249275397,  0.91475563922421}, { 0.88866808958124, -0.99735267083226},
-{-0.48202429536989, -0.96805608884164}, { 0.27572582416567,  0.58634753335832},
-{-0.65889129659168,  0.58835634138583}, { 0.98838086953732,  0.99994349600236},
-{-0.20651349620689,  0.54593044066355}, {-0.62126416356920, -0.59893681700392},
-{ 0.20320105410437, -0.86879180355289}, {-0.97790548600584,  0.96290806999242},
-{ 0.11112534735126,  0.21484763313301}, {-0.41368337314182,  0.28216837680365},
-{ 0.24133038992960,  0.51294362630238}, {-0.66393410674885, -0.08249679629081},
-{-0.53697829178752, -0.97649903936228}, {-0.97224737889348,  0.22081333579837},
-{ 0.87392477144549, -0.12796173740361}, { 0.19050361015753,  0.01602615387195},
-{-0.46353441212724, -0.95249041539006}, {-0.07064096339021, -0.94479803205886},
-{-0.92444085484466, -0.10457590187436}, {-0.83822593578728, -0.01695043208885},
-{ 0.75214681811150, -0.99955681042665}, {-0.42102998829339,  0.99720941999394},
-{-0.72094786237696, -0.35008961934255}, { 0.78843311019251,  0.52851398958271},
-{ 0.97394027897442, -0.26695944086561}, { 0.99206463477946, -0.57010120849429},
-{ 0.76789609461795, -0.76519356730966}, {-0.82002421836409, -0.73530179553767},
-{ 0.81924990025724,  0.99698425250579}, {-0.26719850873357,  0.68903369776193},
-{-0.43311260380975,  0.85321815947490}, { 0.99194979673836,  0.91876249766422},
-{-0.80692001248487, -0.32627540663214}, { 0.43080003649976, -0.21919095636638},
-{ 0.67709491937357, -0.95478075822906}, { 0.56151770568316, -0.70693811747778},
-{ 0.10831862810749, -0.08628837174592}, { 0.91229417540436, -0.65987351408410},
-{-0.48972893932274,  0.56289246362686}, {-0.89033658689697, -0.71656563987082},
-{ 0.65269447475094,  0.65916004833932}, { 0.67439478141121, -0.81684380846796},
-{-0.47770832416973, -0.16789556203025}, {-0.99715979260878, -0.93565784007648},
-{-0.90889593602546,  0.62034397054380}, {-0.06618622548177, -0.23812217221359},
-{ 0.99430266919728,  0.18812555317553}, { 0.97686402381843, -0.28664534366620},
-{ 0.94813650221268, -0.97506640027128}, {-0.95434497492853, -0.79607978501983},
-{-0.49104783137150,  0.32895214359663}, { 0.99881175120751,  0.88993983831354},
-{ 0.50449166760303, -0.85995072408434}, { 0.47162891065108, -0.18680204049569},
-{-0.62081581361840,  0.75000676218956}, {-0.43867015250812,  0.99998069244322},
-{ 0.98630563232075, -0.53578899600662}, {-0.61510362277374, -0.89515019899997},
-{-0.03841517601843, -0.69888815681179}, {-0.30102157304644, -0.07667808922205},
-{ 0.41881284182683,  0.02188098922282}, {-0.86135454941237,  0.98947480909359},
-{ 0.67226861393788, -0.13494389011014}, {-0.70737398842068, -0.76547349325992},
-{ 0.94044946687963,  0.09026201157416}, {-0.82386352534327,  0.08924768823676},
-{-0.32070666698656,  0.50143421908753}, { 0.57593163224487, -0.98966422921509},
-{-0.36326018419965,  0.07440243123228}, { 0.99979044674350, -0.14130287347405},
-{-0.92366023326932, -0.97979298068180}, {-0.44607178518598, -0.54233252016394},
-{ 0.44226800932956,  0.71326756742752}, { 0.03671907158312,  0.63606389366675},
-{ 0.52175424682195, -0.85396826735705}, {-0.94701139690956, -0.01826348194255},
-{-0.98759606946049,  0.82288714303073}, { 0.87434794743625,  0.89399495655433},
-{-0.93412041758744,  0.41374052024363}, { 0.96063943315511,  0.93116709541280},
-{ 0.97534253457837,  0.86150930812689}, { 0.99642466504163,  0.70190043427512},
-{-0.94705089665984, -0.29580042814306}, { 0.91599807087376, -0.98147830385781},
-{-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
+/* First eight entries repeated at end to simplify SIMD implementations. */
+const DECLARE_ALIGNED(16, INTFLOAT, AAC_RENAME(ff_sbr_noise_table))[][2] = {
+{Q31(-0.99948153278296f), Q31(-0.59483417516607f)}, {Q31( 0.97113454393991f), Q31(-0.67528515225647f)},
+{Q31( 0.14130051758487f), Q31(-0.95090983575689f)}, {Q31(-0.47005496701697f), Q31(-0.37340549728647f)},
+{Q31( 0.80705063769351f), Q31( 0.29653668284408f)}, {Q31(-0.38981478896926f), Q31( 0.89572605717087f)},
+{Q31(-0.01053049862020f), Q31(-0.66959058036166f)}, {Q31(-0.91266367957293f), Q31(-0.11522938140034f)},
+{Q31( 0.54840422910309f), Q31( 0.75221367176302f)}, {Q31( 0.40009252867955f), Q31(-0.98929400334421f)},
+{Q31(-0.99867974711855f), Q31(-0.88147068645358f)}, {Q31(-0.95531076805040f), Q31( 0.90908757154593f)},
+{Q31(-0.45725933317144f), Q31(-0.56716323646760f)}, {Q31(-0.72929675029275f), Q31(-0.98008272727324f)},
+{Q31( 0.75622801399036f), Q31( 0.20950329995549f)}, {Q31( 0.07069442601050f), Q31(-0.78247898470706f)},
+{Q31( 0.74496252926055f), Q31(-0.91169004445807f)}, {Q31(-0.96440182703856f), Q31(-0.94739918296622f)},
+{Q31( 0.30424629369539f), Q31(-0.49438267012479f)}, {Q31( 0.66565033746925f), Q31( 0.64652935542491f)},
+{Q31( 0.91697008020594f), Q31( 0.17514097332009f)}, {Q31(-0.70774918760427f), Q31( 0.52548653416543f)},
+{Q31(-0.70051415345560f), Q31(-0.45340028808763f)}, {Q31(-0.99496513054797f), Q31(-0.90071908066973f)},
+{Q31( 0.98164490790123f), Q31(-0.77463155528697f)}, {Q31(-0.54671580548181f), Q31(-0.02570928536004f)},
+{Q31(-0.01689629065389f), Q31( 0.00287506445732f)}, {Q31(-0.86110349531986f), Q31( 0.42548583726477f)},
+{Q31(-0.98892980586032f), Q31(-0.87881132267556f)}, {Q31( 0.51756627678691f), Q31( 0.66926784710139f)},
+{Q31(-0.99635026409640f), Q31(-0.58107730574765f)}, {Q31(-0.99969370862163f), Q31( 0.98369989360250f)},
+{Q31( 0.55266258627194f), Q31( 0.59449057465591f)}, {Q31( 0.34581177741673f), Q31( 0.94879421061866f)},
+{Q31( 0.62664209577999f), Q31(-0.74402970906471f)}, {Q31(-0.77149701404973f), Q31(-0.33883658042801f)},
+{Q31(-0.91592244254432f), Q31( 0.03687901376713f)}, {Q31(-0.76285492357887f), Q31(-0.91371867919124f)},
+{Q31( 0.79788337195331f), Q31(-0.93180971199849f)}, {Q31( 0.54473080610200f), Q31(-0.11919206037186f)},
+{Q31(-0.85639281671058f), Q31( 0.42429854760451f)}, {Q31(-0.92882402971423f), Q31( 0.27871809078609f)},
+{Q31(-0.11708371046774f), Q31(-0.99800843444966f)}, {Q31( 0.21356749817493f), Q31(-0.90716295627033f)},
+{Q31(-0.76191692573909f), Q31( 0.99768118356265f)}, {Q31( 0.98111043100884f), Q31(-0.95854459734407f)},
+{Q31(-0.85913269895572f), Q31( 0.95766566168880f)}, {Q31(-0.93307242253692f), Q31( 0.49431757696466f)},
+{Q31( 0.30485754879632f), Q31(-0.70540034357529f)}, {Q31( 0.85289650925190f), Q31( 0.46766131791044f)},
+{Q31( 0.91328082618125f), Q31(-0.99839597361769f)}, {Q31(-0.05890199924154f), Q31( 0.70741827819497f)},
+{Q31( 0.28398686150148f), Q31( 0.34633555702188f)}, {Q31( 0.95258164539612f), Q31(-0.54893416026939f)},
+{Q31(-0.78566324168507f), Q31(-0.75568541079691f)}, {Q31(-0.95789495447877f), Q31(-0.20423194696966f)},
+{Q31( 0.82411158711197f), Q31( 0.96654618432562f)}, {Q31(-0.65185446735885f), Q31(-0.88734990773289f)},
+{Q31(-0.93643603134666f), Q31( 0.99870790442385f)}, {Q31( 0.91427159529618f), Q31(-0.98290505544444f)},
+{Q31(-0.70395684036886f), Q31( 0.58796798221039f)}, {Q31( 0.00563771969365f), Q31( 0.61768196727244f)},
+{Q31( 0.89065051931895f), Q31( 0.52783352697585f)}, {Q31(-0.68683707712762f), Q31( 0.80806944710339f)},
+{Q31( 0.72165342518718f), Q31(-0.69259857349564f)}, {Q31(-0.62928247730667f), Q31( 0.13627037407335f)},
+{Q31( 0.29938434065514f), Q31(-0.46051329682246f)}, {Q31(-0.91781958879280f), Q31(-0.74012716684186f)},
+{Q31( 0.99298717043688f), Q31( 0.40816610075661f)}, {Q31( 0.82368298622748f), Q31(-0.74036047190173f)},
+{Q31(-0.98512833386833f), Q31(-0.99972330709594f)}, {Q31(-0.95915368242257f), Q31(-0.99237800466040f)},
+{Q31(-0.21411126572790f), Q31(-0.93424819052545f)}, {Q31(-0.68821476106884f), Q31(-0.26892306315457f)},
+{Q31( 0.91851997982317f), Q31( 0.09358228901785f)}, {Q31(-0.96062769559127f), Q31( 0.36099095133739f)},
+{Q31( 0.51646184922287f), Q31(-0.71373332873917f)}, {Q31( 0.61130721139669f), Q31( 0.46950141175917f)},
+{Q31( 0.47336129371299f), Q31(-0.27333178296162f)}, {Q31( 0.90998308703519f), Q31( 0.96715662938132f)},
+{Q31( 0.44844799194357f), Q31( 0.99211574628306f)}, {Q31( 0.66614891079092f), Q31( 0.96590176169121f)},
+{Q31( 0.74922239129237f), Q31(-0.89879858826087f)}, {Q31(-0.99571588506485f), Q31( 0.52785521494349f)},
+{Q31( 0.97401082477563f), Q31(-0.16855870075190f)}, {Q31( 0.72683747733879f), Q31(-0.48060774432251f)},
+{Q31( 0.95432193457128f), Q31( 0.68849603408441f)}, {Q31(-0.72962208425191f), Q31(-0.76608443420917f)},
+{Q31(-0.85359479233537f), Q31( 0.88738125901579f)}, {Q31(-0.81412430338535f), Q31(-0.97480768049637f)},
+{Q31(-0.87930772356786f), Q31( 0.74748307690436f)}, {Q31(-0.71573331064977f), Q31(-0.98570608178923f)},
+{Q31( 0.83524300028228f), Q31( 0.83702537075163f)}, {Q31(-0.48086065601423f), Q31(-0.98848504923531f)},
+{Q31( 0.97139128574778f), Q31( 0.80093621198236f)}, {Q31( 0.51992825347895f), Q31( 0.80247631400510f)},
+{Q31(-0.00848591195325f), Q31(-0.76670128000486f)}, {Q31(-0.70294374303036f), Q31( 0.55359910445577f)},
+{Q31(-0.95894428168140f), Q31(-0.43265504344783f)}, {Q31( 0.97079252950321f), Q31( 0.09325857238682f)},
+{Q31(-0.92404293670797f), Q31( 0.85507704027855f)}, {Q31(-0.69506469500450f), Q31( 0.98633412625459f)},
+{Q31( 0.26559203620024f), Q31( 0.73314307966524f)}, {Q31( 0.28038443336943f), Q31( 0.14537913654427f)},
+{Q31(-0.74138124825523f), Q31( 0.99310339807762f)}, {Q31(-0.01752795995444f), Q31(-0.82616635284178f)},
+{Q31(-0.55126773094930f), Q31(-0.98898543862153f)}, {Q31( 0.97960898850996f), Q31(-0.94021446752851f)},
+{Q31(-0.99196309146936f), Q31( 0.67019017358456f)}, {Q31(-0.67684928085260f), Q31( 0.12631491649378f)},
+{Q31( 0.09140039465500f), Q31(-0.20537731453108f)}, {Q31(-0.71658965751996f), Q31(-0.97788200391224f)},
+{Q31( 0.81014640078925f), Q31( 0.53722648362443f)}, {Q31( 0.40616991671205f), Q31(-0.26469008598449f)},
+{Q31(-0.67680188682972f), Q31( 0.94502052337695f)}, {Q31( 0.86849774348749f), Q31(-0.18333598647899f)},
+{Q31(-0.99500381284851f), Q31(-0.02634122068550f)}, {Q31( 0.84329189340667f), Q31( 0.10406957462213f)},
+{Q31(-0.09215968531446f), Q31( 0.69540012101253f)}, {Q31( 0.99956173327206f), Q31(-0.12358542001404f)},
+{Q31(-0.79732779473535f), Q31(-0.91582524736159f)}, {Q31( 0.96349973642406f), Q31( 0.96640458041000f)},
+{Q31(-0.79942778496547f), Q31( 0.64323902822857f)}, {Q31(-0.11566039853896f), Q31( 0.28587846253726f)},
+{Q31(-0.39922954514662f), Q31( 0.94129601616966f)}, {Q31( 0.99089197565987f), Q31(-0.92062625581587f)},
+{Q31( 0.28631285179909f), Q31(-0.91035047143603f)}, {Q31(-0.83302725605608f), Q31(-0.67330410892084f)},
+{Q31( 0.95404443402072f), Q31( 0.49162765398743f)}, {Q31(-0.06449863579434f), Q31( 0.03250560813135f)},
+{Q31(-0.99575054486311f), Q31( 0.42389784469507f)}, {Q31(-0.65501142790847f), Q31( 0.82546114655624f)},
+{Q31(-0.81254441908887f), Q31(-0.51627234660629f)}, {Q31(-0.99646369485481f), Q31( 0.84490533520752f)},
+{Q31( 0.00287840603348f), Q31( 0.64768261158166f)}, {Q31( 0.70176989408455f), Q31(-0.20453028573322f)},
+{Q31( 0.96361882270190f), Q31( 0.40706967140989f)}, {Q31(-0.68883758192426f), Q31( 0.91338958840772f)},
+{Q31(-0.34875585502238f), Q31( 0.71472290693300f)}, {Q31( 0.91980081243087f), Q31( 0.66507455644919f)},
+{Q31(-0.99009048343881f), Q31( 0.85868021604848f)}, {Q31( 0.68865791458395f), Q31( 0.55660316809678f)},
+{Q31(-0.99484402129368f), Q31(-0.20052559254934f)}, {Q31( 0.94214511408023f), Q31(-0.99696425367461f)},
+{Q31(-0.67414626793544f), Q31( 0.49548221180078f)}, {Q31(-0.47339353684664f), Q31(-0.85904328834047f)},
+{Q31( 0.14323651387360f), Q31(-0.94145598222488f)}, {Q31(-0.29268293575672f), Q31( 0.05759224927952f)},
+{Q31( 0.43793861458754f), Q31(-0.78904969892724f)}, {Q31(-0.36345126374441f), Q31( 0.64874435357162f)},
+{Q31(-0.08750604656825f), Q31( 0.97686944362527f)}, {Q31(-0.96495267812511f), Q31(-0.53960305946511f)},
+{Q31( 0.55526940659947f), Q31( 0.78891523734774f)}, {Q31( 0.73538215752630f), Q31( 0.96452072373404f)},
+{Q31(-0.30889773919437f), Q31(-0.80664389776860f)}, {Q31( 0.03574995626194f), Q31(-0.97325616900959f)},
+{Q31( 0.98720684660488f), Q31( 0.48409133691962f)}, {Q31(-0.81689296271203f), Q31(-0.90827703628298f)},
+{Q31( 0.67866860118215f), Q31( 0.81284503870856f)}, {Q31(-0.15808569732583f), Q31( 0.85279555024382f)},
+{Q31( 0.80723395114371f), Q31(-0.24717418514605f)}, {Q31( 0.47788757329038f), Q31(-0.46333147839295f)},
+{Q31( 0.96367554763201f), Q31( 0.38486749303242f)}, {Q31(-0.99143875716818f), Q31(-0.24945277239809f)},
+{Q31( 0.83081876925833f), Q31(-0.94780851414763f)}, {Q31(-0.58753191905341f), Q31( 0.01290772389163f)},
+{Q31( 0.95538108220960f), Q31(-0.85557052096538f)}, {Q31(-0.96490920476211f), Q31(-0.64020970923102f)},
+{Q31(-0.97327101028521f), Q31( 0.12378128133110f)}, {Q31( 0.91400366022124f), Q31( 0.57972471346930f)},
+{Q31(-0.99925837363824f), Q31( 0.71084847864067f)}, {Q31(-0.86875903507313f), Q31(-0.20291699203564f)},
+{Q31(-0.26240034795124f), Q31(-0.68264554369108f)}, {Q31(-0.24664412953388f), Q31(-0.87642273115183f)},
+{Q31( 0.02416275806869f), Q31( 0.27192914288905f)}, {Q31( 0.82068619590515f), Q31(-0.85087787994476f)},
+{Q31( 0.88547373760759f), Q31(-0.89636802901469f)}, {Q31(-0.18173078152226f), Q31(-0.26152145156800f)},
+{Q31( 0.09355476558534f), Q31( 0.54845123045604f)}, {Q31(-0.54668414224090f), Q31( 0.95980774020221f)},
+{Q31( 0.37050990604091f), Q31(-0.59910140383171f)}, {Q31(-0.70373594262891f), Q31( 0.91227665827081f)},
+{Q31(-0.34600785879594f), Q31(-0.99441426144200f)}, {Q31(-0.68774481731008f), Q31(-0.30238837956299f)},
+{Q31(-0.26843291251234f), Q31( 0.83115668004362f)}, {Q31( 0.49072334613242f), Q31(-0.45359708737775f)},
+{Q31( 0.38975993093975f), Q31( 0.95515358099121f)}, {Q31(-0.97757125224150f), Q31( 0.05305894580606f)},
+{Q31(-0.17325552859616f), Q31(-0.92770672250494f)}, {Q31( 0.99948035025744f), Q31( 0.58285545563426f)},
+{Q31(-0.64946246527458f), Q31( 0.68645507104960f)}, {Q31(-0.12016920576437f), Q31(-0.57147322153312f)},
+{Q31(-0.58947456517751f), Q31(-0.34847132454388f)}, {Q31(-0.41815140454465f), Q31( 0.16276422358861f)},
+{Q31( 0.99885650204884f), Q31( 0.11136095490444f)}, {Q31(-0.56649614128386f), Q31(-0.90494866361587f)},
+{Q31( 0.94138021032330f), Q31( 0.35281916733018f)}, {Q31(-0.75725076534641f), Q31( 0.53650549640587f)},
+{Q31( 0.20541973692630f), Q31(-0.94435144369918f)}, {Q31( 0.99980371023351f), Q31( 0.79835913565599f)},
+{Q31( 0.29078277605775f), Q31( 0.35393777921520f)}, {Q31(-0.62858772103030f), Q31( 0.38765693387102f)},
+{Q31( 0.43440904467688f), Q31(-0.98546330463232f)}, {Q31(-0.98298583762390f), Q31( 0.21021524625209f)},
+{Q31( 0.19513029146934f), Q31(-0.94239832251867f)}, {Q31(-0.95476662400101f), Q31( 0.98364554179143f)},
+{Q31( 0.93379635304810f), Q31(-0.70881994583682f)}, {Q31(-0.85235410573336f), Q31(-0.08342347966410f)},
+{Q31(-0.86425093011245f), Q31(-0.45795025029466f)}, {Q31( 0.38879779059045f), Q31( 0.97274429344593f)},
+{Q31( 0.92045124735495f), Q31(-0.62433652524220f)}, {Q31( 0.89162532251878f), Q31( 0.54950955570563f)},
+{Q31(-0.36834336949252f), Q31( 0.96458298020975f)}, {Q31( 0.93891760988045f), Q31(-0.89968353740388f)},
+{Q31( 0.99267657565094f), Q31(-0.03757034316958f)}, {Q31(-0.94063471614176f), Q31( 0.41332338538963f)},
+{Q31( 0.99740224117019f), Q31(-0.16830494996370f)}, {Q31(-0.35899413170555f), Q31(-0.46633226649613f)},
+{Q31( 0.05237237274947f), Q31(-0.25640361602661f)}, {Q31( 0.36703583957424f), Q31(-0.38653265641875f)},
+{Q31( 0.91653180367913f), Q31(-0.30587628726597f)}, {Q31( 0.69000803499316f), Q31( 0.90952171386132f)},
+{Q31(-0.38658751133527f), Q31( 0.99501571208985f)}, {Q31(-0.29250814029851f), Q31( 0.37444994344615f)},
+{Q31(-0.60182204677608f), Q31( 0.86779651036123f)}, {Q31(-0.97418588163217f), Q31( 0.96468523666475f)},
+{Q31( 0.88461574003963f), Q31( 0.57508405276414f)}, {Q31( 0.05198933055162f), Q31( 0.21269661669964f)},
+{Q31(-0.53499621979720f), Q31( 0.97241553731237f)}, {Q31(-0.49429560226497f), Q31( 0.98183865291903f)},
+{Q31(-0.98935142339139f), Q31(-0.40249159006933f)}, {Q31(-0.98081380091130f), Q31(-0.72856895534041f)},
+{Q31(-0.27338148835532f), Q31( 0.99950922447209f)}, {Q31( 0.06310802338302f), Q31(-0.54539587529618f)},
+{Q31(-0.20461677199539f), Q31(-0.14209977628489f)}, {Q31( 0.66223843141647f), Q31( 0.72528579940326f)},
+{Q31(-0.84764345483665f), Q31( 0.02372316801261f)}, {Q31(-0.89039863483811f), Q31( 0.88866581484602f)},
+{Q31( 0.95903308477986f), Q31( 0.76744927173873f)}, {Q31( 0.73504123909879f), Q31(-0.03747203173192f)},
+{Q31(-0.31744434966056f), Q31(-0.36834111883652f)}, {Q31(-0.34110827591623f), Q31( 0.40211222807691f)},
+{Q31( 0.47803883714199f), Q31(-0.39423219786288f)}, {Q31( 0.98299195879514f), Q31( 0.01989791390047f)},
+{Q31(-0.30963073129751f), Q31(-0.18076720599336f)}, {Q31( 0.99992588229018f), Q31(-0.26281872094289f)},
+{Q31(-0.93149731080767f), Q31(-0.98313162570490f)}, {Q31( 0.99923472302773f), Q31(-0.80142993767554f)},
+{Q31(-0.26024169633417f), Q31(-0.75999759855752f)}, {Q31(-0.35712514743563f), Q31( 0.19298963768574f)},
+{Q31(-0.99899084509530f), Q31( 0.74645156992493f)}, {Q31( 0.86557171579452f), Q31( 0.55593866696299f)},
+{Q31( 0.33408042438752f), Q31( 0.86185953874709f)}, {Q31( 0.99010736374716f), Q31( 0.04602397576623f)},
+{Q31(-0.66694269691195f), Q31(-0.91643611810148f)}, {Q31( 0.64016792079480f), Q31( 0.15649530836856f)},
+{Q31( 0.99570534804836f), Q31( 0.45844586038111f)}, {Q31(-0.63431466947340f), Q31( 0.21079116459234f)},
+{Q31(-0.07706847005931f), Q31(-0.89581437101329f)}, {Q31( 0.98590090577724f), Q31( 0.88241721133981f)},
+{Q31( 0.80099335254678f), Q31(-0.36851896710853f)}, {Q31( 0.78368131392666f), Q31( 0.45506999802597f)},
+{Q31( 0.08707806671691f), Q31( 0.80938994918745f)}, {Q31(-0.86811883080712f), Q31( 0.39347308654705f)},
+{Q31(-0.39466529740375f), Q31(-0.66809432114456f)}, {Q31( 0.97875325649683f), Q31(-0.72467840967746f)},
+{Q31(-0.95038560288864f), Q31( 0.89563219587625f)}, {Q31( 0.17005239424212f), Q31( 0.54683053962658f)},
+{Q31(-0.76910792026848f), Q31(-0.96226617549298f)}, {Q31( 0.99743281016846f), Q31( 0.42697157037567f)},
+{Q31( 0.95437383549973f), Q31( 0.97002324109952f)}, {Q31( 0.99578905365569f), Q31(-0.54106826257356f)},
+{Q31( 0.28058259829990f), Q31(-0.85361420634036f)}, {Q31( 0.85256524470573f), Q31(-0.64567607735589f)},
+{Q31(-0.50608540105128f), Q31(-0.65846015480300f)}, {Q31(-0.97210735183243f), Q31(-0.23095213067791f)},
+{Q31( 0.95424048234441f), Q31(-0.99240147091219f)}, {Q31(-0.96926570524023f), Q31( 0.73775654896574f)},
+{Q31( 0.30872163214726f), Q31( 0.41514960556126f)}, {Q31(-0.24523839572639f), Q31( 0.63206633394807f)},
+{Q31(-0.33813265086024f), Q31(-0.38661779441897f)}, {Q31(-0.05826828420146f), Q31(-0.06940774188029f)},
+{Q31(-0.22898461455054f), Q31( 0.97054853316316f)}, {Q31(-0.18509915019881f), Q31( 0.47565762892084f)},
+{Q31(-0.10488238045009f), Q31(-0.87769947402394f)}, {Q31(-0.71886586182037f), Q31( 0.78030982480538f)},
+{Q31( 0.99793873738654f), Q31( 0.90041310491497f)}, {Q31( 0.57563307626120f), Q31(-0.91034337352097f)},
+{Q31( 0.28909646383717f), Q31( 0.96307783970534f)}, {Q31( 0.42188998312520f), Q31( 0.48148651230437f)},
+{Q31( 0.93335049681047f), Q31(-0.43537023883588f)}, {Q31(-0.97087374418267f), Q31( 0.86636445711364f)},
+{Q31( 0.36722871286923f), Q31( 0.65291654172961f)}, {Q31(-0.81093025665696f), Q31( 0.08778370229363f)},
+{Q31(-0.26240603062237f), Q31(-0.92774095379098f)}, {Q31( 0.83996497984604f), Q31( 0.55839849139647f)},
+{Q31(-0.99909615720225f), Q31(-0.96024605713970f)}, {Q31( 0.74649464155061f), Q31( 0.12144893606462f)},
+{Q31(-0.74774595569805f), Q31(-0.26898062008959f)}, {Q31( 0.95781667469567f), Q31(-0.79047927052628f)},
+{Q31( 0.95472308713099f), Q31(-0.08588776019550f)}, {Q31( 0.48708332746299f), Q31( 0.99999041579432f)},
+{Q31( 0.46332038247497f), Q31( 0.10964126185063f)}, {Q31(-0.76497004940162f), Q31( 0.89210929242238f)},
+{Q31( 0.57397389364339f), Q31( 0.35289703373760f)}, {Q31( 0.75374316974495f), Q31( 0.96705214651335f)},
+{Q31(-0.59174397685714f), Q31(-0.89405370422752f)}, {Q31( 0.75087906691890f), Q31(-0.29612672982396f)},
+{Q31(-0.98607857336230f), Q31( 0.25034911730023f)}, {Q31(-0.40761056640505f), Q31(-0.90045573444695f)},
+{Q31( 0.66929266740477f), Q31( 0.98629493401748f)}, {Q31(-0.97463695257310f), Q31(-0.00190223301301f)},
+{Q31( 0.90145509409859f), Q31( 0.99781390365446f)}, {Q31(-0.87259289048043f), Q31( 0.99233587353666f)},
+{Q31(-0.91529461447692f), Q31(-0.15698707534206f)}, {Q31(-0.03305738840705f), Q31(-0.37205262859764f)},
+{Q31( 0.07223051368337f), Q31(-0.88805001733626f)}, {Q31( 0.99498012188353f), Q31( 0.97094358113387f)},
+{Q31(-0.74904939500519f), Q31( 0.99985483641521f)}, {Q31( 0.04585228574211f), Q31( 0.99812337444082f)},
+{Q31(-0.89054954257993f), Q31(-0.31791913188064f)}, {Q31(-0.83782144651251f), Q31( 0.97637632547466f)},
+{Q31( 0.33454804933804f), Q31(-0.86231516800408f)}, {Q31(-0.99707579362824f), Q31( 0.93237990079441f)},
+{Q31(-0.22827527843994f), Q31( 0.18874759397997f)}, {Q31( 0.67248046289143f), Q31(-0.03646211390569f)},
+{Q31(-0.05146538187944f), Q31(-0.92599700120679f)}, {Q31( 0.99947295749905f), Q31( 0.93625229707912f)},
+{Q31( 0.66951124390363f), Q31( 0.98905825623893f)}, {Q31(-0.99602956559179f), Q31(-0.44654715757688f)},
+{Q31( 0.82104905483590f), Q31( 0.99540741724928f)}, {Q31( 0.99186510988782f), Q31( 0.72023001312947f)},
+{Q31(-0.65284592392918f), Q31( 0.52186723253637f)}, {Q31( 0.93885443798188f), Q31(-0.74895312615259f)},
+{Q31( 0.96735248738388f), Q31( 0.90891816978629f)}, {Q31(-0.22225968841114f), Q31( 0.57124029781228f)},
+{Q31(-0.44132783753414f), Q31(-0.92688840659280f)}, {Q31(-0.85694974219574f), Q31( 0.88844532719844f)},
+{Q31( 0.91783042091762f), Q31(-0.46356892383970f)}, {Q31( 0.72556974415690f), Q31(-0.99899555770747f)},
+{Q31(-0.99711581834508f), Q31( 0.58211560180426f)}, {Q31( 0.77638976371966f), Q31( 0.94321834873819f)},
+{Q31( 0.07717324253925f), Q31( 0.58638399856595f)}, {Q31(-0.56049829194163f), Q31( 0.82522301569036f)},
+{Q31( 0.98398893639988f), Q31( 0.39467440420569f)}, {Q31( 0.47546946844938f), Q31( 0.68613044836811f)},
+{Q31( 0.65675089314631f), Q31( 0.18331637134880f)}, {Q31( 0.03273375457980f), Q31(-0.74933109564108f)},
+{Q31(-0.38684144784738f), Q31( 0.51337349030406f)}, {Q31(-0.97346267944545f), Q31(-0.96549364384098f)},
+{Q31(-0.53282156061942f), Q31(-0.91423265091354f)}, {Q31( 0.99817310731176f), Q31( 0.61133572482148f)},
+{Q31(-0.50254500772635f), Q31(-0.88829338134294f)}, {Q31( 0.01995873238855f), Q31( 0.85223515096765f)},
+{Q31( 0.99930381973804f), Q31( 0.94578896296649f)}, {Q31( 0.82907767600783f), Q31(-0.06323442598128f)},
+{Q31(-0.58660709669728f), Q31( 0.96840773806582f)}, {Q31(-0.17573736667267f), Q31(-0.48166920859485f)},
+{Q31( 0.83434292401346f), Q31(-0.13023450646997f)}, {Q31( 0.05946491307025f), Q31( 0.20511047074866f)},
+{Q31( 0.81505484574602f), Q31(-0.94685947861369f)}, {Q31(-0.44976380954860f), Q31( 0.40894572671545f)},
+{Q31(-0.89746474625671f), Q31( 0.99846578838537f)}, {Q31( 0.39677256130792f), Q31(-0.74854668609359f)},
+{Q31(-0.07588948563079f), Q31( 0.74096214084170f)}, {Q31( 0.76343198951445f), Q31( 0.41746629422634f)},
+{Q31(-0.74490104699626f), Q31( 0.94725911744610f)}, {Q31( 0.64880119792759f), Q31( 0.41336660830571f)},
+{Q31( 0.62319537462542f), Q31(-0.93098313552599f)}, {Q31( 0.42215817594807f), Q31(-0.07712787385208f)},
+{Q31( 0.02704554141885f), Q31(-0.05417518053666f)}, {Q31( 0.80001773566818f), Q31( 0.91542195141039f)},
+{Q31(-0.79351832348816f), Q31(-0.36208897989136f)}, {Q31( 0.63872359151636f), Q31( 0.08128252493444f)},
+{Q31( 0.52890520960295f), Q31( 0.60048872455592f)}, {Q31( 0.74238552914587f), Q31( 0.04491915291044f)},
+{Q31( 0.99096131449250f), Q31(-0.19451182854402f)}, {Q31(-0.80412329643109f), Q31(-0.88513818199457f)},
+{Q31(-0.64612616129736f), Q31( 0.72198674804544f)}, {Q31( 0.11657770663191f), Q31(-0.83662833815041f)},
+{Q31(-0.95053182488101f), Q31(-0.96939905138082f)}, {Q31(-0.62228872928622f), Q31( 0.82767262846661f)},
+{Q31( 0.03004475787316f), Q31(-0.99738896333384f)}, {Q31(-0.97987214341034f), Q31( 0.36526129686425f)},
+{Q31(-0.99986980746200f), Q31(-0.36021610299715f)}, {Q31( 0.89110648599879f), Q31(-0.97894250343044f)},
+{Q31( 0.10407960510582f), Q31( 0.77357793811619f)}, {Q31( 0.95964737821728f), Q31(-0.35435818285502f)},
+{Q31( 0.50843233159162f), Q31( 0.96107691266205f)}, {Q31( 0.17006334670615f), Q31(-0.76854025314829f)},
+{Q31( 0.25872675063360f), Q31( 0.99893303933816f)}, {Q31(-0.01115998681937f), Q31( 0.98496019742444f)},
+{Q31(-0.79598702973261f), Q31( 0.97138411318894f)}, {Q31(-0.99264708948101f), Q31(-0.99542822402536f)},
+{Q31(-0.99829663752818f), Q31( 0.01877138824311f)}, {Q31(-0.70801016548184f), Q31( 0.33680685948117f)},
+{Q31(-0.70467057786826f), Q31( 0.93272777501857f)}, {Q31( 0.99846021905254f), Q31(-0.98725746254433f)},
+{Q31(-0.63364968534650f), Q31(-0.16473594423746f)}, {Q31(-0.16258217500792f), Q31(-0.95939125400802f)},
+{Q31(-0.43645594360633f), Q31(-0.94805030113284f)}, {Q31(-0.99848471702976f), Q31( 0.96245166923809f)},
+{Q31(-0.16796458968998f), Q31(-0.98987511890470f)}, {Q31(-0.87979225745213f), Q31(-0.71725725041680f)},
+{Q31( 0.44183099021786f), Q31(-0.93568974498761f)}, {Q31( 0.93310180125532f), Q31(-0.99913308068246f)},
+{Q31(-0.93941931782002f), Q31(-0.56409379640356f)}, {Q31(-0.88590003188677f), Q31( 0.47624600491382f)},
+{Q31( 0.99971463703691f), Q31(-0.83889954253462f)}, {Q31(-0.75376385639978f), Q31( 0.00814643438625f)},
+{Q31( 0.93887685615875f), Q31(-0.11284528204636f)}, {Q31( 0.85126435782309f), Q31( 0.52349251543547f)},
+{Q31( 0.39701421446381f), Q31( 0.81779634174316f)}, {Q31(-0.37024464187437f), Q31(-0.87071656222959f)},
+{Q31(-0.36024828242896f), Q31( 0.34655735648287f)}, {Q31(-0.93388812549209f), Q31(-0.84476541096429f)},
+{Q31(-0.65298804552119f), Q31(-0.18439575450921f)}, {Q31( 0.11960319006843f), Q31( 0.99899346780168f)},
+{Q31( 0.94292565553160f), Q31( 0.83163906518293f)}, {Q31( 0.75081145286948f), Q31(-0.35533223142265f)},
+{Q31( 0.56721979748394f), Q31(-0.24076836414499f)}, {Q31( 0.46857766746029f), Q31(-0.30140233457198f)},
+{Q31( 0.97312313923635f), Q31(-0.99548191630031f)}, {Q31(-0.38299976567017f), Q31( 0.98516909715427f)},
+{Q31( 0.41025800019463f), Q31( 0.02116736935734f)}, {Q31( 0.09638062008048f), Q31( 0.04411984381457f)},
+{Q31(-0.85283249275397f), Q31( 0.91475563922421f)}, {Q31( 0.88866808958124f), Q31(-0.99735267083226f)},
+{Q31(-0.48202429536989f), Q31(-0.96805608884164f)}, {Q31( 0.27572582416567f), Q31( 0.58634753335832f)},
+{Q31(-0.65889129659168f), Q31( 0.58835634138583f)}, {Q31( 0.98838086953732f), Q31( 0.99994349600236f)},
+{Q31(-0.20651349620689f), Q31( 0.54593044066355f)}, {Q31(-0.62126416356920f), Q31(-0.59893681700392f)},
+{Q31( 0.20320105410437f), Q31(-0.86879180355289f)}, {Q31(-0.97790548600584f), Q31( 0.96290806999242f)},
+{Q31( 0.11112534735126f), Q31( 0.21484763313301f)}, {Q31(-0.41368337314182f), Q31( 0.28216837680365f)},
+{Q31( 0.24133038992960f), Q31( 0.51294362630238f)}, {Q31(-0.66393410674885f), Q31(-0.08249679629081f)},
+{Q31(-0.53697829178752f), Q31(-0.97649903936228f)}, {Q31(-0.97224737889348f), Q31( 0.22081333579837f)},
+{Q31( 0.87392477144549f), Q31(-0.12796173740361f)}, {Q31( 0.19050361015753f), Q31( 0.01602615387195f)},
+{Q31(-0.46353441212724f), Q31(-0.95249041539006f)}, {Q31(-0.07064096339021f), Q31(-0.94479803205886f)},
+{Q31(-0.92444085484466f), Q31(-0.10457590187436f)}, {Q31(-0.83822593578728f), Q31(-0.01695043208885f)},
+{Q31( 0.75214681811150f), Q31(-0.99955681042665f)}, {Q31(-0.42102998829339f), Q31( 0.99720941999394f)},
+{Q31(-0.72094786237696f), Q31(-0.35008961934255f)}, {Q31( 0.78843311019251f), Q31( 0.52851398958271f)},
+{Q31( 0.97394027897442f), Q31(-0.26695944086561f)}, {Q31( 0.99206463477946f), Q31(-0.57010120849429f)},
+{Q31( 0.76789609461795f), Q31(-0.76519356730966f)}, {Q31(-0.82002421836409f), Q31(-0.73530179553767f)},
+{Q31( 0.81924990025724f), Q31( 0.99698425250579f)}, {Q31(-0.26719850873357f), Q31( 0.68903369776193f)},
+{Q31(-0.43311260380975f), Q31( 0.85321815947490f)}, {Q31( 0.99194979673836f), Q31( 0.91876249766422f)},
+{Q31(-0.80692001248487f), Q31(-0.32627540663214f)}, {Q31( 0.43080003649976f), Q31(-0.21919095636638f)},
+{Q31( 0.67709491937357f), Q31(-0.95478075822906f)}, {Q31( 0.56151770568316f), Q31(-0.70693811747778f)},
+{Q31( 0.10831862810749f), Q31(-0.08628837174592f)}, {Q31( 0.91229417540436f), Q31(-0.65987351408410f)},
+{Q31(-0.48972893932274f), Q31( 0.56289246362686f)}, {Q31(-0.89033658689697f), Q31(-0.71656563987082f)},
+{Q31( 0.65269447475094f), Q31( 0.65916004833932f)}, {Q31( 0.67439478141121f), Q31(-0.81684380846796f)},
+{Q31(-0.47770832416973f), Q31(-0.16789556203025f)}, {Q31(-0.99715979260878f), Q31(-0.93565784007648f)},
+{Q31(-0.90889593602546f), Q31( 0.62034397054380f)}, {Q31(-0.06618622548177f), Q31(-0.23812217221359f)},
+{Q31( 0.99430266919728f), Q31( 0.18812555317553f)}, {Q31( 0.97686402381843f), Q31(-0.28664534366620f)},
+{Q31( 0.94813650221268f), Q31(-0.97506640027128f)}, {Q31(-0.95434497492853f), Q31(-0.79607978501983f)},
+{Q31(-0.49104783137150f), Q31( 0.32895214359663f)}, {Q31( 0.99881175120751f), Q31( 0.88993983831354f)},
+{Q31( 0.50449166760303f), Q31(-0.85995072408434f)}, {Q31( 0.47162891065108f), Q31(-0.18680204049569f)},
+{Q31(-0.62081581361840f), Q31( 0.75000676218956f)}, {Q31(-0.43867015250812f), Q31( 0.99998069244322f)},
+{Q31( 0.98630563232075f), Q31(-0.53578899600662f)}, {Q31(-0.61510362277374f), Q31(-0.89515019899997f)},
+{Q31(-0.03841517601843f), Q31(-0.69888815681179f)}, {Q31(-0.30102157304644f), Q31(-0.07667808922205f)},
+{Q31( 0.41881284182683f), Q31( 0.02188098922282f)}, {Q31(-0.86135454941237f), Q31( 0.98947480909359f)},
+{Q31( 0.67226861393788f), Q31(-0.13494389011014f)}, {Q31(-0.70737398842068f), Q31(-0.76547349325992f)},
+{Q31( 0.94044946687963f), Q31( 0.09026201157416f)}, {Q31(-0.82386352534327f), Q31( 0.08924768823676f)},
+{Q31(-0.32070666698656f), Q31( 0.50143421908753f)}, {Q31( 0.57593163224487f), Q31(-0.98966422921509f)},
+{Q31(-0.36326018419965f), Q31( 0.07440243123228f)}, {Q31( 0.99979044674350f), Q31(-0.14130287347405f)},
+{Q31(-0.92366023326932f), Q31(-0.97979298068180f)}, {Q31(-0.44607178518598f), Q31(-0.54233252016394f)},
+{Q31( 0.44226800932956f), Q31( 0.71326756742752f)}, {Q31( 0.03671907158312f), Q31( 0.63606389366675f)},
+{Q31( 0.52175424682195f), Q31(-0.85396826735705f)}, {Q31(-0.94701139690956f), Q31(-0.01826348194255f)},
+{Q31(-0.98759606946049f), Q31( 0.82288714303073f)}, {Q31( 0.87434794743625f), Q31( 0.89399495655433f)},
+{Q31(-0.93412041758744f), Q31( 0.41374052024363f)}, {Q31( 0.96063943315511f), Q31( 0.93116709541280f)},
+{Q31( 0.97534253457837f), Q31( 0.86150930812689f)}, {Q31( 0.99642466504163f), Q31( 0.70190043427512f)},
+{Q31(-0.94705089665984f), Q31(-0.29580042814306f)}, {Q31( 0.91599807087376f), Q31(-0.98147830385781f)},
+// Start of duplicated table
+{Q31(-0.99948153278296f), Q31(-0.59483417516607f)}, {Q31( 0.97113454393991f), Q31(-0.67528515225647f)},
+{Q31( 0.14130051758487f), Q31(-0.95090983575689f)}, {Q31(-0.47005496701697f), Q31(-0.37340549728647f)},
+{Q31( 0.80705063769351f), Q31( 0.29653668284408f)}, {Q31(-0.38981478896926f), Q31( 0.89572605717087f)},
+{Q31(-0.01053049862020f), Q31(-0.66959058036166f)}, {Q31(-0.91266367957293f), Q31(-0.11522938140034f)},
 };
 
 #endif /* AVCODEC_AACSBRDATA_H */
diff --git a/libavcodec/aactab.c b/libavcodec/aactab.c
index 9f1e8af..df551b0 100644
--- a/libavcodec/aactab.c
+++ b/libavcodec/aactab.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,17 +29,27 @@
 
 #include "libavutil/mem.h"
 #include "aac.h"
-#include "aac_tablegen.h"
 
 #include <stdint.h>
 
+float ff_aac_pow2sf_tab[428];
+float ff_aac_pow34sf_tab[428];
+
 DECLARE_ALIGNED(32, float,  ff_aac_kbd_long_1024)[1024];
 DECLARE_ALIGNED(32, float,  ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, float,  ff_aac_kbd_long_960)[960];
+DECLARE_ALIGNED(32, float,  ff_aac_kbd_short_120)[120];
+DECLARE_ALIGNED(32, int,    ff_aac_kbd_long_1024_fixed)[1024];
+DECLARE_ALIGNED(32, int,    ff_aac_kbd_short_128_fixed)[128];
 
 const uint8_t ff_aac_num_swb_1024[] = {
     41, 41, 47, 49, 49, 51, 47, 47, 43, 43, 43, 40, 40
 };
 
+const uint8_t ff_aac_num_swb_960[] = {
+    40, 40, 46, 49, 49, 49, 46, 46, 42, 42, 42, 40, 40
+};
+
 const uint8_t ff_aac_num_swb_512[] = {
      0,  0,  0, 36, 36, 37, 31, 31,  0,  0,  0,  0,  0
 };
@@ -52,6 +62,10 @@ const uint8_t ff_aac_num_swb_128[] = {
     12, 12, 12, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15
 };
 
+const uint8_t ff_aac_num_swb_120[] = {
+    12, 12, 12, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15
+};
+
 const uint8_t ff_aac_pred_sfb_max[] = {
     33, 33, 38, 40, 40, 40, 41, 41, 37, 37, 37, 34, 34
 };
@@ -1225,6 +1239,100 @@ static const uint16_t swb_offset_128_8[] = {
     36,  44,  52,  60,  72,  88, 108, 128
 };
 
+static const uint16_t swb_offset_960_96[] =
+{
+    0,   4,   8,   12,  16,  20,  24,  28,  32,  36,
+    40,  44,  48,  52,  56,  64,  72,  80,  88,  96,
+    108, 120, 132, 144, 156, 172, 188, 212, 240, 276,
+    320, 384, 448, 512, 576, 640, 704, 768, 832, 896,
+    960
+};
+
+static const uint16_t swb_offset_960_64[] =
+{
+    0,   4,   8,   12,  16,  20,  24,  28,  32,  36,
+    40,  44,  48,  52,  56,  64,  72,  80,  88,  100,
+    112, 124, 140, 156, 172, 192, 216, 240, 268, 304,
+    344, 384, 424, 464, 504, 544, 584, 624, 664, 704,
+    744, 784, 824, 864, 904, 944, 960
+};
+
+static const uint16_t swb_offset_960_48[] =
+{
+    0,   4,   8,   12,  16,  20,  24,  28,  32,  36,
+    40,  48,  56,  64,  72,  80,  88,  96,  108, 120,
+    132, 144, 160, 176, 196, 216, 240, 264, 292, 320,
+    352, 384, 416, 448, 480, 512, 544, 576, 608, 640,
+    672, 704, 736, 768, 800, 832, 864, 896, 928, 960
+};
+
+static const uint16_t swb_offset_960_32[] =
+{
+    0,   4,   8,   12,  16,  20,  24,  28,  32,  36,
+    40,  48,  56,  64,  72,  80,  88,  96,  108, 120,
+    132, 144, 160, 176, 196, 216, 240, 264, 292, 320,
+    352, 384, 416, 448, 480, 512, 544, 576, 608, 640,
+    672, 704, 736, 768, 800, 832, 864, 896, 928, 960
+};
+
+static const uint16_t swb_offset_960_24[] =
+{
+    0,   4,   8,   12,  16,  20,  24,  28,  32,  36,
+    40,  44,  52,  60,  68,  76,  84,  92,  100, 108,
+    116, 124, 136, 148, 160, 172, 188, 204, 220, 240,
+    260, 284, 308, 336, 364, 396, 432, 468, 508, 552,
+    600, 652, 704, 768, 832, 896, 960
+};
+
+static const uint16_t swb_offset_960_16[] =
+{
+    0,   8,   16,  24,  32,  40,  48,  56,  64,  72,
+    80,  88,  100, 112, 124, 136, 148, 160, 172, 184,
+    196, 212, 228, 244, 260, 280, 300, 320, 344, 368,
+    396, 424, 456, 492, 532, 572, 616, 664, 716, 772,
+    832, 896, 960
+};
+
+static const uint16_t swb_offset_960_8[] =
+{
+    0,   12,  24,  36,  48,  60,  72,  84,  96,  108,
+    120, 132, 144, 156, 172, 188, 204, 220, 236, 252,
+    268, 288, 308, 328, 348, 372, 396, 420, 448, 476,
+    508, 544, 580, 620, 664, 712, 764, 820, 880, 944,
+    960
+};
+
+
+static const uint16_t swb_offset_120_96[] =
+{
+    0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64, 92, 120
+};
+
+static const uint16_t swb_offset_120_64[] =
+{
+    0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64, 92, 120
+};
+
+static const uint16_t swb_offset_120_48[] =
+{
+    0,  4, 8, 12, 16, 20, 28, 36, 44, 56, 68, 80, 96, 112, 120
+};
+
+static const uint16_t swb_offset_120_24[] =
+{
+    0, 4, 8, 12, 16, 20, 24, 28, 36, 44, 52, 64, 76, 92, 108, 120
+};
+
+static const uint16_t swb_offset_120_16[] =
+{
+    0, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 60, 72, 88, 108, 120
+};
+
+static const uint16_t swb_offset_120_8[] =
+{
+    0, 4, 8, 12, 16,  20, 24, 28, 36, 44, 52, 60, 72, 88, 108, 120
+};
+
 const uint16_t * const ff_swb_offset_1024[] = {
     swb_offset_1024_96, swb_offset_1024_96, swb_offset_1024_64,
     swb_offset_1024_48, swb_offset_1024_48, swb_offset_1024_32,
@@ -1233,6 +1341,14 @@ const uint16_t * const ff_swb_offset_1024[] = {
     swb_offset_1024_8
 };
 
+const uint16_t * const ff_swb_offset_960[] = {
+    swb_offset_960_96, swb_offset_960_96, swb_offset_960_64,
+    swb_offset_960_48, swb_offset_960_48, swb_offset_960_32,
+    swb_offset_960_24, swb_offset_960_24, swb_offset_960_16,
+    swb_offset_960_16, swb_offset_960_16, swb_offset_960_8,
+    swb_offset_960_8
+};
+
 const uint16_t * const ff_swb_offset_512[] = {
     NULL,               NULL,               NULL,
     swb_offset_512_48,  swb_offset_512_48,  swb_offset_512_32,
@@ -1259,6 +1375,14 @@ const uint16_t * const ff_swb_offset_128[] = {
     swb_offset_128_8
 };
 
+const uint16_t * const ff_swb_offset_120[] = {
+    swb_offset_120_96, swb_offset_120_96, swb_offset_120_96,
+    swb_offset_120_48, swb_offset_120_48, swb_offset_120_48,
+    swb_offset_120_24, swb_offset_120_24, swb_offset_120_16,
+    swb_offset_120_16, swb_offset_120_16, swb_offset_120_8,
+    swb_offset_120_8
+};
+
 // @}
 
 /* @name ff_tns_max_bands
@@ -1767,6 +1891,490 @@ const DECLARE_ALIGNED(32, float, ff_aac_eld_window_512)[1920] = {
     -0.00111144, -0.00109764, -0.00108377, -0.00106989,
 };
 
+/* Q30 representation of ff_aac_eld_window_512 table */
+const DECLARE_ALIGNED(32, int, ff_aac_eld_window_512_fixed)[1920] = {
+    0x003783ba, 0x005d04f4, 0x008ae226, 0x00c02021,
+    0x00fb1804, 0x013a30a8, 0x017be9e6, 0x01bf296c,
+    0x02033204, 0x0247502c, 0x028adab0, 0x02cd9568,
+    0x030fa980, 0x03513dc0, 0x03927274, 0x03d363e0,
+    0x04142e40, 0x0454edc0, 0x0495bd48, 0x04d6a060,
+    0x051786d8, 0x05586548, 0x059935e8, 0x05d9feb0,
+    0x061acea0, 0x065bb680, 0x069cc800, 0x06de13f0,
+    0x071fa748, 0x07618b80, 0x07a3c7a8, 0x07e66da0,
+    0x082999d0, 0x086d6590, 0x08b1e640, 0x08f72850,
+    0x093d3120, 0x09840550, 0x09cba880, 0x0a1415f0,
+    0x0a5d41b0, 0x0aa720d0, 0x0af1a9a0, 0x0b3cce70,
+    0x0b887ec0, 0x0bd4ac10, 0x0c214a70, 0x0c6e5130,
+    0x0cbbba50, 0x0d098130, 0x0d57a240, 0x0da61a60,
+    0x0df4e620, 0x0e4401d0, 0x0e9369f0, 0x0ee31de0,
+    0x0f332000, 0x0f837180, 0x0fd412a0, 0x10250260,
+    0x10763f20, 0x10c7c660, 0x11199560, 0x116baa00,
+    0x11be0400, 0x1210a1c0, 0x12638180, 0x12b69ee0,
+    0x1309f3e0, 0x135d7ac0, 0x13b12dc0, 0x1404ffa0,
+    0x1458dd40, 0x14acb720, 0x15008120, 0x15543260,
+    0x15a7c460, 0x15fb3160, 0x164e7520, 0x16a193c0,
+    0x16f49740, 0x17478720, 0x179a6720, 0x17ed3720,
+    0x183ff460, 0x18929c20, 0x18e52b00, 0x19379c00,
+    0x1989e900, 0x19dc0ca0, 0x1a2e0280, 0x1a7fc400,
+    0x1ad14a00, 0x1b228ec0, 0x1b738ea0, 0x1bc44540,
+    0x1c14ada0, 0x1c64c380, 0x1cb48440, 0x1d03f420,
+    0x1d531c00, 0x1da20160, 0x1df0a660, 0x1e3f0860,
+    0x1e8d2340, 0x1edaf340, 0x1f2875e0, 0x1f75a700,
+    0x1fc281e0, 0x200f0380, 0x205b2ac0, 0x20a6f980,
+    0x20f27200, 0x213d9600, 0x21886580, 0x21d2e040,
+    0x221d0640, 0x2266d6c0, 0x22b05180, 0x22f97580,
+    0x23424280, 0x238ab880, 0x23d2d780, 0x241aa040,
+    0x246213c0, 0x24a93300, 0x24efff80, 0x25367b40,
+    0x256f68c0, 0x25b53580, 0x25faa580, 0x263fb940,
+    0x26847080, 0x26c8cbc0, 0x270ccb00, 0x27506e40,
+    0x2793b600, 0x27d6a200, 0x281932c0, 0x285b6880,
+    0x289d4400, 0x28dec5c0, 0x291feec0, 0x2960bf80,
+    0x29a137c0, 0x29e15800, 0x2a212000, 0x2a609080,
+    0x2a9fa980, 0x2ade6b40, 0x2b1cd600, 0x2b5aea00,
+    0x2b98a740, 0x2bd60d80, 0x2c131cc0, 0x2c4fd500,
+    0x2c8c3600, 0x2cc83f00, 0x2d03f040, 0x2d3f48c0,
+    0x2d7a48c0, 0x2db4ef40, 0x2def3c40, 0x2e292ec0,
+    0x2e62c700, 0x2e9c0400, 0x2ed4e580, 0x2f0d6ac0,
+    0x2f4592c0, 0x2f7d5c80, 0x2fb4c6c0, 0x2febd140,
+    0x30227b40, 0x3058c400, 0x308eab40, 0x30c43040,
+    0x30f95100, 0x312e0d00, 0x31626240, 0x31965040,
+    0x31c9d5c0, 0x31fcf240, 0x322fa480, 0x3261ec00,
+    0x3293c7c0, 0x32c53680, 0x32f63780, 0x3326c9c0,
+    0x3356ec00, 0x33869d00, 0x33b5db80, 0x33e4a700,
+    0x3412fdc0, 0x3440df40, 0x346e4a80, 0x349b3e40,
+    0x34c7ba00, 0x34f3bd80, 0x351f47c0, 0x354a5840,
+    0x3574ee40, 0x359f0900, 0x35c8a840, 0x35f1cb80,
+    0x361a71c0, 0x36429a80, 0x366a4580, 0x36917280,
+    0x36b82100, 0x36de5180, 0x37040340, 0x372936c0,
+    0x374dec40, 0x37722340, 0x3795dc40, 0x37b91780,
+    0x37dbd600, 0x37fe18c0, 0x381fe080, 0x38412e00,
+    0x38620280, 0x38825f40, 0x38a24540, 0x38c1b680,
+    0x38e0b5c0, 0x38ff4540, 0x391d6800, 0x393b20c0,
+    0x39587280, 0x39755fc0, 0x3991eb80, 0x39ae1a80,
+    0x39c9f280, 0x39e57980, 0x3a00b600, 0x3a1bae00,
+    0x3a366800, 0x3a50e9c0, 0x3a6b3a40, 0x3a8560c0,
+    0x3a9f6640, 0x3ab95400, 0x3ad332c0, 0x3aed0680,
+    0x3b06cf80, 0x3b208d40, 0x3b3a3e80, 0x3b53cb80,
+    0x3b6d0780, 0x3b85c380, 0x3b9dd0c0, 0x3bb4eb40,
+    0x3bcabac0, 0x3bdee680, 0x3bf11680, 0x3c011440,
+    0x3c179ac0, 0x3c1c4f00, 0x3c21aa40, 0x3c278880,
+    0x3c2dba80, 0x3c341140, 0x3c3a5e80, 0x3c409100,
+    0x3c46b480, 0x3c4cd5c0, 0x3c530180, 0x3c593cc0,
+    0x3c5f84c0, 0x3c65d640, 0x3c6c2e40, 0x3c728b40,
+    0x3c78ee80, 0x3c7f5840, 0x3c85c940, 0x3c8c4240,
+    0x3c92c380, 0x3c994cc0, 0x3c9fde40, 0x3ca67880,
+    0x3cad1ac0, 0x3cb3c540, 0x3cba7800, 0x3cc132c0,
+    0x3cc7f640, 0x3ccec280, 0x3cd59800, 0x3cdc76c0,
+    0x3ce35e80, 0x3cea4f00, 0x3cf147c0, 0x3cf84900,
+    0x3cff5340, 0x3d0666c0, 0x3d0d8400, 0x3d14ab40,
+    0x3d1bdc00, 0x3d2315c0, 0x3d2a5880, 0x3d31a440,
+    0x3d38f900, 0x3d405780, 0x3d47c040, 0x3d4f3300,
+    0x3d56af40, 0x3d5e3500, 0x3d65c380, 0x3d6d5ac0,
+    0x3d74fb40, 0x3d7ca540, 0x3d845900, 0x3d8c1680,
+    0x3d93dd00, 0x3d9bac80, 0x3da38400, 0x3dab6400,
+    0x3db34c80, 0x3dbb3dc0, 0x3dc33840, 0x3dcb3bc0,
+    0x3dd347c0, 0x3ddb5bc0, 0x3de37780, 0x3deb9b00,
+    0x3df3c600, 0x3dfbf940, 0x3e0434c0, 0x3e0c7840,
+    0x3e14c3c0, 0x3e1d1640, 0x3e256f80, 0x3e2dcf40,
+    0x3e363580, 0x3e3ea300, 0x3e4717c0, 0x3e4f9380,
+    0x3e581600, 0x3e609e40, 0x3e692c40, 0x3e71bf80,
+    0x3e7a5840, 0x3e82f740, 0x3e8b9c40, 0x3e944700,
+    0x3e9cf780, 0x3ea5ad00, 0x3eae66c0, 0x3eb72500,
+    0x3ebfe780, 0x3ec8af00, 0x3ed17b80, 0x3eda4d00,
+    0x3ee32340, 0x3eebfd40, 0x3ef4dac0, 0x3efdbbc0,
+    0x3f06a040, 0x3f0f88c0, 0x3f187540, 0x3f216600,
+    0x3f2a5a80, 0x3f335200, 0x3f3c4c40, 0x3f454940,
+    0x3f4e4940, 0x3f574c80, 0x3f605340, 0x3f695dc0,
+    0x3f726b40, 0x3f7b7b40, 0x3f848dc0, 0x3f8da240,
+    0x3f96b940, 0x3f9fd300, 0x3fa8f040, 0x3fb21080,
+    0x3fbb33c0, 0x3fc459c0, 0x3fcd81c0, 0x3fd6abc0,
+    0x3fdfd780, 0x3fe90480, 0x3ff23280, 0x3ffb6100,
+    0x40049f80, 0x400dd080, 0x40170400, 0x40203880,
+    0x40296f00, 0x4032a600, 0x403bde00, 0x40451680,
+    0x404e4f00, 0x40578700, 0x4060be80, 0x4069f500,
+    0x40732b80, 0x407c6280, 0x40859980, 0x408ed100,
+    0x40980800, 0x40a13f00, 0x40aa7500, 0x40b3a980,
+    0x40bcdd80, 0x40c61180, 0x40cf4500, 0x40d87800,
+    0x40e1ab00, 0x40eadc80, 0x40f40c80, 0x40fd3a80,
+    0x41066700, 0x410f9300, 0x4118bd80, 0x4121e700,
+    0x412b0f80, 0x41343580, 0x413d5880, 0x41467980,
+    0x414f9780, 0x4158b380, 0x4161cd80, 0x416ae580,
+    0x4173fb00, 0x417d0d00, 0x41861b80, 0x418f2600,
+    0x41982c80, 0x41a12f80, 0x41aa3000, 0x41b32c80,
+    0x41bc2580, 0x41c51a00, 0x41ce0900, 0x41d6f300,
+    0x41dfd800, 0x41e8b880, 0x41f19400, 0x41fa6b80,
+    0x42033d00, 0x420c0900, 0x4214cf00, 0x421d8e00,
+    0x42264680, 0x422ef980, 0x4237a680, 0x42404d80,
+    0x4248ee00, 0x42518780, 0x425a1a00, 0x4262a480,
+    0x426b2800, 0x4273a400, 0x427c1980, 0x42848880,
+    0x428cef80, 0x42954f00, 0x429da680, 0x42a5f500,
+    0x42ae3b80, 0x42b67a00, 0x42beb100, 0x42c6e080,
+    0x42cf0780, 0x42d72680, 0x42df3c00, 0x42e74880,
+    0x42ef4c80, 0x42f74880, 0x42ff3c80, 0x43072880,
+    0x430f0c80, 0x4316e800, 0x431eba00, 0x43268380,
+    0x432e4480, 0x4335fd00, 0x433dae80, 0x43455800,
+    0x434cfa00, 0x43549400, 0x435c2500, 0x4363ad80,
+    0x436b2e00, 0x4372a700, 0x437a1800, 0x43818200,
+    0x4388e400, 0x43903f00, 0x43979200, 0x439edd00,
+    0x43a62080, 0x43ad5c80, 0x43b49180, 0x43bbbf80,
+    0x43c2e800, 0x43ca0b00, 0x43d12980, 0x43d84280,
+    0x43df5200, 0x43e65500, 0x43ed4800, 0x43f43080,
+    0x43fb1c80, 0x44021b80, 0x44093a00, 0x44106480,
+    0x44176700, 0x441e0c00, 0x44241e00, 0x44297380,
+    0x4425dc00, 0x44240180, 0x441ff300, 0x4419e300,
+    0x44123f80, 0x44097500, 0x43ffe900, 0x43f5e700,
+    0x43eb9f00, 0x43e13f00, 0x43d6f200, 0x43ccbd80,
+    0x43c28400, 0x43b82780, 0x43ad8b00, 0x43a29c80,
+    0x43975180, 0x438ba080, 0x437f8180, 0x4372fd00,
+    0x43662b00, 0x43592480, 0x434c0000, 0x433ecd00,
+    0x43319180, 0x43245300, 0x43171700, 0x4309da80,
+    0x42fc9300, 0x42ef3500, 0x42e1b600, 0x42d40280,
+    0x42c60000, 0x42b79300, 0x42a8a180, 0x42991a00,
+    0x4288f200, 0x42782100, 0x42669e00, 0x42546880,
+    0x42418800, 0x422e0480, 0x4219e500, 0x42053680,
+    0x41f00980, 0x41da7080, 0x41c47b00, 0x41ae3600,
+    0x4197ab80, 0x4180e400, 0x4169e780, 0x4152bb00,
+    0x413b5e80, 0x4123d180, 0x410c1480, 0x40f42100,
+    0x40dbed00, 0x40c36c80, 0x40aa9600, 0x40915f80,
+    0x4077c100, 0x405db280, 0x40432c80, 0x40282580,
+    0x400c9280, 0x3ff068c0, 0x3fd39dc0, 0x3fb62bc0,
+    0x3f981200, 0x3f795080, 0x3f59e780, 0x3f39ebc0,
+    0x3f198680, 0x3ef8e100, 0x3ed82440, 0x3eb76c80,
+    0x3e96c940, 0x3e764900, 0x3e55f980, 0x3e35cb00,
+    0x3e1590c0, 0x3df51cc0, 0x3dd44200, 0x3db2e640,
+    0x3d910200, 0x3d6e8e40, 0x3d4b8480, 0x3d27e600,
+    0x3d03bc00, 0x3cdf0fc0, 0x3cb9eb80, 0x3c946240,
+    0x3c6e9180, 0x3c489700, 0x3c229000, 0x3bfc95c0,
+    0x3bd6bd00, 0x3bb11a80, 0x3b8bc180, 0x3b669bc0,
+    0x3b416a00, 0x3b1beb80, 0x3af5e140, 0x3acf3300,
+    0x3aa7ef80, 0x3a802780, 0x3a57eb80, 0x3a2f5880,
+    0x3a069640, 0x39ddcd40, 0x39b524c0, 0x398ca540,
+    0x39643800, 0x393bc540, 0x39133580, 0x38ea7ac0,
+    0x38c19040, 0x389871c0, 0x386f1b40, 0x38458e00,
+    0x381bd000, 0x37f1e780, 0x37c7db00, 0x379db080,
+    0x37736e80, 0x37491b00, 0x371ebcc0, 0x36f45980,
+    0x36c96600, 0x369ed300, 0x36740380, 0x3648ffc0,
+    0x361dcf40, 0x35f27a00, 0x35c70780, 0x359b7f80,
+    0x356fe9c0, 0x35444dc0, 0x3518b280, 0x34ed1940,
+    0x34c17c00, 0x3495d4c0, 0x346a1d40, 0x343e4300,
+    0x34122840, 0x33e5ae00, 0x33b8b780, 0x338b4dc0,
+    0x335d9f00, 0x332fdc00, 0x33023440, 0x32d4cc40,
+    0x32a7bc80, 0x327b1d40, 0x324f04c0, 0x32235280,
+    0x31f7b100, 0x31cbc7c0, 0x319f4140, 0x3171fb40,
+    0x31440840, 0x31157d00, 0x30e66e80, 0x30b6fc40,
+    0x30875080, 0x30579600, 0x3027f700, 0x2ff89140,
+    0x2fc976c0, 0x2f9ab880, 0x2f6c6780, 0x2f3e8780,
+    0x2f111000, 0x2ee3f800, 0x2eb73480, 0x2e8a9840,
+    0x2e5dd340, 0x2e3093c0, 0x2e028ac0, 0x2dd39680,
+    0x2da3c480, 0x2d732380, 0x2d41c400, 0x2d0fd300,
+    0x2cdd9ac0, 0x2cab6640, 0x2c797f00, 0x2c480d40,
+    0x2c171700, 0x2be6a0c0, 0x2bb6ae80, 0x2b8739c0,
+    0x2b583200, 0x2b298600, 0x2afb2400, 0x2accfa40,
+    0x2a9ef500, 0x2a710100, 0x2a430ac0, 0x2a14f9c0,
+    0x29e6b0c0, 0x29b81240, 0x29890140, 0x29596900,
+    0x29293e00, 0x28f87500, 0x28c70340, 0x2894efc0,
+    0x28625140, 0x282f4040, 0x27fbd5c0, 0x27c83540,
+    0x27948ec0, 0x27611240, 0x272def80, 0x26fb4cc0,
+    0x26c94780, 0x2697fcc0, 0x26678880, 0x2637f740,
+    0x26094540, 0x25db6dc0, 0x25ae6b40, 0x25821680,
+    0x255627c0, 0x252a55c0, 0x24fe5680, 0x24d1db40,
+    0x24a48fc0, 0x24761f40, 0x244637c0, 0x2414c900,
+    0x23e20240, 0x23ae1740, 0x23793bc0, 0x2343cc00,
+    0x230e4ac0, 0x22d93c80, 0x22a52400, 0x22725180,
+    0x2240e480, 0x2210f9c0, 0x21e2ab40, 0x21b5c7c0,
+    0x2189d2c0, 0x215e4d40, 0x2132b900, 0x2106ba80,
+    0x20da1940, 0x20ac9d80, 0x207e11c0, 0x204e77c0,
+    0x201e0880, 0x1fecfea0, 0x1fbb94e0, 0x1f8a0500,
+    0x1f59d340, 0x1f27ac20, 0x1ef67c60, 0x1ec64e40,
+    0x1e96fdc0, 0x1e686400, 0x1e3a5a00, 0x1e0cae80,
+    0x1ddf25e0, 0x1db18460, 0x1d839020, 0x1d5536e0,
+    0x1d268e80, 0x1cf7ae60, 0x1cc8aea0, 0x1c99af00,
+    0x1c6ad820, 0x1c3c5280, 0x1c0e4500, 0x1be0ab60,
+    0x1bb35620, 0x1b861400, 0x1b58b480, 0x1b2b1a00,
+    0x1afd39c0, 0x1acf09a0, 0x1aa080c0, 0x1a71b020,
+    0x1a42c2a0, 0x1a13e420, 0x19e53fc0, 0x19b6eb00,
+    0x1988e620, 0x195b3060, 0x192dc8a0, 0x1900a8a0,
+    0x18d3c4e0, 0x18a711e0, 0x187a83e0, 0x184e10e0,
+    0x1821b060, 0x17f55a00, 0x17c90580, 0x179cb100,
+    0x177060a0, 0x17441880, 0x1717dd20, 0x16ebb080,
+    0x16bf9260, 0x169382e0, 0x166781c0, 0x163b8f80,
+    0x160fade0, 0x15e3de40, 0x15b82220, 0x158c7ae0,
+    0x1560ea80, 0x15357240, 0x150a1400, 0x14ded020,
+    0x14b3a640, 0x148895a0, 0x145d9dc0, 0x1432bde0,
+    0x1407f540, 0x13dd4380, 0x13b2a860, 0x13882460,
+    0x135db880, 0x133365a0, 0x13092cc0, 0x12df0e60,
+    0x12b50aa0, 0x128b2120, 0x12615200, 0x12379da0,
+    0x120e04c0, 0x11e48820, 0x11bb2860, 0x1191e600,
+    0x1168c080, 0x113fb7a0, 0x1116cb40, 0x10edfba0,
+    0x10c54a00, 0x109cb7a0, 0x10744560, 0x104bf420,
+    0x1023c3e0, 0x0ffbb500, 0x0fd3c790, 0x0fabfbe0,
+    0x0f845290, 0x0f5ccc40, 0x0f356970, 0x0f0e2a60,
+    0x0ee70eb0, 0x0ec01610, 0x0e994040, 0x0e728d50,
+    0x0e4bfdf0, 0x0e2592c0, 0x0dff4c70, 0x0dd92af0,
+    0x0db32da0, 0x0d8d53e0, 0x0d679cf0, 0x0d420880,
+    0x0d1c9680, 0x0cf74700, 0x0cd219f0, 0x0cad0eb0,
+    0x0c882450, 0x0c6359a0, 0x0c3ead90, 0x0c1a1f80,
+    0x0bf5af40, 0x0bd15cf0, 0x0bad2870, 0x0b891440,
+    0x0b652530, 0x0b416020, 0x0b1dca30, 0x0afa6810,
+    0x0ad73ee0, 0x0ab45370, 0x0a91aac0, 0x0a6f49b0,
+    0x0a4da7f0, 0x0a2c7e20, 0x0a0ba310, 0x09eb1220,
+    0x09cac6e0, 0x09aabc70, 0x098aee40, 0x096b57a0,
+    0x094bf400, 0x092cbea0, 0x090db2e0, 0x08eecef0,
+    0x08d01360, 0x08b18110, 0x089318b0, 0x0874db00,
+    0x0856c880, 0x0838e1b0, 0x081b2730, 0x07fd99a8,
+    0x07e03a28, 0x07c309a8, 0x07a60910, 0x07893918,
+    0x076c99d0, 0x07502b90, 0x0733ee70, 0x0717e2f8,
+    0x06fc09b8, 0x06e06378, 0x06c4f0b8, 0x06a9b1c8,
+    0x068ea6a0, 0x0673cf18, 0x06592b18, 0x063ebad0,
+    0x06247ed0, 0x060a7780, 0x05f0a570, 0x05d708b8,
+    0x05bda128, 0x05a46e80, 0x058b7078, 0x0572a740,
+    0x055a1330, 0x0541b4d8, 0x05298c98, 0x05119a88,
+    0x04f9de50, 0x04e257a0, 0x04cb0630, 0x04b3ea00,
+    0x049d0378, 0x04865308, 0x046fd918, 0x045995a8,
+    0x04438860, 0x042db0d0, 0x04180ea0, 0x0402a1d0,
+    0x03ed6abc, 0x03d869b8, 0x03c39f28, 0x03af0af0,
+    0x039aaca0, 0x038683b4, 0x03728fc0, 0x035ed0b0,
+    0x034b46c4, 0x0337f254, 0x0324d3a0, 0x0311eab0,
+    0x02ff370c, 0x02ecb85c, 0x02da6e34, 0x02c858a8,
+    0x02b67820, 0x02a4cd28, 0x02935820, 0x02821920,
+    0x02710fac, 0x02603b54, 0x024f9bb4, 0x023f308c,
+    0x022ef9e8, 0x021ef7c8, 0x020f2a40, 0x01ff908e,
+    0x01f02974, 0x01e0f38a, 0x01d1ed94, 0x01c316d6,
+    0x01b46f5e, 0x01a5f720, 0x0197ae28, 0x018994ea,
+    0x017bac54, 0x016df546, 0x016070ae, 0x01532078,
+    0x01460760, 0x01392834, 0x012c85a4, 0x01201f7a,
+    0x0113f27c, 0x0107fb6c, 0x00fc36fd, 0x00f0a2d5,
+    0x00e53d51, 0x00da050f, 0x00cef88c, 0x00c41869,
+    0x00b9671f, 0x00aee754, 0x00a49b80, 0x009a8384,
+    0x00909ca6, 0x0086e400, 0x007d56e3, 0x0073f48e,
+    0x006abe70, 0x0061b5de, 0x0058dc65, 0x005033b4,
+    0x0047be30, 0x003f7e30, 0x00377619, 0x002fa4d4,
+    0x002805ee, 0x002094cb, 0x00194cb8, 0x00122856,
+    0x000b215c, 0x00043148, 0xfffd51f0, 0xfff683a0,
+    0xffefcd4d, 0xffe9362f, 0xffe2c57d, 0xffdc855c,
+    0xffd682c4, 0xffd0cad4, 0xffcb6a2c, 0xffc663bc,
+    0xffc1b06f, 0xffbd48e1, 0xffb92570, 0xffb53a54,
+    0xffb1779c, 0xffadcd38, 0xffaa2b42, 0xffa68855,
+    0xffa2e141, 0xff9f332c, 0xff9b7b9c, 0xff97bf2e,
+    0xff9409e2, 0xff9067e2, 0xff8ce556, 0xff898bf0,
+    0xff866306, 0xff8371d0, 0xff80bf63, 0xff7e4eba,
+    0xff7c1eaa, 0xff7a2e04, 0xff787b47, 0xff770280,
+    0xff75bd06, 0xff74a3f7, 0xff73b0b2, 0xff72dd02,
+    0xff72237e, 0xff717ebe, 0xff70e94c, 0xff705f59,
+    0xff6fde6a, 0xff6f6426, 0xff6eee40, 0xff6e7d0b,
+    0xff6e1359, 0xff6db403, 0xff6d61f8, 0xff6d2054,
+    0xff6cf267, 0xff6cdb76, 0xff6cdebb, 0xff6cff47,
+    0xff6d3fc9, 0xff6da306, 0xff6e2b82, 0xff6eda13,
+    0xff6fad6d, 0xff70a463, 0xff71bd9d, 0xff72f662,
+    0xff744a80, 0xff75b5c4, 0xff773409, 0xff78c0a6,
+    0xff7a5693, 0xff7bf0dc, 0xff7d8abb, 0xff7f2301,
+    0xff80bc08, 0xff825854, 0xff83fa56, 0xff85a55c,
+    0xff875d22, 0xff892598, 0xff8b025d, 0xff8cf53c,
+    0xff8efdf4, 0xff911c48, 0xff934fc9, 0xff959675,
+    0xff97ec86, 0xff9a4e35, 0xff9cb7d2, 0xff9f26cc,
+    0xffa199ce, 0xffa40f74, 0xffa6867c, 0xffa8feb2,
+    0xffab78e0, 0xffadf5c7, 0xffb07640, 0xffb2fba0,
+    0xffb587a2, 0xffb81bfb, 0xffbaba46, 0xffbd6236,
+    0xffc011a8, 0xffc2c679, 0xffc57e84, 0xffc83894,
+    0xffcaf41a, 0xffcdb0b8, 0xffd06e17, 0xffd32bf7,
+    0xffd5ea38, 0xffd8a8c3, 0xffdb6764, 0xffde25fb,
+    0xffe0e471, 0xffe3a2b2, 0xffe66087, 0xffe91da6,
+    0xffebd978, 0xffee9351, 0xfff14ab0, 0xfff3fef6,
+    0xfff6af94, 0xfff95c0c, 0xfffc03c7, 0xfffea659,
+    0x00015885, 0x0003f2e9, 0x00068a73, 0x00091e8d,
+    0x000bae7f, 0x000e39bf, 0x0010bf96, 0x00133f78,
+    0x0015b8c4, 0x00182ae4, 0x001a9558, 0x001cf7b2,
+    0x001f51e0, 0x0021a3b4, 0x0023ed25, 0x00262df2,
+    0x002865c5, 0x002a9469, 0x002cb967, 0x002ed4aa,
+    0x0030e607, 0x0032ed88, 0x0034eb2f, 0x0036de23,
+    0x0038c503, 0x003a9e4c, 0x003c68a6, 0x003e23dd,
+    0x003fd0db, 0x00417083, 0x0043038b, 0x00448adf,
+    0x00460740, 0x0047799c, 0x0048e2b2, 0x004a42af,
+    0x004b98fb, 0x004ce50b, 0x004e2654, 0x004f5b5d,
+    0x005081c3, 0x00519716, 0x00529920, 0x005386d0,
+    0x0054603f, 0x00552581, 0x0055d6cc, 0x00567558,
+    0x0057033c, 0x005782b4, 0x0057f5b6, 0x00585e46,
+    0x0058be68, 0x005917ff, 0x00596ce4, 0x0059bcc0,
+    0x005a053a, 0x005a43ee, 0x005a76ae, 0x005a9b37,
+    0x005aaf38, 0x005ab07a, 0x005a9cef, 0x005a7349,
+    0x005a3328, 0x0059dc0a, 0x00596db0, 0x0058e8e5,
+    0x00584f98, 0x0057a3c0, 0x0056e738, 0x00561bec,
+    0x005543df, 0x0054610b, 0x0053753e, 0x0052824e,
+    0x005189f6, 0x00508dec, 0x004f8fc0, 0x004e8fd0,
+    0x004d8d26, 0x004c86d7, 0x004b7c0a, 0x004a6b33,
+    0x00495239, 0x00482f0e, 0x0046ffc4, 0x0045c201,
+    0x00447337, 0x004310cc, 0x00419871, 0x004008e4,
+    0x003e6231, 0x003ca460, 0x003acf8a, 0x0038e57a,
+    0x0036e981, 0x0034defa, 0x0032c94b, 0x0030acc6,
+    0x002e8eb4, 0x002c7452, 0x002a62aa, 0x00285bbf,
+    0x00265eda, 0x00246b24, 0x00227f9c, 0x002098e7,
+    0x001eb13b, 0x001cc2ef, 0x001ac899, 0x0018be3d,
+    0x0016a198, 0x00147065, 0x00122897, 0x000fcbc5,
+    0x000d5f03, 0x000ae77a, 0x00086a52, 0x0005eb92,
+    0x00036e4a, 0x0000f57e, 0xfffe8414, 0xfffc1a78,
+    0xfff9b6bb, 0xfff756d9, 0xfff4f8d0, 0xfff29add,
+    0xfff03b87, 0xffedd94c, 0xffeb7295, 0xffe9072b,
+    0xffe6981a, 0xffe4265b, 0xffe1b30e, 0xffdf3f2b,
+    0xffdccb9e, 0xffda5993, 0xffd7ea0c, 0xffd57d60,
+    0xffd31302, 0xffd0aa27, 0xffce4243, 0xffcbdb40,
+    0xffc97595, 0xffc711a2, 0xffc4af9d, 0xffc24fa6,
+    0xffbff1de, 0xffbd9699, 0xffbb3e44, 0xffb8e8d5,
+    0xffb695f4, 0xffb44522, 0xffb1f627, 0xffafa8f0,
+    0xffad5d91, 0xffab140a, 0xffa8cc1c, 0xffa68590,
+    0xffa44066, 0xffa1fca0, 0xff9fba30, 0xff9d7902,
+    0xff9b3916, 0xff98fa6d, 0xff96bd06, 0xff9480b6,
+    0xff924532, 0xff900a24, 0xff8dcf41, 0xff8b9433,
+    0xff895884, 0xff871bd3, 0xff84dd8a, 0xff829d34,
+    0xff805a43, 0xff7e142d, 0xff7bca71, 0xff797c83,
+    0xff7729e3, 0xff74d204, 0xff727451, 0xff70101e,
+    0xff6da493, 0xff6b30d1, 0xff68b3f4, 0xff662d31,
+    0xff639bd1, 0xff60ff09, 0xff5e562c, 0xff5ba3e0,
+    0xff58ee39, 0xff563c22, 0xff5394f3, 0xff50fd1e,
+    0xff4e7599, 0xff4bff32, 0xff499ad4, 0xff47490a,
+    0xff450a36, 0xff42deb7, 0xff40c6cf, 0xff3ec2be,
+    0xff3cd299, 0xff3af681, 0xff392e6a, 0xff377a4a,
+    0xff35d9f7, 0xff344d44, 0xff32d3e8, 0xff316d96,
+    0xff3019d9, 0xff2ed83a, 0xff2da82f, 0xff2c88bf,
+    0xff2b78b4, 0xff2a76cc, 0xff298184, 0xff289890,
+    0xff27bc7d, 0xff26ee21, 0xff262e28, 0xff257cdc,
+    0xff24d9f4, 0xff244524, 0xff23be15, 0xff234488,
+    0xff22d852, 0xff227947, 0xff22273d, 0xff21e1d2,
+    0xff21a871, 0xff217a79, 0xff215748, 0xff213eca,
+    0xff21319e, 0xff21305c, 0xff213baf, 0xff2153c2,
+    0xff21782b, 0xff21a892, 0xff21e477, 0xff222bda,
+    0xff227f26, 0xff22debd, 0xff234b09, 0xff23c394,
+    0xff24471d, 0xff24d42b, 0xff25695c, 0xff260538,
+    0xff26a652, 0xff274b28, 0xff27f22d, 0xff2899d2,
+    0xff295975, 0xff29f2ad, 0xff2a96d7, 0xff2b45f4,
+    0xff2bffe3, 0xff2cc4ba, 0xff2d9458, 0xff2e6ede,
+    0xff2f544c, 0xff3044b7, 0xff314034, 0xff3246fa,
+    0xff33591e, 0xff3476e0, 0xff35a060, 0xff36d534,
+    0xff38148f, 0xff395daf, 0xff3aafd4, 0xff3c0ac8,
+    0xff3d6ed6, 0xff3edc54, 0xff405382, 0xff41d3f5,
+    0xff435ccc, 0xff44ed0f, 0xff4683d3, 0xff482080,
+    0xff49c297, 0xff4b69ab, 0xff4d1547, 0xff4ec4f5,
+    0xff50781d, 0xff522e20, 0xff53e692, 0xff55a15d,
+    0xff575f17, 0xff592022, 0xff5ae4de, 0xff5cacb4,
+    0xff5e75e2, 0xff603ee5, 0xff62062f, 0xff63caab,
+    0xff658b55, 0xff67476d, 0xff68fe11, 0xff6aaea0,
+    0xff6c5899, 0xff6dfb86, 0xff6f96e7, 0xff712a65,
+    0xff72b59f, 0xff74382b, 0xff75b1d3, 0xff772276,
+    0xff788a20, 0xff79e8e5, 0xff7b3ef0, 0xff7c8c98,
+    0xff7dd249, 0xff7f108c, 0xff804804, 0xff817d0e,
+    0xff82b74a, 0xff83fde6, 0xff855762, 0xff86c622,
+    0xff884904, 0xff89ded1, 0xff8b8646, 0xff8d3e4c,
+    0xff8f05cc, 0xff90dbc6, 0xff92bf2a, 0xff94af04,
+    0xff96aa26, 0xff98af9a, 0xff9abe48, 0xff9cd543,
+    0xff9ef3c1, 0xffa118ea, 0xffa343fd, 0xffa57423,
+    0xffa7a890, 0xffa9e084, 0xffac1b31, 0xffae5802,
+    0xffb09680, 0xffb2d621, 0xffb51678, 0xffb75704,
+    0xffb99726, 0xffbbd645, 0xffbe13d7, 0xffc04f26,
+    0xffc2879a, 0xffc4bc72, 0xffc6ed24, 0xffc918e3,
+    0xffcb3eb8, 0xffcd5dcc, 0xffcf7549, 0xffd184d8,
+    0xffd38c8f, 0xffd58ca4, 0xffd7854d, 0xffd97694,
+    0xffdb606e, 0xffdd42d1, 0xffdf1da8, 0xffe0f09b,
+    0xffe2bb00, 0xffe47c41, 0xffe633c6, 0xffe7e150,
+    0xffe98534, 0xffeb1fb4, 0xffecb10e, 0xffee3944,
+    0xffefb7e9, 0xfff12cbe, 0xfff29762, 0xfff3f789,
+    0xfff54cbe, 0xfff69695, 0xfff7d4b8, 0xfff90748,
+    0xfffa2ee5, 0xfffb4c3c, 0xfffc6003, 0xfffd6af0,
+    0xfffe6dda, 0xffff69b8, 0x00005f4b, 0x00014e7f,
+    0x00023646, 0x000315b4, 0x0003ebd3, 0x0004b74a,
+    0x00057677, 0x000627e2, 0x0006ca09, 0x00075ce1,
+    0x0007e196, 0x00085955, 0x0008c556, 0x00092751,
+    0x00098153, 0x0009d581, 0x000a25be, 0x000a732b,
+    0x000abe1f, 0x000b06e4, 0x000b4db1, 0x000b91fa,
+    0x000bd266, 0x000c0da0, 0x000c426e, 0x000c6ffb,
+    0x000c95b0, 0x000cb2f7, 0x000cc76e, 0x000cd317,
+    0x000cd647, 0x000cd17f, 0x000cc52b, 0x000cb1ea,
+    0x000c98c0, 0x000c7a62, 0x000c57c7, 0x000c3187,
+    0x000c0862, 0x000bdcd8, 0x000baf81, 0x000b80c7,
+    0x000b50ec, 0x000b202f, 0x000aeec6, 0x000abcb2,
+    0x000a89d2, 0x000a5605, 0x000a2116, 0x0009eafb,
+    0x0009b37d, 0x00097a9d, 0x00094030, 0x00090440,
+    0x0008c6b9, 0x000887ae, 0x0008470c, 0x00080512,
+    0x0007c1f6, 0x00077df9, 0x0007395a, 0x0006f45b,
+    0x0006af67, 0x00066abe, 0x000626b6, 0x0005e38f,
+    0x0005a1a0, 0x0005611e, 0x00052234, 0x0004e502,
+    0x0004a95d, 0x00046f46, 0x00043691, 0x0003ff33,
+    0x0003c90d, 0x0003941f, 0x00036047, 0x00032d9c,
+    0x0002fc1e, 0x0002cbed, 0x00029d1e, 0x00026fbc,
+    0x000243f2, 0x000219d6, 0x0001f17d, 0x0001caf1,
+    0x0001a63e, 0x00018363, 0x00016256, 0x00014316,
+    0x0001258f, 0x000109cb, 0x0000efaa, 0x0000d720,
+    0x0000c03a, 0x0000aacb, 0x000096de, 0x0000846a,
+    0x0000736d, 0x000063d3, 0x000055a6, 0x000048d0,
+    0x00003d47, 0x000032f6, 0x000029dc, 0x000021d9,
+    0x00001ae3, 0x000014ee, 0x00000fdb, 0x00000ba9,
+    0x00000839, 0x00000589, 0x00000370, 0x000001ee,
+    0x000000d7, 0x00000036, 0xffffffe0, 0xffffffc0,
+    0xffffffd5, 0xfffffff5, 0x0000000b, 0x0000000b,
+    0x0000000b, 0x0000000b, 0xfffffff5, 0xffffffd5,
+    0xffffffca, 0xffffffe0, 0x00000036, 0x000000d7,
+    0x000001ce, 0x0000033b, 0x00000529, 0x000007ad,
+    0x00000ac8, 0x00000e99, 0x00001316, 0x0000185e,
+    0x00001e7e, 0x00002575, 0x00002d4c, 0x0000361b,
+    0x00003fd6, 0x00004a93, 0x00005647, 0x00006312,
+    0x000070de, 0x00007fad, 0x00008f87, 0x0000a064,
+    0x0000b242, 0x0000c52d, 0x0000d919, 0x0000ee12,
+    0x0001040c, 0x00011b13, 0x0001331b, 0x00014c30,
+    0x0001663c, 0x0001814a, 0x00019d4f, 0x0001ba35,
+    0x0001d7e7, 0x0001f645, 0x00021544, 0x000234c3,
+    0x000254b9, 0x00027505, 0x000295a7, 0x0002b67e,
+    0x0002d7a1, 0x0002f904, 0x00031ab2, 0x00033ca0,
+    0x00035ee5, 0x0003818a, 0x0003a485, 0x0003c7e1,
+    0x0003eb72, 0x00040f0e, 0x0004329f, 0x000455e6,
+    0x000478c0, 0x00049aef, 0x0004bc52, 0x0004dca9,
+    0x0004fbde, 0x000519c5, 0x00053635, 0x0005512d,
+    0x00056aae, 0x000582a1, 0x00059927, 0x0005ae40,
+    0x0005c1f6, 0x0005d455, 0x0005e572, 0x0005f56d,
+    0x00060446, 0x0006121e, 0x00061f09, 0x00062b08,
+    0x00063605, 0x00063feb, 0x00064899, 0x00064ff0,
+    0x000655a5, 0x00065996, 0x00065b6f, 0x00065af8,
+    0x000657e9, 0x000651d4, 0x00064884, 0x00063bae,
+    0x00062b33, 0x00061706, 0x0005fefd, 0x0005e344,
+    0x0005c404, 0x0005a195, 0x00057c41, 0x00055473,
+    0x00052ac2, 0x0004ffc4, 0x0004d410, 0x0004a7e5,
+    0x00047b4f, 0x00044e39, 0x00042096, 0x0003f208,
+    0x0003c1e1, 0x00038f77, 0x00035a12, 0x00032127,
+    0x0002e476, 0x0002a389, 0x00025e29, 0x0002146d,
+    0x0001c700, 0x00017682, 0x000123a1, 0x0000cefd,
+    0x000078f7, 0x0000221a, 0xffffcad1, 0xffff7332,
+    0xffff1b1e, 0xfffec253, 0xfffe6891, 0xfffe0da2,
+    0xfffdb15c, 0xfffd5393, 0xfffcf412, 0xfffc92e3,
+    0xfffc3032, 0xfffbcc29, 0xfffb6714, 0xfffb0113,
+    0xfffa9a5b, 0xfffa3337, 0xfff9cbd4, 0xfff96450,
+    0xfff8fcac, 0xfff894dc, 0xfff82cd8, 0xfff7c4a8,
+    0xfff75c6d, 0xfff6f45e, 0xfff68c84, 0xfff62500,
+    0xfff5bde8, 0xfff5575a, 0xfff4f179, 0xfff48c64,
+    0xfff42810, 0xfff3c488, 0xfff361d7, 0xfff30008,
+    0xfff29f3a, 0xfff23f78, 0xfff1e0d8, 0xfff1835b,
+    0xfff1272a, 0xfff0cc46, 0xfff072cf, 0xfff01ad0,
+    0xffefc469, 0xffef6fa4, 0xffef1ca3, 0xffeecb7a,
+    0xffee7c1f, 0xffee2eb2, 0xffede33d, 0xffed99c1,
+    0xffed5249, 0xffed0cde, 0xffecc98d, 0xffec8849,
+    0xffec4934, 0xffec0c38, 0xffebd175, 0xffeb98eb,
+    0xffeb62a4, 0xffeb2ead, 0xffeafd19, 0xffeacdea,
+    0xffeaa129, 0xffea76cc, 0xffea4ef4, 0xffea299f,
+    0xffea06e5, 0xffe9e6ce, 0xffe9c97d, 0xffe9aebb,
+    0xffe99651, 0xffe97fd6, 0xffe96ad3, 0xffe95711,
+    0xffe9447d, 0xffe93315, 0xffe922ce, 0xffe913a0,
+    0xffe90588, 0xffe8f887, 0xffe8ec93, 0xffe8e1c1,
+    0xffe8d806, 0xffe8cf77, 0xffe8c816, 0xffe8c1eb,
+    0xffe8bd03, 0xffe8b967, 0xffe8b72e, 0xffe8b64d,
+    0xffe8b6d8, 0xffe8b8dc, 0xffe8bc6c, 0xffe8c18a,
+    0xffe8c840, 0xffe8d0a4, 0xffe8daca, 0xffe8e69e,
+    0xffe8f42a, 0xffe9035a, 0xffe9142b, 0xffe926a0,
+    0xffe93ab7, 0xffe95066, 0xffe967b8, 0xffe980ad,
+    0xffe99b3a, 0xffe9b754, 0xffe9d511, 0xffe9f45b,
+    0xffea1532, 0xffea3797, 0xffea5b89, 0xffea8108,
+    0xffeaa7ff, 0xffead079, 0xffeafa55, 0xffeb259e,
+    0xffeb5254, 0xffeb8061, 0xffebafdc, 0xffebe0ae,
+    0xffec12ce, 0xffec462f, 0xffec7add, 0xffecb0a3,
+    0xffece774, 0xffed1f32, 0xffed57a7, 0xffed90b2,
+    0xffedca48, 0xffee042a, 0xffee3e57, 0xffee788e,
+};
+
 const DECLARE_ALIGNED(32, float, ff_aac_eld_window_480)[1800] = {
      0.00101191,  0.00440397,  0.00718669,  0.01072130,
      0.01459757,  0.01875954,  0.02308987,  0.02751541,
@@ -2219,3 +2827,456 @@ const DECLARE_ALIGNED(32, float, ff_aac_eld_window_480)[1800] = {
     -0.00115988, -0.00114605, -0.00113200, -0.00111778,
     -0.00110343, -0.00108898, -0.00107448, -0.00105995,
 };
+
+const DECLARE_ALIGNED(32, int, ff_aac_eld_window_480_fixed)[1800] = {
+    0x00109442, 0x00482797, 0x0075bf2a, 0x00afa864,
+    0x00ef2aa5, 0x01335b36, 0x017a4df0, 0x01c2cffe,
+    0x020bfb4c, 0x0254fd74, 0x029d557c, 0x02e50574,
+    0x032c41a8, 0x03732c08, 0x03b9cb88, 0x040032e8,
+    0x044686f0, 0x048cd578, 0x04d30738, 0x05190500,
+    0x055ec210, 0x05a44750, 0x05e9aeb8, 0x062f0c80,
+    0x067477a0, 0x06ba1ac0, 0x07001998, 0x074680e0,
+    0x078d5ec0, 0x07d4d038, 0x081cf8f0, 0x0865f8b0,
+    0x08afe0e0, 0x08fab150, 0x09466cd0, 0x09931910,
+    0x09e0adb0, 0x0a2f1640, 0x0a7e43f0, 0x0ace2960,
+    0x0b1eb180, 0x0b6fc4b0, 0x0bc15050, 0x0c134710,
+    0x0c65a420, 0x0cb86340, 0x0d0b7df0, 0x0d5ef450,
+    0x0db2cb60, 0x0e070180, 0x0e5b91f0, 0x0eb07f20,
+    0x0f05d0a0, 0x0f5b8920, 0x0fb1a950, 0x10082e40,
+    0x105f1400, 0x10b65820, 0x110df780, 0x1165f120,
+    0x11be43e0, 0x1216eea0, 0x126feac0, 0x12c92b00,
+    0x1322a620, 0x137c55c0, 0x13d61ae0, 0x142fc940,
+    0x148949e0, 0x14e28da0, 0x153b9a80, 0x15947640,
+    0x15ed1840, 0x16458660, 0x169deb20, 0x16f663c0,
+    0x174ef8c0, 0x17a7a120, 0x180041c0, 0x1858d000,
+    0x18b14940, 0x1909a140, 0x1961c820, 0x19b9b620,
+    0x1a116480, 0x1a68c1a0, 0x1abfbd00, 0x1b164f60,
+    0x1b6c7580, 0x1bc23120, 0x1c1780e0, 0x1c6c5d00,
+    0x1cc0dbe0, 0x1d1532a0, 0x1d697660, 0x1dbdac20,
+    0x1e11b280, 0x1e655b80, 0x1eb89e80, 0x1f0b7720,
+    0x1f5dd680, 0x1fafaec0, 0x2000fb00, 0x2051c340,
+    0x20a22ac0, 0x20f24580, 0x214213c0, 0x21919140,
+    0x21e0b300, 0x222f7580, 0x227dd900, 0x22cbd880,
+    0x23196ec0, 0x23669b00, 0x23b35d80, 0x23ffb6c0,
+    0x244ba7c0, 0x249731c0, 0x24e25700, 0x252d1940,
+    0x2594ae40, 0x25deea40, 0x2628bd00, 0x26722680,
+    0x26bb2740, 0x2703bf40, 0x274beec0, 0x2793b600,
+    0x27db1500, 0x28220c00, 0x28689b80, 0x28aec4c0,
+    0x28f48800, 0x2939e680, 0x297ee080, 0x29c37600,
+    0x2a07a740, 0x2a4b74c0, 0x2a8ede80, 0x2ad1e500,
+    0x2b148880, 0x2b56c940, 0x2b98a740, 0x2bda2240,
+    0x2c1b3a80, 0x2c5bef80, 0x2c9c4100, 0x2cdc2e80,
+    0x2d1bb800, 0x2d5adc80, 0x2d999b80, 0x2dd7f500,
+    0x2e15e800, 0x2e537400, 0x2e9098c0, 0x2ecd5540,
+    0x2f09a900, 0x2f4592c0, 0x2f811140, 0x2fbc2340,
+    0x2ff6c7c0, 0x3030fe80, 0x306ac6c0, 0x30a41f80,
+    0x30dd07c0, 0x31157dc0, 0x314d7fc0, 0x31850c80,
+    0x31bc22c0, 0x31f2c1c0, 0x3228e840, 0x325e9540,
+    0x3293c7c0, 0x32c87e40, 0x32fcb800, 0x33307340,
+    0x3363aec0, 0x33966940, 0x33c8a140, 0x33fa5580,
+    0x342b84c0, 0x345c2dc0, 0x348c4f80, 0x34bbe900,
+    0x34eaf9c0, 0x35198080, 0x35477d00, 0x3574ee40,
+    0x35a1d340, 0x35ce2bc0, 0x35f9f6c0, 0x36253380,
+    0x364fe180, 0x367a0040, 0x36a38f80, 0x36cc8ec0,
+    0x36f4fe80, 0x371cde80, 0x37442e80, 0x376aef00,
+    0x37912000, 0x37b6c200, 0x37dbd600, 0x38005d00,
+    0x38245840, 0x3847c880, 0x386aaf80, 0x388d0e80,
+    0x38aee700, 0x38d03bc0, 0x38f11000, 0x39116700,
+    0x39314440, 0x3950ab00, 0x396f9e80, 0x398e22c0,
+    0x39ac3c40, 0x39c9f280, 0x39e74cc0, 0x3a045280,
+    0x3a210b40, 0x3a3d7ec0, 0x3a59b480, 0x3a75b480,
+    0x3a918900, 0x3aad3cc0, 0x3ac8db00, 0x3ae46bc0,
+    0x3afff080, 0x3b1b6840, 0x3b36d2c0, 0x3b521980,
+    0x3b6d0780, 0x3b876400, 0x3ba0f4c0, 0x3bb96740,
+    0x3bd03dc0, 0x3be56580, 0x3bf6dec0, 0x3c0c6140,
+    0x3c15a9c0, 0x3c1a5780, 0x3c1fd0c0, 0x3c25edc0,
+    0x3c2c78c0, 0x3c333880, 0x3c39f3c0, 0x3c409100,
+    0x3c471d00, 0x3c4da780, 0x3c543f40, 0x3c5ae880,
+    0x3c619f00, 0x3c685f00, 0x3c6f25c0, 0x3c75f280,
+    0x3c7cc6c0, 0x3c83a2c0, 0x3c8a87c0, 0x3c9175c0,
+    0x3c986d00, 0x3c9f6e00, 0x3ca67880, 0x3cad8c40,
+    0x3cb4a980, 0x3cbbd000, 0x3cc2ffc0, 0x3cca3940,
+    0x3cd17d40, 0x3cd8cb80, 0x3ce02480, 0x3ce78740,
+    0x3ceef3c0, 0x3cf66a00, 0x3cfdea00, 0x3d0574c0,
+    0x3d0d0a40, 0x3d14ab40, 0x3d1c5700, 0x3d240d00,
+    0x3d2bcd40, 0x3d3397c0, 0x3d3b6cc0, 0x3d434d00,
+    0x3d4b38c0, 0x3d532fc0, 0x3d5b3180, 0x3d633dc0,
+    0x3d6b53c0, 0x3d737400, 0x3d7b9f00, 0x3d83d540,
+    0x3d8c1680, 0x3d946200, 0x3d9cb780, 0x3da51680,
+    0x3dad7f00, 0x3db5f140, 0x3dbe6dc0, 0x3dc6f480,
+    0x3dcf8540, 0x3dd81fc0, 0x3de0c300, 0x3de96ec0,
+    0x3df22340, 0x3dfae0c0, 0x3e03a800, 0x3e0c7840,
+    0x3e155180, 0x3e1e32c0, 0x3e271bc0, 0x3e300c00,
+    0x3e390400, 0x3e420400, 0x3e4b0c40, 0x3e541c80,
+    0x3e5d33c0, 0x3e6651c0, 0x3e6f7580, 0x3e789fc0,
+    0x3e81d080, 0x3e8b0880, 0x3e944700, 0x3e9d8c00,
+    0x3ea6d680, 0x3eb02600, 0x3eb97a80, 0x3ec2d400,
+    0x3ecc3340, 0x3ed59880, 0x3edf0300, 0x3ee87280,
+    0x3ef1e600, 0x3efb5d40, 0x3f04d880, 0x3f0e5840,
+    0x3f17dcc0, 0x3f216600, 0x3f2af340, 0x3f348440,
+    0x3f3e1840, 0x3f47af40, 0x3f514a00, 0x3f5ae840,
+    0x3f648b00, 0x3f6e3140, 0x3f77db00, 0x3f818740,
+    0x3f8b3600, 0x3f94e780, 0x3f9e9c40, 0x3fa85480,
+    0x3fb21080, 0x3fbbcfc0, 0x3fc59200, 0x3fcf56c0,
+    0x3fd91dc0, 0x3fe2e640, 0x3fecb040, 0x3ff67b40,
+    0x40098600, 0x40135580, 0x401d2700, 0x4026fa00,
+    0x4030ce80, 0x403aa380, 0x40447900, 0x404e4f00,
+    0x40582400, 0x4061f900, 0x406bcd00, 0x4075a080,
+    0x407f7480, 0x40894900, 0x40931e00, 0x409cf280,
+    0x40a6c600, 0x40b09800, 0x40ba6980, 0x40c43a80,
+    0x40ce0b00, 0x40d7db00, 0x40e1ab00, 0x40eb7980,
+    0x40f54600, 0x40ff1080, 0x4108d980, 0x4112a100,
+    0x411c6800, 0x41262d80, 0x412ff080, 0x4139b180,
+    0x41436e80, 0x414d2980, 0x4156e100, 0x41609700,
+    0x416a4a80, 0x4173fb00, 0x417da800, 0x41875000,
+    0x4190f400, 0x419a9400, 0x41a43000, 0x41adc880,
+    0x41b75d00, 0x41c0ec80, 0x41ca7700, 0x41d3fb00,
+    0x41dd7980, 0x41e6f280, 0x41f06600, 0x41f9d480,
+    0x42033d00, 0x420c9f00, 0x4215f980, 0x421f4d00,
+    0x42289900, 0x4231de80, 0x423b1d00, 0x42445500,
+    0x424d8500, 0x4256ad00, 0x425fcc80, 0x4268e380,
+    0x4271f200, 0x427af900, 0x4283f880, 0x428cef80,
+    0x4295de00, 0x429ec280, 0x42a79d80, 0x42b06f00,
+    0x42b93800, 0x42c1f800, 0x42caaf80, 0x42d35d80,
+    0x42dc0100, 0x42e49b00, 0x42ed2a80, 0x42f5b080,
+    0x42fe2d80, 0x4306a180, 0x430f0c80, 0x43176d80,
+    0x431fc480, 0x43281100, 0x43305400, 0x43388e80,
+    0x4340c000, 0x4348e900, 0x43510900, 0x43591f00,
+    0x43612b80, 0x43692f00, 0x43712900, 0x43791a80,
+    0x43810380, 0x4388e400, 0x4390bc00, 0x43988b00,
+    0x43a05180, 0x43a80f00, 0x43afc480, 0x43b77180,
+    0x43bf1780, 0x43c6b700, 0x43ce5100, 0x43d5e580,
+    0x43dd7100, 0x43e4ef80, 0x43ec5b80, 0x43f3ba80,
+    0x43fb1c80, 0x44029400, 0x440a2e80, 0x4411d080,
+    0x44193800, 0x44202480, 0x44265880, 0x442ba780,
+    0x442d8680, 0x4428a500, 0x44241380, 0x441ccb00,
+    0x44140100, 0x440a1200, 0x43ff7280, 0x43f46980,
+    0x43e93200, 0x43ddff00, 0x43d2dc80, 0x43c7ac00,
+    0x43bc4900, 0x43b09400, 0x43a47d80, 0x4397fd80,
+    0x438b0780, 0x437d9b80, 0x436fd380, 0x4361cd80,
+    0x4353a800, 0x43457500, 0x43373c80, 0x43290500,
+    0x431ad400, 0x430ca280, 0x42fe6000, 0x42f00080,
+    0x42e17380, 0x42d29e00, 0x42c35d80, 0x42b39200,
+    0x42a32080, 0x4291fc00, 0x42801900, 0x426d6d80,
+    0x4259f680, 0x4245bd00, 0x4230ca80, 0x421b2900,
+    0x4204e800, 0x41ee1d00, 0x41d6dd80, 0x41bf3c80,
+    0x41a74680, 0x418f0680, 0x41768800, 0x415dd100,
+    0x4144e400, 0x412bbf80, 0x41126400, 0x40f8cc00,
+    0x40deea00, 0x40c4b100, 0x40aa1400, 0x408f0800,
+    0x40738380, 0x40577d80, 0x403aeb80, 0x401dc180,
+    0x3ffff240, 0x3fe170c0, 0x3fc232c0, 0x3fa23680,
+    0x3f817c40, 0x3f6002c0, 0x3f3ddec0, 0x3f1b4180,
+    0x3ef85d40, 0x3ed56340, 0x3eb27240, 0x3e8f9c40,
+    0x3e6cf400, 0x3e4a81c0, 0x3e282140, 0x3e059980,
+    0x3de2b280, 0x3dbf4100, 0x3d9b3640, 0x3d768b00,
+    0x3d513640, 0x3d2b3840, 0x3d049b80, 0x3cdd6b40,
+    0x3cb5b400, 0x3c8d8f40, 0x3c652080, 0x3c3c8c40,
+    0x3c13f480, 0x3beb7580, 0x3bc327c0, 0x3b9b2680,
+    0x3b737000, 0x3b4bc580, 0x3b23d740, 0x3afb5640,
+    0x3ad21c40, 0x3aa83780, 0x3a7dbc40, 0x3a52bf80,
+    0x3a276600, 0x39fbe0c0, 0x39d06140, 0x39a50ec0,
+    0x3979e300, 0x394ebf40, 0x392386c0, 0x38f82280,
+    0x38cc89c0, 0x38a0b7c0, 0x3874a740, 0x38485840,
+    0x381bd1c0, 0x37ef1b40, 0x37c23cc0, 0x37953dc0,
+    0x376825c0, 0x373afc80, 0x370dc980, 0x36e09440,
+    0x36b41dc0, 0x36862100, 0x3657e480, 0x36297240,
+    0x35fad380, 0x35cc1200, 0x359d36c0, 0x356e4b40,
+    0x353f5880, 0x35106780, 0x34e17780, 0x34b28240,
+    0x34838040, 0x345466c0, 0x34251940, 0x33f57280,
+    0x33c54bc0, 0x33949840, 0x33638380, 0x33324980,
+    0x33012500, 0x32d04480, 0x329fc7c0, 0x326fcbc0,
+    0x324068c0, 0x32116fc0, 0x31e27600, 0x31b30fc0,
+    0x3182e300, 0x3151e240, 0x312029c0, 0x30edd080,
+    0x30baf700, 0x3087cd00, 0x30548600, 0x30215680,
+    0x2fee65c0, 0x2fbbca40, 0x2f899980, 0x2f57e6c0,
+    0x2f26b540, 0x2ef5f980, 0x2ec5aa00, 0x2e95afc0,
+    0x2e65c180, 0x2e357b40, 0x2e047840, 0x2dd27380,
+    0x2d9f6c40, 0x2d6b7780, 0x2d36a6c0, 0x2d012940,
+    0x2ccb5680, 0x2c958a00, 0x2c601b80, 0x2c2b3640,
+    0x2bf6dfc0, 0x2bc31ec0, 0x2b8ff500, 0x2b5d5540,
+    0x2b2b2a00, 0x2af95e80, 0x2ac7dd80, 0x2a968f80,
+    0x2a655d40, 0x2a342f00, 0x2a02e8c0, 0x29d16700,
+    0x299f8640, 0x296d2380, 0x293a2740, 0x29068400,
+    0x28d22b40, 0x289d1540, 0x28675280, 0x28310180,
+    0x27fa3f00, 0x27c32f80, 0x278c08c0, 0x275505c0,
+    0x271e60c0, 0x26e84b00, 0x26b2e880, 0x267e5cc0,
+    0x264ac940, 0x26183a40, 0x25e6aa80, 0x25b615c0,
+    0x25866b80, 0x25576b40, 0x2528ba00, 0x24f9ffc0,
+    0x24cadfc0, 0x249af540, 0x2469da80, 0x24372780,
+    0x2402b800, 0x23ccbfc0, 0x23957cc0, 0x235d3140,
+    0x23245200, 0x22eb8000, 0x22b35cc0, 0x227c7940,
+    0x22471d40, 0x22136840, 0x21e18240, 0x21b15d80,
+    0x21827dc0, 0x21544600, 0x21261b00, 0x20f78600,
+    0x20c83e00, 0x20980000, 0x20668e00, 0x2033f300,
+    0x20007400, 0x1fcc64e0, 0x1f97d120, 0x1f642320,
+    0x1f2f49e0, 0x1efaa840, 0x1ec73580, 0x1e94d880,
+    0x1e636120, 0x1e32a160, 0x1e025ba0, 0x1dd24300,
+    0x1da20e60, 0x1d717940, 0x1d407560, 0x1d0f2040,
+    0x1cdd95c0, 0x1cabf500, 0x1c7a6940, 0x1c492340,
+    0x1c185680, 0x1be818c0, 0x1bb83f60, 0x1b888d20,
+    0x1b58c640, 0x1b28c240, 0x1af871e0, 0x1ac7c960,
+    0x1a96bf00, 0x1a656b60, 0x1a340360, 0x1a02bd20,
+    0x19d1c6c0, 0x19a12f40, 0x1970f480, 0x19411640,
+    0x19119000, 0x18e255a0, 0x18b358a0, 0x18848b20,
+    0x1855e040, 0x18274e00, 0x17f8c9e0, 0x17ca4a80,
+    0x179bce40, 0x176d5a60, 0x173ef400, 0x17109fe0,
+    0x16e25f60, 0x16b43240, 0x16861880, 0x16581220,
+    0x162a20c0, 0x15fc4620, 0x15ce8420, 0x15a0dca0,
+    0x157351c0, 0x1545e580, 0x151899a0, 0x14eb6ec0,
+    0x14be63a0, 0x14917a00, 0x14649ae0, 0x14377060,
+    0x1409d0c0, 0x13dbbb20, 0x13ad58e0, 0x137f0160,
+    0x1350cc80, 0x1322b8c0, 0x12f4ca60, 0x12c704e0,
+    0x129968a0, 0x126bf5c0, 0x123eade0, 0x12119300,
+    0x11e4a660, 0x11b7e860, 0x118b5940, 0x115ef8a0,
+    0x1132c600, 0x1106c1a0, 0x10daecc0, 0x10af4900,
+    0x1083d7a0, 0x10589c00, 0x102d9a00, 0x1002d1e0,
+    0x0fd842c0, 0x0fadde80, 0x0f839a50, 0x0f597700,
+    0x0f2f76e0, 0x0f05a170, 0x0edbf9c0, 0x0eb27f30,
+    0x0e8930d0, 0x0e600d70, 0x0e371550, 0x0e0e4950,
+    0x0de5ab50, 0x0dbd3d20, 0x0d94fe10, 0x0d6cecb0,
+    0x0d450220, 0x0d1d38f0, 0x0cf59130, 0x0cce0c30,
+    0x0ca6af10, 0x0c7f7b80, 0x0c587010, 0x0c318960,
+    0x0c0ac200, 0x0be418d0, 0x0bbd8da0, 0x0b9724e0,
+    0x0b70e6c0, 0x0b4ad970, 0x0b2502f0, 0x0aff6930,
+    0x0ada1250, 0x0ab50430, 0x0a9044d0, 0x0a6bda30,
+    0x0a3bedf0, 0x0a18be40, 0x09f5e530, 0x09d35cf0,
+    0x09b11ff0, 0x098f2890, 0x096d7120, 0x094bf400,
+    0x092aab80, 0x09099240, 0x08e8a620, 0x08c7e850,
+    0x08a75990, 0x0886fae0, 0x0866ccf0, 0x0846d070,
+    0x08270610, 0x08076e70, 0x07e80ac8, 0x07c8dc60,
+    0x07a9e440, 0x078b2348, 0x076c99d0, 0x074e4818,
+    0x07302e50, 0x07124d18, 0x06f4a530, 0x06d73778,
+    0x06ba0488, 0x069d0c88, 0x06804f68, 0x0663cce0,
+    0x06478528, 0x062b78a0, 0x060fa7e8, 0x05f413b8,
+    0x05d8bc38, 0x05bda128, 0x05a2c258, 0x05881f60,
+    0x056db888, 0x05538e60, 0x0539a170, 0x051ff218,
+    0x05068040, 0x04ed4b90, 0x04d45398, 0x04bb9820,
+    0x04a31988, 0x048ad860, 0x0472d528, 0x045b0ff0,
+    0x04438860, 0x042c3de8, 0x04153040, 0x03fe5f4c,
+    0x03e7cb98, 0x03d17580, 0x03bb5d64, 0x03a582e8,
+    0x038fe588, 0x037a8494, 0x03655fcc, 0x03507768,
+    0x033bcbb4, 0x03275d28, 0x03132bc0, 0x02ff370c,
+    0x02eb7e94, 0x02d801e8, 0x02c4c11c, 0x02b1bcbc,
+    0x029ef578, 0x028c6ba8, 0x027a1f20, 0x02680f54,
+    0x02563bac, 0x0244a3c8, 0x023347a0, 0x02222730,
+    0x0211429c, 0x02009938, 0x01f02974, 0x01dff1ae,
+    0x01cff058, 0x01c024c8, 0x01b08ef4, 0x01a12eda,
+    0x019204b0, 0x01831138, 0x01745588, 0x0165d2c2,
+    0x01578a96, 0x01497ffc, 0x013bb670, 0x012e3160,
+    0x0120f146, 0x0113f27c, 0x0107310c, 0x00faa909,
+    0x00ee57a1, 0x00e23b09, 0x00d6515b, 0x00ca9977,
+    0x00bf1509, 0x00b3c74d, 0x00a8b388, 0x009ddb3d,
+    0x00933bf2, 0x0088d22c, 0x007e9a70, 0x0074935a,
+    0x006abe70, 0x00611d5c, 0x0057b1f8, 0x004e7e73,
+    0x0045859b, 0x003cca96, 0x00344f32, 0x002c1074,
+    0x00240873, 0x001c31ba, 0x0014863f, 0x000cfe8b,
+    0x00059307, 0xfffe3b9a, 0xfff6f718, 0xffefcd4d,
+    0xffe8c6f4, 0xffe1ed10, 0xffdb4c57, 0xffd4f484,
+    0xffcef5dc, 0xffc95d0c, 0xffc4284e, 0xffbf4e14,
+    0xffbac5ae, 0xffb68360, 0xffb27548, 0xffae87be,
+    0xffaaa733, 0xffa6c67e, 0xffa2e141, 0xff9ef40c,
+    0xff9afc25, 0xff970058, 0xff930f7c, 0xff8f3857,
+    0xff8b8900, 0xff880bfe, 0xff84c9ea, 0xff81cbbd,
+    0xff7f17ad, 0xff7cadc6, 0xff7a8c4e, 0xff78b1cd,
+    0xff7719f3, 0xff75bd06, 0xff7492a4, 0xff7392bf,
+    0xff72b600, 0xff71f5c6, 0xff714b72, 0xff70b0ed,
+    0xff702232, 0xff6f9c90, 0xff6f1cee, 0xff6ea21f,
+    0xff6e2e9c, 0xff6dc617, 0xff6d6c09, 0xff6d2425,
+    0xff6cf267, 0xff6cdaca, 0xff6ce155, 0xff6d0983,
+    0xff6d56bb, 0xff6dcc4c, 0xff6e6cd0, 0xff6f3832,
+    0xff702cc4, 0xff71492e, 0xff728ae2, 0xff73ed63,
+    0xff756b7c, 0xff77001c, 0xff78a5d9, 0xff7a5693,
+    0xff7c0c40, 0xff7dc141, 0xff7f74aa, 0xff81298b,
+    0xff82e2de, 0xff84a3de, 0xff8670bd, 0xff884e42,
+    0xff8a410c, 0xff8c4c7f, 0xff8e70fc, 0xff90ae18,
+    0xff93037e, 0xff956f12, 0xff97ec86, 0xff9a7724,
+    0xff9d0a9d, 0xff9fa3ea, 0xffa2417e, 0xffa4e1ac,
+    0xffa78332, 0xffaa265a, 0xffaccc26, 0xffaf758e,
+    0xffb223d4, 0xffb4d906, 0xffb79726, 0xffba604e,
+    0xffbd349e, 0xffc011a8, 0xffc2f4d2, 0xffc5db82,
+    0xffc8c45f, 0xffcbaed5, 0xffce9a6d, 0xffd186c6,
+    0xffd473aa, 0xffd760e5, 0xffda4e55, 0xffdd3bd0,
+    0xffe0292b, 0xffe31645, 0xffe602ff, 0xffe8eef7,
+    0xffebd978, 0xffeec1bf, 0xfff1a72c, 0xfff488fe,
+    0xfff76689, 0xfffa3f2c, 0xfffd1245, 0xffffdf33,
+    0x000020ac, 0x0002e66f, 0x0005a937, 0x00086839,
+    0x000b22b3, 0x000dd7da, 0x001086ec, 0x00132f3c,
+    0x0015d001, 0x00186897, 0x001af849, 0x001d7eb6,
+    0x001ffbbe, 0x00226f41, 0x0024d8e8, 0x00273874,
+    0x00298d82, 0x002bd7aa, 0x002e16d4, 0x00304af6,
+    0x00327406, 0x00349203, 0x0036a416, 0x0038a893,
+    0x003a9da0, 0x003c8170, 0x003e53b8, 0x0040159a,
+    0x0041c816, 0x00436c92, 0x0045042c, 0x00468ff2,
+    0x00481106, 0x004987fe, 0x004af466, 0x004c5599,
+    0x004daae4, 0x004ef28c, 0x005029c4, 0x00514d9a,
+    0x00525b57, 0x005351f7, 0x00543190, 0x0054fa43,
+    0x0055ac2f, 0x00564938, 0x0056d3f7, 0x00574f3c,
+    0x0057bdd7, 0x00582260, 0x00587f28, 0x0058d6b1,
+    0x0059293c, 0x0059741a, 0x0059b472, 0x0059e73c,
+    0x005a0976, 0x005a1870, 0x005a116e, 0x0059f224,
+    0x0059b964, 0x005966ce, 0x0058f9e2, 0x005872e8,
+    0x0057d407, 0x00571f82, 0x005657b0, 0x00557ecd,
+    0x00549731, 0x0053a34b, 0x0052a56a, 0x00519fc6,
+    0x00509482, 0x004f85a4, 0x004e74ee, 0x004d6214,
+    0x004c4bd3, 0x004b314c, 0x004a1110, 0x0048e8c8,
+    0x0047b5f7, 0x00467626, 0x00452690, 0x0043c405,
+    0x00424b7f, 0x0040ba04, 0x003f0e53, 0x003d488b,
+    0x003b688c, 0x00396eb6, 0x00375dfb, 0x00353aaa,
+    0x003308ac, 0x0030ccb1, 0x002e8cf1, 0x002c4fd5,
+    0x002a1be8, 0x0027f486, 0x0025d90d, 0x0023c852,
+    0x0021c13b, 0x001fbf23, 0x001dbafc, 0x001badc6,
+    0x00199136, 0x00176150, 0x00151b86, 0x0012bcd1,
+    0x001044d1, 0x000db8d0, 0x000b1f43, 0x00087e89,
+    0x0005dbe2, 0x00033b1e, 0x00009fee, 0xfffe0d82,
+    0xfffb83cf, 0xfff90047, 0xfff6805a, 0xfff4019a,
+    0xfff18203, 0xffeeffb2, 0xffec78ba, 0xffe9ec4d,
+    0xffe75b4e, 0xffe4c71f, 0xffe23138, 0xffdf9ae6,
+    0xffdd0574, 0xffda723c, 0xffd7e24a, 0xffd55567,
+    0xffd2cabe, 0xffd04161, 0xffcdb890, 0xffcb306a,
+    0xffc8a95c, 0xffc62406, 0xffc3a140, 0xffc12188,
+    0xffbea542, 0xffbc2cc2, 0xffb9b7d2, 0xffb745f2,
+    0xffb4d6ac, 0xffb268fe, 0xffaffc72, 0xffad90e8,
+    0xffab263e, 0xffa8bcb8, 0xffa6547e, 0xffa3ed7b,
+    0xffa187ba, 0xff9f2351, 0xff9cc055, 0xff9a5ebc,
+    0xff97fe84, 0xff959f84, 0xff934146, 0xff90e37d,
+    0xff8e858a, 0xff8c26c0, 0xff89c69e, 0xff876483,
+    0xff84ffe4, 0xff82982b, 0xff802cb6, 0xff7dbccf,
+    0xff7b47b4, 0xff78ccd0, 0xff764b6c, 0xff73c2db,
+    0xff713227, 0xff6e9864, 0xff6bf470, 0xff694553,
+    0xff668a0d, 0xff63c1a6, 0xff60ec34, 0xff5e0e9e,
+    0xff5b30d3, 0xff585b8c, 0xff5595c9, 0xff52e1da,
+    0xff5040a0, 0xff4db31c, 0xff4b3a3b, 0xff48d67e,
+    0xff468850, 0xff445011, 0xff422ded, 0xff4021f9,
+    0xff3e2c56, 0xff3c4cf8, 0xff3a83df, 0xff38d0ec,
+    0xff3733c9, 0xff35ac14, 0xff343963, 0xff32db09,
+    0xff319066, 0xff305898, 0xff2f323d, 0xff2e1bb2,
+    0xff2d1369, 0xff2c18f8, 0xff2b2d2a, 0xff2a50e1,
+    0xff2984f4, 0xff28c978, 0xff281e01, 0xff278245,
+    0xff26f5c3, 0xff26785a, 0xff2609bf, 0xff25a9c8,
+    0xff255814, 0xff2513f6, 0xff24dcc4, 0xff24b1a6,
+    0xff2492b1, 0xff248093, 0xff247c0b, 0xff2485c6,
+    0xff249daf, 0xff24c359, 0xff24f639, 0xff253605,
+    0xff258312, 0xff25ddd5, 0xff2646e7, 0xff26be25,
+    0xff274264, 0xff27d1f6, 0xff286b19, 0xff290c13,
+    0xff29b30d, 0xff2a5e38, 0xff2b0bbd, 0xff2bb9a2,
+    0xff29a9d2, 0xff2a53dc, 0xff2b0a5a, 0xff2bcd43,
+    0xff2c9c76, 0xff2d7808, 0xff2e5ffa, 0xff2f544c,
+    0xff305528, 0xff316299, 0xff327ce0, 0xff33a432,
+    0xff34d8ba, 0xff361a8e, 0xff3768f8, 0xff38c2f5,
+    0xff3a2784, 0xff3b9623, 0xff3d0ef4, 0xff3e9277,
+    0xff4020ed, 0xff41ba14, 0xff435ccc, 0xff4507fd,
+    0xff46ba84, 0xff4873ac, 0xff4a32ea, 0xff4bf7bb,
+    0xff4dc17f, 0xff4f8fa0, 0xff516167, 0xff53361d,
+    0xff550d79, 0xff56e7ee, 0xff58c5ff, 0xff5aa84d,
+    0xff5c8e41, 0xff5e75e2, 0xff605d4d, 0xff6242b6,
+    0xff6424b8, 0xff66023d, 0xff67da44, 0xff69abd6,
+    0xff6b7646, 0xff6d38e8, 0xff6ef348, 0xff70a4ce,
+    0xff724d0f, 0xff73eb95, 0xff757fff, 0xff770a2d,
+    0xff788a20, 0xff79fff6, 0xff7b6be7, 0xff7cce52,
+    0xff7e27e4, 0xff7f78fc, 0xff80c38a, 0xff820e98,
+    0xff836378, 0xff84caaa, 0xff864990, 0xff87dff4,
+    0xff898c30, 0xff8b4cda, 0xff8d207a, 0xff8f05cc,
+    0xff90fb9b, 0xff930098, 0xff95138e, 0xff97332d,
+    0xff995e2a, 0xff9b934e, 0xff9dd18c, 0xffa017e3,
+    0xffa26550, 0xffa4b8e7, 0xffa711a8, 0xffa96eae,
+    0xffabcefc, 0xffae31cc, 0xffb09680, 0xffb2fc82,
+    0xffb5635a, 0xffb7ca52, 0xffba30a8, 0xffbc95a8,
+    0xffbef8a4, 0xffc158d0, 0xffc3b557, 0xffc60d6b,
+    0xffc86041, 0xffcaacb7, 0xffccf1cb, 0xffcf2e5c,
+    0xffd161e8, 0xffd38c8f, 0xffd5ae88, 0xffd7c808,
+    0xffd9d925, 0xffdbe1c8, 0xffdde1f3, 0xffdfd964,
+    0xffe1c79b, 0xffe3abcc, 0xffe5852a, 0xffe75341,
+    0xffe9162f, 0xffeace55, 0xffec7c15, 0xffee1f63,
+    0xffefb7e9, 0xfff1453d, 0xfff2c6fd, 0xfff43ca8,
+    0xfff5a5d4, 0xfff701ea, 0xfff850b4, 0xfff99288,
+    0xfffac853, 0xfffbf2d5, 0xfffd12e6, 0xfffe2991,
+    0xffff37e4, 0x00003eea, 0x00013ec4, 0x00023646,
+    0x0003244d, 0x00040797, 0x0004de8c, 0x0005a734,
+    0x00065fab, 0x0007068f, 0x00079c82, 0x000822fa,
+    0x00089b70, 0x000907a6, 0x00096a01, 0x0009c506,
+    0x000a1b37, 0x000a6e18, 0x000abe1f, 0x000b0bac,
+    0x000b5701, 0x000b9f3b, 0x000be2c2, 0x000c1fff,
+    0x000c5599, 0x000c829a, 0x000ca661, 0x000cc058,
+    0x000cd028, 0x000cd63d, 0x000cd317, 0x000cc739,
+    0x000cb36d, 0x000c98c0, 0x000c7833, 0x000c52df,
+    0x000c2984, 0x000bfcf9, 0x000bcdea, 0x000b9cf7,
+    0x000b6a97, 0x000b3700, 0x000b029d, 0x000acd79,
+    0x000a977e, 0x000a6076, 0x000a2838, 0x0009eea1,
+    0x0009b37d, 0x000976c2, 0x0009384e, 0x0008f816,
+    0x0008b612, 0x0008724a, 0x00082cd5, 0x0007e5e8,
+    0x00079dce, 0x000754de, 0x00070b62, 0x0006c1c6,
+    0x0006786a, 0x00062fba, 0x0005e801, 0x0005a1a0,
+    0x00055ce1, 0x000519fb, 0x0004d8f8, 0x000499b8,
+    0x00045c30, 0x00042040, 0x0003e5c8, 0x0003acb3,
+    0x000374df, 0x00033e59, 0x00030934, 0x0002d57d,
+    0x0002a348, 0x000272b6, 0x000243f2, 0x00021711,
+    0x0001ec3e, 0x0001c37a, 0x00019cc3, 0x00017830,
+    0x000155a0, 0x00013514, 0x0001168b, 0x0000f9e6,
+    0x0000df23, 0x0000c62e, 0x0000aef2, 0x00009978,
+    0x000085a1, 0x0000736d, 0x000062dc, 0x000053d8,
+    0x0000466c, 0x00003a62, 0x00002fd1, 0x00002681,
+    0x00001e73, 0x00001792, 0x000011c9, 0x00000cf6,
+    0x0000091a, 0x000005ff, 0x000003b1, 0x00000203,
+    0x000000d7, 0x0000002b, 0xffffffd5, 0xffffffc0,
+    0xffffffd5, 0x00000000, 0x00000015, 0x00000000,
+    0x00000000, 0x00000015, 0x00000000, 0xffffffd5,
+    0xffffffca, 0xffffffd5, 0x0000002b, 0x000000cc,
+    0x000001e3, 0x0000037b, 0x0000059f, 0x0000086e,
+    0x00000bf4, 0x0000103b, 0x00001564, 0x00001b6e,
+    0x0000226f, 0x00002a68, 0x00003377, 0x00003d93,
+    0x000048c5, 0x00005525, 0x000062a6, 0x00007155,
+    0x0000812f, 0x00009237, 0x0000a455, 0x0000b7ab,
+    0x0000cc18, 0x0000e1bd, 0x0000f878, 0x0001106c,
+    0x00012981, 0x000143c2, 0x00015f30, 0x00017bb6,
+    0x00019948, 0x0001b7e6, 0x0001d771, 0x0001f7bc,
+    0x000218b4, 0x00023a42, 0x00025c3b, 0x00027ea0,
+    0x0002a150, 0x0002c440, 0x0002e771, 0x00030aed,
+    0x00032eb4, 0x000352db, 0x00037759, 0x00039c4c,
+    0x0003c1ac, 0x0003e74b, 0x00040d00, 0x0004329f,
+    0x000457de, 0x00047c9c, 0x0004a083, 0x0004c35e,
+    0x0004e502, 0x00050543, 0x000523ec, 0x000540e7,
+    0x00055c2b, 0x000575c0, 0x00058da9, 0x0005a3e4,
+    0x0005b886, 0x0005cbb1, 0x0005dd65, 0x0005edcb,
+    0x0005fcfa, 0x00060afc, 0x00061808, 0x000623fc,
+    0x00062ec3, 0x00063849, 0x0006404b, 0x000646ac,
+    0x00064b13, 0x00064d37, 0x00064cd6, 0x0006497b,
+    0x000642c5, 0x0006385e, 0x000629f0, 0x00061766,
+    0x000600a0, 0x0005e57d, 0x0005c63e, 0x0005a322,
+    0x00057c97, 0x00055306, 0x00052711, 0x0004f96f,
+    0x0004caeb, 0x00049bfc, 0x00046c96, 0x00043cbb,
+    0x00040c3f, 0x0003daab, 0x0003a734, 0x000370f9,
+    0x0003372d, 0x0002f944, 0x0002b6d4, 0x00026f71,
+    0x000222fb, 0x0001d212, 0x00017d84, 0x00012630,
+    0x0000ccda, 0x00007200, 0x0000163b, 0xffffba15,
+    0xffff5da3, 0xffff0091, 0xfffea293, 0xfffe4367,
+    0xfffde2da, 0xfffd809f, 0xfffd1c81, 0xfffcb66a,
+    0xfffc4e90, 0xfffbe53e, 0xfffb7aa0, 0xfffb0f0a,
+    0xfffaa2c9, 0xfffa3612, 0xfff9c92f, 0xfff95c2d,
+    0xfff8eef4, 0xfff8817c, 0xfff813c3, 0xfff7a5d4,
+    0xfff737e5, 0xfff6ca17, 0xfff65c9e, 0xfff5efbc,
+    0xfff58390, 0xfff51830, 0xfff4adbc, 0xfff44435,
+    0xfff3db9a, 0xfff373d6, 0xfff30cfd, 0xfff2a71c,
+    0xfff24248, 0xfff1de9f, 0xfff17c44, 0xfff11b56,
+    0xfff0bbea, 0xfff05e17, 0xfff00206, 0xffefa7d9,
+    0xffef4f99, 0xffeef95d, 0xffeea53a, 0xffee533a,
+    0xffee035e, 0xffedb5b0, 0xffed6a3c, 0xffed20f5,
+    0xffecd9fe, 0xffec9555, 0xffec5305, 0xffec1319,
+    0xffebd591, 0xffeb9a83, 0xffeb61f9, 0xffeb2bfe,
+    0xffeaf89c, 0xffeac7ea, 0xffea99d2, 0xffea6e7e,
+    0xffea45ef, 0xffea203a, 0xffe9fda0, 0xffe9decc,
+    0xffe9c3de, 0xffe9ac56, 0xffe99789, 0xffe9845e,
+    0xffe97295, 0xffe96219, 0xffe952ea, 0xffe944f3,
+    0xffe93833, 0xffe92c9f, 0xffe92238, 0xffe918fe,
+    0xffe910fb, 0xffe90a3a, 0xffe904c6, 0xffe900a0,
+    0xffe8fddb, 0xffe8fc83, 0xffe8fca4, 0xffe8fe3c,
+    0xffe9016c, 0xffe9061e, 0xffe90c74, 0xffe9146c,
+    0xffe91e11, 0xffe929a5, 0xffe93731, 0xffe946c0,
+    0xffe95833, 0xffe96b7e, 0xffe98082, 0xffe9975e,
+    0xffe9affd, 0xffe9ca5e, 0xffe9e68e, 0xffea0481,
+    0xffea242b, 0xffea458e, 0xffea6894, 0xffea8d52,
+    0xffeab3c8, 0xffeadc0c, 0xffeb05fe, 0xffeb31a7,
+    0xffeb5ede, 0xffeb8da2, 0xffebbdf4, 0xffebefbd,
+    0xffec231f, 0xffec5802, 0xffec8e5e, 0xffecc61c,
+    0xffecff1c, 0xffed391e, 0xffed740c, 0xffedafb1,
+    0xffedebe1, 0xffee287d, 0xffee654e, 0xffeea23f,
+};
diff --git a/libavcodec/aactab.h b/libavcodec/aactab.h
index b1e9510..7cd8128 100644
--- a/libavcodec/aactab.h
+++ b/libavcodec/aactab.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,31 +32,130 @@
 
 #include "libavutil/mem.h"
 #include "aac.h"
-#include "aac_tablegen_decl.h"
 
 #include <stdint.h>
 
 /* NOTE:
- * Tables in this file are used by the AAC decoder and will be used by the AAC
- * encoder.
+ * Tables in this file are shared by the AAC decoders and encoder
  */
 
+extern float ff_aac_pow2sf_tab[428];
+extern float ff_aac_pow34sf_tab[428];
+
+static inline void ff_aac_tableinit(void)
+{
+    int i;
+
+    /* 2^(i/16) for 0 <= i <= 15 */
+    static const float exp2_lut[] = {
+        1.00000000000000000000,
+        1.04427378242741384032,
+        1.09050773266525765921,
+        1.13878863475669165370,
+        1.18920711500272106672,
+        1.24185781207348404859,
+        1.29683955465100966593,
+        1.35425554693689272830,
+        1.41421356237309504880,
+        1.47682614593949931139,
+        1.54221082540794082361,
+        1.61049033194925430818,
+        1.68179283050742908606,
+        1.75625216037329948311,
+        1.83400808640934246349,
+        1.91520656139714729387,
+    };
+    float t1 = 8.8817841970012523233890533447265625e-16; // 2^(-50)
+    float t2 = 3.63797880709171295166015625e-12; // 2^(-38)
+    int t1_inc_cur, t2_inc_cur;
+    int t1_inc_prev = 0;
+    int t2_inc_prev = 8;
+
+    for (i = 0; i < 428; i++) {
+        t1_inc_cur = 4 * (i % 4);
+        t2_inc_cur = (8 + 3*i) % 16;
+        if (t1_inc_cur < t1_inc_prev)
+            t1 *= 2;
+        if (t2_inc_cur < t2_inc_prev)
+            t2 *= 2;
+        // A much more efficient and accurate way of doing:
+        // ff_aac_pow2sf_tab[i] = pow(2, (i - POW_SF2_ZERO) / 4.0);
+        // ff_aac_pow34sf_tab[i] = pow(ff_aac_pow2sf_tab[i], 3.0/4.0);
+        ff_aac_pow2sf_tab[i] = t1 * exp2_lut[t1_inc_cur];
+        ff_aac_pow34sf_tab[i] = t2 * exp2_lut[t2_inc_cur];
+        t1_inc_prev = t1_inc_cur;
+        t2_inc_prev = t2_inc_cur;
+    }
+}
+
+/* @name ltp_coef
+ * Table of the LTP coefficients
+ */
+static const INTFLOAT ltp_coef[8] = {
+    Q30(0.570829), Q30(0.696616), Q30(0.813004), Q30(0.911304),
+    Q30(0.984900), Q30(1.067894), Q30(1.194601), Q30(1.369533),
+};
+
+/* @name tns_tmp2_map
+ * Tables of the tmp2[] arrays of LPC coefficients used for TNS.
+ * The suffix _M_N[] indicate the values of coef_compress and coef_res
+ * respectively.
+ * @{
+ */
+static const INTFLOAT tns_tmp2_map_1_3[4] = {
+    Q31(0.00000000), Q31(-0.43388373),  Q31(0.64278758),  Q31(0.34202015),
+};
+
+static const INTFLOAT tns_tmp2_map_0_3[8] = {
+    Q31(0.00000000), Q31(-0.43388373), Q31(-0.78183150), Q31(-0.97492790),
+    Q31(0.98480773), Q31( 0.86602539), Q31( 0.64278758), Q31( 0.34202015),
+};
+
+static const INTFLOAT tns_tmp2_map_1_4[8] = {
+    Q31(0.00000000), Q31(-0.20791170), Q31(-0.40673664), Q31(-0.58778524),
+    Q31(0.67369562), Q31( 0.52643216), Q31( 0.36124167), Q31( 0.18374951),
+};
+
+static const INTFLOAT tns_tmp2_map_0_4[16] = {
+    Q31( 0.00000000), Q31(-0.20791170), Q31(-0.40673664), Q31(-0.58778524),
+    Q31(-0.74314481), Q31(-0.86602539), Q31(-0.95105654), Q31(-0.99452192),
+    Q31( 0.99573416), Q31( 0.96182561), Q31( 0.89516330), Q31( 0.79801720),
+    Q31( 0.67369562), Q31( 0.52643216), Q31( 0.36124167), Q31( 0.18374951),
+};
+
+static const INTFLOAT * const tns_tmp2_map[4] = {
+    tns_tmp2_map_0_3,
+    tns_tmp2_map_0_4,
+    tns_tmp2_map_1_3,
+    tns_tmp2_map_1_4
+};
+// @}
+
 /* @name window coefficients
  * @{
  */
 DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_long_1024)[1024];
 DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_long_960)[960];
+DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_short_120)[120];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_long_1024_fixed)[1024];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_long_512_fixed)[512];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_short_128_fixed)[128];
 DECLARE_ALIGNED(32, extern const float, ff_aac_eld_window_512)[1920];
+DECLARE_ALIGNED(32, extern const int,   ff_aac_eld_window_512_fixed)[1920];
 DECLARE_ALIGNED(32, extern const float, ff_aac_eld_window_480)[1800];
+DECLARE_ALIGNED(32, extern const int,   ff_aac_eld_window_480_fixed)[1800];
 // @}
 
 /* @name number of scalefactor window bands for long and short transform windows respectively
  * @{
  */
 extern const uint8_t ff_aac_num_swb_1024[];
+extern const uint8_t ff_aac_num_swb_960 [];
 extern const uint8_t ff_aac_num_swb_512 [];
 extern const uint8_t ff_aac_num_swb_480 [];
 extern const uint8_t ff_aac_num_swb_128 [];
+extern const uint8_t ff_aac_num_swb_120 [];
 // @}
 
 extern const uint8_t ff_aac_pred_sfb_max [];
@@ -73,9 +172,11 @@ extern const float *ff_aac_codebook_vector_vals[];
 extern const uint16_t *ff_aac_codebook_vector_idx[];
 
 extern const uint16_t * const ff_swb_offset_1024[13];
+extern const uint16_t * const ff_swb_offset_960 [13];
 extern const uint16_t * const ff_swb_offset_512 [13];
 extern const uint16_t * const ff_swb_offset_480 [13];
 extern const uint16_t * const ff_swb_offset_128 [13];
+extern const uint16_t * const ff_swb_offset_120 [13];
 
 extern const uint8_t ff_tns_max_bands_1024[13];
 extern const uint8_t ff_tns_max_bands_512 [13];
diff --git a/libavcodec/aandcttab.c b/libavcodec/aandcttab.c
index 0c5b573..97013d2 100644
--- a/libavcodec/aandcttab.c
+++ b/libavcodec/aandcttab.c
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * AAN (Arai Agui Aakajima) (I)DCT tables
+ * AAN (Arai, Agui and Nakajima) (I)DCT tables
  */
 
 #include <stdint.h>
diff --git a/libavcodec/aandcttab.h b/libavcodec/aandcttab.h
index daccb7b..b0a2f44 100644
--- a/libavcodec/aandcttab.h
+++ b/libavcodec/aandcttab.h
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * AAN (Arai Agui Nakajima) (I)DCT tables
+ * AAN (Arai, Agui and Nakajima) (I)DCT tables
  */
 
 #ifndef AVCODEC_AANDCTTAB_H
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 7228eae..8bc8bc5 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -6,19 +6,21 @@ OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
-OBJS-$(CONFIG_IMDCT15)                  += aarch64/imdct15_init.o
-OBJS-$(CONFIG_MDCT)                     += aarch64/mdct_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
 OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o
 
 # decoders/encoders
-OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
+OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
+                                           aarch64/sbrdsp_init_aarch64.o
+OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
-OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
+OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
 OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_aarch64.o
+OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_10bpp_aarch64.o \
+                                           aarch64/vp9dsp_init_12bpp_aarch64.o \
+                                           aarch64/vp9dsp_init_aarch64.o
 
 # ARMv8 optimizations
 
@@ -28,6 +30,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
 # NEON optimizations
 
 # subsystems
+NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/sbrdsp_neon.o
 NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)          += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
@@ -37,15 +40,19 @@ NEON-OBJS-$(CONFIG_H264PRED)            += aarch64/h264pred_neon.o
 NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
                                            aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_IMDCT15)             += aarch64/imdct15_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_init_aarch64.o      \
+                                           aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 
 # decoders/encoders
-NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
-                                           aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
-NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_neon.o             \
+NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
+                                           aarch64/vp9itxfm_neon.o             \
+                                           aarch64/vp9lpf_16bpp_neon.o         \
                                            aarch64/vp9lpf_neon.o               \
+                                           aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
diff --git a/libavcodec/aarch64/aacpsdsp_init_aarch64.c b/libavcodec/aarch64/aacpsdsp_init_aarch64.c
new file mode 100644
index 0000000..5e7e19b
--- /dev/null
+++ b/libavcodec/aarch64/aacpsdsp_init_aarch64.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
+                                float *src1, int n);
+void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                ptrdiff_t stride, int n);
+void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
+                                   float h[2][4], float h_step[2][4],
+                                   int len);
+void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2],
+                                          float h[2][4], float h_step[2][4],
+                                          int len);
+
+av_cold void ff_psdsp_init_aarch64(PSDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        s->add_squares           = ff_ps_add_squares_neon;
+        s->mul_pair_single       = ff_ps_mul_pair_single_neon;
+        s->hybrid_analysis       = ff_ps_hybrid_analysis_neon;
+        s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
+        s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_neon;
+    }
+}
diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S
new file mode 100644
index 0000000..ff4e6e2
--- /dev/null
+++ b/libavcodec/aarch64/aacpsdsp_neon.S
@@ -0,0 +1,148 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_ps_add_squares_neon, export=1
+1:      ld1         {v0.4S,v1.4S}, [x1], #32
+        fmul        v0.4S, v0.4S, v0.4S
+        fmul        v1.4S, v1.4S, v1.4S
+        faddp       v2.4S, v0.4S, v1.4S
+        ld1         {v3.4S}, [x0]
+        fadd        v3.4S, v3.4S, v2.4S
+        st1         {v3.4S}, [x0], #16
+        subs        w2, w2, #4
+        b.gt        1b
+        ret
+endfunc
+
+function ff_ps_mul_pair_single_neon, export=1
+1:      ld1         {v0.4S,v1.4S}, [x1], #32
+        ld1         {v2.4S},       [x2], #16
+        zip1        v3.4S, v2.4S, v2.4S
+        zip2        v4.4S, v2.4S, v2.4S
+        fmul        v0.4S, v0.4S, v3.4S
+        fmul        v1.4S, v1.4S, v4.4S
+        st1         {v0.4S,v1.4S}, [x0], #32
+        subs        w3, w3, #4
+        b.gt        1b
+        ret
+endfunc
+
+function ff_ps_stereo_interpolate_neon, export=1
+        ld1         {v0.4S}, [x2]
+        ld1         {v1.4S}, [x3]
+        zip1        v4.4S, v0.4S, v0.4S
+        zip2        v5.4S, v0.4S, v0.4S
+        zip1        v6.4S, v1.4S, v1.4S
+        zip2        v7.4S, v1.4S, v1.4S
+1:      ld1         {v2.2S}, [x0]
+        ld1         {v3.2S}, [x1]
+        fadd        v4.4S, v4.4S, v6.4S
+        fadd        v5.4S, v5.4S, v7.4S
+        mov         v2.D[1], v2.D[0]
+        mov         v3.D[1], v3.D[0]
+        fmul        v2.4S, v2.4S, v4.4S
+        fmla        v2.4S, v3.4S, v5.4S
+        st1         {v2.D}[0], [x0], #8
+        st1         {v2.D}[1], [x1], #8
+        subs        w4, w4, #1
+        b.gt        1b
+        ret
+endfunc
+
+function ff_ps_stereo_interpolate_ipdopd_neon, export=1
+        ld1         {v0.4S,v1.4S}, [x2]
+        ld1         {v6.4S,v7.4S}, [x3]
+        fneg        v2.4S, v1.4S
+        fneg        v3.4S, v7.4S
+        zip1        v16.4S, v0.4S, v0.4S
+        zip2        v17.4S, v0.4S, v0.4S
+        zip1        v18.4S, v2.4S, v1.4S
+        zip2        v19.4S, v2.4S, v1.4S
+        zip1        v20.4S, v6.4S, v6.4S
+        zip2        v21.4S, v6.4S, v6.4S
+        zip1        v22.4S, v3.4S, v7.4S
+        zip2        v23.4S, v3.4S, v7.4S
+1:      ld1         {v2.2S}, [x0]
+        ld1         {v3.2S}, [x1]
+        fadd        v16.4S, v16.4S, v20.4S
+        fadd        v17.4S, v17.4S, v21.4S
+        mov         v2.D[1], v2.D[0]
+        mov         v3.D[1], v3.D[0]
+        fmul        v4.4S, v2.4S, v16.4S
+        fmla        v4.4S, v3.4S, v17.4S
+        fadd        v18.4S, v18.4S, v22.4S
+        fadd        v19.4S, v19.4S, v23.4S
+        ext         v2.16B, v2.16B, v2.16B, #4
+        ext         v3.16B, v3.16B, v3.16B, #4
+        fmla        v4.4S, v2.4S, v18.4S
+        fmla        v4.4S, v3.4S, v19.4S
+        st1         {v4.D}[0], [x0], #8
+        st1         {v4.D}[1], [x1], #8
+        subs        w4, w4, #1
+        b.gt        1b
+        ret
+endfunc
+
+function ff_ps_hybrid_analysis_neon, export=1
+        lsl         x3, x3, #3
+        ld2         {v0.4S,v1.4S}, [x1], #32
+        ld2         {v2.2S,v3.2S}, [x1], #16
+        ld1         {v24.2S},      [x1], #8
+        ld2         {v4.2S,v5.2S}, [x1], #16
+        ld2         {v6.4S,v7.4S}, [x1]
+        rev64       v6.4S, v6.4S
+        rev64       v7.4S, v7.4S
+        ext         v6.16B, v6.16B, v6.16B, #8
+        ext         v7.16B, v7.16B, v7.16B, #8
+        rev64       v4.2S, v4.2S
+        rev64       v5.2S, v5.2S
+        mov         v2.D[1], v3.D[0]
+        mov         v4.D[1], v5.D[0]
+        mov         v5.D[1], v2.D[0]
+        mov         v3.D[1], v4.D[0]
+        fadd        v16.4S, v0.4S, v6.4S
+        fadd        v17.4S, v1.4S, v7.4S
+        fsub        v18.4S, v1.4S, v7.4S
+        fsub        v19.4S, v0.4S, v6.4S
+        fadd        v22.4S, v2.4S, v4.4S
+        fsub        v23.4S, v5.4S, v3.4S
+        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2         {v2.4S,v3.4S}, [x2], #32
+        ld2         {v4.2S,v5.2S}, [x2], #16
+        ld1         {v6.2S},       [x2], #8
+        add         x2, x2, #8
+        mov         v4.D[1], v5.D[0]
+        mov         v6.S[1], v6.S[0]
+        fmul        v6.2S, v6.2S, v24.2S
+        fmul        v0.4S, v2.4S, v16.4S
+        fmul        v1.4S, v2.4S, v17.4S
+        fmls        v0.4S, v3.4S, v18.4S
+        fmla        v1.4S, v3.4S, v19.4S
+        fmla        v0.4S, v4.4S, v20.4S
+        fmla        v1.4S, v4.4S, v21.4S
+        faddp       v0.4S, v0.4S, v1.4S
+        faddp       v0.4S, v0.4S, v0.4S
+        fadd        v0.2S, v0.2S, v6.2S
+        st1         {v0.2S}, [x0], x3
+        subs        w4, w4, #1
+        b.gt        1b
+        ret
+endfunc
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 60e32dd..e05c5ad 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h
index e12953e..6b9b77e 100644
--- a/libavcodec/aarch64/cabac.h
+++ b/libavcodec/aarch64/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
deleted file mode 100644
index 4cd3328..0000000
--- a/libavcodec/aarch64/dcadsp_neon.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
-        mov             x3,  #32                // decifactor
-        sub             x1,  x1,  #7*4
-        add             x4,  x0,  #2*32*4 - 16  // out2
-        mov             x7,  #-16
-
-        ld1             {v0.4s,v1.4s}, [x1]
-        // reverse [-num_coeffs + 1, 0]
-        ext             v3.16b, v0.16b, v0.16b, #8
-        ext             v2.16b, v1.16b, v1.16b, #8
-        rev64           v3.4s,  v3.4s
-        rev64           v2.4s,  v2.4s
-1:
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        subs            x3,  x3,  #4
-        fmul            v16.4s, v2.4s,  v4.4s
-        fmul            v23.4s, v0.4s,  v4.4s
-        fmul            v17.4s, v2.4s,  v6.4s
-        fmul            v22.4s, v0.4s,  v6.4s
-
-        fmla            v16.4s, v3.4s,  v5.4s
-        fmla            v23.4s, v1.4s,  v5.4s
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        fmla            v17.4s, v3.4s,  v7.4s
-        fmla            v22.4s, v1.4s,  v7.4s
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        fmul            v18.4s, v2.4s,  v4.4s
-        fmul            v21.4s, v0.4s,  v4.4s
-        fmul            v19.4s, v2.4s,  v6.4s
-        fmul            v20.4s, v0.4s,  v6.4s
-
-        fmla            v18.4s, v3.4s,  v5.4s
-        fmla            v21.4s, v1.4s,  v5.4s
-        fmla            v19.4s, v3.4s,  v7.4s
-        fmla            v20.4s, v1.4s,  v7.4s
-
-        faddp           v16.4s, v16.4s, v17.4s
-        faddp           v18.4s, v18.4s, v19.4s
-        faddp           v20.4s, v20.4s, v21.4s
-        faddp           v22.4s, v22.4s, v23.4s
-        faddp           v16.4s, v16.4s, v18.4s
-        faddp           v20.4s, v20.4s, v22.4s
-
-        st1             {v16.4s}, [x0], #16
-        st1             {v20.4s}, [x4], x7
-        b.gt            1b
-
-        ret
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
-        mov             x3,  #64                // decifactor
-        sub             x1,  x1,  #3*4
-        add             x4,  x0,  #2*64*4 - 16  // out2
-        mov             x7,  #-16
-
-        ld1             {v0.4s}, [x1]
-        // reverse [-num_coeffs + 1, 0]
-        ext             v1.16b, v0.16b, v0.16b, #8
-        rev64           v1.4s,  v1.4s
-
-1:
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        subs            x3,  x3,  #4
-        fmul            v16.4s, v1.4s,  v4.4s
-        fmul            v23.4s, v0.4s,  v4.4s
-        fmul            v17.4s, v1.4s,  v5.4s
-        fmul            v22.4s, v0.4s,  v5.4s
-        fmul            v18.4s, v1.4s,  v6.4s
-        fmul            v21.4s, v0.4s,  v6.4s
-        fmul            v19.4s, v1.4s,  v7.4s
-        fmul            v20.4s, v0.4s,  v7.4s
-        faddp           v16.4s, v16.4s, v17.4s
-        faddp           v18.4s, v18.4s, v19.4s
-        faddp           v20.4s, v20.4s, v21.4s
-        faddp           v22.4s, v22.4s, v23.4s
-        faddp           v16.4s, v16.4s, v18.4s
-        faddp           v20.4s, v20.4s, v22.4s
-        st1             {v16.4s}, [x0], #16
-        st1             {v20.4s}, [x4], x7
-        b.gt            1b
-
-        ret
-endfunc
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
index 9cc57d3..db28520 100644
--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -1,23 +1,25 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/aarch64/cpu.h"
@@ -27,6 +29,10 @@
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 av_cold void ff_fft_init_aarch64(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -34,5 +40,11 @@ av_cold void ff_fft_init_aarch64(FFTContext *s)
     if (have_neon(cpu_flags)) {
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
+#if CONFIG_MDCT
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
     }
 }
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index e205e23..862039f 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -8,20 +8,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fmtconvert_init.c b/libavcodec/aarch64/fmtconvert_init.c
index 0a55a1b..210e74b 100644
--- a/libavcodec/aarch64/fmtconvert_init.c
+++ b/libavcodec/aarch64/fmtconvert_init.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized Format Conversion Utils
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fmtconvert_neon.S b/libavcodec/aarch64/fmtconvert_neon.S
index 3b33c87..2161c3a 100644
--- a/libavcodec/aarch64/fmtconvert_neon.S
+++ b/libavcodec/aarch64/fmtconvert_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau  <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index a373291..fa6e0ea 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index edc256c..8be7578 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -442,7 +442,7 @@ endconst
         h264_chroma_mc4 avg, rv40
 #endif
 
-#if CONFIG_VC1_DECODER
+#if CONFIG_VC1DSP
         h264_chroma_mc8 put, vc1
         h264_chroma_mc8 avg, vc1
         h264_chroma_mc4 put, vc1
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 07bda2f..649d2ab 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,39 +25,39 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/h264dsp.h"
 
-void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                            int beta);
-void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                            int beta);
-void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                        int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                        int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta);
-void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta);
-void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride,
+void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                    int alpha, int beta);
 
-void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                    int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                   int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                   int log2_den, int weight, int offset);
 
-void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                      int height, int log2_den, int weightd,
                                      int weights, int offset);
-void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
-void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
 
@@ -91,10 +91,12 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
 
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
-        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+        if (chroma_format_idc <= 1) {
+            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+            c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
+            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
+        }
         c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
-        c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
-        c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
         c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 448e575..80ac09d 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 1c43c1f..7de4420 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 #include "neon.S"
 
 function ff_h264_idct_add_neon, export=1
+.L_ff_h264_idct_add_neon:
         ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
         sxtw            x2,     w2
         movi            v30.8H, #0
@@ -77,6 +78,7 @@ function ff_h264_idct_add_neon, export=1
 endfunc
 
 function ff_h264_idct_dc_add_neon, export=1
+.L_ff_h264_idct_dc_add_neon:
         sxtw            x2,  w2
         mov             w3,       #0
         ld1r            {v2.8H},  [x1]
@@ -106,8 +108,8 @@ function ff_h264_idct_add16_neon, export=1
         mov             w9,  w3         // stride
         movrel          x7,  scan8
         mov             x10, #16
-        movrel          x13, X(ff_h264_idct_dc_add_neon)
-        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
 1:      mov             w2,  w9
         ldrb            w3,  [x7], #1
         ldrsw           x0,  [x5], #4
@@ -133,8 +135,8 @@ function ff_h264_idct_add16intra_neon, export=1
         mov             w9,  w3         // stride
         movrel          x7,  scan8
         mov             x10, #16
-        movrel          x13, X(ff_h264_idct_dc_add_neon)
-        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
 1:      mov             w2,  w9
         ldrb            w3,  [x7], #1
         ldrsw           x0,  [x5], #4
@@ -160,8 +162,8 @@ function ff_h264_idct_add8_neon, export=1
         add             x5,  x1,  #16*4         // block_offset
         add             x9,  x2,  #16*32        // block
         mov             w19, w3                 // stride
-        movrel          x13, X(ff_h264_idct_dc_add_neon)
-        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
         movrel          x7,  scan8, 16
         mov             x10, #0
         mov             x11, #16
@@ -263,6 +265,7 @@ endfunc
 .endm
 
 function ff_h264_idct8_add_neon, export=1
+.L_ff_h264_idct8_add_neon:
         movi            v19.8H,   #0
         sxtw            x2,       w2
         ld1             {v24.8H, v25.8H}, [x1]
@@ -326,6 +329,7 @@ function ff_h264_idct8_add_neon, export=1
 endfunc
 
 function ff_h264_idct8_dc_add_neon, export=1
+.L_ff_h264_idct8_dc_add_neon:
         mov             w3,       #0
         sxtw            x2,       w2
         ld1r            {v31.8H}, [x1]
@@ -375,8 +379,8 @@ function ff_h264_idct8_add4_neon, export=1
         mov             w2,  w3
         movrel          x7,  scan8
         mov             w10, #16
-        movrel          x13, X(ff_h264_idct8_dc_add_neon)
-        movrel          x14, X(ff_h264_idct8_add_neon)
+        movrel          x13, .L_ff_h264_idct8_dc_add_neon
+        movrel          x14, .L_ff_h264_idct8_add_neon
 1:      ldrb            w9,  [x7], #4
         ldrsw           x0,  [x5], #16
         ldrb            w9,  [x4, w9, UXTW]
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 8f912cb..b144376 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index a38a27f..213b40b 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 74088b2..77f41d9 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 731dc06..d27cfac 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_init_aarch64.c b/libavcodec/aarch64/hpeldsp_init_aarch64.c
index 6bc4c09..144ae2b 100644
--- a/libavcodec/aarch64/hpeldsp_init_aarch64.c
+++ b/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index 2978290..a491c17 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/idct.h b/libavcodec/aarch64/idct.h
new file mode 100644
index 0000000..5c49046
--- /dev/null
+++ b/libavcodec/aarch64/idct.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_IDCT_H
+#define AVCODEC_AARCH64_IDCT_H
+
+#include <stdint.h>
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+
+#endif /* AVCODEC_AARCH64_IDCT_H */
diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
new file mode 100644
index 0000000..0406e60
--- /dev/null
+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -0,0 +1,41 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idct.h"
+
+av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if (!avctx->lowres && !high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+            c->idct_put  = ff_simple_idct_put_neon;
+            c->idct_add  = ff_simple_idct_add_neon;
+            c->idct      = ff_simple_idct_neon;
+            c->perm_type = FF_IDCT_PERM_PARTTRANS;
+        }
+    }
+}
diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c
deleted file mode 100644
index 38018f2..0000000
--- a/libavcodec/aarch64/imdct15_init.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-#include "libavutil/internal.h"
-
-#include "libavcodec/imdct15.h"
-
-#include "asm-offsets.h"
-
-AV_CHECK_OFFSET(IMDCT15Context, exptab,         CELT_EXPTAB);
-AV_CHECK_OFFSET(IMDCT15Context, fft_n,          CELT_FFT_N);
-AV_CHECK_OFFSET(IMDCT15Context, len2,           CELT_LEN2);
-AV_CHECK_OFFSET(IMDCT15Context, len4,           CELT_LEN4);
-AV_CHECK_OFFSET(IMDCT15Context, tmp,            CELT_TMP);
-AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE);
-
-void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float *src,
-                             ptrdiff_t stride, float scale);
-
-void ff_imdct15_init_aarch64(IMDCT15Context *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_half = ff_celt_imdct_half_neon;
-    }
-}
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
deleted file mode 100644
index d99edf4..0000000
--- a/libavcodec/aarch64/imdct15_neon.S
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
- * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-#include "asm-offsets.h"
-
-.macro shuffle a, b, c, d
-const shuffle_\a\b\c\d, align=4
-        .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
-        .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
-        .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
-        .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
-endconst
-.endm
-
-shuffle 0, 2, 1, 3
-shuffle 1, 0, 3, 2
-shuffle 2, 3, 0, 1
-shuffle 3, 1, 2, 0
-
-
-function fft5_neon
-        lsl             x2,  x2,  #3
-        ld1             {v24.2s},         [x1],  x2
-        ld2             {v25.s,v26.s}[0], [x1],  x2
-        ld2             {v25.s,v26.s}[1], [x1],  x2
-        ld2             {v25.s,v26.s}[2], [x1],  x2
-        ld2             {v25.s,v26.s}[3], [x1]
-        dup             v6.4s,  v24.s[0]
-        dup             v7.4s,  v24.s[1]
-
-        faddp           v0.4s,  v25.4s, v26.4s
-        // z[][0], z[][3]
-        fmul            v16.4s, v25.4s, v15.s[0] // rr
-        fmul            v17.4s, v25.4s, v15.s[1] // ri
-        fmul            v18.4s, v26.4s, v15.s[0] // ir
-        fmul            v19.4s, v26.4s, v15.s[1] // ii
-        faddp           v0.4s,  v0.4s,  v0.4s
-        // z[][1], z[][2]
-        fmul            v20.4s, v25.4s, v15.s[2] // rr
-        fmul            v21.4s, v25.4s, v15.s[3] // ri
-        fmul            v22.4s, v26.4s, v15.s[2] // ir
-        fmul            v23.4s, v26.4s, v15.s[3] // ii
-        fadd            v0.2s,  v24.2s, v0.2s   // out[0]
-
-        // z[0123][0], z[0123][3]
-        fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
-        fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
-        ld1             {v16.16b},  [x11]
-        ld1             {v19.16b},  [x14]
-        fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
-        fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
-        ld1             {v17.16b},  [x12]
-        // z[0123][1], z[0123][2]
-        fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
-        fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
-        ld1             {v18.16b},  [x13]
-        fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
-        fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;
-
-        //real
-        tbl             v20.16b, {v24.16b}, v16.16b
-        tbl             v21.16b, {v25.16b}, v17.16b
-        tbl             v22.16b, {v26.16b}, v18.16b
-        tbl             v23.16b, {v27.16b}, v19.16b
-        //imag
-        tbl             v16.16b, {v28.16b}, v16.16b
-        tbl             v17.16b, {v29.16b}, v17.16b
-        tbl             v18.16b, {v30.16b}, v18.16b
-        tbl             v19.16b, {v31.16b}, v19.16b
-
-        fadd            v6.4s,  v6.4s,  v20.4s
-        fadd            v22.4s, v22.4s, v23.4s
-        fadd            v7.4s,  v7.4s,  v16.4s
-        fadd            v18.4s, v18.4s, v19.4s
-
-        fadd            v21.4s, v21.4s, v22.4s
-        fadd            v17.4s, v17.4s, v18.4s
-        fadd            v6.4s,  v6.4s,  v21.4s
-        fadd            v7.4s,  v7.4s,  v17.4s
-
-        ret
-endfunc
-
-function fft15_neon
-        mov             x8,  x1
-        mov             x9,  x30
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-
-        add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
-        bl              fft5_neon
-        mov             v1.8b,   v0.8b
-        mov             v2.16b,  v6.16b
-        mov             v3.16b,  v7.16b
-
-        add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-        bl              fft5_neon
-        zip1            v1.4s,   v1.4s,  v0.4s
-        mov             v4.16b,  v6.16b
-        mov             v5.16b,  v7.16b
-
-        mov             x1,  x8                 // in + 0 * stride
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-        bl              fft5_neon
-
-        faddp           v20.4s, v1.4s,  v1.4s
-
-        ext             v18.16b, v8.16b,  v8.16b,  #4
-        ext             v19.16b, v9.16b,  v9.16b,  #4
-        mov             v16.16b, v6.16b
-        mov             v17.16b, v7.16b
-        fadd            v20.2s, v20.2s, v0.2s
-
-        uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
-        uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im
-
-        st1             {v20.2s},  [x0], #8     // out[0]
-
-        fmla            v16.4s, v2.4s,  v8.4s
-        fmls            v16.4s, v3.4s,  v9.4s
-
-        fmla            v17.4s, v2.4s,  v9.4s
-        fmla            v17.4s, v3.4s,  v8.4s
-
-        fmla            v16.4s, v4.4s,  v18.4s
-        fmls            v16.4s, v5.4s,  v19.4s
-
-        fmla            v17.4s, v4.4s,  v19.4s
-        fmla            v17.4s, v5.4s,  v18.4s
-
-        zip1            v18.4s, v16.4s, v17.4s
-        zip2            v19.4s, v16.4s, v17.4s
-
-        rev64           v31.4s, v14.4s
-        trn1            v28.2d, v1.2d,  v1.2d
-        trn2            v29.2d, v1.2d,  v1.2d
-        zip1            v30.2d, v14.2d, v31.2d
-        zip2            v31.2d, v14.2d, v31.2d
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]
-
-        fmul            v16.4s, v28.4s, v30.4s
-        fmul            v17.4s, v29.4s, v30.4s
-        fmls            v16.4s, v29.4s, v31.4s
-        fmla            v17.4s, v28.4s, v31.4s
-        faddp           v16.4s, v16.4s, v16.4s
-        faddp           v17.4s, v17.4s, v17.4s
-        zip1            v18.2s, v16.2s, v17.2s
-        zip2            v19.2s, v16.2s, v17.2s
-
-        fadd            v18.2s, v18.2s, v0.2s
-        fadd            v0.2s,  v19.2s, v0.2s
-
-        ext             v30.16b, v12.16b, v12.16b, #4
-        ext             v31.16b, v13.16b, v13.16b, #4
-        mov             v16.16b, v6.16b
-        mov             v17.16b, v7.16b
-
-        uzp1            v30.4s, v30.4s, v8.4s
-        uzp1            v31.4s, v31.4s, v9.4s
-
-        st1             {v18.2s},  [x0], #8     // out[5]
-
-        fmla            v16.4s, v2.4s,  v10.4s
-        fmls            v16.4s, v3.4s,  v11.4s
-
-        fmla            v17.4s, v2.4s,  v11.4s
-        fmla            v17.4s, v3.4s,  v10.4s
-
-        fmla            v16.4s, v4.4s,  v30.4s
-        fmls            v16.4s, v5.4s,  v31.4s
-
-        fmla            v17.4s, v4.4s,  v31.4s
-        fmla            v17.4s, v5.4s,  v30.4s
-
-        zip1            v18.4s, v16.4s, v17.4s
-        zip2            v19.4s, v16.4s, v17.4s
-
-        ext             v30.16b, v10.16b, v10.16b, #4
-        ext             v31.16b, v11.16b, v11.16b, #4
-
-        fmla            v6.4s,  v2.4s,  v12.4s
-        fmls            v6.4s,  v3.4s,  v13.4s
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]
-
-        uzp1            v30.4s, v30.4s, v12.4s
-        uzp1            v31.4s, v31.4s, v13.4s
-
-        fmla            v7.4s,  v2.4s,  v13.4s
-        fmla            v7.4s,  v3.4s,  v12.4s
-
-        st1             {v0.2s},  [x0], #8     // out[10]
-
-        fmla            v6.4s,  v4.4s,  v30.4s
-        fmls            v6.4s,  v5.4s,  v31.4s
-
-        fmla            v7.4s,  v4.4s,  v31.4s
-        fmla            v7.4s,  v5.4s,  v30.4s
-
-        zip1            v18.4s, v6.4s,  v7.4s
-        zip2            v19.4s, v6.4s,  v7.4s
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]
-
-        ret             x9
-endfunc
-
-// x0: out, x1: out+len2, x2: exptab, x3: len2
-function fft15_pass
-        ands            x6,  x3,  #3
-        mov             x4,  x0
-        mov             x5,  x1
-        b.eq            9f
-        ld1             {v0.2s},  [x0], #8
-        ld1             {v1.2s},  [x1], #8
-        sub             x3,  x3,  x6
-        subs            x6,  x6,  #1
-        fadd            v2.2s,  v0.2s,  v1.2s
-        fsub            v3.2s,  v0.2s,  v1.2s
-        add             x2,  x2,  #8
-        st1             {v2.2s},  [x4], #8
-        st1             {v3.2s},  [x5], #8
-        b.eq            9f
-1:
-        subs            x6,  x6,  #1
-        ldp             s4,  s5,  [x2], #8
-        ldp             s2,  s3,  [x1], #8
-        ldp             s0,  s1,  [x0], #8
-
-        fmul            s6,  s2,  s4
-        fmul            s7,  s2,  s5
-        fmls            s6,  s3,  v5.s[0]
-        fmla            s7,  s3,  v4.s[0]
-
-        fsub            s2,  s0,  s6
-        fsub            s3,  s1,  s7
-        fadd            s0,  s0,  s6
-        fadd            s1,  s1,  s7
-
-        stp             s2,  s3,  [x5], #8
-        stp             s0,  s1,  [x4], #8
-        b.gt            1b
-9:
-        ld1             {v4.4s,v5.4s}, [x2],  #32
-        ld2             {v2.4s,v3.4s}, [x1],  #32
-        uzp1            v6.4s,  v4.4s,  v5.4s
-        uzp2            v7.4s,  v4.4s,  v5.4s
-        ld2             {v0.4s,v1.4s}, [x0],  #32
-8:
-        subs            x3,  x3,  #8
-
-        fmul            v4.4s,  v2.4s,  v6.4s
-        fmul            v5.4s,  v2.4s,  v7.4s
-        b.lt            4f
-
-        ld1             {v18.4s,v19.4s}, [x2],  #32
-
-        fmls            v4.4s,  v3.4s,  v7.4s
-        fmla            v5.4s,  v3.4s,  v6.4s
-
-        ld2             {v22.4s,v23.4s}, [x1],  #32
-
-        fsub            v2.4s,  v0.4s,  v4.4s
-        fadd            v0.4s,  v0.4s,  v4.4s
-        fsub            v3.4s,  v1.4s,  v5.4s
-        fadd            v1.4s,  v1.4s,  v5.4s
-
-        uzp1            v16.4s, v18.4s, v19.4s
-        uzp2            v17.4s, v18.4s, v19.4s
-
-        st2             {v2.4s,v3.4s}, [x5],  #32
-        st2             {v0.4s,v1.4s}, [x4],  #32
-        ld2             {v20.4s,v21.4s}, [x0],  #32
-
-        fmul            v18.4s, v22.4s, v16.4s
-        fmul            v19.4s, v22.4s, v17.4s
-        b.eq            0f
-
-        ld1             {v4.4s,v5.4s}, [x2],  #32
-
-        fmls            v18.4s, v23.4s, v17.4s
-        fmla            v19.4s, v23.4s, v16.4s
-
-        ld2             {v2.4s,v3.4s}, [x1],  #32
-
-        fsub            v22.4s, v20.4s, v18.4s
-        fadd            v20.4s, v20.4s, v18.4s
-        fsub            v23.4s, v21.4s, v19.4s
-        fadd            v21.4s, v21.4s, v19.4s
-
-        uzp1            v6.4s,  v4.4s,  v5.4s
-        uzp2            v7.4s,  v4.4s,  v5.4s
-
-        st2             {v22.4s,v23.4s}, [x5],  #32
-        st2             {v20.4s,v21.4s}, [x4],  #32
-        ld2             {v0.4s,v1.4s}, [x0],  #32
-
-        b               8b
-4:
-        fmls            v4.4s,  v3.4s,  v7.4s
-        fmla            v5.4s,  v3.4s,  v6.4s
-
-        fsub            v2.4s,  v0.4s,  v4.4s
-        fadd            v0.4s,  v0.4s,  v4.4s
-        fsub            v3.4s,  v1.4s,  v5.4s
-        fadd            v1.4s,  v1.4s,  v5.4s
-
-        st2             {v2.4s,v3.4s}, [x5],  #32
-        st2             {v0.4s,v1.4s}, [x4],  #32
-
-        ret
-0:
-        fmls            v18.4s, v23.4s, v17.4s
-        fmla            v19.4s, v23.4s, v16.4s
-
-        fsub            v22.4s, v20.4s, v18.4s
-        fadd            v20.4s, v20.4s, v18.4s
-        fsub            v23.4s, v21.4s, v19.4s
-        fadd            v21.4s, v21.4s, v19.4s
-
-        st2             {v22.4s,v23.4s}, [x5],  #32
-        st2             {v20.4s,v21.4s}, [x4],  #32
-
-        ret
-endfunc
-
-function fft30_neon, align=6
-        sub             sp,  sp,  #0x20
-        stp             x20, x21, [sp]
-        stp             x22, x30, [sp, #0x10]
-        mov             x21, x1
-        mov             x22, x2
-        mov             x20, x4
-        mov             x0,  x21
-        mov             x1,  x22
-        lsl             x3,  x20, #1
-        bl              fft15_neon
-
-        add             x0,  x21, #15*8
-        add             x1,  x22, x20,  lsl #3
-        lsl             x3,  x20, #1
-        bl              fft15_neon
-
-        ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
-        add             x0,  x21, #0
-        add             x1,  x21, #15*8
-        mov             x3,  #15
-        ldp             x20, x21, [sp]
-        ldp             x22, x30, [sp, #0x10]
-        add             sp,  sp,  #0x20
-        b               fft15_pass
-endfunc
-
-.macro  def_fft n, n2
-function fft\n\()_neon, align=6
-        sub             sp,  sp,  #0x30
-        stp             x20, x21, [sp]
-        stp             x22, x30, [sp, #0x10]
-        stp             x23, x24, [sp, #0x20]
-        mov             x21, x1
-        mov             x22, x2
-        mov             x23, x3
-        mov             x20, x4
-        sub             x3,  x3,  #1
-        lsl             x4,  x4,  #1
-        bl              fft\n2\()_neon
-
-        add             x1,  x21, #(\n2 * 8)
-        add             x2,  x22, x20, lsl #3
-        sub             x3,  x23, #1
-        lsl             x4,  x20, #1
-        bl              fft\n2\()_neon
-
-        add             x5,  x10, #CELT_EXPTAB
-        mov             x0,  x21
-        ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
-        add             x1,  x21, #(\n2 * 8)
-        mov             x3,  #\n2
-        ldp             x20, x21, [sp]
-        ldp             x22, x30, [sp, #0x10]
-        ldp             x23, x24, [sp, #0x20]
-        add             sp,  sp,  #0x30
-        b               fft15_pass
-endfunc
-.endm
-
-        def_fft    60,  30
-        def_fft   120,  60
-        def_fft   240, 120
-        def_fft   480, 240
-        def_fft   960, 480
-
-function fft_b15_calc_neon
-        sub             sp,  sp,  #0x50
-        ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
-        movrel          x6,  fact5
-        movrel          x11, shuffle_0213
-        movrel          x12, shuffle_1032
-        movrel          x13, shuffle_2301
-        movrel          x14, shuffle_3120
-        add             x8,  x8,  #8
-        movrel          x5,  fft_tab_neon
-        stp             x20, x30, [sp]
-        stp             d8,  d9,  [sp, #0x10]
-        stp             d10, d11, [sp, #0x20]
-        stp             d12, d13, [sp, #0x30]
-        stp             d14, d15, [sp, #0x40]
-        ld1             {v15.4s}, [x6]
-        ld1             {v0.4s,v1.4s},   [x8],  #32
-        ld1             {v6.2s},  [x8],  #8
-        ld1             {v2.4s,v3.4s},   [x8],  #32
-        ld1             {v7.2s},  [x8],  #8
-        ld1             {v4.4s,v5.4s},   [x8],  #32
-        uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
-        uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
-        uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
-        uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
-        uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
-        uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
-        zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
-        add             x5,  x5,  x3,  lsl #3
-        ldr             x5,  [x5]
-        mov             x10, x0
-        blr             x5
-        ldp             x20, x30, [sp]
-        ldp             d8,  d9,  [sp, #0x10]
-        ldp             d10, d11, [sp, #0x20]
-        ldp             d12, d13, [sp, #0x30]
-        ldp             d14, d15, [sp, #0x40]
-        add             sp,  sp,  #0x50
-        ret
-endfunc
-
-const   fft_tab_neon, relocate=1
-        .quad fft15_neon
-        .quad fft30_neon
-        .quad fft60_neon
-        .quad fft120_neon
-        .quad fft240_neon
-        .quad fft480_neon
-        .quad fft960_neon
-endconst
-
-function ff_celt_imdct_half_neon, export=1
-        sub             sp,  sp,  #0x20
-        stp             x21, x30, [sp]
-        str             s0, [sp, #0x10]
-
-        ldp             w5,  w6,  [x0,  #CELT_LEN2] // CELT_LEN4
-        mov             x10, x0
-        mov             x21, x1
-        sub             w5,  w5,  #1
-        lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
-        sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
-        mul             x5,  x5,  x3
-        ldp             x9,  x10, [x0,  #CELT_TMP]  // CELT_TWIDDLE
-        ldr             w3,  [x0, #CELT_FFT_N]
-        add             x5,  x2,  x5,  lsl #2
-        mov             x11, x9
-
-        sub             w6,  w6,  #4
-        ld1             {v0.s}[0],  [x5], x8
-        ld1             {v1.s}[0],  [x2], x7
-        ld1             {v4.4s,v5.4s}, [x10], #32
-        ld1             {v0.s}[1],  [x5], x8
-        ld1             {v1.s}[1],  [x2], x7
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[2],  [x5], x8
-        ld1             {v1.s}[2],  [x2], x7
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[3],  [x5], x8
-        ld1             {v1.s}[3],  [x2], x7
-1:
-        subs            w6,  w6,  #4
-
-        ld1             {v20.s}[0], [x5], x8
-        ld1             {v21.s}[0], [x2], x7
-        ld1             {v4.4s,v5.4s}, [x10], #32
-
-        fmul            v6.4s,  v0.4s,  v2.4s
-        fmul            v7.4s,  v0.4s,  v3.4s
-
-        ld1             {v20.s}[1], [x5], x8
-        ld1             {v21.s}[1], [x2], x7
-
-        fmls            v6.4s,  v1.4s,  v3.4s
-        fmla            v7.4s,  v1.4s,  v2.4s
-
-        ld1             {v20.s}[2], [x5], x8
-        ld1             {v21.s}[2], [x2], x7
-
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v20.s}[3], [x5], x8
-        ld1             {v21.s}[3], [x2], x7
-
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-
-        fmul            v6.4s,  v20.4s, v2.4s
-        fmul            v7.4s,  v20.4s, v3.4s
-
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        fmls            v6.4s,  v21.4s, v3.4s
-        fmla            v7.4s,  v21.4s, v2.4s
-
-        b.eq            3f
-
-        subs            w6,  w6,  #4
-        ld1             {v4.4s,v5.4s}, [x10], #32
-        ld1             {v0.s}[0],  [x5], x8
-        ld1             {v1.s}[0],  [x2], x7
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[1],  [x5], x8
-        ld1             {v1.s}[1],  [x2], x7
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[2],  [x5], x8
-        ld1             {v1.s}[2],  [x2], x7
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-        ld1             {v0.s}[3],  [x5], x8
-        ld1             {v1.s}[3],  [x2], x7
-
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        b.gt            1b
-
-        fmul            v6.4s,  v0.4s,  v2.4s
-        fmul            v7.4s,  v0.4s,  v3.4s
-        fmls            v6.4s,  v1.4s,  v3.4s
-        fmla            v7.4s,  v1.4s,  v2.4s
-3:
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        mov             x2,  x11
-        mov             x4,  #1
-
-        bl              fft_b15_calc_neon
-
-        ldr             w5,  [x10, #CELT_LEN4]
-        ldr             x6,  [x10, #CELT_TWIDDLE]
-        ldr             s31, [sp, #0x10]
-
-        add             x1,  x21, x5,  lsl #2
-        add             x3,  x6,  x5,  lsl #2
-        sub             x0,  x1,  #16
-        sub             x2,  x3,  #16
-        mov             x8,  #-16
-        mov             x7,  #16
-        mov             x10, x0
-        mov             x11, x1
-
-        sub             w5,  w5,  #4
-
-        ld1             {v0.4s},  [x0], x8
-        ld1             {v1.4s},  [x1], x7
-        ld1             {v2.4s},  [x2], x8
-        ld1             {v3.4s},  [x3], x7
-
-        uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
-        uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im
-
-        uzp1            v5.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].re
-        uzp2            v7.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].im
-
-        fmul            v1.4s,  v6.4s,  v5.4s
-        fmul            v0.4s,  v6.4s,  v7.4s
-2:
-        subs            w5,  w5,  #4
-
-        ld1             {v20.4s}, [x0], x8
-
-        fmla            v1.4s,  v4.4s,  v7.4s
-        fmls            v0.4s,  v4.4s,  v5.4s
-
-        ld1             {v21.4s}, [x1], x7
-
-        ext             v1.16b, v1.16b, v1.16b, #8
-        fmul            v0.4s,  v0.4s,  v31.s[0]
-
-        ld1             {v2.4s},  [x2], x8
-
-        rev64           v1.4s,  v1.4s
-        fmul            v1.4s,  v1.4s,  v31.s[0]
-
-        ld1             {v3.4s},  [x3], x7
-
-        zip1            v5.4s,  v0.4s,  v1.4s
-        zip2            v7.4s,  v0.4s,  v1.4s
-
-        uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
-        uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im
-
-        st1             {v5.4s},  [x10], x8
-        st1             {v7.4s},  [x11], x7
-
-        uzp1            v5.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].re
-        uzp2            v7.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].im
-
-        fmul            v1.4s,  v6.4s,  v5.4s
-        fmul            v0.4s,  v6.4s,  v7.4s
-        b.gt            2b
-
-        fmla            v1.4s,  v4.4s,  v7.4s
-        fmls            v0.4s,  v4.4s,  v5.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        fmul            v0.4s,  v0.4s,  v31.s[0]
-        rev64           v1.4s,  v1.4s
-        fmul            v1.4s,  v1.4s,  v31.s[0]
-        zip1            v5.4s,  v0.4s,  v1.4s
-        zip2            v7.4s,  v0.4s,  v1.4s
-        st1             {v5.4s},  [x10], x8
-        st1             {v7.4s},  [x11], x7
-
-        ldp             x21, x30, [sp]
-        add             sp,  sp,  #0x20
-        ret
-endfunc
-
-// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
-const   fact5,          align=4
-        .float           0.30901699437494745, 0.95105651629515353
-        .float          -0.80901699437494734, 0.58778525229247325
-endconst
diff --git a/libavcodec/aarch64/mdct_init.c b/libavcodec/aarch64/mdct_init.c
deleted file mode 100644
index 816111a..0000000
--- a/libavcodec/aarch64/mdct_init.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_aarch64(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_calc   = ff_imdct_calc_neon;
-        s->imdct_half   = ff_imdct_half_neon;
-        s->mdct_calc    = ff_mdct_calc_neon;
-        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-    }
-}
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index bccd832..1fd199c 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_init.c b/libavcodec/aarch64/mpegaudiodsp_init.c
index 849e310..5d966af 100644
--- a/libavcodec/aarch64/mpegaudiodsp_init.c
+++ b/libavcodec/aarch64/mpegaudiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index 2a36f67..b6ef131 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 377009e..0fddbec 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neontest.c b/libavcodec/aarch64/neontest.c
index 201bfb1..a24c22d 100644
--- a/libavcodec/aarch64/neontest.c
+++ b/libavcodec/aarch64/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index f7fcd5b..142705d 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/sbrdsp_init_aarch64.c b/libavcodec/aarch64/sbrdsp_init_aarch64.c
new file mode 100644
index 0000000..9c96799
--- /dev/null
+++ b/libavcodec/aarch64/sbrdsp_init_aarch64.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/sbrdsp.h"
+
+void ff_sbr_sum64x5_neon(float *z);
+float ff_sbr_sum_square_neon(float (*x)[2], int n);
+void ff_sbr_neg_odd_64_neon(float *x);
+void ff_sbr_qmf_pre_shuffle_neon(float *z);
+void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
+void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
+                           const float *g_filt, int m_max, intptr_t ixh);
+void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
+                        const float alpha0[2], const float alpha1[2],
+                        float bw, int start, int end);
+void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
+void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+
+av_cold void ff_sbrdsp_init_aarch64(SBRDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        s->sum64x5 = ff_sbr_sum64x5_neon;
+        s->sum_square = ff_sbr_sum_square_neon;
+        s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
+        s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
+        s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
+        s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
+        s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
+        s->hf_g_filt = ff_sbr_hf_g_filt_neon;
+        s->hf_gen = ff_sbr_hf_gen_neon;
+        s->autocorrelate = ff_sbr_autocorrelate_neon;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
+    }
+}
diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S
new file mode 100644
index 0000000..d23717e
--- /dev/null
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -0,0 +1,327 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const factors, align=4
+        .float 1.0, -1.0, 1.0, -1.0
+endconst
+
+const phi_noise_0, align=4
+        .float 1.0, 0.0, 1.0, 0.0
+endconst
+
+const phi_noise_1, align=4
+        .float 0.0,  1.0,  0.0, -1.0
+        .float 0.0, -1.0,  0.0,  1.0
+endconst
+
+const phi_noise_2, align=4
+        .float -1.0, 0.0, -1.0, 0.0
+endconst
+
+const phi_noise_3, align=4
+        .float 0.0, -1.0,  0.0,  1.0
+        .float 0.0,  1.0,  0.0, -1.0
+endconst
+
+function ff_sbr_sum64x5_neon, export=1
+        add             x1, x0, #64*4
+        add             x2, x0, #128*4
+        add             x3, x0, #192*4
+        add             x4, x0, #256*4
+        mov             x5, #64
+1:      ld1             {v0.4S}, [x0]
+        ld1             {v1.4S}, [x1], #16
+        fadd            v0.4S, v0.4S, v1.4S
+        ld1             {v2.4S}, [x2], #16
+        fadd            v0.4S, v0.4S, v2.4S
+        ld1             {v3.4S}, [x3], #16
+        fadd            v0.4S, v0.4S, v3.4S
+        ld1             {v4.4S}, [x4], #16
+        fadd            v0.4S, v0.4S, v4.4S
+        st1             {v0.4S}, [x0], #16
+        subs            x5, x5, #4
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_sum_square_neon, export=1
+        movi            v0.4S, #0
+1:      ld1             {v1.4S}, [x0], #16
+        fmla            v0.4S, v1.4S, v1.4S
+        subs            w1, w1, #2
+        b.gt            1b
+        faddp           v0.4S, v0.4S, v0.4S
+        faddp           v0.4S, v0.4S, v0.4S
+        ret
+endfunc
+
+function ff_sbr_neg_odd_64_neon, export=1
+        mov             x1, x0
+        movi            v5.4S, #1<<7, lsl #24
+        ld2             {v0.4S, v1.4S}, [x0], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
+.rept 3
+        st2             {v0.4S, v1.4S}, [x1], #32
+        eor             v3.16B, v3.16B, v5.16B
+        ld2             {v0.4S, v1.4S}, [x0], #32
+        st2             {v2.4S, v3.4S}, [x1], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
+.endr
+        eor             v3.16B, v3.16B, v5.16B
+        st2             {v0.4S, v1.4S}, [x1], #32
+        st2             {v2.4S, v3.4S}, [x1], #32
+        ret
+endfunc
+
+function ff_sbr_qmf_pre_shuffle_neon, export=1
+        add             x1, x0, #60*4
+        add             x2, x0, #64*4
+        mov             x3, #-16
+        mov             x4, #-4
+        movi            v6.4S, #1<<7, lsl #24
+        ld1             {v0.2S}, [x0], #8
+        st1             {v0.2S}, [x2], #8
+.rept 7
+        ld1             {v1.4S}, [x1], x3
+        ld1             {v2.4S}, [x0], #16
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S
+        ext             v1.16B, v1.16B, v1.16B, #8
+        st2             {v1.4S, v2.4S}, [x2], #32
+.endr
+        add             x1, x1, #8
+        ld1             {v1.2S}, [x1], x4
+        ld1             {v2.2S}, [x0], #8
+        ld1             {v1.S}[3], [x1]
+        ld1             {v2.S}[2], [x0]
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S
+        st2             {v1.2S, v2.2S}, [x2], #16
+        st2             {v1.S, v2.S}[2], [x2]
+        ret
+endfunc
+
+function ff_sbr_qmf_post_shuffle_neon, export=1
+        add             x2, x1, #60*4
+        mov             x3, #-16
+        mov             x4, #32
+        movi            v6.4S, #1<<7, lsl #24
+1:      ld1             {v0.4S}, [x2], x3
+        ld1             {v1.4S}, [x1], #16
+        eor             v0.16B, v0.16B, v6.16B
+        rev64           v0.4S, v0.4S
+        ext             v0.16B, v0.16B, v0.16B, #8
+        st2             {v0.4S, v1.4S}, [x0], #32
+        subs            x4, x4, #4
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_qmf_deint_neg_neon, export=1
+        add             x1, x1, #56*4
+        add             x2, x0, #60*4
+        mov             x3, #-32
+        mov             x4, #32
+        movi            v2.4S, #1<<7, lsl #24
+1:      ld2             {v0.4S, v1.4S}, [x1], x3
+        eor             v0.16B, v0.16B, v2.16B
+        rev64           v1.4S, v1.4S
+        ext             v1.16B, v1.16B, v1.16B, #8
+        st1             {v0.4S}, [x2]
+        st1             {v1.4S}, [x0], #16
+        sub             x2, x2, #16
+        subs            x4, x4, #4
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_qmf_deint_bfly_neon, export=1
+        add             x2, x2, #60*4
+        add             x3, x0, #124*4
+        mov             x4, #64
+        mov             x5, #-16
+1:      ld1             {v0.4S}, [x1], #16
+        ld1             {v1.4S}, [x2], x5
+        rev64           v2.4S, v0.4S
+        ext             v2.16B, v2.16B, v2.16B, #8
+        rev64           v3.4S, v1.4S
+        ext             v3.16B, v3.16B, v3.16B, #8
+        fadd            v1.4S, v1.4S, v2.4S
+        fsub            v0.4S, v0.4S, v3.4S
+        st1             {v0.4S}, [x0], #16
+        st1             {v1.4S}, [x3], x5
+        subs            x4, x4, #4
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_hf_gen_neon, export=1
+        sxtw            x4, w4
+        sxtw            x5, w5
+        movrel          x6, factors
+        ld1             {v7.4S}, [x6]
+        dup             v1.4S, v0.S[0]
+        mov             v2.8B, v1.8B
+        mov             v2.S[2], v7.S[0]
+        mov             v2.S[3], v7.S[0]
+        fmul            v1.4S, v1.4S, v2.4S
+        ld1             {v0.D}[0], [x3]
+        ld1             {v0.D}[1], [x2]
+        fmul            v0.4S, v0.4S, v1.4S
+        fmul            v1.4S, v0.4S, v7.4S
+        rev64           v0.4S, v0.4S
+        sub             x7, x5, x4
+        add             x0, x0, x4, lsl #3
+        add             x1, x1, x4, lsl #3
+        sub             x1, x1, #16
+1:      ld1             {v2.4S}, [x1], #16
+        ld1             {v3.2S}, [x1]
+        fmul            v4.4S, v2.4S, v1.4S
+        fmul            v5.4S, v2.4S, v0.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        mov             v4.S[1], v5.S[0]
+        fadd            v4.2S, v4.2S, v3.2S
+        st1             {v4.2S}, [x0], #8
+        sub             x1, x1, #8
+        subs            x7, x7, #1
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_hf_g_filt_neon, export=1
+        sxtw            x3, w3
+        sxtw            x4, w4
+        mov             x5, #40*2*4
+        add             x1, x1, x4, lsl #3
+1:      ld1             {v0.2S}, [x1], x5
+        ld1             {v1.S}[0], [x2], #4
+        fmul            v2.4S, v0.4S, v1.S[0]
+        st1             {v2.2S}, [x0], #8
+        subs            x3, x3, #1
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_autocorrelate_neon, export=1
+        mov             x2, #38
+        movrel          x3, factors
+        ld1             {v0.4S}, [x3]
+        movi            v1.4S, #0
+        movi            v2.4S, #0
+        movi            v3.4S, #0
+        ld1             {v4.2S}, [x0], #8
+        ld1             {v5.2S}, [x0], #8
+        fmul            v16.2S, v4.2S, v4.2S
+        fmul            v17.2S, v5.2S, v4.S[0]
+        fmul            v18.2S, v5.2S, v4.S[1]
+1:      ld1             {v5.D}[1], [x0], #8
+        fmla            v1.2S, v4.2S, v4.2S
+        fmla            v2.4S, v5.4S, v4.S[0]
+        fmla            v3.4S, v5.4S, v4.S[1]
+        mov             v4.D[0], v5.D[0]
+        mov             v5.D[0], v5.D[1]
+        subs            x2, x2, #1
+        b.gt            1b
+        fmul            v19.2S, v4.2S, v4.2S
+        fmul            v20.2S, v5.2S, v4.S[0]
+        fmul            v21.2S, v5.2S, v4.S[1]
+        fadd            v22.4S, v2.4S, v20.4S
+        fsub            v22.4S, v22.4S, v17.4S
+        fadd            v23.4S, v3.4S, v21.4S
+        fsub            v23.4S, v23.4S, v18.4S
+        rev64           v23.4S, v23.4S
+        fmul            v23.4S, v23.4S, v0.4S
+        fadd            v22.4S, v22.4S, v23.4S
+        st1             {v22.4S}, [x1], #16
+        fadd            v23.2S, v1.2S, v19.2S
+        fsub            v23.2S, v23.2S, v16.2S
+        faddp           v23.2S, v23.2S, v23.2S
+        st1             {v23.S}[0], [x1]
+        add             x1, x1, #8
+        rev64           v3.2S, v3.2S
+        fmul            v3.2S, v3.2S, v0.2S
+        fadd            v2.2S, v2.2S, v3.2S
+        st1             {v2.2S}, [x1]
+        add             x1, x1, #16
+        faddp           v1.2S, v1.2S, v1.2S
+        st1             {v1.S}[0], [x1]
+        ret
+endfunc
+
+.macro apply_noise_common
+        sxtw            x3, w3
+        sxtw            x5, w5
+        movrel          x7, X(ff_sbr_noise_table)
+        add             x3, x3, #1
+1:      and             x3, x3, #0x1ff
+        add             x8, x7, x3, lsl #3
+        add             x3, x3, #2
+        ld1             {v2.4S}, [x0]
+        ld1             {v3.2S}, [x1], #8
+        ld1             {v4.2S}, [x2], #8
+        ld1             {v5.4S}, [x8]
+        mov             v6.16B, v2.16B
+        zip1            v3.4S, v3.4S, v3.4S
+        zip1            v4.4S, v4.4S, v4.4S
+        fmla            v6.4S, v1.4S, v3.4S
+        fmla            v2.4S, v5.4S, v4.4S
+        fcmeq           v7.4S, v3.4S, #0
+        bif             v2.16B, v6.16B, v7.16B
+        st1             {v2.4S}, [x0], #16
+        subs            x5, x5, #2
+        b.gt            1b
+.endm
+
+function ff_sbr_hf_apply_noise_0_neon, export=1
+        movrel          x9, phi_noise_0
+        ld1             {v1.4S}, [x9]
+        apply_noise_common
+        ret
+endfunc
+
+function ff_sbr_hf_apply_noise_1_neon, export=1
+        movrel          x9, phi_noise_1
+        and             x4, x4, #1
+        add             x9, x9, x4, lsl #4
+        ld1             {v1.4S}, [x9]
+        apply_noise_common
+        ret
+endfunc
+
+function ff_sbr_hf_apply_noise_2_neon, export=1
+        movrel          x9, phi_noise_2
+        ld1             {v1.4S}, [x9]
+        apply_noise_common
+        ret
+endfunc
+
+function ff_sbr_hf_apply_noise_3_neon, export=1
+        movrel          x9, phi_noise_3
+        and             x4, x4, #1
+        add             x9, x9, x4, lsl #4
+        ld1             {v1.4S}, [x9]
+        apply_noise_common
+        ret
+endfunc
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
new file mode 100644
index 0000000..5e4d021
--- /dev/null
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define Z1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4c ((1<<(COL_SHIFT-1))/Z4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define z1 v0.H[0]
+#define z2 v0.H[1]
+#define z3 v0.H[2]
+#define z4 v0.H[3]
+#define z5 v0.H[4]
+#define z6 v0.H[5]
+#define z7 v0.H[6]
+#define z4c v0.H[7]
+
+const   idct_coeff_neon, align=4
+        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+endconst
+
+.macro idct_start data
+        prfm            pldl1keep, [\data]
+        mov             x10, x30
+        movrel          x3, idct_coeff_neon
+        ld1             {v0.2D}, [x3]
+.endm
+
+.macro idct_end
+        br              x10
+.endm
+
+.macro smull1 a, b, c
+        smull           \a, \b, \c
+.endm
+
+.macro smlal1 a, b, c
+        smlal           \a, \b, \c
+.endm
+
+.macro smlsl1 a, b, c
+        smlsl           \a, \b, \c
+.endm
+
+.macro idct_col4_top y1, y2, y3, y4, i, l
+        smull\i         v7.4S,  \y3\l, z2
+        smull\i         v16.4S, \y3\l, z6
+        smull\i         v17.4S, \y2\l, z1
+        add             v19.4S, v23.4S, v7.4S
+        smull\i         v18.4S, \y2\l, z3
+        add             v20.4S, v23.4S, v16.4S
+        smull\i         v5.4S,  \y2\l, z5
+        sub             v21.4S, v23.4S, v16.4S
+        smull\i         v6.4S,  \y2\l, z7
+        sub             v22.4S, v23.4S, v7.4S
+
+        smlal\i         v17.4S, \y4\l, z3
+        smlsl\i         v18.4S, \y4\l, z7
+        smlsl\i         v5.4S,  \y4\l, z1
+        smlsl\i         v6.4S,  \y4\l, z5
+.endm
+
+.macro idct_row4_neon y1, y2, y3, y4, pass
+        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
+        movi            v23.4S, #1<<2, lsl #8
+        orr             v5.16B, \y1\().16B, \y2\().16B
+        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
+        orr             v6.16B, \y3\().16B, \y4\().16B
+        orr             v5.16B, v5.16B, v6.16B
+        mov             x3, v5.D[1]
+        smlal           v23.4S, \y1\().4H, z4
+
+        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H
+
+        cmp             x3, #0
+        b.eq            \pass\()f
+
+        smull2          v7.4S, \y1\().8H, z4
+        smlal2          v17.4S, \y2\().8H, z5
+        smlsl2          v18.4S, \y2\().8H, z1
+        smull2          v16.4S, \y3\().8H, z2
+        smlal2          v5.4S, \y2\().8H, z7
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+        smlal2          v6.4S, \y2\().8H, z3
+        smull2          v7.4S, \y3\().8H, z6
+        smlal2          v17.4S, \y4\().8H, z7
+        smlsl2          v18.4S, \y4\().8H, z5
+        smlal2          v5.4S, \y4\().8H, z3
+        smlsl2          v6.4S, \y4\().8H, z1
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+        sub             v22.4S, v22.4S, v7.4S
+
+\pass:  add             \y3\().4S, v19.4S, v17.4S
+        add             \y4\().4S, v20.4S, v18.4S
+        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
+        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
+        add             v7.4S, v21.4S, v5.4S
+        add             v16.4S, v22.4S, v6.4S
+        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
+        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
+        sub             v22.4S, v22.4S, v6.4S
+        sub             v19.4S, v19.4S, v17.4S
+        sub             v21.4S, v21.4S, v5.4S
+        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
+        sub             v20.4S, v20.4S, v18.4S
+        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
+        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
+        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT
+
+        trn1            v16.8H, \y1\().8H, \y2\().8H
+        trn2            v17.8H, \y1\().8H, \y2\().8H
+        trn1            v18.8H, \y3\().8H, \y4\().8H
+        trn2            v19.8H, \y3\().8H, \y4\().8H
+        trn1            \y1\().4S, v16.4S, v18.4S
+        trn1            \y2\().4S, v17.4S, v19.4S
+        trn2            \y3\().4S, v16.4S, v18.4S
+        trn2            \y4\().4S, v17.4S, v19.4S
+.endm
+
+.macro declare_idct_col4_neon i, l
+function idct_col4_neon\i
+        dup             v23.4H, z4c
+.if \i == 1
+        add             v23.4H, v23.4H, v24.4H
+.else
+        mov             v5.D[0], v24.D[1]
+        add             v23.4H, v23.4H, v5.4H
+.endif
+        smull           v23.4S, v23.4H, z4
+
+        idct_col4_top   v24, v25, v26, v27, \i, \l
+
+        mov             x4, v28.D[\i - 1]
+        mov             x5, v29.D[\i - 1]
+        cmp             x4, #0
+        b.eq            1f
+
+        smull\i         v7.4S,  v28\l,  z4
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+
+1:      mov             x4, v30.D[\i - 1]
+        cmp             x5, #0
+        b.eq            2f
+
+        smlal\i         v17.4S, v29\l, z5
+        smlsl\i         v18.4S, v29\l, z1
+        smlal\i         v5.4S,  v29\l, z7
+        smlal\i         v6.4S,  v29\l, z3
+
+2:      mov             x5, v31.D[\i - 1]
+        cmp             x4, #0
+        b.eq            3f
+
+        smull\i         v7.4S,  v30\l, z6
+        smull\i         v16.4S, v30\l, z2
+        add             v19.4S, v19.4S, v7.4S
+        sub             v22.4S, v22.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+
+3:      cmp             x5, #0
+        b.eq            4f
+
+        smlal\i         v17.4S, v31\l, z7
+        smlsl\i         v18.4S, v31\l, z5
+        smlal\i         v5.4S,  v31\l, z3
+        smlsl\i         v6.4S,  v31\l, z1
+
+4:      addhn           v7.4H, v19.4S, v17.4S
+        addhn2          v7.8H, v20.4S, v18.4S
+        subhn           v18.4H, v20.4S, v18.4S
+        subhn2          v18.8H, v19.4S, v17.4S
+
+        addhn           v16.4H, v21.4S, v5.4S
+        addhn2          v16.8H, v22.4S, v6.4S
+        subhn           v17.4H, v22.4S, v6.4S
+        subhn2          v17.8H, v21.4S, v5.4S
+
+        ret
+endfunc
+.endm
+
+declare_idct_col4_neon 1, .4H
+declare_idct_col4_neon 2, .8H
+
+function ff_simple_idct_put_neon, export=1
+        idct_start      x2
+
+        idct_row4_neon  v24, v25, v26, v27, 1
+        idct_row4_neon  v28, v29, v30, v31, 2
+        bl              idct_col4_neon1
+
+        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2
+
+        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16
+
+        zip1            v16.4S, v1.4S, v2.4S
+        zip2            v17.4S, v1.4S, v2.4S
+
+        st1             {v16.D}[0], [x0], x1
+        st1             {v16.D}[1], [x0], x1
+
+        zip1            v18.4S, v3.4S, v4.4S
+        zip2            v19.4S, v3.4S, v4.4S
+
+        st1             {v17.D}[0], [x0], x1
+        st1             {v17.D}[1], [x0], x1
+        st1             {v18.D}[0], [x0], x1
+        st1             {v18.D}[1], [x0], x1
+        st1             {v19.D}[0], [x0], x1
+        st1             {v19.D}[1], [x0], x1
+
+        idct_end
+endfunc
+
+function ff_simple_idct_add_neon, export=1
+        idct_start      x2
+
+        idct_row4_neon  v24, v25, v26, v27, 1
+        idct_row4_neon  v28, v29, v30, v31, 2
+        bl              idct_col4_neon1
+
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2
+
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+
+        mov             x9,  x0
+        ld1             {v19.D}[0], [x0], x1
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        ld1             {v19.D}[1], [x0], x1
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        ld1             {v20.D}[0], [x0], x1
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        ld1             {v20.D}[1], [x0], x1
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        ld1             {v21.D}[0], [x0], x1
+        uaddw           v23.8H, v23.8H, v19.8B
+        uaddw2          v24.8H, v24.8H, v19.16B
+        ld1             {v21.D}[1], [x0], x1
+        sqxtun          v23.8B, v23.8H
+        sqxtun2         v23.16B, v24.8H
+        ld1             {v22.D}[0], [x0], x1
+        uaddw           v24.8H, v25.8H, v20.8B
+        uaddw2          v25.8H, v26.8H, v20.16B
+        ld1             {v22.D}[1], [x0], x1
+        sqxtun          v24.8B, v24.8H
+        sqxtun2         v24.16B, v25.8H
+        st1             {v23.D}[0], [x9], x1
+        uaddw           v25.8H, v27.8H, v21.8B
+        uaddw2          v26.8H, v28.8H, v21.16B
+        st1             {v23.D}[1], [x9], x1
+        sqxtun          v25.8B, v25.8H
+        sqxtun2         v25.16B, v26.8H
+        st1             {v24.D}[0], [x9], x1
+        uaddw           v26.8H, v29.8H, v22.8B
+        uaddw2          v27.8H, v30.8H, v22.16B
+        st1             {v24.D}[1], [x9], x1
+        sqxtun          v26.8B, v26.8H
+        sqxtun2         v26.16B, v27.8H
+        st1             {v25.D}[0], [x9], x1
+        st1             {v25.D}[1], [x9], x1
+        st1             {v26.D}[0], [x9], x1
+        st1             {v26.D}[1], [x9], x1
+
+        idct_end
+endfunc
+
+function ff_simple_idct_neon, export=1
+        idct_start      x0
+
+        mov             x2,  x0
+        idct_row4_neon  v24, v25, v26, v27, 1
+        idct_row4_neon  v28, v29, v30, v31, 2
+        sub             x2, x2, #128
+        bl              idct_col4_neon1
+
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2
+
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        st1             {v23.2D,v24.2D}, [x2], #32
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        st1             {v25.2D,v26.2D}, [x2], #32
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        st1             {v27.2D,v28.2D}, [x2], #32
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        st1             {v29.2D,v30.2D}, [x2], #32
+
+        idct_end
+endfunc
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c
index d3430d0..767b011 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/synth_filter_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,8 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
 #include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
 
 #include "asm-offsets.h"
 
@@ -32,25 +32,12 @@
 AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
 #endif
 
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
 void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float *synth_buf_ptr, int *synth_buf_offset,
                                 float synth_buf2[32], const float window[512],
                                 float out[32], const float in[32],
                                 float scale);
 
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
-        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
-    }
-}
-
 av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
index b001c73..8fcd71f 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index ab97a97..13dfd74 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S
index 7ce5a7d..24067cc 100644
--- a/libavcodec/aarch64/videodsp.S
+++ b/libavcodec/aarch64/videodsp.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp_init.c b/libavcodec/aarch64/videodsp_init.c
index 59b697d..6f667a6 100644
--- a/libavcodec/aarch64/videodsp_init.c
+++ b/libavcodec/aarch64/videodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_init.c b/libavcodec/aarch64/vorbisdsp_init.c
index 3559b54..c796f95 100644
--- a/libavcodec/aarch64/vorbisdsp_init.c
+++ b/libavcodec/aarch64/vorbisdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_neon.S b/libavcodec/aarch64/vorbisdsp_neon.S
index 11f71f1..e76feeb 100644
--- a/libavcodec/aarch64/vorbisdsp_neon.S
+++ b/libavcodec/aarch64/vorbisdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h
index 616252e..871fed7 100644
--- a/libavcodec/aarch64/vp8dsp.h
+++ b/libavcodec/aarch64/vp8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c
index 723afb4..fc7e831 100644
--- a/libavcodec/aarch64/vp8dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index cac4558..be4f26c 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -6,20 +6,20 @@
  * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
  * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libaom.h b/libavcodec/aarch64/vp9dsp_init.h
index d3d52f7..9df1752 100644
--- a/libavcodec/libaom.h
+++ b/libavcodec/aarch64/vp9dsp_init.h
@@ -1,31 +1,29 @@
 /*
- * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
+ * Copyright (c) 2017 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_LIBAOM_H
-#define AVCODEC_LIBAOM_H
+#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
+#define AVCODEC_AARCH64_VP9DSP_INIT_H
 
-#include <aom/aom_codec.h>
+#include "libavcodec/vp9dsp.h"
 
-#include "libavutil/pixfmt.h"
+void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
 
-enum AVPixelFormat ff_aom_imgfmt_to_pixfmt(aom_img_fmt_t img, int depth);
-aom_img_fmt_t ff_aom_pixfmt_to_imgfmt(enum AVPixelFormat pix);
-
-#endif /* AVCODEC_LIBAOM_H */
+#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
diff --git a/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
new file mode 100644
index 0000000..0fa0d7f
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
diff --git a/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
new file mode 100644
index 0000000..dae2232
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
diff --git a/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
new file mode 100644
index 0000000..8dcfdea
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix)                                          \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp)                                                   \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp)                                      \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src,                 \
+                                                ptrdiff_t src_stride,               \
+                                                int h, int mx, int my)              \
+{                                                                                   \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]);         \
+    /* We only need h + 7 lines, but the horizontal filter assumes an               \
+     * even number of rows, so filter h + 8 lines here. */                          \
+    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz,                          \
+                                             src - 3 * src_stride, src_stride,      \
+                                             h + 8, mx, 0);                         \
+    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride,                    \
+                                                temp + 3 * 2 * sz, 2 * sz,          \
+                                                h, 0, my);                          \
+}
+
+#define decl_filter_funcs(op, dir, sz, bpp)  \
+    decl_mc_func(op, regular, dir, sz, bpp); \
+    decl_mc_func(op, sharp,   dir, sz, bpp); \
+    decl_mc_func(op, smooth,  dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp)           \
+    decl_filter_funcs(put, h,  sz, bpp); \
+    decl_filter_funcs(avg, h,  sz, bpp); \
+    decl_filter_funcs(put, v,  sz, bpp); \
+    decl_filter_funcs(avg, v,  sz, bpp); \
+    decl_filter_funcs(put, hv, sz, bpp); \
+    decl_filter_funcs(avg, hv, sz, bpp)
+
+#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64
+#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
+#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64,  );
+declare_fpel(copy, 32,  );
+declare_fpel(copy, 16,  );
+declare_fpel(copy, 8,   );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8,  _16);
+declare_fpel(avg, 4,  _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp)        \
+    define_8tap_2d_fn(put, regular, sz, bpp) \
+    define_8tap_2d_fn(put, sharp,   sz, bpp) \
+    define_8tap_2d_fn(put, smooth,  sz, bpp) \
+    define_8tap_2d_fn(avg, regular, sz, bpp) \
+    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
+    define_8tap_2d_fn(avg, smooth,  sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8,  BPP)
+define_8tap_2d_funcs(4,  BPP)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#define init_fpel(idx1, idx2, sz, type, suffix)      \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix
+
+#define init_copy(idx, sz, suffix) \
+    init_fpel(idx, 0, sz, copy, suffix)
+
+#define init_avg(idx, sz, suffix) \
+    init_fpel(idx, 1, sz, avg,  suffix)
+
+#define init_copy_avg(idx, sz1, sz2) \
+    init_copy(idx, sz2, _neon);      \
+    init_avg (idx, sz1, _16_neon)
+
+    if (have_armv8(cpu_flags)) {
+        init_copy(0, 128, _aarch64);
+        init_copy(1, 64,  _aarch64);
+        init_copy(2, 32,  _aarch64);
+    }
+
+    if (have_neon(cpu_flags)) {
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp)                                   \
+    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp)            \
+    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)
+
+
+        init_avg(0, 64, _16_neon);
+        init_avg(1, 32, _16_neon);
+        init_avg(2, 16, _16_neon);
+        init_copy_avg(3, 8, 16);
+        init_copy_avg(4, 4, 8);
+
+        init_mc_funcs_dirs(0, 64, BPP);
+        init_mc_funcs_dirs(1, 32, BPP);
+        init_mc_funcs_dirs(2, 16, BPP);
+        init_mc_funcs_dirs(3, 8,  BPP);
+        init_mc_funcs_dirs(4, 4,  BPP);
+    }
+}
+
+#define define_itxfm2(type_a, type_b, sz, bpp)                                     \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst,    \
+                                                                 ptrdiff_t stride, \
+                                                                 int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
+
+#define define_itxfm_funcs(sz, bpp)      \
+    define_itxfm(idct,  idct,  sz, bpp); \
+    define_itxfm(iadst, idct,  sz, bpp); \
+    define_itxfm(idct,  iadst, sz, bpp); \
+    define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4,  BPP);
+define_itxfm_funcs(8,  BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4,  BPP);
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp)                                               \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp)     \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
+
+        init_itxfm(TX_4X4,   4x4,   BPP);
+        init_itxfm(TX_8X8,   8x8,   BPP);
+        init_itxfm(TX_16X16, 16x16, BPP);
+        init_idct(TX_32X32, idct_idct_32x32, BPP);
+        init_idct(4,        iwht_iwht_4x4,   BPP);
+    }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+    define_loop_filter(h, wd, size, bpp);  \
+    define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4,  8,  BPP);
+define_loop_filters(8,  8,  BPP);
+define_loop_filters(16, 8,  BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+    init_lpf_func_8(idx, 0, h, wd, bpp);  \
+    init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp)   \
+    init_lpf_func_16(0, h, bpp); \
+    init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);  \
+    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp)        \
+    init_lpf_funcs_8_wd(0, 4,  bpp); \
+    init_lpf_funcs_8_wd(1, 8,  bpp); \
+    init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp)           \
+    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+        init_lpf_funcs_8(BPP);
+        init_lpf_funcs_16(BPP);
+        init_lpf_funcs_mix2(BPP);
+    }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+    vp9dsp_mc_init_aarch64(dsp);
+    vp9dsp_loopfilter_init_aarch64(dsp);
+    vp9dsp_itxfm_init_aarch64(dsp);
+}
diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
index 3ce2c1b..4c69975 100644
--- a/libavcodec/aarch64/vp9dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -1,28 +1,30 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "libavutil/aarch64/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
 
 #define declare_fpel(type, sz)                                          \
 void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
@@ -239,8 +241,17 @@ static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
     }
 }
 
-av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
 {
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_aarch64(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_aarch64(dsp);
+        return;
+    } else if (bpp != 8)
+        return;
+
     vp9dsp_mc_init_aarch64(dsp);
     vp9dsp_loopfilter_init_aarch64(dsp);
     vp9dsp_itxfm_init_aarch64(dsp);
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000..68296d9
--- /dev/null
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,2017 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
+        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
+        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
+        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
+        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
+        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
+        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
+        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
+        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
+.endm
+
+// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers.
+.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
+        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+        // while swapping the two 4x4 matrices between each other
+
+        // First step of the 4x4 transpose of r1-r7, into t0-t3
+        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
+        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
+        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
+
+        // First step of the 4x4 transpose of r8-r12, into r1-r7
+        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
+        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
+        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
+        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
+
+        // Second step of the 4x4 transpose of r1-r7 (now in t0-r3), into r8-r12
+        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
+        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
+        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
+        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
+
+        // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible
+        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
+        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
+        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
+
+        // Move the outputs of trn1 back in place
+        mov             \r1\().16b,  \t0\().16b
+        mov             \r3\().16b,  \t1\().16b
+.endm
+
+// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+// in/out are .4s registers; this can do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+        neg             \tmp4\().4s, v0.4s
+.endif
+        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
+        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
+.if \neg > 0
+        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
+.else
+        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
+.endif
+.ifb \tmp5
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out2\().2s, \tmp3\().2d, #14
+        rshrn2          \out2\().4s, \tmp4\().2d, #14
+.else
+        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        rshrn           \out2\().2s, \tmp5\().2d, #14
+        rshrn2          \out2\().4s, \tmp6\().2d, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
+        smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
+        rshrn           \out1\().2s, \tmp1\().2d, #14
+        rshrn2          \out1\().4s, \tmp2\().2d, #14
+        rshrn           \out2\().2s, \tmp1\().2d, #14
+        rshrn2          \out2\().4s, \tmp2\().2d, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .2d registers, in are 2 x .4s registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+        smull           \out1\().2d, \in1\().2s, \coef1
+        smull2          \out2\().2d, \in1\().4s, \coef1
+        smull           \out3\().2d, \in1\().2s, \coef2
+        smull2          \out4\().2d, \in1\().4s, \coef2
+        smlsl           \out1\().2d, \in2\().2s, \coef2
+        smlsl2          \out2\().2d, \in2\().4s, \coef2
+        smlal           \out3\().2d, \in2\().2s, \coef1
+        smlal2          \out4\().2d, \in2\().4s, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .4s registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+        neg             \tmp3\().2d, \tmp3\().2d
+        neg             \tmp4\().2d, \tmp4\().2d
+.endif
+        rshrn           \inout1\().2s, \tmp1\().2d,  #14
+        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
+        rshrn           \inout2\().2s, \tmp3\().2d,  #14
+        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout1\().2s, \coef1
+        smull2          \tmp2\().2d, \inout1\().4s, \coef1
+        smull           \tmp3\().2d, \inout1\().2s, \coef2
+        smull2          \tmp4\().2d, \inout1\().4s, \coef2
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout2\().2s, \coef2
+        smull2          \tmp2\().2d, \inout2\().4s, \coef2
+        smull           \tmp3\().2d, \inout2\().2s, \coef1
+        smull2          \tmp4\().2d, \inout2\().4s, \coef1
+        neg             \tmp1\().2d, \tmp1\().2d
+        neg             \tmp2\().2d, \tmp2\().2d
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().2d, \in\().2s, \coef
+        smull2          \out2\().2d, \in\().4s, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().2s, \in1\().2d, \shift
+        rshrn2          \out\().4s, \in2\().2d, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_4s out1, out2, in1, in2
+        add             \out1\().4s, \in1\().4s, \in2\().4s
+        sub             \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_4s_r out1, out2, in1, in2
+        sub             \out1\().4s, \in1\().4s, \in2\().4s
+        add             \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .4s registers, in are 4 x .2d registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        add             \tmp1\().2d, \in1\().2d, \in3\().2d
+        add             \tmp2\().2d, \in2\().2d, \in4\().2d
+        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
+        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
+        rshrn           \out1\().2s, \tmp1\().2d,  #14
+        rshrn2          \out1\().4s, \tmp2\().2d,  #14
+        rshrn           \out2\().2s, \tmp3\().2d,  #14
+        rshrn2          \out2\().4s, \tmp4\().2d,  #14
+.endm
+
+.macro iwht4_10 c0, c1, c2, c3
+        add             \c0\().4s, \c0\().4s, \c1\().4s
+        sub             v17.4s,    \c2\().4s, \c3\().4s
+        sub             v16.4s,    \c0\().4s, v17.4s
+        sshr            v16.4s,    v16.4s,    #1
+        sub             \c2\().4s, v16.4s,    \c1\().4s
+        sub             \c1\().4s, v16.4s,    \c3\().4s
+        add             \c3\().4s, v17.4s,    \c2\().4s
+        sub             \c0\().4s, \c0\().4s, \c1\().4s
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3
+        iwht4_10        \c0, \c1, \c2, \c3
+.endm
+
+.macro idct4_10 c0, c1, c2, c3
+        mul             v22.4s,    \c1\().4s, v0.s[3]
+        mul             v20.4s,    \c1\().4s, v0.s[2]
+        add             v16.4s,    \c0\().4s, \c2\().4s
+        sub             v17.4s,    \c0\().4s, \c2\().4s
+        mla             v22.4s,    \c3\().4s, v0.s[2]
+        mul             v18.4s,    v16.4s,    v0.s[0]
+        mul             v24.4s,    v17.4s,    v0.s[0]
+        mls             v20.4s,    \c3\().4s, v0.s[3]
+        srshr           v22.4s,    v22.4s,    #14
+        srshr           v18.4s,    v18.4s,    #14
+        srshr           v24.4s,    v24.4s,    #14
+        srshr           v20.4s,    v20.4s,    #14
+        add             \c0\().4s, v18.4s,    v22.4s
+        sub             \c3\().4s, v18.4s,    v22.4s
+        add             \c1\().4s, v24.4s,    v20.4s
+        sub             \c2\().4s, v24.4s,    v20.4s
+.endm
+
+.macro idct4_12 c0, c1, c2, c3
+        smull           v22.2d,    \c1\().2s, v0.s[3]
+        smull2          v23.2d,    \c1\().4s, v0.s[3]
+        smull           v20.2d,    \c1\().2s, v0.s[2]
+        smull2          v21.2d,    \c1\().4s, v0.s[2]
+        add             v16.4s,    \c0\().4s, \c2\().4s
+        sub             v17.4s,    \c0\().4s, \c2\().4s
+        smlal           v22.2d,    \c3\().2s, v0.s[2]
+        smlal2          v23.2d,    \c3\().4s, v0.s[2]
+        smull           v18.2d,    v16.2s,    v0.s[0]
+        smull2          v19.2d,    v16.4s,    v0.s[0]
+        smull           v24.2d,    v17.2s,    v0.s[0]
+        smull2          v25.2d,    v17.4s,    v0.s[0]
+        smlsl           v20.2d,    \c3\().2s, v0.s[3]
+        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
+        rshrn           v22.2s,    v22.2d,    #14
+        rshrn2          v22.4s,    v23.2d,    #14
+        rshrn           v18.2s,    v18.2d,    #14
+        rshrn2          v18.4s,    v19.2d,    #14
+        rshrn           v24.2s,    v24.2d,    #14
+        rshrn2          v24.4s,    v25.2d,    #14
+        rshrn           v20.2s,    v20.2d,    #14
+        rshrn2          v20.4s,    v21.2d,    #14
+        add             \c0\().4s, v18.4s,    v22.4s
+        sub             \c3\().4s, v18.4s,    v22.4s
+        add             \c1\().4s, v24.4s,    v20.4s
+        sub             \c2\().4s, v24.4s,    v20.4s
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3
+        mul             v16.4s,    \c0\().4s, v1.s[0]
+        mla             v16.4s,    \c2\().4s, v1.s[1]
+        mla             v16.4s,    \c3\().4s, v1.s[2]
+        mul             v18.4s,    \c0\().4s, v1.s[2]
+        mls             v18.4s,    \c2\().4s, v1.s[0]
+        sub             \c0\().4s, \c0\().4s, \c2\().4s
+        mls             v18.4s,    \c3\().4s, v1.s[1]
+        add             \c0\().4s, \c0\().4s, \c3\().4s
+        mul             v22.4s,    \c1\().4s, v1.s[3]
+        mul             v20.4s,    \c0\().4s, v1.s[3]
+        add             v24.4s,    v16.4s,    v22.4s
+        add             v26.4s,    v18.4s,    v22.4s
+        srshr           \c0\().4s, v24.4s,    #14
+        add             v16.4s,    v16.4s,    v18.4s
+        srshr           \c1\().4s, v26.4s,    #14
+        sub             v16.4s,    v16.4s,    v22.4s
+        srshr           \c2\().4s, v20.4s,    #14
+        srshr           \c3\().4s, v16.4s,    #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3
+        smull           v16.2d,    \c0\().2s, v1.s[0]
+        smull2          v17.2d,    \c0\().4s, v1.s[0]
+        smlal           v16.2d,    \c2\().2s, v1.s[1]
+        smlal2          v17.2d,    \c2\().4s, v1.s[1]
+        smlal           v16.2d,    \c3\().2s, v1.s[2]
+        smlal2          v17.2d,    \c3\().4s, v1.s[2]
+        smull           v18.2d,    \c0\().2s, v1.s[2]
+        smull2          v19.2d,    \c0\().4s, v1.s[2]
+        smlsl           v18.2d,    \c2\().2s, v1.s[0]
+        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
+        sub             \c0\().4s, \c0\().4s, \c2\().4s
+        smlsl           v18.2d,    \c3\().2s, v1.s[1]
+        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
+        add             \c0\().4s, \c0\().4s, \c3\().4s
+        smull           v22.2d,    \c1\().2s, v1.s[3]
+        smull2          v23.2d,    \c1\().4s, v1.s[3]
+        smull           v20.2d,    \c0\().2s, v1.s[3]
+        smull2          v21.2d,    \c0\().4s, v1.s[3]
+        add             v24.2d,    v16.2d,    v22.2d
+        add             v25.2d,    v17.2d,    v23.2d
+        add             v26.2d,    v18.2d,    v22.2d
+        add             v27.2d,    v19.2d,    v23.2d
+        rshrn           \c0\().2s, v24.2d,    #14
+        rshrn2          \c0\().4s, v25.2d,    #14
+        add             v16.2d,    v16.2d,    v18.2d
+        add             v17.2d,    v17.2d,    v19.2d
+        rshrn           \c1\().2s, v26.2d,    #14
+        rshrn2          \c1\().4s, v27.2d,    #14
+        sub             v16.2d,    v16.2d,    v22.2d
+        sub             v17.2d,    v17.2d,    v23.2d
+        rshrn           \c2\().2s, v20.2d,    #14
+        rshrn2          \c2\().4s, v21.2d,    #14
+        rshrn           \c3\().2s, v16.2d,    #14
+        rshrn2          \c3\().4s, v17.2d,    #14
+.endm
+
+// The public functions in this file have got the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+.endif
+.ifc \txfm1,iadst
+        movrel          x4,  iadst4_coeffs
+        ld1             {v0.d}[1], [x4]
+        sxtl2           v1.4s,  v0.8h
+.endif
+.else
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.8h}, [x4]
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+
+        movi            v30.4s, #0
+        movi            v31.4s, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.ne            1f
+        // DC-only for idct/idct
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s, v0.s[0]
+        rshrn           v2.2s,  v2.2d, #14
+        smull           v2.2d,  v2.2s, v0.s[0]
+        rshrn           v2.2s,  v2.2d, #14
+        st1             {v31.s}[0], [x2]
+        dup             v4.4s,  v2.s[0]
+        mov             v5.16b, v4.16b
+        mov             v6.16b, v4.16b
+        mov             v7.16b, v4.16b
+        b               2f
+.endif
+
+1:
+        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
+        st1             {v30.4s,v31.4s}, [x2], #32
+
+.ifc \txfm1,iwht
+        sshr            v4.4s,  v4.4s,  #2
+        sshr            v5.4s,  v5.4s,  #2
+        sshr            v6.4s,  v6.4s,  #2
+        sshr            v7.4s,  v7.4s,  #2
+.endif
+
+        \txfm1\()4_\bpp v4,  v5,  v6,  v7
+
+        st1             {v30.4s,v31.4s}, [x2], #32
+        // Transpose 4x4 with 32 bit elements
+        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
+
+        \txfm2\()4_\bpp v4,  v5,  v6,  v7
+2:
+        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+        ld1             {v0.4h},   [x0], x1
+        ld1             {v1.4h},   [x0], x1
+.ifnc \txfm1,iwht
+        srshr           v4.4s,  v4.4s,  #4
+        srshr           v5.4s,  v5.4s,  #4
+        srshr           v6.4s,  v6.4s,  #4
+        srshr           v7.4s,  v7.4s,  #4
+.endif
+        uaddw           v4.4s,  v4.4s,  v0.4h
+        uaddw           v5.4s,  v5.4s,  v1.4h
+        ld1             {v2.4h},   [x0], x1
+        ld1             {v3.4h},   [x0], x1
+        sqxtun          v0.4h,  v4.4s
+        sqxtun2         v0.8h,  v5.4s
+        sub             x0,  x0,  x1, lsl #2
+
+        uaddw           v6.4s,  v6.4s,  v2.4h
+        umin            v0.8h,  v0.8h,  v31.8h
+        uaddw           v7.4s,  v7.4s,  v3.4h
+        st1             {v0.4h},   [x0], x1
+        sqxtun          v2.4h,  v6.4s
+        sqxtun2         v2.8h,  v7.4s
+        umin            v2.8h,  v2.8h,  v31.8h
+
+        st1             {v0.d}[1], [x0], x1
+        st1             {v2.4h},   [x0], x1
+        st1             {v2.d}[1], [x0], x1
+
+        ret
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct,  idct,  \bpp
+itxfm_func4x4 iadst, idct,  \bpp
+itxfm_func4x4 idct,  iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht,  iwht,  \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+function idct8x8_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+
+        movi            v1.4h,  #0
+        sxtl            v0.4s,  v0.4h
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v2.4s,  v2.4s,  #5
+
+        mov             x4,  #8
+        mov             x3,  x0
+        dup             v31.8h, w5
+1:
+        // Loop to add the constant from v2 into all 8x8 outputs
+        subs            x4,  x4,  #2
+        ld1             {v3.8h},  [x0], x1
+        ld1             {v4.8h},  [x0], x1
+        uaddw           v16.4s, v2.4s,  v3.4h
+        uaddw2          v17.4s, v2.4s,  v3.8h
+        uaddw           v18.4s, v2.4s,  v4.4h
+        uaddw2          v19.4s, v2.4s,  v4.8h
+        sqxtun          v3.4h,  v16.4s
+        sqxtun2         v3.8h,  v17.4s
+        sqxtun          v4.4h,  v18.4s
+        sqxtun2         v4.8h,  v19.4s
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h},  [x3], x1
+        st1             {v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
+        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
+        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
+        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
+
+        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
+        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
+        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
+        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
+
+        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
+
+        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
+        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
+        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
+        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
+.endm
+
+.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
+        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
+
+        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
+        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
+
+        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
+        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
+
+        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
+        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
+
+        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
+        neg             \r7\().4s, \r7\().4s // r7 = out[7]
+        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
+
+        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
+        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
+
+        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
+
+        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
+        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
+
+        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
+        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
+
+        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
+        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct8x8_dc_add_neon
+.endif
+        // The iadst also uses a few coefficients from
+        // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+        movrel          x4,  idct_coeffs
+.else
+        movrel          x4,  iadst8_coeffs
+        ld1             {v1.8h}, [x4], #16
+        stp             d8,  d9,  [sp, #-0x10]!
+        sxtl2           v3.4s,  v1.8h
+        sxtl            v2.4s,  v1.4h
+.endif
+        ld1             {v0.8h}, [x4]
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+
+        movi            v4.4s, #0
+        movi            v5.4s, #0
+        movi            v6.4s, #0
+        movi            v7.4s, #0
+
+1:
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
+        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
+        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
+        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
+        sub             x2,  x2,  #256
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
+        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
+.else
+        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
+        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
+.endif
+
+        // Transpose 8x8 with 16 bit elements
+        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
+        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
+.else
+        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
+        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
+.endif
+2:
+        mov             x3,  x0
+        // Add into the destination
+        ld1             {v0.8h},  [x0], x1
+        srshr           v16.4s, v16.4s, #5
+        srshr           v17.4s, v17.4s, #5
+        ld1             {v1.8h},  [x0], x1
+        srshr           v18.4s, v18.4s, #5
+        srshr           v19.4s, v19.4s, #5
+        ld1             {v2.8h},  [x0], x1
+        srshr           v20.4s, v20.4s, #5
+        srshr           v21.4s, v21.4s, #5
+        uaddw           v16.4s, v16.4s, v0.4h
+        uaddw2          v17.4s, v17.4s, v0.8h
+        ld1             {v3.8h},  [x0], x1
+        srshr           v22.4s, v22.4s, #5
+        srshr           v23.4s, v23.4s, #5
+        uaddw           v18.4s, v18.4s, v1.4h
+        uaddw2          v19.4s, v19.4s, v1.8h
+        ld1             {v4.8h},  [x0], x1
+        srshr           v24.4s, v24.4s, #5
+        srshr           v25.4s, v25.4s, #5
+        uaddw           v20.4s, v20.4s, v2.4h
+        uaddw2          v21.4s, v21.4s, v2.8h
+        sqxtun          v0.4h,  v16.4s
+        sqxtun2         v0.8h,  v17.4s
+        dup             v16.8h, w5
+        ld1             {v5.8h},  [x0], x1
+        srshr           v26.4s, v26.4s, #5
+        srshr           v27.4s, v27.4s, #5
+        uaddw           v22.4s, v22.4s, v3.4h
+        uaddw2          v23.4s, v23.4s, v3.8h
+        sqxtun          v1.4h,  v18.4s
+        sqxtun2         v1.8h,  v19.4s
+        umin            v0.8h,  v0.8h,  v16.8h
+        ld1             {v6.8h},  [x0], x1
+        srshr           v28.4s, v28.4s, #5
+        srshr           v29.4s, v29.4s, #5
+        uaddw           v24.4s, v24.4s, v4.4h
+        uaddw2          v25.4s, v25.4s, v4.8h
+        sqxtun          v2.4h,  v20.4s
+        sqxtun2         v2.8h,  v21.4s
+        umin            v1.8h,  v1.8h,  v16.8h
+        ld1             {v7.8h},  [x0], x1
+        srshr           v30.4s, v30.4s, #5
+        srshr           v31.4s, v31.4s, #5
+        uaddw           v26.4s, v26.4s, v5.4h
+        uaddw2          v27.4s, v27.4s, v5.8h
+        sqxtun          v3.4h,  v22.4s
+        sqxtun2         v3.8h,  v23.4s
+        umin            v2.8h,  v2.8h,  v16.8h
+
+        st1             {v0.8h},  [x3], x1
+        uaddw           v28.4s, v28.4s, v6.4h
+        uaddw2          v29.4s, v29.4s, v6.8h
+        st1             {v1.8h},  [x3], x1
+        sqxtun          v4.4h,  v24.4s
+        sqxtun2         v4.8h,  v25.4s
+        umin            v3.8h,  v3.8h,  v16.8h
+        st1             {v2.8h},  [x3], x1
+        uaddw           v30.4s, v30.4s, v7.4h
+        uaddw2          v31.4s, v31.4s, v7.8h
+        st1             {v3.8h},  [x3], x1
+        sqxtun          v5.4h,  v26.4s
+        sqxtun2         v5.8h,  v27.4s
+        umin            v4.8h,  v4.8h,  v16.8h
+        st1             {v4.8h},  [x3], x1
+        sqxtun          v6.4h,  v28.4s
+        sqxtun2         v6.8h,  v29.4s
+        umin            v5.8h,  v5.8h,  v16.8h
+        st1             {v5.8h},  [x3], x1
+        sqxtun          v7.4h,  v30.4s
+        sqxtun2         v7.8h,  v31.4s
+        umin            v6.8h,  v6.8h,  v16.8h
+
+        st1             {v6.8h},  [x3], x1
+        umin            v7.8h,  v7.8h,  v16.8h
+        st1             {v7.8h},  [x3], x1
+
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d8,  d9,  [sp], 0x10
+.endif
+        ret
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+        mov             x5,  #0x03ff
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+        mov             x5,  #0x0fff
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v0.4s,  v2.4s,  #6
+
+        mov             x3, x0
+        mov             x4, #16
+        dup             v31.8h, w13
+1:
+        // Loop to add the constant from v2 into all 16x16 outputs
+        subs            x4,  x4,  #2
+        ld1             {v1.8h,v2.8h},  [x0], x1
+        uaddw           v16.4s, v0.4s,  v1.4h
+        uaddw2          v17.4s, v0.4s,  v1.8h
+        ld1             {v3.8h,v4.8h},  [x0], x1
+        uaddw           v18.4s, v0.4s,  v2.4h
+        uaddw2          v19.4s, v0.4s,  v2.8h
+        uaddw           v20.4s, v0.4s,  v3.4h
+        uaddw2          v21.4s, v0.4s,  v3.8h
+        uaddw           v22.4s, v0.4s,  v4.4h
+        uaddw2          v23.4s, v0.4s,  v4.8h
+        sqxtun          v1.4h,  v16.4s
+        sqxtun2         v1.8h,  v17.4s
+        sqxtun          v2.4h,  v18.4s
+        sqxtun2         v2.8h,  v19.4s
+        sqxtun          v3.4h,  v20.4s
+        sqxtun2         v3.8h,  v21.4s
+        sqxtun          v4.4h,  v22.4s
+        sqxtun2         v4.8h,  v23.4s
+        umin            v1.8h,  v1.8h,  v31.8h
+        umin            v2.8h,  v2.8h,  v31.8h
+        st1             {v1.8h,v2.8h},  [x3], x1
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h,v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct16_end
+        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
+        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
+        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
+        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
+        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
+        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
+        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
+        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
+
+        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
+        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
+
+        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
+        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
+        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
+        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
+        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
+        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
+        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
+        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
+        ret
+.endm
+
+function idct16
+        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
+        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
+        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
+        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
+        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
+        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
+        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_half
+        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
+        dsmull_h        v24, v25, v19, v3.s[3]
+        dsmull_h        v4,  v5,  v17, v2.s[0]
+        dsmull_h        v7,  v6,  v18, v1.s[1]
+        dsmull_h        v30, v31, v18, v1.s[0]
+        neg             v24.2d,  v24.2d
+        neg             v25.2d,  v25.2d
+        dsmull_h        v29, v28, v17, v2.s[1]
+        dsmull_h        v26, v27, v19, v3.s[2]
+        dsmull_h        v22, v23, v16, v0.s[0]
+        drshrn_h        v24, v24, v25, #14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
+        neg             v22.2d,  v22.2d
+        neg             v23.2d,  v23.2d
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b
+        mov             v5.16b,  v28.16b
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end
+endfunc
+
+function iadst16
+        ld1             {v0.8h,v1.8h}, [x11]
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
+        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
+        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
+        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
+        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
+        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
+        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
+        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
+
+        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
+        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
+        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
+        ld1             {v0.8h}, [x10]
+        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
+        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
+
+        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
+        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
+        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
+        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
+        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
+
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
+        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
+        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
+        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
+
+        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
+        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
+
+        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
+        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
+        neg             v29.4s, v29.4s                   // v29 = out[13]
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
+
+        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
+        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
+
+        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
+        neg             v19.4s, v19.4s                   // v19 = out[3]
+        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
+
+        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
+        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
+
+        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
+        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
+
+        neg             v31.4s,  v5.4s                    // v31 = out[15]
+        neg             v17.4s,  v3.4s                    // v17 = out[1]
+
+        mov             v16.16b, v2.16b
+        mov             v30.16b, v4.16b
+        ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
+.macro load i, src, inc
+        ld1             {v\i\().4s},  [\src], \inc
+.endm
+.macro store i, dst, inc
+        st1             {v\i\().4s},  [\dst], \inc
+.endm
+.macro movi_v i, size, imm
+        movi            v\i\()\size,  \imm
+.endm
+.macro load_clear i, src, inc
+        ld1             {v\i\().4s}, [\src]
+        st1             {v4.4s},  [\src], \inc
+.endm
+
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
+        srshr           \coef0, \coef0, #6
+        ld1             {v4.4h},   [x0], x1
+        srshr           \coef1, \coef1, #6
+        ld1             {v4.d}[1], [x3], x1
+        srshr           \coef2, \coef2, #6
+        ld1             {v5.4h},   [x0], x1
+        srshr           \coef3, \coef3, #6
+        uaddw           \coef0, \coef0, v4.4h
+        ld1             {v5.d}[1], [x3], x1
+        srshr           \coef4, \coef4, #6
+        uaddw2          \coef1, \coef1, v4.8h
+        ld1             {v6.4h},   [x0], x1
+        srshr           \coef5, \coef5, #6
+        uaddw           \coef2, \coef2, v5.4h
+        ld1             {v6.d}[1], [x3], x1
+        sqxtun          v4.4h,  \coef0
+        srshr           \coef6, \coef6, #6
+        uaddw2          \coef3, \coef3, v5.8h
+        ld1             {v7.4h},   [x0], x1
+        sqxtun2         v4.8h,  \coef1
+        srshr           \coef7, \coef7, #6
+        uaddw           \coef4, \coef4, v6.4h
+        ld1             {v7.d}[1], [x3], x1
+        umin            v4.8h,  v4.8h,  v8.8h
+        sub             x0,  x0,  x1, lsl #2
+        sub             x3,  x3,  x1, lsl #2
+        sqxtun          v5.4h,  \coef2
+        uaddw2          \coef5, \coef5, v6.8h
+        st1             {v4.4h},   [x0], x1
+        sqxtun2         v5.8h,  \coef3
+        uaddw           \coef6, \coef6, v7.4h
+        st1             {v4.d}[1], [x3], x1
+        umin            v5.8h,  v5.8h,  v8.8h
+        sqxtun          v6.4h,  \coef4
+        uaddw2          \coef7, \coef7, v7.8h
+        st1             {v5.4h},   [x0], x1
+        sqxtun2         v6.8h,  \coef5
+        st1             {v5.d}[1], [x3], x1
+        umin            v6.8h,  v6.8h,  v8.8h
+        sqxtun          v7.4h,  \coef6
+        st1             {v6.4h},   [x0], x1
+        sqxtun2         v7.8h,  \coef7
+        st1             {v6.d}[1], [x3], x1
+        umin            v7.8h,  v7.8h,  v8.8h
+        st1             {v7.4h},   [x0], x1
+        st1             {v7.d}[1], [x3], x1
+.endm
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x4 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_4x16_pass1_neon
+        mov             x14, x30
+
+        movi            v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              \txfm\()16
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        cmp             x1,  #12
+        b.eq            1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14
+1:
+        // Special case: For the last input column (x1 == 12),
+        // which would be stored as the last row in the temp buffer,
+        // don't store the first 4x4 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // last 4x4 block).
+        add             x0,  x0,  #16
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+
+        mov             v28.16b, v16.16b
+        mov             v29.16b, v17.16b
+        mov             v30.16b, v18.16b
+        mov             v31.16b, v19.16b
+        br              x14
+endfunc
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 4x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_4x16_pass2_neon
+        mov             x14, x30
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f
+.irp i, 28, 29, 30, 31
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              \txfm\()16
+
+        dup             v8.8h, w13
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+// This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+        .short  0, 10, 38, 89
+endconst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct16x16_dc_add_neon
+.endif
+        mov             x15, x30
+        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
+.ifnc \txfm1\()_\txfm2,idct_idct
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+.endif
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #1024
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+        movrel          x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+        ld1             {v0.8h,v1.8h}, [x10]
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+        mov             x9,  #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #10
+        b.le            idct16x16_quarter_add_16_neon
+        cmp             w3,  #38
+        b.le            idct16x16_half_add_16_neon
+
+        movrel          x12, min_eob_idct_idct_16, 2
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  sp,  #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(16 - \i)/4
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*4)
+        bl              \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+        ld1             {v0.8h,v1.8h}, [x10]
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v28-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2.
+        movi            v28.4s,  #0
+        movi            v29.4s,  #0
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
+.endr
+        b.ne            2b
+3:
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        mov             x3,  #\i
+        bl              \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+        add             sp,  sp,  #1024
+        ldp             d8,  d9,  [sp], 0x10
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x15
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+        mov             x13, #0x03ff
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+        mov             x13, #0x0fff
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_4x16_pass1_quarter_neon
+        mov             x14, x30
+
+        movi            v4.4s, #0
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_quarter
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        // The first 4x4 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        add             x0,  x0,  #16
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+        mov             x14, x30
+
+        // Only load the top 4 lines, and only do it for the later slices.
+        // For the first slice, d16-d19 is kept in registers from the first pass.
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_quarter
+
+        dup             v8.8h, w13
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+        mov             x14, x30
+
+        movi            v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_half
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        cmp             x1,  #4
+        b.eq            1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14
+1:
+        // Special case: For the second input column (r1 == 4),
+        // which would be stored as the second row in the temp buffer,
+        // don't store the first 4x4 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // second 4x4 block).
+        add             x0,  x0,  #16
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v17.16b
+        mov             v22.16b, v18.16b
+        mov             v23.16b, v19.16b
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+        mov             x14, x30
+
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f
+.irp i, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_half
+
+        dup             v8.8h, w13
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+        add             x0,  sp,  #(0*64)
+        mov             x1,  #0
+        add             x2,  x6,  #(0*4)
+        bl              idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+        add             x0,  sp,  #(4*64)
+        mov             x1,  #4
+        add             x2,  x6,  #(4*4)
+        bl              idct16_1d_4x16_pass1_\size\()_neon
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        mov             x3,  #\i
+        bl              idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #1024
+        ldp             d8,  d9,  [sp], 0x10
+        br              x15
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v0.4s,  v2.4s,  #6
+
+        mov             x3,  x0
+        mov             x4,  #32
+        sub             x1,  x1,  #32
+        dup             v31.8h, w13
+1:
+        // Loop to add the constant v0 into all 32x32 outputs
+        subs            x4,  x4,  #1
+        ld1             {v1.8h,v2.8h},  [x0], #32
+        uaddw           v16.4s, v0.4s,  v1.4h
+        uaddw2          v17.4s, v0.4s,  v1.8h
+        ld1             {v3.8h,v4.8h},  [x0], x1
+        uaddw           v18.4s, v0.4s,  v2.4h
+        uaddw2          v19.4s, v0.4s,  v2.8h
+        uaddw           v20.4s, v0.4s,  v3.4h
+        uaddw2          v21.4s, v0.4s,  v3.8h
+        uaddw           v22.4s, v0.4s,  v4.4h
+        uaddw2          v23.4s, v0.4s,  v4.8h
+        sqxtun          v1.4h,  v16.4s
+        sqxtun2         v1.8h,  v17.4s
+        sqxtun          v2.4h,  v18.4s
+        sqxtun2         v2.8h,  v19.4s
+        sqxtun          v3.4h,  v20.4s
+        sqxtun2         v3.8h,  v21.4s
+        sqxtun          v4.4h,  v22.4s
+        sqxtun2         v4.8h,  v23.4s
+        umin            v1.8h,  v1.8h,  v31.8h
+        umin            v2.8h,  v2.8h,  v31.8h
+        st1             {v1.8h,v2.8h},  [x3], #32
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h,v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct32_end
+        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
+        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
+        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
+        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
+        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
+        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
+        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v3  = t28a
+        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
+
+        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
+        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
+        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
+        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
+        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
+        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
+        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
+        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
+        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
+
+        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
+        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
+        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
+        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
+        ret
+.endm
+
+function idct32_odd
+        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_half
+        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        dsmull_h        v4,  v5,  v16, v10.s[0]
+        dsmull_h        v28, v29, v19, v11.s[3]
+        dsmull_h        v30, v31, v16, v10.s[1]
+        dsmull_h        v22, v23, v17, v13.s[2]
+        dsmull_h        v7,  v6,  v17, v13.s[3]
+        dsmull_h        v26, v27, v19, v11.s[2]
+        dsmull_h        v20, v21, v18, v12.s[0]
+        dsmull_h        v24, v25, v18, v12.s[1]
+
+        neg             v28.2d, v28.2d
+        neg             v29.2d, v29.2d
+        neg             v7.2d,  v7.2d
+        neg             v6.2d,  v6.2d
+
+        drshrn_h        v4,  v4,  v5,  #14
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.2d, v20.2d
+        neg             v21.2d, v21.2d
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.2d, v18.2d
+        neg             v19.2d, v19.2d
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+// Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
+function idct32_1d_4x32_pass1\suffix\()_neon
+        mov             x14, x30
+
+        movi            v4.4s,  #0
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct16\suffix
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the registers a, b, c, d horizontally, followed by the
+        // same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+        // There's no rev128 instruction, but we reverse each 64 bit
+        // half, and then flip them using an ext with 8 bytes offset.
+        rev64           v7.4s, \d
+        st1             {\a},  [x0], #16
+        ext             v7.16b, v7.16b, v7.16b, #8
+        st1             {\b},  [x0], #16
+        rev64           v6.4s, \c
+        st1             {\c},  [x0], #16
+        ext             v6.16b, v6.16b, v6.16b, #8
+        st1             {\d},  [x0], #16
+        rev64           v5.4s, \b
+        st1             {v7.4s},  [x0], #16
+        ext             v5.16b, v5.16b, v5.16b, #8
+        st1             {v6.4s},  [x0], #16
+        rev64           v4.4s, \a
+        st1             {v5.4s},  [x0], #16
+        ext             v4.16b, v4.16b, v4.16b, #8
+        st1             {v4.4s},  [x0], #16
+.endm
+        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
+        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
+        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
+        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
+        sub             x0,  x0,  #512
+.purgem store_rev
+
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+.ifb \suffix
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        add             x2,  x2,  #128
+
+        movi            v4.4s,  #0
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct32_odd\suffix
+
+        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
+        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
+        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
+        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
+
+        // Store the registers a, b, c, d horizontally,
+        // adding into the output first, and the mirrored,
+        // subtracted from the output.
+.macro store_rev a, b, c, d, a16b, b16b
+        ld1             {v4.4s},  [x0]
+        rev64           v9.4s, \d
+        add             v4.4s, v4.4s, \a
+        st1             {v4.4s},  [x0], #16
+        rev64           v8.4s, \c
+        ld1             {v4.4s},  [x0]
+        ext             v9.16b, v9.16b, v9.16b, #8
+        add             v4.4s, v4.4s, \b
+        st1             {v4.4s},  [x0], #16
+        ext             v8.16b, v8.16b, v8.16b, #8
+        ld1             {v4.4s},  [x0]
+        rev64           \b, \b
+        add             v4.4s, v4.4s, \c
+        st1             {v4.4s},  [x0], #16
+        rev64           \a, \a
+        ld1             {v4.4s},  [x0]
+        ext             \b16b, \b16b, \b16b, #8
+        add             v4.4s, v4.4s, \d
+        st1             {v4.4s},  [x0], #16
+        ext             \a16b, \a16b, \a16b, #8
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, v9.4s
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, v8.4s
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, \b
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, \a
+        st1             {v4.4s},  [x0], #16
+.endm
+
+        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
+        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
+        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
+        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
+.purgem store_rev
+        br              x14
+endfunc
+
+// This is mostly the same as 4x32_pass1, but without the transpose,
+// and use the source as temp buffer between the two idct passes, and
+// add into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_4x32_pass2\suffix\()_neon
+        mov             x14, x30
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+
+        bl              idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i, x2, x9
+.endr
+
+        sub             x2,  x2,  x9, lsl #4
+        add             x2,  x2,  #128
+
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        sub             x2,  x2,  #128
+
+        bl              idct32_odd\suffix
+
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+        ld1             {v4.4s},  [x2], x9
+        ld1             {v5.4s},  [x2], x9
+        add             v4.4s, v4.4s, \a
+        ld1             {v6.4s},  [x2], x9
+        add             v5.4s, v5.4s, \b
+        ld1             {v7.4s},  [x2], x9
+        add             v6.4s, v6.4s, \c
+        add             v7.4s, v7.4s, \d
+.else
+        ld1             {v4.4s},  [x2], x7
+        ld1             {v5.4s},  [x2], x7
+        sub             v4.4s, v4.4s, \a
+        ld1             {v6.4s},  [x2], x7
+        sub             v5.4s, v5.4s, \b
+        ld1             {v7.4s},  [x2], x7
+        sub             v6.4s, v6.4s, \c
+        sub             v7.4s, v7.4s, \d
+.endif
+        ld1             {v8.4h},   [x0], x1
+        ld1             {v8.d}[1], [x0], x1
+        srshr           v4.4s, v4.4s, #6
+        ld1             {v9.4h},   [x0], x1
+        srshr           v5.4s, v5.4s, #6
+        uaddw           v4.4s, v4.4s, v8.4h
+        ld1             {v9.d}[1], [x0], x1
+        srshr           v6.4s, v6.4s, #6
+        uaddw2          v5.4s, v5.4s, v8.8h
+        srshr           v7.4s, v7.4s, #6
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v6.4s, v6.4s, v9.4h
+        sqxtun          v4.4h, v4.4s
+        uaddw2          v7.4s, v7.4s, v9.8h
+        sqxtun2         v4.8h, v5.4s
+        umin            v4.8h, v4.8h, v15.8h
+        st1             {v4.4h},   [x0], x1
+        sqxtun          v5.4h, v6.4s
+        st1             {v4.d}[1], [x0], x1
+        sqxtun2         v5.8h, v7.4s
+        umin            v5.8h, v5.8h, v15.8h
+        st1             {v5.4h},   [x0], x1
+        st1             {v5.d}[1], [x0], x1
+.endm
+        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
+        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
+        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
+        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
+        sub             x2,  x2,  x9
+        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
+        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
+        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
+        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
+.purgem load_acc_store
+        br              x14
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+        .short  0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon
+        cmp             w3,  #1
+        b.eq            idct32x32_dc_add_neon
+
+        movrel          x10, idct_coeffs
+
+        mov             x15, x30
+        stp             d8,  d9,  [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d14, d15, [sp, #-0x10]!
+
+        sub             sp,  sp,  #4096
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #256
+        neg             x7,  x9
+
+        ld1             {v0.8h,v1.8h},   [x10], #32
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+        ld1             {v10.8h,v11.8h}, [x10]
+        sxtl            v12.4s, v11.4h
+        sxtl2           v13.4s, v11.8h
+        sxtl2           v11.4s, v10.8h
+        sxtl            v10.4s, v10.4h
+
+        dup             v15.8h, w13
+
+        cmp             w3,  #34
+        b.le            idct32x32_quarter_add_16_neon
+        cmp             w3,  #135
+        b.le            idct32x32_half_add_16_neon
+
+        movrel          x12, min_eob_idct_idct_32, 2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  sp,  #(\i*128)
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_neon
+.endr
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.4s,  #0
+        movi            v17.4s,  #0
+        movi            v18.4s,  #0
+        movi            v19.4s,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+.endr
+        b.ne            2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        bl              idct32_1d_4x32_pass2_neon
+.endr
+
+        add             sp,  sp,  #4096
+        ldp             d14, d15, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d8,  d9,  [sp], 0x10
+
+        br              x15
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+        mov             x13, #0x03ff
+        b               vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+        mov             x13, #0x0fff
+        b               vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 4
+        add             x0,  sp,  #(\i*128)
+.ifc \size,quarter
+.if \i == 4
+        cmp             w3,  #9
+        b.le            1f
+.endif
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+        add             x0,  sp,  #(\i*128)
+.if \i == 12
+        cmp             w3,  #70
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.4s,  #0
+        movi            v17.4s,  #0
+        movi            v18.4s,  #0
+        movi            v19.4s,  #0
+
+.rept 4
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        bl              idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #4096
+        ldp             d14, d15, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d8,  d9,  [sp], 0x10
+
+        br              x15
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 3ffb418..99413b0 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000..9075f3d
--- /dev/null
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
+        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
+        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
+.endm
+
+// The input to and output from this macro is in the registers v16-v31,
+// and v0-v7 are used as scratch registers.
+// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
+// Depending on the width of the loop filter, we either use v16-v19
+// and v28-v31 as temp registers, or v8-v15.
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+        dup             v0.8h,  w2                   // E
+        dup             v2.8h,  w3                   // I
+        dup             v3.8h,  w4                   // H
+
+        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
+        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
+        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
+        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
+        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
+        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
+        umax            v4.8h,  v4.8h,  v5.8h
+        umax            v5.8h,  v6.8h,  v7.8h
+        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
+        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
+        umax            v4.8h,  v4.8h,  v5.8h
+        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
+        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
+        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
+        ushr            v5.8h,  v5.8h,  #1
+        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
+        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        cmhs            v6.8h,  v0.8h,  v6.8h
+        and             v4.16b, v4.16b, v6.16b       // fm
+
+        // If no pixels need filtering, just exit as soon as possible
+        mov             x11, v4.d[0]
+        mov             x12, v4.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        br              x10
+1:
+
+.if \wd >= 8
+        dup             v0.8h,  w5
+
+        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
+        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
+        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
+        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
+        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
+        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
+        umax            v6.8h,  v6.8h,  v2.8h
+        umax            v1.8h,  v1.8h,  \tmp1\().8h
+        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
+.if \wd == 16
+        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
+        umax            v6.8h,  v6.8h,  v1.8h
+        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
+        umax            v6.8h,  v6.8h,  \tmp2\().8h
+        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
+        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
+        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
+        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
+        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
+        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
+        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
+        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
+        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
+
+        umax            v7.8h,  v7.8h,  v2.8h
+        umax            v1.8h,  v1.8h,  v8.8h
+        umax            v9.8h,  v9.8h,  v10.8h
+        umax            v11.8h, v11.8h, v12.8h
+        // The rest of the calculation of flat8out is interleaved below
+.else
+        // The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+        // Calculate the normal inner loop filter for 2 or 4 pixels
+        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
+.if \wd == 16
+        umax            v7.8h,  v7.8h,  v1.8h
+        umax            v9.8h,  v9.8h,  v11.8h
+.elseif \wd == 8
+        umax            v6.8h,  v6.8h,  v1.8h
+.endif
+        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
+.if \wd == 16
+        umax            v7.8h,  v7.8h,  v9.8h
+.elseif \wd == 8
+        umax            v6.8h,  v6.8h,  \tmp2\().8h
+.endif
+        dup             \tmp2\().8h,  w6                        // left shift for saturation
+        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
+        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
+        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
+        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
+        movi            \tmp5\().8h,  #3
+.if \wd == 8
+        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
+.endif
+        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
+.if \wd == 8
+        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
+.endif
+        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
+.if \wd == 16
+        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
+.elseif \wd == 8
+        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
+.endif
+        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
+.if \wd == 16
+        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
+.endif
+        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
+        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int8 = 0
+        movi            v2.8h,  #4
+        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+        movi            v3.8h,  #3
+        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
+        movi            \tmp5\().8h,  #0
+        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+        dup             \tmp6\().8h,  w7                        // max pixel value
+.if \wd == 16
+        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
+.endif
+
+        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
+
+        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
+        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
+        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
+        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
+
+        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
+        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
+        smin            v0.8h,   v0.8h,   \tmp6\().8h
+        smin            v2.8h,   v2.8h,   \tmp6\().8h
+        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
+        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
+        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
+        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
+        bit             v24.16b, v2.16b,  v4.16b
+
+        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
+        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
+.if \wd >= 8
+        mov             x11, v6.d[0]
+.endif
+        smin            v0.8h,  v0.8h,  \tmp6\().8h
+        smin            v2.8h,  v2.8h,  \tmp6\().8h
+.if \wd >= 8
+        mov             x12, v6.d[1]
+.endif
+        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
+        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
+.if \wd >= 8
+        adds            x11, x11, x12
+.endif
+        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
+        bit             v25.16b, v2.16b,  v5.16b
+
+        // If no pixels need flat8in, jump to flat8out
+        // (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd >= 8
+.if \wd == 16
+        b.eq            6f
+.else
+        b.ne            1f
+        br              x13
+1:
+.endif
+
+        // flat8in
+        add             \tmp1\().8h, v20.8h, v21.8h
+        add             \tmp3\().8h, v22.8h, v25.8h
+        add             \tmp5\().8h, v20.8h, v22.8h
+        add             \tmp7\().8h, v23.8h, v26.8h
+        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
+        add             v0.8h,  v0.8h,  v23.8h
+        add             v0.8h,  v0.8h,  v24.8h
+        add             v0.8h,  v0.8h,  \tmp5\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+        urshr           v2.8h,  v0.8h,  #3                      // out p2
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        add             \tmp1\().8h, v20.8h,  v23.8h
+        add             \tmp3\().8h, v24.8h,  v27.8h
+        urshr           v3.8h,  v0.8h,  #3                      // out p1
+
+        add             v0.8h,  v0.8h,  \tmp7\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        add             \tmp5\().8h, v21.8h,  v24.8h
+        add             \tmp7\().8h, v25.8h,  v27.8h
+        urshr           v4.8h,  v0.8h,  #3                      // out p0
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+        add             \tmp1\().8h, v22.8h,  v25.8h
+        add             \tmp3\().8h, v26.8h,  v27.8h
+        urshr           v5.8h,  v0.8h,  #3                      // out q0
+
+        add             v0.8h,  v0.8h,  \tmp7\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        // The output here is written back into the input registers. This doesn't
+        // matter for the flat8part below, since we only update those pixels
+        // which won't be touched below.
+        bit             v21.16b, v2.16b,  v6.16b
+        bit             v22.16b, v3.16b,  v6.16b
+        bit             v23.16b, v4.16b,  v6.16b
+        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
+        bit             v24.16b, v5.16b,  v6.16b
+        bit             v25.16b, \tmp5\().16b,  v6.16b
+        bit             v26.16b, \tmp6\().16b,  v6.16b
+.endif
+.if \wd == 16
+6:
+        orr             v2.16b,  v6.16b,  v7.16b
+        mov             x11, v2.d[0]
+        mov             x12, v2.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        // If no pixels needed flat8in nor flat8out, jump to a
+        // writeout of the inner 4 pixels
+        br              x14
+1:
+
+        mov             x11, v7.d[0]
+        mov             x12, v7.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+        br              x15
+
+1:
+        // flat8out
+        // This writes all outputs into v2-v17 (skipping v6 and v16).
+        // If this part is skipped, the output is read from v21-v26 (which is the input
+        // to this section).
+        shl             v0.8h,   v16.8h,  #3     // 8 * v16
+        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
+        add             v0.8h,   v0.8h,   v17.8h
+        add             v8.8h,   v17.8h,  v18.8h
+        add             v10.8h,  v19.8h,  v20.8h
+        add             v0.8h,   v0.8h,   v8.8h
+        add             v8.8h,   v16.8h,  v17.8h
+        add             v12.8h,  v21.8h,  v22.8h
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v10.8h,  v18.8h,  v25.8h
+        add             v14.8h,  v23.8h,  v24.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        add             v0.8h,   v0.8h,   v12.8h
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v18.8h
+        add             v14.8h,  v19.8h,  v26.8h
+        urshr           v2.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v8.8h,   v16.8h,  v19.8h
+        add             v10.8h,  v20.8h,  v27.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        bif             v2.16b,  v17.16b, v7.16b
+        urshr           v3.8h ,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v20.8h
+        add             v14.8h,  v21.8h,  v28.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        bif             v3.16b,  v18.16b, v7.16b
+        urshr           v4.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v8.8h,   v16.8h,  v21.8h
+        add             v10.8h,  v22.8h,  v29.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        bif             v4.16b,  v19.16b, v7.16b
+        urshr           v5.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v22.8h
+        add             v14.8h,  v23.8h,  v30.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        bif             v5.16b,  v20.16b, v7.16b
+        urshr           v6.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v10.8h,  v16.8h,  v23.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        add             v12.8h,  v24.8h,  v31.8h
+        bif             v6.16b,  v21.16b, v7.16b
+        urshr           v8.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        sub             v10.8h,  v12.8h,  v10.8h
+        add             v12.8h,  v17.8h,  v24.8h
+        add             v14.8h,  v25.8h,  v31.8h
+        bif             v8.16b,  v22.16b, v7.16b
+        urshr           v9.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        add             v12.8h,  v26.8h,  v31.8h
+        bif             v9.16b,  v23.16b, v7.16b
+        urshr           v10.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v14.8h,  v18.8h,  v25.8h
+        add             v18.8h,  v19.8h,  v26.8h
+        sub             v12.8h,  v12.8h,  v14.8h
+        add             v14.8h,  v27.8h,  v31.8h
+        bif             v10.16b, v24.16b, v7.16b
+        urshr           v11.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v12.8h
+        add             v12.8h,  v20.8h,  v27.8h
+        sub             v14.8h,  v14.8h,  v18.8h
+        add             v18.8h,  v28.8h,  v31.8h
+        bif             v11.16b, v25.16b, v7.16b
+        sub             v18.8h,  v18.8h,  v12.8h
+        urshr           v12.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v14.8h,  v21.8h,  v28.8h
+        add             v20.8h,  v29.8h,  v31.8h
+        bif             v12.16b, v26.16b, v7.16b
+        urshr           v13.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v18.8h
+        sub             v20.8h,  v20.8h,  v14.8h
+        add             v18.8h,  v22.8h,  v29.8h
+        add             v22.8h,  v30.8h,  v31.8h
+        bif             v13.16b, v27.16b, v7.16b
+        urshr           v14.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v20.8h
+        sub             v22.8h,  v22.8h,  v18.8h
+        bif             v14.16b, v28.16b, v7.16b
+        urshr           v15.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v22.8h
+        bif             v15.16b, v29.16b, v7.16b
+        urshr           v17.8h,  v0.8h,   #4
+        bif             v17.16b, v30.16b, v7.16b
+.endif
+.endm
+
+// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
+// while we need those for inputs/outputs in wd=16 and use v8-v15
+// for temp registers there instead.
+function vp9_loop_filter_4
+        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
+        ret
+endfunc
+
+function vp9_loop_filter_8
+        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
+        ret
+endfunc
+
+function vp9_loop_filter_16
+        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
+        ret
+endfunc
+
+.macro loop_filter_4
+        bl              vp9_loop_filter_4
+.endm
+
+.macro loop_filter_8
+        // calculate alternative 'return' targets
+        adr             x13, 6f
+        bl              vp9_loop_filter_8
+.endm
+
+.macro loop_filter_16
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
+        bl              vp9_loop_filter_16
+.endm
+
+
+// The public functions in this file have got the following signature:
+// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
+
+.macro bpp_frontend func, bpp, push
+function ff_\func\()_\bpp\()_neon, export=1
+.if \push
+        mov             x16, x30
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+        lsl             w2,  w2,  #\bpp - 8
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8)
+        mov             x6,  #16 - \bpp
+        mov             x7,  #((1 << \bpp) - 1)
+.if \push
+        bl              \func\()_16_neon
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+        br              x16
+.else
+        b               \func\()_16_neon
+.endif
+endfunc
+.endm
+
+.macro bpp_frontends func, push=0
+        bpp_frontend    \func, 10, \push
+        bpp_frontend    \func, 12, \push
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+        mov             x16, x30
+.if \push
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+        lsl             w2,  w2,  #\bpp - 8
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8)
+        mov             x6,  #16 - \bpp
+        mov             x7,  #((1 << \bpp) - 1)
+        bl              \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+        add             x0,  x0,  x1, lsl #3
+.else
+        add             x0,  x0,  #16
+.endif
+        bl              \func\()_\int_suffix\()_16_neon
+.if \push
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x16
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
+        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
+        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
+.endm
+
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+        mov             x16, x30
+        lsr             w8,  w2,  #8
+        lsr             w14, w3,  #8
+        lsr             w15, w4,  #8
+        and             w2,  w2,  #0xff
+        and             w3,  w3,  #0xff
+        and             w4,  w4,  #0xff
+        lsl             w2,  w2,  #\bpp - 8
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8)
+        mov             x6,  #16 - \bpp
+        mov             x7,  #((1 << \bpp) - 1)
+        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+        add             x0,  x0,  x1, lsl #3
+.else
+        add             x0,  x0,  #16
+.endif
+        lsl             w2,  w8,  #\bpp - 8
+        lsl             w3,  w14, #\bpp - 8
+        lsl             w4,  w15, #\bpp - 8
+        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+        br              x16
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+        bpp_frontend_mix2 \wd1, \wd2, v, 10
+        bpp_frontend_mix2 \wd1, \wd2, v, 12
+        bpp_frontend_mix2 \wd1, \wd2, h, 10
+        bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v27.8h}, [x0], x1 // q3
+        sub             x0,  x0,  x1, lsl #2
+        sub             x9,  x9,  x1, lsl #1
+
+        loop_filter_4
+
+        st1             {v22.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
+function vp9_loop_filter_h_4_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  #8
+        add             x0,  x9,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1
+        ld1             {v24.8h}, [x0], x1
+        ld1             {v21.8h}, [x9], x1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v22.8h}, [x9], x1
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v23.8h}, [x9], x1
+        ld1             {v27.8h}, [x0], x1
+
+        sub             x9,  x9,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        loop_filter_4
+
+        // Move x9 forward by 2 pixels; we don't need to rewrite the
+        // outermost 2 pixels since they aren't changed.
+        add             x9,  x9,  #4
+        add             x0,  x9,  x1, lsl #2
+
+        // We only will write the mid 4 pixels back; after the loop filter,
+        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
+        // We need to transpose them to columns, done with a 4x8 transpose
+        // (which in practice is two 4x4 transposes of the two 4x4 halves
+        // of the 8x4 pixels; into 4x8 pixels).
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1
+        st1             {v22.d}[1], [x0], x1
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #4
+
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+function vp9_loop_filter_v_8_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v27.8h}, [x0], x1 // q3
+        sub             x9,  x9,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #2
+        add             x9,  x9,  x1
+
+        loop_filter_8
+
+        st1             {v21.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+
+        br              x10
+6:
+        sub             x9,  x0,  x1, lsl #1
+        st1             {v22.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+function vp9_loop_filter_h_8_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  #8
+        add             x0,  x9,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1
+        ld1             {v24.8h}, [x0], x1
+        ld1             {v21.8h}, [x9], x1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v22.8h}, [x9], x1
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v23.8h}, [x9], x1
+        ld1             {v27.8h}, [x0], x1
+
+        sub             x9,  x9,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        loop_filter_8
+
+        add             x0,  x9,  x1, lsl #2
+
+        // Even though only 6 pixels per row have been changed, we write the
+        // full 8 pixel registers.
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        st1             {v20.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v21.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v27.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+
+        br              x10
+6:
+        // If we didn't need to do the flat8in part, we use the same writeback
+        // as in loop_filter_h_4_8.
+        add             x9,  x9,  #4
+        add             x0,  x9,  x1, lsl #2
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1
+        st1             {v22.d}[1], [x0], x1
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #4
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  x1, lsl #3
+        ld1             {v16.8h}, [x9], x1 // p7
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v17.8h}, [x9], x1 // p6
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v18.8h}, [x9], x1 // p5
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v19.8h}, [x9], x1 // p4
+        ld1             {v27.8h}, [x0], x1 // q3
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v28.8h}, [x0], x1 // q4
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v29.8h}, [x0], x1 // q5
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v30.8h}, [x0], x1 // q6
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v31.8h}, [x0], x1 // q7
+        sub             x9,  x9,  x1, lsl #3
+        sub             x0,  x0,  x1, lsl #3
+        add             x9,  x9,  x1
+
+        loop_filter_16
+
+        // If we did the flat8out part, we get the output in
+        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
+        // store v2-v9 there, and v10-v17 into x0.
+        st1             {v2.8h},  [x9], x1
+        st1             {v10.8h}, [x0], x1
+        st1             {v3.8h},  [x9], x1
+        st1             {v11.8h}, [x0], x1
+        st1             {v4.8h},  [x9], x1
+        st1             {v12.8h}, [x0], x1
+        st1             {v5.8h},  [x9], x1
+        st1             {v13.8h}, [x0], x1
+        st1             {v6.8h},  [x9], x1
+        st1             {v14.8h}, [x0], x1
+        st1             {v8.8h},  [x9], x1
+        st1             {v15.8h}, [x0], x1
+        st1             {v9.8h},  [x9], x1
+        st1             {v17.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  x1
+
+        br              x10
+8:
+        add             x9,  x9,  x1, lsl #2
+        // If we didn't do the flat8out part, the output is left in the
+        // input registers.
+        st1             {v21.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+        br              x10
+7:
+        sub             x9,  x0,  x1, lsl #1
+        st1             {v22.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
+
+function vp9_loop_filter_h_16_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  #16
+        ld1             {v16.8h}, [x9], x1
+        ld1             {v24.8h}, [x0], x1
+        ld1             {v17.8h}, [x9], x1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v18.8h}, [x9], x1
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v19.8h}, [x9], x1
+        ld1             {v27.8h}, [x0], x1
+        ld1             {v20.8h}, [x9], x1
+        ld1             {v28.8h}, [x0], x1
+        ld1             {v21.8h}, [x9], x1
+        ld1             {v29.8h}, [x0], x1
+        ld1             {v22.8h}, [x9], x1
+        ld1             {v30.8h}, [x0], x1
+        ld1             {v23.8h}, [x9], x1
+        ld1             {v31.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        sub             x9,  x9,  x1, lsl #3
+
+        // The 16x8 pixels read above is in two 8x8 blocks; the left
+        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
+        // of this, to get one column per register.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+        loop_filter_16
+
+        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
+        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
+
+        st1             {v16.8h}, [x9], x1
+        st1             {v10.8h}, [x0], x1
+        st1             {v2.8h},  [x9], x1
+        st1             {v11.8h}, [x0], x1
+        st1             {v3.8h},  [x9], x1
+        st1             {v12.8h}, [x0], x1
+        st1             {v4.8h},  [x9], x1
+        st1             {v13.8h}, [x0], x1
+        st1             {v5.8h},  [x9], x1
+        st1             {v14.8h}, [x0], x1
+        st1             {v6.8h},  [x9], x1
+        st1             {v15.8h}, [x0], x1
+        st1             {v8.8h},  [x9], x1
+        st1             {v17.8h}, [x0], x1
+        st1             {v9.8h},  [x9], x1
+        st1             {v31.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+
+        br              x10
+8:
+        // The same writeback as in loop_filter_h_8_8
+        sub             x9,  x0,  #8
+        add             x0,  x9,  x1, lsl #2
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        st1             {v20.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v21.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v27.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+        br              x10
+7:
+        // The same writeback as in loop_filter_h_4_8
+        sub             x9,  x0,  #4
+        add             x0,  x9,  x1, lsl #2
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1
+        st1             {v22.d}[1], [x0], x1
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #4
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e9c4970..0878763 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp9mc_16bpp_neon.S b/libavcodec/aarch64/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000..cac6428
--- /dev/null
+++ b/libavcodec/aarch64/vp9mc_16bpp_neon.S
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// All public functions in this file have the following signature:
+// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+//                            const uint8_t *ref, ptrdiff_t ref_stride,
+//                            int h, int mx, int my);
+
+function ff_vp9_copy128_aarch64, export=1
+1:
+        ldp             x5,  x6,  [x2]
+        ldp             x7,  x8,  [x2, #16]
+        stp             x5,  x6,  [x0]
+        ldp             x9,  x10, [x2, #32]
+        stp             x7,  x8,  [x0, #16]
+        subs            w4,  w4,  #1
+        ldp             x11, x12, [x2, #48]
+        stp             x9,  x10, [x0, #32]
+        stp             x11, x12, [x0, #48]
+        ldp             x5,  x6,  [x2, #64]
+        ldp             x7,  x8,  [x2, #80]
+        stp             x5,  x6,  [x0, #64]
+        ldp             x9,  x10, [x2, #96]
+        stp             x7,  x8,  [x0, #80]
+        ldp             x11, x12, [x2, #112]
+        stp             x9,  x10, [x0, #96]
+        stp             x11, x12, [x0, #112]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1
+        mov             x5,  x0
+        sub             x1,  x1,  #64
+        sub             x3,  x3,  #64
+1:
+        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+        urhadd          v0.8h,  v0.8h,  v4.8h
+        urhadd          v1.8h,  v1.8h,  v5.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+        urhadd          v2.8h,  v2.8h,  v6.8h
+        urhadd          v3.8h,  v3.8h,  v7.8h
+        subs            w4,  w4,  #1
+        urhadd          v16.8h, v16.8h, v20.8h
+        urhadd          v17.8h, v17.8h, v21.8h
+        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], #64
+        urhadd          v18.8h, v18.8h, v22.8h
+        urhadd          v19.8h, v19.8h, v23.8h
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+        mov             x5,  x0
+1:
+        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+        urhadd          v0.8h,  v0.8h,  v4.8h
+        urhadd          v1.8h,  v1.8h,  v5.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+        urhadd          v2.8h,  v2.8h,  v6.8h
+        urhadd          v3.8h,  v3.8h,  v7.8h
+        subs            w4,  w4,  #2
+        urhadd          v16.8h, v16.8h, v20.8h
+        urhadd          v17.8h, v17.8h, v21.8h
+        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], x1
+        urhadd          v18.8h, v18.8h, v22.8h
+        urhadd          v19.8h, v19.8h, v23.8h
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+1:
+        ld1             {v2.8h, v3.8h},  [x2], x3
+        ld1             {v0.8h, v1.8h},  [x0]
+        urhadd          v0.8h,  v0.8h,  v2.8h
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        subs            w4,  w4,  #1
+        st1             {v0.8h, v1.8h},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+        mov             x5,  x0
+1:
+        ld1             {v2.8h},  [x2], x3
+        ld1             {v0.8h},  [x0], x1
+        ld1             {v3.8h},  [x2], x3
+        urhadd          v0.8h,  v0.8h,  v2.8h
+        ld1             {v1.8h},  [x0], x1
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        subs            w4,  w4,  #2
+        st1             {v0.8h},  [x5], x1
+        st1             {v1.8h},  [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+        mov             x5,  x0
+1:
+        ld1             {v2.4h},  [x2], x3
+        ld1             {v0.4h},  [x0], x1
+        ld1             {v3.4h},  [x2], x3
+        urhadd          v0.4h,  v0.4h,  v2.4h
+        ld1             {v1.4h},  [x0], x1
+        urhadd          v1.4h,  v1.4h,  v3.4h
+        subs            w4,  w4,  #2
+        st1             {v0.4h},  [x5], x1
+        st1             {v1.8b},  [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+
+// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
+// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
+// for size >= 16)
+.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
+        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
+        smlal           \dst1\().4s, v20.4h, v0.h[\offset]
+        smlal           \dst5\().4s, v22.4h, v0.h[\offset]
+.if \size >= 16
+        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+.endif
+.if \size >= 8
+        smlal2          \dst2\().4s, v20.8h, v0.h[\offset]
+        smlal2          \dst6\().4s, v22.8h, v0.h[\offset]
+.endif
+.if \size >= 16
+        smlal           \dst3\().4s, v21.4h, v0.h[\offset]
+        smlal           \dst7\().4s, v23.4h, v0.h[\offset]
+        smlal2          \dst4\().4s, v21.8h, v0.h[\offset]
+        smlal2          \dst8\().4s, v23.8h, v0.h[\offset]
+.endif
+.endm
+
+
+// Instantiate a horizontal filter function for the given size.
+// This can work on 4, 8 or 16 pixels in parallel; for larger
+// widths it will do 16 pixels at a time and loop horizontally.
+// The actual width (in bytes) is passed in x5, the height in w4 and
+// the filter coefficients in x9.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+        sub             x2,  x2,  #6
+        add             x6,  x0,  x1
+        add             x7,  x2,  x3
+        add             x1,  x1,  x1
+        add             x3,  x3,  x3
+        // Only size >= 16 loops horizontally and needs
+        // reduced dst stride
+.if \size >= 16
+        sub             x1,  x1,  x5
+.endif
+        // size >= 16 loads two qwords and increments r2,
+        // for size 4/8 it's enough with one qword and no
+        // postincrement
+.if \size >= 16
+        sub             x3,  x3,  x5
+        sub             x3,  x3,  #16
+.endif
+        // Load the filter vector
+        ld1             {v0.8h},  [x9]
+1:
+.if \size >= 16
+        mov             x9,  x5
+.endif
+        // Load src
+.if \size >= 16
+        ld1             {v5.8h,  v6.8h,  v7.8h},  [x2], #48
+        ld1             {v16.8h, v17.8h, v18.8h}, [x7], #48
+.else
+        ld1             {v5.8h,  v6.8h},  [x2]
+        ld1             {v16.8h, v17.8h}, [x7]
+.endif
+2:
+
+        smull           v1.4s,  v5.4h,  v0.h[0]
+        smull           v24.4s, v16.4h, v0.h[0]
+.if \size >= 8
+        smull2          v2.4s,  v5.8h,  v0.h[0]
+        smull2          v25.4s, v16.8h, v0.h[0]
+.endif
+.if \size >= 16
+        smull           v3.4s,  v6.4h,  v0.h[0]
+        smull           v26.4s, v17.4h, v0.h[0]
+        smull2          v4.4s,  v6.8h,  v0.h[0]
+        smull2          v27.4s, v17.8h, v0.h[0]
+.endif
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 1, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 2, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 3, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 4, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 5, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 6, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 7, \size
+
+        // Round, shift and saturate
+        // The sqrshrun takes care of clamping negative values to zero, but
+        // we manually need to do umin with the max pixel value.
+        sqrshrun        v1.4h,  v1.4s,  #7
+        sqrshrun        v24.4h, v24.4s, #7
+.if \size >= 8
+        sqrshrun2       v1.8h,  v2.4s,  #7
+        sqrshrun2       v24.8h, v25.4s, #7
+        umin            v1.8h,  v1.8h,  v31.8h
+        umin            v24.8h, v24.8h, v31.8h
+.if \size >= 16
+        sqrshrun        v2.4h,  v3.4s,  #7
+        sqrshrun        v25.4h, v26.4s, #7
+        sqrshrun2       v2.8h,  v4.4s,  #7
+        sqrshrun2       v25.8h, v27.4s, #7
+        umin            v2.8h,  v2.8h,  v31.8h
+        umin            v25.8h, v25.8h, v31.8h
+.endif
+.else
+        umin            v1.4h,  v1.4h,  v31.4h
+        umin            v24.4h, v24.4h, v31.4h
+.endif
+        // Average
+.ifc \type,avg
+.if \size >= 16
+        ld1             {v3.8h,  v4.8h},  [x0]
+        ld1             {v29.8h, v30.8h}, [x6]
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        urhadd          v2.8h,  v2.8h,  v4.8h
+        urhadd          v24.8h, v24.8h, v29.8h
+        urhadd          v25.8h, v25.8h, v30.8h
+.elseif \size >= 8
+        ld1             {v3.8h},  [x0]
+        ld1             {v4.8h},  [x6]
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        urhadd          v24.8h, v24.8h, v4.8h
+.else
+        ld1             {v3.4h},  [x0]
+        ld1             {v4.4h},  [x6]
+        urhadd          v1.4h,  v1.4h,  v3.4h
+        urhadd          v24.4h, v24.4h, v4.4h
+.endif
+.endif
+        // Store and loop horizontally (for size >= 16)
+.if \size >= 16
+        subs            x9,  x9,  #32
+        st1             {v1.8h,  v2.8h},  [x0], #32
+        st1             {v24.8h, v25.8h}, [x6], #32
+        b.eq            3f
+        mov             v5.16b,  v7.16b
+        mov             v16.16b, v18.16b
+        ld1             {v6.8h,  v7.8h},  [x2], #32
+        ld1             {v17.8h, v18.8h}, [x7], #32
+        b               2b
+.elseif \size == 8
+        st1             {v1.8h},  [x0]
+        st1             {v24.8h}, [x6]
+.else // \size == 4
+        st1             {v1.4h},  [x0]
+        st1             {v24.4h}, [x6]
+.endif
+3:
+        // Loop vertically
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x2,  x2,  x3
+        add             x7,  x7,  x3
+        subs            w4,  w4,  #2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
+        cmp             w5,  #8
+        add             x9,  x6,  w5, uxtw #4
+        mov             x5,  #2*\size
+.if \size >= 16
+        b               \type\()_8tap_16h
+.else
+        b               \type\()_8tap_\size\()h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp,   2, \size, \bpp
+do_8tap_h_func avg, sharp,   2, \size, \bpp
+do_8tap_h_func put, smooth,  0, \size, \bpp
+do_8tap_h_func avg, smooth,  0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8,  \bpp
+do_8tap_h_filters 4,  \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+
+// Vertical filters
+
+// Round, shift and saturate and store reg1-reg4
+.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
+        sqrshrun        \reg2\().4h,  \reg2\().4s, #7
+        sqrshrun        \reg3\().4h,  \reg3\().4s, #7
+        sqrshrun        \reg4\().4h,  \reg4\().4s, #7
+.ifc \type,avg
+        ld1             {\tmp1\().4h},  [x7], x1
+        ld1             {\tmp2\().4h},  [x7], x1
+        ld1             {\tmp3\().4h},  [x7], x1
+        ld1             {\tmp4\().4h},  [x7], x1
+.endif
+        umin            \reg1\().4h,  \reg1\().4h,  \minreg\().4h
+        umin            \reg2\().4h,  \reg2\().4h,  \minreg\().4h
+        umin            \reg3\().4h,  \reg3\().4h,  \minreg\().4h
+        umin            \reg4\().4h,  \reg4\().4h,  \minreg\().4h
+.ifc \type,avg
+        urhadd          \reg1\().4h,  \reg1\().4h,  \tmp1\().4h
+        urhadd          \reg2\().4h,  \reg2\().4h,  \tmp2\().4h
+        urhadd          \reg3\().4h,  \reg3\().4h,  \tmp3\().4h
+        urhadd          \reg4\().4h,  \reg4\().4h,  \tmp4\().4h
+.endif
+        st1             {\reg1\().4h},  [x0], x1
+        st1             {\reg2\().4h},  [x0], x1
+        st1             {\reg3\().4h},  [x0], x1
+        st1             {\reg4\().4h},  [x0], x1
+.endm
+
+// Round, shift and saturate and store reg1-8, where
+// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
+.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
+        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
+        sqrshrun2       \reg1\().8h,  \reg2\().4s, #7
+        sqrshrun        \reg2\().4h,  \reg3\().4s, #7
+        sqrshrun2       \reg2\().8h,  \reg4\().4s, #7
+        sqrshrun        \reg3\().4h,  \reg5\().4s, #7
+        sqrshrun2       \reg3\().8h,  \reg6\().4s, #7
+        sqrshrun        \reg4\().4h,  \reg7\().4s, #7
+        sqrshrun2       \reg4\().8h,  \reg8\().4s, #7
+.ifc \type,avg
+        ld1             {\reg5\().8h},  [x7], x1
+        ld1             {\reg6\().8h},  [x7], x1
+        ld1             {\reg7\().8h},  [x7], x1
+        ld1             {\reg8\().8h},  [x7], x1
+.endif
+        umin            \reg1\().8h,  \reg1\().8h,  \minreg\().8h
+        umin            \reg2\().8h,  \reg2\().8h,  \minreg\().8h
+        umin            \reg3\().8h,  \reg3\().8h,  \minreg\().8h
+        umin            \reg4\().8h,  \reg4\().8h,  \minreg\().8h
+.ifc \type,avg
+        urhadd          \reg1\().8h,  \reg1\().8h,  \reg5\().8h
+        urhadd          \reg2\().8h,  \reg2\().8h,  \reg6\().8h
+        urhadd          \reg3\().8h,  \reg3\().8h,  \reg7\().8h
+        urhadd          \reg4\().8h,  \reg4\().8h,  \reg8\().8h
+.endif
+        st1             {\reg1\().8h},  [x0], x1
+        st1             {\reg2\().8h},  [x0], x1
+        st1             {\reg3\().8h},  [x0], x1
+        st1             {\reg4\().8h},  [x0], x1
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+// (src1-src8 into dst1, src2-src9 into dst2).
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+        smull           \dst1\().4s, \src1\().4h, v0.h[0]
+        smull           \dst2\().4s, \src2\().4h, v0.h[0]
+        smull           \tmp1\().4s, \src2\().4h, v0.h[1]
+        smull           \tmp2\().4s, \src3\().4h, v0.h[1]
+        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
+        smlal           \dst2\().4s, \src4\().4h, v0.h[2]
+        smlal           \tmp1\().4s, \src4\().4h, v0.h[3]
+        smlal           \tmp2\().4s, \src5\().4h, v0.h[3]
+        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
+        smlal           \dst2\().4s, \src6\().4h, v0.h[4]
+        smlal           \tmp1\().4s, \src6\().4h, v0.h[5]
+        smlal           \tmp2\().4s, \src7\().4h, v0.h[5]
+        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
+        smlal           \dst2\().4s, \src8\().4h, v0.h[6]
+        smlal           \tmp1\().4s, \src8\().4h, v0.h[7]
+        smlal           \tmp2\().4s, \src9\().4h, v0.h[7]
+        add             \dst1\().4s, \dst1\().4s, \tmp1\().4s
+        add             \dst2\().4s, \dst2\().4s, \tmp2\().4s
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
+// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
+        smull           \dst1\().4s, \src1\().4h, v0.h[0]
+        smull2          \dst2\().4s, \src1\().8h, v0.h[0]
+        smull           \dst3\().4s, \src2\().4h, v0.h[0]
+        smull2          \dst4\().4s, \src2\().8h, v0.h[0]
+        smlal           \dst1\().4s, \src2\().4h, v0.h[1]
+        smlal2          \dst2\().4s, \src2\().8h, v0.h[1]
+        smlal           \dst3\().4s, \src3\().4h, v0.h[1]
+        smlal2          \dst4\().4s, \src3\().8h, v0.h[1]
+        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
+        smlal2          \dst2\().4s, \src3\().8h, v0.h[2]
+        smlal           \dst3\().4s, \src4\().4h, v0.h[2]
+        smlal2          \dst4\().4s, \src4\().8h, v0.h[2]
+        smlal           \dst1\().4s, \src4\().4h, v0.h[3]
+        smlal2          \dst2\().4s, \src4\().8h, v0.h[3]
+        smlal           \dst3\().4s, \src5\().4h, v0.h[3]
+        smlal2          \dst4\().4s, \src5\().8h, v0.h[3]
+        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
+        smlal2          \dst2\().4s, \src5\().8h, v0.h[4]
+        smlal           \dst3\().4s, \src6\().4h, v0.h[4]
+        smlal2          \dst4\().4s, \src6\().8h, v0.h[4]
+        smlal           \dst1\().4s, \src6\().4h, v0.h[5]
+        smlal2          \dst2\().4s, \src6\().8h, v0.h[5]
+        smlal           \dst3\().4s, \src7\().4h, v0.h[5]
+        smlal2          \dst4\().4s, \src7\().8h, v0.h[5]
+        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
+        smlal2          \dst2\().4s, \src7\().8h, v0.h[6]
+        smlal           \dst3\().4s, \src8\().4h, v0.h[6]
+        smlal2          \dst4\().4s, \src8\().8h, v0.h[6]
+        smlal           \dst1\().4s, \src8\().4h, v0.h[7]
+        smlal2          \dst2\().4s, \src8\().8h, v0.h[7]
+        smlal           \dst3\().4s, \src9\().4h, v0.h[7]
+        smlal2          \dst4\().4s, \src9\().8h, v0.h[7]
+.endm
+
+// Instantiate a vertical filter function for filtering 8 pixels at a time.
+// The height is passed in x4, the width in x5 and the filter coefficients
+// in x6.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        ld1             {v0.8h},  [x6]
+1:
+.ifc \type,avg
+        mov             x7,  x0
+.endif
+        mov             x6,  x4
+
+        ld1             {v17.8h}, [x2], x3
+        ld1             {v18.8h}, [x2], x3
+        ld1             {v19.8h}, [x2], x3
+        ld1             {v20.8h}, [x2], x3
+        ld1             {v21.8h}, [x2], x3
+        ld1             {v22.8h}, [x2], x3
+        ld1             {v23.8h}, [x2], x3
+2:
+        ld1             {v24.8h}, [x2], x3
+        ld1             {v25.8h}, [x2], x3
+        ld1             {v26.8h}, [x2], x3
+        ld1             {v27.8h}, [x2], x3
+
+        convolve8       v2,  v3,  v4,  v5,  v17, v18, v19, v20, v21, v22, v23, v24, v25
+        convolve8       v6,  v7,  v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v30, v31, v1,  \type
+
+        subs            x6,  x6,  #4
+        b.eq            8f
+
+        ld1             {v16.8h}, [x2], x3
+        ld1             {v17.8h}, [x2], x3
+        ld1             {v18.8h}, [x2], x3
+        ld1             {v19.8h}, [x2], x3
+        convolve8       v2,  v3,  v4,  v5,  v21, v22, v23, v24, v25, v26, v27, v16, v17
+        convolve8       v6,  v7,  v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
+        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v20, v21, v1,  \type
+
+        subs            x6,  x6,  #4
+        b.eq            8f
+
+        ld1             {v20.8h}, [x2], x3
+        ld1             {v21.8h}, [x2], x3
+        ld1             {v22.8h}, [x2], x3
+        ld1             {v23.8h}, [x2], x3
+        convolve8       v2,  v3,  v4,  v5,  v25, v26, v27, v16, v17, v18, v19, v20, v21
+        convolve8       v6,  v7,  v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
+        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v24, v25, v1,  \type
+
+        subs            x6,  x6,  #4
+        b.ne            2b
+
+8:
+        subs            x5,  x5,  #8
+        b.eq            9f
+        // x0 -= h * dst_stride
+        msub            x0,  x1,  x4, x0
+        // x2 -= h * src_stride
+        msub            x2,  x3,  x4, x2
+        // x2 -= 8 * src_stride
+        sub             x2,  x2,  x3, lsl #3
+        // x2 += 1 * src_stride
+        add             x2,  x2,  x3
+        add             x2,  x2,  #16
+        add             x0,  x0,  #16
+        b               1b
+9:
+        ret
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+
+// Instantiate a vertical filter function for filtering a 4 pixels wide
+// slice. This only is designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        ld1             {v0.8h},  [x6]
+.ifc \type,avg
+        mov             x7,  x0
+.endif
+
+        ld1             {v16.4h}, [x2], x3
+        ld1             {v17.4h}, [x2], x3
+        ld1             {v18.4h}, [x2], x3
+        ld1             {v19.4h}, [x2], x3
+        ld1             {v20.4h}, [x2], x3
+        ld1             {v21.4h}, [x2], x3
+        ld1             {v22.4h}, [x2], x3
+        ld1             {v23.4h}, [x2], x3
+        ld1             {v24.4h}, [x2], x3
+        ld1             {v25.4h}, [x2], x3
+        ld1             {v26.4h}, [x2], x3
+
+        convolve4       v2,  v3,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
+        convolve4       v4,  v5,  v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
+        do_store4       v2,  v3,  v4,  v5,  v28, v29, v30, v31, v1,  \type
+
+        subs            x4,  x4,  #4
+        b.eq            9f
+
+        ld1             {v27.4h}, [x2], x3
+        ld1             {v28.4h}, [x2], x3
+        ld1             {v29.4h}, [x2], x3
+        ld1             {v30.4h}, [x2], x3
+
+        convolve4       v2,  v3,  v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
+        convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
+        do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type
+
+9:
+        ret
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+        uxtw            x4,  w4
+        mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
+        add             x6,  x5,  w6, uxtw #4
+        mov             x5,  #\size
+.if \size >= 8
+        b               \type\()_8tap_8v
+.else
+        b               \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp,   2, \size, \bpp
+do_8tap_v_func avg, sharp,   2, \size, \bpp
+do_8tap_v_func put, smooth,  0, \size, \bpp
+do_8tap_v_func avg, smooth,  0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8,  \bpp
+do_8tap_v_filters 4,  \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 584c114..f67624c 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -269,8 +269,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2
         sub             x3,  x3,  #8
 .endif
         // Load the filter vector
-        ld1             {v0.8b},  [x9]
-        sxtl            v0.8h,  v0.8b
+        ld1             {v0.8h},  [x9]
 1:
 .if \size >= 16
         mov             x9,  x5
@@ -384,9 +383,9 @@ do_8tap_h_size 16
 
 .macro do_8tap_h_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
-        movrel          x6,  X(ff_vp9_subpel_filters), 120*\offset - 8
+        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w5,  #8
-        add             x9,  x6,  w5, uxtw #3
+        add             x9,  x6,  w5, uxtw #4
         mov             x5,  #\size
 .if \size >= 16
         b.ge            \type\()_8tap_16h_34
@@ -516,8 +515,7 @@ do_8tap_h_filters 4
 function \type\()_8tap_8v_\idx1\idx2
         sub             x2,  x2,  x3, lsl #1
         sub             x2,  x2,  x3
-        ld1             {v0.8b},  [x6]
-        sxtl            v0.8h,  v0.8b
+        ld1             {v0.8h},  [x6]
 1:
 .ifc \type,avg
         mov             x7,  x0
@@ -590,8 +588,7 @@ do_8tap_8v avg, 4, 3
 function \type\()_8tap_4v_\idx1\idx2
         sub             x2,  x2,  x3, lsl #1
         sub             x2,  x2,  x3
-        ld1             {v0.8b},  [x6]
-        sxtl            v0.8h,  v0.8b
+        ld1             {v0.8h},  [x6]
 .ifc \type,avg
         mov             x7,  x0
 .endif
@@ -660,9 +657,9 @@ do_8tap_4v avg, 4, 3
 .macro do_8tap_v_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
         uxtw            x4,  w4
-        movrel          x5,  X(ff_vp9_subpel_filters), 120*\offset - 8
+        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w6,  #8
-        add             x6,  x5,  w6, uxtw #3
+        add             x6,  x5,  w6, uxtw #4
         mov             x5,  #\size
 .if \size >= 8
         b.ge            \type\()_8tap_8v_34
diff --git a/libavcodec/aasc.c b/libavcodec/aasc.c
index c4800f0..58cc3c8 100644
--- a/libavcodec/aasc.c
+++ b/libavcodec/aasc.c
@@ -2,20 +2,20 @@
  * Autodesk RLE Decoder
  * Copyright (C) 2005 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,15 +36,39 @@ typedef struct AascContext {
     AVCodecContext *avctx;
     GetByteContext gb;
     AVFrame *frame;
+
+    uint32_t palette[AVPALETTE_COUNT];
+    int palette_size;
 } AascContext;
 
 static av_cold int aasc_decode_init(AVCodecContext *avctx)
 {
     AascContext *s = avctx->priv_data;
+    uint8_t *ptr;
+    int i;
 
     s->avctx = avctx;
-
-    avctx->pix_fmt = AV_PIX_FMT_BGR24;
+    switch (avctx->bits_per_coded_sample) {
+    case 8:
+        avctx->pix_fmt = AV_PIX_FMT_PAL8;
+
+        ptr = avctx->extradata;
+        s->palette_size = FFMIN(avctx->extradata_size, AVPALETTE_SIZE);
+        for (i = 0; i < s->palette_size / 4; i++) {
+            s->palette[i] = 0xFFU << 24 | AV_RL32(ptr);
+            ptr += 4;
+        }
+        break;
+    case 16:
+        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
+        break;
+    case 24:
+        avctx->pix_fmt = AV_PIX_FMT_BGR24;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", avctx->bits_per_coded_sample);
+        return -1;
+    }
 
     s->frame = av_frame_alloc();
     if (!s->frame)
@@ -60,27 +84,35 @@ static int aasc_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
     AascContext *s     = avctx->priv_data;
-    int compr, i, stride, ret;
+    int compr, i, stride, psize, ret;
 
-    if (buf_size < 4)
+    if (buf_size < 4) {
+        av_log(avctx, AV_LOG_ERROR, "frame too short\n");
         return AVERROR_INVALIDDATA;
+    }
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     compr     = AV_RL32(buf);
     buf      += 4;
     buf_size -= 4;
+    psize = avctx->bits_per_coded_sample / 8;
+    switch (avctx->codec_tag) {
+    case MKTAG('A', 'A', 'S', '4'):
+        bytestream2_init(&s->gb, buf - 4, buf_size + 4);
+        ff_msrle_decode(avctx, s->frame, 8, &s->gb);
+        break;
+    case MKTAG('A', 'A', 'S', 'C'):
     switch (compr) {
     case 0:
-        stride = (avctx->width * 3 + 3) & ~3;
+        stride = (avctx->width * psize + psize) & ~psize;
         if (buf_size < stride * avctx->height)
             return AVERROR_INVALIDDATA;
         for (i = avctx->height - 1; i >= 0; i--) {
-            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * 3);
+            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
             buf += stride;
+            buf_size -= stride;
         }
         break;
     case 1:
@@ -91,6 +123,14 @@ static int aasc_decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
         return AVERROR_INVALIDDATA;
     }
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown FourCC: %X\n", avctx->codec_tag);
+        return -1;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(s->frame->data[1], s->palette, s->palette_size);
 
     *got_frame = 1;
     if ((ret = av_frame_ref(data, s->frame)) < 0)
diff --git a/libavcodec/ac3.c b/libavcodec/ac3.c
index 1118b52..6d09288 100644
--- a/libavcodec/ac3.c
+++ b/libavcodec/ac3.c
@@ -2,20 +2,20 @@
  * Common code between the AC-3 encoder and decoder
  * Copyright (c) 2000 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,8 +40,6 @@ const uint8_t ff_ac3_band_start_tab[AC3_CRITICAL_BANDS+1] = {
      79,  85, 97, 109, 121, 133, 157, 181, 205, 229, 253
 };
 
-#if CONFIG_HARDCODED_TABLES
-
 /**
  * Map each frequency coefficient bin to the critical band that contains it.
  */
@@ -70,10 +68,6 @@ const uint8_t ff_ac3_bin_to_band_tab[253] = {
     49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49
 };
 
-#else /* CONFIG_HARDCODED_TABLES */
-uint8_t ff_ac3_bin_to_band_tab[253];
-#endif
-
 static inline int calc_lowcomp1(int a, int b0, int b1, int c)
 {
     if ((b0 + 256) == b1) {
@@ -132,6 +126,9 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
     int band_start, band_end, begin, end1;
     int lowcomp, fastleak, slowleak;
 
+    if (end <= 0)
+        return AVERROR_INVALIDDATA;
+
     /* excitation function */
     band_start = ff_ac3_bin_to_band_tab[start];
     band_end   = ff_ac3_bin_to_band_tab[end-1] + 1;
@@ -201,9 +198,9 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
             if (band >= AC3_CRITICAL_BANDS || dba_lengths[seg] > AC3_CRITICAL_BANDS-band)
                 return -1;
             if (dba_values[seg] >= 4) {
-                delta = (dba_values[seg] - 3) << 7;
+                delta = (dba_values[seg] - 3) * 128;
             } else {
-                delta = (dba_values[seg] - 4) << 7;
+                delta = (dba_values[seg] - 4) * 128;
             }
             for (i = 0; i < dba_lengths[seg]; i++) {
                 mask[band++] += delta;
@@ -212,21 +209,3 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
     }
     return 0;
 }
-
-/**
- * Initialize some tables.
- * note: This function must remain thread safe because it is called by the
- *       AVParser init code.
- */
-av_cold void ff_ac3_common_init(void)
-{
-#if !CONFIG_HARDCODED_TABLES
-    /* compute ff_ac3_bin_to_band_tab from ff_ac3_band_start_tab */
-    int bin = 0, band;
-    for (band = 0; band < AC3_CRITICAL_BANDS; band++) {
-        int band_end = ff_ac3_band_start_tab[band+1];
-        while (bin < band_end)
-            ff_ac3_bin_to_band_tab[bin++] = band;
-    }
-#endif /* !CONFIG_HARDCODED_TABLES */
-}
diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h
index f2cb6c3..f8f6a81 100644
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -2,20 +2,20 @@
  * Common code between the AC-3 encoder and decoder
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #define AVCODEC_AC3_H
 
 #define AC3_MAX_CODED_FRAME_SIZE 3840 /* in bytes */
+#define EAC3_MAX_CHANNELS 16          /**< maximum number of channels in EAC3 */
 #define AC3_MAX_CHANNELS 7            /**< maximum number of channels, including coupling channel */
 #define CPL_CH 0                      /**< coupling channel index */
 
@@ -39,6 +40,8 @@
 #define AC3_CRITICAL_BANDS 50
 #define AC3_MAX_CPL_BANDS  18
 
+#include "libavutil/opt.h"
+#include "avcodec.h"
 #include "ac3tab.h"
 
 /* exponent encoding strategy */
@@ -49,11 +52,59 @@
 #define EXP_D25   2
 #define EXP_D45   3
 
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
+#if USE_FIXED
+
+#define FFT_FLOAT 0
+
+#define FIXR(a)                 ((int)((a) * 0 + 0.5))
+#define FIXR12(a)               ((int)((a) * 4096 + 0.5))
+#define FIXR15(a)               ((int)((a) * 32768 + 0.5))
+#define ROUND15(x)              ((x) + 16384) >> 15
+
+#define AC3_RENAME(x)           x ## _fixed
+#define AC3_NORM(norm)          (1<<24)/(norm)
+#define AC3_MUL(a,b)            ((((int64_t) (a)) * (b))>>12)
+#define AC3_RANGE(x)            ((x)|(((x)&128)<<1))
+#define AC3_HEAVY_RANGE(x)      ((x)<<1)
+#define AC3_DYNAMIC_RANGE(x)    (x)
+#define AC3_SPX_BLEND(x)        (x)
+#define AC3_DYNAMIC_RANGE1      0
+
+typedef int                     INTFLOAT;
+typedef int16_t                 SHORTFLOAT;
+
+#else /* USE_FIXED */
+
+#define FIXR(x)                 ((float)(x))
+#define FIXR12(x)               ((float)(x))
+#define FIXR15(x)               ((float)(x))
+#define ROUND15(x)              (x)
+
+#define AC3_RENAME(x)           x
+#define AC3_NORM(norm)          (1.0f/(norm))
+#define AC3_MUL(a,b)            ((a) * (b))
+#define AC3_RANGE(x)            (dynamic_range_tab[(x)])
+#define AC3_HEAVY_RANGE(x)      (ff_ac3_heavy_dynamic_range_tab[(x)])
+#define AC3_DYNAMIC_RANGE(x)    (powf(x,  s->drc_scale))
+#define AC3_SPX_BLEND(x)        (x)* (1.0f/32)
+#define AC3_DYNAMIC_RANGE1      1.0f
+
+typedef float                   INTFLOAT;
+typedef float                   SHORTFLOAT;
+
+#endif /* USE_FIXED */
+
+#define AC3_LEVEL(x)            ROUND15((x) * FIXR15(M_SQRT1_2))
+
 /* pre-defined gain values */
-#define LEVEL_PLUS_3DB          1.4142135623730950
+#define LEVEL_PLUS_3DB          M_SQRT2
 #define LEVEL_PLUS_1POINT5DB    1.1892071150027209
 #define LEVEL_MINUS_1POINT5DB   0.8408964152537145
-#define LEVEL_MINUS_3DB         0.7071067811865476
+#define LEVEL_MINUS_3DB         M_SQRT1_2
 #define LEVEL_MINUS_4POINT5DB   0.5946035575013605
 #define LEVEL_MINUS_6DB         0.5000000000000000
 #define LEVEL_MINUS_9DB         0.3535533905932738
diff --git a/libavcodec/ac3_parser.c b/libavcodec/ac3_parser.c
index 53189e0..1e203ae 100644
--- a/libavcodec/ac3_parser.c
+++ b/libavcodec/ac3_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -137,8 +137,8 @@ int ff_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
         hdr->channel_mode = get_bits(gbc, 3);
         hdr->lfe_on = get_bits1(gbc);
 
-        hdr->bit_rate = (uint32_t)(8.0 * hdr->frame_size * hdr->sample_rate /
-                        (hdr->num_blocks * 256.0));
+        hdr->bit_rate = 8LL * hdr->frame_size * hdr->sample_rate /
+                        (hdr->num_blocks * 256);
         hdr->channels = ff_ac3_channels_tab[hdr->channel_mode] + hdr->lfe_on;
     }
     hdr->channel_layout = avpriv_ac3_channel_layout_tab[hdr->channel_mode];
@@ -148,6 +148,30 @@ int ff_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
     return 0;
 }
 
+// TODO: Better way to pass AC3HeaderInfo fields to mov muxer.
+int avpriv_ac3_parse_header(AC3HeaderInfo **phdr, const uint8_t *buf,
+                            size_t size)
+{
+    GetBitContext gb;
+    AC3HeaderInfo *hdr;
+    int err;
+
+    if (!*phdr)
+        *phdr = av_mallocz(sizeof(AC3HeaderInfo));
+    if (!*phdr)
+        return AVERROR(ENOMEM);
+    hdr = *phdr;
+
+    err = init_get_bits8(&gb, buf, size);
+    if (err < 0)
+        return AVERROR_INVALIDDATA;
+    err = ff_ac3_parse_header(&gb, hdr);
+    if (err < 0)
+        return AVERROR_INVALIDDATA;
+
+    return get_bits_count(&gb);
+}
+
 int av_ac3_parse_header(const uint8_t *buf, size_t size,
                         uint8_t *bitstream_id, uint16_t *frame_size)
 {
@@ -196,8 +220,8 @@ static int ac3_sync(uint64_t state, AACAC3ParseContext *hdr_info,
     else if (hdr_info->codec_id == AV_CODEC_ID_NONE)
         hdr_info->codec_id = AV_CODEC_ID_AC3;
 
-    *need_next_header = (hdr.frame_type != EAC3_FRAME_TYPE_AC3_CONVERT);
     *new_frame_start  = (hdr.frame_type != EAC3_FRAME_TYPE_DEPENDENT);
+    *need_next_header = *new_frame_start || (hdr.frame_type != EAC3_FRAME_TYPE_AC3_CONVERT);
     return hdr.frame_size;
 }
 
@@ -220,6 +244,12 @@ AVCodecParser ff_ac3_parser = {
 
 #else
 
+int avpriv_ac3_parse_header(AC3HeaderInfo **phdr, const uint8_t *buf,
+                            size_t size)
+{
+    return AVERROR(ENOSYS);
+}
+
 int av_ac3_parse_header(const uint8_t *buf, size_t size,
                         uint8_t *bitstream_id, uint16_t *frame_size)
 {
diff --git a/libavcodec/ac3_parser.h b/libavcodec/ac3_parser.h
index f78c461..ff8cc4c 100644
--- a/libavcodec/ac3_parser.h
+++ b/libavcodec/ac3_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3_parser_internal.h b/libavcodec/ac3_parser_internal.h
index 5e305e8..3648802 100644
--- a/libavcodec/ac3_parser_internal.h
+++ b/libavcodec/ac3_parser_internal.h
@@ -1,20 +1,20 @@
 /*
  * AC-3 parser internal code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,4 +36,7 @@
  */
 int ff_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr);
 
+int avpriv_ac3_parse_header(AC3HeaderInfo **hdr, const uint8_t *buf,
+                            size_t size);
+
 #endif /* AVCODEC_AC3_PARSER_INTERNAL_H */
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 4be0f1f..eaa327a 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -7,20 +7,20 @@
  * Copyright (c) 2007-2008 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  * Copyright (c) 2007 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,8 +63,11 @@ static const uint8_t quantization_tab[16] = {
     5, 6, 7, 8, 9, 10, 11, 12, 14, 16
 };
 
+#if (!USE_FIXED)
 /** dynamic range table. converts codes to scale factors. */
 static float dynamic_range_tab[256];
+float ff_ac3_heavy_dynamic_range_tab[256];
+#endif
 
 /** Adjustments in dB gain */
 static const float gain_levels[9] = {
@@ -111,7 +114,7 @@ static const uint8_t ac3_default_coeffs[8][5][2] = {
 static inline int
 symmetric_dequant(int code, int levels)
 {
-    return ((code - (levels >> 1)) << 24) / levels;
+    return ((code - (levels >> 1)) * (1 << 24)) / levels;
 }
 
 /*
@@ -158,12 +161,21 @@ static av_cold void ac3_tables_init(void)
         b5_mantissas[i] = symmetric_dequant(i, 15);
     }
 
+#if (!USE_FIXED)
     /* generate dynamic range table
        reference: Section 7.7.1 Dynamic Range Control */
     for (i = 0; i < 256; i++) {
         int v = (i >> 5) - ((i >> 7) << 3) - 5;
         dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0x1F) | 0x20);
     }
+
+    /* generate compr dynamic range table
+       reference: Section 7.7.2 Heavy Compression */
+    for (i = 0; i < 256; i++) {
+        int v = (i >> 4) - ((i >> 7) << 4) - 4;
+        ff_ac3_heavy_dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0xF) | 0x10);
+    }
+#endif
 }
 
 /**
@@ -176,18 +188,26 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
-    ff_ac3_common_init();
     ac3_tables_init();
     ff_mdct_init(&s->imdct_256, 8, 1, 1.0);
     ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
-    ff_kbd_window_init(s->window, 5.0, 256);
+    AC3_RENAME(ff_kbd_window_init)(s->window, 5.0, 256);
     ff_bswapdsp_init(&s->bdsp);
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+
+#if (USE_FIXED)
+    s->fdsp = avpriv_alloc_fixed_dsp(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#else
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
     ff_fmt_convert_init(&s->fmt_conv, avctx);
+#endif
+
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
     av_lfg_init(&s->dith_state, 0);
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    if (USE_FIXED)
+        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+    else
+        avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
     /* allow downmixing to stereo or mono */
     if (avctx->channels > 1 &&
@@ -219,9 +239,19 @@ static int ac3_parse_header(AC3DecodeContext *s)
     /* read the rest of the bsi. read twice for dual mono mode. */
     i = !s->channel_mode;
     do {
-        skip_bits(gbc, 5); // skip dialog normalization
-        if (get_bits1(gbc))
-            skip_bits(gbc, 8); //skip compression
+        s->dialog_normalization[(!s->channel_mode)-i] = -get_bits(gbc, 5);
+        if (s->dialog_normalization[(!s->channel_mode)-i] == 0) {
+            s->dialog_normalization[(!s->channel_mode)-i] = -31;
+        }
+        if (s->target_level != 0) {
+            s->level_gain[(!s->channel_mode)-i] = powf(2.0f,
+                (float)(s->target_level -
+                s->dialog_normalization[(!s->channel_mode)-i])/6.0f);
+        }
+        if (s->compression_exists[(!s->channel_mode)-i] = get_bits1(gbc)) {
+            s->heavy_dynamic_range[(!s->channel_mode)-i] =
+                AC3_HEAVY_RANGE(get_bits(gbc, 8));
+        }
         if (get_bits1(gbc))
             skip_bits(gbc, 8); //skip language code
         if (get_bits1(gbc))
@@ -287,6 +317,7 @@ static int parse_frame_header(AC3DecodeContext *s)
     s->fbw_channels                 = s->channels - s->lfe_on;
     s->lfe_ch                       = s->fbw_channels + 1;
     s->frame_size                   = hdr.frame_size;
+    s->superframe_size             += hdr.frame_size;
     s->preferred_downmix            = AC3_DMIXMOD_NOTINDICATED;
     s->center_mix_level             = hdr.center_mix_level;
     s->center_mix_level_ltrt        = 4; // -3.0dB
@@ -338,48 +369,53 @@ static int set_downmix_coeffs(AC3DecodeContext *s)
     float cmix = gain_levels[s->  center_mix_level];
     float smix = gain_levels[s->surround_mix_level];
     float norm0, norm1;
+    float downmix_coeffs[2][AC3_MAX_CHANNELS];
 
     if (!s->downmix_coeffs[0]) {
-        s->downmix_coeffs[0] = av_malloc(2 * AC3_MAX_CHANNELS *
-                                         sizeof(**s->downmix_coeffs));
+        s->downmix_coeffs[0] = av_malloc_array(2 * AC3_MAX_CHANNELS,
+                                               sizeof(**s->downmix_coeffs));
         if (!s->downmix_coeffs[0])
             return AVERROR(ENOMEM);
         s->downmix_coeffs[1] = s->downmix_coeffs[0] + AC3_MAX_CHANNELS;
     }
 
     for (i = 0; i < s->fbw_channels; i++) {
-        s->downmix_coeffs[0][i] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]];
-        s->downmix_coeffs[1][i] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]];
+        downmix_coeffs[0][i] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]];
+        downmix_coeffs[1][i] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]];
     }
     if (s->channel_mode > 1 && s->channel_mode & 1) {
-        s->downmix_coeffs[0][1] = s->downmix_coeffs[1][1] = cmix;
+        downmix_coeffs[0][1] = downmix_coeffs[1][1] = cmix;
     }
     if (s->channel_mode == AC3_CHMODE_2F1R || s->channel_mode == AC3_CHMODE_3F1R) {
         int nf = s->channel_mode - 2;
-        s->downmix_coeffs[0][nf] = s->downmix_coeffs[1][nf] = smix * LEVEL_MINUS_3DB;
+        downmix_coeffs[0][nf] = downmix_coeffs[1][nf] = smix * LEVEL_MINUS_3DB;
     }
     if (s->channel_mode == AC3_CHMODE_2F2R || s->channel_mode == AC3_CHMODE_3F2R) {
         int nf = s->channel_mode - 4;
-        s->downmix_coeffs[0][nf] = s->downmix_coeffs[1][nf+1] = smix;
+        downmix_coeffs[0][nf] = downmix_coeffs[1][nf+1] = smix;
     }
 
     /* renormalize */
     norm0 = norm1 = 0.0;
     for (i = 0; i < s->fbw_channels; i++) {
-        norm0 += s->downmix_coeffs[0][i];
-        norm1 += s->downmix_coeffs[1][i];
+        norm0 += downmix_coeffs[0][i];
+        norm1 += downmix_coeffs[1][i];
     }
     norm0 = 1.0f / norm0;
     norm1 = 1.0f / norm1;
     for (i = 0; i < s->fbw_channels; i++) {
-        s->downmix_coeffs[0][i] *= norm0;
-        s->downmix_coeffs[1][i] *= norm1;
+        downmix_coeffs[0][i] *= norm0;
+        downmix_coeffs[1][i] *= norm1;
     }
 
     if (s->output_mode == AC3_CHMODE_MONO) {
         for (i = 0; i < s->fbw_channels; i++)
-            s->downmix_coeffs[0][i] = (s->downmix_coeffs[0][i] +
-                                       s->downmix_coeffs[1][i]) * LEVEL_MINUS_3DB;
+            downmix_coeffs[0][i] = (downmix_coeffs[0][i] +
+                                    downmix_coeffs[1][i]) * LEVEL_MINUS_3DB;
+    }
+    for (i = 0; i < s->fbw_channels; i++) {
+        s->downmix_coeffs[0][i] = FIXR12(downmix_coeffs[0][i]);
+        s->downmix_coeffs[1][i] = FIXR12(downmix_coeffs[1][i]);
     }
 
     return 0;
@@ -389,7 +425,8 @@ static int set_downmix_coeffs(AC3DecodeContext *s)
  * Decode the grouped exponents according to exponent strategy.
  * reference: Section 7.1.3 Exponent Decoding
  */
-static int decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
+static int decode_exponents(AC3DecodeContext *s,
+                            GetBitContext *gbc, int exp_strategy, int ngrps,
                             uint8_t absexp, int8_t *dexps)
 {
     int i, j, grp, group_size;
@@ -400,6 +437,10 @@ static int decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
     group_size = exp_strategy + (exp_strategy == EXP_D45);
     for (grp = 0, i = 0; grp < ngrps; grp++) {
         expacc = get_bits(gbc, 7);
+        if (expacc >= 125) {
+            av_log(s->avctx, AV_LOG_ERROR, "expacc %d is out-of-range\n", expacc);
+            return AVERROR_INVALIDDATA;
+        }
         dexp[i++] = ungroup_3_in_7_bits_tab[expacc][0];
         dexp[i++] = ungroup_3_in_7_bits_tab[expacc][1];
         dexp[i++] = ungroup_3_in_7_bits_tab[expacc][2];
@@ -409,8 +450,10 @@ static int decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
     prevexp = absexp;
     for (i = 0, j = 0; i < ngrps * 3; i++) {
         prevexp += dexp[i] - 2;
-        if (prevexp > 24U)
-            return -1;
+        if (prevexp > 24U) {
+            av_log(s->avctx, AV_LOG_ERROR, "exponent %d is out-of-range\n", prevexp);
+            return AVERROR_INVALIDDATA;
+        }
         switch (group_size) {
         case 4: dexps[j++] = prevexp;
                 dexps[j++] = prevexp;
@@ -439,7 +482,7 @@ static void calc_transform_coeffs_cpl(AC3DecodeContext *s)
                 int cpl_coord = s->cpl_coords[ch][band] << 5;
                 for (bin = band_start; bin < band_end; bin++) {
                     s->fixed_coeffs[ch][bin] =
-                        MULH(s->fixed_coeffs[CPL_CH][bin] << 4, cpl_coord);
+                        MULH(s->fixed_coeffs[CPL_CH][bin] * (1 << 4), cpl_coord);
                 }
                 if (ch == 2 && s->phase_flags[band]) {
                     for (bin = band_start; bin < band_end; bin++)
@@ -485,7 +528,7 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma
         case 0:
             /* random noise with approximate range of -0.707 to 0.707 */
             if (dither)
-                mantissa = (av_lfg_get(&s->dith_state) / 362) - 5932275;
+                mantissa = (((av_lfg_get(&s->dith_state)>>8)*181)>>8) - 5931008;
             else
                 mantissa = 0;
             break;
@@ -532,8 +575,11 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma
             break;
         default: /* 6 to 15 */
             /* Shift mantissa and sign-extend it. */
-            mantissa = get_sbits(gbc, quantization_tab[bap]);
-            mantissa <<= 24 - quantization_tab[bap];
+            if (bap > 15) {
+                av_log(s->avctx, AV_LOG_ERROR, "bap %d is invalid in plain AC-3\n", bap);
+                bap = 15;
+            }
+            mantissa = (unsigned)get_sbits(gbc, quantization_tab[bap]) << (24 - quantization_tab[bap]);
             break;
         }
         coeffs[freq] = mantissa >> exps[freq];
@@ -567,7 +613,7 @@ static inline void decode_transform_coeffs_ch(AC3DecodeContext *s, int blk,
         /* if AHT is used, mantissas for all blocks are encoded in the first
            block of the frame. */
         int bin;
-        if (!blk && CONFIG_EAC3_DECODER)
+        if (CONFIG_EAC3_DECODER && !blk)
             ff_eac3_decode_transform_coeffs_aht_ch(s, ch);
         for (bin = s->start_freq[ch]; bin < s->end_freq[ch]; bin++) {
             s->fixed_coeffs[ch][bin] = s->pre_mantissa[ch][bin][blk] >> s->dexps[ch][bin];
@@ -638,27 +684,37 @@ static void do_rematrixing(AC3DecodeContext *s)
  * Convert frequency domain coefficients to time-domain audio samples.
  * reference: Section 7.9.4 Transformation Equations
  */
-static inline void do_imdct(AC3DecodeContext *s, int channels)
+static inline void do_imdct(AC3DecodeContext *s, int channels, int offset)
 {
     int ch;
 
     for (ch = 1; ch <= channels; ch++) {
         if (s->block_switch[ch]) {
             int i;
-            float *x = s->tmp_output + 128;
+            FFTSample *x = s->tmp_output + 128;
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i];
             s->imdct_256.imdct_half(&s->imdct_256, s->tmp_output, x);
-            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+#if USE_FIXED
+            s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1 + offset],
+                                       s->tmp_output, s->window, 128, 8);
+#else
+            s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1 + offset],
                                        s->tmp_output, s->window, 128);
+#endif
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i + 1];
-            s->imdct_256.imdct_half(&s->imdct_256, s->delay[ch - 1], x);
+            s->imdct_256.imdct_half(&s->imdct_256, s->delay[ch - 1 + offset], x);
         } else {
             s->imdct_512.imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
-            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+#if USE_FIXED
+            s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1 + offset],
+                                       s->tmp_output, s->window, 128, 8);
+#else
+            s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1 + offset],
                                        s->tmp_output, s->window, 128);
-            memcpy(s->delay[ch - 1], s->tmp_output + 128, 128 * sizeof(float));
+#endif
+            memcpy(s->delay[ch - 1 + offset], s->tmp_output + 128, 128 * sizeof(FFTSample));
         }
     }
 }
@@ -706,30 +762,31 @@ static void ac3_upmix_delay(AC3DecodeContext *s)
  * @param[in] default_band_struct default band structure table
  * @param[out] num_bands number of bands (optionally NULL)
  * @param[out] band_sizes array containing the number of bins in each band (optionally NULL)
+ * @param[in,out] band_struct current band structure
  */
 static void decode_band_structure(GetBitContext *gbc, int blk, int eac3,
                                   int ecpl, int start_subband, int end_subband,
                                   const uint8_t *default_band_struct,
-                                  int *num_bands, uint8_t *band_sizes)
+                                  int *num_bands, uint8_t *band_sizes,
+                                  uint8_t *band_struct, int band_struct_size)
 {
     int subbnd, bnd, n_subbands, n_bands=0;
     uint8_t bnd_sz[22];
-    uint8_t coded_band_struct[22];
-    const uint8_t *band_struct;
 
     n_subbands = end_subband - start_subband;
 
+    if (!blk)
+        memcpy(band_struct, default_band_struct, band_struct_size);
+
+    av_assert0(band_struct_size >= start_subband + n_subbands);
+
+    band_struct += start_subband + 1;
+
     /* decode band structure from bitstream or use default */
     if (!eac3 || get_bits1(gbc)) {
         for (subbnd = 0; subbnd < n_subbands - 1; subbnd++) {
-            coded_band_struct[subbnd] = get_bits1(gbc);
+            band_struct[subbnd] = get_bits1(gbc);
         }
-        band_struct = coded_band_struct;
-    } else if (!blk) {
-        band_struct = &default_band_struct[start_subband+1];
-    } else {
-        /* no change in band structure */
-        return;
     }
 
     /* calculate number of bands and band sizes based on band structure.
@@ -778,6 +835,9 @@ static inline int spx_strategy(AC3DecodeContext *s, int blk)
     if (start_subband > 7)
         start_subband += start_subband - 7;
     end_subband    = get_bits(bc, 3) + 5;
+#if USE_FIXED
+    s->spx_dst_end_freq = end_freq_inv_tab[end_subband-5];
+#endif
     if (end_subband   > 7)
         end_subband   += end_subband   - 7;
     dst_start_freq = dst_start_freq * 12 + 25;
@@ -798,14 +858,15 @@ static inline int spx_strategy(AC3DecodeContext *s, int blk)
 
     s->spx_dst_start_freq = dst_start_freq;
     s->spx_src_start_freq = src_start_freq;
-    s->spx_dst_end_freq   = dst_end_freq;
+    if (!USE_FIXED)
+        s->spx_dst_end_freq   = dst_end_freq;
 
     decode_band_structure(bc, blk, s->eac3, 0,
                           start_subband, end_subband,
                           ff_eac3_default_spx_band_struct,
                           &s->num_spx_bands,
-                          s->spx_band_sizes);
-
+                          s->spx_band_sizes,
+                          s->spx_band_struct, sizeof(s->spx_band_struct));
     return 0;
 }
 
@@ -818,26 +879,47 @@ static inline void spx_coordinates(AC3DecodeContext *s)
     for (ch = 1; ch <= fbw_channels; ch++) {
         if (s->channel_uses_spx[ch]) {
             if (s->first_spx_coords[ch] || get_bits1(bc)) {
-                float spx_blend;
+                INTFLOAT spx_blend;
                 int bin, master_spx_coord;
 
                 s->first_spx_coords[ch] = 0;
-                spx_blend        = get_bits(bc, 5) * (1.0f / 32);
+                spx_blend = AC3_SPX_BLEND(get_bits(bc, 5));
                 master_spx_coord = get_bits(bc, 2) * 3;
 
                 bin = s->spx_src_start_freq;
                 for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
-                    int bandsize;
+                    int bandsize = s->spx_band_sizes[bnd];
                     int spx_coord_exp, spx_coord_mant;
-                    float nratio, sblend, nblend, spx_coord;
+                    INTFLOAT nratio, sblend, nblend;
+#if USE_FIXED
+                    /* calculate blending factors */
+                    int64_t accu = ((bin << 23) + (bandsize << 22))
+                                 * (int64_t)s->spx_dst_end_freq;
+                    nratio = (int)(accu >> 32);
+                    nratio -= spx_blend << 18;
+
+                    if (nratio < 0) {
+                        nblend = 0;
+                        sblend = 0x800000;
+                    } else if (nratio > 0x7fffff) {
+                        nblend = 14529495; // sqrt(3) in FP.23
+                        sblend = 0;
+                    } else {
+                        nblend = fixed_sqrt(nratio, 23);
+                        accu = (int64_t)nblend * 1859775393;
+                        nblend = (int)((accu + (1<<29)) >> 30);
+                        sblend = fixed_sqrt(0x800000 - nratio, 23);
+                    }
+#else
+                    float spx_coord;
 
                     /* calculate blending factors */
-                    bandsize = s->spx_band_sizes[bnd];
                     nratio = ((float)((bin + (bandsize >> 1))) / s->spx_dst_end_freq) - spx_blend;
                     nratio = av_clipf(nratio, 0.0f, 1.0f);
                     nblend = sqrtf(3.0f * nratio); // noise is scaled by sqrt(3)
                                                    // to give unity variance
                     sblend = sqrtf(1.0f - nratio);
+#endif
                     bin += bandsize;
 
                     /* decode spx coordinates */
@@ -846,11 +928,18 @@ static inline void spx_coordinates(AC3DecodeContext *s)
                     if (spx_coord_exp == 15) spx_coord_mant <<= 1;
                     else                     spx_coord_mant += 4;
                     spx_coord_mant <<= (25 - spx_coord_exp - master_spx_coord);
-                    spx_coord = spx_coord_mant * (1.0f / (1 << 23));
 
                     /* multiply noise and signal blending factors by spx coordinate */
+#if USE_FIXED
+                    accu = (int64_t)nblend * spx_coord_mant;
+                    s->spx_noise_blend[ch][bnd]  = (int)((accu + (1<<22)) >> 23);
+                    accu = (int64_t)sblend * spx_coord_mant;
+                    s->spx_signal_blend[ch][bnd] = (int)((accu + (1<<22)) >> 23);
+#else
+                    spx_coord = spx_coord_mant * (1.0f / (1 << 23));
                     s->spx_noise_blend [ch][bnd] = nblend * spx_coord;
                     s->spx_signal_blend[ch][bnd] = sblend * spx_coord;
+#endif
                 }
             }
         } else {
@@ -914,7 +1003,8 @@ static inline int coupling_strategy(AC3DecodeContext *s, int blk,
         decode_band_structure(bc, blk, s->eac3, 0, cpl_start_subband,
                               cpl_end_subband,
                               ff_eac3_default_cpl_band_struct,
-                              &s->num_cpl_bands, s->cpl_band_sizes);
+                              &s->num_cpl_bands, s->cpl_band_sizes,
+                              s->cpl_band_struct, sizeof(s->cpl_band_struct));
     } else {
         /* coupling not in use */
         for (ch = 1; ch <= fbw_channels; ch++) {
@@ -974,7 +1064,7 @@ static inline int coupling_coordinates(AC3DecodeContext *s, int blk)
 /**
  * Decode a single audio block from the AC-3 bitstream.
  */
-static int decode_audio_block(AC3DecodeContext *s, int blk)
+static int decode_audio_block(AC3DecodeContext *s, int blk, int offset)
 {
     int fbw_channels = s->fbw_channels;
     int channel_mode = s->channel_mode;
@@ -1008,13 +1098,14 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         if (get_bits1(gbc)) {
             /* Allow asymmetric application of DRC when drc_scale > 1.
                Amplification of quiet sounds is enhanced */
-            float range = dynamic_range_tab[get_bits(gbc, 8)];
-            if (range > 1.0 || s->drc_scale <= 1.0)
-                s->dynamic_range[i] = powf(range, s->drc_scale);
+            int range_bits = get_bits(gbc, 8);
+            INTFLOAT range = AC3_RANGE(range_bits);
+            if (range_bits <= 127 || s->drc_scale <= 1.0)
+                s->dynamic_range[i] = AC3_DYNAMIC_RANGE(range);
             else
                 s->dynamic_range[i] = range;
         } else if (blk == 0) {
-            s->dynamic_range[i] = 1.0f;
+            s->dynamic_range[i] = AC3_DYNAMIC_RANGE1;
         }
     } while (i--);
 
@@ -1024,11 +1115,13 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         if (s->spx_in_use) {
             if ((ret = spx_strategy(s, blk)) < 0)
                 return ret;
-        } else {
-            for (ch = 1; ch <= fbw_channels; ch++) {
-                s->channel_uses_spx[ch] = 0;
-                s->first_spx_coords[ch] = 1;
-            }
+        }
+    }
+    if (!s->eac3 || !s->spx_in_use) {
+        s->spx_in_use = 0;
+        for (ch = 1; ch <= fbw_channels; ch++) {
+            s->channel_uses_spx[ch] = 0;
+            s->first_spx_coords[ch] = 1;
         }
     }
 
@@ -1116,10 +1209,9 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
     for (ch = !cpl_in_use; ch <= s->channels; ch++) {
         if (s->exp_strategy[blk][ch] != EXP_REUSE) {
             s->dexps[ch][0] = get_bits(gbc, 4) << !ch;
-            if (decode_exponents(gbc, s->exp_strategy[blk][ch],
+            if (decode_exponents(s, gbc, s->exp_strategy[blk][ch],
                                  s->num_exp_groups[ch], s->dexps[ch][0],
                                  &s->dexps[ch][s->start_freq[ch]+!!ch])) {
-                av_log(s->avctx, AV_LOG_ERROR, "exponent out-of-range\n");
                 return AVERROR_INVALIDDATA;
             }
             if (ch != CPL_CH && ch != s->lfe_ch)
@@ -1296,18 +1388,28 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
 
     /* apply scaling to coefficients (headroom, dynrng) */
     for (ch = 1; ch <= s->channels; ch++) {
-        float gain = 1.0 / 4194304.0f;
-        if (s->channel_mode == AC3_CHMODE_DUALMONO) {
-            gain *= s->dynamic_range[2 - ch];
-        } else {
-            gain *= s->dynamic_range[0];
-        }
+        int audio_channel = 0;
+        INTFLOAT gain;
+        if (s->channel_mode == AC3_CHMODE_DUALMONO && ch <= 2)
+            audio_channel = 2-ch;
+        if (s->heavy_compression && s->compression_exists[audio_channel])
+            gain = s->heavy_dynamic_range[audio_channel];
+        else
+            gain = s->dynamic_range[audio_channel];
+
+#if USE_FIXED
+        scale_coefs(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+#else
+        if (s->target_level != 0)
+          gain = gain * s->level_gain[audio_channel];
+        gain *= 1.0 / 4194304.0f;
         s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch],
                                                s->fixed_coeffs[ch], gain, 256);
+#endif
     }
 
     /* apply spectral extension to high frequency bins */
-    if (s->spx_in_use && CONFIG_EAC3_DECODER) {
+    if (CONFIG_EAC3_DECODER && s->spx_in_use) {
         ff_eac3_apply_spectral_extension(s);
     }
 
@@ -1325,25 +1427,30 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
             ac3_upmix_delay(s);
         }
 
-        do_imdct(s, s->channels);
+        do_imdct(s, s->channels, offset);
 
         if (downmix_output) {
+#if USE_FIXED
+            ac3_downmix_c_fixed16(s->outptr, s->downmix_coeffs,
+                              s->out_channels, s->fbw_channels, 256);
+#else
             ff_ac3dsp_downmix(&s->ac3dsp, s->outptr, s->downmix_coeffs,
                               s->out_channels, s->fbw_channels, 256);
+#endif
         }
     } else {
         if (downmix_output) {
-            ff_ac3dsp_downmix(&s->ac3dsp, s->xcfptr + 1, s->downmix_coeffs,
-                              s->out_channels, s->fbw_channels, 256);
+            AC3_RENAME(ff_ac3dsp_downmix)(&s->ac3dsp, s->xcfptr + 1, s->downmix_coeffs,
+                                          s->out_channels, s->fbw_channels, 256);
         }
 
         if (downmix_output && !s->downmixed) {
             s->downmixed = 1;
-            ff_ac3dsp_downmix(&s->ac3dsp, s->dlyptr, s->downmix_coeffs,
-                              s->out_channels, s->fbw_channels, 128);
+            AC3_RENAME(ff_ac3dsp_downmix)(&s->ac3dsp, s->dlyptr, s->downmix_coeffs,
+                                          s->out_channels, s->fbw_channels, 128);
         }
 
-        do_imdct(s, s->out_channels);
+        do_imdct(s, s->out_channels, offset);
     }
 
     return 0;
@@ -1357,14 +1464,37 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
 {
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
+    int buf_size, full_buf_size = avpkt->size;
     AC3DecodeContext *s = avctx->priv_data;
-    int blk, ch, err, ret;
+    int blk, ch, err, offset, ret;
+    int i;
+    int skip = 0, got_independent_frame = 0;
     const uint8_t *channel_map;
-    const float *output[AC3_MAX_CHANNELS];
+    uint8_t extended_channel_map[EAC3_MAX_CHANNELS];
+    const SHORTFLOAT *output[AC3_MAX_CHANNELS];
     enum AVMatrixEncoding matrix_encoding;
     AVDownmixInfo *downmix_info;
 
+    s->superframe_size = 0;
+
+    buf_size = full_buf_size;
+    for (i = 1; i < buf_size; i += 2) {
+        if (buf[i] == 0x77 || buf[i] == 0x0B) {
+            if ((buf[i] ^ buf[i-1]) == (0x77 ^ 0x0B)) {
+                i--;
+                break;
+            } else if ((buf[i] ^ buf[i+1]) == (0x77 ^ 0x0B)) {
+                break;
+            }
+        }
+    }
+    if (i >= buf_size)
+        return AVERROR_INVALIDDATA;
+    if (i > 10)
+        return i;
+    buf += i;
+    buf_size -= i;
+
     /* copy input buffer to decoder context to avoid reading past the end
        of the buffer, which can be caused by a damaged input stream. */
     if (buf_size >= 2 && AV_RB16(buf) == 0x770B) {
@@ -1374,9 +1504,18 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
                             (const uint16_t *) buf, cnt);
     } else
         memcpy(s->input_buffer, buf, FFMIN(buf_size, AC3_FRAME_BUFFER_SIZE));
+
+    /* if consistent noise generation is enabled, seed the linear feedback generator
+     * with the contents of the AC-3 frame so that the noise is identical across
+     * decodes given the same AC-3 frame data, for use with non-linear edititing software. */
+    if (s->consistent_noise_generation)
+        av_lfg_init_from_data(&s->dith_state, s->input_buffer, FFMIN(buf_size, AC3_FRAME_BUFFER_SIZE));
+
     buf = s->input_buffer;
+dependent_frame:
     /* initialize the GetBitContext with the start of valid AC-3 Frame */
-    init_get_bits(&s->gbc, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gbc, buf, buf_size)) < 0)
+        return ret;
 
     /* parse the syncinfo */
     err = parse_frame_header(s);
@@ -1397,11 +1536,11 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
             break;
         case AAC_AC3_PARSE_ERROR_FRAME_TYPE:
             /* skip frame if CRC is ok. otherwise use error concealment. */
-            /* TODO: add support for substreams and dependent frames */
-            if (s->frame_type == EAC3_FRAME_TYPE_DEPENDENT || s->substreamid) {
+            /* TODO: add support for substreams */
+            if (s->substreamid) {
                 av_log(avctx, AV_LOG_DEBUG,
-                       "unsupported frame type %d: skipping frame\n",
-                       s->frame_type);
+                       "unsupported substream %d: skipping frame\n",
+                       s->substreamid);
                 *got_frame_ptr = 0;
                 return buf_size;
             } else {
@@ -1420,7 +1559,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         if (s->frame_size > buf_size) {
             av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
             err = AAC_AC3_PARSE_ERROR_FRAME_SIZE;
-        } else if (avctx->err_recognition & AV_EF_CRCCHECK) {
+        } else if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL)) {
             /* check for crc mismatch */
             if (av_crc(av_crc_get_table(AV_CRC_16_ANSI), 0, &buf[2],
                        s->frame_size - 2)) {
@@ -1432,10 +1571,10 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         }
     }
 
-    /* if frame is ok, set audio parameters */
-    if (!err) {
-        avctx->sample_rate = s->sample_rate;
-        avctx->bit_rate    = s->bit_rate;
+    if (s->frame_type == EAC3_FRAME_TYPE_DEPENDENT && !got_independent_frame) {
+        av_log(avctx, AV_LOG_WARNING, "Ignoring dependent frame without independent frame.\n");
+        *got_frame_ptr = 0;
+        return FFMIN(full_buf_size, s->frame_size);
     }
 
     /* channel config */
@@ -1454,6 +1593,10 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
             s->output_mode  = AC3_CHMODE_STEREO;
         }
 
+        s->loro_center_mix_level   = gain_levels[s->  center_mix_level];
+        s->loro_surround_mix_level = gain_levels[s->surround_mix_level];
+        s->ltrt_center_mix_level   = LEVEL_MINUS_3DB;
+        s->ltrt_surround_mix_level = LEVEL_MINUS_3DB;
         /* set downmixing coefficients if needed */
         if (s->channels != s->out_channels && !((s->output_mode & AC3_OUTPUT_LFEON) &&
                 s->fbw_channels == s->out_channels)) {
@@ -1476,39 +1619,147 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
     if (s->bitstream_mode == 0x7 && s->channels > 1)
         avctx->audio_service_type = AV_AUDIO_SERVICE_TYPE_KARAOKE;
 
-    /* get output buffer */
-    frame->nb_samples = s->num_blocks * AC3_BLOCK_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
-
     /* decode the audio blocks */
     channel_map = ff_ac3_dec_channel_map[s->output_mode & ~AC3_OUTPUT_LFEON][s->lfe_on];
+    offset = s->frame_type == EAC3_FRAME_TYPE_DEPENDENT ? AC3_MAX_CHANNELS : 0;
+    for (ch = 0; ch < AC3_MAX_CHANNELS; ch++) {
+        output[ch] = s->output[ch + offset];
+        s->outptr[ch] = s->output[ch + offset];
+    }
     for (ch = 0; ch < s->channels; ch++) {
         if (ch < s->out_channels)
-            s->outptr[channel_map[ch]] = (float *)frame->data[ch];
-        else
-            s->outptr[ch] = s->output[ch];
-        output[ch] = s->output[ch];
+            s->outptr[channel_map[ch]] = s->output_buffer[ch + offset];
     }
     for (blk = 0; blk < s->num_blocks; blk++) {
-        if (!err && decode_audio_block(s, blk)) {
+        if (!err && decode_audio_block(s, blk, offset)) {
             av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
             err = 1;
         }
         if (err)
             for (ch = 0; ch < s->out_channels; ch++)
-                memcpy(s->outptr[channel_map[ch]], output[ch], sizeof(**output) * AC3_BLOCK_SIZE);
+                memcpy(s->output_buffer[ch + offset] + AC3_BLOCK_SIZE*blk, output[ch], AC3_BLOCK_SIZE*sizeof(SHORTFLOAT));
         for (ch = 0; ch < s->out_channels; ch++)
             output[ch] = s->outptr[channel_map[ch]];
-        for (ch = 0; ch < s->out_channels; ch++)
-            s->outptr[ch] += AC3_BLOCK_SIZE;
+        for (ch = 0; ch < s->out_channels; ch++) {
+            if (!ch || channel_map[ch])
+                s->outptr[channel_map[ch]] += AC3_BLOCK_SIZE;
+        }
     }
 
     /* keep last block for error concealment in next frame */
     for (ch = 0; ch < s->out_channels; ch++)
-        memcpy(s->output[ch], output[ch], sizeof(**output) * AC3_BLOCK_SIZE);
+        memcpy(s->output[ch + offset], output[ch], AC3_BLOCK_SIZE*sizeof(SHORTFLOAT));
+
+    /* check if there is dependent frame */
+    if (buf_size > s->frame_size) {
+        AC3HeaderInfo hdr;
+        int err;
+
+        if (buf_size - s->frame_size <= 16) {
+            skip = buf_size - s->frame_size;
+            goto skip;
+        }
+
+        if ((ret = init_get_bits8(&s->gbc, buf + s->frame_size, buf_size - s->frame_size)) < 0)
+            return ret;
+
+        err = ff_ac3_parse_header(&s->gbc, &hdr);
+        if (err)
+            return err;
+
+        if (hdr.frame_type == EAC3_FRAME_TYPE_DEPENDENT) {
+            if (hdr.num_blocks != s->num_blocks || s->sample_rate != hdr.sample_rate) {
+                av_log(avctx, AV_LOG_WARNING, "Ignoring non-compatible dependent frame.\n");
+            } else {
+                buf += s->frame_size;
+                buf_size -= s->frame_size;
+                s->prev_output_mode = s->output_mode;
+                s->prev_bit_rate = s->bit_rate;
+                got_independent_frame = 1;
+                goto dependent_frame;
+            }
+        }
+    }
+skip:
+
+    frame->decode_error_flags = err ? FF_DECODE_ERROR_INVALID_BITSTREAM : 0;
+
+    /* if frame is ok, set audio parameters */
+    if (!err) {
+        avctx->sample_rate = s->sample_rate;
+        avctx->bit_rate    = s->bit_rate + s->prev_bit_rate;
+    }
+
+    for (ch = 0; ch < EAC3_MAX_CHANNELS; ch++)
+        extended_channel_map[ch] = ch;
+
+    if (s->frame_type == EAC3_FRAME_TYPE_DEPENDENT) {
+        uint64_t ich_layout = avpriv_ac3_channel_layout_tab[s->prev_output_mode & ~AC3_OUTPUT_LFEON];
+        int channel_map_size = ff_ac3_channels_tab[s->output_mode & ~AC3_OUTPUT_LFEON] + s->lfe_on;
+        uint64_t channel_layout;
+        int extend = 0;
+
+        if (s->prev_output_mode & AC3_OUTPUT_LFEON)
+            ich_layout |= AV_CH_LOW_FREQUENCY;
+
+        channel_layout = ich_layout;
+        for (ch = 0; ch < 16; ch++) {
+            if (s->channel_map & (1 << (EAC3_MAX_CHANNELS - ch - 1))) {
+                channel_layout |= ff_eac3_custom_channel_map_locations[ch][1];
+            }
+        }
+        if (av_get_channel_layout_nb_channels(channel_layout) > EAC3_MAX_CHANNELS) {
+            av_log(avctx, AV_LOG_ERROR, "Too many channels (%d) coded\n",
+                   av_get_channel_layout_nb_channels(channel_layout));
+            return AVERROR_INVALIDDATA;
+        }
+
+        avctx->channel_layout = channel_layout;
+        avctx->channels = av_get_channel_layout_nb_channels(channel_layout);
+
+        for (ch = 0; ch < EAC3_MAX_CHANNELS; ch++) {
+            if (s->channel_map & (1 << (EAC3_MAX_CHANNELS - ch - 1))) {
+                if (ff_eac3_custom_channel_map_locations[ch][0]) {
+                    int index = av_get_channel_layout_channel_index(channel_layout,
+                                                                    ff_eac3_custom_channel_map_locations[ch][1]);
+                    if (index < 0)
+                        return AVERROR_INVALIDDATA;
+                    if (extend >= channel_map_size)
+                        return AVERROR_INVALIDDATA;
+
+                    extended_channel_map[index] = offset + channel_map[extend++];
+                } else {
+                    int i;
+
+                    for (i = 0; i < 64; i++) {
+                        if ((1ULL << i) & ff_eac3_custom_channel_map_locations[ch][1]) {
+                            int index = av_get_channel_layout_channel_index(channel_layout,
+                                                                            1ULL << i);
+                            if (index < 0)
+                                return AVERROR_INVALIDDATA;
+                            if (extend >= channel_map_size)
+                                return AVERROR_INVALIDDATA;
+
+                            extended_channel_map[index] = offset + channel_map[extend++];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /* get output buffer */
+    frame->nb_samples = s->num_blocks * AC3_BLOCK_SIZE;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (ch = 0; ch < avctx->channels; ch++) {
+        int map = extended_channel_map[ch];
+        av_assert0(ch>=AV_NUM_DATA_POINTERS || frame->extended_data[ch] == frame->data[ch]);
+        memcpy((SHORTFLOAT *)frame->extended_data[ch],
+               s->output_buffer[map],
+               s->num_blocks * AC3_BLOCK_SIZE * sizeof(SHORTFLOAT));
+    }
 
     /*
      * AVMatrixEncoding
@@ -1568,7 +1819,10 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return FFMIN(buf_size, s->frame_size);
+    if (!s->superframe_size)
+        return FFMIN(full_buf_size, s->frame_size + skip);
+
+    return FFMIN(full_buf_size, s->superframe_size + skip);
 }
 
 /**
@@ -1579,6 +1833,7 @@ static av_cold int ac3_decode_end(AVCodecContext *avctx)
     AC3DecodeContext *s = avctx->priv_data;
     ff_mdct_end(&s->imdct_512);
     ff_mdct_end(&s->imdct_256);
+    av_freep(&s->fdsp);
     av_freep(&s->downmix_coeffs[0]);
 
     return 0;
@@ -1586,53 +1841,3 @@ static av_cold int ac3_decode_end(AVCodecContext *avctx)
 
 #define OFFSET(x) offsetof(AC3DecodeContext, x)
 #define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
-static const AVOption options[] = {
-    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
-    { NULL},
-};
-
-static const AVClass ac3_decoder_class = {
-    .class_name = "AC3 decoder",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_ac3_decoder = {
-    .name           = "ac3",
-    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_AC3,
-    .priv_data_size = sizeof (AC3DecodeContext),
-    .init           = ac3_decode_init,
-    .close          = ac3_decode_end,
-    .decode         = ac3_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &ac3_decoder_class,
-};
-
-#if CONFIG_EAC3_DECODER
-static const AVClass eac3_decoder_class = {
-    .class_name = "E-AC3 decoder",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_eac3_decoder = {
-    .name           = "eac3",
-    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_EAC3,
-    .priv_data_size = sizeof (AC3DecodeContext),
-    .init           = ac3_decode_init,
-    .close          = ac3_decode_end,
-    .decode         = ac3_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &eac3_decoder_class,
-};
-#endif
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 4a7e281..ce1434b 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -2,20 +2,20 @@
  * Common code between the AC-3 and E-AC-3 decoders
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,6 +51,7 @@
 #define AVCODEC_AC3DEC_H
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
 #include "libavutil/lfg.h"
 #include "ac3.h"
 #include "ac3dsp.h"
@@ -75,6 +76,7 @@ typedef struct AC3DecodeContext {
 ///@{
     int frame_type;                         ///< frame type                             (strmtyp)
     int substreamid;                        ///< substream identification
+    int superframe_size;                    ///< current superframe size, in bytes
     int frame_size;                         ///< current frame size, in bytes
     int bit_rate;                           ///< stream bit rate, in bits-per-second
     int sample_rate;                        ///< sample frequency, in Hz
@@ -83,7 +85,10 @@ typedef struct AC3DecodeContext {
     int bitstream_mode;                     ///< bitstream mode                         (bsmod)
     int channel_mode;                       ///< channel mode                           (acmod)
     int lfe_on;                             ///< lfe channel in use
-    int channel_map;                        ///< custom channel map
+    int dialog_normalization[2];            ///< dialog level in dBFS                   (dialnorm)
+    int compression_exists[2];              ///< compression field is valid for frame   (compre)
+    int compression_gain[2];                ///< gain to apply for heavy compression    (compr)
+    int channel_map;                        ///< custom channel map                     (chanmap)
     int preferred_downmix;                  ///< Preferred 2-channel downmix mode       (dmixmod)
     int center_mix_level;                   ///< Center mix level index
     int center_mix_level_ltrt;              ///< Center mix level index for Lt/Rt       (ltrtcmixlev)
@@ -99,6 +104,14 @@ typedef struct AC3DecodeContext {
     int dolby_headphone_mode;               ///< dolby headphone mode                   (dheadphonmod)
 ///@}
 
+    int preferred_stereo_downmix;
+    float ltrt_center_mix_level;
+    float ltrt_surround_mix_level;
+    float loro_center_mix_level;
+    float loro_surround_mix_level;
+    int target_level;                       ///< target level in dBFS
+    float level_gain[2];
+
 ///@name Frame syntax parameters
     int snr_offset_strategy;                ///< SNR offset strategy                    (snroffststr)
     int block_switch_syntax;                ///< block switch syntax enabled            (blkswe)
@@ -116,6 +129,7 @@ typedef struct AC3DecodeContext {
     int phase_flags_in_use;                 ///< phase flags in use                     (phsflginu)
     int phase_flags[AC3_MAX_CPL_BANDS];     ///< phase flags                            (phsflg)
     int num_cpl_bands;                      ///< number of coupling bands               (ncplbnd)
+    uint8_t cpl_band_struct[AC3_MAX_CPL_BANDS];
     uint8_t cpl_band_sizes[AC3_MAX_CPL_BANDS]; ///< number of coeffs in each coupling band
     int firstchincpl;                       ///< first channel in coupling
     int first_cpl_coords[AC3_MAX_CHANNELS]; ///< first coupling coordinates states      (firstcplcos)
@@ -132,10 +146,11 @@ typedef struct AC3DecodeContext {
     int spx_dst_start_freq;                     ///< spx starting frequency bin for copying (copystartmant)
                                                 ///< the copy region ends at the start of the spx region.
     int num_spx_bands;                          ///< number of spx bands                    (nspxbnds)
+    uint8_t spx_band_struct[SPX_MAX_BANDS];
     uint8_t spx_band_sizes[SPX_MAX_BANDS];      ///< number of bins in each spx band
     uint8_t first_spx_coords[AC3_MAX_CHANNELS]; ///< first spx coordinates states           (firstspxcos)
-    float spx_noise_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS]; ///< spx noise blending factor  (nblendfact)
-    float spx_signal_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS];///< spx signal blending factor (sblendfact)
+    INTFLOAT spx_noise_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS]; ///< spx noise blending factor  (nblendfact)
+    INTFLOAT spx_signal_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS];///< spx signal blending factor (sblendfact)
 ///@}
 
 ///@name Adaptive hybrid transform
@@ -147,15 +162,19 @@ typedef struct AC3DecodeContext {
     int fbw_channels;                           ///< number of full-bandwidth channels
     int channels;                               ///< number of total channels
     int lfe_ch;                                 ///< index of LFE channel
-    float *downmix_coeffs[2];                   ///< stereo downmix coefficients
+    SHORTFLOAT *downmix_coeffs[2];              ///< stereo downmix coefficients
     int downmixed;                              ///< indicates if coeffs are currently downmixed
     int output_mode;                            ///< output channel configuration
+    int prev_output_mode;                       ///< output channel configuration for previous frame
     int out_channels;                           ///< number of output channels
+    int prev_bit_rate;                          ///< stream bit rate, in bits-per-second for previous frame
 ///@}
 
 ///@name Dynamic range
-    float dynamic_range[2];                 ///< dynamic range
-    float drc_scale;                        ///< percentage of dynamic range compression to be applied
+    INTFLOAT dynamic_range[2];                 ///< dynamic range
+    INTFLOAT drc_scale;                        ///< percentage of dynamic range compression to be applied
+    int heavy_compression;                     ///< apply heavy compression
+    INTFLOAT heavy_dynamic_range[2];           ///< heavy dynamic range compression
 ///@}
 
 ///@name Bandwidth
@@ -163,6 +182,10 @@ typedef struct AC3DecodeContext {
     int end_freq[AC3_MAX_CHANNELS];         ///< end frequency bin                      (endmant)
 ///@}
 
+///@name Consistent noise generation
+    int consistent_noise_generation;        ///< seed noise generation with AC-3 frame on decode
+///@}
+
 ///@name Rematrixing
     int num_rematrixing_bands;              ///< number of rematrixing bands            (nrematbnd)
     int rematrixing_flags[4];               ///< rematrixing flags                      (rematflg)
@@ -203,23 +226,28 @@ typedef struct AC3DecodeContext {
 
 ///@name Optimization
     BswapDSPContext bdsp;
-    AVFloatDSPContext fdsp;
+#if USE_FIXED
+    AVFixedDSPContext *fdsp;
+#else
+    AVFloatDSPContext *fdsp;
+#endif
     AC3DSPContext ac3dsp;
     FmtConvertContext fmt_conv;             ///< optimized conversion functions
 ///@}
 
-    float *outptr[AC3_MAX_CHANNELS];
-    float *xcfptr[AC3_MAX_CHANNELS];
-    float *dlyptr[AC3_MAX_CHANNELS];
+    SHORTFLOAT *outptr[AC3_MAX_CHANNELS];
+    INTFLOAT *xcfptr[AC3_MAX_CHANNELS];
+    INTFLOAT *dlyptr[AC3_MAX_CHANNELS];
 
 ///@name Aligned arrays
-    DECLARE_ALIGNED(16, int32_t, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];     ///< fixed-point transform coefficients
-    DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
-    DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
-    DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
-    DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
-    DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
+    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///< fixed-point transform coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, delay)[EAC3_MAX_CHANNELS][AC3_BLOCK_SIZE];         ///< delay - added to the next block
+    DECLARE_ALIGNED(32, INTFLOAT, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
+    DECLARE_ALIGNED(32, SHORTFLOAT, output)[EAC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
     DECLARE_ALIGNED(32, uint8_t, input_buffer)[AC3_FRAME_BUFFER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; ///< temp buffer to prevent overread
+    DECLARE_ALIGNED(32, SHORTFLOAT, output_buffer)[EAC3_MAX_CHANNELS][AC3_BLOCK_SIZE * 6];  ///< final output buffer
 ///@}
 } AC3DecodeContext;
 
@@ -227,19 +255,23 @@ typedef struct AC3DecodeContext {
  * Parse the E-AC-3 frame header.
  * This parses both the bit stream info and audio frame header.
  */
-int ff_eac3_parse_header(AC3DecodeContext *s);
+static int ff_eac3_parse_header(AC3DecodeContext *s);
 
 /**
  * Decode mantissas in a single channel for the entire frame.
  * This is used when AHT mode is enabled.
  */
-void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch);
+static void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch);
 
 /**
  * Apply spectral extension to each channel by copying lower frequency
  * coefficients to higher frequency bins and applying side information to
  * approximate the original high frequency signal.
  */
-void ff_eac3_apply_spectral_extension(AC3DecodeContext *s);
+static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s);
+
+#if (!USE_FIXED)
+extern float ff_ac3_heavy_dynamic_range_tab[256];
+#endif
 
 #endif /* AVCODEC_AC3DEC_H */
diff --git a/libavcodec/ac3dec_data.c b/libavcodec/ac3dec_data.c
index 272a963..d0a9b1e 100644
--- a/libavcodec/ac3dec_data.c
+++ b/libavcodec/ac3dec_data.c
@@ -2,20 +2,20 @@
  * AC-3 and E-AC-3 decoder tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3dec_data.h b/libavcodec/ac3dec_data.h
index c0a584e..975b52e 100644
--- a/libavcodec/ac3dec_data.h
+++ b/libavcodec/ac3dec_data.h
@@ -2,20 +2,20 @@
  * AC-3 and E-AC-3 decoder tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3dec_fixed.c b/libavcodec/ac3dec_fixed.c
new file mode 100644
index 0000000..bd66175
--- /dev/null
+++ b/libavcodec/ac3dec_fixed.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ *
+ * AC3 fixed-point decoder for MIPS platforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define USE_FIXED 1
+#define FFT_FIXED_32 1
+#include "ac3dec.h"
+
+
+static const int end_freq_inv_tab[8] =
+{
+    50529027, 44278013, 39403370, 32292987, 27356480, 23729101, 20951060, 18755316
+};
+
+static void scale_coefs (
+    int32_t *dst,
+    const int32_t *src,
+    int dynrng,
+    int len)
+{
+    int i, shift;
+    unsigned mul, round;
+    int temp, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    mul = (dynrng & 0x1f) + 0x20;
+    shift = 4 - (sign_extend(dynrng, 9) >> 5);
+    if (shift > 0 ) {
+      round = 1 << (shift-1);
+      for (i=0; i<len; i+=8) {
+
+          temp = src[i] * mul;
+          temp1 = src[i+1] * mul;
+          temp = temp + round;
+          temp2 = src[i+2] * mul;
+
+          temp1 = temp1 + round;
+          dst[i] = temp >> shift;
+          temp3 = src[i+3] * mul;
+          temp2 = temp2 + round;
+
+          dst[i+1] = temp1 >> shift;
+          temp4 = src[i + 4] * mul;
+          temp3 = temp3 + round;
+          dst[i+2] = temp2 >> shift;
+
+          temp5 = src[i+5] * mul;
+          temp4 = temp4 + round;
+          dst[i+3] = temp3 >> shift;
+          temp6 = src[i+6] * mul;
+
+          dst[i+4] = temp4 >> shift;
+          temp5 = temp5 + round;
+          temp7 = src[i+7] * mul;
+          temp6 = temp6 + round;
+
+          dst[i+5] = temp5 >> shift;
+          temp7 = temp7 + round;
+          dst[i+6] = temp6 >> shift;
+          dst[i+7] = temp7 >> shift;
+
+      }
+    } else {
+      shift = -shift;
+      for (i=0; i<len; i+=8) {
+
+          temp = src[i] * mul;
+          temp1 = src[i+1] * mul;
+          temp2 = src[i+2] * mul;
+
+          dst[i] = temp << shift;
+          temp3 = src[i+3] * mul;
+
+          dst[i+1] = temp1 << shift;
+          temp4 = src[i + 4] * mul;
+          dst[i+2] = temp2 << shift;
+
+          temp5 = src[i+5] * mul;
+          dst[i+3] = temp3 << shift;
+          temp6 = src[i+6] * mul;
+
+          dst[i+4] = temp4 << shift;
+          temp7 = src[i+7] * mul;
+
+          dst[i+5] = temp5 << shift;
+          dst[i+6] = temp6 << shift;
+          dst[i+7] = temp7 << shift;
+
+      }
+    }
+}
+
+/**
+ * Downmix samples from original signal to stereo or mono (this is for 16-bit samples
+ * and fixed point decoder - original (for 32-bit samples) is in ac3dsp.c).
+ */
+static void ac3_downmix_c_fixed16(int16_t **samples, int16_t **matrix,
+                                  int out_ch, int in_ch, int len)
+{
+    int i, j;
+    int v0, v1;
+    if (out_ch == 2) {
+        for (i = 0; i < len; i++) {
+            v0 = v1 = 0;
+            for (j = 0; j < in_ch; j++) {
+                v0 += samples[j][i] * matrix[0][j];
+                v1 += samples[j][i] * matrix[1][j];
+            }
+            samples[0][i] = (v0+2048)>>12;
+            samples[1][i] = (v1+2048)>>12;
+        }
+    } else if (out_ch == 1) {
+        for (i = 0; i < len; i++) {
+            v0 = 0;
+            for (j = 0; j < in_ch; j++)
+                v0 += samples[j][i] * matrix[0][j];
+            samples[0][i] = (v0+2048)>>12;
+        }
+    }
+}
+
+#include "eac3dec.c"
+#include "ac3dec.c"
+
+static const AVOption options[] = {
+    { "cons_noisegen", "enable consistent noise generation", OFFSET(consistent_noise_generation), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, PAR },
+    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
+    { "heavy_compr", "enable heavy dynamic range compression", OFFSET(heavy_compression), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, PAR },
+    { NULL},
+};
+
+static const AVClass ac3_decoder_class = {
+    .class_name = "Fixed-Point AC-3 Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ac3_fixed_decoder = {
+    .name           = "ac3_fixed",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &ac3_decoder_class,
+};
diff --git a/libavcodec/ac3dec_float.c b/libavcodec/ac3dec_float.c
new file mode 100644
index 0000000..b85a4ce
--- /dev/null
+++ b/libavcodec/ac3dec_float.c
@@ -0,0 +1,93 @@
+/*
+ * AC-3 Audio Decoder
+ * This code was developed as part of Google Summer of Code 2006.
+ * E-AC-3 support was added as part of Google Summer of Code 2007.
+ *
+ * Copyright (c) 2006 Kartikey Mahendra BHATT (bhattkm at gmail dot com)
+ * Copyright (c) 2007-2008 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
+ * Copyright (c) 2007 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * Upmix delay samples from stereo to original channel layout.
+ */
+#include "ac3dec.h"
+#include "eac3dec.c"
+#include "ac3dec.c"
+
+static const AVOption options[] = {
+    { "cons_noisegen", "enable consistent noise generation", OFFSET(consistent_noise_generation), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, PAR },
+    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
+    { "heavy_compr", "enable heavy dynamic range compression", OFFSET(heavy_compression), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, PAR },
+    { "target_level", "target level in -dBFS (0 not applied)", OFFSET(target_level), AV_OPT_TYPE_INT, {.i64 = 0 }, -31, 0, PAR },
+
+{"dmix_mode", "Preferred Stereo Downmix Mode", OFFSET(preferred_stereo_downmix), AV_OPT_TYPE_INT, {.i64 = -1 }, -1, 2, 0, "dmix_mode"},
+{"ltrt_cmixlev",   "Lt/Rt Center Mix Level",   OFFSET(ltrt_center_mix_level),    AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"ltrt_surmixlev", "Lt/Rt Surround Mix Level", OFFSET(ltrt_surround_mix_level),  AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"loro_cmixlev",   "Lo/Ro Center Mix Level",   OFFSET(loro_center_mix_level),    AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"loro_surmixlev", "Lo/Ro Surround Mix Level", OFFSET(loro_surround_mix_level),  AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+
+    { NULL},
+};
+
+static const AVClass ac3_decoder_class = {
+    .class_name = "AC3 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ac3_decoder = {
+    .name           = "ac3",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &ac3_decoder_class,
+};
+
+#if CONFIG_EAC3_DECODER
+static const AVClass eac3_decoder_class = {
+    .class_name = "E-AC3 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_eac3_decoder = {
+    .name           = "eac3",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_EAC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &eac3_decoder_class,
+};
+#endif
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 440bd1a..43438da 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -2,20 +2,20 @@
  * AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -171,6 +171,48 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
     }
 }
 
+static void ac3_sum_square_butterfly_int32_c(int64_t sum[4],
+                                             const int32_t *coef0,
+                                             const int32_t *coef1,
+                                             int len)
+{
+    int i;
+
+    sum[0] = sum[1] = sum[2] = sum[3] = 0;
+
+    for (i = 0; i < len; i++) {
+        int lt = coef0[i];
+        int rt = coef1[i];
+        int md = lt + rt;
+        int sd = lt - rt;
+        MAC64(sum[0], lt, lt);
+        MAC64(sum[1], rt, rt);
+        MAC64(sum[2], md, md);
+        MAC64(sum[3], sd, sd);
+    }
+}
+
+static void ac3_sum_square_butterfly_float_c(float sum[4],
+                                             const float *coef0,
+                                             const float *coef1,
+                                             int len)
+{
+    int i;
+
+    sum[0] = sum[1] = sum[2] = sum[3] = 0;
+
+    for (i = 0; i < len; i++) {
+        float lt = coef0[i];
+        float rt = coef1[i];
+        float md = lt + rt;
+        float sd = lt - rt;
+        sum[0] += lt * lt;
+        sum[1] += rt * rt;
+        sum[2] += md * md;
+        sum[3] += sd * sd;
+    }
+}
+
 static void ac3_downmix_5_to_2_symmetric_c(float **samples, float **matrix,
                                            int len)
 {
@@ -237,6 +279,101 @@ static void ac3_downmix_c(float **samples, float **matrix,
     }
 }
 
+static void ac3_downmix_5_to_2_symmetric_c_fixed(int32_t **samples, int16_t **matrix,
+                                           int len)
+{
+    int i;
+    int64_t v0, v1;
+    int16_t front_mix    = matrix[0][0];
+    int16_t center_mix   = matrix[0][1];
+    int16_t surround_mix = matrix[0][3];
+
+    for (i = 0; i < len; i++) {
+        v0 = (int64_t)samples[0][i] * front_mix  +
+             (int64_t)samples[1][i] * center_mix +
+             (int64_t)samples[3][i] * surround_mix;
+
+        v1 = (int64_t)samples[1][i] * center_mix +
+             (int64_t)samples[2][i] * front_mix  +
+             (int64_t)samples[4][i] * surround_mix;
+
+        samples[0][i] = (v0+2048)>>12;
+        samples[1][i] = (v1+2048)>>12;
+    }
+}
+
+static void ac3_downmix_5_to_1_symmetric_c_fixed(int32_t **samples, int16_t **matrix,
+                                                 int len)
+{
+    int i;
+    int64_t v0;
+    int16_t front_mix    = matrix[0][0];
+    int16_t center_mix   = matrix[0][1];
+    int16_t surround_mix = matrix[0][3];
+
+    for (i = 0; i < len; i++) {
+        v0 = (int64_t)samples[0][i] * front_mix    +
+             (int64_t)samples[1][i] * center_mix   +
+             (int64_t)samples[2][i] * front_mix    +
+             (int64_t)samples[3][i] * surround_mix +
+             (int64_t)samples[4][i] * surround_mix;
+
+        samples[0][i] = (v0+2048)>>12;
+    }
+}
+
+static void ac3_downmix_c_fixed(int32_t **samples, int16_t **matrix,
+                                int out_ch, int in_ch, int len)
+{
+    int i, j;
+    int64_t v0, v1;
+    if (out_ch == 2) {
+        for (i = 0; i < len; i++) {
+            v0 = v1 = 0;
+            for (j = 0; j < in_ch; j++) {
+                v0 += (int64_t)samples[j][i] * matrix[0][j];
+                v1 += (int64_t)samples[j][i] * matrix[1][j];
+            }
+            samples[0][i] = (v0+2048)>>12;
+            samples[1][i] = (v1+2048)>>12;
+        }
+    } else if (out_ch == 1) {
+        for (i = 0; i < len; i++) {
+            v0 = 0;
+            for (j = 0; j < in_ch; j++)
+                v0 += (int64_t)samples[j][i] * matrix[0][j];
+            samples[0][i] = (v0+2048)>>12;
+        }
+    }
+}
+
+void ff_ac3dsp_downmix_fixed(AC3DSPContext *c, int32_t **samples, int16_t **matrix,
+                             int out_ch, int in_ch, int len)
+{
+    if (c->in_channels != in_ch || c->out_channels != out_ch) {
+        c->in_channels  = in_ch;
+        c->out_channels = out_ch;
+        c->downmix_fixed = NULL;
+
+        if (in_ch == 5 && out_ch == 2 &&
+            !(matrix[1][0] | matrix[0][2]  |
+              matrix[1][3] | matrix[0][4]  |
+             (matrix[0][1] ^ matrix[1][1]) |
+             (matrix[0][0] ^ matrix[1][2]))) {
+            c->downmix_fixed = ac3_downmix_5_to_2_symmetric_c_fixed;
+        } else if (in_ch == 5 && out_ch == 1 &&
+                   matrix[0][0] == matrix[0][2] &&
+                   matrix[0][3] == matrix[0][4]) {
+            c->downmix_fixed = ac3_downmix_5_to_1_symmetric_c_fixed;
+        }
+    }
+
+    if (c->downmix_fixed)
+        c->downmix_fixed(samples, matrix, len);
+    else
+        ac3_downmix_c_fixed(samples, matrix, out_ch, in_ch, len);
+}
+
 static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len)
 {
@@ -293,13 +430,18 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
     c->update_bap_counts = ac3_update_bap_counts_c;
     c->compute_mantissa_size = ac3_compute_mantissa_size_c;
     c->extract_exponents = ac3_extract_exponents_c;
+    c->sum_square_butterfly_int32 = ac3_sum_square_butterfly_int32_c;
+    c->sum_square_butterfly_float = ac3_sum_square_butterfly_float_c;
     c->in_channels           = 0;
     c->out_channels          = 0;
     c->downmix               = NULL;
+    c->downmix_fixed         = NULL;
     c->apply_window_int16 = apply_window_int16_c;
 
     if (ARCH_ARM)
         ff_ac3dsp_init_arm(c, bit_exact);
     if (ARCH_X86)
         ff_ac3dsp_init_x86(c, bit_exact);
+    if (ARCH_MIPS)
+        ff_ac3dsp_init_mips(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index c33a0be..161de4c 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -2,20 +2,20 @@
  * AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,9 +126,16 @@ typedef struct AC3DSPContext {
 
     void (*extract_exponents)(uint8_t *exp, int32_t *coef, int nb_coefs);
 
+    void (*sum_square_butterfly_int32)(int64_t sum[4], const int32_t *coef0,
+                                       const int32_t *coef1, int len);
+
+    void (*sum_square_butterfly_float)(float sum[4], const float *coef0,
+                                       const float *coef1, int len);
+
     int out_channels;
     int in_channels;
     void (*downmix)(float **samples, float **matrix, int len);
+    void (*downmix_fixed)(int32_t **samples, int16_t **matrix, int len);
 
     /**
      * Apply symmetric window in 16-bit fixed-point.
@@ -148,9 +155,13 @@ typedef struct AC3DSPContext {
 void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact);
 
 void ff_ac3dsp_downmix(AC3DSPContext *c, float **samples, float **matrix,
                        int out_ch, int in_ch, int len);
+void ff_ac3dsp_downmix_fixed(AC3DSPContext *c, int32_t **samples, int16_t **matrix,
+                             int out_ch, int in_ch, int len);
+
 void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 48bc2e7..e7e18af 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,13 +36,13 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "internal.h"
 #include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
 #include "ac3.h"
 #include "fft.h"
-#include "internal.h"
 #include "ac3enc.h"
 #include "eac3enc.h"
 
@@ -274,7 +274,7 @@ void ff_ac3_apply_rematrixing(AC3EncodeContext *s)
     int nb_coefs;
     int blk, bnd, i;
     int start, end;
-    uint8_t *flags;
+    uint8_t *flags = NULL;
 
     if (!s->rematrixing_enabled)
         return;
@@ -1183,7 +1183,7 @@ static inline int asym_quant(int c, int e, int qbits)
 {
     int m;
 
-    c = (((c << e) >> (24 - qbits)) + 1) >> 1;
+    c = (((c * (1<<e)) >> (24 - qbits)) + 1) >> 1;
     m = (1 << (qbits-1));
     if (c >= m)
         c = m - 1;
@@ -1211,14 +1211,11 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
     int i;
 
     for (i = start_freq; i < end_freq; i++) {
-        int v;
         int c = fixed_coef[i];
         int e = exp[i];
-        int b = bap[i];
-        switch (b) {
-        case 0:
-            v = 0;
-            break;
+        int v = bap[i];
+        if (v)
+        switch (v) {
         case 1:
             v = sym_quant(c, e, 3);
             switch (s->mant1_cnt) {
@@ -1287,7 +1284,7 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
             v = asym_quant(c, e, 16);
             break;
         default:
-            v = asym_quant(c, e, b - 1);
+            v = asym_quant(c, e, v - 1);
             break;
         }
         qmant[i] = v;
@@ -1387,7 +1384,7 @@ static void ac3_output_frame_header(AC3EncodeContext *s)
  */
 static void output_audio_block(AC3EncodeContext *s, int blk)
 {
-    int ch, i, baie, bnd, got_cpl, ch0;
+    int ch, i, baie, bnd, got_cpl, av_uninit(ch0);
     AC3Block *block = &s->blocks[blk];
 
     /* block switching */
@@ -1676,9 +1673,9 @@ void ff_ac3_output_frame(AC3EncodeContext *s, unsigned char *frame)
 }
 
 
-#ifdef DEBUG
 static void dprint_options(AC3EncodeContext *s)
 {
+#ifdef DEBUG
     AVCodecContext *avctx = s->avctx;
     AC3EncOptions *opt = &s->options;
     char strbuf[32];
@@ -1787,10 +1784,8 @@ static void dprint_options(AC3EncodeContext *s)
             ff_dlog(avctx, "extended bitstream info 2: {not written}\n");
         }
     }
-}
-#else
-#define dprint_options(x) do {} while(0)
 #endif
+}
 
 
 #define FLT_OPTION_THRESHOLD 0.01
@@ -1805,7 +1800,7 @@ static int validate_float_option(float v, const float *v_list, int v_list_size)
             break;
     }
     if (i == v_list_size)
-        return -1;
+        return AVERROR(EINVAL);
 
     return i;
 }
@@ -2025,6 +2020,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
     AC3EncodeContext *s = avctx->priv_data;
 
     av_freep(&s->windowed_samples);
+    if (s->planar_samples)
     for (ch = 0; ch < s->channels; ch++)
         av_freep(&s->planar_samples[ch]);
     av_freep(&s->planar_samples);
@@ -2040,6 +2036,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
     av_freep(&s->qmant_buffer);
     av_freep(&s->cpl_coord_exp_buffer);
     av_freep(&s->cpl_coord_mant_buffer);
+    av_freep(&s->fdsp);
     for (blk = 0; blk < s->num_blocks; blk++) {
         AC3Block *block = &s->blocks[blk];
         av_freep(&block->mdct_coef);
@@ -2156,8 +2153,9 @@ static av_cold int validate_options(AC3EncodeContext *s)
 
     /* validate bit rate */
     if (s->eac3) {
-        int max_br, min_br, wpf, min_br_dist, min_br_code;
+        int max_br, min_br, wpf, min_br_code;
         int num_blks_code, num_blocks, frame_samples;
+        long long min_br_dist;
 
         /* calculate min/max bitrate */
         /* TODO: More testing with 3 and 2 blocks. All E-AC-3 samples I've
@@ -2187,9 +2185,9 @@ static av_cold int validate_options(AC3EncodeContext *s)
            this is needed for lookup tables for bandwidth and coupling
            parameter selection */
         min_br_code = -1;
-        min_br_dist = INT_MAX;
+        min_br_dist = INT64_MAX;
         for (i = 0; i < 19; i++) {
-            int br_dist = abs(ff_ac3_bitrate_tab[i] * 1000 - avctx->bit_rate);
+            long long br_dist = llabs(ff_ac3_bitrate_tab[i] * 1000 - avctx->bit_rate);
             if (br_dist < min_br_dist) {
                 min_br_dist = br_dist;
                 min_br_code = i;
@@ -2202,10 +2200,11 @@ static av_cold int validate_options(AC3EncodeContext *s)
             wpf--;
         s->frame_size_min = 2 * wpf;
     } else {
-        int best_br = 0, best_code = 0, best_diff = INT_MAX;
+        int best_br = 0, best_code = 0;
+        long long best_diff = INT64_MAX;
         for (i = 0; i < 19; i++) {
             int br   = (ff_ac3_bitrate_tab[i] >> s->bit_alloc.sr_shift) * 1000;
-            int diff = abs(br - avctx->bit_rate);
+            long long diff = llabs(br - avctx->bit_rate);
             if (diff < best_diff) {
                 best_br   = br;
                 best_code = i;
@@ -2253,7 +2252,7 @@ static av_cold int validate_options(AC3EncodeContext *s)
  */
 static av_cold void set_bandwidth(AC3EncodeContext *s)
 {
-    int blk, ch, cpl_start;
+    int blk, ch, av_uninit(cpl_start);
 
     if (s->cutoff) {
         /* calculate bandwidth based on user-specified cutoff frequency */
@@ -2332,50 +2331,50 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     if (s->allocate_sample_buffers(s))
         goto alloc_fail;
 
-    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->bap_buffer, total_coefs,
                      sizeof(*s->bap_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->bap1_buffer, total_coefs,
                      sizeof(*s->bap1_buffer), alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs *
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs,
                       sizeof(*s->mdct_coef_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->exp_buffer, total_coefs,
                      sizeof(*s->exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks * 128 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks, 128 *
                      sizeof(*s->grouped_exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->psd_buffer, total_coefs,
                      sizeof(*s->psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks * 64 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks, 64 *
                      sizeof(*s->band_psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, channel_blocks * 64 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->mask_buffer, channel_blocks, 64 *
                      sizeof(*s->mask_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->qmant_buffer, total_coefs,
                      sizeof(*s->qmant_buffer), alloc_fail);
     if (s->cpl_enabled) {
-        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks * 16 *
+        FF_ALLOC_ARRAY_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks, 16 *
                          sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
-        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks * 16 *
+        FF_ALLOC_ARRAY_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks, 16 *
                          sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
     }
     for (blk = 0; blk < s->num_blocks; blk++) {
         AC3Block *block = &s->blocks[blk];
-        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, channels * sizeof(*block->mdct_coef),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->mdct_coef, channels, sizeof(*block->mdct_coef),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->exp, channels * sizeof(*block->exp),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->exp, channels, sizeof(*block->exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, channels * sizeof(*block->grouped_exp),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->grouped_exp, channels, sizeof(*block->grouped_exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->psd, channels * sizeof(*block->psd),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->psd, channels, sizeof(*block->psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, channels * sizeof(*block->band_psd),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->band_psd, channels, sizeof(*block->band_psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->mask, channels * sizeof(*block->mask),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->mask, channels, sizeof(*block->mask),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, channels * sizeof(*block->qmant),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->qmant, channels, sizeof(*block->qmant),
                           alloc_fail);
         if (s->cpl_enabled) {
-            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_exp, channels * sizeof(*block->cpl_coord_exp),
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->cpl_coord_exp, channels, sizeof(*block->cpl_coord_exp),
                               alloc_fail);
-            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_mant, channels * sizeof(*block->cpl_coord_mant),
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->cpl_coord_mant, channels, sizeof(*block->cpl_coord_mant),
                               alloc_fail);
         }
 
@@ -2398,11 +2397,11 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     }
 
     if (!s->fixed_point) {
-        FF_ALLOCZ_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs *
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs,
                           sizeof(*s->fixed_coef_buffer), alloc_fail);
         for (blk = 0; blk < s->num_blocks; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->fixed_coef, channels,
                               sizeof(*block->fixed_coef), alloc_fail);
             for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (s->num_blocks * ch + blk)];
@@ -2410,7 +2409,7 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     } else {
         for (blk = 0; blk < s->num_blocks; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->fixed_coef, channels,
                               sizeof(*block->fixed_coef), alloc_fail);
             for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = (int32_t *)block->mdct_coef[ch];
@@ -2432,8 +2431,6 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
 
     s->eac3 = avctx->codec_id == AV_CODEC_ID_EAC3;
 
-    ff_ac3_common_init();
-
     ret = validate_options(s);
     if (ret)
         return ret;
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index 76b6d7f..a2442d0 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000 Fabrice Bellard
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -165,7 +165,7 @@ typedef struct AC3EncodeContext {
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     PutBitContext pb;                       ///< bitstream writer context
     AudioDSPContext adsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     MECmpContext mecc;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
     FFTContext mdct;                        ///< FFT context for MDCT calculation
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index a4ab5df..b23fc64 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,6 +74,12 @@ static void scale_coefficients(AC3EncodeContext *s)
     }
 }
 
+static void sum_square_butterfly(AC3EncodeContext *s, int64_t sum[4],
+                                 const int32_t *coef0, const int32_t *coef1,
+                                 int len)
+{
+    s->ac3dsp.sum_square_butterfly_int32(sum, coef0, coef1, len);
+}
 
 /*
  * Clip MDCT coefficients to allowable range.
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 95acea7..d6e658b 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,7 +36,6 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_AC3
 #include "ac3enc_opts_template.c"
-
 static const AVClass ac3enc_class = {
     .class_name = "AC-3 Encoder",
     .item_name  = av_default_item_name,
@@ -79,6 +78,13 @@ static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl)
     return FFMIN(coord, COEF_MAX);
 }
 
+static void sum_square_butterfly(AC3EncodeContext *s, float sum[4],
+                                 const float *coef0, const float *coef1,
+                                 int len)
+{
+    s->ac3dsp.sum_square_butterfly_float(sum, coef0, coef1, len);
+}
+
 
 #include "ac3enc_template.c"
 
@@ -109,7 +115,7 @@ av_cold int ff_ac3_float_mdct_init(AC3EncodeContext *s)
     n  = 1 << 9;
     n2 = n >> 1;
 
-    window = av_malloc(n * sizeof(*window));
+    window = av_malloc_array(n, sizeof(*window));
     if (!window) {
         av_log(s->avctx, AV_LOG_ERROR, "Cannot allocate memory.\n");
         return AVERROR(ENOMEM);
@@ -126,7 +132,9 @@ av_cold int ff_ac3_float_mdct_init(AC3EncodeContext *s)
 av_cold int ff_ac3_float_encode_init(AVCodecContext *avctx)
 {
     AC3EncodeContext *s = avctx->priv_data;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
     return ff_ac3_encode_init(avctx);
 }
 
diff --git a/libavcodec/ac3enc_opts_template.c b/libavcodec/ac3enc_opts_template.c
index a08c70d..57b65a7 100644
--- a/libavcodec/ac3enc_opts_template.c
+++ b/libavcodec/ac3enc_opts_template.c
@@ -2,20 +2,20 @@
  * AC-3 encoder options
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 
 static const AVOption ac3_options[] = {
 /* Metadata Options */
-{"per_frame_metadata", "Allow Changing Metadata Per-Frame", OFFSET(allow_per_frame_metadata), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, AC3ENC_PARAM},
+{"per_frame_metadata", "Allow Changing Metadata Per-Frame", OFFSET(allow_per_frame_metadata), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AC3ENC_PARAM},
 #if AC3ENC_TYPE != AC3ENC_TYPE_EAC3
 /* AC-3 downmix levels */
 {"center_mixlev", "Center Mix Level", OFFSET(center_mix_level), AV_OPT_TYPE_FLOAT, {.dbl = LEVEL_MINUS_4POINT5DB }, 0.0, 1.0, AC3ENC_PARAM},
@@ -68,7 +68,7 @@ static const AVOption ac3_options[] = {
     {"standard", "Standard (default)", 0, AV_OPT_TYPE_CONST, {.i64 = AC3ENC_OPT_ADCONV_STANDARD }, INT_MIN, INT_MAX, AC3ENC_PARAM, "ad_conv_type"},
     {"hdcd",     "HDCD",               0, AV_OPT_TYPE_CONST, {.i64 = AC3ENC_OPT_ADCONV_HDCD     }, INT_MIN, INT_MAX, AC3ENC_PARAM, "ad_conv_type"},
 /* Other Encoding Options */
-{"stereo_rematrixing", "Stereo Rematrixing", OFFSET(stereo_rematrixing), AV_OPT_TYPE_INT, {.i64 = AC3ENC_OPT_ON }, AC3ENC_OPT_OFF, AC3ENC_OPT_ON, AC3ENC_PARAM},
+{"stereo_rematrixing", "Stereo Rematrixing", OFFSET(stereo_rematrixing), AV_OPT_TYPE_BOOL, {.i64 = 1 }, 0, 1, AC3ENC_PARAM},
 {"channel_coupling",   "Channel Coupling",   OFFSET(channel_coupling),   AV_OPT_TYPE_INT, {.i64 = AC3ENC_OPT_AUTO }, AC3ENC_OPT_AUTO, AC3ENC_OPT_ON, AC3ENC_PARAM, "channel_coupling"},
     {"auto", "Selected by the Encoder", 0, AV_OPT_TYPE_CONST, {.i64 = AC3ENC_OPT_AUTO }, INT_MIN, INT_MAX, AC3ENC_PARAM, "channel_coupling"},
 {"cpl_start_band", "Coupling Start Band", OFFSET(cpl_start), AV_OPT_TYPE_INT, {.i64 = AC3ENC_OPT_AUTO }, AC3ENC_OPT_AUTO, 15, AC3ENC_PARAM, "cpl_start_band"},
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index ef40b5a..be65987 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2011 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,7 +43,7 @@ int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
 
     FF_ALLOC_OR_GOTO(s->avctx, s->windowed_samples, AC3_WINDOW_SIZE *
                      sizeof(*s->windowed_samples), alloc_fail);
-    FF_ALLOC_OR_GOTO(s->avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
+    FF_ALLOC_ARRAY_OR_GOTO(s->avctx, s->planar_samples, s->channels, sizeof(*s->planar_samples),
                      alloc_fail);
     for (ch = 0; ch < s->channels; ch++) {
         FF_ALLOCZ_OR_GOTO(s->avctx, s->planar_samples[ch],
@@ -59,7 +59,7 @@ alloc_fail:
 
 /*
  * Copy input samples.
- * Channels are reordered from Libav's default order to AC-3 order.
+ * Channels are reordered from FFmpeg's default order to AC-3 order.
  */
 static void copy_input_samples(AC3EncodeContext *s, SampleType **samples)
 {
@@ -94,7 +94,7 @@ static void apply_mdct(AC3EncodeContext *s)
             const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
 
 #if CONFIG_AC3ENC_FLOAT
-            s->fdsp.vector_fmul(s->windowed_samples, input_samples,
+            s->fdsp->vector_fmul(s->windowed_samples, input_samples,
                                 s->mdct_window, AC3_WINDOW_SIZE);
 #else
             s->ac3dsp.apply_window_int16(s->windowed_samples, input_samples,
@@ -122,7 +122,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
 #else
     int32_t (*fixed_cpl_coords)[AC3_MAX_CHANNELS][16] = cpl_coords;
 #endif
-    int blk, ch, bnd, i, j;
+    int av_uninit(blk), ch, bnd, i, j;
     CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
     int cpl_start, num_cpl_coefs;
 
@@ -325,8 +325,8 @@ static void apply_channel_coupling(AC3EncodeContext *s)
 static void compute_rematrixing_strategy(AC3EncodeContext *s)
 {
     int nb_coefs;
-    int blk, bnd, i;
-    AC3Block *block, *block0;
+    int blk, bnd;
+    AC3Block *block, *block0 = NULL;
 
     if (s->channel_mode != AC3_CHMODE_STEREO)
         return;
@@ -350,20 +350,12 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
         }
 
         for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
-            /* calculate calculate sum of squared coeffs for one band in one block */
+            /* calculate sum of squared coeffs for one band in one block */
             int start = ff_ac3_rematrix_band_tab[bnd];
             int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
-            CoefSumType sum[4] = {0,};
-            for (i = start; i < end; i++) {
-                CoefType lt = block->mdct_coef[1][i];
-                CoefType rt = block->mdct_coef[2][i];
-                CoefType md = lt + rt;
-                CoefType sd = lt - rt;
-                MAC_COEF(sum[0], lt, lt);
-                MAC_COEF(sum[1], rt, rt);
-                MAC_COEF(sum[2], md, md);
-                MAC_COEF(sum[3], sd, sd);
-            }
+            CoefSumType sum[4];
+            sum_square_butterfly(s, sum, block->mdct_coef[1] + start,
+                                 block->mdct_coef[2] + start, end - start);
 
             /* compare sums to determine if rematrixing will be used for this band */
             if (FFMIN(sum[2], sum[3]) < FFMIN(sum[0], sum[1]))
@@ -432,10 +424,8 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, AVPacket *avpkt,
 
     ff_ac3_quantize_mantissas(s);
 
-    if ((ret = ff_alloc_packet(avpkt, s->frame_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->frame_size, 0)) < 0)
         return ret;
-    }
     ff_ac3_output_frame(s, avpkt->data);
 
     if (frame->pts != AV_NOPTS_VALUE)
diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c
index 3cd07f9..bd88f32 100644
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -2,20 +2,20 @@
  * AC-3 tables
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -116,7 +116,7 @@ const uint8_t ff_ac3_enc_channel_map[8][2][6] = {
 };
 
 /**
- * Table to remap channels from from AC-3 order to SMPTE order.
+ * Table to remap channels from AC-3 order to SMPTE order.
  * [channel_mode][lfe][ch]
  */
 const uint8_t ff_ac3_dec_channel_map[8][2][6] = {
@@ -314,3 +314,21 @@ const uint16_t ff_eac3_default_chmap[8] = {
     AC3_CHMAP_L |               AC3_CHMAP_R | AC3_CHMAP_L_SUR |                  AC3_CHMAP_R_SUR,
     AC3_CHMAP_L | AC3_CHMAP_C | AC3_CHMAP_R | AC3_CHMAP_L_SUR |                  AC3_CHMAP_R_SUR
 };
+const uint64_t ff_eac3_custom_channel_map_locations[16][2] = {
+    { 1, AV_CH_FRONT_LEFT },
+    { 1, AV_CH_FRONT_CENTER },
+    { 1, AV_CH_FRONT_RIGHT },
+    { 1, AV_CH_SIDE_LEFT },
+    { 1, AV_CH_SIDE_RIGHT },
+    { 0, AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER },
+    { 0, AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT },
+    { 0, AV_CH_BACK_CENTER },
+    { 0, AV_CH_TOP_CENTER },
+    { 0, AV_CH_SURROUND_DIRECT_LEFT | AV_CH_SURROUND_DIRECT_RIGHT },
+    { 0, AV_CH_WIDE_LEFT | AV_CH_WIDE_RIGHT },
+    { 0, AV_CH_TOP_FRONT_LEFT | AV_CH_TOP_FRONT_RIGHT},
+    { 0, AV_CH_TOP_FRONT_CENTER },
+    { 0, AV_CH_TOP_BACK_LEFT | AV_CH_TOP_BACK_RIGHT },
+    { 0, AV_CH_LOW_FREQUENCY_2 },
+    { 1, AV_CH_LOW_FREQUENCY },
+};
diff --git a/libavcodec/ac3tab.h b/libavcodec/ac3tab.h
index 4c0122c..aa71acb 100644
--- a/libavcodec/ac3tab.h
+++ b/libavcodec/ac3tab.h
@@ -2,20 +2,20 @@
  * AC-3 tables
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,6 @@
 #include "ac3.h"
 #include "internal.h"
 
-#if CONFIG_HARDCODED_TABLES
-#   define HCONST const
-#else
-#   define HCONST
-#endif
-
 extern const uint16_t ff_ac3_frame_size_tab[38][3];
 extern const uint8_t  ff_ac3_channels_tab[8];
 extern av_export_avcodec const uint16_t avpriv_ac3_channel_layout_tab[8];
@@ -55,7 +49,9 @@ extern const int16_t  ff_ac3_floor_tab[8];
 extern const uint16_t ff_ac3_fast_gain_tab[8];
 extern const uint16_t ff_eac3_default_chmap[8];
 extern const uint8_t  ff_ac3_band_start_tab[AC3_CRITICAL_BANDS+1];
-extern HCONST uint8_t ff_ac3_bin_to_band_tab[253];
+extern const uint8_t  ff_ac3_bin_to_band_tab[253];
+extern const uint64_t ff_eac3_custom_channel_map_locations[16][2];
+
 
 /** Custom channel map locations bitmask
  *  Other channels described in documentation:
diff --git a/libavcodec/acelp_filters.c b/libavcodec/acelp_filters.c
index 93bec65..35aa863 100644
--- a/libavcodec/acelp_filters.c
+++ b/libavcodec/acelp_filters.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "avcodec.h"
 #include "acelp_filters.h"
@@ -46,7 +47,7 @@ void ff_acelp_interpolate(int16_t* out, const int16_t* in,
 {
     int n, i;
 
-    assert(frac_pos >= 0 && frac_pos < precision);
+    av_assert1(frac_pos >= 0 && frac_pos < precision);
 
     for (n = 0; n < length; n++) {
         int idx = 0;
@@ -69,7 +70,7 @@ void ff_acelp_interpolate(int16_t* out, const int16_t* in,
             v += in[n - i] * filter_coeffs[idx - frac_pos];
         }
         if (av_clip_int16(v >> 15) != (v >> 15))
-            av_log(NULL, AV_LOG_WARNING, "overflow that would need cliping in ff_acelp_interpolate()\n");
+            av_log(NULL, AV_LOG_WARNING, "overflow that would need clipping in ff_acelp_interpolate()\n");
         out[n] = v >> 15;
     }
 }
@@ -143,3 +144,12 @@ void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
     samples[0] -= tilt * *mem;
     *mem = new_tilt_mem;
 }
+
+void ff_acelp_filter_init(ACELPFContext *c)
+{
+    c->acelp_interpolatef                      = ff_acelp_interpolatef;
+    c->acelp_apply_order_2_transfer_function   = ff_acelp_apply_order_2_transfer_function;
+
+    if(HAVE_MIPSFPU)
+        ff_acelp_filter_init_mips(c);
+}
diff --git a/libavcodec/acelp_filters.h b/libavcodec/acelp_filters.h
index 2be4c24..fe86cb2 100644
--- a/libavcodec/acelp_filters.h
+++ b/libavcodec/acelp_filters.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,39 @@
 
 #include <stdint.h>
 
+typedef struct ACELPFContext {
+    /**
+    * Floating point version of ff_acelp_interpolate()
+    */
+    void (*acelp_interpolatef)(float *out, const float *in,
+                            const float *filter_coeffs, int precision,
+                            int frac_pos, int filter_length, int length);
+
+    /**
+     * Apply an order 2 rational transfer function in-place.
+     *
+     * @param out output buffer for filtered speech samples
+     * @param in input buffer containing speech data (may be the same as out)
+     * @param zero_coeffs z^-1 and z^-2 coefficients of the numerator
+     * @param pole_coeffs z^-1 and z^-2 coefficients of the denominator
+     * @param gain scale factor for final output
+     * @param mem intermediate values used by filter (should be 0 initially)
+     * @param n number of samples (should be a multiple of eight)
+     */
+    void (*acelp_apply_order_2_transfer_function)(float *out, const float *in,
+                                                  const float zero_coeffs[2],
+                                                  const float pole_coeffs[2],
+                                                  float gain,
+                                                  float mem[2], int n);
+
+}ACELPFContext;
+
+/**
+ * Initialize ACELPFContext.
+ */
+void ff_acelp_filter_init(ACELPFContext *c);
+void ff_acelp_filter_init_mips(ACELPFContext *c);
+
 /**
  * low-pass Finite Impulse Response filter coefficients.
  *
@@ -75,7 +108,7 @@ void ff_acelp_interpolatef(float *out, const float *in,
  *
  * The filter has a cut-off frequency of 1/80 of the sampling freq
  *
- * @note Two items before the top of the out buffer must contain two items from the
+ * @note Two items before the top of the in buffer must contain two items from the
  *       tail of the previous subframe.
  *
  * @remark It is safe to pass the same array in in and out parameters.
diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index 1965772..a070d1b 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -3,24 +3,25 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
@@ -107,9 +108,20 @@ int16_t ff_acelp_decode_gain_code(
     for(i=0; i<ma_pred_order; i++)
         mr_energy += quant_energy[i] * ma_prediction_coeff[i];
 
-    mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
+#ifdef G729_BITEXACT
+    mr_energy += (((-6165LL * ff_log2(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0))) >> 3) & ~0x3ff);
+
+    mr_energy = (5439 * (mr_energy >> 15)) >> 8;           // (0.15) = (0.15) * (7.23)
+
+    return bidir_sal(
+               ((ff_exp2(mr_energy & 0x7fff) + 16) >> 5) * (gain_corr_factor >> 1),
+               (mr_energy >> 15) - 25
+           );
+#else
+    mr_energy = gain_corr_factor * ff_exp10((double)mr_energy / (20 << 23)) /
                 sqrt(adsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
     return mr_energy >> 12;
+#endif
 }
 
 float ff_amr_set_fixed_gain(float fixed_gain_factor, float fixed_mean_energy,
@@ -120,10 +132,10 @@ float ff_amr_set_fixed_gain(float fixed_gain_factor, float fixed_mean_energy,
     // ^g_c = ^gamma_gc * 100.05 (predicted dB + mean dB - dB of fixed vector)
     // Note 10^(0.05 * -10log(average x2)) = 1/sqrt((average x2)).
     float val = fixed_gain_factor *
-        exp2f(M_LOG2_10 * 0.05 *
+        ff_exp10(0.05 *
               (avpriv_scalarproduct_float_c(pred_table, prediction_error, 4) +
                energy_mean)) /
-        sqrtf(fixed_mean_energy);
+        sqrtf(fixed_mean_energy ? fixed_mean_energy : 1.0);
 
     // update quantified prediction error energy history
     memmove(&prediction_error[0], &prediction_error[1],
diff --git a/libavcodec/acelp_pitch_delay.h b/libavcodec/acelp_pitch_delay.h
index 7b5b33d..2aade2f 100644
--- a/libavcodec/acelp_pitch_delay.h
+++ b/libavcodec/acelp_pitch_delay.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/acelp_vectors.c b/libavcodec/acelp_vectors.c
index 0c660ac..798217d 100644
--- a/libavcodec/acelp_vectors.c
+++ b/libavcodec/acelp_vectors.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
@@ -50,6 +51,26 @@ const uint8_t ff_fc_2pulses_9bits_track1_gray[16] =
   28, 26,
 };
 
+const uint8_t ff_fc_2pulses_9bits_track2_gray[32] =
+{
+  0,  2,
+  5,  4,
+  12, 10,
+  7,  9,
+  25, 24,
+  20, 22,
+  14, 15,
+  19, 17,
+  36, 31,
+  21, 26,
+  1,  6,
+  16, 11,
+  27, 29,
+  32, 30,
+  39, 37,
+  34, 35,
+};
+
 const uint8_t ff_fc_4pulses_8bits_tracks_13[16] =
 {
   0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75,
@@ -219,11 +240,13 @@ void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
         int x   = in->x[i], repeats = !((in->no_repeat_mask >> i) & 1);
         float y = in->y[i] * scale;
 
-        do {
-            out[x] += y;
-            y *= in->pitch_fac;
-            x += in->pitch_lag;
-        } while (x < size && repeats);
+        if (in->pitch_lag > 0)
+            av_assert0(x < size);
+            do {
+                out[x] += y;
+                y *= in->pitch_fac;
+                x += in->pitch_lag;
+            } while (x < size && repeats);
     }
 }
 
@@ -234,9 +257,18 @@ void ff_clear_fixed_vector(float *out, const AMRFixed *in, int size)
     for (i=0; i < in->n; i++) {
         int x  = in->x[i], repeats = !((in->no_repeat_mask >> i) & 1);
 
-        do {
-            out[x] = 0.0;
-            x += in->pitch_lag;
-        } while (x < size && repeats);
+        if (in->pitch_lag > 0)
+            do {
+                out[x] = 0.0;
+                x += in->pitch_lag;
+            } while (x < size && repeats);
     }
 }
+
+void ff_acelp_vectors_init(ACELPVContext *c)
+{
+    c->weighted_vector_sumf   = ff_weighted_vector_sumf;
+
+    if(HAVE_MIPSFPU)
+        ff_acelp_vectors_init_mips(c);
+}
diff --git a/libavcodec/acelp_vectors.h b/libavcodec/acelp_vectors.h
index d6226bf..fae834d 100644
--- a/libavcodec/acelp_vectors.h
+++ b/libavcodec/acelp_vectors.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,30 @@
 
 #include <stdint.h>
 
+typedef struct ACELPVContext {
+    /**
+     * float implementation of weighted sum of two vectors.
+     * @param[out] out result of addition
+     * @param in_a first vector
+     * @param in_b second vector
+     * @param weight_coeff_a first vector weight coefficient
+     * @param weight_coeff_a second vector weight coefficient
+     * @param length vectors length (should be a multiple of two)
+     *
+     * @note It is safe to pass the same buffer for out and in_a or in_b.
+     */
+    void (*weighted_vector_sumf)(float *out, const float *in_a, const float *in_b,
+                                 float weight_coeff_a, float weight_coeff_b,
+                                 int length);
+
+}ACELPVContext;
+
+/**
+ * Initialize ACELPVContext.
+ */
+void ff_acelp_vectors_init(ACELPVContext *c);
+void ff_acelp_vectors_init_mips(ACELPVContext *c);
+
 /** Sparse representation for the algebraic codebook (fixed) vector */
 typedef struct AMRFixed {
     int      n;
@@ -82,6 +106,37 @@ extern const uint8_t ff_fc_2pulses_9bits_track1[16];
 extern const uint8_t ff_fc_2pulses_9bits_track1_gray[16];
 
 /**
+ * Track|Pulse|        Positions
+ * -----------------------------------------
+ *  2   | 1   | 0, 7, 14, 20, 27, 34,  1, 21
+ *      |     | 2, 9, 15, 22, 29, 35,  6, 26
+ *      |     | 4,10, 17, 24, 30, 37, 11, 31
+ *      |     | 5,12, 19, 25, 32, 39, 16, 36
+ * -----------------------------------------
+ *
+ * @remark Track in the table should be read top-to-bottom, left-to-right.
+ *
+ * @note (EE.1) This table (from the reference code) does not comply with
+ *              the specification.
+ *              The specification contains the following table:
+ *
+ * Track|Pulse|        Positions
+ * -----------------------------------------
+ *  2   | 1   | 0, 5, 10, 15, 20, 25, 30, 35
+ *      |     | 1, 6, 11, 16, 21, 26, 31, 36
+ *      |     | 2, 7, 12, 17, 22, 27, 32, 37
+ *      |     | 4, 9, 14, 19, 24, 29, 34, 39
+ *
+ * -----------------------------------------
+ *
+ * @note (EE.2) Reference G.729D code also uses gray decoding for each
+ *              pulse index before looking up the value in the table.
+ *
+ * Used in G.729 @@6.4k (with gray coding)
+ */
+extern const uint8_t ff_fc_2pulses_9bits_track2_gray[32];
+
+/**
  * b60 hamming windowed sinc function coefficients
  */
 extern const float ff_b60_sinc[61];
diff --git a/libavcodec/adpcm.c b/libavcodec/adpcm.c
index fe51c0d..aa9c7c5 100644
--- a/libavcodec/adpcm.c
+++ b/libavcodec/adpcm.c
@@ -13,26 +13,24 @@
  * MAXIS EA ADPCM decoder by Robert Marston (rmarston@gmail.com)
  * THP ADPCM decoder by Marco Gerards (mgerards@xs4all.nl)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
-
 #include "avcodec.h"
-#include "bitstream.h"
-#include "put_bits.h"
+#include "get_bits.h"
 #include "bytestream.h"
 #include "adpcm.h"
 #include "adpcm_data.h"
@@ -86,8 +84,9 @@ static const int swf_index_tables[4][16] = {
 /* end of tables */
 
 typedef struct ADPCMDecodeContext {
-    ADPCMChannelStatus status[6];
+    ADPCMChannelStatus status[14];
     int vqa_version;                /**< VQA version. Used for ADPCM_IMA_WS */
+    int has_status;
 } ADPCMDecodeContext;
 
 static av_cold int adpcm_decode_init(AVCodecContext * avctx)
@@ -97,15 +96,29 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     unsigned int max_channels = 2;
 
     switch(avctx->codec->id) {
+    case AV_CODEC_ID_ADPCM_DTK:
     case AV_CODEC_ID_ADPCM_EA:
         min_channels = 2;
         break;
+    case AV_CODEC_ID_ADPCM_AFC:
     case AV_CODEC_ID_ADPCM_EA_R1:
     case AV_CODEC_ID_ADPCM_EA_R2:
     case AV_CODEC_ID_ADPCM_EA_R3:
     case AV_CODEC_ID_ADPCM_EA_XAS:
         max_channels = 6;
         break;
+    case AV_CODEC_ID_ADPCM_MTAF:
+        min_channels = 2;
+        max_channels = 8;
+        break;
+    case AV_CODEC_ID_ADPCM_PSX:
+        max_channels = 8;
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_DAT4:
+    case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
+        max_channels = 14;
+        break;
     }
     if (avctx->channels < min_channels || avctx->channels > max_channels) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
@@ -117,10 +130,8 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
         c->status[0].step = c->status[1].step = 511;
         break;
     case AV_CODEC_ID_ADPCM_IMA_WAV:
-        if (avctx->bits_per_coded_sample != 4) {
-            av_log(avctx, AV_LOG_ERROR, "Only 4-bit ADPCM IMA WAV files are supported\n");
-            return -1;
-        }
+        if (avctx->bits_per_coded_sample < 2 || avctx->bits_per_coded_sample > 5)
+            return AVERROR_INVALIDDATA;
         break;
     case AV_CODEC_ID_ADPCM_IMA_APC:
         if (avctx->extradata && avctx->extradata_size >= 8) {
@@ -137,6 +148,8 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     }
 
     switch(avctx->codec->id) {
+        case AV_CODEC_ID_ADPCM_AICA:
+        case AV_CODEC_ID_ADPCM_IMA_DAT4:
         case AV_CODEC_ID_ADPCM_IMA_QT:
         case AV_CODEC_ID_ADPCM_IMA_WAV:
         case AV_CODEC_ID_ADPCM_4XM:
@@ -146,6 +159,11 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
         case AV_CODEC_ID_ADPCM_EA_R3:
         case AV_CODEC_ID_ADPCM_EA_XAS:
         case AV_CODEC_ID_ADPCM_THP:
+        case AV_CODEC_ID_ADPCM_THP_LE:
+        case AV_CODEC_ID_ADPCM_AFC:
+        case AV_CODEC_ID_ADPCM_DTK:
+        case AV_CODEC_ID_ADPCM_PSX:
+        case AV_CODEC_ID_ADPCM_MTAF:
             avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
             break;
         case AV_CODEC_ID_ADPCM_IMA_WS:
@@ -159,7 +177,7 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
-static inline short adpcm_ima_expand_nibble(ADPCMChannelStatus *c, char nibble, int shift)
+static inline int16_t adpcm_ima_expand_nibble(ADPCMChannelStatus *c, int8_t nibble, int shift)
 {
     int step_index;
     int predictor;
@@ -182,7 +200,30 @@ static inline short adpcm_ima_expand_nibble(ADPCMChannelStatus *c, char nibble,
     c->predictor = av_clip_int16(predictor);
     c->step_index = step_index;
 
-    return (short)c->predictor;
+    return (int16_t)c->predictor;
+}
+
+static inline int16_t adpcm_ima_wav_expand_nibble(ADPCMChannelStatus *c, GetBitContext *gb, int bps)
+{
+    int nibble, step_index, predictor, sign, delta, diff, step, shift;
+
+    shift = bps - 1;
+    nibble = get_bits_le(gb, bps),
+    step = ff_adpcm_step_table[c->step_index];
+    step_index = c->step_index + ff_adpcm_index_tables[bps - 2][nibble];
+    step_index = av_clip(step_index, 0, 88);
+
+    sign = nibble & (1 << shift);
+    delta = av_mod_uintp2(nibble, shift);
+    diff = ((2 * delta + 1) * step) >> shift;
+    predictor = c->predictor;
+    if (sign) predictor -= diff;
+    else predictor += diff;
+
+    c->predictor = av_clip_int16(predictor);
+    c->step_index = step_index;
+
+    return (int16_t)c->predictor;
 }
 
 static inline int adpcm_ima_qt_expand_nibble(ADPCMChannelStatus *c, int nibble, int shift)
@@ -211,7 +252,7 @@ static inline int adpcm_ima_qt_expand_nibble(ADPCMChannelStatus *c, int nibble,
     return c->predictor;
 }
 
-static inline short adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
+static inline int16_t adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
 {
     int predictor;
 
@@ -222,11 +263,36 @@ static inline short adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
     c->sample1 = av_clip_int16(predictor);
     c->idelta = (ff_adpcm_AdaptationTable[(int)nibble] * c->idelta) >> 8;
     if (c->idelta < 16) c->idelta = 16;
+    if (c->idelta > INT_MAX/768) {
+        av_log(NULL, AV_LOG_WARNING, "idelta overflow\n");
+        c->idelta = INT_MAX/768;
+    }
 
     return c->sample1;
 }
 
-static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
+static inline int16_t adpcm_ima_oki_expand_nibble(ADPCMChannelStatus *c, int nibble)
+{
+    int step_index, predictor, sign, delta, diff, step;
+
+    step = ff_adpcm_oki_step_table[c->step_index];
+    step_index = c->step_index + ff_adpcm_index_table[(unsigned)nibble];
+    step_index = av_clip(step_index, 0, 48);
+
+    sign = nibble & 8;
+    delta = nibble & 7;
+    diff = ((2 * delta + 1) * step) >> 3;
+    predictor = c->predictor;
+    if (sign) predictor -= diff;
+    else predictor += diff;
+
+    c->predictor = av_clip_intp2(predictor, 11);
+    c->step_index = step_index;
+
+    return c->predictor << 4;
+}
+
+static inline int16_t adpcm_ct_expand_nibble(ADPCMChannelStatus *c, int8_t nibble)
 {
     int sign, delta, diff;
     int new_step;
@@ -244,10 +310,10 @@ static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
     new_step = (ff_adpcm_AdaptationTable[nibble & 7] * c->step) >> 8;
     c->step = av_clip(new_step, 511, 32767);
 
-    return (short)c->predictor;
+    return (int16_t)c->predictor;
 }
 
-static inline short adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, char nibble, int size, int shift)
+static inline int16_t adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, int8_t nibble, int size, int shift)
 {
     int sign, delta, diff;
 
@@ -264,10 +330,10 @@ static inline short adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, char nibble
     else if (delta == 0 && c->step > 0)
         c->step--;
 
-    return (short) c->predictor;
+    return (int16_t) c->predictor;
 }
 
-static inline short adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, unsigned char nibble)
+static inline int16_t adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, uint8_t nibble)
 {
     if(!c->step) {
         c->predictor = 0;
@@ -277,7 +343,16 @@ static inline short adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, unsigned c
     c->predictor += (c->step * ff_adpcm_yamaha_difflookup[nibble]) / 8;
     c->predictor = av_clip_int16(c->predictor);
     c->step = (c->step * ff_adpcm_yamaha_indexscale[nibble]) >> 8;
-    c->step = av_clip(c->step, 127, 24567);
+    c->step = av_clip(c->step, 127, 24576);
+    return c->predictor;
+}
+
+static inline int16_t adpcm_mtaf_expand_nibble(ADPCMChannelStatus *c, uint8_t nibble)
+{
+    c->predictor += ff_adpcm_mtaf_stepsize[c->step][nibble];
+    c->predictor = av_clip_int16(c->predictor);
+    c->step += ff_adpcm_index_table[nibble];
+    c->step = av_clip_uintp2(c->step, 5);
     return c->predictor;
 }
 
@@ -299,11 +374,9 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
     for(i=0;i<4;i++) {
         shift  = 12 - (in[4+i*2] & 15);
         filter = in[4+i*2] >> 4;
-        if (filter > 4) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid XA-ADPCM filter %d (max. allowed is 4)\n",
-                   filter);
-            return AVERROR_INVALIDDATA;
+        if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table)) {
+            avpriv_request_sample(avctx, "unknown XA-ADPCM filter %d", filter);
+            filter=0;
         }
         f0 = xa_adpcm_table[filter][0];
         f1 = xa_adpcm_table[filter][1];
@@ -330,12 +403,11 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
 
         shift  = 12 - (in[5+i*2] & 15);
         filter = in[5+i*2] >> 4;
-        if (filter > 4) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid XA-ADPCM filter %d (max. allowed is 4)\n",
-                   filter);
-            return AVERROR_INVALIDDATA;
+        if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table)) {
+            avpriv_request_sample(avctx, "unknown XA-ADPCM filter %d", filter);
+            filter=0;
         }
+
         f0 = xa_adpcm_table[filter][0];
         f1 = xa_adpcm_table[filter][1];
 
@@ -367,35 +439,34 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
 static void adpcm_swf_decode(AVCodecContext *avctx, const uint8_t *buf, int buf_size, int16_t *samples)
 {
     ADPCMDecodeContext *c = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
     const int *table;
     int k0, signmask, nb_bits, count;
     int size = buf_size*8;
     int i;
 
-    bitstream_init(&bc, buf, size);
+    init_get_bits(&gb, buf, size);
 
     //read bits & initial values
-    nb_bits = bitstream_read(&bc, 2)+2;
+    nb_bits = get_bits(&gb, 2)+2;
     table = swf_index_tables[nb_bits-2];
     k0 = 1 << (nb_bits-2);
     signmask = 1 << (nb_bits-1);
 
-    while (bitstream_tell(&bc) <= size - 22 * avctx->channels) {
+    while (get_bits_count(&gb) <= size - 22*avctx->channels) {
         for (i = 0; i < avctx->channels; i++) {
-            *samples++              =
-            c->status[i].predictor  = bitstream_read_signed(&bc, 16);
-            c->status[i].step_index = bitstream_read(&bc, 6);
+            *samples++ = c->status[i].predictor = get_sbits(&gb, 16);
+            c->status[i].step_index = get_bits(&gb, 6);
         }
 
-        for (count = 0; bitstream_tell(&bc) <= size - nb_bits * avctx->channels && count < 4095; count++) {
+        for (count = 0; get_bits_count(&gb) <= size - nb_bits*avctx->channels && count < 4095; count++) {
             int i;
 
             for (i = 0; i < avctx->channels; i++) {
                 // similar to IMA adpcm
-                int delta = bitstream_read(&bc, nb_bits);
+                int delta = get_bits(&gb, nb_bits);
                 int step = ff_adpcm_step_table[c->status[i].step_index];
-                long vpdiff = 0; // vpdiff = (delta+0.5)*step/4
+                int vpdiff = 0; // vpdiff = (delta+0.5)*step/4
                 int k = k0;
 
                 do {
@@ -430,9 +501,11 @@ static void adpcm_swf_decode(AVCodecContext *avctx, const uint8_t *buf, int buf_
  * @param[out] coded_samples set to the number of samples as coded in the
  *                           packet, or 0 if the codec does not encode the
  *                           number of samples in each frame.
+ * @param[out] approx_nb_samples set to non-zero if the number of samples
+ *                               returned is an approximation.
  */
 static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
-                          int buf_size, int *coded_samples)
+                          int buf_size, int *coded_samples, int *approx_nb_samples)
 {
     ADPCMDecodeContext *s = avctx->priv_data;
     int nb_samples        = 0;
@@ -441,6 +514,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     int header_size;
 
     *coded_samples = 0;
+    *approx_nb_samples = 0;
+
+    if(ch <= 0)
+        return 0;
 
     switch (avctx->codec->id) {
     /* constant, only check buf_size */
@@ -458,8 +535,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     case AV_CODEC_ID_ADPCM_CT:
     case AV_CODEC_ID_ADPCM_IMA_APC:
     case AV_CODEC_ID_ADPCM_IMA_EA_SEAD:
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
     case AV_CODEC_ID_ADPCM_IMA_WS:
     case AV_CODEC_ID_ADPCM_YAMAHA:
+    case AV_CODEC_ID_ADPCM_AICA:
         nb_samples = buf_size * 2 / ch;
         break;
     }
@@ -470,9 +549,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     header_size = 0;
     switch (avctx->codec->id) {
         case AV_CODEC_ID_ADPCM_4XM:
+        case AV_CODEC_ID_ADPCM_IMA_DAT4:
         case AV_CODEC_ID_ADPCM_IMA_ISS:     header_size = 4 * ch;      break;
         case AV_CODEC_ID_ADPCM_IMA_AMV:     header_size = 8;           break;
-        case AV_CODEC_ID_ADPCM_IMA_SMJPEG:  header_size = 4;           break;
+        case AV_CODEC_ID_ADPCM_IMA_SMJPEG:  header_size = 4 * ch;      break;
     }
     if (header_size > 0)
         return (buf_size - header_size) * 2 / ch;
@@ -516,6 +596,7 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         *coded_samples -= *coded_samples % 28;
         nb_samples      = (buf_size - header_size) * 2 / ch;
         nb_samples     -= nb_samples % 28;
+        *approx_nb_samples = 1;
         break;
     case AV_CODEC_ID_ADPCM_IMA_DK3:
         if (avctx->block_align > 0)
@@ -525,17 +606,35 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     case AV_CODEC_ID_ADPCM_IMA_DK4:
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
+        if (buf_size < 4 * ch)
+            return AVERROR_INVALIDDATA;
         nb_samples = 1 + (buf_size - 4 * ch) * 2 / ch;
         break;
+    case AV_CODEC_ID_ADPCM_IMA_RAD:
+        if (avctx->block_align > 0)
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        nb_samples = (buf_size - 4 * ch) * 2 / ch;
+        break;
     case AV_CODEC_ID_ADPCM_IMA_WAV:
+    {
+        int bsize = ff_adpcm_ima_block_sizes[avctx->bits_per_coded_sample - 2];
+        int bsamples = ff_adpcm_ima_block_samples[avctx->bits_per_coded_sample - 2];
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
-        nb_samples = 1 + (buf_size - 4 * ch) / (4 * ch) * 8;
+        if (buf_size < 4 * ch)
+            return AVERROR_INVALIDDATA;
+        nb_samples = 1 + (buf_size - 4 * ch) / (bsize * ch) * bsamples;
         break;
+    }
     case AV_CODEC_ID_ADPCM_MS:
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
-        nb_samples = 2 + (buf_size - 7 * ch) * 2 / ch;
+        nb_samples = (buf_size - 6 * ch) * 2 / ch;
+        break;
+    case AV_CODEC_ID_ADPCM_MTAF:
+        if (avctx->block_align > 0)
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        nb_samples = (buf_size - 16 * (ch / 2)) * 2 / ch;
         break;
     case AV_CODEC_ID_ADPCM_SBPRO_2:
     case AV_CODEC_ID_ADPCM_SBPRO_3:
@@ -548,6 +647,8 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         case AV_CODEC_ID_ADPCM_SBPRO_4: samples_per_byte = 2; break;
         }
         if (!s->status[0].step_index) {
+            if (buf_size < ch)
+                return AVERROR_INVALIDDATA;
             nb_samples++;
             buf_size -= ch;
         }
@@ -568,15 +669,33 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         break;
     }
     case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
+        if (avctx->extradata) {
+            nb_samples = buf_size * 14 / (8 * ch);
+            break;
+        }
         has_coded_samples = 1;
         bytestream2_skip(gb, 4); // channel size
-        *coded_samples  = bytestream2_get_be32(gb);
-        *coded_samples -= *coded_samples % 14;
-        nb_samples      = (buf_size - 80) / (8 * ch) * 14;
+        *coded_samples  = (avctx->codec->id == AV_CODEC_ID_ADPCM_THP_LE) ?
+                          bytestream2_get_le32(gb) :
+                          bytestream2_get_be32(gb);
+        buf_size       -= 8 + 36 * ch;
+        buf_size       /= ch;
+        nb_samples      = buf_size / 8 * 14;
+        if (buf_size % 8 > 1)
+            nb_samples     += (buf_size % 8 - 1) * 2;
+        *approx_nb_samples = 1;
+        break;
+    case AV_CODEC_ID_ADPCM_AFC:
+        nb_samples = buf_size / (9 * ch) * 16;
         break;
     case AV_CODEC_ID_ADPCM_XA:
         nb_samples = (buf_size / 128) * 224 / ch;
         break;
+    case AV_CODEC_ID_ADPCM_DTK:
+    case AV_CODEC_ID_ADPCM_PSX:
+        nb_samples = buf_size / (16 * ch) * 28;
+        break;
     }
 
     /* validate coded sample count */
@@ -595,15 +714,15 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
     ADPCMDecodeContext *c = avctx->priv_data;
     ADPCMChannelStatus *cs;
     int n, m, channel, i;
-    short *samples;
+    int16_t *samples;
     int16_t **samples_p;
     int st; /* stereo */
     int count1, count2;
-    int nb_samples, coded_samples, ret;
+    int nb_samples, coded_samples, approx_nb_samples, ret;
     GetByteContext gb;
 
     bytestream2_init(&gb, buf, buf_size);
-    nb_samples = get_nb_samples(avctx, &gb, buf_size, &coded_samples);
+    nb_samples = get_nb_samples(avctx, &gb, buf_size, &coded_samples, &approx_nb_samples);
     if (nb_samples <= 0) {
         av_log(avctx, AV_LOG_ERROR, "invalid number of samples in packet\n");
         return AVERROR_INVALIDDATA;
@@ -611,17 +730,15 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = nb_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
-    samples = (short *)frame->data[0];
+    samples = (int16_t *)frame->data[0];
     samples_p = (int16_t **)frame->extended_data;
 
     /* use coded_samples when applicable */
     /* it is always <= nb_samples, so the output buffer will be large enough */
     if (coded_samples) {
-        if (coded_samples != nb_samples)
+        if (!approx_nb_samples && coded_samples != nb_samples)
             av_log(avctx, AV_LOG_WARNING, "mismatch in coded sample count\n");
         frame->nb_samples = nb_samples = coded_samples;
     }
@@ -683,6 +800,33 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
 
+        if (avctx->bits_per_coded_sample != 4) {
+            int samples_per_block = ff_adpcm_ima_block_samples[avctx->bits_per_coded_sample - 2];
+            int block_size = ff_adpcm_ima_block_sizes[avctx->bits_per_coded_sample - 2];
+            uint8_t temp[20 + AV_INPUT_BUFFER_PADDING_SIZE] = { 0 };
+            GetBitContext g;
+
+            for (n = 0; n < (nb_samples - 1) / samples_per_block; n++) {
+                for (i = 0; i < avctx->channels; i++) {
+                    int j;
+
+                    cs = &c->status[i];
+                    samples = &samples_p[i][1 + n * samples_per_block];
+                    for (j = 0; j < block_size; j++) {
+                        temp[j] = buf[4 * avctx->channels + block_size * n * avctx->channels +
+                                        (j % 4) + (j / 4) * (avctx->channels * 4) + i * 4];
+                    }
+                    ret = init_get_bits8(&g, (const uint8_t *)&temp, block_size);
+                    if (ret < 0)
+                        return ret;
+                    for (m = 0; m < samples_per_block; m++) {
+                        samples[m] = adpcm_ima_wav_expand_nibble(cs, &g,
+                                          avctx->bits_per_coded_sample);
+                    }
+                }
+            }
+            bytestream2_skip(&gb, avctx->block_align - avctx->channels * 4);
+        } else {
         for (n = 0; n < (nb_samples - 1) / 8; n++) {
             for (i = 0; i < avctx->channels; i++) {
                 cs = &c->status[i];
@@ -694,6 +838,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 }
             }
         }
+        }
         break;
     case AV_CODEC_ID_ADPCM_4XM:
         for (i = 0; i < avctx->channels; i++)
@@ -761,6 +906,27 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     }
+    case AV_CODEC_ID_ADPCM_MTAF:
+        for (channel = 0; channel < avctx->channels; channel+=2) {
+            bytestream2_skipu(&gb, 4);
+            c->status[channel    ].step      = bytestream2_get_le16u(&gb) & 0x1f;
+            c->status[channel + 1].step      = bytestream2_get_le16u(&gb) & 0x1f;
+            c->status[channel    ].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+            bytestream2_skipu(&gb, 2);
+            c->status[channel + 1].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+            bytestream2_skipu(&gb, 2);
+            for (n = 0; n < nb_samples; n+=2) {
+                int v = bytestream2_get_byteu(&gb);
+                samples_p[channel][n    ] = adpcm_mtaf_expand_nibble(&c->status[channel], v & 0x0F);
+                samples_p[channel][n + 1] = adpcm_mtaf_expand_nibble(&c->status[channel], v >> 4  );
+            }
+            for (n = 0; n < nb_samples; n+=2) {
+                int v = bytestream2_get_byteu(&gb);
+                samples_p[channel + 1][n    ] = adpcm_mtaf_expand_nibble(&c->status[channel + 1], v & 0x0F);
+                samples_p[channel + 1][n + 1] = adpcm_mtaf_expand_nibble(&c->status[channel + 1], v >> 4  );
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_DK4:
         for (channel = 0; channel < avctx->channels; channel++) {
             cs = &c->status[channel];
@@ -772,7 +938,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 return AVERROR_INVALIDDATA;
             }
         }
-        for (n = (nb_samples >> (1 - st)) - 1; n > 0; n--) {
+        for (n = (nb_samples - 1) >> (1 - st); n > 0; n--) {
             int v = bytestream2_get_byteu(&gb);
             *samples++ = adpcm_ima_expand_nibble(&c->status[0 ], v >> 4  , 3);
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v & 0x0F, 3);
@@ -837,6 +1003,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = c->status[0].predictor + c->status[1].predictor;
             *samples++ = c->status[0].predictor - c->status[1].predictor;
         }
+
+        if ((bytestream2_tell(&gb) & 1))
+            bytestream2_skip(&gb, 1);
         break;
     }
     case AV_CODEC_ID_ADPCM_IMA_ISS:
@@ -866,6 +1035,18 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v2, 3);
         }
         break;
+    case AV_CODEC_ID_ADPCM_IMA_DAT4:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            cs = &c->status[channel];
+            samples = samples_p[channel];
+            bytestream2_skip(&gb, 4);
+            for (n = 0; n < nb_samples; n += 2) {
+                int v = bytestream2_get_byteu(&gb);
+                *samples++ = adpcm_ima_expand_nibble(cs, v >> 4  , 3);
+                *samples++ = adpcm_ima_expand_nibble(cs, v & 0x0F, 3);
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_APC:
         while (bytestream2_get_bytes_left(&gb) > 0) {
             int v = bytestream2_get_byteu(&gb);
@@ -873,6 +1054,38 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v & 0x0F, 3);
         }
         break;
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
+        while (bytestream2_get_bytes_left(&gb) > 0) {
+            int v = bytestream2_get_byteu(&gb);
+            *samples++ = adpcm_ima_oki_expand_nibble(&c->status[0],  v >> 4  );
+            *samples++ = adpcm_ima_oki_expand_nibble(&c->status[st], v & 0x0F);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_RAD:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            cs = &c->status[channel];
+            cs->step_index = sign_extend(bytestream2_get_le16u(&gb), 16);
+            cs->predictor  = sign_extend(bytestream2_get_le16u(&gb), 16);
+            if (cs->step_index > 88u){
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index[%d] = %i\n",
+                       channel, cs->step_index);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        for (n = 0; n < nb_samples / 2; n++) {
+            int byte[2];
+
+            byte[0] = bytestream2_get_byteu(&gb);
+            if (st)
+                byte[1] = bytestream2_get_byteu(&gb);
+            for(channel = 0; channel < avctx->channels; channel++) {
+                *samples++ = adpcm_ima_expand_nibble(&c->status[channel], byte[channel] & 0x0F, 3);
+            }
+            for(channel = 0; channel < avctx->channels; channel++) {
+                *samples++ = adpcm_ima_expand_nibble(&c->status[channel], byte[channel] >> 4  , 3);
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_WS:
         if (c->vqa_version == 3) {
             for (channel = 0; channel < avctx->channels; channel++) {
@@ -902,6 +1115,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         int16_t *out1 = samples_p[1];
         int samples_per_block = 28 * (3 - avctx->channels) * 4;
         int sample_offset = 0;
+        int bytes_remaining;
         while (bytestream2_get_bytes_left(&gb) >= 128) {
             if ((ret = xa_decode(avctx, out0, out1, buf + bytestream2_tell(&gb),
                                  &c->status[0], &c->status[1],
@@ -910,6 +1124,12 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             bytestream2_skipu(&gb, 128);
             sample_offset += samples_per_block;
         }
+        /* Less than a full block of data left, e.g. when reading from
+         * 2324 byte per sector XA; the remainder is padding */
+        bytes_remaining = bytestream2_get_bytes_left(&gb);
+        if (bytes_remaining > 0) {
+            bytestream2_skip(&gb, bytes_remaining);
+        }
         break;
     }
     case AV_CODEC_ID_ADPCM_IMA_EA_EACS:
@@ -948,6 +1168,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         /* Each EA ADPCM frame has a 12-byte header followed by 30-byte pieces,
            each coding 28 stereo samples. */
 
+        if(avctx->channels != 2)
+            return AVERROR_INVALIDDATA;
+
         current_left_sample   = sign_extend(bytestream2_get_le16u(&gb), 16);
         previous_left_sample  = sign_extend(bytestream2_get_le16u(&gb), 16);
         current_right_sample  = sign_extend(bytestream2_get_le16u(&gb), 16);
@@ -1133,16 +1356,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     case AV_CODEC_ID_ADPCM_IMA_AMV:
-    case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
-        if (avctx->codec->id == AV_CODEC_ID_ADPCM_IMA_AMV) {
-            c->status[0].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
-            c->status[0].step_index = bytestream2_get_le16u(&gb);
-            bytestream2_skipu(&gb, 4);
-        } else {
-            c->status[0].predictor = sign_extend(bytestream2_get_be16u(&gb), 16);
-            c->status[0].step_index = bytestream2_get_byteu(&gb);
-            bytestream2_skipu(&gb, 1);
-        }
+        c->status[0].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+        c->status[0].step_index = bytestream2_get_byteu(&gb);
+        bytestream2_skipu(&gb, 5);
         if (c->status[0].step_index > 88u) {
             av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n",
                    c->status[0].step_index);
@@ -1150,18 +1366,29 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
 
         for (n = nb_samples >> (1 - st); n > 0; n--) {
-            int hi, lo, v = bytestream2_get_byteu(&gb);
+            int v = bytestream2_get_byteu(&gb);
 
-            if (avctx->codec->id == AV_CODEC_ID_ADPCM_IMA_AMV) {
-                hi = v & 0x0F;
-                lo = v >> 4;
-            } else {
-                lo = v & 0x0F;
-                hi = v >> 4;
+            *samples++ = adpcm_ima_expand_nibble(&c->status[0], v >> 4, 3);
+            *samples++ = adpcm_ima_expand_nibble(&c->status[0], v & 0xf, 3);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
+        for (i = 0; i < avctx->channels; i++) {
+            c->status[i].predictor = sign_extend(bytestream2_get_be16u(&gb), 16);
+            c->status[i].step_index = bytestream2_get_byteu(&gb);
+            bytestream2_skipu(&gb, 1);
+            if (c->status[i].step_index > 88u) {
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n",
+                       c->status[i].step_index);
+                return AVERROR_INVALIDDATA;
             }
+        }
+
+        for (n = nb_samples >> (1 - st); n > 0; n--) {
+            int v = bytestream2_get_byteu(&gb);
 
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], lo, 3);
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], hi, 3);
+            *samples++ = adpcm_ima_qt_expand_nibble(&c->status[0 ], v >> 4, 3);
+            *samples++ = adpcm_ima_qt_expand_nibble(&c->status[st], v & 0xf, 3);
         }
         break;
     case AV_CODEC_ID_ADPCM_CT:
@@ -1191,7 +1418,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                                                        byte & 0x0F, 4, 0);
             }
         } else if (avctx->codec->id == AV_CODEC_ID_ADPCM_SBPRO_3) {
-            for (n = nb_samples / 3; n > 0; n--) {
+            for (n = (nb_samples<<st) / 3; n > 0; n--) {
                 int byte = bytestream2_get_byteu(&gb);
                 *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
                                                         byte >> 5        , 3, 0);
@@ -1225,26 +1452,119 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_yamaha_expand_nibble(&c->status[st], v >> 4  );
         }
         break;
+    case AV_CODEC_ID_ADPCM_AICA:
+        if (!c->has_status) {
+            for (channel = 0; channel < avctx->channels; channel++)
+                c->status[channel].step = 0;
+            c->has_status = 1;
+        }
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+            for (n = nb_samples >> 1; n > 0; n--) {
+                int v = bytestream2_get_byteu(&gb);
+                *samples++ = adpcm_yamaha_expand_nibble(&c->status[channel], v & 0x0F);
+                *samples++ = adpcm_yamaha_expand_nibble(&c->status[channel], v >> 4  );
+            }
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_AFC:
+    {
+        int samples_per_block;
+        int blocks;
+
+        if (avctx->extradata && avctx->extradata_size == 1 && avctx->extradata[0]) {
+            samples_per_block = avctx->extradata[0] / 16;
+            blocks = nb_samples / avctx->extradata[0];
+        } else {
+            samples_per_block = nb_samples / 16;
+            blocks = 1;
+        }
+
+        for (m = 0; m < blocks; m++) {
+        for (channel = 0; channel < avctx->channels; channel++) {
+            int prev1 = c->status[channel].sample1;
+            int prev2 = c->status[channel].sample2;
+
+            samples = samples_p[channel] + m * 16;
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < samples_per_block; i++) {
+                int byte = bytestream2_get_byteu(&gb);
+                int scale = 1 << (byte >> 4);
+                int index = byte & 0xf;
+                int factor1 = ff_adpcm_afc_coeffs[0][index];
+                int factor2 = ff_adpcm_afc_coeffs[1][index];
+
+                /* Decode 16 samples.  */
+                for (n = 0; n < 16; n++) {
+                    int32_t sampledat;
+
+                    if (n & 1) {
+                        sampledat = sign_extend(byte, 4);
+                    } else {
+                        byte = bytestream2_get_byteu(&gb);
+                        sampledat = sign_extend(byte >> 4, 4);
+                    }
+
+                    sampledat = ((prev1 * factor1 + prev2 * factor2) +
+                                 ((sampledat * scale) << 11)) >> 11;
+                    *samples = av_clip_int16(sampledat);
+                    prev2 = prev1;
+                    prev1 = *samples++;
+                }
+            }
+
+            c->status[channel].sample1 = prev1;
+            c->status[channel].sample2 = prev2;
+        }
+        }
+        bytestream2_seek(&gb, 0, SEEK_END);
+        break;
+    }
     case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
     {
-        int table[2][16];
-        int prev[2][2];
+        int table[14][16];
         int ch;
 
-        for (i = 0; i < 2; i++)
-            for (n = 0; n < 16; n++)
-                table[i][n] = sign_extend(bytestream2_get_be16u(&gb), 16);
+#define THP_GET16(g) \
+    sign_extend( \
+        avctx->codec->id == AV_CODEC_ID_ADPCM_THP_LE ? \
+        bytestream2_get_le16u(&(g)) : \
+        bytestream2_get_be16u(&(g)), 16)
+
+        if (avctx->extradata) {
+            GetByteContext tb;
+            if (avctx->extradata_size < 32 * avctx->channels) {
+                av_log(avctx, AV_LOG_ERROR, "Missing coeff table\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-        /* Initialize the previous sample.  */
-        for (i = 0; i < 2; i++)
-            for (n = 0; n < 2; n++)
-                prev[i][n] = sign_extend(bytestream2_get_be16u(&gb), 16);
+            bytestream2_init(&tb, avctx->extradata, avctx->extradata_size);
+            for (i = 0; i < avctx->channels; i++)
+                for (n = 0; n < 16; n++)
+                    table[i][n] = THP_GET16(tb);
+        } else {
+            for (i = 0; i < avctx->channels; i++)
+                for (n = 0; n < 16; n++)
+                    table[i][n] = THP_GET16(gb);
+
+            if (!c->has_status) {
+                /* Initialize the previous sample.  */
+                for (i = 0; i < avctx->channels; i++) {
+                    c->status[i].sample1 = THP_GET16(gb);
+                    c->status[i].sample2 = THP_GET16(gb);
+                }
+                c->has_status = 1;
+            } else {
+                bytestream2_skip(&gb, avctx->channels * 4);
+            }
+        }
 
-        for (ch = 0; ch <= st; ch++) {
+        for (ch = 0; ch < avctx->channels; ch++) {
             samples = samples_p[ch];
 
             /* Read in every sample for this channel.  */
-            for (i = 0; i < nb_samples / 14; i++) {
+            for (i = 0; i < (nb_samples + 13) / 14; i++) {
                 int byte = bytestream2_get_byteu(&gb);
                 int index = (byte >> 4) & 7;
                 unsigned int exp = byte & 0x0F;
@@ -1252,7 +1572,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 int factor2 = table[ch][index * 2 + 1];
 
                 /* Decode 14 samples.  */
-                for (n = 0; n < 14; n++) {
+                for (n = 0; n < 14 && (i * 14 + n < nb_samples); n++) {
                     int32_t sampledat;
 
                     if (n & 1) {
@@ -1262,30 +1582,131 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                         sampledat = sign_extend(byte >> 4, 4);
                     }
 
-                    sampledat = ((prev[ch][0]*factor1
-                                + prev[ch][1]*factor2) >> 11) + (sampledat << exp);
+                    sampledat = ((c->status[ch].sample1 * factor1
+                                + c->status[ch].sample2 * factor2) >> 11) + (sampledat << exp);
                     *samples = av_clip_int16(sampledat);
-                    prev[ch][1] = prev[ch][0];
-                    prev[ch][0] = *samples++;
+                    c->status[ch].sample2 = c->status[ch].sample1;
+                    c->status[ch].sample1 = *samples++;
                 }
             }
         }
         break;
     }
+    case AV_CODEC_ID_ADPCM_DTK:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < nb_samples / 28; i++) {
+                int byte, header;
+                if (channel)
+                    bytestream2_skipu(&gb, 1);
+                header = bytestream2_get_byteu(&gb);
+                bytestream2_skipu(&gb, 3 - channel);
+
+                /* Decode 28 samples.  */
+                for (n = 0; n < 28; n++) {
+                    int32_t sampledat, prev;
+
+                    switch (header >> 4) {
+                    case 1:
+                        prev = (c->status[channel].sample1 * 0x3c);
+                        break;
+                    case 2:
+                        prev = (c->status[channel].sample1 * 0x73) - (c->status[channel].sample2 * 0x34);
+                        break;
+                    case 3:
+                        prev = (c->status[channel].sample1 * 0x62) - (c->status[channel].sample2 * 0x37);
+                        break;
+                    default:
+                        prev = 0;
+                    }
+
+                    prev = av_clip_intp2((prev + 0x20) >> 6, 21);
+
+                    byte = bytestream2_get_byteu(&gb);
+                    if (!channel)
+                        sampledat = sign_extend(byte, 4);
+                    else
+                        sampledat = sign_extend(byte >> 4, 4);
+
+                    sampledat = (((sampledat << 12) >> (header & 0xf)) << 6) + prev;
+                    *samples++ = av_clip_int16(sampledat >> 6);
+                    c->status[channel].sample2 = c->status[channel].sample1;
+                    c->status[channel].sample1 = sampledat;
+                }
+            }
+            if (!channel)
+                bytestream2_seek(&gb, 0, SEEK_SET);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_PSX:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < nb_samples / 28; i++) {
+                int filter, shift, flag, byte;
+
+                filter = bytestream2_get_byteu(&gb);
+                shift  = filter & 0xf;
+                filter = filter >> 4;
+                if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table))
+                    return AVERROR_INVALIDDATA;
+                flag   = bytestream2_get_byteu(&gb);
+
+                /* Decode 28 samples.  */
+                for (n = 0; n < 28; n++) {
+                    int sample = 0, scale;
+
+                    if (flag < 0x07) {
+                        if (n & 1) {
+                            scale = sign_extend(byte >> 4, 4);
+                        } else {
+                            byte  = bytestream2_get_byteu(&gb);
+                            scale = sign_extend(byte, 4);
+                        }
+
+                        scale  = scale << 12;
+                        sample = (int)((scale >> shift) + (c->status[channel].sample1 * xa_adpcm_table[filter][0] + c->status[channel].sample2 * xa_adpcm_table[filter][1]) / 64);
+                    }
+                    *samples++ = av_clip_int16(sample);
+                    c->status[channel].sample2 = c->status[channel].sample1;
+                    c->status[channel].sample1 = sample;
+                }
+            }
+        }
+        break;
 
     default:
-        return -1;
+        av_assert0(0); // unsupported codec_id should not happen
+    }
+
+    if (avpkt->size && bytestream2_tell(&gb) == 0) {
+        av_log(avctx, AV_LOG_ERROR, "Nothing consumed\n");
+        return AVERROR_INVALIDDATA;
     }
 
     *got_frame_ptr = 1;
 
+    if (avpkt->size < bytestream2_tell(&gb)) {
+        av_log(avctx, AV_LOG_ERROR, "Overread of %d < %d\n", avpkt->size, bytestream2_tell(&gb));
+        return avpkt->size;
+    }
+
     return bytestream2_tell(&gb);
 }
 
+static void adpcm_flush(AVCodecContext *avctx)
+{
+    ADPCMDecodeContext *c = avctx->priv_data;
+    c->has_status = 0;
+}
+
 
 static const enum AVSampleFormat sample_fmts_s16[]  = { AV_SAMPLE_FMT_S16,
                                                         AV_SAMPLE_FMT_NONE };
-static const enum AVSampleFormat sample_fmts_s16p[] = { AV_SAMPLE_FMT_S16,
+static const enum AVSampleFormat sample_fmts_s16p[] = { AV_SAMPLE_FMT_S16P,
                                                         AV_SAMPLE_FMT_NONE };
 static const enum AVSampleFormat sample_fmts_both[] = { AV_SAMPLE_FMT_S16,
                                                         AV_SAMPLE_FMT_S16P,
@@ -1300,13 +1721,17 @@ AVCodec ff_ ## name_ ## _decoder = {                        \
     .priv_data_size = sizeof(ADPCMDecodeContext),           \
     .init           = adpcm_decode_init,                    \
     .decode         = adpcm_decode_frame,                   \
+    .flush          = adpcm_flush,                          \
     .capabilities   = AV_CODEC_CAP_DR1,                     \
     .sample_fmts    = sample_fmts_,                         \
 }
 
 /* Note: Do not forget to add new entries to the Makefile as well. */
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_4XM,         sample_fmts_s16p, adpcm_4xm,         "ADPCM 4X Movie");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_AFC,         sample_fmts_s16p, adpcm_afc,         "ADPCM Nintendo Gamecube AFC");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_AICA,        sample_fmts_s16p, adpcm_aica,        "ADPCM Yamaha AICA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_CT,          sample_fmts_s16,  adpcm_ct,          "ADPCM Creative Technology");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_DTK,         sample_fmts_s16p, adpcm_dtk,         "ADPCM Nintendo Gamecube DTK");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA,          sample_fmts_s16,  adpcm_ea,          "ADPCM Electronic Arts");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_MAXIS_XA, sample_fmts_s16,  adpcm_ea_maxis_xa, "ADPCM Electronic Arts Maxis CDROM XA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_R1,       sample_fmts_s16p, adpcm_ea_r1,       "ADPCM Electronic Arts R1");
@@ -1315,20 +1740,26 @@ ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_R3,       sample_fmts_s16p, adpcm_ea_r3,
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_XAS,      sample_fmts_s16p, adpcm_ea_xas,      "ADPCM Electronic Arts XAS");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_AMV,     sample_fmts_s16,  adpcm_ima_amv,     "ADPCM IMA AMV");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_APC,     sample_fmts_s16,  adpcm_ima_apc,     "ADPCM IMA CRYO APC");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DAT4,    sample_fmts_s16,  adpcm_ima_dat4,    "ADPCM IMA Eurocom DAT4");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DK3,     sample_fmts_s16,  adpcm_ima_dk3,     "ADPCM IMA Duck DK3");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DK4,     sample_fmts_s16,  adpcm_ima_dk4,     "ADPCM IMA Duck DK4");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_EA_EACS, sample_fmts_s16,  adpcm_ima_ea_eacs, "ADPCM IMA Electronic Arts EACS");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_EA_SEAD, sample_fmts_s16,  adpcm_ima_ea_sead, "ADPCM IMA Electronic Arts SEAD");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_ISS,     sample_fmts_s16,  adpcm_ima_iss,     "ADPCM IMA Funcom ISS");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_OKI,     sample_fmts_s16,  adpcm_ima_oki,     "ADPCM IMA Dialogic OKI");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_QT,      sample_fmts_s16p, adpcm_ima_qt,      "ADPCM IMA QuickTime");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_RAD,     sample_fmts_s16,  adpcm_ima_rad,     "ADPCM IMA Radical");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_SMJPEG,  sample_fmts_s16,  adpcm_ima_smjpeg,  "ADPCM IMA Loki SDL MJPEG");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_WAV,     sample_fmts_s16p, adpcm_ima_wav,     "ADPCM IMA WAV");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_WS,      sample_fmts_both, adpcm_ima_ws,      "ADPCM IMA Westwood");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_MS,          sample_fmts_s16,  adpcm_ms,          "ADPCM Microsoft");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_MTAF,        sample_fmts_s16p, adpcm_mtaf,        "ADPCM MTAF");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_PSX,         sample_fmts_s16p, adpcm_psx,         "ADPCM Playstation");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_2,     sample_fmts_s16,  adpcm_sbpro_2,     "ADPCM Sound Blaster Pro 2-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_3,     sample_fmts_s16,  adpcm_sbpro_3,     "ADPCM Sound Blaster Pro 2.6-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_4,     sample_fmts_s16,  adpcm_sbpro_4,     "ADPCM Sound Blaster Pro 4-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SWF,         sample_fmts_s16,  adpcm_swf,         "ADPCM Shockwave Flash");
-ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP,         sample_fmts_s16p, adpcm_thp,         "ADPCM Nintendo Gamecube THP");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP_LE,      sample_fmts_s16p, adpcm_thp_le,      "ADPCM Nintendo THP (little-endian)");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP,         sample_fmts_s16p, adpcm_thp,         "ADPCM Nintendo THP");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_XA,          sample_fmts_s16p, adpcm_xa,          "ADPCM CDROM XA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_YAMAHA,      sample_fmts_s16,  adpcm_yamaha,      "ADPCM Yamaha");
diff --git a/libavcodec/adpcm.h b/libavcodec/adpcm.h
index 11be5a9..580db7d 100644
--- a/libavcodec/adpcm.h
+++ b/libavcodec/adpcm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001-2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,8 +38,8 @@ typedef struct ADPCMChannelStatus {
     int prev_sample;
 
     /* MS version */
-    int16_t sample1;
-    int16_t sample2;
+    int sample1;
+    int sample2;
     int coeff1;
     int coeff2;
     int idelta;
diff --git a/libavcodec/adpcm_data.c b/libavcodec/adpcm_data.c
index e40abc5..4cce0a5 100644
--- a/libavcodec/adpcm_data.c
+++ b/libavcodec/adpcm_data.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001-2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,33 @@
 
 /* ff_adpcm_step_table[] and ff_adpcm_index_table[] are from the ADPCM
    reference source */
-/* This is the index table: */
+static const int8_t adpcm_index_table2[4] = {
+    -1,  2,
+    -1,  2,
+};
+
+static const int8_t adpcm_index_table3[8] = {
+    -1, -1,  1,  2,
+    -1, -1,  1,  2,
+};
+
 const int8_t ff_adpcm_index_table[16] = {
     -1, -1, -1, -1, 2, 4, 6, 8,
     -1, -1, -1, -1, 2, 4, 6, 8,
 };
 
+static const int8_t adpcm_index_table5[32] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16,
+    -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16,
+};
+
+const int8_t * const ff_adpcm_index_tables[4] = {
+    &adpcm_index_table2[0],
+    &adpcm_index_table3[0],
+    &ff_adpcm_index_table[0],
+    &adpcm_index_table5[0],
+};
+
 /**
  * This is the step table. Note that many programs use slight deviations from
  * this table, but such deviations are negligible:
@@ -49,6 +70,14 @@ const int16_t ff_adpcm_step_table[89] = {
     15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767
 };
 
+const int16_t ff_adpcm_oki_step_table[49] = {
+     16,  17,  19,  21,   23,   25,   28,   31,   34,  37,
+     41,  45,  50,  55,   60,   66,   73,   80,   88,  97,
+    107, 118, 130, 143,  157,  173,  190,  209,  230, 253,
+    279, 307, 337, 371,  408,  449,  494,  544,  598, 658,
+    724, 796, 876, 963, 1060, 1166, 1282, 1411, 1552
+};
+
 /* These are for MS-ADPCM */
 /* ff_adpcm_AdaptationTable[], ff_adpcm_AdaptCoeff1[], and
    ff_adpcm_AdaptCoeff2[] are from libsndfile */
@@ -76,3 +105,75 @@ const int8_t ff_adpcm_yamaha_difflookup[] = {
      1,  3,  5,  7,  9,  11,  13,  15,
     -1, -3, -5, -7, -9, -11, -13, -15
 };
+
+const uint16_t ff_adpcm_afc_coeffs[2][16] = {
+    { 0, 2048, 0, 1024, 4096, 3584, 3072, 4608, 4200, 4800, 5120, 2048, 1024, 64512, 64512, 63488 },
+    { 0, 0, 2048, 1024, 63488, 64000, 64512, 62976, 63288, 63236, 62464, 63488, 64512, 1024, 0, 0 }
+};
+
+const int16_t ff_adpcm_mtaf_stepsize[32][16] = {
+    {     1,     5,     9,    13,    16,    20,    24,    28,
+         -1,    -5,    -9,   -13,   -16,   -20,   -24,   -28, },
+    {     2,     6,    11,    15,    20,    24,    29,    33,
+         -2,    -6,   -11,   -15,   -20,   -24,   -29,   -33, },
+    {     2,     7,    13,    18,    23,    28,    34,    39,
+         -2,    -7,   -13,   -18,   -23,   -28,   -34,   -39, },
+    {     3,     9,    15,    21,    28,    34,    40,    46,
+         -3,    -9,   -15,   -21,   -28,   -34,   -40,   -46, },
+    {     3,    11,    18,    26,    33,    41,    48,    56,
+         -3,   -11,   -18,   -26,   -33,   -41,   -48,   -56, },
+    {     4,    13,    22,    31,    40,    49,    58,    67,
+         -4,   -13,   -22,   -31,   -40,   -49,   -58,   -67, },
+    {     5,    16,    26,    37,    48,    59,    69,    80,
+         -5,   -16,   -26,   -37,   -48,   -59,   -69,   -80, },
+    {     6,    19,    31,    44,    57,    70,    82,    95,
+         -6,   -19,   -31,   -44,   -57,   -70,   -82,   -95, },
+    {     7,    22,    38,    53,    68,    83,    99,   114,
+         -7,   -22,   -38,   -53,   -68,   -83,   -99,  -114, },
+    {     9,    27,    45,    63,    81,    99,   117,   135,
+         -9,   -27,   -45,   -63,   -81,   -99,  -117,  -135, },
+    {    10,    32,    53,    75,    96,   118,   139,   161,
+        -10,   -32,   -53,   -75,   -96,  -118,  -139,  -161, },
+    {    12,    38,    64,    90,   115,   141,   167,   193,
+        -12,   -38,   -64,   -90,  -115,  -141,  -167,  -193, },
+    {    15,    45,    76,   106,   137,   167,   198,   228,
+        -15,   -45,   -76,  -106,  -137,  -167,  -198,  -228, },
+    {    18,    54,    91,   127,   164,   200,   237,   273,
+        -18,   -54,   -91,  -127,  -164,  -200,  -237,  -273, },
+    {    21,    65,   108,   152,   195,   239,   282,   326,
+        -21,   -65,  -108,  -152,  -195,  -239,  -282,  -326, },
+    {    25,    77,   129,   181,   232,   284,   336,   388,
+        -25,   -77,  -129,  -181,  -232,  -284,  -336,  -388, },
+    {    30,    92,   153,   215,   276,   338,   399,   461,
+        -30,   -92,  -153,  -215,  -276,  -338,  -399,  -461, },
+    {    36,   109,   183,   256,   329,   402,   476,   549,
+        -36,  -109,  -183,  -256,  -329,  -402,  -476,  -549, },
+    {    43,   130,   218,   305,   392,   479,   567,   654,
+        -43,  -130,  -218,  -305,  -392,  -479,  -567,  -654, },
+    {    52,   156,   260,   364,   468,   572,   676,   780,
+        -52,  -156,  -260,  -364,  -468,  -572,  -676,  -780, },
+    {    62,   186,   310,   434,   558,   682,   806,   930,
+        -62,  -186,  -310,  -434,  -558,  -682,  -806,  -930, },
+    {    73,   221,   368,   516,   663,   811,   958,  1106,
+        -73,  -221,  -368,  -516,  -663,  -811,  -958, -1106, },
+    {    87,   263,   439,   615,   790,   966,  1142,  1318,
+        -87,  -263,  -439,  -615,  -790,  -966, -1142, -1318, },
+    {   104,   314,   523,   733,   942,  1152,  1361,  1571,
+       -104,  -314,  -523,  -733,  -942, -1152, -1361, -1571, },
+    {   124,   374,   623,   873,  1122,  1372,  1621,  1871,
+       -124,  -374,  -623,  -873, -1122, -1372, -1621, -1871, },
+    {   148,   445,   743,  1040,  1337,  1634,  1932,  2229,
+       -148,  -445,  -743, -1040, -1337, -1634, -1932, -2229, },
+    {   177,   531,   885,  1239,  1593,  1947,  2301,  2655,
+       -177,  -531,  -885, -1239, -1593, -1947, -2301, -2655, },
+    {   210,   632,  1053,  1475,  1896,  2318,  2739,  3161,
+       -210,  -632, -1053, -1475, -1896, -2318, -2739, -3161, },
+    {   251,   753,  1255,  1757,  2260,  2762,  3264,  3766,
+       -251,  -753, -1255, -1757, -2260, -2762, -3264, -3766, },
+    {   299,   897,  1495,  2093,  2692,  3290,  3888,  4486,
+       -299,  -897, -1495, -2093, -2692, -3290, -3888, -4486, },
+    {   356,  1068,  1781,  2493,  3206,  3918,  4631,  5343,
+       -356, -1068, -1781, -2493, -3206, -3918, -4631, -5343, },
+    {   424,  1273,  2121,  2970,  3819,  4668,  5516,  6365,
+       -424, -1273, -2121, -2970, -3819, -4668, -5516, -6365, },
+};
diff --git a/libavcodec/adpcm_data.h b/libavcodec/adpcm_data.h
index cecd156..5a68713 100644
--- a/libavcodec/adpcm_data.h
+++ b/libavcodec/adpcm_data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001-2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,19 @@
 
 #include <stdint.h>
 
+static const uint8_t ff_adpcm_ima_block_sizes[4]   = {  4, 12, 4, 20 };
+static const uint8_t ff_adpcm_ima_block_samples[4] = { 16, 32, 8, 32 };
+
+extern const int8_t * const ff_adpcm_index_tables[4];
 extern const int8_t  ff_adpcm_index_table[16];
 extern const int16_t ff_adpcm_step_table[89];
+extern const int16_t ff_adpcm_oki_step_table[49];
 extern const int16_t ff_adpcm_AdaptationTable[];
 extern const uint8_t ff_adpcm_AdaptCoeff1[];
 extern const int8_t  ff_adpcm_AdaptCoeff2[];
 extern const int16_t ff_adpcm_yamaha_indexscale[];
 extern const int8_t  ff_adpcm_yamaha_difflookup[];
+extern const int16_t ff_adpcm_afc_coeffs[2][16];
+extern const int16_t ff_adpcm_mtaf_stepsize[32][16];
 
 #endif /* AVCODEC_ADPCM_DATA_H */
diff --git a/libavcodec/adpcmenc.c b/libavcodec/adpcmenc.c
index 0757624..668939c 100644
--- a/libavcodec/adpcmenc.c
+++ b/libavcodec/adpcmenc.c
@@ -5,20 +5,20 @@
  * fringe ADPCM codecs (e.g., DK3, DK4, Westwood)
  *   by Mike Melanson (melanson@pcisys.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,6 +58,8 @@ typedef struct ADPCMEncodeContext {
 
 #define FREEZE_INTERVAL 128
 
+static av_cold int adpcm_encode_close(AVCodecContext *avctx);
+
 static av_cold int adpcm_encode_init(AVCodecContext *avctx)
 {
     ADPCMEncodeContext *s = avctx->priv_data;
@@ -99,6 +101,7 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
         /* seems frame_size isn't taken into account...
            have to buffer the samples :-( */
         avctx->block_align = BLKSIZE;
+        avctx->bits_per_coded_sample = 4;
         break;
     case AV_CODEC_ID_ADPCM_IMA_QT:
         avctx->frame_size  = 64;
@@ -107,8 +110,8 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
     case AV_CODEC_ID_ADPCM_MS:
         /* each 16 bits sample gives one nibble
            and we have 7 bytes per channel overhead */
-        avctx->frame_size = (BLKSIZE - 7 * avctx->channels) * 2 /
-                             avctx->channels + 2;
+        avctx->frame_size = (BLKSIZE - 7 * avctx->channels) * 2 / avctx->channels + 2;
+        avctx->bits_per_coded_sample = 4;
         avctx->block_align    = BLKSIZE;
         if (!(avctx->extradata = av_malloc(32 + AV_INPUT_BUFFER_PADDING_SIZE)))
             goto error;
@@ -143,10 +146,7 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
 
     return 0;
 error:
-    av_freep(&s->paths);
-    av_freep(&s->node_buf);
-    av_freep(&s->nodep_buf);
-    av_freep(&s->trellis_hash);
+    adpcm_encode_close(avctx);
     return ret;
 }
 
@@ -179,24 +179,27 @@ static inline uint8_t adpcm_ima_qt_compress_sample(ADPCMChannelStatus *c,
                                                    int16_t sample)
 {
     int delta  = sample - c->prev_sample;
-    int mask, step = ff_adpcm_step_table[c->step_index];
-    int diff   = step >> 3;
-    int nibble = 0;
+    int diff, step = ff_adpcm_step_table[c->step_index];
+    int nibble = 8*(delta < 0);
 
-    if (delta < 0) {
-        nibble = 8;
-        delta  = -delta;
-    }
+    delta= abs(delta);
+    diff = delta + (step >> 3);
 
-    for (mask = 4; mask;) {
-        if (delta >= step) {
-            nibble |= mask;
-            delta  -= step;
-            diff   += step;
-        }
-        step >>= 1;
-        mask >>= 1;
+    if (delta >= step) {
+        nibble |= 4;
+        delta  -= step;
+    }
+    step >>= 1;
+    if (delta >= step) {
+        nibble |= 2;
+        delta  -= step;
     }
+    step >>= 1;
+    if (delta >= step) {
+        nibble |= 1;
+        delta  -= step;
+    }
+    diff -= delta;
 
     if (nibble & 8)
         c->prev_sample -= diff;
@@ -224,7 +227,7 @@ static inline uint8_t adpcm_ms_compress_sample(ADPCMChannelStatus *c,
         bias = -c->idelta / 2;
 
     nibble = (nibble + bias) / c->idelta;
-    nibble = av_clip(nibble, -8, 7) & 0x0F;
+    nibble = av_clip_intp2(nibble, 3) & 0x0F;
 
     predictor += ((nibble & 0x08) ? (nibble - 0x10) : nibble) * c->idelta;
 
@@ -255,7 +258,7 @@ static inline uint8_t adpcm_yamaha_compress_sample(ADPCMChannelStatus *c,
     c->predictor += ((c->step * ff_adpcm_yamaha_difflookup[nibble]) / 8);
     c->predictor = av_clip_int16(c->predictor);
     c->step = (c->step * ff_adpcm_yamaha_indexscale[nibble]) >> 8;
-    c->step = av_clip(c->step, 127, 24567);
+    c->step = av_clip(c->step, 127, 24576);
 
     return nibble;
 }
@@ -329,7 +332,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
                     uint8_t *h;\
                     dec_sample = av_clip_int16(dec_sample);\
                     d = sample - dec_sample;\
-                    ssd = nodes[j]->ssd + d*d;\
+                    ssd = nodes[j]->ssd + d*(unsigned)d;\
                     /* Check for wraparound, skip such samples completely. \
                      * Note, changing ssd to a 64 bit variable would be \
                      * simpler, avoiding this check, but it's slower on \
@@ -364,7 +367,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
                     *h = generation;\
                     u  = nodes_next[pos];\
                     if (!u) {\
-                        assert(pathn < FREEZE_INTERVAL << avctx->trellis);\
+                        av_assert1(pathn < FREEZE_INTERVAL << avctx->trellis);\
                         u = t++;\
                         nodes_next[pos] = u;\
                         u->path = pathn++;\
@@ -412,7 +415,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
             } else { //AV_CODEC_ID_ADPCM_YAMAHA
                 LOOP_NODES(yamaha, step,
                            av_clip((step * ff_adpcm_yamaha_indexscale[nibble]) >> 8,
-                                   127, 24567));
+                                   127, 24576));
 #undef LOOP_NODES
 #undef STORE_NODE
             }
@@ -483,10 +486,8 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         pkt_size = (2 + avctx->channels * (22 + 4 * (frame->nb_samples - 1)) + 7) / 8;
     else
         pkt_size = avctx->block_align;
-    if ((ret = ff_alloc_packet(avpkt, pkt_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, pkt_size, 0)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     switch(avctx->codec->id) {
@@ -508,7 +509,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
         /* stereo: 4 bytes (8 samples) for left, 4 bytes for right */
         if (avctx->trellis > 0) {
-            FF_ALLOC_OR_GOTO(avctx, buf, avctx->channels * blocks * 8, error);
+            FF_ALLOC_ARRAY_OR_GOTO(avctx, buf, avctx->channels, blocks * 8, error);
             for (ch = 0; ch < avctx->channels; ch++) {
                 adpcm_compress_trellis(avctx, &samples_p[ch][1],
                                        buf + ch * blocks * 8, &c->status[ch],
@@ -540,7 +541,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_ADPCM_IMA_QT:
     {
         PutBitContext pb;
-        init_put_bits(&pb, dst, pkt_size * 8);
+        init_put_bits(&pb, dst, pkt_size);
 
         for (ch = 0; ch < avctx->channels; ch++) {
             ADPCMChannelStatus *status = &c->status[ch];
@@ -570,7 +571,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_ADPCM_SWF:
     {
         PutBitContext pb;
-        init_put_bits(&pb, dst, pkt_size * 8);
+        init_put_bits(&pb, dst, pkt_size);
 
         n = frame->nb_samples - 1;
 
@@ -580,7 +581,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         // init the encoder state
         for (i = 0; i < avctx->channels; i++) {
             // clip step so it fits 6 bits
-            c->status[i].step_index = av_clip(c->status[i].step_index, 0, 63);
+            c->status[i].step_index = av_clip_uintp2(c->status[i].step_index, 6);
             put_sbits(&pb, 16, samples[i]);
             put_bits(&pb, 6, c->status[i].step_index);
             c->status[i].prev_sample = samples[i];
diff --git a/libavcodec/adts_header.c b/libavcodec/adts_header.c
index 3b84505..0889820 100644
--- a/libavcodec/adts_header.c
+++ b/libavcodec/adts_header.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2009 Alex Converse
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adts_header.h b/libavcodec/adts_header.h
index 386aa0a..f615f6a 100644
--- a/libavcodec/adts_header.h
+++ b/libavcodec/adts_header.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adts_parser.c b/libavcodec/adts_parser.c
index 5821e6f..5c9f8ff 100644
--- a/libavcodec/adts_parser.c
+++ b/libavcodec/adts_parser.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adts_parser.h b/libavcodec/adts_parser.h
index 1a3328f..f85becd 100644
--- a/libavcodec/adts_parser.h
+++ b/libavcodec/adts_parser.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx.c b/libavcodec/adx.c
index d941d7b..cd88b16 100644
--- a/libavcodec/adx.c
+++ b/libavcodec/adx.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx.h b/libavcodec/adx.h
index 9ae84dc..08f749a 100644
--- a/libavcodec/adx.h
+++ b/libavcodec/adx.h
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx_parser.c b/libavcodec/adx_parser.c
index 706e242..1fa718f 100644
--- a/libavcodec/adx_parser.c
+++ b/libavcodec/adx_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adxdec.c b/libavcodec/adxdec.c
index 86aaade..178ea99 100644
--- a/libavcodec/adxdec.c
+++ b/libavcodec/adxdec.c
@@ -2,28 +2,27 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
 #include "adx.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 /**
@@ -67,7 +66,7 @@ static int adx_decode(ADXContext *c, int16_t *out, int offset,
                       const uint8_t *in, int ch)
 {
     ADXChannelState *prev = &c->prev[ch];
-    BitstreamContext bc;
+    GetBitContext gb;
     int scale = AV_RB16(in);
     int i;
     int s0, s1, s2, d;
@@ -76,13 +75,13 @@ static int adx_decode(ADXContext *c, int16_t *out, int offset,
     if (scale & 0x8000)
         return -1;
 
-    bitstream_init8(&bc, in + 2, BLOCK_SIZE - 2);
+    init_get_bits(&gb, in + 2, (BLOCK_SIZE - 2) * 8);
     out += offset;
     s1 = prev->s1;
     s2 = prev->s2;
     for (i = 0; i < BLOCK_SAMPLES; i++) {
-        d  = bitstream_read_signed(&bc, 4);
-        s0 = ((d << COEFF_BITS) * scale + c->coeff[0] * s1 + c->coeff[1] * s2) >> COEFF_BITS;
+        d  = get_sbits(&gb, 4);
+        s0 = ((d * (1 << COEFF_BITS)) * scale + c->coeff[0] * s1 + c->coeff[1] * s2) >> COEFF_BITS;
         s2 = s1;
         s1 = av_clip_int16(s0);
         *out++ = s1;
@@ -102,6 +101,7 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
     int16_t **samples;
     int samples_offset;
     const uint8_t *buf  = avpkt->data;
+    const uint8_t *buf_end = buf + avpkt->size;
     int num_blocks, ch, ret;
 
     if (c->eof) {
@@ -142,16 +142,14 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = num_blocks * BLOCK_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t **)frame->extended_data;
     samples_offset = 0;
 
     while (num_blocks--) {
         for (ch = 0; ch < c->channels; ch++) {
-            if (adx_decode(c, samples[ch], samples_offset, buf, ch)) {
+            if (buf_end - buf < BLOCK_SIZE || adx_decode(c, samples[ch], samples_offset, buf, ch)) {
                 c->eof = 1;
                 buf = avpkt->data + avpkt->size;
                 break;
@@ -159,9 +157,11 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
             buf_size -= BLOCK_SIZE;
             buf      += BLOCK_SIZE;
         }
-        samples_offset += BLOCK_SAMPLES;
+        if (!c->eof)
+            samples_offset += BLOCK_SAMPLES;
     }
 
+    frame->nb_samples = samples_offset;
     *got_frame_ptr = 1;
 
     return buf - avpkt->data;
diff --git a/libavcodec/adxenc.c b/libavcodec/adxenc.c
index e730811..f1ba591 100644
--- a/libavcodec/adxenc.c
+++ b/libavcodec/adxenc.c
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,14 +43,12 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
     int s0, s1, s2, d;
     int max = 0;
     int min = 0;
-    int data[BLOCK_SAMPLES];
 
     s1 = prev->s1;
     s2 = prev->s2;
     for (i = 0, j = 0; j < 32; i += channels, j++) {
         s0 = wav[i];
         d = ((s0 << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS;
-        data[j] = d;
         if (max < d)
             max = d;
         if (min > d)
@@ -58,10 +56,10 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
         s2 = s1;
         s1 = s0;
     }
-    prev->s1 = s1;
-    prev->s2 = s2;
 
     if (max == 0 && min == 0) {
+        prev->s1 = s1;
+        prev->s2 = s2;
         memset(adx, 0, BLOCK_SIZE);
         return;
     }
@@ -77,8 +75,23 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
     AV_WB16(adx, scale);
 
     init_put_bits(&pb, adx + 2, 16);
-    for (i = 0; i < BLOCK_SAMPLES; i++)
-        put_sbits(&pb, 4, av_clip(data[i] / scale, -8, 7));
+
+    s1 = prev->s1;
+    s2 = prev->s2;
+    for (i = 0, j = 0; j < 32; i += channels, j++) {
+        d = ((wav[i] << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS;
+
+        d = av_clip_intp2(ROUNDED_DIV(d, scale), 3);
+
+        put_sbits(&pb, 4, d);
+
+        s0 = ((d << COEFF_BITS) * scale + c->coeff[0] * s1 + c->coeff[1] * s2) >> COEFF_BITS;
+        s2 = s1;
+        s1 = s0;
+    }
+    prev->s1 = s1;
+    prev->s2 = s2;
+
     flush_put_bits(&pb);
 }
 
@@ -133,10 +146,8 @@ static int adx_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int ch, out_size, ret;
 
     out_size = BLOCK_SIZE * avctx->channels + !c->header_parsed * HEADER_SIZE;
-    if ((ret = ff_alloc_packet(avpkt, out_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     if (!c->header_parsed) {
diff --git a/libavcodec/aic.c b/libavcodec/aic.c
index de9d7de..dc28c83 100644
--- a/libavcodec/aic.c
+++ b/libavcodec/aic.c
@@ -3,30 +3,30 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
-#include "golomb.h"
 #include "internal.h"
+#include "get_bits.h"
+#include "golomb.h"
 #include "idctdsp.h"
 #include "thread.h"
 #include "unary.h"
@@ -153,6 +153,7 @@ typedef struct AICContext {
     int16_t        *data_ptr[NUM_BANDS];
 
     DECLARE_ALIGNED(16, int16_t, block)[64];
+    DECLARE_ALIGNED(16, uint8_t, quant_matrix)[64];
 } AICContext;
 
 static int aic_decode_header(AICContext *ctx, const uint8_t *src, int size)
@@ -191,41 +192,45 @@ static int aic_decode_header(AICContext *ctx, const uint8_t *src, int size)
 #define GET_CODE(val, type, add_bits)                         \
     do {                                                      \
         if (type)                                             \
-            val = get_ue_golomb(bc);                          \
+            val = get_ue_golomb(gb);                          \
         else                                                  \
-            val = get_unary(bc, 1, 31);                       \
+            val = get_unary(gb, 1, 31);                       \
         if (add_bits)                                         \
-            val = (val << add_bits) + bitstream_read(bc, add_bits); \
+            val = (val << add_bits) + get_bits(gb, add_bits); \
     } while (0)
 
-static int aic_decode_coeffs(BitstreamContext *bc, int16_t *dst,
+static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
                              int band, int slice_width, int force_chroma)
 {
     int has_skips, coeff_type, coeff_bits, skip_type, skip_bits;
     const int num_coeffs = aic_num_band_coeffs[band];
     const uint8_t *scan = aic_scan[band | force_chroma];
-    int mb, idx, val;
+    int mb, idx;
+    unsigned val;
+
+    if (get_bits_left(gb) < 5)
+        return AVERROR_INVALIDDATA;
 
-    has_skips  = bitstream_read_bit(bc);
-    coeff_type = bitstream_read_bit(bc);
-    coeff_bits = bitstream_read(bc, 3);
+    has_skips  = get_bits1(gb);
+    coeff_type = get_bits1(gb);
+    coeff_bits = get_bits(gb, 3);
 
     if (has_skips) {
-        skip_type = bitstream_read_bit(bc);
-        skip_bits = bitstream_read(bc, 3);
+        skip_type = get_bits1(gb);
+        skip_bits = get_bits(gb, 3);
 
         for (mb = 0; mb < slice_width; mb++) {
             idx = -1;
             do {
                 GET_CODE(val, skip_type, skip_bits);
-                if (val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 idx += val + 1;
                 if (idx >= num_coeffs)
                     break;
                 GET_CODE(val, coeff_type, coeff_bits);
                 val++;
-                if (val >= 0x10000 || val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 dst[scan[idx]] = val;
             } while (idx < num_coeffs - 1);
@@ -235,7 +240,7 @@ static int aic_decode_coeffs(BitstreamContext *bc, int16_t *dst,
         for (mb = 0; mb < slice_width; mb++) {
             for (idx = 0; idx < num_coeffs; idx++) {
                 GET_CODE(val, coeff_type, coeff_bits);
-                if (val >= 0x10000 || val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 dst[scan[idx]] = val;
             }
@@ -287,7 +292,7 @@ static void recombine_block_il(int16_t *dst, const uint8_t *scan,
     }
 }
 
-static void unquant_block(int16_t *block, int q)
+static void unquant_block(int16_t *block, int q, uint8_t *quant_matrix)
 {
     int i;
 
@@ -295,7 +300,7 @@ static void unquant_block(int16_t *block, int q)
         int val  = (uint16_t)block[i];
         int sign = val & 1;
 
-        block[i] = (((val >> 1) ^ -sign) * q * aic_quant_matrix[i] >> 4)
+        block[i] = (((val >> 1) ^ -sign) * q * quant_matrix[i] >> 4)
                    + sign;
     }
 }
@@ -303,9 +308,11 @@ static void unquant_block(int16_t *block, int q)
 static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
                             const uint8_t *src, int src_size)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int ret, i, mb, blk;
     int slice_width = FFMIN(ctx->slice_width, ctx->mb_width - mb_x);
+    int last_row = mb_y && mb_y == ctx->mb_height - 1;
+    int y_pos, c_pos;
     uint8_t *Y, *C[2];
     uint8_t *dst;
     int16_t *base_y = ctx->data_ptr[COEFF_LUMA];
@@ -314,16 +321,24 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
     int16_t *ext_c  = ctx->data_ptr[COEFF_CHROMA_EXT];
     const int ystride = ctx->frame->linesize[0];
 
-    Y = ctx->frame->data[0] + mb_x * 16 + mb_y * 16 * ystride;
+    if (last_row) {
+        y_pos = (ctx->avctx->height - 16);
+        c_pos = ((ctx->avctx->height+1)/2 - 8);
+    } else {
+        y_pos = mb_y * 16;
+        c_pos = mb_y * 8;
+    }
+
+    Y = ctx->frame->data[0] + mb_x * 16 + y_pos * ystride;
     for (i = 0; i < 2; i++)
         C[i] = ctx->frame->data[i + 1] + mb_x * 8
-               + mb_y * 8 * ctx->frame->linesize[i + 1];
-    bitstream_init8(&bc, src, src_size);
+               + c_pos * ctx->frame->linesize[i + 1];
+    init_get_bits(&gb, src, src_size * 8);
 
     memset(ctx->slice_data, 0,
            sizeof(*ctx->slice_data) * slice_width * AIC_BAND_COEFFS);
     for (i = 0; i < NUM_BANDS; i++)
-        if ((ret = aic_decode_coeffs(&bc, ctx->data_ptr[i],
+        if ((ret = aic_decode_coeffs(&gb, ctx->data_ptr[i],
                                      i, slice_width,
                                      !ctx->interlaced)) < 0)
             return ret;
@@ -336,7 +351,7 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
             else
                 recombine_block_il(ctx->block, ctx->scantable.permutated,
                                    &base_y, &ext_y, blk);
-            unquant_block(ctx->block, ctx->quant);
+            unquant_block(ctx->block, ctx->quant, ctx->quant_matrix);
             ctx->idsp.idct(ctx->block);
 
             if (!ctx->interlaced) {
@@ -353,7 +368,7 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
         for (blk = 0; blk < 2; blk++) {
             recombine_block(ctx->block, ctx->scantable.permutated,
                             &base_c, &ext_c);
-            unquant_block(ctx->block, ctx->quant);
+            unquant_block(ctx->block, ctx->quant, ctx->quant_matrix);
             ctx->idsp.idct(ctx->block);
             ctx->idsp.put_signed_pixels_clamped(ctx->block, C[blk],
                                                 ctx->frame->linesize[blk + 1]);
@@ -439,13 +454,15 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 64; i++)
         scan[i] = i;
     ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, scan);
+    for (i = 0; i < 64; i++)
+        ctx->quant_matrix[ctx->idsp.idct_permutation[i]] = aic_quant_matrix[i];
 
     ctx->mb_width  = FFALIGN(avctx->width,  16) >> 4;
     ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
 
     ctx->num_x_slices = (ctx->mb_width + 15) >> 4;
     ctx->slice_width  = 16;
-    for (i = 1; i < 32; i++) {
+    for (i = 1; i < ctx->mb_width; i++) {
         if (!(ctx->mb_width % i) && (ctx->mb_width / i <= 32)) {
             ctx->slice_width  = ctx->mb_width / i;
             ctx->num_x_slices = i;
@@ -453,7 +470,7 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
         }
     }
 
-    ctx->slice_data = av_malloc(ctx->slice_width * AIC_BAND_COEFFS
+    ctx->slice_data = av_malloc_array(ctx->slice_width, AIC_BAND_COEFFS
                                 * sizeof(*ctx->slice_data));
     if (!ctx->slice_data) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating slice buffer\n");
diff --git a/libavcodec/alac.c b/libavcodec/alac.c
index 0f1c59e..d6b87db 100644
--- a/libavcodec/alac.c
+++ b/libavcodec/alac.c
@@ -2,20 +2,20 @@
  * ALAC (Apple Lossless Audio Codec) decoder
  * Copyright (c) 2005 David Hammerton
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,8 @@
  *  8 bits  compatible version   (0)
  *  8 bits  sample size
  *  8 bits  history mult         (40)
- *  8 bits  initial history      (14)
- *  8 bits  rice param limit     (10)
+ *  8 bits  initial history      (10)
+ *  8 bits  rice param limit     (14)
  *  8 bits  channels
  * 16 bits  maxRun               (255)
  * 32 bits  max coded frame size (0 means unknown)
@@ -48,20 +48,23 @@
 #include <inttypes.h>
 
 #include "libavutil/channel_layout.h"
-
+#include "libavutil/opt.h"
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "bytestream.h"
 #include "internal.h"
-#include "mathops.h"
+#include "thread.h"
 #include "unary.h"
+#include "mathops.h"
 #include "alac_data.h"
+#include "alacdsp.h"
 
 #define ALAC_EXTRADATA_SIZE 36
 
 typedef struct ALACContext {
+    AVClass *class;
     AVCodecContext *avctx;
-    BitstreamContext bc;
+    GetBitContext gb;
     int channels;
 
     int32_t *predict_error_buffer[2];
@@ -73,34 +76,40 @@ typedef struct ALACContext {
     uint8_t  rice_history_mult;
     uint8_t  rice_initial_history;
     uint8_t  rice_limit;
+    int      sample_rate;
 
     int extra_bits;     /**< number of extra bits beyond 16-bit */
     int nb_samples;     /**< number of samples in the current frame */
+
+    int direct_output;
+    int extra_bit_bug;
+
+    ALACDSPContext dsp;
 } ALACContext;
 
-static inline unsigned int decode_scalar(BitstreamContext *bc, int k, int bps)
+static inline unsigned int decode_scalar(GetBitContext *gb, int k, int bps)
 {
-    unsigned int x = get_unary_0_9(bc);
+    unsigned int x = get_unary_0_9(gb);
 
     if (x > 8) { /* RICE THRESHOLD */
         /* use alternative encoding */
-        x = bitstream_read(bc, bps);
+        x = get_bits_long(gb, bps);
     } else if (k != 1) {
-        int extrabits = bitstream_peek(bc, k);
+        int extrabits = show_bits(gb, k);
 
         /* multiply x by 2^k - 1, as part of their strange algorithm */
         x = (x << k) - x;
 
         if (extrabits > 1) {
             x += extrabits - 1;
-            bitstream_skip(bc, k);
+            skip_bits(gb, k);
         } else
-            bitstream_skip(bc, k - 1);
+            skip_bits(gb, k - 1);
     }
     return x;
 }
 
-static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
+static int rice_decompress(ALACContext *alac, int32_t *output_buffer,
                             int nb_samples, int bps, int rice_history_mult)
 {
     int i;
@@ -111,10 +120,13 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
         int k;
         unsigned int x;
 
+        if(get_bits_left(&alac->gb) <= 0)
+            return AVERROR_INVALIDDATA;
+
         /* calculate rice param and decode next value */
         k = av_log2((history >> 9) + 3);
         k = FFMIN(k, alac->rice_limit);
-        x = decode_scalar(&alac->bc, k, bps);
+        x = decode_scalar(&alac->gb, k, bps);
         x += sign_modifier;
         sign_modifier = 0;
         output_buffer[i] = (x >> 1) ^ -(x & 1);
@@ -133,7 +145,7 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
             /* calculate rice param and decode block size */
             k = 7 - av_log2(history) + ((history + 16) >> 6);
             k = FFMIN(k, alac->rice_limit);
-            block_size = decode_scalar(&alac->bc, k, 16);
+            block_size = decode_scalar(&alac->gb, k, 16);
 
             if (block_size > 0) {
                 if (block_size >= nb_samples - i) {
@@ -151,6 +163,7 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
             history = 0;
         }
     }
+    return 0;
 }
 
 static inline int sign_only(int v)
@@ -187,7 +200,7 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
     }
 
     /* read warm-up samples */
-    for (i = 1; i <= lpc_order; i++)
+    for (i = 1; i <= lpc_order && i < nb_samples; i++)
         buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps);
 
     /* NOTE: 4 and 8 are very common cases that could be optimized. */
@@ -221,35 +234,6 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
     }
 }
 
-static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
-                               int decorr_shift, int decorr_left_weight)
-{
-    int i;
-
-    for (i = 0; i < nb_samples; i++) {
-        int32_t a, b;
-
-        a = buffer[0][i];
-        b = buffer[1][i];
-
-        a -= (b * decorr_left_weight) >> decorr_shift;
-        b += a;
-
-        buffer[0][i] = b;
-        buffer[1][i] = a;
-    }
-}
-
-static void append_extra_bits(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
-                              int extra_bits, int channels, int nb_samples)
-{
-    int i, ch;
-
-    for (ch = 0; ch < channels; ch++)
-        for (i = 0; i < nb_samples; i++)
-            buffer[ch][i] = (buffer[ch][i] << extra_bits) | extra_bits_buffer[ch][i];
-}
-
 static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
                           int channels)
 {
@@ -258,24 +242,24 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
     uint32_t output_samples;
     int i, ch;
 
-    bitstream_skip(&alac->bc, 4);  /* element instance tag */
-    bitstream_skip(&alac->bc, 12); /* unused header bits */
+    skip_bits(&alac->gb, 4);  /* element instance tag */
+    skip_bits(&alac->gb, 12); /* unused header bits */
 
     /* the number of output samples is stored in the frame */
-    has_size = bitstream_read_bit(&alac->bc);
+    has_size = get_bits1(&alac->gb);
 
-    alac->extra_bits = bitstream_read(&alac->bc, 2) << 3;
+    alac->extra_bits = get_bits(&alac->gb, 2) << 3;
     bps = alac->sample_size - alac->extra_bits + channels - 1;
-    if (bps > 32) {
+    if (bps > 32U) {
         avpriv_report_missing_feature(avctx, "bps %d", bps);
         return AVERROR_PATCHWELCOME;
     }
 
     /* whether the frame is compressed */
-    is_compressed = !bitstream_read_bit(&alac->bc);
+    is_compressed = !get_bits1(&alac->gb);
 
     if (has_size)
-        output_samples = bitstream_read(&alac->bc, 32);
+        output_samples = get_bits_long(&alac->gb, 32);
     else
         output_samples = alac->max_samples_per_frame;
     if (!output_samples || output_samples > alac->max_samples_per_frame) {
@@ -284,19 +268,18 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
         return AVERROR_INVALIDDATA;
     }
     if (!alac->nb_samples) {
+        ThreadFrame tframe = { .f = frame };
         /* get output buffer */
         frame->nb_samples = output_samples;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
             return ret;
-        }
     } else if (output_samples != alac->nb_samples) {
         av_log(avctx, AV_LOG_ERROR, "sample count mismatch: %"PRIu32" != %d\n",
                output_samples, alac->nb_samples);
         return AVERROR_INVALIDDATA;
     }
     alac->nb_samples = output_samples;
-    if (alac->sample_size > 16) {
+    if (alac->direct_output) {
         for (ch = 0; ch < channels; ch++)
             alac->output_samples_buffer[ch] = (int32_t *)frame->extended_data[ch_index + ch];
     }
@@ -314,33 +297,37 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
             return AVERROR(ENOSYS);
         }
 
-        decorr_shift       = bitstream_read(&alac->bc, 8);
-        decorr_left_weight = bitstream_read(&alac->bc, 8);
+        decorr_shift       = get_bits(&alac->gb, 8);
+        decorr_left_weight = get_bits(&alac->gb, 8);
 
         for (ch = 0; ch < channels; ch++) {
-            prediction_type[ch]   = bitstream_read(&alac->bc, 4);
-            lpc_quant[ch]         = bitstream_read(&alac->bc, 4);
-            rice_history_mult[ch] = bitstream_read(&alac->bc, 3);
-            lpc_order[ch]         = bitstream_read(&alac->bc, 5);
+            prediction_type[ch]   = get_bits(&alac->gb, 4);
+            lpc_quant[ch]         = get_bits(&alac->gb, 4);
+            rice_history_mult[ch] = get_bits(&alac->gb, 3);
+            lpc_order[ch]         = get_bits(&alac->gb, 5);
 
             if (lpc_order[ch] >= alac->max_samples_per_frame)
                 return AVERROR_INVALIDDATA;
 
             /* read the predictor table */
             for (i = lpc_order[ch] - 1; i >= 0; i--)
-                lpc_coefs[ch][i] = bitstream_read_signed(&alac->bc, 16);
+                lpc_coefs[ch][i] = get_sbits(&alac->gb, 16);
         }
 
         if (alac->extra_bits) {
             for (i = 0; i < alac->nb_samples; i++) {
+                if(get_bits_left(&alac->gb) <= 0)
+                    return AVERROR_INVALIDDATA;
                 for (ch = 0; ch < channels; ch++)
-                    alac->extra_bits_buffer[ch][i] = bitstream_read(&alac->bc, alac->extra_bits);
+                    alac->extra_bits_buffer[ch][i] = get_bits(&alac->gb, alac->extra_bits);
             }
         }
         for (ch = 0; ch < channels; ch++) {
-            rice_decompress(alac, alac->predict_error_buffer[ch],
+            int ret=rice_decompress(alac, alac->predict_error_buffer[ch],
                             alac->nb_samples, bps,
                             rice_history_mult[ch] * alac->rice_history_mult / 4);
+            if(ret<0)
+                return ret;
 
             /* adaptive FIR filter */
             if (prediction_type[ch] == 15) {
@@ -365,9 +352,11 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
     } else {
         /* not compressed, easy case */
         for (i = 0; i < alac->nb_samples; i++) {
+            if(get_bits_left(&alac->gb) <= 0)
+                return AVERROR_INVALIDDATA;
             for (ch = 0; ch < channels; ch++) {
                 alac->output_samples_buffer[ch][i] =
-                         bitstream_read_signed(&alac->bc, alac->sample_size);
+                         get_sbits_long(&alac->gb, alac->sample_size);
             }
         }
         alac->extra_bits   = 0;
@@ -375,14 +364,24 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
         decorr_left_weight = 0;
     }
 
-    if (channels == 2 && decorr_left_weight) {
-        decorrelate_stereo(alac->output_samples_buffer, alac->nb_samples,
-                           decorr_shift, decorr_left_weight);
-    }
+    if (channels == 2) {
+        if (alac->extra_bits && alac->extra_bit_bug) {
+            alac->dsp.append_extra_bits[1](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                           alac->extra_bits, channels, alac->nb_samples);
+        }
 
-    if (alac->extra_bits) {
-        append_extra_bits(alac->output_samples_buffer, alac->extra_bits_buffer,
-                          alac->extra_bits, channels, alac->nb_samples);
+        if (decorr_left_weight) {
+            alac->dsp.decorrelate_stereo(alac->output_samples_buffer, alac->nb_samples,
+                                         decorr_shift, decorr_left_weight);
+        }
+
+        if (alac->extra_bits && !alac->extra_bit_bug) {
+            alac->dsp.append_extra_bits[1](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                           alac->extra_bits, channels, alac->nb_samples);
+        }
+    } else if (alac->extra_bits) {
+        alac->dsp.append_extra_bits[0](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                       alac->extra_bits, channels, alac->nb_samples);
     }
 
     switch(alac->sample_size) {
@@ -393,6 +392,12 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
                 *outbuffer++ = alac->output_samples_buffer[ch][i];
         }}
         break;
+    case 20: {
+        for (ch = 0; ch < channels; ch++) {
+            for (i = 0; i < alac->nb_samples; i++)
+                alac->output_samples_buffer[ch][i] <<= 12;
+        }}
+        break;
     case 24: {
         for (ch = 0; ch < channels; ch++) {
             for (i = 0; i < alac->nb_samples; i++)
@@ -413,13 +418,14 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
     int channels;
     int ch, ret, got_end;
 
-    bitstream_init8(&alac->bc, avpkt->data, avpkt->size);
+    if ((ret = init_get_bits8(&alac->gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     got_end = 0;
     alac->nb_samples = 0;
     ch = 0;
-    while (bitstream_bits_left(&alac->bc) >= 3) {
-        element = bitstream_read(&alac->bc, 3);
+    while (get_bits_left(&alac->gb) >= 3) {
+        element = get_bits(&alac->gb, 3);
         if (element == TYPE_END) {
             got_end = 1;
             break;
@@ -439,7 +445,7 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
         ret = decode_element(avctx, frame,
                              ff_alac_channel_layout_offsets[alac->channels - 1][ch],
                              channels);
-        if (ret < 0 && bitstream_bits_left(&alac->bc))
+        if (ret < 0 && get_bits_left(&alac->gb))
             return ret;
 
         ch += channels;
@@ -448,17 +454,16 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR, "no end tag found. incomplete packet.\n");
         return AVERROR_INVALIDDATA;
     }
-    if (!alac->nb_samples) {
-        av_log(avctx, AV_LOG_ERROR, "No decodable data in the packet\n");
-        return AVERROR_INVALIDDATA;
-    }
 
-    if (avpkt->size * 8 - bitstream_tell(&alac->bc) > 8) {
+    if (avpkt->size * 8 - get_bits_count(&alac->gb) > 8) {
         av_log(avctx, AV_LOG_ERROR, "Error : %d bits left\n",
-               avpkt->size * 8 - bitstream_tell(&alac->bc));
+               avpkt->size * 8 - get_bits_count(&alac->gb));
     }
 
-    *got_frame_ptr = 1;
+    if (alac->channels == ch && alac->nb_samples)
+        *got_frame_ptr = 1;
+    else
+        av_log(avctx, AV_LOG_WARNING, "Failed to decode all channels\n");
 
     return avpkt->size;
 }
@@ -470,7 +475,7 @@ static av_cold int alac_decode_close(AVCodecContext *avctx)
     int ch;
     for (ch = 0; ch < FFMIN(alac->channels, 2); ch++) {
         av_freep(&alac->predict_error_buffer[ch]);
-        if (alac->sample_size == 16)
+        if (!alac->direct_output)
             av_freep(&alac->output_samples_buffer[ch]);
         av_freep(&alac->extra_bits_buffer[ch]);
     }
@@ -483,17 +488,24 @@ static int allocate_buffers(ALACContext *alac)
     int ch;
     int buf_size = alac->max_samples_per_frame * sizeof(int32_t);
 
+    for (ch = 0; ch < 2; ch++) {
+        alac->predict_error_buffer[ch]  = NULL;
+        alac->output_samples_buffer[ch] = NULL;
+        alac->extra_bits_buffer[ch]     = NULL;
+    }
+
     for (ch = 0; ch < FFMIN(alac->channels, 2); ch++) {
         FF_ALLOC_OR_GOTO(alac->avctx, alac->predict_error_buffer[ch],
                          buf_size, buf_alloc_fail);
 
-        if (alac->sample_size == 16) {
+        alac->direct_output = alac->sample_size > 16;
+        if (!alac->direct_output) {
             FF_ALLOC_OR_GOTO(alac->avctx, alac->output_samples_buffer[ch],
-                             buf_size, buf_alloc_fail);
+                             buf_size + AV_INPUT_BUFFER_PADDING_SIZE, buf_alloc_fail);
         }
 
         FF_ALLOC_OR_GOTO(alac->avctx, alac->extra_bits_buffer[ch],
-                         buf_size, buf_alloc_fail);
+                         buf_size + AV_INPUT_BUFFER_PADDING_SIZE, buf_alloc_fail);
     }
     return 0;
 buf_alloc_fail:
@@ -512,7 +524,7 @@ static int alac_set_info(ALACContext *alac)
 
     alac->max_samples_per_frame = bytestream2_get_be32u(&gb);
     if (!alac->max_samples_per_frame ||
-        alac->max_samples_per_frame > INT_MAX / sizeof(int32_t)) {
+        alac->max_samples_per_frame > 4096 * 4096) {
         av_log(alac->avctx, AV_LOG_ERROR,
                "max samples per frame invalid: %"PRIu32"\n",
                alac->max_samples_per_frame);
@@ -527,7 +539,7 @@ static int alac_set_info(ALACContext *alac)
     bytestream2_get_be16u(&gb); // maxRun
     bytestream2_get_be32u(&gb); // max coded frame size
     bytestream2_get_be32u(&gb); // average bitrate
-    bytestream2_get_be32u(&gb); // samplerate
+    alac->sample_rate          = bytestream2_get_be32u(&gb);
 
     return 0;
 }
@@ -540,17 +552,18 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
 
     /* initialize from the extradata */
     if (alac->avctx->extradata_size < ALAC_EXTRADATA_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "alac: extradata is too small\n");
+        av_log(avctx, AV_LOG_ERROR, "extradata is too small\n");
         return AVERROR_INVALIDDATA;
     }
-    if (alac_set_info(alac)) {
-        av_log(avctx, AV_LOG_ERROR, "alac: set_info failed\n");
-        return -1;
+    if ((ret = alac_set_info(alac)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "set_info failed\n");
+        return ret;
     }
 
     switch (alac->sample_size) {
     case 16: avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
              break;
+    case 20:
     case 24:
     case 32: avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
              break;
@@ -558,6 +571,7 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
              return AVERROR_PATCHWELCOME;
     }
     avctx->bits_per_raw_sample = alac->sample_size;
+    avctx->sample_rate         = alac->sample_rate;
 
     if (alac->channels < 1) {
         av_log(avctx, AV_LOG_WARNING, "Invalid channel count\n");
@@ -568,7 +582,7 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
         else
             avctx->channels = alac->channels;
     }
-    if (avctx->channels > ALAC_MAX_CHANNELS) {
+    if (avctx->channels > ALAC_MAX_CHANNELS || avctx->channels <= 0 ) {
         avpriv_report_missing_feature(avctx, "Channel count %d",
                                       avctx->channels);
         return AVERROR_PATCHWELCOME;
@@ -580,9 +594,34 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
         return ret;
     }
 
+    ff_alacdsp_init(&alac->dsp);
+
     return 0;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    ALACContext *alac = avctx->priv_data;
+    alac->avctx = avctx;
+    return allocate_buffers(alac);
+}
+#endif
+
+static const AVOption options[] = {
+    { "extra_bits_bug", "Force non-standard decoding process",
+      offsetof(ALACContext, extra_bit_bug), AV_OPT_TYPE_BOOL, { .i64 = 0 },
+      0, 1, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM },
+    { NULL },
+};
+
+static const AVClass alac_class = {
+    .class_name = "alac",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_alac_decoder = {
     .name           = "alac",
     .long_name      = NULL_IF_CONFIG_SMALL("ALAC (Apple Lossless Audio Codec)"),
@@ -592,5 +631,7 @@ AVCodec ff_alac_decoder = {
     .init           = alac_decode_init,
     .close          = alac_decode_close,
     .decode         = alac_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &alac_class
 };
diff --git a/libavcodec/alac_data.c b/libavcodec/alac_data.c
index 9e13119..0bcb06c 100644
--- a/libavcodec/alac_data.c
+++ b/libavcodec/alac_data.c
@@ -1,20 +1,20 @@
 /*
  * ALAC encoder and decoder common data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/alac_data.h b/libavcodec/alac_data.h
index ebb1f33..650d6dc 100644
--- a/libavcodec/alac_data.h
+++ b/libavcodec/alac_data.h
@@ -1,20 +1,20 @@
 /*
  * ALAC encoder and decoder common data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/alacdsp.c b/libavcodec/alacdsp.c
new file mode 100644
index 0000000..ecbaedb
--- /dev/null
+++ b/libavcodec/alacdsp.c
@@ -0,0 +1,63 @@
+/*
+ * ALAC (Apple Lossless Audio Codec) decoder
+ * Copyright (c) 2005 David Hammerton
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "alacdsp.h"
+#include "config.h"
+
+static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
+                               int decorr_shift, int decorr_left_weight)
+{
+    int i;
+
+    for (i = 0; i < nb_samples; i++) {
+        int32_t a, b;
+
+        a = buffer[0][i];
+        b = buffer[1][i];
+
+        a -= (b * decorr_left_weight) >> decorr_shift;
+        b += a;
+
+        buffer[0][i] = b;
+        buffer[1][i] = a;
+    }
+}
+
+static void append_extra_bits(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                              int extra_bits, int channels, int nb_samples)
+{
+    int i, ch;
+
+    for (ch = 0; ch < channels; ch++)
+        for (i = 0; i < nb_samples; i++)
+            buffer[ch][i] = (buffer[ch][i] << extra_bits) | extra_bits_buffer[ch][i];
+}
+
+av_cold void ff_alacdsp_init(ALACDSPContext *c)
+{
+    c->decorrelate_stereo   = decorrelate_stereo;
+    c->append_extra_bits[0] =
+    c->append_extra_bits[1] = append_extra_bits;
+
+    if (ARCH_X86)
+        ff_alacdsp_init_x86(c);
+}
diff --git a/libavcodec/alacdsp.h b/libavcodec/alacdsp.h
new file mode 100644
index 0000000..f8b56dd
--- /dev/null
+++ b/libavcodec/alacdsp.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALACDSP_H
+#define AVCODEC_ALACDSP_H
+
+#include <stdint.h>
+
+typedef struct ALACDSPContext {
+    void (*decorrelate_stereo)(int32_t *buffer[2], int nb_samples,
+                               int decorr_shift, int decorr_left_weight);
+    void (*append_extra_bits[2])(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                 int extra_bits, int channels, int nb_samples);
+} ALACDSPContext;
+
+void ff_alacdsp_init(ALACDSPContext *c);
+void ff_alacdsp_init_x86(ALACDSPContext *c);
+
+#endif /* AVCODEC_ALACDSP_H */
diff --git a/libavcodec/alacenc.c b/libavcodec/alacenc.c
index d921fa1..804cc7b 100644
--- a/libavcodec/alacenc.c
+++ b/libavcodec/alacenc.c
@@ -2,20 +2,20 @@
  * ALAC audio encoder
  * Copyright (c) 2008  Jaikrishnan Menon <realityman@gmx.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,7 @@
 #define DEFAULT_MAX_PRED_ORDER    6
 #define DEFAULT_MIN_PRED_ORDER    4
 #define ALAC_MAX_LPC_PRECISION    9
+#define ALAC_MIN_LPC_SHIFT        0
 #define ALAC_MAX_LPC_SHIFT        9
 
 #define ALAC_CHMODE_LEFT_RIGHT    0
@@ -70,7 +71,7 @@ typedef struct AlacEncodeContext {
     int write_sample_size;
     int extra_bits;
     int32_t sample_buf[2][DEFAULT_FRAME_SIZE];
-    int32_t predictor_buf[DEFAULT_FRAME_SIZE];
+    int32_t predictor_buf[2][DEFAULT_FRAME_SIZE];
     int interlacing_shift;
     int interlacing_leftweight;
     PutBitContext pbctx;
@@ -171,7 +172,8 @@ static void calc_predictor_params(AlacEncodeContext *s, int ch)
                                       s->max_prediction_order,
                                       ALAC_MAX_LPC_PRECISION, coefs, shift,
                                       FF_LPC_TYPE_LEVINSON, 0,
-                                      ORDER_METHOD_EST, ALAC_MAX_LPC_SHIFT, 1);
+                                      ORDER_METHOD_EST, ALAC_MIN_LPC_SHIFT,
+                                      ALAC_MAX_LPC_SHIFT, 1);
 
         s->lpc[ch].lpc_order = opt_order;
         s->lpc[ch].lpc_quant = shift[opt_order-1];
@@ -256,13 +258,14 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
 {
     int i;
     AlacLPCContext lpc = s->lpc[ch];
+    int32_t *residual = s->predictor_buf[ch];
 
     if (lpc.lpc_order == 31) {
-        s->predictor_buf[0] = s->sample_buf[ch][0];
+        residual[0] = s->sample_buf[ch][0];
 
         for (i = 1; i < s->frame_size; i++) {
-            s->predictor_buf[i] = s->sample_buf[ch][i    ] -
-                                  s->sample_buf[ch][i - 1];
+            residual[i] = s->sample_buf[ch][i    ] -
+                          s->sample_buf[ch][i - 1];
         }
 
         return;
@@ -272,12 +275,11 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
 
     if (lpc.lpc_order > 0) {
         int32_t *samples  = s->sample_buf[ch];
-        int32_t *residual = s->predictor_buf;
 
         // generate warm-up samples
         residual[0] = samples[0];
         for (i = 1; i <= lpc.lpc_order; i++)
-            residual[i] = samples[i] - samples[i-1];
+            residual[i] = sign_extend(samples[i] - samples[i-1], s->write_sample_size);
 
         // perform lpc on remaining samples
         for (i = lpc.lpc_order + 1; i < s->frame_size; i++) {
@@ -316,11 +318,11 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
     }
 }
 
-static void alac_entropy_coder(AlacEncodeContext *s)
+static void alac_entropy_coder(AlacEncodeContext *s, int ch)
 {
     unsigned int history = s->rc.initial_history;
     int sign_modifier = 0, i, k;
-    int32_t *samples = s->predictor_buf;
+    int32_t *samples = s->predictor_buf[ch];
 
     for (i = 0; i < s->frame_size;) {
         int x;
@@ -397,6 +399,19 @@ static void write_element(AlacEncodeContext *s,
         init_sample_buffers(s, channels, samples);
         write_element_header(s, element, instance);
 
+        // extract extra bits if needed
+        if (s->extra_bits) {
+            uint32_t mask = (1 << s->extra_bits) - 1;
+            for (j = 0; j < channels; j++) {
+                int32_t *extra = s->predictor_buf[j];
+                int32_t *smp   = s->sample_buf[j];
+                for (i = 0; i < s->frame_size; i++) {
+                    extra[i] = smp[i] & mask;
+                    smp[i] >>= s->extra_bits;
+                }
+            }
+        }
+
         if (channels == 2)
             alac_stereo_decorrelation(s);
         else
@@ -419,11 +434,9 @@ static void write_element(AlacEncodeContext *s,
 
         // write extra bits if needed
         if (s->extra_bits) {
-            uint32_t mask = (1 << s->extra_bits) - 1;
             for (i = 0; i < s->frame_size; i++) {
                 for (j = 0; j < channels; j++) {
-                    put_bits(pb, s->extra_bits, s->sample_buf[j][i] & mask);
-                    s->sample_buf[j][i] >>= s->extra_bits;
+                    put_bits(pb, s->extra_bits, s->predictor_buf[j][i]);
                 }
             }
         }
@@ -435,10 +448,11 @@ static void write_element(AlacEncodeContext *s,
             // TODO: determine when this will actually help. for now it's not used.
             if (prediction_type == 15) {
                 // 2nd pass 1st order filter
+                int32_t *residual = s->predictor_buf[i];
                 for (j = s->frame_size - 1; j > 0; j--)
-                    s->predictor_buf[j] -= s->predictor_buf[j - 1];
+                    residual[j] -= residual[j - 1];
             }
-            alac_entropy_coder(s);
+            alac_entropy_coder(s, i);
         }
     }
 }
@@ -611,10 +625,8 @@ static int alac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     else
         max_frame_size = s->max_coded_frame_size;
 
-    if ((ret = ff_alloc_packet(avpkt, 2 * max_frame_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 4 * max_frame_size, 0)) < 0)
         return ret;
-    }
 
     /* use verbatim mode for compression_level 0 */
     if (s->compression_level) {
diff --git a/libavcodec/aliaspixdec.c b/libavcodec/aliaspixdec.c
index 8c18924..087b18f 100644
--- a/libavcodec/aliaspixdec.c
+++ b/libavcodec/aliaspixdec.c
@@ -2,20 +2,20 @@
  * Alias PIX image decoder
  * Copyright (C) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aliaspixenc.c b/libavcodec/aliaspixenc.c
index 63016af..a9ba00c 100644
--- a/libavcodec/aliaspixenc.c
+++ b/libavcodec/aliaspixenc.c
@@ -2,20 +2,20 @@
  * Alias PIX image encoder
  * Copyright (C) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     length = ALIAS_HEADER_SIZE + 4 * width * height; // max possible
-    if ((ret = ff_alloc_packet(pkt, length)) < 0) {
+    if ((ret = ff_alloc_packet2(avctx, pkt, length, ALIAS_HEADER_SIZE + height*2)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", length);
         return ret;
     }
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index f38ea19..b26aeca 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -2,20 +2,20 @@
  * Provide registration of all codecs, parsers and bitstream filters for libavcodec.
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,490 +25,893 @@
  */
 
 #include "config.h"
+#include "libavutil/thread.h"
 #include "avcodec.h"
 #include "version.h"
 
-#define REGISTER_ENCODER(X, x)                                          \
-    {                                                                   \
-        extern AVCodec ff_##x##_encoder;                                \
-        if (CONFIG_##X##_ENCODER)                                       \
-            avcodec_register(&ff_##x##_encoder);                        \
-    }
+extern AVCodec ff_a64multi_encoder;
+extern AVCodec ff_a64multi5_encoder;
+extern AVCodec ff_aasc_decoder;
+extern AVCodec ff_aic_decoder;
+extern AVCodec ff_alias_pix_encoder;
+extern AVCodec ff_alias_pix_decoder;
+extern AVCodec ff_amv_encoder;
+extern AVCodec ff_amv_decoder;
+extern AVCodec ff_anm_decoder;
+extern AVCodec ff_ansi_decoder;
+extern AVCodec ff_apng_encoder;
+extern AVCodec ff_apng_decoder;
+extern AVCodec ff_arbc_decoder;
+extern AVCodec ff_asv1_encoder;
+extern AVCodec ff_asv1_decoder;
+extern AVCodec ff_asv2_encoder;
+extern AVCodec ff_asv2_decoder;
+extern AVCodec ff_aura_decoder;
+extern AVCodec ff_aura2_decoder;
+extern AVCodec ff_avrp_encoder;
+extern AVCodec ff_avrp_decoder;
+extern AVCodec ff_avrn_decoder;
+extern AVCodec ff_avs_decoder;
+extern AVCodec ff_avui_encoder;
+extern AVCodec ff_avui_decoder;
+extern AVCodec ff_ayuv_encoder;
+extern AVCodec ff_ayuv_decoder;
+extern AVCodec ff_bethsoftvid_decoder;
+extern AVCodec ff_bfi_decoder;
+extern AVCodec ff_bink_decoder;
+extern AVCodec ff_bitpacked_decoder;
+extern AVCodec ff_bmp_encoder;
+extern AVCodec ff_bmp_decoder;
+extern AVCodec ff_bmv_video_decoder;
+extern AVCodec ff_brender_pix_decoder;
+extern AVCodec ff_c93_decoder;
+extern AVCodec ff_cavs_decoder;
+extern AVCodec ff_cdgraphics_decoder;
+extern AVCodec ff_cdxl_decoder;
+extern AVCodec ff_cfhd_decoder;
+extern AVCodec ff_cinepak_encoder;
+extern AVCodec ff_cinepak_decoder;
+extern AVCodec ff_clearvideo_decoder;
+extern AVCodec ff_cljr_encoder;
+extern AVCodec ff_cljr_decoder;
+extern AVCodec ff_cllc_decoder;
+extern AVCodec ff_comfortnoise_encoder;
+extern AVCodec ff_comfortnoise_decoder;
+extern AVCodec ff_cpia_decoder;
+extern AVCodec ff_cscd_decoder;
+extern AVCodec ff_cyuv_decoder;
+extern AVCodec ff_dds_decoder;
+extern AVCodec ff_dfa_decoder;
+extern AVCodec ff_dirac_decoder;
+extern AVCodec ff_dnxhd_encoder;
+extern AVCodec ff_dnxhd_decoder;
+extern AVCodec ff_dpx_encoder;
+extern AVCodec ff_dpx_decoder;
+extern AVCodec ff_dsicinvideo_decoder;
+extern AVCodec ff_dvaudio_decoder;
+extern AVCodec ff_dvvideo_encoder;
+extern AVCodec ff_dvvideo_decoder;
+extern AVCodec ff_dxa_decoder;
+extern AVCodec ff_dxtory_decoder;
+extern AVCodec ff_dxv_decoder;
+extern AVCodec ff_eacmv_decoder;
+extern AVCodec ff_eamad_decoder;
+extern AVCodec ff_eatgq_decoder;
+extern AVCodec ff_eatgv_decoder;
+extern AVCodec ff_eatqi_decoder;
+extern AVCodec ff_eightbps_decoder;
+extern AVCodec ff_eightsvx_exp_decoder;
+extern AVCodec ff_eightsvx_fib_decoder;
+extern AVCodec ff_escape124_decoder;
+extern AVCodec ff_escape130_decoder;
+extern AVCodec ff_exr_decoder;
+extern AVCodec ff_ffv1_encoder;
+extern AVCodec ff_ffv1_decoder;
+extern AVCodec ff_ffvhuff_encoder;
+extern AVCodec ff_ffvhuff_decoder;
+extern AVCodec ff_fic_decoder;
+extern AVCodec ff_fits_encoder;
+extern AVCodec ff_fits_decoder;
+extern AVCodec ff_flashsv_encoder;
+extern AVCodec ff_flashsv_decoder;
+extern AVCodec ff_flashsv2_encoder;
+extern AVCodec ff_flashsv2_decoder;
+extern AVCodec ff_flic_decoder;
+extern AVCodec ff_flv_encoder;
+extern AVCodec ff_flv_decoder;
+extern AVCodec ff_fmvc_decoder;
+extern AVCodec ff_fourxm_decoder;
+extern AVCodec ff_fraps_decoder;
+extern AVCodec ff_frwu_decoder;
+extern AVCodec ff_g2m_decoder;
+extern AVCodec ff_gdv_decoder;
+extern AVCodec ff_gif_encoder;
+extern AVCodec ff_gif_decoder;
+extern AVCodec ff_h261_encoder;
+extern AVCodec ff_h261_decoder;
+extern AVCodec ff_h263_encoder;
+extern AVCodec ff_h263_decoder;
+extern AVCodec ff_h263i_decoder;
+extern AVCodec ff_h263p_encoder;
+extern AVCodec ff_h263p_decoder;
+extern AVCodec ff_h263_v4l2m2m_decoder;
+extern AVCodec ff_h264_decoder;
+extern AVCodec ff_h264_crystalhd_decoder;
+extern AVCodec ff_h264_v4l2m2m_decoder;
+extern AVCodec ff_h264_mediacodec_decoder;
+extern AVCodec ff_h264_mmal_decoder;
+extern AVCodec ff_h264_qsv_decoder;
+extern AVCodec ff_h264_rkmpp_decoder;
+extern AVCodec ff_hap_encoder;
+extern AVCodec ff_hap_decoder;
+extern AVCodec ff_hevc_decoder;
+extern AVCodec ff_hevc_qsv_decoder;
+extern AVCodec ff_hevc_rkmpp_decoder;
+extern AVCodec ff_hevc_v4l2m2m_decoder;
+extern AVCodec ff_hnm4_video_decoder;
+extern AVCodec ff_hq_hqa_decoder;
+extern AVCodec ff_hqx_decoder;
+extern AVCodec ff_huffyuv_encoder;
+extern AVCodec ff_huffyuv_decoder;
+extern AVCodec ff_hymt_decoder;
+extern AVCodec ff_idcin_decoder;
+extern AVCodec ff_iff_ilbm_decoder;
+extern AVCodec ff_imm4_decoder;
+extern AVCodec ff_indeo2_decoder;
+extern AVCodec ff_indeo3_decoder;
+extern AVCodec ff_indeo4_decoder;
+extern AVCodec ff_indeo5_decoder;
+extern AVCodec ff_interplay_video_decoder;
+extern AVCodec ff_jpeg2000_encoder;
+extern AVCodec ff_jpeg2000_decoder;
+extern AVCodec ff_jpegls_encoder;
+extern AVCodec ff_jpegls_decoder;
+extern AVCodec ff_jv_decoder;
+extern AVCodec ff_kgv1_decoder;
+extern AVCodec ff_kmvc_decoder;
+extern AVCodec ff_lagarith_decoder;
+extern AVCodec ff_ljpeg_encoder;
+extern AVCodec ff_loco_decoder;
+extern AVCodec ff_m101_decoder;
+extern AVCodec ff_magicyuv_encoder;
+extern AVCodec ff_magicyuv_decoder;
+extern AVCodec ff_mdec_decoder;
+extern AVCodec ff_mimic_decoder;
+extern AVCodec ff_mjpeg_encoder;
+extern AVCodec ff_mjpeg_decoder;
+extern AVCodec ff_mjpegb_decoder;
+extern AVCodec ff_mmvideo_decoder;
+extern AVCodec ff_motionpixels_decoder;
+extern AVCodec ff_mpeg1video_encoder;
+extern AVCodec ff_mpeg1video_decoder;
+extern AVCodec ff_mpeg2video_encoder;
+extern AVCodec ff_mpeg2video_decoder;
+extern AVCodec ff_mpeg4_encoder;
+extern AVCodec ff_mpeg4_decoder;
+extern AVCodec ff_mpeg4_crystalhd_decoder;
+extern AVCodec ff_mpeg4_v4l2m2m_decoder;
+extern AVCodec ff_mpeg4_mmal_decoder;
+extern AVCodec ff_mpegvideo_decoder;
+extern AVCodec ff_mpeg1_v4l2m2m_decoder;
+extern AVCodec ff_mpeg2_mmal_decoder;
+extern AVCodec ff_mpeg2_crystalhd_decoder;
+extern AVCodec ff_mpeg2_v4l2m2m_decoder;
+extern AVCodec ff_mpeg2_qsv_decoder;
+extern AVCodec ff_mpeg2_mediacodec_decoder;
+extern AVCodec ff_msa1_decoder;
+extern AVCodec ff_mscc_decoder;
+extern AVCodec ff_msmpeg4v1_decoder;
+extern AVCodec ff_msmpeg4v2_encoder;
+extern AVCodec ff_msmpeg4v2_decoder;
+extern AVCodec ff_msmpeg4v3_encoder;
+extern AVCodec ff_msmpeg4v3_decoder;
+extern AVCodec ff_msmpeg4_crystalhd_decoder;
+extern AVCodec ff_msrle_decoder;
+extern AVCodec ff_mss1_decoder;
+extern AVCodec ff_mss2_decoder;
+extern AVCodec ff_msvideo1_encoder;
+extern AVCodec ff_msvideo1_decoder;
+extern AVCodec ff_mszh_decoder;
+extern AVCodec ff_mts2_decoder;
+extern AVCodec ff_mvc1_decoder;
+extern AVCodec ff_mvc2_decoder;
+extern AVCodec ff_mwsc_decoder;
+extern AVCodec ff_mxpeg_decoder;
+extern AVCodec ff_nuv_decoder;
+extern AVCodec ff_paf_video_decoder;
+extern AVCodec ff_pam_encoder;
+extern AVCodec ff_pam_decoder;
+extern AVCodec ff_pbm_encoder;
+extern AVCodec ff_pbm_decoder;
+extern AVCodec ff_pcx_encoder;
+extern AVCodec ff_pcx_decoder;
+extern AVCodec ff_pgm_encoder;
+extern AVCodec ff_pgm_decoder;
+extern AVCodec ff_pgmyuv_encoder;
+extern AVCodec ff_pgmyuv_decoder;
+extern AVCodec ff_pictor_decoder;
+extern AVCodec ff_pixlet_decoder;
+extern AVCodec ff_png_encoder;
+extern AVCodec ff_png_decoder;
+extern AVCodec ff_ppm_encoder;
+extern AVCodec ff_ppm_decoder;
+extern AVCodec ff_prores_encoder;
+extern AVCodec ff_prores_decoder;
+extern AVCodec ff_prores_aw_encoder;
+extern AVCodec ff_prores_ks_encoder;
+extern AVCodec ff_prosumer_decoder;
+extern AVCodec ff_psd_decoder;
+extern AVCodec ff_ptx_decoder;
+extern AVCodec ff_qdraw_decoder;
+extern AVCodec ff_qpeg_decoder;
+extern AVCodec ff_qtrle_encoder;
+extern AVCodec ff_qtrle_decoder;
+extern AVCodec ff_r10k_encoder;
+extern AVCodec ff_r10k_decoder;
+extern AVCodec ff_r210_encoder;
+extern AVCodec ff_r210_decoder;
+extern AVCodec ff_rasc_decoder;
+extern AVCodec ff_rawvideo_encoder;
+extern AVCodec ff_rawvideo_decoder;
+extern AVCodec ff_rl2_decoder;
+extern AVCodec ff_roq_encoder;
+extern AVCodec ff_roq_decoder;
+extern AVCodec ff_rpza_decoder;
+extern AVCodec ff_rscc_decoder;
+extern AVCodec ff_rv10_encoder;
+extern AVCodec ff_rv10_decoder;
+extern AVCodec ff_rv20_encoder;
+extern AVCodec ff_rv20_decoder;
+extern AVCodec ff_rv30_decoder;
+extern AVCodec ff_rv40_decoder;
+extern AVCodec ff_s302m_encoder;
+extern AVCodec ff_s302m_decoder;
+extern AVCodec ff_sanm_decoder;
+extern AVCodec ff_scpr_decoder;
+extern AVCodec ff_screenpresso_decoder;
+extern AVCodec ff_sdx2_dpcm_decoder;
+extern AVCodec ff_sgi_encoder;
+extern AVCodec ff_sgi_decoder;
+extern AVCodec ff_sgirle_decoder;
+extern AVCodec ff_sheervideo_decoder;
+extern AVCodec ff_smacker_decoder;
+extern AVCodec ff_smc_decoder;
+extern AVCodec ff_smvjpeg_decoder;
+extern AVCodec ff_snow_encoder;
+extern AVCodec ff_snow_decoder;
+extern AVCodec ff_sp5x_decoder;
+extern AVCodec ff_speedhq_decoder;
+extern AVCodec ff_srgc_decoder;
+extern AVCodec ff_sunrast_encoder;
+extern AVCodec ff_sunrast_decoder;
+extern AVCodec ff_svq1_encoder;
+extern AVCodec ff_svq1_decoder;
+extern AVCodec ff_svq3_decoder;
+extern AVCodec ff_targa_encoder;
+extern AVCodec ff_targa_decoder;
+extern AVCodec ff_targa_y216_decoder;
+extern AVCodec ff_tdsc_decoder;
+extern AVCodec ff_theora_decoder;
+extern AVCodec ff_thp_decoder;
+extern AVCodec ff_tiertexseqvideo_decoder;
+extern AVCodec ff_tiff_encoder;
+extern AVCodec ff_tiff_decoder;
+extern AVCodec ff_tmv_decoder;
+extern AVCodec ff_truemotion1_decoder;
+extern AVCodec ff_truemotion2_decoder;
+extern AVCodec ff_truemotion2rt_decoder;
+extern AVCodec ff_tscc_decoder;
+extern AVCodec ff_tscc2_decoder;
+extern AVCodec ff_txd_decoder;
+extern AVCodec ff_ulti_decoder;
+extern AVCodec ff_utvideo_encoder;
+extern AVCodec ff_utvideo_decoder;
+extern AVCodec ff_v210_encoder;
+extern AVCodec ff_v210_decoder;
+extern AVCodec ff_v210x_decoder;
+extern AVCodec ff_v308_encoder;
+extern AVCodec ff_v308_decoder;
+extern AVCodec ff_v408_encoder;
+extern AVCodec ff_v408_decoder;
+extern AVCodec ff_v410_encoder;
+extern AVCodec ff_v410_decoder;
+extern AVCodec ff_vb_decoder;
+extern AVCodec ff_vble_decoder;
+extern AVCodec ff_vc1_decoder;
+extern AVCodec ff_vc1_crystalhd_decoder;
+extern AVCodec ff_vc1image_decoder;
+extern AVCodec ff_vc1_mmal_decoder;
+extern AVCodec ff_vc1_qsv_decoder;
+extern AVCodec ff_vc1_v4l2m2m_decoder;
+extern AVCodec ff_vc2_encoder;
+extern AVCodec ff_vcr1_decoder;
+extern AVCodec ff_vmdvideo_decoder;
+extern AVCodec ff_vmnc_decoder;
+extern AVCodec ff_vp3_decoder;
+extern AVCodec ff_vp5_decoder;
+extern AVCodec ff_vp6_decoder;
+extern AVCodec ff_vp6a_decoder;
+extern AVCodec ff_vp6f_decoder;
+extern AVCodec ff_vp7_decoder;
+extern AVCodec ff_vp8_decoder;
+extern AVCodec ff_vp8_rkmpp_decoder;
+extern AVCodec ff_vp8_v4l2m2m_decoder;
+extern AVCodec ff_vp9_decoder;
+extern AVCodec ff_vp9_rkmpp_decoder;
+extern AVCodec ff_vp9_v4l2m2m_decoder;
+extern AVCodec ff_vqa_decoder;
+extern AVCodec ff_webp_decoder;
+extern AVCodec ff_wcmv_decoder;
+extern AVCodec ff_wrapped_avframe_encoder;
+extern AVCodec ff_wrapped_avframe_decoder;
+extern AVCodec ff_wmv1_encoder;
+extern AVCodec ff_wmv1_decoder;
+extern AVCodec ff_wmv2_encoder;
+extern AVCodec ff_wmv2_decoder;
+extern AVCodec ff_wmv3_decoder;
+extern AVCodec ff_wmv3_crystalhd_decoder;
+extern AVCodec ff_wmv3image_decoder;
+extern AVCodec ff_wnv1_decoder;
+extern AVCodec ff_xan_wc3_decoder;
+extern AVCodec ff_xan_wc4_decoder;
+extern AVCodec ff_xbm_encoder;
+extern AVCodec ff_xbm_decoder;
+extern AVCodec ff_xface_encoder;
+extern AVCodec ff_xface_decoder;
+extern AVCodec ff_xl_decoder;
+extern AVCodec ff_xpm_decoder;
+extern AVCodec ff_xwd_encoder;
+extern AVCodec ff_xwd_decoder;
+extern AVCodec ff_y41p_encoder;
+extern AVCodec ff_y41p_decoder;
+extern AVCodec ff_ylc_decoder;
+extern AVCodec ff_yop_decoder;
+extern AVCodec ff_yuv4_encoder;
+extern AVCodec ff_yuv4_decoder;
+extern AVCodec ff_zero12v_decoder;
+extern AVCodec ff_zerocodec_decoder;
+extern AVCodec ff_zlib_encoder;
+extern AVCodec ff_zlib_decoder;
+extern AVCodec ff_zmbv_encoder;
+extern AVCodec ff_zmbv_decoder;
+
+/* audio codecs */
+extern AVCodec ff_aac_encoder;
+extern AVCodec ff_aac_decoder;
+extern AVCodec ff_aac_fixed_decoder;
+extern AVCodec ff_aac_latm_decoder;
+extern AVCodec ff_ac3_encoder;
+extern AVCodec ff_ac3_decoder;
+extern AVCodec ff_ac3_fixed_encoder;
+extern AVCodec ff_ac3_fixed_decoder;
+extern AVCodec ff_alac_encoder;
+extern AVCodec ff_alac_decoder;
+extern AVCodec ff_als_decoder;
+extern AVCodec ff_amrnb_decoder;
+extern AVCodec ff_amrwb_decoder;
+extern AVCodec ff_ape_decoder;
+extern AVCodec ff_aptx_encoder;
+extern AVCodec ff_aptx_decoder;
+extern AVCodec ff_aptx_hd_encoder;
+extern AVCodec ff_aptx_hd_decoder;
+extern AVCodec ff_atrac1_decoder;
+extern AVCodec ff_atrac3_decoder;
+extern AVCodec ff_atrac3al_decoder;
+extern AVCodec ff_atrac3p_decoder;
+extern AVCodec ff_atrac3pal_decoder;
+extern AVCodec ff_atrac9_decoder;
+extern AVCodec ff_binkaudio_dct_decoder;
+extern AVCodec ff_binkaudio_rdft_decoder;
+extern AVCodec ff_bmv_audio_decoder;
+extern AVCodec ff_cook_decoder;
+extern AVCodec ff_dca_encoder;
+extern AVCodec ff_dca_decoder;
+extern AVCodec ff_dolby_e_decoder;
+extern AVCodec ff_dsd_lsbf_decoder;
+extern AVCodec ff_dsd_msbf_decoder;
+extern AVCodec ff_dsd_lsbf_planar_decoder;
+extern AVCodec ff_dsd_msbf_planar_decoder;
+extern AVCodec ff_dsicinaudio_decoder;
+extern AVCodec ff_dss_sp_decoder;
+extern AVCodec ff_dst_decoder;
+extern AVCodec ff_eac3_encoder;
+extern AVCodec ff_eac3_decoder;
+extern AVCodec ff_evrc_decoder;
+extern AVCodec ff_ffwavesynth_decoder;
+extern AVCodec ff_flac_encoder;
+extern AVCodec ff_flac_decoder;
+extern AVCodec ff_g723_1_encoder;
+extern AVCodec ff_g723_1_decoder;
+extern AVCodec ff_g729_decoder;
+extern AVCodec ff_gsm_decoder;
+extern AVCodec ff_gsm_ms_decoder;
+extern AVCodec ff_hcom_decoder;
+extern AVCodec ff_iac_decoder;
+extern AVCodec ff_ilbc_decoder;
+extern AVCodec ff_imc_decoder;
+extern AVCodec ff_interplay_acm_decoder;
+extern AVCodec ff_mace3_decoder;
+extern AVCodec ff_mace6_decoder;
+extern AVCodec ff_metasound_decoder;
+extern AVCodec ff_mlp_encoder;
+extern AVCodec ff_mlp_decoder;
+extern AVCodec ff_mp1_decoder;
+extern AVCodec ff_mp1float_decoder;
+extern AVCodec ff_mp2_encoder;
+extern AVCodec ff_mp2_decoder;
+extern AVCodec ff_mp2float_decoder;
+extern AVCodec ff_mp2fixed_encoder;
+extern AVCodec ff_mp3float_decoder;
+extern AVCodec ff_mp3_decoder;
+extern AVCodec ff_mp3adufloat_decoder;
+extern AVCodec ff_mp3adu_decoder;
+extern AVCodec ff_mp3on4float_decoder;
+extern AVCodec ff_mp3on4_decoder;
+extern AVCodec ff_mpc7_decoder;
+extern AVCodec ff_mpc8_decoder;
+extern AVCodec ff_nellymoser_encoder;
+extern AVCodec ff_nellymoser_decoder;
+extern AVCodec ff_on2avc_decoder;
+extern AVCodec ff_opus_encoder;
+extern AVCodec ff_opus_decoder;
+extern AVCodec ff_paf_audio_decoder;
+extern AVCodec ff_qcelp_decoder;
+extern AVCodec ff_qdm2_decoder;
+extern AVCodec ff_qdmc_decoder;
+extern AVCodec ff_ra_144_encoder;
+extern AVCodec ff_ra_144_decoder;
+extern AVCodec ff_ra_288_decoder;
+extern AVCodec ff_ralf_decoder;
+extern AVCodec ff_sbc_encoder;
+extern AVCodec ff_sbc_decoder;
+extern AVCodec ff_shorten_decoder;
+extern AVCodec ff_sipr_decoder;
+extern AVCodec ff_smackaud_decoder;
+extern AVCodec ff_sonic_encoder;
+extern AVCodec ff_sonic_decoder;
+extern AVCodec ff_sonic_ls_encoder;
+extern AVCodec ff_tak_decoder;
+extern AVCodec ff_truehd_encoder;
+extern AVCodec ff_truehd_decoder;
+extern AVCodec ff_truespeech_decoder;
+extern AVCodec ff_tta_encoder;
+extern AVCodec ff_tta_decoder;
+extern AVCodec ff_twinvq_decoder;
+extern AVCodec ff_vmdaudio_decoder;
+extern AVCodec ff_vorbis_encoder;
+extern AVCodec ff_vorbis_decoder;
+extern AVCodec ff_wavpack_encoder;
+extern AVCodec ff_wavpack_decoder;
+extern AVCodec ff_wmalossless_decoder;
+extern AVCodec ff_wmapro_decoder;
+extern AVCodec ff_wmav1_encoder;
+extern AVCodec ff_wmav1_decoder;
+extern AVCodec ff_wmav2_encoder;
+extern AVCodec ff_wmav2_decoder;
+extern AVCodec ff_wmavoice_decoder;
+extern AVCodec ff_ws_snd1_decoder;
+extern AVCodec ff_xma1_decoder;
+extern AVCodec ff_xma2_decoder;
+
+/* PCM codecs */
+extern AVCodec ff_pcm_alaw_encoder;
+extern AVCodec ff_pcm_alaw_decoder;
+extern AVCodec ff_pcm_bluray_decoder;
+extern AVCodec ff_pcm_dvd_encoder;
+extern AVCodec ff_pcm_dvd_decoder;
+extern AVCodec ff_pcm_f16le_decoder;
+extern AVCodec ff_pcm_f24le_decoder;
+extern AVCodec ff_pcm_f32be_encoder;
+extern AVCodec ff_pcm_f32be_decoder;
+extern AVCodec ff_pcm_f32le_encoder;
+extern AVCodec ff_pcm_f32le_decoder;
+extern AVCodec ff_pcm_f64be_encoder;
+extern AVCodec ff_pcm_f64be_decoder;
+extern AVCodec ff_pcm_f64le_encoder;
+extern AVCodec ff_pcm_f64le_decoder;
+extern AVCodec ff_pcm_lxf_decoder;
+extern AVCodec ff_pcm_mulaw_encoder;
+extern AVCodec ff_pcm_mulaw_decoder;
+extern AVCodec ff_pcm_s8_encoder;
+extern AVCodec ff_pcm_s8_decoder;
+extern AVCodec ff_pcm_s8_planar_encoder;
+extern AVCodec ff_pcm_s8_planar_decoder;
+extern AVCodec ff_pcm_s16be_encoder;
+extern AVCodec ff_pcm_s16be_decoder;
+extern AVCodec ff_pcm_s16be_planar_encoder;
+extern AVCodec ff_pcm_s16be_planar_decoder;
+extern AVCodec ff_pcm_s16le_encoder;
+extern AVCodec ff_pcm_s16le_decoder;
+extern AVCodec ff_pcm_s16le_planar_encoder;
+extern AVCodec ff_pcm_s16le_planar_decoder;
+extern AVCodec ff_pcm_s24be_encoder;
+extern AVCodec ff_pcm_s24be_decoder;
+extern AVCodec ff_pcm_s24daud_encoder;
+extern AVCodec ff_pcm_s24daud_decoder;
+extern AVCodec ff_pcm_s24le_encoder;
+extern AVCodec ff_pcm_s24le_decoder;
+extern AVCodec ff_pcm_s24le_planar_encoder;
+extern AVCodec ff_pcm_s24le_planar_decoder;
+extern AVCodec ff_pcm_s32be_encoder;
+extern AVCodec ff_pcm_s32be_decoder;
+extern AVCodec ff_pcm_s32le_encoder;
+extern AVCodec ff_pcm_s32le_decoder;
+extern AVCodec ff_pcm_s32le_planar_encoder;
+extern AVCodec ff_pcm_s32le_planar_decoder;
+extern AVCodec ff_pcm_s64be_encoder;
+extern AVCodec ff_pcm_s64be_decoder;
+extern AVCodec ff_pcm_s64le_encoder;
+extern AVCodec ff_pcm_s64le_decoder;
+extern AVCodec ff_pcm_u8_encoder;
+extern AVCodec ff_pcm_u8_decoder;
+extern AVCodec ff_pcm_u16be_encoder;
+extern AVCodec ff_pcm_u16be_decoder;
+extern AVCodec ff_pcm_u16le_encoder;
+extern AVCodec ff_pcm_u16le_decoder;
+extern AVCodec ff_pcm_u24be_encoder;
+extern AVCodec ff_pcm_u24be_decoder;
+extern AVCodec ff_pcm_u24le_encoder;
+extern AVCodec ff_pcm_u24le_decoder;
+extern AVCodec ff_pcm_u32be_encoder;
+extern AVCodec ff_pcm_u32be_decoder;
+extern AVCodec ff_pcm_u32le_encoder;
+extern AVCodec ff_pcm_u32le_decoder;
+extern AVCodec ff_pcm_vidc_encoder;
+extern AVCodec ff_pcm_vidc_decoder;
+extern AVCodec ff_pcm_zork_decoder;
+
+/* DPCM codecs */
+extern AVCodec ff_gremlin_dpcm_decoder;
+extern AVCodec ff_interplay_dpcm_decoder;
+extern AVCodec ff_roq_dpcm_encoder;
+extern AVCodec ff_roq_dpcm_decoder;
+extern AVCodec ff_sol_dpcm_decoder;
+extern AVCodec ff_xan_dpcm_decoder;
+
+/* ADPCM codecs */
+extern AVCodec ff_adpcm_4xm_decoder;
+extern AVCodec ff_adpcm_adx_encoder;
+extern AVCodec ff_adpcm_adx_decoder;
+extern AVCodec ff_adpcm_afc_decoder;
+extern AVCodec ff_adpcm_aica_decoder;
+extern AVCodec ff_adpcm_ct_decoder;
+extern AVCodec ff_adpcm_dtk_decoder;
+extern AVCodec ff_adpcm_ea_decoder;
+extern AVCodec ff_adpcm_ea_maxis_xa_decoder;
+extern AVCodec ff_adpcm_ea_r1_decoder;
+extern AVCodec ff_adpcm_ea_r2_decoder;
+extern AVCodec ff_adpcm_ea_r3_decoder;
+extern AVCodec ff_adpcm_ea_xas_decoder;
+extern AVCodec ff_adpcm_g722_encoder;
+extern AVCodec ff_adpcm_g722_decoder;
+extern AVCodec ff_adpcm_g726_encoder;
+extern AVCodec ff_adpcm_g726_decoder;
+extern AVCodec ff_adpcm_g726le_encoder;
+extern AVCodec ff_adpcm_g726le_decoder;
+extern AVCodec ff_adpcm_ima_amv_decoder;
+extern AVCodec ff_adpcm_ima_apc_decoder;
+extern AVCodec ff_adpcm_ima_dat4_decoder;
+extern AVCodec ff_adpcm_ima_dk3_decoder;
+extern AVCodec ff_adpcm_ima_dk4_decoder;
+extern AVCodec ff_adpcm_ima_ea_eacs_decoder;
+extern AVCodec ff_adpcm_ima_ea_sead_decoder;
+extern AVCodec ff_adpcm_ima_iss_decoder;
+extern AVCodec ff_adpcm_ima_oki_decoder;
+extern AVCodec ff_adpcm_ima_qt_encoder;
+extern AVCodec ff_adpcm_ima_qt_decoder;
+extern AVCodec ff_adpcm_ima_rad_decoder;
+extern AVCodec ff_adpcm_ima_smjpeg_decoder;
+extern AVCodec ff_adpcm_ima_wav_encoder;
+extern AVCodec ff_adpcm_ima_wav_decoder;
+extern AVCodec ff_adpcm_ima_ws_decoder;
+extern AVCodec ff_adpcm_ms_encoder;
+extern AVCodec ff_adpcm_ms_decoder;
+extern AVCodec ff_adpcm_mtaf_decoder;
+extern AVCodec ff_adpcm_psx_decoder;
+extern AVCodec ff_adpcm_sbpro_2_decoder;
+extern AVCodec ff_adpcm_sbpro_3_decoder;
+extern AVCodec ff_adpcm_sbpro_4_decoder;
+extern AVCodec ff_adpcm_swf_encoder;
+extern AVCodec ff_adpcm_swf_decoder;
+extern AVCodec ff_adpcm_thp_decoder;
+extern AVCodec ff_adpcm_thp_le_decoder;
+extern AVCodec ff_adpcm_vima_decoder;
+extern AVCodec ff_adpcm_xa_decoder;
+extern AVCodec ff_adpcm_yamaha_encoder;
+extern AVCodec ff_adpcm_yamaha_decoder;
+
+/* subtitles */
+extern AVCodec ff_ssa_encoder;
+extern AVCodec ff_ssa_decoder;
+extern AVCodec ff_ass_encoder;
+extern AVCodec ff_ass_decoder;
+extern AVCodec ff_ccaption_decoder;
+extern AVCodec ff_dvbsub_encoder;
+extern AVCodec ff_dvbsub_decoder;
+extern AVCodec ff_dvdsub_encoder;
+extern AVCodec ff_dvdsub_decoder;
+extern AVCodec ff_jacosub_decoder;
+extern AVCodec ff_microdvd_decoder;
+extern AVCodec ff_movtext_encoder;
+extern AVCodec ff_movtext_decoder;
+extern AVCodec ff_mpl2_decoder;
+extern AVCodec ff_pgssub_decoder;
+extern AVCodec ff_pjs_decoder;
+extern AVCodec ff_realtext_decoder;
+extern AVCodec ff_sami_decoder;
+extern AVCodec ff_srt_encoder;
+extern AVCodec ff_srt_decoder;
+extern AVCodec ff_stl_decoder;
+extern AVCodec ff_subrip_encoder;
+extern AVCodec ff_subrip_decoder;
+extern AVCodec ff_subviewer_decoder;
+extern AVCodec ff_subviewer1_decoder;
+extern AVCodec ff_text_encoder;
+extern AVCodec ff_text_decoder;
+extern AVCodec ff_vplayer_decoder;
+extern AVCodec ff_webvtt_encoder;
+extern AVCodec ff_webvtt_decoder;
+extern AVCodec ff_xsub_encoder;
+extern AVCodec ff_xsub_decoder;
+
+/* external libraries */
+extern AVCodec ff_aac_at_encoder;
+extern AVCodec ff_aac_at_decoder;
+extern AVCodec ff_ac3_at_decoder;
+extern AVCodec ff_adpcm_ima_qt_at_decoder;
+extern AVCodec ff_alac_at_encoder;
+extern AVCodec ff_alac_at_decoder;
+extern AVCodec ff_amr_nb_at_decoder;
+extern AVCodec ff_eac3_at_decoder;
+extern AVCodec ff_gsm_ms_at_decoder;
+extern AVCodec ff_ilbc_at_encoder;
+extern AVCodec ff_ilbc_at_decoder;
+extern AVCodec ff_mp1_at_decoder;
+extern AVCodec ff_mp2_at_decoder;
+extern AVCodec ff_mp3_at_decoder;
+extern AVCodec ff_pcm_alaw_at_encoder;
+extern AVCodec ff_pcm_alaw_at_decoder;
+extern AVCodec ff_pcm_mulaw_at_encoder;
+extern AVCodec ff_pcm_mulaw_at_decoder;
+extern AVCodec ff_qdmc_at_decoder;
+extern AVCodec ff_qdm2_at_decoder;
+extern AVCodec ff_libaom_av1_decoder;
+extern AVCodec ff_libaom_av1_encoder;
+extern AVCodec ff_libaribb24_decoder;
+extern AVCodec ff_libcelt_decoder;
+extern AVCodec ff_libcodec2_encoder;
+extern AVCodec ff_libcodec2_decoder;
+extern AVCodec ff_libdav1d_decoder;
+extern AVCodec ff_libdavs2_decoder;
+extern AVCodec ff_libfdk_aac_encoder;
+extern AVCodec ff_libfdk_aac_decoder;
+extern AVCodec ff_libgsm_encoder;
+extern AVCodec ff_libgsm_decoder;
+extern AVCodec ff_libgsm_ms_encoder;
+extern AVCodec ff_libgsm_ms_decoder;
+extern AVCodec ff_libilbc_encoder;
+extern AVCodec ff_libilbc_decoder;
+extern AVCodec ff_libmp3lame_encoder;
+extern AVCodec ff_libopencore_amrnb_encoder;
+extern AVCodec ff_libopencore_amrnb_decoder;
+extern AVCodec ff_libopencore_amrwb_decoder;
+extern AVCodec ff_libopenjpeg_encoder;
+extern AVCodec ff_libopenjpeg_decoder;
+extern AVCodec ff_libopus_encoder;
+extern AVCodec ff_libopus_decoder;
+extern AVCodec ff_librsvg_decoder;
+extern AVCodec ff_libshine_encoder;
+extern AVCodec ff_libspeex_encoder;
+extern AVCodec ff_libspeex_decoder;
+extern AVCodec ff_libtheora_encoder;
+extern AVCodec ff_libtwolame_encoder;
+extern AVCodec ff_libvo_amrwbenc_encoder;
+extern AVCodec ff_libvorbis_encoder;
+extern AVCodec ff_libvorbis_decoder;
+extern AVCodec ff_libvpx_vp8_encoder;
+extern AVCodec ff_libvpx_vp8_decoder;
+extern AVCodec ff_libvpx_vp9_encoder;
+extern AVCodec ff_libvpx_vp9_decoder;
+extern AVCodec ff_libwavpack_encoder;
+/* preferred over libwebp */
+extern AVCodec ff_libwebp_anim_encoder;
+extern AVCodec ff_libwebp_encoder;
+extern AVCodec ff_libx262_encoder;
+extern AVCodec ff_libx264_encoder;
+extern AVCodec ff_libx264rgb_encoder;
+extern AVCodec ff_libx265_encoder;
+extern AVCodec ff_libxavs_encoder;
+extern AVCodec ff_libxavs2_encoder;
+extern AVCodec ff_libxvid_encoder;
+extern AVCodec ff_libzvbi_teletext_decoder;
+
+/* text */
+extern AVCodec ff_bintext_decoder;
+extern AVCodec ff_xbin_decoder;
+extern AVCodec ff_idf_decoder;
 
-#define REGISTER_DECODER(X, x)                                          \
-    {                                                                   \
-        extern AVCodec ff_##x##_decoder;                                \
-        if (CONFIG_##X##_DECODER)                                       \
-            avcodec_register(&ff_##x##_decoder);                        \
+/* external libraries, that shouldn't be used by default if one of the
+ * above is available */
+extern AVCodec ff_h263_v4l2m2m_encoder;
+extern AVCodec ff_libopenh264_encoder;
+extern AVCodec ff_libopenh264_decoder;
+extern AVCodec ff_h264_amf_encoder;
+extern AVCodec ff_h264_cuvid_decoder;
+extern AVCodec ff_h264_nvenc_encoder;
+extern AVCodec ff_h264_omx_encoder;
+extern AVCodec ff_h264_qsv_encoder;
+extern AVCodec ff_h264_v4l2m2m_encoder;
+extern AVCodec ff_h264_vaapi_encoder;
+extern AVCodec ff_h264_videotoolbox_encoder;
+#if FF_API_NVENC_OLD_NAME
+extern AVCodec ff_nvenc_encoder;
+extern AVCodec ff_nvenc_h264_encoder;
+extern AVCodec ff_nvenc_hevc_encoder;
+#endif
+extern AVCodec ff_hevc_amf_encoder;
+extern AVCodec ff_hevc_cuvid_decoder;
+extern AVCodec ff_hevc_mediacodec_decoder;
+extern AVCodec ff_hevc_nvenc_encoder;
+extern AVCodec ff_hevc_qsv_encoder;
+extern AVCodec ff_hevc_v4l2m2m_encoder;
+extern AVCodec ff_hevc_vaapi_encoder;
+extern AVCodec ff_hevc_videotoolbox_encoder;
+extern AVCodec ff_libkvazaar_encoder;
+extern AVCodec ff_mjpeg_cuvid_decoder;
+extern AVCodec ff_mjpeg_qsv_encoder;
+extern AVCodec ff_mjpeg_vaapi_encoder;
+extern AVCodec ff_mpeg1_cuvid_decoder;
+extern AVCodec ff_mpeg2_cuvid_decoder;
+extern AVCodec ff_mpeg2_qsv_encoder;
+extern AVCodec ff_mpeg2_vaapi_encoder;
+extern AVCodec ff_mpeg4_cuvid_decoder;
+extern AVCodec ff_mpeg4_mediacodec_decoder;
+extern AVCodec ff_mpeg4_v4l2m2m_encoder;
+extern AVCodec ff_vc1_cuvid_decoder;
+extern AVCodec ff_vp8_cuvid_decoder;
+extern AVCodec ff_vp8_mediacodec_decoder;
+extern AVCodec ff_vp8_qsv_decoder;
+extern AVCodec ff_vp8_v4l2m2m_encoder;
+extern AVCodec ff_vp8_vaapi_encoder;
+extern AVCodec ff_vp9_cuvid_decoder;
+extern AVCodec ff_vp9_mediacodec_decoder;
+extern AVCodec ff_vp9_vaapi_encoder;
+
+// The iterate API is not usable with ossfuzz due to the excessive size of binaries created
+#if CONFIG_OSSFUZZ
+AVCodec * codec_list[] = {
+    NULL,
+    NULL
+};
+#else
+#include "libavcodec/codec_list.c"
+#endif
+
+static AVOnce av_codec_static_init = AV_ONCE_INIT;
+static void av_codec_init_static(void)
+{
+    for (int i = 0; codec_list[i]; i++) {
+        if (codec_list[i]->init_static_data)
+            codec_list[i]->init_static_data((AVCodec*)codec_list[i]);
     }
+}
+
+const AVCodec *av_codec_iterate(void **opaque)
+{
+    uintptr_t i = (uintptr_t)*opaque;
+    const AVCodec *c = codec_list[i];
+
+    ff_thread_once(&av_codec_static_init, av_codec_init_static);
+
+    if (c)
+        *opaque = (void*)(i + 1);
+
+    return c;
+}
 
-#define REGISTER_ENCDEC(X, x) REGISTER_ENCODER(X, x); REGISTER_DECODER(X, x)
+#if FF_API_NEXT
+FF_DISABLE_DEPRECATION_WARNINGS
+static AVOnce av_codec_next_init = AV_ONCE_INIT;
 
-#define REGISTER_PARSER(X, x)                                           \
-    {                                                                   \
-        extern AVCodecParser ff_##x##_parser;                           \
-        if (CONFIG_##X##_PARSER)                                        \
-            av_register_codec_parser(&ff_##x##_parser);                 \
+static void av_codec_init_next(void)
+{
+    AVCodec *prev = NULL, *p;
+    void *i = 0;
+    while ((p = (AVCodec*)av_codec_iterate(&i))) {
+        if (prev)
+            prev->next = p;
+        prev = p;
     }
+}
+
+
+
+av_cold void avcodec_register(AVCodec *codec)
+{
+    ff_thread_once(&av_codec_next_init, av_codec_init_next);
+}
+
+AVCodec *av_codec_next(const AVCodec *c)
+{
+    ff_thread_once(&av_codec_next_init, av_codec_init_next);
+
+    if (c)
+        return c->next;
+    else
+        return (AVCodec*)codec_list[0];
+}
 
 void avcodec_register_all(void)
 {
-    static int initialized;
-
-    if (initialized)
-        return;
-    initialized = 1;
-
-    /* video codecs */
-    REGISTER_ENCODER(A64MULTI,          a64multi);
-    REGISTER_ENCODER(A64MULTI5,         a64multi5);
-    REGISTER_DECODER(AASC,              aasc);
-    REGISTER_DECODER(AIC,               aic);
-    REGISTER_ENCDEC (ALIAS_PIX,         alias_pix);
-    REGISTER_DECODER(AMV,               amv);
-    REGISTER_DECODER(ANM,               anm);
-    REGISTER_DECODER(ANSI,              ansi);
-    REGISTER_ENCDEC (ASV1,              asv1);
-    REGISTER_ENCDEC (ASV2,              asv2);
-    REGISTER_DECODER(AURA,              aura);
-    REGISTER_DECODER(AURA2,             aura2);
-    REGISTER_DECODER(AVS,               avs);
-    REGISTER_DECODER(BETHSOFTVID,       bethsoftvid);
-    REGISTER_DECODER(BFI,               bfi);
-    REGISTER_DECODER(BINK,              bink);
-    REGISTER_ENCDEC (BMP,               bmp);
-    REGISTER_DECODER(BMV_VIDEO,         bmv_video);
-    REGISTER_DECODER(BRENDER_PIX,       brender_pix);
-    REGISTER_DECODER(C93,               c93);
-    REGISTER_DECODER(CAVS,              cavs);
-    REGISTER_DECODER(CDGRAPHICS,        cdgraphics);
-    REGISTER_DECODER(CDXL,              cdxl);
-    REGISTER_DECODER(CFHD,              cfhd);
-    REGISTER_ENCDEC (CINEPAK,           cinepak);
-    REGISTER_DECODER(CLEARVIDEO,        clearvideo);
-    REGISTER_ENCDEC (CLJR,              cljr);
-    REGISTER_DECODER(CLLC,              cllc);
-    REGISTER_ENCDEC (COMFORTNOISE,      comfortnoise);
-    REGISTER_DECODER(CSCD,              cscd);
-    REGISTER_DECODER(CYUV,              cyuv);
-    REGISTER_DECODER(DDS,               dds);
-    REGISTER_DECODER(DFA,               dfa);
-    REGISTER_ENCDEC (DNXHD,             dnxhd);
-    REGISTER_ENCDEC (DPX,               dpx);
-    REGISTER_DECODER(DSICINVIDEO,       dsicinvideo);
-    REGISTER_ENCDEC (DVVIDEO,           dvvideo);
-    REGISTER_DECODER(DXA,               dxa);
-    REGISTER_DECODER(DXTORY,            dxtory);
-    REGISTER_DECODER(DXV,               dxv);
-    REGISTER_DECODER(EACMV,             eacmv);
-    REGISTER_DECODER(EAMAD,             eamad);
-    REGISTER_DECODER(EATGQ,             eatgq);
-    REGISTER_DECODER(EATGV,             eatgv);
-    REGISTER_DECODER(EATQI,             eatqi);
-    REGISTER_DECODER(EIGHTBPS,          eightbps);
-    REGISTER_DECODER(EIGHTSVX_EXP,      eightsvx_exp);
-    REGISTER_DECODER(EIGHTSVX_FIB,      eightsvx_fib);
-    REGISTER_DECODER(ESCAPE124,         escape124);
-    REGISTER_DECODER(ESCAPE130,         escape130);
-    REGISTER_DECODER(EXR,               exr);
-    REGISTER_ENCDEC (FFV1,              ffv1);
-    REGISTER_ENCDEC (FFVHUFF,           ffvhuff);
-    REGISTER_DECODER(FIC,               fic);
-    REGISTER_ENCDEC (FLASHSV,           flashsv);
-    REGISTER_DECODER(FLASHSV2,          flashsv2);
-    REGISTER_DECODER(FLIC,              flic);
-    REGISTER_ENCDEC (FLV,               flv);
-    REGISTER_DECODER(FMVC,              fmvc);
-    REGISTER_DECODER(FOURXM,            fourxm);
-    REGISTER_DECODER(FRAPS,             fraps);
-    REGISTER_DECODER(FRWU,              frwu);
-    REGISTER_DECODER(G2M,               g2m);
-    REGISTER_ENCDEC (GIF,               gif);
-    REGISTER_ENCDEC (H261,              h261);
-    REGISTER_ENCDEC (H263,              h263);
-    REGISTER_DECODER(H263I,             h263i);
-    REGISTER_ENCODER(H263P,             h263p);
-    REGISTER_DECODER(H264,              h264);
-    REGISTER_DECODER(H264_MMAL,         h264_mmal);
-    REGISTER_DECODER(H264_QSV,          h264_qsv);
-    REGISTER_ENCDEC (HAP,               hap);
-    REGISTER_DECODER(HEVC,              hevc);
-    REGISTER_DECODER(HEVC_QSV,          hevc_qsv);
-    REGISTER_DECODER(HNM4_VIDEO,        hnm4_video);
-    REGISTER_DECODER(HQ_HQA,            hq_hqa);
-    REGISTER_DECODER(HQX,               hqx);
-    REGISTER_ENCDEC (HUFFYUV,           huffyuv);
-    REGISTER_DECODER(IDCIN,             idcin);
-    REGISTER_DECODER(IFF_BYTERUN1,      iff_byterun1);
-    REGISTER_DECODER(IFF_ILBM,          iff_ilbm);
-    REGISTER_DECODER(INDEO2,            indeo2);
-    REGISTER_DECODER(INDEO3,            indeo3);
-    REGISTER_DECODER(INDEO4,            indeo4);
-    REGISTER_DECODER(INDEO5,            indeo5);
-    REGISTER_DECODER(INTERPLAY_VIDEO,   interplay_video);
-    REGISTER_DECODER(JPEG2000,          jpeg2000);
-    REGISTER_ENCDEC (JPEGLS,            jpegls);
-    REGISTER_DECODER(JV,                jv);
-    REGISTER_DECODER(KGV1,              kgv1);
-    REGISTER_DECODER(KMVC,              kmvc);
-    REGISTER_DECODER(LAGARITH,          lagarith);
-    REGISTER_ENCODER(LJPEG,             ljpeg);
-    REGISTER_DECODER(LOCO,              loco);
-    REGISTER_DECODER(MAGICYUV,          magicyuv);
-    REGISTER_DECODER(MDEC,              mdec);
-    REGISTER_DECODER(MIMIC,             mimic);
-    REGISTER_ENCDEC (MJPEG,             mjpeg);
-    REGISTER_DECODER(MJPEGB,            mjpegb);
-    REGISTER_DECODER(MMVIDEO,           mmvideo);
-    REGISTER_DECODER(MOTIONPIXELS,      motionpixels);
-    REGISTER_ENCDEC (MPEG1VIDEO,        mpeg1video);
-    REGISTER_ENCDEC (MPEG2VIDEO,        mpeg2video);
-    REGISTER_DECODER(MPEG2_MMAL,        mpeg2_mmal);
-    REGISTER_DECODER(MPEG2_QSV,         mpeg2_qsv);
-    REGISTER_ENCDEC (MPEG4,             mpeg4);
-    REGISTER_DECODER(MSA1,              msa1);
-    REGISTER_DECODER(MSMPEG4V1,         msmpeg4v1);
-    REGISTER_ENCDEC (MSMPEG4V2,         msmpeg4v2);
-    REGISTER_ENCDEC (MSMPEG4V3,         msmpeg4v3);
-    REGISTER_DECODER(MSRLE,             msrle);
-    REGISTER_DECODER(MSS1,              mss1);
-    REGISTER_DECODER(MSS2,              mss2);
-    REGISTER_DECODER(MSVIDEO1,          msvideo1);
-    REGISTER_DECODER(MSZH,              mszh);
-    REGISTER_DECODER(MTS2,              mts2);
-    REGISTER_DECODER(MVC1,              mvc1);
-    REGISTER_DECODER(MVC2,              mvc2);
-    REGISTER_DECODER(MXPEG,             mxpeg);
-    REGISTER_DECODER(NUV,               nuv);
-    REGISTER_DECODER(PAF_VIDEO,         paf_video);
-    REGISTER_ENCDEC (PAM,               pam);
-    REGISTER_ENCDEC (PBM,               pbm);
-    REGISTER_ENCDEC (PCX,               pcx);
-    REGISTER_ENCDEC (PGM,               pgm);
-    REGISTER_ENCDEC (PGMYUV,            pgmyuv);
-    REGISTER_DECODER(PICTOR,            pictor);
-    REGISTER_DECODER(PIXLET,            pixlet);
-    REGISTER_ENCDEC (PNG,               png);
-    REGISTER_ENCDEC (PPM,               ppm);
-    REGISTER_ENCDEC (PRORES,            prores);
-    REGISTER_DECODER(PTX,               ptx);
-    REGISTER_DECODER(QDRAW,             qdraw);
-    REGISTER_DECODER(QPEG,              qpeg);
-    REGISTER_ENCDEC (QTRLE,             qtrle);
-    REGISTER_DECODER(R10K,              r10k);
-    REGISTER_DECODER(R210,              r210);
-    REGISTER_ENCDEC (RAWVIDEO,          rawvideo);
-    REGISTER_DECODER(RL2,               rl2);
-    REGISTER_ENCDEC (ROQ,               roq);
-    REGISTER_DECODER(RPZA,              rpza);
-    REGISTER_DECODER(RSCC,              rscc);
-    REGISTER_ENCDEC (RV10,              rv10);
-    REGISTER_ENCDEC (RV20,              rv20);
-    REGISTER_DECODER(RV30,              rv30);
-    REGISTER_DECODER(RV40,              rv40);
-    REGISTER_DECODER(S302M,             s302m);
-    REGISTER_DECODER(SANM,              sanm);
-    REGISTER_DECODER(SCREENPRESSO,      screenpresso);
-    REGISTER_ENCDEC (SGI,               sgi);
-    REGISTER_DECODER(SGIRLE,            sgirle);
-    REGISTER_DECODER(SMACKER,           smacker);
-    REGISTER_DECODER(SMC,               smc);
-    REGISTER_DECODER(SP5X,              sp5x);
-    REGISTER_ENCDEC (SUNRAST,           sunrast);
-    REGISTER_ENCDEC (SVQ1,              svq1);
-    REGISTER_DECODER(SVQ3,              svq3);
-    REGISTER_ENCDEC (TARGA,             targa);
-    REGISTER_DECODER(TDSC,              tdsc);
-    REGISTER_DECODER(THEORA,            theora);
-    REGISTER_DECODER(THP,               thp);
-    REGISTER_DECODER(TIERTEXSEQVIDEO,   tiertexseqvideo);
-    REGISTER_ENCDEC (TIFF,              tiff);
-    REGISTER_DECODER(TMV,               tmv);
-    REGISTER_DECODER(TRUEMOTION1,       truemotion1);
-    REGISTER_DECODER(TRUEMOTION2,       truemotion2);
-    REGISTER_DECODER(TRUEMOTION2RT,     truemotion2rt);
-    REGISTER_DECODER(TSCC,              tscc);
-    REGISTER_DECODER(TSCC2,             tscc2);
-    REGISTER_DECODER(TXD,               txd);
-    REGISTER_DECODER(ULTI,              ulti);
-    REGISTER_ENCDEC (UTVIDEO,           utvideo);
-    REGISTER_ENCDEC (V210,              v210);
-    REGISTER_DECODER(V210X,             v210x);
-    REGISTER_ENCDEC (V410,              v410);
-    REGISTER_DECODER(VB,                vb);
-    REGISTER_DECODER(VBLE,              vble);
-    REGISTER_DECODER(VC1,               vc1);
-    REGISTER_DECODER(VC1IMAGE,          vc1image);
-    REGISTER_DECODER(VC1_MMAL,          vc1_mmal);
-    REGISTER_DECODER(VC1_QSV,           vc1_qsv);
-    REGISTER_DECODER(VCR1,              vcr1);
-    REGISTER_DECODER(VMDVIDEO,          vmdvideo);
-    REGISTER_DECODER(VMNC,              vmnc);
-    REGISTER_DECODER(VP3,               vp3);
-    REGISTER_DECODER(VP5,               vp5);
-    REGISTER_DECODER(VP6,               vp6);
-    REGISTER_DECODER(VP6A,              vp6a);
-    REGISTER_DECODER(VP6F,              vp6f);
-    REGISTER_DECODER(VP7,               vp7);
-    REGISTER_DECODER(VP8,               vp8);
-    REGISTER_DECODER(VP8_QSV,           vp8_qsv);
-    REGISTER_DECODER(VP9,               vp9);
-    REGISTER_DECODER(VQA,               vqa);
-    REGISTER_DECODER(WEBP,              webp);
-    REGISTER_ENCODER(WRAPPED_AVFRAME,   wrapped_avframe);
-    REGISTER_ENCDEC (WMV1,              wmv1);
-    REGISTER_ENCDEC (WMV2,              wmv2);
-    REGISTER_DECODER(WMV3,              wmv3);
-    REGISTER_DECODER(WMV3IMAGE,         wmv3image);
-    REGISTER_DECODER(WNV1,              wnv1);
-    REGISTER_DECODER(XAN_WC3,           xan_wc3);
-    REGISTER_DECODER(XAN_WC4,           xan_wc4);
-    REGISTER_ENCDEC (XBM,               xbm);
-    REGISTER_DECODER(XL,                xl);
-    REGISTER_ENCDEC (XWD,               xwd);
-    REGISTER_DECODER(YOP,               yop);
-    REGISTER_DECODER(ZEROCODEC,         zerocodec);
-    REGISTER_ENCDEC (ZLIB,              zlib);
-    REGISTER_ENCDEC (ZMBV,              zmbv);
-
-    /* audio codecs */
-    REGISTER_ENCDEC (AAC,               aac);
-    REGISTER_DECODER(AAC_LATM,          aac_latm);
-    REGISTER_ENCDEC (AC3,               ac3);
-    REGISTER_ENCODER(AC3_FIXED,         ac3_fixed);
-    REGISTER_ENCDEC (ALAC,              alac);
-    REGISTER_DECODER(ALS,               als);
-    REGISTER_DECODER(AMRNB,             amrnb);
-    REGISTER_DECODER(AMRWB,             amrwb);
-    REGISTER_DECODER(APE,               ape);
-    REGISTER_DECODER(ATRAC1,            atrac1);
-    REGISTER_DECODER(ATRAC3,            atrac3);
-    REGISTER_DECODER(ATRAC3P,           atrac3p);
-    REGISTER_DECODER(BINKAUDIO_DCT,     binkaudio_dct);
-    REGISTER_DECODER(BINKAUDIO_RDFT,    binkaudio_rdft);
-    REGISTER_DECODER(BMV_AUDIO,         bmv_audio);
-    REGISTER_DECODER(COOK,              cook);
-    REGISTER_DECODER(DCA,               dca);
-    REGISTER_DECODER(DSICINAUDIO,       dsicinaudio);
-    REGISTER_DECODER(DSS_SP,            dss_sp);
-    REGISTER_ENCDEC (EAC3,              eac3);
-    REGISTER_ENCDEC (FLAC,              flac);
-    REGISTER_ENCDEC (G723_1,            g723_1);
-    REGISTER_DECODER(GSM,               gsm);
-    REGISTER_DECODER(GSM_MS,            gsm_ms);
-    REGISTER_DECODER(IAC,               iac);
-    REGISTER_DECODER(IMC,               imc);
-    REGISTER_DECODER(MACE3,             mace3);
-    REGISTER_DECODER(MACE6,             mace6);
-    REGISTER_DECODER(METASOUND,         metasound);
-    REGISTER_DECODER(MLP,               mlp);
-    REGISTER_DECODER(MP1,               mp1);
-    REGISTER_DECODER(MP1FLOAT,          mp1float);
-    REGISTER_ENCDEC (MP2,               mp2);
-    REGISTER_DECODER(MP2FLOAT,          mp2float);
-    REGISTER_DECODER(MP3,               mp3);
-    REGISTER_DECODER(MP3FLOAT,          mp3float);
-    REGISTER_DECODER(MP3ADU,            mp3adu);
-    REGISTER_DECODER(MP3ADUFLOAT,       mp3adufloat);
-    REGISTER_DECODER(MP3ON4,            mp3on4);
-    REGISTER_DECODER(MP3ON4FLOAT,       mp3on4float);
-    REGISTER_DECODER(MPC7,              mpc7);
-    REGISTER_DECODER(MPC8,              mpc8);
-    REGISTER_ENCDEC (NELLYMOSER,        nellymoser);
-    REGISTER_DECODER(ON2AVC,            on2avc);
-    REGISTER_DECODER(OPUS,              opus);
-    REGISTER_DECODER(PAF_AUDIO,         paf_audio);
-    REGISTER_DECODER(QCELP,             qcelp);
-    REGISTER_DECODER(QDM2,              qdm2);
-    REGISTER_ENCDEC (RA_144,            ra_144);
-    REGISTER_DECODER(RA_288,            ra_288);
-    REGISTER_DECODER(RALF,              ralf);
-    REGISTER_DECODER(SHORTEN,           shorten);
-    REGISTER_DECODER(SIPR,              sipr);
-    REGISTER_DECODER(SMACKAUD,          smackaud);
-    REGISTER_DECODER(TAK,               tak);
-    REGISTER_DECODER(TRUEHD,            truehd);
-    REGISTER_DECODER(TRUESPEECH,        truespeech);
-    REGISTER_DECODER(TTA,               tta);
-    REGISTER_DECODER(TWINVQ,            twinvq);
-    REGISTER_DECODER(VMDAUDIO,          vmdaudio);
-    REGISTER_ENCDEC (VORBIS,            vorbis);
-    REGISTER_DECODER(WAVPACK,           wavpack);
-    REGISTER_DECODER(WMALOSSLESS,       wmalossless);
-    REGISTER_DECODER(WMAPRO,            wmapro);
-    REGISTER_ENCDEC (WMAV1,             wmav1);
-    REGISTER_ENCDEC (WMAV2,             wmav2);
-    REGISTER_DECODER(WMAVOICE,          wmavoice);
-    REGISTER_DECODER(WS_SND1,           ws_snd1);
-
-    /* PCM codecs */
-    REGISTER_ENCDEC (PCM_ALAW,          pcm_alaw);
-    REGISTER_DECODER(PCM_BLURAY,        pcm_bluray);
-    REGISTER_DECODER(PCM_DVD,           pcm_dvd);
-    REGISTER_ENCDEC (PCM_F32BE,         pcm_f32be);
-    REGISTER_ENCDEC (PCM_F32LE,         pcm_f32le);
-    REGISTER_ENCDEC (PCM_F64BE,         pcm_f64be);
-    REGISTER_ENCDEC (PCM_F64LE,         pcm_f64le);
-    REGISTER_DECODER(PCM_LXF,           pcm_lxf);
-    REGISTER_ENCDEC (PCM_MULAW,         pcm_mulaw);
-    REGISTER_ENCDEC (PCM_S8,            pcm_s8);
-    REGISTER_DECODER(PCM_S8_PLANAR,     pcm_s8_planar);
-    REGISTER_ENCDEC (PCM_S16BE,         pcm_s16be);
-    REGISTER_DECODER(PCM_S16BE_PLANAR,  pcm_s16be_planar);
-    REGISTER_ENCDEC (PCM_S16LE,         pcm_s16le);
-    REGISTER_DECODER(PCM_S16LE_PLANAR,  pcm_s16le_planar);
-    REGISTER_ENCDEC (PCM_S24BE,         pcm_s24be);
-    REGISTER_ENCDEC (PCM_S24DAUD,       pcm_s24daud);
-    REGISTER_ENCDEC (PCM_S24LE,         pcm_s24le);
-    REGISTER_DECODER(PCM_S24LE_PLANAR,  pcm_s24le_planar);
-    REGISTER_ENCDEC (PCM_S32BE,         pcm_s32be);
-    REGISTER_ENCDEC (PCM_S32LE,         pcm_s32le);
-    REGISTER_DECODER(PCM_S32LE_PLANAR,  pcm_s32le_planar);
-    REGISTER_ENCDEC (PCM_U8,            pcm_u8);
-    REGISTER_ENCDEC (PCM_U16BE,         pcm_u16be);
-    REGISTER_ENCDEC (PCM_U16LE,         pcm_u16le);
-    REGISTER_ENCDEC (PCM_U24BE,         pcm_u24be);
-    REGISTER_ENCDEC (PCM_U24LE,         pcm_u24le);
-    REGISTER_ENCDEC (PCM_U32BE,         pcm_u32be);
-    REGISTER_ENCDEC (PCM_U32LE,         pcm_u32le);
-    REGISTER_DECODER(PCM_ZORK ,         pcm_zork);
-
-    /* DPCM codecs */
-    REGISTER_DECODER(INTERPLAY_DPCM,    interplay_dpcm);
-    REGISTER_ENCDEC (ROQ_DPCM,          roq_dpcm);
-    REGISTER_DECODER(SOL_DPCM,          sol_dpcm);
-    REGISTER_DECODER(XAN_DPCM,          xan_dpcm);
-
-    /* ADPCM codecs */
-    REGISTER_DECODER(ADPCM_4XM,         adpcm_4xm);
-    REGISTER_ENCDEC (ADPCM_ADX,         adpcm_adx);
-    REGISTER_DECODER(ADPCM_CT,          adpcm_ct);
-    REGISTER_DECODER(ADPCM_EA,          adpcm_ea);
-    REGISTER_DECODER(ADPCM_EA_MAXIS_XA, adpcm_ea_maxis_xa);
-    REGISTER_DECODER(ADPCM_EA_R1,       adpcm_ea_r1);
-    REGISTER_DECODER(ADPCM_EA_R2,       adpcm_ea_r2);
-    REGISTER_DECODER(ADPCM_EA_R3,       adpcm_ea_r3);
-    REGISTER_DECODER(ADPCM_EA_XAS,      adpcm_ea_xas);
-    REGISTER_ENCDEC (ADPCM_G722,        adpcm_g722);
-    REGISTER_ENCDEC (ADPCM_G726,        adpcm_g726);
-    REGISTER_DECODER(ADPCM_IMA_AMV,     adpcm_ima_amv);
-    REGISTER_DECODER(ADPCM_IMA_APC,     adpcm_ima_apc);
-    REGISTER_DECODER(ADPCM_IMA_DK3,     adpcm_ima_dk3);
-    REGISTER_DECODER(ADPCM_IMA_DK4,     adpcm_ima_dk4);
-    REGISTER_DECODER(ADPCM_IMA_EA_EACS, adpcm_ima_ea_eacs);
-    REGISTER_DECODER(ADPCM_IMA_EA_SEAD, adpcm_ima_ea_sead);
-    REGISTER_DECODER(ADPCM_IMA_ISS,     adpcm_ima_iss);
-    REGISTER_ENCDEC (ADPCM_IMA_QT,      adpcm_ima_qt);
-    REGISTER_DECODER(ADPCM_IMA_SMJPEG,  adpcm_ima_smjpeg);
-    REGISTER_ENCDEC (ADPCM_IMA_WAV,     adpcm_ima_wav);
-    REGISTER_DECODER(ADPCM_IMA_WS,      adpcm_ima_ws);
-    REGISTER_ENCDEC (ADPCM_MS,          adpcm_ms);
-    REGISTER_DECODER(ADPCM_SBPRO_2,     adpcm_sbpro_2);
-    REGISTER_DECODER(ADPCM_SBPRO_3,     adpcm_sbpro_3);
-    REGISTER_DECODER(ADPCM_SBPRO_4,     adpcm_sbpro_4);
-    REGISTER_ENCDEC (ADPCM_SWF,         adpcm_swf);
-    REGISTER_DECODER(ADPCM_THP,         adpcm_thp);
-    REGISTER_DECODER(ADPCM_VIMA,        adpcm_vima);
-    REGISTER_DECODER(ADPCM_XA,          adpcm_xa);
-    REGISTER_ENCDEC (ADPCM_YAMAHA,      adpcm_yamaha);
-
-    /* subtitles */
-    REGISTER_ENCDEC (ASS,               ass);
-    REGISTER_ENCDEC (DVBSUB,            dvbsub);
-    REGISTER_ENCDEC (DVDSUB,            dvdsub);
-    REGISTER_DECODER(PGSSUB,            pgssub);
-    REGISTER_DECODER(SRT,               srt);
-    REGISTER_ENCDEC (XSUB,              xsub);
-
-    /* external libraries */
-    REGISTER_ENCDEC (LIBAOM_AV1,        libaom_av1);
-    REGISTER_DECODER(LIBDAV1D,          libdav1d)
-    REGISTER_DECODER(LIBDCADEC,         libdcadec)
-    REGISTER_ENCODER(LIBFAAC,           libfaac);
-    REGISTER_ENCDEC (LIBFDK_AAC,        libfdk_aac);
-    REGISTER_ENCDEC (LIBGSM,            libgsm);
-    REGISTER_ENCDEC (LIBGSM_MS,         libgsm_ms);
-    REGISTER_ENCDEC (LIBILBC,           libilbc);
-    REGISTER_ENCODER(LIBMP3LAME,        libmp3lame);
-    REGISTER_ENCDEC (LIBOPENCORE_AMRNB, libopencore_amrnb);
-    REGISTER_DECODER(LIBOPENCORE_AMRWB, libopencore_amrwb);
-    REGISTER_ENCDEC (LIBOPENJPEG,       libopenjpeg);
-    REGISTER_ENCDEC (LIBOPUS,           libopus);
-    REGISTER_ENCDEC (LIBSCHROEDINGER,   libschroedinger);
-    REGISTER_ENCDEC (LIBSPEEX,          libspeex);
-    REGISTER_ENCODER(LIBTHEORA,         libtheora);
-    REGISTER_ENCODER(LIBTWOLAME,        libtwolame);
-    REGISTER_ENCODER(LIBVO_AACENC,      libvo_aacenc);
-    REGISTER_ENCODER(LIBVO_AMRWBENC,    libvo_amrwbenc);
-    REGISTER_ENCODER(LIBVORBIS,         libvorbis);
-    REGISTER_ENCDEC (LIBVPX_VP8,        libvpx_vp8);
-    REGISTER_ENCDEC (LIBVPX_VP9,        libvpx_vp9);
-    REGISTER_ENCODER(LIBWAVPACK,        libwavpack);
-    REGISTER_ENCODER(LIBWEBP,           libwebp);
-    REGISTER_ENCODER(LIBX262,           libx262);
-    REGISTER_ENCODER(LIBX264,           libx264);
-    REGISTER_ENCODER(LIBX265,           libx265);
-    REGISTER_ENCODER(LIBXAVS,           libxavs);
-    REGISTER_ENCODER(LIBXVID,           libxvid);
-
-    /* external libraries, that shouldn't be used by default if one of the
-     * above is available */
-    REGISTER_ENCDEC (LIBOPENH264,       libopenh264);
-    REGISTER_ENCODER(H264_AMF,          h264_amf);
-    REGISTER_ENCODER(H264_NVENC,        h264_nvenc);
-    REGISTER_ENCODER(H264_OMX,          h264_omx);
-    REGISTER_ENCODER(H264_QSV,          h264_qsv);
-    REGISTER_ENCODER(H264_VAAPI,        h264_vaapi);
-    REGISTER_ENCODER(LIBKVAZAAR,        libkvazaar);
-    REGISTER_ENCODER(HEVC_AMF,          hevc_amf);
-    REGISTER_ENCODER(HEVC_NVENC,        hevc_nvenc);
-    REGISTER_ENCODER(HEVC_QSV,          hevc_qsv);
-    REGISTER_ENCODER(HEVC_VAAPI,        hevc_vaapi);
-    REGISTER_ENCODER(MJPEG_QSV,         mjpeg_qsv);
-    REGISTER_ENCODER(MJPEG_VAAPI,       mjpeg_vaapi);
-    REGISTER_ENCODER(MPEG2_QSV,         mpeg2_qsv);
-    REGISTER_ENCODER(MPEG2_VAAPI,       mpeg2_vaapi);
-    REGISTER_ENCODER(MPEG4_OMX,         mpeg4_omx);
-#if FF_API_NVENC_OLD_NAME
-    REGISTER_ENCODER(NVENC_H264,        nvenc_h264);
-    REGISTER_ENCODER(NVENC_HEVC,        nvenc_hevc);
+    ff_thread_once(&av_codec_next_init, av_codec_init_next);
+}
+FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-    REGISTER_ENCODER(VP8_VAAPI,         vp8_vaapi);
-    REGISTER_ENCODER(VP9_VAAPI,         vp9_vaapi);
-
-    /* parsers */
-    REGISTER_PARSER(AAC,                aac);
-    REGISTER_PARSER(AAC_LATM,           aac_latm);
-    REGISTER_PARSER(AC3,                ac3);
-    REGISTER_PARSER(ADX,                adx);
-    REGISTER_PARSER(BMP,                bmp);
-    REGISTER_PARSER(CAVSVIDEO,          cavsvideo);
-    REGISTER_PARSER(COOK,               cook);
-    REGISTER_PARSER(DCA,                dca);
-    REGISTER_PARSER(DIRAC,              dirac);
-    REGISTER_PARSER(DNXHD,              dnxhd);
-    REGISTER_PARSER(DPX,                dpx);
-    REGISTER_PARSER(DVBSUB,             dvbsub);
-    REGISTER_PARSER(DVDSUB,             dvdsub);
-    REGISTER_PARSER(FLAC,               flac);
-    REGISTER_PARSER(GSM,                gsm);
-    REGISTER_PARSER(H261,               h261);
-    REGISTER_PARSER(H263,               h263);
-    REGISTER_PARSER(H264,               h264);
-    REGISTER_PARSER(HEVC,               hevc);
-    REGISTER_PARSER(MJPEG,              mjpeg);
-    REGISTER_PARSER(MLP,                mlp);
-    REGISTER_PARSER(MPEG4VIDEO,         mpeg4video);
-    REGISTER_PARSER(MPEGAUDIO,          mpegaudio);
-    REGISTER_PARSER(MPEGVIDEO,          mpegvideo);
-    REGISTER_PARSER(OPUS,               opus);
-    REGISTER_PARSER(PNG,                png);
-    REGISTER_PARSER(PNM,                pnm);
-    REGISTER_PARSER(RV30,               rv30);
-    REGISTER_PARSER(RV40,               rv40);
-    REGISTER_PARSER(TAK,                tak);
-    REGISTER_PARSER(VC1,                vc1);
-    REGISTER_PARSER(VORBIS,             vorbis);
-    REGISTER_PARSER(VP3,                vp3);
-    REGISTER_PARSER(VP8,                vp8);
+
+static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
+{
+    switch(id){
+        //This is for future deprecatec codec ids, its empty since
+        //last major bump but will fill up again over time, please don't remove it
+        default                                         : return id;
+    }
+}
+
+static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
+{
+    const AVCodec *p, *experimental = NULL;
+    void *i = 0;
+
+    id = remap_deprecated_codec_id(id);
+
+    while ((p = av_codec_iterate(&i))) {
+        if (!x(p))
+            continue;
+        if (p->id == id) {
+            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
+                experimental = p;
+            } else
+                return (AVCodec*)p;
+        }
+    }
+
+    return (AVCodec*)experimental;
+}
+
+AVCodec *avcodec_find_encoder(enum AVCodecID id)
+{
+    return find_codec(id, av_codec_is_encoder);
+}
+
+AVCodec *avcodec_find_decoder(enum AVCodecID id)
+{
+    return find_codec(id, av_codec_is_decoder);
+}
+
+static AVCodec *find_codec_by_name(const char *name, int (*x)(const AVCodec *))
+{
+    void *i = 0;
+    const AVCodec *p;
+
+    if (!name)
+        return NULL;
+
+    while ((p = av_codec_iterate(&i))) {
+        if (!x(p))
+            continue;
+        if (strcmp(name, p->name) == 0)
+            return (AVCodec*)p;
+    }
+
+    return NULL;
+}
+
+AVCodec *avcodec_find_encoder_by_name(const char *name)
+{
+    return find_codec_by_name(name, av_codec_is_encoder);
+}
+
+AVCodec *avcodec_find_decoder_by_name(const char *name)
+{
+    return find_codec_by_name(name, av_codec_is_decoder);
 }
diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile
new file mode 100644
index 0000000..796d976
--- /dev/null
+++ b/libavcodec/alpha/Makefile
@@ -0,0 +1,10 @@
+OBJS-$(CONFIG_BLOCKDSP)                 += alpha/blockdsp_alpha.o
+OBJS-$(CONFIG_ME_CMP)                   += alpha/me_cmp_alpha.o         \
+                                           alpha/me_cmp_mvi_asm.o
+OBJS-$(CONFIG_HPELDSP)                  += alpha/hpeldsp_alpha.o        \
+                                           alpha/hpeldsp_alpha_asm.o
+OBJS-$(CONFIG_IDCTDSP)                  += alpha/idctdsp_alpha.o        \
+                                           alpha/idctdsp_alpha_asm.o    \
+                                           alpha/simple_idct_alpha.o
+OBJS-$(CONFIG_MPEGVIDEO)                += alpha/mpegvideo_alpha.o
+OBJS-$(CONFIG_PIXBLOCKDSP)              += alpha/pixblockdsp_alpha.o
diff --git a/libavcodec/alpha/asm.h b/libavcodec/alpha/asm.h
new file mode 100644
index 0000000..6d850ce
--- /dev/null
+++ b/libavcodec/alpha/asm.h
@@ -0,0 +1,153 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_ASM_H
+#define AVCODEC_ALPHA_ASM_H
+
+#include <inttypes.h>
+
+#include "libavutil/common.h"
+
+#if AV_GCC_VERSION_AT_LEAST(2,96)
+# define likely(x)      __builtin_expect((x) != 0, 1)
+# define unlikely(x)    __builtin_expect((x) != 0, 0)
+#else
+# define likely(x)      (x)
+# define unlikely(x)    (x)
+#endif
+
+#define AMASK_BWX (1 << 0)
+#define AMASK_FIX (1 << 1)
+#define AMASK_CIX (1 << 2)
+#define AMASK_MVI (1 << 8)
+
+static inline uint64_t BYTE_VEC(uint64_t x)
+{
+    x |= x <<  8;
+    x |= x << 16;
+    x |= x << 32;
+    return x;
+}
+static inline uint64_t WORD_VEC(uint64_t x)
+{
+    x |= x << 16;
+    x |= x << 32;
+    return x;
+}
+
+#define sextw(x) ((int16_t) (x))
+
+#ifdef __GNUC__
+#define ldq(p)                                                  \
+    (((const union {                                            \
+        uint64_t __l;                                           \
+        __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)];  \
+    } *) (p))->__l)
+#define ldl(p)                                                  \
+    (((const union {                                            \
+        int32_t __l;                                            \
+        __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)];   \
+    } *) (p))->__l)
+#define stq(l, p)                                                       \
+    do {                                                                \
+        (((union {                                                      \
+            uint64_t __l;                                               \
+            __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)];      \
+        } *) (p))->__l) = l;                                            \
+    } while (0)
+#define stl(l, p)                                                       \
+    do {                                                                \
+        (((union {                                                      \
+            int32_t __l;                                                \
+            __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)];       \
+        } *) (p))->__l) = l;                                            \
+    } while (0)
+struct unaligned_long { uint64_t l; } __attribute__((packed));
+#define ldq_u(p)        (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
+#define uldq(a)         (((const struct unaligned_long *) (a))->l)
+
+#if AV_GCC_VERSION_AT_LEAST(3,3)
+#define prefetch(p)     __builtin_prefetch((p), 0, 1)
+#define prefetch_en(p)  __builtin_prefetch((p), 0, 0)
+#define prefetch_m(p)   __builtin_prefetch((p), 1, 1)
+#define prefetch_men(p) __builtin_prefetch((p), 1, 0)
+#define cmpbge          __builtin_alpha_cmpbge
+/* Avoid warnings.  */
+#define extql(a, b)     __builtin_alpha_extql(a, (uint64_t) (b))
+#define extwl(a, b)     __builtin_alpha_extwl(a, (uint64_t) (b))
+#define extqh(a, b)     __builtin_alpha_extqh(a, (uint64_t) (b))
+#define zap             __builtin_alpha_zap
+#define zapnot          __builtin_alpha_zapnot
+#define amask           __builtin_alpha_amask
+#define implver         __builtin_alpha_implver
+#define rpcc            __builtin_alpha_rpcc
+#else
+#define prefetch(p)     __asm__ volatile("ldl $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_en(p)  __asm__ volatile("ldq $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_m(p)   __asm__ volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_men(p) __asm__ volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define cmpbge(a, b) ({ uint64_t __r; __asm__ ("cmpbge  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extql(a, b)  ({ uint64_t __r; __asm__ ("extql   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extwl(a, b)  ({ uint64_t __r; __asm__ ("extwl   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extqh(a, b)  ({ uint64_t __r; __asm__ ("extqh   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zap(a, b)    ({ uint64_t __r; __asm__ ("zap     %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; __asm__ ("zapnot  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define amask(a)     ({ uint64_t __r; __asm__ ("amask   %1,%0"      : "=r" (__r) : "rI"  (a));           __r; })
+#define implver()    ({ uint64_t __r; __asm__ ("implver %0"         : "=r" (__r));                       __r; })
+#define rpcc()       ({ uint64_t __r; __asm__ volatile ("rpcc %0"   : "=r" (__r));                       __r; })
+#endif
+#define wh64(p) __asm__ volatile("wh64 (%0)" : : "r"(p) : "memory")
+
+#if AV_GCC_VERSION_AT_LEAST(3,3) && defined(__alpha_max__)
+#define minub8  __builtin_alpha_minub8
+#define minsb8  __builtin_alpha_minsb8
+#define minuw4  __builtin_alpha_minuw4
+#define minsw4  __builtin_alpha_minsw4
+#define maxub8  __builtin_alpha_maxub8
+#define maxsb8  __builtin_alpha_maxsb8
+#define maxuw4  __builtin_alpha_maxuw4
+#define maxsw4  __builtin_alpha_maxsw4
+#define perr    __builtin_alpha_perr
+#define pklb    __builtin_alpha_pklb
+#define pkwb    __builtin_alpha_pkwb
+#define unpkbl  __builtin_alpha_unpkbl
+#define unpkbw  __builtin_alpha_unpkbw
+#else
+#define minub8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b)   ({ uint64_t __r; __asm__ (".arch ev6; perr    %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a)      ({ uint64_t __r; __asm__ (".arch ev6; pklb    %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define pkwb(a)      ({ uint64_t __r; __asm__ (".arch ev6; pkwb    %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define unpkbl(a)    ({ uint64_t __r; __asm__ (".arch ev6; unpkbl  %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define unpkbw(a)    ({ uint64_t __r; __asm__ (".arch ev6; unpkbw  %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#endif
+
+#else
+#error "Unknown compiler!"
+#endif
+
+#endif /* AVCODEC_ALPHA_ASM_H */
diff --git a/libavcodec/alpha/blockdsp_alpha.c b/libavcodec/alpha/blockdsp_alpha.c
new file mode 100644
index 0000000..c6f0964
--- /dev/null
+++ b/libavcodec/alpha/blockdsp_alpha.c
@@ -0,0 +1,49 @@
+/*
+ * Alpha optimised block operations
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/blockdsp.h"
+#include "asm.h"
+
+static void clear_blocks_axp(int16_t *blocks) {
+    uint64_t *p = (uint64_t *) blocks;
+    int n = sizeof(int16_t) * 6 * 64;
+
+    do {
+        p[0] = 0;
+        p[1] = 0;
+        p[2] = 0;
+        p[3] = 0;
+        p[4] = 0;
+        p[5] = 0;
+        p[6] = 0;
+        p[7] = 0;
+        p += 8;
+        n -= 8 * 8;
+    } while (n);
+}
+
+av_cold void ff_blockdsp_init_alpha(BlockDSPContext *c)
+{
+    c->clear_blocks = clear_blocks_axp;
+}
diff --git a/libavcodec/alpha/hpeldsp_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c
new file mode 100644
index 0000000..8d54807
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha.c
@@ -0,0 +1,213 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/hpeldsp.h"
+#include "hpeldsp_alpha.h"
+#include "asm.h"
+
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
+{
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration.  */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+#endif
+
+#define OP(LOAD, STORE)                         \
+    do {                                        \
+        STORE(LOAD(pixels), block);             \
+        pixels += line_size;                    \
+        block += line_size;                     \
+    } while (--h)
+
+#define OP_X2(LOAD, STORE)                                      \
+    do {                                                        \
+        uint64_t pix1, pix2;                                    \
+                                                                \
+        pix1 = LOAD(pixels);                                    \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
+        STORE(AVG2(pix1, pix2), block);                         \
+        pixels += line_size;                                    \
+        block += line_size;                                     \
+    } while (--h)
+
+#define OP_Y2(LOAD, STORE)                      \
+    do {                                        \
+        uint64_t pix = LOAD(pixels);            \
+        do {                                    \
+            uint64_t next_pix;                  \
+                                                \
+            pixels += line_size;                \
+            next_pix = LOAD(pixels);            \
+            STORE(AVG2(pix, next_pix), block);  \
+            block += line_size;                 \
+            pix = next_pix;                     \
+        } while (--h);                          \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE)                                                 \
+    do {                                                                    \
+        uint64_t pix1 = LOAD(pixels);                                       \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
+                       + (pix2 & BYTE_VEC(0x03));                           \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
+                                                                            \
+        do {                                                                \
+            uint64_t npix1, npix2;                                          \
+            uint64_t npix_l, npix_h;                                        \
+            uint64_t avg;                                                   \
+                                                                            \
+            pixels += line_size;                                            \
+            npix1 = LOAD(pixels);                                           \
+            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
+            npix_l = (npix1 & BYTE_VEC(0x03))                               \
+                   + (npix2 & BYTE_VEC(0x03));                              \
+            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
+                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
+            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+                + pix_h + npix_h;                                           \
+            STORE(avg, block);                                              \
+                                                                            \
+            block += line_size;                                             \
+            pix_l = npix_l;                                                 \
+            pix_h = npix_h;                                                 \
+        } while (--h);                                                      \
+    } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
+static void OPNAME ## _pixels ## SUFF ## _axp                               \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    if ((size_t) pixels & 0x7) {                                            \
+        OPKIND(uldq, STORE);                                                \
+    } else {                                                                \
+        OPKIND(ldq, STORE);                                                 \
+    }                                                                       \
+}                                                                           \
+                                                                            \
+static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
+    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
+}
+
+#define PIXOP(OPNAME, STORE)                    \
+    MAKE_OP(OPNAME, ,     OP,     STORE)        \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives.  */
+#define AVG2 avg2
+#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg, STORE);
+
+/* Not rounding primitives.  */
+#undef AVG2
+#undef AVG4
+#undef AVG4_ROUNDER
+#undef STORE
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
+
+static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
+                                 ptrdiff_t line_size, int h)
+{
+    put_pixels_axp_asm(block,     pixels,     line_size, h);
+    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
+}
+
+av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
+
+    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
+    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
+    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
+    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
+
+    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
+    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
+    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
+
+    c->avg_pixels_tab[1][0] = avg_pixels_axp;
+    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
+    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
+    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
+}
diff --git a/libavcodec/alpha/hpeldsp_alpha.h b/libavcodec/alpha/hpeldsp_alpha.h
new file mode 100644
index 0000000..985182c
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S b/libavcodec/alpha/hpeldsp_alpha_asm.S
new file mode 100644
index 0000000..df386c4
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,125 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        and     a1, 7, t0
+        beq     t0, $aligned
+
+        .align 4
+$unaligned:
+        ldq_u   t0, 0(a1)
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t2, 0(a1)
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0
+        addq    a1, a2, a1
+
+        extqh   t1, a1, t1
+        addq    a0, a2, t8
+        extql   t2, a1, t2
+        addq    t8, a2, t9
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta
+        extql   t4, a1, t4
+        or      t0, t1, t0
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2
+        extql   t6, a1, t6
+        or      t4, t5, t4
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6
+        stq     t0, 0(a0)
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3
+        stq     t6, 0(ta)
+        addq    ta, a2, a0
+
+        bne     a3, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        ldq     t0, 0(a1)
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+
+        addq    a0, a2, t4
+        addq    a1, a2, a1
+        addq    t4, a2, t5
+        subq    a3, 4, a3
+
+        stq     t0, 0(a0)
+        addq    t5, a2, t6
+        stq     t1, 0(t4)
+        addq    t6, a2, a0
+
+        stq     t2, 0(t5)
+        stq     t3, 0(t6)
+
+        bne     a3, $aligned
+        ret
+        .end put_pixels_axp_asm
diff --git a/libavcodec/alpha/idctdsp_alpha.c b/libavcodec/alpha/idctdsp_alpha.c
new file mode 100644
index 0000000..bd43842
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_alpha.h"
+#include "asm.h"
+
+void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+
+void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                 ptrdiff_t line_size);
+void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                 ptrdiff_t line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes.  */
+static void put_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
+                                   ptrdiff_t line_size)
+{
+    int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+
+    do {
+        uint64_t shorts0, shorts1;
+
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
+
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--i);
+}
+
+void add_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
+                            ptrdiff_t line_size)
+{
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-).  */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask  = zap(-1, 0x33);
+    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
+
+    do {
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0    = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw).  */
+        signs0  = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4.  */
+        pix1    = unpkbw(ldl(pixels + 4));
+        signs1  = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+#endif
+
+av_cold void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                                   unsigned high_bit_depth)
+{
+    /* amask clears all bits that correspond to present features.  */
+    if (amask(AMASK_MVI) == 0) {
+        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
+    }
+
+    put_pixels_clamped_axp_p = c->put_pixels_clamped;
+    add_pixels_clamped_axp_p = c->add_pixels_clamped;
+
+    if (!high_bit_depth && !avctx->lowres &&
+        (avctx->idct_algo == FF_IDCT_AUTO)) {
+        c->idct_put = ff_simple_idct_put_axp;
+        c->idct_add = ff_simple_idct_add_axp;
+        c->idct =     ff_simple_idct_axp;
+    }
+}
diff --git a/libavcodec/alpha/idctdsp_alpha.h b/libavcodec/alpha/idctdsp_alpha.h
new file mode 100644
index 0000000..8cc969d
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_IDCTDSP_ALPHA_H
+#define AVCODEC_ALPHA_IDCTDSP_ALPHA_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                        ptrdiff_t line_size);
+extern void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                        ptrdiff_t line_size);
+
+void ff_simple_idct_axp(int16_t *block);
+void ff_simple_idct_put_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+#endif /* AVCODEC_ALPHA_IDCTDSP_ALPHA_H */
diff --git a/libavcodec/alpha/idctdsp_alpha_asm.S b/libavcodec/alpha/idctdsp_alpha_asm.S
new file mode 100644
index 0000000..f545df9
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha_asm.S
@@ -0,0 +1,167 @@
+/*
+ * Alpha optimized IDCT-related routines
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+ *                                 ptrdiff_t line_size)
+ */
+        .align 6
+        .globl put_pixels_clamped_mvi_asm
+        .ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        lda     t8, -1
+        lda     t9, 8           # loop counter
+        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff
+
+        .align 4
+1:      ldq     t0,  0(a0)
+        ldq     t1,  8(a0)
+        ldq     t2, 16(a0)
+        ldq     t3, 24(a0)
+
+        maxsw4  t0, zero, t0
+        subq    t9, 2, t9
+        maxsw4  t1, zero, t1
+        lda     a0, 32(a0)
+
+        maxsw4  t2, zero, t2
+        addq    a1, a2, ta
+        maxsw4  t3, zero, t3
+        minsw4  t0, t8, t0
+
+        minsw4  t1, t8, t1
+        minsw4  t2, t8, t2
+        minsw4  t3, t8, t3
+        pkwb    t0, t0
+
+        pkwb    t1, t1
+        pkwb    t2, t2
+        pkwb    t3, t3
+        stl     t0, 0(a1)
+
+        stl     t1, 4(a1)
+        addq    ta, a2, a1
+        stl     t2, 0(ta)
+        stl     t3, 4(ta)
+
+        bne     t9, 1b
+        ret
+        .end put_pixels_clamped_mvi_asm
+
+/************************************************************************
+ * void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+ *                                 ptrdiff_t line_size)
+ */
+        .align 6
+        .globl add_pixels_clamped_mvi_asm
+        .ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        lda     t1, -1
+        lda     th, 8
+        zap     t1, 0x33, tg
+        nop
+
+        srl     tg, 1, t0
+        xor     tg, t0, tg      # 0x8000800080008000
+        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff
+
+        .align 4
+1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
+        ldl     t4, 4(a1)       # pix1
+        addq    a1, a2, te      # pixels += line_size
+        ldq     t0, 0(a0)       # shorts0
+
+        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
+        ldl     ta, 4(te)       # pix3
+        ldq     t3, 8(a0)       # shorts1
+        ldq     t6, 16(a0)      # shorts2
+
+        ldq     t9, 24(a0)      # shorts3
+        unpkbw  t1, t1          # 0 0 (quarter/op no.)
+        and     t0, tg, t2      # 0 1
+        unpkbw  t4, t4          # 1 0
+
+        bic     t0, tg, t0      # 0 2
+        unpkbw  t7, t7          # 2 0
+        and     t3, tg, t5      # 1 1
+        addq    t0, t1, t0      # 0 3
+
+        xor     t0, t2, t0      # 0 4
+        unpkbw  ta, ta          # 3 0
+        and     t6, tg, t8      # 2 1
+        maxsw4  t0, zero, t0    # 0 5
+
+        bic     t3, tg, t3      # 1 2
+        bic     t6, tg, t6      # 2 2
+        minsw4  t0, tf, t0      # 0 6
+        addq    t3, t4, t3      # 1 3
+
+        pkwb    t0, t0          # 0 7
+        xor     t3, t5, t3      # 1 4
+        maxsw4  t3, zero, t3    # 1 5
+        addq    t6, t7, t6      # 2 3
+
+        xor     t6, t8, t6      # 2 4
+        and     t9, tg, tb      # 3 1
+        minsw4  t3, tf, t3      # 1 6
+        bic     t9, tg, t9      # 3 2
+
+        maxsw4  t6, zero, t6    # 2 5
+        addq    t9, ta, t9      # 3 3
+        stl     t0, 0(a1)       # 0 8
+        minsw4  t6, tf, t6      # 2 6
+
+        xor     t9, tb, t9      # 3 4
+        maxsw4  t9, zero, t9    # 3 5
+        lda     a0, 32(a0)      # block += 16;
+        pkwb    t3, t3          # 1 7
+
+        minsw4  t9, tf, t9      # 3 6
+        subq    th, 2, th
+        pkwb    t6, t6          # 2 7
+        pkwb    t9, t9          # 3 7
+
+        stl     t3, 4(a1)       # 1 8
+        addq    te, a2, a1      # pixels += line_size
+        stl     t6, 0(te)       # 2 8
+        stl     t9, 4(te)       # 3 8
+
+        bne     th, 1b
+        ret
+        .end add_pixels_clamped_mvi_asm
diff --git a/libavcodec/alpha/me_cmp_alpha.c b/libavcodec/alpha/me_cmp_alpha.c
new file mode 100644
index 0000000..8f36019
--- /dev/null
+++ b/libavcodec/alpha/me_cmp_alpha.c
@@ -0,0 +1,317 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/me_cmp.h"
+#include "asm.h"
+
+int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+
+static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 8 pixel a time */
+            uint64_t p1, p2;
+
+            p1  = ldq(pix1);
+            p2  = uldq(pix2);
+            result += perr(p1, p2);
+
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1, p2;
+
+            p1 = ldq(pix1);
+            p2 = ldq(pix2);
+            result += perr(p1, p2);
+
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    }
+
+    return result;
+}
+
+#if 0                           /* now done in assembly */
+int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 16 pixel a time */
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t t;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            t     = ldq_u(pix2 + 8);
+            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            p2_l = ldq(pix2);
+            p2_r = ldq(pix2 + 8);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    }
+
+    return result;
+}
+#endif
+
+static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+    uint64_t disalign = (size_t) pix2 & 0x7;
+
+    switch (disalign) {
+    case 0:
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l    = ldq(pix2);
+            r    = ldq(pix2 + 8);
+            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
+            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    case 7:
+        /* |.......l|lllllllr|rrrrrrr*|
+           This case is special because disalign1 would be 8, which
+           gets treated as 0 by extqh.  At least it is a bit faster
+           that way :)  */
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, m, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l     = ldq_u(pix2);
+            m     = ldq_u(pix2 + 8);
+            r     = ldq_u(pix2 + 16);
+            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);
+            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    default:
+        do {
+            uint64_t disalign1 = disalign + 1;
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, m, r;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            l     = ldq_u(pix2);
+            m     = ldq_u(pix2 + 8);
+            r     = ldq_u(pix2 + 16);
+            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
+                         extql(l, disalign1) | extqh(m, disalign1));
+            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
+                         extql(m, disalign1) | extqh(r, disalign1));
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    }
+    return result;
+}
+
+static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+
+    if ((size_t) pix2 & 0x7) {
+        uint64_t t, p2_l, p2_r;
+        t     = ldq_u(pix2 + 8);
+        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+        do {
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+            uint64_t t;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            pix2 += line_size;
+            t     = ldq_u(pix2 + 8);
+            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l  = np2_l;
+            p2_r  = np2_r;
+
+        } while (--h);
+    } else {
+        uint64_t p2_l, p2_r;
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        do {
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            pix2 += line_size;
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l  = np2_l;
+            p2_r  = np2_r;
+        } while (--h);
+    }
+    return result;
+}
+
+static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+
+    uint64_t p1_l, p1_r;
+    uint64_t p2_l, p2_r, p2_x;
+
+    p1_l = ldq(pix1);
+    p1_r = ldq(pix1 + 8);
+
+    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+        p2_l = uldq(pix2);
+        p2_r = uldq(pix2 + 8);
+        p2_x = (uint64_t) pix2[16] << 56;
+    } else {
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        p2_x = ldq(pix2 + 16) << 56;
+    }
+
+    do {
+        uint64_t np1_l, np1_r;
+        uint64_t np2_l, np2_r, np2_x;
+
+        pix1 += line_size;
+        pix2 += line_size;
+
+        np1_l = ldq(pix1);
+        np1_r = ldq(pix1 + 8);
+
+        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+            np2_l = uldq(pix2);
+            np2_r = uldq(pix2 + 8);
+            np2_x = (uint64_t) pix2[16] << 56;
+        } else {
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+            np2_x = ldq(pix2 + 16) << 56;
+        }
+
+        result += perr(p1_l,
+                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
+                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
+                + perr(p1_r,
+                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
+                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));
+
+        p1_l = np1_l;
+        p1_r = np1_r;
+        p2_l = np2_l;
+        p2_r = np2_r;
+        p2_x = np2_x;
+    } while (--h);
+
+    return result;
+}
+
+av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
+{
+    /* amask clears all bits that correspond to present features.  */
+    if (amask(AMASK_MVI) == 0) {
+        c->sad[0]           = pix_abs16x16_mvi_asm;
+        c->sad[1]           = pix_abs8x8_mvi;
+        c->pix_abs[0][0]    = pix_abs16x16_mvi_asm;
+        c->pix_abs[1][0]    = pix_abs8x8_mvi;
+        c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
+        c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
+        c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
+    }
+}
diff --git a/libavcodec/alpha/me_cmp_mvi_asm.S b/libavcodec/alpha/me_cmp_mvi_asm.S
new file mode 100644
index 0000000..2399085
--- /dev/null
+++ b/libavcodec/alpha/me_cmp_mvi_asm.S
@@ -0,0 +1,179 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "regdef.h"
+
+/* Some nicer register names.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/*****************************************************************************
+ * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
+ *
+ * This code is written with a pca56 in mind. For ev6, one should
+ * really take the increased latency of 3 cycles for MVI instructions
+ * into account.
+ *
+ * It is important to keep the loading and first use of a register as
+ * far apart as possible, because if a register is accessed before it
+ * has been fetched from memory, the CPU will stall.
+ */
+        .align 4
+        .globl pix_abs16x16_mvi_asm
+        .ent pix_abs16x16_mvi_asm
+pix_abs16x16_mvi_asm:
+        .frame sp, 0, ra, 0
+        .prologue 0
+
+        and     a2, 7, t0
+        clr     v0
+        beq     t0, $aligned
+        .align 4
+$unaligned:
+        /* Registers:
+           line 0:
+           t0:  left_u -> left lo -> left
+           t1:  mid
+           t2:  right_u -> right hi -> right
+           t3:  ref left
+           t4:  ref right
+           line 1:
+           t5:  left_u -> left lo -> left
+           t6:  mid
+           t7:  right_u -> right hi -> right
+           t8:  ref left
+           t9:  ref right
+           temp:
+           ta:  left hi
+           tb:  right lo
+           tc:  error left
+           td:  error right  */
+
+        /* load line 0 */
+        ldq_u   t0, 0(a2)       # left_u
+        ldq_u   t1, 8(a2)       # mid
+        ldq_u   t2, 16(a2)      # right_u
+        ldq     t3, 0(a1)       # ref left
+        ldq     t4, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        addq    a2, a3, a2      # pix2
+        /* load line 1 */
+        ldq_u   t5, 0(a2)       # left_u
+        ldq_u   t6, 8(a2)       # mid
+        ldq_u   t7, 16(a2)      # right_u
+        ldq     t8, 0(a1)       # ref left
+        ldq     t9, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        addq    a2, a3, a2      # pix2
+        /* calc line 0 */
+        extql   t0, a2, t0      # left lo
+        extqh   t1, a2, ta      # left hi
+        extql   t1, a2, tb      # right lo
+        or      t0, ta, t0      # left
+        extqh   t2, a2, t2      # right hi
+        perr    t3, t0, tc      # error left
+        or      t2, tb, t2      # right
+        perr    t4, t2, td      # error right
+        addq    v0, tc, v0      # add error left
+        addq    v0, td, v0      # add error left
+        /* calc line 1 */
+        extql   t5, a2, t5      # left lo
+        extqh   t6, a2, ta      # left hi
+        extql   t6, a2, tb      # right lo
+        or      t5, ta, t5      # left
+        extqh   t7, a2, t7      # right hi
+        perr    t8, t5, tc      # error left
+        or      t7, tb, t7      # right
+        perr    t9, t7, td      # error right
+        addq    v0, tc, v0      # add error left
+        addq    v0, td, v0      # add error left
+        /* loop */
+        subq    a4,  2, a4      # h -= 2
+        bne     a4, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        /* load line 0 */
+        ldq     t0, 0(a2)       # left
+        ldq     t1, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     t2, 0(a1)       # ref left
+        ldq     t3, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 1 */
+        ldq     t4, 0(a2)       # left
+        ldq     t5, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     t6, 0(a1)       # ref left
+        ldq     t7, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 2 */
+        ldq     t8, 0(a2)       # left
+        ldq     t9, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     ta, 0(a1)       # ref left
+        ldq     tb, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 3 */
+        ldq     tc, 0(a2)       # left
+        ldq     td, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     te, 0(a1)       # ref left
+        ldq     a0, 8(a1)       # ref right
+        /* calc line 0 */
+        perr    t0, t2, t0      # error left
+        addq    a1, a3, a1      # pix1
+        perr    t1, t3, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 1 */
+        perr    t4, t6, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    t5, t7, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 2 */
+        perr    t8, ta, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    t9, tb, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 3 */
+        perr    tc, te, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    td, a0, t1      # error right
+        addq    v0, t0, v0      # add error left
+        addq    v0, t1, v0      # add error right
+        /* loop */
+        subq    a4,  4, a4      # h -= 4
+        bne     a4, $aligned
+        ret
+        .end pix_abs16x16_mvi_asm
diff --git a/libavcodec/alpha/mpegvideo_alpha.c b/libavcodec/alpha/mpegvideo_alpha.c
new file mode 100644
index 0000000..126fe26
--- /dev/null
+++ b/libavcodec/alpha/mpegvideo_alpha.c
@@ -0,0 +1,110 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/mpegvideo.h"
+#include "asm.h"
+
+static void dct_unquantize_h263_axp(int16_t *block, int n_coeffs,
+                                    uint64_t qscale, uint64_t qadd)
+{
+    uint64_t qmul = qscale << 1;
+    uint64_t correction = WORD_VEC(qmul * 255 >> 8);
+    int i;
+
+    qadd = WORD_VEC(qadd);
+
+    for(i = 0; i <= n_coeffs; block += 4, i += 4) {
+        uint64_t levels, negmask, zeros, add, sub;
+
+        levels = ldq(block);
+        if (levels == 0)
+            continue;
+
+#ifdef __alpha_max__
+        /* I don't think the speed difference justifies runtime
+           detection.  */
+        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
+        negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+#else
+        negmask = cmpbge(WORD_VEC(0x7fff), levels);
+        negmask &= (negmask >> 1) | (1 << 7);
+        negmask = zap(-1, negmask);
+#endif
+
+        zeros = cmpbge(0, levels);
+        zeros &= zeros >> 1;
+        /* zeros |= zeros << 1 is not needed since qadd <= 255, so
+           zapping the lower byte suffices.  */
+
+        levels *= qmul;
+        levels -= correction & (negmask << 16);
+
+        add = qadd & ~negmask;
+        sub = qadd &  negmask;
+        /* Set qadd to 0 for levels == 0.  */
+        add = zap(add, zeros);
+        levels += add;
+        levels -= sub;
+
+        stq(levels, block);
+    }
+}
+
+static void dct_unquantize_h263_intra_axp(MpegEncContext *s, int16_t *block,
+                                    int n, int qscale)
+{
+    int n_coeffs;
+    uint64_t qadd;
+    int16_t block0 = block[0];
+
+    if (!s->h263_aic) {
+        if (n < 4)
+            block0 *= s->y_dc_scale;
+        else
+            block0 *= s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    } else {
+        qadd = 0;
+    }
+
+    if(s->ac_pred)
+        n_coeffs = 63;
+    else
+        n_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_axp(block, n_coeffs, qscale, qadd);
+
+    block[0] = block0;
+}
+
+static void dct_unquantize_h263_inter_axp(MpegEncContext *s, int16_t *block,
+                                    int n, int qscale)
+{
+    int n_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+    dct_unquantize_h263_axp(block, n_coeffs, qscale, (qscale - 1) | 1);
+}
+
+av_cold void ff_mpv_common_init_axp(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_axp;
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_axp;
+}
diff --git a/libavcodec/alpha/pixblockdsp_alpha.c b/libavcodec/alpha/pixblockdsp_alpha.c
new file mode 100644
index 0000000..c2f1a1d
--- /dev/null
+++ b/libavcodec/alpha/pixblockdsp_alpha.c
@@ -0,0 +1,79 @@
+/*
+ * SIMD-optimized pixel operations
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/pixblockdsp.h"
+#include "asm.h"
+
+static void get_pixels_mvi(int16_t *restrict block,
+                           const uint8_t *restrict pixels, ptrdiff_t stride)
+{
+    int h = 8;
+
+    do {
+        uint64_t p;
+
+        p = ldq(pixels);
+        stq(unpkbw(p),       block);
+        stq(unpkbw(p >> 32), block + 4);
+
+        pixels += stride;
+        block += 8;
+    } while (--h);
+}
+
+static void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                            ptrdiff_t stride)
+{
+    int h = 8;
+    uint64_t mask = 0x4040;
+
+    mask |= mask << 16;
+    mask |= mask << 32;
+    do {
+        uint64_t x, y, c, d, a;
+        uint64_t signs;
+
+        x = ldq(s1);
+        y = ldq(s2);
+        c = cmpbge(x, y);
+        d = x - y;
+        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
+        d += 4 * a;             /* ...so we can use s4addq here.      */
+        signs = zap(-1, c);
+
+        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
+        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+    } while (--h);
+}
+
+av_cold void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                                       unsigned high_bit_depth)
+{
+    if (amask(AMASK_MVI) == 0) {
+        if (!high_bit_depth)
+            c->get_pixels = get_pixels_mvi;
+        c->diff_pixels = diff_pixels_mvi;
+    }
+}
diff --git a/libavcodec/alpha/regdef.h b/libavcodec/alpha/regdef.h
new file mode 100644
index 0000000..f05577a
--- /dev/null
+++ b/libavcodec/alpha/regdef.h
@@ -0,0 +1,77 @@
+/*
+ * Alpha optimized DSP utils
+ * copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Some BSDs don't seem to have regdef.h... sigh  */
+#ifndef AVCODEC_ALPHA_REGDEF_H
+#define AVCODEC_ALPHA_REGDEF_H
+
+#define v0      $0      /* function return value */
+
+#define t0      $1      /* temporary registers (caller-saved) */
+#define t1      $2
+#define t2      $3
+#define t3      $4
+#define t4      $5
+#define t5      $6
+#define t6      $7
+#define t7      $8
+
+#define s0      $9      /* saved-registers (callee-saved registers) */
+#define s1      $10
+#define s2      $11
+#define s3      $12
+#define s4      $13
+#define s5      $14
+#define s6      $15
+#define fp      s6      /* frame-pointer (s6 in frame-less procedures) */
+
+#define a0      $16     /* argument registers (caller-saved) */
+#define a1      $17
+#define a2      $18
+#define a3      $19
+#define a4      $20
+#define a5      $21
+
+#define t8      $22     /* more temps (caller-saved) */
+#define t9      $23
+#define t10     $24
+#define t11     $25
+#define ra      $26     /* return address register */
+#define t12     $27
+
+#define pv      t12     /* procedure-variable register */
+#define AT      $at     /* assembler temporary */
+#define gp      $29     /* global pointer */
+#define sp      $30     /* stack pointer */
+#define zero    $31     /* reads as zero, writes are noops */
+
+/* Some nicer register names.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+#endif /* AVCODEC_ALPHA_REGDEF_H */
diff --git a/libavcodec/alpha/simple_idct_alpha.c b/libavcodec/alpha/simple_idct_alpha.c
new file mode 100644
index 0000000..6e377ef
--- /dev/null
+++ b/libavcodec/alpha/simple_idct_alpha.c
@@ -0,0 +1,303 @@
+/*
+ * Simple IDCT (Alpha optimized)
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * based upon some outcommented C code from mpeg2dec (idct_mmx.c
+ * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
+ *
+ * Alpha optimizations by Måns Rullgård <mans@mansr.com>
+ *                     and Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_alpha.h"
+#include "asm.h"
+
+// cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
+// W4 is actually exactly 16384, but using 16383 works around
+// accumulating rounding errors for some encoders
+#define W1 22725
+#define W2 21407
+#define W3 19266
+#define W4 16383
+#define W5 12873
+#define W6  8867
+#define W7  4520
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
+static inline int idct_row(int16_t *row)
+{
+    int a0, a1, a2, a3, b0, b1, b2, b3, t;
+    uint64_t l, r, t2;
+    l = ldq(row);
+    r = ldq(row + 4);
+
+    if (l == 0 && r == 0)
+        return 0;
+
+    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
+
+    if (((l & ~0xffffUL) | r) == 0) {
+        a0 >>= ROW_SHIFT;
+        t2 = (uint16_t) a0;
+        t2 |= t2 << 16;
+        t2 |= t2 << 32;
+
+        stq(t2, row);
+        stq(t2, row + 4);
+        return 1;
+    }
+
+    a1 = a0;
+    a2 = a0;
+    a3 = a0;
+
+    t = extwl(l, 4);            /* row[2] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W2 * t;
+        a1 += W6 * t;
+        a2 -= W6 * t;
+        a3 -= W2 * t;
+    }
+
+    t = extwl(r, 0);            /* row[4] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W4 * t;
+        a1 -= W4 * t;
+        a2 -= W4 * t;
+        a3 += W4 * t;
+    }
+
+    t = extwl(r, 4);            /* row[6] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W6 * t;
+        a1 -= W2 * t;
+        a2 += W2 * t;
+        a3 -= W6 * t;
+    }
+
+    t = extwl(l, 2);            /* row[1] */
+    if (t != 0) {
+        t = sextw(t);
+        b0 = W1 * t;
+        b1 = W3 * t;
+        b2 = W5 * t;
+        b3 = W7 * t;
+    } else {
+        b0 = 0;
+        b1 = 0;
+        b2 = 0;
+        b3 = 0;
+    }
+
+    t = extwl(l, 6);            /* row[3] */
+    if (t) {
+        t = sextw(t);
+        b0 += W3 * t;
+        b1 -= W7 * t;
+        b2 -= W1 * t;
+        b3 -= W5 * t;
+    }
+
+
+    t = extwl(r, 2);            /* row[5] */
+    if (t) {
+        t = sextw(t);
+        b0 += W5 * t;
+        b1 -= W1 * t;
+        b2 += W7 * t;
+        b3 += W3 * t;
+    }
+
+    t = extwl(r, 6);            /* row[7] */
+    if (t) {
+        t = sextw(t);
+        b0 += W7 * t;
+        b1 -= W5 * t;
+        b2 += W3 * t;
+        b3 -= W1 * t;
+    }
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+
+    return 2;
+}
+
+static inline void idct_col(int16_t *col)
+{
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    col[0] += (1 << (COL_SHIFT - 1)) / W4;
+
+    a0 = W4 * col[8 * 0];
+    a1 = W4 * col[8 * 0];
+    a2 = W4 * col[8 * 0];
+    a3 = W4 * col[8 * 0];
+
+    if (col[8 * 2]) {
+        a0 += W2 * col[8 * 2];
+        a1 += W6 * col[8 * 2];
+        a2 -= W6 * col[8 * 2];
+        a3 -= W2 * col[8 * 2];
+    }
+
+    if (col[8 * 4]) {
+        a0 += W4 * col[8 * 4];
+        a1 -= W4 * col[8 * 4];
+        a2 -= W4 * col[8 * 4];
+        a3 += W4 * col[8 * 4];
+    }
+
+    if (col[8 * 6]) {
+        a0 += W6 * col[8 * 6];
+        a1 -= W2 * col[8 * 6];
+        a2 += W2 * col[8 * 6];
+        a3 -= W6 * col[8 * 6];
+    }
+
+    if (col[8 * 1]) {
+        b0 = W1 * col[8 * 1];
+        b1 = W3 * col[8 * 1];
+        b2 = W5 * col[8 * 1];
+        b3 = W7 * col[8 * 1];
+    } else {
+        b0 = 0;
+        b1 = 0;
+        b2 = 0;
+        b3 = 0;
+    }
+
+    if (col[8 * 3]) {
+        b0 += W3 * col[8 * 3];
+        b1 -= W7 * col[8 * 3];
+        b2 -= W1 * col[8 * 3];
+        b3 -= W5 * col[8 * 3];
+    }
+
+    if (col[8 * 5]) {
+        b0 += W5 * col[8 * 5];
+        b1 -= W1 * col[8 * 5];
+        b2 += W7 * col[8 * 5];
+        b3 += W3 * col[8 * 5];
+    }
+
+    if (col[8 * 7]) {
+        b0 += W7 * col[8 * 7];
+        b1 -= W5 * col[8 * 7];
+        b2 += W3 * col[8 * 7];
+        b3 -= W1 * col[8 * 7];
+    }
+
+    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
+    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
+    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
+    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
+    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
+    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
+    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
+    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
+}
+
+/* If all rows but the first one are zero after row transformation,
+   all rows will be identical after column transformation.  */
+static inline void idct_col2(int16_t *col)
+{
+    int i;
+    uint64_t l, r;
+
+    for (i = 0; i < 8; ++i) {
+        int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
+
+        a0 *= W4;
+        col[i] = a0 >> COL_SHIFT;
+    }
+
+    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
+    stq(l, col +  2 * 4); stq(r, col +  3 * 4);
+    stq(l, col +  4 * 4); stq(r, col +  5 * 4);
+    stq(l, col +  6 * 4); stq(r, col +  7 * 4);
+    stq(l, col +  8 * 4); stq(r, col +  9 * 4);
+    stq(l, col + 10 * 4); stq(r, col + 11 * 4);
+    stq(l, col + 12 * 4); stq(r, col + 13 * 4);
+    stq(l, col + 14 * 4); stq(r, col + 15 * 4);
+}
+
+void ff_simple_idct_axp(int16_t *block)
+{
+
+    int i;
+    int rowsZero = 1;           /* all rows except row 0 zero */
+    int rowsConstant = 1;       /* all rows consist of a constant value */
+
+    for (i = 0; i < 8; i++) {
+        int sparseness = idct_row(block + 8 * i);
+
+        if (i > 0 && sparseness > 0)
+            rowsZero = 0;
+        if (sparseness == 2)
+            rowsConstant = 0;
+    }
+
+    if (rowsZero) {
+        idct_col2(block);
+    } else if (rowsConstant) {
+        idct_col(block);
+        for (i = 0; i < 8; i += 2) {
+            uint64_t v = (uint16_t) block[0];
+            uint64_t w = (uint16_t) block[8];
+
+            v |= v << 16;
+            w |= w << 16;
+            v |= v << 32;
+            w |= w << 32;
+            stq(v, block + 0 * 4);
+            stq(v, block + 1 * 4);
+            stq(w, block + 2 * 4);
+            stq(w, block + 3 * 4);
+            block += 4 * 4;
+        }
+    } else {
+        for (i = 0; i < 8; i++)
+            idct_col(block + i);
+    }
+}
+
+void ff_simple_idct_put_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_simple_idct_axp(block);
+    put_pixels_clamped_axp_p(block, dest, line_size);
+}
+
+void ff_simple_idct_add_axp(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_simple_idct_axp(block);
+    add_pixels_clamped_axp_p(block, dest, line_size);
+}
diff --git a/libavcodec/alsdec.c b/libavcodec/alsdec.c
index 5f09a9d..ca8701e 100644
--- a/libavcodec/alsdec.c
+++ b/libavcodec/alsdec.c
@@ -1,43 +1,45 @@
 /*
  * MPEG-4 ALS decoder
- * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * MPEG-4 ALS decoder
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 #include <inttypes.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
+#include "unary.h"
 #include "mpeg4audio.h"
-#include "bytestream.h"
 #include "bgmc.h"
 #include "bswapdsp.h"
 #include "internal.h"
-#include "unary.h"
-
+#include "mlz.h"
 #include "libavutil/samplefmt.h"
 #include "libavutil/crc.h"
+#include "libavutil/softfloat_ieee754.h"
+#include "libavutil/intfloat.h"
+#include "libavutil/intreadwrite.h"
 
 #include <stdint.h>
 
@@ -192,7 +194,7 @@ typedef struct ALSChannelData {
 typedef struct ALSDecContext {
     AVCodecContext *avctx;
     ALSSpecificConfig sconf;
-    BitstreamContext bc;
+    GetBitContext gb;
     BswapDSPContext bdsp;
     const AVCRC *crc_table;
     uint32_t crc_org;               ///< CRC value of the original input data
@@ -200,6 +202,7 @@ typedef struct ALSDecContext {
     unsigned int cur_frame_length;  ///< length of the current frame to decode
     unsigned int frame_id;          ///< the frame ID / number of the current frame
     unsigned int js_switch;         ///< if true, joint-stereo decoding is enforced
+    unsigned int cs_switch;         ///< if true, channel rearrangement is done
     unsigned int num_blocks;        ///< number of blocks used in the current frame
     unsigned int s_max;             ///< maximum Rice parameter allowed in entropy coding
     uint8_t *bgmc_lut;              ///< pointer at lookup tables used for BGMC
@@ -225,6 +228,14 @@ typedef struct ALSDecContext {
     int32_t **raw_samples;          ///< decoded raw samples for each channel
     int32_t *raw_buffer;            ///< contains all decoded raw samples including carryover samples
     uint8_t *crc_buffer;            ///< buffer of byte order corrected samples used for CRC check
+    MLZ* mlz;                       ///< masked lz decompression structure
+    SoftFloat_IEEE754 *acf;         ///< contains common multiplier for all channels
+    int *last_acf_mantissa;         ///< contains the last acf mantissa data of common multiplier for all channels
+    int *shift_value;               ///< value by which the binary point is to be shifted for all channels
+    int *last_shift_value;          ///< contains last shift value for all channels
+    int **raw_mantissa;             ///< decoded mantissa bits of the difference signal
+    unsigned char *larray;          ///< buffer to store the output of masked lz decompression
+    int *nbits;                     ///< contains the number of bits to read for masked lz decompression for all samples
 } ALSDecContext;
 
 
@@ -247,9 +258,9 @@ typedef struct ALSBlockData {
 } ALSBlockData;
 
 
-#ifdef DEBUG
 static av_cold void dprint_specific_config(ALSDecContext *ctx)
 {
+#ifdef DEBUG
     AVCodecContext *avctx    = ctx->avctx;
     ALSSpecificConfig *sconf = &ctx->sconf;
 
@@ -271,25 +282,25 @@ static av_cold void dprint_specific_config(ALSDecContext *ctx)
     ff_dlog(avctx, "chan_sort = %i\n",            sconf->chan_sort);
     ff_dlog(avctx, "RLSLMS = %i\n",               sconf->rlslms);
     ff_dlog(avctx, "chan_config_info = %i\n",     sconf->chan_config_info);
-}
-#else
-#define dprint_specific_config(x) do {} while(0)
 #endif
+}
 
 
 /** Read an ALSSpecificConfig from a buffer into the output struct.
  */
 static av_cold int read_specific_config(ALSDecContext *ctx)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     uint64_t ht_size;
     int i, config_offset;
-    MPEG4AudioConfig m4ac;
+    MPEG4AudioConfig m4ac = {0};
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
     uint32_t als_id, header_size, trailer_size;
+    int ret;
 
-    bitstream_init8(&bc, avctx->extradata, avctx->extradata_size);
+    if ((ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size)) < 0)
+        return ret;
 
     config_offset = avpriv_mpeg4audio_get_config(&m4ac, avctx->extradata,
                                                  avctx->extradata_size * 8, 1);
@@ -297,40 +308,40 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
     if (config_offset < 0)
         return AVERROR_INVALIDDATA;
 
-    bitstream_skip(&bc, config_offset);
+    skip_bits_long(&gb, config_offset);
 
-    if (bitstream_bits_left(&bc) < (30 << 3))
+    if (get_bits_left(&gb) < (30 << 3))
         return AVERROR_INVALIDDATA;
 
     // read the fixed items
-    als_id                      = bitstream_read(&bc, 32);
+    als_id                      = get_bits_long(&gb, 32);
     avctx->sample_rate          = m4ac.sample_rate;
-    bitstream_skip(&bc, 32); // sample rate already known
-    sconf->samples              = bitstream_read(&bc, 32);
+    skip_bits_long(&gb, 32); // sample rate already known
+    sconf->samples              = get_bits_long(&gb, 32);
     avctx->channels             = m4ac.channels;
-    bitstream_skip(&bc, 16); // number of channels already known
-    bitstream_skip(&bc, 3);  // skip file_type
-    sconf->resolution           = bitstream_read(&bc, 3);
-    sconf->floating             = bitstream_read_bit(&bc);
-    sconf->msb_first            = bitstream_read_bit(&bc);
-    sconf->frame_length         = bitstream_read(&bc, 16) + 1;
-    sconf->ra_distance          = bitstream_read(&bc, 8);
-    sconf->ra_flag              = bitstream_read(&bc, 2);
-    sconf->adapt_order          = bitstream_read_bit(&bc);
-    sconf->coef_table           = bitstream_read(&bc, 2);
-    sconf->long_term_prediction = bitstream_read_bit(&bc);
-    sconf->max_order            = bitstream_read(&bc, 10);
-    sconf->block_switching      = bitstream_read(&bc, 2);
-    sconf->bgmc                 = bitstream_read_bit(&bc);
-    sconf->sb_part              = bitstream_read_bit(&bc);
-    sconf->joint_stereo         = bitstream_read_bit(&bc);
-    sconf->mc_coding            = bitstream_read_bit(&bc);
-    sconf->chan_config          = bitstream_read_bit(&bc);
-    sconf->chan_sort            = bitstream_read_bit(&bc);
-    sconf->crc_enabled          = bitstream_read_bit(&bc);
-    sconf->rlslms               = bitstream_read_bit(&bc);
-    bitstream_skip(&bc, 5);  // skip 5 reserved bits
-    bitstream_skip(&bc, 1);  // skip aux_data_enabled
+    skip_bits(&gb, 16);      // number of channels already known
+    skip_bits(&gb, 3);       // skip file_type
+    sconf->resolution           = get_bits(&gb, 3);
+    sconf->floating             = get_bits1(&gb);
+    sconf->msb_first            = get_bits1(&gb);
+    sconf->frame_length         = get_bits(&gb, 16) + 1;
+    sconf->ra_distance          = get_bits(&gb, 8);
+    sconf->ra_flag              = get_bits(&gb, 2);
+    sconf->adapt_order          = get_bits1(&gb);
+    sconf->coef_table           = get_bits(&gb, 2);
+    sconf->long_term_prediction = get_bits1(&gb);
+    sconf->max_order            = get_bits(&gb, 10);
+    sconf->block_switching      = get_bits(&gb, 2);
+    sconf->bgmc                 = get_bits1(&gb);
+    sconf->sb_part              = get_bits1(&gb);
+    sconf->joint_stereo         = get_bits1(&gb);
+    sconf->mc_coding            = get_bits1(&gb);
+    sconf->chan_config          = get_bits1(&gb);
+    sconf->chan_sort            = get_bits1(&gb);
+    sconf->crc_enabled          = get_bits1(&gb);
+    sconf->rlslms               = get_bits1(&gb);
+    skip_bits(&gb, 5);       // skip 5 reserved bits
+    skip_bits1(&gb);         // skip aux_data_enabled
 
 
     // check for ALSSpecificConfig struct
@@ -341,7 +352,7 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
 
     // read channel config
     if (sconf->chan_config)
-        sconf->chan_config_info = bitstream_read(&bc, 16);
+        sconf->chan_config_info = get_bits(&gb, 16);
     // TODO: use this to set avctx->channel_layout
 
 
@@ -349,29 +360,41 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
     if (sconf->chan_sort && avctx->channels > 1) {
         int chan_pos_bits = av_ceil_log2(avctx->channels);
         int bits_needed  = avctx->channels * chan_pos_bits + 7;
-        if (bitstream_bits_left(&bc) < bits_needed)
+        if (get_bits_left(&gb) < bits_needed)
             return AVERROR_INVALIDDATA;
 
-        if (!(sconf->chan_pos = av_malloc(avctx->channels * sizeof(*sconf->chan_pos))))
+        if (!(sconf->chan_pos = av_malloc_array(avctx->channels, sizeof(*sconf->chan_pos))))
             return AVERROR(ENOMEM);
 
-        for (i = 0; i < avctx->channels; i++)
-            sconf->chan_pos[i] = bitstream_read(&bc, chan_pos_bits);
+        ctx->cs_switch = 1;
 
-        bitstream_align(&bc);
-        // TODO: use this to actually do channel sorting
-    } else {
-        sconf->chan_sort = 0;
+        for (i = 0; i < avctx->channels; i++) {
+            sconf->chan_pos[i] = -1;
+        }
+
+        for (i = 0; i < avctx->channels; i++) {
+            int idx;
+
+            idx = get_bits(&gb, chan_pos_bits);
+            if (idx >= avctx->channels || sconf->chan_pos[idx] != -1) {
+                av_log(avctx, AV_LOG_WARNING, "Invalid channel reordering.\n");
+                ctx->cs_switch = 0;
+                break;
+            }
+            sconf->chan_pos[idx] = i;
+        }
+
+        align_get_bits(&gb);
     }
 
 
     // read fixed header and trailer sizes,
     // if size = 0xFFFFFFFF then there is no data field!
-    if (bitstream_bits_left(&bc) < 64)
+    if (get_bits_left(&gb) < 64)
         return AVERROR_INVALIDDATA;
 
-    header_size  = bitstream_read(&bc, 32);
-    trailer_size = bitstream_read(&bc, 32);
+    header_size  = get_bits_long(&gb, 32);
+    trailer_size = get_bits_long(&gb, 32);
     if (header_size  == 0xFFFFFFFF)
         header_size  = 0;
     if (trailer_size == 0xFFFFFFFF)
@@ -381,26 +404,26 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
 
 
     // skip the header and trailer data
-    if (bitstream_bits_left(&bc) < ht_size)
+    if (get_bits_left(&gb) < ht_size)
         return AVERROR_INVALIDDATA;
 
     if (ht_size > INT32_MAX)
         return AVERROR_PATCHWELCOME;
 
-    bitstream_skip(&bc, ht_size);
+    skip_bits_long(&gb, ht_size);
 
 
     // initialize CRC calculation
     if (sconf->crc_enabled) {
-        if (bitstream_bits_left(&bc) < 32)
+        if (get_bits_left(&gb) < 32)
             return AVERROR_INVALIDDATA;
 
-        if (avctx->err_recognition & AV_EF_CRCCHECK) {
+        if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL)) {
             ctx->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
             ctx->crc       = 0xFFFFFFFF;
-            ctx->crc_org   = ~bitstream_read(&bc, 32);
+            ctx->crc_org   = ~get_bits_long(&gb, 32);
         } else
-            bitstream_skip(&bc, 32);
+            skip_bits_long(&gb, 32);
     }
 
 
@@ -429,9 +452,7 @@ static int check_specific_config(ALSDecContext *ctx)
         }                                               \
     }
 
-    MISSING_ERR(sconf->floating,  "Floating point decoding",     AVERROR_PATCHWELCOME);
     MISSING_ERR(sconf->rlslms,    "Adaptive RLS-LMS prediction", AVERROR_PATCHWELCOME);
-    MISSING_ERR(sconf->chan_sort, "Channel sorting",             0);
 
     return error;
 }
@@ -463,15 +484,15 @@ static void parse_bs_info(const uint32_t bs_info, unsigned int n,
 
 /** Read and decode a Rice codeword.
  */
-static int32_t decode_rice(BitstreamContext *bc, unsigned int k)
+static int32_t decode_rice(GetBitContext *gb, unsigned int k)
 {
-    int max = bitstream_bits_left(bc) - k;
-    int q   = get_unary(bc, 0, max);
-    int r   = k ? bitstream_read_bit(bc) : !(q & 1);
+    int max = get_bits_left(gb) - k;
+    int q   = get_unary(gb, 0, max);
+    int r   = k ? get_bits1(gb) : !(q & 1);
 
     if (k > 1) {
         q <<= (k - 1);
-        q  += bitstream_read(bc, k - 1);
+        q  += get_bits_long(gb, k - 1);
     } else if (!k) {
         q >>= 1;
     }
@@ -505,13 +526,13 @@ static void get_block_sizes(ALSDecContext *ctx, unsigned int *div_blocks,
                             uint32_t *bs_info)
 {
     ALSSpecificConfig *sconf     = &ctx->sconf;
-    BitstreamContext *bc         = &ctx->bc;
+    GetBitContext *gb            = &ctx->gb;
     unsigned int *ptr_div_blocks = div_blocks;
     unsigned int b;
 
     if (sconf->block_switching) {
         unsigned int bs_info_len = 1 << (sconf->block_switching + 2);
-        *bs_info = bitstream_read(bc, bs_info_len);
+        *bs_info = get_bits_long(gb, bs_info_len);
         *bs_info <<= (32 - bs_info_len);
     }
 
@@ -554,26 +575,31 @@ static void get_block_sizes(ALSDecContext *ctx, unsigned int *div_blocks,
 
 /** Read the block data for a constant block
  */
-static void read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
+static int read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 {
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
-    BitstreamContext *bc     = &ctx->bc;
+    GetBitContext *gb        = &ctx->gb;
+
+    if (bd->block_length <= 0)
+        return AVERROR_INVALIDDATA;
 
     *bd->raw_samples = 0;
-    *bd->const_block = bitstream_read_bit(bc);  // 1 = constant value, 0 = zero block (silence)
-    bd->js_blocks    = bitstream_read_bit(bc);
+    *bd->const_block = get_bits1(gb);    // 1 = constant value, 0 = zero block (silence)
+    bd->js_blocks    = get_bits1(gb);
 
     // skip 5 reserved bits
-    bitstream_skip(bc, 5);
+    skip_bits(gb, 5);
 
     if (*bd->const_block) {
         unsigned int const_val_bits = sconf->floating ? 24 : avctx->bits_per_raw_sample;
-        *bd->raw_samples = bitstream_read_signed(bc, const_val_bits);
+        *bd->raw_samples = get_sbits_long(gb, const_val_bits);
     }
 
     // ensure constant block decoding by reusing this field
     *bd->const_block = 1;
+
+    return 0;
 }
 
 
@@ -597,7 +623,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 {
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
-    BitstreamContext *bc     = &ctx->bc;
+    GetBitContext *gb        = &ctx->gb;
     unsigned int k;
     unsigned int s[8];
     unsigned int sx[8];
@@ -613,7 +639,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
     *bd->const_block = 0;
 
     *bd->opt_order  = 1;
-    bd->js_blocks   = bitstream_read_bit(bc);
+    bd->js_blocks   = get_bits1(gb);
 
     opt_order       = *bd->opt_order;
 
@@ -622,9 +648,9 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
         log2_sub_blocks = 0;
     } else {
         if (sconf->bgmc && sconf->sb_part)
-            log2_sub_blocks = bitstream_read(bc, 2);
+            log2_sub_blocks = get_bits(gb, 2);
         else
-            log2_sub_blocks = 2 * bitstream_read_bit(bc);
+            log2_sub_blocks = 2 * get_bits1(gb);
     }
 
     sub_blocks = 1 << log2_sub_blocks;
@@ -640,18 +666,18 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
     sb_length = bd->block_length >> log2_sub_blocks;
 
     if (sconf->bgmc) {
-        s[0] = bitstream_read(bc, 8 + (sconf->resolution > 1));
+        s[0] = get_bits(gb, 8 + (sconf->resolution > 1));
         for (k = 1; k < sub_blocks; k++)
-            s[k] = s[k - 1] + decode_rice(bc, 2);
+            s[k] = s[k - 1] + decode_rice(gb, 2);
 
         for (k = 0; k < sub_blocks; k++) {
             sx[k]   = s[k] & 0x0F;
             s [k] >>= 4;
         }
     } else {
-        s[0] = bitstream_read(bc, 4 + (sconf->resolution > 1));
+        s[0] = get_bits(gb, 4 + (sconf->resolution > 1));
         for (k = 1; k < sub_blocks; k++)
-            s[k] = s[k - 1] + decode_rice(bc, 0);
+            s[k] = s[k - 1] + decode_rice(gb, 0);
     }
     for (k = 1; k < sub_blocks; k++)
         if (s[k] > 32) {
@@ -659,8 +685,8 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
             return AVERROR_INVALIDDATA;
         }
 
-    if (bitstream_read_bit(bc))
-        *bd->shift_lsbs = bitstream_read(bc, 4) + 1;
+    if (get_bits1(gb))
+        *bd->shift_lsbs = get_bits(gb, 4) + 1;
 
     *bd->store_prev_samples = (bd->js_blocks && bd->raw_other) || *bd->shift_lsbs;
 
@@ -669,16 +695,15 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
         if (sconf->adapt_order && sconf->max_order) {
             int opt_order_length = av_ceil_log2(av_clip((bd->block_length >> 3) - 1,
                                                 2, sconf->max_order + 1));
-            *bd->opt_order       = bitstream_read(bc, opt_order_length);
+            *bd->opt_order       = get_bits(gb, opt_order_length);
             if (*bd->opt_order > sconf->max_order) {
                 *bd->opt_order = sconf->max_order;
-                av_log(avctx, AV_LOG_ERROR, "Predictor order too large!\n");
+                av_log(avctx, AV_LOG_ERROR, "Predictor order too large.\n");
                 return AVERROR_INVALIDDATA;
             }
         } else {
             *bd->opt_order = sconf->max_order;
         }
-
         opt_order = *bd->opt_order;
 
         if (opt_order) {
@@ -688,15 +713,15 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                 add_base = 0x7F;
 
                 // read coefficient 0
-                quant_cof[0] = 32 * parcor_scaled_values[bitstream_read(bc, 7)];
+                quant_cof[0] = 32 * parcor_scaled_values[get_bits(gb, 7)];
 
                 // read coefficient 1
                 if (opt_order > 1)
-                    quant_cof[1] = -32 * parcor_scaled_values[bitstream_read(bc, 7)];
+                    quant_cof[1] = -32 * parcor_scaled_values[get_bits(gb, 7)];
 
                 // read coefficients 2 to opt_order
                 for (k = 2; k < opt_order; k++)
-                    quant_cof[k] = bitstream_read(bc, 7);
+                    quant_cof[k] = get_bits(gb, 7);
             } else {
                 int k_max;
                 add_base = 1;
@@ -706,10 +731,10 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                 for (k = 0; k < k_max; k++) {
                     int rice_param = parcor_rice_table[sconf->coef_table][k][1];
                     int offset     = parcor_rice_table[sconf->coef_table][k][0];
-                    quant_cof[k] = decode_rice(bc, rice_param) + offset;
+                    quant_cof[k] = decode_rice(gb, rice_param) + offset;
                     if (quant_cof[k] < -64 || quant_cof[k] > 63) {
                         av_log(avctx, AV_LOG_ERROR,
-                               "quant_cof %"PRIu32" is out of range\n",
+                               "quant_cof %"PRId32" is out of range.\n",
                                quant_cof[k]);
                         return AVERROR_INVALIDDATA;
                     }
@@ -718,11 +743,11 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                 // read coefficients 20 to 126
                 k_max = FFMIN(opt_order, 127);
                 for (; k < k_max; k++)
-                    quant_cof[k] = decode_rice(bc, 2) + (k & 1);
+                    quant_cof[k] = decode_rice(gb, 2) + (k & 1);
 
                 // read coefficients 127 to opt_order
                 for (; k < opt_order; k++)
-                    quant_cof[k] = decode_rice(bc, 1);
+                    quant_cof[k] = decode_rice(gb, 1);
 
                 quant_cof[0] = 32 * parcor_scaled_values[quant_cof[0] + 64];
 
@@ -731,28 +756,33 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
             }
 
             for (k = 2; k < opt_order; k++)
-                quant_cof[k] = (quant_cof[k] << 14) + (add_base << 13);
+                quant_cof[k] = (quant_cof[k] * (1 << 14)) + (add_base << 13);
         }
     }
 
     // read LTP gain and lag values
     if (sconf->long_term_prediction) {
-        *bd->use_ltp = bitstream_read_bit(bc);
+        *bd->use_ltp = get_bits1(gb);
 
         if (*bd->use_ltp) {
             int r, c;
 
-            bd->ltp_gain[0]   = decode_rice(bc, 1) << 3;
-            bd->ltp_gain[1]   = decode_rice(bc, 2) << 3;
+            bd->ltp_gain[0]   = decode_rice(gb, 1) << 3;
+            bd->ltp_gain[1]   = decode_rice(gb, 2) << 3;
+
+            r                 = get_unary(gb, 0, 4);
+            c                 = get_bits(gb, 2);
+            if (r >= 4) {
+                av_log(avctx, AV_LOG_ERROR, "r overflow\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-            r                 = get_unary(bc, 0, 3);
-            c                 = bitstream_read(bc, 2);
             bd->ltp_gain[2]   = ltp_gain_values[r][c];
 
-            bd->ltp_gain[3]   = decode_rice(bc, 2) << 3;
-            bd->ltp_gain[4]   = decode_rice(bc, 1) << 3;
+            bd->ltp_gain[3]   = decode_rice(gb, 2) << 3;
+            bd->ltp_gain[4]   = decode_rice(gb, 1) << 3;
 
-            *bd->ltp_lag      = bitstream_read(bc, ctx->ltp_lag_length);
+            *bd->ltp_lag      = get_bits(gb, ctx->ltp_lag_length);
             *bd->ltp_lag     += FFMAX(4, opt_order + 1);
         }
     }
@@ -760,11 +790,11 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
     // read first value and residuals in case of a random access block
     if (bd->ra_block) {
         if (opt_order)
-            bd->raw_samples[0] = decode_rice(bc, avctx->bits_per_raw_sample - 4);
+            bd->raw_samples[0] = decode_rice(gb, avctx->bits_per_raw_sample - 4);
         if (opt_order > 1)
-            bd->raw_samples[1] = decode_rice(bc, FFMIN(s[0] + 3, ctx->s_max));
+            bd->raw_samples[1] = decode_rice(gb, FFMIN(s[0] + 3, ctx->s_max));
         if (opt_order > 2)
-            bd->raw_samples[2] = decode_rice(bc, FFMIN(s[0] + 1, ctx->s_max));
+            bd->raw_samples[2] = decode_rice(gb, FFMIN(s[0] + 1, ctx->s_max));
 
         start = FFMIN(opt_order, 3);
     }
@@ -780,7 +810,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
         unsigned int low;
         unsigned int value;
 
-        ff_bgmc_decode_init(bc, &high, &low, &value);
+        ff_bgmc_decode_init(gb, &high, &low, &value);
 
         current_res = bd->raw_samples + start;
 
@@ -790,13 +820,13 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
             k    [sb] = s[sb] > b ? s[sb] - b : 0;
             delta[sb] = 5 - s[sb] + k[sb];
 
-            ff_bgmc_decode(bc, sb_len, current_res, delta[sb], sx[sb], &high,
-                           &low, &value, ctx->bgmc_lut, ctx->bgmc_lut_status);
+            ff_bgmc_decode(gb, sb_len, current_res,
+                        delta[sb], sx[sb], &high, &low, &value, ctx->bgmc_lut, ctx->bgmc_lut_status);
 
             current_res += sb_len;
         }
 
-        ff_bgmc_decode_end(bc);
+        ff_bgmc_decode_end(gb);
 
 
         // read least significant bits and tails
@@ -814,7 +844,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                     unsigned int max_msb =   (2 + (sx[sb] > 2) + (sx[sb] > 10))
                                           << (5 - delta[sb]);
 
-                    res = decode_rice(bc, cur_s);
+                    res = decode_rice(gb, cur_s);
 
                     if (res >= 0) {
                         res += (max_msb    ) << cur_k;
@@ -831,8 +861,8 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                     res >>= 1;
 
                     if (cur_k) {
-                        res <<= cur_k;
-                        res  |= bitstream_read(bc, cur_k);
+                        res  *= 1 << cur_k;
+                        res  |= get_bits_long(gb, cur_k);
                     }
                 }
 
@@ -844,12 +874,9 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 
         for (sb = 0; sb < sub_blocks; sb++, start = 0)
             for (; start < sb_length; start++)
-                *current_res++ = decode_rice(bc, s[sb]);
+                *current_res++ = decode_rice(gb, s[sb]);
      }
 
-    if (!sconf->mc_coding || ctx->js_switch)
-        bitstream_align(bc);
-
     return 0;
 }
 
@@ -893,7 +920,7 @@ static int decode_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 
     // reconstruct all samples from residuals
     if (bd->ra_block) {
-        for (smp = 0; smp < opt_order; smp++) {
+        for (smp = 0; smp < FFMIN(opt_order, block_length); smp++) {
             y = 1 << 19;
 
             for (sb = 0; sb < smp; sb++)
@@ -967,17 +994,21 @@ static int decode_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
  */
 static int read_block(ALSDecContext *ctx, ALSBlockData *bd)
 {
-    int ret = 0;
-    BitstreamContext *bc = &ctx->bc;
+    int ret;
+    GetBitContext *gb        = &ctx->gb;
+    ALSSpecificConfig *sconf = &ctx->sconf;
 
     *bd->shift_lsbs = 0;
     // read block type flag and read the samples accordingly
-    if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
         ret = read_var_block_data(ctx, bd);
     } else {
-        read_const_block_data(ctx, bd);
+        ret = read_const_block_data(ctx, bd);
     }
 
+    if (!sconf->mc_coding || ctx->js_switch)
+        align_get_bits(gb);
+
     return ret;
 }
 
@@ -1029,8 +1060,8 @@ static void zero_remaining(unsigned int b, unsigned int b_max,
 {
     unsigned int count = 0;
 
-    for (; b < b_max; b++)
-        count += div_blocks[b];
+    while (b < b_max)
+        count += div_blocks[b++];
 
     if (count)
         memset(buf, 0, sizeof(*buf) * count);
@@ -1135,7 +1166,7 @@ static int decode_blocks(ALSDecContext *ctx, unsigned int ra_frame,
         // reconstruct joint-stereo blocks
         if (bd[0].js_blocks) {
             if (bd[1].js_blocks)
-                av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair!\n");
+                av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair.\n");
 
             for (s = 0; s < div_blocks[b]; s++)
                 bd[0].raw_samples[s] = bd[1].raw_samples[s] - bd[0].raw_samples[s];
@@ -1163,9 +1194,9 @@ fail:
     return ret;
 }
 
-static inline int als_weighting(BitstreamContext *bc, int k, int off)
+static inline int als_weighting(GetBitContext *gb, int k, int off)
 {
-    int idx = av_clip(decode_rice(bc, k) + off,
+    int idx = av_clip(decode_rice(gb, k) + off,
                       0, FF_ARRAY_ELEMS(mcc_weightings) - 1);
     return mcc_weightings[idx];
 }
@@ -1174,32 +1205,32 @@ static inline int als_weighting(BitstreamContext *bc, int k, int off)
   */
 static int read_channel_data(ALSDecContext *ctx, ALSChannelData *cd, int c)
 {
-    BitstreamContext *bc    = &ctx->bc;
+    GetBitContext *gb       = &ctx->gb;
     ALSChannelData *current = cd;
     unsigned int channels   = ctx->avctx->channels;
     int entries             = 0;
 
-    while (entries < channels && !(current->stop_flag = bitstream_read_bit(bc))) {
-        current->master_channel = bitstream_read(bc, av_ceil_log2(channels));
+    while (entries < channels && !(current->stop_flag = get_bits1(gb))) {
+        current->master_channel = get_bits_long(gb, av_ceil_log2(channels));
 
         if (current->master_channel >= channels) {
-            av_log(ctx->avctx, AV_LOG_ERROR, "Invalid master channel!\n");
+            av_log(ctx->avctx, AV_LOG_ERROR, "Invalid master channel.\n");
             return AVERROR_INVALIDDATA;
         }
 
         if (current->master_channel != c) {
-            current->time_diff_flag = bitstream_read_bit(bc);
-            current->weighting[0]   = als_weighting(bc, 1, 16);
-            current->weighting[1]   = als_weighting(bc, 2, 14);
-            current->weighting[2]   = als_weighting(bc, 1, 16);
+            current->time_diff_flag = get_bits1(gb);
+            current->weighting[0]   = als_weighting(gb, 1, 16);
+            current->weighting[1]   = als_weighting(gb, 2, 14);
+            current->weighting[2]   = als_weighting(gb, 1, 16);
 
             if (current->time_diff_flag) {
-                current->weighting[3] = als_weighting(bc, 1, 16);
-                current->weighting[4] = als_weighting(bc, 1, 16);
-                current->weighting[5] = als_weighting(bc, 1, 16);
+                current->weighting[3] = als_weighting(gb, 1, 16);
+                current->weighting[4] = als_weighting(gb, 1, 16);
+                current->weighting[5] = als_weighting(gb, 1, 16);
 
-                current->time_diff_sign  = bitstream_read_bit(bc);
-                current->time_diff_index = bitstream_read(bc, ctx->ltp_lag_length - 3) + 3;
+                current->time_diff_sign  = get_bits1(gb);
+                current->time_diff_index = get_bits(gb, ctx->ltp_lag_length - 3) + 3;
             }
         }
 
@@ -1208,11 +1239,11 @@ static int read_channel_data(ALSDecContext *ctx, ALSChannelData *cd, int c)
     }
 
     if (entries == channels) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Damaged channel data!\n");
+        av_log(ctx->avctx, AV_LOG_ERROR, "Damaged channel data.\n");
         return AVERROR_INVALIDDATA;
     }
 
-    bitstream_align(bc);
+    align_get_bits(gb);
     return 0;
 }
 
@@ -1241,7 +1272,7 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
     }
 
     if (dep == channels) {
-        av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel correlation!\n");
+        av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel correlation.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -1256,21 +1287,31 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
     bd->quant_cof   = ctx->quant_cof[c];
     bd->raw_samples = ctx->raw_samples[c] + offset;
 
-    dep = 0;
-    while (!ch[dep].stop_flag) {
+    for (dep = 0; !ch[dep].stop_flag; dep++) {
         ptrdiff_t smp;
         ptrdiff_t begin = 1;
         ptrdiff_t end   = bd->block_length - 1;
         int64_t y;
         int32_t *master = ctx->raw_samples[ch[dep].master_channel] + offset;
 
+        if (ch[dep].master_channel == c)
+            continue;
+
         if (ch[dep].time_diff_flag) {
             int t = ch[dep].time_diff_index;
 
             if (ch[dep].time_diff_sign) {
                 t      = -t;
+                if (begin < t) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "begin %"PTRDIFF_SPECIFIER" smaller than time diff index %d.\n", begin, t);
+                    return AVERROR_INVALIDDATA;
+                }
                 begin -= t;
             } else {
+                if (end < t) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "end %"PTRDIFF_SPECIFIER" smaller than time diff index %d.\n", end, t);
+                    return AVERROR_INVALIDDATA;
+                }
                 end   -= t;
             }
 
@@ -1314,10 +1355,240 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
                 bd->raw_samples[smp] += y >> 7;
             }
         }
+    }
 
-        dep++;
+    return 0;
+}
+
+
+/** multiply two softfloats and handle the rounding off
+ */
+static SoftFloat_IEEE754 multiply(SoftFloat_IEEE754 a, SoftFloat_IEEE754 b) {
+    uint64_t mantissa_temp;
+    uint64_t mask_64;
+    int cutoff_bit_count;
+    unsigned char last_2_bits;
+    unsigned int mantissa;
+    int32_t sign;
+    uint32_t return_val = 0;
+    int bit_count       = 48;
+
+    sign = a.sign ^ b.sign;
+
+    // Multiply mantissa bits in a 64-bit register
+    mantissa_temp = (uint64_t)a.mant * (uint64_t)b.mant;
+    mask_64       = (uint64_t)0x1 << 47;
+
+    // Count the valid bit count
+    while (!(mantissa_temp & mask_64) && mask_64) {
+        bit_count--;
+        mask_64 >>= 1;
+    }
+
+    // Round off
+    cutoff_bit_count = bit_count - 24;
+    if (cutoff_bit_count > 0) {
+        last_2_bits = (unsigned char)(((unsigned int)mantissa_temp >> (cutoff_bit_count - 1)) & 0x3 );
+        if ((last_2_bits == 0x3) || ((last_2_bits == 0x1) && ((unsigned int)mantissa_temp & ((0x1UL << (cutoff_bit_count - 1)) - 1)))) {
+            // Need to round up
+            mantissa_temp += (uint64_t)0x1 << cutoff_bit_count;
+        }
+    }
+
+    mantissa = (unsigned int)(mantissa_temp >> cutoff_bit_count);
+
+    // Need one more shift?
+    if (mantissa & 0x01000000ul) {
+        bit_count++;
+        mantissa >>= 1;
     }
 
+    if (!sign) {
+        return_val = 0x80000000U;
+    }
+
+    return_val |= (a.exp + b.exp + bit_count - 47) << 23;
+    return_val |= mantissa;
+    return av_bits2sf_ieee754(return_val);
+}
+
+
+/** Read and decode the floating point sample data
+ */
+static int read_diff_float_data(ALSDecContext *ctx, unsigned int ra_frame) {
+    AVCodecContext *avctx   = ctx->avctx;
+    GetBitContext *gb       = &ctx->gb;
+    SoftFloat_IEEE754 *acf  = ctx->acf;
+    int *shift_value        = ctx->shift_value;
+    int *last_shift_value   = ctx->last_shift_value;
+    int *last_acf_mantissa  = ctx->last_acf_mantissa;
+    int **raw_mantissa      = ctx->raw_mantissa;
+    int *nbits              = ctx->nbits;
+    unsigned char *larray   = ctx->larray;
+    int frame_length        = ctx->cur_frame_length;
+    SoftFloat_IEEE754 scale = av_int2sf_ieee754(0x1u, 23);
+    unsigned int partA_flag;
+    unsigned int highest_byte;
+    unsigned int shift_amp;
+    uint32_t tmp_32;
+    int use_acf;
+    int nchars;
+    int i;
+    int c;
+    long k;
+    long nbits_aligned;
+    unsigned long acc;
+    unsigned long j;
+    uint32_t sign;
+    uint32_t e;
+    uint32_t mantissa;
+
+    skip_bits_long(gb, 32); //num_bytes_diff_float
+    use_acf = get_bits1(gb);
+
+    if (ra_frame) {
+        memset(last_acf_mantissa, 0, avctx->channels * sizeof(*last_acf_mantissa));
+        memset(last_shift_value,  0, avctx->channels * sizeof(*last_shift_value) );
+        ff_mlz_flush_dict(ctx->mlz);
+    }
+
+    for (c = 0; c < avctx->channels; ++c) {
+        if (use_acf) {
+            //acf_flag
+            if (get_bits1(gb)) {
+                tmp_32 = get_bits(gb, 23);
+                last_acf_mantissa[c] = tmp_32;
+            } else {
+                tmp_32 = last_acf_mantissa[c];
+            }
+            acf[c] = av_bits2sf_ieee754(tmp_32);
+        } else {
+            acf[c] = FLOAT_1;
+        }
+
+        highest_byte = get_bits(gb, 2);
+        partA_flag   = get_bits1(gb);
+        shift_amp    = get_bits1(gb);
+
+        if (shift_amp) {
+            shift_value[c] = get_bits(gb, 8);
+            last_shift_value[c] = shift_value[c];
+        } else {
+            shift_value[c] = last_shift_value[c];
+        }
+
+        if (partA_flag) {
+            if (!get_bits1(gb)) { //uncompressed
+                for (i = 0; i < frame_length; ++i) {
+                    if (ctx->raw_samples[c][i] == 0) {
+                        ctx->raw_mantissa[c][i] = get_bits_long(gb, 32);
+                    }
+                }
+            } else { //compressed
+                nchars = 0;
+                for (i = 0; i < frame_length; ++i) {
+                    if (ctx->raw_samples[c][i] == 0) {
+                        nchars += 4;
+                    }
+                }
+
+                tmp_32 = ff_mlz_decompression(ctx->mlz, gb, nchars, larray);
+                if(tmp_32 != nchars) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "Error in MLZ decompression (%"PRId32", %d).\n", tmp_32, nchars);
+                    return AVERROR_INVALIDDATA;
+                }
+
+                for (i = 0; i < frame_length; ++i) {
+                    ctx->raw_mantissa[c][i] = AV_RB32(larray);
+                }
+            }
+        }
+
+        //decode part B
+        if (highest_byte) {
+            for (i = 0; i < frame_length; ++i) {
+                if (ctx->raw_samples[c][i] != 0) {
+                    //The following logic is taken from Tabel 14.45 and 14.46 from the ISO spec
+                    if (av_cmp_sf_ieee754(acf[c], FLOAT_1)) {
+                        nbits[i] = 23 - av_log2(abs(ctx->raw_samples[c][i]));
+                    } else {
+                        nbits[i] = 23;
+                    }
+                    nbits[i] = FFMIN(nbits[i], highest_byte*8);
+                }
+            }
+
+            if (!get_bits1(gb)) { //uncompressed
+                for (i = 0; i < frame_length; ++i) {
+                    if (ctx->raw_samples[c][i] != 0) {
+                        raw_mantissa[c][i] = get_bitsz(gb, nbits[i]);
+                    }
+                }
+            } else { //compressed
+                nchars = 0;
+                for (i = 0; i < frame_length; ++i) {
+                    if (ctx->raw_samples[c][i]) {
+                        nchars += (int) nbits[i] / 8;
+                        if (nbits[i] & 7) {
+                            ++nchars;
+                        }
+                    }
+                }
+
+                tmp_32 = ff_mlz_decompression(ctx->mlz, gb, nchars, larray);
+                if(tmp_32 != nchars) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "Error in MLZ decompression (%"PRId32", %d).\n", tmp_32, nchars);
+                    return AVERROR_INVALIDDATA;
+                }
+
+                j = 0;
+                for (i = 0; i < frame_length; ++i) {
+                    if (ctx->raw_samples[c][i]) {
+                        if (nbits[i] & 7) {
+                            nbits_aligned = 8 * ((unsigned int)(nbits[i] / 8) + 1);
+                        } else {
+                            nbits_aligned = nbits[i];
+                        }
+                        acc = 0;
+                        for (k = 0; k < nbits_aligned/8; ++k) {
+                            acc = (acc << 8) + larray[j++];
+                        }
+                        acc >>= (nbits_aligned - nbits[i]);
+                        raw_mantissa[c][i] = acc;
+                    }
+                }
+            }
+        }
+
+        for (i = 0; i < frame_length; ++i) {
+            SoftFloat_IEEE754 pcm_sf = av_int2sf_ieee754(ctx->raw_samples[c][i], 0);
+            pcm_sf = av_div_sf_ieee754(pcm_sf, scale);
+
+            if (ctx->raw_samples[c][i] != 0) {
+                if (!av_cmp_sf_ieee754(acf[c], FLOAT_1)) {
+                    pcm_sf = multiply(acf[c], pcm_sf);
+                }
+
+                sign = pcm_sf.sign;
+                e = pcm_sf.exp;
+                mantissa = (pcm_sf.mant | 0x800000) + raw_mantissa[c][i];
+
+                while(mantissa >= 0x1000000) {
+                    e++;
+                    mantissa >>= 1;
+                }
+
+                if (mantissa) e += (shift_value[c] - 127);
+                mantissa &= 0x007fffffUL;
+
+                tmp_32 = (sign << 31) | ((e + EXP_BIAS) << 23) | (mantissa);
+                ctx->raw_samples[c][i] = tmp_32;
+            } else {
+                ctx->raw_samples[c][i] = raw_mantissa[c][i] & 0x007fffffUL;
+            }
+        }
+        align_get_bits(gb);
+    }
     return 0;
 }
 
@@ -1328,7 +1599,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 {
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
-    BitstreamContext *bc     = &ctx->bc;
+    GetBitContext *gb = &ctx->gb;
     unsigned int div_blocks[32];                ///< block sizes.
     unsigned int c;
     unsigned int js_blocks[2];
@@ -1337,11 +1608,11 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 
     // skip the size of the ra unit if present in the frame
     if (sconf->ra_flag == RA_FLAG_FRAMES && ra_frame)
-        bitstream_skip(bc, 32);
+        skip_bits_long(gb, 32);
 
     if (sconf->mc_coding && sconf->joint_stereo) {
-        ctx->js_switch = bitstream_read_bit(bc);
-        bitstream_align(bc);
+        ctx->js_switch = get_bits1(gb);
+        align_get_bits(gb);
     }
 
     if (!sconf->mc_coding || ctx->js_switch) {
@@ -1360,7 +1631,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
                     independent_bs = 2;
 
             // if this is the last channel, it has to be decoded independently
-            if (c == avctx->channels - 1)
+            if (c == avctx->channels - 1 || (c & 1))
                 independent_bs = 1;
 
             if (independent_bs) {
@@ -1390,7 +1661,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 
         for (c = 0; c < avctx->channels; c++)
             if (ctx->chan_data[c] < ctx->chan_data_buffer) {
-                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid channel data!\n");
+                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid channel data.\n");
                 return AVERROR_INVALIDDATA;
             }
 
@@ -1446,6 +1717,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
                 bd.lpc_cof     = ctx->lpc_cof[c];
                 bd.quant_cof   = ctx->quant_cof[c];
                 bd.raw_samples = ctx->raw_samples[c] + offset;
+
                 if ((ret = decode_block(ctx, &bd)) < 0)
                     return ret;
             }
@@ -1462,7 +1734,14 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
                     sizeof(*ctx->raw_samples[c]) * sconf->max_order);
     }
 
-    // TODO: read_diff_float_data
+    if (sconf->floating) {
+        read_diff_float_data(ctx, ra_frame);
+    }
+
+    if (get_bits_left(gb) < 0) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Overread %d\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
 
     return 0;
 }
@@ -1481,7 +1760,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     int invalid_frame, ret;
     unsigned int c, sample, ra_frame, bytes_read, shift;
 
-    bitstream_init8(&ctx->bc, buffer, buffer_size);
+    if ((ret = init_get_bits8(&ctx->gb, buffer, buffer_size)) < 0)
+        return ret;
 
     // In the case that the distance between random access frames is set to zero
     // (sconf->ra_distance == 0) no frame is treated as a random access frame.
@@ -1505,19 +1785,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 
     /* get output buffer */
     frame->nb_samples = ctx->cur_frame_length;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     // transform decoded frame into output format
-    #define INTERLEAVE_OUTPUT(bps)                                 \
-    {                                                              \
-        int##bps##_t *dest = (int##bps##_t*)frame->data[0];        \
-        shift = bps - ctx->avctx->bits_per_raw_sample;             \
-        for (sample = 0; sample < ctx->cur_frame_length; sample++) \
-            for (c = 0; c < avctx->channels; c++)                  \
-                *dest++ = ctx->raw_samples[c][sample] << shift;    \
+    #define INTERLEAVE_OUTPUT(bps)                                                   \
+    {                                                                                \
+        int##bps##_t *dest = (int##bps##_t*)frame->data[0];                          \
+        shift = bps - ctx->avctx->bits_per_raw_sample;                               \
+        if (!ctx->cs_switch) {                                                       \
+            for (sample = 0; sample < ctx->cur_frame_length; sample++)               \
+                for (c = 0; c < avctx->channels; c++)                                \
+                    *dest++ = ctx->raw_samples[c][sample] << shift;                  \
+        } else {                                                                     \
+            for (sample = 0; sample < ctx->cur_frame_length; sample++)               \
+                for (c = 0; c < avctx->channels; c++)                                \
+                    *dest++ = ctx->raw_samples[sconf->chan_pos[c]][sample] << shift; \
+        }                                                                            \
     }
 
     if (ctx->avctx->bits_per_raw_sample <= 16) {
@@ -1527,7 +1811,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     }
 
     // update CRC
-    if (sconf->crc_enabled && (avctx->err_recognition & AV_EF_CRCCHECK)) {
+    if (sconf->crc_enabled && (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL))) {
         int swap = HAVE_BIGENDIAN != sconf->msb_first;
 
         if (ctx->avctx->bits_per_raw_sample == 24) {
@@ -1586,7 +1870,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     *got_frame_ptr = 1;
 
     bytes_read = invalid_frame ? buffer_size :
-                                 (bitstream_tell(&ctx->bc) + 7) >> 3;
+                                 (get_bits_count(&ctx->gb) + 7) >> 3;
 
     return bytes_read;
 }
@@ -1597,6 +1881,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     ALSDecContext *ctx = avctx->priv_data;
+    int i;
 
     av_freep(&ctx->sconf.chan_pos);
 
@@ -1622,6 +1907,22 @@ static av_cold int decode_end(AVCodecContext *avctx)
     av_freep(&ctx->chan_data_buffer);
     av_freep(&ctx->reverted_channels);
     av_freep(&ctx->crc_buffer);
+    if (ctx->mlz) {
+        av_freep(&ctx->mlz->dict);
+        av_freep(&ctx->mlz);
+    }
+    av_freep(&ctx->acf);
+    av_freep(&ctx->last_acf_mantissa);
+    av_freep(&ctx->shift_value);
+    av_freep(&ctx->last_shift_value);
+    if (ctx->raw_mantissa) {
+        for (i = 0; i < avctx->channels; i++) {
+            av_freep(&ctx->raw_mantissa[i]);
+        }
+        av_freep(&ctx->raw_mantissa);
+    }
+    av_freep(&ctx->larray);
+    av_freep(&ctx->nbits);
 
     return 0;
 }
@@ -1684,14 +1985,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
     // allocate quantized parcor coefficient buffer
     num_buffers = sconf->mc_coding ? avctx->channels : 1;
 
-    ctx->quant_cof        = av_malloc(sizeof(*ctx->quant_cof) * num_buffers);
-    ctx->lpc_cof          = av_malloc(sizeof(*ctx->lpc_cof)   * num_buffers);
-    ctx->quant_cof_buffer = av_malloc(sizeof(*ctx->quant_cof_buffer) *
-                                      num_buffers * sconf->max_order);
-    ctx->lpc_cof_buffer   = av_malloc(sizeof(*ctx->lpc_cof_buffer) *
-                                      num_buffers * sconf->max_order);
-    ctx->lpc_cof_reversed_buffer = av_malloc(sizeof(*ctx->lpc_cof_buffer) *
-                                             sconf->max_order);
+    ctx->quant_cof        = av_malloc_array(num_buffers, sizeof(*ctx->quant_cof));
+    ctx->lpc_cof          = av_malloc_array(num_buffers, sizeof(*ctx->lpc_cof));
+    ctx->quant_cof_buffer = av_malloc_array(num_buffers * sconf->max_order,
+                                            sizeof(*ctx->quant_cof_buffer));
+    ctx->lpc_cof_buffer   = av_malloc_array(num_buffers * sconf->max_order,
+                                            sizeof(*ctx->lpc_cof_buffer));
+    ctx->lpc_cof_reversed_buffer = av_malloc_array(sconf->max_order,
+                                                   sizeof(*ctx->lpc_cof_buffer));
 
     if (!ctx->quant_cof              || !ctx->lpc_cof        ||
         !ctx->quant_cof_buffer       || !ctx->lpc_cof_buffer ||
@@ -1708,15 +2009,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     // allocate and assign lag and gain data buffer for ltp mode
-    ctx->const_block     = av_malloc (sizeof(*ctx->const_block) * num_buffers);
-    ctx->shift_lsbs      = av_malloc (sizeof(*ctx->shift_lsbs)  * num_buffers);
-    ctx->opt_order       = av_malloc (sizeof(*ctx->opt_order)   * num_buffers);
-    ctx->store_prev_samples = av_malloc(sizeof(*ctx->store_prev_samples) * num_buffers);
-    ctx->use_ltp         = av_mallocz(sizeof(*ctx->use_ltp)  * num_buffers);
-    ctx->ltp_lag         = av_malloc (sizeof(*ctx->ltp_lag)  * num_buffers);
-    ctx->ltp_gain        = av_malloc (sizeof(*ctx->ltp_gain) * num_buffers);
-    ctx->ltp_gain_buffer = av_malloc (sizeof(*ctx->ltp_gain_buffer) *
-                                      num_buffers * 5);
+    ctx->const_block     = av_malloc_array(num_buffers, sizeof(*ctx->const_block));
+    ctx->shift_lsbs      = av_malloc_array(num_buffers, sizeof(*ctx->shift_lsbs));
+    ctx->opt_order       = av_malloc_array(num_buffers, sizeof(*ctx->opt_order));
+    ctx->store_prev_samples = av_malloc_array(num_buffers, sizeof(*ctx->store_prev_samples));
+    ctx->use_ltp         = av_mallocz_array(num_buffers, sizeof(*ctx->use_ltp));
+    ctx->ltp_lag         = av_malloc_array(num_buffers, sizeof(*ctx->ltp_lag));
+    ctx->ltp_gain        = av_malloc_array(num_buffers, sizeof(*ctx->ltp_gain));
+    ctx->ltp_gain_buffer = av_malloc_array(num_buffers * 5, sizeof(*ctx->ltp_gain_buffer));
 
     if (!ctx->const_block || !ctx->shift_lsbs ||
         !ctx->opt_order || !ctx->store_prev_samples ||
@@ -1732,12 +2032,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     // allocate and assign channel data buffer for mcc mode
     if (sconf->mc_coding) {
-        ctx->chan_data_buffer  = av_malloc(sizeof(*ctx->chan_data_buffer) *
-                                           num_buffers * num_buffers);
-        ctx->chan_data         = av_malloc(sizeof(*ctx->chan_data) *
-                                           num_buffers);
-        ctx->reverted_channels = av_malloc(sizeof(*ctx->reverted_channels) *
-                                           num_buffers);
+        ctx->chan_data_buffer  = av_mallocz_array(num_buffers * num_buffers,
+                                                 sizeof(*ctx->chan_data_buffer));
+        ctx->chan_data         = av_mallocz_array(num_buffers,
+                                                 sizeof(*ctx->chan_data));
+        ctx->reverted_channels = av_malloc_array(num_buffers,
+                                                 sizeof(*ctx->reverted_channels));
 
         if (!ctx->chan_data_buffer || !ctx->chan_data || !ctx->reverted_channels) {
             av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
@@ -1755,9 +2055,35 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     channel_size      = sconf->frame_length + sconf->max_order;
 
-    ctx->prev_raw_samples = av_malloc (sizeof(*ctx->prev_raw_samples) * sconf->max_order);
-    ctx->raw_buffer       = av_mallocz(sizeof(*ctx->     raw_buffer)  * avctx->channels * channel_size);
-    ctx->raw_samples      = av_malloc (sizeof(*ctx->     raw_samples) * avctx->channels);
+    ctx->prev_raw_samples = av_malloc_array(sconf->max_order, sizeof(*ctx->prev_raw_samples));
+    ctx->raw_buffer       = av_mallocz_array(avctx->channels * channel_size, sizeof(*ctx->raw_buffer));
+    ctx->raw_samples      = av_malloc_array(avctx->channels, sizeof(*ctx->raw_samples));
+
+    if (sconf->floating) {
+        ctx->acf               = av_malloc_array(avctx->channels, sizeof(*ctx->acf));
+        ctx->shift_value       = av_malloc_array(avctx->channels, sizeof(*ctx->shift_value));
+        ctx->last_shift_value  = av_malloc_array(avctx->channels, sizeof(*ctx->last_shift_value));
+        ctx->last_acf_mantissa = av_malloc_array(avctx->channels, sizeof(*ctx->last_acf_mantissa));
+        ctx->raw_mantissa      = av_mallocz_array(avctx->channels, sizeof(*ctx->raw_mantissa));
+
+        ctx->larray = av_malloc_array(ctx->cur_frame_length * 4, sizeof(*ctx->larray));
+        ctx->nbits  = av_malloc_array(ctx->cur_frame_length, sizeof(*ctx->nbits));
+        ctx->mlz    = av_mallocz(sizeof(*ctx->mlz));
+
+        if (!ctx->mlz || !ctx->acf || !ctx->shift_value || !ctx->last_shift_value
+            || !ctx->last_acf_mantissa || !ctx->raw_mantissa) {
+            av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        ff_mlz_init_dict(avctx, ctx->mlz);
+        ff_mlz_flush_dict(ctx->mlz);
+
+        for (c = 0; c < avctx->channels; ++c) {
+            ctx->raw_mantissa[c] = av_mallocz_array(ctx->cur_frame_length, sizeof(**ctx->raw_mantissa));
+        }
+    }
 
     // allocate previous raw sample buffer
     if (!ctx->prev_raw_samples || !ctx->raw_buffer|| !ctx->raw_samples) {
@@ -1773,11 +2099,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     // allocate crc buffer
     if (HAVE_BIGENDIAN != sconf->msb_first && sconf->crc_enabled &&
-        (avctx->err_recognition & AV_EF_CRCCHECK)) {
-        ctx->crc_buffer = av_malloc(sizeof(*ctx->crc_buffer) *
-                                    ctx->cur_frame_length *
-                                    avctx->channels *
-                                    av_get_bytes_per_sample(avctx->sample_fmt));
+        (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL))) {
+        ctx->crc_buffer = av_malloc_array(ctx->cur_frame_length *
+                                          avctx->channels *
+                                          av_get_bytes_per_sample(avctx->sample_fmt),
+                                          sizeof(*ctx->crc_buffer));
         if (!ctx->crc_buffer) {
             av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
             ret = AVERROR(ENOMEM);
diff --git a/libavcodec/amfenc.c b/libavcodec/amfenc.c
index 9a60050..384d8ef 100644
--- a/libavcodec/amfenc.c
+++ b/libavcodec/amfenc.c
@@ -1,52 +1,51 @@
 /*
- * AMD AMF support
- * Copyright (C) 2017 Luca Barbato
- * Copyright (C) 2017 Mikhail Mironov <mikhail.mironov@amd.com>
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
+
 #include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/hwcontext.h"
-#include "internal.h"
 #if CONFIG_D3D11VA
 #include "libavutil/hwcontext_d3d11va.h"
 #endif
+#if CONFIG_DXVA2
+#define COBJMACROS
+#include "libavutil/hwcontext_dxva2.h"
+#endif
 #include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/time.h"
 
 #include "amfenc.h"
+#include "internal.h"
 
 #if CONFIG_D3D11VA
 #include <d3d11.h>
 #endif
 
-#if HAVE_WINDOWS_H
-#include <windows.h>
-#define dlopen(filename, flags) LoadLibrary((filename))
-#define dlsym(handle, symbol)   GetProcAddress(handle, symbol)
-#define dlclose(handle)         FreeLibrary(handle)
+#ifdef _WIN32
+#include "compat/w32dlfcn.h"
 #else
 #include <dlfcn.h>
 #endif
 
-#define LIBAV_AMF_WRITER_ID L"libav_log"
+#define FFMPEG_AMF_WRITER_ID L"ffmpeg_amf"
 
 #define PTS_PROP L"PtsProp"
 
@@ -56,6 +55,9 @@ const enum AVPixelFormat ff_amf_pix_fmts[] = {
 #if CONFIG_D3D11VA
     AV_PIX_FMT_D3D11,
 #endif
+#if CONFIG_DXVA2
+    AV_PIX_FMT_DXVA2_VLD,
+#endif
     AV_PIX_FMT_NONE
 };
 
@@ -68,22 +70,13 @@ static const FormatMap format_map[] =
 {
     { AV_PIX_FMT_NONE,       AMF_SURFACE_UNKNOWN },
     { AV_PIX_FMT_NV12,       AMF_SURFACE_NV12 },
-//    { AV_PIX_FMT_BGR0,       AMF_SURFACE_BGRA },
-//    { AV_PIX_FMT_RGB0,       AMF_SURFACE_RGBA },
+    { AV_PIX_FMT_BGR0,       AMF_SURFACE_BGRA },
+    { AV_PIX_FMT_RGB0,       AMF_SURFACE_RGBA },
     { AV_PIX_FMT_GRAY8,      AMF_SURFACE_GRAY8 },
     { AV_PIX_FMT_YUV420P,    AMF_SURFACE_YUV420P },
     { AV_PIX_FMT_YUYV422,    AMF_SURFACE_YUY2 },
-    { AV_PIX_FMT_D3D11,      AMF_SURFACE_NV12 },
 };
 
-
-static int is_hwaccel_pix_fmt(enum AVPixelFormat pix_fmt)
-{
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
-    return desc->flags & AV_PIX_FMT_FLAG_HWACCEL;
-}
-
-
 static enum AMF_SURFACE_FORMAT amf_av_to_amf_format(enum AVPixelFormat fmt)
 {
     int i;
@@ -114,16 +107,11 @@ static AMFTraceWriterVtbl tracer_vtbl =
 
 static int amf_load_library(AVCodecContext *avctx)
 {
-    AmfContext             *ctx = avctx->priv_data;
-    AMFInit_Fn              init_fun = NULL;
-    AMFQueryVersion_Fn      version_fun = NULL;
-    AMF_RESULT              res = AMF_OK;
+    AmfContext        *ctx = avctx->priv_data;
+    AMFInit_Fn         init_fun;
+    AMFQueryVersion_Fn version_fun;
+    AMF_RESULT         res;
 
-    ctx->eof = 0;
-    ctx->delayed_drain = 0;
-    ctx->hw_frames_ctx = NULL;
-    ctx->hw_device_ctx = NULL;
-    ctx->delayed_surface = NULL;
     ctx->delayed_frame = av_frame_alloc();
     if (!ctx->delayed_frame) {
         return AVERROR(ENOMEM);
@@ -157,10 +145,76 @@ static int amf_load_library(AVCodecContext *avctx)
     return 0;
 }
 
+#if CONFIG_D3D11VA
+static int amf_init_from_d3d11_device(AVCodecContext *avctx, AVD3D11VADeviceContext *hwctx)
+{
+    AmfContext *ctx = avctx->priv_data;
+    AMF_RESULT res;
+
+    res = ctx->context->pVtbl->InitDX11(ctx->context, hwctx->device, AMF_DX11_1);
+    if (res != AMF_OK) {
+        if (res == AMF_NOT_SUPPORTED)
+            av_log(avctx, AV_LOG_ERROR, "AMF via D3D11 is not supported on the given device.\n");
+        else
+            av_log(avctx, AV_LOG_ERROR, "AMF failed to initialise on the given D3D11 device: %d.\n", res);
+        return AVERROR(ENODEV);
+    }
+
+    return 0;
+}
+#endif
+
+#if CONFIG_DXVA2
+static int amf_init_from_dxva2_device(AVCodecContext *avctx, AVDXVA2DeviceContext *hwctx)
+{
+    AmfContext *ctx = avctx->priv_data;
+    HANDLE device_handle;
+    IDirect3DDevice9 *device;
+    HRESULT hr;
+    AMF_RESULT res;
+    int ret;
+
+    hr = IDirect3DDeviceManager9_OpenDeviceHandle(hwctx->devmgr, &device_handle);
+    if (FAILED(hr)) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to open device handle for Direct3D9 device: %lx.\n", (unsigned long)hr);
+        return AVERROR_EXTERNAL;
+    }
+
+    hr = IDirect3DDeviceManager9_LockDevice(hwctx->devmgr, device_handle, &device, FALSE);
+    if (SUCCEEDED(hr)) {
+        IDirect3DDeviceManager9_UnlockDevice(hwctx->devmgr, device_handle, FALSE);
+        ret = 0;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Failed to lock device handle for Direct3D9 device: %lx.\n", (unsigned long)hr);
+        ret = AVERROR_EXTERNAL;
+    }
+
+    IDirect3DDeviceManager9_CloseDeviceHandle(hwctx->devmgr, device_handle);
+
+    if (ret < 0)
+        return ret;
+
+    res = ctx->context->pVtbl->InitDX9(ctx->context, device);
+
+    IDirect3DDevice9_Release(device);
+
+    if (res != AMF_OK) {
+        if (res == AMF_NOT_SUPPORTED)
+            av_log(avctx, AV_LOG_ERROR, "AMF via D3D9 is not supported on the given device.\n");
+        else
+            av_log(avctx, AV_LOG_ERROR, "AMF failed to initialise on given D3D9 device: %d.\n", res);
+        return AVERROR(ENODEV);
+    }
+
+    return 0;
+}
+#endif
+
 static int amf_init_context(AVCodecContext *avctx)
 {
-    AmfContext         *ctx = avctx->priv_data;
-    AMF_RESULT          res = AMF_OK;
+    AmfContext *ctx = avctx->priv_data;
+    AMF_RESULT  res;
+    av_unused int ret;
 
     ctx->hwsurfaces_in_queue = 0;
     ctx->hwsurfaces_in_queue_max = 16;
@@ -176,64 +230,90 @@ static int amf_init_context(AVCodecContext *avctx)
     // connect AMF logger to av_log
     ctx->tracer.vtbl = &tracer_vtbl;
     ctx->tracer.avctx = avctx;
-    ctx->trace->pVtbl->RegisterWriter(ctx->trace, LIBAV_AMF_WRITER_ID,(AMFTraceWriter *)&ctx->tracer, 1);
-    ctx->trace->pVtbl->SetWriterLevel(ctx->trace, LIBAV_AMF_WRITER_ID, AMF_TRACE_TRACE);
+    ctx->trace->pVtbl->RegisterWriter(ctx->trace, FFMPEG_AMF_WRITER_ID,(AMFTraceWriter*)&ctx->tracer, 1);
+    ctx->trace->pVtbl->SetWriterLevel(ctx->trace, FFMPEG_AMF_WRITER_ID, AMF_TRACE_TRACE);
 
     res = ctx->factory->pVtbl->CreateContext(ctx->factory, &ctx->context);
     AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR_UNKNOWN, "CreateContext() failed with error %d\n", res);
-    // try to reuse existing DX device
-#if CONFIG_D3D11VA
+
+    // If a device was passed to the encoder, try to initialise from that.
     if (avctx->hw_frames_ctx) {
-        AVHWFramesContext *device_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-        if (device_ctx->device_ctx->type == AV_HWDEVICE_TYPE_D3D11VA) {
-            if (amf_av_to_amf_format(device_ctx->sw_format) != AMF_SURFACE_UNKNOWN) {
-                if (device_ctx->device_ctx->hwctx) {
-                    AVD3D11VADeviceContext *device_d3d11 = (AVD3D11VADeviceContext *)device_ctx->device_ctx->hwctx;
-                    res = ctx->context->pVtbl->InitDX11(ctx->context, device_d3d11->device, AMF_DX11_1);
-                    if (res == AMF_OK) {
-                        ctx->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
-                        if (!ctx->hw_frames_ctx) {
-                            return AVERROR(ENOMEM);
-                        }
-                        if (device_ctx->initial_pool_size > 0)
-                            ctx->hwsurfaces_in_queue_max = device_ctx->initial_pool_size - 1;
-                    } else {
-                        if(res == AMF_NOT_SUPPORTED)
-                            av_log(avctx, AV_LOG_INFO, "avctx->hw_frames_ctx has D3D11 device which doesn't have D3D11VA interface, switching to default\n");
-                        else
-                            av_log(avctx, AV_LOG_INFO, "avctx->hw_frames_ctx has non-AMD device, switching to default\n");
-                    }
-                }
-            } else {
-                av_log(avctx, AV_LOG_INFO, "avctx->hw_frames_ctx has format not uspported by AMF, switching to default\n");
-            }
+        AVHWFramesContext *frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+
+        if (amf_av_to_amf_format(frames_ctx->sw_format) == AMF_SURFACE_UNKNOWN) {
+            av_log(avctx, AV_LOG_ERROR, "Format of input frames context (%s) is not supported by AMF.\n",
+                   av_get_pix_fmt_name(frames_ctx->sw_format));
+            return AVERROR(EINVAL);
         }
-    } else if (avctx->hw_device_ctx) {
-        AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)(avctx->hw_device_ctx->data);
-        if (device_ctx->type == AV_HWDEVICE_TYPE_D3D11VA) {
-            if (device_ctx->hwctx) {
-                AVD3D11VADeviceContext *device_d3d11 = (AVD3D11VADeviceContext *)device_ctx->hwctx;
-                res = ctx->context->pVtbl->InitDX11(ctx->context, device_d3d11->device, AMF_DX11_1);
-                if (res == AMF_OK) {
-                    ctx->hw_device_ctx = av_buffer_ref(avctx->hw_device_ctx);
-                    if (!ctx->hw_device_ctx) {
-                        return AVERROR(ENOMEM);
-                    }
-                } else {
-                    if (res == AMF_NOT_SUPPORTED)
-                        av_log(avctx, AV_LOG_INFO, "avctx->hw_device_ctx has D3D11 device which doesn't have D3D11VA interface, switching to default\n");
-                    else
-                        av_log(avctx, AV_LOG_INFO, "avctx->hw_device_ctx has non-AMD device, switching to default\n");
-                }
-            }
+
+        switch (frames_ctx->device_ctx->type) {
+#if CONFIG_D3D11VA
+        case AV_HWDEVICE_TYPE_D3D11VA:
+            ret = amf_init_from_d3d11_device(avctx, frames_ctx->device_ctx->hwctx);
+            if (ret < 0)
+                return ret;
+            break;
+#endif
+#if CONFIG_DXVA2
+        case AV_HWDEVICE_TYPE_DXVA2:
+            ret = amf_init_from_dxva2_device(avctx, frames_ctx->device_ctx->hwctx);
+            if (ret < 0)
+                return ret;
+            break;
+#endif
+        default:
+            av_log(avctx, AV_LOG_ERROR, "AMF initialisation from a %s frames context is not supported.\n",
+                   av_hwdevice_get_type_name(frames_ctx->device_ctx->type));
+            return AVERROR(ENOSYS);
         }
-    }
+
+        ctx->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
+        if (!ctx->hw_frames_ctx)
+            return AVERROR(ENOMEM);
+
+        if (frames_ctx->initial_pool_size > 0)
+            ctx->hwsurfaces_in_queue_max = frames_ctx->initial_pool_size - 1;
+
+    } else if (avctx->hw_device_ctx) {
+        AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)avctx->hw_device_ctx->data;
+
+        switch (device_ctx->type) {
+#if CONFIG_D3D11VA
+        case AV_HWDEVICE_TYPE_D3D11VA:
+            ret = amf_init_from_d3d11_device(avctx, device_ctx->hwctx);
+            if (ret < 0)
+                return ret;
+            break;
 #endif
-    if (!ctx->hw_frames_ctx && !ctx->hw_device_ctx) {
+#if CONFIG_DXVA2
+        case AV_HWDEVICE_TYPE_DXVA2:
+            ret = amf_init_from_dxva2_device(avctx, device_ctx->hwctx);
+            if (ret < 0)
+                return ret;
+            break;
+#endif
+        default:
+            av_log(avctx, AV_LOG_ERROR, "AMF initialisation from a %s device is not supported.\n",
+                   av_hwdevice_get_type_name(device_ctx->type));
+            return AVERROR(ENOSYS);
+        }
+
+        ctx->hw_device_ctx = av_buffer_ref(avctx->hw_device_ctx);
+        if (!ctx->hw_device_ctx)
+            return AVERROR(ENOMEM);
+
+    } else {
         res = ctx->context->pVtbl->InitDX11(ctx->context, NULL, AMF_DX11_1);
-        if (res != AMF_OK) {
+        if (res == AMF_OK) {
+            av_log(avctx, AV_LOG_VERBOSE, "AMF initialisation succeeded via D3D11.\n");
+        } else {
             res = ctx->context->pVtbl->InitDX9(ctx->context, NULL);
-            AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR_UNKNOWN, "InitDX9() failed with error %d\n", res);
+            if (res == AMF_OK) {
+                av_log(avctx, AV_LOG_VERBOSE, "AMF initialisation succeeded via D3D9.\n");
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "AMF initialisation failed via D3D9: error %d.\n", res);
+                return AVERROR(ENOSYS);
+            }
         }
     }
     return 0;
@@ -241,9 +321,10 @@ static int amf_init_context(AVCodecContext *avctx)
 
 static int amf_init_encoder(AVCodecContext *avctx)
 {
-    AmfContext          *ctx = avctx->priv_data;
-    const wchar_t       *codec_id = NULL;
-    AMF_RESULT           res = AMF_OK;
+    AmfContext        *ctx = avctx->priv_data;
+    const wchar_t     *codec_id = NULL;
+    AMF_RESULT         res;
+    enum AVPixelFormat pix_fmt;
 
     switch (avctx->codec->id) {
         case AV_CODEC_ID_H264:
@@ -257,8 +338,14 @@ static int amf_init_encoder(AVCodecContext *avctx)
     }
     AMF_RETURN_IF_FALSE(ctx, codec_id != NULL, AVERROR(EINVAL), "Codec %d is not supported\n", avctx->codec->id);
 
-    ctx->format = amf_av_to_amf_format(avctx->pix_fmt);
-    AMF_RETURN_IF_FALSE(ctx, ctx->format != AMF_SURFACE_UNKNOWN, AVERROR(EINVAL), "Format %d is not supported\n", avctx->pix_fmt);
+    if (ctx->hw_frames_ctx)
+        pix_fmt = ((AVHWFramesContext*)ctx->hw_frames_ctx->data)->sw_format;
+    else
+        pix_fmt = avctx->pix_fmt;
+
+    ctx->format = amf_av_to_amf_format(pix_fmt);
+    AMF_RETURN_IF_FALSE(ctx, ctx->format != AMF_SURFACE_UNKNOWN, AVERROR(EINVAL),
+                        "Format %s is not supported\n", av_get_pix_fmt_name(pix_fmt));
 
     res = ctx->factory->pVtbl->CreateComponent(ctx->factory, ctx->context, codec_id, &ctx->encoder);
     AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR_ENCODER_NOT_FOUND, "CreateComponent(%ls) failed with error %d\n", codec_id, res);
@@ -268,9 +355,9 @@ static int amf_init_encoder(AVCodecContext *avctx)
 
 int av_cold ff_amf_encode_close(AVCodecContext *avctx)
 {
-    AmfContext      *ctx = avctx->priv_data;
-    if (ctx->delayed_surface)
-    {
+    AmfContext *ctx = avctx->priv_data;
+
+    if (ctx->delayed_surface) {
         ctx->delayed_surface->pVtbl->Release(ctx->delayed_surface);
         ctx->delayed_surface = NULL;
     }
@@ -290,7 +377,7 @@ int av_cold ff_amf_encode_close(AVCodecContext *avctx)
     av_buffer_unref(&ctx->hw_frames_ctx);
 
     if (ctx->trace) {
-        ctx->trace->pVtbl->UnregisterWriter(ctx->trace, LIBAV_AMF_WRITER_ID);
+        ctx->trace->pVtbl->UnregisterWriter(ctx->trace, FFMPEG_AMF_WRITER_ID);
     }
     if (ctx->library) {
         dlclose(ctx->library);
@@ -302,9 +389,7 @@ int av_cold ff_amf_encode_close(AVCodecContext *avctx)
     ctx->version = 0;
     ctx->delayed_drain = 0;
     av_frame_free(&ctx->delayed_frame);
-    av_fifo_free(ctx->timestamp_list);
-    ctx->timestamp_list = NULL;
-    ctx->timestamp_last = 0;
+    av_fifo_freep(&ctx->timestamp_list);
 
     return 0;
 }
@@ -312,32 +397,14 @@ int av_cold ff_amf_encode_close(AVCodecContext *avctx)
 static int amf_copy_surface(AVCodecContext *avctx, const AVFrame *frame,
     AMFSurface* surface)
 {
-    AVFrame        *sw_frame = NULL;
-    AMFPlane       *plane = NULL;
-    uint8_t        *dst_data[4];
-    int             dst_linesize[4];
-    int             ret = 0;
-    int             planes;
-    int             i;
-
-    if (frame->hw_frames_ctx && is_hwaccel_pix_fmt(frame->format)) {
-        if (!(sw_frame = av_frame_alloc())) {
-            av_log(avctx, AV_LOG_ERROR, "Can not alloc frame\n");
-            ret = AVERROR(ENOMEM);
-            goto fail;
-        }
-        if ((ret = av_hwframe_transfer_data(sw_frame, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error transferring the data to system memory\n");
-            goto fail;
-        }
-        frame = sw_frame;
-    }
-    planes = (int)surface->pVtbl->GetPlanesCount(surface);
-    if (planes > amf_countof(dst_data)) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid number of planes %d in surface\n", planes);
-        ret = AVERROR(EINVAL);
-        goto fail;
-    }
+    AMFPlane *plane;
+    uint8_t  *dst_data[4];
+    int       dst_linesize[4];
+    int       planes;
+    int       i;
+
+    planes = surface->pVtbl->GetPlanesCount(surface);
+    av_assert0(planes < FF_ARRAY_ELEMS(dst_data));
 
     for (i = 0; i < planes; i++) {
         plane = surface->pVtbl->GetPlaneAt(surface, i);
@@ -348,37 +415,30 @@ static int amf_copy_surface(AVCodecContext *avctx, const AVFrame *frame,
         (const uint8_t**)frame->data, frame->linesize, frame->format,
         avctx->width, avctx->height);
 
-fail:
-    if (sw_frame) {
-        av_frame_free(&sw_frame);
-    }
-    return ret;
+    return 0;
 }
 
 static inline int timestamp_queue_enqueue(AVCodecContext *avctx, int64_t timestamp)
 {
     AmfContext         *ctx = avctx->priv_data;
     if (av_fifo_space(ctx->timestamp_list) < sizeof(timestamp)) {
-        int size = av_fifo_size(ctx->timestamp_list);
-        if (INT_MAX / 2 - size < sizeof(timestamp))
-            return AVERROR(EINVAL);
-        av_fifo_realloc2(ctx->timestamp_list, (size + sizeof(timestamp)) * 2);
+        if (av_fifo_grow(ctx->timestamp_list, sizeof(timestamp)) < 0) {
+            return AVERROR(ENOMEM);
+        }
     }
     av_fifo_generic_write(ctx->timestamp_list, &timestamp, sizeof(timestamp), NULL);
-    ctx->timestamp_last = timestamp;
     return 0;
 }
 
 static int amf_copy_buffer(AVCodecContext *avctx, AVPacket *pkt, AMFBuffer *buffer)
 {
-    AmfContext             *ctx = avctx->priv_data;
-    int                     ret;
-    AMFVariantStruct        var = {0};
-    int64_t                 timestamp = AV_NOPTS_VALUE;
-    int64_t                 size = buffer->pVtbl->GetSize(buffer);
-
-    //if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0) {
-    if  (ret = ff_alloc_packet(pkt, size)) {
+    AmfContext      *ctx = avctx->priv_data;
+    int              ret;
+    AMFVariantStruct var = {0};
+    int64_t          timestamp = AV_NOPTS_VALUE;
+    int64_t          size = buffer->pVtbl->GetSize(buffer);
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0) {
         return ret;
     }
     memcpy(pkt->data, buffer->pVtbl->GetNative(buffer), size);
@@ -411,13 +471,19 @@ static int amf_copy_buffer(AVCodecContext *avctx, AVPacket *pkt, AMFBuffer *buff
 
     // calc dts shift if max_b_frames > 0
     if (avctx->max_b_frames > 0 && ctx->dts_delay == 0) {
+        int64_t timestamp_last = AV_NOPTS_VALUE;
         AMF_RETURN_IF_FALSE(ctx, av_fifo_size(ctx->timestamp_list) > 0, AVERROR_UNKNOWN,
             "timestamp_list is empty while max_b_frames = %d\n", avctx->max_b_frames);
-
-        if (timestamp < 0 || ctx->timestamp_last < AV_NOPTS_VALUE) {
+        av_fifo_generic_peek_at(
+            ctx->timestamp_list,
+            &timestamp_last,
+            (av_fifo_size(ctx->timestamp_list) / sizeof(timestamp) - 1) * sizeof(timestamp_last),
+            sizeof(timestamp_last),
+            NULL);
+        if (timestamp < 0 || timestamp_last < AV_NOPTS_VALUE) {
             return AVERROR(ERANGE);
         }
-        ctx->dts_delay = ctx->timestamp_last - timestamp;
+        ctx->dts_delay = timestamp_last - timestamp;
     }
     pkt->dts = timestamp - ctx->dts_delay;
     return 0;
@@ -426,20 +492,7 @@ static int amf_copy_buffer(AVCodecContext *avctx, AVPacket *pkt, AMFBuffer *buff
 // amfenc API implementation
 int ff_amf_encode_init(AVCodecContext *avctx)
 {
-    AmfContext     *ctx = avctx->priv_data;
-    int             ret;
-
-    ctx->factory = NULL;
-    ctx->debug = NULL;
-    ctx->trace = NULL;
-    ctx->context = NULL;
-    ctx->encoder = NULL;
-    ctx->library = NULL;
-    ctx->version = 0;
-    ctx->eof = 0;
-    ctx->format = 0;
-    ctx->tracer.vtbl = NULL;
-    ctx->tracer.avctx = NULL;
+    int ret;
 
     if ((ret = amf_load_library(avctx)) == 0) {
         if ((ret = amf_init_context(avctx)) == 0) {
@@ -516,18 +569,18 @@ static AMFBuffer *amf_create_buffer_with_frame_ref(const AVFrame *frame, AMFCont
 
 static void amf_release_buffer_with_frame_ref(AMFBuffer *frame_ref_storage_buffer)
 {
-    AVFrame *av_frame_ref;
-    memcpy(&av_frame_ref, frame_ref_storage_buffer->pVtbl->GetNative(frame_ref_storage_buffer), sizeof(av_frame_ref));
-    av_frame_free(&av_frame_ref);
+    AVFrame *frame_ref;
+    memcpy(&frame_ref, frame_ref_storage_buffer->pVtbl->GetNative(frame_ref_storage_buffer), sizeof(frame_ref));
+    av_frame_free(&frame_ref);
     frame_ref_storage_buffer->pVtbl->Release(frame_ref_storage_buffer);
 }
 
 int ff_amf_send_frame(AVCodecContext *avctx, const AVFrame *frame)
 {
-    AMF_RESULT      res = AMF_OK;
-    AmfContext     *ctx = avctx->priv_data;
-    AMFSurface     *surface = NULL;
-    int             ret;
+    AmfContext *ctx = avctx->priv_data;
+    AMFSurface *surface;
+    AMF_RESULT  res;
+    int         ret;
 
     if (!ctx->encoder)
         return AVERROR(EINVAL);
@@ -551,31 +604,58 @@ int ff_amf_send_frame(AVCodecContext *avctx, const AVFrame *frame)
             return AVERROR_EOF;
         }
     } else { // submit frame
+        int hw_surface = 0;
+
         if (ctx->delayed_surface != NULL) {
             return AVERROR(EAGAIN); // should not happen when called from ffmpeg, other clients may resubmit
         }
         // prepare surface from frame
-        if (frame->hw_frames_ctx && ( // HW frame detected
-            // check if the same hw_frames_ctx as used in initialization
-            (ctx->hw_frames_ctx && frame->hw_frames_ctx->data == ctx->hw_frames_ctx->data) ||
-            // check if the same hw_device_ctx as used in initialization
-            (ctx->hw_device_ctx && ((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx ==
-            (AVHWDeviceContext*)ctx->hw_device_ctx->data)
-        )) {
-            AMFBuffer *frame_ref_storage_buffer;
-
+        switch (frame->format) {
 #if CONFIG_D3D11VA
-            static const GUID AMFTextureArrayIndexGUID = { 0x28115527, 0xe7c3, 0x4b66, { 0x99, 0xd3, 0x4f, 0x2a, 0xe6, 0xb4, 0x7f, 0xaf } };
-            ID3D11Texture2D *texture = (ID3D11Texture2D*)frame->data[0]; // actual texture
-            int index = (int)(size_t)frame->data[1]; // index is a slice in texture array is - set to tell AMF which slice to use
-            texture->lpVtbl->SetPrivateData(texture, &AMFTextureArrayIndexGUID, sizeof(index), &index);
+        case AV_PIX_FMT_D3D11:
+            {
+                static const GUID AMFTextureArrayIndexGUID = { 0x28115527, 0xe7c3, 0x4b66, { 0x99, 0xd3, 0x4f, 0x2a, 0xe6, 0xb4, 0x7f, 0xaf } };
+                ID3D11Texture2D *texture = (ID3D11Texture2D*)frame->data[0]; // actual texture
+                int index = (intptr_t)frame->data[1]; // index is a slice in texture array is - set to tell AMF which slice to use
+
+                av_assert0(frame->hw_frames_ctx       && ctx->hw_frames_ctx &&
+                           frame->hw_frames_ctx->data == ctx->hw_frames_ctx->data);
+
+                texture->lpVtbl->SetPrivateData(texture, &AMFTextureArrayIndexGUID, sizeof(index), &index);
+
+                res = ctx->context->pVtbl->CreateSurfaceFromDX11Native(ctx->context, texture, &surface, NULL); // wrap to AMF surface
+                AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR(ENOMEM), "CreateSurfaceFromDX11Native() failed  with error %d\n", res);
 
-            res = ctx->context->pVtbl->CreateSurfaceFromDX11Native(ctx->context, texture, &surface, NULL); // wrap to AMF surface
-            AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR(ENOMEM), "CreateSurfaceFromDX11Native() failed  with error %d\n", res);
+                hw_surface = 1;
+            }
+            break;
+#endif
+#if CONFIG_DXVA2
+        case AV_PIX_FMT_DXVA2_VLD:
+            {
+                IDirect3DSurface9 *texture = (IDirect3DSurface9 *)frame->data[3]; // actual texture
+
+                res = ctx->context->pVtbl->CreateSurfaceFromDX9Native(ctx->context, texture, &surface, NULL); // wrap to AMF surface
+                AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR(ENOMEM), "CreateSurfaceFromDX9Native() failed  with error %d\n", res);
+
+                hw_surface = 1;
+            }
+            break;
+#endif
+        default:
+            {
+                res = ctx->context->pVtbl->AllocSurface(ctx->context, AMF_MEMORY_HOST, ctx->format, avctx->width, avctx->height, &surface);
+                AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR(ENOMEM), "AllocSurface() failed  with error %d\n", res);
+                amf_copy_surface(avctx, frame, surface);
+            }
+            break;
+        }
+
+        if (hw_surface) {
+            AMFBuffer *frame_ref_storage_buffer;
 
             // input HW surfaces can be vertically aligned by 16; tell AMF the real size
             surface->pVtbl->SetCrop(surface, 0, 0, frame->width, frame->height);
-#endif
 
             frame_ref_storage_buffer = amf_create_buffer_with_frame_ref(frame, ctx->context);
             AMF_RETURN_IF_FALSE(ctx, frame_ref_storage_buffer != NULL, AVERROR(ENOMEM), "create_buffer_with_frame_ref() returned NULL\n");
@@ -584,11 +664,8 @@ int ff_amf_send_frame(AVCodecContext *avctx, const AVFrame *frame)
             AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR_UNKNOWN, "SetProperty failed for \"av_frame_ref\" with error %d\n", res);
             ctx->hwsurfaces_in_queue++;
             frame_ref_storage_buffer->pVtbl->Release(frame_ref_storage_buffer);
-        } else {
-            res = ctx->context->pVtbl->AllocSurface(ctx->context, AMF_MEMORY_HOST, ctx->format, avctx->width, avctx->height, &surface);
-            AMF_RETURN_IF_FALSE(ctx, res == AMF_OK, AVERROR(ENOMEM), "AllocSurface() failed  with error %d\n", res);
-            amf_copy_surface(avctx, frame, surface);
         }
+
         surface->pVtbl->SetPts(surface, frame->pts);
         AMF_ASSIGN_PROPERTY_INT64(res, surface, PTS_PROP, frame->pts);
 
diff --git a/libavcodec/amfenc.h b/libavcodec/amfenc.h
index 6d13eb0..b136184 100644
--- a/libavcodec/amfenc.h
+++ b/libavcodec/amfenc.h
@@ -1,25 +1,20 @@
 /*
- * AMD AMF support
- * Copyright (C) 2017 Luca Barbato
- * Copyright (C) 2017 Mikhail Mironov <mikhail.mironov@amd.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
 
 #ifndef AVCODEC_AMFENC_H
 #define AVCODEC_AMFENC_H
@@ -31,7 +26,6 @@
 
 #include "libavutil/fifo.h"
 
-#include "config.h"
 #include "avcodec.h"
 
 
@@ -78,10 +72,10 @@ typedef struct AmfContext {
 
     // shift dts back by max_b_frames in timing
     AVFifoBuffer       *timestamp_list;
-    int64_t             timestamp_last;
     int64_t             dts_delay;
 
-    // common encoder options
+    // common encoder option options
+
     int                 log_to_dbg;
 
     // Static options, have to be set before Init() call
@@ -153,7 +147,4 @@ extern const enum AVPixelFormat ff_amf_pix_fmts[];
         return ret_value; \
     }
 
-#define AMF_COMMON_OPTIONS \
-    { "log_to_dbg",     "Enable AMF logging to debug output",   OFFSET(log_to_dbg), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE } \
-
 #endif //AVCODEC_AMFENC_H
diff --git a/libavcodec/amfenc_h264.c b/libavcodec/amfenc_h264.c
index 01b0c3a..2c082e9 100644
--- a/libavcodec/amfenc_h264.c
+++ b/libavcodec/amfenc_h264.c
@@ -1,22 +1,18 @@
 /*
- * AMD AMF support
- * Copyright (C) 2017 Luca Barbato
- * Copyright (C) 2017 Mikhail Mironov <mikhail.mironov@amd.com>
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,10 +66,10 @@ static const AVOption options[] = {
 
 
     /// Quality Preset
-    { "quality_preset",        "Quality Preference",                   OFFSET(quality),    AV_OPT_TYPE_INT,   { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_SPEED    }, AMF_VIDEO_ENCODER_QUALITY_PRESET_BALANCED, AMF_VIDEO_ENCODER_QUALITY_PRESET_QUALITY, VE, "quality_preset" },
-    { "speed",          "Prefer Speed",                         0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_SPEED    },       0, 0, VE, "quality_preset" },
-    { "balanced",       "Balanced",                             0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_BALANCED },    0, 0, VE, "quality_preset" },
-    { "quality",        "Prefer Quality",                       0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_QUALITY  },     0, 0, VE, "quality_preset" },
+    { "quality",        "Quality Preference",                   OFFSET(quality),    AV_OPT_TYPE_INT,   { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_SPEED    }, AMF_VIDEO_ENCODER_QUALITY_PRESET_BALANCED, AMF_VIDEO_ENCODER_QUALITY_PRESET_QUALITY, VE, "quality" },
+    { "speed",          "Prefer Speed",                         0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_SPEED    },       0, 0, VE, "quality" },
+    { "balanced",       "Balanced",                             0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_BALANCED },    0, 0, VE, "quality" },
+    { "quality",        "Prefer Quality",                       0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_QUALITY_PRESET_QUALITY  },     0, 0, VE, "quality" },
 
     // Dynamic
     /// Rate Control Method
@@ -84,10 +80,10 @@ static const AVOption options[] = {
     { "vbr_latency",    "Latency Constrained Variable Bitrate", 0,                         AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_RATE_CONTROL_METHOD_LATENCY_CONSTRAINED_VBR }, 0, 0, VE, "rc" },
 
     /// Enforce HRD, Filler Data, VBAQ, Frame Skipping
-    { "enforce_hrd",    "Enforce HRD",                          OFFSET(enforce_hrd),        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "filler_data",    "Filler Data Enable",                   OFFSET(filler_data),        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "vbaq",           "Enable VBAQ",                          OFFSET(enable_vbaq),        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "frame_skipping", "Rate Control Based Frame Skip",        OFFSET(skip_frame),         AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "enforce_hrd",    "Enforce HRD",                          OFFSET(enforce_hrd),        AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "filler_data",    "Filler Data Enable",                   OFFSET(filler_data),        AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "vbaq",           "Enable VBAQ",                          OFFSET(enable_vbaq),        AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "frame_skipping", "Rate Control Based Frame Skip",        OFFSET(skip_frame),         AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
 
     /// QP Values
     { "qp_i",           "Quantization Parameter for I-Frame",   OFFSET(qp_i),               AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 51, VE },
@@ -95,7 +91,7 @@ static const AVOption options[] = {
     { "qp_b",           "Quantization Parameter for B-Frame",   OFFSET(qp_b),               AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 51, VE },
 
     /// Pre-Pass, Pre-Analysis, Two-Pass
-    { "preanalysis",    "Pre-Analysis Mode",                    OFFSET(preanalysis),        AV_OPT_TYPE_INT,{ .i64 = 0 }, 0, 1, VE, NULL },
+    { "preanalysis",    "Pre-Analysis Mode",                    OFFSET(preanalysis),        AV_OPT_TYPE_BOOL,{ .i64 = 0 }, 0, 1, VE, NULL },
 
     /// Maximum Access Unit Size
     { "max_au_size",    "Maximum Access Unit Size for rate control (in bits)",   OFFSET(max_au_size),        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
@@ -106,7 +102,7 @@ static const AVOption options[] = {
     /// B-Frames
     // BPicturesPattern=bf
     { "bf_delta_qp",    "B-Picture Delta QP",                   OFFSET(b_frame_delta_qp),   AV_OPT_TYPE_INT,  { .i64 = 4 }, -10, 10, VE },
-    { "bf_ref",         "Enable Reference to B-Frames",         OFFSET(b_frame_ref),        AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+    { "bf_ref",         "Enable Reference to B-Frames",         OFFSET(b_frame_ref),        AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE },
     { "bf_ref_delta_qp","Reference B-Picture Delta QP",         OFFSET(ref_b_frame_delta_qp), AV_OPT_TYPE_INT,  { .i64 = 4 }, -10, 10, VE },
 
     /// Intra-Refresh
@@ -118,12 +114,12 @@ static const AVOption options[] = {
     { "cavlc",          "Context Adaptive Variable-Length Coding", 0,                  AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_CALV },      0, 0, VE, "coder" },
     { "cabac",          "Context Adaptive Binary Arithmetic Coding", 0,                AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_CABAC },     0, 0, VE, "coder" },
 
-    { "me_half_pel",    "Enable ME Half Pixel",                 OFFSET(me_half_pel),   AV_OPT_TYPE_INT,  { .i64 = 1 }, 0, 1, VE },
-    { "me_quarter_pel", "Enable ME Quarter Pixel",              OFFSET(me_quarter_pel),AV_OPT_TYPE_INT,  { .i64 = 1 }, 0, 1, VE },
+    { "me_half_pel",    "Enable ME Half Pixel",                 OFFSET(me_half_pel),   AV_OPT_TYPE_BOOL,  { .i64 = 1 }, 0, 1, VE },
+    { "me_quarter_pel", "Enable ME Quarter Pixel",              OFFSET(me_quarter_pel),AV_OPT_TYPE_BOOL,  { .i64 = 1 }, 0, 1, VE },
 
-    { "aud",            "Inserts AU Delimiter NAL unit",        OFFSET(aud)          ,AV_OPT_TYPE_INT,  { .i64 = 0 }, 0, 1, VE },
+    { "aud",            "Inserts AU Delimiter NAL unit",        OFFSET(aud)          ,AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
 
-    AMF_COMMON_OPTIONS,
+    { "log_to_dbg",     "Enable AMF logging to debug output",   OFFSET(log_to_dbg)    , AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
 
     { NULL }
 };
@@ -217,7 +213,6 @@ static av_cold int amf_encode_init_h264(AVCodecContext *avctx)
         }
     }
 
-
     if (ctx->rate_control_mode == AMF_VIDEO_ENCODER_RATE_CONTROL_METHOD_CONSTANT_QP) {
         AMF_ASSIGN_PROPERTY_INT64(res, ctx->encoder, AMF_VIDEO_ENCODER_RATE_CONTROL_PREANALYSIS_ENABLE, AMF_VIDEO_ENCODER_PREENCODE_DISABLED);
         if (ctx->preanalysis)
diff --git a/libavcodec/amfenc_hevc.c b/libavcodec/amfenc_hevc.c
index fc64dec..7c9a33a 100644
--- a/libavcodec/amfenc_hevc.c
+++ b/libavcodec/amfenc_hevc.c
@@ -1,26 +1,21 @@
 /*
- * AMD AMF support
- * Copyright (C) 2017 Luca Barbato
- * Copyright (C) 2017 Mikhail Mironov <mikhail.mironov@amd.com>
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "amfenc.h"
@@ -58,10 +53,10 @@ static const AVOption options[] = {
     { "6.1",            "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_LEVEL_6_1 }, 0, 0, VE, "level" },
     { "6.2",            "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_LEVEL_6_2 }, 0, 0, VE, "level" },
 
-    { "quality_preset",        "Set the encoding quality",                 OFFSET(quality),      AV_OPT_TYPE_INT,   { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_SPEED }, AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_QUALITY, AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_SPEED, VE, "quality_preset" },
-    { "balanced",       "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_BALANCED }, 0, 0, VE, "quality_preset" },
-    { "speed",          "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_SPEED    }, 0, 0, VE, "quality_preset" },
-    { "quality",        "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_QUALITY  }, 0, 0, VE, "quality_preset" },
+    { "quality",        "Set the encoding quality",                 OFFSET(quality),      AV_OPT_TYPE_INT,   { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_SPEED }, AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_QUALITY, AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_SPEED, VE, "quality" },
+    { "balanced",       "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_BALANCED }, 0, 0, VE, "quality" },
+    { "speed",          "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_SPEED    }, 0, 0, VE, "quality" },
+    { "quality",        "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_QUALITY_PRESET_QUALITY  }, 0, 0, VE, "quality" },
 
     { "rc",             "Set the rate control mode",            OFFSET(rate_control_mode), AV_OPT_TYPE_INT, { .i64 = AMF_VIDEO_ENCODER_HEVC_RATE_CONTROL_METHOD_UNKNOWN }, AMF_VIDEO_ENCODER_HEVC_RATE_CONTROL_METHOD_UNKNOWN, AMF_VIDEO_ENCODER_HEVC_RATE_CONTROL_METHOD_CBR, VE, "rc" },
     { "cqp",            "Constant Quantization Parameter",      0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_RATE_CONTROL_METHOD_CONSTANT_QP             }, 0, 0, VE, "rc" },
@@ -75,10 +70,10 @@ static const AVOption options[] = {
     { "idr",            "", 0, AV_OPT_TYPE_CONST, { .i64 = AMF_VIDEO_ENCODER_HEVC_HEADER_INSERTION_MODE_IDR_ALIGNED }, 0, 0, VE, "hdrmode" },
 
     { "gops_per_idr",    "GOPs per IDR 0-no IDR will be inserted",  OFFSET(gops_per_idr),  AV_OPT_TYPE_INT,  { .i64 = 60 },  0, INT_MAX, VE },
-    { "preanalysis",    "Enable preanalysis",                       OFFSET(preanalysis),   AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1, VE},
-    { "vbaq",           "Enable VBAQ",                              OFFSET(enable_vbaq),   AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1, VE},
-    { "enforce_hrd",    "Enforce HRD",                              OFFSET(enforce_hrd),   AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1, VE},
-    { "filler_data",    "Filler Data Enable",                       OFFSET(filler_data),   AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1, VE},
+    { "preanalysis",    "Enable preanalysis",                       OFFSET(preanalysis),   AV_OPT_TYPE_BOOL, { .i64 = 0  },  0, 1, VE},
+    { "vbaq",           "Enable VBAQ",                              OFFSET(enable_vbaq),   AV_OPT_TYPE_BOOL, { .i64 = 0  },  0, 1, VE},
+    { "enforce_hrd",    "Enforce HRD",                              OFFSET(enforce_hrd),   AV_OPT_TYPE_BOOL, { .i64 = 0  },  0, 1, VE},
+    { "filler_data",    "Filler Data Enable",                       OFFSET(filler_data),   AV_OPT_TYPE_BOOL, { .i64 = 0  },  0, 1, VE},
     { "max_au_size",    "Maximum Access Unit Size for rate control (in bits)", OFFSET(max_au_size),   AV_OPT_TYPE_INT,{ .i64 = 0 }, 0, INT_MAX, VE},
     { "min_qp_i",       "min quantization parameter for I-frame",   OFFSET(min_qp_i),      AV_OPT_TYPE_INT, { .i64 = -1  }, -1, 51, VE },
     { "max_qp_i",       "max quantization parameter for I-frame",   OFFSET(max_qp_i),      AV_OPT_TYPE_INT, { .i64 = -1  }, -1, 51, VE },
@@ -86,14 +81,13 @@ static const AVOption options[] = {
     { "max_qp_p",       "max quantization parameter for P-frame",   OFFSET(max_qp_p),      AV_OPT_TYPE_INT, { .i64 = -1  }, -1, 51, VE },
     { "qp_p",           "quantization parameter for P-frame",       OFFSET(qp_p),          AV_OPT_TYPE_INT, { .i64 = -1  }, -1, 51, VE },
     { "qp_i",           "quantization parameter for I-frame",       OFFSET(qp_i),          AV_OPT_TYPE_INT, { .i64 = -1  }, -1, 51, VE },
-    { "skip_frame",     "Rate Control Based Frame Skip",            OFFSET(skip_frame),    AV_OPT_TYPE_INT,{ .i64 = 0   },  0, 1, VE },
-    { "me_half_pel",    "Enable ME Half Pixel",                     OFFSET(me_half_pel),   AV_OPT_TYPE_INT,{ .i64 = 1   },  0, 1, VE },
-    { "me_quarter_pel", "Enable ME Quarter Pixel ",                 OFFSET(me_quarter_pel),AV_OPT_TYPE_INT,{ .i64 = 1   },  0, 1, VE },
-
-    { "aud",            "Inserts AU Delimiter NAL unit",            OFFSET(aud)           ,AV_OPT_TYPE_INT,{ .i64 = 0 }, 0, 1, VE },
+    { "skip_frame",     "Rate Control Based Frame Skip",            OFFSET(skip_frame),    AV_OPT_TYPE_BOOL,{ .i64 = 0   },  0, 1, VE },
+    { "me_half_pel",    "Enable ME Half Pixel",                     OFFSET(me_half_pel),   AV_OPT_TYPE_BOOL,{ .i64 = 1   },  0, 1, VE },
+    { "me_quarter_pel", "Enable ME Quarter Pixel ",                 OFFSET(me_quarter_pel),AV_OPT_TYPE_BOOL,{ .i64 = 1   },  0, 1, VE },
 
-    AMF_COMMON_OPTIONS,
+    { "aud",            "Inserts AU Delimiter NAL unit",            OFFSET(aud)           ,AV_OPT_TYPE_BOOL,{ .i64 = 0 }, 0, 1, VE },
 
+    { "log_to_dbg",     "Enable AMF logging to debug output",   OFFSET(log_to_dbg), AV_OPT_TYPE_BOOL,{ .i64 = 0 }, 0, 1, VE },
     { NULL }
 };
 
diff --git a/libavcodec/amr.h b/libavcodec/amr.h
index 676c963..727f8c3 100644
--- a/libavcodec/amr.h
+++ b/libavcodec/amr.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #include "avcodec.h"
 
 #ifdef AMR_USE_16BIT_TABLES
-#define R_TABLE_TYPE uint16_t
+typedef uint16_t R_TABLE_TYPE;
 #else
-#define R_TABLE_TYPE uint8_t
+typedef uint8_t R_TABLE_TYPE;
 #endif
 
 /**
diff --git a/libavcodec/amrnbdata.h b/libavcodec/amrnbdata.h
index 4eaeb0e..435fd99 100644
--- a/libavcodec/amrnbdata.h
+++ b/libavcodec/amrnbdata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Robert Swain
  * Copyright (c) 2009 Colin McQuillan
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/amrnbdec.c b/libavcodec/amrnbdec.c
index e2b5641..ea299ac 100644
--- a/libavcodec/amrnbdec.c
+++ b/libavcodec/amrnbdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Robert Swain
  * Copyright (c) 2009 Colin McQuillan
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,8 @@
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
+#include "celp_math.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
@@ -84,7 +86,7 @@
 /** Maximum sharpening factor
  *
  * The specification says 0.8, which should be 13107, but the reference C code
- * uses 13017 instead. (Amusingly the same applies to SHARP_MAX in bitexact G.729.)
+ * uses 13017 instead. (Amusingly the same applies to SHARP_MAX in g729dec.c.)
  */
 #define SHARP_MAX 0.79449462890625
 
@@ -136,6 +138,11 @@ typedef struct AMRContext {
 
     float samples_in[LP_FILTER_ORDER + AMR_SUBFRAME_SIZE]; ///< floating point samples
 
+    ACELPFContext                     acelpf_ctx; ///< context for filters for ACELP-based codecs
+    ACELPVContext                     acelpv_ctx; ///< context for vector operations for ACELP-based codecs
+    CELPFContext                       celpf_ctx; ///< context for filters for CELP-based codecs
+    CELPMContext                       celpm_ctx; ///< context for fixed point math operations
+
 } AMRContext;
 
 /** Double version of ff_weighted_vector_sumf() */
@@ -162,7 +169,8 @@ static av_cold int amrnb_decode_init(AVCodecContext *avctx)
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 8000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
     // p->excitation always points to the same position in p->excitation_buf
@@ -176,6 +184,11 @@ static av_cold int amrnb_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 4; i++)
         p->prediction_error[i] = MIN_ENERGY;
 
+    ff_acelp_filter_init(&p->acelpf_ctx);
+    ff_acelp_vectors_init(&p->acelpv_ctx);
+    ff_celp_filter_init(&p->celpf_ctx);
+    ff_celp_math_init(&p->celpm_ctx);
+
     return 0;
 }
 
@@ -219,15 +232,16 @@ static enum Mode unpack_bitstream(AMRContext *p, const uint8_t *buf,
  * Interpolate the LSF vector (used for fixed gain smoothing).
  * The interpolation is done over all four subframes even in MODE_12k2.
  *
+ * @param[in]     ctx       The Context
  * @param[in,out] lsf_q     LSFs in [0,1] for each subframe
  * @param[in]     lsf_new   New LSFs in [0,1] for subframe 4
  */
-static void interpolate_lsf(float lsf_q[4][LP_FILTER_ORDER], float *lsf_new)
+static void interpolate_lsf(ACELPVContext *ctx, float lsf_q[4][LP_FILTER_ORDER], float *lsf_new)
 {
     int i;
 
     for (i = 0; i < 4; i++)
-        ff_weighted_vector_sumf(lsf_q[i], lsf_q[3], lsf_new,
+        ctx->weighted_vector_sumf(lsf_q[i], lsf_q[3], lsf_new,
                                 0.25 * (3 - i), 0.25 * (i + 1),
                                 LP_FILTER_ORDER);
 }
@@ -271,7 +285,7 @@ static void lsf2lsp_for_mode12k2(AMRContext *p, double lsp[LP_FILTER_ORDER],
     ff_set_min_dist_lsf(lsf_q, MIN_LSF_SPACING, LP_FILTER_ORDER);
 
     if (update)
-        interpolate_lsf(p->lsf_q, lsf_q);
+        interpolate_lsf(&p->acelpv_ctx, p->lsf_q, lsf_q);
 
     ff_acelp_lsf2lspd(lsp, lsf_q, LP_FILTER_ORDER);
 }
@@ -334,7 +348,7 @@ static void lsf2lsp_3(AMRContext *p)
     ff_set_min_dist_lsf(lsf_q, MIN_LSF_SPACING, LP_FILTER_ORDER);
 
     // store data for computing the next frame's LSFs
-    interpolate_lsf(p->lsf_q, lsf_q);
+    interpolate_lsf(&p->acelpv_ctx, p->lsf_q, lsf_q);
     memcpy(p->prev_lsf_r, lsf_r, LP_FILTER_ORDER * sizeof(*lsf_r));
 
     ff_acelp_lsf2lspd(p->lsp[3], lsf_q, LP_FILTER_ORDER);
@@ -385,22 +399,23 @@ static void decode_pitch_vector(AMRContext *p,
         decode_pitch_lag_1_6(&pitch_lag_int, &pitch_lag_frac,
                              amr_subframe->p_lag, p->pitch_lag_int,
                              subframe);
-    } else
+    } else {
         ff_decode_pitch_lag(&pitch_lag_int, &pitch_lag_frac,
                             amr_subframe->p_lag,
                             p->pitch_lag_int, subframe,
                             mode != MODE_4k75 && mode != MODE_5k15,
                             mode <= MODE_6k7 ? 4 : (mode == MODE_7k95 ? 5 : 6));
+        pitch_lag_frac *= 2;
+    }
 
     p->pitch_lag_int = pitch_lag_int; // store previous lag in a uint8_t
 
-    pitch_lag_frac <<= (p->cur_frame_mode != MODE_12k2);
-
     pitch_lag_int += pitch_lag_frac > 0;
 
     /* Calculate the pitch vector by interpolating the past excitation at the
        pitch lag using a b60 hamming windowed sinc function.   */
-    ff_acelp_interpolatef(p->excitation, p->excitation + 1 - pitch_lag_int,
+    p->acelpf_ctx.acelp_interpolatef(p->excitation,
+                          p->excitation + 1 - pitch_lag_int,
                           ff_b60_sinc, 6,
                           pitch_lag_frac + 6 - 6*(pitch_lag_frac > 0),
                           10, AMR_SUBFRAME_SIZE);
@@ -484,7 +499,7 @@ static void decode_8_pulses_31bits(const int16_t *fixed_index,
 static void decode_fixed_sparse(AMRFixed *fixed_sparse, const uint16_t *pulses,
                                 const enum Mode mode, const int subframe)
 {
-    assert(MODE_4k75 <= mode && mode <= MODE_12k2);
+    av_assert1(MODE_4k75 <= (signed)mode && mode <= MODE_12k2);
 
     if (mode == MODE_12k2) {
         ff_decode_10_pulses_35bits(pulses, fixed_sparse, gray_decode, 5, 3);
@@ -785,12 +800,12 @@ static int synthesis(AMRContext *p, float *lpc,
         for (i = 0; i < AMR_SUBFRAME_SIZE; i++)
             p->pitch_vector[i] *= 0.25;
 
-    ff_weighted_vector_sumf(excitation, p->pitch_vector, fixed_vector,
+    p->acelpv_ctx.weighted_vector_sumf(excitation, p->pitch_vector, fixed_vector,
                             p->pitch_gain[4], fixed_gain, AMR_SUBFRAME_SIZE);
 
     // emphasize pitch vector contribution
     if (p->pitch_gain[4] > 0.5 && !overflow) {
-        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+        float energy = p->celpm_ctx.dot_productf(excitation, excitation,
                                                     AMR_SUBFRAME_SIZE);
         float pitch_factor =
             p->pitch_gain[4] *
@@ -805,7 +820,8 @@ static int synthesis(AMRContext *p, float *lpc,
                                                 AMR_SUBFRAME_SIZE);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, lpc, excitation, AMR_SUBFRAME_SIZE,
+    p->celpf_ctx.celp_lp_synthesis_filterf(samples, lpc, excitation,
+                                 AMR_SUBFRAME_SIZE,
                                  LP_FILTER_ORDER);
 
     // detect overflow
@@ -851,10 +867,11 @@ static void update_state(AMRContext *p)
 /**
  * Get the tilt factor of a formant filter from its transfer function
  *
+ * @param p     The Context
  * @param lpc_n LP_FILTER_ORDER coefficients of the numerator
  * @param lpc_d LP_FILTER_ORDER coefficients of the denominator
  */
-static float tilt_factor(float *lpc_n, float *lpc_d)
+static float tilt_factor(AMRContext *p, float *lpc_n, float *lpc_d)
 {
     float rh0, rh1; // autocorrelation at lag 0 and 1
 
@@ -864,11 +881,12 @@ static float tilt_factor(float *lpc_n, float *lpc_d)
 
     hf[0] = 1.0;
     memcpy(hf + 1, lpc_n, sizeof(float) * LP_FILTER_ORDER);
-    ff_celp_lp_synthesis_filterf(hf, lpc_d, hf, AMR_TILT_RESPONSE,
+    p->celpf_ctx.celp_lp_synthesis_filterf(hf, lpc_d, hf,
+                                 AMR_TILT_RESPONSE,
                                  LP_FILTER_ORDER);
 
-    rh0 = avpriv_scalarproduct_float_c(hf, hf,     AMR_TILT_RESPONSE);
-    rh1 = avpriv_scalarproduct_float_c(hf, hf + 1, AMR_TILT_RESPONSE - 1);
+    rh0 = p->celpm_ctx.dot_productf(hf, hf,     AMR_TILT_RESPONSE);
+    rh1 = p->celpm_ctx.dot_productf(hf, hf + 1, AMR_TILT_RESPONSE - 1);
 
     // The spec only specifies this check for 12.2 and 10.2 kbit/s
     // modes. But in the ref source the tilt is always non-negative.
@@ -888,7 +906,7 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     int i;
     float *samples          = p->samples_in + LP_FILTER_ORDER; // Start of input
 
-    float speech_gain       = avpriv_scalarproduct_float_c(samples, samples,
+    float speech_gain       = p->celpm_ctx.dot_productf(samples, samples,
                                                            AMR_SUBFRAME_SIZE);
 
     float pole_out[AMR_SUBFRAME_SIZE + LP_FILTER_ORDER];  // Output of pole filter
@@ -909,16 +927,16 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     }
 
     memcpy(pole_out, p->postfilter_mem, sizeof(float) * LP_FILTER_ORDER);
-    ff_celp_lp_synthesis_filterf(pole_out + LP_FILTER_ORDER, lpc_d, samples,
+    p->celpf_ctx.celp_lp_synthesis_filterf(pole_out + LP_FILTER_ORDER, lpc_d, samples,
                                  AMR_SUBFRAME_SIZE, LP_FILTER_ORDER);
     memcpy(p->postfilter_mem, pole_out + AMR_SUBFRAME_SIZE,
            sizeof(float) * LP_FILTER_ORDER);
 
-    ff_celp_lp_zero_synthesis_filterf(buf_out, lpc_n,
+    p->celpf_ctx.celp_lp_zero_synthesis_filterf(buf_out, lpc_n,
                                       pole_out + LP_FILTER_ORDER,
                                       AMR_SUBFRAME_SIZE, LP_FILTER_ORDER);
 
-    ff_tilt_compensation(&p->tilt_mem, tilt_factor(lpc_n, lpc_d), buf_out,
+    ff_tilt_compensation(&p->tilt_mem, tilt_factor(p, lpc_n, lpc_d), buf_out,
                          AMR_SUBFRAME_SIZE);
 
     ff_adaptive_gain_control(buf_out, buf_out, speech_gain, AMR_SUBFRAME_SIZE,
@@ -945,10 +963,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = AMR_BLOCK_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (float *)frame->data[0];
 
     p->cur_frame_mode = unpack_bitstream(p, buf, buf_size);
@@ -957,7 +973,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
     if (p->cur_frame_mode == MODE_DTX) {
-        avpriv_request_sample(avctx, "dtx mode");
+        avpriv_report_missing_feature(avctx, "dtx mode");
+        av_log(avctx, AV_LOG_INFO, "Note: libopencore_amrnb supports dtx\n");
         return AVERROR_PATCHWELCOME;
     }
 
@@ -995,7 +1012,7 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
         p->fixed_gain[4] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  avpriv_scalarproduct_float_c(p->fixed_vector,
+                       p->celpm_ctx.dot_productf(p->fixed_vector,
                                                                p->fixed_vector,
                                                                AMR_SUBFRAME_SIZE) /
                                   AMR_SUBFRAME_SIZE,
@@ -1041,7 +1058,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
         update_state(p);
     }
 
-    ff_acelp_apply_order_2_transfer_function(buf_out, buf_out, highpass_zeros,
+    p->acelpf_ctx.acelp_apply_order_2_transfer_function(buf_out,
+                                             buf_out, highpass_zeros,
                                              highpass_poles,
                                              highpass_gain * AMR_SAMPLE_SCALE,
                                              p->high_pass_mem, AMR_BLOCK_SIZE);
@@ -1052,7 +1070,7 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
      * for fixed_gain_smooth.
      * The specification has an incorrect formula: the reference decoder uses
      * qbar(n-1) rather than qbar(n) in section 6.1(4) equation 71. */
-    ff_weighted_vector_sumf(p->lsf_avg, p->lsf_avg, p->lsf_q[3],
+    p->acelpv_ctx.weighted_vector_sumf(p->lsf_avg, p->lsf_avg, p->lsf_q[3],
                             0.84, 0.16, LP_FILTER_ORDER);
 
     *got_frame_ptr = 1;
diff --git a/libavcodec/amrwbdata.h b/libavcodec/amrwbdata.h
index 19f5a31..8a8cbfd 100644
--- a/libavcodec/amrwbdata.h
+++ b/libavcodec/amrwbdata.h
@@ -2,20 +2,20 @@
  * AMR wideband data and definitions
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/amrwbdec.c b/libavcodec/amrwbdec.c
index f1fbcc0..47fe7eb 100644
--- a/libavcodec/amrwbdec.c
+++ b/libavcodec/amrwbdec.c
@@ -2,20 +2,20 @@
  * AMR wideband decoder
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A particular PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "avcodec.h"
 #include "lsp.h"
 #include "celp_filters.h"
+#include "celp_math.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
 #include "acelp_pitch_delay.h"
@@ -41,6 +42,7 @@
 #include "amr.h"
 
 #include "amrwbdata.h"
+#include "mips/amrwbdec_mips.h"
 
 typedef struct AMRWBContext {
     AMRWBFrame                             frame; ///< AMRWB parameters decoded from bitstream
@@ -84,6 +86,11 @@ typedef struct AMRWBContext {
 
     AVLFG                                   prng; ///< random number generator for white noise excitation
     uint8_t                          first_frame; ///< flag active during decoding of the first frame
+    ACELPFContext                     acelpf_ctx; ///< context for filters for ACELP-based codecs
+    ACELPVContext                     acelpv_ctx; ///< context for vector operations for ACELP-based codecs
+    CELPFContext                       celpf_ctx; ///< context for filters for CELP-based codecs
+    CELPMContext                       celpm_ctx; ///< context for fixed point math operations
+
 } AMRWBContext;
 
 static av_cold int amrwb_decode_init(AVCodecContext *avctx)
@@ -98,7 +105,8 @@ static av_cold int amrwb_decode_init(AVCodecContext *avctx)
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 16000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 16000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
     av_lfg_init(&ctx->prng, 1);
@@ -112,6 +120,11 @@ static av_cold int amrwb_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 4; i++)
         ctx->prediction_error[i] = MIN_ENERGY;
 
+    ff_acelp_filter_init(&ctx->acelpf_ctx);
+    ff_acelp_vectors_init(&ctx->acelpv_ctx);
+    ff_celp_filter_init(&ctx->celpf_ctx);
+    ff_celp_math_init(&ctx->celpm_ctx);
+
     return 0;
 }
 
@@ -249,7 +262,7 @@ static void decode_pitch_lag_high(int *lag_int, int *lag_frac, int pitch_index,
             *lag_frac = pitch_index - (*lag_int << 2) + 136;
         } else if (pitch_index < 440) {
             *lag_int  = (pitch_index + 257 - 376) >> 1;
-            *lag_frac = (pitch_index - (*lag_int << 1) + 256 - 376) << 1;
+            *lag_frac = (pitch_index - (*lag_int << 1) + 256 - 376) * 2;
             /* the actual resolution is 1/2 but expressed as 1/4 */
         } else {
             *lag_int  = pitch_index - 280;
@@ -279,7 +292,7 @@ static void decode_pitch_lag_low(int *lag_int, int *lag_frac, int pitch_index,
     if (subframe == 0 || (subframe == 2 && mode != MODE_6k60)) {
         if (pitch_index < 116) {
             *lag_int  = (pitch_index + 69) >> 1;
-            *lag_frac = (pitch_index - (*lag_int << 1) + 68) << 1;
+            *lag_frac = (pitch_index - (*lag_int << 1) + 68) * 2;
         } else {
             *lag_int  = pitch_index - 24;
             *lag_frac = 0;
@@ -289,7 +302,7 @@ static void decode_pitch_lag_low(int *lag_int, int *lag_frac, int pitch_index,
                                 AMRWB_P_DELAY_MIN, AMRWB_P_DELAY_MAX - 15);
     } else {
         *lag_int  = (pitch_index + 1) >> 1;
-        *lag_frac = (pitch_index - (*lag_int << 1)) << 1;
+        *lag_frac = (pitch_index - (*lag_int << 1)) * 2;
         *lag_int += *base_lag_int;
     }
 }
@@ -323,7 +336,8 @@ static void decode_pitch_vector(AMRWBContext *ctx,
 
     /* Calculate the pitch vector by interpolating the past excitation at the
        pitch lag using a hamming windowed sinc function */
-    ff_acelp_interpolatef(exc, exc + 1 - pitch_lag_int,
+    ctx->acelpf_ctx.acelp_interpolatef(exc,
+                          exc + 1 - pitch_lag_int,
                           ac_inter, 4,
                           pitch_lag_frac + (pitch_lag_frac > 0 ? 0 : 4),
                           LP_ORDER, AMRWB_SFR_SIZE + 1);
@@ -341,7 +355,7 @@ static void decode_pitch_vector(AMRWBContext *ctx,
 }
 
 /** Get x bits in the index interval [lsb,lsb+len-1] inclusive */
-#define BIT_STR(x,lsb,len) (((x) >> (lsb)) & ((1 << (len)) - 1))
+#define BIT_STR(x,lsb,len) av_mod_uintp2((x) >> (lsb), (len))
 
 /** Get the bit at specified position */
 #define BIT_POS(x, p) (((x) >> (p)) & 1)
@@ -582,20 +596,22 @@ static void pitch_sharpening(AMRWBContext *ctx, float *fixed_vector)
  *
  * @param[in] p_vector, f_vector   Pitch and fixed excitation vectors
  * @param[in] p_gain, f_gain       Pitch and fixed gains
+ * @param[in] ctx                  The context
  */
 // XXX: There is something wrong with the precision here! The magnitudes
 // of the energies are not correct. Please check the reference code carefully
 static float voice_factor(float *p_vector, float p_gain,
-                          float *f_vector, float f_gain)
+                          float *f_vector, float f_gain,
+                          CELPMContext *ctx)
 {
-    double p_ener = (double) avpriv_scalarproduct_float_c(p_vector, p_vector,
+    double p_ener = (double) ctx->dot_productf(p_vector, p_vector,
                                                           AMRWB_SFR_SIZE) *
                     p_gain * p_gain;
-    double f_ener = (double) avpriv_scalarproduct_float_c(f_vector, f_vector,
+    double f_ener = (double) ctx->dot_productf(f_vector, f_vector,
                                                           AMRWB_SFR_SIZE) *
                     f_gain * f_gain;
 
-    return (p_ener - f_ener) / (p_ener + f_ener);
+    return (p_ener - f_ener) / (p_ener + f_ener + 0.01);
 }
 
 /**
@@ -755,13 +771,13 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
                       float fixed_gain, const float *fixed_vector,
                       float *samples)
 {
-    ff_weighted_vector_sumf(excitation, ctx->pitch_vector, fixed_vector,
+    ctx->acelpv_ctx.weighted_vector_sumf(excitation, ctx->pitch_vector, fixed_vector,
                             ctx->pitch_gain[0], fixed_gain, AMRWB_SFR_SIZE);
 
     /* emphasize pitch vector contribution in low bitrate modes */
     if (ctx->pitch_gain[0] > 0.5 && ctx->fr_cur_mode <= MODE_8k85) {
         int i;
-        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+        float energy = ctx->celpm_ctx.dot_productf(excitation, excitation,
                                                     AMRWB_SFR_SIZE);
 
         // XXX: Weird part in both ref code and spec. A unknown parameter
@@ -775,7 +791,7 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
                                                 energy, AMRWB_SFR_SIZE);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, lpc, excitation,
+    ctx->celpf_ctx.celp_lp_synthesis_filterf(samples, lpc, excitation,
                                  AMRWB_SFR_SIZE, LP_ORDER);
 }
 
@@ -807,8 +823,9 @@ static void de_emphasis(float *out, float *in, float m, float mem[1])
  * @param[out] out                 Buffer for interpolated signal
  * @param[in]  in                  Current signal data (length 0.8*o_size)
  * @param[in]  o_size              Output signal length
+ * @param[in] ctx                  The context
  */
-static void upsample_5_4(float *out, const float *in, int o_size)
+static void upsample_5_4(float *out, const float *in, int o_size, CELPMContext *ctx)
 {
     const float *in0 = in - UPS_FIR_SIZE + 1;
     int i, j, k;
@@ -821,7 +838,7 @@ static void upsample_5_4(float *out, const float *in, int o_size)
         i++;
 
         for (k = 1; k < 5; k++) {
-            out[i] = avpriv_scalarproduct_float_c(in0 + int_part,
+            out[i] = ctx->dot_productf(in0 + int_part,
                                                   upsample_fir[4 - frac_part],
                                                   UPS_MEM_SIZE);
             int_part++;
@@ -845,15 +862,20 @@ static float find_hb_gain(AMRWBContext *ctx, const float *synth,
 {
     int wsp = (vad > 0);
     float tilt;
+    float tmp;
 
     if (ctx->fr_cur_mode == MODE_23k85)
         return qua_hb_gain[hb_idx] * (1.0f / (1 << 14));
 
-    tilt = avpriv_scalarproduct_float_c(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
-           avpriv_scalarproduct_float_c(synth, synth, AMRWB_SFR_SIZE);
+    tmp = ctx->celpm_ctx.dot_productf(synth, synth + 1, AMRWB_SFR_SIZE - 1);
+
+    if (tmp > 0) {
+        tilt = tmp / ctx->celpm_ctx.dot_productf(synth, synth, AMRWB_SFR_SIZE);
+    } else
+        tilt = 0;
 
     /* return gain bounded by [0.1, 1.0] */
-    return av_clipf((1.0 - FFMAX(0.0, tilt)) * (1.25 - 0.25 * wsp), 0.1, 1.0);
+    return av_clipf((1.0 - tilt) * (1.25 - 0.25 * wsp), 0.1, 1.0);
 }
 
 /**
@@ -869,7 +891,7 @@ static void scaled_hb_excitation(AMRWBContext *ctx, float *hb_exc,
                                  const float *synth_exc, float hb_gain)
 {
     int i;
-    float energy = avpriv_scalarproduct_float_c(synth_exc, synth_exc,
+    float energy = ctx->celpm_ctx.dot_productf(synth_exc, synth_exc,
                                                 AMRWB_SFR_SIZE);
 
     /* Generate a white-noise excitation */
@@ -1000,7 +1022,7 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
         float e_isf[LP_ORDER_16k]; // ISF vector for extrapolation
         double e_isp[LP_ORDER_16k];
 
-        ff_weighted_vector_sumf(e_isf, isf_past, isf, isfp_inter[subframe],
+        ctx->acelpv_ctx.weighted_vector_sumf(e_isf, isf_past, isf, isfp_inter[subframe],
                                 1.0 - isfp_inter[subframe], LP_ORDER);
 
         extrapolate_isf(e_isf);
@@ -1014,7 +1036,7 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
         lpc_weighting(hb_lpc, ctx->lp_coef[subframe], 0.6, LP_ORDER);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, hb_lpc, exc, AMRWB_SFR_SIZE_16k,
+    ctx->celpf_ctx.celp_lp_synthesis_filterf(samples, hb_lpc, exc, AMRWB_SFR_SIZE_16k,
                                  (mode == MODE_6k60) ? LP_ORDER_16k : LP_ORDER);
 }
 
@@ -1029,6 +1051,8 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
  *
  * @remark It is safe to pass the same array in in and out parameters
  */
+
+#ifndef hb_fir_filter
 static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
                           float mem[HB_FIR_SIZE], const float *in)
 {
@@ -1046,6 +1070,7 @@ static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
 
     memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
 }
+#endif /* hb_fir_filter */
 
 /**
  * Update context state before the next subframe.
@@ -1089,10 +1114,8 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 4 * AMRWB_SFR_SIZE_16k;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (float *)frame->data[0];
 
     header_size      = decode_mime_header(ctx, buf);
@@ -1163,7 +1186,7 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         ctx->fixed_gain[0] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  avpriv_scalarproduct_float_c(ctx->fixed_vector,
+                                  ctx->celpm_ctx.dot_productf(ctx->fixed_vector,
                                                                ctx->fixed_vector,
                                                                AMRWB_SFR_SIZE) /
                                   AMRWB_SFR_SIZE,
@@ -1172,7 +1195,8 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         /* Calculate voice factor and store tilt for next subframe */
         voice_fac      = voice_factor(ctx->pitch_vector, ctx->pitch_gain[0],
-                                      ctx->fixed_vector, ctx->fixed_gain[0]);
+                                      ctx->fixed_vector, ctx->fixed_gain[0],
+                                      &ctx->celpm_ctx);
         ctx->tilt_coef = voice_fac * 0.25 + 0.25;
 
         /* Construct current excitation */
@@ -1198,15 +1222,15 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
         de_emphasis(&ctx->samples_up[UPS_MEM_SIZE],
                     &ctx->samples_az[LP_ORDER], PREEMPH_FAC, ctx->demph_mem);
 
-        ff_acelp_apply_order_2_transfer_function(&ctx->samples_up[UPS_MEM_SIZE],
+        ctx->acelpf_ctx.acelp_apply_order_2_transfer_function(&ctx->samples_up[UPS_MEM_SIZE],
             &ctx->samples_up[UPS_MEM_SIZE], hpf_zeros, hpf_31_poles,
             hpf_31_gain, ctx->hpf_31_mem, AMRWB_SFR_SIZE);
 
         upsample_5_4(sub_buf, &ctx->samples_up[UPS_FIR_SIZE],
-                     AMRWB_SFR_SIZE_16k);
+                     AMRWB_SFR_SIZE_16k, &ctx->celpm_ctx);
 
         /* High frequency band (6.4 - 7.0 kHz) generation part */
-        ff_acelp_apply_order_2_transfer_function(hb_samples,
+        ctx->acelpf_ctx.acelp_apply_order_2_transfer_function(hb_samples,
             &ctx->samples_up[UPS_MEM_SIZE], hpf_zeros, hpf_400_poles,
             hpf_400_gain, ctx->hpf_400_mem, AMRWB_SFR_SIZE);
 
diff --git a/libavcodec/anm.c b/libavcodec/anm.c
index af8d843..ab6a399 100644
--- a/libavcodec/anm.c
+++ b/libavcodec/anm.c
@@ -2,20 +2,20 @@
  * Deluxe Paint Animation decoder
  * Copyright (c) 2009 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,12 +47,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     bytestream2_init(&s->gb, avctx->extradata, avctx->extradata_size);
-    if (bytestream2_get_bytes_left(&s->gb) < 16 * 8 + 4 * 256)
+    if (bytestream2_get_bytes_left(&s->gb) < 16 * 8 + 4 * 256) {
+        av_frame_free(&s->frame);
         return AVERROR_INVALIDDATA;
+    }
 
     bytestream2_skipu(&s->gb, 16 * 8);
     for (i = 0; i < 256; i++)
-        s->palette[i] = bytestream2_get_le32u(&s->gb);
+        s->palette[i] = (0xFFU << 24) | bytestream2_get_le32u(&s->gb);
 
     return 0;
 }
@@ -117,10 +119,8 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t *dst, *dst_end;
     int count, ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
     dst     = s->frame->data[0];
     dst_end = s->frame->data[0] + s->frame->linesize[0]*avctx->height;
 
@@ -128,11 +128,11 @@ static int decode_frame(AVCodecContext *avctx,
 
     if (bytestream2_get_byte(&s->gb) != 0x42) {
         avpriv_request_sample(avctx, "Unknown record type");
-        return buf_size;
+        return AVERROR_INVALIDDATA;
     }
     if (bytestream2_get_byte(&s->gb)) {
         avpriv_request_sample(avctx, "Padding bytes");
-        return buf_size;
+        return AVERROR_PATCHWELCOME;
     }
     bytestream2_skip(&s->gb, 2);
 
diff --git a/libavcodec/ansi.c b/libavcodec/ansi.c
index 0bdbdbe..f1fafab 100644
--- a/libavcodec/ansi.c
+++ b/libavcodec/ansi.c
@@ -2,20 +2,20 @@
  * ASCII/ANSI art decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "libavutil/common.h"
 #include "libavutil/frame.h"
 #include "libavutil/lfg.h"
+#include "libavutil/xga_font_data.h"
 #include "avcodec.h"
 #include "cga_data.h"
 #include "internal.h"
@@ -60,6 +61,7 @@ typedef struct AnsiContext {
     int attributes;       /**< attribute flags */
     int fg;               /**< foreground color */
     int bg;               /**< background color */
+    int first_frame;
 
     /* ansi parser state machine */
     enum {
@@ -78,12 +80,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
     AnsiContext *s = avctx->priv_data;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
-    s->frame = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
-
     /* defaults */
-    s->font        = ff_vga16_font;
+    s->font        = avpriv_vga16_font;
     s->font_height = 16;
     s->fg          = DEFAULT_FG_COLOR;
     s->bg          = DEFAULT_BG_COLOR;
@@ -92,16 +90,39 @@ static av_cold int decode_init(AVCodecContext *avctx)
         int ret = ff_set_dimensions(avctx, 80 << 3, 25 << 4);
         if (ret < 0)
             return ret;
+    } else if (avctx->width % FONT_WIDTH || avctx->height % s->font_height) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid dimensions %d %d\n", avctx->width, avctx->height);
+        return AVERROR(EINVAL);
     }
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
+static void set_palette(uint32_t *pal)
+{
+    int r, g, b;
+    memcpy(pal, ff_cga_palette, 16 * 4);
+    pal += 16;
+#define COLOR(x) ((x) * 40 + 55)
+    for (r = 0; r < 6; r++)
+        for (g = 0; g < 6; g++)
+            for (b = 0; b < 6; b++)
+                *pal++ = 0xFF000000 | (COLOR(r) << 16) | (COLOR(g) << 8) | COLOR(b);
+#define GRAY(x) ((x) * 10 + 8)
+    for (g = 0; g < 24; g++)
+        *pal++ = 0xFF000000 | (GRAY(g) << 16) | (GRAY(g) << 8) | GRAY(g);
+}
+
 static void hscroll(AVCodecContext *avctx)
 {
     AnsiContext *s = avctx->priv_data;
     int i;
 
-    if (s->y < avctx->height - s->font_height) {
+    if (s->y <= avctx->height - 2*s->font_height) {
         s->y += s->font_height;
         return;
     }
@@ -154,7 +175,7 @@ static void draw_char(AVCodecContext *avctx, int c)
     ff_draw_pc_font(s->frame->data[0] + s->y * s->frame->linesize[0] + s->x,
                     s->frame->linesize[0], s->font, s->font_height, c, fg, bg);
     s->x += FONT_WIDTH;
-    if (s->x >= avctx->width) {
+    if (s->x > avctx->width - FONT_WIDTH) {
         s->x = 0;
         hscroll(avctx);
     }
@@ -168,8 +189,8 @@ static int execute_code(AVCodecContext * avctx, int c)
 {
     AnsiContext *s = avctx->priv_data;
     int ret, i;
-    int width = 0;
-    int height = 0;
+    int width  = avctx->width;
+    int height = avctx->height;
 
     switch(c) {
     case 'A': //Cursor Up
@@ -195,19 +216,19 @@ static int execute_code(AVCodecContext * avctx, int c)
             s->args[0] = DEFAULT_SCREEN_MODE;
         switch(s->args[0]) {
         case 0: case 1: case 4: case 5: case 13: case 19: //320x200 (25 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 40<<3;
             height = 25<<3;
             break;
         case 2: case 3: //640x400 (25 rows)
-            s->font = ff_vga16_font;
+            s->font = avpriv_vga16_font;
             s->font_height = 16;
             width  = 80<<3;
             height = 25<<4;
             break;
         case 6: case 14: //640x200 (25 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 25<<3;
@@ -215,13 +236,13 @@ static int execute_code(AVCodecContext * avctx, int c)
         case 7: //set line wrapping
             break;
         case 15: case 16: //640x350 (43 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 43<<3;
             break;
         case 17: case 18: //640x480 (60 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 60<<4;
@@ -229,20 +250,19 @@ static int execute_code(AVCodecContext * avctx, int c)
         default:
             avpriv_request_sample(avctx, "Unsupported screen mode");
         }
-        if (width != 0 && height != 0 &&
-            (width != avctx->width || height != avctx->height)) {
+        s->x = av_clip(s->x, 0, width  - FONT_WIDTH);
+        s->y = av_clip(s->y, 0, height - s->font_height);
+        if (width != avctx->width || height != avctx->height) {
             av_frame_unref(s->frame);
             ret = ff_set_dimensions(avctx, width, height);
             if (ret < 0)
                 return ret;
-            ret = ff_get_buffer(avctx, s->frame, AV_GET_BUFFER_FLAG_REF);
-            if (ret < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, s->frame,
+                                     AV_GET_BUFFER_FLAG_REF)) < 0)
                 return ret;
-            }
             s->frame->pict_type           = AV_PICTURE_TYPE_I;
             s->frame->palette_has_changed = 1;
-            memcpy(s->frame->data[1], ff_cga_palette, 16 * 4);
+            set_palette((uint32_t *)s->frame->data[1]);
             erase_screen(avctx);
         } else if (c == 'l') {
             erase_screen(avctx);
@@ -290,12 +310,20 @@ static int execute_code(AVCodecContext * avctx, int c)
                 s->bg = DEFAULT_BG_COLOR;
             } else if (m == 1 || m == 2 || m == 4 || m == 5 || m == 7 || m == 8) {
                 s->attributes |= 1 << (m - 1);
-            } else if (m >= 30 && m <= 38) {
+            } else if (m >= 30 && m <= 37) {
                 s->fg = ansi_to_cga[m - 30];
+            } else if (m == 38 && i + 2 < FFMIN(s->nb_args, MAX_NB_ARGS) && s->args[i + 1] == 5 && s->args[i + 2] < 256) {
+                int index = s->args[i + 2];
+                s->fg = index < 16 ? ansi_to_cga[index] : index;
+                i += 2;
             } else if (m == 39) {
                 s->fg = ansi_to_cga[DEFAULT_FG_COLOR];
             } else if (m >= 40 && m <= 47) {
                 s->bg = ansi_to_cga[m - 40];
+            } else if (m == 48 && i + 2 < FFMIN(s->nb_args, MAX_NB_ARGS) && s->args[i + 1] == 5 && s->args[i + 2] < 256) {
+                int index = s->args[i + 2];
+                s->bg = index < 16 ? ansi_to_cga[index] : index;
+                i += 2;
             } else if (m == 49) {
                 s->fg = ansi_to_cga[DEFAULT_BG_COLOR];
             } else {
@@ -319,6 +347,8 @@ static int execute_code(AVCodecContext * avctx, int c)
         avpriv_request_sample(avctx, "Unknown escape code");
         break;
     }
+    s->x = av_clip(s->x, 0, avctx->width  - FONT_WIDTH);
+    s->y = av_clip(s->y, 0, avctx->height - s->font_height);
     return 0;
 }
 
@@ -332,19 +362,21 @@ static int decode_frame(AVCodecContext *avctx,
     const uint8_t *buf_end   = buf+buf_size;
     int ret, i, count;
 
-    ret = ff_reget_buffer(avctx, s->frame);
-    if (ret < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
     if (!avctx->frame_number) {
-        memset(s->frame->data[0], 0, avctx->height * FFABS(s->frame->linesize[0]));
+        for (i=0; i<avctx->height; i++)
+            memset(s->frame->data[0]+ i*s->frame->linesize[0], 0, avctx->width);
         memset(s->frame->data[1], 0, AVPALETTE_SIZE);
     }
 
     s->frame->pict_type           = AV_PICTURE_TYPE_I;
     s->frame->palette_has_changed = 1;
-    memcpy(s->frame->data[1], ff_cga_palette, 16 * 4);
+    set_palette((uint32_t *)s->frame->data[1]);
+    if (!s->first_frame) {
+        erase_screen(avctx);
+        s->first_frame = 1;
+    }
 
     while(buf < buf_end) {
         switch(s->state) {
@@ -383,7 +415,7 @@ static int decode_frame(AVCodecContext *avctx,
             if (buf[0] == '[') {
                 s->state   = STATE_CODE;
                 s->nb_args = 0;
-                s->args[0] = 0;
+                s->args[0] = -1;
             } else {
                 s->state = STATE_NORMAL;
                 draw_char(avctx, 0x1B);
@@ -394,8 +426,8 @@ static int decode_frame(AVCodecContext *avctx,
             switch(buf[0]) {
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9':
-                if (s->nb_args < MAX_NB_ARGS)
-                    s->args[s->nb_args] = s->args[s->nb_args] * 10 + buf[0] - '0';
+                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args] < 6553)
+                    s->args[s->nb_args] = FFMAX(s->args[s->nb_args], 0) * 10 + buf[0] - '0';
                 break;
             case ';':
                 s->nb_args++;
@@ -411,7 +443,7 @@ static int decode_frame(AVCodecContext *avctx,
             default:
                 if (s->nb_args > MAX_NB_ARGS)
                     av_log(avctx, AV_LOG_WARNING, "args overflow (%i)\n", s->nb_args);
-                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args])
+                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args] >= 0)
                     s->nb_args++;
                 if ((ret = execute_code(avctx, buf[0])) < 0)
                     return ret;
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index da45e85..15eb416 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,13 +25,12 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
-
-#include "apedsp.h"
+#include "lossless_audiodsp.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bswapdsp.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "unary.h"
 
 /**
@@ -138,7 +137,7 @@ typedef struct APEContext {
     AVClass *class;                          ///< class for AVOptions
     AVCodecContext *avctx;
     BswapDSPContext bdsp;
-    APEDSPContext adsp;
+    LLAudDSPContext adsp;
     int channels;
     int samples;                             ///< samples left to decode in current frame
     int bps;
@@ -163,7 +162,7 @@ typedef struct APEContext {
     APERice riceX;                           ///< rice code parameters for the second channel
     APERice riceY;                           ///< rice code parameters for the first channel
     APEFilter filters[APE_FILTER_LEVELS][2]; ///< filters used for reconstruction
-    BitstreamContext bc;
+    GetBitContext gb;
 
     uint8_t *data;                           ///< current frame data
     uint8_t *data_end;                       ///< frame data end
@@ -213,19 +212,6 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res   += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-    return res;
-}
-
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -261,9 +247,10 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
     s->compression_level = AV_RL16(avctx->extradata + 2);
     s->flags             = AV_RL16(avctx->extradata + 4);
 
-    av_log(avctx, AV_LOG_DEBUG, "Compression Level: %d - Flags: %d\n",
+    av_log(avctx, AV_LOG_VERBOSE, "Compression Level: %d - Flags: %d\n",
            s->compression_level, s->flags);
     if (s->compression_level % 1000 || s->compression_level > COMPRESSION_LEVEL_INSANE ||
+        !s->compression_level ||
         (s->fileversion < 3930 && s->compression_level == COMPRESSION_LEVEL_INSANE)) {
         av_log(avctx, AV_LOG_ERROR, "Incorrect compression level %d\n",
                s->compression_level);
@@ -306,16 +293,8 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
-    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
-    if (ARCH_ARM)
-        ff_apedsp_init_arm(&s->adsp);
-    if (ARCH_PPC)
-        ff_apedsp_init_ppc(&s->adsp);
-    if (ARCH_X86)
-        ff_apedsp_init_x86(&s->adsp);
-
     ff_bswapdsp_init(&s->bdsp);
+    ff_llauddsp_init(&s->adsp);
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
     return 0;
@@ -485,24 +464,24 @@ static inline void update_rice(APERice *rice, unsigned int x)
         rice->k++;
 }
 
-static inline int get_rice_ook(BitstreamContext *bc, int k)
+static inline int get_rice_ook(GetBitContext *gb, int k)
 {
     unsigned int x;
 
-    x = get_unary(bc, 1, bitstream_bits_left(bc));
+    x = get_unary(gb, 1, get_bits_left(gb));
 
     if (k)
-        x = (x << k) | bitstream_read(bc, k);
+        x = (x << k) | get_bits(gb, k);
 
     return x;
 }
 
-static inline int ape_decode_value_3860(APEContext *ctx, BitstreamContext *bc,
+static inline int ape_decode_value_3860(APEContext *ctx, GetBitContext *gb,
                                         APERice *rice)
 {
     unsigned int x, overflow;
 
-    overflow = get_unary(bc, 1, bitstream_bits_left(bc));
+    overflow = get_unary(gb, 1, get_bits_left(gb));
 
     if (ctx->fileversion > 3880) {
         while (overflow >= 16) {
@@ -513,9 +492,12 @@ static inline int ape_decode_value_3860(APEContext *ctx, BitstreamContext *bc,
 
     if (!rice->k)
         x = overflow;
-    else
-        x = (overflow << rice->k) + bitstream_read(bc, rice->k);
-
+    else if(rice->k <= MIN_CACHE_BITS) {
+        x = (overflow << rice->k) + get_bits(gb, rice->k);
+    } else {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Too many bits: %"PRIu32"\n", rice->k);
+        return AVERROR_INVALIDDATA;
+    }
     rice->ksum += x - (rice->ksum + 8 >> 4);
     if (rice->ksum < (rice->k ? 1 << (rice->k + 4) : 0))
         rice->k--;
@@ -523,10 +505,7 @@ static inline int ape_decode_value_3860(APEContext *ctx, BitstreamContext *bc,
         rice->k++;
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
@@ -542,9 +521,13 @@ static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
     } else
         tmpk = (rice->k < 1) ? 0 : rice->k - 1;
 
-    if (tmpk <= 16 || ctx->fileversion < 3910)
+    if (tmpk <= 16 || ctx->fileversion < 3910) {
+        if (tmpk > 23) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "Too many bits: %d\n", tmpk);
+            return AVERROR_INVALIDDATA;
+        }
         x = range_decode_bits(ctx, tmpk);
-    else if (tmpk <= 32) {
+    } else if (tmpk <= 31) {
         x = range_decode_bits(ctx, 16);
         x |= (range_decode_bits(ctx, tmpk - 16) << 16);
     } else {
@@ -556,10 +539,7 @@ static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
     update_rice(rice, x);
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static inline int ape_decode_value_3990(APEContext *ctx, APERice *rice)
@@ -602,13 +582,10 @@ static inline int ape_decode_value_3990(APEContext *ctx, APERice *rice)
     update_rice(rice, x);
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
-static void decode_array_0000(APEContext *ctx, BitstreamContext *bc,
+static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
                               int32_t *out, APERice *rice, int blockstodecode)
 {
     int i;
@@ -616,19 +593,23 @@ static void decode_array_0000(APEContext *ctx, BitstreamContext *bc,
 
     rice->ksum = 0;
     for (i = 0; i < FFMIN(blockstodecode, 5); i++) {
-        out[i] = get_rice_ook(&ctx->bc, 10);
+        out[i] = get_rice_ook(&ctx->gb, 10);
         rice->ksum += out[i];
     }
     rice->k = av_log2(rice->ksum / 10) + 1;
+    if (rice->k >= 24)
+        return;
     for (; i < FFMIN(blockstodecode, 64); i++) {
-        out[i] = get_rice_ook(&ctx->bc, rice->k);
+        out[i] = get_rice_ook(&ctx->gb, rice->k);
         rice->ksum += out[i];
         rice->k = av_log2(rice->ksum / ((i + 1) * 2)) + 1;
+        if (rice->k >= 24)
+            return;
     }
     ksummax = 1 << rice->k + 7;
     ksummin = rice->k ? (1 << rice->k + 6) : 0;
     for (; i < blockstodecode; i++) {
-        out[i] = get_rice_ook(&ctx->bc, rice->k);
+        out[i] = get_rice_ook(&ctx->gb, rice->k);
         rice->ksum += out[i] - out[i - 64];
         while (rice->ksum < ksummin) {
             rice->k--;
@@ -644,25 +625,21 @@ static void decode_array_0000(APEContext *ctx, BitstreamContext *bc,
         }
     }
 
-    for (i = 0; i < blockstodecode; i++) {
-        if (out[i] & 1)
-            out[i] = (out[i] >> 1) + 1;
-        else
-            out[i] = -(out[i] >> 1);
-    }
+    for (i = 0; i < blockstodecode; i++)
+        out[i] = ((out[i] >> 1) ^ ((out[i] & 1) - 1)) + 1;
 }
 
 static void entropy_decode_mono_0000(APEContext *ctx, int blockstodecode)
 {
-    decode_array_0000(ctx, &ctx->bc, ctx->decoded[0], &ctx->riceY,
+    decode_array_0000(ctx, &ctx->gb, ctx->decoded[0], &ctx->riceY,
                       blockstodecode);
 }
 
 static void entropy_decode_stereo_0000(APEContext *ctx, int blockstodecode)
 {
-    decode_array_0000(ctx, &ctx->bc, ctx->decoded[0], &ctx->riceY,
+    decode_array_0000(ctx, &ctx->gb, ctx->decoded[0], &ctx->riceY,
                       blockstodecode);
-    decode_array_0000(ctx, &ctx->bc, ctx->decoded[1], &ctx->riceX,
+    decode_array_0000(ctx, &ctx->gb, ctx->decoded[1], &ctx->riceX,
                       blockstodecode);
 }
 
@@ -671,7 +648,7 @@ static void entropy_decode_mono_3860(APEContext *ctx, int blockstodecode)
     int32_t *decoded0 = ctx->decoded[0];
 
     while (blockstodecode--)
-        *decoded0++ = ape_decode_value_3860(ctx, &ctx->bc, &ctx->riceY);
+        *decoded0++ = ape_decode_value_3860(ctx, &ctx->gb, &ctx->riceY);
 }
 
 static void entropy_decode_stereo_3860(APEContext *ctx, int blockstodecode)
@@ -681,9 +658,9 @@ static void entropy_decode_stereo_3860(APEContext *ctx, int blockstodecode)
     int blocks = blockstodecode;
 
     while (blockstodecode--)
-        *decoded0++ = ape_decode_value_3860(ctx, &ctx->bc, &ctx->riceY);
+        *decoded0++ = ape_decode_value_3860(ctx, &ctx->gb, &ctx->riceY);
     while (blocks--)
-        *decoded1++ = ape_decode_value_3860(ctx, &ctx->bc, &ctx->riceX);
+        *decoded1++ = ape_decode_value_3860(ctx, &ctx->gb, &ctx->riceX);
 }
 
 static void entropy_decode_mono_3900(APEContext *ctx, int blockstodecode)
@@ -748,7 +725,7 @@ static int init_entropy_decoder(APEContext *ctx)
             return AVERROR_INVALIDDATA;
         ctx->CRC = bytestream_get_be32(&ctx->ptr);
     } else {
-        ctx->CRC = bitstream_read(&ctx->bc, 32);
+        ctx->CRC = get_bits_long(&ctx->gb, 32);
     }
 
     /* Read the frame flags if they exist */
@@ -909,11 +886,14 @@ static av_always_inline int filter_3800(APEPredictor *p,
     return p->filterA[filter];
 }
 
-static void long_filter_high_3800(int32_t *buffer, int order, int shift,
-                                  int32_t *coeffs, int32_t *delay, int length)
+static void long_filter_high_3800(int32_t *buffer, int order, int shift, int length)
 {
     int i, j;
     int32_t dotprod, sign;
+    int32_t coeffs[256], delay[256];
+
+    if (order >= length)
+        return;
 
     memset(coeffs, 0, order * sizeof(*coeffs));
     for (i = 0; i < order; i++)
@@ -923,7 +903,7 @@ static void long_filter_high_3800(int32_t *buffer, int order, int shift,
         sign = APESIGN(buffer[i]);
         for (j = 0; j < order; j++) {
             dotprod += delay[j] * coeffs[j];
-            coeffs[j] -= (((delay[j] >> 30) & 2) - 1) * sign;
+            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
         }
         buffer[i] -= dotprod >> shift;
         for (j = 0; j < order - 1; j++)
@@ -943,7 +923,7 @@ static void long_filter_ehigh_3830(int32_t *buffer, int length)
         sign = APESIGN(buffer[i]);
         for (j = 7; j >= 0; j--) {
             dotprod += delay[j] * coeffs[j];
-            coeffs[j] -= (((delay[j] >> 30) & 2) - 1) * sign;
+            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
         }
         for (j = 7; j > 0; j--)
             delay[j] = delay[j - 1];
@@ -957,13 +937,12 @@ static void predictor_decode_stereo_3800(APEContext *ctx, int count)
     APEPredictor *p = &ctx->predictor;
     int32_t *decoded0 = ctx->decoded[0];
     int32_t *decoded1 = ctx->decoded[1];
-    int32_t coeffs[256], delay[256];
     int start = 4, shift = 10;
 
     if (ctx->compression_level == COMPRESSION_LEVEL_HIGH) {
         start = 16;
-        long_filter_high_3800(decoded0, 16, 9, coeffs, delay, count);
-        long_filter_high_3800(decoded1, 16, 9, coeffs, delay, count);
+        long_filter_high_3800(decoded0, 16, 9, count);
+        long_filter_high_3800(decoded1, 16, 9, count);
     } else if (ctx->compression_level == COMPRESSION_LEVEL_EXTRA_HIGH) {
         int order = 128, shift2 = 11;
 
@@ -975,8 +954,8 @@ static void predictor_decode_stereo_3800(APEContext *ctx, int count)
             long_filter_ehigh_3830(decoded1 + order, count - order);
         }
         start = order;
-        long_filter_high_3800(decoded0, order, shift2, coeffs, delay, count);
-        long_filter_high_3800(decoded1, order, shift2, coeffs, delay, count);
+        long_filter_high_3800(decoded0, order, shift2, count);
+        long_filter_high_3800(decoded1, order, shift2, count);
     }
 
     while (count--) {
@@ -1012,12 +991,11 @@ static void predictor_decode_mono_3800(APEContext *ctx, int count)
 {
     APEPredictor *p = &ctx->predictor;
     int32_t *decoded0 = ctx->decoded[0];
-    int32_t coeffs[256], delay[256];
     int start = 4, shift = 10;
 
     if (ctx->compression_level == COMPRESSION_LEVEL_HIGH) {
         start = 16;
-        long_filter_high_3800(decoded0, 16, 9, coeffs, delay, count);
+        long_filter_high_3800(decoded0, 16, 9, count);
     } else if (ctx->compression_level == COMPRESSION_LEVEL_EXTRA_HIGH) {
         int order = 128, shift2 = 11;
 
@@ -1028,7 +1006,7 @@ static void predictor_decode_mono_3800(APEContext *ctx, int count)
             long_filter_ehigh_3830(decoded0 + order, count - order);
         }
         start = order;
-        long_filter_high_3800(decoded0, order, shift2, coeffs, delay, count);
+        long_filter_high_3800(decoded0, order, shift2, count);
     }
 
     while (count--) {
@@ -1402,7 +1380,7 @@ static void ape_unpack_stereo(APEContext *ctx, int count)
     int32_t *decoded0 = ctx->decoded[0];
     int32_t *decoded1 = ctx->decoded[1];
 
-    if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
+    if ((ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) == APE_FRAMECODE_STEREO_SILENCE) {
         /* We are pure silence, so we're done. */
         av_log(ctx->avctx, AV_LOG_DEBUG, "pure silence stereo\n");
         return;
@@ -1434,6 +1412,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
     int32_t *sample24;
     int i, ch, ret;
     int blockstodecode;
+    uint64_t decoded_buffer_size;
 
     /* this should never be negative, but bad things will happen if it is, so
        check it just to make sure. */
@@ -1458,7 +1437,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
         }
         if (s->fileversion < 3950) // previous versions overread two bytes
             buf_size += 2;
-        av_fast_malloc(&s->data, &s->data_size, buf_size);
+        av_fast_padded_malloc(&s->data, &s->data_size, buf_size);
         if (!s->data)
             return AVERROR(ENOMEM);
         s->bdsp.bswap_buf((uint32_t *) s->data, (const uint32_t *) buf,
@@ -1481,26 +1460,26 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
             }
             s->ptr += offset;
         } else {
-            bitstream_init8(&s->bc, s->ptr, s->data_end - s->ptr);
+            if ((ret = init_get_bits8(&s->gb, s->ptr, s->data_end - s->ptr)) < 0)
+                return ret;
             if (s->fileversion > 3800)
-                bitstream_skip(&s->bc, offset * 8);
+                skip_bits_long(&s->gb, offset * 8);
             else
-                bitstream_skip(&s->bc, offset);
+                skip_bits_long(&s->gb, offset);
         }
 
-        if (!nblocks || nblocks > INT_MAX) {
+        if (!nblocks || nblocks > INT_MAX / 2 / sizeof(*s->decoded_buffer) - 8) {
             av_log(avctx, AV_LOG_ERROR, "Invalid sample count: %"PRIu32".\n",
                    nblocks);
             return AVERROR_INVALIDDATA;
         }
-        s->samples = nblocks;
 
         /* Initialize the frame decoder */
         if (init_frame_decoder(s) < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error reading frame header\n");
             return AVERROR_INVALIDDATA;
         }
-
+        s->samples = nblocks;
     }
 
     if (!s->data) {
@@ -1515,8 +1494,9 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
         blockstodecode = s->samples;
 
     /* reallocate decoded sample buffer if needed */
-    av_fast_malloc(&s->decoded_buffer, &s->decoded_size,
-                   2 * FFALIGN(blockstodecode, 8) * sizeof(*s->decoded_buffer));
+    decoded_buffer_size = 2LL * FFALIGN(blockstodecode, 8) * sizeof(*s->decoded_buffer);
+    av_assert0(decoded_buffer_size <= INT_MAX);
+    av_fast_malloc(&s->decoded_buffer, &s->decoded_size, decoded_buffer_size);
     if (!s->decoded_buffer)
         return AVERROR(ENOMEM);
     memset(s->decoded_buffer, 0, s->decoded_size);
@@ -1525,10 +1505,8 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = blockstodecode;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     s->error=0;
 
@@ -1572,7 +1550,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return (s->samples == 0) ? avpkt->size : 0;
+    return !s->samples ? avpkt->size : 0;
 }
 
 static void ape_flush(AVCodecContext *avctx)
diff --git a/libavcodec/apng.h b/libavcodec/apng.h
new file mode 100644
index 0000000..41249e0
--- /dev/null
+++ b/libavcodec/apng.h
@@ -0,0 +1,41 @@
+/*
+ * APNG common header
+ * Copyright (c) 2014 Benoit Fouet
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * APNG common header
+ */
+
+#ifndef AVCODEC_APNG_H
+#define AVCODEC_APNG_H
+
+enum {
+   APNG_DISPOSE_OP_NONE       = 0,
+   APNG_DISPOSE_OP_BACKGROUND = 1,
+   APNG_DISPOSE_OP_PREVIOUS   = 2,
+};
+
+enum {
+    APNG_BLEND_OP_SOURCE = 0,
+    APNG_BLEND_OP_OVER   = 1,
+};
+
+#endif /* AVCODEC_APNG_H */
diff --git a/libavcodec/aptx.c b/libavcodec/aptx.c
new file mode 100644
index 0000000..8750d84
--- /dev/null
+++ b/libavcodec/aptx.c
@@ -0,0 +1,1162 @@
+/*
+ * Audio Processing Technology codec for Bluetooth (aptX)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "mathops.h"
+#include "audio_frame_queue.h"
+
+
+enum channels {
+    LEFT,
+    RIGHT,
+    NB_CHANNELS
+};
+
+enum subbands {
+    LF,  // Low Frequency (0-5.5 kHz)
+    MLF, // Medium-Low Frequency (5.5-11kHz)
+    MHF, // Medium-High Frequency (11-16.5kHz)
+    HF,  // High Frequency (16.5-22kHz)
+    NB_SUBBANDS
+};
+
+#define NB_FILTERS 2
+#define FILTER_TAPS 16
+
+typedef struct {
+    int pos;
+    int32_t buffer[2*FILTER_TAPS];
+} FilterSignal;
+
+typedef struct {
+    FilterSignal outer_filter_signal[NB_FILTERS];
+    FilterSignal inner_filter_signal[NB_FILTERS][NB_FILTERS];
+} QMFAnalysis;
+
+typedef struct {
+    int32_t quantized_sample;
+    int32_t quantized_sample_parity_change;
+    int32_t error;
+} Quantize;
+
+typedef struct {
+    int32_t quantization_factor;
+    int32_t factor_select;
+    int32_t reconstructed_difference;
+} InvertQuantize;
+
+typedef struct {
+    int32_t prev_sign[2];
+    int32_t s_weight[2];
+    int32_t d_weight[24];
+    int32_t pos;
+    int32_t reconstructed_differences[48];
+    int32_t previous_reconstructed_sample;
+    int32_t predicted_difference;
+    int32_t predicted_sample;
+} Prediction;
+
+typedef struct {
+    int32_t codeword_history;
+    int32_t dither_parity;
+    int32_t dither[NB_SUBBANDS];
+
+    QMFAnalysis qmf;
+    Quantize quantize[NB_SUBBANDS];
+    InvertQuantize invert_quantize[NB_SUBBANDS];
+    Prediction prediction[NB_SUBBANDS];
+} Channel;
+
+typedef struct {
+    int hd;
+    int block_size;
+    int32_t sync_idx;
+    Channel channels[NB_CHANNELS];
+    AudioFrameQueue afq;
+} AptXContext;
+
+
+static const int32_t quantize_intervals_LF[65] = {
+      -9948,    9948,   29860,   49808,   69822,   89926,  110144,  130502,
+     151026,  171738,  192666,  213832,  235264,  256982,  279014,  301384,
+     324118,  347244,  370790,  394782,  419250,  444226,  469742,  495832,
+     522536,  549890,  577936,  606720,  636290,  666700,  698006,  730270,
+     763562,  797958,  833538,  870398,  908640,  948376,  989740, 1032874,
+    1077948, 1125150, 1174700, 1226850, 1281900, 1340196, 1402156, 1468282,
+    1539182, 1615610, 1698514, 1789098, 1888944, 2000168, 2125700, 2269750,
+    2438670, 2642660, 2899462, 3243240, 3746078, 4535138, 5664098, 7102424,
+    8897462,
+};
+static const int32_t invert_quantize_dither_factors_LF[65] = {
+       9948,   9948,   9962,   9988,  10026,  10078,  10142,  10218,
+      10306,  10408,  10520,  10646,  10784,  10934,  11098,  11274,
+      11462,  11664,  11880,  12112,  12358,  12618,  12898,  13194,
+      13510,  13844,  14202,  14582,  14988,  15422,  15884,  16380,
+      16912,  17484,  18098,  18762,  19480,  20258,  21106,  22030,
+      23044,  24158,  25390,  26760,  28290,  30008,  31954,  34172,
+      36728,  39700,  43202,  47382,  52462,  58762,  66770,  77280,
+      91642, 112348, 144452, 199326, 303512, 485546, 643414, 794914,
+    1000124,
+};
+static const int32_t quantize_dither_factors_LF[65] = {
+        0,     4,     7,    10,    13,    16,    19,    22,
+       26,    28,    32,    35,    38,    41,    44,    47,
+       51,    54,    58,    62,    65,    70,    74,    79,
+       84,    90,    95,   102,   109,   116,   124,   133,
+      143,   154,   166,   180,   195,   212,   231,   254,
+      279,   308,   343,   383,   430,   487,   555,   639,
+      743,   876,  1045,  1270,  1575,  2002,  2628,  3591,
+     5177,  8026, 13719, 26047, 45509, 39467, 37875, 51303,
+        0,
+};
+static const int16_t quantize_factor_select_offset_LF[65] = {
+      0, -21, -19, -17, -15, -12, -10,  -8,
+     -6,  -4,  -1,   1,   3,   6,   8,  10,
+     13,  15,  18,  20,  23,  26,  29,  31,
+     34,  37,  40,  43,  47,  50,  53,  57,
+     60,  64,  68,  72,  76,  80,  85,  89,
+     94,  99, 105, 110, 116, 123, 129, 136,
+    144, 152, 161, 171, 182, 194, 207, 223,
+    241, 263, 291, 328, 382, 467, 522, 522,
+    522,
+};
+
+
+static const int32_t quantize_intervals_MLF[9] = {
+    -89806, 89806, 278502, 494338, 759442, 1113112, 1652322, 2720256, 5190186,
+};
+static const int32_t invert_quantize_dither_factors_MLF[9] = {
+    89806, 89806, 98890, 116946, 148158, 205512, 333698, 734236, 1735696,
+};
+static const int32_t quantize_dither_factors_MLF[9] = {
+    0, 2271, 4514, 7803, 14339, 32047, 100135, 250365, 0,
+};
+static const int16_t quantize_factor_select_offset_MLF[9] = {
+    0, -14, 6, 29, 58, 96, 154, 270, 521,
+};
+
+
+static const int32_t quantize_intervals_MHF[3] = {
+    -194080, 194080, 890562,
+};
+static const int32_t invert_quantize_dither_factors_MHF[3] = {
+    194080, 194080, 502402,
+};
+static const int32_t quantize_dither_factors_MHF[3] = {
+    0, 77081, 0,
+};
+static const int16_t quantize_factor_select_offset_MHF[3] = {
+    0, -33, 136,
+};
+
+
+static const int32_t quantize_intervals_HF[5] = {
+    -163006, 163006, 542708, 1120554, 2669238,
+};
+static const int32_t invert_quantize_dither_factors_HF[5] = {
+    163006, 163006, 216698, 361148, 1187538,
+};
+static const int32_t quantize_dither_factors_HF[5] = {
+    0, 13423, 36113, 206598, 0,
+};
+static const int16_t quantize_factor_select_offset_HF[5] = {
+    0, -8, 33, 95, 262,
+};
+
+
+static const int32_t hd_quantize_intervals_LF[257] = {
+      -2436,    2436,    7308,   12180,   17054,   21930,   26806,   31686,
+      36566,   41450,   46338,   51230,   56124,   61024,   65928,   70836,
+      75750,   80670,   85598,   90530,   95470,  100418,  105372,  110336,
+     115308,  120288,  125278,  130276,  135286,  140304,  145334,  150374,
+     155426,  160490,  165566,  170654,  175756,  180870,  185998,  191138,
+     196294,  201466,  206650,  211850,  217068,  222300,  227548,  232814,
+     238096,  243396,  248714,  254050,  259406,  264778,  270172,  275584,
+     281018,  286470,  291944,  297440,  302956,  308496,  314056,  319640,
+     325248,  330878,  336532,  342212,  347916,  353644,  359398,  365178,
+     370986,  376820,  382680,  388568,  394486,  400430,  406404,  412408,
+     418442,  424506,  430600,  436726,  442884,  449074,  455298,  461554,
+     467844,  474168,  480528,  486922,  493354,  499820,  506324,  512866,
+     519446,  526064,  532722,  539420,  546160,  552940,  559760,  566624,
+     573532,  580482,  587478,  594520,  601606,  608740,  615920,  623148,
+     630426,  637754,  645132,  652560,  660042,  667576,  675164,  682808,
+     690506,  698262,  706074,  713946,  721876,  729868,  737920,  746036,
+     754216,  762460,  770770,  779148,  787594,  796108,  804694,  813354,
+     822086,  830892,  839774,  848736,  857776,  866896,  876100,  885386,
+     894758,  904218,  913766,  923406,  933138,  942964,  952886,  962908,
+     973030,  983254,  993582, 1004020, 1014566, 1025224, 1035996, 1046886,
+    1057894, 1069026, 1080284, 1091670, 1103186, 1114838, 1126628, 1138558,
+    1150634, 1162858, 1175236, 1187768, 1200462, 1213320, 1226346, 1239548,
+    1252928, 1266490, 1280242, 1294188, 1308334, 1322688, 1337252, 1352034,
+    1367044, 1382284, 1397766, 1413494, 1429478, 1445728, 1462252, 1479058,
+    1496158, 1513562, 1531280, 1549326, 1567710, 1586446, 1605550, 1625034,
+    1644914, 1665208, 1685932, 1707108, 1728754, 1750890, 1773542, 1796732,
+    1820488, 1844840, 1869816, 1895452, 1921780, 1948842, 1976680, 2005338,
+    2034868, 2065322, 2096766, 2129260, 2162880, 2197708, 2233832, 2271352,
+    2310384, 2351050, 2393498, 2437886, 2484404, 2533262, 2584710, 2639036,
+    2696578, 2757738, 2822998, 2892940, 2968278, 3049896, 3138912, 3236760,
+    3345312, 3467068, 3605434, 3765154, 3952904, 4177962, 4452178, 4787134,
+    5187290, 5647128, 6159120, 6720518, 7332904, 8000032, 8726664, 9518152,
+    10380372,
+};
+static const int32_t hd_invert_quantize_dither_factors_LF[257] = {
+      2436,   2436,   2436,   2436,   2438,   2438,   2438,   2440,
+      2442,   2442,   2444,   2446,   2448,   2450,   2454,   2456,
+      2458,   2462,   2464,   2468,   2472,   2476,   2480,   2484,
+      2488,   2492,   2498,   2502,   2506,   2512,   2518,   2524,
+      2528,   2534,   2540,   2548,   2554,   2560,   2568,   2574,
+      2582,   2588,   2596,   2604,   2612,   2620,   2628,   2636,
+      2646,   2654,   2664,   2672,   2682,   2692,   2702,   2712,
+      2722,   2732,   2742,   2752,   2764,   2774,   2786,   2798,
+      2810,   2822,   2834,   2846,   2858,   2870,   2884,   2896,
+      2910,   2924,   2938,   2952,   2966,   2980,   2994,   3010,
+      3024,   3040,   3056,   3070,   3086,   3104,   3120,   3136,
+      3154,   3170,   3188,   3206,   3224,   3242,   3262,   3280,
+      3300,   3320,   3338,   3360,   3380,   3400,   3422,   3442,
+      3464,   3486,   3508,   3532,   3554,   3578,   3602,   3626,
+      3652,   3676,   3702,   3728,   3754,   3780,   3808,   3836,
+      3864,   3892,   3920,   3950,   3980,   4010,   4042,   4074,
+      4106,   4138,   4172,   4206,   4240,   4276,   4312,   4348,
+      4384,   4422,   4460,   4500,   4540,   4580,   4622,   4664,
+      4708,   4752,   4796,   4842,   4890,   4938,   4986,   5036,
+      5086,   5138,   5192,   5246,   5300,   5358,   5416,   5474,
+      5534,   5596,   5660,   5726,   5792,   5860,   5930,   6002,
+      6074,   6150,   6226,   6306,   6388,   6470,   6556,   6644,
+      6736,   6828,   6924,   7022,   7124,   7228,   7336,   7448,
+      7562,   7680,   7802,   7928,   8058,   8192,   8332,   8476,
+      8624,   8780,   8940,   9106,   9278,   9458,   9644,   9840,
+     10042,  10252,  10472,  10702,  10942,  11194,  11458,  11734,
+     12024,  12328,  12648,  12986,  13342,  13720,  14118,  14540,
+     14990,  15466,  15976,  16520,  17102,  17726,  18398,  19124,
+     19908,  20760,  21688,  22702,  23816,  25044,  26404,  27922,
+     29622,  31540,  33720,  36222,  39116,  42502,  46514,  51334,
+     57218,  64536,  73830,  85890, 101860, 123198, 151020, 183936,
+    216220, 243618, 268374, 293022, 319362, 347768, 378864, 412626, 449596,
+};
+static const int32_t hd_quantize_dither_factors_LF[256] = {
+       0,    0,    0,    1,    0,    0,    1,    1,
+       0,    1,    1,    1,    1,    1,    1,    1,
+       1,    1,    1,    1,    1,    1,    1,    1,
+       1,    2,    1,    1,    2,    2,    2,    1,
+       2,    2,    2,    2,    2,    2,    2,    2,
+       2,    2,    2,    2,    2,    2,    2,    3,
+       2,    3,    2,    3,    3,    3,    3,    3,
+       3,    3,    3,    3,    3,    3,    3,    3,
+       3,    3,    3,    3,    3,    4,    3,    4,
+       4,    4,    4,    4,    4,    4,    4,    4,
+       4,    4,    4,    4,    5,    4,    4,    5,
+       4,    5,    5,    5,    5,    5,    5,    5,
+       5,    5,    6,    5,    5,    6,    5,    6,
+       6,    6,    6,    6,    6,    6,    6,    7,
+       6,    7,    7,    7,    7,    7,    7,    7,
+       7,    7,    8,    8,    8,    8,    8,    8,
+       8,    9,    9,    9,    9,    9,    9,    9,
+      10,   10,   10,   10,   10,   11,   11,   11,
+      11,   11,   12,   12,   12,   12,   13,   13,
+      13,   14,   14,   14,   15,   15,   15,   15,
+      16,   16,   17,   17,   17,   18,   18,   18,
+      19,   19,   20,   21,   21,   22,   22,   23,
+      23,   24,   25,   26,   26,   27,   28,   29,
+      30,   31,   32,   33,   34,   35,   36,   37,
+      39,   40,   42,   43,   45,   47,   49,   51,
+      53,   55,   58,   60,   63,   66,   69,   73,
+      76,   80,   85,   89,   95,  100,  106,  113,
+     119,  128,  136,  146,  156,  168,  182,  196,
+     213,  232,  254,  279,  307,  340,  380,  425,
+     480,  545,  626,  724,  847, 1003, 1205, 1471,
+    1830, 2324, 3015, 3993, 5335, 6956, 8229, 8071,
+    6850, 6189, 6162, 6585, 7102, 7774, 8441, 9243,
+};
+static const int16_t hd_quantize_factor_select_offset_LF[257] = {
+      0, -22, -21, -21, -20, -20, -19, -19,
+    -18, -18, -17, -17, -16, -16, -15, -14,
+    -14, -13, -13, -12, -12, -11, -11, -10,
+    -10,  -9,  -9,  -8,  -7,  -7,  -6,  -6,
+     -5,  -5,  -4,  -4,  -3,  -3,  -2,  -1,
+     -1,   0,   0,   1,   1,   2,   2,   3,
+      4,   4,   5,   5,   6,   6,   7,   8,
+      8,   9,   9,  10,  11,  11,  12,  12,
+     13,  14,  14,  15,  15,  16,  17,  17,
+     18,  19,  19,  20,  20,  21,  22,  22,
+     23,  24,  24,  25,  26,  26,  27,  28,
+     28,  29,  30,  30,  31,  32,  33,  33,
+     34,  35,  35,  36,  37,  38,  38,  39,
+     40,  41,  41,  42,  43,  44,  44,  45,
+     46,  47,  48,  48,  49,  50,  51,  52,
+     52,  53,  54,  55,  56,  57,  58,  58,
+     59,  60,  61,  62,  63,  64,  65,  66,
+     67,  68,  69,  69,  70,  71,  72,  73,
+     74,  75,  77,  78,  79,  80,  81,  82,
+     83,  84,  85,  86,  87,  89,  90,  91,
+     92,  93,  94,  96,  97,  98,  99, 101,
+    102, 103, 105, 106, 107, 109, 110, 112,
+    113, 115, 116, 118, 119, 121, 122, 124,
+    125, 127, 129, 130, 132, 134, 136, 137,
+    139, 141, 143, 145, 147, 149, 151, 153,
+    155, 158, 160, 162, 164, 167, 169, 172,
+    174, 177, 180, 182, 185, 188, 191, 194,
+    197, 201, 204, 208, 211, 215, 219, 223,
+    227, 232, 236, 241, 246, 251, 257, 263,
+    269, 275, 283, 290, 298, 307, 317, 327,
+    339, 352, 367, 384, 404, 429, 458, 494,
+    522, 522, 522, 522, 522, 522, 522, 522, 522,
+};
+
+
+static const int32_t hd_quantize_intervals_MLF[33] = {
+      -21236,   21236,   63830,  106798,  150386,  194832,  240376,  287258,
+      335726,  386034,  438460,  493308,  550924,  611696,  676082,  744626,
+      817986,  896968,  982580, 1076118, 1179278, 1294344, 1424504, 1574386,
+     1751090, 1966260, 2240868, 2617662, 3196432, 4176450, 5658260, 7671068,
+    10380372,
+};
+static const int32_t hd_invert_quantize_dither_factors_MLF[33] = {
+    21236,  21236,  21360,  21608,  21978,  22468,  23076,   23806,
+    24660,  25648,  26778,  28070,  29544,  31228,  33158,   35386,
+    37974,  41008,  44606,  48934,  54226,  60840,  69320,   80564,
+    96140, 119032, 155576, 221218, 357552, 622468, 859344, 1153464, 1555840,
+};
+static const int32_t hd_quantize_dither_factors_MLF[32] = {
+       0,   31,    62,    93,   123,   152,   183,    214,
+     247,  283,   323,   369,   421,   483,   557,    647,
+     759,  900,  1082,  1323,  1654,  2120,  2811,   3894,
+    5723, 9136, 16411, 34084, 66229, 59219, 73530, 100594,
+};
+static const int16_t hd_quantize_factor_select_offset_MLF[33] = {
+      0, -21, -16, -12,  -7,  -2,   3,   8,
+     13,  19,  24,  30,  36,  43,  50,  57,
+     65,  74,  83,  93, 104, 117, 131, 147,
+    166, 189, 219, 259, 322, 427, 521, 521, 521,
+};
+
+
+static const int32_t hd_quantize_intervals_MHF[9] = {
+    -95044, 95044, 295844, 528780, 821332, 1226438, 1890540, 3344850, 6450664,
+};
+static const int32_t hd_invert_quantize_dither_factors_MHF[9] = {
+    95044, 95044, 105754, 127180, 165372, 39736, 424366, 1029946, 2075866,
+};
+static const int32_t hd_quantize_dither_factors_MHF[8] = {
+    0, 2678, 5357, 9548, -31409, 96158, 151395, 261480,
+};
+static const int16_t hd_quantize_factor_select_offset_MHF[9] = {
+    0, -17, 5, 30, 62, 105, 177, 334, 518,
+};
+
+
+static const int32_t hd_quantize_intervals_HF[17] = {
+     -45754,   45754,  138496,  234896,  337336,  448310,  570738,  708380,
+     866534, 1053262, 1281958, 1577438, 1993050, 2665984, 3900982, 5902844,
+    8897462,
+};
+static const int32_t hd_invert_quantize_dither_factors_HF[17] = {
+    45754,  45754,  46988,  49412,  53026,  57950,  64478,   73164,
+    84988, 101740, 126958, 168522, 247092, 425842, 809154, 1192708, 1801910,
+};
+static const int32_t hd_quantize_dither_factors_HF[16] = {
+       0,  309,   606,   904,  1231,  1632,  2172,   2956,
+    4188, 6305, 10391, 19643, 44688, 95828, 95889, 152301,
+};
+static const int16_t hd_quantize_factor_select_offset_HF[17] = {
+     0, -18,  -8,   2,  13,  25,  38,  53,
+    70,  90, 115, 147, 192, 264, 398, 521, 521,
+};
+
+typedef const struct {
+    const int32_t *quantize_intervals;
+    const int32_t *invert_quantize_dither_factors;
+    const int32_t *quantize_dither_factors;
+    const int16_t *quantize_factor_select_offset;
+    int tables_size;
+    int32_t factor_max;
+    int32_t prediction_order;
+} ConstTables;
+
+static ConstTables tables[2][NB_SUBBANDS] = {
+    {
+        [LF]  = { quantize_intervals_LF,
+                  invert_quantize_dither_factors_LF,
+                  quantize_dither_factors_LF,
+                  quantize_factor_select_offset_LF,
+                  FF_ARRAY_ELEMS(quantize_intervals_LF),
+                  0x11FF, 24 },
+        [MLF] = { quantize_intervals_MLF,
+                  invert_quantize_dither_factors_MLF,
+                  quantize_dither_factors_MLF,
+                  quantize_factor_select_offset_MLF,
+                  FF_ARRAY_ELEMS(quantize_intervals_MLF),
+                  0x14FF, 12 },
+        [MHF] = { quantize_intervals_MHF,
+                  invert_quantize_dither_factors_MHF,
+                  quantize_dither_factors_MHF,
+                  quantize_factor_select_offset_MHF,
+                  FF_ARRAY_ELEMS(quantize_intervals_MHF),
+                  0x16FF, 6 },
+        [HF]  = { quantize_intervals_HF,
+                  invert_quantize_dither_factors_HF,
+                  quantize_dither_factors_HF,
+                  quantize_factor_select_offset_HF,
+                  FF_ARRAY_ELEMS(quantize_intervals_HF),
+                  0x15FF, 12 },
+    },
+    {
+        [LF]  = { hd_quantize_intervals_LF,
+                  hd_invert_quantize_dither_factors_LF,
+                  hd_quantize_dither_factors_LF,
+                  hd_quantize_factor_select_offset_LF,
+                  FF_ARRAY_ELEMS(hd_quantize_intervals_LF),
+                  0x11FF, 24 },
+        [MLF] = { hd_quantize_intervals_MLF,
+                  hd_invert_quantize_dither_factors_MLF,
+                  hd_quantize_dither_factors_MLF,
+                  hd_quantize_factor_select_offset_MLF,
+                  FF_ARRAY_ELEMS(hd_quantize_intervals_MLF),
+                  0x14FF, 12 },
+        [MHF] = { hd_quantize_intervals_MHF,
+                  hd_invert_quantize_dither_factors_MHF,
+                  hd_quantize_dither_factors_MHF,
+                  hd_quantize_factor_select_offset_MHF,
+                  FF_ARRAY_ELEMS(hd_quantize_intervals_MHF),
+                  0x16FF, 6 },
+        [HF]  = { hd_quantize_intervals_HF,
+                  hd_invert_quantize_dither_factors_HF,
+                  hd_quantize_dither_factors_HF,
+                  hd_quantize_factor_select_offset_HF,
+                  FF_ARRAY_ELEMS(hd_quantize_intervals_HF),
+                  0x15FF, 12 },
+    }
+};
+
+static const int16_t quantization_factors[32] = {
+    2048, 2093, 2139, 2186, 2233, 2282, 2332, 2383,
+    2435, 2489, 2543, 2599, 2656, 2714, 2774, 2834,
+    2896, 2960, 3025, 3091, 3158, 3228, 3298, 3371,
+    3444, 3520, 3597, 3676, 3756, 3838, 3922, 4008,
+};
+
+
+/* Rounded right shift with optionnal clipping */
+#define RSHIFT_SIZE(size)                                                     \
+av_always_inline                                                              \
+static int##size##_t rshift##size(int##size##_t value, int shift)             \
+{                                                                             \
+    int##size##_t rounding = (int##size##_t)1 << (shift - 1);                 \
+    int##size##_t mask = ((int##size##_t)1 << (shift + 1)) - 1;               \
+    return ((value + rounding) >> shift) - ((value & mask) == rounding);      \
+}                                                                             \
+av_always_inline                                                              \
+static int##size##_t rshift##size##_clip24(int##size##_t value, int shift)    \
+{                                                                             \
+    return av_clip_intp2(rshift##size(value, shift), 23);                     \
+}
+RSHIFT_SIZE(32)
+RSHIFT_SIZE(64)
+
+
+av_always_inline
+static void aptx_update_codeword_history(Channel *channel)
+{
+    int32_t cw = ((channel->quantize[0].quantized_sample & 3) << 0) +
+                 ((channel->quantize[1].quantized_sample & 2) << 1) +
+                 ((channel->quantize[2].quantized_sample & 1) << 3);
+    channel->codeword_history = (cw << 8) + (channel->codeword_history << 4);
+}
+
+static void aptx_generate_dither(Channel *channel)
+{
+    int subband;
+    int64_t m;
+    int32_t d;
+
+    aptx_update_codeword_history(channel);
+
+    m = (int64_t)5184443 * (channel->codeword_history >> 7);
+    d = (m << 2) + (m >> 22);
+    for (subband = 0; subband < NB_SUBBANDS; subband++)
+        channel->dither[subband] = d << (23 - 5*subband);
+    channel->dither_parity = (d >> 25) & 1;
+}
+
+/*
+ * Convolution filter coefficients for the outer QMF of the QMF tree.
+ * The 2 sets are a mirror of each other.
+ */
+static const int32_t aptx_qmf_outer_coeffs[NB_FILTERS][FILTER_TAPS] = {
+    {
+        730, -413, -9611, 43626, -121026, 269973, -585547, 2801966,
+        697128, -160481, 27611, 8478, -10043, 3511, 688, -897,
+    },
+    {
+        -897, 688, 3511, -10043, 8478, 27611, -160481, 697128,
+        2801966, -585547, 269973, -121026, 43626, -9611, -413, 730,
+    },
+};
+
+/*
+ * Convolution filter coefficients for the inner QMF of the QMF tree.
+ * The 2 sets are a mirror of each other.
+ */
+static const int32_t aptx_qmf_inner_coeffs[NB_FILTERS][FILTER_TAPS] = {
+    {
+       1033, -584, -13592, 61697, -171156, 381799, -828088, 3962579,
+       985888, -226954, 39048, 11990, -14203, 4966, 973, -1268,
+    },
+    {
+      -1268, 973, 4966, -14203, 11990, 39048, -226954, 985888,
+      3962579, -828088, 381799, -171156, 61697, -13592, -584, 1033,
+    },
+};
+
+/*
+ * Push one sample into a circular signal buffer.
+ */
+av_always_inline
+static void aptx_qmf_filter_signal_push(FilterSignal *signal, int32_t sample)
+{
+    signal->buffer[signal->pos            ] = sample;
+    signal->buffer[signal->pos+FILTER_TAPS] = sample;
+    signal->pos = (signal->pos + 1) & (FILTER_TAPS - 1);
+}
+
+/*
+ * Compute the convolution of the signal with the coefficients, and reduce
+ * to 24 bits by applying the specified right shifting.
+ */
+av_always_inline
+static int32_t aptx_qmf_convolution(FilterSignal *signal,
+                                    const int32_t coeffs[FILTER_TAPS],
+                                    int shift)
+{
+    int32_t *sig = &signal->buffer[signal->pos];
+    int64_t e = 0;
+    int i;
+
+    for (i = 0; i < FILTER_TAPS; i++)
+        e += MUL64(sig[i], coeffs[i]);
+
+    return rshift64_clip24(e, shift);
+}
+
+/*
+ * Half-band QMF analysis filter realized with a polyphase FIR filter.
+ * Split into 2 subbands and downsample by 2.
+ * So for each pair of samples that goes in, one sample goes out,
+ * split into 2 separate subbands.
+ */
+av_always_inline
+static void aptx_qmf_polyphase_analysis(FilterSignal signal[NB_FILTERS],
+                                        const int32_t coeffs[NB_FILTERS][FILTER_TAPS],
+                                        int shift,
+                                        int32_t samples[NB_FILTERS],
+                                        int32_t *low_subband_output,
+                                        int32_t *high_subband_output)
+{
+    int32_t subbands[NB_FILTERS];
+    int i;
+
+    for (i = 0; i < NB_FILTERS; i++) {
+        aptx_qmf_filter_signal_push(&signal[i], samples[NB_FILTERS-1-i]);
+        subbands[i] = aptx_qmf_convolution(&signal[i], coeffs[i], shift);
+    }
+
+    *low_subband_output  = av_clip_intp2(subbands[0] + subbands[1], 23);
+    *high_subband_output = av_clip_intp2(subbands[0] - subbands[1], 23);
+}
+
+/*
+ * Two stage QMF analysis tree.
+ * Split 4 input samples into 4 subbands and downsample by 4.
+ * So for each group of 4 samples that goes in, one sample goes out,
+ * split into 4 separate subbands.
+ */
+static void aptx_qmf_tree_analysis(QMFAnalysis *qmf,
+                                   int32_t samples[4],
+                                   int32_t subband_samples[4])
+{
+    int32_t intermediate_samples[4];
+    int i;
+
+    /* Split 4 input samples into 2 intermediate subbands downsampled to 2 samples */
+    for (i = 0; i < 2; i++)
+        aptx_qmf_polyphase_analysis(qmf->outer_filter_signal,
+                                    aptx_qmf_outer_coeffs, 23,
+                                    &samples[2*i],
+                                    &intermediate_samples[0+i],
+                                    &intermediate_samples[2+i]);
+
+    /* Split 2 intermediate subband samples into 4 final subbands downsampled to 1 sample */
+    for (i = 0; i < 2; i++)
+        aptx_qmf_polyphase_analysis(qmf->inner_filter_signal[i],
+                                    aptx_qmf_inner_coeffs, 23,
+                                    &intermediate_samples[2*i],
+                                    &subband_samples[2*i+0],
+                                    &subband_samples[2*i+1]);
+}
+
+/*
+ * Half-band QMF synthesis filter realized with a polyphase FIR filter.
+ * Join 2 subbands and upsample by 2.
+ * So for each 2 subbands sample that goes in, a pair of samples goes out.
+ */
+av_always_inline
+static void aptx_qmf_polyphase_synthesis(FilterSignal signal[NB_FILTERS],
+                                         const int32_t coeffs[NB_FILTERS][FILTER_TAPS],
+                                         int shift,
+                                         int32_t low_subband_input,
+                                         int32_t high_subband_input,
+                                         int32_t samples[NB_FILTERS])
+{
+    int32_t subbands[NB_FILTERS];
+    int i;
+
+    subbands[0] = low_subband_input + high_subband_input;
+    subbands[1] = low_subband_input - high_subband_input;
+
+    for (i = 0; i < NB_FILTERS; i++) {
+        aptx_qmf_filter_signal_push(&signal[i], subbands[1-i]);
+        samples[i] = aptx_qmf_convolution(&signal[i], coeffs[i], shift);
+    }
+}
+
+/*
+ * Two stage QMF synthesis tree.
+ * Join 4 subbands and upsample by 4.
+ * So for each 4 subbands sample that goes in, a group of 4 samples goes out.
+ */
+static void aptx_qmf_tree_synthesis(QMFAnalysis *qmf,
+                                    int32_t subband_samples[4],
+                                    int32_t samples[4])
+{
+    int32_t intermediate_samples[4];
+    int i;
+
+    /* Join 4 subbands into 2 intermediate subbands upsampled to 2 samples. */
+    for (i = 0; i < 2; i++)
+        aptx_qmf_polyphase_synthesis(qmf->inner_filter_signal[i],
+                                     aptx_qmf_inner_coeffs, 22,
+                                     subband_samples[2*i+0],
+                                     subband_samples[2*i+1],
+                                     &intermediate_samples[2*i]);
+
+    /* Join 2 samples from intermediate subbands upsampled to 4 samples. */
+    for (i = 0; i < 2; i++)
+        aptx_qmf_polyphase_synthesis(qmf->outer_filter_signal,
+                                     aptx_qmf_outer_coeffs, 21,
+                                     intermediate_samples[0+i],
+                                     intermediate_samples[2+i],
+                                     &samples[2*i]);
+}
+
+
+av_always_inline
+static int32_t aptx_bin_search(int32_t value, int32_t factor,
+                               const int32_t *intervals, int32_t nb_intervals)
+{
+    int32_t idx = 0;
+    int i;
+
+    for (i = nb_intervals >> 1; i > 0; i >>= 1)
+        if (MUL64(factor, intervals[idx + i]) <= ((int64_t)value << 24))
+            idx += i;
+
+    return idx;
+}
+
+static void aptx_quantize_difference(Quantize *quantize,
+                                     int32_t sample_difference,
+                                     int32_t dither,
+                                     int32_t quantization_factor,
+                                     ConstTables *tables)
+{
+    const int32_t *intervals = tables->quantize_intervals;
+    int32_t quantized_sample, dithered_sample, parity_change;
+    int32_t d, mean, interval, inv, sample_difference_abs;
+    int64_t error;
+
+    sample_difference_abs = FFABS(sample_difference);
+    sample_difference_abs = FFMIN(sample_difference_abs, (1 << 23) - 1);
+
+    quantized_sample = aptx_bin_search(sample_difference_abs >> 4,
+                                       quantization_factor,
+                                       intervals, tables->tables_size);
+
+    d = rshift32_clip24(MULH(dither, dither), 7) - (1 << 23);
+    d = rshift64(MUL64(d, tables->quantize_dither_factors[quantized_sample]), 23);
+
+    intervals += quantized_sample;
+    mean = (intervals[1] + intervals[0]) / 2;
+    interval = (intervals[1] - intervals[0]) * (-(sample_difference < 0) | 1);
+
+    dithered_sample = rshift64_clip24(MUL64(dither, interval) + ((int64_t)av_clip_intp2(mean + d, 23) << 32), 32);
+    error = ((int64_t)sample_difference_abs << 20) - MUL64(dithered_sample, quantization_factor);
+    quantize->error = FFABS(rshift64(error, 23));
+
+    parity_change = quantized_sample;
+    if (error < 0)
+        quantized_sample--;
+    else
+        parity_change--;
+
+    inv = -(sample_difference < 0);
+    quantize->quantized_sample               = quantized_sample ^ inv;
+    quantize->quantized_sample_parity_change = parity_change    ^ inv;
+}
+
+static void aptx_encode_channel(Channel *channel, int32_t samples[4], int hd)
+{
+    int32_t subband_samples[4];
+    int subband;
+    aptx_qmf_tree_analysis(&channel->qmf, samples, subband_samples);
+    aptx_generate_dither(channel);
+    for (subband = 0; subband < NB_SUBBANDS; subband++) {
+        int32_t diff = av_clip_intp2(subband_samples[subband] - channel->prediction[subband].predicted_sample, 23);
+        aptx_quantize_difference(&channel->quantize[subband], diff,
+                                 channel->dither[subband],
+                                 channel->invert_quantize[subband].quantization_factor,
+                                 &tables[hd][subband]);
+    }
+}
+
+static void aptx_decode_channel(Channel *channel, int32_t samples[4])
+{
+    int32_t subband_samples[4];
+    int subband;
+    for (subband = 0; subband < NB_SUBBANDS; subband++)
+        subband_samples[subband] = channel->prediction[subband].previous_reconstructed_sample;
+    aptx_qmf_tree_synthesis(&channel->qmf, subband_samples, samples);
+}
+
+
+static void aptx_invert_quantization(InvertQuantize *invert_quantize,
+                                     int32_t quantized_sample, int32_t dither,
+                                     ConstTables *tables)
+{
+    int32_t qr, idx, shift, factor_select;
+
+    idx = (quantized_sample ^ -(quantized_sample < 0)) + 1;
+    qr = tables->quantize_intervals[idx] / 2;
+    if (quantized_sample < 0)
+        qr = -qr;
+
+    qr = rshift64_clip24(((int64_t)qr<<32) + MUL64(dither, tables->invert_quantize_dither_factors[idx]), 32);
+    invert_quantize->reconstructed_difference = MUL64(invert_quantize->quantization_factor, qr) >> 19;
+
+    /* update factor_select */
+    factor_select = 32620 * invert_quantize->factor_select;
+    factor_select = rshift32(factor_select + (tables->quantize_factor_select_offset[idx] << 15), 15);
+    invert_quantize->factor_select = av_clip(factor_select, 0, tables->factor_max);
+
+    /* update quantization factor */
+    idx = (invert_quantize->factor_select & 0xFF) >> 3;
+    shift = (tables->factor_max - invert_quantize->factor_select) >> 8;
+    invert_quantize->quantization_factor = (quantization_factors[idx] << 11) >> shift;
+}
+
+static int32_t *aptx_reconstructed_differences_update(Prediction *prediction,
+                                                      int32_t reconstructed_difference,
+                                                      int order)
+{
+    int32_t *rd1 = prediction->reconstructed_differences, *rd2 = rd1 + order;
+    int p = prediction->pos;
+
+    rd1[p] = rd2[p];
+    prediction->pos = p = (p + 1) % order;
+    rd2[p] = reconstructed_difference;
+    return &rd2[p];
+}
+
+static void aptx_prediction_filtering(Prediction *prediction,
+                                      int32_t reconstructed_difference,
+                                      int order)
+{
+    int32_t reconstructed_sample, predictor, srd0;
+    int32_t *reconstructed_differences;
+    int64_t predicted_difference = 0;
+    int i;
+
+    reconstructed_sample = av_clip_intp2(reconstructed_difference + prediction->predicted_sample, 23);
+    predictor = av_clip_intp2((MUL64(prediction->s_weight[0], prediction->previous_reconstructed_sample)
+                             + MUL64(prediction->s_weight[1], reconstructed_sample)) >> 22, 23);
+    prediction->previous_reconstructed_sample = reconstructed_sample;
+
+    reconstructed_differences = aptx_reconstructed_differences_update(prediction, reconstructed_difference, order);
+    srd0 = FFDIFFSIGN(reconstructed_difference, 0) << 23;
+    for (i = 0; i < order; i++) {
+        int32_t srd = FF_SIGNBIT(reconstructed_differences[-i-1]) | 1;
+        prediction->d_weight[i] -= rshift32(prediction->d_weight[i] - srd*srd0, 8);
+        predicted_difference += MUL64(reconstructed_differences[-i], prediction->d_weight[i]);
+    }
+
+    prediction->predicted_difference = av_clip_intp2(predicted_difference >> 22, 23);
+    prediction->predicted_sample = av_clip_intp2(predictor + prediction->predicted_difference, 23);
+}
+
+static void aptx_process_subband(InvertQuantize *invert_quantize,
+                                 Prediction *prediction,
+                                 int32_t quantized_sample, int32_t dither,
+                                 ConstTables *tables)
+{
+    int32_t sign, same_sign[2], weight[2], sw1, range;
+
+    aptx_invert_quantization(invert_quantize, quantized_sample, dither, tables);
+
+    sign = FFDIFFSIGN(invert_quantize->reconstructed_difference,
+                      -prediction->predicted_difference);
+    same_sign[0] = sign * prediction->prev_sign[0];
+    same_sign[1] = sign * prediction->prev_sign[1];
+    prediction->prev_sign[0] = prediction->prev_sign[1];
+    prediction->prev_sign[1] = sign | 1;
+
+    range = 0x100000;
+    sw1 = rshift32(-same_sign[1] * prediction->s_weight[1], 1);
+    sw1 = (av_clip(sw1, -range, range) & ~0xF) << 4;
+
+    range = 0x300000;
+    weight[0] = 254 * prediction->s_weight[0] + 0x800000*same_sign[0] + sw1;
+    prediction->s_weight[0] = av_clip(rshift32(weight[0], 8), -range, range);
+
+    range = 0x3C0000 - prediction->s_weight[0];
+    weight[1] = 255 * prediction->s_weight[1] + 0xC00000*same_sign[1];
+    prediction->s_weight[1] = av_clip(rshift32(weight[1], 8), -range, range);
+
+    aptx_prediction_filtering(prediction,
+                              invert_quantize->reconstructed_difference,
+                              tables->prediction_order);
+}
+
+static void aptx_invert_quantize_and_prediction(Channel *channel, int hd)
+{
+    int subband;
+    for (subband = 0; subband < NB_SUBBANDS; subband++)
+        aptx_process_subband(&channel->invert_quantize[subband],
+                             &channel->prediction[subband],
+                             channel->quantize[subband].quantized_sample,
+                             channel->dither[subband],
+                             &tables[hd][subband]);
+}
+
+static int32_t aptx_quantized_parity(Channel *channel)
+{
+    int32_t parity = channel->dither_parity;
+    int subband;
+
+    for (subband = 0; subband < NB_SUBBANDS; subband++)
+        parity ^= channel->quantize[subband].quantized_sample;
+
+    return parity & 1;
+}
+
+/* For each sample, ensure that the parity of all subbands of all channels
+ * is 0 except once every 8 samples where the parity is forced to 1. */
+static int aptx_check_parity(Channel channels[NB_CHANNELS], int32_t *idx)
+{
+    int32_t parity = aptx_quantized_parity(&channels[LEFT])
+                   ^ aptx_quantized_parity(&channels[RIGHT]);
+
+    int eighth = *idx == 7;
+    *idx = (*idx + 1) & 7;
+
+    return parity ^ eighth;
+}
+
+static void aptx_insert_sync(Channel channels[NB_CHANNELS], int32_t *idx)
+{
+    if (aptx_check_parity(channels, idx)) {
+        int i;
+        Channel *c;
+        static const int map[] = { 1, 2, 0, 3 };
+        Quantize *min = &channels[NB_CHANNELS-1].quantize[map[0]];
+        for (c = &channels[NB_CHANNELS-1]; c >= channels; c--)
+            for (i = 0; i < NB_SUBBANDS; i++)
+                if (c->quantize[map[i]].error < min->error)
+                    min = &c->quantize[map[i]];
+
+        /* Forcing the desired parity is done by offsetting by 1 the quantized
+         * sample from the subband featuring the smallest quantization error. */
+        min->quantized_sample = min->quantized_sample_parity_change;
+    }
+}
+
+static uint16_t aptx_pack_codeword(Channel *channel)
+{
+    int32_t parity = aptx_quantized_parity(channel);
+    return (((channel->quantize[3].quantized_sample & 0x06) | parity) << 13)
+         | (((channel->quantize[2].quantized_sample & 0x03)         ) << 11)
+         | (((channel->quantize[1].quantized_sample & 0x0F)         ) <<  7)
+         | (((channel->quantize[0].quantized_sample & 0x7F)         ) <<  0);
+}
+
+static uint32_t aptxhd_pack_codeword(Channel *channel)
+{
+    int32_t parity = aptx_quantized_parity(channel);
+    return (((channel->quantize[3].quantized_sample & 0x01E) | parity) << 19)
+         | (((channel->quantize[2].quantized_sample & 0x00F)         ) << 15)
+         | (((channel->quantize[1].quantized_sample & 0x03F)         ) <<  9)
+         | (((channel->quantize[0].quantized_sample & 0x1FF)         ) <<  0);
+}
+
+static void aptx_unpack_codeword(Channel *channel, uint16_t codeword)
+{
+    channel->quantize[0].quantized_sample = sign_extend(codeword >>  0, 7);
+    channel->quantize[1].quantized_sample = sign_extend(codeword >>  7, 4);
+    channel->quantize[2].quantized_sample = sign_extend(codeword >> 11, 2);
+    channel->quantize[3].quantized_sample = sign_extend(codeword >> 13, 3);
+    channel->quantize[3].quantized_sample = (channel->quantize[3].quantized_sample & ~1)
+                                          | aptx_quantized_parity(channel);
+}
+
+static void aptxhd_unpack_codeword(Channel *channel, uint32_t codeword)
+{
+    channel->quantize[0].quantized_sample = sign_extend(codeword >>  0, 9);
+    channel->quantize[1].quantized_sample = sign_extend(codeword >>  9, 6);
+    channel->quantize[2].quantized_sample = sign_extend(codeword >> 15, 4);
+    channel->quantize[3].quantized_sample = sign_extend(codeword >> 19, 5);
+    channel->quantize[3].quantized_sample = (channel->quantize[3].quantized_sample & ~1)
+                                          | aptx_quantized_parity(channel);
+}
+
+static void aptx_encode_samples(AptXContext *ctx,
+                                int32_t samples[NB_CHANNELS][4],
+                                uint8_t *output)
+{
+    int channel;
+    for (channel = 0; channel < NB_CHANNELS; channel++)
+        aptx_encode_channel(&ctx->channels[channel], samples[channel], ctx->hd);
+
+    aptx_insert_sync(ctx->channels, &ctx->sync_idx);
+
+    for (channel = 0; channel < NB_CHANNELS; channel++) {
+        aptx_invert_quantize_and_prediction(&ctx->channels[channel], ctx->hd);
+        if (ctx->hd)
+            AV_WB24(output + 3*channel,
+                    aptxhd_pack_codeword(&ctx->channels[channel]));
+        else
+            AV_WB16(output + 2*channel,
+                    aptx_pack_codeword(&ctx->channels[channel]));
+    }
+}
+
+static int aptx_decode_samples(AptXContext *ctx,
+                                const uint8_t *input,
+                                int32_t samples[NB_CHANNELS][4])
+{
+    int channel, ret;
+
+    for (channel = 0; channel < NB_CHANNELS; channel++) {
+        aptx_generate_dither(&ctx->channels[channel]);
+
+        if (ctx->hd)
+            aptxhd_unpack_codeword(&ctx->channels[channel],
+                                   AV_RB24(input + 3*channel));
+        else
+            aptx_unpack_codeword(&ctx->channels[channel],
+                                 AV_RB16(input + 2*channel));
+        aptx_invert_quantize_and_prediction(&ctx->channels[channel], ctx->hd);
+    }
+
+    ret = aptx_check_parity(ctx->channels, &ctx->sync_idx);
+
+    for (channel = 0; channel < NB_CHANNELS; channel++)
+        aptx_decode_channel(&ctx->channels[channel], samples[channel]);
+
+    return ret;
+}
+
+
+static av_cold int aptx_init(AVCodecContext *avctx)
+{
+    AptXContext *s = avctx->priv_data;
+    int chan, subband;
+
+    s->hd = avctx->codec->id == AV_CODEC_ID_APTX_HD;
+    s->block_size = s->hd ? 6 : 4;
+
+    if (avctx->frame_size == 0)
+        avctx->frame_size = 256 * s->block_size;
+
+    if (avctx->frame_size % s->block_size) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Frame size must be a multiple of %d samples\n", s->block_size);
+        return AVERROR(EINVAL);
+    }
+
+    for (chan = 0; chan < NB_CHANNELS; chan++) {
+        Channel *channel = &s->channels[chan];
+        for (subband = 0; subband < NB_SUBBANDS; subband++) {
+            Prediction *prediction = &channel->prediction[subband];
+            prediction->prev_sign[0] = 1;
+            prediction->prev_sign[1] = 1;
+        }
+    }
+
+    ff_af_queue_init(avctx, &s->afq);
+    return 0;
+}
+
+static int aptx_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    AptXContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    int pos, opos, channel, sample, ret;
+
+    if (avpkt->size < s->block_size) {
+        av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* get output buffer */
+    frame->channels = NB_CHANNELS;
+    frame->format = AV_SAMPLE_FMT_S32P;
+    frame->nb_samples = 4 * avpkt->size / s->block_size;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (pos = 0, opos = 0; opos < frame->nb_samples; pos += s->block_size, opos += 4) {
+        int32_t samples[NB_CHANNELS][4];
+
+        if (aptx_decode_samples(s, &avpkt->data[pos], samples)) {
+            av_log(avctx, AV_LOG_ERROR, "Synchronization error\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        for (channel = 0; channel < NB_CHANNELS; channel++)
+            for (sample = 0; sample < 4; sample++)
+                AV_WN32A(&frame->data[channel][4*(opos+sample)],
+                         samples[channel][sample] << 8);
+    }
+
+    *got_frame_ptr = 1;
+    return s->block_size * frame->nb_samples / 4;
+}
+
+static int aptx_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                             const AVFrame *frame, int *got_packet_ptr)
+{
+    AptXContext *s = avctx->priv_data;
+    int pos, ipos, channel, sample, output_size, ret;
+
+    if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
+        return ret;
+
+    output_size = s->block_size * frame->nb_samples/4;
+    if ((ret = ff_alloc_packet2(avctx, avpkt, output_size, 0)) < 0)
+        return ret;
+
+    for (pos = 0, ipos = 0; pos < output_size; pos += s->block_size, ipos += 4) {
+        int32_t samples[NB_CHANNELS][4];
+
+        for (channel = 0; channel < NB_CHANNELS; channel++)
+            for (sample = 0; sample < 4; sample++)
+                samples[channel][sample] = (int32_t)AV_RN32A(&frame->data[channel][4*(ipos+sample)]) >> 8;
+
+        aptx_encode_samples(s, samples, avpkt->data + pos);
+    }
+
+    ff_af_queue_remove(&s->afq, frame->nb_samples, &avpkt->pts, &avpkt->duration);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static av_cold int aptx_close(AVCodecContext *avctx)
+{
+    AptXContext *s = avctx->priv_data;
+    ff_af_queue_close(&s->afq);
+    return 0;
+}
+
+
+#if CONFIG_APTX_DECODER
+AVCodec ff_aptx_decoder = {
+    .name                  = "aptx",
+    .long_name             = NULL_IF_CONFIG_SMALL("aptX (Audio Processing Technology for Bluetooth)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_APTX,
+    .priv_data_size        = sizeof(AptXContext),
+    .init                  = aptx_init,
+    .decode                = aptx_decode_frame,
+    .close                 = aptx_close,
+    .capabilities          = AV_CODEC_CAP_DR1,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_STEREO, 0},
+    .sample_fmts           = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S32P,
+                                                             AV_SAMPLE_FMT_NONE },
+};
+#endif
+
+#if CONFIG_APTX_HD_DECODER
+AVCodec ff_aptx_hd_decoder = {
+    .name                  = "aptx_hd",
+    .long_name             = NULL_IF_CONFIG_SMALL("aptX HD (Audio Processing Technology for Bluetooth)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_APTX_HD,
+    .priv_data_size        = sizeof(AptXContext),
+    .init                  = aptx_init,
+    .decode                = aptx_decode_frame,
+    .close                 = aptx_close,
+    .capabilities          = AV_CODEC_CAP_DR1,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_STEREO, 0},
+    .sample_fmts           = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S32P,
+                                                             AV_SAMPLE_FMT_NONE },
+};
+#endif
+
+#if CONFIG_APTX_ENCODER
+AVCodec ff_aptx_encoder = {
+    .name                  = "aptx",
+    .long_name             = NULL_IF_CONFIG_SMALL("aptX (Audio Processing Technology for Bluetooth)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_APTX,
+    .priv_data_size        = sizeof(AptXContext),
+    .init                  = aptx_init,
+    .encode2               = aptx_encode_frame,
+    .close                 = aptx_close,
+    .capabilities          = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_STEREO, 0},
+    .sample_fmts           = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S32P,
+                                                             AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]) {8000, 16000, 24000, 32000, 44100, 48000, 0},
+};
+#endif
+
+#if CONFIG_APTX_HD_ENCODER
+AVCodec ff_aptx_hd_encoder = {
+    .name                  = "aptx_hd",
+    .long_name             = NULL_IF_CONFIG_SMALL("aptX HD (Audio Processing Technology for Bluetooth)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_APTX_HD,
+    .priv_data_size        = sizeof(AptXContext),
+    .init                  = aptx_init,
+    .encode2               = aptx_encode_frame,
+    .close                 = aptx_close,
+    .capabilities          = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_STEREO, 0},
+    .sample_fmts           = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S32P,
+                                                             AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]) {8000, 16000, 24000, 32000, 44100, 48000, 0},
+};
+#endif
diff --git a/libavcodec/arbc.c b/libavcodec/arbc.c
new file mode 100644
index 0000000..841a9f1
--- /dev/null
+++ b/libavcodec/arbc.c
@@ -0,0 +1,218 @@
+/*
+ * Gryphon's Anim Compressor decoder
+ * Copyright (c) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/imgutils.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct ARBCContext {
+    GetByteContext gb;
+
+    AVFrame *prev_frame;
+} ARBCContext;
+
+static void fill_tile4(AVCodecContext *avctx, uint8_t *color, AVFrame *frame)
+{
+    ARBCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    int nb_tiles = bytestream2_get_le16(gb);
+    int h = avctx->height - 1;
+
+    if ((avctx->width / 4 + 1) * (avctx->height / 4 + 1) < nb_tiles)
+        return;
+
+    for (int i = 0; i < nb_tiles; i++) {
+        int y = bytestream2_get_byte(gb);
+        int x = bytestream2_get_byte(gb);
+        uint16_t mask = bytestream2_get_le16(gb);
+        int start_y = y * 4, start_x = x * 4;
+        int end_y = start_y + 4, end_x = start_x + 4;
+
+        for (int j = start_y; j < end_y; j++) {
+            for (int k = start_x; k < end_x; k++) {
+                if (mask & 0x8000) {
+                    if (j >= avctx->height || k >= avctx->width) {
+                        mask = mask << 1;
+                        continue;
+                    }
+                    frame->data[0][frame->linesize[0] * (h - j) + 3 * k + 0] = color[0];
+                    frame->data[0][frame->linesize[0] * (h - j) + 3 * k + 1] = color[1];
+                    frame->data[0][frame->linesize[0] * (h - j) + 3 * k + 2] = color[2];
+                }
+                mask = mask << 1;
+            }
+        }
+    }
+}
+
+static void fill_tileX(AVCodecContext *avctx, int tile_width, int tile_height,
+                       uint8_t *color, AVFrame *frame)
+{
+    ARBCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    const int step_h = tile_height / 4;
+    const int step_w = tile_width / 4;
+    int nb_tiles = bytestream2_get_le16(gb);
+    int h = avctx->height - 1;
+
+    if ((avctx->width / tile_width + 1) * (avctx->height / tile_height + 1) < nb_tiles)
+        return;
+
+    for (int i = 0; i < nb_tiles; i++) {
+        int y = bytestream2_get_byte(gb);
+        int x = bytestream2_get_byte(gb);
+        uint16_t mask = bytestream2_get_le16(gb);
+        int start_y = y * tile_height, start_x = x * tile_width;
+        int end_y = start_y + tile_height, end_x = start_x + tile_width;
+
+        for (int j = start_y; j < end_y; j += step_h) {
+            for (int k = start_x; k < end_x; k += step_w) {
+                if (mask & 0x8000U) {
+                    for (int m = 0; m < step_h; m++) {
+                        for (int n = 0; n < step_w; n++) {
+                            if (j + m >= avctx->height || k + n >= avctx->width)
+                                continue;
+                            frame->data[0][frame->linesize[0] * (h - (j + m)) + 3 * (k + n) + 0] = color[0];
+                            frame->data[0][frame->linesize[0] * (h - (j + m)) + 3 * (k + n) + 1] = color[1];
+                            frame->data[0][frame->linesize[0] * (h - (j + m)) + 3 * (k + n) + 2] = color[2];
+                        }
+                    }
+                }
+                mask = mask << 1;
+            }
+        }
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    ARBCContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret, nb_segments, keyframe = 1;
+
+    if (avpkt->size < 10)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+
+    if (s->prev_frame->data[0]) {
+        ret = av_frame_copy(frame, s->prev_frame);
+        if (ret < 0)
+            return ret;
+    }
+
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
+    bytestream2_skip(&s->gb, 8);
+    nb_segments = bytestream2_get_le16(&s->gb);
+    if (nb_segments == 0)
+        keyframe = 0;
+
+    for (int i = 0; i < nb_segments; i++) {
+        int resolution_flag;
+        uint8_t fill[3];
+
+        if (bytestream2_get_bytes_left(&s->gb) <= 0)
+            return AVERROR_INVALIDDATA;
+
+        fill[0] = bytestream2_get_byte(&s->gb);
+        bytestream2_skip(&s->gb, 1);
+        fill[1] = bytestream2_get_byte(&s->gb);
+        bytestream2_skip(&s->gb, 1);
+        fill[2] = bytestream2_get_byte(&s->gb);
+        bytestream2_skip(&s->gb, 1);
+        resolution_flag = bytestream2_get_byte(&s->gb);
+
+        if (resolution_flag & 0x10)
+            fill_tileX(avctx, 1024, 1024, fill, frame);
+        if (resolution_flag & 0x08)
+            fill_tileX(avctx, 256, 256, fill, frame);
+        if (resolution_flag & 0x04)
+            fill_tileX(avctx, 64, 64, fill, frame);
+        if (resolution_flag & 0x02)
+            fill_tileX(avctx, 16, 16, fill, frame);
+        if (resolution_flag & 0x01)
+            fill_tile4(avctx, fill, frame);
+    }
+
+    av_frame_unref(s->prev_frame);
+    if ((ret = av_frame_ref(s->prev_frame, frame)) < 0)
+        return ret;
+
+    frame->pict_type = keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    frame->key_frame = keyframe;
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    ARBCContext *s = avctx->priv_data;
+
+    avctx->pix_fmt = AV_PIX_FMT_RGB24;
+
+    s->prev_frame = av_frame_alloc();
+    if (!s->prev_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static void decode_flush(AVCodecContext *avctx)
+{
+    ARBCContext *s = avctx->priv_data;
+
+    av_frame_unref(s->prev_frame);
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    ARBCContext *s = avctx->priv_data;
+
+    av_frame_free(&s->prev_frame);
+
+    return 0;
+}
+
+AVCodec ff_arbc_decoder = {
+    .name           = "arbc",
+    .long_name      = NULL_IF_CONFIG_SMALL("Gryphon's Anim Compressor"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_ARBC,
+    .priv_data_size = sizeof(ARBCContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .flush          = decode_flush,
+    .close          = decode_close,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 49e17ce..e656011 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,8 +21,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                           arm/idctdsp_arm.o             \
                                           arm/jrevdct_arm.o             \
                                           arm/simple_idct_arm.o
-OBJS-$(CONFIG_MDCT)                    += arm/mdct_init_arm.o           \
-                                          arm/mdct_fixed_init_arm.o
+OBJS-$(CONFIG_LLAUDDSP)                += arm/lossless_audiodsp_init_arm.o
 OBJS-$(CONFIG_ME_CMP)                  += arm/me_cmp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
@@ -39,14 +38,16 @@ OBJS-$(CONFIG_VP8DSP)                  += arm/vp8dsp_init_arm.o
 # decoders/encoders
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                           arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
-OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_DCA_DECODER)             += arm/synth_filter_init_arm.o
 OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
 OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
+OBJS-$(CONFIG_SBC_ENCODER)             += arm/sbcdsp_init_arm.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER)             += arm/vp6dsp_init_arm.o
-OBJS-$(CONFIG_VP9_DECODER)             += arm/vp9dsp_init_arm.o
+OBJS-$(CONFIG_VP9_DECODER)             += arm/vp9dsp_init_10bpp_arm.o   \
+                                          arm/vp9dsp_init_12bpp_arm.o   \
+                                          arm/vp9dsp_init_arm.o
 
 
 # ARMv5 optimizations
@@ -81,6 +82,7 @@ ARMV6-OBJS-$(CONFIG_VP8DSP)            += arm/vp8_armv6.o               \
 
 # decoders/encoders
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
+ARMV6-OBJS-$(CONFIG_SBC_ENCODER)       += arm/sbcdsp_armv6.o
 
 
 # VFP optimizations
@@ -91,8 +93,7 @@ VFP-OBJS-$(CONFIG_FMTCONVERT)          += arm/fmtconvert_vfp.o
 VFP-OBJS-$(CONFIG_MDCT)                += arm/mdct_vfp.o
 
 # decoders/encoders
-VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/dcadsp_vfp.o              \
-                                          arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/synth_filter_vfp.o
 
 
 # NEON optimizations
@@ -132,16 +133,22 @@ NEON-OBJS-$(CONFIG_VP8DSP)             += arm/vp8dsp_init_neon.o        \
 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
                                           arm/sbrdsp_neon.o
-NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
-                                          arm/synth_filter_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevc_idct.o               \
-                                          arm/hevc_mc.o
+NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
+                                          arm/hevcdsp_deblock_neon.o    \
+                                          arm/hevcdsp_idct_neon.o       \
+                                          arm/hevcdsp_qpel_neon.o       \
+                                          arm/hevcdsp_sao_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                           arm/rv40dsp_neon.o
+NEON-OBJS-$(CONFIG_SBC_ENCODER)        += arm/sbcdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
-NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9itxfm_neon.o           \
+NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9itxfm_16bpp_neon.o     \
+                                          arm/vp9itxfm_neon.o           \
+                                          arm/vp9lpf_16bpp_neon.o       \
                                           arm/vp9lpf_neon.o             \
+                                          arm/vp9mc_16bpp_neon.o        \
                                           arm/vp9mc_neon.o
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 4f143cb..cafa881 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/aacpsdsp_init_arm.c b/libavcodec/arm/aacpsdsp_init_arm.c
index 6326376..6eb979e 100644
--- a/libavcodec/arm/aacpsdsp_init_arm.c
+++ b/libavcodec/arm/aacpsdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@ void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
                                 float *src1, int n);
 void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
                                 const float (*filter)[8][2],
-                                int stride, int n);
+                                ptrdiff_t stride, int n);
 void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64],
                                        int i, int len);
 void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2],
diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S
index fb00900..3b1bed2 100644
--- a/libavcodec/arm/aacpsdsp_neon.S
+++ b/libavcodec/arm/aacpsdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -232,12 +232,11 @@ endfunc
 function ff_ps_stereo_interpolate_neon, export=1
         vld1.32         {q0},     [r2]
         vld1.32         {q14},    [r3]
-        vadd.f32        q15, q14, q14
         mov             r2,  r0
         mov             r3,  r1
         ldr             r12, [sp]
         vadd.f32        q1,  q0,  q14
-        vadd.f32        q0,  q0,  q15
+        vadd.f32        q0,  q1,  q14
         vld1.32         {q2},     [r0,:64]!
         vld1.32         {q3},     [r1,:64]!
         subs            r12, r12, #1
@@ -251,8 +250,10 @@ function ff_ps_stereo_interpolate_neon, export=1
         vmla.f32        d17, d7,  d1[0]
         vmla.f32        d18, d6,  d3[1]
         vmla.f32        d19, d7,  d1[1]
-        vadd.f32        q1,  q1,  q15
-        vadd.f32        q0,  q0,  q15
+        vadd.f32        q1,  q1,  q14
+        vadd.f32        q0,  q0,  q14
+        vadd.f32        q1,  q1,  q14
+        vadd.f32        q0,  q0,  q14
         vld1.32         {q2},     [r0,:64]!
         vld1.32         {q3},     [r1,:64]!
         vst1.32         {q8},     [r2,:64]!
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index ed8eb37..1aea190 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 2028d0b..1d2563d 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_init_arm.c b/libavcodec/arm/ac3dsp_init_arm.c
index a48353a..a3c32ff 100644
--- a/libavcodec/arm/ac3dsp_init_arm.c
+++ b/libavcodec/arm/ac3dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,14 @@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
                                 const int16_t *window, unsigned n);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+                                            const int32_t *coef0,
+                                            const int32_t *coef1,
+                                            int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+                                            const float *coef0,
+                                            const float *coef1,
+                                            int len);
 
 void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
                                      int start, int end,
@@ -59,5 +67,7 @@ av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
         c->float_to_fixed24      = ff_float_to_fixed24_neon;
         c->extract_exponents     = ff_ac3_extract_exponents_neon;
         c->apply_window_int16    = ff_apply_window_int16_neon;
+        c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+        c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
     }
 }
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index f97b190..89d0ae8 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -131,3 +131,47 @@ function ff_apply_window_int16_neon, export=1
 
         pop             {r4,pc}
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+        vmov.i64        q0,  #0
+        vmov.i64        q1,  #0
+        vmov.i64        q2,  #0
+        vmov.i64        q3,  #0
+1:
+        vld1.32         {d16},    [r1]!
+        vld1.32         {d17},    [r2]!
+        vadd.s32        d18, d16, d17
+        vsub.s32        d19, d16, d17
+        vmlal.s32       q0,  d16, d16
+        vmlal.s32       q1,  d17, d17
+        vmlal.s32       q2,  d18, d18
+        vmlal.s32       q3,  d19, d19
+        subs            r3,  r3,  #2
+        bgt             1b
+        vadd.s64        d0,  d0,  d1
+        vadd.s64        d1,  d2,  d3
+        vadd.s64        d2,  d4,  d5
+        vadd.s64        d3,  d6,  d7
+        vst1.64         {q0-q1},  [r0]
+        bx              lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+        vmov.f32        q0,  #0.0
+        vmov.f32        q1,  #0.0
+1:
+        vld1.32         {d16},    [r1]!
+        vld1.32         {d17},    [r2]!
+        vadd.f32        d18, d16, d17
+        vsub.f32        d19, d16, d17
+        vmla.f32        d0,  d16, d16
+        vmla.f32        d1,  d17, d17
+        vmla.f32        d2,  d18, d18
+        vmla.f32        d3,  d19, d19
+        subs            r3,  r3,  #2
+        bgt             1b
+        vpadd.f32       d0,  d0,  d1
+        vpadd.f32       d1,  d2,  d3
+        vst1.32         {q0},     [r0]
+        bx              lr
+endfunc
diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index 0ea2f04..a2174b0 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_arm.h b/libavcodec/arm/audiodsp_arm.h
index e97e804..213660d 100644
--- a/libavcodec/arm/audiodsp_arm.h
+++ b/libavcodec/arm/audiodsp_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_init_arm.c b/libavcodec/arm/audiodsp_init_arm.c
index ea9ec3c..74aa52a 100644
--- a/libavcodec/arm/audiodsp_init_arm.c
+++ b/libavcodec/arm/audiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized audio functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_init_neon.c b/libavcodec/arm/audiodsp_init_neon.c
index 08405cb..6902db8 100644
--- a/libavcodec/arm/audiodsp_init_neon.c
+++ b/libavcodec/arm/audiodsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised audio functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_neon.S b/libavcodec/arm/audiodsp_neon.S
index 5871b82..cea700c 100644
--- a/libavcodec/arm/audiodsp_neon.S
+++ b/libavcodec/arm/audiodsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised audio functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h
index d26630e..59ebeb8 100644
--- a/libavcodec/arm/blockdsp_arm.h
+++ b/libavcodec/arm/blockdsp_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c
index a5db201..2080d52 100644
--- a/libavcodec/arm/blockdsp_init_arm.c
+++ b/libavcodec/arm/blockdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized block operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c
index e285750..0600bc6 100644
--- a/libavcodec/arm/blockdsp_init_neon.c
+++ b/libavcodec/arm/blockdsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised block operations
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S
index 98df2c6..9fc63cb 100644
--- a/libavcodec/arm/blockdsp_neon.S
+++ b/libavcodec/arm/blockdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised block functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index 6ff5f1a..fdbf86b 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,12 +59,18 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
         "tst        %[r_c]        , %[r_c]                      \n\t"
         "bne        2f                                          \n\t"
         "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+        "add        %[r_c]        , %[r_c]      , #2            \n\t"
+        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
+#else
         "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
         "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
         "cmp        %[r_c]        , %[r_b]                      \n\t"
         "itt        lt                                          \n\t"
         "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
         "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
+#endif
         "sub        %[r_c]        , %[low]      , #1            \n\t"
         "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
         "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 4aed576..ae4b730 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,10 +24,9 @@
 #include <stdint.h>
 
 #include "config.h"
-#include "libavcodec/dcadsp.h"
 #include "libavcodec/mathops.h"
 
-#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4)
+#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
 
 #define decode_blockcodes decode_blockcodes
 static inline int decode_blockcodes(int code1, int code2, int levels,
@@ -35,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
 {
     int32_t v0, v1, v2, v3, v4, v5;
 
-    __asm__ ("smmul   %8,  %14, %18           \n"
-             "smmul   %11, %15, %18           \n"
-             "smlabb  %14, %8,  %17, %14      \n"
-             "smlabb  %15, %11, %17, %15      \n"
-             "smmul   %9,  %8,  %18           \n"
-             "smmul   %12, %11, %18           \n"
-             "sub     %14, %14, %16, lsr #1   \n"
-             "sub     %15, %15, %16, lsr #1   \n"
-             "smlabb  %8,  %9,  %17, %8       \n"
-             "smlabb  %11, %12, %17, %11      \n"
-             "smmul   %10, %9,  %18           \n"
-             "smmul   %13, %12, %18           \n"
-             "str     %14, %0                 \n"
-             "str     %15, %4                 \n"
-             "sub     %8,  %8,  %16, lsr #1   \n"
-             "sub     %11, %11, %16, lsr #1   \n"
-             "smlabb  %9,  %10, %17, %9       \n"
-             "smlabb  %12, %13, %17, %12      \n"
-             "smmul   %14, %10, %18           \n"
-             "smmul   %15, %13, %18           \n"
-             "str     %8,  %1                 \n"
-             "str     %11, %5                 \n"
-             "sub     %9,  %9,  %16, lsr #1   \n"
-             "sub     %12, %12, %16, lsr #1   \n"
-             "smlabb  %10, %14, %17, %10      \n"
-             "smlabb  %13, %15, %17, %13      \n"
-             "str     %9,  %2                 \n"
-             "str     %12, %6                 \n"
-             "sub     %10, %10, %16, lsr #1   \n"
-             "sub     %13, %13, %16, lsr #1   \n"
-             "str     %10, %3                 \n"
-             "str     %13, %7                 \n"
-             : "=m"(values[0]), "=m"(values[1]),
-               "=m"(values[2]), "=m"(values[3]),
-               "=m"(values[4]), "=m"(values[5]),
-               "=m"(values[6]), "=m"(values[7]),
-               "=&r"(v0), "=&r"(v1), "=&r"(v2),
+    __asm__ ("smmul   %0,  %6,  %10           \n"
+             "smmul   %3,  %7,  %10           \n"
+             "smlabb  %6,  %0,  %9,  %6       \n"
+             "smlabb  %7,  %3,  %9,  %7       \n"
+             "smmul   %1,  %0,  %10           \n"
+             "smmul   %4,  %3,  %10           \n"
+             "sub     %6,  %6,  %8,  lsr #1   \n"
+             "sub     %7,  %7,  %8,  lsr #1   \n"
+             "smlabb  %0,  %1,  %9,  %0       \n"
+             "smlabb  %3,  %4,  %9,  %3       \n"
+             "smmul   %2,  %1,  %10           \n"
+             "smmul   %5,  %4,  %10           \n"
+             "str     %6,  [%11, #0]          \n"
+             "str     %7,  [%11, #16]         \n"
+             "sub     %0,  %0,  %8,  lsr #1   \n"
+             "sub     %3,  %3,  %8,  lsr #1   \n"
+             "smlabb  %1,  %2,  %9,  %1       \n"
+             "smlabb  %4,  %5,  %9,  %4       \n"
+             "smmul   %6,  %2,  %10           \n"
+             "smmul   %7,  %5,  %10           \n"
+             "str     %0,  [%11, #4]          \n"
+             "str     %3,  [%11, #20]         \n"
+             "sub     %1,  %1,  %8,  lsr #1   \n"
+             "sub     %4,  %4,  %8,  lsr #1   \n"
+             "smlabb  %2,  %6,  %9,  %2       \n"
+             "smlabb  %5,  %7,  %9,  %5       \n"
+             "str     %1,  [%11, #8]          \n"
+             "str     %4,  [%11, #24]         \n"
+             "sub     %2,  %2,  %8,  lsr #1   \n"
+             "sub     %5,  %5,  %8,  lsr #1   \n"
+             "str     %2,  [%11, #12]         \n"
+             "str     %5,  [%11, #28]         \n"
+             : "=&r"(v0), "=&r"(v1), "=&r"(v2),
                "=&r"(v3), "=&r"(v4), "=&r"(v5),
                "+&r"(code1), "+&r"(code2)
-             : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+             : "r"(levels - 1), "r"(-levels),
+               "r"(ff_inverse[levels]), "r"(values)
+             : "memory");
 
     return code1 | code2;
 }
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 735c4c2..0000000
--- a/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
-        push            {r4-r6,lr}
-        mov             r3,  #32                @ decifactor
-        mov             r6,  #256/32
-        b               dca_lfe_fir
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
-        push            {r4-r6,lr}
-        mov             r3,  #64                @ decifactor
-        mov             r6,  #256/64
-dca_lfe_fir:
-        add             r4,  r0,  r3,  lsl #2   @ out2
-        add             r5,  r2,  #256*4-16     @ cf1
-        sub             r1,  r1,  #12
-        mov             lr,  #-16
-1:
-        vmov.f32        q2,  #0.0               @ v0
-        vmov.f32        q3,  #0.0               @ v1
-        mov             r12, r6
-2:
-        vld1.32         {q8},     [r2,:128]!    @ cf0
-        vld1.32         {q9},     [r5,:128], lr @ cf1
-        vld1.32         {q1},     [r1], lr      @ in
-        subs            r12, r12, #4
-        vrev64.32       q10, q8
-        vmla.f32        q3,  q1,  q9
-        vmla.f32        d4,  d2,  d21
-        vmla.f32        d5,  d3,  d20
-        bne             2b
-
-        add             r1,  r1,  r6,  lsl #2
-        subs            r3,  r3,  #1
-        vadd.f32        d4,  d4,  d5
-        vadd.f32        d6,  d6,  d7
-        vpadd.f32       d5,  d4,  d6
-        vst1.32         {d5[0]},  [r0,:32]!
-        vst1.32         {d5[1]},  [r4,:32]!
-        bne             1b
-
-        pop             {r4-r6,pc}
-endfunc
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
deleted file mode 100644
index c9114d4..0000000
--- a/libavcodec/arm/dcadsp_vfp.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-POUT          .req    a1
-PIN           .req    a2
-PCOEF         .req    a3
-OLDFPSCR      .req    a4
-COUNTER       .req    ip
-
-IN0           .req    s4
-IN1           .req    s5
-IN2           .req    s6
-IN3           .req    s7
-IN4           .req    s0
-IN5           .req    s1
-IN6           .req    s2
-IN7           .req    s3
-COEF0         .req    s8   @ coefficient elements
-COEF1         .req    s9
-COEF2         .req    s10
-COEF3         .req    s11
-COEF4         .req    s12
-COEF5         .req    s13
-COEF6         .req    s14
-COEF7         .req    s15
-ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
-ACCUM4        .req    s20
-POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
-POST1         .req    s25
-POST2         .req    s26
-POST3         .req    s27
-
-
-.macro inner_loop  decifactor, dir, tail, head
- .ifc "\dir","up"
-  .set X, 0
-  .set Y, 4
- .else
-  .set X, 4*JMAX*4 - 4
-  .set Y, -4
- .endif
- .ifnc "\head",""
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
- .endif
- .ifnc "\tail",""
-        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
- .endif
- .ifnc "\head",""
-        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
- .endif
- .ifnc "\head",""
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
-   .ifc "\tail",""
-        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
-   .endif
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
-   .ifnc "\tail",""
-        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
-   .endif
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
- .endif
- .ifnc "\tail",""
-        vstmia  POUT!, {POST0-POST3}
- .endif
- .ifnc "\head",""
-        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
-        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
-  .if \decifactor == 32
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
-        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
-        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
-        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
-        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
-  .endif
- .endif
-.endm
-
-.macro dca_lfe_fir  decifactor
-function ff_dca_lfe_fir\decifactor\()_vfp, export=1
-        fmrx    OLDFPSCR, FPSCR
-        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
-        fmxr    FPSCR, ip
-        vldr    IN0, [PIN, #-0*4]
-        vldr    IN1, [PIN, #-1*4]
-        vldr    IN2, [PIN, #-2*4]
-        vldr    IN3, [PIN, #-3*4]
- .if \decifactor == 32
-  .set JMAX, 8
-        vpush   {s16-s31}
-        vldr    IN4, [PIN, #-4*4]
-        vldr    IN5, [PIN, #-5*4]
-        vldr    IN6, [PIN, #-6*4]
-        vldr    IN7, [PIN, #-7*4]
- .else
-  .set JMAX, 4
-        vpush   {s16-s27}
- .endif
-
-        mov     COUNTER, #\decifactor/4 - 1
-        inner_loop  \decifactor, up,, head
-1:      add     PCOEF, PCOEF, #4*JMAX*4
-        subs    COUNTER, COUNTER, #1
-        inner_loop  \decifactor, up, tail, head
-        bne     1b
-        inner_loop  \decifactor, up, tail
-
-        mov     COUNTER, #\decifactor/4 - 1
-        inner_loop  \decifactor, down,, head
-1:      sub     PCOEF, PCOEF, #4*JMAX*4
-        subs    COUNTER, COUNTER, #1
-        inner_loop  \decifactor, down, tail, head
-        bne     1b
-        inner_loop  \decifactor, down, tail
-
- .if \decifactor == 32
-        vpop    {s16-s31}
- .else
-        vpop    {s16-s27}
- .endif
-        fmxr    FPSCR, OLDFPSCR
-        bx      lr
-endfunc
-.endm
-
-        dca_lfe_fir  64
- .ltorg
-        dca_lfe_fir  32
-
-        .unreq  POUT
-        .unreq  PIN
-        .unreq  PCOEF
-        .unreq  OLDFPSCR
-        .unreq  COUNTER
-
-        .unreq  IN0
-        .unreq  IN1
-        .unreq  IN2
-        .unreq  IN3
-        .unreq  IN4
-        .unreq  IN5
-        .unreq  IN6
-        .unreq  IN7
-        .unreq  COEF0
-        .unreq  COEF1
-        .unreq  COEF2
-        .unreq  COEF3
-        .unreq  COEF4
-        .unreq  COEF5
-        .unreq  COEF6
-        .unreq  COEF7
-        .unreq  ACCUM0
-        .unreq  ACCUM4
-        .unreq  POST0
-        .unreq  POST1
-        .unreq  POST2
-        .unreq  POST3
-
-
-IN      .req    a1
-SBACT   .req    a2
-OLDFPSCR .req   a3
-IMDCT   .req    a4
-WINDOW  .req    v1
-OUT     .req    v2
-BUF     .req    v3
-SCALEINT .req   v4 @ only used in softfp case
-COUNT   .req    v5
-
-SCALE   .req    s0
-
-/* Stack layout differs in softfp and hardfp cases:
- *
- * hardfp
- *      fp -> 6 arg words saved by caller
- *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
- *            s16-s23 on entry
- *            align 16
- *     buf -> 8*32*4 bytes buffer
- *            s0 on entry
- *      sp -> 3 arg words for callee
- *
- * softfp
- *      fp -> 7 arg words saved by caller
- *            a4,v1-v5,fp,lr on entry
- *            s16-s23 on entry
- *            align 16
- *     buf -> 8*32*4 bytes buffer
- *      sp -> 4 arg words for callee
- */
-
-/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- *                                 SynthFilterContext *synth, FFTContext *imdct,
- *                                 float (*synth_buf_ptr)[512],
- *                                 int *synth_buf_offset, float (*synth_buf2)[32],
- *                                 const float (*window)[512], float *samples_out,
- *                                 float (*raXin)[32], float scale);
- */
-function ff_dca_qmf_32_subbands_vfp, export=1
-VFP     push    {a3-a4,v1-v3,v5,fp,lr}
-NOVFP   push    {a4,v1-v5,fp,lr}
-        add     fp, sp, #8*4
-        vpush   {s16-s23}
-        @ The buffer pointed at by raXin isn't big enough for us to do a
-        @ complete matrix transposition as we want to, so allocate an
-        @ alternative buffer from the stack. Align to 4 words for speed.
-        sub     BUF, sp, #8*32*4
-        bic     BUF, BUF, #15
-        mov     sp, BUF
-        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
-        fmrx    OLDFPSCR, FPSCR
-        fmxr    FPSCR, lr
-        @ COUNT is used to count down 2 things at once:
-        @ bits 0-4 are the number of word pairs remaining in the output row
-        @ bits 5-31 are the number of words to copy (with possible negation)
-        @   from the source matrix before we start zeroing the remainder
-        mov     COUNT, #(-4 << 5) + 16
-        adds    COUNT, COUNT, SBACT, lsl #5
-        bmi     2f
-1:
-        vldr    s8,  [IN, #(0*8+0)*4]
-        vldr    s10, [IN, #(0*8+1)*4]
-        vldr    s12, [IN, #(0*8+2)*4]
-        vldr    s14, [IN, #(0*8+3)*4]
-        vldr    s16, [IN, #(0*8+4)*4]
-        vldr    s18, [IN, #(0*8+5)*4]
-        vldr    s20, [IN, #(0*8+6)*4]
-        vldr    s22, [IN, #(0*8+7)*4]
-        vneg.f  s8, s8
-        vldr    s9,  [IN, #(1*8+0)*4]
-        vldr    s11, [IN, #(1*8+1)*4]
-        vldr    s13, [IN, #(1*8+2)*4]
-        vldr    s15, [IN, #(1*8+3)*4]
-        vneg.f  s16, s16
-        vldr    s17, [IN, #(1*8+4)*4]
-        vldr    s19, [IN, #(1*8+5)*4]
-        vldr    s21, [IN, #(1*8+6)*4]
-        vldr    s23, [IN, #(1*8+7)*4]
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        vldr    s9,  [IN, #(3*8+0)*4]
-        vldr    s11, [IN, #(3*8+1)*4]
-        vldr    s13, [IN, #(3*8+2)*4]
-        vldr    s15, [IN, #(3*8+3)*4]
-        vldr    s17, [IN, #(3*8+4)*4]
-        vldr    s19, [IN, #(3*8+5)*4]
-        vldr    s21, [IN, #(3*8+6)*4]
-        vldr    s23, [IN, #(3*8+7)*4]
-        vneg.f  s9, s9
-        vldr    s8,  [IN, #(2*8+0)*4]
-        vldr    s10, [IN, #(2*8+1)*4]
-        vldr    s12, [IN, #(2*8+2)*4]
-        vldr    s14, [IN, #(2*8+3)*4]
-        vneg.f  s17, s17
-        vldr    s16, [IN, #(2*8+4)*4]
-        vldr    s18, [IN, #(2*8+5)*4]
-        vldr    s20, [IN, #(2*8+6)*4]
-        vldr    s22, [IN, #(2*8+7)*4]
-        vstr    d4,  [BUF, #(0*32+2)*4]
-        vstr    d5,  [BUF, #(1*32+2)*4]
-        vstr    d6,  [BUF, #(2*32+2)*4]
-        vstr    d7,  [BUF, #(3*32+2)*4]
-        vstr    d8,  [BUF, #(4*32+2)*4]
-        vstr    d9,  [BUF, #(5*32+2)*4]
-        vstr    d10, [BUF, #(6*32+2)*4]
-        vstr    d11, [BUF, #(7*32+2)*4]
-        add     IN, IN, #4*8*4
-        add     BUF, BUF, #4*4
-        subs    COUNT, COUNT, #(4 << 5) + 2
-        bpl     1b
-2:      @ Now deal with trailing < 4 samples
-        adds    COUNT, COUNT, #3 << 5
-        bmi     4f  @ sb_act was a multiple of 4
-        bics    lr, COUNT, #0x1F
-        bne     3f
-        @ sb_act was n*4+1
-        vldr    s8,  [IN, #(0*8+0)*4]
-        vldr    s10, [IN, #(0*8+1)*4]
-        vldr    s12, [IN, #(0*8+2)*4]
-        vldr    s14, [IN, #(0*8+3)*4]
-        vldr    s16, [IN, #(0*8+4)*4]
-        vldr    s18, [IN, #(0*8+5)*4]
-        vldr    s20, [IN, #(0*8+6)*4]
-        vldr    s22, [IN, #(0*8+7)*4]
-        vneg.f  s8, s8
-        vldr    s9,  zero
-        vldr    s11, zero
-        vldr    s13, zero
-        vldr    s15, zero
-        vneg.f  s16, s16
-        vldr    s17, zero
-        vldr    s19, zero
-        vldr    s21, zero
-        vldr    s23, zero
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        sub     COUNT, COUNT, #1
-        b       4f
-3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
-        vldr    s8,  [IN, #(0*8+0)*4]
-        vldr    s10, [IN, #(0*8+1)*4]
-        vldr    s12, [IN, #(0*8+2)*4]
-        vldr    s14, [IN, #(0*8+3)*4]
-        vldr    s16, [IN, #(0*8+4)*4]
-        vldr    s18, [IN, #(0*8+5)*4]
-        vldr    s20, [IN, #(0*8+6)*4]
-        vldr    s22, [IN, #(0*8+7)*4]
-        vneg.f  s8, s8
-        vldr    s9,  [IN, #(1*8+0)*4]
-        vldr    s11, [IN, #(1*8+1)*4]
-        vldr    s13, [IN, #(1*8+2)*4]
-        vldr    s15, [IN, #(1*8+3)*4]
-        vneg.f  s16, s16
-        vldr    s17, [IN, #(1*8+4)*4]
-        vldr    s19, [IN, #(1*8+5)*4]
-        vldr    s21, [IN, #(1*8+6)*4]
-        vldr    s23, [IN, #(1*8+7)*4]
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        sub     COUNT, COUNT, #(2 << 5) + 1
-        bics    lr, COUNT, #0x1F
-        bne     4f
-        @ sb_act was n*4+3
-        vldr    s8,  [IN, #(2*8+0)*4]
-        vldr    s10, [IN, #(2*8+1)*4]
-        vldr    s12, [IN, #(2*8+2)*4]
-        vldr    s14, [IN, #(2*8+3)*4]
-        vldr    s16, [IN, #(2*8+4)*4]
-        vldr    s18, [IN, #(2*8+5)*4]
-        vldr    s20, [IN, #(2*8+6)*4]
-        vldr    s22, [IN, #(2*8+7)*4]
-        vldr    s9,  zero
-        vldr    s11, zero
-        vldr    s13, zero
-        vldr    s15, zero
-        vldr    s17, zero
-        vldr    s19, zero
-        vldr    s21, zero
-        vldr    s23, zero
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        sub     COUNT, COUNT, #1
-4:      @ Now fill the remainder with 0
-        vldr    s8, zero
-        vldr    s9, zero
-        ands    COUNT, COUNT, #0x1F
-        beq     6f
-5:      vstr    d4, [BUF, #(0*32+0)*4]
-        vstr    d4, [BUF, #(1*32+0)*4]
-        vstr    d4, [BUF, #(2*32+0)*4]
-        vstr    d4, [BUF, #(3*32+0)*4]
-        vstr    d4, [BUF, #(4*32+0)*4]
-        vstr    d4, [BUF, #(5*32+0)*4]
-        vstr    d4, [BUF, #(6*32+0)*4]
-        vstr    d4, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        subs    COUNT, COUNT, #1
-        bne     5b
-6:
-        fmxr    FPSCR, OLDFPSCR
-        ldr     WINDOW, [fp, #3*4]
-        ldr     OUT, [fp, #4*4]
-        sub     BUF, BUF, #32*4
-NOVFP   ldr     SCALEINT, [fp, #6*4]
-        mov     COUNT, #8
-VFP     vpush   {SCALE}
-VFP     sub     sp, sp, #3*4
-NOVFP   sub     sp, sp, #4*4
-7:
-VFP     ldr     a1, [fp, #-7*4]     @ imdct
-NOVFP   ldr     a1, [fp, #-8*4]
-        ldmia   fp, {a2-a4}
-VFP     stmia   sp, {WINDOW, OUT, BUF}
-NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
-VFP     vldr    SCALE, [sp, #3*4]
-        bl      X(ff_synth_filter_float_vfp)
-        add     OUT, OUT, #32*4
-        add     BUF, BUF, #32*4
-        subs    COUNT, COUNT, #1
-        bne     7b
-
-A       sub     sp, fp, #(8+8)*4
-T       sub     fp, fp, #(8+8)*4
-T       mov     sp, fp
-        vpop    {s16-s23}
-VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
-NOVFP   pop     {a4,v1-v5,fp,pc}
-endfunc
-
-        .unreq  IN
-        .unreq  SBACT
-        .unreq  OLDFPSCR
-        .unreq  IMDCT
-        .unreq  WINDOW
-        .unreq  OUT
-        .unreq  BUF
-        .unreq  SCALEINT
-        .unreq  COUNT
-
-        .unreq  SCALE
-
-        .align 2
-zero:   .word   0
diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c
index 5132b09..11226d6 100644
--- a/libavcodec/arm/fft_fixed_init_arm.c
+++ b/libavcodec/arm/fft_fixed_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,8 @@
 #include "libavcodec/fft.h"
 
 void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
+void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
 
 av_cold void ff_fft_fixed_init_arm(FFTContext *s)
 {
@@ -33,6 +35,16 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s)
 
     if (have_neon(cpu_flags)) {
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+#if CONFIG_FFT
         s->fft_calc        = ff_fft_fixed_calc_neon;
+#endif
+
+#if CONFIG_MDCT
+        if (!s->inverse && s->nbits >= 3) {
+            s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+            s->mdct_calc        = ff_mdct_fixed_calc_neon;
+            s->mdct_calcw       = ff_mdct_fixed_calcw_neon;
+        }
+#endif
     }
 }
diff --git a/libavcodec/arm/fft_fixed_neon.S b/libavcodec/arm/fft_fixed_neon.S
index c70a189..2651607 100644
--- a/libavcodec/arm/fft_fixed_neon.S
+++ b/libavcodec/arm/fft_fixed_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 4d047ea..331bd65 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,16 +29,33 @@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 av_cold void ff_fft_init_arm(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (have_vfp_vm(cpu_flags)) {
         s->fft_calc     = ff_fft_calc_vfp;
+#if CONFIG_MDCT
+        s->imdct_half   = ff_imdct_half_vfp;
+#endif
     }
 
     if (have_neon(cpu_flags)) {
+#if CONFIG_FFT
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
+#endif
+#if CONFIG_MDCT
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
     }
 }
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
index b161015..48f8dfc 100644
--- a/libavcodec/arm/fft_neon.S
+++ b/libavcodec/arm/fft_neon.S
@@ -7,20 +7,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index c2801fa..ac60132 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/flacdsp_arm.S b/libavcodec/arm/flacdsp_arm.S
index d4441da..f8861c5 100644
--- a/libavcodec/arm/flacdsp_arm.S
+++ b/libavcodec/arm/flacdsp_arm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/flacdsp_init_arm.c b/libavcodec/arm/flacdsp_init_arm.c
index 0530cf7..564e3dc 100644
--- a/libavcodec/arm/flacdsp_init_arm.c
+++ b/libavcodec/arm/flacdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,9 @@
 void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
-av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                                  int bps)
 {
-    if (bps <= 16)
-        c->lpc = ff_flac_lpc_16_arm;
+    if (CONFIG_FLAC_DECODER)
+        c->lpc16 = ff_flac_lpc_16_arm;
 }
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index 70c8023..e88255d 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized Format Conversion Utils
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 5d48e3d..738953e 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau  <janne-libav@jannau.net>b
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 4e43f42..b14af45 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/g722dsp_init_arm.c b/libavcodec/arm/g722dsp_init_arm.c
index 5edf619..c0e5d8b 100644
--- a/libavcodec/arm/g722dsp_init_arm.c
+++ b/libavcodec/arm/g722dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/g722dsp_neon.S b/libavcodec/arm/g722dsp_neon.S
index 5fa3c27..757e53f 100644
--- a/libavcodec/arm/g722dsp_neon.S
+++ b/libavcodec/arm/g722dsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions for G722 coding
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 0e84362..aae804b 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index 779dc0b..5a4159e 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -455,7 +455,7 @@ endconst
         h264_chroma_mc4 avg, rv40
 #endif
 
-#if CONFIG_VC1_DECODER
+#if CONFIG_VC1DSP
         h264_chroma_mc8 put, vc1
         h264_chroma_mc8 avg, vc1
         h264_chroma_mc4 put, vc1
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 7afd350..8940265 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,13 +25,13 @@
 #include "libavcodec/h264dsp.h"
 #include "libavcodec/arm/startcode.h"
 
-void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                        int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                        int beta, int8_t *tc0);
 
 void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
@@ -72,11 +72,14 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
 static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
                                       const int chroma_format_idc)
 {
+#if HAVE_NEON
     if (bit_depth == 8) {
         c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
         c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if(chroma_format_idc == 1){
         c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+        }
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
         c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
@@ -96,6 +99,7 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
         c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
         c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
     }
+#endif // HAVE_NEON
 }
 
 av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
@@ -103,8 +107,10 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_ARMV6
     if (have_setend(cpu_flags))
         c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
     if (have_neon(cpu_flags))
         h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5e75565..274a547 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index b078cf2..93859db 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index a445d4d..cc324d7 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,6 +49,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
                                         const int chroma_format_idc)
 {
+#if HAVE_NEON
     const int high_depth = bit_depth > 8;
 
     if (high_depth)
@@ -81,6 +82,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
     if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
         codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
         h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+#endif // HAVE_NEON
 }
 
 av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
diff --git a/libavcodec/arm/h264pred_neon.S b/libavcodec/arm/h264pred_neon.S
index 332f94b..4dc47ba 100644
--- a/libavcodec/arm/h264pred_neon.S
+++ b/libavcodec/arm/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264qpel_init_arm.c b/libavcodec/arm/h264qpel_init_arm.c
index 01615b5..71237be 100644
--- a/libavcodec/arm/h264qpel_init_arm.c
+++ b/libavcodec/arm/h264qpel_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264qpel_neon.S b/libavcodec/arm/h264qpel_neon.S
index 6c51250..21336c6 100644
--- a/libavcodec/arm/h264qpel_neon.S
+++ b/libavcodec/arm/h264qpel_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hevc_mc.S b/libavcodec/arm/hevc_mc.S
deleted file mode 100644
index f499d29..0000000
--- a/libavcodec/arm/hevc_mc.S
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * ARM NEON optimised MC functions for HEVC decoding
- *
- * Copyright (c) 2017 Alexandra Hájková
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro get_pixels4 bitdepth
-function ff_hevc_get_pixels_4_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-
-1:
-  .if \bitdepth == 8
-        vld1.32         {d0[0]}, [r2], r3
-        vld1.32         {d1[0]}, [r2], r3
-        vld1.32         {d2[0]}, [r2], r3
-        vld1.32         {d3[0]}, [r2], r3
-        vshll.u8        q8, d0, #6
-        vshll.u8        q9, d1, #6
-        vshll.u8        q10, d2, #6
-        vshll.u8        q11, d3, #6
-  .else
-        vld1.16         {d0}, [r2], r3
-        vld1.16         {d1}, [r2], r3
-        vld1.16         {d2}, [r2], r3
-        vld1.16         {d3}, [r2], r3
-        vshl.i16        d16, d0, #4
-        vshl.i16        d18, d1, #4
-        vshl.i16        d20, d2, #4
-        vshl.i16        d22, d3, #4
-  .endif
-
-        vst1.16         {d16}, [r0, :64], r1
-        vst1.16         {d18}, [r0, :64], r1
-        vst1.16         {d20}, [r0, :64], r1
-        vst1.16         {d22}, [r0, :64], r1
-        subs            r12, #4
-        bgt             1b
-
-        bx              lr
-endfunc
-.endm
-
-.macro get_pixels8 bitdepth
-function ff_hevc_get_pixels_8_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-
-1:
-  .if \bitdepth == 8
-        vld1.8          {d0}, [r2], r3
-        vld1.8          {d1}, [r2], r3
-        vld1.8          {d2}, [r2], r3
-        vld1.8          {d3}, [r2], r3
-        vshll.u8        q8, d0, #6
-        vshll.u8        q9, d1, #6
-        vshll.u8        q10, d2, #6
-        vshll.u8        q11, d3, #6
-  .else
-        vld1.16         {d16-d17}, [r2], r3
-        vld1.16         {d18-d19}, [r2], r3
-        vld1.16         {d20-d21}, [r2], r3
-        vld1.16         {d22-d23}, [r2], r3
-        vshl.i16        q8, q8, #4
-        vshl.i16        q9, q9, #4
-        vshl.i16        q10, q10, #4
-        vshl.i16        q11, q11, #4
-  .endif
-
-        vst1.16         {d16-d17}, [r0, :64], r1
-        vst1.16         {d18-d19}, [r0, :64], r1
-        vst1.16         {d20-d21}, [r0, :64], r1
-        vst1.16         {d22-d23}, [r0, :64], r1
-        subs            r12, #4
-        bgt             1b
-
-        bx              lr
-endfunc
-.endm
-
-.macro get_pixels12 bitdepth
-function ff_hevc_get_pixels_12_\bitdepth\()_neon, export=1
-@r0 - dst, r1 - dststride, r2 - src, r3 - srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-        push            {r4-r5, lr}
-        add             r4, r0, #16
-
-1:
-  .if \bitdepth == 8
-        add             r5, r2, #8
-        vld1.8          {d0}, [r2], r3
-        vld1.32         {d4[0]}, [r5], r3
-        vld1.8          {d1}, [r2], r3
-        vld1.32         {d5[0]}, [r5], r3
-        vld1.8          {d2}, [r2], r3
-        vld1.32         {d6[0]}, [r5], r3
-        vld1.8          {d3}, [r2], r3
-        vld1.32         {d7[0]}, [r5], r3
-        vshll.u8        q8, d0, #6
-        vshll.u8        q12, d4, #6
-        vshll.u8        q9, d1, #6
-        vshll.u8        q13, d5, #6
-        vshll.u8        q10, d2, #6
-        vshll.u8        q14, d6, #6
-        vshll.u8        q11, d3, #6
-        vshll.u8        q15, d7, #6
-  .else
-        add             r5, r2, #16
-        vld1.16         {d16-d17}, [r2], r3
-        vld1.16         {d24}, [r5], r3
-        vld1.16         {d18-d19}, [r2], r3
-        vld1.16         {d26}, [r5], r3
-        vld1.16         {d20-d21}, [r2], r3
-        vld1.16         {d28}, [r5], r3
-        vld1.16         {d22-d23}, [r2], r3
-        vld1.16         {d30}, [r5], r3
-        vshl.i16        q8, q8, #4
-        vshl.i16        d24, d24, #4
-        vshl.i16        q9, q9, #4
-        vshl.i16        d26, d26, #4
-        vshl.i16        q10, q10, #4
-        vshl.i16        d28, d28, #4
-        vshl.i16        q11, q11, #4
-        vshl.i16        d30, d30, #4
-  .endif
-
-        vst1.16         {d16-d17}, [r0, :64], r1
-        vst1.16         {d24}, [r4, :64], r1
-        vst1.16         {d18-d19}, [r0, :64], r1
-        vst1.16         {d26}, [r4, :64], r1
-        vst1.16         {d20-d21}, [r0, :64], r1
-        vst1.16         {d28}, [r4, :64], r1
-        vst1.16         {d22-d23}, [r0, :64], r1
-        vst1.16         {d30}, [r4, :64], r1
-        subs            r12, #4
-        bgt             1b
-
-        pop             {r4-r5, pc}
-endfunc
-.endm
-
-@8 bitdepth case
-.macro process_8 load
-        vld1.8          {d0-d1}, [\load], r3
-        vld1.8          {d2-d3}, [\load], r3
-        vld1.8          {d4-d5}, [\load], r3
-        vld1.8          {d6-d7}, [\load], r3
-        vshll.u8        q8, d0, #6
-        vshll.u8        q9, d1, #6
-        vshll.u8        q10, d2, #6
-        vshll.u8        q11, d3, #6
-        vshll.u8        q12, d4, #6
-        vshll.u8        q13, d5, #6
-        vshll.u8        q14, d6, #6
-        vshll.u8        q15, d7, #6
-.endm
-
-@10 bitdepth case
-.macro process_10 load
-        vld1.16         {d16-d19}, [\load], r3
-        vld1.16         {d20-d23}, [\load], r3
-        vld1.16         {d24-d27}, [\load], r3
-        vld1.16         {d28-d31}, [\load], r3
-        vshl.i16        q8, q8, #4
-        vshl.i16        q9, q9, #4
-        vshl.i16        q10, q10, #4
-        vshl.i16        q11, q11, #4
-        vshl.i16        q12, q12, #4
-        vshl.i16        q13, q13, #4
-        vshl.i16        q14, q14, #4
-        vshl.i16        q15, q15, #4
-.endm
-
-.macro store_4x16 store
-        vst1.16         {d16-d19}, [\store, :128], r1
-        vst1.16         {d20-d23}, [\store, :128], r1
-        vst1.16         {d24-d27}, [\store, :128], r1
-        vst1.16         {d28-d31}, [\store, :128], r1
-.endm
-
-.macro get_pixels16 bitdepth
-function ff_hevc_get_pixels_16_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-
-1:
-  .if \bitdepth == 8
-        process_8       r2
-  .else
-        process_10      r2
-  .endif
-
-        store_4x16      r0
-        subs            r12, #4
-        bgt             1b
-
-        bx              lr
-endfunc
-.endm
-
-.macro get_pixels24 bitdepth
-function ff_hevc_get_pixels_24_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-
-        push            {r0-r4, lr}
-        push            {r12}
-        bl              X(ff_hevc_get_pixels_8_\bitdepth\()_neon)
-        pop             {r12}
-        pop             {r0-r4, lr}
-
-  .if \bitdepth == 8
-        add             r2, #8
-  .else
-        add             r2, #16
-  .endif
-        add             r0, #16
-        b               X(ff_hevc_get_pixels_16_\bitdepth\()_neon)
-endfunc
-.endm
-
-.macro get_pixels32 bitdepth
-function ff_hevc_get_pixels_32_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-        push            {r4-r5, lr}
-  .if \bitdepth == 8
-        add             r4, r2, #16
-  .else
-        add             r4, r2, #32
-  .endif
-        add             r5, r0, #32
-
-1:
-  .if \bitdepth == 8
-        process_8       r2
-  .else
-        process_10      r2
-  .endif
-        store_4x16      r0
-
-  .if \bitdepth == 8
-        process_8       r4
-  .else
-        process_10      r4
-  .endif
-        store_4x16      r5
-
-        subs            r12, #4
-        bgt             1b
-
-        pop             {r4-r5, pc}
-endfunc
-.endm
-
-.macro get_pixels48 bitdepth
-function ff_hevc_get_pixels_48_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-
-        push            {r0-r4, lr}
-        push            {r12}
-        bl              X(ff_hevc_get_pixels_16_\bitdepth\()_neon)
-        pop             {r12}
-        pop             {r0-r4, lr}
-
-  .if \bitdepth == 8
-        add             r2, #16
-  .else
-        add             r2, #32
-  .endif
-        add             r0, #32
-        b               X(ff_hevc_get_pixels_32_\bitdepth\()_neon)
-endfunc
-.endm
-
-.macro get_pixels64 bitdepth
-function ff_hevc_get_pixels_64_\bitdepth\()_neon, export=1
-@r0 dst, r1 dststride, r2 src, r3 srcstride
-        ldr             r12, [sp] @height
-        cmp             r12, #0
-        it              eq
-        bxeq            lr
-        push            {r4-r9, lr}
-  .if \bitdepth == 8
-        add             r4, r2, #16
-        add             r6, r4, #16
-        add             r8, r6, #16
-  .else
-        add             r4, r2, #32
-        add             r6, r4, #32
-        add             r8, r6, #32
-  .endif
-        add             r5, r0, #32
-        add             r7, r5, #32
-        add             r9, r7, #32
-
-1:
-  .if \bitdepth == 8
-        process_8       r2
-  .else
-        process_10      r2
-  .endif
-        store_4x16      r0
-
-  .if \bitdepth == 8
-        process_8       r4
-  .else
-        process_10      r4
-  .endif
-        store_4x16      r5
-
-  .if \bitdepth == 8
-        process_8       r6
-  .else
-        process_10      r6
-  .endif
-        store_4x16      r7
-  .if \bitdepth == 8
-        process_8       r8
-  .else
-        process_10      r8
-  .endif
-        store_4x16      r9
-
-        subs            r12, #4
-        bgt             1b
-
-        pop             {r4-r9, pc}
-
-endfunc
-.endm
-
-get_pixels4 8
-get_pixels4 10
-
-get_pixels8 8
-get_pixels8 10
-
-get_pixels12 8
-get_pixels12 10
-
-get_pixels16 8
-get_pixels16 10
-
-get_pixels24 8
-get_pixels24 10
-
-get_pixels32 8
-get_pixels32 10
-
-get_pixels48 8
-get_pixels48 10
-
-get_pixels64 8
-get_pixels64 10
diff --git a/libavcodec/arm/hevcdsp_arm.h b/libavcodec/arm/hevcdsp_arm.h
new file mode 100644
index 0000000..47cdfa5
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
+#define AVCODEC_ARM_HEVCDSP_ARM_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000..7cb7487
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start
+        ldr      r12, [r2]
+        ldr      r3, [r2, #4]
+        add      r2, r3, r12
+        cmp      r2, #0
+        it       eq
+        bxeq     lr
+.endm
+
+.macro hevc_loop_filter_chroma_body
+        vsubl.u8  q3, d4, d2
+        vsubl.u8  q11, d18, d19
+        vshl.i16  q3, #2
+        vadd.i16  q11, q3
+        vdup.16   d0, r12
+        vdup.16   d1, r3
+        vrshr.s16 q11, q11, #3
+        vneg.s16  q12, q0
+        vmovl.u8  q2, d4
+        vmin.s16  q11, q11, q0
+        vmax.s16  q11, q11, q12
+        vaddw.u8  q1, q11, d2
+        vsub.i16  q2, q11
+        vqmovun.s16 d2, q1
+        vqmovun.s16 d4, q2
+.endm
+
+.macro hevc_loop_filter_luma_start
+        ldr     r12, [r3]
+        ldr      r3, [r3, #4]
+        lsl      r3, #16
+        orr      r3, r12
+        cmp      r3, #0
+        it       eq
+        bxeq     lr
+        lsr      r3, #16
+.endm
+
+.macro hevc_loop_filter_luma_body
+        vmovl.u8  q8, d16
+        vmovl.u8  q9, d18
+        vmovl.u8  q10, d20
+        vmovl.u8  q11, d22
+        vmovl.u8  q12, d24
+        vmovl.u8  q13, d26
+        vmovl.u8  q14, d28
+        vmovl.u8  q15, d30
+
+        vadd.i16   q7, q9, q11
+        vadd.i16   q6, q14, q12
+        vsub.i16   q7, q10
+        vsub.i16   q6, q13
+        vabd.s16   q7, q7, q10
+        vabd.s16   q6, q6, q13
+
+
+        vdup.16    q0, r2
+        vmov       q4, q7
+        vmov       q5, q6
+        vdup.16    d4, r12
+        vtrn.16    q7, q4
+        vtrn.16    q6, q5
+
+        vshl.u64   q7, #32
+        vshr.u64   q4, #32
+        vshl.u64   q6, #32
+        vshr.u64   q5, #32
+        vshr.u64   q7, #32
+        vshr.u64   q6, #32
+        vshl.u64   q5, #32
+        vshl.u64   q4, #32
+        vorr       q6, q5
+        vorr       q7, q4
+        vdup.16    d5, r3
+        vadd.i16   q5, q7, q6
+
+        vmov       q4, q5
+        vmov       q3, q5
+        vtrn.32    q3, q4
+
+        vadd.i16   q4, q3
+
+        vshl.s16   q5, q5, #1
+        vcgt.s16   q3, q0, q4
+
+        vmovn.i16  d6, q3
+        vshr.s16   q1, q0, #2
+        vmovn.i16  d6, q3
+        vcgt.s16   q5, q1, q5
+        vmov       r7, s12
+        cmp        r7, #0
+        beq        bypasswrite
+
+        vpadd.i32  d0, d14, d12
+        vpadd.i32  d1, d15, d13
+        vmov       q4, q2
+        vshl.s16   q2, #2
+        vshr.s16   q1, q1, #1
+        vrhadd.s16 q2, q4
+
+        vabd.s16   q7, q8, q11
+        vaba.s16   q7, q15, q12
+
+        vmovn.i32  d0, q0
+        vmov       r5, r6, s0, s1
+        vcgt.s16   q6, q1, q7
+        vand       q5, q5, q6
+        vabd.s16   q7, q11, q12
+        vcgt.s16   q6, q2, q7
+        vand       q5, q5, q6
+
+        vmov       q2, q5
+        vtrn.s16   q5, q2
+        vshr.u64   q2, #32
+        vshl.u64   q5, #32
+        vshl.u64   q2, #32
+        vshr.u64   q5, #32
+        vorr       q5, q2
+
+        vmov       q2, q5
+        vshl.i16   q7, q4, #1
+        vtrn.32    q2, q5
+        vand       q5, q2
+        vneg.s16   q6, q7
+        vmovn.i16  d4, q5
+        vmovn.i16  d4, q2
+        vmov       r8, s8
+
+        and        r9, r8, r7
+        cmp        r9, #0
+        beq        1f
+
+        vadd.i16  q2, q11, q12
+        vadd.i16  q4, q9, q8
+        vadd.i16  q1, q2, q10
+        vdup.16   d10, r9
+        vadd.i16  q0, q1, q9
+        vshl.i16  q4, #1
+        lsr        r9, #16
+        vadd.i16  q1, q0
+        vrshr.s16 q3, q0, #2
+        vadd.i16  q1, q13
+        vadd.i16  q4, q0
+        vsub.i16  q3, q10
+        vrshr.s16 q1, #3
+        vrshr.s16 q4, #3
+        vmax.s16  q3, q6
+        vsub.i16  q1, q11
+        vsub.i16  q4, q9
+        vmin.s16  q3, q7
+        vmax.s16  q4, q6
+        vmax.s16  q1, q6
+        vadd.i16  q3, q10
+        vmin.s16  q4, q7
+        vmin.s16  q1, q7
+        vdup.16   d11, r9
+        vadd.i16  q4, q9
+        vadd.i16  q1, q11
+        vbit      q9, q4, q5
+        vadd.i16  q4, q2, q13
+        vbit      q11, q1, q5
+        vadd.i16  q0, q4, q14
+        vadd.i16  q2, q15, q14
+        vadd.i16  q4, q0
+
+        vshl.i16  q2, #1
+        vadd.i16  q4, q10
+        vbit      q10, q3, q5
+        vrshr.s16 q4, #3
+        vadd.i16  q2, q0
+        vrshr.s16 q3, q0, #2
+        vsub.i16  q4, q12
+        vrshr.s16 q2, #3
+        vsub.i16  q3, q13
+        vmax.s16  q4, q6
+        vsub.i16  q2, q14
+        vmax.s16  q3, q6
+        vmin.s16  q4, q7
+        vmax.s16  q2, q6
+        vmin.s16  q3, q7
+        vadd.i16  q4, q12
+        vmin.s16  q2, q7
+        vadd.i16  q3, q13
+        vbit      q12, q4, q5
+        vadd.i16  q2, q14
+        vbit      q13, q3, q5
+        vbit      q14, q2, q5
+
+1:
+        mvn       r8, r8
+        and       r9, r8, r7
+        cmp       r9, #0
+        beq       2f
+
+        vdup.16    q4, r2
+
+        vdup.16   d10, r9
+        lsr       r9, #16
+        vmov       q1, q4
+        vdup.16   d11, r9
+        vshr.s16   q1, #1
+        vsub.i16  q2, q12, q11
+        vadd.i16   q4, q1
+        vshl.s16  q0, q2, #3
+        vshr.s16   q4, #3
+        vadd.i16  q2, q0
+        vsub.i16  q0, q13, q10
+        vsub.i16  q2, q0
+        vshl.i16  q0, q0, #1
+        vsub.i16  q2, q0
+        vshl.s16  q1, q7, 2
+        vrshr.s16 q2, q2, #4
+        vadd.i16  q1, q7
+        vabs.s16  q3, q2
+        vshr.s16  q6, q6, #1
+        vcgt.s16  q1, q1, q3
+        vand      q5, q1
+        vshr.s16  q7, q7, #1
+        vmax.s16  q2, q2, q6
+        vmin.s16  q2, q2, q7
+
+        vshr.s16  q7, q7, #1
+        vrhadd.s16 q3, q9, q11
+        vneg.s16  q6, q7
+        vsub.s16  q3, q10
+        vdup.16   d2, r5
+        vhadd.s16 q3, q2
+        vdup.16   d3, r6
+        vmax.s16  q3, q3, q6
+        vcgt.s16  q1, q4, q1
+        vmin.s16  q3, q3, q7
+        vand      q1, q5
+        vadd.i16  q3, q10
+        lsr       r5, #16
+        lsr       r6, #16
+        vbit      q10, q3, q1
+
+        vrhadd.s16 q3, q14, q12
+        vdup.16   d2, r5
+        vsub.s16  q3, q13
+        vdup.16   d3, r6
+        vhsub.s16 q3, q2
+        vcgt.s16  q1, q4, q1
+        vmax.s16  q3, q3, q6
+        vand      q1, q5
+        vmin.s16  q3, q3, q7
+        vadd.i16  q3, q13
+        vbit      q13, q3, q1
+        vadd.i16  q0, q11, q2
+        vsub.i16  q4, q12, q2
+        vbit      q11, q0, q5
+        vbit      q12, q4, q5
+
+2:
+        vqmovun.s16 d16, q8
+        vqmovun.s16 d18, q9
+        vqmovun.s16 d20, q10
+        vqmovun.s16 d22, q11
+        vqmovun.s16 d24, q12
+        vqmovun.s16 d26, q13
+        vqmovun.s16 d28, q14
+        vqmovun.s16 d30, q15
+.endm
+
+function ff_hevc_v_loop_filter_luma_neon, export=1
+        hevc_loop_filter_luma_start
+        push     {r5-r11}
+        vpush    {d8-d15}
+        sub      r0, #4
+        vld1.8   {d16}, [r0], r1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d20}, [r0], r1
+        vld1.8   {d22}, [r0], r1
+        vld1.8   {d24}, [r0], r1
+        vld1.8   {d26}, [r0], r1
+        vld1.8   {d28}, [r0], r1
+        vld1.8   {d30}, [r0], r1
+        sub      r0, r0, r1, lsl #3
+        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        hevc_loop_filter_luma_body
+        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        vst1.8   {d16}, [r0], r1
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d22}, [r0], r1
+        vst1.8   {d24}, [r0], r1
+        vst1.8   {d26}, [r0], r1
+        vst1.8   {d28}, [r0], r1
+        vst1.8   {d30}, [r0]
+        vpop     {d8-d15}
+        pop      {r5-r11}
+        bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon, export=1
+        hevc_loop_filter_luma_start
+        push     {r5-r11}
+        vpush    {d8-d15}
+        sub      r0, r0, r1, lsl #2
+        vld1.8  {d16}, [r0], r1
+        vld1.8  {d18}, [r0], r1
+        vld1.8  {d20}, [r0], r1
+        vld1.8  {d22}, [r0], r1
+        vld1.8  {d24}, [r0], r1
+        vld1.8  {d26}, [r0], r1
+        vld1.8  {d28}, [r0], r1
+        vld1.8  {d30}, [r0], r1
+        sub        r0, r0, r1, lsl #3
+        add        r0, r1
+        hevc_loop_filter_luma_body
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d22}, [r0], r1
+        vst1.8   {d24}, [r0], r1
+        vst1.8   {d26}, [r0], r1
+        vst1.8   {d28}, [r0]
+bypasswrite:
+        vpop     {d8-d15}
+        pop      {r5-r11}
+        bx lr
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon, export=1
+        hevc_loop_filter_chroma_start
+        sub      r0, #4
+        vld1.8   {d16}, [r0], r1
+        vld1.8   {d17}, [r0], r1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d2},  [r0], r1
+        vld1.8   {d4},  [r0], r1
+        vld1.8   {d19}, [r0], r1
+        vld1.8   {d20}, [r0], r1
+        vld1.8   {d21}, [r0], r1
+        sub      r0, r0, r1, lsl #3
+        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        hevc_loop_filter_chroma_body
+        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        vst1.8   {d16}, [r0], r1
+        vst1.8   {d17}, [r0], r1
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d2},  [r0], r1
+        vst1.8   {d4},  [r0], r1
+        vst1.8   {d19}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d21}, [r0]
+        bx       lr
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+        hevc_loop_filter_chroma_start
+        sub      r0, r0, r1, lsl #1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d2}, [r0], r1
+        vld1.8   {d4}, [r0], r1
+        vld1.8   {d19}, [r0]
+        sub      r0, r0, r1, lsl #1
+        hevc_loop_filter_chroma_body
+        vst1.8   {d2}, [r0], r1
+        vst1.8   {d4}, [r0]
+        bx       lr
+endfunc
diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevcdsp_idct_neon.S
index 79799b2..75795e6 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -1,23 +1,22 @@
 /*
  * ARM NEON optimised IDCT functions for HEVC decoding
- *
  * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
  * Copyright (c) 2017 Alexandra Hájková
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -982,3 +981,63 @@ idct_32x32 8
 idct_32x32_dc 8
 idct_32x32 10
 idct_32x32_dc 10
+
+/* uses registers q2 - q9 for temp values */
+/* TODO: reorder */
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
+        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
+        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
+        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1
+
+        vaddl.s16   q7, \r0, \r3    // src0 + src3
+        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
+        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)
+
+        vmul.s32    q8, q5, d0[1]   // 29 * c0
+        vmul.s32    q9, q2, d1[0]   // 55 * c1
+        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
+        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3
+
+        vmul.s32    q2, q2, d0[1]   // 29 * c1
+        vmul.s32    q9, q4, d1[0]   // 55 * c2
+        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
+        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3
+
+        vmul.s32    q5, q5, d1[0]   // 55 * c0
+        vmul.s32    q4, q4, d0[1]   // 29 * c2
+        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
+        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3
+
+        vqrshrn.s32   \r0, q8, \shift
+        vqrshrn.s32   \r1, q9, \shift
+        vqrshrn.s32   \r2, q7, \shift
+        vqrshrn.s32   \r3, q5, \shift
+.endm
+
+.ltorg
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        vpush       {d8-d15}
+        vld1.16     {q14, q15}, [r0]  // coeffs
+        ldr         r3, =0x4a  // 74
+        vmov.32     d0[0], r3
+        ldr         r3, =0x1d  // 29
+        vmov.32     d0[1], r3
+        ldr         r3, =0x37  // 55
+        vmov.32     d1[0], r3
+
+        tr4_luma_shift d28, d29, d30, d31, #7
+
+        vtrn.16     d28, d29
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        tr4_luma_shift d28, d29, d30, d31, #12
+
+        vtrn.16     d28, d29
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+        vst1.16     {q14, q15}, [r0]
+        vpop        {d8-d15}
+        bx lr
+endfunc
diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c
index 60c211d..e8fa1f7 100644
--- a/libavcodec/arm/hevcdsp_init_arm.c
+++ b/libavcodec/arm/hevcdsp_init_arm.c
@@ -1,21 +1,20 @@
 /*
- * ARM NEON optimised HEVC IDCT
- * Copyright (c) 2017 Alexandra Hájková
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,146 +23,12 @@
 #include "libavutil/arm/cpu.h"
 
 #include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
 
-
-void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
-
-void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
-void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
-void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
-void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs);
-void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
-void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
-void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
-void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
-
-void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
-void ff_hevc_idct_32x32_10_neon(int16_t *coeffs, int col_limit);
-
-void ff_hevc_get_pixels_4_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                 ptrdiff_t srcstride, int height, int mx, int my,
-                                 int16_t *mcbuffer);
-void ff_hevc_get_pixels_4_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_8_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                 ptrdiff_t srcstride, int height, int mx, int my,
-                                 int16_t *mcbuffer);
-void ff_hevc_get_pixels_8_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_12_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_12_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height, int mx, int my,
-                                   int16_t *mcbuffer);
-void ff_hevc_get_pixels_16_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_16_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height, int mx, int my,
-                                   int16_t *mcbuffer);
-void ff_hevc_get_pixels_24_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_24_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height, int mx, int my,
-                                   int16_t *mcbuffer);
-void ff_hevc_get_pixels_32_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_32_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height, int mx, int my,
-                                   int16_t *mcbuffer);
-void ff_hevc_get_pixels_48_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_48_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height, int mx, int my,
-                                   int16_t *mcbuffer);
-void ff_hevc_get_pixels_64_8_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                  ptrdiff_t srcstride, int height, int mx, int my,
-                                  int16_t *mcbuffer);
-void ff_hevc_get_pixels_64_10_neon(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height, int mx, int my,
-                                   int16_t *mcbuffer);
-
-av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, int bit_depth)
+av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_neon(cpu_flags)) {
-        if (bit_depth == 8) {
-            c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
-            c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
-            c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
-            c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
-
-            c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
-            c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
-            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
-            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
-
-            c->idct[0] = ff_hevc_idct_4x4_8_neon;
-            c->idct[1] = ff_hevc_idct_8x8_8_neon;
-            c->idct[2] = ff_hevc_idct_16x16_8_neon;
-            c->idct[3] = ff_hevc_idct_32x32_8_neon;
-
-            c->put_hevc_qpel[0][0][0] = ff_hevc_get_pixels_4_8_neon;
-            c->put_hevc_qpel[0][0][1] = ff_hevc_get_pixels_8_8_neon;
-            c->put_hevc_qpel[0][0][2] = ff_hevc_get_pixels_12_8_neon;
-            c->put_hevc_qpel[0][0][3] = ff_hevc_get_pixels_16_8_neon;
-            c->put_hevc_qpel[0][0][4] = ff_hevc_get_pixels_24_8_neon;
-            c->put_hevc_qpel[0][0][5] = ff_hevc_get_pixels_32_8_neon;
-            c->put_hevc_qpel[0][0][6] = ff_hevc_get_pixels_48_8_neon;
-            c->put_hevc_qpel[0][0][7] = ff_hevc_get_pixels_64_8_neon;
-        }
-        if (bit_depth == 10) {
-            c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
-            c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
-            c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
-            c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon;
-
-            c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon;
-            c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon;
-            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon;
-            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;
-
-            c->idct[0] = ff_hevc_idct_4x4_10_neon;
-            c->idct[1] = ff_hevc_idct_8x8_10_neon;
-            c->idct[2] = ff_hevc_idct_16x16_10_neon;
-            c->idct[3] = ff_hevc_idct_32x32_10_neon;
-
-            c->put_hevc_qpel[0][0][0] = ff_hevc_get_pixels_4_10_neon;
-            c->put_hevc_qpel[0][0][1] = ff_hevc_get_pixels_8_10_neon;
-            c->put_hevc_qpel[0][0][2] = ff_hevc_get_pixels_12_10_neon;
-            c->put_hevc_qpel[0][0][3] = ff_hevc_get_pixels_16_10_neon;
-            c->put_hevc_qpel[0][0][4] = ff_hevc_get_pixels_24_10_neon;
-            c->put_hevc_qpel[0][0][5] = ff_hevc_get_pixels_32_10_neon;
-            c->put_hevc_qpel[0][0][6] = ff_hevc_get_pixels_48_10_neon;
-            c->put_hevc_qpel[0][0][7] = ff_hevc_get_pixels_64_10_neon;
-        }
-    }
+    if (have_neon(cpu_flags))
+        ff_hevc_dsp_init_neon(c, bit_depth);
 }
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
new file mode 100644
index 0000000..201a088
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "libavcodec/avcodec.h"
+#include "hevcdsp_arm.h"
+
+void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height);
+void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+                                  int eo, int width, int height);
+
+void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
+void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+
+#define PUT_PIXELS(name) \
+    void name(int16_t *dst, uint8_t *src, \
+                                ptrdiff_t srcstride, int height, \
+                                intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, int width);
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+#define QPEL_FUNC(name) \
+    void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+                                   int height, int width)
+
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW_PIX(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+                                   int height, intptr_t mx, intptr_t my, int width);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
+#undef QPEL_FUNC_UW_PIX
+
+#define QPEL_FUNC_UW(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_sao_band_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int width, int height, int16_t *offset_table);
+
+void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height) {
+    uint8_t *dst = _dst;
+    uint8_t *src = _src;
+    int16_t offset_table[32] = {0};
+    int k;
+
+    for (k = 0; k < 4; k++) {
+        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+    }
+
+    ff_hevc_sao_band_filter_neon_8(dst, src, stride_dst, stride_src, width, height, offset_table);
+}
+
+void ff_hevc_sao_edge_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int width, int height,
+                                    int a_stride, int b_stride, int16_t *sao_offset_val, uint8_t *edge_idx);
+
+void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+                                  int eo, int width, int height) {
+    static uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    static const int8_t pos[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    uint8_t *dst = _dst;
+    uint8_t *src = _src;
+    int a_stride, b_stride;
+    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+
+    ff_hevc_sao_edge_filter_neon_8(dst, src, stride_dst, stride_src, width, height, a_stride, b_stride, sao_offset_val, edge_idx);
+}
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width) {
+
+    put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width) {
+
+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width) {
+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+    if (bit_depth == 8) {
+        int x;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
+        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
+        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
+        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_neon_8_wrapper;
+        c->sao_band_filter[1]          = ff_hevc_sao_band_filter_neon_8_wrapper;
+        c->sao_band_filter[2]          = ff_hevc_sao_band_filter_neon_8_wrapper;
+        c->sao_band_filter[3]          = ff_hevc_sao_band_filter_neon_8_wrapper;
+        c->sao_band_filter[4]          = ff_hevc_sao_band_filter_neon_8_wrapper;
+        c->sao_edge_filter[0]          = ff_hevc_sao_edge_filter_neon_8_wrapper;
+        c->sao_edge_filter[1]          = ff_hevc_sao_edge_filter_neon_8_wrapper;
+        c->sao_edge_filter[2]          = ff_hevc_sao_edge_filter_neon_8_wrapper;
+        c->sao_edge_filter[3]          = ff_hevc_sao_edge_filter_neon_8_wrapper;
+        c->sao_edge_filter[4]          = ff_hevc_sao_edge_filter_neon_8_wrapper;
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->idct[0]                     = ff_hevc_idct_4x4_8_neon;
+        c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
+        c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
+        c->idct[3]                     = ff_hevc_idct_32x32_8_neon;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
+        put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
+        put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
+        put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
+        put_hevc_qpel_neon[0][1]       = ff_hevc_put_qpel_h1_neon_8;
+        put_hevc_qpel_neon[0][2]       = ff_hevc_put_qpel_h2_neon_8;
+        put_hevc_qpel_neon[0][3]       = ff_hevc_put_qpel_h3_neon_8;
+        put_hevc_qpel_neon[1][1]       = ff_hevc_put_qpel_h1v1_neon_8;
+        put_hevc_qpel_neon[1][2]       = ff_hevc_put_qpel_h2v1_neon_8;
+        put_hevc_qpel_neon[1][3]       = ff_hevc_put_qpel_h3v1_neon_8;
+        put_hevc_qpel_neon[2][1]       = ff_hevc_put_qpel_h1v2_neon_8;
+        put_hevc_qpel_neon[2][2]       = ff_hevc_put_qpel_h2v2_neon_8;
+        put_hevc_qpel_neon[2][3]       = ff_hevc_put_qpel_h3v2_neon_8;
+        put_hevc_qpel_neon[3][1]       = ff_hevc_put_qpel_h1v3_neon_8;
+        put_hevc_qpel_neon[3][2]       = ff_hevc_put_qpel_h2v3_neon_8;
+        put_hevc_qpel_neon[3][3]       = ff_hevc_put_qpel_h3v3_neon_8;
+        put_hevc_qpel_uw_neon[1][0]      = ff_hevc_put_qpel_uw_v1_neon_8;
+        put_hevc_qpel_uw_neon[2][0]      = ff_hevc_put_qpel_uw_v2_neon_8;
+        put_hevc_qpel_uw_neon[3][0]      = ff_hevc_put_qpel_uw_v3_neon_8;
+        put_hevc_qpel_uw_neon[0][1]      = ff_hevc_put_qpel_uw_h1_neon_8;
+        put_hevc_qpel_uw_neon[0][2]      = ff_hevc_put_qpel_uw_h2_neon_8;
+        put_hevc_qpel_uw_neon[0][3]      = ff_hevc_put_qpel_uw_h3_neon_8;
+        put_hevc_qpel_uw_neon[1][1]      = ff_hevc_put_qpel_uw_h1v1_neon_8;
+        put_hevc_qpel_uw_neon[1][2]      = ff_hevc_put_qpel_uw_h2v1_neon_8;
+        put_hevc_qpel_uw_neon[1][3]      = ff_hevc_put_qpel_uw_h3v1_neon_8;
+        put_hevc_qpel_uw_neon[2][1]      = ff_hevc_put_qpel_uw_h1v2_neon_8;
+        put_hevc_qpel_uw_neon[2][2]      = ff_hevc_put_qpel_uw_h2v2_neon_8;
+        put_hevc_qpel_uw_neon[2][3]      = ff_hevc_put_qpel_uw_h3v2_neon_8;
+        put_hevc_qpel_uw_neon[3][1]      = ff_hevc_put_qpel_uw_h1v3_neon_8;
+        put_hevc_qpel_uw_neon[3][2]      = ff_hevc_put_qpel_uw_h2v3_neon_8;
+        put_hevc_qpel_uw_neon[3][3]      = ff_hevc_put_qpel_uw_h3v3_neon_8;
+        for (x = 0; x < 10; x++) {
+            c->put_hevc_qpel[x][1][0]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][0][1]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][1][1]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][0]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][0][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+        }
+        c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+        c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+        c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+        c->put_hevc_qpel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
+        c->put_hevc_qpel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
+        c->put_hevc_qpel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
+        c->put_hevc_qpel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
+        c->put_hevc_qpel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
+        c->put_hevc_qpel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
+        c->put_hevc_qpel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
+
+        c->put_hevc_qpel_uni[1][0][0]  = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
+        c->put_hevc_qpel_uni[3][0][0]  = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+        c->put_hevc_qpel_uni[5][0][0]  = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+        c->put_hevc_qpel_uni[6][0][0]  = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+        c->put_hevc_qpel_uni[7][0][0]  = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+        c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+        c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+    }
+
+    if (bit_depth == 10) {
+        c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
+        c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
+        c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
+        c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon;
+
+        c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon;
+        c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon;
+        c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon;
+        c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;
+
+        c->idct[0] = ff_hevc_idct_4x4_10_neon;
+        c->idct[1] = ff_hevc_idct_8x8_10_neon;
+        c->idct[2] = ff_hevc_idct_16x16_10_neon;
+        c->idct[3] = ff_hevc_idct_32x32_10_neon;
+    }
+}
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000..caa6efa
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,999 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro regshuffle_d8
+    vmov d16, d17
+    vmov d17, d18
+    vmov d18, d19
+    vmov d19, d20
+    vmov d20, d21
+    vmov d21, d22
+    vmov d22, d23
+.endm
+
+.macro regshuffle_q8
+    vmov q0, q1
+    vmov q1, q2
+    vmov q2, q3
+    vmov q3, q4
+    vmov q4, q5
+    vmov q5, q6
+    vmov q6, q7
+.endm
+
+.macro vextin8
+        pld       [r2]
+        vld1.8    {q11}, [r2], r3
+        vext.8    d16, d22, d23, #1
+        vext.8    d17, d22, d23, #2
+        vext.8    d18, d22, d23, #3
+        vext.8    d19, d22, d23, #4
+        vext.8    d20, d22, d23, #5
+        vext.8    d21, d22, d23, #6
+        vext.8    d22, d22, d23, #7
+.endm
+
+.macro loadin8
+        pld       [r2]
+        vld1.8    {d16}, [r2], r3
+        pld       [r2]
+        vld1.8    {d17}, [r2], r3
+        pld       [r2]
+        vld1.8    {d18}, [r2], r3
+        pld       [r2]
+        vld1.8    {d19}, [r2], r3
+        pld       [r2]
+        vld1.8    {d20}, [r2], r3
+        pld       [r2]
+        vld1.8    {d21}, [r2], r3
+        pld       [r2]
+        vld1.8    {d22}, [r2], r3
+        pld       [r2]
+        vld1.8    {d23}, [r2], r3
+.endm
+
+.macro qpel_filter_1_32b
+        vmov.i16   d16, #58
+        vmov.i16   d17, #10
+        vmull.s16   q9, d6, d16   // 58 * d0
+        vmull.s16  q10, d7, d16   // 58 * d1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d4, d17   // 10 * c0
+        vmull.s16  q12, d5, d17   // 10 * c1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d8, d16   // 17 * e0
+        vmull.s16  q14, d9, d16   // 17 * e1
+        vmull.s16  q15, d10, d17  //  5 * f0
+        vmull.s16   q8, d11, d17  //  5 * f1
+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
+        vshll.s16  q11, d2, #2    // 4 * b0
+        vshll.s16  q12, d3, #2    // 4 * b1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
+        vsubl.s16  q13, d12, d0   // g0 - a0
+        vsubl.s16  q14, d13, d1   // g1 - a1
+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+        vqshrn.s32  d16, q9, #6
+        vqshrn.s32  d17, q10, #6
+.endm
+
+// input  q0 - q7
+// output q8
+.macro qpel_filter_2_32b
+        vmov.i32   q8, #11
+        vaddl.s16   q9, d6, d8   // d0 + e0
+        vaddl.s16  q10, d7, d9   // d1 + e1
+        vaddl.s16  q11, d4, d10  // c0 + f0
+        vaddl.s16  q12, d5, d11  // c1 + f1
+        vmul.s32   q11, q8       // 11 * (c0 + f0)
+        vmul.s32   q12, q8       // 11 * (c1 + f1)
+        vmov.i32   q8, #40
+        vaddl.s16  q15, d2, d12  // b0 + g0
+        vmul.s32    q9, q8       // 40 * (d0 + e0)
+        vmul.s32   q10, q8       // 40 * (d1 + e1)
+        vaddl.s16   q8, d3, d13  // b1 + g1
+        vaddl.s16  q13, d0, d14  // a0 + h0
+        vaddl.s16  q14, d1, d15  // a1 + h1
+        vshl.s32   q15, #2       // 4*(b0+g0)
+        vshl.s32    q8, #2       // 4*(b1+g1)
+        vadd.s32   q11, q13      // 11 * (c0 + f0) + a0 + h0
+        vadd.s32   q12, q14      // 11 * (c1 + f1) + a1 + h1
+        vadd.s32   q9, q15       // 40 * (d0 + e0) + 4*(b0+g0)
+        vadd.s32   q10, q8       // 40 * (d1 + e1) + 4*(b1+g1)
+        vsub.s32   q9, q11       // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+        vsub.s32   q10, q12      // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+        vqshrn.s32  d16, q9, #6
+        vqshrn.s32  d17, q10, #6
+.endm
+
+.macro qpel_filter_3_32b
+        vmov.i16   d16, #58
+        vmov.i16   d17, #10
+        vmull.s16   q9, d8, d16   // 58 * d0
+        vmull.s16  q10, d9, d16   // 58 * d1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d10, d17  // 10 * c0
+        vmull.s16  q12, d11, d17  // 10 * c1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d6, d16   // 17 * e0
+        vmull.s16  q14, d7, d16   // 17 * e1
+        vmull.s16  q15, d4, d17   //  5 * f0
+        vmull.s16   q8, d5, d17   //  5 * f1
+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
+        vshll.s16  q11, d12, #2   // 4 * b0
+        vshll.s16  q12, d13, #2   // 4 * b1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
+        vsubl.s16  q13, d2, d14   // g0 - a0
+        vsubl.s16  q14, d3, d15   // g1 - a1
+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+        vqshrn.s32  d16, q9, #6
+        vqshrn.s32  d17, q10, #6
+.endm
+
+.macro qpel_filter_1 out=q7
+        vmov.u8    d24, #58
+        vmov.u8    d25, #10
+        vshll.u8   q13, d20, #4   // 16*e
+        vshll.u8   q14, d21, #2   // 4*f
+        vmull.u8  \out, d19, d24  // 58*d
+        vaddw.u8   q13, q13, d20  // 17*e
+        vmull.u8   q15, d18, d25  // 10*c
+        vaddw.u8   q14, q14, d21  // 5*f
+        vsubl.u8   q12, d22, d16  // g - a
+        vadd.u16  \out, q13       // 58d + 17e
+        vshll.u8   q13, d17, #2   // 4*b
+        vadd.u16   q15, q14       // 10*c + 5*f
+        vadd.s16   q13, q12       // - a + 4*b + g
+        vsub.s16  \out, q15       // -10*c + 58*d + 17*e -5*f
+        vadd.s16  \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro qpel_filter_2 out=q7
+        vmov.i16   q12, #10
+        vmov.i16   q14, #11
+        vaddl.u8   q13, d19, d20   // d + e
+        vaddl.u8   q15, d18, d21   // c + f
+        vmul.u16   q13, q12        // 10 * (d+e)
+        vmul.u16   q15, q14        // 11 * ( c + f)
+        vaddl.u8  \out, d17, d22   // b + g
+        vaddl.u8   q12, d16, d23   // a + h
+        vadd.u16  \out, q13        // b + 10 * (d + e) + g
+        vadd.s16   q12, q15
+        vshl.u16  \out, #2         // 4 * (b + 10 * (d + e) + g)
+        vsub.s16  \out, q12
+.endm
+
+.macro qpel_filter_3 out=q7
+        vmov.u8    d24, #58
+        vmov.u8    d25, #10
+        vshll.u8   q13, d19, #4     // 16*e
+        vshll.u8   q14, d18, #2     // 4*f
+        vmull.u8  \out, d20, d24    // 58*d
+        vaddw.u8   q13, q13, d19    // 17*e
+        vmull.u8   q15, d21, d25    // 10*c
+        vaddw.u8   q14, q14, d18    // 5*f
+        vsubl.u8   q12, d17, d23    // g - a
+        vadd.u16  \out, q13         // 58d + 17e
+        vshll.u8   q13, d22, #2     // 4*b
+        vadd.u16   q15, q14         // 10*c + 5*f
+        vadd.s16   q13, q12         // - a + 4*b + g
+        vsub.s16  \out, q15         // -10*c + 58*d + 17*e -5*f
+        vadd.s16  \out, q13         // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro  hevc_put_qpel_vX_neon_8 filter
+        push   {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+        vpush {d8-d15}
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        lsl       r1, #1
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vst1.16    {q7}, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #16
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vst1.16    d14, [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro  hevc_put_qpel_uw_vX_neon_8 filter
+        push   {r4-r10}
+        ldr    r5, [sp, #28] // width
+        ldr    r4, [sp, #32] // height
+        ldr    r8, [sp, #36] // src2
+        ldr    r9, [sp, #40] // src2stride
+        vpush {d8-d15}
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r8, #0
+        bne       .Lbi\@
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.8    d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32    d0[0], [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+        b   99f
+.Lbi\@: lsl       r9, #1
+        mov       r10, r8
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vld1.16        {q0}, [r8], r9
+        vqadd.s16      q0, q7
+        vqrshrun.s16   d0, q0, #7
+        vst1.8         d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vld1.16      d0, [r8], r9
+        vqadd.s16    d0, d14
+        vqrshrun.s16 d0, q0, #7
+        vst1.32      d0[0], [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_v1_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter
+        push     {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush    {d8-d15}
+        sub       r2, #4
+        lsl       r1, #1
+        mov      r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r5, #4
+        beq       4f
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vst1.16   {q7}, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #16
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vst1.16  d14, [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter
+        push     {r4-r10}
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush    {d8-d15}
+        sub       r2, #4
+        mov      r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r8, #0
+        bne       .Lbi\@
+        cmp       r5, #4
+        beq       4f
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.8    d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32  d0[0], [r0], r1
+        bne       4b
+        b         99f
+.Lbi\@:
+        lsl       r9, #1
+        cmp       r5, #4
+        beq       4f
+        mov       r10, r8
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16        {q0}, [r8], r9
+        vqadd.s16      q0, q7
+        vqrshrun.s16   d0, q0, #7
+        vst1.8         d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8
+        add       r10, #16
+        mov       r8, r10
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16      d0, [r8], r9
+        vqadd.s16    d0, d14
+        vqrshrun.s16 d0, q0, #7
+        vst1.32      d0[0], [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
+        push   {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush {d8-d15}
+        sub       r2, #4
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        lsl       r1, #1
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vst1.16    {q8}, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #16
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vst1.16    d16, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
+        push     {r4-r10}
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush {d8-d15}
+        sub       r2, #4
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r8, #0
+        bne       .Lbi\@
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vqrshrun.s16   d0, q8, #6
+        vst1.8    d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vqrshrun.s16   d0, q8, #6
+        vst1.32        d0[0], [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+        b   99f
+.Lbi\@: lsl      r9, #1
+        mov      r10, r8
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vld1.16        {q0}, [r8], r9
+        vqadd.s16      q0, q8
+        vqrshrun.s16   d0, q0, #7
+        vst1.8         d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vld1.16      d0, [r8], r9
+        vqadd.s16    d0, d16
+        vqrshrun.s16 d0, q0, #7
+        vst1.32      d0[0], [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1, qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2, qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3, qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1, qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2, qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3, qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1, qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2, qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3, qpel_filter_3_32b
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1, qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2, qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3, qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1, qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2, qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3, qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1, qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2, qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3, qpel_filter_3_32b
+endfunc
+
+.macro init_put_pixels
+        pld    [r1]
+        pld    [r1, r2]
+        mov    r12, MAX_PB_SIZE
+        lsl    r12, #1
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1
+        init_put_pixels
+        vmov.u8      d5, #255
+        vshr.u64     d5, #32
+0:      subs r3, #1
+        vld1.32     {d0[0]}, [r1], r2
+        pld [r1]
+        vld1.32     d6, [r0]
+        vshll.u8    q0, d0, #6
+        vbit        d6, d0, d5
+        vst1.32     d6, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.32   {d0[0]}, [r1], r2
+        vld1.32   {d0[1]}, [r1], r2
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8   q0, d0, #6
+        vst1.64   {d0}, [r0], r12
+        vst1.64   {d1}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1
+        init_put_pixels
+        vmov.u8      q10, #255
+        vshr.u64     d21, #32
+0:      subs r3, #1
+        vld1.16     {d0}, [r1], r2
+        pld [r1]
+        vshll.u8    q0, d0, #6
+        vld1.8      {q12}, [r0]
+        vbit        q12, q0, q10
+        vst1.8      {q12}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.8   {d0}, [r1], r2
+        vld1.8   {d2}, [r1], r2
+        pld        [r1]
+        pld        [r1, r2]
+        vshll.u8   q0, d0, #6
+        vshll.u8   q1, d2, #6
+        vst1.16   {q0}, [r0], r12
+        vst1.16   {q1}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.64    {d0}, [r1]
+        add       r1, #8
+        vld1.32   {d1[0]}, [r1], r2
+        sub       r1, #8
+        vld1.64    {d2}, [r1]
+        add       r1, #8
+        vld1.32   {d1[1]}, [r1], r2
+        sub       r1, #8
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vmov      d22, d19
+        vst1.64   {d16, d17, d18}, [r0], r12
+        vst1.64   {d20, d21, d22}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.8   {q0}, [r1], r2
+        vld1.8   {q1}, [r1], r2
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vst1.8    {q8, q9}, [r0], r12
+        vst1.8    {q10, q11}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8   {d0, d1, d2}, [r1], r2
+        pld       [r1]
+        vshll.u8  q10, d0, #6
+        vshll.u8  q11, d1, #6
+        vshll.u8  q12, d2, #6
+        vstm     r0, {q10, q11, q12}
+        add      r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8 {q0, q1}, [r1], r2
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vstm    r0, {q8, q9, q10, q11}
+        add     r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8    {q0, q1}, [r1]
+        add r1, #32
+        vld1.8    {q2}, [r1], r2
+        sub r1, #32
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vshll.u8  q12, d4, #6
+        vshll.u8  q13, d5, #6
+        vstm r0, {q8, q9, q10, q11, q12, q13}
+        add  r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8    {q0, q1}, [r1]
+        add      r1, #32
+        vld1.8    {q2, q3}, [r1], r2
+        sub      r1, #32
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vshll.u8  q12, d4, #6
+        vshll.u8  q13, d5, #6
+        vshll.u8  q14, d6, #6
+        vshll.u8  q15, d7, #6
+        vstm    r0, {q8, q9, q10, q11, q12, q13, q14, q15}
+        add r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_qpel_uw_pixels_neon_8, export=1
+        push   {r4-r9}
+        ldr    r5, [sp, #24] // width
+        ldr    r4, [sp, #28] // height
+        ldr    r8, [sp, #32] // src2
+        ldr    r9, [sp, #36] // src2stride
+        vpush {d8-d15}
+        cmp    r8, #0
+        bne    2f
+1:      subs r4, #1
+        vld1.8     {d0}, [r2], r3
+        vst1.8      d0, [r0], r1
+        bne 1b
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+2:      subs  r4, #1
+        vld1.8         {d0}, [r2], r3
+        vld1.16        {q1}, [r8], r9
+        vshll.u8       q0, d0, #6
+        vqadd.s16      q0, q1
+        vqrshrun.s16   d0, q0, #7
+        vst1.8      d0, [r0], r1
+        bne 2b
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+endfunc
+
+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+        ldr    r12, [sp] // height
+1:      subs   r12, #4
+        vld1.32     {\regs}  , [r2], r3
+        vld1.32     {\regs2} , [r2], r3
+        vld1.32     {\regs3} , [r2], r3
+        vld1.32     {\regs4} , [r2], r3
+        vst1.32     {\regs}  , [r0], r1
+        vst1.32     {\regs2} , [r0], r1
+        vst1.32     {\regs3} , [r0], r1
+        vst1.32     {\regs4} , [r0], r1
+        bne 1b
+        bx lr
+endfunc
+.endm
+
+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+        push   {r4-r5}
+        ldr    r12, [sp, #8] // height
+1:      subs r12, #2
+        mov      r4, r2
+        vld1.32   {\regs} , [r2]!
+        vld1.32   {\regs2} , [r2]
+        add      r2, r4, r3
+        mov      r4, r2
+        vld1.32   {\regs3} , [r2]!
+        vld1.32   {\regs4} , [r2]
+        add      r2, r4, r3
+        mov      r5, r0
+        vst1.32   {\regs} , [r0]!
+        vst1.32   {\regs2} , [r0]
+        add      r0, r5, r1
+        mov      r5, r0
+        vst1.32   {\regs3} , [r0]!
+        vst1.32   {\regs4} , [r0]
+        add      r0, r5, r1
+        bne 1b
+        pop   {r4-r5}
+        bx lr
+endfunc
+.endm
+
+put_qpel_uw_pixels    4, d0[0], d0[1], d1[0], d1[1]
+put_qpel_uw_pixels    8, d0,    d1,    d2,    d3
+put_qpel_uw_pixels_m 12, d0,    d1[0], d2,    d3[0]
+put_qpel_uw_pixels   16, q0,    q1,    q2,    q3
+put_qpel_uw_pixels   24, d0-d2, d3-d5, d16-d18, d19-d21
+put_qpel_uw_pixels   32, q0-q1, q2-q3, q8-q9, q10-q11
+put_qpel_uw_pixels_m 48, q0-q1, q2,    q8-q9, q10
+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11
diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
new file mode 100644
index 0000000..3471679
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_sao_neon.S
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2017 Meng Wang <wangmeng.kids@bytedance.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_hevc_sao_band_filter_neon_8, export=1
+        push    {r4-r10}
+        ldr     r5,  [sp, #28]   // width
+        ldr     r4,  [sp, #32]   // height
+        ldr     r8,  [sp, #36]   // offset_table
+        vpush   {d8-d15}
+        mov     r12,  r4         // r12 = height
+        mov     r6,   r0         // r6 = r0 = dst
+        mov     r7,   r1         // r7 = r1 = src
+        vldm    r8,   {q0-q3}
+        vmov.u16    q15,  #1
+        vmov.u8     q14,  #32
+0:      pld      [r1]
+        vld1.8   {d16},  [r1], r3
+        cmp      r5,    #4
+        beq      4f
+8:      subs     r4,    #1
+        vshr.u8  d17,   d16,  #3   // index = [src>>3]
+        vshll.u8 q9,    d17,  #1   // lowIndex = 2*index
+        vadd.u16 q11,   q9,   q15  // highIndex = (2*index+1) << 8
+        vshl.u16 q10,   q11,  #8   // q10: highIndex;  q9: lowIndex;
+        vadd.u16 q10,   q9         // combine high and low index;
+        // Look-up Table Round 1; index range: 0-15
+        vtbx.8   d24,   {q0-q1},   d20
+        vtbx.8   d25,   {q0-q1},   d21
+        // Look-up Table Round 2; index range: 16-31
+        vsub.u8  q10,   q14        // Look-up with 8bit
+        vtbx.8   d24,   {q2-q3},   d20
+        vtbx.8   d25,   {q2-q3},   d21
+        vaddw.u8 q13,   q12,       d16
+        vqmovun.s16      d8,         q13
+        vst1.8    d8,   [r0],      r2
+        vld1.8   {d16}, [r1],      r3
+        bne      8b
+        subs     r5,    #8
+        beq      99f
+        mov      r4,    r12
+        add r6, #8
+        mov r0, r6
+        add r7, #8
+        mov r1, r7
+        b        0b
+4:      subs     r4,    #1
+        vshr.u8  d17,   d16,  #3  // src>>3
+        vshll.u8 q9,    d17,  #1   // lowIndex = 2*index
+        vadd.u16 q11,   q9,   q15  // highIndex = (2*index+1) << 8
+        vshl.u16 q10,   q11,  #8   // q10: highIndex;  q9: lowIndex;
+        vadd.u16 q10,   q9         // combine high and low index;
+        // Look-up Table Round 1; index range: 0-15
+        vtbx.8   d24,   {q0-q1},   d20
+        vtbx.8   d25,   {q0-q1},   d21
+        // Look-up Table Round 2; index range: 16-32
+        vsub.u8  q10,   q14        // Look-up with 8bit
+        vtbx.8   d24,   {q2-q3},   d20
+        vtbx.8   d25,   {q2-q3},   d21
+        vaddw.u8 q13,   q12,       d16
+        vqmovun.s16     d14,       q13
+        vst1.32   d14[0],    [r0],     r2
+        vld1.32   {d16[0]},  [r1],     r3
+        bne      4b
+        b        99f
+99:
+        vpop {d8-d15}
+        pop  {r4-r10}
+        bx   lr
+endfunc
+
+function ff_hevc_sao_edge_filter_neon_8, export=1
+        push    {r4-r11}
+        ldr     r5,  [sp, #32]   // width
+        ldr     r4,  [sp, #36]   // height
+        ldr     r8,  [sp, #40]   // a_stride
+        ldr     r9,  [sp, #44]   // b_stride
+        ldr     r10, [sp, #48]   // sao_offset_val
+        ldr     r11, [sp, #52]   // edge_idx
+        vpush   {d8-d15}
+        mov     r12,  r4         // r12 = height
+        mov     r6,   r0         // r6 = r0 = dst
+        mov     r7,   r1         // r7 = r1 = src
+        vld1.8  {d0}, [r11]      // edge_idx tabel load in d0 5x8bit
+        vld1.16 {q1}, [r10]      // sao_offset_val table load in q1, 5x16bit
+        vmov.u8  d1,  #2
+        vmov.u16 q2,  #1
+0:      mov      r10,    r1
+        add      r10,    r8           // src[x + a_stride]
+        mov      r11,    r1
+        add      r11,    r9           // src[x + b_stride]
+        pld      [r1]
+        vld1.8   {d16},  [r1],  r3    // src[x]  8x8bit
+        vld1.8   {d17},  [r10], r3    // src[x + a_stride]
+        vld1.8   {d18},  [r11], r3    // src[x + b_stride]
+        cmp      r5,     #4
+        beq      4f
+8:      subs     r4,     #1
+        vcgt.u8  d8,     d16,   d17
+        vshr.u8  d9,     d8,    #7
+        vclt.u8  d8,     d16,   d17
+        vadd.u8  d8,     d9           // diff0
+        vcgt.u8  d10,    d16,   d18
+        vshr.u8  d11,    d10,   #7
+        vclt.u8  d10,    d16,   d18
+        vadd.u8  d10,    d11          // diff1
+        vadd.s8  d8,     d10
+        vadd.s8  d8,     d1
+        vtbx.8   d9,     {d0},  d8    // offset_val
+        vshll.u8 q6,     d9,    #1    // lowIndex
+        vadd.u16 q7,     q6,    q2
+        vshl.u16 q10,    q7,    #8    // highIndex
+        vadd.u16 q10,    q6           // combine lowIndex and highIndex, offset_val
+        vtbx.8   d22,    {q1},  d20
+        vtbx.8   d23,    {q1},  d21
+        vaddw.u8 q12,    q11,   d16
+        vqmovun.s16      d26,   q12
+        vst1.8   d26,    [r0],  r2
+        vld1.8   {d16},  [r1],  r3    // src[x]  8x8bit
+        vld1.8   {d17},  [r10], r3    // src[x + a_stride]
+        vld1.8   {d18},  [r11], r3    // src[x + b_stride]
+        bne      8b
+        subs     r5,     #8
+        beq      99f
+        mov      r4,     r12
+        add      r6,     #8
+        mov      r0,     r6
+        add      r7,     #8
+        mov      r1,     r7
+        b        0b
+4:      subs     r4,    #1
+        vcgt.u8  d8,     d16,   d17
+        vshr.u8  d9,     d8,    #7
+        vclt.u8  d8,     d16,   d17
+        vadd.u8  d8,     d9           // diff0
+        vcgt.u8  d10,    d16,   d18
+        vshr.u8  d11,    d10,   #7
+        vclt.u8  d10,    d16,   d18
+        vadd.u8  d10,    d11          // diff1
+        vadd.s8  d8,     d10
+        vadd.s8  d8,     d1
+        vtbx.8   d9,     {d0},  d8    // offset_val
+        vshll.u8 q6,     d9,    #1    // lowIndex
+        vadd.u16 q7,     q6,    q2
+        vshl.u16 q10,    q7,    #8    // highIndex
+        vadd.u16 q10,    q6           // combine lowIndex and highIndex, offset_val
+        vtbx.8   d22,    {q1},  d20
+        vtbx.8   d23,    {q1},  d21
+        vaddw.u8 q12,    q11,   d16
+        vqmovun.s16      d26,   q12
+        vst1.32  d26[0], [r0],  r2
+        vld1.32   {d16[0]},  [r1],  r3
+        vld1.32   {d17[0]},  [r10], r3    // src[x + a_stride]
+        vld1.32   {d18[0]},  [r11], r3    // src[x + b_stride]
+        bne      4b
+        b        99f
+99:
+        vpop {d8-d15}
+        pop  {r4-r11}
+        bx   lr
+endfunc
diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S
index 6eb4837..6f3e3fb 100644
--- a/libavcodec/arm/hpeldsp_arm.S
+++ b/libavcodec/arm/hpeldsp_arm.S
@@ -2,20 +2,20 @@
 @ ARMv4-optimized halfpel functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h
index a864152..5f3c774 100644
--- a/libavcodec/arm/hpeldsp_arm.h
+++ b/libavcodec/arm/hpeldsp_arm.h
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S
index f1abc32..a8bd459 100644
--- a/libavcodec/arm/hpeldsp_armv6.S
+++ b/libavcodec/arm/hpeldsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c
index 6390660..1977b13 100644
--- a/libavcodec/arm/hpeldsp_init_arm.c
+++ b/libavcodec/arm/hpeldsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM-optimized halfpel functions
  * Copyright (c) 2001 Lionel Ulmer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
index 67a500d..967a8e0 100644
--- a/libavcodec/arm/hpeldsp_init_armv6.c
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
index 76d4eaf..d9feadd 100644
--- a/libavcodec/arm/hpeldsp_init_neon.c
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
index 90bc3cb..cf4a6cf 100644
--- a/libavcodec/arm/hpeldsp_neon.S
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h
index db4d6c5..6c79a69 100644
--- a/libavcodec/arm/idct.h
+++ b/libavcodec/arm/idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S
index 0d6a76b..057eff9 100644
--- a/libavcodec/arm/idctdsp_arm.S
+++ b/libavcodec/arm/idctdsp_arm.S
@@ -2,20 +2,20 @@
 @ ARMv4-optimized IDCT functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h
index 9012b82..d7bc5cd 100644
--- a/libavcodec/arm/idctdsp_arm.h
+++ b/libavcodec/arm/idctdsp_arm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S
index c180d73..a6e77d6 100644
--- a/libavcodec/arm/idctdsp_armv6.S
+++ b/libavcodec/arm/idctdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
index 8216985..ebc90e4 100644
--- a/libavcodec/arm/idctdsp_init_arm.c
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM-optimized IDCT functions
  * Copyright (c) 2001 Lionel Ulmer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,28 +39,28 @@ static void j_rev_dct_arm_put(uint8_t *dest, ptrdiff_t line_size,
                               int16_t *block)
 {
     ff_j_rev_dct_arm(block);
-    ff_put_pixels_clamped(block, dest, line_size);
+    ff_put_pixels_clamped_c(block, dest, line_size);
 }
 
 static void j_rev_dct_arm_add(uint8_t *dest, ptrdiff_t line_size,
                               int16_t *block)
 {
     ff_j_rev_dct_arm(block);
-    ff_add_pixels_clamped(block, dest, line_size);
+    ff_add_pixels_clamped_arm(block, dest, line_size);
 }
 
 static void simple_idct_arm_put(uint8_t *dest, ptrdiff_t line_size,
                                 int16_t *block)
 {
     ff_simple_idct_arm(block);
-    ff_put_pixels_clamped(block, dest, line_size);
+    ff_put_pixels_clamped_c(block, dest, line_size);
 }
 
 static void simple_idct_arm_add(uint8_t *dest, ptrdiff_t line_size,
                                 int16_t *block)
 {
     ff_simple_idct_arm(block);
-    ff_add_pixels_clamped(block, dest, line_size);
+    ff_add_pixels_clamped_arm(block, dest, line_size);
 }
 
 av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
@@ -68,8 +68,8 @@ av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+    if (!avctx->lowres && !high_bit_depth) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             avctx->idct_algo == FF_IDCT_ARM) {
             c->idct_put  = j_rev_dct_arm_put;
             c->idct_add  = j_rev_dct_arm_add;
diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c
index 251165d..3d881e1 100644
--- a/libavcodec/arm/idctdsp_init_armv5te.c
+++ b/libavcodec/arm/idctdsp_init_armv5te.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,9 @@
 av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
-    if (!high_bit_depth &&
+    if (!avctx->lowres && !high_bit_depth &&
         (avctx->idct_algo == FF_IDCT_AUTO ||
+         avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
          avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
         c->idct_put  = ff_simple_idct_put_armv5te;
         c->idct_add  = ff_simple_idct_add_armv5te;
diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c
index 3941ee8..edf3070 100644
--- a/libavcodec/arm/idctdsp_init_armv6.c
+++ b/libavcodec/arm/idctdsp_init_armv6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,8 @@ void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
 av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+    if (!avctx->lowres && !high_bit_depth) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
             c->idct_put  = ff_simple_idct_put_armv6;
             c->idct_add  = ff_simple_idct_add_armv6;
diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c
index c94f7b6..b70c5b0 100644
--- a/libavcodec/arm/idctdsp_init_neon.c
+++ b/libavcodec/arm/idctdsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM-NEON-optimized IDCT functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,16 @@
 #include "idct.h"
 #include "idctdsp_arm.h"
 
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
 
 av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
+    if (!avctx->lowres && !high_bit_depth) {
         if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLENEON) {
             c->idct_put  = ff_simple_idct_put_neon;
             c->idct_add  = ff_simple_idct_add_neon;
diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S
index 7095879..1911a33 100644
--- a/libavcodec/arm/idctdsp_neon.S
+++ b/libavcodec/arm/idctdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM-NEON-optimized IDCT functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 42f3739..72c4c77 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -1,21 +1,21 @@
 /*
  * ARM NEON optimised integer operations
- * Copyright (c) 2009 Kostya Shishkov
+ * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ function ff_scalarproduct_int16_neon, export=1
         vmlal.s16       q2,  d18,  d22
         vmlal.s16       q3,  d19,  d23
         subs            r2,  r2,   #16
-        bne             1b
+        bgt             1b
 
         vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
+
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/lossless_audiodsp_init_arm.c
index 47ea034..981a39a 100644
--- a/libavcodec/arm/apedsp_init_arm.c
+++ b/libavcodec/arm/lossless_audiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,12 +23,12 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3, int len, int mul);
 
-av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/lossless_audiodsp_neon.S
index 7cfbf43..ba7c45f 100644
--- a/libavcodec/arm/apedsp_neon.S
+++ b/libavcodec/arm/lossless_audiodsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised integer operations
  * Copyright (c) 2009 Kostya Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ function ff_scalarproduct_and_madd_int16_neon, export=1
         vst1.16         {q10},     [r12,:128]!
         subs            r3,  r3,   #16
         vst1.16         {q13},     [r12,:128]!
-        bne             1b
+        bgt             1b
 
         vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index 45ac67d..dc57c55 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_fixed_init_arm.c b/libavcodec/arm/mdct_fixed_init_arm.c
deleted file mode 100644
index 606c80c..0000000
--- a/libavcodec/arm/mdct_fixed_init_arm.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#define FFT_FLOAT 0
-#include "libavcodec/fft.h"
-
-void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
-void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
-
-av_cold void ff_mdct_fixed_init_arm(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        if (!s->inverse && s->nbits >= 3) {
-            s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-            s->mdct_calc        = ff_mdct_fixed_calc_neon;
-            s->mdct_calcw       = ff_mdct_fixed_calcw_neon;
-        }
-    }
-}
diff --git a/libavcodec/arm/mdct_fixed_neon.S b/libavcodec/arm/mdct_fixed_neon.S
index c77be59..365c5e7 100644
--- a/libavcodec/arm/mdct_fixed_neon.S
+++ b/libavcodec/arm/mdct_fixed_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_init_arm.c b/libavcodec/arm/mdct_init_arm.c
deleted file mode 100644
index 24678dd..0000000
--- a/libavcodec/arm/mdct_init_arm.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_arm(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_vfp_vm(cpu_flags)) {
-        s->imdct_half   = ff_imdct_half_vfp;
-    }
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_calc   = ff_imdct_calc_neon;
-        s->imdct_half   = ff_imdct_half_neon;
-        s->mdct_calc    = ff_mdct_calc_neon;
-        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-    }
-}
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index bfe259c..a6952fa 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised MDCT
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index f3fe668..43f6d14 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S
index 436e20d..fa5a823 100644
--- a/libavcodec/arm/me_cmp_armv6.S
+++ b/libavcodec/arm/me_cmp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c
index 4d73f3e..03870a2 100644
--- a/libavcodec/arm/me_cmp_init_arm.c
+++ b/libavcodec/arm/me_cmp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S
index 4272dae..4f9aa48 100644
--- a/libavcodec/arm/mlpdsp_armv5te.S
+++ b/libavcodec/arm/mlpdsp_armv5te.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
index de9db46..b7ecf6c 100644
--- a/libavcodec/arm/mlpdsp_armv6.S
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 4cdd10c..34a5f61 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 49bd0bc..977abb6 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c
index e73aee6..d87bd27 100644
--- a/libavcodec/arm/mpegaudiodsp_init_arm.c
+++ b/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 #include "config.h"
 
 void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
-                                        int *dither, int16_t *out, int incr);
+                                        int *dither, int16_t *out, ptrdiff_t incr);
 
 av_cold void ff_mpadsp_init_arm(MPADSPContext *s)
 {
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 34e9cf1..918be16 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2002 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h
index 17e3a5b..709ae6b 100644
--- a/libavcodec/arm/mpegvideo_arm.h
+++ b/libavcodec/arm/mpegvideo_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
index 4bb7b6e..e20bb4c 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -2,24 +2,25 @@
  * Optimization of some functions from mpegvideo.c for armv5te
  * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
 #include "mpegvideo_arm.h"
@@ -55,7 +56,7 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
     int level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qmul = qscale << 1;
 
@@ -84,7 +85,7 @@ static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
     int qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index 4426e15..8687d6b 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -2,20 +2,20 @@
  * Optimization of some functions from mpegvideo.c for armv5te
  * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 3e1f7b5..1889d7a 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S
index 99db501..ab0dad7 100644
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c
index ab9ba3e..4bfe835 100644
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S
index 716a607..787bc4b 100644
--- a/libavcodec/arm/neon.S
+++ b/libavcodec/arm/neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/neontest.c b/libavcodec/arm/neontest.c
index 67d7747..f9c0dbf 100644
--- a/libavcodec/arm/neontest.c
+++ b/libavcodec/arm/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/pixblockdsp_armv6.S b/libavcodec/arm/pixblockdsp_armv6.S
index 4c925a4..b10ea78 100644
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c
index bb32631..59d2b49 100644
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rdft_init_arm.c b/libavcodec/arm/rdft_init_arm.c
index 2858ba9..1c5d8be 100644
--- a/libavcodec/arm/rdft_init_arm.c
+++ b/libavcodec/arm/rdft_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 7d01d53..eabb92b 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised RDFT
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,18 +30,21 @@ function ff_rdft_calc_neon, export=1
 
         lsls            r6,  r6,  #31
         bne             1f
-        add             r0,  r4,  #20
+        add             r0,  r4,  #24
         bl              X(ff_fft_permute_neon)
-        add             r0,  r4,  #20
+        add             r0,  r4,  #24
         mov             r1,  r5
         bl              X(ff_fft_calc_neon)
 1:
         ldr             r12, [r4, #0]           @ nbits
         mov             r2,  #1
+        ldr             r8,  [r4, #20]          @ negative_sin
         lsl             r12, r2,  r12
         add             r0,  r5,  #8
+        lsl             r8,  r8,  #31
         add             r1,  r5,  r12, lsl #2
         lsr             r12, r12, #2
+        vdup.32         d26, r8
         ldr             r2,  [r4, #12]          @ tcos
         sub             r12, r12, #2
         ldr             r3,  [r4, #16]          @ tsin
@@ -55,6 +58,7 @@ function ff_rdft_calc_neon, export=1
         vld1.32         {d5},     [r3,:64]!     @ tsin[i]
         vmov.f32        d18, #0.5               @ k1
         vdup.32         d19, r6
+        veor            d5,  d26, d5
         pld             [r0, #32]
         veor            d19, d18, d19           @ k2
         vmov.i32        d16, #0
@@ -90,6 +94,7 @@ function ff_rdft_calc_neon, export=1
         vld1.32         {d5},     [r3,:64]!     @  tsin[i]
         veor            d24, d22, d17           @  ev.re,-ev.im
         vrev64.32       d3,  d23                @  od.re, od.im
+        veor            d5, d26, d5
         pld             [r2, #32]
         veor            d2,  d3,  d16           @ -od.re, od.im
         pld             [r3, #32]
@@ -140,10 +145,10 @@ function ff_rdft_calc_neon, export=1
 
         vmul.f32        d22, d22, d18
         vst1.32         {d22},    [r5,:64]
-        add             r0,  r4,  #20
+        add             r0,  r4,  #24
         mov             r1,  r5
         bl              X(ff_fft_permute_neon)
-        add             r0,  r4,  #20
+        add             r0,  r4,  #24
         mov             r1,  r5
         pop             {r4-r8,lr}
         b               X(ff_fft_calc_neon)
diff --git a/libavcodec/arm/rv34dsp_init_arm.c b/libavcodec/arm/rv34dsp_init_arm.c
index 5ce787b..8bfe90b 100644
--- a/libavcodec/arm/rv34dsp_init_arm.c
+++ b/libavcodec/arm/rv34dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a29123f..3d4a83d 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv40dsp_init_arm.c b/libavcodec/arm/rv40dsp_init_arm.c
index df3e461..c24854d 100644
--- a/libavcodec/arm/rv40dsp_init_arm.c
+++ b/libavcodec/arm/rv40dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 6bd45eb..099f88c 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/sbcdsp_armv6.S b/libavcodec/arm/sbcdsp_armv6.S
new file mode 100644
index 0000000..f1ff845
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_armv6.S
@@ -0,0 +1,245 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARMv6 optimizations. The instructions are scheduled for ARM11 pipeline.
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_sbc_analyze_4_armv6, export=1
+        @ r0 = in, r1 = out, r2 = consts
+        push            {r1, r3-r7, lr}
+        push            {r8-r12, r14}
+        ldrd            r4,  r5,  [r0, #0]
+        ldrd            r6,  r7,  [r2, #0]
+        ldrd            r8,  r9,  [r0, #16]
+        ldrd            r10, r11, [r2, #16]
+        mov             r14, #0x8000
+        smlad           r3,  r4,  r6,  r14
+        smlad           r12, r5,  r7,  r14
+        ldrd            r4,  r5,  [r0, #32]
+        ldrd            r6,  r7,  [r2, #32]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #48]
+        ldrd            r10, r11, [r2, #48]
+        smlad           r3,  r4,  r6,  r3
+        smlad           r12, r5,  r7,  r12
+        ldrd            r4,  r5,  [r0, #64]
+        ldrd            r6,  r7,  [r2, #64]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #8]
+        ldrd            r10, r11, [r2, #8]
+        smlad           r3,  r4,  r6,  r3        @ t1[0] is done
+        smlad           r12, r5,  r7,  r12       @ t1[1] is done
+        ldrd            r4,  r5,  [r0, #24]
+        ldrd            r6,  r7,  [r2, #24]
+        pkhtb           r3,  r12, r3, asr #16    @ combine t1[0] and t1[1]
+        smlad           r12, r8,  r10, r14
+        smlad           r14, r9,  r11, r14
+        ldrd            r8,  r9,  [r0, #40]
+        ldrd            r10, r11, [r2, #40]
+        smlad           r12, r4,  r6,  r12
+        smlad           r14, r5,  r7,  r14
+        ldrd            r4,  r5,  [r0, #56]
+        ldrd            r6,  r7,  [r2, #56]
+        smlad           r12, r8,  r10, r12
+        smlad           r14, r9,  r11, r14
+        ldrd            r8,  r9,  [r0, #72]
+        ldrd            r10, r11, [r2, #72]
+        smlad           r12, r4,  r6,  r12
+        smlad           r14, r5,  r7,  r14
+        ldrd            r4,  r5,  [r2, #80]      @ start loading cos table
+        smlad           r12, r8,  r10, r12       @ t1[2] is done
+        smlad           r14, r9,  r11, r14       @ t1[3] is done
+        ldrd            r6,  r7,  [r2, #88]
+        ldrd            r8,  r9,  [r2, #96]
+        ldrd            r10, r11, [r2, #104]     @ cos table fully loaded
+        pkhtb           r12, r14, r12, asr #16   @ combine t1[2] and t1[3]
+        smuad           r4,  r3,  r4
+        smuad           r5,  r3,  r5
+        smlad           r4,  r12, r8,  r4
+        smlad           r5,  r12, r9,  r5
+        smuad           r6,  r3,  r6
+        smuad           r7,  r3,  r7
+        smlad           r6,  r12, r10, r6
+        smlad           r7,  r12, r11, r7
+        pop             {r8-r12, r14}
+        stmia           r1, {r4, r5, r6, r7}
+        pop             {r1, r3-r7, pc}
+endfunc
+
+function ff_sbc_analyze_8_armv6, export=1
+        @ r0 = in, r1 = out, r2 = consts
+        push            {r1, r3-r7, lr}
+        push            {r8-r12, r14}
+        ldrd            r4,  r5,  [r0, #24]
+        ldrd            r6,  r7,  [r2, #24]
+        ldrd            r8,  r9,  [r0, #56]
+        ldrd            r10, r11, [r2, #56]
+        mov             r14, #0x8000
+        smlad           r3,  r4,  r6,  r14
+        smlad           r12, r5,  r7,  r14
+        ldrd            r4,  r5,  [r0, #88]
+        ldrd            r6,  r7,  [r2, #88]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #120]
+        ldrd            r10, r11, [r2, #120]
+        smlad           r3,  r4,  r6,  r3
+        smlad           r12, r5,  r7,  r12
+        ldrd            r4,  r5,  [r0, #152]
+        ldrd            r6,  r7,  [r2, #152]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #16]
+        ldrd            r10, r11, [r2, #16]
+        smlad           r3,  r4,  r6,  r3        @ t1[6] is done
+        smlad           r12, r5,  r7,  r12       @ t1[7] is done
+        ldrd            r4,  r5,  [r0, #48]
+        ldrd            r6,  r7,  [r2, #48]
+        pkhtb           r3,  r12, r3, asr #16    @ combine t1[6] and t1[7]
+        str             r3,  [sp, #-4]!          @ save to stack
+        smlad           r3,  r8,  r10, r14
+        smlad           r12, r9,  r11, r14
+        ldrd            r8,  r9,  [r0, #80]
+        ldrd            r10, r11, [r2, #80]
+        smlad           r3,  r4,  r6,  r3
+        smlad           r12, r5,  r7,  r12
+        ldrd            r4,  r5,  [r0, #112]
+        ldrd            r6,  r7,  [r2, #112]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #144]
+        ldrd            r10, r11, [r2, #144]
+        smlad           r3,  r4,  r6,  r3
+        smlad           r12, r5,  r7,  r12
+        ldrd            r4,  r5,  [r0, #0]
+        ldrd            r6,  r7,  [r2, #0]
+        smlad           r3,  r8,  r10, r3        @ t1[4] is done
+        smlad           r12, r9,  r11, r12       @ t1[5] is done
+        ldrd            r8,  r9,  [r0, #32]
+        ldrd            r10, r11, [r2, #32]
+        pkhtb           r3,  r12, r3, asr #16    @ combine t1[4] and t1[5]
+        str             r3,  [sp, #-4]!          @ save to stack
+        smlad           r3,  r4,  r6,  r14
+        smlad           r12, r5,  r7,  r14
+        ldrd            r4,  r5,  [r0, #64]
+        ldrd            r6,  r7,  [r2, #64]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #96]
+        ldrd            r10, r11, [r2, #96]
+        smlad           r3,  r4,  r6,  r3
+        smlad           r12, r5,  r7,  r12
+        ldrd            r4,  r5,  [r0, #128]
+        ldrd            r6,  r7,  [r2, #128]
+        smlad           r3,  r8,  r10, r3
+        smlad           r12, r9,  r11, r12
+        ldrd            r8,  r9,  [r0, #8]
+        ldrd            r10, r11, [r2, #8]
+        smlad           r3,  r4,  r6,  r3        @ t1[0] is done
+        smlad           r12, r5,  r7,  r12       @ t1[1] is done
+        ldrd            r4,  r5,  [r0, #40]
+        ldrd            r6,  r7,  [r2, #40]
+        pkhtb           r3,  r12, r3, asr #16    @ combine t1[0] and t1[1]
+        smlad           r12, r8,  r10, r14
+        smlad           r14, r9,  r11, r14
+        ldrd            r8,  r9,  [r0, #72]
+        ldrd            r10, r11, [r2, #72]
+        smlad           r12, r4,  r6,  r12
+        smlad           r14, r5,  r7,  r14
+        ldrd            r4,  r5,  [r0, #104]
+        ldrd            r6,  r7,  [r2, #104]
+        smlad           r12, r8,  r10, r12
+        smlad           r14, r9,  r11, r14
+        ldrd            r8,  r9,  [r0, #136]
+        ldrd            r10, r11, [r2, #136]!
+        smlad           r12, r4,  r6,  r12
+        smlad           r14, r5,  r7,  r14
+        ldrd            r4,  r5,  [r2, #(160 - 136 + 0)]
+        smlad           r12, r8,  r10, r12       @ t1[2] is done
+        smlad           r14, r9,  r11, r14       @ t1[3] is done
+        ldrd            r6,  r7,  [r2, #(160 - 136 + 8)]
+        smuad           r4,  r3,  r4
+        smuad           r5,  r3,  r5
+        pkhtb           r12, r14, r12, asr #16   @ combine t1[2] and t1[3]
+                                                 @ r3  = t2[0:1]
+                                                 @ r12 = t2[2:3]
+        pop             {r0, r14}                @ t2[4:5], t2[6:7]
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 32)]
+        smuad           r6,  r3,  r6
+        smuad           r7,  r3,  r7
+        ldrd            r10, r11, [r2, #(160 - 136 + 40)]
+        smlad           r4,  r12, r8,  r4
+        smlad           r5,  r12, r9,  r5
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 64)]
+        smlad           r6,  r12, r10, r6
+        smlad           r7,  r12, r11, r7
+        ldrd            r10, r11, [r2, #(160 - 136 + 72)]
+        smlad           r4,  r0,  r8,  r4
+        smlad           r5,  r0,  r9,  r5
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 96)]
+        smlad           r6,  r0,  r10, r6
+        smlad           r7,  r0,  r11, r7
+        ldrd            r10, r11, [r2, #(160 - 136 + 104)]
+        smlad           r4,  r14, r8,  r4
+        smlad           r5,  r14, r9,  r5
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 0)]
+        smlad           r6,  r14, r10, r6
+        smlad           r7,  r14, r11, r7
+        ldrd            r10, r11, [r2, #(160 - 136 + 16 + 8)]
+        stmia           r1!, {r4, r5}
+        smuad           r4,  r3,  r8
+        smuad           r5,  r3,  r9
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 32)]
+        stmia           r1!, {r6, r7}
+        smuad           r6,  r3,  r10
+        smuad           r7,  r3,  r11
+        ldrd            r10, r11, [r2, #(160 - 136 + 16 + 40)]
+        smlad           r4,  r12, r8,  r4
+        smlad           r5,  r12, r9,  r5
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 64)]
+        smlad           r6,  r12, r10, r6
+        smlad           r7,  r12, r11, r7
+        ldrd            r10, r11, [r2, #(160 - 136 + 16 + 72)]
+        smlad           r4,  r0,  r8,  r4
+        smlad           r5,  r0,  r9,  r5
+        ldrd            r8,  r9,  [r2, #(160 - 136 + 16 + 96)]
+        smlad           r6,  r0,  r10, r6
+        smlad           r7,  r0,  r11, r7
+        ldrd            r10, r11, [r2, #(160 - 136 + 16 + 104)]
+        smlad           r4,  r14, r8,  r4
+        smlad           r5,  r14, r9,  r5
+        smlad           r6,  r14, r10, r6
+        smlad           r7,  r14, r11, r7
+        pop             {r8-r12, r14}
+        stmia           r1!, {r4, r5, r6, r7}
+        pop             {r1, r3-r7, pc}
+endfunc
diff --git a/libavcodec/arm/sbcdsp_init_arm.c b/libavcodec/arm/sbcdsp_init_arm.c
new file mode 100644
index 0000000..6bf7e72
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_init_arm.c
@@ -0,0 +1,105 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARMv6 optimization for some basic "building bricks"
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/sbcdsp.h"
+
+void ff_sbc_analyze_4_armv6(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_armv6(const int16_t *in, int32_t *out, const int16_t *consts);
+
+void ff_sbc_analyze_4_neon(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_neon(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_calc_scalefactors_neon(int32_t sb_sample_f[16][2][8],
+                                   uint32_t scale_factor[2][8],
+                                   int blocks, int channels, int subbands);
+int ff_sbc_calc_scalefactors_j_neon(int32_t sb_sample_f[16][2][8],
+                                    uint32_t scale_factor[2][8],
+                                    int blocks, int subbands);
+int ff_sbc_enc_process_input_4s_neon(int position, const uint8_t *pcm,
+                                     int16_t X[2][SBC_X_BUFFER_SIZE],
+                                     int nsamples, int nchannels);
+int ff_sbc_enc_process_input_8s_neon(int position, const uint8_t *pcm,
+                                     int16_t X[2][SBC_X_BUFFER_SIZE],
+                                     int nsamples, int nchannels);
+
+DECLARE_ALIGNED(SBC_ALIGN, int32_t, ff_sbcdsp_joint_bits_mask)[8] = {
+    8,   4,  2,  1, 128, 64, 32, 16
+};
+
+#if HAVE_BIGENDIAN
+#define PERM(a, b, c, d) {        \
+        (a * 2) + 1, (a * 2) + 0, \
+        (b * 2) + 1, (b * 2) + 0, \
+        (c * 2) + 1, (c * 2) + 0, \
+        (d * 2) + 1, (d * 2) + 0  \
+    }
+#else
+#define PERM(a, b, c, d) {        \
+        (a * 2) + 0, (a * 2) + 1, \
+        (b * 2) + 0, (b * 2) + 1, \
+        (c * 2) + 0, (c * 2) + 1, \
+        (d * 2) + 0, (d * 2) + 1  \
+    }
+#endif
+
+DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_4)[2][8] = {
+    PERM(7, 3, 6, 4),
+    PERM(0, 2, 1, 5)
+};
+
+DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_8)[4][8] = {
+    PERM(15, 7, 14,  8),
+    PERM(13, 9, 12, 10),
+    PERM(11, 3,  6,  0),
+    PERM( 5, 1,  4,  2)
+};
+
+av_cold void ff_sbcdsp_init_arm(SBCDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_armv6(cpu_flags)) {
+        s->sbc_analyze_4 = ff_sbc_analyze_4_armv6;
+        s->sbc_analyze_8 = ff_sbc_analyze_8_armv6;
+    }
+
+    if (have_neon(cpu_flags)) {
+        s->sbc_analyze_4 = ff_sbc_analyze_4_neon;
+        s->sbc_analyze_8 = ff_sbc_analyze_8_neon;
+        s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_neon;
+        s->sbc_calc_scalefactors_j = ff_sbc_calc_scalefactors_j_neon;
+        if (s->increment != 1) {
+            s->sbc_enc_process_input_4s = ff_sbc_enc_process_input_4s_neon;
+            s->sbc_enc_process_input_8s = ff_sbc_enc_process_input_8s_neon;
+        }
+    }
+}
diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
new file mode 100644
index 0000000..d83d21d
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -0,0 +1,714 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARM NEON optimizations
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define SBC_PROTO_FIXED_SCALE 16
+
+function ff_sbc_analyze_4_neon, export=1
+        /* TODO: merge even and odd cases (or even merge all four calls to this
+         * function) in order to have only aligned reads from 'in' array
+         * and reduce number of load instructions */
+        vld1.16         {d4, d5}, [r0, :64]!
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmull.s16       q0, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmull.s16       q1, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+
+        vmlal.s16       q0, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q1, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmlal.s16       q0, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q1, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+
+        vmlal.s16       q0, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q1, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmlal.s16       q0, d4, d8
+        vmlal.s16       q1, d5, d9
+
+        vpadd.s32       d0, d0, d1
+        vpadd.s32       d1, d2, d3
+
+        vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE
+
+        vld1.16         {d2, d3, d4, d5}, [r2, :128]!
+
+        vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
+        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */
+
+        vmull.s16       q3, d2, d0
+        vmull.s16       q4, d3, d0
+        vmlal.s16       q3, d4, d1
+        vmlal.s16       q4, d5, d1
+
+        vpadd.s32       d0, d6, d7 /* TODO: can be eliminated */
+        vpadd.s32       d1, d8, d9 /* TODO: can be eliminated */
+
+        vst1.32         {d0, d1}, [r1, :128]
+
+        bx              lr
+endfunc
+
+function ff_sbc_analyze_8_neon, export=1
+        /* TODO: merge even and odd cases (or even merge all four calls to this
+         * function) in order to have only aligned reads from 'in' array
+         * and reduce number of load instructions */
+        vld1.16         {d4, d5}, [r0, :64]!
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmull.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmull.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmull.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmull.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+        vmlal.s16       q8, d6, d10
+        vld1.16         {d4, d5}, [r0, :64]!
+        vmlal.s16       q9, d7, d11
+        vld1.16         {d8, d9}, [r2, :128]!
+
+        vmlal.s16       q6, d4, d8
+        vld1.16         {d6,  d7}, [r0, :64]!
+        vmlal.s16       q7, d5, d9
+        vld1.16         {d10, d11}, [r2, :128]!
+
+        vmlal.s16       q8, d6, d10
+        vmlal.s16       q9, d7, d11
+
+        vpadd.s32       d0, d12, d13
+        vpadd.s32       d1, d14, d15
+        vpadd.s32       d2, d16, d17
+        vpadd.s32       d3, d18, d19
+
+        vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
+        vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
+        vmovn.s32       d0, q0
+        vmovn.s32       d1, q1
+
+        vdup.i32        d3, d1[1]  /* TODO: can be eliminated */
+        vdup.i32        d2, d1[0]  /* TODO: can be eliminated */
+        vdup.i32        d1, d0[1]  /* TODO: can be eliminated */
+        vdup.i32        d0, d0[0]  /* TODO: can be eliminated */
+
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmull.s16       q6, d4, d0
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmull.s16       q7, d5, d0
+        vmull.s16       q8, d6, d0
+        vmull.s16       q9, d7, d0
+
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmlal.s16       q6, d4, d1
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmlal.s16       q7, d5, d1
+        vmlal.s16       q8, d6, d1
+        vmlal.s16       q9, d7, d1
+
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmlal.s16       q6, d4, d2
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmlal.s16       q7, d5, d2
+        vmlal.s16       q8, d6, d2
+        vmlal.s16       q9, d7, d2
+
+        vld1.16         {d4, d5}, [r2, :128]!
+        vmlal.s16       q6, d4, d3
+        vld1.16         {d6, d7}, [r2, :128]!
+        vmlal.s16       q7, d5, d3
+        vmlal.s16       q8, d6, d3
+        vmlal.s16       q9, d7, d3
+
+        vpadd.s32       d0, d12, d13 /* TODO: can be eliminated */
+        vpadd.s32       d1, d14, d15 /* TODO: can be eliminated */
+        vpadd.s32       d2, d16, d17 /* TODO: can be eliminated */
+        vpadd.s32       d3, d18, d19 /* TODO: can be eliminated */
+
+        vst1.32         {d0, d1, d2, d3}, [r1, :128]
+
+        bx              lr
+endfunc
+
+function ff_sbc_calc_scalefactors_neon, export=1
+        @ parameters
+        @ r0 = sb_sample_f
+        @ r1 = scale_factor
+        @ r2 = blocks
+        @ r3 = channels
+        @ r4 = subbands
+        @ local variables
+        @ r5 = in_loop_1
+        @ r6 = in
+        @ r7 = out_loop_1
+        @ r8 = out
+        @ r9 = ch
+        @ r10 = sb
+        @ r11 = inc
+        @ r12 = blk
+
+        push            {r1-r2, r4-r12}
+        ldr             r4,  [sp, #44]
+        mov             r11, #64
+
+        mov             r9,  #0
+1:
+        add             r5,  r0,  r9, lsl#5
+        add             r7,  r1,  r9, lsl#5
+
+        mov             r10,  #0
+2:
+        add             r6,  r5,  r10, lsl#2
+        add             r8,  r7,  r10, lsl#2
+        mov             r12, r2
+
+        vmov.s32        q0,  #0
+        vmov.s32        q1,  #0x8000            @ 1 << SCALE_OUT_BITS
+        vmov.s32        q14, #1
+        vmov.s32        q15, #16                @ 31 - SCALE_OUT_BITS
+        vadd.s32        q1,  q1,  q14
+3:
+        vld1.32         {d16, d17}, [r6, :128], r11
+        vabs.s32        q8,  q8
+        vld1.32         {d18, d19}, [r6, :128], r11
+        vabs.s32        q9,  q9
+        vld1.32         {d20, d21}, [r6, :128], r11
+        vabs.s32        q10, q10
+        vld1.32         {d22, d23}, [r6, :128], r11
+        vabs.s32        q11, q11
+        vmax.s32        q0,  q0,  q8
+        vmax.s32        q1,  q1,  q9
+        vmax.s32        q0,  q0,  q10
+        vmax.s32        q1,  q1,  q11
+        subs            r12, r12, #4
+        bgt             3b
+        vmax.s32        q0,  q0,  q1
+        vsub.s32        q0,  q0,  q14
+        vclz.s32        q0,  q0
+        vsub.s32        q0,  q15, q0
+        vst1.32         {d0, d1}, [r8, :128]
+
+        add             r10, r10, #4
+        cmp             r10, r4
+        blt             2b
+
+        add             r9,  r9,  #1
+        cmp             r9,  r3
+        blt             1b
+
+        pop             {r1-r2, r4-r12}
+        bx              lr
+endfunc
+
+/*
+ * constants: q13 = (31 - SCALE_OUT_BITS)
+ *            q14 = 1
+ * input:     q0  - ((1 << SCALE_OUT_BITS) + 1)
+ *            r5  - samples for channel 0
+ *            r6  - samples for shannel 1
+ * output:    q0, q1 - scale factors without joint stereo
+ *            q2, q3 - scale factors with joint stereo
+ *            q15    - joint stereo selection mask
+ */
+.macro calc_scalefactors
+        vmov.s32        q1,  q0
+        vmov.s32        q2,  q0
+        vmov.s32        q3,  q0
+        mov             r3,  r2
+1:
+        vld1.32         {d18, d19}, [r6, :128], r11
+        vbic.s32        q11, q9,  q14
+        vld1.32         {d16, d17}, [r5, :128], r11
+        vhadd.s32       q10, q8,  q11
+        vhsub.s32       q11, q8,  q11
+        vabs.s32        q8,  q8
+        vabs.s32        q9,  q9
+        vabs.s32        q10, q10
+        vabs.s32        q11, q11
+        vmax.s32        q0,  q0,  q8
+        vmax.s32        q1,  q1,  q9
+        vmax.s32        q2,  q2,  q10
+        vmax.s32        q3,  q3,  q11
+        subs            r3,  r3,  #1
+        bgt             1b
+        vsub.s32        q0,  q0,  q14
+        vsub.s32        q1,  q1,  q14
+        vsub.s32        q2,  q2,  q14
+        vsub.s32        q3,  q3,  q14
+        vclz.s32        q0,  q0
+        vclz.s32        q1,  q1
+        vclz.s32        q2,  q2
+        vclz.s32        q3,  q3
+        vsub.s32        q0,  q13, q0
+        vsub.s32        q1,  q13, q1
+        vsub.s32        q2,  q13, q2
+        vsub.s32        q3,  q13, q3
+.endm
+
+/*
+ * constants: q14 = 1
+ * input: q15 - joint stereo selection mask
+ *        r5  - value set by calc_scalefactors macro
+ *        r6  - value set by calc_scalefactors macro
+ */
+.macro update_joint_stereo_samples
+        sub             r8,  r6,  r11
+        sub             r7,  r5,  r11
+        sub             r6,  r6,  r11, asl #1
+        sub             r5,  r5,  r11, asl #1
+        vld1.32         {d18, d19}, [r6, :128]
+        vbic.s32        q11, q9,  q14
+        vld1.32         {d16, d17}, [r5, :128]
+        vld1.32         {d2, d3}, [r8, :128]
+        vbic.s32        q3,  q1,  q14
+        vld1.32         {d0, d1}, [r7, :128]
+        vhsub.s32       q10, q8,  q11
+        vhadd.s32       q11, q8,  q11
+        vhsub.s32       q2,  q0,  q3
+        vhadd.s32       q3,  q0,  q3
+        vbif.s32        q10, q9,  q15
+        vbif.s32        d22, d16, d30
+        sub             r11, r10, r11, asl #1
+        sub             r3,  r2,  #2
+2:
+        vbif.s32        d23, d17, d31
+        vst1.32         {d20, d21}, [r6, :128], r11
+        vbif.s32        d4,  d2,  d30
+        vld1.32         {d18, d19}, [r6, :128]
+        vbif.s32        d5,  d3,  d31
+        vst1.32         {d22, d23}, [r5, :128], r11
+        vbif.s32        d6,  d0,  d30
+        vld1.32         {d16, d17}, [r5, :128]
+        vbif.s32        d7,  d1,  d31
+        vst1.32         {d4, d5}, [r8, :128], r11
+        vbic.s32        q11, q9,  q14
+        vld1.32         {d2, d3}, [r8, :128]
+        vst1.32         {d6, d7}, [r7, :128], r11
+        vbic.s32        q3,  q1,  q14
+        vld1.32         {d0, d1}, [r7, :128]
+        vhsub.s32       q10, q8,  q11
+        vhadd.s32       q11, q8,  q11
+        vhsub.s32       q2,  q0,  q3
+        vhadd.s32       q3,  q0,  q3
+        vbif.s32        q10, q9,  q15
+        vbif.s32        d22, d16, d30
+        subs            r3,  r3,  #2
+        bgt             2b
+        sub             r11, r10, r11, asr #1
+        vbif.s32        d23, d17, d31
+        vst1.32         {d20, d21}, [r6, :128]
+        vbif.s32        q2,  q1,  q15
+        vst1.32         {d22, d23}, [r5, :128]
+        vbif.s32        q3,  q0,  q15
+        vst1.32         {d4, d5}, [r8, :128]
+        vst1.32         {d6, d7}, [r7, :128]
+.endm
+
+function ff_sbc_calc_scalefactors_j_neon, export=1
+        @ parameters
+        @ r0 = in = sb_sample_f
+        @ r1 = out = scale_factor
+        @ r2 = blocks
+        @ r3 = subbands
+        @ local variables
+        @ r4 = consts = ff_sbcdsp_joint_bits_mask
+        @ r5 = in0
+        @ r6 = in1
+        @ r7 = out0
+        @ r8 = out1
+        @ r10 = zero
+        @ r11 = inc
+        @ return r0 = joint
+
+        push            {r3-r11}
+        movrelx         r4,  X(ff_sbcdsp_joint_bits_mask)
+        mov             r10, #0
+        mov             r11, #64
+
+        vmov.s32        q14, #1
+        vmov.s32        q13, #16    @ 31 - SCALE_OUT_BITS
+
+        cmp             r3, #4
+        bne             8f
+
+4:      @ 4 subbands
+        add             r5,  r0,  #0
+        add             r6,  r0,  #32
+        add             r7,  r1,  #0
+        add             r8,  r1,  #32
+        vmov.s32        q0,  #0x8000    @ 1 << SCALE_OUT_BITS
+        vadd.s32        q0,  q0,  q14
+
+        calc_scalefactors
+
+        @ check whether to use joint stereo for subbands 0, 1, 2
+        vadd.s32        q15, q0,  q1
+        vadd.s32        q9,  q2,  q3
+        vmov.s32        d31[1], r10    @ last subband -> no joint
+        vld1.32         {d16, d17}, [r4, :128]!
+        vcgt.s32        q15, q15, q9
+
+        @ calculate and save to memory 'joint' variable
+        @ update and save scale factors to memory
+        vand.s32        q8,  q8,  q15
+        vbit.s32        q0,  q2,  q15
+        vpadd.s32       d16, d16, d17
+        vbit.s32        q1,  q3,  q15
+        vpadd.s32       d16, d16, d16
+        vst1.32         {d0, d1}, [r7, :128]
+        vst1.32         {d2, d3}, [r8, :128]
+        vmov.32         r0, d16[0]
+
+        update_joint_stereo_samples
+        b               9f
+
+8:      @ 8 subbands
+        add             r5,  r0,  #16
+        add             r6,  r0,  #48
+        add             r7,  r1,  #16
+        add             r8,  r1,  #48
+        vmov.s32        q0,  #0x8000    @ 1 << SCALE_OUT_BITS
+        vadd.s32        q0,  q0,  q14
+
+        calc_scalefactors
+
+        @ check whether to use joint stereo for subbands 4, 5, 6
+        vadd.s32        q15, q0,  q1
+        vadd.s32        q9,  q2,  q3
+        vmov.s32        d31[1], r10    @ last subband -> no joint
+        vld1.32         {d16, d17}, [r4, :128]!
+        vcgt.s32        q15, q15, q9
+
+        @ calculate part of 'joint' variable and save it to d24
+        @ update and save scale factors to memory
+        vand.s32        q8,  q8,  q15
+        vbit.s32        q0,  q2,  q15
+        vpadd.s32       d16, d16, d17
+        vbit.s32        q1,  q3,  q15
+        vst1.32         {d0, d1}, [r7, :128]
+        vst1.32         {d2, d3}, [r8, :128]
+        vpadd.s32       d24, d16, d16
+
+        update_joint_stereo_samples
+
+        add             r5,  r0,  #0
+        add             r6,  r0,  #32
+        add             r7,  r1,  #0
+        add             r8,  r1,  #32
+        vmov.s32        q0,  #0x8000    @ 1 << SCALE_OUT_BITS
+        vadd.s32        q0,  q0,  q14
+
+        calc_scalefactors
+
+        @ check whether to use joint stereo for subbands 0, 1, 2, 3
+        vadd.s32        q15, q0,  q1
+        vadd.s32        q9,  q2,  q3
+        vld1.32         {d16, d17}, [r4, :128]!
+        vcgt.s32        q15, q15, q9
+
+        @ combine last part of 'joint' with d24 and save to memory
+        @ update and save scale factors to memory
+        vand.s32        q8,  q8,  q15
+        vbit.s32        q0,  q2,  q15
+        vpadd.s32       d16, d16, d17
+        vbit.s32        q1,  q3,  q15
+        vpadd.s32       d16, d16, d16
+        vst1.32         {d0, d1}, [r7, :128]
+        vadd.s32        d16, d16, d24
+        vst1.32         {d2, d3}, [r8, :128]
+        vmov.32         r0,  d16[0]
+
+        update_joint_stereo_samples
+9:
+        pop             {r3-r11}
+        bx              lr
+endfunc
+
+function ff_sbc_enc_process_input_4s_neon, export=1
+        @ parameters
+        @ r0 = positioin
+        @ r1 = pcm
+        @ r2 = X
+        @ r3 = nsamples
+        @ r4 = nchannels
+        @ local variables
+        @ r5 = ff_sbc_input_perm_4
+        @ r6 = src / x
+        @ r7 = dst / y
+
+        push            {r1, r3-r7}
+        ldr             r4,  [sp, #24]
+        movrelx         r5,  X(ff_sbc_input_perm_4)
+
+        @ handle X buffer wraparound
+        cmp             r0,  r3
+        bge             1f                     @ if (position < nsamples)
+        add             r7,  r2,  #576         @ &X[0][SBC_X_BUFFER_SIZE - 40]
+        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0}, [r6, :64]!
+        vst1.16         {d0}, [r7, :64]!
+        cmp             r4,  #1
+        ble             2f                     @ if (nchannels > 1)
+        add             r7,  r2,  #1232        @ &X[1][SBC_X_BUFFER_SIZE - 40]
+        add             r6,  r2,  #656
+        add             r6,  r6,  r0, lsl#1    @ &X[1][position]
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0}, [r6, :64]!
+        vst1.16         {d0}, [r7, :64]!
+2:
+        mov             r0,  #288              @ SBC_X_BUFFER_SIZE - 40
+1:
+
+        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
+        add             r7,  r6,  #656         @ &X[1][position]
+
+        cmp             r4,  #1
+        ble             8f                     @ if (nchannels > 1)
+        tst             r1,  #1
+        beq             7f                     @ if (pcm & 1)
+        @ poor 'pcm' alignment
+        vld1.8          {d0, d1}, [r5, :128]
+1:
+        sub             r6,  r6,  #16
+        sub             r7,  r7,  #16
+        sub             r0,  r0,  #8
+        vld1.8          {d4, d5}, [r1]!
+        vuzp.16         d4,  d5
+        vld1.8          {d20, d21}, [r1]!
+        vuzp.16         d20, d21
+        vswp            d5,  d20
+        vtbl.8          d16, {d4, d5}, d0
+        vtbl.8          d17, {d4, d5}, d1
+        vtbl.8          d18, {d20, d21}, d0
+        vtbl.8          d19, {d20, d21}, d1
+        vst1.16         {d16, d17}, [r6, :128]
+        vst1.16         {d18, d19}, [r7, :128]
+        subs            r3,  r3,  #8
+        bgt             1b
+        b               9f
+7:
+        @ proper 'pcm' alignment
+        vld1.8          {d0, d1}, [r5, :128]
+1:
+        sub             r6,  r6,  #16
+        sub             r7,  r7,  #16
+        sub             r0,  r0,  #8
+        vld2.16         {d4, d5}, [r1]!
+        vld2.16         {d20, d21}, [r1]!
+        vswp            d5,  d20
+        vtbl.8          d16, {d4, d5}, d0
+        vtbl.8          d17, {d4, d5}, d1
+        vtbl.8          d18, {d20, d21}, d0
+        vtbl.8          d19, {d20, d21}, d1
+        vst1.16         {d16, d17}, [r6, :128]
+        vst1.16         {d18, d19}, [r7, :128]
+        subs            r3,  r3,  #8
+        bgt             1b
+        b               9f
+8:
+        @ mono
+        vld1.8          {d0, d1}, [r5, :128]
+1:
+        sub             r6,  r6,  #16
+        sub             r0,  r0,  #8
+        vld1.8          {d4, d5}, [r1]!
+        vtbl.8          d16, {d4, d5}, d0
+        vtbl.8          d17, {d4, d5}, d1
+        vst1.16         {d16, d17}, [r6, :128]
+        subs            r3,  r3,  #8
+        bgt             1b
+9:
+        pop             {r1, r3-r7}
+        bx              lr
+endfunc
+
+function ff_sbc_enc_process_input_8s_neon, export=1
+        @ parameters
+        @ r0 = positioin
+        @ r1 = pcm
+        @ r2 = X
+        @ r3 = nsamples
+        @ r4 = nchannels
+        @ local variables
+        @ r5 = ff_sbc_input_perm_8
+        @ r6 = src
+        @ r7 = dst
+
+        push            {r1, r3-r7}
+        ldr             r4,  [sp, #24]
+        movrelx         r5,  X(ff_sbc_input_perm_8)
+
+        @ handle X buffer wraparound
+        cmp             r0,  r3
+        bge             1f                     @ if (position < nsamples)
+        add             r7,  r2,  #512         @ &X[0][SBC_X_BUFFER_SIZE - 72]
+        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1}, [r6, :128]!
+        vst1.16         {d0, d1}, [r7, :128]!
+        cmp             r4,  #1
+        ble             2f                     @ if (nchannels > 1)
+        add             r7,  r2,  #1168        @ &X[1][SBC_X_BUFFER_SIZE - 72]
+        add             r6,  r2,  #656
+        add             r6,  r6,  r0, lsl#1    @ &X[1][position]
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
+        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
+        vld1.16         {d0, d1}, [r6, :128]!
+        vst1.16         {d0, d1}, [r7, :128]!
+2:
+        mov             r0,  #256              @ SBC_X_BUFFER_SIZE - 72
+1:
+
+        add             r6,  r2,  r0, lsl#1    @ &X[0][position]
+        add             r7,  r6,  #656         @ &X[1][position]
+
+        cmp             r4,  #1
+        ble             8f                     @ if (nchannels > 1)
+        tst             r1,  #1
+        beq             7f                     @ if (pcm & 1)
+        @ poor 'pcm' alignment
+        vld1.8          {d0, d1, d2, d3}, [r5, :128]
+1:
+        sub             r6,  r6,  #32
+        sub             r7,  r7,  #32
+        sub             r0,  r0,  #16
+        vld1.8          {d4, d5, d6, d7}, [r1]!
+        vuzp.16         q2,  q3
+        vld1.8          {d20, d21, d22, d23}, [r1]!
+        vuzp.16         q10, q11
+        vswp            q3,  q10
+        vtbl.8          d16, {d4, d5, d6, d7}, d0
+        vtbl.8          d17, {d4, d5, d6, d7}, d1
+        vtbl.8          d18, {d4, d5, d6, d7}, d2
+        vtbl.8          d19, {d4, d5, d6, d7}, d3
+        vst1.16         {d16, d17, d18, d19}, [r6, :128]
+        vtbl.8          d16, {d20, d21, d22, d23}, d0
+        vtbl.8          d17, {d20, d21, d22, d23}, d1
+        vtbl.8          d18, {d20, d21, d22, d23}, d2
+        vtbl.8          d19, {d20, d21, d22, d23}, d3
+        vst1.16         {d16, d17, d18, d19}, [r7, :128]
+        subs            r3,  r3,  #16
+        bgt             1b
+        b 9f
+7:
+        @ proper 'pcm' alignment
+        vld1.8          {d0, d1, d2, d3}, [r5, :128]
+1:
+        sub             r6,  r6,  #32
+        sub             r7,  r7,  #32
+        sub             r0,  r0,  #16
+        vld2.16         {d4, d5, d6, d7}, [r1]!
+        vld2.16         {d20, d21, d22, d23}, [r1]!
+        vswp            q3,  q10
+        vtbl.8          d16, {d4, d5, d6, d7}, d0
+        vtbl.8          d17, {d4, d5, d6, d7}, d1
+        vtbl.8          d18, {d4, d5, d6, d7}, d2
+        vtbl.8          d19, {d4, d5, d6, d7}, d3
+        vst1.16         {d16, d17, d18, d19}, [r6, :128]
+        vtbl.8          d16, {d20, d21, d22, d23}, d0
+        vtbl.8          d17, {d20, d21, d22, d23}, d1
+        vtbl.8          d18, {d20, d21, d22, d23}, d2
+        vtbl.8          d19, {d20, d21, d22, d23}, d3
+        vst1.16         {d16, d17, d18, d19}, [r7, :128]
+        subs            r3,  r3,  #16
+        bgt             1b
+        b               9f
+8:
+        @ mono
+        vld1.8          {d0, d1, d2, d3}, [r5, :128]
+1:
+        sub             r6,  r6,  #32
+        sub             r0,  r0,  #16
+        vld1.8          {d4, d5, d6, d7}, [r1]!
+        vtbl.8          d16, {d4, d5, d6, d7}, d0
+        vtbl.8          d17, {d4, d5, d6, d7}, d1
+        vtbl.8          d18, {d4, d5, d6, d7}, d2
+        vtbl.8          d19, {d4, d5, d6, d7}, d3
+        vst1.16         {d16, d17, d18, d19}, [r6, :128]
+        subs            r3,  r3,  #16
+        bgt             1b
+9:
+        pop             {r1, r3-r7}
+        bx              lr
+endfunc
diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
index 4da7967..4fb69f9 100644
--- a/libavcodec/arm/sbrdsp_init_arm.c
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S
index 610397f..003b04e 100644
--- a/libavcodec/arm/sbrdsp_neon.S
+++ b/libavcodec/arm/sbrdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -336,11 +336,11 @@ function ff_sbr_hf_apply_noise_0_neon, export=1
         vld1.32         {d0},     [r0,:64]
         vld1.32         {d6},     [lr,:64]
         vld1.32         {d2[]},   [r1,:32]!
-        vld1.32         {d3[]},   [r2,:32]!
+        vld1.32         {d18[]},  [r2,:32]!
         vceq.f32        d4,  d2,  #0
         veor            d2,  d2,  d3
         vmov            d1,  d0
-        vmla.f32        d0,  d6,  d3
+        vmla.f32        d0,  d6,  d18
         vadd.f32        s2,  s2,  s4
         vbif            d0,  d1,  d4
         vst1.32         {d0},     [r0,:64]!
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index a651927..42d79ab 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -4,22 +4,22 @@
  * Author: Frederic Boulay <dilb@handhelds.org>
  *
  * The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the Libav project.
+ * from the libavcodec library part of the FFmpeg project.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index b196833..a8d0346 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index a8de990..f95c20d 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index 9e0a97a..726d4cb 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -6,20 +6,20 @@
  * Based on Simple IDCT
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h
index d7996c1..cf25d9d 100644
--- a/libavcodec/arm/startcode.h
+++ b/libavcodec/arm/startcode.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
index 64078b2..a46f009 100644
--- a/libavcodec/arm/startcode_armv6.S
+++ b/libavcodec/arm/startcode_armv6.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c
index bf0d9b4..ea0ce14 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/synth_filter_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,20 +22,9 @@
 
 #include "libavutil/arm/cpu.h"
 #include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
-void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs);
-
-void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
-                                SynthFilterContext *synth, FFTContext *imdct,
-                                float synth_buf_ptr[512],
-                                int *synth_buf_offset, float synth_buf2[32],
-                                const float window[512], float *samples_out,
-                                float raXin[32], float scale);
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
 
 void ff_synth_filter_float_vfp(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
@@ -49,21 +38,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float out[32], const float in[32],
                                 float scale);
 
-av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_vfp_vm(cpu_flags)) {
-        s->lfe_fir[0]      = ff_dca_lfe_fir32_vfp;
-        s->lfe_fir[1]      = ff_dca_lfe_fir64_vfp;
-        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
-    }
-    if (have_neon(cpu_flags)) {
-        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
-        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
-    }
-}
-
 av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index 62bb667..5417be7 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index 5d79e50..596734c 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vc1dsp.h b/libavcodec/arm/vc1dsp.h
index 30f059f..cd01ac5 100644
--- a/libavcodec/arm/vc1dsp.h
+++ b/libavcodec/arm/vc1dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index a6a97c8..5f2c759 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_ARMV6
     if (have_setend(cpu_flags))
         dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
     if (have_neon(cpu_flags))
         ff_vc1dsp_init_neon(dsp);
 }
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 1c06597..2cca784 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,40 +35,38 @@ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int rnd);
 
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+                                      ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+                                         ptrdiff_t stride, int rnd) \
+{ \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+  dst += 8*stride; src += 8*stride; \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
 
 void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
@@ -79,6 +77,10 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
 
+#define FN_ASSIGN(X, Y) \
+    dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+    dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+
 av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
 {
     dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
@@ -90,22 +92,25 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
 
-    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = ff_put_vc1_mspel_mc01_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = ff_put_vc1_mspel_mc11_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = ff_put_vc1_mspel_mc21_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = ff_put_vc1_mspel_mc31_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = ff_put_vc1_mspel_mc02_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = ff_put_vc1_mspel_mc12_neon;
-    dsp->put_vc1_mspel_pixels_tab[10] = ff_put_vc1_mspel_mc22_neon;
-    dsp->put_vc1_mspel_pixels_tab[11] = ff_put_vc1_mspel_mc32_neon;
-    dsp->put_vc1_mspel_pixels_tab[12] = ff_put_vc1_mspel_mc03_neon;
-    dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
-    dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
-    dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
+    dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
+    FN_ASSIGN(1, 0);
+    FN_ASSIGN(2, 0);
+    FN_ASSIGN(3, 0);
+
+    FN_ASSIGN(0, 1);
+    FN_ASSIGN(1, 1);
+    FN_ASSIGN(2, 1);
+    FN_ASSIGN(3, 1);
+
+    FN_ASSIGN(0, 2);
+    FN_ASSIGN(1, 2);
+    FN_ASSIGN(2, 2);
+    FN_ASSIGN(3, 2);
+
+    FN_ASSIGN(0, 3);
+    FN_ASSIGN(1, 3);
+    FN_ASSIGN(2, 3);
+    FN_ASSIGN(3, 3);
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 71cc3f4..93f043b 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_arm.h b/libavcodec/arm/videodsp_arm.h
index a708759..112cbb8 100644
--- a/libavcodec/arm/videodsp_arm.h
+++ b/libavcodec/arm/videodsp_arm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_armv5te.S b/libavcodec/arm/videodsp_armv5te.S
index 0510019..aff1161 100644
--- a/libavcodec/arm/videodsp_armv5te.S
+++ b/libavcodec/arm/videodsp_armv5te.S
@@ -2,20 +2,20 @@
 @ ARMv5te-optimized core video DSP functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/videodsp_init_arm.c b/libavcodec/arm/videodsp_init_arm.c
index 20c6e4a..a89abb2 100644
--- a/libavcodec/arm/videodsp_init_arm.c
+++ b/libavcodec/arm/videodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c
index 832191f..1ea1f34 100644
--- a/libavcodec/arm/videodsp_init_armv5te.c
+++ b/libavcodec/arm/videodsp_init_armv5te.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,5 +27,7 @@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
 
 av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
 {
+#if HAVE_ARMV5TE_EXTERNAL
     ctx->prefetch = ff_prefetch_arm;
+#endif
 }
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/vorbisdsp_init_arm.c
index 853ba2d..f4b3d80 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/vorbisdsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vorbisdsp_neon.S b/libavcodec/arm/vorbisdsp_neon.S
index 7df876c..79ce54f 100644
--- a/libavcodec/arm/vorbisdsp_neon.S
+++ b/libavcodec/arm/vorbisdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 1c91434..65ea53f 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 58bd97d..2942d48 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index 6bc9456..feb1247 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c
index 7e26150..a59d612 100644
--- a/libavcodec/arm/vp6dsp_init_arm.c
+++ b/libavcodec/arm/vp6dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S
index 10b4d0f..03dd28d 100644
--- a/libavcodec/arm/vp6dsp_neon.S
+++ b/libavcodec/arm/vp6dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
index 93b2788..965342d 100644
--- a/libavcodec/arm/vp8.h
+++ b/libavcodec/arm/vp8.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 3863dc3..e7d25a4 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp.h b/libavcodec/arm/vp8dsp.h
index 0d55e0f..7281d0b 100644
--- a/libavcodec/arm/vp8dsp.h
+++ b/libavcodec/arm/vp8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
index 9eb9734..2320bf4 100644
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@@ -5,20 +5,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * This code was partially ported from libvpx, which uses this license:
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index aa77dba..8b80176 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c
index febe4e7..a5bcd73 100644
--- a/libavcodec/arm/vp8dsp_init_armv6.c
+++ b/libavcodec/arm/vp8dsp_init_armv6.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c
index 2b6c775..53f1f23 100644
--- a/libavcodec/arm/vp8dsp_init_neon.c
+++ b/libavcodec/arm/vp8dsp_init_neon.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index b707d19..7cedfc2 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp9dsp_init.h b/libavcodec/arm/vp9dsp_init.h
new file mode 100644
index 0000000..0dc1c2d
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP9DSP_INIT_H
+#define AVCODEC_ARM_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_arm(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_arm(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_ARM_VP9DSP_INIT_H */
diff --git a/libavcodec/arm/vp9dsp_init_10bpp_arm.c b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
new file mode 100644
index 0000000..b8cb293
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_12bpp_arm.c b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
new file mode 100644
index 0000000..fa65eb2
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
new file mode 100644
index 0000000..1b00078
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/arm/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix)                                          \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp)                                                   \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp)                                      \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src,                 \
+                                                ptrdiff_t src_stride,               \
+                                                int h, int mx, int my)              \
+{                                                                                   \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]);         \
+    /* We only need h + 7 lines, but the horizontal filter assumes an               \
+     * even number of rows, so filter h + 8 lines here. */                          \
+    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz,                          \
+                                             src - 3 * src_stride, src_stride,      \
+                                             h + 8, mx, 0);                         \
+    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride,                    \
+                                                temp + 3 * 2 * sz, 2 * sz,          \
+                                                h, 0, my);                          \
+}
+
+#define decl_filter_funcs(op, dir, sz, bpp)  \
+    decl_mc_func(op, regular, dir, sz, bpp); \
+    decl_mc_func(op, sharp,   dir, sz, bpp); \
+    decl_mc_func(op, smooth,  dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp)      \
+    decl_filter_funcs(put, h,  sz, bpp); \
+    decl_filter_funcs(avg, h,  sz, bpp); \
+    decl_filter_funcs(put, v,  sz, bpp); \
+    decl_filter_funcs(avg, v,  sz, bpp); \
+    decl_filter_funcs(put, hv, sz, bpp); \
+    decl_filter_funcs(avg, hv, sz, bpp)
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64,  );
+declare_fpel(copy, 32,  );
+declare_fpel(copy, 16,  );
+declare_fpel(copy, 8,   );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8,  _16);
+declare_fpel(avg, 4,  _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8,  BPP);
+decl_mc_funcs(4,  BPP);
+
+#define define_8tap_2d_funcs(sz, bpp)        \
+    define_8tap_2d_fn(put, regular, sz, bpp) \
+    define_8tap_2d_fn(put, sharp,   sz, bpp) \
+    define_8tap_2d_fn(put, smooth,  sz, bpp) \
+    define_8tap_2d_fn(avg, regular, sz, bpp) \
+    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
+    define_8tap_2d_fn(avg, smooth,  sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8,  BPP)
+define_8tap_2d_funcs(4,  BPP)
+
+
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type, suffix)      \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix##_neon
+
+#define init_copy_avg(idx, sz1, sz2) \
+    init_fpel(idx, 0, sz2, copy, );  \
+    init_fpel(idx, 1, sz1, avg, _16)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp)                                   \
+    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp)            \
+    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)
+
+        init_copy_avg(0, 64, 128);
+        init_copy_avg(1, 32, 64);
+        init_copy_avg(2, 16, 32);
+        init_copy_avg(3, 8,  16);
+        init_copy_avg(4, 4,  8);
+
+        init_mc_funcs_dirs(0, 64, BPP);
+        init_mc_funcs_dirs(1, 32, BPP);
+        init_mc_funcs_dirs(2, 16, BPP);
+        init_mc_funcs_dirs(3, 8,  BPP);
+        init_mc_funcs_dirs(4, 4,  BPP);
+    }
+}
+
+#define define_itxfm2(type_a, type_b, sz, bpp)                                     \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst,    \
+                                                                 ptrdiff_t stride, \
+                                                                 int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
+
+#define define_itxfm_funcs(sz, bpp)      \
+    define_itxfm(idct,  idct,  sz, bpp); \
+    define_itxfm(iadst, idct,  sz, bpp); \
+    define_itxfm(idct,  iadst, sz, bpp); \
+    define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4,  BPP);
+define_itxfm_funcs(8,  BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4,  BPP);
+
+
+static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp)                                               \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp)     \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
+
+        init_itxfm(TX_4X4,   4x4,   BPP);
+        init_itxfm(TX_8X8,   8x8,   BPP);
+        init_itxfm(TX_16X16, 16x16, BPP);
+        init_idct(TX_32X32, idct_idct_32x32, BPP);
+        init_idct(4,        iwht_iwht_4x4,   BPP);
+    }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+    define_loop_filter(h, wd, size, bpp);  \
+    define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4,  8,  BPP);
+define_loop_filters(8,  8,  BPP);
+define_loop_filters(16, 8,  BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
+
+static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+    init_lpf_func_8(idx, 0, h, wd, bpp);  \
+    init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp)   \
+    init_lpf_func_16(0, h, bpp); \
+    init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);  \
+    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp)        \
+    init_lpf_funcs_8_wd(0, 4,  bpp); \
+    init_lpf_funcs_8_wd(1, 8,  bpp); \
+    init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp)           \
+    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+        init_lpf_funcs_8(BPP);
+        init_lpf_funcs_16(BPP);
+        init_lpf_funcs_mix2(BPP);
+    }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+    vp9dsp_mc_init_arm(dsp);
+    vp9dsp_loopfilter_init_arm(dsp);
+    vp9dsp_itxfm_init_arm(dsp);
+}
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index 1ede170..cb7f48d 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -1,28 +1,30 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "libavutil/arm/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
 
 #define declare_fpel(type, sz)                                          \
 void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
@@ -239,8 +241,17 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
     }
 }
 
-av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp)
 {
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_arm(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_arm(dsp);
+        return;
+    } else if (bpp != 8)
+        return;
+
     vp9dsp_mc_init_arm(dsp);
     vp9dsp_loopfilter_init_arm(dsp);
     vp9dsp_itxfm_init_arm(dsp);
diff --git a/libavcodec/arm/vp9itxfm_16bpp_neon.S b/libavcodec/arm/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000..b4f615e
--- /dev/null
+++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+@ Do two 4x4 transposes, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq1, r2,r3 == rq1, etc
+.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        vswp             \r1,  \r4  @ vtrn.64 \rq0, \rq2
+        vswp             \r3,  \r6  @ vtrn.64 \rq1, \rq3
+        vswp             \r9,  \r12 @ vtrn.64 \rq4, \rq6
+        vswp             \r11, \r14 @ vtrn.64 \rq5, \rq7
+        vtrn.32          \rq0, \rq1
+        vtrn.32          \rq2, \rq3
+        vtrn.32          \rq4, \rq5
+        vtrn.32          \rq6, \rq7
+.endm
+
+@ Do eight 2x2 transposes.
+.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        vtrn.32          \r0,  \r1
+        vtrn.32          \r2,  \r3
+        vtrn.32          \r4,  \r5
+        vtrn.32          \r6,  \r7
+        vtrn.32          \r8,  \r9
+        vtrn.32          \r10, \r11
+        vtrn.32          \r12, \r13
+        vtrn.32          \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+        vadd.s32        \tmpd1, \in1,  \in2
+        vsub.s32        \tmpd2, \in1,  \in2
+.if \neg > 0
+        vneg.s32        \tmpd1, \tmpd1
+.endif
+        vmull.s32       \tmpq3, \tmpd1, d0[0]
+        vmull.s32       \tmpq4, \tmpd2, d0[0]
+        vrshrn.s64      \out1, \tmpq3, #14
+        vrshrn.s64      \out2, \tmpq4, #14
+.endm
+
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+        vmull.s32       \tmpq3, \in1, d0[0]
+        vrshrn.s64      \out1, \tmpq3, #14
+        vrshrn.s64      \out2, \tmpq3, #14
+.endm
+
+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This can do with either 4 or 6 temporary q registers.
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+        vadd.s32        \tmpq1, \in1,  \in2
+        vsub.s32        \tmpq2, \in1,  \in2
+        vmull.s32       \tmpq3, \tmpd11, d0[0]
+        vmull.s32       \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+        vrshrn.s64      \out1, \tmpq3, #14
+        vrshrn.s64      \out2, \tmpq4, #14
+        vmull.s32       \tmpq3, \tmpd21, d0[0]
+        vmull.s32       \tmpq4, \tmpd22, d0[0]
+        vrshrn.s64      \out3, \tmpq3, #14
+        vrshrn.s64      \out4, \tmpq4, #14
+.else
+        vmull.s32       \tmpq5, \tmpd21, d0[0]
+        vmull.s32       \tmpq6, \tmpd22, d0[0]
+        vrshrn.s64      \out1, \tmpq3, #14
+        vrshrn.s64      \out2, \tmpq4, #14
+        vrshrn.s64      \out3, \tmpq5, #14
+        vrshrn.s64      \out4, \tmpq6, #14
+.endif
+.endm
+
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0
+        vmull.s32       \out1, \in1, \coef1
+        vmlsl.s32       \out1, \in2, \coef2
+.if \neg
+        vmov.s64        \out2, #0
+        vmlsl.s32       \out2, \in1, \coef2
+        vmlsl.s32       \out2, \in2, \coef1
+.else
+        vmull.s32       \out2, \in1, \coef2
+        vmlal.s32       \out2, \in2, \coef1
+.endif
+.endm
+
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+        vmull.s32       \out1, \in1, \coef1
+        vmull.s32       \out2, \in2, \coef1
+        vmull.s32       \out3, \in1, \coef2
+        vmull.s32       \out4, \in2, \coef2
+        vmlsl.s32       \out1, \in3, \coef2
+        vmlsl.s32       \out2, \in4, \coef2
+        vmlal.s32       \out3, \in3, \coef1
+        vmlal.s32       \out4, \in4, \coef1
+.endm
+
+@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+@ inout are 2 d registers, tmp are 2 q registers
+.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
+        mbutterfly_l    \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg
+        vrshrn.s64      \inout1, \tmp1,  #14
+        vrshrn.s64      \inout2, \tmp2,  #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+        vmull.s32       \tmp1,   \inout1, \coef1
+        vmull.s32       \tmp2,   \inout1, \coef2
+        vrshrn.s64      \inout1, \tmp1,   #14
+        vrshrn.s64      \inout2, \tmp2,   #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+        vmov.s64        \tmp1,   #0
+        vmull.s32       \tmp2,   \inout2, \coef1
+        vmlsl.s32       \tmp1,   \inout2, \coef2
+        vrshrn.s64      \inout2, \tmp2,   #14
+        vrshrn.s64      \inout1, \tmp1,   #14
+.endm
+
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+        vrshrn.s64      \inout1, \tmp1,  #14
+        vrshrn.s64      \inout2, \tmp2,  #14
+        vrshrn.s64      \inout3, \tmp3,  #14
+        vrshrn.s64      \inout4, \tmp4,  #14
+.endm
+
+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+        vadd.s32        \out1, \in1, \in2
+        vsub.s32        \out2, \in1, \in2
+.endm
+
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+        vsub.s32        \out1, \in1, \in2
+        vadd.s32        \out2, \in1, \in2
+.endm
+
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+        vadd.s64        \tmp1, \in1, \in2
+        vsub.s64        \tmp2, \in1, \in2
+        vrshrn.s64      \out1, \tmp1,  #14
+        vrshrn.s64      \out2, \tmp2,  #14
+.endm
+
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        vadd.s64        \tmp1, \in1, \in3
+        vadd.s64        \tmp2, \in2, \in4
+        vsub.s64        \tmp3, \in1, \in3
+        vsub.s64        \tmp4, \in2, \in4
+        vrshrn.s64      \out1, \tmp1,  #14
+        vrshrn.s64      \out2, \tmp2,  #14
+        vrshrn.s64      \out3, \tmp3,  #14
+        vrshrn.s64      \out4, \tmp4,  #14
+.endm
+
+
+.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+        vadd.i32        \c0,  \c0,  \c1
+        vsub.i32        q11,  \c2,  \c3
+        vsub.i32        q10,  \c0,  q11
+        vshr.s32        q10,  q10,  #1
+        vsub.i32        \c2,  q10,  \c1
+        vsub.i32        \c1,  q10,  \c3
+        vadd.i32        \c3,  q11,  \c2
+        vsub.i32        \c0,  \c0,  \c1
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+        iwht4_10        \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7
+.endm
+
+@ c0 == cd0,cd1, c1 == cd2,cd3
+.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+        vmul.s32        q13,  \c1,  d1[1]
+        vmul.s32        q11,  \c1,  d1[0]
+        vadd.i32        q14,  \c0,  \c2
+        vsub.i32        q15,  \c0,  \c2
+        vmla.s32        q13,  \c3,  d1[0]
+        vmul.s32        q12,  q14,  d0[0]
+        vmul.s32        q10,  q15,  d0[0]
+        vmls.s32        q11,  \c3,  d1[1]
+        vrshr.s32       q13,  q13,  #14
+        vrshr.s32       q12,  q12,  #14
+        vrshr.s32       q10,  q10,  #14
+        vrshr.s32       q11,  q11,  #14
+        vadd.i32        \c0,  q12,  q13
+        vsub.i32        \c3,  q12,  q13
+        vadd.i32        \c1,  q10,  q11
+        vsub.i32        \c2,  q10,  q11
+.endm
+
+.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+        vmull.s32       q13,  \cd2, d1[1]
+        vmull.s32       q15,  \cd3, d1[1]
+        vmull.s32       q11,  \cd2, d1[0]
+        vmull.s32       q3,   \cd3, d1[0]
+        vadd.i32        q14,  \c0,  \c2
+        vsub.i32        q2,   \c0,  \c2
+        vmlal.s32       q13,  \cd6, d1[0]
+        vmlal.s32       q15,  \cd7, d1[0]
+        vmull.s32       q12,  d28,  d0[0]
+        vmull.s32       q14,  d29,  d0[0]
+        vmull.s32       q10,  d4,   d0[0]
+        vmull.s32       q8,   d5,   d0[0]
+        vmlsl.s32       q11,  \cd6, d1[1]
+        vmlsl.s32       q3,   \cd7, d1[1]
+        vrshrn.s64      d26,  q13,  #14
+        vrshrn.s64      d27,  q15,  #14
+        vrshrn.s64      d24,  q12,  #14
+        vrshrn.s64      d25,  q14,  #14
+        vrshrn.s64      d20,  q10,  #14
+        vrshrn.s64      d21,  q8,   #14
+        vrshrn.s64      d22,  q11,  #14
+        vrshrn.s64      d23,  q3,   #14
+        vadd.i32        \c0,  q12,  q13
+        vsub.i32        \c3,  q12,  q13
+        vadd.i32        \c1,  q10,  q11
+        vsub.i32        \c2,  q10,  q11
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+        vmul.s32        q10,  \c0,  d2[0]
+        vmla.s32        q10,  \c2,  d2[1]
+        vmla.s32        q10,  \c3,  d3[0]
+        vmul.s32        q11,  \c0,  d3[0]
+        vmls.s32        q11,  \c2,  d2[0]
+        vsub.s32        \c0,  \c0,  \c2
+        vmls.s32        q11,  \c3,  d2[1]
+        vadd.s32        \c0,  \c0,  \c3
+        vmul.s32        q13,  \c1,  d3[1]
+        vmul.s32        q12,  \c0,  d3[1]
+        vadd.s32        q14,  q10,  q13
+        vadd.s32        q15,  q11,  q13
+        vrshr.s32       \c0,  q14,  #14
+        vadd.s32        q10,  q10,  q11
+        vrshr.s32       \c1,  q15,  #14
+        vsub.s32        q10,  q10,  q13
+        vrshr.s32       \c2,  q12,  #14
+        vrshr.s32       \c3,  q10,  #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+        vmull.s32       q10,  \cd0, d2[0]
+        vmull.s32       q4,   \cd1, d2[0]
+        vmlal.s32       q10,  \cd4, d2[1]
+        vmlal.s32       q4,   \cd5, d2[1]
+        vmlal.s32       q10,  \cd6, d3[0]
+        vmlal.s32       q4,   \cd7, d3[0]
+        vmull.s32       q11,  \cd0, d3[0]
+        vmull.s32       q5,   \cd1, d3[0]
+        vmlsl.s32       q11,  \cd4, d2[0]
+        vmlsl.s32       q5,   \cd5, d2[0]
+        vsub.s32        \c0,  \c0,  \c2
+        vmlsl.s32       q11,  \cd6, d2[1]
+        vmlsl.s32       q5,   \cd7, d2[1]
+        vadd.s32        \c0,  \c0,  \c3
+        vmull.s32       q13,  \cd2, d3[1]
+        vmull.s32       q6,   \cd3, d3[1]
+        vmull.s32       q12,  \cd0, d3[1]
+        vmull.s32       q7,   \cd1, d3[1]
+        vadd.s64        q14,  q10,  q13
+        vadd.s64        q2,   q4,   q6
+        vadd.s64        q15,  q11,  q13
+        vadd.s64        q3,   q5,   q6
+        vrshrn.s64      \cd1, q2,   #14
+        vrshrn.s64      \cd0, q14,  #14
+        vadd.s64        q10,  q10,  q11
+        vadd.s64        q4,   q4,   q5
+        vrshrn.s64      \cd3, q3,   #14
+        vrshrn.s64      \cd2, q15,  #14
+        vsub.s64        q10,  q10,  q13
+        vsub.s64        q4,   q4,   q6
+        vrshrn.s64      \cd4, q12,  #14
+        vrshrn.s64      \cd5, q7,   #14
+        vrshrn.s64      \cd6, q10,  #14
+        vrshrn.s64      \cd7, q4,   #14
+.endm
+
+@ The public functions in this file have got the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          r12, itxfm4_coeffs
+        vld1.16         {d0}, [r12,:64]
+        vmovl.s16       q0,  d0
+.endif
+.ifc \txfm1,iadst
+        movrel          r12, iadst4_coeffs
+        vld1.16         {d1}, [r12,:64]
+        vmovl.s16       q1,  d1
+.endif
+.else
+        movrel          r12, itxfm4_coeffs
+        vld1.16         {q0}, [r12,:128]
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+.endif
+.if \bpp > 10
+.ifnc \txfm1\()_\txfm2,idct_idct
+        @ iadst4_12 needs q4-q7
+        vpush           {q4-q7}
+.endif
+.endif
+
+        vmov.i32        q14, #0
+        vmov.i32        q15, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             r3,  #1
+        bne             1f
+        @ DC-only for idct/idct
+        vld1.32         {d4[]},   [r2,:32]
+        vmull.s32       q2,  d4,  d0[0]
+        vrshrn.s64      d4,  q2,  #14
+        vmull.s32       q2,  d4,  d0[0]
+        vrshrn.s64      d4,  q2,  #14
+        vst1.32         {d30[0]}, [r2,:32]
+        vdup.32         q2,  d4[0]
+        vmov            q3,  q2
+        vmov            q8,  q2
+        vmov            q9,  q2
+        b               2f
+.endif
+
+1:
+        vld1.32         {q2-q3},   [r2,:128]
+        vst1.32         {q14-q15}, [r2,:128]!
+        vld1.32         {q8-q9},   [r2,:128]
+
+.ifc \txfm1,iwht
+        vshr.s32        q2,  q2,  #2
+        vshr.s32        q3,  q3,  #2
+        vshr.s32        q8,  q8,  #2
+        vshr.s32        q9,  q9,  #2
+.endif
+
+        vst1.16         {q14-q15}, [r2,:128]!
+        \txfm1\()4_\bpp q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
+
+        @ Transpose 4x4 with 32 bit elements
+        vtrn.32         q2,  q3
+        vtrn.32         q8,  q9
+        vswp            d5,  d16
+        vswp            d7,  d18
+
+        \txfm2\()4_\bpp q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
+2:
+        vmvn.u16        q15, #((0xffff << \bpp) & 0xffff)
+        vld1.16         {d0},  [r0,:64], r1
+        vld1.16         {d1},  [r0,:64], r1
+.ifnc \txfm1,iwht
+        vrshr.s32       q2,  q2,  #4
+        vrshr.s32       q3,  q3,  #4
+        vrshr.s32       q8,  q8,  #4
+        vrshr.s32       q9,  q9,  #4
+.endif
+        vaddw.u16       q2,  q2,  d0
+        vaddw.u16       q3,  q3,  d1
+        vld1.16         {d2},  [r0,:64], r1
+        vld1.16         {d3},  [r0,:64], r1
+        vqmovun.s32     d0,  q2
+        vqmovun.s32     d1,  q3
+        sub             r0,  r0,  r1, lsl #2
+
+        vaddw.u16       q8,  q8,  d2
+        vmin.u16        q0,  q0,  q15
+        vaddw.u16       q9,  q9,  d3
+        vst1.16         {d0},  [r0,:64], r1
+        vqmovun.s32     d2,  q8
+        vqmovun.s32     d3,  q9
+        vmin.u16        q1,  q1,  q15
+
+        vst1.16         {d1},  [r0,:64], r1
+        vst1.16         {d2},  [r0,:64], r1
+        vst1.16         {d3},  [r0,:64], r1
+
+.if \bpp > 10
+.ifnc \txfm1\()_\txfm2,idct_idct
+        vpop            {q4-q7}
+.endif
+.endif
+        bx              lr
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct,  idct,  \bpp
+itxfm_func4x4 iadst, idct,  \bpp
+itxfm_func4x4 idct,  iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht,  iwht,  \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+.macro idct8
+        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+        dmbutterfly     d20, d21, d28, d29, d1[0], d1[1], q2,  q3,  q4,  q5 @ q10 = t2a, q14 = t3a
+        dmbutterfly     d18, d19, d30, d31, d2[0], d2[1], q2,  q3,  q4,  q5 @ q9  = t4a, q15 = t7a
+        dmbutterfly     d26, d27, d22, d23, d3[0], d3[1], q2,  q3,  q4,  q5 @ q13 = t5a, q11 = t6a
+
+        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
+        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
+        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
+        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
+
+        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
+
+        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
+
+        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
+        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
+        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
+.endm
+
+.macro iadst8
+        movrel          r12, iadst8_coeffs
+        vld1.16         {q1}, [r12,:128]!
+        vmovl.s16       q0,  d2
+        vmovl.s16       q1,  d3
+
+        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d0[1], d0[0] @ q4,q5  = t1a, q2,q3 = t0a
+        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d2[1], d2[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
+
+        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
+
+        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d1[1], d1[0] @ q6,q7 = t3a, q4,q5 = t2a
+        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[1], d3[0] @ q10,q13 = t7a, q8,q15 = t6a
+
+        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5 @ q9 = t2, q4 = t6
+        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7 @ q8 = t3, q6 = t7
+
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+
+        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
+        vneg.s32        q15, q15          @ q15 = out[7]
+        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
+
+        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d1[0], d1[1] @ q10,q11 = t5a, q5,q7 = t4a
+        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d1[1], d1[0] @ q2,q3 = t6a, q13,q14 = t7a
+
+        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
+
+        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+        vneg.s32        q11, q11      @ q11 = out[3]
+
+        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3 @ q9 = -out[1], q2 = t6
+        vneg.s32        q9,  q9       @ q9 = out[1]
+
+        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
+        vneg.s32        q13, q13      @ q13 = out[5]
+.endm
+
+function idct8x8_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        vmov.i32        q2,  #0
+        vmovl.s16       q0,  d0
+
+        vld1.32         {d16[]}, [r2,:32]
+        vmull.s32       q8,  d16, d0[0]
+        vrshrn.s64      d16, q8,  #14
+        vmull.s32       q8,  d16, d0[0]
+        vrshrn.s64      d16, q8,  #14
+        vdup.32         q8,  d16[0]
+        vst1.32         {d4[0]}, [r2,:32]
+
+        vrshr.s32       q8,  q8,  #5
+        vdup.s16        q15, r8
+
+        mov             r3,  r0
+        mov             r12, #8
+1:
+        @ Loop to add the constant from q8 into all 8x8 outputs
+        subs            r12, r12, #2
+        vld1.16         {q2},  [r0,:128], r1
+        vaddw.u16       q10, q8,  d4
+        vld1.16         {q3},  [r0,:128], r1
+        vaddw.u16       q11, q8,  d5
+        vaddw.u16       q12, q8,  d6
+        vaddw.u16       q13, q8,  d7
+        vqmovun.s32     d4,  q10
+        vqmovun.s32     d5,  q11
+        vqmovun.s32     d6,  q12
+        vqmovun.s32     d7,  q13
+        vmin.u16        q2,  q2,  q15
+        vst1.16         {q2},  [r3,:128], r1
+        vmin.u16        q3,  q3,  q15
+        vst1.16         {q3},  [r3,:128], r1
+        bne             1b
+
+        pop             {r4-r8,pc}
+endfunc
+.ltorg
+
+.macro itxfm8_1d_funcs txfm
+@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it,
+@ transpose into a horizontal 8x4 slice and store.
+@ r0 = dst (temp buffer)
+@ r1 = slice offset
+@ r2 = src
+function \txfm\()8_1d_4x8_pass1_neon
+        mov             r12, #32
+        vmov.s32        q2,  #0
+.irp i, 8, 9, 10, 11, 12, 13, 14, 15
+        vld1.32         {q\i}, [r2,:128]
+        vst1.32         {q2},  [r2,:128], r12
+.endr
+
+        \txfm\()8
+
+        @ Do two 4x4 transposes. Originally, q8-q15 contain the
+        @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed
+        @ 4x4 blocks.
+        transpose32_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the transposed 4x4 blocks horizontally.
+        cmp             r1,  #4
+        beq             1f
+.irp i, 8, 12, 9, 13, 10, 14, 11, 15
+        vst1.32         {q\i}, [r0,:128]!
+.endr
+        bx              lr
+1:
+        @ Special case: For the last input column (r1 == 4),
+        @ which would be stored as the last row in the temp buffer,
+        @ don't store the first 4x4 block, but keep it in registers
+        @ for the first slice of the second pass (where it is the
+        @ last 4x4 block).
+.irp i, 12, 13, 14, 15
+        add             r0,  r0,  #16
+        vst1.32         {q\i}, [r0,:128]!
+.endr
+        vmov            q12, q8
+        vmov            q13, q9
+        vmov            q14, q10
+        vmov            q15, q11
+        bx              lr
+endfunc
+
+@ Read a vertical 4x8 slice out of a 8x8 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x8 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
+function \txfm\()8_1d_4x8_pass2_neon
+        mov             r12, #32
+.irp i, 8, 9, 10, 11
+        vld1.32         {q\i}, [r2,:128], r12
+.endr
+        cmp             r3,  #0
+        beq             1f
+.irp i, 12, 13, 14, 15
+        vld1.32         {q\i}, [r2,:128], r12
+.endr
+1:
+
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+        \txfm\()8
+
+        vdup.s16        q4,  r8
+.macro load_add_store coef0, coef1, coef2, coef3
+        vld1.16         {d4},   [r0,:64], r1
+        vld1.16         {d5},   [r3,:64], r1
+        vld1.16         {d6},   [r0,:64], r1
+        vld1.16         {d7},   [r3,:64], r1
+
+        vrshr.s32       \coef0, \coef0, #5
+        vrshr.s32       \coef1, \coef1, #5
+        vrshr.s32       \coef2, \coef2, #5
+        vrshr.s32       \coef3, \coef3, #5
+
+        vaddw.u16       \coef0, \coef0, d4
+        vaddw.u16       \coef1, \coef1, d5
+        vaddw.u16       \coef2, \coef2, d6
+        vaddw.u16       \coef3, \coef3, d7
+
+        sub             r0,  r0,  r1, lsl #1
+        sub             r3,  r3,  r1, lsl #1
+
+        vqmovun.s32     d4,  \coef0
+        vqmovun.s32     d5,  \coef1
+        vqmovun.s32     d6,  \coef2
+        vqmovun.s32     d7,  \coef3
+
+        vmin.u16        q2,  q2,  q4
+        vmin.u16        q3,  q3,  q4
+
+        vst1.16         {d4},  [r0,:64], r1
+        vst1.16         {d5},  [r3,:64], r1
+        vst1.16         {d6},  [r0,:64], r1
+        vst1.16         {d7},  [r3,:64], r1
+.endm
+        load_add_store  q8,  q9,  q10, q11
+        load_add_store  q12, q13, q14, q15
+.purgem load_add_store
+
+        bx              lr
+endfunc
+.endm
+
+itxfm8_1d_funcs idct
+itxfm8_1d_funcs iadst
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             r3,  #1
+        beq             idct8x8_dc_add_neon
+.endif
+.ifnc \txfm1\()_\txfm2,idct_idct
+        vpush           {q4-q7}
+.else
+        vpush           {q4-q5}
+.endif
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #256
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+.ifc \txfm1,idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+.endif
+
+.irp i, 0, 4
+        add             r0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 4
+        cmp             r3,  #12
+        ble             1f
+.endif
+.endif
+        mov             r1,  #\i
+        add             r2,  r6,  #(\i*4)
+        bl              \txfm1\()8_1d_4x8_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        @ For all-zero slices in pass 1, set q12-q15 to zero, for the in-register
+        @ passthrough of coefficients to pass 2 and clear the end of the temp buffer
+        vmov.i32        q12, #0
+        vmov.i32        q13, #0
+        vmov.i32        q14, #0
+        vmov.i32        q15, #0
+.rept 4
+        vst1.32         {q12-q13}, [r0,:128]!
+.endr
+3:
+.endif
+.ifc \txfm1\()_\txfm2,iadst_idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+.endif
+.irp i, 0, 4
+        add             r0,  r4,  #(\i*2)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*4)
+        mov             r3,  #\i
+        bl              \txfm2\()8_1d_4x8_pass2_neon
+.endr
+
+        add             sp,  sp,  r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+        vpop            {q4-q7}
+.else
+        vpop            {q4-q5}
+.endif
+        pop             {r4-r8,pc}
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+        push            {r4-r8,lr}
+        movw            r8,  #0x03ff
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+        push            {r4-r8,lr}
+        movw            r8,  #0x0fff
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+function idct16x16_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        vmov.i32        q2,  #0
+        vmovl.s16       q0,  d0
+
+        vld1.32         {d16[]}, [r2,:32]
+        vmull.s32       q8,  d16, d0[0]
+        vrshrn.s64      d16, q8,  #14
+        vmull.s32       q8,  d16, d0[0]
+        vrshrn.s64      d16, q8,  #14
+        vdup.32         q8,  d16[0]
+        vst1.32         {d4[0]}, [r2,:32]
+
+        vrshr.s32       q8,  q8,  #6
+        vdup.s16        q15, r9
+
+        mov             r3,  r0
+        mov             r12, #16
+1:
+        @ Loop to add the constant from q8 into all 16x16 outputs
+        subs            r12, r12, #2
+        vld1.16         {q0-q1},  [r0,:128], r1
+        vaddw.u16       q9,  q8,  d0
+        vaddw.u16       q10, q8,  d1
+        vld1.16         {q2-q3},  [r0,:128], r1
+        vaddw.u16       q11, q8,  d2
+        vaddw.u16       q12, q8,  d3
+        vaddw.u16       q13, q8,  d4
+        vaddw.u16       q14, q8,  d5
+        vqmovun.s32     d0,  q9
+        vaddw.u16       q9,  q8,  d6
+        vqmovun.s32     d1,  q10
+        vaddw.u16       q10, q8,  d7
+        vqmovun.s32     d2,  q11
+        vqmovun.s32     d3,  q12
+        vqmovun.s32     d4,  q13
+        vqmovun.s32     d5,  q14
+        vmin.u16        q0,  q0,  q15
+        vmin.u16        q1,  q1,  q15
+        vqmovun.s32     d6,  q9
+        vqmovun.s32     d7,  q10
+        vst1.16         {q0-q1},  [r3,:128], r1
+        vmin.u16        q2,  q2,  q15
+        vmin.u16        q3,  q3,  q15
+        vst1.16         {q2-q3},  [r3,:128], r1
+        bne             1b
+
+        pop             {r4-r9,pc}
+endfunc
+.ltorg
+
+.macro idct16_end
+        butterfly       d18, d11, d8,  d11               @ d18 = t0a,  d11 = t7a
+        butterfly       d19, d22, d9,  d22               @ d19 = t1a,  d22 = t6
+        butterfly       d8,  d26, d20, d26               @ d8  = t2a,  d26 = t5
+        butterfly       d9,  d10, d28, d10               @ d9  = t3a,  d10 = t4
+        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
+        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
+        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
+        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
+
+        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
+
+        vswp            d27, d29                         @ d27 = t12, d29 = t13a
+        vswp            d28, d27                         @ d28 = t12, d27 = t11
+        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
+        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
+        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
+        butterfly       d23, d24, d11, d20               @ d23 = out[7], d24 = out[8]
+        butterfly       d18, d29, d8,  d29               @ d18 = out[2], d29 = out[13]
+        butterfly       d19, d28, d9,  d28               @ d19 = out[3], d28 = out[12]
+        vmov            d8,  d21                         @ d8  = t10a
+        butterfly       d20, d27, d10, d27               @ d20 = out[4], d27 = out[11]
+        butterfly       d21, d26, d26, d8                @ d21 = out[5], d26 = out[10]
+        bx              lr
+.endm
+
+function idct16
+        mbutterfly0     d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
+        mbutterfly      d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
+        mbutterfly      d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
+        mbutterfly      d26, d22, d3[0], d3[1], q4,  q5  @ d26 = t5a,  d22 = t6a
+        mbutterfly      d17, d31, d4[0], d4[1], q4,  q5  @ d17 = t8a,  d31 = t15a
+        mbutterfly      d25, d23, d5[0], d5[1], q4,  q5  @ d25 = t9a,  d23 = t14a
+        mbutterfly      d21, d27, d6[0], d6[1], q4,  q5  @ d21 = t10a, d27 = t13a
+        mbutterfly      d29, d19, d7[0], d7[1], q4,  q5  @ d29 = t11a, d19 = t12a
+
+        butterfly       d8,  d28, d16, d28               @ d8  = t0,   d28 = t3
+        butterfly       d9,  d20, d24, d20               @ d9  = t1,   d20 = t2
+        butterfly       d10, d26, d18, d26               @ d10 = t4,   d26 = t5
+        butterfly       d11, d22, d30, d22               @ d11 = t7,   d22 = t6
+        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
+        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
+        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
+        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
+
+        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
+        mbutterfly      d23, d25, d1[0], d1[1], q9,  q15        @ d23 = t9a,  d25 = t14a
+        mbutterfly      d27, d21, d1[0], d1[1], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
+        idct16_end
+endfunc
+
+function idct16_half
+        mbutterfly0_h   d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
+        mbutterfly_h1   d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
+        mbutterfly_h1   d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
+        mbutterfly_h2   d26, d22, d3[0], d3[1], q4,  q5  @ d26 = t5a,  d22 = t6a
+        mbutterfly_h1   d17, d31, d4[0], d4[1], q4,  q5  @ d17 = t8a,  d31 = t15a
+        mbutterfly_h2   d25, d23, d5[0], d5[1], q4,  q5  @ d25 = t9a,  d23 = t14a
+        mbutterfly_h1   d21, d27, d6[0], d6[1], q4,  q5  @ d21 = t10a, d27 = t13a
+        mbutterfly_h2   d29, d19, d7[0], d7[1], q4,  q5  @ d29 = t11a, d19 = t12a
+
+        butterfly       d8,  d28, d16, d28               @ d8  = t0,   d28 = t3
+        butterfly       d9,  d20, d24, d20               @ d9  = t1,   d20 = t2
+        butterfly       d10, d26, d18, d26               @ d10 = t4,   d26 = t5
+        butterfly       d11, d22, d30, d22               @ d11 = t7,   d22 = t6
+        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
+        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
+        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
+        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
+
+        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
+        mbutterfly      d23, d25, d1[0], d1[1], q9,  q15        @ d23 = t9a,  d25 = t14a
+        mbutterfly      d27, d21, d1[0], d1[1], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
+        vmov.s64        q12, #0
+        vmull.s32       q4,  d17, d4[0]
+        vmull.s32       q5,  d18, d2[1]
+        vmull.s32       q15, d18, d2[0]
+        vmlsl.s32       q12, d19, d7[1]
+        vmull.s32       q14, d17, d4[1]
+        vmull.s32       q13, d19, d7[0]
+        vmull.s32       q11, d16, d0[0]
+        vrshrn.s64      d16, q4,  #14
+        vrshrn.s64      d11, q5,  #14
+        vrshrn.s64      d10, q15, #14
+        vrshrn.s64      d24, q12, #14
+        vrshrn.s64      d29, q14, #14
+        vrshrn.s64      d17, q13, #14
+        vrshrn.s64      d28, q11, #14
+
+        mbutterfly_l    q10, q11, d17, d24, d1[0], d1[1], neg=1
+        mbutterfly_l    q9,  q15, d29, d16, d1[0], d1[1]
+        vrshrn.s64      d27, q10, #14
+        vrshrn.s64      d21, q11, #14
+        vrshrn.s64      d23, q9,  #14
+        vrshrn.s64      d25, q15, #14
+        vmov            d8,  d28
+        vmov            d9,  d28
+        mbutterfly0     d22, d26, d11, d10, d18, d30, q9,  q15
+        vmov            d20, d28
+        idct16_end
+endfunc
+
+function iadst16
+        movrel          r12, iadst16_coeffs
+        vld1.16         {q0},  [r12,:128]!
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+
+        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
+        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = t8
+        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
+        mbutterfly_l    q7,  q6,  d29, d18, d1[1], d1[0] @ q7  = t3,   q6  = t2
+        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
+        mbutterfly_l    q3,  q2,  d21, d26, d3[1], d3[0] @ q3  = t11,  q2  = t10
+
+        vld1.16         {q0},  [r12,:128]!
+        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+        mbutterfly_l    q5,  q4,  d27, d20, d0[1], d0[0] @ q5  = t5,   q4  = t4
+        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
+
+        mbutterfly_l    q7,  q6,  d19, d28, d2[1], d2[0] @ q7  = t13,  q6  = t12
+        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
+        mbutterfly_l    q3,  q2,  d25, d22, d1[1], d1[0] @ q3  = t7,   q2  = t6
+        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
+
+        mbutterfly_l    q5,  q4,  d17, d30, d3[1], d3[0] @ q5  = t15,  q4  = t14
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
+        mbutterfly_l    q7,  q6,  d23, d24, d2[0], d2[1] @ q7  = t9,   q6  = t8
+        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
+
+        mbutterfly_l    q2,  q3,  d28, d19, d2[1], d2[0] @ q2  = t12,  q3  = t13
+        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
+        mbutterfly_l    q5,  q4,  d21, d26, d3[0], d3[1] @ q5  = t11,  q4  = t10
+        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
+        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
+
+        mbutterfly_l    q6,  q7,  d30, d17, d3[1], d3[0] @ q6  = t14,  q7  = t15
+        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
+        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
+        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
+
+        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
+        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
+
+        mbutterfly_l    q5,  q4,  d19, d28, d1[0], d1[1] @ q5  = t13,  q4  = t12
+        mbutterfly_l    q6,  q7,  d30, d17, d1[1], d1[0] @ q6  = t14,  q7  = t15
+
+        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
+        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
+        vneg.s32        d29, d29                         @ d29 = out[13]
+
+        mbutterfly_l    q5,  q4,  d4,  d5,  d1[0], d1[1] @ q5  = t5a,  q4  = t4a
+        mbutterfly_l    q6,  q7,  d7,  d6,  d1[1], d1[0] @ q6  = t6a,  q7  = t7a
+
+        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
+        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
+
+        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
+        vneg.s32        d19, d19                         @ d19 = out[3]
+        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
+
+        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
+        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
+
+        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
+        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
+        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
+        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
+
+        vneg.s32        d31, d5                          @ d31 = out[15]
+        vneg.s32        d17, d3                          @ d17 = out[1]
+
+        vmov            d16, d2
+        vmov            d30, d4
+        bx              lr
+endfunc
+
+.macro itxfm16_1d_funcs txfm, suffix
+@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x2 slice and store.
+@ r0 = dst (temp buffer)
+@ r2 = src
+function \txfm\()16_1d_2x16_pass1\suffix\()_neon
+        push            {lr}
+
+        mov             r12, #64
+        vmov.s32        q4,  #0
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.32         {d\i}, [r2,:64]
+        vst1.32         {d8},  [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        vld1.32         {d\i}, [r2,:64]
+        vst1.32         {d8},  [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.32         {d\i}, [r2,:64]
+        vst1.32         {d8},  [r2,:64], r12
+.endr
+.endif
+
+        bl              \txfm\()16\suffix
+
+        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
+        @ transposed 2x2 blocks.
+        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the transposed 2x2 blocks horizontally.
+.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
+        vst1.32         {d\i}, [r0,:64]!
+.endr
+        pop             {pc}
+endfunc
+
+@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 2x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function \txfm\()16_1d_2x16_pass2\suffix\()_neon
+        push            {lr}
+
+        mov             r12, #64
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19, 20
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+.endif
+
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+        bl              \txfm\()16\suffix
+
+.macro load_add_store coef0, coef1, coef2, coef3
+        vrshr.s32       \coef0, \coef0, #6
+        vrshr.s32       \coef1, \coef1, #6
+
+        vld1.32         {d8[]},   [r0,:32], r1
+        vld1.32         {d8[1]},  [r3,:32], r1
+        vrshr.s32       \coef2, \coef2, #6
+        vrshr.s32       \coef3, \coef3, #6
+        vld1.32         {d9[]},   [r0,:32], r1
+        vld1.32         {d9[1]},  [r3,:32], r1
+        vaddw.u16       \coef0, \coef0, d8
+        vld1.32         {d10[]},  [r0,:32], r1
+        vld1.32         {d10[1]}, [r3,:32], r1
+        vaddw.u16       \coef1, \coef1, d9
+        vld1.32         {d11[]},  [r0,:32], r1
+        vld1.32         {d11[1]}, [r3,:32], r1
+
+        vqmovun.s32     d8,  \coef0
+        vdup.s16        q8,  r9
+        vqmovun.s32     d9,  \coef1
+        sub             r0,  r0,  r1, lsl #2
+        sub             r3,  r3,  r1, lsl #2
+        vaddw.u16       \coef2, \coef2, d10
+        vaddw.u16       \coef3, \coef3, d11
+        vmin.u16        q4,  q4,  q8
+        vst1.32         {d8[0]},  [r0,:32], r1
+        vst1.32         {d8[1]},  [r3,:32], r1
+        vqmovun.s32     d10, \coef2
+        vst1.32         {d9[0]},  [r0,:32], r1
+        vst1.32         {d9[1]},  [r3,:32], r1
+        vqmovun.s32     d11, \coef3
+        vmin.u16        q5,  q5,  q8
+
+        vst1.32         {d10[0]}, [r0,:32], r1
+        vst1.32         {d10[1]}, [r3,:32], r1
+        vst1.32         {d11[0]}, [r0,:32], r1
+        vst1.32         {d11[1]}, [r3,:32], r1
+.endm
+        load_add_store  q8,  q9,  q10, q11
+        load_add_store  q12, q13, q14, q15
+.purgem load_add_store
+
+        pop             {pc}
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+itxfm16_1d_funcs idct, _quarter
+itxfm16_1d_funcs idct, _half
+.ltorg
+
+@ This is the minimum eob value for each subpartition, in increments of 2
+const min_eob_idct_idct_16, align=4
+        .short  0, 3, 10, 22, 38, 62, 89, 121
+endconst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             r3,  #1
+        beq             idct16x16_dc_add_neon
+.endif
+.ifnc \txfm1\()_\txfm2,idct_idct
+        vpush           {q4-q7}
+.else
+        vpush           {q4-q5}
+.endif
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #1024
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+.ifc \txfm1,idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             r3,  #10
+        ble             idct16x16_quarter_add_16_neon
+        cmp             r3,  #38
+        ble             idct16x16_half_add_16_neon
+
+        movrel          r8,  min_eob_idct_idct_16 + 2
+.endif
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+        add             r0,  sp,  #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+        ldrh_post       r1,  r8,  #2
+        cmp             r3,  r1
+        it              le
+        movle           r1,  #(16 - \i)/2
+        ble             1f
+.endif
+.endif
+        add             r2,  r6,  #(\i*4)
+        bl              \txfm1\()16_1d_2x16_pass1_neon
+.endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        vmov.i32        q14, #0
+        vmov.i32        q15, #0
+2:
+        subs            r1,  r1,  #1
+        @ Unroll for 2 lines
+.rept 2
+        @ Fill one line with zeros
+        vst1.32         {q14-q15}, [r0,:128]!
+        vst1.32         {q14-q15}, [r0,:128]!
+.endr
+        bne             2b
+3:
+.endif
+
+.ifc \txfm1\()_\txfm2,iadst_idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+.endif
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+        add             r0,  r4,  #(\i*2)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*4)
+        bl              \txfm2\()16_1d_2x16_pass2_neon
+.endr
+
+        add             sp,  sp,  r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+        vpop            {q4-q7}
+.else
+        vpop            {q4-q5}
+.endif
+        pop             {r4-r9,pc}
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+        push            {r4-r9,lr}
+        movw            r9,  #0x03ff
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+        push            {r4-r9,lr}
+        movw            r9,  #0x0fff
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+.ltorg
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+.irp i, 0, 2
+        add             r0,  sp,  #(\i*64)
+.ifc \size,quarter
+.if \i == 2
+        cmp             r3,  #3
+        ble             1f
+.endif
+.endif
+        add             r2,  r6,  #(\i*4)
+        bl              idct16_1d_2x16_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 4, 6
+        add             r0,  sp,  #(\i*64)
+.if \i == 6
+        cmp             r3,  #22
+        ble             1f
+.endif
+        add             r2,  r6,  #(\i*4)
+        bl              idct16_1d_2x16_pass1_\size\()_neon
+.endr
+.endif
+
+        b               3f
+1:
+        vmov.i32        q14, #0
+        vmov.i32        q15, #0
+
+        @ Unroll for 2 lines
+.rept 2
+        @ Fill one line with zeros
+        vst1.32         {q14-q15}, [r0,:128]!
+        vst1.32         {q14-q15}, [r0,:128]!
+.endr
+
+3:
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+        add             r0,  r4,  #(\i*2)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*4)
+        bl              idct16_1d_2x16_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  r7
+        vpop            {q4-q5}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        vmov.i32        q2,  #0
+        vmovl.s16       q0,  d0
+
+        vld1.32         {d16[]}, [r2,:32]
+        vmull.s32       q8,  d16, d0[0]
+        vrshrn.s64      d16, q8,  #14
+        vmull.s32       q8,  d16, d0[0]
+        vrshrn.s64      d16, q8,  #14
+        vdup.32         q8,  d16[0]
+        vst1.32         {d4[0]}, [r2,:32]
+
+        vrshr.s32       q8,  q8,  #6
+        vdup.s16        q15, r9
+
+        mov             r3,  r0
+        mov             r12, #32
+        sub             r1,  r1,  #32
+1:
+        @ Loop to add the constant from q8 into all 32x32 outputs
+        subs            r12, r12, #1
+        vld1.16         {q0-q1},  [r0,:128]!
+        vaddw.u16       q9,  q8,  d0
+        vaddw.u16       q10, q8,  d1
+        vld1.16         {q2-q3},  [r0,:128], r1
+        vaddw.u16       q11, q8,  d2
+        vaddw.u16       q12, q8,  d3
+        vaddw.u16       q13, q8,  d4
+        vaddw.u16       q14, q8,  d5
+        vqmovun.s32     d0,  q9
+        vaddw.u16       q9,  q8,  d6
+        vqmovun.s32     d1,  q10
+        vaddw.u16       q10, q8,  d7
+        vqmovun.s32     d2,  q11
+        vqmovun.s32     d3,  q12
+        vqmovun.s32     d4,  q13
+        vqmovun.s32     d5,  q14
+        vmin.u16        q0,  q0,  q15
+        vmin.u16        q1,  q1,  q15
+        vqmovun.s32     d6,  q9
+        vqmovun.s32     d7,  q10
+        vst1.16         {q0-q1},  [r3,:128]!
+        vmin.u16        q2,  q2,  q15
+        vmin.u16        q3,  q3,  q15
+        vst1.16         {q2-q3},  [r3,:128], r1
+        bne             1b
+
+        pop             {r4-r9,pc}
+endfunc
+
+.macro idct32_end
+        butterfly       d16, d9,  d8,  d9  @ d16 = t16a, d9  = t19a
+        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
+        butterfly       d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
+        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
+        butterfly       d8,  d28, d28, d30 @ d8  = t24a, d28 = t27a
+        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
+        butterfly       d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
+        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
+
+        mbutterfly      d27, d20, d1[0], d1[1], q12, q15        @ d27 = t18a, d20 = t29a
+        mbutterfly      d29, d9,  d1[0], d1[1], q12, q15        @ d29 = t19,  d9  = t28
+        mbutterfly      d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27,  d10 = t20
+        mbutterfly      d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
+
+        butterfly       d31, d24, d11, d8  @ d31 = t31,  d24 = t24
+        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
+        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
+        butterfly_r     d27, d28, d9,  d28 @ d27 = t27a, d28 = t28a
+        butterfly       d8,  d26, d20, d26 @ d8  = t29,  d26 = t26
+        butterfly       d19, d20, d29, d10 @ d19 = t19a, d20 = t20
+        vmov            d29, d8            @ d29 = t29
+
+        mbutterfly0     d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27,  d20 = t20
+        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
+        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
+        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
+        bx              lr
+.endm
+
+function idct32_odd
+        movrel          r12, idct_coeffs
+
+        @ Overwrite the idct16 coeffs with the stored ones for idct32
+        vmovl.s16       q0,  d12
+        vmovl.s16       q1,  d13
+        vmovl.s16       q2,  d14
+        vmovl.s16       q3,  d15
+
+        mbutterfly      d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
+        mbutterfly      d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
+        mbutterfly      d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
+        mbutterfly      d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
+        mbutterfly      d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
+        mbutterfly      d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
+        mbutterfly      d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
+        mbutterfly      d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
+
+        @ Reload the idct16 coefficients. We could swap the coefficients between
+        @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
+        @ loading and lengthening.
+        vld1.16         {q0-q1}, [r12,:128]
+
+        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
+        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
+        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
+        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
+        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
+        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
+        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
+        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
+        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_half
+        movrel          r12, idct_coeffs
+
+        vmovl.s16       q0,  d12
+        vmovl.s16       q1,  d13
+        vmovl.s16       q2,  d14
+        vmovl.s16       q3,  d15
+
+        mbutterfly_h1   d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
+        mbutterfly_h2   d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
+        mbutterfly_h1   d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
+        mbutterfly_h2   d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
+        mbutterfly_h1   d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
+        mbutterfly_h2   d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
+        mbutterfly_h1   d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
+        mbutterfly_h2   d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
+
+        vld1.16         {q0-q1}, [r12,:128]
+
+        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
+        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
+        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
+        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
+        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
+        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
+        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
+        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
+        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        movrel          r12, idct_coeffs
+
+        vmovl.s16       q0,  d12
+        vmovl.s16       q1,  d13
+        vmovl.s16       q2,  d14
+        vmovl.s16       q3,  d15
+
+        vmov.s64        q14, #0
+        vmov.s64        q5,  #0
+
+        vmull.s32       q4,  d16, d0[0]
+        vmlsl.s32       q14, d19, d3[1]
+        vmull.s32       q15, d16, d0[1]
+        vmull.s32       q11, d17, d7[0]
+        vmlsl.s32       q5,  d17, d7[1]
+        vmull.s32       q13, d19, d3[0]
+        vmull.s32       q10, d18, d4[0]
+        vmull.s32       q12, d18, d4[1]
+
+        vld1.16         {q0-q1}, [r12,:128]
+
+        vrshrn.s64      d8,  q4,  #14
+        vrshrn.s64      d9,  q14, #14
+        vrshrn.s64      d29, q15, #14
+        vrshrn.s64      d28, q11, #14
+
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+
+        vrshrn.s64      d11, q5,  #14
+        vrshrn.s64      d31, q13, #14
+        vrshrn.s64      d10, q10, #14
+        vrshrn.s64      d30, q12, #14
+
+        mbutterfly_l    q8,  q9,  d29, d8,  d2[0], d2[1]
+        mbutterfly_l    q13, q10, d31, d9,  d2[0], d2[1], neg=1
+        vrshrn.s64      d23, q8,  #14
+        vrshrn.s64      d24, q9,  #14
+        vrshrn.s64      d27, q13, #14
+        vrshrn.s64      d20, q10, #14
+        mbutterfly_l    q8,  q9,  d30, d10, d3[0], d3[1]
+        vrshrn.s64      d21, q8,  #14
+        vrshrn.s64      d26, q9,  #14
+        mbutterfly_l    q8,  q9,  d28, d11, d3[0], d3[1], neg=1
+        vrshrn.s64      d25, q8,  #14
+        vrshrn.s64      d22, q9,  #14
+
+        idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+@ Do an 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
+@ We don't have register space to do a single pass IDCT of 2x32 though,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
+@ a normal IDCT16 with every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+function idct32_1d_2x32_pass1\suffix\()_neon
+        push            {lr}
+
+        @ Double stride of the input, since we only read every other line
+        mov             r12, #256
+        vmov.s32        d8,  #0
+
+        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.32         {d\i}, [r2,:64]
+        vst1.32         {d8},  [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        vld1.32         {d\i}, [r2,:64]
+        vst1.32         {d8},  [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.32         {d\i}, [r2,:64]
+        vst1.32         {d8},  [r2,:64], r12
+.endr
+.endif
+
+        bl              idct16\suffix
+
+        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
+        @ transposed 2x2 blocks.
+        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the registers a, b, c, d, e, f, g, h horizontally, followed
+        @ by the same registers h, g, f, e, d, c, b, a mirrored.
+.macro store_rev a, b, c, d, e, f, g, h
+.irp i, \a, \b, \c, \d, \e, \f, \g, \h
+        vst1.32         {d\i}, [r0,:64]!
+        vrev64.32       d\i, d\i
+.endr
+.irp i, \h, \g, \f, \e, \d, \c, \b, \a
+        vst1.32         {d\i}, [r0,:64]!
+.endr
+.endm
+        store_rev       16, 18, 20, 22, 24, 26, 28, 30
+        store_rev       17, 19, 21, 23, 25, 27, 29, 31
+        sub             r0,  r0,  #256
+.purgem store_rev
+
+        @ Move r2 back to the start of the input, and move
+        @ to the first odd row
+.ifb \suffix
+        sub             r2,  r2,  r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+        sub             r2,  r2,  r12, lsl #2
+.endif
+.ifc \suffix,_half
+        sub             r2,  r2,  r12, lsl #3
+.endif
+        add             r2,  r2,  #128
+
+        vmov.s32        d8,  #0
+        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d8},  [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d8},  [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d8},  [r2,:64], r12
+.endr
+.endif
+
+        bl              idct32_odd\suffix
+
+        transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+        @ Store the registers a, b, c, d, e, f, g, h horizontally,
+        @ adding into the output first, and then mirrored, subtracted
+        @ from the output.
+.macro store_rev a, b, c, d, e, f, g, h
+.irp i, \a, \b, \c, \d, \e, \f, \g, \h
+        vld1.32         {d8},  [r0,:64]
+        vadd.s32        d8, d8, d\i
+        vst1.32         {d8},  [r0,:64]!
+        vrev64.32       d\i, d\i
+.endr
+.irp i, \h, \g, \f, \e, \d, \c, \b, \a
+        vld1.32         {d8},  [r0,:64]
+        vsub.s32        d8, d8, d\i
+        vst1.32         {d8},  [r0,:64]!
+.endr
+.endm
+
+        store_rev       31, 29, 27, 25, 23, 21, 19, 17
+        store_rev       30, 28, 26, 24, 22, 20, 18, 16
+.purgem store_rev
+        pop             {pc}
+endfunc
+.ltorg
+
+@ This is mostly the same as 2x32_pass1, but without the transpose,
+@ and use the source as temp buffer between the two idct passes, and
+@ add into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function idct32_1d_2x32_pass2\suffix\()_neon
+        push            {lr}
+
+        mov             r12, #256
+        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.32         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        vld1.32         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.32         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #3
+.endif
+
+        bl              idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vst1.32         {d\i}, [r2,:64], r12
+.endr
+
+        sub             r2,  r2,  r12, lsl #4
+        add             r2,  r2,  #128
+
+        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.32         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        vld1.32         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.32         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #3
+.endif
+        sub             r2,  r2,  #128
+
+        bl              idct32_odd\suffix
+
+        @ Narrow the ict16 coefficients in q0-q3 into q0-q1, to
+        @ allow clobbering q2-q3 below.
+        vmovn.s32       d0,  q0
+        vmovn.s32       d1,  q1
+        vmovn.s32       d2,  q2
+        vmovn.s32       d3,  q3
+
+        mov             r12, #256
+        vdup.s16        q4,  r9
+.macro load_acc_store a, b, c, d, neg=0
+        vld1.32         {d4},  [r2,:64], r12
+        vld1.32         {d5},  [r2,:64], r12
+.if \neg == 0
+        vadd.s32        d4,  d4,  d\a
+        vld1.32         {d6},  [r2,:64], r12
+        vadd.s32        d5,  d5,  d\b
+        vld1.32         {d7},  [r2,:64], r12
+        vadd.s32        d6,  d6,  d\c
+        vadd.s32        d7,  d7,  d\d
+.else
+        vsub.s32        d4,  d4,  d\a
+        vld1.32         {d6},  [r2,:64], r12
+        vsub.s32        d5,  d5,  d\b
+        vld1.32         {d7},  [r2,:64], r12
+        vsub.s32        d6,  d6,  d\c
+        vsub.s32        d7,  d7,  d\d
+.endif
+        vld1.32         {d10[]},  [r0,:32], r1
+        vld1.32         {d10[1]}, [r0,:32], r1
+        vrshr.s32       q2,  q2,  #6
+        vld1.32         {d11[]},  [r0,:32], r1
+        vrshr.s32       q3,  q3,  #6
+        vld1.32         {d11[1]}, [r0,:32], r1
+        sub             r0,  r0,  r1, lsl #2
+        vaddw.u16       q2,  q2,  d10
+        vaddw.u16       q3,  q3,  d11
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vmin.u16        q2,  q2,  q4
+        vst1.32         {d4[0]},  [r0,:32], r1
+        vst1.32         {d4[1]},  [r0,:32], r1
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r0,:32], r1
+.endm
+        load_acc_store  31, 30, 29, 28
+        load_acc_store  27, 26, 25, 24
+        load_acc_store  23, 22, 21, 20
+        load_acc_store  19, 18, 17, 16
+        sub             r2,  r2,  r12
+        neg             r12, r12
+        load_acc_store  16, 17, 18, 19, 1
+        load_acc_store  20, 21, 22, 23, 1
+        load_acc_store  24, 25, 26, 27, 1
+        load_acc_store  28, 29, 30, 31, 1
+.purgem load_acc_store
+        @ Lengthen the idct16 coeffs back into 32 bit form
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+        pop             {pc}
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+        .short  0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon
+        cmp             r3,  #1
+        beq             idct32x32_dc_add_neon
+        vpush           {q4-q7}
+        movrel          r8,  min_eob_idct_idct_32 + 2
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #4096
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]!
+        vld1.16         {q6-q7}, [r12,:128]
+        vmovl.s16       q2,  d2
+        vmovl.s16       q3,  d3
+        vmovl.s16       q1,  d1
+        vmovl.s16       q0,  d0
+
+        cmp             r3,  #34
+        ble             idct32x32_quarter_add_16_neon
+        cmp             r3,  #135
+        ble             idct32x32_half_add_16_neon
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+        add             r0,  sp,  #(\i*128)
+.if \i > 0
+        ldrh_post       r1,  r8,  #2
+        cmp             r3,  r1
+        it              le
+        movle           r1,  #(32 - \i)/2
+        ble             1f
+.endif
+        add             r2,  r6,  #(\i*4)
+        bl              idct32_1d_2x32_pass1_neon
+.endr
+        b               3f
+
+1:
+        @ Write zeros to the temp buffer for pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+2:
+        subs            r1,  r1,  #1
+.rept 2
+        @ Fill one line with zeros
+        vst1.16         {q14-q15}, [r0,:128]!
+        vst1.16         {q14-q15}, [r0,:128]!
+        vst1.16         {q14-q15}, [r0,:128]!
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bne             2b
+3:
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+        add             r0,  r4,  #(\i*2)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*4)
+        bl              idct32_1d_2x32_pass2_neon
+.endr
+
+        add             sp,  sp,  r7
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+        push            {r4-r9,lr}
+        movw            r9,  #0x03ff
+        b               vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+        push            {r4-r9,lr}
+        movw            r9,  #0x0fff
+        b               vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+.macro idct32_partial size, rows
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 2, 4, 6
+        add             r0,  sp,  #(\i*128)
+.ifc \size,quarter
+.if \i > 0
+        ldrh_post       r1,  r8,  #2
+        cmp             r3,  r1
+        it              le
+        movle           r1,  #(\rows - \i)/2
+        ble             1f
+.endif
+.endif
+        add             r2,  r6,  #(\i*4)
+        bl              idct32_1d_2x32_pass1_\size\()_neon
+.endr
+.ifc \size,half
+        add             r8,  r8,  #8
+.irp i, 8, 10, 12, 14
+        add             r0,  sp,  #(\i*128)
+.if \i > 8
+        ldrh_post       r1,  r8,  #2
+        cmp             r3,  r1
+        it              le
+        movle           r1,  #(\rows - \i)/2
+        ble             1f
+.endif
+        add             r2,  r6,  #(\i*4)
+        bl              idct32_1d_2x32_pass1_\size\()_neon
+.endr
+.endif
+        b               3f
+
+1:
+        @ Write zeros to the temp buffer for pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+2:
+        subs            r1,  r1,  #1
+.rept 2
+        @ Fill one line with zeros
+        vst1.16         {q14-q15}, [r0,:128]!
+        vst1.16         {q14-q15}, [r0,:128]!
+        vst1.16         {q14-q15}, [r0,:128]!
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bne             2b
+3:
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+        add             r0,  r4,  #(\i*2)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*4)
+        bl              idct32_1d_2x32_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  r7
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+idct32_partial quarter, 8
+idct32_partial half, 16
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 67a4754..6c09922 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp9lpf_16bpp_neon.S b/libavcodec/arm/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000..7d2571d
--- /dev/null
+++ b/libavcodec/arm/vp9lpf_16bpp_neon.S
@@ -0,0 +1,1044 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        vswp             \r1,  \r8  @ vtrn.64 \rq0, \rq4
+        vswp             \r3,  \r10 @ vtrn.64 \rq1, \rq5
+        vswp             \r5,  \r12 @ vtrn.64 \rq2, \rq6
+        vswp             \r7,  \r14 @ vtrn.64 \rq3, \rq7
+        vtrn.32          \rq0, \rq2
+        vtrn.32          \rq1, \rq3
+        vtrn.32          \rq4, \rq6
+        vtrn.32          \rq5, \rq7
+        vtrn.16          \rq0, \rq1
+        vtrn.16          \rq2, \rq3
+        vtrn.16          \rq4, \rq5
+        vtrn.16          \rq6, \rq7
+.endm
+
+.macro transpose16_4x4 r0, r1, r2, r3
+        vtrn.32          \r0, \r2
+        vtrn.32          \r1, \r3
+        vtrn.16          \r0, \r1
+        vtrn.16          \r2, \r3
+.endm
+
+@ Do a 4x4 transpose, using q registers for the subtransposes that don't
+@ need to address the indiviudal d registers.
+@ r0,r1 == rq0, r2,r3 == rq1
+.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
+        vtrn.32         \rq0, \rq1
+        vtrn.16         \r0,  \r1
+        vtrn.16         \r2,  \r3
+.endm
+
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
+.macro loop_filter_q wd
+        vdup.u16        q0,  r2          @ E
+        vdup.u16        q1,  r3          @ I
+
+        vabd.u16        q2,  q8,  q9     @ abs(p3 - p2)
+        vabd.u16        q3,  q9,  q10    @ abs(p2 - p1)
+        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
+        vabd.u16        q5,  q12, q13    @ abs(q0 - q1)
+        vabd.u16        q6,  q13, q14    @ abs(q1 - q2)
+        vabd.u16        q7,  q14, q15    @ abs(q2 - q3)
+        vmax.u16        q2,  q2,  q3
+        vmax.u16        q3,  q4,  q5
+        vmax.u16        q4,  q6,  q7
+        vabd.u16        q5,  q11, q12    @ abs(p0 - q0)
+        vmax.u16        q2,  q2,  q3
+        vadd.u16        q5,  q5,  q5     @ abs(p0 - q0) * 2
+        vabd.u16        q6,  q10, q13    @ abs(p1 - q1)
+        vmax.u16        q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
+        vshr.u16        q6,  q6,  #1
+        vcle.u16        q2,  q2,  q1     @ max(abs()) <= I
+        vadd.u16        q5,  q5,  q6     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        vcle.u16        q5,  q5,  q0
+        vand            q2,  q2,  q5     @ fm
+
+        vmovn.u16       d10, q2
+        vmov            r8,  r9,  d10
+        orrs            r8,  r8,  r9
+        @ If no pixels need filtering, just exit as soon as possible
+        beq             9f
+
+.if \wd >= 8
+        vdup.u16        q0,  r5
+
+        vabd.u16        q1,  q8,  q11    @ abs(p3 - p0)
+        vabd.u16        q3,  q9,  q11    @ abs(p2 - p0)
+        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
+        vabd.u16        q5,  q13, q12    @ abs(q1 - q0)
+        vabd.u16        q6,  q14, q12    @ abs(q2 - q0)
+        vabd.u16        q7,  q15, q12    @ abs(q3 - q0)
+        vmax.u16        q1,  q1,  q3
+        vmax.u16        q4,  q4,  q5
+        vmax.u16        q6,  q6,  q7
+        @ The rest of the calculation of flat8in is interleaved below
+.endif
+
+        @ Calculate the normal inner loop filter for 2 or 4 pixels
+        vabd.u16        q3,  q10, q11    @ abs(p1 - p0)
+.if \wd == 8
+        vmax.u16        q1,  q1,  q4
+.endif
+        vabd.u16        q4,  q13, q12    @ abs(q1 - q0)
+.if \wd == 8
+        vmax.u16        q1,  q1,  q6
+.endif
+
+        vsub.u16        q5,  q10, q13    @ p1 - q1
+        vmax.u16        q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
+        vdup.u16        q4,  r4          @ H
+        vsub.u16        q6,  q12, q11    @ q0 - p0
+.if \wd == 8
+        vcle.u16        q1,  q1,  q0     @ flat8in
+.endif
+        vdup.u16        q0,  r6          @ left shift for saturation
+        vcle.u16        q3,  q3,  q4     @ !hev
+.if \wd == 8
+        vand            q1,  q1,  q2     @ flat8in && fm
+.endif
+        vneg.s16        q4,  q0          @ negative left shift after saturation
+        vqshl.s16       q5,  q5,  q0
+.if \wd == 8
+        vbic            q2,  q2,  q1     @ fm && !flat8in
+.endif
+        vmov.s16        q7,  #3
+        vand            q3,  q3,  q2     @ !hev && fm && !flat8in
+        vshl.s16        q5,  q5,  q4     @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+        vmul.s16        q6,  q6,  q7     @ 3 * (q0 - p0)
+        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int2p = 0
+        vadd.s16        q6,  q6,  q5     @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
+        vmov.s16        q5,  #4
+        vqshl.s16       q6,  q6,  q0
+        vmov.s16        q0,  #3
+        vshl.s16        q6,  q6,  q4     @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+        vdup.u16        q4,  r7          @ max pixel value
+
+        vshr.u16        q4,  q4,  #1     @ (1 << (BIT_DEPTH - 1)) - 1)
+
+        vadd.s16        q5,  q6,  q5     @ f + 4
+        vadd.s16        q0,  q6,  q0     @ f + 3
+        vmov.s16        q6,  #0
+        vmin.s16        q5,  q5,  q4     @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+        vmin.s16        q0,  q0,  q4     @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+        vdup.u16        q4,  r7          @ max pixel value
+        vshr.s16        q5,  q5,  #3     @ f1
+        vshr.s16        q0,  q0,  #3     @ f2
+
+        vadd.s16        q0,  q11, q0     @ p0 + f2
+        vsub.s16        q7,  q12, q5     @ q0 - f1
+        vmin.s16        q0,  q0,  q4
+        vmin.s16        q7,  q7,  q4
+        vrshr.s16       q5,  q5,  #1     @ f = (f1 + 1) >> 1
+        vmax.s16        q0,  q0,  q6     @ out p0
+        vmax.s16        q7,  q7,  q6     @ out q0
+        vbit            q11, q0,  q2     @ if (fm && !flat8in)
+        vbit            q12, q7,  q2
+.if \wd >= 8
+        vmovn.u16       d4,  q1
+.endif
+
+        vadd.s16        q0,  q10, q5     @ p1 + f
+        vsub.s16        q7,  q13, q5     @ q1 - f
+.if \wd >= 8
+        vmov            r8,  r9,  d4
+.endif
+        vmin.s16        q0,  q0,  q4
+        vmin.s16        q7,  q7,  q4
+.if \wd >= 8
+        orrs            r8,  r8,  r9
+.endif
+        vmax.s16        q0,  q0,  q6     @ out p1
+        vmax.s16        q7,  q7,  q6     @ out q1
+        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
+        vbit            q13, q7,  q3
+
+.if \wd >= 8
+        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
+        beq             6f
+
+        @ flat8in
+        vadd.u16        q2,  q8,  q9
+        vadd.u16        q3,  q10, q13
+        vadd.u16        q4,  q8,  q10
+        vadd.u16        q5,  q11, q14
+        vadd.u16        q0,  q2,  q2
+        vadd.u16        q0,  q0,  q11
+        vadd.u16        q0,  q0,  q12
+        vadd.u16        q0,  q0,  q4
+        vsub.s16        q3,  q3,  q2
+        vsub.s16        q5,  q5,  q4
+        vrshr.u16       q6,  q0,  #3     @ out p2
+
+        vadd.u16        q0,  q0,  q3
+        vadd.u16        q2,  q8,  q11
+        vadd.u16        q3,  q12, q15
+        vrshr.u16       q7,  q0,  #3     @ out p1
+
+        vadd.u16        q0,  q0,  q5
+        vsub.s16        q3,  q3,  q2
+        vadd.u16        q4,  q9,  q12
+        vbit            q9,  q6,  q1
+        vadd.u16        q5,  q13, q15
+        vrshr.u16       q6,  q0,  #3     @ out p0
+
+        vadd.u16        q0,  q0,  q3
+        vsub.s16        q5,  q5,  q4
+        vadd.u16        q2,  q10, q13
+        vbit            q10, q7,  q1
+        vadd.u16        q3,  q14, q15
+        vrshr.u16       q7,  q0,  #3     @ out q0
+
+        vadd.u16        q0,  q0,  q5
+        vsub.s16        q3,  q3,  q2
+        vbit            q11, q6,  q1
+        vrshr.u16       q6,  q0,  #3     @ out q1
+
+        vadd.u16        q0,  q0,  q3
+        vbit            q12, q7,  q1
+        vrshr.u16       q7,  q0,  #3     @ out q2
+        vbit            q13, q6,  q1
+        vbit            q14, q7,  q1
+.endif
+.endm
+
+@ The input to and output from this macro is in the registers d16-d31,
+@ and d0-d7 are used as scratch registers.
+@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
+@ Depending on the width of the loop filter, we either use d16-d19
+@ and d28-d31 as temp registers, or d8-d15.
+@ In practice, this is only ever instantiated once, so the macro parameters
+@ could be hardcoded, but keeping them as is, to keep similarities to the
+@ 8 bpp and aarch64 versions.
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+        vdup.u16        d0,  r2          @ E
+        vdup.u16        d2,  r3          @ I
+
+        vabd.u16        d4,  d20, d21    @ abs(p3 - p2)
+        vabd.u16        d5,  d21, d22    @ abs(p2 - p1)
+        vabd.u16        d6,  d22, d23    @ abs(p1 - p0)
+        vabd.u16        d7,  d24, d25    @ abs(q0 - q1)
+        vabd.u16        \tmp1,  d25, d26 @ abs(q1 - q2)
+        vabd.u16        \tmp2,  d26, d27 @ abs(q2 - q3)
+        vmax.u16        d4,  d4,  d5
+        vmax.u16        d5,  d6,  d7
+        vmax.u16        \tmp1,  \tmp1,  \tmp2
+        vabd.u16        d6,  d23, d24    @ abs(p0 - q0)
+        vmax.u16        d4,  d4,  d5
+        vadd.u16        d6,  d6,  d6     @ abs(p0 - q0) * 2
+        vabd.u16        d5,  d22, d25    @ abs(p1 - q1)
+        vmax.u16        d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
+        vshr.u16        d5,  d5,  #1
+        vcle.u16        d4,  d4,  d2     @ max(abs()) <= I
+        vadd.u16        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        vcle.u16        d6,  d6,  d0
+        vand            d4,  d4,  d6     @ fm
+
+        vdup.u16        d3,  r4          @ H
+        vmov            r8,  r9,  d4
+        orrs            r8,  r8,  r9
+        @ If no pixels need filtering, just exit as soon as possible
+        beq             9f
+
+.if \wd >= 8
+        vdup.u16        d0,  r5
+
+        vabd.u16        d6,  d20, d23    @ abs(p3 - p0)
+        vabd.u16        d2,  d21, d23    @ abs(p2 - p0)
+        vabd.u16        d1,  d22, d23    @ abs(p1 - p0)
+        vabd.u16        \tmp1,  d25, d24 @ abs(q1 - q0)
+        vabd.u16        \tmp2,  d26, d24 @ abs(q2 - q0)
+        vabd.u16        \tmp3,  d27, d24 @ abs(q3 - q0)
+        vmax.u16        d6,  d6,  d2
+        vmax.u16        d1,  d1,  \tmp1
+        vmax.u16        \tmp2,  \tmp2,  \tmp3
+.if \wd == 16
+        vabd.u16        d7,  d16, d23    @ abs(p7 - p0)
+        vmax.u16        d6,  d6,  d1
+        vabd.u16        d2,  d17, d23    @ abs(p6 - p0)
+        vmax.u16        d6,  d6,  \tmp2
+        vabd.u16        d1,  d18, d23    @ abs(p5 - p0)
+        vcle.u16        d6,  d6,  d0     @ flat8in
+        vabd.u16        d8,  d19, d23    @ abs(p4 - p0)
+        vand            d6,  d6,  d4     @ flat8in && fm
+        vabd.u16        d9,  d28, d24    @ abs(q4 - q0)
+        vbic            d4,  d4,  d6     @ fm && !flat8in
+        vabd.u16        d10, d29, d24    @ abs(q5 - q0)
+        vabd.u16        d11, d30, d24    @ abs(q6 - q0)
+        vabd.u16        d12, d31, d24    @ abs(q7 - q0)
+
+        vmax.u16        d7,  d7,  d2
+        vmax.u16        d1,  d1,  d8
+        vmax.u16        d9,  d9,  d10
+        vmax.u16        d11, d11, d12
+        @ The rest of the calculation of flat8out is interleaved below
+.else
+        @ The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+        @ Calculate the normal inner loop filter for 2 or 4 pixels
+        vabd.u16        d5,  d22, d23           @ abs(p1 - p0)
+.if \wd == 16
+        vmax.u16        d7,  d7,  d1
+        vmax.u16        d9,  d9,  d11
+.elseif \wd == 8
+        vmax.u16        d6,  d6,  d1
+.endif
+        vabd.u16        d1,  d25, d24           @ abs(q1 - q0)
+.if \wd == 16
+        vmax.u16        d7,  d7,  d9
+.elseif \wd == 8
+        vmax.u16        d6,  d6,  \tmp2
+.endif
+        vdup.u16        \tmp2,  r6              @ left shift for saturation
+        vsub.u16        \tmp1,  d22, d25        @ p1 - q1
+        vneg.s16        \tmp6,  \tmp2           @ negative left shift after saturation
+        vmax.u16        d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
+        vsub.u16        \tmp3,   d24, d23       @ q0 - p0
+        vmov.s16        \tmp5,  #3
+.if \wd == 8
+        vcle.u16        d6,  d6,  d0            @ flat8in
+.endif
+        vcle.u16        d5,  d5,  d3            @ !hev
+.if \wd == 8
+        vand            d6,  d6,  d4            @ flat8in && fm
+.endif
+        vqshl.s16       \tmp1,  \tmp1,  \tmp2
+.if \wd == 16
+        vcle.u16        d7,  d7,  d0            @ flat8out
+.elseif \wd == 8
+        vbic            d4,  d4,  d6            @ fm && !flat8in
+.endif
+        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
+.if \wd == 16
+        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
+.endif
+        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+        vmul.s16        \tmp3,  \tmp3,  \tmp5   @ 3 * (q0 - p0)
+        vbic            \tmp1,  \tmp1,   d5     @ if (!hev) av_clip_int2p = 0
+        vmov.s16        d2,  #4
+        vadd.s16        \tmp3,  \tmp3,  \tmp1   @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
+        vmov.s16        d3,  #3
+        vqshl.s16       \tmp1,  \tmp3,  \tmp2
+        vmov.s16        \tmp5,  #0
+        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+        vdup.u16        \tmp6,  r7              @ max pixel value
+.if \wd == 16
+        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
+.endif
+
+        vshr.u16        \tmp2,  \tmp6,  #1      @ (1 << (BIT_DEPTH - 1)) - 1
+
+        vadd.s16        \tmp3,  \tmp1,  d2      @ f + 4
+        vadd.s16        \tmp4,  \tmp1,  d3      @ f + 3
+        vmin.s16        \tmp3,  \tmp3,  \tmp2   @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+        vmin.s16        \tmp4,  \tmp4,  \tmp2   @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+        vshr.s16        \tmp3,  \tmp3,  #3      @ f1
+        vshr.s16        \tmp4,  \tmp4,  #3      @ f2
+
+        vadd.s16        d0,  d23, \tmp4         @ p0 + f2
+        vsub.s16        d2,  d24, \tmp3         @ q0 - f1
+        vmin.s16        d0,  d0,  \tmp6
+        vmin.s16        d2,  d2,  \tmp6
+        vrshr.s16       \tmp3,  \tmp3,  #1      @ f = (f1 + 1) >> 1
+        vmax.s16        d0,  d0,  \tmp5         @ out p0
+        vmax.s16        d2,  d2,  \tmp5         @ out q0
+        vbit            d23, d0,  d4            @ if (fm && !flat8in)
+        vbit            d24, d2,  d4
+
+        vadd.s16        d0,  d22, \tmp3         @ p1 + f
+        vsub.s16        d2,  d25, \tmp3         @ q1 - f
+.if \wd >= 8
+        vmov            r8,  r9,  d6
+.endif
+        vmin.s16        d0,  d0,  \tmp6
+        vmin.s16        d2,  d2,  \tmp6
+.if \wd >= 8
+        orrs            r8,  r8,  r9
+.endif
+        vmax.s16        d0,  d0,  \tmp5         @ out p1
+        vmax.s16        d2,  d2,  \tmp5         @ out q1
+        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
+        vbit            d25, d2,  d5
+
+.if \wd >= 8
+        @ If no pixels need flat8in, jump to flat8out
+        @ (or to a writeout of the inner 4 pixels, for wd=8)
+        beq             6f
+
+        @ flat8in
+        vadd.u16        \tmp1,  d20, d21
+        vadd.u16        \tmp3,  d22, d25
+        vadd.u16        \tmp5,  d20, d22
+        vadd.u16        \tmp7,  d23, d26
+        vadd.u16        d0,  \tmp1,  \tmp1
+        vadd.u16        d0,  d0,  d23
+        vadd.u16        d0,  d0,  d24
+        vadd.u16        d0,  d0,  \tmp5
+        vsub.s16        \tmp3,  \tmp3,  \tmp1
+        vsub.s16        \tmp7,  \tmp7,  \tmp5
+        vrshr.u16       d2,  d0,  #3            @ out p2
+
+        vadd.u16        d0,  d0,  \tmp3
+        vadd.u16        \tmp1,  d20, d23
+        vadd.u16        \tmp3,  d24, d27
+        vrshr.u16       d3,  d0,  #3            @ out p1
+
+        vadd.u16        d0,  d0,  \tmp7
+        vsub.s16        \tmp3,  \tmp3,  \tmp1
+        vadd.u16        \tmp5,  d21, d24
+        vadd.u16        \tmp7,  d25, d27
+        vrshr.u16       d4,  d0,  #3            @ out p0
+
+        vadd.u16        d0,  d0,  \tmp3
+        vsub.s16        \tmp7,  \tmp7,  \tmp5
+        vadd.u16        \tmp1,  d22, d25
+        vadd.u16        \tmp3,  d26, d27
+        vrshr.u16       d5,  d0,  #3            @ out d0
+
+        vadd.u16        d0,  d0,  \tmp7
+        vsub.s16        \tmp3,  \tmp3,  \tmp1
+        vrshr.u16       \tmp5,  d0,  #3         @ out q1
+
+        vadd.u16        d0,  d0,  \tmp3
+        @ The output here is written back into the input registers. This doesn't
+        @ matter for the flat8out part below, since we only update those pixels
+        @ which won't be touched below.
+        vbit            d21, d2,  d6
+        vbit            d22, d3,  d6
+        vbit            d23, d4,  d6
+        vrshr.u16       \tmp6,  d0,  #3         @ out q2
+        vbit            d24, d5,  d6
+        vbit            d25, \tmp5,  d6
+        vbit            d26, \tmp6,  d6
+.endif
+.if \wd == 16
+6:
+        vorr            d2,  d6,  d7
+        vmov            r8,  r9,  d2
+        orrs            r8,  r8,  r9
+        @ If no pixels needed flat8in nor flat8out, jump to a
+        @ writeout of the inner 4 pixels
+        beq             7f
+        vmov            r8,  r9,  d7
+        orrs            r8,  r8,  r9
+        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+        beq             8f
+
+        @ flat8out
+        @ This writes all outputs into d2-d17 (skipping d6 and d16).
+        @ If this part is skipped, the output is read from d21-d26 (which is the input
+        @ to this section).
+        vshl.u16        d0,  d16, #3  @ 8 * d16
+        vsub.u16        d0,  d0,  d16 @ 7 * d16
+        vadd.u16        d0,  d0,  d17
+        vadd.u16        d8,  d17, d18
+        vadd.u16        d10, d19, d20
+        vadd.s16        d0,  d0,  d8
+        vadd.u16        d8,  d16, d17
+        vadd.u16        d12, d21, d22
+        vadd.s16        d0,  d0,  d10
+        vadd.u16        d10, d18, d25
+        vadd.u16        d14, d23, d24
+        vsub.s16        d10, d10, d8
+        vadd.s16        d0,  d0,  d12
+        vadd.s16        d0,  d0,  d14
+        vadd.u16        d12, d16, d18
+        vadd.u16        d14, d19, d26
+        vrshr.u16       d2,  d0,  #4
+
+        vadd.s16        d0,  d0,  d10
+        vadd.u16        d8,  d16, d19
+        vadd.u16        d10, d20, d27
+        vsub.s16        d14, d14, d12
+        vbif            d2,  d17, d7
+        vrshr.u16       d3,  d0,  #4
+
+        vadd.s16        d0,  d0,  d14
+        vadd.u16        d12, d16, d20
+        vadd.u16        d14, d21, d28
+        vsub.s16        d10, d10, d8
+        vbif            d3,  d18, d7
+        vrshr.u16       d4,  d0,  #4
+
+        vadd.s16        d0,  d0,  d10
+        vadd.u16        d8,  d16, d21
+        vadd.u16        d10, d22, d29
+        vsub.s16        d14, d14, d12
+        vbif            d4,  d19, d7
+        vrshr.u16       d5,  d0,  #4
+
+        vadd.s16        d0,  d0,  d14
+        vadd.u16        d12, d16, d22
+        vadd.u16        d14, d23, d30
+        vsub.s16        d10, d10, d8
+        vbif            d5,  d20, d7
+        vrshr.u16       d6,  d0,  #4
+
+        vadd.s16        d0,  d0,  d10
+        vadd.u16        d10, d16, d23
+        vsub.s16        d14, d14, d12
+        vadd.u16        d12, d24, d31
+        vbif            d6,  d21, d7
+        vrshr.u16       d8,  d0,  #4
+
+        vadd.s16        d0,  d0,  d14
+        vsub.s16        d10, d12, d10
+        vadd.u16        d12, d17, d24
+        vadd.u16        d14, d25, d31
+        vbif            d8,  d22, d7
+        vrshr.u16       d9,  d0,  #4
+
+        vadd.s16        d0,  d0,  d10
+        vsub.s16        d14, d14, d12
+        vadd.u16        d12, d26, d31
+        vbif            d9,  d23, d7
+        vrshr.u16       d10, d0,  #4
+
+        vadd.s16        d0,  d0,  d14
+        vadd.u16        d14, d18, d25
+        vadd.u16        d18, d19, d26
+        vsub.s16        d12, d12, d14
+        vadd.u16        d14, d27, d31
+        vbif            d10, d24, d7
+        vrshr.u16       d11, d0,  #4
+
+        vadd.s16        d0,  d0,  d12
+        vadd.u16        d12, d20, d27
+        vsub.s16        d14, d14, d18
+        vadd.u16        d18, d28, d31
+        vbif            d11, d25, d7
+        vsub.s16        d18, d18, d12
+        vrshr.u16       d12, d0,  #4
+
+        vadd.s16        d0,  d0,  d14
+        vadd.u16        d14, d21, d28
+        vadd.u16        d20, d29, d31
+        vbif            d12, d26, d7
+        vrshr.u16       d13, d0,  #4
+
+        vadd.s16        d0,  d0,  d18
+        vsub.s16        d20, d20, d14
+        vadd.u16        d18, d22, d29
+        vadd.u16        d22, d30, d31
+        vbif            d13, d27, d7
+        vrshr.u16       d14, d0,  #4
+
+        vadd.s16        d0,  d0,  d20
+        vsub.s16        d22, d22, d18
+        vbif            d14, d28, d7
+        vrshr.u16       d15, d0,  #4
+
+        vadd.s16        d0,  d0,  d22
+        vbif            d15, d29, d7
+        vrshr.u16       d17, d0,  #4
+        vbif            d17, d30, d7
+.endif
+.endm
+
+.macro loop_filter_q_4
+        loop_filter_q   4
+.endm
+
+.macro loop_filter_q_8
+        loop_filter_q   8
+.endm
+
+.macro loop_filter_16
+        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15
+.endm
+
+
+@ The public functions in this file have got the following signature:
+@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
+
+.macro bpp_frontend func, bpp
+function ff_\func\()_\bpp\()_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        vpush           {q4-q7}
+        lsl             r2,  r2,  #\bpp - 8
+        lsl             r3,  r3,  #\bpp - 8
+        lsl             r4,  r4,  #\bpp - 8
+        mov             r5,  #1 << (\bpp - 8)
+        mov             r6,  #16 - \bpp
+        movw            r7,  #((1 << \bpp) - 1)
+        bl              \func\()_16_neon
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends func
+        bpp_frontend    \func, 10
+        bpp_frontend    \func, 12
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        vpush           {q4-q7}
+        lsl             r2,  r2,  #\bpp - 8
+        lsl             r3,  r3,  #\bpp - 8
+        lsl             r4,  r4,  #\bpp - 8
+        mov             r5,  #1 << (\bpp - 8)
+        mov             r6,  #16 - \bpp
+        movw            r7,  #((1 << \bpp) - 1)
+        bl              \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+        add             r0,  r0,  r1, lsl #2
+.else
+        add             r0,  r0,  #8
+.endif
+        bl              \func\()_\int_suffix\()_16_neon
+.if \rep >= 4
+.ifc \dir,h
+        add             r0,  r0,  r1, lsl #2
+        bl              \func\()_\int_suffix\()_16_neon
+        add             r0,  r0,  r1, lsl #2
+        bl              \func\()_\int_suffix\()_16_neon
+.else
+        add             r0,  r0,  #8
+        bl              \func\()_\int_suffix\()_16_neon
+        add             r0,  r0,  #8
+        bl              \func\()_\int_suffix\()_16_neon
+.endif
+.endif
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
+        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
+        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
+.endm
+
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        vpush           {q4-q7}
+        push            {r2, r3, r4}
+        and             r2,  r2,  #0xff
+        and             r3,  r3,  #0xff
+        and             r4,  r4,  #0xff
+        lsl             r2,  r2,  #\bpp - 8
+        lsl             r3,  r3,  #\bpp - 8
+        lsl             r4,  r4,  #\bpp - 8
+        mov             r5,  #1 << (\bpp - 8)
+        mov             r6,  #16 - \bpp
+        movw            r7,  #((1 << \bpp) - 1)
+        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+        add             r0,  r0,  r1, lsl #3
+.else
+        add             r0,  r0,  #16
+.endif
+        pop             {r2, r3, r4}
+        lsr             r2,  r2,  #8
+        lsr             r3,  r3,  #8
+        lsr             r4,  r4,  #8
+        lsl             r2,  r2,  #\bpp - 8
+        lsl             r3,  r3,  #\bpp - 8
+        lsl             r4,  r4,  #\bpp - 8
+        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+        bpp_frontend_mix2 \wd1, \wd2, v, 10
+        bpp_frontend_mix2 \wd1, \wd2, v, 12
+        bpp_frontend_mix2 \wd1, \wd2, h, 10
+        bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+        sub             r12, r0,  r1, lsl #2
+        vld1.16         {q8},  [r12,:128], r1 @ p3
+        vld1.16         {q12}, [r0, :128], r1 @ q0
+        vld1.16         {q9},  [r12,:128], r1 @ p2
+        vld1.16         {q13}, [r0, :128], r1 @ q1
+        vld1.16         {q10}, [r12,:128], r1 @ p1
+        vld1.16         {q14}, [r0, :128], r1 @ q2
+        vld1.16         {q11}, [r12,:128], r1 @ p0
+        vld1.16         {q15}, [r0, :128], r1 @ q3
+        sub             r0,  r0,  r1, lsl #2
+        sub             r12, r12, r1, lsl #1
+
+        loop_filter_q_4
+
+        vst1.16         {q10}, [r12,:128], r1
+        vst1.16         {q12}, [r0, :128], r1
+        vst1.16         {q11}, [r12,:128], r1
+        vst1.16         {q13}, [r0, :128], r1
+        sub             r0,  r0,  r1, lsl #1
+9:
+        bx              lr
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
+
+function vp9_loop_filter_h_4_8_16_neon
+        sub             r12, r0,  #8
+        add             r0,  r12, r1, lsl #2
+        vld1.16         {q8},  [r12,:64], r1
+        vld1.16         {q12}, [r0, :64], r1
+        vld1.16         {q9},  [r12,:64], r1
+        vld1.16         {q13}, [r0, :64], r1
+        vld1.16         {q10}, [r12,:64], r1
+        vld1.16         {q14}, [r0, :64], r1
+        vld1.16         {q11}, [r12,:64], r1
+        vld1.16         {q15}, [r0, :64], r1
+
+        sub             r12, r12, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+        @ outermost 2 pixels since they aren't changed.
+        add             r12, r12, #4
+        add             r0,  r0,  #4
+
+        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        loop_filter_q_4
+
+        @ We only will write the mid 4 pixels back; after the loop filter,
+        @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
+        @ We need to transpose them to columns, done with a
+        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
+        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
+        transpose16_4x4 q10, q11, q12, q13
+
+        vst1.16         {d20}, [r12], r1
+        vst1.16         {d21}, [r0],  r1
+        vst1.16         {d22}, [r12], r1
+        vst1.16         {d23}, [r0],  r1
+        vst1.16         {d24}, [r12], r1
+        vst1.16         {d25}, [r0],  r1
+        vst1.16         {d26}, [r12], r1
+        vst1.16         {d27}, [r0],  r1
+        sub             r12, r12, r1, lsl #2
+9:
+        add             r0,  r12, #4
+        bx              lr
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+
+function vp9_loop_filter_v_8_8_16_neon
+        sub             r12, r0,  r1, lsl #2
+        vld1.16         {q8},  [r12,:128], r1 @ p3
+        vld1.16         {q12}, [r0, :128], r1 @ q0
+        vld1.16         {q9},  [r12,:128], r1 @ p2
+        vld1.16         {q13}, [r0, :128], r1 @ q1
+        vld1.16         {q10}, [r12,:128], r1 @ p1
+        vld1.16         {q14}, [r0, :128], r1 @ q2
+        vld1.16         {q11}, [r12,:128], r1 @ p0
+        vld1.16         {q15}, [r0, :128], r1 @ q3
+        sub             r12, r12, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+        add             r12, r12, r1
+
+        loop_filter_q_8
+
+        vst1.16         {q9},  [r12,:128], r1
+        vst1.16         {q12}, [r0, :128], r1
+        vst1.16         {q10}, [r12,:128], r1
+        vst1.16         {q13}, [r0, :128], r1
+        vst1.16         {q11}, [r12,:128], r1
+        vst1.16         {q14}, [r0, :128], r1
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+9:
+        bx              lr
+6:
+        sub             r12, r0,  r1, lsl #1
+        vst1.16         {q10}, [r12,:128], r1
+        vst1.16         {q12}, [r0, :128], r1
+        vst1.16         {q11}, [r12,:128], r1
+        vst1.16         {q13}, [r0, :128], r1
+        sub             r0,  r0,  r1, lsl #1
+        bx              lr
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+
+function vp9_loop_filter_h_8_8_16_neon
+        sub             r12, r0,  #8
+        add             r0,  r12, r1, lsl #2
+        vld1.16         {q8},  [r12,:64], r1
+        vld1.16         {q12}, [r0, :64], r1
+        vld1.16         {q9},  [r12,:64], r1
+        vld1.16         {q13}, [r0, :64], r1
+        vld1.16         {q10}, [r12,:64], r1
+        vld1.16         {q14}, [r0, :64], r1
+        vld1.16         {q11}, [r12,:64], r1
+        vld1.16         {q15}, [r0, :64], r1
+
+        sub             r12, r12, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+
+        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        loop_filter_q_8
+
+        @ Even though only 6 pixels per row have been changed, we write the
+        @ full 8 pixel registers.
+        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        vst1.16         {q8},  [r12,:64], r1
+        vst1.16         {q12}, [r0, :64], r1
+        vst1.16         {q9},  [r12,:64], r1
+        vst1.16         {q13}, [r0, :64], r1
+        vst1.16         {q10}, [r12,:64], r1
+        vst1.16         {q14}, [r0, :64], r1
+        vst1.16         {q11}, [r12,:64], r1
+        vst1.16         {q15}, [r0, :64], r1
+        sub             r12, r12, r1, lsl #2
+9:
+        add             r0,  r12, #8
+        bx              lr
+6:
+        @ If we didn't need to do the flat8in part, we use the same writeback
+        @ as in loop_filter_h_4_8.
+        add             r12, r12, #4
+        add             r0,  r0,  #4
+        transpose16_4x4 q10, q11, q12, q13
+
+        vst1.16         {d20}, [r12], r1
+        vst1.16         {d21}, [r0],  r1
+        vst1.16         {d22}, [r12], r1
+        vst1.16         {d23}, [r0],  r1
+        vst1.16         {d24}, [r12], r1
+        vst1.16         {d25}, [r0],  r1
+        vst1.16         {d26}, [r12], r1
+        vst1.16         {d27}, [r0],  r1
+        sub             r12, r12, r1, lsl #2
+        add             r0,  r12, #4
+        bx              lr
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_4_16_neon
+        sub             r12, r0,  r1, lsl #3
+        @ Read p7-p0 using r12 and q0-q7 using r0
+        vld1.16         {d16}, [r12,:64], r1 @ p7
+        vld1.16         {d24}, [r0, :64], r1 @ q0
+        vld1.16         {d17}, [r12,:64], r1 @ p6
+        vld1.16         {d25}, [r0, :64], r1 @ q1
+        vld1.16         {d18}, [r12,:64], r1 @ p5
+        vld1.16         {d26}, [r0, :64], r1 @ q2
+        vld1.16         {d19}, [r12,:64], r1 @ p4
+        vld1.16         {d27}, [r0, :64], r1 @ q3
+        vld1.16         {d20}, [r12,:64], r1 @ p3
+        vld1.16         {d28}, [r0, :64], r1 @ q4
+        vld1.16         {d21}, [r12,:64], r1 @ p2
+        vld1.16         {d29}, [r0, :64], r1 @ q5
+        vld1.16         {d22}, [r12,:64], r1 @ p1
+        vld1.16         {d30}, [r0, :64], r1 @ q6
+        vld1.16         {d23}, [r12,:64], r1 @ p0
+        vld1.16         {d31}, [r0, :64], r1 @ q7
+        sub             r12, r12, r1, lsl #3
+        sub             r0,  r0,  r1, lsl #3
+        add             r12, r12, r1
+
+        loop_filter_16
+
+        @ If we did the flat8out part, we get the output in
+        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
+        @ store d2-d9 there, and d10-d17 into r0.
+        vst1.16         {d2},  [r12,:64], r1
+        vst1.16         {d10}, [r0, :64], r1
+        vst1.16         {d3},  [r12,:64], r1
+        vst1.16         {d11}, [r0, :64], r1
+        vst1.16         {d4},  [r12,:64], r1
+        vst1.16         {d12}, [r0, :64], r1
+        vst1.16         {d5},  [r12,:64], r1
+        vst1.16         {d13}, [r0, :64], r1
+        vst1.16         {d6},  [r12,:64], r1
+        vst1.16         {d14}, [r0, :64], r1
+        vst1.16         {d8},  [r12,:64], r1
+        vst1.16         {d15}, [r0, :64], r1
+        vst1.16         {d9},  [r12,:64], r1
+        vst1.16         {d17}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #3
+        add             r0,  r0,  r1
+
+9:
+        bx              lr
+
+8:
+        add             r12, r12, r1, lsl #2
+        @ If we didn't do the flat8out part, the output is left in the
+        @ input registers.
+        vst1.16         {d21}, [r12,:64], r1
+        vst1.16         {d24}, [r0, :64], r1
+        vst1.16         {d22}, [r12,:64], r1
+        vst1.16         {d25}, [r0, :64], r1
+        vst1.16         {d23}, [r12,:64], r1
+        vst1.16         {d26}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+        bx              lr
+7:
+        sub             r12, r0,  r1, lsl #1
+        vst1.16         {d22}, [r12,:64], r1
+        vst1.16         {d24}, [r0, :64], r1
+        vst1.16         {d23}, [r12,:64], r1
+        vst1.16         {d25}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #1
+        bx              lr
+endfunc
+
+bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v
+
+function vp9_loop_filter_h_16_4_16_neon
+        sub             r12, r0,  #16
+        sub             r0,  r0,  #8
+        vld1.16         {d16}, [r12,:64], r1
+        vld1.16         {d20}, [r0, :64], r1
+        vld1.16         {d17}, [r12,:64], r1
+        vld1.16         {d21}, [r0, :64], r1
+        vld1.16         {d18}, [r12,:64], r1
+        vld1.16         {d22}, [r0, :64], r1
+        vld1.16         {d19}, [r12,:64], r1
+        vld1.16         {d23}, [r0, :64], r1
+        sub             r12, r12, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+        add             r12, r12, #16
+        add             r0,  r0,  #16
+        vld1.16         {d24}, [r12,:64], r1
+        vld1.16         {d28}, [r0, :64], r1
+        vld1.16         {d25}, [r12,:64], r1
+        vld1.16         {d29}, [r0, :64], r1
+        vld1.16         {d26}, [r12,:64], r1
+        vld1.16         {d30}, [r0, :64], r1
+        vld1.16         {d27}, [r12,:64], r1
+        vld1.16         {d31}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #2
+        sub             r12, r12, r1, lsl #2
+        sub             r12, r12, #16
+        sub             r0,  r0,  #16
+
+        @ The 16x4 pixels read above is in four 4x4 blocks
+        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
+        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
+        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
+        transpose16_q_4x4 q14, q15, d28, d29, d30, d31
+
+        loop_filter_16
+
+        @ Transpose back; this is the same transpose as above, but
+        @ we can't take advantage of q registers for the transpose, since
+        @ all d registers in the transpose aren't consecutive.
+        transpose16_4x4 d16, d2,  d3,  d4
+        transpose16_4x4 d5,  d6,  d8,  d9
+        transpose16_4x4 d10, d11, d12, d13
+        transpose16_4x4 d14, d15, d17, d31
+
+        vst1.16         {d16}, [r12,:64], r1
+        vst1.16         {d5},  [r0, :64], r1
+
+        vst1.16         {d2},  [r12,:64], r1
+        vst1.16         {d6},  [r0, :64], r1
+
+        vst1.16         {d3},  [r12,:64], r1
+        vst1.16         {d8},  [r0, :64], r1
+
+        vst1.16         {d4},  [r12,:64], r1
+        vst1.16         {d9},  [r0, :64], r1
+
+        sub             r12, r12, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+        add             r12, r12, #16
+        add             r0,  r0,  #16
+
+        vst1.16         {d10}, [r12,:64], r1
+        vst1.16         {d14}, [r0, :64], r1
+
+        vst1.16         {d11}, [r12,:64], r1
+        vst1.16         {d15}, [r0, :64], r1
+
+        vst1.16         {d12}, [r12,:64], r1
+        vst1.16         {d17}, [r0, :64], r1
+
+        vst1.16         {d13}, [r12,:64], r1
+        vst1.16         {d31}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #2
+        sub             r0,  r0,  #8
+        bx              lr
+9:
+        add             r0,  r0,  #8
+        bx              lr
+8:
+        add             r12, r12, #8
+        add             r0,  r0,  #8
+        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
+        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
+
+        vst1.16         {d20}, [r12,:64], r1
+        vst1.16         {d24}, [r0, :64], r1
+        vst1.16         {d21}, [r12,:64], r1
+        vst1.16         {d25}, [r0, :64], r1
+        vst1.16         {d22}, [r12,:64], r1
+        vst1.16         {d26}, [r0, :64], r1
+        vst1.16         {d23}, [r12,:64], r1
+        vst1.16         {d27}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #2
+        bx              lr
+7:
+        add             r12, r12, #12
+        add             r0,  r12, r1, lsl #1
+        transpose16_q_4x4 q11, q12, d22, d23, d24, d25
+
+        vst1.16         {d22}, [r12], r1
+        vst1.16         {d24}, [r0],  r1
+        vst1.16         {d23}, [r12], r1
+        vst1.16         {d25}, [r0],  r1
+        sub             r0,  r0,  r1, lsl #2
+        add             r0,  r0,  #4
+        bx              lr
+endfunc
+
+bpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index ae782b2..4b36080 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp9mc_16bpp_neon.S b/libavcodec/arm/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000..f6ec037
--- /dev/null
+++ b/libavcodec/arm/vp9mc_16bpp_neon.S
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@                            const uint8_t *ref, ptrdiff_t ref_stride,
+@                            int h, int mx, int my);
+
+function ff_vp9_copy128_neon, export=1
+        ldr             r12, [sp]
+        sub             r1,  r1,  #96
+        sub             r3,  r3,  #96
+1:
+        subs            r12, r12, #1
+        vld1.16         {q0,  q1},  [r2]!
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q2,  q3},  [r2]!
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2], r3
+        vst1.16         {q10, q11}, [r0, :128], r1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1
+        push            {lr}
+        ldr             r12, [sp, #4]
+        sub             r1,  r1,  #96
+        sub             r3,  r3,  #96
+        mov             lr,  r0
+1:
+        subs            r12, r12, #1
+        vld1.16         {q8,  q9},  [r2]!
+        vld1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vrhadd.u16      q0,  q0,  q8
+        vld1.16         {q2,  q3},  [r0, :128]!
+        vrhadd.u16      q1,  q1,  q9
+        vld1.16         {q12, q13}, [r2]!
+        vrhadd.u16      q2,  q2,  q10
+        vst1.16         {q0,  q1},  [lr, :128]!
+        vrhadd.u16      q3,  q3,  q11
+        vld1.16         {q8,  q9},  [r0, :128]!
+        vst1.16         {q2,  q3},  [lr, :128]!
+        vrhadd.u16      q8,  q8,  q12
+        vld1.16         {q14, q15}, [r2], r3
+        vrhadd.u16      q9,  q9,  q13
+        vld1.16         {q10, q11}, [r0, :128], r1
+        vrhadd.u16      q10, q10, q14
+        vst1.16         {q8,  q9},  [lr, :128]!
+        vrhadd.u16      q11, q11, q15
+        vst1.16         {q10, q11}, [lr, :128], r1
+        bne             1b
+        pop             {pc}
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+        push            {lr}
+        ldr             r12, [sp, #4]
+        sub             r1,  r1,  #32
+        sub             r3,  r3,  #32
+        mov             lr,  r0
+1:
+        subs            r12, r12, #1
+        vld1.16         {q8,  q9},  [r2]!
+        vld1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2], r3
+        vrhadd.u16      q0,  q0,  q8
+        vld1.16         {q2,  q3},  [r0, :128], r1
+        vrhadd.u16      q1,  q1,  q9
+        vrhadd.u16      q2,  q2,  q10
+        vst1.16         {q0, q1},  [lr, :128]!
+        vrhadd.u16      q3,  q3,  q11
+        vst1.16         {q2, q3},  [lr, :128], r1
+        bne             1b
+        pop             {pc}
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+        ldr             r12, [sp]
+1:
+        subs            r12, r12, #1
+        vld1.16         {q2,  q3},  [r2], r3
+        vld1.16         {q0,  q1},  [r0, :128]
+        vrhadd.u16      q0,  q0,  q2
+        vrhadd.u16      q1,  q1,  q3
+        vst1.16         {q0,  q1},  [r0, :128], r1
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+        push            {lr}
+        ldr             r12, [sp, #4]
+        mov             lr,  r0
+1:
+        subs            r12, r12, #2
+        vld1.16         {q2},  [r2], r3
+        vld1.16         {q0},  [r0, :128], r1
+        vld1.16         {q3},  [r2], r3
+        vrhadd.u16      q0,  q0,  q2
+        vld1.16         {q1},  [r0, :128], r1
+        vrhadd.u16      q1,  q1,  q3
+        vst1.16         {q0},  [lr, :128], r1
+        vst1.16         {q1},  [lr, :128], r1
+        bne             1b
+        pop             {pc}
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+        ldr             r12, [sp]
+1:
+        subs            r12, r12, #2
+        vld1.16         {d2},  [r2], r3
+        vld1.16         {d0},  [r0, :64], r1
+        vld1.16         {d3},  [r2], r3
+        vrhadd.u16      d0,  d0,  d2
+        vld1.16         {d1},  [r0, :64]
+        sub             r0,  r0,  r1
+        vrhadd.u16      d1,  d1,  d3
+        vst1.16         {d0},  [r0, :64], r1
+        vst1.16         {d1},  [r0, :64], r1
+        bne             1b
+        bx              lr
+endfunc
+
+@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
+.macro vmull_lane dst, src, idx
+.if \idx < 4
+       vmull.s16        \dst, \src, d0[\idx]
+.else
+       vmull.s16        \dst, \src, d1[\idx - 4]
+.endif
+.endm
+.macro vmlal_lane dst, src, idx
+.if \idx < 4
+       vmlal.s16        \dst, \src, d0[\idx]
+.else
+       vmlal.s16        \dst, \src, d1[\idx - 4]
+.endif
+.endm
+
+@ Extract a vector from src1-src2 and src3-src4, andmultiply-accumulate
+@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
+.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
+        vext.8          q14, \src1, \src2, #(2*\offset)
+        vext.8          q15, \src3, \src4, #(2*\offset)
+        vmlal_lane      \dst1,  d28, \offset
+        vmlal_lane      \dst3,  d30, \offset
+.if \size >= 8
+        vmlal_lane      \dst2,  d29, \offset
+        vmlal_lane      \dst4,  d31, \offset
+.endif
+.endm
+
+
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4 or 8 pixels in parallel; for larger
+@ widths it will do 8 pixels at a time and loop horizontally.
+@ The actual width (in bytes) is passed in r5, the height in r4 and
+@ the filter coefficients in r12.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+        sub             r2,  r2,  #6
+        add             r6,  r0,  r1
+        add             r7,  r2,  r3
+        add             r1,  r1,  r1
+        add             r3,  r3,  r3
+        @ Only size >= 8 loops horizontally and needs
+        @ reduced dst stride
+.if \size >= 8
+        sub             r1,  r1,  r5
+.endif
+        @ size >= 8 loads two qwords and increments r2,
+        @ for size 4 it's enough with three dwords and no
+        @ postincrement
+.if \size >= 8
+        sub             r3,  r3,  r5
+        sub             r3,  r3,  #16
+.endif
+        @ Load the filter vector
+        vld1.16         {q0},  [r12,:128]
+1:
+.if \size >= 8
+        mov             r12, r5
+.endif
+        @ Load src
+.if \size >= 8
+        vld1.16         {q8,  q9},  [r2]!
+        vld1.16         {q10, q11}, [r7]!
+.else
+        vld1.16         {d16, d17, d18}, [r2]
+        vld1.16         {d20, d21, d22}, [r7]
+.endif
+2:
+
+        vmull.s16       q1,  d16, d0[0]
+        vmull.s16       q12, d20, d0[0]
+.if \size >= 8
+        vmull.s16       q2,  d17, d0[0]
+        vmull.s16       q13, d21, d0[0]
+.endif
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 1, \size
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 2, \size
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 3, \size
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 4, \size
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 5, \size
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 6, \size
+        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 7, \size
+
+        @ Round, shift and saturate.
+        @ The vqrshrun takes care of clamping negative values to zero, but
+        @ we manually need to do vmin with the max pixel value.
+        vqrshrun.s32    d2,  q1,  #7
+        vqrshrun.s32    d24, q12, #7
+.if \size >= 8
+        vqrshrun.s32    d3,  q2,  #7
+        vqrshrun.s32    d25, q13, #7
+        vmin.u16        q1,  q1,  q3
+        vmin.u16        q12, q12, q3
+.else
+        vmin.u16        d2,  d2,  d6
+        vmin.u16        d24, d24, d6
+.endif
+        @ Average
+.ifc \type,avg
+.if \size >= 8
+        vld1.16         {q14}, [r0,:128]
+        vld1.16         {q15}, [r6,:128]
+        vrhadd.u16      q1,  q1,  q14
+        vrhadd.u16      q12, q12, q15
+.else
+        vld1.16         {d28}, [r0,:64]
+        vld1.16         {d30}, [r6,:64]
+        vrhadd.u16      d2,  d2,  d28
+        vrhadd.u16      d24, d24, d30
+.endif
+.endif
+        @ Store and loop horizontally (for size >= 8)
+.if \size >= 8
+        subs            r12, r12, #16
+        vst1.16         {q1},  [r0,:128]!
+        vst1.16         {q12}, [r6,:128]!
+        beq             3f
+        vmov            q8,  q9
+        vmov            q10, q11
+        vld1.16         {q9},  [r2]!
+        vld1.16         {q11}, [r7]!
+        b               2b
+.else @ \size == 4
+        vst1.16         {d2},  [r0,:64]
+        vst1.16         {d24}, [r6,:64]
+.endif
+3:
+        @ Loop vertically
+        add             r0,  r0,  r1
+        add             r6,  r6,  r1
+        add             r2,  r2,  r3
+        add             r7,  r7,  r3
+        subs            r4,  r4,  #2
+        bne             1b
+        pop             {r4-r7}
+        bx              lr
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+        push            {r4-r7}
+        ldr             r4,  [sp, #16]
+        ldr             r5,  [sp, #20]
+        vmvn.u16        q3,  #((0xffff << \bpp) & 0xffff)
+        movrelx         r12, X(ff_vp9_subpel_filters), r6
+        add             r12, r12, 256*\offset
+        add             r12, r12, r5, lsl #4
+        mov             r5,  #2*\size
+.if \size >= 8
+        b               \type\()_8tap_8h
+.else
+        b               \type\()_8tap_4h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp,   2, \size, \bpp
+do_8tap_h_func avg, sharp,   2, \size, \bpp
+do_8tap_h_func put, smooth,  0, \size, \bpp
+do_8tap_h_func avg, smooth,  0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8,  \bpp
+do_8tap_h_filters 4,  \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+.ltorg
+
+@ Vertical filters
+
+@ Round, shift and saturate and store qreg1-4
+.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+        vqrshrun.s32    \dreg1,  \qreg1, #7
+        vqrshrun.s32    \dreg2,  \qreg2, #7
+        vqrshrun.s32    \dreg3,  \qreg3, #7
+        vqrshrun.s32    \dreg4,  \qreg4, #7
+.ifc \type,avg
+        vld1.16         {\tmp1},  [r6,:64], r1
+        vld1.16         {\tmp2},  [r6,:64], r1
+        vld1.16         {\tmp3},  [r6,:64], r1
+        vld1.16         {\tmp4},  [r6,:64], r1
+.endif
+        vmin.u16        \dreg1,  \dreg1,  \minreg
+        vmin.u16        \dreg2,  \dreg2,  \minreg
+        vmin.u16        \dreg3,  \dreg3,  \minreg
+        vmin.u16        \dreg4,  \dreg4,  \minreg
+.ifc \type,avg
+        vrhadd.u16      \dreg1,  \dreg1,  \tmp1
+        vrhadd.u16      \dreg2,  \dreg2,  \tmp2
+        vrhadd.u16      \dreg3,  \dreg3,  \tmp3
+        vrhadd.u16      \dreg4,  \dreg4,  \tmp4
+.endif
+        vst1.16         {\dreg1}, [r0,:64], r1
+        vst1.16         {\dreg2}, [r0,:64], r1
+        vst1.16         {\dreg3}, [r0,:64], r1
+        vst1.16         {\dreg4}, [r0,:64], r1
+.endm
+
+@ Round, shift and saturate and store qreg1-4
+@ qreg1-2 belong to one line and qreg3-4 to the second line.
+@ dreg1-2 == qreg1, dreg3-4 == qreg2.
+.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
+        vqrshrun.s32    \dreg1,  \qreg1, #7
+        vqrshrun.s32    \dreg2,  \qreg2, #7
+        vqrshrun.s32    \dreg3,  \qreg3, #7
+        vqrshrun.s32    \dreg4,  \qreg4, #7
+.ifc \type,avg
+        vld1.16         {\qreg3},  [r6,:128], r1
+        vld1.16         {\qreg4},  [r6,:128], r1
+.endif
+        vmin.u16        \qreg1,  \qreg1,  \minreg
+        vmin.u16        \qreg2,  \qreg2,  \minreg
+.ifc \type,avg
+        vrhadd.u16      \qreg1,  \qreg1,  \qreg3
+        vrhadd.u16      \qreg2,  \qreg2,  \qreg4
+.endif
+        vst1.16         {\qreg1}, [r0,:128], r1
+        vst1.16         {\qreg2}, [r0,:128], r1
+.endm
+
+@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+@ (src1-src8 into dst1, src2-src9 into dst2).
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+        vmull.s16       \dst1, \src1, d0[0]
+        vmull.s16       \dst2, \src2, d0[0]
+        vmull.s16       \tmp1, \src2, d0[1]
+        vmull.s16       \tmp2, \src3, d0[1]
+        vmlal.s16       \dst1, \src3, d0[2]
+        vmlal.s16       \dst2, \src4, d0[2]
+        vmlal.s16       \tmp1, \src4, d0[3]
+        vmlal.s16       \tmp2, \src5, d0[3]
+        vmlal.s16       \dst1, \src5, d1[0]
+        vmlal.s16       \dst2, \src6, d1[0]
+        vmlal.s16       \tmp1, \src6, d1[1]
+        vmlal.s16       \tmp2, \src7, d1[1]
+        vmlal.s16       \dst1, \src7, d1[2]
+        vmlal.s16       \dst2, \src8, d1[2]
+        vmlal.s16       \tmp1, \src8, d1[3]
+        vmlal.s16       \tmp2, \src9, d1[3]
+        vadd.s32        \dst1, \dst1, \tmp1
+        vadd.s32        \dst2, \dst2, \tmp2
+.endm
+
+@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
+@ but with double width (two input/output registers per row).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
+        vmull.s16       \dst1, \src1,  d0[0]
+        vmull.s16       \dst2, \src2,  d0[0]
+        vmull.s16       \dst3, \src3,  d0[0]
+        vmull.s16       \dst4, \src4,  d0[0]
+        vmlal.s16       \dst1, \src3,  d0[1]
+        vmlal.s16       \dst2, \src4,  d0[1]
+        vmlal.s16       \dst3, \src5,  d0[1]
+        vmlal.s16       \dst4, \src6,  d0[1]
+        vmlal.s16       \dst1, \src5,  d0[2]
+        vmlal.s16       \dst2, \src6,  d0[2]
+        vmlal.s16       \dst3, \src7,  d0[2]
+        vmlal.s16       \dst4, \src8,  d0[2]
+        vmlal.s16       \dst1, \src7,  d0[3]
+        vmlal.s16       \dst2, \src8,  d0[3]
+        vmlal.s16       \dst3, \src9,  d0[3]
+        vmlal.s16       \dst4, \src10, d0[3]
+        vmlal.s16       \dst1, \src9,  d1[0]
+        vmlal.s16       \dst2, \src10, d1[0]
+        vmlal.s16       \dst3, \src11, d1[0]
+        vmlal.s16       \dst4, \src12, d1[0]
+        vmlal.s16       \dst1, \src11, d1[1]
+        vmlal.s16       \dst2, \src12, d1[1]
+        vmlal.s16       \dst3, \src13, d1[1]
+        vmlal.s16       \dst4, \src14, d1[1]
+        vmlal.s16       \dst1, \src13, d1[2]
+        vmlal.s16       \dst2, \src14, d1[2]
+        vmlal.s16       \dst3, \src15, d1[2]
+        vmlal.s16       \dst4, \src16, d1[2]
+        vmlal.s16       \dst1, \src15, d1[3]
+        vmlal.s16       \dst2, \src16, d1[3]
+        vmlal.s16       \dst3, \src17, d1[3]
+        vmlal.s16       \dst4, \src18, d1[3]
+.endm
+
+@ Instantiate a vertical filter function for filtering 8 pixels at a time.
+@ The height is passed in r4, the width in r5 and the filter coefficients
+@ in r12.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3
+        vld1.16         {q0},  [r12, :128]
+1:
+.ifc \type,avg
+        mov             r6,  r0
+.endif
+        mov             r12, r4
+
+        vld1.16         {q5},  [r2], r3
+        vld1.16         {q6},  [r2], r3
+        vld1.16         {q7},  [r2], r3
+        vld1.16         {q8},  [r2], r3
+        vld1.16         {q9},  [r2], r3
+        vld1.16         {q10}, [r2], r3
+        vld1.16         {q11}, [r2], r3
+2:
+        vld1.16         {q12}, [r2], r3
+        vld1.16         {q13}, [r2], r3
+        vld1.16         {q14}, [r2], r3
+        vld1.16         {q15}, [r2], r3
+        convolve8       q2,  q3,  q4,  q5,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
+        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type
+        convolve8       q2,  q3,  q4,  q5,  d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type
+
+        subs            r12, r12, #4
+        beq             8f
+
+        vld1.16         {q4},  [r2], r3
+        vld1.16         {q5},  [r2], r3
+        vld1.16         {q6},  [r2], r3
+        vld1.16         {q7},  [r2], r3
+        convolve8       q2,  q3,  q8,  q9,  d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11
+        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type
+        convolve8       q2,  q3,  q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15
+        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type
+
+        subs            r12, r12, #4
+        beq             8f
+
+        vld1.16         {q8},  [r2], r3
+        vld1.16         {q9},  [r2], r3
+        vld1.16         {q10}, [r2], r3
+        vld1.16         {q11}, [r2], r3
+        convolve8       q2,  q3,  q12, q13, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
+        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type
+        convolve8       q2,  q3,  q12, q13, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
+        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type
+
+        subs            r12, r12, #4
+        bne             2b
+
+8:
+        subs            r5,  r5,  #8
+        beq             9f
+        @ r0 -= h * dst_stride
+        mls             r0,  r1,  r4, r0
+        @ r2 -= h * src_stride
+        mls             r2,  r3,  r4, r2
+        @ r2 -= 8 * src_stride
+        sub             r2,  r2,  r3, lsl #3
+        @ r2 += 1 * src_stride
+        add             r2,  r2,  r3
+        add             r2,  r2,  #16
+        add             r0,  r0,  #16
+        b               1b
+9:
+        vpop            {q4-q7}
+        pop             {r4-r6}
+        bx              lr
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+@ Instantiate a vertical filter function for filtering a 4 pixels wide
+@ slice. This only is designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3
+        vld1.16         {q0},  [r12, :128]
+.ifc \type,avg
+        mov             r6,  r0
+.endif
+
+        vld1.16         {d16}, [r2], r3
+        vld1.16         {d17}, [r2], r3
+        vld1.16         {d18}, [r2], r3
+        vld1.16         {d19}, [r2], r3
+        vld1.16         {d20}, [r2], r3
+        vld1.16         {d21}, [r2], r3
+        vld1.16         {d22}, [r2], r3
+        vld1.16         {d23}, [r2], r3
+        vld1.16         {d24}, [r2], r3
+        vld1.16         {d25}, [r2], r3
+        vld1.16         {d26}, [r2], r3
+        convolve4       q2,  q3,  d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
+        convolve4       q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8,  q9
+        do_store4       q2,  d4,  q3,  d6,  q14, d28, q15, d30, d5,  d7,  d29, d31, d2,  \type
+
+        subs            r4,  r4,  #4
+        beq             9f
+
+        vld1.16         {d27}, [r2], r3
+        vld1.16         {d28}, [r2], r3
+        vld1.16         {d29}, [r2], r3
+        vld1.16         {d30}, [r2], r3
+        convolve4       q2,  q3,  d20, d21, d22, d23, d24, d25, d26, d27, d28, q8,  q9
+        convolve4       q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
+        do_store4       q2,  d4,  q3,  d6,  q8,  d16, q9,  d18, d5,  d7,  d17, d19, d2,  \type
+
+9:
+        pop             {r4-r6}
+        bx              lr
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+        push            {r4-r6}
+        ldr             r4,  [sp, #12]
+        ldr             r5,  [sp, #20]
+.if \size >= 8
+        vpush           {q4-q7}
+.endif
+        vmvn.u16        q1,  #((0xffff << \bpp) & 0xffff)
+        movrelx         r12, X(ff_vp9_subpel_filters), r6
+        add             r12, r12, 256*\offset
+        add             r12, r12, r5, lsl #4
+        mov             r5,  #\size
+.if \size >= 8
+        b               \type\()_8tap_8v
+.else
+        b               \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp,   2, \size, \bpp
+do_8tap_v_func avg, sharp,   2, \size, \bpp
+do_8tap_v_func put, smooth,  0, \size, \bpp
+do_8tap_v_func avg, smooth,  0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8,  \bpp
+do_8tap_v_filters 4,  \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
index 8d43ff1..bd8cda7 100644
--- a/libavcodec/arm/vp9mc_neon.S
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -286,8 +286,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2
         sub             r3,  r3,  #8
 .endif
         @ Load the filter vector
-        vld1.8          {d0},  [r12,:64]
-        vmovl.s8        q0,  d0
+        vld1.16         {q0},  [r12,:128]
 1:
 .if \size >= 16
         mov             r12, r5
@@ -416,9 +415,9 @@ function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
         ldr             r5,  [sp, #20]
 .endif
         movrelx         r12, X(ff_vp9_subpel_filters), r6
-        add             r12, r12, 120*\offset - 8
+        add             r12, r12, 256*\offset
         cmp             r5,  #8
-        add             r12, r12, r5, lsl #3
+        add             r12, r12, r5, lsl #4
         mov             r5,  #\size
 .if \size >= 16
         bge             \type\()_8tap_16h_34
@@ -551,8 +550,7 @@ do_8tap_h_filters 4
 function \type\()_8tap_8v_\idx1\idx2
         sub             r2,  r2,  r3, lsl #1
         sub             r2,  r2,  r3
-        vld1.8          {d0},  [r12, :64]
-        vmovl.s8        q0,  d0
+        vld1.16         {q0},  [r12, :128]
 1:
         mov             r12, r4
 
@@ -622,8 +620,7 @@ do_8tap_8v avg, 4, 3
 function \type\()_8tap_4v_\idx1\idx2
         sub             r2,  r2,  r3, lsl #1
         sub             r2,  r2,  r3
-        vld1.8          {d0},  [r12, :64]
-        vmovl.s8        q0,  d0
+        vld1.16         {q0},  [r12, :128]
 
         vld1.32         {d2[]},   [r2], r3
         vld1.32         {d3[]},   [r2], r3
@@ -693,8 +690,8 @@ function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
         ldr             r4,  [sp, #72]
         movrelx         r12, X(ff_vp9_subpel_filters), r5
         ldr             r5,  [sp, #80]
-        add             r12, r12, 120*\offset - 8
-        add             r12, r12, r5, lsl #3
+        add             r12, r12, 256*\offset
+        add             r12, r12, r5, lsl #4
         cmp             r5,  #8
         mov             r5,  #\size
 .if \size >= 8
diff --git a/libavcodec/ass.c b/libavcodec/ass.c
index def11f0..b4f081c 100644
--- a/libavcodec/ass.c
+++ b/libavcodec/ass.c
@@ -2,65 +2,75 @@
  * SSA/ASS common functions
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "ass.h"
+#include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/common.h"
 
-/**
- * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS.
- *
- * @param avctx pointer to the AVCodecContext
- * @param font name of the default font face to use
- * @param font_size default font size to use
- * @param color default text color to use (ABGR)
- * @param back_color default background color to use (ABGR)
- * @param bold 1 for bold text, 0 for normal text
- * @param italic 1 for italic text, 0 for normal text
- * @param underline 1 for underline text, 0 for normal text
- * @param alignment position of the text (left, center, top...), defined after
- *                  the layout of the numpad (1-3 sub, 4-6 mid, 7-9 top)
- * @return >= 0 on success otherwise an error code <0
- */
-static int ass_subtitle_header(AVCodecContext *avctx,
-                               const char *font, int font_size,
-                               int color, int back_color,
-                               int bold, int italic, int underline,
-                               int alignment)
+int ff_ass_subtitle_header(AVCodecContext *avctx,
+                           const char *font, int font_size,
+                           int color, int back_color,
+                           int bold, int italic, int underline,
+                           int border_style, int alignment)
 {
-    char header[512];
-
-    snprintf(header, sizeof(header),
+    avctx->subtitle_header = av_asprintf(
              "[Script Info]\r\n"
+             "; Script generated by FFmpeg/Lavc%s\r\n"
              "ScriptType: v4.00+\r\n"
+             "PlayResX: %d\r\n"
+             "PlayResY: %d\r\n"
              "\r\n"
              "[V4+ Styles]\r\n"
-             "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, AlphaLevel, Encoding\r\n"
-             "Style: Default,%s,%d,&H%x,&H%x,&H%x,&H%x,%d,%d,%d,1,1,0,%d,10,10,10,0,0\r\n"
+
+             /* ASSv4 header */
+             "Format: Name, "
+             "Fontname, Fontsize, "
+             "PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
+             "Bold, Italic, Underline, StrikeOut, "
+             "ScaleX, ScaleY, "
+             "Spacing, Angle, "
+             "BorderStyle, Outline, Shadow, "
+             "Alignment, MarginL, MarginR, MarginV, "
+             "Encoding\r\n"
+
+             "Style: "
+             "Default,"             /* Name */
+             "%s,%d,"               /* Font{name,size} */
+             "&H%x,&H%x,&H%x,&H%x," /* {Primary,Secondary,Outline,Back}Colour */
+             "%d,%d,%d,0,"          /* Bold, Italic, Underline, StrikeOut */
+             "100,100,"             /* Scale{X,Y} */
+             "0,0,"                 /* Spacing, Angle */
+             "%d,1,0,"              /* BorderStyle, Outline, Shadow */
+             "%d,10,10,10,"         /* Alignment, Margin[LRV] */
+             "0\r\n"                /* Encoding */
+
              "\r\n"
              "[Events]\r\n"
-             "Format: Layer, Start, End, Text\r\n",
+             "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n",
+             !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? AV_STRINGIFY(LIBAVCODEC_VERSION) : "",
+             ASS_DEFAULT_PLAYRESX, ASS_DEFAULT_PLAYRESY,
              font, font_size, color, color, back_color, back_color,
-             -bold, -italic, -underline, alignment);
+             -bold, -italic, -underline, border_style, alignment);
 
-    avctx->subtitle_header = av_strdup(header);
     if (!avctx->subtitle_header)
         return AVERROR(ENOMEM);
     avctx->subtitle_header_size = strlen(avctx->subtitle_header);
@@ -69,57 +79,89 @@ static int ass_subtitle_header(AVCodecContext *avctx,
 
 int ff_ass_subtitle_header_default(AVCodecContext *avctx)
 {
-    return ass_subtitle_header(avctx, ASS_DEFAULT_FONT,
+    return ff_ass_subtitle_header(avctx, ASS_DEFAULT_FONT,
                                ASS_DEFAULT_FONT_SIZE,
                                ASS_DEFAULT_COLOR,
                                ASS_DEFAULT_BACK_COLOR,
                                ASS_DEFAULT_BOLD,
                                ASS_DEFAULT_ITALIC,
                                ASS_DEFAULT_UNDERLINE,
+                               ASS_DEFAULT_BORDERSTYLE,
                                ASS_DEFAULT_ALIGNMENT);
 }
 
-void ff_ass_init(AVSubtitle *sub)
-{
-    memset(sub, 0, sizeof(*sub));
-}
-
-static int ts_to_string(char *str, int strlen, int ts)
+char *ff_ass_get_dialog(int readorder, int layer, const char *style,
+                        const char *speaker, const char *text)
 {
-    int h, m, s;
-    h = ts/360000;  ts -= 360000*h;
-    m = ts/  6000;  ts -=   6000*m;
-    s = ts/   100;  ts -=    100*s;
-    return snprintf(str, strlen, "%d:%02d:%02d.%02d", h, m, s, ts);
+    return av_asprintf("%d,%d,%s,%s,0,0,0,,%s",
+                       readorder, layer, style ? style : "Default",
+                       speaker ? speaker : "", text);
 }
 
 int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
-                    int ts_start, int ts_end, int raw)
+                    int readorder, int layer, const char *style,
+                    const char *speaker)
 {
-    int len = 0, dlen, duration = ts_end - ts_start;
-    char s_start[16], s_end[16], header[48] = {0};
+    char *ass_str;
     AVSubtitleRect **rects;
 
-    if (!raw) {
-        ts_to_string(s_start, sizeof(s_start), ts_start);
-        ts_to_string(s_end,   sizeof(s_end),   ts_end  );
-        len = snprintf(header, sizeof(header), "Dialogue: 0,%s,%s,",
-                       s_start, s_end);
-    }
-
-    dlen = strcspn(dialog, "\n");
-    dlen += dialog[dlen] == '\n';
-
-    rects = av_realloc(sub->rects, (sub->num_rects+1) * sizeof(*sub->rects));
+    rects = av_realloc_array(sub->rects, (sub->num_rects+1), sizeof(*sub->rects));
     if (!rects)
         return AVERROR(ENOMEM);
     sub->rects = rects;
-    sub->end_display_time = FFMAX(sub->end_display_time, 10 * duration);
     rects[sub->num_rects]       = av_mallocz(sizeof(*rects[0]));
+    if (!rects[sub->num_rects])
+        return AVERROR(ENOMEM);
     rects[sub->num_rects]->type = SUBTITLE_ASS;
-    rects[sub->num_rects]->ass  = av_malloc(len + dlen + 1);
-    strcpy (rects[sub->num_rects]->ass      , header);
-    av_strlcpy(rects[sub->num_rects]->ass + len, dialog, dlen + 1);
+    ass_str = ff_ass_get_dialog(readorder, layer, style, speaker, dialog);
+    if (!ass_str)
+        return AVERROR(ENOMEM);
+    rects[sub->num_rects]->ass = ass_str;
     sub->num_rects++;
-    return dlen;
+    return 0;
+}
+
+void ff_ass_decoder_flush(AVCodecContext *avctx)
+{
+    FFASSDecoderContext *s = avctx->priv_data;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        s->readorder = 0;
+}
+
+void ff_ass_bprint_text_event(AVBPrint *buf, const char *p, int size,
+                             const char *linebreaks, int keep_ass_markup)
+{
+    const char *p_end = p + size;
+
+    for (; p < p_end && *p; p++) {
+
+        /* forced custom line breaks, not accounted as "normal" EOL */
+        if (linebreaks && strchr(linebreaks, *p)) {
+            av_bprintf(buf, "\\N");
+
+        /* standard ASS escaping so random characters don't get mis-interpreted
+         * as ASS */
+        } else if (!keep_ass_markup && strchr("{}\\", *p)) {
+            av_bprintf(buf, "\\%c", *p);
+
+        /* some packets might end abruptly (no \0 at the end, like for example
+         * in some cases of demuxing from a classic video container), some
+         * might be terminated with \n or \r\n which we have to remove (for
+         * consistency with those who haven't), and we also have to deal with
+         * evil cases such as \r at the end of the buffer (and no \0 terminated
+         * character) */
+        } else if (p[0] == '\n') {
+            /* some stuff left so we can insert a line break */
+            if (p < p_end - 1)
+                av_bprintf(buf, "\\N");
+        } else if (p[0] == '\r' && p < p_end - 1 && p[1] == '\n') {
+            /* \r followed by a \n, we can skip it. We don't insert the \N yet
+             * because we don't know if it is followed by more text */
+            continue;
+
+        /* finally, a sane character */
+        } else {
+            av_bprint_chars(buf, *p, 1);
+        }
+    }
 }
diff --git a/libavcodec/ass.h b/libavcodec/ass.h
index 1302a04..314b43b 100644
--- a/libavcodec/ass.h
+++ b/libavcodec/ass.h
@@ -2,20 +2,20 @@
  * SSA/ASS common functions
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,10 @@
 #define AVCODEC_ASS_H
 
 #include "avcodec.h"
+#include "libavutil/bprint.h"
+
+#define ASS_DEFAULT_PLAYRESX 384
+#define ASS_DEFAULT_PLAYRESY 288
 
 /**
  * @name Default values for ASS style
@@ -36,8 +40,34 @@
 #define ASS_DEFAULT_ITALIC      0
 #define ASS_DEFAULT_UNDERLINE   0
 #define ASS_DEFAULT_ALIGNMENT   2
+#define ASS_DEFAULT_BORDERSTYLE 1
 /** @} */
 
+typedef struct FFASSDecoderContext {
+    int readorder;
+} FFASSDecoderContext;
+
+/**
+ * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS.
+ *
+ * @param avctx pointer to the AVCodecContext
+ * @param font name of the default font face to use
+ * @param font_size default font size to use
+ * @param color default text color to use (ABGR)
+ * @param back_color default background color to use (ABGR)
+ * @param bold 1 for bold text, 0 for normal text
+ * @param italic 1 for italic text, 0 for normal text
+ * @param underline 1 for underline text, 0 for normal text
+ * @param alignment position of the text (left, center, top...), defined after
+ *                  the layout of the numpad (1-3 sub, 4-6 mid, 7-9 top)
+ * @return >= 0 on success otherwise an error code <0
+ */
+int ff_ass_subtitle_header(AVCodecContext *avctx,
+                           const char *font, int font_size,
+                           int color, int back_color,
+                           int bold, int italic, int underline,
+                           int border_style, int alignment);
+
 /**
  * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS
  * with default style.
@@ -48,29 +78,34 @@
 int ff_ass_subtitle_header_default(AVCodecContext *avctx);
 
 /**
- * Initialize an AVSubtitle structure for use with ff_ass_add_rect().
- *
- * @param sub pointer to the AVSubtitle
+ * Craft an ASS dialog string.
  */
-void ff_ass_init(AVSubtitle *sub);
+char *ff_ass_get_dialog(int readorder, int layer, const char *style,
+                        const char *speaker, const char *text);
 
 /**
- * Add an ASS dialog line to an AVSubtitle as a new AVSubtitleRect.
- *
- * @param sub pointer to the AVSubtitle
- * @param dialog ASS dialog to add to sub
- * @param ts_start start timestamp for this dialog (in 1/100 second unit)
- * @param ts_end end timestamp for this dialog (in 1/100 second unit)
- * @param raw when set to 1, it indicates that dialog contains a whole ASS
- *                           dialog line which should be copied as is.
- *            when set to 0, it indicates that dialog contains only the Text
- *                           part of the ASS dialog line, the rest of the line
- *                           will be generated.
- * @return number of characters read from dialog. It can be less than the whole
- *         length of dialog, if dialog contains several lines of text.
- *         A negative value indicates an error.
+ * Add an ASS dialog to a subtitle.
  */
 int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
-                    int ts_start, int ts_end, int raw);
+                    int readorder, int layer, const char *style,
+                    const char *speaker);
+
+/**
+ * Helper to flush a text subtitles decoder making use of the
+ * FFASSDecoderContext.
+ */
+void ff_ass_decoder_flush(AVCodecContext *avctx);
 
+/**
+ * Escape a text subtitle using ASS syntax into an AVBPrint buffer.
+ * Newline characters will be escaped to \N.
+ *
+ * @param buf pointer to an initialized AVBPrint buffer
+ * @param p source text
+ * @param size size of the source text
+ * @param linebreaks additional newline chars, which will be escaped to \N
+ * @param keep_ass_markup braces and backslash will not be escaped if set
+ */
+void ff_ass_bprint_text_event(AVBPrint *buf, const char *p, int size,
+                             const char *linebreaks, int keep_ass_markup);
 #endif /* AVCODEC_ASS_H */
diff --git a/libavcodec/ass_split.c b/libavcodec/ass_split.c
new file mode 100644
index 0000000..67da7c6
--- /dev/null
+++ b/libavcodec/ass_split.c
@@ -0,0 +1,598 @@
+/*
+ * SSA/ASS spliting functions
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass_split.h"
+
+typedef enum {
+    ASS_STR,
+    ASS_INT,
+    ASS_FLT,
+    ASS_COLOR,
+    ASS_TIMESTAMP,
+    ASS_ALGN,
+} ASSFieldType;
+
+typedef struct {
+    const char *name;
+    int type;
+    int offset;
+} ASSFields;
+
+typedef struct {
+    const char *section;
+    const char *format_header;
+    const char *fields_header;
+    int         size;
+    int         offset;
+    int         offset_count;
+    ASSFields   fields[24];
+} ASSSection;
+
+static const ASSSection ass_sections[] = {
+    { .section       = "Script Info",
+      .offset        = offsetof(ASS, script_info),
+      .fields = {{"ScriptType", ASS_STR, offsetof(ASSScriptInfo, script_type)},
+                 {"Collisions", ASS_STR, offsetof(ASSScriptInfo, collisions) },
+                 {"PlayResX",   ASS_INT, offsetof(ASSScriptInfo, play_res_x) },
+                 {"PlayResY",   ASS_INT, offsetof(ASSScriptInfo, play_res_y) },
+                 {"Timer",      ASS_FLT, offsetof(ASSScriptInfo, timer)      },
+                 {0},
+        }
+    },
+    { .section       = "V4+ Styles",
+      .format_header = "Format",
+      .fields_header = "Style",
+      .size          = sizeof(ASSStyle),
+      .offset        = offsetof(ASS, styles),
+      .offset_count  = offsetof(ASS, styles_count),
+      .fields = {{"Name",            ASS_STR,   offsetof(ASSStyle, name)           },
+                 {"Fontname",        ASS_STR,   offsetof(ASSStyle, font_name)      },
+                 {"Fontsize",        ASS_INT,   offsetof(ASSStyle, font_size)      },
+                 {"PrimaryColour",   ASS_COLOR, offsetof(ASSStyle, primary_color)  },
+                 {"SecondaryColour", ASS_COLOR, offsetof(ASSStyle, secondary_color)},
+                 {"OutlineColour",   ASS_COLOR, offsetof(ASSStyle, outline_color)  },
+                 {"BackColour",      ASS_COLOR, offsetof(ASSStyle, back_color)     },
+                 {"Bold",            ASS_INT,   offsetof(ASSStyle, bold)           },
+                 {"Italic",          ASS_INT,   offsetof(ASSStyle, italic)         },
+                 {"Underline",       ASS_INT,   offsetof(ASSStyle, underline)      },
+                 {"StrikeOut",       ASS_INT,   offsetof(ASSStyle, strikeout)      },
+                 {"ScaleX",          ASS_FLT,   offsetof(ASSStyle, scalex)         },
+                 {"ScaleY",          ASS_FLT,   offsetof(ASSStyle, scaley)         },
+                 {"Spacing",         ASS_FLT,   offsetof(ASSStyle, spacing)        },
+                 {"Angle",           ASS_FLT,   offsetof(ASSStyle, angle)          },
+                 {"BorderStyle",     ASS_INT,   offsetof(ASSStyle, border_style)   },
+                 {"Outline",         ASS_FLT,   offsetof(ASSStyle, outline)        },
+                 {"Shadow",          ASS_FLT,   offsetof(ASSStyle, shadow)         },
+                 {"Alignment",       ASS_INT,   offsetof(ASSStyle, alignment)      },
+                 {"MarginL",         ASS_INT,   offsetof(ASSStyle, margin_l)       },
+                 {"MarginR",         ASS_INT,   offsetof(ASSStyle, margin_r)       },
+                 {"MarginV",         ASS_INT,   offsetof(ASSStyle, margin_v)       },
+                 {"Encoding",        ASS_INT,   offsetof(ASSStyle, encoding)       },
+                 {0},
+        }
+    },
+    { .section       = "V4 Styles",
+      .format_header = "Format",
+      .fields_header = "Style",
+      .size          = sizeof(ASSStyle),
+      .offset        = offsetof(ASS, styles),
+      .offset_count  = offsetof(ASS, styles_count),
+      .fields = {{"Name",            ASS_STR,   offsetof(ASSStyle, name)           },
+                 {"Fontname",        ASS_STR,   offsetof(ASSStyle, font_name)      },
+                 {"Fontsize",        ASS_INT,   offsetof(ASSStyle, font_size)      },
+                 {"PrimaryColour",   ASS_COLOR, offsetof(ASSStyle, primary_color)  },
+                 {"SecondaryColour", ASS_COLOR, offsetof(ASSStyle, secondary_color)},
+                 {"TertiaryColour",  ASS_COLOR, offsetof(ASSStyle, outline_color)  },
+                 {"BackColour",      ASS_COLOR, offsetof(ASSStyle, back_color)     },
+                 {"Bold",            ASS_INT,   offsetof(ASSStyle, bold)           },
+                 {"Italic",          ASS_INT,   offsetof(ASSStyle, italic)         },
+                 {"BorderStyle",     ASS_INT,   offsetof(ASSStyle, border_style)   },
+                 {"Outline",         ASS_FLT,   offsetof(ASSStyle, outline)        },
+                 {"Shadow",          ASS_FLT,   offsetof(ASSStyle, shadow)         },
+                 {"Alignment",       ASS_ALGN,  offsetof(ASSStyle, alignment)      },
+                 {"MarginL",         ASS_INT,   offsetof(ASSStyle, margin_l)       },
+                 {"MarginR",         ASS_INT,   offsetof(ASSStyle, margin_r)       },
+                 {"MarginV",         ASS_INT,   offsetof(ASSStyle, margin_v)       },
+                 {"AlphaLevel",      ASS_INT,   offsetof(ASSStyle, alpha_level)    },
+                 {"Encoding",        ASS_INT,   offsetof(ASSStyle, encoding)       },
+                 {0},
+        }
+    },
+    { .section       = "Events",
+      .format_header = "Format",
+      .fields_header = "Dialogue",
+      .size          = sizeof(ASSDialog),
+      .offset        = offsetof(ASS, dialogs),
+      .offset_count  = offsetof(ASS, dialogs_count),
+      .fields = {{"Layer",   ASS_INT,        offsetof(ASSDialog, layer)   },
+                 {"Start",   ASS_TIMESTAMP,  offsetof(ASSDialog, start)   },
+                 {"End",     ASS_TIMESTAMP,  offsetof(ASSDialog, end)     },
+                 {"Style",   ASS_STR,        offsetof(ASSDialog, style)   },
+                 {"Name",    ASS_STR,        offsetof(ASSDialog, name)    },
+                 {"MarginL", ASS_INT,        offsetof(ASSDialog, margin_l)},
+                 {"MarginR", ASS_INT,        offsetof(ASSDialog, margin_r)},
+                 {"MarginV", ASS_INT,        offsetof(ASSDialog, margin_v)},
+                 {"Effect",  ASS_STR,        offsetof(ASSDialog, effect)  },
+                 {"Text",    ASS_STR,        offsetof(ASSDialog, text)    },
+                 {0},
+        }
+    },
+};
+
+
+typedef int (*ASSConvertFunc)(void *dest, const char *buf, int len);
+
+static int convert_str(void *dest, const char *buf, int len)
+{
+    char *str = av_malloc(len + 1);
+    if (str) {
+        memcpy(str, buf, len);
+        str[len] = 0;
+        if (*(void **)dest)
+            av_free(*(void **)dest);
+        *(char **)dest = str;
+    }
+    return !str;
+}
+static int convert_int(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "%d", (int *)dest) == 1;
+}
+static int convert_flt(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "%f", (float *)dest) == 1;
+}
+static int convert_color(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "&H%8x", (int *)dest) == 1 ||
+           sscanf(buf, "%d",    (int *)dest) == 1;
+}
+static int convert_timestamp(void *dest, const char *buf, int len)
+{
+    int c, h, m, s, cs;
+    if ((c = sscanf(buf, "%d:%02d:%02d.%02d", &h, &m, &s, &cs)) == 4)
+        *(int *)dest = 360000*h + 6000*m + 100*s + cs;
+    return c == 4;
+}
+static int convert_alignment(void *dest, const char *buf, int len)
+{
+    int a;
+    if (sscanf(buf, "%d", &a) == 1) {
+        /* convert V4 Style alignment to V4+ Style */
+        *(int *)dest = a + ((a&4) >> 1) - 5*!!(a&8);
+        return 1;
+    }
+    return 0;
+}
+
+static const ASSConvertFunc convert_func[] = {
+    [ASS_STR]       = convert_str,
+    [ASS_INT]       = convert_int,
+    [ASS_FLT]       = convert_flt,
+    [ASS_COLOR]     = convert_color,
+    [ASS_TIMESTAMP] = convert_timestamp,
+    [ASS_ALGN]      = convert_alignment,
+};
+
+
+struct ASSSplitContext {
+    ASS ass;
+    int current_section;
+    int field_number[FF_ARRAY_ELEMS(ass_sections)];
+    int *field_order[FF_ARRAY_ELEMS(ass_sections)];
+};
+
+
+static uint8_t *realloc_section_array(ASSSplitContext *ctx)
+{
+    const ASSSection *section = &ass_sections[ctx->current_section];
+    int *count = (int *)((uint8_t *)&ctx->ass + section->offset_count);
+    void **section_ptr = (void **)((uint8_t *)&ctx->ass + section->offset);
+    uint8_t *tmp = av_realloc_array(*section_ptr, (*count+1), section->size);
+    if (!tmp)
+        return NULL;
+    *section_ptr = tmp;
+    tmp += *count * section->size;
+    memset(tmp, 0, section->size);
+    (*count)++;
+    return tmp;
+}
+
+static inline int is_eol(char buf)
+{
+    return buf == '\r' || buf == '\n' || buf == 0;
+}
+
+static inline const char *skip_space(const char *buf)
+{
+    while (*buf == ' ')
+        buf++;
+    return buf;
+}
+
+static int *get_default_field_orders(const ASSSection *section, int *number)
+{
+    int i;
+    int *order = av_malloc_array(FF_ARRAY_ELEMS(section->fields), sizeof(*order));
+
+    if (!order)
+        return NULL;
+    for (i = 0; section->fields[i].name; i++)
+        order[i] = i;
+    *number = i;
+    while (i < FF_ARRAY_ELEMS(section->fields))
+        order[i++] = -1;
+    return order;
+}
+
+static const char *ass_split_section(ASSSplitContext *ctx, const char *buf)
+{
+    const ASSSection *section = &ass_sections[ctx->current_section];
+    int *number = &ctx->field_number[ctx->current_section];
+    int *order = ctx->field_order[ctx->current_section];
+    int i, len;
+
+    while (buf && *buf) {
+        if (buf[0] == '[') {
+            ctx->current_section = -1;
+            break;
+        }
+        if (buf[0] == ';' || (buf[0] == '!' && buf[1] == ':'))
+            goto next_line; // skip comments
+
+        len = strcspn(buf, ":\r\n");
+        if (buf[len] == ':' &&
+            (!section->fields_header || strncmp(buf, section->fields_header, len))) {
+            for (i = 0; i < FF_ARRAY_ELEMS(ass_sections); i++) {
+                if (ass_sections[i].fields_header &&
+                    !strncmp(buf, ass_sections[i].fields_header, len)) {
+                    ctx->current_section = i;
+                    section = &ass_sections[ctx->current_section];
+                    number = &ctx->field_number[ctx->current_section];
+                    order = ctx->field_order[ctx->current_section];
+                    break;
+                }
+            }
+        }
+        if (section->format_header && !order) {
+            len = strlen(section->format_header);
+            if (!strncmp(buf, section->format_header, len) && buf[len] == ':') {
+                buf += len + 1;
+                while (!is_eol(*buf)) {
+                    buf = skip_space(buf);
+                    len = strcspn(buf, ", \r\n");
+                    if (av_reallocp_array(&order, (*number + 1), sizeof(*order)) != 0)
+                        return NULL;
+
+                    order[*number] = -1;
+                    for (i=0; section->fields[i].name; i++)
+                        if (!strncmp(buf, section->fields[i].name, len)) {
+                            order[*number] = i;
+                            break;
+                        }
+                    (*number)++;
+                    buf = skip_space(buf + len + (buf[len] == ','));
+                }
+                ctx->field_order[ctx->current_section] = order;
+                goto next_line;
+            }
+        }
+        if (section->fields_header) {
+            len = strlen(section->fields_header);
+            if (!strncmp(buf, section->fields_header, len) && buf[len] == ':') {
+                uint8_t *ptr, *struct_ptr = realloc_section_array(ctx);
+                if (!struct_ptr)  return NULL;
+
+                /* No format header line found so far, assume default */
+                if (!order) {
+                    order = get_default_field_orders(section, number);
+                    if (!order)
+                        return NULL;
+                    ctx->field_order[ctx->current_section] = order;
+                }
+
+                buf += len + 1;
+                for (i=0; !is_eol(*buf) && i < *number; i++) {
+                    int last = i == *number - 1;
+                    buf = skip_space(buf);
+                    len = strcspn(buf, last ? "\r\n" : ",\r\n");
+                    if (order[i] >= 0) {
+                        ASSFieldType type = section->fields[order[i]].type;
+                        ptr = struct_ptr + section->fields[order[i]].offset;
+                        convert_func[type](ptr, buf, len);
+                    }
+                    buf += len;
+                    if (!last && *buf) buf++;
+                    buf = skip_space(buf);
+                }
+            }
+        } else {
+            len = strcspn(buf, ":\r\n");
+            if (buf[len] == ':') {
+                for (i=0; section->fields[i].name; i++)
+                    if (!strncmp(buf, section->fields[i].name, len)) {
+                        ASSFieldType type = section->fields[i].type;
+                        uint8_t *ptr = (uint8_t *)&ctx->ass + section->offset;
+                        ptr += section->fields[i].offset;
+                        buf = skip_space(buf + len + 1);
+                        convert_func[type](ptr, buf, strcspn(buf, "\r\n"));
+                        break;
+                    }
+            }
+        }
+next_line:
+        buf += strcspn(buf, "\n");
+        buf += !!*buf;
+    }
+    return buf;
+}
+
+static int ass_split(ASSSplitContext *ctx, const char *buf)
+{
+    char c, section[16];
+    int i;
+
+    if (ctx->current_section >= 0)
+        buf = ass_split_section(ctx, buf);
+
+    while (buf && *buf) {
+        if (sscanf(buf, "[%15[0-9A-Za-z+ ]]%c", section, &c) == 2) {
+            buf += strcspn(buf, "\n");
+            buf += !!*buf;
+            for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++)
+                if (!strcmp(section, ass_sections[i].section)) {
+                    ctx->current_section = i;
+                    buf = ass_split_section(ctx, buf);
+                }
+        } else {
+            buf += strcspn(buf, "\n");
+            buf += !!*buf;
+        }
+    }
+    return buf ? 0 : AVERROR_INVALIDDATA;
+}
+
+ASSSplitContext *ff_ass_split(const char *buf)
+{
+    ASSSplitContext *ctx = av_mallocz(sizeof(*ctx));
+    if (!ctx)
+        return NULL;
+    ctx->current_section = -1;
+    if (ass_split(ctx, buf) < 0) {
+        ff_ass_split_free(ctx);
+        return NULL;
+    }
+    return ctx;
+}
+
+static void free_section(ASSSplitContext *ctx, const ASSSection *section)
+{
+    uint8_t *ptr = (uint8_t *)&ctx->ass + section->offset;
+    int i, j, *count, c = 1;
+
+    if (section->format_header) {
+        ptr   = *(void **)ptr;
+        count = (int *)((uint8_t *)&ctx->ass + section->offset_count);
+    } else
+        count = &c;
+
+    if (ptr)
+        for (i=0; i<*count; i++, ptr += section->size)
+            for (j=0; section->fields[j].name; j++) {
+                const ASSFields *field = &section->fields[j];
+                if (field->type == ASS_STR)
+                    av_freep(ptr + field->offset);
+            }
+    *count = 0;
+
+    if (section->format_header)
+        av_freep((uint8_t *)&ctx->ass + section->offset);
+}
+
+ASSDialog *ff_ass_split_dialog(ASSSplitContext *ctx, const char *buf,
+                               int cache, int *number)
+{
+    ASSDialog *dialog = NULL;
+    int i, count;
+    if (!cache)
+        for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++)
+            if (!strcmp(ass_sections[i].section, "Events")) {
+                free_section(ctx, &ass_sections[i]);
+                break;
+            }
+    count = ctx->ass.dialogs_count;
+    if (ass_split(ctx, buf) == 0)
+        dialog = ctx->ass.dialogs + count;
+    if (number)
+        *number = ctx->ass.dialogs_count - count;
+    return dialog;
+}
+
+void ff_ass_free_dialog(ASSDialog **dialogp)
+{
+    ASSDialog *dialog = *dialogp;
+    if (!dialog)
+        return;
+    av_freep(&dialog->style);
+    av_freep(&dialog->name);
+    av_freep(&dialog->effect);
+    av_freep(&dialog->text);
+    av_freep(dialogp);
+}
+
+ASSDialog *ff_ass_split_dialog2(ASSSplitContext *ctx, const char *buf)
+{
+    int i;
+    static const ASSFields fields[] = {
+        {"ReadOrder", ASS_INT, offsetof(ASSDialog, readorder)},
+        {"Layer",     ASS_INT, offsetof(ASSDialog, layer)    },
+        {"Style",     ASS_STR, offsetof(ASSDialog, style)    },
+        {"Name",      ASS_STR, offsetof(ASSDialog, name)     },
+        {"MarginL",   ASS_INT, offsetof(ASSDialog, margin_l) },
+        {"MarginR",   ASS_INT, offsetof(ASSDialog, margin_r) },
+        {"MarginV",   ASS_INT, offsetof(ASSDialog, margin_v) },
+        {"Effect",    ASS_STR, offsetof(ASSDialog, effect)   },
+        {"Text",      ASS_STR, offsetof(ASSDialog, text)     },
+    };
+
+    ASSDialog *dialog = av_mallocz(sizeof(*dialog));
+    if (!dialog)
+        return NULL;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(fields); i++) {
+        size_t len;
+        const int last = i == FF_ARRAY_ELEMS(fields) - 1;
+        const ASSFieldType type = fields[i].type;
+        uint8_t *ptr = (uint8_t *)dialog + fields[i].offset;
+        buf = skip_space(buf);
+        len = last ? strlen(buf) : strcspn(buf, ",");
+        if (len >= INT_MAX) {
+            ff_ass_free_dialog(&dialog);
+            return NULL;
+        }
+        convert_func[type](ptr, buf, len);
+        buf += len;
+        if (*buf) buf++;
+    }
+    return dialog;
+}
+
+void ff_ass_split_free(ASSSplitContext *ctx)
+{
+    if (ctx) {
+        int i;
+        for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++) {
+            free_section(ctx, &ass_sections[i]);
+            av_freep(&(ctx->field_order[i]));
+        }
+        av_free(ctx);
+    }
+}
+
+
+int ff_ass_split_override_codes(const ASSCodesCallbacks *callbacks, void *priv,
+                                const char *buf)
+{
+    const char *text = NULL;
+    char new_line[2];
+    int text_len = 0;
+
+    while (buf && *buf) {
+        if (text && callbacks->text &&
+            (sscanf(buf, "\\%1[nN]", new_line) == 1 ||
+             !strncmp(buf, "{\\", 2))) {
+            callbacks->text(priv, text, text_len);
+            text = NULL;
+        }
+        if (sscanf(buf, "\\%1[nN]", new_line) == 1) {
+            if (callbacks->new_line)
+                callbacks->new_line(priv, new_line[0] == 'N');
+            buf += 2;
+        } else if (!strncmp(buf, "{\\", 2)) {
+            buf++;
+            while (*buf == '\\') {
+                char style[2], c[2], sep[2], c_num[2] = "0", tmp[128] = {0};
+                unsigned int color = 0xFFFFFFFF;
+                int len, size = -1, an = -1, alpha = -1;
+                int x1, y1, x2, y2, t1 = -1, t2 = -1;
+                if (sscanf(buf, "\\%1[bisu]%1[01\\}]%n", style, c, &len) > 1) {
+                    int close = c[0] == '0' ? 1 : c[0] == '1' ? 0 : -1;
+                    len += close != -1;
+                    if (callbacks->style)
+                        callbacks->style(priv, style[0], close);
+                } else if (sscanf(buf, "\\c%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\c&H%X&%1[\\}]%n", &color, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]c%1[\\}]%n", c_num, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]c&H%X&%1[\\}]%n", c_num, &color, sep, &len) > 2) {
+                    if (callbacks->color)
+                        callbacks->color(priv, color, c_num[0] - '0');
+                } else if (sscanf(buf, "\\alpha%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\alpha&H%2X&%1[\\}]%n", &alpha, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]a%1[\\}]%n", c_num, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]a&H%2X&%1[\\}]%n", c_num, &alpha, sep, &len) > 2) {
+                    if (callbacks->alpha)
+                        callbacks->alpha(priv, alpha, c_num[0] - '0');
+                } else if (sscanf(buf, "\\fn%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\fn%127[^\\}]%1[\\}]%n", tmp, sep, &len) > 1) {
+                    if (callbacks->font_name)
+                        callbacks->font_name(priv, tmp[0] ? tmp : NULL);
+                } else if (sscanf(buf, "\\fs%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\fs%u%1[\\}]%n", &size, sep, &len) > 1) {
+                    if (callbacks->font_size)
+                        callbacks->font_size(priv, size);
+                } else if (sscanf(buf, "\\a%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\a%2u%1[\\}]%n", &an, sep, &len) > 1 ||
+                           sscanf(buf, "\\an%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\an%1u%1[\\}]%n", &an, sep, &len) > 1) {
+                    if (an != -1 && buf[2] != 'n')
+                        an = (an&3) + (an&4 ? 6 : an&8 ? 3 : 0);
+                    if (callbacks->alignment)
+                        callbacks->alignment(priv, an);
+                } else if (sscanf(buf, "\\r%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\r%127[^\\}]%1[\\}]%n", tmp, sep, &len) > 1) {
+                    if (callbacks->cancel_overrides)
+                        callbacks->cancel_overrides(priv, tmp);
+                } else if (sscanf(buf, "\\move(%d,%d,%d,%d)%1[\\}]%n", &x1, &y1, &x2, &y2, sep, &len) > 4 ||
+                           sscanf(buf, "\\move(%d,%d,%d,%d,%d,%d)%1[\\}]%n", &x1, &y1, &x2, &y2, &t1, &t2, sep, &len) > 6) {
+                    if (callbacks->move)
+                        callbacks->move(priv, x1, y1, x2, y2, t1, t2);
+                } else if (sscanf(buf, "\\pos(%d,%d)%1[\\}]%n", &x1, &y1, sep, &len) > 2) {
+                    if (callbacks->move)
+                        callbacks->move(priv, x1, y1, x1, y1, -1, -1);
+                } else if (sscanf(buf, "\\org(%d,%d)%1[\\}]%n", &x1, &y1, sep, &len) > 2) {
+                    if (callbacks->origin)
+                        callbacks->origin(priv, x1, y1);
+                } else {
+                    len = strcspn(buf+1, "\\}") + 2;  /* skip unknown code */
+                }
+                buf += len - 1;
+            }
+            if (*buf++ != '}')
+                return AVERROR_INVALIDDATA;
+        } else {
+            if (!text) {
+                text = buf;
+                text_len = 1;
+            } else
+                text_len++;
+            buf++;
+        }
+    }
+    if (text && callbacks->text)
+        callbacks->text(priv, text, text_len);
+    if (callbacks->end)
+        callbacks->end(priv);
+    return 0;
+}
+
+ASSStyle *ff_ass_style_get(ASSSplitContext *ctx, const char *style)
+{
+    ASS *ass = &ctx->ass;
+    int i;
+
+    if (!style || !*style)
+        style = "Default";
+    for (i=0; i<ass->styles_count; i++)
+        if (ass->styles[i].name && !strcmp(ass->styles[i].name, style))
+            return ass->styles + i;
+    return NULL;
+}
diff --git a/libavcodec/ass_split.h b/libavcodec/ass_split.h
new file mode 100644
index 0000000..30ce772
--- /dev/null
+++ b/libavcodec/ass_split.h
@@ -0,0 +1,207 @@
+/*
+ * SSA/ASS spliting functions
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ASS_SPLIT_H
+#define AVCODEC_ASS_SPLIT_H
+
+/**
+ * fields extracted from the [Script Info] section
+ */
+typedef struct {
+    char *script_type;    /**< SSA script format version (eg. v4.00) */
+    char *collisions;     /**< how subtitles are moved to prevent collisions */
+    int   play_res_x;     /**< video width that ASS coords are referring to */
+    int   play_res_y;     /**< video height that ASS coords are referring to */
+    float timer;          /**< time multiplier to apply to SSA clock (in %) */
+} ASSScriptInfo;
+
+/**
+ * fields extracted from the [V4(+) Styles] section
+ */
+typedef struct {
+    char *name;           /**< name of the tyle (case sensitive) */
+    char *font_name;      /**< font face (case sensitive) */
+    int   font_size;      /**< font height */
+    int   primary_color;  /**< color that a subtitle will normally appear in */
+    int   secondary_color;
+    int   outline_color;  /**< color for outline in ASS, called tertiary in SSA */
+    int   back_color;     /**< color of the subtitle outline or shadow */
+    int   bold;           /**< whether text is bold (1) or not (0) */
+    int   italic;         /**< whether text is italic (1) or not (0) */
+    int   underline;      /**< whether text is underlined (1) or not (0) */
+    int   strikeout;
+    float scalex;
+    float scaley;
+    float spacing;
+    float angle;
+    int   border_style;
+    float outline;
+    float shadow;
+    int   alignment;      /**< position of the text (left, center, top...),
+                               defined after the layout of the numpad
+                               (1-3 sub, 4-6 mid, 7-9 top) */
+    int   margin_l;
+    int   margin_r;
+    int   margin_v;
+    int   alpha_level;
+    int   encoding;
+} ASSStyle;
+
+/**
+ * fields extracted from the [Events] section
+ */
+typedef struct {
+    int   readorder;
+    int   layer;    /**< higher numbered layers are drawn over lower numbered */
+    int   start;    /**< start time of the dialog in centiseconds */
+    int   end;      /**< end time of the dialog in centiseconds */
+    char *style;    /**< name of the ASSStyle to use with this dialog */
+    char *name;
+    int   margin_l;
+    int   margin_r;
+    int   margin_v;
+    char *effect;
+    char *text;     /**< actual text which will be displayed as a subtitle,
+                         can include style override control codes (see
+                         ff_ass_split_override_codes()) */
+} ASSDialog;
+
+/**
+ * structure containing the whole split ASS data
+ */
+typedef struct {
+    ASSScriptInfo script_info;   /**< general information about the SSA script*/
+    ASSStyle     *styles;        /**< array of split out styles */
+    int           styles_count;  /**< number of ASSStyle in the styles array */
+    ASSDialog    *dialogs;       /**< array of split out dialogs */
+    int           dialogs_count; /**< number of ASSDialog in the dialogs array*/
+} ASS;
+
+/**
+ * This struct can be casted to ASS to access to the split data.
+ */
+typedef struct ASSSplitContext ASSSplitContext;
+
+/**
+ * Split a full ASS file or a ASS header from a string buffer and store
+ * the split structure in a newly allocated context.
+ *
+ * @param buf String containing the ASS formatted data.
+ * @return Newly allocated struct containing split data.
+ */
+ASSSplitContext *ff_ass_split(const char *buf);
+
+/**
+ * Split one or several ASS "Dialogue" lines from a string buffer and store
+ * them in an already initialized context.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param buf String containing the ASS "Dialogue" lines.
+ * @param cache Set to 1 to keep all the previously split ASSDialog in
+ *              the context, or set to 0 to free all the previously split
+ *              ASSDialog.
+ * @param number If not NULL, the pointed integer will be set to the number
+ *               of split ASSDialog.
+ * @return Pointer to the first split ASSDialog.
+ */
+ASSDialog *ff_ass_split_dialog(ASSSplitContext *ctx, const char *buf,
+                               int cache, int *number);
+
+/**
+ * Free a dialogue obtained from ff_ass_split_dialog2().
+ */
+void ff_ass_free_dialog(ASSDialog **dialogp);
+
+/**
+ * Split one ASS Dialogue line from a string buffer.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param buf String containing the ASS "Dialogue" line.
+ * @return Pointer to the split ASSDialog. Must be freed with ff_ass_free_dialog()
+ */
+ASSDialog *ff_ass_split_dialog2(ASSSplitContext *ctx, const char *buf);
+
+/**
+ * Free all the memory allocated for an ASSSplitContext.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ */
+void ff_ass_split_free(ASSSplitContext *ctx);
+
+
+/**
+ * Set of callback functions corresponding to each override codes that can
+ * be encountered in a "Dialogue" Text field.
+ */
+typedef struct {
+    /**
+     * @defgroup ass_styles    ASS styles
+     * @{
+     */
+    void (*text)(void *priv, const char *text, int len);
+    void (*new_line)(void *priv, int forced);
+    void (*style)(void *priv, char style, int close);
+    void (*color)(void *priv, unsigned int /* color */, unsigned int color_id);
+    void (*alpha)(void *priv, int alpha, int alpha_id);
+    void (*font_name)(void *priv, const char *name);
+    void (*font_size)(void *priv, int size);
+    void (*alignment)(void *priv, int alignment);
+    void (*cancel_overrides)(void *priv, const char *style);
+    /** @} */
+
+    /**
+     * @defgroup ass_functions    ASS functions
+     * @{
+     */
+    void (*move)(void *priv, int x1, int y1, int x2, int y2, int t1, int t2);
+    void (*origin)(void *priv, int x, int y);
+    /** @} */
+
+    /**
+     * @defgroup ass_end    end of Dialogue Event
+     * @{
+     */
+    void (*end)(void *priv);
+    /** @} */
+} ASSCodesCallbacks;
+
+/**
+ * Split override codes out of a ASS "Dialogue" Text field.
+ *
+ * @param callbacks Set of callback functions called for each override code
+ *                  encountered.
+ * @param priv Opaque pointer passed to the callback functions.
+ * @param buf The ASS "Dialogue" Text field to split.
+ * @return >= 0 on success otherwise an error code <0
+ */
+int ff_ass_split_override_codes(const ASSCodesCallbacks *callbacks, void *priv,
+                                const char *buf);
+
+/**
+ * Find an ASSStyle structure by its name.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param style name of the style to search for.
+ * @return the ASSStyle corresponding to style, or NULL if style can't be found
+ */
+ASSStyle *ff_ass_style_get(ASSSplitContext *ctx, const char *style);
+
+#endif /* AVCODEC_ASS_SPLIT_H */
diff --git a/libavcodec/assdec.c b/libavcodec/assdec.c
index 48fe32e..3178f29 100644
--- a/libavcodec/assdec.c
+++ b/libavcodec/assdec.c
@@ -2,20 +2,20 @@
  * SSA/ASS decoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,11 @@
 
 static av_cold int ass_decode_init(AVCodecContext *avctx)
 {
-    avctx->subtitle_header = av_malloc(avctx->extradata_size);
+    avctx->subtitle_header = av_malloc(avctx->extradata_size + 1);
     if (!avctx->subtitle_header)
         return AVERROR(ENOMEM);
     memcpy(avctx->subtitle_header, avctx->extradata, avctx->extradata_size);
+    avctx->subtitle_header[avctx->extradata_size] = 0;
     avctx->subtitle_header_size = avctx->extradata_size;
     return 0;
 }
@@ -39,28 +40,44 @@ static av_cold int ass_decode_init(AVCodecContext *avctx)
 static int ass_decode_frame(AVCodecContext *avctx, void *data, int *got_sub_ptr,
                             AVPacket *avpkt)
 {
-    const char *ptr = avpkt->data;
-    int len, size = avpkt->size;
+    AVSubtitle *sub = data;
 
-    ff_ass_init(data);
+    if (avpkt->size <= 0)
+        return avpkt->size;
 
-    while (size > 0) {
-        len = ff_ass_add_rect(data, ptr, 0, 0/* FIXME: duration */, 1);
-        if (len < 0)
-            return len;
-        ptr  += len;
-        size -= len;
-    }
-
-    *got_sub_ptr = avpkt->size > 0;
+    sub->rects = av_malloc(sizeof(*sub->rects));
+    if (!sub->rects)
+        return AVERROR(ENOMEM);
+    sub->rects[0] = av_mallocz(sizeof(*sub->rects[0]));
+    if (!sub->rects[0])
+        return AVERROR(ENOMEM);
+    sub->num_rects = 1;
+    sub->rects[0]->type = SUBTITLE_ASS;
+    sub->rects[0]->ass  = av_strdup(avpkt->data);
+    if (!sub->rects[0]->ass)
+        return AVERROR(ENOMEM);
+    *got_sub_ptr = 1;
     return avpkt->size;
 }
 
+#if CONFIG_SSA_DECODER
+AVCodec ff_ssa_decoder = {
+    .name         = "ssa",
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_ASS,
+    .init         = ass_decode_init,
+    .decode       = ass_decode_frame,
+};
+#endif
+
+#if CONFIG_ASS_DECODER
 AVCodec ff_ass_decoder = {
     .name         = "ass",
-    .long_name    = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SSA,
+    .id           = AV_CODEC_ID_ASS,
     .init         = ass_decode_init,
     .decode       = ass_decode_frame,
 };
+#endif
diff --git a/libavcodec/assenc.c b/libavcodec/assenc.c
index caf266e..e54c1d8 100644
--- a/libavcodec/assenc.c
+++ b/libavcodec/assenc.c
@@ -2,37 +2,43 @@
  * SSA/ASS encoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
 #include "avcodec.h"
+#include "ass.h"
 #include "libavutil/avstring.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
+typedef struct {
+    int id; ///< current event id, ReadOrder field
+} ASSEncodeContext;
+
 static av_cold int ass_encode_init(AVCodecContext *avctx)
 {
-    avctx->extradata = av_malloc(avctx->subtitle_header_size);
+    avctx->extradata = av_malloc(avctx->subtitle_header_size + 1);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
     memcpy(avctx->extradata, avctx->subtitle_header, avctx->subtitle_header_size);
     avctx->extradata_size = avctx->subtitle_header_size;
+    avctx->extradata[avctx->extradata_size] = 0;
     return 0;
 }
 
@@ -40,19 +46,54 @@ static int ass_encode_frame(AVCodecContext *avctx,
                             unsigned char *buf, int bufsize,
                             const AVSubtitle *sub)
 {
+    ASSEncodeContext *s = avctx->priv_data;
     int i, len, total_len = 0;
 
     for (i=0; i<sub->num_rects; i++) {
+        char ass_line[2048];
+        const char *ass = sub->rects[i]->ass;
+        long int layer;
+        char *p;
+
         if (sub->rects[i]->type != SUBTITLE_ASS) {
             av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
-            return -1;
+            return AVERROR(EINVAL);
+        }
+
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            if (i > 0) {
+                av_log(avctx, AV_LOG_ERROR, "ASS encoder supports only one "
+                       "ASS rectangle field.\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            ass += 10; // skip "Dialogue: "
+            /* parse Layer field. If it's a Marked field, the content
+             * will be "Marked=N" instead of the layer num, so we will
+             * have layer=0, which is fine. */
+            layer = strtol(ass, &p, 10);
+
+#define SKIP_ENTRY(ptr) do {        \
+    char *sep = strchr(ptr, ',');   \
+    if (sep)                        \
+        ptr = sep + 1;              \
+} while (0)
+
+            SKIP_ENTRY(p); // skip layer or marked
+            SKIP_ENTRY(p); // skip start timestamp
+            SKIP_ENTRY(p); // skip end timestamp
+            snprintf(ass_line, sizeof(ass_line), "%d,%ld,%s", ++s->id, layer, p);
+            ass_line[strcspn(ass_line, "\r\n")] = 0;
+            ass = ass_line;
         }
+#endif
 
-        len = av_strlcpy(buf+total_len, sub->rects[i]->ass, bufsize-total_len);
+        len = av_strlcpy(buf+total_len, ass, bufsize-total_len);
 
         if (len > bufsize-total_len-1) {
             av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
 
         total_len += len;
@@ -61,11 +102,26 @@ static int ass_encode_frame(AVCodecContext *avctx,
     return total_len;
 }
 
+#if CONFIG_SSA_ENCODER
+AVCodec ff_ssa_encoder = {
+    .name         = "ssa",
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_ASS,
+    .init         = ass_encode_init,
+    .encode_sub   = ass_encode_frame,
+    .priv_data_size = sizeof(ASSEncodeContext),
+};
+#endif
+
+#if CONFIG_ASS_ENCODER
 AVCodec ff_ass_encoder = {
     .name         = "ass",
-    .long_name    = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SSA,
+    .id           = AV_CODEC_ID_ASS,
     .init         = ass_encode_init,
     .encode_sub   = ass_encode_frame,
+    .priv_data_size = sizeof(ASSEncodeContext),
 };
+#endif
diff --git a/libavcodec/asv.c b/libavcodec/asv.c
index b9e93f7..14fdf73 100644
--- a/libavcodec/asv.c
+++ b/libavcodec/asv.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index 7c4e4fd..a1366b6 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,11 +31,11 @@
 #include "libavutil/mem.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
 #include "bswapdsp.h"
 #include "fdctdsp.h"
 #include "idctdsp.h"
+#include "get_bits.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
 
@@ -47,14 +47,14 @@ typedef struct ASV1Context {
     IDCTDSPContext idsp;
     PixblockDSPContext pdsp;
     PutBitContext pb;
-    BitstreamContext bc;
+    GetBitContext gb;
     ScanTable scantable;
     int inv_qscale;
     int mb_width;
     int mb_height;
     int mb_width2;
     int mb_height2;
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(32, int16_t, block)[6][64];
     uint16_t intra_matrix[64];
     int q_intra_matrix[64];
     uint8_t *bitstream_buffer;
diff --git a/libavcodec/asvdec.c b/libavcodec/asvdec.c
index 7aa16f0..9a11446 100644
--- a/libavcodec/asvdec.c
+++ b/libavcodec/asvdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,6 @@
 #include "asv.h"
 #include "avcodec.h"
 #include "blockdsp.h"
-#include "put_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -70,27 +69,27 @@ static av_cold void init_vlcs(ASV1Context *a)
 }
 
 // FIXME write a reversed bitstream reader to avoid the double reverse
-static inline int asv2_get_bits(BitstreamContext *bc, int n)
+static inline int asv2_get_bits(GetBitContext *gb, int n)
 {
-    return ff_reverse[bitstream_read(bc, n) << (8 - n)];
+    return ff_reverse[get_bits(gb, n) << (8 - n)];
 }
 
-static inline int asv1_get_level(BitstreamContext *bc)
+static inline int asv1_get_level(GetBitContext *gb)
 {
-    int code = bitstream_read_vlc(bc, level_vlc.table, VLC_BITS, 1);
+    int code = get_vlc2(gb, level_vlc.table, VLC_BITS, 1);
 
     if (code == 3)
-        return bitstream_read_signed(bc, 8);
+        return get_sbits(gb, 8);
     else
         return code - 3;
 }
 
-static inline int asv2_get_level(BitstreamContext *bc)
+static inline int asv2_get_level(GetBitContext *gb)
 {
-    int code = bitstream_read_vlc(bc, asv2_level_vlc.table, ASV2_LEVEL_VLC_BITS, 1);
+    int code = get_vlc2(gb, asv2_level_vlc.table, ASV2_LEVEL_VLC_BITS, 1);
 
     if (code == 31)
-        return (int8_t) asv2_get_bits(bc, 8);
+        return (int8_t) asv2_get_bits(gb, 8);
     else
         return code - 31;
 }
@@ -99,10 +98,10 @@ static inline int asv1_decode_block(ASV1Context *a, int16_t block[64])
 {
     int i;
 
-    block[0] = 8 * bitstream_read(&a->bc, 8);
+    block[0] = 8 * get_bits(&a->gb, 8);
 
     for (i = 0; i < 11; i++) {
-        const int ccp = bitstream_read_vlc(&a->bc, ccp_vlc.table, VLC_BITS, 1);
+        const int ccp = get_vlc2(&a->gb, ccp_vlc.table, VLC_BITS, 1);
 
         if (ccp) {
             if (ccp == 16)
@@ -113,13 +112,13 @@ static inline int asv1_decode_block(ASV1Context *a, int16_t block[64])
             }
 
             if (ccp & 8)
-                block[a->scantable.permutated[4 * i + 0]] = (asv1_get_level(&a->bc) * a->intra_matrix[4 * i + 0]) >> 4;
+                block[a->scantable.permutated[4 * i + 0]] = (asv1_get_level(&a->gb) * a->intra_matrix[4 * i + 0]) >> 4;
             if (ccp & 4)
-                block[a->scantable.permutated[4 * i + 1]] = (asv1_get_level(&a->bc) * a->intra_matrix[4 * i + 1]) >> 4;
+                block[a->scantable.permutated[4 * i + 1]] = (asv1_get_level(&a->gb) * a->intra_matrix[4 * i + 1]) >> 4;
             if (ccp & 2)
-                block[a->scantable.permutated[4 * i + 2]] = (asv1_get_level(&a->bc) * a->intra_matrix[4 * i + 2]) >> 4;
+                block[a->scantable.permutated[4 * i + 2]] = (asv1_get_level(&a->gb) * a->intra_matrix[4 * i + 2]) >> 4;
             if (ccp & 1)
-                block[a->scantable.permutated[4 * i + 3]] = (asv1_get_level(&a->bc) * a->intra_matrix[4 * i + 3]) >> 4;
+                block[a->scantable.permutated[4 * i + 3]] = (asv1_get_level(&a->gb) * a->intra_matrix[4 * i + 3]) >> 4;
         }
     }
 
@@ -130,32 +129,32 @@ static inline int asv2_decode_block(ASV1Context *a, int16_t block[64])
 {
     int i, count, ccp;
 
-    count = asv2_get_bits(&a->bc, 4);
+    count = asv2_get_bits(&a->gb, 4);
 
-    block[0] = 8 * asv2_get_bits(&a->bc, 8);
+    block[0] = 8 * asv2_get_bits(&a->gb, 8);
 
-    ccp = bitstream_read_vlc(&a->bc, dc_ccp_vlc.table, VLC_BITS, 1);
+    ccp = get_vlc2(&a->gb, dc_ccp_vlc.table, VLC_BITS, 1);
     if (ccp) {
         if (ccp & 4)
-            block[a->scantable.permutated[1]] = (asv2_get_level(&a->bc) * a->intra_matrix[1]) >> 4;
+            block[a->scantable.permutated[1]] = (asv2_get_level(&a->gb) * a->intra_matrix[1]) >> 4;
         if (ccp & 2)
-            block[a->scantable.permutated[2]] = (asv2_get_level(&a->bc) * a->intra_matrix[2]) >> 4;
+            block[a->scantable.permutated[2]] = (asv2_get_level(&a->gb) * a->intra_matrix[2]) >> 4;
         if (ccp & 1)
-            block[a->scantable.permutated[3]] = (asv2_get_level(&a->bc) * a->intra_matrix[3]) >> 4;
+            block[a->scantable.permutated[3]] = (asv2_get_level(&a->gb) * a->intra_matrix[3]) >> 4;
     }
 
     for (i = 1; i < count + 1; i++) {
-        const int ccp = bitstream_read_vlc(&a->bc, ac_ccp_vlc.table, VLC_BITS, 1);
+        const int ccp = get_vlc2(&a->gb, ac_ccp_vlc.table, VLC_BITS, 1);
 
         if (ccp) {
             if (ccp & 8)
-                block[a->scantable.permutated[4 * i + 0]] = (asv2_get_level(&a->bc) * a->intra_matrix[4 * i + 0]) >> 4;
+                block[a->scantable.permutated[4 * i + 0]] = (asv2_get_level(&a->gb) * a->intra_matrix[4 * i + 0]) >> 4;
             if (ccp & 4)
-                block[a->scantable.permutated[4 * i + 1]] = (asv2_get_level(&a->bc) * a->intra_matrix[4 * i + 1]) >> 4;
+                block[a->scantable.permutated[4 * i + 1]] = (asv2_get_level(&a->gb) * a->intra_matrix[4 * i + 1]) >> 4;
             if (ccp & 2)
-                block[a->scantable.permutated[4 * i + 2]] = (asv2_get_level(&a->bc) * a->intra_matrix[4 * i + 2]) >> 4;
+                block[a->scantable.permutated[4 * i + 2]] = (asv2_get_level(&a->gb) * a->intra_matrix[4 * i + 2]) >> 4;
             if (ccp & 1)
-                block[a->scantable.permutated[4 * i + 3]] = (asv2_get_level(&a->bc) * a->intra_matrix[4 * i + 3]) >> 4;
+                block[a->scantable.permutated[4 * i + 3]] = (asv2_get_level(&a->gb) * a->intra_matrix[4 * i + 3]) >> 4;
         }
     }
 
@@ -164,19 +163,19 @@ static inline int asv2_decode_block(ASV1Context *a, int16_t block[64])
 
 static inline int decode_mb(ASV1Context *a, int16_t block[6][64])
 {
-    int i;
+    int i, ret;
 
     a->bdsp.clear_blocks(block[0]);
 
     if (a->avctx->codec_id == AV_CODEC_ID_ASV1) {
         for (i = 0; i < 6; i++) {
-            if (asv1_decode_block(a, block[i]) < 0)
-                return -1;
+            if ((ret = asv1_decode_block(a, block[i])) < 0)
+                return ret;
         }
     } else {
         for (i = 0; i < 6; i++) {
-            if (asv2_decode_block(a, block[i]) < 0)
-                return -1;
+            if ((ret = asv2_decode_block(a, block[i])) < 0)
+                return ret;
         }
     }
     return 0;
@@ -211,10 +210,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *const p = data;
     int mb_x, mb_y, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (buf_size * 8LL < a->mb_height * a->mb_width * 13LL)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -232,7 +232,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             a->bitstream_buffer[i] = ff_reverse[buf[i]];
     }
 
-    bitstream_init8(&a->bc, a->bitstream_buffer, buf_size);
+    init_get_bits(&a->gb, a->bitstream_buffer, buf_size * 8);
 
     for (mb_y = 0; mb_y < a->mb_height2; mb_y++) {
         for (mb_x = 0; mb_x < a->mb_width2; mb_x++) {
@@ -267,7 +267,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     emms_c();
 
-    return (bitstream_tell(&a->bc) + 31) / 32 * 4;
+    return (get_bits_count(&a->gb) + 31) / 32 * 4;
 }
 
 static av_cold int decode_init(AVCodecContext *avctx)
@@ -277,19 +277,17 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int i;
 
     if (avctx->extradata_size < 1) {
-        av_log(avctx, AV_LOG_ERROR, "No extradata provided\n");
-        return AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_WARNING, "No extradata provided\n");
     }
 
     ff_asv_common_init(avctx);
-    ff_blockdsp_init(&a->bdsp);
+    ff_blockdsp_init(&a->bdsp, avctx);
     ff_idctdsp_init(&a->idsp, avctx);
     init_vlcs(a);
     ff_init_scantable(a->idsp.idct_permutation, &a->scantable, ff_asv_scantab);
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-    a->inv_qscale = avctx->extradata[0];
-    if (a->inv_qscale == 0) {
+    if (avctx->extradata_size < 1 || (a->inv_qscale = avctx->extradata[0]) == 0) {
         av_log(avctx, AV_LOG_ERROR, "illegal qscale 0\n");
         if (avctx->codec_id == AV_CODEC_ID_ASV1)
             a->inv_qscale = 6;
@@ -317,6 +315,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+#if CONFIG_ASV1_DECODER
 AVCodec ff_asv1_decoder = {
     .name           = "asv1",
     .long_name      = NULL_IF_CONFIG_SMALL("ASUS V1"),
@@ -328,7 +327,9 @@ AVCodec ff_asv1_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
 
+#if CONFIG_ASV2_DECODER
 AVCodec ff_asv2_decoder = {
     .name           = "asv2",
     .long_name      = NULL_IF_CONFIG_SMALL("ASUS V2"),
@@ -340,3 +341,4 @@ AVCodec ff_asv2_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index ac7c317..3cc94bf 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,10 @@
 #include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 
+#include "aandcttab.h"
 #include "asv.h"
 #include "avcodec.h"
+#include "dct.h"
 #include "fdctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -50,7 +52,7 @@ static inline void asv1_put_level(PutBitContext *pb, int level)
     }
 }
 
-static inline void asv2_put_level(PutBitContext *pb, int level)
+static inline void asv2_put_level(ASV1Context *a, PutBitContext *pb, int level)
 {
     unsigned int index = level + 31;
 
@@ -58,6 +60,10 @@ static inline void asv2_put_level(PutBitContext *pb, int level)
         put_bits(pb, ff_asv2_level_tab[index][1], ff_asv2_level_tab[index][0]);
     } else {
         put_bits(pb, ff_asv2_level_tab[31][1], ff_asv2_level_tab[31][0]);
+        if (level < -128 || level > 127) {
+            av_log(a->avctx, AV_LOG_WARNING, "Clipping level %d, increase qscale\n", level);
+            level = av_clip_int8(level);
+        }
         asv2_put_bits(pb, 8, level & 0xFF);
     }
 }
@@ -108,7 +114,7 @@ static inline void asv1_encode_block(ASV1Context *a, int16_t block[64])
     put_bits(&a->pb, ff_asv_ccp_tab[16][1], ff_asv_ccp_tab[16][0]);
 }
 
-static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
+static inline void asv2_encode_block(ASV1Context *a, int16_t block[64])
 {
     int i;
     int count = 0;
@@ -142,8 +148,7 @@ static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
                                  a->q_intra_matrix[index + 9] + (1 << 15)) >> 16))
             ccp |= 1;
 
-        if (!i && ccp >= 8)
-            return AVERROR_BUG;
+        av_assert2(i || ccp < 8);
         if (i)
             put_bits(&a->pb, ff_asv_ac_ccp_tab[ccp][1], ff_asv_ac_ccp_tab[ccp][0]);
         else
@@ -151,38 +156,31 @@ static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
 
         if (ccp) {
             if (ccp & 8)
-                asv2_put_level(&a->pb, block[index + 0]);
+                asv2_put_level(a, &a->pb, block[index + 0]);
             if (ccp & 4)
-                asv2_put_level(&a->pb, block[index + 8]);
+                asv2_put_level(a, &a->pb, block[index + 8]);
             if (ccp & 2)
-                asv2_put_level(&a->pb, block[index + 1]);
+                asv2_put_level(a, &a->pb, block[index + 1]);
             if (ccp & 1)
-                asv2_put_level(&a->pb, block[index + 9]);
+                asv2_put_level(a, &a->pb, block[index + 9]);
         }
     }
-
-    return 0;
 }
 
 #define MAX_MB_SIZE (30 * 16 * 16 * 3 / 2 / 8)
 
 static inline int encode_mb(ASV1Context *a, int16_t block[6][64])
 {
-    int i, ret;
+    int i;
 
-    if (a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb) >> 3) < MAX_MB_SIZE) {
-        av_log(a->avctx, AV_LOG_ERROR, "encoded frame too large\n");
-        return -1;
-    }
+    av_assert0(a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb) >> 3) >= MAX_MB_SIZE);
 
     if (a->avctx->codec_id == AV_CODEC_ID_ASV1) {
         for (i = 0; i < 6; i++)
             asv1_encode_block(a, block[i]);
     } else {
         for (i = 0; i < 6; i++) {
-            ret = asv2_encode_block(a, block[i]);
-            if (ret < 0)
-                return ret;
+            asv2_encode_block(a, block[i]);
         }
     }
     return 0;
@@ -221,13 +219,52 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int size, ret;
     int mb_x, mb_y;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, a->mb_height * a->mb_width * MAX_MB_SIZE +
-                             AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if (pict->width % 16 || pict->height % 16) {
+        AVFrame *clone = av_frame_alloc();
+        int i;
+
+        if (!clone)
+            return AVERROR(ENOMEM);
+        clone->format = pict->format;
+        clone->width  = FFALIGN(pict->width, 16);
+        clone->height = FFALIGN(pict->height, 16);
+        ret = av_frame_get_buffer(clone, 32);
+        if (ret < 0) {
+            av_frame_free(&clone);
+            return ret;
+        }
+
+        ret = av_frame_copy(clone, pict);
+        if (ret < 0) {
+            av_frame_free(&clone);
+            return ret;
+        }
+
+        for (i = 0; i<3; i++) {
+            int x, y;
+            int w  = AV_CEIL_RSHIFT(pict->width, !!i);
+            int h  = AV_CEIL_RSHIFT(pict->height, !!i);
+            int w2 = AV_CEIL_RSHIFT(clone->width, !!i);
+            int h2 = AV_CEIL_RSHIFT(clone->height, !!i);
+            for (y=0; y<h; y++)
+                for (x=w; x<w2; x++)
+                    clone->data[i][x + y*clone->linesize[i]] =
+                        clone->data[i][w - 1 + y*clone->linesize[i]];
+            for (y=h; y<h2; y++)
+                for (x=0; x<w2; x++)
+                    clone->data[i][x + y*clone->linesize[i]] =
+                        clone->data[i][x + (h-1)*clone->linesize[i]];
+        }
+        ret = encode_frame(avctx, pkt, clone, got_packet);
+
+        av_frame_free(&clone);
         return ret;
     }
 
+    if ((ret = ff_alloc_packet2(avctx, pkt, a->mb_height * a->mb_width * MAX_MB_SIZE +
+                                AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
     init_put_bits(&a->pb, pkt->data, pkt->size);
 
     for (mb_y = 0; mb_y < a->mb_height2; mb_y++) {
@@ -282,18 +319,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int i;
     const int scale = avctx->codec_id == AV_CODEC_ID_ASV1 ? 1 : 2;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
     ff_asv_common_init(avctx);
     ff_fdctdsp_init(&a->fdsp, avctx);
     ff_pixblockdsp_init(&a->pdsp, avctx);
 
-    if (avctx->global_quality == 0)
+    if (avctx->global_quality <= 0)
         avctx->global_quality = 4 * FF_QUALITY_SCALE;
 
     a->inv_qscale = (32 * scale * FF_QUALITY_SCALE +
@@ -307,8 +337,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     ((uint32_t *) avctx->extradata)[1] = av_le2ne32(AV_RL32("ASUS"));
 
     for (i = 0; i < 64; i++) {
-        int q = 32 * scale * ff_mpeg1_default_intra_matrix[i];
-        a->q_intra_matrix[i] = ((a->inv_qscale << 16) + q / 2) / q;
+        if (a->fdsp.fdct == ff_fdct_ifast) {
+            int q = 32LL * scale * ff_mpeg1_default_intra_matrix[i] * ff_aanscales[i];
+            a->q_intra_matrix[i] = (((int64_t)a->inv_qscale << 30) + q / 2) / q;
+        } else {
+            int q = 32 * scale * ff_mpeg1_default_intra_matrix[i];
+            a->q_intra_matrix[i] = ((a->inv_qscale << 16) + q / 2) / q;
+        }
     }
 
     return 0;
diff --git a/libavcodec/atrac.c b/libavcodec/atrac.c
index f36db9e..12e8997 100644
--- a/libavcodec/atrac.c
+++ b/libavcodec/atrac.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2013 Maxim Poliakovski
  * Copyright (c) 2006-2008 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -124,7 +124,8 @@ void ff_atrac_gain_compensation(AtracGCContext *gctx, float *in, float *prev,
     memcpy(prev, &in[num_samples], num_samples * sizeof(float));
 }
 
-void ff_atrac_iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp)
+void ff_atrac_iqmf(float *inlo, float *inhi, unsigned int nIn, float *pOut,
+                   float *delayBuf, float *temp)
 {
     int   i, j;
     float   *p1, *p3;
diff --git a/libavcodec/atrac.h b/libavcodec/atrac.h
index 8909323..05208bb 100644
--- a/libavcodec/atrac.h
+++ b/libavcodec/atrac.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2009-2013 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -91,6 +91,7 @@ void ff_atrac_gain_compensation(AtracGCContext *gctx, float *in, float *prev,
  * @param delayBuf  delayBuf buffer
  * @param temp      temp buffer
  */
-void ff_atrac_iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp);
+void ff_atrac_iqmf(float *inlo, float *inhi, unsigned int nIn, float *pOut,
+                   float *delayBuf, float *temp);
 
 #endif /* AVCODEC_ATRAC_H */
diff --git a/libavcodec/atrac1.c b/libavcodec/atrac1.c
index 52d43e2..a8c8c91 100644
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,9 +33,8 @@
 #include <stdio.h>
 
 #include "libavutil/float_dsp.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "fft.h"
 #include "internal.h"
 #include "sinewin.h"
@@ -66,7 +65,7 @@ typedef struct AT1SUCtx {
     DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
     DECLARE_ALIGNED(32, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
     DECLARE_ALIGNED(32, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
-    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
+    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+39];    ///< delay line for the last stacked QMF filter
 } AT1SUCtx;
 
 /**
@@ -81,7 +80,7 @@ typedef struct AT1Ctx {
     DECLARE_ALIGNED(32, float, high)[512];
     float*              bands[3];
     FFTContext          mdct_ctx[3];
-    AVFloatDSPContext   fdsp;
+    AVFloatDSPContext   *fdsp;
 } AT1Ctx;
 
 /** size of the transform in samples in the long mode for each QMF band */
@@ -141,7 +140,7 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q)
             at1_imdct(q, &q->spec[pos], &su->spectrum[0][ref_pos + start_pos], nbits, band_num);
 
             /* overlap and window */
-            q->fdsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
+            q->fdsp->vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
                                        &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);
 
             prev_buf = &su->spectrum[0][ref_pos+start_pos + 16];
@@ -165,31 +164,30 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q)
  * Parse the block size mode byte
  */
 
-static int at1_parse_bsm(BitstreamContext *bc,
-                         int log2_block_cnt[AT1_QMF_BANDS])
+static int at1_parse_bsm(GetBitContext* gb, int log2_block_cnt[AT1_QMF_BANDS])
 {
     int log2_block_count_tmp, i;
 
     for (i = 0; i < 2; i++) {
         /* low and mid band */
-        log2_block_count_tmp = bitstream_read(bc, 2);
+        log2_block_count_tmp = get_bits(gb, 2);
         if (log2_block_count_tmp & 1)
             return AVERROR_INVALIDDATA;
         log2_block_cnt[i] = 2 - log2_block_count_tmp;
     }
 
     /* high band */
-    log2_block_count_tmp = bitstream_read(bc, 2);
+    log2_block_count_tmp = get_bits(gb, 2);
     if (log2_block_count_tmp != 0 && log2_block_count_tmp != 3)
         return AVERROR_INVALIDDATA;
     log2_block_cnt[IDX_HIGH_BAND] = 3 - log2_block_count_tmp;
 
-    bitstream_skip(bc, 2);
+    skip_bits(gb, 2);
     return 0;
 }
 
 
-static int at1_unpack_dequant(BitstreamContext *bc, AT1SUCtx *su,
+static int at1_unpack_dequant(GetBitContext* gb, AT1SUCtx* su,
                               float spec[AT1_SU_SAMPLES])
 {
     int bits_used, band_num, bfu_num, i;
@@ -197,22 +195,22 @@ static int at1_unpack_dequant(BitstreamContext *bc, AT1SUCtx *su,
     uint8_t idsfs[AT1_MAX_BFU];                 ///< the scalefactor indexes for each BFU
 
     /* parse the info byte (2nd byte) telling how much BFUs were coded */
-    su->num_bfus = bfu_amount_tab1[bitstream_read(bc, 3)];
+    su->num_bfus = bfu_amount_tab1[get_bits(gb, 3)];
 
     /* calc number of consumed bits:
         num_BFUs * (idwl(4bits) + idsf(6bits)) + log2_block_count(8bits) + info_byte(8bits)
         + info_byte_copy(8bits) + log2_block_count_copy(8bits) */
     bits_used = su->num_bfus * 10 + 32 +
-                bfu_amount_tab2[bitstream_read(bc, 2)] +
-                (bfu_amount_tab3[bitstream_read(bc, 3)] << 1);
+                bfu_amount_tab2[get_bits(gb, 2)] +
+                (bfu_amount_tab3[get_bits(gb, 3)] << 1);
 
     /* get word length index (idwl) for each BFU */
     for (i = 0; i < su->num_bfus; i++)
-        idwls[i] = bitstream_read(bc, 4);
+        idwls[i] = get_bits(gb, 4);
 
     /* get scalefactor index (idsf) for each BFU */
     for (i = 0; i < su->num_bfus; i++)
-        idsfs[i] = bitstream_read(bc, 6);
+        idsfs[i] = get_bits(gb, 6);
 
     /* zero idwl/idsf for empty BFUs */
     for (i = su->num_bfus; i < AT1_MAX_BFU; i++)
@@ -242,9 +240,9 @@ static int at1_unpack_dequant(BitstreamContext *bc, AT1SUCtx *su,
                     /* read in a quantized spec and convert it to
                      * signed int and then inverse quantization
                      */
-                    spec[pos+i] = bitstream_read_signed(bc, word_len) * scale_factor * max_quant;
+                    spec[pos+i] = get_sbits(gb, word_len) * scale_factor * max_quant;
                 }
-            } else { /* word_len = 0 -> empty BFU, zero all specs in the emty BFU */
+            } else { /* word_len = 0 -> empty BFU, zero all specs in the empty BFU */
                 memset(&spec[pos], 0, num_specs * sizeof(float));
             }
         }
@@ -262,9 +260,9 @@ static void at1_subband_synthesis(AT1Ctx *q, AT1SUCtx* su, float *pOut)
     /* combine low and middle bands */
     ff_atrac_iqmf(q->bands[0], q->bands[1], 128, temp, su->fst_qmf_delay, iqmf_temp);
 
-    /* delay the signal of the high band by 23 samples */
-    memcpy( su->last_qmf_delay,    &su->last_qmf_delay[256], sizeof(float) *  23);
-    memcpy(&su->last_qmf_delay[23], q->bands[2],             sizeof(float) * 256);
+    /* delay the signal of the high band by 39 samples */
+    memcpy( su->last_qmf_delay,    &su->last_qmf_delay[256], sizeof(float) *  39);
+    memcpy(&su->last_qmf_delay[39], q->bands[2],             sizeof(float) * 256);
 
     /* combine (low + middle) and high bands */
     ff_atrac_iqmf(temp, su->last_qmf_delay, 256, pOut, su->snd_qmf_delay, iqmf_temp);
@@ -279,7 +277,7 @@ static int atrac1_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size       = avpkt->size;
     AT1Ctx *q          = avctx->priv_data;
     int ch, ret;
-    BitstreamContext bc;
+    GetBitContext gb;
 
 
     if (buf_size < 212 * avctx->channels) {
@@ -289,22 +287,20 @@ static int atrac1_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = AT1_SU_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (ch = 0; ch < avctx->channels; ch++) {
         AT1SUCtx* su = &q->SUs[ch];
 
-        bitstream_init8(&bc, &buf[212 * ch], 212);
+        init_get_bits(&gb, &buf[212 * ch], 212 * 8);
 
         /* parse block_size_mode, 1st byte */
-        ret = at1_parse_bsm(&bc, su->log2_block_count);
+        ret = at1_parse_bsm(&gb, su->log2_block_count);
         if (ret < 0)
             return ret;
 
-        ret = at1_unpack_dequant(&bc, su, q->spec);
+        ret = at1_unpack_dequant(&gb, su, q->spec);
         if (ret < 0)
             return ret;
 
@@ -328,6 +324,8 @@ static av_cold int atrac1_decode_end(AVCodecContext * avctx)
     ff_mdct_end(&q->mdct_ctx[1]);
     ff_mdct_end(&q->mdct_ctx[2]);
 
+    av_freep(&q->fdsp);
+
     return 0;
 }
 
@@ -345,6 +343,11 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
+    if (avctx->block_align <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported block align.");
+        return AVERROR_PATCHWELCOME;
+    }
+
     /* Init the mdct transforms */
     if ((ret = ff_mdct_init(&q->mdct_ctx[0], 6, 1, -1.0/ (1 << 15))) ||
         (ret = ff_mdct_init(&q->mdct_ctx[1], 8, 1, -1.0/ (1 << 15))) ||
@@ -358,7 +361,7 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
 
     ff_atrac_generate_tables();
 
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     q->bands[0] = q->low;
     q->bands[1] = q->mid;
diff --git a/libavcodec/atrac1data.h b/libavcodec/atrac1data.h
index 539867b..62c218b 100644
--- a/libavcodec/atrac1data.h
+++ b/libavcodec/atrac1data.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c
index d0661c8..6cdcdf1 100644
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2008 Maxim Poliakovski
  * Copyright (c) 2006-2008 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,19 +38,22 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/float_dsp.h"
-
+#include "libavutil/libm.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
 #include "fft.h"
+#include "get_bits.h"
 #include "internal.h"
-#include "vlc.h"
 
 #include "atrac.h"
 #include "atrac3data.h"
 
+#define MIN_CHANNELS    1
+#define MAX_CHANNELS    8
+#define MAX_JS_PAIRS    8 / 2
+
 #define JOINT_STEREO    0x12
-#define STEREO          0x2
+#define SINGLE          0x2
 
 #define SAMPLES_PER_FRAME 1024
 #define MDCT_SIZE          512
@@ -82,7 +85,7 @@ typedef struct ChannelUnit {
 } ChannelUnit;
 
 typedef struct ATRAC3Context {
-    BitstreamContext bc;
+    GetBitContext gb;
     //@{
     /** stream data */
     int coding_mode;
@@ -91,10 +94,10 @@ typedef struct ATRAC3Context {
     //@}
     //@{
     /** joint-stereo related variables */
-    int matrix_coeff_index_prev[4];
-    int matrix_coeff_index_now[4];
-    int matrix_coeff_index_next[4];
-    int weighting_delay[6];
+    int matrix_coeff_index_prev[MAX_JS_PAIRS][4];
+    int matrix_coeff_index_now[MAX_JS_PAIRS][4];
+    int matrix_coeff_index_next[MAX_JS_PAIRS][4];
+    int weighting_delay[MAX_JS_PAIRS][6];
     //@}
     //@{
     /** data buffers */
@@ -106,9 +109,9 @@ typedef struct ATRAC3Context {
     int scrambled_stream;
     //@}
 
-    AtracGCContext  gainc_ctx;
-    FFTContext mdct_ctx;
-    AVFloatDSPContext fdsp;
+    AtracGCContext    gainc_ctx;
+    FFTContext        mdct_ctx;
+    AVFloatDSPContext *fdsp;
 } ATRAC3Context;
 
 static DECLARE_ALIGNED(32, float, mdct_window)[MDCT_SIZE];
@@ -141,7 +144,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
     q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
 
     /* Perform windowing on the output. */
-    q->fdsp.vector_fmul(output, output, mdct_window, MDCT_SIZE);
+    q->fdsp->vector_fmul(output, output, mdct_window, MDCT_SIZE);
 }
 
 /*
@@ -189,8 +192,9 @@ static av_cold int atrac3_decode_close(AVCodecContext *avctx)
 {
     ATRAC3Context *q = avctx->priv_data;
 
-    av_free(q->units);
-    av_free(q->decoded_bytes_buffer);
+    av_freep(&q->units);
+    av_freep(&q->decoded_bytes_buffer);
+    av_freep(&q->fdsp);
 
     ff_mdct_end(&q->mdct_ctx);
 
@@ -205,7 +209,7 @@ static av_cold int atrac3_decode_close(AVCodecContext *avctx)
  * @param mantissas    mantissa output table
  * @param num_codes    number of values to get
  */
-static void read_quant_spectral_coeffs(BitstreamContext *bc, int selector,
+static void read_quant_spectral_coeffs(GetBitContext *gb, int selector,
                                        int coding_flag, int *mantissas,
                                        int num_codes)
 {
@@ -221,7 +225,7 @@ static void read_quant_spectral_coeffs(BitstreamContext *bc, int selector,
         if (selector > 1) {
             for (i = 0; i < num_codes; i++) {
                 if (num_bits)
-                    code = bitstream_read_signed(bc, num_bits);
+                    code = get_sbits(gb, num_bits);
                 else
                     code = 0;
                 mantissas[i] = code;
@@ -229,7 +233,7 @@ static void read_quant_spectral_coeffs(BitstreamContext *bc, int selector,
         } else {
             for (i = 0; i < num_codes; i++) {
                 if (num_bits)
-                    code = bitstream_read(bc, num_bits); // num_bits is always 4 in this case
+                    code = get_bits(gb, num_bits); // num_bits is always 4 in this case
                 else
                     code = 0;
                 mantissas[i * 2    ] = mantissa_clc_tab[code >> 2];
@@ -240,8 +244,8 @@ static void read_quant_spectral_coeffs(BitstreamContext *bc, int selector,
         /* variable length coding (VLC) */
         if (selector != 1) {
             for (i = 0; i < num_codes; i++) {
-                huff_symb = bitstream_read_vlc(bc, spectral_coeff_tab[selector-1].table,
-                                               spectral_coeff_tab[selector-1].bits, 3);
+                huff_symb = get_vlc2(gb, spectral_coeff_tab[selector-1].table,
+                                     spectral_coeff_tab[selector-1].bits, 3);
                 huff_symb += 1;
                 code = huff_symb >> 1;
                 if (huff_symb & 1)
@@ -250,8 +254,8 @@ static void read_quant_spectral_coeffs(BitstreamContext *bc, int selector,
             }
         } else {
             for (i = 0; i < num_codes; i++) {
-                huff_symb = bitstream_read_vlc(bc, spectral_coeff_tab[selector - 1].table,
-                                               spectral_coeff_tab[selector - 1].bits, 3);
+                huff_symb = get_vlc2(gb, spectral_coeff_tab[selector - 1].table,
+                                     spectral_coeff_tab[selector - 1].bits, 3);
                 mantissas[i * 2    ] = mantissa_vlc_tab[huff_symb * 2    ];
                 mantissas[i * 2 + 1] = mantissa_vlc_tab[huff_symb * 2 + 1];
             }
@@ -264,24 +268,24 @@ static void read_quant_spectral_coeffs(BitstreamContext *bc, int selector,
  *
  * @return subband count, fix for broken specification/files
  */
-static int decode_spectrum(BitstreamContext *bc, float *output)
+static int decode_spectrum(GetBitContext *gb, float *output)
 {
     int num_subbands, coding_mode, i, j, first, last, subband_size;
     int subband_vlc_index[32], sf_index[32];
     int mantissas[128];
     float scale_factor;
 
-    num_subbands = bitstream_read(bc, 5);   // number of coded subbands
-    coding_mode  = bitstream_read_bit(bc);  // coding Mode: 0 - VLC/ 1 - CLC
+    num_subbands = get_bits(gb, 5);  // number of coded subbands
+    coding_mode  = get_bits1(gb);    // coding Mode: 0 - VLC/ 1-CLC
 
     /* get the VLC selector table for the subbands, 0 means not coded */
     for (i = 0; i <= num_subbands; i++)
-        subband_vlc_index[i] = bitstream_read(bc, 3);
+        subband_vlc_index[i] = get_bits(gb, 3);
 
     /* read the scale factor indexes from the stream */
     for (i = 0; i <= num_subbands; i++) {
         if (subband_vlc_index[i] != 0)
-            sf_index[i] = bitstream_read(bc, 6);
+            sf_index[i] = get_bits(gb, 6);
     }
 
     for (i = 0; i <= num_subbands; i++) {
@@ -294,7 +298,7 @@ static int decode_spectrum(BitstreamContext *bc, float *output)
             /* decode spectral coefficients for this subband */
             /* TODO: This can be done faster is several blocks share the
              * same VLC selector (subband_vlc_index) */
-            read_quant_spectral_coeffs(bc, subband_vlc_index[i], coding_mode,
+            read_quant_spectral_coeffs(gb, subband_vlc_index[i], coding_mode,
                                        mantissas, subband_size);
 
             /* decode the scale factor for this subband */
@@ -322,7 +326,7 @@ static int decode_spectrum(BitstreamContext *bc, float *output)
  * @param components tonal components
  * @param num_bands  number of coded bands
  */
-static int decode_tonal_components(BitstreamContext *bc,
+static int decode_tonal_components(GetBitContext *gb,
                                    TonalComponent *components, int num_bands)
 {
     int i, b, c, m;
@@ -330,13 +334,13 @@ static int decode_tonal_components(BitstreamContext *bc,
     int band_flags[4], mantissa[8];
     int component_count = 0;
 
-    nb_components = bitstream_read(bc, 5);
+    nb_components = get_bits(gb, 5);
 
     /* no tonal components */
     if (nb_components == 0)
         return 0;
 
-    coding_mode_selector = bitstream_read(bc, 2);
+    coding_mode_selector = get_bits(gb, 2);
     if (coding_mode_selector == 2)
         return AVERROR_INVALIDDATA;
 
@@ -346,16 +350,16 @@ static int decode_tonal_components(BitstreamContext *bc,
         int coded_values_per_component, quant_step_index;
 
         for (b = 0; b <= num_bands; b++)
-            band_flags[b] = bitstream_read_bit(bc);
+            band_flags[b] = get_bits1(gb);
 
-        coded_values_per_component = bitstream_read(bc, 3);
+        coded_values_per_component = get_bits(gb, 3);
 
-        quant_step_index = bitstream_read(bc, 3);
+        quant_step_index = get_bits(gb, 3);
         if (quant_step_index <= 1)
             return AVERROR_INVALIDDATA;
 
         if (coding_mode_selector == 3)
-            coding_mode = bitstream_read_bit(bc);
+            coding_mode = get_bits1(gb);
 
         for (b = 0; b < (num_bands + 1) * 4; b++) {
             int coded_components;
@@ -363,18 +367,18 @@ static int decode_tonal_components(BitstreamContext *bc,
             if (band_flags[b >> 2] == 0)
                 continue;
 
-            coded_components = bitstream_read(bc, 3);
+            coded_components = get_bits(gb, 3);
 
             for (c = 0; c < coded_components; c++) {
                 TonalComponent *cmp = &components[component_count];
                 int sf_index, coded_values, max_coded_values;
                 float scale_factor;
 
-                sf_index = bitstream_read(bc, 6);
+                sf_index = get_bits(gb, 6);
                 if (component_count >= 64)
                     return AVERROR_INVALIDDATA;
 
-                cmp->pos = b * 64 + bitstream_read(bc, 6);
+                cmp->pos = b * 64 + get_bits(gb, 6);
 
                 max_coded_values = SAMPLES_PER_FRAME - cmp->pos;
                 coded_values     = coded_values_per_component + 1;
@@ -383,7 +387,7 @@ static int decode_tonal_components(BitstreamContext *bc,
                 scale_factor = ff_atrac_sf_table[sf_index] *
                                inv_max_quant[quant_step_index];
 
-                read_quant_spectral_coeffs(bc, quant_step_index, coding_mode,
+                read_quant_spectral_coeffs(gb, quant_step_index, coding_mode,
                                            mantissa, coded_values);
 
                 cmp->num_coefs = coded_values;
@@ -406,30 +410,30 @@ static int decode_tonal_components(BitstreamContext *bc,
  * @param block      the gainblock for the current band
  * @param num_bands  amount of coded bands
  */
-static int decode_gain_control(BitstreamContext *bc, GainBlock *block,
+static int decode_gain_control(GetBitContext *gb, GainBlock *block,
                                int num_bands)
 {
-    int i, j;
+    int b, j;
     int *level, *loc;
 
     AtracGainInfo *gain = block->g_block;
 
-    for (i = 0; i <= num_bands; i++) {
-        gain[i].num_points    = bitstream_read(bc, 3);
-        level                 = gain[i].lev_code;
-        loc                   = gain[i].loc_code;
+    for (b = 0; b <= num_bands; b++) {
+        gain[b].num_points = get_bits(gb, 3);
+        level              = gain[b].lev_code;
+        loc                = gain[b].loc_code;
 
-        for (j = 0; j < gain[i].num_points; j++) {
-            level[j] = bitstream_read(bc, 4);
-            loc[j]   = bitstream_read(bc, 5);
+        for (j = 0; j < gain[b].num_points; j++) {
+            level[j] = get_bits(gb, 4);
+            loc[j]   = get_bits(gb, 5);
             if (j && loc[j] <= loc[j - 1])
                 return AVERROR_INVALIDDATA;
         }
     }
 
     /* Clear the unused blocks. */
-    for (; i < 4 ; i++)
-        gain[i].num_points = 0;
+    for (; b < 4 ; b++)
+        gain[b].num_points = 0;
 
     return 0;
 }
@@ -520,7 +524,7 @@ static void reverse_matrixing(float *su1, float *su2, int *prev_code,
             }
             break;
         default:
-            assert(0);
+            av_assert1(0);
         }
     }
 }
@@ -567,9 +571,9 @@ static void channel_weighting(float *su1, float *su2, int *p3)
  * @param snd           the channel unit to be used
  * @param output        the decoded samples before IQMF in float representation
  * @param channel_num   channel number
- * @param coding_mode   the coding mode (JOINT_STEREO or regular stereo/mono)
+ * @param coding_mode   the coding mode (JOINT_STEREO or single channels)
  */
-static int decode_channel_sound_unit(ATRAC3Context *q, BitstreamContext *bc,
+static int decode_channel_sound_unit(ATRAC3Context *q, GetBitContext *gb,
                                      ChannelUnit *snd, float *output,
                                      int channel_num, int coding_mode)
 {
@@ -577,31 +581,31 @@ static int decode_channel_sound_unit(ATRAC3Context *q, BitstreamContext *bc,
     GainBlock *gain1 = &snd->gain_block[    snd->gc_blk_switch];
     GainBlock *gain2 = &snd->gain_block[1 - snd->gc_blk_switch];
 
-    if (coding_mode == JOINT_STEREO && channel_num == 1) {
-        if (bitstream_read(bc, 2) != 3) {
+    if (coding_mode == JOINT_STEREO && (channel_num % 2) == 1) {
+        if (get_bits(gb, 2) != 3) {
             av_log(NULL,AV_LOG_ERROR,"JS mono Sound Unit id != 3.\n");
             return AVERROR_INVALIDDATA;
         }
     } else {
-        if (bitstream_read(bc, 6) != 0x28) {
+        if (get_bits(gb, 6) != 0x28) {
             av_log(NULL,AV_LOG_ERROR,"Sound Unit id != 0x28.\n");
             return AVERROR_INVALIDDATA;
         }
     }
 
     /* number of coded QMF bands */
-    snd->bands_coded = bitstream_read(bc, 2);
+    snd->bands_coded = get_bits(gb, 2);
 
-    ret = decode_gain_control(bc, gain2, snd->bands_coded);
+    ret = decode_gain_control(gb, gain2, snd->bands_coded);
     if (ret)
         return ret;
 
-    snd->num_components = decode_tonal_components(bc, snd->components,
+    snd->num_components = decode_tonal_components(gb, snd->components,
                                                   snd->bands_coded);
     if (snd->num_components < 0)
         return snd->num_components;
 
-    num_subbands = decode_spectrum(bc, snd->spectrum);
+    num_subbands = decode_spectrum(gb, snd->spectrum);
 
     /* Merge the decoded spectrum and tonal components. */
     last_tonal = add_tonal_components(snd->spectrum, snd->num_components,
@@ -640,77 +644,95 @@ static int decode_frame(AVCodecContext *avctx, const uint8_t *databuf,
                         float **out_samples)
 {
     ATRAC3Context *q = avctx->priv_data;
-    int ret, i;
+    int ret, i, ch;
     uint8_t *ptr1;
 
     if (q->coding_mode == JOINT_STEREO) {
         /* channel coupling mode */
-        /* decode Sound Unit 1 */
-        bitstream_init8(&q->bc, databuf, avctx->block_align);
 
-        ret = decode_channel_sound_unit(q, &q->bc, q->units, out_samples[0], 0,
-                                        JOINT_STEREO);
-        if (ret != 0)
-            return ret;
+        /* Decode sound unit pairs (channels are expected to be even).
+         * Multichannel joint stereo interleaves pairs (6ch: 2ch + 2ch + 2ch) */
+        const uint8_t *js_databuf;
+        int js_pair, js_block_align;
 
-        /* Framedata of the su2 in the joint-stereo mode is encoded in
-         * reverse byte order so we need to swap it first. */
-        if (databuf == q->decoded_bytes_buffer) {
-            uint8_t *ptr2 = q->decoded_bytes_buffer + avctx->block_align - 1;
-            ptr1          = q->decoded_bytes_buffer;
-            for (i = 0; i < avctx->block_align / 2; i++, ptr1++, ptr2--)
-                FFSWAP(uint8_t, *ptr1, *ptr2);
-        } else {
-            const uint8_t *ptr2 = databuf + avctx->block_align - 1;
-            for (i = 0; i < avctx->block_align; i++)
-                q->decoded_bytes_buffer[i] = *ptr2--;
-        }
+        js_block_align = (avctx->block_align / avctx->channels) * 2; /* block pair */
 
-        /* Skip the sync codes (0xF8). */
-        ptr1 = q->decoded_bytes_buffer;
-        for (i = 4; *ptr1 == 0xF8; i++, ptr1++) {
-            if (i >= avctx->block_align)
-                return AVERROR_INVALIDDATA;
-        }
+        for (ch = 0; ch < avctx->channels; ch = ch + 2) {
+            js_pair = ch/2;
+            js_databuf = databuf + js_pair * js_block_align; /* align to current pair */
 
+            /* Set the bitstream reader at the start of first channel sound unit. */
+            init_get_bits(&q->gb,
+                          js_databuf, js_block_align * 8);
 
-        /* set the bitstream reader at the start of the second Sound Unit*/
-        bitstream_init8(&q->bc, ptr1, avctx->block_align - i);
+            /* decode Sound Unit 1 */
+            ret = decode_channel_sound_unit(q, &q->gb, &q->units[ch],
+                                            out_samples[ch], ch, JOINT_STEREO);
+            if (ret != 0)
+                return ret;
 
-        /* Fill the Weighting coeffs delay buffer */
-        memmove(q->weighting_delay, &q->weighting_delay[2],
-                4 * sizeof(*q->weighting_delay));
-        q->weighting_delay[4] = bitstream_read_bit(&q->bc);
-        q->weighting_delay[5] = bitstream_read(&q->bc, 3);
+            /* Framedata of the su2 in the joint-stereo mode is encoded in
+             * reverse byte order so we need to swap it first. */
+            if (js_databuf == q->decoded_bytes_buffer) {
+                uint8_t *ptr2 = q->decoded_bytes_buffer + js_block_align - 1;
+                ptr1          = q->decoded_bytes_buffer;
+                for (i = 0; i < js_block_align / 2; i++, ptr1++, ptr2--)
+                    FFSWAP(uint8_t, *ptr1, *ptr2);
+            } else {
+                const uint8_t *ptr2 = js_databuf + js_block_align - 1;
+                for (i = 0; i < js_block_align; i++)
+                    q->decoded_bytes_buffer[i] = *ptr2--;
+            }
 
-        for (i = 0; i < 4; i++) {
-            q->matrix_coeff_index_prev[i] = q->matrix_coeff_index_now[i];
-            q->matrix_coeff_index_now[i]  = q->matrix_coeff_index_next[i];
-            q->matrix_coeff_index_next[i] = bitstream_read(&q->bc, 2);
-        }
+            /* Skip the sync codes (0xF8). */
+            ptr1 = q->decoded_bytes_buffer;
+            for (i = 4; *ptr1 == 0xF8; i++, ptr1++) {
+                if (i >= js_block_align)
+                    return AVERROR_INVALIDDATA;
+            }
 
-        /* Decode Sound Unit 2. */
-        ret = decode_channel_sound_unit(q, &q->bc, &q->units[1],
-                                        out_samples[1], 1, JOINT_STEREO);
-        if (ret != 0)
-            return ret;
 
-        /* Reconstruct the channel coefficients. */
-        reverse_matrixing(out_samples[0], out_samples[1],
-                          q->matrix_coeff_index_prev,
-                          q->matrix_coeff_index_now);
+            /* set the bitstream reader at the start of the second Sound Unit */
+            ret = init_get_bits8(&q->gb,
+                           ptr1, q->decoded_bytes_buffer + js_block_align - ptr1);
+            if (ret < 0)
+                return ret;
+
+            /* Fill the Weighting coeffs delay buffer */
+            memmove(q->weighting_delay[js_pair], &q->weighting_delay[js_pair][2],
+                    4 * sizeof(*q->weighting_delay[js_pair]));
+            q->weighting_delay[js_pair][4] = get_bits1(&q->gb);
+            q->weighting_delay[js_pair][5] = get_bits(&q->gb, 3);
 
-        channel_weighting(out_samples[0], out_samples[1], q->weighting_delay);
+            for (i = 0; i < 4; i++) {
+                q->matrix_coeff_index_prev[js_pair][i] = q->matrix_coeff_index_now[js_pair][i];
+                q->matrix_coeff_index_now[js_pair][i]  = q->matrix_coeff_index_next[js_pair][i];
+                q->matrix_coeff_index_next[js_pair][i] = get_bits(&q->gb, 2);
+            }
+
+            /* Decode Sound Unit 2. */
+            ret = decode_channel_sound_unit(q, &q->gb, &q->units[ch+1],
+                                            out_samples[ch+1], ch+1, JOINT_STEREO);
+            if (ret != 0)
+                return ret;
+
+            /* Reconstruct the channel coefficients. */
+            reverse_matrixing(out_samples[ch], out_samples[ch+1],
+                              q->matrix_coeff_index_prev[js_pair],
+                              q->matrix_coeff_index_now[js_pair]);
+
+            channel_weighting(out_samples[ch], out_samples[ch+1], q->weighting_delay[js_pair]);
+        }
     } else {
-        /* normal stereo mode or mono */
+        /* single channels */
         /* Decode the channel sound units. */
         for (i = 0; i < avctx->channels; i++) {
             /* Set the bitstream reader at the start of a channel sound unit. */
-            bitstream_init8(&q->bc,
-                            databuf + i * avctx->block_align / avctx->channels,
-                            avctx->block_align / avctx->channels);
+            init_get_bits(&q->gb,
+                          databuf + i * avctx->block_align / avctx->channels,
+                          avctx->block_align * 8 / avctx->channels);
 
-            ret = decode_channel_sound_unit(q, &q->bc, &q->units[i],
+            ret = decode_channel_sound_unit(q, &q->gb, &q->units[i],
                                             out_samples[i], i, q->coding_mode);
             if (ret != 0)
                 return ret;
@@ -731,6 +753,40 @@ static int decode_frame(AVCodecContext *avctx, const uint8_t *databuf,
     return 0;
 }
 
+static int al_decode_frame(AVCodecContext *avctx, const uint8_t *databuf,
+                           int size, float **out_samples)
+{
+    ATRAC3Context *q = avctx->priv_data;
+    int ret, i;
+
+    /* Set the bitstream reader at the start of a channel sound unit. */
+    init_get_bits(&q->gb, databuf, size * 8);
+    /* single channels */
+    /* Decode the channel sound units. */
+    for (i = 0; i < avctx->channels; i++) {
+        ret = decode_channel_sound_unit(q, &q->gb, &q->units[i],
+                                        out_samples[i], i, q->coding_mode);
+        if (ret != 0)
+            return ret;
+        while (i < avctx->channels && get_bits_left(&q->gb) > 6 && show_bits(&q->gb, 6) != 0x28) {
+            skip_bits(&q->gb, 1);
+        }
+    }
+
+    /* Apply the iQMF synthesis filter. */
+    for (i = 0; i < avctx->channels; i++) {
+        float *p1 = out_samples[i];
+        float *p2 = p1 + 256;
+        float *p3 = p2 + 256;
+        float *p4 = p3 + 256;
+        ff_atrac_iqmf(p1, p2, 256, p1, q->units[i].delay_buf1, q->temp_buf);
+        ff_atrac_iqmf(p4, p3, 256, p3, q->units[i].delay_buf2, q->temp_buf);
+        ff_atrac_iqmf(p1, p3, 512, p1, q->units[i].delay_buf3, q->temp_buf);
+    }
+
+    return 0;
+}
+
 static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
                                int *got_frame_ptr, AVPacket *avpkt)
 {
@@ -749,10 +805,8 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = SAMPLES_PER_FRAME;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /* Check if we need to descramble and what buffer to pass on. */
     if (q->scrambled_stream) {
@@ -764,7 +818,7 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
 
     ret = decode_frame(avctx, databuf, (float **)frame->extended_data);
     if (ret) {
-        av_log(NULL, AV_LOG_ERROR, "Frame decoding error!\n");
+        av_log(avctx, AV_LOG_ERROR, "Frame decoding error!\n");
         return ret;
     }
 
@@ -773,7 +827,29 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
     return avctx->block_align;
 }
 
-static av_cold void atrac3_init_static_data(AVCodec *codec)
+static int atrac3al_decode_frame(AVCodecContext *avctx, void *data,
+                                 int *got_frame_ptr, AVPacket *avpkt)
+{
+    AVFrame *frame = data;
+    int ret;
+
+    frame->nb_samples = SAMPLES_PER_FRAME;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    ret = al_decode_frame(avctx, avpkt->data, avpkt->size,
+                          (float **)frame->extended_data);
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "Frame decoding error!\n");
+        return ret;
+    }
+
+    *got_frame_ptr = 1;
+
+    return avpkt->size;
+}
+
+static av_cold void atrac3_init_static_data(void)
 {
     int i;
 
@@ -793,18 +869,28 @@ static av_cold void atrac3_init_static_data(AVCodec *codec)
 
 static av_cold int atrac3_decode_init(AVCodecContext *avctx)
 {
-    int i, ret;
+    static int static_init_done;
+    int i, js_pair, ret;
     int version, delay, samples_per_frame, frame_factor;
     const uint8_t *edata_ptr = avctx->extradata;
     ATRAC3Context *q = avctx->priv_data;
 
-    if (avctx->channels <= 0 || avctx->channels > 2) {
+    if (avctx->channels < MIN_CHANNELS || avctx->channels > MAX_CHANNELS) {
         av_log(avctx, AV_LOG_ERROR, "Channel configuration error!\n");
         return AVERROR(EINVAL);
     }
 
+    if (!static_init_done)
+        atrac3_init_static_data();
+    static_init_done = 1;
+
     /* Take care of the codec-specific extradata. */
-    if (avctx->extradata_size == 14) {
+    if (avctx->codec_id == AV_CODEC_ID_ATRAC3AL) {
+        version           = 4;
+        samples_per_frame = SAMPLES_PER_FRAME * avctx->channels;
+        delay             = 0x88E;
+        q->coding_mode    = SINGLE;
+    } else if (avctx->extradata_size == 14) {
         /* Parse the extradata, WAV format */
         av_log(avctx, AV_LOG_DEBUG, "[0-1] %d\n",
                bytestream_get_le16(&edata_ptr));  // Unknown value always 1
@@ -820,7 +906,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         samples_per_frame    = SAMPLES_PER_FRAME * avctx->channels;
         version              = 4;
         delay                = 0x88E;
-        q->coding_mode       = q->coding_mode ? JOINT_STEREO : STEREO;
+        q->coding_mode       = q->coding_mode ? JOINT_STEREO : SINGLE;
         q->scrambled_stream  = 0;
 
         if (avctx->block_align !=  96 * avctx->channels * frame_factor &&
@@ -831,7 +917,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
                    avctx->channels, frame_factor);
             return AVERROR_INVALIDDATA;
         }
-    } else if (avctx->extradata_size == 10) {
+    } else if (avctx->extradata_size == 12 || avctx->extradata_size == 10) {
         /* Parse the extradata, RM format. */
         version                = bytestream_get_be32(&edata_ptr);
         samples_per_frame      = bytestream_get_be16(&edata_ptr);
@@ -840,7 +926,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         q->scrambled_stream    = 1;
 
     } else {
-        av_log(NULL, AV_LOG_ERROR, "Unknown extradata size %d.\n",
+        av_log(avctx, AV_LOG_ERROR, "Unknown extradata size %d.\n",
                avctx->extradata_size);
         return AVERROR(EINVAL);
     }
@@ -852,8 +938,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (samples_per_frame != SAMPLES_PER_FRAME &&
-        samples_per_frame != SAMPLES_PER_FRAME * 2) {
+    if (samples_per_frame != SAMPLES_PER_FRAME * avctx->channels) {
         av_log(avctx, AV_LOG_ERROR, "Unknown amount of samples per frame %d.\n",
                samples_per_frame);
         return AVERROR_INVALIDDATA;
@@ -865,11 +950,13 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (q->coding_mode == STEREO)
-        av_log(avctx, AV_LOG_DEBUG, "Normal stereo detected.\n");
+    if (q->coding_mode == SINGLE)
+        av_log(avctx, AV_LOG_DEBUG, "Single channels detected.\n");
     else if (q->coding_mode == JOINT_STEREO) {
-        if (avctx->channels != 2)
+        if (avctx->channels % 2 == 1) { /* Joint stereo channels must be even */
+            av_log(avctx, AV_LOG_ERROR, "Invalid joint stereo channel configuration.\n");
             return AVERROR_INVALIDDATA;
+        }
         av_log(avctx, AV_LOG_DEBUG, "Joint stereo detected.\n");
     } else {
         av_log(avctx, AV_LOG_ERROR, "Unknown channel coding mode %x!\n",
@@ -895,24 +982,26 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
     }
 
     /* init the joint-stereo decoding data */
-    q->weighting_delay[0] = 0;
-    q->weighting_delay[1] = 7;
-    q->weighting_delay[2] = 0;
-    q->weighting_delay[3] = 7;
-    q->weighting_delay[4] = 0;
-    q->weighting_delay[5] = 7;
-
-    for (i = 0; i < 4; i++) {
-        q->matrix_coeff_index_prev[i] = 3;
-        q->matrix_coeff_index_now[i]  = 3;
-        q->matrix_coeff_index_next[i] = 3;
+    for (js_pair = 0; js_pair < MAX_JS_PAIRS; js_pair++) {
+        q->weighting_delay[js_pair][0] = 0;
+        q->weighting_delay[js_pair][1] = 7;
+        q->weighting_delay[js_pair][2] = 0;
+        q->weighting_delay[js_pair][3] = 7;
+        q->weighting_delay[js_pair][4] = 0;
+        q->weighting_delay[js_pair][5] = 7;
+
+        for (i = 0; i < 4; i++) {
+            q->matrix_coeff_index_prev[js_pair][i] = 3;
+            q->matrix_coeff_index_now[js_pair][i]  = 3;
+            q->matrix_coeff_index_next[js_pair][i] = 3;
+        }
     }
 
     ff_atrac_init_gain_compensation(&q->gainc_ctx, 4, 3);
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
-    q->units = av_mallocz(sizeof(*q->units) * avctx->channels);
-    if (!q->units) {
+    q->units = av_mallocz_array(avctx->channels, sizeof(*q->units));
+    if (!q->units || !q->fdsp) {
         atrac3_decode_close(avctx);
         return AVERROR(ENOMEM);
     }
@@ -927,10 +1016,23 @@ AVCodec ff_atrac3_decoder = {
     .id               = AV_CODEC_ID_ATRAC3,
     .priv_data_size   = sizeof(ATRAC3Context),
     .init             = atrac3_decode_init,
-    .init_static_data = atrac3_init_static_data,
     .close            = atrac3_decode_close,
     .decode           = atrac3_decode_frame,
     .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
     .sample_fmts      = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                         AV_SAMPLE_FMT_NONE },
 };
+
+AVCodec ff_atrac3al_decoder = {
+    .name             = "atrac3al",
+    .long_name        = NULL_IF_CONFIG_SMALL("ATRAC3 AL (Adaptive TRansform Acoustic Coding 3 Advanced Lossless)"),
+    .type             = AVMEDIA_TYPE_AUDIO,
+    .id               = AV_CODEC_ID_ATRAC3AL,
+    .priv_data_size   = sizeof(ATRAC3Context),
+    .init             = atrac3_decode_init,
+    .close            = atrac3_decode_close,
+    .decode           = atrac3al_decode_frame,
+    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+    .sample_fmts      = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                        AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/atrac3data.h b/libavcodec/atrac3data.h
index 4f5c122..5d91274 100644
--- a/libavcodec/atrac3data.h
+++ b/libavcodec/atrac3data.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Maxim Poliakovski
  * Copyright (c) 2006-2007 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3plus.c b/libavcodec/atrac3plus.c
index 9b7b28b..3e3bba8 100644
--- a/libavcodec/atrac3plus.c
+++ b/libavcodec/atrac3plus.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,10 +26,8 @@
  */
 
 #include "libavutil/avassert.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
-#include "vlc.h"
+#include "get_bits.h"
 #include "atrac3plus.h"
 #include "atrac3plus_data.h"
 
@@ -79,7 +77,7 @@ static av_cold void build_canonical_huff(const uint8_t *cb, const uint8_t *xlat,
     *tab_offset += 1 << max_len;
 }
 
-av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
+av_cold void ff_atrac3p_init_vlcs(void)
 {
     int i, wl_vlc_offs, ct_vlc_offs, sf_vlc_offs, tab_offset;
 
@@ -214,20 +212,20 @@ av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
 /**
  * Decode number of coded quantization units.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] chan          ptr to the channel parameters
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int num_coded_units(BitstreamContext *bc, Atrac3pChanParams *chan,
+static int num_coded_units(GetBitContext *gb, Atrac3pChanParams *chan,
                            Atrac3pChanUnitCtx *ctx, AVCodecContext *avctx)
 {
-    chan->fill_mode = bitstream_read(bc, 2);
+    chan->fill_mode = get_bits(gb, 2);
     if (!chan->fill_mode) {
         chan->num_coded_vals = ctx->num_quant_units;
     } else {
-        chan->num_coded_vals = bitstream_read(bc, 5);
+        chan->num_coded_vals = get_bits(gb, 5);
         if (chan->num_coded_vals > ctx->num_quant_units) {
             av_log(avctx, AV_LOG_ERROR,
                    "Invalid number of transmitted units!\n");
@@ -235,7 +233,7 @@ static int num_coded_units(BitstreamContext *bc, Atrac3pChanParams *chan,
         }
 
         if (chan->fill_mode == 3)
-            chan->split_point = bitstream_read(bc, 2) + (chan->ch_num << 1) + 1;
+            chan->split_point = get_bits(gb, 2) + (chan->ch_num << 1) + 1;
     }
 
     return 0;
@@ -320,21 +318,21 @@ static inline void unpack_vq_shape(int start_val, const int8_t *shape_vec,
     }
 }
 
-#define UNPACK_SF_VQ_SHAPE(bc, dst, num_vals)                                  \
-    start_val = bitstream_read((bc), 6);                                       \
-    unpack_vq_shape(start_val, &atrac3p_sf_shapes[bitstream_read((bc), 6)][0], \
+#define UNPACK_SF_VQ_SHAPE(gb, dst, num_vals)                            \
+    start_val = get_bits((gb), 6);                                       \
+    unpack_vq_shape(start_val, &atrac3p_sf_shapes[get_bits((gb), 6)][0], \
                     (dst), (num_vals))
 
 /**
  * Decode word length for each quantization unit of a channel.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     ch_num        channel to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_channel_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_channel_wordlen(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                   int ch_num, AVCodecContext *avctx)
 {
     int i, weight_idx = 0, delta, diff, pos, delta_bits, min_val, flag,
@@ -345,107 +343,107 @@ static int decode_channel_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 
     chan->fill_mode = 0;
 
-    switch (bitstream_read(bc, 2)) { /* switch according to coding mode */
+    switch (get_bits(gb, 2)) { /* switch according to coding mode */
     case 0: /* coded using constant number of bits */
         for (i = 0; i < ctx->num_quant_units; i++)
-            chan->qu_wordlen[i] = bitstream_read(bc, 3);
+            chan->qu_wordlen[i] = get_bits(gb, 3);
         break;
     case 1:
         if (ch_num) {
-            if ((ret = num_coded_units(bc, chan, ctx, avctx)) < 0)
+            if ((ret = num_coded_units(gb, chan, ctx, avctx)) < 0)
                 return ret;
 
             if (chan->num_coded_vals) {
-                vlc_tab = &wl_vlc_tabs[bitstream_read(bc, 2)];
+                vlc_tab = &wl_vlc_tabs[get_bits(gb, 2)];
 
                 for (i = 0; i < chan->num_coded_vals; i++) {
-                    delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                    delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                     chan->qu_wordlen[i] = (ref_chan->qu_wordlen[i] + delta) & 7;
                 }
             }
         } else {
-            weight_idx = bitstream_read(bc, 2);
-            if ((ret = num_coded_units(bc, chan, ctx, avctx)) < 0)
+            weight_idx = get_bits(gb, 2);
+            if ((ret = num_coded_units(gb, chan, ctx, avctx)) < 0)
                 return ret;
 
             if (chan->num_coded_vals) {
-                pos = bitstream_read(bc, 5);
+                pos = get_bits(gb, 5);
                 if (pos > chan->num_coded_vals) {
                     av_log(avctx, AV_LOG_ERROR,
                            "WL mode 1: invalid position!\n");
                     return AVERROR_INVALIDDATA;
                 }
 
-                delta_bits = bitstream_read(bc, 2);
-                min_val    = bitstream_read(bc, 3);
+                delta_bits = get_bits(gb, 2);
+                min_val    = get_bits(gb, 3);
 
                 for (i = 0; i < pos; i++)
-                    chan->qu_wordlen[i] = bitstream_read(bc, 3);
+                    chan->qu_wordlen[i] = get_bits(gb, 3);
 
                 for (i = pos; i < chan->num_coded_vals; i++)
-                    chan->qu_wordlen[i] = (min_val + bitstream_read(bc, delta_bits)) & 7;
+                    chan->qu_wordlen[i] = (min_val + get_bitsz(gb, delta_bits)) & 7;
             }
         }
         break;
     case 2:
-        if ((ret = num_coded_units(bc, chan, ctx, avctx)) < 0)
+        if ((ret = num_coded_units(gb, chan, ctx, avctx)) < 0)
             return ret;
 
         if (ch_num && chan->num_coded_vals) {
-            vlc_tab = &wl_vlc_tabs[bitstream_read(bc, 2)];
-            delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+            vlc_tab = &wl_vlc_tabs[get_bits(gb, 2)];
+            delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
             chan->qu_wordlen[0] = (ref_chan->qu_wordlen[0] + delta) & 7;
 
             for (i = 1; i < chan->num_coded_vals; i++) {
                 diff = ref_chan->qu_wordlen[i] - ref_chan->qu_wordlen[i - 1];
-                delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                 chan->qu_wordlen[i] = (chan->qu_wordlen[i - 1] + diff + delta) & 7;
             }
         } else if (chan->num_coded_vals) {
-            flag    = bitstream_read(bc, 1);
-            vlc_tab = &wl_vlc_tabs[bitstream_read(bc, 1)];
+            flag    = get_bits(gb, 1);
+            vlc_tab = &wl_vlc_tabs[get_bits(gb, 1)];
 
-            start_val = bitstream_read(bc, 3);
+            start_val = get_bits(gb, 3);
             unpack_vq_shape(start_val,
-                            &atrac3p_wl_shapes[start_val][bitstream_read(bc, 4)][0],
+                            &atrac3p_wl_shapes[start_val][get_bits(gb, 4)][0],
                             chan->qu_wordlen, chan->num_coded_vals);
 
             if (!flag) {
                 for (i = 0; i < chan->num_coded_vals; i++) {
-                    delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                    delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                     chan->qu_wordlen[i] = (chan->qu_wordlen[i] + delta) & 7;
                 }
             } else {
                 for (i = 0; i < (chan->num_coded_vals & - 2); i += 2)
-                    if (!bitstream_read_bit(bc)) {
+                    if (!get_bits1(gb)) {
                         chan->qu_wordlen[i]     = (chan->qu_wordlen[i] +
-                                                   bitstream_read_vlc(bc, vlc_tab->table,
-                                                                      vlc_tab->bits, 1)) & 7;
+                                                   get_vlc2(gb, vlc_tab->table,
+                                                            vlc_tab->bits, 1)) & 7;
                         chan->qu_wordlen[i + 1] = (chan->qu_wordlen[i + 1] +
-                                                   bitstream_read_vlc(bc, vlc_tab->table,
-                                                                      vlc_tab->bits, 1)) & 7;
+                                                   get_vlc2(gb, vlc_tab->table,
+                                                            vlc_tab->bits, 1)) & 7;
                     }
 
                 if (chan->num_coded_vals & 1)
                     chan->qu_wordlen[i] = (chan->qu_wordlen[i] +
-                                           bitstream_read_vlc(bc, vlc_tab->table,
-                                                              vlc_tab->bits, 1)) & 7;
+                                           get_vlc2(gb, vlc_tab->table,
+                                                    vlc_tab->bits, 1)) & 7;
             }
         }
         break;
     case 3:
-        weight_idx = bitstream_read(bc, 2);
-        if ((ret = num_coded_units(bc, chan, ctx, avctx)) < 0)
+        weight_idx = get_bits(gb, 2);
+        if ((ret = num_coded_units(gb, chan, ctx, avctx)) < 0)
             return ret;
 
         if (chan->num_coded_vals) {
-            vlc_tab = &wl_vlc_tabs[bitstream_read(bc, 2)];
+            vlc_tab = &wl_vlc_tabs[get_bits(gb, 2)];
 
             /* first coefficient is coded directly */
-            chan->qu_wordlen[0] = bitstream_read(bc, 3);
+            chan->qu_wordlen[0] = get_bits(gb, 3);
 
             for (i = 1; i < chan->num_coded_vals; i++) {
-                delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                 chan->qu_wordlen[i] = (chan->qu_wordlen[i - 1] + delta) & 7;
             }
         }
@@ -454,7 +452,7 @@ static int decode_channel_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 
     if (chan->fill_mode == 2) {
         for (i = chan->num_coded_vals; i < ctx->num_quant_units; i++)
-            chan->qu_wordlen[i] = ch_num ? bitstream_read_bit(bc) : 1;
+            chan->qu_wordlen[i] = ch_num ? get_bits1(gb) : 1;
     } else if (chan->fill_mode == 3) {
         pos = ch_num ? chan->num_coded_vals + chan->split_point
                      : ctx->num_quant_units - chan->split_point;
@@ -471,13 +469,13 @@ static int decode_channel_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode scale factor indexes for each quant unit of a channel.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     ch_num        channel to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_channel_sf_idx(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_channel_sf_idx(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                  int ch_num, AVCodecContext *avctx)
 {
     int i, weight_idx = 0, delta, diff, num_long_vals,
@@ -486,40 +484,40 @@ static int decode_channel_sf_idx(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
     Atrac3pChanParams *chan     = &ctx->channels[ch_num];
     Atrac3pChanParams *ref_chan = &ctx->channels[0];
 
-    switch (bitstream_read(bc, 2)) { /* switch according to coding mode */
+    switch (get_bits(gb, 2)) { /* switch according to coding mode */
     case 0: /* coded using constant number of bits */
         for (i = 0; i < ctx->used_quant_units; i++)
-            chan->qu_sf_idx[i] = bitstream_read(bc, 6);
+            chan->qu_sf_idx[i] = get_bits(gb, 6);
         break;
     case 1:
         if (ch_num) {
-            vlc_tab = &sf_vlc_tabs[bitstream_read(bc, 2)];
+            vlc_tab = &sf_vlc_tabs[get_bits(gb, 2)];
 
             for (i = 0; i < ctx->used_quant_units; i++) {
-                delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                 chan->qu_sf_idx[i] = (ref_chan->qu_sf_idx[i] + delta) & 0x3F;
             }
         } else {
-            weight_idx = bitstream_read(bc, 2);
+            weight_idx = get_bits(gb, 2);
             if (weight_idx == 3) {
-                UNPACK_SF_VQ_SHAPE(bc, chan->qu_sf_idx, ctx->used_quant_units);
+                UNPACK_SF_VQ_SHAPE(gb, chan->qu_sf_idx, ctx->used_quant_units);
 
-                num_long_vals = bitstream_read(bc, 5);
-                delta_bits    = bitstream_read(bc, 2);
-                min_val       = bitstream_read(bc, 4) - 7;
+                num_long_vals = get_bits(gb, 5);
+                delta_bits    = get_bits(gb, 2);
+                min_val       = get_bits(gb, 4) - 7;
 
                 for (i = 0; i < num_long_vals; i++)
                     chan->qu_sf_idx[i] = (chan->qu_sf_idx[i] +
-                                          bitstream_read(bc, 4) - 7) & 0x3F;
+                                          get_bits(gb, 4) - 7) & 0x3F;
 
                 /* all others are: min_val + delta */
                 for (i = num_long_vals; i < ctx->used_quant_units; i++)
                     chan->qu_sf_idx[i] = (chan->qu_sf_idx[i] + min_val +
-                                          bitstream_read(bc, delta_bits)) & 0x3F;
+                                          get_bitsz(gb, delta_bits)) & 0x3F;
             } else {
-                num_long_vals = bitstream_read(bc, 5);
-                delta_bits    = bitstream_read(bc, 3);
-                min_val       = bitstream_read(bc, 6);
+                num_long_vals = get_bits(gb, 5);
+                delta_bits    = get_bits(gb, 3);
+                min_val       = get_bits(gb, 6);
                 if (num_long_vals > ctx->used_quant_units || delta_bits == 7) {
                     av_log(avctx, AV_LOG_ERROR,
                            "SF mode 1: invalid parameters!\n");
@@ -528,34 +526,34 @@ static int decode_channel_sf_idx(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 
                 /* read full-precision SF indexes */
                 for (i = 0; i < num_long_vals; i++)
-                    chan->qu_sf_idx[i] = bitstream_read(bc, 6);
+                    chan->qu_sf_idx[i] = get_bits(gb, 6);
 
                 /* all others are: min_val + delta */
                 for (i = num_long_vals; i < ctx->used_quant_units; i++)
                     chan->qu_sf_idx[i] = (min_val +
-                                          bitstream_read(bc, delta_bits)) & 0x3F;
+                                          get_bitsz(gb, delta_bits)) & 0x3F;
             }
         }
         break;
     case 2:
         if (ch_num) {
-            vlc_tab = &sf_vlc_tabs[bitstream_read(bc, 2)];
+            vlc_tab = &sf_vlc_tabs[get_bits(gb, 2)];
 
-            delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+            delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
             chan->qu_sf_idx[0] = (ref_chan->qu_sf_idx[0] + delta) & 0x3F;
 
             for (i = 1; i < ctx->used_quant_units; i++) {
                 diff  = ref_chan->qu_sf_idx[i] - ref_chan->qu_sf_idx[i - 1];
-                delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                 chan->qu_sf_idx[i] = (chan->qu_sf_idx[i - 1] + diff + delta) & 0x3F;
             }
         } else {
-            vlc_tab = &sf_vlc_tabs[bitstream_read(bc, 2) + 4];
+            vlc_tab = &sf_vlc_tabs[get_bits(gb, 2) + 4];
 
-            UNPACK_SF_VQ_SHAPE(bc, chan->qu_sf_idx, ctx->used_quant_units);
+            UNPACK_SF_VQ_SHAPE(gb, chan->qu_sf_idx, ctx->used_quant_units);
 
             for (i = 0; i < ctx->used_quant_units; i++) {
-                delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                 chan->qu_sf_idx[i] = (chan->qu_sf_idx[i] +
                                       sign_extend(delta, 4)) & 0x3F;
             }
@@ -567,29 +565,29 @@ static int decode_channel_sf_idx(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
             for (i = 0; i < ctx->used_quant_units; i++)
                 chan->qu_sf_idx[i] = ref_chan->qu_sf_idx[i];
         } else {
-            weight_idx = bitstream_read(bc, 2);
-            vlc_sel    = bitstream_read(bc, 2);
+            weight_idx = get_bits(gb, 2);
+            vlc_sel    = get_bits(gb, 2);
             vlc_tab    = &sf_vlc_tabs[vlc_sel];
 
             if (weight_idx == 3) {
                 vlc_tab = &sf_vlc_tabs[vlc_sel + 4];
 
-                UNPACK_SF_VQ_SHAPE(bc, chan->qu_sf_idx, ctx->used_quant_units);
+                UNPACK_SF_VQ_SHAPE(gb, chan->qu_sf_idx, ctx->used_quant_units);
 
-                diff               = (bitstream_read(bc, 4) + 56)   & 0x3F;
-                chan->qu_sf_idx[0] = (chan->qu_sf_idx[0]    + diff) & 0x3F;
+                diff               = (get_bits(gb, 4)    + 56)   & 0x3F;
+                chan->qu_sf_idx[0] = (chan->qu_sf_idx[0] + diff) & 0x3F;
 
                 for (i = 1; i < ctx->used_quant_units; i++) {
-                    delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                    delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                     diff               = (diff + sign_extend(delta, 4)) & 0x3F;
                     chan->qu_sf_idx[i] = (diff + chan->qu_sf_idx[i])    & 0x3F;
                 }
             } else {
                 /* 1st coefficient is coded directly */
-                chan->qu_sf_idx[0] = bitstream_read(bc, 6);
+                chan->qu_sf_idx[0] = get_bits(gb, 6);
 
                 for (i = 1; i < ctx->used_quant_units; i++) {
-                    delta = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                    delta = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
                     chan->qu_sf_idx[i] = (chan->qu_sf_idx[i - 1] + delta) & 0x3F;
                 }
             }
@@ -606,13 +604,13 @@ static int decode_channel_sf_idx(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode word length information for each channel.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_quant_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_quant_wordlen(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                 int num_channels, AVCodecContext *avctx)
 {
     int ch_num, i, ret;
@@ -621,7 +619,7 @@ static int decode_quant_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         memset(ctx->channels[ch_num].qu_wordlen, 0,
                sizeof(ctx->channels[ch_num].qu_wordlen));
 
-        if ((ret = decode_channel_wordlen(bc, ctx, ch_num, avctx)) < 0)
+        if ((ret = decode_channel_wordlen(gb, ctx, ch_num, avctx)) < 0)
             return ret;
     }
 
@@ -639,13 +637,13 @@ static int decode_quant_wordlen(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode scale factor indexes for each channel.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_scale_factors(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_scale_factors(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                 int num_channels, AVCodecContext *avctx)
 {
     int ch_num, ret;
@@ -657,7 +655,7 @@ static int decode_scale_factors(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         memset(ctx->channels[ch_num].qu_sf_idx, 0,
                sizeof(ctx->channels[ch_num].qu_sf_idx));
 
-        if ((ret = decode_channel_sf_idx(bc, ctx, ch_num, avctx)) < 0)
+        if ((ret = decode_channel_sf_idx(gb, ctx, ch_num, avctx)) < 0)
             return ret;
     }
 
@@ -667,18 +665,18 @@ static int decode_scale_factors(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode number of code table values.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int get_num_ct_values(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int get_num_ct_values(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                              AVCodecContext *avctx)
 {
     int num_coded_vals;
 
-    if (bitstream_read_bit(bc)) {
-        num_coded_vals = bitstream_read(bc, 5);
+    if (get_bits1(gb)) {
+        num_coded_vals = get_bits(gb, 5);
         if (num_coded_vals > ctx->used_quant_units) {
             av_log(avctx, AV_LOG_ERROR,
                    "Invalid number of code table indexes: %d!\n", num_coded_vals);
@@ -690,7 +688,7 @@ static int get_num_ct_values(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 }
 
 #define DEC_CT_IDX_COMMON(OP)                                           \
-    num_vals = get_num_ct_values(bc, ctx, avctx);                       \
+    num_vals = get_num_ct_values(gb, ctx, avctx);                       \
     if (num_vals < 0)                                                   \
         return num_vals;                                                \
                                                                         \
@@ -699,33 +697,33 @@ static int get_num_ct_values(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
             chan->qu_tab_idx[i] = OP;                                   \
         } else if (ch_num && ref_chan->qu_wordlen[i])                   \
             /* get clone master flag */                                 \
-            chan->qu_tab_idx[i] = bitstream_read_bit(bc);               \
+            chan->qu_tab_idx[i] = get_bits1(gb);                        \
     }
 
-#define CODING_DIRECT bitstream_read(bc, num_bits)
+#define CODING_DIRECT get_bits(gb, num_bits)
 
-#define CODING_VLC bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1)
+#define CODING_VLC get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1)
 
 #define CODING_VLC_DELTA                                                \
     (!i) ? CODING_VLC                                                   \
-         : (pred + bitstream_read_vlc(bc, delta_vlc->table,             \
-                                      delta_vlc->bits, 1)) & mask;      \
+         : (pred + get_vlc2(gb, delta_vlc->table,                       \
+                            delta_vlc->bits, 1)) & mask;                \
     pred = chan->qu_tab_idx[i]
 
 #define CODING_VLC_DIFF                                                 \
     (ref_chan->qu_tab_idx[i] +                                          \
-     bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1)) & mask
+     get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1)) & mask
 
 /**
  * Decode code table indexes for each quant unit of a channel.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     ch_num        channel to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_channel_code_tab(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_channel_code_tab(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                    int ch_num, AVCodecContext *avctx)
 {
     int i, num_vals, num_bits, pred;
@@ -734,9 +732,9 @@ static int decode_channel_code_tab(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
     Atrac3pChanParams *chan     = &ctx->channels[ch_num];
     Atrac3pChanParams *ref_chan = &ctx->channels[0];
 
-    chan->table_type = bitstream_read_bit(bc);
+    chan->table_type = get_bits1(gb);
 
-    switch (bitstream_read(bc, 2)) { /* switch according to coding mode */
+    switch (get_bits(gb, 2)) { /* switch according to coding mode */
     case 0: /* directly coded */
         num_bits = ctx->use_full_table + 2;
         DEC_CT_IDX_COMMON(CODING_DIRECT);
@@ -772,13 +770,13 @@ static int decode_channel_code_tab(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
 /**
  * Decode code table indexes for each channel.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_code_table_indexes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_code_table_indexes(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                      int num_channels, AVCodecContext *avctx)
 {
     int ch_num, ret;
@@ -786,13 +784,13 @@ static int decode_code_table_indexes(BitstreamContext *bc, Atrac3pChanUnitCtx *c
     if (!ctx->used_quant_units)
         return 0;
 
-    ctx->use_full_table = bitstream_read_bit(bc);
+    ctx->use_full_table = get_bits1(gb);
 
     for (ch_num = 0; ch_num < num_channels; ch_num++) {
         memset(ctx->channels[ch_num].qu_tab_idx, 0,
                sizeof(ctx->channels[ch_num].qu_tab_idx));
 
-        if ((ret = decode_channel_code_tab(bc, ctx, ch_num, avctx)) < 0)
+        if ((ret = decode_channel_code_tab(gb, ctx, ch_num, avctx)) < 0)
             return ret;
     }
 
@@ -805,13 +803,13 @@ static int decode_code_table_indexes(BitstreamContext *bc, Atrac3pChanUnitCtx *c
  * This is a generalized version for all known coding modes.
  * Its speed can be improved by creating separate functions for each mode.
  *
- * @param[in]   bc          the Bitstream context
+ * @param[in]   gb          the GetBit context
  * @param[in]   tab         code table telling how to decode spectral lines
  * @param[in]   vlc_tab     ptr to the huffman table associated with the code table
  * @param[out]  out         pointer to buffer where decoded data should be stored
  * @param[in]   num_specs   number of spectral lines to decode
  */
-static void decode_qu_spectra(BitstreamContext *bc, const Atrac3pSpecCodeTab *tab,
+static void decode_qu_spectra(GetBitContext *gb, const Atrac3pSpecCodeTab *tab,
                               VLC *vlc_tab, int16_t *out, const int num_specs)
 {
     int i, j, pos, cf;
@@ -819,18 +817,18 @@ static void decode_qu_spectra(BitstreamContext *bc, const Atrac3pSpecCodeTab *ta
     int num_coeffs = tab->num_coeffs;
     int bits       = tab->bits;
     int is_signed  = tab->is_signed;
-    unsigned val, mask = (1 << bits) - 1;
+    unsigned val;
 
     for (pos = 0; pos < num_specs;) {
-        if (group_size == 1 || bitstream_read_bit(bc)) {
+        if (group_size == 1 || get_bits1(gb)) {
             for (j = 0; j < group_size; j++) {
-                val = bitstream_read_vlc(bc, vlc_tab->table, vlc_tab->bits, 1);
+                val = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
 
                 for (i = 0; i < num_coeffs; i++) {
-                    cf = val & mask;
+                    cf = av_mod_uintp2(val, bits);
                     if (is_signed)
                         cf = sign_extend(cf, bits);
-                    else if (cf && bitstream_read_bit(bc))
+                    else if (cf && get_bits1(gb))
                         cf = -cf;
 
                     out[pos++] = cf;
@@ -845,12 +843,12 @@ static void decode_qu_spectra(BitstreamContext *bc, const Atrac3pSpecCodeTab *ta
 /**
  * Decode huffman-coded IMDCT spectrum for all channels.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  */
-static void decode_spectrum(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static void decode_spectrum(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                             int num_channels, AVCodecContext *avctx)
 {
     int i, ch_num, qu, wordlen, codetab, tab_index, num_specs;
@@ -882,7 +880,7 @@ static void decode_spectrum(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                 if (tab->redirect >= 0)
                     tab_index = tab->redirect;
 
-                decode_qu_spectra(bc, tab, &spec_vlc_tabs[tab_index],
+                decode_qu_spectra(gb, tab, &spec_vlc_tabs[tab_index],
                                   &chan->spectrum[ff_atrac3p_qu_to_spec_pos[qu]],
                                   num_specs);
             } else if (ch_num && ctx->channels[0].qu_wordlen[qu] && !codetab) {
@@ -902,7 +900,7 @@ static void decode_spectrum(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         if (ctx->used_quant_units > 2) {
             num_specs = atrac3p_subband_to_num_powgrps[ctx->num_coded_subbands - 1];
             for (i = 0; i < num_specs; i++)
-                chan->power_levs[i] = bitstream_read(bc, 4);
+                chan->power_levs[i] = get_bits(gb, 4);
         }
     }
 }
@@ -915,22 +913,22 @@ static void decode_spectrum(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
  * Otherwise, all necessary bits will be directly stored
  * prefixed by two signal bits = 1,1.
  *
- * @param[in]   bc              ptr to the BitstreamContext
+ * @param[in]   gb              ptr to the GetBitContext
  * @param[out]  out             where to place decoded flags
  * @param[in]   num_flags       number of flags to process
  * @return: 0 = all flag bits are zero, 1 = there is at least one non-zero flag bit
  */
-static int get_subband_flags(BitstreamContext *bc, uint8_t *out, int num_flags)
+static int get_subband_flags(GetBitContext *gb, uint8_t *out, int num_flags)
 {
     int i, result;
 
     memset(out, 0, num_flags);
 
-    result = bitstream_read_bit(bc);
+    result = get_bits1(gb);
     if (result) {
-        if (bitstream_read_bit(bc))
+        if (get_bits1(gb))
             for (i = 0; i < num_flags; i++)
-                out[i] = bitstream_read_bit(bc);
+                out[i] = get_bits1(gb);
         else
             memset(out, 1, num_flags);
     }
@@ -941,63 +939,63 @@ static int get_subband_flags(BitstreamContext *bc, uint8_t *out, int num_flags)
 /**
  * Decode mdct window shape flags for all channels.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  */
-static void decode_window_shape(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static void decode_window_shape(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                 int num_channels)
 {
     int ch_num;
 
     for (ch_num = 0; ch_num < num_channels; ch_num++)
-        get_subband_flags(bc, ctx->channels[ch_num].wnd_shape,
+        get_subband_flags(gb, ctx->channels[ch_num].wnd_shape,
                           ctx->num_subbands);
 }
 
 /**
  * Decode number of gain control points.
  *
- * @param[in]     bc              the Bitstream context
+ * @param[in]     gb              the GetBit context
  * @param[in,out] ctx             ptr to the channel unit context
  * @param[in]     ch_num          channel to process
  * @param[in]     coded_subbands  number of subbands to process
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_gainc_npoints(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_gainc_npoints(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                 int ch_num, int coded_subbands)
 {
     int i, delta, delta_bits, min_val;
     Atrac3pChanParams *chan     = &ctx->channels[ch_num];
     Atrac3pChanParams *ref_chan = &ctx->channels[0];
 
-    switch (bitstream_read(bc, 2)) { /* switch according to coding mode */
+    switch (get_bits(gb, 2)) { /* switch according to coding mode */
     case 0: /* fixed-length coding */
         for (i = 0; i < coded_subbands; i++)
-            chan->gain_data[i].num_points = bitstream_read(bc, 3);
+            chan->gain_data[i].num_points = get_bits(gb, 3);
         break;
     case 1: /* variable-length coding */
         for (i = 0; i < coded_subbands; i++)
             chan->gain_data[i].num_points =
-                bitstream_read_vlc(bc, gain_vlc_tabs[0].table,
-                                   gain_vlc_tabs[0].bits, 1);
+                get_vlc2(gb, gain_vlc_tabs[0].table,
+                         gain_vlc_tabs[0].bits, 1);
         break;
     case 2:
         if (ch_num) { /* VLC modulo delta to master channel */
             for (i = 0; i < coded_subbands; i++) {
-                delta = bitstream_read_vlc(bc, gain_vlc_tabs[1].table,
-                                           gain_vlc_tabs[1].bits, 1);
+                delta = get_vlc2(gb, gain_vlc_tabs[1].table,
+                                 gain_vlc_tabs[1].bits, 1);
                 chan->gain_data[i].num_points =
                     (ref_chan->gain_data[i].num_points + delta) & 7;
             }
         } else { /* VLC modulo delta to previous */
             chan->gain_data[0].num_points =
-                bitstream_read_vlc(bc, gain_vlc_tabs[0].table,
-                                   gain_vlc_tabs[0].bits, 1);
+                get_vlc2(gb, gain_vlc_tabs[0].table,
+                         gain_vlc_tabs[0].bits, 1);
 
             for (i = 1; i < coded_subbands; i++) {
-                delta = bitstream_read_vlc(bc, gain_vlc_tabs[1].table,
-                                           gain_vlc_tabs[1].bits, 1);
+                delta = get_vlc2(gb, gain_vlc_tabs[1].table,
+                                 gain_vlc_tabs[1].bits, 1);
                 chan->gain_data[i].num_points =
                     (chan->gain_data[i - 1].num_points + delta) & 7;
             }
@@ -1009,11 +1007,11 @@ static int decode_gainc_npoints(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                 chan->gain_data[i].num_points =
                     ref_chan->gain_data[i].num_points;
         } else { /* shorter delta to min */
-            delta_bits = bitstream_read(bc, 2);
-            min_val    = bitstream_read(bc, 3);
+            delta_bits = get_bits(gb, 2);
+            min_val    = get_bits(gb, 3);
 
             for (i = 0; i < coded_subbands; i++) {
-                chan->gain_data[i].num_points = min_val + bitstream_read(bc, delta_bits);
+                chan->gain_data[i].num_points = min_val + get_bitsz(gb, delta_bits);
                 if (chan->gain_data[i].num_points > 7)
                     return AVERROR_INVALIDDATA;
             }
@@ -1040,23 +1038,23 @@ static inline void gainc_level_mode3s(AtracGainInfo *dst, AtracGainInfo *ref)
 /**
  * Implements coding mode 1 (master) for gain compensation levels.
  *
- * @param[in]     bc     the Bitstream context
+ * @param[in]     gb     the GetBit context
  * @param[in]     ctx    ptr to the channel unit context
  * @param[out]    dst    ptr to the output array
  */
-static inline void gainc_level_mode1m(BitstreamContext *bc,
+static inline void gainc_level_mode1m(GetBitContext *gb,
                                       Atrac3pChanUnitCtx *ctx,
                                       AtracGainInfo *dst)
 {
     int i, delta;
 
     if (dst->num_points > 0)
-        dst->lev_code[0] = bitstream_read_vlc(bc, gain_vlc_tabs[2].table,
-                                              gain_vlc_tabs[2].bits, 1);
+        dst->lev_code[0] = get_vlc2(gb, gain_vlc_tabs[2].table,
+                                    gain_vlc_tabs[2].bits, 1);
 
     for (i = 1; i < dst->num_points; i++) {
-        delta = bitstream_read_vlc(bc, gain_vlc_tabs[3].table,
-                                   gain_vlc_tabs[3].bits, 1);
+        delta = get_vlc2(gb, gain_vlc_tabs[3].table,
+                         gain_vlc_tabs[3].bits, 1);
         dst->lev_code[i] = (dst->lev_code[i - 1] + delta) & 0xF;
     }
 }
@@ -1064,58 +1062,58 @@ static inline void gainc_level_mode1m(BitstreamContext *bc,
 /**
  * Decode level code for each gain control point.
  *
- * @param[in]     bc              the Bitstream context
+ * @param[in]     gb              the GetBit context
  * @param[in,out] ctx             ptr to the channel unit context
  * @param[in]     ch_num          channel to process
  * @param[in]     coded_subbands  number of subbands to process
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_gainc_levels(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_gainc_levels(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                int ch_num, int coded_subbands)
 {
     int sb, i, delta, delta_bits, min_val, pred;
     Atrac3pChanParams *chan     = &ctx->channels[ch_num];
     Atrac3pChanParams *ref_chan = &ctx->channels[0];
 
-    switch (bitstream_read(bc, 2)) { /* switch according to coding mode */
+    switch (get_bits(gb, 2)) { /* switch according to coding mode */
     case 0: /* fixed-length coding */
         for (sb = 0; sb < coded_subbands; sb++)
             for (i = 0; i < chan->gain_data[sb].num_points; i++)
-                chan->gain_data[sb].lev_code[i] = bitstream_read(bc, 4);
+                chan->gain_data[sb].lev_code[i] = get_bits(gb, 4);
         break;
     case 1:
         if (ch_num) { /* VLC modulo delta to master channel */
             for (sb = 0; sb < coded_subbands; sb++)
                 for (i = 0; i < chan->gain_data[sb].num_points; i++) {
-                    delta = bitstream_read_vlc(bc, gain_vlc_tabs[5].table,
-                                               gain_vlc_tabs[5].bits, 1);
+                    delta = get_vlc2(gb, gain_vlc_tabs[5].table,
+                                     gain_vlc_tabs[5].bits, 1);
                     pred = (i >= ref_chan->gain_data[sb].num_points)
                            ? 7 : ref_chan->gain_data[sb].lev_code[i];
                     chan->gain_data[sb].lev_code[i] = (pred + delta) & 0xF;
                 }
         } else { /* VLC modulo delta to previous */
             for (sb = 0; sb < coded_subbands; sb++)
-                gainc_level_mode1m(bc, ctx, &chan->gain_data[sb]);
+                gainc_level_mode1m(gb, ctx, &chan->gain_data[sb]);
         }
         break;
     case 2:
         if (ch_num) { /* VLC modulo delta to previous or clone master */
             for (sb = 0; sb < coded_subbands; sb++)
                 if (chan->gain_data[sb].num_points > 0) {
-                    if (bitstream_read_bit(bc))
-                        gainc_level_mode1m(bc, ctx, &chan->gain_data[sb]);
+                    if (get_bits1(gb))
+                        gainc_level_mode1m(gb, ctx, &chan->gain_data[sb]);
                     else
                         gainc_level_mode3s(&chan->gain_data[sb],
                                            &ref_chan->gain_data[sb]);
                 }
         } else { /* VLC modulo delta to lev_codes of previous subband */
             if (chan->gain_data[0].num_points > 0)
-                gainc_level_mode1m(bc, ctx, &chan->gain_data[0]);
+                gainc_level_mode1m(gb, ctx, &chan->gain_data[0]);
 
             for (sb = 1; sb < coded_subbands; sb++)
                 for (i = 0; i < chan->gain_data[sb].num_points; i++) {
-                    delta = bitstream_read_vlc(bc, gain_vlc_tabs[4].table,
-                                               gain_vlc_tabs[4].bits, 1);
+                    delta = get_vlc2(gb, gain_vlc_tabs[4].table,
+                                     gain_vlc_tabs[4].bits, 1);
                     pred = (i >= chan->gain_data[sb - 1].num_points)
                            ? 7 : chan->gain_data[sb - 1].lev_code[i];
                     chan->gain_data[sb].lev_code[i] = (pred + delta) & 0xF;
@@ -1128,12 +1126,12 @@ static int decode_gainc_levels(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                 gainc_level_mode3s(&chan->gain_data[sb],
                                    &ref_chan->gain_data[sb]);
         } else { /* shorter delta to min */
-            delta_bits = bitstream_read(bc, 2);
-            min_val    = bitstream_read(bc, 4);
+            delta_bits = get_bits(gb, 2);
+            min_val    = get_bits(gb, 4);
 
             for (sb = 0; sb < coded_subbands; sb++)
                 for (i = 0; i < chan->gain_data[sb].num_points; i++) {
-                    chan->gain_data[sb].lev_code[i] = min_val + bitstream_read(bc, delta_bits);
+                    chan->gain_data[sb].lev_code[i] = min_val + get_bitsz(gb, delta_bits);
                     if (chan->gain_data[sb].lev_code[i] > 15)
                         return AVERROR_INVALIDDATA;
                 }
@@ -1147,35 +1145,35 @@ static int decode_gainc_levels(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Implements coding mode 0 for gain compensation locations.
  *
- * @param[in]     bc     the Bitstream context
+ * @param[in]     gb     the GetBit context
  * @param[in]     ctx    ptr to the channel unit context
  * @param[out]    dst    ptr to the output array
  * @param[in]     pos    position of the value to be processed
  */
-static inline void gainc_loc_mode0(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static inline void gainc_loc_mode0(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                    AtracGainInfo *dst, int pos)
 {
     int delta_bits;
 
     if (!pos || dst->loc_code[pos - 1] < 15)
-        dst->loc_code[pos] = bitstream_read(bc, 5);
+        dst->loc_code[pos] = get_bits(gb, 5);
     else if (dst->loc_code[pos - 1] >= 30)
         dst->loc_code[pos] = 31;
     else {
         delta_bits         = av_log2(30 - dst->loc_code[pos - 1]) + 1;
         dst->loc_code[pos] = dst->loc_code[pos - 1] +
-                             bitstream_read(bc, delta_bits) + 1;
+                             get_bits(gb, delta_bits) + 1;
     }
 }
 
 /**
  * Implements coding mode 1 for gain compensation locations.
  *
- * @param[in]     bc     the Bitstream context
+ * @param[in]     gb     the GetBit context
  * @param[in]     ctx    ptr to the channel unit context
  * @param[out]    dst    ptr to the output array
  */
-static inline void gainc_loc_mode1(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static inline void gainc_loc_mode1(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                    AtracGainInfo *dst)
 {
     int i;
@@ -1183,7 +1181,7 @@ static inline void gainc_loc_mode1(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
 
     if (dst->num_points > 0) {
         /* 1st coefficient is stored directly */
-        dst->loc_code[0] = bitstream_read(bc, 5);
+        dst->loc_code[0] = get_bits(gb, 5);
 
         for (i = 1; i < dst->num_points; i++) {
             /* switch VLC according to the curve direction
@@ -1192,7 +1190,7 @@ static inline void gainc_loc_mode1(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
                                ? &gain_vlc_tabs[7]
                                : &gain_vlc_tabs[9];
             dst->loc_code[i] = dst->loc_code[i - 1] +
-                               bitstream_read_vlc(bc, tab->table, tab->bits, 1);
+                               get_vlc2(gb, tab->table, tab->bits, 1);
         }
     }
 }
@@ -1200,14 +1198,14 @@ static inline void gainc_loc_mode1(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
 /**
  * Decode location code for each gain control point.
  *
- * @param[in]     bc              the Bitstream context
+ * @param[in]     gb              the GetBit context
  * @param[in,out] ctx             ptr to the channel unit context
  * @param[in]     ch_num          channel to process
  * @param[in]     coded_subbands  number of subbands to process
  * @param[in]     avctx           ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_gainc_loc_codes(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                   int ch_num, int coded_subbands,
                                   AVCodecContext *avctx)
 {
@@ -1217,11 +1215,11 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
     Atrac3pChanParams *chan     = &ctx->channels[ch_num];
     Atrac3pChanParams *ref_chan = &ctx->channels[0];
 
-    switch (bitstream_read(bc, 2)) { /* switch according to coding mode */
+    switch (get_bits(gb, 2)) { /* switch according to coding mode */
     case 0: /* sequence of numbers in ascending order */
         for (sb = 0; sb < coded_subbands; sb++)
             for (i = 0; i < chan->gain_data[sb].num_points; i++)
-                gainc_loc_mode0(bc, ctx, &chan->gain_data[sb], i);
+                gainc_loc_mode0(gb, ctx, &chan->gain_data[sb], i);
         break;
     case 1:
         if (ch_num) {
@@ -1232,8 +1230,8 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                 ref = &ref_chan->gain_data[sb];
 
                 /* 1st value is vlc-coded modulo delta to master */
-                delta = bitstream_read_vlc(bc, gain_vlc_tabs[10].table,
-                                           gain_vlc_tabs[10].bits, 1);
+                delta = get_vlc2(gb, gain_vlc_tabs[10].table,
+                                 gain_vlc_tabs[10].bits, 1);
                 pred = ref->num_points > 0 ? ref->loc_code[0] : 0;
                 dst->loc_code[0] = (pred + delta) & 0x1F;
 
@@ -1243,19 +1241,19 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                         /* ascending curve */
                         if (more_than_ref) {
                             delta =
-                                bitstream_read_vlc(bc, gain_vlc_tabs[9].table,
-                                                   gain_vlc_tabs[9].bits, 1);
+                                get_vlc2(gb, gain_vlc_tabs[9].table,
+                                         gain_vlc_tabs[9].bits, 1);
                             dst->loc_code[i] = dst->loc_code[i - 1] + delta;
                         } else {
-                            if (bitstream_read_bit(bc))
-                                gainc_loc_mode0(bc, ctx, dst, i);  // direct coding
+                            if (get_bits1(gb))
+                                gainc_loc_mode0(gb, ctx, dst, i);  // direct coding
                             else
                                 dst->loc_code[i] = ref->loc_code[i];  // clone master
                         }
                     } else { /* descending curve */
                         tab   = more_than_ref ? &gain_vlc_tabs[7]
                                               : &gain_vlc_tabs[10];
-                        delta = bitstream_read_vlc(bc, tab->table, tab->bits, 1);
+                        delta = get_vlc2(gb, tab->table, tab->bits, 1);
                         if (more_than_ref)
                             dst->loc_code[i] = dst->loc_code[i - 1] + delta;
                         else
@@ -1265,7 +1263,7 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
             }
         } else /* VLC delta to previous */
             for (sb = 0; sb < coded_subbands; sb++)
-                gainc_loc_mode1(bc, ctx, &chan->gain_data[sb]);
+                gainc_loc_mode1(gb, ctx, &chan->gain_data[sb]);
         break;
     case 2:
         if (ch_num) {
@@ -1274,8 +1272,8 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                     continue;
                 dst = &chan->gain_data[sb];
                 ref = &ref_chan->gain_data[sb];
-                if (dst->num_points > ref->num_points || bitstream_read_bit(bc))
-                    gainc_loc_mode1(bc, ctx, dst);
+                if (dst->num_points > ref->num_points || get_bits1(gb))
+                    gainc_loc_mode1(gb, ctx, dst);
                 else /* clone master for the whole subband */
                     for (i = 0; i < chan->gain_data[sb].num_points; i++)
                         dst->loc_code[i] = ref->loc_code[i];
@@ -1283,7 +1281,7 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         } else {
             /* data for the first subband is coded directly */
             for (i = 0; i < chan->gain_data[0].num_points; i++)
-                gainc_loc_mode0(bc, ctx, &chan->gain_data[0], i);
+                gainc_loc_mode0(gb, ctx, &chan->gain_data[0], i);
 
             for (sb = 1; sb < coded_subbands; sb++) {
                 if (chan->gain_data[sb].num_points <= 0)
@@ -1292,8 +1290,8 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 
                 /* 1st value is vlc-coded modulo delta to the corresponding
                  * value of the previous subband if any or zero */
-                delta = bitstream_read_vlc(bc, gain_vlc_tabs[6].table,
-                                           gain_vlc_tabs[6].bits, 1);
+                delta = get_vlc2(gb, gain_vlc_tabs[6].table,
+                                 gain_vlc_tabs[6].bits, 1);
                 pred             = dst[-1].num_points > 0
                                    ? dst[-1].loc_code[0] : 0;
                 dst->loc_code[0] = (pred + delta) & 0x1F;
@@ -1304,7 +1302,7 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
                      * presence of prediction. */
                     tab = &gain_vlc_tabs[(dst->lev_code[i] > dst->lev_code[i - 1]) *
                                                    2 + more_than_ref + 6];
-                    delta = bitstream_read_vlc(bc, tab->table, tab->bits, 1);
+                    delta = get_vlc2(gb, tab->table, tab->bits, 1);
                     if (more_than_ref)
                         dst->loc_code[i] = dst->loc_code[i - 1] + delta;
                     else
@@ -1318,19 +1316,19 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
             for (sb = 0; sb < coded_subbands; sb++)
                 for (i = 0; i < chan->gain_data[sb].num_points; i++) {
                     if (i >= ref_chan->gain_data[sb].num_points)
-                        gainc_loc_mode0(bc, ctx, &chan->gain_data[sb], i);
+                        gainc_loc_mode0(gb, ctx, &chan->gain_data[sb], i);
                     else
                         chan->gain_data[sb].loc_code[i] =
                             ref_chan->gain_data[sb].loc_code[i];
                 }
         } else { /* shorter delta to min */
-            delta_bits = bitstream_read(bc, 2) + 1;
-            min_val    = bitstream_read(bc, 5);
+            delta_bits = get_bits(gb, 2) + 1;
+            min_val    = get_bits(gb, 5);
 
             for (sb = 0; sb < coded_subbands; sb++)
                 for (i = 0; i < chan->gain_data[sb].num_points; i++)
                     chan->gain_data[sb].loc_code[i] = min_val + i +
-                                                      bitstream_read(bc, delta_bits);
+                                                      get_bits(gb, delta_bits);
         }
         break;
     }
@@ -1355,13 +1353,13 @@ static int decode_gainc_loc_codes(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode gain control data for all channels.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_gainc_data(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_gainc_data(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                              int num_channels, AVCodecContext *avctx)
 {
     int ch_num, coded_subbands, sb, ret;
@@ -1370,16 +1368,16 @@ static int decode_gainc_data(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         memset(ctx->channels[ch_num].gain_data, 0,
                sizeof(*ctx->channels[ch_num].gain_data) * ATRAC3P_SUBBANDS);
 
-        if (bitstream_read_bit(bc)) { /* gain control data present? */
-            coded_subbands = bitstream_read(bc, 4) + 1;
-            if (bitstream_read_bit(bc)) /* is high band gain data replication on? */
-                ctx->channels[ch_num].num_gain_subbands = bitstream_read(bc, 4) + 1;
+        if (get_bits1(gb)) { /* gain control data present? */
+            coded_subbands = get_bits(gb, 4) + 1;
+            if (get_bits1(gb)) /* is high band gain data replication on? */
+                ctx->channels[ch_num].num_gain_subbands = get_bits(gb, 4) + 1;
             else
                 ctx->channels[ch_num].num_gain_subbands = coded_subbands;
 
-            if ((ret = decode_gainc_npoints(bc, ctx, ch_num, coded_subbands)) < 0 ||
-                (ret = decode_gainc_levels(bc, ctx, ch_num, coded_subbands))  < 0 ||
-                (ret = decode_gainc_loc_codes(bc, ctx, ch_num, coded_subbands, avctx)) < 0)
+            if ((ret = decode_gainc_npoints(gb, ctx, ch_num, coded_subbands)) < 0 ||
+                (ret = decode_gainc_levels(gb, ctx, ch_num, coded_subbands))  < 0 ||
+                (ret = decode_gainc_loc_codes(gb, ctx, ch_num, coded_subbands, avctx)) < 0)
                 return ret;
 
             if (coded_subbands > 0) { /* propagate gain data if requested */
@@ -1398,29 +1396,29 @@ static int decode_gainc_data(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode envelope for all tones of a channel.
  *
- * @param[in]     bc                the Bitstream context
+ * @param[in]     gb                the GetBit context
  * @param[in,out] ctx               ptr to the channel unit context
  * @param[in]     ch_num            channel to process
  * @param[in]     band_has_tones    ptr to an array of per-band-flags:
  *                                  1 - tone data present
  */
-static void decode_tones_envelope(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static void decode_tones_envelope(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                   int ch_num, int band_has_tones[])
 {
     int sb;
     Atrac3pWavesData *dst = ctx->channels[ch_num].tones_info;
     Atrac3pWavesData *ref = ctx->channels[0].tones_info;
 
-    if (!ch_num || !bitstream_read_bit(bc)) { /* mode 0: fixed-length coding */
+    if (!ch_num || !get_bits1(gb)) { /* mode 0: fixed-length coding */
         for (sb = 0; sb < ctx->waves_info->num_tone_bands; sb++) {
             if (!band_has_tones[sb])
                 continue;
-            dst[sb].pend_env.has_start_point = bitstream_read_bit(bc);
+            dst[sb].pend_env.has_start_point = get_bits1(gb);
             dst[sb].pend_env.start_pos       = dst[sb].pend_env.has_start_point
-                                               ? bitstream_read(bc, 5) : -1;
-            dst[sb].pend_env.has_stop_point  = bitstream_read_bit(bc);
+                                               ? get_bits(gb, 5) : -1;
+            dst[sb].pend_env.has_stop_point  = get_bits1(gb);
             dst[sb].pend_env.stop_pos        = dst[sb].pend_env.has_stop_point
-                                               ? bitstream_read(bc, 5) : 32;
+                                               ? get_bits(gb, 5) : 32;
         }
     } else { /* mode 1(slave only): copy master */
         for (sb = 0; sb < ctx->waves_info->num_tone_bands; sb++) {
@@ -1437,7 +1435,7 @@ static void decode_tones_envelope(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode number of tones for each subband of a channel.
  *
- * @param[in]     bc                the Bitstream context
+ * @param[in]     gb                the GetBit context
  * @param[in,out] ctx               ptr to the channel unit context
  * @param[in]     ch_num            channel to process
  * @param[in]     band_has_tones    ptr to an array of per-band-flags:
@@ -1445,7 +1443,7 @@ static void decode_tones_envelope(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
  * @param[in]     avctx             ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_band_numwavs(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_band_numwavs(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                int ch_num, int band_has_tones[],
                                AVCodecContext *avctx)
 {
@@ -1453,25 +1451,25 @@ static int decode_band_numwavs(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
     Atrac3pWavesData *dst = ctx->channels[ch_num].tones_info;
     Atrac3pWavesData *ref = ctx->channels[0].tones_info;
 
-    mode = bitstream_read(bc, ch_num + 1);
+    mode = get_bits(gb, ch_num + 1);
     switch (mode) {
     case 0: /** fixed-length coding */
         for (sb = 0; sb < ctx->waves_info->num_tone_bands; sb++)
             if (band_has_tones[sb])
-                dst[sb].num_wavs = bitstream_read(bc, 4);
+                dst[sb].num_wavs = get_bits(gb, 4);
         break;
     case 1: /** variable-length coding */
         for (sb = 0; sb < ctx->waves_info->num_tone_bands; sb++)
             if (band_has_tones[sb])
                 dst[sb].num_wavs =
-                    bitstream_read_vlc(bc, tone_vlc_tabs[1].table,
-                                       tone_vlc_tabs[1].bits, 1);
+                    get_vlc2(gb, tone_vlc_tabs[1].table,
+                             tone_vlc_tabs[1].bits, 1);
         break;
     case 2: /** VLC modulo delta to master (slave only) */
         for (sb = 0; sb < ctx->waves_info->num_tone_bands; sb++)
             if (band_has_tones[sb]) {
-                delta = bitstream_read_vlc(bc, tone_vlc_tabs[2].table,
-                                           tone_vlc_tabs[2].bits, 1);
+                delta = get_vlc2(gb, tone_vlc_tabs[2].table,
+                                 tone_vlc_tabs[2].bits, 1);
                 delta = sign_extend(delta, 3);
                 dst[sb].num_wavs = (ref[sb].num_wavs + delta) & 0xF;
             }
@@ -1503,13 +1501,13 @@ static int decode_band_numwavs(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
 /**
  * Decode frequency information for each subband of a channel.
  *
- * @param[in]     bc                the Bitstream context
+ * @param[in]     gb                the GetBit context
  * @param[in,out] ctx               ptr to the channel unit context
  * @param[in]     ch_num            channel to process
  * @param[in]     band_has_tones    ptr to an array of per-band-flags:
  *                                  1 - tone data present
  */
-static void decode_tones_frequency(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static void decode_tones_frequency(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                    int ch_num, int band_has_tones[])
 {
     int sb, i, direction, nbits, pred, delta;
@@ -1517,26 +1515,26 @@ static void decode_tones_frequency(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
     Atrac3pWavesData *dst = ctx->channels[ch_num].tones_info;
     Atrac3pWavesData *ref = ctx->channels[0].tones_info;
 
-    if (!ch_num || !bitstream_read_bit(bc)) { /* mode 0: fixed-length coding */
+    if (!ch_num || !get_bits1(gb)) { /* mode 0: fixed-length coding */
         for (sb = 0; sb < ctx->waves_info->num_tone_bands; sb++) {
             if (!band_has_tones[sb] || !dst[sb].num_wavs)
                 continue;
             iwav      = &ctx->waves_info->waves[dst[sb].start_index];
-            direction = (dst[sb].num_wavs > 1) ? bitstream_read_bit(bc) : 0;
+            direction = (dst[sb].num_wavs > 1) ? get_bits1(gb) : 0;
             if (direction) { /** packed numbers in descending order */
                 if (dst[sb].num_wavs)
-                    iwav[dst[sb].num_wavs - 1].freq_index = bitstream_read(bc, 10);
+                    iwav[dst[sb].num_wavs - 1].freq_index = get_bits(gb, 10);
                 for (i = dst[sb].num_wavs - 2; i >= 0 ; i--) {
                     nbits = av_log2(iwav[i+1].freq_index) + 1;
-                    iwav[i].freq_index = bitstream_read(bc, nbits);
+                    iwav[i].freq_index = get_bits(gb, nbits);
                 }
             } else { /** packed numbers in ascending order */
                 for (i = 0; i < dst[sb].num_wavs; i++) {
                     if (!i || iwav[i - 1].freq_index < 512)
-                        iwav[i].freq_index = bitstream_read(bc, 10);
+                        iwav[i].freq_index = get_bits(gb, 10);
                     else {
                         nbits = av_log2(1023 - iwav[i - 1].freq_index) + 1;
-                        iwav[i].freq_index = bitstream_read(bc, nbits) +
+                        iwav[i].freq_index = get_bits(gb, nbits) +
                                              1024 - (1 << nbits);
                     }
                 }
@@ -1549,8 +1547,8 @@ static void decode_tones_frequency(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
             iwav = &ctx->waves_info->waves[ref[sb].start_index];
             owav = &ctx->waves_info->waves[dst[sb].start_index];
             for (i = 0; i < dst[sb].num_wavs; i++) {
-                delta = bitstream_read_vlc(bc, tone_vlc_tabs[6].table,
-                                           tone_vlc_tabs[6].bits, 1);
+                delta = get_vlc2(gb, tone_vlc_tabs[6].table,
+                                 tone_vlc_tabs[6].bits, 1);
                 delta = sign_extend(delta, 8);
                 pred  = (i < ref[sb].num_wavs) ? iwav[i].freq_index :
                         (ref[sb].num_wavs ? iwav[ref[sb].num_wavs - 1].freq_index : 0);
@@ -1563,13 +1561,13 @@ static void decode_tones_frequency(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
 /**
  * Decode amplitude information for each subband of a channel.
  *
- * @param[in]     bc                the Bitstream context
+ * @param[in]     gb                the GetBit context
  * @param[in,out] ctx               ptr to the channel unit context
  * @param[in]     ch_num            channel to process
  * @param[in]     band_has_tones    ptr to an array of per-band-flags:
  *                                  1 - tone data present
  */
-static void decode_tones_amplitude(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static void decode_tones_amplitude(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                    int ch_num, int band_has_tones[])
 {
     int mode, sb, j, i, diff, maxdiff, fi, delta, pred;
@@ -1603,7 +1601,7 @@ static void decode_tones_amplitude(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
         }
     }
 
-    mode = bitstream_read(bc, ch_num + 1);
+    mode = get_bits(gb, ch_num + 1);
 
     switch (mode) {
     case 0: /** fixed-length coding */
@@ -1612,9 +1610,9 @@ static void decode_tones_amplitude(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
                 continue;
             if (ctx->waves_info->amplitude_mode)
                 for (i = 0; i < dst[sb].num_wavs; i++)
-                    ctx->waves_info->waves[dst[sb].start_index + i].amp_sf = bitstream_read(bc, 6);
+                    ctx->waves_info->waves[dst[sb].start_index + i].amp_sf = get_bits(gb, 6);
             else
-                ctx->waves_info->waves[dst[sb].start_index].amp_sf = bitstream_read(bc, 6);
+                ctx->waves_info->waves[dst[sb].start_index].amp_sf = get_bits(gb, 6);
         }
         break;
     case 1: /** min + VLC delta */
@@ -1624,12 +1622,12 @@ static void decode_tones_amplitude(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
             if (ctx->waves_info->amplitude_mode)
                 for (i = 0; i < dst[sb].num_wavs; i++)
                     ctx->waves_info->waves[dst[sb].start_index + i].amp_sf =
-                        bitstream_read_vlc(bc, tone_vlc_tabs[3].table,
-                                           tone_vlc_tabs[3].bits, 1) + 20;
+                        get_vlc2(gb, tone_vlc_tabs[3].table,
+                                 tone_vlc_tabs[3].bits, 1) + 20;
             else
                 ctx->waves_info->waves[dst[sb].start_index].amp_sf =
-                    bitstream_read_vlc(bc, tone_vlc_tabs[4].table,
-                                       tone_vlc_tabs[4].bits, 1) + 24;
+                    get_vlc2(gb, tone_vlc_tabs[4].table,
+                             tone_vlc_tabs[4].bits, 1) + 24;
         }
         break;
     case 2: /** VLC modulo delta to master (slave only) */
@@ -1637,8 +1635,8 @@ static void decode_tones_amplitude(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
             if (!band_has_tones[sb] || !dst[sb].num_wavs)
                 continue;
             for (i = 0; i < dst[sb].num_wavs; i++) {
-                delta = bitstream_read_vlc(bc, tone_vlc_tabs[5].table,
-                                           tone_vlc_tabs[5].bits, 1);
+                delta = get_vlc2(gb, tone_vlc_tabs[5].table,
+                                 tone_vlc_tabs[5].bits, 1);
                 delta = sign_extend(delta, 5);
                 pred  = refwaves[dst[sb].start_index + i] >= 0 ?
                         ctx->waves_info->waves[refwaves[dst[sb].start_index + i]].amp_sf : 34;
@@ -1663,13 +1661,13 @@ static void decode_tones_amplitude(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
 /**
  * Decode phase information for each subband of a channel.
  *
- * @param[in]     bc                the Bitstream context
+ * @param[in]     gb                the GetBit context
  * @param[in,out] ctx               ptr to the channel unit context
  * @param[in]     ch_num            channel to process
  * @param[in]     band_has_tones    ptr to an array of per-band-flags:
  *                                  1 - tone data present
  */
-static void decode_tones_phase(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static void decode_tones_phase(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                int ch_num, int band_has_tones[])
 {
     int sb, i;
@@ -1681,20 +1679,20 @@ static void decode_tones_phase(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
             continue;
         wparam = &ctx->waves_info->waves[dst[sb].start_index];
         for (i = 0; i < dst[sb].num_wavs; i++)
-            wparam[i].phase_index = bitstream_read(bc, 5);
+            wparam[i].phase_index = get_bits(gb, 5);
     }
 }
 
 /**
  * Decode tones info for all channels.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-static int decode_tones_info(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+static int decode_tones_info(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                              int num_channels, AVCodecContext *avctx)
 {
     int ch_num, i, ret;
@@ -1704,30 +1702,26 @@ static int decode_tones_info(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         memset(ctx->channels[ch_num].tones_info, 0,
                sizeof(*ctx->channels[ch_num].tones_info) * ATRAC3P_SUBBANDS);
 
-    ctx->waves_info->tones_present = bitstream_read_bit(bc);
+    ctx->waves_info->tones_present = get_bits1(gb);
     if (!ctx->waves_info->tones_present)
         return 0;
 
     memset(ctx->waves_info->waves, 0, sizeof(ctx->waves_info->waves));
 
-    ctx->waves_info->amplitude_mode = bitstream_read_bit(bc);
+    ctx->waves_info->amplitude_mode = get_bits1(gb);
     if (!ctx->waves_info->amplitude_mode) {
         avpriv_report_missing_feature(avctx, "GHA amplitude mode 0");
         return AVERROR_PATCHWELCOME;
     }
 
     ctx->waves_info->num_tone_bands =
-        bitstream_read_vlc(bc, tone_vlc_tabs[0].table,
-                           tone_vlc_tabs[0].bits, 1) + 1;
+        get_vlc2(gb, tone_vlc_tabs[0].table,
+                 tone_vlc_tabs[0].bits, 1) + 1;
 
     if (num_channels == 2) {
-        get_subband_flags(bc, ctx->waves_info->tone_sharing, ctx->waves_info->num_tone_bands);
-        get_subband_flags(bc, ctx->waves_info->tone_master,  ctx->waves_info->num_tone_bands);
-        if (get_subband_flags(bc, ctx->waves_info->phase_shift,
-                              ctx->waves_info->num_tone_bands)) {
-            avpriv_report_missing_feature(avctx, "GHA Phase shifting");
-            return AVERROR_PATCHWELCOME;
-        }
+        get_subband_flags(gb, ctx->waves_info->tone_sharing, ctx->waves_info->num_tone_bands);
+        get_subband_flags(gb, ctx->waves_info->tone_master,  ctx->waves_info->num_tone_bands);
+        get_subband_flags(gb, ctx->waves_info->invert_phase, ctx->waves_info->num_tone_bands);
     }
 
     ctx->waves_info->tones_index = 0;
@@ -1736,14 +1730,14 @@ static int decode_tones_info(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
         for (i = 0; i < ctx->waves_info->num_tone_bands; i++)
             band_has_tones[i] = !ch_num ? 1 : !ctx->waves_info->tone_sharing[i];
 
-        decode_tones_envelope(bc, ctx, ch_num, band_has_tones);
-        if ((ret = decode_band_numwavs(bc, ctx, ch_num, band_has_tones,
+        decode_tones_envelope(gb, ctx, ch_num, band_has_tones);
+        if ((ret = decode_band_numwavs(gb, ctx, ch_num, band_has_tones,
                                        avctx)) < 0)
             return ret;
 
-        decode_tones_frequency(bc, ctx, ch_num, band_has_tones);
-        decode_tones_amplitude(bc, ctx, ch_num, band_has_tones);
-        decode_tones_phase(bc, ctx, ch_num, band_has_tones);
+        decode_tones_frequency(gb, ctx, ch_num, band_has_tones);
+        decode_tones_amplitude(gb, ctx, ch_num, band_has_tones);
+        decode_tones_phase(gb, ctx, ch_num, band_has_tones);
     }
 
     if (num_channels == 2) {
@@ -1760,13 +1754,13 @@ static int decode_tones_info(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
     return 0;
 }
 
-int ff_atrac3p_decode_channel_unit(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
+int ff_atrac3p_decode_channel_unit(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
                                    int num_channels, AVCodecContext *avctx)
 {
     int ret;
 
     /* parse sound header */
-    ctx->num_quant_units = bitstream_read(bc, 5) + 1;
+    ctx->num_quant_units = get_bits(gb, 5) + 1;
     if (ctx->num_quant_units > 28 && ctx->num_quant_units < 32) {
         av_log(avctx, AV_LOG_ERROR,
                "Invalid number of quantization units: %d!\n",
@@ -1774,10 +1768,10 @@ int ff_atrac3p_decode_channel_unit(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
         return AVERROR_INVALIDDATA;
     }
 
-    ctx->mute_flag = bitstream_read_bit(bc);
+    ctx->mute_flag = get_bits1(gb);
 
     /* decode various sound parameters */
-    if ((ret = decode_quant_wordlen(bc, ctx, num_channels, avctx)) < 0)
+    if ((ret = decode_quant_wordlen(gb, ctx, num_channels, avctx)) < 0)
         return ret;
 
     ctx->num_subbands       = atrac3p_qu_to_subband[ctx->num_quant_units - 1] + 1;
@@ -1785,32 +1779,32 @@ int ff_atrac3p_decode_channel_unit(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx
                               ? atrac3p_qu_to_subband[ctx->used_quant_units - 1] + 1
                               : 0;
 
-    if ((ret = decode_scale_factors(bc, ctx, num_channels, avctx)) < 0)
+    if ((ret = decode_scale_factors(gb, ctx, num_channels, avctx)) < 0)
         return ret;
 
-    if ((ret = decode_code_table_indexes(bc, ctx, num_channels, avctx)) < 0)
+    if ((ret = decode_code_table_indexes(gb, ctx, num_channels, avctx)) < 0)
         return ret;
 
-    decode_spectrum(bc, ctx, num_channels, avctx);
+    decode_spectrum(gb, ctx, num_channels, avctx);
 
     if (num_channels == 2) {
-        get_subband_flags(bc, ctx->swap_channels, ctx->num_coded_subbands);
-        get_subband_flags(bc, ctx->negate_coeffs, ctx->num_coded_subbands);
+        get_subband_flags(gb, ctx->swap_channels, ctx->num_coded_subbands);
+        get_subband_flags(gb, ctx->negate_coeffs, ctx->num_coded_subbands);
     }
 
-    decode_window_shape(bc, ctx, num_channels);
+    decode_window_shape(gb, ctx, num_channels);
 
-    if ((ret = decode_gainc_data(bc, ctx, num_channels, avctx)) < 0)
+    if ((ret = decode_gainc_data(gb, ctx, num_channels, avctx)) < 0)
         return ret;
 
-    if ((ret = decode_tones_info(bc, ctx, num_channels, avctx)) < 0)
+    if ((ret = decode_tones_info(gb, ctx, num_channels, avctx)) < 0)
         return ret;
 
     /* decode global noise info */
-    ctx->noise_present = bitstream_read_bit(bc);
+    ctx->noise_present = get_bits1(gb);
     if (ctx->noise_present) {
-        ctx->noise_level_index = bitstream_read(bc, 4);
-        ctx->noise_table_index = bitstream_read(bc, 4);
+        ctx->noise_level_index = get_bits(gb, 4);
+        ctx->noise_table_index = get_bits(gb, 4);
     }
 
     return 0;
diff --git a/libavcodec/atrac3plus.h b/libavcodec/atrac3plus.h
index cca399a..3c39e29 100644
--- a/libavcodec/atrac3plus.h
+++ b/libavcodec/atrac3plus.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,11 +31,10 @@
 #include <stdint.h>
 
 #include "libavutil/float_dsp.h"
-
 #include "atrac.h"
-#include "bitstream.h"
 #include "avcodec.h"
 #include "fft.h"
+#include "get_bits.h"
 
 /** Global unit sizes */
 #define ATRAC3P_SUBBANDS        16  ///< number of PQF subbands
@@ -123,7 +122,7 @@ typedef struct Atrac3pWaveSynthParams {
     int num_tone_bands;                     ///< number of PQF bands with tones
     uint8_t tone_sharing[ATRAC3P_SUBBANDS]; ///< 1 - subband-wise tone sharing flags
     uint8_t tone_master[ATRAC3P_SUBBANDS];  ///< 1 - subband-wise tone channel swapping
-    uint8_t phase_shift[ATRAC3P_SUBBANDS];  ///< 1 - subband-wise 180° phase shifting
+    uint8_t invert_phase[ATRAC3P_SUBBANDS]; ///< 1 - subband-wise phase inversion
     int tones_index;                        ///< total sum of tones in this unit
     Atrac3pWaveParam waves[48];
 } Atrac3pWaveSynthParams;
@@ -156,22 +155,20 @@ typedef struct Atrac3pChanUnitCtx {
 
 /**
  * Initialize VLC tables for bitstream parsing.
- *
- * @param[in]   codec    ptr to the AVCodec
  */
-void ff_atrac3p_init_vlcs(AVCodec *codec);
+void ff_atrac3p_init_vlcs(void);
 
 /**
  * Decode bitstream data of a channel unit.
  *
- * @param[in]     bc            the Bitstream context
+ * @param[in]     gb            the GetBit context
  * @param[in,out] ctx           ptr to the channel unit context
  * @param[in]     num_channels  number of channels to process
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-int  ff_atrac3p_decode_channel_unit(BitstreamContext *bc, Atrac3pChanUnitCtx *ctx,
-                                    int num_channels, AVCodecContext *avctx);
+int ff_atrac3p_decode_channel_unit(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
+                                   int num_channels, AVCodecContext *avctx);
 
 /**
  * Initialize IMDCT transform.
@@ -202,13 +199,14 @@ void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit, AVFloatDSPContext *f
  * Perform power compensation aka noise dithering.
  *
  * @param[in]      ctx         ptr to the channel context
+ * @param[in]      fdsp        pointer to float DSP context
  * @param[in]      ch_index    which channel to process
  * @param[in,out]  sp          ptr to channel spectrum to process
  * @param[in]      rng_index   indicates which RNG table to use
  * @param[in]      sb_num      which subband to process
  */
-void ff_atrac3p_power_compensation(Atrac3pChanUnitCtx *ctx, int ch_index,
-                                   float *sp, int rng_index, int sb_num);
+void ff_atrac3p_power_compensation(Atrac3pChanUnitCtx *ctx, AVFloatDSPContext *fdsp,
+                                   int ch_index, float *sp, int rng_index, int sb_num);
 
 /**
  * Regular IMDCT and windowing without overlapping,
diff --git a/libavcodec/atrac3plus_data.h b/libavcodec/atrac3plus_data.h
index 5026a59..2a107ee 100644
--- a/libavcodec/atrac3plus_data.h
+++ b/libavcodec/atrac3plus_data.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3plusdec.c b/libavcodec/atrac3plusdec.c
index 17774d5..666d1a5 100644
--- a/libavcodec/atrac3plusdec.c
+++ b/libavcodec/atrac3plusdec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,16 +39,15 @@
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "atrac.h"
 #include "atrac3plus.h"
 
 typedef struct ATRAC3PContext {
-    BitstreamContext bc;
-    AVFloatDSPContext fdsp;
+    GetBitContext gb;
+    AVFloatDSPContext *fdsp;
 
     DECLARE_ALIGNED(32, float, samples)[2][ATRAC3P_FRAME_SAMPLES];  ///< quantized MDCT spectrum
     DECLARE_ALIGNED(32, float, mdct_buf)[2][ATRAC3P_FRAME_SAMPLES]; ///< output of the IMDCT
@@ -68,7 +67,13 @@ typedef struct ATRAC3PContext {
 
 static av_cold int atrac3p_decode_close(AVCodecContext *avctx)
 {
-    av_free(((ATRAC3PContext *)(avctx->priv_data))->ch_units);
+    ATRAC3PContext *ctx = avctx->priv_data;
+
+    av_freep(&ctx->ch_units);
+    av_freep(&ctx->fdsp);
+
+    ff_mdct_end(&ctx->mdct_ctx);
+    ff_mdct_end(&ctx->ipqf_dct_ctx);
 
     return 0;
 }
@@ -149,7 +154,7 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    avpriv_float_dsp_init(&ctx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    ff_atrac3p_init_vlcs();
 
     /* initialize IPQF */
     ff_mdct_init(&ctx->ipqf_dct_ctx, 5, 1, 32.0 / 32768.0);
@@ -165,9 +170,10 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
 
     ctx->my_channel_layout = avctx->channel_layout;
 
-    ctx->ch_units = av_mallocz(sizeof(*ctx->ch_units) *
-                               ctx->num_channel_blocks);
-    if (!ctx->ch_units) {
+    ctx->ch_units = av_mallocz_array(ctx->num_channel_blocks, sizeof(*ctx->ch_units));
+    ctx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+
+    if (!ctx->ch_units || !ctx->fdsp) {
         atrac3p_decode_close(avctx);
         return AVERROR(ENOMEM);
     }
@@ -192,7 +198,7 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static void decode_residual_spectrum(Atrac3pChanUnitCtx *ctx,
+static void decode_residual_spectrum(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
                                      float out[2][ATRAC3P_FRAME_SAMPLES],
                                      int num_channels,
                                      AVCodecContext *avctx)
@@ -203,17 +209,17 @@ static void decode_residual_spectrum(Atrac3pChanUnitCtx *ctx,
     /* calculate RNG table index for each subband */
     int sb_RNG_index[ATRAC3P_SUBBANDS] = { 0 };
 
-    if (ctx->mute_flag) {
+    if (ch_unit->mute_flag) {
         for (ch = 0; ch < num_channels; ch++)
             memset(out[ch], 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out[ch]));
         return;
     }
 
-    for (qu = 0, RNG_index = 0; qu < ctx->used_quant_units; qu++)
-        RNG_index += ctx->channels[0].qu_sf_idx[qu] +
-                     ctx->channels[1].qu_sf_idx[qu];
+    for (qu = 0, RNG_index = 0; qu < ch_unit->used_quant_units; qu++)
+        RNG_index += ch_unit->channels[0].qu_sf_idx[qu] +
+                     ch_unit->channels[1].qu_sf_idx[qu];
 
-    for (sb = 0; sb < ctx->num_coded_subbands; sb++, RNG_index += 128)
+    for (sb = 0; sb < ch_unit->num_coded_subbands; sb++, RNG_index += 128)
         sb_RNG_index[sb] = RNG_index & 0x3FC;
 
     /* inverse quant and power compensation */
@@ -221,35 +227,35 @@ static void decode_residual_spectrum(Atrac3pChanUnitCtx *ctx,
         /* clear channel's residual spectrum */
         memset(out[ch], 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out[ch]));
 
-        for (qu = 0; qu < ctx->used_quant_units; qu++) {
-            src        = &ctx->channels[ch].spectrum[ff_atrac3p_qu_to_spec_pos[qu]];
+        for (qu = 0; qu < ch_unit->used_quant_units; qu++) {
+            src        = &ch_unit->channels[ch].spectrum[ff_atrac3p_qu_to_spec_pos[qu]];
             dst        = &out[ch][ff_atrac3p_qu_to_spec_pos[qu]];
             nspeclines = ff_atrac3p_qu_to_spec_pos[qu + 1] -
                          ff_atrac3p_qu_to_spec_pos[qu];
 
-            if (ctx->channels[ch].qu_wordlen[qu] > 0) {
-                q = ff_atrac3p_sf_tab[ctx->channels[ch].qu_sf_idx[qu]] *
-                    ff_atrac3p_mant_tab[ctx->channels[ch].qu_wordlen[qu]];
+            if (ch_unit->channels[ch].qu_wordlen[qu] > 0) {
+                q = ff_atrac3p_sf_tab[ch_unit->channels[ch].qu_sf_idx[qu]] *
+                    ff_atrac3p_mant_tab[ch_unit->channels[ch].qu_wordlen[qu]];
                 for (i = 0; i < nspeclines; i++)
                     dst[i] = src[i] * q;
             }
         }
 
-        for (sb = 0; sb < ctx->num_coded_subbands; sb++)
-            ff_atrac3p_power_compensation(ctx, ch, &out[ch][0],
+        for (sb = 0; sb < ch_unit->num_coded_subbands; sb++)
+            ff_atrac3p_power_compensation(ch_unit, ctx->fdsp, ch, &out[ch][0],
                                           sb_RNG_index[sb], sb);
     }
 
-    if (ctx->unit_type == CH_UNIT_STEREO) {
-        for (sb = 0; sb < ctx->num_coded_subbands; sb++) {
-            if (ctx->swap_channels[sb]) {
+    if (ch_unit->unit_type == CH_UNIT_STEREO) {
+        for (sb = 0; sb < ch_unit->num_coded_subbands; sb++) {
+            if (ch_unit->swap_channels[sb]) {
                 for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES; i++)
                     FFSWAP(float, out[0][sb * ATRAC3P_SUBBAND_SAMPLES + i],
                                   out[1][sb * ATRAC3P_SUBBAND_SAMPLES + i]);
             }
 
             /* flip coefficients' sign if requested */
-            if (ctx->negate_coeffs[sb])
+            if (ch_unit->negate_coeffs[sb])
                 for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES; i++)
                     out[1][sb * ATRAC3P_SUBBAND_SAMPLES + i] = -(out[1][sb * ATRAC3P_SUBBAND_SAMPLES + i]);
         }
@@ -264,7 +270,7 @@ static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
     for (ch = 0; ch < num_channels; ch++) {
         for (sb = 0; sb < ch_unit->num_subbands; sb++) {
             /* inverse transform and windowing */
-            ff_atrac3p_imdct(&ctx->fdsp, &ctx->mdct_ctx,
+            ff_atrac3p_imdct(ctx->fdsp, &ctx->mdct_ctx,
                              &ctx->samples[ch][sb * ATRAC3P_SUBBAND_SAMPLES],
                              &ctx->mdct_buf[ch][sb * ATRAC3P_SUBBAND_SAMPLES],
                              (ch_unit->channels[ch].wnd_shape_prev[sb] << 1) +
@@ -298,7 +304,7 @@ static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
             for (sb = 0; sb < ch_unit->num_subbands; sb++)
                 if (ch_unit->channels[ch].tones_info[sb].num_wavs ||
                     ch_unit->channels[ch].tones_info_prev[sb].num_wavs) {
-                    ff_atrac3p_generate_tones(ch_unit, &ctx->fdsp, ch, sb,
+                    ff_atrac3p_generate_tones(ch_unit, ctx->fdsp, ch, sb,
                                               &ctx->time_buf[ch][sb * 128]);
                 }
         }
@@ -330,21 +336,19 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
     float **samples_p = (float **)frame->extended_data;
 
     frame->nb_samples = ATRAC3P_FRAME_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
-    if ((ret = bitstream_init8(&ctx->bc, avpkt->data, avpkt->size)) < 0)
+    if ((ret = init_get_bits8(&ctx->gb, avpkt->data, avpkt->size)) < 0)
         return ret;
 
-    if (bitstream_read_bit(&ctx->bc)) {
+    if (get_bits1(&ctx->gb)) {
         av_log(avctx, AV_LOG_ERROR, "Invalid start bit!\n");
         return AVERROR_INVALIDDATA;
     }
 
-    while (bitstream_bits_left(&ctx->bc) >= 2 &&
-           (ch_unit_id = bitstream_read(&ctx->bc, 2)) != CH_UNIT_TERMINATOR) {
+    while (get_bits_left(&ctx->gb) >= 2 &&
+           (ch_unit_id = get_bits(&ctx->gb, 2)) != CH_UNIT_TERMINATOR) {
         if (ch_unit_id == CH_UNIT_EXTENSION) {
             avpriv_report_missing_feature(avctx, "Channel unit extension");
             return AVERROR_PATCHWELCOME;
@@ -359,13 +363,13 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
         ctx->ch_units[ch_block].unit_type = ch_unit_id;
         channels_to_process               = ch_unit_id + 1;
 
-        if ((ret = ff_atrac3p_decode_channel_unit(&ctx->bc,
+        if ((ret = ff_atrac3p_decode_channel_unit(&ctx->gb,
                                                   &ctx->ch_units[ch_block],
                                                   channels_to_process,
                                                   avctx)) < 0)
             return ret;
 
-        decode_residual_spectrum(&ctx->ch_units[ch_block], ctx->samples,
+        decode_residual_spectrum(ctx, &ctx->ch_units[ch_block], ctx->samples,
                                  channels_to_process, avctx);
         reconstruct_frame(ctx, &ctx->ch_units[ch_block],
                           channels_to_process, avctx);
@@ -380,18 +384,29 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return avctx->block_align;
+    return avctx->codec_id == AV_CODEC_ID_ATRAC3P ? FFMIN(avctx->block_align, avpkt->size) : avpkt->size;
 }
 
 AVCodec ff_atrac3p_decoder = {
-    .name             = "atrac3plus",
-    .long_name        = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"),
-    .type             = AVMEDIA_TYPE_AUDIO,
-    .id               = AV_CODEC_ID_ATRAC3P,
-    .capabilities     = AV_CODEC_CAP_DR1,
-    .priv_data_size   = sizeof(ATRAC3PContext),
-    .init             = atrac3p_decode_init,
-    .init_static_data = ff_atrac3p_init_vlcs,
-    .close            = atrac3p_decode_close,
-    .decode           = atrac3p_decode_frame,
+    .name           = "atrac3plus",
+    .long_name      = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ATRAC3P,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(ATRAC3PContext),
+    .init           = atrac3p_decode_init,
+    .close          = atrac3p_decode_close,
+    .decode         = atrac3p_decode_frame,
+};
+
+AVCodec ff_atrac3pal_decoder = {
+    .name           = "atrac3plusal",
+    .long_name      = NULL_IF_CONFIG_SMALL("ATRAC3+ AL (Adaptive TRansform Acoustic Coding 3+ Advanced Lossless)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ATRAC3PAL,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(ATRAC3PContext),
+    .init           = atrac3p_decode_init,
+    .close          = atrac3p_decode_close,
+    .decode         = atrac3p_decode_frame,
 };
diff --git a/libavcodec/atrac3plusdsp.c b/libavcodec/atrac3plusdsp.c
index 468f098..96aa402 100644
--- a/libavcodec/atrac3plusdsp.c
+++ b/libavcodec/atrac3plusdsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include <math.h>
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "sinewin.h"
 #include "fft.h"
@@ -107,7 +108,7 @@ av_cold void ff_atrac3p_init_wave_synth(void)
 
     /* generate amplitude scalefactors table */
     for (i = 0; i < 64; i++)
-        amp_sf_tab[i] = pow(2.0f, ((double)i - 3) / 4.0f);
+        amp_sf_tab[i] = exp2f((i - 3) / 4.0f);
 }
 
 /**
@@ -116,14 +117,16 @@ av_cold void ff_atrac3p_init_wave_synth(void)
  *  @param[in]    synth_param   ptr to common synthesis parameters
  *  @param[in]    waves_info    parameters for each sine wave
  *  @param[in]    envelope      envelope data for all waves in a group
- *  @param[in]    phase_shift   flag indicates 180° phase shift
+ *  @param[in]    fdsp          ptr to floating-point DSP context
+ *  @param[in]    invert_phase  flag indicating 180° phase shift
  *  @param[in]    reg_offset    region offset for trimming envelope data
  *  @param[out]   out           receives sythesized data
  */
 static void waves_synth(Atrac3pWaveSynthParams *synth_param,
                         Atrac3pWavesData *waves_info,
                         Atrac3pWaveEnvelope *envelope,
-                        int phase_shift, int reg_offset, float *out)
+                        AVFloatDSPContext *fdsp,
+                        int invert_phase, int reg_offset, float *out)
 {
     int i, wn, inc, pos;
     double amp;
@@ -146,6 +149,10 @@ static void waves_synth(Atrac3pWaveSynthParams *synth_param,
         }
     }
 
+    /* invert phase if requested */
+    if (invert_phase)
+        fdsp->vector_fmul_scalar(out, out, -1.0f, 128);
+
     /* fade in with steep Hann window if requested */
     if (envelope->has_start_point) {
         pos = (envelope->start_pos << 2) - reg_offset;
@@ -216,12 +223,12 @@ void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit, AVFloatDSPContext *f
     /* synthesize waves for both overlapping regions */
     if (tones_now->num_wavs && reg1_env_nonzero)
         waves_synth(ch_unit->waves_info_prev, tones_now, &tones_now->curr_env,
-                    ch_unit->waves_info_prev->phase_shift[sb] & ch_num,
+                    fdsp, ch_unit->waves_info_prev->invert_phase[sb] & ch_num,
                     128, wavreg1);
 
     if (tones_next->num_wavs && reg2_env_nonzero)
-        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env,
-                    ch_unit->waves_info->phase_shift[sb] & ch_num, 0, wavreg2);
+        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env, fdsp,
+                    ch_unit->waves_info->invert_phase[sb] & ch_num, 0, wavreg2);
 
     /* Hann windowing for non-faded wave signals */
     if (tones_now->num_wavs && tones_next->num_wavs &&
@@ -408,11 +415,12 @@ static const int subband_to_qu[17] = {
     0, 8, 12, 16, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
 };
 
-void ff_atrac3p_power_compensation(Atrac3pChanUnitCtx *ctx, int ch_index,
-                                   float *sp, int rng_index, int sb)
+void ff_atrac3p_power_compensation(Atrac3pChanUnitCtx *ctx, AVFloatDSPContext *fdsp,
+                                   int ch_index, float *sp, int rng_index, int sb)
 {
     AtracGainInfo *g1, *g2;
-    float pwcsp[ATRAC3P_SUBBAND_SAMPLES], *dst, grp_lev, qu_lev;
+    LOCAL_ALIGNED_32(float, pwcsp, [ATRAC3P_SUBBAND_SAMPLES]);
+    float *dst, grp_lev, qu_lev;
     int i, gain_lev, gcv = 0, qu, nsp;
     int swap_ch = (ctx->unit_type == CH_UNIT_STEREO && ctx->swap_channels[sb]) ? 1 : 0;
 
@@ -449,8 +457,7 @@ void ff_atrac3p_power_compensation(Atrac3pChanUnitCtx *ctx, int ch_index,
         dst = &sp[ff_atrac3p_qu_to_spec_pos[qu]];
         nsp = ff_atrac3p_qu_to_spec_pos[qu + 1] - ff_atrac3p_qu_to_spec_pos[qu];
 
-        for (i = 0; i < nsp; i++)
-            dst[i] += pwcsp[i] * qu_lev;
+        fdsp->vector_fmac_scalar(dst, pwcsp, qu_lev, nsp);
     }
 }
 
@@ -599,8 +606,8 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
                      const float *in, float *out)
 {
     int i, s, sb, t, pos_now, pos_next;
-    DECLARE_ALIGNED(32, float, idct_in)[ATRAC3P_SUBBANDS];
-    DECLARE_ALIGNED(32, float, idct_out)[ATRAC3P_SUBBANDS];
+    LOCAL_ALIGNED(32, float, idct_in, [ATRAC3P_SUBBANDS]);
+    LOCAL_ALIGNED(32, float, idct_out, [ATRAC3P_SUBBANDS]);
 
     memset(out, 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out));
 
diff --git a/libavcodec/atrac9dec.c b/libavcodec/atrac9dec.c
new file mode 100644
index 0000000..805d46f
--- /dev/null
+++ b/libavcodec/atrac9dec.c
@@ -0,0 +1,954 @@
+/*
+ * ATRAC9 decoder
+ * Copyright (c) 2018 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "internal.h"
+#include "get_bits.h"
+#include "fft.h"
+#include "atrac9tab.h"
+#include "libavutil/lfg.h"
+#include "libavutil/float_dsp.h"
+
+typedef struct ATRAC9ChannelData {
+    int band_ext;
+    int q_unit_cnt;
+    int band_ext_data[4];
+    int32_t scalefactors[31];
+    int32_t scalefactors_prev[31];
+
+    int precision_coarse[30];
+    int precision_fine[30];
+    int precision_mask[30];
+
+    int codebookset[30];
+
+    int32_t q_coeffs_coarse[256];
+    int32_t q_coeffs_fine[256];
+
+    DECLARE_ALIGNED(32, float, coeffs  )[256];
+    DECLARE_ALIGNED(32, float, prev_win)[128];
+} ATRAC9ChannelData;
+
+typedef struct ATRAC9BlockData {
+    ATRAC9ChannelData channel[2];
+
+    /* Base */
+    int band_count;
+    int q_unit_cnt;
+    int q_unit_cnt_prev;
+
+    /* Stereo block only */
+    int stereo_q_unit;
+
+    /* Band extension only */
+    int has_band_ext;
+    int has_band_ext_data;
+    int band_ext_q_unit;
+
+    /* Gradient */
+    int grad_mode;
+    int grad_boundary;
+    int gradient[31];
+
+    /* Stereo */
+    int cpe_base_channel;
+    int is_signs[30];
+
+} ATRAC9BlockData;
+
+typedef struct ATRAC9Context {
+    AVCodecContext *avctx;
+    AVFloatDSPContext *fdsp;
+    FFTContext imdct;
+    ATRAC9BlockData block[5];
+    AVLFG lfg;
+
+    /* Set on init */
+    int frame_log2;
+    int avg_frame_size;
+    int frame_count;
+    int samplerate_idx;
+    const ATRAC9BlockConfig *block_config;
+
+    /* Generated on init */
+    VLC sf_vlc[2][8];            /* Signed/unsigned, length */
+    VLC coeff_vlc[2][8][4];      /* Cookbook, precision, cookbook index */
+    uint8_t alloc_curve[48][48];
+    DECLARE_ALIGNED(32, float, imdct_win)[256];
+
+    DECLARE_ALIGNED(32, float, temp)[256];
+} ATRAC9Context;
+
+static inline int parse_gradient(ATRAC9Context *s, ATRAC9BlockData *b,
+                                 GetBitContext *gb)
+{
+    int grad_range[2];
+    int grad_value[2];
+    int values, sign, base;
+    uint8_t *curve;
+    float scale;
+
+    b->grad_mode = get_bits(gb, 2);
+    if (b->grad_mode) {
+        grad_range[0] = get_bits(gb, 5);
+        grad_range[1] = 31;
+        grad_value[0] = get_bits(gb, 5);
+        grad_value[1] = 31;
+    } else {
+        grad_range[0] = get_bits(gb, 6);
+        grad_range[1] = get_bits(gb, 6) + 1;
+        grad_value[0] = get_bits(gb, 5);
+        grad_value[1] = get_bits(gb, 5);
+    }
+    b->grad_boundary = get_bits(gb, 4);
+
+    if (grad_range[0] >= grad_range[1] || grad_range[1] > 47)
+        return AVERROR_INVALIDDATA;
+
+    if (grad_value[0] > 31 || grad_value[1] > 31)
+        return AVERROR_INVALIDDATA;
+
+    if (b->grad_boundary > b->q_unit_cnt)
+        return AVERROR_INVALIDDATA;
+
+    values    = grad_value[1] - grad_value[0];
+    sign      = 1 - 2*(values < 0);
+    base      = grad_value[0] + sign;
+    scale     = (FFABS(values) - 1) / 31.0f;
+    curve     = s->alloc_curve[grad_range[1] - grad_range[0] - 1];
+
+    for (int i = 0; i <= b->q_unit_cnt; i++)
+        b->gradient[i] = grad_value[i >= grad_range[0]];
+
+    for (int i = grad_range[0]; i < grad_range[1]; i++)
+        b->gradient[i] = base + sign*((int)(scale*curve[i - grad_range[0]]));
+
+    return 0;
+}
+
+static inline void calc_precision(ATRAC9Context *s, ATRAC9BlockData *b,
+                                  ATRAC9ChannelData *c)
+{
+    memset(c->precision_mask, 0, sizeof(c->precision_mask));
+    for (int i = 1; i < b->q_unit_cnt; i++) {
+        const int delta = FFABS(c->scalefactors[i] - c->scalefactors[i - 1]) - 1;
+        if (delta > 0) {
+            const int neg = c->scalefactors[i - 1] > c->scalefactors[i];
+            c->precision_mask[i - neg] += FFMIN(delta, 5);
+        }
+    }
+
+    if (b->grad_mode) {
+        for (int i = 0; i < b->q_unit_cnt; i++) {
+            c->precision_coarse[i] = c->scalefactors[i];
+            c->precision_coarse[i] += c->precision_mask[i] - b->gradient[i];
+            if (c->precision_coarse[i] < 0)
+                continue;
+            switch (b->grad_mode) {
+            case 1:
+                c->precision_coarse[i] >>= 1;
+                break;
+            case 2:
+                c->precision_coarse[i] = (3 * c->precision_coarse[i]) >> 3;
+                break;
+            case 3:
+                c->precision_coarse[i] >>= 2;
+                break;
+            }
+        }
+    } else {
+        for (int i = 0; i < b->q_unit_cnt; i++)
+            c->precision_coarse[i] = c->scalefactors[i] - b->gradient[i];
+    }
+
+
+    for (int i = 0; i < b->q_unit_cnt; i++)
+        c->precision_coarse[i] = FFMAX(c->precision_coarse[i], 1);
+
+    for (int i = 0; i < b->grad_boundary; i++)
+        c->precision_coarse[i]++;
+
+    for (int i = 0; i < b->q_unit_cnt; i++) {
+        c->precision_fine[i] = 0;
+        if (c->precision_coarse[i] > 15) {
+            c->precision_fine[i] = c->precision_coarse[i] - 15;
+            c->precision_coarse[i] = 15;
+        }
+    }
+}
+
+static inline int parse_band_ext(ATRAC9Context *s, ATRAC9BlockData *b,
+                                 GetBitContext *gb, int stereo)
+{
+    int ext_band = 0;
+
+    if (b->has_band_ext) {
+        ext_band = at9_tab_band_ext_group[b->q_unit_cnt - 13][2];
+        if (stereo) {
+            b->channel[1].band_ext = get_bits(gb, 2);
+            b->channel[1].band_ext = ext_band > 2 ? b->channel[1].band_ext : 4;
+        } else {
+            skip_bits1(gb);
+        }
+    }
+
+    b->has_band_ext_data = get_bits1(gb);
+    if (!b->has_band_ext_data)
+        return 0;
+
+    if (!b->has_band_ext) {
+        skip_bits(gb, 2);
+        skip_bits_long(gb, get_bits(gb, 5));
+        return 0;
+    }
+
+    b->channel[0].band_ext = get_bits(gb, 2);
+    b->channel[0].band_ext = ext_band > 2 ? b->channel[0].band_ext : 4;
+
+    if (!get_bits(gb, 5))
+        return 0;
+
+    for (int i = 0; i <= stereo; i++) {
+        ATRAC9ChannelData *c = &b->channel[i];
+        const int count = at9_tab_band_ext_cnt[c->band_ext][ext_band];
+        for (int j = 0; j < count; j++) {
+            int len = at9_tab_band_ext_lengths[c->band_ext][ext_band][j];
+            c->band_ext_data[j] = get_bits(gb, len);
+        }
+    }
+
+    return 0;
+}
+
+static inline int read_scalefactors(ATRAC9Context *s, ATRAC9BlockData *b,
+                                    ATRAC9ChannelData *c, GetBitContext *gb,
+                                    int channel_idx, int first_in_pkt)
+{
+    static const int mode_map[2][4] = { { 0, 1, 2, 3 }, { 0, 2, 3, 4 } };
+    const int mode = mode_map[channel_idx][get_bits(gb, 2)];
+
+    memset(c->scalefactors, 0, sizeof(c->scalefactors));
+
+    if (first_in_pkt && (mode == 4 || ((mode == 3) && !channel_idx))) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid scalefactor coding mode!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    switch (mode) {
+    case 0: { /* VLC delta offset */
+        const uint8_t *sf_weights = at9_tab_sf_weights[get_bits(gb, 3)];
+        const int base = get_bits(gb, 5);
+        const int len = get_bits(gb, 2) + 3;
+        const VLC *tab = &s->sf_vlc[0][len];
+
+        c->scalefactors[0] = get_bits(gb, len);
+
+        for (int i = 1; i < b->band_ext_q_unit; i++) {
+            int val = c->scalefactors[i - 1] + get_vlc2(gb, tab->table, 9, 2);
+            c->scalefactors[i] = val & ((1 << len) - 1);
+        }
+
+        for (int i = 0; i < b->band_ext_q_unit; i++)
+            c->scalefactors[i] += base - sf_weights[i];
+
+        break;
+    }
+    case 1: { /* CLC offset */
+        const int len = get_bits(gb, 2) + 2;
+        const int base = len < 5 ? get_bits(gb, 5) : 0;
+        for (int i = 0; i < b->band_ext_q_unit; i++)
+            c->scalefactors[i] = base + get_bits(gb, len);
+        break;
+    }
+    case 2:
+    case 4: { /* VLC dist to baseline */
+        const int *baseline = mode == 4 ? c->scalefactors_prev :
+                              channel_idx ? b->channel[0].scalefactors :
+                              c->scalefactors_prev;
+        const int baseline_len = mode == 4 ? b->q_unit_cnt_prev :
+                                 channel_idx ? b->band_ext_q_unit :
+                                 b->q_unit_cnt_prev;
+
+        const int len = get_bits(gb, 2) + 2;
+        const int unit_cnt = FFMIN(b->band_ext_q_unit, baseline_len);
+        const VLC *tab = &s->sf_vlc[1][len];
+
+        for (int i = 0; i < unit_cnt; i++) {
+            int dist = get_vlc2(gb, tab->table, 9, 2);
+            c->scalefactors[i] = baseline[i] + dist;
+        }
+
+        for (int i = unit_cnt; i < b->band_ext_q_unit; i++)
+            c->scalefactors[i] = get_bits(gb, 5);
+
+        break;
+    }
+    case 3: { /* VLC offset with baseline */
+        const int *baseline = channel_idx ? b->channel[0].scalefactors :
+                              c->scalefactors_prev;
+        const int baseline_len = channel_idx ? b->band_ext_q_unit :
+                                 b->q_unit_cnt_prev;
+
+        const int base = get_bits(gb, 5) - (1 << (5 - 1));
+        const int len = get_bits(gb, 2) + 1;
+        const int unit_cnt = FFMIN(b->band_ext_q_unit, baseline_len);
+        const VLC *tab = &s->sf_vlc[0][len];
+
+        c->scalefactors[0] = get_bits(gb, len);
+
+        for (int i = 1; i < unit_cnt; i++) {
+            int val = c->scalefactors[i - 1] + get_vlc2(gb, tab->table, 9, 2);
+            c->scalefactors[i] = val & ((1 << len) - 1);
+        }
+
+        for (int i = 0; i < unit_cnt; i++)
+            c->scalefactors[i] += base + baseline[i];
+
+        for (int i = unit_cnt; i < b->band_ext_q_unit; i++)
+            c->scalefactors[i] = get_bits(gb, 5);
+        break;
+    }
+    }
+
+    for (int i = 0; i < b->band_ext_q_unit; i++)
+        if (c->scalefactors[i] < 0 || c->scalefactors[i] > 31)
+            return AVERROR_INVALIDDATA;
+
+    memcpy(c->scalefactors_prev, c->scalefactors, sizeof(c->scalefactors));
+
+    return 0;
+}
+
+static inline void calc_codebook_idx(ATRAC9Context *s, ATRAC9BlockData *b,
+                                     ATRAC9ChannelData *c)
+{
+    int avg = 0;
+    const int last_sf = c->scalefactors[c->q_unit_cnt];
+
+    memset(c->codebookset, 0, sizeof(c->codebookset));
+
+    if (c->q_unit_cnt <= 1)
+        return;
+    if (s->samplerate_idx > 7)
+        return;
+
+    c->scalefactors[c->q_unit_cnt] = c->scalefactors[c->q_unit_cnt - 1];
+
+    if (c->q_unit_cnt > 12) {
+        for (int i = 0; i < 12; i++)
+            avg += c->scalefactors[i];
+        avg = (avg + 6) / 12;
+    }
+
+    for (int i = 8; i < c->q_unit_cnt; i++) {
+        const int prev = c->scalefactors[i - 1];
+        const int cur  = c->scalefactors[i    ];
+        const int next = c->scalefactors[i + 1];
+        const int min  = FFMIN(prev, next);
+        if ((cur - min >= 3 || 2*cur - prev - next >= 3))
+            c->codebookset[i] = 1;
+    }
+
+
+    for (int i = 12; i < c->q_unit_cnt; i++) {
+        const int cur = c->scalefactors[i];
+        const int cnd = at9_q_unit_to_coeff_cnt[i] == 16;
+        const int min = FFMIN(c->scalefactors[i + 1], c->scalefactors[i - 1]);
+        if (c->codebookset[i])
+            continue;
+
+        c->codebookset[i] = (((cur - min) >= 2) && (cur >= (avg - cnd)));
+    }
+
+    c->scalefactors[c->q_unit_cnt] = last_sf;
+}
+
+static inline void read_coeffs_coarse(ATRAC9Context *s, ATRAC9BlockData *b,
+                                      ATRAC9ChannelData *c, GetBitContext *gb)
+{
+    const int max_prec = s->samplerate_idx > 7 ? 1 : 7;
+
+    memset(c->q_coeffs_coarse, 0, sizeof(c->q_coeffs_coarse));
+
+    for (int i = 0; i < c->q_unit_cnt; i++) {
+        int *coeffs = &c->q_coeffs_coarse[at9_q_unit_to_coeff_idx[i]];
+        const int bands = at9_q_unit_to_coeff_cnt[i];
+        const int prec = c->precision_coarse[i] + 1;
+
+        if (prec <= max_prec) {
+            const int cb = c->codebookset[i];
+            const int cbi = at9_q_unit_to_codebookidx[i];
+            const VLC *tab = &s->coeff_vlc[cb][prec][cbi];
+            const HuffmanCodebook *huff = &at9_huffman_coeffs[cb][prec][cbi];
+            const int groups = bands >> huff->value_cnt_pow;
+
+            for (int j = 0; j < groups; j++) {
+                uint16_t val = get_vlc2(gb, tab->table, 9, huff->max_bit_size);
+
+                for (int k = 0; k < huff->value_cnt; k++) {
+                    coeffs[k] = sign_extend(val, huff->value_bits);
+                    val >>= huff->value_bits;
+                }
+
+                coeffs += huff->value_cnt;
+            }
+        } else {
+            for (int j = 0; j < bands; j++)
+                coeffs[j] = sign_extend(get_bits(gb, prec), prec);
+        }
+    }
+}
+
+static inline void read_coeffs_fine(ATRAC9Context *s, ATRAC9BlockData *b,
+                                    ATRAC9ChannelData *c, GetBitContext *gb)
+{
+    memset(c->q_coeffs_fine, 0, sizeof(c->q_coeffs_fine));
+
+    for (int i = 0; i < c->q_unit_cnt; i++) {
+        const int start = at9_q_unit_to_coeff_idx[i + 0];
+        const int end   = at9_q_unit_to_coeff_idx[i + 1];
+        const int len   = c->precision_fine[i] + 1;
+
+        if (c->precision_fine[i] <= 0)
+            continue;
+
+        for (int j = start; j < end; j++)
+            c->q_coeffs_fine[j] = sign_extend(get_bits(gb, len), len);
+    }
+}
+
+static inline void dequantize(ATRAC9Context *s, ATRAC9BlockData *b,
+                              ATRAC9ChannelData *c)
+{
+    memset(c->coeffs, 0, sizeof(c->coeffs));
+
+    for (int i = 0; i < c->q_unit_cnt; i++) {
+        const int start = at9_q_unit_to_coeff_idx[i + 0];
+        const int end   = at9_q_unit_to_coeff_idx[i + 1];
+
+        const float coarse_c = at9_quant_step_coarse[c->precision_coarse[i]];
+        const float fine_c   = at9_quant_step_fine[c->precision_fine[i]];
+
+        for (int j = start; j < end; j++) {
+            const float vc = c->q_coeffs_coarse[j] * coarse_c;
+            const float vf = c->q_coeffs_fine[j]   * fine_c;
+            c->coeffs[j] = vc + vf;
+        }
+    }
+}
+
+static inline void apply_intensity_stereo(ATRAC9Context *s, ATRAC9BlockData *b,
+                                          const int stereo)
+{
+    float *src = b->channel[ b->cpe_base_channel].coeffs;
+    float *dst = b->channel[!b->cpe_base_channel].coeffs;
+
+    if (!stereo)
+        return;
+
+    if (b->q_unit_cnt <= b->stereo_q_unit)
+        return;
+
+    for (int i = b->stereo_q_unit; i < b->q_unit_cnt; i++) {
+        const int sign  = b->is_signs[i];
+        const int start = at9_q_unit_to_coeff_idx[i + 0];
+        const int end   = at9_q_unit_to_coeff_idx[i + 1];
+        for (int j = start; j < end; j++)
+            dst[j] = sign*src[j];
+    }
+}
+
+static inline void apply_scalefactors(ATRAC9Context *s, ATRAC9BlockData *b,
+                                      const int stereo)
+{
+    for (int i = 0; i <= stereo; i++) {
+        float *coeffs = b->channel[i].coeffs;
+        for (int j = 0; j < b->q_unit_cnt; j++) {
+            const int start = at9_q_unit_to_coeff_idx[j + 0];
+            const int end   = at9_q_unit_to_coeff_idx[j + 1];
+            const int scalefactor = b->channel[i].scalefactors[j];
+            const float scale = at9_scalefactor_c[scalefactor];
+            for (int k = start; k < end; k++)
+                coeffs[k] *= scale;
+        }
+    }
+}
+
+static inline void fill_with_noise(ATRAC9Context *s, ATRAC9ChannelData *c,
+                                   int start, int count)
+{
+    float maxval = 0.0f;
+    for (int i = 0; i < count; i += 2) {
+        double tmp[2];
+        av_bmg_get(&s->lfg, tmp);
+        c->coeffs[start + i + 0] = tmp[0];
+        c->coeffs[start + i + 1] = tmp[1];
+        maxval = FFMAX(FFMAX(FFABS(tmp[0]), FFABS(tmp[1])), maxval);
+    }
+    /* Normalize */
+    for (int i = 0; i < count; i++)
+        c->coeffs[start + i] /= maxval;
+}
+
+static inline void scale_band_ext_coeffs(ATRAC9ChannelData *c, float sf[6],
+                                         const int s_unit, const int e_unit)
+{
+    for (int i = s_unit; i < e_unit; i++) {
+        const int start = at9_q_unit_to_coeff_idx[i + 0];
+        const int end   = at9_q_unit_to_coeff_idx[i + 1];
+        for (int j = start; j < end; j++)
+            c->coeffs[j] *= sf[i - s_unit];
+    }
+}
+
+static inline void apply_band_extension(ATRAC9Context *s, ATRAC9BlockData *b,
+                                       const int stereo)
+{
+    const int g_units[4] = { /* A, B, C, total units */
+        b->q_unit_cnt,
+        at9_tab_band_ext_group[b->q_unit_cnt - 13][0],
+        at9_tab_band_ext_group[b->q_unit_cnt - 13][1],
+        FFMAX(g_units[2], 22),
+    };
+
+    const int g_bins[4] = { /* A, B, C, total bins */
+        at9_q_unit_to_coeff_idx[g_units[0]],
+        at9_q_unit_to_coeff_idx[g_units[1]],
+        at9_q_unit_to_coeff_idx[g_units[2]],
+        at9_q_unit_to_coeff_idx[g_units[3]],
+    };
+
+    if (!b->has_band_ext || !b->has_band_ext_data)
+        return;
+
+    for (int ch = 0; ch <= stereo; ch++) {
+        ATRAC9ChannelData *c = &b->channel[ch];
+
+        /* Mirror the spectrum */
+        for (int i = 0; i < 3; i++)
+            for (int j = 0; j < (g_bins[i + 1] - g_bins[i + 0]); j++)
+                c->coeffs[g_bins[i] + j] = c->coeffs[g_bins[i] - j - 1];
+
+        switch (c->band_ext) {
+        case 0: {
+            float sf[6] = { 0.0f };
+            const int l = g_units[3] - g_units[0] - 1;
+            const int n_start = at9_q_unit_to_coeff_idx[g_units[3] - 1];
+            const int n_cnt   = at9_q_unit_to_coeff_cnt[g_units[3] - 1];
+            switch (at9_tab_band_ext_group[b->q_unit_cnt - 13][2]) {
+            case 3:
+                sf[0] = at9_band_ext_scales_m0[0][0][c->band_ext_data[0]];
+                sf[1] = at9_band_ext_scales_m0[0][1][c->band_ext_data[0]];
+                sf[2] = at9_band_ext_scales_m0[0][2][c->band_ext_data[1]];
+                sf[3] = at9_band_ext_scales_m0[0][3][c->band_ext_data[2]];
+                sf[4] = at9_band_ext_scales_m0[0][4][c->band_ext_data[3]];
+                break;
+            case 4:
+                sf[0] = at9_band_ext_scales_m0[1][0][c->band_ext_data[0]];
+                sf[1] = at9_band_ext_scales_m0[1][1][c->band_ext_data[0]];
+                sf[2] = at9_band_ext_scales_m0[1][2][c->band_ext_data[1]];
+                sf[3] = at9_band_ext_scales_m0[1][3][c->band_ext_data[2]];
+                sf[4] = at9_band_ext_scales_m0[1][4][c->band_ext_data[3]];
+                break;
+            case 5:
+                sf[0] = at9_band_ext_scales_m0[2][0][c->band_ext_data[0]];
+                sf[1] = at9_band_ext_scales_m0[2][1][c->band_ext_data[1]];
+                sf[2] = at9_band_ext_scales_m0[2][2][c->band_ext_data[1]];
+                break;
+            }
+
+            sf[l] = at9_scalefactor_c[c->scalefactors[g_units[0]]];
+
+            fill_with_noise(s, c, n_start, n_cnt);
+            scale_band_ext_coeffs(c, sf, g_units[0], g_units[3]);
+            break;
+        }
+        case 1: {
+            float sf[6];
+            for (int i = g_units[0]; i < g_units[3]; i++)
+                sf[i - g_units[0]] = at9_scalefactor_c[c->scalefactors[i]];
+
+            fill_with_noise(s, c, g_bins[0], g_bins[3] - g_bins[0]);
+            scale_band_ext_coeffs(c, sf, g_units[0], g_units[3]);
+            break;
+        }
+        case 2: {
+            const float g_sf[2] = {
+                at9_band_ext_scales_m2[c->band_ext_data[0]],
+                at9_band_ext_scales_m2[c->band_ext_data[1]],
+            };
+
+            for (int i = 0; i < 2; i++)
+                for (int j = g_bins[i + 0]; j < g_bins[i + 1]; j++)
+                    c->coeffs[j] *= g_sf[i];
+            break;
+        }
+        case 3: {
+            float scale = at9_band_ext_scales_m3[c->band_ext_data[0]][0];
+            float rate  = at9_band_ext_scales_m3[c->band_ext_data[1]][1];
+            rate = pow(2, rate);
+            for (int i = g_bins[0]; i < g_bins[3]; i++) {
+                scale *= rate;
+                c->coeffs[i] *= scale;
+            }
+            break;
+        }
+        case 4: {
+            const float m = at9_band_ext_scales_m4[c->band_ext_data[0]];
+            const float g_sf[3] = { 0.7079468f*m, 0.5011902f*m, 0.3548279f*m };
+
+            for (int i = 0; i < 3; i++)
+                for (int j = g_bins[i + 0]; j < g_bins[i + 1]; j++)
+                    c->coeffs[j] *= g_sf[i];
+            break;
+        }
+        }
+    }
+}
+
+static int atrac9_decode_block(ATRAC9Context *s, GetBitContext *gb,
+                               ATRAC9BlockData *b, AVFrame *frame,
+                               int frame_idx, int block_idx)
+{
+    const int first_in_pkt = !get_bits1(gb);
+    const int reuse_params =  get_bits1(gb);
+    const int stereo = s->block_config->type[block_idx] == ATRAC9_BLOCK_TYPE_CPE;
+
+    if (s->block_config->type[block_idx] == ATRAC9_BLOCK_TYPE_LFE) {
+        ATRAC9ChannelData *c = &b->channel[0];
+        const int precision = reuse_params ? 8 : 4;
+        c->q_unit_cnt = b->q_unit_cnt = 2;
+
+        memset(c->scalefactors, 0, sizeof(c->scalefactors));
+        memset(c->q_coeffs_fine, 0, sizeof(c->q_coeffs_fine));
+        memset(c->q_coeffs_coarse, 0, sizeof(c->q_coeffs_coarse));
+
+        for (int i = 0; i < b->q_unit_cnt; i++) {
+            c->scalefactors[i] = get_bits(gb, 5);
+            c->precision_coarse[i] = precision;
+            c->precision_fine[i] = 0;
+        }
+
+        for (int i = 0; i < c->q_unit_cnt; i++) {
+            const int start = at9_q_unit_to_coeff_idx[i + 0];
+            const int end   = at9_q_unit_to_coeff_idx[i + 1];
+            for (int j = start; j < end; j++)
+                c->q_coeffs_coarse[j] = get_bits(gb, c->precision_coarse[i] + 1);
+        }
+
+        dequantize        (s, b, c);
+        apply_scalefactors(s, b, 0);
+
+        goto imdct;
+    }
+
+    if (first_in_pkt && reuse_params) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid block flags!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Band parameters */
+    if (!reuse_params) {
+        int stereo_band, ext_band;
+        const int min_band_count = s->samplerate_idx > 7 ? 1 : 3;
+        b->band_count = get_bits(gb, 4) + min_band_count;
+        b->q_unit_cnt = at9_tab_band_q_unit_map[b->band_count];
+
+        b->band_ext_q_unit = b->stereo_q_unit = b->q_unit_cnt;
+
+        if (b->band_count > at9_tab_sri_max_bands[s->samplerate_idx]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid band count %i!\n",
+                   b->band_count);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (stereo) {
+            stereo_band = get_bits(gb, 4) + min_band_count;
+            if (stereo_band > b->band_count) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid stereo band %i!\n",
+                       stereo_band);
+                return AVERROR_INVALIDDATA;
+            }
+            b->stereo_q_unit = at9_tab_band_q_unit_map[stereo_band];
+        }
+
+        b->has_band_ext = get_bits1(gb);
+        if (b->has_band_ext) {
+            ext_band = get_bits(gb, 4) + min_band_count;
+            if (ext_band < b->band_count) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid extension band %i!\n",
+                       ext_band);
+                return AVERROR_INVALIDDATA;
+            }
+            b->band_ext_q_unit = at9_tab_band_q_unit_map[ext_band];
+        }
+    }
+
+    /* Calculate bit alloc gradient */
+    if (parse_gradient(s, b, gb))
+        return AVERROR_INVALIDDATA;
+
+    /* IS data */
+    b->cpe_base_channel = 0;
+    if (stereo) {
+        b->cpe_base_channel = get_bits1(gb);
+        if (get_bits1(gb)) {
+            for (int i = b->stereo_q_unit; i < b->q_unit_cnt; i++)
+                b->is_signs[i] = 1 - 2*get_bits1(gb);
+        } else {
+            for (int i = 0; i < FF_ARRAY_ELEMS(b->is_signs); i++)
+                b->is_signs[i] = 1;
+        }
+    }
+
+    /* Band extension */
+    if (parse_band_ext(s, b, gb, stereo))
+        return AVERROR_INVALIDDATA;
+
+    /* Scalefactors */
+    for (int i = 0; i <= stereo; i++) {
+        ATRAC9ChannelData *c = &b->channel[i];
+        c->q_unit_cnt = i == b->cpe_base_channel ? b->q_unit_cnt :
+                                                   b->stereo_q_unit;
+        if (read_scalefactors(s, b, c, gb, i, first_in_pkt))
+            return AVERROR_INVALIDDATA;
+
+        calc_precision    (s, b, c);
+        calc_codebook_idx (s, b, c);
+        read_coeffs_coarse(s, b, c, gb);
+        read_coeffs_fine  (s, b, c, gb);
+        dequantize        (s, b, c);
+    }
+
+    b->q_unit_cnt_prev = b->has_band_ext ? b->band_ext_q_unit : b->q_unit_cnt;
+
+    apply_intensity_stereo(s, b, stereo);
+    apply_scalefactors    (s, b, stereo);
+    apply_band_extension  (s, b, stereo);
+
+imdct:
+    for (int i = 0; i <= stereo; i++) {
+        ATRAC9ChannelData *c = &b->channel[i];
+        const int dst_idx = s->block_config->plane_map[block_idx][i];
+        const int wsize = 1 << s->frame_log2;
+        const ptrdiff_t offset = wsize*frame_idx*sizeof(float);
+        float *dst = (float *)(frame->extended_data[dst_idx] + offset);
+
+        s->imdct.imdct_half(&s->imdct, s->temp, c->coeffs);
+        s->fdsp->vector_fmul_window(dst, c->prev_win, s->temp,
+                                    s->imdct_win, wsize >> 1);
+        memcpy(c->prev_win, s->temp + (wsize >> 1), sizeof(float)*wsize >> 1);
+    }
+
+    return 0;
+}
+
+static int atrac9_decode_frame(AVCodecContext *avctx, void *data,
+                               int *got_frame_ptr, AVPacket *avpkt)
+{
+    int ret;
+    GetBitContext gb;
+    AVFrame *frame = data;
+    ATRAC9Context *s = avctx->priv_data;
+    const int frames = FFMIN(avpkt->size / s->avg_frame_size, s->frame_count);
+
+    frame->nb_samples = (1 << s->frame_log2) * frames;
+    ret = ff_get_buffer(avctx, frame, 0);
+    if (ret < 0)
+        return ret;
+
+    init_get_bits8(&gb, avpkt->data, avpkt->size);
+
+    for (int i = 0; i < frames; i++) {
+        for (int j = 0; j < s->block_config->count; j++) {
+            ret = atrac9_decode_block(s, &gb, &s->block[j], frame, i, j);
+            if (ret)
+                return ret;
+            align_get_bits(&gb);
+        }
+    }
+
+    *got_frame_ptr = 1;
+
+    return avctx->block_align;
+}
+
+static void atrac9_decode_flush(AVCodecContext *avctx)
+{
+    ATRAC9Context *s = avctx->priv_data;
+
+    for (int j = 0; j < s->block_config->count; j++) {
+        ATRAC9BlockData *b = &s->block[j];
+        const int stereo = s->block_config->type[j] == ATRAC9_BLOCK_TYPE_CPE;
+        for (int i = 0; i <= stereo; i++) {
+            ATRAC9ChannelData *c = &b->channel[i];
+            memset(c->prev_win, 0, sizeof(c->prev_win));
+        }
+    }
+}
+
+static av_cold int atrac9_decode_close(AVCodecContext *avctx)
+{
+    ATRAC9Context *s = avctx->priv_data;
+
+    for (int i = 1; i < 7; i++)
+        ff_free_vlc(&s->sf_vlc[0][i]);
+    for (int i = 2; i < 6; i++)
+        ff_free_vlc(&s->sf_vlc[1][i]);
+    for (int i = 0; i < 2; i++)
+        for (int j = 0; j < 8; j++)
+            for (int k = 0; k < 4; k++)
+                ff_free_vlc(&s->coeff_vlc[i][j][k]);
+
+    ff_mdct_end(&s->imdct);
+    av_free(s->fdsp);
+
+    return 0;
+}
+
+static av_cold int atrac9_decode_init(AVCodecContext *avctx)
+{
+    GetBitContext gb;
+    ATRAC9Context *s = avctx->priv_data;
+    int version, block_config_idx, superframe_idx, alloc_c_len;
+
+    s->avctx = avctx;
+
+    av_lfg_init(&s->lfg, 0xFBADF00D);
+
+    if (avctx->extradata_size != 12) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid extradata length!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    version = AV_RL32(avctx->extradata);
+    if (version > 2) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported version (%i)!\n", version);
+        return AVERROR_INVALIDDATA;
+    }
+
+    init_get_bits8(&gb, avctx->extradata + 4, avctx->extradata_size);
+
+    if (get_bits(&gb, 8) != 0xFE) {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect magic byte!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->samplerate_idx = get_bits(&gb, 4);
+    avctx->sample_rate = at9_tab_samplerates[s->samplerate_idx];
+
+    block_config_idx = get_bits(&gb, 3);
+    if (block_config_idx > 5) {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect block config!\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->block_config = &at9_block_layout[block_config_idx];
+
+    avctx->channel_layout = s->block_config->channel_layout;
+    avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
+
+    if (get_bits1(&gb)) {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect verification bit!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Average frame size in bytes */
+    s->avg_frame_size = get_bits(&gb, 11) + 1;
+
+    superframe_idx = get_bits(&gb, 2);
+    if (superframe_idx & 1) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid superframe index!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->frame_count = 1 << superframe_idx;
+    s->frame_log2  = at9_tab_sri_frame_log2[s->samplerate_idx];
+
+    if (ff_mdct_init(&s->imdct, s->frame_log2 + 1, 1, 1.0f / 32768.0f))
+        return AVERROR(ENOMEM);
+
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+
+    /* iMDCT window */
+    for (int i = 0; i < (1 << s->frame_log2); i++) {
+        const int   len  = 1 << s->frame_log2;
+        const float sidx = (      i + 0.5f) / len;
+        const float eidx = (len - i - 0.5f) / len;
+        const float s_c  = sinf(sidx*M_PI - M_PI_2)*0.5f + 0.5f;
+        const float e_c  = sinf(eidx*M_PI - M_PI_2)*0.5f + 0.5f;
+        s->imdct_win[i]  = s_c / ((s_c * s_c) + (e_c * e_c));
+    }
+
+    /* Allocation curve */
+    alloc_c_len = FF_ARRAY_ELEMS(at9_tab_b_dist);
+    for (int i = 1; i <= alloc_c_len; i++)
+        for (int j = 0; j < i; j++)
+            s->alloc_curve[i - 1][j] = at9_tab_b_dist[(j * alloc_c_len) / i];
+
+    /* Unsigned scalefactor VLCs */
+    for (int i = 1; i < 7; i++) {
+        const HuffmanCodebook *hf = &at9_huffman_sf_unsigned[i];
+
+        init_vlc(&s->sf_vlc[0][i], 9, hf->size, hf->bits, 1, 1, hf->codes,
+                 2, 2, 0);
+    }
+
+    /* Signed scalefactor VLCs */
+    for (int i = 2; i < 6; i++) {
+        const HuffmanCodebook *hf = &at9_huffman_sf_signed[i];
+
+        int nums = hf->size;
+        int16_t sym[32];
+        for (int j = 0; j < nums; j++)
+            sym[j] = sign_extend(j, hf->value_bits);
+
+        ff_init_vlc_sparse(&s->sf_vlc[1][i], 9, hf->size, hf->bits, 1, 1,
+                           hf->codes, 2, 2, sym, sizeof(*sym), sizeof(*sym), 0);
+    }
+
+    /* Coefficient VLCs */
+    for (int i = 0; i < 2; i++) {
+        for (int j = 0; j < 8; j++) {
+            for (int k = 0; k < 4; k++) {
+                const HuffmanCodebook *hf = &at9_huffman_coeffs[i][j][k];
+                init_vlc(&s->coeff_vlc[i][j][k], 9, hf->size, hf->bits, 1, 1,
+                         hf->codes, 2, 2, 0);
+            }
+        }
+    }
+
+    return 0;
+}
+
+AVCodec ff_atrac9_decoder = {
+    .name           = "atrac9",
+    .long_name      = NULL_IF_CONFIG_SMALL("ATRAC9 (Adaptive TRansform Acoustic Coding 9)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ATRAC9,
+    .priv_data_size = sizeof(ATRAC9Context),
+    .init           = atrac9_decode_init,
+    .close          = atrac9_decode_close,
+    .decode         = atrac9_decode_frame,
+    .flush          = atrac9_decode_flush,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/atrac9tab.h b/libavcodec/atrac9tab.h
new file mode 100644
index 0000000..d25c6f1
--- /dev/null
+++ b/libavcodec/atrac9tab.h
@@ -0,0 +1,1633 @@
+/*
+ * ATRAC9 decoder
+ * Copyright (c) 2018 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ATRAC9TAB_H
+#define AVCODEC_ATRAC9TAB_H
+
+#include <stdint.h>
+
+#include "libavutil/channel_layout.h"
+
+enum ATRAC9BlockType {
+    ATRAC9_BLOCK_TYPE_SCE, /* Single channel */
+    ATRAC9_BLOCK_TYPE_CPE, /* 2 coupled channels */
+    ATRAC9_BLOCK_TYPE_LFE, /* Single LFE channel */
+};
+
+typedef struct ATRAC9BlockConfig {
+    uint64_t channel_layout;
+    enum ATRAC9BlockType type[5];
+    int plane_map[5][2];
+    int count;
+} ATRAC9BlockConfig;
+
+static const ATRAC9BlockConfig at9_block_layout[] = {
+    { /* Mono */
+        AV_CH_LAYOUT_MONO,
+        {
+            ATRAC9_BLOCK_TYPE_SCE,
+        },
+        { { 0 }, },
+        1,
+    },
+    { /* Dual Mono */
+        AV_CH_LAYOUT_STEREO,
+        {
+            ATRAC9_BLOCK_TYPE_SCE,
+            ATRAC9_BLOCK_TYPE_SCE,
+        },
+        { { 0 }, { 1 }, },
+        2,
+    },
+    { /* Stereo */
+        AV_CH_LAYOUT_STEREO,
+        {
+            ATRAC9_BLOCK_TYPE_CPE,
+        },
+        { { 0, 1 }, },
+        1,
+    },
+    { /* 5.1 */
+        AV_CH_LAYOUT_5POINT1,
+        {
+            ATRAC9_BLOCK_TYPE_CPE,
+            ATRAC9_BLOCK_TYPE_SCE,
+            ATRAC9_BLOCK_TYPE_LFE,
+            ATRAC9_BLOCK_TYPE_CPE,
+        },
+        { { 0, 1 }, { 2 }, { 3 }, { 4, 5 }, },
+        4,
+    },
+    { /* 5.1 */
+        AV_CH_LAYOUT_7POINT1,
+        {
+            ATRAC9_BLOCK_TYPE_CPE,
+            ATRAC9_BLOCK_TYPE_SCE,
+            ATRAC9_BLOCK_TYPE_LFE,
+            ATRAC9_BLOCK_TYPE_CPE,
+            ATRAC9_BLOCK_TYPE_CPE,
+        },
+        { { 0, 1 }, { 2 }, { 3 }, { 4, 5 }, { 6, 7 }, },
+        5,
+    },
+    { /* Quad */
+        AV_CH_LAYOUT_QUAD,
+        {
+            ATRAC9_BLOCK_TYPE_CPE,
+            ATRAC9_BLOCK_TYPE_CPE,
+        },
+        { { 0, 1 }, { 2, 3 }, },
+        2,
+    },
+};
+
+static const uint8_t at9_tab_sri_frame_log2[] = {
+    6, 6, 7, 7, 7, 8, 8, 8, 6, 6, 7, 7, 7, 8, 8, 8,
+};
+
+static const uint8_t at9_tab_band_q_unit_map[] = {
+    0, 4, 8, 10, 12, 13, 14, 15, 16, 18, 20, 21, 22, 23, 24, 25, 26, 28, 30,
+};
+
+static const uint8_t at9_q_unit_to_coeff_cnt[] = {
+    2, 2, 2, 2, 2,  2,  2,  2,  4,  4,  4,  4,  8,  8,  8,
+    8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+};
+
+static const int at9_q_unit_to_coeff_idx[] = {
+    0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64,
+    72, 80, 88, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256
+};
+
+const uint8_t at9_q_unit_to_codebookidx[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+    2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+static const uint8_t at9_tab_sri_max_bands[] = {
+    8, 8, 12, 12, 12, 18, 18, 18, 8, 8, 12, 12, 12, 16, 16, 16,
+};
+
+static const int at9_tab_samplerates[] = {
+    11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 44100, 48000,
+    64000, 88200, 96000, 128000, 176400, 192000,
+};
+
+static const uint8_t at9_tab_band_ext_cnt[][6] = {
+    { 0, 0, 0, 4, 4, 2 },
+    { 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 2, 2, 1 },
+    { 0, 0, 0, 2, 2, 2 },
+    { 1, 1, 1, 0, 0, 0 },
+};
+
+/* B unit, C unit, Band count */
+static const uint8_t at9_tab_band_ext_group[][3] = {
+    { 16, 21, 0 },
+    { 18, 22, 1 },
+    { 20, 22, 2 },
+    { 21, 22, 3 },
+    { 21, 22, 3 },
+    { 23, 24, 4 },
+    { 23, 24, 4 },
+    { 24, 24, 5 },
+};
+
+static const uint8_t at9_tab_band_ext_lengths[][6][4] = {
+    {
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 5, 4, 3, 3 },
+        { 4, 4, 3, 4 },
+        { 4, 5, 0, 0 },
+    },
+    {
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+    },
+    {
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 6, 6, 0, 0 },
+        { 6, 6, 0, 0 },
+        { 6, 0, 0, 0 },
+    },
+    {
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 4, 4, 0, 0 },
+        { 4, 4, 0, 0 },
+        { 4, 4, 0, 0 },
+    },
+    {
+        { 3, 0, 0, 0 },
+        { 3, 0, 0, 0 },
+        { 3, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 },
+    },
+};
+
+static const float at9_band_ext_scales_m0[][5][32] = {
+    {
+        {
+            0.000000e+0f, 1.988220e-1f, 2.514343e-1f, 2.960510e-1f,
+            3.263550e-1f, 3.771362e-1f, 3.786926e-1f, 4.540405e-1f,
+            4.877625e-1f, 5.262451e-1f, 5.447083e-1f, 5.737000e-1f,
+            6.212158e-1f, 6.222839e-1f, 6.560974e-1f, 6.896667e-1f,
+            7.555542e-1f, 7.677917e-1f, 7.918091e-1f, 7.971497e-1f,
+            8.188171e-1f, 8.446045e-1f, 9.790649e-1f, 9.822083e-1f,
+            9.846191e-1f, 9.859314e-1f, 9.863586e-1f, 9.863892e-1f,
+            9.873352e-1f, 9.881287e-1f, 9.898682e-1f, 9.913330e-1f,
+        }, {
+            0.000000e+0f, 9.982910e-1f, 7.592773e-2f, 7.179565e-1f,
+            9.851379e-1f, 5.340271e-1f, 9.013672e-1f, 6.349182e-1f,
+            7.226257e-1f, 1.948547e-1f, 7.628174e-1f, 9.873657e-1f,
+            8.112183e-1f, 2.715454e-1f, 9.734192e-1f, 1.443787e-1f,
+            4.640198e-1f, 3.249207e-1f, 3.790894e-1f, 8.276367e-2f,
+            5.954590e-1f, 2.864380e-1f, 9.806824e-1f, 7.929077e-1f,
+            6.292114e-1f, 4.887085e-1f, 2.905273e-1f, 1.301880e-1f,
+            3.140869e-1f, 5.482483e-1f, 4.210815e-1f, 1.182861e-1f,
+        }, {
+            0.000000e+0f, 3.155518e-2f, 8.581543e-2f, 1.364746e-1f,
+            1.858826e-1f, 2.368469e-1f, 2.888184e-1f, 3.432617e-1f,
+            4.012451e-1f, 4.623108e-1f, 5.271301e-1f, 5.954895e-1f,
+            6.681213e-1f, 7.448425e-1f, 8.245239e-1f, 9.097290e-1f,
+        }, {
+            0.000000e+0f, 4.418945e-2f, 1.303711e-1f, 2.273560e-1f,
+            3.395996e-1f, 4.735718e-1f, 6.267090e-1f, 8.003845e-1f,
+        }, {
+            0.000000e+0f, 2.804565e-2f, 9.683228e-2f, 1.849976e-1f,
+            3.005981e-1f, 4.470520e-1f, 6.168518e-1f, 8.007813e-1f,
+        },
+    },
+    {
+        {
+            0.000000e+0f, 2.708740e-1f, 3.479614e-1f, 3.578186e-1f,
+            5.083618e-1f, 5.299072e-1f, 5.819092e-1f, 6.381836e-1f,
+            7.276917e-1f, 7.595520e-1f, 7.878723e-1f, 9.707336e-1f,
+            9.713135e-1f, 9.736023e-1f, 9.759827e-1f, 9.832458e-1f,
+        }, {
+            0.000000e+0f, 2.330627e-1f, 5.891418e-1f, 7.170410e-1f,
+            2.036438e-1f, 1.613464e-1f, 6.668701e-1f, 9.481201e-1f,
+            9.769897e-1f, 5.111694e-1f, 3.522644e-1f, 8.209534e-1f,
+            2.933960e-1f, 9.757690e-1f, 5.289917e-1f, 4.372253e-1f,
+        }, {
+            0.000000e+0f, 4.360962e-2f, 1.056519e-1f, 1.590576e-1f,
+            2.078857e-1f, 2.572937e-1f, 3.082581e-1f, 3.616028e-1f,
+            4.191589e-1f, 4.792175e-1f, 5.438538e-1f, 6.125183e-1f,
+            6.841125e-1f, 7.589417e-1f, 8.365173e-1f, 9.148254e-1f,
+        }, {
+            0.000000e+0f, 4.074097e-2f, 1.164551e-1f, 2.077026e-1f,
+            3.184509e-1f, 4.532166e-1f, 6.124268e-1f, 7.932129e-1f,
+        }, {
+            0.000000e+0f, 8.880615e-3f, 2.932739e-2f, 5.593872e-2f,
+            8.825684e-2f, 1.259155e-1f, 1.721497e-1f, 2.270813e-1f,
+            2.901611e-1f, 3.579712e-1f, 4.334106e-1f, 5.147095e-1f,
+            6.023254e-1f, 6.956177e-1f, 7.952881e-1f, 8.977356e-1f,
+        },
+    },
+    {
+        {
+            0.000000e+0f, 7.379150e-2f, 1.806335e-1f, 2.687073e-1f,
+            3.407898e-1f, 4.047546e-1f, 4.621887e-1f, 5.168762e-1f,
+            5.703125e-1f, 6.237488e-1f, 6.763611e-1f, 7.288208e-1f,
+            7.808533e-1f, 8.337708e-1f, 8.874512e-1f, 9.418030e-1f,
+        }, {
+            0.000000e+0f, 7.980347e-2f, 1.615295e-1f, 1.665649e-1f,
+            1.822205e-1f, 2.185669e-1f, 2.292175e-1f, 2.456665e-1f,
+            2.666321e-1f, 3.306580e-1f, 3.330688e-1f, 3.765259e-1f,
+            4.085083e-1f, 4.400024e-1f, 4.407654e-1f, 4.817505e-1f,
+            4.924011e-1f, 5.320740e-1f, 5.893860e-1f, 6.131287e-1f,
+            6.212463e-1f, 6.278076e-1f, 6.308899e-1f, 7.660828e-1f,
+            7.850647e-1f, 7.910461e-1f, 7.929382e-1f, 8.038330e-1f,
+            9.834900e-1f, 9.846191e-1f, 9.852295e-1f, 9.862671e-1f,
+        }, {
+            0.000000e+0f, 6.084290e-1f, 3.672791e-1f, 3.151855e-1f,
+            1.488953e-1f, 2.571716e-1f, 5.103455e-1f, 3.311157e-1f,
+            5.426025e-2f, 4.254456e-1f, 7.998352e-1f, 7.873230e-1f,
+            5.418701e-1f, 2.925110e-1f, 8.468628e-2f, 1.410522e-1f,
+            9.819641e-1f, 9.609070e-1f, 3.530884e-2f, 9.729004e-2f,
+            5.758362e-1f, 9.941711e-1f, 7.215576e-1f, 7.183228e-1f,
+            2.028809e-1f, 9.588623e-2f, 2.032166e-1f, 1.338806e-1f,
+            5.003357e-1f, 1.874390e-1f, 9.804993e-1f, 1.107788e-1f,
+        },
+    },
+};
+
+static const float at9_band_ext_scales_m2[] = {
+    4.272461e-4f, 1.312256e-3f, 2.441406e-3f, 3.692627e-3f,
+    4.913330e-3f, 6.134033e-3f, 7.507324e-3f, 8.972168e-3f,
+    1.049805e-2f, 1.223755e-2f, 1.406860e-2f, 1.599121e-2f,
+    1.800537e-2f, 2.026367e-2f, 2.264404e-2f, 2.517700e-2f,
+    2.792358e-2f, 3.073120e-2f, 3.344727e-2f, 3.631592e-2f,
+    3.952026e-2f, 4.275513e-2f, 4.608154e-2f, 4.968262e-2f,
+    5.355835e-2f, 5.783081e-2f, 6.195068e-2f, 6.677246e-2f,
+    7.196045e-2f, 7.745361e-2f, 8.319092e-2f, 8.993530e-2f,
+    9.759521e-2f, 1.056213e-1f, 1.138916e-1f, 1.236267e-1f,
+    1.348267e-1f, 1.470337e-1f, 1.603394e-1f, 1.755676e-1f,
+    1.905823e-1f, 2.071228e-1f, 2.245178e-1f, 2.444153e-1f,
+    2.658997e-1f, 2.897644e-1f, 3.146057e-1f, 3.450012e-1f,
+    3.766174e-1f, 4.122620e-1f, 4.505615e-1f, 4.893799e-1f,
+    5.305481e-1f, 5.731201e-1f, 6.157837e-1f, 6.580811e-1f,
+    6.985168e-1f, 7.435303e-1f, 7.865906e-1f, 8.302612e-1f,
+    8.718567e-1f, 9.125671e-1f, 9.575806e-1f, 9.996643e-1f,
+};
+
+static const float at9_band_ext_scales_m3[][2] = {
+    { 3.491211e-1f, -2.913818e-1f, }, { 5.371094e-1f, -2.541504e-1f, },
+    { 6.782227e-1f, -1.664429e-1f, }, { 7.910156e-1f, -1.476440e-1f, },
+    { 9.057617e-1f, -1.342163e-1f, }, { 1.024902e+0f, -1.220703e-1f, },
+    { 1.156250e+0f, -1.117554e-1f, }, { 1.290527e+0f, -1.026611e-1f, },
+    { 1.458984e+0f, -9.436035e-2f, }, { 1.664551e+0f, -8.483887e-2f, },
+    { 1.929688e+0f, -7.476807e-2f, }, { 2.278320e+0f, -6.304932e-2f, },
+    { 2.831543e+0f, -4.492188e-2f, }, { 3.659180e+0f, -2.447510e-2f, },
+    { 5.257813e+0f, +1.831055e-4f, }, { 8.373047e+0f, +4.174805e-2f, },
+};
+
+static const float at9_band_ext_scales_m4[] = {
+    3.610229e-2f, 1.260681e-1f, 2.227478e-1f, 3.338318e-1f,
+    4.662170e-1f, 6.221313e-1f, 7.989197e-1f, 9.939575e-1f,
+};
+
+static const float at9_quant_step_coarse[] = {
+    2.0000000000000000e+0f, 6.6666666666666663e-1f, 2.8571428571428570e-1f,
+    1.3333333333333333e-1f, 6.4516129032258063e-2f, 3.1746031746031744e-2f,
+    1.5748031496062992e-2f, 7.8431372549019607e-3f, 3.9138943248532287e-3f,
+    1.9550342130987292e-3f, 9.7703957010258913e-4f, 4.8840048840048840e-4f,
+    2.4417043096081065e-4f, 1.2207776353537203e-4f, 6.1037018951994385e-5f,
+    3.0518043793392844e-5f,
+};
+
+static const float at9_quant_step_fine[] = {
+    3.0518043793392844e-05f, 1.0172681264464281e-05f, 4.3597205419132631e-06f,
+    2.0345362528928561e-06f, 9.8445302559331759e-07f, 4.8441339354591809e-07f,
+    2.4029955742829012e-07f, 1.1967860311134448e-07f, 5.9722199204291275e-08f,
+    2.9831909866464167e-08f, 1.4908668194134265e-08f, 7.4525137468602791e-09f,
+    3.7258019525568114e-09f, 1.8627872668859698e-09f, 9.3136520869755679e-10f,
+    4.6567549848772173e-10f,
+};
+
+static const float at9_scalefactor_c[] = {
+    3.0517578125e-5f, 6.1035156250e-5f, 1.2207031250e-4f, 2.4414062500e-4f,
+    4.8828125000e-4f, 9.7656250000e-4f, 1.9531250000e-3f, 3.9062500000e-3f,
+    7.8125000000e-3f, 1.5625000000e-2f, 3.1250000000e-2f, 6.2500000000e-2f,
+    1.2500000000e-1f, 2.5000000000e-1f, 5.0000000000e-1f, 1.0000000000e+0f,
+    2.0000000000e+0f, 4.0000000000e+0f, 8.0000000000e+0f, 1.6000000000e+1f,
+    3.2000000000e+1f, 6.4000000000e+1f, 1.2800000000e+2f, 2.5600000000e+2f,
+    5.1200000000e+2f, 1.0240000000e+3f, 2.0480000000e+3f, 4.0960000000e+3f,
+    8.1920000000e+3f, 1.6384000000e+4f, 3.2768000000e+4f, 6.5536000000e+4f,
+};
+
+static const uint8_t at9_tab_sf_weights[][32] = {
+    {
+        0,  0,  0,  1,  1,  2,  2,  2,  2,  2,  2,  3,  2,  3,  3,  4,  4,  4,
+        4,  4,  4,  5,  5,  6,  6,  7,  7,  8, 10, 12, 12, 12,
+    },
+    {
+        3,  2,  2,  1,  1,  1,  1,  1,  0,  1,  1,  1,  0,  0,  0,  1,  0,  1,
+        1,  1,  1,  1,  1,  2,  3,  3,  4,  5,  7, 10, 10, 10,
+    },
+    {
+        0,  2,  4,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+        6,  6,  6,  6,  6,  7,  7,  7,  7,  8,  9, 12, 12, 12,
+    },
+    {
+        0,  1,  1,  2,  2,  2,  3,  3,  3,  3,  3,  4,  4,  4,  5,  5,  5,  6,
+        6,  6,  6,  7,  8,  8, 10, 11, 11, 12, 13, 13, 13, 13,
+    },
+    {
+        0,  2,  2,  3,  3,  4,  4,  5,  4,  5,  5,  5,  5,  6,  7,  8,  8,  8,
+        8,  9,  9,  9, 10, 10, 11, 12, 12, 13, 13, 14, 14, 14,
+    },
+    {
+        1,  1,  0,  0,  0,  0,  1,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,
+        2,  3,  3,  3,  4,  4,  5,  6,  7,  7,  9, 11, 11, 11,
+    },
+    {
+        0,  5,  8, 10, 11, 11, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+        13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 13, 15, 15, 15,
+    },
+    {
+        0,  2,  3,  4,  5,  6,  6,  7,  7,  8,  8,  8,  9,  9, 10, 10, 10, 11,
+        11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 15, 15, 15,
+    },
+};
+
+static const uint8_t at9_tab_b_dist[] = {
+         1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  4,  4,  5,  5,  6,  7,  8,
+         9, 10, 11, 12, 13, 15,  6, 18, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27,
+        27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30,
+};
+
+static const uint8_t huff_sfb_a1_bits[] = {
+    1, 1,
+};
+
+static const uint16_t huff_sfb_a1_codes[] = {
+    0x00, 0x01,
+};
+
+static const uint8_t huff_sfb_a2_bits[] = {
+    1, 3, 3, 2,
+};
+
+static const uint16_t huff_sfb_a2_codes[] = {
+    0x00, 0x06, 0x07, 0x02,
+};
+
+static const uint8_t huff_sfb_a3_bits[] = {
+    2, 2, 4, 6, 6, 5, 3, 2,
+};
+
+static const uint16_t huff_sfb_a3_codes[] = {
+    0x00, 0x01, 0x0E, 0x3E, 0x3F, 0x1E, 0x06, 0x02,
+};
+
+static const uint8_t huff_sfb_a4_bits[] = {
+    2, 2, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 6, 5, 4, 2,
+};
+
+static const uint16_t huff_sfb_a4_codes[] = {
+    0x01, 0x02, 0x00, 0x06, 0x0F, 0x13, 0x23, 0x24,
+    0x25, 0x22, 0x21, 0x20, 0x0E, 0x05, 0x01, 0x03,
+};
+
+static const uint8_t huff_sfb_a5_bits[] = {
+    2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 6, 5, 5, 4, 3,
+};
+
+static const uint16_t huff_sfb_a5_codes[] = {
+    0x02, 0x01, 0x07, 0x0D, 0x0C, 0x18, 0x1B, 0x21, 0x3F, 0x6A, 0x6B, 0x68,
+    0x73, 0x79, 0x7C, 0x7D, 0x7A, 0x7B, 0x78, 0x72, 0x44, 0x45, 0x47, 0x46,
+    0x69, 0x38, 0x20, 0x1D, 0x19, 0x09, 0x05, 0x00,
+};
+
+static const uint8_t huff_sfb_a6_bits[] = {
+    3, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 5, 5, 5, 4, 4, 4,
+};
+
+static const uint16_t huff_sfb_a6_codes[] = {
+    0x00, 0x01, 0x04, 0x05, 0x12, 0x13, 0x2E, 0x2F, 0x30, 0x66, 0x67, 0xD6,
+    0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2,
+    0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE,
+    0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA,
+    0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 0x68, 0x69, 0x6A, 0x31, 0x32, 0x14, 0x15,
+    0x16, 0x06, 0x07, 0x08,
+};
+
+static const uint8_t huff_sfb_b2_bits[] = {
+    1, 2, 0, 2,
+};
+
+static const uint16_t huff_sfb_b2_codes[] = {
+    0x00, 0x03, 0x00, 0x02,
+};
+
+static const uint8_t huff_sfb_b3_bits[] = {
+    1, 3, 5, 6, 0, 6, 4, 2,
+};
+
+static const uint16_t huff_sfb_b3_codes[] = {
+    0x01, 0x00, 0x04, 0x0B, 0x00, 0x0A, 0x03, 0x01,
+};
+
+static const uint8_t huff_sfb_b4_bits[] = {
+    1, 3, 4, 5, 5, 7, 8, 8, 0, 8, 8, 7, 6, 6, 4, 3,
+};
+
+static const uint16_t huff_sfb_b4_codes[] = {
+    0x01, 0x01, 0x04, 0x0E, 0x0F, 0x2C, 0x5A, 0x5D, 0x00, 0x5C, 0x5B, 0x2F,
+    0x15, 0x14, 0x06, 0x00,
+};
+
+static const uint8_t huff_sfb_b5_bits[] = {
+    3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 6, 7, 7, 7, 8, 8,
+    8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 3,
+};
+
+static const uint16_t huff_sfb_b5_codes[] = {
+    0x00, 0x05, 0x07, 0x0C, 0x04, 0x02, 0x03, 0x05, 0x09, 0x10, 0x23, 0x33,
+    0x36, 0x6E, 0x60, 0x65, 0x62, 0x61, 0x63, 0x64, 0x6F, 0x6D, 0x6C, 0x6B,
+    0x6A, 0x68, 0x69, 0x45, 0x44, 0x37, 0x1A, 0x07,
+};
+
+typedef struct HuffmanCodebook {
+    const uint8_t *bits;
+    const uint16_t *codes;
+    const int size;
+    const int value_cnt;
+    const int value_cnt_pow;
+    const int value_bits;
+    const int max_bit_size;
+} HuffmanCodebook;
+
+static const HuffmanCodebook at9_huffman_sf_unsigned[] = {
+    { 0 },
+    { huff_sfb_a1_bits, huff_sfb_a1_codes,  2,  1,  0,  1,  1, },
+    { huff_sfb_a2_bits, huff_sfb_a2_codes,  4,  1,  0,  2,  3, },
+    { huff_sfb_a3_bits, huff_sfb_a3_codes,  8,  1,  0,  3,  6, },
+    { huff_sfb_a4_bits, huff_sfb_a4_codes, 16,  1,  0,  4,  8, },
+    { huff_sfb_a5_bits, huff_sfb_a5_codes, 32,  1,  0,  5,  8, },
+    { huff_sfb_a6_bits, huff_sfb_a6_codes, 64,  1,  0,  6,  8, },
+};
+
+static const HuffmanCodebook at9_huffman_sf_signed[] = {
+    { 0 },
+    { 0 },
+    { huff_sfb_b2_bits, huff_sfb_b2_codes,  4,  1,  0,  2,  2, },
+    { huff_sfb_b3_bits, huff_sfb_b3_codes,  8,  1,  0,  3,  6, },
+    { huff_sfb_b4_bits, huff_sfb_b4_codes, 16,  1,  0,  4,  8, },
+    { huff_sfb_b5_bits, huff_sfb_b5_codes, 32,  1,  0,  5,  8, },
+};
+
+static const uint8_t huff_spec_a21_bits[] = {
+    0, 3, 0, 3, 3, 3, 0, 3, 0, 0, 0, 0, 3, 3, 0, 3,
+};
+
+static const uint16_t huff_spec_a21_codes[] = {
+    0x00, 0x00, 0x00, 0x01, 0x03, 0x07, 0x00, 0x04,
+    0x00, 0x00, 0x00, 0x00, 0x02, 0x05, 0x00, 0x06,
+};
+
+static const uint8_t huff_spec_a22_bits[] = {
+    0, 4, 0, 4, 5, 6, 0, 6, 0, 0, 0, 0, 5, 6, 0, 6,
+    5, 6, 0, 6, 6, 7, 0, 7, 0, 0, 0, 0, 6, 7, 0, 7,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    5, 6, 0, 6, 6, 7, 0, 7, 0, 0, 0, 0, 6, 7, 0, 7,
+    5, 6, 0, 6, 7, 7, 0, 7, 0, 0, 0, 0, 6, 7, 0, 7,
+    6, 7, 0, 7, 7, 8, 0, 8, 0, 0, 0, 0, 7, 8, 0, 7,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    6, 7, 0, 7, 7, 8, 0, 8, 0, 0, 0, 0, 7, 7, 0, 8,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    5, 6, 0, 6, 6, 7, 0, 7, 0, 0, 0, 0, 7, 7, 0, 7,
+    6, 7, 0, 7, 7, 8, 0, 7, 0, 0, 0, 0, 7, 8, 0, 8,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    6, 7, 0, 7, 7, 7, 0, 8, 0, 0, 0, 0, 7, 8, 0, 8,
+};
+
+static const uint16_t huff_spec_a22_codes[] = {
+    0x00, 0x02, 0x00, 0x03, 0x10, 0x3C, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x11, 0x3E, 0x00, 0x3D,
+    0x0E, 0x00, 0x00, 0x39, 0x18, 0x26, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x1B, 0x24, 0x00, 0x6D,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x0F, 0x38, 0x00, 0x01, 0x1A, 0x6C, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x19, 0x74, 0x00, 0x27,
+    0x16, 0x14, 0x00, 0x17, 0x76, 0x06, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00, 0x35, 0x64, 0x00, 0x6F,
+    0x26, 0x04, 0x00, 0x63, 0x22, 0xA2, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x67, 0xA0, 0x00, 0x0D,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x2B, 0x52, 0x00, 0x0B, 0x20, 0x92, 0x00, 0x91, 0x00, 0x00, 0x00, 0x00, 0x61, 0x0E, 0x00, 0x95,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x17, 0x16, 0x00, 0x15, 0x34, 0x6E, 0x00, 0x65, 0x00, 0x00, 0x00, 0x00, 0x77, 0x08, 0x00, 0x07,
+    0x2A, 0x0A, 0x00, 0x53, 0x60, 0x94, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x00, 0x21, 0x90, 0x00, 0x93,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x27, 0x62, 0x00, 0x05, 0x66, 0x0C, 0x00, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x23, 0x96, 0x00, 0xA3,
+};
+
+static const uint8_t huff_spec_a23_bits[] = {
+    3, 4, 0, 4, 5, 6, 0, 6, 0, 0, 0, 0, 5, 6, 0, 6,
+    5, 7, 0, 6, 6, 8, 0, 7, 0, 0, 0, 0, 6, 8, 0, 7,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    5, 6, 0, 7, 6, 7, 0, 8, 0, 0, 0, 0, 6, 7, 0, 8,
+    5, 6, 0, 6, 7, 8, 0, 8, 0, 0, 0, 0, 6, 7, 0, 7,
+    6, 8, 0, 7, 8, 9, 0, 9, 0, 0, 0, 0, 7, 9, 0, 8,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    6, 8, 0, 8, 8, 9, 0, 9, 0, 0, 0, 0, 7, 8, 0, 9,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    5, 6, 0, 6, 6, 7, 0, 7, 0, 0, 0, 0, 7, 8, 0, 8,
+    6, 8, 0, 8, 7, 9, 0, 8, 0, 0, 0, 0, 8, 9, 0, 9,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    6, 7, 0, 8, 7, 8, 0, 9, 0, 0, 0, 0, 8, 9, 0, 9,
+};
+
+static const uint16_t huff_spec_a23_codes[] = {
+    0x006, 0x002, 0x000, 0x003, 0x016, 0x01E, 0x000, 0x021, 0x000, 0x000, 0x000, 0x000,
+    0x017, 0x020, 0x000, 0x01F, 0x01C, 0x054, 0x000, 0x027, 0x010, 0x0A6, 0x000, 0x027,
+    0x000, 0x000, 0x000, 0x000, 0x015, 0x0A4, 0x000, 0x02D, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x01D, 0x026, 0x000, 0x055, 0x014, 0x02C, 0x000, 0x0A5, 0x000, 0x000, 0x000, 0x000,
+    0x011, 0x026, 0x000, 0x0A7, 0x01E, 0x000, 0x000, 0x003, 0x04A, 0x074, 0x000, 0x071,
+    0x000, 0x000, 0x000, 0x000, 0x023, 0x00A, 0x000, 0x009, 0x018, 0x072, 0x000, 0x00D,
+    0x0A2, 0x15A, 0x000, 0x123, 0x000, 0x000, 0x000, 0x000, 0x00F, 0x158, 0x000, 0x05D,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x01B, 0x0AE, 0x000, 0x077, 0x092, 0x140, 0x000, 0x121,
+    0x000, 0x000, 0x000, 0x000, 0x025, 0x05E, 0x000, 0x143, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x01F, 0x002, 0x000, 0x001, 0x022, 0x008, 0x000, 0x00B, 0x000, 0x000, 0x000, 0x000,
+    0x04B, 0x070, 0x000, 0x075, 0x01A, 0x076, 0x000, 0x0AF, 0x024, 0x142, 0x000, 0x05F,
+    0x000, 0x000, 0x000, 0x000, 0x093, 0x120, 0x000, 0x141, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x019, 0x00C, 0x000, 0x073, 0x00E, 0x05C, 0x000, 0x159, 0x000, 0x000, 0x000, 0x000,
+    0x0A3, 0x122, 0x000, 0x15B,
+};
+
+static const uint8_t huff_spec_a24_bits[] = {
+    2,  4,  0,  4,  5,  6,  0,  6,  0,  0,  0,  0,  5,  6,  0,  6,
+    5,  7,  0,  6,  6,  8,  0,  8,  0,  0,  0,  0,  6,  8,  0,  8,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    5,  6,  0,  7,  6,  8,  0,  8,  0,  0,  0,  0,  6,  8,  0,  8,
+    5,  7,  0,  7,  7,  9,  0,  9,  0,  0,  0,  0,  6,  8,  0,  8,
+    6,  9,  0,  8,  8, 10,  0, 10,  0,  0,  0,  0,  8, 10,  0,  9,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    6,  8,  0,  9,  9, 10,  0, 10,  0,  0,  0,  0,  8,  9,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    5,  7,  0,  7,  6,  8,  0,  8,  0,  0,  0,  0,  7,  9,  0,  9,
+    6,  9,  0,  8,  8, 10,  0,  9,  0,  0,  0,  0,  9, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    6,  8,  0,  9,  8,  9,  0, 10,  0,  0,  0,  0,  8, 10,  0, 10,
+};
+
+static const uint16_t huff_spec_a24_codes[] = {
+    0x002, 0x002, 0x000, 0x003, 0x01E, 0x010, 0x000, 0x013, 0x000, 0x000, 0x000, 0x000,
+    0x01F, 0x012, 0x000, 0x011, 0x01A, 0x030, 0x000, 0x01B, 0x000, 0x064, 0x000, 0x0C1,
+    0x000, 0x000, 0x000, 0x000, 0x003, 0x052, 0x000, 0x07D, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x01B, 0x01A, 0x000, 0x031, 0x002, 0x07C, 0x000, 0x053, 0x000, 0x000, 0x000, 0x000,
+    0x001, 0x0C0, 0x000, 0x065, 0x01C, 0x062, 0x000, 0x065, 0x02A, 0x198, 0x000, 0x19B,
+    0x000, 0x000, 0x000, 0x000, 0x017, 0x078, 0x000, 0x07B, 0x004, 0x0FE, 0x000, 0x077,
+    0x050, 0x33A, 0x000, 0x1F9, 0x000, 0x000, 0x000, 0x000, 0x073, 0x338, 0x000, 0x0E1,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x007, 0x066, 0x000, 0x187, 0x19E, 0x308, 0x000, 0x30B,
+    0x000, 0x000, 0x000, 0x000, 0x075, 0x0E2, 0x000, 0x1FB, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x01D, 0x064, 0x000, 0x063, 0x016, 0x07A, 0x000, 0x079, 0x000, 0x000, 0x000, 0x000,
+    0x02B, 0x19A, 0x000, 0x199, 0x006, 0x186, 0x000, 0x067, 0x074, 0x1FA, 0x000, 0x0E3,
+    0x000, 0x000, 0x000, 0x000, 0x19F, 0x30A, 0x000, 0x309, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x005, 0x076, 0x000, 0x0FF, 0x072, 0x0E0, 0x000, 0x339, 0x000, 0x000, 0x000, 0x000,
+    0x051, 0x1F8, 0x000, 0x33B,
+};
+
+static const uint8_t huff_spec_a31_bits[] = {
+    0, 0, 4, 5, 0, 5, 4, 0, 0, 0, 5, 5, 0, 5, 5, 0,
+    5, 5, 6, 6, 0, 6, 5, 5, 5, 6, 6, 7, 0, 7, 6, 6,
+    0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 7, 0, 7, 6, 6,
+    5, 5, 5, 6, 0, 6, 6, 5, 0, 0, 5, 5, 0, 5, 5, 0,
+};
+
+static const uint16_t huff_spec_a31_codes[] = {
+    0x00, 0x00, 0x02, 0x18, 0x00, 0x19, 0x03, 0x00, 0x00, 0x00, 0x12, 0x02, 0x00, 0x09, 0x15, 0x00,
+    0x1A, 0x0A, 0x3E, 0x2C, 0x00, 0x2F, 0x01, 0x0D, 0x0E, 0x38, 0x20, 0x78, 0x00, 0x7B, 0x23, 0x3B,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x3A, 0x22, 0x7A, 0x00, 0x79, 0x21, 0x39,
+    0x1B, 0x0C, 0x00, 0x2E, 0x00, 0x2D, 0x3F, 0x0B, 0x00, 0x00, 0x14, 0x08, 0x00, 0x03, 0x13, 0x00,
+};
+
+static const uint8_t huff_spec_a32_bits[] = {
+    4, 5, 5, 6, 0, 6, 5, 5, 5, 6, 5, 6, 0, 6, 5, 5,
+    5, 5, 6, 7, 0, 7, 6, 5, 6, 6, 7, 7, 0, 7, 7, 6,
+    0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 7, 7, 0, 7, 7, 6,
+    5, 5, 6, 7, 0, 7, 6, 5, 5, 5, 5, 6, 0, 6, 5, 6,
+};
+
+static const uint16_t huff_spec_a32_codes[] = {
+    0x0D, 0x18, 0x16, 0x3A, 0x00, 0x3B, 0x17, 0x19, 0x12, 0x3E, 0x08, 0x1C, 0x00, 0x1B, 0x07, 0x01,
+    0x10, 0x02, 0x28, 0x78, 0x00, 0x7B, 0x1F, 0x05, 0x2A, 0x16, 0x72, 0x2A, 0x00, 0x29, 0x71, 0x19,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2B, 0x18, 0x70, 0x28, 0x00, 0x2B, 0x73, 0x17,
+    0x11, 0x04, 0x1E, 0x7A, 0x00, 0x79, 0x29, 0x03, 0x13, 0x00, 0x06, 0x1A, 0x00, 0x1D, 0x09, 0x3F,
+};
+
+static const uint8_t huff_spec_a33_bits[] = {
+    3, 4, 5, 6, 0, 6, 5, 4, 4, 5, 6, 7, 0, 7, 6, 5,
+    5, 6, 6, 7, 0, 7, 6, 6, 6, 7, 8, 8, 0, 8, 8, 7,
+    0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 8, 0, 8, 8, 7,
+    5, 6, 6, 7, 0, 7, 6, 6, 4, 5, 6, 7, 0, 7, 6, 5,
+};
+
+static const uint16_t huff_spec_a33_codes[] = {
+    0x05, 0x06, 0x10, 0x08, 0x00, 0x09, 0x11, 0x07, 0x04, 0x12, 0x3E, 0x6A, 0x00, 0x6D, 0x3D, 0x19,
+    0x06, 0x3A, 0x06, 0x02, 0x00, 0x01, 0x05, 0x39, 0x02, 0x16, 0xDC, 0x2A, 0x00, 0x29, 0xDF, 0x69,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x68, 0xDE, 0x28, 0x00, 0x2B, 0xDD, 0x17,
+    0x07, 0x38, 0x04, 0x00, 0x00, 0x03, 0x07, 0x3B, 0x05, 0x18, 0x3C, 0x6C, 0x00, 0x6B, 0x3F, 0x13,
+};
+
+static const uint8_t huff_spec_a34_bits[] = {
+    2,  4,  5,  7,  0,  7,  5,  4,  4,  5,  6,  8,  0,  8,  6,  5,
+    5,  6,  7,  8,  0,  8,  7,  6,  7,  8,  8, 10,  0, 10,  9,  8,
+    0,  0,  0,  0,  0,  0,  0,  0,  7,  8,  9, 10,  0, 10,  8,  8,
+    5,  6,  7,  8,  0,  8,  7,  6,  4,  5,  6,  8,  0,  8,  6,  5,
+};
+
+static const uint16_t huff_spec_a34_codes[] = {
+    0x000, 0x00A, 0x00A, 0x034, 0x000, 0x035, 0x00B, 0x00B, 0x008, 0x01C, 0x032, 0x0DA,
+    0x000, 0x0DD, 0x035, 0x01F, 0x008, 0x01E, 0x03A, 0x06C, 0x000, 0x063, 0x039, 0x031,
+    0x032, 0x06E, 0x060, 0x37A, 0x000, 0x379, 0x1BF, 0x0D9, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x033, 0x0D8, 0x1BE, 0x378, 0x000, 0x37B, 0x061, 0x06F,
+    0x009, 0x030, 0x038, 0x062, 0x000, 0x06D, 0x03B, 0x01F, 0x009, 0x01E, 0x034, 0x0DC,
+    0x000, 0x0DB, 0x033, 0x01D,
+};
+
+static const uint8_t huff_spec_a41_bits[] = {
+    0, 0, 0, 0, 6, 6, 7, 7, 0, 7, 7, 6, 6, 0, 0, 0,
+    0, 0, 0, 0, 7, 7, 7, 7, 0, 7, 7, 7, 6, 0, 0, 0,
+    0, 0, 0, 0, 7, 7, 7, 8, 0, 8, 7, 7, 7, 0, 0, 0,
+    0, 0, 0, 0, 7, 7, 8, 8, 0, 8, 8, 7, 7, 0, 0, 0,
+    7, 7, 7, 8, 7, 8, 8, 8, 0, 8, 8, 8, 7, 8, 7, 7,
+    7, 7, 7, 7, 8, 8, 8, 9, 0, 8, 8, 8, 8, 7, 7, 7,
+    7, 7, 8, 8, 8, 8, 9, 9, 0, 9, 8, 8, 8, 8, 8, 7,
+    8, 8, 8, 8, 8, 9, 9, 9, 0, 9, 9, 9, 8, 8, 8, 8,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    8, 8, 8, 8, 8, 9, 9, 9, 0, 9, 9, 9, 8, 8, 8, 8,
+    7, 7, 8, 8, 8, 8, 8, 9, 0, 9, 9, 8, 8, 8, 8, 7,
+    7, 7, 7, 7, 8, 8, 8, 8, 0, 9, 8, 8, 8, 7, 7, 7,
+    7, 7, 7, 8, 7, 8, 8, 8, 0, 8, 8, 8, 7, 8, 7, 7,
+    0, 0, 0, 0, 7, 7, 8, 8, 0, 8, 8, 7, 7, 0, 0, 0,
+    0, 0, 0, 0, 7, 7, 7, 8, 0, 8, 7, 7, 7, 0, 0, 0,
+    0, 0, 0, 0, 6, 7, 7, 7, 0, 7, 7, 7, 7, 0, 0, 0,
+};
+
+static const uint16_t huff_spec_a41_codes[] = {
+    0x000, 0x000, 0x000, 0x000, 0x018, 0x00E, 0x05E, 0x028, 0x000, 0x029, 0x05F, 0x00F,
+    0x019, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x076, 0x06E, 0x03E, 0x004,
+    0x000, 0x017, 0x045, 0x07B, 0x013, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x04A, 0x048, 0x010, 0x0CE, 0x000, 0x0E1, 0x023, 0x055, 0x053, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x008, 0x018, 0x0D6, 0x09E, 0x000, 0x09D, 0x0E5, 0x02B,
+    0x01B, 0x000, 0x000, 0x000, 0x07C, 0x05C, 0x038, 0x0FC, 0x002, 0x0D2, 0x09A, 0x05C,
+    0x000, 0x06B, 0x0A3, 0x0D9, 0x00F, 0x0FF, 0x03D, 0x061, 0x074, 0x056, 0x036, 0x000,
+    0x0CC, 0x08C, 0x058, 0x1E2, 0x000, 0x00F, 0x05F, 0x0A1, 0x0D5, 0x00D, 0x03B, 0x059,
+    0x040, 0x014, 0x0DA, 0x0B6, 0x084, 0x040, 0x1E0, 0x196, 0x000, 0x1A1, 0x00D, 0x043,
+    0x087, 0x0C7, 0x0E3, 0x00B, 0x0F2, 0x0C4, 0x08E, 0x05A, 0x024, 0x1CC, 0x194, 0x168,
+    0x000, 0x16B, 0x1A3, 0x1CF, 0x027, 0x069, 0x099, 0x0C9, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x0F3, 0x0C8, 0x098, 0x068, 0x026, 0x1CE, 0x1A2, 0x16A, 0x000, 0x169, 0x195, 0x1CD,
+    0x025, 0x05B, 0x08F, 0x0C5, 0x041, 0x00A, 0x0E2, 0x0C6, 0x086, 0x042, 0x00C, 0x1A0,
+    0x000, 0x197, 0x1E1, 0x041, 0x085, 0x0B7, 0x0DB, 0x015, 0x075, 0x058, 0x03A, 0x00C,
+    0x0D4, 0x0A0, 0x05E, 0x00E, 0x000, 0x1E3, 0x059, 0x08D, 0x0CD, 0x001, 0x037, 0x057,
+    0x07D, 0x060, 0x03C, 0x0FE, 0x00E, 0x0D8, 0x0A2, 0x06A, 0x000, 0x05D, 0x09B, 0x0D3,
+    0x003, 0x0FD, 0x039, 0x05D, 0x000, 0x000, 0x000, 0x000, 0x01A, 0x02A, 0x0E4, 0x09C,
+    0x000, 0x09F, 0x0D7, 0x019, 0x009, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x052, 0x054, 0x022, 0x0E0, 0x000, 0x0CF, 0x011, 0x049, 0x04B, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x012, 0x07A, 0x044, 0x016, 0x000, 0x005, 0x03F, 0x06F,
+    0x077, 0x000, 0x000, 0x000,
+};
+
+static const uint8_t huff_spec_a42_bits[] = {
+    5,  6,  7,  7,  7,  7,  8,  8,  0,  8,  8,  7,  7,  7,  7,  6,
+    6,  7,  7,  8,  7,  7,  8,  8,  0,  8,  8,  7,  7,  8,  7,  7,
+    7,  7,  8,  8,  7,  8,  8,  9,  0,  9,  8,  8,  7,  8,  8,  7,
+    8,  8,  8,  8,  8,  8,  8,  9,  0,  9,  8,  8,  8,  8,  8,  8,
+    7,  7,  7,  8,  8,  8,  9,  9,  0,  9,  9,  8,  8,  8,  7,  7,
+    7,  7,  8,  8,  8,  9,  9,  9,  0,  9,  9,  9,  8,  8,  8,  7,
+    8,  8,  8,  8,  9,  9,  9, 10,  0, 10,  9,  9,  9,  8,  8,  8,
+    8,  8,  9,  9,  9,  9, 10, 10,  0, 10, 10,  9,  9,  9,  9,  9,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    8,  9,  9,  9,  9,  9, 10, 10,  0, 10, 10,  9,  9,  9,  9,  8,
+    8,  8,  8,  8,  9,  9,  9, 10,  0, 10,  9,  9,  9,  8,  8,  8,
+    7,  7,  8,  8,  8,  9,  9,  9,  0,  9,  9,  9,  8,  8,  8,  7,
+    7,  7,  7,  8,  8,  8,  9,  9,  0,  9,  9,  8,  8,  8,  7,  7,
+    8,  8,  8,  8,  8,  8,  8,  9,  0,  9,  8,  8,  8,  8,  8,  8,
+    7,  7,  8,  8,  7,  8,  8,  9,  0,  9,  8,  8,  7,  8,  8,  7,
+    6,  7,  7,  8,  7,  7,  8,  8,  0,  8,  8,  7,  7,  8,  7,  7,
+};
+
+static const uint16_t huff_spec_a42_codes[] = {
+    0x003, 0x018, 0x058, 0x000, 0x066, 0x03C, 0x0D6, 0x07C, 0x000, 0x07D, 0x0D7, 0x03D,
+    0x067, 0x001, 0x059, 0x019, 0x002, 0x064, 0x036, 0x0DA, 0x04C, 0x01C, 0x0BE, 0x02C,
+    0x000, 0x037, 0x0C5, 0x029, 0x04B, 0x0E7, 0x03B, 0x069, 0x044, 0x02E, 0x0FA, 0x092,
+    0x020, 0x0F8, 0x086, 0x1FC, 0x000, 0x1E7, 0x07F, 0x0F5, 0x023, 0x0AD, 0x0FD, 0x02D,
+    0x0F6, 0x0DC, 0x09C, 0x03E, 0x0F0, 0x0B6, 0x026, 0x186, 0x000, 0x18D, 0x02F, 0x0B5,
+    0x0E1, 0x03D, 0x0AF, 0x0D9, 0x054, 0x040, 0x014, 0x0EC, 0x0BC, 0x054, 0x1C6, 0x108,
+    0x000, 0x10B, 0x1C5, 0x069, 0x0B9, 0x0DF, 0x019, 0x047, 0x026, 0x008, 0x0E4, 0x0A2,
+    0x056, 0x1DC, 0x142, 0x06A, 0x000, 0x091, 0x123, 0x1DF, 0x04B, 0x0A7, 0x0EB, 0x00B,
+    0x0C0, 0x09E, 0x06A, 0x022, 0x1AA, 0x140, 0x092, 0x3CA, 0x000, 0x3A7, 0x04B, 0x121,
+    0x18F, 0x007, 0x071, 0x0A5, 0x020, 0x004, 0x1A8, 0x174, 0x0E4, 0x068, 0x3A4, 0x2EE,
+    0x000, 0x2ED, 0x3C9, 0x049, 0x0E7, 0x185, 0x1D1, 0x1FF, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x021, 0x1FE, 0x1D0, 0x184, 0x0E6, 0x048, 0x3C8, 0x2EC, 0x000, 0x2EF, 0x3A5, 0x069,
+    0x0E5, 0x175, 0x1A9, 0x005, 0x0C1, 0x0A4, 0x070, 0x006, 0x18E, 0x120, 0x04A, 0x3A6,
+    0x000, 0x3CB, 0x093, 0x141, 0x1AB, 0x023, 0x06B, 0x09F, 0x027, 0x00A, 0x0EA, 0x0A6,
+    0x04A, 0x1DE, 0x122, 0x090, 0x000, 0x06B, 0x143, 0x1DD, 0x057, 0x0A3, 0x0E5, 0x009,
+    0x055, 0x046, 0x018, 0x0DE, 0x0B8, 0x068, 0x1C4, 0x10A, 0x000, 0x109, 0x1C7, 0x055,
+    0x0BD, 0x0ED, 0x015, 0x041, 0x0F7, 0x0D8, 0x0AE, 0x03C, 0x0E0, 0x0B4, 0x02E, 0x18C,
+    0x000, 0x187, 0x027, 0x0B7, 0x0F1, 0x03F, 0x09D, 0x0DD, 0x045, 0x02C, 0x0FC, 0x0AC,
+    0x022, 0x0F4, 0x07E, 0x1E6, 0x000, 0x1FD, 0x087, 0x0F9, 0x021, 0x093, 0x0FB, 0x02F,
+    0x003, 0x068, 0x03A, 0x0E6, 0x04A, 0x028, 0x0C4, 0x036, 0x000, 0x02D, 0x0BF, 0x01D,
+    0x04D, 0x0DB, 0x037, 0x065,
+};
+
+static const uint8_t huff_spec_a43_bits[] = {
+    4,  6,  6,  7,  7,  8,  8,  9,  0,  9,  8,  8,  7,  7,  6,  6,
+    5,  6,  7,  7,  7,  8,  8,  9,  0,  9,  8,  8,  7,  7,  7,  6,
+    6,  7,  7,  7,  8,  8,  9,  9,  0,  9,  9,  8,  8,  7,  7,  7,
+    7,  7,  7,  8,  8,  8,  9, 10,  0, 10,  9,  9,  8,  8,  7,  7,
+    7,  7,  8,  8,  8,  9, 10, 10,  0, 10, 10,  9,  8,  8,  8,  7,
+    8,  8,  8,  9,  9,  9, 10, 10,  0, 10, 10,  9,  9,  9,  8,  8,
+    8,  9,  9,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  9,  9,
+    9,  9, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  9,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    9,  9, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  9,
+    8,  9,  9,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  9,  9,
+    8,  8,  8,  9,  9,  9, 10, 10,  0, 10, 10,  9,  9,  9,  8,  8,
+    7,  7,  8,  8,  8,  9, 10, 10,  0, 10, 10,  9,  8,  8,  8,  7,
+    7,  7,  7,  8,  8,  9,  9, 10,  0, 10,  9,  8,  8,  8,  7,  7,
+    6,  7,  7,  7,  8,  8,  9,  9,  0,  9,  9,  8,  8,  7,  7,  7,
+    5,  6,  7,  7,  7,  8,  8,  9,  0,  9,  8,  8,  7,  7,  7,  6,
+};
+
+static const uint16_t huff_spec_a43_codes[] = {
+    0x002, 0x03E, 0x016, 0x060, 0x04E, 0x0DC, 0x04A, 0x130, 0x000, 0x131, 0x04B, 0x0DD,
+    0x04F, 0x061, 0x017, 0x03F, 0x002, 0x02C, 0x076, 0x042, 0x034, 0x0CE, 0x002, 0x0E8,
+    0x000, 0x0CF, 0x001, 0x0D1, 0x037, 0x045, 0x07B, 0x02F, 0x014, 0x072, 0x052, 0x01A,
+    0x0E0, 0x080, 0x198, 0x01E, 0x000, 0x01D, 0x19B, 0x083, 0x0DF, 0x019, 0x055, 0x079,
+    0x050, 0x03C, 0x004, 0x0C4, 0x096, 0x00C, 0x0EA, 0x34A, 0x000, 0x34F, 0x0ED, 0x1D7,
+    0x095, 0x0AF, 0x003, 0x03F, 0x046, 0x026, 0x0D6, 0x092, 0x046, 0x15A, 0x3A8, 0x108,
+    0x000, 0x10F, 0x3A3, 0x135, 0x039, 0x091, 0x0D9, 0x031, 0x0D4, 0x0CA, 0x072, 0x1C6,
+    0x136, 0x090, 0x2B2, 0x104, 0x000, 0x103, 0x111, 0x08B, 0x133, 0x1D3, 0x071, 0x0C9,
+    0x03E, 0x1B4, 0x18C, 0x0CC, 0x38A, 0x2B0, 0x106, 0x0F2, 0x000, 0x0EF, 0x101, 0x113,
+    0x3A1, 0x0CB, 0x18F, 0x1B7, 0x0EE, 0x092, 0x388, 0x348, 0x10A, 0x0F4, 0x0F0, 0x0EA,
+    0x000, 0x0E9, 0x0ED, 0x0F7, 0x10D, 0x34D, 0x3AB, 0x0C9, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x0EF, 0x0C8, 0x3AA, 0x34C, 0x10C, 0x0F6, 0x0EC, 0x0E8, 0x000, 0x0EB, 0x0F1, 0x0F5,
+    0x10B, 0x349, 0x389, 0x093, 0x03F, 0x1B6, 0x18E, 0x0CA, 0x3A0, 0x112, 0x100, 0x0EE,
+    0x000, 0x0F3, 0x107, 0x2B1, 0x38B, 0x0CD, 0x18D, 0x1B5, 0x0D5, 0x0C8, 0x070, 0x1D2,
+    0x132, 0x08A, 0x110, 0x102, 0x000, 0x105, 0x2B3, 0x091, 0x137, 0x1C7, 0x073, 0x0CB,
+    0x047, 0x030, 0x0D8, 0x090, 0x038, 0x134, 0x3A2, 0x10E, 0x000, 0x109, 0x3A9, 0x15B,
+    0x047, 0x093, 0x0D7, 0x027, 0x051, 0x03E, 0x002, 0x0AE, 0x094, 0x1D6, 0x0EC, 0x34E,
+    0x000, 0x34B, 0x0EB, 0x00D, 0x097, 0x0C5, 0x005, 0x03D, 0x015, 0x078, 0x054, 0x018,
+    0x0DE, 0x082, 0x19A, 0x01C, 0x000, 0x01F, 0x199, 0x081, 0x0E1, 0x01B, 0x053, 0x073,
+    0x003, 0x02E, 0x07A, 0x044, 0x036, 0x0D0, 0x000, 0x0CE, 0x000, 0x0E9, 0x003, 0x0CF,
+    0x035, 0x043, 0x077, 0x02D,
+};
+
+static const uint8_t huff_spec_a44_bits[] = {
+    4,  5,  6,  7,  7,  8,  9, 10,  0, 10,  9,  8,  7,  7,  6,  5,
+    5,  6,  6,  7,  7,  8,  9, 10,  0, 10,  9,  8,  7,  7,  6,  6,
+    6,  6,  7,  7,  8,  9, 10, 10,  0, 10, 10,  9,  8,  7,  7,  6,
+    7,  7,  7,  8,  8,  9, 10, 10,  0, 10, 10,  9,  8,  8,  7,  7,
+    7,  8,  8,  8,  9, 10, 10, 10,  0, 10, 10, 10,  9,  8,  8,  7,
+    8,  8,  9,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  9,  8,
+    9,  9, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  9,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    9,  9, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  9,
+    8,  8,  9,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  9,  8,
+    7,  7,  8,  8,  9, 10, 10, 10,  0, 10, 10, 10,  9,  8,  8,  8,
+    7,  7,  7,  8,  8,  9, 10, 10,  0, 10, 10,  9,  8,  8,  7,  7,
+    6,  6,  7,  7,  8,  9, 10, 10,  0, 10, 10,  9,  8,  7,  7,  6,
+    5,  6,  6,  7,  7,  8,  9, 10,  0, 10,  9,  8,  7,  7,  6,  6,
+};
+
+static const uint16_t huff_spec_a44_codes[] = {
+    0x00A, 0x012, 0x030, 0x06E, 0x024, 0x074, 0x0EC, 0x07E, 0x000, 0x07F, 0x0ED, 0x075,
+    0x025, 0x06F, 0x031, 0x013, 0x010, 0x03C, 0x018, 0x05A, 0x002, 0x046, 0x09E, 0x07C,
+    0x000, 0x079, 0x0E5, 0x04D, 0x007, 0x065, 0x01B, 0x03F, 0x02E, 0x016, 0x072, 0x01A,
+    0x0D6, 0x1C6, 0x3B4, 0x066, 0x000, 0x06B, 0x3B7, 0x1D9, 0x0D5, 0x021, 0x075, 0x015,
+    0x06C, 0x03E, 0x01E, 0x0CC, 0x044, 0x0F2, 0x082, 0x05C, 0x000, 0x05F, 0x087, 0x0F5,
+    0x031, 0x0CF, 0x017, 0x059, 0x01C, 0x0EE, 0x0D0, 0x024, 0x1C0, 0x08E, 0x06E, 0x048,
+    0x000, 0x04D, 0x06D, 0x089, 0x0F7, 0x033, 0x0D3, 0x001, 0x070, 0x028, 0x1C2, 0x0F0,
+    0x08A, 0x074, 0x054, 0x040, 0x000, 0x043, 0x053, 0x073, 0x099, 0x0EF, 0x1C5, 0x02B,
+    0x0E6, 0x04E, 0x08C, 0x080, 0x068, 0x058, 0x046, 0x02A, 0x000, 0x029, 0x045, 0x051,
+    0x065, 0x085, 0x09B, 0x09D, 0x07A, 0x076, 0x060, 0x056, 0x04E, 0x02C, 0x024, 0x022,
+    0x000, 0x021, 0x027, 0x02F, 0x04B, 0x05B, 0x063, 0x071, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x07B, 0x070, 0x062, 0x05A, 0x04A, 0x02E, 0x026, 0x020, 0x000, 0x023, 0x025, 0x02D,
+    0x04F, 0x057, 0x061, 0x077, 0x0E7, 0x09C, 0x09A, 0x084, 0x064, 0x050, 0x044, 0x028,
+    0x000, 0x02B, 0x047, 0x059, 0x069, 0x081, 0x08D, 0x04F, 0x071, 0x02A, 0x1C4, 0x0EE,
+    0x098, 0x072, 0x052, 0x042, 0x000, 0x041, 0x055, 0x075, 0x08B, 0x0F1, 0x1C3, 0x029,
+    0x01D, 0x000, 0x0D2, 0x032, 0x0F6, 0x088, 0x06C, 0x04C, 0x000, 0x049, 0x06F, 0x08F,
+    0x1C1, 0x025, 0x0D1, 0x0EF, 0x06D, 0x058, 0x016, 0x0CE, 0x030, 0x0F4, 0x086, 0x05E,
+    0x000, 0x05D, 0x083, 0x0F3, 0x045, 0x0CD, 0x01F, 0x03F, 0x02F, 0x014, 0x074, 0x020,
+    0x0D4, 0x1D8, 0x3B6, 0x06A, 0x000, 0x067, 0x3B5, 0x1C7, 0x0D7, 0x01B, 0x073, 0x017,
+    0x011, 0x03E, 0x01A, 0x064, 0x006, 0x04C, 0x0E4, 0x078, 0x000, 0x07D, 0x09F, 0x047,
+    0x003, 0x05B, 0x019, 0x03D,
+};
+
+static const uint8_t huff_spec_a51_bits[] = {
+    5, 5, 5, 5, 5, 6, 6, 6, 4, 4, 5, 5, 5, 5, 5, 5,
+    0, 5, 5, 5, 5, 5, 5, 4, 4, 6, 6, 6, 5, 5, 5, 5,
+};
+
+static const uint16_t huff_spec_a51_codes[] = {
+    0x19, 0x16, 0x12, 0x0E, 0x06, 0x3A, 0x38, 0x30, 0x00, 0x04, 0x1E, 0x1A,
+    0x14, 0x10, 0x0C, 0x04, 0x00, 0x05, 0x0D, 0x11, 0x15, 0x1B, 0x1F, 0x05,
+    0x01, 0x31, 0x39, 0x3B, 0x07, 0x0F, 0x13, 0x17,
+};
+
+static const uint8_t huff_spec_a52_bits[] = {
+    4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
+    0, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4,
+};
+
+static const uint16_t huff_spec_a52_codes[] = {
+    0x09, 0x04, 0x00, 0x1E, 0x1A, 0x14, 0x0C, 0x06, 0x18, 0x16, 0x0E, 0x04,
+    0x3A, 0x38, 0x22, 0x20, 0x00, 0x21, 0x23, 0x39, 0x3B, 0x05, 0x0F, 0x17,
+    0x19, 0x07, 0x0D, 0x15, 0x1B, 0x1F, 0x01, 0x05,
+};
+
+static const uint8_t huff_spec_a53_bits[] = {
+    3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7,
+    0, 7, 7, 7, 7, 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4,
+};
+
+static const uint16_t huff_spec_a53_codes[] = {
+    0x00, 0x0C, 0x08, 0x04, 0x1E, 0x16, 0x14, 0x06, 0x0C, 0x04, 0x38, 0x1E,
+    0x76, 0x74, 0x3A, 0x38, 0x00, 0x39, 0x3B, 0x75, 0x77, 0x1F, 0x39, 0x05,
+    0x0D, 0x07, 0x15, 0x17, 0x1F, 0x05, 0x09, 0x0D,
+};
+
+static const uint8_t huff_spec_a54_bits[] = {
+    3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8,
+    0, 8, 8, 7, 7, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4,
+};
+
+static const uint16_t huff_spec_a54_codes[] = {
+    0x02, 0x0E, 0x0A, 0x08, 0x02, 0x1A, 0x0E, 0x02, 0x00, 0x30, 0x18, 0x66,
+    0x36, 0x34, 0xCA, 0xC8, 0x00, 0xC9, 0xCB, 0x35, 0x37, 0x67, 0x19, 0x31,
+    0x01, 0x03, 0x0F, 0x1B, 0x03, 0x09, 0x0B, 0x0F,
+};
+
+static const uint8_t huff_spec_a61_bits[] = {
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7,
+    5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5,
+    5, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
+};
+
+static const uint16_t huff_spec_a61_codes[] = {
+    0x35, 0x30, 0x2A, 0x28, 0x24, 0x20, 0x18, 0x0E, 0x0C, 0x7E, 0x7C, 0x72,
+    0x70, 0x68, 0x5E, 0x5C, 0x04, 0x0E, 0x08, 0x00, 0x3C, 0x3A, 0x36, 0x32,
+    0x2C, 0x26, 0x22, 0x1A, 0x16, 0x14, 0x06, 0x04, 0x00, 0x05, 0x07, 0x15,
+    0x17, 0x1B, 0x23, 0x27, 0x2D, 0x33, 0x37, 0x3B, 0x3D, 0x01, 0x09, 0x0F,
+    0x05, 0x5D, 0x5F, 0x69, 0x71, 0x73, 0x7D, 0x7F, 0x0D, 0x0F, 0x19, 0x21,
+    0x25, 0x29, 0x2B, 0x31,
+};
+
+static const uint8_t huff_spec_a62_bits[] = {
+    5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5,
+};
+
+static const uint16_t huff_spec_a62_codes[] = {
+    0x14, 0x0E, 0x08, 0x04, 0x02, 0x3E, 0x3C, 0x38, 0x34, 0x30, 0x2A, 0x24,
+    0x1A, 0x18, 0x0E, 0x02, 0x32, 0x36, 0x2C, 0x26, 0x20, 0x16, 0x0C, 0x00,
+    0x76, 0x74, 0x5E, 0x5C, 0x46, 0x44, 0x2A, 0x28, 0x00, 0x29, 0x2B, 0x45,
+    0x47, 0x5D, 0x5F, 0x75, 0x77, 0x01, 0x0D, 0x17, 0x21, 0x27, 0x2D, 0x37,
+    0x33, 0x03, 0x0F, 0x19, 0x1B, 0x25, 0x2B, 0x31, 0x35, 0x39, 0x3D, 0x3F,
+    0x03, 0x05, 0x09, 0x0F,
+};
+
+static const uint8_t huff_spec_a63_bits[] = {
+    4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,
+    0, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
+};
+
+static const uint16_t huff_spec_a63_codes[] = {
+    0x00, 0x1C, 0x18, 0x14, 0x10, 0x0A, 0x08, 0x02, 0x3E, 0x36, 0x2E, 0x2C,
+    0x24, 0x1C, 0x0E, 0x08, 0x1E, 0x1A, 0x0C, 0x7A, 0x6A, 0x68, 0x4C, 0x32,
+    0x16, 0x14, 0xF2, 0xF0, 0x9E, 0x9C, 0x62, 0x60, 0x00, 0x61, 0x63, 0x9D,
+    0x9F, 0xF1, 0xF3, 0x15, 0x17, 0x33, 0x4D, 0x69, 0x6B, 0x7B, 0x0D, 0x1B,
+    0x1F, 0x09, 0x0F, 0x1D, 0x25, 0x2D, 0x2F, 0x37, 0x3F, 0x03, 0x09, 0x0B,
+    0x11, 0x15, 0x19, 0x1D,
+};
+
+static const uint8_t huff_spec_a64_bits[] = {
+    4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7,
+    6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9,
+    0, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+    6, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4,
+};
+
+static const uint16_t huff_spec_a64_codes[] = {
+    0x006, 0x002, 0x01C, 0x01A, 0x016, 0x012, 0x00E, 0x00A, 0x002, 0x03E,
+    0x032, 0x02A, 0x022, 0x020, 0x010, 0x07A, 0x000, 0x078, 0x060, 0x050,
+    0x024, 0x006, 0x0C6, 0x0C4, 0x0A4, 0x04E, 0x00A, 0x008, 0x14E, 0x14C,
+    0x09A, 0x098, 0x000, 0x099, 0x09B, 0x14D, 0x14F, 0x009, 0x00B, 0x04F,
+    0x0A5, 0x0C5, 0x0C7, 0x007, 0x025, 0x051, 0x061, 0x079, 0x001, 0x07B,
+    0x011, 0x021, 0x023, 0x02B, 0x033, 0x03F, 0x003, 0x00B, 0x00F, 0x013,
+    0x017, 0x01B, 0x01D, 0x003,
+};
+
+static const uint8_t huff_spec_a71_bits[] = {
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+    6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+static const uint16_t huff_spec_a71_codes[] = {
+    0x6C, 0x66, 0x62, 0x5C, 0x56, 0x50, 0x52, 0x4E, 0x48, 0x3E, 0x36, 0x34, 0x2A, 0x26, 0x1E, 0x16,
+    0x0E, 0x08, 0x00, 0xF6, 0xF4, 0xEE, 0xEC, 0xE2, 0xE0, 0xDA, 0xD2, 0xD0, 0xBE, 0xBC, 0xB2, 0xB0,
+    0x0C, 0x20, 0x1C, 0x16, 0x10, 0x08, 0x02, 0x7E, 0x7C, 0x78, 0x74, 0x72, 0x6E, 0x6A, 0x64, 0x60,
+    0x5A, 0x54, 0x4C, 0x4A, 0x46, 0x44, 0x3C, 0x32, 0x30, 0x28, 0x24, 0x1C, 0x14, 0x0C, 0x0A, 0x02,
+    0x00, 0x03, 0x0B, 0x0D, 0x15, 0x1D, 0x25, 0x29, 0x31, 0x33, 0x3D, 0x45, 0x47, 0x4B, 0x4D, 0x55,
+    0x5B, 0x61, 0x65, 0x6B, 0x6F, 0x73, 0x75, 0x79, 0x7D, 0x7F, 0x03, 0x09, 0x11, 0x17, 0x1D, 0x21,
+    0x0D, 0xB1, 0xB3, 0xBD, 0xBF, 0xD1, 0xD3, 0xDB, 0xE1, 0xE3, 0xED, 0xEF, 0xF5, 0xF7, 0x01, 0x09,
+    0x0F, 0x17, 0x1F, 0x27, 0x2B, 0x35, 0x37, 0x3F, 0x49, 0x4F, 0x53, 0x51, 0x57, 0x5D, 0x63, 0x67,
+};
+
+static const uint8_t huff_spec_a72_bits[] = {
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
+};
+
+static const uint16_t huff_spec_a72_codes[] = {
+    0x2A, 0x24, 0x1C, 0x18, 0x12, 0x0E, 0x0A, 0x06, 0x02, 0x7E, 0x7C, 0x7A, 0x76, 0x72, 0x70, 0x6A,
+    0x68, 0x62, 0x5C, 0x5A, 0x52, 0x4E, 0x46, 0x42, 0x3C, 0x34, 0x2A, 0x28, 0x20, 0x12, 0x10, 0x08,
+    0x66, 0x74, 0x6C, 0x64, 0x5E, 0x58, 0x50, 0x44, 0x40, 0x36, 0x2C, 0x22, 0x1A, 0x0A, 0x02, 0x00,
+    0xF2, 0xF0, 0xDE, 0xDC, 0xC2, 0xC0, 0xAE, 0xAC, 0x9A, 0x98, 0x7E, 0x7C, 0x5E, 0x5C, 0x32, 0x30,
+    0x00, 0x31, 0x33, 0x5D, 0x5F, 0x7D, 0x7F, 0x99, 0x9B, 0xAD, 0xAF, 0xC1, 0xC3, 0xDD, 0xDF, 0xF1,
+    0xF3, 0x01, 0x03, 0x0B, 0x1B, 0x23, 0x2D, 0x37, 0x41, 0x45, 0x51, 0x59, 0x5F, 0x65, 0x6D, 0x75,
+    0x67, 0x09, 0x11, 0x13, 0x21, 0x29, 0x2B, 0x35, 0x3D, 0x43, 0x47, 0x4F, 0x53, 0x5B, 0x5D, 0x63,
+    0x69, 0x6B, 0x71, 0x73, 0x77, 0x7B, 0x7D, 0x7F, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x19, 0x1D, 0x25,
+};
+
+static const uint8_t huff_spec_a73_bits[] = {
+    5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+};
+
+static const uint16_t huff_spec_a73_codes[] = {
+    0x003, 0x03E, 0x038, 0x034, 0x030, 0x02C, 0x028, 0x024, 0x020, 0x01C, 0x016, 0x014,
+    0x00E, 0x00A, 0x004, 0x000, 0x07A, 0x076, 0x06E, 0x06C, 0x064, 0x05E, 0x056, 0x04E,
+    0x04C, 0x044, 0x036, 0x030, 0x022, 0x018, 0x012, 0x004, 0x03C, 0x03E, 0x032, 0x024,
+    0x020, 0x010, 0x0F2, 0x0F0, 0x0E8, 0x0CE, 0x0BA, 0x0B8, 0x0A8, 0x08C, 0x06A, 0x04E,
+    0x04C, 0x034, 0x00E, 0x00C, 0x1D6, 0x1D4, 0x19A, 0x198, 0x156, 0x154, 0x11E, 0x11C,
+    0x0D2, 0x0D0, 0x06E, 0x06C, 0x000, 0x06D, 0x06F, 0x0D1, 0x0D3, 0x11D, 0x11F, 0x155,
+    0x157, 0x199, 0x19B, 0x1D5, 0x1D7, 0x00D, 0x00F, 0x035, 0x04D, 0x04F, 0x06B, 0x08D,
+    0x0A9, 0x0B9, 0x0BB, 0x0CF, 0x0E9, 0x0F1, 0x0F3, 0x011, 0x021, 0x025, 0x033, 0x03F,
+    0x03D, 0x005, 0x013, 0x019, 0x023, 0x031, 0x037, 0x045, 0x04D, 0x04F, 0x057, 0x05F,
+    0x065, 0x06D, 0x06F, 0x077, 0x07B, 0x001, 0x005, 0x00B, 0x00F, 0x015, 0x017, 0x01D,
+    0x021, 0x025, 0x029, 0x02D, 0x031, 0x035, 0x039, 0x03F,
+};
+
+static const uint8_t huff_spec_a74_bits[] = {
+    5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+    6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,
+    7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,
+    9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,
+    9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+    7,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,
+};
+
+static const uint16_t huff_spec_a74_codes[] = {
+    0x00D, 0x00A, 0x004, 0x000, 0x03A, 0x036, 0x032, 0x030, 0x02C, 0x028, 0x026, 0x022,
+    0x01E, 0x018, 0x012, 0x00E, 0x006, 0x07E, 0x07A, 0x070, 0x06A, 0x05E, 0x056, 0x054,
+    0x048, 0x040, 0x038, 0x022, 0x01A, 0x00A, 0x0F8, 0x0E6, 0x008, 0x0FA, 0x0F0, 0x0D2,
+    0x0BA, 0x0B8, 0x094, 0x084, 0x074, 0x042, 0x032, 0x1E6, 0x1CA, 0x1C8, 0x1A2, 0x12E,
+    0x10E, 0x10C, 0x0EC, 0x082, 0x062, 0x060, 0x3CA, 0x3C8, 0x342, 0x340, 0x25A, 0x258,
+    0x1DE, 0x1DC, 0x102, 0x100, 0x000, 0x101, 0x103, 0x1DD, 0x1DF, 0x259, 0x25B, 0x341,
+    0x343, 0x3C9, 0x3CB, 0x061, 0x063, 0x083, 0x0ED, 0x10D, 0x10F, 0x12F, 0x1A3, 0x1C9,
+    0x1CB, 0x1E7, 0x033, 0x043, 0x075, 0x085, 0x095, 0x0B9, 0x0BB, 0x0D3, 0x0F1, 0x0FB,
+    0x009, 0x0E7, 0x0F9, 0x00B, 0x01B, 0x023, 0x039, 0x041, 0x049, 0x055, 0x057, 0x05F,
+    0x06B, 0x071, 0x07B, 0x07F, 0x007, 0x00F, 0x013, 0x019, 0x01F, 0x023, 0x027, 0x029,
+    0x02D, 0x031, 0x033, 0x037, 0x03B, 0x001, 0x005, 0x00B,
+};
+
+static const uint8_t huff_spec_b22_bits[] = {
+    0,  4,  0,  4,  4,  5,  0,  5,  0,  0,  0,  0,  4,  5,  0,  5,
+    4,  7,  0,  6,  6,  9,  0,  7,  0,  0,  0,  0,  6,  9,  0,  7,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    4,  6,  0,  7,  6,  7,  0,  9,  0,  0,  0,  0,  6,  7,  0,  9,
+    4,  8,  0,  8,  8, 10,  0, 10,  0,  0,  0,  0,  6,  9,  0,  9,
+    5, 10,  0,  9,  9, 10,  0, 10,  0,  0,  0,  0,  7, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    6,  9,  0, 10,  9, 10,  0, 10,  0,  0,  0,  0,  7, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    4,  8,  0,  8,  6,  9,  0,  9,  0,  0,  0,  0,  8, 10,  0, 10,
+    6, 10,  0,  9,  7, 10,  0, 10,  0,  0,  0,  0,  9, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    5,  9,  0, 10,  7, 10,  0, 10,  0,  0,  0,  0,  9, 10,  0, 10,
+};
+
+static const uint16_t huff_spec_b22_codes[] = {
+    0x000, 0x00E, 0x000, 0x00F, 0x008, 0x006, 0x000, 0x00B, 0x000, 0x000, 0x000, 0x000,
+    0x009, 0x00A, 0x000, 0x007, 0x006, 0x00A, 0x000, 0x029, 0x006, 0x158, 0x000, 0x023,
+    0x000, 0x000, 0x000, 0x000, 0x013, 0x174, 0x000, 0x021, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x007, 0x028, 0x000, 0x00B, 0x012, 0x020, 0x000, 0x175, 0x000, 0x000, 0x000, 0x000,
+    0x007, 0x022, 0x000, 0x159, 0x00C, 0x0BC, 0x000, 0x0BF, 0x022, 0x2B8, 0x000, 0x2BB,
+    0x000, 0x000, 0x000, 0x000, 0x00B, 0x170, 0x000, 0x15B, 0x000, 0x04E, 0x000, 0x15F,
+    0x042, 0x04A, 0x000, 0x041, 0x000, 0x000, 0x000, 0x000, 0x055, 0x044, 0x000, 0x04D,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x02D, 0x172, 0x000, 0x2ED, 0x040, 0x042, 0x000, 0x047,
+    0x000, 0x000, 0x000, 0x000, 0x013, 0x2EE, 0x000, 0x049, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x00D, 0x0BE, 0x000, 0x0BD, 0x00A, 0x15A, 0x000, 0x171, 0x000, 0x000, 0x000, 0x000,
+    0x023, 0x2BA, 0x000, 0x2B9, 0x02C, 0x2EC, 0x000, 0x173, 0x012, 0x048, 0x000, 0x2EF,
+    0x000, 0x000, 0x000, 0x000, 0x041, 0x046, 0x000, 0x043, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x001, 0x15E, 0x000, 0x04F, 0x054, 0x04C, 0x000, 0x045, 0x000, 0x000, 0x000, 0x000,
+    0x043, 0x040, 0x000, 0x04B,
+};
+
+static const uint8_t huff_spec_b23_bits[] = {
+    2,  4,  0,  4,  4,  6,  0,  6,  0,  0,  0,  0,  4,  6,  0,  6,
+    4,  9,  0,  7,  7,  9,  0,  8,  0,  0,  0,  0,  7,  9,  0,  8,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    4,  7,  0,  9,  7,  8,  0,  9,  0,  0,  0,  0,  7,  8,  0,  9,
+    4,  8,  0,  8,  9, 10,  0, 10,  0,  0,  0,  0,  7, 10,  0, 10,
+    7, 10,  0, 10, 10, 10,  0, 10,  0,  0,  0,  0,  9, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    7, 10,  0, 10, 10, 10,  0, 10,  0,  0,  0,  0,  8, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    4,  8,  0,  8,  7, 10,  0, 10,  0,  0,  0,  0,  9, 10,  0, 10,
+    7, 10,  0, 10,  8, 10,  0, 10,  0,  0,  0,  0, 10, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    7, 10,  0, 10,  9, 10,  0, 10,  0,  0,  0,  0, 10, 10,  0, 10,
+};
+
+static const uint16_t huff_spec_b23_codes[] = {
+    0x003, 0x008, 0x000, 0x009, 0x002, 0x018, 0x000, 0x01B, 0x000, 0x000, 0x000, 0x000,
+    0x003, 0x01A, 0x000, 0x019, 0x000, 0x17C, 0x000, 0x055, 0x056, 0x0E8, 0x000, 0x07D,
+    0x000, 0x000, 0x000, 0x000, 0x059, 0x0F6, 0x000, 0x07F, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x001, 0x054, 0x000, 0x17D, 0x058, 0x07E, 0x000, 0x0F7, 0x000, 0x000, 0x000, 0x000,
+    0x057, 0x07C, 0x000, 0x0E9, 0x004, 0x0A2, 0x000, 0x0A1, 0x17A, 0x1DA, 0x000, 0x1D9,
+    0x000, 0x000, 0x000, 0x000, 0x053, 0x1E8, 0x000, 0x2F3, 0x05C, 0x1D6, 0x000, 0x1E7,
+    0x1EA, 0x1E2, 0x000, 0x1CF, 0x000, 0x000, 0x000, 0x000, 0x17F, 0x1CA, 0x000, 0x1DD,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x05B, 0x2F0, 0x000, 0x1DF, 0x1E4, 0x1CC, 0x000, 0x1D5,
+    0x000, 0x000, 0x000, 0x000, 0x071, 0x1E0, 0x000, 0x1C9, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x005, 0x0A0, 0x000, 0x0A3, 0x052, 0x2F2, 0x000, 0x1E9, 0x000, 0x000, 0x000, 0x000,
+    0x17B, 0x1D8, 0x000, 0x1DB, 0x05A, 0x1DE, 0x000, 0x2F1, 0x070, 0x1C8, 0x000, 0x1E1,
+    0x000, 0x000, 0x000, 0x000, 0x1E5, 0x1D4, 0x000, 0x1CD, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x05D, 0x1E6, 0x000, 0x1D7, 0x17E, 0x1DC, 0x000, 0x1CB, 0x000, 0x000, 0x000, 0x000,
+    0x1EB, 0x1CE, 0x000, 0x1E3,
+};
+
+static const uint8_t huff_spec_b24_bits[] = {
+    1,  4,  0,  4,  5,  7,  0,  7,  0,  0,  0,  0,  5,  7,  0,  7,
+    5,  9,  0,  7,  8, 10,  0,  9,  0,  0,  0,  0,  7, 10,  0,  9,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    5,  7,  0,  9,  7,  9,  0, 10,  0,  0,  0,  0,  8,  9,  0, 10,
+    5,  9,  0,  8,  9, 10,  0, 10,  0,  0,  0,  0,  7, 10,  0, 10,
+    7, 10,  0, 10, 10, 10,  0, 10,  0,  0,  0,  0, 10, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    7, 10,  0, 10, 10, 10,  0, 10,  0,  0,  0,  0, 10, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    5,  8,  0,  9,  7, 10,  0, 10,  0,  0,  0,  0,  9, 10,  0, 10,
+    7, 10,  0, 10, 10, 10,  0, 10,  0,  0,  0,  0, 10, 10,  0, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    7, 10,  0, 10, 10, 10,  0, 10,  0,  0,  0,  0, 10, 10,  0, 10,
+};
+
+static const uint16_t huff_spec_b24_codes[] = {
+    0x001, 0x000, 0x000, 0x001, 0x00A, 0x01C, 0x000, 0x033, 0x000, 0x000, 0x000, 0x000,
+    0x00B, 0x032, 0x000, 0x01D, 0x008, 0x0D8, 0x000, 0x031, 0x06E, 0x0FA, 0x000, 0x0D7,
+    0x000, 0x000, 0x000, 0x000, 0x011, 0x0F4, 0x000, 0x0D5, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x009, 0x030, 0x000, 0x0D9, 0x010, 0x0D4, 0x000, 0x0F5, 0x000, 0x000, 0x000, 0x000,
+    0x06F, 0x0D6, 0x000, 0x0FB, 0x00E, 0x0DA, 0x000, 0x025, 0x0D2, 0x0D4, 0x000, 0x0DB,
+    0x000, 0x000, 0x000, 0x000, 0x017, 0x0FE, 0x000, 0x0FD, 0x014, 0x0DC, 0x000, 0x0F9,
+    0x0F2, 0x0D6, 0x000, 0x09B, 0x000, 0x000, 0x000, 0x000, 0x1A3, 0x09C, 0x000, 0x0D3,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x019, 0x0F6, 0x000, 0x0D9, 0x0F0, 0x09E, 0x000, 0x0D1,
+    0x000, 0x000, 0x000, 0x000, 0x1A1, 0x0DE, 0x000, 0x099, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x00F, 0x024, 0x000, 0x0DB, 0x016, 0x0FC, 0x000, 0x0FF, 0x000, 0x000, 0x000, 0x000,
+    0x0D3, 0x0DA, 0x000, 0x0D5, 0x018, 0x0D8, 0x000, 0x0F7, 0x1A0, 0x098, 0x000, 0x0DF,
+    0x000, 0x000, 0x000, 0x000, 0x0F1, 0x0D0, 0x000, 0x09F, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x015, 0x0F8, 0x000, 0x0DD, 0x1A2, 0x0D2, 0x000, 0x09D, 0x000, 0x000, 0x000, 0x000,
+    0x0F3, 0x09A, 0x000, 0x0D7
+};
+
+static const uint8_t huff_spec_b32_bits[] = {
+    2, 4, 5, 6, 0, 6, 5, 4, 5, 6, 6, 7, 0, 6, 5, 6,
+    5, 6, 7, 7, 0, 8, 7, 6, 6, 7, 8, 9, 0, 9, 8, 7,
+    0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 0, 9, 8, 7,
+    5, 6, 7, 8, 0, 7, 7, 6, 5, 6, 5, 6, 0, 7, 6, 6,
+};
+
+static const uint16_t huff_spec_b32_codes[] = {
+    0x001, 0x002, 0x01E, 0x02A, 0x000, 0x02B, 0x01F, 0x003, 0x016, 0x020, 0x03A, 0x064,
+    0x000, 0x005, 0x001, 0x023, 0x01A, 0x026, 0x070, 0x00C, 0x000, 0x0CF, 0x073, 0x031,
+    0x024, 0x00E, 0x0CC, 0x146, 0x000, 0x145, 0x0A1, 0x053, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x025, 0x052, 0x0A0, 0x144, 0x000, 0x147, 0x0CD, 0x00F,
+    0x01B, 0x030, 0x072, 0x0CE, 0x000, 0x00D, 0x071, 0x027, 0x017, 0x022, 0x000, 0x004,
+    0x000, 0x065, 0x03B, 0x021,
+};
+
+static const uint8_t huff_spec_b33_bits[] = {
+    2,  4,  5,  7,  0,  7,  5,  4,  4,  5,  6,  8,  0,  7,  6,  5,
+    5,  6,  7,  9,  0,  8,  7,  6,  7,  8,  9, 10,  0, 10,  9,  8,
+    0,  0,  0,  0,  0,  0,  0,  0,  7,  8,  9, 10,  0, 10,  9,  8,
+    5,  6,  7,  8,  0,  9,  7,  6,  4,  5,  6,  7,  0,  8,  6,  5,
+};
+
+static const uint16_t huff_spec_b33_codes[] = {
+    0x003, 0x008, 0x014, 0x05E, 0x000, 0x05F, 0x015, 0x009, 0x004, 0x002, 0x01C, 0x0BA,
+    0x000, 0x011, 0x01F, 0x001, 0x00C, 0x00C, 0x014, 0x166, 0x000, 0x02D, 0x013, 0x00F,
+    0x05A, 0x0B0, 0x05E, 0x0B8, 0x000, 0x0BB, 0x165, 0x0B9, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x05B, 0x0B8, 0x164, 0x0BA, 0x000, 0x0B9, 0x05F, 0x0B1,
+    0x00D, 0x00E, 0x012, 0x02C, 0x000, 0x167, 0x015, 0x00D, 0x005, 0x000, 0x01E, 0x010,
+    0x000, 0x0BB, 0x01D, 0x003
+};
+
+static const uint8_t huff_spec_b34_bits[] = {
+    1,  4,  6,  8,  0,  8,  6,  4,  4,  6,  7,  9,  0,  8,  7,  6,
+    6,  7,  8, 10,  0, 10,  8,  7,  8,  9, 10, 10,  0, 10, 10,  9,
+    0,  0,  0,  0,  0,  0,  0,  0,  8,  9, 10, 10,  0, 10, 10,  9,
+    6,  7,  8, 10,  0, 10,  8,  7,  4,  6,  7,  8,  0,  9,  7,  6,
+};
+
+static const uint16_t huff_spec_b34_codes[] = {
+    0x000, 0x00A, 0x038, 0x0EE, 0x000, 0x0EF, 0x039, 0x00B, 0x008, 0x03C, 0x06E, 0x1D8,
+    0x000, 0x0C1, 0x075, 0x03F, 0x032, 0x068, 0x0C4, 0x358, 0x000, 0x30F, 0x0C7, 0x06D,
+    0x0D4, 0x1AE, 0x30C, 0x308, 0x000, 0x30B, 0x35B, 0x1DB, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x0D5, 0x1DA, 0x35A, 0x30A, 0x000, 0x309, 0x30D, 0x1AF,
+    0x033, 0x06C, 0x0C6, 0x30E, 0x000, 0x359, 0x0C5, 0x069, 0x009, 0x03E, 0x074, 0x0C0,
+    0x000, 0x1D9, 0x06F, 0x03D,
+};
+
+static const uint8_t huff_spec_b42_bits[] = {
+    4,  5,  6,  8,  6,  7,  8,  8,  0,  8,  8,  7,  6,  8,  6,  5,
+    5,  6,  7,  8,  7,  7,  8,  9,  0,  8,  8,  7,  7,  8,  7,  6,
+    7,  7,  8,  9,  7,  8,  9,  9,  0,  9,  9,  8,  7,  9,  8,  7,
+    8,  9,  9, 10,  8,  8,  9, 10,  0, 10,  9,  8,  8, 10,  9,  8,
+    6,  7,  8,  8,  9,  9, 10, 10,  0, 10, 10,  9,  9,  8,  8,  7,
+    7,  7,  8,  9,  9, 10, 10, 10,  0, 10, 10, 10,  9,  9,  8,  7,
+    8,  8,  9,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  9,  8,
+    8,  9,  9, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10,  9,  9,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    8,  9,  9, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10,  9,  9,
+    8,  8,  9,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  9,  8,
+    7,  7,  8,  9,  9, 10, 10, 10,  0, 10, 10, 10,  9,  9,  8,  7,
+    6,  7,  8,  8,  9,  9, 10, 10,  0, 10, 10,  9,  9,  8,  8,  7,
+    8,  8,  9, 10,  8,  8,  9, 10,  0, 10,  9,  8,  8, 10,  9,  9,
+    7,  7,  8,  9,  7,  8,  9,  9,  0,  9,  9,  8,  7,  9,  8,  7,
+    5,  6,  7,  8,  7,  7,  8,  8,  0,  9,  8,  7,  7,  8,  7,  6,
+};
+
+static const uint16_t huff_spec_b42_codes[] = {
+    0x00E, 0x018, 0x010, 0x0F0, 0x024, 0x05A, 0x0F6, 0x078, 0x000, 0x079, 0x0F7, 0x05B,
+    0x025, 0x0F1, 0x011, 0x019, 0x00C, 0x014, 0x01C, 0x036, 0x05C, 0x012, 0x09E, 0x1E4,
+    0x000, 0x00B, 0x0A9, 0x03B, 0x05F, 0x071, 0x019, 0x017, 0x06E, 0x000, 0x03E, 0x114,
+    0x002, 0x0B0, 0x1AA, 0x07A, 0x000, 0x099, 0x1E7, 0x0B3, 0x00B, 0x131, 0x07F, 0x00D,
+    0x0D8, 0x1FE, 0x112, 0x22E, 0x086, 0x010, 0x134, 0x35C, 0x000, 0x35F, 0x133, 0x013,
+    0x081, 0x22D, 0x119, 0x07B, 0x00A, 0x050, 0x0F8, 0x04E, 0x1B4, 0x154, 0x3EC, 0x0D2,
+    0x000, 0x0D7, 0x3D7, 0x137, 0x1FD, 0x073, 0x0FD, 0x057, 0x052, 0x010, 0x08E, 0x1E8,
+    0x11A, 0x3EE, 0x0F2, 0x03C, 0x000, 0x03F, 0x0F1, 0x3D5, 0x111, 0x1F5, 0x09D, 0x025,
+    0x0D2, 0x082, 0x1A0, 0x0F8, 0x36E, 0x0D4, 0x072, 0x03A, 0x000, 0x027, 0x071, 0x07D,
+    0x36D, 0x0FB, 0x1AD, 0x085, 0x00C, 0x1A8, 0x03C, 0x346, 0x0D0, 0x076, 0x024, 0x020,
+    0x000, 0x023, 0x039, 0x075, 0x07F, 0x345, 0x09B, 0x157, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x00D, 0x156, 0x09A, 0x344, 0x07E, 0x074, 0x038, 0x022, 0x000, 0x021, 0x025, 0x077,
+    0x0D1, 0x347, 0x03D, 0x1A9, 0x0D3, 0x084, 0x1AC, 0x0FA, 0x36C, 0x07C, 0x070, 0x026,
+    0x000, 0x03B, 0x073, 0x0D5, 0x36F, 0x0F9, 0x1A1, 0x083, 0x053, 0x024, 0x09C, 0x1F4,
+    0x110, 0x3D4, 0x0F0, 0x03E, 0x000, 0x03D, 0x0F3, 0x3EF, 0x11B, 0x1E9, 0x08F, 0x011,
+    0x00B, 0x056, 0x0FC, 0x072, 0x1FC, 0x136, 0x3D6, 0x0D6, 0x000, 0x0D3, 0x3ED, 0x155,
+    0x1B5, 0x04F, 0x0F9, 0x051, 0x0D9, 0x07A, 0x118, 0x22C, 0x080, 0x012, 0x132, 0x35E,
+    0x000, 0x35D, 0x135, 0x011, 0x087, 0x22F, 0x113, 0x1FF, 0x06F, 0x00C, 0x07E, 0x130,
+    0x00A, 0x0B2, 0x1E6, 0x098, 0x000, 0x07B, 0x1AB, 0x0B1, 0x003, 0x115, 0x03F, 0x001,
+    0x00D, 0x016, 0x018, 0x070, 0x05E, 0x03A, 0x0A8, 0x00A, 0x000, 0x1E5, 0x09F, 0x013,
+    0x05D, 0x037, 0x01D, 0x015,
+};
+
+static const uint8_t huff_spec_b43_bits[] = {
+    2,  5,  6,  7,  7,  8,  8,  9,  0,  9,  8,  8,  7,  7,  6,  5,
+    5,  6,  7,  8,  7,  8,  9, 10,  0, 10,  9,  8,  7,  8,  7,  6,
+    6,  7,  8,  9,  8,  9, 10, 10,  0, 10, 10,  9,  8,  9,  8,  7,
+    7,  8,  9, 10,  9,  9, 10, 10,  0, 10, 10, 10,  9, 10,  9,  8,
+    7,  8,  8,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  8,  7,
+    8,  8,  9, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10,  9,  8,
+    9,  9, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  9,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    9,  9, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  9,
+    8,  8,  9, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10,  9,  8,
+    7,  7,  8,  9, 10, 10, 10, 10,  0, 10, 10, 10, 10,  9,  8,  8,
+    7,  8,  9, 10,  9, 10, 10, 10,  0, 10, 10,  9,  9, 10,  9,  8,
+    6,  7,  8,  9,  8,  9, 10, 10,  0, 10, 10,  9,  8,  9,  8,  7,
+    5,  6,  7,  8,  7,  8,  9, 10,  0, 10,  9,  8,  7,  8,  7,  6,
+};
+
+static const uint16_t huff_spec_b43_codes[] = {
+    0x001, 0x01E, 0x022, 0x018, 0x064, 0x0EC, 0x008, 0x100, 0x000, 0x101, 0x009, 0x0ED,
+    0x065, 0x019, 0x023, 0x01F, 0x01A, 0x030, 0x056, 0x09A, 0x00A, 0x090, 0x12C, 0x0A6,
+    0x000, 0x0A9, 0x12F, 0x093, 0x00F, 0x09F, 0x059, 0x039, 0x00E, 0x054, 0x0BC, 0x19E,
+    0x082, 0x176, 0x0AC, 0x088, 0x000, 0x08B, 0x0AF, 0x19D, 0x095, 0x1D1, 0x0BF, 0x051,
+    0x002, 0x098, 0x1D4, 0x0B8, 0x170, 0x046, 0x090, 0x060, 0x000, 0x067, 0x095, 0x0BD,
+    0x173, 0x0B5, 0x1D3, 0x09D, 0x052, 0x0EE, 0x034, 0x174, 0x0BA, 0x09C, 0x080, 0x044,
+    0x000, 0x047, 0x06D, 0x099, 0x0BF, 0x16F, 0x085, 0x001, 0x0CC, 0x036, 0x16C, 0x0B0,
+    0x09A, 0x084, 0x04E, 0x03E, 0x000, 0x037, 0x04B, 0x06B, 0x0A1, 0x0B3, 0x16B, 0x087,
+    0x1D6, 0x102, 0x0A4, 0x092, 0x068, 0x04C, 0x034, 0x030, 0x000, 0x02D, 0x03D, 0x049,
+    0x083, 0x097, 0x0AB, 0x169, 0x0B6, 0x09E, 0x06E, 0x064, 0x040, 0x038, 0x02E, 0x02A,
+    0x000, 0x029, 0x033, 0x03B, 0x043, 0x063, 0x087, 0x0A3, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x0B7, 0x0A2, 0x086, 0x062, 0x042, 0x03A, 0x032, 0x028, 0x000, 0x02B, 0x02F, 0x039,
+    0x041, 0x065, 0x06F, 0x09F, 0x1D7, 0x168, 0x0AA, 0x096, 0x082, 0x048, 0x03C, 0x02C,
+    0x000, 0x031, 0x035, 0x04D, 0x069, 0x093, 0x0A5, 0x103, 0x0CD, 0x086, 0x16A, 0x0B2,
+    0x0A0, 0x06A, 0x04A, 0x036, 0x000, 0x03F, 0x04F, 0x085, 0x09B, 0x0B1, 0x16D, 0x037,
+    0x053, 0x000, 0x084, 0x16E, 0x0BE, 0x098, 0x06C, 0x046, 0x000, 0x045, 0x081, 0x09D,
+    0x0BB, 0x175, 0x035, 0x0EF, 0x003, 0x09C, 0x1D2, 0x0B4, 0x172, 0x0BC, 0x094, 0x066,
+    0x000, 0x061, 0x091, 0x047, 0x171, 0x0B9, 0x1D5, 0x099, 0x00F, 0x050, 0x0BE, 0x1D0,
+    0x094, 0x19C, 0x0AE, 0x08A, 0x000, 0x089, 0x0AD, 0x177, 0x083, 0x19F, 0x0BD, 0x055,
+    0x01B, 0x038, 0x058, 0x09E, 0x00E, 0x092, 0x12E, 0x0A8, 0x000, 0x0A7, 0x12D, 0x091,
+    0x00B, 0x09B, 0x057, 0x031,
+};
+
+static const uint8_t huff_spec_b44_bits[] = {
+    2,  4,  6,  7,  7,  8, 10, 10,  0, 10, 10,  8,  7,  7,  6,  4,
+    5,  5,  7,  8,  8, 10, 10, 10,  0, 10, 10, 10,  8,  8,  7,  5,
+    6,  7,  8,  9,  9, 10, 10, 10,  0, 10, 10, 10, 10,  9,  8,  7,
+    8,  8,  9, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  8,
+    8,  8, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  8,
+    9, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    9, 10, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10, 10,
+    8,  8, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10, 10,  8,
+    8,  8, 10, 10, 10, 10, 10, 10,  0, 10, 10, 10, 10, 10,  9,  8,
+    6,  7,  8,  9, 10, 10, 10, 10,  0, 10, 10, 10,  9,  9,  8,  7,
+    5,  5,  7,  8,  8, 10, 10, 10,  0, 10, 10, 10,  8,  8,  7,  5,
+};
+
+static const uint16_t huff_spec_b44_codes[] = {
+    0x002, 0x002, 0x030, 0x000, 0x002, 0x00C, 0x1D2, 0x1AE, 0x000, 0x1AF, 0x1D3, 0x00D,
+    0x003, 0x001, 0x031, 0x003, 0x01E, 0x002, 0x070, 0x0C8, 0x07E, 0x1E8, 0x1C0, 0x176,
+    0x000, 0x17F, 0x1C3, 0x1EB, 0x0CF, 0x0D3, 0x073, 0x009, 0x018, 0x06A, 0x0EC, 0x1DE,
+    0x1A2, 0x1CA, 0x1AA, 0x164, 0x000, 0x16D, 0x1AD, 0x1D1, 0x1EF, 0x1DD, 0x0EB, 0x06D,
+    0x0E8, 0x0CA, 0x1BE, 0x1CE, 0x1DA, 0x1B6, 0x170, 0x154, 0x000, 0x153, 0x173, 0x1B1,
+    0x1D7, 0x1D5, 0x343, 0x0CD, 0x0DC, 0x078, 0x340, 0x1CC, 0x1BA, 0x1A8, 0x156, 0x148,
+    0x000, 0x145, 0x15F, 0x1A1, 0x1BD, 0x1D9, 0x1ED, 0x07D, 0x1BC, 0x1DC, 0x1C4, 0x1B2,
+    0x17C, 0x15A, 0x14A, 0x03A, 0x000, 0x039, 0x147, 0x16B, 0x17B, 0x1B5, 0x1C9, 0x1DF,
+    0x1C6, 0x1B8, 0x1A2, 0x168, 0x160, 0x14C, 0x02E, 0x024, 0x000, 0x027, 0x03D, 0x151,
+    0x15D, 0x16F, 0x1A7, 0x1BF, 0x1A4, 0x174, 0x162, 0x14E, 0x140, 0x02C, 0x02A, 0x022,
+    0x000, 0x021, 0x029, 0x03F, 0x143, 0x159, 0x167, 0x179, 0x000, 0x000, 0x000, 0x000,
+    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
+    0x1A5, 0x178, 0x166, 0x158, 0x142, 0x03E, 0x028, 0x020, 0x000, 0x023, 0x02B, 0x02D,
+    0x141, 0x14F, 0x163, 0x175, 0x1C7, 0x1BE, 0x1A6, 0x16E, 0x15C, 0x150, 0x03C, 0x026,
+    0x000, 0x025, 0x02F, 0x14D, 0x161, 0x169, 0x1A3, 0x1B9, 0x1BD, 0x1DE, 0x1C8, 0x1B4,
+    0x17A, 0x16A, 0x146, 0x038, 0x000, 0x03B, 0x14B, 0x15B, 0x17D, 0x1B3, 0x1C5, 0x1DD,
+    0x0DD, 0x07C, 0x1EC, 0x1D8, 0x1BC, 0x1A0, 0x15E, 0x144, 0x000, 0x149, 0x157, 0x1A9,
+    0x1BB, 0x1CD, 0x341, 0x079, 0x0E9, 0x0CC, 0x342, 0x1D4, 0x1D6, 0x1B0, 0x172, 0x152,
+    0x000, 0x155, 0x171, 0x1B7, 0x1DB, 0x1CF, 0x1BF, 0x0CB, 0x019, 0x06C, 0x0EA, 0x1DC,
+    0x1EE, 0x1D0, 0x1AC, 0x16C, 0x000, 0x165, 0x1AB, 0x1CB, 0x1A3, 0x1DF, 0x0ED, 0x06B,
+    0x01F, 0x008, 0x072, 0x0D2, 0x0CE, 0x1EA, 0x1C2, 0x17E, 0x000, 0x177, 0x1C1, 0x1E9,
+    0x07F, 0x0C9, 0x071, 0x003,
+};
+
+static const uint8_t huff_spec_b52_bits[] = {
+    3, 4, 4, 4, 5, 5, 6, 6, 5, 5, 5, 6, 6, 6, 7, 7,
+    0, 7, 7, 6, 6, 6, 5, 5, 5, 6, 6, 5, 5, 4, 4, 4,
+};
+
+static const uint16_t huff_spec_b52_codes[] = {
+    0x06, 0x0E, 0x06, 0x00, 0x0A, 0x04, 0x2C, 0x12, 0x14, 0x10, 0x06, 0x2E, 0x24, 0x10, 0x4E, 0x4C,
+    0x00, 0x4D, 0x4F, 0x11, 0x25, 0x2F, 0x07, 0x11, 0x15, 0x13, 0x2D, 0x05, 0x0B, 0x01, 0x07, 0x0F,
+};
+
+static const uint8_t huff_spec_b53_bits[] = {
+    2, 3, 4, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8,
+    0, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 4, 3,
+};
+
+static const uint16_t huff_spec_b53_codes[] = {
+    0x02, 0x00, 0x06, 0x1C, 0x18, 0x3E, 0x16, 0x10, 0x3C, 0x36, 0x14, 0x6A, 0x26, 0x24, 0xD2, 0xD0,
+    0x00, 0xD1, 0xD3, 0x25, 0x27, 0x6B, 0x15, 0x37, 0x3D, 0x11, 0x17, 0x3F, 0x19, 0x1D, 0x07, 0x01,
+};
+
+static const uint8_t huff_spec_b54_bits[] = {
+    2, 3, 4, 4, 5, 6, 6, 7, 6, 6, 7, 8, 8, 8, 9, 9,
+    0, 9, 9, 8, 8, 8, 7, 6, 6, 7, 6, 6, 5, 4, 4, 3,
+};
+
+static const uint16_t huff_spec_b54_codes[] = {
+    0x003, 0x002, 0x008, 0x000, 0x014, 0x02E, 0x00E, 0x05A, 0x00A, 0x008, 0x01A, 0x0B2,
+    0x032, 0x030, 0x162, 0x160, 0x000, 0x161, 0x163, 0x031, 0x033, 0x0B3, 0x01B, 0x009,
+    0x00B, 0x05B, 0x00F, 0x02F, 0x015, 0x001, 0x009, 0x003,
+};
+
+static const uint8_t huff_spec_b62_bits[] = {
+    4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7,
+    6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+    0, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+    6, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 4,
+};
+
+static const uint16_t huff_spec_b62_codes[] = {
+    0x0D, 0x06, 0x1C, 0x14, 0x0A, 0x04, 0x3E, 0x2E, 0x22, 0x0E, 0x06, 0x00, 0x5A, 0x4E, 0x40, 0x20,
+    0x30, 0x32, 0x24, 0x12, 0x0C, 0x02, 0x78, 0x58, 0x42, 0x22, 0x0A, 0x08, 0xF6, 0xF4, 0x9A, 0x98,
+    0x00, 0x99, 0x9B, 0xF5, 0xF7, 0x09, 0x0B, 0x23, 0x43, 0x59, 0x79, 0x03, 0x0D, 0x13, 0x25, 0x33,
+    0x31, 0x21, 0x41, 0x4F, 0x5B, 0x01, 0x07, 0x0F, 0x23, 0x2F, 0x3F, 0x05, 0x0B, 0x15, 0x1D, 0x07,
+};
+
+static const uint8_t huff_spec_b63_bits[] = {
+    3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8,
+    6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
+    0, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6,
+    6, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4,
+};
+
+static const uint16_t huff_spec_b63_codes[] = {
+    0x006, 0x00E, 0x004, 0x014, 0x010, 0x006, 0x000, 0x026, 0x01C, 0x018, 0x004, 0x05C,
+    0x04A, 0x03C, 0x016, 0x0BC, 0x006, 0x008, 0x058, 0x03E, 0x036, 0x014, 0x0B6, 0x0B4,
+    0x090, 0x068, 0x17E, 0x17C, 0x126, 0x124, 0x0D6, 0x0D4, 0x000, 0x0D5, 0x0D7, 0x125,
+    0x127, 0x17D, 0x17F, 0x069, 0x091, 0x0B5, 0x0B7, 0x015, 0x037, 0x03F, 0x059, 0x009,
+    0x007, 0x0BD, 0x017, 0x03D, 0x04B, 0x05D, 0x005, 0x019, 0x01D, 0x027, 0x001, 0x007,
+    0x011, 0x015, 0x005, 0x00F,
+};
+
+static const uint8_t huff_spec_b64_bits[] = {
+    3,  3,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  7,  8,
+    7,  7,  7,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
+    0, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  7,  7,
+    7,  8,  7,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  4,  3,
+};
+
+static const uint16_t huff_spec_b64_codes[] = {
+    0x007, 0x000, 0x008, 0x01A, 0x014, 0x00C, 0x032, 0x02E, 0x01E, 0x014, 0x062, 0x05A,
+    0x03A, 0x026, 0x020, 0x0B2, 0x038, 0x02C, 0x022, 0x0C0, 0x05E, 0x04A, 0x186, 0x184,
+    0x160, 0x0BA, 0x092, 0x090, 0x2C6, 0x2C4, 0x172, 0x170, 0x000, 0x171, 0x173, 0x2C5,
+    0x2C7, 0x091, 0x093, 0x0BB, 0x161, 0x185, 0x187, 0x04B, 0x05F, 0x0C1, 0x023, 0x02D,
+    0x039, 0x0B3, 0x021, 0x027, 0x03B, 0x05B, 0x063, 0x015, 0x01F, 0x02F, 0x033, 0x00D,
+    0x015, 0x01B, 0x009, 0x001,
+};
+
+static const uint8_t huff_spec_b72_bits[] = {
+    5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5,
+};
+
+static const uint16_t huff_spec_b72_codes[] = {
+    0x01E, 0x016, 0x00C, 0x000, 0x038, 0x032, 0x028, 0x022, 0x01C, 0x012, 0x00E, 0x006,
+    0x076, 0x06C, 0x060, 0x04E, 0x03E, 0x02A, 0x022, 0x01A, 0x012, 0x00A, 0x0FC, 0x0DC,
+    0x0C6, 0x0A8, 0x094, 0x086, 0x058, 0x042, 0x040, 0x02A, 0x068, 0x07C, 0x06A, 0x056,
+    0x048, 0x040, 0x02E, 0x028, 0x016, 0x010, 0x008, 0x0EA, 0x0DE, 0x0AA, 0x09A, 0x096,
+    0x07A, 0x078, 0x05A, 0x032, 0x030, 0x028, 0x1FE, 0x1FC, 0x1D2, 0x1D0, 0x18A, 0x188,
+    0x132, 0x130, 0x10A, 0x108, 0x000, 0x109, 0x10B, 0x131, 0x133, 0x189, 0x18B, 0x1D1,
+    0x1D3, 0x1FD, 0x1FF, 0x029, 0x031, 0x033, 0x05B, 0x079, 0x07B, 0x097, 0x09B, 0x0AB,
+    0x0DF, 0x0EB, 0x009, 0x011, 0x017, 0x029, 0x02F, 0x041, 0x049, 0x057, 0x06B, 0x07D,
+    0x069, 0x02B, 0x041, 0x043, 0x059, 0x087, 0x095, 0x0A9, 0x0C7, 0x0DD, 0x0FD, 0x00B,
+    0x013, 0x01B, 0x023, 0x02B, 0x03F, 0x04F, 0x061, 0x06D, 0x077, 0x007, 0x00F, 0x013,
+    0x01D, 0x023, 0x029, 0x033, 0x039, 0x001, 0x00D, 0x017,
+};
+
+static const uint8_t huff_spec_b73_bits[] = {
+    3,  4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,
+    8,  7,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,
+    9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,
+    9,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  7,
+    8,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,
+    7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  5,  5,  5,  4,
+};
+
+static const uint16_t huff_spec_b73_codes[] = {
+    0x000, 0x006, 0x018, 0x010, 0x004, 0x03A, 0x034, 0x02A, 0x026, 0x014, 0x010, 0x07E,
+    0x072, 0x06E, 0x05C, 0x052, 0x04A, 0x02C, 0x024, 0x018, 0x0F4, 0x0E0, 0x0DA, 0x0B6,
+    0x0B2, 0x0A0, 0x05E, 0x04E, 0x038, 0x034, 0x1E6, 0x1B2, 0x0FA, 0x01E, 0x0F8, 0x0F0,
+    0x0BE, 0x0B4, 0x0A2, 0x090, 0x04C, 0x03A, 0x1EE, 0x1E4, 0x1C6, 0x1B0, 0x178, 0x162,
+    0x126, 0x124, 0x0B8, 0x06C, 0x3DA, 0x3D8, 0x38A, 0x388, 0x2F6, 0x2F4, 0x2C2, 0x2C0,
+    0x176, 0x174, 0x0DC, 0x0DE, 0x000, 0x0DF, 0x0DD, 0x175, 0x177, 0x2C1, 0x2C3, 0x2F5,
+    0x2F7, 0x389, 0x38B, 0x3D9, 0x3DB, 0x06D, 0x0B9, 0x125, 0x127, 0x163, 0x179, 0x1B1,
+    0x1C7, 0x1E5, 0x1EF, 0x03B, 0x04D, 0x091, 0x0A3, 0x0B5, 0x0BF, 0x0F1, 0x0F9, 0x01F,
+    0x0FB, 0x1B3, 0x1E7, 0x035, 0x039, 0x04F, 0x05F, 0x0A1, 0x0B3, 0x0B7, 0x0DB, 0x0E1,
+    0x0F5, 0x019, 0x025, 0x02D, 0x04B, 0x053, 0x05D, 0x06F, 0x073, 0x07F, 0x011, 0x015,
+    0x027, 0x02B, 0x035, 0x03B, 0x005, 0x011, 0x019, 0x007,
+};
+
+static const uint8_t huff_spec_b74_bits[] = {
+    3,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+    7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,
+    8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,
+    8,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,
+    7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,
+};
+
+static const uint16_t huff_spec_b74_codes[] = {
+    0x001, 0x008, 0x01E, 0x018, 0x00C, 0x002, 0x03A, 0x034, 0x02C, 0x01E, 0x016, 0x012,
+    0x072, 0x06E, 0x05E, 0x056, 0x050, 0x038, 0x022, 0x004, 0x0E2, 0x0DA, 0x0BA, 0x0A8,
+    0x076, 0x054, 0x050, 0x002, 0x000, 0x1C0, 0x1B0, 0x156, 0x0A4, 0x0A6, 0x074, 0x052,
+    0x004, 0x1C2, 0x1B2, 0x170, 0x154, 0x0AE, 0x0AC, 0x086, 0x2E6, 0x2E4, 0x10A, 0x108,
+    0x106, 0x104, 0x102, 0x100, 0x03E, 0x03A, 0x03C, 0x038, 0x036, 0x034, 0x032, 0x030,
+    0x01E, 0x01A, 0x01C, 0x018, 0x000, 0x019, 0x01D, 0x01B, 0x01F, 0x031, 0x033, 0x035,
+    0x037, 0x039, 0x03D, 0x03B, 0x03F, 0x101, 0x103, 0x105, 0x107, 0x109, 0x10B, 0x2E5,
+    0x2E7, 0x087, 0x0AD, 0x0AF, 0x155, 0x171, 0x1B3, 0x1C3, 0x005, 0x053, 0x075, 0x0A7,
+    0x0A5, 0x157, 0x1B1, 0x1C1, 0x001, 0x003, 0x051, 0x055, 0x077, 0x0A9, 0x0BB, 0x0DB,
+    0x0E3, 0x005, 0x023, 0x039, 0x051, 0x057, 0x05F, 0x06F, 0x073, 0x013, 0x017, 0x01F,
+    0x02D, 0x035, 0x03B, 0x003, 0x00D, 0x019, 0x01F, 0x009,
+};
+
+static const HuffmanCodebook at9_huffman_coeffs[][8][4] = {
+    {
+        { { 0 } },
+        { { 0 } },
+        {
+            { huff_spec_a21_bits, huff_spec_a21_codes,  16,   2,   1,   2,   3, },
+            { huff_spec_a22_bits, huff_spec_a22_codes, 256,   4,   2,   2,   8, },
+            { huff_spec_a23_bits, huff_spec_a23_codes, 256,   4,   2,   2,   9, },
+            { huff_spec_a24_bits, huff_spec_a24_codes, 256,   4,   2,   2,  10, },
+        },
+        {
+            { huff_spec_a31_bits, huff_spec_a31_codes,  64,   2,   1,   3,   7, },
+            { huff_spec_a32_bits, huff_spec_a32_codes,  64,   2,   1,   3,   7, },
+            { huff_spec_a33_bits, huff_spec_a33_codes,  64,   2,   1,   3,   8, },
+            { huff_spec_a34_bits, huff_spec_a34_codes,  64,   2,   1,   3,  10, },
+        },
+        {
+            { huff_spec_a41_bits, huff_spec_a41_codes, 256,   2,   1,   4,   9, },
+            { huff_spec_a42_bits, huff_spec_a42_codes, 256,   2,   1,   4,  10, },
+            { huff_spec_a43_bits, huff_spec_a43_codes, 256,   2,   1,   4,  10, },
+            { huff_spec_a44_bits, huff_spec_a44_codes, 256,   2,   1,   4,  10, },
+        },
+        {
+            { huff_spec_a51_bits, huff_spec_a51_codes,  32,   1,   0,   5,   6, },
+            { huff_spec_a52_bits, huff_spec_a52_codes,  32,   1,   0,   5,   6, },
+            { huff_spec_a53_bits, huff_spec_a53_codes,  32,   1,   0,   5,   7, },
+            { huff_spec_a54_bits, huff_spec_a54_codes,  32,   1,   0,   5,   8, },
+        },
+        {
+            { huff_spec_a61_bits, huff_spec_a61_codes,  64,   1,   0,   6,   7, },
+            { huff_spec_a62_bits, huff_spec_a62_codes,  64,   1,   0,   6,   7, },
+            { huff_spec_a63_bits, huff_spec_a63_codes,  64,   1,   0,   6,   8, },
+            { huff_spec_a64_bits, huff_spec_a64_codes,  64,   1,   0,   6,   9, },
+        },
+        {
+            { huff_spec_a71_bits, huff_spec_a71_codes, 128,   1,   0,   7,   8, },
+            { huff_spec_a72_bits, huff_spec_a72_codes, 128,   1,   0,   7,   8, },
+            { huff_spec_a73_bits, huff_spec_a73_codes, 128,   1,   0,   7,   9, },
+            { huff_spec_a74_bits, huff_spec_a74_codes, 128,   1,   0,   7,  10, },
+        },
+    },
+    {
+        { { 0 } },
+        { { 0 } },
+        {
+            { 0 },
+            { huff_spec_b22_bits, huff_spec_b22_codes,  256,  4,   2,   2,  10, },
+            { huff_spec_b23_bits, huff_spec_b23_codes,  256,  4,   2,   2,  10, },
+            { huff_spec_b24_bits, huff_spec_b24_codes,  256,  4,   2,   2,  10, },
+        },
+        {
+            { 0 },
+            { huff_spec_b32_bits, huff_spec_b32_codes,  64,   2,   1,   3,   9, },
+            { huff_spec_b33_bits, huff_spec_b33_codes,  64,   2,   1,   3,  10, },
+            { huff_spec_b34_bits, huff_spec_b34_codes,  64,   2,   1,   3,  10, },
+        },
+        {
+            { 0 },
+            { huff_spec_b42_bits, huff_spec_b42_codes, 256,   2,   1,   4,  10, },
+            { huff_spec_b43_bits, huff_spec_b43_codes, 256,   2,   1,   4,  10, },
+            { huff_spec_b44_bits, huff_spec_b44_codes, 256,   2,   1,   4,  10, },
+        },
+        {
+            { 0 },
+            { huff_spec_b52_bits, huff_spec_b52_codes,  32,   1,   0,   5,   7, },
+            { huff_spec_b53_bits, huff_spec_b53_codes,  32,   1,   0,   5,   8, },
+            { huff_spec_b54_bits, huff_spec_b54_codes,  32,   1,   0,   5,   9, },
+        },
+        {
+            { 0 },
+            { huff_spec_b62_bits, huff_spec_b62_codes,  64,   1,   0,   6,   8, },
+            { huff_spec_b63_bits, huff_spec_b63_codes,  64,   1,   0,   6,   9, },
+            { huff_spec_b64_bits, huff_spec_b64_codes,  64,   1,   0,   6,  10, },
+        },
+        {
+            { 0 },
+            { huff_spec_b72_bits, huff_spec_b72_codes, 128,   1,   0,   7,   9, },
+            { huff_spec_b73_bits, huff_spec_b73_codes, 128,   1,   0,   7,  10, },
+            { huff_spec_b74_bits, huff_spec_b74_codes, 128,   1,   0,   7,  10, },
+        },
+    },
+};
+
+#endif /* AVCODEC_ATRAC9TAB_H */
diff --git a/libavcodec/audio_frame_queue.c b/libavcodec/audio_frame_queue.c
index c4ca02b..f2ccd69 100644
--- a/libavcodec/audio_frame_queue.c
+++ b/libavcodec/audio_frame_queue.c
@@ -2,110 +2,72 @@
  * Audio Frame Queue
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
-#include "libavutil/mathematics.h"
-#include "internal.h"
 #include "audio_frame_queue.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
 
 av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
 {
-    afq->avctx             = avctx;
-    afq->next_pts          = AV_NOPTS_VALUE;
+    afq->avctx = avctx;
     afq->remaining_delay   = avctx->initial_padding;
     afq->remaining_samples = avctx->initial_padding;
-    afq->frame_queue       = NULL;
-}
-
-static void delete_next_frame(AudioFrameQueue *afq)
-{
-    AudioFrame *f = afq->frame_queue;
-    if (f) {
-        afq->frame_queue = f->next;
-        f->next = NULL;
-        av_freep(&f);
-    }
+    afq->frame_count       = 0;
 }
 
 void ff_af_queue_close(AudioFrameQueue *afq)
 {
-    /* remove/free any remaining frames */
-    while (afq->frame_queue)
-        delete_next_frame(afq);
+    if(afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "%d frames left in the queue on closing\n", afq->frame_count);
+    av_freep(&afq->frames);
     memset(afq, 0, sizeof(*afq));
 }
 
-#ifdef DEBUG
-static void af_queue_log_state(AudioFrameQueue *afq)
-{
-    AudioFrame *f;
-    ff_dlog(afq->avctx, "remaining delay   = %d\n", afq->remaining_delay);
-    ff_dlog(afq->avctx, "remaining samples = %d\n", afq->remaining_samples);
-    ff_dlog(afq->avctx, "frames:\n");
-    f = afq->frame_queue;
-    while (f) {
-        ff_dlog(afq->avctx, "  [ pts=%9"PRId64" duration=%d ]\n",
-                f->pts, f->duration);
-        f = f->next;
-    }
-}
-#endif /* DEBUG */
-
 int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f)
 {
-    AudioFrame *new_frame;
-    AudioFrame *queue_end = afq->frame_queue;
-
-    /* find the end of the queue */
-    while (queue_end && queue_end->next)
-        queue_end = queue_end->next;
-
-    /* allocate new frame queue entry */
-    if (!(new_frame = av_malloc(sizeof(*new_frame))))
+    AudioFrame *new = av_fast_realloc(afq->frames, &afq->frame_alloc, sizeof(*afq->frames)*(afq->frame_count+1));
+    if(!new)
         return AVERROR(ENOMEM);
+    afq->frames = new;
+    new += afq->frame_count;
 
     /* get frame parameters */
-    new_frame->next = NULL;
-    new_frame->duration = f->nb_samples;
+    new->duration = f->nb_samples;
+    new->duration += afq->remaining_delay;
     if (f->pts != AV_NOPTS_VALUE) {
-        new_frame->pts = av_rescale_q(f->pts,
+        new->pts = av_rescale_q(f->pts,
                                       afq->avctx->time_base,
                                       (AVRational){ 1, afq->avctx->sample_rate });
-        afq->next_pts = new_frame->pts + new_frame->duration;
+        new->pts -= afq->remaining_delay;
+        if(afq->frame_count && new[-1].pts >= new->pts)
+            av_log(afq->avctx, AV_LOG_WARNING, "Queue input is backward in time\n");
     } else {
-        new_frame->pts = AV_NOPTS_VALUE;
-        afq->next_pts  = AV_NOPTS_VALUE;
+        new->pts = AV_NOPTS_VALUE;
     }
-
-    /* add new frame to the end of the queue */
-    if (!queue_end)
-        afq->frame_queue = new_frame;
-    else
-        queue_end->next = new_frame;
+    afq->remaining_delay = 0;
 
     /* add frame sample count */
     afq->remaining_samples += f->nb_samples;
 
-#ifdef DEBUG
-    af_queue_log_state(afq);
-#endif
+    afq->frame_count++;
 
     return 0;
 }
@@ -115,50 +77,37 @@ void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
 {
     int64_t out_pts = AV_NOPTS_VALUE;
     int removed_samples = 0;
+    int i;
 
-#ifdef DEBUG
-    af_queue_log_state(afq);
-#endif
-
-    /* get output pts from the next frame or generated pts */
-    if (afq->frame_queue) {
-        if (afq->frame_queue->pts != AV_NOPTS_VALUE)
-            out_pts = afq->frame_queue->pts - afq->remaining_delay;
-    } else {
-        if (afq->next_pts != AV_NOPTS_VALUE)
-            out_pts = afq->next_pts - afq->remaining_delay;
+    if (afq->frame_count || afq->frame_alloc) {
+        if (afq->frames->pts != AV_NOPTS_VALUE)
+            out_pts = afq->frames->pts;
     }
-    if (pts) {
-        if (out_pts != AV_NOPTS_VALUE)
-            *pts = ff_samples_to_time_base(afq->avctx, out_pts);
-        else
-            *pts = AV_NOPTS_VALUE;
-    }
-
-    /* if the delay is larger than the packet duration, we use up delay samples
-       for the output packet and leave all frames in the queue */
-    if (afq->remaining_delay >= nb_samples) {
-        removed_samples      += nb_samples;
-        afq->remaining_delay -= nb_samples;
-    }
-    /* remove frames from the queue until we have enough to cover the
-       requested number of samples or until the queue is empty */
-    while (removed_samples < nb_samples && afq->frame_queue) {
-        removed_samples += afq->frame_queue->duration;
-        delete_next_frame(afq);
+    if(!afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples);
+    if (pts)
+        *pts = ff_samples_to_time_base(afq->avctx, out_pts);
+
+    for(i=0; nb_samples && i<afq->frame_count; i++){
+        int n= FFMIN(afq->frames[i].duration, nb_samples);
+        afq->frames[i].duration -= n;
+        nb_samples              -= n;
+        removed_samples         += n;
+        if(afq->frames[i].pts != AV_NOPTS_VALUE)
+            afq->frames[i].pts      += n;
     }
     afq->remaining_samples -= removed_samples;
-
-    /* if there are no frames left and we have room for more samples, use
-       any remaining delay samples */
-    if (removed_samples < nb_samples && afq->remaining_samples > 0) {
-        int add_samples = FFMIN(afq->remaining_samples,
-                                nb_samples - removed_samples);
-        removed_samples        += add_samples;
-        afq->remaining_samples -= add_samples;
+    i -= i && afq->frames[i-1].duration;
+    memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i));
+    afq->frame_count -= i;
+
+    if(nb_samples){
+        av_assert0(!afq->frame_count);
+        av_assert0(afq->remaining_samples == afq->remaining_delay);
+        if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE)
+            afq->frames[0].pts += nb_samples;
+        av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples);
     }
-    if (removed_samples > nb_samples)
-        av_log(afq->avctx, AV_LOG_WARNING, "frame_size is too large\n");
     if (duration)
         *duration = ff_samples_to_time_base(afq->avctx, removed_samples);
 }
diff --git a/libavcodec/audio_frame_queue.h b/libavcodec/audio_frame_queue.h
index 1250ec2..d8076ea 100644
--- a/libavcodec/audio_frame_queue.h
+++ b/libavcodec/audio_frame_queue.h
@@ -2,20 +2,20 @@
  * Audio Frame Queue
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,15 @@
 typedef struct AudioFrame {
     int64_t pts;
     int duration;
-    struct AudioFrame *next;
 } AudioFrame;
 
 typedef struct AudioFrameQueue {
     AVCodecContext *avctx;
-    int64_t next_pts;
     int remaining_delay;
     int remaining_samples;
-    AudioFrame *frame_queue;
+    AudioFrame *frames;
+    unsigned frame_count;
+    unsigned frame_alloc;
 } AudioFrameQueue;
 
 /**
diff --git a/libavcodec/audiodsp.c b/libavcodec/audiodsp.c
index 776cd11..3c7a3a7 100644
--- a/libavcodec/audiodsp.c
+++ b/libavcodec/audiodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/audiodsp.h b/libavcodec/audiodsp.h
index 2b4f9d4..aa6fa78 100644
--- a/libavcodec/audiodsp.h
+++ b/libavcodec/audiodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/audiotoolboxdec.c b/libavcodec/audiotoolboxdec.c
new file mode 100644
index 0000000..5c0a9de
--- /dev/null
+++ b/libavcodec/audiotoolboxdec.c
@@ -0,0 +1,617 @@
+/*
+ * Audio Toolbox system codecs
+ *
+ * copyright (c) 2016 Rodger Combs
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#include "config.h"
+#include "avcodec.h"
+#include "ac3_parser_internal.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "mpegaudiodecheader.h"
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+#include "libavutil/log.h"
+
+#if __MAC_OS_X_VERSION_MIN_REQUIRED < 101100
+#define kAudioFormatEnhancedAC3 'ec-3'
+#endif
+
+typedef struct ATDecodeContext {
+    AVClass *av_class;
+
+    AudioConverterRef converter;
+    AudioStreamPacketDescription pkt_desc;
+    AVPacket in_pkt;
+    AVPacket new_in_pkt;
+    char *decoded_data;
+    int channel_map[64];
+
+    uint8_t *extradata;
+    int extradata_size;
+
+    int64_t last_pts;
+    int eof;
+} ATDecodeContext;
+
+static UInt32 ffat_get_format_id(enum AVCodecID codec, int profile)
+{
+    switch (codec) {
+    case AV_CODEC_ID_AAC:
+        return kAudioFormatMPEG4AAC;
+    case AV_CODEC_ID_AC3:
+        return kAudioFormatAC3;
+    case AV_CODEC_ID_ADPCM_IMA_QT:
+        return kAudioFormatAppleIMA4;
+    case AV_CODEC_ID_ALAC:
+        return kAudioFormatAppleLossless;
+    case AV_CODEC_ID_AMR_NB:
+        return kAudioFormatAMR;
+    case AV_CODEC_ID_EAC3:
+        return kAudioFormatEnhancedAC3;
+    case AV_CODEC_ID_GSM_MS:
+        return kAudioFormatMicrosoftGSM;
+    case AV_CODEC_ID_ILBC:
+        return kAudioFormatiLBC;
+    case AV_CODEC_ID_MP1:
+        return kAudioFormatMPEGLayer1;
+    case AV_CODEC_ID_MP2:
+        return kAudioFormatMPEGLayer2;
+    case AV_CODEC_ID_MP3:
+        return kAudioFormatMPEGLayer3;
+    case AV_CODEC_ID_PCM_ALAW:
+        return kAudioFormatALaw;
+    case AV_CODEC_ID_PCM_MULAW:
+        return kAudioFormatULaw;
+    case AV_CODEC_ID_QDMC:
+        return kAudioFormatQDesign;
+    case AV_CODEC_ID_QDM2:
+        return kAudioFormatQDesign2;
+    default:
+        av_assert0(!"Invalid codec ID!");
+        return 0;
+    }
+}
+
+static int ffat_get_channel_id(AudioChannelLabel label)
+{
+    if (label == 0)
+        return -1;
+    else if (label <= kAudioChannelLabel_LFEScreen)
+        return label - 1;
+    else if (label <= kAudioChannelLabel_RightSurround)
+        return label + 4;
+    else if (label <= kAudioChannelLabel_CenterSurround)
+        return label + 1;
+    else if (label <= kAudioChannelLabel_RightSurroundDirect)
+        return label + 23;
+    else if (label <= kAudioChannelLabel_TopBackRight)
+        return label - 1;
+    else if (label < kAudioChannelLabel_RearSurroundLeft)
+        return -1;
+    else if (label <= kAudioChannelLabel_RearSurroundRight)
+        return label - 29;
+    else if (label <= kAudioChannelLabel_RightWide)
+        return label - 4;
+    else if (label == kAudioChannelLabel_LFE2)
+        return ff_ctzll(AV_CH_LOW_FREQUENCY_2);
+    else if (label == kAudioChannelLabel_Mono)
+        return ff_ctzll(AV_CH_FRONT_CENTER);
+    else
+        return -1;
+}
+
+static int ffat_compare_channel_descriptions(const void* a, const void* b)
+{
+    const AudioChannelDescription* da = a;
+    const AudioChannelDescription* db = b;
+    return ffat_get_channel_id(da->mChannelLabel) - ffat_get_channel_id(db->mChannelLabel);
+}
+
+static AudioChannelLayout *ffat_convert_layout(AudioChannelLayout *layout, UInt32* size)
+{
+    AudioChannelLayoutTag tag = layout->mChannelLayoutTag;
+    AudioChannelLayout *new_layout;
+    if (tag == kAudioChannelLayoutTag_UseChannelDescriptions)
+        return layout;
+    else if (tag == kAudioChannelLayoutTag_UseChannelBitmap)
+        AudioFormatGetPropertyInfo(kAudioFormatProperty_ChannelLayoutForBitmap,
+                                   sizeof(UInt32), &layout->mChannelBitmap, size);
+    else
+        AudioFormatGetPropertyInfo(kAudioFormatProperty_ChannelLayoutForTag,
+                                   sizeof(AudioChannelLayoutTag), &tag, size);
+    new_layout = av_malloc(*size);
+    if (!new_layout) {
+        av_free(layout);
+        return NULL;
+    }
+    if (tag == kAudioChannelLayoutTag_UseChannelBitmap)
+        AudioFormatGetProperty(kAudioFormatProperty_ChannelLayoutForBitmap,
+                               sizeof(UInt32), &layout->mChannelBitmap, size, new_layout);
+    else
+        AudioFormatGetProperty(kAudioFormatProperty_ChannelLayoutForTag,
+                               sizeof(AudioChannelLayoutTag), &tag, size, new_layout);
+    new_layout->mChannelLayoutTag = kAudioChannelLayoutTag_UseChannelDescriptions;
+    av_free(layout);
+    return new_layout;
+}
+
+static int ffat_update_ctx(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AudioStreamBasicDescription format;
+    UInt32 size = sizeof(format);
+    if (!AudioConverterGetProperty(at->converter,
+                                   kAudioConverterCurrentInputStreamDescription,
+                                   &size, &format)) {
+        if (format.mSampleRate)
+            avctx->sample_rate = format.mSampleRate;
+        avctx->channels = format.mChannelsPerFrame;
+        avctx->channel_layout = av_get_default_channel_layout(avctx->channels);
+        avctx->frame_size = format.mFramesPerPacket;
+    }
+
+    if (!AudioConverterGetProperty(at->converter,
+                                   kAudioConverterCurrentOutputStreamDescription,
+                                   &size, &format)) {
+        format.mSampleRate = avctx->sample_rate;
+        format.mChannelsPerFrame = avctx->channels;
+        AudioConverterSetProperty(at->converter,
+                                  kAudioConverterCurrentOutputStreamDescription,
+                                  size, &format);
+    }
+
+    if (!AudioConverterGetPropertyInfo(at->converter, kAudioConverterOutputChannelLayout,
+                                       &size, NULL) && size) {
+        AudioChannelLayout *layout = av_malloc(size);
+        uint64_t layout_mask = 0;
+        int i;
+        if (!layout)
+            return AVERROR(ENOMEM);
+        AudioConverterGetProperty(at->converter, kAudioConverterOutputChannelLayout,
+                                  &size, layout);
+        if (!(layout = ffat_convert_layout(layout, &size)))
+            return AVERROR(ENOMEM);
+        for (i = 0; i < layout->mNumberChannelDescriptions; i++) {
+            int id = ffat_get_channel_id(layout->mChannelDescriptions[i].mChannelLabel);
+            if (id < 0)
+                goto done;
+            if (layout_mask & (1 << id))
+                goto done;
+            layout_mask |= 1 << id;
+            layout->mChannelDescriptions[i].mChannelFlags = i; // Abusing flags as index
+        }
+        avctx->channel_layout = layout_mask;
+        qsort(layout->mChannelDescriptions, layout->mNumberChannelDescriptions,
+              sizeof(AudioChannelDescription), &ffat_compare_channel_descriptions);
+        for (i = 0; i < layout->mNumberChannelDescriptions; i++)
+            at->channel_map[i] = layout->mChannelDescriptions[i].mChannelFlags;
+done:
+        av_free(layout);
+    }
+
+    if (!avctx->frame_size)
+        avctx->frame_size = 2048;
+
+    return 0;
+}
+
+static void put_descr(PutByteContext *pb, int tag, unsigned int size)
+{
+    int i = 3;
+    bytestream2_put_byte(pb, tag);
+    for (; i > 0; i--)
+        bytestream2_put_byte(pb, (size >> (7 * i)) | 0x80);
+    bytestream2_put_byte(pb, size & 0x7F);
+}
+
+static uint8_t* ffat_get_magic_cookie(AVCodecContext *avctx, UInt32 *cookie_size)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (avctx->codec_id == AV_CODEC_ID_AAC) {
+        char *extradata;
+        PutByteContext pb;
+        *cookie_size = 5 + 3 + 5+13 + 5+at->extradata_size;
+        if (!(extradata = av_malloc(*cookie_size)))
+            return NULL;
+
+        bytestream2_init_writer(&pb, extradata, *cookie_size);
+
+        // ES descriptor
+        put_descr(&pb, 0x03, 3 + 5+13 + 5+at->extradata_size);
+        bytestream2_put_be16(&pb, 0);
+        bytestream2_put_byte(&pb, 0x00); // flags (= no flags)
+
+        // DecoderConfig descriptor
+        put_descr(&pb, 0x04, 13 + 5+at->extradata_size);
+
+        // Object type indication
+        bytestream2_put_byte(&pb, 0x40);
+
+        bytestream2_put_byte(&pb, 0x15); // flags (= Audiostream)
+
+        bytestream2_put_be24(&pb, 0); // Buffersize DB
+
+        bytestream2_put_be32(&pb, 0); // maxbitrate
+        bytestream2_put_be32(&pb, 0); // avgbitrate
+
+        // DecoderSpecific info descriptor
+        put_descr(&pb, 0x05, at->extradata_size);
+        bytestream2_put_buffer(&pb, at->extradata, at->extradata_size);
+        return extradata;
+    } else {
+        *cookie_size = at->extradata_size;
+        return at->extradata;
+    }
+}
+
+static av_cold int ffat_usable_extradata(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    return at->extradata_size &&
+           (avctx->codec_id == AV_CODEC_ID_ALAC ||
+            avctx->codec_id == AV_CODEC_ID_QDM2 ||
+            avctx->codec_id == AV_CODEC_ID_QDMC ||
+            avctx->codec_id == AV_CODEC_ID_AAC);
+}
+
+static int ffat_set_extradata(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (ffat_usable_extradata(avctx)) {
+        OSStatus status;
+        UInt32 cookie_size;
+        uint8_t *cookie = ffat_get_magic_cookie(avctx, &cookie_size);
+        if (!cookie)
+            return AVERROR(ENOMEM);
+
+        status = AudioConverterSetProperty(at->converter,
+                                           kAudioConverterDecompressionMagicCookie,
+                                           cookie_size, cookie);
+        if (status != 0)
+            av_log(avctx, AV_LOG_WARNING, "AudioToolbox cookie error: %i\n", (int)status);
+
+        if (cookie != at->extradata)
+            av_free(cookie);
+    }
+    return 0;
+}
+
+static av_cold int ffat_create_decoder(AVCodecContext *avctx, AVPacket *pkt)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    OSStatus status;
+    int i;
+
+    enum AVSampleFormat sample_fmt = (avctx->bits_per_raw_sample == 32) ?
+                                     AV_SAMPLE_FMT_S32 : AV_SAMPLE_FMT_S16;
+
+    AudioStreamBasicDescription in_format = {
+        .mFormatID = ffat_get_format_id(avctx->codec_id, avctx->profile),
+        .mBytesPerPacket = (avctx->codec_id == AV_CODEC_ID_ILBC) ? avctx->block_align : 0,
+    };
+    AudioStreamBasicDescription out_format = {
+        .mFormatID = kAudioFormatLinearPCM,
+        .mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked,
+        .mFramesPerPacket = 1,
+        .mBitsPerChannel = av_get_bytes_per_sample(sample_fmt) * 8,
+    };
+
+    avctx->sample_fmt = sample_fmt;
+
+    if (ffat_usable_extradata(avctx)) {
+        UInt32 format_size = sizeof(in_format);
+        UInt32 cookie_size;
+        uint8_t *cookie = ffat_get_magic_cookie(avctx, &cookie_size);
+        if (!cookie)
+            return AVERROR(ENOMEM);
+        status = AudioFormatGetProperty(kAudioFormatProperty_FormatInfo,
+                                        cookie_size, cookie, &format_size, &in_format);
+        if (cookie != at->extradata)
+            av_free(cookie);
+        if (status != 0) {
+            av_log(avctx, AV_LOG_ERROR, "AudioToolbox header-parse error: %i\n", (int)status);
+            return AVERROR_UNKNOWN;
+        }
+#if CONFIG_MP1_AT_DECODER || CONFIG_MP2_AT_DECODER || CONFIG_MP3_AT_DECODER
+    } else if (pkt && pkt->size >= 4 &&
+               (avctx->codec_id == AV_CODEC_ID_MP1 ||
+                avctx->codec_id == AV_CODEC_ID_MP2 ||
+                avctx->codec_id == AV_CODEC_ID_MP3)) {
+        enum AVCodecID codec_id;
+        int bit_rate;
+        if (ff_mpa_decode_header(AV_RB32(pkt->data), &avctx->sample_rate,
+                                 &in_format.mChannelsPerFrame, &avctx->frame_size,
+                                 &bit_rate, &codec_id) < 0)
+            return AVERROR_INVALIDDATA;
+        avctx->bit_rate = bit_rate;
+        in_format.mSampleRate = avctx->sample_rate;
+#endif
+#if CONFIG_AC3_AT_DECODER || CONFIG_EAC3_AT_DECODER
+    } else if (pkt && pkt->size >= 7 &&
+               (avctx->codec_id == AV_CODEC_ID_AC3 ||
+                avctx->codec_id == AV_CODEC_ID_EAC3)) {
+        AC3HeaderInfo hdr;
+        GetBitContext gbc;
+        init_get_bits(&gbc, pkt->data, pkt->size);
+        if (ff_ac3_parse_header(&gbc, &hdr) < 0)
+            return AVERROR_INVALIDDATA;
+        in_format.mSampleRate = hdr.sample_rate;
+        in_format.mChannelsPerFrame = hdr.channels;
+        avctx->frame_size = hdr.num_blocks * 256;
+        avctx->bit_rate = hdr.bit_rate;
+#endif
+    } else {
+        in_format.mSampleRate = avctx->sample_rate ? avctx->sample_rate : 44100;
+        in_format.mChannelsPerFrame = avctx->channels ? avctx->channels : 1;
+    }
+
+    avctx->sample_rate = out_format.mSampleRate = in_format.mSampleRate;
+    avctx->channels = out_format.mChannelsPerFrame = in_format.mChannelsPerFrame;
+
+    if (avctx->codec_id == AV_CODEC_ID_ADPCM_IMA_QT)
+        in_format.mFramesPerPacket = 64;
+
+    status = AudioConverterNew(&in_format, &out_format, &at->converter);
+
+    if (status != 0) {
+        av_log(avctx, AV_LOG_ERROR, "AudioToolbox init error: %i\n", (int)status);
+        return AVERROR_UNKNOWN;
+    }
+
+    if ((status = ffat_set_extradata(avctx)) < 0)
+        return status;
+
+    for (i = 0; i < (sizeof(at->channel_map) / sizeof(at->channel_map[0])); i++)
+        at->channel_map[i] = i;
+
+    ffat_update_ctx(avctx);
+
+    if(!(at->decoded_data = av_malloc(av_get_bytes_per_sample(avctx->sample_fmt)
+                                      * avctx->frame_size * avctx->channels)))
+        return AVERROR(ENOMEM);
+
+    at->last_pts = AV_NOPTS_VALUE;
+
+    return 0;
+}
+
+static av_cold int ffat_init_decoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (avctx->extradata_size) {
+        at->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!at->extradata)
+            return AVERROR(ENOMEM);
+        at->extradata_size = avctx->extradata_size;
+        memcpy(at->extradata, avctx->extradata, avctx->extradata_size);
+    }
+
+    if ((avctx->channels && avctx->sample_rate) || ffat_usable_extradata(avctx))
+        return ffat_create_decoder(avctx, NULL);
+    else
+        return 0;
+}
+
+static OSStatus ffat_decode_callback(AudioConverterRef converter, UInt32 *nb_packets,
+                                     AudioBufferList *data,
+                                     AudioStreamPacketDescription **packets,
+                                     void *inctx)
+{
+    AVCodecContext *avctx = inctx;
+    ATDecodeContext *at = avctx->priv_data;
+
+    if (at->eof) {
+        *nb_packets = 0;
+        if (packets) {
+            *packets = &at->pkt_desc;
+            at->pkt_desc.mDataByteSize = 0;
+        }
+        return 0;
+    }
+
+    av_packet_unref(&at->in_pkt);
+    av_packet_move_ref(&at->in_pkt, &at->new_in_pkt);
+
+    if (!at->in_pkt.data) {
+        *nb_packets = 0;
+        return 1;
+    }
+
+    data->mNumberBuffers              = 1;
+    data->mBuffers[0].mNumberChannels = 0;
+    data->mBuffers[0].mDataByteSize   = at->in_pkt.size;
+    data->mBuffers[0].mData           = at->in_pkt.data;
+    *nb_packets = 1;
+
+    if (packets) {
+        *packets = &at->pkt_desc;
+        at->pkt_desc.mDataByteSize = at->in_pkt.size;
+    }
+
+    return 0;
+}
+
+#define COPY_SAMPLES(type) \
+    type *in_ptr = (type*)at->decoded_data; \
+    type *end_ptr = in_ptr + frame->nb_samples * avctx->channels; \
+    type *out_ptr = (type*)frame->data[0]; \
+    for (; in_ptr < end_ptr; in_ptr += avctx->channels, out_ptr += avctx->channels) { \
+        int c; \
+        for (c = 0; c < avctx->channels; c++) \
+            out_ptr[c] = in_ptr[at->channel_map[c]]; \
+    }
+
+static void ffat_copy_samples(AVCodecContext *avctx, AVFrame *frame)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S32) {
+        COPY_SAMPLES(int32_t);
+    } else {
+        COPY_SAMPLES(int16_t);
+    }
+}
+
+static int ffat_decode(AVCodecContext *avctx, void *data,
+                       int *got_frame_ptr, AVPacket *avpkt)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AVFrame *frame = data;
+    int pkt_size = avpkt->size;
+    OSStatus ret;
+    AudioBufferList out_buffers;
+
+    if (avctx->codec_id == AV_CODEC_ID_AAC) {
+        if (!at->extradata_size) {
+            uint8_t *side_data;
+            int side_data_size = 0;
+
+            side_data = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
+                                                &side_data_size);
+            if (side_data_size) {
+                at->extradata = av_mallocz(side_data_size + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!at->extradata)
+                    return AVERROR(ENOMEM);
+                at->extradata_size = side_data_size;
+                memcpy(at->extradata, side_data, side_data_size);
+            }
+        }
+    }
+
+    if (!at->converter) {
+        if ((ret = ffat_create_decoder(avctx, avpkt)) < 0) {
+            return ret;
+        }
+    }
+
+    out_buffers = (AudioBufferList){
+        .mNumberBuffers = 1,
+        .mBuffers = {
+            {
+                .mNumberChannels = avctx->channels,
+                .mDataByteSize = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->frame_size
+                                 * avctx->channels,
+            }
+        }
+    };
+
+    av_packet_unref(&at->new_in_pkt);
+
+    if (avpkt->size) {
+        if ((ret = av_packet_ref(&at->new_in_pkt, avpkt)) < 0) {
+            return ret;
+        }
+    } else {
+        at->eof = 1;
+    }
+
+    frame->sample_rate = avctx->sample_rate;
+
+    frame->nb_samples = avctx->frame_size;
+
+    out_buffers.mBuffers[0].mData = at->decoded_data;
+
+    ret = AudioConverterFillComplexBuffer(at->converter, ffat_decode_callback, avctx,
+                                          &frame->nb_samples, &out_buffers, NULL);
+    if ((!ret || ret == 1) && frame->nb_samples) {
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+        ffat_copy_samples(avctx, frame);
+        *got_frame_ptr = 1;
+        if (at->last_pts != AV_NOPTS_VALUE) {
+            frame->pts = at->last_pts;
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+            frame->pkt_pts = at->last_pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+            at->last_pts = avpkt->pts;
+        }
+    } else if (ret && ret != 1) {
+        av_log(avctx, AV_LOG_WARNING, "Decode error: %i\n", ret);
+    } else {
+        at->last_pts = avpkt->pts;
+    }
+
+    return pkt_size;
+}
+
+static av_cold void ffat_decode_flush(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AudioConverterReset(at->converter);
+    av_packet_unref(&at->new_in_pkt);
+    av_packet_unref(&at->in_pkt);
+}
+
+static av_cold int ffat_close_decoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (at->converter)
+        AudioConverterDispose(at->converter);
+    av_packet_unref(&at->new_in_pkt);
+    av_packet_unref(&at->in_pkt);
+    av_freep(&at->decoded_data);
+    av_freep(&at->extradata);
+    return 0;
+}
+
+#define FFAT_DEC_CLASS(NAME) \
+    static const AVClass ffat_##NAME##_dec_class = { \
+        .class_name = "at_" #NAME "_dec", \
+        .version    = LIBAVUTIL_VERSION_INT, \
+    };
+
+#define FFAT_DEC(NAME, ID, bsf_name) \
+    FFAT_DEC_CLASS(NAME) \
+    AVCodec ff_##NAME##_at_decoder = { \
+        .name           = #NAME "_at", \
+        .long_name      = NULL_IF_CONFIG_SMALL(#NAME " (AudioToolbox)"), \
+        .type           = AVMEDIA_TYPE_AUDIO, \
+        .id             = ID, \
+        .priv_data_size = sizeof(ATDecodeContext), \
+        .init           = ffat_init_decoder, \
+        .close          = ffat_close_decoder, \
+        .decode         = ffat_decode, \
+        .flush          = ffat_decode_flush, \
+        .priv_class     = &ffat_##NAME##_dec_class, \
+        .bsfs           = bsf_name, \
+        .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY, \
+        .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP, \
+        .wrapper_name   = "at", \
+    };
+
+FFAT_DEC(aac,          AV_CODEC_ID_AAC, "aac_adtstoasc")
+FFAT_DEC(ac3,          AV_CODEC_ID_AC3, NULL)
+FFAT_DEC(adpcm_ima_qt, AV_CODEC_ID_ADPCM_IMA_QT, NULL)
+FFAT_DEC(alac,         AV_CODEC_ID_ALAC, NULL)
+FFAT_DEC(amr_nb,       AV_CODEC_ID_AMR_NB, NULL)
+FFAT_DEC(eac3,         AV_CODEC_ID_EAC3, NULL)
+FFAT_DEC(gsm_ms,       AV_CODEC_ID_GSM_MS, NULL)
+FFAT_DEC(ilbc,         AV_CODEC_ID_ILBC, NULL)
+FFAT_DEC(mp1,          AV_CODEC_ID_MP1, NULL)
+FFAT_DEC(mp2,          AV_CODEC_ID_MP2, NULL)
+FFAT_DEC(mp3,          AV_CODEC_ID_MP3, NULL)
+FFAT_DEC(pcm_alaw,     AV_CODEC_ID_PCM_ALAW, NULL)
+FFAT_DEC(pcm_mulaw,    AV_CODEC_ID_PCM_MULAW, NULL)
+FFAT_DEC(qdmc,         AV_CODEC_ID_QDMC, NULL)
+FFAT_DEC(qdm2,         AV_CODEC_ID_QDM2, NULL)
diff --git a/libavcodec/audiotoolboxenc.c b/libavcodec/audiotoolboxenc.c
new file mode 100644
index 0000000..2c18916
--- /dev/null
+++ b/libavcodec/audiotoolboxenc.c
@@ -0,0 +1,661 @@
+/*
+ * Audio Toolbox system codecs
+ *
+ * copyright (c) 2016 Rodger Combs
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#define FF_BUFQUEUE_SIZE 256
+#include "libavfilter/bufferqueue.h"
+
+#include "config.h"
+#include "audio_frame_queue.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "libavformat/isom.h"
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+#include "libavutil/log.h"
+
+typedef struct ATDecodeContext {
+    AVClass *av_class;
+    int mode;
+    int quality;
+
+    AudioConverterRef converter;
+    struct FFBufQueue frame_queue;
+    struct FFBufQueue used_frame_queue;
+
+    unsigned pkt_size;
+    AudioFrameQueue afq;
+    int eof;
+    int frame_size;
+
+    AVFrame* encoding_frame;
+} ATDecodeContext;
+
+static UInt32 ffat_get_format_id(enum AVCodecID codec, int profile)
+{
+    switch (codec) {
+    case AV_CODEC_ID_AAC:
+        switch (profile) {
+        case FF_PROFILE_AAC_LOW:
+        default:
+            return kAudioFormatMPEG4AAC;
+        case FF_PROFILE_AAC_HE:
+            return kAudioFormatMPEG4AAC_HE;
+        case FF_PROFILE_AAC_HE_V2:
+            return kAudioFormatMPEG4AAC_HE_V2;
+        case FF_PROFILE_AAC_LD:
+            return kAudioFormatMPEG4AAC_LD;
+        case FF_PROFILE_AAC_ELD:
+            return kAudioFormatMPEG4AAC_ELD;
+        }
+    case AV_CODEC_ID_ADPCM_IMA_QT:
+        return kAudioFormatAppleIMA4;
+    case AV_CODEC_ID_ALAC:
+        return kAudioFormatAppleLossless;
+    case AV_CODEC_ID_ILBC:
+        return kAudioFormatiLBC;
+    case AV_CODEC_ID_PCM_ALAW:
+        return kAudioFormatALaw;
+    case AV_CODEC_ID_PCM_MULAW:
+        return kAudioFormatULaw;
+    default:
+        av_assert0(!"Invalid codec ID!");
+        return 0;
+    }
+}
+
+static void ffat_update_ctx(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    UInt32 size = sizeof(unsigned);
+    AudioConverterPrimeInfo prime_info;
+    AudioStreamBasicDescription out_format;
+
+    AudioConverterGetProperty(at->converter,
+                              kAudioConverterPropertyMaximumOutputPacketSize,
+                              &size, &at->pkt_size);
+
+    if (at->pkt_size <= 0)
+        at->pkt_size = 1024 * 50;
+
+    size = sizeof(prime_info);
+
+    if (!AudioConverterGetProperty(at->converter,
+                                   kAudioConverterPrimeInfo,
+                                   &size, &prime_info)) {
+        avctx->initial_padding = prime_info.leadingFrames;
+    }
+
+    size = sizeof(out_format);
+    if (!AudioConverterGetProperty(at->converter,
+                                   kAudioConverterCurrentOutputStreamDescription,
+                                   &size, &out_format)) {
+        if (out_format.mFramesPerPacket)
+            avctx->frame_size = out_format.mFramesPerPacket;
+        if (out_format.mBytesPerPacket && avctx->codec_id == AV_CODEC_ID_ILBC)
+            avctx->block_align = out_format.mBytesPerPacket;
+    }
+
+    at->frame_size = avctx->frame_size;
+    if (avctx->codec_id == AV_CODEC_ID_PCM_MULAW ||
+        avctx->codec_id == AV_CODEC_ID_PCM_ALAW) {
+        at->pkt_size *= 1024;
+        avctx->frame_size *= 1024;
+    }
+}
+
+static int read_descr(GetByteContext *gb, int *tag)
+{
+    int len = 0;
+    int count = 4;
+    *tag = bytestream2_get_byte(gb);
+    while (count--) {
+        int c = bytestream2_get_byte(gb);
+        len = (len << 7) | (c & 0x7f);
+        if (!(c & 0x80))
+            break;
+    }
+    return len;
+}
+
+static int get_ilbc_mode(AVCodecContext *avctx)
+{
+    if (avctx->block_align == 38)
+        return 20;
+    else if (avctx->block_align == 50)
+        return 30;
+    else if (avctx->bit_rate > 0)
+        return avctx->bit_rate <= 14000 ? 30 : 20;
+    else
+        return 30;
+}
+
+static av_cold int get_channel_label(int channel)
+{
+    uint64_t map = 1 << channel;
+    if (map <= AV_CH_LOW_FREQUENCY)
+        return channel + 1;
+    else if (map <= AV_CH_BACK_RIGHT)
+        return channel + 29;
+    else if (map <= AV_CH_BACK_CENTER)
+        return channel - 1;
+    else if (map <= AV_CH_SIDE_RIGHT)
+        return channel - 4;
+    else if (map <= AV_CH_TOP_BACK_RIGHT)
+        return channel + 1;
+    else if (map <= AV_CH_STEREO_RIGHT)
+        return -1;
+    else if (map <= AV_CH_WIDE_RIGHT)
+        return channel + 4;
+    else if (map <= AV_CH_SURROUND_DIRECT_RIGHT)
+        return channel - 23;
+    else if (map == AV_CH_LOW_FREQUENCY_2)
+        return kAudioChannelLabel_LFE2;
+    else
+        return -1;
+}
+
+static int remap_layout(AudioChannelLayout *layout, uint64_t in_layout, int count)
+{
+    int i;
+    int c = 0;
+    layout->mChannelLayoutTag = kAudioChannelLayoutTag_UseChannelDescriptions;
+    layout->mNumberChannelDescriptions = count;
+    for (i = 0; i < count; i++) {
+        int label;
+        while (!(in_layout & (1 << c)) && c < 64)
+            c++;
+        if (c == 64)
+            return AVERROR(EINVAL); // This should never happen
+        label = get_channel_label(c);
+        layout->mChannelDescriptions[i].mChannelLabel = label;
+        if (label < 0)
+            return AVERROR(EINVAL);
+        c++;
+    }
+    return 0;
+}
+
+static int get_aac_tag(uint64_t in_layout)
+{
+    switch (in_layout) {
+    case AV_CH_LAYOUT_MONO:
+        return kAudioChannelLayoutTag_Mono;
+    case AV_CH_LAYOUT_STEREO:
+        return kAudioChannelLayoutTag_Stereo;
+    case AV_CH_LAYOUT_QUAD:
+        return kAudioChannelLayoutTag_AAC_Quadraphonic;
+    case AV_CH_LAYOUT_OCTAGONAL:
+        return kAudioChannelLayoutTag_AAC_Octagonal;
+    case AV_CH_LAYOUT_SURROUND:
+        return kAudioChannelLayoutTag_AAC_3_0;
+    case AV_CH_LAYOUT_4POINT0:
+        return kAudioChannelLayoutTag_AAC_4_0;
+    case AV_CH_LAYOUT_5POINT0:
+        return kAudioChannelLayoutTag_AAC_5_0;
+    case AV_CH_LAYOUT_5POINT1:
+        return kAudioChannelLayoutTag_AAC_5_1;
+    case AV_CH_LAYOUT_6POINT0:
+        return kAudioChannelLayoutTag_AAC_6_0;
+    case AV_CH_LAYOUT_6POINT1:
+        return kAudioChannelLayoutTag_AAC_6_1;
+    case AV_CH_LAYOUT_7POINT0:
+        return kAudioChannelLayoutTag_AAC_7_0;
+    case AV_CH_LAYOUT_7POINT1_WIDE_BACK:
+        return kAudioChannelLayoutTag_AAC_7_1;
+    case AV_CH_LAYOUT_7POINT1:
+        return kAudioChannelLayoutTag_MPEG_7_1_C;
+    default:
+        return 0;
+    }
+}
+
+static av_cold int ffat_init_encoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    OSStatus status;
+
+    AudioStreamBasicDescription in_format = {
+        .mSampleRate = avctx->sample_rate,
+        .mFormatID = kAudioFormatLinearPCM,
+        .mFormatFlags = ((avctx->sample_fmt == AV_SAMPLE_FMT_FLT ||
+                          avctx->sample_fmt == AV_SAMPLE_FMT_DBL) ? kAudioFormatFlagIsFloat
+                        : avctx->sample_fmt == AV_SAMPLE_FMT_U8 ? 0
+                        : kAudioFormatFlagIsSignedInteger)
+                        | kAudioFormatFlagIsPacked,
+        .mBytesPerPacket = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->channels,
+        .mFramesPerPacket = 1,
+        .mBytesPerFrame = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->channels,
+        .mChannelsPerFrame = avctx->channels,
+        .mBitsPerChannel = av_get_bytes_per_sample(avctx->sample_fmt) * 8,
+    };
+    AudioStreamBasicDescription out_format = {
+        .mSampleRate = avctx->sample_rate,
+        .mFormatID = ffat_get_format_id(avctx->codec_id, avctx->profile),
+        .mChannelsPerFrame = in_format.mChannelsPerFrame,
+    };
+    UInt32 layout_size = sizeof(AudioChannelLayout) +
+                         sizeof(AudioChannelDescription) * avctx->channels;
+    AudioChannelLayout *channel_layout = av_malloc(layout_size);
+
+    if (!channel_layout)
+        return AVERROR(ENOMEM);
+
+    if (avctx->codec_id == AV_CODEC_ID_ILBC) {
+        int mode = get_ilbc_mode(avctx);
+        out_format.mFramesPerPacket  = 8000 * mode / 1000;
+        out_format.mBytesPerPacket   = (mode == 20 ? 38 : 50);
+    }
+
+    status = AudioConverterNew(&in_format, &out_format, &at->converter);
+
+    if (status != 0) {
+        av_log(avctx, AV_LOG_ERROR, "AudioToolbox init error: %i\n", (int)status);
+        av_free(channel_layout);
+        return AVERROR_UNKNOWN;
+    }
+
+    if (!avctx->channel_layout)
+        avctx->channel_layout = av_get_default_channel_layout(avctx->channels);
+
+    if ((status = remap_layout(channel_layout, avctx->channel_layout, avctx->channels)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid channel layout\n");
+        av_free(channel_layout);
+        return status;
+    }
+
+    if (AudioConverterSetProperty(at->converter, kAudioConverterInputChannelLayout,
+                                  layout_size, channel_layout)) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported input channel layout\n");
+        av_free(channel_layout);
+        return AVERROR(EINVAL);
+    }
+    if (avctx->codec_id == AV_CODEC_ID_AAC) {
+        int tag = get_aac_tag(avctx->channel_layout);
+        if (tag) {
+            channel_layout->mChannelLayoutTag = tag;
+            channel_layout->mNumberChannelDescriptions = 0;
+        }
+    }
+    if (AudioConverterSetProperty(at->converter, kAudioConverterOutputChannelLayout,
+                                  layout_size, channel_layout)) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported output channel layout\n");
+        av_free(channel_layout);
+        return AVERROR(EINVAL);
+    }
+    av_free(channel_layout);
+
+    if (avctx->bits_per_raw_sample)
+        AudioConverterSetProperty(at->converter,
+                                  kAudioConverterPropertyBitDepthHint,
+                                  sizeof(avctx->bits_per_raw_sample),
+                                  &avctx->bits_per_raw_sample);
+
+#if !TARGET_OS_IPHONE
+    if (at->mode == -1)
+        at->mode = (avctx->flags & AV_CODEC_FLAG_QSCALE) ?
+                   kAudioCodecBitRateControlMode_Variable :
+                   kAudioCodecBitRateControlMode_Constant;
+
+    AudioConverterSetProperty(at->converter, kAudioCodecPropertyBitRateControlMode,
+                              sizeof(at->mode), &at->mode);
+
+    if (at->mode == kAudioCodecBitRateControlMode_Variable) {
+        int q = avctx->global_quality / FF_QP2LAMBDA;
+        if (q < 0 || q > 14) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "VBR quality %d out of range, should be 0-14\n", q);
+            q = av_clip(q, 0, 14);
+        }
+        q = 127 - q * 9;
+        AudioConverterSetProperty(at->converter, kAudioCodecPropertySoundQualityForVBR,
+                                  sizeof(q), &q);
+    } else
+#endif
+    if (avctx->bit_rate > 0) {
+        UInt32 rate = avctx->bit_rate;
+        UInt32 size;
+        status = AudioConverterGetPropertyInfo(at->converter,
+                                               kAudioConverterApplicableEncodeBitRates,
+                                               &size, NULL);
+        if (!status && size) {
+            UInt32 new_rate = rate;
+            int count;
+            int i;
+            AudioValueRange *ranges = av_malloc(size);
+            if (!ranges)
+                return AVERROR(ENOMEM);
+            AudioConverterGetProperty(at->converter,
+                                      kAudioConverterApplicableEncodeBitRates,
+                                      &size, ranges);
+            count = size / sizeof(AudioValueRange);
+            for (i = 0; i < count; i++) {
+                AudioValueRange *range = &ranges[i];
+                if (rate >= range->mMinimum && rate <= range->mMaximum) {
+                    new_rate = rate;
+                    break;
+                } else if (rate > range->mMaximum) {
+                    new_rate = range->mMaximum;
+                } else {
+                    new_rate = range->mMinimum;
+                    break;
+                }
+            }
+            if (new_rate != rate) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Bitrate %u not allowed; changing to %u\n", rate, new_rate);
+                rate = new_rate;
+            }
+            av_free(ranges);
+        }
+        AudioConverterSetProperty(at->converter, kAudioConverterEncodeBitRate,
+                                  sizeof(rate), &rate);
+    }
+
+    at->quality = 96 - at->quality * 32;
+    AudioConverterSetProperty(at->converter, kAudioConverterCodecQuality,
+                              sizeof(at->quality), &at->quality);
+
+    if (!AudioConverterGetPropertyInfo(at->converter, kAudioConverterCompressionMagicCookie,
+                                       &avctx->extradata_size, NULL) &&
+        avctx->extradata_size) {
+        int extradata_size = avctx->extradata_size;
+        uint8_t *extradata;
+        if (!(avctx->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE)))
+            return AVERROR(ENOMEM);
+        if (avctx->codec_id == AV_CODEC_ID_ALAC) {
+            avctx->extradata_size = 0x24;
+            AV_WB32(avctx->extradata,     0x24);
+            AV_WB32(avctx->extradata + 4, MKBETAG('a','l','a','c'));
+            extradata = avctx->extradata + 12;
+            avctx->extradata_size = 0x24;
+        } else {
+            extradata = avctx->extradata;
+        }
+        status = AudioConverterGetProperty(at->converter,
+                                           kAudioConverterCompressionMagicCookie,
+                                           &extradata_size, extradata);
+        if (status != 0) {
+            av_log(avctx, AV_LOG_ERROR, "AudioToolbox cookie error: %i\n", (int)status);
+            return AVERROR_UNKNOWN;
+        } else if (avctx->codec_id == AV_CODEC_ID_AAC) {
+            GetByteContext gb;
+            int tag, len;
+            bytestream2_init(&gb, extradata, extradata_size);
+            do {
+                len = read_descr(&gb, &tag);
+                if (tag == MP4DecConfigDescrTag) {
+                    bytestream2_skip(&gb, 13);
+                    len = read_descr(&gb, &tag);
+                    if (tag == MP4DecSpecificDescrTag) {
+                        len = FFMIN(gb.buffer_end - gb.buffer, len);
+                        memmove(extradata, gb.buffer, len);
+                        avctx->extradata_size = len;
+                        break;
+                    }
+                } else if (tag == MP4ESDescrTag) {
+                    int flags;
+                    bytestream2_skip(&gb, 2);
+                    flags = bytestream2_get_byte(&gb);
+                    if (flags & 0x80) //streamDependenceFlag
+                        bytestream2_skip(&gb, 2);
+                    if (flags & 0x40) //URL_Flag
+                        bytestream2_skip(&gb, bytestream2_get_byte(&gb));
+                    if (flags & 0x20) //OCRstreamFlag
+                        bytestream2_skip(&gb, 2);
+                }
+            } while (bytestream2_get_bytes_left(&gb));
+        } else if (avctx->codec_id != AV_CODEC_ID_ALAC) {
+            avctx->extradata_size = extradata_size;
+        }
+    }
+
+    ffat_update_ctx(avctx);
+
+#if !TARGET_OS_IPHONE && defined(__MAC_10_9)
+    if (at->mode == kAudioCodecBitRateControlMode_Variable && avctx->rc_max_rate) {
+        UInt32 max_size = avctx->rc_max_rate * avctx->frame_size / avctx->sample_rate;
+        if (max_size)
+            AudioConverterSetProperty(at->converter, kAudioCodecPropertyPacketSizeLimitForVBR,
+                                      sizeof(max_size), &max_size);
+    }
+#endif
+
+    ff_af_queue_init(avctx, &at->afq);
+
+    at->encoding_frame = av_frame_alloc();
+    if (!at->encoding_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static OSStatus ffat_encode_callback(AudioConverterRef converter, UInt32 *nb_packets,
+                                     AudioBufferList *data,
+                                     AudioStreamPacketDescription **packets,
+                                     void *inctx)
+{
+    AVCodecContext *avctx = inctx;
+    ATDecodeContext *at = avctx->priv_data;
+    AVFrame *frame;
+    int ret;
+
+    if (!at->frame_queue.available) {
+        if (at->eof) {
+            *nb_packets = 0;
+            return 0;
+        } else {
+            *nb_packets = 0;
+            return 1;
+        }
+    }
+
+    frame = ff_bufqueue_get(&at->frame_queue);
+
+    data->mNumberBuffers              = 1;
+    data->mBuffers[0].mNumberChannels = avctx->channels;
+    data->mBuffers[0].mDataByteSize   = frame->nb_samples *
+                                        av_get_bytes_per_sample(avctx->sample_fmt) *
+                                        avctx->channels;
+    data->mBuffers[0].mData           = frame->data[0];
+    if (*nb_packets > frame->nb_samples)
+        *nb_packets = frame->nb_samples;
+
+    av_frame_unref(at->encoding_frame);
+    ret = av_frame_ref(at->encoding_frame, frame);
+    if (ret < 0) {
+        *nb_packets = 0;
+        return ret;
+    }
+
+    ff_bufqueue_add(avctx, &at->used_frame_queue, frame);
+
+    return 0;
+}
+
+static int ffat_encode(AVCodecContext *avctx, AVPacket *avpkt,
+                       const AVFrame *frame, int *got_packet_ptr)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    OSStatus ret;
+
+    AudioBufferList out_buffers = {
+        .mNumberBuffers = 1,
+        .mBuffers = {
+            {
+                .mNumberChannels = avctx->channels,
+                .mDataByteSize = at->pkt_size,
+            }
+        }
+    };
+    AudioStreamPacketDescription out_pkt_desc = {0};
+
+    if (frame) {
+        AVFrame *in_frame;
+
+        if (ff_bufqueue_is_full(&at->frame_queue)) {
+            /*
+             * The frame queue is significantly larger than needed in practice,
+             * but no clear way to determine the minimum number of samples to
+             * get output from AudioConverterFillComplexBuffer().
+             */
+            av_log(avctx, AV_LOG_ERROR, "Bug: frame queue is too small.\n");
+            return AVERROR_BUG;
+        }
+
+        if ((ret = ff_af_queue_add(&at->afq, frame)) < 0)
+            return ret;
+
+        in_frame = av_frame_clone(frame);
+        if (!in_frame)
+            return AVERROR(ENOMEM);
+
+        ff_bufqueue_add(avctx, &at->frame_queue, in_frame);
+    } else {
+        at->eof = 1;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, at->pkt_size, 0)) < 0)
+        return ret;
+
+
+    out_buffers.mBuffers[0].mData = avpkt->data;
+
+    *got_packet_ptr = avctx->frame_size / at->frame_size;
+
+    ret = AudioConverterFillComplexBuffer(at->converter, ffat_encode_callback, avctx,
+                                          got_packet_ptr, &out_buffers,
+                                          (avctx->frame_size > at->frame_size) ? NULL : &out_pkt_desc);
+
+    ff_bufqueue_discard_all(&at->used_frame_queue);
+
+    if ((!ret || ret == 1) && *got_packet_ptr) {
+        avpkt->size = out_buffers.mBuffers[0].mDataByteSize;
+        ff_af_queue_remove(&at->afq, out_pkt_desc.mVariableFramesInPacket ?
+                                     out_pkt_desc.mVariableFramesInPacket :
+                                     avctx->frame_size,
+                           &avpkt->pts,
+                           &avpkt->duration);
+    } else if (ret && ret != 1) {
+        av_log(avctx, AV_LOG_WARNING, "Encode error: %i\n", ret);
+    }
+
+    return 0;
+}
+
+static av_cold void ffat_encode_flush(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AudioConverterReset(at->converter);
+    ff_bufqueue_discard_all(&at->frame_queue);
+    ff_bufqueue_discard_all(&at->used_frame_queue);
+}
+
+static av_cold int ffat_close_encoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AudioConverterDispose(at->converter);
+    ff_bufqueue_discard_all(&at->frame_queue);
+    ff_bufqueue_discard_all(&at->used_frame_queue);
+    ff_af_queue_close(&at->afq);
+    av_frame_free(&at->encoding_frame);
+    return 0;
+}
+
+static const AVProfile aac_profiles[] = {
+    { FF_PROFILE_AAC_LOW,   "LC"       },
+    { FF_PROFILE_AAC_HE,    "HE-AAC"   },
+    { FF_PROFILE_AAC_HE_V2, "HE-AACv2" },
+    { FF_PROFILE_AAC_LD,    "LD"       },
+    { FF_PROFILE_AAC_ELD,   "ELD"      },
+    { FF_PROFILE_UNKNOWN },
+};
+
+#define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+#if !TARGET_OS_IPHONE
+    {"aac_at_mode", "ratecontrol mode", offsetof(ATDecodeContext, mode), AV_OPT_TYPE_INT, {.i64 = -1}, -1, kAudioCodecBitRateControlMode_Variable, AE, "mode"},
+        {"auto", "VBR if global quality is given; CBR otherwise", 0, AV_OPT_TYPE_CONST, {.i64 = -1}, INT_MIN, INT_MAX, AE, "mode"},
+        {"cbr",  "constant bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_Constant}, INT_MIN, INT_MAX, AE, "mode"},
+        {"abr",  "long-term average bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_LongTermAverage}, INT_MIN, INT_MAX, AE, "mode"},
+        {"cvbr", "constrained variable bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_VariableConstrained}, INT_MIN, INT_MAX, AE, "mode"},
+        {"vbr" , "variable bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_Variable}, INT_MIN, INT_MAX, AE, "mode"},
+#endif
+    {"aac_at_quality", "quality vs speed control", offsetof(ATDecodeContext, quality), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 2, AE},
+    { NULL },
+};
+
+#define FFAT_ENC_CLASS(NAME) \
+    static const AVClass ffat_##NAME##_enc_class = { \
+        .class_name = "at_" #NAME "_enc", \
+        .item_name  = av_default_item_name, \
+        .option     = options, \
+        .version    = LIBAVUTIL_VERSION_INT, \
+    };
+
+#define FFAT_ENC(NAME, ID, PROFILES, ...) \
+    FFAT_ENC_CLASS(NAME) \
+    AVCodec ff_##NAME##_at_encoder = { \
+        .name           = #NAME "_at", \
+        .long_name      = NULL_IF_CONFIG_SMALL(#NAME " (AudioToolbox)"), \
+        .type           = AVMEDIA_TYPE_AUDIO, \
+        .id             = ID, \
+        .priv_data_size = sizeof(ATDecodeContext), \
+        .init           = ffat_init_encoder, \
+        .close          = ffat_close_encoder, \
+        .encode2        = ffat_encode, \
+        .flush          = ffat_encode_flush, \
+        .priv_class     = &ffat_##NAME##_enc_class, \
+        .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY __VA_ARGS__, \
+        .sample_fmts    = (const enum AVSampleFormat[]) { \
+            AV_SAMPLE_FMT_S16, \
+            AV_SAMPLE_FMT_U8,  AV_SAMPLE_FMT_NONE \
+        }, \
+        .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE, \
+        .profiles       = PROFILES, \
+        .wrapper_name   = "at", \
+    };
+
+static const uint64_t aac_at_channel_layouts[] = {
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_4POINT0,
+    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_6POINT0,
+    AV_CH_LAYOUT_6POINT1,
+    AV_CH_LAYOUT_7POINT0,
+    AV_CH_LAYOUT_7POINT1_WIDE_BACK,
+    AV_CH_LAYOUT_QUAD,
+    AV_CH_LAYOUT_OCTAGONAL,
+    0,
+};
+
+FFAT_ENC(aac,          AV_CODEC_ID_AAC,          aac_profiles, , .channel_layouts = aac_at_channel_layouts)
+//FFAT_ENC(adpcm_ima_qt, AV_CODEC_ID_ADPCM_IMA_QT, NULL)
+FFAT_ENC(alac,         AV_CODEC_ID_ALAC,         NULL, | AV_CODEC_CAP_VARIABLE_FRAME_SIZE | AV_CODEC_CAP_LOSSLESS)
+FFAT_ENC(ilbc,         AV_CODEC_ID_ILBC,         NULL)
+FFAT_ENC(pcm_alaw,     AV_CODEC_ID_PCM_ALAW,     NULL)
+FFAT_ENC(pcm_mulaw,    AV_CODEC_ID_PCM_MULAW,    NULL)
diff --git a/libavcodec/aura.c b/libavcodec/aura.c
index 6a03f8f..5ef9316 100644
--- a/libavcodec/aura.c
+++ b/libavcodec/aura.c
@@ -1,20 +1,20 @@
 /*
  * Aura 2 decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,10 +59,8 @@ static int aura_decode_frame(AVCodecContext *avctx,
     /* pixel data starts 48 bytes in, after 3x16-byte tables */
     buf += 48;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     Y = frame->data[0];
     U = frame->data[1];
diff --git a/libavcodec/av1.h b/libavcodec/av1.h
new file mode 100644
index 0000000..f2ec39c
--- /dev/null
+++ b/libavcodec/av1.h
@@ -0,0 +1,130 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AV1 common definitions
+ */
+
+#ifndef AVCODEC_AV1_H
+#define AVCODEC_AV1_H
+
+// OBU types (section 6.2.2).
+typedef enum {
+    // 0 reserved.
+    AV1_OBU_SEQUENCE_HEADER        = 1,
+    AV1_OBU_TEMPORAL_DELIMITER     = 2,
+    AV1_OBU_FRAME_HEADER           = 3,
+    AV1_OBU_TILE_GROUP             = 4,
+    AV1_OBU_METADATA               = 5,
+    AV1_OBU_FRAME                  = 6,
+    AV1_OBU_REDUNDANT_FRAME_HEADER = 7,
+    AV1_OBU_TILE_LIST              = 8,
+    // 9-14 reserved.
+    AV1_OBU_PADDING                = 15,
+} AV1_OBU_Type;
+
+// Metadata types (section 6.7.1).
+enum {
+    AV1_METADATA_TYPE_HDR_CLL     = 1,
+    AV1_METADATA_TYPE_HDR_MDCV    = 2,
+    AV1_METADATA_TYPE_SCALABILITY = 3,
+    AV1_METADATA_TYPE_ITUT_T35    = 4,
+    AV1_METADATA_TYPE_TIMECODE    = 5,
+};
+
+// Frame types (section 6.8.2).
+enum {
+    AV1_FRAME_KEY        = 0,
+    AV1_FRAME_INTER      = 1,
+    AV1_FRAME_INTRA_ONLY = 2,
+    AV1_FRAME_SWITCH     = 3,
+};
+
+// Reference frames (section 6.10.24).
+enum {
+    AV1_REF_FRAME_INTRA   = 0,
+    AV1_REF_FRAME_LAST    = 1,
+    AV1_REF_FRAME_LAST2   = 2,
+    AV1_REF_FRAME_LAST3   = 3,
+    AV1_REF_FRAME_GOLDEN  = 4,
+    AV1_REF_FRAME_BWDREF  = 5,
+    AV1_REF_FRAME_ALTREF2 = 6,
+    AV1_REF_FRAME_ALTREF  = 7,
+};
+
+// Constants (section 3).
+enum {
+    AV1_MAX_OPERATING_POINTS = 32,
+
+    AV1_MAX_SB_SIZE    = 128,
+    AV1_MI_SIZE        = 4,
+
+    AV1_MAX_TILE_WIDTH = 4096,
+    AV1_MAX_TILE_AREA  = 4096 * 2304,
+    AV1_MAX_TILE_ROWS  = 64,
+    AV1_MAX_TILE_COLS  = 64,
+
+    AV1_NUM_REF_FRAMES       = 8,
+    AV1_REFS_PER_FRAME       = 7,
+    AV1_TOTAL_REFS_PER_FRAME = 8,
+    AV1_PRIMARY_REF_NONE     = 7,
+
+    AV1_MAX_SEGMENTS = 8,
+    AV1_SEG_LVL_MAX  = 8,
+
+    AV1_SEG_LVL_ALT_Q      = 0,
+    AV1_SEG_LVL_ALT_LF_Y_V = 1,
+    AV1_SEG_LVL_REF_FRAME  = 5,
+    AV1_SEG_LVL_SKIP       = 6,
+    AV1_SEG_LVL_GLOBAL_MV  = 7,
+
+    AV1_SELECT_SCREEN_CONTENT_TOOLS = 2,
+    AV1_SELECT_INTEGER_MV           = 2,
+
+    AV1_SUPERRES_NUM       = 8,
+    AV1_SUPERRES_DENOM_MIN = 9,
+
+    AV1_INTERPOLATION_FILTER_SWITCHABLE = 4,
+
+    AV1_GM_ABS_ALPHA_BITS       = 12,
+    AV1_GM_ALPHA_PREC_BITS      = 15,
+    AV1_GM_ABS_TRANS_ONLY_BITS  = 9,
+    AV1_GM_TRANS_ONLY_PREC_BITS = 3,
+    AV1_GM_ABS_TRANS_BITS       = 12,
+    AV1_GM_TRANS_PREC_BITS      = 6,
+    AV1_WARPEDMODEL_PREC_BITS   = 16,
+
+    AV1_WARP_MODEL_IDENTITY    = 0,
+    AV1_WARP_MODEL_TRANSLATION = 1,
+    AV1_WARP_MODEL_ROTZOOM     = 2,
+    AV1_WARP_MODEL_AFFINE      = 3,
+};
+
+
+// The main colour configuration information uses the same ISO/IEC 23001-8
+// (H.273) enums as FFmpeg does, so separate definitions are not required.
+
+// Chroma sample position.
+enum {
+    AV1_CSP_UNKNOWN   = 0,
+    AV1_CSP_VERTICAL  = 1, // -> AVCHROMA_LOC_LEFT.
+    AV1_CSP_COLOCATED = 2, // -> AVCHROMA_LOC_TOPLEFT.
+};
+
+#endif /* AVCODEC_AV1_H */
diff --git a/libavcodec/av1_metadata_bsf.c b/libavcodec/av1_metadata_bsf.c
new file mode 100644
index 0000000..2b74b69
--- /dev/null
+++ b/libavcodec/av1_metadata_bsf.c
@@ -0,0 +1,300 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+
+#include "bsf.h"
+#include "cbs.h"
+#include "cbs_av1.h"
+
+enum {
+    PASS,
+    INSERT,
+    REMOVE,
+};
+
+typedef struct AV1MetadataContext {
+    const AVClass *class;
+
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment access_unit;
+
+    int td;
+
+    int color_primaries;
+    int transfer_characteristics;
+    int matrix_coefficients;
+
+    int color_range;
+    int chroma_sample_position;
+
+    AVRational tick_rate;
+    int num_ticks_per_picture;
+} AV1MetadataContext;
+
+
+static int av1_metadata_update_sequence_header(AVBSFContext *bsf,
+                                               AV1RawSequenceHeader *seq)
+{
+    AV1MetadataContext *ctx = bsf->priv_data;
+    AV1RawColorConfig  *clc = &seq->color_config;
+    AV1RawTimingInfo   *tim = &seq->timing_info;
+
+    if (ctx->color_primaries >= 0          ||
+        ctx->transfer_characteristics >= 0 ||
+        ctx->matrix_coefficients >= 0) {
+        if (!clc->color_description_present_flag) {
+            clc->color_description_present_flag = 1;
+            clc->color_primaries          = AVCOL_PRI_UNSPECIFIED;
+            clc->transfer_characteristics = AVCOL_TRC_UNSPECIFIED;
+            clc->matrix_coefficients      = AVCOL_SPC_UNSPECIFIED;
+        }
+
+        if (ctx->color_primaries >= 0)
+            clc->color_primaries = ctx->color_primaries;
+        if (ctx->transfer_characteristics >= 0)
+            clc->transfer_characteristics = ctx->transfer_characteristics;
+        if (ctx->matrix_coefficients >= 0)
+            clc->matrix_coefficients = ctx->matrix_coefficients;
+    }
+
+    if (ctx->color_range >= 0) {
+        if (clc->color_primaries          == AVCOL_PRI_BT709        &&
+            clc->transfer_characteristics == AVCOL_TRC_IEC61966_2_1 &&
+            clc->matrix_coefficients      == AVCOL_SPC_RGB) {
+            av_log(bsf, AV_LOG_WARNING, "Warning: color_range cannot be set "
+                   "on RGB streams encoded in BT.709 sRGB.\n");
+        } else {
+            clc->color_range = ctx->color_range;
+        }
+    }
+
+    if (ctx->chroma_sample_position >= 0) {
+        if (clc->mono_chrome || !clc->subsampling_x || !clc->subsampling_y) {
+            av_log(bsf, AV_LOG_WARNING, "Warning: chroma_sample_position "
+                   "can only be set for 4:2:0 streams.\n");
+        } else {
+            clc->chroma_sample_position = ctx->chroma_sample_position;
+        }
+    }
+
+    if (ctx->tick_rate.num && ctx->tick_rate.den) {
+        int num, den;
+
+        av_reduce(&num, &den, ctx->tick_rate.num, ctx->tick_rate.den,
+                  UINT32_MAX > INT_MAX ? UINT32_MAX : INT_MAX);
+
+        tim->time_scale                = num;
+        tim->num_units_in_display_tick = den;
+        seq->timing_info_present_flag  = 1;
+
+        if (ctx->num_ticks_per_picture > 0) {
+            tim->equal_picture_interval = 1;
+            tim->num_ticks_per_picture_minus_1 =
+                ctx->num_ticks_per_picture - 1;
+        }
+    }
+
+    return 0;
+}
+
+static int av1_metadata_filter(AVBSFContext *bsf, AVPacket *out)
+{
+    AV1MetadataContext *ctx = bsf->priv_data;
+    AVPacket *in = NULL;
+    CodedBitstreamFragment *frag = &ctx->access_unit;
+    AV1RawOBU td, *obu;
+    int err, i;
+
+    err = ff_bsf_get_packet(bsf, &in);
+    if (err < 0)
+        return err;
+
+    err = ff_cbs_read_packet(ctx->cbc, frag, in);
+    if (err < 0) {
+        av_log(bsf, AV_LOG_ERROR, "Failed to read packet.\n");
+        goto fail;
+    }
+
+    for (i = 0; i < frag->nb_units; i++) {
+        if (frag->units[i].type == AV1_OBU_SEQUENCE_HEADER) {
+            obu = frag->units[i].content;
+            err = av1_metadata_update_sequence_header(bsf, &obu->obu.sequence_header);
+            if (err < 0)
+                goto fail;
+        }
+    }
+
+    // If a Temporal Delimiter is present, it must be the first OBU.
+    if (frag->units[0].type == AV1_OBU_TEMPORAL_DELIMITER) {
+        if (ctx->td == REMOVE)
+            ff_cbs_delete_unit(ctx->cbc, frag, 0);
+    } else if (ctx->td == INSERT) {
+        td = (AV1RawOBU) {
+            .header.obu_type = AV1_OBU_TEMPORAL_DELIMITER,
+        };
+
+        err = ff_cbs_insert_unit_content(ctx->cbc, frag, 0, AV1_OBU_TEMPORAL_DELIMITER,
+                                         &td, NULL);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to insert Temporal Delimiter.\n");
+            goto fail;
+        }
+    }
+
+    err = ff_cbs_write_packet(ctx->cbc, out, frag);
+    if (err < 0) {
+        av_log(bsf, AV_LOG_ERROR, "Failed to write packet.\n");
+        goto fail;
+    }
+
+    err = av_packet_copy_props(out, in);
+    if (err < 0)
+        goto fail;
+
+    err = 0;
+fail:
+    ff_cbs_fragment_reset(ctx->cbc, frag);
+
+    if (err < 0)
+        av_packet_unref(out);
+    av_packet_free(&in);
+
+    return err;
+}
+
+static int av1_metadata_init(AVBSFContext *bsf)
+{
+    AV1MetadataContext *ctx = bsf->priv_data;
+    CodedBitstreamFragment *frag = &ctx->access_unit;
+    AV1RawOBU *obu;
+    int err, i;
+
+    err = ff_cbs_init(&ctx->cbc, AV_CODEC_ID_AV1, bsf);
+    if (err < 0)
+        return err;
+
+    if (bsf->par_in->extradata) {
+        err = ff_cbs_read_extradata(ctx->cbc, frag, bsf->par_in);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to read extradata.\n");
+            goto fail;
+        }
+
+        for (i = 0; i < frag->nb_units; i++) {
+            if (frag->units[i].type == AV1_OBU_SEQUENCE_HEADER) {
+                obu = frag->units[i].content;
+                err = av1_metadata_update_sequence_header(bsf, &obu->obu.sequence_header);
+                if (err < 0)
+                    goto fail;
+            }
+        }
+
+        err = ff_cbs_write_extradata(ctx->cbc, bsf->par_out, frag);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to write extradata.\n");
+            goto fail;
+        }
+    }
+
+    err = 0;
+fail:
+    ff_cbs_fragment_reset(ctx->cbc, frag);
+    return err;
+}
+
+static void av1_metadata_close(AVBSFContext *bsf)
+{
+    AV1MetadataContext *ctx = bsf->priv_data;
+
+    ff_cbs_fragment_free(ctx->cbc, &ctx->access_unit);
+    ff_cbs_close(&ctx->cbc);
+}
+
+#define OFFSET(x) offsetof(AV1MetadataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
+static const AVOption av1_metadata_options[] = {
+    { "td", "Temporal Delimiter OBU",
+        OFFSET(td), AV_OPT_TYPE_INT,
+        { .i64 = PASS }, PASS, REMOVE, FLAGS, "td" },
+    { "pass",   NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = PASS   }, .flags = FLAGS, .unit = "td" },
+    { "insert", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = INSERT }, .flags = FLAGS, .unit = "td" },
+    { "remove", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = REMOVE }, .flags = FLAGS, .unit = "td" },
+
+    { "color_primaries", "Set color primaries (section 6.4.2)",
+        OFFSET(color_primaries), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, 255, FLAGS },
+    { "transfer_characteristics", "Set transfer characteristics (section 6.4.2)",
+        OFFSET(transfer_characteristics), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, 255, FLAGS },
+    { "matrix_coefficients", "Set matrix coefficients (section 6.4.2)",
+        OFFSET(matrix_coefficients), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, 255, FLAGS },
+
+    { "color_range", "Set color range flag (section 6.4.2)",
+        OFFSET(color_range), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, 1, FLAGS, "cr" },
+    { "tv", "TV (limited) range", 0, AV_OPT_TYPE_CONST,
+        { .i64 = 0 }, .flags = FLAGS, .unit = "cr" },
+    { "pc", "PC (full) range",    0, AV_OPT_TYPE_CONST,
+        { .i64 = 1 }, .flags = FLAGS, .unit = "cr" },
+
+    { "chroma_sample_position", "Set chroma sample position (section 6.4.2)",
+        OFFSET(chroma_sample_position), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, 3, FLAGS, "csp" },
+    { "unknown",   "Unknown chroma sample position",  0, AV_OPT_TYPE_CONST,
+        { .i64 = AV1_CSP_UNKNOWN },   .flags = FLAGS, .unit = "csp" },
+    { "vertical",  "Left chroma sample position",     0, AV_OPT_TYPE_CONST,
+        { .i64 = AV1_CSP_VERTICAL },  .flags = FLAGS, .unit = "csp" },
+    { "colocated", "Top-left chroma sample position", 0, AV_OPT_TYPE_CONST,
+        { .i64 = AV1_CSP_COLOCATED }, .flags = FLAGS, .unit = "csp" },
+
+    { "tick_rate", "Set display tick rate (num_units_in_display_tick / time_scale)",
+        OFFSET(tick_rate), AV_OPT_TYPE_RATIONAL,
+        { .dbl = 0.0 }, 0, UINT_MAX, FLAGS },
+    { "num_ticks_per_picture", "Set display ticks per picture for CFR streams",
+        OFFSET(num_ticks_per_picture), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, INT_MAX, FLAGS },
+
+    { NULL }
+};
+
+static const AVClass av1_metadata_class = {
+    .class_name = "av1_metadata_bsf",
+    .item_name  = av_default_item_name,
+    .option     = av1_metadata_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const enum AVCodecID av1_metadata_codec_ids[] = {
+    AV_CODEC_ID_AV1, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_av1_metadata_bsf = {
+    .name           = "av1_metadata",
+    .priv_data_size = sizeof(AV1MetadataContext),
+    .priv_class     = &av1_metadata_class,
+    .init           = &av1_metadata_init,
+    .close          = &av1_metadata_close,
+    .filter         = &av1_metadata_filter,
+    .codec_ids      = av1_metadata_codec_ids,
+};
diff --git a/libavcodec/av1_parse.c b/libavcodec/av1_parse.c
new file mode 100644
index 0000000..cdd524b
--- /dev/null
+++ b/libavcodec/av1_parse.c
@@ -0,0 +1,107 @@
+/*
+ * AV1 common parsing code
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/mem.h"
+
+#include "av1.h"
+#include "av1_parse.h"
+#include "bytestream.h"
+
+int ff_av1_extract_obu(AV1OBU *obu, const uint8_t *buf, int length, void *logctx)
+{
+    int64_t obu_size;
+    int start_pos, type, temporal_id, spatial_id;
+    int len;
+
+    len = parse_obu_header(buf, length, &obu_size, &start_pos,
+                           &type, &temporal_id, &spatial_id);
+    if (len < 0)
+        return len;
+
+    obu->type        = type;
+    obu->temporal_id = temporal_id;
+    obu->spatial_id  = spatial_id;
+
+    obu->data     = buf + start_pos;
+    obu->size     = obu_size;
+    obu->raw_data = buf;
+    obu->raw_size = len;
+
+    av_log(logctx, AV_LOG_DEBUG,
+           "obu_type: %d, temporal_id: %d, spatial_id: %d, payload size: %d\n",
+           obu->type, obu->temporal_id, obu->spatial_id, obu->size);
+
+    return len;
+}
+
+int ff_av1_packet_split(AV1Packet *pkt, const uint8_t *buf, int length, void *logctx)
+{
+    GetByteContext bc;
+    int ret, consumed;
+
+    bytestream2_init(&bc, buf, length);
+    pkt->nb_obus = 0;
+
+    while (bytestream2_get_bytes_left(&bc) > 0) {
+        AV1OBU *obu;
+
+        if (pkt->obus_allocated < pkt->nb_obus + 1) {
+            int new_size = pkt->obus_allocated + 1;
+            AV1OBU *tmp = av_realloc_array(pkt->obus, new_size, sizeof(*tmp));
+            if (!tmp)
+                return AVERROR(ENOMEM);
+
+            pkt->obus = tmp;
+            memset(pkt->obus + pkt->obus_allocated, 0,
+                   (new_size - pkt->obus_allocated) * sizeof(*tmp));
+            pkt->obus_allocated = new_size;
+        }
+        obu = &pkt->obus[pkt->nb_obus];
+
+        consumed = ff_av1_extract_obu(obu, bc.buffer, bytestream2_get_bytes_left(&bc), logctx);
+        if (consumed < 0)
+            return consumed;
+
+        bytestream2_skip(&bc, consumed);
+
+        obu->size_bits = get_obu_bit_length(obu->data, obu->size, obu->type);
+
+        if (obu->size_bits < 0 || (!obu->size_bits && obu->type != AV1_OBU_TEMPORAL_DELIMITER)) {
+            av_log(logctx, AV_LOG_ERROR, "Invalid OBU of type %d, skipping.\n", obu->type);
+            continue;
+        }
+
+        pkt->nb_obus++;
+
+        ret = init_get_bits(&obu->gb, obu->data, obu->size_bits);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+void ff_av1_packet_uninit(AV1Packet *pkt)
+{
+    av_freep(&pkt->obus);
+    pkt->obus_allocated = 0;
+}
diff --git a/libavcodec/av1_parse.h b/libavcodec/av1_parse.h
new file mode 100644
index 0000000..864308f
--- /dev/null
+++ b/libavcodec/av1_parse.h
@@ -0,0 +1,174 @@
+/*
+ * AV1 common parsing code
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AV1_PARSE_H
+#define AVCODEC_AV1_PARSE_H
+
+#include <stdint.h>
+
+#include "av1.h"
+#include "avcodec.h"
+#include "get_bits.h"
+
+typedef struct AV1OBU {
+    /** Size of payload */
+    int size;
+    const uint8_t *data;
+
+    /**
+     * Size, in bits, of just the data, excluding the trailing_one_bit and
+     * any trailing padding.
+     */
+    int size_bits;
+
+    /** Size of entire OBU, including header */
+    int raw_size;
+    const uint8_t *raw_data;
+
+    /** GetBitContext initialized to the start of the payload */
+    GetBitContext gb;
+
+    int type;
+
+    int temporal_id;
+    int spatial_id;
+} AV1OBU;
+
+/** An input packet split into OBUs */
+typedef struct AV1Packet {
+    AV1OBU *obus;
+    int nb_obus;
+    int obus_allocated;
+} AV1Packet;
+
+/**
+ * Extract an OBU from a raw bitstream.
+ *
+ * @note This function does not copy or store any bitstream data. All
+ *       the pointers in the AV1OBU structure will be valid as long
+ *       as the input buffer also is.
+ */
+int ff_av1_extract_obu(AV1OBU *obu, const uint8_t *buf, int length,
+                       void *logctx);
+
+/**
+ * Split an input packet into OBUs.
+ *
+ * @note This function does not copy or store any bitstream data. All
+ *       the pointers in the AV1Packet structure will be valid as
+ *       long as the input buffer also is.
+ */
+int ff_av1_packet_split(AV1Packet *pkt, const uint8_t *buf, int length,
+                        void *logctx);
+
+/**
+ * Free all the allocated memory in the packet.
+ */
+void ff_av1_packet_uninit(AV1Packet *pkt);
+
+static inline int64_t leb128(GetBitContext *gb) {
+    int64_t ret = 0;
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        int byte = get_bits(gb, 8);
+        ret |= (int64_t)(byte & 0x7f) << (i * 7);
+        if (!(byte & 0x80))
+            break;
+    }
+    return ret;
+}
+
+static inline int parse_obu_header(const uint8_t *buf, int buf_size,
+                                   int64_t *obu_size, int *start_pos, int *type,
+                                   int *temporal_id, int *spatial_id)
+{
+    GetBitContext gb;
+    int ret, extension_flag, has_size_flag;
+    int64_t size;
+
+    ret = init_get_bits8(&gb, buf, FFMIN(buf_size, 2 + 8)); // OBU header fields + max leb128 length
+    if (ret < 0)
+        return ret;
+
+    if (get_bits1(&gb) != 0) // obu_forbidden_bit
+        return AVERROR_INVALIDDATA;
+
+    *type      = get_bits(&gb, 4);
+    extension_flag = get_bits1(&gb);
+    has_size_flag  = get_bits1(&gb);
+    skip_bits1(&gb); // obu_reserved_1bit
+
+    if (extension_flag) {
+        *temporal_id = get_bits(&gb, 3);
+        *spatial_id  = get_bits(&gb, 2);
+        skip_bits(&gb, 3); // extension_header_reserved_3bits
+    } else {
+        *temporal_id = *spatial_id = 0;
+    }
+
+    *obu_size  = has_size_flag ? leb128(&gb)
+                               : buf_size - 1 - extension_flag;
+
+    if (get_bits_left(&gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    *start_pos = get_bits_count(&gb) / 8;
+
+    size = *obu_size + *start_pos;
+
+    if (size > buf_size)
+        return AVERROR_INVALIDDATA;
+
+    return size;
+}
+
+static inline int get_obu_bit_length(const uint8_t *buf, int size, int type)
+{
+    int v;
+
+    /* There are no trailing bits on these */
+    if (type == AV1_OBU_TILE_GROUP || type == AV1_OBU_FRAME) {
+        if (size > INT_MAX / 8)
+            return AVERROR(ERANGE);
+        else
+            return size * 8;
+    }
+
+    while (size > 0 && buf[size - 1] == 0)
+        size--;
+
+    if (!size)
+        return 0;
+
+    v = buf[size - 1];
+
+    if (size > INT_MAX / 8)
+        return AVERROR(ERANGE);
+    size *= 8;
+
+    /* Remove the trailing_one_bit and following trailing zeros */
+    if (v)
+        size -= ff_ctz(v) + 1;
+
+    return size;
+}
+
+#endif /* AVCODEC_AV1_PARSE_H */
diff --git a/libavcodec/av1_parser.c b/libavcodec/av1_parser.c
new file mode 100644
index 0000000..bb8737a
--- /dev/null
+++ b/libavcodec/av1_parser.c
@@ -0,0 +1,229 @@
+/*
+ * AV1 parser
+ *
+ * Copyright (C) 2018 James Almer <jamrial@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "av1_parse.h"
+#include "cbs.h"
+#include "cbs_av1.h"
+#include "parser.h"
+
+typedef struct AV1ParseContext {
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment temporal_unit;
+    int parsed_extradata;
+} AV1ParseContext;
+
+static const enum AVPixelFormat pix_fmts_8bit[2][2] = {
+    { AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE },
+    { AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV420P },
+};
+static const enum AVPixelFormat pix_fmts_10bit[2][2] = {
+    { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_NONE },
+    { AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV420P10 },
+};
+static const enum AVPixelFormat pix_fmts_12bit[2][2] = {
+    { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_NONE },
+    { AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV420P12 },
+};
+
+static int av1_parser_parse(AVCodecParserContext *ctx,
+                            AVCodecContext *avctx,
+                            const uint8_t **out_data, int *out_size,
+                            const uint8_t *data, int size)
+{
+    AV1ParseContext *s = ctx->priv_data;
+    CodedBitstreamFragment *td = &s->temporal_unit;
+    CodedBitstreamAV1Context *av1 = s->cbc->priv_data;
+    int ret;
+
+    *out_data = data;
+    *out_size = size;
+
+    ctx->key_frame         = -1;
+    ctx->pict_type         = AV_PICTURE_TYPE_NONE;
+    ctx->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
+
+    s->cbc->log_ctx = avctx;
+
+    if (avctx->extradata_size && !s->parsed_extradata) {
+        s->parsed_extradata = 1;
+
+        ret = ff_cbs_read(s->cbc, td, avctx->extradata, avctx->extradata_size);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to parse extradata.\n");
+            goto end;
+        }
+
+        ff_cbs_fragment_reset(s->cbc, td);
+    }
+
+    ret = ff_cbs_read(s->cbc, td, data, size);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to parse temporal unit.\n");
+        goto end;
+    }
+
+    if (!av1->sequence_header) {
+        av_log(avctx, AV_LOG_ERROR, "No sequence header available\n");
+        goto end;
+    }
+
+    for (int i = 0; i < td->nb_units; i++) {
+        CodedBitstreamUnit *unit = &td->units[i];
+        AV1RawOBU *obu = unit->content;
+        AV1RawSequenceHeader *seq = av1->sequence_header;
+        AV1RawColorConfig *color = &seq->color_config;
+        AV1RawFrameHeader *frame;
+        int frame_type;
+
+        if (unit->type == AV1_OBU_FRAME)
+            frame = &obu->obu.frame.header;
+        else if (unit->type == AV1_OBU_FRAME_HEADER)
+            frame = &obu->obu.frame_header;
+        else
+            continue;
+
+        if (frame->show_existing_frame) {
+            AV1ReferenceFrameState *ref = &av1->ref[frame->frame_to_show_map_idx];
+
+            if (!ref->valid) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid reference frame\n");
+                goto end;
+            }
+
+            ctx->width  = ref->frame_width;
+            ctx->height = ref->frame_height;
+            frame_type  = ref->frame_type;
+
+            ctx->key_frame = 0;
+        } else if (!frame->show_frame) {
+            continue;
+        } else {
+            ctx->width  = av1->frame_width;
+            ctx->height = av1->frame_height;
+            frame_type  = frame->frame_type;
+
+            ctx->key_frame = frame_type == AV1_FRAME_KEY;
+        }
+
+        avctx->profile = seq->seq_profile;
+        avctx->level   = seq->seq_level_idx[0];
+
+        switch (frame_type) {
+        case AV1_FRAME_KEY:
+        case AV1_FRAME_INTRA_ONLY:
+            ctx->pict_type = AV_PICTURE_TYPE_I;
+            break;
+        case AV1_FRAME_INTER:
+            ctx->pict_type = AV_PICTURE_TYPE_P;
+            break;
+        case AV1_FRAME_SWITCH:
+            ctx->pict_type = AV_PICTURE_TYPE_SP;
+            break;
+        }
+        ctx->picture_structure = AV_PICTURE_STRUCTURE_FRAME;
+
+        switch (av1->bit_depth) {
+        case 8:
+            ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY8
+                                             : pix_fmts_8bit [color->subsampling_x][color->subsampling_y];
+            break;
+        case 10:
+            ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY10
+                                             : pix_fmts_10bit[color->subsampling_x][color->subsampling_y];
+            break;
+        case 12:
+            ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY12
+                                             : pix_fmts_12bit[color->subsampling_x][color->subsampling_y];
+            break;
+        }
+        av_assert2(ctx->format != AV_PIX_FMT_NONE);
+    }
+
+end:
+    ff_cbs_fragment_reset(s->cbc, td);
+
+    s->cbc->log_ctx = NULL;
+
+    return size;
+}
+
+static const CodedBitstreamUnitType decompose_unit_types[] = {
+    AV1_OBU_TEMPORAL_DELIMITER,
+    AV1_OBU_SEQUENCE_HEADER,
+    AV1_OBU_FRAME_HEADER,
+    AV1_OBU_TILE_GROUP,
+    AV1_OBU_FRAME,
+};
+
+static av_cold int av1_parser_init(AVCodecParserContext *ctx)
+{
+    AV1ParseContext *s = ctx->priv_data;
+    int ret;
+
+    ret = ff_cbs_init(&s->cbc, AV_CODEC_ID_AV1, NULL);
+    if (ret < 0)
+        return ret;
+
+    s->cbc->decompose_unit_types    = (CodedBitstreamUnitType *)decompose_unit_types;
+    s->cbc->nb_decompose_unit_types = FF_ARRAY_ELEMS(decompose_unit_types);
+
+    return 0;
+}
+
+static void av1_parser_close(AVCodecParserContext *ctx)
+{
+    AV1ParseContext *s = ctx->priv_data;
+
+    ff_cbs_fragment_free(s->cbc, &s->temporal_unit);
+    ff_cbs_close(&s->cbc);
+}
+
+static int av1_parser_split(AVCodecContext *avctx,
+                            const uint8_t *buf, int buf_size)
+{
+    AV1OBU obu;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
+
+    while (ptr < end) {
+        int len = ff_av1_extract_obu(&obu, ptr, buf_size, avctx);
+        if (len < 0)
+            break;
+
+        if (obu.type == AV1_OBU_FRAME_HEADER ||
+            obu.type == AV1_OBU_FRAME) {
+            return ptr - buf;
+        }
+        ptr      += len;
+        buf_size -= len;
+    }
+
+    return 0;
+}
+
+AVCodecParser ff_av1_parser = {
+    .codec_ids      = { AV_CODEC_ID_AV1 },
+    .priv_data_size = sizeof(AV1ParseContext),
+    .parser_init    = av1_parser_init,
+    .parser_close   = av1_parser_close,
+    .parser_parse   = av1_parser_parse,
+    .split          = av1_parser_split,
+};
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 727e1c4..0ce22ec 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 #include "libavutil/avutil.h"
 #include "libavutil/buffer.h"
 #include "libavutil/cpu.h"
+#include "libavutil/channel_layout.h"
 #include "libavutil/dict.h"
 #include "libavutil/frame.h"
 #include "libavutil/hwcontext.h"
@@ -43,7 +44,9 @@
 #include "version.h"
 
 /**
- * @defgroup libavc Encoding/Decoding Library
+ * @defgroup libavc libavcodec
+ * Encoding/Decoding Library
+ *
  * @{
  *
  * @defgroup lavc_decoding Decoding
@@ -87,7 +90,7 @@
  * - Send valid input:
  *   - For decoding, call avcodec_send_packet() to give the decoder raw
  *     compressed data in an AVPacket.
- *   - For encoding, call avcodec_send_frame() to give the decoder an AVFrame
+ *   - For encoding, call avcodec_send_frame() to give the encoder an AVFrame
  *     containing uncompressed audio or video.
  *   In both cases, it is recommended that AVPackets and AVFrames are
  *   refcounted, or libavcodec might have to copy the input data. (libavformat
@@ -140,8 +143,9 @@
  *
  * Not all codecs will follow a rigid and predictable dataflow; the only
  * guarantee is that an AVERROR(EAGAIN) return value on a send/receive call on
- * one end implies that a receive/send call on the other end will succeed. In
- * general, no codec will permit unlimited buffering of input or output.
+ * one end implies that a receive/send call on the other end will succeed, or
+ * at least will not fail with AVERROR(EAGAIN). In general, no codec will
+ * permit unlimited buffering of input or output.
  *
  * This API replaces the following legacy functions:
  * - avcodec_decode_video2() and avcodec_decode_audio4():
@@ -150,7 +154,7 @@
  *   Unlike with the old video decoding API, multiple frames might result from
  *   a packet. For audio, splitting the input packet into frames by partially
  *   decoding packets becomes transparent to the API user. You never need to
- *   feed an AVPacket to the API twice (unless it is rejected with EAGAIN - then
+ *   feed an AVPacket to the API twice (unless it is rejected with AVERROR(EAGAIN) - then
  *   no data was read from the packet).
  *   Additionally, sending a flush/draining packet is required only once.
  * - avcodec_encode_video2()/avcodec_encode_audio2():
@@ -161,15 +165,15 @@
  * - The new API does not handle subtitles yet.
  *
  * Mixing new and old function calls on the same AVCodecContext is not allowed,
- * and will result in arbitrary behavior.
+ * and will result in undefined behavior.
  *
  * Some codecs might require using the new API; using the old API will return
  * an error when calling it. All codecs support the new API.
  *
- * A codec is not allowed to return EAGAIN for both sending and receiving. This
+ * A codec is not allowed to return AVERROR(EAGAIN) for both sending and receiving. This
  * would be an invalid state, which could put the codec user into an endless
  * loop. The API has no concept of time either: it cannot happen that trying to
- * do avcodec_send_packet() results in EAGAIN, but a repeated call 1 second
+ * do avcodec_send_packet() results in AVERROR(EAGAIN), but a repeated call 1 second
  * later accepts the packet (with no other receive/flush API calls involved).
  * The API is a strict state machine, and the passage of time is not supposed
  * to influence it. Some timing-dependent behavior might still be deemed
@@ -178,7 +182,7 @@
  * avoided that the current state is "unstable" and can "flip-flop" between
  * the send/receive APIs allowing progress. For example, it's not allowed that
  * the codec randomly decides that it actually wants to consume a packet now
- * instead of returning a frame, after it just returned EAGAIN on an
+ * instead of returning a frame, after it just returned AVERROR(EAGAIN) on an
  * avcodec_send_packet() call.
  * @}
  */
@@ -202,8 +206,8 @@
  * details.
  *
  * If you add a codec ID to this list, add it so that
- * 1. no value of a existing codec ID changes (that would break ABI),
- * 2. it is as close as possible to similar codecs.
+ * 1. no value of an existing codec ID changes (that would break ABI),
+ * 2. it is as close as possible to similar codecs
  *
  * After adding new codec IDs, do not forget to add an entry to the codec
  * descriptor list and bump libavcodec minor version.
@@ -348,7 +352,7 @@ enum AVCodecID {
     AV_CODEC_ID_ANM,
     AV_CODEC_ID_BINKVIDEO,
     AV_CODEC_ID_IFF_ILBM,
-    AV_CODEC_ID_IFF_BYTERUN1,
+#define AV_CODEC_ID_IFF_BYTERUN1 AV_CODEC_ID_IFF_ILBM
     AV_CODEC_ID_KGV1,
     AV_CODEC_ID_YOP,
     AV_CODEC_ID_VP8,
@@ -386,6 +390,7 @@ enum AVCodecID {
     AV_CODEC_ID_WEBP,
     AV_CODEC_ID_HNM4_VIDEO,
     AV_CODEC_ID_HEVC,
+#define AV_CODEC_ID_H265 AV_CODEC_ID_HEVC
     AV_CODEC_ID_FIC,
     AV_CODEC_ID_ALIAS_PIX,
     AV_CODEC_ID_BRENDER_PIX,
@@ -404,13 +409,51 @@ enum AVCodecID {
     AV_CODEC_ID_DXV,
     AV_CODEC_ID_SCREENPRESSO,
     AV_CODEC_ID_RSCC,
-    AV_CODEC_ID_MAGICYUV,
+    AV_CODEC_ID_AVS2,
+
+    AV_CODEC_ID_Y41P = 0x8000,
+    AV_CODEC_ID_AVRP,
+    AV_CODEC_ID_012V,
+    AV_CODEC_ID_AVUI,
+    AV_CODEC_ID_AYUV,
+    AV_CODEC_ID_TARGA_Y216,
+    AV_CODEC_ID_V308,
+    AV_CODEC_ID_V408,
+    AV_CODEC_ID_YUV4,
+    AV_CODEC_ID_AVRN,
+    AV_CODEC_ID_CPIA,
+    AV_CODEC_ID_XFACE,
+    AV_CODEC_ID_SNOW,
+    AV_CODEC_ID_SMVJPEG,
+    AV_CODEC_ID_APNG,
+    AV_CODEC_ID_DAALA,
+    AV_CODEC_ID_CFHD,
     AV_CODEC_ID_TRUEMOTION2RT,
-    AV_CODEC_ID_AV1,
+    AV_CODEC_ID_M101,
+    AV_CODEC_ID_MAGICYUV,
+    AV_CODEC_ID_SHEERVIDEO,
+    AV_CODEC_ID_YLC,
+    AV_CODEC_ID_PSD,
     AV_CODEC_ID_PIXLET,
-    AV_CODEC_ID_CFHD,
+    AV_CODEC_ID_SPEEDHQ,
     AV_CODEC_ID_FMVC,
+    AV_CODEC_ID_SCPR,
     AV_CODEC_ID_CLEARVIDEO,
+    AV_CODEC_ID_XPM,
+    AV_CODEC_ID_AV1,
+    AV_CODEC_ID_BITPACKED,
+    AV_CODEC_ID_MSCC,
+    AV_CODEC_ID_SRGC,
+    AV_CODEC_ID_SVG,
+    AV_CODEC_ID_GDV,
+    AV_CODEC_ID_FITS,
+    AV_CODEC_ID_IMM4,
+    AV_CODEC_ID_PROSUMER,
+    AV_CODEC_ID_MWSC,
+    AV_CODEC_ID_WCMV,
+    AV_CODEC_ID_RASC,
+    AV_CODEC_ID_HYMT,
+    AV_CODEC_ID_ARBC,
 
     /* various PCM "codecs" */
     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
@@ -446,6 +489,12 @@ enum AVCodecID {
     AV_CODEC_ID_PCM_S32LE_PLANAR,
     AV_CODEC_ID_PCM_S16BE_PLANAR,
 
+    AV_CODEC_ID_PCM_S64LE = 0x10800,
+    AV_CODEC_ID_PCM_S64BE,
+    AV_CODEC_ID_PCM_F16LE,
+    AV_CODEC_ID_PCM_F24LE,
+    AV_CODEC_ID_PCM_VIDC,
+
     /* various ADPCM codecs */
     AV_CODEC_ID_ADPCM_IMA_QT = 0x11000,
     AV_CODEC_ID_ADPCM_IMA_WAV,
@@ -479,6 +528,17 @@ enum AVCodecID {
     AV_CODEC_ID_ADPCM_IMA_APC,
     AV_CODEC_ID_ADPCM_VIMA,
 
+    AV_CODEC_ID_ADPCM_AFC = 0x11800,
+    AV_CODEC_ID_ADPCM_IMA_OKI,
+    AV_CODEC_ID_ADPCM_DTK,
+    AV_CODEC_ID_ADPCM_IMA_RAD,
+    AV_CODEC_ID_ADPCM_G726LE,
+    AV_CODEC_ID_ADPCM_THP_LE,
+    AV_CODEC_ID_ADPCM_PSX,
+    AV_CODEC_ID_ADPCM_AICA,
+    AV_CODEC_ID_ADPCM_IMA_DAT4,
+    AV_CODEC_ID_ADPCM_MTAF,
+
     /* AMR */
     AV_CODEC_ID_AMR_NB = 0x12000,
     AV_CODEC_ID_AMR_WB,
@@ -493,6 +553,9 @@ enum AVCodecID {
     AV_CODEC_ID_XAN_DPCM,
     AV_CODEC_ID_SOL_DPCM,
 
+    AV_CODEC_ID_SDX2_DPCM = 0x14800,
+    AV_CODEC_ID_GREMLIN_DPCM,
+
     /* audio codecs */
     AV_CODEC_ID_MP2 = 0x15000,
     AV_CODEC_ID_MP3, ///< preferred ID for decoding MPEG audio layer 1, 2 or 3
@@ -561,6 +624,30 @@ enum AVCodecID {
     AV_CODEC_ID_PAF_AUDIO,
     AV_CODEC_ID_ON2AVC,
     AV_CODEC_ID_DSS_SP,
+    AV_CODEC_ID_CODEC2,
+
+    AV_CODEC_ID_FFWAVESYNTH = 0x15800,
+    AV_CODEC_ID_SONIC,
+    AV_CODEC_ID_SONIC_LS,
+    AV_CODEC_ID_EVRC,
+    AV_CODEC_ID_SMV,
+    AV_CODEC_ID_DSD_LSBF,
+    AV_CODEC_ID_DSD_MSBF,
+    AV_CODEC_ID_DSD_LSBF_PLANAR,
+    AV_CODEC_ID_DSD_MSBF_PLANAR,
+    AV_CODEC_ID_4GV,
+    AV_CODEC_ID_INTERPLAY_ACM,
+    AV_CODEC_ID_XMA1,
+    AV_CODEC_ID_XMA2,
+    AV_CODEC_ID_DST,
+    AV_CODEC_ID_ATRAC3AL,
+    AV_CODEC_ID_ATRAC3PAL,
+    AV_CODEC_ID_DOLBY_E,
+    AV_CODEC_ID_APTX,
+    AV_CODEC_ID_APTX_HD,
+    AV_CODEC_ID_SBC,
+    AV_CODEC_ID_ATRAC9,
+    AV_CODEC_ID_HCOM,
 
     /* subtitle codecs */
     AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
@@ -574,10 +661,39 @@ enum AVCodecID {
     AV_CODEC_ID_DVB_TELETEXT,
     AV_CODEC_ID_SRT,
 
+    AV_CODEC_ID_MICRODVD   = 0x17800,
+    AV_CODEC_ID_EIA_608,
+    AV_CODEC_ID_JACOSUB,
+    AV_CODEC_ID_SAMI,
+    AV_CODEC_ID_REALTEXT,
+    AV_CODEC_ID_STL,
+    AV_CODEC_ID_SUBVIEWER1,
+    AV_CODEC_ID_SUBVIEWER,
+    AV_CODEC_ID_SUBRIP,
+    AV_CODEC_ID_WEBVTT,
+    AV_CODEC_ID_MPL2,
+    AV_CODEC_ID_VPLAYER,
+    AV_CODEC_ID_PJS,
+    AV_CODEC_ID_ASS,
+    AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+    AV_CODEC_ID_TTML,
+    AV_CODEC_ID_ARIB_CAPTION,
+
     /* other specific kind of codecs (generally used for attachments) */
     AV_CODEC_ID_FIRST_UNKNOWN = 0x18000,           ///< A dummy ID pointing at the start of various fake codecs.
     AV_CODEC_ID_TTF = 0x18000,
 
+    AV_CODEC_ID_SCTE_35, ///< Contain timestamp estimated through PCR of program stream.
+    AV_CODEC_ID_BINTEXT    = 0x18800,
+    AV_CODEC_ID_XBIN,
+    AV_CODEC_ID_IDF,
+    AV_CODEC_ID_OTF,
+    AV_CODEC_ID_SMPTE_KLV,
+    AV_CODEC_ID_DVD_NAV,
+    AV_CODEC_ID_TIMED_ID3,
+    AV_CODEC_ID_BIN_DATA,
+
+
     AV_CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like AV_CODEC_ID_NONE) but lavf should attempt to identify it
 
     AV_CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS
@@ -611,6 +727,12 @@ typedef struct AVCodecDescriptor {
      */
     int             props;
     /**
+     * MIME type(s) associated with the codec.
+     * May be NULL; if not, a NULL-terminated array of MIME types.
+     * The first item is always non-NULL and is the preferred MIME type.
+     */
+    const char *const *mime_types;
+    /**
      * If non-NULL, an array of profiles recognized for this codec.
      * Terminated with FF_PROFILE_UNKNOWN.
      */
@@ -619,7 +741,7 @@ typedef struct AVCodecDescriptor {
 
 /**
  * Codec uses only intra compression.
- * Video codecs only.
+ * Video and audio codecs only.
  */
 #define AV_CODEC_PROP_INTRA_ONLY    (1 << 0)
 /**
@@ -642,6 +764,16 @@ typedef struct AVCodecDescriptor {
  * equal.
  */
 #define AV_CODEC_PROP_REORDER       (1 << 3)
+/**
+ * Subtitle codec is bitmap based
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->pict field.
+ */
+#define AV_CODEC_PROP_BITMAP_SUB    (1 << 16)
+/**
+ * Subtitle codec is text based.
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->ass field.
+ */
+#define AV_CODEC_PROP_TEXT_SUB      (1 << 17)
 
 /**
  * @ingroup lavc_decoding
@@ -651,7 +783,7 @@ typedef struct AVCodecDescriptor {
  * Note: If the first 23 bits of the additional bytes are not 0, then damaged
  * MPEG bitstreams could cause overread and segfault.
  */
-#define AV_INPUT_BUFFER_PADDING_SIZE 8
+#define AV_INPUT_BUFFER_PADDING_SIZE 64
 
 /**
  * @ingroup lavc_encoding
@@ -670,6 +802,7 @@ enum AVDiscard{
     AVDISCARD_DEFAULT =  0, ///< discard useless packets like 0 size packets in avi
     AVDISCARD_NONREF  =  8, ///< discard all non reference
     AVDISCARD_BIDIR   = 16, ///< discard all bidirectional frames
+    AVDISCARD_NONINTRA= 24, ///< discard all non intra frames
     AVDISCARD_NONKEY  = 32, ///< discard all frames except keyframes
     AVDISCARD_ALL     = 48, ///< discard all
 };
@@ -773,7 +906,7 @@ typedef struct RcOverride{
  * interlaced motion estimation
  */
 #define AV_CODEC_FLAG_INTERLACED_ME   (1 << 29)
-#define AV_CODEC_FLAG_CLOSED_GOP      (1 << 31)
+#define AV_CODEC_FLAG_CLOSED_GOP      (1U << 31)
 
 /**
  * Allow non spec compliant speedup tricks.
@@ -787,6 +920,12 @@ typedef struct RcOverride{
  * Place global headers at every keyframe instead of in extradata.
  */
 #define AV_CODEC_FLAG2_LOCAL_HEADER   (1 <<  3)
+
+/**
+ * timecode is in drop frame format. DEPRECATED!!!!
+ */
+#define AV_CODEC_FLAG2_DROP_FRAME_TIMECODE (1 << 13)
+
 /**
  * Input bitstream might be truncated at a packet boundaries
  * instead of only at frame boundaries.
@@ -797,6 +936,23 @@ typedef struct RcOverride{
  */
 #define AV_CODEC_FLAG2_IGNORE_CROP    (1 << 16)
 
+/**
+ * Show all frames before the first keyframe
+ */
+#define AV_CODEC_FLAG2_SHOW_ALL       (1 << 22)
+/**
+ * Export motion vectors through frame side data
+ */
+#define AV_CODEC_FLAG2_EXPORT_MVS     (1 << 28)
+/**
+ * Do not skip samples and export skip information as frame side data
+ */
+#define AV_CODEC_FLAG2_SKIP_MANUAL    (1 << 29)
+/**
+ * Do not reset ASS ReadOrder field on flush (subtitles decoding)
+ */
+#define AV_CODEC_FLAG2_RO_FLUSH_NOOP  (1 << 30)
+
 /* Unsupported options :
  *              Syntax Arithmetic coding (SAC)
  *              Reference Picture Selection
@@ -844,6 +1000,7 @@ typedef struct RcOverride{
  * This can be used to prevent truncation of the last audio samples.
  */
 #define AV_CODEC_CAP_SMALL_LAST_FRAME    (1 <<  6)
+
 /**
  * Codec can output multiple frames per AVPacket
  * Normally demuxers return one frame at a time, demuxers which do not do
@@ -885,33 +1042,52 @@ typedef struct RcOverride{
  * Audio encoder supports receiving a different number of samples in each call.
  */
 #define AV_CODEC_CAP_VARIABLE_FRAME_SIZE (1 << 16)
+/**
+ * Decoder is not a preferred choice for probing.
+ * This indicates that the decoder is not a good choice for probing.
+ * It could for example be an expensive to spin up hardware decoder,
+ * or it could simply not provide a lot of useful information about
+ * the stream.
+ * A decoder marked with this flag should only be used as last resort
+ * choice for probing.
+ */
+#define AV_CODEC_CAP_AVOID_PROBING       (1 << 17)
+/**
+ * Codec is intra only.
+ */
+#define AV_CODEC_CAP_INTRA_ONLY       0x40000000
+/**
+ * Codec is lossless.
+ */
+#define AV_CODEC_CAP_LOSSLESS         0x80000000
 
 /**
  * Codec is backed by a hardware implementation. Typically used to
- * identify a non-hwaccel hardware decoder.
+ * identify a non-hwaccel hardware decoder. For information about hwaccels, use
+ * avcodec_get_hw_config() instead.
  */
-#define AV_CODEC_CAP_HARDWARE            (1 << 17)
+#define AV_CODEC_CAP_HARDWARE            (1 << 18)
 
 /**
  * Codec is potentially backed by a hardware implementation, but not
  * necessarily. This is used instead of AV_CODEC_CAP_HARDWARE, if the
  * implementation provides some sort of internal fallback.
  */
-#define AV_CODEC_CAP_HYBRID              (1 << 18)
+#define AV_CODEC_CAP_HYBRID              (1 << 19)
 
 /**
  * This codec takes the reordered_opaque field from input AVFrames
  * and returns it in the corresponding field in AVCodecContext after
  * encoding.
  */
-#define AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE (1 << 19)
+#define AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE (1 << 20)
 
 /**
  * Pan Scan area.
  * This specifies the area which should be displayed.
  * Note there may be multiple such areas for one frame.
  */
-typedef struct AVPanScan{
+typedef struct AVPanScan {
     /**
      * id
      * - encoding: Set by user.
@@ -933,7 +1109,7 @@ typedef struct AVPanScan{
      * - decoding: Set by libavcodec.
      */
     int16_t position[3][2];
-}AVPanScan;
+} AVPanScan;
 
 /**
  * This structure describes the bitrate properties of an encoded bitstream. It
@@ -945,17 +1121,29 @@ typedef struct AVCPBProperties {
      * Maximum bitrate of the stream, in bits per second.
      * Zero if unknown or unspecified.
      */
+#if FF_API_UNSANITIZED_BITRATES
     int max_bitrate;
+#else
+    int64_t max_bitrate;
+#endif
     /**
      * Minimum bitrate of the stream, in bits per second.
      * Zero if unknown or unspecified.
      */
+#if FF_API_UNSANITIZED_BITRATES
     int min_bitrate;
+#else
+    int64_t min_bitrate;
+#endif
     /**
      * Average bitrate of the stream, in bits per second.
      * Zero if unknown or unspecified.
      */
+#if FF_API_UNSANITIZED_BITRATES
     int avg_bitrate;
+#else
+    int64_t avg_bitrate;
+#endif
 
     /**
      * The size of the buffer to which the ratecontrol is applied, in bits.
@@ -1067,11 +1255,16 @@ enum AVPacketSideDataType {
     AV_PKT_DATA_AUDIO_SERVICE_TYPE,
 
     /**
-     * This side data contains an integer value representing the quality
-     * factor of the compressed frame. Allowed range is between 1 (good)
-     * and FF_LAMBDA_MAX (bad).
+     * This side data contains quality related information from the encoder.
+     * @code
+     * u32le quality factor of the compressed frame. Allowed range is between 1 (good) and FF_LAMBDA_MAX (bad).
+     * u8    picture type
+     * u8    error count
+     * u16   reserved
+     * u64le[error count] sum of squared differences between encoder in and output
+     * @endcode
      */
-    AV_PKT_DATA_QUALITY_FACTOR,
+    AV_PKT_DATA_QUALITY_STATS,
 
     /**
      * This side data contains an integer value representing the stream index
@@ -1087,12 +1280,134 @@ enum AVPacketSideDataType {
     AV_PKT_DATA_CPB_PROPERTIES,
 
     /**
+     * Recommmends skipping the specified number of samples
+     * @code
+     * u32le number of samples to skip from start of this packet
+     * u32le number of samples to skip from end of this packet
+     * u8    reason for start skip
+     * u8    reason for end   skip (0=padding silence, 1=convergence)
+     * @endcode
+     */
+    AV_PKT_DATA_SKIP_SAMPLES,
+
+    /**
+     * An AV_PKT_DATA_JP_DUALMONO side data packet indicates that
+     * the packet may contain "dual mono" audio specific to Japanese DTV
+     * and if it is true, recommends only the selected channel to be used.
+     * @code
+     * u8    selected channels (0=mail/left, 1=sub/right, 2=both)
+     * @endcode
+     */
+    AV_PKT_DATA_JP_DUALMONO,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop.
+     */
+    AV_PKT_DATA_STRINGS_METADATA,
+
+    /**
+     * Subtitle event position
+     * @code
+     * u32le x1
+     * u32le y1
+     * u32le x2
+     * u32le y2
+     * @endcode
+     */
+    AV_PKT_DATA_SUBTITLE_POSITION,
+
+    /**
+     * Data found in BlockAdditional element of matroska container. There is
+     * no end marker for the data, so it is required to rely on the side data
+     * size to recognize the end. 8 byte id (as found in BlockAddId) followed
+     * by data.
+     */
+    AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+
+    /**
+     * The optional first identifier line of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_IDENTIFIER,
+
+    /**
+     * The optional settings (rendering instructions) that immediately
+     * follow the timestamp specifier of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_SETTINGS,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop. This
+     * side data includes updated metadata which appeared in the stream.
+     */
+    AV_PKT_DATA_METADATA_UPDATE,
+
+    /**
+     * MPEGTS stream ID as uint8_t, this is required to pass the stream ID
+     * information from the demuxer to the corresponding muxer.
+     */
+    AV_PKT_DATA_MPEGTS_STREAM_ID,
+
+    /**
+     * Mastering display metadata (based on SMPTE-2086:2014). This metadata
+     * should be associated with a video stream and contains data in the form
+     * of the AVMasteringDisplayMetadata struct.
+     */
+    AV_PKT_DATA_MASTERING_DISPLAY_METADATA,
+
+    /**
      * This side data should be associated with a video stream and corresponds
      * to the AVSphericalMapping structure.
      */
     AV_PKT_DATA_SPHERICAL,
+
+    /**
+     * Content light level (based on CTA-861.3). This metadata should be
+     * associated with a video stream and contains data in the form of the
+     * AVContentLightMetadata struct.
+     */
+    AV_PKT_DATA_CONTENT_LIGHT_LEVEL,
+
+    /**
+     * ATSC A53 Part 4 Closed Captions. This metadata should be associated with
+     * a video stream. A53 CC bitstream is stored as uint8_t in AVPacketSideData.data.
+     * The number of bytes of CC data is AVPacketSideData.size.
+     */
+    AV_PKT_DATA_A53_CC,
+
+    /**
+     * This side data is encryption initialization data.
+     * The format is not part of ABI, use av_encryption_init_info_* methods to
+     * access.
+     */
+    AV_PKT_DATA_ENCRYPTION_INIT_INFO,
+
+    /**
+     * This side data contains encryption info for how to decrypt the packet.
+     * The format is not part of ABI, use av_encryption_info_* methods to access.
+     */
+    AV_PKT_DATA_ENCRYPTION_INFO,
+
+    /**
+     * Active Format Description data consisting of a single byte as specified
+     * in ETSI TS 101 154 using AVActiveFormatDescription enum.
+     */
+    AV_PKT_DATA_AFD,
+
+    /**
+     * The number of side data types.
+     * This is not part of the public API/ABI in the sense that it may
+     * change when new side data types are added.
+     * This must stay the last enum value.
+     * If its value becomes huge, some code using it
+     * needs to be updated as it assumes it to be smaller than other limits.
+     */
+    AV_PKT_DATA_NB
 };
 
+#define AV_PKT_DATA_QUALITY_FACTOR AV_PKT_DATA_QUALITY_STATS //DEPRECATED
+
 typedef struct AVPacketSideData {
     uint8_t *data;
     int      size;
@@ -1109,7 +1424,7 @@ typedef struct AVPacketSideData {
  * packets, with no compressed data, containing only side data
  * (e.g. to update some stream parameters at the end of encoding).
  *
- * AVPacket is one of the few structs in Libav, whose size is a part of public
+ * AVPacket is one of the few structs in FFmpeg, whose size is a part of public
  * ABI. Thus it may be allocated on stack and no new fields can be added to it
  * without libavcodec and libavformat major bump.
  *
@@ -1184,6 +1499,25 @@ typedef struct AVPacket {
 } AVPacket;
 #define AV_PKT_FLAG_KEY     0x0001 ///< The packet contains a keyframe
 #define AV_PKT_FLAG_CORRUPT 0x0002 ///< The packet content is corrupted
+/**
+ * Flag is used to discard packets which are required to maintain valid
+ * decoder state but are not required for output and should be dropped
+ * after decoding.
+ **/
+#define AV_PKT_FLAG_DISCARD   0x0004
+/**
+ * The packet comes from a trusted source.
+ *
+ * Otherwise-unsafe constructs such as arbitrary pointers to data
+ * outside the packet may be followed.
+ */
+#define AV_PKT_FLAG_TRUSTED   0x0008
+/**
+ * Flag is used to indicate packets that contain frames that can
+ * be discarded by the decoder.  I.e. Non-reference frames.
+ */
+#define AV_PKT_FLAG_DISPOSABLE 0x0010
+
 
 enum AVSideDataParamChangeFlags {
     AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT  = 0x0001,
@@ -1211,6 +1545,12 @@ enum AVFieldOrder {
  * New fields can be added to the end with minor version bumps.
  * Removal, reordering and changes to existing fields require a major
  * version bump.
+ * You can use AVOptions (av_opt* / av_set/get*()) to access these fields from user
+ * applications.
+ * The name string for AVOptions options matches the associated command line
+ * parameter name and can be found in libavcodec/options_table.h
+ * The AVOption/command line parameter names differ in some cases from the C
+ * structure field names for historic reasons or brevity.
  * sizeof(AVCodecContext) must not be used outside libav*.
  */
 typedef struct AVCodecContext {
@@ -1260,9 +1600,10 @@ typedef struct AVCodecContext {
     /**
      * the average bitrate
      * - encoding: Set by user; unused for constant quantizer encoding.
-     * - decoding: Set by libavcodec. 0 or some bitrate if this info is available in the stream.
+     * - decoding: Set by user, may be overwritten by libavcodec
+     *             if this info is available in the stream
      */
-    int bit_rate;
+    int64_t bit_rate;
 
     /**
      * number of bits the bitstream is allowed to diverge from the reference.
@@ -1309,6 +1650,7 @@ typedef struct AVCodecContext {
      * The allocated memory should be AV_INPUT_BUFFER_PADDING_SIZE bytes larger
      * than extradata_size to avoid problems if it is read with the bitstream reader.
      * The bytewise contents of extradata must not depend on the architecture or CPU endianness.
+     * Must be allocated with the av_malloc() family of functions.
      * - encoding: Set/allocated/freed by libavcodec.
      * - decoding: Set/allocated/freed by user.
      */
@@ -1320,6 +1662,16 @@ typedef struct AVCodecContext {
      * of which frame timestamps are represented. For fixed-fps content,
      * timebase should be 1/framerate and timestamp increments should be
      * identically 1.
+     * This often, but not always is the inverse of the frame rate or field rate
+     * for video. 1/time_base is not the average frame rate if the frame rate is not
+     * constant.
+     *
+     * Like containers, elementary streams also can store timestamps, 1/time_base
+     * is the unit in which these timestamps are specified.
+     * As example of such codec time base see ISO/IEC 14496-2:2001(E)
+     * vop_time_increment_resolution and fixed_vop_rate
+     * (fixed_vop_rate == 0 implies that it is different from the framerate)
+     *
      * - encoding: MUST be set by user.
      * - decoding: the use of this field for decoding is deprecated.
      *             Use framerate instead.
@@ -1338,6 +1690,11 @@ typedef struct AVCodecContext {
     /**
      * Codec delay.
      *
+     * Encoding: Number of frames delay there will be from the encoder input to
+     *           the decoder output. (we assume the decoder matches the spec)
+     * Decoding: Number of frames delay in addition to what a standard decoder
+     *           as specified in the spec would produce.
+     *
      * Video:
      *   Number of frames the decoded output will be delayed relative to the
      *   encoded input.
@@ -1373,7 +1730,7 @@ typedef struct AVCodecContext {
 
     /**
      * Bitstream width / height, may be different from width/height e.g. when
-     * the decoded frame is cropped before being output.
+     * the decoded frame is cropped before being output or lowres is enabled.
      *
      * @note Those field may not match the value of the last
      * AVFrame output by avcodec_receive_frame() due frame
@@ -1601,20 +1958,23 @@ typedef struct AVCodecContext {
      * - decoding: unused
      */
     int ildct_cmp;
-#define FF_CMP_SAD    0
-#define FF_CMP_SSE    1
-#define FF_CMP_SATD   2
-#define FF_CMP_DCT    3
-#define FF_CMP_PSNR   4
-#define FF_CMP_BIT    5
-#define FF_CMP_RD     6
-#define FF_CMP_ZERO   7
-#define FF_CMP_VSAD   8
-#define FF_CMP_VSSE   9
-#define FF_CMP_NSSE   10
-#define FF_CMP_DCTMAX 13
-#define FF_CMP_DCT264 14
-#define FF_CMP_CHROMA 256
+#define FF_CMP_SAD          0
+#define FF_CMP_SSE          1
+#define FF_CMP_SATD         2
+#define FF_CMP_DCT          3
+#define FF_CMP_PSNR         4
+#define FF_CMP_BIT          5
+#define FF_CMP_RD           6
+#define FF_CMP_ZERO         7
+#define FF_CMP_VSAD         8
+#define FF_CMP_VSSE         9
+#define FF_CMP_NSSE         10
+#define FF_CMP_W53          11
+#define FF_CMP_W97          12
+#define FF_CMP_DCTMAX       13
+#define FF_CMP_DCT264       14
+#define FF_CMP_MEDIAN_SAD   15
+#define FF_CMP_CHROMA       256
 
     /**
      * ME diamond size & shape
@@ -1713,7 +2073,7 @@ typedef struct AVCodecContext {
     /**
      * precision of the intra DC coefficient - 8
      * - encoding: Set by user.
-     * - decoding: unused
+     * - decoding: Set by libavcodec
      */
     int intra_dc_precision;
 
@@ -1844,7 +2204,7 @@ typedef struct AVCodecContext {
 
     /** Field order
      * - encoding: set by libavcodec
-     * - decoding: Set by libavcodec
+     * - decoding: Set by user.
      */
     enum AVFieldOrder field_order;
 
@@ -1898,7 +2258,7 @@ typedef struct AVCodecContext {
     /**
      * Audio channel layout.
      * - encoding: set by user.
-     * - decoding: set by libavcodec.
+     * - decoding: set by user, may be overwritten by libavcodec.
      */
     uint64_t channel_layout;
 
@@ -1917,9 +2277,10 @@ typedef struct AVCodecContext {
     enum AVAudioServiceType audio_service_type;
 
     /**
-     * Used to request a sample format from the decoder.
-     * - encoding: unused.
+     * desired sample format
+     * - encoding: Not used.
      * - decoding: Set by user.
+     * Decoder will decode to this format if it can.
      */
     enum AVSampleFormat request_sample_fmt;
 
@@ -1977,6 +2338,8 @@ typedef struct AVCodecContext {
      * avcodec_align_dimensions2() should be used to find the required width and
      * height, as they normally need to be rounded up to the next multiple of 16.
      *
+     * Some decoders do not support linesizes changing between frames.
+     *
      * If frame multithreading is used and thread_safe_callbacks is set,
      * this callback may be called from a different thread, but not from more
      * than one at once. Does not need to be reentrant.
@@ -2016,7 +2379,8 @@ typedef struct AVCodecContext {
      * - encoding: unused
      * - decoding: set by the caller before avcodec_open2().
      */
-    attribute_deprecated int refcounted_frames;
+    attribute_deprecated
+    int refcounted_frames;
 
     /* - encoding parameters */
     float qcompress;  ///< amount of qscale change between easy & hard scenes (0.0-1.0)
@@ -2061,16 +2425,16 @@ typedef struct AVCodecContext {
     /**
      * maximum bitrate
      * - encoding: Set by user.
-     * - decoding: unused
+     * - decoding: Set by user, may be overwritten by libavcodec.
      */
-    int rc_max_rate;
+    int64_t rc_max_rate;
 
     /**
      * minimum bitrate
      * - encoding: Set by user.
      * - decoding: unused
      */
-    int rc_min_rate;
+    int64_t rc_min_rate;
 
     /**
      * Ratecontrol attempt to use, at maximum, <value> of what can be used without an underflow.
@@ -2234,6 +2598,7 @@ typedef struct AVCodecContext {
 #define FF_BUG_DC_CLIP          4096
 #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
 #define FF_BUG_TRUNCATED       16384
+#define FF_BUG_IEDGE           32768
 
     /**
      * strictly follow the standard (MPEG-4, ...).
@@ -2262,6 +2627,7 @@ typedef struct AVCodecContext {
     int error_concealment;
 #define FF_EC_GUESS_MVS   1
 #define FF_EC_DEBLOCK     2
+#define FF_EC_FAVOR_INTER 256
 
     /**
      * debug
@@ -2274,14 +2640,38 @@ typedef struct AVCodecContext {
 #define FF_DEBUG_BITSTREAM   4
 #define FF_DEBUG_MB_TYPE     8
 #define FF_DEBUG_QP          16
+#if FF_API_DEBUG_MV
+/**
+ * @deprecated this option does nothing
+ */
+#define FF_DEBUG_MV          32
+#endif
 #define FF_DEBUG_DCT_COEFF   0x00000040
 #define FF_DEBUG_SKIP        0x00000080
 #define FF_DEBUG_STARTCODE   0x00000100
 #define FF_DEBUG_ER          0x00000400
 #define FF_DEBUG_MMCO        0x00000800
 #define FF_DEBUG_BUGS        0x00001000
+#if FF_API_DEBUG_MV
+#define FF_DEBUG_VIS_QP      0x00002000
+#define FF_DEBUG_VIS_MB_TYPE 0x00004000
+#endif
 #define FF_DEBUG_BUFFERS     0x00008000
 #define FF_DEBUG_THREADS     0x00010000
+#define FF_DEBUG_GREEN_MD    0x00800000
+#define FF_DEBUG_NOMC        0x01000000
+
+#if FF_API_DEBUG_MV
+    /**
+     * debug
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug_mv;
+#define FF_DEBUG_VIS_MV_P_FOR  0x00000001 // visualize forward predicted MVs of P-frames
+#define FF_DEBUG_VIS_MV_B_FOR  0x00000002 // visualize forward predicted MVs of B-frames
+#define FF_DEBUG_VIS_MV_B_BACK 0x00000004 // visualize backward predicted MVs of B-frames
+#endif
 
     /**
      * Error recognition; may misdetect some more or less valid parts as errors.
@@ -2297,9 +2687,15 @@ typedef struct AVCodecContext {
  * decoder returning an error.
  */
 #define AV_EF_CRCCHECK  (1<<0)
-#define AV_EF_BITSTREAM (1<<1)
-#define AV_EF_BUFFER    (1<<2)
-#define AV_EF_EXPLODE   (1<<3)
+#define AV_EF_BITSTREAM (1<<1)          ///< detect bitstream specification deviations
+#define AV_EF_BUFFER    (1<<2)          ///< detect improper bitstream length
+#define AV_EF_EXPLODE   (1<<3)          ///< abort decoding on minor error detection
+
+#define AV_EF_IGNORE_ERR (1<<15)        ///< ignore errors and continue
+#define AV_EF_CAREFUL    (1<<16)        ///< consider things that violate the spec, are fast to calculate and have not been seen in the wild as errors
+#define AV_EF_COMPLIANT  (1<<17)        ///< consider all spec non compliances as errors
+#define AV_EF_AGGRESSIVE (1<<18)        ///< consider things that a sane encoder should not do as an error
+
 
     /**
      * opaque 64-bit number (generally a PTS) that will be reordered and
@@ -2323,8 +2719,8 @@ typedef struct AVCodecContext {
      * Hardware accelerator context.
      * For some hardware accelerators, a global context needs to be
      * provided by the user. In that case, this holds display-dependent
-     * data Libav cannot instantiate itself. Please refer to the
-     * Libav HW accelerator documentation to know how to fill this
+     * data FFmpeg cannot instantiate itself. Please refer to the
+     * FFmpeg HW accelerator documentation to know how to fill this
      * is. e.g. for VA API, this is a struct vaapi_context.
      * - encoding: unused
      * - decoding: Set by user
@@ -2369,6 +2765,8 @@ typedef struct AVCodecContext {
 #define FF_IDCT_SIMPLEARMV6   17
 #define FF_IDCT_FAAN          20
 #define FF_IDCT_SIMPLENEON    22
+#define FF_IDCT_NONE          24 /* Used by XvMC to extract IDCT coefficients with FF_IDCT_PERM_NONE */
+#define FF_IDCT_SIMPLEAUTO    128
 
     /**
      * bits per sample/pixel from the demuxer (needed for huffyuv).
@@ -2384,6 +2782,15 @@ typedef struct AVCodecContext {
      */
     int bits_per_raw_sample;
 
+#if FF_API_LOWRES
+    /**
+     * low resolution decoding, 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+     int lowres;
+#endif
+
 #if FF_API_CODED_FRAME
     /**
      * the picture in the bitstream
@@ -2490,6 +2897,13 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_MPEG2_AAC_LOW 128
 #define FF_PROFILE_MPEG2_AAC_HE  131
 
+#define FF_PROFILE_DNXHD         0
+#define FF_PROFILE_DNXHR_LB      1
+#define FF_PROFILE_DNXHR_SQ      2
+#define FF_PROFILE_DNXHR_HQ      3
+#define FF_PROFILE_DNXHR_HQX     4
+#define FF_PROFILE_DNXHR_444     5
+
 #define FF_PROFILE_DTS         20
 #define FF_PROFILE_DTS_ES      30
 #define FF_PROFILE_DTS_96_24   40
@@ -2565,6 +2979,24 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_AV1_HIGH                         1
 #define FF_PROFILE_AV1_PROFESSIONAL                 2
 
+#define FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT            0xc0
+#define FF_PROFILE_MJPEG_HUFFMAN_EXTENDED_SEQUENTIAL_DCT 0xc1
+#define FF_PROFILE_MJPEG_HUFFMAN_PROGRESSIVE_DCT         0xc2
+#define FF_PROFILE_MJPEG_HUFFMAN_LOSSLESS                0xc3
+#define FF_PROFILE_MJPEG_JPEG_LS                         0xf7
+
+#define FF_PROFILE_SBC_MSBC                         1
+
+#define FF_PROFILE_PRORES_PROXY     0
+#define FF_PROFILE_PRORES_LT        1
+#define FF_PROFILE_PRORES_STANDARD  2
+#define FF_PROFILE_PRORES_HQ        3
+#define FF_PROFILE_PRORES_4444      4
+#define FF_PROFILE_PRORES_XQ        5
+
+#define FF_PROFILE_ARIB_PROFILE_A 0
+#define FF_PROFILE_ARIB_PROFILE_C 1
+
     /**
      * level
      * - encoding: Set by user.
@@ -2574,18 +3006,21 @@ typedef struct AVCodecContext {
 #define FF_LEVEL_UNKNOWN -99
 
     /**
+     * Skip loop filtering for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
     enum AVDiscard skip_loop_filter;
 
     /**
+     * Skip IDCT/dequantization for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
     enum AVDiscard skip_idct;
 
     /**
+     * Skip decoding for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
@@ -2647,7 +3082,7 @@ typedef struct AVCodecContext {
      */
     int initial_padding;
 
-    /*
+    /**
      * - decoding: For codecs that store a framerate value in the compressed
      *             bitstream, the decoder may export it here. { 0, 1} when
      *             unknown.
@@ -2664,6 +3099,123 @@ typedef struct AVCodecContext {
     enum AVPixelFormat sw_pix_fmt;
 
     /**
+     * Timebase in which pkt_dts/pts and AVPacket.dts/pts are.
+     * - encoding unused.
+     * - decoding set by user.
+     */
+    AVRational pkt_timebase;
+
+    /**
+     * AVCodecDescriptor
+     * - encoding: unused.
+     * - decoding: set by libavcodec.
+     */
+    const AVCodecDescriptor *codec_descriptor;
+
+#if !FF_API_LOWRES
+    /**
+     * low resolution decoding, 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+     int lowres;
+#endif
+
+    /**
+     * Current statistics for PTS correction.
+     * - decoding: maintained and used by libavcodec, not intended to be used by user apps
+     * - encoding: unused
+     */
+    int64_t pts_correction_num_faulty_pts; /// Number of incorrect PTS values so far
+    int64_t pts_correction_num_faulty_dts; /// Number of incorrect DTS values so far
+    int64_t pts_correction_last_pts;       /// PTS of the last frame
+    int64_t pts_correction_last_dts;       /// DTS of the last frame
+
+    /**
+     * Character encoding of the input subtitles file.
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    char *sub_charenc;
+
+    /**
+     * Subtitles character encoding mode. Formats or codecs might be adjusting
+     * this setting (if they are doing the conversion themselves for instance).
+     * - decoding: set by libavcodec
+     * - encoding: unused
+     */
+    int sub_charenc_mode;
+#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
+#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
+#define FF_SUB_CHARENC_MODE_PRE_DECODER  1  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
+#define FF_SUB_CHARENC_MODE_IGNORE       2  ///< neither convert the subtitles, nor check them for valid UTF-8
+
+    /**
+     * Skip processing alpha if supported by codec.
+     * Note that if the format uses pre-multiplied alpha (common with VP6,
+     * and recommended due to better video quality/compression)
+     * the image will look as if alpha-blended onto a black background.
+     * However for formats that do not use pre-multiplied alpha
+     * there might be serious artefacts (though e.g. libswscale currently
+     * assumes pre-multiplied alpha anyway).
+     *
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int skip_alpha;
+
+    /**
+     * Number of samples to skip after a discontinuity
+     * - decoding: unused
+     * - encoding: set by libavcodec
+     */
+    int seek_preroll;
+
+#if !FF_API_DEBUG_MV
+    /**
+     * debug motion vectors
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug_mv;
+#define FF_DEBUG_VIS_MV_P_FOR  0x00000001 //visualize forward predicted MVs of P frames
+#define FF_DEBUG_VIS_MV_B_FOR  0x00000002 //visualize forward predicted MVs of B frames
+#define FF_DEBUG_VIS_MV_B_BACK 0x00000004 //visualize backward predicted MVs of B frames
+#endif
+
+    /**
+     * custom intra quantization matrix
+     * - encoding: Set by user, can be NULL.
+     * - decoding: unused.
+     */
+    uint16_t *chroma_intra_matrix;
+
+    /**
+     * dump format separator.
+     * can be ", " or "\n      " or anything else
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    uint8_t *dump_separator;
+
+    /**
+     * ',' separated list of allowed decoders.
+     * If NULL then all are allowed
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    char *codec_whitelist;
+
+    /**
+     * Properties of the stream that gets decoded
+     * - encoding: unused
+     * - decoding: set by libavcodec
+     */
+    unsigned properties;
+#define FF_CODEC_PROPERTY_LOSSLESS        0x00000001
+#define FF_CODEC_PROPERTY_CLOSED_CAPTIONS 0x00000002
+
+    /**
      * Additional data associated with the entire coded stream.
      *
      * - decoding: unused
@@ -2697,31 +3249,34 @@ typedef struct AVCodecContext {
     AVBufferRef *hw_frames_ctx;
 
     /**
-     * Video decoding only. Certain video codecs support cropping, meaning that
-     * only a sub-rectangle of the decoded frame is intended for display.  This
-     * option controls how cropping is handled by libavcodec.
-     *
-     * When set to 1 (the default), libavcodec will apply cropping internally.
-     * I.e. it will modify the output frame width/height fields and offset the
-     * data pointers (only by as much as possible while preserving alignment, or
-     * by the full amount if the AV_CODEC_FLAG_UNALIGNED flag is set) so that
-     * the frames output by the decoder refer only to the cropped area. The
-     * crop_* fields of the output frames will be zero.
-     *
-     * When set to 0, the width/height fields of the output frames will be set
-     * to the coded dimensions and the crop_* fields will describe the cropping
-     * rectangle. Applying the cropping is left to the caller.
+     * Control the form of AVSubtitle.rects[N]->ass
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int sub_text_format;
+#define FF_SUB_TEXT_FMT_ASS              0
+#if FF_API_ASS_TIMING
+#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+#endif
+
+    /**
+     * Audio only. The amount of padding (in samples) appended by the encoder to
+     * the end of the audio. I.e. this number of decoded samples must be
+     * discarded by the caller from the end of the stream to get the original
+     * audio without any trailing padding.
      *
-     * @warning When hardware acceleration with opaque output frames is used,
-     * libavcodec is unable to apply cropping from the top/left border.
+     * - decoding: unused
+     * - encoding: unused
+     */
+    int trailing_padding;
+
+    /**
+     * The number of pixels per image to maximally accept.
      *
-     * @note when this option is set to zero, the width/height fields of the
-     * AVCodecContext and output AVFrames have different meanings. The codec
-     * context fields store display dimensions (with the coded dimensions in
-     * coded_width/height), while the frame fields store the coded dimensions
-     * (with the display dimensions being determined by the crop_* fields).
+     * - decoding: set by user
+     * - encoding: set by user
      */
-    int apply_cropping;
+    int64_t max_pixels;
 
     /**
      * A reference to the AVHWDeviceContext describing the device which will
@@ -2755,6 +3310,33 @@ typedef struct AVCodecContext {
     int hwaccel_flags;
 
     /**
+     * Video decoding only. Certain video codecs support cropping, meaning that
+     * only a sub-rectangle of the decoded frame is intended for display.  This
+     * option controls how cropping is handled by libavcodec.
+     *
+     * When set to 1 (the default), libavcodec will apply cropping internally.
+     * I.e. it will modify the output frame width/height fields and offset the
+     * data pointers (only by as much as possible while preserving alignment, or
+     * by the full amount if the AV_CODEC_FLAG_UNALIGNED flag is set) so that
+     * the frames output by the decoder refer only to the cropped area. The
+     * crop_* fields of the output frames will be zero.
+     *
+     * When set to 0, the width/height fields of the output frames will be set
+     * to the coded dimensions and the crop_* fields will describe the cropping
+     * rectangle. Applying the cropping is left to the caller.
+     *
+     * @warning When hardware acceleration with opaque output frames is used,
+     * libavcodec is unable to apply cropping from the top/left border.
+     *
+     * @note when this option is set to zero, the width/height fields of the
+     * AVCodecContext and output AVFrames have different meanings. The codec
+     * context fields store display dimensions (with the coded dimensions in
+     * coded_width/height), while the frame fields store the coded dimensions
+     * (with the display dimensions being determined by the crop_* fields).
+     */
+    int apply_cropping;
+
+    /*
      * Video decoding only.  Sets the number of extra hardware frames which
      * the decoder will allocate for use by the caller.  This must be set
      * before avcodec_open2() is called.
@@ -2767,8 +3349,52 @@ typedef struct AVCodecContext {
      * used as reference pictures).
      */
     int extra_hw_frames;
+
+    /**
+     * The percentage of damaged samples to discard a frame.
+     *
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int discard_damaged_percentage;
 } AVCodecContext;
 
+#if FF_API_CODEC_GET_SET
+/**
+ * Accessors for some AVCodecContext fields. These used to be provided for ABI
+ * compatibility, and do not need to be used anymore.
+ */
+attribute_deprecated
+AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
+attribute_deprecated
+void       av_codec_set_pkt_timebase         (AVCodecContext *avctx, AVRational val);
+
+attribute_deprecated
+const AVCodecDescriptor *av_codec_get_codec_descriptor(const AVCodecContext *avctx);
+attribute_deprecated
+void                     av_codec_set_codec_descriptor(AVCodecContext *avctx, const AVCodecDescriptor *desc);
+
+attribute_deprecated
+unsigned av_codec_get_codec_properties(const AVCodecContext *avctx);
+
+#if FF_API_LOWRES
+attribute_deprecated
+int  av_codec_get_lowres(const AVCodecContext *avctx);
+attribute_deprecated
+void av_codec_set_lowres(AVCodecContext *avctx, int val);
+#endif
+
+attribute_deprecated
+int  av_codec_get_seek_preroll(const AVCodecContext *avctx);
+attribute_deprecated
+void av_codec_set_seek_preroll(AVCodecContext *avctx, int val);
+
+attribute_deprecated
+uint16_t *av_codec_get_chroma_intra_matrix(const AVCodecContext *avctx);
+attribute_deprecated
+void av_codec_set_chroma_intra_matrix(AVCodecContext *avctx, uint16_t *val);
+#endif
+
 /**
  * AVProfile.
  */
@@ -2864,6 +3490,7 @@ typedef struct AVCodec {
     const int *supported_samplerates;       ///< array of supported audio samplerates, or NULL if unknown, array is terminated by 0
     const enum AVSampleFormat *sample_fmts; ///< array of supported sample formats, or NULL if unknown, array is terminated by -1
     const uint64_t *channel_layouts;         ///< array of support channel layouts, or NULL if unknown. array is terminated by 0
+    uint8_t max_lowres;                     ///< maximum value for lowres supported by the decoder
     const AVClass *priv_class;              ///< AVClass for the private context
     const AVProfile *profiles;              ///< array of recognized profiles, or NULL if unknown, array is terminated by {FF_PROFILE_UNKNOWN}
 
@@ -2873,7 +3500,7 @@ typedef struct AVCodec {
      * wrapper uses some kind of external implementation for the codec, such
      * as an external library, or a codec implementation provided by the OS or
      * the hardware.
-     * If this field is NULL, this is a builtin, libavcodec native decoder.
+     * If this field is NULL, this is a builtin, libavcodec native codec.
      * If non-NULL, this will be the suffix in AVCodec.name in most cases
      * (usually AVCodec.name will be of the form "<codec_name>_<wrapper_name>").
      */
@@ -2915,6 +3542,9 @@ typedef struct AVCodec {
 
     /**
      * Initialize codec static data, called from avcodec_register().
+     *
+     * This is not intended for time consuming operations as it is
+     * run for every codec regardless of that codec being used.
      */
     void (*init_static_data)(struct AVCodec *codec);
 
@@ -2979,6 +3609,13 @@ typedef struct AVCodec {
     const struct AVCodecHWConfigInternal **hw_configs;
 } AVCodec;
 
+#if FF_API_CODEC_GET_SET
+attribute_deprecated
+int av_codec_get_max_lowres(const AVCodec *codec);
+#endif
+
+struct MpegEncContext;
+
 /**
  * Retrieve supported hardware configurations for a codec.
  *
@@ -3027,7 +3664,7 @@ typedef struct AVHWAccel {
 
     /**
      * Hardware accelerated codec capabilities.
-     * see FF_HWACCEL_CODEC_CAP_*
+     * see AV_HWACCEL_CODEC_CAP_*
      */
     int capabilities;
 
@@ -3038,7 +3675,6 @@ typedef struct AVHWAccel {
      * New public fields should be added right above.
      *****************************************************************
      */
-    struct AVHWAccel *next;
 
     /**
      * Allocate a custom buffer
@@ -3062,10 +3698,25 @@ typedef struct AVHWAccel {
     int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
 
     /**
+     * Callback for parameter data (SPS/PPS/VPS etc).
+     *
+     * Useful for hardware decoders which keep persistent state about the
+     * video parameters, and need to receive any changes to update that state.
+     *
+     * @param avctx the codec context
+     * @param type the nal unit type
+     * @param buf the nal unit data buffer
+     * @param buf_size the size of the nal unit in bytes
+     * @return zero if successful, a negative value otherwise
+     */
+    int (*decode_params)(AVCodecContext *avctx, int type, const uint8_t *buf, uint32_t buf_size);
+
+    /**
      * Callback for each slice.
      *
      * Meaningful slice information (codec specific) is guaranteed to
      * be parsed at this point. This function is mandatory.
+     * The only exception is XvMC, that works on MB level.
      *
      * @param avctx the codec context
      * @param buf the slice data buffer base
@@ -3095,6 +3746,17 @@ typedef struct AVHWAccel {
     int frame_priv_data_size;
 
     /**
+     * Called for every Macroblock in a slice.
+     *
+     * XvMC uses it to replace the ff_mpv_reconstruct_mb().
+     * Instead of decoding to raw picture, MB parameters are
+     * stored in an array provided by the video driver.
+     *
+     * @param s the mpeg context
+     */
+    void (*decode_mb)(struct MpegEncContext *s);
+
+    /**
      * Initialize the hwaccel private data.
      *
      * This will be called from ff_get_format(), after hwaccel and
@@ -3134,9 +3796,18 @@ typedef struct AVHWAccel {
 } AVHWAccel;
 
 /**
+ * HWAccel is experimental and is thus avoided in favor of non experimental
+ * codecs
+ */
+#define AV_HWACCEL_CODEC_CAP_EXPERIMENTAL 0x0200
+
+/**
  * Hardware acceleration should be used for decoding even if the codec level
  * used is unknown or higher than the maximum supported level reported by the
  * hardware driver.
+ *
+ * It's generally a good idea to pass this flag unless you have a specific
+ * reason not to, as hardware tends to under-report supported levels.
  */
 #define AV_HWACCEL_FLAG_IGNORE_LEVEL (1 << 0)
 
@@ -3173,13 +3844,15 @@ typedef struct AVHWAccel {
  */
 
 /**
- * four components are given, that's all.
- * the last component is alpha
- * @deprecated Use the imgutils functions
+ * Picture data structure.
+ *
+ * Up to four components can be stored into it, the last component is
+ * alpha.
+ * @deprecated use AVFrame or imgutils functions instead
  */
 typedef struct AVPicture {
     attribute_deprecated
-    uint8_t *data[AV_NUM_DATA_POINTERS];
+    uint8_t *data[AV_NUM_DATA_POINTERS];    ///< pointers to the image data planes
     attribute_deprecated
     int linesize[AV_NUM_DATA_POINTERS];     ///< number of bytes per line
 } AVPicture;
@@ -3189,9 +3862,6 @@ typedef struct AVPicture {
  */
 #endif
 
-#define AVPALETTE_SIZE 1024
-#define AVPALETTE_COUNT 256
-
 enum AVSubtitleType {
     SUBTITLE_NONE,
 
@@ -3243,6 +3913,7 @@ typedef struct AVSubtitleRect {
      * struct.
      */
     char *ass;
+
     int flags;
 } AVSubtitleRect;
 
@@ -3299,11 +3970,35 @@ typedef struct AVCodecParameters {
     /**
      * The average bitrate of the encoded data (in bits per second).
      */
-    int bit_rate;
+    int64_t bit_rate;
 
+    /**
+     * The number of bits per sample in the codedwords.
+     *
+     * This is basically the bitrate per sample. It is mandatory for a bunch of
+     * formats to actually decode them. It's the number of bits for one sample in
+     * the actual coded bitstream.
+     *
+     * This could be for example 4 for ADPCM
+     * For PCM formats this matches bits_per_raw_sample
+     * Can be 0
+     */
     int bits_per_coded_sample;
 
     /**
+     * This is the number of valid bits in each output sample. If the
+     * sample format has more bits, the least significant bits are additional
+     * padding bits, which are always 0. Use right shifts to reduce the sample
+     * to its actual size. For example, audio formats with 24 bit samples will
+     * have bits_per_raw_sample set to 24, and format set to AV_SAMPLE_FMT_S32.
+     * To get the original sample use "(int32_t)sample >> 8"."
+     *
+     * For ADPCM this might be 12 or 16 or similar
+     * Can be 0
+     */
+    int bits_per_raw_sample;
+
+    /**
      * Codec-specific bitstream restrictions that the stream conforms to.
      */
     int profile;
@@ -3339,6 +4034,11 @@ typedef struct AVCodecParameters {
     enum AVChromaLocation              chroma_location;
 
     /**
+     * Video only. Number of delayed frames.
+     */
+    int video_delay;
+
+    /**
      * Audio only. The channel layout bitmask. May be 0 if the channel layout is
      * unknown or unspecified, otherwise the number of bits set must be equal to
      * the channels field.
@@ -3359,6 +4059,10 @@ typedef struct AVCodecParameters {
      * Corresponds to nBlockAlign in WAVEFORMATEX.
      */
     int      block_align;
+    /**
+     * Audio only. Audio frame size, if known. Required by some formats to be static.
+     */
+    int      frame_size;
 
     /**
      * Audio only. The amount of padding (in samples) inserted by the encoder at
@@ -3374,14 +4078,32 @@ typedef struct AVCodecParameters {
      * audio without any trailing padding.
      */
     int trailing_padding;
+    /**
+     * Audio only. Number of samples to skip after a discontinuity.
+     */
+    int seek_preroll;
 } AVCodecParameters;
 
 /**
+ * Iterate over all registered codecs.
+ *
+ * @param opaque a pointer where libavcodec will store the iteration state. Must
+ *               point to NULL to start the iteration.
+ *
+ * @return the next registered codec or NULL when the iteration is
+ *         finished
+ */
+const AVCodec *av_codec_iterate(void **opaque);
+
+#if FF_API_NEXT
+/**
  * If c is NULL, returns the first registered codec,
  * if c is non-NULL, returns the next registered codec after c,
  * or NULL if c is the last one.
  */
+attribute_deprecated
 AVCodec *av_codec_next(const AVCodec *c);
+#endif
 
 /**
  * Return the LIBAVCODEC_VERSION_INT constant.
@@ -3398,6 +4120,7 @@ const char *avcodec_configuration(void);
  */
 const char *avcodec_license(void);
 
+#if FF_API_NEXT
 /**
  * Register the codec codec and initialize libavcodec.
  *
@@ -3406,6 +4129,7 @@ const char *avcodec_license(void);
  *
  * @see avcodec_register_all()
  */
+attribute_deprecated
 void avcodec_register(AVCodec *codec);
 
 /**
@@ -3418,7 +4142,9 @@ void avcodec_register(AVCodec *codec);
  * @see av_register_codec_parser
  * @see av_register_bitstream_filter
  */
+attribute_deprecated
 void avcodec_register_all(void);
+#endif
 
 /**
  * Allocate an AVCodecContext and set its fields to default values. The
@@ -3460,13 +4186,29 @@ const AVClass *avcodec_get_class(void);
 
 #if FF_API_COPY_CONTEXT
 /**
+ * Get the AVClass for AVFrame. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_frame_class(void);
+
+/**
+ * Get the AVClass for AVSubtitleRect. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_subtitle_rect_class(void);
+
+/**
  * Copy the settings of the source AVCodecContext into the destination
  * AVCodecContext. The resulting destination codec context will be
  * unopened, i.e. you are required to call avcodec_open2() before you
  * can use this AVCodecContext to decode/encode video/audio data.
  *
  * @param dest target codec context, should be initialized with
- *             avcodec_alloc_context3(), but otherwise uninitialized
+ *             avcodec_alloc_context3(NULL), but otherwise uninitialized
  * @param src source codec context
  * @return AVERROR() on error (e.g. memory allocation error), 0 on success
  *
@@ -3683,11 +4425,31 @@ int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size);
  * @warning This is a hack - the packet memory allocation stuff is broken. The
  * packet is allocated if it was not really allocated.
  *
- * @deprecated Use av_packet_ref
+ * @deprecated Use av_packet_ref or av_packet_make_refcounted
  */
 attribute_deprecated
 int av_dup_packet(AVPacket *pkt);
 /**
+ * Copy packet, including contents
+ *
+ * @return 0 on success, negative AVERROR on fail
+ *
+ * @deprecated Use av_packet_ref
+ */
+attribute_deprecated
+int av_copy_packet(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Copy packet side data
+ *
+ * @return 0 on success, negative AVERROR on fail
+ *
+ * @deprecated Use av_packet_copy_props
+ */
+attribute_deprecated
+int av_copy_packet_side_data(AVPacket *dst, const AVPacket *src);
+
+/**
  * Free a packet.
  *
  * @deprecated Use av_packet_unref
@@ -3743,9 +4505,38 @@ int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
  * @param size pointer for side information size to store (optional)
  * @return pointer to data if present or NULL otherwise
  */
-uint8_t* av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
+uint8_t* av_packet_get_side_data(const AVPacket *pkt, enum AVPacketSideDataType type,
                                  int *size);
 
+#if FF_API_MERGE_SD_API
+attribute_deprecated
+int av_packet_merge_side_data(AVPacket *pkt);
+
+attribute_deprecated
+int av_packet_split_side_data(AVPacket *pkt);
+#endif
+
+const char *av_packet_side_data_name(enum AVPacketSideDataType type);
+
+/**
+ * Pack a dictionary for use in side_data.
+ *
+ * @param dict The dictionary to pack.
+ * @param size pointer to store the size of the returned data
+ * @return pointer to data if successful, NULL otherwise
+ */
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size);
+/**
+ * Unpack a dictionary from side_data.
+ *
+ * @param data data from side_data
+ * @param size size of the data
+ * @param dict the metadata storage dictionary
+ * @return 0 on success, < 0 on failure
+ */
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict);
+
+
 /**
  * Convenience function to free all the side data stored.
  * All the other fields stay untouched.
@@ -3806,6 +4597,33 @@ void av_packet_move_ref(AVPacket *dst, AVPacket *src);
 int av_packet_copy_props(AVPacket *dst, const AVPacket *src);
 
 /**
+ * Ensure the data described by a given packet is reference counted.
+ *
+ * @note This function does not ensure that the reference will be writable.
+ *       Use av_packet_make_writable instead for that purpose.
+ *
+ * @see av_packet_ref
+ * @see av_packet_make_writable
+ *
+ * @param pkt packet whose data should be made reference counted.
+ *
+ * @return 0 on success, a negative AVERROR on error. On failure, the
+ *         packet is unchanged.
+ */
+int av_packet_make_refcounted(AVPacket *pkt);
+
+/**
+ * Create a writable reference for the data described by a given packet,
+ * avoiding data copy if possible.
+ *
+ * @param pkt Packet whose data should be made writable.
+ *
+ * @return 0 on success, a negative AVERROR on failure. On failure, the
+ *         packet is unchanged.
+ */
+int av_packet_make_writable(AVPacket *pkt);
+
+/**
  * Convert valid timing fields (timestamps / durations) in a packet from one
  * timebase to another. Timestamps with unknown values (AV_NOPTS_VALUE) will be
  * ignored.
@@ -3870,6 +4688,28 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
                                int linesize_align[AV_NUM_DATA_POINTERS]);
 
 /**
+ * Converts AVChromaLocation to swscale x/y chroma position.
+ *
+ * The positions represent the chroma (0,0) position in a coordinates system
+ * with luma (0,0) representing the origin and luma(1,1) representing 256,256
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos);
+
+/**
+ * Converts swscale x/y chroma position to AVChromaLocation.
+ *
+ * The positions represent the chroma (0,0) position in a coordinates system
+ * with luma (0,0) representing the origin and luma(1,1) representing 256,256
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos);
+
+/**
  * Decode the audio frame of size avpkt->size from avpkt->data into frame.
  *
  * Some decoders may support multiple frames in a single AVPacket. Such
@@ -3924,7 +4764,7 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
  */
 attribute_deprecated
 int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
-                          int *got_frame_ptr, AVPacket *avpkt);
+                          int *got_frame_ptr, const AVPacket *avpkt);
 
 /**
  * Decode the video frame of size avpkt->size from avpkt->data into picture.
@@ -3974,7 +4814,7 @@ int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
 attribute_deprecated
 int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                          int *got_picture_ptr,
-                         AVPacket *avpkt);
+                         const AVPacket *avpkt);
 
 /**
  * Decode a subtitle message.
@@ -3986,12 +4826,20 @@ int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
  * and reusing a get_buffer written for video codecs would probably perform badly
  * due to a potentially very different allocation pattern.
  *
+ * Some decoders (those marked with AV_CODEC_CAP_DELAY) have a delay between input
+ * and output. This means that for some packets they will not immediately
+ * produce decoded output and need to be flushed at the end of decoding to get
+ * all the decoded data. Flushing is done by calling this function with packets
+ * with avpkt->data set to NULL and avpkt->size set to 0 until it stops
+ * returning subtitles. It is safe to flush even those decoders that are not
+ * marked with AV_CODEC_CAP_DELAY, then no subtitles will be returned.
+ *
  * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
  * before packets may be fed to the decoder.
  *
  * @param avctx the codec context
- * @param[out] sub The AVSubtitle in which the decoded subtitle will be stored, must be
-                   freed with avsubtitle_free if *got_sub_ptr is set.
+ * @param[out] sub The Preallocated AVSubtitle in which the decoded subtitle will be stored,
+ *                 must be freed with avsubtitle_free if *got_sub_ptr is set.
  * @param[in,out] got_sub_ptr Zero if no subtitle could be decompressed, otherwise, it is nonzero.
  * @param[in] avpkt The input AVPacket containing the input buffer.
  */
@@ -4279,6 +5127,7 @@ typedef struct AVCodecParserContext {
 #define PARSER_FLAG_ONCE                      0x0002
 /// Set if the parser has a valid file offset
 #define PARSER_FLAG_FETCHED_OFFSET            0x0004
+#define PARSER_FLAG_USE_CODEC_TS              0x1000
 
     int64_t offset;      ///< byte offset from starting packet start
     int64_t cur_frame_end[AV_PARSER_PTS_NB];
@@ -4422,8 +5271,21 @@ typedef struct AVCodecParser {
     struct AVCodecParser *next;
 } AVCodecParser;
 
+/**
+ * Iterate over all registered codec parsers.
+ *
+ * @param opaque a pointer where libavcodec will store the iteration state. Must
+ *               point to NULL to start the iteration.
+ *
+ * @return the next registered codec parser or NULL when the iteration is
+ *         finished
+ */
+const AVCodecParser *av_parser_iterate(void **opaque);
+
+attribute_deprecated
 AVCodecParser *av_parser_next(const AVCodecParser *c);
 
+attribute_deprecated
 void av_register_codec_parser(AVCodecParser *parser);
 AVCodecParserContext *av_parser_init(int codec_id);
 
@@ -4435,7 +5297,10 @@ AVCodecParserContext *av_parser_init(int codec_id);
  * @param poutbuf       set to pointer to parsed buffer or NULL if not yet finished.
  * @param poutbuf_size  set to size of parsed buffer or zero if not yet finished.
  * @param buf           input buffer.
- * @param buf_size      input length, to signal EOF, this should be 0 (so that the last frame can be output).
+ * @param buf_size      buffer size in bytes without the padding. I.e. the full buffer
+                        size is assumed to be buf_size + AV_INPUT_BUFFER_PADDING_SIZE.
+                        To signal EOF, this should be 0 (so that the last frame
+                        can be output).
  * @param pts           input presentation timestamp.
  * @param dts           input decoding timestamp.
  * @param pos           input byte position in stream.
@@ -4464,7 +5329,7 @@ int av_parser_parse2(AVCodecParserContext *s,
 
 /**
  * @return 0 if the output buffer is a subset of the input, 1 if it is allocated and must be freed
- * @deprecated use AVBitstreamFilter
+ * @deprecated use AVBitStreamFilter
  */
 int av_parser_change(AVCodecParserContext *s,
                      AVCodecContext *avctx,
@@ -4511,11 +5376,12 @@ AVCodec *avcodec_find_encoder_by_name(const char *name);
  *                  The user can supply an output buffer by setting
  *                  avpkt->data and avpkt->size prior to calling the
  *                  function, but if the size of the user-provided data is not
- *                  large enough, encoding will fail. All other AVPacket fields
- *                  will be reset by the encoder using av_init_packet(). If
- *                  avpkt->data is NULL, the encoder will allocate it.
- *                  The encoder will set avpkt->size to the size of the
- *                  output packet.
+ *                  large enough, encoding will fail. If avpkt->data and
+ *                  avpkt->size are set, avpkt->destruct must also be set. All
+ *                  other AVPacket fields will be reset by the encoder using
+ *                  av_init_packet(). If avpkt->data is NULL, the encoder will
+ *                  allocate it. The encoder will set avpkt->size to the size
+ *                  of the output packet.
  *
  *                  If this function fails or produces no output, avpkt will be
  *                  freed using av_packet_unref().
@@ -4610,14 +5476,14 @@ void avpicture_free(AVPicture *picture);
  * @deprecated use av_image_fill_arrays() instead.
  */
 attribute_deprecated
-int avpicture_fill(AVPicture *picture, uint8_t *ptr,
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
                    enum AVPixelFormat pix_fmt, int width, int height);
 
 /**
  * @deprecated use av_image_copy_to_buffer() instead.
  */
 attribute_deprecated
-int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt,
+int avpicture_layout(const AVPicture *src, enum AVPixelFormat pix_fmt,
                      int width, int height,
                      unsigned char *dest, int dest_size);
 
@@ -4669,6 +5535,15 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
  * @{
  */
 
+#if FF_API_GETCHROMA
+/**
+ * @deprecated Use av_pix_fmt_get_chroma_sub_sample
+ */
+
+attribute_deprecated
+void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift);
+#endif
+
 /**
  * Return a value representing the fourCC code associated to the
  * pixel format pix_fmt, or 0 if no associated fourCC code can be
@@ -4676,29 +5551,8 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
  */
 unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat pix_fmt);
 
-#define FF_LOSS_RESOLUTION  0x0001 /**< loss due to resolution change */
-#define FF_LOSS_DEPTH       0x0002 /**< loss due to color depth change */
-#define FF_LOSS_COLORSPACE  0x0004 /**< loss due to color space conversion */
-#define FF_LOSS_ALPHA       0x0008 /**< loss of alpha bits */
-#define FF_LOSS_COLORQUANT  0x0010 /**< loss due to color quantization */
-#define FF_LOSS_CHROMA      0x0020 /**< loss of chroma (e.g. RGB to gray conversion) */
-
-/**
- * Compute what kind of losses will occur when converting from one specific
- * pixel format to another.
- * When converting from one pixel format to another, information loss may occur.
- * For example, when converting from RGB24 to GRAY, the color information will
- * be lost. Similarly, other losses occur when converting from some formats to
- * other formats. These losses can involve loss of chroma, but also loss of
- * resolution, loss of color depth, loss due to the color space conversion, loss
- * of the alpha bits or loss due to color quantization.
- * avcodec_get_fix_fmt_loss() informs you about the various types of losses
- * which will occur when converting from one pixel format to another.
- *
- * @param[in] dst_pix_fmt destination pixel format
- * @param[in] src_pix_fmt source pixel format
- * @param[in] has_alpha Whether the source pixel format alpha channel is used.
- * @return Combination of flags informing you what kind of losses will occur.
+/**
+ * @deprecated see av_get_pix_fmt_loss()
  */
 int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat src_pix_fmt,
                              int has_alpha);
@@ -4708,7 +5562,7 @@ int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat
  * format.  When converting from one pixel format to another, information loss
  * may occur.  For example, when converting from RGB24 to GRAY, the color
  * information will be lost. Similarly, other losses occur when converting from
- * some formats to other formats. avcodec_find_best_pix_fmt2() searches which of
+ * some formats to other formats. avcodec_find_best_pix_fmt_of_2() searches which of
  * the given pixel formats should be used to suffer the least amount of loss.
  * The pixel formats from which it chooses one, are determined by the
  * pix_fmt_list parameter.
@@ -4720,9 +5574,19 @@ int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat
  * @param[out] loss_ptr Combination of flags informing you what kind of losses will occur.
  * @return The best pixel format to convert to or -1 if none was found.
  */
-enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat *pix_fmt_list,
-                                              enum AVPixelFormat src_pix_fmt,
-                                              int has_alpha, int *loss_ptr);
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                                            enum AVPixelFormat src_pix_fmt,
+                                            int has_alpha, int *loss_ptr);
+
+/**
+ * @deprecated see av_find_best_pix_fmt_of_2()
+ */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+
+attribute_deprecated
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
 
 enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
 
@@ -4730,6 +5594,7 @@ enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const en
  * @}
  */
 
+#if FF_API_TAG_STRING
 /**
  * Put a string representing the codec tag codec_tag in buf.
  *
@@ -4738,8 +5603,12 @@ enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const en
  * @param codec_tag codec tag to assign
  * @return the length of the string that would have been generated if
  * enough space had been available, excluding the trailing null
+ *
+ * @deprecated see av_fourcc_make_string() and av_fourcc2str().
  */
+attribute_deprecated
 size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_tag);
+#endif
 
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode);
 
@@ -4770,7 +5639,12 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
 //FIXME func typedef
 
 /**
- * Fill audio frame data and linesize.
+ * Fill AVFrame audio data and linesize pointers.
+ *
+ * The buffer buf must be a preallocated buffer with a size big enough
+ * to contain the specified samples amount. The filled AVFrame data
+ * pointers will point to this buffer.
+ *
  * AVFrame extended_data channel pointers are allocated if necessary for
  * planar audio.
  *
@@ -4783,7 +5657,9 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
  * @param buf         buffer to use for frame data
  * @param buf_size    size of buffer
  * @param align       plane size sample alignment (0 = default)
- * @return            0 on success, negative error code on failure
+ * @return            >=0 on success, negative error code on failure
+ * @todo return the size in bytes required to store the samples in
+ * case of success, at the next libavutil bump
  */
 int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
                              enum AVSampleFormat sample_fmt, const uint8_t *buf,
@@ -4809,6 +5685,14 @@ void avcodec_flush_buffers(AVCodecContext *avctx);
 int av_get_bits_per_sample(enum AVCodecID codec_id);
 
 /**
+ * Return the PCM codec associated with a sample format.
+ * @param be  endianness, 0 for little, 1 for big,
+ *            -1 (or anything else) for native
+ * @return  AV_CODEC_ID_PCM_* or AV_CODEC_ID_NONE
+ */
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be);
+
+/**
  * Return codec bits per sample.
  * Only return non-zero if the bits per sample is exactly correct, not an
  * approximation.
@@ -4837,9 +5721,14 @@ int av_get_audio_frame_duration2(AVCodecParameters *par, int frame_bytes);
 #if FF_API_OLD_BSF
 typedef struct AVBitStreamFilterContext {
     void *priv_data;
-    struct AVBitStreamFilter *filter;
+    const struct AVBitStreamFilter *filter;
     AVCodecParserContext *parser;
     struct AVBitStreamFilterContext *next;
+    /**
+     * Internal default arguments, used if NULL is passed to av_bitstream_filter_filter().
+     * Not for access by library users.
+     */
+    char *args;
 } AVBitStreamFilterContext;
 #endif
 
@@ -4947,17 +5836,37 @@ typedef struct AVBitStreamFilter {
  */
 attribute_deprecated
 void av_register_bitstream_filter(AVBitStreamFilter *bsf);
+/**
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_get_by_name(), av_bsf_alloc(), and av_bsf_init()
+ * from the new bitstream filtering API (using AVBSFContext).
+ */
 attribute_deprecated
 AVBitStreamFilterContext *av_bitstream_filter_init(const char *name);
+/**
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_send_packet() and av_bsf_receive_packet() from the
+ * new bitstream filtering API (using AVBSFContext).
+ */
 attribute_deprecated
 int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
                                AVCodecContext *avctx, const char *args,
                                uint8_t **poutbuf, int *poutbuf_size,
                                const uint8_t *buf, int buf_size, int keyframe);
+/**
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_free() from the new bitstream filtering API (using
+ * AVBSFContext).
+ */
 attribute_deprecated
 void av_bitstream_filter_close(AVBitStreamFilterContext *bsf);
+/**
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_iterate() from the new bitstream filtering API (using
+ * AVBSFContext).
+ */
 attribute_deprecated
-AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
+const AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
 #endif
 
 /**
@@ -4975,7 +5884,11 @@ const AVBitStreamFilter *av_bsf_get_by_name(const char *name);
  * @return the next registered bitstream filter or NULL when the iteration is
  *         finished
  */
+const AVBitStreamFilter *av_bsf_iterate(void **opaque);
+#if FF_API_NEXT
+attribute_deprecated
 const AVBitStreamFilter *av_bsf_next(void **opaque);
+#endif
 
 /**
  * Allocate a context for a given bitstream filter. The caller must fill in the
@@ -5059,17 +5972,109 @@ void av_bsf_free(AVBSFContext **ctx);
  */
 const AVClass *av_bsf_get_class(void);
 
-/* memory */
+/**
+ * Structure for chain/list of bitstream filters.
+ * Empty list can be allocated by av_bsf_list_alloc().
+ */
+typedef struct AVBSFList AVBSFList;
+
+/**
+ * Allocate empty list of bitstream filters.
+ * The list must be later freed by av_bsf_list_free()
+ * or finalized by av_bsf_list_finalize().
+ *
+ * @return Pointer to @ref AVBSFList on success, NULL in case of failure
+ */
+AVBSFList *av_bsf_list_alloc(void);
+
+/**
+ * Free list of bitstream filters.
+ *
+ * @param lst Pointer to pointer returned by av_bsf_list_alloc()
+ */
+void av_bsf_list_free(AVBSFList **lst);
+
+/**
+ * Append bitstream filter to the list of bitstream filters.
+ *
+ * @param lst List to append to
+ * @param bsf Filter context to be appended
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_append(AVBSFList *lst, AVBSFContext *bsf);
 
 /**
- * Allocate a buffer with padding, reusing the given one if large enough.
+ * Construct new bitstream filter context given it's name and options
+ * and append it to the list of bitstream filters.
  *
+ * @param lst      List to append to
+ * @param bsf_name Name of the bitstream filter
+ * @param options  Options for the bitstream filter, can be set to NULL
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_append2(AVBSFList *lst, const char * bsf_name, AVDictionary **options);
+/**
+ * Finalize list of bitstream filters.
+ *
+ * This function will transform @ref AVBSFList to single @ref AVBSFContext,
+ * so the whole chain of bitstream filters can be treated as single filter
+ * freshly allocated by av_bsf_alloc().
+ * If the call is successful, @ref AVBSFList structure is freed and lst
+ * will be set to NULL. In case of failure, caller is responsible for
+ * freeing the structure by av_bsf_list_free()
+ *
+ * @param      lst Filter list structure to be transformed
+ * @param[out] bsf Pointer to be set to newly created @ref AVBSFContext structure
+ *                 representing the chain of bitstream filters
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_finalize(AVBSFList **lst, AVBSFContext **bsf);
+
+/**
+ * Parse string describing list of bitstream filters and create single
+ * @ref AVBSFContext describing the whole chain of bitstream filters.
+ * Resulting @ref AVBSFContext can be treated as any other @ref AVBSFContext freshly
+ * allocated by av_bsf_alloc().
+ *
+ * @param      str String describing chain of bitstream filters in format
+ *                 `bsf1[=opt1=val1:opt2=val2][,bsf2]`
+ * @param[out] bsf Pointer to be set to newly created @ref AVBSFContext structure
+ *                 representing the chain of bitstream filters
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_parse_str(const char *str, AVBSFContext **bsf);
+
+/**
+ * Get null/pass-through bitstream filter.
+ *
+ * @param[out] bsf Pointer to be set to new instance of pass-through bitstream filter
+ *
+ * @return
+ */
+int av_bsf_get_null_filter(AVBSFContext **bsf);
+
+/* memory */
+
+/**
  * Same behaviour av_fast_malloc but the buffer has additional
- * AV_INPUT_PADDING_SIZE at the end which will always memset to 0.
+ * AV_INPUT_BUFFER_PADDING_SIZE at the end which will always be 0.
+ *
+ * In addition the whole buffer will initially and after resizes
+ * be 0-initialized so that no uninitialized data will ever appear.
  */
 void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size);
 
 /**
+ * Same behaviour av_fast_padded_malloc except that buffer will always
+ * be 0-initialized after call.
+ */
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size);
+
+/**
  * Encode extradata length to a buffer. Used by xiph codecs.
  *
  * @param s buffer to write to; must be at least (v/255+1) bytes long
@@ -5099,9 +6104,11 @@ attribute_deprecated
 AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel);
 #endif
 
-
+#if FF_API_LOCKMGR
 /**
  * Lock operation used by lockmgr
+ *
+ * @deprecated Deprecated together with av_lockmgr_register().
  */
 enum AVLockOp {
   AV_LOCK_CREATE,  ///< Create a mutex
@@ -5132,8 +6139,13 @@ enum AVLockOp {
  *           mechanism (i.e. do not use a single static object to
  *           implement your lock manager). If cb is set to NULL the
  *           lockmgr will be unregistered.
+ *
+ * @deprecated This function does nothing, and always returns 0. Be sure to
+ *             build with thread support to get basic thread safety.
  */
+attribute_deprecated
 int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op));
+#endif
 
 /**
  * Get the type of the given codec.
@@ -5141,6 +6153,12 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op));
 enum AVMediaType avcodec_get_type(enum AVCodecID codec_id);
 
 /**
+ * Get the name of a codec.
+ * @return  a static string identifying the codec; never NULL
+ */
+const char *avcodec_get_name(enum AVCodecID id);
+
+/**
  * @return a positive value if s is open (i.e. avcodec_open2() was called on it
  * with no corresponding avcodec_close()), 0 otherwise.
  */
diff --git a/libavcodec/avcodecres.rc b/libavcodec/avcodecres.rc
new file mode 100644
index 0000000..4b69686
--- /dev/null
+++ b/libavcodec/avcodecres.rc
@@ -0,0 +1,55 @@
+/*
+ * Windows resource file for libavcodec
+ *
+ * Copyright (C) 2012 James Almer
+ * Copyright (C) 2013 Tiancheng "Timothy" Gu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <windows.h>
+#include "libavcodec/version.h"
+#include "libavutil/ffversion.h"
+#include "config.h"
+
+1 VERSIONINFO
+FILEVERSION     LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, 0
+PRODUCTVERSION  LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, 0
+FILEFLAGSMASK   VS_FFI_FILEFLAGSMASK
+FILEOS          VOS_NT_WINDOWS32
+FILETYPE        VFT_DLL
+{
+    BLOCK "StringFileInfo"
+    {
+        BLOCK "040904B0"
+        {
+            VALUE "CompanyName",      "FFmpeg Project"
+            VALUE "FileDescription",  "FFmpeg codec library"
+            VALUE "FileVersion",      AV_STRINGIFY(LIBAVCODEC_VERSION)
+            VALUE "InternalName",     "libavcodec"
+            VALUE "LegalCopyright",   "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project"
+            VALUE "OriginalFilename", "avcodec" BUILDSUF "-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) SLIBSUF
+            VALUE "ProductName",      "FFmpeg"
+            VALUE "ProductVersion",   FFMPEG_VERSION
+        }
+    }
+
+    BLOCK "VarFileInfo"
+    {
+        VALUE "Translation", 0x0409, 0x04B0
+    }
+}
diff --git a/libavcodec/avdct.c b/libavcodec/avdct.c
new file mode 100644
index 0000000..47e5f71
--- /dev/null
+++ b/libavcodec/avdct.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "idctdsp.h"
+#include "fdctdsp.h"
+#include "pixblockdsp.h"
+#include "avdct.h"
+
+#define OFFSET(x) offsetof(AVDCT,x)
+#define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C
+//these names are too long to be readable
+#define V AV_OPT_FLAG_VIDEO_PARAM
+#define A AV_OPT_FLAG_AUDIO_PARAM
+#define E AV_OPT_FLAG_ENCODING_PARAM
+#define D AV_OPT_FLAG_DECODING_PARAM
+
+static const AVOption avdct_options[] = {
+{"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"},
+{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
+{"fastint", "fast integer (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"},
+{"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"},
+{"mmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"},
+{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, "dct"},
+{"faan", "floating point AAN DCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, "dct"},
+
+{"idct", "select IDCT implementation", OFFSET(idct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E|D, "idct"},
+{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_AUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"int", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_INT }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simple", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplemmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"arm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearmv5te", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV5TE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearmv6", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simpleneon", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"xvid", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"xvidmmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"faani", "floating point AAN IDCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
+{"simpleauto", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
+
+{"bits_per_sample", "", OFFSET(bits_per_sample), AV_OPT_TYPE_INT, {.i64 = 8 }, 0, 14, 0,},
+{NULL},
+};
+
+static const AVClass avdct_class = {
+    .class_name              = "AVDCT",
+    .option                  = avdct_options,
+    .version                 = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_dct_get_class(void)
+{
+    return &avdct_class;
+}
+
+AVDCT *avcodec_dct_alloc(void)
+{
+    AVDCT *dsp = av_mallocz(sizeof(AVDCT));
+
+    if (!dsp)
+        return NULL;
+
+    dsp->av_class = &avdct_class;
+    av_opt_set_defaults(dsp);
+
+    return dsp;
+}
+
+int avcodec_dct_init(AVDCT *dsp)
+{
+    AVCodecContext *avctx = avcodec_alloc_context3(NULL);
+
+    if (!avctx)
+        return AVERROR(ENOMEM);
+
+    avctx->idct_algo = dsp->idct_algo;
+    avctx->dct_algo  = dsp->dct_algo;
+    avctx->bits_per_raw_sample = dsp->bits_per_sample;
+
+#define COPY(src, name) memcpy(&dsp->name, &src.name, sizeof(dsp->name))
+
+#if CONFIG_IDCTDSP
+    {
+        IDCTDSPContext idsp;
+        ff_idctdsp_init(&idsp, avctx);
+        COPY(idsp, idct);
+        COPY(idsp, idct_permutation);
+    }
+#endif
+
+#if CONFIG_FDCTDSP
+    {
+        FDCTDSPContext fdsp;
+        ff_fdctdsp_init(&fdsp, avctx);
+        COPY(fdsp, fdct);
+    }
+#endif
+
+#if CONFIG_PIXBLOCKDSP
+    {
+        PixblockDSPContext pdsp;
+        ff_pixblockdsp_init(&pdsp, avctx);
+        COPY(pdsp, get_pixels);
+    }
+#endif
+
+    avcodec_free_context(&avctx);
+
+    return 0;
+}
diff --git a/libavcodec/avdct.h b/libavcodec/avdct.h
new file mode 100644
index 0000000..272422e
--- /dev/null
+++ b/libavcodec/avdct.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVDCT_H
+#define AVCODEC_AVDCT_H
+
+#include "libavutil/opt.h"
+
+/**
+ * AVDCT context.
+ * @note function pointers can be NULL if the specific features have been
+ *       disabled at build time.
+ */
+typedef struct AVDCT {
+    const AVClass *av_class;
+
+    void (*idct)(int16_t *block /* align 16 */);
+
+    /**
+     * IDCT input permutation.
+     * Several optimized IDCTs need a permutated input (relative to the
+     * normal order of the reference IDCT).
+     * This permutation must be performed before the idct_put/add.
+     * Note, normally this can be merged with the zigzag/alternate scan<br>
+     * An example to avoid confusion:
+     * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
+     * - (x -> reference DCT -> reference IDCT -> x)
+     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
+     *    -> simple_idct_mmx -> x)
+     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
+     *    -> simple_idct_mmx -> ...)
+     */
+    uint8_t idct_permutation[64];
+
+    void (*fdct)(int16_t *block /* align 16 */);
+
+
+    /**
+     * DCT algorithm.
+     * must use AVOptions to set this field.
+     */
+    int dct_algo;
+
+    /**
+     * IDCT algorithm.
+     * must use AVOptions to set this field.
+     */
+    int idct_algo;
+
+    void (*get_pixels)(int16_t *block /* align 16 */,
+                       const uint8_t *pixels /* align 8 */,
+                       ptrdiff_t line_size);
+
+    int bits_per_sample;
+} AVDCT;
+
+/**
+ * Allocates a AVDCT context.
+ * This needs to be initialized with avcodec_dct_init() after optionally
+ * configuring it with AVOptions.
+ *
+ * To free it use av_free()
+ */
+AVDCT *avcodec_dct_alloc(void);
+int avcodec_dct_init(AVDCT *);
+
+const AVClass *avcodec_dct_get_class(void);
+
+#endif /* AVCODEC_AVDCT_H */
diff --git a/libavcodec/avfft.c b/libavcodec/avfft.c
index 513f57e..2200f37 100644
--- a/libavcodec/avfft.c
+++ b/libavcodec/avfft.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 FFTContext *av_fft_init(int nbits, int inverse)
 {
-    FFTContext *s = av_malloc(sizeof(*s));
+    FFTContext *s = av_mallocz(sizeof(*s));
 
     if (s && ff_fft_init(s, nbits, inverse))
         av_freep(&s);
diff --git a/libavcodec/avfft.h b/libavcodec/avfft.h
index e2e727d..0c0f9b8 100644
--- a/libavcodec/avfft.h
+++ b/libavcodec/avfft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/avpacket.c b/libavcodec/avpacket.c
index c705df3..8f0603d 100644
--- a/libavcodec/avpacket.c
+++ b/libavcodec/avpacket.c
@@ -2,20 +2,20 @@
  * AVPacket functions for libavcodec
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
 
 void av_init_packet(AVPacket *pkt)
 {
@@ -108,24 +110,39 @@ int av_grow_packet(AVPacket *pkt, int grow_by)
 {
     int new_size;
     av_assert0((unsigned)pkt->size <= INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!pkt->size)
-        return av_new_packet(pkt, grow_by);
     if ((unsigned)grow_by >
         INT_MAX - (pkt->size + AV_INPUT_BUFFER_PADDING_SIZE))
-        return -1;
+        return AVERROR(ENOMEM);
 
     new_size = pkt->size + grow_by + AV_INPUT_BUFFER_PADDING_SIZE;
     if (pkt->buf) {
-        int ret = av_buffer_realloc(&pkt->buf, new_size);
-        if (ret < 0)
-            return ret;
+        size_t data_offset;
+        uint8_t *old_data = pkt->data;
+        if (pkt->data == NULL) {
+            data_offset = 0;
+            pkt->data = pkt->buf->data;
+        } else {
+            data_offset = pkt->data - pkt->buf->data;
+            if (data_offset > INT_MAX - new_size)
+                return AVERROR(ENOMEM);
+        }
+
+        if (new_size + data_offset > pkt->buf->size) {
+            int ret = av_buffer_realloc(&pkt->buf, new_size + data_offset);
+            if (ret < 0) {
+                pkt->data = old_data;
+                return ret;
+            }
+            pkt->data = pkt->buf->data + data_offset;
+        }
     } else {
         pkt->buf = av_buffer_alloc(new_size);
         if (!pkt->buf)
             return AVERROR(ENOMEM);
-        memcpy(pkt->buf->data, pkt->data, FFMIN(pkt->size, pkt->size + grow_by));
+        if (pkt->size > 0)
+            memcpy(pkt->buf->data, pkt->data, pkt->size);
+        pkt->data = pkt->buf->data;
     }
-    pkt->data  = pkt->buf->data;
     pkt->size += grow_by;
     memset(pkt->data + pkt->size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
@@ -177,38 +194,76 @@ do {                                         \
         dst = data;                                                     \
     } while (0)
 
-int av_dup_packet(AVPacket *pkt)
+/* Makes duplicates of data, side_data, but does not copy any other fields */
+static int copy_packet_data(AVPacket *pkt, const AVPacket *src, int dup)
 {
-    AVPacket tmp_pkt;
-
-    if (!pkt->buf && pkt->data) {
-        tmp_pkt = *pkt;
-
-        pkt->data      = NULL;
-        pkt->side_data = NULL;
-        DUP_DATA(pkt->data, tmp_pkt.data, pkt->size, 1, ALLOC_BUF);
+    pkt->data      = NULL;
+    pkt->side_data = NULL;
+    pkt->side_data_elems = 0;
+    if (pkt->buf) {
+        AVBufferRef *ref = av_buffer_ref(src->buf);
+        if (!ref)
+            return AVERROR(ENOMEM);
+        pkt->buf  = ref;
+        pkt->data = ref->data;
+    } else {
+        DUP_DATA(pkt->data, src->data, pkt->size, 1, ALLOC_BUF);
+    }
+    if (src->side_data_elems && dup) {
+        pkt->side_data = src->side_data;
+        pkt->side_data_elems = src->side_data_elems;
+    }
+    if (src->side_data_elems && !dup) {
+        return av_copy_packet_side_data(pkt, src);
+    }
+    return 0;
 
-        if (pkt->side_data_elems) {
-            int i;
+failed_alloc:
+    av_packet_unref(pkt);
+    return AVERROR(ENOMEM);
+}
 
-            DUP_DATA(pkt->side_data, tmp_pkt.side_data,
-                     pkt->side_data_elems * sizeof(*pkt->side_data), 0, ALLOC_MALLOC);
+int av_copy_packet_side_data(AVPacket *pkt, const AVPacket *src)
+{
+    if (src->side_data_elems) {
+        int i;
+        DUP_DATA(pkt->side_data, src->side_data,
+                src->side_data_elems * sizeof(*src->side_data), 0, ALLOC_MALLOC);
+        if (src != pkt) {
             memset(pkt->side_data, 0,
-                   pkt->side_data_elems * sizeof(*pkt->side_data));
-            for (i = 0; i < pkt->side_data_elems; i++) {
-                DUP_DATA(pkt->side_data[i].data, tmp_pkt.side_data[i].data,
-                         tmp_pkt.side_data[i].size, 1, ALLOC_MALLOC);
-                pkt->side_data[i].size = tmp_pkt.side_data[i].size;
-                pkt->side_data[i].type = tmp_pkt.side_data[i].type;
-            }
+                   src->side_data_elems * sizeof(*src->side_data));
+        }
+        for (i = 0; i < src->side_data_elems; i++) {
+            DUP_DATA(pkt->side_data[i].data, src->side_data[i].data,
+                    src->side_data[i].size, 1, ALLOC_MALLOC);
+            pkt->side_data[i].size = src->side_data[i].size;
+            pkt->side_data[i].type = src->side_data[i].type;
         }
     }
+    pkt->side_data_elems = src->side_data_elems;
     return 0;
 
 failed_alloc:
     av_packet_unref(pkt);
     return AVERROR(ENOMEM);
 }
+
+int av_dup_packet(AVPacket *pkt)
+{
+    AVPacket tmp_pkt;
+
+    if (!pkt->buf && pkt->data) {
+        tmp_pkt = *pkt;
+        return copy_packet_data(pkt, &tmp_pkt, 1);
+    }
+    return 0;
+}
+
+int av_copy_packet(AVPacket *dst, const AVPacket *src)
+{
+    *dst = *src;
+    return copy_packet_data(dst, src, 0);
+}
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
@@ -216,7 +271,7 @@ void av_packet_free_side_data(AVPacket *pkt)
 {
     int i;
     for (i = 0; i < pkt->side_data_elems; i++)
-        av_free(pkt->side_data[i].data);
+        av_freep(&pkt->side_data[i].data);
     av_freep(&pkt->side_data);
     pkt->side_data_elems = 0;
 }
@@ -241,9 +296,20 @@ int av_packet_add_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
                             uint8_t *data, size_t size)
 {
     AVPacketSideData *tmp;
-    int elems = pkt->side_data_elems;
+    int i, elems = pkt->side_data_elems;
+
+    for (i = 0; i < elems; i++) {
+        AVPacketSideData *sd = &pkt->side_data[i];
 
-    if ((unsigned)elems + 1 > INT_MAX / sizeof(*pkt->side_data))
+        if (sd->type == type) {
+            av_free(sd->data);
+            sd->data = data;
+            sd->size = size;
+            return 0;
+        }
+    }
+
+    if ((unsigned)elems + 1 > AV_PKT_DATA_NB)
         return AVERROR(ERANGE);
 
     tmp = av_realloc(pkt->side_data, (elems + 1) * sizeof(*tmp));
@@ -266,12 +332,11 @@ uint8_t *av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
     int ret;
     uint8_t *data;
 
-    if (!size || (unsigned)size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
+    if ((unsigned)size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
         return NULL;
-    data = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+    data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!data)
         return NULL;
-    memset(data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
     ret = av_packet_add_side_data(pkt, type, data, size);
     if (ret < 0) {
@@ -282,7 +347,7 @@ uint8_t *av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
     return data;
 }
 
-uint8_t *av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
+uint8_t *av_packet_get_side_data(const AVPacket *pkt, enum AVPacketSideDataType type,
                                  int *size)
 {
     int i;
@@ -294,9 +359,192 @@ uint8_t *av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
             return pkt->side_data[i].data;
         }
     }
+    if (size)
+        *size = 0;
+    return NULL;
+}
+
+const char *av_packet_side_data_name(enum AVPacketSideDataType type)
+{
+    switch(type) {
+    case AV_PKT_DATA_PALETTE:                    return "Palette";
+    case AV_PKT_DATA_NEW_EXTRADATA:              return "New Extradata";
+    case AV_PKT_DATA_PARAM_CHANGE:               return "Param Change";
+    case AV_PKT_DATA_H263_MB_INFO:               return "H263 MB Info";
+    case AV_PKT_DATA_REPLAYGAIN:                 return "Replay Gain";
+    case AV_PKT_DATA_DISPLAYMATRIX:              return "Display Matrix";
+    case AV_PKT_DATA_STEREO3D:                   return "Stereo 3D";
+    case AV_PKT_DATA_AUDIO_SERVICE_TYPE:         return "Audio Service Type";
+    case AV_PKT_DATA_QUALITY_STATS:              return "Quality stats";
+    case AV_PKT_DATA_FALLBACK_TRACK:             return "Fallback track";
+    case AV_PKT_DATA_CPB_PROPERTIES:             return "CPB properties";
+    case AV_PKT_DATA_SKIP_SAMPLES:               return "Skip Samples";
+    case AV_PKT_DATA_JP_DUALMONO:                return "JP Dual Mono";
+    case AV_PKT_DATA_STRINGS_METADATA:           return "Strings Metadata";
+    case AV_PKT_DATA_SUBTITLE_POSITION:          return "Subtitle Position";
+    case AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL:   return "Matroska BlockAdditional";
+    case AV_PKT_DATA_WEBVTT_IDENTIFIER:          return "WebVTT ID";
+    case AV_PKT_DATA_WEBVTT_SETTINGS:            return "WebVTT Settings";
+    case AV_PKT_DATA_METADATA_UPDATE:            return "Metadata Update";
+    case AV_PKT_DATA_MPEGTS_STREAM_ID:           return "MPEGTS Stream ID";
+    case AV_PKT_DATA_MASTERING_DISPLAY_METADATA: return "Mastering display metadata";
+    case AV_PKT_DATA_CONTENT_LIGHT_LEVEL:        return "Content light level metadata";
+    case AV_PKT_DATA_SPHERICAL:                  return "Spherical Mapping";
+    case AV_PKT_DATA_A53_CC:                     return "A53 Closed Captions";
+    case AV_PKT_DATA_ENCRYPTION_INIT_INFO:       return "Encryption initialization data";
+    case AV_PKT_DATA_ENCRYPTION_INFO:            return "Encryption info";
+    case AV_PKT_DATA_AFD:                        return "Active Format Description data";
+    }
     return NULL;
 }
 
+#if FF_API_MERGE_SD_API
+
+#define FF_MERGE_MARKER 0x8c4d9d108e25e9feULL
+
+int av_packet_merge_side_data(AVPacket *pkt){
+    if(pkt->side_data_elems){
+        AVBufferRef *buf;
+        int i;
+        uint8_t *p;
+        uint64_t size= pkt->size + 8LL + AV_INPUT_BUFFER_PADDING_SIZE;
+        AVPacket old= *pkt;
+        for (i=0; i<old.side_data_elems; i++) {
+            size += old.side_data[i].size + 5LL;
+        }
+        if (size > INT_MAX)
+            return AVERROR(EINVAL);
+        buf = av_buffer_alloc(size);
+        if (!buf)
+            return AVERROR(ENOMEM);
+        pkt->buf = buf;
+        pkt->data = p = buf->data;
+        pkt->size = size - AV_INPUT_BUFFER_PADDING_SIZE;
+        bytestream_put_buffer(&p, old.data, old.size);
+        for (i=old.side_data_elems-1; i>=0; i--) {
+            bytestream_put_buffer(&p, old.side_data[i].data, old.side_data[i].size);
+            bytestream_put_be32(&p, old.side_data[i].size);
+            *p++ = old.side_data[i].type | ((i==old.side_data_elems-1)*128);
+        }
+        bytestream_put_be64(&p, FF_MERGE_MARKER);
+        av_assert0(p-pkt->data == pkt->size);
+        memset(p, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        av_packet_unref(&old);
+        pkt->side_data_elems = 0;
+        pkt->side_data = NULL;
+        return 1;
+    }
+    return 0;
+}
+
+int av_packet_split_side_data(AVPacket *pkt){
+    if (!pkt->side_data_elems && pkt->size >12 && AV_RB64(pkt->data + pkt->size - 8) == FF_MERGE_MARKER){
+        int i;
+        unsigned int size;
+        uint8_t *p;
+
+        p = pkt->data + pkt->size - 8 - 5;
+        for (i=1; ; i++){
+            size = AV_RB32(p);
+            if (size>INT_MAX - 5 || p - pkt->data < size)
+                return 0;
+            if (p[4]&128)
+                break;
+            if (p - pkt->data < size + 5)
+                return 0;
+            p-= size+5;
+        }
+
+        if (i > AV_PKT_DATA_NB)
+            return AVERROR(ERANGE);
+
+        pkt->side_data = av_malloc_array(i, sizeof(*pkt->side_data));
+        if (!pkt->side_data)
+            return AVERROR(ENOMEM);
+
+        p= pkt->data + pkt->size - 8 - 5;
+        for (i=0; ; i++){
+            size= AV_RB32(p);
+            av_assert0(size<=INT_MAX - 5 && p - pkt->data >= size);
+            pkt->side_data[i].data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
+            pkt->side_data[i].size = size;
+            pkt->side_data[i].type = p[4]&127;
+            if (!pkt->side_data[i].data)
+                return AVERROR(ENOMEM);
+            memcpy(pkt->side_data[i].data, p-size, size);
+            pkt->size -= size + 5;
+            if(p[4]&128)
+                break;
+            p-= size+5;
+        }
+        pkt->size -= 8;
+        pkt->side_data_elems = i+1;
+        return 1;
+    }
+    return 0;
+}
+#endif
+
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size)
+{
+    AVDictionaryEntry *t = NULL;
+    uint8_t *data = NULL;
+    *size = 0;
+
+    if (!dict)
+        return NULL;
+
+    while ((t = av_dict_get(dict, "", t, AV_DICT_IGNORE_SUFFIX))) {
+        const size_t keylen   = strlen(t->key);
+        const size_t valuelen = strlen(t->value);
+        const size_t new_size = *size + keylen + 1 + valuelen + 1;
+        uint8_t *const new_data = av_realloc(data, new_size);
+
+        if (!new_data)
+            goto fail;
+        data = new_data;
+        if (new_size > INT_MAX)
+            goto fail;
+
+        memcpy(data + *size, t->key, keylen + 1);
+        memcpy(data + *size + keylen + 1, t->value, valuelen + 1);
+
+        *size = new_size;
+    }
+
+    return data;
+
+fail:
+    av_freep(&data);
+    *size = 0;
+    return NULL;
+}
+
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict)
+{
+    const uint8_t *end = data + size;
+    int ret = 0;
+
+    if (!dict || !data || !size)
+        return ret;
+    if (size && end[-1])
+        return AVERROR_INVALIDDATA;
+    while (data < end) {
+        const uint8_t *key = data;
+        const uint8_t *val = data + strlen(key) + 1;
+
+        if (val >= end || !*key)
+            return AVERROR_INVALIDDATA;
+
+        ret = av_dict_set(dict, key, val, 0);
+        if (ret < 0)
+            break;
+        data = val + strlen(val) + 1;
+    }
+
+    return ret;
+}
+
 int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
                                int size)
 {
@@ -329,11 +577,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     dst->flags                = src->flags;
     dst->stream_index         = src->stream_index;
 
+    dst->side_data            = NULL;
+    dst->side_data_elems      = 0;
     for (i = 0; i < src->side_data_elems; i++) {
-         enum AVPacketSideDataType type = src->side_data[i].type;
-         int size          = src->side_data[i].size;
-         uint8_t *src_data = src->side_data[i].data;
-         uint8_t *dst_data = av_packet_new_side_data(dst, type, size);
+        enum AVPacketSideDataType type = src->side_data[i].type;
+        int size          = src->side_data[i].size;
+        uint8_t *src_data = src->side_data[i].data;
+        uint8_t *dst_data = av_packet_new_side_data(dst, type, size);
 
         if (!dst_data) {
             av_packet_free_side_data(dst);
@@ -366,7 +616,9 @@ int av_packet_ref(AVPacket *dst, const AVPacket *src)
         ret = packet_alloc(&dst->buf, src->size);
         if (ret < 0)
             goto fail;
-        memcpy(dst->buf->data, src->data, src->size);
+        av_assert1(!src->size || src->data);
+        if (src->size)
+            memcpy(dst->buf->data, src->data, src->size);
 
         dst->data = dst->buf->data;
     } else {
@@ -407,6 +659,47 @@ void av_packet_move_ref(AVPacket *dst, AVPacket *src)
     src->size = 0;
 }
 
+int av_packet_make_refcounted(AVPacket *pkt)
+{
+    int ret;
+
+    if (pkt->buf)
+        return 0;
+
+    ret = packet_alloc(&pkt->buf, pkt->size);
+    if (ret < 0)
+        return ret;
+    av_assert1(!pkt->size || pkt->data);
+    if (pkt->size)
+        memcpy(pkt->buf->data, pkt->data, pkt->size);
+
+    pkt->data = pkt->buf->data;
+
+    return 0;
+}
+
+int av_packet_make_writable(AVPacket *pkt)
+{
+    AVBufferRef *buf = NULL;
+    int ret;
+
+    if (pkt->buf && av_buffer_is_writable(pkt->buf))
+        return 0;
+
+    ret = packet_alloc(&buf, pkt->size);
+    if (ret < 0)
+        return ret;
+    av_assert1(!pkt->size || pkt->data);
+    if (pkt->size)
+        memcpy(buf->data, pkt->data, pkt->size);
+
+    av_buffer_unref(&pkt->buf);
+    pkt->buf  = buf;
+    pkt->data = buf->data;
+
+    return 0;
+}
+
 void av_packet_rescale_ts(AVPacket *pkt, AVRational src_tb, AVRational dst_tb)
 {
     if (pkt->pts != AV_NOPTS_VALUE)
@@ -422,3 +715,28 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 }
+
+int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type)
+{
+    uint8_t *side_data;
+    int side_data_size;
+    int i;
+
+    side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_QUALITY_STATS, &side_data_size);
+    if (!side_data) {
+        side_data_size = 4+4+8*error_count;
+        side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_STATS,
+                                            side_data_size);
+    }
+
+    if (!side_data || side_data_size < 4+4+8*error_count)
+        return AVERROR(ENOMEM);
+
+    AV_WL32(side_data   , quality  );
+    side_data[4] = pict_type;
+    side_data[5] = error_count;
+    for (i = 0; i<error_count; i++)
+        AV_WL64(side_data+8 + 8*i , error[i]);
+
+    return 0;
+}
diff --git a/libavcodec/avpicture.c b/libavcodec/avpicture.c
index 786d740..56435f4 100644
--- a/libavcodec/avpicture.c
+++ b/libavcodec/avpicture.c
@@ -2,20 +2,20 @@
  * AVPicture management routines
  * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,19 +34,18 @@
 
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
-int avpicture_fill(AVPicture *picture, uint8_t *ptr,
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
                    enum AVPixelFormat pix_fmt, int width, int height)
 {
     return av_image_fill_arrays(picture->data, picture->linesize,
                                 ptr, pix_fmt, width, height, 1);
 }
 
-int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt,
-                     int width, int height,
+int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt, int width, int height,
                      unsigned char *dest, int dest_size)
 {
     return av_image_copy_to_buffer(dest, dest_size,
-                                   src->data, src->linesize,
+                                   (const uint8_t * const*)src->data, src->linesize,
                                    pix_fmt, width, height, 1);
 }
 
@@ -70,13 +69,13 @@ int avpicture_alloc(AVPicture *picture,
 
 void avpicture_free(AVPicture *picture)
 {
-    av_free(picture->data[0]);
+    av_freep(&picture->data[0]);
 }
 
 void av_picture_copy(AVPicture *dst, const AVPicture *src,
                      enum AVPixelFormat pix_fmt, int width, int height)
 {
-    av_image_copy(dst->data, dst->linesize, src->data,
+    av_image_copy(dst->data, dst->linesize, (const uint8_t **)src->data,
                   src->linesize, pix_fmt, width, height);
 }
 FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavcodec/avr32/mathops.h b/libavcodec/avr32/mathops.h
index 528b7ad..85f42b5 100644
--- a/libavcodec/avr32/mathops.h
+++ b/libavcodec/avr32/mathops.h
@@ -2,20 +2,20 @@
  * Simple math operations
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/avrndec.c b/libavcodec/avrndec.c
new file mode 100644
index 0000000..104ff2d
--- /dev/null
+++ b/libavcodec/avrndec.c
@@ -0,0 +1,173 @@
+/*
+ * AVRn decoder
+ * Copyright (c) 2012 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mjpeg.h"
+#include "mjpegdec.h"
+#include "libavutil/imgutils.h"
+
+typedef struct {
+    AVCodecContext *mjpeg_avctx;
+    int is_mjpeg;
+    int interlace;
+    int tff;
+} AVRnContext;
+
+static av_cold int init(AVCodecContext *avctx)
+{
+    AVRnContext *a = avctx->priv_data;
+    int ret;
+
+    // Support "Resolution 1:1" for Avid AVI Codec
+    a->is_mjpeg = avctx->extradata_size < 31 || memcmp(&avctx->extradata[28], "1:1", 3);
+
+    if(!a->is_mjpeg && avctx->lowres) {
+        av_log(avctx, AV_LOG_ERROR, "lowres is not possible with rawvideo\n");
+        return AVERROR(EINVAL);
+    }
+
+    if(a->is_mjpeg) {
+        AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_MJPEG);
+        AVDictionary *thread_opt = NULL;
+        if (!codec) {
+            av_log(avctx, AV_LOG_ERROR, "MJPEG codec not found\n");
+            return AVERROR_DECODER_NOT_FOUND;
+        }
+
+        a->mjpeg_avctx = avcodec_alloc_context3(codec);
+
+        av_dict_set(&thread_opt, "threads", "1", 0); // Is this needed ?
+        a->mjpeg_avctx->refcounted_frames = 1;
+        a->mjpeg_avctx->flags = avctx->flags;
+        a->mjpeg_avctx->idct_algo = avctx->idct_algo;
+        a->mjpeg_avctx->lowres = avctx->lowres;
+        a->mjpeg_avctx->width = avctx->width;
+        a->mjpeg_avctx->height = avctx->height;
+
+        if ((ret = ff_codec_open2_recursive(a->mjpeg_avctx, codec, &thread_opt)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MJPEG codec failed to open\n");
+        }
+        av_dict_free(&thread_opt);
+
+        return ret;
+    }
+
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+
+    if(avctx->extradata_size >= 9 && avctx->extradata[4]+28 < avctx->extradata_size) {
+        int ndx = avctx->extradata[4] + 4;
+        a->interlace = !memcmp(avctx->extradata + ndx, "1:1(", 4);
+        if(a->interlace) {
+            a->tff = avctx->extradata[ndx + 24] == 1;
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int end(AVCodecContext *avctx)
+{
+    AVRnContext *a = avctx->priv_data;
+
+    avcodec_close(a->mjpeg_avctx);
+    av_freep(&a->mjpeg_avctx);
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    AVRnContext *a = avctx->priv_data;
+    AVFrame *p = data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+    int y, ret, true_height;
+
+    if(a->is_mjpeg) {
+        ret = avcodec_decode_video2(a->mjpeg_avctx, data, got_frame, avpkt);
+
+        if (ret >= 0 && *got_frame && avctx->width <= p->width && avctx->height <= p->height) {
+            int shift = p->height - avctx->height;
+            int subsample_h, subsample_v;
+
+            av_pix_fmt_get_chroma_sub_sample(p->format, &subsample_h, &subsample_v);
+
+            p->data[0] += p->linesize[0] * shift;
+            if (p->data[2]) {
+                p->data[1] += p->linesize[1] * (shift>>subsample_v);
+                p->data[2] += p->linesize[2] * (shift>>subsample_v);
+            }
+
+            p->width  = avctx->width;
+            p->height = avctx->height;
+        }
+        avctx->pix_fmt = a->mjpeg_avctx->pix_fmt;
+        return ret;
+    }
+
+    true_height    = buf_size / (2*avctx->width);
+
+    if(buf_size < 2*avctx->width * avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
+    p->pict_type= AV_PICTURE_TYPE_I;
+    p->key_frame= 1;
+
+    if(a->interlace) {
+        buf += (true_height - avctx->height)*avctx->width;
+        for(y = 0; y < avctx->height-1; y+=2) {
+            memcpy(p->data[0] + (y+ a->tff)*p->linesize[0], buf                             , 2*avctx->width);
+            memcpy(p->data[0] + (y+!a->tff)*p->linesize[0], buf + avctx->width*true_height+4, 2*avctx->width);
+            buf += 2*avctx->width;
+        }
+    } else {
+        buf += (true_height - avctx->height)*avctx->width*2;
+        for(y = 0; y < avctx->height; y++) {
+            memcpy(p->data[0] + y*p->linesize[0], buf, 2*avctx->width);
+            buf += 2*avctx->width;
+        }
+    }
+
+    *got_frame      = 1;
+    return buf_size;
+}
+
+AVCodec ff_avrn_decoder = {
+    .name           = "avrn",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRN,
+    .priv_data_size = sizeof(AVRnContext),
+    .init           = init,
+    .close          = end,
+    .decode         = decode_frame,
+    .max_lowres     = 3,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
diff --git a/libavcodec/avs.c b/libavcodec/avs.c
index edd91ef..66724d4 100644
--- a/libavcodec/avs.c
+++ b/libavcodec/avs.c
@@ -2,25 +2,25 @@
  * AVS video decoder.
  * Copyright (c) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 typedef struct AvsContext {
@@ -57,12 +57,10 @@ avs_decode_frame(AVCodecContext * avctx,
     int i, j, x, y, stride, ret, vect_w = 3, vect_h = 3;
     AvsVideoSubType sub_type;
     AvsBlockType type;
-    BitstreamContext change_map;
+    GetBitContext change_map = {0}; //init to silence warning
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_P;
     p->key_frame = 0;
 
@@ -84,8 +82,10 @@ avs_decode_frame(AVCodecContext * avctx,
         if (first >= 256 || last > 256 || buf_end - buf < 4 + 4 + 3 * (last - first))
             return AVERROR_INVALIDDATA;
         buf += 4;
-        for (i=first; i<last; i++, buf+=3)
+        for (i=first; i<last; i++, buf+=3) {
             pal[i] = (buf[0] << 18) | (buf[1] << 10) | (buf[2] << 2);
+            pal[i] |= 0xFFU << 24 | (pal[i] >> 6) & 0x30303;
+        }
 
         sub_type = buf[0];
         type = buf[1];
@@ -125,13 +125,13 @@ avs_decode_frame(AVCodecContext * avctx,
         int map_size = ((318 / vect_w + 7) / 8) * (198 / vect_h);
         if (buf_end - table < map_size)
             return AVERROR_INVALIDDATA;
-        bitstream_init8(&change_map, table, map_size);
+        init_get_bits(&change_map, table, map_size * 8);
         table += map_size;
     }
 
     for (y=0; y<198; y+=vect_h) {
         for (x=0; x<318; x+=vect_w) {
-            if (sub_type == AVS_I_FRAME || bitstream_read_bit(&change_map)) {
+            if (sub_type == AVS_I_FRAME || get_bits1(&change_map)) {
                 if (buf_end - table < 1)
                     return AVERROR_INVALIDDATA;
                 vect = &buf[*table++ * (vect_w * vect_h)];
@@ -145,7 +145,7 @@ avs_decode_frame(AVCodecContext * avctx,
             }
         }
         if (sub_type != AVS_I_FRAME)
-            bitstream_align(&change_map);
+            align_get_bits(&change_map);
     }
 
     if ((ret = av_frame_ref(picture, p)) < 0)
diff --git a/libavcodec/avs2_parser.c b/libavcodec/avs2_parser.c
new file mode 100644
index 0000000..1c9b342
--- /dev/null
+++ b/libavcodec/avs2_parser.c
@@ -0,0 +1,95 @@
+/*
+ * AVS2-P2/IEEE1857.4 video parser.
+ * Copyright (c) 2018  Huiwen Ren <hwrenx@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "parser.h"
+
+#define SLICE_MAX_START_CODE    0x000001af
+
+#define ISPIC(x)  ((x) == 0xB3 || (x) == 0xB6)
+#define ISUNIT(x) ((x) == 0xB0 || (x) == 0xB1 || (x) == 0xB2 || ISPIC(x))
+
+static int avs2_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size)
+{
+    int pic_found  = pc->frame_start_found;
+    uint32_t state = pc->state;
+    int cur = 0;
+
+    if (!pic_found) {
+        for (; cur < buf_size; ++cur) {
+            state = (state<<8) | buf[cur];
+            if (ISUNIT(buf[cur])){
+                ++cur;
+                pic_found = 1;
+                break;
+            }
+        }
+    }
+
+    if (pic_found) {
+        if (!buf_size)
+            return END_NOT_FOUND;
+        for (; cur < buf_size; ++cur) {
+            state = (state << 8) | buf[cur];
+            if ((state & 0xFFFFFF00) == 0x100 && state > SLICE_MAX_START_CODE) {
+                pc->frame_start_found = 0;
+                pc->state = -1;
+                return cur - 3;
+            }
+        }
+    }
+
+    pc->frame_start_found = pic_found;
+    pc->state = state;
+
+    return END_NOT_FOUND;
+}
+
+static int avs2_parse(AVCodecParserContext *s, AVCodecContext *avctx,
+                      const uint8_t **poutbuf, int *poutbuf_size,
+                      const uint8_t *buf, int buf_size)
+{
+    ParseContext *pc = s->priv_data;
+    int next;
+
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES)  {
+        next = buf_size;
+    } else {
+        next = avs2_find_frame_end(pc, buf, buf_size);
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
+    }
+
+    *poutbuf = buf;
+    *poutbuf_size = buf_size;
+
+    return next;
+}
+
+AVCodecParser ff_avs2_parser = {
+    .codec_ids      = { AV_CODEC_ID_AVS2 },
+    .priv_data_size = sizeof(ParseContext),
+    .parser_parse   = avs2_parse,
+    .parser_close   = ff_parse_close,
+    .split          = ff_mpeg4video_split,
+};
diff --git a/libavcodec/avuidec.c b/libavcodec/avuidec.c
new file mode 100644
index 0000000..4cf620d
--- /dev/null
+++ b/libavcodec/avuidec.c
@@ -0,0 +1,131 @@
+/*
+ * AVID Meridien decoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int avui_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+    return 0;
+}
+
+static int avui_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    int ret;
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data, *extradata = avctx->extradata;
+    const uint8_t *srca;
+    uint8_t *y, *u, *v, *a;
+    int transparent, interlaced = 1, skip, opaque_length, i, j, k;
+    uint32_t extradata_size = avctx->extradata_size;
+
+    while (extradata_size >= 24) {
+        uint32_t atom_size = AV_RB32(extradata);
+        if (!memcmp(&extradata[4], "APRGAPRG0001", 12)) {
+            interlaced = extradata[19] != 1;
+            break;
+        }
+        if (atom_size && atom_size <= extradata_size) {
+            extradata      += atom_size;
+            extradata_size -= atom_size;
+        } else {
+            break;
+        }
+    }
+    if (avctx->height == 486) {
+        skip = 10;
+    } else {
+        skip = 16;
+    }
+    opaque_length = 2 * avctx->width * (avctx->height + skip) + 4 * interlaced;
+    if (avpkt->size < opaque_length) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+    transparent = avctx->bits_per_coded_sample == 32 &&
+                  avpkt->size >= opaque_length * 2 + 4;
+    srca = src + opaque_length + 5;
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    if (!interlaced) {
+        src  += avctx->width * skip;
+        srca += avctx->width * skip;
+    }
+
+    for (i = 0; i < interlaced + 1; i++) {
+        src  += avctx->width * skip;
+        srca += avctx->width * skip;
+        if (interlaced && avctx->height == 486) {
+            y = pic->data[0] + (1 - i) * pic->linesize[0];
+            u = pic->data[1] + (1 - i) * pic->linesize[1];
+            v = pic->data[2] + (1 - i) * pic->linesize[2];
+            a = pic->data[3] + (1 - i) * pic->linesize[3];
+        } else {
+            y = pic->data[0] + i * pic->linesize[0];
+            u = pic->data[1] + i * pic->linesize[1];
+            v = pic->data[2] + i * pic->linesize[2];
+            a = pic->data[3] + i * pic->linesize[3];
+        }
+
+        for (j = 0; j < avctx->height >> interlaced; j++) {
+            for (k = 0; k < avctx->width >> 1; k++) {
+                u[    k    ] = *src++;
+                y[2 * k    ] = *src++;
+                a[2 * k    ] = 0xFF - (transparent ? *srca++ : 0);
+                srca++;
+                v[    k    ] = *src++;
+                y[2 * k + 1] = *src++;
+                a[2 * k + 1] = 0xFF - (transparent ? *srca++ : 0);
+                srca++;
+            }
+
+            y += (interlaced + 1) * pic->linesize[0];
+            u += (interlaced + 1) * pic->linesize[1];
+            v += (interlaced + 1) * pic->linesize[2];
+            a += (interlaced + 1) * pic->linesize[3];
+        }
+        src  += 4;
+        srca += 4;
+    }
+    *got_frame       = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_avui_decoder = {
+    .name         = "avui",
+    .long_name    = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AVUI,
+    .init         = avui_decode_init,
+    .decode       = avui_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
diff --git a/libavcodec/avuienc.c b/libavcodec/avuienc.c
new file mode 100644
index 0000000..b219906
--- /dev/null
+++ b/libavcodec/avuienc.c
@@ -0,0 +1,103 @@
+/*
+ * AVID Meridien encoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int avui_encode_init(AVCodecContext *avctx)
+{
+    if (avctx->width != 720 || avctx->height != 486 && avctx->height != 576) {
+        av_log(avctx, AV_LOG_ERROR, "Only 720x486 and 720x576 are supported.\n");
+        return AVERROR(EINVAL);
+    }
+    if (!(avctx->extradata = av_mallocz(144 + AV_INPUT_BUFFER_PADDING_SIZE)))
+        return AVERROR(ENOMEM);
+    avctx->extradata_size = 144;
+    memcpy(avctx->extradata, "\0\0\0\x18""APRGAPRG0001", 16);
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        avctx->extradata[19] = 2;
+    } else {
+        avctx->extradata[19] = 1;
+    }
+    memcpy(avctx->extradata + 24, "\0\0\0\x78""ARESARES0001""\0\0\0\x98", 20);
+    AV_WB32(avctx->extradata + 44, avctx->width);
+    AV_WB32(avctx->extradata + 48, avctx->height);
+    memcpy(avctx->extradata + 52, "\0\0\0\x1\0\0\0\x20\0\0\0\x2", 12);
+
+
+    return 0;
+}
+
+static int avui_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    int i, j, skip, ret, size, interlaced;
+
+    interlaced = avctx->field_order > AV_FIELD_PROGRESSIVE;
+
+    if (avctx->height == 486) {
+        skip = 10;
+    } else {
+        skip = 16;
+    }
+    size = 2 * avctx->width * (avctx->height + skip) + 8 * interlaced;
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+        return ret;
+    dst = pkt->data;
+    if (!interlaced) {
+        memset(dst, 0, avctx->width * skip);
+        dst += avctx->width * skip;
+    }
+
+    for (i = 0; i <= interlaced; i++) {
+        uint8_t *src;
+        if (interlaced && avctx->height == 486) {
+            src = pic->data[0] + (1 - i) * pic->linesize[0];
+        } else {
+            src = pic->data[0] + i * pic->linesize[0];
+        }
+        memset(dst, 0, avctx->width * skip + 4 * i);
+        dst += avctx->width * skip + 4 * i;
+        for (j = 0; j < avctx->height; j += interlaced + 1) {
+            memcpy(dst, src, avctx->width * 2);
+            src += (interlaced + 1) * pic->linesize[0];
+            dst += avctx->width * 2;
+        }
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+AVCodec ff_avui_encoder = {
+    .name         = "avui",
+    .long_name    = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AVUI,
+    .init         = avui_encode_init,
+    .encode2      = avui_encode_frame,
+    .capabilities = AV_CODEC_CAP_EXPERIMENTAL | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_UYVY422, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/bethsoftvideo.c b/libavcodec/bethsoftvideo.c
index 61f098b..e5a73f5 100644
--- a/libavcodec/bethsoftvideo.c
+++ b/libavcodec/bethsoftvideo.c
@@ -2,20 +2,20 @@
  * Bethesda VID video decoder
  * Copyright (C) 2007 Nicholas Tung
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,7 +59,8 @@ static int set_palette(BethsoftvidContext *ctx)
         return AVERROR_INVALIDDATA;
 
     for(a = 0; a < 256; a++){
-        palette[a] = bytestream2_get_be24u(&ctx->g) * 4;
+        palette[a] = 0xFFU << 24 | bytestream2_get_be24u(&ctx->g) * 4;
+        palette[a] |= palette[a] >> 6 & 0x30303;
     }
     ctx->frame->palette_has_changed = 1;
     return 0;
@@ -78,10 +79,8 @@ static int bethsoftvid_decode_frame(AVCodecContext *avctx,
     int code, ret;
     int yoffset;
 
-    if ((ret = ff_reget_buffer(avctx, vid->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, vid->frame)) < 0)
         return ret;
-    }
     wrap_to_next_line = vid->frame->linesize[0] - avctx->width;
 
     if (avpkt->side_data_elems > 0 &&
@@ -110,6 +109,11 @@ static int bethsoftvid_decode_frame(AVCodecContext *avctx,
             if(yoffset >= avctx->height)
                 return AVERROR_INVALIDDATA;
             dst += vid->frame->linesize[0] * yoffset;
+        case VIDEO_P_FRAME:
+        case VIDEO_I_FRAME:
+            break;
+        default:
+            return AVERROR_INVALIDDATA;
     }
 
     // main code
diff --git a/libavcodec/bethsoftvideo.h b/libavcodec/bethsoftvideo.h
index 5cbbdfd..d5b5d0a 100644
--- a/libavcodec/bethsoftvideo.h
+++ b/libavcodec/bethsoftvideo.h
@@ -2,20 +2,20 @@
  * Bethesda VID video decoder
  * Copyright (C) 2007 Nicholas Tung
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bfi.c b/libavcodec/bfi.c
index 0ce73b1..a4cb002 100644
--- a/libavcodec/bfi.c
+++ b/libavcodec/bfi.c
@@ -2,20 +2,20 @@
  * Brute Force & Ignorance (BFI) video decoder
  * Copyright (c) 2008 Sisir Koppaka
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 typedef struct BFIContext {
     AVCodecContext *avctx;
     uint8_t *dst;
+    uint32_t pal[256];
 } BFIContext;
 
 static av_cold int bfi_decode_init(AVCodecContext *avctx)
@@ -41,6 +42,8 @@ static av_cold int bfi_decode_init(AVCodecContext *avctx)
     BFIContext *bfi = avctx->priv_data;
     avctx->pix_fmt  = AV_PIX_FMT_PAL8;
     bfi->dst        = av_mallocz(avctx->width * avctx->height);
+    if (!bfi->dst)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
@@ -57,10 +60,8 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
     uint32_t *pal;
     int i, j, ret, height = avctx->height;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init(&g, avpkt->data, buf_size);
 
@@ -70,22 +71,25 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
         frame->key_frame = 1;
         /* Setting the palette */
         if (avctx->extradata_size > 768) {
-            av_log(NULL, AV_LOG_ERROR, "Palette is too large.\n");
+            av_log(avctx, AV_LOG_ERROR, "Palette is too large.\n");
             return AVERROR_INVALIDDATA;
         }
         pal = (uint32_t *)frame->data[1];
         for (i = 0; i < avctx->extradata_size / 3; i++) {
             int shift = 16;
-            *pal = 0;
+            *pal = 0xFFU << 24;
             for (j = 0; j < 3; j++, shift -= 8)
                 *pal += ((avctx->extradata[i * 3 + j] << 2) |
                          (avctx->extradata[i * 3 + j] >> 4)) << shift;
             pal++;
         }
+        memcpy(bfi->pal, frame->data[1], sizeof(bfi->pal));
         frame->palette_has_changed = 1;
     } else {
         frame->pict_type = AV_PICTURE_TYPE_P;
         frame->key_frame = 0;
+        frame->palette_has_changed = 0;
+        memcpy(frame->data[1], bfi->pal, sizeof(bfi->pal));
     }
 
     bytestream2_skip(&g, 4); // Unpacked size, not required.
@@ -167,7 +171,7 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
 static av_cold int bfi_decode_close(AVCodecContext *avctx)
 {
     BFIContext *bfi = avctx->priv_data;
-    av_free(bfi->dst);
+    av_freep(&bfi->dst);
     return 0;
 }
 
diff --git a/libavcodec/bfin/README b/libavcodec/bfin/README
new file mode 100644
index 0000000..afb3461
--- /dev/null
+++ b/libavcodec/bfin/README
@@ -0,0 +1,6 @@
+BFIN optimizations have been removed in
+commit 880e2aa23645ed9871c66ee1cbd00f93c72d2d73
+The last revission with the optimizations is fa4e17c14035ebf43130fb369e1728cdd98d0b72
+
+If you want to maintain these (or other) BFIN optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/bgmc.c b/libavcodec/bgmc.c
index 1de6753..1a6817b 100644
--- a/libavcodec/bgmc.c
+++ b/libavcodec/bgmc.c
@@ -1,33 +1,31 @@
 /*
  * Block Gilbert-Moore decoder
- * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Block Gilbert-Moore decoder as used by MPEG-4 ALS
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 #include "libavutil/attributes.h"
-
-#include "bitstream.h"
 #include "bgmc.h"
 
 #define FREQ_BITS  14                      // bits used by frequency counters
@@ -487,26 +485,24 @@ av_cold void ff_bgmc_end(uint8_t **cf_lut, int **cf_lut_status)
 
 
 /** Initialize decoding and reads the first value */
-void ff_bgmc_decode_init(BitstreamContext *bc, unsigned int *h,
+void ff_bgmc_decode_init(GetBitContext *gb, unsigned int *h,
                          unsigned int *l, unsigned int *v)
 {
     *h = TOP_VALUE;
     *l = 0;
-    *v = bitstream_read(bc, VALUE_BITS);
+    *v = get_bits_long(gb, VALUE_BITS);
 }
 
 
 /** Finish decoding */
-void ff_bgmc_decode_end(BitstreamContext *bc)
+void ff_bgmc_decode_end(GetBitContext *gb)
 {
-    unsigned pos = bitstream_tell(bc) - VALUE_BITS + 2;
-
-    bitstream_seek(bc, pos);
+    skip_bits_long(gb, -(VALUE_BITS - 2));
 }
 
 
 /** Read and decode a block Gilbert-Moore coded symbol */
-void ff_bgmc_decode(BitstreamContext *bc, unsigned int num, int32_t *dst,
+void ff_bgmc_decode(GetBitContext *gb, unsigned int num, int32_t *dst,
                     int delta, unsigned int sx,
                     unsigned int *h, unsigned int *l, unsigned int *v,
                     uint8_t *cf_lut, int *cf_lut_status)
@@ -551,7 +547,7 @@ void ff_bgmc_decode(BitstreamContext *bc, unsigned int num, int32_t *dst,
 
             low  *= 2;
             high  = 2 * high + 1;
-            value = 2 * value + bitstream_read_bit(bc);
+            value = 2 * value + get_bits1(gb);
         }
 
         *dst++ = symbol;
diff --git a/libavcodec/bgmc.h b/libavcodec/bgmc.h
index 68a4cf1..4893736 100644
--- a/libavcodec/bgmc.h
+++ b/libavcodec/bgmc.h
@@ -1,28 +1,28 @@
 /*
  * Block Gilbert-Moore decoder
- * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Block Gilbert-Moore decoder header
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 
@@ -31,7 +31,7 @@
 
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 
 
 int ff_bgmc_init(AVCodecContext *avctx, uint8_t **cf_lut, int **cf_lut_status);
@@ -40,17 +40,17 @@ int ff_bgmc_init(AVCodecContext *avctx, uint8_t **cf_lut, int **cf_lut_status);
 void ff_bgmc_end(uint8_t **cf_lut, int **cf_lut_status);
 
 
-void ff_bgmc_decode_init(BitstreamContext *bc,
-                         unsigned int *h, unsigned int *l, unsigned int *v);
+void ff_bgmc_decode_init(GetBitContext *gb,
+                      unsigned int *h, unsigned int *l, unsigned int *v);
 
 
-void ff_bgmc_decode_end(BitstreamContext *bc);
+void ff_bgmc_decode_end(GetBitContext *gb);
 
 
-void ff_bgmc_decode(BitstreamContext *bc, unsigned int num, int32_t *dst,
-                    int delta, unsigned int sx,
-                    unsigned int *h, unsigned int *l, unsigned int *v,
-                    uint8_t *cf_lut, int *cf_lut_status);
+void ff_bgmc_decode(GetBitContext *gb, unsigned int num, int32_t *dst,
+                 int delta, unsigned int sx,
+                 unsigned int *h, unsigned int *l, unsigned int *v,
+                 uint8_t *cf_lut, int *cf_lut_status);
 
 
 #endif /* AVCODEC_BGMC_H */
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index 98fc46e..6673afa 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Konstantin Shishkov
  * Copyright (C) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,11 @@
 #include "avcodec.h"
 #include "binkdata.h"
 #include "binkdsp.h"
-#include "bitstream.h"
 #include "blockdsp.h"
+#include "get_bits.h"
 #include "hpeldsp.h"
 #include "internal.h"
 #include "mathops.h"
-#include "vlc.h"
 
 #define BINK_FLAG_ALPHA 0x00100000
 #define BINK_FLAG_GRAY  0x00020000
@@ -94,9 +93,8 @@ typedef struct Tree {
     uint8_t syms[16]; ///< leaf value to symbol mapping
 } Tree;
 
-#define GET_HUFF(bc, tree)                                                \
-    (tree).syms[bitstream_read_vlc(bc, bink_trees[(tree).vlc_num].table,  \
-                                   bink_trees[(tree).vlc_num].bits, 1)]
+#define GET_HUFF(gb, tree)  (tree).syms[get_vlc2(gb, bink_trees[(tree).vlc_num].table,\
+                                                 bink_trees[(tree).vlc_num].bits, 1)]
 
 /**
  * data structure used for decoding single Bink data type
@@ -122,6 +120,7 @@ typedef struct BinkContext {
     int            version;              ///< internal Bink file version
     int            has_alpha;
     int            swap_planes;
+    unsigned       frame_num;
 
     Bundle         bundle[BINKB_NB_SRC]; ///< bundles for decoding all data types
     Tree           col_high[16];         ///< trees for decoding high nibble in "colours" data type
@@ -145,7 +144,7 @@ enum BlockTypes {
 };
 
 /**
- * Initialize length length in all bundles.
+ * Initialize length in all bundles.
  *
  * @param c     decoder context
  * @param width plane width
@@ -176,7 +175,7 @@ static void init_lengths(BinkContext *c, int width, int bw)
  *
  * @param c decoder context
  */
-static av_cold void init_bundles(BinkContext *c)
+static av_cold int init_bundles(BinkContext *c)
 {
     int bw, bh, blocks;
     int i;
@@ -186,9 +185,13 @@ static av_cold void init_bundles(BinkContext *c)
     blocks = bw * bh;
 
     for (i = 0; i < BINKB_NB_SRC; i++) {
-        c->bundle[i].data = av_malloc(blocks * 64);
+        c->bundle[i].data = av_mallocz(blocks * 64);
+        if (!c->bundle[i].data)
+            return AVERROR(ENOMEM);
         c->bundle[i].data_end = c->bundle[i].data + blocks * 64;
     }
+
+    return 0;
 }
 
 /**
@@ -206,18 +209,18 @@ static av_cold void free_bundles(BinkContext *c)
 /**
  * Merge two consequent lists of equal size depending on bits read.
  *
- * @param bc   context for reading bits
+ * @param gb   context for reading bits
  * @param dst  buffer where merged list will be written to
  * @param src  pointer to the head of the first list (the second lists starts at src+size)
  * @param size input lists size
  */
-static void merge(BitstreamContext *bc, uint8_t *dst, uint8_t *src, int size)
+static void merge(GetBitContext *gb, uint8_t *dst, uint8_t *src, int size)
 {
     uint8_t *src2 = src + size;
     int size2 = size;
 
     do {
-        if (!bitstream_read_bit(bc)) {
+        if (!get_bits1(gb)) {
             *dst++ = *src++;
             size--;
         } else {
@@ -235,37 +238,37 @@ static void merge(BitstreamContext *bc, uint8_t *dst, uint8_t *src, int size)
 /**
  * Read information about Huffman tree used to decode data.
  *
- * @param bc   context for reading bits
+ * @param gb   context for reading bits
  * @param tree pointer for storing tree data
  */
-static void read_tree(BitstreamContext *bc, Tree *tree)
+static void read_tree(GetBitContext *gb, Tree *tree)
 {
     uint8_t tmp1[16] = { 0 }, tmp2[16], *in = tmp1, *out = tmp2;
     int i, t, len;
 
-    tree->vlc_num = bitstream_read(bc, 4);
+    tree->vlc_num = get_bits(gb, 4);
     if (!tree->vlc_num) {
         for (i = 0; i < 16; i++)
             tree->syms[i] = i;
         return;
     }
-    if (bitstream_read_bit(bc)) {
-        len = bitstream_read(bc, 3);
+    if (get_bits1(gb)) {
+        len = get_bits(gb, 3);
         for (i = 0; i <= len; i++) {
-            tree->syms[i] = bitstream_read(bc, 4);
+            tree->syms[i] = get_bits(gb, 4);
             tmp1[tree->syms[i]] = 1;
         }
         for (i = 0; i < 16 && len < 16 - 1; i++)
             if (!tmp1[i])
                 tree->syms[++len] = i;
     } else {
-        len = bitstream_read(bc, 2);
+        len = get_bits(gb, 2);
         for (i = 0; i < 16; i++)
             in[i] = i;
         for (i = 0; i <= len; i++) {
             int size = 1 << i;
             for (t = 0; t < 16; t += size << 1)
-                merge(bc, out + t, in + t, size);
+                merge(gb, out + t, in + t, size);
             FFSWAP(uint8_t*, in, out);
         }
         memcpy(tree->syms, in, 16);
@@ -275,21 +278,21 @@ static void read_tree(BitstreamContext *bc, Tree *tree)
 /**
  * Prepare bundle for decoding data.
  *
- * @param bc          context for reading bits
+ * @param gb          context for reading bits
  * @param c           decoder context
  * @param bundle_num  number of the bundle to initialize
  */
-static void read_bundle(BitstreamContext *bc, BinkContext *c, int bundle_num)
+static void read_bundle(GetBitContext *gb, BinkContext *c, int bundle_num)
 {
     int i;
 
     if (bundle_num == BINK_SRC_COLORS) {
         for (i = 0; i < 16; i++)
-            read_tree(bc, &c->col_high[i]);
+            read_tree(gb, &c->col_high[i]);
         c->col_lastval = 0;
     }
     if (bundle_num != BINK_SRC_INTRA_DC && bundle_num != BINK_SRC_INTER_DC)
-        read_tree(bc, &c->bundle[bundle_num].tree);
+        read_tree(gb, &c->bundle[bundle_num].tree);
     c->bundle[bundle_num].cur_dec =
     c->bundle[bundle_num].cur_ptr = c->bundle[bundle_num].data;
 }
@@ -297,64 +300,66 @@ static void read_bundle(BitstreamContext *bc, BinkContext *c, int bundle_num)
 /**
  * common check before starting decoding bundle data
  *
- * @param bc context for reading bits
+ * @param gb context for reading bits
  * @param b  bundle
  * @param t  variable where number of elements to decode will be stored
  */
-#define CHECK_READ_VAL(bc, b, t) \
+#define CHECK_READ_VAL(gb, b, t) \
     if (!b->cur_dec || (b->cur_dec > b->cur_ptr)) \
         return 0; \
-    t = bitstream_read(bc, b->len); \
+    t = get_bits(gb, b->len); \
     if (!t) { \
         b->cur_dec = NULL; \
         return 0; \
     } \
 
-static int read_runs(AVCodecContext *avctx, BitstreamContext *bc, Bundle *b)
+static int read_runs(AVCodecContext *avctx, GetBitContext *gb, Bundle *b)
 {
     int t, v;
     const uint8_t *dec_end;
 
-    CHECK_READ_VAL(bc, b, t);
+    CHECK_READ_VAL(gb, b, t);
     dec_end = b->cur_dec + t;
     if (dec_end > b->data_end) {
         av_log(avctx, AV_LOG_ERROR, "Run value went out of bounds\n");
         return AVERROR_INVALIDDATA;
     }
-    if (bitstream_read_bit(bc)) {
-        v = bitstream_read(bc, 4);
+    if (get_bits1(gb)) {
+        v = get_bits(gb, 4);
         memset(b->cur_dec, v, t);
         b->cur_dec += t;
     } else {
         while (b->cur_dec < dec_end)
-            *b->cur_dec++ = GET_HUFF(bc, b->tree);
+            *b->cur_dec++ = GET_HUFF(gb, b->tree);
     }
     return 0;
 }
 
-static int read_motion_values(AVCodecContext *avctx, BitstreamContext *bc, Bundle *b)
+static int read_motion_values(AVCodecContext *avctx, GetBitContext *gb, Bundle *b)
 {
-    int t, v;
+    int t, sign, v;
     const uint8_t *dec_end;
 
-    CHECK_READ_VAL(bc, b, t);
+    CHECK_READ_VAL(gb, b, t);
     dec_end = b->cur_dec + t;
     if (dec_end > b->data_end) {
         av_log(avctx, AV_LOG_ERROR, "Too many motion values\n");
         return AVERROR_INVALIDDATA;
     }
-    if (bitstream_read_bit(bc)) {
-        v = bitstream_read(bc, 4);
+    if (get_bits1(gb)) {
+        v = get_bits(gb, 4);
         if (v) {
-            v = bitstream_apply_sign(bc, v);
+            sign = -get_bits1(gb);
+            v = (v ^ sign) - sign;
         }
         memset(b->cur_dec, v, t);
         b->cur_dec += t;
     } else {
         while (b->cur_dec < dec_end) {
-            v = GET_HUFF(bc, b->tree);
+            v = GET_HUFF(gb, b->tree);
             if (v) {
-                v = bitstream_apply_sign(bc, v);
+                sign = -get_bits1(gb);
+                v = (v ^ sign) - sign;
             }
             *b->cur_dec++ = v;
         }
@@ -364,25 +369,33 @@ static int read_motion_values(AVCodecContext *avctx, BitstreamContext *bc, Bundl
 
 static const uint8_t bink_rlelens[4] = { 4, 8, 12, 32 };
 
-static int read_block_types(AVCodecContext *avctx, BitstreamContext *bc, Bundle *b)
+static int read_block_types(AVCodecContext *avctx, GetBitContext *gb, Bundle *b)
 {
+    BinkContext * const c = avctx->priv_data;
     int t, v;
     int last = 0;
     const uint8_t *dec_end;
 
-    CHECK_READ_VAL(bc, b, t);
+    CHECK_READ_VAL(gb, b, t);
+    if (c->version == 'k') {
+        t ^= 0xBBu;
+        if (t == 0) {
+            b->cur_dec = NULL;
+            return 0;
+        }
+    }
     dec_end = b->cur_dec + t;
     if (dec_end > b->data_end) {
         av_log(avctx, AV_LOG_ERROR, "Too many block type values\n");
         return AVERROR_INVALIDDATA;
     }
-    if (bitstream_read_bit(bc)) {
-        v = bitstream_read(bc, 4);
+    if (get_bits1(gb)) {
+        v = get_bits(gb, 4);
         memset(b->cur_dec, v, t);
         b->cur_dec += t;
     } else {
         while (b->cur_dec < dec_end) {
-            v = GET_HUFF(bc, b->tree);
+            v = GET_HUFF(gb, b->tree);
             if (v < 12) {
                 last = v;
                 *b->cur_dec++ = v;
@@ -399,40 +412,40 @@ static int read_block_types(AVCodecContext *avctx, BitstreamContext *bc, Bundle
     return 0;
 }
 
-static int read_patterns(AVCodecContext *avctx, BitstreamContext *bc, Bundle *b)
+static int read_patterns(AVCodecContext *avctx, GetBitContext *gb, Bundle *b)
 {
     int t, v;
     const uint8_t *dec_end;
 
-    CHECK_READ_VAL(bc, b, t);
+    CHECK_READ_VAL(gb, b, t);
     dec_end = b->cur_dec + t;
     if (dec_end > b->data_end) {
         av_log(avctx, AV_LOG_ERROR, "Too many pattern values\n");
         return AVERROR_INVALIDDATA;
     }
     while (b->cur_dec < dec_end) {
-        v  = GET_HUFF(bc, b->tree);
-        v |= GET_HUFF(bc, b->tree) << 4;
+        v  = GET_HUFF(gb, b->tree);
+        v |= GET_HUFF(gb, b->tree) << 4;
         *b->cur_dec++ = v;
     }
 
     return 0;
 }
 
-static int read_colors(BitstreamContext *bc, Bundle *b, BinkContext *c)
+static int read_colors(GetBitContext *gb, Bundle *b, BinkContext *c)
 {
     int t, sign, v;
     const uint8_t *dec_end;
 
-    CHECK_READ_VAL(bc, b, t);
+    CHECK_READ_VAL(gb, b, t);
     dec_end = b->cur_dec + t;
     if (dec_end > b->data_end) {
         av_log(c->avctx, AV_LOG_ERROR, "Too many color values\n");
         return AVERROR_INVALIDDATA;
     }
-    if (bitstream_read_bit(bc)) {
-        c->col_lastval = GET_HUFF(bc, c->col_high[c->col_lastval]);
-        v = GET_HUFF(bc, b->tree);
+    if (get_bits1(gb)) {
+        c->col_lastval = GET_HUFF(gb, c->col_high[c->col_lastval]);
+        v = GET_HUFF(gb, b->tree);
         v = (c->col_lastval << 4) | v;
         if (c->version < 'i') {
             sign = ((int8_t) v) >> 7;
@@ -443,8 +456,8 @@ static int read_colors(BitstreamContext *bc, Bundle *b, BinkContext *c)
         b->cur_dec += t;
     } else {
         while (b->cur_dec < dec_end) {
-            c->col_lastval = GET_HUFF(bc, c->col_high[c->col_lastval]);
-            v = GET_HUFF(bc, b->tree);
+            c->col_lastval = GET_HUFF(gb, c->col_high[c->col_lastval]);
+            v = GET_HUFF(gb, b->tree);
             v = (c->col_lastval << 4) | v;
             if (c->version < 'i') {
                 sign = ((int8_t) v) >> 7;
@@ -460,17 +473,18 @@ static int read_colors(BitstreamContext *bc, Bundle *b, BinkContext *c)
 /** number of bits used to store first DC value in bundle */
 #define DC_START_BITS 11
 
-static int read_dcs(AVCodecContext *avctx, BitstreamContext *bc, Bundle *b,
+static int read_dcs(AVCodecContext *avctx, GetBitContext *gb, Bundle *b,
                     int start_bits, int has_sign)
 {
-    int i, j, len, len2, bsize, v, v2;
+    int i, j, len, len2, bsize, sign, v, v2;
     int16_t *dst     = (int16_t*)b->cur_dec;
     int16_t *dst_end = (int16_t*)b->data_end;
 
-    CHECK_READ_VAL(bc, b, len);
-    v = bitstream_read(bc, start_bits - has_sign);
+    CHECK_READ_VAL(gb, b, len);
+    v = get_bits(gb, start_bits - has_sign);
     if (v && has_sign) {
-        v = bitstream_apply_sign(bc, v);
+        sign = -get_bits1(gb);
+        v = (v ^ sign) - sign;
     }
     if (dst_end - dst < 1)
         return AVERROR_INVALIDDATA;
@@ -480,12 +494,13 @@ static int read_dcs(AVCodecContext *avctx, BitstreamContext *bc, Bundle *b,
         len2 = FFMIN(len - i, 8);
         if (dst_end - dst < len2)
             return AVERROR_INVALIDDATA;
-        bsize = bitstream_read(bc, 4);
+        bsize = get_bits(gb, 4);
         if (bsize) {
             for (j = 0; j < len2; j++) {
-                v2 = bitstream_read(bc, bsize);
+                v2 = get_bits(gb, bsize);
                 if (v2) {
-                    v2 = bitstream_apply_sign(bc, v2);
+                    sign = -get_bits1(gb);
+                    v2 = (v2 ^ sign) - sign;
                 }
                 v += v2;
                 *dst++ = v;
@@ -537,7 +552,7 @@ static av_cold void binkb_init_bundles(BinkContext *c)
         binkb_init_bundle(c, i);
 }
 
-static int binkb_read_bundle(BinkContext *c, BitstreamContext *bc, int bundle_num)
+static int binkb_read_bundle(BinkContext *c, GetBitContext *gb, int bundle_num)
 {
     const int bits = binkb_bundle_sizes[bundle_num];
     const int mask = 1 << (bits - 1);
@@ -545,26 +560,26 @@ static int binkb_read_bundle(BinkContext *c, BitstreamContext *bc, int bundle_nu
     Bundle *b = &c->bundle[bundle_num];
     int i, len;
 
-    CHECK_READ_VAL(bc, b, len);
+    CHECK_READ_VAL(gb, b, len);
     if (b->data_end - b->cur_dec < len * (1 + (bits > 8)))
         return AVERROR_INVALIDDATA;
     if (bits <= 8) {
         if (!issigned) {
             for (i = 0; i < len; i++)
-                *b->cur_dec++ = bitstream_read(bc, bits);
+                *b->cur_dec++ = get_bits(gb, bits);
         } else {
             for (i = 0; i < len; i++)
-                *b->cur_dec++ = bitstream_read(bc, bits) - mask;
+                *b->cur_dec++ = get_bits(gb, bits) - mask;
         }
     } else {
         int16_t *dst = (int16_t*)b->cur_dec;
 
         if (!issigned) {
             for (i = 0; i < len; i++)
-                *dst++ = bitstream_read(bc, bits);
+                *dst++ = get_bits(gb, bits);
         } else {
             for (i = 0; i < len; i++)
-                *dst++ = bitstream_read(bc, bits) - mask;
+                *dst++ = get_bits(gb, bits) - mask;
         }
         b->cur_dec = (uint8_t*)dst;
     }
@@ -588,19 +603,19 @@ static inline int binkb_get_value(BinkContext *c, int bundle_num)
 /**
  * Read 8x8 block of DCT coefficients.
  *
- * @param bc       context for reading bits
+ * @param gb       context for reading bits
  * @param block    place for storing coefficients
  * @param scan     scan order table
  * @param quant_matrices quantization matrices
  * @return 0 for success, negative value in other cases
  */
-static int read_dct_coeffs(BitstreamContext *bc, int32_t block[64],
+static int read_dct_coeffs(BinkContext *c, GetBitContext *gb, int32_t block[64],
                            const uint8_t *scan, int *coef_count_,
                            int coef_idx[64], int q)
 {
     int coef_list[128];
     int mode_list[128];
-    int i, t, bits, ccoef, mode;
+    int i, t, bits, ccoef, mode, sign;
     int list_start = 64, list_end = 64, list_pos;
     int coef_count = 0;
     int quant_idx;
@@ -612,10 +627,10 @@ static int read_dct_coeffs(BitstreamContext *bc, int32_t block[64],
     coef_list[list_end] = 2;  mode_list[list_end++] = 3;
     coef_list[list_end] = 3;  mode_list[list_end++] = 3;
 
-    for (bits = bitstream_read(bc, 4) - 1; bits >= 0; bits--) {
+    for (bits = get_bits(gb, 4) - 1; bits >= 0; bits--) {
         list_pos = list_start;
         while (list_pos < list_end) {
-            if (!(mode_list[list_pos] | coef_list[list_pos]) || !bitstream_read_bit(bc)) {
+            if (!(mode_list[list_pos] | coef_list[list_pos]) || !get_bits1(gb)) {
                 list_pos++;
                 continue;
             }
@@ -631,15 +646,16 @@ static int read_dct_coeffs(BitstreamContext *bc, int32_t block[64],
                     mode_list[list_pos++] = 0;
                 }
                 for (i = 0; i < 4; i++, ccoef++) {
-                    if (bitstream_read_bit(bc)) {
+                    if (get_bits1(gb)) {
                         coef_list[--list_start] = ccoef;
                         mode_list[  list_start] = 3;
                     } else {
                         if (!bits) {
-                            t = 1 - (bitstream_read_bit(bc) << 1);
+                            t = 1 - (get_bits1(gb) << 1);
                         } else {
-                            t = bitstream_read(bc, bits) | 1 << bits;
-                            t = bitstream_apply_sign(bc, t);
+                            t = get_bits(gb, bits) | 1 << bits;
+                            sign = -get_bits1(gb);
+                            t = (t ^ sign) - sign;
                         }
                         block[scan[ccoef]] = t;
                         coef_idx[coef_count++] = ccoef;
@@ -656,10 +672,11 @@ static int read_dct_coeffs(BitstreamContext *bc, int32_t block[64],
                 break;
             case 3:
                 if (!bits) {
-                    t = 1 - (bitstream_read_bit(bc) << 1);
+                    t = 1 - (get_bits1(gb) << 1);
                 } else {
-                    t = bitstream_read(bc, bits) | 1 << bits;
-                    t = bitstream_apply_sign(bc, t);
+                    t = get_bits(gb, bits) | 1 << bits;
+                    sign = -get_bits1(gb);
+                    t = (t ^ sign) - sign;
                 }
                 block[scan[ccoef]] = t;
                 coef_idx[coef_count++] = ccoef;
@@ -671,14 +688,15 @@ static int read_dct_coeffs(BitstreamContext *bc, int32_t block[64],
     }
 
     if (q == -1) {
-        quant_idx = bitstream_read(bc, 4);
+        quant_idx = get_bits(gb, 4);
     } else {
         quant_idx = q;
+        if (quant_idx > 15U) {
+            av_log(c->avctx, AV_LOG_ERROR, "quant_index %d out of range\n", quant_idx);
+            return AVERROR_INVALIDDATA;
+        }
     }
 
-    if (quant_idx >= 16)
-        return AVERROR_INVALIDDATA;
-
     *coef_count_ = coef_count;
 
     return quant_idx;
@@ -699,16 +717,16 @@ static void unquantize_dct_coeffs(int32_t block[64], const int32_t quant[64],
 /**
  * Read 8x8 block with residue after motion compensation.
  *
- * @param bc          context for reading bits
+ * @param gb          context for reading bits
  * @param block       place to store read data
  * @param masks_count number of masks to decode
  * @return 0 on success, negative value in other cases
  */
-static int read_residue(BitstreamContext *bc, int16_t block[64], int masks_count)
+static int read_residue(GetBitContext *gb, int16_t block[64], int masks_count)
 {
     int coef_list[128];
     int mode_list[128];
-    int i, mask, ccoef, mode;
+    int i, sign, mask, ccoef, mode;
     int list_start = 64, list_end = 64, list_pos;
     int nz_coeff[64];
     int nz_coeff_count = 0;
@@ -718,9 +736,9 @@ static int read_residue(BitstreamContext *bc, int16_t block[64], int masks_count
     coef_list[list_end] = 44; mode_list[list_end++] = 0;
     coef_list[list_end] =  0; mode_list[list_end++] = 2;
 
-    for (mask = 1 << bitstream_read(bc, 3); mask; mask >>= 1) {
+    for (mask = 1 << get_bits(gb, 3); mask; mask >>= 1) {
         for (i = 0; i < nz_coeff_count; i++) {
-            if (!bitstream_read_bit(bc))
+            if (!get_bits1(gb))
                 continue;
             if (block[nz_coeff[i]] < 0)
                 block[nz_coeff[i]] -= mask;
@@ -732,7 +750,7 @@ static int read_residue(BitstreamContext *bc, int16_t block[64], int masks_count
         }
         list_pos = list_start;
         while (list_pos < list_end) {
-            if (!(coef_list[list_pos] | mode_list[list_pos]) || !bitstream_read_bit(bc)) {
+            if (!(coef_list[list_pos] | mode_list[list_pos]) || !get_bits1(gb)) {
                 list_pos++;
                 continue;
             }
@@ -748,12 +766,13 @@ static int read_residue(BitstreamContext *bc, int16_t block[64], int masks_count
                     mode_list[list_pos++] = 0;
                 }
                 for (i = 0; i < 4; i++, ccoef++) {
-                    if (bitstream_read_bit(bc)) {
+                    if (get_bits1(gb)) {
                         coef_list[--list_start] = ccoef;
                         mode_list[  list_start] = 3;
                     } else {
                         nz_coeff[nz_coeff_count++] = bink_scan[ccoef];
-                        block[bink_scan[ccoef]] = bitstream_apply_sign(bc, mask);
+                        sign = -get_bits1(gb);
+                        block[bink_scan[ccoef]] = (mask ^ sign) - sign;
                         masks_count--;
                         if (masks_count < 0)
                             return 0;
@@ -770,7 +789,8 @@ static int read_residue(BitstreamContext *bc, int16_t block[64], int masks_count
                 break;
             case 3:
                 nz_coeff[nz_coeff_count++] = bink_scan[ccoef];
-                block[bink_scan[ccoef]] = bitstream_apply_sign(bc, mask);
+                sign = -get_bits1(gb);
+                block[bink_scan[ccoef]] = (mask ^ sign) - sign;
                 coef_list[list_pos]   = 0;
                 mode_list[list_pos++] = 0;
                 masks_count--;
@@ -797,7 +817,7 @@ static inline void put_pixels8x8_overlapped(uint8_t *dst, uint8_t *src, int stri
         memcpy(dst + i*stride, tmp + i*8, 8);
 }
 
-static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *bc,
+static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                               int plane_idx, int is_key, int is_chroma)
 {
     int blk, ret;
@@ -806,7 +826,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
     int v, col[2];
     const uint8_t *scan;
     int xoff, yoff;
-    LOCAL_ALIGNED_16(int16_t, block, [64]);
+    LOCAL_ALIGNED_32(int16_t, block, [64]);
     LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
     int coordmap[64];
     int ybias = is_key ? -15 : 0;
@@ -825,7 +845,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
 
     for (by = 0; by < bh; by++) {
         for (i = 0; i < BINKB_NB_SRC; i++) {
-            if ((ret = binkb_read_bundle(c, bc, i)) < 0)
+            if ((ret = binkb_read_bundle(c, gb, i)) < 0)
                 return ret;
         }
 
@@ -836,11 +856,13 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
             case 0:
                 break;
             case 1:
-                scan = bink_patterns[bitstream_read(bc, 4)];
+                scan = bink_patterns[get_bits(gb, 4)];
                 i = 0;
                 do {
-                    int mode = bitstream_read_bit(bc);
-                    int run  = bitstream_read(bc, binkb_runbits[i]) + 1;
+                    int mode, run;
+
+                    mode = get_bits1(gb);
+                    run = get_bits(gb, binkb_runbits[i]) + 1;
 
                     i += run;
                     if (i > 64) {
@@ -863,7 +885,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = binkb_get_value(c, BINKB_SRC_INTRA_DC);
                 qp = binkb_get_value(c, BINKB_SRC_INTRA_Q);
-                if ((quant_idx = read_dct_coeffs(bc, dctblock, bink_scan, &coef_count, coef_idx, qp)) < 0)
+                if ((quant_idx = read_dct_coeffs(c, gb, dctblock, bink_scan, &coef_count, coef_idx, qp)) < 0)
                     return quant_idx;
                 unquantize_dct_coeffs(dctblock, binkb_intra_quant[quant_idx], coef_count, coef_idx, bink_scan);
                 c->binkdsp.idct_put(dst, stride, dctblock);
@@ -881,7 +903,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
                 }
                 c->bdsp.clear_block(block);
                 v = binkb_get_value(c, BINKB_SRC_INTER_COEFS);
-                read_residue(bc, block, v);
+                read_residue(gb, block, v);
                 c->binkdsp.add_pixels8(dst, block, stride);
                 break;
             case 4:
@@ -898,7 +920,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = binkb_get_value(c, BINKB_SRC_INTER_DC);
                 qp = binkb_get_value(c, BINKB_SRC_INTER_Q);
-                if ((quant_idx = read_dct_coeffs(bc, dctblock, bink_scan, &coef_count, coef_idx, qp)) < 0)
+                if ((quant_idx = read_dct_coeffs(c, gb, dctblock, bink_scan, &coef_count, coef_idx, qp)) < 0)
                     return quant_idx;
                 unquantize_dct_coeffs(dctblock, binkb_inter_quant[quant_idx], coef_count, coef_idx, bink_scan);
                 c->binkdsp.idct_add(dst, stride, dctblock);
@@ -939,8 +961,8 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *
             }
         }
     }
-    if (bitstream_tell(bc) & 0x1F) // next plane data starts at 32-bit boundary
-        bitstream_skip(bc, 32 - (bitstream_tell(bc) & 0x1F));
+    if (get_bits_count(gb) & 0x1F) //next plane data starts at 32-bit boundary
+        skip_bits_long(gb, 32 - (get_bits_count(gb) & 0x1F));
 
     return 0;
 }
@@ -963,7 +985,7 @@ static int bink_put_pixels(BinkContext *c,
     return 0;
 }
 
-static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *bc,
+static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                              int plane_idx, int is_chroma)
 {
     int blk, ret;
@@ -971,7 +993,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
     uint8_t *dst, *prev, *ref_start, *ref_end;
     int v, col[2];
     const uint8_t *scan;
-    LOCAL_ALIGNED_16(int16_t, block, [64]);
+    LOCAL_ALIGNED_32(int16_t, block, [64]);
     LOCAL_ALIGNED_16(uint8_t, ublock, [64]);
     LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
     int coordmap[64], quant_idx, coef_count, coef_idx[64];
@@ -980,10 +1002,21 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
     int bw = is_chroma ? (c->avctx->width  + 15) >> 4 : (c->avctx->width  + 7) >> 3;
     int bh = is_chroma ? (c->avctx->height + 15) >> 4 : (c->avctx->height + 7) >> 3;
     int width = c->avctx->width >> is_chroma;
+    int height = c->avctx->height >> is_chroma;
+
+    if (c->version == 'k' && get_bits1(gb)) {
+        int fill = get_bits(gb, 8);
+
+        dst = frame->data[plane_idx];
+
+        for (i = 0; i < height; i++)
+            memset(dst + i * stride, fill, width);
+        goto end;
+    }
 
     init_lengths(c, FFMAX(width, 8), bw);
     for (i = 0; i < BINK_NB_SRC; i++)
-        read_bundle(bc, c, i);
+        read_bundle(gb, c, i);
 
     ref_start = c->last->data[plane_idx] ? c->last->data[plane_idx]
                                          : frame->data[plane_idx];
@@ -994,23 +1027,23 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
         coordmap[i] = (i & 7) + (i >> 3) * stride;
 
     for (by = 0; by < bh; by++) {
-        if ((ret = read_block_types(c->avctx, bc, &c->bundle[BINK_SRC_BLOCK_TYPES])) < 0)
+        if ((ret = read_block_types(c->avctx, gb, &c->bundle[BINK_SRC_BLOCK_TYPES])) < 0)
             return ret;
-        if ((ret = read_block_types(c->avctx, bc, &c->bundle[BINK_SRC_SUB_BLOCK_TYPES])) < 0)
+        if ((ret = read_block_types(c->avctx, gb, &c->bundle[BINK_SRC_SUB_BLOCK_TYPES])) < 0)
             return ret;
-        if ((ret = read_colors(bc, &c->bundle[BINK_SRC_COLORS], c)) < 0)
+        if ((ret = read_colors(gb, &c->bundle[BINK_SRC_COLORS], c)) < 0)
             return ret;
-        if ((ret = read_patterns(c->avctx, bc, &c->bundle[BINK_SRC_PATTERN])) < 0)
+        if ((ret = read_patterns(c->avctx, gb, &c->bundle[BINK_SRC_PATTERN])) < 0)
             return ret;
-        if ((ret = read_motion_values(c->avctx, bc, &c->bundle[BINK_SRC_X_OFF])) < 0)
+        if ((ret = read_motion_values(c->avctx, gb, &c->bundle[BINK_SRC_X_OFF])) < 0)
             return ret;
-        if ((ret = read_motion_values(c->avctx, bc, &c->bundle[BINK_SRC_Y_OFF])) < 0)
+        if ((ret = read_motion_values(c->avctx, gb, &c->bundle[BINK_SRC_Y_OFF])) < 0)
             return ret;
-        if ((ret = read_dcs(c->avctx, bc, &c->bundle[BINK_SRC_INTRA_DC], DC_START_BITS, 0)) < 0)
+        if ((ret = read_dcs(c->avctx, gb, &c->bundle[BINK_SRC_INTRA_DC], DC_START_BITS, 0)) < 0)
             return ret;
-        if ((ret = read_dcs(c->avctx, bc, &c->bundle[BINK_SRC_INTER_DC], DC_START_BITS, 1)) < 0)
+        if ((ret = read_dcs(c->avctx, gb, &c->bundle[BINK_SRC_INTER_DC], DC_START_BITS, 1)) < 0)
             return ret;
-        if ((ret = read_runs(c->avctx, bc, &c->bundle[BINK_SRC_RUN])) < 0)
+        if ((ret = read_runs(c->avctx, gb, &c->bundle[BINK_SRC_RUN])) < 0)
             return ret;
 
         if (by == bh)
@@ -1035,7 +1068,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                 blk = get_value(c, BINK_SRC_SUB_BLOCK_TYPES);
                 switch (blk) {
                 case RUN_BLOCK:
-                    scan = bink_patterns[bitstream_read(bc, 4)];
+                    scan = bink_patterns[get_bits(gb, 4)];
                     i = 0;
                     do {
                         int run = get_value(c, BINK_SRC_RUN) + 1;
@@ -1045,7 +1078,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                             av_log(c->avctx, AV_LOG_ERROR, "Run went out of bounds\n");
                             return AVERROR_INVALIDDATA;
                         }
-                        if (bitstream_read_bit(bc)) {
+                        if (get_bits1(gb)) {
                             v = get_value(c, BINK_SRC_COLORS);
                             for (j = 0; j < run; j++)
                                 ublock[*scan++] = v;
@@ -1060,7 +1093,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                 case INTRA_BLOCK:
                     memset(dctblock, 0, sizeof(*dctblock) * 64);
                     dctblock[0] = get_value(c, BINK_SRC_INTRA_DC);
-                    if ((quant_idx = read_dct_coeffs(bc, dctblock, bink_scan, &coef_count, coef_idx, -1)) < 0)
+                    if ((quant_idx = read_dct_coeffs(c, gb, dctblock, bink_scan, &coef_count, coef_idx, -1)) < 0)
                         return quant_idx;
                     unquantize_dct_coeffs(dctblock, bink_intra_quant[quant_idx], coef_count, coef_idx, bink_scan);
                     c->binkdsp.idct_put(ublock, 8, dctblock);
@@ -1100,7 +1133,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                     return ret;
                 break;
             case RUN_BLOCK:
-                scan = bink_patterns[bitstream_read(bc, 4)];
+                scan = bink_patterns[get_bits(gb, 4)];
                 i = 0;
                 do {
                     int run = get_value(c, BINK_SRC_RUN) + 1;
@@ -1110,7 +1143,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                         av_log(c->avctx, AV_LOG_ERROR, "Run went out of bounds\n");
                         return AVERROR_INVALIDDATA;
                     }
-                    if (bitstream_read_bit(bc)) {
+                    if (get_bits1(gb)) {
                         v = get_value(c, BINK_SRC_COLORS);
                         for (j = 0; j < run; j++)
                             dst[coordmap[*scan++]] = v;
@@ -1128,14 +1161,14 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                 if (ret < 0)
                     return ret;
                 c->bdsp.clear_block(block);
-                v = bitstream_read(bc, 7);
-                read_residue(bc, block, v);
+                v = get_bits(gb, 7);
+                read_residue(gb, block, v);
                 c->binkdsp.add_pixels8(dst, block, stride);
                 break;
             case INTRA_BLOCK:
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = get_value(c, BINK_SRC_INTRA_DC);
-                if ((quant_idx = read_dct_coeffs(bc, dctblock, bink_scan, &coef_count, coef_idx, -1)) < 0)
+                if ((quant_idx = read_dct_coeffs(c, gb, dctblock, bink_scan, &coef_count, coef_idx, -1)) < 0)
                     return quant_idx;
                 unquantize_dct_coeffs(dctblock, bink_intra_quant[quant_idx], coef_count, coef_idx, bink_scan);
                 c->binkdsp.idct_put(dst, stride, dctblock);
@@ -1151,7 +1184,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
                     return ret;
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = get_value(c, BINK_SRC_INTER_DC);
-                if ((quant_idx = read_dct_coeffs(bc, dctblock, bink_scan, &coef_count, coef_idx, -1)) < 0)
+                if ((quant_idx = read_dct_coeffs(c, gb, dctblock, bink_scan, &coef_count, coef_idx, -1)) < 0)
                     return quant_idx;
                 unquantize_dct_coeffs(dctblock, bink_inter_quant[quant_idx], coef_count, coef_idx, bink_scan);
                 c->binkdsp.idct_add(dst, stride, dctblock);
@@ -1176,8 +1209,10 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, BitstreamContext *b
             }
         }
     }
-    if (bitstream_tell(bc) & 0x1F) // next plane data starts at 32-bit boundary
-        bitstream_skip(bc, 32 - (bitstream_tell(bc) & 0x1F));
+
+end:
+    if (get_bits_count(gb) & 0x1F) //next plane data starts at 32-bit boundary
+        skip_bits_long(gb, 32 - (get_bits_count(gb) & 0x1F));
 
     return 0;
 }
@@ -1186,46 +1221,44 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
 {
     BinkContext * const c = avctx->priv_data;
     AVFrame *frame = data;
-    BitstreamContext bc;
+    GetBitContext gb;
     int plane, plane_idx, ret;
     int bits_count = pkt->size << 3;
 
     if (c->version > 'b') {
-        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
             return ret;
-        }
     } else {
-        if ((ret = ff_reget_buffer(avctx, c->last)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        if ((ret = ff_reget_buffer(avctx, c->last)) < 0)
             return ret;
-        }
         if ((ret = av_frame_ref(frame, c->last)) < 0)
             return ret;
     }
 
-    bitstream_init(&bc, pkt->data, bits_count);
+    init_get_bits(&gb, pkt->data, bits_count);
     if (c->has_alpha) {
         if (c->version >= 'i')
-            bitstream_skip(&bc, 32);
-        if ((ret = bink_decode_plane(c, frame, &bc, 3, 0)) < 0)
+            skip_bits_long(&gb, 32);
+        if ((ret = bink_decode_plane(c, frame, &gb, 3, 0)) < 0)
             return ret;
     }
     if (c->version >= 'i')
-        bitstream_skip(&bc, 32);
+        skip_bits_long(&gb, 32);
+
+    c->frame_num++;
 
     for (plane = 0; plane < 3; plane++) {
         plane_idx = (!plane || !c->swap_planes) ? plane : (plane ^ 3);
 
         if (c->version > 'b') {
-            if ((ret = bink_decode_plane(c, frame, &bc, plane_idx, !!plane)) < 0)
+            if ((ret = bink_decode_plane(c, frame, &gb, plane_idx, !!plane)) < 0)
                 return ret;
         } else {
-            if ((ret = binkb_decode_plane(c, frame, &bc, plane_idx,
-                                          !avctx->frame_number, !!plane)) < 0)
+            if ((ret = binkb_decode_plane(c, frame, &gb, plane_idx,
+                                          c->frame_num == 1, !!plane)) < 0)
                 return ret;
         }
-        if (bitstream_tell(&bc) >= bits_count)
+        if (get_bits_count(&gb) >= bits_count)
             break;
     }
     emms_c();
@@ -1248,41 +1281,28 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
 static av_cold void binkb_calc_quant(void)
 {
     uint8_t inv_bink_scan[64];
-    double s[64];
+    static const int s[64]={
+        1073741824,1489322693,1402911301,1262586814,1073741824, 843633538, 581104888, 296244703,
+        1489322693,2065749918,1945893874,1751258219,1489322693,1170153332, 806015634, 410903207,
+        1402911301,1945893874,1832991949,1649649171,1402911301,1102260336, 759250125, 387062357,
+        1262586814,1751258219,1649649171,1484645031,1262586814, 992008094, 683307060, 348346918,
+        1073741824,1489322693,1402911301,1262586814,1073741824, 843633538, 581104888, 296244703,
+         843633538,1170153332,1102260336, 992008094, 843633538, 662838617, 456571181, 232757969,
+         581104888, 806015634, 759250125, 683307060, 581104888, 456571181, 314491699, 160326478,
+         296244703, 410903207, 387062357, 348346918, 296244703, 232757969, 160326478,  81733730,
+    };
     int i, j;
-
-    for (j = 0; j < 8; j++) {
-        for (i = 0; i < 8; i++) {
-            if (j && j != 4)
-               if (i && i != 4)
-                   s[j*8 + i] = cos(j * M_PI/16.0) * cos(i * M_PI/16.0) * 2.0;
-               else
-                   s[j*8 + i] = cos(j * M_PI/16.0) * sqrt(2.0);
-            else
-               if (i && i != 4)
-                   s[j*8 + i] = cos(i * M_PI/16.0) * sqrt(2.0);
-               else
-                   s[j*8 + i] = 1.0;
-        }
-    }
-
+#define C (1LL<<30)
     for (i = 0; i < 64; i++)
         inv_bink_scan[bink_scan[i]] = i;
 
     for (j = 0; j < 16; j++) {
         for (i = 0; i < 64; i++) {
             int k = inv_bink_scan[i];
-            if (s[i] == 1.0) {
-                binkb_intra_quant[j][k] = (1L << 12) * binkb_intra_seed[i] *
-                                          binkb_num[j]/binkb_den[j];
-                binkb_inter_quant[j][k] = (1L << 12) * binkb_inter_seed[i] *
-                                          binkb_num[j]/binkb_den[j];
-            } else {
-                binkb_intra_quant[j][k] = (1L << 12) * binkb_intra_seed[i] * s[i] *
-                                          binkb_num[j]/(double)binkb_den[j];
-                binkb_inter_quant[j][k] = (1L << 12) * binkb_inter_seed[i] * s[i] *
-                                          binkb_num[j]/(double)binkb_den[j];
-            }
+            binkb_intra_quant[j][k] = binkb_intra_seed[i] * (int64_t)s[i] *
+                                        binkb_num[j]/(binkb_den[j] * (C>>12));
+            binkb_inter_quant[j][k] = binkb_inter_seed[i] * (int64_t)s[i] *
+                                        binkb_num[j]/(binkb_den[j] * (C>>12));
         }
     }
 }
@@ -1323,12 +1343,16 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return ret;
 
     avctx->pix_fmt = c->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
+    avctx->color_range = c->version == 'k' ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
 
-    ff_blockdsp_init(&c->bdsp);
+    ff_blockdsp_init(&c->bdsp, avctx);
     ff_hpeldsp_init(&c->hdsp, avctx->flags);
     ff_binkdsp_init(&c->binkdsp);
 
-    init_bundles(c);
+    if ((ret = init_bundles(c)) < 0) {
+        free_bundles(c);
+        return ret;
+    }
 
     if (c->version == 'b') {
         if (!binkb_initialised) {
@@ -1350,6 +1374,13 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+static void flush(AVCodecContext *avctx)
+{
+    BinkContext * const c = avctx->priv_data;
+
+    c->frame_num = 0;
+}
+
 AVCodec ff_bink_decoder = {
     .name           = "binkvideo",
     .long_name      = NULL_IF_CONFIG_SMALL("Bink video"),
@@ -1359,5 +1390,6 @@ AVCodec ff_bink_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c
index 51fb6c8..96cf968 100644
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007-2011 Peter Ross (pross@xvid.org)
  * Copyright (c) 2009 Daniel Verkamp (daniel@drv.nu)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,9 +33,9 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "dct.h"
 #include "decode.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "rdft.h"
 #include "wma_freqs.h"
@@ -46,7 +46,7 @@ static float quant_table[96];
 #define BINK_BLOCK_MAX_SIZE (MAX_CHANNELS << 11)
 
 typedef struct BinkAudioContext {
-    BitstreamContext bc;
+    GetBitContext gb;
     int version_b;          ///< Bink version 'b'
     int first;
     int channels;
@@ -83,14 +83,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
         frame_len_bits = 11;
     }
 
-    if (avctx->channels > MAX_CHANNELS) {
-        av_log(avctx, AV_LOG_ERROR, "too many channels: %d\n", avctx->channels);
-        return -1;
+    if (avctx->channels < 1 || avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels: %d\n", avctx->channels);
+        return AVERROR_INVALIDDATA;
     }
     avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO :
                                                    AV_CH_LAYOUT_STEREO;
 
-    s->version_b = avctx->extradata && avctx->extradata[3] == 'b';
+    s->version_b = avctx->extradata_size >= 4 && avctx->extradata[3] == 'b';
 
     if (avctx->codec->id == AV_CODEC_ID_BINKAUDIO_RDFT) {
         // audio is already interleaved for the RDFT format variant
@@ -139,7 +139,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     else if (CONFIG_BINKAUDIO_DCT_DECODER)
         ff_dct_init(&s->trans.dct, frame_len_bits, DCT_III);
     else
-        return -1;
+        av_assert0(0);
 
     s->pkt = av_packet_alloc();
     if (!s->pkt)
@@ -148,11 +148,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static float get_float(BitstreamContext *bc)
+static float get_float(GetBitContext *gb)
 {
-    int power = bitstream_read(bc, 5);
-    float f = ldexpf(bitstream_read(bc, 23), power - 23);
-    if (bitstream_read_bit(bc))
+    int power = get_bits(gb, 5);
+    float f = ldexpf(get_bits_long(gb, 23), power - 23);
+    if (get_bits1(gb))
         f = -f;
     return f;
 }
@@ -171,30 +171,30 @@ static int decode_block(BinkAudioContext *s, float **out, int use_dct)
     int ch, i, j, k;
     float q, quant[25];
     int width, coeff;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *gb = &s->gb;
 
     if (use_dct)
-        bitstream_skip(bc, 2);
+        skip_bits(gb, 2);
 
     for (ch = 0; ch < s->channels; ch++) {
         FFTSample *coeffs = out[ch];
 
         if (s->version_b) {
-            if (bitstream_bits_left(bc) < 64)
+            if (get_bits_left(gb) < 64)
                 return AVERROR_INVALIDDATA;
-            coeffs[0] = av_int2float(bitstream_read(bc, 32)) * s->root;
-            coeffs[1] = av_int2float(bitstream_read(bc, 32)) * s->root;
+            coeffs[0] = av_int2float(get_bits_long(gb, 32)) * s->root;
+            coeffs[1] = av_int2float(get_bits_long(gb, 32)) * s->root;
         } else {
-            if (bitstream_bits_left(bc) < 58)
+            if (get_bits_left(gb) < 58)
                 return AVERROR_INVALIDDATA;
-            coeffs[0] = get_float(bc) * s->root;
-            coeffs[1] = get_float(bc) * s->root;
+            coeffs[0] = get_float(gb) * s->root;
+            coeffs[1] = get_float(gb) * s->root;
         }
 
-        if (bitstream_bits_left(bc) < s->num_bands * 8)
+        if (get_bits_left(gb) < s->num_bands * 8)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < s->num_bands; i++) {
-            int value = bitstream_read(bc, 8);
+            int value = get_bits(gb, 8);
             quant[i]  = quant_table[FFMIN(value, 95)];
         }
 
@@ -207,9 +207,9 @@ static int decode_block(BinkAudioContext *s, float **out, int use_dct)
             if (s->version_b) {
                 j = i + 16;
             } else {
-                int v = bitstream_read_bit(bc);
+                int v = get_bits1(gb);
                 if (v) {
-                    v = bitstream_read(bc, 4);
+                    v = get_bits(gb, 4);
                     j = i + rle_length_tab[v] * 8;
                 } else {
                     j = i + 8;
@@ -218,7 +218,7 @@ static int decode_block(BinkAudioContext *s, float **out, int use_dct)
 
             j = FFMIN(j, s->frame_len);
 
-            width = bitstream_read(bc, 4);
+            width = get_bits(gb, 4);
             if (width == 0) {
                 memset(coeffs + i, 0, (j - i) * sizeof(*coeffs));
                 i = j;
@@ -228,10 +228,10 @@ static int decode_block(BinkAudioContext *s, float **out, int use_dct)
                 while (i < j) {
                     if (s->bands[k] == i)
                         q = quant[k++];
-                    coeff = bitstream_read(bc, width);
+                    coeff = get_bits(gb, width);
                     if (coeff) {
                         int v;
-                        v = bitstream_read_bit(bc);
+                        v = get_bits1(gb);
                         if (v)
                             coeffs[i] = -q * coeff;
                         else
@@ -284,17 +284,16 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
-static void get_bits_align32(BitstreamContext *s)
+static void get_bits_align32(GetBitContext *s)
 {
-    int n = (-bitstream_tell(s)) & 31;
-    if (n)
-        bitstream_skip(s, n);
+    int n = (-get_bits_count(s)) & 31;
+    if (n) skip_bits(s, n);
 }
 
 static int binkaudio_receive_frame(AVCodecContext *avctx, AVFrame *frame)
 {
     BinkAudioContext *s = avctx->priv_data;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *gb = &s->gb;
     int ret;
 
     if (!s->pkt->data) {
@@ -308,29 +307,27 @@ static int binkaudio_receive_frame(AVCodecContext *avctx, AVFrame *frame)
             goto fail;
         }
 
-        ret = bitstream_init8(bc, s->pkt->data, s->pkt->size);
+        ret = init_get_bits8(gb, s->pkt->data, s->pkt->size);
         if (ret < 0)
             goto fail;
 
         /* skip reported size */
-        bitstream_skip(bc, 32);
+        skip_bits_long(gb, 32);
     }
 
     /* get output buffer */
     frame->nb_samples = s->frame_len;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (decode_block(s, (float **)frame->extended_data,
                      avctx->codec->id == AV_CODEC_ID_BINKAUDIO_DCT)) {
         av_log(avctx, AV_LOG_ERROR, "Incomplete packet\n");
         return AVERROR_INVALIDDATA;
     }
-    get_bits_align32(bc);
-    if (!bitstream_bits_left(bc)) {
-        memset(bc, 0, sizeof(*bc));
+    get_bits_align32(gb);
+    if (!get_bits_left(gb)) {
+        memset(gb, 0, sizeof(*gb));
         av_packet_unref(s->pkt);
     }
 
diff --git a/libavcodec/binkdata.h b/libavcodec/binkdata.h
index 3da6b7e..57619be 100644
--- a/libavcodec/binkdata.h
+++ b/libavcodec/binkdata.h
@@ -2,20 +2,20 @@
  * Bink video decoder
  * Copyright (C) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/binkdsp.c b/libavcodec/binkdsp.c
index 0dfe12c..9d70e23 100644
--- a/libavcodec/binkdsp.c
+++ b/libavcodec/binkdsp.c
@@ -2,20 +2,20 @@
  * Bink DSP routines
  * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -129,7 +129,7 @@ static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align
     }
 }
 
-static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
+static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
                           int line_size)
 {
     int i;
diff --git a/libavcodec/binkdsp.h b/libavcodec/binkdsp.h
index 9524fe2..b089a98 100644
--- a/libavcodec/binkdsp.h
+++ b/libavcodec/binkdsp.h
@@ -2,20 +2,20 @@
  * Bink DSP routines
  * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ typedef struct BinkDSPContext {
     void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, int32_t *block/*align 16*/);
     void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, int32_t *block/*align 16*/);
     void (*scale_block)(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize);
-    void (*add_pixels8)(uint8_t *restrict pixels, int16_t *block, int line_size);
+    void (*add_pixels8)(uint8_t *av_restrict pixels, int16_t *block, int line_size);
 } BinkDSPContext;
 
 void ff_binkdsp_init(BinkDSPContext *c);
diff --git a/libavcodec/bintext.c b/libavcodec/bintext.c
new file mode 100644
index 0000000..d85f2c2
--- /dev/null
+++ b/libavcodec/bintext.c
@@ -0,0 +1,246 @@
+/*
+ * Binary text decoder
+ * eXtended BINary text (XBIN) decoder
+ * iCEDraw File decoder
+ * Copyright (c) 2010 Peter Ross (pross@xvid.org)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Binary text decoder
+ * eXtended BINary text (XBIN) decoder
+ * iCEDraw File decoder
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/xga_font_data.h"
+#include "avcodec.h"
+#include "cga_data.h"
+#include "bintext.h"
+#include "internal.h"
+
+#define FONT_WIDTH 8
+
+typedef struct XbinContext {
+    AVFrame *frame;
+    int palette[16];
+    int flags;
+    int font_height;
+    const uint8_t *font;
+    int x, y;
+} XbinContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    XbinContext *s = avctx->priv_data;
+    uint8_t *p;
+    int i;
+
+    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    p = avctx->extradata;
+    if (p) {
+        s->font_height = p[0];
+        s->flags = p[1];
+        p += 2;
+        if(avctx->extradata_size < 2 + (!!(s->flags & BINTEXT_PALETTE))*3*16
+                                     + (!!(s->flags & BINTEXT_FONT))*s->font_height*256) {
+            av_log(avctx, AV_LOG_ERROR, "not enough extradata\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        s->font_height = 8;
+        s->flags = 0;
+    }
+
+    if ((s->flags & BINTEXT_PALETTE)) {
+        for (i = 0; i < 16; i++) {
+            s->palette[i] = 0xFF000000 | (AV_RB24(p) << 2) | ((AV_RB24(p) >> 4) & 0x30303);
+            p += 3;
+        }
+    } else {
+        for (i = 0; i < 16; i++)
+            s->palette[i] = 0xFF000000 | ff_cga_palette[i];
+    }
+
+    if ((s->flags & BINTEXT_FONT)) {
+        s->font = p;
+    } else {
+        switch(s->font_height) {
+        default:
+            av_log(avctx, AV_LOG_WARNING, "font height %i not supported\n", s->font_height);
+            s->font_height = 8;
+        case 8:
+            s->font = avpriv_cga_font;
+            break;
+        case 16:
+            s->font = avpriv_vga16_font;
+            break;
+        }
+    }
+    if (avctx->width < FONT_WIDTH || avctx->height < s->font_height)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+#define DEFAULT_BG_COLOR 0
+av_unused static void hscroll(AVCodecContext *avctx)
+{
+    XbinContext *s = avctx->priv_data;
+    if (s->y < avctx->height - s->font_height) {
+        s->y += s->font_height;
+    } else {
+        memmove(s->frame->data[0], s->frame->data[0] + s->font_height*s->frame->linesize[0],
+            (avctx->height - s->font_height)*s->frame->linesize[0]);
+        memset(s->frame->data[0] + (avctx->height - s->font_height)*s->frame->linesize[0],
+            DEFAULT_BG_COLOR, s->font_height * s->frame->linesize[0]);
+    }
+}
+
+/**
+ * Draw character to screen
+ */
+static void draw_char(AVCodecContext *avctx, int c, int a)
+{
+    XbinContext *s = avctx->priv_data;
+    if (s->y > avctx->height - s->font_height)
+        return;
+    ff_draw_pc_font(s->frame->data[0] + s->y * s->frame->linesize[0] + s->x,
+                    s->frame->linesize[0], s->font, s->font_height, c,
+                    a & 0x0F, a >> 4);
+    s->x += FONT_WIDTH;
+    if (s->x > avctx->width - FONT_WIDTH) {
+        s->x = 0;
+        s->y += s->font_height;
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_frame,
+                            AVPacket *avpkt)
+{
+    XbinContext *s = avctx->priv_data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    const uint8_t *buf_end = buf+buf_size;
+    int ret;
+
+    if ((avctx->width / FONT_WIDTH) * (avctx->height / s->font_height) / 256 > buf_size)
+        return AVERROR_INVALIDDATA;
+
+    s->frame = data;
+    s->x = s->y = 0;
+    if ((ret = ff_get_buffer(avctx, s->frame, 0)) < 0)
+        return ret;
+    s->frame->pict_type           = AV_PICTURE_TYPE_I;
+    s->frame->palette_has_changed = 1;
+    memcpy(s->frame->data[1], s->palette, 16 * 4);
+
+    if (avctx->codec_id == AV_CODEC_ID_XBIN) {
+        while (buf + 2 < buf_end) {
+            int i,c,a;
+            int type  = *buf >> 6;
+            int count = (*buf & 0x3F) + 1;
+            buf++;
+            switch (type) {
+            case 0: //no compression
+                for (i = 0; i < count && buf + 1 < buf_end; i++) {
+                    draw_char(avctx, buf[0], buf[1]);
+                    buf += 2;
+                }
+                break;
+            case 1: //character compression
+                c = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, c, *buf++);
+                break;
+            case 2: //attribute compression
+                a = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, *buf++, a);
+                break;
+            case 3: //character/attribute compression
+                c = *buf++;
+                a = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, c, a);
+                break;
+            }
+        }
+    } else if (avctx->codec_id == AV_CODEC_ID_IDF) {
+        while (buf + 2 < buf_end) {
+            if (AV_RL16(buf) == 1) {
+               int i;
+               if (buf + 6 > buf_end)
+                   break;
+               for (i = 0; i < buf[2]; i++)
+                   draw_char(avctx, buf[4], buf[5]);
+               buf += 6;
+            } else {
+               draw_char(avctx, buf[0], buf[1]);
+               buf += 2;
+            }
+        }
+    } else {
+        while (buf + 1 < buf_end) {
+            draw_char(avctx, buf[0], buf[1]);
+            buf += 2;
+        }
+    }
+
+    *got_frame      = 1;
+    return buf_size;
+}
+
+#if CONFIG_BINTEXT_DECODER
+AVCodec ff_bintext_decoder = {
+    .name           = "bintext",
+    .long_name      = NULL_IF_CONFIG_SMALL("Binary text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_BINTEXT,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
+#if CONFIG_XBIN_DECODER
+AVCodec ff_xbin_decoder = {
+    .name           = "xbin",
+    .long_name      = NULL_IF_CONFIG_SMALL("eXtended BINary text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XBIN,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
+#if CONFIG_IDF_DECODER
+AVCodec ff_idf_decoder = {
+    .name           = "idf",
+    .long_name      = NULL_IF_CONFIG_SMALL("iCEDraw text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_IDF,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
diff --git a/libavcodec/bintext.h b/libavcodec/bintext.h
new file mode 100644
index 0000000..21428ba
--- /dev/null
+++ b/libavcodec/bintext.h
@@ -0,0 +1,37 @@
+/*
+ * Binary text decoder
+ * Copyright (c) 2010 Peter Ross (pross@xvid.org)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Binary text decoder
+ */
+
+#ifndef AVCODEC_BINTEXT_H
+#define AVCODEC_BINTEXT_H
+
+/* flag values passed between avformat and avcodec;
+ * while these are identical to the XBIN flags, they are also used
+ * for the BINTEXT and IDF decoders.
+ */
+#define BINTEXT_PALETTE  0x1
+#define BINTEXT_FONT     0x2
+
+#endif /* AVCODEC_BINTEXT_H */
diff --git a/libavcodec/bit_depth_template.c b/libavcodec/bit_depth_template.c
index 27e658b..d44d47e 100644
--- a/libavcodec/bit_depth_template.c
+++ b/libavcodec/bit_depth_template.c
@@ -1,23 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "mathops.h"
 #include "rnd_avg.h"
+#include "libavutil/intreadwrite.h"
 
 #ifndef BIT_DEPTH
 #define BIT_DEPTH 8
@@ -28,6 +29,7 @@
 #   undef pixel2
 #   undef pixel4
 #   undef dctcoef
+#   undef idctin
 #   undef INIT_CLIP
 #   undef no_rnd_avg_pixel4
 #   undef rnd_avg_pixel4
@@ -52,6 +54,16 @@
 #   define pixel4 uint64_t
 #   define dctcoef int32_t
 
+#ifdef IN_IDCT_DEPTH
+#if IN_IDCT_DEPTH == 32
+#   define idctin int32_t
+#else
+#   define idctin int16_t
+#endif
+#else
+#   define idctin int16_t
+#endif
+
 #   define INIT_CLIP
 #   define no_rnd_avg_pixel4 no_rnd_avg64
 #   define    rnd_avg_pixel4    rnd_avg64
@@ -70,8 +82,9 @@
 #   define pixel2 uint16_t
 #   define pixel4 uint32_t
 #   define dctcoef int16_t
+#   define idctin  int16_t
 
-#   define INIT_CLIP const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+#   define INIT_CLIP
 #   define no_rnd_avg_pixel4 no_rnd_avg32
 #   define    rnd_avg_pixel4    rnd_avg32
 #   define AV_RN2P  AV_RN16
@@ -83,10 +96,13 @@
 #   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
 
 #   define av_clip_pixel(a) av_clip_uint8(a)
-#   define CLIP(a) cm[a]
+#   define CLIP(a) av_clip_uint8(a)
 #endif
 
-#define FUNC3(a, b, c)  a ## _ ## b ## c
+#define FUNC3(a, b, c)  a ## _ ## b ##  c
 #define FUNC2(a, b, c)  FUNC3(a, b, c)
 #define FUNC(a)  FUNC2(a, BIT_DEPTH,)
 #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+#define FUNC4(a, b, c)  a ## _int ## b ## _ ## c ## bit
+#define FUNC5(a, b, c)  FUNC4(a, b, c)
+#define FUNC6(a)  FUNC5(a, IN_IDCT_DEPTH, BIT_DEPTH)
diff --git a/libavcodec/bitpacked.c b/libavcodec/bitpacked.c
new file mode 100644
index 0000000..f0b417d
--- /dev/null
+++ b/libavcodec/bitpacked.c
@@ -0,0 +1,149 @@
+/*
+ * Unpack bit-packed streams to formats supported by FFmpeg
+ * Copyright (c) 2017 Savoir-faire Linux, Inc
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Development sponsored by CBC/Radio-Canada */
+
+/**
+ * @file
+ * Bitpacked
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "libavutil/imgutils.h"
+
+struct BitpackedContext {
+    int (*decode)(AVCodecContext *avctx, AVFrame *frame,
+                  AVPacket *pkt);
+};
+
+/* For this format, it's a simple passthrough */
+static int bitpacked_decode_uyvy422(AVCodecContext *avctx, AVFrame *frame,
+                                    AVPacket *avpkt)
+{
+    int ret;
+
+    /* there is no need to copy as the data already match
+     * a known pixel format */
+    frame->buf[0] = av_buffer_ref(avpkt->buf);
+    ret = av_image_fill_arrays(frame->data, frame->linesize, avpkt->data,
+                               avctx->pix_fmt, avctx->width, avctx->height, 1);
+    if (ret < 0) {
+        av_buffer_unref(&frame->buf[0]);
+        return ret;
+    }
+
+    return 0;
+}
+
+static int bitpacked_decode_yuv422p10(AVCodecContext *avctx, AVFrame *frame,
+                                      AVPacket *avpkt)
+{
+    uint64_t frame_size = (uint64_t)avctx->width * (uint64_t)avctx->height * 20;
+    uint64_t packet_size = (uint64_t)avpkt->size * 8;
+    GetBitContext bc;
+    uint16_t *y, *u, *v;
+    int ret, i, j;
+
+    ret = ff_get_buffer(avctx, frame, 0);
+    if (ret < 0)
+        return ret;
+
+    if (frame_size > packet_size)
+        return AVERROR_INVALIDDATA;
+
+    if (avctx->width % 2)
+        return AVERROR_PATCHWELCOME;
+
+    ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height * 20);
+    if (ret)
+        return ret;
+
+    for (i = 0; i < avctx->height; i++) {
+        y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]);
+        u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]);
+        v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]);
+
+        for (j = 0; j < avctx->width; j += 2) {
+            *u++ = get_bits(&bc, 10);
+            *y++ = get_bits(&bc, 10);
+            *v++ = get_bits(&bc, 10);
+            *y++ = get_bits(&bc, 10);
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int bitpacked_init_decoder(AVCodecContext *avctx)
+{
+    struct BitpackedContext *bc = avctx->priv_data;
+
+    if (!avctx->codec_tag || !avctx->width || !avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    if (avctx->codec_tag == MKTAG('U', 'Y', 'V', 'Y')) {
+        if (avctx->bits_per_coded_sample == 16 &&
+            avctx->pix_fmt == AV_PIX_FMT_UYVY422)
+            bc->decode = bitpacked_decode_uyvy422;
+        else if (avctx->bits_per_coded_sample == 20 &&
+                 avctx->pix_fmt == AV_PIX_FMT_YUV422P10)
+            bc->decode = bitpacked_decode_yuv422p10;
+        else
+            return AVERROR_INVALIDDATA;
+    } else {
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int bitpacked_decode(AVCodecContext *avctx, void *data, int *got_frame,
+                            AVPacket *avpkt)
+{
+    struct BitpackedContext *bc = avctx->priv_data;
+    int buf_size = avpkt->size;
+    AVFrame *frame = data;
+    int res;
+
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
+    res = bc->decode(avctx, frame, avpkt);
+    if (res)
+        return res;
+
+    *got_frame = 1;
+    return buf_size;
+
+}
+
+AVCodec ff_bitpacked_decoder = {
+    .name   = "bitpacked",
+    .long_name = NULL_IF_CONFIG_SMALL("Bitpacked"),
+    .type = AVMEDIA_TYPE_VIDEO,
+    .id = AV_CODEC_ID_BITPACKED,
+    .priv_data_size        = sizeof(struct BitpackedContext),
+    .init = bitpacked_init_decoder,
+    .decode = bitpacked_decode,
+    .capabilities = AV_CODEC_CAP_EXPERIMENTAL,
+};
diff --git a/libavcodec/bitstream.c b/libavcodec/bitstream.c
index c7eba29..8762e5f 100644
--- a/libavcodec/bitstream.c
+++ b/libavcodec/bitstream.c
@@ -6,20 +6,20 @@
  *
  * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
  * bitstream api.
  */
 
+#include "libavutil/avassert.h"
+#include "libavutil/qsort.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
@@ -68,6 +70,8 @@ void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
     if (length == 0)
         return;
 
+    av_assert0(length <= put_bits_left(pb));
+
     if (CONFIG_SMALL || words < 16 || put_bits_count(pb) & 7) {
         for (i = 0; i < words; i++)
             put_bits(pb, 16, AV_RB16(src + 2 * i));
@@ -94,9 +98,11 @@ void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
     case 2:                                                 \
         v = *(const uint16_t *)ptr;                         \
         break;                                              \
-    default:                                                \
+    case 4:                                                 \
         v = *(const uint32_t *)ptr;                         \
         break;                                              \
+    default:                                                \
+        av_assert1(0);                                      \
     }                                                       \
 }
 
@@ -107,29 +113,20 @@ static int alloc_table(VLC *vlc, int size, int use_static)
 
     vlc->table_size += size;
     if (vlc->table_size > vlc->table_allocated) {
-        int err;
         if (use_static)
-            return AVERROR_BUG;
+            abort(); // cannot do anything, init_vlc() is used with too little memory
         vlc->table_allocated += (1 << vlc->bits);
-        if ((err = av_reallocp(&vlc->table,
-                               sizeof(VLC_TYPE) * 2 *
-                               vlc->table_allocated)) < 0) {
+        vlc->table = av_realloc_f(vlc->table, vlc->table_allocated, sizeof(VLC_TYPE) * 2);
+        if (!vlc->table) {
             vlc->table_allocated = 0;
             vlc->table_size = 0;
-            return err;
+            return AVERROR(ENOMEM);
         }
+        memset(vlc->table + vlc->table_allocated - (1 << vlc->bits), 0, sizeof(VLC_TYPE) * 2 << vlc->bits);
     }
     return index;
 }
 
-static av_always_inline uint32_t bitswap_32(uint32_t x)
-{
-    return (uint32_t)ff_reverse[ x        & 0xFF] << 24 |
-           (uint32_t)ff_reverse[(x >> 8)  & 0xFF] << 16 |
-           (uint32_t)ff_reverse[(x >> 16) & 0xFF] << 8  |
-           (uint32_t)ff_reverse[ x >> 24];
-}
-
 typedef struct VLCcode {
     uint8_t bits;
     uint16_t symbol;
@@ -163,19 +160,16 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
     int table_size, table_index, index, code_prefix, symbol, subtable_bits;
     int i, j, k, n, nb, inc;
     uint32_t code;
-    VLC_TYPE (*table)[2];
+    volatile VLC_TYPE (* volatile table)[2]; // the double volatile is needed to prevent an internal compiler error in gcc 4.2
 
     table_size = 1 << table_nb_bits;
+    if (table_nb_bits > 30)
+       return AVERROR(EINVAL);
     table_index = alloc_table(vlc, table_size, flags & INIT_VLC_USE_NEW_STATIC);
     ff_dlog(NULL, "new table index=%d size=%d\n", table_index, table_size);
     if (table_index < 0)
         return table_index;
-    table = &vlc->table[table_index];
-
-    for (i = 0; i < table_size; i++) {
-        table[i][1] = 0; //bits
-        table[i][0] = -1; //codes
-    }
+    table = (volatile VLC_TYPE (*)[2])&vlc->table[table_index];
 
     /* first pass: map codes and compute auxiliary table sizes */
     for (i = 0; i < nb_codes; i++) {
@@ -193,8 +187,9 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
                 inc = 1 << n;
             }
             for (k = 0; k < nb; k++) {
+                int bits = table[j][1];
                 ff_dlog(NULL, "%4x: code=%d n=%d\n", j, i, n);
-                if (table[j][1] /*bits*/ != 0) {
+                if (bits != 0 && bits != n) {
                     av_log(NULL, AV_LOG_ERROR, "incorrect codes\n");
                     return AVERROR_INVALIDDATA;
                 }
@@ -229,11 +224,17 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
             if (index < 0)
                 return index;
             /* note: realloc has been done, so reload tables */
-            table = &vlc->table[table_index];
+            table = (volatile VLC_TYPE (*)[2])&vlc->table[table_index];
             table[j][0] = index; //code
             i = k-1;
         }
     }
+
+    for (i = 0; i < table_size; i++) {
+        if (table[i][1] == 0) //bits
+            table[i][0] = -1; //codes
+    }
+
     return table_index;
 }
 
@@ -256,15 +257,15 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
    'bits' or 'codes' tables.
 
    'xxx_size' : gives the number of bytes of each entry of the 'bits'
-   or 'codes' tables.
+   or 'codes' tables. Currently 1,2 and 4 are supported.
 
-   'wrap' and 'size' allows to use any memory configuration and types
+   'wrap' and 'size' make it possible to use any memory configuration and types
    (byte/word/long) to store the 'bits', 'codes', and 'symbols' tables.
 
    'use_static' should be set to 1 for tables, which should be freed
    with av_free_static(), 0 if ff_free_vlc() will be used.
 */
-int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
+int ff_init_vlc_sparse(VLC *vlc_arg, int nb_bits, int nb_codes,
                        const void *bits, int bits_wrap, int bits_size,
                        const void *codes, int codes_wrap, int codes_size,
                        const void *symbols, int symbols_wrap, int symbols_size,
@@ -272,61 +273,80 @@ int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
 {
     VLCcode *buf;
     int i, j, ret;
+    VLCcode localbuf[1500]; // the maximum currently needed is 1296 by rv34
+    VLC localvlc, *vlc;
 
+    vlc = vlc_arg;
     vlc->bits = nb_bits;
     if (flags & INIT_VLC_USE_NEW_STATIC) {
-        if (vlc->table_size && vlc->table_size == vlc->table_allocated) {
-            return 0;
-        } else if (vlc->table_size) {
-            return AVERROR_BUG;
-        }
+        av_assert0(nb_codes + 1 <= FF_ARRAY_ELEMS(localbuf));
+        buf = localbuf;
+        localvlc = *vlc_arg;
+        vlc = &localvlc;
+        vlc->table_size = 0;
     } else {
         vlc->table           = NULL;
         vlc->table_allocated = 0;
         vlc->table_size      = 0;
-    }
 
-    ff_dlog(NULL, "build table nb_codes=%d\n", nb_codes);
+        buf = av_malloc_array((nb_codes + 1), sizeof(VLCcode));
+        if (!buf)
+            return AVERROR(ENOMEM);
+    }
 
-    buf = av_malloc((nb_codes + 1) * sizeof(VLCcode));
-    if (!buf)
-        return AVERROR(ENOMEM);
 
-    assert(symbols_size <= 2 || !symbols);
+    av_assert0(symbols_size <= 2 || !symbols);
     j = 0;
-#define COPY(condition)                                                     \
+#define COPY(condition)\
     for (i = 0; i < nb_codes; i++) {                                        \
         GET_DATA(buf[j].bits, bits, i, bits_wrap, bits_size);               \
         if (!(condition))                                                   \
             continue;                                                       \
+        if (buf[j].bits > 3*nb_bits || buf[j].bits>32) {                    \
+            av_log(NULL, AV_LOG_ERROR, "Too long VLC (%d) in init_vlc\n", buf[j].bits);\
+            if (!(flags & INIT_VLC_USE_NEW_STATIC))                         \
+                av_free(buf);                                               \
+            return AVERROR(EINVAL);                                         \
+        }                                                                   \
         GET_DATA(buf[j].code, codes, i, codes_wrap, codes_size);            \
+        if (buf[j].code >= (1LL<<buf[j].bits)) {                            \
+            av_log(NULL, AV_LOG_ERROR, "Invalid code %"PRIx32" for %d in "  \
+                   "init_vlc\n", buf[j].code, i);                           \
+            if (!(flags & INIT_VLC_USE_NEW_STATIC))                         \
+                av_free(buf);                                               \
+            return AVERROR(EINVAL);                                         \
+        }                                                                   \
         if (flags & INIT_VLC_LE)                                            \
             buf[j].code = bitswap_32(buf[j].code);                          \
         else                                                                \
             buf[j].code <<= 32 - buf[j].bits;                               \
         if (symbols)                                                        \
             GET_DATA(buf[j].symbol, symbols, i, symbols_wrap, symbols_size) \
-            else                                                            \
-                buf[j].symbol = i;                                          \
+        else                                                                \
+            buf[j].symbol = i;                                              \
         j++;                                                                \
     }
     COPY(buf[j].bits > nb_bits);
     // qsort is the slowest part of init_vlc, and could probably be improved or avoided
-    qsort(buf, j, sizeof(VLCcode), compare_vlcspec);
+    AV_QSORT(buf, j, struct VLCcode, compare_vlcspec);
     COPY(buf[j].bits && buf[j].bits <= nb_bits);
     nb_codes = j;
 
     ret = build_table(vlc, nb_bits, nb_codes, buf, flags);
 
-    av_free(buf);
-    if (ret < 0) {
-        av_freep(&vlc->table);
-        return ret;
+    if (flags & INIT_VLC_USE_NEW_STATIC) {
+        if(vlc->table_size != vlc->table_allocated)
+            av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n", vlc->table_size, vlc->table_allocated);
+
+        av_assert0(ret >= 0);
+        *vlc_arg = *vlc;
+    } else {
+        av_free(buf);
+        if (ret < 0) {
+            av_freep(&vlc->table);
+            return ret;
+        }
     }
-    if ((flags & INIT_VLC_USE_NEW_STATIC) &&
-        vlc->table_size != vlc->table_allocated)
-        av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n",
-               vlc->table_size, vlc->table_allocated);
     return 0;
 }
 
diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
deleted file mode 100644
index 1b23cb2..0000000
--- a/libavcodec/bitstream.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2016 Alexandra Hájková
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * functions for reading bits from a buffer
- */
-
-#ifndef AVCODEC_BITSTREAM_H
-#define AVCODEC_BITSTREAM_H
-
-#include <stdint.h>
-
-#include "libavutil/common.h"
-#include "libavutil/intreadwrite.h"
-#include "libavutil/log.h"
-
-#include "mathops.h"
-
-typedef struct BitstreamContext {
-    uint64_t bits;      // stores bits read from the buffer
-    const uint8_t *buffer, *buffer_end;
-    const uint8_t *ptr; // position inside a buffer
-    unsigned bits_left; // number of bits left in bits field
-    unsigned size_in_bits;
-} BitstreamContext;
-
-static inline void refill_64(BitstreamContext *bc)
-{
-    if (bc->ptr >= bc->buffer_end)
-        return;
-
-#ifdef BITSTREAM_READER_LE
-    bc->bits       = AV_RL64(bc->ptr);
-#else
-    bc->bits       = AV_RB64(bc->ptr);
-#endif
-    bc->ptr       += 8;
-    bc->bits_left  = 64;
-}
-
-static inline void refill_32(BitstreamContext *bc)
-{
-    if (bc->ptr >= bc->buffer_end)
-        return;
-
-#ifdef BITSTREAM_READER_LE
-    bc->bits       = (uint64_t)AV_RL32(bc->ptr) << bc->bits_left | bc->bits;
-#else
-    bc->bits       = bc->bits | (uint64_t)AV_RB32(bc->ptr) << (32 - bc->bits_left);
-#endif
-    bc->ptr       += 4;
-    bc->bits_left += 32;
-}
-
-/* Initialize BitstreamContext. Input buffer must have an additional zero
- * padding of AV_INPUT_BUFFER_PADDING_SIZE bytes at the end. */
-static inline int bitstream_init(BitstreamContext *bc, const uint8_t *buffer,
-                                 unsigned bit_size)
-{
-    unsigned buffer_size;
-
-    if (bit_size > INT_MAX - 7 || !buffer) {
-        buffer        =
-        bc->buffer    =
-        bc->ptr       = NULL;
-        bc->bits_left = 0;
-        return AVERROR_INVALIDDATA;
-    }
-
-    buffer_size = (bit_size + 7) >> 3;
-
-    bc->buffer       = buffer;
-    bc->buffer_end   = buffer + buffer_size;
-    bc->ptr          = bc->buffer;
-    bc->size_in_bits = bit_size;
-    bc->bits_left    = 0;
-    bc->bits         = 0;
-
-    refill_64(bc);
-
-    return 0;
-}
-
-/* Initialize BitstreamContext with buffer size in bytes instead of bits. */
-static inline int bitstream_init8(BitstreamContext *bc, const uint8_t *buffer,
-                                  unsigned byte_size)
-{
-    if (byte_size > INT_MAX / 8)
-        return AVERROR_INVALIDDATA;
-    return bitstream_init(bc, buffer, byte_size * 8);
-}
-
-/* Return number of bits already read. */
-static inline int bitstream_tell(const BitstreamContext *bc)
-{
-    return (bc->ptr - bc->buffer) * 8 - bc->bits_left;
-}
-
-/* Return buffer size in bits. */
-static inline int bitstream_tell_size(const BitstreamContext *bc)
-{
-    return bc->size_in_bits;
-}
-
-/* Return the number of the bits left in a buffer. */
-static inline int bitstream_bits_left(const BitstreamContext *bc)
-{
-    return (bc->buffer - bc->ptr) * 8 + bc->size_in_bits + bc->bits_left;
-}
-
-static inline uint64_t get_val(BitstreamContext *bc, unsigned n)
-{
-#ifdef BITSTREAM_READER_LE
-    uint64_t ret = bc->bits & ((UINT64_C(1) << n) - 1);
-    bc->bits >>= n;
-#else
-    uint64_t ret = bc->bits >> (64 - n);
-    bc->bits <<= n;
-#endif
-    bc->bits_left -= n;
-
-    return ret;
-}
-
-/* Return one bit from the buffer. */
-static inline unsigned bitstream_read_bit(BitstreamContext *bc)
-{
-    if (!bc->bits_left)
-        refill_64(bc);
-
-    return get_val(bc, 1);
-}
-
-/* Return n bits from the buffer. n has to be in the 0-63 range. */
-static inline uint64_t bitstream_read_63(BitstreamContext *bc, unsigned n)
-{
-    uint64_t ret = 0;
-#ifdef BITSTREAM_READER_LE
-    uint64_t left = 0;
-#endif
-
-    if (!n)
-        return 0;
-
-    if (n > bc->bits_left) {
-        n -= bc->bits_left;
-#ifdef BITSTREAM_READER_LE
-        left = bc->bits_left;
-#endif
-        ret = get_val(bc, bc->bits_left);
-        refill_64(bc);
-    }
-
-#ifdef BITSTREAM_READER_LE
-    ret = get_val(bc, n) << left | ret;
-#else
-    ret = get_val(bc, n) | ret << n;
-#endif
-
-    return ret;
-}
-
-/* Return n bits from the buffer. n has to be in the 0-32 range. */
-static inline uint32_t bitstream_read(BitstreamContext *bc, unsigned n)
-{
-    if (!n)
-        return 0;
-
-    if (n > bc->bits_left) {
-        refill_32(bc);
-        if (bc->bits_left < 32)
-            bc->bits_left = n;
-    }
-
-    return get_val(bc, n);
-}
-
-/* Return n bits from the buffer as a signed integer.
- * n has to be in the 0-32 range. */
-static inline int32_t bitstream_read_signed(BitstreamContext *bc, unsigned n)
-{
-    return sign_extend(bitstream_read(bc, n), n);
-}
-
-static inline unsigned show_val(const BitstreamContext *bc, unsigned n)
-{
-#ifdef BITSTREAM_READER_LE
-    return bc->bits & ((UINT64_C(1) << n) - 1);
-#else
-    return bc->bits >> (64 - n);
-#endif
-}
-
-/* Return n bits from the buffer, but do not change the buffer state.
- * n has to be in the 0-32 range. */
-static inline unsigned bitstream_peek(BitstreamContext *bc, unsigned n)
-{
-    if (n > bc->bits_left)
-        refill_32(bc);
-
-    return show_val(bc, n);
-}
-
-/* Return n bits from the buffer as a signed integer, but do not change the
- * buffer state. n has to be in the 0-32 range. */
-static inline int bitstream_peek_signed(BitstreamContext *bc, unsigned n)
-{
-    return sign_extend(bitstream_peek(bc, n), n);
-}
-
-static inline void skip_remaining(BitstreamContext *bc, unsigned n)
-{
-#ifdef BITSTREAM_READER_LE
-    bc->bits >>= n;
-#else
-    bc->bits <<= n;
-#endif
-    bc->bits_left -= n;
-}
-
-/* Skip n bits in the buffer. */
-static inline void bitstream_skip(BitstreamContext *bc, unsigned n)
-{
-    if (n < bc->bits_left)
-        skip_remaining(bc, n);
-    else {
-        n -= bc->bits_left;
-        bc->bits      = 0;
-        bc->bits_left = 0;
-
-        if (n >= 64) {
-            unsigned skip = n / 8;
-
-            n -= skip * 8;
-            bc->ptr += skip;
-        }
-        refill_64(bc);
-        if (n)
-            skip_remaining(bc, n);
-    }
-}
-
-/* Seek to the given bit position. */
-static inline void bitstream_seek(BitstreamContext *bc, unsigned pos)
-{
-    bc->ptr       = bc->buffer;
-    bc->bits      = 0;
-    bc->bits_left = 0;
-
-    bitstream_skip(bc, pos);
-}
-
-/* Skip bits to a byte boundary. */
-static inline const uint8_t *bitstream_align(BitstreamContext *bc)
-{
-    unsigned n = -bitstream_tell(bc) & 7;
-    if (n)
-        bitstream_skip(bc, n);
-    return bc->buffer + (bitstream_tell(bc) >> 3);
-}
-
-/* Read MPEG-1 dc-style VLC (sign bit + mantissa with no MSB).
- * If MSB not set it is negative. */
-static inline int bitstream_read_xbits(BitstreamContext *bc, unsigned length)
-{
-    int32_t cache = bitstream_peek(bc, 32);
-    int sign = ~cache >> 31;
-    skip_remaining(bc, length);
-
-    return ((((uint32_t)(sign ^ cache)) >> (32 - length)) ^ sign) - sign;
-}
-
-/* Return decoded truncated unary code for the values 0, 1, 2. */
-static inline int bitstream_decode012(BitstreamContext *bc)
-{
-    if (!bitstream_read_bit(bc))
-        return 0;
-    else
-        return bitstream_read_bit(bc) + 1;
-}
-
-/* Return decoded truncated unary code for the values 2, 1, 0. */
-static inline int bitstream_decode210(BitstreamContext *bc)
-{
-    if (bitstream_read_bit(bc))
-        return 0;
-    else
-        return 2 - bitstream_read_bit(bc);
-}
-
-/* Read sign bit and flip the sign of the provided value accordingly. */
-static inline int bitstream_apply_sign(BitstreamContext *bc, int val)
-{
-    int sign = bitstream_read_signed(bc, 1);
-    return (val ^ sign) - sign;
-}
-
-/* Unwind the cache so a refill_32 can fill it again. */
-static inline void bitstream_unwind(BitstreamContext *bc)
-{
-    int unwind = 4;
-    int unwind_bits = unwind * 8;
-
-    if (bc->bits_left < unwind_bits)
-        return;
-
-    bc->bits      >>= unwind_bits;
-    bc->bits      <<= unwind_bits;
-    bc->bits_left  -= unwind_bits;
-    bc->ptr        -= unwind;
-}
-
-/* Unget up to 32 bits. */
-static inline void bitstream_unget(BitstreamContext *bc, uint64_t value,
-                                   size_t amount)
-{
-    size_t cache_size = sizeof(bc->bits) * 8;
-
-    if (bc->bits_left + amount > cache_size)
-        bitstream_unwind(bc);
-
-    bc->bits       = (bc->bits >> amount) | (value << (cache_size - amount));
-    bc->bits_left += amount;
-}
-
-#endif /* AVCODEC_BITSTREAM_H */
diff --git a/libavcodec/bitstream_filter.c b/libavcodec/bitstream_filter.c
index ab608a9..ca11ed3 100644
--- a/libavcodec/bitstream_filter.c
+++ b/libavcodec/bitstream_filter.c
@@ -1,42 +1,42 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
 #include "avcodec.h"
-
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
+#include "libavutil/opt.h"
 
 #if FF_API_OLD_BSF
 FF_DISABLE_DEPRECATION_WARNINGS
 
-AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f)
+const AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f)
 {
     const AVBitStreamFilter *filter = NULL;
     void *opaque = NULL;
 
     while (filter != f)
-        filter = av_bsf_next(&opaque);
+        filter = av_bsf_iterate(&opaque);
 
-    return av_bsf_next(&opaque);
+    return av_bsf_iterate(&opaque);
 }
 
 void av_register_bitstream_filter(AVBitStreamFilter *bsf)
@@ -45,6 +45,7 @@ void av_register_bitstream_filter(AVBitStreamFilter *bsf)
 
 typedef struct BSFCompatContext {
     AVBSFContext *ctx;
+    int extradata_updated;
 } BSFCompatContext;
 
 AVBitStreamFilterContext *av_bitstream_filter_init(const char *name)
@@ -81,7 +82,12 @@ fail:
 
 void av_bitstream_filter_close(AVBitStreamFilterContext *bsfc)
 {
-    BSFCompatContext *priv = bsfc->priv_data;
+    BSFCompatContext *priv;
+
+    if (!bsfc)
+        return;
+
+    priv = bsfc->priv_data;
 
     av_bsf_free(&priv->ctx);
     av_freep(&bsfc->priv_data);
@@ -108,23 +114,24 @@ int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
 
         priv->ctx->time_base_in = avctx->time_base;
 
+        if (bsfc->args && bsfc->filter->priv_class) {
+            const AVOption *opt = av_opt_next(priv->ctx->priv_data, NULL);
+            const char * shorthand[2] = {NULL};
+
+            if (opt)
+                shorthand[0] = opt->name;
+
+            ret = av_opt_set_from_string(priv->ctx->priv_data, bsfc->args, shorthand, "=", ":");
+            if (ret < 0)
+                return ret;
+        }
+
         ret = av_bsf_init(priv->ctx);
         if (ret < 0)
             return ret;
-
-        if (priv->ctx->par_out->extradata_size) {
-            av_freep(&avctx->extradata);
-            avctx->extradata_size = 0;
-            avctx->extradata = av_mallocz(priv->ctx->par_out->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!avctx->extradata)
-                return AVERROR(ENOMEM);
-            memcpy(avctx->extradata, priv->ctx->par_out->extradata,
-                   priv->ctx->par_out->extradata_size);
-            avctx->extradata_size = priv->ctx->par_out->extradata_size;
-        }
     }
 
-    pkt.data = buf;
+    pkt.data = (uint8_t *)buf;
     pkt.size = buf_size;
 
     ret = av_bsf_send_packet(priv->ctx, &pkt);
@@ -157,6 +164,21 @@ int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
         av_packet_unref(&pkt);
     }
 
+    if (!priv->extradata_updated) {
+        /* update extradata in avctx from the output codec parameters */
+        if (priv->ctx->par_out->extradata_size && (!args || !strstr(args, "private_spspps_buf"))) {
+            av_freep(&avctx->extradata);
+            avctx->extradata_size = 0;
+            avctx->extradata = av_mallocz(priv->ctx->par_out->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!avctx->extradata)
+                return AVERROR(ENOMEM);
+            memcpy(avctx->extradata, priv->ctx->par_out->extradata, priv->ctx->par_out->extradata_size);
+            avctx->extradata_size = priv->ctx->par_out->extradata_size;
+        }
+
+        priv->extradata_updated = 1;
+    }
+
     return 1;
 }
 FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavcodec/bitstream_filters.c b/libavcodec/bitstream_filters.c
index 7841705..2c999d3 100644
--- a/libavcodec/bitstream_filters.c
+++ b/libavcodec/bitstream_filters.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,31 +25,41 @@
 #include "bsf.h"
 
 extern const AVBitStreamFilter ff_aac_adtstoasc_bsf;
+extern const AVBitStreamFilter ff_av1_metadata_bsf;
 extern const AVBitStreamFilter ff_chomp_bsf;
 extern const AVBitStreamFilter ff_dump_extradata_bsf;
+extern const AVBitStreamFilter ff_dca_core_bsf;
+extern const AVBitStreamFilter ff_eac3_core_bsf;
 extern const AVBitStreamFilter ff_extract_extradata_bsf;
+extern const AVBitStreamFilter ff_filter_units_bsf;
 extern const AVBitStreamFilter ff_h264_metadata_bsf;
 extern const AVBitStreamFilter ff_h264_mp4toannexb_bsf;
 extern const AVBitStreamFilter ff_h264_redundant_pps_bsf;
+extern const AVBitStreamFilter ff_hapqa_extract_bsf;
 extern const AVBitStreamFilter ff_hevc_metadata_bsf;
 extern const AVBitStreamFilter ff_hevc_mp4toannexb_bsf;
 extern const AVBitStreamFilter ff_imx_dump_header_bsf;
 extern const AVBitStreamFilter ff_mjpeg2jpeg_bsf;
 extern const AVBitStreamFilter ff_mjpega_dump_header_bsf;
-extern const AVBitStreamFilter ff_mov2textsub_bsf;
+extern const AVBitStreamFilter ff_mp3_header_decompress_bsf;
 extern const AVBitStreamFilter ff_mpeg2_metadata_bsf;
-extern const AVBitStreamFilter ff_null_bsf;
-extern const AVBitStreamFilter ff_text2movsub_bsf;
+extern const AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf;
+extern const AVBitStreamFilter ff_mov2textsub_bsf;
 extern const AVBitStreamFilter ff_noise_bsf;
+extern const AVBitStreamFilter ff_null_bsf;
+extern const AVBitStreamFilter ff_prores_metadata_bsf;
 extern const AVBitStreamFilter ff_remove_extradata_bsf;
+extern const AVBitStreamFilter ff_text2movsub_bsf;
 extern const AVBitStreamFilter ff_trace_headers_bsf;
+extern const AVBitStreamFilter ff_truehd_core_bsf;
+extern const AVBitStreamFilter ff_vp9_metadata_bsf;
 extern const AVBitStreamFilter ff_vp9_raw_reorder_bsf;
 extern const AVBitStreamFilter ff_vp9_superframe_bsf;
 extern const AVBitStreamFilter ff_vp9_superframe_split_bsf;
 
 #include "libavcodec/bsf_list.c"
 
-const AVBitStreamFilter *av_bsf_next(void **opaque)
+const AVBitStreamFilter *av_bsf_iterate(void **opaque)
 {
     uintptr_t i = (uintptr_t)*opaque;
     const AVBitStreamFilter *f = bitstream_filters[i];
@@ -60,12 +70,21 @@ const AVBitStreamFilter *av_bsf_next(void **opaque)
     return f;
 }
 
+#if FF_API_NEXT
+const AVBitStreamFilter *av_bsf_next(void **opaque) {
+    return av_bsf_iterate(opaque);
+}
+#endif
+
 const AVBitStreamFilter *av_bsf_get_by_name(const char *name)
 {
-    int i;
+    const AVBitStreamFilter *f = NULL;
+    void *i = 0;
 
-    for (i = 0; bitstream_filters[i]; i++) {
-        const AVBitStreamFilter *f = bitstream_filters[i];
+    if (!name)
+        return NULL;
+
+    while ((f = av_bsf_iterate(&i))) {
         if (!strcmp(f->name, name))
             return f;
     }
@@ -75,19 +94,20 @@ const AVBitStreamFilter *av_bsf_get_by_name(const char *name)
 
 const AVClass *ff_bsf_child_class_next(const AVClass *prev)
 {
-    int i;
+    const AVBitStreamFilter *f = NULL;
+    void *i = 0;
 
     /* find the filter that corresponds to prev */
-    for (i = 0; prev && bitstream_filters[i]; i++) {
-        if (bitstream_filters[i]->priv_class == prev) {
-            i++;
+    while (prev && (f = av_bsf_iterate(&i))) {
+        if (f->priv_class == prev) {
             break;
         }
     }
 
     /* find next filter with priv options */
-    for (; bitstream_filters[i]; i++)
-        if (bitstream_filters[i]->priv_class)
-            return bitstream_filters[i]->priv_class;
+    while ((f = av_bsf_iterate(&i))) {
+        if (f->priv_class)
+            return f->priv_class;
+    }
     return NULL;
 }
diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c
index a2c0e57..c7efe7e 100644
--- a/libavcodec/blockdsp.c
+++ b/libavcodec/blockdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,17 +20,17 @@
 #include <string.h>
 
 #include "config.h"
-
 #include "libavutil/attributes.h"
-
+#include "avcodec.h"
 #include "blockdsp.h"
+#include "version.h"
 
-static void clear_block_8_c(int16_t *block)
+static void clear_block_c(int16_t *block)
 {
     memset(block, 0, sizeof(int16_t) * 64);
 }
 
-static void clear_blocks_8_c(int16_t *blocks)
+static void clear_blocks_c(int16_t *blocks)
 {
     memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 }
@@ -57,18 +57,22 @@ static void fill_block8_c(uint8_t *block, uint8_t value, ptrdiff_t line_size,
     }
 }
 
-av_cold void ff_blockdsp_init(BlockDSPContext *c)
+av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx)
 {
-    c->clear_block  = clear_block_8_c;
-    c->clear_blocks = clear_blocks_8_c;
+    c->clear_block  = clear_block_c;
+    c->clear_blocks = clear_blocks_c;
 
     c->fill_block_tab[0] = fill_block16_c;
     c->fill_block_tab[1] = fill_block8_c;
 
+    if (ARCH_ALPHA)
+        ff_blockdsp_init_alpha(c);
     if (ARCH_ARM)
         ff_blockdsp_init_arm(c);
     if (ARCH_PPC)
         ff_blockdsp_init_ppc(c);
     if (ARCH_X86)
-        ff_blockdsp_init_x86(c);
+        ff_blockdsp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_blockdsp_init_mips(c);
 }
diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h
index ecce2c3..26fc2ea 100644
--- a/libavcodec/blockdsp.h
+++ b/libavcodec/blockdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,9 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "avcodec.h"
+#include "version.h"
+
 /* add and put pixel (decoding)
  * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
  * h for op_pixels_func is limited to { width / 2, width },
@@ -30,16 +33,18 @@ typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
                              uint8_t value, ptrdiff_t line_size, int h);
 
 typedef struct BlockDSPContext {
-    void (*clear_block)(int16_t *block /* align 16 */);
-    void (*clear_blocks)(int16_t *blocks /* align 16 */);
+    void (*clear_block)(int16_t *block /* align 32 */);
+    void (*clear_blocks)(int16_t *blocks /* align 32 */);
 
     op_fill_func fill_block_tab[2];
 } BlockDSPContext;
 
-void ff_blockdsp_init(BlockDSPContext *c);
+void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx);
 
+void ff_blockdsp_init_alpha(BlockDSPContext *c);
 void ff_blockdsp_init_arm(BlockDSPContext *c);
 void ff_blockdsp_init_ppc(BlockDSPContext *c);
-void ff_blockdsp_init_x86(BlockDSPContext *c);
+void ff_blockdsp_init_x86(BlockDSPContext *c, AVCodecContext *avctx);
+void ff_blockdsp_init_mips(BlockDSPContext *c);
 
 #endif /* AVCODEC_BLOCKDSP_H */
diff --git a/libavcodec/bmp.c b/libavcodec/bmp.c
index 5c7acb6..65d239e 100644
--- a/libavcodec/bmp.c
+++ b/libavcodec/bmp.c
@@ -2,20 +2,20 @@
  * BMP image format decoder
  * Copyright (c) 2005 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     BiCompression comp;
     unsigned int ihsize;
     int i, j, n, linesize, ret;
-    uint32_t rgb[3];
+    uint32_t rgb[3] = {0};
+    uint32_t alpha = 0;
     uint8_t *ptr;
     int dsize;
     const uint8_t *buf0 = buf;
@@ -69,7 +70,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
 
     hsize  = bytestream_get_le32(&buf); /* header size */
     ihsize = bytestream_get_le32(&buf); /* more header size */
-    if (ihsize + 14 > hsize) {
+    if (ihsize + 14LL > hsize) {
         av_log(avctx, AV_LOG_ERROR, "invalid header size %u\n", hsize);
         return AVERROR_INVALIDDATA;
     }
@@ -86,7 +87,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     }
 
     switch (ihsize) {
-    case  40: // windib v3
+    case  40: // windib
+    case  56: // windib v3
     case  64: // OS/2 v2
     case 108: // windib v4
     case 124: // windib v5
@@ -111,7 +113,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
 
     depth = bytestream_get_le16(&buf);
 
-    if (ihsize == 40)
+    if (ihsize >= 40)
         comp = bytestream_get_le32(&buf);
     else
         comp = BMP_RGB;
@@ -127,31 +129,37 @@ static int bmp_decode_frame(AVCodecContext *avctx,
         rgb[0] = bytestream_get_le32(&buf);
         rgb[1] = bytestream_get_le32(&buf);
         rgb[2] = bytestream_get_le32(&buf);
+        if (ihsize > 40)
+        alpha = bytestream_get_le32(&buf);
     }
 
-    avctx->width  = width;
-    avctx->height = height > 0 ? height : -height;
+    ret = ff_set_dimensions(avctx, width, height > 0 ? height : -(unsigned)height);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", width, height);
+        return AVERROR_INVALIDDATA;
+    }
 
     avctx->pix_fmt = AV_PIX_FMT_NONE;
 
     switch (depth) {
     case 32:
         if (comp == BMP_BITFIELDS) {
-            rgb[0] = (rgb[0] >> 15) & 3;
-            rgb[1] = (rgb[1] >> 15) & 3;
-            rgb[2] = (rgb[2] >> 15) & 3;
-
-            if (rgb[0] + rgb[1] + rgb[2] != 3 ||
-                rgb[0] == rgb[1] || rgb[0] == rgb[2] || rgb[1] == rgb[2]) {
-                break;
+            if (rgb[0] == 0xFF000000 && rgb[1] == 0x00FF0000 && rgb[2] == 0x0000FF00)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_ABGR : AV_PIX_FMT_0BGR;
+            else if (rgb[0] == 0x00FF0000 && rgb[1] == 0x0000FF00 && rgb[2] == 0x000000FF)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_BGRA : AV_PIX_FMT_BGR0;
+            else if (rgb[0] == 0x0000FF00 && rgb[1] == 0x00FF0000 && rgb[2] == 0xFF000000)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_ARGB : AV_PIX_FMT_0RGB;
+            else if (rgb[0] == 0x000000FF && rgb[1] == 0x0000FF00 && rgb[2] == 0x00FF0000)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_RGBA : AV_PIX_FMT_RGB0;
+            else {
+                av_log(avctx, AV_LOG_ERROR, "Unknown bitfields "
+                       "%0"PRIX32" %0"PRIX32" %0"PRIX32"\n", rgb[0], rgb[1], rgb[2]);
+                return AVERROR(EINVAL);
             }
         } else {
-            rgb[0] = 2;
-            rgb[1] = 1;
-            rgb[2] = 0;
+            avctx->pix_fmt = AV_PIX_FMT_BGRA;
         }
-
-        avctx->pix_fmt = AV_PIX_FMT_BGR24;
         break;
     case 24:
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
@@ -200,10 +208,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -211,12 +217,16 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     dsize = buf_size - hsize;
 
     /* Line size in file multiple of 4 */
-    n = ((avctx->width * depth) / 8 + 3) & ~3;
+    n = ((avctx->width * depth + 31) / 8) & ~3;
 
     if (n * avctx->height > dsize && comp != BMP_RLE4 && comp != BMP_RLE8) {
-        av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
-               dsize, n * avctx->height);
-        return AVERROR_INVALIDDATA;
+        n = (avctx->width * depth + 7) / 8;
+        if (n * avctx->height > dsize) {
+            av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
+                   dsize, n * avctx->height);
+            return AVERROR_INVALIDDATA;
+        }
+        av_log(avctx, AV_LOG_ERROR, "data size too small, assuming missing line alignment\n");
     }
 
     // RLE may skip decoding some picture areas, so blank picture before decoding
@@ -247,20 +257,26 @@ static int bmp_decode_frame(AVCodecContext *avctx,
             } else if (t) {
                 colors = t;
             }
+        } else {
+            colors = FFMIN(256, (hsize-ihsize-14) / 3);
         }
         buf = buf0 + 14 + ihsize; //palette location
         // OS/2 bitmap, 3 bytes per palette entry
         if ((hsize-ihsize-14) < (colors << 2)) {
+            if ((hsize-ihsize-14) < colors * 3) {
+                av_log(avctx, AV_LOG_ERROR, "palette doesn't fit in packet\n");
+                return AVERROR_INVALIDDATA;
+            }
             for (i = 0; i < colors; i++)
-                ((uint32_t*)p->data[1])[i] = bytestream_get_le24(&buf);
+                ((uint32_t*)p->data[1])[i] = (0xFFU<<24) | bytestream_get_le24(&buf);
         } else {
             for (i = 0; i < colors; i++)
-                ((uint32_t*)p->data[1])[i] = bytestream_get_le32(&buf);
+                ((uint32_t*)p->data[1])[i] = 0xFFU << 24 | bytestream_get_le32(&buf);
         }
         buf = buf0 + hsize;
     }
     if (comp == BMP_RLE4 || comp == BMP_RLE8) {
-        if (height < 0) {
+        if (comp == BMP_RLE8 && height < 0) {
             p->data[0]    +=  p->linesize[0] * (avctx->height - 1);
             p->linesize[0] = -p->linesize[0];
         }
@@ -291,6 +307,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
             break;
         case 8:
         case 24:
+        case 32:
             for (i = 0; i < avctx->height; i++) {
                 memcpy(ptr, buf, n);
                 buf += n;
@@ -320,28 +337,25 @@ static int bmp_decode_frame(AVCodecContext *avctx,
                 ptr += linesize;
             }
             break;
-        case 32:
-            for (i = 0; i < avctx->height; i++) {
-                const uint8_t *src = buf;
-                uint8_t *dst       = ptr;
-
-                for (j = 0; j < avctx->width; j++) {
-                    dst[0] = src[rgb[2]];
-                    dst[1] = src[rgb[1]];
-                    dst[2] = src[rgb[0]];
-                    dst += 3;
-                    src += 4;
-                }
-
-                buf += n;
-                ptr += linesize;
-            }
-            break;
         default:
             av_log(avctx, AV_LOG_ERROR, "BMP decoder is broken\n");
             return AVERROR_INVALIDDATA;
         }
     }
+    if (avctx->pix_fmt == AV_PIX_FMT_BGRA) {
+        for (i = 0; i < avctx->height; i++) {
+            int j;
+            uint8_t *ptr = p->data[0] + p->linesize[0]*i + 3;
+            for (j = 0; j < avctx->width; j++) {
+                if (ptr[4*j])
+                    break;
+            }
+            if (j < avctx->width)
+                break;
+        }
+        if (i == avctx->height)
+            avctx->pix_fmt = p->format = AV_PIX_FMT_BGR0;
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/bmp.h b/libavcodec/bmp.h
index a472f59..fb21090 100644
--- a/libavcodec/bmp.h
+++ b/libavcodec/bmp.h
@@ -2,20 +2,20 @@
  * internals for BMP codecs
  * Copyright (c) 2005 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bmp_parser.c b/libavcodec/bmp_parser.c
index b85dd8b..cd65f02 100644
--- a/libavcodec/bmp_parser.c
+++ b/libavcodec/bmp_parser.c
@@ -2,20 +2,20 @@
  * BMP parser
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,21 +45,37 @@ static int bmp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     int i = 0;
 
     *poutbuf_size = 0;
-    if (buf_size == 0)
-        return 0;
 
-    if (!bpc->pc.frame_start_found) {
+restart:
+    if (bpc->pc.frame_start_found <= 2+4+4) {
         for (; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state >> 48) == (('B' << 8) | 'M')) {
-                bpc->fsize = av_bswap32(state >> 16);
-                bpc->pc.frame_start_found = 1;
-                if (bpc->fsize > buf_size - i + 7)
-                    bpc->remaining_size = bpc->fsize - buf_size + i - 7;
-                else
-                    next = bpc->fsize + i - 7;
-                break;
-            }
+            if (bpc->pc.frame_start_found == 0) {
+                if ((state >> 48) == (('B' << 8) | 'M')) {
+                    bpc->fsize = av_bswap32(state >> 16);
+                    if (bpc->fsize > 17)
+                        bpc->pc.frame_start_found = 1;
+                }
+            } else if (bpc->pc.frame_start_found == 2+4+4) {
+//                 unsigned hsize = av_bswap32(state>>32);
+                unsigned ihsize = av_bswap32(state);
+                if (ihsize < 12 || ihsize > 200) {
+                    bpc->pc.frame_start_found = 0;
+                    continue;
+                }
+                bpc->pc.frame_start_found++;
+                bpc->remaining_size = bpc->fsize + i - 17;
+
+                if (bpc->pc.index + i > 17) {
+                    next = i - 17;
+                    state = 0;
+                    break;
+                } else {
+                    bpc->pc.state64 = 0;
+                    goto restart;
+                }
+            } else if (bpc->pc.frame_start_found)
+                bpc->pc.frame_start_found++;
         }
         bpc->pc.state64 = state;
     } else {
@@ -68,7 +84,9 @@ static int bmp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
             bpc->remaining_size -= i;
             if (bpc->remaining_size)
                 goto flush;
-            next = i;
+
+            bpc->pc.frame_start_found = 0;
+            goto restart;
         }
     }
 
@@ -76,7 +94,10 @@ flush:
     if (ff_combine_frame(&bpc->pc, next, &buf, &buf_size) < 0)
         return buf_size;
 
-    bpc->pc.frame_start_found = 0;
+    if (next != END_NOT_FOUND && next < 0)
+        bpc->pc.frame_start_found = FFMAX(bpc->pc.frame_start_found - i - 1, 0);
+    else
+        bpc->pc.frame_start_found = 0;
 
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
diff --git a/libavcodec/bmpenc.c b/libavcodec/bmpenc.c
index 915c396..e829d68 100644
--- a/libavcodec/bmpenc.c
+++ b/libavcodec/bmpenc.c
@@ -3,24 +3,25 @@
  * Copyright (c) 2006, 2007 Michel Bardiaux
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "bmp.h"
@@ -32,6 +33,9 @@ static const uint32_t rgb444_masks[]  = { 0x0F00, 0x00F0, 0x000F };
 
 static av_cold int bmp_encode_init(AVCodecContext *avctx){
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_BGRA:
+        avctx->bits_per_coded_sample = 32;
+        break;
     case AV_PIX_FMT_BGR24:
         avctx->bits_per_coded_sample = 24;
         break;
@@ -53,7 +57,7 @@ static av_cold int bmp_encode_init(AVCodecContext *avctx){
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     return 0;
@@ -65,6 +69,7 @@ static int bmp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame * const p = pict;
     int n_bytes_image, n_bytes_per_row, n_bytes, i, n, hsize, ret;
     const uint32_t *pal = NULL;
+    uint32_t palette256[256];
     int pad_bytes_per_row, pal_entries = 0, compression = BMP_RGB;
     int bit_count = avctx->bits_per_coded_sample;
     uint8_t *ptr, *buf;
@@ -91,7 +96,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     case AV_PIX_FMT_RGB4_BYTE:
     case AV_PIX_FMT_BGR4_BYTE:
     case AV_PIX_FMT_GRAY8:
-        avpriv_set_systematic_pal2((uint32_t*)p->data[1], avctx->pix_fmt);
+        av_assert1(bit_count == 8);
+        avpriv_set_systematic_pal2(palette256, avctx->pix_fmt);
+        pal = palette256;
+        break;
     case AV_PIX_FMT_PAL8:
         pal = (uint32_t *)p->data[1];
         break;
@@ -110,10 +118,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define SIZE_BITMAPINFOHEADER 40
     hsize = SIZE_BITMAPFILEHEADER + SIZE_BITMAPINFOHEADER + (pal_entries << 2);
     n_bytes = n_bytes_image + hsize;
-    if ((ret = ff_alloc_packet(pkt, n_bytes)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, n_bytes, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
     bytestream_put_byte(&buf, 'B');                   // BITMAPFILEHEADER.bfType
     bytestream_put_byte(&buf, 'M');                   // do.
@@ -165,8 +171,8 @@ AVCodec ff_bmp_encoder = {
     .init           = bmp_encode_init,
     .encode2        = bmp_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_BGR24,
-        AV_PIX_FMT_RGB555, AV_PIX_FMT_RGB444, AV_PIX_FMT_RGB565,
+        AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24,
+        AV_PIX_FMT_RGB565, AV_PIX_FMT_RGB555, AV_PIX_FMT_RGB444,
         AV_PIX_FMT_RGB8, AV_PIX_FMT_BGR8, AV_PIX_FMT_RGB4_BYTE, AV_PIX_FMT_BGR4_BYTE, AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_MONOBLACK,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/bmvaudio.c b/libavcodec/bmvaudio.c
index 8b4bd78..b1587ab 100644
--- a/libavcodec/bmvaudio.c
+++ b/libavcodec/bmvaudio.c
@@ -2,20 +2,20 @@
  * Discworld II BMV audio decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,10 +58,8 @@ static int bmv_aud_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = total_blocks * 32;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples = (int16_t *)frame->data[0];
 
     for (blocks = 0; blocks < total_blocks; blocks++) {
diff --git a/libavcodec/bmvvideo.c b/libavcodec/bmvvideo.c
index 698bc56..679b142 100644
--- a/libavcodec/bmvvideo.c
+++ b/libavcodec/bmvvideo.c
@@ -2,23 +2,24 @@
  * Discworld II BMV video decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 
 #include "avcodec.h"
@@ -50,7 +51,7 @@ typedef struct BMVDecContext {
     const uint8_t *stream;
 } BMVDecContext;
 
-#define NEXT_BYTE(v) v = forward ? v + 1 : v - 1;
+#define NEXT_BYTE(v) (v) = forward ? (v) + 1 : (v) - 1;
 
 static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame, int frame_off)
 {
@@ -100,11 +101,13 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
         }
         if (!(val & 0xC)) {
             for (;;) {
+                if(shift>22)
+                    return -1;
                 if (!read_two_nibbles) {
                     if (src < source || src >= source_end)
                         return AVERROR_INVALIDDATA;
                     shift += 2;
-                    val |= *src << shift;
+                    val |= (unsigned)*src << shift;
                     if (*src & 0xC)
                         break;
                 }
@@ -133,6 +136,7 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
         }
         advance_mode = val & 1;
         len = (val >> 1) - 1;
+        av_assert0(len>0);
         mode += 1 + advance_mode;
         if (mode >= 4)
             mode -= 3;
@@ -185,8 +189,6 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
                 memset(dst, val, len);
             }
             break;
-        default:
-            break;
         }
         if (dst == dst_end)
             return 0;
@@ -226,7 +228,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         }
         for (i = 0; i < 256; i++)
-            c->pal[i] = bytestream_get_be24(&c->stream);
+            c->pal[i] = 0xFFU << 24 | bytestream_get_be24(&c->stream);
     }
     if (type & BMV_SCROLL) {
         if (c->stream - pkt->data > pkt->size - 2) {
@@ -240,10 +242,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         scr_off = 0;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (decode_bmv_frame(c->stream, pkt->size - (c->stream - pkt->data), c->frame, scr_off)) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding frame data\n");
@@ -275,6 +275,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    if (avctx->width != SCREEN_WIDE || avctx->height != SCREEN_HIGH) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid dimension %dx%d\n", avctx->width, avctx->height);
+        return AVERROR_INVALIDDATA;
+    }
+
     c->frame = c->frame_base + 640;
 
     return 0;
diff --git a/libavcodec/brenderpix.c b/libavcodec/brenderpix.c
index a4b4c87..0556858 100644
--- a/libavcodec/brenderpix.c
+++ b/libavcodec/brenderpix.c
@@ -2,20 +2,20 @@
  * BRender PIX (.pix) image decoder
  * Copyright (c) 2012 Aleksi Nurmi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -134,7 +134,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 {
     AVFrame *frame = data;
 
-    int ret, i, j;
+    int ret, i;
     GetByteContext gb;
 
     unsigned int bytes_pp;
@@ -142,6 +142,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     unsigned int chunk_type;
     unsigned int data_len;
     unsigned int bytes_per_scanline;
+    unsigned int bytes_left;
     PixHeader hdr;
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
@@ -168,7 +169,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     ret = pix_decode_header(&hdr, &gb);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Invalid header length.\n");
-        return AVERROR_INVALIDDATA;
+        return ret;
     }
     switch (hdr.format) {
     case 3:
@@ -187,7 +188,10 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->pix_fmt = AV_PIX_FMT_RGB24;
         bytes_pp = 3;
         break;
-    case 7: // XRGB
+    case 7:
+        avctx->pix_fmt = AV_PIX_FMT_0RGB;
+        bytes_pp = 4;
+        break;
     case 8: // ARGB
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
         bytes_pp = 4;
@@ -219,22 +223,21 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ret = pix_decode_header(&palhdr, &gb);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid palette header length.\n");
-            return AVERROR_INVALIDDATA;
+            return ret;
         }
         if (palhdr.format != 7)
             avpriv_request_sample(avctx, "Palette not in RGB format");
 
         chunk_type = bytestream2_get_be32(&gb);
         data_len = bytestream2_get_be32(&gb);
-        if (chunk_type != IMAGE_DATA_CHUNK ||
-            bytestream2_get_bytes_left(&gb) < data_len) {
+        bytestream2_skip(&gb, 8);
+        if (chunk_type != IMAGE_DATA_CHUNK || data_len != 1032 ||
+            bytestream2_get_bytes_left(&gb) < 1032) {
             av_log(avctx, AV_LOG_ERROR, "Invalid palette data.\n");
             return AVERROR_INVALIDDATA;
         }
-
         // palette data is surrounded by 8 null bytes (both top and bottom)
-        bytestream2_skip(&gb, 8);
-        // convert to machine endian format (ARGB)
+        // convert 0RGB to machine endian format (ARGB32)
         for (i = 0; i < 256; ++i)
             *pal_out++ = (0xFFU << 24) | bytestream2_get_be32u(&gb);
         bytestream2_skip(&gb, 8);
@@ -259,9 +262,10 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // read the image data to the buffer
     bytes_per_scanline = bytes_pp * hdr.width;
-    if (chunk_type != IMAGE_DATA_CHUNK ||
-        data_len < bytes_per_scanline * hdr.height ||
-        bytestream2_get_bytes_left(&gb) < data_len) {
+    bytes_left = bytestream2_get_bytes_left(&gb);
+
+    if (chunk_type != IMAGE_DATA_CHUNK || data_len != bytes_left ||
+        bytes_left / bytes_per_scanline < hdr.height) {
         av_log(avctx, AV_LOG_ERROR, "Invalid image data.\n");
         return AVERROR_INVALIDDATA;
     }
@@ -271,12 +275,6 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         bytes_per_scanline,
                         bytes_per_scanline, hdr.height);
 
-    // make alpha opaque for XRGB
-    if (hdr.format == 7)
-        for (j = 0; j < frame->height; j++)
-            for (i = 0; i < frame->linesize[0]; i += 4)
-                frame->data[0][j * frame->linesize[0] + i] = 0xFF;
-
     frame->pict_type = AV_PICTURE_TYPE_I;
     frame->key_frame = 1;
     *got_frame = 1;
diff --git a/libavcodec/bsf.c b/libavcodec/bsf.c
index f6f894b..5081307 100644
--- a/libavcodec/bsf.c
+++ b/libavcodec/bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,8 @@
 #include "libavutil/log.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 
 #include "avcodec.h"
 #include "bsf.h"
@@ -182,7 +184,9 @@ void av_bsf_flush(AVBSFContext *ctx)
 
 int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt)
 {
-    if (!pkt || !pkt->data) {
+    int ret;
+
+    if (!pkt || (!pkt->data && !pkt->side_data_elems)) {
         ctx->internal->eof = 1;
         return 0;
     }
@@ -196,6 +200,9 @@ int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt)
         ctx->internal->buffer_pkt->side_data_elems)
         return AVERROR(EAGAIN);
 
+    ret = av_packet_make_refcounted(pkt);
+    if (ret < 0)
+        return ret;
     av_packet_move_ref(ctx->internal->buffer_pkt, pkt);
 
     return 0;
@@ -243,3 +250,318 @@ int ff_bsf_get_packet_ref(AVBSFContext *ctx, AVPacket *pkt)
 
     return 0;
 }
+
+typedef struct BSFListContext {
+    const AVClass *class;
+
+    AVBSFContext **bsfs;
+    int nb_bsfs;
+
+    unsigned idx;           // index of currently processed BSF
+    unsigned flushed_idx;   // index of BSF being flushed
+
+    char * item_name;
+} BSFListContext;
+
+
+static int bsf_list_init(AVBSFContext *bsf)
+{
+    BSFListContext *lst = bsf->priv_data;
+    int ret, i;
+    const AVCodecParameters *cod_par = bsf->par_in;
+    AVRational tb = bsf->time_base_in;
+
+    for (i = 0; i < lst->nb_bsfs; ++i) {
+        ret = avcodec_parameters_copy(lst->bsfs[i]->par_in, cod_par);
+        if (ret < 0)
+            goto fail;
+
+        lst->bsfs[i]->time_base_in = tb;
+
+        ret = av_bsf_init(lst->bsfs[i]);
+        if (ret < 0)
+            goto fail;
+
+        cod_par = lst->bsfs[i]->par_out;
+        tb = lst->bsfs[i]->time_base_out;
+    }
+
+    bsf->time_base_out = tb;
+    ret = avcodec_parameters_copy(bsf->par_out, cod_par);
+
+fail:
+    return ret;
+}
+
+static int bsf_list_filter(AVBSFContext *bsf, AVPacket *out)
+{
+    BSFListContext *lst = bsf->priv_data;
+    int ret;
+
+    if (!lst->nb_bsfs)
+        return ff_bsf_get_packet_ref(bsf, out);
+
+    while (1) {
+        if (lst->idx > lst->flushed_idx) {
+            ret = av_bsf_receive_packet(lst->bsfs[lst->idx-1], out);
+            if (ret == AVERROR(EAGAIN)) {
+                /* no more packets from idx-1, try with previous */
+                ret = 0;
+                lst->idx--;
+                continue;
+            } else if (ret == AVERROR_EOF) {
+                /* filter idx-1 is done, continue with idx...nb_bsfs */
+                lst->flushed_idx = lst->idx;
+                continue;
+            }else if (ret < 0) {
+                /* filtering error */
+                break;
+            }
+        } else {
+            ret = ff_bsf_get_packet_ref(bsf, out);
+            if (ret == AVERROR_EOF) {
+                lst->idx = lst->flushed_idx;
+            } else if (ret < 0)
+                break;
+        }
+
+        if (lst->idx < lst->nb_bsfs) {
+            AVPacket *pkt;
+            if (ret == AVERROR_EOF && lst->idx == lst->flushed_idx) {
+                /* ff_bsf_get_packet_ref returned EOF and idx is first
+                 * filter of yet not flushed filter chain */
+                pkt = NULL;
+            } else {
+                pkt = out;
+            }
+            ret = av_bsf_send_packet(lst->bsfs[lst->idx], pkt);
+            if (ret < 0)
+                break;
+            lst->idx++;
+        } else {
+            /* The end of filter chain, break to return result */
+            break;
+        }
+    }
+
+    if (ret < 0)
+        av_packet_unref(out);
+
+    return ret;
+}
+
+static void bsf_list_flush(AVBSFContext *bsf)
+{
+    BSFListContext *lst = bsf->priv_data;
+
+    for (int i = 0; i < lst->nb_bsfs; i++)
+        av_bsf_flush(lst->bsfs[i]);
+    lst->idx = lst->flushed_idx = 0;
+}
+
+static void bsf_list_close(AVBSFContext *bsf)
+{
+    BSFListContext *lst = bsf->priv_data;
+    int i;
+
+    for (i = 0; i < lst->nb_bsfs; ++i)
+        av_bsf_free(&lst->bsfs[i]);
+    av_freep(&lst->bsfs);
+    av_freep(&lst->item_name);
+}
+
+static const char *bsf_list_item_name(void *ctx)
+{
+    static const char *null_filter_name = "null";
+    AVBSFContext *bsf_ctx = ctx;
+    BSFListContext *lst = bsf_ctx->priv_data;
+
+    if (!lst->nb_bsfs)
+        return null_filter_name;
+
+    if (!lst->item_name) {
+        int i;
+        AVBPrint bp;
+        av_bprint_init(&bp, 16, 128);
+
+        av_bprintf(&bp, "bsf_list(");
+        for (i = 0; i < lst->nb_bsfs; i++)
+            av_bprintf(&bp, i ? ",%s" : "%s", lst->bsfs[i]->filter->name);
+        av_bprintf(&bp, ")");
+
+        av_bprint_finalize(&bp, &lst->item_name);
+    }
+
+    return lst->item_name;
+}
+
+static const AVClass bsf_list_class = {
+        .class_name = "bsf_list",
+        .item_name  = bsf_list_item_name,
+        .version    = LIBAVUTIL_VERSION_INT,
+};
+
+const AVBitStreamFilter ff_list_bsf = {
+        .name           = "bsf_list",
+        .priv_data_size = sizeof(BSFListContext),
+        .priv_class     = &bsf_list_class,
+        .init           = bsf_list_init,
+        .filter         = bsf_list_filter,
+        .flush          = bsf_list_flush,
+        .close          = bsf_list_close,
+};
+
+struct AVBSFList {
+    AVBSFContext **bsfs;
+    int nb_bsfs;
+};
+
+AVBSFList *av_bsf_list_alloc(void)
+{
+    return av_mallocz(sizeof(AVBSFList));
+}
+
+void av_bsf_list_free(AVBSFList **lst)
+{
+    int i;
+
+    if (!*lst)
+        return;
+
+    for (i = 0; i < (*lst)->nb_bsfs; ++i)
+        av_bsf_free(&(*lst)->bsfs[i]);
+    av_free((*lst)->bsfs);
+    av_freep(lst);
+}
+
+int av_bsf_list_append(AVBSFList *lst, AVBSFContext *bsf)
+{
+    return av_dynarray_add_nofree(&lst->bsfs, &lst->nb_bsfs, bsf);
+}
+
+int av_bsf_list_append2(AVBSFList *lst, const char *bsf_name, AVDictionary ** options)
+{
+    int ret;
+    const AVBitStreamFilter *filter;
+    AVBSFContext *bsf;
+
+    filter = av_bsf_get_by_name(bsf_name);
+    if (!filter)
+        return AVERROR_BSF_NOT_FOUND;
+
+    ret = av_bsf_alloc(filter, &bsf);
+    if (ret < 0)
+        return ret;
+
+    if (options) {
+        ret = av_opt_set_dict2(bsf, options, AV_OPT_SEARCH_CHILDREN);
+        if (ret < 0)
+            goto end;
+    }
+
+    ret = av_bsf_list_append(lst, bsf);
+
+end:
+    if (ret < 0)
+        av_bsf_free(&bsf);
+
+    return ret;
+}
+
+int av_bsf_list_finalize(AVBSFList **lst, AVBSFContext **bsf)
+{
+    int ret = 0;
+    BSFListContext *ctx;
+
+    if ((*lst)->nb_bsfs == 1) {
+        *bsf = (*lst)->bsfs[0];
+        av_freep(&(*lst)->bsfs);
+        (*lst)->nb_bsfs = 0;
+        goto end;
+    }
+
+    ret = av_bsf_alloc(&ff_list_bsf, bsf);
+    if (ret < 0)
+        return ret;
+
+    ctx = (*bsf)->priv_data;
+
+    ctx->bsfs = (*lst)->bsfs;
+    ctx->nb_bsfs = (*lst)->nb_bsfs;
+
+end:
+    av_freep(lst);
+    return ret;
+}
+
+static int bsf_parse_single(const char *str, AVBSFList *bsf_lst)
+{
+    char *bsf_name, *bsf_options_str, *buf;
+    AVDictionary *bsf_options = NULL;
+    int ret = 0;
+
+    if (!(buf = av_strdup(str)))
+        return AVERROR(ENOMEM);
+
+    bsf_name = av_strtok(buf, "=", &bsf_options_str);
+    if (!bsf_name) {
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    if (bsf_options_str) {
+        ret = av_dict_parse_string(&bsf_options, bsf_options_str, "=", ":", 0);
+        if (ret < 0)
+            goto end;
+    }
+
+    ret = av_bsf_list_append2(bsf_lst, bsf_name, &bsf_options);
+
+    av_dict_free(&bsf_options);
+end:
+    av_free(buf);
+    return ret;
+}
+
+int av_bsf_list_parse_str(const char *str, AVBSFContext **bsf_lst)
+{
+    AVBSFList *lst;
+    char *bsf_str, *buf, *dup, *saveptr;
+    int ret;
+
+    if (!str)
+        return av_bsf_get_null_filter(bsf_lst);
+
+    lst = av_bsf_list_alloc();
+    if (!lst)
+        return AVERROR(ENOMEM);
+
+    if (!(dup = buf = av_strdup(str))) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    while (1) {
+        bsf_str = av_strtok(buf, ",", &saveptr);
+        if (!bsf_str)
+            break;
+
+        ret = bsf_parse_single(bsf_str, lst);
+        if (ret < 0)
+            goto end;
+
+        buf = NULL;
+    }
+
+    ret = av_bsf_list_finalize(&lst, bsf_lst);
+end:
+    if (ret < 0)
+        av_bsf_list_free(&lst);
+    av_free(dup);
+    return ret;
+}
+
+int av_bsf_get_null_filter(AVBSFContext **bsf)
+{
+    return av_bsf_alloc(&ff_list_bsf, bsf);
+}
diff --git a/libavcodec/bsf.h b/libavcodec/bsf.h
index 39301a2..af035ee 100644
--- a/libavcodec/bsf.h
+++ b/libavcodec/bsf.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bswapdsp.c b/libavcodec/bswapdsp.c
index 6700cfd..a6e1ec0 100644
--- a/libavcodec/bswapdsp.c
+++ b/libavcodec/bswapdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bswapdsp.h b/libavcodec/bswapdsp.h
index fd10a88..4d19092 100644
--- a/libavcodec/bswapdsp.h
+++ b/libavcodec/bswapdsp.h
@@ -1,23 +1,23 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_BSWAP_BUF_H
-#define AVCODEC_BSWAP_BUF_H
+#ifndef AVCODEC_BSWAPDSP_H
+#define AVCODEC_BSWAPDSP_H
 
 #include <stdint.h>
 
@@ -29,4 +29,4 @@ typedef struct BswapDSPContext {
 void ff_bswapdsp_init(BswapDSPContext *c);
 void ff_bswapdsp_init_x86(BswapDSPContext *c);
 
-#endif /* AVCODEC_BSWAP_BUF_H */
+#endif /* AVCODEC_BSWAPDSP_H */
diff --git a/libavcodec/bytestream.h b/libavcodec/bytestream.h
index cb3573b..7be7fc2 100644
--- a/libavcodec/bytestream.h
+++ b/libavcodec/bytestream.h
@@ -3,20 +3,20 @@
  * copyright (c) 2006 Baptiste Coudurier <baptiste.coudurier@free.fr>
  * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
@@ -93,7 +94,7 @@ DEF(unsigned int, be24, 3, AV_RB24, AV_WB24)
 DEF(unsigned int, be16, 2, AV_RB16, AV_WB16)
 DEF(unsigned int, byte, 1, AV_RB8 , AV_WB8)
 
-#if HAVE_BIGENDIAN
+#if AV_HAVE_BIGENDIAN
 #   define bytestream2_get_ne16  bytestream2_get_be16
 #   define bytestream2_get_ne24  bytestream2_get_be24
 #   define bytestream2_get_ne32  bytestream2_get_be32
@@ -133,6 +134,7 @@ static av_always_inline void bytestream2_init(GetByteContext *g,
                                               const uint8_t *buf,
                                               int buf_size)
 {
+    av_assert0(buf_size >= 0);
     g->buffer       = buf;
     g->buffer_start = buf;
     g->buffer_end   = buf + buf_size;
@@ -142,6 +144,7 @@ static av_always_inline void bytestream2_init_writer(PutByteContext *p,
                                                      uint8_t *buf,
                                                      int buf_size)
 {
+    av_assert0(buf_size >= 0);
     p->buffer       = buf;
     p->buffer_start = buf;
     p->buffer_end   = buf + buf_size;
diff --git a/libavcodec/c93.c b/libavcodec/c93.c
index e751483..b708659 100644
--- a/libavcodec/c93.c
+++ b/libavcodec/c93.c
@@ -2,20 +2,20 @@
  * Interplay C93 video decoder
  * Copyright (c) 2007 Anssi Hannula <anssi.hannula@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -133,12 +133,13 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *out;
     int stride, ret, i, x, y, b, bt = 0;
 
+    if ((ret = ff_set_dimensions(avctx, WIDTH, HEIGHT)) < 0)
+        return ret;
+
     c93->currentpic ^= 1;
 
-    if ((ret = ff_reget_buffer(avctx, newpic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, newpic)) < 0)
         return ret;
-    }
 
     stride = newpic->linesize[0];
 
@@ -176,7 +177,14 @@ static int decode_frame(AVCodecContext *avctx, void *data,
             case C93_4X4_FROM_PREV:
                 for (j = 0; j < 8; j += 4) {
                     for (i = 0; i < 8; i += 4) {
-                        offset = bytestream2_get_le16(&gb);
+                        int offset = bytestream2_get_le16(&gb);
+                        int from_x = offset % WIDTH;
+                        int from_y = offset / WIDTH;
+                        if (block_type == C93_4X4_FROM_CURR && from_y == y+j &&
+                            (FFABS(from_x - x-i) < 4 || FFABS(from_x - x-i) > WIDTH-4)) {
+                            avpriv_request_sample(avctx, "block overlap %d %d %d %d", from_x, x+i, from_y, y+j);
+                            return AVERROR_INVALIDDATA;
+                        }
                         if ((ret = copy_block(avctx, &out[j*stride+i],
                                               copy_from, offset, 4, stride)) < 0)
                             return ret;
@@ -236,7 +244,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (b & C93_HAS_PALETTE) {
         uint32_t *palette = (uint32_t *) newpic->data[1];
         for (i = 0; i < 256; i++) {
-            palette[i] = bytestream2_get_be24(&gb);
+            palette[i] = 0xFFU << 24 | bytestream2_get_be24(&gb);
         }
         newpic->palette_has_changed = 1;
     } else {
diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c
index 5c59003..e51139d 100644
--- a/libavcodec/cabac.c
+++ b/libavcodec/cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,11 +27,12 @@
 #include <string.h>
 
 #include "libavutil/common.h"
+#include "libavutil/timer.h"
 
 #include "cabac.h"
 #include "cabac_functions.h"
 
-const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63] = {
+DECLARE_ASM_ALIGNED(1, const uint8_t, ff_h264_cabac_tables)[512 + 4*2*64 + 4*64 + 63] = {
     9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
     4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
@@ -160,7 +161,20 @@ const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63] = {
 /**
  * @param buf_size size of buf in bits
  */
-void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
+void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size){
+    init_put_bits(&c->pb, buf, buf_size);
+
+    c->low= 0;
+    c->range= 0x1FE;
+    c->outstanding_count= 0;
+    c->pb.bit_left++; //avoids firstBitFlag
+}
+
+/**
+ *
+ * @param buf_size size of buf in bits
+ */
+int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
     c->bytestream_start=
     c->bytestream= buf;
     c->bytestream_end= buf + buf_size;
@@ -168,9 +182,21 @@ void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
 #if CABAC_BITS == 16
     c->low =  (*c->bytestream++)<<18;
     c->low+=  (*c->bytestream++)<<10;
+    // Keep our fetches on a 2-byte boundary as this should avoid ever having to
+    // do unaligned loads if the compiler (or asm) optimises the double byte
+    // load into a single instruction
+    if(((uintptr_t)c->bytestream & 1) == 0) {
+        c->low += (1 << 9);
+    }
+    else {
+        c->low += ((*c->bytestream++) << 2) + 2;
+    }
 #else
     c->low =  (*c->bytestream++)<<10;
-#endif
     c->low+= ((*c->bytestream++)<<2) + 2;
+#endif
     c->range= 0x1FE;
+    if ((c->range<<(CABAC_BITS+1)) < c->low)
+        return AVERROR_INVALIDDATA;
+    return 0;
 }
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 40eefed..1bf1c62 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,11 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
 typedef struct CABACContext{
     int low;
     int range;
+    int outstanding_count;
     const uint8_t *bytestream_start;
     const uint8_t *bytestream;
     const uint8_t *bytestream_end;
+    PutBitContext pb;
 }CABACContext;
 
-void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
+void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
+int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
 
 #endif /* AVCODEC_CABAC_H */
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index beb5016..bb2b421 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 #include "cabac.h"
 #include "config.h"
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
 #if ARCH_AARCH64
 #   include "aarch64/cabac.h"
 #endif
@@ -41,12 +45,16 @@
 #if ARCH_X86
 #   include "x86/cabac.h"
 #endif
+#if ARCH_MIPS
+#   include "mips/cabac.h"
+#endif
 
 static const uint8_t * const ff_h264_norm_shift = ff_h264_cabac_tables + H264_NORM_SHIFT_OFFSET;
 static const uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET;
 static const uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
 static const uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
 
+#if !defined(get_cabac_bypass) || !defined(get_cabac_terminate)
 static void refill(CABACContext *c){
 #if CABAC_BITS == 16
         c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
@@ -54,10 +62,14 @@ static void refill(CABACContext *c){
         c->low+= c->bytestream[0]<<1;
 #endif
     c->low -= CABAC_MASK;
+#if !UNCHECKED_BITSTREAM_READER
     if (c->bytestream < c->bytestream_end)
+#endif
         c->bytestream += CABAC_BITS / 8;
 }
+#endif
 
+#ifndef get_cabac_terminate
 static inline void renorm_cabac_decoder_once(CABACContext *c){
     int shift= (uint32_t)(c->range - 0x100)>>31;
     c->range<<= shift;
@@ -65,13 +77,18 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
     if(!(c->low & CABAC_MASK))
         refill(c);
 }
+#endif
 
 #ifndef get_cabac_inline
 static void refill2(CABACContext *c){
-    int i, x;
-
+    int i;
+    unsigned x;
+#if !HAVE_FAST_CLZ
     x= c->low ^ (c->low-1);
     i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
+#else
+    i = ff_ctz(c->low) - CABAC_BITS;
+#endif
 
     x= -CABAC_MASK;
 
@@ -82,10 +99,14 @@ static void refill2(CABACContext *c){
 #endif
 
     c->low += x<<i;
+#if !UNCHECKED_BITSTREAM_READER
     if (c->bytestream < c->bytestream_end)
+#endif
         c->bytestream += CABAC_BITS/8;
 }
+#endif
 
+#ifndef get_cabac_inline
 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
     int s = *state;
     int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
@@ -156,6 +177,7 @@ static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
 /**
  * @return the number of bytes read or 0 if no end
  */
+#ifndef get_cabac_terminate
 static int av_unused get_cabac_terminate(CABACContext *c){
     c->range -= 2;
     if(c->low < c->range<<(CABAC_BITS+1)){
@@ -165,11 +187,13 @@ static int av_unused get_cabac_terminate(CABACContext *c){
         return c->bytestream - c->bytestream_start;
     }
 }
+#endif
 
 /**
  * Skip @p n bytes and reset the decoder.
  * @return the address of the first skipped byte or NULL if there's less than @p n bytes left
  */
+#ifndef skip_bytes
 static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
     const uint8_t *ptr = c->bytestream;
 
@@ -181,9 +205,11 @@ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
 #endif
     if ((int) (c->bytestream_end - ptr) < n)
         return NULL;
-    ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n);
+    if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
+        return NULL;
 
     return ptr;
 }
+#endif
 
 #endif /* AVCODEC_CABAC_FUNCTIONS_H */
diff --git a/libavcodec/canopus.c b/libavcodec/canopus.c
index 729e7ef..ea6cc64 100644
--- a/libavcodec/canopus.c
+++ b/libavcodec/canopus.c
@@ -2,20 +2,20 @@
  * Canopus common routines
  * Copyright (c) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/canopus.h b/libavcodec/canopus.h
index 9e5702d..faa1e8d 100644
--- a/libavcodec/canopus.h
+++ b/libavcodec/canopus.h
@@ -2,20 +2,20 @@
  * Canopus common routines
  * Copyright (c) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c
index 3230913..ba2b6ef 100644
--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "golomb.h"
 #include "h264chroma.h"
 #include "idctdsp.h"
@@ -75,15 +75,16 @@ static inline int get_bs(cavs_vector *mvP, cavs_vector *mvQ, int b)
 {
     if ((mvP->ref == REF_INTRA) || (mvQ->ref == REF_INTRA))
         return 2;
-    if ((abs(mvP->x - mvQ->x) >= 4) || (abs(mvP->y - mvQ->y) >= 4))
+    if((abs(mvP->x - mvQ->x) >= 4) ||
+       (abs(mvP->y - mvQ->y) >= 4) ||
+       (mvP->ref != mvQ->ref))
         return 1;
     if (b) {
         mvP += MV_BWD_OFFS;
         mvQ += MV_BWD_OFFS;
-        if ((abs(mvP->x - mvQ->x) >= 4) || (abs(mvP->y - mvQ->y) >= 4))
-            return 1;
-    } else {
-        if (mvP->ref != mvQ->ref)
+        if((abs(mvP->x - mvQ->x) >= 4) ||
+           (abs(mvP->y - mvQ->y) >= 4) ||
+           (mvP->ref != mvQ->ref))
             return 1;
     }
     return 0;
@@ -148,6 +149,8 @@ void ff_cavs_filter(AVSContext *h, enum cavs_mb mb_type)
                 qp_avg = (h->qp + h->left_qp + 1) >> 1;
                 SET_PARAMS;
                 h->cdsp.cavs_filter_lv(h->cy, h->l_stride, alpha, beta, tc, bs[0], bs[1]);
+                qp_avg = (ff_cavs_chroma_qp[h->qp] + ff_cavs_chroma_qp[h->left_qp] + 1) >> 1;
+                SET_PARAMS;
                 h->cdsp.cavs_filter_cv(h->cu, h->c_stride, alpha, beta, tc, bs[0], bs[1]);
                 h->cdsp.cavs_filter_cv(h->cv, h->c_stride, alpha, beta, tc, bs[0], bs[1]);
             }
@@ -160,6 +163,8 @@ void ff_cavs_filter(AVSContext *h, enum cavs_mb mb_type)
                 qp_avg = (h->qp + h->top_qp[h->mbx] + 1) >> 1;
                 SET_PARAMS;
                 h->cdsp.cavs_filter_lh(h->cy, h->l_stride, alpha, beta, tc, bs[4], bs[5]);
+                qp_avg = (ff_cavs_chroma_qp[h->qp] + ff_cavs_chroma_qp[h->top_qp[h->mbx]] + 1) >> 1;
+                SET_PARAMS;
                 h->cdsp.cavs_filter_ch(h->cu, h->c_stride, alpha, beta, tc, bs[4], bs[5]);
                 h->cdsp.cavs_filter_ch(h->cv, h->c_stride, alpha, beta, tc, bs[4], bs[5]);
             }
@@ -233,9 +238,14 @@ void ff_cavs_load_intra_pred_chroma(AVSContext *h)
     /* extend borders by one pixel */
     h->left_border_u[9]              = h->left_border_u[8];
     h->left_border_v[9]              = h->left_border_v[8];
-    h->top_border_u[h->mbx * 10 + 9] = h->top_border_u[h->mbx * 10 + 8];
-    h->top_border_v[h->mbx * 10 + 9] = h->top_border_v[h->mbx * 10 + 8];
-    if (h->mbx && h->mby) {
+    if(h->flags & C_AVAIL) {
+        h->top_border_u[h->mbx*10 + 9] = h->top_border_u[h->mbx*10 + 11];
+        h->top_border_v[h->mbx*10 + 9] = h->top_border_v[h->mbx*10 + 11];
+    } else {
+        h->top_border_u[h->mbx * 10 + 9] = h->top_border_u[h->mbx * 10 + 8];
+        h->top_border_v[h->mbx * 10 + 9] = h->top_border_v[h->mbx * 10 + 8];
+    }
+    if((h->flags & A_AVAIL) && (h->flags & B_AVAIL)) {
         h->top_border_u[h->mbx * 10] = h->left_border_u[0] = h->topleft_border_u;
         h->top_border_v[h->mbx * 10] = h->left_border_v[0] = h->topleft_border_v;
     } else {
@@ -527,8 +537,7 @@ void ff_cavs_inter(AVSContext *h, enum cavs_mb mb_type)
 static inline void scale_mv(AVSContext *h, int *d_x, int *d_y,
                             cavs_vector *src, int distp)
 {
-    int den = h->scale_den[src->ref];
-
+    int64_t den = h->scale_den[FFMAX(src->ref, 0)];
     *d_x = (src->x * distp * den + 256 + FF_SIGNBIT(src->x)) >> 9;
     *d_y = (src->y * distp * den + 256 + FF_SIGNBIT(src->y)) >> 9;
 }
@@ -574,7 +583,7 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
 
     mvP->ref  = ref;
     mvP->dist = h->dist[mvP->ref];
-    if (mvC->ref == NOT_AVAIL)
+    if (mvC->ref == NOT_AVAIL || (nP == MV_FWD_X3) || (nP == MV_BWD_X3 ))
         mvC = &h->mv[nP - 5];  // set to top-left (mvD)
     if (mode == MV_PRED_PSKIP &&
         (mvA->ref == NOT_AVAIL ||
@@ -603,8 +612,15 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
         mv_pred_median(h, mvP, mvA, mvB, mvC);
 
     if (mode < MV_PRED_PSKIP) {
-        mvP->x += get_se_golomb(&h->bc);
-        mvP->y += get_se_golomb(&h->bc);
+        int mx = get_se_golomb(&h->gb) + (unsigned)mvP->x;
+        int my = get_se_golomb(&h->gb) + (unsigned)mvP->y;
+
+        if (mx != (int16_t)mx || my != (int16_t)my) {
+            av_log(h->avctx, AV_LOG_ERROR, "MV %d %d out of supported range\n", mx, my);
+        } else {
+            mvP->x = mx;
+            mvP->y = my;
+        }
     }
     set_mvs(mvP, size);
 }
@@ -704,7 +720,7 @@ int ff_cavs_next_mb(AVSContext *h)
  *
  ****************************************************************************/
 
-void ff_cavs_init_pic(AVSContext *h)
+int ff_cavs_init_pic(AVSContext *h)
 {
     int i;
 
@@ -725,6 +741,8 @@ void ff_cavs_init_pic(AVSContext *h)
     h->luma_scan[3]   = 8 * h->l_stride + 8;
     h->mbx            = h->mby = h->mbidx = 0;
     h->flags          = 0;
+
+    return 0;
 }
 
 /*****************************************************************************
@@ -738,29 +756,46 @@ void ff_cavs_init_pic(AVSContext *h)
  * this data has to be stored for one complete row of macroblocks
  * and this storage space is allocated here
  */
-void ff_cavs_init_top_lines(AVSContext *h)
+int ff_cavs_init_top_lines(AVSContext *h)
 {
     /* alloc top line of predictors */
     h->top_qp       = av_mallocz(h->mb_width);
-    h->top_mv[0]    = av_mallocz((h->mb_width * 2 + 1) * sizeof(cavs_vector));
-    h->top_mv[1]    = av_mallocz((h->mb_width * 2 + 1) * sizeof(cavs_vector));
-    h->top_pred_Y   = av_mallocz(h->mb_width * 2 * sizeof(*h->top_pred_Y));
-    h->top_border_y = av_mallocz((h->mb_width + 1) * 16);
-    h->top_border_u = av_mallocz(h->mb_width * 10);
-    h->top_border_v = av_mallocz(h->mb_width * 10);
+    h->top_mv[0]    = av_mallocz_array(h->mb_width * 2 + 1,  sizeof(cavs_vector));
+    h->top_mv[1]    = av_mallocz_array(h->mb_width * 2 + 1,  sizeof(cavs_vector));
+    h->top_pred_Y   = av_mallocz_array(h->mb_width * 2,  sizeof(*h->top_pred_Y));
+    h->top_border_y = av_mallocz_array(h->mb_width + 1,  16);
+    h->top_border_u = av_mallocz_array(h->mb_width,  10);
+    h->top_border_v = av_mallocz_array(h->mb_width,  10);
 
     /* alloc space for co-located MVs and types */
-    h->col_mv        = av_mallocz(h->mb_width * h->mb_height * 4 *
-                                  sizeof(cavs_vector));
+    h->col_mv        = av_mallocz_array(h->mb_width * h->mb_height,
+                                        4 * sizeof(cavs_vector));
     h->col_type_base = av_mallocz(h->mb_width * h->mb_height);
     h->block         = av_mallocz(64 * sizeof(int16_t));
+
+    if (!h->top_qp || !h->top_mv[0] || !h->top_mv[1] || !h->top_pred_Y ||
+        !h->top_border_y || !h->top_border_u || !h->top_border_v ||
+        !h->col_mv || !h->col_type_base || !h->block) {
+        av_freep(&h->top_qp);
+        av_freep(&h->top_mv[0]);
+        av_freep(&h->top_mv[1]);
+        av_freep(&h->top_pred_Y);
+        av_freep(&h->top_border_y);
+        av_freep(&h->top_border_u);
+        av_freep(&h->top_border_v);
+        av_freep(&h->col_mv);
+        av_freep(&h->col_type_base);
+        av_freep(&h->block);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
 }
 
 av_cold int ff_cavs_init(AVCodecContext *avctx)
 {
     AVSContext *h = avctx->priv_data;
 
-    ff_blockdsp_init(&h->bdsp);
+    ff_blockdsp_init(&h->bdsp, avctx);
     ff_h264chroma_init(&h->h264chroma, 8);
     ff_idctdsp_init(&h->idsp, avctx);
     ff_videodsp_init(&h->vdsp, 8);
@@ -810,16 +845,16 @@ av_cold int ff_cavs_end(AVCodecContext *avctx)
     av_frame_free(&h->DPB[0].f);
     av_frame_free(&h->DPB[1].f);
 
-    av_free(h->top_qp);
-    av_free(h->top_mv[0]);
-    av_free(h->top_mv[1]);
-    av_free(h->top_pred_Y);
-    av_free(h->top_border_y);
-    av_free(h->top_border_u);
-    av_free(h->top_border_v);
-    av_free(h->col_mv);
-    av_free(h->col_type_base);
-    av_free(h->block);
+    av_freep(&h->top_qp);
+    av_freep(&h->top_mv[0]);
+    av_freep(&h->top_mv[1]);
+    av_freep(&h->top_pred_Y);
+    av_freep(&h->top_border_y);
+    av_freep(&h->top_border_u);
+    av_freep(&h->top_border_v);
+    av_freep(&h->col_mv);
+    av_freep(&h->col_type_base);
+    av_freep(&h->block);
     av_freep(&h->edge_emu_buffer);
     return 0;
 }
diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
index cb549f1..6bfb22c 100644
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -2,31 +2,31 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_CAVS_H
 #define AVCODEC_CAVS_H
 
-#include "bitstream.h"
 #include "cavsdsp.h"
 #include "blockdsp.h"
 #include "h264chroma.h"
 #include "idctdsp.h"
+#include "get_bits.h"
 #include "videodsp.h"
 
 #define SLICE_MAX_START_CODE    0x000001af
@@ -167,7 +167,7 @@ typedef struct AVSContext {
     IDCTDSPContext idsp;
     VideoDSPContext vdsp;
     CAVSDSPContext  cdsp;
-    BitstreamContext bc;
+    GetBitContext gb;
     AVSFrame cur;     ///< currently decoded frame
     AVSFrame DPB[2];  ///< reference frames
     int dist[2];     ///< temporal distances from current frame to ref frames
@@ -216,6 +216,7 @@ typedef struct AVSContext {
     int luma_scan[4];
     int qp;
     int qp_fixed;
+    int pic_qp_fixed;
     int cbp;
     ScanTable scantable;
 
@@ -241,6 +242,7 @@ typedef struct AVSContext {
     int16_t *block;
 } AVSContext;
 
+extern const uint8_t     ff_cavs_chroma_qp[64];
 extern const uint8_t     ff_cavs_partition_flags[30];
 extern const cavs_vector ff_cavs_intra_mv;
 extern const cavs_vector ff_cavs_dir_mv;
@@ -269,8 +271,8 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
                 enum cavs_mv_pred mode, enum cavs_block size, int ref);
 void ff_cavs_init_mb(AVSContext *h);
 int  ff_cavs_next_mb(AVSContext *h);
-void ff_cavs_init_pic(AVSContext *h);
-void ff_cavs_init_top_lines(AVSContext *h);
+int ff_cavs_init_pic(AVSContext *h);
+int ff_cavs_init_top_lines(AVSContext *h);
 int ff_cavs_init(AVCodecContext *avctx);
 int ff_cavs_end (AVCodecContext *avctx);
 
diff --git a/libavcodec/cavs_parser.c b/libavcodec/cavs_parser.c
index 84f647c..6067a39 100644
--- a/libavcodec/cavs_parser.c
+++ b/libavcodec/cavs_parser.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) parser.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavsdata.c b/libavcodec/cavsdata.c
index 4e4a131..2835a4b 100644
--- a/libavcodec/cavsdata.c
+++ b/libavcodec/cavsdata.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,13 @@ const uint8_t ff_cavs_partition_flags[30] = {
                       SPLITH|SPLITV, //B_8X8 = 29
 };
 
+const uint8_t ff_cavs_chroma_qp[64] = {
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44,
+  45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51
+};
+
 /** mark block as "no prediction from this direction"
     e.g. forward motion vector in BWD partition */
 const cavs_vector ff_cavs_dir_mv   = {0,0,1,REF_DIR};
diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c
index 7f584ac..5f3b354 100644
--- a/libavcodec/cavsdec.c
+++ b/libavcodec/cavsdec.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,10 +25,11 @@
  * @author Stefan Gehrer <stefan.gehrer@gmx.de>
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
-#include "bitstream.h"
-#include "cavs.h"
+#include "get_bits.h"
 #include "golomb.h"
+#include "cavs.h"
 #include "internal.h"
 #include "mpeg12data.h"
 
@@ -50,13 +51,6 @@ static const uint8_t cbp_tab[64][2] = {
 
 static const uint8_t scan3x3[4] = { 4, 5, 7, 8 };
 
-static const uint8_t cavs_chroma_qp[64] = {
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44,
-  45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51
-};
-
 static const uint8_t dequant_shift[64] = {
   14, 14, 14, 14, 14, 14, 14, 14,
   13, 13, 13, 13, 13, 13, 13, 13,
@@ -471,7 +465,7 @@ static inline void mv_pred_direct(AVSContext *h, cavs_vector *pmv_fw,
                                   cavs_vector *col_mv)
 {
     cavs_vector *pmv_bw = pmv_fw + MV_BWD_OFFS;
-    int den = h->direct_den[col_mv->ref];
+    unsigned den = h->direct_den[col_mv->ref];
     int m = FF_SIGNBIT(col_mv->x);
 
     pmv_fw->dist = h->dist[1];
@@ -506,13 +500,17 @@ static inline void mv_pred_sym(AVSContext *h, cavs_vector *src,
  ****************************************************************************/
 
 /** kth-order exponential golomb code */
-static inline int get_ue_code(BitstreamContext *bc, int order)
+static inline int get_ue_code(GetBitContext *gb, int order)
 {
+    unsigned ret = get_ue_golomb(gb);
+    if (ret >= ((1U<<31)>>order)) {
+        av_log(NULL, AV_LOG_ERROR, "get_ue_code: value too larger\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (order) {
-        int ret = get_ue_golomb(bc) << order;
-        return ret + bitstream_read(bc, order);
+        return (ret<<order) + get_bits(gb, order);
     }
-    return get_ue_golomb(bc);
+    return ret;
 }
 
 static inline int dequant(AVSContext *h, int16_t *level_buf, uint8_t *run_buf,
@@ -545,33 +543,41 @@ static inline int dequant(AVSContext *h, int16_t *level_buf, uint8_t *run_buf,
  * @param dst location of sample block
  * @param stride line stride in frame buffer
  */
-static int decode_residual_block(AVSContext *h, BitstreamContext *bc,
+static int decode_residual_block(AVSContext *h, GetBitContext *gb,
                                  const struct dec_2dvlc *r, int esc_golomb_order,
                                  int qp, uint8_t *dst, ptrdiff_t stride)
 {
-    int i, level_code, esc_code, level, run, mask, ret;
+    int i, esc_code, level, mask, ret;
+    unsigned int level_code, run;
     int16_t level_buf[65];
     uint8_t run_buf[65];
     int16_t *block = h->block;
 
-    for (i = 0;i < 65; i++) {
-        level_code = get_ue_code(bc, r->golomb_order);
+    for (i = 0; i < 65; i++) {
+        level_code = get_ue_code(gb, r->golomb_order);
         if (level_code >= ESCAPE_CODE) {
             run      = ((level_code - ESCAPE_CODE) >> 1) + 1;
-            esc_code = get_ue_code(bc, esc_golomb_order);
+            if(run > 64) {
+                av_log(h->avctx, AV_LOG_ERROR, "run %d is too large\n", run);
+                return AVERROR_INVALIDDATA;
+            }
+            esc_code = get_ue_code(gb, esc_golomb_order);
+            if (esc_code < 0 || esc_code > 32767) {
+                av_log(h->avctx, AV_LOG_ERROR, "esc_code invalid\n");
+                return AVERROR_INVALIDDATA;
+            }
+
             level    = esc_code + (run > r->max_run ? 1 : r->level_add[run]);
             while (level > r->inc_limit)
                 r++;
             mask  = -(level_code & 1);
             level = (level ^ mask) - mask;
-        } else if (level_code >= 0) {
+        } else {
             level = r->rltab[level_code][0];
             if (!level) //end of block signal
                 break;
             run = r->rltab[level_code][1];
             r  += r->rltab[level_code][2];
-        } else {
-            break;
         }
         level_buf[i] = level;
         run_buf[i]   = run;
@@ -585,14 +591,21 @@ static int decode_residual_block(AVSContext *h, BitstreamContext *bc,
 }
 
 
-static inline void decode_residual_chroma(AVSContext *h)
+static inline int decode_residual_chroma(AVSContext *h)
 {
-    if (h->cbp & (1 << 4))
-        decode_residual_block(h, &h->bc, chroma_dec, 0,
-                              cavs_chroma_qp[h->qp], h->cu, h->c_stride);
-    if (h->cbp & (1 << 5))
-        decode_residual_block(h, &h->bc, chroma_dec, 0,
-                              cavs_chroma_qp[h->qp], h->cv, h->c_stride);
+    if (h->cbp & (1 << 4)) {
+        int ret = decode_residual_block(h, &h->gb, chroma_dec, 0,
+                              ff_cavs_chroma_qp[h->qp], h->cu, h->c_stride);
+        if (ret < 0)
+            return ret;
+    }
+    if (h->cbp & (1 << 5)) {
+        int ret = decode_residual_block(h, &h->gb, chroma_dec, 0,
+                              ff_cavs_chroma_qp[h->qp], h->cv, h->c_stride);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
 }
 
 static inline int decode_residual_inter(AVSContext *h)
@@ -600,8 +613,8 @@ static inline int decode_residual_inter(AVSContext *h)
     int block;
 
     /* get coded block pattern */
-    int cbp = get_ue_golomb(&h->bc);
-    if (cbp > 63 || cbp < 0) {
+    int cbp = get_ue_golomb(&h->gb);
+    if (cbp > 63U) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal inter cbp %d\n", cbp);
         return AVERROR_INVALIDDATA;
     }
@@ -609,10 +622,10 @@ static inline int decode_residual_inter(AVSContext *h)
 
     /* get quantizer */
     if (h->cbp && !h->qp_fixed)
-        h->qp = (h->qp + get_se_golomb(&h->bc)) & 63;
+        h->qp = (h->qp + (unsigned)get_se_golomb(&h->gb)) & 63;
     for (block = 0; block < 4; block++)
         if (h->cbp & (1 << block))
-            decode_residual_block(h, &h->bc, inter_dec, 0, h->qp,
+            decode_residual_block(h, &h->gb, inter_dec, 0, h->qp,
                                   h->cy + h->luma_scan[block], h->l_stride);
     decode_residual_chroma(h);
 
@@ -637,12 +650,13 @@ static inline void set_mv_intra(AVSContext *h)
 
 static int decode_mb_i(AVSContext *h, int cbp_code)
 {
-    BitstreamContext *bc = &h->bc;
+    GetBitContext *gb = &h->gb;
     unsigned pred_mode_uv;
     int block;
     uint8_t top[18];
     uint8_t *left = NULL;
     uint8_t *d;
+    int ret;
 
     ff_cavs_init_mb(h);
 
@@ -656,13 +670,13 @@ static int decode_mb_i(AVSContext *h, int cbp_code)
         predpred = FFMIN(nA, nB);
         if (predpred == NOT_AVAIL) // if either is not available
             predpred = INTRA_L_LP;
-        if (!bitstream_read_bit(bc)) {
-            int rem_mode = bitstream_read(bc, 2);
+        if (!get_bits1(gb)) {
+            int rem_mode = get_bits(gb, 2);
             predpred     = rem_mode + (rem_mode >= predpred);
         }
         h->pred_mode_Y[pos] = predpred;
     }
-    pred_mode_uv = get_ue_golomb(bc);
+    pred_mode_uv = get_ue_golomb(gb);
     if (pred_mode_uv > 6) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal intra chroma pred mode\n");
         return AVERROR_INVALIDDATA;
@@ -671,14 +685,14 @@ static int decode_mb_i(AVSContext *h, int cbp_code)
 
     /* get coded block pattern */
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I)
-        cbp_code = get_ue_golomb(bc);
-    if (cbp_code > 63 || cbp_code < 0) {
+        cbp_code = get_ue_golomb(gb);
+    if (cbp_code > 63U) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal intra cbp\n");
         return AVERROR_INVALIDDATA;
     }
     h->cbp = cbp_tab[cbp_code][0];
     if (h->cbp && !h->qp_fixed)
-        h->qp = (h->qp + get_se_golomb(bc)) & 63; // qp_delta
+        h->qp = (h->qp + (unsigned)get_se_golomb(gb)) & 63; //qp_delta
 
     /* luma intra prediction interleaved with residual decode/transform/add */
     for (block = 0; block < 4; block++) {
@@ -686,8 +700,11 @@ static int decode_mb_i(AVSContext *h, int cbp_code)
         ff_cavs_load_intra_pred_luma(h, top, &left, block);
         h->intra_pred_l[h->pred_mode_Y[scan3x3[block]]]
             (d, top, left, h->l_stride);
-        if (h->cbp & (1<<block))
-            decode_residual_block(h, bc, intra_dec, 1, h->qp, d, h->l_stride);
+        if (h->cbp & (1<<block)) {
+            ret = decode_residual_block(h, gb, intra_dec, 1, h->qp, d, h->l_stride);
+            if (ret < 0)
+                return ret;
+        }
     }
 
     /* chroma intra prediction */
@@ -697,7 +714,9 @@ static int decode_mb_i(AVSContext *h, int cbp_code)
     h->intra_pred_c[pred_mode_uv](h->cv, &h->top_border_v[h->mbx * 10],
                                   h->left_border_v, h->c_stride);
 
-    decode_residual_chroma(h);
+    ret = decode_residual_chroma(h);
+    if (ret < 0)
+        return ret;
     ff_cavs_filter(h, I_8X8);
     set_mv_intra(h);
     return 0;
@@ -716,7 +735,7 @@ static inline void set_intra_mode_default(AVSContext *h)
 
 static void decode_mb_p(AVSContext *h, enum cavs_mb mb_type)
 {
-    BitstreamContext *bc = &h->bc;
+    GetBitContext *gb = &h->gb;
     int ref[4];
 
     ff_cavs_init_mb(h);
@@ -725,26 +744,26 @@ static void decode_mb_p(AVSContext *h, enum cavs_mb mb_type)
         ff_cavs_mv(h, MV_FWD_X0, MV_FWD_C2, MV_PRED_PSKIP,  BLK_16X16, 0);
         break;
     case P_16X16:
-        ref[0] = h->ref_flag ? 0 : bitstream_read_bit(bc);
+        ref[0] = h->ref_flag ? 0 : get_bits1(gb);
         ff_cavs_mv(h, MV_FWD_X0, MV_FWD_C2, MV_PRED_MEDIAN, BLK_16X16, ref[0]);
         break;
     case P_16X8:
-        ref[0] = h->ref_flag ? 0 : bitstream_read_bit(bc);
-        ref[2] = h->ref_flag ? 0 : bitstream_read_bit(bc);
+        ref[0] = h->ref_flag ? 0 : get_bits1(gb);
+        ref[2] = h->ref_flag ? 0 : get_bits1(gb);
         ff_cavs_mv(h, MV_FWD_X0, MV_FWD_C2, MV_PRED_TOP,    BLK_16X8, ref[0]);
         ff_cavs_mv(h, MV_FWD_X2, MV_FWD_A1, MV_PRED_LEFT,   BLK_16X8, ref[2]);
         break;
     case P_8X16:
-        ref[0] = h->ref_flag ? 0 : bitstream_read_bit(bc);
-        ref[1] = h->ref_flag ? 0 : bitstream_read_bit(bc);
+        ref[0] = h->ref_flag ? 0 : get_bits1(gb);
+        ref[1] = h->ref_flag ? 0 : get_bits1(gb);
         ff_cavs_mv(h, MV_FWD_X0, MV_FWD_B3, MV_PRED_LEFT,     BLK_8X16, ref[0]);
         ff_cavs_mv(h, MV_FWD_X1, MV_FWD_C2, MV_PRED_TOPRIGHT, BLK_8X16, ref[1]);
         break;
     case P_8X8:
-        ref[0] = h->ref_flag ? 0 : bitstream_read_bit(bc);
-        ref[1] = h->ref_flag ? 0 : bitstream_read_bit(bc);
-        ref[2] = h->ref_flag ? 0 : bitstream_read_bit(bc);
-        ref[3] = h->ref_flag ? 0 : bitstream_read_bit(bc);
+        ref[0] = h->ref_flag ? 0 : get_bits1(gb);
+        ref[1] = h->ref_flag ? 0 : get_bits1(gb);
+        ref[2] = h->ref_flag ? 0 : get_bits1(gb);
+        ref[3] = h->ref_flag ? 0 : get_bits1(gb);
         ff_cavs_mv(h, MV_FWD_X0, MV_FWD_B3, MV_PRED_MEDIAN,   BLK_8X8, ref[0]);
         ff_cavs_mv(h, MV_FWD_X1, MV_FWD_C2, MV_PRED_MEDIAN,   BLK_8X8, ref[1]);
         ff_cavs_mv(h, MV_FWD_X2, MV_FWD_X1, MV_PRED_MEDIAN,   BLK_8X8, ref[2]);
@@ -759,7 +778,7 @@ static void decode_mb_p(AVSContext *h, enum cavs_mb mb_type)
     h->col_type_base[h->mbidx] = mb_type;
 }
 
-static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
+static int decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
 {
     int block;
     enum cavs_sub_mb sub_type[4];
@@ -796,18 +815,39 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
         ff_cavs_mv(h, MV_BWD_X0, MV_BWD_C2, MV_PRED_MEDIAN, BLK_16X16, 0);
         break;
     case B_8X8:
+#define TMP_UNUSED_INX  7
+        flags = 0;
         for (block = 0; block < 4; block++)
-            sub_type[block] = bitstream_read(&h->bc, 2);
+            sub_type[block] = get_bits(&h->gb, 2);
         for (block = 0; block < 4; block++) {
             switch (sub_type[block]) {
             case B_SUB_DIRECT:
                 if (!h->col_type_base[h->mbidx]) {
                     /* intra MB at co-location, do in-plane prediction */
-                    ff_cavs_mv(h, mv_scan[block], mv_scan[block] - 3,
-                               MV_PRED_BSKIP, BLK_8X8, 1);
-                    ff_cavs_mv(h, mv_scan[block] + MV_BWD_OFFS,
-                               mv_scan[block] - 3 + MV_BWD_OFFS,
-                               MV_PRED_BSKIP, BLK_8X8, 0);
+                    if(flags==0) {
+                        // if col-MB is a Intra MB, current Block size is 16x16.
+                        // AVS standard section 9.9.1
+                        if(block>0){
+                            h->mv[TMP_UNUSED_INX              ] = h->mv[MV_FWD_X0              ];
+                            h->mv[TMP_UNUSED_INX + MV_BWD_OFFS] = h->mv[MV_FWD_X0 + MV_BWD_OFFS];
+                        }
+                        ff_cavs_mv(h, MV_FWD_X0, MV_FWD_C2,
+                                   MV_PRED_BSKIP, BLK_8X8, 1);
+                        ff_cavs_mv(h, MV_FWD_X0+MV_BWD_OFFS,
+                                   MV_FWD_C2+MV_BWD_OFFS,
+                                   MV_PRED_BSKIP, BLK_8X8, 0);
+                        if(block>0) {
+                            flags = mv_scan[block];
+                            h->mv[flags              ] = h->mv[MV_FWD_X0              ];
+                            h->mv[flags + MV_BWD_OFFS] = h->mv[MV_FWD_X0 + MV_BWD_OFFS];
+                            h->mv[MV_FWD_X0              ] = h->mv[TMP_UNUSED_INX              ];
+                            h->mv[MV_FWD_X0 + MV_BWD_OFFS] = h->mv[TMP_UNUSED_INX + MV_BWD_OFFS];
+                        } else
+                            flags = MV_FWD_X0;
+                    } else {
+                        h->mv[mv_scan[block]              ] = h->mv[flags              ];
+                        h->mv[mv_scan[block] + MV_BWD_OFFS] = h->mv[flags + MV_BWD_OFFS];
+                    }
                 } else
                     mv_pred_direct(h, &h->mv[mv_scan[block]],
                                    &h->col_mv[h->mbidx * 4 + block]);
@@ -823,6 +863,7 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
                 break;
             }
         }
+#undef TMP_UNUSED_INX
         for (block = 0; block < 4; block++) {
             if (sub_type[block] == B_SUB_BWD)
                 ff_cavs_mv(h, mv_scan[block] + MV_BWD_OFFS,
@@ -831,7 +872,11 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
         }
         break;
     default:
-        assert((mb_type > B_SYM_16X16) && (mb_type < B_8X8));
+        if (mb_type <= B_SYM_16X16) {
+            av_log(h->avctx, AV_LOG_ERROR, "Invalid mb_type %d in B frame\n", mb_type);
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert2(mb_type < B_8X8);
         flags = ff_cavs_partition_flags[mb_type];
         if (mb_type & 1) { /* 16x8 macroblock types */
             if (flags & FWD0)
@@ -866,6 +911,8 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
     if (mb_type != B_SKIP)
         decode_residual_inter(h);
     ff_cavs_filter(h, mb_type);
+
+    return 0;
 }
 
 /*****************************************************************************
@@ -874,23 +921,29 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
  *
  ****************************************************************************/
 
-static inline int decode_slice_header(AVSContext *h, BitstreamContext *bc)
+static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
 {
     if (h->stc > 0xAF)
         av_log(h->avctx, AV_LOG_ERROR, "unexpected start code 0x%02x\n", h->stc);
+
+    if (h->stc >= h->mb_height) {
+        av_log(h->avctx, AV_LOG_ERROR, "stc 0x%02x is too large\n", h->stc);
+        return AVERROR_INVALIDDATA;
+    }
+
     h->mby   = h->stc;
     h->mbidx = h->mby * h->mb_width;
 
     /* mark top macroblocks as unavailable */
     h->flags &= ~(B_AVAIL | C_AVAIL);
-    if ((h->mby == 0) && (!h->qp_fixed)) {
-        h->qp_fixed = bitstream_read_bit(bc);
-        h->qp       = bitstream_read(bc, 6);
+    if (!h->pic_qp_fixed) {
+        h->qp_fixed = get_bits1(gb);
+        h->qp       = get_bits(gb, 6);
     }
     /* inter frame or second slice can have weighting params */
     if ((h->cur.f->pict_type != AV_PICTURE_TYPE_I) ||
         (!h->pic_structure && h->mby >= h->mb_width / 2))
-        if (bitstream_read_bit(bc)) { // slice_weighting_flag
+        if (get_bits1(gb)) { //slice_weighting_flag
             av_log(h->avctx, AV_LOG_ERROR,
                    "weighted prediction not yet supported\n");
         }
@@ -899,21 +952,21 @@ static inline int decode_slice_header(AVSContext *h, BitstreamContext *bc)
 
 static inline int check_for_slice(AVSContext *h)
 {
-    BitstreamContext *bc = &h->bc;
+    GetBitContext *gb = &h->gb;
     int align;
 
     if (h->mbx)
         return 0;
-    align = (-bitstream_tell(bc)) & 7;
+    align = (-get_bits_count(gb)) & 7;
     /* check for stuffing byte */
-    if (!align && (bitstream_peek(bc, 8) == 0x80))
+    if (!align && (show_bits(gb, 8) == 0x80))
         align = 8;
-    if ((bitstream_peek(bc, 24 + align) & 0xFFFFFF) == 0x000001) {
-        bitstream_skip(bc, 24 + align);
-        h->stc = bitstream_read(bc, 8);
+    if ((show_bits_long(gb, 24 + align) & 0xFFFFFF) == 0x000001) {
+        skip_bits_long(gb, 24 + align);
+        h->stc = get_bits(gb, 8);
         if (h->stc >= h->mb_height)
             return 0;
-        decode_slice_header(h, bc);
+        decode_slice_header(h, gb);
         return 1;
     }
     return 0;
@@ -938,9 +991,9 @@ static int decode_pic(AVSContext *h)
 
     av_frame_unref(h->cur.f);
 
-    bitstream_skip(&h->bc, 16); // bbv_dwlay
+    skip_bits(&h->gb, 16);//bbv_dwlay
     if (h->stc == PIC_PB_START_CODE) {
-        h->cur.f->pict_type = bitstream_read(&h->bc, 2) + AV_PICTURE_TYPE_I;
+        h->cur.f->pict_type = get_bits(&h->gb, 2) + AV_PICTURE_TYPE_I;
         if (h->cur.f->pict_type > AV_PICTURE_TYPE_B) {
             av_log(h->avctx, AV_LOG_ERROR, "illegal picture type\n");
             return AVERROR_INVALIDDATA;
@@ -951,17 +1004,17 @@ static int decode_pic(AVSContext *h)
             return AVERROR_INVALIDDATA;
     } else {
         h->cur.f->pict_type = AV_PICTURE_TYPE_I;
-        if (bitstream_read_bit(&h->bc))
-            bitstream_skip(&h->bc, 24); // time_code
+        if (get_bits1(&h->gb))
+            skip_bits(&h->gb, 24);//time_code
         /* old sample clips were all progressive and no low_delay,
            bump stream revision if detected otherwise */
-        if (h->low_delay || !(bitstream_peek(&h->bc, 9) & 1))
+        if (h->low_delay || !(show_bits(&h->gb, 9) & 1))
             h->stream_revision = 1;
         /* similarly test top_field_first and repeat_first_field */
-        else if (bitstream_peek(&h->bc, 11) & 3)
+        else if (show_bits(&h->gb, 11) & 3)
             h->stream_revision = 1;
         if (h->stream_revision > 0)
-            bitstream_skip(&h->bc, 1); // marker_bit
+            skip_bits(&h->gb, 1); //marker_bit
     }
 
     ret = ff_get_buffer(h->avctx, h->cur.f, h->cur.f->pict_type == AV_PICTURE_TYPE_B ?
@@ -976,98 +1029,118 @@ static int decode_pic(AVSContext *h)
             return AVERROR(ENOMEM);
     }
 
-    ff_cavs_init_pic(h);
-    h->cur.poc = bitstream_read(&h->bc, 8) * 2;
+    if ((ret = ff_cavs_init_pic(h)) < 0)
+        return ret;
+    h->cur.poc = get_bits(&h->gb, 8) * 2;
 
     /* get temporal distances and MV scaling factors */
     if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-        h->dist[0] = (h->cur.poc - h->DPB[0].poc  + 512) % 512;
+        h->dist[0] = (h->cur.poc - h->DPB[0].poc) & 511;
     } else {
-        h->dist[0] = (h->DPB[0].poc  - h->cur.poc + 512) % 512;
+        h->dist[0] = (h->DPB[0].poc  - h->cur.poc) & 511;
     }
-    h->dist[1] = (h->cur.poc - h->DPB[1].poc  + 512) % 512;
+    h->dist[1] = (h->cur.poc - h->DPB[1].poc) & 511;
     h->scale_den[0] = h->dist[0] ? 512/h->dist[0] : 0;
     h->scale_den[1] = h->dist[1] ? 512/h->dist[1] : 0;
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_B) {
         h->sym_factor = h->dist[0] * h->scale_den[1];
+        if (FFABS(h->sym_factor) > 32768) {
+            av_log(h->avctx, AV_LOG_ERROR, "sym_factor %d too large\n", h->sym_factor);
+            return AVERROR_INVALIDDATA;
+        }
     } else {
         h->direct_den[0] = h->dist[0] ? 16384 / h->dist[0] : 0;
         h->direct_den[1] = h->dist[1] ? 16384 / h->dist[1] : 0;
     }
 
     if (h->low_delay)
-        get_ue_golomb(&h->bc); // bbv_check_times
-    h->progressive   = bitstream_read_bit(&h->bc);
+        get_ue_golomb(&h->gb); //bbv_check_times
+    h->progressive   = get_bits1(&h->gb);
     h->pic_structure = 1;
     if (!h->progressive)
-        h->pic_structure = bitstream_read_bit(&h->bc);
+        h->pic_structure = get_bits1(&h->gb);
     if (!h->pic_structure && h->stc == PIC_PB_START_CODE)
-        bitstream_skip(&h->bc, 1);  // advanced_pred_mode_disable
-    bitstream_skip(&h->bc, 1);      // top_field_first
-    bitstream_skip(&h->bc, 1);      // repeat_first_field
-    h->qp_fixed = bitstream_read_bit(&h->bc);
-    h->qp       = bitstream_read(&h->bc, 6);
+        skip_bits1(&h->gb);     //advanced_pred_mode_disable
+    skip_bits1(&h->gb);        //top_field_first
+    skip_bits1(&h->gb);        //repeat_first_field
+    h->pic_qp_fixed =
+    h->qp_fixed = get_bits1(&h->gb);
+    h->qp       = get_bits(&h->gb, 6);
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
         if (!h->progressive && !h->pic_structure)
-            bitstream_skip(&h->bc, 1);  // what is this?
-        bitstream_skip(&h->bc, 4);      // reserved bits
+            skip_bits1(&h->gb);//what is this?
+        skip_bits(&h->gb, 4);   //reserved bits
     } else {
         if (!(h->cur.f->pict_type == AV_PICTURE_TYPE_B && h->pic_structure == 1))
-            h->ref_flag        = bitstream_read_bit(&h->bc);
-        bitstream_skip(&h->bc, 4);  // reserved bits
-        h->skip_mode_flag      = bitstream_read_bit(&h->bc);
+            h->ref_flag        = get_bits1(&h->gb);
+        skip_bits(&h->gb, 4);   //reserved bits
+        h->skip_mode_flag      = get_bits1(&h->gb);
     }
-    h->loop_filter_disable     = bitstream_read_bit(&h->bc);
-    if (!h->loop_filter_disable && bitstream_read_bit(&h->bc)) {
-        h->alpha_offset        = get_se_golomb(&h->bc);
-        h->beta_offset         = get_se_golomb(&h->bc);
+    h->loop_filter_disable     = get_bits1(&h->gb);
+    if (!h->loop_filter_disable && get_bits1(&h->gb)) {
+        h->alpha_offset        = get_se_golomb(&h->gb);
+        h->beta_offset         = get_se_golomb(&h->gb);
+        if (   h->alpha_offset < -64 || h->alpha_offset > 64
+            || h-> beta_offset < -64 || h-> beta_offset > 64) {
+            h->alpha_offset = h->beta_offset  = 0;
+            return AVERROR_INVALIDDATA;
+        }
     } else {
         h->alpha_offset = h->beta_offset  = 0;
     }
+
+    ret = 0;
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
         do {
             check_for_slice(h);
-            decode_mb_i(h, 0);
+            ret = decode_mb_i(h, 0);
+            if (ret < 0)
+                break;
         } while (ff_cavs_next_mb(h));
     } else if (h->cur.f->pict_type == AV_PICTURE_TYPE_P) {
         do {
             if (check_for_slice(h))
                 skip_count = -1;
             if (h->skip_mode_flag && (skip_count < 0))
-                skip_count = get_ue_golomb(&h->bc);
+                skip_count = get_ue_golomb(&h->gb);
             if (h->skip_mode_flag && skip_count--) {
                 decode_mb_p(h, P_SKIP);
             } else {
-                mb_type = get_ue_golomb(&h->bc) + P_SKIP + h->skip_mode_flag;
+                mb_type = get_ue_golomb(&h->gb) + P_SKIP + h->skip_mode_flag;
                 if (mb_type > P_8X8)
-                    decode_mb_i(h, mb_type - P_8X8 - 1);
+                    ret = decode_mb_i(h, mb_type - P_8X8 - 1);
                 else
                     decode_mb_p(h, mb_type);
             }
+            if (ret < 0)
+                break;
         } while (ff_cavs_next_mb(h));
     } else { /* AV_PICTURE_TYPE_B */
         do {
             if (check_for_slice(h))
                 skip_count = -1;
             if (h->skip_mode_flag && (skip_count < 0))
-                skip_count = get_ue_golomb(&h->bc);
+                skip_count = get_ue_golomb(&h->gb);
             if (h->skip_mode_flag && skip_count--) {
-                decode_mb_b(h, B_SKIP);
+                ret = decode_mb_b(h, B_SKIP);
             } else {
-                mb_type = get_ue_golomb(&h->bc) + B_SKIP + h->skip_mode_flag;
+                mb_type = get_ue_golomb(&h->gb) + B_SKIP + h->skip_mode_flag;
                 if (mb_type > B_8X8)
-                    decode_mb_i(h, mb_type - B_8X8 - 1);
+                    ret = decode_mb_i(h, mb_type - B_8X8 - 1);
                 else
-                    decode_mb_b(h, mb_type);
+                    ret = decode_mb_b(h, mb_type);
             }
+            if (ret < 0)
+                break;
         } while (ff_cavs_next_mb(h));
     }
-    if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
+    emms_c();
+    if (ret >= 0 && h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
         av_frame_unref(h->DPB[1].f);
         FFSWAP(AVSFrame, h->cur, h->DPB[1]);
         FFSWAP(AVSFrame, h->DPB[0], h->DPB[1]);
     }
-    return 0;
+    return ret;
 }
 
 /*****************************************************************************
@@ -1080,36 +1153,49 @@ static int decode_seq_header(AVSContext *h)
 {
     int frame_rate_code;
     int width, height;
+    int ret;
 
-    h->profile = bitstream_read(&h->bc, 8);
-    h->level   = bitstream_read(&h->bc, 8);
-    bitstream_skip(&h->bc, 1); // progressive sequence
+    h->profile = get_bits(&h->gb, 8);
+    h->level   = get_bits(&h->gb, 8);
+    skip_bits1(&h->gb); //progressive sequence
 
-    width  = bitstream_read(&h->bc, 14);
-    height = bitstream_read(&h->bc, 14);
+    width  = get_bits(&h->gb, 14);
+    height = get_bits(&h->gb, 14);
     if ((h->width || h->height) && (h->width != width || h->height != height)) {
         avpriv_report_missing_feature(h->avctx,
                                       "Width/height changing in CAVS");
         return AVERROR_PATCHWELCOME;
     }
+    if (width <= 0 || height <= 0) {
+        av_log(h->avctx, AV_LOG_ERROR, "Dimensions invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
+    skip_bits(&h->gb, 2); //chroma format
+    skip_bits(&h->gb, 3); //sample_precision
+    h->aspect_ratio = get_bits(&h->gb, 4);
+    frame_rate_code = get_bits(&h->gb, 4);
+    if (frame_rate_code == 0 || frame_rate_code > 13) {
+        av_log(h->avctx, AV_LOG_WARNING,
+               "frame_rate_code %d is invalid\n", frame_rate_code);
+        frame_rate_code = 1;
+    }
+
+    skip_bits(&h->gb, 18); //bit_rate_lower
+    skip_bits1(&h->gb);    //marker_bit
+    skip_bits(&h->gb, 12); //bit_rate_upper
+    h->low_delay =  get_bits1(&h->gb);
+
+    ret = ff_set_dimensions(h->avctx, width, height);
+    if (ret < 0)
+        return ret;
+
     h->width  = width;
     h->height = height;
-
-    bitstream_skip(&h->bc, 2); // chroma format
-    bitstream_skip(&h->bc, 3); // sample_precision
-    h->aspect_ratio = bitstream_read(&h->bc, 4);
-    frame_rate_code = bitstream_read(&h->bc, 4);
-    bitstream_skip(&h->bc, 18); // bit_rate_lower
-    bitstream_skip(&h->bc, 1);  // marker_bit
-    bitstream_skip(&h->bc, 12); // bit_rate_upper
-    h->low_delay =  bitstream_read_bit(&h->bc);
     h->mb_width  = (h->width  + 15) >> 4;
     h->mb_height = (h->height + 15) >> 4;
     h->avctx->framerate = ff_mpeg12_frame_rate_tab[frame_rate_code];
-    h->avctx->width  = h->width;
-    h->avctx->height = h->height;
     if (!h->top_qp)
-        ff_cavs_init_top_lines(h);
+        return ff_cavs_init_top_lines(h);
     return 0;
 }
 
@@ -1138,16 +1224,21 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return 0;
     }
 
+    h->stc = 0;
+
     buf_ptr = buf;
     buf_end = buf + buf_size;
     for(;;) {
         buf_ptr = avpriv_find_start_code(buf_ptr, buf_end, &stc);
-        if ((stc & 0xFFFFFE00) || buf_ptr == buf_end)
+        if ((stc & 0xFFFFFE00) || buf_ptr == buf_end) {
+            if (!h->stc)
+                av_log(h->avctx, AV_LOG_WARNING, "no frame decoded\n");
             return FFMAX(0, buf_ptr - buf);
+        }
         input_size = (buf_end - buf_ptr) * 8;
         switch (stc) {
         case CAVS_START_CODE:
-            bitstream_init(&h->bc, buf_ptr, input_size);
+            init_get_bits(&h->gb, buf_ptr, input_size);
             decode_seq_header(h);
             break;
         case PIC_I_START_CODE:
@@ -1157,17 +1248,19 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 h->got_keyframe = 1;
             }
         case PIC_PB_START_CODE:
+            if (*got_frame)
+                av_frame_unref(data);
             *got_frame = 0;
             if (!h->got_keyframe)
                 break;
-            bitstream_init(&h->bc, buf_ptr, input_size);
+            init_get_bits(&h->gb, buf_ptr, input_size);
             h->stc = stc;
             if (decode_pic(h))
                 break;
             *got_frame = 1;
             if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-                if (h->DPB[1].f->data[0]) {
-                    if ((ret = av_frame_ref(data, h->DPB[1].f)) < 0)
+                if (h->DPB[!h->low_delay].f->data[0]) {
+                    if ((ret = av_frame_ref(data, h->DPB[!h->low_delay].f)) < 0)
                         return ret;
                 } else {
                     *got_frame = 0;
@@ -1184,8 +1277,8 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             break;
         default:
             if (stc <= SLICE_MAX_START_CODE) {
-                bitstream_init(&h->bc, buf_ptr, input_size);
-                decode_slice_header(h, &h->bc);
+                init_get_bits(&h->gb, buf_ptr, input_size);
+                decode_slice_header(h, &h->gb);
             }
             break;
         }
diff --git a/libavcodec/cavsdsp.c b/libavcodec/cavsdsp.c
index a374dec..90a67e9 100644
--- a/libavcodec/cavsdsp.c
+++ b/libavcodec/cavsdsp.c
@@ -5,20 +5,20 @@
  *
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -197,7 +197,6 @@ static void cavs_idct8_add_c(uint8_t *dst, int16_t *block, ptrdiff_t stride)
 {
     int i;
     int16_t (*src)[8] = (int16_t(*)[8])block;
-    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
 
     src[0][0] += 8;
 
@@ -252,14 +251,14 @@ static void cavs_idct8_add_c(uint8_t *dst, int16_t *block, ptrdiff_t stride)
         const int b2 = a5 - a7;
         const int b3 = a4 - a6;
 
-        dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b4) >> 7)];
-        dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b1 + b5) >> 7)];
-        dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b2 + b6) >> 7)];
-        dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b3 + b7) >> 7)];
-        dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b3 - b7) >> 7)];
-        dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b2 - b6) >> 7)];
-        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b1 - b5) >> 7)];
-        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b4) >> 7)];
+        dst[i + 0*stride] = av_clip_uint8( dst[i + 0*stride] + ((b0 + b4) >> 7));
+        dst[i + 1*stride] = av_clip_uint8( dst[i + 1*stride] + ((b1 + b5) >> 7));
+        dst[i + 2*stride] = av_clip_uint8( dst[i + 2*stride] + ((b2 + b6) >> 7));
+        dst[i + 3*stride] = av_clip_uint8( dst[i + 3*stride] + ((b3 + b7) >> 7));
+        dst[i + 4*stride] = av_clip_uint8( dst[i + 4*stride] + ((b3 - b7) >> 7));
+        dst[i + 5*stride] = av_clip_uint8( dst[i + 5*stride] + ((b2 - b6) >> 7));
+        dst[i + 6*stride] = av_clip_uint8( dst[i + 6*stride] + ((b1 - b5) >> 7));
+        dst[i + 7*stride] = av_clip_uint8( dst[i + 7*stride] + ((b0 - b4) >> 7));
     }
 }
 
diff --git a/libavcodec/cavsdsp.h b/libavcodec/cavsdsp.h
index 65aa38a..9ccaa0a 100644
--- a/libavcodec/cavsdsp.h
+++ b/libavcodec/cavsdsp.h
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cbrt_data.c b/libavcodec/cbrt_data.c
new file mode 100644
index 0000000..d2e36cd
--- /dev/null
+++ b/libavcodec/cbrt_data.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "cbrt_data.h"
+
+#include "libavutil/libm.h"
+
+#if CONFIG_HARDCODED_TABLES
+#include "libavcodec/cbrt_tables.h"
+#else
+#include "cbrt_tablegen.h"
+#endif
diff --git a/libavcodec/cbrt_data.h b/libavcodec/cbrt_data.h
new file mode 100644
index 0000000..89117f8
--- /dev/null
+++ b/libavcodec/cbrt_data.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CBRT_DATA_H
+#define AVCODEC_CBRT_DATA_H
+
+#include <stdint.h>
+
+#include "config.h"
+
+#if CONFIG_HARDCODED_TABLES
+#define ff_cbrt_tableinit_fixed()
+#define ff_cbrt_tableinit()
+extern const uint32_t ff_cbrt_tab[1 << 13];
+extern const uint32_t ff_cbrt_tab_fixed[1 << 13];
+#else
+void ff_cbrt_tableinit(void);
+void ff_cbrt_tableinit_fixed(void);
+extern uint32_t ff_cbrt_tab[1 << 13];
+extern uint32_t ff_cbrt_tab_fixed[1 << 13];
+#endif
+
+#endif
diff --git a/libavcodec/cbrt_data_fixed.c b/libavcodec/cbrt_data_fixed.c
new file mode 100644
index 0000000..d661b25
--- /dev/null
+++ b/libavcodec/cbrt_data_fixed.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "cbrt_data.h"
+
+#if CONFIG_HARDCODED_TABLES
+#include "libavcodec/cbrt_fixed_tables.h"
+#else
+#define USE_FIXED 1
+#include "cbrt_tablegen.h"
+#endif
diff --git a/libavcodec/cbrt_fixed_tablegen.c b/libavcodec/cbrt_fixed_tablegen.c
new file mode 100644
index 0000000..24d2fbb
--- /dev/null
+++ b/libavcodec/cbrt_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded AAC cube-root table
+ *
+ * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "cbrt_tablegen_template.c"
diff --git a/libavcodec/cbrt_tablegen.c b/libavcodec/cbrt_tablegen.c
index e92c0f1..8c2235e 100644
--- a/libavcodec/cbrt_tablegen.c
+++ b/libavcodec/cbrt_tablegen.c
@@ -3,35 +3,22 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "cbrt_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    cbrt_tableinit();
-
-    write_fileheader();
-
-    WRITE_ARRAY("static const", uint32_t, cbrt_tab);
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "cbrt_tablegen_template.c"
diff --git a/libavcodec/cbrt_tablegen.h b/libavcodec/cbrt_tablegen.h
index 60d900a..9af18d8 100644
--- a/libavcodec/cbrt_tablegen.h
+++ b/libavcodec/cbrt_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,28 +25,49 @@
 
 #include <stdint.h>
 #include <math.h>
+#include "libavutil/attributes.h"
+#include "libavutil/intfloat.h"
+#include "libavcodec/aac_defines.h"
 
-#if CONFIG_HARDCODED_TABLES
-#define cbrt_tableinit()
-#include "libavcodec/cbrt_tables.h"
+#if USE_FIXED
+#define CBRT(x) lrint((x) * 8192)
 #else
-static uint32_t cbrt_tab[1 << 13];
+#define CBRT(x) av_float2int((float)(x))
+#endif
 
-static void cbrt_tableinit(void)
+uint32_t AAC_RENAME(ff_cbrt_tab)[1 << 13];
+
+av_cold void AAC_RENAME(ff_cbrt_tableinit)(void)
 {
-    if (!cbrt_tab[(1<<13) - 1]) {
-        int i;
-        /* cbrtf() isn't available on all systems, so we use powf(). */
-        for (i = 0; i < 1<<13; i++) {
-            union {
-                float f;
-                uint32_t i;
-            } f;
-            f.f = powf(i, 1.0 / 3.0) * i;
-            cbrt_tab[i] = f.i;
+    static double cbrt_tab_dbl[1 << 13];
+    if (!AAC_RENAME(ff_cbrt_tab)[(1<<13) - 1]) {
+        int i, j, k;
+        double cbrt_val;
+
+        for (i = 1; i < 1<<13; i++)
+            cbrt_tab_dbl[i] = 1;
+
+        /* have to take care of non-squarefree numbers */
+        for (i = 2; i < 90; i++) {
+            if (cbrt_tab_dbl[i] == 1) {
+                cbrt_val = i * cbrt(i);
+                for (k = i; k < 1<<13; k *= i)
+                    for (j = k; j < 1<<13; j += k)
+                        cbrt_tab_dbl[j] *= cbrt_val;
+            }
         }
+
+        for (i = 91; i <= 8191; i+= 2) {
+            if (cbrt_tab_dbl[i] == 1) {
+                cbrt_val = i * cbrt(i);
+                for (j = i; j < 1<<13; j += i)
+                    cbrt_tab_dbl[j] *= cbrt_val;
+            }
+        }
+
+        for (i = 0; i < 1<<13; i++)
+            AAC_RENAME(ff_cbrt_tab)[i] = CBRT(cbrt_tab_dbl[i]);
     }
 }
-#endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_CBRT_TABLEGEN_H */
diff --git a/libavcodec/cbrt_tablegen_template.c b/libavcodec/cbrt_tablegen_template.c
new file mode 100644
index 0000000..21ed2a6
--- /dev/null
+++ b/libavcodec/cbrt_tablegen_template.c
@@ -0,0 +1,42 @@
+/*
+ * Generate a header file for hardcoded AAC cube-root table
+ *
+ * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "libavutil/tablegen.h"
+#include "cbrt_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    AAC_RENAME(ff_cbrt_tableinit)();
+
+    write_fileheader();
+
+#if USE_FIXED
+    WRITE_ARRAY("const", uint32_t, ff_cbrt_tab_fixed);
+#else
+    WRITE_ARRAY("const", uint32_t, ff_cbrt_tab);
+#endif
+
+    return 0;
+}
diff --git a/libavcodec/cbs.c b/libavcodec/cbs.c
index 04ad2df..c388be8 100644
--- a/libavcodec/cbs.c
+++ b/libavcodec/cbs.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,15 +29,46 @@
 
 
 static const CodedBitstreamType *cbs_type_table[] = {
+#if CONFIG_CBS_AV1
+    &ff_cbs_type_av1,
+#endif
 #if CONFIG_CBS_H264
     &ff_cbs_type_h264,
 #endif
 #if CONFIG_CBS_H265
     &ff_cbs_type_h265,
 #endif
+#if CONFIG_CBS_JPEG
+    &ff_cbs_type_jpeg,
+#endif
 #if CONFIG_CBS_MPEG2
     &ff_cbs_type_mpeg2,
 #endif
+#if CONFIG_CBS_VP9
+    &ff_cbs_type_vp9,
+#endif
+};
+
+const enum AVCodecID ff_cbs_all_codec_ids[] = {
+#if CONFIG_CBS_AV1
+    AV_CODEC_ID_AV1,
+#endif
+#if CONFIG_CBS_H264
+    AV_CODEC_ID_H264,
+#endif
+#if CONFIG_CBS_H265
+    AV_CODEC_ID_H265,
+#endif
+#if CONFIG_CBS_JPEG
+    AV_CODEC_ID_MJPEG,
+#endif
+#if CONFIG_CBS_MPEG2
+    AV_CODEC_ID_MPEG2VIDEO,
+#endif
+#if CONFIG_CBS_VP9
+    AV_CODEC_ID_VP9,
+#endif
+    AV_CODEC_ID_NONE
 };
 
 int ff_cbs_init(CodedBitstreamContext **ctx_ptr,
@@ -105,14 +136,13 @@ static void cbs_unit_uninit(CodedBitstreamContext *ctx,
     unit->data_bit_padding = 0;
 }
 
-void ff_cbs_fragment_uninit(CodedBitstreamContext *ctx,
-                            CodedBitstreamFragment *frag)
+void ff_cbs_fragment_reset(CodedBitstreamContext *ctx,
+                           CodedBitstreamFragment *frag)
 {
     int i;
 
     for (i = 0; i < frag->nb_units; i++)
         cbs_unit_uninit(ctx, &frag->units[i]);
-    av_freep(&frag->units);
     frag->nb_units = 0;
 
     av_buffer_unref(&frag->data_ref);
@@ -121,32 +151,45 @@ void ff_cbs_fragment_uninit(CodedBitstreamContext *ctx,
     frag->data_bit_padding = 0;
 }
 
+void ff_cbs_fragment_free(CodedBitstreamContext *ctx,
+                          CodedBitstreamFragment *frag)
+{
+    ff_cbs_fragment_reset(ctx, frag);
+
+    av_freep(&frag->units);
+    frag->nb_units_allocated = 0;
+}
+
 static int cbs_read_fragment_content(CodedBitstreamContext *ctx,
                                      CodedBitstreamFragment *frag)
 {
     int err, i, j;
 
     for (i = 0; i < frag->nb_units; i++) {
+        CodedBitstreamUnit *unit = &frag->units[i];
+
         if (ctx->decompose_unit_types) {
             for (j = 0; j < ctx->nb_decompose_unit_types; j++) {
-                if (ctx->decompose_unit_types[j] == frag->units[i].type)
+                if (ctx->decompose_unit_types[j] == unit->type)
                     break;
             }
             if (j >= ctx->nb_decompose_unit_types)
                 continue;
         }
 
-        av_buffer_unref(&frag->units[i].content_ref);
-        frag->units[i].content = NULL;
+        av_buffer_unref(&unit->content_ref);
+        unit->content = NULL;
+
+        av_assert0(unit->data && unit->data_ref);
 
-        err = ctx->codec->read_unit(ctx, &frag->units[i]);
+        err = ctx->codec->read_unit(ctx, unit);
         if (err == AVERROR(ENOSYS)) {
             av_log(ctx->log_ctx, AV_LOG_VERBOSE,
                    "Decomposition unimplemented for unit %d "
-                   "(type %"PRIu32").\n", i, frag->units[i].type);
+                   "(type %"PRIu32").\n", i, unit->type);
         } else if (err < 0) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to read unit %d "
-                   "(type %"PRIu32").\n", i, frag->units[i].type);
+                   "(type %"PRIu32").\n", i, unit->type);
             return err;
         }
     }
@@ -154,27 +197,6 @@ static int cbs_read_fragment_content(CodedBitstreamContext *ctx,
     return 0;
 }
 
-int ff_cbs_read_extradata(CodedBitstreamContext *ctx,
-                          CodedBitstreamFragment *frag,
-                          const AVCodecParameters *par)
-{
-    int err;
-
-    memset(frag, 0, sizeof(*frag));
-
-    frag->data      = par->extradata;
-    frag->data_size = par->extradata_size;
-
-    err = ctx->codec->split_fragment(ctx, frag, 1);
-    if (err < 0)
-        return err;
-
-    frag->data      = NULL;
-    frag->data_size = 0;
-
-    return cbs_read_fragment_content(ctx, frag);
-}
-
 static int cbs_fill_fragment_data(CodedBitstreamContext *ctx,
                                   CodedBitstreamFragment *frag,
                                   const uint8_t *data, size_t size)
@@ -196,14 +218,30 @@ static int cbs_fill_fragment_data(CodedBitstreamContext *ctx,
     return 0;
 }
 
+int ff_cbs_read_extradata(CodedBitstreamContext *ctx,
+                          CodedBitstreamFragment *frag,
+                          const AVCodecParameters *par)
+{
+    int err;
+
+    err = cbs_fill_fragment_data(ctx, frag, par->extradata,
+                                 par->extradata_size);
+    if (err < 0)
+        return err;
+
+    err = ctx->codec->split_fragment(ctx, frag, 1);
+    if (err < 0)
+        return err;
+
+    return cbs_read_fragment_content(ctx, frag);
+}
+
 int ff_cbs_read_packet(CodedBitstreamContext *ctx,
                        CodedBitstreamFragment *frag,
                        const AVPacket *pkt)
 {
     int err;
 
-    memset(frag, 0, sizeof(*frag));
-
     if (pkt->buf) {
         frag->data_ref = av_buffer_ref(pkt->buf);
         if (!frag->data_ref)
@@ -231,8 +269,6 @@ int ff_cbs_read(CodedBitstreamContext *ctx,
 {
     int err;
 
-    memset(frag, 0, sizeof(*frag));
-
     err = cbs_fill_fragment_data(ctx, frag, data, size);
     if (err < 0)
         return err;
@@ -265,6 +301,7 @@ int ff_cbs_write_fragment_data(CodedBitstreamContext *ctx,
                    "(type %"PRIu32").\n", i, unit->type);
             return err;
         }
+        av_assert0(unit->data && unit->data_ref);
     }
 
     av_buffer_unref(&frag->data_ref);
@@ -275,6 +312,7 @@ int ff_cbs_write_fragment_data(CodedBitstreamContext *ctx,
         av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to assemble fragment.\n");
         return err;
     }
+    av_assert0(frag->data && frag->data_ref);
 
     return 0;
 }
@@ -308,17 +346,20 @@ int ff_cbs_write_packet(CodedBitstreamContext *ctx,
                         AVPacket *pkt,
                         CodedBitstreamFragment *frag)
 {
+    AVBufferRef *buf;
     int err;
 
     err = ff_cbs_write_fragment_data(ctx, frag);
     if (err < 0)
         return err;
 
-    err = av_new_packet(pkt, frag->data_size);
-    if (err < 0)
-        return err;
+    buf = av_buffer_ref(frag->data_ref);
+    if (!buf)
+        return AVERROR(ENOMEM);
 
-    memcpy(pkt->data, frag->data, frag->data_size);
+    av_init_packet(pkt);
+    pkt->buf  = buf;
+    pkt->data = frag->data;
     pkt->size = frag->data_size;
 
     return 0;
@@ -335,17 +376,43 @@ void ff_cbs_trace_header(CodedBitstreamContext *ctx,
 }
 
 void ff_cbs_trace_syntax_element(CodedBitstreamContext *ctx, int position,
-                                 const char *name, const char *bits,
-                                 int64_t value)
+                                 const char *str, const int *subscripts,
+                                 const char *bits, int64_t value)
 {
+    char name[256];
     size_t name_len, bits_len;
-    int pad;
+    int pad, subs, i, j, k, n;
 
     if (!ctx->trace_enable)
         return;
 
     av_assert0(value >= INT_MIN && value <= UINT32_MAX);
 
+    subs = subscripts ? subscripts[0] : 0;
+    n = 0;
+    for (i = j = 0; str[i];) {
+        if (str[i] == '[') {
+            if (n < subs) {
+                ++n;
+                k = snprintf(name + j, sizeof(name) - j, "[%d", subscripts[n]);
+                av_assert0(k > 0 && j + k < sizeof(name));
+                j += k;
+                for (++i; str[i] && str[i] != ']'; i++);
+                av_assert0(str[i] == ']');
+            } else {
+                while (str[i] && str[i] != ']')
+                    name[j++] = str[i++];
+                av_assert0(str[i] == ']');
+            }
+        } else {
+            av_assert0(j + 1 < sizeof(name));
+            name[j++] = str[i++];
+        }
+    }
+    av_assert0(j + 1 < sizeof(name));
+    name[j] = 0;
+    av_assert0(n == subs);
+
     name_len = strlen(name);
     bits_len = strlen(bits);
 
@@ -358,25 +425,26 @@ void ff_cbs_trace_syntax_element(CodedBitstreamContext *ctx, int position,
            position, name, pad, bits, value);
 }
 
-int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, BitstreamContext *bc,
-                         int width, const char *name, uint32_t *write_to,
+int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                         int width, const char *name,
+                         const int *subscripts, uint32_t *write_to,
                          uint32_t range_min, uint32_t range_max)
 {
     uint32_t value;
     int position;
 
-    av_assert0(width <= 32);
+    av_assert0(width > 0 && width <= 32);
 
-    if (bitstream_bits_left(bc) < width) {
+    if (get_bits_left(gbc) < width) {
         av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid value at "
                "%s: bitstream ended.\n", name);
         return AVERROR_INVALIDDATA;
     }
 
     if (ctx->trace_enable)
-        position = bitstream_tell(bc);
+        position = get_bits_count(gbc);
 
-    value = bitstream_read(bc, width);
+    value = get_bits_long(gbc, width);
 
     if (ctx->trace_enable) {
         char bits[33];
@@ -385,7 +453,8 @@ int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, BitstreamContext *bc,
             bits[i] = value >> (width - i - 1) & 1 ? '1' : '0';
         bits[i] = 0;
 
-        ff_cbs_trace_syntax_element(ctx, position, name, bits, value);
+        ff_cbs_trace_syntax_element(ctx, position, name, subscripts,
+                                    bits, value);
     }
 
     if (value < range_min || value > range_max) {
@@ -400,10 +469,11 @@ int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, BitstreamContext *bc,
 }
 
 int ff_cbs_write_unsigned(CodedBitstreamContext *ctx, PutBitContext *pbc,
-                          int width, const char *name, uint32_t value,
+                          int width, const char *name,
+                          const int *subscripts, uint32_t value,
                           uint32_t range_min, uint32_t range_max)
 {
-    av_assert0(width <= 32);
+    av_assert0(width > 0 && width <= 32);
 
     if (value < range_min || value > range_max) {
         av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
@@ -422,7 +492,8 @@ int ff_cbs_write_unsigned(CodedBitstreamContext *ctx, PutBitContext *pbc,
             bits[i] = value >> (width - i - 1) & 1 ? '1' : '0';
         bits[i] = 0;
 
-        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc), name, bits, value);
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, subscripts, bits, value);
     }
 
     if (width < 32)
@@ -479,20 +550,34 @@ static int cbs_insert_unit(CodedBitstreamContext *ctx,
 {
     CodedBitstreamUnit *units;
 
-    units = av_malloc_array(frag->nb_units + 1, sizeof(*units));
-    if (!units)
-        return AVERROR(ENOMEM);
+    if (frag->nb_units < frag->nb_units_allocated) {
+        units = frag->units;
+
+        if (position < frag->nb_units)
+            memmove(units + position + 1, units + position,
+                    (frag->nb_units - position) * sizeof(*units));
+    } else {
+        units = av_malloc_array(frag->nb_units + 1, sizeof(*units));
+        if (!units)
+            return AVERROR(ENOMEM);
 
-    if (position > 0)
-        memcpy(units, frag->units, position * sizeof(*units));
-    if (position < frag->nb_units)
-        memcpy(units + position + 1, frag->units + position,
-               (frag->nb_units - position) * sizeof(*units));
+        ++frag->nb_units_allocated;
+
+        if (position > 0)
+            memcpy(units, frag->units, position * sizeof(*units));
+
+        if (position < frag->nb_units)
+            memcpy(units + position + 1, frag->units + position,
+                   (frag->nb_units - position) * sizeof(*units));
+    }
 
     memset(units + position, 0, sizeof(*units));
 
-    av_freep(&frag->units);
-    frag->units = units;
+    if (units != frag->units) {
+        av_free(frag->units);
+        frag->units = units;
+    }
+
     ++frag->nb_units;
 
     return 0;
@@ -583,16 +668,10 @@ int ff_cbs_delete_unit(CodedBitstreamContext *ctx,
 
     --frag->nb_units;
 
-    if (frag->nb_units == 0) {
-        av_freep(&frag->units);
-
-    } else {
+    if (frag->nb_units > 0)
         memmove(frag->units + position,
                 frag->units + position + 1,
                 (frag->nb_units - position) * sizeof(*frag->units));
 
-        // Don't bother reallocating the unit array.
-    }
-
     return 0;
 }
diff --git a/libavcodec/cbs.h b/libavcodec/cbs.h
index 6505386..967dcd1 100644
--- a/libavcodec/cbs.h
+++ b/libavcodec/cbs.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,6 +48,7 @@ struct CodedBitstreamType;
  * H.264 / AVC: nal_unit_type
  * H.265 / HEVC: nal_unit_type
  * MPEG-2: start code value (without prefix)
+ * VP9: unused, set to zero (every unit is a frame)
  */
 typedef uint32_t CodedBitstreamUnitType;
 
@@ -84,8 +85,9 @@ typedef struct CodedBitstreamUnit {
      */
     size_t   data_bit_padding;
     /**
-     * If data is reference counted, a reference to the buffer containing
-     * data.  Null if data is not reference counted.
+     * A reference to the buffer containing data.
+     *
+     * Must be set if data is not NULL.
      */
     AVBufferRef *data_ref;
 
@@ -130,8 +132,9 @@ typedef struct CodedBitstreamFragment {
      */
     size_t data_bit_padding;
     /**
-     * If data is reference counted, a reference to the buffer containing
-     * data.  Null if data is not reference counted.
+     * A reference to the buffer containing data.
+     *
+     * Must be set if data is not NULL.
      */
     AVBufferRef *data_ref;
 
@@ -142,10 +145,19 @@ typedef struct CodedBitstreamFragment {
      * and has not been decomposed.
      */
     int              nb_units;
+
     /**
-     * Pointer to an array of units of length nb_units.
+     * Number of allocated units.
      *
-     * Must be NULL if nb_units is zero.
+     * Must always be >= nb_units; designed for internal use by cbs.
+     */
+     int             nb_units_allocated;
+
+    /**
+     * Pointer to an array of units of length nb_units_allocated.
+     * Only the first nb_units are valid.
+     *
+     * Must be NULL if nb_units_allocated is zero.
      */
     CodedBitstreamUnit *units;
 } CodedBitstreamFragment;
@@ -202,6 +214,14 @@ typedef struct CodedBitstreamContext {
 
 
 /**
+ * Table of all supported codec IDs.
+ *
+ * Terminated by AV_CODEC_ID_NONE.
+ */
+extern const enum AVCodecID ff_cbs_all_codec_ids[];
+
+
+/**
  * Create and initialise a new context for the given codec.
  */
 int ff_cbs_init(CodedBitstreamContext **ctx,
@@ -220,6 +240,9 @@ void ff_cbs_close(CodedBitstreamContext **ctx);
  * This also updates the internal state, so will need to be called for
  * codecs with extradata to read parameter sets necessary for further
  * parsing even if the fragment itself is not desired.
+ *
+ * The fragment must have been zeroed or reset via ff_cbs_fragment_reset
+ * before use.
  */
 int ff_cbs_read_extradata(CodedBitstreamContext *ctx,
                           CodedBitstreamFragment *frag,
@@ -232,6 +255,9 @@ int ff_cbs_read_extradata(CodedBitstreamContext *ctx,
  * This also updates the internal state of the coded bitstream context
  * with any persistent data from the fragment which may be required to
  * read following fragments (e.g. parameter sets).
+ *
+ * The fragment must have been zeroed or reset via ff_cbs_fragment_reset
+ * before use.
  */
 int ff_cbs_read_packet(CodedBitstreamContext *ctx,
                        CodedBitstreamFragment *frag,
@@ -244,6 +270,9 @@ int ff_cbs_read_packet(CodedBitstreamContext *ctx,
  * This also updates the internal state of the coded bitstream context
  * with any persistent data from the fragment which may be required to
  * read following fragments (e.g. parameter sets).
+ *
+ * The fragment must have been zeroed or reset via ff_cbs_fragment_reset
+ * before use.
  */
 int ff_cbs_read(CodedBitstreamContext *ctx,
                 CodedBitstreamFragment *frag,
@@ -283,11 +312,18 @@ int ff_cbs_write_packet(CodedBitstreamContext *ctx,
 
 
 /**
- * Free all allocated memory in a fragment.
+ * Free the units contained in a fragment as well as the fragment's
+ * own data buffer, but not the units array itself.
  */
-void ff_cbs_fragment_uninit(CodedBitstreamContext *ctx,
+void ff_cbs_fragment_reset(CodedBitstreamContext *ctx,
                             CodedBitstreamFragment *frag);
 
+/**
+ * Free the units array of a fragment in addition to what
+ * ff_cbs_fragment_reset does.
+ */
+void ff_cbs_fragment_free(CodedBitstreamContext *ctx,
+                          CodedBitstreamFragment *frag);
 
 /**
  * Allocate a new internal content buffer of the given size in the unit.
diff --git a/libavcodec/cbs_av1.c b/libavcodec/cbs_av1.c
new file mode 100644
index 0000000..02f168b
--- /dev/null
+++ b/libavcodec/cbs_av1.c
@@ -0,0 +1,1338 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/pixfmt.h"
+
+#include "cbs.h"
+#include "cbs_internal.h"
+#include "cbs_av1.h"
+#include "internal.h"
+
+
+static int cbs_av1_read_uvlc(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                             const char *name, uint32_t *write_to,
+                             uint32_t range_min, uint32_t range_max)
+{
+    uint32_t zeroes, bits_value, value;
+    int position;
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    zeroes = 0;
+    while (1) {
+        if (get_bits_left(gbc) < 1) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at "
+                   "%s: bitstream ended.\n", name);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (get_bits1(gbc))
+            break;
+        ++zeroes;
+    }
+
+    if (zeroes >= 32) {
+        value = MAX_UINT_BITS(32);
+    } else {
+        if (get_bits_left(gbc) < zeroes) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at "
+                   "%s: bitstream ended.\n", name);
+            return AVERROR_INVALIDDATA;
+        }
+
+        bits_value = get_bits_long(gbc, zeroes);
+        value = bits_value + (UINT32_C(1) << zeroes) - 1;
+    }
+
+    if (ctx->trace_enable) {
+        char bits[65];
+        int i, j, k;
+
+        if (zeroes >= 32) {
+            while (zeroes > 32) {
+                k = FFMIN(zeroes - 32, 32);
+                for (i = 0; i < k; i++)
+                    bits[i] = '0';
+                bits[i] = 0;
+                ff_cbs_trace_syntax_element(ctx, position, name,
+                                            NULL, bits, 0);
+                zeroes -= k;
+                position += k;
+            }
+        }
+
+        for (i = 0; i < zeroes; i++)
+            bits[i] = '0';
+        bits[i++] = '1';
+
+        if (zeroes < 32) {
+            for (j = 0; j < zeroes; j++)
+                bits[i++] = (bits_value >> (zeroes - j - 1) & 1) ? '1' : '0';
+        }
+
+        bits[i] = 0;
+        ff_cbs_trace_syntax_element(ctx, position, name,
+                                    NULL, bits, value);
+    }
+
+    if (value < range_min || value > range_max) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
+               "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n",
+               name, value, range_min, range_max);
+        return AVERROR_INVALIDDATA;
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_av1_write_uvlc(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                              const char *name, uint32_t value,
+                              uint32_t range_min, uint32_t range_max)
+{
+    uint32_t v;
+    int position, zeroes;
+
+    if (value < range_min || value > range_max) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
+               "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n",
+               name, value, range_min, range_max);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ctx->trace_enable)
+        position = put_bits_count(pbc);
+
+    if (value == 0) {
+        zeroes = 0;
+        put_bits(pbc, 1, 1);
+    } else {
+        zeroes = av_log2(value + 1);
+        v = value - (1 << zeroes) + 1;
+        put_bits(pbc, zeroes + 1, 1);
+        put_bits(pbc, zeroes, v);
+    }
+
+    if (ctx->trace_enable) {
+        char bits[65];
+        int i, j;
+        i = 0;
+        for (j = 0; j < zeroes; j++)
+            bits[i++] = '0';
+        bits[i++] = '1';
+        for (j = 0; j < zeroes; j++)
+            bits[i++] = (v >> (zeroes - j - 1) & 1) ? '1' : '0';
+        bits[i++] = 0;
+        ff_cbs_trace_syntax_element(ctx, position, name, NULL,
+                                    bits, value);
+    }
+
+    return 0;
+}
+
+static int cbs_av1_read_leb128(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                               const char *name, uint64_t *write_to)
+{
+    uint64_t value;
+    int position, err, i;
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    value = 0;
+    for (i = 0; i < 8; i++) {
+        int subscript[2] = { 1, i };
+        uint32_t byte;
+        err = ff_cbs_read_unsigned(ctx, gbc, 8, "leb128_byte[i]", subscript,
+                                   &byte, 0x00, 0xff);
+        if (err < 0)
+            return err;
+
+        value |= (uint64_t)(byte & 0x7f) << (i * 7);
+        if (!(byte & 0x80))
+            break;
+    }
+
+    if (ctx->trace_enable)
+        ff_cbs_trace_syntax_element(ctx, position, name, NULL, "", value);
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_av1_write_leb128(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                                const char *name, uint64_t value)
+{
+    int position, err, len, i;
+    uint8_t byte;
+
+    len = (av_log2(value) + 7) / 7;
+
+    if (ctx->trace_enable)
+        position = put_bits_count(pbc);
+
+    for (i = 0; i < len; i++) {
+        int subscript[2] = { 1, i };
+
+        byte = value >> (7 * i) & 0x7f;
+        if (i < len - 1)
+            byte |= 0x80;
+
+        err = ff_cbs_write_unsigned(ctx, pbc, 8, "leb128_byte[i]", subscript,
+                                    byte, 0x00, 0xff);
+        if (err < 0)
+            return err;
+    }
+
+    if (ctx->trace_enable)
+        ff_cbs_trace_syntax_element(ctx, position, name, NULL, "", value);
+
+    return 0;
+}
+
+static int cbs_av1_read_su(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                           int width, const char *name,
+                           const int *subscripts, int32_t *write_to)
+{
+    int position;
+    int32_t value;
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    if (get_bits_left(gbc) < width) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid signed value at "
+               "%s: bitstream ended.\n", name);
+        return AVERROR_INVALIDDATA;
+    }
+
+    value = get_sbits(gbc, width);
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < width; i++)
+            bits[i] = value & (1 << (width - i - 1)) ? '1' : '0';
+        bits[i] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, position,
+                                    name, subscripts, bits, value);
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_av1_write_su(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                            int width, const char *name,
+                            const int *subscripts, int32_t value)
+{
+    if (put_bits_left(pbc) < width)
+        return AVERROR(ENOSPC);
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < width; i++)
+            bits[i] = value & (1 << (width - i - 1)) ? '1' : '0';
+        bits[i] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, subscripts, bits, value);
+    }
+
+    put_sbits(pbc, width, value);
+
+    return 0;
+}
+
+static int cbs_av1_read_ns(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                           uint32_t n, const char *name,
+                           const int *subscripts, uint32_t *write_to)
+{
+    uint32_t w, m, v, extra_bit, value;
+    int position;
+
+    av_assert0(n > 0);
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    w = av_log2(n) + 1;
+    m = (1 << w) - n;
+
+    if (get_bits_left(gbc) < w) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid non-symmetric value at "
+               "%s: bitstream ended.\n", name);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (w - 1 > 0)
+        v = get_bits(gbc, w - 1);
+    else
+        v = 0;
+
+    if (v < m) {
+        value = v;
+    } else {
+        extra_bit = get_bits1(gbc);
+        value = (v << 1) - m + extra_bit;
+    }
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < w - 1; i++)
+            bits[i] = (v >> i & 1) ? '1' : '0';
+        if (v >= m)
+            bits[i++] = extra_bit ? '1' : '0';
+        bits[i] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, position,
+                                    name, subscripts, bits, value);
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_av1_write_ns(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                            uint32_t n, const char *name,
+                            const int *subscripts, uint32_t value)
+{
+    uint32_t w, m, v, extra_bit;
+    int position;
+
+    if (value > n) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
+               "%"PRIu32", but must be in [0,%"PRIu32"].\n",
+               name, value, n);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ctx->trace_enable)
+        position = put_bits_count(pbc);
+
+    w = av_log2(n) + 1;
+    m = (1 << w) - n;
+
+    if (put_bits_left(pbc) < w)
+        return AVERROR(ENOSPC);
+
+    if (value < m) {
+        v = value;
+        put_bits(pbc, w - 1, v);
+    } else {
+        v = m + ((value - m) >> 1);
+        extra_bit = (value - m) & 1;
+        put_bits(pbc, w - 1, v);
+        put_bits(pbc, 1, extra_bit);
+    }
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < w - 1; i++)
+            bits[i] = (v >> i & 1) ? '1' : '0';
+        if (value >= m)
+            bits[i++] = extra_bit ? '1' : '0';
+        bits[i] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, position,
+                                    name, subscripts, bits, value);
+    }
+
+    return 0;
+}
+
+static int cbs_av1_read_increment(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                                  uint32_t range_min, uint32_t range_max,
+                                  const char *name, uint32_t *write_to)
+{
+    uint32_t value;
+    int position, i;
+    char bits[33];
+
+    av_assert0(range_min <= range_max && range_max - range_min < sizeof(bits) - 1);
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    for (i = 0, value = range_min; value < range_max;) {
+        if (get_bits_left(gbc) < 1) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid increment value at "
+                   "%s: bitstream ended.\n", name);
+            return AVERROR_INVALIDDATA;
+        }
+        if (get_bits1(gbc)) {
+            bits[i++] = '1';
+            ++value;
+        } else {
+            bits[i++] = '0';
+            break;
+        }
+    }
+
+    if (ctx->trace_enable) {
+        bits[i] = 0;
+        ff_cbs_trace_syntax_element(ctx, position,
+                                    name, NULL, bits, value);
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_av1_write_increment(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                                   uint32_t range_min, uint32_t range_max,
+                                   const char *name, uint32_t value)
+{
+    int len;
+
+    av_assert0(range_min <= range_max && range_max - range_min < 32);
+    if (value < range_min || value > range_max) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
+               "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n",
+               name, value, range_min, range_max);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (value == range_max)
+        len = range_max - range_min;
+    else
+        len = value - range_min + 1;
+    if (put_bits_left(pbc) < len)
+        return AVERROR(ENOSPC);
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < len; i++) {
+            if (range_min + i == value)
+                bits[i] = '0';
+            else
+                bits[i] = '1';
+        }
+        bits[i] = 0;
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, NULL, bits, value);
+    }
+
+    if (len > 0)
+        put_bits(pbc, len, (1 << len) - 1 - (value != range_max));
+
+    return 0;
+}
+
+static int cbs_av1_read_subexp(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                               uint32_t range_max, const char *name,
+                               const int *subscripts, uint32_t *write_to)
+{
+    uint32_t value;
+    int position, err;
+    uint32_t max_len, len, range_offset, range_bits;
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    av_assert0(range_max > 0);
+    max_len = av_log2(range_max - 1) - 3;
+
+    err = cbs_av1_read_increment(ctx, gbc, 0, max_len,
+                                 "subexp_more_bits", &len);
+    if (err < 0)
+        return err;
+
+    if (len) {
+        range_bits   = 2 + len;
+        range_offset = 1 << range_bits;
+    } else {
+        range_bits   = 3;
+        range_offset = 0;
+    }
+
+    if (len < max_len) {
+        err = ff_cbs_read_unsigned(ctx, gbc, range_bits,
+                                   "subexp_bits", NULL, &value,
+                                   0, MAX_UINT_BITS(range_bits));
+        if (err < 0)
+            return err;
+
+    } else {
+        err = cbs_av1_read_ns(ctx, gbc, range_max - range_offset,
+                              "subexp_final_bits", NULL, &value);
+        if (err < 0)
+            return err;
+    }
+    value += range_offset;
+
+    if (ctx->trace_enable)
+        ff_cbs_trace_syntax_element(ctx, position,
+                                    name, subscripts, "", value);
+
+    *write_to = value;
+    return err;
+}
+
+static int cbs_av1_write_subexp(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                                uint32_t range_max, const char *name,
+                                const int *subscripts, uint32_t value)
+{
+    int position, err;
+    uint32_t max_len, len, range_offset, range_bits;
+
+    if (value > range_max) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
+               "%"PRIu32", but must be in [0,%"PRIu32"].\n",
+               name, value, range_max);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ctx->trace_enable)
+        position = put_bits_count(pbc);
+
+    av_assert0(range_max > 0);
+    max_len = av_log2(range_max - 1) - 3;
+
+    if (value < 8) {
+        range_bits   = 3;
+        range_offset = 0;
+        len = 0;
+    } else {
+        range_bits = av_log2(value);
+        len = range_bits - 2;
+        if (len > max_len) {
+            // The top bin is combined with the one below it.
+            av_assert0(len == max_len + 1);
+            --range_bits;
+            len = max_len;
+        }
+        range_offset = 1 << range_bits;
+    }
+
+    err = cbs_av1_write_increment(ctx, pbc, 0, max_len,
+                                  "subexp_more_bits", len);
+    if (err < 0)
+        return err;
+
+    if (len < max_len) {
+        err = ff_cbs_write_unsigned(ctx, pbc, range_bits,
+                                    "subexp_bits", NULL,
+                                    value - range_offset,
+                                    0, MAX_UINT_BITS(range_bits));
+        if (err < 0)
+            return err;
+
+    } else {
+        err = cbs_av1_write_ns(ctx, pbc, range_max - range_offset,
+                               "subexp_final_bits", NULL,
+                               value - range_offset);
+        if (err < 0)
+            return err;
+    }
+
+    if (ctx->trace_enable)
+        ff_cbs_trace_syntax_element(ctx, position,
+                                    name, subscripts, "", value);
+
+    return err;
+}
+
+
+static int cbs_av1_tile_log2(int blksize, int target)
+{
+    int k;
+    for (k = 0; (blksize << k) < target; k++);
+    return k;
+}
+
+static int cbs_av1_get_relative_dist(const AV1RawSequenceHeader *seq,
+                                     unsigned int a, unsigned int b)
+{
+    unsigned int diff, m;
+    if (!seq->enable_order_hint)
+        return 0;
+    diff = a - b;
+    m = 1 << seq->order_hint_bits_minus_1;
+    diff = (diff & (m - 1)) - (diff & m);
+    return diff;
+}
+
+
+#define HEADER(name) do { \
+        ff_cbs_trace_header(ctx, name); \
+    } while (0)
+
+#define CHECK(call) do { \
+        err = (call); \
+        if (err < 0) \
+            return err; \
+    } while (0)
+
+#define FUNC_NAME(rw, codec, name) cbs_ ## codec ## _ ## rw ## _ ## name
+#define FUNC_AV1(rw, name) FUNC_NAME(rw, av1, name)
+#define FUNC(name) FUNC_AV1(READWRITE, name)
+
+#define SUBSCRIPTS(subs, ...) (subs > 0 ? ((int[subs + 1]){ subs, __VA_ARGS__ }) : NULL)
+
+#define fb(width, name) \
+        xf(width, name, current->name, 0, MAX_UINT_BITS(width), 0)
+#define fc(width, name, range_min, range_max) \
+        xf(width, name, current->name, range_min, range_max, 0)
+#define flag(name) fb(1, name)
+#define su(width, name) \
+        xsu(width, name, current->name, 0)
+
+#define fbs(width, name, subs, ...) \
+        xf(width, name, current->name, 0, MAX_UINT_BITS(width), subs, __VA_ARGS__)
+#define fcs(width, name, range_min, range_max, subs, ...) \
+        xf(width, name, current->name, range_min, range_max, subs, __VA_ARGS__)
+#define flags(name, subs, ...) \
+        xf(1, name, current->name, 0, 1, subs, __VA_ARGS__)
+#define sus(width, name, subs, ...) \
+        xsu(width, name, current->name, subs, __VA_ARGS__)
+
+#define fixed(width, name, value) do { \
+        av_unused uint32_t fixed_value = value; \
+        xf(width, name, fixed_value, value, value, 0); \
+    } while (0)
+
+
+#define READ
+#define READWRITE read
+#define RWContext GetBitContext
+
+#define xf(width, name, var, range_min, range_max, subs, ...) do { \
+        uint32_t value = range_min; \
+        CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \
+                                   SUBSCRIPTS(subs, __VA_ARGS__), \
+                                   &value, range_min, range_max)); \
+        var = value; \
+    } while (0)
+
+#define xsu(width, name, var, subs, ...) do { \
+        int32_t value = 0; \
+        CHECK(cbs_av1_read_su(ctx, rw, width, #name, \
+                              SUBSCRIPTS(subs, __VA_ARGS__), &value)); \
+        var = value; \
+    } while (0)
+
+#define uvlc(name, range_min, range_max) do { \
+        uint32_t value = range_min; \
+        CHECK(cbs_av1_read_uvlc(ctx, rw, #name, \
+                                &value, range_min, range_max)); \
+        current->name = value; \
+    } while (0)
+
+#define ns(max_value, name, subs, ...) do { \
+        uint32_t value = 0; \
+        CHECK(cbs_av1_read_ns(ctx, rw, max_value, #name, \
+                              SUBSCRIPTS(subs, __VA_ARGS__), &value)); \
+        current->name = value; \
+    } while (0)
+
+#define increment(name, min, max) do { \
+        uint32_t value = 0; \
+        CHECK(cbs_av1_read_increment(ctx, rw, min, max, #name, &value)); \
+        current->name = value; \
+    } while (0)
+
+#define subexp(name, max, subs, ...) do { \
+        uint32_t value = 0; \
+        CHECK(cbs_av1_read_subexp(ctx, rw, max, #name, \
+                                  SUBSCRIPTS(subs, __VA_ARGS__), &value)); \
+        current->name = value; \
+    } while (0)
+
+#define delta_q(name) do { \
+        uint8_t delta_coded; \
+        int8_t delta_q; \
+        xf(1, name.delta_coded, delta_coded, 0, 1, 0); \
+        if (delta_coded) \
+            xsu(1 + 6, name.delta_q, delta_q, 0); \
+        else \
+            delta_q = 0; \
+        current->name = delta_q; \
+    } while (0)
+
+#define leb128(name) do { \
+        uint64_t value = 0; \
+        CHECK(cbs_av1_read_leb128(ctx, rw, #name, &value)); \
+        current->name = value; \
+    } while (0)
+
+#define infer(name, value) do { \
+        current->name = value; \
+    } while (0)
+
+#define byte_alignment(rw) (get_bits_count(rw) % 8)
+
+#include "cbs_av1_syntax_template.c"
+
+#undef READ
+#undef READWRITE
+#undef RWContext
+#undef xf
+#undef xsu
+#undef uvlc
+#undef leb128
+#undef ns
+#undef increment
+#undef subexp
+#undef delta_q
+#undef leb128
+#undef infer
+#undef byte_alignment
+
+
+#define WRITE
+#define READWRITE write
+#define RWContext PutBitContext
+
+#define xf(width, name, var, range_min, range_max, subs, ...) do { \
+        CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \
+                                    SUBSCRIPTS(subs, __VA_ARGS__), \
+                                    var, range_min, range_max)); \
+    } while (0)
+
+#define xsu(width, name, var, subs, ...) do { \
+        CHECK(cbs_av1_write_su(ctx, rw, width, #name, \
+                               SUBSCRIPTS(subs, __VA_ARGS__), var)); \
+    } while (0)
+
+#define uvlc(name, range_min, range_max) do { \
+        CHECK(cbs_av1_write_uvlc(ctx, rw, #name, current->name, \
+                                 range_min, range_max)); \
+    } while (0)
+
+#define ns(max_value, name, subs, ...) do { \
+        CHECK(cbs_av1_write_ns(ctx, rw, max_value, #name, \
+                               SUBSCRIPTS(subs, __VA_ARGS__), \
+                               current->name)); \
+    } while (0)
+
+#define increment(name, min, max) do { \
+        CHECK(cbs_av1_write_increment(ctx, rw, min, max, #name, \
+                                      current->name)); \
+    } while (0)
+
+#define subexp(name, max, subs, ...) do { \
+        CHECK(cbs_av1_write_subexp(ctx, rw, max, #name, \
+                                   SUBSCRIPTS(subs, __VA_ARGS__), \
+                                   current->name)); \
+    } while (0)
+
+#define delta_q(name) do { \
+        xf(1, name.delta_coded, current->name != 0, 0, 1, 0); \
+        if (current->name) \
+            xsu(1 + 6, name.delta_q, current->name, 0); \
+    } while (0)
+
+#define leb128(name) do { \
+        CHECK(cbs_av1_write_leb128(ctx, rw, #name, current->name)); \
+    } while (0)
+
+#define infer(name, value) do { \
+        if (current->name != (value)) { \
+            av_log(ctx->log_ctx, AV_LOG_WARNING, "Warning: " \
+                   "%s does not match inferred value: " \
+                   "%"PRId64", but should be %"PRId64".\n", \
+                   #name, (int64_t)current->name, (int64_t)(value)); \
+        } \
+    } while (0)
+
+#define byte_alignment(rw) (put_bits_count(rw) % 8)
+
+#include "cbs_av1_syntax_template.c"
+
+#undef READ
+#undef READWRITE
+#undef RWContext
+#undef xf
+#undef xsu
+#undef uvlc
+#undef leb128
+#undef ns
+#undef increment
+#undef subexp
+#undef delta_q
+#undef infer
+#undef byte_alignment
+
+
+static int cbs_av1_split_fragment(CodedBitstreamContext *ctx,
+                                  CodedBitstreamFragment *frag,
+                                  int header)
+{
+    GetBitContext gbc;
+    uint8_t *data;
+    size_t size;
+    uint64_t obu_length;
+    int pos, err, trace;
+
+    // Don't include this parsing in trace output.
+    trace = ctx->trace_enable;
+    ctx->trace_enable = 0;
+
+    data = frag->data;
+    size = frag->data_size;
+
+    if (INT_MAX / 8 < size) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid fragment: "
+               "too large (%"SIZE_SPECIFIER" bytes).\n", size);
+        err = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    while (size > 0) {
+        AV1RawOBUHeader header;
+        uint64_t obu_size;
+
+        init_get_bits(&gbc, data, 8 * size);
+
+        err = cbs_av1_read_obu_header(ctx, &gbc, &header);
+        if (err < 0)
+            goto fail;
+
+        if (get_bits_left(&gbc) < 8) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid OBU: fragment "
+                   "too short (%"SIZE_SPECIFIER" bytes).\n", size);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        if (header.obu_has_size_field) {
+            err = cbs_av1_read_leb128(ctx, &gbc, "obu_size", &obu_size);
+            if (err < 0)
+                goto fail;
+        } else
+            obu_size = size - 1 - header.obu_extension_flag;
+
+        pos = get_bits_count(&gbc);
+        av_assert0(pos % 8 == 0 && pos / 8 <= size);
+
+        obu_length = pos / 8 + obu_size;
+
+        if (size < obu_length) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid OBU length: "
+                   "%"PRIu64", but only %"SIZE_SPECIFIER" bytes remaining in fragment.\n",
+                   obu_length, size);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        err = ff_cbs_insert_unit_data(ctx, frag, -1, header.obu_type,
+                                      data, obu_length, frag->data_ref);
+        if (err < 0)
+            goto fail;
+
+        data += obu_length;
+        size -= obu_length;
+    }
+
+    err = 0;
+fail:
+    ctx->trace_enable = trace;
+    return err;
+}
+
+static void cbs_av1_free_tile_data(AV1RawTileData *td)
+{
+    av_buffer_unref(&td->data_ref);
+}
+
+static void cbs_av1_free_metadata(AV1RawMetadata *md)
+{
+    switch (md->metadata_type) {
+    case AV1_METADATA_TYPE_ITUT_T35:
+        av_buffer_unref(&md->metadata.itut_t35.payload_ref);
+        break;
+    }
+}
+
+static void cbs_av1_free_obu(void *unit, uint8_t *content)
+{
+    AV1RawOBU *obu = (AV1RawOBU*)content;
+
+    switch (obu->header.obu_type) {
+    case AV1_OBU_TILE_GROUP:
+        cbs_av1_free_tile_data(&obu->obu.tile_group.tile_data);
+        break;
+    case AV1_OBU_FRAME:
+        cbs_av1_free_tile_data(&obu->obu.frame.tile_group.tile_data);
+        break;
+    case AV1_OBU_TILE_LIST:
+        cbs_av1_free_tile_data(&obu->obu.tile_list.tile_data);
+        break;
+    case AV1_OBU_METADATA:
+        cbs_av1_free_metadata(&obu->obu.metadata);
+        break;
+    }
+
+    av_freep(&obu);
+}
+
+static int cbs_av1_ref_tile_data(CodedBitstreamContext *ctx,
+                                 CodedBitstreamUnit *unit,
+                                 GetBitContext *gbc,
+                                 AV1RawTileData *td)
+{
+    int pos;
+
+    pos = get_bits_count(gbc);
+    if (pos >= 8 * unit->data_size) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Bitstream ended before "
+               "any data in tile group (%d bits read).\n", pos);
+        return AVERROR_INVALIDDATA;
+    }
+    // Must be byte-aligned at this point.
+    av_assert0(pos % 8 == 0);
+
+    td->data_ref = av_buffer_ref(unit->data_ref);
+    if (!td->data_ref)
+        return AVERROR(ENOMEM);
+
+    td->data      = unit->data      + pos / 8;
+    td->data_size = unit->data_size - pos / 8;
+
+    return 0;
+}
+
+static int cbs_av1_read_unit(CodedBitstreamContext *ctx,
+                             CodedBitstreamUnit *unit)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    AV1RawOBU *obu;
+    GetBitContext gbc;
+    int err, start_pos, end_pos;
+
+    err = ff_cbs_alloc_unit_content(ctx, unit, sizeof(*obu),
+                                    &cbs_av1_free_obu);
+    if (err < 0)
+        return err;
+    obu = unit->content;
+
+    err = init_get_bits(&gbc, unit->data, 8 * unit->data_size);
+    if (err < 0)
+        return err;
+
+    err = cbs_av1_read_obu_header(ctx, &gbc, &obu->header);
+    if (err < 0)
+        return err;
+    av_assert0(obu->header.obu_type == unit->type);
+
+    if (obu->header.obu_has_size_field) {
+        uint64_t obu_size;
+        err = cbs_av1_read_leb128(ctx, &gbc, "obu_size", &obu_size);
+        if (err < 0)
+            return err;
+        obu->obu_size = obu_size;
+    } else {
+        if (unit->data_size < 1 + obu->header.obu_extension_flag) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid OBU length: "
+                   "unit too short (%"SIZE_SPECIFIER").\n", unit->data_size);
+            return AVERROR_INVALIDDATA;
+        }
+        obu->obu_size = unit->data_size - 1 - obu->header.obu_extension_flag;
+    }
+
+    start_pos = get_bits_count(&gbc);
+
+    if (obu->header.obu_extension_flag) {
+        priv->temporal_id = obu->header.temporal_id;
+        priv->spatial_id  = obu->header.temporal_id;
+
+        if (obu->header.obu_type != AV1_OBU_SEQUENCE_HEADER &&
+            obu->header.obu_type != AV1_OBU_TEMPORAL_DELIMITER &&
+            priv->operating_point_idc) {
+            int in_temporal_layer =
+                (priv->operating_point_idc >>  priv->temporal_id    ) & 1;
+            int in_spatial_layer  =
+                (priv->operating_point_idc >> (priv->spatial_id + 8)) & 1;
+            if (!in_temporal_layer || !in_spatial_layer) {
+                // Decoding will drop this OBU at this operating point.
+            }
+        }
+    } else {
+        priv->temporal_id = 0;
+        priv->spatial_id  = 0;
+    }
+
+    switch (obu->header.obu_type) {
+    case AV1_OBU_SEQUENCE_HEADER:
+        {
+            err = cbs_av1_read_sequence_header_obu(ctx, &gbc,
+                                                   &obu->obu.sequence_header);
+            if (err < 0)
+                return err;
+
+            av_buffer_unref(&priv->sequence_header_ref);
+            priv->sequence_header = NULL;
+
+            priv->sequence_header_ref = av_buffer_ref(unit->content_ref);
+            if (!priv->sequence_header_ref)
+                return AVERROR(ENOMEM);
+            priv->sequence_header = &obu->obu.sequence_header;
+        }
+        break;
+    case AV1_OBU_TEMPORAL_DELIMITER:
+        {
+            err = cbs_av1_read_temporal_delimiter_obu(ctx, &gbc);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_FRAME_HEADER:
+    case AV1_OBU_REDUNDANT_FRAME_HEADER:
+        {
+            err = cbs_av1_read_frame_header_obu(ctx, &gbc,
+                                                &obu->obu.frame_header,
+                                                obu->header.obu_type ==
+                                                AV1_OBU_REDUNDANT_FRAME_HEADER,
+                                                unit->data_ref);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_TILE_GROUP:
+        {
+            err = cbs_av1_read_tile_group_obu(ctx, &gbc,
+                                              &obu->obu.tile_group);
+            if (err < 0)
+                return err;
+
+            err = cbs_av1_ref_tile_data(ctx, unit, &gbc,
+                                        &obu->obu.tile_group.tile_data);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_FRAME:
+        {
+            err = cbs_av1_read_frame_obu(ctx, &gbc, &obu->obu.frame,
+                                         unit->data_ref);
+            if (err < 0)
+                return err;
+
+            err = cbs_av1_ref_tile_data(ctx, unit, &gbc,
+                                        &obu->obu.frame.tile_group.tile_data);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_TILE_LIST:
+        {
+            err = cbs_av1_read_tile_list_obu(ctx, &gbc,
+                                             &obu->obu.tile_list);
+            if (err < 0)
+                return err;
+
+            err = cbs_av1_ref_tile_data(ctx, unit, &gbc,
+                                        &obu->obu.tile_list.tile_data);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_METADATA:
+        {
+            err = cbs_av1_read_metadata_obu(ctx, &gbc, &obu->obu.metadata);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_PADDING:
+    default:
+        return AVERROR(ENOSYS);
+    }
+
+    end_pos = get_bits_count(&gbc);
+    av_assert0(end_pos <= unit->data_size * 8);
+
+    if (obu->obu_size > 0 &&
+        obu->header.obu_type != AV1_OBU_TILE_GROUP &&
+        obu->header.obu_type != AV1_OBU_FRAME) {
+        int nb_bits = obu->obu_size * 8 + start_pos - end_pos;
+
+        if (nb_bits <= 0)
+            return AVERROR_INVALIDDATA;
+
+        err = cbs_av1_read_trailing_bits(ctx, &gbc, nb_bits);
+        if (err < 0)
+            return err;
+    }
+
+    return 0;
+}
+
+static int cbs_av1_write_obu(CodedBitstreamContext *ctx,
+                             CodedBitstreamUnit *unit,
+                             PutBitContext *pbc)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    AV1RawOBU *obu = unit->content;
+    PutBitContext pbc_tmp;
+    AV1RawTileData *td;
+    size_t header_size;
+    int err, start_pos, end_pos, data_pos;
+
+    // OBUs in the normal bitstream format must contain a size field
+    // in every OBU (in annex B it is optional, but we don't support
+    // writing that).
+    obu->header.obu_has_size_field = 1;
+
+    err = cbs_av1_write_obu_header(ctx, pbc, &obu->header);
+    if (err < 0)
+        return err;
+
+    if (obu->header.obu_has_size_field) {
+        pbc_tmp = *pbc;
+        // Add space for the size field to fill later.
+        put_bits32(pbc, 0);
+        put_bits32(pbc, 0);
+    }
+
+    td = NULL;
+    start_pos = put_bits_count(pbc);
+
+    switch (obu->header.obu_type) {
+    case AV1_OBU_SEQUENCE_HEADER:
+        {
+            err = cbs_av1_write_sequence_header_obu(ctx, pbc,
+                                                    &obu->obu.sequence_header);
+            if (err < 0)
+                return err;
+
+            av_buffer_unref(&priv->sequence_header_ref);
+            priv->sequence_header = NULL;
+
+            priv->sequence_header_ref = av_buffer_ref(unit->content_ref);
+            if (!priv->sequence_header_ref)
+                return AVERROR(ENOMEM);
+            priv->sequence_header = &obu->obu.sequence_header;
+        }
+        break;
+    case AV1_OBU_TEMPORAL_DELIMITER:
+        {
+            err = cbs_av1_write_temporal_delimiter_obu(ctx, pbc);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_FRAME_HEADER:
+    case AV1_OBU_REDUNDANT_FRAME_HEADER:
+        {
+            err = cbs_av1_write_frame_header_obu(ctx, pbc,
+                                                 &obu->obu.frame_header,
+                                                 obu->header.obu_type ==
+                                                 AV1_OBU_REDUNDANT_FRAME_HEADER,
+                                                 NULL);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_TILE_GROUP:
+        {
+            err = cbs_av1_write_tile_group_obu(ctx, pbc,
+                                               &obu->obu.tile_group);
+            if (err < 0)
+                return err;
+
+            td = &obu->obu.tile_group.tile_data;
+        }
+        break;
+    case AV1_OBU_FRAME:
+        {
+            err = cbs_av1_write_frame_obu(ctx, pbc, &obu->obu.frame, NULL);
+            if (err < 0)
+                return err;
+
+            td = &obu->obu.frame.tile_group.tile_data;
+        }
+        break;
+    case AV1_OBU_TILE_LIST:
+        {
+            err = cbs_av1_write_tile_list_obu(ctx, pbc, &obu->obu.tile_list);
+            if (err < 0)
+                return err;
+
+            td = &obu->obu.tile_list.tile_data;
+        }
+        break;
+    case AV1_OBU_METADATA:
+        {
+            err = cbs_av1_write_metadata_obu(ctx, pbc, &obu->obu.metadata);
+            if (err < 0)
+                return err;
+        }
+        break;
+    case AV1_OBU_PADDING:
+    default:
+        return AVERROR(ENOSYS);
+    }
+
+    end_pos = put_bits_count(pbc);
+    header_size = (end_pos - start_pos + 7) / 8;
+    if (td) {
+        obu->obu_size = header_size + td->data_size;
+    } else if (header_size > 0) {
+        // Add trailing bits and recalculate.
+        err = cbs_av1_write_trailing_bits(ctx, pbc, 8 - end_pos % 8);
+        if (err < 0)
+            return err;
+        end_pos = put_bits_count(pbc);
+        obu->obu_size = header_size = (end_pos - start_pos + 7) / 8;
+    } else {
+        // Empty OBU.
+        obu->obu_size = 0;
+    }
+
+    end_pos = put_bits_count(pbc);
+    // Must now be byte-aligned.
+    av_assert0(end_pos % 8 == 0);
+    flush_put_bits(pbc);
+    start_pos /= 8;
+    end_pos   /= 8;
+
+    *pbc = pbc_tmp;
+    err = cbs_av1_write_leb128(ctx, pbc, "obu_size", obu->obu_size);
+    if (err < 0)
+        return err;
+
+    data_pos = put_bits_count(pbc) / 8;
+    flush_put_bits(pbc);
+    av_assert0(data_pos <= start_pos);
+
+    if (8 * obu->obu_size > put_bits_left(pbc))
+        return AVERROR(ENOSPC);
+
+    if (obu->obu_size > 0) {
+        memmove(priv->write_buffer + data_pos,
+                priv->write_buffer + start_pos, header_size);
+        skip_put_bytes(pbc, header_size);
+
+        if (td) {
+            memcpy(priv->write_buffer + data_pos + header_size,
+                   td->data, td->data_size);
+            skip_put_bytes(pbc, td->data_size);
+        }
+    }
+
+    return 0;
+}
+
+static int cbs_av1_write_unit(CodedBitstreamContext *ctx,
+                              CodedBitstreamUnit *unit)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    PutBitContext pbc;
+    int err;
+
+    if (!priv->write_buffer) {
+        // Initial write buffer size is 1MB.
+        priv->write_buffer_size = 1024 * 1024;
+
+    reallocate_and_try_again:
+        err = av_reallocp(&priv->write_buffer, priv->write_buffer_size);
+        if (err < 0) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Unable to allocate a "
+                   "sufficiently large write buffer (last attempt "
+                   "%"SIZE_SPECIFIER" bytes).\n", priv->write_buffer_size);
+            return err;
+        }
+    }
+
+    init_put_bits(&pbc, priv->write_buffer, priv->write_buffer_size);
+
+    err = cbs_av1_write_obu(ctx, unit, &pbc);
+    if (err == AVERROR(ENOSPC)) {
+        // Overflow.
+        priv->write_buffer_size *= 2;
+        goto reallocate_and_try_again;
+    }
+    if (err < 0)
+        return err;
+
+    // Overflow but we didn't notice.
+    av_assert0(put_bits_count(&pbc) <= 8 * priv->write_buffer_size);
+
+    // OBU data must be byte-aligned.
+    av_assert0(put_bits_count(&pbc) % 8 == 0);
+
+    unit->data_size = put_bits_count(&pbc) / 8;
+    flush_put_bits(&pbc);
+
+    err = ff_cbs_alloc_unit_data(ctx, unit, unit->data_size);
+    if (err < 0)
+        return err;
+
+    memcpy(unit->data, priv->write_buffer, unit->data_size);
+
+    return 0;
+}
+
+static int cbs_av1_assemble_fragment(CodedBitstreamContext *ctx,
+                                     CodedBitstreamFragment *frag)
+{
+    size_t size, pos;
+    int i;
+
+    size = 0;
+    for (i = 0; i < frag->nb_units; i++)
+        size += frag->units[i].data_size;
+
+    frag->data_ref = av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!frag->data_ref)
+        return AVERROR(ENOMEM);
+    frag->data = frag->data_ref->data;
+    memset(frag->data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+    pos = 0;
+    for (i = 0; i < frag->nb_units; i++) {
+        memcpy(frag->data + pos, frag->units[i].data,
+               frag->units[i].data_size);
+        pos += frag->units[i].data_size;
+    }
+    av_assert0(pos == size);
+    frag->data_size = size;
+
+    return 0;
+}
+
+static void cbs_av1_close(CodedBitstreamContext *ctx)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+
+    av_buffer_unref(&priv->sequence_header_ref);
+    av_buffer_unref(&priv->frame_header_ref);
+
+    av_freep(&priv->write_buffer);
+}
+
+const CodedBitstreamType ff_cbs_type_av1 = {
+    .codec_id          = AV_CODEC_ID_AV1,
+
+    .priv_data_size    = sizeof(CodedBitstreamAV1Context),
+
+    .split_fragment    = &cbs_av1_split_fragment,
+    .read_unit         = &cbs_av1_read_unit,
+    .write_unit        = &cbs_av1_write_unit,
+    .assemble_fragment = &cbs_av1_assemble_fragment,
+
+    .close             = &cbs_av1_close,
+};
diff --git a/libavcodec/cbs_av1.h b/libavcodec/cbs_av1.h
new file mode 100644
index 0000000..71ceff9
--- /dev/null
+++ b/libavcodec/cbs_av1.h
@@ -0,0 +1,432 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CBS_AV1_H
+#define AVCODEC_CBS_AV1_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "av1.h"
+#include "cbs.h"
+
+
+typedef struct AV1RawOBUHeader {
+    uint8_t obu_forbidden_bit;
+    uint8_t obu_type;
+    uint8_t obu_extension_flag;
+    uint8_t obu_has_size_field;
+    uint8_t obu_reserved_1bit;
+
+    uint8_t temporal_id;
+    uint8_t spatial_id;
+    uint8_t extension_header_reserved_3bits;
+} AV1RawOBUHeader;
+
+typedef struct AV1RawColorConfig {
+    uint8_t high_bitdepth;
+    uint8_t twelve_bit;
+    uint8_t mono_chrome;
+
+    uint8_t color_description_present_flag;
+    uint8_t color_primaries;
+    uint8_t transfer_characteristics;
+    uint8_t matrix_coefficients;
+
+    uint8_t color_range;
+    uint8_t subsampling_x;
+    uint8_t subsampling_y;
+    uint8_t chroma_sample_position;
+    uint8_t separate_uv_delta_q;
+} AV1RawColorConfig;
+
+typedef struct AV1RawTimingInfo {
+    uint32_t num_units_in_display_tick;
+    uint32_t time_scale;
+
+    uint8_t equal_picture_interval;
+    uint32_t num_ticks_per_picture_minus_1;
+} AV1RawTimingInfo;
+
+typedef struct AV1RawDecoderModelInfo {
+    uint8_t  buffer_delay_length_minus_1;
+    uint32_t num_units_in_decoding_tick;
+    uint8_t  buffer_removal_time_length_minus_1;
+    uint8_t  frame_presentation_time_length_minus_1;
+} AV1RawDecoderModelInfo;
+
+typedef struct AV1RawSequenceHeader {
+    uint8_t seq_profile;
+    uint8_t still_picture;
+    uint8_t reduced_still_picture_header;
+
+    uint8_t timing_info_present_flag;
+    uint8_t decoder_model_info_present_flag;
+    uint8_t initial_display_delay_present_flag;
+    uint8_t operating_points_cnt_minus_1;
+
+    AV1RawTimingInfo       timing_info;
+    AV1RawDecoderModelInfo decoder_model_info;
+
+    uint16_t operating_point_idc[AV1_MAX_OPERATING_POINTS];
+    uint8_t  seq_level_idx[AV1_MAX_OPERATING_POINTS];
+    uint8_t  seq_tier[AV1_MAX_OPERATING_POINTS];
+    uint8_t  decoder_model_present_for_this_op[AV1_MAX_OPERATING_POINTS];
+    uint32_t decoder_buffer_delay[AV1_MAX_OPERATING_POINTS];
+    uint32_t encoder_buffer_delay[AV1_MAX_OPERATING_POINTS];
+    uint8_t  low_delay_mode_flag[AV1_MAX_OPERATING_POINTS];
+    uint8_t  initial_display_delay_present_for_this_op[AV1_MAX_OPERATING_POINTS];
+    uint8_t  initial_display_delay_minus_1[AV1_MAX_OPERATING_POINTS];
+
+    uint8_t  frame_width_bits_minus_1;
+    uint8_t  frame_height_bits_minus_1;
+    uint16_t max_frame_width_minus_1;
+    uint16_t max_frame_height_minus_1;
+
+    uint8_t frame_id_numbers_present_flag;
+    uint8_t delta_frame_id_length_minus_2;
+    uint8_t additional_frame_id_length_minus_1;
+
+    uint8_t use_128x128_superblock;
+    uint8_t enable_filter_intra;
+    uint8_t enable_intra_edge_filter;
+    uint8_t enable_intraintra_compound;
+    uint8_t enable_masked_compound;
+    uint8_t enable_warped_motion;
+    uint8_t enable_dual_filter;
+
+    uint8_t enable_order_hint;
+    uint8_t enable_jnt_comp;
+    uint8_t enable_ref_frame_mvs;
+
+    uint8_t seq_choose_screen_content_tools;
+    uint8_t seq_force_screen_content_tools;
+    uint8_t seq_choose_integer_mv;
+    uint8_t seq_force_integer_mv;
+
+    uint8_t order_hint_bits_minus_1;
+
+    uint8_t enable_superres;
+    uint8_t enable_cdef;
+    uint8_t enable_restoration;
+
+    AV1RawColorConfig color_config;
+
+    uint8_t film_grain_params_present;
+} AV1RawSequenceHeader;
+
+typedef struct AV1RawFrameHeader {
+    uint8_t  show_existing_frame;
+    uint8_t  frame_to_show_map_idx;
+    uint32_t frame_presentation_time;
+    uint32_t display_frame_id;
+
+    uint8_t frame_type;
+    uint8_t show_frame;
+    uint8_t showable_frame;
+
+    uint8_t error_resilient_mode;
+    uint8_t disable_cdf_update;
+    uint8_t allow_screen_content_tools;
+    uint8_t force_integer_mv;
+
+    uint32_t current_frame_id;
+    uint8_t  frame_size_override_flag;
+    uint8_t  order_hint;
+
+    uint8_t  buffer_removal_time_present_flag;
+    uint32_t buffer_removal_time[AV1_MAX_OPERATING_POINTS];
+
+    uint8_t  primary_ref_frame;
+    uint16_t frame_width_minus_1;
+    uint16_t frame_height_minus_1;
+    uint8_t  use_superres;
+    uint8_t  coded_denom;
+    uint8_t  render_and_frame_size_different;
+    uint8_t  render_width_minus_1;
+    uint8_t  render_height_minus_1;
+
+    uint8_t found_ref[AV1_REFS_PER_FRAME];
+
+    uint8_t refresh_frame_flags;
+    uint8_t allow_intrabc;
+    uint8_t ref_order_hint[AV1_NUM_REF_FRAMES];
+    uint8_t frame_refs_short_signaling;
+    uint8_t last_frame_idx;
+    uint8_t golden_frame_idx;
+    int8_t  ref_frame_idx[AV1_REFS_PER_FRAME];
+    uint32_t delta_frame_id_minus1[AV1_REFS_PER_FRAME];
+
+    uint8_t allow_high_precision_mv;
+    uint8_t is_filter_switchable;
+    uint8_t interpolation_filter;
+    uint8_t is_motion_mode_switchable;
+    uint8_t use_ref_frame_mvs;
+
+    uint8_t disable_frame_end_update_cdf;
+
+    uint8_t uniform_tile_spacing_flag;
+    uint8_t tile_cols_log2;
+    uint8_t tile_rows_log2;
+    uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS];
+    uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS];
+    uint16_t context_update_tile_id;
+    uint8_t tile_size_bytes_minus1;
+
+    // These are derived values, but it's very unhelpful to have to
+    // recalculate them all the time so we store them here.
+    uint16_t tile_cols;
+    uint16_t tile_rows;
+
+    uint8_t base_q_idx;
+    int8_t  delta_q_y_dc;
+    uint8_t diff_uv_delta;
+    int8_t  delta_q_u_dc;
+    int8_t  delta_q_u_ac;
+    int8_t  delta_q_v_dc;
+    int8_t  delta_q_v_ac;
+    uint8_t using_qmatrix;
+    uint8_t qm_y;
+    uint8_t qm_u;
+    uint8_t qm_v;
+
+    uint8_t segmentation_enabled;
+    uint8_t segmentation_update_map;
+    uint8_t segmentation_temporal_update;
+    uint8_t segmentation_update_data;
+    uint8_t feature_enabled[AV1_MAX_SEGMENTS][AV1_SEG_LVL_MAX];
+    int16_t feature_value[AV1_MAX_SEGMENTS][AV1_SEG_LVL_MAX];
+
+    uint8_t delta_q_present;
+    uint8_t delta_q_res;
+    uint8_t delta_lf_present;
+    uint8_t delta_lf_res;
+    uint8_t delta_lf_multi;
+
+    uint8_t loop_filter_level[4];
+    uint8_t loop_filter_sharpness;
+    uint8_t loop_filter_delta_enabled;
+    uint8_t loop_filter_delta_update;
+    uint8_t update_ref_delta[AV1_TOTAL_REFS_PER_FRAME];
+    int8_t  loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME];
+    uint8_t update_mode_delta[2];
+    int8_t  loop_filter_mode_deltas[2];
+
+    uint8_t cdef_damping_minus_3;
+    uint8_t cdef_bits;
+    uint8_t cdef_y_pri_strength[8];
+    uint8_t cdef_y_sec_strength[8];
+    uint8_t cdef_uv_pri_strength[8];
+    uint8_t cdef_uv_sec_strength[8];
+
+    uint8_t lr_type[3];
+    uint8_t lr_unit_shift;
+    uint8_t lr_uv_shift;
+
+    uint8_t tx_mode;
+    uint8_t reference_select;
+    uint8_t skip_mode_present;
+
+    uint8_t allow_warped_motion;
+    uint8_t reduced_tx_set;
+
+    uint8_t is_global[AV1_TOTAL_REFS_PER_FRAME];
+    uint8_t is_rot_zoom[AV1_TOTAL_REFS_PER_FRAME];
+    uint8_t is_translation[AV1_TOTAL_REFS_PER_FRAME];
+    //AV1RawSubexp gm_params[AV1_TOTAL_REFS_PER_FRAME][6];
+    uint32_t gm_params[AV1_TOTAL_REFS_PER_FRAME][6];
+
+    uint8_t  apply_grain;
+    uint16_t grain_seed;
+    uint8_t  update_grain;
+    uint8_t  film_grain_params_ref_idx;
+    uint8_t  num_y_points;
+    uint8_t  point_y_value[16];
+    uint8_t  point_y_scaling[16];
+    uint8_t  chroma_scaling_from_luma;
+    uint8_t  num_cb_points;
+    uint8_t  point_cb_value[16];
+    uint8_t  point_cb_scaling[16];
+    uint8_t  num_cr_points;
+    uint8_t  point_cr_value[16];
+    uint8_t  point_cr_scaling[16];
+    uint8_t  grain_scaling_minus_8;
+    uint8_t  ar_coeff_lag;
+    uint8_t  ar_coeffs_y_plus_128[24];
+    uint8_t  ar_coeffs_cb_plus_128[24];
+    uint8_t  ar_coeffs_cr_plus_128[24];
+    uint8_t  ar_coeff_shift_minus_6;
+    uint8_t  grain_scale_shift;
+    uint8_t  cb_mult;
+    uint8_t  cb_luma_mult;
+    uint16_t cb_offset;
+    uint8_t  cr_mult;
+    uint8_t  cr_luma_mult;
+    uint16_t cr_offset;
+    uint8_t  overlap_flag;
+    uint8_t  clip_to_restricted_range;
+} AV1RawFrameHeader;
+
+typedef struct AV1RawTileData {
+    uint8_t     *data;
+    size_t       data_size;
+    AVBufferRef *data_ref;
+} AV1RawTileData;
+
+typedef struct AV1RawTileGroup {
+    uint8_t  tile_start_and_end_present_flag;
+    uint16_t tg_start;
+    uint16_t tg_end;
+
+    AV1RawTileData tile_data;
+} AV1RawTileGroup;
+
+typedef struct AV1RawFrame {
+    AV1RawFrameHeader header;
+    AV1RawTileGroup   tile_group;
+} AV1RawFrame;
+
+typedef struct AV1RawTileList {
+    uint8_t output_frame_width_in_tiles_minus_1;
+    uint8_t output_frame_height_in_tiles_minus_1;
+    uint16_t tile_count_minus_1;
+
+    AV1RawTileData tile_data;
+} AV1RawTileList;
+
+typedef struct AV1RawMetadataHDRCLL {
+    uint16_t max_cll;
+    uint16_t max_fall;
+} AV1RawMetadataHDRCLL;
+
+typedef struct AV1RawMetadataHDRMDCV {
+    uint16_t primary_chromaticity_x[3];
+    uint16_t primary_chromaticity_y[3];
+    uint16_t white_point_chromaticity_x;
+    uint16_t white_point_chromaticity_y;
+    uint32_t luminance_max;
+    uint32_t luminance_min;
+} AV1RawMetadataHDRMDCV;
+
+typedef struct AV1RawMetadataScalability {
+    uint8_t scalability_mode_idc;
+    // TODO: more stuff.
+} AV1RawMetadataScalability;
+
+typedef struct AV1RawMetadataITUTT35 {
+    uint8_t itu_t_t35_country_code;
+    uint8_t itu_t_t35_country_code_extension_byte;
+
+    uint8_t     *payload;
+    size_t       payload_size;
+    AVBufferRef *payload_ref;
+} AV1RawMetadataITUTT35;
+
+typedef struct AV1RawMetadataTimecode {
+    uint8_t  counting_type;
+    uint8_t  full_timestamp_flag;
+    uint8_t  discontinuity_flag;
+    uint8_t  cnt_dropped_flag;
+    uint16_t n_frames;
+    uint8_t  seconds_value;
+    uint8_t  minutes_value;
+    uint8_t  hours_value;
+    uint8_t  seconds_flag;
+    uint8_t  minutes_flag;
+    uint8_t  hours_flag;
+    uint8_t  time_offset_length;
+    uint32_t time_offset_value;
+} AV1RawMetadataTimecode;
+
+typedef struct AV1RawMetadata {
+    uint64_t metadata_type;
+    union {
+        AV1RawMetadataHDRCLL      hdr_cll;
+        AV1RawMetadataHDRMDCV     hdr_mdcv;
+        AV1RawMetadataScalability scalability;
+        AV1RawMetadataITUTT35     itut_t35;
+        AV1RawMetadataTimecode    timecode;
+    } metadata;
+} AV1RawMetadata;
+
+
+typedef struct AV1RawOBU {
+    AV1RawOBUHeader header;
+
+    size_t obu_size;
+
+    union {
+        AV1RawSequenceHeader sequence_header;
+        AV1RawFrameHeader    frame_header;
+        AV1RawFrame          frame;
+        AV1RawTileGroup      tile_group;
+        AV1RawTileList       tile_list;
+        AV1RawMetadata       metadata;
+    } obu;
+} AV1RawOBU;
+
+typedef struct AV1ReferenceFrameState {
+    int valid;          // RefValid
+    int frame_id;       // RefFrameId
+    int upscaled_width; // RefUpscaledWidth
+    int frame_width;    // RefFrameWidth
+    int frame_height;   // RefFrameHeight
+    int render_width;   // RefRenderWidth
+    int render_height;  // RefRenderHeight
+    int frame_type;     // RefFrameType
+    int subsampling_x;  // RefSubsamplingX
+    int subsampling_y;  // RefSubsamplingY
+    int bit_depth;      // RefBitDepth
+    int order_hint;     // RefOrderHint
+} AV1ReferenceFrameState;
+
+typedef struct CodedBitstreamAV1Context {
+    AV1RawSequenceHeader *sequence_header;
+    AVBufferRef          *sequence_header_ref;
+
+    int     seen_frame_header;
+    AVBufferRef *frame_header_ref;
+    uint8_t     *frame_header;
+    size_t       frame_header_size;
+
+    int temporal_id;
+    int spatial_id;
+    int operating_point_idc;
+
+    int bit_depth;
+    int frame_width;
+    int frame_height;
+    int upscaled_width;
+    int render_width;
+    int render_height;
+
+    int num_planes;
+    int coded_lossless;
+    int all_lossless;
+    int tile_cols;
+    int tile_rows;
+
+    AV1ReferenceFrameState ref[AV1_NUM_REF_FRAMES];
+
+    // Write buffer.
+    uint8_t *write_buffer;
+    size_t   write_buffer_size;
+} CodedBitstreamAV1Context;
+
+
+#endif /* AVCODEC_CBS_AV1_H */
diff --git a/libavcodec/cbs_av1_syntax_template.c b/libavcodec/cbs_av1_syntax_template.c
new file mode 100644
index 0000000..48f4fab
--- /dev/null
+++ b/libavcodec/cbs_av1_syntax_template.c
@@ -0,0 +1,1762 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static int FUNC(obu_header)(CodedBitstreamContext *ctx, RWContext *rw,
+                            AV1RawOBUHeader *current)
+{
+    int err;
+    av_unused int zero = 0;
+
+    HEADER("OBU header");
+
+    fc(1, obu_forbidden_bit, 0, 0);
+
+    fc(4, obu_type, 0, AV1_OBU_PADDING);
+    flag(obu_extension_flag);
+    flag(obu_has_size_field);
+
+    fc(1, obu_reserved_1bit, 0, 0);
+
+    if (current->obu_extension_flag) {
+        fb(3, temporal_id);
+        fb(2, spatial_id);
+        fc(3, extension_header_reserved_3bits, 0, 0);
+    }
+
+    return 0;
+}
+
+static int FUNC(trailing_bits)(CodedBitstreamContext *ctx, RWContext *rw, int nb_bits)
+{
+    int err;
+
+    av_assert0(nb_bits > 0);
+
+    fixed(1, trailing_one_bit, 1);
+    --nb_bits;
+
+    while (nb_bits > 0) {
+        fixed(1, trailing_zero_bit, 0);
+        --nb_bits;
+    }
+
+    return 0;
+}
+
+static int FUNC(byte_alignment)(CodedBitstreamContext *ctx, RWContext *rw)
+{
+    int err;
+
+    while (byte_alignment(rw) != 0)
+        fixed(1, zero_bit, 0);
+
+    return 0;
+}
+
+static int FUNC(color_config)(CodedBitstreamContext *ctx, RWContext *rw,
+                              AV1RawColorConfig *current, int seq_profile)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int err;
+
+    flag(high_bitdepth);
+
+    if (seq_profile == FF_PROFILE_AV1_PROFESSIONAL &&
+        current->high_bitdepth) {
+        flag(twelve_bit);
+        priv->bit_depth = current->twelve_bit ? 12 : 10;
+    } else {
+        priv->bit_depth = current->high_bitdepth ? 10 : 8;
+    }
+
+    if (seq_profile == FF_PROFILE_AV1_HIGH)
+        infer(mono_chrome, 0);
+    else
+        flag(mono_chrome);
+    priv->num_planes = current->mono_chrome ? 1 : 3;
+
+    flag(color_description_present_flag);
+    if (current->color_description_present_flag) {
+        fb(8, color_primaries);
+        fb(8, transfer_characteristics);
+        fb(8, matrix_coefficients);
+    } else {
+        infer(color_primaries,          AVCOL_PRI_UNSPECIFIED);
+        infer(transfer_characteristics, AVCOL_TRC_UNSPECIFIED);
+        infer(matrix_coefficients,      AVCOL_SPC_UNSPECIFIED);
+    }
+
+    if (current->mono_chrome) {
+        flag(color_range);
+
+        infer(subsampling_x, 1);
+        infer(subsampling_y, 1);
+        infer(chroma_sample_position, AV1_CSP_UNKNOWN);
+        infer(separate_uv_delta_q, 0);
+
+    } else if (current->color_primaries          == AVCOL_PRI_BT709 &&
+               current->transfer_characteristics == AVCOL_TRC_IEC61966_2_1 &&
+               current->matrix_coefficients      == AVCOL_SPC_RGB) {
+        infer(color_range,   1);
+        infer(subsampling_x, 0);
+        infer(subsampling_y, 0);
+        flag(separate_uv_delta_q);
+
+    } else {
+        flag(color_range);
+
+        if (seq_profile == FF_PROFILE_AV1_MAIN) {
+            infer(subsampling_x, 1);
+            infer(subsampling_y, 1);
+        } else if (seq_profile == FF_PROFILE_AV1_HIGH) {
+            infer(subsampling_x, 0);
+            infer(subsampling_y, 0);
+        } else {
+            if (priv->bit_depth == 12) {
+                fb(1, subsampling_x);
+                if (current->subsampling_x)
+                    fb(1, subsampling_y);
+                else
+                    infer(subsampling_y, 0);
+            } else {
+                infer(subsampling_x, 1);
+                infer(subsampling_y, 0);
+            }
+        }
+        if (current->subsampling_x && current->subsampling_y) {
+            fc(2, chroma_sample_position, AV1_CSP_UNKNOWN,
+                                          AV1_CSP_COLOCATED);
+        }
+
+        flag(separate_uv_delta_q);
+    }
+
+    return 0;
+}
+
+static int FUNC(timing_info)(CodedBitstreamContext *ctx, RWContext *rw,
+                             AV1RawTimingInfo *current)
+{
+    int err;
+
+    fc(32, num_units_in_display_tick, 1, MAX_UINT_BITS(32));
+    fc(32, time_scale,                1, MAX_UINT_BITS(32));
+
+    flag(equal_picture_interval);
+    if (current->equal_picture_interval)
+        uvlc(num_ticks_per_picture_minus_1, 0, MAX_UINT_BITS(32) - 1);
+
+    return 0;
+}
+
+static int FUNC(decoder_model_info)(CodedBitstreamContext *ctx, RWContext *rw,
+                                    AV1RawDecoderModelInfo *current)
+{
+    int err;
+
+    fb(5, buffer_delay_length_minus_1);
+    fb(32, num_units_in_decoding_tick);
+    fb(5,  buffer_removal_time_length_minus_1);
+    fb(5,  frame_presentation_time_length_minus_1);
+
+    return 0;
+}
+
+static int FUNC(sequence_header_obu)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     AV1RawSequenceHeader *current)
+{
+    int i, err;
+
+    HEADER("Sequence Header");
+
+    fc(3, seq_profile, FF_PROFILE_AV1_MAIN,
+                       FF_PROFILE_AV1_PROFESSIONAL);
+    flag(still_picture);
+    flag(reduced_still_picture_header);
+
+    if (current->reduced_still_picture_header) {
+        infer(timing_info_present_flag,           0);
+        infer(decoder_model_info_present_flag,    0);
+        infer(initial_display_delay_present_flag, 0);
+        infer(operating_points_cnt_minus_1,       0);
+        infer(operating_point_idc[0],             0);
+
+        fb(5, seq_level_idx[0]);
+
+        infer(seq_tier[0], 0);
+        infer(decoder_model_present_for_this_op[0],         0);
+        infer(initial_display_delay_present_for_this_op[0], 0);
+
+    } else {
+        flag(timing_info_present_flag);
+        if (current->timing_info_present_flag) {
+            CHECK(FUNC(timing_info)(ctx, rw, &current->timing_info));
+
+            flag(decoder_model_info_present_flag);
+            if (current->decoder_model_info_present_flag) {
+                CHECK(FUNC(decoder_model_info)
+                          (ctx, rw, &current->decoder_model_info));
+            }
+        } else {
+            infer(decoder_model_info_present_flag, 0);
+        }
+
+        flag(initial_display_delay_present_flag);
+
+        fb(5, operating_points_cnt_minus_1);
+        for (i = 0; i <= current->operating_points_cnt_minus_1; i++) {
+            fbs(12, operating_point_idc[i], 1, i);
+            fbs(5,  seq_level_idx[i], 1, i);
+
+            if (current->seq_level_idx[i] > 7)
+                flags(seq_tier[i], 1, i);
+            else
+                infer(seq_tier[i], 0);
+
+            if (current->decoder_model_info_present_flag) {
+                flags(decoder_model_present_for_this_op[i], 1, i);
+                if (current->decoder_model_present_for_this_op[i]) {
+                    int n = current->decoder_model_info.buffer_delay_length_minus_1 + 1;
+                    fbs(n, decoder_buffer_delay[i], 1, i);
+                    fbs(n, encoder_buffer_delay[i], 1, i);
+                    flags(low_delay_mode_flag[i], 1, i);
+                }
+            } else {
+                infer(decoder_model_present_for_this_op[i], 0);
+            }
+
+            if (current->initial_display_delay_present_flag) {
+                flags(initial_display_delay_present_for_this_op[i], 1, i);
+                if (current->initial_display_delay_present_for_this_op[i])
+                    fbs(4, initial_display_delay_minus_1[i], 1, i);
+            }
+        }
+    }
+
+    fb(4, frame_width_bits_minus_1);
+    fb(4, frame_height_bits_minus_1);
+
+    fb(current->frame_width_bits_minus_1  + 1, max_frame_width_minus_1);
+    fb(current->frame_height_bits_minus_1 + 1, max_frame_height_minus_1);
+
+    if (current->reduced_still_picture_header)
+        infer(frame_id_numbers_present_flag, 0);
+    else
+        flag(frame_id_numbers_present_flag);
+    if (current->frame_id_numbers_present_flag) {
+        fb(4, delta_frame_id_length_minus_2);
+        fb(3, additional_frame_id_length_minus_1);
+    }
+
+    flag(use_128x128_superblock);
+    flag(enable_filter_intra);
+    flag(enable_intra_edge_filter);
+
+    if (current->reduced_still_picture_header) {
+        infer(enable_intraintra_compound, 0);
+        infer(enable_masked_compound,     0);
+        infer(enable_warped_motion,       0);
+        infer(enable_dual_filter,         0);
+        infer(enable_order_hint,          0);
+        infer(enable_jnt_comp,            0);
+        infer(enable_ref_frame_mvs,       0);
+
+        infer(seq_force_screen_content_tools,
+              AV1_SELECT_SCREEN_CONTENT_TOOLS);
+        infer(seq_force_integer_mv,
+              AV1_SELECT_INTEGER_MV);
+    } else {
+        flag(enable_intraintra_compound);
+        flag(enable_masked_compound);
+        flag(enable_warped_motion);
+        flag(enable_dual_filter);
+
+        flag(enable_order_hint);
+        if (current->enable_order_hint) {
+            flag(enable_jnt_comp);
+            flag(enable_ref_frame_mvs);
+        } else {
+            infer(enable_jnt_comp,      0);
+            infer(enable_ref_frame_mvs, 0);
+        }
+
+        flag(seq_choose_screen_content_tools);
+        if (current->seq_choose_screen_content_tools)
+            infer(seq_force_screen_content_tools,
+                  AV1_SELECT_SCREEN_CONTENT_TOOLS);
+        else
+            fb(1, seq_force_screen_content_tools);
+        if (current->seq_force_screen_content_tools > 0) {
+            flag(seq_choose_integer_mv);
+            if (current->seq_choose_integer_mv)
+                infer(seq_force_integer_mv,
+                      AV1_SELECT_INTEGER_MV);
+            else
+                fb(1, seq_force_integer_mv);
+        } else {
+            infer(seq_force_integer_mv, AV1_SELECT_INTEGER_MV);
+        }
+
+        if (current->enable_order_hint)
+            fb(3, order_hint_bits_minus_1);
+    }
+
+    flag(enable_superres);
+    flag(enable_cdef);
+    flag(enable_restoration);
+
+    CHECK(FUNC(color_config)(ctx, rw, &current->color_config,
+                             current->seq_profile));
+
+    flag(film_grain_params_present);
+
+    return 0;
+}
+
+static int FUNC(temporal_delimiter_obu)(CodedBitstreamContext *ctx, RWContext *rw)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+
+    HEADER("Temporal Delimiter");
+
+    priv->seen_frame_header = 0;
+
+    return 0;
+}
+
+static int FUNC(superres_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                 AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int denom, err;
+
+    if (seq->enable_superres)
+        flag(use_superres);
+    else
+        infer(use_superres, 0);
+
+    if (current->use_superres) {
+        fb(3, coded_denom);
+        denom = current->coded_denom + AV1_SUPERRES_DENOM_MIN;
+    } else {
+        denom = AV1_SUPERRES_NUM;
+    }
+
+    priv->upscaled_width = priv->frame_width;
+    priv->frame_width = (priv->upscaled_width * AV1_SUPERRES_NUM +
+                         denom / 2) / denom;
+
+    return 0;
+}
+
+static int FUNC(frame_size)(CodedBitstreamContext *ctx, RWContext *rw,
+                            AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int err;
+
+    if (current->frame_size_override_flag) {
+        fb(seq->frame_width_bits_minus_1 + 1,  frame_width_minus_1);
+        fb(seq->frame_height_bits_minus_1 + 1, frame_height_minus_1);
+
+        priv->frame_width  = current->frame_width_minus_1  + 1;
+        priv->frame_height = current->frame_height_minus_1 + 1;
+    } else {
+        priv->frame_width  = seq->max_frame_width_minus_1  + 1;
+        priv->frame_height = seq->max_frame_height_minus_1 + 1;
+    }
+
+    CHECK(FUNC(superres_params)(ctx, rw, current));
+
+    return 0;
+}
+
+static int FUNC(render_size)(CodedBitstreamContext *ctx, RWContext *rw,
+                             AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int err;
+
+    flag(render_and_frame_size_different);
+
+    if (current->render_and_frame_size_different) {
+        fb(16, render_width_minus_1);
+        fb(16, render_height_minus_1);
+
+        priv->render_width  = current->render_width_minus_1  + 1;
+        priv->render_height = current->render_height_minus_1 + 1;
+    } else {
+        priv->render_width  = priv->upscaled_width;
+        priv->render_height = priv->frame_height;
+    }
+
+    return 0;
+}
+
+static int FUNC(frame_size_with_refs)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int i, err;
+
+    for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+        flags(found_ref[i], 1, i);
+        if (current->found_ref[i]) {
+            AV1ReferenceFrameState *ref =
+                &priv->ref[current->ref_frame_idx[i]];
+
+            if (!ref->valid) {
+                av_log(ctx->log_ctx, AV_LOG_ERROR,
+                       "Missing reference frame needed for frame size "
+                       "(ref = %d, ref_frame_idx = %d).\n",
+                       i, current->ref_frame_idx[i]);
+                return AVERROR_INVALIDDATA;
+            }
+
+            priv->upscaled_width = ref->upscaled_width;
+            priv->frame_width    = ref->frame_width;
+            priv->frame_height   = ref->frame_height;
+            priv->render_width   = ref->render_width;
+            priv->render_height  = ref->render_height;
+            break;
+        }
+    }
+
+    if (i >= AV1_REFS_PER_FRAME) {
+        CHECK(FUNC(frame_size)(ctx, rw, current));
+        CHECK(FUNC(render_size)(ctx, rw, current));
+    } else {
+        CHECK(FUNC(superres_params)(ctx, rw, current));
+    }
+
+    return 0;
+}
+
+static int FUNC(interpolation_filter)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      AV1RawFrameHeader *current)
+{
+    int err;
+
+    flag(is_filter_switchable);
+    if (current->is_filter_switchable)
+        infer(interpolation_filter,
+              AV1_INTERPOLATION_FILTER_SWITCHABLE);
+    else
+        fb(2, interpolation_filter);
+
+    return 0;
+}
+
+static int FUNC(tile_info)(CodedBitstreamContext *ctx, RWContext *rw,
+                           AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int mi_cols, mi_rows, sb_cols, sb_rows, sb_shift, sb_size;
+    int max_tile_width_sb, max_tile_height_sb, max_tile_area_sb;
+    int min_log2_tile_cols, max_log2_tile_cols, max_log2_tile_rows;
+    int min_log2_tiles, min_log2_tile_rows;
+    int i, err;
+
+    mi_cols = 2 * ((priv->frame_width  + 7) >> 3);
+    mi_rows = 2 * ((priv->frame_height + 7) >> 3);
+
+    sb_cols = seq->use_128x128_superblock ? ((mi_cols + 31) >> 5)
+                                          : ((mi_cols + 15) >> 4);
+    sb_rows = seq->use_128x128_superblock ? ((mi_rows + 31) >> 5)
+                                          : ((mi_rows + 15) >> 4);
+
+    sb_shift = seq->use_128x128_superblock ? 5 : 4;
+    sb_size  = sb_shift + 2;
+
+    max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size;
+    max_tile_area_sb  = AV1_MAX_TILE_AREA  >> (2 * sb_size);
+
+    min_log2_tile_cols = cbs_av1_tile_log2(max_tile_width_sb, sb_cols);
+    max_log2_tile_cols = cbs_av1_tile_log2(1, FFMIN(sb_cols, AV1_MAX_TILE_COLS));
+    max_log2_tile_rows = cbs_av1_tile_log2(1, FFMIN(sb_rows, AV1_MAX_TILE_ROWS));
+    min_log2_tiles = FFMAX(min_log2_tile_cols,
+                           cbs_av1_tile_log2(max_tile_area_sb, sb_rows * sb_cols));
+
+    flag(uniform_tile_spacing_flag);
+
+    if (current->uniform_tile_spacing_flag) {
+        int tile_width_sb, tile_height_sb;
+
+        increment(tile_cols_log2, min_log2_tile_cols, max_log2_tile_cols);
+
+        tile_width_sb = (sb_cols + (1 << current->tile_cols_log2) - 1) >>
+            current->tile_cols_log2;
+        current->tile_cols = (sb_cols + tile_width_sb - 1) / tile_width_sb;
+
+        min_log2_tile_rows = FFMAX(min_log2_tiles - current->tile_cols_log2, 0);
+
+        increment(tile_rows_log2, min_log2_tile_rows, max_log2_tile_rows);
+
+        tile_height_sb = (sb_rows + (1 << current->tile_rows_log2) - 1) >>
+            current->tile_rows_log2;
+        current->tile_rows = (sb_rows + tile_height_sb - 1) / tile_height_sb;
+
+    } else {
+        int widest_tile_sb, start_sb, size_sb, max_width, max_height;
+
+        widest_tile_sb = 0;
+
+        start_sb = 0;
+        for (i = 0; start_sb < sb_cols && i < AV1_MAX_TILE_COLS; i++) {
+            max_width = FFMIN(sb_cols - start_sb, max_tile_width_sb);
+            ns(max_width, width_in_sbs_minus_1[i], 1, i);
+            size_sb = current->width_in_sbs_minus_1[i] + 1;
+            widest_tile_sb = FFMAX(size_sb, widest_tile_sb);
+            start_sb += size_sb;
+        }
+        current->tile_cols_log2 = cbs_av1_tile_log2(1, i);
+        current->tile_cols = i;
+
+        if (min_log2_tiles > 0)
+            max_tile_area_sb = (sb_rows * sb_cols) >> (min_log2_tiles + 1);
+        else
+            max_tile_area_sb = sb_rows * sb_cols;
+        max_tile_height_sb = FFMAX(max_tile_area_sb / widest_tile_sb, 1);
+
+        start_sb = 0;
+        for (i = 0; start_sb < sb_rows && i < AV1_MAX_TILE_ROWS; i++) {
+            max_height = FFMIN(sb_rows - start_sb, max_tile_height_sb);
+            ns(max_height, height_in_sbs_minus_1[i], 1, i);
+            size_sb = current->height_in_sbs_minus_1[i] + 1;
+            start_sb += size_sb;
+        }
+        current->tile_rows_log2 = cbs_av1_tile_log2(1, i);
+        current->tile_rows = i;
+    }
+
+    if (current->tile_cols_log2 > 0 ||
+        current->tile_rows_log2 > 0) {
+        fb(current->tile_cols_log2 + current->tile_rows_log2,
+           context_update_tile_id);
+        fb(2, tile_size_bytes_minus1);
+    } else {
+        infer(context_update_tile_id, 0);
+    }
+
+    priv->tile_cols = current->tile_cols;
+    priv->tile_rows = current->tile_rows;
+
+    return 0;
+}
+
+static int FUNC(quantization_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int err;
+
+    fb(8, base_q_idx);
+
+    delta_q(delta_q_y_dc);
+
+    if (priv->num_planes > 1) {
+        if (seq->color_config.separate_uv_delta_q)
+            flag(diff_uv_delta);
+        else
+            infer(diff_uv_delta, 0);
+
+        delta_q(delta_q_u_dc);
+        delta_q(delta_q_u_ac);
+
+        if (current->diff_uv_delta) {
+            delta_q(delta_q_v_dc);
+            delta_q(delta_q_v_ac);
+        } else {
+            infer(delta_q_v_dc, current->delta_q_u_dc);
+            infer(delta_q_v_ac, current->delta_q_u_ac);
+        }
+    } else {
+        infer(delta_q_u_dc, 0);
+        infer(delta_q_u_ac, 0);
+        infer(delta_q_v_dc, 0);
+        infer(delta_q_v_ac, 0);
+    }
+
+    flag(using_qmatrix);
+    if (current->using_qmatrix) {
+        fb(4, qm_y);
+        fb(4, qm_u);
+        if (seq->color_config.separate_uv_delta_q)
+            fb(4, qm_v);
+        else
+            infer(qm_v, current->qm_u);
+    }
+
+    return 0;
+}
+
+static int FUNC(segmentation_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     AV1RawFrameHeader *current)
+{
+    static const uint8_t bits[AV1_SEG_LVL_MAX] = { 8, 6, 6, 6, 6, 3, 0, 0 };
+    static const uint8_t sign[AV1_SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0, 0 };
+    int i, j, err;
+
+    flag(segmentation_enabled);
+
+    if (current->segmentation_enabled) {
+        if (current->primary_ref_frame == AV1_PRIMARY_REF_NONE) {
+            infer(segmentation_update_map,      1);
+            infer(segmentation_temporal_update, 0);
+            infer(segmentation_update_data,     1);
+        } else {
+            flag(segmentation_update_map);
+            if (current->segmentation_update_map)
+                flag(segmentation_temporal_update);
+            else
+                infer(segmentation_temporal_update, 0);
+            flag(segmentation_update_data);
+        }
+
+        if (current->segmentation_update_data) {
+            for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
+                for (j = 0; j < AV1_SEG_LVL_MAX; j++) {
+                    flags(feature_enabled[i][j], 2, i, j);
+
+                    if (current->feature_enabled[i][j] && bits[j] > 0) {
+                        if (sign[j])
+                            sus(1 + bits[j], feature_value[i][j], 2, i, j);
+                        else
+                            fbs(bits[j], feature_value[i][j], 2, i, j);
+                    } else {
+                        infer(feature_value[i][j], 0);
+                    }
+                }
+            }
+        }
+    } else {
+        for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
+            for (j = 0; j < AV1_SEG_LVL_MAX; j++) {
+                infer(feature_enabled[i][j], 0);
+                infer(feature_value[i][j],   0);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(delta_q_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                AV1RawFrameHeader *current)
+{
+    int err;
+
+    if (current->base_q_idx > 0)
+        flag(delta_q_present);
+    else
+        infer(delta_q_present, 0);
+
+    if (current->delta_q_present)
+        fb(2, delta_q_res);
+
+    return 0;
+}
+
+static int FUNC(delta_lf_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                 AV1RawFrameHeader *current)
+{
+    int err;
+
+    if (current->delta_q_present) {
+        if (!current->allow_intrabc)
+            flag(delta_lf_present);
+        else
+            infer(delta_lf_present, 0);
+        if (current->delta_lf_present) {
+            fb(2, delta_lf_res);
+            flag(delta_lf_multi);
+        } else {
+            infer(delta_lf_res,   0);
+            infer(delta_lf_multi, 0);
+        }
+    } else {
+        infer(delta_lf_present, 0);
+        infer(delta_lf_res,     0);
+        infer(delta_lf_multi,   0);
+    }
+
+    return 0;
+}
+
+static int FUNC(loop_filter_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                    AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int i, err;
+
+    if (priv->coded_lossless || current->allow_intrabc) {
+        infer(loop_filter_level[0], 0);
+        infer(loop_filter_level[1], 0);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_INTRA],    1);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_LAST],     0);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_LAST2],    0);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_LAST3],    0);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_BWDREF],   0);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_GOLDEN],  -1);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_ALTREF],  -1);
+        infer(loop_filter_ref_deltas[AV1_REF_FRAME_ALTREF2], -1);
+        for (i = 0; i < 2; i++)
+            infer(loop_filter_mode_deltas[i], 0);
+        return 0;
+    }
+
+    fb(6, loop_filter_level[0]);
+    fb(6, loop_filter_level[1]);
+
+    if (priv->num_planes > 1) {
+        if (current->loop_filter_level[0] ||
+            current->loop_filter_level[1]) {
+            fb(6, loop_filter_level[2]);
+            fb(6, loop_filter_level[3]);
+        }
+    }
+
+    fb(3, loop_filter_sharpness);
+
+    flag(loop_filter_delta_enabled);
+    if (current->loop_filter_delta_enabled) {
+        flag(loop_filter_delta_update);
+        if (current->loop_filter_delta_update) {
+            for (i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) {
+                flags(update_ref_delta[i], 1, i);
+                if (current->update_ref_delta[i])
+                    sus(1 + 6, loop_filter_ref_deltas[i], 1, i);
+            }
+            for (i = 0; i < 2; i++) {
+                flags(update_mode_delta[i], 1, i);
+                if (current->update_mode_delta[i])
+                    sus(1 + 6, loop_filter_mode_deltas[i], 1, i);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(cdef_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                             AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int i, err;
+
+    if (priv->coded_lossless || current->allow_intrabc ||
+        !seq->enable_cdef) {
+        infer(cdef_damping_minus_3, 0);
+        infer(cdef_bits, 0);
+        infer(cdef_y_pri_strength[0],  0);
+        infer(cdef_y_sec_strength[0],  0);
+        infer(cdef_uv_pri_strength[0], 0);
+        infer(cdef_uv_sec_strength[0], 0);
+
+        return 0;
+    }
+
+    fb(2, cdef_damping_minus_3);
+    fb(2, cdef_bits);
+
+    for (i = 0; i < (1 << current->cdef_bits); i++) {
+        fbs(4, cdef_y_pri_strength[i], 1, i);
+        fbs(2, cdef_y_sec_strength[i], 1, i);
+
+        if (priv->num_planes > 1) {
+            fbs(4, cdef_uv_pri_strength[i], 1, i);
+            fbs(2, cdef_uv_sec_strength[i], 1, i);
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(lr_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                           AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int uses_lr,  uses_chroma_lr;
+    int i, err;
+
+    if (priv->all_lossless || current->allow_intrabc ||
+        !seq->enable_restoration) {
+        return 0;
+    }
+
+    uses_lr = uses_chroma_lr = 0;
+    for (i = 0; i < priv->num_planes; i++) {
+        fbs(2, lr_type[i], 1, i);
+
+        if (current->lr_type[i] != 0) {
+            uses_lr = 1;
+            if (i > 0)
+                uses_chroma_lr = 1;
+        }
+    }
+
+    if (uses_lr) {
+        if (seq->use_128x128_superblock)
+            increment(lr_unit_shift, 1, 2);
+        else
+            increment(lr_unit_shift, 0, 2);
+
+        if(seq->color_config.subsampling_x &&
+           seq->color_config.subsampling_y && uses_chroma_lr) {
+            fb(1, lr_uv_shift);
+        } else {
+            infer(lr_uv_shift, 0);
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(read_tx_mode)(CodedBitstreamContext *ctx, RWContext *rw,
+                              AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int err;
+
+    if (priv->coded_lossless)
+        infer(tx_mode, 0);
+    else
+        increment(tx_mode, 1, 2);
+
+    return 0;
+}
+
+static int FUNC(frame_reference_mode)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      AV1RawFrameHeader *current)
+{
+    int err;
+
+    if (current->frame_type == AV1_FRAME_INTRA_ONLY ||
+        current->frame_type == AV1_FRAME_KEY)
+        infer(reference_select, 0);
+    else
+        flag(reference_select);
+
+    return 0;
+}
+
+static int FUNC(skip_mode_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                  AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int skip_mode_allowed;
+    int err;
+
+    if (current->frame_type == AV1_FRAME_KEY ||
+        current->frame_type == AV1_FRAME_INTRA_ONLY ||
+        !current->reference_select || !seq->enable_order_hint) {
+        skip_mode_allowed = 0;
+    } else {
+        int forward_idx,  backward_idx;
+        int forward_hint, backward_hint;
+        int ref_hint, dist, i;
+
+        forward_idx  = -1;
+        backward_idx = -1;
+        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+            ref_hint = priv->ref[i].order_hint;
+            dist = cbs_av1_get_relative_dist(seq, ref_hint,
+                                             current->order_hint);
+            if (dist < 0) {
+                if (forward_idx < 0 ||
+                    cbs_av1_get_relative_dist(seq, ref_hint,
+                                              forward_hint) > 0) {
+                    forward_idx  = i;
+                    forward_hint = ref_hint;
+                }
+            } else if (dist > 0) {
+                if (backward_idx < 0 ||
+                    cbs_av1_get_relative_dist(seq, ref_hint,
+                                              backward_hint) < 0) {
+                    backward_idx  = i;
+                    backward_hint = ref_hint;
+                }
+            }
+        }
+
+        if (forward_idx < 0) {
+            skip_mode_allowed = 0;
+        } else if (backward_idx >= 0) {
+            skip_mode_allowed = 1;
+            // Frames for skip mode are forward_idx and backward_idx.
+        } else {
+            int second_forward_idx;
+            int second_forward_hint;
+
+            second_forward_idx = -1;
+            for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+                ref_hint = priv->ref[i].order_hint;
+                if (cbs_av1_get_relative_dist(seq, ref_hint,
+                                              forward_hint) < 0) {
+                    if (second_forward_idx < 0 ||
+                        cbs_av1_get_relative_dist(seq, ref_hint,
+                                                  second_forward_hint) > 0) {
+                        second_forward_idx  = i;
+                        second_forward_hint = ref_hint;
+                    }
+                }
+            }
+
+            if (second_forward_idx < 0) {
+                skip_mode_allowed = 0;
+            } else {
+                skip_mode_allowed = 1;
+                // Frames for skip mode are forward_idx and second_forward_idx.
+            }
+        }
+    }
+
+    if (skip_mode_allowed)
+        flag(skip_mode_present);
+    else
+        infer(skip_mode_present, 0);
+
+    return 0;
+}
+
+static int FUNC(global_motion_param)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     AV1RawFrameHeader *current,
+                                     int type, int ref, int idx)
+{
+    uint32_t abs_bits, prec_bits, num_syms;
+    int err;
+
+    if (idx < 2) {
+        if (type == AV1_WARP_MODEL_TRANSLATION) {
+            abs_bits  = AV1_GM_ABS_TRANS_ONLY_BITS  - !current->allow_high_precision_mv;
+            prec_bits = AV1_GM_TRANS_ONLY_PREC_BITS - !current->allow_high_precision_mv;
+        } else {
+            abs_bits  = AV1_GM_ABS_TRANS_BITS;
+            prec_bits = AV1_GM_TRANS_PREC_BITS;
+        }
+    } else {
+        abs_bits  = AV1_GM_ABS_ALPHA_BITS;
+        prec_bits = AV1_GM_ALPHA_PREC_BITS;
+    }
+
+    num_syms = 2 * (1 << abs_bits) + 1;
+    subexp(gm_params[ref][idx], num_syms, 2, ref, idx);
+
+    // Actual gm_params value is not reconstructed here.
+    (void)prec_bits;
+
+    return 0;
+}
+
+static int FUNC(global_motion_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      AV1RawFrameHeader *current)
+{
+    int ref, type;
+    int err;
+
+    if (current->frame_type == AV1_FRAME_KEY ||
+        current->frame_type == AV1_FRAME_INTRA_ONLY)
+        return 0;
+
+    for (ref = AV1_REF_FRAME_LAST; ref <= AV1_REF_FRAME_ALTREF; ref++) {
+        flags(is_global[ref], 1, ref);
+        if (current->is_global[ref]) {
+            flags(is_rot_zoom[ref], 1, ref);
+            if (current->is_rot_zoom[ref]) {
+                type = AV1_WARP_MODEL_ROTZOOM;
+            } else {
+                flags(is_translation[ref], 1, ref);
+                type = current->is_translation[ref] ? AV1_WARP_MODEL_TRANSLATION
+                                                    : AV1_WARP_MODEL_AFFINE;
+            }
+        } else {
+            type = AV1_WARP_MODEL_IDENTITY;
+        }
+
+        if (type >= AV1_WARP_MODEL_ROTZOOM) {
+            CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 2));
+            CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 3));
+            if (type == AV1_WARP_MODEL_AFFINE) {
+                CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 4));
+                CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 5));
+            } else {
+                // gm_params[ref][4] = -gm_params[ref][3]
+                // gm_params[ref][5] =  gm_params[ref][2]
+            }
+        }
+        if (type >= AV1_WARP_MODEL_TRANSLATION) {
+            CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 0));
+            CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 1));
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(film_grain_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                   AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context  *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq = priv->sequence_header;
+    int num_pos_luma, num_pos_chroma;
+    int i, err;
+
+    if (!seq->film_grain_params_present ||
+        (!current->show_frame && !current->showable_frame))
+        return 0;
+
+    flag(apply_grain);
+
+    if (!current->apply_grain)
+        return 0;
+
+    fb(16, grain_seed);
+
+    if (current->frame_type == AV1_FRAME_INTER)
+        flag(update_grain);
+    else
+        infer(update_grain, 1);
+
+    if (!current->update_grain) {
+        fb(3, film_grain_params_ref_idx);
+        return 0;
+    }
+
+    fb(4, num_y_points);
+    for (i = 0; i < current->num_y_points; i++) {
+        fbs(8, point_y_value[i],   1, i);
+        fbs(8, point_y_scaling[i], 1, i);
+    }
+
+    if (seq->color_config.mono_chrome)
+        infer(chroma_scaling_from_luma, 0);
+    else
+        flag(chroma_scaling_from_luma);
+
+    if (seq->color_config.mono_chrome ||
+        current->chroma_scaling_from_luma ||
+        (seq->color_config.subsampling_x == 1 &&
+         seq->color_config.subsampling_y == 1 &&
+         current->num_y_points == 0)) {
+        infer(num_cb_points, 0);
+        infer(num_cr_points, 0);
+    } else {
+        fb(4, num_cb_points);
+        for (i = 0; i < current->num_cb_points; i++) {
+            fbs(8, point_cb_value[i],   1, i);
+            fbs(8, point_cb_scaling[i], 1, i);
+        }
+        fb(4, num_cr_points);
+        for (i = 0; i < current->num_cr_points; i++) {
+            fbs(8, point_cr_value[i],   1, i);
+            fbs(8, point_cr_scaling[i], 1, i);
+        }
+    }
+
+    fb(2, grain_scaling_minus_8);
+    fb(2, ar_coeff_lag);
+    num_pos_luma = 2 * current->ar_coeff_lag * (current->ar_coeff_lag + 1);
+    if (current->num_y_points) {
+        num_pos_chroma = num_pos_luma + 1;
+        for (i = 0; i < num_pos_luma; i++)
+            fbs(8, ar_coeffs_y_plus_128[i], 1, i);
+    } else {
+        num_pos_chroma = num_pos_luma;
+    }
+    if (current->chroma_scaling_from_luma || current->num_cb_points) {
+        for (i = 0; i < num_pos_chroma; i++)
+            fbs(8, ar_coeffs_cb_plus_128[i], 1, i);
+    }
+    if (current->chroma_scaling_from_luma || current->num_cr_points) {
+        for (i = 0; i < num_pos_chroma; i++)
+            fbs(8, ar_coeffs_cr_plus_128[i], 1, i);
+    }
+    fb(2, ar_coeff_shift_minus_6);
+    fb(2, grain_scale_shift);
+    if (current->num_cb_points) {
+        fb(8, cb_mult);
+        fb(8, cb_luma_mult);
+        fb(9, cb_offset);
+    }
+    if (current->num_cr_points) {
+        fb(8, cr_mult);
+        fb(8, cr_luma_mult);
+        fb(9, cr_offset);
+    }
+
+    flag(overlap_flag);
+    flag(clip_to_restricted_range);
+
+    return 0;
+}
+
+static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     AV1RawFrameHeader *current)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    const AV1RawSequenceHeader *seq;
+    int id_len, diff_len, all_frames, frame_is_intra, order_hint_bits;
+    int i, err;
+
+    if (!priv->sequence_header) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "No sequence header available: "
+               "unable to decode frame header.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    seq = priv->sequence_header;
+
+    id_len = seq->additional_frame_id_length_minus_1 +
+             seq->delta_frame_id_length_minus_2 + 3;
+    all_frames = (1 << AV1_NUM_REF_FRAMES) - 1;
+
+    if (seq->reduced_still_picture_header) {
+        infer(show_existing_frame, 0);
+        infer(frame_type,     AV1_FRAME_KEY);
+        infer(show_frame,     1);
+        infer(showable_frame, 0);
+        frame_is_intra = 1;
+
+    } else {
+        flag(show_existing_frame);
+
+        if (current->show_existing_frame) {
+            AV1ReferenceFrameState *frame;
+
+            fb(3, frame_to_show_map_idx);
+            frame = &priv->ref[current->frame_to_show_map_idx];
+
+            if (seq->decoder_model_info_present_flag &&
+                !seq->timing_info.equal_picture_interval) {
+                fb(seq->decoder_model_info.frame_presentation_time_length_minus_1 + 1,
+                   frame_presentation_time);
+            }
+
+            if (seq->frame_id_numbers_present_flag)
+                fb(id_len, display_frame_id);
+
+            if (frame->frame_type == AV1_FRAME_KEY)
+                infer(refresh_frame_flags, all_frames);
+            else
+                infer(refresh_frame_flags, 0);
+
+            return 0;
+        }
+
+        fb(2, frame_type);
+        frame_is_intra = (current->frame_type == AV1_FRAME_INTRA_ONLY ||
+                          current->frame_type == AV1_FRAME_KEY);
+
+        flag(show_frame);
+        if (current->show_frame &&
+            seq->decoder_model_info_present_flag &&
+            !seq->timing_info.equal_picture_interval) {
+            fb(seq->decoder_model_info.frame_presentation_time_length_minus_1 + 1,
+               frame_presentation_time);
+        }
+        if (current->show_frame)
+            infer(showable_frame, current->frame_type != AV1_FRAME_KEY);
+        else
+            flag(showable_frame);
+
+        if (current->frame_type == AV1_FRAME_SWITCH ||
+            (current->frame_type == AV1_FRAME_KEY && current->show_frame))
+            infer(error_resilient_mode, 1);
+        else
+            flag(error_resilient_mode);
+    }
+
+    if (current->frame_type == AV1_FRAME_KEY && current->show_frame) {
+        for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
+            priv->ref[i].valid = 0;
+            priv->ref[i].order_hint = 0;
+        }
+    }
+
+    flag(disable_cdf_update);
+
+    if (seq->seq_force_screen_content_tools ==
+        AV1_SELECT_SCREEN_CONTENT_TOOLS) {
+        flag(allow_screen_content_tools);
+    } else {
+        infer(allow_screen_content_tools,
+              seq->seq_force_screen_content_tools);
+    }
+    if (current->allow_screen_content_tools) {
+        if (seq->seq_force_integer_mv == AV1_SELECT_INTEGER_MV)
+            flag(force_integer_mv);
+        else
+            infer(force_integer_mv, seq->seq_force_integer_mv);
+    } else {
+        infer(force_integer_mv, 0);
+    }
+
+    if (seq->frame_id_numbers_present_flag) {
+        fb(id_len, current_frame_id);
+
+        diff_len = seq->delta_frame_id_length_minus_2 + 2;
+        for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
+            if (current->current_frame_id > (1 << diff_len)) {
+                if (priv->ref[i].frame_id > current->current_frame_id ||
+                    priv->ref[i].frame_id < (current->current_frame_id -
+                                             (1 << diff_len)))
+                    priv->ref[i].valid = 0;
+            } else {
+                if (priv->ref[i].frame_id > current->current_frame_id &&
+                    priv->ref[i].frame_id < ((1 << id_len) +
+                                             current->current_frame_id -
+                                             (1 << diff_len)))
+                    priv->ref[i].valid = 0;
+            }
+        }
+    } else {
+        infer(current_frame_id, 0);
+    }
+
+    if (current->frame_type == AV1_FRAME_SWITCH)
+        infer(frame_size_override_flag, 1);
+    else if(seq->reduced_still_picture_header)
+        infer(frame_size_override_flag, 0);
+    else
+        flag(frame_size_override_flag);
+
+    order_hint_bits =
+        seq->enable_order_hint ? seq->order_hint_bits_minus_1 + 1 : 0;
+    if (order_hint_bits > 0)
+        fb(order_hint_bits, order_hint);
+    else
+        infer(order_hint, 0);
+
+    if (frame_is_intra || current->error_resilient_mode)
+        infer(primary_ref_frame, AV1_PRIMARY_REF_NONE);
+    else
+        fb(3, primary_ref_frame);
+
+    if (seq->decoder_model_info_present_flag) {
+        flag(buffer_removal_time_present_flag);
+        if (current->buffer_removal_time_present_flag) {
+            for (i = 0; i <= seq->operating_points_cnt_minus_1; i++) {
+                if (seq->decoder_model_present_for_this_op[i]) {
+                    int op_pt_idc = seq->operating_point_idc[i];
+                    int in_temporal_layer = (op_pt_idc >>  priv->temporal_id    ) & 1;
+                    int in_spatial_layer  = (op_pt_idc >> (priv->spatial_id + 8)) & 1;
+                    if (seq->operating_point_idc[i] == 0 ||
+                        in_temporal_layer || in_spatial_layer) {
+                        fbs(seq->decoder_model_info.buffer_removal_time_length_minus_1 + 1,
+                            buffer_removal_time[i], 1, i);
+                    }
+                }
+            }
+        }
+    }
+
+    if (current->frame_type == AV1_FRAME_SWITCH ||
+        (current->frame_type == AV1_FRAME_KEY && current->show_frame))
+        infer(refresh_frame_flags, all_frames);
+    else
+        fb(8, refresh_frame_flags);
+
+    if (!frame_is_intra || current->refresh_frame_flags != all_frames) {
+        if (current->error_resilient_mode && seq->enable_order_hint) {
+            for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
+                fbs(order_hint_bits, ref_order_hint[i], 1, i);
+                if (current->ref_order_hint[i] != priv->ref[i].order_hint)
+                    priv->ref[i].valid = 0;
+            }
+        }
+    }
+
+    if (current->frame_type == AV1_FRAME_KEY ||
+        current->frame_type == AV1_FRAME_INTRA_ONLY) {
+        CHECK(FUNC(frame_size)(ctx, rw, current));
+        CHECK(FUNC(render_size)(ctx, rw, current));
+
+        if (current->allow_screen_content_tools &&
+            priv->upscaled_width == priv->frame_width)
+            flag(allow_intrabc);
+        else
+            infer(allow_intrabc, 0);
+
+    } else {
+        if (!seq->enable_order_hint) {
+            infer(frame_refs_short_signaling, 0);
+        } else {
+            flag(frame_refs_short_signaling);
+            if (current->frame_refs_short_signaling) {
+                fb(3, last_frame_idx);
+                fb(3, golden_frame_idx);
+
+                for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+                    if (i == 0)
+                        infer(ref_frame_idx[i], current->last_frame_idx);
+                    else if (i == AV1_REF_FRAME_GOLDEN -
+                                  AV1_REF_FRAME_LAST)
+                        infer(ref_frame_idx[i], current->golden_frame_idx);
+                    else
+                        infer(ref_frame_idx[i], -1);
+                }
+            }
+        }
+
+        for (i = 0; i < AV1_REFS_PER_FRAME; i++) {
+            if (!current->frame_refs_short_signaling)
+                fbs(3, ref_frame_idx[i], 1, i);
+            if (seq->frame_id_numbers_present_flag) {
+                fbs(seq->delta_frame_id_length_minus_2 + 2,
+                    delta_frame_id_minus1[i], 1, i);
+            }
+        }
+
+        if (current->frame_size_override_flag &&
+            !current->error_resilient_mode) {
+            CHECK(FUNC(frame_size_with_refs)(ctx, rw, current));
+        } else {
+            CHECK(FUNC(frame_size)(ctx, rw, current));
+            CHECK(FUNC(render_size)(ctx, rw, current));
+        }
+
+        if (current->force_integer_mv)
+            infer(allow_high_precision_mv, 0);
+        else
+            flag(allow_high_precision_mv);
+
+        CHECK(FUNC(interpolation_filter)(ctx, rw, current));
+
+        flag(is_motion_mode_switchable);
+
+        if (current->error_resilient_mode ||
+            !seq->enable_ref_frame_mvs)
+            infer(use_ref_frame_mvs, 0);
+        else
+            flag(use_ref_frame_mvs);
+
+        infer(allow_intrabc, 0);
+    }
+
+    if (!frame_is_intra) {
+        // Derive reference frame sign biases.
+    }
+
+    if (seq->reduced_still_picture_header || current->disable_cdf_update)
+        infer(disable_frame_end_update_cdf, 1);
+    else
+        flag(disable_frame_end_update_cdf);
+
+    if (current->primary_ref_frame == AV1_PRIMARY_REF_NONE) {
+        // Init non-coeff CDFs.
+        // Setup past independence.
+    } else {
+        // Load CDF tables from previous frame.
+        // Load params from previous frame.
+    }
+
+    if (current->use_ref_frame_mvs) {
+        // Perform motion field estimation process.
+    }
+
+    CHECK(FUNC(tile_info)(ctx, rw, current));
+
+    CHECK(FUNC(quantization_params)(ctx, rw, current));
+
+    CHECK(FUNC(segmentation_params)(ctx, rw, current));
+
+    CHECK(FUNC(delta_q_params)(ctx, rw, current));
+
+    CHECK(FUNC(delta_lf_params)(ctx, rw, current));
+
+    // Init coeff CDFs / load previous segments.
+
+    priv->coded_lossless = 1;
+    for (i = 0; i < AV1_MAX_SEGMENTS; i++) {
+        int qindex;
+        if (current->feature_enabled[i][AV1_SEG_LVL_ALT_Q]) {
+            qindex = (current->base_q_idx +
+                      current->feature_value[i][AV1_SEG_LVL_ALT_Q]);
+        } else {
+            qindex = current->base_q_idx;
+        }
+        qindex = av_clip_uintp2(qindex, 8);
+
+        if (qindex                || current->delta_q_y_dc ||
+            current->delta_q_u_ac || current->delta_q_u_dc ||
+            current->delta_q_v_ac || current->delta_q_v_dc) {
+            priv->coded_lossless = 0;
+        }
+    }
+    priv->all_lossless = priv->coded_lossless &&
+        priv->frame_width == priv->upscaled_width;
+
+    CHECK(FUNC(loop_filter_params)(ctx, rw, current));
+
+    CHECK(FUNC(cdef_params)(ctx, rw, current));
+
+    CHECK(FUNC(lr_params)(ctx, rw, current));
+
+    CHECK(FUNC(read_tx_mode)(ctx, rw, current));
+
+    CHECK(FUNC(frame_reference_mode)(ctx, rw, current));
+
+    CHECK(FUNC(skip_mode_params)(ctx, rw, current));
+
+    if (frame_is_intra || current->error_resilient_mode ||
+        !seq->enable_warped_motion)
+        infer(allow_warped_motion, 0);
+    else
+        flag(allow_warped_motion);
+
+    flag(reduced_tx_set);
+
+    CHECK(FUNC(global_motion_params)(ctx, rw, current));
+
+    CHECK(FUNC(film_grain_params)(ctx, rw, current));
+
+    for (i = 0; i < AV1_NUM_REF_FRAMES; i++) {
+        if (current->refresh_frame_flags & (1 << i)) {
+            priv->ref[i] = (AV1ReferenceFrameState) {
+                .valid          = 1,
+                .frame_id       = current->current_frame_id,
+                .upscaled_width = priv->upscaled_width,
+                .frame_width    = priv->frame_width,
+                .frame_height   = priv->frame_height,
+                .render_width   = priv->render_width,
+                .render_height  = priv->render_height,
+                .frame_type     = current->frame_type,
+                .subsampling_x  = seq->color_config.subsampling_x,
+                .subsampling_y  = seq->color_config.subsampling_y,
+                .bit_depth      = priv->bit_depth,
+                .order_hint     = current->order_hint,
+            };
+        }
+    }
+
+    av_log(ctx->log_ctx, AV_LOG_DEBUG, "Frame %d:  size %dx%d  "
+           "upscaled %d  render %dx%d  subsample %dx%d  "
+           "bitdepth %d  tiles %dx%d.\n", current->order_hint,
+           priv->frame_width, priv->frame_height, priv->upscaled_width,
+           priv->render_width, priv->render_height,
+           seq->color_config.subsampling_x + 1,
+           seq->color_config.subsampling_y + 1, priv->bit_depth,
+           priv->tile_rows, priv->tile_cols);
+
+    return 0;
+}
+
+static int FUNC(frame_header_obu)(CodedBitstreamContext *ctx, RWContext *rw,
+                                  AV1RawFrameHeader *current, int redundant,
+                                  AVBufferRef *rw_buffer_ref)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int start_pos, fh_bits, fh_bytes, err;
+    uint8_t *fh_start;
+
+    if (priv->seen_frame_header) {
+        if (!redundant) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid repeated "
+                   "frame header OBU.\n");
+            return AVERROR_INVALIDDATA;
+        } else {
+            GetBitContext fh;
+            size_t i, b;
+            uint32_t val;
+
+            HEADER("Redundant Frame Header");
+
+            av_assert0(priv->frame_header_ref && priv->frame_header);
+
+            init_get_bits(&fh, priv->frame_header,
+                          priv->frame_header_size);
+            for (i = 0; i < priv->frame_header_size; i += 8) {
+                b = FFMIN(priv->frame_header_size - i, 8);
+                val = get_bits(&fh, b);
+                xf(b, frame_header_copy[i],
+                   val, val, val, 1, i / 8);
+            }
+        }
+    } else {
+        if (redundant)
+            HEADER("Redundant Frame Header (used as Frame Header)");
+        else
+            HEADER("Frame Header");
+
+        priv->seen_frame_header = 1;
+
+#ifdef READ
+        start_pos = get_bits_count(rw);
+#else
+        start_pos = put_bits_count(rw);
+#endif
+
+        CHECK(FUNC(uncompressed_header)(ctx, rw, current));
+
+        if (current->show_existing_frame) {
+            priv->seen_frame_header = 0;
+        } else {
+            priv->seen_frame_header = 1;
+
+            av_buffer_unref(&priv->frame_header_ref);
+
+#ifdef READ
+            fh_bits  = get_bits_count(rw) - start_pos;
+            fh_start = (uint8_t*)rw->buffer + start_pos / 8;
+#else
+            // Need to flush the bitwriter so that we can copy its output,
+            // but use a copy so we don't affect the caller's structure.
+            {
+                PutBitContext tmp = *rw;
+                flush_put_bits(&tmp);
+            }
+
+            fh_bits  = put_bits_count(rw) - start_pos;
+            fh_start = rw->buf + start_pos / 8;
+#endif
+            fh_bytes = (fh_bits + 7) / 8;
+
+            priv->frame_header_size = fh_bits;
+
+            if (rw_buffer_ref) {
+                priv->frame_header_ref = av_buffer_ref(rw_buffer_ref);
+                if (!priv->frame_header_ref)
+                    return AVERROR(ENOMEM);
+                priv->frame_header = fh_start;
+            } else {
+                priv->frame_header_ref =
+                    av_buffer_alloc(fh_bytes + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!priv->frame_header_ref)
+                    return AVERROR(ENOMEM);
+                priv->frame_header = priv->frame_header_ref->data;
+                memcpy(priv->frame_header, fh_start, fh_bytes);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(tile_group_obu)(CodedBitstreamContext *ctx, RWContext *rw,
+                                AV1RawTileGroup *current)
+{
+    CodedBitstreamAV1Context *priv = ctx->priv_data;
+    int num_tiles, tile_bits;
+    int err;
+
+    HEADER("Tile Group");
+
+    num_tiles = priv->tile_cols * priv->tile_rows;
+    if (num_tiles > 1)
+        flag(tile_start_and_end_present_flag);
+    else
+        infer(tile_start_and_end_present_flag, 0);
+
+    if (num_tiles == 1 || !current->tile_start_and_end_present_flag) {
+        infer(tg_start, 0);
+        infer(tg_end, num_tiles - 1);
+    } else {
+        tile_bits = cbs_av1_tile_log2(1, priv->tile_cols) +
+                    cbs_av1_tile_log2(1, priv->tile_rows);
+        fb(tile_bits, tg_start);
+        fb(tile_bits, tg_end);
+    }
+
+    CHECK(FUNC(byte_alignment)(ctx, rw));
+
+    // Reset header for next frame.
+    if (current->tg_end == num_tiles - 1)
+        priv->seen_frame_header = 0;
+
+    // Tile data follows.
+
+    return 0;
+}
+
+static int FUNC(frame_obu)(CodedBitstreamContext *ctx, RWContext *rw,
+                           AV1RawFrame *current,
+                           AVBufferRef *rw_buffer_ref)
+{
+    int err;
+
+    CHECK(FUNC(frame_header_obu)(ctx, rw, &current->header,
+                                 0, rw_buffer_ref));
+
+    CHECK(FUNC(byte_alignment)(ctx, rw));
+
+    CHECK(FUNC(tile_group_obu)(ctx, rw, &current->tile_group));
+
+    return 0;
+}
+
+static int FUNC(tile_list_obu)(CodedBitstreamContext *ctx, RWContext *rw,
+                               AV1RawTileList *current)
+{
+    int err;
+
+    fb(8, output_frame_width_in_tiles_minus_1);
+    fb(8, output_frame_height_in_tiles_minus_1);
+
+    fb(16, tile_count_minus_1);
+
+    // Tile data follows.
+
+    return 0;
+}
+
+static int FUNC(metadata_hdr_cll)(CodedBitstreamContext *ctx, RWContext *rw,
+                                  AV1RawMetadataHDRCLL *current)
+{
+    int err;
+
+    fb(16, max_cll);
+    fb(16, max_fall);
+
+    return 0;
+}
+
+static int FUNC(metadata_hdr_mdcv)(CodedBitstreamContext *ctx, RWContext *rw,
+                                   AV1RawMetadataHDRMDCV *current)
+{
+    int err, i;
+
+    for (i = 0; i < 3; i++) {
+        fcs(16, primary_chromaticity_x[i], 0, 50000, 1, i);
+        fcs(16, primary_chromaticity_y[i], 0, 50000, 1, i);
+    }
+
+    fc(16, white_point_chromaticity_x, 0, 50000);
+    fc(16, white_point_chromaticity_y, 0, 50000);
+
+    fc(32, luminance_max, 1, MAX_UINT_BITS(32));
+    fc(32, luminance_min, 0, current->luminance_max >> 6);
+
+    return 0;
+}
+
+static int FUNC(metadata_scalability)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      AV1RawMetadataScalability *current)
+{
+    // TODO: scalability metadata.
+
+    return AVERROR_PATCHWELCOME;
+}
+
+static int FUNC(metadata_itut_t35)(CodedBitstreamContext *ctx, RWContext *rw,
+                                   AV1RawMetadataITUTT35 *current)
+{
+    int err;
+    size_t i;
+
+    fb(8, itu_t_t35_country_code);
+    if (current->itu_t_t35_country_code == 0xff)
+        fb(8, itu_t_t35_country_code_extension_byte);
+
+#ifdef READ
+    // The payload runs up to the start of the trailing bits, but there might
+    // be arbitrarily many trailing zeroes so we need to read through twice.
+    {
+        GetBitContext tmp = *rw;
+        current->payload_size = 0;
+        for (i = 0; get_bits_left(rw) >= 8; i++) {
+            if (get_bits(rw, 8))
+                current->payload_size = i;
+        }
+        *rw = tmp;
+    }
+
+    current->payload_ref = av_buffer_alloc(current->payload_size);
+    if (!current->payload_ref)
+        return AVERROR(ENOMEM);
+    current->payload = current->payload_ref->data;
+#endif
+
+    for (i = 0; i < current->payload_size; i++)
+        xf(8, itu_t_t35_payload_bytes[i], current->payload[i],
+           0x00, 0xff, 1, i);
+
+    return 0;
+}
+
+static int FUNC(metadata_timecode)(CodedBitstreamContext *ctx, RWContext *rw,
+                                   AV1RawMetadataTimecode *current)
+{
+    int err;
+
+    fb(5, counting_type);
+    flag(full_timestamp_flag);
+    flag(discontinuity_flag);
+    flag(cnt_dropped_flag);
+    fb(9, n_frames);
+
+    if (current->full_timestamp_flag) {
+        fb(6, seconds_value);
+        fb(6, minutes_value);
+        fb(5, hours_value);
+    } else {
+        flag(seconds_flag);
+        if (current->seconds_flag) {
+            fb(6, seconds_value);
+            flag(minutes_flag);
+            if (current->minutes_flag) {
+                fb(6, minutes_value);
+                flag(hours_flag);
+                if (current->hours_flag)
+                    fb(5, hours_value);
+            }
+        }
+    }
+
+    fb(5, time_offset_length);
+    if (current->time_offset_length > 0)
+        fb(current->time_offset_length, time_offset_value);
+
+    return 0;
+}
+
+static int FUNC(metadata_obu)(CodedBitstreamContext *ctx, RWContext *rw,
+                              AV1RawMetadata *current)
+{
+    int err;
+
+    leb128(metadata_type);
+
+    switch (current->metadata_type) {
+    case AV1_METADATA_TYPE_HDR_CLL:
+        CHECK(FUNC(metadata_hdr_cll)(ctx, rw, &current->metadata.hdr_cll));
+        break;
+    case AV1_METADATA_TYPE_HDR_MDCV:
+        CHECK(FUNC(metadata_hdr_mdcv)(ctx, rw, &current->metadata.hdr_mdcv));
+        break;
+    case AV1_METADATA_TYPE_SCALABILITY:
+        CHECK(FUNC(metadata_scalability)(ctx, rw, &current->metadata.scalability));
+        break;
+    case AV1_METADATA_TYPE_ITUT_T35:
+        CHECK(FUNC(metadata_itut_t35)(ctx, rw, &current->metadata.itut_t35));
+        break;
+    case AV1_METADATA_TYPE_TIMECODE:
+        CHECK(FUNC(metadata_timecode)(ctx, rw, &current->metadata.timecode));
+        break;
+    default:
+        // Unknown metadata type.
+        return AVERROR_PATCHWELCOME;
+    }
+
+    return 0;
+}
diff --git a/libavcodec/cbs_h264.h b/libavcodec/cbs_h264.h
index 5a7dc27..92277e4 100644
--- a/libavcodec/cbs_h264.h
+++ b/libavcodec/cbs_h264.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -264,6 +264,17 @@ typedef struct H264RawSEIPicTiming {
     H264RawSEIPicTimestamp timestamp[3];
 } H264RawSEIPicTiming;
 
+typedef struct H264RawSEIPanScanRect {
+    uint32_t pan_scan_rect_id;
+    uint8_t  pan_scan_rect_cancel_flag;
+    uint8_t  pan_scan_cnt_minus1;
+    int32_t  pan_scan_rect_left_offset[3];
+    int32_t  pan_scan_rect_right_offset[3];
+    int32_t  pan_scan_rect_top_offset[3];
+    int32_t  pan_scan_rect_bottom_offset[3];
+    uint16_t pan_scan_rect_repetition_period;
+} H264RawSEIPanScanRect;
+
 typedef struct H264RawSEIUserDataRegistered {
     uint8_t itu_t_t35_country_code;
     uint8_t itu_t_t35_country_code_extension_byte;
@@ -295,17 +306,28 @@ typedef struct H264RawSEIDisplayOrientation {
     uint8_t display_orientation_extension_flag;
 } H264RawSEIDisplayOrientation;
 
+typedef struct H264RawSEIMasteringDisplayColourVolume {
+    uint16_t display_primaries_x[3];
+    uint16_t display_primaries_y[3];
+    uint16_t white_point_x;
+    uint16_t white_point_y;
+    uint32_t max_display_mastering_luminance;
+    uint32_t min_display_mastering_luminance;
+} H264RawSEIMasteringDisplayColourVolume;
+
 typedef struct H264RawSEIPayload {
     uint32_t payload_type;
     uint32_t payload_size;
     union {
         H264RawSEIBufferingPeriod buffering_period;
         H264RawSEIPicTiming pic_timing;
+        H264RawSEIPanScanRect pan_scan_rect;
         // H264RawSEIFiller filler -> no fields.
         H264RawSEIUserDataRegistered user_data_registered;
         H264RawSEIUserDataUnregistered user_data_unregistered;
         H264RawSEIRecoveryPoint recovery_point;
         H264RawSEIDisplayOrientation display_orientation;
+        H264RawSEIMasteringDisplayColourVolume mastering_display_colour_volume;
         struct {
             uint8_t *data;
             size_t data_length;
@@ -421,6 +443,8 @@ typedef struct CodedBitstreamH264Context {
 
     // All currently available parameter sets.  These are updated when
     // any parameter set NAL unit is read/written with this context.
+    AVBufferRef *sps_ref[H264_MAX_SPS_COUNT];
+    AVBufferRef *pps_ref[H264_MAX_PPS_COUNT];
     H264RawSPS *sps[H264_MAX_SPS_COUNT];
     H264RawPPS *pps[H264_MAX_PPS_COUNT];
 
diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
index d6131a1..e74f8dc 100644
--- a/libavcodec/cbs_h2645.c
+++ b/libavcodec/cbs_h2645.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,10 +29,12 @@
 #include "h264_sei.h"
 #include "h2645_parse.h"
 #include "hevc.h"
+#include "hevc_sei.h"
 
 
-static int cbs_read_ue_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
-                              const char *name, uint32_t *write_to,
+static int cbs_read_ue_golomb(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                              const char *name, const int *subscripts,
+                              uint32_t *write_to,
                               uint32_t range_min, uint32_t range_max)
 {
     uint32_t value;
@@ -40,15 +42,15 @@ static int cbs_read_ue_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
     unsigned int k;
     char bits[65];
 
-    position = bitstream_tell(bc);
+    position = get_bits_count(gbc);
 
     for (i = 0; i < 32; i++) {
-        if (bitstream_bits_left(bc) < i + 1) {
+        if (get_bits_left(gbc) < i + 1) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid ue-golomb code at "
                    "%s: bitstream ended.\n", name);
             return AVERROR_INVALIDDATA;
         }
-        k = bitstream_read_bit(bc);
+        k = get_bits1(gbc);
         bits[i] = k ? '1' : '0';
         if (k)
             break;
@@ -60,7 +62,7 @@ static int cbs_read_ue_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
     }
     value = 1;
     for (j = 0; j < i; j++) {
-        k = bitstream_read_bit(bc);
+        k = get_bits1(gbc);
         bits[i + j + 1] = k ? '1' : '0';
         value = value << 1 | k;
     }
@@ -68,7 +70,8 @@ static int cbs_read_ue_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
     --value;
 
     if (ctx->trace_enable)
-        ff_cbs_trace_syntax_element(ctx, position, name, bits, value);
+        ff_cbs_trace_syntax_element(ctx, position, name, subscripts,
+                                    bits, value);
 
     if (value < range_min || value > range_max) {
         av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
@@ -81,8 +84,9 @@ static int cbs_read_ue_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
     return 0;
 }
 
-static int cbs_read_se_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
-                              const char *name, int32_t *write_to,
+static int cbs_read_se_golomb(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                              const char *name, const int *subscripts,
+                              int32_t *write_to,
                               int32_t range_min, int32_t range_max)
 {
     int32_t value;
@@ -91,15 +95,15 @@ static int cbs_read_se_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
     uint32_t v;
     char bits[65];
 
-    position = bitstream_tell(bc);
+    position = get_bits_count(gbc);
 
     for (i = 0; i < 32; i++) {
-        if (bitstream_bits_left(bc) < i + 1) {
+        if (get_bits_left(gbc) < i + 1) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid se-golomb code at "
                    "%s: bitstream ended.\n", name);
             return AVERROR_INVALIDDATA;
         }
-        k = bitstream_read_bit(bc);
+        k = get_bits1(gbc);
         bits[i] = k ? '1' : '0';
         if (k)
             break;
@@ -111,7 +115,7 @@ static int cbs_read_se_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
     }
     v = 1;
     for (j = 0; j < i; j++) {
-        k = bitstream_read_bit(bc);
+        k = get_bits1(gbc);
         bits[i + j + 1] = k ? '1' : '0';
         v = v << 1 | k;
     }
@@ -122,7 +126,8 @@ static int cbs_read_se_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
         value = v / 2;
 
     if (ctx->trace_enable)
-        ff_cbs_trace_syntax_element(ctx, position, name, bits, value);
+        ff_cbs_trace_syntax_element(ctx, position, name, subscripts,
+                                    bits, value);
 
     if (value < range_min || value > range_max) {
         av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
@@ -136,7 +141,8 @@ static int cbs_read_se_golomb(CodedBitstreamContext *ctx, BitstreamContext *bc,
 }
 
 static int cbs_write_ue_golomb(CodedBitstreamContext *ctx, PutBitContext *pbc,
-                               const char *name, uint32_t value,
+                               const char *name, const int *subscripts,
+                               uint32_t value,
                                uint32_t range_min, uint32_t range_max)
 {
     int len;
@@ -164,7 +170,8 @@ static int cbs_write_ue_golomb(CodedBitstreamContext *ctx, PutBitContext *pbc,
             bits[len + i + 1] = (value + 1) >> (len - i - 1) & 1 ? '1' : '0';
         bits[len + len + 1] = 0;
 
-        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc), name, bits, value);
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, subscripts, bits, value);
     }
 
     put_bits(pbc, len, 0);
@@ -177,7 +184,8 @@ static int cbs_write_ue_golomb(CodedBitstreamContext *ctx, PutBitContext *pbc,
 }
 
 static int cbs_write_se_golomb(CodedBitstreamContext *ctx, PutBitContext *pbc,
-                               const char *name, int32_t value,
+                               const char *name, const int *subscripts,
+                               int32_t value,
                                int32_t range_min, int32_t range_max)
 {
     int len;
@@ -213,7 +221,8 @@ static int cbs_write_se_golomb(CodedBitstreamContext *ctx, PutBitContext *pbc,
             bits[len + i + 1] = (uvalue + 1) >> (len - i - 1) & 1 ? '1' : '0';
         bits[len + len + 1] = 0;
 
-        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc), name, bits, value);
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, subscripts, bits, value);
     }
 
     put_bits(pbc, len, 0);
@@ -239,59 +248,79 @@ static int cbs_write_se_golomb(CodedBitstreamContext *ctx, PutBitContext *pbc,
 #define FUNC_H264(rw, name) FUNC_NAME(rw, h264, name)
 #define FUNC_H265(rw, name) FUNC_NAME(rw, h265, name)
 
+#define SUBSCRIPTS(subs, ...) (subs > 0 ? ((int[subs + 1]){ subs, __VA_ARGS__ }) : NULL)
+
+#define u(width, name, range_min, range_max) \
+        xu(width, name, current->name, range_min, range_max, 0)
+#define flag(name) u(1, name, 0, 1)
+#define ue(name, range_min, range_max) \
+        xue(name, current->name, range_min, range_max, 0)
+#define se(name, range_min, range_max) \
+        xse(name, current->name, range_min, range_max, 0)
+
+#define us(width, name, range_min, range_max, subs, ...) \
+        xu(width, name, current->name, range_min, range_max, subs, __VA_ARGS__)
+#define flags(name, subs, ...) \
+        xu(1, name, current->name, 0, 1, subs, __VA_ARGS__)
+#define ues(name, range_min, range_max, subs, ...) \
+        xue(name, current->name, range_min, range_max, subs, __VA_ARGS__)
+#define ses(name, range_min, range_max, subs, ...) \
+        xse(name, current->name, range_min, range_max, subs, __VA_ARGS__)
+
+#define fixed(width, name, value) do { \
+        av_unused uint32_t fixed_value = value; \
+        xu(width, name, fixed_value, value, value, 0); \
+    } while (0)
+
 
 #define READ
 #define READWRITE read
-#define RWContext BitstreamContext
+#define RWContext GetBitContext
 
-#define xu(width, name, var, range_min, range_max) do { \
+#define xu(width, name, var, range_min, range_max, subs, ...) do { \
         uint32_t value = range_min; \
         CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \
+                                   SUBSCRIPTS(subs, __VA_ARGS__), \
                                    &value, range_min, range_max)); \
         var = value; \
     } while (0)
-#define xue(name, var, range_min, range_max) do { \
+#define xue(name, var, range_min, range_max, subs, ...) do { \
         uint32_t value = range_min; \
         CHECK(cbs_read_ue_golomb(ctx, rw, #name, \
+                                 SUBSCRIPTS(subs, __VA_ARGS__), \
                                  &value, range_min, range_max)); \
         var = value; \
     } while (0)
-#define xse(name, var, range_min, range_max) do { \
+#define xse(name, var, range_min, range_max, subs, ...) do { \
         int32_t value = range_min; \
         CHECK(cbs_read_se_golomb(ctx, rw, #name, \
+                                 SUBSCRIPTS(subs, __VA_ARGS__), \
                                  &value, range_min, range_max)); \
         var = value; \
     } while (0)
 
 
-#define u(width, name, range_min, range_max) \
-        xu(width, name, current->name, range_min, range_max)
-#define flag(name) u(1, name, 0, 1)
-#define ue(name, range_min, range_max) \
-        xue(name, current->name, range_min, range_max)
-#define se(name, range_min, range_max) \
-        xse(name, current->name, range_min, range_max)
-
 #define infer(name, value) do { \
         current->name = value; \
     } while (0)
 
-static int cbs_h2645_read_more_rbsp_data(BitstreamContext *bc)
+static int cbs_h2645_read_more_rbsp_data(GetBitContext *gbc)
 {
-    int bits_left = bitstream_bits_left(bc);
+    int bits_left = get_bits_left(gbc);
     if (bits_left > 8)
         return 1;
-    if (bitstream_peek(bc, bits_left) == 1 << (bits_left - 1))
+    if (show_bits(gbc, bits_left) == 1 << (bits_left - 1))
         return 0;
     return 1;
 }
 
 #define more_rbsp_data(var) ((var) = cbs_h2645_read_more_rbsp_data(rw))
 
-#define byte_alignment(rw) (bitstream_tell(rw) % 8)
+#define byte_alignment(rw) (get_bits_count(rw) % 8)
 
 #define allocate(name, size) do { \
-        name ## _ref = av_buffer_allocz(size); \
+        name ## _ref = av_buffer_allocz(size + \
+                                        AV_INPUT_BUFFER_PADDING_SIZE); \
         if (!name ## _ref) \
             return AVERROR(ENOMEM); \
         name = name ## _ref->data; \
@@ -311,10 +340,6 @@ static int cbs_h2645_read_more_rbsp_data(BitstreamContext *bc)
 #undef xu
 #undef xue
 #undef xse
-#undef u
-#undef flag
-#undef ue
-#undef se
 #undef infer
 #undef more_rbsp_data
 #undef byte_alignment
@@ -325,30 +350,25 @@ static int cbs_h2645_read_more_rbsp_data(BitstreamContext *bc)
 #define READWRITE write
 #define RWContext PutBitContext
 
-#define xu(width, name, var, range_min, range_max) do { \
+#define xu(width, name, var, range_min, range_max, subs, ...) do { \
         uint32_t value = var; \
         CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \
+                                    SUBSCRIPTS(subs, __VA_ARGS__), \
                                     value, range_min, range_max)); \
     } while (0)
-#define xue(name, var, range_min, range_max) do { \
+#define xue(name, var, range_min, range_max, subs, ...) do { \
         uint32_t value = var; \
         CHECK(cbs_write_ue_golomb(ctx, rw, #name, \
+                                  SUBSCRIPTS(subs, __VA_ARGS__), \
                                   value, range_min, range_max)); \
     } while (0)
-#define xse(name, var, range_min, range_max) do { \
+#define xse(name, var, range_min, range_max, subs, ...) do { \
         int32_t value = var; \
         CHECK(cbs_write_se_golomb(ctx, rw, #name, \
+                                  SUBSCRIPTS(subs, __VA_ARGS__), \
                                   value, range_min, range_max)); \
     } while (0)
 
-#define u(width, name, range_min, range_max) \
-        xu(width, name, current->name, range_min, range_max)
-#define flag(name) u(1, name, 0, 1)
-#define ue(name, range_min, range_max) \
-        xue(name, current->name, range_min, range_max)
-#define se(name, range_min, range_max) \
-        xse(name, current->name, range_min, range_max)
-
 #define infer(name, value) do { \
         if (current->name != (value)) { \
             av_log(ctx->log_ctx, AV_LOG_WARNING, "Warning: " \
@@ -406,8 +426,10 @@ static void cbs_h264_free_sei_payload(H264RawSEIPayload *payload)
     switch (payload->payload_type) {
     case H264_SEI_TYPE_BUFFERING_PERIOD:
     case H264_SEI_TYPE_PIC_TIMING:
+    case H264_SEI_TYPE_PAN_SCAN_RECT:
     case H264_SEI_TYPE_RECOVERY_POINT:
     case H264_SEI_TYPE_DISPLAY_ORIENTATION:
+    case H264_SEI_TYPE_MASTERING_DISPLAY_COLOUR_VOLUME:
         break;
     case H264_SEI_TYPE_USER_DATA_REGISTERED:
         av_buffer_unref(&payload->payload.user_data_registered.data_ref);
@@ -465,6 +487,42 @@ static void cbs_h265_free_slice(void *unit, uint8_t *content)
     av_freep(&content);
 }
 
+static void cbs_h265_free_sei_payload(H265RawSEIPayload *payload)
+{
+    switch (payload->payload_type) {
+    case HEVC_SEI_TYPE_BUFFERING_PERIOD:
+    case HEVC_SEI_TYPE_PICTURE_TIMING:
+    case HEVC_SEI_TYPE_PAN_SCAN_RECT:
+    case HEVC_SEI_TYPE_RECOVERY_POINT:
+    case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
+    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
+    case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
+    case HEVC_SEI_TYPE_TIME_CODE:
+    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
+    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
+    case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
+        break;
+    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
+        av_buffer_unref(&payload->payload.user_data_registered.data_ref);
+        break;
+    case HEVC_SEI_TYPE_USER_DATA_UNREGISTERED:
+        av_buffer_unref(&payload->payload.user_data_unregistered.data_ref);
+        break;
+    default:
+        av_buffer_unref(&payload->payload.other.data_ref);
+        break;
+    }
+}
+
+static void cbs_h265_free_sei(void *unit, uint8_t *content)
+{
+    H265RawSEI *sei = (H265RawSEI*)content;
+    int i;
+    for (i = 0; i < sei->payload_count; i++)
+        cbs_h265_free_sei_payload(&sei->payload[i]);
+    av_freep(&content);
+}
+
 static int cbs_h2645_fragment_add_nals(CodedBitstreamContext *ctx,
                                        CodedBitstreamFragment *frag,
                                        const H2645Packet *packet)
@@ -473,26 +531,21 @@ static int cbs_h2645_fragment_add_nals(CodedBitstreamContext *ctx,
 
     for (i = 0; i < packet->nb_nals; i++) {
         const H2645NAL *nal = &packet->nals[i];
+        AVBufferRef *ref;
         size_t size = nal->size;
-        uint8_t *data;
 
         // Remove trailing zeroes.
         while (size > 0 && nal->data[size - 1] == 0)
             --size;
         av_assert0(size > 0);
 
-        data = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!data)
-            return AVERROR(ENOMEM);
-        memcpy(data, nal->data, size);
-        memset(data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        ref = (nal->data == nal->raw_data) ? frag->data_ref
+                                           : packet->rbsp.rbsp_buffer_ref;
 
         err = ff_cbs_insert_unit_data(ctx, frag, -1, nal->type,
-                                      data, nal->size, NULL);
-        if (err < 0) {
-            av_freep(&data);
+                            (uint8_t*)nal->data, size, ref);
+        if (err < 0)
             return err;
-        }
     }
 
     return 0;
@@ -548,7 +601,7 @@ static int cbs_h2645_split_fragment(CodedBitstreamContext *ctx,
 
         err = ff_h2645_packet_split(&priv->read_packet,
                                     frag->data + start, end - start,
-                                    ctx->log_ctx, 1, 2, AV_CODEC_ID_H264);
+                                    ctx->log_ctx, 1, 2, AV_CODEC_ID_H264, 1, 1);
         if (err < 0) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to split AVCC SPS array.\n");
             return err;
@@ -572,7 +625,7 @@ static int cbs_h2645_split_fragment(CodedBitstreamContext *ctx,
 
         err = ff_h2645_packet_split(&priv->read_packet,
                                     frag->data + start, end - start,
-                                    ctx->log_ctx, 1, 2, AV_CODEC_ID_H264);
+                                    ctx->log_ctx, 1, 2, AV_CODEC_ID_H264, 1, 1);
         if (err < 0) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to split AVCC PPS array.\n");
             return err;
@@ -626,7 +679,7 @@ static int cbs_h2645_split_fragment(CodedBitstreamContext *ctx,
 
             err = ff_h2645_packet_split(&priv->read_packet,
                                         frag->data + start, end - start,
-                                        ctx->log_ctx, 1, 2, AV_CODEC_ID_HEVC);
+                                        ctx->log_ctx, 1, 2, AV_CODEC_ID_HEVC, 1, 1);
             if (err < 0) {
                 av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to split "
                        "HVCC array %d (%d NAL units of type %d).\n",
@@ -645,7 +698,7 @@ static int cbs_h2645_split_fragment(CodedBitstreamContext *ctx,
                                     frag->data, frag->data_size,
                                     ctx->log_ctx,
                                     priv->mp4, priv->nal_length_size,
-                                    codec_id);
+                                    codec_id, 1, 1);
         if (err < 0)
             return err;
 
@@ -659,9 +712,10 @@ static int cbs_h2645_split_fragment(CodedBitstreamContext *ctx,
 
 #define cbs_h2645_replace_ps(h26n, ps_name, ps_var, id_element) \
 static int cbs_h26 ## h26n ## _replace_ ## ps_var(CodedBitstreamContext *ctx, \
-                                                  const H26 ## h26n ## Raw ## ps_name *ps_var)  \
+                                                  CodedBitstreamUnit *unit)  \
 { \
     CodedBitstreamH26 ## h26n ## Context *priv = ctx->priv_data; \
+    H26 ## h26n ## Raw ## ps_name *ps_var = unit->content; \
     unsigned int id = ps_var->id_element; \
     if (id > FF_ARRAY_ELEMS(priv->ps_var)) { \
         av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid " #ps_name \
@@ -670,11 +724,16 @@ static int cbs_h26 ## h26n ## _replace_ ## ps_var(CodedBitstreamContext *ctx, \
     } \
     if (priv->ps_var[id] == priv->active_ ## ps_var) \
         priv->active_ ## ps_var = NULL ; \
-    av_freep(&priv->ps_var[id]); \
-    priv->ps_var[id] = av_malloc(sizeof(*ps_var)); \
-    if (!priv->ps_var[id]) \
+    av_buffer_unref(&priv->ps_var ## _ref[id]); \
+    if (unit->content_ref) \
+        priv->ps_var ## _ref[id] = av_buffer_ref(unit->content_ref); \
+    else \
+        priv->ps_var ## _ref[id] = av_buffer_alloc(sizeof(*ps_var)); \
+    if (!priv->ps_var ## _ref[id]) \
         return AVERROR(ENOMEM); \
-    memcpy(priv->ps_var[id], ps_var, sizeof(*ps_var)); \
+    priv->ps_var[id] = (H26 ## h26n ## Raw ## ps_name *)priv->ps_var ## _ref[id]->data; \
+    if (!unit->content_ref) \
+        memcpy(priv->ps_var[id], ps_var, sizeof(*ps_var)); \
     return 0; \
 }
 
@@ -687,10 +746,10 @@ cbs_h2645_replace_ps(5, PPS, pps, pps_pic_parameter_set_id)
 static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
                                   CodedBitstreamUnit *unit)
 {
-    BitstreamContext bc;
+    GetBitContext gbc;
     int err;
 
-    err = bitstream_init(&bc, unit->data, 8 * unit->data_size);
+    err = init_get_bits(&gbc, unit->data, 8 * unit->data_size);
     if (err < 0)
         return err;
 
@@ -704,11 +763,11 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             sps = unit->content;
 
-            err = cbs_h264_read_sps(ctx, &bc, sps);
+            err = cbs_h264_read_sps(ctx, &gbc, sps);
             if (err < 0)
                 return err;
 
-            err = cbs_h264_replace_sps(ctx, sps);
+            err = cbs_h264_replace_sps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -722,7 +781,7 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h264_read_sps_extension(ctx, &bc, unit->content);
+            err = cbs_h264_read_sps_extension(ctx, &gbc, unit->content);
             if (err < 0)
                 return err;
         }
@@ -738,11 +797,11 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             pps = unit->content;
 
-            err = cbs_h264_read_pps(ctx, &bc, pps);
+            err = cbs_h264_read_pps(ctx, &gbc, pps);
             if (err < 0)
                 return err;
 
-            err = cbs_h264_replace_pps(ctx, pps);
+            err = cbs_h264_replace_pps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -761,11 +820,11 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             slice = unit->content;
 
-            err = cbs_h264_read_slice_header(ctx, &bc, &slice->header);
+            err = cbs_h264_read_slice_header(ctx, &gbc, &slice->header);
             if (err < 0)
                 return err;
 
-            pos = bitstream_tell(&bc);
+            pos = get_bits_count(&gbc);
             len = unit->data_size;
             if (!unit->data[len - 1]) {
                 int z;
@@ -776,15 +835,10 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
             }
 
             slice->data_size = len - pos / 8;
-            slice->data_ref  = av_buffer_alloc(slice->data_size +
-                                               AV_INPUT_BUFFER_PADDING_SIZE);
+            slice->data_ref  = av_buffer_ref(unit->data_ref);
             if (!slice->data_ref)
                 return AVERROR(ENOMEM);
-            slice->data = slice->data_ref->data;
-            memcpy(slice->data,
-                   unit->data + pos / 8, slice->data_size);
-            memset(slice->data + slice->data_size, 0,
-                   AV_INPUT_BUFFER_PADDING_SIZE);
+            slice->data = unit->data + pos / 8;
             slice->data_bit_start = pos % 8;
         }
         break;
@@ -796,7 +850,7 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h264_read_aud(ctx, &bc, unit->content);
+            err = cbs_h264_read_aud(ctx, &gbc, unit->content);
             if (err < 0)
                 return err;
         }
@@ -809,7 +863,7 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h264_read_sei(ctx, &bc, unit->content);
+            err = cbs_h264_read_sei(ctx, &gbc, unit->content);
             if (err < 0)
                 return err;
         }
@@ -822,7 +876,24 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h264_read_filler(ctx, &bc, unit->content);
+            err = cbs_h264_read_filler(ctx, &gbc, unit->content);
+            if (err < 0)
+                return err;
+        }
+        break;
+
+    case H264_NAL_END_SEQUENCE:
+    case H264_NAL_END_STREAM:
+        {
+            err = ff_cbs_alloc_unit_content(ctx, unit,
+                                            sizeof(H264RawNALUnitHeader),
+                                            NULL);
+            if (err < 0)
+                return err;
+
+            err = (unit->type == H264_NAL_END_SEQUENCE ?
+                   cbs_h264_read_end_of_sequence :
+                   cbs_h264_read_end_of_stream)(ctx, &gbc, unit->content);
             if (err < 0)
                 return err;
         }
@@ -838,10 +909,10 @@ static int cbs_h264_read_nal_unit(CodedBitstreamContext *ctx,
 static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
                                   CodedBitstreamUnit *unit)
 {
-    BitstreamContext bc;
+    GetBitContext gbc;
     int err;
 
-    err = bitstream_init(&bc, unit->data, 8 * unit->data_size);
+    err = init_get_bits(&gbc, unit->data, 8 * unit->data_size);
     if (err < 0)
         return err;
 
@@ -856,11 +927,11 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             vps = unit->content;
 
-            err = cbs_h265_read_vps(ctx, &bc, vps);
+            err = cbs_h265_read_vps(ctx, &gbc, vps);
             if (err < 0)
                 return err;
 
-            err = cbs_h265_replace_vps(ctx, vps);
+            err = cbs_h265_replace_vps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -875,11 +946,11 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             sps = unit->content;
 
-            err = cbs_h265_read_sps(ctx, &bc, sps);
+            err = cbs_h265_read_sps(ctx, &gbc, sps);
             if (err < 0)
                 return err;
 
-            err = cbs_h265_replace_sps(ctx, sps);
+            err = cbs_h265_replace_sps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -895,11 +966,11 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             pps = unit->content;
 
-            err = cbs_h265_read_pps(ctx, &bc, pps);
+            err = cbs_h265_read_pps(ctx, &gbc, pps);
             if (err < 0)
                 return err;
 
-            err = cbs_h265_replace_pps(ctx, pps);
+            err = cbs_h265_replace_pps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -931,11 +1002,11 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
                 return err;
             slice = unit->content;
 
-            err = cbs_h265_read_slice_segment_header(ctx, &bc, &slice->header);
+            err = cbs_h265_read_slice_segment_header(ctx, &gbc, &slice->header);
             if (err < 0)
                 return err;
 
-            pos = bitstream_tell(&bc);
+            pos = get_bits_count(&gbc);
             len = unit->data_size;
             if (!unit->data[len - 1]) {
                 int z;
@@ -946,15 +1017,10 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
             }
 
             slice->data_size = len - pos / 8;
-            slice->data_ref  = av_buffer_alloc(slice->data_size +
-                                               AV_INPUT_BUFFER_PADDING_SIZE);
+            slice->data_ref  = av_buffer_ref(unit->data_ref);
             if (!slice->data_ref)
                 return AVERROR(ENOMEM);
-            slice->data = slice->data_ref->data;
-            memcpy(slice->data,
-                   unit->data + pos / 8, slice->data_size);
-            memset(slice->data + slice->data_size, 0,
-                   AV_INPUT_BUFFER_PADDING_SIZE);
+            slice->data = unit->data + pos / 8;
             slice->data_bit_start = pos % 8;
         }
         break;
@@ -966,7 +1032,24 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h265_read_aud(ctx, &bc, unit->content);
+            err = cbs_h265_read_aud(ctx, &gbc, unit->content);
+            if (err < 0)
+                return err;
+        }
+        break;
+
+    case HEVC_NAL_SEI_PREFIX:
+    case HEVC_NAL_SEI_SUFFIX:
+        {
+            err = ff_cbs_alloc_unit_content(ctx, unit, sizeof(H265RawSEI),
+                                            &cbs_h265_free_sei);
+
+            if (err < 0)
+                return err;
+
+            err = cbs_h265_read_sei(ctx, &gbc, unit->content,
+                                    unit->type == HEVC_NAL_SEI_PREFIX);
+
             if (err < 0)
                 return err;
         }
@@ -979,6 +1062,64 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
     return 0;
 }
 
+static int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx,
+                                      PutBitContext *pbc, const uint8_t *data,
+                                      size_t data_size, int data_bit_start)
+{
+    size_t rest  = data_size - (data_bit_start + 7) / 8;
+    const uint8_t *pos = data + data_bit_start / 8;
+
+    av_assert0(data_bit_start >= 0 &&
+               8 * data_size > data_bit_start);
+
+    if (data_size * 8 + 8 > put_bits_left(pbc))
+        return AVERROR(ENOSPC);
+
+    if (!rest)
+        goto rbsp_stop_one_bit;
+
+    // First copy the remaining bits of the first byte
+    // The above check ensures that we do not accidentally
+    // copy beyond the rbsp_stop_one_bit.
+    if (data_bit_start % 8)
+        put_bits(pbc, 8 - data_bit_start % 8,
+                 *pos++ & MAX_UINT_BITS(8 - data_bit_start % 8));
+
+    if (put_bits_count(pbc) % 8 == 0) {
+        // If the writer is aligned at this point,
+        // memcpy can be used to improve performance.
+        // This happens normally for CABAC.
+        flush_put_bits(pbc);
+        memcpy(put_bits_ptr(pbc), pos, rest);
+        skip_put_bytes(pbc, rest);
+    } else {
+        // If not, we have to copy manually.
+        // rbsp_stop_one_bit forces us to special-case
+        // the last byte.
+        uint8_t temp;
+        int i;
+
+        for (; rest > 4; rest -= 4, pos += 4)
+            put_bits32(pbc, AV_RB32(pos));
+
+        for (; rest > 1; rest--, pos++)
+            put_bits(pbc, 8, *pos);
+
+    rbsp_stop_one_bit:
+        temp = rest ? *pos : *pos & MAX_UINT_BITS(8 - data_bit_start % 8);
+
+        av_assert0(temp);
+        i = ff_ctz(*pos);
+        temp = temp >> i;
+        i = rest ? (8 - i) : (8 - i - data_bit_start % 8);
+        put_bits(pbc, i, temp);
+        if (put_bits_count(pbc) % 8)
+            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0);
+    }
+
+    return 0;
+}
+
 static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
                                    CodedBitstreamUnit *unit,
                                    PutBitContext *pbc)
@@ -994,7 +1135,7 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h264_replace_sps(ctx, sps);
+            err = cbs_h264_replace_sps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -1018,7 +1159,7 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h264_replace_pps(ctx, pps);
+            err = cbs_h264_replace_pps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -1029,37 +1170,17 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
     case H264_NAL_AUXILIARY_SLICE:
         {
             H264RawSlice *slice = unit->content;
-            BitstreamContext bc;
-            int bits_left, end, zeroes;
 
             err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
             if (err < 0)
                 return err;
 
             if (slice->data) {
-                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
-                    return AVERROR(ENOSPC);
-
-                bitstream_init(&bc, slice->data, slice->data_size * 8);
-                bitstream_skip(&bc, slice->data_bit_start);
-
-                // Copy in two-byte blocks, but stop before copying the
-                // rbsp_stop_one_bit in the final byte.
-                while (bitstream_bits_left(&bc) > 23)
-                    put_bits(pbc, 16, bitstream_read(&bc, 16));
-
-                bits_left = bitstream_bits_left(&bc);
-                end = bitstream_read(&bc, bits_left);
-
-                // rbsp_stop_one_bit must be present here.
-                av_assert0(end);
-                zeroes = ff_ctz(end);
-                if (bits_left > zeroes + 1)
-                    put_bits(pbc, bits_left - zeroes - 1,
-                             end >> (zeroes + 1));
-                put_bits(pbc, 1, 1);
-                while (put_bits_count(pbc) % 8 != 0)
-                    put_bits(pbc, 1, 0);
+                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
+                                                 slice->data_size,
+                                                 slice->data_bit_start);
+                if (err < 0)
+                    return err;
             } else {
                 // No slice data - that was just the header.
                 // (Bitstream may be unaligned!)
@@ -1091,6 +1212,22 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
         }
         break;
 
+    case H264_NAL_END_SEQUENCE:
+        {
+            err = cbs_h264_write_end_of_sequence(ctx, pbc, unit->content);
+            if (err < 0)
+                return err;
+        }
+        break;
+
+    case H264_NAL_END_STREAM:
+        {
+            err = cbs_h264_write_end_of_stream(ctx, pbc, unit->content);
+            if (err < 0)
+                return err;
+        }
+        break;
+
     default:
         av_log(ctx->log_ctx, AV_LOG_ERROR, "Write unimplemented for "
                "NAL unit type %"PRIu32".\n", unit->type);
@@ -1115,7 +1252,7 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h265_replace_vps(ctx, vps);
+            err = cbs_h265_replace_vps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -1129,7 +1266,7 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h265_replace_sps(ctx, sps);
+            err = cbs_h265_replace_sps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -1143,7 +1280,7 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
             if (err < 0)
                 return err;
 
-            err = cbs_h265_replace_pps(ctx, pps);
+            err = cbs_h265_replace_pps(ctx, unit);
             if (err < 0)
                 return err;
         }
@@ -1167,37 +1304,17 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
     case HEVC_NAL_CRA_NUT:
         {
             H265RawSlice *slice = unit->content;
-            BitstreamContext bc;
-            int bits_left, end, zeroes;
 
             err = cbs_h265_write_slice_segment_header(ctx, pbc, &slice->header);
             if (err < 0)
                 return err;
 
             if (slice->data) {
-                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
-                    return AVERROR(ENOSPC);
-
-                bitstream_init(&bc, slice->data, slice->data_size * 8);
-                bitstream_skip(&bc, slice->data_bit_start);
-
-                // Copy in two-byte blocks, but stop before copying the
-                // rbsp_stop_one_bit in the final byte.
-                while (bitstream_bits_left(&bc) > 23)
-                    put_bits(pbc, 16, bitstream_read(&bc, 16));
-
-                bits_left = bitstream_bits_left(&bc);
-                end = bitstream_read(&bc, bits_left);
-
-                // rbsp_stop_one_bit must be present here.
-                av_assert0(end);
-                zeroes = ff_ctz(end);
-                if (bits_left > zeroes + 1)
-                    put_bits(pbc, bits_left - zeroes - 1,
-                             end >> (zeroes + 1));
-                put_bits(pbc, 1, 1);
-                while (put_bits_count(pbc) % 8 != 0)
-                    put_bits(pbc, 1, 0);
+                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
+                                                 slice->data_size,
+                                                 slice->data_bit_start);
+                if (err < 0)
+                    return err;
             } else {
                 // No slice data - that was just the header.
             }
@@ -1212,6 +1329,17 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
         }
         break;
 
+    case HEVC_NAL_SEI_PREFIX:
+    case HEVC_NAL_SEI_SUFFIX:
+        {
+            err = cbs_h265_write_sei(ctx, pbc, unit->content,
+                                     unit->type == HEVC_NAL_SEI_PREFIX);
+
+            if (err < 0)
+                return err;
+        }
+        break;
+
     default:
         av_log(ctx->log_ctx, AV_LOG_ERROR, "Write unimplemented for "
                "NAL unit type %"PRIu32".\n", unit->type);
@@ -1238,7 +1366,7 @@ static int cbs_h2645_write_nal_unit(CodedBitstreamContext *ctx,
         if (err < 0) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Unable to allocate a "
                    "sufficiently large write buffer (last attempt "
-                   "%zu bytes).\n", priv->write_buffer_size);
+                   "%"SIZE_SPECIFIER" bytes).\n", priv->write_buffer_size);
             return err;
         }
     }
@@ -1298,7 +1426,7 @@ static int cbs_h2645_assemble_fragment(CodedBitstreamContext *ctx,
         max_size += 3 + frag->units[i].data_size * 3 / 2;
     }
 
-    data = av_malloc(max_size);
+    data = av_malloc(max_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!data)
         return AVERROR(ENOMEM);
 
@@ -1349,11 +1477,13 @@ static int cbs_h2645_assemble_fragment(CodedBitstreamContext *ctx,
     }
 
     av_assert0(dp <= max_size);
-    err = av_reallocp(&data, dp);
+    err = av_reallocp(&data, dp + AV_INPUT_BUFFER_PADDING_SIZE);
     if (err)
         return err;
+    memset(data + dp, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
-    frag->data_ref = av_buffer_create(data, dp, NULL, NULL, 0);
+    frag->data_ref = av_buffer_create(data, dp + AV_INPUT_BUFFER_PADDING_SIZE,
+                                      NULL, NULL, 0);
     if (!frag->data_ref) {
         av_freep(&data);
         return AVERROR(ENOMEM);
@@ -1375,9 +1505,9 @@ static void cbs_h264_close(CodedBitstreamContext *ctx)
     av_freep(&h264->common.write_buffer);
 
     for (i = 0; i < FF_ARRAY_ELEMS(h264->sps); i++)
-        av_freep(&h264->sps[i]);
+        av_buffer_unref(&h264->sps_ref[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(h264->pps); i++)
-        av_freep(&h264->pps[i]);
+        av_buffer_unref(&h264->pps_ref[i]);
 }
 
 static void cbs_h265_close(CodedBitstreamContext *ctx)
@@ -1390,11 +1520,11 @@ static void cbs_h265_close(CodedBitstreamContext *ctx)
     av_freep(&h265->common.write_buffer);
 
     for (i = 0; i < FF_ARRAY_ELEMS(h265->vps); i++)
-        av_freep(&h265->vps[i]);
+        av_buffer_unref(&h265->vps_ref[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(h265->sps); i++)
-        av_freep(&h265->sps[i]);
+        av_buffer_unref(&h265->sps_ref[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(h265->pps); i++)
-        av_freep(&h265->pps[i]);
+        av_buffer_unref(&h265->pps_ref[i]);
 }
 
 const CodedBitstreamType ff_cbs_type_h264 = {
diff --git a/libavcodec/cbs_h2645.h b/libavcodec/cbs_h2645.h
index 750247b..f4cf65b 100644
--- a/libavcodec/cbs_h2645.h
+++ b/libavcodec/cbs_h2645.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cbs_h264_syntax_template.c b/libavcodec/cbs_h264_syntax_template.c
index 1aa7888..4da4c5d 100644
--- a/libavcodec/cbs_h264_syntax_template.c
+++ b/libavcodec/cbs_h264_syntax_template.c
@@ -1,28 +1,28 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 static int FUNC(rbsp_trailing_bits)(CodedBitstreamContext *ctx, RWContext *rw)
 {
     int err;
-    av_unused int one = 1, zero = 0;
-    xu(1, rbsp_stop_one_bit, one, 1, 1);
+
+    fixed(1, rbsp_stop_one_bit, 1);
     while (byte_alignment(rw) != 0)
-        xu(1, rbsp_alignment_zero_bit, zero, 0, 0);
+        fixed(1, rbsp_alignment_zero_bit, 0);
 
     return 0;
 }
@@ -76,7 +76,7 @@ static int FUNC(scaling_list)(CodedBitstreamContext *ctx, RWContext *rw,
 
     scale = 8;
     for (i = 0; i < size_of_scaling_list; i++) {
-        xse(delta_scale, current->delta_scale[i], -128, +127);
+        ses(delta_scale[i], -128, +127, 1, i);
         scale = (scale + current->delta_scale[i] + 256) % 256;
         if (scale == 0)
             break;
@@ -95,9 +95,9 @@ static int FUNC(hrd_parameters)(CodedBitstreamContext *ctx, RWContext *rw,
     u(4, cpb_size_scale, 0, 15);
 
     for (i = 0; i <= current->cpb_cnt_minus1; i++) {
-        ue(bit_rate_value_minus1[i], 0, UINT32_MAX - 1);
-        ue(cpb_size_value_minus1[i], 0, UINT32_MAX - 1);
-        flag(cbr_flag[i]);
+        ues(bit_rate_value_minus1[i], 0, UINT32_MAX - 1, 1, i);
+        ues(cpb_size_value_minus1[i], 0, UINT32_MAX - 1, 1, i);
+        flags(cbr_flag[i], 1, i);
     }
 
     u(5, initial_cpb_removal_delay_length_minus1, 0, 31);
@@ -185,6 +185,8 @@ static int FUNC(vui_parameters)(CodedBitstreamContext *ctx, RWContext *rw,
         flag(motion_vectors_over_pic_boundaries_flag);
         ue(max_bytes_per_pic_denom, 0, 16);
         ue(max_bits_per_mb_denom,   0, 16);
+        // The current version of the standard constrains this to be in
+        // [0,15], but older versions allow 16.
         ue(log2_max_mv_length_horizontal, 0, 16);
         ue(log2_max_mv_length_vertical,   0, 16);
         ue(max_num_reorder_frames,  0, H264_MAX_DPB_FRAMES);
@@ -193,11 +195,11 @@ static int FUNC(vui_parameters)(CodedBitstreamContext *ctx, RWContext *rw,
         infer(motion_vectors_over_pic_boundaries_flag, 1);
         infer(max_bytes_per_pic_denom, 2);
         infer(max_bits_per_mb_denom,   1);
-        infer(log2_max_mv_length_horizontal, 16);
-        infer(log2_max_mv_length_vertical,   16);
+        infer(log2_max_mv_length_horizontal, 15);
+        infer(log2_max_mv_length_vertical,   15);
 
         if ((sps->profile_idc ==  44 || sps->profile_idc ==  86 ||
-             sps->profile_idc == 110 || sps->profile_idc == 110 ||
+             sps->profile_idc == 100 || sps->profile_idc == 110 ||
              sps->profile_idc == 122 || sps->profile_idc == 244) &&
             sps->constraint_set3_flag) {
             infer(max_num_reorder_frames,  0);
@@ -211,6 +213,46 @@ static int FUNC(vui_parameters)(CodedBitstreamContext *ctx, RWContext *rw,
     return 0;
 }
 
+static int FUNC(vui_parameters_default)(CodedBitstreamContext *ctx,
+                                        RWContext *rw, H264RawVUI *current,
+                                        H264RawSPS *sps)
+{
+    infer(aspect_ratio_idc, 0);
+
+    infer(video_format,             5);
+    infer(video_full_range_flag,    0);
+    infer(colour_primaries,         2);
+    infer(transfer_characteristics, 2);
+    infer(matrix_coefficients,      2);
+
+    infer(chroma_sample_loc_type_top_field,    0);
+    infer(chroma_sample_loc_type_bottom_field, 0);
+
+    infer(fixed_frame_rate_flag, 0);
+    infer(low_delay_hrd_flag,    1);
+
+    infer(pic_struct_present_flag, 0);
+
+    infer(motion_vectors_over_pic_boundaries_flag, 1);
+    infer(max_bytes_per_pic_denom, 2);
+    infer(max_bits_per_mb_denom,   1);
+    infer(log2_max_mv_length_horizontal, 15);
+    infer(log2_max_mv_length_vertical,   15);
+
+    if ((sps->profile_idc ==  44 || sps->profile_idc ==  86 ||
+         sps->profile_idc == 100 || sps->profile_idc == 110 ||
+         sps->profile_idc == 122 || sps->profile_idc == 244) &&
+        sps->constraint_set3_flag) {
+        infer(max_num_reorder_frames,  0);
+        infer(max_dec_frame_buffering, 0);
+    } else {
+        infer(max_num_reorder_frames,  H264_MAX_DPB_FRAMES);
+        infer(max_dec_frame_buffering, H264_MAX_DPB_FRAMES);
+    }
+
+    return 0;
+}
+
 static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
                      H264RawSPS *current)
 {
@@ -256,7 +298,7 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
         flag(seq_scaling_matrix_present_flag);
         if (current->seq_scaling_matrix_present_flag) {
             for (i = 0; i < ((current->chroma_format_idc != 3) ? 8 : 12); i++) {
-                flag(seq_scaling_list_present_flag[i]);
+                flags(seq_scaling_list_present_flag[i], 1, i);
                 if (current->seq_scaling_list_present_flag[i]) {
                     if (i < 6)
                         CHECK(FUNC(scaling_list)(ctx, rw,
@@ -289,7 +331,7 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
         ue(num_ref_frames_in_pic_order_cnt_cycle, 0, 255);
 
         for (i = 0; i < current->num_ref_frames_in_pic_order_cnt_cycle; i++)
-            se(offset_for_ref_frame[i], INT32_MIN + 1, INT32_MAX);
+            ses(offset_for_ref_frame[i], INT32_MIN + 1, INT32_MAX, 1, i);
     }
 
     ue(max_num_ref_frames, 0, H264_MAX_DPB_FRAMES);
@@ -315,6 +357,8 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
     flag(vui_parameters_present_flag);
     if (current->vui_parameters_present_flag)
         CHECK(FUNC(vui_parameters)(ctx, rw, &current->vui, current));
+    else
+        CHECK(FUNC(vui_parameters_default)(ctx, rw, &current->vui, current));
 
     CHECK(FUNC(rbsp_trailing_bits)(ctx, rw));
 
@@ -342,8 +386,8 @@ static int FUNC(sps_extension)(CodedBitstreamContext *ctx, RWContext *rw,
         flag(alpha_incr_flag);
 
         bits = current->bit_depth_aux_minus8 + 9;
-        u(bits, alpha_opaque_value,      0, (1 << bits) - 1);
-        u(bits, alpha_transparent_value, 0, (1 << bits) - 1);
+        u(bits, alpha_opaque_value,      0, MAX_UINT_BITS(bits));
+        u(bits, alpha_transparent_value, 0, MAX_UINT_BITS(bits));
     }
 
     flag(additional_extension_flag);
@@ -390,12 +434,13 @@ static int FUNC(pps)(CodedBitstreamContext *ctx, RWContext *rw,
 
         if (current->slice_group_map_type == 0) {
             for (iGroup = 0; iGroup <= current->num_slice_groups_minus1; iGroup++)
-                ue(run_length_minus1[iGroup], 0, pic_size - 1);
+                ues(run_length_minus1[iGroup], 0, pic_size - 1, 1, iGroup);
 
         } else if (current->slice_group_map_type == 2) {
             for (iGroup = 0; iGroup < current->num_slice_groups_minus1; iGroup++) {
-                ue(top_left[iGroup],     0,                         pic_size - 1);
-                ue(bottom_right[iGroup], current->top_left[iGroup], pic_size - 1);
+                ues(top_left[iGroup],       0, pic_size - 1, 1, iGroup);
+                ues(bottom_right[iGroup],
+                    current->top_left[iGroup], pic_size - 1, 1, iGroup);
             }
         } else if (current->slice_group_map_type == 3 ||
                    current->slice_group_map_type == 4 ||
@@ -408,8 +453,8 @@ static int FUNC(pps)(CodedBitstreamContext *ctx, RWContext *rw,
             allocate(current->slice_group_id,
                      current->pic_size_in_map_units_minus1 + 1);
             for (i = 0; i <= current->pic_size_in_map_units_minus1; i++)
-                u(av_log2(2 * current->num_slice_groups_minus1 + 1),
-                  slice_group_id[i], 0, current->num_slice_groups_minus1);
+                us(av_log2(2 * current->num_slice_groups_minus1 + 1),
+                   slice_group_id[i], 0, current->num_slice_groups_minus1, 1, i);
         }
     }
 
@@ -435,7 +480,7 @@ static int FUNC(pps)(CodedBitstreamContext *ctx, RWContext *rw,
         if (current->pic_scaling_matrix_present_flag) {
             for (i = 0; i < 6 + (((sps->chroma_format_idc != 3) ? 2 : 6) *
                                  current->transform_8x8_mode_flag); i++) {
-                flag(pic_scaling_list_present_flag[i]);
+                flags(pic_scaling_list_present_flag[i], 1, i);
                 if (current->pic_scaling_list_present_flag[i]) {
                     if (i < 6)
                         CHECK(FUNC(scaling_list)(ctx, rw,
@@ -468,6 +513,8 @@ static int FUNC(sei_buffering_period)(CodedBitstreamContext *ctx, RWContext *rw,
     const H264RawSPS *sps;
     int err, i, length;
 
+    HEADER("Buffering Period");
+
     ue(seq_parameter_set_id, 0, 31);
 
     sps = h264->sps[current->seq_parameter_set_id];
@@ -483,10 +530,10 @@ static int FUNC(sei_buffering_period)(CodedBitstreamContext *ctx, RWContext *rw,
             length = sps->vui.nal_hrd_parameters.initial_cpb_removal_delay_length_minus1 + 1;
             xu(length, initial_cpb_removal_delay[SchedSelIdx],
                current->nal.initial_cpb_removal_delay[i],
-               0, (1 << (uint64_t)length) - 1);
+               1, MAX_UINT_BITS(length), 1, i);
             xu(length, initial_cpb_removal_delay_offset[SchedSelIdx],
                current->nal.initial_cpb_removal_delay_offset[i],
-               0, (1 << (uint64_t)length) - 1);
+               0, MAX_UINT_BITS(length), 1, i);
         }
     }
 
@@ -495,10 +542,10 @@ static int FUNC(sei_buffering_period)(CodedBitstreamContext *ctx, RWContext *rw,
             length = sps->vui.vcl_hrd_parameters.initial_cpb_removal_delay_length_minus1 + 1;
             xu(length, initial_cpb_removal_delay[SchedSelIdx],
                current->vcl.initial_cpb_removal_delay[i],
-               0, (1 << (uint64_t)length) - 1);
+               1, MAX_UINT_BITS(length), 1, i);
             xu(length, initial_cpb_removal_delay_offset[SchedSelIdx],
                current->vcl.initial_cpb_removal_delay_offset[i],
-               0, (1 << (uint64_t)length) - 1);
+               0, MAX_UINT_BITS(length), 1, i);
         }
     }
 
@@ -506,10 +553,9 @@ static int FUNC(sei_buffering_period)(CodedBitstreamContext *ctx, RWContext *rw,
 }
 
 static int FUNC(sei_pic_timestamp)(CodedBitstreamContext *ctx, RWContext *rw,
-                                   H264RawSEIPicTimestamp *current)
+                                   H264RawSEIPicTimestamp *current,
+                                   const H264RawSPS *sps)
 {
-    CodedBitstreamH264Context *h264 = ctx->priv_data;
-    const H264RawSPS *sps;
     uint8_t time_offset_length;
     int err;
 
@@ -538,7 +584,6 @@ static int FUNC(sei_pic_timestamp)(CodedBitstreamContext *ctx, RWContext *rw,
         }
     }
 
-    sps = h264->active_sps;
     if (sps->vui.nal_hrd_parameters_present_flag)
         time_offset_length = sps->vui.nal_hrd_parameters.time_offset_length;
     else if (sps->vui.vcl_hrd_parameters_present_flag)
@@ -548,7 +593,7 @@ static int FUNC(sei_pic_timestamp)(CodedBitstreamContext *ctx, RWContext *rw,
 
     if (time_offset_length > 0)
         u(time_offset_length, time_offset,
-          0, (1 << (uint64_t)time_offset_length) - 1);
+          0, MAX_UINT_BITS(time_offset_length));
     else
         infer(time_offset, 0);
 
@@ -562,6 +607,8 @@ static int FUNC(sei_pic_timing)(CodedBitstreamContext *ctx, RWContext *rw,
     const H264RawSPS *sps;
     int err;
 
+    HEADER("Picture Timing");
+
     sps = h264->active_sps;
     if (!sps) {
         // If there is exactly one possible SPS but it is not yet active
@@ -600,9 +647,9 @@ static int FUNC(sei_pic_timing)(CodedBitstreamContext *ctx, RWContext *rw,
         }
 
         u(hrd->cpb_removal_delay_length_minus1 + 1, cpb_removal_delay,
-          0, (1 << (uint64_t)hrd->cpb_removal_delay_length_minus1) + 1);
+          0, MAX_UINT_BITS(hrd->cpb_removal_delay_length_minus1 + 1));
         u(hrd->dpb_output_delay_length_minus1 + 1, dpb_output_delay,
-          0, (1 << (uint64_t)hrd->dpb_output_delay_length_minus1) + 1);
+          0, MAX_UINT_BITS(hrd->dpb_output_delay_length_minus1 + 1));
     }
 
     if (sps->vui.pic_struct_present_flag) {
@@ -616,10 +663,37 @@ static int FUNC(sei_pic_timing)(CodedBitstreamContext *ctx, RWContext *rw,
             return AVERROR_INVALIDDATA;
 
         for (i = 0; i < num_clock_ts[current->pic_struct]; i++) {
-            flag(clock_timestamp_flag[i]);
+            flags(clock_timestamp_flag[i], 1, i);
             if (current->clock_timestamp_flag[i])
-                CHECK(FUNC(sei_pic_timestamp)(ctx, rw, &current->timestamp[i]));
+                CHECK(FUNC(sei_pic_timestamp)(ctx, rw,
+                                              &current->timestamp[i], sps));
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_pan_scan_rect)(CodedBitstreamContext *ctx, RWContext *rw,
+                                   H264RawSEIPanScanRect *current)
+{
+    int err, i;
+
+    HEADER("Pan-Scan Rectangle");
+
+    ue(pan_scan_rect_id, 0, UINT32_MAX - 1);
+    flag(pan_scan_rect_cancel_flag);
+
+    if (!current->pan_scan_rect_cancel_flag) {
+        ue(pan_scan_cnt_minus1, 0, 2);
+
+        for (i = 0; i <= current->pan_scan_cnt_minus1; i++) {
+            ses(pan_scan_rect_left_offset[i],   INT32_MIN + 1, INT32_MAX, 1, i);
+            ses(pan_scan_rect_right_offset[i],  INT32_MIN + 1, INT32_MAX, 1, i);
+            ses(pan_scan_rect_top_offset[i],    INT32_MIN + 1, INT32_MAX, 1, i);
+            ses(pan_scan_rect_bottom_offset[i], INT32_MIN + 1, INT32_MAX, 1, i);
         }
+
+        ue(pan_scan_rect_repetition_period, 0, 16384);
     }
 
     return 0;
@@ -631,6 +705,8 @@ static int FUNC(sei_user_data_registered)(CodedBitstreamContext *ctx, RWContext
 {
     int err, i, j;
 
+    HEADER("User Data Registered ITU-T T.35");
+
     u(8, itu_t_t35_country_code, 0x00, 0xff);
     if (current->itu_t_t35_country_code != 0xff)
         i = 1;
@@ -652,7 +728,7 @@ static int FUNC(sei_user_data_registered)(CodedBitstreamContext *ctx, RWContext
 
     allocate(current->data, current->data_length);
     for (j = 0; j < current->data_length; j++)
-        xu(8, itu_t_t35_payload_byte, current->data[j], 0x00, 0xff);
+        xu(8, itu_t_t35_payload_byte[i], current->data[j], 0x00, 0xff, 1, i + j);
 
     return 0;
 }
@@ -663,6 +739,8 @@ static int FUNC(sei_user_data_unregistered)(CodedBitstreamContext *ctx, RWContex
 {
     int err, i;
 
+    HEADER("User Data Unregistered");
+
 #ifdef READ
     if (*payload_size < 16) {
         av_log(ctx->log_ctx, AV_LOG_ERROR,
@@ -674,15 +752,13 @@ static int FUNC(sei_user_data_unregistered)(CodedBitstreamContext *ctx, RWContex
     *payload_size = 16 + current->data_length;
 #endif
 
-    for (i = 0; i < 16; i++) {
-        xu(8, uuid_iso_iec_11578,
-           current->uuid_iso_iec_11578[i], 0x00, 0xff);
-    }
+    for (i = 0; i < 16; i++)
+        us(8, uuid_iso_iec_11578[i], 0x00, 0xff, 1, i);
 
     allocate(current->data, current->data_length);
 
     for (i = 0; i < current->data_length; i++)
-        xu(8, user_data_payload_byte, current->data[i], 0x00, 0xff);
+        xu(8, user_data_payload_byte[i], current->data[i], 0x00, 0xff, 1, i);
 
     return 0;
 }
@@ -692,6 +768,8 @@ static int FUNC(sei_recovery_point)(CodedBitstreamContext *ctx, RWContext *rw,
 {
     int err;
 
+    HEADER("Recovery Point");
+
     ue(recovery_frame_cnt, 0, 65535);
     flag(exact_match_flag);
     flag(broken_link_flag);
@@ -705,6 +783,8 @@ static int FUNC(sei_display_orientation)(CodedBitstreamContext *ctx, RWContext *
 {
     int err;
 
+    HEADER("Display Orientation");
+
     flag(display_orientation_cancel_flag);
     if (!current->display_orientation_cancel_flag) {
         flag(hor_flip);
@@ -717,6 +797,27 @@ static int FUNC(sei_display_orientation)(CodedBitstreamContext *ctx, RWContext *
     return 0;
 }
 
+static int FUNC(sei_mastering_display_colour_volume)(CodedBitstreamContext *ctx, RWContext *rw,
+                                                     H264RawSEIMasteringDisplayColourVolume *current)
+{
+    int err, c;
+
+    HEADER("Mastering Display Colour Volume");
+
+    for (c = 0; c < 3; c++) {
+        us(16, display_primaries_x[c], 0, 50000, 1, c);
+        us(16, display_primaries_y[c], 0, 50000, 1, c);
+    }
+
+    u(16, white_point_x, 0, 50000);
+    u(16, white_point_y, 0, 50000);
+
+    u(32, max_display_mastering_luminance, 1, MAX_UINT_BITS(32));
+    u(32, min_display_mastering_luminance, 0, current->max_display_mastering_luminance - 1);
+
+    return 0;
+}
+
 static int FUNC(sei_payload)(CodedBitstreamContext *ctx, RWContext *rw,
                              H264RawSEIPayload *current)
 {
@@ -724,7 +825,7 @@ static int FUNC(sei_payload)(CodedBitstreamContext *ctx, RWContext *rw,
     int start_position, end_position;
 
 #ifdef READ
-    start_position = bitstream_tell(rw);
+    start_position = get_bits_count(rw);
 #else
     start_position = put_bits_count(rw);
 #endif
@@ -738,11 +839,14 @@ static int FUNC(sei_payload)(CodedBitstreamContext *ctx, RWContext *rw,
         CHECK(FUNC(sei_pic_timing)
               (ctx, rw, &current->payload.pic_timing));
         break;
+    case H264_SEI_TYPE_PAN_SCAN_RECT:
+        CHECK(FUNC(sei_pan_scan_rect)
+              (ctx, rw, &current->payload.pan_scan_rect));
+        break;
     case H264_SEI_TYPE_FILLER_PAYLOAD:
         {
-            av_unused int ff_byte = 0xff;
             for (i = 0; i  < current->payload_size; i++)
-                xu(8, ff_byte, ff_byte, 0xff, 0xff);
+                fixed(8, ff_byte, 0xff);
         }
         break;
     case H264_SEI_TYPE_USER_DATA_REGISTERED:
@@ -761,26 +865,32 @@ static int FUNC(sei_payload)(CodedBitstreamContext *ctx, RWContext *rw,
         CHECK(FUNC(sei_display_orientation)
               (ctx, rw, &current->payload.display_orientation));
         break;
+    case H264_SEI_TYPE_MASTERING_DISPLAY_COLOUR_VOLUME:
+        CHECK(FUNC(sei_mastering_display_colour_volume)
+              (ctx, rw, &current->payload.mastering_display_colour_volume));
+        break;
     default:
         {
-            allocate(current->payload.other.data, current->payload_size);
-            for (i = 0; i < current->payload_size; i++)
-                xu(8, payload_byte, current->payload.other.data[i], 0, 255);
+#ifdef READ
+            current->payload.other.data_length = current->payload_size;
+#endif
+            allocate(current->payload.other.data, current->payload.other.data_length);
+            for (i = 0; i < current->payload.other.data_length; i++)
+                xu(8, payload_byte[i], current->payload.other.data[i], 0, 255, 1, i);
         }
     }
 
     if (byte_alignment(rw)) {
-        av_unused int one = 1, zero = 0;
-        xu(1, bit_equal_to_one, one, 1, 1);
+        fixed(1, bit_equal_to_one, 1);
         while (byte_alignment(rw))
-            xu(1, bit_equal_to_zero, zero, 0, 0);
+            fixed(1, bit_equal_to_zero, 0);
     }
 
 #ifdef READ
-    end_position = bitstream_tell(rw);
+    end_position = get_bits_count(rw);
     if (end_position < start_position + 8 * current->payload_size) {
         av_log(ctx->log_ctx, AV_LOG_ERROR, "Incorrect SEI payload length: "
-               "header %d bits, actually %d bits.\n",
+               "header %"PRIu32" bits, actually %d bits.\n",
                8 * current->payload_size,
                end_position - start_position);
         return AVERROR_INVALIDDATA;
@@ -809,18 +919,18 @@ static int FUNC(sei)(CodedBitstreamContext *ctx, RWContext *rw,
         uint32_t payload_size = 0;
         uint32_t tmp;
 
-        while (bitstream_peek(rw, 8) == 0xff) {
-            xu(8, ff_byte, tmp, 0xff, 0xff);
+        while (show_bits(rw, 8) == 0xff) {
+            fixed(8, ff_byte, 0xff);
             payload_type += 255;
         }
-        xu(8, last_payload_type_byte, tmp, 0, 254);
+        xu(8, last_payload_type_byte, tmp, 0, 254, 0);
         payload_type += tmp;
 
-        while (bitstream_peek(rw, 8) == 0xff) {
-            xu(8, ff_byte, tmp, 0xff, 0xff);
+        while (show_bits(rw, 8) == 0xff) {
+            fixed(8, ff_byte, 0xff);
             payload_size += 255;
         }
-        xu(8, last_payload_size_byte, tmp, 0, 254);
+        xu(8, last_payload_size_byte, tmp, 0, 254, 0);
         payload_size += tmp;
 
         current->payload[k].payload_type = payload_type;
@@ -853,17 +963,17 @@ static int FUNC(sei)(CodedBitstreamContext *ctx, RWContext *rw,
 
             tmp = current->payload[k].payload_type;
             while (tmp >= 255) {
-                xu(8, ff_byte, 0xff, 0xff, 0xff);
+                fixed(8, ff_byte, 0xff);
                 tmp -= 255;
             }
-            xu(8, last_payload_type_byte, tmp, 0, 254);
+            xu(8, last_payload_type_byte, tmp, 0, 254, 0);
 
             tmp = current->payload[k].payload_size;
             while (tmp >= 255) {
-                xu(8, ff_byte, 0xff, 0xff, 0xff);
+                fixed(8, ff_byte, 0xff);
                 tmp -= 255;
             }
-            xu(8, last_payload_size_byte, tmp, 0, 254);
+            xu(8, last_payload_size_byte, tmp, 0, 254, 0);
 
             CHECK(FUNC(sei_payload)(ctx, rw, &current->payload[k]));
         }
@@ -905,7 +1015,7 @@ static int FUNC(ref_pic_list_modification)(CodedBitstreamContext *ctx, RWContext
         if (current->ref_pic_list_modification_flag_l0) {
             for (i = 0; i < H264_MAX_RPLM_COUNT; i++) {
                 xue(modification_of_pic_nums_idc,
-                    current->rplm_l0[i].modification_of_pic_nums_idc, 0, 3);
+                    current->rplm_l0[i].modification_of_pic_nums_idc, 0, 3, 0);
 
                 mopn = current->rplm_l0[i].modification_of_pic_nums_idc;
                 if (mopn == 3)
@@ -915,11 +1025,11 @@ static int FUNC(ref_pic_list_modification)(CodedBitstreamContext *ctx, RWContext
                     xue(abs_diff_pic_num_minus1,
                         current->rplm_l0[i].abs_diff_pic_num_minus1,
                         0, (1 + current->field_pic_flag) *
-                        (1 << (sps->log2_max_frame_num_minus4 + 4)));
+                        (1 << (sps->log2_max_frame_num_minus4 + 4)), 0);
                 else if (mopn == 2)
                     xue(long_term_pic_num,
                         current->rplm_l0[i].long_term_pic_num,
-                        0, sps->max_num_ref_frames - 1);
+                        0, sps->max_num_ref_frames - 1, 0);
             }
         }
     }
@@ -929,7 +1039,7 @@ static int FUNC(ref_pic_list_modification)(CodedBitstreamContext *ctx, RWContext
         if (current->ref_pic_list_modification_flag_l1) {
             for (i = 0; i < H264_MAX_RPLM_COUNT; i++) {
                 xue(modification_of_pic_nums_idc,
-                    current->rplm_l1[i].modification_of_pic_nums_idc, 0, 3);
+                    current->rplm_l1[i].modification_of_pic_nums_idc, 0, 3, 0);
 
                 mopn = current->rplm_l1[i].modification_of_pic_nums_idc;
                 if (mopn == 3)
@@ -939,11 +1049,11 @@ static int FUNC(ref_pic_list_modification)(CodedBitstreamContext *ctx, RWContext
                     xue(abs_diff_pic_num_minus1,
                         current->rplm_l1[i].abs_diff_pic_num_minus1,
                         0, (1 + current->field_pic_flag) *
-                        (1 << (sps->log2_max_frame_num_minus4 + 4)));
+                        (1 << (sps->log2_max_frame_num_minus4 + 4)), 0);
                 else if (mopn == 2)
                     xue(long_term_pic_num,
                         current->rplm_l1[i].long_term_pic_num,
-                        0, sps->max_num_ref_frames - 1);
+                        0, sps->max_num_ref_frames - 1, 0);
             }
         }
     }
@@ -966,17 +1076,17 @@ static int FUNC(pred_weight_table)(CodedBitstreamContext *ctx, RWContext *rw,
         ue(chroma_log2_weight_denom, 0, 7);
 
     for (i = 0; i <= current->num_ref_idx_l0_active_minus1; i++) {
-        flag(luma_weight_l0_flag[i]);
+        flags(luma_weight_l0_flag[i], 1, i);
         if (current->luma_weight_l0_flag[i]) {
-            se(luma_weight_l0[i], -128, +127);
-            se(luma_offset_l0[i], -128, +127);
+            ses(luma_weight_l0[i], -128, +127, 1, i);
+            ses(luma_offset_l0[i], -128, +127, 1, i);
         }
         if (chroma) {
-            flag(chroma_weight_l0_flag[i]);
+            flags(chroma_weight_l0_flag[i], 1, i);
             if (current->chroma_weight_l0_flag[i]) {
                 for (j = 0; j < 2; j++) {
-                    se(chroma_weight_l0[i][j], -128, +127);
-                    se(chroma_offset_l0[i][j], -128, +127);
+                    ses(chroma_weight_l0[i][j], -128, +127, 2, i, j);
+                    ses(chroma_offset_l0[i][j], -128, +127, 2, i, j);
                 }
             }
         }
@@ -984,17 +1094,17 @@ static int FUNC(pred_weight_table)(CodedBitstreamContext *ctx, RWContext *rw,
 
     if (current->slice_type % 5 == 1) {
         for (i = 0; i <= current->num_ref_idx_l1_active_minus1; i++) {
-            flag(luma_weight_l1_flag[i]);
+            flags(luma_weight_l1_flag[i], 1, i);
             if (current->luma_weight_l1_flag[i]) {
-                se(luma_weight_l1[i], -128, +127);
-                se(luma_offset_l1[i], -128, +127);
+                ses(luma_weight_l1[i], -128, +127, 1, i);
+                ses(luma_offset_l1[i], -128, +127, 1, i);
             }
             if (chroma) {
-                flag(chroma_weight_l1_flag[i]);
+                flags(chroma_weight_l1_flag[i], 1, i);
                 if (current->chroma_weight_l1_flag[i]) {
                     for (j = 0; j < 2; j++) {
-                        se(chroma_weight_l1[i][j], -128, +127);
-                        se(chroma_offset_l1[i][j], -128, +127);
+                        ses(chroma_weight_l1[i][j], -128, +127, 2, i, j);
+                        ses(chroma_offset_l1[i][j], -128, +127, 2, i, j);
                     }
                 }
             }
@@ -1021,7 +1131,7 @@ static int FUNC(dec_ref_pic_marking)(CodedBitstreamContext *ctx, RWContext *rw,
             for (i = 0; i < H264_MAX_MMCO_COUNT; i++) {
                 xue(memory_management_control_operation,
                     current->mmco[i].memory_management_control_operation,
-                    0, 6);
+                    0, 6, 0);
 
                 mmco = current->mmco[i].memory_management_control_operation;
                 if (mmco == 0)
@@ -1030,19 +1140,19 @@ static int FUNC(dec_ref_pic_marking)(CodedBitstreamContext *ctx, RWContext *rw,
                 if (mmco == 1 || mmco == 3)
                     xue(difference_of_pic_nums_minus1,
                         current->mmco[i].difference_of_pic_nums_minus1,
-                        0, INT32_MAX);
+                        0, INT32_MAX, 0);
                 if (mmco == 2)
                     xue(long_term_pic_num,
                         current->mmco[i].long_term_pic_num,
-                        0, sps->max_num_ref_frames - 1);
+                        0, sps->max_num_ref_frames - 1, 0);
                 if (mmco == 3 || mmco == 6)
                     xue(long_term_frame_idx,
                         current->mmco[i].long_term_frame_idx,
-                        0, sps->max_num_ref_frames - 1);
+                        0, sps->max_num_ref_frames - 1, 0);
                 if (mmco == 4)
                     xue(max_long_term_frame_idx_plus1,
                         current->mmco[i].max_long_term_frame_idx_plus1,
-                        0, sps->max_num_ref_frames);
+                        0, sps->max_num_ref_frames, 0);
             }
             if (i == H264_MAX_MMCO_COUNT) {
                 av_log(ctx->log_ctx, AV_LOG_ERROR, "Too many "
@@ -1080,11 +1190,10 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
                    "in the same access unit.\n");
             return AVERROR_INVALIDDATA;
         }
+        idr_pic_flag = h264->last_slice_nal_unit_type == H264_NAL_IDR_SLICE;
     } else {
-        h264->last_slice_nal_unit_type =
-            current->nal_unit_header.nal_unit_type;
+        idr_pic_flag = current->nal_unit_header.nal_unit_type == H264_NAL_IDR_SLICE;
     }
-    idr_pic_flag = h264->last_slice_nal_unit_type == H264_NAL_IDR_SLICE;
 
     ue(first_mb_in_slice, 0, H264_MAX_MB_PIC_SIZE - 1);
     ue(slice_type, 0, 9);
@@ -1123,7 +1232,7 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
         u(2, colour_plane_id, 0, 2);
 
     u(sps->log2_max_frame_num_minus4 + 4, frame_num,
-      0, (1 << (sps->log2_max_frame_num_minus4 + 4)) - 1);
+      0, MAX_UINT_BITS(sps->log2_max_frame_num_minus4 + 4));
 
     if (!sps->frame_mbs_only_flag) {
         flag(field_pic_flag);
@@ -1141,7 +1250,7 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
 
     if (sps->pic_order_cnt_type == 0) {
         u(sps->log2_max_pic_order_cnt_lsb_minus4 + 4, pic_order_cnt_lsb,
-          0, (1 << (sps->log2_max_pic_order_cnt_lsb_minus4 + 4)) - 1);
+          0, MAX_UINT_BITS(sps->log2_max_pic_order_cnt_lsb_minus4 + 4));
         if (pps->bottom_field_pic_order_in_frame_present_flag &&
             !current->field_pic_flag)
             se(delta_pic_order_cnt_bottom, INT32_MIN + 1, INT32_MAX);
@@ -1162,6 +1271,13 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
 
     if (pps->redundant_pic_cnt_present_flag)
         ue(redundant_pic_cnt, 0, 127);
+    else
+        infer(redundant_pic_cnt, 0);
+
+    if (current->nal_unit_header.nal_unit_type != H264_NAL_AUXILIARY_SLICE
+        && !current->redundant_pic_cnt)
+        h264->last_slice_nal_unit_type =
+            current->nal_unit_header.nal_unit_type;
 
     if (slice_type_b)
         flag(direct_spatial_mv_pred_flag);
@@ -1240,9 +1356,8 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
     }
 
     if (pps->entropy_coding_mode_flag) {
-        av_unused int one = 1;
         while (byte_alignment(rw))
-            xu(1, cabac_alignment_one_bit, one, 1, 1);
+            fixed(1, cabac_alignment_one_bit, 1);
     }
 
     return 0;
@@ -1251,7 +1366,6 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
 static int FUNC(filler)(CodedBitstreamContext *ctx, RWContext *rw,
                         H264RawFiller *current)
 {
-    av_unused int ff_byte = 0xff;
     int err;
 
     HEADER("Filler Data");
@@ -1260,15 +1374,15 @@ static int FUNC(filler)(CodedBitstreamContext *ctx, RWContext *rw,
                                 1 << H264_NAL_FILLER_DATA));
 
 #ifdef READ
-    while (bitstream_peek(rw, 8) == 0xff) {
-        xu(8, ff_byte, ff_byte, 0xff, 0xff);
+    while (show_bits(rw, 8) == 0xff) {
+        fixed(8, ff_byte, 0xff);
         ++current->filler_size;
     }
 #else
     {
         uint32_t i;
         for (i = 0; i < current->filler_size; i++)
-            xu(8, ff_byte, ff_byte, 0xff, 0xff);
+            fixed(8, ff_byte, 0xff);
     }
 #endif
 
@@ -1276,3 +1390,21 @@ static int FUNC(filler)(CodedBitstreamContext *ctx, RWContext *rw,
 
     return 0;
 }
+
+static int FUNC(end_of_sequence)(CodedBitstreamContext *ctx, RWContext *rw,
+                                 H264RawNALUnitHeader *current)
+{
+    HEADER("End of Sequence");
+
+    return FUNC(nal_unit_header)(ctx, rw, current,
+                                 1 << H264_NAL_END_SEQUENCE);
+}
+
+static int FUNC(end_of_stream)(CodedBitstreamContext *ctx, RWContext *rw,
+                               H264RawNALUnitHeader *current)
+{
+    HEADER("End of Stream");
+
+    return FUNC(nal_unit_header)(ctx, rw, current,
+                                 1 << H264_NAL_END_STREAM);
+}
diff --git a/libavcodec/cbs_h265.h b/libavcodec/cbs_h265.h
index 0628748..d216cac 100644
--- a/libavcodec/cbs_h265.h
+++ b/libavcodec/cbs_h265.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,14 @@
 #include "cbs_h2645.h"
 #include "hevc.h"
 
+enum {
+    // This limit is arbitrary - it is sufficient for one message of each
+    // type plus some repeats, and will therefore easily cover all sane
+    // streams.  However, it is possible to make technically-valid streams
+    // for which it will fail (for example, by including a large number of
+    // user-data-unregistered messages).
+    H265_MAX_SEI_PAYLOADS = 64,
+};
 
 typedef struct H265RawNALUnitHeader {
     uint8_t forbidden_zero_bit;
@@ -63,7 +71,31 @@ typedef struct H265RawProfileTierLevel {
     uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
     uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
 
-    // TODO: much of that again for each sub-layer.
+    uint8_t sub_layer_profile_space[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_tier_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_profile_idc[HEVC_MAX_SUB_LAYERS];
+
+    uint8_t sub_layer_profile_compatibility_flag[HEVC_MAX_SUB_LAYERS][32];
+
+    uint8_t sub_layer_progressive_source_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_interlaced_source_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_non_packed_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_frame_only_constraint_flag[HEVC_MAX_SUB_LAYERS];
+
+    uint8_t sub_layer_max_12bit_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_max_10bit_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_max_8bit_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_max_422chroma_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_max_420chroma_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_max_monochrome_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_intra_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_one_picture_only_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_lower_bit_rate_constraint_flag[HEVC_MAX_SUB_LAYERS];
+    uint8_t sub_layer_max_14bit_constraint_flag[HEVC_MAX_SUB_LAYERS];
+
+    uint8_t sub_layer_inbld_flag[HEVC_MAX_SUB_LAYERS];
+
+    uint8_t sub_layer_level_idc[HEVC_MAX_SUB_LAYERS];
 } H265RawProfileTierLevel;
 
 typedef struct H265RawSubLayerHRDParameters {
@@ -517,12 +549,179 @@ typedef struct H265RawSlice {
 } H265RawSlice;
 
 
+typedef struct H265RawSEIBufferingPeriod {
+    uint8_t  bp_seq_parameter_set_id;
+    uint8_t  irap_cpb_params_present_flag;
+    uint32_t cpb_delay_offset;
+    uint32_t dpb_delay_offset;
+    uint8_t  concatenation_flag;
+    uint32_t au_cpb_removal_delay_delta_minus1;
+
+    uint32_t nal_initial_cpb_removal_delay[HEVC_MAX_CPB_CNT];
+    uint32_t nal_initial_cpb_removal_offset[HEVC_MAX_CPB_CNT];
+    uint32_t nal_initial_alt_cpb_removal_delay[HEVC_MAX_CPB_CNT];
+    uint32_t nal_initial_alt_cpb_removal_offset[HEVC_MAX_CPB_CNT];
+
+    uint32_t vcl_initial_cpb_removal_delay[HEVC_MAX_CPB_CNT];
+    uint32_t vcl_initial_cpb_removal_offset[HEVC_MAX_CPB_CNT];
+    uint32_t vcl_initial_alt_cpb_removal_delay[HEVC_MAX_CPB_CNT];
+    uint32_t vcl_initial_alt_cpb_removal_offset[HEVC_MAX_CPB_CNT];
+
+    uint8_t  use_alt_cpb_params_flag;
+} H265RawSEIBufferingPeriod;
+
+typedef struct H265RawSEIPicTiming {
+    uint8_t pic_struct;
+    uint8_t source_scan_type;
+    uint8_t duplicate_flag;
+
+    uint32_t au_cpb_removal_delay_minus1;
+    uint32_t pic_dpb_output_delay;
+    uint32_t pic_dpb_output_du_delay;
+
+    uint16_t num_decoding_units_minus1;
+    uint8_t  du_common_cpb_removal_delay_flag;
+    uint32_t du_common_cpb_removal_delay_increment_minus1;
+    uint16_t num_nalus_in_du_minus1[HEVC_MAX_SLICE_SEGMENTS];
+    uint32_t du_cpb_removal_delay_increment_minus1[HEVC_MAX_SLICE_SEGMENTS];
+} H265RawSEIPicTiming;
+
+typedef struct H265RawSEIPanScanRect {
+    uint32_t pan_scan_rect_id;
+    uint8_t  pan_scan_rect_cancel_flag;
+    uint8_t  pan_scan_cnt_minus1;
+    int32_t  pan_scan_rect_left_offset[3];
+    int32_t  pan_scan_rect_right_offset[3];
+    int32_t  pan_scan_rect_top_offset[3];
+    int32_t  pan_scan_rect_bottom_offset[3];
+    uint16_t pan_scan_rect_persistence_flag;
+} H265RawSEIPanScanRect;
+
+typedef struct H265RawSEIUserDataRegistered {
+    uint8_t itu_t_t35_country_code;
+    uint8_t itu_t_t35_country_code_extension_byte;
+    uint8_t     *data;
+    size_t       data_length;
+    AVBufferRef *data_ref;
+} H265RawSEIUserDataRegistered;
+
+typedef struct H265RawSEIUserDataUnregistered {
+    uint8_t uuid_iso_iec_11578[16];
+    uint8_t     *data;
+    size_t       data_length;
+    AVBufferRef *data_ref;
+} H265RawSEIUserDataUnregistered;
+
+typedef struct H265RawSEIRecoveryPoint {
+    int16_t recovery_poc_cnt;
+    uint8_t exact_match_flag;
+    uint8_t broken_link_flag;
+} H265RawSEIRecoveryPoint;
+
+typedef struct H265RawSEIDisplayOrientation {
+    uint8_t display_orientation_cancel_flag;
+    uint8_t hor_flip;
+    uint8_t ver_flip;
+    uint16_t anticlockwise_rotation;
+    uint16_t display_orientation_repetition_period;
+    uint8_t display_orientation_persistence_flag;
+} H265RawSEIDisplayOrientation;
+
+typedef struct H265RawSEIActiveParameterSets {
+    uint8_t active_video_parameter_set_id;
+    uint8_t self_contained_cvs_flag;
+    uint8_t no_parameter_set_update_flag;
+    uint8_t num_sps_ids_minus1;
+    uint8_t active_seq_parameter_set_id[HEVC_MAX_SPS_COUNT];
+    uint8_t layer_sps_idx[HEVC_MAX_LAYERS];
+} H265RawSEIActiveParameterSets;
+
+typedef struct H265RawSEIDecodedPictureHash {
+    uint8_t  hash_type;
+    uint8_t  picture_md5[3][16];
+    uint16_t picture_crc[3];
+    uint32_t picture_checksum[3];
+} H265RawSEIDecodedPictureHash;
+
+typedef struct H265RawSEITimeCode {
+    uint8_t  num_clock_ts;
+    uint8_t  clock_timestamp_flag[3];
+    uint8_t  units_field_based_flag[3];
+    uint8_t  counting_type[3];
+    uint8_t  full_timestamp_flag[3];
+    uint8_t  discontinuity_flag[3];
+    uint8_t  cnt_dropped_flag[3];
+    uint16_t n_frames[3];
+    uint8_t  seconds_value[3];
+    uint8_t  minutes_value[3];
+    uint8_t  hours_value[3];
+    uint8_t  seconds_flag[3];
+    uint8_t  minutes_flag[3];
+    uint8_t  hours_flag[3];
+    uint8_t  time_offset_length[3];
+    uint32_t time_offset_value[3];
+} H265RawSEITimeCode;
+
+typedef struct H265RawSEIMasteringDisplayColourVolume {
+    uint16_t display_primaries_x[3];
+    uint16_t display_primaries_y[3];
+    uint16_t white_point_x;
+    uint16_t white_point_y;
+    uint32_t max_display_mastering_luminance;
+    uint32_t min_display_mastering_luminance;
+} H265RawSEIMasteringDisplayColourVolume;
+
+typedef struct H265RawSEIContentLightLevelInfo {
+    uint16_t max_content_light_level;
+    uint16_t max_pic_average_light_level;
+} H265RawSEIContentLightLevelInfo;
+
+typedef struct H265RawSEIAlternativeTransferCharacteristics {
+    uint8_t preferred_transfer_characteristics;
+} H265RawSEIAlternativeTransferCharacteristics;
+
+typedef struct H265RawSEIPayload {
+    uint32_t payload_type;
+    uint32_t payload_size;
+    union {
+        H265RawSEIBufferingPeriod buffering_period;
+        H265RawSEIPicTiming pic_timing;
+        H265RawSEIPanScanRect pan_scan_rect;
+        H265RawSEIUserDataRegistered user_data_registered;
+        H265RawSEIUserDataUnregistered user_data_unregistered;
+        H265RawSEIRecoveryPoint recovery_point;
+        H265RawSEIDisplayOrientation display_orientation;
+        H265RawSEIActiveParameterSets active_parameter_sets;
+        H265RawSEIDecodedPictureHash decoded_picture_hash;
+        H265RawSEITimeCode time_code;
+        H265RawSEIMasteringDisplayColourVolume mastering_display;
+        H265RawSEIContentLightLevelInfo content_light_level;
+        H265RawSEIAlternativeTransferCharacteristics
+            alternative_transfer_characteristics;
+        struct {
+            uint8_t *data;
+            size_t data_length;
+            AVBufferRef *data_ref;
+        } other;
+    } payload;
+} H265RawSEIPayload;
+
+typedef struct H265RawSEI {
+    H265RawNALUnitHeader nal_unit_header;
+
+    H265RawSEIPayload payload[H265_MAX_SEI_PAYLOADS];
+    uint8_t payload_count;
+} H265RawSEI;
+
 typedef struct CodedBitstreamH265Context {
     // Reader/writer context in common with the H.264 implementation.
     CodedBitstreamH2645Context common;
 
     // All currently available parameter sets.  These are updated when
     // any parameter set NAL unit is read/written with this context.
+    AVBufferRef *vps_ref[HEVC_MAX_VPS_COUNT];
+    AVBufferRef *sps_ref[HEVC_MAX_SPS_COUNT];
+    AVBufferRef *pps_ref[HEVC_MAX_PPS_COUNT];
     H265RawVPS *vps[HEVC_MAX_VPS_COUNT];
     H265RawSPS *sps[HEVC_MAX_SPS_COUNT];
     H265RawPPS *pps[HEVC_MAX_PPS_COUNT];
diff --git a/libavcodec/cbs_h265_syntax_template.c b/libavcodec/cbs_h265_syntax_template.c
index a194887..f1e1bb0 100644
--- a/libavcodec/cbs_h265_syntax_template.c
+++ b/libavcodec/cbs_h265_syntax_template.c
@@ -1,28 +1,28 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 static int FUNC(rbsp_trailing_bits)(CodedBitstreamContext *ctx, RWContext *rw)
 {
     int err;
-    av_unused int one = 1, zero = 0;
-    xu(1, rbsp_stop_one_bit, one, 1, 1);
+
+    fixed(1, rbsp_stop_one_bit, 1);
     while (byte_alignment(rw) != 0)
-        xu(1, rbsp_alignment_zero_bit, zero, 0, 0);
+        fixed(1, rbsp_alignment_zero_bit, 0);
 
     return 0;
 }
@@ -50,10 +50,10 @@ static int FUNC(nal_unit_header)(CodedBitstreamContext *ctx, RWContext *rw,
 static int FUNC(byte_alignment)(CodedBitstreamContext *ctx, RWContext *rw)
 {
     int err;
-    av_unused int one = 1, zero = 0;
-    xu(1, alignment_bit_equal_to_one, one, 1, 1);
+
+    fixed(1, alignment_bit_equal_to_one, 1);
     while (byte_alignment(rw) != 0)
-        xu(1, alignment_bit_equal_to_zero, zero, 0, 0);
+        fixed(1, alignment_bit_equal_to_zero, 0);
 
     return 0;
 }
@@ -64,23 +64,23 @@ static int FUNC(extension_data)(CodedBitstreamContext *ctx, RWContext *rw,
     int err;
     size_t k;
 #ifdef READ
-    BitstreamContext start;
+    GetBitContext start;
     uint8_t bit;
     start = *rw;
     for (k = 0; cbs_h2645_read_more_rbsp_data(rw); k++)
-        bitstream_skip(rw, 1);
+        skip_bits(rw, 1);
     current->bit_length = k;
     if (k > 0) {
         *rw = start;
         allocate(current->data, (current->bit_length + 7) / 8);
         for (k = 0; k < current->bit_length; k++) {
-            xu(1, extension_data, bit, 0, 1);
+            xu(1, extension_data, bit, 0, 1, 0);
             current->data[k / 8] |= bit << (7 - k % 8);
         }
     }
 #else
     for (k = 0; k < current->bit_length; k++)
-        xu(1, extension_data, current->data[k / 8] >> (7 - k % 8), 0, 1);
+        xu(1, extension_data, current->data[k / 8] >> (7 - k % 8), 0, 1, 0);
 #endif
     return 0;
 }
@@ -90,7 +90,6 @@ static int FUNC(profile_tier_level)(CodedBitstreamContext *ctx, RWContext *rw,
                                     int profile_present_flag,
                                     int max_num_sub_layers_minus1)
 {
-    av_unused unsigned int zero = 0;
     int err, i, j;
 
     if (profile_present_flag) {
@@ -99,7 +98,7 @@ static int FUNC(profile_tier_level)(CodedBitstreamContext *ctx, RWContext *rw,
         u(5, general_profile_idc, 0, 31);
 
         for (j = 0; j < 32; j++)
-            flag(general_profile_compatibility_flag[j]);
+            flags(general_profile_compatibility_flag[j], 1, j);
 
         flag(general_progressive_source_flag);
         flag(general_interlaced_source_flag);
@@ -125,15 +124,20 @@ static int FUNC(profile_tier_level)(CodedBitstreamContext *ctx, RWContext *rw,
             if (profile_compatible(5) || profile_compatible(9) ||
                 profile_compatible(10)) {
                 flag(general_max_14bit_constraint_flag);
-                xu(24, general_reserved_zero_33bits, zero, 0, 0);
-                xu(9, general_reserved_zero_33bits, zero, 0, 0);
+                fixed(24, general_reserved_zero_33bits, 0);
+                fixed( 9, general_reserved_zero_33bits, 0);
             } else {
-                xu(24, general_reserved_zero_34bits, zero, 0, 0);
-                xu(10, general_reserved_zero_34bits, zero, 0, 0);
+                fixed(24, general_reserved_zero_34bits, 0);
+                fixed(10, general_reserved_zero_34bits, 0);
             }
+        } else if (profile_compatible(2)) {
+            fixed(7, general_reserved_zero_7bits, 0);
+            flag(general_one_picture_only_constraint_flag);
+            fixed(24, general_reserved_zero_35bits, 0);
+            fixed(11, general_reserved_zero_35bits, 0);
         } else {
-            xu(24, general_reserved_zero_43bits, zero, 0, 0);
-            xu(19, general_reserved_zero_43bits, zero, 0, 0);
+            fixed(24, general_reserved_zero_43bits, 0);
+            fixed(19, general_reserved_zero_43bits, 0);
         }
 
         if (profile_compatible(1) || profile_compatible(2) ||
@@ -141,7 +145,7 @@ static int FUNC(profile_tier_level)(CodedBitstreamContext *ctx, RWContext *rw,
             profile_compatible(5) || profile_compatible(9)) {
             flag(general_inbld_flag);
         } else {
-            xu(1, general_reserved_zero_bit, zero, 0, 0);
+            fixed(1, general_reserved_zero_bit, 0);
         }
 #undef profile_compatible
     }
@@ -149,22 +153,74 @@ static int FUNC(profile_tier_level)(CodedBitstreamContext *ctx, RWContext *rw,
     u(8, general_level_idc, 0, 255);
 
     for (i = 0; i < max_num_sub_layers_minus1; i++) {
-        flag(sub_layer_profile_present_flag[i]);
-        flag(sub_layer_level_present_flag[i]);
+        flags(sub_layer_profile_present_flag[i], 1, i);
+        flags(sub_layer_level_present_flag[i],   1, i);
     }
 
     if (max_num_sub_layers_minus1 > 0) {
-        for (i = max_num_sub_layers_minus1; i < 8; i++) {
-            av_unused int zero = 0;
-            xu(2, reserved_zero_2bits, zero, 0, 0);
-        }
+        for (i = max_num_sub_layers_minus1; i < 8; i++)
+            fixed(2, reserved_zero_2bits, 0);
     }
 
     for (i = 0; i < max_num_sub_layers_minus1; i++) {
-        if (current->sub_layer_profile_present_flag[i])
-            return AVERROR_PATCHWELCOME;
+        if (current->sub_layer_profile_present_flag[i]) {
+            us(2, sub_layer_profile_space[i], 0, 0, 1, i);
+            flags(sub_layer_tier_flag[i],           1, i);
+            us(5, sub_layer_profile_idc[i], 0, 31,  1, i);
+
+            for (j = 0; j < 32; j++)
+                flags(sub_layer_profile_compatibility_flag[i][j], 2, i, j);
+
+            flags(sub_layer_progressive_source_flag[i],    1, i);
+            flags(sub_layer_interlaced_source_flag[i],     1, i);
+            flags(sub_layer_non_packed_constraint_flag[i], 1, i);
+            flags(sub_layer_frame_only_constraint_flag[i], 1, i);
+
+#define profile_compatible(x) (current->sub_layer_profile_idc[i] == (x) ||   \
+                               current->sub_layer_profile_compatibility_flag[i][x])
+            if (profile_compatible(4) || profile_compatible(5) ||
+                profile_compatible(6) || profile_compatible(7) ||
+                profile_compatible(8) || profile_compatible(9) ||
+                profile_compatible(10)) {
+                flags(sub_layer_max_12bit_constraint_flag[i],        1, i);
+                flags(sub_layer_max_10bit_constraint_flag[i],        1, i);
+                flags(sub_layer_max_8bit_constraint_flag[i],         1, i);
+                flags(sub_layer_max_422chroma_constraint_flag[i],    1, i);
+                flags(sub_layer_max_420chroma_constraint_flag[i],    1, i);
+                flags(sub_layer_max_monochrome_constraint_flag[i],   1, i);
+                flags(sub_layer_intra_constraint_flag[i],            1, i);
+                flags(sub_layer_one_picture_only_constraint_flag[i], 1, i);
+                flags(sub_layer_lower_bit_rate_constraint_flag[i],   1, i);
+
+                if (profile_compatible(5)) {
+                    flags(sub_layer_max_14bit_constraint_flag[i], 1, i);
+                    fixed(24, sub_layer_reserved_zero_33bits, 0);
+                    fixed( 9, sub_layer_reserved_zero_33bits, 0);
+                } else {
+                    fixed(24, sub_layer_reserved_zero_34bits, 0);
+                    fixed(10, sub_layer_reserved_zero_34bits, 0);
+                }
+            } else if (profile_compatible(2)) {
+                fixed(7, sub_layer_reserved_zero_7bits, 0);
+                flags(sub_layer_one_picture_only_constraint_flag[i], 1, i);
+                fixed(24, sub_layer_reserved_zero_43bits, 0);
+                fixed(11, sub_layer_reserved_zero_43bits, 0);
+            } else {
+                fixed(24, sub_layer_reserved_zero_43bits, 0);
+                fixed(19, sub_layer_reserved_zero_43bits, 0);
+            }
+
+            if (profile_compatible(1) || profile_compatible(2) ||
+                profile_compatible(3) || profile_compatible(4) ||
+                profile_compatible(5) || profile_compatible(9)) {
+                flags(sub_layer_inbld_flag[i], 1, i);
+            } else {
+                fixed(1, sub_layer_reserved_zero_bit, 0);
+            }
+#undef profile_compatible
+        }
         if (current->sub_layer_level_present_flag[i])
-            return AVERROR_PATCHWELCOME;
+            us(8, sub_layer_level_idc[i], 0, 255, 1, i);
     }
 
     return 0;
@@ -183,13 +239,13 @@ static int FUNC(sub_layer_hrd_parameters)(CodedBitstreamContext *ctx, RWContext
         current = &hrd->vcl_sub_layer_hrd_parameters[sub_layer_id];
 
     for (i = 0; i <= hrd->cpb_cnt_minus1[sub_layer_id]; i++) {
-        ue(bit_rate_value_minus1[i], 0, UINT32_MAX - 1);
-        ue(cpb_size_value_minus1[i], 0, UINT32_MAX - 1);
+        ues(bit_rate_value_minus1[i], 0, UINT32_MAX - 1, 1, i);
+        ues(cpb_size_value_minus1[i], 0, UINT32_MAX - 1, 1, i);
         if (hrd->sub_pic_hrd_params_present_flag) {
-            ue(cpb_size_du_value_minus1[i], 0, UINT32_MAX - 1);
-            ue(bit_rate_du_value_minus1[i], 0, UINT32_MAX - 1);
+            ues(cpb_size_du_value_minus1[i], 0, UINT32_MAX - 1, 1, i);
+            ues(bit_rate_du_value_minus1[i], 0, UINT32_MAX - 1, 1, i);
         }
-        flag(cbr_flag[i]);
+        flags(cbr_flag[i], 1, i);
     }
 
     return 0;
@@ -233,21 +289,21 @@ static int FUNC(hrd_parameters)(CodedBitstreamContext *ctx, RWContext *rw,
     }
 
     for (i = 0; i <= max_num_sub_layers_minus1; i++) {
-        flag(fixed_pic_rate_general_flag[i]);
+        flags(fixed_pic_rate_general_flag[i], 1, i);
 
         if (!current->fixed_pic_rate_general_flag[i])
-            flag(fixed_pic_rate_within_cvs_flag[i]);
+            flags(fixed_pic_rate_within_cvs_flag[i], 1, i);
         else
             infer(fixed_pic_rate_within_cvs_flag[i], 1);
 
         if (current->fixed_pic_rate_within_cvs_flag[i]) {
-            ue(elemental_duration_in_tc_minus1[i], 0, 2047);
+            ues(elemental_duration_in_tc_minus1[i], 0, 2047, 1, i);
             infer(low_delay_hrd_flag[i], 0);
         } else
-            flag(low_delay_hrd_flag[i]);
+            flags(low_delay_hrd_flag[i], 1, i);
 
         if (!current->low_delay_hrd_flag[i])
-            ue(cpb_cnt_minus1[i], 0, 31);
+            ues(cpb_cnt_minus1[i], 0, 31, 1, i);
         else
             infer(cpb_cnt_minus1[i], 0);
 
@@ -386,10 +442,7 @@ static int FUNC(vps)(CodedBitstreamContext *ctx, RWContext *rw,
         return AVERROR_INVALIDDATA;
     }
 
-    {
-        av_unused uint16_t ffff = 0xffff;
-        xu(16, vps_reserved_0xffff_16bits, ffff, 0xffff, 0xffff);
-    }
+    fixed(16, vps_reserved_0xffff_16bits, 0xffff);
 
     CHECK(FUNC(profile_tier_level)(ctx, rw, &current->profile_tier_level,
                                    1, current->vps_max_sub_layers_minus1));
@@ -398,9 +451,12 @@ static int FUNC(vps)(CodedBitstreamContext *ctx, RWContext *rw,
     for (i = (current->vps_sub_layer_ordering_info_present_flag ?
               0 : current->vps_max_sub_layers_minus1);
          i <= current->vps_max_sub_layers_minus1; i++) {
-        ue(vps_max_dec_pic_buffering_minus1[i], 0, HEVC_MAX_DPB_SIZE - 1);
-        ue(vps_max_num_reorder_pics[i],         0, current->vps_max_dec_pic_buffering_minus1[i]);
-        ue(vps_max_latency_increase_plus1[i],   0, UINT32_MAX - 1);
+        ues(vps_max_dec_pic_buffering_minus1[i],
+            0, HEVC_MAX_DPB_SIZE - 1,                        1, i);
+        ues(vps_max_num_reorder_pics[i],
+            0, current->vps_max_dec_pic_buffering_minus1[i], 1, i);
+        ues(vps_max_latency_increase_plus1[i],
+            0, UINT32_MAX - 1,                               1, i);
     }
     if (!current->vps_sub_layer_ordering_info_present_flag) {
         for (i = 0; i < current->vps_max_sub_layers_minus1; i++) {
@@ -417,7 +473,7 @@ static int FUNC(vps)(CodedBitstreamContext *ctx, RWContext *rw,
     ue(vps_num_layer_sets_minus1, 0, HEVC_MAX_LAYER_SETS - 1);
     for (i = 1; i <= current->vps_num_layer_sets_minus1; i++) {
         for (j = 0; j <= current->vps_max_layer_id; j++)
-            flag(layer_id_included_flag[i][j]);
+            flags(layer_id_included_flag[i][j], 2, i, j);
     }
     for (j = 0; j <= current->vps_max_layer_id; j++)
         infer(layer_id_included_flag[0][j], j == 0);
@@ -431,11 +487,11 @@ static int FUNC(vps)(CodedBitstreamContext *ctx, RWContext *rw,
             ue(vps_num_ticks_poc_diff_one_minus1, 0, UINT32_MAX - 1);
         ue(vps_num_hrd_parameters, 0, current->vps_num_layer_sets_minus1 + 1);
         for (i = 0; i < current->vps_num_hrd_parameters; i++) {
-            ue(hrd_layer_set_idx[i],
-               current->vps_base_layer_internal_flag ? 0 : 1,
-               current->vps_num_layer_sets_minus1);
+            ues(hrd_layer_set_idx[i],
+                current->vps_base_layer_internal_flag ? 0 : 1,
+                current->vps_num_layer_sets_minus1, 1, i);
             if (i > 0)
-                flag(cprms_present_flag[i]);
+                flags(cprms_present_flag[i], 1, i);
             else
                 infer(cprms_present_flag[0], 1);
 
@@ -489,9 +545,9 @@ static int FUNC(st_ref_pic_set)(CodedBitstreamContext *ctx, RWContext *rw,
             (current->abs_delta_rps_minus1 + 1);
 
         for (j = 0; j <= num_delta_pocs; j++) {
-            flag(used_by_curr_pic_flag[j]);
+            flags(used_by_curr_pic_flag[j], 1, j);
             if (!current->used_by_curr_pic_flag[j])
-                flag(use_delta_flag[j]);
+                flags(use_delta_flag[j], 1, j);
             else
                 infer(use_delta_flag[j], 1);
         }
@@ -586,13 +642,13 @@ static int FUNC(st_ref_pic_set)(CodedBitstreamContext *ctx, RWContext *rw,
         ue(num_positive_pics, 0, 15 - current->num_negative_pics);
 
         for (i = 0; i < current->num_negative_pics; i++) {
-            ue(delta_poc_s0_minus1[i], 0, INT16_MAX);
-            flag(used_by_curr_pic_s0_flag[i]);
+            ues(delta_poc_s0_minus1[i], 0, INT16_MAX, 1, i);
+            flags(used_by_curr_pic_s0_flag[i],        1, i);
         }
 
         for (i = 0; i < current->num_positive_pics; i++) {
-            ue(delta_poc_s1_minus1[i], 0, INT16_MAX);
-            flag(used_by_curr_pic_s1_flag[i]);
+            ues(delta_poc_s1_minus1[i], 0, INT16_MAX, 1, i);
+            flags(used_by_curr_pic_s1_flag[i],        1, i);
         }
     }
 
@@ -607,18 +663,21 @@ static int FUNC(scaling_list_data)(CodedBitstreamContext *ctx, RWContext *rw,
 
     for (sizeId = 0; sizeId < 4; sizeId++) {
         for (matrixId = 0; matrixId < 6; matrixId += (sizeId == 3 ? 3 : 1)) {
-            flag(scaling_list_pred_mode_flag[sizeId][matrixId]);
+            flags(scaling_list_pred_mode_flag[sizeId][matrixId],
+                  2, sizeId, matrixId);
             if (!current->scaling_list_pred_mode_flag[sizeId][matrixId]) {
-                ue(scaling_list_pred_matrix_id_delta[sizeId][matrixId],
-                   0, sizeId == 3 ? matrixId / 3 : matrixId);
+                ues(scaling_list_pred_matrix_id_delta[sizeId][matrixId],
+                    0, sizeId == 3 ? matrixId / 3 : matrixId,
+                    2, sizeId, matrixId);
             } else {
                 n = FFMIN(64, 1 << (4 + (sizeId << 1)));
-                if (sizeId > 1)
-                    se(scaling_list_dc_coef_minus8[sizeId - 2][matrixId], -7, +247);
+                if (sizeId > 1) {
+                    ses(scaling_list_dc_coef_minus8[sizeId - 2][matrixId], -7, +247,
+                        2, sizeId - 2, matrixId);
+                }
                 for (i = 0; i < n; i++) {
-                    xse(scaling_list_delta_coeff,
-                        current->scaling_list_delta_coeff[sizeId][matrixId][i],
-                        -128, +127);
+                    ses(scaling_list_delta_coeff[sizeId][matrixId][i],
+                        -128, +127, 3, sizeId, matrixId, i);
                 }
             }
         }
@@ -664,8 +723,8 @@ static int FUNC(sps_scc_extension)(CodedBitstreamContext *ctx, RWContext *rw,
                 int bit_depth = comp == 0 ? current->bit_depth_luma_minus8 + 8
                                           : current->bit_depth_chroma_minus8 + 8;
                 for (i = 0; i <= current->sps_num_palette_predictor_initializer_minus1; i++)
-                    u(bit_depth, sps_palette_predictor_initializers[comp][i],
-                      0, (1 << bit_depth) - 1);
+                    us(bit_depth, sps_palette_predictor_initializers[comp][i],
+                       0, MAX_UINT_BITS(bit_depth), 2, comp, i);
             }
         }
     }
@@ -748,9 +807,12 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
     for (i = (current->sps_sub_layer_ordering_info_present_flag ?
               0 : current->sps_max_sub_layers_minus1);
          i <= current->sps_max_sub_layers_minus1; i++) {
-        ue(sps_max_dec_pic_buffering_minus1[i], 0, HEVC_MAX_DPB_SIZE - 1);
-        ue(sps_max_num_reorder_pics[i],         0, current->sps_max_dec_pic_buffering_minus1[i]);
-        ue(sps_max_latency_increase_plus1[i],   0, UINT32_MAX - 1);
+        ues(sps_max_dec_pic_buffering_minus1[i],
+            0, HEVC_MAX_DPB_SIZE - 1,                        1, i);
+        ues(sps_max_num_reorder_pics[i],
+            0, current->sps_max_dec_pic_buffering_minus1[i], 1, i);
+        ues(sps_max_latency_increase_plus1[i],
+            0, UINT32_MAX - 1,                               1, i);
     }
     if (!current->sps_sub_layer_ordering_info_present_flag) {
         for (i = 0; i < current->sps_max_sub_layers_minus1; i++) {
@@ -825,10 +887,10 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
     if (current->long_term_ref_pics_present_flag) {
         ue(num_long_term_ref_pics_sps, 0, HEVC_MAX_LONG_TERM_REF_PICS);
         for (i = 0; i < current->num_long_term_ref_pics_sps; i++) {
-            u(current->log2_max_pic_order_cnt_lsb_minus4 + 4,
-              lt_ref_pic_poc_lsb_sps[i],
-              0, (1 << (current->log2_max_pic_order_cnt_lsb_minus4 + 4)) - 1);
-            flag(used_by_curr_pic_lt_sps_flag[i]);
+            us(current->log2_max_pic_order_cnt_lsb_minus4 + 4,
+               lt_ref_pic_poc_lsb_sps[i],
+               0, MAX_UINT_BITS(current->log2_max_pic_order_cnt_lsb_minus4 + 4), 1, i);
+            flags(used_by_curr_pic_lt_sps_flag[i], 1, i);
         }
     }
 
@@ -845,7 +907,7 @@ static int FUNC(sps)(CodedBitstreamContext *ctx, RWContext *rw,
         flag(sps_multilayer_extension_flag);
         flag(sps_3d_extension_flag);
         flag(sps_scc_extension_flag);
-        u(4, sps_extension_4bits, 0, (1 << 4) - 1);
+        u(4, sps_extension_4bits, 0, MAX_UINT_BITS(4));
     }
 
     if (current->sps_range_extension_flag)
@@ -872,7 +934,7 @@ static int FUNC(pps_range_extension)(CodedBitstreamContext *ctx, RWContext *rw,
     int err, i;
 
     if (current->transform_skip_enabled_flag)
-        ue(log2_max_transform_skip_block_size_minus2, 0, 4);
+        ue(log2_max_transform_skip_block_size_minus2, 0, 3);
     flag(cross_component_prediction_enabled_flag);
 
     flag(chroma_qp_offset_list_enabled_flag);
@@ -881,8 +943,8 @@ static int FUNC(pps_range_extension)(CodedBitstreamContext *ctx, RWContext *rw,
            0, sps->log2_diff_max_min_luma_coding_block_size);
         ue(chroma_qp_offset_list_len_minus1, 0, 5);
         for (i = 0; i <= current->chroma_qp_offset_list_len_minus1; i++) {
-            se(cb_qp_offset_list[i], -12, +12);
-            se(cr_qp_offset_list[i], -12, +12);
+            ses(cb_qp_offset_list[i], -12, +12, 1, i);
+            ses(cr_qp_offset_list[i], -12, +12, 1, i);
         }
     }
 
@@ -924,8 +986,8 @@ static int FUNC(pps_scc_extension)(CodedBitstreamContext *ctx, RWContext *rw,
                 int bit_depth = comp == 0 ? current->luma_bit_depth_entry_minus8 + 8
                                           : current->chroma_bit_depth_entry_minus8 + 8;
                 for (i = 0; i < current->pps_num_palette_predictor_initializer; i++)
-                    u(bit_depth, pps_palette_predictor_initializers[comp][i],
-                      0, (1 << bit_depth) - 1);
+                    us(bit_depth, pps_palette_predictor_initializers[comp][i],
+                       0, MAX_UINT_BITS(bit_depth), 2, comp, i);
             }
         }
     }
@@ -991,9 +1053,9 @@ static int FUNC(pps)(CodedBitstreamContext *ctx, RWContext *rw,
         flag(uniform_spacing_flag);
         if (!current->uniform_spacing_flag) {
             for (i = 0; i < current->num_tile_columns_minus1; i++)
-                ue(column_width_minus1[i], 0, sps->pic_width_in_luma_samples);
+                ues(column_width_minus1[i], 0, sps->pic_width_in_luma_samples,  1, i);
             for (i = 0; i < current->num_tile_rows_minus1; i++)
-                ue(row_height_minus1[i],   0, sps->pic_height_in_luma_samples);
+                ues(row_height_minus1[i],   0, sps->pic_height_in_luma_samples, 1, i);
         }
         flag(loop_filter_across_tiles_enabled_flag);
     } else {
@@ -1038,7 +1100,7 @@ static int FUNC(pps)(CodedBitstreamContext *ctx, RWContext *rw,
         flag(pps_multilayer_extension_flag);
         flag(pps_3d_extension_flag);
         flag(pps_scc_extension_flag);
-        u(4, pps_extension_4bits, 0, (1 << 4) - 1);
+        u(4, pps_extension_4bits, 0, MAX_UINT_BITS(4));
     }
     if (current->pps_range_extension_flag)
         CHECK(FUNC(pps_range_extension)(ctx, rw, current));
@@ -1084,14 +1146,14 @@ static int FUNC(ref_pic_lists_modification)(CodedBitstreamContext *ctx, RWContex
     flag(ref_pic_list_modification_flag_l0);
     if (current->ref_pic_list_modification_flag_l0) {
         for (i = 0; i <= current->num_ref_idx_l0_active_minus1; i++)
-            u(entry_size, list_entry_l0[i], 0, num_pic_total_curr - 1);
+            us(entry_size, list_entry_l0[i], 0, num_pic_total_curr - 1, 1, i);
     }
 
     if (current->slice_type == HEVC_SLICE_B) {
         flag(ref_pic_list_modification_flag_l1);
         if (current->ref_pic_list_modification_flag_l1) {
             for (i = 0; i <= current->num_ref_idx_l1_active_minus1; i++)
-                u(entry_size, list_entry_l1[i], 0, num_pic_total_curr - 1);
+                us(entry_size, list_entry_l1[i], 0, num_pic_total_curr - 1, 1, i);
         }
     }
 
@@ -1115,14 +1177,14 @@ static int FUNC(pred_weight_table)(CodedBitstreamContext *ctx, RWContext *rw,
 
     for (i = 0; i <= current->num_ref_idx_l0_active_minus1; i++) {
         if (1 /* is not same POC and same layer_id */)
-            flag(luma_weight_l0_flag[i]);
+            flags(luma_weight_l0_flag[i], 1, i);
         else
             infer(luma_weight_l0_flag[i], 0);
     }
     if (chroma) {
         for (i = 0; i <= current->num_ref_idx_l0_active_minus1; i++) {
             if (1 /* is not same POC and same layer_id */)
-                flag(chroma_weight_l0_flag[i]);
+                flags(chroma_weight_l0_flag[i], 1, i);
             else
                 infer(chroma_weight_l0_flag[i], 0);
         }
@@ -1130,20 +1192,20 @@ static int FUNC(pred_weight_table)(CodedBitstreamContext *ctx, RWContext *rw,
 
     for (i = 0; i <= current->num_ref_idx_l0_active_minus1; i++) {
         if (current->luma_weight_l0_flag[i]) {
-            se(delta_luma_weight_l0[i], -128, +127);
-            se(luma_offset_l0[i],
-               -(1 << (sps->bit_depth_luma_minus8 + 8 - 1)),
-               ((1 << (sps->bit_depth_luma_minus8 + 8 - 1)) - 1));
+            ses(delta_luma_weight_l0[i], -128, +127, 1, i);
+            ses(luma_offset_l0[i],
+                -(1 << (sps->bit_depth_luma_minus8 + 8 - 1)),
+                ((1 << (sps->bit_depth_luma_minus8 + 8 - 1)) - 1), 1, i);
         } else {
             infer(delta_luma_weight_l0[i], 0);
             infer(luma_offset_l0[i],       0);
         }
         if (current->chroma_weight_l0_flag[i]) {
             for (j = 0; j < 2; j++) {
-                se(delta_chroma_weight_l0[i][j], -128, +127);
-                se(chroma_offset_l0[i][j],
-                   -(4 << (sps->bit_depth_chroma_minus8 + 8 - 1)),
-                   ((4 << (sps->bit_depth_chroma_minus8 + 8 - 1)) - 1));
+                ses(delta_chroma_weight_l0[i][j], -128, +127, 2, i, j);
+                ses(chroma_offset_l0[i][j],
+                    -(4 << (sps->bit_depth_chroma_minus8 + 8 - 1)),
+                    ((4 << (sps->bit_depth_chroma_minus8 + 8 - 1)) - 1), 2, i, j);
             }
         } else {
             for (j = 0; j < 2; j++) {
@@ -1156,14 +1218,14 @@ static int FUNC(pred_weight_table)(CodedBitstreamContext *ctx, RWContext *rw,
     if (current->slice_type == HEVC_SLICE_B) {
         for (i = 0; i <= current->num_ref_idx_l1_active_minus1; i++) {
             if (1 /* RefPicList1[i] is not CurrPic, nor is it in a different layer */)
-                flag(luma_weight_l1_flag[i]);
+                flags(luma_weight_l1_flag[i], 1, i);
             else
                 infer(luma_weight_l1_flag[i], 0);
         }
         if (chroma) {
             for (i = 0; i <= current->num_ref_idx_l1_active_minus1; i++) {
                 if (1 /* RefPicList1[i] is not CurrPic, nor is it in a different layer */)
-                    flag(chroma_weight_l1_flag[i]);
+                    flags(chroma_weight_l1_flag[i], 1, i);
                 else
                     infer(chroma_weight_l1_flag[i], 0);
             }
@@ -1171,20 +1233,20 @@ static int FUNC(pred_weight_table)(CodedBitstreamContext *ctx, RWContext *rw,
 
         for (i = 0; i <= current->num_ref_idx_l1_active_minus1; i++) {
             if (current->luma_weight_l1_flag[i]) {
-                se(delta_luma_weight_l1[i], -128, +127);
-                se(luma_offset_l1[i],
-                   -(1 << (sps->bit_depth_luma_minus8 + 8 - 1)),
-                   ((1 << (sps->bit_depth_luma_minus8 + 8 - 1)) - 1));
+                ses(delta_luma_weight_l1[i], -128, +127, 1, i);
+                ses(luma_offset_l1[i],
+                    -(1 << (sps->bit_depth_luma_minus8 + 8 - 1)),
+                    ((1 << (sps->bit_depth_luma_minus8 + 8 - 1)) - 1), 1, i);
             } else {
                 infer(delta_luma_weight_l1[i], 0);
                 infer(luma_offset_l1[i],       0);
             }
             if (current->chroma_weight_l1_flag[i]) {
                 for (j = 0; j < 2; j++) {
-                    se(delta_chroma_weight_l1[i][j], -128, +127);
-                    se(chroma_offset_l1[i][j],
-                       -(4 << (sps->bit_depth_chroma_minus8 + 8 - 1)),
-                       ((4 << (sps->bit_depth_chroma_minus8 + 8 - 1)) - 1));
+                    ses(delta_chroma_weight_l1[i][j], -128, +127, 2, i, j);
+                    ses(chroma_offset_l1[i][j],
+                        -(4 << (sps->bit_depth_chroma_minus8 + 8 - 1)),
+                        ((4 << (sps->bit_depth_chroma_minus8 + 8 - 1)) - 1), 2, i, j);
                 }
             } else {
                 for (j = 0; j < 2; j++) {
@@ -1259,7 +1321,7 @@ static int FUNC(slice_segment_header)(CodedBitstreamContext *ctx, RWContext *rw,
 
     if (!current->dependent_slice_segment_flag) {
         for (i = 0; i < pps->num_extra_slice_header_bits; i++)
-            flag(slice_reserved_flag[i]);
+            flags(slice_reserved_flag[i], 1, i);
 
         ue(slice_type, 0, 2);
 
@@ -1274,7 +1336,7 @@ static int FUNC(slice_segment_header)(CodedBitstreamContext *ctx, RWContext *rw,
             const H265RawSTRefPicSet *rps;
 
             u(sps->log2_max_pic_order_cnt_lsb_minus4 + 4, slice_pic_order_cnt_lsb,
-              0, (1 << (sps->log2_max_pic_order_cnt_lsb_minus4 + 4)) - 1);
+              0, MAX_UINT_BITS(sps->log2_max_pic_order_cnt_lsb_minus4 + 4));
 
             flag(short_term_ref_pic_set_sps_flag);
             if (!current->short_term_ref_pic_set_sps_flag) {
@@ -1315,20 +1377,20 @@ static int FUNC(slice_segment_header)(CodedBitstreamContext *ctx, RWContext *rw,
                                 current->num_long_term_pics; i++) {
                     if (i < current->num_long_term_sps) {
                         if (sps->num_long_term_ref_pics_sps > 1)
-                            u(idx_size, lt_idx_sps[i],
-                              0, sps->num_long_term_ref_pics_sps - 1);
+                            us(idx_size, lt_idx_sps[i],
+                               0, sps->num_long_term_ref_pics_sps - 1, 1, i);
                         if (sps->used_by_curr_pic_lt_sps_flag[current->lt_idx_sps[i]])
                             ++num_pic_total_curr;
                     } else {
-                        u(sps->log2_max_pic_order_cnt_lsb_minus4 + 4, poc_lsb_lt[i],
-                          0, (1 << (sps->log2_max_pic_order_cnt_lsb_minus4 + 4)) - 1);
-                        flag(used_by_curr_pic_lt_flag[i]);
+                        us(sps->log2_max_pic_order_cnt_lsb_minus4 + 4, poc_lsb_lt[i],
+                           0, MAX_UINT_BITS(sps->log2_max_pic_order_cnt_lsb_minus4 + 4), 1, i);
+                        flags(used_by_curr_pic_lt_flag[i], 1, i);
                         if (current->used_by_curr_pic_lt_flag[i])
                             ++num_pic_total_curr;
                     }
-                    flag(delta_poc_msb_present_flag[i]);
+                    flags(delta_poc_msb_present_flag[i], 1, i);
                     if (current->delta_poc_msb_present_flag[i])
-                        ue(delta_poc_msb_cycle_lt[i], 0, UINT32_MAX - 1);
+                        ues(delta_poc_msb_cycle_lt[i], 0, UINT32_MAX - 1, 1, i);
                     else
                         infer(delta_poc_msb_cycle_lt[i], 0);
                 }
@@ -1486,18 +1548,673 @@ static int FUNC(slice_segment_header)(CodedBitstreamContext *ctx, RWContext *rw,
         if (current->num_entry_point_offsets > 0) {
             ue(offset_len_minus1, 0, 31);
             for (i = 0; i < current->num_entry_point_offsets; i++)
-                u(current->offset_len_minus1 + 1, entry_point_offset_minus1[i],
-                  0, (1 << (current->offset_len_minus1 + 1)) - 1);
+                us(current->offset_len_minus1 + 1, entry_point_offset_minus1[i],
+                   0, MAX_UINT_BITS(current->offset_len_minus1 + 1), 1, i);
         }
     }
 
     if (pps->slice_segment_header_extension_present_flag) {
         ue(slice_segment_header_extension_length, 0, 256);
         for (i = 0; i < current->slice_segment_header_extension_length; i++)
-            u(8, slice_segment_header_extension_data_byte[i], 0x00, 0xff);
+            us(8, slice_segment_header_extension_data_byte[i], 0x00, 0xff, 1, i);
     }
 
     CHECK(FUNC(byte_alignment)(ctx, rw));
 
     return 0;
 }
+
+static int FUNC(sei_buffering_period)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      H265RawSEIBufferingPeriod *current,
+                                      uint32_t *payload_size)
+{
+    CodedBitstreamH265Context *h265 = ctx->priv_data;
+    const H265RawSPS *sps;
+    const H265RawHRDParameters *hrd;
+    int err, i, length;
+
+#ifdef READ
+    int start_pos, end_pos, bits_left;
+    start_pos = get_bits_count(rw);
+#endif
+
+    HEADER("Buffering Period");
+
+    ue(bp_seq_parameter_set_id, 0, HEVC_MAX_SPS_COUNT - 1);
+
+    sps = h265->sps[current->bp_seq_parameter_set_id];
+    if (!sps) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "SPS id %d not available.\n",
+               current->bp_seq_parameter_set_id);
+        return AVERROR_INVALIDDATA;
+    }
+    h265->active_sps = sps;
+
+    if (!sps->vui_parameters_present_flag ||
+        !sps->vui.vui_hrd_parameters_present_flag) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Buffering period SEI requires "
+               "HRD parameters to be present in SPS.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    hrd = &sps->vui.hrd_parameters;
+    if (!hrd->nal_hrd_parameters_present_flag &&
+        !hrd->vcl_hrd_parameters_present_flag) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Buffering period SEI requires "
+               "NAL or VCL HRD parameters to be present.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!hrd->sub_pic_hrd_params_present_flag)
+        flag(irap_cpb_params_present_flag);
+    else
+        infer(irap_cpb_params_present_flag, 0);
+    if (current->irap_cpb_params_present_flag) {
+        length = hrd->au_cpb_removal_delay_length_minus1 + 1;
+        u(length, cpb_delay_offset, 0, MAX_UINT_BITS(length));
+        length = hrd->dpb_output_delay_length_minus1 + 1;
+        u(length, dpb_delay_offset, 0, MAX_UINT_BITS(length));
+    } else {
+        infer(cpb_delay_offset, 0);
+        infer(dpb_delay_offset, 0);
+    }
+
+    flag(concatenation_flag);
+
+    length = hrd->au_cpb_removal_delay_length_minus1 + 1;
+    u(length, au_cpb_removal_delay_delta_minus1, 0, MAX_UINT_BITS(length));
+
+    if (hrd->nal_hrd_parameters_present_flag) {
+        for (i = 0; i <= hrd->cpb_cnt_minus1[0]; i++) {
+            length = hrd->initial_cpb_removal_delay_length_minus1 + 1;
+
+            us(length, nal_initial_cpb_removal_delay[i],
+               0, MAX_UINT_BITS(length), 1, i);
+            us(length, nal_initial_cpb_removal_offset[i],
+               0, MAX_UINT_BITS(length), 1, i);
+
+            if (hrd->sub_pic_hrd_params_present_flag ||
+                current->irap_cpb_params_present_flag) {
+                us(length, nal_initial_alt_cpb_removal_delay[i],
+                   0, MAX_UINT_BITS(length), 1, i);
+                us(length, nal_initial_alt_cpb_removal_offset[i],
+                   0, MAX_UINT_BITS(length), 1, i);
+            }
+        }
+    }
+    if (hrd->vcl_hrd_parameters_present_flag) {
+        for (i = 0; i <= hrd->cpb_cnt_minus1[0]; i++) {
+            length = hrd->initial_cpb_removal_delay_length_minus1 + 1;
+
+            us(length, vcl_initial_cpb_removal_delay[i],
+               0, MAX_UINT_BITS(length), 1, i);
+            us(length, vcl_initial_cpb_removal_offset[i],
+               0, MAX_UINT_BITS(length), 1, i);
+
+            if (hrd->sub_pic_hrd_params_present_flag ||
+                current->irap_cpb_params_present_flag) {
+                us(length, vcl_initial_alt_cpb_removal_delay[i],
+                   0, MAX_UINT_BITS(length), 1, i);
+                us(length, vcl_initial_alt_cpb_removal_offset[i],
+                   0, MAX_UINT_BITS(length), 1, i);
+            }
+        }
+    }
+
+#ifdef READ
+    // payload_extension_present() - true if we are before the last 1-bit
+    // in the payload structure, which must be in the last byte.
+    end_pos = get_bits_count(rw);
+    bits_left = *payload_size * 8 - (end_pos - start_pos);
+    if (bits_left > 0 &&
+        (bits_left > 7 || ff_ctz(show_bits(rw, bits_left)) < bits_left - 1))
+        flag(use_alt_cpb_params_flag);
+    else
+        infer(use_alt_cpb_params_flag, 0);
+#else
+    if (current->use_alt_cpb_params_flag)
+        flag(use_alt_cpb_params_flag);
+#endif
+
+    return 0;
+}
+
+static int FUNC(sei_pic_timing)(CodedBitstreamContext *ctx, RWContext *rw,
+                                H265RawSEIPicTiming *current)
+{
+    CodedBitstreamH265Context *h265 = ctx->priv_data;
+    const H265RawSPS *sps;
+    const H265RawHRDParameters *hrd;
+    int err, expected_source_scan_type, i, length;
+
+    HEADER("Picture Timing");
+
+    sps = h265->active_sps;
+    if (!sps) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR,
+               "No active SPS for pic_timing.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    expected_source_scan_type = 2 -
+        2 * sps->profile_tier_level.general_interlaced_source_flag -
+        sps->profile_tier_level.general_progressive_source_flag;
+
+    if (sps->vui.frame_field_info_present_flag) {
+        u(4, pic_struct, 0, 12);
+        u(2, source_scan_type,
+          expected_source_scan_type >= 0 ? expected_source_scan_type : 0,
+          expected_source_scan_type >= 0 ? expected_source_scan_type : 2);
+        flag(duplicate_flag);
+    } else {
+        infer(pic_struct, 0);
+        infer(source_scan_type,
+              expected_source_scan_type >= 0 ? expected_source_scan_type : 2);
+        infer(duplicate_flag, 0);
+    }
+
+    if (sps->vui_parameters_present_flag &&
+        sps->vui.vui_hrd_parameters_present_flag)
+        hrd = &sps->vui.hrd_parameters;
+    else
+        hrd = NULL;
+    if (hrd && (hrd->nal_hrd_parameters_present_flag ||
+                hrd->vcl_hrd_parameters_present_flag)) {
+        length = hrd->au_cpb_removal_delay_length_minus1 + 1;
+        u(length, au_cpb_removal_delay_minus1, 0, MAX_UINT_BITS(length));
+
+        length = hrd->dpb_output_delay_length_minus1 + 1;
+        u(length, pic_dpb_output_delay, 0, MAX_UINT_BITS(length));
+
+        if (hrd->sub_pic_hrd_params_present_flag) {
+            length = hrd->dpb_output_delay_du_length_minus1 + 1;
+            u(length, pic_dpb_output_du_delay, 0, MAX_UINT_BITS(length));
+        }
+
+        if (hrd->sub_pic_hrd_params_present_flag &&
+            hrd->sub_pic_cpb_params_in_pic_timing_sei_flag) {
+            // Each decoding unit must contain at least one slice segment.
+            ue(num_decoding_units_minus1, 0, HEVC_MAX_SLICE_SEGMENTS);
+            flag(du_common_cpb_removal_delay_flag);
+
+            length = hrd->du_cpb_removal_delay_increment_length_minus1 + 1;
+            if (current->du_common_cpb_removal_delay_flag)
+                u(length, du_common_cpb_removal_delay_increment_minus1,
+                  0, MAX_UINT_BITS(length));
+
+            for (i = 0; i <= current->num_decoding_units_minus1; i++) {
+                ues(num_nalus_in_du_minus1[i],
+                    0, HEVC_MAX_SLICE_SEGMENTS, 1, i);
+                if (!current->du_common_cpb_removal_delay_flag &&
+                    i < current->num_decoding_units_minus1)
+                    us(length, du_cpb_removal_delay_increment_minus1[i],
+                      0, MAX_UINT_BITS(length), 1, i);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_pan_scan_rect)(CodedBitstreamContext *ctx, RWContext *rw,
+                                   H265RawSEIPanScanRect *current)
+{
+    int err, i;
+
+    HEADER("Pan-Scan Rectangle");
+
+    ue(pan_scan_rect_id, 0, UINT32_MAX - 1);
+    flag(pan_scan_rect_cancel_flag);
+
+    if (!current->pan_scan_rect_cancel_flag) {
+        ue(pan_scan_cnt_minus1, 0, 2);
+
+        for (i = 0; i <= current->pan_scan_cnt_minus1; i++) {
+            ses(pan_scan_rect_left_offset[i],   INT32_MIN + 1, INT32_MAX, 1, i);
+            ses(pan_scan_rect_right_offset[i],  INT32_MIN + 1, INT32_MAX, 1, i);
+            ses(pan_scan_rect_top_offset[i],    INT32_MIN + 1, INT32_MAX, 1, i);
+            ses(pan_scan_rect_bottom_offset[i], INT32_MIN + 1, INT32_MAX, 1, i);
+        }
+
+        flag(pan_scan_rect_persistence_flag);
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_user_data_registered)(CodedBitstreamContext *ctx, RWContext *rw,
+                                          H265RawSEIUserDataRegistered *current,
+                                          uint32_t *payload_size)
+{
+    int err, i, j;
+
+    HEADER("User Data Registered ITU-T T.35");
+
+    u(8, itu_t_t35_country_code, 0x00, 0xff);
+    if (current->itu_t_t35_country_code != 0xff)
+        i = 1;
+    else {
+        u(8, itu_t_t35_country_code_extension_byte, 0x00, 0xff);
+        i = 2;
+    }
+
+#ifdef READ
+    if (*payload_size < i) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR,
+               "Invalid SEI user data registered payload.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    current->data_length = *payload_size - i;
+#else
+    *payload_size = i + current->data_length;
+#endif
+
+    allocate(current->data, current->data_length);
+    for (j = 0; j < current->data_length; j++)
+        xu(8, itu_t_t35_payload_byte[i], current->data[j], 0x00, 0xff, 1, i + j);
+
+    return 0;
+}
+
+static int FUNC(sei_user_data_unregistered)(CodedBitstreamContext *ctx, RWContext *rw,
+                                            H265RawSEIUserDataUnregistered *current,
+                                            uint32_t *payload_size)
+{
+    int err, i;
+
+    HEADER("User Data Unregistered");
+
+#ifdef READ
+    if (*payload_size < 16) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR,
+               "Invalid SEI user data unregistered payload.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    current->data_length = *payload_size - 16;
+#else
+    *payload_size = 16 + current->data_length;
+#endif
+
+    for (i = 0; i < 16; i++)
+        us(8, uuid_iso_iec_11578[i], 0x00, 0xff, 1, i);
+
+    allocate(current->data, current->data_length);
+
+    for (i = 0; i < current->data_length; i++)
+        xu(8, user_data_payload_byte[i], current->data[i], 0x00, 0xff, 1, i);
+
+    return 0;
+}
+
+static int FUNC(sei_recovery_point)(CodedBitstreamContext *ctx, RWContext *rw,
+                                    H265RawSEIRecoveryPoint *current)
+{
+    int err;
+
+    HEADER("Recovery Point");
+
+    se(recovery_poc_cnt, -32768, 32767);
+
+    flag(exact_match_flag);
+    flag(broken_link_flag);
+
+    return 0;
+}
+
+static int FUNC(sei_display_orientation)(CodedBitstreamContext *ctx, RWContext *rw,
+                                         H265RawSEIDisplayOrientation *current)
+{
+    int err;
+
+    HEADER("Display Orientation");
+
+    flag(display_orientation_cancel_flag);
+    if (!current->display_orientation_cancel_flag) {
+        flag(hor_flip);
+        flag(ver_flip);
+        u(16, anticlockwise_rotation, 0, 65535);
+        flag(display_orientation_persistence_flag);
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_active_parameter_sets)(CodedBitstreamContext *ctx, RWContext *rw,
+                                           H265RawSEIActiveParameterSets *current)
+{
+    CodedBitstreamH265Context *h265 = ctx->priv_data;
+    const H265RawVPS *vps;
+    int err, i;
+
+    HEADER("Active Parameter Sets");
+
+    u(4, active_video_parameter_set_id, 0, HEVC_MAX_VPS_COUNT);
+    vps = h265->vps[current->active_video_parameter_set_id];
+    if (!vps) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "VPS id %d not available for active "
+               "parameter sets.\n", current->active_video_parameter_set_id);
+        return AVERROR_INVALIDDATA;
+    }
+    h265->active_vps = vps;
+
+    flag(self_contained_cvs_flag);
+    flag(no_parameter_set_update_flag);
+
+    ue(num_sps_ids_minus1, 0, HEVC_MAX_SPS_COUNT - 1);
+    for (i = 0; i <= current->num_sps_ids_minus1; i++)
+        ues(active_seq_parameter_set_id[i], 0, HEVC_MAX_SPS_COUNT - 1, 1, i);
+
+    for (i = vps->vps_base_layer_internal_flag;
+         i <= FFMIN(62, vps->vps_max_layers_minus1); i++) {
+        ues(layer_sps_idx[i], 0, current->num_sps_ids_minus1, 1, i);
+
+        if (i == 0)
+            h265->active_sps = h265->sps[current->active_seq_parameter_set_id[current->layer_sps_idx[0]]];
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_decoded_picture_hash)(CodedBitstreamContext *ctx, RWContext *rw,
+                                          H265RawSEIDecodedPictureHash *current)
+{
+    CodedBitstreamH265Context *h265 = ctx->priv_data;
+    const H265RawSPS *sps = h265->active_sps;
+    int err, c, i;
+
+    HEADER("Decoded Picture Hash");
+
+    if (!sps) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR,
+               "No active SPS for decoded picture hash.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    u(8, hash_type, 0, 2);
+
+    for (c = 0; c < (sps->chroma_format_idc == 0 ? 1 : 3); c++) {
+        if (current->hash_type == 0) {
+            for (i = 0; i < 16; i++)
+                us(8, picture_md5[c][i], 0x00, 0xff, 2, c, i);
+        } else if (current->hash_type == 1) {
+            us(16, picture_crc[c], 0x0000, 0xffff, 1, c);
+        } else if (current->hash_type == 2) {
+            us(32, picture_checksum[c], 0x00000000, 0xffffffff, 1, c);
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_time_code)(CodedBitstreamContext *ctx, RWContext *rw,
+                               H265RawSEITimeCode *current)
+{
+    int err, i;
+
+    HEADER("Time Code");
+
+    u(2, num_clock_ts, 1, 3);
+
+    for (i = 0; i < current->num_clock_ts; i++) {
+        flags(clock_timestamp_flag[i],   1, i);
+
+        if (current->clock_timestamp_flag[i]) {
+            flags(units_field_based_flag[i], 1, i);
+            us(5, counting_type[i], 0, 6,    1, i);
+            flags(full_timestamp_flag[i],    1, i);
+            flags(discontinuity_flag[i],     1, i);
+            flags(cnt_dropped_flag[i],       1, i);
+
+            us(9, n_frames[i], 0, MAX_UINT_BITS(9), 1, i);
+
+            if (current->full_timestamp_flag[i]) {
+                us(6, seconds_value[i], 0, 59, 1, i);
+                us(6, minutes_value[i], 0, 59, 1, i);
+                us(5, hours_value[i],   0, 23, 1, i);
+            } else {
+                flags(seconds_flag[i], 1, i);
+                if (current->seconds_flag[i]) {
+                    us(6, seconds_value[i], 0, 59, 1, i);
+                    flags(minutes_flag[i], 1, i);
+                    if (current->minutes_flag[i]) {
+                        us(6, minutes_value[i], 0, 59, 1, i);
+                        flags(hours_flag[i], 1, i);
+                        if (current->hours_flag[i])
+                            us(5, hours_value[i], 0, 23, 1, i);
+                    }
+                }
+            }
+
+            us(5, time_offset_length[i], 0, 31, 1, i);
+            if (current->time_offset_length[i] > 0)
+                us(current->time_offset_length[i], time_offset_value[i],
+                   0, MAX_UINT_BITS(current->time_offset_length[i]), 1, i);
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(sei_mastering_display)(CodedBitstreamContext *ctx, RWContext *rw,
+                                       H265RawSEIMasteringDisplayColourVolume *current)
+{
+    int err, c;
+
+    HEADER("Mastering Display Colour Volume");
+
+    for (c = 0; c < 3; c++) {
+        us(16, display_primaries_x[c], 0, 50000, 1, c);
+        us(16, display_primaries_y[c], 0, 50000, 1, c);
+    }
+
+    u(16, white_point_x, 0, 50000);
+    u(16, white_point_y, 0, 50000);
+
+    u(32, max_display_mastering_luminance,
+      1, MAX_UINT_BITS(32));
+    u(32, min_display_mastering_luminance,
+      0, current->max_display_mastering_luminance - 1);
+
+    return 0;
+}
+
+static int FUNC(sei_content_light_level)(CodedBitstreamContext *ctx, RWContext *rw,
+                                         H265RawSEIContentLightLevelInfo *current)
+{
+    int err;
+
+    HEADER("Content Light Level");
+
+    u(16, max_content_light_level, 0, MAX_UINT_BITS(16));
+    u(16, max_pic_average_light_level, 0, MAX_UINT_BITS(16));
+
+    return 0;
+}
+
+static int FUNC(sei_alternative_transfer_characteristics)(CodedBitstreamContext *ctx,
+                                                          RWContext *rw,
+                                                          H265RawSEIAlternativeTransferCharacteristics *current)
+{
+    int err;
+
+    HEADER("Alternative Transfer Characteristics");
+
+    u(8, preferred_transfer_characteristics, 0, 255);
+
+    return 0;
+}
+
+static int FUNC(sei_payload)(CodedBitstreamContext *ctx, RWContext *rw,
+                             H265RawSEIPayload *current, int prefix)
+{
+    int err, i;
+    int start_position, end_position;
+
+#ifdef READ
+    start_position = get_bits_count(rw);
+#else
+    start_position = put_bits_count(rw);
+#endif
+
+    switch (current->payload_type) {
+#define SEI_TYPE_CHECK_VALID(name, prefix_valid, suffix_valid) do { \
+            if (prefix && !prefix_valid) { \
+                av_log(ctx->log_ctx, AV_LOG_ERROR, "SEI type %s invalid " \
+                       "as prefix SEI!\n", #name); \
+                return AVERROR_INVALIDDATA; \
+            } \
+            if (!prefix && !suffix_valid) { \
+                av_log(ctx->log_ctx, AV_LOG_ERROR, "SEI type %s invalid " \
+                       "as suffix SEI!\n", #name); \
+                return AVERROR_INVALIDDATA; \
+            } \
+        } while (0)
+#define SEI_TYPE_N(type, prefix_valid, suffix_valid, name) \
+    case HEVC_SEI_TYPE_ ## type: \
+        SEI_TYPE_CHECK_VALID(name, prefix_valid, suffix_valid); \
+        CHECK(FUNC(sei_ ## name)(ctx, rw, &current->payload.name)); \
+        break
+#define SEI_TYPE_S(type, prefix_valid, suffix_valid, name) \
+    case HEVC_SEI_TYPE_ ## type: \
+        SEI_TYPE_CHECK_VALID(name, prefix_valid, suffix_valid); \
+        CHECK(FUNC(sei_ ## name)(ctx, rw, &current->payload.name, \
+                                 &current->payload_size)); \
+        break
+
+        SEI_TYPE_S(BUFFERING_PERIOD,         1, 0, buffering_period);
+        SEI_TYPE_N(PICTURE_TIMING,           1, 0, pic_timing);
+        SEI_TYPE_N(PAN_SCAN_RECT,            1, 0, pan_scan_rect);
+        SEI_TYPE_S(USER_DATA_REGISTERED_ITU_T_T35,
+                                             1, 1, user_data_registered);
+        SEI_TYPE_S(USER_DATA_UNREGISTERED,   1, 1, user_data_unregistered);
+        SEI_TYPE_N(RECOVERY_POINT,           1, 0, recovery_point);
+        SEI_TYPE_N(DISPLAY_ORIENTATION,      1, 0, display_orientation);
+        SEI_TYPE_N(ACTIVE_PARAMETER_SETS,    1, 0, active_parameter_sets);
+        SEI_TYPE_N(DECODED_PICTURE_HASH,     0, 1, decoded_picture_hash);
+        SEI_TYPE_N(TIME_CODE,                1, 0, time_code);
+        SEI_TYPE_N(MASTERING_DISPLAY_INFO,   1, 0, mastering_display);
+        SEI_TYPE_N(CONTENT_LIGHT_LEVEL_INFO, 1, 0, content_light_level);
+        SEI_TYPE_N(ALTERNATIVE_TRANSFER_CHARACTERISTICS,
+                                             1, 0, alternative_transfer_characteristics);
+
+#undef SEI_TYPE
+    default:
+        {
+#ifdef READ
+            current->payload.other.data_length = current->payload_size;
+#endif
+            allocate(current->payload.other.data, current->payload.other.data_length);
+
+            for (i = 0; i < current->payload_size; i++)
+                xu(8, payload_byte[i], current->payload.other.data[i], 0, 255,
+                   1, i);
+        }
+    }
+
+    if (byte_alignment(rw)) {
+        fixed(1, bit_equal_to_one, 1);
+        while (byte_alignment(rw))
+            fixed(1, bit_equal_to_zero, 0);
+    }
+
+#ifdef READ
+    end_position = get_bits_count(rw);
+    if (end_position < start_position + 8 * current->payload_size) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Incorrect SEI payload length: "
+               "header %"PRIu32" bits, actually %d bits.\n",
+               8 * current->payload_size,
+               end_position - start_position);
+        return AVERROR_INVALIDDATA;
+    }
+#else
+    end_position = put_bits_count(rw);
+    current->payload_size = (end_position - start_position) >> 3;
+#endif
+
+    return 0;
+}
+
+static int FUNC(sei)(CodedBitstreamContext *ctx, RWContext *rw,
+                     H265RawSEI *current, int prefix)
+{
+    int err, k;
+
+    if (prefix)
+        HEADER("Prefix Supplemental Enhancement Information");
+    else
+        HEADER("Suffix Supplemental Enhancement Information");
+
+    CHECK(FUNC(nal_unit_header)(ctx, rw, &current->nal_unit_header,
+                                prefix ? HEVC_NAL_SEI_PREFIX
+                                       : HEVC_NAL_SEI_SUFFIX));
+
+#ifdef READ
+    for (k = 0; k < H265_MAX_SEI_PAYLOADS; k++) {
+        uint32_t payload_type = 0;
+        uint32_t payload_size = 0;
+        uint32_t tmp;
+
+        while (show_bits(rw, 8) == 0xff) {
+            fixed(8, ff_byte, 0xff);
+            payload_type += 255;
+        }
+        xu(8, last_payload_type_byte, tmp, 0, 254, 0);
+        payload_type += tmp;
+
+        while (show_bits(rw, 8) == 0xff) {
+            fixed(8, ff_byte, 0xff);
+            payload_size += 255;
+        }
+        xu(8, last_payload_size_byte, tmp, 0, 254, 0);
+        payload_size += tmp;
+
+        current->payload[k].payload_type = payload_type;
+        current->payload[k].payload_size = payload_size;
+
+        CHECK(FUNC(sei_payload)(ctx, rw, &current->payload[k], prefix));
+
+        if (!cbs_h2645_read_more_rbsp_data(rw))
+            break;
+    }
+    if (k >= H265_MAX_SEI_PAYLOADS) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Too many payloads in "
+               "SEI message: found %d.\n", k);
+        return AVERROR_INVALIDDATA;
+    }
+    current->payload_count = k + 1;
+#else
+    for (k = 0; k < current->payload_count; k++) {
+        PutBitContext start_state;
+        uint32_t tmp;
+        int need_size, i;
+
+        // Somewhat clumsy: we write the payload twice when
+        // we don't know the size in advance.  This will mess
+        // with trace output, but is otherwise harmless.
+        start_state = *rw;
+        need_size = !current->payload[k].payload_size;
+        for (i = 0; i < 1 + need_size; i++) {
+            *rw = start_state;
+
+            tmp = current->payload[k].payload_type;
+            while (tmp >= 255) {
+                fixed(8, ff_byte, 0xff);
+                tmp -= 255;
+            }
+            xu(8, last_payload_type_byte, tmp, 0, 254, 0);
+
+            tmp = current->payload[k].payload_size;
+            while (tmp >= 255) {
+                fixed(8, ff_byte, 0xff);
+                tmp -= 255;
+            }
+            xu(8, last_payload_size_byte, tmp, 0, 254, 0);
+
+            CHECK(FUNC(sei_payload)(ctx, rw, &current->payload[k], prefix));
+        }
+    }
+#endif
+
+    CHECK(FUNC(rbsp_trailing_bits)(ctx, rw));
+
+    return 0;
+}
diff --git a/libavcodec/cbs_internal.h b/libavcodec/cbs_internal.h
index 4c6f421..53f2e5d 100644
--- a/libavcodec/cbs_internal.h
+++ b/libavcodec/cbs_internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,8 +20,8 @@
 #define AVCODEC_CBS_INTERNAL_H
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "cbs.h"
+#include "get_bits.h"
 #include "put_bits.h"
 
 
@@ -63,26 +63,35 @@ typedef struct CodedBitstreamType {
 void ff_cbs_trace_header(CodedBitstreamContext *ctx,
                          const char *name);
 
-void ff_cbs_trace_syntax_element(CodedBitstreamContext *ctx,
-                                 int position, const char *name,
+void ff_cbs_trace_syntax_element(CodedBitstreamContext *ctx, int position,
+                                 const char *name, const int *subscripts,
                                  const char *bitstring, int64_t value);
 
 
 // Helper functions for read/write of common bitstream elements, including
 // generation of trace output.
 
-int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, BitstreamContext *bc,
-                         int width, const char *name, uint32_t *write_to,
+int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                         int width, const char *name,
+                         const int *subscripts, uint32_t *write_to,
                          uint32_t range_min, uint32_t range_max);
 
 int ff_cbs_write_unsigned(CodedBitstreamContext *ctx, PutBitContext *pbc,
-                          int width, const char *name, uint32_t value,
+                          int width, const char *name,
+                          const int *subscripts, uint32_t value,
                           uint32_t range_min, uint32_t range_max);
 
+// The largest value representable in N bits, suitable for use as
+// range_max in the above functions.
+#define MAX_UINT_BITS(length) ((UINT64_C(1) << (length)) - 1)
 
+
+extern const CodedBitstreamType ff_cbs_type_av1;
 extern const CodedBitstreamType ff_cbs_type_h264;
 extern const CodedBitstreamType ff_cbs_type_h265;
+extern const CodedBitstreamType ff_cbs_type_jpeg;
 extern const CodedBitstreamType ff_cbs_type_mpeg2;
+extern const CodedBitstreamType ff_cbs_type_vp9;
 
 
 #endif /* AVCODEC_CBS_INTERNAL_H */
diff --git a/libavcodec/cbs_jpeg.c b/libavcodec/cbs_jpeg.c
new file mode 100644
index 0000000..5a72f0e
--- /dev/null
+++ b/libavcodec/cbs_jpeg.c
@@ -0,0 +1,520 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "cbs.h"
+#include "cbs_internal.h"
+#include "cbs_jpeg.h"
+
+
+#define HEADER(name) do { \
+        ff_cbs_trace_header(ctx, name); \
+    } while (0)
+
+#define CHECK(call) do { \
+        err = (call); \
+        if (err < 0) \
+            return err; \
+    } while (0)
+
+#define SUBSCRIPTS(subs, ...) (subs > 0 ? ((int[subs + 1]){ subs, __VA_ARGS__ }) : NULL)
+
+#define u(width, name, range_min, range_max) \
+    xu(width, name, range_min, range_max, 0)
+#define us(width, name, sub, range_min, range_max) \
+    xu(width, name, range_min, range_max, 1, sub)
+
+
+#define READ
+#define READWRITE read
+#define RWContext GetBitContext
+#define FUNC(name) cbs_jpeg_read_ ## name
+
+#define xu(width, name, range_min, range_max, subs, ...) do { \
+        uint32_t value = range_min; \
+        CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \
+                                   SUBSCRIPTS(subs, __VA_ARGS__), \
+                                   &value, range_min, range_max)); \
+        current->name = value; \
+    } while (0)
+
+#include "cbs_jpeg_syntax_template.c"
+
+#undef READ
+#undef READWRITE
+#undef RWContext
+#undef FUNC
+#undef xu
+
+#define WRITE
+#define READWRITE write
+#define RWContext PutBitContext
+#define FUNC(name) cbs_jpeg_write_ ## name
+
+#define xu(width, name, range_min, range_max, subs, ...) do { \
+        uint32_t value = current->name; \
+        CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \
+                                    SUBSCRIPTS(subs, __VA_ARGS__), \
+                                    value, range_min, range_max)); \
+    } while (0)
+
+
+#include "cbs_jpeg_syntax_template.c"
+
+#undef READ
+#undef READWRITE
+#undef RWContext
+#undef FUNC
+#undef xu
+
+
+static void cbs_jpeg_free_application_data(void *unit, uint8_t *content)
+{
+    JPEGRawApplicationData *ad = (JPEGRawApplicationData*)content;
+    av_buffer_unref(&ad->Ap_ref);
+    av_freep(&content);
+}
+
+static void cbs_jpeg_free_comment(void *unit, uint8_t *content)
+{
+    JPEGRawComment *comment = (JPEGRawComment*)content;
+    av_buffer_unref(&comment->Cm_ref);
+    av_freep(&content);
+}
+
+static void cbs_jpeg_free_scan(void *unit, uint8_t *content)
+{
+    JPEGRawScan *scan = (JPEGRawScan*)content;
+    av_buffer_unref(&scan->data_ref);
+    av_freep(&content);
+}
+
+static int cbs_jpeg_split_fragment(CodedBitstreamContext *ctx,
+                                   CodedBitstreamFragment *frag,
+                                   int header)
+{
+    AVBufferRef *data_ref;
+    uint8_t *data;
+    size_t data_size;
+    int unit, start, end, marker, next_start, next_marker;
+    int err, i, j, length;
+
+    if (frag->data_size < 4) {
+        // Definitely too short to be meaningful.
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i + 1 < frag->data_size && frag->data[i] != 0xff; i++);
+    if (i > 0) {
+        av_log(ctx->log_ctx, AV_LOG_WARNING, "Discarding %d bytes at "
+               "beginning of image.\n", i);
+    }
+    for (++i; i + 1 < frag->data_size && frag->data[i] == 0xff; i++);
+    if (i + 1 >= frag->data_size && frag->data[i]) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
+               "no SOI marker found.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    marker = frag->data[i];
+    if (marker != JPEG_MARKER_SOI) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: first "
+               "marker is %02x, should be SOI.\n", marker);
+        return AVERROR_INVALIDDATA;
+    }
+    for (++i; i + 1 < frag->data_size && frag->data[i] == 0xff; i++);
+    if (i + 1 >= frag->data_size) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
+               "no image content found.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    marker = frag->data[i];
+    start  = i + 1;
+
+    for (unit = 0;; unit++) {
+        if (marker == JPEG_MARKER_EOI) {
+            break;
+        } else if (marker == JPEG_MARKER_SOS) {
+            for (i = start; i + 1 < frag->data_size; i++) {
+                if (frag->data[i] != 0xff)
+                    continue;
+                end = i;
+                for (++i; i + 1 < frag->data_size &&
+                          frag->data[i] == 0xff; i++);
+                if (i + 1 >= frag->data_size) {
+                    next_marker = -1;
+                } else {
+                    if (frag->data[i] == 0x00)
+                        continue;
+                    next_marker = frag->data[i];
+                    next_start  = i + 1;
+                }
+                break;
+            }
+        } else {
+            i = start;
+            if (i + 2 > frag->data_size) {
+                av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
+                       "truncated at %02x marker.\n", marker);
+                return AVERROR_INVALIDDATA;
+            }
+            length = AV_RB16(frag->data + i);
+            if (i + length > frag->data_size) {
+                av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid JPEG image: "
+                       "truncated at %02x marker segment.\n", marker);
+                return AVERROR_INVALIDDATA;
+            }
+            end = start + length;
+
+            i = end;
+            if (frag->data[i] != 0xff) {
+                next_marker = -1;
+            } else {
+                for (++i; i + 1 < frag->data_size &&
+                          frag->data[i] == 0xff; i++);
+                if (i + 1 >= frag->data_size) {
+                    next_marker = -1;
+                } else {
+                    next_marker = frag->data[i];
+                    next_start  = i + 1;
+                }
+            }
+        }
+
+        if (marker == JPEG_MARKER_SOS) {
+            length = AV_RB16(frag->data + start);
+
+            data_ref = NULL;
+            data     = av_malloc(end - start +
+                                 AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!data)
+                return AVERROR(ENOMEM);
+
+            memcpy(data, frag->data + start, length);
+            for (i = start + length, j = length; i < end; i++, j++) {
+                if (frag->data[i] == 0xff) {
+                    while (frag->data[i] == 0xff)
+                        ++i;
+                    data[j] = 0xff;
+                } else {
+                    data[j] = frag->data[i];
+                }
+            }
+            data_size = j;
+
+            memset(data + data_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+        } else {
+            data      = frag->data + start;
+            data_size = end - start;
+            data_ref  = frag->data_ref;
+        }
+
+        err = ff_cbs_insert_unit_data(ctx, frag, unit, marker,
+                                      data, data_size, data_ref);
+        if (err < 0) {
+            if (!data_ref)
+                av_freep(&data);
+            return err;
+        }
+
+        if (next_marker == -1)
+            break;
+        marker = next_marker;
+        start  = next_start;
+    }
+
+    return 0;
+}
+
+static int cbs_jpeg_read_unit(CodedBitstreamContext *ctx,
+                              CodedBitstreamUnit *unit)
+{
+    GetBitContext gbc;
+    int err;
+
+    err = init_get_bits(&gbc, unit->data, 8 * unit->data_size);
+    if (err < 0)
+        return err;
+
+    if (unit->type >= JPEG_MARKER_SOF0 &&
+        unit->type <= JPEG_MARKER_SOF3) {
+        err = ff_cbs_alloc_unit_content(ctx, unit,
+                                        sizeof(JPEGRawFrameHeader),
+                                        NULL);
+        if (err < 0)
+            return err;
+
+        err = cbs_jpeg_read_frame_header(ctx, &gbc, unit->content);
+        if (err < 0)
+            return err;
+
+    } else if (unit->type >= JPEG_MARKER_APPN &&
+               unit->type <= JPEG_MARKER_APPN + 15) {
+        err = ff_cbs_alloc_unit_content(ctx, unit,
+                                        sizeof(JPEGRawApplicationData),
+                                        &cbs_jpeg_free_application_data);
+        if (err < 0)
+            return err;
+
+        err = cbs_jpeg_read_application_data(ctx, &gbc, unit->content);
+        if (err < 0)
+            return err;
+
+    } else if (unit->type == JPEG_MARKER_SOS) {
+        JPEGRawScan *scan;
+        int pos;
+
+        err = ff_cbs_alloc_unit_content(ctx, unit,
+                                        sizeof(JPEGRawScan),
+                                        &cbs_jpeg_free_scan);
+        if (err < 0)
+            return err;
+        scan = unit->content;
+
+        err = cbs_jpeg_read_scan_header(ctx, &gbc, &scan->header);
+        if (err < 0)
+            return err;
+
+        pos = get_bits_count(&gbc);
+        av_assert0(pos % 8 == 0);
+        if (pos > 0) {
+            scan->data_size = unit->data_size - pos / 8;
+            scan->data_ref  = av_buffer_ref(unit->data_ref);
+            if (!scan->data_ref)
+                return AVERROR(ENOMEM);
+            scan->data = unit->data + pos / 8;
+        }
+
+    } else {
+        switch (unit->type) {
+#define SEGMENT(marker, type, func, free) \
+        case JPEG_MARKER_ ## marker: \
+            { \
+                err = ff_cbs_alloc_unit_content(ctx, unit, \
+                                                sizeof(type), free); \
+                if (err < 0) \
+                    return err; \
+                err = cbs_jpeg_read_ ## func(ctx, &gbc, unit->content); \
+                if (err < 0) \
+                    return err; \
+            } \
+            break
+            SEGMENT(DQT, JPEGRawQuantisationTableSpecification, dqt, NULL);
+            SEGMENT(DHT, JPEGRawHuffmanTableSpecification,      dht, NULL);
+            SEGMENT(COM, JPEGRawComment,  comment, &cbs_jpeg_free_comment);
+#undef SEGMENT
+        default:
+            return AVERROR(ENOSYS);
+        }
+    }
+
+    return 0;
+}
+
+static int cbs_jpeg_write_scan(CodedBitstreamContext *ctx,
+                               CodedBitstreamUnit *unit,
+                               PutBitContext *pbc)
+{
+    JPEGRawScan *scan = unit->content;
+    int i, err;
+
+    err = cbs_jpeg_write_scan_header(ctx, pbc, &scan->header);
+    if (err < 0)
+        return err;
+
+    if (scan->data) {
+        if (scan->data_size * 8 > put_bits_left(pbc))
+            return AVERROR(ENOSPC);
+
+        for (i = 0; i < scan->data_size; i++)
+            put_bits(pbc, 8, scan->data[i]);
+    }
+
+    return 0;
+}
+
+static int cbs_jpeg_write_segment(CodedBitstreamContext *ctx,
+                                  CodedBitstreamUnit *unit,
+                                  PutBitContext *pbc)
+{
+    int err;
+
+    if (unit->type >= JPEG_MARKER_SOF0 &&
+        unit->type <= JPEG_MARKER_SOF3) {
+        err = cbs_jpeg_write_frame_header(ctx, pbc, unit->content);
+    } else if (unit->type >= JPEG_MARKER_APPN &&
+               unit->type <= JPEG_MARKER_APPN + 15) {
+        err = cbs_jpeg_write_application_data(ctx, pbc, unit->content);
+    } else {
+        switch (unit->type) {
+#define SEGMENT(marker, func) \
+            case JPEG_MARKER_ ## marker: \
+                err = cbs_jpeg_write_ ## func(ctx, pbc, unit->content); \
+                break;
+            SEGMENT(DQT, dqt);
+            SEGMENT(DHT, dht);
+            SEGMENT(COM, comment);
+        default:
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+
+    return err;
+}
+
+static int cbs_jpeg_write_unit(CodedBitstreamContext *ctx,
+                                CodedBitstreamUnit *unit)
+{
+    CodedBitstreamJPEGContext *priv = ctx->priv_data;
+    PutBitContext pbc;
+    int err;
+
+    if (!priv->write_buffer) {
+        // Initial write buffer size is 1MB.
+        priv->write_buffer_size = 1024 * 1024;
+
+    reallocate_and_try_again:
+        err = av_reallocp(&priv->write_buffer, priv->write_buffer_size);
+        if (err < 0) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Unable to allocate a "
+                   "sufficiently large write buffer (last attempt "
+                   "%"SIZE_SPECIFIER" bytes).\n", priv->write_buffer_size);
+            return err;
+        }
+    }
+
+    init_put_bits(&pbc, priv->write_buffer, priv->write_buffer_size);
+
+    if (unit->type == JPEG_MARKER_SOS)
+        err = cbs_jpeg_write_scan(ctx, unit, &pbc);
+    else
+        err = cbs_jpeg_write_segment(ctx, unit, &pbc);
+
+    if (err == AVERROR(ENOSPC)) {
+        // Overflow.
+        priv->write_buffer_size *= 2;
+        goto reallocate_and_try_again;
+    }
+    if (err < 0) {
+        // Write failed for some other reason.
+        return err;
+    }
+
+    if (put_bits_count(&pbc) % 8)
+        unit->data_bit_padding = 8 - put_bits_count(&pbc) % 8;
+    else
+        unit->data_bit_padding = 0;
+
+    unit->data_size = (put_bits_count(&pbc) + 7) / 8;
+    flush_put_bits(&pbc);
+
+    err = ff_cbs_alloc_unit_data(ctx, unit, unit->data_size);
+    if (err < 0)
+        return err;
+
+    memcpy(unit->data, priv->write_buffer, unit->data_size);
+
+    return 0;
+}
+
+static int cbs_jpeg_assemble_fragment(CodedBitstreamContext *ctx,
+                                       CodedBitstreamFragment *frag)
+{
+    const CodedBitstreamUnit *unit;
+    uint8_t *data;
+    size_t size, dp, sp;
+    int i;
+
+    size = 4; // SOI + EOI.
+    for (i = 0; i < frag->nb_units; i++) {
+        unit = &frag->units[i];
+        size += 2 + unit->data_size;
+        if (unit->type == JPEG_MARKER_SOS) {
+            for (sp = 0; sp < unit->data_size; sp++) {
+                if (unit->data[sp] == 0xff)
+                    ++size;
+            }
+        }
+    }
+
+    frag->data_ref = av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!frag->data_ref)
+        return AVERROR(ENOMEM);
+    data = frag->data_ref->data;
+
+    dp = 0;
+
+    data[dp++] = 0xff;
+    data[dp++] = JPEG_MARKER_SOI;
+
+    for (i = 0; i < frag->nb_units; i++) {
+        unit = &frag->units[i];
+
+        data[dp++] = 0xff;
+        data[dp++] = unit->type;
+
+        if (unit->type != JPEG_MARKER_SOS) {
+            memcpy(data + dp, unit->data, unit->data_size);
+            dp += unit->data_size;
+        } else {
+            sp = AV_RB16(unit->data);
+            av_assert0(sp <= unit->data_size);
+            memcpy(data + dp, unit->data, sp);
+            dp += sp;
+
+            for (; sp < unit->data_size; sp++) {
+                if (unit->data[sp] == 0xff) {
+                    data[dp++] = 0xff;
+                    data[dp++] = 0x00;
+                } else {
+                    data[dp++] = unit->data[sp];
+                }
+            }
+        }
+    }
+
+    data[dp++] = 0xff;
+    data[dp++] = JPEG_MARKER_EOI;
+
+    av_assert0(dp == size);
+
+    memset(data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    frag->data      = data;
+    frag->data_size = size;
+
+    return 0;
+}
+
+static void cbs_jpeg_close(CodedBitstreamContext *ctx)
+{
+    CodedBitstreamJPEGContext *priv = ctx->priv_data;
+
+    av_freep(&priv->write_buffer);
+}
+
+const CodedBitstreamType ff_cbs_type_jpeg = {
+    .codec_id          = AV_CODEC_ID_MJPEG,
+
+    .priv_data_size    = sizeof(CodedBitstreamJPEGContext),
+
+    .split_fragment    = &cbs_jpeg_split_fragment,
+    .read_unit         = &cbs_jpeg_read_unit,
+    .write_unit        = &cbs_jpeg_write_unit,
+    .assemble_fragment = &cbs_jpeg_assemble_fragment,
+
+    .close             = &cbs_jpeg_close,
+};
diff --git a/libavcodec/cbs_jpeg.h b/libavcodec/cbs_jpeg.h
new file mode 100644
index 0000000..913d3f9
--- /dev/null
+++ b/libavcodec/cbs_jpeg.h
@@ -0,0 +1,130 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CBS_JPEG_H
+#define AVCODEC_CBS_JPEG_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/buffer.h"
+
+
+enum {
+    JPEG_MARKER_SOF0    = 0xc0,
+    JPEG_MARKER_SOF1    = 0xc1,
+    JPEG_MARKER_SOF2    = 0xc2,
+    JPEG_MARKER_SOF3    = 0xc3,
+
+    JPEG_MARKER_DHT     = 0xc4,
+    JPEG_MARKER_SOI     = 0xd8,
+    JPEG_MARKER_EOI     = 0xd9,
+    JPEG_MARKER_SOS     = 0xda,
+    JPEG_MARKER_DQT     = 0xdb,
+
+    JPEG_MARKER_APPN    = 0xe0,
+    JPEG_MARKER_JPGN    = 0xf0,
+    JPEG_MARKER_COM     = 0xfe,
+};
+
+enum {
+    JPEG_MAX_COMPONENTS = 255,
+
+    JPEG_MAX_HEIGHT = 65535,
+    JPEG_MAX_WIDTH  = 65535,
+};
+
+
+typedef struct JPEGRawFrameHeader {
+    uint16_t Lf;
+    uint8_t  P;
+    uint16_t Y;
+    uint16_t X;
+    uint16_t Nf;
+
+    uint8_t  C [JPEG_MAX_COMPONENTS];
+    uint8_t  H [JPEG_MAX_COMPONENTS];
+    uint8_t  V [JPEG_MAX_COMPONENTS];
+    uint8_t  Tq[JPEG_MAX_COMPONENTS];
+} JPEGRawFrameHeader;
+
+typedef struct JPEGRawScanHeader {
+    uint16_t Ls;
+    uint8_t  Ns;
+
+    uint8_t  Cs[JPEG_MAX_COMPONENTS];
+    uint8_t  Td[JPEG_MAX_COMPONENTS];
+    uint8_t  Ta[JPEG_MAX_COMPONENTS];
+
+    uint8_t  Ss;
+    uint8_t  Se;
+    uint8_t  Ah;
+    uint8_t  Al;
+} JPEGRawScanHeader;
+
+typedef struct JPEGRawScan {
+    JPEGRawScanHeader header;
+    uint8_t          *data;
+    size_t            data_size;
+    AVBufferRef      *data_ref;
+} JPEGRawScan;
+
+typedef struct JPEGRawQuantisationTable {
+    uint8_t  Pq;
+    uint8_t  Tq;
+    uint16_t Q[64];
+} JPEGRawQuantisationTable;
+
+typedef struct JPEGRawQuantisationTableSpecification {
+    uint16_t Lq;
+    JPEGRawQuantisationTable table[4];
+} JPEGRawQuantisationTableSpecification;
+
+typedef struct JPEGRawHuffmanTable {
+    uint8_t  Tc;
+    uint8_t  Th;
+    uint8_t  L[16];
+    uint8_t  V[224];
+} JPEGRawHuffmanTable;
+
+typedef struct JPEGRawHuffmanTableSpecification {
+    uint16_t Lh;
+    JPEGRawHuffmanTable table[8];
+} JPEGRawHuffmanTableSpecification;
+
+typedef struct JPEGRawApplicationData {
+    uint16_t     Lp;
+    uint8_t     *Ap;
+    AVBufferRef *Ap_ref;
+} JPEGRawApplicationData;
+
+typedef struct JPEGRawComment {
+    uint16_t     Lc;
+    uint8_t     *Cm;
+    AVBufferRef *Cm_ref;
+} JPEGRawComment;
+
+
+typedef struct CodedBitstreamJPEGContext {
+    // Write buffer.
+    uint8_t *write_buffer;
+    size_t write_buffer_size;
+} CodedBitstreamJPEGContext;
+
+
+#endif /* AVCODEC_CBS_JPEG_H */
diff --git a/libavcodec/cbs_jpeg_syntax_template.c b/libavcodec/cbs_jpeg_syntax_template.c
new file mode 100644
index 0000000..d3cd9ff
--- /dev/null
+++ b/libavcodec/cbs_jpeg_syntax_template.c
@@ -0,0 +1,191 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static int FUNC(frame_header)(CodedBitstreamContext *ctx, RWContext *rw,
+                              JPEGRawFrameHeader *current)
+{
+    int err, i;
+
+    HEADER("Frame Header");
+
+    u(16, Lf, 8, 8 + 3 * JPEG_MAX_COMPONENTS);
+
+    u(8,  P,  2, 16);
+    u(16, Y,  0, JPEG_MAX_HEIGHT);
+    u(16, X,  1, JPEG_MAX_WIDTH);
+    u(8,  Nf, 1, JPEG_MAX_COMPONENTS);
+
+    for (i = 0; i < current->Nf; i++) {
+        us(8, C[i],  i, 0, JPEG_MAX_COMPONENTS);
+        us(4, H[i],  i, 1, 4);
+        us(4, V[i],  i, 1, 4);
+        us(8, Tq[i], i, 0, 3);
+    }
+
+    return 0;
+}
+
+static int FUNC(quantisation_table)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     JPEGRawQuantisationTable *current)
+{
+    int err, i;
+
+    u(4, Pq, 0, 1);
+    u(4, Tq, 0, 3);
+
+    if (current->Pq) {
+        for (i = 0; i < 64; i++)
+            us(16, Q[i], i, 1, 255);
+    } else {
+        for (i = 0; i < 64; i++)
+            us(8,  Q[i], i, 1, 255);
+    }
+
+    return 0;
+}
+
+static int FUNC(dqt)(CodedBitstreamContext *ctx, RWContext *rw,
+                     JPEGRawQuantisationTableSpecification *current)
+{
+    int err, i, n;
+
+    HEADER("Quantisation Tables");
+
+    u(16, Lq, 2, 2 + 4 * 65);
+    n = current->Lq / 65;
+
+    for (i = 0; i < n; i++)
+        CHECK(FUNC(quantisation_table)(ctx, rw, &current->table[i]));
+
+    return 0;
+}
+
+static int FUNC(huffman_table)(CodedBitstreamContext *ctx, RWContext *rw,
+                               JPEGRawHuffmanTable *current)
+{
+    int err, i, j, ij;
+
+    u(4, Tc, 0, 1);
+    u(4, Th, 0, 3);
+
+    for (i = 0; i < 16; i++)
+        us(8, L[i], i, 0, 224);
+
+    ij = 0;
+    for (i = 0; i < 16; i++) {
+        for (j = 0; j < current->L[i]; j++) {
+            us(8, V[ij], ij, 0, 255);
+            ++ij;
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(dht)(CodedBitstreamContext *ctx, RWContext *rw,
+                     JPEGRawHuffmanTableSpecification *current)
+{
+    int err, i, j, n;
+
+    HEADER("Huffman Tables");
+
+    u(16, Lh, 2, 2 + 8 * (1 + 16 + 256));
+
+    n = 2;
+    for (i = 0; n < current->Lh; i++) {
+        CHECK(FUNC(huffman_table)(ctx, rw, &current->table[i]));
+
+        ++n;
+        for (j = 0; j < 16; j++)
+            n += 1 + current->table[i].L[j];
+    }
+
+    return 0;
+}
+
+static int FUNC(scan_header)(CodedBitstreamContext *ctx, RWContext *rw,
+                             JPEGRawScanHeader *current)
+{
+    int err, j;
+
+    HEADER("Scan");
+
+    u(16, Ls, 6, 6 + 2 * JPEG_MAX_COMPONENTS);
+
+    u(8, Ns, 1, 4);
+    for (j = 0; j < current->Ns; j++) {
+        us(8, Cs[j], j, 0, JPEG_MAX_COMPONENTS);
+        us(4, Td[j], j, 0, 3);
+        us(4, Ta[j], j, 0, 3);
+    }
+
+    u(8, Ss, 0, 63);
+    u(8, Se, 0, 63);
+    u(4, Ah, 0, 13);
+    u(4, Al, 0, 15);
+
+    return 0;
+}
+
+static int FUNC(application_data)(CodedBitstreamContext *ctx, RWContext *rw,
+                                  JPEGRawApplicationData *current)
+{
+    int err, i;
+
+    HEADER("Application Data");
+
+    u(16, Lp, 2, 65535);
+
+    if (current->Lp > 2) {
+#ifdef READ
+        current->Ap_ref = av_buffer_alloc(current->Lp - 2);
+        if (!current->Ap_ref)
+            return AVERROR(ENOMEM);
+        current->Ap = current->Ap_ref->data;
+#endif
+
+        for (i = 0; i < current->Lp - 2; i++)
+            us(8, Ap[i], i, 0, 255);
+    }
+
+    return 0;
+}
+
+static int FUNC(comment)(CodedBitstreamContext *ctx, RWContext *rw,
+                         JPEGRawComment *current)
+{
+    int err, i;
+
+    HEADER("Comment");
+
+    u(16, Lc, 2, 65535);
+
+    if (current->Lc > 2) {
+#ifdef READ
+        current->Cm_ref = av_buffer_alloc(current->Lc - 2);
+        if (!current->Cm_ref)
+            return AVERROR(ENOMEM);
+        current->Cm = current->Cm_ref->data;
+#endif
+
+        for (i = 0; i < current->Lc - 2; i++)
+            us(8, Cm[i], i, 0, 255);
+    }
+
+    return 0;
+}
diff --git a/libavcodec/cbs_mpeg2.c b/libavcodec/cbs_mpeg2.c
index 3db10c5..8b8b266 100644
--- a/libavcodec/cbs_mpeg2.c
+++ b/libavcodec/cbs_mpeg2.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,29 +38,34 @@
 #define FUNC_MPEG2(rw, name) FUNC_NAME(rw, mpeg2, name)
 #define FUNC(name) FUNC_MPEG2(READWRITE, name)
 
+#define SUBSCRIPTS(subs, ...) (subs > 0 ? ((int[subs + 1]){ subs, __VA_ARGS__ }) : NULL)
+
+#define ui(width, name) \
+        xui(width, name, current->name, 0)
+#define uis(width, name, subs, ...) \
+        xui(width, name, current->name, subs, __VA_ARGS__)
+
 
 #define READ
 #define READWRITE read
-#define RWContext BitstreamContext
+#define RWContext GetBitContext
 
-#define xui(width, name, var) do { \
+#define xui(width, name, var, subs, ...) do { \
         uint32_t value = 0; \
         CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \
+                                   SUBSCRIPTS(subs, __VA_ARGS__), \
                                    &value, 0, (1 << width) - 1)); \
         var = value; \
     } while (0)
 
-#define ui(width, name) \
-        xui(width, name, current->name)
-
 #define marker_bit() do { \
         av_unused uint32_t one; \
-        CHECK(ff_cbs_read_unsigned(ctx, rw, 1, "marker_bit", &one, 1, 1)); \
+        CHECK(ff_cbs_read_unsigned(ctx, rw, 1, "marker_bit", NULL, &one, 1, 1)); \
     } while (0)
 
 #define nextbits(width, compare, var) \
-    (bitstream_bits_left(rw) >= width && \
-     (var = bitstream_peek(rw, width)) == (compare))
+    (get_bits_left(rw) >= width && \
+     (var = show_bits(rw, width)) == (compare))
 
 #include "cbs_mpeg2_syntax_template.c"
 
@@ -68,7 +73,6 @@
 #undef READWRITE
 #undef RWContext
 #undef xui
-#undef ui
 #undef marker_bit
 #undef nextbits
 
@@ -77,16 +81,14 @@
 #define READWRITE write
 #define RWContext PutBitContext
 
-#define xui(width, name, var) do { \
+#define xui(width, name, var, subs, ...) do { \
         CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \
+                                    SUBSCRIPTS(subs, __VA_ARGS__), \
                                     var, 0, (1 << width) - 1)); \
     } while (0)
 
-#define ui(width, name) \
-        xui(width, name, current->name)
-
 #define marker_bit() do { \
-        CHECK(ff_cbs_write_unsigned(ctx, rw, 1, "marker_bit", 1, 1, 1)); \
+        CHECK(ff_cbs_write_unsigned(ctx, rw, 1, "marker_bit", NULL, 1, 1, 1)); \
     } while (0)
 
 #define nextbits(width, compare, var) (var)
@@ -97,7 +99,6 @@
 #undef READWRITE
 #undef RWContext
 #undef xui
-#undef ui
 #undef marker_bit
 #undef nextbits
 
@@ -146,18 +147,12 @@ static int cbs_mpeg2_split_fragment(CodedBitstreamContext *ctx,
             unit_size = (end - 4) - (start - 1);
         }
 
-        unit_data = av_malloc(unit_size + AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!unit_data)
-            return AVERROR(ENOMEM);
-        memcpy(unit_data, start - 1, unit_size);
-        memset(unit_data + unit_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        unit_data = (uint8_t *)start - 1;
 
         err = ff_cbs_insert_unit_data(ctx, frag, i, unit_type,
-                                      unit_data, unit_size, NULL);
-        if (err < 0) {
-            av_freep(&unit_data);
+                                      unit_data, unit_size, frag->data_ref);
+        if (err < 0)
             return err;
-        }
 
         if (end == frag->data + frag->data_size)
             break;
@@ -172,10 +167,10 @@ static int cbs_mpeg2_split_fragment(CodedBitstreamContext *ctx,
 static int cbs_mpeg2_read_unit(CodedBitstreamContext *ctx,
                                CodedBitstreamUnit *unit)
 {
-    BitstreamContext bc;
+    GetBitContext gbc;
     int err;
 
-    err = bitstream_init(&bc, unit->data, 8 * unit->data_size);
+    err = init_get_bits(&gbc, unit->data, 8 * unit->data_size);
     if (err < 0)
         return err;
 
@@ -189,24 +184,19 @@ static int cbs_mpeg2_read_unit(CodedBitstreamContext *ctx,
             return err;
         slice = unit->content;
 
-        err = cbs_mpeg2_read_slice_header(ctx, &bc, &slice->header);
+        err = cbs_mpeg2_read_slice_header(ctx, &gbc, &slice->header);
         if (err < 0)
             return err;
 
-        pos = bitstream_tell(&bc);
+        pos = get_bits_count(&gbc);
         len = unit->data_size;
 
         slice->data_size = len - pos / 8;
-        slice->data_ref  = av_buffer_alloc(slice->data_size +
-                                           AV_INPUT_BUFFER_PADDING_SIZE);
+        slice->data_ref  = av_buffer_ref(unit->data_ref);
         if (!slice->data_ref)
             return AVERROR(ENOMEM);
-        slice->data = slice->data_ref->data;
+        slice->data = unit->data + pos / 8;
 
-        memcpy(slice->data,
-               unit->data + pos / 8, slice->data_size);
-        memset(slice->data + slice->data_size, 0,
-               AV_INPUT_BUFFER_PADDING_SIZE);
         slice->data_bit_start = pos % 8;
 
     } else {
@@ -220,7 +210,7 @@ static int cbs_mpeg2_read_unit(CodedBitstreamContext *ctx,
                 if (err < 0) \
                     return err; \
                 header = unit->content; \
-                err = cbs_mpeg2_read_ ## read_func(ctx, &bc, header); \
+                err = cbs_mpeg2_read_ ## read_func(ctx, &gbc, header); \
                 if (err < 0) \
                     return err; \
             } \
@@ -274,8 +264,6 @@ static int cbs_mpeg2_write_slice(CodedBitstreamContext *ctx,
                                  PutBitContext *pbc)
 {
     MPEG2RawSlice *slice = unit->content;
-    BitstreamContext bc;
-    size_t bits_left;
     int err;
 
     err = cbs_mpeg2_write_slice_header(ctx, pbc, &slice->header);
@@ -283,21 +271,38 @@ static int cbs_mpeg2_write_slice(CodedBitstreamContext *ctx,
         return err;
 
     if (slice->data) {
+        size_t rest = slice->data_size - (slice->data_bit_start + 7) / 8;
+        uint8_t *pos = slice->data + slice->data_bit_start / 8;
+
+        av_assert0(slice->data_bit_start >= 0 &&
+                   8 * slice->data_size > slice->data_bit_start);
+
         if (slice->data_size * 8 + 8 > put_bits_left(pbc))
             return AVERROR(ENOSPC);
 
-        bitstream_init(&bc, slice->data, slice->data_size * 8);
-        bitstream_skip(&bc, slice->data_bit_start);
-
-        while (bitstream_bits_left(&bc) > 15)
-            put_bits(pbc, 16, bitstream_read(&bc, 16));
+        // First copy the remaining bits of the first byte
+        if (slice->data_bit_start % 8)
+            put_bits(pbc, 8 - slice->data_bit_start % 8,
+                     *pos++ & MAX_UINT_BITS(8 - slice->data_bit_start % 8));
+
+        if (put_bits_count(pbc) % 8 == 0) {
+            // If the writer is aligned at this point,
+            // memcpy can be used to improve performance.
+            // This is the normal case.
+            flush_put_bits(pbc);
+            memcpy(put_bits_ptr(pbc), pos, rest);
+            skip_put_bytes(pbc, rest);
+        } else {
+            // If not, we have to copy manually:
+            for (; rest > 3; rest -= 4, pos += 4)
+                put_bits32(pbc, AV_RB32(pos));
 
-        bits_left = bitstream_bits_left(&bc);
-        put_bits(pbc, bits_left, bitstream_read(&bc, bits_left));
+            for (; rest; rest--, pos++)
+                put_bits(pbc, 8, *pos);
 
-        // Align with zeroes.
-        while (put_bits_count(pbc) % 8 != 0)
-            put_bits(pbc, 1, 0);
+            // Align with zeros
+            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0);
+        }
     }
 
     return 0;
@@ -319,7 +324,7 @@ static int cbs_mpeg2_write_unit(CodedBitstreamContext *ctx,
         if (err < 0) {
             av_log(ctx->log_ctx, AV_LOG_ERROR, "Unable to allocate a "
                    "sufficiently large write buffer (last attempt "
-                   "%zu bytes).\n", priv->write_buffer_size);
+                   "%"SIZE_SPECIFIER" bytes).\n", priv->write_buffer_size);
             return err;
         }
     }
@@ -362,14 +367,14 @@ static int cbs_mpeg2_assemble_fragment(CodedBitstreamContext *ctx,
                                        CodedBitstreamFragment *frag)
 {
     uint8_t *data;
-    size_t size, dp, sp;
+    size_t size, dp;
     int i;
 
     size = 0;
     for (i = 0; i < frag->nb_units; i++)
         size += 3 + frag->units[i].data_size;
 
-    frag->data_ref = av_buffer_alloc(size);
+    frag->data_ref = av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!frag->data_ref)
         return AVERROR(ENOMEM);
     data = frag->data_ref->data;
@@ -382,12 +387,13 @@ static int cbs_mpeg2_assemble_fragment(CodedBitstreamContext *ctx,
         data[dp++] = 0;
         data[dp++] = 1;
 
-        for (sp = 0; sp < unit->data_size; sp++)
-            data[dp++] = unit->data[sp];
+        memcpy(data + dp, unit->data, unit->data_size);
+        dp += unit->data_size;
     }
 
     av_assert0(dp == size);
 
+    memset(data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     frag->data      = data;
     frag->data_size = size;
 
diff --git a/libavcodec/cbs_mpeg2.h b/libavcodec/cbs_mpeg2.h
index 31e57ce..92caa99 100644
--- a/libavcodec/cbs_mpeg2.h
+++ b/libavcodec/cbs_mpeg2.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cbs_mpeg2_syntax_template.c b/libavcodec/cbs_mpeg2_syntax_template.c
index b61fc8b..88cf453 100644
--- a/libavcodec/cbs_mpeg2_syntax_template.c
+++ b/libavcodec/cbs_mpeg2_syntax_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,13 +44,13 @@ static int FUNC(sequence_header)(CodedBitstreamContext *ctx, RWContext *rw,
     ui(1, load_intra_quantiser_matrix);
     if (current->load_intra_quantiser_matrix) {
         for (i = 0; i < 64; i++)
-            ui(8, intra_quantiser_matrix[i]);
+            uis(8, intra_quantiser_matrix[i], 1, i);
     }
 
     ui(1, load_non_intra_quantiser_matrix);
     if (current->load_non_intra_quantiser_matrix) {
         for (i = 0; i < 64; i++)
-            ui(8, non_intra_quantiser_matrix[i]);
+            uis(8, non_intra_quantiser_matrix[i], 1, i);
     }
 
     return 0;
@@ -67,11 +67,11 @@ static int FUNC(user_data)(CodedBitstreamContext *ctx, RWContext *rw,
     ui(8, user_data_start_code);
 
 #ifdef READ
-    k = bitstream_bits_left(rw);
+    k = get_bits_left(rw);
     av_assert0(k % 8 == 0);
     current->user_data_length = k /= 8;
     if (k > 0) {
-        current->user_data_ref = av_buffer_alloc(k);
+        current->user_data_ref = av_buffer_allocz(k + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!current->user_data_ref)
             return AVERROR(ENOMEM);
         current->user_data = current->user_data_ref->data;
@@ -79,7 +79,7 @@ static int FUNC(user_data)(CodedBitstreamContext *ctx, RWContext *rw,
 #endif
 
     for (k = 0; k < current->user_data_length; k++)
-        xui(8, user_data, current->user_data[k]);
+        xui(8, user_data, current->user_data[k], 0);
 
     return 0;
 }
@@ -250,25 +250,25 @@ static int FUNC(quant_matrix_extension)(CodedBitstreamContext *ctx, RWContext *r
     ui(1, load_intra_quantiser_matrix);
     if (current->load_intra_quantiser_matrix) {
         for (i = 0; i < 64; i++)
-            ui(8, intra_quantiser_matrix[i]);
+            uis(8, intra_quantiser_matrix[i], 1, i);
     }
 
     ui(1, load_non_intra_quantiser_matrix);
     if (current->load_non_intra_quantiser_matrix) {
         for (i = 0; i < 64; i++)
-            ui(8, non_intra_quantiser_matrix[i]);
+            uis(8, non_intra_quantiser_matrix[i], 1, i);
     }
 
     ui(1, load_chroma_intra_quantiser_matrix);
     if (current->load_chroma_intra_quantiser_matrix) {
         for (i = 0; i < 64; i++)
-            ui(8, intra_quantiser_matrix[i]);
+            uis(8, intra_quantiser_matrix[i], 1, i);
     }
 
     ui(1, load_chroma_non_intra_quantiser_matrix);
     if (current->load_chroma_non_intra_quantiser_matrix) {
         for (i = 0; i < 64; i++)
-            ui(8, chroma_non_intra_quantiser_matrix[i]);
+            uis(8, chroma_non_intra_quantiser_matrix[i], 1, i);
     }
 
     return 0;
@@ -353,11 +353,11 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
         {
             size_t k;
 #ifdef READ
-            BitstreamContext start;
+            GetBitContext start;
             uint8_t bit;
             start = *rw;
             for (k = 0; nextbits(1, 1, bit); k++)
-                bitstream_skip(rw, 8);
+                skip_bits(rw, 8);
             current->extra_information_length = k;
             if (k > 0) {
                 *rw = start;
@@ -366,15 +366,16 @@ static int FUNC(slice_header)(CodedBitstreamContext *ctx, RWContext *rw,
                 if (!current->extra_information)
                     return AVERROR(ENOMEM);
                 for (k = 0; k < current->extra_information_length; k++) {
-                    xui(1, extra_bit_slice, bit);
-                    xui(8, extra_information_slice,
-                        current->extra_information[k]);
+                    xui(1, extra_bit_slice, bit, 0);
+                    xui(8, extra_information_slice[k],
+                        current->extra_information[k], 1, k);
                 }
             }
 #else
             for (k = 0; k < current->extra_information_length; k++) {
-                xui(1, extra_bit_slice, 1);
-                xui(8, extra_information_slice, current->extra_information[k]);
+                xui(1, extra_bit_slice, 1, 0);
+                xui(8, extra_information_slice[k],
+                    current->extra_information[k], 1, k);
             }
 #endif
         }
diff --git a/libavcodec/cbs_vp9.c b/libavcodec/cbs_vp9.c
new file mode 100644
index 0000000..0b5f137
--- /dev/null
+++ b/libavcodec/cbs_vp9.c
@@ -0,0 +1,692 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "cbs.h"
+#include "cbs_internal.h"
+#include "cbs_vp9.h"
+#include "internal.h"
+
+
+static int cbs_vp9_read_s(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                          int width, const char *name,
+                          const int *subscripts, int32_t *write_to)
+{
+    uint32_t magnitude;
+    int position, sign;
+    int32_t value;
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    if (get_bits_left(gbc) < width + 1) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid signed value at "
+               "%s: bitstream ended.\n", name);
+        return AVERROR_INVALIDDATA;
+    }
+
+    magnitude = get_bits(gbc, width);
+    sign      = get_bits1(gbc);
+    value     = sign ? -(int32_t)magnitude : magnitude;
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < width; i++)
+            bits[i] = magnitude >> (width - i - 1) & 1 ? '1' : '0';
+        bits[i] = sign ? '1' : '0';
+        bits[i + 1] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, position, name, subscripts,
+                                    bits, value);
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_vp9_write_s(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                           int width, const char *name,
+                           const int *subscripts, int32_t value)
+{
+    uint32_t magnitude;
+    int sign;
+
+    if (put_bits_left(pbc) < width + 1)
+        return AVERROR(ENOSPC);
+
+    sign      = value < 0;
+    magnitude = sign ? -value : value;
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (i = 0; i < width; i++)
+            bits[i] = magnitude >> (width - i - 1) & 1 ? '1' : '0';
+        bits[i] = sign ? '1' : '0';
+        bits[i + 1] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, subscripts, bits, value);
+    }
+
+    put_bits(pbc, width, magnitude);
+    put_bits(pbc, 1, sign);
+
+    return 0;
+}
+
+static int cbs_vp9_read_increment(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                                  uint32_t range_min, uint32_t range_max,
+                                  const char *name, uint32_t *write_to)
+{
+    uint32_t value;
+    int position, i;
+    char bits[8];
+
+    av_assert0(range_min <= range_max && range_max - range_min < sizeof(bits) - 1);
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    for (i = 0, value = range_min; value < range_max;) {
+        if (get_bits_left(gbc) < 1) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid increment value at "
+                   "%s: bitstream ended.\n", name);
+            return AVERROR_INVALIDDATA;
+        }
+        if (get_bits1(gbc)) {
+            bits[i++] = '1';
+            ++value;
+        } else {
+            bits[i++] = '0';
+            break;
+        }
+    }
+
+    if (ctx->trace_enable) {
+        bits[i] = 0;
+        ff_cbs_trace_syntax_element(ctx, position, name, NULL, bits, value);
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_vp9_write_increment(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                                   uint32_t range_min, uint32_t range_max,
+                                   const char *name, uint32_t value)
+{
+    int len;
+
+    av_assert0(range_min <= range_max && range_max - range_min < 8);
+    if (value < range_min || value > range_max) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: "
+               "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n",
+               name, value, range_min, range_max);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (value == range_max)
+        len = range_max - range_min;
+    else
+        len = value - range_min + 1;
+    if (put_bits_left(pbc) < len)
+        return AVERROR(ENOSPC);
+
+    if (ctx->trace_enable) {
+        char bits[8];
+        int i;
+        for (i = 0; i < len; i++) {
+            if (range_min + i == value)
+                bits[i] = '0';
+            else
+                bits[i] = '1';
+        }
+        bits[i] = 0;
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, NULL, bits, value);
+    }
+
+    if (len > 0)
+        put_bits(pbc, len, (1 << len) - 1 - (value != range_max));
+
+    return 0;
+}
+
+static int cbs_vp9_read_le(CodedBitstreamContext *ctx, GetBitContext *gbc,
+                           int width, const char *name,
+                           const int *subscripts, uint32_t *write_to)
+{
+    uint32_t value;
+    int position, b;
+
+    av_assert0(width % 8 == 0);
+
+    if (ctx->trace_enable)
+        position = get_bits_count(gbc);
+
+    if (get_bits_left(gbc) < width) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid le value at "
+               "%s: bitstream ended.\n", name);
+        return AVERROR_INVALIDDATA;
+    }
+
+    value = 0;
+    for (b = 0; b < width; b += 8)
+        value |= get_bits(gbc, 8) << b;
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (b = 0; b < width; b += 8)
+            for (i = 0; i < 8; i++)
+                bits[b + i] = value >> (b + i) & 1 ? '1' : '0';
+        bits[b] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, position, name, subscripts,
+                                    bits, value);
+    }
+
+    *write_to = value;
+    return 0;
+}
+
+static int cbs_vp9_write_le(CodedBitstreamContext *ctx, PutBitContext *pbc,
+                            int width, const char *name,
+                            const int *subscripts, uint32_t value)
+{
+    int b;
+
+    av_assert0(width % 8 == 0);
+
+    if (put_bits_left(pbc) < width)
+        return AVERROR(ENOSPC);
+
+    if (ctx->trace_enable) {
+        char bits[33];
+        int i;
+        for (b = 0; b < width; b += 8)
+            for (i = 0; i < 8; i++)
+                bits[b + i] = value >> (b + i) & 1 ? '1' : '0';
+        bits[b] = 0;
+
+        ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc),
+                                    name, subscripts, bits, value);
+    }
+
+    for (b = 0; b < width; b += 8)
+        put_bits(pbc, 8, value >> b & 0xff);
+
+    return 0;
+}
+
+#define HEADER(name) do { \
+        ff_cbs_trace_header(ctx, name); \
+    } while (0)
+
+#define CHECK(call) do { \
+        err = (call); \
+        if (err < 0) \
+            return err; \
+    } while (0)
+
+#define FUNC_NAME(rw, codec, name) cbs_ ## codec ## _ ## rw ## _ ## name
+#define FUNC_VP9(rw, name) FUNC_NAME(rw, vp9, name)
+#define FUNC(name) FUNC_VP9(READWRITE, name)
+
+#define SUBSCRIPTS(subs, ...) (subs > 0 ? ((int[subs + 1]){ subs, __VA_ARGS__ }) : NULL)
+
+#define f(width, name) \
+        xf(width, name, current->name, 0)
+#define s(width, name) \
+        xs(width, name, current->name, 0)
+#define fs(width, name, subs, ...) \
+        xf(width, name, current->name, subs, __VA_ARGS__)
+#define ss(width, name, subs, ...) \
+        xs(width, name, current->name, subs, __VA_ARGS__)
+
+
+#define READ
+#define READWRITE read
+#define RWContext GetBitContext
+
+#define xf(width, name, var, subs, ...) do { \
+        uint32_t value = 0; \
+        CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \
+                                   SUBSCRIPTS(subs, __VA_ARGS__), \
+                                   &value, 0, (1 << width) - 1)); \
+        var = value; \
+    } while (0)
+#define xs(width, name, var, subs, ...) do { \
+        int32_t value = 0; \
+        CHECK(cbs_vp9_read_s(ctx, rw, width, #name, \
+                             SUBSCRIPTS(subs, __VA_ARGS__), &value)); \
+        var = value; \
+    } while (0)
+
+
+#define increment(name, min, max) do { \
+        uint32_t value = 0; \
+        CHECK(cbs_vp9_read_increment(ctx, rw, min, max, #name, &value)); \
+        current->name = value; \
+    } while (0)
+
+#define fle(width, name, subs, ...) do { \
+        CHECK(cbs_vp9_read_le(ctx, rw, width, #name, \
+                              SUBSCRIPTS(subs, __VA_ARGS__), &current->name)); \
+    } while (0)
+
+#define delta_q(name) do { \
+        uint8_t delta_coded; \
+        int8_t delta_q; \
+        xf(1, name.delta_coded, delta_coded, 0); \
+        if (delta_coded) \
+            xs(4, name.delta_q, delta_q, 0); \
+        else \
+            delta_q = 0; \
+        current->name = delta_q; \
+    } while (0)
+
+#define prob(name, subs, ...) do { \
+        uint8_t prob_coded; \
+        uint8_t prob; \
+        xf(1, name.prob_coded, prob_coded, subs, __VA_ARGS__); \
+        if (prob_coded) \
+            xf(8, name.prob, prob, subs, __VA_ARGS__); \
+        else \
+            prob = 255; \
+        current->name = prob; \
+    } while (0)
+
+#define fixed(width, name, value) do { \
+        av_unused uint32_t fixed_value = value; \
+        CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \
+                                   0, &fixed_value, value, value)); \
+    } while (0)
+
+#define infer(name, value) do { \
+        current->name = value; \
+    } while (0)
+
+#define byte_alignment(rw) (get_bits_count(rw) % 8)
+
+#include "cbs_vp9_syntax_template.c"
+
+#undef READ
+#undef READWRITE
+#undef RWContext
+#undef xf
+#undef xs
+#undef increment
+#undef fle
+#undef delta_q
+#undef prob
+#undef fixed
+#undef infer
+#undef byte_alignment
+
+
+#define WRITE
+#define READWRITE write
+#define RWContext PutBitContext
+
+#define xf(width, name, var, subs, ...) do { \
+        CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \
+                                    SUBSCRIPTS(subs, __VA_ARGS__), \
+                                    var, 0, (1 << width) - 1)); \
+    } while (0)
+#define xs(width, name, var, subs, ...) do { \
+        CHECK(cbs_vp9_write_s(ctx, rw, width, #name, \
+                              SUBSCRIPTS(subs, __VA_ARGS__), var)); \
+    } while (0)
+
+#define increment(name, min, max) do { \
+        CHECK(cbs_vp9_write_increment(ctx, rw, min, max, #name, current->name)); \
+    } while (0)
+
+#define fle(width, name, subs, ...) do { \
+        CHECK(cbs_vp9_write_le(ctx, rw, width, #name, \
+                               SUBSCRIPTS(subs, __VA_ARGS__), current->name)); \
+    } while (0)
+
+#define delta_q(name) do { \
+        xf(1, name.delta_coded, !!current->name, 0); \
+        if (current->name) \
+            xs(4, name.delta_q, current->name, 0); \
+    } while (0)
+
+#define prob(name, subs, ...) do { \
+        xf(1, name.prob_coded, current->name != 255, subs, __VA_ARGS__); \
+        if (current->name != 255) \
+            xf(8, name.prob, current->name, subs, __VA_ARGS__); \
+    } while (0)
+
+#define fixed(width, name, value) do { \
+        CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \
+                                    0, value, value, value)); \
+    } while (0)
+
+#define infer(name, value) do { \
+        if (current->name != (value)) { \
+            av_log(ctx->log_ctx, AV_LOG_WARNING, "Warning: " \
+                   "%s does not match inferred value: " \
+                   "%"PRId64", but should be %"PRId64".\n", \
+                   #name, (int64_t)current->name, (int64_t)(value)); \
+        } \
+    } while (0)
+
+#define byte_alignment(rw) (put_bits_count(rw) % 8)
+
+#include "cbs_vp9_syntax_template.c"
+
+#undef READ
+#undef READWRITE
+#undef RWContext
+#undef xf
+#undef xs
+#undef increment
+#undef fle
+#undef delta_q
+#undef prob
+#undef fixed
+#undef infer
+#undef byte_alignment
+
+
+static int cbs_vp9_split_fragment(CodedBitstreamContext *ctx,
+                                  CodedBitstreamFragment *frag,
+                                  int header)
+{
+    uint8_t superframe_header;
+    int err;
+
+    // Last byte in the packet.
+    superframe_header = frag->data[frag->data_size - 1];
+
+    if ((superframe_header & 0xe0) == 0xc0) {
+        VP9RawSuperframeIndex sfi;
+        GetBitContext gbc;
+        size_t index_size, pos;
+        int i;
+
+        index_size = 2 + (((superframe_header & 0x18) >> 3) + 1) *
+                          ((superframe_header & 0x07) + 1);
+
+        err = init_get_bits(&gbc, frag->data + frag->data_size - index_size,
+                            8 * index_size);
+        if (err < 0)
+            return err;
+
+        err = cbs_vp9_read_superframe_index(ctx, &gbc, &sfi);
+        if (err < 0)
+            return err;
+
+        pos = 0;
+        for (i = 0; i <= sfi.frames_in_superframe_minus_1; i++) {
+            if (pos + sfi.frame_sizes[i] + index_size > frag->data_size) {
+                av_log(ctx->log_ctx, AV_LOG_ERROR, "Frame %d too large "
+                       "in superframe: %"PRIu32" bytes.\n",
+                       i, sfi.frame_sizes[i]);
+                return AVERROR_INVALIDDATA;
+            }
+
+            err = ff_cbs_insert_unit_data(ctx, frag, -1, 0,
+                                          frag->data + pos,
+                                          sfi.frame_sizes[i],
+                                          frag->data_ref);
+            if (err < 0)
+                return err;
+
+            pos += sfi.frame_sizes[i];
+        }
+        if (pos + index_size != frag->data_size) {
+            av_log(ctx->log_ctx, AV_LOG_WARNING, "Extra padding at "
+                   "end of superframe: %"SIZE_SPECIFIER" bytes.\n",
+                   frag->data_size - (pos + index_size));
+        }
+
+        return 0;
+
+    } else {
+        err = ff_cbs_insert_unit_data(ctx, frag, -1, 0,
+                                      frag->data, frag->data_size,
+                                      frag->data_ref);
+        if (err < 0)
+            return err;
+    }
+
+    return 0;
+}
+
+static void cbs_vp9_free_frame(void *unit, uint8_t *content)
+{
+    VP9RawFrame *frame = (VP9RawFrame*)content;
+    av_buffer_unref(&frame->data_ref);
+    av_freep(&frame);
+}
+
+static int cbs_vp9_read_unit(CodedBitstreamContext *ctx,
+                             CodedBitstreamUnit *unit)
+{
+    VP9RawFrame *frame;
+    GetBitContext gbc;
+    int err, pos;
+
+    err = init_get_bits(&gbc, unit->data, 8 * unit->data_size);
+    if (err < 0)
+        return err;
+
+    err = ff_cbs_alloc_unit_content(ctx, unit, sizeof(*frame),
+                                    &cbs_vp9_free_frame);
+    if (err < 0)
+        return err;
+    frame = unit->content;
+
+    err = cbs_vp9_read_frame(ctx, &gbc, frame);
+    if (err < 0)
+        return err;
+
+    pos = get_bits_count(&gbc);
+    av_assert0(pos % 8 == 0);
+    pos /= 8;
+    av_assert0(pos <= unit->data_size);
+
+    if (pos == unit->data_size) {
+        // No data (e.g. a show-existing-frame frame).
+    } else {
+        frame->data_ref = av_buffer_ref(unit->data_ref);
+        if (!frame->data_ref)
+            return AVERROR(ENOMEM);
+
+        frame->data      = unit->data      + pos;
+        frame->data_size = unit->data_size - pos;
+    }
+
+    return 0;
+}
+
+static int cbs_vp9_write_unit(CodedBitstreamContext *ctx,
+                              CodedBitstreamUnit *unit)
+{
+    CodedBitstreamVP9Context *priv = ctx->priv_data;
+    VP9RawFrame *frame = unit->content;
+    PutBitContext pbc;
+    int err;
+
+    if (!priv->write_buffer) {
+        // Initial write buffer size is 1MB.
+        priv->write_buffer_size = 1024 * 1024;
+
+    reallocate_and_try_again:
+        err = av_reallocp(&priv->write_buffer, priv->write_buffer_size);
+        if (err < 0) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Unable to allocate a "
+                   "sufficiently large write buffer (last attempt "
+                   "%"SIZE_SPECIFIER" bytes).\n", priv->write_buffer_size);
+            return err;
+        }
+    }
+
+    init_put_bits(&pbc, priv->write_buffer, priv->write_buffer_size);
+
+    err = cbs_vp9_write_frame(ctx, &pbc, frame);
+    if (err == AVERROR(ENOSPC)) {
+        priv->write_buffer_size *= 2;
+        goto reallocate_and_try_again;
+    }
+    if (err < 0)
+        return err;
+
+    // Frame must be byte-aligned.
+    av_assert0(put_bits_count(&pbc) % 8 == 0);
+
+    unit->data_size        = put_bits_count(&pbc) / 8;
+    unit->data_bit_padding = 0;
+    flush_put_bits(&pbc);
+
+    if (frame->data) {
+        if (unit->data_size + frame->data_size >
+            priv->write_buffer_size) {
+            priv->write_buffer_size *= 2;
+            goto reallocate_and_try_again;
+        }
+
+        memcpy(priv->write_buffer + unit->data_size,
+               frame->data, frame->data_size);
+        unit->data_size += frame->data_size;
+    }
+
+    err = ff_cbs_alloc_unit_data(ctx, unit, unit->data_size);
+    if (err < 0)
+        return err;
+
+    memcpy(unit->data, priv->write_buffer, unit->data_size);
+
+    return 0;
+}
+
+static int cbs_vp9_assemble_fragment(CodedBitstreamContext *ctx,
+                                     CodedBitstreamFragment *frag)
+{
+    int err;
+
+    if (frag->nb_units == 1) {
+        // Output is just the content of the single frame.
+
+        CodedBitstreamUnit *frame = &frag->units[0];
+
+        frag->data_ref = av_buffer_ref(frame->data_ref);
+        if (!frag->data_ref)
+            return AVERROR(ENOMEM);
+
+        frag->data      = frame->data;
+        frag->data_size = frame->data_size;
+
+    } else {
+        // Build superframe out of frames.
+
+        VP9RawSuperframeIndex sfi;
+        PutBitContext pbc;
+        AVBufferRef *ref;
+        uint8_t *data;
+        size_t size, max, pos;
+        int i, size_len;
+
+        if (frag->nb_units > 8) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Too many frames to "
+                   "make superframe: %d.\n", frag->nb_units);
+            return AVERROR(EINVAL);
+        }
+
+        max = 0;
+        for (i = 0; i < frag->nb_units; i++)
+            if (max < frag->units[i].data_size)
+                max = frag->units[i].data_size;
+
+        if (max < 2)
+            size_len = 1;
+        else
+            size_len = av_log2(max) / 8 + 1;
+        av_assert0(size_len <= 4);
+
+        sfi.superframe_marker            = VP9_SUPERFRAME_MARKER;
+        sfi.bytes_per_framesize_minus_1  = size_len - 1;
+        sfi.frames_in_superframe_minus_1 = frag->nb_units - 1;
+
+        size = 2;
+        for (i = 0; i < frag->nb_units; i++) {
+            size += size_len + frag->units[i].data_size;
+            sfi.frame_sizes[i] = frag->units[i].data_size;
+        }
+
+        ref = av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!ref)
+            return AVERROR(ENOMEM);
+        data = ref->data;
+        memset(data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+        pos = 0;
+        for (i = 0; i < frag->nb_units; i++) {
+            av_assert0(size - pos > frag->units[i].data_size);
+            memcpy(data + pos, frag->units[i].data,
+                   frag->units[i].data_size);
+            pos += frag->units[i].data_size;
+        }
+        av_assert0(size - pos == 2 + frag->nb_units * size_len);
+
+        init_put_bits(&pbc, data + pos, size - pos);
+
+        err = cbs_vp9_write_superframe_index(ctx, &pbc, &sfi);
+        if (err < 0) {
+            av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to write "
+                   "superframe index.\n");
+            av_buffer_unref(&ref);
+            return err;
+        }
+
+        av_assert0(put_bits_left(&pbc) == 0);
+        flush_put_bits(&pbc);
+
+        frag->data_ref  = ref;
+        frag->data      = data;
+        frag->data_size = size;
+    }
+
+    return 0;
+}
+
+static void cbs_vp9_close(CodedBitstreamContext *ctx)
+{
+    CodedBitstreamVP9Context *priv = ctx->priv_data;
+
+    av_freep(&priv->write_buffer);
+}
+
+const CodedBitstreamType ff_cbs_type_vp9 = {
+    .codec_id          = AV_CODEC_ID_VP9,
+
+    .priv_data_size    = sizeof(CodedBitstreamVP9Context),
+
+    .split_fragment    = &cbs_vp9_split_fragment,
+    .read_unit         = &cbs_vp9_read_unit,
+    .write_unit        = &cbs_vp9_write_unit,
+    .assemble_fragment = &cbs_vp9_assemble_fragment,
+
+    .close             = &cbs_vp9_close,
+};
diff --git a/libavcodec/cbs_vp9.h b/libavcodec/cbs_vp9.h
new file mode 100644
index 0000000..4c9b2f8
--- /dev/null
+++ b/libavcodec/cbs_vp9.h
@@ -0,0 +1,217 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CBS_VP9_H
+#define AVCODEC_CBS_VP9_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cbs.h"
+
+
+// Miscellaneous constants (section 3).
+enum {
+    VP9_REFS_PER_FRAME = 3,
+
+    VP9_MIN_TILE_WIDTH_B64 = 4,
+    VP9_MAX_TILE_WIDTH_B64 = 64,
+
+    VP9_NUM_REF_FRAMES = 8,
+    VP9_MAX_REF_FRAMES = 4,
+
+    VP9_MAX_SEGMENTS = 8,
+    VP9_SEG_LVL_MAX  = 4,
+};
+
+// Frame types (section 7.2).
+enum {
+    VP9_KEY_FRAME     = 0,
+    VP9_NON_KEY_FRAME = 1,
+};
+
+// Frame sync bytes (section 7.2.1).
+enum {
+    VP9_FRAME_SYNC_0 = 0x49,
+    VP9_FRAME_SYNC_1 = 0x83,
+    VP9_FRAME_SYNC_2 = 0x42,
+};
+
+// Color space values (section 7.2.2).
+enum {
+    VP9_CS_UNKNOWN   = 0,
+    VP9_CS_BT_601    = 1,
+    VP9_CS_BT_709    = 2,
+    VP9_CS_SMPTE_170 = 3,
+    VP9_CS_SMPTE_240 = 4,
+    VP9_CS_BT_2020   = 5,
+    VP9_CS_RESERVED  = 6,
+    VP9_CS_RGB       = 7,
+};
+
+// Reference frame types (section 7.4.12).
+enum {
+    VP9_INTRA_FRAME  = 0,
+    VP9_LAST_FRAME   = 1,
+    VP9_GOLDEN_FRAME = 2,
+    VP9_ALTREF_FRAME = 3,
+};
+
+// Superframe properties (section B.3).
+enum {
+    VP9_MAX_FRAMES_IN_SUPERFRAME = 8,
+
+    VP9_SUPERFRAME_MARKER = 6,
+};
+
+
+typedef struct VP9RawFrameHeader {
+    uint8_t frame_marker;
+    uint8_t profile_low_bit;
+    uint8_t profile_high_bit;
+
+    uint8_t show_existing_frame;
+    uint8_t frame_to_show_map_idx;
+
+    uint8_t frame_type;
+    uint8_t show_frame;
+    uint8_t error_resilient_mode;
+
+    // Color config.
+    uint8_t ten_or_twelve_bit;
+    uint8_t color_space;
+    uint8_t color_range;
+    uint8_t subsampling_x;
+    uint8_t subsampling_y;
+
+    uint8_t refresh_frame_flags;
+
+    uint8_t intra_only;
+    uint8_t reset_frame_context;
+
+    uint8_t ref_frame_idx[VP9_REFS_PER_FRAME];
+    uint8_t ref_frame_sign_bias[VP9_MAX_REF_FRAMES];
+
+    uint8_t allow_high_precision_mv;
+
+    uint8_t refresh_frame_context;
+    uint8_t frame_parallel_decoding_mode;
+
+    uint8_t frame_context_idx;
+
+    // Frame/render size.
+    uint8_t found_ref[VP9_REFS_PER_FRAME];
+    uint16_t frame_width_minus_1;
+    uint16_t frame_height_minus_1;
+    uint8_t render_and_frame_size_different;
+    uint16_t render_width_minus_1;
+    uint16_t render_height_minus_1;
+
+    // Interpolation filter.
+    uint8_t is_filter_switchable;
+    uint8_t raw_interpolation_filter_type;
+
+    // Loop filter params.
+    uint8_t loop_filter_level;
+    uint8_t loop_filter_sharpness;
+    uint8_t loop_filter_delta_enabled;
+    uint8_t loop_filter_delta_update;
+    uint8_t update_ref_delta[VP9_MAX_REF_FRAMES];
+    int8_t loop_filter_ref_deltas[VP9_MAX_REF_FRAMES];
+    uint8_t update_mode_delta[2];
+    int8_t loop_filter_mode_deltas[2];
+
+    // Quantization params.
+    uint8_t base_q_idx;
+    int8_t delta_q_y_dc;
+    int8_t delta_q_uv_dc;
+    int8_t delta_q_uv_ac;
+
+    // Segmentation params.
+    uint8_t segmentation_enabled;
+    uint8_t segmentation_update_map;
+    uint8_t segmentation_tree_probs[7];
+    uint8_t segmentation_temporal_update;
+    uint8_t segmentation_pred_prob[3];
+    uint8_t segmentation_update_data;
+    uint8_t segmentation_abs_or_delta_update;
+    uint8_t feature_enabled[VP9_MAX_SEGMENTS][VP9_SEG_LVL_MAX];
+    uint8_t feature_value[VP9_MAX_SEGMENTS][VP9_SEG_LVL_MAX];
+    uint8_t feature_sign[VP9_MAX_SEGMENTS][VP9_SEG_LVL_MAX];
+
+    // Tile info.
+    uint8_t tile_cols_log2;
+    uint8_t tile_rows_log2;
+
+    uint16_t header_size_in_bytes;
+} VP9RawFrameHeader;
+
+typedef struct VP9RawFrame {
+    VP9RawFrameHeader header;
+
+    uint8_t     *data;
+    size_t       data_size;
+    AVBufferRef *data_ref;
+} VP9RawFrame;
+
+typedef struct VP9RawSuperframeIndex {
+    uint8_t superframe_marker;
+    uint8_t bytes_per_framesize_minus_1;
+    uint8_t frames_in_superframe_minus_1;
+    uint32_t frame_sizes[VP9_MAX_FRAMES_IN_SUPERFRAME];
+} VP9RawSuperframeIndex;
+
+typedef struct VP9RawSuperframe {
+    VP9RawFrame frames[VP9_MAX_FRAMES_IN_SUPERFRAME];
+    VP9RawSuperframeIndex index;
+} VP9RawSuperframe;
+
+typedef struct VP9ReferenceFrameState {
+    int frame_width;    // RefFrameWidth
+    int frame_height;   // RefFrameHeight
+    int subsampling_x;  // RefSubsamplingX
+    int subsampling_y;  // RefSubsamplingY
+    int bit_depth;      // RefBitDepth
+} VP9ReferenceFrameState;
+
+typedef struct CodedBitstreamVP9Context {
+    int profile;
+
+    // Frame dimensions in 8x8 mode info blocks.
+    uint16_t mi_cols;
+    uint16_t mi_rows;
+    // Frame dimensions in 64x64 superblocks.
+    uint16_t sb64_cols;
+    uint16_t sb64_rows;
+
+    int frame_width;
+    int frame_height;
+
+    uint8_t subsampling_x;
+    uint8_t subsampling_y;
+    int bit_depth;
+
+    VP9ReferenceFrameState ref[VP9_NUM_REF_FRAMES];
+
+    // Write buffer.
+    uint8_t *write_buffer;
+    size_t write_buffer_size;
+} CodedBitstreamVP9Context;
+
+
+#endif /* AVCODEC_CBS_VP9_H */
diff --git a/libavcodec/cbs_vp9_syntax_template.c b/libavcodec/cbs_vp9_syntax_template.c
new file mode 100644
index 0000000..898cede
--- /dev/null
+++ b/libavcodec/cbs_vp9_syntax_template.c
@@ -0,0 +1,442 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static int FUNC(frame_sync_code)(CodedBitstreamContext *ctx, RWContext *rw,
+                                 VP9RawFrameHeader *current)
+{
+    uint8_t frame_sync_byte_0 = VP9_FRAME_SYNC_0;
+    uint8_t frame_sync_byte_1 = VP9_FRAME_SYNC_1;
+    uint8_t frame_sync_byte_2 = VP9_FRAME_SYNC_2;
+    int err;
+
+    xf(8, frame_sync_byte_0, frame_sync_byte_0, 0);
+    xf(8, frame_sync_byte_1, frame_sync_byte_1, 0);
+    xf(8, frame_sync_byte_2, frame_sync_byte_2, 0);
+
+    if (frame_sync_byte_0 != VP9_FRAME_SYNC_0 ||
+        frame_sync_byte_1 != VP9_FRAME_SYNC_1 ||
+        frame_sync_byte_2 != VP9_FRAME_SYNC_2) {
+        av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid frame sync code: "
+               "%02x %02x %02x.\n", frame_sync_byte_0,
+               frame_sync_byte_1, frame_sync_byte_2);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int FUNC(color_config)(CodedBitstreamContext *ctx, RWContext *rw,
+                              VP9RawFrameHeader *current, int profile)
+{
+    CodedBitstreamVP9Context *vp9 = ctx->priv_data;
+    int err;
+
+    if (profile >= 2) {
+        f(1, ten_or_twelve_bit);
+        vp9->bit_depth = current->ten_or_twelve_bit ? 12 : 10;
+    } else
+        vp9->bit_depth = 8;
+
+    f(3, color_space);
+
+    if (current->color_space != VP9_CS_RGB) {
+        f(1, color_range);
+        if (profile == 1 || profile == 3) {
+            f(1, subsampling_x);
+            f(1, subsampling_y);
+            fixed(1, reserved_zero, 0);
+        } else {
+            infer(subsampling_x, 1);
+            infer(subsampling_y, 1);
+        }
+    } else {
+        infer(color_range, 1);
+        if (profile == 1 || profile == 3) {
+            infer(subsampling_x, 0);
+            infer(subsampling_y, 0);
+            fixed(1, reserved_zero, 0);
+        }
+    }
+
+    vp9->subsampling_x = current->subsampling_x;
+    vp9->subsampling_y = current->subsampling_y;
+
+    return 0;
+}
+
+static int FUNC(frame_size)(CodedBitstreamContext *ctx, RWContext *rw,
+                            VP9RawFrameHeader *current)
+{
+    CodedBitstreamVP9Context *vp9 = ctx->priv_data;
+    int err;
+
+    f(16, frame_width_minus_1);
+    f(16, frame_height_minus_1);
+
+    vp9->frame_width  = current->frame_width_minus_1  + 1;
+    vp9->frame_height = current->frame_height_minus_1 + 1;
+
+    vp9->mi_cols = (vp9->frame_width  + 7) >> 3;
+    vp9->mi_rows = (vp9->frame_height + 7) >> 3;
+    vp9->sb64_cols = (vp9->mi_cols + 7) >> 3;
+    vp9->sb64_rows = (vp9->mi_rows + 7) >> 3;
+
+    return 0;
+}
+
+static int FUNC(render_size)(CodedBitstreamContext *ctx, RWContext *rw,
+                             VP9RawFrameHeader *current)
+{
+    int err;
+
+    f(1, render_and_frame_size_different);
+
+    if (current->render_and_frame_size_different) {
+        f(16, render_width_minus_1);
+        f(16, render_height_minus_1);
+    }
+
+    return 0;
+}
+
+static int FUNC(frame_size_with_refs)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      VP9RawFrameHeader *current)
+{
+    CodedBitstreamVP9Context *vp9 = ctx->priv_data;
+    int err, i;
+
+    for (i = 0; i < VP9_REFS_PER_FRAME; i++) {
+        fs(1, found_ref[i], 1, i);
+        if (current->found_ref[i]) {
+            VP9ReferenceFrameState *ref =
+                &vp9->ref[current->ref_frame_idx[i]];
+
+            vp9->frame_width   = ref->frame_width;
+            vp9->frame_height  = ref->frame_height;
+
+            vp9->subsampling_x = ref->subsampling_x;
+            vp9->subsampling_y = ref->subsampling_y;
+            vp9->bit_depth     = ref->bit_depth;
+
+            break;
+        }
+    }
+    if (i >= VP9_REFS_PER_FRAME)
+        CHECK(FUNC(frame_size)(ctx, rw, current));
+    else {
+        vp9->mi_cols = (vp9->frame_width  + 7) >> 3;
+        vp9->mi_rows = (vp9->frame_height + 7) >> 3;
+        vp9->sb64_cols = (vp9->mi_cols + 7) >> 3;
+        vp9->sb64_rows = (vp9->mi_rows + 7) >> 3;
+    }
+    CHECK(FUNC(render_size)(ctx, rw, current));
+
+    return 0;
+}
+
+static int FUNC(interpolation_filter)(CodedBitstreamContext *ctx, RWContext *rw,
+                                      VP9RawFrameHeader *current)
+{
+    int err;
+
+    f(1, is_filter_switchable);
+    if (!current->is_filter_switchable)
+        f(2, raw_interpolation_filter_type);
+
+    return 0;
+}
+
+static int FUNC(loop_filter_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                    VP9RawFrameHeader *current)
+{
+    int err, i;
+
+    f(6, loop_filter_level);
+    f(3, loop_filter_sharpness);
+
+    f(1, loop_filter_delta_enabled);
+    if (current->loop_filter_delta_enabled) {
+        f(1, loop_filter_delta_update);
+        if (current->loop_filter_delta_update) {
+            for (i = 0; i < VP9_MAX_REF_FRAMES; i++) {
+                fs(1, update_ref_delta[i], 1, i);
+                if (current->update_ref_delta[i])
+                    ss(6, loop_filter_ref_deltas[i], 1, i);
+            }
+            for (i = 0; i < 2; i++) {
+                fs(1, update_mode_delta[i], 1, i);
+                if (current->update_mode_delta[i])
+                    ss(6, loop_filter_mode_deltas[i], 1, i);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(quantization_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     VP9RawFrameHeader *current)
+{
+    int err;
+
+    f(8, base_q_idx);
+
+    delta_q(delta_q_y_dc);
+    delta_q(delta_q_uv_dc);
+    delta_q(delta_q_uv_ac);
+
+    return 0;
+}
+
+static int FUNC(segmentation_params)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     VP9RawFrameHeader *current)
+{
+    static const int segmentation_feature_bits[VP9_SEG_LVL_MAX]   = { 8, 6, 2, 0 };
+    static const int segmentation_feature_signed[VP9_SEG_LVL_MAX] = { 1, 1, 0, 0 };
+
+    int err, i, j;
+
+    f(1, segmentation_enabled);
+
+    if (current->segmentation_enabled) {
+        f(1, segmentation_update_map);
+        if (current->segmentation_update_map) {
+            for (i = 0; i < 7; i++)
+                prob(segmentation_tree_probs[i], 1, i);
+            f(1, segmentation_temporal_update);
+            for (i = 0; i < 3; i++) {
+                if (current->segmentation_temporal_update)
+                    prob(segmentation_pred_prob[i], 1, i);
+                else
+                    infer(segmentation_pred_prob[i], 255);
+            }
+        }
+
+        f(1, segmentation_update_data);
+        if (current->segmentation_update_data) {
+            f(1, segmentation_abs_or_delta_update);
+            for (i = 0; i < VP9_MAX_SEGMENTS; i++) {
+                for (j = 0; j < VP9_SEG_LVL_MAX; j++) {
+                    fs(1, feature_enabled[i][j], 2, i, j);
+                    if (current->feature_enabled[i][j] &&
+                        segmentation_feature_bits[j]) {
+                        fs(segmentation_feature_bits[j],
+                           feature_value[i][j], 2, i, j);
+                        if (segmentation_feature_signed[j])
+                            fs(1, feature_sign[i][j], 2, i, j);
+                        else
+                            infer(feature_sign[i][j], 0);
+                    } else {
+                        infer(feature_value[i][j], 0);
+                        infer(feature_sign[i][j],  0);
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int FUNC(tile_info)(CodedBitstreamContext *ctx, RWContext *rw,
+                           VP9RawFrameHeader *current)
+{
+    CodedBitstreamVP9Context *vp9 = ctx->priv_data;
+    int min_log2_tile_cols, max_log2_tile_cols;
+    int err;
+
+    min_log2_tile_cols = 0;
+    while ((VP9_MAX_TILE_WIDTH_B64 << min_log2_tile_cols) < vp9->sb64_cols)
+        ++min_log2_tile_cols;
+    max_log2_tile_cols = 0;
+    while ((vp9->sb64_cols >> (max_log2_tile_cols + 1)) >= VP9_MIN_TILE_WIDTH_B64)
+        ++max_log2_tile_cols;
+
+    increment(tile_cols_log2, min_log2_tile_cols, max_log2_tile_cols);
+
+    increment(tile_rows_log2, 0, 2);
+
+    return 0;
+}
+
+static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw,
+                                     VP9RawFrameHeader *current)
+{
+    CodedBitstreamVP9Context *vp9 = ctx->priv_data;
+    int err, i;
+
+    f(2, frame_marker);
+
+    f(1, profile_low_bit);
+    f(1, profile_high_bit);
+    vp9->profile = (current->profile_high_bit << 1) + current->profile_low_bit;
+    if (vp9->profile == 3)
+        fixed(1, reserved_zero, 0);
+
+    f(1, show_existing_frame);
+    if (current->show_existing_frame) {
+        f(3, frame_to_show_map_idx);
+        infer(header_size_in_bytes, 0);
+        infer(refresh_frame_flags,  0x00);
+        infer(loop_filter_level,    0);
+        return 0;
+    }
+
+    f(1, frame_type);
+    f(1, show_frame);
+    f(1, error_resilient_mode);
+
+    if (current->frame_type == VP9_KEY_FRAME) {
+        CHECK(FUNC(frame_sync_code)(ctx, rw, current));
+        CHECK(FUNC(color_config)(ctx, rw, current, vp9->profile));
+        CHECK(FUNC(frame_size)(ctx, rw, current));
+        CHECK(FUNC(render_size)(ctx, rw, current));
+
+        infer(refresh_frame_flags, 0xff);
+
+    } else {
+         if (current->show_frame == 0)
+             f(1, intra_only);
+         else
+             infer(intra_only, 0);
+
+         if (current->error_resilient_mode == 0)
+             f(2, reset_frame_context);
+         else
+             infer(reset_frame_context, 0);
+
+         if (current->intra_only == 1) {
+             CHECK(FUNC(frame_sync_code)(ctx, rw, current));
+
+             if (vp9->profile > 0) {
+                 CHECK(FUNC(color_config)(ctx, rw, current, vp9->profile));
+             } else {
+                 infer(color_space,   1);
+                 infer(subsampling_x, 1);
+                 infer(subsampling_y, 1);
+                 vp9->bit_depth = 8;
+
+                 vp9->subsampling_x = current->subsampling_x;
+                 vp9->subsampling_y = current->subsampling_y;
+             }
+
+             f(8, refresh_frame_flags);
+
+             CHECK(FUNC(frame_size)(ctx, rw, current));
+             CHECK(FUNC(render_size)(ctx, rw, current));
+         } else {
+             f(8, refresh_frame_flags);
+
+             for (i = 0; i < VP9_REFS_PER_FRAME; i++) {
+                 fs(3, ref_frame_idx[i], 1, i);
+                 fs(1, ref_frame_sign_bias[VP9_LAST_FRAME + i],
+                    1, VP9_LAST_FRAME + i);
+             }
+
+             CHECK(FUNC(frame_size_with_refs)(ctx, rw, current));
+             f(1, allow_high_precision_mv);
+             CHECK(FUNC(interpolation_filter)(ctx, rw, current));
+         }
+    }
+
+    if (current->error_resilient_mode == 0) {
+        f(1, refresh_frame_context);
+        f(1, frame_parallel_decoding_mode);
+    } else {
+        infer(refresh_frame_context,        0);
+        infer(frame_parallel_decoding_mode, 1);
+    }
+
+    f(2, frame_context_idx);
+
+    CHECK(FUNC(loop_filter_params)(ctx, rw, current));
+    CHECK(FUNC(quantization_params)(ctx, rw, current));
+    CHECK(FUNC(segmentation_params)(ctx, rw, current));
+    CHECK(FUNC(tile_info)(ctx, rw, current));
+
+    f(16, header_size_in_bytes);
+
+    for (i = 0; i < VP9_NUM_REF_FRAMES; i++) {
+        if (current->refresh_frame_flags & (1 << i)) {
+            vp9->ref[i] = (VP9ReferenceFrameState) {
+                .frame_width    = vp9->frame_width,
+                .frame_height   = vp9->frame_height,
+                .subsampling_x  = vp9->subsampling_x,
+                .subsampling_y  = vp9->subsampling_y,
+                .bit_depth      = vp9->bit_depth,
+            };
+        }
+    }
+
+    av_log(ctx->log_ctx, AV_LOG_DEBUG, "Frame:  size %dx%d  "
+           "subsample %dx%d  bit_depth %d  tiles %dx%d.\n",
+           vp9->frame_width, vp9->frame_height,
+           vp9->subsampling_x, vp9->subsampling_y,
+           vp9->bit_depth, 1 << current->tile_cols_log2,
+           1 << current->tile_rows_log2);
+
+    return 0;
+}
+
+static int FUNC(trailing_bits)(CodedBitstreamContext *ctx, RWContext *rw)
+{
+    int err;
+    av_unused int zero = 0;
+    while (byte_alignment(rw) != 0)
+        xf(1, zero_bit, zero, 0);
+
+    return 0;
+}
+
+static int FUNC(frame)(CodedBitstreamContext *ctx, RWContext *rw,
+                       VP9RawFrame *current)
+{
+    int err;
+
+    HEADER("Frame");
+
+    CHECK(FUNC(uncompressed_header)(ctx, rw, &current->header));
+
+    CHECK(FUNC(trailing_bits)(ctx, rw));
+
+    return 0;
+}
+
+static int FUNC(superframe_index)(CodedBitstreamContext *ctx, RWContext *rw,
+                                  VP9RawSuperframeIndex *current)
+{
+    int err, i;
+
+    HEADER("Superframe Index");
+
+    f(3, superframe_marker);
+    f(2, bytes_per_framesize_minus_1);
+    f(3, frames_in_superframe_minus_1);
+
+    for (i = 0; i <= current->frames_in_superframe_minus_1; i++) {
+        // Surprise little-endian!
+        fle(8 * (current->bytes_per_framesize_minus_1 + 1),
+            frame_sizes[i], 1, i);
+    }
+
+    f(3, superframe_marker);
+    f(2, bytes_per_framesize_minus_1);
+    f(3, frames_in_superframe_minus_1);
+
+    return 0;
+}
diff --git a/libavcodec/ccaption_dec.c b/libavcodec/ccaption_dec.c
new file mode 100644
index 0000000..09ceb1b
--- /dev/null
+++ b/libavcodec/ccaption_dec.c
@@ -0,0 +1,843 @@
+/*
+ * Closed Caption Decoding
+ * Copyright (c) 2015 Anshul Maheshwari
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/opt.h"
+
+#define SCREEN_ROWS 15
+#define SCREEN_COLUMNS 32
+
+#define SET_FLAG(var, val)   ( (var) |=   ( 1 << (val)) )
+#define UNSET_FLAG(var, val) ( (var) &=  ~( 1 << (val)) )
+#define CHECK_FLAG(var, val) ( (var) &    ( 1 << (val)) )
+
+static const AVRational ms_tb = {1, 1000};
+
+/*
+ * TODO list
+ * 1) handle font and color completely
+ */
+enum cc_mode {
+    CCMODE_POPON,
+    CCMODE_PAINTON,
+    CCMODE_ROLLUP,
+    CCMODE_TEXT,
+};
+
+enum cc_color_code {
+    CCCOL_WHITE,
+    CCCOL_GREEN,
+    CCCOL_BLUE,
+    CCCOL_CYAN,
+    CCCOL_RED,
+    CCCOL_YELLOW,
+    CCCOL_MAGENTA,
+    CCCOL_USERDEFINED,
+    CCCOL_BLACK,
+    CCCOL_TRANSPARENT,
+};
+
+enum cc_font {
+    CCFONT_REGULAR,
+    CCFONT_ITALICS,
+    CCFONT_UNDERLINED,
+    CCFONT_UNDERLINED_ITALICS,
+};
+
+enum cc_charset {
+    CCSET_BASIC_AMERICAN,
+    CCSET_SPECIAL_AMERICAN,
+    CCSET_EXTENDED_SPANISH_FRENCH_MISC,
+    CCSET_EXTENDED_PORTUGUESE_GERMAN_DANISH,
+};
+
+static const char *charset_overrides[4][128] =
+{
+    [CCSET_BASIC_AMERICAN] = {
+        [0x27] = "\u2019",
+        [0x2a] = "\u00e1",
+        [0x5c] = "\u00e9",
+        [0x5e] = "\u00ed",
+        [0x5f] = "\u00f3",
+        [0x60] = "\u00fa",
+        [0x7b] = "\u00e7",
+        [0x7c] = "\u00f7",
+        [0x7d] = "\u00d1",
+        [0x7e] = "\u00f1",
+        [0x7f] = "\u2588"
+    },
+    [CCSET_SPECIAL_AMERICAN] = {
+        [0x30] = "\u00ae",
+        [0x31] = "\u00b0",
+        [0x32] = "\u00bd",
+        [0x33] = "\u00bf",
+        [0x34] = "\u2122",
+        [0x35] = "\u00a2",
+        [0x36] = "\u00a3",
+        [0x37] = "\u266a",
+        [0x38] = "\u00e0",
+        [0x39] = "\u00A0",
+        [0x3a] = "\u00e8",
+        [0x3b] = "\u00e2",
+        [0x3c] = "\u00ea",
+        [0x3d] = "\u00ee",
+        [0x3e] = "\u00f4",
+        [0x3f] = "\u00fb",
+    },
+    [CCSET_EXTENDED_SPANISH_FRENCH_MISC] = {
+        [0x20] = "\u00c1",
+        [0x21] = "\u00c9",
+        [0x22] = "\u00d3",
+        [0x23] = "\u00da",
+        [0x24] = "\u00dc",
+        [0x25] = "\u00fc",
+        [0x26] = "\u00b4",
+        [0x27] = "\u00a1",
+        [0x28] = "*",
+        [0x29] = "\u2018",
+        [0x2a] = "-",
+        [0x2b] = "\u00a9",
+        [0x2c] = "\u2120",
+        [0x2d] = "\u00b7",
+        [0x2e] = "\u201c",
+        [0x2f] = "\u201d",
+        [0x30] = "\u00c0",
+        [0x31] = "\u00c2",
+        [0x32] = "\u00c7",
+        [0x33] = "\u00c8",
+        [0x34] = "\u00ca",
+        [0x35] = "\u00cb",
+        [0x36] = "\u00eb",
+        [0x37] = "\u00ce",
+        [0x38] = "\u00cf",
+        [0x39] = "\u00ef",
+        [0x3a] = "\u00d4",
+        [0x3b] = "\u00d9",
+        [0x3c] = "\u00f9",
+        [0x3d] = "\u00db",
+        [0x3e] = "\u00ab",
+        [0x3f] = "\u00bb",
+    },
+    [CCSET_EXTENDED_PORTUGUESE_GERMAN_DANISH] = {
+        [0x20] = "\u00c3",
+        [0x21] = "\u00e3",
+        [0x22] = "\u00cd",
+        [0x23] = "\u00cc",
+        [0x24] = "\u00ec",
+        [0x25] = "\u00d2",
+        [0x26] = "\u00f2",
+        [0x27] = "\u00d5",
+        [0x28] = "\u00f5",
+        [0x29] = "{",
+        [0x2a] = "}",
+        [0x2b] = "\\",
+        [0x2c] = "^",
+        [0x2d] = "_",
+        [0x2e] = "|",
+        [0x2f] = "~",
+        [0x30] = "\u00c4",
+        [0x31] = "\u00e4",
+        [0x32] = "\u00d6",
+        [0x33] = "\u00f6",
+        [0x34] = "\u00df",
+        [0x35] = "\u00a5",
+        [0x36] = "\u00a4",
+        [0x37] = "\u00a6",
+        [0x38] = "\u00c5",
+        [0x39] = "\u00e5",
+        [0x3a] = "\u00d8",
+        [0x3b] = "\u00f8",
+        [0x3c] = "\u250c",
+        [0x3d] = "\u2510",
+        [0x3e] = "\u2514",
+        [0x3f] = "\u2518",
+    },
+};
+
+static const unsigned char pac2_attribs[32][3] = // Color, font, ident
+{
+    { CCCOL_WHITE,   CCFONT_REGULAR,            0 },  // 0x40 || 0x60
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         0 },  // 0x41 || 0x61
+    { CCCOL_GREEN,   CCFONT_REGULAR,            0 },  // 0x42 || 0x62
+    { CCCOL_GREEN,   CCFONT_UNDERLINED,         0 },  // 0x43 || 0x63
+    { CCCOL_BLUE,    CCFONT_REGULAR,            0 },  // 0x44 || 0x64
+    { CCCOL_BLUE,    CCFONT_UNDERLINED,         0 },  // 0x45 || 0x65
+    { CCCOL_CYAN,    CCFONT_REGULAR,            0 },  // 0x46 || 0x66
+    { CCCOL_CYAN,    CCFONT_UNDERLINED,         0 },  // 0x47 || 0x67
+    { CCCOL_RED,     CCFONT_REGULAR,            0 },  // 0x48 || 0x68
+    { CCCOL_RED,     CCFONT_UNDERLINED,         0 },  // 0x49 || 0x69
+    { CCCOL_YELLOW,  CCFONT_REGULAR,            0 },  // 0x4a || 0x6a
+    { CCCOL_YELLOW,  CCFONT_UNDERLINED,         0 },  // 0x4b || 0x6b
+    { CCCOL_MAGENTA, CCFONT_REGULAR,            0 },  // 0x4c || 0x6c
+    { CCCOL_MAGENTA, CCFONT_UNDERLINED,         0 },  // 0x4d || 0x6d
+    { CCCOL_WHITE,   CCFONT_ITALICS,            0 },  // 0x4e || 0x6e
+    { CCCOL_WHITE,   CCFONT_UNDERLINED_ITALICS, 0 },  // 0x4f || 0x6f
+    { CCCOL_WHITE,   CCFONT_REGULAR,            0 },  // 0x50 || 0x70
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         0 },  // 0x51 || 0x71
+    { CCCOL_WHITE,   CCFONT_REGULAR,            4 },  // 0x52 || 0x72
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         4 },  // 0x53 || 0x73
+    { CCCOL_WHITE,   CCFONT_REGULAR,            8 },  // 0x54 || 0x74
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         8 },  // 0x55 || 0x75
+    { CCCOL_WHITE,   CCFONT_REGULAR,           12 },  // 0x56 || 0x76
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        12 },  // 0x57 || 0x77
+    { CCCOL_WHITE,   CCFONT_REGULAR,           16 },  // 0x58 || 0x78
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        16 },  // 0x59 || 0x79
+    { CCCOL_WHITE,   CCFONT_REGULAR,           20 },  // 0x5a || 0x7a
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        20 },  // 0x5b || 0x7b
+    { CCCOL_WHITE,   CCFONT_REGULAR,           24 },  // 0x5c || 0x7c
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        24 },  // 0x5d || 0x7d
+    { CCCOL_WHITE,   CCFONT_REGULAR,           28 },  // 0x5e || 0x7e
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        28 }   // 0x5f || 0x7f
+    /* total 32 entries */
+};
+
+struct Screen {
+    /* +1 is used to compensate null character of string */
+    uint8_t characters[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    uint8_t charsets[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    uint8_t colors[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    uint8_t fonts[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    /*
+     * Bitmask of used rows; if a bit is not set, the
+     * corresponding row is not used.
+     * for setting row 1  use row | (1 << 0)
+     * for setting row 15 use row | (1 << 14)
+     */
+    int16_t row_used;
+};
+
+typedef struct CCaptionSubContext {
+    AVClass *class;
+    int real_time;
+    struct Screen screen[2];
+    int active_screen;
+    uint8_t cursor_row;
+    uint8_t cursor_column;
+    uint8_t cursor_color;
+    uint8_t cursor_font;
+    uint8_t cursor_charset;
+    AVBPrint buffer;
+    int buffer_changed;
+    int rollup;
+    enum cc_mode mode;
+    int64_t start_time;
+    /* visible screen time */
+    int64_t startv_time;
+    int64_t end_time;
+    int screen_touched;
+    int64_t last_real_time;
+    char prev_cmd[2];
+    /* buffer to store pkt data */
+    uint8_t *pktbuf;
+    int pktbuf_size;
+    int readorder;
+} CCaptionSubContext;
+
+
+static av_cold int init_decoder(AVCodecContext *avctx)
+{
+    int ret;
+    CCaptionSubContext *ctx = avctx->priv_data;
+
+    av_bprint_init(&ctx->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    /* taking by default roll up to 2 */
+    ctx->mode = CCMODE_ROLLUP;
+    ctx->rollup = 2;
+    ctx->cursor_row = 10;
+    ret = ff_ass_subtitle_header(avctx, "Monospace",
+                                 ASS_DEFAULT_FONT_SIZE,
+                                 ASS_DEFAULT_COLOR,
+                                 ASS_DEFAULT_BACK_COLOR,
+                                 ASS_DEFAULT_BOLD,
+                                 ASS_DEFAULT_ITALIC,
+                                 ASS_DEFAULT_UNDERLINE,
+                                 3,
+                                 ASS_DEFAULT_ALIGNMENT);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return ret;
+}
+
+static av_cold int close_decoder(AVCodecContext *avctx)
+{
+    CCaptionSubContext *ctx = avctx->priv_data;
+    av_bprint_finalize(&ctx->buffer, NULL);
+    av_freep(&ctx->pktbuf);
+    ctx->pktbuf_size = 0;
+    return 0;
+}
+
+static void flush_decoder(AVCodecContext *avctx)
+{
+    CCaptionSubContext *ctx = avctx->priv_data;
+    ctx->screen[0].row_used = 0;
+    ctx->screen[1].row_used = 0;
+    ctx->prev_cmd[0] = 0;
+    ctx->prev_cmd[1] = 0;
+    ctx->mode = CCMODE_ROLLUP;
+    ctx->rollup = 2;
+    ctx->cursor_row = 10;
+    ctx->cursor_column = 0;
+    ctx->cursor_font = 0;
+    ctx->cursor_color = 0;
+    ctx->cursor_charset = 0;
+    ctx->active_screen = 0;
+    ctx->last_real_time = 0;
+    ctx->screen_touched = 0;
+    ctx->buffer_changed = 0;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        ctx->readorder = 0;
+    av_bprint_clear(&ctx->buffer);
+}
+
+/**
+ * @param ctx closed caption context just to print log
+ */
+static void write_char(CCaptionSubContext *ctx, struct Screen *screen, char ch)
+{
+    uint8_t col = ctx->cursor_column;
+    char *row = screen->characters[ctx->cursor_row];
+    char *font = screen->fonts[ctx->cursor_row];
+    char *charset = screen->charsets[ctx->cursor_row];
+
+    if (col < SCREEN_COLUMNS) {
+        row[col] = ch;
+        font[col] = ctx->cursor_font;
+        charset[col] = ctx->cursor_charset;
+        ctx->cursor_charset = CCSET_BASIC_AMERICAN;
+        if (ch) ctx->cursor_column++;
+        return;
+    }
+    /* We have extra space at end only for null character */
+    else if (col == SCREEN_COLUMNS && ch == 0) {
+        row[col] = ch;
+        return;
+    }
+    else {
+        av_log(ctx, AV_LOG_WARNING, "Data Ignored since exceeding screen width\n");
+        return;
+    }
+}
+
+/**
+ * This function after validating parity bit, also remove it from data pair.
+ * The first byte doesn't pass parity, we replace it with a solid blank
+ * and process the pair.
+ * If the second byte doesn't pass parity, it returns INVALIDDATA
+ * user can ignore the whole pair and pass the other pair.
+ */
+static int validate_cc_data_pair(uint8_t *cc_data_pair)
+{
+    uint8_t cc_valid = (*cc_data_pair & 4) >>2;
+    uint8_t cc_type = *cc_data_pair & 3;
+
+    if (!cc_valid)
+        return AVERROR_INVALIDDATA;
+
+    // if EIA-608 data then verify parity.
+    if (cc_type==0 || cc_type==1) {
+        if (!av_parity(cc_data_pair[2])) {
+            return AVERROR_INVALIDDATA;
+        }
+        if (!av_parity(cc_data_pair[1])) {
+            cc_data_pair[1]=0x7F;
+        }
+    }
+
+    //Skip non-data
+    if ((cc_data_pair[0] == 0xFA || cc_data_pair[0] == 0xFC || cc_data_pair[0] == 0xFD)
+         && (cc_data_pair[1] & 0x7F) == 0 && (cc_data_pair[2] & 0x7F) == 0)
+        return AVERROR_PATCHWELCOME;
+
+    //skip 708 data
+    if (cc_type == 3 || cc_type == 2)
+        return AVERROR_PATCHWELCOME;
+
+    /* remove parity bit */
+    cc_data_pair[1] &= 0x7F;
+    cc_data_pair[2] &= 0x7F;
+
+    return 0;
+}
+
+static struct Screen *get_writing_screen(CCaptionSubContext *ctx)
+{
+    switch (ctx->mode) {
+    case CCMODE_POPON:
+        // use Inactive screen
+        return ctx->screen + !ctx->active_screen;
+    case CCMODE_PAINTON:
+    case CCMODE_ROLLUP:
+    case CCMODE_TEXT:
+        // use active screen
+        return ctx->screen + ctx->active_screen;
+    }
+    /* It was never an option */
+    return NULL;
+}
+
+static void roll_up(CCaptionSubContext *ctx)
+{
+    struct Screen *screen;
+    int i, keep_lines;
+
+    if (ctx->mode == CCMODE_TEXT)
+        return;
+
+    screen = get_writing_screen(ctx);
+
+    /* +1 signify cursor_row starts from 0
+     * Can't keep lines less then row cursor pos
+     */
+    keep_lines = FFMIN(ctx->cursor_row + 1, ctx->rollup);
+
+    for (i = 0; i < SCREEN_ROWS; i++) {
+        if (i > ctx->cursor_row - keep_lines && i <= ctx->cursor_row)
+            continue;
+        UNSET_FLAG(screen->row_used, i);
+    }
+
+    for (i = 0; i < keep_lines && screen->row_used; i++) {
+        const int i_row = ctx->cursor_row - keep_lines + i + 1;
+
+        memcpy(screen->characters[i_row], screen->characters[i_row+1], SCREEN_COLUMNS);
+        memcpy(screen->colors[i_row], screen->colors[i_row+1], SCREEN_COLUMNS);
+        memcpy(screen->fonts[i_row], screen->fonts[i_row+1], SCREEN_COLUMNS);
+        memcpy(screen->charsets[i_row], screen->charsets[i_row+1], SCREEN_COLUMNS);
+        if (CHECK_FLAG(screen->row_used, i_row + 1))
+            SET_FLAG(screen->row_used, i_row);
+    }
+
+    UNSET_FLAG(screen->row_used, ctx->cursor_row);
+}
+
+static int capture_screen(CCaptionSubContext *ctx)
+{
+    int i, j, tab = 0;
+    struct Screen *screen = ctx->screen + ctx->active_screen;
+    enum cc_font prev_font = CCFONT_REGULAR;
+    av_bprint_clear(&ctx->buffer);
+
+    for (i = 0; screen->row_used && i < SCREEN_ROWS; i++)
+    {
+        if (CHECK_FLAG(screen->row_used, i)) {
+            const char *row = screen->characters[i];
+            const char *charset = screen->charsets[i];
+            j = 0;
+            while (row[j] == ' ' && charset[j] == CCSET_BASIC_AMERICAN)
+                j++;
+            if (!tab || j < tab)
+                tab = j;
+        }
+    }
+
+    for (i = 0; screen->row_used && i < SCREEN_ROWS; i++)
+    {
+        if (CHECK_FLAG(screen->row_used, i)) {
+            const char *row = screen->characters[i];
+            const char *font = screen->fonts[i];
+            const char *charset = screen->charsets[i];
+            const char *override;
+            int x, y, seen_char = 0;
+            j = 0;
+
+            /* skip leading space */
+            while (row[j] == ' ' && charset[j] == CCSET_BASIC_AMERICAN && j < tab)
+                j++;
+
+            x = ASS_DEFAULT_PLAYRESX * (0.1 + 0.0250 * j);
+            y = ASS_DEFAULT_PLAYRESY * (0.1 + 0.0533 * i);
+            av_bprintf(&ctx->buffer, "{\\an7}{\\pos(%d,%d)}", x, y);
+
+            for (; j < SCREEN_COLUMNS; j++) {
+                const char *e_tag = "", *s_tag = "";
+
+                if (row[j] == 0)
+                    break;
+
+                if (prev_font != font[j]) {
+                    switch (prev_font) {
+                    case CCFONT_ITALICS:
+                        e_tag = "{\\i0}";
+                        break;
+                    case CCFONT_UNDERLINED:
+                        e_tag = "{\\u0}";
+                        break;
+                    case CCFONT_UNDERLINED_ITALICS:
+                        e_tag = "{\\u0}{\\i0}";
+                        break;
+                    }
+                    switch (font[j]) {
+                    case CCFONT_ITALICS:
+                        s_tag = "{\\i1}";
+                        break;
+                    case CCFONT_UNDERLINED:
+                        s_tag = "{\\u1}";
+                        break;
+                    case CCFONT_UNDERLINED_ITALICS:
+                        s_tag = "{\\u1}{\\i1}";
+                        break;
+                    }
+                }
+                prev_font = font[j];
+                override = charset_overrides[(int)charset[j]][(int)row[j]];
+                if (override) {
+                    av_bprintf(&ctx->buffer, "%s%s%s", e_tag, s_tag, override);
+                    seen_char = 1;
+                } else if (row[j] == ' ' && !seen_char) {
+                    av_bprintf(&ctx->buffer, "%s%s\\h", e_tag, s_tag);
+                } else {
+                    av_bprintf(&ctx->buffer, "%s%s%c", e_tag, s_tag, row[j]);
+                    seen_char = 1;
+                }
+
+            }
+            av_bprintf(&ctx->buffer, "\\N");
+        }
+    }
+    if (!av_bprint_is_complete(&ctx->buffer))
+        return AVERROR(ENOMEM);
+    if (screen->row_used && ctx->buffer.len >= 2) {
+        ctx->buffer.len -= 2;
+        ctx->buffer.str[ctx->buffer.len] = 0;
+    }
+    ctx->buffer_changed = 1;
+    return 0;
+}
+
+static int reap_screen(CCaptionSubContext *ctx, int64_t pts)
+{
+    ctx->start_time = ctx->startv_time;
+    ctx->startv_time = pts;
+    ctx->end_time = pts;
+    return capture_screen(ctx);
+}
+
+static void handle_textattr(CCaptionSubContext *ctx, uint8_t hi, uint8_t lo)
+{
+    int i = lo - 0x20;
+    struct Screen *screen = get_writing_screen(ctx);
+
+    if (i >= 32)
+        return;
+
+    ctx->cursor_color = pac2_attribs[i][0];
+    ctx->cursor_font = pac2_attribs[i][1];
+
+    SET_FLAG(screen->row_used, ctx->cursor_row);
+    write_char(ctx, screen, ' ');
+}
+
+static void handle_pac(CCaptionSubContext *ctx, uint8_t hi, uint8_t lo)
+{
+    static const int8_t row_map[] = {
+        11, -1, 1, 2, 3, 4, 12, 13, 14, 15, 5, 6, 7, 8, 9, 10
+    };
+    const int index = ( (hi<<1) & 0x0e) | ( (lo>>5) & 0x01 );
+    struct Screen *screen = get_writing_screen(ctx);
+    int indent, i;
+
+    if (row_map[index] <= 0) {
+        av_log(ctx, AV_LOG_DEBUG, "Invalid pac index encountered\n");
+        return;
+    }
+
+    lo &= 0x1f;
+
+    ctx->cursor_row = row_map[index] - 1;
+    ctx->cursor_color =  pac2_attribs[lo][0];
+    ctx->cursor_font = pac2_attribs[lo][1];
+    ctx->cursor_charset = CCSET_BASIC_AMERICAN;
+    ctx->cursor_column = 0;
+    indent = pac2_attribs[lo][2];
+    for (i = 0; i < indent; i++) {
+        write_char(ctx, screen, ' ');
+    }
+}
+
+/**
+ * @param pts it is required to set end time
+ */
+static void handle_edm(CCaptionSubContext *ctx, int64_t pts)
+{
+    struct Screen *screen = ctx->screen + ctx->active_screen;
+
+    // In buffered mode, keep writing to screen until it is wiped.
+    // Before wiping the display, capture contents to emit subtitle.
+    if (!ctx->real_time)
+        reap_screen(ctx, pts);
+
+    screen->row_used = 0;
+
+    // In realtime mode, emit an empty caption so the last one doesn't
+    // stay on the screen.
+    if (ctx->real_time)
+        reap_screen(ctx, pts);
+}
+
+static void handle_eoc(CCaptionSubContext *ctx, int64_t pts)
+{
+    // In buffered mode, we wait til the *next* EOC and
+    // reap what was already on the screen since the last EOC.
+    if (!ctx->real_time)
+        handle_edm(ctx,pts);
+
+    ctx->active_screen = !ctx->active_screen;
+    ctx->cursor_column = 0;
+
+    // In realtime mode, we display the buffered contents (after
+    // flipping the buffer to active above) as soon as EOC arrives.
+    if (ctx->real_time)
+        reap_screen(ctx, pts);
+}
+
+static void handle_delete_end_of_row(CCaptionSubContext *ctx, char hi, char lo)
+{
+    struct Screen *screen = get_writing_screen(ctx);
+    write_char(ctx, screen, 0);
+}
+
+static void handle_char(CCaptionSubContext *ctx, char hi, char lo, int64_t pts)
+{
+    struct Screen *screen = get_writing_screen(ctx);
+
+    SET_FLAG(screen->row_used, ctx->cursor_row);
+
+    switch (hi) {
+      case 0x11:
+        ctx->cursor_charset = CCSET_SPECIAL_AMERICAN;
+        break;
+      case 0x12:
+        if (ctx->cursor_column > 0)
+            ctx->cursor_column -= 1;
+        ctx->cursor_charset = CCSET_EXTENDED_SPANISH_FRENCH_MISC;
+        break;
+      case 0x13:
+        if (ctx->cursor_column > 0)
+            ctx->cursor_column -= 1;
+        ctx->cursor_charset = CCSET_EXTENDED_PORTUGUESE_GERMAN_DANISH;
+        break;
+      default:
+        ctx->cursor_charset = CCSET_BASIC_AMERICAN;
+        write_char(ctx, screen, hi);
+        break;
+    }
+
+    if (lo) {
+        write_char(ctx, screen, lo);
+    }
+    write_char(ctx, screen, 0);
+
+    if (ctx->mode != CCMODE_POPON)
+        ctx->screen_touched = 1;
+
+    if (lo)
+       ff_dlog(ctx, "(%c,%c)\n", hi, lo);
+    else
+       ff_dlog(ctx, "(%c)\n", hi);
+}
+
+static void process_cc608(CCaptionSubContext *ctx, int64_t pts, uint8_t hi, uint8_t lo)
+{
+    if (hi == ctx->prev_cmd[0] && lo == ctx->prev_cmd[1]) {
+        /* ignore redundant command */
+        return;
+    }
+
+    /* set prev command */
+    ctx->prev_cmd[0] = hi;
+    ctx->prev_cmd[1] = lo;
+
+    if ( (hi == 0x10 && (lo >= 0x40 && lo <= 0x5f)) ||
+       ( (hi >= 0x11 && hi <= 0x17) && (lo >= 0x40 && lo <= 0x7f) ) ) {
+        handle_pac(ctx, hi, lo);
+    } else if ( ( hi == 0x11 && lo >= 0x20 && lo <= 0x2f ) ||
+                ( hi == 0x17 && lo >= 0x2e && lo <= 0x2f) ) {
+        handle_textattr(ctx, hi, lo);
+    } else if (hi == 0x14 || hi == 0x15 || hi == 0x1c) {
+        switch (lo) {
+        case 0x20:
+            /* resume caption loading */
+            ctx->mode = CCMODE_POPON;
+            break;
+        case 0x24:
+            handle_delete_end_of_row(ctx, hi, lo);
+            break;
+        case 0x25:
+        case 0x26:
+        case 0x27:
+            ctx->rollup = lo - 0x23;
+            ctx->mode = CCMODE_ROLLUP;
+            break;
+        case 0x29:
+            /* resume direct captioning */
+            ctx->mode = CCMODE_PAINTON;
+            break;
+        case 0x2b:
+            /* resume text display */
+            ctx->mode = CCMODE_TEXT;
+            break;
+        case 0x2c:
+            /* erase display memory */
+            handle_edm(ctx, pts);
+            break;
+        case 0x2d:
+            /* carriage return */
+            ff_dlog(ctx, "carriage return\n");
+            if (!ctx->real_time)
+                reap_screen(ctx, pts);
+            roll_up(ctx);
+            ctx->cursor_column = 0;
+            break;
+        case 0x2e:
+            /* erase buffered (non displayed) memory */
+            // Only in realtime mode. In buffered mode, we re-use the inactive screen
+            // for our own buffering.
+            if (ctx->real_time) {
+                struct Screen *screen = ctx->screen + !ctx->active_screen;
+                screen->row_used = 0;
+            }
+            break;
+        case 0x2f:
+            /* end of caption */
+            ff_dlog(ctx, "handle_eoc\n");
+            handle_eoc(ctx, pts);
+            break;
+        default:
+            ff_dlog(ctx, "Unknown command 0x%hhx 0x%hhx\n", hi, lo);
+            break;
+        }
+    } else if (hi >= 0x11 && hi <= 0x13) {
+        /* Special characters */
+        handle_char(ctx, hi, lo, pts);
+    } else if (hi >= 0x20) {
+        /* Standard characters (always in pairs) */
+        handle_char(ctx, hi, lo, pts);
+        ctx->prev_cmd[0] = ctx->prev_cmd[1] = 0;
+    } else if (hi == 0x17 && lo >= 0x21 && lo <= 0x23) {
+        int i;
+        /* Tab offsets (spacing) */
+        for (i = 0; i < lo - 0x20; i++) {
+            handle_char(ctx, ' ', 0, pts);
+        }
+    } else {
+        /* Ignoring all other non data code */
+        ff_dlog(ctx, "Unknown command 0x%hhx 0x%hhx\n", hi, lo);
+    }
+}
+
+static int decode(AVCodecContext *avctx, void *data, int *got_sub, AVPacket *avpkt)
+{
+    CCaptionSubContext *ctx = avctx->priv_data;
+    AVSubtitle *sub = data;
+    const int64_t start_time = sub->pts;
+    uint8_t *bptr = NULL;
+    int len = avpkt->size;
+    int ret = 0;
+    int i;
+
+    av_fast_padded_malloc(&ctx->pktbuf, &ctx->pktbuf_size, len);
+    if (!ctx->pktbuf) {
+        av_log(ctx, AV_LOG_WARNING, "Insufficient Memory of %d truncated to %d\n", len, ctx->pktbuf_size);
+        return AVERROR(ENOMEM);
+    }
+    memcpy(ctx->pktbuf, avpkt->data, len);
+    bptr = ctx->pktbuf;
+
+    for (i  = 0; i < len; i += 3) {
+        uint8_t cc_type = *(bptr + i) & 3;
+        if (validate_cc_data_pair(bptr + i))
+            continue;
+        /* ignoring data field 1 */
+        if(cc_type == 1)
+            continue;
+        else
+            process_cc608(ctx, start_time, *(bptr + i + 1) & 0x7f, *(bptr + i + 2) & 0x7f);
+
+        if (!ctx->buffer_changed)
+            continue;
+        ctx->buffer_changed = 0;
+
+        if (*ctx->buffer.str || ctx->real_time)
+        {
+            ff_dlog(ctx, "cdp writing data (%s)\n",ctx->buffer.str);
+            ret = ff_ass_add_rect(sub, ctx->buffer.str, ctx->readorder++, 0, NULL, NULL);
+            if (ret < 0)
+                return ret;
+            sub->pts = ctx->start_time;
+            if (!ctx->real_time)
+                sub->end_display_time = av_rescale_q(ctx->end_time - ctx->start_time,
+                                                     AV_TIME_BASE_Q, ms_tb);
+            else
+                sub->end_display_time = -1;
+            ctx->buffer_changed = 0;
+            ctx->last_real_time = sub->pts;
+            ctx->screen_touched = 0;
+        }
+    }
+
+    if (ctx->real_time && ctx->screen_touched &&
+        sub->pts > ctx->last_real_time + av_rescale_q(200, ms_tb, AV_TIME_BASE_Q)) {
+        ctx->last_real_time = sub->pts;
+        ctx->screen_touched = 0;
+
+        capture_screen(ctx);
+        ctx->buffer_changed = 0;
+
+        ret = ff_ass_add_rect(sub, ctx->buffer.str, ctx->readorder++, 0, NULL, NULL);
+        if (ret < 0)
+            return ret;
+        sub->end_display_time = -1;
+    }
+
+    *got_sub = sub->num_rects > 0;
+    return ret;
+}
+
+#define OFFSET(x) offsetof(CCaptionSubContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "real_time", "emit subtitle events as they are decoded for real-time display", OFFSET(real_time), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, SD },
+    {NULL}
+};
+
+static const AVClass ccaption_dec_class = {
+    .class_name = "Closed caption Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ccaption_decoder = {
+    .name           = "cc_dec",
+    .long_name      = NULL_IF_CONFIG_SMALL("Closed Caption (EIA-608 / CEA-708)"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_EIA_608,
+    .priv_data_size = sizeof(CCaptionSubContext),
+    .init           = init_decoder,
+    .close          = close_decoder,
+    .flush          = flush_decoder,
+    .decode         = decode,
+    .priv_class     = &ccaption_dec_class,
+};
diff --git a/libavcodec/cdgraphics.c b/libavcodec/cdgraphics.c
index 3b68f45..cf3f01a 100644
--- a/libavcodec/cdgraphics.c
+++ b/libavcodec/cdgraphics.c
@@ -2,20 +2,20 @@
  * CD Graphics Video Decoder
  * Copyright (c) 2009 Michael Tison
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,6 +49,7 @@
 #define CDG_INST_TILE_BLOCK        6
 #define CDG_INST_SCROLL_PRESET    20
 #define CDG_INST_SCROLL_COPY      24
+#define CDG_INST_TRANSPARENT_COL  28
 #define CDG_INST_LOAD_PAL_LO      30
 #define CDG_INST_LOAD_PAL_HIGH    31
 #define CDG_INST_TILE_BLOCK_XOR   38
@@ -67,6 +68,8 @@ typedef struct CDGraphicsContext {
     AVFrame *frame;
     int hscroll;
     int vscroll;
+    int transparency;
+    int cleared;
 } CDGraphicsContext;
 
 static av_cold int cdg_decode_init(AVCodecContext *avctx)
@@ -76,12 +79,10 @@ static av_cold int cdg_decode_init(AVCodecContext *avctx)
     cc->frame = av_frame_alloc();
     if (!cc->frame)
         return AVERROR(ENOMEM);
+    cc->transparency = -1;
 
-    avctx->width   = CDG_FULL_WIDTH;
-    avctx->height  = CDG_FULL_HEIGHT;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
-
-    return 0;
+    return ff_set_dimensions(avctx, CDG_FULL_WIDTH, CDG_FULL_HEIGHT);
 }
 
 static void cdg_border_preset(CDGraphicsContext *cc, uint8_t *data)
@@ -119,7 +120,9 @@ static void cdg_load_palette(CDGraphicsContext *cc, uint8_t *data, int low)
         r = ((color >> 8) & 0x000F) * 17;
         g = ((color >> 4) & 0x000F) * 17;
         b = ((color     ) & 0x000F) * 17;
-        palette[i + array_offset] = r << 16 | g << 8 | b;
+        palette[i + array_offset] = 0xFFU << 24 | r << 16 | g << 8 | b;
+        if (cc->transparency >= 0)
+            palette[cc->transparency] &= 0xFFFFFF;
     }
     cc->frame->palette_has_changed = 1;
 }
@@ -265,20 +268,28 @@ static int cdg_decode_frame(AVCodecContext *avctx,
     int buf_size       = avpkt->size;
     int ret;
     uint8_t command, inst;
-    uint8_t cdg_data[CDG_DATA_SIZE];
+    uint8_t cdg_data[CDG_DATA_SIZE] = {0};
     AVFrame *frame = data;
     CDGraphicsContext *cc = avctx->priv_data;
 
-    bytestream2_init(&gb, avpkt->data, avpkt->size);
+    if (buf_size < CDG_MINIMUM_PKT_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "buffer too small for decoder\n");
+        return AVERROR(EINVAL);
+    }
+    if (buf_size > CDG_HEADER_SIZE + CDG_DATA_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "buffer too big for decoder\n");
+        return AVERROR(EINVAL);
+    }
 
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-    ret = ff_reget_buffer(avctx, cc->frame);
-    if (ret) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, cc->frame)) < 0)
         return ret;
-    }
-    if (!avctx->frame_number)
+    if (!cc->cleared) {
         memset(cc->frame->data[0], 0, cc->frame->linesize[0] * avctx->height);
+        memset(cc->frame->data[1], 0, AVPALETTE_SIZE);
+        cc->cleared = 1;
+    }
 
     command = bytestream2_get_byte(&gb);
     inst    = bytestream2_get_byte(&gb);
@@ -325,11 +336,8 @@ static int cdg_decode_frame(AVCodecContext *avctx,
                 return AVERROR(EINVAL);
             }
 
-            ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
-            if (ret) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
                 return ret;
-            }
 
             cdg_scroll(cc, cdg_data, frame, inst == CDG_INST_SCROLL_COPY);
             av_frame_unref(cc->frame);
@@ -337,6 +345,9 @@ static int cdg_decode_frame(AVCodecContext *avctx,
             if (ret < 0)
                 return ret;
             break;
+        case CDG_INST_TRANSPARENT_COL:
+            cc->transparency = cdg_data[0] & 0xF;
+            break;
         default:
             break;
         }
diff --git a/libavcodec/cdxl.c b/libavcodec/cdxl.c
index e7cb79c..78f5d50 100644
--- a/libavcodec/cdxl.c
+++ b/libavcodec/cdxl.c
@@ -2,33 +2,41 @@
  * CDXL video decoder
  * Copyright (c) 2011-2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * Commodore CDXL video decoder
+ * @author Paul B Mahol
+ */
+
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #define BIT_PLANAR   0x00
-#define BYTE_PLANAR  0x20
-#define CHUNKY       0x40
+#define CHUNKY       0x20
+#define BYTE_PLANAR  0x40
 #define BIT_LINE     0x80
 #define BYTE_LINE    0xC0
 
@@ -64,40 +72,53 @@ static void import_palette(CDXLVideoContext *c, uint32_t *new_palette)
         unsigned r   = ((rgb >> 8) & 0xF) * 0x11;
         unsigned g   = ((rgb >> 4) & 0xF) * 0x11;
         unsigned b   =  (rgb       & 0xF) * 0x11;
-        AV_WN32(&new_palette[i], (r << 16) | (g << 8) | b);
+        AV_WN32(&new_palette[i], (0xFFU << 24) | (r << 16) | (g << 8) | b);
     }
 }
 
 static void bitplanar2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int x, y, plane;
 
-    bitstream_init8(&bc, c->video, c->video_size);
+    if (init_get_bits8(&gb, c->video, c->video_size) < 0)
+        return;
     for (plane = 0; plane < c->bpp; plane++) {
         for (y = 0; y < c->avctx->height; y++) {
             for (x = 0; x < c->avctx->width; x++)
-                out[linesize * y + x] |= bitstream_read_bit(&bc) << plane;
-            bitstream_skip(&bc, c->padded_bits);
+                out[linesize * y + x] |= get_bits1(&gb) << plane;
+            skip_bits(&gb, c->padded_bits);
         }
     }
 }
 
 static void bitline2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
 {
-    BitstreamContext bc;
+    GetBitContext  gb;
     int x, y, plane;
 
-    bitstream_init8(&bc, c->video, c->video_size);
+    if (init_get_bits8(&gb, c->video, c->video_size) < 0)
+        return;
     for (y = 0; y < c->avctx->height; y++) {
         for (plane = 0; plane < c->bpp; plane++) {
             for (x = 0; x < c->avctx->width; x++)
-                out[linesize * y + x] |= bitstream_read_bit(&bc) << plane;
-            bitstream_skip(&bc, c->padded_bits);
+                out[linesize * y + x] |= get_bits1(&gb) << plane;
+            skip_bits(&gb, c->padded_bits);
         }
     }
 }
 
+static void chunky2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
+{
+    GetByteContext gb;
+    int y;
+
+    bytestream2_init(&gb, c->video, c->video_size);
+    for (y = 0; y < c->avctx->height; y++) {
+        bytestream2_get_buffer(&gb, out + linesize * y, c->avctx->width * 3);
+    }
+}
+
 static void import_format(CDXLVideoContext *c, int linesize, uint8_t *out)
 {
     memset(out, 0, linesize * c->avctx->height);
@@ -109,6 +130,9 @@ static void import_format(CDXLVideoContext *c, int linesize, uint8_t *out)
     case BIT_LINE:
         bitline2chunky(c, linesize, out);
         break;
+    case CHUNKY:
+        chunky2chunky(c, linesize, out);
+        break;
     }
 }
 
@@ -116,10 +140,16 @@ static void cdxl_decode_rgb(CDXLVideoContext *c, AVFrame *frame)
 {
     uint32_t *new_palette = (uint32_t *)frame->data[1];
 
+    memset(frame->data[1], 0, AVPALETTE_SIZE);
     import_palette(c, new_palette);
     import_format(c, frame->linesize[0], frame->data[0]);
 }
 
+static void cdxl_decode_raw(CDXLVideoContext *c, AVFrame *frame)
+{
+    import_format(c, frame->linesize[0], frame->data[0]);
+}
+
 static void cdxl_decode_ham6(CDXLVideoContext *c, AVFrame *frame)
 {
     AVCodecContext *avctx = c->avctx;
@@ -232,7 +262,7 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     if (c->bpp < 1)
         return AVERROR_INVALIDDATA;
-    if (c->format != BIT_PLANAR && c->format != BIT_LINE) {
+    if (c->format != BIT_PLANAR && c->format != BIT_LINE && c->format != CHUNKY) {
         avpriv_request_sample(avctx, "Pixel format 0x%0x", c->format);
         return AVERROR_PATCHWELCOME;
     }
@@ -240,26 +270,30 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    aligned_width = FFALIGN(c->avctx->width, 16);
+    if (c->format == CHUNKY)
+        aligned_width = avctx->width;
+    else
+        aligned_width = FFALIGN(c->avctx->width, 16);
     c->padded_bits  = aligned_width - c->avctx->width;
-    if (c->video_size < aligned_width * avctx->height * c->bpp / 8)
+    if (c->video_size < aligned_width * avctx->height * (int64_t)c->bpp / 8)
         return AVERROR_INVALIDDATA;
-    if (!encoding && c->palette_size && c->bpp <= 8) {
+    if (!encoding && c->palette_size && c->bpp <= 8 && c->format != CHUNKY) {
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
-    } else if (encoding == 1 && (c->bpp == 6 || c->bpp == 8)) {
+    } else if (encoding == 1 && (c->bpp == 6 || c->bpp == 8) && c->format != CHUNKY) {
         if (c->palette_size != (1 << (c->bpp - 1)))
             return AVERROR_INVALIDDATA;
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
+    } else if (!encoding && c->bpp == 24 && c->format == CHUNKY &&
+               !c->palette_size) {
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
     } else {
-        avpriv_request_sample(avctx, "Encoding %d and bpp %d",
-                              encoding, c->bpp);
+        avpriv_request_sample(avctx, "Encoding %d, bpp %d and format 0x%x",
+                              encoding, c->bpp, c->format);
         return AVERROR_PATCHWELCOME;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
 
     if (encoding) {
@@ -271,8 +305,10 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
             cdxl_decode_ham8(c, p);
         else
             cdxl_decode_ham6(c, p);
-    } else {
+    } else if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
         cdxl_decode_rgb(c, p);
+    } else {
+        cdxl_decode_raw(c, p);
     }
     *got_frame = 1;
 
@@ -283,7 +319,7 @@ static av_cold int cdxl_decode_end(AVCodecContext *avctx)
 {
     CDXLVideoContext *c = avctx->priv_data;
 
-    av_free(c->new_video);
+    av_freep(&c->new_video);
 
     return 0;
 }
diff --git a/libavcodec/celp_filters.c b/libavcodec/celp_filters.c
index 61474f5..fafedd9 100644
--- a/libavcodec/celp_filters.c
+++ b/libavcodec/celp_filters.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "avcodec.h"
 #include "celp_filters.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 
 void ff_celp_convolve_circ(int16_t* fc_out, const int16_t* fc_in,
@@ -66,7 +67,7 @@ int ff_celp_lp_synthesis_filter(int16_t *out, const int16_t *filter_coeffs,
     for (n = 0; n < buffer_length; n++) {
         int sum = -rounder, sum1;
         for (i = 1; i <= filter_length; i++)
-            sum += filter_coeffs[i-1] * out[n-i];
+            sum += (unsigned)(filter_coeffs[i-1] * out[n-i]);
 
         sum1 = ((-sum >> 12) + in[n]) >> shift;
         sum  = av_clip_int16(sum1);
@@ -104,6 +105,8 @@ void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs,
     c -= filter_coeffs[1] * filter_coeffs[0];
     c -= filter_coeffs[0] * b;
 
+    av_assert2((filter_length&1)==0 && filter_length>=4);
+
     old_out0 = out[-4];
     old_out1 = out[-3];
     old_out2 = out[-2];
@@ -205,3 +208,12 @@ void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs,
             out[n] += filter_coeffs[i-1] * in[n-i];
     }
 }
+
+void ff_celp_filter_init(CELPFContext *c)
+{
+    c->celp_lp_synthesis_filterf        = ff_celp_lp_synthesis_filterf;
+    c->celp_lp_zero_synthesis_filterf   = ff_celp_lp_zero_synthesis_filterf;
+
+    if(HAVE_MIPSFPU)
+        ff_celp_filter_init_mips(c);
+}
diff --git a/libavcodec/celp_filters.h b/libavcodec/celp_filters.h
index c328258..f644ec3 100644
--- a/libavcodec/celp_filters.h
+++ b/libavcodec/celp_filters.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,55 @@
 
 #include <stdint.h>
 
+typedef struct CELPFContext {
+    /**
+     * LP synthesis filter.
+     * @param[out] out pointer to output buffer
+     *        - the array out[-filter_length, -1] must
+     *        contain the previous result of this filter
+     * @param filter_coeffs filter coefficients.
+     * @param in input signal
+     * @param buffer_length amount of data to process
+     * @param filter_length filter length (10 for 10th order LP filter). Must be
+     *                      greater than 4 and even.
+     *
+     * @note Output buffer must contain filter_length samples of past
+     *       speech data before pointer.
+     *
+     * Routine applies 1/A(z) filter to given speech data.
+     */
+    void (*celp_lp_synthesis_filterf)(float *out, const float *filter_coeffs,
+                                      const float *in, int buffer_length,
+                                      int filter_length);
+
+    /**
+     * LP zero synthesis filter.
+     * @param[out] out pointer to output buffer
+     * @param filter_coeffs filter coefficients.
+     * @param in input signal
+     *        - the array in[-filter_length, -1] must
+     *        contain the previous input of this filter
+     * @param buffer_length amount of data to process (should be a multiple of eight)
+     * @param filter_length filter length (10 for 10th order LP filter;
+     *                                      should be a multiple of two)
+     *
+     * @note Output buffer must contain filter_length samples of past
+     *       speech data before pointer.
+     *
+     * Routine applies A(z) filter to given speech data.
+     */
+    void (*celp_lp_zero_synthesis_filterf)(float *out, const float *filter_coeffs,
+                                           const float *in, int buffer_length,
+                                           int filter_length);
+
+}CELPFContext;
+
+/**
+ * Initialize CELPFContext.
+ */
+void ff_celp_filter_init(CELPFContext *c);
+void ff_celp_filter_init_mips(CELPFContext *c);
+
 /**
  * Circularly convolve fixed vector with a phase dispersion impulse
  *        response filter (D.6.2 of G.729 and 6.1.5 of AMR).
diff --git a/libavcodec/celp_math.c b/libavcodec/celp_math.c
index 8a788f5..a96b1ae 100644
--- a/libavcodec/celp_math.c
+++ b/libavcodec/celp_math.c
@@ -3,31 +3,30 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 #include <limits.h>
-#include <assert.h>
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
-#include "celp_math.h"
 #include "mathops.h"
-
+#include "celp_math.h"
 #include "libavutil/common.h"
 
 static const uint16_t exp2a[]=
@@ -50,7 +49,7 @@ int ff_exp2(uint16_t power)
 {
     unsigned int result= exp2a[power>>10] + 0x10000;
 
-    assert(power <= 0x7fff);
+    av_assert2(power <= 0x7fff);
 
     result= (result<<3) + ((result*exp2b[(power>>5)&31])>>17);
     return result + ((result*(power&31)*89)>>22);
@@ -63,10 +62,17 @@ int ff_exp2(uint16_t power)
  */
 static const uint16_t tab_log2[33] =
 {
+#ifdef G729_BITEXACT
+      0,   1455,   2866,   4236,   5568,   6863,   8124,   9352,
+  10549,  11716,  12855,  13967,  15054,  16117,  17156,  18172,
+  19167,  20142,  21097,  22033,  22951,  23852,  24735,  25603,
+  26455,  27291,  28113,  28922,  29716,  30497,  31266,  32023,  32767,
+#else
       4,   1459,   2870,   4240,   5572,   6867,   8127,   9355,
   10552,  11719,  12858,  13971,  15057,  16120,  17158,  18175,
   19170,  20145,  21100,  22036,  22954,  23854,  24738,  25605,
   26457,  27294,  28116,  28924,  29719,  30500,  31269,  32025,  32769,
+#endif
 };
 
 int ff_log2_q15(uint32_t value)
@@ -99,3 +105,22 @@ int64_t ff_dot_product(const int16_t *a, const int16_t *b, int length)
 
     return sum;
 }
+
+float ff_dot_productf(const float* a, const float* b, int length)
+{
+    float sum = 0;
+    int i;
+
+    for(i=0; i<length; i++)
+        sum += a[i] * b[i];
+
+    return sum;
+}
+
+void ff_celp_math_init(CELPMContext *c)
+{
+    c->dot_productf   = ff_dot_productf;
+
+    if(HAVE_MIPSFPU)
+        ff_celp_math_init_mips(c);
+}
diff --git a/libavcodec/celp_math.h b/libavcodec/celp_math.h
index 9cebdfe..18888a4 100644
--- a/libavcodec/celp_math.h
+++ b/libavcodec/celp_math.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,25 @@
 
 #include <stdint.h>
 
+typedef struct CELPMContext {
+    /**
+     * Return the dot product.
+     * @param a input data array
+     * @param b input data array
+     * @param length number of elements
+     *
+     * @return dot product = sum of elementwise products
+     */
+    float (*dot_productf)(const float* a, const float* b, int length);
+
+}CELPMContext;
+
+/**
+ * Initialize CELPMContext.
+ */
+void ff_celp_math_init(CELPMContext *c);
+void ff_celp_math_init_mips(CELPMContext *c);
+
 /**
  * fixed-point implementation of exp2(x) in [0; 1] domain.
  * @param power argument to exp2, 0 <= power <= 0x7fff
@@ -65,4 +84,14 @@ static inline int bidir_sal(int value, int offset)
     else           return value <<  offset;
 }
 
+/**
+ * Return the dot product.
+ * @param a input data array
+ * @param b input data array
+ * @param length number of elements
+ *
+ * @return dot product = sum of elementwise products
+ */
+float ff_dot_productf(const float* a, const float* b, int length);
+
 #endif /* AVCODEC_CELP_MATH_H */
diff --git a/libavcodec/cfhd.c b/libavcodec/cfhd.c
index e122deb..846d334 100644
--- a/libavcodec/cfhd.c
+++ b/libavcodec/cfhd.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015-2016 Kieran Kunhya <kieran@kunhya.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,12 +31,15 @@
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "thread.h"
 #include "cfhd.h"
 
+#define ALPHA_COMPAND_DC_OFFSET 256
+#define ALPHA_COMPAND_GAIN 9400
+
 enum CFHDParam {
     ChannelCount     =  12,
     SubbandCount     =  14,
@@ -46,20 +49,21 @@ enum CFHDParam {
     SubbandNumber    =  48,
     Quantization     =  53,
     ChannelNumber    =  62,
+    SampleFlags      =  68,
     BitsPerComponent = 101,
     ChannelWidth     = 104,
     ChannelHeight    = 105,
     PrescaleShift    = 109,
 };
 
+
+
 static av_cold int cfhd_init(AVCodecContext *avctx)
 {
     CFHDContext *s = avctx->priv_data;
 
-    memset(s, 0, sizeof(*s));
-
-    s->avctx                   = avctx;
     avctx->bits_per_raw_sample = 10;
+    s->avctx                   = avctx;
 
     return ff_cfhd_init_vlcs(s);
 }
@@ -71,9 +75,15 @@ static void init_plane_defaults(CFHDContext *s)
     s->subband_num_actual = 0;
 }
 
+static void init_peak_table_defaults(CFHDContext *s)
+{
+    s->peak.level  = 0;
+    s->peak.offset = 0;
+    memset(&s->peak.base, 0, sizeof(s->peak.base));
+}
+
 static void init_frame_defaults(CFHDContext *s)
 {
-    s->coded_format      = AV_PIX_FMT_YUV422P10;
     s->coded_width       = 0;
     s->coded_height      = 0;
     s->cropped_height    = 0;
@@ -83,21 +93,61 @@ static void init_frame_defaults(CFHDContext *s)
     s->channel_num       = 0;
     s->lowpass_precision = 16;
     s->quantisation      = 1;
-    s->prescale_shift[0] = 0;
-    s->prescale_shift[1] = 0;
-    s->prescale_shift[2] = 0;
     s->wavelet_depth     = 3;
     s->pshift            = 1;
     s->codebook          = 0;
+    s->difference_coding = 0;
+    s->progressive       = 0;
     init_plane_defaults(s);
+    init_peak_table_defaults(s);
 }
 
 /* TODO: merge with VLC tables or use LUT */
-static inline int dequant_and_decompand(int level, int quantisation)
+static inline int dequant_and_decompand(int level, int quantisation, int codebook)
+{
+    if (codebook == 0 || codebook == 1) {
+        int64_t abslevel = abs(level);
+        if (level < 264)
+            return (abslevel + ((768 * abslevel * abslevel * abslevel) / (255 * 255 * 255))) *
+               FFSIGN(level) * quantisation;
+        else
+            return level * quantisation;
+    } else
+        return level * quantisation;
+}
+
+static inline void difference_coding(int16_t *band, int width, int height)
 {
-    int64_t abslevel = abs(level);
-    return (abslevel + ((768 * abslevel * abslevel * abslevel) / (255 * 255 * 255))) *
-           FFSIGN(level) * quantisation;
+
+    int i,j;
+    for (i = 0; i < height; i++) {
+        for (j = 1; j < width; j++) {
+          band[j] += band[j-1];
+        }
+        band += width;
+    }
+}
+
+static inline void peak_table(int16_t *band, Peak *peak, int length)
+{
+    int i;
+    for (i = 0; i < length; i++)
+        if (abs(band[i]) > peak->level)
+            band[i] = bytestream2_get_le16(&peak->base);
+}
+
+static inline void process_alpha(int16_t *alpha, int width)
+{
+    int i, channel;
+    for (i = 0; i < width; i++) {
+        channel   = alpha[i];
+        channel  -= ALPHA_COMPAND_DC_OFFSET;
+        channel <<= 3;
+        channel  *= ALPHA_COMPAND_GAIN;
+        channel >>= 16;
+        channel   = av_clip_uintp2(channel, 12);
+        alpha[i]  = channel;
+    }
 }
 
 static inline void filter(int16_t *output, ptrdiff_t out_stride,
@@ -110,33 +160,51 @@ static inline void filter(int16_t *output, ptrdiff_t out_stride,
 
     for (i = 0; i < len; i++) {
         if (i == 0) {
-            tmp = (11 * low[0 * low_stride] - 4 * low[1 * low_stride] + low[2 * low_stride] + 4) >> 3;
-            output[(2 * i + 0) * out_stride] = (tmp + high[0 * high_stride]) >> 1;
-        } else if (i == len - 1) {
-            tmp = (5 * low[i * low_stride] + 4 * low[(i - 1) * low_stride] - low[(i - 2) * low_stride] + 4) >> 3;
-            output[(2 * i + 0) * out_stride] = (tmp + high[i * high_stride]) >> 1;
+            tmp = (11*low[0*low_stride] - 4*low[1*low_stride] + low[2*low_stride] + 4) >> 3;
+            output[(2*i+0)*out_stride] = (tmp + high[0*high_stride]) >> 1;
+            if (clip)
+                output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
+
+            tmp = ( 5*low[0*low_stride] + 4*low[1*low_stride] - low[2*low_stride] + 4) >> 3;
+            output[(2*i+1)*out_stride] = (tmp - high[0*high_stride]) >> 1;
+            if (clip)
+                output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
+        } else if (i == len-1) {
+            tmp = ( 5*low[i*low_stride] + 4*low[(i-1)*low_stride] - low[(i-2)*low_stride] + 4) >> 3;
+            output[(2*i+0)*out_stride] = (tmp + high[i*high_stride]) >> 1;
+            if (clip)
+                output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
+
+            tmp = (11*low[i*low_stride] - 4*low[(i-1)*low_stride] + low[(i-2)*low_stride] + 4) >> 3;
+            output[(2*i+1)*out_stride] = (tmp - high[i*high_stride]) >> 1;
+            if (clip)
+                output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
         } else {
-            tmp = (low[(i - 1) * low_stride] - low[(i + 1) * low_stride] + 4) >> 3;
-            output[(2 * i + 0) * out_stride] = (tmp + low[i * low_stride] + high[i * high_stride]) >> 1;
+            tmp = (low[(i-1)*low_stride] - low[(i+1)*low_stride] + 4) >> 3;
+            output[(2*i+0)*out_stride] = (tmp + low[i*low_stride] + high[i*high_stride]) >> 1;
+            if (clip)
+                output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
+
+            tmp = (low[(i+1)*low_stride] - low[(i-1)*low_stride] + 4) >> 3;
+            output[(2*i+1)*out_stride] = (tmp + low[i*low_stride] - high[i*high_stride]) >> 1;
+            if (clip)
+                output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
         }
-        if (clip)
-            output[(2 * i + 0) * out_stride] = av_clip_uintp2_c(output[(2 * i + 0) * out_stride], clip);
-
-        if (i == 0) {
-            tmp = (5 * low[0 * low_stride] + 4 * low[1 * low_stride] - low[2 * low_stride] + 4) >> 3;
-            output[(2 * i + 1) * out_stride] = (tmp - high[0 * high_stride]) >> 1;
-        } else if (i == len - 1) {
-            tmp = (11 * low[i * low_stride] - 4 * low[(i - 1) * low_stride] + low[(i - 2) * low_stride] + 4) >> 3;
-            output[(2 * i + 1) * out_stride] = (tmp - high[i * high_stride]) >> 1;
-        } else {
-            tmp = (low[(i + 1) * low_stride] - low[(i - 1) * low_stride] + 4) >> 3;
-            output[(2 * i + 1) * out_stride] = (tmp + low[i * low_stride] - high[i * high_stride]) >> 1;
-        }
-        if (clip)
-            output[(2 * i + 1) * out_stride] = av_clip_uintp2_c(output[(2 * i + 1) * out_stride], clip);
     }
 }
 
+static inline void interlaced_vertical_filter(int16_t *output, int16_t *low, int16_t *high,
+                         int width, int linesize, int plane)
+{
+    int i;
+    int16_t even, odd;
+    for (i = 0; i < width; i++) {
+        even = (low[i] - high[i])/2;
+        odd  = (low[i] + high[i])/2;
+        output[i]            = av_clip_uintp2(even, 10);
+        output[i + linesize] = av_clip_uintp2(odd, 10);
+    }
+}
 static void horiz_filter(int16_t *output, int16_t *low, int16_t *high,
                          int width)
 {
@@ -158,22 +226,33 @@ static void vert_filter(int16_t *output, ptrdiff_t out_stride,
 
 static void free_buffers(CFHDContext *s)
 {
-    unsigned i;
+    int i, j;
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->plane); i++) {
         av_freep(&s->plane[i].idwt_buf);
         av_freep(&s->plane[i].idwt_tmp);
+
+        for (j = 0; j < 9; j++)
+            s->plane[i].subband[j] = NULL;
+
+        for (j = 0; j < 8; j++)
+            s->plane[i].l_h[j] = NULL;
     }
     s->a_height = 0;
     s->a_width  = 0;
 }
 
-static int alloc_buffers(CFHDContext *s)
+static int alloc_buffers(AVCodecContext *avctx)
 {
+    CFHDContext *s = avctx->priv_data;
     int i, j, ret, planes;
     int chroma_x_shift, chroma_y_shift;
     unsigned k;
 
+    if ((ret = ff_set_dimensions(avctx, s->coded_width, s->coded_height)) < 0)
+        return ret;
+    avctx->pix_fmt = s->coded_format;
+
     if ((ret = av_pix_fmt_get_chroma_sub_sample(s->coded_format,
                                                 &chroma_x_shift,
                                                 &chroma_y_shift)) < 0)
@@ -182,23 +261,24 @@ static int alloc_buffers(CFHDContext *s)
 
     for (i = 0; i < planes; i++) {
         int w8, h8, w4, h4, w2, h2;
-        int width  = i ? s->coded_width  >> chroma_x_shift : s->coded_width;
-        int height = i ? s->coded_height >> chroma_y_shift : s->coded_height;
+        int width  = i ? avctx->width  >> chroma_x_shift : avctx->width;
+        int height = i ? avctx->height >> chroma_y_shift : avctx->height;
         ptrdiff_t stride = FFALIGN(width  / 8, 8) * 8;
-        height           = FFALIGN(height / 8, 2) * 8;
+        if (chroma_y_shift)
+            height = FFALIGN(height / 8, 2) * 8;
         s->plane[i].width  = width;
         s->plane[i].height = height;
         s->plane[i].stride = stride;
 
         w8 = FFALIGN(s->plane[i].width  / 8, 8);
-        h8 = FFALIGN(s->plane[i].height / 8, 2);
+        h8 = height / 8;
         w4 = w8 * 2;
         h4 = h8 * 2;
         w2 = w4 * 2;
         h2 = h4 * 2;
 
         s->plane[i].idwt_buf =
-            av_malloc_array(height * stride, sizeof(*s->plane[i].idwt_buf));
+            av_mallocz_array(height * stride, sizeof(*s->plane[i].idwt_buf));
         s->plane[i].idwt_tmp =
             av_malloc_array(height * stride, sizeof(*s->plane[i].idwt_tmp));
         if (!s->plane[i].idwt_buf || !s->plane[i].idwt_tmp)
@@ -240,386 +320,462 @@ static int alloc_buffers(CFHDContext *s)
     return 0;
 }
 
-static int parse_tag(CFHDContext *s, GetByteContext *gb,
-                     int16_t *tag_, uint16_t *value, int *planes)
+static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
+                       AVPacket *avpkt)
 {
-    /* Bit weird but implement the tag parsing as the spec says */
-    uint16_t tagu   = bytestream2_get_be16(gb);
-    int16_t tag     = tagu;
-    int8_t tag8     = tagu >> 8;
-    uint16_t abstag = abs(tag);
-    int8_t abs_tag8 = abs(tag8);
-    uint16_t data   = bytestream2_get_be16(gb);
-    *tag_ = tag;
-    *value = data;
-
-    if (abs_tag8 >= 0x60 && abs_tag8 <= 0x6F) {
-        av_log(s->avctx, AV_LOG_DEBUG, "large len %"PRIX16"\n",
-               ((tagu & 0xFF) << 16) | data);
-        return 0;
-    } else if (abstag >= 0x4000 && abstag <= 0x40FF) {
-        av_log(s->avctx, AV_LOG_DEBUG, "Small chunk length %"PRIu16" %s\n",
-               data * 4, tag < 0 ? "optional" : "required");
-        bytestream2_skipu(gb, data * 4);
-        return 0;
-    }
+    CFHDContext *s = avctx->priv_data;
+    GetByteContext gb;
+    ThreadFrame frame = { .f = data };
+    AVFrame *pic = data;
+    int ret = 0, i, j, planes, plane, got_buffer = 0;
+    int16_t *coeff_data;
 
-    switch (tag) {
-    case 1:
-        av_log(s->avctx, AV_LOG_DEBUG, "Sample type? %"PRIu16"\n", data);
-        break;
-    case 2:
-    {
-        int i;
-        av_log(s->avctx, AV_LOG_DEBUG,
-               "tag=2 header - skipping %"PRIu16" tag/value pairs\n", data);
-        if (data > bytestream2_get_bytes_left(gb) / 4) {
-            av_log(s->avctx, AV_LOG_ERROR,
-                   "Too many tag/value pairs (%"PRIu16")\n", data);
-            return AVERROR_INVALIDDATA;
-        }
-        for (i = 0; i < data; i++) {
-            uint16_t tag2 = bytestream2_get_be16(gb);
-            uint16_t val2 = bytestream2_get_be16(gb);
-            av_log(s->avctx, AV_LOG_DEBUG, "Tag/Value = %"PRIX16" %"PRIX16"\n",
-                   tag2, val2);
-        }
-        break;
-    }
-    case 10:
-        if (data != 0) {
-            avpriv_report_missing_feature(s->avctx, "Transform type %"PRIu16, data);
-            return AVERROR_PATCHWELCOME;
-        }
-        av_log(s->avctx, AV_LOG_DEBUG, "Transform-type? %"PRIu16"\n", data);
-        break;
-    case ChannelCount:
-        av_log(s->avctx, AV_LOG_DEBUG, "Channel count: %"PRIu16"\n", data);
-        if (data > 4) {
-            avpriv_report_missing_feature(s->avctx, "Channel count %"PRIu16, data);
-            return AVERROR_PATCHWELCOME;
-        }
-        s->channel_cnt = data;
-        break;
-    case SubbandCount:
-        av_log(s->avctx, AV_LOG_DEBUG, "Subband count: %"PRIu16"\n", data);
-        if (data != SUBBAND_COUNT) {
-            avpriv_report_missing_feature(s->avctx, "Subband count %"PRIu16, data);
-            return AVERROR_PATCHWELCOME;
-        }
-        break;
-    case ImageWidth:
-        av_log(s->avctx, AV_LOG_DEBUG, "Width %"PRIu16"\n", data);
-        s->coded_width = data;
-        break;
-    case ImageHeight:
-        av_log(s->avctx, AV_LOG_DEBUG, "Height %"PRIu16"\n", data);
-        s->coded_height = data;
-        break;
-    case 23:
-        avpriv_report_missing_feature(s->avctx, "Skip frame");
-        return AVERROR_PATCHWELCOME;
-    case 27:
-        av_log(s->avctx, AV_LOG_DEBUG, "Lowpass width %"PRIu16"\n", data);
-        if (data < 2 || data > s->plane[s->channel_num].band[0][0].a_width) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid lowpass width\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->plane[s->channel_num].band[0][0].width  = data;
-        s->plane[s->channel_num].band[0][0].stride = data;
-        break;
-    case 28:
-        av_log(s->avctx, AV_LOG_DEBUG, "Lowpass height %"PRIu16"\n", data);
-        if (data < 2 || data > s->plane[s->channel_num].band[0][0].a_height) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid lowpass height\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->plane[s->channel_num].band[0][0].height = data;
-        break;
-    case LowpassPrecision:
-        av_log(s->avctx, AV_LOG_DEBUG, "Lowpass precision bits: %"PRIu16"\n", data);
-        break;
-    case 41:
-    case 49:
-        av_log(s->avctx, AV_LOG_DEBUG,
-               "Highpass width%s %"PRIu16" channel %i level %i subband %i\n",
-               tag == 49 ? "2" : "", data,
-               s->channel_num, s->level, s->subband_num);
-        if (data < 2) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid highpass width%s\n", tag == 49 ? "2" : "");
-            return AVERROR_INVALIDDATA;
-        }
-        s->plane[s->channel_num].band[s->level][s->subband_num].width  = data;
-        s->plane[s->channel_num].band[s->level][s->subband_num].stride = FFALIGN(data, 8);
-        break;
-    case 42:
-    case 50:
-        av_log(s->avctx, AV_LOG_DEBUG, "Highpass height%s %"PRIu16"\n", tag == 50 ? "2" : "", data);
-        if (data < 2) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid highpass height%s\n", tag == 50 ? "2" : "");
-            return AVERROR_INVALIDDATA;
-        }
-        s->plane[s->channel_num].band[s->level][s->subband_num].height = data;
-        break;
-    case SubbandNumber:
-        av_log(s->avctx, AV_LOG_DEBUG, "Subband number %"PRIu16"\n", data);
-        if (data > 3) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid subband number\n");
-            return AVERROR_INVALIDDATA;
-        }
-        if (s->subband_num != 0 && data == 1) {
-            if (s->level + 1 >= DWT_LEVELS) {
-                av_log(s->avctx, AV_LOG_ERROR, "Invalid level\n");
-                return AVERROR_INVALIDDATA;
-            }
+    s->coded_format = AV_PIX_FMT_YUV422P10;
+    init_frame_defaults(s);
+    planes = av_pix_fmt_count_planes(s->coded_format);
 
-            s->level++;
-        }
-        s->subband_num = data;
-        break;
-    case 51:
-        av_log(s->avctx, AV_LOG_DEBUG, "Subband number actual %"PRIu16"\n", data);
-        if (data >= SUBBAND_COUNT) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid subband number actual\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->subband_num_actual = data;
-        break;
-    case Quantization:
-        s->quantisation = data;
-        av_log(s->avctx, AV_LOG_DEBUG, "Quantisation: %"PRIu16"\n", data);
-        break;
-    case ChannelNumber:
-        av_log(s->avctx, AV_LOG_DEBUG, "Channel number %"PRIu16"\n", data);
-        if (data >= *planes) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid channel number\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->channel_num = data;
-        init_plane_defaults(s);
-        break;
-    case 70:
-        av_log(s->avctx, AV_LOG_DEBUG,
-               "Subsampling or bit-depth flag? %"PRIu16"\n", data);
-        if (!(data == 10 || data == 12)) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid bits per channel\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->bpc = data;
-        break;
-    case 71:
-        s->codebook = data;
-        av_log(s->avctx, AV_LOG_DEBUG, "Codebook %i\n", s->codebook);
-        break;
-    case 72:
-        s->codebook = data;
-        av_log(s->avctx, AV_LOG_DEBUG, "Other codebook? %i\n", s->codebook);
-        break;
-    case 84:
-        av_log(s->avctx, AV_LOG_DEBUG, "Sample format? %"PRIu16"\n", data);
-        switch (data) {
-        case 1:
-            s->coded_format = AV_PIX_FMT_YUV422P10;
-            break;
-        case 3:
-            s->coded_format = AV_PIX_FMT_GBRP12;
-            break;
-        case 4:
-            s->coded_format = AV_PIX_FMT_GBRAP12;
-            break;
-        default:
-            avpriv_report_missing_feature(s->avctx, "Sample format %"PRIu16, data);
-            return AVERROR_PATCHWELCOME;
-        }
-        *planes = av_pix_fmt_count_planes(s->coded_format);
-        break;
-    case -85:
-        av_log(s->avctx, AV_LOG_DEBUG, "Cropped height %"PRIu16"\n", data);
-        s->cropped_height = data;
-        break;
-    case 101:
-        av_log(s->avctx, AV_LOG_DEBUG, "Bits per component: %"PRIu16"\n", data);
-        s->bpc = data;
-        break;
-    case PrescaleShift:
-        s->prescale_shift[0] = (data >> 0) & 0x7;
-        s->prescale_shift[1] = (data >> 3) & 0x7;
-        s->prescale_shift[2] = (data >> 6) & 0x7;
-        av_log(s->avctx, AV_LOG_DEBUG, "Prescale shift (VC-5): %"PRIX16"\n", data);
-        break;
-    default:
-        av_log(s->avctx, AV_LOG_DEBUG, "Unknown tag %"PRIu16" data %"PRIX16"\n",
-               tag, data);
-    }
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-    return 0;
-}
+    while (bytestream2_get_bytes_left(&gb) > 4) {
+        /* Bit weird but implement the tag parsing as the spec says */
+        uint16_t tagu   = bytestream2_get_be16(&gb);
+        int16_t tag     = (int16_t)tagu;
+        int8_t tag8     = (int8_t)(tagu >> 8);
+        uint16_t abstag = abs(tag);
+        int8_t abs_tag8 = abs(tag8);
+        uint16_t data   = bytestream2_get_be16(&gb);
+        if (abs_tag8 >= 0x60 && abs_tag8 <= 0x6f) {
+            av_log(avctx, AV_LOG_DEBUG, "large len %x\n", ((tagu & 0xff) << 16) | data);
+        } else if (tag == SampleFlags) {
+            av_log(avctx, AV_LOG_DEBUG, "Progressive?%"PRIu16"\n", data);
+            s->progressive = data & 0x0001;
+        } else if (tag == ImageWidth) {
+            av_log(avctx, AV_LOG_DEBUG, "Width %"PRIu16"\n", data);
+            s->coded_width = data;
+        } else if (tag == ImageHeight) {
+            av_log(avctx, AV_LOG_DEBUG, "Height %"PRIu16"\n", data);
+            s->coded_height = data;
+        } else if (tag == 101) {
+            av_log(avctx, AV_LOG_DEBUG, "Bits per component: %"PRIu16"\n", data);
+            if (data < 1 || data > 31) {
+                av_log(avctx, AV_LOG_ERROR, "Bits per component %d is invalid\n", data);
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->bpc = data;
+        } else if (tag == ChannelCount) {
+            av_log(avctx, AV_LOG_DEBUG, "Channel Count: %"PRIu16"\n", data);
+            s->channel_cnt = data;
+            if (data > 4) {
+                av_log(avctx, AV_LOG_ERROR, "Channel Count of %"PRIu16" is unsupported\n", data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+        } else if (tag == SubbandCount) {
+            av_log(avctx, AV_LOG_DEBUG, "Subband Count: %"PRIu16"\n", data);
+            if (data != SUBBAND_COUNT) {
+                av_log(avctx, AV_LOG_ERROR, "Subband Count of %"PRIu16" is unsupported\n", data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+        } else if (tag == ChannelNumber) {
+            s->channel_num = data;
+            av_log(avctx, AV_LOG_DEBUG, "Channel number %"PRIu16"\n", data);
+            if (s->channel_num >= planes) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid channel number\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            init_plane_defaults(s);
+        } else if (tag == SubbandNumber) {
+            if (s->subband_num != 0 && data == 1)  // hack
+                s->level++;
+            av_log(avctx, AV_LOG_DEBUG, "Subband number %"PRIu16"\n", data);
+            s->subband_num = data;
+            if (s->level >= DWT_LEVELS) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid level\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            if (s->subband_num > 3) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid subband number\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 51) {
+            av_log(avctx, AV_LOG_DEBUG, "Subband number actual %"PRIu16"\n", data);
+            s->subband_num_actual = data;
+            if (s->subband_num_actual >= 10) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid subband number actual\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == LowpassPrecision)
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass precision bits: %"PRIu16"\n", data);
+        else if (tag == Quantization) {
+            s->quantisation = data;
+            av_log(avctx, AV_LOG_DEBUG, "Quantisation: %"PRIu16"\n", data);
+        } else if (tag == PrescaleShift) {
+            s->prescale_shift[0] = (data >> 0) & 0x7;
+            s->prescale_shift[1] = (data >> 3) & 0x7;
+            s->prescale_shift[2] = (data >> 6) & 0x7;
+            av_log(avctx, AV_LOG_DEBUG, "Prescale shift (VC-5): %x\n", data);
+        } else if (tag == 27) {
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass width %"PRIu16"\n", data);
+            if (data < 3 || data > s->plane[s->channel_num].band[0][0].a_width) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid lowpass width\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->plane[s->channel_num].band[0][0].width  = data;
+            s->plane[s->channel_num].band[0][0].stride = data;
+        } else if (tag == 28) {
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass height %"PRIu16"\n", data);
+            if (data < 3 || data > s->plane[s->channel_num].band[0][0].a_height) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid lowpass height\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->plane[s->channel_num].band[0][0].height = data;
+        } else if (tag == 1)
+            av_log(avctx, AV_LOG_DEBUG, "Sample type? %"PRIu16"\n", data);
+        else if (tag == 10) {
+            if (data != 0) {
+                avpriv_report_missing_feature(avctx, "Transform type of %"PRIu16, data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+            av_log(avctx, AV_LOG_DEBUG, "Transform-type? %"PRIu16"\n", data);
+        } else if (abstag >= 0x4000 && abstag <= 0x40ff) {
+            if (abstag == 0x4001)
+                s->peak.level = 0;
+            av_log(avctx, AV_LOG_DEBUG, "Small chunk length %d %s\n", data * 4, tag < 0 ? "optional" : "required");
+            bytestream2_skipu(&gb, data * 4);
+        } else if (tag == 23) {
+            av_log(avctx, AV_LOG_DEBUG, "Skip frame\n");
+            avpriv_report_missing_feature(avctx, "Skip frame");
+            ret = AVERROR_PATCHWELCOME;
+            break;
+        } else if (tag == 2) {
+            av_log(avctx, AV_LOG_DEBUG, "tag=2 header - skipping %i tag/value pairs\n", data);
+            if (data > bytestream2_get_bytes_left(&gb) / 4) {
+                av_log(avctx, AV_LOG_ERROR, "too many tag/value pairs (%d)\n", data);
+                ret = AVERROR_INVALIDDATA;
+                break;
+            }
+            for (i = 0; i < data; i++) {
+                uint16_t tag2 = bytestream2_get_be16(&gb);
+                uint16_t val2 = bytestream2_get_be16(&gb);
+                av_log(avctx, AV_LOG_DEBUG, "Tag/Value = %x %x\n", tag2, val2);
+            }
+        } else if (tag == 41) {
+            av_log(avctx, AV_LOG_DEBUG, "Highpass width %i channel %i level %i subband %i\n", data, s->channel_num, s->level, s->subband_num);
+            if (data < 3) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass width\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->plane[s->channel_num].band[s->level][s->subband_num].width  = data;
+            s->plane[s->channel_num].band[s->level][s->subband_num].stride = FFALIGN(data, 8);
+        } else if (tag == 42) {
+            av_log(avctx, AV_LOG_DEBUG, "Highpass height %i\n", data);
+            if (data < 3) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass height\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->plane[s->channel_num].band[s->level][s->subband_num].height = data;
+        } else if (tag == 49) {
+            av_log(avctx, AV_LOG_DEBUG, "Highpass width2 %i\n", data);
+            if (data < 3) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass width2\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->plane[s->channel_num].band[s->level][s->subband_num].width  = data;
+            s->plane[s->channel_num].band[s->level][s->subband_num].stride = FFALIGN(data, 8);
+        } else if (tag == 50) {
+            av_log(avctx, AV_LOG_DEBUG, "Highpass height2 %i\n", data);
+            if (data < 3) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass height2\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->plane[s->channel_num].band[s->level][s->subband_num].height = data;
+        } else if (tag == 71) {
+            s->codebook = data;
+            av_log(avctx, AV_LOG_DEBUG, "Codebook %i\n", s->codebook);
+        } else if (tag == 72) {
+            s->codebook = data & 0xf;
+            s->difference_coding = (data >> 4) & 1;
+            av_log(avctx, AV_LOG_DEBUG, "Other codebook? %i\n", s->codebook);
+        } else if (tag == 70) {
+            av_log(avctx, AV_LOG_DEBUG, "Subsampling or bit-depth flag? %i\n", data);
+            if (!(data == 10 || data == 12)) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid bits per channel\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            s->bpc = data;
+        } else if (tag == 84) {
+            av_log(avctx, AV_LOG_DEBUG, "Sample format? %i\n", data);
+            if (data == 1)
+                s->coded_format = AV_PIX_FMT_YUV422P10;
+            else if (data == 3)
+                s->coded_format = AV_PIX_FMT_GBRP12;
+            else if (data == 4)
+                s->coded_format = AV_PIX_FMT_GBRAP12;
+            else {
+                avpriv_report_missing_feature(avctx, "Sample format of %"PRIu16, data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+            planes = av_pix_fmt_count_planes(s->coded_format);
+        } else if (tag == -85) {
+            av_log(avctx, AV_LOG_DEBUG, "Cropped height %"PRIu16"\n", data);
+            s->cropped_height = data;
+        } else if (tag == -75) {
+            s->peak.offset &= ~0xffff;
+            s->peak.offset |= (data & 0xffff);
+            s->peak.base    = gb;
+            s->peak.level   = 0;
+        } else if (tag == -76) {
+            s->peak.offset &= 0xffff;
+            s->peak.offset |= (data & 0xffffU)<<16;
+            s->peak.base    = gb;
+            s->peak.level   = 0;
+        } else if (tag == -74 && s->peak.offset) {
+            s->peak.level = data;
+            bytestream2_seek(&s->peak.base, s->peak.offset - 4, SEEK_CUR);
+        } else
+            av_log(avctx, AV_LOG_DEBUG,  "Unknown tag %i data %x\n", tag, data);
 
-static int read_lowpass_coeffs(CFHDContext *s, GetByteContext *gb,
-                               int16_t *coeff_data)
-{
-    int i, j;
-    int lowpass_height   = s->plane[s->channel_num].band[0][0].height;
-    int lowpass_width    = s->plane[s->channel_num].band[0][0].width;
-    int lowpass_a_height = s->plane[s->channel_num].band[0][0].a_height;
-    int lowpass_a_width  = s->plane[s->channel_num].band[0][0].a_width;
-
-    if (lowpass_height > lowpass_a_height ||
-        lowpass_width  > lowpass_a_width  ||
-        lowpass_a_width * lowpass_a_height * sizeof(*coeff_data) > bytestream2_get_bytes_left(gb)) {
-        av_log(s->avctx, AV_LOG_ERROR, "Too many lowpass coefficients\n");
-        return AVERROR_INVALIDDATA;
-    }
+        /* Some kind of end of header tag */
+        if (tag == 4 && data == 0x1a4a && s->coded_width && s->coded_height &&
+            s->coded_format != AV_PIX_FMT_NONE) {
+            if (s->a_width != s->coded_width || s->a_height != s->coded_height ||
+                s->a_format != s->coded_format) {
+                free_buffers(s);
+                if ((ret = alloc_buffers(avctx)) < 0) {
+                    free_buffers(s);
+                    return ret;
+                }
+            }
+            ret = ff_set_dimensions(avctx, s->coded_width, s->coded_height);
+            if (ret < 0)
+                return ret;
+            if (s->cropped_height)
+                avctx->height = s->cropped_height;
+            frame.f->width =
+            frame.f->height = 0;
 
-    av_log(s->avctx, AV_LOG_DEBUG,
-           "Start of lowpass coeffs component %d height:%d, width:%d\n",
-           s->channel_num, lowpass_height, lowpass_width);
-    for (i = 0; i < lowpass_height; i++) {
-        for (j = 0; j < lowpass_width; j++)
-            coeff_data[j] = bytestream2_get_be16u(gb);
+            if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+                return ret;
 
-        coeff_data += lowpass_width;
-    }
+            s->coded_width = 0;
+            s->coded_height = 0;
+            s->coded_format = AV_PIX_FMT_NONE;
+            got_buffer = 1;
+        }
+        coeff_data = s->plane[s->channel_num].subband[s->subband_num_actual];
 
-    /* Align to mod-4 position to continue reading tags */
-    bytestream2_seek(gb, bytestream2_tell(gb) & 3, SEEK_CUR);
+        /* Lowpass coefficients */
+        if (tag == 4 && data == 0xf0f && s->a_width && s->a_height) {
+            int lowpass_height = s->plane[s->channel_num].band[0][0].height;
+            int lowpass_width  = s->plane[s->channel_num].band[0][0].width;
+            int lowpass_a_height = s->plane[s->channel_num].band[0][0].a_height;
+            int lowpass_a_width  = s->plane[s->channel_num].band[0][0].a_width;
+
+            if (!got_buffer) {
+                av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
 
-    /* Copy last coefficient line if height is odd. */
-    if (lowpass_height & 1) {
-        int16_t *last_line = &coeff_data[lowpass_height * lowpass_width];
-        memcpy(last_line, &last_line[-lowpass_width],
-               lowpass_width * sizeof(*coeff_data));
-    }
+            if (lowpass_height > lowpass_a_height || lowpass_width > lowpass_a_width ||
+                lowpass_a_width * lowpass_a_height * sizeof(int16_t) > bytestream2_get_bytes_left(&gb)) {
+                av_log(avctx, AV_LOG_ERROR, "Too many lowpass coefficients\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
 
-    av_log(s->avctx, AV_LOG_DEBUG, "Lowpass coefficients %i\n",
-           lowpass_width * lowpass_height);
+            av_log(avctx, AV_LOG_DEBUG, "Start of lowpass coeffs component %d height:%d, width:%d\n", s->channel_num, lowpass_height, lowpass_width);
+            for (i = 0; i < lowpass_height; i++) {
+                for (j = 0; j < lowpass_width; j++)
+                    coeff_data[j] = bytestream2_get_be16u(&gb);
 
-    return 0;
-}
+                coeff_data += lowpass_width;
+            }
 
-#define DECODE_SUBBAND_COEFFS(TABLE, COND)                              \
-    while (1) {                                                         \
-        int level, run, coeff;                                          \
-        BITSTREAM_RL_VLC(level, run, &s->bc, s->TABLE, VLC_BITS, 3);    \
-                                                                        \
-        /* escape */                                                    \
-        if (COND)                                                       \
-            break;                                                      \
-                                                                        \
-        count += run;                                                   \
-                                                                        \
-        if (count > expected) {                                         \
-            av_log(s->avctx, AV_LOG_ERROR, "Escape codeword not found, " \
-                   "probably corrupt data\n");                          \
-            return AVERROR_INVALIDDATA;                                 \
-        }                                                               \
-                                                                        \
-        coeff = dequant_and_decompand(level, s->quantisation);          \
-        for (i = 0; i < run; i++)                                       \
-            *coeff_data++ = coeff;                                      \
-    }                                                                   \
-
-static int read_highpass_coeffs(CFHDContext *s, GetByteContext *gb,
-                                int16_t *coeff_data)
-{
-    int i, ret;
-    int highpass_height       = s->plane[s->channel_num].band[s->level][s->subband_num].height;
-    int highpass_width        = s->plane[s->channel_num].band[s->level][s->subband_num].width;
-    int highpass_a_width      = s->plane[s->channel_num].band[s->level][s->subband_num].a_width;
-    int highpass_a_height     = s->plane[s->channel_num].band[s->level][s->subband_num].a_height;
-    ptrdiff_t highpass_stride = s->plane[s->channel_num].band[s->level][s->subband_num].stride;
-    int expected   = highpass_height   * highpass_stride;
-    int a_expected = highpass_a_height * highpass_a_width;
-    int count = 0;
-    unsigned bytes;
-
-    if (highpass_height > highpass_a_height ||
-        highpass_width  > highpass_a_width  ||
-        a_expected      < expected) {
-        av_log(s->avctx, AV_LOG_ERROR, "Too many highpass coefficients\n");
-        return AVERROR_INVALIDDATA;
-    }
+            /* Align to mod-4 position to continue reading tags */
+            bytestream2_seek(&gb, bytestream2_tell(&gb) & 3, SEEK_CUR);
 
-    av_log(s->avctx, AV_LOG_DEBUG,
-           "Start subband coeffs plane %i level %i codebook %i expected %i\n",
-           s->channel_num, s->level, s->codebook, expected);
+            /* Copy last line of coefficients if odd height */
+            if (lowpass_height & 1) {
+                memcpy(&coeff_data[lowpass_height * lowpass_width],
+                       &coeff_data[(lowpass_height - 1) * lowpass_width],
+                       lowpass_width * sizeof(*coeff_data));
+            }
 
-    if ((ret = bitstream_init8(&s->bc, gb->buffer,
-                               bytestream2_get_bytes_left(gb))) < 0)
-        return ret;
-    if (!s->codebook) {
-        DECODE_SUBBAND_COEFFS(table_9_rl_vlc, level == 64)
-    } else {
-        DECODE_SUBBAND_COEFFS(table_18_rl_vlc, level == 255 && run == 2)
-    }
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass coefficients %d\n", lowpass_width * lowpass_height);
+        }
 
-    bytes = FFALIGN(AV_CEIL_RSHIFT(bitstream_tell(&s->bc), 3), 4);
-    if (bytes > bytestream2_get_bytes_left(gb)) {
-        av_log(s->avctx, AV_LOG_ERROR, "Bitstream overread error\n");
-        return AVERROR_INVALIDDATA;
-    } else
-        bytestream2_seek(gb, bytes, SEEK_CUR);
+        if (tag == 55 && s->subband_num_actual != 255 && s->a_width && s->a_height) {
+            int highpass_height = s->plane[s->channel_num].band[s->level][s->subband_num].height;
+            int highpass_width  = s->plane[s->channel_num].band[s->level][s->subband_num].width;
+            int highpass_a_width = s->plane[s->channel_num].band[s->level][s->subband_num].a_width;
+            int highpass_a_height = s->plane[s->channel_num].band[s->level][s->subband_num].a_height;
+            int highpass_stride = s->plane[s->channel_num].band[s->level][s->subband_num].stride;
+            int expected;
+            int a_expected = highpass_a_height * highpass_a_width;
+            int level, run, coeff;
+            int count = 0, bytes;
+
+            if (!got_buffer) {
+                av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
 
-    av_log(s->avctx, AV_LOG_DEBUG, "End subband coeffs %i extra %i\n",
-           count, count - expected);
-    s->codebook = 0;
+            if (highpass_height > highpass_a_height || highpass_width > highpass_a_width || a_expected < highpass_height * (uint64_t)highpass_stride) {
+                av_log(avctx, AV_LOG_ERROR, "Too many highpass coefficients\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+            expected = highpass_height * highpass_stride;
+
+            av_log(avctx, AV_LOG_DEBUG, "Start subband coeffs plane %i level %i codebook %i expected %i\n", s->channel_num, s->level, s->codebook, expected);
+
+            init_get_bits(&s->gb, gb.buffer, bytestream2_get_bytes_left(&gb) * 8);
+            {
+                OPEN_READER(re, &s->gb);
+                if (!s->codebook) {
+                    while (1) {
+                        UPDATE_CACHE(re, &s->gb);
+                        GET_RL_VLC(level, run, re, &s->gb, s->table_9_rl_vlc,
+                                   VLC_BITS, 3, 1);
+
+                        /* escape */
+                        if (level == 64)
+                            break;
+
+                        count += run;
+
+                        if (count > expected)
+                            break;
+
+                        coeff = dequant_and_decompand(level, s->quantisation, 0);
+                        for (i = 0; i < run; i++)
+                            *coeff_data++ = coeff;
+                    }
+                } else {
+                    while (1) {
+                        UPDATE_CACHE(re, &s->gb);
+                        GET_RL_VLC(level, run, re, &s->gb, s->table_18_rl_vlc,
+                                   VLC_BITS, 3, 1);
+
+                        /* escape */
+                        if (level == 255 && run == 2)
+                            break;
+
+                        count += run;
+
+                        if (count > expected)
+                            break;
+
+                        coeff = dequant_and_decompand(level, s->quantisation, s->codebook);
+                        for (i = 0; i < run; i++)
+                            *coeff_data++ = coeff;
+                    }
+                }
+                CLOSE_READER(re, &s->gb);
+            }
 
-    /* Copy last coefficient line if height is odd. */
-    if (highpass_height & 1) {
-        int16_t *last_line = &coeff_data[expected];
-        memcpy(last_line, &last_line[-highpass_stride],
-               highpass_stride * sizeof(*coeff_data));
+            if (count > expected) {
+                av_log(avctx, AV_LOG_ERROR, "Escape codeword not found, probably corrupt data\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+            if (s->peak.level)
+                peak_table(coeff_data - count, &s->peak, count);
+            if (s->difference_coding)
+                difference_coding(s->plane[s->channel_num].subband[s->subband_num_actual], highpass_width, highpass_height);
+
+            bytes = FFALIGN(AV_CEIL_RSHIFT(get_bits_count(&s->gb), 3), 4);
+            if (bytes > bytestream2_get_bytes_left(&gb)) {
+                av_log(avctx, AV_LOG_ERROR, "Bitstream overread error\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            } else
+                bytestream2_seek(&gb, bytes, SEEK_CUR);
+
+            av_log(avctx, AV_LOG_DEBUG, "End subband coeffs %i extra %i\n", count, count - expected);
+            s->codebook = 0;
+
+            /* Copy last line of coefficients if odd height */
+            if (highpass_height & 1) {
+                memcpy(&coeff_data[highpass_height * highpass_stride],
+                       &coeff_data[(highpass_height - 1) * highpass_stride],
+                       highpass_stride * sizeof(*coeff_data));
+            }
+        }
     }
 
-    return 0;
-}
-
-static int reconstruct_level(CFHDContext *s, AVFrame *pic, int plane, int level)
-{
-    int i, j, idx = level - 1, idx2 = level > 1 ? 1 : 0;
-    int16_t *low, *high, *output, *dst;
-    int lowpass_height        = s->plane[plane].band[idx][idx2].height;
-    int lowpass_width         = s->plane[plane].band[idx][idx2].width;
-    ptrdiff_t highpass_stride = s->plane[plane].band[idx][1].stride;
-
-    if (lowpass_height                     > s->plane[plane].band[idx][idx2].a_height ||
-        lowpass_width                      > s->plane[plane].band[idx][idx2].a_width  ||
-        s->plane[plane].band[idx][1].width > s->plane[plane].band[idx][1].a_width     ||
-        !highpass_stride) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
-        return AVERROR_INVALIDDATA;
+    if (!s->a_width || !s->a_height || s->a_format == AV_PIX_FMT_NONE ||
+        s->coded_width || s->coded_height || s->coded_format != AV_PIX_FMT_NONE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid dimensions\n");
+        ret = AVERROR(EINVAL);
+        goto end;
     }
 
-    av_log(s->avctx, AV_LOG_DEBUG, "Level %d plane %i %i %i %ti\n",
-           level, plane, lowpass_height, lowpass_width, highpass_stride);
-
-    low    = s->plane[plane].subband[0];
-    high   = s->plane[plane].subband[2 + 3 * idx];
-    output = s->plane[plane].l_h[3 * idx];
-    for (i = 0; i < lowpass_width; i++) {
-        vert_filter(output, lowpass_width, low, lowpass_width, high,
-                    highpass_stride, lowpass_height);
-        low++;
-        high++;
-        output++;
+    if (!got_buffer) {
+        av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
+        ret = AVERROR(EINVAL);
+        goto end;
     }
 
-    low    = s->plane[plane].subband[1 + 3 * idx];
-    high   = s->plane[plane].subband[3 + 3 * idx];
-    output = s->plane[plane].l_h[1 + 3 * idx];
-    for (i = 0; i < lowpass_width; i++) {
-        // note the stride of "low" is highpass_stride
-        vert_filter(output, lowpass_width, low, highpass_stride, high,
-                    highpass_stride, lowpass_height);
-        low++;
-        high++;
-        output++;
-    }
+    planes = av_pix_fmt_count_planes(avctx->pix_fmt);
+    for (plane = 0; plane < planes && !ret; plane++) {
+        /* level 1 */
+        int lowpass_height  = s->plane[plane].band[0][0].height;
+        int lowpass_width   = s->plane[plane].band[0][0].width;
+        int highpass_stride = s->plane[plane].band[0][1].stride;
+        int act_plane = plane == 1 ? 2 : plane == 2 ? 1 : plane;
+        int16_t *low, *high, *output, *dst;
 
-    low  = s->plane[plane].l_h[0 + 3 * idx];
-    high = s->plane[plane].l_h[1 + 3 * idx];
+        if (lowpass_height > s->plane[plane].band[0][0].a_height || lowpass_width > s->plane[plane].band[0][0].a_width ||
+            !highpass_stride || s->plane[plane].band[0][1].width > s->plane[plane].band[0][1].a_width) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
+            ret = AVERROR(EINVAL);
+            goto end;
+        }
+
+        av_log(avctx, AV_LOG_DEBUG, "Decoding level 1 plane %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
+
+        low    = s->plane[plane].subband[0];
+        high   = s->plane[plane].subband[2];
+        output = s->plane[plane].l_h[0];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        low    = s->plane[plane].subband[1];
+        high   = s->plane[plane].subband[3];
+        output = s->plane[plane].l_h[1];
+
+        for (i = 0; i < lowpass_width; i++) {
+            // note the stride of "low" is highpass_stride
+            vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
 
-    if (level != 3) {
+        low    = s->plane[plane].l_h[0];
+        high   = s->plane[plane].l_h[1];
         output = s->plane[plane].subband[0];
         for (i = 0; i < lowpass_height * 2; i++) {
             horiz_filter(output, low, high, lowpass_width);
@@ -627,117 +783,150 @@ static int reconstruct_level(CFHDContext *s, AVFrame *pic, int plane, int level)
             high   += lowpass_width;
             output += lowpass_width * 2;
         }
-        if (s->bpc == 12 || level == 2) {
+        if (s->bpc == 12) {
             output = s->plane[plane].subband[0];
             for (i = 0; i < lowpass_height * 2; i++) {
                 for (j = 0; j < lowpass_width * 2; j++)
-                    output[j] <<= 2;
+                    output[j] *= 4;
 
                 output += lowpass_width * 2;
             }
         }
-    } else {
-        int act_plane = plane == 1 ? 2 : plane == 2 ? 1 : plane;
-        dst = (int16_t *)pic->data[act_plane];
-        for (i = 0; i < lowpass_height * 2; i++) {
-            horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
-            low  += lowpass_width;
-            high += lowpass_width;
-            dst  += pic->linesize[act_plane] / 2;
-        }
-    }
 
-    return 0;
-}
-
-static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
-                       AVPacket *avpkt)
-{
-    CFHDContext *s = avctx->priv_data;
-    GetByteContext gb;
-    ThreadFrame frame = { .f = data };
-    int ret = 0, planes, plane;
-    int16_t tag;
-    uint16_t value;
+        /* level 2 */
+        lowpass_height  = s->plane[plane].band[1][1].height;
+        lowpass_width   = s->plane[plane].band[1][1].width;
+        highpass_stride = s->plane[plane].band[1][1].stride;
+
+        if (lowpass_height > s->plane[plane].band[1][1].a_height || lowpass_width > s->plane[plane].band[1][1].a_width ||
+            !highpass_stride || s->plane[plane].band[1][1].width > s->plane[plane].band[1][1].a_width) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
+            ret = AVERROR(EINVAL);
+            goto end;
+        }
 
-    init_frame_defaults(s);
-    planes = av_pix_fmt_count_planes(s->coded_format);
+        av_log(avctx, AV_LOG_DEBUG, "Level 2 plane %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
 
-    bytestream2_init(&gb, avpkt->data, avpkt->size);
-
-    while (bytestream2_get_bytes_left(&gb) > 4) {
-        if ((ret = parse_tag(s, &gb, &tag, &value, &planes)) < 0)
-            return ret;
+        low    = s->plane[plane].subband[0];
+        high   = s->plane[plane].subband[5];
+        output = s->plane[plane].l_h[3];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
 
-        /* Some kind of end of header tag */
-        if (tag == 4 && value == 0x1A4A)
-            break;
-    }
+        low    = s->plane[plane].subband[4];
+        high   = s->plane[plane].subband[6];
+        output = s->plane[plane].l_h[4];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
 
-    if (s->coded_width <= 0 || s->coded_height <= 0 || s->coded_format == AV_PIX_FMT_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Video dimensions/format missing or invalid\n");
-        return AVERROR_INVALIDDATA;
-    }
+        low    = s->plane[plane].l_h[3];
+        high   = s->plane[plane].l_h[4];
+        output = s->plane[plane].subband[0];
+        for (i = 0; i < lowpass_height * 2; i++) {
+            horiz_filter(output, low, high, lowpass_width);
+            low    += lowpass_width;
+            high   += lowpass_width;
+            output += lowpass_width * 2;
+        }
 
-    ret = ff_set_dimensions(s->avctx, s->coded_width, s->coded_height);
-    if (ret < 0)
-        return ret;
-    if (s->cropped_height)
-        s->avctx->height = s->cropped_height;
+        output = s->plane[plane].subband[0];
+        for (i = 0; i < lowpass_height * 2; i++) {
+            for (j = 0; j < lowpass_width * 2; j++)
+                output[j] *= 4;
 
-    s->avctx->pix_fmt = s->coded_format;
+            output += lowpass_width * 2;
+        }
 
-    if (s->a_width != s->coded_width || s->a_height != s->coded_height ||
-        s->a_format != s->coded_format) {
-        free_buffers(s);
-        if ((ret = alloc_buffers(s)) < 0) {
-            free_buffers(s);
-            return ret;
+        /* level 3 */
+        lowpass_height  = s->plane[plane].band[2][1].height;
+        lowpass_width   = s->plane[plane].band[2][1].width;
+        highpass_stride = s->plane[plane].band[2][1].stride;
+
+        if (lowpass_height > s->plane[plane].band[2][1].a_height || lowpass_width > s->plane[plane].band[2][1].a_width ||
+            !highpass_stride || s->plane[plane].band[2][1].width > s->plane[plane].band[2][1].a_width) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
+            ret = AVERROR(EINVAL);
+            goto end;
         }
-    }
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
-        return ret;
+        av_log(avctx, AV_LOG_DEBUG, "Level 3 plane %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
+        if (s->progressive) {
+            low    = s->plane[plane].subband[0];
+            high   = s->plane[plane].subband[8];
+            output = s->plane[plane].l_h[6];
+            for (i = 0; i < lowpass_width; i++) {
+                vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
+                low++;
+                high++;
+                output++;
+            }
 
-    s->coded_width  = 0;
-    s->coded_height = 0;
-    s->coded_format = AV_PIX_FMT_NONE;
+            low    = s->plane[plane].subband[7];
+            high   = s->plane[plane].subband[9];
+            output = s->plane[plane].l_h[7];
+            for (i = 0; i < lowpass_width; i++) {
+                vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
+                low++;
+                high++;
+                output++;
+            }
 
-    while (bytestream2_get_bytes_left(&gb) > 4) {
-        int16_t *coeff_data;
+            dst = (int16_t *)pic->data[act_plane];
+            low  = s->plane[plane].l_h[6];
+            high = s->plane[plane].l_h[7];
+            for (i = 0; i < lowpass_height * 2; i++) {
+                horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                low  += lowpass_width;
+                high += lowpass_width;
+                dst  += pic->linesize[act_plane] / 2;
+            }
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "interlaced frame ? %d", pic->interlaced_frame);
+            pic->interlaced_frame = 1;
+            low    = s->plane[plane].subband[0];
+            high   = s->plane[plane].subband[7];
+            output = s->plane[plane].l_h[6];
+            for (i = 0; i < lowpass_height; i++) {
+                horiz_filter(output, low, high, lowpass_width);
+                low    += lowpass_width;
+                high   += lowpass_width;
+                output += lowpass_width * 2;
+            }
 
-        if ((ret = parse_tag(s, &gb, &tag, &value, &planes)) < 0)
-            return ret;
+            low    = s->plane[plane].subband[8];
+            high   = s->plane[plane].subband[9];
+            output = s->plane[plane].l_h[7];
+            for (i = 0; i < lowpass_height; i++) {
+                horiz_filter(output, low, high, lowpass_width);
+                low    += lowpass_width;
+                high   += lowpass_width;
+                output += lowpass_width * 2;
+            }
 
-        coeff_data = s->plane[s->channel_num].subband[s->subband_num_actual];
-        if (tag == 4 && value == 0x0F0F) {
-            if ((ret = read_lowpass_coeffs(s, &gb, coeff_data)) < 0)
-                return ret;
-        } else if (tag == 55 && s->subband_num_actual != 255) {
-            if ((ret = read_highpass_coeffs(s, &gb, coeff_data)) < 0)
-                return ret;
+            dst  = (int16_t *)pic->data[act_plane];
+            low  = s->plane[plane].l_h[6];
+            high = s->plane[plane].l_h[7];
+            for (i = 0; i < lowpass_height; i++) {
+                interlaced_vertical_filter(dst, low, high, lowpass_width * 2,  pic->linesize[act_plane]/2, act_plane);
+                low  += lowpass_width * 2;
+                high += lowpass_width * 2;
+                dst  += pic->linesize[act_plane];
+            }
         }
     }
 
-    if (s->coded_width || s->coded_height || s->coded_format != AV_PIX_FMT_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid dimensions\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    planes = av_pix_fmt_count_planes(avctx->pix_fmt);
-    for (plane = 0; plane < planes; plane++) {
-        /* level 1 */
-        if ((ret = reconstruct_level(s, data, plane, 1)) < 0)
-            return ret;
-
-        /* level 2 */
-        if ((ret = reconstruct_level(s, data, plane, 2)) < 0)
-            return ret;
 
-        /* level 3 */
-        if ((ret = reconstruct_level(s, data, plane, 3)) < 0)
-            return ret;
-    }
+end:
+    if (ret < 0)
+        return ret;
 
     *got_frame = 1;
     return avpkt->size;
@@ -749,8 +938,10 @@ static av_cold int cfhd_close(AVCodecContext *avctx)
 
     free_buffers(s);
 
-    ff_free_vlc(&s->vlc_9);
-    ff_free_vlc(&s->vlc_18);
+    if (!avctx->internal->is_copy) {
+        ff_free_vlc(&s->vlc_9);
+        ff_free_vlc(&s->vlc_18);
+    }
 
     return 0;
 }
@@ -762,7 +953,6 @@ AVCodec ff_cfhd_decoder = {
     .id               = AV_CODEC_ID_CFHD,
     .priv_data_size   = sizeof(CFHDContext),
     .init             = cfhd_init,
-    .init_thread_copy = ONLY_IF_THREADS_ENABLED(cfhd_init),
     .close            = cfhd_close,
     .decode           = cfhd_decode,
     .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
diff --git a/libavcodec/cfhd.h b/libavcodec/cfhd.h
index 27f047d..4f2c82d 100644
--- a/libavcodec/cfhd.h
+++ b/libavcodec/cfhd.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Kieran Kunhya
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,11 @@
 
 #include <stdint.h>
 
+#include "libavutil/avassert.h"
+
 #include "avcodec.h"
-#include "bitstream.h"
+#include "bytestream.h"
+#include "get_bits.h"
 #include "vlc.h"
 
 #define VLC_BITS       9
@@ -66,6 +69,12 @@ typedef struct Plane {
     SubBand band[DWT_LEVELS][4];
 } Plane;
 
+typedef struct Peak {
+    int level;
+    int offset;
+    GetByteContext base;
+} Peak;
+
 typedef struct CFHDContext {
     AVCodecContext *avctx;
 
@@ -75,12 +84,13 @@ typedef struct CFHDContext {
     CFHD_RL_VLC_ELEM table_18_rl_vlc[4572];
     VLC vlc_18;
 
-    BitstreamContext bc;
+    GetBitContext gb;
 
     int coded_width;
     int coded_height;
     int cropped_height;
     enum AVPixelFormat coded_format;
+    int progressive;
 
     int a_width;
     int a_height;
@@ -96,12 +106,14 @@ typedef struct CFHDContext {
     int pshift;
 
     int codebook;
+    int difference_coding;
     int subband_num;
     int level;
     int subband_num_actual;
 
     uint8_t prescale_shift[3];
     Plane plane[4];
+    Peak peak;
 } CFHDContext;
 
 int ff_cfhd_init_vlcs(CFHDContext *s);
diff --git a/libavcodec/cfhddata.c b/libavcodec/cfhddata.c
index 3a9d6bd..5df68d4 100644
--- a/libavcodec/cfhddata.c
+++ b/libavcodec/cfhddata.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Kieran Kunhya
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,14 @@
 
 #include "cfhd.h"
 
+/* some special codewords, not sure what they all mean */
+#define TABLE_9_BAND_END1 0x1C7859Eh
+#define TABLE_9_BAND_END_LEN1 25
+#define TABLE_9_BAND_END2 0x38F0B3Fh
+#define TABLE_9_BAND_END_LEN2 26
+#define TABLE_9_BAND_END3 0x38F0B3Eh
+#define TABLE_9_BAND_END_LEN3 26
+
 #define NB_VLC_TABLE_9   (71 + 3)
 #define NB_VLC_TABLE_18 (263 + 1)
 
@@ -265,73 +273,99 @@ static const uint8_t table_18_vlc_level[NB_VLC_TABLE_18] = {
     220,  195,  161,  231,  173,  226,  116,  255,
 };
 
-static int init_vlc_signed(VLC *vlc, CFHD_RL_VLC_ELEM table_rl_vlc[],
-                           unsigned bound,
-                           const uint32_t table_vlc_bits[],
-                           const uint8_t table_vlc_len[],
-                           const uint16_t table_vlc_run[],
-                           const uint8_t table_vlc_level[])
+av_cold int ff_cfhd_init_vlcs(CFHDContext *s)
 {
-    uint32_t  vlc_bits[NB_VLC_TABLE_18 * 2];
-    uint8_t    vlc_len[NB_VLC_TABLE_18 * 2];
-    uint16_t   vlc_run[NB_VLC_TABLE_18 * 2];
-    int16_t  vlc_level[NB_VLC_TABLE_18 * 2];
-    unsigned i, j;
-    int ret;
+    int i, j, ret = 0;
+    uint32_t new_cfhd_vlc_bits[NB_VLC_TABLE_18 * 2];
+    uint8_t  new_cfhd_vlc_len[NB_VLC_TABLE_18 * 2];
+    uint16_t new_cfhd_vlc_run[NB_VLC_TABLE_18 * 2];
+    int16_t  new_cfhd_vlc_level[NB_VLC_TABLE_18 * 2];
+
+    /** Similar to dv.c, generate signed VLC tables **/
 
-    for (i = 0, j = 0; i < bound; i++, j++) {
-        vlc_bits[j]  = table_vlc_bits[i];
-        vlc_len[j]   = table_vlc_len[i];
-        vlc_run[j]   = table_vlc_run[i];
-        vlc_level[j] = table_vlc_level[i];
+    /* Table 9 */
+    for (i = 0, j = 0; i < NB_VLC_TABLE_9; i++, j++) {
+        new_cfhd_vlc_bits[j]  = table_9_vlc_bits[i];
+        new_cfhd_vlc_len[j]   = table_9_vlc_len[i];
+        new_cfhd_vlc_run[j]   = table_9_vlc_run[i];
+        new_cfhd_vlc_level[j] = table_9_vlc_level[i];
 
         /* Don't include the zero level nor escape bits */
-        if (table_vlc_level[i] &&
-            vlc_bits[j] != table_vlc_bits[bound - 1]) {
-            vlc_bits[j] <<= 1;
-            vlc_len[j]++;
+        if (table_9_vlc_level[i] &&
+            new_cfhd_vlc_bits[j] != table_9_vlc_bits[NB_VLC_TABLE_9-1]) {
+            new_cfhd_vlc_bits[j] <<= 1;
+            new_cfhd_vlc_len[j]++;
             j++;
-            vlc_bits[j]  = (table_vlc_bits[i] << 1) | 1;
-            vlc_len[j]   =  table_vlc_len[i] + 1;
-            vlc_run[j]   =  table_vlc_run[i];
-            vlc_level[j] = -table_vlc_level[i];
+            new_cfhd_vlc_bits[j]  = (table_9_vlc_bits[i] << 1) | 1;
+            new_cfhd_vlc_len[j]   =  table_9_vlc_len[i] + 1;
+            new_cfhd_vlc_run[j]   =  table_9_vlc_run[i];
+            new_cfhd_vlc_level[j] = -table_9_vlc_level[i];
         }
     }
 
-    if ((ret = init_vlc(vlc, VLC_BITS, j, vlc_len, 1, 1, vlc_bits, 4, 4, 0)) < 0)
+    ret = init_vlc(&s->vlc_9, VLC_BITS, j, new_cfhd_vlc_len,
+                   1, 1, new_cfhd_vlc_bits, 4, 4, 0);
+    if (ret < 0)
         return ret;
-
-    for (i = 0; i < (*vlc).table_size; i++) {
-        int code = (*vlc).table[i][0];
-        int len  = (*vlc).table[i][1];
+    for (i = 0; i < s->vlc_9.table_size; i++) {
+        int code = s->vlc_9.table[i][0];
+        int len  = s->vlc_9.table[i][1];
         int level, run;
 
         if (len < 0) { // more bits needed
             run   = 0;
             level = code;
         } else {
-            run   = vlc_run[code];
-            level = vlc_level[code];
+            run   = new_cfhd_vlc_run[code];
+            level = new_cfhd_vlc_level[code];
         }
-        table_rl_vlc[i].len   = len;
-        table_rl_vlc[i].level = level;
-        table_rl_vlc[i].run   = run;
+        s->table_9_rl_vlc[i].len   = len;
+        s->table_9_rl_vlc[i].level = level;
+        s->table_9_rl_vlc[i].run   = run;
     }
 
-    return 0;
-}
+    /* Table 18 */
+    for (i = 0, j = 0; i < NB_VLC_TABLE_18; i++, j++) {
+        new_cfhd_vlc_bits[j]  = table_18_vlc_bits[i];
+        new_cfhd_vlc_len[j]   = table_18_vlc_len[i];
+        new_cfhd_vlc_run[j]   = table_18_vlc_run[i];
+        new_cfhd_vlc_level[j] = table_18_vlc_level[i];
 
-av_cold int ff_cfhd_init_vlcs(CFHDContext *s)
-{
-    /** Similar to dv.c, generate signed VLC tables **/
+        /* Don't include the zero level nor escape bits */
+        if (table_18_vlc_level[i] &&
+            new_cfhd_vlc_bits[j] != table_18_vlc_bits[NB_VLC_TABLE_18-1]) {
+            new_cfhd_vlc_bits[j] <<= 1;
+            new_cfhd_vlc_len[j]++;
+            j++;
+            new_cfhd_vlc_bits[j]  = (table_18_vlc_bits[i] << 1) | 1;
+            new_cfhd_vlc_len[j]   =  table_18_vlc_len[i] + 1;
+            new_cfhd_vlc_run[j]   =  table_18_vlc_run[i];
+            new_cfhd_vlc_level[j] = -table_18_vlc_level[i];
+        }
+    }
 
-    int ret = init_vlc_signed(&s->vlc_9, s->table_9_rl_vlc, NB_VLC_TABLE_9,
-                              table_9_vlc_bits, table_9_vlc_len,
-                              table_9_vlc_run, table_9_vlc_level);
+    ret = init_vlc(&s->vlc_18, VLC_BITS, j, new_cfhd_vlc_len,
+                   1, 1, new_cfhd_vlc_bits, 4, 4, 0);
     if (ret < 0)
         return ret;
+    av_assert0(s->vlc_18.table_size == 4572);
+
+    for (i = 0; i < s->vlc_18.table_size; i++) {
+        int code = s->vlc_18.table[i][0];
+        int len  = s->vlc_18.table[i][1];
+        int level, run;
+
+        if (len < 0) { // more bits needed
+            run   = 0;
+            level = code;
+        } else {
+            run   = new_cfhd_vlc_run[code];
+            level = new_cfhd_vlc_level[code];
+        }
+        s->table_18_rl_vlc[i].len   = len;
+        s->table_18_rl_vlc[i].level = level;
+        s->table_18_rl_vlc[i].run   = run;
+    }
 
-    return init_vlc_signed(&s->vlc_18, s->table_18_rl_vlc, NB_VLC_TABLE_18,
-                           table_18_vlc_bits, table_18_vlc_len,
-                           table_18_vlc_run, table_18_vlc_level);
+    return ret;
 }
diff --git a/libavcodec/cga_data.c b/libavcodec/cga_data.c
index 2c63ff2..023a86b 100644
--- a/libavcodec/cga_data.c
+++ b/libavcodec/cga_data.c
@@ -1,435 +1,46 @@
 /*
  * CGA/EGA/VGA ROM data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * CGA/EGA/VGA ROM data
+ * @note fonts are in libavutil/xga_font_data.[ch]
  */
 
 #include <stdint.h>
 #include "cga_data.h"
 
-const uint8_t ff_cga_font[2048] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0xbd, 0x99, 0x81, 0x7e,
- 0x7e, 0xff, 0xdb, 0xff, 0xc3, 0xe7, 0xff, 0x7e, 0x6c, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, 0x00,
- 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x38, 0x7c, 0x38, 0xfe, 0xfe, 0x7c, 0x38, 0x7c,
- 0x10, 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x7c, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x18, 0x00, 0x00,
- 0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00,
- 0xff, 0xc3, 0x99, 0xbd, 0xbd, 0x99, 0xc3, 0xff, 0x0f, 0x07, 0x0f, 0x7d, 0xcc, 0xcc, 0xcc, 0x78,
- 0x3c, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x70, 0xf0, 0xe0,
- 0x7f, 0x63, 0x7f, 0x63, 0x63, 0x67, 0xe6, 0xc0, 0x99, 0x5a, 0x3c, 0xe7, 0xe7, 0x3c, 0x5a, 0x99,
- 0x80, 0xe0, 0xf8, 0xfe, 0xf8, 0xe0, 0x80, 0x00, 0x02, 0x0e, 0x3e, 0xfe, 0x3e, 0x0e, 0x02, 0x00,
- 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x66, 0x00,
- 0x7f, 0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x00, 0x3e, 0x63, 0x38, 0x6c, 0x6c, 0x38, 0xcc, 0x78,
- 0x00, 0x00, 0x00, 0x00, 0x7e, 0x7e, 0x7e, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x7e, 0x3c, 0x18, 0xff,
- 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00,
- 0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00,
- 0x00, 0x00, 0xc0, 0xc0, 0xc0, 0xfe, 0x00, 0x00, 0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00,
- 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x00, 0x00, 0x00, 0xff, 0xff, 0x7e, 0x3c, 0x18, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x78, 0x78, 0x30, 0x30, 0x00, 0x30, 0x00,
- 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6c, 0x6c, 0xfe, 0x6c, 0xfe, 0x6c, 0x6c, 0x00,
- 0x30, 0x7c, 0xc0, 0x78, 0x0c, 0xf8, 0x30, 0x00, 0x00, 0xc6, 0xcc, 0x18, 0x30, 0x66, 0xc6, 0x00,
- 0x38, 0x6c, 0x38, 0x76, 0xdc, 0xcc, 0x76, 0x00, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x30, 0x60, 0x60, 0x60, 0x30, 0x18, 0x00, 0x60, 0x30, 0x18, 0x18, 0x18, 0x30, 0x60, 0x00,
- 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, 0x00, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00,
- 0x7c, 0xc6, 0xce, 0xde, 0xf6, 0xe6, 0x7c, 0x00, 0x30, 0x70, 0x30, 0x30, 0x30, 0x30, 0xfc, 0x00,
- 0x78, 0xcc, 0x0c, 0x38, 0x60, 0xcc, 0xfc, 0x00, 0x78, 0xcc, 0x0c, 0x38, 0x0c, 0xcc, 0x78, 0x00,
- 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, 0x0c, 0x1e, 0x00, 0xfc, 0xc0, 0xf8, 0x0c, 0x0c, 0xcc, 0x78, 0x00,
- 0x38, 0x60, 0xc0, 0xf8, 0xcc, 0xcc, 0x78, 0x00, 0xfc, 0xcc, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x00,
- 0x78, 0xcc, 0xcc, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x78, 0xcc, 0xcc, 0x7c, 0x0c, 0x18, 0x70, 0x00,
- 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x60,
- 0x18, 0x30, 0x60, 0xc0, 0x60, 0x30, 0x18, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0xfc, 0x00, 0x00,
- 0x60, 0x30, 0x18, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x78, 0xcc, 0x0c, 0x18, 0x30, 0x00, 0x30, 0x00,
- 0x7c, 0xc6, 0xde, 0xde, 0xde, 0xc0, 0x78, 0x00, 0x30, 0x78, 0xcc, 0xcc, 0xfc, 0xcc, 0xcc, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x66, 0x66, 0xfc, 0x00, 0x3c, 0x66, 0xc0, 0xc0, 0xc0, 0x66, 0x3c, 0x00,
- 0xf8, 0x6c, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0xfe, 0x62, 0x68, 0x78, 0x68, 0x62, 0xfe, 0x00,
- 0xfe, 0x62, 0x68, 0x78, 0x68, 0x60, 0xf0, 0x00, 0x3c, 0x66, 0xc0, 0xc0, 0xce, 0x66, 0x3e, 0x00,
- 0xcc, 0xcc, 0xcc, 0xfc, 0xcc, 0xcc, 0xcc, 0x00, 0x78, 0x30, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x1e, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0x78, 0x00, 0xe6, 0x66, 0x6c, 0x78, 0x6c, 0x66, 0xe6, 0x00,
- 0xf0, 0x60, 0x60, 0x60, 0x62, 0x66, 0xfe, 0x00, 0xc6, 0xee, 0xfe, 0xfe, 0xd6, 0xc6, 0xc6, 0x00,
- 0xc6, 0xe6, 0xf6, 0xde, 0xce, 0xc6, 0xc6, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x38, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xdc, 0x78, 0x1c, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x6c, 0x66, 0xe6, 0x00, 0x78, 0xcc, 0xe0, 0x70, 0x1c, 0xcc, 0x78, 0x00,
- 0xfc, 0xb4, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xfc, 0x00,
- 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x00, 0xc6, 0xc6, 0xc6, 0xd6, 0xfe, 0xee, 0xc6, 0x00,
- 0xc6, 0xc6, 0x6c, 0x38, 0x38, 0x6c, 0xc6, 0x00, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x30, 0x78, 0x00,
- 0xfe, 0xc6, 0x8c, 0x18, 0x32, 0x66, 0xfe, 0x00, 0x78, 0x60, 0x60, 0x60, 0x60, 0x60, 0x78, 0x00,
- 0xc0, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x02, 0x00, 0x78, 0x18, 0x18, 0x18, 0x18, 0x18, 0x78, 0x00,
- 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
- 0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x76, 0x00,
- 0xe0, 0x60, 0x60, 0x7c, 0x66, 0x66, 0xdc, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xc0, 0xcc, 0x78, 0x00,
- 0x1c, 0x0c, 0x0c, 0x7c, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00,
- 0x38, 0x6c, 0x60, 0xf0, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8,
- 0xe0, 0x60, 0x6c, 0x76, 0x66, 0x66, 0xe6, 0x00, 0x30, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x0c, 0x00, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0x78, 0xe0, 0x60, 0x66, 0x6c, 0x78, 0x6c, 0xe6, 0x00,
- 0x70, 0x30, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00, 0x00, 0x00, 0xcc, 0xfe, 0xfe, 0xd6, 0xc6, 0x00,
- 0x00, 0x00, 0xf8, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0xdc, 0x66, 0x66, 0x7c, 0x60, 0xf0, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0x7c, 0x0c, 0x1e,
- 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x7c, 0xc0, 0x78, 0x0c, 0xf8, 0x00,
- 0x10, 0x30, 0x7c, 0x30, 0x30, 0x34, 0x18, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00,
- 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x00, 0x00, 0x00, 0xc6, 0xd6, 0xfe, 0xfe, 0x6c, 0x00,
- 0x00, 0x00, 0xc6, 0x6c, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8,
- 0x00, 0x00, 0xfc, 0x98, 0x30, 0x64, 0xfc, 0x00, 0x1c, 0x30, 0x30, 0xe0, 0x30, 0x30, 0x1c, 0x00,
- 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00, 0xe0, 0x30, 0x30, 0x1c, 0x30, 0x30, 0xe0, 0x00,
- 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0x00,
- 0x78, 0xcc, 0xc0, 0xcc, 0x78, 0x18, 0x0c, 0x78, 0x00, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x1c, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00, 0x7e, 0xc3, 0x3c, 0x06, 0x3e, 0x66, 0x3f, 0x00,
- 0xcc, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0xe0, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00,
- 0x30, 0x30, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0x00, 0x00, 0x78, 0xc0, 0xc0, 0x78, 0x0c, 0x38,
- 0x7e, 0xc3, 0x3c, 0x66, 0x7e, 0x60, 0x3c, 0x00, 0xcc, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00,
- 0xe0, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00, 0xcc, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x7c, 0xc6, 0x38, 0x18, 0x18, 0x18, 0x3c, 0x00, 0xe0, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0xc6, 0x38, 0x6c, 0xc6, 0xfe, 0xc6, 0xc6, 0x00, 0x30, 0x30, 0x00, 0x78, 0xcc, 0xfc, 0xcc, 0x00,
- 0x1c, 0x00, 0xfc, 0x60, 0x78, 0x60, 0xfc, 0x00, 0x00, 0x00, 0x7f, 0x0c, 0x7f, 0xcc, 0x7f, 0x00,
- 0x3e, 0x6c, 0xcc, 0xfe, 0xcc, 0xcc, 0xce, 0x00, 0x78, 0xcc, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0xcc, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0xe0, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00,
- 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00, 0x00, 0xe0, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x00, 0xcc, 0x00, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8, 0xc3, 0x18, 0x3c, 0x66, 0x66, 0x3c, 0x18, 0x00,
- 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x18, 0x18, 0x7e, 0xc0, 0xc0, 0x7e, 0x18, 0x18,
- 0x38, 0x6c, 0x64, 0xf0, 0x60, 0xe6, 0xfc, 0x00, 0xcc, 0xcc, 0x78, 0xfc, 0x30, 0xfc, 0x30, 0x30,
- 0xf8, 0xcc, 0xcc, 0xfa, 0xc6, 0xcf, 0xc6, 0xc7, 0x0e, 0x1b, 0x18, 0x3c, 0x18, 0x18, 0xd8, 0x70,
- 0x1c, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0x38, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x00, 0x1c, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x1c, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x00, 0xf8, 0x00, 0xf8, 0xcc, 0xcc, 0xcc, 0x00, 0xfc, 0x00, 0xcc, 0xec, 0xfc, 0xdc, 0xcc, 0x00,
- 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x7c, 0x00, 0x00,
- 0x30, 0x00, 0x30, 0x60, 0xc0, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00, 0xfc, 0xc0, 0xc0, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0xfc, 0x0c, 0x0c, 0x00, 0x00, 0xc3, 0xc6, 0xcc, 0xde, 0x33, 0x66, 0xcc, 0x0f,
- 0xc3, 0xc6, 0xcc, 0xdb, 0x37, 0x6f, 0xcf, 0x03, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00,
- 0x00, 0x33, 0x66, 0xcc, 0x66, 0x33, 0x00, 0x00, 0x00, 0xcc, 0x66, 0x33, 0x66, 0xcc, 0x00, 0x00,
- 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
- 0xdb, 0x77, 0xdb, 0xee, 0xdb, 0x77, 0xdb, 0xee, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x36, 0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0xfe, 0x06, 0xf6, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, 0x37, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0xf7, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x3f, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0xc8, 0xdc, 0x76, 0x00, 0x00, 0x78, 0xcc, 0xf8, 0xcc, 0xf8, 0xc0, 0xc0,
- 0x00, 0xfc, 0xcc, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00,
- 0xfc, 0xcc, 0x60, 0x30, 0x60, 0xcc, 0xfc, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, 0xd8, 0x70, 0x00,
- 0x00, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0xc0, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x00,
- 0xfc, 0x30, 0x78, 0xcc, 0xcc, 0x78, 0x30, 0xfc, 0x38, 0x6c, 0xc6, 0xfe, 0xc6, 0x6c, 0x38, 0x00,
- 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x6c, 0xee, 0x00, 0x1c, 0x30, 0x18, 0x7c, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0x7e, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x06, 0x0c, 0x7e, 0xdb, 0xdb, 0x7e, 0x60, 0xc0,
- 0x38, 0x60, 0xc0, 0xf8, 0xc0, 0x60, 0x38, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00,
- 0x00, 0xfc, 0x00, 0xfc, 0x00, 0xfc, 0x00, 0x00, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x00, 0xfc, 0x00,
- 0x60, 0x30, 0x18, 0x30, 0x60, 0x00, 0xfc, 0x00, 0x18, 0x30, 0x60, 0x30, 0x18, 0x00, 0xfc, 0x00,
- 0x0e, 0x1b, 0x1b, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0x70,
- 0x30, 0x30, 0x00, 0xfc, 0x00, 0x30, 0x30, 0x00, 0x00, 0x76, 0xdc, 0x00, 0x76, 0xdc, 0x00, 0x00,
- 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x3c, 0x1c,
- 0x78, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x70, 0x18, 0x30, 0x60, 0x78, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x3c, 0x3c, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-};
-
-const uint8_t ff_vga16_font[4096] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd, 0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7e, 0xff, 0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xc3, 0x99, 0xbd, 0xbd, 0x99, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x1e, 0x0e, 0x1a, 0x32, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x30, 0x30, 0x70, 0xf0, 0xe0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7f, 0x63, 0x7f, 0x63, 0x63, 0x63, 0x63, 0x67, 0xe7, 0xe6, 0xc0, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x18, 0xdb, 0x3c, 0xe7, 0x3c, 0xdb, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfe, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x02, 0x06, 0x0e, 0x1e, 0x3e, 0xfe, 0x3e, 0x1e, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7f, 0xdb, 0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x7c, 0xc6, 0x60, 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0xc0, 0xc0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x38, 0x7c, 0x7c, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0x7c, 0x7c, 0x38, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x66, 0x66, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x7c, 0xc6, 0xc2, 0xc0, 0x7c, 0x06, 0x06, 0x86, 0xc6, 0x7c, 0x18, 0x18, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc6, 0x0c, 0x18, 0x30, 0x60, 0xc6, 0x86, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x76, 0xdc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x30, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x18, 0x0c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x30, 0x18, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc3, 0xc3, 0xdb, 0xdb, 0xc3, 0xc3, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x38, 0x78, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0x06, 0x06, 0x3c, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0c, 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, 0x0c, 0x0c, 0x0c, 0x1e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc0, 0xc0, 0xc0, 0xfc, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x60, 0xc0, 0xc0, 0xfc, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc6, 0x06, 0x06, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x06, 0x06, 0x0c, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x0c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xde, 0xde, 0xde, 0xdc, 0xc0, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x66, 0x66, 0x66, 0x66, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xf8, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xde, 0xc6, 0xc6, 0x66, 0x3a, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1e, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xe6, 0x66, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xf0, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xe7, 0xff, 0xff, 0xdb, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xd6, 0xde, 0x7c, 0x0c, 0x0e, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x6c, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x60, 0x38, 0x0c, 0x06, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xff, 0xdb, 0x99, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, 0x3c, 0x66, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xff, 0xc3, 0x86, 0x0c, 0x18, 0x30, 0x60, 0xc1, 0xc3, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x80, 0xc0, 0xe0, 0x70, 0x38, 0x1c, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
- 0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x78, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc0, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1c, 0x0c, 0x0c, 0x3c, 0x6c, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x6c, 0x76, 0x66, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x06, 0x06, 0x00, 0x0e, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x66, 0x66, 0x3c, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0x0c, 0x1e, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0x60, 0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x10, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x30, 0x30, 0x36, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0x3c, 0x66, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0xf8, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xcc, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0e, 0x18, 0x18, 0x18, 0x70, 0x18, 0x18, 0x18, 0x18, 0x0e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x70, 0x18, 0x18, 0x18, 0x0e, 0x18, 0x18, 0x18, 0x18, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x0c, 0x06, 0x7c, 0x00, 0x00,
- 0x00, 0x00, 0xcc, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0c, 0x18, 0x30, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xcc, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x38, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x60, 0x60, 0x66, 0x3c, 0x0c, 0x06, 0x3c, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x66, 0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x3c, 0x66, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc6, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x38, 0x6c, 0x38, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x30, 0x60, 0x00, 0xfe, 0x66, 0x60, 0x7c, 0x60, 0x60, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0x3b, 0x1b, 0x7e, 0xd8, 0xdc, 0x77, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3e, 0x6c, 0xcc, 0xcc, 0xfe, 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x30, 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0x78, 0x00,
- 0x00, 0xc6, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc6, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x18, 0x7e, 0xc3, 0xc0, 0xc0, 0xc0, 0xc3, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xe6, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xfc, 0x66, 0x66, 0x7c, 0x62, 0x66, 0x6f, 0x66, 0x66, 0x66, 0xf3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0e, 0x1b, 0x18, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0x70, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0c, 0x18, 0x30, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x76, 0xdc, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x30, 0x30, 0x00, 0x30, 0x30, 0x60, 0xc0, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x60, 0xce, 0x9b, 0x06, 0x0c, 0x1f, 0x00, 0x00,
- 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x66, 0xce, 0x96, 0x3e, 0x06, 0x06, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x6c, 0xd8, 0x6c, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, 0x6c, 0x36, 0x6c, 0xd8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44,
- 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
- 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0xd8, 0xd8, 0xd8, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xd8, 0xcc, 0xc6, 0xc6, 0xc6, 0xcc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc6, 0xc6, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0xfe, 0xc6, 0x60, 0x30, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7e, 0x18, 0x3c, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x6c, 0x6c, 0x6c, 0xee, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1e, 0x30, 0x18, 0x0c, 0x3e, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xdb, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x03, 0x06, 0x7e, 0xdb, 0xdb, 0xf3, 0x7e, 0x60, 0xc0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1c, 0x30, 0x60, 0x60, 0x7c, 0x60, 0x60, 0x60, 0x30, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0e, 0x1b, 0x1b, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x7e, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0f, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x6c, 0x3c, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xd8, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x70, 0xd8, 0x30, 0x60, 0xc8, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
 const uint32_t ff_cga_palette[16] = {
-    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA, 0xAA0000, 0xAA00AA, 0xAA5500, 0xAAAAAA,
-    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF, 0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF,
+    0xFF000000, 0xFF0000AA, 0xFF00AA00, 0xFF00AAAA, 0xFFAA0000, 0xFFAA00AA, 0xFFAA5500, 0xFFAAAAAA,
+    0xFF555555, 0xFF5555FF, 0xFF55FF55, 0xFF55FFFF, 0xFFFF5555, 0xFFFF55FF, 0xFFFFFF55, 0xFFFFFFFF,
 };
 
 const uint32_t ff_ega_palette[64] = {
-    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA, 0xAA0000, 0xAA00AA, 0xAAAA00, 0xAAAAAA,
-    0x000055, 0x0000FF, 0x00AA55, 0x00AAFF, 0xAA0055, 0xAA00FF, 0xAAAA55, 0xAAAAFF,
-    0x005500, 0x0055AA, 0x00FF00, 0x00FFAA, 0xAA5500, 0xAA55AA, 0xAAFF00, 0xAAFFAA,
-    0x005555, 0x0055FF, 0x00FF55, 0x00FFFF, 0xAA5555, 0xAA55FF, 0xAAFF55, 0xAAFFFF,
-    0x550000, 0x5500AA, 0x55AA00, 0x55AAAA, 0xFF0000, 0xFF00AA, 0xFFAA00, 0xFFAAAA,
-    0x550055, 0x5500FF, 0x55AA55, 0x55AAFF, 0xFF0055, 0xFF00FF, 0xFFAA55, 0xFFAAFF,
-    0x555500, 0x5555AA, 0x55FF00, 0x55FFAA, 0xFF5500, 0xFF55AA, 0xFFFF00, 0xFFFFAA,
-    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF, 0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF
+    0xFF000000, 0xFF0000AA, 0xFF00AA00, 0xFF00AAAA, 0xFFAA0000, 0xFFAA00AA, 0xFFAAAA00, 0xFFAAAAAA,
+    0xFF000055, 0xFF0000FF, 0xFF00AA55, 0xFF00AAFF, 0xFFAA0055, 0xFFAA00FF, 0xFFAAAA55, 0xFFAAAAFF,
+    0xFF005500, 0xFF0055AA, 0xFF00FF00, 0xFF00FFAA, 0xFFAA5500, 0xFFAA55AA, 0xFFAAFF00, 0xFFAAFFAA,
+    0xFF005555, 0xFF0055FF, 0xFF00FF55, 0xFF00FFFF, 0xFFAA5555, 0xFFAA55FF, 0xFFAAFF55, 0xFFAAFFFF,
+    0xFF550000, 0xFF5500AA, 0xFF55AA00, 0xFF55AAAA, 0xFFFF0000, 0xFFFF00AA, 0xFFFFAA00, 0xFFFFAAAA,
+    0xFF550055, 0xFF5500FF, 0xFF55AA55, 0xFF55AAFF, 0xFFFF0055, 0xFFFF00FF, 0xFFFFAA55, 0xFFFFAAFF,
+    0xFF555500, 0xFF5555AA, 0xFF55FF00, 0xFF55FFAA, 0xFFFF5500, 0xFFFF55AA, 0xFFFFFF00, 0xFFFFFFAA,
+    0xFF555555, 0xFF5555FF, 0xFF55FF55, 0xFF55FFFF, 0xFFFF5555, 0xFFFF55FF, 0xFFFFFF55, 0xFFFFFFFF
 };
 
 void ff_draw_pc_font(uint8_t *dst, int linesize, const uint8_t *font, int font_height, int ch, int fg, int bg)
diff --git a/libavcodec/cga_data.h b/libavcodec/cga_data.h
index 2149cfd..3f5281a 100644
--- a/libavcodec/cga_data.h
+++ b/libavcodec/cga_data.h
@@ -1,26 +1,27 @@
 /*
  * CGA/EGA/VGA ROM data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * CGA/EGA/VGA ROM data
+ * @note fonts are in libavutil/xga_font_data.[ch]
  */
 
 #ifndef AVCODEC_CGA_DATA_H
@@ -28,8 +29,6 @@
 
 #include <stdint.h>
 
-extern const uint8_t ff_cga_font[2048];
-extern const uint8_t ff_vga16_font[4096];
 extern const uint32_t ff_cga_palette[16];
 extern const uint32_t ff_ega_palette[64];
 
diff --git a/libavcodec/chomp_bsf.c b/libavcodec/chomp_bsf.c
index 2e76113..3ba45f3 100644
--- a/libavcodec/chomp_bsf.c
+++ b/libavcodec/chomp_bsf.c
@@ -2,20 +2,20 @@
  * Chomp bitstream filter
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,20 +23,16 @@
 #include "bsf.h"
 #include "internal.h"
 
-static int chomp_filter(AVBSFContext *ctx, AVPacket *out)
+static int chomp_filter(AVBSFContext *ctx, AVPacket *pkt)
 {
-    AVPacket *in;
     int ret;
 
-    ret = ff_bsf_get_packet(ctx, &in);
+    ret = ff_bsf_get_packet_ref(ctx, pkt);
     if (ret < 0)
         return ret;
 
-    while (in->size > 0 && !in->data[in->size - 1])
-        in->size--;
-
-    av_packet_move_ref(out, in);
-    av_packet_free(&in);
+    while (pkt->size > 0 && !pkt->data[pkt->size - 1])
+        pkt->size--;
 
     return 0;
 }
diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c
index 611ffe5..9b00774 100644
--- a/libavcodec/cinepak.c
+++ b/libavcodec/cinepak.c
@@ -2,20 +2,20 @@
  * Cinepak Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,9 @@
  *   http://www.csse.monash.edu.au/~timf/
  * @see For more information on the quirky data inside Sega FILM/CPK files, visit:
  *   http://wiki.multimedia.cx/index.php?title=Sega_FILM
+ *
+ * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
+ * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -40,10 +43,7 @@
 #include "internal.h"
 
 
-typedef struct cvid_codebook {
-    uint8_t  y0, y1, y2, y3;
-    uint8_t  u, v;
-} cvid_codebook;
+typedef uint8_t cvid_codebook[12];
 
 #define MAX_STRIPS      32
 
@@ -79,12 +79,14 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
     const uint8_t *eod = (data + size);
     uint32_t flag, mask;
     int      i, n;
+    uint8_t *p;
 
     /* check if this chunk contains 4- or 6-element vectors */
     n    = (chunk_id & 0x04) ? 4 : 6;
     flag = 0;
     mask = 0;
 
+    p = codebook[0];
     for (i=0; i < 256; i++) {
         if ((chunk_id & 0x01) && !(mask >>= 1)) {
             if ((data + 4) > eod)
@@ -96,28 +98,33 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
         }
 
         if (!(chunk_id & 0x01) || (flag & mask)) {
+            int k, kk;
+
             if ((data + n) > eod)
                 break;
 
+            for (k = 0; k < 4; ++k) {
+                int r = *data++;
+                for (kk = 0; kk < 3; ++kk)
+                    *p++ = r;
+            }
             if (n == 6) {
-                codebook[i].y0 = *data++;
-                codebook[i].y1 = *data++;
-                codebook[i].y2 = *data++;
-                codebook[i].y3 = *data++;
-                codebook[i].u  = 128 + *data++;
-                codebook[i].v  = 128 + *data++;
-            } else {
-                /* this codebook type indicates either greyscale or
-                 * palettized video; if palettized, U & V components will
-                 * not be used so it is safe to set them to 128 for the
-                 * benefit of greyscale rendering in YUV420P */
-                codebook[i].y0 = *data++;
-                codebook[i].y1 = *data++;
-                codebook[i].y2 = *data++;
-                codebook[i].y3 = *data++;
-                codebook[i].u  = 128;
-                codebook[i].v  = 128;
+                int r, g, b, u, v;
+                u = *(int8_t *)data++;
+                v = *(int8_t *)data++;
+                p -= 12;
+                for(k=0; k<4; ++k) {
+                    r = *p++ + v*2;
+                    g = *p++ - (u/2) - v;
+                    b = *p   + u*2;
+                    p -= 2;
+                    *p++ = av_clip_uint8(r);
+                    *p++ = av_clip_uint8(g);
+                    *p++ = av_clip_uint8(b);
+                }
             }
+        } else {
+            p += 12;
         }
     }
 }
@@ -127,25 +134,31 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
 {
     const uint8_t   *eod = (data + size);
     uint32_t         flag, mask;
-    cvid_codebook   *codebook;
-    unsigned int     x, y;
-    uint32_t         iy[4];
-    uint32_t         iu[2];
-    uint32_t         iv[2];
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int             x, y;
+    char            *ip0, *ip1, *ip2, *ip3;
 
     flag = 0;
     mask = 0;
 
     for (y=strip->y1; y < strip->y2; y+=4) {
 
-        iy[0] = strip->x1 + (y * s->frame->linesize[0]);
-        iy[1] = iy[0] + s->frame->linesize[0];
-        iy[2] = iy[1] + s->frame->linesize[0];
-        iy[3] = iy[2] + s->frame->linesize[0];
-        iu[0] = (strip->x1/2) + ((y/2) * s->frame->linesize[1]);
-        iu[1] = iu[0] + s->frame->linesize[1];
-        iv[0] = (strip->x1/2) + ((y/2) * s->frame->linesize[2]);
-        iv[1] = iv[0] + s->frame->linesize[2];
+/* take care of y dimension not being multiple of 4, such streams exist */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+/* to get the correct picture for not-multiple-of-4 cases let us fill each
+ * block from the bottom up, thus possibly overwriting the bottommost line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
 
         for (x=strip->x1; x < strip->x2; x+=4) {
             if ((chunk_id & 0x01) && !(mask >>= 1)) {
@@ -168,97 +181,82 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
                 }
 
                 if ((chunk_id & 0x02) || (~flag & mask)) {
+                    uint8_t *p;
                     if (data >= eod)
                         return AVERROR_INVALIDDATA;
 
-                    codebook = &strip->v1_codebook[*data++];
-                    s->frame->data[0][iy[0] + 0] = codebook->y0;
-                    s->frame->data[0][iy[0] + 1] = codebook->y0;
-                    s->frame->data[0][iy[1] + 0] = codebook->y0;
-                    s->frame->data[0][iy[1] + 1] = codebook->y0;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0]] = codebook->u;
-                        s->frame->data[2][iv[0]] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[0] + 2] = codebook->y1;
-                    s->frame->data[0][iy[0] + 3] = codebook->y1;
-                    s->frame->data[0][iy[1] + 2] = codebook->y1;
-                    s->frame->data[0][iy[1] + 3] = codebook->y1;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0] + 1] = codebook->u;
-                        s->frame->data[2][iv[0] + 1] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[2] + 0] = codebook->y2;
-                    s->frame->data[0][iy[2] + 1] = codebook->y2;
-                    s->frame->data[0][iy[3] + 0] = codebook->y2;
-                    s->frame->data[0][iy[3] + 1] = codebook->y2;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1]] = codebook->u;
-                        s->frame->data[2][iv[1]] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[2] + 2] = codebook->y3;
-                    s->frame->data[0][iy[2] + 3] = codebook->y3;
-                    s->frame->data[0][iy[3] + 2] = codebook->y3;
-                    s->frame->data[0][iy[3] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1] + 1] = codebook->u;
-                        s->frame->data[2][iv[1] + 1] = codebook->v;
+                    p = strip->v1_codebook[*data++];
+                    if (s->palette_video) {
+                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
+                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
+                        ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
+                    } else {
+                        p += 6;
+                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
+                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
+                        p += 3; /* ... + 9 */
+                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
+                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
+                        p -= 9; /* ... + 0 */
+                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
+                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
+                        p += 3; /* ... + 3 */
+                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
+                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
                     }
 
                 } else if (flag & mask) {
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[0] + 0] = codebook->y0;
-                    s->frame->data[0][iy[0] + 1] = codebook->y1;
-                    s->frame->data[0][iy[1] + 0] = codebook->y2;
-                    s->frame->data[0][iy[1] + 1] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0]] = codebook->u;
-                        s->frame->data[2][iv[0]] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[0] + 2] = codebook->y0;
-                    s->frame->data[0][iy[0] + 3] = codebook->y1;
-                    s->frame->data[0][iy[1] + 2] = codebook->y2;
-                    s->frame->data[0][iy[1] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0] + 1] = codebook->u;
-                        s->frame->data[2][iv[0] + 1] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[2] + 0] = codebook->y0;
-                    s->frame->data[0][iy[2] + 1] = codebook->y1;
-                    s->frame->data[0][iy[3] + 0] = codebook->y2;
-                    s->frame->data[0][iy[3] + 1] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1]] = codebook->u;
-                        s->frame->data[2][iv[1]] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[2] + 2] = codebook->y0;
-                    s->frame->data[0][iy[2] + 3] = codebook->y1;
-                    s->frame->data[0][iy[3] + 2] = codebook->y2;
-                    s->frame->data[0][iy[3] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1] + 1] = codebook->u;
-                        s->frame->data[2][iv[1] + 1] = codebook->v;
+                    cb0 = strip->v4_codebook[*data++];
+                    cb1 = strip->v4_codebook[*data++];
+                    cb2 = strip->v4_codebook[*data++];
+                    cb3 = strip->v4_codebook[*data++];
+                    if (s->palette_video) {
+                        uint8_t *p;
+                        p = ip3;
+                        *p++ = cb2[6];
+                        *p++ = cb2[9];
+                        *p++ = cb3[6];
+                        *p   = cb3[9];
+                        p = ip2;
+                        *p++ = cb2[0];
+                        *p++ = cb2[3];
+                        *p++ = cb3[0];
+                        *p   = cb3[3];
+                        p = ip1;
+                        *p++ = cb0[6];
+                        *p++ = cb0[9];
+                        *p++ = cb1[6];
+                        *p   = cb1[9];
+                        p = ip0;
+                        *p++ = cb0[0];
+                        *p++ = cb0[3];
+                        *p++ = cb1[0];
+                        *p   = cb1[3];
+                    } else {
+                        memcpy(ip3 + 0, cb2 + 6, 6);
+                        memcpy(ip3 + 6, cb3 + 6, 6);
+                        memcpy(ip2 + 0, cb2 + 0, 6);
+                        memcpy(ip2 + 6, cb3 + 0, 6);
+                        memcpy(ip1 + 0, cb0 + 6, 6);
+                        memcpy(ip1 + 6, cb1 + 6, 6);
+                        memcpy(ip0 + 0, cb0 + 0, 6);
+                        memcpy(ip0 + 6, cb1 + 0, 6);
                     }
 
                 }
             }
 
-            iy[0] += 4;  iy[1] += 4;
-            iy[2] += 4;  iy[3] += 4;
-            iu[0] += 2;  iu[1] += 2;
-            iv[0] += 2;  iv[1] += 2;
+            if (s->palette_video) {
+                ip0 += 4;  ip1 += 4;
+                ip2 += 4;  ip3 += 4;
+            } else {
+                ip0 += 12;  ip1 += 12;
+                ip2 += 12;  ip3 += 12;
+            }
         }
     }
 
@@ -317,17 +315,11 @@ static int cinepak_decode_strip (CinepakContext *s,
     return AVERROR_INVALIDDATA;
 }
 
-static int cinepak_decode (CinepakContext *s)
+static int cinepak_predecode_check (CinepakContext *s)
 {
-    const uint8_t  *eod = (s->data + s->size);
-    int           i, result, strip_size, frame_flags, num_strips;
-    int           y0 = 0;
+    int           num_strips;
     int           encoded_buf_size;
 
-    if (s->size < 10)
-        return AVERROR_INVALIDDATA;
-
-    frame_flags = s->data[0];
     num_strips  = AV_RB16 (&s->data[8]);
     encoded_buf_size = AV_RB24(&s->data[1]);
 
@@ -358,19 +350,42 @@ static int cinepak_decode (CinepakContext *s)
             s->sega_film_skip_bytes = 0;
     }
 
+    if (s->size < 10 + s->sega_film_skip_bytes + num_strips * 12)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+static int cinepak_decode (CinepakContext *s)
+{
+    const uint8_t  *eod = (s->data + s->size);
+    int           i, result, strip_size, frame_flags, num_strips;
+    int           y0 = 0;
+
+    frame_flags = s->data[0];
+    num_strips  = AV_RB16 (&s->data[8]);
+
     s->data += 10 + s->sega_film_skip_bytes;
 
     num_strips = FFMIN(num_strips, MAX_STRIPS);
 
+    s->frame->key_frame = 0;
+
     for (i=0; i < num_strips; i++) {
         if ((s->data + 12) > eod)
             return AVERROR_INVALIDDATA;
 
         s->strips[i].id = s->data[0];
-        s->strips[i].y1 = y0;
-        s->strips[i].x1 = 0;
-        s->strips[i].y2 = y0 + AV_RB16 (&s->data[8]);
-        s->strips[i].x2 = s->avctx->width;
+/* zero y1 means "relative to the previous stripe" */
+        if (!(s->strips[i].y1 = AV_RB16 (&s->data[4])))
+            s->strips[i].y2 = (s->strips[i].y1 = y0) + AV_RB16 (&s->data[8]);
+        else
+            s->strips[i].y2 = AV_RB16 (&s->data[8]);
+        s->strips[i].x1 = AV_RB16 (&s->data[6]);
+        s->strips[i].x2 = AV_RB16 (&s->data[10]);
+
+        if (s->strips[i].id == 0x10)
+            s->frame->key_frame = 1;
 
         strip_size = AV_RB24 (&s->data[1]) - 12;
         if (strip_size < 0)
@@ -403,12 +418,13 @@ static av_cold int cinepak_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     s->width = (avctx->width + 3) & ~3;
     s->height = (avctx->height + 3) & ~3;
+
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
     // check for paletted data
     if (avctx->bits_per_coded_sample != 8) {
         s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
     } else {
         s->palette_video = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -428,24 +444,42 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int ret = 0, buf_size = avpkt->size;
     CinepakContext *s = avctx->priv_data;
+    int num_strips;
 
     s->data = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame))) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if (s->size < 10)
+        return AVERROR_INVALIDDATA;
+
+    num_strips = AV_RB16 (&s->data[8]);
+
+    //Empty frame, do not waste time
+    if (!num_strips && (!s->palette_video || !av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL)))
+        return buf_size;
+
+    if ((ret = cinepak_predecode_check(s)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "cinepak_predecode_check failed\n");
         return ret;
     }
 
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+        return ret;
+
     if (s->palette_video) {
-        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
-        if (pal) {
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
+        if (pal && size == AVPALETTE_SIZE) {
             s->frame->palette_has_changed = 1;
             memcpy(s->pal, pal, AVPALETTE_SIZE);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
     }
 
-    cinepak_decode(s);
+    if ((ret = cinepak_decode(s)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n");
+    }
 
     if (s->palette_video)
         memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE);
diff --git a/libavcodec/cinepakenc.c b/libavcodec/cinepakenc.c
index c323bde..93917fa 100644
--- a/libavcodec/cinepakenc.c
+++ b/libavcodec/cinepakenc.c
@@ -102,6 +102,7 @@ typedef struct strip_info {
 } strip_info;
 
 typedef struct CinepakEncContext {
+    const AVClass *class;
     AVCodecContext *avctx;
     unsigned char *pict_bufs[4], *strip_buf, *frame_buf;
     AVFrame *last_frame;
@@ -133,7 +134,7 @@ static const AVOption options[] = {
     { "max_extra_cb_iterations", "Max extra codebook recalculation passes, more is better and slower",
       OFFSET(max_extra_cb_iterations),  AV_OPT_TYPE_INT, { .i64 =          2 },          0, INT_MAX,                 VE },
     { "skip_empty_cb",           "Avoid wasting bytes, ignore vintage MacOS decoder",
-      OFFSET(skip_empty_cb),            AV_OPT_TYPE_INT, { .i64 =          0 },          0, 1,                       VE },
+      OFFSET(skip_empty_cb),            AV_OPT_TYPE_BOOL, { .i64 =         0 },          0, 1,                       VE },
     { "max_strips",              "Limit strips/frame, vintage compatible is 1..3, otherwise the more the better",
       OFFSET(max_max_strips),           AV_OPT_TYPE_INT, { .i64 =          3 }, MIN_STRIPS, MAX_STRIPS,              VE },
     { "min_strips",              "Enforce min strips/frame, more is worse and faster, must be <= max_strips",
@@ -775,8 +776,8 @@ static int quantize(CinepakEncContext *s, int h, uint8_t *data[4],
     if (i < size)
         size = i;
 
-    ff_init_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
-    ff_do_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
+    avpriv_init_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
+    avpriv_do_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
 
     // set up vq_data, which contains a single MB
     vq_data[0]     = vq_pict_buf;
@@ -1155,7 +1156,7 @@ static int cinepak_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     s->lambda = frame->quality ? frame->quality - 1 : 2 * FF_LAMBDA_SCALE;
 
-    if ((ret = ff_alloc_packet(pkt, s->frame_buf_size)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->frame_buf_size, 0)) < 0)
         return ret;
     ret       = rd_frame(s, frame, (s->curframe == 0), pkt->data, s->frame_buf_size);
     pkt->size = ret;
diff --git a/libavcodec/clearvideo.c b/libavcodec/clearvideo.c
index 7c51439..ad3012f 100644
--- a/libavcodec/clearvideo.c
+++ b/libavcodec/clearvideo.c
@@ -1,21 +1,21 @@
 /*
  * ClearVideo decoder
- * Copyright (c) 2012 Konstantin Shishkov
+ * Copyright (c) 2012-2018 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,122 +25,69 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
-#include "vlc.h"
-
-#define NUM_DC_CODES 127
-#define NUM_AC_CODES 103
-
-static const uint8_t clv_dc_codes[NUM_DC_CODES] = {
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
-    0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x07, 0x0B,
-    0x0C, 0x08, 0x08, 0x09, 0x04, 0x06, 0x07, 0x05,
-    0x04, 0x05, 0x04, 0x06, 0x05, 0x06, 0x07, 0x05,
-    0x06, 0x07, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08,
-    0x09, 0x0A, 0x0B, 0x07, 0x08, 0x09, 0x07, 0x08,
-    0x06, 0x07, 0x08, 0x06, 0x04, 0x05, 0x02, 0x01,
-    0x03, 0x06, 0x07, 0x07, 0x09, 0x0A, 0x0B, 0x09,
-    0x0A, 0x0B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x09,
-    0x0D, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x07,
-    0x08, 0x09, 0x0A, 0x0B, 0x06, 0x07, 0x06, 0x08,
-    0x07, 0x09, 0x0A, 0x0B, 0x09, 0x0A, 0x0B, 0x0C,
-    0x14, 0x0D, 0x0D, 0x0E, 0x0F, 0x15, 0x15, 0x16,
-    0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
-    0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25,
-};
-
-static const uint8_t clv_dc_bits[NUM_DC_CODES] = {
-    22, 22, 22, 22, 22, 22, 22, 22,
-    22, 22, 22, 22, 22, 22, 22, 22,
-    22, 22, 22, 21, 22, 22, 19, 20,
-    20, 19, 18, 18, 15, 17, 17, 16,
-    14, 15, 12, 13, 14, 14, 14, 12,
-    12, 12, 11, 11, 11, 10, 10, 10,
-    10, 10, 10,  9,  9,  9,  8,  8,
-     7,  7,  7,  6,  5,  5,  3,  1,
-     3,  5,  5,  6,  7,  7,  7,  8,
-     8,  8,  9,  9,  9,  9, 10, 11,
-    10, 11, 11, 12, 12, 12, 12, 13,
-    14, 14, 14, 14, 15, 15, 16, 17,
-    16, 17, 18, 18, 19, 19, 19, 19,
-    21, 19, 20, 19, 19, 21, 22, 22,
-    22, 22, 22, 22, 22, 22, 22, 22,
-    22, 22, 22, 22, 22, 22, 22,
-};
-
-static const uint16_t clv_ac_syms[NUM_AC_CODES] = {
-    0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
-    0x0009, 0x000A, 0x000B, 0x000C, 0x0011, 0x0012, 0x0013, 0x0014,
-    0x0015, 0x0016, 0x0021, 0x0022, 0x0023, 0x0024, 0x0031, 0x0032,
-    0x0033, 0x0041, 0x0042, 0x0043, 0x0051, 0x0052, 0x0053, 0x0061,
-    0x0062, 0x0063, 0x0071, 0x0072, 0x0081, 0x0082, 0x0091, 0x0092,
-    0x00A1, 0x00A2, 0x00B1, 0x00C1, 0x00D1, 0x00E1, 0x00F1, 0x0101,
-    0x0111, 0x0121, 0x0131, 0x0141, 0x0151, 0x0161, 0x0171, 0x0181,
-    0x0191, 0x01A1, 0x1001, 0x1002, 0x1003, 0x1011, 0x1012, 0x1021,
-    0x1031, 0x1041, 0x1051, 0x1061, 0x1071, 0x1081, 0x1091, 0x10A1,
-    0x10B1, 0x10C1, 0x10D1, 0x10E1, 0x10F1, 0x1101, 0x1111, 0x1121,
-    0x1131, 0x1141, 0x1151, 0x1161, 0x1171, 0x1181, 0x1191, 0x11A1,
-    0x11B1, 0x11C1, 0x11D1, 0x11E1, 0x11F1, 0x1201, 0x1211, 0x1221,
-    0x1231, 0x1241, 0x1251, 0x1261, 0x1271, 0x1281, 0x1BFF,
-};
-
-static const uint8_t clv_ac_codes[NUM_AC_CODES] = {
-    0x02, 0x0F, 0x15, 0x17, 0x1F, 0x25, 0x24, 0x21,
-    0x20, 0x07, 0x06, 0x20, 0x06, 0x14, 0x1E, 0x0F,
-    0x21, 0x50, 0x0E, 0x1D, 0x0E, 0x51, 0x0D, 0x23,
-    0x0D, 0x0C, 0x22, 0x52, 0x0B, 0x0C, 0x53, 0x13,
-    0x0B, 0x54, 0x12, 0x0A, 0x11, 0x09, 0x10, 0x08,
-    0x16, 0x55, 0x15, 0x14, 0x1C, 0x1B, 0x21, 0x20,
-    0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x22, 0x23,
-    0x56, 0x57, 0x07, 0x19, 0x05, 0x0F, 0x04, 0x0E,
-    0x0D, 0x0C, 0x13, 0x12, 0x11, 0x10, 0x1A, 0x19,
-    0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x18, 0x17,
-    0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x07, 0x06,
-    0x05, 0x04, 0x24, 0x25, 0x26, 0x27, 0x58, 0x59,
-    0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 0x03,
-};
-
-static const uint8_t clv_ac_bits[NUM_AC_CODES] = {
-     2,  4,  6,  7,  8,  9,  9, 10,
-    10, 11, 11, 11,  3,  6,  8, 10,
-    11, 12,  4,  8, 10, 12,  5,  9,
-    10,  5,  9, 12,  5, 10, 12,  6,
-    10, 12,  6, 10,  6, 10,  6, 10,
-     7, 12,  7,  7,  8,  8,  9,  9,
-     9,  9,  9,  9,  9,  9, 11, 11,
-    12, 12,  4,  9, 11,  6, 11,  6,
-     6,  6,  7,  7,  7,  7,  8,  8,
-     8,  8,  8,  8,  8,  8,  9,  9,
-     9,  9,  9,  9,  9,  9, 10, 10,
-    10, 10, 11, 11, 11, 11, 12, 12,
-    12, 12, 12, 12, 12, 12,  7,
-};
+#include "mathops.h"
+#include "clearvideodata.h"
+
+typedef struct LevelCodes {
+    uint16_t    mv_esc;
+    uint16_t    bias_esc;
+    VLC         flags_cb;
+    VLC         mv_cb;
+    VLC         bias_cb;
+} LevelCodes;
+
+typedef struct MV {
+    int16_t x, y;
+} MV;
+
+static const MV zero_mv = { 0 };
+
+typedef struct MVInfo {
+    int mb_w;
+    int mb_h;
+    int mb_size;
+    int mb_stride;
+    int top;
+    MV  *mv;
+} MVInfo;
+
+typedef struct TileInfo {
+    uint16_t        flags;
+    int16_t         bias;
+    MV              mv;
+    struct TileInfo *child[4];
+} TileInfo;
 
 typedef struct CLVContext {
     AVCodecContext *avctx;
     IDCTDSPContext idsp;
     AVFrame        *pic;
-    BitstreamContext bc;
+    AVFrame        *prev;
+    GetBitContext  gb;
     int            mb_width, mb_height;
+    int            pmb_width, pmb_height;
+    MVInfo         mvi;
+    int            tile_size;
+    int            tile_shift;
     VLC            dc_vlc, ac_vlc;
+    LevelCodes     ylev[4], ulev[3], vlev[3];
     int            luma_dc_quant, chroma_dc_quant, ac_quant;
     DECLARE_ALIGNED(16, int16_t, block)[64];
     int            top_dc[3], left_dc[4];
-    int            iframes_warning;
 } CLVContext;
 
 static inline int decode_block(CLVContext *ctx, int16_t *blk, int has_ac,
                                int ac_quant)
 {
-    BitstreamContext *bc = &ctx->bc;
+    GetBitContext *gb = &ctx->gb;
     int idx = 1, last = 0, val, skip;
 
     memset(blk, 0, sizeof(*blk) * 64);
-    blk[0] = bitstream_read_vlc(bc, ctx->dc_vlc.table, 9, 3);
+    blk[0] = get_vlc2(gb, ctx->dc_vlc.table, 9, 3);
     if (blk[0] < 0)
         return AVERROR_INVALIDDATA;
     blk[0] -= 63;
@@ -149,19 +96,19 @@ static inline int decode_block(CLVContext *ctx, int16_t *blk, int has_ac,
         return 0;
 
     while (idx < 64 && !last) {
-        val = bitstream_read_vlc(bc, ctx->ac_vlc.table, 9, 2);
+        val = get_vlc2(gb, ctx->ac_vlc.table, 9, 2);
         if (val < 0)
             return AVERROR_INVALIDDATA;
         if (val != 0x1BFF) {
             last =  val >> 12;
             skip = (val >> 4) & 0xFF;
             val &= 0xF;
-            if (bitstream_read_bit(bc))
+            if (get_bits1(gb))
                 val = -val;
         } else {
-            last = bitstream_read_bit(bc);
-            skip = bitstream_read(bc, 6);
-            val  = bitstream_read_signed(bc, 8);
+            last = get_bits1(gb);
+            skip = get_bits(gb, 6);
+            val  = get_sbits(gb, 8);
         }
         if (val) {
             int aval = FFABS(val), sign = val < 0;
@@ -187,12 +134,12 @@ static inline int decode_block(CLVContext *ctx, int16_t *blk, int has_ac,
     const int t3 = OP(2408 * blk[5 * step] - 1609 * blk[3 * step]);     \
     const int t4 = OP(1108 * blk[2 * step] - 2676 * blk[6 * step]);     \
     const int t5 = OP(2676 * blk[2 * step] + 1108 * blk[6 * step]);     \
-    const int t6 = ((blk[0 * step] + blk[4 * step]) << dshift) + bias;  \
-    const int t7 = ((blk[0 * step] - blk[4 * step]) << dshift) + bias;  \
+    const int t6 = ((blk[0 * step] + blk[4 * step]) * (1 << dshift)) + bias;  \
+    const int t7 = ((blk[0 * step] - blk[4 * step]) * (1 << dshift)) + bias;  \
     const int t8 = t0 + t2;                                             \
     const int t9 = t0 - t2;                                             \
-    const int tA = 181 * (t9 + (t1 - t3)) + 0x80 >> 8;                  \
-    const int tB = 181 * (t9 - (t1 - t3)) + 0x80 >> 8;                  \
+    const int tA = (int)(181U * (t9 + (t1 - t3)) + 0x80) >> 8;          \
+    const int tB = (int)(181U * (t9 - (t1 - t3)) + 0x80) >> 8;          \
     const int tC = t1 + t3;                                             \
                                                                         \
     blk[0 * step] = (t6 + t5 + t8) >> shift;                            \
@@ -230,7 +177,7 @@ static int decode_mb(CLVContext *c, int x, int y)
     int i, has_ac[6], off;
 
     for (i = 0; i < 6; i++)
-        has_ac[i] = bitstream_read_bit(&c->bc);
+        has_ac[i] = get_bits1(&c->gb);
 
     off = x * 16 + y * 16 * c->pic->linesize[0];
     for (i = 0; i < 4; i++) {
@@ -272,6 +219,283 @@ static int decode_mb(CLVContext *c, int x, int y)
     return 0;
 }
 
+static int copy_block(AVCodecContext *avctx, AVFrame *dst, AVFrame *src,
+                      int plane, int x, int y, int dx, int dy, int size)
+{
+    int shift = plane > 0;
+    int sx = x + dx;
+    int sy = y + dy;
+    int sstride, dstride, soff, doff;
+    uint8_t *sbuf, *dbuf;
+    int i;
+
+    if (x < 0 || sx < 0 || y < 0 || sy < 0 ||
+        x + size > avctx->coded_width >> shift ||
+        y + size > avctx->coded_height >> shift ||
+        sx + size > avctx->coded_width >> shift ||
+        sy + size > avctx->coded_height >> shift)
+        return AVERROR_INVALIDDATA;
+
+    sstride = src->linesize[plane];
+    dstride = dst->linesize[plane];
+    soff    = sx + sy * sstride;
+    sbuf    = src->data[plane];
+    doff    = x + y * dstride;
+    dbuf    = dst->data[plane];
+
+    for (i = 0; i < size; i++) {
+        uint8_t *dptr = &dbuf[doff];
+        uint8_t *sptr = &sbuf[soff];
+
+        memcpy(dptr, sptr, size);
+        doff += dstride;
+        soff += sstride;
+    }
+
+    return 0;
+}
+
+static int copyadd_block(AVCodecContext *avctx, AVFrame *dst, AVFrame *src,
+                         int plane, int x, int y, int dx, int dy, int size, int bias)
+{
+    int shift = plane > 0;
+    int sx = x + dx;
+    int sy = y + dy;
+    int sstride   = src->linesize[plane];
+    int dstride   = dst->linesize[plane];
+    int soff      = sx + sy * sstride;
+    uint8_t *sbuf = src->data[plane];
+    int doff      = x + y * dstride;
+    uint8_t *dbuf = dst->data[plane];
+    int i, j;
+
+    if (x < 0 || sx < 0 || y < 0 || sy < 0 ||
+        x + size > avctx->coded_width >> shift ||
+        y + size > avctx->coded_height >> shift ||
+        sx + size > avctx->coded_width >> shift ||
+        sy + size > avctx->coded_height >> shift)
+        return AVERROR_INVALIDDATA;
+
+    for (j = 0; j < size; j++) {
+        uint8_t *dptr = &dbuf[doff];
+        uint8_t *sptr = &sbuf[soff];
+
+        for (i = 0; i < size; i++) {
+            int val = sptr[i] + bias;
+
+            dptr[i] = av_clip_uint8(val);
+        }
+
+        doff += dstride;
+        soff += sstride;
+    }
+
+    return 0;
+}
+
+static MV mvi_predict(MVInfo *mvi, int mb_x, int mb_y, MV diff)
+{
+    MV res, pred_mv;
+    int left_mv, right_mv, top_mv, bot_mv;
+
+    if (mvi->top) {
+        if (mb_x > 0) {
+            pred_mv = mvi->mv[mvi->mb_stride + mb_x - 1];
+        } else {
+            pred_mv = zero_mv;
+        }
+    } else if ((mb_x == 0) || (mb_x == mvi->mb_w - 1)) {
+        pred_mv = mvi->mv[mb_x];
+    } else {
+        MV A = mvi->mv[mvi->mb_stride + mb_x - 1];
+        MV B = mvi->mv[                 mb_x    ];
+        MV C = mvi->mv[                 mb_x + 1];
+        pred_mv.x = mid_pred(A.x, B.x, C.x);
+        pred_mv.y = mid_pred(A.y, B.y, C.y);
+    }
+
+    res = pred_mv;
+
+    left_mv = -((mb_x * mvi->mb_size));
+    right_mv = ((mvi->mb_w - mb_x - 1) * mvi->mb_size);
+    if (res.x < left_mv) {
+        res.x = left_mv;
+    }
+    if (res.x > right_mv) {
+        res.x = right_mv;
+    }
+    top_mv = -((mb_y * mvi->mb_size));
+    bot_mv = ((mvi->mb_h - mb_y - 1) * mvi->mb_size);
+    if (res.y < top_mv) {
+        res.y = top_mv;
+    }
+    if (res.y > bot_mv) {
+        res.y = bot_mv;
+    }
+
+    mvi->mv[mvi->mb_stride + mb_x].x = res.x + diff.x;
+    mvi->mv[mvi->mb_stride + mb_x].y = res.y + diff.y;
+
+    return res;
+}
+
+static void mvi_reset(MVInfo *mvi, int mb_w, int mb_h, int mb_size)
+{
+    mvi->top       = 1;
+    mvi->mb_w      = mb_w;
+    mvi->mb_h      = mb_h;
+    mvi->mb_size   = mb_size;
+    mvi->mb_stride = mb_w;
+    memset(mvi->mv, 0, sizeof(MV) * mvi->mb_stride * 2);
+}
+
+static void mvi_update_row(MVInfo *mvi)
+{
+    int i;
+
+    mvi->top = 0;
+    for (i = 0 ; i < mvi->mb_stride; i++) {
+        mvi->mv[i] = mvi->mv[mvi->mb_stride + i];
+    }
+}
+
+static TileInfo* decode_tile_info(GetBitContext *gb, LevelCodes *lc, int level)
+{
+    TileInfo *ti;
+    int i, flags = 0;
+    int16_t bias = 0;
+    MV mv = { 0 };
+
+    if (lc[level].flags_cb.table) {
+        flags = get_vlc2(gb, lc[level].flags_cb.table, lc[level].flags_cb.bits, 2);
+    }
+
+    if (lc[level].mv_cb.table) {
+        uint16_t mv_code = get_vlc2(gb, lc[level].mv_cb.table, lc[level].mv_cb.bits, 3);
+
+        if (mv_code != lc[level].mv_esc) {
+            mv.x = (int8_t)(mv_code & 0xff);
+            mv.y = (int8_t)(mv_code >> 8);
+        } else {
+            mv.x = get_sbits(gb, 8);
+            mv.y = get_sbits(gb, 8);
+        }
+    }
+
+    if (lc[level].bias_cb.table) {
+        uint16_t bias_val = get_vlc2(gb, lc[level].bias_cb.table, lc[level].bias_cb.bits, 2);
+
+        if (bias_val != lc[level].bias_esc) {
+            bias = (int16_t)(bias_val);
+        } else {
+            bias = get_sbits(gb, 16);
+        }
+    }
+
+    ti = av_calloc(1, sizeof(*ti));
+    if (!ti)
+        return NULL;
+
+    ti->flags = flags;
+    ti->mv = mv;
+    ti->bias = bias;
+
+    if (ti->flags) {
+        for (i = 0; i < 4; i++) {
+            if (ti->flags & (1 << i)) {
+                TileInfo *subti = decode_tile_info(gb, lc, level + 1);
+                ti->child[i] = subti;
+            }
+        }
+    }
+
+    return ti;
+}
+
+static int tile_do_block(AVCodecContext *avctx, AVFrame *dst, AVFrame *src,
+                         int plane, int x, int y, int dx, int dy, int size, int bias)
+{
+    int ret;
+
+    if (!bias) {
+        ret = copy_block(avctx, dst, src, plane, x, y, dx, dy, size);
+    } else {
+        ret = copyadd_block(avctx, dst, src, plane, x, y, dx, dy, size, bias);
+    }
+
+    return ret;
+}
+
+static int restore_tree(AVCodecContext *avctx, AVFrame *dst, AVFrame *src,
+                        int plane, int x, int y, int size,
+                        TileInfo *tile, MV root_mv)
+{
+    int ret;
+    MV mv;
+
+    mv.x = root_mv.x + tile->mv.x;
+    mv.y = root_mv.y + tile->mv.y;
+
+    if (!tile->flags) {
+        ret = tile_do_block(avctx, dst, src, plane, x, y, mv.x, mv.y, size, tile->bias);
+    } else {
+        int i, hsize = size >> 1;
+
+        for (i = 0; i < 4; i++) {
+            int xoff = (i & 2) == 0 ? 0 : hsize;
+            int yoff = (i & 1) == 0 ? 0 : hsize;
+
+            if (tile->child[i]) {
+                ret = restore_tree(avctx, dst, src, plane, x + xoff, y + yoff, hsize, tile->child[i], root_mv);
+                av_freep(&tile->child[i]);
+            } else {
+                ret = tile_do_block(avctx, dst, src, plane, x + xoff, y + yoff, mv.x, mv.y, hsize, tile->bias);
+            }
+        }
+    }
+
+    return ret;
+}
+
+static void extend_edges(AVFrame *buf, int tile_size)
+{
+    int comp, i, j;
+
+    for (comp = 0; comp < 3; comp++) {
+        int shift = comp > 0;
+        int w = buf->width  >> shift;
+        int h = buf->height >> shift;
+        int size = comp == 0 ? tile_size : tile_size >> 1;
+        int stride = buf->linesize[comp];
+        uint8_t *framebuf = buf->data[comp];
+
+        int right  = size - (w & (size - 1));
+        int bottom = size - (h & (size - 1));
+
+        if ((right == size) && (bottom == size)) {
+            return;
+        }
+        if (right != size) {
+            int off = w;
+            for (j = 0; j < h; j++) {
+                for (i = 0; i < right; i++) {
+                    framebuf[off + i] = 0x80;
+                }
+                off += stride;
+            }
+        }
+        if (bottom != size) {
+            int off = h * stride;
+            for (j = 0; j < bottom; j++) {
+                for (i = 0; i < stride; i++) {
+                    framebuf[off + i] = 0x80;
+                }
+                off += stride;
+            }
+        }
+    }
+}
+
 static int clv_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
 {
@@ -281,6 +505,7 @@ static int clv_decode_frame(AVCodecContext *avctx, void *data,
     GetByteContext gb;
     uint32_t frame_type;
     int i, j, ret;
+    int mb_ret = 0;
 
     bytestream2_init(&gb, buf, buf_size);
     if (avctx->codec_tag == MKTAG('C', 'L', 'V', '1')) {
@@ -289,21 +514,29 @@ static int clv_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame_type = bytestream2_get_byte(&gb);
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
-        return ret;
 
-    c->pic->key_frame = frame_type & 0x20 ? 1 : 0;
-    c->pic->pict_type = frame_type & 0x20 ? AV_PICTURE_TYPE_I
-                                          : AV_PICTURE_TYPE_P;
+    if ((frame_type & 0x7f) == 0x30) {
+        *got_frame = 0;
+        return buf_size;
+    } else if (frame_type & 0x2) {
+        if (buf_size < c->mb_width * c->mb_height) {
+            av_log(avctx, AV_LOG_ERROR, "Packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
+            return ret;
+
+        c->pic->key_frame = 1;
+        c->pic->pict_type = AV_PICTURE_TYPE_I;
 
-    if (frame_type & 0x2) {
         bytestream2_get_be32(&gb); // frame size;
         c->ac_quant        = bytestream2_get_byte(&gb);
         c->luma_dc_quant   = 32;
         c->chroma_dc_quant = 32;
 
-        if ((ret = bitstream_init8(&c->bc, buf + bytestream2_tell(&gb),
-                                   buf_size - bytestream2_tell(&gb))) < 0)
+        if ((ret = init_get_bits8(&c->gb, buf + bytestream2_tell(&gb),
+                                  buf_size - bytestream2_tell(&gb))) < 0)
             return ret;
 
         for (i = 0; i < 3; i++)
@@ -313,37 +546,145 @@ static int clv_decode_frame(AVCodecContext *avctx, void *data,
 
         for (j = 0; j < c->mb_height; j++) {
             for (i = 0; i < c->mb_width; i++) {
-                ret |= decode_mb(c, i, j);
+                ret = decode_mb(c, i, j);
+                if (ret < 0)
+                    mb_ret = ret;
             }
         }
+        extend_edges(c->pic, c->tile_size);
     } else {
-        if (!c->iframes_warning)
-            avpriv_report_missing_feature(avctx, "Non-I-frames in Clearvideo");
-        c->iframes_warning = 1;
-        return AVERROR_PATCHWELCOME;
+        int plane;
+
+        if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
+            return ret;
+
+        ret = av_frame_copy(c->pic, c->prev);
+        if (ret < 0)
+            return ret;
+
+        if ((ret = init_get_bits8(&c->gb, buf + bytestream2_tell(&gb),
+                                  buf_size - bytestream2_tell(&gb))) < 0)
+            return ret;
+
+        mvi_reset(&c->mvi, c->pmb_width, c->pmb_height, 1 << c->tile_shift);
+
+        for (j = 0; j < c->pmb_height; j++) {
+            for (i = 0; i < c->pmb_width; i++) {
+                if (get_bits_left(&c->gb) <= 0)
+                    return AVERROR_INVALIDDATA;
+                if (get_bits1(&c->gb)) {
+                    MV mv = mvi_predict(&c->mvi, i, j, zero_mv);
+
+                    for (plane = 0; plane < 3; plane++) {
+                        int16_t x = plane == 0 ? i << c->tile_shift : i << (c->tile_shift - 1);
+                        int16_t y = plane == 0 ? j << c->tile_shift : j << (c->tile_shift - 1);
+                        int16_t size = plane == 0 ? 1 << c->tile_shift : 1 << (c->tile_shift - 1);
+                        int16_t mx = plane == 0 ? mv.x : mv.x / 2;
+                        int16_t my = plane == 0 ? mv.y : mv.y / 2;
+
+                        ret = copy_block(avctx, c->pic, c->prev, plane, x, y, mx, my, size);
+                        if (ret < 0)
+                            mb_ret = ret;
+                    }
+                } else {
+                    int x = i << c->tile_shift;
+                    int y = j << c->tile_shift;
+                    int size = 1 << c->tile_shift;
+                    TileInfo *tile;
+                    MV mv, cmv;
+
+                    tile = decode_tile_info(&c->gb, c->ylev, 0);
+                    if (!tile)
+                        return AVERROR(ENOMEM);
+                    mv = mvi_predict(&c->mvi, i, j, tile->mv);
+                    ret = restore_tree(avctx, c->pic, c->prev, 0, x, y, size, tile, mv);
+                    if (ret < 0)
+                        mb_ret = ret;
+                    x = i << (c->tile_shift - 1);
+                    y = j << (c->tile_shift - 1);
+                    size = 1 << (c->tile_shift - 1);
+                    cmv.x = mv.x + tile->mv.x;
+                    cmv.y = mv.y + tile->mv.y;
+                    cmv.x /= 2;
+                    cmv.y /= 2;
+                    av_freep(&tile);
+                    tile = decode_tile_info(&c->gb, c->ulev, 0);
+                    if (!tile)
+                        return AVERROR(ENOMEM);
+                    ret = restore_tree(avctx, c->pic, c->prev, 1, x, y, size, tile, cmv);
+                    if (ret < 0)
+                        mb_ret = ret;
+                    av_freep(&tile);
+                    tile = decode_tile_info(&c->gb, c->vlev, 0);
+                    if (!tile)
+                        return AVERROR(ENOMEM);
+                    ret = restore_tree(avctx, c->pic, c->prev, 2, x, y, size, tile, cmv);
+                    if (ret < 0)
+                        mb_ret = ret;
+                    av_freep(&tile);
+                }
+            }
+            mvi_update_row(&c->mvi);
+        }
+        extend_edges(c->pic, c->tile_size);
+
+        c->pic->key_frame = 0;
+        c->pic->pict_type = AV_PICTURE_TYPE_P;
     }
 
     if ((ret = av_frame_ref(data, c->pic)) < 0)
         return ret;
 
+    FFSWAP(AVFrame *, c->pic, c->prev);
+
     *got_frame = 1;
 
-    return ret < 0 ? ret : buf_size;
+    if (get_bits_left(&c->gb) < 0)
+        av_log(c->avctx, AV_LOG_WARNING, "overread %d\n", -get_bits_left(&c->gb));
+
+    return mb_ret < 0 ? mb_ret : buf_size;
 }
 
 static av_cold int clv_decode_init(AVCodecContext *avctx)
 {
     CLVContext *const c = avctx->priv_data;
-    int ret;
+    int ret, w, h;
+
+    if (avctx->extradata_size == 110) {
+        c->tile_size = AV_RL32(&avctx->extradata[94]);
+    } else if (avctx->extradata_size == 150) {
+        c->tile_size = AV_RB32(&avctx->extradata[134]);
+    } else if (!avctx->extradata_size) {
+        c->tile_size = 16;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported extradata size: %d\n", avctx->extradata_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    c->tile_shift = av_log2(c->tile_size);
+    if (1 << c->tile_shift != c->tile_size) {
+        av_log(avctx, AV_LOG_ERROR, "Tile size: %d, is not power of 2.\n", c->tile_size);
+        return AVERROR_INVALIDDATA;
+    }
 
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+    w = avctx->width;
+    h = avctx->height;
+    ret = ff_set_dimensions(avctx, FFALIGN(w, 1 << c->tile_shift), FFALIGN(h, 1 << c->tile_shift));
+    if (ret < 0)
+        return ret;
+    avctx->width  = w;
+    avctx->height = h;
 
     c->avctx           = avctx;
     c->mb_width        = FFALIGN(avctx->width,  16) >> 4;
     c->mb_height       = FFALIGN(avctx->height, 16) >> 4;
-    c->iframes_warning = 0;
+    c->pmb_width       = (w + c->tile_size - 1) >> c->tile_shift;
+    c->pmb_height      = (h + c->tile_size - 1) >> c->tile_shift;
     c->pic             = av_frame_alloc();
-    if (!c->pic)
+    c->prev            = av_frame_alloc();
+    c->mvi.mv          = av_calloc(c->pmb_width * 2, sizeof(*c->mvi.mv));
+    if (!c->pic || !c->prev || !c->mvi.mv)
         return AVERROR(ENOMEM);
 
     ff_idctdsp_init(&c->idsp, avctx);
@@ -363,17 +704,198 @@ static av_cold int clv_decode_init(AVCodecContext *avctx)
         return ret;
     }
 
+    ret = init_vlc(&c->ylev[0].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsy_0_bits),
+                   clv_flagsy_0_bits,  1, 1,
+                   clv_flagsy_0_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = init_vlc(&c->ylev[1].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsy_1_bits),
+                   clv_flagsy_1_bits,  1, 1,
+                   clv_flagsy_1_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = init_vlc(&c->ylev[2].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsy_2_bits),
+                   clv_flagsy_2_bits,  1, 1,
+                   clv_flagsy_2_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = init_vlc(&c->ulev[0].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsu_0_bits),
+                   clv_flagsu_0_bits,  1, 1,
+                   clv_flagsu_0_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = init_vlc(&c->ulev[1].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsu_1_bits),
+                   clv_flagsu_1_bits,  1, 1,
+                   clv_flagsu_1_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = init_vlc(&c->vlev[0].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsv_0_bits),
+                   clv_flagsv_0_bits,  1, 1,
+                   clv_flagsv_0_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = init_vlc(&c->vlev[1].flags_cb, 9, FF_ARRAY_ELEMS(clv_flagsv_1_bits),
+                   clv_flagsv_1_bits,  1, 1,
+                   clv_flagsv_1_codes, 2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[0].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvy_0_bits),
+                             clv_mvy_0_bits,  1, 1,
+                             clv_mvy_0_codes, 2, 2,
+                             clv_mvy_0_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[1].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvy_1_bits),
+                             clv_mvy_1_bits,  1, 1,
+                             clv_mvy_1_codes, 2, 2,
+                             clv_mvy_1_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[2].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvy_2_bits),
+                             clv_mvy_2_bits,  1, 1,
+                             clv_mvy_2_codes, 2, 2,
+                             clv_mvy_2_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[3].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvy_3_bits),
+                             clv_mvy_3_bits,  1, 1,
+                             clv_mvy_3_codes, 2, 2,
+                             clv_mvy_3_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ulev[1].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvu_1_bits),
+                             clv_mvu_1_bits,  1, 1,
+                             clv_mvu_1_codes, 2, 2,
+                             clv_mvu_1_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ulev[2].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvu_2_bits),
+                             clv_mvu_2_bits,  1, 1,
+                             clv_mvu_2_codes, 2, 2,
+                             clv_mvu_2_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->vlev[1].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvv_1_bits),
+                             clv_mvv_1_bits,  1, 1,
+                             clv_mvv_1_codes, 2, 2,
+                             clv_mvv_1_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->vlev[2].mv_cb, 9, FF_ARRAY_ELEMS(clv_mvv_2_bits),
+                             clv_mvv_2_bits,  1, 1,
+                             clv_mvv_2_codes, 2, 2,
+                             clv_mvv_2_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[1].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasy_1_bits),
+                             clv_biasy_1_bits,  1, 1,
+                             clv_biasy_1_codes, 2, 2,
+                             clv_biasy_1_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[2].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasy_2_bits),
+                             clv_biasy_2_bits,  1, 1,
+                             clv_biasy_2_codes, 2, 2,
+                             clv_biasy_2_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ylev[3].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasy_3_bits),
+                             clv_biasy_3_bits,  1, 1,
+                             clv_biasy_3_codes, 2, 2,
+                             clv_biasy_3_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ulev[1].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasu_1_bits),
+                             clv_biasu_1_bits,  1, 1,
+                             clv_biasu_1_codes, 2, 2,
+                             clv_biasu_1_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->ulev[2].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasu_2_bits),
+                             clv_biasu_2_bits,  1, 1,
+                             clv_biasu_2_codes, 2, 2,
+                             clv_biasu_2_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->vlev[1].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasv_1_bits),
+                             clv_biasv_1_bits,  1, 1,
+                             clv_biasv_1_codes, 2, 2,
+                             clv_biasv_1_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    ret = ff_init_vlc_sparse(&c->vlev[2].bias_cb, 9, FF_ARRAY_ELEMS(clv_biasv_2_bits),
+                             clv_biasv_2_bits,  1, 1,
+                             clv_biasv_2_codes, 2, 2,
+                             clv_biasv_2_syms,  2, 2, 0);
+    if (ret)
+        return ret;
+
+    c->ylev[0].mv_esc = 0x0909;
+    c->ylev[1].mv_esc = 0x0A0A;
+    c->ylev[2].mv_esc = 0x1010;
+    c->ylev[3].mv_esc = 0x1313;
+    c->ulev[1].mv_esc = 0x0808;
+    c->ulev[2].mv_esc = 0x0B0B;
+    c->vlev[1].mv_esc = 0x0808;
+    c->vlev[2].mv_esc = 0x0B0B;
+
+    c->ylev[1].bias_esc = 0x100;
+    c->ylev[2].bias_esc = 0x100;
+    c->ylev[3].bias_esc = 0x100;
+    c->ulev[1].bias_esc = 0x100;
+    c->ulev[2].bias_esc = 0x100;
+    c->vlev[1].bias_esc = 0x100;
+    c->vlev[2].bias_esc = 0x100;
+
     return 0;
 }
 
 static av_cold int clv_decode_end(AVCodecContext *avctx)
 {
     CLVContext *const c = avctx->priv_data;
+    int i;
 
+    av_frame_free(&c->prev);
     av_frame_free(&c->pic);
 
+    av_freep(&c->mvi.mv);
+
     ff_free_vlc(&c->dc_vlc);
     ff_free_vlc(&c->ac_vlc);
+    for (i = 0; i < 4; i++) {
+        ff_free_vlc(&c->ylev[i].mv_cb);
+        ff_free_vlc(&c->ylev[i].flags_cb);
+        ff_free_vlc(&c->ylev[i].bias_cb);
+    }
+    for (i = 0; i < 3; i++) {
+        ff_free_vlc(&c->ulev[i].mv_cb);
+        ff_free_vlc(&c->ulev[i].flags_cb);
+        ff_free_vlc(&c->ulev[i].bias_cb);
+        ff_free_vlc(&c->vlev[i].mv_cb);
+        ff_free_vlc(&c->vlev[i].flags_cb);
+        ff_free_vlc(&c->vlev[i].bias_cb);
+    }
 
     return 0;
 }
diff --git a/libavcodec/clearvideodata.h b/libavcodec/clearvideodata.h
new file mode 100644
index 0000000..43d12de
--- /dev/null
+++ b/libavcodec/clearvideodata.h
@@ -0,0 +1,1832 @@
+/*
+ * ClearVideo decoder
+ * Copyright (c) 2012-2018 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CLEARVIDEODATA_H
+#define AVCODEC_CLEARVIDEODATA_H
+
+#include "libavutil/common.h"
+
+#define NUM_DC_CODES 127
+#define NUM_AC_CODES 103
+
+static const uint8_t clv_dc_codes[NUM_DC_CODES] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+    0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x07, 0x0B,
+    0x0C, 0x08, 0x08, 0x09, 0x04, 0x06, 0x07, 0x05,
+    0x04, 0x05, 0x04, 0x06, 0x05, 0x06, 0x07, 0x05,
+    0x06, 0x07, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08,
+    0x09, 0x0A, 0x0B, 0x07, 0x08, 0x09, 0x07, 0x08,
+    0x06, 0x07, 0x08, 0x06, 0x04, 0x05, 0x02, 0x01,
+    0x03, 0x06, 0x07, 0x07, 0x09, 0x0A, 0x0B, 0x09,
+    0x0A, 0x0B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x09,
+    0x0D, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x07,
+    0x08, 0x09, 0x0A, 0x0B, 0x06, 0x07, 0x06, 0x08,
+    0x07, 0x09, 0x0A, 0x0B, 0x09, 0x0A, 0x0B, 0x0C,
+    0x14, 0x0D, 0x0D, 0x0E, 0x0F, 0x15, 0x15, 0x16,
+    0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
+    0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25,
+};
+
+static const uint8_t clv_dc_bits[NUM_DC_CODES] = {
+    22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 21, 22, 22, 19, 20,
+    20, 19, 18, 18, 15, 17, 17, 16,
+    14, 15, 12, 13, 14, 14, 14, 12,
+    12, 12, 11, 11, 11, 10, 10, 10,
+    10, 10, 10,  9,  9,  9,  8,  8,
+     7,  7,  7,  6,  5,  5,  3,  1,
+     3,  5,  5,  6,  7,  7,  7,  8,
+     8,  8,  9,  9,  9,  9, 10, 11,
+    10, 11, 11, 12, 12, 12, 12, 13,
+    14, 14, 14, 14, 15, 15, 16, 17,
+    16, 17, 18, 18, 19, 19, 19, 19,
+    21, 19, 20, 19, 19, 21, 22, 22,
+    22, 22, 22, 22, 22, 22, 22, 22,
+    22, 22, 22, 22, 22, 22, 22,
+};
+
+static const uint16_t clv_ac_syms[NUM_AC_CODES] = {
+    0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
+    0x0009, 0x000A, 0x000B, 0x000C, 0x0011, 0x0012, 0x0013, 0x0014,
+    0x0015, 0x0016, 0x0021, 0x0022, 0x0023, 0x0024, 0x0031, 0x0032,
+    0x0033, 0x0041, 0x0042, 0x0043, 0x0051, 0x0052, 0x0053, 0x0061,
+    0x0062, 0x0063, 0x0071, 0x0072, 0x0081, 0x0082, 0x0091, 0x0092,
+    0x00A1, 0x00A2, 0x00B1, 0x00C1, 0x00D1, 0x00E1, 0x00F1, 0x0101,
+    0x0111, 0x0121, 0x0131, 0x0141, 0x0151, 0x0161, 0x0171, 0x0181,
+    0x0191, 0x01A1, 0x1001, 0x1002, 0x1003, 0x1011, 0x1012, 0x1021,
+    0x1031, 0x1041, 0x1051, 0x1061, 0x1071, 0x1081, 0x1091, 0x10A1,
+    0x10B1, 0x10C1, 0x10D1, 0x10E1, 0x10F1, 0x1101, 0x1111, 0x1121,
+    0x1131, 0x1141, 0x1151, 0x1161, 0x1171, 0x1181, 0x1191, 0x11A1,
+    0x11B1, 0x11C1, 0x11D1, 0x11E1, 0x11F1, 0x1201, 0x1211, 0x1221,
+    0x1231, 0x1241, 0x1251, 0x1261, 0x1271, 0x1281, 0x1BFF,
+};
+
+static const uint8_t clv_ac_codes[NUM_AC_CODES] = {
+    0x02, 0x0F, 0x15, 0x17, 0x1F, 0x25, 0x24, 0x21,
+    0x20, 0x07, 0x06, 0x20, 0x06, 0x14, 0x1E, 0x0F,
+    0x21, 0x50, 0x0E, 0x1D, 0x0E, 0x51, 0x0D, 0x23,
+    0x0D, 0x0C, 0x22, 0x52, 0x0B, 0x0C, 0x53, 0x13,
+    0x0B, 0x54, 0x12, 0x0A, 0x11, 0x09, 0x10, 0x08,
+    0x16, 0x55, 0x15, 0x14, 0x1C, 0x1B, 0x21, 0x20,
+    0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x22, 0x23,
+    0x56, 0x57, 0x07, 0x19, 0x05, 0x0F, 0x04, 0x0E,
+    0x0D, 0x0C, 0x13, 0x12, 0x11, 0x10, 0x1A, 0x19,
+    0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x18, 0x17,
+    0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x07, 0x06,
+    0x05, 0x04, 0x24, 0x25, 0x26, 0x27, 0x58, 0x59,
+    0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 0x03,
+};
+
+static const uint8_t clv_ac_bits[NUM_AC_CODES] = {
+     2,  4,  6,  7,  8,  9,  9, 10,
+    10, 11, 11, 11,  3,  6,  8, 10,
+    11, 12,  4,  8, 10, 12,  5,  9,
+    10,  5,  9, 12,  5, 10, 12,  6,
+    10, 12,  6, 10,  6, 10,  6, 10,
+     7, 12,  7,  7,  8,  8,  9,  9,
+     9,  9,  9,  9,  9,  9, 11, 11,
+    12, 12,  4,  9, 11,  6, 11,  6,
+     6,  6,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  8,  8,  8,  9,  9,
+     9,  9,  9,  9,  9,  9, 10, 10,
+    10, 10, 11, 11, 11, 11, 12, 12,
+    12, 12, 12, 12, 12, 12,  7,
+};
+
+static const uint8_t clv_flagsy_0_bits[] = {
+     3,  4,  4,  4,  4,  4,  6,  5,  4,  7,  4,  5,  4,  7,  5,  2,
+};
+
+static const uint16_t clv_flagsy_0_codes[] = {
+    0x0002, 0x0009, 0x000B, 0x0006, 0x000C, 0x0007, 0x003E, 0x001C,
+    0x000D, 0x007E, 0x000A, 0x001D, 0x0008, 0x007F, 0x001E, 0x0000,
+};
+
+static const uint8_t clv_flagsy_1_bits[] = {
+     2,  4,  4,  3,  4,  4,  7,  6,  4,  6,  4,  6,  4,  8,  8,  3,
+};
+
+static const uint16_t clv_flagsy_1_codes[] = {
+    0x0000, 0x000A, 0x000C, 0x0003, 0x000B, 0x0009, 0x007E, 0x003D,
+    0x000D, 0x003E, 0x000E, 0x003C, 0x0008, 0x00FE, 0x00FF, 0x0002,
+};
+
+static const uint8_t clv_flagsy_2_bits[] = {
+     1,  4,  4,  4,  4,  5,  7,  5,  4,  6,  5,  8,  4,  9, 10, 10,
+};
+
+static const uint16_t clv_flagsy_2_codes[] = {
+    0x0000, 0x000C, 0x000B, 0x0008, 0x000A, 0x001C, 0x007E, 0x001D,
+    0x000D, 0x003E, 0x001E, 0x00FE, 0x0009, 0x01FE, 0x03FE, 0x03FF,
+};
+
+static const uint8_t clv_flagsu_0_bits[] = {
+     1,  4,  4,  4,  5,  5,  9,  7,  5,  9,  4,  7,  4,  8,  7,  4,
+};
+
+static const uint16_t clv_flagsu_0_codes[] = {
+    0x0000, 0x000B, 0x000D, 0x0009, 0x001D, 0x001C, 0x01FF, 0x007D,
+    0x001E, 0x01FE, 0x000C, 0x007C, 0x000A, 0x00FE, 0x007E, 0x0008,
+};
+
+static const uint8_t clv_flagsu_1_bits[] = {
+     1,  4,  4,  4,  4,  4,  8,  6,  4,  8,  5,  8,  4, 10,  9, 10,
+};
+
+static const uint16_t clv_flagsu_1_codes[] = {
+    0x0000, 0x000C, 0x0008, 0x000A, 0x000B, 0x000E, 0x00FD, 0x003E,
+    0x000D, 0x00FC, 0x001E, 0x00FE, 0x0009, 0x03FE, 0x01FE, 0x03FF,
+};
+
+static const uint8_t clv_flagsv_0_bits[] = {
+     1,  4,  5,  4,  5,  5,  8, 10,  5,  9,  5,  6,  4, 10,  7,  3,
+};
+
+static const uint16_t clv_flagsv_0_codes[] = {
+    0x0000, 0x000A, 0x001B, 0x000C, 0x001E, 0x001C, 0x00FE, 0x03FE,
+    0x001D, 0x01FE, 0x001A, 0x003E, 0x000B, 0x03FF, 0x007E, 0x0004,
+};
+
+static const uint8_t clv_flagsv_1_bits[] = {
+     1,  4,  4,  4,  4,  5,  8,  6,  3,  7,  5, 10,  5, 11,  9, 11,
+};
+
+static const uint16_t clv_flagsv_1_codes[] = {
+    0x0000, 0x000D, 0x000C, 0x000A, 0x000B, 0x001D, 0x00FE, 0x003E,
+    0x0004, 0x007E, 0x001E, 0x03FE, 0x001C, 0x07FE, 0x01FE, 0x07FF,
+};
+
+static const uint8_t clv_mvy_0_bits[] = {
+    16, 14, 13, 13, 13, 12, 11, 11,  9, 11, 11, 12, 13, 13, 13, 14,
+    16, 15, 14, 14, 14, 13, 13, 12, 10,  7, 10, 12, 13, 13, 14, 14,
+    14, 15, 15, 14, 14, 14, 13, 13, 11, 10,  7, 10, 11, 13, 13, 14,
+    14, 14, 15, 15, 14, 14, 13, 13, 12, 11, 10,  7, 10, 11, 12, 13,
+    13, 14, 14, 15, 16, 15, 14, 12, 12, 12, 11, 10,  6, 10, 11, 12,
+    12, 12, 14, 15, 16, 15, 14, 13, 13, 12, 11, 10,  9,  6,  9, 10,
+    11, 12, 13, 13, 14, 15, 14, 14, 13, 12, 12, 11, 10,  8,  6,  8,
+    10, 11, 12, 12, 13, 14, 14, 14, 13, 13, 13, 11, 11,  9,  7,  4,
+     7,  9, 11, 11, 12, 13, 13, 14, 11, 10, 10,  9,  9,  8,  7,  5,
+     1,  5,  7,  8,  9,  9, 10, 10, 11, 14, 13, 13, 12, 11, 11,  9,
+     7,  4,  7,  9, 11, 11, 13, 13, 13, 14, 14, 14, 13, 12, 12, 11,
+    10,  8,  6,  8, 10, 11, 12, 12, 13, 14, 14, 15, 14, 13, 13, 12,
+    11, 10,  9,  7,  9, 10, 11, 12, 13, 13, 14, 15, 16, 15, 14, 12,
+    12, 12, 11, 10,  6, 10, 11, 12, 12, 12, 14, 15, 16, 15, 14, 14,
+    13, 13, 12, 11, 10,  7, 10, 11, 12, 13, 13, 14, 14, 15, 15, 14,
+    14, 14, 13, 13, 11, 10,  7, 10, 11, 13, 13, 14, 14, 14, 15, 15,
+    14, 14, 14, 13, 13, 12, 10,  7, 10, 12, 13, 13, 14, 14, 14, 15,
+    16, 14, 13, 13, 13, 12, 11, 11,  9, 11, 11, 12, 13, 13, 13, 14,
+    16,  6,
+};
+
+static const uint16_t clv_mvy_0_codes[] = {
+    0xFFFD, 0x3FE5, 0x1FD8, 0x1FC4, 0x1FBC, 0x0FCB, 0x07CF, 0x07C4,
+    0x01D7, 0x07C6, 0x07CE, 0x0FCA, 0x1FBD, 0x1FC2, 0x1FD9, 0x3FE4,
+    0xFFFE, 0x7FF0, 0x3FEF, 0x3FD2, 0x3FC9, 0x1FCC, 0x1FC0, 0x0FB6,
+    0x03D6, 0x0070, 0x03D7, 0x0FB7, 0x1FC1, 0x1FCD, 0x3FCB, 0x3FD0,
+    0x3FED, 0x7FF2, 0x7FFB, 0x3FDC, 0x3FD9, 0x3FD4, 0x1FB6, 0x1FAE,
+    0x07C0, 0x03BC, 0x006D, 0x03BD, 0x07C1, 0x1FAF, 0x1FB7, 0x3FD1,
+    0x3FDB, 0x3FDF, 0x7FF9, 0x7FEE, 0x3FF0, 0x3FC7, 0x1FC9, 0x1FA7,
+    0x0FAD, 0x07D2, 0x03CE, 0x006C, 0x03CF, 0x07D0, 0x0FAF, 0x1FA6,
+    0x1FC6, 0x3FC4, 0x3FF1, 0x7FED, 0xFFFB, 0x7FF6, 0x3FE6, 0x0FCC,
+    0x0FC4, 0x0FB0, 0x07B0, 0x03C6, 0x0031, 0x03C7, 0x07B1, 0x0FB1,
+    0x0FC5, 0x0FCD, 0x3FEA, 0x7FF7, 0xFFF9, 0x7FE9, 0x3FCE, 0x1FCF,
+    0x1FB2, 0x0FB8, 0x07BC, 0x03D0, 0x01DA, 0x002F, 0x01DB, 0x03D1,
+    0x07BE, 0x0FBA, 0x1FB4, 0x1FD0, 0x3FCD, 0x7FEB, 0x3FE1, 0x3FC1,
+    0x1FD3, 0x0FC3, 0x0FBE, 0x07B6, 0x03C4, 0x00E4, 0x002D, 0x00E5,
+    0x03C5, 0x07B7, 0x0FBF, 0x0FC1, 0x1FD2, 0x3FC3, 0x3FE2, 0x3FBF,
+    0x1FDB, 0x1FAD, 0x1FA5, 0x07CB, 0x07BB, 0x01D5, 0x0068, 0x0008,
+    0x0065, 0x01D2, 0x07B8, 0x07C8, 0x0FD0, 0x1FAA, 0x1FDA, 0x3FBC,
+    0x07D4, 0x03CA, 0x03C0, 0x01D8, 0x01D0, 0x00E6, 0x0069, 0x0014,
+    0x0000, 0x0015, 0x006A, 0x00E7, 0x01D1, 0x01D9, 0x03C1, 0x03CB,
+    0x07D5, 0x3FBE, 0x1FDC, 0x1FAB, 0x0FD1, 0x07C9, 0x07B9, 0x01D3,
+    0x0066, 0x0009, 0x0067, 0x01D4, 0x07BA, 0x07CA, 0x1FA4, 0x1FAC,
+    0x1FDD, 0x3FBD, 0x3FE0, 0x3FC0, 0x1FD5, 0x0FC0, 0x0FBC, 0x07B4,
+    0x03C2, 0x00E2, 0x002C, 0x00E3, 0x03C3, 0x07B5, 0x0FBD, 0x0FC2,
+    0x1FD7, 0x3FC2, 0x3FE3, 0x7FEA, 0x3FCC, 0x1FCE, 0x1FB3, 0x0FB9,
+    0x07BD, 0x03D2, 0x01DC, 0x0064, 0x01DD, 0x03D3, 0x07BF, 0x0FBB,
+    0x1FB5, 0x1FD1, 0x3FCF, 0x7FE8, 0xFFFA, 0x7FF4, 0x3FEB, 0x0FCE,
+    0x0FC6, 0x0FB2, 0x07B2, 0x03C8, 0x0030, 0x03C9, 0x07B3, 0x0FB3,
+    0x0FC7, 0x0FCF, 0x3FE9, 0x7FF5, 0xFFF8, 0x7FF3, 0x3FF3, 0x3FC6,
+    0x1FC8, 0x1FA8, 0x0FAC, 0x07D1, 0x03CC, 0x006B, 0x03CD, 0x07D3,
+    0x0FAE, 0x1FA9, 0x1FC7, 0x3FC5, 0x3FF2, 0x7FEC, 0x7FFA, 0x3FDE,
+    0x3FDA, 0x3FD7, 0x1FB9, 0x1FB0, 0x07C2, 0x03BE, 0x006E, 0x03BF,
+    0x07C3, 0x1FB1, 0x1FB8, 0x3FD3, 0x3FD8, 0x3FDD, 0x7FF8, 0x7FEF,
+    0x3FEE, 0x3FD6, 0x3FC8, 0x1FCB, 0x1FBE, 0x0FB5, 0x03D4, 0x006F,
+    0x03D5, 0x0FB4, 0x1FBF, 0x1FCA, 0x3FCA, 0x3FD5, 0x3FEC, 0x7FF1,
+    0xFFFF, 0x3FE8, 0x1FD4, 0x1FC5, 0x1FBA, 0x0FC9, 0x07CD, 0x07C7,
+    0x01D6, 0x07C5, 0x07CC, 0x0FC8, 0x1FBB, 0x1FC3, 0x1FD6, 0x3FE7,
+    0xFFFC, 0x002E,
+};
+
+static const uint16_t clv_mvy_0_syms[] = {
+    0xF8F8, 0xF9F8, 0xFAF8, 0xFBF8, 0xFCF8, 0xFDF8, 0xFEF8, 0xFFF8,
+    0x00F8, 0x01F8, 0x02F8, 0x03F8, 0x04F8, 0x05F8, 0x06F8, 0x07F8,
+    0x08F8, 0xF8F9, 0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9,
+    0xFFF9, 0x00F9, 0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9,
+    0x07F9, 0x08F9, 0xF8FA, 0xF9FA, 0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA,
+    0xFEFA, 0xFFFA, 0x00FA, 0x01FA, 0x02FA, 0x03FA, 0x04FA, 0x05FA,
+    0x06FA, 0x07FA, 0x08FA, 0xF8FB, 0xF9FB, 0xFAFB, 0xFBFB, 0xFCFB,
+    0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB, 0x03FB, 0x04FB,
+    0x05FB, 0x06FB, 0x07FB, 0x08FB, 0xF8FC, 0xF9FC, 0xFAFC, 0xFBFC,
+    0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC, 0x00FC, 0x01FC, 0x02FC, 0x03FC,
+    0x04FC, 0x05FC, 0x06FC, 0x07FC, 0x08FC, 0xF8FD, 0xF9FD, 0xFAFD,
+    0xFBFD, 0xFCFD, 0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD,
+    0x03FD, 0x04FD, 0x05FD, 0x06FD, 0x07FD, 0x08FD, 0xF8FE, 0xF9FE,
+    0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE, 0xFEFE, 0xFFFE, 0x00FE, 0x01FE,
+    0x02FE, 0x03FE, 0x04FE, 0x05FE, 0x06FE, 0x07FE, 0x08FE, 0xF8FF,
+    0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF, 0xFFFF, 0x00FF,
+    0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF, 0x07FF, 0x08FF,
+    0xF800, 0xF900, 0xFA00, 0xFB00, 0xFC00, 0xFD00, 0xFE00, 0xFF00,
+    0x0000, 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700,
+    0x0800, 0xF801, 0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01,
+    0xFF01, 0x0001, 0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601,
+    0x0701, 0x0801, 0xF802, 0xF902, 0xFA02, 0xFB02, 0xFC02, 0xFD02,
+    0xFE02, 0xFF02, 0x0002, 0x0102, 0x0202, 0x0302, 0x0402, 0x0502,
+    0x0602, 0x0702, 0x0802, 0xF803, 0xF903, 0xFA03, 0xFB03, 0xFC03,
+    0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203, 0x0303, 0x0403,
+    0x0503, 0x0603, 0x0703, 0x0803, 0xF804, 0xF904, 0xFA04, 0xFB04,
+    0xFC04, 0xFD04, 0xFE04, 0xFF04, 0x0004, 0x0104, 0x0204, 0x0304,
+    0x0404, 0x0504, 0x0604, 0x0704, 0x0804, 0xF805, 0xF905, 0xFA05,
+    0xFB05, 0xFC05, 0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205,
+    0x0305, 0x0405, 0x0505, 0x0605, 0x0705, 0x0805, 0xF806, 0xF906,
+    0xFA06, 0xFB06, 0xFC06, 0xFD06, 0xFE06, 0xFF06, 0x0006, 0x0106,
+    0x0206, 0x0306, 0x0406, 0x0506, 0x0606, 0x0706, 0x0806, 0xF807,
+    0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07, 0xFF07, 0x0007,
+    0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607, 0x0707, 0x0807,
+    0xF808, 0xF908, 0xFA08, 0xFB08, 0xFC08, 0xFD08, 0xFE08, 0xFF08,
+    0x0008, 0x0108, 0x0208, 0x0308, 0x0408, 0x0508, 0x0608, 0x0708,
+    0x0808, 0x0909,
+};
+
+static const uint8_t clv_mvy_1_bits[] = {
+    15, 15, 15, 15, 14, 14, 13, 13, 11,  9, 11, 13, 13, 14, 14, 15,
+    15, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 10,  9, 10, 12, 12,
+    12, 13, 13, 14, 14, 15, 15, 15, 14, 14, 13, 13, 13, 12, 11,  8,
+    11, 12, 13, 13, 13, 14, 14, 15, 15, 14, 14, 14, 14, 13, 12, 12,
+    12, 10,  8, 10, 12, 12, 12, 13, 14, 14, 14, 14, 15, 14, 14, 13,
+    13, 12, 12, 11, 10,  8, 10, 11, 12, 12, 13, 13, 14, 14, 15, 14,
+    14, 13, 13, 13, 12, 12, 11,  9,  7,  9, 11, 12, 12, 13, 13, 13,
+    14, 14, 14, 14, 13, 13, 13, 12, 11, 10,  9,  7,  9, 10, 11, 12,
+    13, 13, 13, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10,  8,  7,  8,
+    10, 11, 11, 12, 12, 13, 13, 14, 13, 13, 13, 12, 11, 11, 10,  9,
+     6,  4,  6,  9, 10, 11, 12, 12, 13, 13, 13, 12, 11, 10, 10, 10,
+     9,  9,  7,  5,  1,  5,  7,  9,  9, 10, 10, 10, 11, 12, 13, 13,
+    13, 12, 11, 11, 10,  9,  6,  4,  6,  9, 10, 11, 11, 12, 13, 13,
+    13, 14, 13, 13, 12, 12, 11, 11, 10,  8,  7,  8, 10, 11, 11, 12,
+    12, 13, 13, 14, 14, 14, 13, 13, 13, 12, 11, 10,  9,  7,  9, 10,
+    11, 12, 13, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11,  9,
+     7,  9, 11, 12, 12, 13, 13, 13, 14, 14, 15, 14, 14, 13, 13, 12,
+    12, 11, 10,  8, 10, 11, 12, 12, 13, 13, 14, 14, 15, 14, 14, 14,
+    14, 13, 12, 12, 12, 10,  8, 10, 12, 12, 12, 13, 14, 14, 14, 14,
+    15, 15, 14, 14, 13, 13, 13, 12, 11,  8, 11, 12, 13, 13, 13, 14,
+    14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 10,  9, 10, 12, 12,
+    12, 13, 13, 14, 14, 15, 15, 15, 15, 15, 14, 14, 13, 13, 11,  9,
+    11, 13, 13, 14, 14, 15, 15, 15, 15,  5,
+};
+
+static const uint16_t clv_mvy_1_codes[] = {
+    0x7FF9, 0x7FF6, 0x7FEB, 0x7FE3, 0x3FCF, 0x3FB3, 0x1FBD, 0x1FA1,
+    0x07AD, 0x01CE, 0x07AF, 0x1FA0, 0x1FBB, 0x3FB0, 0x3FCC, 0x7FE2,
+    0x7FE9, 0x7FF4, 0x7FFB, 0x7FF1, 0x3FE7, 0x3FBD, 0x1FA5, 0x1F9B,
+    0x0FB4, 0x0FAF, 0x0FAA, 0x03CC, 0x01CD, 0x03CD, 0x0FAB, 0x0FAD,
+    0x0FB1, 0x1F9C, 0x1FA3, 0x3FBE, 0x3FE6, 0x7FF0, 0x7FFC, 0x7FE5,
+    0x3FB5, 0x3FAE, 0x1FB4, 0x1FAA, 0x1F97, 0x0F85, 0x07A2, 0x00DD,
+    0x07A3, 0x0F86, 0x1F99, 0x1FAD, 0x1FB2, 0x3FAC, 0x3FB7, 0x7FE4,
+    0x7FFD, 0x3FEA, 0x3FD8, 0x3FC3, 0x3FBB, 0x1FC9, 0x0FBC, 0x0F97,
+    0x0F8F, 0x03B8, 0x00DA, 0x03B9, 0x0F90, 0x0F98, 0x0FB9, 0x1FC6,
+    0x3FBA, 0x3FC0, 0x3FD9, 0x3FEB, 0x7FEF, 0x3FEE, 0x3FD7, 0x1FC3,
+    0x1F96, 0x0FC0, 0x0FA8, 0x07AA, 0x03BE, 0x00D9, 0x03BF, 0x07AB,
+    0x0FA7, 0x0FBF, 0x1F98, 0x1FC5, 0x3FD6, 0x3FEF, 0x7FEE, 0x3FDC,
+    0x3FCA, 0x1FBF, 0x1F8B, 0x1F87, 0x0FA2, 0x0F94, 0x07A5, 0x01D4,
+    0x0069, 0x01D5, 0x07A6, 0x0F95, 0x0FA3, 0x1F89, 0x1F8D, 0x1FC0,
+    0x3FC6, 0x3FDE, 0x3FE0, 0x3FD3, 0x1FB8, 0x1F8F, 0x1F84, 0x0F89,
+    0x07BC, 0x03C6, 0x01C6, 0x0067, 0x01C7, 0x03C7, 0x07BD, 0x0F87,
+    0x1F82, 0x1F8A, 0x1FB6, 0x3FD1, 0x3FE2, 0x3FC5, 0x1FCE, 0x1FAE,
+    0x0FB5, 0x0F8B, 0x07B4, 0x07B0, 0x03B4, 0x00DE, 0x0064, 0x00DF,
+    0x03B5, 0x07B1, 0x07B5, 0x0F8C, 0x0FB6, 0x1FAF, 0x1FD1, 0x3FCB,
+    0x1FD3, 0x1FCC, 0x1FA7, 0x0F9B, 0x07BE, 0x079C, 0x03C0, 0x01C8,
+    0x002E, 0x0008, 0x002F, 0x01C9, 0x03C1, 0x079D, 0x0F82, 0x0F9A,
+    0x1FA9, 0x1FCA, 0x1FD4, 0x0F9F, 0x07B6, 0x03C8, 0x03B2, 0x03B0,
+    0x01D6, 0x01D0, 0x006A, 0x0014, 0x0000, 0x0015, 0x006B, 0x01D1,
+    0x01D7, 0x03B1, 0x03B3, 0x03C9, 0x07B7, 0x0FA0, 0x1FD5, 0x1FCB,
+    0x1FAB, 0x0F9C, 0x07BF, 0x079E, 0x03C2, 0x01CA, 0x0030, 0x0009,
+    0x0031, 0x01CB, 0x03C3, 0x079F, 0x07C0, 0x0F9D, 0x1FAC, 0x1FCD,
+    0x1FD2, 0x3FC8, 0x1FD0, 0x1FB0, 0x0FB7, 0x0F8D, 0x07B8, 0x07B2,
+    0x03B6, 0x00E0, 0x0065, 0x00E1, 0x03B7, 0x07B3, 0x07B9, 0x0F8E,
+    0x0FB8, 0x1FB1, 0x1FCF, 0x3FC9, 0x3FE1, 0x3FD2, 0x1FB7, 0x1F8E,
+    0x1F83, 0x0F88, 0x07BA, 0x03C4, 0x01C4, 0x0066, 0x01C5, 0x03C5,
+    0x07BB, 0x0F8A, 0x1F85, 0x1F90, 0x1FB9, 0x3FD0, 0x3FE3, 0x3FDD,
+    0x3FC7, 0x1FC1, 0x1F91, 0x1F88, 0x0FA4, 0x0F96, 0x07A7, 0x01D2,
+    0x0068, 0x01D3, 0x07A4, 0x0F93, 0x0FA1, 0x1F86, 0x1F8C, 0x1FBE,
+    0x3FC4, 0x3FDF, 0x7FED, 0x3FEC, 0x3FD4, 0x1FC4, 0x1F92, 0x0FBD,
+    0x0FA5, 0x07A8, 0x03BC, 0x00D8, 0x03BD, 0x07A9, 0x0FA6, 0x0FBE,
+    0x1F93, 0x1FC2, 0x3FD5, 0x3FED, 0x7FEC, 0x3FE8, 0x3FDB, 0x3FC1,
+    0x3FB9, 0x1FC7, 0x0FBA, 0x0F9E, 0x0F91, 0x03BA, 0x00DB, 0x03BB,
+    0x0F92, 0x0F99, 0x0FBB, 0x1FC8, 0x3FB8, 0x3FC2, 0x3FDA, 0x3FE9,
+    0x7FFF, 0x7FE7, 0x3FB6, 0x3FAF, 0x1FB3, 0x1FA6, 0x1F94, 0x0F83,
+    0x07A0, 0x00DC, 0x07A1, 0x0F84, 0x1F95, 0x1FA8, 0x1FB5, 0x3FAD,
+    0x3FB4, 0x7FE6, 0x7FFE, 0x7FF3, 0x3FE5, 0x3FBC, 0x1FA4, 0x1F9D,
+    0x0FB2, 0x0FAE, 0x0FA9, 0x03CA, 0x01CC, 0x03CB, 0x0FAC, 0x0FB0,
+    0x0FB3, 0x1F9A, 0x1FA2, 0x3FBF, 0x3FE4, 0x7FF2, 0x7FF8, 0x7FF5,
+    0x7FEA, 0x7FE0, 0x3FCD, 0x3FB1, 0x1FBA, 0x1F9F, 0x07AE, 0x01CF,
+    0x07AC, 0x1F9E, 0x1FBC, 0x3FB2, 0x3FCE, 0x7FE1, 0x7FE8, 0x7FF7,
+    0x7FFA, 0x0016,
+};
+
+static const uint16_t clv_mvy_1_syms[] = {
+    0xF7F7, 0xF8F7, 0xF9F7, 0xFAF7, 0xFBF7, 0xFCF7, 0xFDF7, 0xFEF7,
+    0xFFF7, 0x00F7, 0x01F7, 0x02F7, 0x03F7, 0x04F7, 0x05F7, 0x06F7,
+    0x07F7, 0x08F7, 0x09F7, 0xF7F8, 0xF8F8, 0xF9F8, 0xFAF8, 0xFBF8,
+    0xFCF8, 0xFDF8, 0xFEF8, 0xFFF8, 0x00F8, 0x01F8, 0x02F8, 0x03F8,
+    0x04F8, 0x05F8, 0x06F8, 0x07F8, 0x08F8, 0x09F8, 0xF7F9, 0xF8F9,
+    0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9, 0xFFF9, 0x00F9,
+    0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9, 0x07F9, 0x08F9,
+    0x09F9, 0xF7FA, 0xF8FA, 0xF9FA, 0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA,
+    0xFEFA, 0xFFFA, 0x00FA, 0x01FA, 0x02FA, 0x03FA, 0x04FA, 0x05FA,
+    0x06FA, 0x07FA, 0x08FA, 0x09FA, 0xF7FB, 0xF8FB, 0xF9FB, 0xFAFB,
+    0xFBFB, 0xFCFB, 0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB,
+    0x03FB, 0x04FB, 0x05FB, 0x06FB, 0x07FB, 0x08FB, 0x09FB, 0xF7FC,
+    0xF8FC, 0xF9FC, 0xFAFC, 0xFBFC, 0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC,
+    0x00FC, 0x01FC, 0x02FC, 0x03FC, 0x04FC, 0x05FC, 0x06FC, 0x07FC,
+    0x08FC, 0x09FC, 0xF7FD, 0xF8FD, 0xF9FD, 0xFAFD, 0xFBFD, 0xFCFD,
+    0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD, 0x03FD, 0x04FD,
+    0x05FD, 0x06FD, 0x07FD, 0x08FD, 0x09FD, 0xF7FE, 0xF8FE, 0xF9FE,
+    0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE, 0xFEFE, 0xFFFE, 0x00FE, 0x01FE,
+    0x02FE, 0x03FE, 0x04FE, 0x05FE, 0x06FE, 0x07FE, 0x08FE, 0x09FE,
+    0xF7FF, 0xF8FF, 0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF,
+    0xFFFF, 0x00FF, 0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF,
+    0x07FF, 0x08FF, 0x09FF, 0xF700, 0xF800, 0xF900, 0xFA00, 0xFB00,
+    0xFC00, 0xFD00, 0xFE00, 0xFF00, 0x0000, 0x0100, 0x0200, 0x0300,
+    0x0400, 0x0500, 0x0600, 0x0700, 0x0800, 0x0900, 0xF701, 0xF801,
+    0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01, 0xFF01, 0x0001,
+    0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601, 0x0701, 0x0801,
+    0x0901, 0xF702, 0xF802, 0xF902, 0xFA02, 0xFB02, 0xFC02, 0xFD02,
+    0xFE02, 0xFF02, 0x0002, 0x0102, 0x0202, 0x0302, 0x0402, 0x0502,
+    0x0602, 0x0702, 0x0802, 0x0902, 0xF703, 0xF803, 0xF903, 0xFA03,
+    0xFB03, 0xFC03, 0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203,
+    0x0303, 0x0403, 0x0503, 0x0603, 0x0703, 0x0803, 0x0903, 0xF704,
+    0xF804, 0xF904, 0xFA04, 0xFB04, 0xFC04, 0xFD04, 0xFE04, 0xFF04,
+    0x0004, 0x0104, 0x0204, 0x0304, 0x0404, 0x0504, 0x0604, 0x0704,
+    0x0804, 0x0904, 0xF705, 0xF805, 0xF905, 0xFA05, 0xFB05, 0xFC05,
+    0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205, 0x0305, 0x0405,
+    0x0505, 0x0605, 0x0705, 0x0805, 0x0905, 0xF706, 0xF806, 0xF906,
+    0xFA06, 0xFB06, 0xFC06, 0xFD06, 0xFE06, 0xFF06, 0x0006, 0x0106,
+    0x0206, 0x0306, 0x0406, 0x0506, 0x0606, 0x0706, 0x0806, 0x0906,
+    0xF707, 0xF807, 0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07,
+    0xFF07, 0x0007, 0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607,
+    0x0707, 0x0807, 0x0907, 0xF708, 0xF808, 0xF908, 0xFA08, 0xFB08,
+    0xFC08, 0xFD08, 0xFE08, 0xFF08, 0x0008, 0x0108, 0x0208, 0x0308,
+    0x0408, 0x0508, 0x0608, 0x0708, 0x0808, 0x0908, 0xF709, 0xF809,
+    0xF909, 0xFA09, 0xFB09, 0xFC09, 0xFD09, 0xFE09, 0xFF09, 0x0009,
+    0x0109, 0x0209, 0x0309, 0x0409, 0x0509, 0x0609, 0x0709, 0x0809,
+    0x0909, 0x0A0A,
+};
+
+static const uint8_t clv_mvy_2_bits[] = {
+    16, 16, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13, 13, 12, 11,
+    12, 13, 13, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 11, 10, 11,
+    13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 11, 10, 11, 13,
+    13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15,
+    15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 11,  9, 11, 12, 13,
+    13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 16, 16, 15, 15, 14,
+    14, 13, 13, 13, 13, 13, 13, 13, 12, 11, 11,  9, 11, 11, 12, 13,
+    13, 13, 13, 13, 13, 13, 14, 14, 14, 15, 16, 16, 15, 15, 15, 14,
+    13, 13, 13, 13, 13, 13, 13, 12, 11, 11,  9, 11, 11, 12, 13, 13,
+    13, 13, 13, 13, 13, 14, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14,
+    14, 13, 13, 13, 12, 12, 12, 11, 11,  9, 11, 11, 12, 12, 12, 13,
+    13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 14, 13,
+    13, 13, 12, 12, 12, 12, 11, 10,  8, 10, 11, 12, 12, 12, 12, 13,
+    13, 13, 14, 14, 14, 15, 15, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+    13, 12, 12, 12, 11, 11, 10,  8, 10, 11, 11, 12, 12, 12, 13, 13,
+    13, 13, 14, 14, 14, 15, 16, 15, 15, 14, 14, 13, 13, 13, 13, 13,
+    12, 12, 12, 11, 10,  9,  8,  9, 10, 11, 12, 12, 12, 13, 13, 13,
+    13, 13, 14, 14, 15, 15, 15, 15, 14, 14, 13, 13, 13, 13, 13, 12,
+    12, 11, 11, 10, 10,  7, 10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
+    13, 14, 14, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12,
+    11, 11, 10,  9,  7,  9, 10, 11, 11, 12, 12, 12, 12, 12, 13, 13,
+    14, 14, 14, 14, 15, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+    11, 10,  8,  7,  8, 10, 11, 11, 11, 11, 12, 12, 12, 13, 13, 13,
+    13, 13, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+     9,  8,  6,  8,  9, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 14,
+    14, 14, 16, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10,  9,  9,  8,
+     6,  4,  6,  8,  9,  9, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13,
+    16, 12, 12, 12, 12, 11, 11, 10, 10,  9,  9,  8,  8,  8,  7,  4,
+     2,  4,  7,  8,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 12, 12,
+    16, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10,  9,  9,  8,  6,  4,
+     6,  8,  9,  9, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 16, 14,
+    14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,  9,  8,  6,  8,
+     9, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 14, 14, 14, 15, 13,
+    13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10,  8,  7,  8, 10,
+    11, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 13, 15, 14, 14, 14,
+    14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 10,  9,  7,  9, 10, 11,
+    11, 12, 12, 12, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14,
+    13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10,  7, 10, 10, 11, 11,
+    12, 12, 13, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 14, 14, 13,
+    13, 13, 13, 13, 12, 12, 12, 11, 10,  9,  8,  9, 10, 11, 12, 12,
+    12, 13, 13, 13, 13, 13, 14, 14, 15, 15, 16, 15, 14, 14, 14, 13,
+    13, 13, 13, 12, 12, 12, 11, 11, 10,  8, 10, 11, 11, 12, 12, 12,
+    13, 13, 13, 13, 14, 14, 14, 15, 16, 16, 15, 15, 14, 14, 14, 13,
+    13, 13, 12, 12, 12, 12, 11, 10,  8, 10, 11, 12, 12, 12, 12, 13,
+    13, 13, 14, 14, 14, 15, 15, 16, 16, 15, 15, 15, 15, 14, 14, 13,
+    13, 13, 12, 12, 12, 11, 11,  9, 11, 11, 12, 12, 12, 13, 13, 13,
+    14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 15, 14, 13, 13, 13, 13,
+    13, 13, 13, 12, 11, 11,  9, 11, 11, 12, 13, 13, 13, 13, 13, 13,
+    13, 14, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 13,
+    13, 13, 12, 11, 11,  9, 11, 11, 12, 13, 13, 13, 13, 13, 13, 13,
+    14, 14, 14, 15, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13,
+    13, 13, 12, 11,  9, 11, 12, 13, 13, 13, 14, 14, 14, 14, 14, 14,
+    15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13,
+    13, 13, 11, 10, 11, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15,
+    15, 15, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+    13, 12, 10, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13, 13,
+    12, 11, 12, 13, 13, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 16,
+    16,  7,
+};
+
+static const uint16_t clv_mvy_2_codes[] = {
+    0xFFF5, 0xFFD8, 0x7FE6, 0x7FB9, 0x7FB5, 0x7FB0, 0x7FA0, 0x7F99,
+    0x7F93, 0x3FAA, 0x3F9B, 0x3F52, 0x1F76, 0x1EF5, 0x0F0B, 0x06F0,
+    0x0F08, 0x1EF0, 0x1F75, 0x3F53, 0x3F9A, 0x3FA8, 0x7F94, 0x7F98,
+    0x7F9E, 0x7FAE, 0x7FAF, 0x7FB7, 0x7FE9, 0xFFDB, 0xFFF6, 0xFFFD,
+    0x7FD9, 0x7FCC, 0x7FC6, 0x7F9C, 0x7F80, 0x3FA5, 0x3F80, 0x3F6A,
+    0x3F31, 0x1F54, 0x1F40, 0x1F11, 0x1F05, 0x075E, 0x0360, 0x075F,
+    0x1F07, 0x1F12, 0x1F43, 0x1F56, 0x3F33, 0x3F68, 0x3F83, 0x3FA6,
+    0x7F7F, 0x7F9A, 0x7FC9, 0x7FCA, 0x7FDB, 0xFFF8, 0xFFEC, 0x7FDE,
+    0x7FE2, 0x7FA6, 0x7F6F, 0x3FA1, 0x3F8D, 0x3F5C, 0x3F39, 0x3F21,
+    0x3F18, 0x1F58, 0x1F1E, 0x1EF1, 0x0740, 0x035A, 0x0741, 0x1EF2,
+    0x1F1F, 0x1F5A, 0x3F19, 0x3F22, 0x3F3B, 0x3F5E, 0x3F8E, 0x3FA3,
+    0x7F6B, 0x7FA2, 0x7FE3, 0x7FE1, 0xFFEE, 0xFFFC, 0x7FC3, 0x7FBC,
+    0x7F71, 0x3F96, 0x3F86, 0x3F7A, 0x3F72, 0x3F59, 0x3F46, 0x1F0A,
+    0x1EFD, 0x1ED0, 0x0F02, 0x0712, 0x019F, 0x0713, 0x0F03, 0x1ED3,
+    0x1EFF, 0x1F09, 0x3F4A, 0x3F5A, 0x3F76, 0x3F7B, 0x3F87, 0x3F97,
+    0x7F73, 0x7FBB, 0x7FBF, 0xFFFB, 0xFFEB, 0x7F88, 0x7F5C, 0x3F7C,
+    0x3F3C, 0x1F60, 0x1F4C, 0x1F14, 0x1F0C, 0x1F00, 0x1EF9, 0x1ED8,
+    0x0F42, 0x075A, 0x0714, 0x0186, 0x0715, 0x075B, 0x0F43, 0x1EDA,
+    0x1EFA, 0x1F01, 0x1F0E, 0x1F15, 0x1F4D, 0x1F62, 0x3F3D, 0x3F7D,
+    0x3FAC, 0x7F86, 0xFFE8, 0xFFE7, 0x7FA7, 0x7F8C, 0x7F68, 0x3F9C,
+    0x1F7C, 0x1F6C, 0x1F69, 0x1EEC, 0x1EE4, 0x1ED5, 0x1ECD, 0x0EF0,
+    0x0752, 0x06F6, 0x018C, 0x06F7, 0x0753, 0x0EF1, 0x1ECE, 0x1ED6,
+    0x1EE5, 0x1EED, 0x1F6A, 0x1F6E, 0x1F7D, 0x3F9F, 0x7F66, 0x7F8A,
+    0x7FA5, 0xFFE6, 0xFFDC, 0x7FDA, 0x7FC0, 0x7FAC, 0x7F61, 0x3F42,
+    0x3F0E, 0x1F45, 0x1F2C, 0x1ECA, 0x0F27, 0x0EF6, 0x0EEE, 0x072E,
+    0x06F4, 0x0185, 0x06F5, 0x072F, 0x0EEF, 0x0EF7, 0x0F28, 0x1ECB,
+    0x1F2F, 0x1F46, 0x3F0F, 0x3F40, 0x7F5F, 0x7FB4, 0x7FC2, 0x7FDC,
+    0xFFDA, 0xFFE0, 0x7F72, 0x7F63, 0x3F70, 0x3F1C, 0x3F16, 0x1F82,
+    0x1EE2, 0x1EB2, 0x0F4C, 0x0EFC, 0x0EE0, 0x0ED3, 0x0722, 0x036C,
+    0x00BF, 0x036D, 0x0723, 0x0ECC, 0x0EE1, 0x0EFF, 0x0F4D, 0x1EB3,
+    0x1EE3, 0x1F83, 0x3F17, 0x3F1F, 0x3F75, 0x7F65, 0x7F70, 0xFFE2,
+    0xFFD7, 0x7F76, 0x3F2B, 0x3F13, 0x3F0A, 0x1F33, 0x1F23, 0x1EB4,
+    0x1EA0, 0x0F46, 0x0F32, 0x0F1A, 0x0756, 0x0728, 0x0356, 0x00B0,
+    0x0357, 0x0729, 0x0757, 0x0F1B, 0x0F33, 0x0F47, 0x1EA1, 0x1EB5,
+    0x1F20, 0x1F30, 0x3F08, 0x3F10, 0x3F28, 0x7F77, 0xFFD5, 0x7FD5,
+    0x7FD1, 0x3F5D, 0x3F25, 0x1F34, 0x1F24, 0x1EE8, 0x1EBC, 0x1EA8,
+    0x0F3A, 0x0F2E, 0x0EE2, 0x071C, 0x0374, 0x01A0, 0x00AE, 0x01A1,
+    0x0375, 0x071D, 0x0EE3, 0x0F2F, 0x0F3C, 0x1EA9, 0x1EBD, 0x1EE9,
+    0x1F25, 0x1F36, 0x3F24, 0x3F61, 0x7FCE, 0x7FD2, 0x7F91, 0x7F7D,
+    0x3F6C, 0x3F34, 0x1F72, 0x1F61, 0x1EDD, 0x1EC5, 0x1EA5, 0x0F05,
+    0x0ED6, 0x0750, 0x073E, 0x0368, 0x034A, 0x0052, 0x034B, 0x0369,
+    0x073F, 0x0751, 0x0ED7, 0x0F07, 0x1EA7, 0x1EC7, 0x1EDF, 0x1F65,
+    0x1F70, 0x3F36, 0x3F6F, 0x7F7C, 0x7F8F, 0x3F90, 0x3F66, 0x3F58,
+    0x3F4E, 0x1F48, 0x1EBB, 0x0F40, 0x0F18, 0x0F10, 0x0EDA, 0x0ECF,
+    0x0732, 0x0704, 0x0354, 0x0190, 0x004F, 0x0191, 0x0355, 0x0705,
+    0x0733, 0x0ED0, 0x0EDB, 0x0F11, 0x0F19, 0x0F41, 0x1EB8, 0x1F4B,
+    0x3F4F, 0x3F55, 0x3F65, 0x3F92, 0x7F85, 0x1F51, 0x1F39, 0x1F2B,
+    0x1F18, 0x1EC2, 0x0F38, 0x0F14, 0x0ECA, 0x074C, 0x0736, 0x0700,
+    0x06FC, 0x0350, 0x00BA, 0x004D, 0x00BB, 0x0351, 0x06FD, 0x0701,
+    0x0737, 0x074D, 0x0ECB, 0x0F15, 0x0F39, 0x1EC3, 0x1F1B, 0x1F2E,
+    0x1F3A, 0x1F53, 0x7F82, 0x3F8A, 0x3F47, 0x3F2E, 0x1F5E, 0x1E9E,
+    0x0F24, 0x0F20, 0x0EC6, 0x0746, 0x0726, 0x070E, 0x0370, 0x035E,
+    0x018A, 0x00AC, 0x0021, 0x00AD, 0x018B, 0x035F, 0x0371, 0x070F,
+    0x0727, 0x0747, 0x0EC7, 0x0F21, 0x0F25, 0x1E9F, 0x1F5F, 0x3F2D,
+    0x3F48, 0x3F8B, 0xFFF1, 0x1F78, 0x1F3D, 0x1EAD, 0x0F2B, 0x0EF9,
+    0x0EE9, 0x0739, 0x0719, 0x0709, 0x0363, 0x019B, 0x0195, 0x00B3,
+    0x0023, 0x0005, 0x0024, 0x00B4, 0x0196, 0x019C, 0x0364, 0x070A,
+    0x071A, 0x073A, 0x0EEA, 0x0EFA, 0x0F2C, 0x1EAE, 0x1F3E, 0x1F79,
+    0xFFF0, 0x0F0C, 0x0EE6, 0x0EDC, 0x0EC2, 0x0748, 0x0706, 0x0372,
+    0x034C, 0x0198, 0x0192, 0x00C0, 0x00BC, 0x00B6, 0x0053, 0x0006,
+    0x0000, 0x0007, 0x0054, 0x00B7, 0x00BD, 0x00C1, 0x0193, 0x0199,
+    0x034D, 0x0373, 0x0707, 0x0749, 0x0EC3, 0x0EDD, 0x0EE7, 0x0F0D,
+    0xFFF2, 0x1F7A, 0x1F3F, 0x1EAF, 0x0F2D, 0x0EFB, 0x0EEB, 0x073B,
+    0x071B, 0x070B, 0x0365, 0x019D, 0x0197, 0x00B5, 0x0025, 0x0004,
+    0x0022, 0x00B2, 0x0194, 0x019A, 0x0362, 0x0708, 0x0718, 0x0738,
+    0x0EE8, 0x0EF8, 0x0F2A, 0x1EAC, 0x1F3C, 0x1F7B, 0xFFF3, 0x3F89,
+    0x3F44, 0x3F2F, 0x1F5C, 0x1E9C, 0x0F22, 0x0F1E, 0x0EC4, 0x0744,
+    0x0724, 0x070C, 0x036E, 0x035C, 0x0188, 0x00AA, 0x0020, 0x00AB,
+    0x0189, 0x035D, 0x036F, 0x070D, 0x0725, 0x0745, 0x0EC5, 0x0F1F,
+    0x0F23, 0x1E9D, 0x1F5D, 0x3F2C, 0x3F45, 0x3F88, 0x7F81, 0x1F52,
+    0x1F38, 0x1F28, 0x1F19, 0x1EC0, 0x0F36, 0x0F12, 0x0EC8, 0x074A,
+    0x0734, 0x06FE, 0x06FA, 0x034E, 0x00B8, 0x004C, 0x00B9, 0x034F,
+    0x06FB, 0x06FF, 0x0735, 0x074B, 0x0EC9, 0x0F13, 0x0F37, 0x1EC1,
+    0x1F1A, 0x1F29, 0x1F3B, 0x1F50, 0x7F84, 0x3F91, 0x3F64, 0x3F54,
+    0x3F4C, 0x1F49, 0x1EB9, 0x0F3B, 0x0F16, 0x0F0E, 0x0ED8, 0x0ECD,
+    0x0730, 0x0702, 0x0352, 0x018E, 0x004E, 0x018F, 0x0353, 0x0703,
+    0x0731, 0x0ECE, 0x0ED9, 0x0F0F, 0x0F17, 0x0F3D, 0x1EBA, 0x1F4A,
+    0x3F4D, 0x3F51, 0x3F67, 0x3F93, 0x7F90, 0x7F7A, 0x3F6E, 0x3F37,
+    0x1F71, 0x1F63, 0x1EDC, 0x1EC4, 0x1EA4, 0x0F04, 0x0ED4, 0x074E,
+    0x073C, 0x0366, 0x0348, 0x0051, 0x0349, 0x0367, 0x073D, 0x074F,
+    0x0ED5, 0x0F06, 0x1EA6, 0x1EC6, 0x1EDE, 0x1F64, 0x1F73, 0x3F35,
+    0x3F6D, 0x7F7B, 0x7F8E, 0x7FD4, 0x7FD0, 0x3F5F, 0x3F26, 0x1F35,
+    0x1F27, 0x1EEA, 0x1EBE, 0x1EAA, 0x0F3E, 0x0F30, 0x0EE4, 0x071E,
+    0x0376, 0x01A2, 0x00AF, 0x01A3, 0x0377, 0x071F, 0x0EE5, 0x0F31,
+    0x0F3F, 0x1EAB, 0x1EBF, 0x1EEB, 0x1F26, 0x1F37, 0x3F27, 0x3F62,
+    0x7FCF, 0x7FD3, 0xFFD4, 0x7F78, 0x3F29, 0x3F11, 0x3F0B, 0x1F32,
+    0x1F22, 0x1EB6, 0x1EA2, 0x0F48, 0x0F34, 0x0F1C, 0x0758, 0x072A,
+    0x0358, 0x00B1, 0x0359, 0x072B, 0x0759, 0x0F1D, 0x0F35, 0x0F49,
+    0x1EA3, 0x1EB7, 0x1F21, 0x1F31, 0x3F09, 0x3F12, 0x3F2A, 0x7F79,
+    0xFFD6, 0xFFE1, 0x7F6D, 0x7F64, 0x3F73, 0x3F1D, 0x3F14, 0x1F81,
+    0x1EE0, 0x1EB0, 0x0F4A, 0x0EFD, 0x0EDE, 0x0ED1, 0x0720, 0x036A,
+    0x00BE, 0x036B, 0x0721, 0x0ED2, 0x0EDF, 0x0EFE, 0x0F4B, 0x1EB1,
+    0x1EE1, 0x1F7E, 0x3F15, 0x3F1E, 0x3F74, 0x7F62, 0x7F75, 0xFFE3,
+    0xFFDE, 0x7FDD, 0x7FBE, 0x7FB3, 0x7F60, 0x3F43, 0x3F0C, 0x1F47,
+    0x1F2D, 0x1EC8, 0x0F26, 0x0EF4, 0x0EEC, 0x072C, 0x06F2, 0x0184,
+    0x06F3, 0x072D, 0x0EED, 0x0EF5, 0x0F29, 0x1EC9, 0x1F2A, 0x1F44,
+    0x3F0D, 0x3F41, 0x7F5E, 0x7FB1, 0x7FC1, 0x7FD7, 0xFFDF, 0xFFEA,
+    0x7FA3, 0x7F8B, 0x7F69, 0x3F9E, 0x1F7F, 0x1F6D, 0x1F6B, 0x1EEE,
+    0x1EE6, 0x1ED4, 0x1ECF, 0x0EF2, 0x0754, 0x06F8, 0x018D, 0x06F9,
+    0x0755, 0x0EF3, 0x1ECC, 0x1ED7, 0x1EE7, 0x1EEF, 0x1F68, 0x1F6F,
+    0x1F80, 0x3F9D, 0x7F67, 0x7F8D, 0x7FA8, 0xFFE9, 0xFFE5, 0x7F89,
+    0x7F5D, 0x3F7F, 0x3F3F, 0x1F67, 0x1F4F, 0x1F17, 0x1F0F, 0x1F02,
+    0x1EFB, 0x1ED9, 0x0F45, 0x075C, 0x0716, 0x0187, 0x0717, 0x075D,
+    0x0F44, 0x1EDB, 0x1EF8, 0x1F03, 0x1F0D, 0x1F16, 0x1F4E, 0x1F66,
+    0x3F3E, 0x3F7E, 0x3FAD, 0x7F87, 0xFFE4, 0xFFF9, 0x7FC4, 0x7FBA,
+    0x7F6E, 0x3F95, 0x3F85, 0x3F78, 0x3F77, 0x3F5B, 0x3F49, 0x1F08,
+    0x1EFE, 0x1ED2, 0x0F01, 0x0710, 0x019E, 0x0711, 0x0F00, 0x1ED1,
+    0x1EFC, 0x1F0B, 0x3F4B, 0x3F57, 0x3F71, 0x3F79, 0x3F84, 0x3F94,
+    0x7F74, 0x7FBD, 0x7FC5, 0xFFFE, 0xFFED, 0x7FE0, 0x7FDF, 0x7FA4,
+    0x7F6A, 0x3FA0, 0x3F8F, 0x3F63, 0x3F3A, 0x3F23, 0x3F1A, 0x1F59,
+    0x1F1D, 0x1EF3, 0x0743, 0x035B, 0x0742, 0x1EF7, 0x1F1C, 0x1F5B,
+    0x3F1B, 0x3F20, 0x3F38, 0x3F60, 0x3F8C, 0x3FA2, 0x7F6C, 0x7FA9,
+    0x7FE5, 0x7FE4, 0xFFEF, 0xFFFF, 0x7FD6, 0x7FCB, 0x7FC7, 0x7F9B,
+    0x7F83, 0x3FA4, 0x3F81, 0x3F69, 0x3F30, 0x1F57, 0x1F41, 0x1F13,
+    0x1F06, 0x0EC0, 0x0361, 0x0EC1, 0x1F04, 0x1F10, 0x1F42, 0x1F55,
+    0x3F32, 0x3F6B, 0x3F82, 0x3FA7, 0x7F7E, 0x7F9D, 0x7FC8, 0x7FCD,
+    0x7FD8, 0xFFFA, 0xFFF7, 0xFFD9, 0x7FE8, 0x7FB6, 0x7FAB, 0x7FAA,
+    0x7FA1, 0x7F96, 0x7F95, 0x3FAB, 0x3F98, 0x3F50, 0x1F77, 0x1EF4,
+    0x0F0A, 0x06F1, 0x0F09, 0x1EF6, 0x1F74, 0x3F56, 0x3F99, 0x3FA9,
+    0x7F92, 0x7F97, 0x7F9F, 0x7FAD, 0x7FB2, 0x7FB8, 0x7FE7, 0xFFDD,
+    0xFFF4, 0x0050,
+};
+
+static const uint16_t clv_mvy_2_syms[] = {
+    0xF1F1, 0xF2F1, 0xF3F1, 0xF4F1, 0xF5F1, 0xF6F1, 0xF7F1, 0xF8F1,
+    0xF9F1, 0xFAF1, 0xFBF1, 0xFCF1, 0xFDF1, 0xFEF1, 0xFFF1, 0x00F1,
+    0x01F1, 0x02F1, 0x03F1, 0x04F1, 0x05F1, 0x06F1, 0x07F1, 0x08F1,
+    0x09F1, 0x0AF1, 0x0BF1, 0x0CF1, 0x0DF1, 0x0EF1, 0x0FF1, 0xF1F2,
+    0xF2F2, 0xF3F2, 0xF4F2, 0xF5F2, 0xF6F2, 0xF7F2, 0xF8F2, 0xF9F2,
+    0xFAF2, 0xFBF2, 0xFCF2, 0xFDF2, 0xFEF2, 0xFFF2, 0x00F2, 0x01F2,
+    0x02F2, 0x03F2, 0x04F2, 0x05F2, 0x06F2, 0x07F2, 0x08F2, 0x09F2,
+    0x0AF2, 0x0BF2, 0x0CF2, 0x0DF2, 0x0EF2, 0x0FF2, 0xF1F3, 0xF2F3,
+    0xF3F3, 0xF4F3, 0xF5F3, 0xF6F3, 0xF7F3, 0xF8F3, 0xF9F3, 0xFAF3,
+    0xFBF3, 0xFCF3, 0xFDF3, 0xFEF3, 0xFFF3, 0x00F3, 0x01F3, 0x02F3,
+    0x03F3, 0x04F3, 0x05F3, 0x06F3, 0x07F3, 0x08F3, 0x09F3, 0x0AF3,
+    0x0BF3, 0x0CF3, 0x0DF3, 0x0EF3, 0x0FF3, 0xF1F4, 0xF2F4, 0xF3F4,
+    0xF4F4, 0xF5F4, 0xF6F4, 0xF7F4, 0xF8F4, 0xF9F4, 0xFAF4, 0xFBF4,
+    0xFCF4, 0xFDF4, 0xFEF4, 0xFFF4, 0x00F4, 0x01F4, 0x02F4, 0x03F4,
+    0x04F4, 0x05F4, 0x06F4, 0x07F4, 0x08F4, 0x09F4, 0x0AF4, 0x0BF4,
+    0x0CF4, 0x0DF4, 0x0EF4, 0x0FF4, 0xF1F5, 0xF2F5, 0xF3F5, 0xF4F5,
+    0xF5F5, 0xF6F5, 0xF7F5, 0xF8F5, 0xF9F5, 0xFAF5, 0xFBF5, 0xFCF5,
+    0xFDF5, 0xFEF5, 0xFFF5, 0x00F5, 0x01F5, 0x02F5, 0x03F5, 0x04F5,
+    0x05F5, 0x06F5, 0x07F5, 0x08F5, 0x09F5, 0x0AF5, 0x0BF5, 0x0CF5,
+    0x0DF5, 0x0EF5, 0x0FF5, 0xF1F6, 0xF2F6, 0xF3F6, 0xF4F6, 0xF5F6,
+    0xF6F6, 0xF7F6, 0xF8F6, 0xF9F6, 0xFAF6, 0xFBF6, 0xFCF6, 0xFDF6,
+    0xFEF6, 0xFFF6, 0x00F6, 0x01F6, 0x02F6, 0x03F6, 0x04F6, 0x05F6,
+    0x06F6, 0x07F6, 0x08F6, 0x09F6, 0x0AF6, 0x0BF6, 0x0CF6, 0x0DF6,
+    0x0EF6, 0x0FF6, 0xF1F7, 0xF2F7, 0xF3F7, 0xF4F7, 0xF5F7, 0xF6F7,
+    0xF7F7, 0xF8F7, 0xF9F7, 0xFAF7, 0xFBF7, 0xFCF7, 0xFDF7, 0xFEF7,
+    0xFFF7, 0x00F7, 0x01F7, 0x02F7, 0x03F7, 0x04F7, 0x05F7, 0x06F7,
+    0x07F7, 0x08F7, 0x09F7, 0x0AF7, 0x0BF7, 0x0CF7, 0x0DF7, 0x0EF7,
+    0x0FF7, 0xF1F8, 0xF2F8, 0xF3F8, 0xF4F8, 0xF5F8, 0xF6F8, 0xF7F8,
+    0xF8F8, 0xF9F8, 0xFAF8, 0xFBF8, 0xFCF8, 0xFDF8, 0xFEF8, 0xFFF8,
+    0x00F8, 0x01F8, 0x02F8, 0x03F8, 0x04F8, 0x05F8, 0x06F8, 0x07F8,
+    0x08F8, 0x09F8, 0x0AF8, 0x0BF8, 0x0CF8, 0x0DF8, 0x0EF8, 0x0FF8,
+    0xF1F9, 0xF2F9, 0xF3F9, 0xF4F9, 0xF5F9, 0xF6F9, 0xF7F9, 0xF8F9,
+    0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9, 0xFFF9, 0x00F9,
+    0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9, 0x07F9, 0x08F9,
+    0x09F9, 0x0AF9, 0x0BF9, 0x0CF9, 0x0DF9, 0x0EF9, 0x0FF9, 0xF1FA,
+    0xF2FA, 0xF3FA, 0xF4FA, 0xF5FA, 0xF6FA, 0xF7FA, 0xF8FA, 0xF9FA,
+    0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA, 0xFEFA, 0xFFFA, 0x00FA, 0x01FA,
+    0x02FA, 0x03FA, 0x04FA, 0x05FA, 0x06FA, 0x07FA, 0x08FA, 0x09FA,
+    0x0AFA, 0x0BFA, 0x0CFA, 0x0DFA, 0x0EFA, 0x0FFA, 0xF1FB, 0xF2FB,
+    0xF3FB, 0xF4FB, 0xF5FB, 0xF6FB, 0xF7FB, 0xF8FB, 0xF9FB, 0xFAFB,
+    0xFBFB, 0xFCFB, 0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB,
+    0x03FB, 0x04FB, 0x05FB, 0x06FB, 0x07FB, 0x08FB, 0x09FB, 0x0AFB,
+    0x0BFB, 0x0CFB, 0x0DFB, 0x0EFB, 0x0FFB, 0xF1FC, 0xF2FC, 0xF3FC,
+    0xF4FC, 0xF5FC, 0xF6FC, 0xF7FC, 0xF8FC, 0xF9FC, 0xFAFC, 0xFBFC,
+    0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC, 0x00FC, 0x01FC, 0x02FC, 0x03FC,
+    0x04FC, 0x05FC, 0x06FC, 0x07FC, 0x08FC, 0x09FC, 0x0AFC, 0x0BFC,
+    0x0CFC, 0x0DFC, 0x0EFC, 0x0FFC, 0xF1FD, 0xF2FD, 0xF3FD, 0xF4FD,
+    0xF5FD, 0xF6FD, 0xF7FD, 0xF8FD, 0xF9FD, 0xFAFD, 0xFBFD, 0xFCFD,
+    0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD, 0x03FD, 0x04FD,
+    0x05FD, 0x06FD, 0x07FD, 0x08FD, 0x09FD, 0x0AFD, 0x0BFD, 0x0CFD,
+    0x0DFD, 0x0EFD, 0x0FFD, 0xF1FE, 0xF2FE, 0xF3FE, 0xF4FE, 0xF5FE,
+    0xF6FE, 0xF7FE, 0xF8FE, 0xF9FE, 0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE,
+    0xFEFE, 0xFFFE, 0x00FE, 0x01FE, 0x02FE, 0x03FE, 0x04FE, 0x05FE,
+    0x06FE, 0x07FE, 0x08FE, 0x09FE, 0x0AFE, 0x0BFE, 0x0CFE, 0x0DFE,
+    0x0EFE, 0x0FFE, 0xF1FF, 0xF2FF, 0xF3FF, 0xF4FF, 0xF5FF, 0xF6FF,
+    0xF7FF, 0xF8FF, 0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF,
+    0xFFFF, 0x00FF, 0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF,
+    0x07FF, 0x08FF, 0x09FF, 0x0AFF, 0x0BFF, 0x0CFF, 0x0DFF, 0x0EFF,
+    0x0FFF, 0xF100, 0xF200, 0xF300, 0xF400, 0xF500, 0xF600, 0xF700,
+    0xF800, 0xF900, 0xFA00, 0xFB00, 0xFC00, 0xFD00, 0xFE00, 0xFF00,
+    0x0000, 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700,
+    0x0800, 0x0900, 0x0A00, 0x0B00, 0x0C00, 0x0D00, 0x0E00, 0x0F00,
+    0xF101, 0xF201, 0xF301, 0xF401, 0xF501, 0xF601, 0xF701, 0xF801,
+    0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01, 0xFF01, 0x0001,
+    0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601, 0x0701, 0x0801,
+    0x0901, 0x0A01, 0x0B01, 0x0C01, 0x0D01, 0x0E01, 0x0F01, 0xF102,
+    0xF202, 0xF302, 0xF402, 0xF502, 0xF602, 0xF702, 0xF802, 0xF902,
+    0xFA02, 0xFB02, 0xFC02, 0xFD02, 0xFE02, 0xFF02, 0x0002, 0x0102,
+    0x0202, 0x0302, 0x0402, 0x0502, 0x0602, 0x0702, 0x0802, 0x0902,
+    0x0A02, 0x0B02, 0x0C02, 0x0D02, 0x0E02, 0x0F02, 0xF103, 0xF203,
+    0xF303, 0xF403, 0xF503, 0xF603, 0xF703, 0xF803, 0xF903, 0xFA03,
+    0xFB03, 0xFC03, 0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203,
+    0x0303, 0x0403, 0x0503, 0x0603, 0x0703, 0x0803, 0x0903, 0x0A03,
+    0x0B03, 0x0C03, 0x0D03, 0x0E03, 0x0F03, 0xF104, 0xF204, 0xF304,
+    0xF404, 0xF504, 0xF604, 0xF704, 0xF804, 0xF904, 0xFA04, 0xFB04,
+    0xFC04, 0xFD04, 0xFE04, 0xFF04, 0x0004, 0x0104, 0x0204, 0x0304,
+    0x0404, 0x0504, 0x0604, 0x0704, 0x0804, 0x0904, 0x0A04, 0x0B04,
+    0x0C04, 0x0D04, 0x0E04, 0x0F04, 0xF105, 0xF205, 0xF305, 0xF405,
+    0xF505, 0xF605, 0xF705, 0xF805, 0xF905, 0xFA05, 0xFB05, 0xFC05,
+    0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205, 0x0305, 0x0405,
+    0x0505, 0x0605, 0x0705, 0x0805, 0x0905, 0x0A05, 0x0B05, 0x0C05,
+    0x0D05, 0x0E05, 0x0F05, 0xF106, 0xF206, 0xF306, 0xF406, 0xF506,
+    0xF606, 0xF706, 0xF806, 0xF906, 0xFA06, 0xFB06, 0xFC06, 0xFD06,
+    0xFE06, 0xFF06, 0x0006, 0x0106, 0x0206, 0x0306, 0x0406, 0x0506,
+    0x0606, 0x0706, 0x0806, 0x0906, 0x0A06, 0x0B06, 0x0C06, 0x0D06,
+    0x0E06, 0x0F06, 0xF107, 0xF207, 0xF307, 0xF407, 0xF507, 0xF607,
+    0xF707, 0xF807, 0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07,
+    0xFF07, 0x0007, 0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607,
+    0x0707, 0x0807, 0x0907, 0x0A07, 0x0B07, 0x0C07, 0x0D07, 0x0E07,
+    0x0F07, 0xF108, 0xF208, 0xF308, 0xF408, 0xF508, 0xF608, 0xF708,
+    0xF808, 0xF908, 0xFA08, 0xFB08, 0xFC08, 0xFD08, 0xFE08, 0xFF08,
+    0x0008, 0x0108, 0x0208, 0x0308, 0x0408, 0x0508, 0x0608, 0x0708,
+    0x0808, 0x0908, 0x0A08, 0x0B08, 0x0C08, 0x0D08, 0x0E08, 0x0F08,
+    0xF109, 0xF209, 0xF309, 0xF409, 0xF509, 0xF609, 0xF709, 0xF809,
+    0xF909, 0xFA09, 0xFB09, 0xFC09, 0xFD09, 0xFE09, 0xFF09, 0x0009,
+    0x0109, 0x0209, 0x0309, 0x0409, 0x0509, 0x0609, 0x0709, 0x0809,
+    0x0909, 0x0A09, 0x0B09, 0x0C09, 0x0D09, 0x0E09, 0x0F09, 0xF10A,
+    0xF20A, 0xF30A, 0xF40A, 0xF50A, 0xF60A, 0xF70A, 0xF80A, 0xF90A,
+    0xFA0A, 0xFB0A, 0xFC0A, 0xFD0A, 0xFE0A, 0xFF0A, 0x000A, 0x010A,
+    0x020A, 0x030A, 0x040A, 0x050A, 0x060A, 0x070A, 0x080A, 0x090A,
+    0x0A0A, 0x0B0A, 0x0C0A, 0x0D0A, 0x0E0A, 0x0F0A, 0xF10B, 0xF20B,
+    0xF30B, 0xF40B, 0xF50B, 0xF60B, 0xF70B, 0xF80B, 0xF90B, 0xFA0B,
+    0xFB0B, 0xFC0B, 0xFD0B, 0xFE0B, 0xFF0B, 0x000B, 0x010B, 0x020B,
+    0x030B, 0x040B, 0x050B, 0x060B, 0x070B, 0x080B, 0x090B, 0x0A0B,
+    0x0B0B, 0x0C0B, 0x0D0B, 0x0E0B, 0x0F0B, 0xF10C, 0xF20C, 0xF30C,
+    0xF40C, 0xF50C, 0xF60C, 0xF70C, 0xF80C, 0xF90C, 0xFA0C, 0xFB0C,
+    0xFC0C, 0xFD0C, 0xFE0C, 0xFF0C, 0x000C, 0x010C, 0x020C, 0x030C,
+    0x040C, 0x050C, 0x060C, 0x070C, 0x080C, 0x090C, 0x0A0C, 0x0B0C,
+    0x0C0C, 0x0D0C, 0x0E0C, 0x0F0C, 0xF10D, 0xF20D, 0xF30D, 0xF40D,
+    0xF50D, 0xF60D, 0xF70D, 0xF80D, 0xF90D, 0xFA0D, 0xFB0D, 0xFC0D,
+    0xFD0D, 0xFE0D, 0xFF0D, 0x000D, 0x010D, 0x020D, 0x030D, 0x040D,
+    0x050D, 0x060D, 0x070D, 0x080D, 0x090D, 0x0A0D, 0x0B0D, 0x0C0D,
+    0x0D0D, 0x0E0D, 0x0F0D, 0xF10E, 0xF20E, 0xF30E, 0xF40E, 0xF50E,
+    0xF60E, 0xF70E, 0xF80E, 0xF90E, 0xFA0E, 0xFB0E, 0xFC0E, 0xFD0E,
+    0xFE0E, 0xFF0E, 0x000E, 0x010E, 0x020E, 0x030E, 0x040E, 0x050E,
+    0x060E, 0x070E, 0x080E, 0x090E, 0x0A0E, 0x0B0E, 0x0C0E, 0x0D0E,
+    0x0E0E, 0x0F0E, 0xF10F, 0xF20F, 0xF30F, 0xF40F, 0xF50F, 0xF60F,
+    0xF70F, 0xF80F, 0xF90F, 0xFA0F, 0xFB0F, 0xFC0F, 0xFD0F, 0xFE0F,
+    0xFF0F, 0x000F, 0x010F, 0x020F, 0x030F, 0x040F, 0x050F, 0x060F,
+    0x070F, 0x080F, 0x090F, 0x0A0F, 0x0B0F, 0x0C0F, 0x0D0F, 0x0E0F,
+    0x0F0F, 0x1010,
+};
+
+static const uint8_t clv_mvy_3_bits[] = {
+    16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 14, 14, 13,
+    13, 12, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+    15, 15, 14, 14, 14, 13, 12, 11, 12, 13, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 12, 11, 12, 14, 14,
+    14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+    16, 16, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13,
+    12, 11, 12, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+    15, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 13, 13, 12, 10, 12, 13, 13, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+    14, 14, 14, 14, 13, 13, 13, 13, 13, 12, 12, 10, 12, 12, 13, 13,
+    13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 15, 15,
+    15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 11,
+    10, 11, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 15,
+    15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12,
+    12, 12, 12, 11, 10,  9, 10, 11, 12, 12, 12, 12, 12, 13, 13, 13,
+    13, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13, 13,
+    12, 12, 12, 12, 12, 12, 12, 11, 11, 10,  9, 10, 11, 11, 12, 12,
+    12, 12, 12, 12, 12, 13, 13, 14, 14, 14, 15, 15, 15, 16, 15, 15,
+    15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 10,  9,
+    10, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 14, 14, 15, 15, 15,
+    15, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12,
+    11, 11, 11, 10,  9, 10, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13,
+    14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 14, 14, 14, 13, 13, 12,
+    12, 12, 12, 12, 12, 11, 11, 11, 10,  8, 10, 11, 11, 11, 12, 12,
+    12, 12, 12, 12, 13, 13, 14, 14, 14, 15, 15, 15, 15, 15, 14, 14,
+    14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10,  8, 10,
+    10, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 14, 14, 14, 14, 15,
+    15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+    10, 10,  9,  7,  9, 10, 10, 11, 11, 11, 12, 12, 12, 12, 13, 13,
+    13, 14, 14, 14, 14, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12,
+    12, 12, 11, 11, 11, 10, 10,  9,  7,  9, 10, 10, 11, 11, 11, 12,
+    12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 14, 14,
+    13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10,  9,  9,  7,  9,  9,
+    10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 14, 14, 14, 15,
+    15, 14, 14, 14, 13, 13, 13, 13, 12, 11, 11, 11, 11, 10, 10, 10,
+     9,  8,  6,  8,  9, 10, 10, 10, 11, 11, 11, 11, 12, 13, 13, 13,
+    13, 14, 14, 14, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+    11, 10, 10,  9,  9,  8,  6,  4,  6,  8,  9,  9, 10, 10, 11, 11,
+    11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 13, 13, 13, 12, 12, 12,
+    12, 11, 10, 10, 10, 10,  9,  9,  8,  8,  7,  5,  2,  5,  7,  8,
+     8,  9,  9, 10, 10, 10, 10, 11, 12, 12, 12, 12, 13, 13, 13, 15,
+    14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,  9,  9,  8,
+     6,  4,  6,  8,  9,  9, 10, 10, 11, 11, 11, 11, 12, 12, 13, 13,
+    14, 14, 14, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 11, 11, 11,
+    11, 10, 10, 10,  9,  8,  6,  8,  9, 10, 10, 10, 11, 11, 11, 11,
+    12, 13, 13, 13, 13, 14, 14, 14, 15, 15, 14, 14, 14, 14, 13, 13,
+    12, 12, 12, 12, 11, 11, 11, 11, 10,  9,  9,  7,  9,  9, 10, 11,
+    11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 14,
+    14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10,  9,
+     7,  9, 10, 10, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14,
+    14, 14, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+    11, 11, 10, 10,  9,  7,  9, 10, 10, 11, 11, 11, 12, 12, 12, 12,
+    13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 14, 14, 13, 13,
+    12, 12, 12, 12, 12, 11, 11, 11, 10, 10,  8, 10, 10, 11, 11, 11,
+    12, 12, 12, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 15,
+    14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10,  8,
+    10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 14, 14, 14, 15,
+    15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12,
+    11, 11, 11, 10,  9, 10, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13,
+    14, 14, 14, 14, 14, 15, 15, 16, 15, 15, 15, 15, 14, 14, 13, 13,
+    12, 12, 12, 12, 12, 11, 11, 11, 10,  9, 10, 11, 11, 11, 12, 12,
+    12, 12, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 15, 15, 15, 14,
+    14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10,  9, 10,
+    11, 11, 12, 12, 12, 12, 12, 12, 12, 13, 13, 14, 14, 14, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12,
+    12, 11, 10,  9, 10, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 14,
+    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14,
+    13, 13, 13, 13, 13, 13, 13, 11, 10, 11, 13, 13, 13, 13, 13, 13,
+    14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 15, 15,
+    15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 12, 12, 10, 12, 12,
+    13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
+    16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13,
+    13, 12, 10, 12, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 14, 13, 13, 12, 11, 12, 13, 13, 14, 14, 14, 14, 14,
+    15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 12, 11, 12, 14, 14,
+    14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13,
+    12, 11, 12, 13, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+    15, 14, 14, 13, 13, 12, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15,
+    15, 15, 16, 16, 16, 16, 16, 16, 16,  7,
+};
+
+static const uint16_t clv_mvy_3_codes[] = {
+    0xFFF6, 0xFFEE, 0xFFDC, 0xFFD7, 0xFFB4, 0xFFAA, 0xFFA0, 0x7FCC,
+    0x7F8E, 0x7F7B, 0x7F77, 0x7F13, 0x7F11, 0x3F56, 0x3F48, 0x1F16,
+    0x1E94, 0x0F28, 0x06F0, 0x0F2A, 0x1E96, 0x1F1C, 0x3F46, 0x3F58,
+    0x7F15, 0x7F0E, 0x7F76, 0x7F80, 0x7F90, 0x7FC8, 0xFFA2, 0xFFA7,
+    0xFFB5, 0xFFD8, 0xFFDD, 0xFFEC, 0xFFF5, 0xFFF1, 0xFFEA, 0xFFE4,
+    0xFFD1, 0xFFC1, 0xFFA8, 0x7F70, 0x7F5C, 0x7F44, 0x7F40, 0x7F2A,
+    0x7F16, 0x7EDF, 0x3ED6, 0x3ECA, 0x3ECC, 0x1EB5, 0x0EDE, 0x06D3,
+    0x0EDF, 0x1EB6, 0x3ECE, 0x3ED0, 0x3ED8, 0x7EE5, 0x7F19, 0x7F31,
+    0x7F3E, 0x7F45, 0x7F5B, 0x7F6F, 0xFFA9, 0xFFC0, 0xFFCE, 0xFFE5,
+    0xFFE8, 0xFFF2, 0xFFFE, 0xFFE2, 0xFFBE, 0x7FBB, 0x7F75, 0x7F6B,
+    0x7F58, 0x7EF9, 0x7EDC, 0x3F68, 0x3F27, 0x3F28, 0x3EDF, 0x3ED2,
+    0x3EC1, 0x3EA6, 0x3E87, 0x0F0F, 0x06B3, 0x0F10, 0x3E89, 0x3EA8,
+    0x3EC3, 0x3ED5, 0x3EE0, 0x3F2B, 0x3F26, 0x3F67, 0x7EDA, 0x7EFB,
+    0x7F56, 0x7F6A, 0x7F72, 0x7FC1, 0xFFBC, 0xFFE0, 0xFFFF, 0xFFF8,
+    0xFFD3, 0xFFB9, 0x7FAC, 0x7F94, 0x7F8B, 0x7F62, 0x7F4A, 0x7F05,
+    0x7EEE, 0x3F44, 0x3EA4, 0x3E78, 0x3E6F, 0x3E54, 0x1EEF, 0x1EA9,
+    0x0ED4, 0x06A8, 0x0ED5, 0x1EA8, 0x1EEC, 0x3E51, 0x3E69, 0x3E75,
+    0x3EA3, 0x3F43, 0x7EF1, 0x7F04, 0x7F4B, 0x7F5D, 0x7F89, 0x7F95,
+    0x7FAA, 0xFFBB, 0xFFC8, 0xFFF9, 0xFFD9, 0xFFCA, 0x7FC3, 0x7F8C,
+    0x7F38, 0x7F02, 0x3F5A, 0x3F4A, 0x3F30, 0x3EFF, 0x3EF4, 0x3EE9,
+    0x3E95, 0x3E73, 0x3E43, 0x1F08, 0x1E81, 0x0E84, 0x0349, 0x0E85,
+    0x1E82, 0x1F07, 0x3E45, 0x3E74, 0x3E96, 0x3EEA, 0x3EF1, 0x3F00,
+    0x3F32, 0x3F4F, 0x3F5C, 0x7F01, 0x7F3B, 0x7F8D, 0x7FC2, 0xFFCF,
+    0xFFD6, 0xFFC7, 0xFFB1, 0xFFA6, 0x7FA1, 0x7F2F, 0x7F24, 0x7F0A,
+    0x3EF6, 0x3E97, 0x3E83, 0x3E7F, 0x1F04, 0x1EE1, 0x1ECB, 0x1EAE,
+    0x1E7E, 0x0EBA, 0x0E8C, 0x0333, 0x0E8D, 0x0EBB, 0x1E7F, 0x1EAF,
+    0x1ECC, 0x1EE2, 0x1F05, 0x3E80, 0x3E84, 0x3E98, 0x3EF9, 0x7F06,
+    0x7F1B, 0x7F2D, 0x7F9F, 0xFFAF, 0xFFB0, 0xFFC6, 0x7FB5, 0x7FB1,
+    0x7F36, 0x7F25, 0x3F63, 0x3F3A, 0x3F1B, 0x3EBB, 0x3E63, 0x3E5B,
+    0x3E40, 0x1F14, 0x1EF5, 0x1EEB, 0x1EBE, 0x1E92, 0x1E6A, 0x070C,
+    0x032D, 0x070D, 0x1E6B, 0x1E93, 0x1EBF, 0x1EE8, 0x1EF3, 0x1F15,
+    0x3E3F, 0x3E5C, 0x3E64, 0x3EB9, 0x3F1C, 0x3F3C, 0x3F62, 0x7F22,
+    0x7F39, 0x7FAF, 0x7FAE, 0x7FC6, 0x7F7F, 0x7F53, 0x7F32, 0x3F66,
+    0x3EF0, 0x3E65, 0x1EDA, 0x1ED3, 0x1E98, 0x1E74, 0x0F20, 0x0F1D,
+    0x0EEA, 0x0EA9, 0x0E70, 0x0718, 0x033E, 0x0179, 0x033F, 0x0719,
+    0x0E71, 0x0EAB, 0x0EE9, 0x0F1F, 0x0F22, 0x1E76, 0x1E9B, 0x1ED5,
+    0x1EDD, 0x3E68, 0x3EF3, 0x3F6C, 0x7F33, 0x7F4E, 0x7F7D, 0x7FCB,
+    0x7FBD, 0x7F21, 0x7EF6, 0x3F0C, 0x3EBE, 0x3EAE, 0x1EA3, 0x1E85,
+    0x0F0C, 0x0F06, 0x0EEE, 0x0EE6, 0x0ECA, 0x0EA2, 0x0E78, 0x0720,
+    0x06BA, 0x0326, 0x015D, 0x0327, 0x06BB, 0x0721, 0x0E79, 0x0EA3,
+    0x0ECB, 0x0EE7, 0x0EEF, 0x0F07, 0x0F0D, 0x1E86, 0x1EA4, 0x3EAF,
+    0x3EBF, 0x3F0A, 0x7EF8, 0x7F1E, 0x7FBF, 0xFF9F, 0x7FB3, 0x7F96,
+    0x7F0B, 0x7EF5, 0x3E85, 0x3E4E, 0x1EB3, 0x1E6E, 0x0F2D, 0x0EBD,
+    0x0E97, 0x0E93, 0x0E86, 0x071A, 0x06E4, 0x06BC, 0x0338, 0x015A,
+    0x0339, 0x06BD, 0x06E5, 0x071B, 0x0E87, 0x0E94, 0x0E98, 0x0EBE,
+    0x0F2E, 0x1E6F, 0x1EB4, 0x3E4F, 0x3E88, 0x7EF4, 0x7F07, 0x7F99,
+    0x7FB7, 0xFF9D, 0x7FA8, 0x7EEB, 0x3F5D, 0x3EB2, 0x3EAA, 0x3E5D,
+    0x3E49, 0x1EFE, 0x1E89, 0x0F16, 0x0F12, 0x0EE0, 0x0E7A, 0x0E6A,
+    0x070E, 0x06FA, 0x06B4, 0x0314, 0x0158, 0x0315, 0x06B5, 0x06FB,
+    0x070F, 0x0E6B, 0x0E7B, 0x0EE1, 0x0F13, 0x0F17, 0x1E8A, 0x1EFF,
+    0x3E4A, 0x3E5E, 0x3EAB, 0x3EB3, 0x3F59, 0x7EEC, 0x7FA2, 0x7F82,
+    0x7F5E, 0x7F28, 0x3EDA, 0x3EC9, 0x3E7A, 0x1ED6, 0x1ECE, 0x0EFC,
+    0x0EF0, 0x0E9A, 0x0E7E, 0x0E66, 0x0E5E, 0x0722, 0x06C0, 0x06A0,
+    0x02FA, 0x00A8, 0x02FB, 0x06A1, 0x06C1, 0x0723, 0x0E5F, 0x0E67,
+    0x0E7F, 0x0E9B, 0x0EF1, 0x0EFD, 0x1ECF, 0x1ED7, 0x3E79, 0x3ECD,
+    0x3EDB, 0x7F26, 0x7F5A, 0x7F83, 0x7F54, 0x7EE8, 0x3F54, 0x3F0E,
+    0x3EFB, 0x3E47, 0x1EC1, 0x1EB9, 0x0EF9, 0x0EF3, 0x0EC1, 0x0E8F,
+    0x0E74, 0x0714, 0x06F4, 0x06B0, 0x0336, 0x030A, 0x009F, 0x030B,
+    0x0337, 0x06B1, 0x06F5, 0x0715, 0x0E75, 0x0E91, 0x0EC3, 0x0EF6,
+    0x0EFB, 0x1EBB, 0x1EC3, 0x3E41, 0x3EF8, 0x3F10, 0x3F4D, 0x7EE9,
+    0x7F52, 0x7F9A, 0x3F3F, 0x3F1F, 0x3F03, 0x3EA0, 0x1F0F, 0x1E72,
+    0x1E62, 0x0EDA, 0x0ED2, 0x0EB2, 0x0E64, 0x0708, 0x06EA, 0x06DA,
+    0x0346, 0x032A, 0x0176, 0x004B, 0x0177, 0x032B, 0x0347, 0x06DB,
+    0x06EB, 0x0709, 0x0E65, 0x0EB3, 0x0ED3, 0x0EDB, 0x1E63, 0x1E6C,
+    0x1F0E, 0x3E9E, 0x3F01, 0x3F1D, 0x3F3D, 0x7F9B, 0x7EDE, 0x3F36,
+    0x3F2E, 0x3F07, 0x3E99, 0x1F0D, 0x1EA1, 0x1E8C, 0x0EB4, 0x0EAC,
+    0x0E5A, 0x0E50, 0x0702, 0x06D4, 0x06C6, 0x032E, 0x0310, 0x0162,
+    0x0046, 0x0163, 0x0311, 0x032F, 0x06C7, 0x06D5, 0x0703, 0x0E51,
+    0x0E5B, 0x0EAD, 0x0EB5, 0x1E8F, 0x1EA7, 0x1F0C, 0x3E9A, 0x3F02,
+    0x3F2D, 0x3F35, 0x7EE0, 0x7F66, 0x3F11, 0x3EE1, 0x3EBC, 0x3E56,
+    0x1EC4, 0x1E64, 0x0F04, 0x0EC7, 0x0E56, 0x0E4C, 0x06EC, 0x06DC,
+    0x069C, 0x0694, 0x0320, 0x016C, 0x0154, 0x0044, 0x0155, 0x016D,
+    0x0321, 0x0695, 0x069D, 0x06DD, 0x06ED, 0x0E4D, 0x0E57, 0x0EC4,
+    0x0F05, 0x1E65, 0x1EC5, 0x3E55, 0x3EB7, 0x3EE3, 0x3F13, 0x7F67,
+    0x7FA7, 0x3F49, 0x3F22, 0x3EE5, 0x1EF6, 0x1EE5, 0x1E9C, 0x1E78,
+    0x0EA4, 0x06F6, 0x06DE, 0x06CE, 0x06AA, 0x0340, 0x0318, 0x02FE,
+    0x015E, 0x009A, 0x001C, 0x009B, 0x015F, 0x02FF, 0x0319, 0x0341,
+    0x06AB, 0x06CF, 0x06DF, 0x06F7, 0x0EA5, 0x1E79, 0x1E9D, 0x1EE7,
+    0x1EF7, 0x3EE6, 0x3F23, 0x3F4B, 0x7FA3, 0x7F49, 0x3F14, 0x3E8D,
+    0x3E6B, 0x1F17, 0x1EF8, 0x0F24, 0x0ECC, 0x06FE, 0x06CA, 0x06A4,
+    0x0698, 0x030C, 0x0302, 0x0170, 0x0168, 0x00A0, 0x001E, 0x0004,
+    0x001F, 0x00A1, 0x0169, 0x0171, 0x0303, 0x030D, 0x0699, 0x06A5,
+    0x06CB, 0x06FF, 0x0ECD, 0x0F25, 0x1EF9, 0x1F18, 0x3E6C, 0x3E8E,
+    0x3F15, 0x7F46, 0x1EF0, 0x1EDE, 0x1EC8, 0x0F1A, 0x0F00, 0x0E9E,
+    0x0E54, 0x06C4, 0x031E, 0x031C, 0x0306, 0x02F8, 0x017A, 0x0166,
+    0x00A6, 0x00A4, 0x0048, 0x000C, 0x0000, 0x000D, 0x0049, 0x00A5,
+    0x00A7, 0x0167, 0x017B, 0x02F9, 0x0307, 0x031D, 0x031F, 0x06C5,
+    0x0E55, 0x0E9F, 0x0F01, 0x0F1B, 0x1EC9, 0x1EDF, 0x1EF1, 0x7F47,
+    0x3F12, 0x3E8F, 0x3E6D, 0x1F1B, 0x1EFA, 0x0F26, 0x0ECE, 0x0700,
+    0x06CC, 0x06A6, 0x069A, 0x030E, 0x0304, 0x0172, 0x016A, 0x00A2,
+    0x0020, 0x0005, 0x0021, 0x00A3, 0x016B, 0x0173, 0x0305, 0x030F,
+    0x069B, 0x06A7, 0x06CD, 0x0701, 0x0ECF, 0x0F27, 0x1EFB, 0x1F1D,
+    0x3E6E, 0x3E90, 0x3F17, 0x7F48, 0x7FA9, 0x3F50, 0x3F24, 0x3EE7,
+    0x1EFC, 0x1EE9, 0x1E9E, 0x1E7A, 0x0EA6, 0x06F8, 0x06E0, 0x06D0,
+    0x06AC, 0x0342, 0x031A, 0x0300, 0x0160, 0x009C, 0x001D, 0x009D,
+    0x0161, 0x0301, 0x031B, 0x0343, 0x06AD, 0x06D1, 0x06E1, 0x06F9,
+    0x0EA7, 0x1E7B, 0x1E9F, 0x1EEA, 0x1EFD, 0x3EE8, 0x3F21, 0x3F52,
+    0x7F9E, 0x7F68, 0x3F16, 0x3EE2, 0x3EBA, 0x3E57, 0x1EC6, 0x1E66,
+    0x0F08, 0x0EC5, 0x0E58, 0x0E4E, 0x06EE, 0x06E2, 0x069E, 0x0696,
+    0x0322, 0x016E, 0x0156, 0x0045, 0x0157, 0x016F, 0x0323, 0x0697,
+    0x069F, 0x06E3, 0x06EF, 0x0E4F, 0x0E59, 0x0EC6, 0x0F09, 0x1E67,
+    0x1EC7, 0x3E58, 0x3EB8, 0x3EE4, 0x3F18, 0x7F69, 0x7EE2, 0x3F38,
+    0x3F33, 0x3F04, 0x3E9C, 0x1F0A, 0x1EA5, 0x1E8D, 0x0EB6, 0x0EAE,
+    0x0E5C, 0x0E52, 0x0704, 0x06D6, 0x06C8, 0x0330, 0x0312, 0x0164,
+    0x0047, 0x0165, 0x0313, 0x0331, 0x06C9, 0x06D7, 0x0705, 0x0E53,
+    0x0E5D, 0x0EAF, 0x0EB7, 0x1E8E, 0x1EA6, 0x1F0B, 0x3E9B, 0x3F05,
+    0x3F31, 0x3F39, 0x7EE1, 0x7F9C, 0x3F40, 0x3F1E, 0x3F06, 0x3E9D,
+    0x1F10, 0x1E70, 0x1E60, 0x0ED8, 0x0ED0, 0x0EB0, 0x0E62, 0x0706,
+    0x06E8, 0x06D8, 0x0344, 0x0328, 0x0174, 0x004A, 0x0175, 0x0329,
+    0x0345, 0x06D9, 0x06E9, 0x0707, 0x0E63, 0x0EB1, 0x0ED1, 0x0ED9,
+    0x1E61, 0x1E71, 0x1F11, 0x3E9F, 0x3F08, 0x3F20, 0x3F3E, 0x7F9D,
+    0x7F4F, 0x7EE6, 0x3F53, 0x3F0D, 0x3EFA, 0x3E46, 0x1EC0, 0x1EB8,
+    0x0EF8, 0x0EF2, 0x0EC0, 0x0E8E, 0x0E72, 0x0712, 0x06F2, 0x06AE,
+    0x0334, 0x0308, 0x009E, 0x0309, 0x0335, 0x06AF, 0x06F3, 0x0713,
+    0x0E73, 0x0E90, 0x0EC2, 0x0EF4, 0x0EFA, 0x1EBA, 0x1EC2, 0x3E48,
+    0x3EFC, 0x3F0B, 0x3F51, 0x7EE7, 0x7F51, 0x7F84, 0x7F61, 0x7F27,
+    0x3EDC, 0x3EC5, 0x3E7C, 0x1ED8, 0x1ED0, 0x0EFE, 0x0EF5, 0x0E9C,
+    0x0E80, 0x0E68, 0x0E60, 0x0724, 0x06C2, 0x06A2, 0x02FC, 0x00A9,
+    0x02FD, 0x06A3, 0x06C3, 0x0725, 0x0E61, 0x0E69, 0x0E81, 0x0E9D,
+    0x0EF7, 0x0EFF, 0x1ED1, 0x1ED9, 0x3E7B, 0x3ECF, 0x3ED9, 0x7F29,
+    0x7F65, 0x7F85, 0x7FA0, 0x7EEA, 0x3F60, 0x3EB4, 0x3EAC, 0x3E5F,
+    0x3E4B, 0x1F00, 0x1E8B, 0x0F18, 0x0F14, 0x0EE2, 0x0E7C, 0x0E6C,
+    0x0710, 0x06FC, 0x06B6, 0x0316, 0x0159, 0x0317, 0x06B7, 0x06FD,
+    0x0711, 0x0E6D, 0x0E7D, 0x0EE3, 0x0F15, 0x0F19, 0x1E88, 0x1F01,
+    0x3E4C, 0x3E60, 0x3EA9, 0x3EB1, 0x3F5B, 0x7EED, 0x7FA5, 0xFF9E,
+    0x7FBE, 0x7F98, 0x7F0D, 0x7EF3, 0x3E8C, 0x3E50, 0x1EB7, 0x1E73,
+    0x0F2F, 0x0EBF, 0x0E99, 0x0E95, 0x0E88, 0x071C, 0x06E6, 0x06BE,
+    0x033A, 0x015B, 0x033B, 0x06BF, 0x06E7, 0x071D, 0x0E89, 0x0E92,
+    0x0E96, 0x0EBC, 0x0F2C, 0x1E6D, 0x1EB0, 0x3E4D, 0x3E8B, 0x7EF2,
+    0x7F08, 0x7F97, 0x7FB0, 0xFF9C, 0x7FB2, 0x7F23, 0x7EFA, 0x3F0F,
+    0x3EBD, 0x3EAD, 0x1EA0, 0x1E87, 0x0F0A, 0x0F02, 0x0EED, 0x0EE4,
+    0x0EC8, 0x0EA0, 0x0E76, 0x071E, 0x06B8, 0x0324, 0x015C, 0x0325,
+    0x06B9, 0x071F, 0x0E77, 0x0EA1, 0x0EC9, 0x0EE5, 0x0EEC, 0x0F03,
+    0x0F0B, 0x1E84, 0x1EA2, 0x3EB0, 0x3EC4, 0x3F09, 0x7EF7, 0x7F1D,
+    0x7FBC, 0x7FCD, 0x7F81, 0x7F50, 0x7F34, 0x3F65, 0x3EED, 0x3E67,
+    0x1EDB, 0x1ED2, 0x1E99, 0x1E77, 0x0F21, 0x0F1C, 0x0EE8, 0x0EA8,
+    0x0E6E, 0x0716, 0x033C, 0x0178, 0x033D, 0x0717, 0x0E6F, 0x0EAA,
+    0x0EEB, 0x0F1E, 0x0F23, 0x1E75, 0x1E9A, 0x1ED4, 0x1EDC, 0x3E66,
+    0x3EEE, 0x3F6B, 0x7F35, 0x7F55, 0x7F7A, 0x7FCA, 0x7FB6, 0x7FB8,
+    0x7F37, 0x7F1F, 0x3F61, 0x3F37, 0x3F1A, 0x3EB5, 0x3E62, 0x3E5A,
+    0x1F1E, 0x1F13, 0x1EF2, 0x1EE6, 0x1EBC, 0x1E91, 0x1E68, 0x070A,
+    0x032C, 0x070B, 0x1E69, 0x1E90, 0x1EBD, 0x1EE4, 0x1EF4, 0x1F12,
+    0x3E3E, 0x3E59, 0x3E61, 0x3EB6, 0x3F19, 0x3F3B, 0x3F64, 0x7F1C,
+    0x7F3D, 0x7FB4, 0x7FB9, 0xFFC5, 0xFFB6, 0xFFAD, 0x7FA4, 0x7F2C,
+    0x7F20, 0x7F09, 0x3EF5, 0x3E93, 0x3E82, 0x3E7E, 0x1F02, 0x1EE3,
+    0x1ECA, 0x1EAD, 0x1E7C, 0x0EB8, 0x0E8A, 0x0332, 0x0E8B, 0x0EB9,
+    0x1E7D, 0x1EAC, 0x1ECD, 0x1EE0, 0x1F03, 0x3E7D, 0x3E81, 0x3E91,
+    0x3EF7, 0x7F0C, 0x7F1A, 0x7F2E, 0x7FA6, 0xFFA4, 0xFFB7, 0xFFC4,
+    0xFFD4, 0xFFCC, 0x7FC5, 0x7F8A, 0x7F3C, 0x7EFF, 0x3F5E, 0x3F4E,
+    0x3F2F, 0x3EFD, 0x3EEF, 0x3EEB, 0x3E92, 0x3E71, 0x3E42, 0x1F09,
+    0x1E83, 0x0E82, 0x0348, 0x0E83, 0x1E80, 0x1F06, 0x3E44, 0x3E72,
+    0x3E94, 0x3EEC, 0x3EF2, 0x3EFE, 0x3F34, 0x3F4C, 0x3F5F, 0x7EFE,
+    0x7F3A, 0x7F86, 0x7FC4, 0xFFD2, 0xFFD5, 0xFFFA, 0xFFCD, 0xFFBA,
+    0x7FAB, 0x7F92, 0x7F87, 0x7F63, 0x7F4C, 0x7F03, 0x7EF0, 0x3F42,
+    0x3EA1, 0x3E76, 0x3E6A, 0x3E52, 0x1EEE, 0x1EAB, 0x0ED6, 0x06A9,
+    0x0ED7, 0x1EAA, 0x1EED, 0x3E53, 0x3E70, 0x3E77, 0x3EA2, 0x3F41,
+    0x7EEF, 0x7F00, 0x7F4D, 0x7F60, 0x7F88, 0x7F93, 0x7FAD, 0xFFB8,
+    0xFFCB, 0xFFFB, 0xFFFC, 0xFFE1, 0xFFBF, 0x7FBA, 0x7F73, 0x7F6C,
+    0x7F57, 0x7EFC, 0x7EDD, 0x3F6A, 0x3F2A, 0x3F25, 0x3EDD, 0x3ED3,
+    0x3EC2, 0x3EA7, 0x3E86, 0x0F11, 0x06B2, 0x0F0E, 0x3E8A, 0x3EA5,
+    0x3EC0, 0x3ED1, 0x3EDE, 0x3F2C, 0x3F29, 0x3F69, 0x7EDB, 0x7EFD,
+    0x7F59, 0x7F6D, 0x7F74, 0x7FC0, 0xFFC3, 0xFFE3, 0xFFFD, 0xFFF0,
+    0xFFE9, 0xFFE7, 0xFFC9, 0xFFBD, 0xFFAE, 0x7F71, 0x7F5F, 0x7F42,
+    0x7F41, 0x7F30, 0x7F17, 0x7EE4, 0x3ED7, 0x3EC7, 0x3ECB, 0x1EB1,
+    0x0EDC, 0x06D2, 0x0EDD, 0x1EB2, 0x3EC6, 0x3EC8, 0x3ED4, 0x7EE3,
+    0x7F18, 0x7F2B, 0x7F3F, 0x7F43, 0x7F64, 0x7F6E, 0xFFAB, 0xFFC2,
+    0xFFD0, 0xFFE6, 0xFFEF, 0xFFF3, 0xFFF7, 0xFFEB, 0xFFDE, 0xFFDA,
+    0xFFB3, 0xFFAC, 0xFFA3, 0x7FC9, 0x7F8F, 0x7F7C, 0x7F79, 0x7F0F,
+    0x7F10, 0x3F55, 0x3F45, 0x1F1A, 0x1E95, 0x0F2B, 0x06F1, 0x0F29,
+    0x1E97, 0x1F19, 0x3F47, 0x3F57, 0x7F14, 0x7F12, 0x7F78, 0x7F7E,
+    0x7F91, 0x7FC7, 0xFFA1, 0xFFA5, 0xFFB2, 0xFFDB, 0xFFDF, 0xFFED,
+    0xFFF4, 0x004C,
+};
+
+static const uint16_t clv_mvy_3_syms[] = {
+    0xEEEE, 0xEFEE, 0xF0EE, 0xF1EE, 0xF2EE, 0xF3EE, 0xF4EE, 0xF5EE,
+    0xF6EE, 0xF7EE, 0xF8EE, 0xF9EE, 0xFAEE, 0xFBEE, 0xFCEE, 0xFDEE,
+    0xFEEE, 0xFFEE, 0x00EE, 0x01EE, 0x02EE, 0x03EE, 0x04EE, 0x05EE,
+    0x06EE, 0x07EE, 0x08EE, 0x09EE, 0x0AEE, 0x0BEE, 0x0CEE, 0x0DEE,
+    0x0EEE, 0x0FEE, 0x10EE, 0x11EE, 0x12EE, 0xEEEF, 0xEFEF, 0xF0EF,
+    0xF1EF, 0xF2EF, 0xF3EF, 0xF4EF, 0xF5EF, 0xF6EF, 0xF7EF, 0xF8EF,
+    0xF9EF, 0xFAEF, 0xFBEF, 0xFCEF, 0xFDEF, 0xFEEF, 0xFFEF, 0x00EF,
+    0x01EF, 0x02EF, 0x03EF, 0x04EF, 0x05EF, 0x06EF, 0x07EF, 0x08EF,
+    0x09EF, 0x0AEF, 0x0BEF, 0x0CEF, 0x0DEF, 0x0EEF, 0x0FEF, 0x10EF,
+    0x11EF, 0x12EF, 0xEEF0, 0xEFF0, 0xF0F0, 0xF1F0, 0xF2F0, 0xF3F0,
+    0xF4F0, 0xF5F0, 0xF6F0, 0xF7F0, 0xF8F0, 0xF9F0, 0xFAF0, 0xFBF0,
+    0xFCF0, 0xFDF0, 0xFEF0, 0xFFF0, 0x00F0, 0x01F0, 0x02F0, 0x03F0,
+    0x04F0, 0x05F0, 0x06F0, 0x07F0, 0x08F0, 0x09F0, 0x0AF0, 0x0BF0,
+    0x0CF0, 0x0DF0, 0x0EF0, 0x0FF0, 0x10F0, 0x11F0, 0x12F0, 0xEEF1,
+    0xEFF1, 0xF0F1, 0xF1F1, 0xF2F1, 0xF3F1, 0xF4F1, 0xF5F1, 0xF6F1,
+    0xF7F1, 0xF8F1, 0xF9F1, 0xFAF1, 0xFBF1, 0xFCF1, 0xFDF1, 0xFEF1,
+    0xFFF1, 0x00F1, 0x01F1, 0x02F1, 0x03F1, 0x04F1, 0x05F1, 0x06F1,
+    0x07F1, 0x08F1, 0x09F1, 0x0AF1, 0x0BF1, 0x0CF1, 0x0DF1, 0x0EF1,
+    0x0FF1, 0x10F1, 0x11F1, 0x12F1, 0xEEF2, 0xEFF2, 0xF0F2, 0xF1F2,
+    0xF2F2, 0xF3F2, 0xF4F2, 0xF5F2, 0xF6F2, 0xF7F2, 0xF8F2, 0xF9F2,
+    0xFAF2, 0xFBF2, 0xFCF2, 0xFDF2, 0xFEF2, 0xFFF2, 0x00F2, 0x01F2,
+    0x02F2, 0x03F2, 0x04F2, 0x05F2, 0x06F2, 0x07F2, 0x08F2, 0x09F2,
+    0x0AF2, 0x0BF2, 0x0CF2, 0x0DF2, 0x0EF2, 0x0FF2, 0x10F2, 0x11F2,
+    0x12F2, 0xEEF3, 0xEFF3, 0xF0F3, 0xF1F3, 0xF2F3, 0xF3F3, 0xF4F3,
+    0xF5F3, 0xF6F3, 0xF7F3, 0xF8F3, 0xF9F3, 0xFAF3, 0xFBF3, 0xFCF3,
+    0xFDF3, 0xFEF3, 0xFFF3, 0x00F3, 0x01F3, 0x02F3, 0x03F3, 0x04F3,
+    0x05F3, 0x06F3, 0x07F3, 0x08F3, 0x09F3, 0x0AF3, 0x0BF3, 0x0CF3,
+    0x0DF3, 0x0EF3, 0x0FF3, 0x10F3, 0x11F3, 0x12F3, 0xEEF4, 0xEFF4,
+    0xF0F4, 0xF1F4, 0xF2F4, 0xF3F4, 0xF4F4, 0xF5F4, 0xF6F4, 0xF7F4,
+    0xF8F4, 0xF9F4, 0xFAF4, 0xFBF4, 0xFCF4, 0xFDF4, 0xFEF4, 0xFFF4,
+    0x00F4, 0x01F4, 0x02F4, 0x03F4, 0x04F4, 0x05F4, 0x06F4, 0x07F4,
+    0x08F4, 0x09F4, 0x0AF4, 0x0BF4, 0x0CF4, 0x0DF4, 0x0EF4, 0x0FF4,
+    0x10F4, 0x11F4, 0x12F4, 0xEEF5, 0xEFF5, 0xF0F5, 0xF1F5, 0xF2F5,
+    0xF3F5, 0xF4F5, 0xF5F5, 0xF6F5, 0xF7F5, 0xF8F5, 0xF9F5, 0xFAF5,
+    0xFBF5, 0xFCF5, 0xFDF5, 0xFEF5, 0xFFF5, 0x00F5, 0x01F5, 0x02F5,
+    0x03F5, 0x04F5, 0x05F5, 0x06F5, 0x07F5, 0x08F5, 0x09F5, 0x0AF5,
+    0x0BF5, 0x0CF5, 0x0DF5, 0x0EF5, 0x0FF5, 0x10F5, 0x11F5, 0x12F5,
+    0xEEF6, 0xEFF6, 0xF0F6, 0xF1F6, 0xF2F6, 0xF3F6, 0xF4F6, 0xF5F6,
+    0xF6F6, 0xF7F6, 0xF8F6, 0xF9F6, 0xFAF6, 0xFBF6, 0xFCF6, 0xFDF6,
+    0xFEF6, 0xFFF6, 0x00F6, 0x01F6, 0x02F6, 0x03F6, 0x04F6, 0x05F6,
+    0x06F6, 0x07F6, 0x08F6, 0x09F6, 0x0AF6, 0x0BF6, 0x0CF6, 0x0DF6,
+    0x0EF6, 0x0FF6, 0x10F6, 0x11F6, 0x12F6, 0xEEF7, 0xEFF7, 0xF0F7,
+    0xF1F7, 0xF2F7, 0xF3F7, 0xF4F7, 0xF5F7, 0xF6F7, 0xF7F7, 0xF8F7,
+    0xF9F7, 0xFAF7, 0xFBF7, 0xFCF7, 0xFDF7, 0xFEF7, 0xFFF7, 0x00F7,
+    0x01F7, 0x02F7, 0x03F7, 0x04F7, 0x05F7, 0x06F7, 0x07F7, 0x08F7,
+    0x09F7, 0x0AF7, 0x0BF7, 0x0CF7, 0x0DF7, 0x0EF7, 0x0FF7, 0x10F7,
+    0x11F7, 0x12F7, 0xEEF8, 0xEFF8, 0xF0F8, 0xF1F8, 0xF2F8, 0xF3F8,
+    0xF4F8, 0xF5F8, 0xF6F8, 0xF7F8, 0xF8F8, 0xF9F8, 0xFAF8, 0xFBF8,
+    0xFCF8, 0xFDF8, 0xFEF8, 0xFFF8, 0x00F8, 0x01F8, 0x02F8, 0x03F8,
+    0x04F8, 0x05F8, 0x06F8, 0x07F8, 0x08F8, 0x09F8, 0x0AF8, 0x0BF8,
+    0x0CF8, 0x0DF8, 0x0EF8, 0x0FF8, 0x10F8, 0x11F8, 0x12F8, 0xEEF9,
+    0xEFF9, 0xF0F9, 0xF1F9, 0xF2F9, 0xF3F9, 0xF4F9, 0xF5F9, 0xF6F9,
+    0xF7F9, 0xF8F9, 0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9,
+    0xFFF9, 0x00F9, 0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9,
+    0x07F9, 0x08F9, 0x09F9, 0x0AF9, 0x0BF9, 0x0CF9, 0x0DF9, 0x0EF9,
+    0x0FF9, 0x10F9, 0x11F9, 0x12F9, 0xEEFA, 0xEFFA, 0xF0FA, 0xF1FA,
+    0xF2FA, 0xF3FA, 0xF4FA, 0xF5FA, 0xF6FA, 0xF7FA, 0xF8FA, 0xF9FA,
+    0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA, 0xFEFA, 0xFFFA, 0x00FA, 0x01FA,
+    0x02FA, 0x03FA, 0x04FA, 0x05FA, 0x06FA, 0x07FA, 0x08FA, 0x09FA,
+    0x0AFA, 0x0BFA, 0x0CFA, 0x0DFA, 0x0EFA, 0x0FFA, 0x10FA, 0x11FA,
+    0x12FA, 0xEEFB, 0xEFFB, 0xF0FB, 0xF1FB, 0xF2FB, 0xF3FB, 0xF4FB,
+    0xF5FB, 0xF6FB, 0xF7FB, 0xF8FB, 0xF9FB, 0xFAFB, 0xFBFB, 0xFCFB,
+    0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB, 0x03FB, 0x04FB,
+    0x05FB, 0x06FB, 0x07FB, 0x08FB, 0x09FB, 0x0AFB, 0x0BFB, 0x0CFB,
+    0x0DFB, 0x0EFB, 0x0FFB, 0x10FB, 0x11FB, 0x12FB, 0xEEFC, 0xEFFC,
+    0xF0FC, 0xF1FC, 0xF2FC, 0xF3FC, 0xF4FC, 0xF5FC, 0xF6FC, 0xF7FC,
+    0xF8FC, 0xF9FC, 0xFAFC, 0xFBFC, 0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC,
+    0x00FC, 0x01FC, 0x02FC, 0x03FC, 0x04FC, 0x05FC, 0x06FC, 0x07FC,
+    0x08FC, 0x09FC, 0x0AFC, 0x0BFC, 0x0CFC, 0x0DFC, 0x0EFC, 0x0FFC,
+    0x10FC, 0x11FC, 0x12FC, 0xEEFD, 0xEFFD, 0xF0FD, 0xF1FD, 0xF2FD,
+    0xF3FD, 0xF4FD, 0xF5FD, 0xF6FD, 0xF7FD, 0xF8FD, 0xF9FD, 0xFAFD,
+    0xFBFD, 0xFCFD, 0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD,
+    0x03FD, 0x04FD, 0x05FD, 0x06FD, 0x07FD, 0x08FD, 0x09FD, 0x0AFD,
+    0x0BFD, 0x0CFD, 0x0DFD, 0x0EFD, 0x0FFD, 0x10FD, 0x11FD, 0x12FD,
+    0xEEFE, 0xEFFE, 0xF0FE, 0xF1FE, 0xF2FE, 0xF3FE, 0xF4FE, 0xF5FE,
+    0xF6FE, 0xF7FE, 0xF8FE, 0xF9FE, 0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE,
+    0xFEFE, 0xFFFE, 0x00FE, 0x01FE, 0x02FE, 0x03FE, 0x04FE, 0x05FE,
+    0x06FE, 0x07FE, 0x08FE, 0x09FE, 0x0AFE, 0x0BFE, 0x0CFE, 0x0DFE,
+    0x0EFE, 0x0FFE, 0x10FE, 0x11FE, 0x12FE, 0xEEFF, 0xEFFF, 0xF0FF,
+    0xF1FF, 0xF2FF, 0xF3FF, 0xF4FF, 0xF5FF, 0xF6FF, 0xF7FF, 0xF8FF,
+    0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF, 0xFFFF, 0x00FF,
+    0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF, 0x07FF, 0x08FF,
+    0x09FF, 0x0AFF, 0x0BFF, 0x0CFF, 0x0DFF, 0x0EFF, 0x0FFF, 0x10FF,
+    0x11FF, 0x12FF, 0xEE00, 0xEF00, 0xF000, 0xF100, 0xF200, 0xF300,
+    0xF400, 0xF500, 0xF600, 0xF700, 0xF800, 0xF900, 0xFA00, 0xFB00,
+    0xFC00, 0xFD00, 0xFE00, 0xFF00, 0x0000, 0x0100, 0x0200, 0x0300,
+    0x0400, 0x0500, 0x0600, 0x0700, 0x0800, 0x0900, 0x0A00, 0x0B00,
+    0x0C00, 0x0D00, 0x0E00, 0x0F00, 0x1000, 0x1100, 0x1200, 0xEE01,
+    0xEF01, 0xF001, 0xF101, 0xF201, 0xF301, 0xF401, 0xF501, 0xF601,
+    0xF701, 0xF801, 0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01,
+    0xFF01, 0x0001, 0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601,
+    0x0701, 0x0801, 0x0901, 0x0A01, 0x0B01, 0x0C01, 0x0D01, 0x0E01,
+    0x0F01, 0x1001, 0x1101, 0x1201, 0xEE02, 0xEF02, 0xF002, 0xF102,
+    0xF202, 0xF302, 0xF402, 0xF502, 0xF602, 0xF702, 0xF802, 0xF902,
+    0xFA02, 0xFB02, 0xFC02, 0xFD02, 0xFE02, 0xFF02, 0x0002, 0x0102,
+    0x0202, 0x0302, 0x0402, 0x0502, 0x0602, 0x0702, 0x0802, 0x0902,
+    0x0A02, 0x0B02, 0x0C02, 0x0D02, 0x0E02, 0x0F02, 0x1002, 0x1102,
+    0x1202, 0xEE03, 0xEF03, 0xF003, 0xF103, 0xF203, 0xF303, 0xF403,
+    0xF503, 0xF603, 0xF703, 0xF803, 0xF903, 0xFA03, 0xFB03, 0xFC03,
+    0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203, 0x0303, 0x0403,
+    0x0503, 0x0603, 0x0703, 0x0803, 0x0903, 0x0A03, 0x0B03, 0x0C03,
+    0x0D03, 0x0E03, 0x0F03, 0x1003, 0x1103, 0x1203, 0xEE04, 0xEF04,
+    0xF004, 0xF104, 0xF204, 0xF304, 0xF404, 0xF504, 0xF604, 0xF704,
+    0xF804, 0xF904, 0xFA04, 0xFB04, 0xFC04, 0xFD04, 0xFE04, 0xFF04,
+    0x0004, 0x0104, 0x0204, 0x0304, 0x0404, 0x0504, 0x0604, 0x0704,
+    0x0804, 0x0904, 0x0A04, 0x0B04, 0x0C04, 0x0D04, 0x0E04, 0x0F04,
+    0x1004, 0x1104, 0x1204, 0xEE05, 0xEF05, 0xF005, 0xF105, 0xF205,
+    0xF305, 0xF405, 0xF505, 0xF605, 0xF705, 0xF805, 0xF905, 0xFA05,
+    0xFB05, 0xFC05, 0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205,
+    0x0305, 0x0405, 0x0505, 0x0605, 0x0705, 0x0805, 0x0905, 0x0A05,
+    0x0B05, 0x0C05, 0x0D05, 0x0E05, 0x0F05, 0x1005, 0x1105, 0x1205,
+    0xEE06, 0xEF06, 0xF006, 0xF106, 0xF206, 0xF306, 0xF406, 0xF506,
+    0xF606, 0xF706, 0xF806, 0xF906, 0xFA06, 0xFB06, 0xFC06, 0xFD06,
+    0xFE06, 0xFF06, 0x0006, 0x0106, 0x0206, 0x0306, 0x0406, 0x0506,
+    0x0606, 0x0706, 0x0806, 0x0906, 0x0A06, 0x0B06, 0x0C06, 0x0D06,
+    0x0E06, 0x0F06, 0x1006, 0x1106, 0x1206, 0xEE07, 0xEF07, 0xF007,
+    0xF107, 0xF207, 0xF307, 0xF407, 0xF507, 0xF607, 0xF707, 0xF807,
+    0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07, 0xFF07, 0x0007,
+    0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607, 0x0707, 0x0807,
+    0x0907, 0x0A07, 0x0B07, 0x0C07, 0x0D07, 0x0E07, 0x0F07, 0x1007,
+    0x1107, 0x1207, 0xEE08, 0xEF08, 0xF008, 0xF108, 0xF208, 0xF308,
+    0xF408, 0xF508, 0xF608, 0xF708, 0xF808, 0xF908, 0xFA08, 0xFB08,
+    0xFC08, 0xFD08, 0xFE08, 0xFF08, 0x0008, 0x0108, 0x0208, 0x0308,
+    0x0408, 0x0508, 0x0608, 0x0708, 0x0808, 0x0908, 0x0A08, 0x0B08,
+    0x0C08, 0x0D08, 0x0E08, 0x0F08, 0x1008, 0x1108, 0x1208, 0xEE09,
+    0xEF09, 0xF009, 0xF109, 0xF209, 0xF309, 0xF409, 0xF509, 0xF609,
+    0xF709, 0xF809, 0xF909, 0xFA09, 0xFB09, 0xFC09, 0xFD09, 0xFE09,
+    0xFF09, 0x0009, 0x0109, 0x0209, 0x0309, 0x0409, 0x0509, 0x0609,
+    0x0709, 0x0809, 0x0909, 0x0A09, 0x0B09, 0x0C09, 0x0D09, 0x0E09,
+    0x0F09, 0x1009, 0x1109, 0x1209, 0xEE0A, 0xEF0A, 0xF00A, 0xF10A,
+    0xF20A, 0xF30A, 0xF40A, 0xF50A, 0xF60A, 0xF70A, 0xF80A, 0xF90A,
+    0xFA0A, 0xFB0A, 0xFC0A, 0xFD0A, 0xFE0A, 0xFF0A, 0x000A, 0x010A,
+    0x020A, 0x030A, 0x040A, 0x050A, 0x060A, 0x070A, 0x080A, 0x090A,
+    0x0A0A, 0x0B0A, 0x0C0A, 0x0D0A, 0x0E0A, 0x0F0A, 0x100A, 0x110A,
+    0x120A, 0xEE0B, 0xEF0B, 0xF00B, 0xF10B, 0xF20B, 0xF30B, 0xF40B,
+    0xF50B, 0xF60B, 0xF70B, 0xF80B, 0xF90B, 0xFA0B, 0xFB0B, 0xFC0B,
+    0xFD0B, 0xFE0B, 0xFF0B, 0x000B, 0x010B, 0x020B, 0x030B, 0x040B,
+    0x050B, 0x060B, 0x070B, 0x080B, 0x090B, 0x0A0B, 0x0B0B, 0x0C0B,
+    0x0D0B, 0x0E0B, 0x0F0B, 0x100B, 0x110B, 0x120B, 0xEE0C, 0xEF0C,
+    0xF00C, 0xF10C, 0xF20C, 0xF30C, 0xF40C, 0xF50C, 0xF60C, 0xF70C,
+    0xF80C, 0xF90C, 0xFA0C, 0xFB0C, 0xFC0C, 0xFD0C, 0xFE0C, 0xFF0C,
+    0x000C, 0x010C, 0x020C, 0x030C, 0x040C, 0x050C, 0x060C, 0x070C,
+    0x080C, 0x090C, 0x0A0C, 0x0B0C, 0x0C0C, 0x0D0C, 0x0E0C, 0x0F0C,
+    0x100C, 0x110C, 0x120C, 0xEE0D, 0xEF0D, 0xF00D, 0xF10D, 0xF20D,
+    0xF30D, 0xF40D, 0xF50D, 0xF60D, 0xF70D, 0xF80D, 0xF90D, 0xFA0D,
+    0xFB0D, 0xFC0D, 0xFD0D, 0xFE0D, 0xFF0D, 0x000D, 0x010D, 0x020D,
+    0x030D, 0x040D, 0x050D, 0x060D, 0x070D, 0x080D, 0x090D, 0x0A0D,
+    0x0B0D, 0x0C0D, 0x0D0D, 0x0E0D, 0x0F0D, 0x100D, 0x110D, 0x120D,
+    0xEE0E, 0xEF0E, 0xF00E, 0xF10E, 0xF20E, 0xF30E, 0xF40E, 0xF50E,
+    0xF60E, 0xF70E, 0xF80E, 0xF90E, 0xFA0E, 0xFB0E, 0xFC0E, 0xFD0E,
+    0xFE0E, 0xFF0E, 0x000E, 0x010E, 0x020E, 0x030E, 0x040E, 0x050E,
+    0x060E, 0x070E, 0x080E, 0x090E, 0x0A0E, 0x0B0E, 0x0C0E, 0x0D0E,
+    0x0E0E, 0x0F0E, 0x100E, 0x110E, 0x120E, 0xEE0F, 0xEF0F, 0xF00F,
+    0xF10F, 0xF20F, 0xF30F, 0xF40F, 0xF50F, 0xF60F, 0xF70F, 0xF80F,
+    0xF90F, 0xFA0F, 0xFB0F, 0xFC0F, 0xFD0F, 0xFE0F, 0xFF0F, 0x000F,
+    0x010F, 0x020F, 0x030F, 0x040F, 0x050F, 0x060F, 0x070F, 0x080F,
+    0x090F, 0x0A0F, 0x0B0F, 0x0C0F, 0x0D0F, 0x0E0F, 0x0F0F, 0x100F,
+    0x110F, 0x120F, 0xEE10, 0xEF10, 0xF010, 0xF110, 0xF210, 0xF310,
+    0xF410, 0xF510, 0xF610, 0xF710, 0xF810, 0xF910, 0xFA10, 0xFB10,
+    0xFC10, 0xFD10, 0xFE10, 0xFF10, 0x0010, 0x0110, 0x0210, 0x0310,
+    0x0410, 0x0510, 0x0610, 0x0710, 0x0810, 0x0910, 0x0A10, 0x0B10,
+    0x0C10, 0x0D10, 0x0E10, 0x0F10, 0x1010, 0x1110, 0x1210, 0xEE11,
+    0xEF11, 0xF011, 0xF111, 0xF211, 0xF311, 0xF411, 0xF511, 0xF611,
+    0xF711, 0xF811, 0xF911, 0xFA11, 0xFB11, 0xFC11, 0xFD11, 0xFE11,
+    0xFF11, 0x0011, 0x0111, 0x0211, 0x0311, 0x0411, 0x0511, 0x0611,
+    0x0711, 0x0811, 0x0911, 0x0A11, 0x0B11, 0x0C11, 0x0D11, 0x0E11,
+    0x0F11, 0x1011, 0x1111, 0x1211, 0xEE12, 0xEF12, 0xF012, 0xF112,
+    0xF212, 0xF312, 0xF412, 0xF512, 0xF612, 0xF712, 0xF812, 0xF912,
+    0xFA12, 0xFB12, 0xFC12, 0xFD12, 0xFE12, 0xFF12, 0x0012, 0x0112,
+    0x0212, 0x0312, 0x0412, 0x0512, 0x0612, 0x0712, 0x0812, 0x0912,
+    0x0A12, 0x0B12, 0x0C12, 0x0D12, 0x0E12, 0x0F12, 0x1012, 0x1112,
+    0x1212, 0x1313,
+};
+
+static const uint8_t clv_mvu_1_bits[] = {
+    16, 14, 14, 14, 13, 12, 12, 10, 12, 12, 13, 14, 14, 14, 16, 15,
+    13, 13, 12, 12, 11, 11,  7, 11, 11, 12, 12, 13, 13, 15, 16, 16,
+    16, 13, 12, 10, 10,  6, 10, 10, 12, 13, 16, 16, 16, 14, 14, 11,
+    11, 11, 11,  9,  7,  9, 11, 11, 11, 11, 14, 14, 15, 13, 13, 12,
+     9,  8,  8,  4,  8,  8,  9, 12, 13, 13, 15, 14, 14, 11, 11, 10,
+     9,  8,  4,  8,  9, 10, 11, 12, 14, 14, 12, 12, 11, 10, 10,  8,
+     6,  3,  6,  8, 10, 10, 11, 12, 12, 11, 10,  9,  6,  6,  6,  5,
+     4,  5,  6,  6,  6,  9, 10, 11, 12, 12, 11, 10, 10,  8,  6,  3,
+     6,  7, 10, 10, 11, 12, 12, 14, 14, 11, 11, 10,  9,  8,  4,  8,
+     9, 10, 11, 11, 14, 14, 15, 13, 13, 12,  9,  8,  8,  4,  8,  8,
+     9, 12, 13, 13, 15, 14, 14, 11, 11, 11, 11,  9,  7,  9, 11, 11,
+    11, 11, 14, 14, 16, 16, 16, 13, 12, 10, 10,  6, 10, 10, 12, 13,
+    16, 16, 16, 15, 13, 13, 12, 12, 11, 11,  7, 11, 11, 12, 12, 13,
+    13, 15, 16, 14, 14, 14, 13, 12, 12, 10, 12, 12, 13, 14, 14, 14,
+    16,  7,
+};
+
+static const uint16_t clv_mvu_1_codes[] = {
+    0xFFFC, 0x3FED, 0x3FE5, 0x3FE3, 0x1FD9, 0x0FD9, 0x0FD6, 0x03CE,
+    0x0FD3, 0x0FD8, 0x1FD6, 0x3FE0, 0x3FE8, 0x3FEC, 0xFFFE, 0x7FF3,
+    0x1FE7, 0x1FDA, 0x0FCF, 0x0FCC, 0x07DD, 0x07CC, 0x006B, 0x07CD,
+    0x07DE, 0x0FCE, 0x0FD2, 0x1FDD, 0x1FEC, 0x7FF0, 0xFFF6, 0xFFFA,
+    0xFFF2, 0x1FDE, 0x0FDB, 0x03D8, 0x03CA, 0x002E, 0x03CB, 0x03D9,
+    0x0FDC, 0x1FDF, 0xFFF3, 0xFFF9, 0xFFF5, 0x3FF3, 0x3FDD, 0x07DA,
+    0x07D2, 0x07CA, 0x07C2, 0x01DE, 0x0069, 0x01DF, 0x07C3, 0x07CB,
+    0x07D3, 0x07DB, 0x3FDF, 0x3FF6, 0x7FF7, 0x1FED, 0x1FE5, 0x0FDF,
+    0x01D8, 0x00E3, 0x00DF, 0x0007, 0x00E0, 0x00E4, 0x01D9, 0x0FE0,
+    0x1FE3, 0x1FE9, 0x7FF4, 0x3FF5, 0x3FE7, 0x07E4, 0x07C6, 0x03D2,
+    0x01E2, 0x00E9, 0x0006, 0x00EA, 0x01E3, 0x03D3, 0x07C7, 0x0FCA,
+    0x3FE9, 0x3FF0, 0x0FE8, 0x0FE5, 0x07D5, 0x03DD, 0x03D5, 0x00DC,
+    0x002B, 0x0001, 0x002C, 0x00DD, 0x03D6, 0x03DE, 0x07D6, 0x0FE4,
+    0x0FE9, 0x07E0, 0x03C8, 0x01D6, 0x0032, 0x0030, 0x0028, 0x0012,
+    0x0004, 0x0013, 0x0029, 0x0031, 0x0033, 0x01D7, 0x03C9, 0x07E1,
+    0x0FEA, 0x0FE6, 0x07D7, 0x03DF, 0x03D7, 0x00DE, 0x002D, 0x0000,
+    0x002A, 0x006D, 0x03D4, 0x03DC, 0x07D4, 0x0FE3, 0x0FE7, 0x3FF1,
+    0x3FE4, 0x07E2, 0x07C4, 0x03D0, 0x01E0, 0x00E7, 0x0005, 0x00E8,
+    0x01E1, 0x03D1, 0x07C5, 0x07E3, 0x3FEA, 0x3FF7, 0x7FF6, 0x1FE6,
+    0x1FE4, 0x0FE1, 0x01DA, 0x00E5, 0x00E1, 0x0008, 0x00E2, 0x00E6,
+    0x01DB, 0x0FE2, 0x1FE2, 0x1FEB, 0x7FF5, 0x3FF4, 0x3FDC, 0x07D9,
+    0x07D0, 0x07C8, 0x07C0, 0x01DC, 0x0068, 0x01DD, 0x07C1, 0x07C9,
+    0x07D1, 0x07D8, 0x3FDE, 0x3FF2, 0xFFFB, 0xFFF4, 0xFFF0, 0x1FE1,
+    0x0FDD, 0x03DA, 0x03CC, 0x002F, 0x03CD, 0x03DB, 0x0FDE, 0x1FE0,
+    0xFFF1, 0xFFF7, 0xFFF8, 0x7FF2, 0x1FEA, 0x1FDC, 0x0FD1, 0x0FCD,
+    0x07DC, 0x07CF, 0x006C, 0x07CE, 0x07DF, 0x0FCB, 0x0FD0, 0x1FDB,
+    0x1FE8, 0x7FF1, 0xFFFD, 0x3FEE, 0x3FEB, 0x3FE1, 0x1FD7, 0x0FD7,
+    0x0FD5, 0x03CF, 0x0FD4, 0x0FDA, 0x1FD8, 0x3FE2, 0x3FE6, 0x3FEF,
+    0xFFFF, 0x006A,
+};
+
+static const uint16_t clv_mvu_1_syms[] = {
+    0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9, 0xFFF9, 0x00F9,
+    0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9, 0x07F9, 0xF9FA,
+    0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA, 0xFEFA, 0xFFFA, 0x00FA, 0x01FA,
+    0x02FA, 0x03FA, 0x04FA, 0x05FA, 0x06FA, 0x07FA, 0xF9FB, 0xFAFB,
+    0xFBFB, 0xFCFB, 0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB,
+    0x03FB, 0x04FB, 0x05FB, 0x06FB, 0x07FB, 0xF9FC, 0xFAFC, 0xFBFC,
+    0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC, 0x00FC, 0x01FC, 0x02FC, 0x03FC,
+    0x04FC, 0x05FC, 0x06FC, 0x07FC, 0xF9FD, 0xFAFD, 0xFBFD, 0xFCFD,
+    0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD, 0x03FD, 0x04FD,
+    0x05FD, 0x06FD, 0x07FD, 0xF9FE, 0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE,
+    0xFEFE, 0xFFFE, 0x00FE, 0x01FE, 0x02FE, 0x03FE, 0x04FE, 0x05FE,
+    0x06FE, 0x07FE, 0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF,
+    0xFFFF, 0x00FF, 0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF,
+    0x07FF, 0xF900, 0xFA00, 0xFB00, 0xFC00, 0xFD00, 0xFE00, 0xFF00,
+    0x0000, 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700,
+    0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01, 0xFF01, 0x0001,
+    0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601, 0x0701, 0xF902,
+    0xFA02, 0xFB02, 0xFC02, 0xFD02, 0xFE02, 0xFF02, 0x0002, 0x0102,
+    0x0202, 0x0302, 0x0402, 0x0502, 0x0602, 0x0702, 0xF903, 0xFA03,
+    0xFB03, 0xFC03, 0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203,
+    0x0303, 0x0403, 0x0503, 0x0603, 0x0703, 0xF904, 0xFA04, 0xFB04,
+    0xFC04, 0xFD04, 0xFE04, 0xFF04, 0x0004, 0x0104, 0x0204, 0x0304,
+    0x0404, 0x0504, 0x0604, 0x0704, 0xF905, 0xFA05, 0xFB05, 0xFC05,
+    0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205, 0x0305, 0x0405,
+    0x0505, 0x0605, 0x0705, 0xF906, 0xFA06, 0xFB06, 0xFC06, 0xFD06,
+    0xFE06, 0xFF06, 0x0006, 0x0106, 0x0206, 0x0306, 0x0406, 0x0506,
+    0x0606, 0x0706, 0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07,
+    0xFF07, 0x0007, 0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607,
+    0x0707, 0x0808,
+};
+
+static const uint8_t clv_mvu_2_bits[] = {
+    16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 15, 15,
+    15, 15, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13,
+    13, 14, 14, 14, 14, 14, 15, 15, 15, 16, 16, 15, 15, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 16, 16,
+    15, 15, 14, 13, 13, 13, 13, 13, 12, 12, 12, 13, 13, 13, 13, 13,
+    14, 15, 15, 16, 14, 14, 14, 14, 13, 12, 12, 12, 12, 11, 10, 11,
+    12, 12, 12, 12, 13, 14, 14, 14, 14, 14, 14, 14, 13, 12, 12, 12,
+    12, 11, 10,  9, 10, 11, 12, 12, 12, 12, 13, 14, 14, 14, 15, 14,
+    13, 13, 12, 12, 12, 12, 11, 10,  8, 10, 11, 12, 12, 12, 12, 13,
+    13, 14, 15, 14, 14, 14, 13, 12, 12, 11, 11, 10,  9,  7,  9, 10,
+    11, 11, 12, 12, 13, 14, 14, 14, 14, 14, 13, 12, 12, 11, 11, 10,
+     9,  8,  7,  8,  9, 10, 11, 11, 12, 12, 13, 14, 14, 14, 13, 13,
+    12, 11,  9,  9,  8,  7,  6,  5,  6,  7,  8,  9,  9, 11, 12, 13,
+    13, 14, 14, 13, 13, 13, 11, 11, 10,  8,  7,  4,  1,  4,  7,  8,
+    10, 11, 11, 13, 13, 13, 14, 14, 13, 13, 12, 11,  9,  9,  8,  7,
+     6,  5,  6,  7,  8,  9,  9, 11, 12, 13, 13, 14, 14, 14, 13, 12,
+    11, 11, 11,  9,  9,  8,  7,  8,  9, 10, 11, 11, 12, 12, 13, 14,
+    14, 14, 14, 14, 13, 12, 12, 11, 11, 10,  9,  7,  9, 10, 11, 11,
+    12, 12, 13, 14, 14, 14, 15, 14, 13, 13, 12, 12, 12, 12, 11, 10,
+     9, 10, 11, 12, 12, 12, 12, 13, 13, 14, 15, 14, 14, 14, 13, 12,
+    12, 12, 12, 11, 10,  8, 10, 11, 12, 12, 12, 12, 13, 14, 14, 14,
+    14, 14, 14, 14, 13, 12, 12, 12, 12, 11, 10, 11, 12, 12, 12, 12,
+    13, 14, 14, 14, 14, 16, 15, 15, 14, 13, 13, 13, 13, 13, 12, 12,
+    12, 13, 13, 13, 13, 13, 14, 15, 15, 16, 16, 15, 15, 14, 14, 14,
+    14, 14, 14, 13, 12, 13, 14, 14, 14, 14, 14, 14, 15, 15, 16, 16,
+    15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 14, 14, 14, 14, 14,
+    15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 16, 16, 16,  6,
+};
+
+static const uint16_t clv_mvu_2_codes[] = {
+    0xFFF2, 0xFFEF, 0xFFEA, 0x7FEC, 0x7FD5, 0x7FC5, 0x7FCF, 0x3FD3,
+    0x3FC9, 0x3FB4, 0x3F72, 0x3FAE, 0x3FDC, 0x3FE1, 0x7FC4, 0x7FC8,
+    0x7FD7, 0x7FF0, 0xFFE9, 0xFFEC, 0xFFF8, 0xFFF4, 0x7FF3, 0x7FDE,
+    0x7FD3, 0x3FC0, 0x3F99, 0x3FA8, 0x3FAC, 0x3F8E, 0x1FA8, 0x1F79,
+    0x1FA2, 0x3F89, 0x3F93, 0x3F95, 0x3F9C, 0x3FB8, 0x7FD2, 0x7FE6,
+    0x7FDA, 0xFFF7, 0xFFFE, 0x7FED, 0x7FE2, 0x3FD5, 0x3FD7, 0x3FB3,
+    0x3FA2, 0x3F80, 0x3F7A, 0x1F88, 0x1F70, 0x1F8A, 0x3F83, 0x3F84,
+    0x3FAA, 0x3FC4, 0x3FDA, 0x3FDF, 0x7FDB, 0x7FE3, 0xFFF1, 0xFFFB,
+    0x7FE1, 0x7FC7, 0x3FB0, 0x1FAA, 0x1FAB, 0x1FA3, 0x1F8E, 0x1F81,
+    0x0FA7, 0x0F7F, 0x0FA8, 0x1F82, 0x1F8F, 0x1FA4, 0x1FAC, 0x1FAD,
+    0x3FB5, 0x7FC6, 0x7FDD, 0xFFF3, 0x3FDD, 0x3FBE, 0x3FB9, 0x3F7C,
+    0x1F77, 0x0FB5, 0x0F9D, 0x0F99, 0x0F90, 0x0794, 0x03BE, 0x0795,
+    0x0F92, 0x0F9A, 0x0F9E, 0x0FB6, 0x1F78, 0x3F78, 0x3FB2, 0x3FBF,
+    0x3FD2, 0x3FA7, 0x3F8C, 0x3F75, 0x1F9E, 0x0F93, 0x0F94, 0x0F7B,
+    0x0F73, 0x07B4, 0x03BB, 0x01D9, 0x03BC, 0x07B5, 0x0F74, 0x0F7C,
+    0x0F95, 0x0F96, 0x1FA0, 0x3F76, 0x3F8F, 0x3FA5, 0x7FE5, 0x3FC7,
+    0x1F95, 0x1F71, 0x0FAB, 0x0FAC, 0x0F9F, 0x0F7D, 0x0796, 0x03BF,
+    0x00DE, 0x03C0, 0x0797, 0x0F7E, 0x0FA0, 0x0FAD, 0x0FAE, 0x1F72,
+    0x1F97, 0x3FD1, 0x7FD8, 0x3FA9, 0x3FA0, 0x3F6D, 0x1F99, 0x0F87,
+    0x0F77, 0x07A8, 0x079C, 0x03C5, 0x01CA, 0x0067, 0x01CB, 0x03C6,
+    0x079D, 0x07A9, 0x0F78, 0x0F8A, 0x1F9F, 0x3F6E, 0x3F98, 0x3F9A,
+    0x3FCA, 0x3F70, 0x1FAF, 0x0F8D, 0x0F6F, 0x07AC, 0x07A0, 0x03B7,
+    0x01CE, 0x00DA, 0x0063, 0x00DB, 0x01CF, 0x03B8, 0x07A1, 0x07AD,
+    0x0F70, 0x0F91, 0x1FB2, 0x3F73, 0x3FD6, 0x3F7D, 0x1F91, 0x1F85,
+    0x0FA3, 0x07A2, 0x01D4, 0x01D0, 0x00E0, 0x0068, 0x002C, 0x0014,
+    0x002D, 0x0069, 0x00E1, 0x01D1, 0x01D5, 0x07A3, 0x0FA4, 0x1F89,
+    0x1F92, 0x3F81, 0x3FA6, 0x1F7B, 0x1F7C, 0x1F7D, 0x07B0, 0x07AE,
+    0x03C7, 0x00DC, 0x0064, 0x0008, 0x0000, 0x0009, 0x0065, 0x00DD,
+    0x03C8, 0x07AF, 0x07B1, 0x1F7E, 0x1F7F, 0x1F80, 0x3F9E, 0x3F85,
+    0x1F93, 0x1F8B, 0x0FA5, 0x07A4, 0x01D6, 0x01D2, 0x00E2, 0x006A,
+    0x002E, 0x0015, 0x002F, 0x006B, 0x00E3, 0x01D3, 0x01D7, 0x07A5,
+    0x0FA6, 0x1F8C, 0x1F94, 0x3F87, 0x3FCE, 0x3F77, 0x1FB4, 0x0F83,
+    0x07B6, 0x07AA, 0x079E, 0x01DA, 0x01CC, 0x00D8, 0x0062, 0x00D9,
+    0x01CD, 0x03B6, 0x079F, 0x07AB, 0x0F6E, 0x0F84, 0x1FA9, 0x3F6A,
+    0x3FCD, 0x3F90, 0x3F92, 0x3F6B, 0x1F96, 0x0F85, 0x0F75, 0x07A6,
+    0x079A, 0x03C3, 0x01C8, 0x0066, 0x01C9, 0x03C4, 0x079B, 0x07A7,
+    0x0F76, 0x0F86, 0x1F9D, 0x3F6C, 0x3F96, 0x3F97, 0x7FE9, 0x3FD8,
+    0x1F98, 0x1F73, 0x0FAF, 0x0FB0, 0x0FA1, 0x0F80, 0x0798, 0x03C1,
+    0x01D8, 0x03C2, 0x0799, 0x0F81, 0x0FA2, 0x0FB1, 0x0FB2, 0x1F74,
+    0x1F9A, 0x3FE0, 0x7FEE, 0x3F94, 0x3F8B, 0x3F6F, 0x1F9B, 0x0F88,
+    0x0F89, 0x0F79, 0x0F71, 0x07B2, 0x03B9, 0x00DF, 0x03BA, 0x07B3,
+    0x0F72, 0x0F7A, 0x0F8B, 0x0F8C, 0x1F9C, 0x3F71, 0x3F8A, 0x3F9B,
+    0x3FC8, 0x3FBD, 0x3FC5, 0x3F79, 0x1F75, 0x0FB3, 0x0F9C, 0x0F97,
+    0x0F8E, 0x0792, 0x03BD, 0x0793, 0x0F8F, 0x0F98, 0x0F9B, 0x0FB4,
+    0x1F76, 0x3F82, 0x3FC3, 0x3FBA, 0x3FC6, 0xFFFD, 0x7FDF, 0x7FCC,
+    0x3FBB, 0x1FB0, 0x1FAE, 0x1FA1, 0x1F90, 0x1F83, 0x0FAA, 0x0F82,
+    0x0FA9, 0x1F84, 0x1F8D, 0x1FA5, 0x1FB1, 0x1FB3, 0x3FC2, 0x7FCA,
+    0x7FE8, 0xFFF5, 0xFFFF, 0x7FDC, 0x7FD9, 0x3FCC, 0x3FD9, 0x3FB7,
+    0x3F9F, 0x3F7E, 0x3F86, 0x1F86, 0x0FB7, 0x1F87, 0x3F7B, 0x3F7F,
+    0x3FA1, 0x3FBC, 0x3FCB, 0x3FD4, 0x7FF1, 0x7FF2, 0xFFFA, 0xFFFC,
+    0x7FE4, 0x7FE7, 0x7FD4, 0x3FAF, 0x3FA3, 0x3F91, 0x3F9D, 0x3F88,
+    0x1FA6, 0x1F7A, 0x1FA7, 0x3F8D, 0x3FAB, 0x3FAD, 0x3FA4, 0x3FB6,
+    0x7FD0, 0x7FE0, 0x7FEB, 0xFFF0, 0xFFF9, 0xFFEB, 0xFFED, 0x7FEA,
+    0x7FD6, 0x7FC9, 0x7FCB, 0x3FD0, 0x3FDE, 0x3FB1, 0x3F74, 0x3FC1,
+    0x3FCF, 0x3FDB, 0x7FCD, 0x7FCE, 0x7FD1, 0x7FEF, 0xFFE8, 0xFFEE,
+    0xFFF6, 0x0030,
+};
+
+static const uint16_t clv_mvu_2_syms[] = {
+    0xF6F6, 0xF7F6, 0xF8F6, 0xF9F6, 0xFAF6, 0xFBF6, 0xFCF6, 0xFDF6,
+    0xFEF6, 0xFFF6, 0x00F6, 0x01F6, 0x02F6, 0x03F6, 0x04F6, 0x05F6,
+    0x06F6, 0x07F6, 0x08F6, 0x09F6, 0x0AF6, 0xF6F7, 0xF7F7, 0xF8F7,
+    0xF9F7, 0xFAF7, 0xFBF7, 0xFCF7, 0xFDF7, 0xFEF7, 0xFFF7, 0x00F7,
+    0x01F7, 0x02F7, 0x03F7, 0x04F7, 0x05F7, 0x06F7, 0x07F7, 0x08F7,
+    0x09F7, 0x0AF7, 0xF6F8, 0xF7F8, 0xF8F8, 0xF9F8, 0xFAF8, 0xFBF8,
+    0xFCF8, 0xFDF8, 0xFEF8, 0xFFF8, 0x00F8, 0x01F8, 0x02F8, 0x03F8,
+    0x04F8, 0x05F8, 0x06F8, 0x07F8, 0x08F8, 0x09F8, 0x0AF8, 0xF6F9,
+    0xF7F9, 0xF8F9, 0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9,
+    0xFFF9, 0x00F9, 0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9,
+    0x07F9, 0x08F9, 0x09F9, 0x0AF9, 0xF6FA, 0xF7FA, 0xF8FA, 0xF9FA,
+    0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA, 0xFEFA, 0xFFFA, 0x00FA, 0x01FA,
+    0x02FA, 0x03FA, 0x04FA, 0x05FA, 0x06FA, 0x07FA, 0x08FA, 0x09FA,
+    0x0AFA, 0xF6FB, 0xF7FB, 0xF8FB, 0xF9FB, 0xFAFB, 0xFBFB, 0xFCFB,
+    0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB, 0x03FB, 0x04FB,
+    0x05FB, 0x06FB, 0x07FB, 0x08FB, 0x09FB, 0x0AFB, 0xF6FC, 0xF7FC,
+    0xF8FC, 0xF9FC, 0xFAFC, 0xFBFC, 0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC,
+    0x00FC, 0x01FC, 0x02FC, 0x03FC, 0x04FC, 0x05FC, 0x06FC, 0x07FC,
+    0x08FC, 0x09FC, 0x0AFC, 0xF6FD, 0xF7FD, 0xF8FD, 0xF9FD, 0xFAFD,
+    0xFBFD, 0xFCFD, 0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD,
+    0x03FD, 0x04FD, 0x05FD, 0x06FD, 0x07FD, 0x08FD, 0x09FD, 0x0AFD,
+    0xF6FE, 0xF7FE, 0xF8FE, 0xF9FE, 0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE,
+    0xFEFE, 0xFFFE, 0x00FE, 0x01FE, 0x02FE, 0x03FE, 0x04FE, 0x05FE,
+    0x06FE, 0x07FE, 0x08FE, 0x09FE, 0x0AFE, 0xF6FF, 0xF7FF, 0xF8FF,
+    0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF, 0xFFFF, 0x00FF,
+    0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF, 0x07FF, 0x08FF,
+    0x09FF, 0x0AFF, 0xF600, 0xF700, 0xF800, 0xF900, 0xFA00, 0xFB00,
+    0xFC00, 0xFD00, 0xFE00, 0xFF00, 0x0000, 0x0100, 0x0200, 0x0300,
+    0x0400, 0x0500, 0x0600, 0x0700, 0x0800, 0x0900, 0x0A00, 0xF601,
+    0xF701, 0xF801, 0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01,
+    0xFF01, 0x0001, 0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601,
+    0x0701, 0x0801, 0x0901, 0x0A01, 0xF602, 0xF702, 0xF802, 0xF902,
+    0xFA02, 0xFB02, 0xFC02, 0xFD02, 0xFE02, 0xFF02, 0x0002, 0x0102,
+    0x0202, 0x0302, 0x0402, 0x0502, 0x0602, 0x0702, 0x0802, 0x0902,
+    0x0A02, 0xF603, 0xF703, 0xF803, 0xF903, 0xFA03, 0xFB03, 0xFC03,
+    0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203, 0x0303, 0x0403,
+    0x0503, 0x0603, 0x0703, 0x0803, 0x0903, 0x0A03, 0xF604, 0xF704,
+    0xF804, 0xF904, 0xFA04, 0xFB04, 0xFC04, 0xFD04, 0xFE04, 0xFF04,
+    0x0004, 0x0104, 0x0204, 0x0304, 0x0404, 0x0504, 0x0604, 0x0704,
+    0x0804, 0x0904, 0x0A04, 0xF605, 0xF705, 0xF805, 0xF905, 0xFA05,
+    0xFB05, 0xFC05, 0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205,
+    0x0305, 0x0405, 0x0505, 0x0605, 0x0705, 0x0805, 0x0905, 0x0A05,
+    0xF606, 0xF706, 0xF806, 0xF906, 0xFA06, 0xFB06, 0xFC06, 0xFD06,
+    0xFE06, 0xFF06, 0x0006, 0x0106, 0x0206, 0x0306, 0x0406, 0x0506,
+    0x0606, 0x0706, 0x0806, 0x0906, 0x0A06, 0xF607, 0xF707, 0xF807,
+    0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07, 0xFF07, 0x0007,
+    0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607, 0x0707, 0x0807,
+    0x0907, 0x0A07, 0xF608, 0xF708, 0xF808, 0xF908, 0xFA08, 0xFB08,
+    0xFC08, 0xFD08, 0xFE08, 0xFF08, 0x0008, 0x0108, 0x0208, 0x0308,
+    0x0408, 0x0508, 0x0608, 0x0708, 0x0808, 0x0908, 0x0A08, 0xF609,
+    0xF709, 0xF809, 0xF909, 0xFA09, 0xFB09, 0xFC09, 0xFD09, 0xFE09,
+    0xFF09, 0x0009, 0x0109, 0x0209, 0x0309, 0x0409, 0x0509, 0x0609,
+    0x0709, 0x0809, 0x0909, 0x0A09, 0xF60A, 0xF70A, 0xF80A, 0xF90A,
+    0xFA0A, 0xFB0A, 0xFC0A, 0xFD0A, 0xFE0A, 0xFF0A, 0x000A, 0x010A,
+    0x020A, 0x030A, 0x040A, 0x050A, 0x060A, 0x070A, 0x080A, 0x090A,
+    0x0A0A, 0x0B0B,
+};
+
+static const uint8_t clv_mvv_1_bits[] = {
+    16, 15, 13, 13, 13, 12, 10, 10, 10, 12, 13, 13, 13, 15, 16, 16,
+    15, 14, 13, 12, 11, 10,  9, 10, 11, 12, 13, 14, 15, 16, 15, 14,
+    13, 13, 11, 10, 10,  5, 10, 10, 11, 13, 13, 14, 15, 12, 12, 12,
+    11, 10, 10,  9,  5,  9, 10, 10, 11, 12, 12, 12, 14, 12, 12, 12,
+    11,  9,  8,  5,  8,  9, 11, 12, 12, 12, 14, 14, 11, 11,  9,  9,
+     9,  7,  5,  7,  9,  9,  9, 11, 11, 14, 13, 12, 11, 10, 10,  8,
+     6,  3,  6,  8, 10, 10, 11, 12, 13, 11, 10,  9,  7,  6,  6,  4,
+     4,  4,  6,  7,  7, 10, 10, 11, 13, 12, 11, 10, 10,  8,  6,  3,
+     6,  8, 10, 10, 11, 12, 13, 14, 11, 11,  9,  9,  9,  7,  5,  7,
+     9,  9,  9, 11, 11, 14, 14, 12, 12, 12, 11,  9,  8,  5,  8,  9,
+    11, 12, 12, 12, 14, 12, 12, 12, 11, 10, 10,  9,  5,  9, 10, 10,
+    11, 12, 12, 12, 15, 14, 13, 13, 11, 10, 10,  5, 10, 10, 11, 13,
+    13, 14, 15, 16, 15, 14, 13, 12, 11, 10,  9, 10, 11, 12, 13, 14,
+    15, 16, 16, 15, 13, 13, 13, 12, 10, 10, 10, 12, 13, 13, 13, 15,
+    16,  7,
+};
+
+static const uint16_t clv_mvv_1_codes[] = {
+    0xFFFD, 0x7FF8, 0x1FF2, 0x1FDC, 0x1FDB, 0x0FD2, 0x03D6, 0x03BF,
+    0x03D3, 0x0FD0, 0x1FDA, 0x1FDE, 0x1FF0, 0x7FF9, 0xFFFE, 0xFFFA,
+    0x7FFB, 0x3FF3, 0x1FE9, 0x0FD6, 0x07CB, 0x03E1, 0x01C8, 0x03E2,
+    0x07CC, 0x0FD9, 0x1FE8, 0x3FF6, 0x7FFA, 0xFFF9, 0x7FF1, 0x3FEE,
+    0x1FE4, 0x1FE0, 0x07D4, 0x03DB, 0x03CB, 0x0014, 0x03CC, 0x03DC,
+    0x07D6, 0x1FE3, 0x1FE7, 0x3FEC, 0x7FF3, 0x0FEA, 0x0FE0, 0x0FDE,
+    0x07DE, 0x03C9, 0x03C3, 0x01DC, 0x0013, 0x01DD, 0x03C4, 0x03CA,
+    0x07DF, 0x0FDF, 0x0FE3, 0x0FEB, 0x3FF1, 0x0FE7, 0x0FCF, 0x0FC8,
+    0x07D8, 0x01D2, 0x00E0, 0x0010, 0x00E1, 0x01D4, 0x07D9, 0x0FC9,
+    0x0FCC, 0x0FE6, 0x3FF5, 0x3FEA, 0x07E2, 0x07D2, 0x01D7, 0x01D0,
+    0x01CC, 0x006A, 0x000F, 0x006B, 0x01CD, 0x01D1, 0x01D9, 0x07D3,
+    0x07E3, 0x3FEB, 0x1FEE, 0x0FD5, 0x07C7, 0x03D8, 0x03D0, 0x00DD,
+    0x002D, 0x0001, 0x002E, 0x00DE, 0x03D1, 0x03D9, 0x07C8, 0x0FD8,
+    0x1FEF, 0x07CE, 0x03C5, 0x01DE, 0x006C, 0x0032, 0x0030, 0x0005,
+    0x0004, 0x0006, 0x0031, 0x0066, 0x006D, 0x03BE, 0x03C6, 0x07CF,
+    0x1FEC, 0x0FDA, 0x07C9, 0x03DA, 0x03D2, 0x00DF, 0x002F, 0x0000,
+    0x002C, 0x00DC, 0x03CF, 0x03D7, 0x07C6, 0x0FD4, 0x1FED, 0x3FE9,
+    0x07E0, 0x07D0, 0x01D3, 0x01CE, 0x01CA, 0x0068, 0x000E, 0x0069,
+    0x01CB, 0x01CF, 0x01D5, 0x07D1, 0x07E1, 0x3FE8, 0x3FF4, 0x0FE4,
+    0x0FCD, 0x0FCB, 0x07DA, 0x01D6, 0x00E2, 0x0011, 0x00E3, 0x01D8,
+    0x07DB, 0x0FCA, 0x0FCE, 0x0FE5, 0x3FF7, 0x0FE8, 0x0FE1, 0x0FDD,
+    0x07DD, 0x03C7, 0x03C1, 0x01DA, 0x0012, 0x01DB, 0x03C2, 0x03C8,
+    0x07DC, 0x0FDC, 0x0FE2, 0x0FE9, 0x7FF0, 0x3FEF, 0x1FE5, 0x1FE1,
+    0x07D7, 0x03DD, 0x03CD, 0x0015, 0x03CE, 0x03DE, 0x07D5, 0x1FE2,
+    0x1FE6, 0x3FED, 0x7FF2, 0xFFF8, 0x7FF4, 0x3FF2, 0x1FEB, 0x0FD7,
+    0x07CD, 0x03DF, 0x01C9, 0x03E0, 0x07CA, 0x0FDB, 0x1FEA, 0x3FF0,
+    0x7FF5, 0xFFFB, 0xFFFC, 0x7FF6, 0x1FF3, 0x1FDD, 0x1FD9, 0x0FD1,
+    0x03D5, 0x03C0, 0x03D4, 0x0FD3, 0x1FD8, 0x1FDF, 0x1FF1, 0x7FF7,
+    0xFFFF, 0x0067,
+};
+
+static const uint16_t clv_mvv_1_syms[] = {
+    0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9, 0xFFF9, 0x00F9,
+    0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9, 0x07F9, 0xF9FA,
+    0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA, 0xFEFA, 0xFFFA, 0x00FA, 0x01FA,
+    0x02FA, 0x03FA, 0x04FA, 0x05FA, 0x06FA, 0x07FA, 0xF9FB, 0xFAFB,
+    0xFBFB, 0xFCFB, 0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB,
+    0x03FB, 0x04FB, 0x05FB, 0x06FB, 0x07FB, 0xF9FC, 0xFAFC, 0xFBFC,
+    0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC, 0x00FC, 0x01FC, 0x02FC, 0x03FC,
+    0x04FC, 0x05FC, 0x06FC, 0x07FC, 0xF9FD, 0xFAFD, 0xFBFD, 0xFCFD,
+    0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD, 0x03FD, 0x04FD,
+    0x05FD, 0x06FD, 0x07FD, 0xF9FE, 0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE,
+    0xFEFE, 0xFFFE, 0x00FE, 0x01FE, 0x02FE, 0x03FE, 0x04FE, 0x05FE,
+    0x06FE, 0x07FE, 0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF,
+    0xFFFF, 0x00FF, 0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF,
+    0x07FF, 0xF900, 0xFA00, 0xFB00, 0xFC00, 0xFD00, 0xFE00, 0xFF00,
+    0x0000, 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700,
+    0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01, 0xFF01, 0x0001,
+    0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601, 0x0701, 0xF902,
+    0xFA02, 0xFB02, 0xFC02, 0xFD02, 0xFE02, 0xFF02, 0x0002, 0x0102,
+    0x0202, 0x0302, 0x0402, 0x0502, 0x0602, 0x0702, 0xF903, 0xFA03,
+    0xFB03, 0xFC03, 0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203,
+    0x0303, 0x0403, 0x0503, 0x0603, 0x0703, 0xF904, 0xFA04, 0xFB04,
+    0xFC04, 0xFD04, 0xFE04, 0xFF04, 0x0004, 0x0104, 0x0204, 0x0304,
+    0x0404, 0x0504, 0x0604, 0x0704, 0xF905, 0xFA05, 0xFB05, 0xFC05,
+    0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205, 0x0305, 0x0405,
+    0x0505, 0x0605, 0x0705, 0xF906, 0xFA06, 0xFB06, 0xFC06, 0xFD06,
+    0xFE06, 0xFF06, 0x0006, 0x0106, 0x0206, 0x0306, 0x0406, 0x0506,
+    0x0606, 0x0706, 0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07,
+    0xFF07, 0x0007, 0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607,
+    0x0707, 0x0808,
+};
+
+static const uint8_t clv_mvv_2_bits[] = {
+    16, 15, 15, 15, 15, 15, 14, 14, 14, 13, 12, 13, 14, 14, 14, 15,
+    15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12,
+    12, 13, 14, 14, 14, 15, 15, 16, 16, 16, 16, 16, 16, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 14, 14, 14, 14, 14, 14, 16, 16, 16, 15,
+    15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 13, 13, 13, 13, 14,
+    14, 14, 15, 15, 16, 14, 14, 14, 13, 12, 12, 12, 10, 10, 10, 10,
+    10, 12, 12, 12, 13, 14, 14, 14, 16, 14, 14, 14, 13, 13, 12, 12,
+    12, 10, 10,  7, 10, 10, 12, 12, 12, 13, 13, 14, 14, 14, 14, 14,
+    13, 12, 12, 12, 12, 10,  9,  8,  7,  8,  9, 10, 12, 12, 12, 12,
+    13, 14, 14, 14, 14, 13, 12, 12, 12, 12, 10,  9,  8,  7,  8,  9,
+    10, 12, 12, 12, 12, 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 10,
+     9,  8,  7,  8,  9, 10, 11, 12, 12, 13, 13, 14, 14, 14, 13, 13,
+    12, 10, 10,  9,  8,  7,  6,  5,  6,  7,  8,  9, 10, 10, 12, 13,
+    13, 14, 13, 13, 13, 13, 11, 10,  9,  8,  7,  5,  1,  5,  7,  8,
+     9, 10, 11, 13, 13, 13, 13, 14, 13, 13, 12, 10, 10,  9,  8,  7,
+     6,  5,  6,  7,  8,  9, 10, 10, 12, 13, 13, 14, 14, 14, 13, 13,
+    12, 12, 11,  9,  9,  8,  7,  8,  9,  9, 11, 12, 12, 13, 13, 14,
+    14, 14, 14, 13, 12, 12, 12, 12, 10,  9,  8,  7,  8,  9, 10, 12,
+    12, 12, 12, 13, 14, 14, 14, 14, 13, 12, 12, 12, 12, 10,  9,  8,
+     7,  8,  9, 10, 12, 12, 12, 12, 13, 14, 14, 14, 14, 14, 13, 13,
+    12, 12, 12, 10, 10,  7, 10, 10, 12, 12, 12, 13, 13, 14, 14, 14,
+    16, 14, 14, 14, 13, 12, 12, 12, 10, 10, 10, 10, 10, 12, 12, 12,
+    13, 14, 14, 14, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12,
+    12, 13, 13, 13, 13, 14, 14, 14, 15, 15, 16, 16, 16, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16,
+    16, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 13, 14, 14, 14, 15,
+    15, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 13, 10, 13,
+    14, 14, 14, 15, 15, 15, 15, 15, 16,  6,
+};
+
+static const uint16_t clv_mvv_2_codes[] = {
+    0xFFFF, 0x7FE7, 0x7FD9, 0x7FE6, 0x7FE5, 0x7FCE, 0x3FD6, 0x3FD3,
+    0x3F9C, 0x1FB2, 0x0F7A, 0x1FB5, 0x3FA8, 0x3FDD, 0x3FE5, 0x7FD0,
+    0x7FEA, 0x7FEC, 0x7FEF, 0x7FDB, 0xFFF3, 0xFFF5, 0xFFE2, 0xFFEB,
+    0x7FEB, 0x7FE0, 0x3FA7, 0x3F84, 0x3F79, 0x1FAE, 0x1F70, 0x0F78,
+    0x0FAA, 0x1FAA, 0x3F76, 0x3F7E, 0x3FAC, 0x7FE1, 0x7FDD, 0xFFEC,
+    0xFFEE, 0xFFF8, 0xFFF9, 0xFFEA, 0xFFE4, 0x3FE1, 0x3FBA, 0x3FC5,
+    0x3FB9, 0x3FA1, 0x3FAA, 0x1F8B, 0x1F8D, 0x1F8E, 0x3FA6, 0x3FA9,
+    0x3FC4, 0x3FBC, 0x3FC6, 0x3FDB, 0xFFE3, 0xFFE1, 0xFFFB, 0x7FD4,
+    0x7FCC, 0x3FCD, 0x3F88, 0x3F7C, 0x1FA1, 0x1FA2, 0x1F95, 0x1F77,
+    0x0F95, 0x0F79, 0x0F97, 0x1F78, 0x1F96, 0x1FA3, 0x1FA4, 0x3F7F,
+    0x3F8B, 0x3FCB, 0x7FCF, 0x7FD5, 0xFFF6, 0x3FD7, 0x3FE0, 0x3F91,
+    0x1F7B, 0x0FB4, 0x0FA5, 0x0FA6, 0x03D5, 0x03CB, 0x03BF, 0x03CC,
+    0x03D6, 0x0FA7, 0x0FA8, 0x0FB5, 0x1F7D, 0x3F87, 0x3FD2, 0x3FDF,
+    0xFFF0, 0x3F95, 0x3F8A, 0x3F96, 0x1FB7, 0x1F9F, 0x0F9F, 0x0F8F,
+    0x0F76, 0x03D1, 0x03BC, 0x0067, 0x03BD, 0x03D2, 0x0F77, 0x0F90,
+    0x0FA0, 0x1FA0, 0x1FB9, 0x3F97, 0x3F98, 0x3F99, 0x3FC3, 0x3FAF,
+    0x1F81, 0x0FA9, 0x0F91, 0x0F7E, 0x0F68, 0x03B2, 0x01C6, 0x00D4,
+    0x0062, 0x00D5, 0x01C7, 0x03B3, 0x0F6A, 0x0F81, 0x0F93, 0x0FAC,
+    0x1F83, 0x3FB3, 0x3FB4, 0x3FC7, 0x3FBB, 0x1F86, 0x0FAF, 0x0F98,
+    0x0F84, 0x0F6D, 0x03B8, 0x01CC, 0x00DA, 0x0065, 0x00DB, 0x01CD,
+    0x03B9, 0x0F6F, 0x0F86, 0x0F9B, 0x0FB1, 0x1F88, 0x3FB8, 0x3FC9,
+    0x3FDC, 0x3F80, 0x1FB3, 0x1F93, 0x0F87, 0x0F72, 0x07B0, 0x03B0,
+    0x01D4, 0x00E0, 0x0061, 0x00E1, 0x01D5, 0x03B1, 0x07B1, 0x0F73,
+    0x0F88, 0x1F94, 0x1FB4, 0x3F81, 0x3FD4, 0x3F9E, 0x1F99, 0x1F73,
+    0x0F89, 0x03C4, 0x03C0, 0x01CE, 0x00D0, 0x005C, 0x0028, 0x0010,
+    0x0029, 0x005D, 0x00D1, 0x01CF, 0x03C1, 0x03C5, 0x0F8A, 0x1F74,
+    0x1F9A, 0x3FA4, 0x1FAC, 0x1F8C, 0x1F7E, 0x1F71, 0x07B2, 0x03CF,
+    0x01C4, 0x00DC, 0x005A, 0x0012, 0x0000, 0x0013, 0x005B, 0x00DD,
+    0x01C5, 0x03D0, 0x07B3, 0x1F72, 0x1F7F, 0x1F8F, 0x1FAD, 0x3FAB,
+    0x1F9B, 0x1F75, 0x0F8B, 0x03C6, 0x03C2, 0x01D0, 0x00D2, 0x005E,
+    0x002A, 0x0011, 0x002B, 0x005F, 0x00D3, 0x01D1, 0x03C3, 0x03C7,
+    0x0F8C, 0x1F76, 0x1F9C, 0x3FAD, 0x3FCF, 0x3F85, 0x1FBA, 0x1F91,
+    0x0F7D, 0x0F70, 0x07AE, 0x01D6, 0x01D2, 0x00DE, 0x0060, 0x00DF,
+    0x01D3, 0x01D7, 0x07AF, 0x0F71, 0x0F7F, 0x1F92, 0x1FAF, 0x3F7B,
+    0x3FD0, 0x3FC0, 0x3FB1, 0x1F82, 0x0FAB, 0x0F92, 0x0F80, 0x0F69,
+    0x03B4, 0x01C8, 0x00D6, 0x0063, 0x00D7, 0x01C9, 0x03B5, 0x0F6B,
+    0x0F82, 0x0F94, 0x0FAD, 0x1F84, 0x3FB5, 0x3FC2, 0x3FB6, 0x3FBF,
+    0x1F85, 0x0FAE, 0x0F96, 0x0F83, 0x0F6C, 0x03B6, 0x01CA, 0x00D8,
+    0x0064, 0x00D9, 0x01CB, 0x03B7, 0x0F6E, 0x0F85, 0x0F99, 0x0FB0,
+    0x1F87, 0x3FB0, 0x3FB2, 0x3F93, 0x3F86, 0x3F8E, 0x1FB1, 0x1F9D,
+    0x0F9A, 0x0F8D, 0x0F74, 0x03CD, 0x03BA, 0x0066, 0x03BB, 0x03CE,
+    0x0F75, 0x0F8E, 0x0F9C, 0x1F9E, 0x1FB0, 0x3F8C, 0x3F94, 0x3F8D,
+    0xFFFC, 0x3FCA, 0x3FD5, 0x3F8F, 0x1F79, 0x0FB2, 0x0FA1, 0x0FA2,
+    0x03D3, 0x03C9, 0x03BE, 0x03CA, 0x03D4, 0x0FA3, 0x0FA4, 0x0FB3,
+    0x1F7A, 0x3F90, 0x3FE3, 0x3FD8, 0xFFF2, 0x7FD7, 0x7FCD, 0x3FE4,
+    0x3F92, 0x3F82, 0x1FA6, 0x1FA8, 0x1F98, 0x1F7C, 0x0F9D, 0x0F7B,
+    0x0F9E, 0x1F80, 0x1F97, 0x1FA7, 0x1FA5, 0x3F7A, 0x3F89, 0x3FDA,
+    0x7FD2, 0x7FD6, 0xFFFE, 0xFFED, 0xFFE8, 0x3FCC, 0x3FBD, 0x3FAE,
+    0x3FC1, 0x3F9F, 0x3F9A, 0x1F89, 0x1F90, 0x1F8A, 0x3FA3, 0x3FA0,
+    0x3FC8, 0x3FBE, 0x3FB7, 0x3FD1, 0xFFE7, 0xFFE9, 0xFFFD, 0xFFF4,
+    0xFFE5, 0xFFEF, 0x7FD8, 0x7FDC, 0x3FA2, 0x3F83, 0x3F78, 0x1FA9,
+    0x0FB6, 0x0F7C, 0x0FB7, 0x1FAB, 0x3F77, 0x3F7D, 0x3F9B, 0x7FDE,
+    0x7FED, 0xFFE6, 0xFFE0, 0xFFF7, 0xFFF1, 0x7FDA, 0x7FE9, 0x7FE2,
+    0x7FE3, 0x7FD1, 0x3FD9, 0x3FE2, 0x3FA5, 0x1FB8, 0x03C8, 0x1FB6,
+    0x3F9D, 0x3FDE, 0x3FCE, 0x7FD3, 0x7FEE, 0x7FDF, 0x7FE8, 0x7FE4,
+    0xFFFA, 0x002C,
+};
+
+static const uint16_t clv_mvv_2_syms[] = {
+    0xF6F6, 0xF7F6, 0xF8F6, 0xF9F6, 0xFAF6, 0xFBF6, 0xFCF6, 0xFDF6,
+    0xFEF6, 0xFFF6, 0x00F6, 0x01F6, 0x02F6, 0x03F6, 0x04F6, 0x05F6,
+    0x06F6, 0x07F6, 0x08F6, 0x09F6, 0x0AF6, 0xF6F7, 0xF7F7, 0xF8F7,
+    0xF9F7, 0xFAF7, 0xFBF7, 0xFCF7, 0xFDF7, 0xFEF7, 0xFFF7, 0x00F7,
+    0x01F7, 0x02F7, 0x03F7, 0x04F7, 0x05F7, 0x06F7, 0x07F7, 0x08F7,
+    0x09F7, 0x0AF7, 0xF6F8, 0xF7F8, 0xF8F8, 0xF9F8, 0xFAF8, 0xFBF8,
+    0xFCF8, 0xFDF8, 0xFEF8, 0xFFF8, 0x00F8, 0x01F8, 0x02F8, 0x03F8,
+    0x04F8, 0x05F8, 0x06F8, 0x07F8, 0x08F8, 0x09F8, 0x0AF8, 0xF6F9,
+    0xF7F9, 0xF8F9, 0xF9F9, 0xFAF9, 0xFBF9, 0xFCF9, 0xFDF9, 0xFEF9,
+    0xFFF9, 0x00F9, 0x01F9, 0x02F9, 0x03F9, 0x04F9, 0x05F9, 0x06F9,
+    0x07F9, 0x08F9, 0x09F9, 0x0AF9, 0xF6FA, 0xF7FA, 0xF8FA, 0xF9FA,
+    0xFAFA, 0xFBFA, 0xFCFA, 0xFDFA, 0xFEFA, 0xFFFA, 0x00FA, 0x01FA,
+    0x02FA, 0x03FA, 0x04FA, 0x05FA, 0x06FA, 0x07FA, 0x08FA, 0x09FA,
+    0x0AFA, 0xF6FB, 0xF7FB, 0xF8FB, 0xF9FB, 0xFAFB, 0xFBFB, 0xFCFB,
+    0xFDFB, 0xFEFB, 0xFFFB, 0x00FB, 0x01FB, 0x02FB, 0x03FB, 0x04FB,
+    0x05FB, 0x06FB, 0x07FB, 0x08FB, 0x09FB, 0x0AFB, 0xF6FC, 0xF7FC,
+    0xF8FC, 0xF9FC, 0xFAFC, 0xFBFC, 0xFCFC, 0xFDFC, 0xFEFC, 0xFFFC,
+    0x00FC, 0x01FC, 0x02FC, 0x03FC, 0x04FC, 0x05FC, 0x06FC, 0x07FC,
+    0x08FC, 0x09FC, 0x0AFC, 0xF6FD, 0xF7FD, 0xF8FD, 0xF9FD, 0xFAFD,
+    0xFBFD, 0xFCFD, 0xFDFD, 0xFEFD, 0xFFFD, 0x00FD, 0x01FD, 0x02FD,
+    0x03FD, 0x04FD, 0x05FD, 0x06FD, 0x07FD, 0x08FD, 0x09FD, 0x0AFD,
+    0xF6FE, 0xF7FE, 0xF8FE, 0xF9FE, 0xFAFE, 0xFBFE, 0xFCFE, 0xFDFE,
+    0xFEFE, 0xFFFE, 0x00FE, 0x01FE, 0x02FE, 0x03FE, 0x04FE, 0x05FE,
+    0x06FE, 0x07FE, 0x08FE, 0x09FE, 0x0AFE, 0xF6FF, 0xF7FF, 0xF8FF,
+    0xF9FF, 0xFAFF, 0xFBFF, 0xFCFF, 0xFDFF, 0xFEFF, 0xFFFF, 0x00FF,
+    0x01FF, 0x02FF, 0x03FF, 0x04FF, 0x05FF, 0x06FF, 0x07FF, 0x08FF,
+    0x09FF, 0x0AFF, 0xF600, 0xF700, 0xF800, 0xF900, 0xFA00, 0xFB00,
+    0xFC00, 0xFD00, 0xFE00, 0xFF00, 0x0000, 0x0100, 0x0200, 0x0300,
+    0x0400, 0x0500, 0x0600, 0x0700, 0x0800, 0x0900, 0x0A00, 0xF601,
+    0xF701, 0xF801, 0xF901, 0xFA01, 0xFB01, 0xFC01, 0xFD01, 0xFE01,
+    0xFF01, 0x0001, 0x0101, 0x0201, 0x0301, 0x0401, 0x0501, 0x0601,
+    0x0701, 0x0801, 0x0901, 0x0A01, 0xF602, 0xF702, 0xF802, 0xF902,
+    0xFA02, 0xFB02, 0xFC02, 0xFD02, 0xFE02, 0xFF02, 0x0002, 0x0102,
+    0x0202, 0x0302, 0x0402, 0x0502, 0x0602, 0x0702, 0x0802, 0x0902,
+    0x0A02, 0xF603, 0xF703, 0xF803, 0xF903, 0xFA03, 0xFB03, 0xFC03,
+    0xFD03, 0xFE03, 0xFF03, 0x0003, 0x0103, 0x0203, 0x0303, 0x0403,
+    0x0503, 0x0603, 0x0703, 0x0803, 0x0903, 0x0A03, 0xF604, 0xF704,
+    0xF804, 0xF904, 0xFA04, 0xFB04, 0xFC04, 0xFD04, 0xFE04, 0xFF04,
+    0x0004, 0x0104, 0x0204, 0x0304, 0x0404, 0x0504, 0x0604, 0x0704,
+    0x0804, 0x0904, 0x0A04, 0xF605, 0xF705, 0xF805, 0xF905, 0xFA05,
+    0xFB05, 0xFC05, 0xFD05, 0xFE05, 0xFF05, 0x0005, 0x0105, 0x0205,
+    0x0305, 0x0405, 0x0505, 0x0605, 0x0705, 0x0805, 0x0905, 0x0A05,
+    0xF606, 0xF706, 0xF806, 0xF906, 0xFA06, 0xFB06, 0xFC06, 0xFD06,
+    0xFE06, 0xFF06, 0x0006, 0x0106, 0x0206, 0x0306, 0x0406, 0x0506,
+    0x0606, 0x0706, 0x0806, 0x0906, 0x0A06, 0xF607, 0xF707, 0xF807,
+    0xF907, 0xFA07, 0xFB07, 0xFC07, 0xFD07, 0xFE07, 0xFF07, 0x0007,
+    0x0107, 0x0207, 0x0307, 0x0407, 0x0507, 0x0607, 0x0707, 0x0807,
+    0x0907, 0x0A07, 0xF608, 0xF708, 0xF808, 0xF908, 0xFA08, 0xFB08,
+    0xFC08, 0xFD08, 0xFE08, 0xFF08, 0x0008, 0x0108, 0x0208, 0x0308,
+    0x0408, 0x0508, 0x0608, 0x0708, 0x0808, 0x0908, 0x0A08, 0xF609,
+    0xF709, 0xF809, 0xF909, 0xFA09, 0xFB09, 0xFC09, 0xFD09, 0xFE09,
+    0xFF09, 0x0009, 0x0109, 0x0209, 0x0309, 0x0409, 0x0509, 0x0609,
+    0x0709, 0x0809, 0x0909, 0x0A09, 0xF60A, 0xF70A, 0xF80A, 0xF90A,
+    0xFA0A, 0xFB0A, 0xFC0A, 0xFD0A, 0xFE0A, 0xFF0A, 0x000A, 0x010A,
+    0x020A, 0x030A, 0x040A, 0x050A, 0x060A, 0x070A, 0x080A, 0x090A,
+    0x0A0A, 0x0B0B,
+};
+
+static const uint8_t clv_biasy_1_bits[] = {
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13,
+    13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,  9,
+     8,  8,  7,  7,  5,  2,  1,  3,  5,  7,  7,  8,  9,  9, 10, 10,
+    10, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 12,
+};
+
+static const uint16_t clv_biasy_1_codes[] = {
+    0xFFFE, 0x7FFE, 0x7FFC, 0x7FFA, 0x7FF6, 0x7FF7, 0x7FF3, 0x7FF2,
+    0x7FEF, 0x7FEE, 0x3FF5, 0x3FF3, 0x3FF1, 0x1FF7, 0x1FF5, 0x1FF2,
+    0x1FF0, 0x1FEE, 0x1FEC, 0x1FEA, 0x1FE8, 0x0FF2, 0x0FF0, 0x0FEE,
+    0x0FEB, 0x07F4, 0x07F3, 0x07F1, 0x03F7, 0x03F5, 0x03F3, 0x01F7,
+    0x00FA, 0x00F8, 0x007A, 0x0078, 0x001C, 0x0002, 0x0000, 0x0006,
+    0x001D, 0x0079, 0x007B, 0x00F9, 0x01F6, 0x01F8, 0x03F2, 0x03F4,
+    0x03F6, 0x07F0, 0x07F2, 0x0FEA, 0x0FEC, 0x0FEF, 0x0FF1, 0x0FF3,
+    0x1FE9, 0x1FEB, 0x1FED, 0x1FEF, 0x1FF1, 0x1FF3, 0x1FF4, 0x1FF6,
+    0x3FF0, 0x3FF2, 0x3FF4, 0x3FF6, 0x7FF0, 0x7FF1, 0x7FF4, 0x7FF8,
+    0x7FF5, 0x7FF9, 0x7FFB, 0x7FFD, 0xFFFF, 0x0FED,
+};
+
+static const uint16_t clv_biasy_1_syms[] = {
+    0xFF68, 0xFF6C, 0xFF70, 0xFF74, 0xFF78, 0xFF7C, 0xFF80, 0xFF84,
+    0xFF88, 0xFF8C, 0xFF90, 0xFF94, 0xFF98, 0xFF9C, 0xFFA0, 0xFFA4,
+    0xFFA8, 0xFFAC, 0xFFB0, 0xFFB4, 0xFFB8, 0xFFBC, 0xFFC0, 0xFFC4,
+    0xFFC8, 0xFFCC, 0xFFD0, 0xFFD4, 0xFFD8, 0xFFDC, 0xFFE0, 0xFFE4,
+    0xFFE8, 0xFFEC, 0xFFF0, 0xFFF4, 0xFFF8, 0xFFFC, 0x0000, 0x0004,
+    0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C, 0x0020, 0x0024,
+    0x0028, 0x002C, 0x0030, 0x0034, 0x0038, 0x003C, 0x0040, 0x0044,
+    0x0048, 0x004C, 0x0050, 0x0054, 0x0058, 0x005C, 0x0060, 0x0064,
+    0x0068, 0x006C, 0x0070, 0x0074, 0x0078, 0x007C, 0x0080, 0x0084,
+    0x0088, 0x008C, 0x0090, 0x0094, 0x0098, 0x0100,
+};
+
+static const uint8_t clv_biasy_2_bits[] = {
+    16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 11,
+    11, 11, 10, 10, 10, 10,  9,  9,  8,  8,  8,  7,  6,  6,  4,  3,
+     1,  3,  4,  6,  6,  7,  8,  8,  8,  9,  9, 10, 10, 10, 10, 11,
+    11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 15,
+};
+
+static const uint16_t clv_biasy_2_codes[] = {
+    0xFFFE, 0xFFFC, 0xFFFA, 0xFFF9, 0xFFF6, 0xFFF5, 0xFFF3, 0x7FF7,
+    0x7FF5, 0x7FF1, 0x7FF3, 0x7FF0, 0x7FEE, 0x7FEC, 0x7FE9, 0x7FE6,
+    0x7FE4, 0x7FE2, 0x3FEF, 0x3FEE, 0x3FEC, 0x3FEA, 0x1FF2, 0x1FF1,
+    0x1FEF, 0x1FED, 0x0FF4, 0x0FF3, 0x0FF1, 0x0FEF, 0x0FED, 0x07F4,
+    0x07F3, 0x07F1, 0x03F6, 0x03F4, 0x03F2, 0x03F0, 0x01F6, 0x01F4,
+    0x00F8, 0x00F6, 0x00F4, 0x0078, 0x003A, 0x0038, 0x000C, 0x0004,
+    0x0000, 0x0005, 0x000D, 0x0039, 0x003B, 0x0079, 0x00F5, 0x00F7,
+    0x00F9, 0x01F5, 0x01F7, 0x03F1, 0x03F3, 0x03F5, 0x03F7, 0x07F0,
+    0x07F2, 0x07F5, 0x0FEC, 0x0FEE, 0x0FF0, 0x0FF2, 0x0FF5, 0x1FEC,
+    0x1FEE, 0x1FF0, 0x1FF3, 0x1FF4, 0x3FEB, 0x3FED, 0x3FF0, 0x7FE3,
+    0x7FE5, 0x7FE7, 0x7FEA, 0x7FEB, 0x7FED, 0x7FEF, 0x7FF4, 0x7FF2,
+    0x7FF6, 0x7FF8, 0xFFF2, 0xFFF4, 0xFFF7, 0xFFF8, 0xFFFB, 0xFFFD,
+    0xFFFF, 0x7FE8,
+};
+
+static const uint16_t clv_biasy_2_syms[] = {
+    0xFF40, 0xFF44, 0xFF48, 0xFF4C, 0xFF50, 0xFF54, 0xFF58, 0xFF5C,
+    0xFF60, 0xFF64, 0xFF68, 0xFF6C, 0xFF70, 0xFF74, 0xFF78, 0xFF7C,
+    0xFF80, 0xFF84, 0xFF88, 0xFF8C, 0xFF90, 0xFF94, 0xFF98, 0xFF9C,
+    0xFFA0, 0xFFA4, 0xFFA8, 0xFFAC, 0xFFB0, 0xFFB4, 0xFFB8, 0xFFBC,
+    0xFFC0, 0xFFC4, 0xFFC8, 0xFFCC, 0xFFD0, 0xFFD4, 0xFFD8, 0xFFDC,
+    0xFFE0, 0xFFE4, 0xFFE8, 0xFFEC, 0xFFF0, 0xFFF4, 0xFFF8, 0xFFFC,
+    0x0000, 0x0004, 0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C,
+    0x0020, 0x0024, 0x0028, 0x002C, 0x0030, 0x0034, 0x0038, 0x003C,
+    0x0040, 0x0044, 0x0048, 0x004C, 0x0050, 0x0054, 0x0058, 0x005C,
+    0x0060, 0x0064, 0x0068, 0x006C, 0x0070, 0x0074, 0x0078, 0x007C,
+    0x0080, 0x0084, 0x0088, 0x008C, 0x0090, 0x0094, 0x0098, 0x009C,
+    0x00A0, 0x00A4, 0x00A8, 0x00AC, 0x00B0, 0x00B4, 0x00B8, 0x00BC,
+    0x00C0, 0x0100,
+};
+
+static const uint8_t clv_biasy_3_bits[] = {
+    16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10,
+    10,  9,  9,  9,  9,  8,  8,  7,  7,  6,  5,  4,  4,  2,  2,  3,
+     4,  5,  6,  6,  7,  7,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10,
+    11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 14,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 15,
+};
+
+static const uint16_t clv_biasy_3_codes[] = {
+    0xFFFF, 0xFFFC, 0xFFFA, 0x7FFC, 0x7FF9, 0x7FF6, 0x7FF4, 0x7FF2,
+    0x7FF1, 0x7FEF, 0x7FEC, 0x7FEB, 0x7FE9, 0x3FF3, 0x3FF0, 0x3FEE,
+    0x3FED, 0x3FEA, 0x1FF3, 0x1FF2, 0x1FF0, 0x1FEE, 0x0FF4, 0x0FF3,
+    0x0FF1, 0x07F7, 0x07F5, 0x07F3, 0x07F1, 0x03F7, 0x03F4, 0x03F2,
+    0x03F0, 0x01F6, 0x01F4, 0x01F2, 0x01F0, 0x00F6, 0x00F4, 0x0078,
+    0x0076, 0x0039, 0x001B, 0x000C, 0x000A, 0x0001, 0x0000, 0x0004,
+    0x000B, 0x001A, 0x0038, 0x003A, 0x0077, 0x0079, 0x00F5, 0x00F7,
+    0x01F1, 0x01F3, 0x01F5, 0x01F7, 0x03F1, 0x03F3, 0x03F5, 0x03F6,
+    0x07F0, 0x07F2, 0x07F4, 0x07F6, 0x0FF0, 0x0FF2, 0x0FF5, 0x0FF6,
+    0x1FEF, 0x1FF1, 0x1FF4, 0x3FEB, 0x3FEC, 0x3FEF, 0x3FF1, 0x3FF2,
+    0x7FE8, 0x7FEA, 0x7FED, 0x7FEE, 0x7FF0, 0x7FF3, 0x7FF5, 0x7FF7,
+    0x7FFA, 0x7FFB, 0xFFFB, 0xFFFD, 0xFFFE, 0x7FF8,
+};
+
+static const uint16_t clv_biasy_3_syms[] = {
+    0xFF48, 0xFF4C, 0xFF50, 0xFF54, 0xFF58, 0xFF5C, 0xFF60, 0xFF64,
+    0xFF68, 0xFF6C, 0xFF70, 0xFF74, 0xFF78, 0xFF7C, 0xFF80, 0xFF84,
+    0xFF88, 0xFF8C, 0xFF90, 0xFF94, 0xFF98, 0xFF9C, 0xFFA0, 0xFFA4,
+    0xFFA8, 0xFFAC, 0xFFB0, 0xFFB4, 0xFFB8, 0xFFBC, 0xFFC0, 0xFFC4,
+    0xFFC8, 0xFFCC, 0xFFD0, 0xFFD4, 0xFFD8, 0xFFDC, 0xFFE0, 0xFFE4,
+    0xFFE8, 0xFFEC, 0xFFF0, 0xFFF4, 0xFFF8, 0xFFFC, 0x0000, 0x0004,
+    0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C, 0x0020, 0x0024,
+    0x0028, 0x002C, 0x0030, 0x0034, 0x0038, 0x003C, 0x0040, 0x0044,
+    0x0048, 0x004C, 0x0050, 0x0054, 0x0058, 0x005C, 0x0060, 0x0064,
+    0x0068, 0x006C, 0x0070, 0x0074, 0x0078, 0x007C, 0x0080, 0x0084,
+    0x0088, 0x008C, 0x0090, 0x0094, 0x0098, 0x009C, 0x00A0, 0x00A4,
+    0x00A8, 0x00AC, 0x00B0, 0x00B4, 0x00B8, 0x0100,
+};
+
+static const uint8_t clv_biasu_1_bits[] = {
+    16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 10, 10,  9,  9,  8,
+     7,  6,  5,  2,  1,  3,  5,  7,  7,  8,  9,  9, 10, 10, 11, 12,
+    12, 12, 12, 13, 13, 13, 14, 15, 15, 16,
+};
+
+static const uint16_t clv_biasu_1_codes[] = {
+    0xFFFE, 0x7FFC, 0x3FFC, 0x1FFC, 0x1FFA, 0x1FF9, 0x0FFA, 0x0FF7,
+    0x0FF8, 0x0FF5, 0x07F8, 0x03FA, 0x03F8, 0x01FA, 0x01F9, 0x00FA,
+    0x007B, 0x003C, 0x001C, 0x0002, 0x0000, 0x0006, 0x001D, 0x007A,
+    0x007C, 0x00FB, 0x01F8, 0x01FB, 0x03F9, 0x03FB, 0x07F9, 0x0FF4,
+    0x0FF6, 0x0FF9, 0x0FFB, 0x1FF8, 0x1FFB, 0x1FFD, 0x3FFD, 0x7FFD,
+    0x7FFE, 0xFFFF,
+};
+
+static const uint16_t clv_biasu_1_syms[] = {
+    0xFFB0, 0xFFB4, 0xFFB8, 0xFFBC, 0xFFC0, 0xFFC4, 0xFFC8, 0xFFCC,
+    0xFFD0, 0xFFD4, 0xFFD8, 0xFFDC, 0xFFE0, 0xFFE4, 0xFFE8, 0xFFEC,
+    0xFFF0, 0xFFF4, 0xFFF8, 0xFFFC, 0x0000, 0x0004, 0x0008, 0x000C,
+    0x0010, 0x0014, 0x0018, 0x001C, 0x0020, 0x0024, 0x0028, 0x002C,
+    0x0030, 0x0034, 0x0038, 0x003C, 0x0040, 0x0044, 0x0048, 0x004C,
+    0x0050, 0x0100,
+};
+
+static const uint8_t clv_biasu_2_bits[] = {
+    16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, 12, 12, 11, 11, 10,
+    10,  9,  9,  8,  8,  7,  6,  5,  4,  3,  1,  3,  4,  6,  6,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 14, 15, 15,
+    15, 16, 16, 16, 16, 14,
+};
+
+static const uint16_t clv_biasu_2_codes[] = {
+    0xFFFC, 0xFFF8, 0xFFFA, 0xFFFD, 0x7FF8, 0x7FFA, 0x7FF7, 0x3FF6,
+    0x3FF7, 0x3FF4, 0x1FF9, 0x0FFB, 0x0FF9, 0x07FB, 0x07F9, 0x03FA,
+    0x03F8, 0x01FA, 0x01F9, 0x00FB, 0x00F9, 0x007B, 0x003B, 0x001C,
+    0x000C, 0x0004, 0x0000, 0x0005, 0x000D, 0x003A, 0x003C, 0x007A,
+    0x00F8, 0x00FA, 0x01F8, 0x01FB, 0x03F9, 0x03FB, 0x07F8, 0x07FA,
+    0x0FF8, 0x0FFA, 0x1FF8, 0x3FF5, 0x3FF8, 0x3FF9, 0x7FFB, 0x7FF9,
+    0x7FF6, 0xFFF9, 0xFFFF, 0xFFFE, 0xFFFB, 0x3FFA,
+};
+
+static const uint16_t clv_biasu_2_syms[] = {
+    0xFF98, 0xFF9C, 0xFFA0, 0xFFA4, 0xFFA8, 0xFFAC, 0xFFB0, 0xFFB4,
+    0xFFB8, 0xFFBC, 0xFFC0, 0xFFC4, 0xFFC8, 0xFFCC, 0xFFD0, 0xFFD4,
+    0xFFD8, 0xFFDC, 0xFFE0, 0xFFE4, 0xFFE8, 0xFFEC, 0xFFF0, 0xFFF4,
+    0xFFF8, 0xFFFC, 0x0000, 0x0004, 0x0008, 0x000C, 0x0010, 0x0014,
+    0x0018, 0x001C, 0x0020, 0x0024, 0x0028, 0x002C, 0x0030, 0x0034,
+    0x0038, 0x003C, 0x0040, 0x0044, 0x0048, 0x004C, 0x0050, 0x0054,
+    0x0058, 0x005C, 0x0060, 0x0064, 0x0068, 0x0100,
+};
+
+static const uint8_t clv_biasv_1_bits[] = {
+    16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10,  9,  8,
+     7,  6,  5,  2,  1,  3,  5,  6,  8,  8,  9, 10, 10, 11, 12, 12,
+    12, 13, 13, 13, 14, 14, 15, 15, 16, 14,
+};
+
+static const uint16_t clv_biasv_1_codes[] = {
+    0xFFFF, 0x7FFD, 0x3FFD, 0x3FFB, 0x3FF9, 0x1FFB, 0x1FF8, 0x1FF6,
+    0x0FFA, 0x0FF8, 0x07FA, 0x07F8, 0x03FA, 0x03F8, 0x01FB, 0x00FB,
+    0x007C, 0x003C, 0x001C, 0x0002, 0x0000, 0x0006, 0x001D, 0x003D,
+    0x00FA, 0x00FC, 0x01FA, 0x03F9, 0x03FB, 0x07F9, 0x0FF6, 0x0FF7,
+    0x0FF9, 0x1FF7, 0x1FF9, 0x1FFA, 0x3FFA, 0x3FFC, 0x7FFC, 0x7FFE,
+    0xFFFE, 0x3FF8,
+};
+
+static const uint16_t clv_biasv_1_syms[] = {
+    0xFFB0, 0xFFB4, 0xFFB8, 0xFFBC, 0xFFC0, 0xFFC4, 0xFFC8, 0xFFCC,
+    0xFFD0, 0xFFD4, 0xFFD8, 0xFFDC, 0xFFE0, 0xFFE4, 0xFFE8, 0xFFEC,
+    0xFFF0, 0xFFF4, 0xFFF8, 0xFFFC, 0x0000, 0x0004, 0x0008, 0x000C,
+    0x0010, 0x0014, 0x0018, 0x001C, 0x0020, 0x0024, 0x0028, 0x002C,
+    0x0030, 0x0034, 0x0038, 0x003C, 0x0040, 0x0044, 0x0048, 0x004C,
+    0x0050, 0x0100,
+};
+
+static const uint8_t clv_biasv_2_bits[] = {
+    16, 15, 14, 13, 13, 13, 13, 13, 12, 12, 11, 10, 10,  9,  9,  8,
+     7,  6,  5,  4,  3,  1,  3,  4,  5,  7,  7,  8,  9,  9, 10, 10,
+    10, 12, 12, 13, 13, 13, 13, 13, 14, 16, 15, 15,
+};
+
+static const uint16_t clv_biasv_2_codes[] = {
+    0xFFFE, 0x7FFD, 0x3FFC, 0x1FFC, 0x1FFB, 0x1FF8, 0x1FF7, 0x1FF4,
+    0x0FF8, 0x0FF7, 0x07FA, 0x03FB, 0x03F8, 0x01FA, 0x01F9, 0x00FA,
+    0x007B, 0x003C, 0x001C, 0x000C, 0x0004, 0x0000, 0x0005, 0x000D,
+    0x001D, 0x007A, 0x007C, 0x00FB, 0x01F8, 0x01FB, 0x03F9, 0x03FA,
+    0x03FC, 0x0FF6, 0x0FF9, 0x1FF5, 0x1FF9, 0x1FF6, 0x1FFA, 0x1FFD,
+    0x3FFD, 0xFFFF, 0x7FFE, 0x7FFC,
+};
+
+static const uint16_t clv_biasv_2_syms[] = {
+    0xFFAC, 0xFFB0, 0xFFB4, 0xFFB8, 0xFFBC, 0xFFC0, 0xFFC4, 0xFFC8,
+    0xFFCC, 0xFFD0, 0xFFD4, 0xFFD8, 0xFFDC, 0xFFE0, 0xFFE4, 0xFFE8,
+    0xFFEC, 0xFFF0, 0xFFF4, 0xFFF8, 0xFFFC, 0x0000, 0x0004, 0x0008,
+    0x000C, 0x0010, 0x0014, 0x0018, 0x001C, 0x0020, 0x0024, 0x0028,
+    0x002C, 0x0030, 0x0034, 0x0038, 0x003C, 0x0040, 0x0044, 0x0048,
+    0x004C, 0x0050, 0x0054, 0x0100,
+};
+
+#endif /* AVCODEC_CLEARVIDEODATA_H */
diff --git a/libavcodec/cljrdec.c b/libavcodec/cljrdec.c
index d17212e..4b187f8 100644
--- a/libavcodec/cljrdec.c
+++ b/libavcodec/cljrdec.c
@@ -2,20 +2,20 @@
  * Cirrus Logic AccuPak (CLJR) decoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 static int decode_frame(AVCodecContext *avctx,
@@ -34,7 +34,7 @@ static int decode_frame(AVCodecContext *avctx,
 {
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
-    BitstreamContext bc;
+    GetBitContext gb;
     AVFrame * const p = data;
     int x, y, ret;
 
@@ -43,33 +43,31 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if (buf_size < avctx->height * avctx->width) {
+    if (buf_size / avctx->height < avctx->width) {
         av_log(avctx, AV_LOG_ERROR,
                "Resolution larger than buffer size. Invalid header?\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
-    bitstream_init8(&bc, buf, buf_size);
+    init_get_bits(&gb, buf, buf_size * 8);
 
     for (y = 0; y < avctx->height; y++) {
         uint8_t *luma = &p->data[0][y * p->linesize[0]];
         uint8_t *cb   = &p->data[1][y * p->linesize[1]];
         uint8_t *cr   = &p->data[2][y * p->linesize[2]];
         for (x = 0; x < avctx->width; x += 4) {
-            luma[3] = bitstream_read(&bc, 5) << 3;
-            luma[2] = bitstream_read(&bc, 5) << 3;
-            luma[1] = bitstream_read(&bc, 5) << 3;
-            luma[0] = bitstream_read(&bc, 5) << 3;
+            luma[3] = (get_bits(&gb, 5)*33) >> 2;
+            luma[2] = (get_bits(&gb, 5)*33) >> 2;
+            luma[1] = (get_bits(&gb, 5)*33) >> 2;
+            luma[0] = (get_bits(&gb, 5)*33) >> 2;
             luma += 4;
-            *(cb++) = bitstream_read(&bc, 6) << 2;
-            *(cr++) = bitstream_read(&bc, 6) << 2;
+            *(cb++) = get_bits(&gb, 6) << 2;
+            *(cr++) = get_bits(&gb, 6) << 2;
         }
     }
 
@@ -93,3 +91,4 @@ AVCodec ff_cljr_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+
diff --git a/libavcodec/cljrenc.c b/libavcodec/cljrenc.c
index 0687e30..a371825 100644
--- a/libavcodec/cljrenc.c
+++ b/libavcodec/cljrenc.c
@@ -2,20 +2,20 @@
  * Cirrus Logic AccuPak (CLJR) encoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,28 +25,39 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 #include "put_bits.h"
 
+typedef struct CLJRContext {
+    AVClass        *avclass;
+    int             dither_type;
+} CLJRContext;
+
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *p, int *got_packet)
 {
+    CLJRContext *a = avctx->priv_data;
     PutBitContext pb;
     int x, y, ret;
+    uint32_t dither= avctx->frame_number;
+    static const uint32_t ordered_dither[2][2] =
+    {
+        { 0x10400000, 0x104F0000 },
+        { 0xCB2A0000, 0xCB250000 },
+    };
 
-    if ((ret = ff_alloc_packet(pkt, 32*avctx->height*avctx->width/4)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+    if (avctx->width%4 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+         av_log(avctx, AV_LOG_ERROR,
+                "Widths which are not a multiple of 4 might fail with some decoders, "
+                "use vstrict=-1 / -strict -1 to use %d anyway.\n", avctx->width);
+         return AVERROR_EXPERIMENTAL;
     }
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    if ((ret = ff_alloc_packet2(avctx, pkt, 32*avctx->height*avctx->width/4, 0)) < 0)
+        return ret;
 
     init_put_bits(&pb, pkt->data, pkt->size);
 
@@ -54,14 +65,25 @@ FF_ENABLE_DEPRECATION_WARNINGS
         uint8_t *luma = &p->data[0][y * p->linesize[0]];
         uint8_t *cb   = &p->data[1][y * p->linesize[1]];
         uint8_t *cr   = &p->data[2][y * p->linesize[2]];
+        uint8_t luma_tmp[4];
         for (x = 0; x < avctx->width; x += 4) {
-            put_bits(&pb, 5, luma[3] >> 3);
-            put_bits(&pb, 5, luma[2] >> 3);
-            put_bits(&pb, 5, luma[1] >> 3);
-            put_bits(&pb, 5, luma[0] >> 3);
+            switch (a->dither_type) {
+            case 0: dither = 0x492A0000;                       break;
+            case 1: dither = dither * 1664525 + 1013904223;    break;
+            case 2: dither = ordered_dither[ y&1 ][ (x>>2)&1 ];break;
+            }
+            if (x+3 >= avctx->width) {
+                memset(luma_tmp, 0, sizeof(luma_tmp));
+                memcpy(luma_tmp, luma, avctx->width - x);
+                luma = luma_tmp;
+            }
+            put_bits(&pb, 5, (249*(luma[3] +  (dither>>29)   )) >> 11);
+            put_bits(&pb, 5, (249*(luma[2] + ((dither>>26)&7))) >> 11);
+            put_bits(&pb, 5, (249*(luma[1] + ((dither>>23)&7))) >> 11);
+            put_bits(&pb, 5, (249*(luma[0] + ((dither>>20)&7))) >> 11);
             luma += 4;
-            put_bits(&pb, 6, *(cb++) >> 2);
-            put_bits(&pb, 6, *(cr++) >> 2);
+            put_bits(&pb, 6, (253*(*(cb++) + ((dither>>18)&3))) >> 10);
+            put_bits(&pb, 6, (253*(*(cr++) + ((dither>>16)&3))) >> 10);
         }
     }
 
@@ -73,12 +95,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#define OFFSET(x) offsetof(CLJRContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "dither_type",   "Dither type",   OFFSET(dither_type),        AV_OPT_TYPE_INT, { .i64=1 }, 0, 2, VE},
+    { NULL },
+};
+
+static const AVClass cljr_class = {
+    .class_name = "cljr encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_cljr_encoder = {
     .name           = "cljr",
     .long_name      = NULL_IF_CONFIG_SMALL("Cirrus Logic AccuPak"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_CLJR,
+    .priv_data_size = sizeof(CLJRContext),
     .encode2        = encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV411P,
                                                    AV_PIX_FMT_NONE },
+    .priv_class     = &cljr_class,
 };
diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
index 3c476f7..af0f6da 100644
--- a/libavcodec/cllc.c
+++ b/libavcodec/cllc.c
@@ -3,33 +3,36 @@
  *
  * Copyright (c) 2012-2013 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
 #include "libavutil/intreadwrite.h"
-
-#include "bitstream.h"
 #include "bswapdsp.h"
 #include "canopus.h"
+#include "get_bits.h"
 #include "avcodec.h"
 #include "internal.h"
-#include "vlc.h"
+#include "thread.h"
+
+#define VLC_BITS 7
+#define VLC_DEPTH 2
+
 
 typedef struct CLLCContext {
     AVCodecContext *avctx;
@@ -39,7 +42,7 @@ typedef struct CLLCContext {
     int      swapped_buf_size;
 } CLLCContext;
 
-static int read_code_table(CLLCContext *ctx, BitstreamContext *bc, VLC *vlc)
+static int read_code_table(CLLCContext *ctx, GetBitContext *gb, VLC *vlc)
 {
     uint8_t symbols[256];
     uint8_t bits[256];
@@ -51,10 +54,17 @@ static int read_code_table(CLLCContext *ctx, BitstreamContext *bc, VLC *vlc)
     count         = 0;
     num_codes_sum = 0;
 
-    num_lens = bitstream_read(bc, 5);
+    num_lens = get_bits(gb, 5);
+
+    if (num_lens > VLC_BITS * VLC_DEPTH) {
+        vlc->table = NULL;
+
+        av_log(ctx->avctx, AV_LOG_ERROR, "To long VLCs %d\n", num_lens);
+        return AVERROR_INVALIDDATA;
+    }
 
     for (i = 0; i < num_lens; i++) {
-        num_codes      = bitstream_read(bc, 9);
+        num_codes      = get_bits(gb, 9);
         num_codes_sum += num_codes;
 
         if (num_codes_sum > 256) {
@@ -66,17 +76,21 @@ static int read_code_table(CLLCContext *ctx, BitstreamContext *bc, VLC *vlc)
         }
 
         for (j = 0; j < num_codes; j++) {
-            symbols[count] = bitstream_read(bc, 8);
+            symbols[count] = get_bits(gb, 8);
             bits[count]    = i + 1;
             codes[count]   = prefix++;
 
             count++;
         }
+        if (prefix > (65535 - 256)/2) {
+            vlc->table = NULL;
+            return AVERROR_INVALIDDATA;
+        }
 
         prefix <<= 1;
     }
 
-    return ff_init_vlc_sparse(vlc, 7, count, bits, 1, 1,
+    return ff_init_vlc_sparse(vlc, VLC_BITS, count, bits, 1, 1,
                               codes, 2, 2, symbols, 1, 1, 0);
 }
 
@@ -84,7 +98,7 @@ static int read_code_table(CLLCContext *ctx, BitstreamContext *bc, VLC *vlc)
  * Unlike the RGB24 read/restore, which reads in a component at a time,
  * ARGB read/restore reads in ARGB quads.
  */
-static int read_argb_line(CLLCContext *ctx, BitstreamContext *bc, int *top_left,
+static int read_argb_line(CLLCContext *ctx, GetBitContext *gb, int *top_left,
                           VLC *vlc, uint8_t *outbuf)
 {
     uint8_t *dst;
@@ -92,6 +106,8 @@ static int read_argb_line(CLLCContext *ctx, BitstreamContext *bc, int *top_left,
     int code;
     int i;
 
+    OPEN_READER(bits, gb);
+
     dst     = outbuf;
     pred[0] = top_left[0];
     pred[1] = top_left[1];
@@ -100,7 +116,8 @@ static int read_argb_line(CLLCContext *ctx, BitstreamContext *bc, int *top_left,
 
     for (i = 0; i < ctx->avctx->width; i++) {
         /* Always get the alpha component */
-        code = bitstream_read_vlc(bc, vlc[0].table, 7, 2);
+        UPDATE_CACHE(bits, gb);
+        GET_VLC(code, bits, gb, vlc[0].table, VLC_BITS, VLC_DEPTH);
 
         pred[0] += code;
         dst[0]   = pred[0];
@@ -108,19 +125,22 @@ static int read_argb_line(CLLCContext *ctx, BitstreamContext *bc, int *top_left,
         /* Skip the components if they are  entirely transparent */
         if (dst[0]) {
             /* Red */
-            code = bitstream_read_vlc(bc, vlc[1].table, 7, 2);
+            UPDATE_CACHE(bits, gb);
+            GET_VLC(code, bits, gb, vlc[1].table, VLC_BITS, VLC_DEPTH);
 
             pred[1] += code;
             dst[1]   = pred[1];
 
             /* Green */
-            code = bitstream_read_vlc(bc, vlc[2].table, 7, 2);
+            UPDATE_CACHE(bits, gb);
+            GET_VLC(code, bits, gb, vlc[2].table, VLC_BITS, VLC_DEPTH);
 
             pred[2] += code;
             dst[2]   = pred[2];
 
             /* Blue */
-            code = bitstream_read_vlc(bc, vlc[3].table, 7, 2);
+            UPDATE_CACHE(bits, gb);
+            GET_VLC(code, bits, gb, vlc[3].table, VLC_BITS, VLC_DEPTH);
 
             pred[3] += code;
             dst[3]   = pred[3];
@@ -133,6 +153,8 @@ static int read_argb_line(CLLCContext *ctx, BitstreamContext *bc, int *top_left,
         dst += 4;
     }
 
+    CLOSE_READER(bits, gb);
+
     top_left[0]  = outbuf[0];
 
     /* Only stash components if they are not transparent */
@@ -145,55 +167,65 @@ static int read_argb_line(CLLCContext *ctx, BitstreamContext *bc, int *top_left,
     return 0;
 }
 
-static int read_rgb24_component_line(CLLCContext *ctx, BitstreamContext *bc,
+static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb,
                                      int *top_left, VLC *vlc, uint8_t *outbuf)
 {
     uint8_t *dst;
     int pred, code;
     int i;
 
+    OPEN_READER(bits, gb);
+
     dst  = outbuf;
     pred = *top_left;
 
     /* Simultaneously read and restore the line */
     for (i = 0; i < ctx->avctx->width; i++) {
-        code = bitstream_read_vlc(bc, vlc->table, 7, 2);
+        UPDATE_CACHE(bits, gb);
+        GET_VLC(code, bits, gb, vlc->table, VLC_BITS, VLC_DEPTH);
 
         pred  += code;
         dst[0] = pred;
         dst   += 3;
     }
 
+    CLOSE_READER(bits, gb);
+
     /* Stash the first pixel */
     *top_left = outbuf[0];
 
     return 0;
 }
 
-static int read_yuv_component_line(CLLCContext *ctx, BitstreamContext *bc,
+static int read_yuv_component_line(CLLCContext *ctx, GetBitContext *gb,
                                    int *top_left, VLC *vlc, uint8_t *outbuf,
                                    int is_chroma)
 {
     int pred, code;
     int i;
 
+    OPEN_READER(bits, gb);
+
     pred = *top_left;
 
     /* Simultaneously read and restore the line */
     for (i = 0; i < ctx->avctx->width >> is_chroma; i++) {
-        code = bitstream_read_vlc(bc, vlc->table, 7, 2);
+        UPDATE_CACHE(bits, gb);
+        GET_VLC(code, bits, gb, vlc->table, VLC_BITS, VLC_DEPTH);
 
         pred     += code;
         outbuf[i] = pred;
     }
 
+    CLOSE_READER(bits, gb);
+
     /* Stash the first pixel */
     *top_left = outbuf[0];
 
     return 0;
 }
 
-static int decode_argb_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pic)
+static int decode_argb_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
 {
     AVCodecContext *avctx = ctx->avctx;
     uint8_t *dst;
@@ -209,11 +241,11 @@ static int decode_argb_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pi
 
     dst = pic->data[0];
 
-    bitstream_skip(bc, 16);
+    skip_bits(gb, 16);
 
     /* Read in code table for each plane */
     for (i = 0; i < 4; i++) {
-        ret = read_code_table(ctx, bc, &vlc[i]);
+        ret = read_code_table(ctx, gb, &vlc[i]);
         if (ret < 0) {
             for (j = 0; j <= i; j++)
                 ff_free_vlc(&vlc[j]);
@@ -226,7 +258,7 @@ static int decode_argb_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pi
 
     /* Read in and restore every line */
     for (i = 0; i < avctx->height; i++) {
-        read_argb_line(ctx, bc, pred, vlc, dst);
+        read_argb_line(ctx, gb, pred, vlc, dst);
 
         dst += pic->linesize[0];
     }
@@ -237,7 +269,7 @@ static int decode_argb_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pi
     return 0;
 }
 
-static int decode_rgb24_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pic)
+static int decode_rgb24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
 {
     AVCodecContext *avctx = ctx->avctx;
     uint8_t *dst;
@@ -252,11 +284,11 @@ static int decode_rgb24_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *p
 
     dst = pic->data[0];
 
-    bitstream_skip(bc, 16);
+    skip_bits(gb, 16);
 
     /* Read in code table for each plane */
     for (i = 0; i < 3; i++) {
-        ret = read_code_table(ctx, bc, &vlc[i]);
+        ret = read_code_table(ctx, gb, &vlc[i]);
         if (ret < 0) {
             for (j = 0; j <= i; j++)
                 ff_free_vlc(&vlc[j]);
@@ -270,7 +302,7 @@ static int decode_rgb24_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *p
     /* Read in and restore every line */
     for (i = 0; i < avctx->height; i++) {
         for (j = 0; j < 3; j++)
-            read_rgb24_component_line(ctx, bc, &pred[j], &vlc[j], &dst[j]);
+            read_rgb24_component_line(ctx, gb, &pred[j], &vlc[j], &dst[j]);
 
         dst += pic->linesize[0];
     }
@@ -281,7 +313,7 @@ static int decode_rgb24_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *p
     return 0;
 }
 
-static int decode_yuv_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pic)
+static int decode_yuv_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
 {
     AVCodecContext *avctx = ctx->avctx;
     uint8_t block;
@@ -299,9 +331,9 @@ static int decode_yuv_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pic
     dst[1] = pic->data[1];
     dst[2] = pic->data[2];
 
-    bitstream_skip(bc, 8);
+    skip_bits(gb, 8);
 
-    block = bitstream_read(bc, 8);
+    block = get_bits(gb, 8);
     if (block) {
         avpriv_request_sample(ctx->avctx, "Blocked YUV");
         return AVERROR_PATCHWELCOME;
@@ -309,7 +341,7 @@ static int decode_yuv_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pic
 
     /* Read in code table for luma and chroma */
     for (i = 0; i < 2; i++) {
-        ret = read_code_table(ctx, bc, &vlc[i]);
+        ret = read_code_table(ctx, gb, &vlc[i]);
         if (ret < 0) {
             for (j = 0; j <= i; j++)
                 ff_free_vlc(&vlc[j]);
@@ -322,9 +354,9 @@ static int decode_yuv_frame(CLLCContext *ctx, BitstreamContext *bc, AVFrame *pic
 
     /* Read in and restore every line */
     for (i = 0; i < avctx->height; i++) {
-        read_yuv_component_line(ctx, bc, &pred[0], &vlc[0], dst[0], 0); /* Y */
-        read_yuv_component_line(ctx, bc, &pred[1], &vlc[1], dst[1], 1); /* U */
-        read_yuv_component_line(ctx, bc, &pred[2], &vlc[1], dst[2], 1); /* V */
+        read_yuv_component_line(ctx, gb, &pred[0], &vlc[0], dst[0], 0); /* Y */
+        read_yuv_component_line(ctx, gb, &pred[1], &vlc[1], dst[1], 1); /* U */
+        read_yuv_component_line(ctx, gb, &pred[2], &vlc[1], dst[2], 1); /* V */
 
         for (j = 0; j < 3; j++)
             dst[j] += pic->linesize[j];
@@ -341,10 +373,11 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
 {
     CLLCContext *ctx = avctx->priv_data;
     AVFrame *pic = data;
+    ThreadFrame frame = { .f = data };
     uint8_t *src = avpkt->data;
     uint32_t info_tag, info_offset;
     int data_size;
-    BitstreamContext bc;
+    GetBitContext gb;
     int coding_type, ret;
 
     if (avpkt->size < 4 + 4) {
@@ -382,7 +415,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
     ctx->bdsp.bswap16_buf((uint16_t *) ctx->swapped_buf, (uint16_t *) src,
                           data_size / 2);
 
-    bitstream_init8(&bc, ctx->swapped_buf, data_size);
+    if ((ret = init_get_bits8(&gb, ctx->swapped_buf, data_size)) < 0)
+        return ret;
 
     /*
      * Read in coding type. The types are as follows:
@@ -395,18 +429,18 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
     coding_type = (AV_RL32(src) >> 8) & 0xFF;
     av_log(avctx, AV_LOG_DEBUG, "Frame coding type: %d\n", coding_type);
 
+    if(get_bits_left(&gb) < avctx->height * avctx->width)
+        return AVERROR_INVALIDDATA;
+
     switch (coding_type) {
     case 0:
         avctx->pix_fmt             = AV_PIX_FMT_YUV422P;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
             return ret;
-        }
 
-        ret = decode_yuv_frame(ctx, &bc, pic);
+        ret = decode_yuv_frame(ctx, &gb, pic);
         if (ret < 0)
             return ret;
 
@@ -416,13 +450,10 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_RGB24;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
             return ret;
-        }
 
-        ret = decode_rgb24_frame(ctx, &bc, pic);
+        ret = decode_rgb24_frame(ctx, &gb, pic);
         if (ret < 0)
             return ret;
 
@@ -431,13 +462,10 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_ARGB;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
             return ret;
-        }
 
-        ret = decode_argb_frame(ctx, &bc, pic);
+        ret = decode_argb_frame(ctx, &gb, pic);
         if (ret < 0)
             return ret;
 
@@ -455,6 +483,19 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
     return avpkt->size;
 }
 
+#if HAVE_THREADS
+static int cllc_init_thread_copy(AVCodecContext *avctx)
+{
+    CLLCContext *ctx = avctx->priv_data;
+
+    ctx->avctx            = avctx;
+    ctx->swapped_buf      = NULL;
+    ctx->swapped_buf_size = 0;
+
+    return 0;
+}
+#endif
+
 static av_cold int cllc_decode_close(AVCodecContext *avctx)
 {
     CLLCContext *ctx = avctx->priv_data;
@@ -485,8 +526,9 @@ AVCodec ff_cllc_decoder = {
     .id             = AV_CODEC_ID_CLLC,
     .priv_data_size = sizeof(CLLCContext),
     .init           = cllc_decode_init,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(cllc_init_thread_copy),
     .decode         = cllc_decode_frame,
     .close          = cllc_decode_close,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/cngdec.c b/libavcodec/cngdec.c
index 482ef94..28432ac 100644
--- a/libavcodec/cngdec.c
+++ b/libavcodec/cngdec.c
@@ -2,26 +2,28 @@
  * RFC 3389 comfort noise generator
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <math.h>
 
 #include "libavutil/common.h"
+#include "libavutil/ffmath.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "celp_filters.h"
 #include "internal.h"
@@ -41,11 +43,11 @@ typedef struct CNGContext {
 static av_cold int cng_decode_close(AVCodecContext *avctx)
 {
     CNGContext *p = avctx->priv_data;
-    av_free(p->refl_coef);
-    av_free(p->target_refl_coef);
-    av_free(p->lpc_coef);
-    av_free(p->filter_out);
-    av_free(p->excitation);
+    av_freep(&p->refl_coef);
+    av_freep(&p->target_refl_coef);
+    av_freep(&p->lpc_coef);
+    av_freep(&p->filter_out);
+    av_freep(&p->excitation);
     return 0;
 }
 
@@ -59,12 +61,12 @@ static av_cold int cng_decode_init(AVCodecContext *avctx)
 
     p->order            = 12;
     avctx->frame_size   = 640;
-    p->refl_coef        = av_mallocz(p->order * sizeof(*p->refl_coef));
-    p->target_refl_coef = av_mallocz(p->order * sizeof(*p->target_refl_coef));
-    p->lpc_coef         = av_mallocz(p->order * sizeof(*p->lpc_coef));
-    p->filter_out       = av_mallocz((avctx->frame_size + p->order) *
+    p->refl_coef        = av_mallocz_array(p->order, sizeof(*p->refl_coef));
+    p->target_refl_coef = av_mallocz_array(p->order, sizeof(*p->target_refl_coef));
+    p->lpc_coef         = av_mallocz_array(p->order, sizeof(*p->lpc_coef));
+    p->filter_out       = av_mallocz_array(avctx->frame_size + p->order,
                                      sizeof(*p->filter_out));
-    p->excitation       = av_mallocz(avctx->frame_size * sizeof(*p->excitation));
+    p->excitation       = av_mallocz_array(avctx->frame_size, sizeof(*p->excitation));
     if (!p->refl_coef || !p->target_refl_coef || !p->lpc_coef ||
         !p->filter_out || !p->excitation) {
         cng_decode_close(avctx);
@@ -112,13 +114,18 @@ static int cng_decode_frame(AVCodecContext *avctx, void *data,
 
     if (avpkt->size) {
         int dbov = -avpkt->data[0];
-        p->target_energy = 1081109975 * pow(10, dbov / 10.0) * 0.75;
+        p->target_energy = 1081109975 * ff_exp10(dbov / 10.0) * 0.75;
         memset(p->target_refl_coef, 0, p->order * sizeof(*p->target_refl_coef));
         for (i = 0; i < FFMIN(avpkt->size - 1, p->order); i++) {
             p->target_refl_coef[i] = (avpkt->data[1 + i] - 127) / 128.0;
         }
     }
 
+    if (avctx->internal->skip_samples > 10 * avctx->frame_size) {
+        avctx->internal->skip_samples = 0;
+        return AVERROR_INVALIDDATA;
+    }
+
     if (p->inited) {
         p->energy = p->energy / 2 + p->target_energy / 2;
         for (i = 0; i < p->order; i++)
@@ -142,13 +149,11 @@ static int cng_decode_frame(AVCodecContext *avctx, void *data,
                                  p->excitation, avctx->frame_size, p->order);
 
     frame->nb_samples = avctx->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (int16_t *)frame->data[0];
     for (i = 0; i < avctx->frame_size; i++)
-        buf_out[i] = p->filter_out[i + p->order];
+        buf_out[i] = av_clip_int16(p->filter_out[i + p->order]);
     memcpy(p->filter_out, p->filter_out + avctx->frame_size,
            p->order * sizeof(*p->filter_out));
 
diff --git a/libavcodec/cngenc.c b/libavcodec/cngenc.c
index 98f3c4e..e185c4a 100644
--- a/libavcodec/cngenc.c
+++ b/libavcodec/cngenc.c
@@ -2,20 +2,20 @@
  * RFC 3389 comfort noise generator
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,8 +56,8 @@ static av_cold int cng_encode_init(AVCodecContext *avctx)
     p->order = 10;
     if ((ret = ff_lpc_init(&p->lpc, avctx->frame_size, p->order, FF_LPC_TYPE_LEVINSON)) < 0)
         return ret;
-    p->samples32 = av_malloc(avctx->frame_size * sizeof(*p->samples32));
-    p->ref_coef = av_malloc(p->order * sizeof(*p->ref_coef));
+    p->samples32 = av_malloc_array(avctx->frame_size, sizeof(*p->samples32));
+    p->ref_coef = av_malloc_array(p->order, sizeof(*p->ref_coef));
     if (!p->samples32 || !p->ref_coef) {
         cng_encode_close(avctx);
         return AVERROR(ENOMEM);
@@ -75,7 +75,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int qdbov;
     int16_t *samples = (int16_t*) frame->data[0];
 
-    if ((ret = ff_alloc_packet(avpkt, 1 + p->order))) {
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 1 + p->order, 1 + p->order))) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
         return ret;
     }
@@ -87,17 +87,17 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     energy /= frame->nb_samples;
     if (energy > 0) {
         double dbov = 10 * log10(energy / 1081109975);
-        qdbov = av_clip(-floor(dbov), 0, 127);
+        qdbov = av_clip_uintp2(-floor(dbov), 7);
     } else {
         qdbov = 127;
     }
-    ret = ff_lpc_calc_ref_coefs(&p->lpc, p->samples32, p->order, p->ref_coef);
+    ff_lpc_calc_ref_coefs(&p->lpc, p->samples32, p->order, p->ref_coef);
     avpkt->data[0] = qdbov;
     for (i = 0; i < p->order; i++)
         avpkt->data[1 + i] = p->ref_coef[i] * 127 + 127;
 
     *got_packet_ptr = 1;
-    avpkt->size = 1 + p->order;
+    av_assert1(avpkt->size == 1 + p->order);
 
     return 0;
 }
diff --git a/libavcodec/codec2utils.c b/libavcodec/codec2utils.c
new file mode 100644
index 0000000..931478f
--- /dev/null
+++ b/libavcodec/codec2utils.c
@@ -0,0 +1,80 @@
+/*
+ * codec2 utility functions
+ * Copyright (c) 2017 Tomas Härdin
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "internal.h"
+#include "libavcodec/codec2utils.h"
+
+int avpriv_codec2_mode_bit_rate(void *logctx, int mode)
+{
+    int frame_size  = avpriv_codec2_mode_frame_size(logctx, mode);
+    int block_align = avpriv_codec2_mode_block_align(logctx, mode);
+
+    if (frame_size <= 0 || block_align <= 0) {
+        return 0;
+    }
+
+    return 8 * 8000 * block_align / frame_size;
+}
+
+int avpriv_codec2_mode_frame_size(void *logctx, int mode)
+{
+    int frame_size_table[AVPRIV_CODEC2_MODE_MAX+1] = {
+        160,    // 3200
+        160,    // 2400
+        320,    // 1600
+        320,    // 1400
+        320,    // 1300
+        320,    // 1200
+        320,    // 700
+        320,    // 700B
+        320,    // 700C
+    };
+
+    if (mode < 0 || mode > AVPRIV_CODEC2_MODE_MAX) {
+        av_log(logctx, AV_LOG_ERROR, "unknown codec2 mode %i, can't find frame_size\n", mode);
+        return 0;
+    } else {
+        return frame_size_table[mode];
+    }
+}
+
+int avpriv_codec2_mode_block_align(void *logctx, int mode)
+{
+    int block_align_table[AVPRIV_CODEC2_MODE_MAX+1] = {
+        8,      // 3200
+        6,      // 2400
+        8,      // 1600
+        7,      // 1400
+        7,      // 1300
+        6,      // 1200
+        4,      // 700
+        4,      // 700B
+        4,      // 700C
+    };
+
+    if (mode < 0 || mode > AVPRIV_CODEC2_MODE_MAX) {
+        av_log(logctx, AV_LOG_ERROR, "unknown codec2 mode %i, can't find block_align\n", mode);
+        return 0;
+    } else {
+        return block_align_table[mode];
+    }
+}
diff --git a/libavcodec/codec2utils.h b/libavcodec/codec2utils.h
new file mode 100644
index 0000000..6def4d4
--- /dev/null
+++ b/libavcodec/codec2utils.h
@@ -0,0 +1,82 @@
+/*
+ * codec2 utility functions
+ * Copyright (c) 2017 Tomas Härdin
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CODEC2UTILS_H
+#define AVCODEC_CODEC2UTILS_H
+
+#include <stdint.h>
+
+//Highest mode we're willing to use.
+//Don't want to let users accidentally produce files that can't be decoded in the future.
+//CODEC2_MODE_WB (9) is experimental/unstable as of 2017-11-23.
+#define AVPRIV_CODEC2_MODE_MAX 8 //CODEC2_MODE_700C
+
+//Used by both codec2raw demuxer and libcodec2 encoder.
+//The integers match the values in codec2.h, so "3200" -> CODEC2_MODE_3000 = 0 and so on.
+//It is possible that we're linked to a version of libcodec2 that lacks some of these modes.
+//For example Debian stretch ships with libcodec2.so.0.4 which lacks CODEC2_MODE_700C.
+#define AVPRIV_CODEC2_AVOPTIONS(desc, classname, min_val, default_val, option_flags) \
+    { "mode", desc, offsetof(classname, mode), AV_OPT_TYPE_INT, {.i64 = default_val}, min_val, AVPRIV_CODEC2_MODE_MAX, .flags=option_flags, .unit="codec2_mode"},\
+    { "3200", "3200", 0, AV_OPT_TYPE_CONST, {.i64 = 0}, .flags=option_flags, .unit="codec2_mode"},\
+    { "2400", "2400", 0, AV_OPT_TYPE_CONST, {.i64 = 1}, .flags=option_flags, .unit="codec2_mode"},\
+    { "1600", "1600", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, .flags=option_flags, .unit="codec2_mode"},\
+    { "1400", "1400", 0, AV_OPT_TYPE_CONST, {.i64 = 3}, .flags=option_flags, .unit="codec2_mode"},\
+    { "1300", "1300", 0, AV_OPT_TYPE_CONST, {.i64 = 4}, .flags=option_flags, .unit="codec2_mode"},\
+    { "1200", "1200", 0, AV_OPT_TYPE_CONST, {.i64 = 5}, .flags=option_flags, .unit="codec2_mode"},\
+    { "700",  "700",  0, AV_OPT_TYPE_CONST, {.i64 = 6}, .flags=option_flags, .unit="codec2_mode"},\
+    { "700B", "700B", 0, AV_OPT_TYPE_CONST, {.i64 = 7}, .flags=option_flags, .unit="codec2_mode"},\
+    { "700C", "700C", 0, AV_OPT_TYPE_CONST, {.i64 = 8}, .flags=option_flags, .unit="codec2_mode"}
+
+//The three following functions are here to avoid needing libavformat/codec2.c to depend on libcodec2
+
+//Computes bitrate from mode, with frames rounded up to the nearest octet.
+//So 700 bit/s (28 bits/frame) becomes 800 bits/s (32 bits/frame).
+//logctx is used for av_log()
+//Returns <0 if mode is invalid
+int avpriv_codec2_mode_bit_rate(void *logctx, int mode);
+
+//Mimics codec2_samples_per_frame()
+int avpriv_codec2_mode_frame_size(void *logctx, int mode);
+
+//Mimics (codec2_bits_per_frame()+7)/8
+int avpriv_codec2_mode_block_align(void *logctx, int mode);
+
+#define AVPRIV_CODEC2_EXTRADATA_SIZE 4
+
+//Used in codec2raw demuxer and libcodec2 encoder
+static inline void avpriv_codec2_make_extradata(uint8_t *ptr, int mode) {
+    //version 0.8 as of 2017-12-23 (r3386)
+    ptr[0] = 0;     //major
+    ptr[1] = 8;     //minor
+    ptr[2] = mode;  //mode
+    ptr[3] = 0;     //flags
+}
+
+//Returns version as a 16-bit value. 0.8 -> 0x0008
+static inline uint16_t avpriv_codec2_version_from_extradata(uint8_t *ptr) {
+    return (ptr[0] << 8) + ptr[1];
+}
+
+static inline uint8_t avpriv_codec2_mode_from_extradata(uint8_t *ptr) {
+    return ptr[2];
+}
+
+#endif /* AVCODEC_CODEC2UTILS_H */
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 5061b78..a3de8e1 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * This table was generated from the long and short names of AVCodecs
+ * please see the respective codec sources for authorship
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,9 +23,11 @@
 
 #include "libavutil/common.h"
 #include "libavutil/internal.h"
-
 #include "avcodec.h"
 #include "profiles.h"
+#include "version.h"
+
+#define MT(...) (const char *const[]){ __VA_ARGS__, NULL }
 
 static const AVCodecDescriptor codec_descriptors[] = {
     /* video codecs */
@@ -75,6 +80,8 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "mjpeg",
         .long_name = NULL_IF_CONFIG_SMALL("Motion JPEG"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .mime_types= MT("image/jpeg"),
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_mjpeg_profiles),
     },
     {
         .id        = AV_CODEC_ID_MJPEGB,
@@ -84,6 +91,28 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_LJPEG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "ljpeg",
+        .long_name = NULL_IF_CONFIG_SMALL("Lossless JPEG"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_SP5X,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "sp5x",
+        .long_name = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_JPEGLS,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "jpegls",
+        .long_name = NULL_IF_CONFIG_SMALL("JPEG-LS"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+                     AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_MPEG4,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "mpeg4",
@@ -429,6 +458,50 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_PNG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "png",
+        .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/png"),
+    },
+    {
+        .id        = AV_CODEC_ID_PPM,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "ppm",
+        .long_name = NULL_IF_CONFIG_SMALL("PPM (Portable PixelMap) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PBM,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "pbm",
+        .long_name = NULL_IF_CONFIG_SMALL("PBM (Portable BitMap) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PGM,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "pgm",
+        .long_name = NULL_IF_CONFIG_SMALL("PGM (Portable GrayMap) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PGMYUV,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "pgmyuv",
+        .long_name = NULL_IF_CONFIG_SMALL("PGMYUV (Portable GrayMap YUV) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PAM,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "pam",
+        .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-portable-pixmap"),
+    },
+    {
         .id        = AV_CODEC_ID_FFVHUFF,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "ffvhuff",
@@ -498,7 +571,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "fraps",
         .long_name = NULL_IF_CONFIG_SMALL("Fraps"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_TRUEMOTION2,
@@ -513,6 +586,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "bmp",
         .long_name = NULL_IF_CONFIG_SMALL("BMP (Windows and OS/2 bitmap)"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-ms-bmp"),
     },
     {
         .id        = AV_CODEC_ID_CSCD,
@@ -584,6 +658,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("JPEG 2000"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
                      AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/jp2"),
         .profiles  = NULL_IF_CONFIG_SMALL(ff_jpeg2000_profiles),
     },
     {
@@ -615,6 +690,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_TARGA,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "targa",
+        .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-targa", "image/x-tga"),
+    },
+    {
         .id        = AV_CODEC_ID_DSICINVIDEO,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "dsicinvideo",
@@ -629,6 +712,22 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_TIFF,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "tiff",
+        .long_name = NULL_IF_CONFIG_SMALL("TIFF image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/tiff"),
+    },
+    {
+        .id        = AV_CODEC_ID_GIF,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "gif",
+        .long_name = NULL_IF_CONFIG_SMALL("CompuServe GIF (Graphics Interchange Format)"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/gif"),
+    },
+    {
         .id        = AV_CODEC_ID_DXA,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "dxa",
@@ -641,6 +740,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "dnxhd",
         .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_dnxhd_profiles),
     },
     {
         .id        = AV_CODEC_ID_THP,
@@ -650,6 +750,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_SGI,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "sgi",
+        .long_name = NULL_IF_CONFIG_SMALL("SGI image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_C93,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "c93",
@@ -664,6 +771,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_PTX,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "ptx",
+        .long_name = NULL_IF_CONFIG_SMALL("V.Flash PTX image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_TXD,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "txd",
+        .long_name = NULL_IF_CONFIG_SMALL("Renderware TXD (TeXture Dictionary) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_VP6A,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "vp6a",
@@ -685,6 +806,21 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_PCX,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "pcx",
+        .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-pcx"),
+    },
+    {
+        .id        = AV_CODEC_ID_SUNRAST,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "sunrast",
+        .long_name = NULL_IF_CONFIG_SMALL("Sun Rasterfile image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_INDEO4,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "indeo4",
@@ -804,6 +940,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
+        .id        = AV_CODEC_ID_DPX,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "dpx",
+        .long_name = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_MAD,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "mad",
@@ -856,14 +999,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_IFF_ILBM,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "iff_ilbm",
-        .long_name = NULL_IF_CONFIG_SMALL("IFF ILBM"),
-        .props     = AV_CODEC_PROP_LOSSY,
-    },
-    {
-        .id        = AV_CODEC_ID_IFF_BYTERUN1,
-        .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "iff_byterun1",
-        .long_name = NULL_IF_CONFIG_SMALL("IFF ByteRun1"),
+        .long_name = NULL_IF_CONFIG_SMALL("IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -888,13 +1024,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_VP9,
-        .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "vp9",
-        .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
-        .props     = AV_CODEC_PROP_LOSSY,
-    },
-    {
         .id        = AV_CODEC_ID_PICTOR,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "pictor",
@@ -902,6 +1031,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_ANSI,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "ansi",
+        .long_name = NULL_IF_CONFIG_SMALL("ASCII/ANSI art"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_A64_MULTI,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "a64_multi",
@@ -923,20 +1059,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_MVC1,
-        .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "mvc1",
-        .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 1"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
-    },
-    {
-        .id        = AV_CODEC_ID_MVC2,
-        .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "mvc2",
-        .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 2"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
-    },
-    {
         .id        = AV_CODEC_ID_MXPEG,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "mxpeg",
@@ -956,6 +1078,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "prores",
         .long_name = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
     },
     {
         .id        = AV_CODEC_ID_JV,
@@ -972,6 +1095,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_WMV3IMAGE,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "wmv3image",
+        .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_VC1IMAGE,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "vc1image",
+        .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image v2"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_UTVIDEO,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "utvideo",
@@ -1007,11 +1144,27 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
+        .id        = AV_CODEC_ID_XWD,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xwd",
+        .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-xwindowdump"),
+    },
+    {
         .id        = AV_CODEC_ID_CDXL,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "cdxl",
         .long_name = NULL_IF_CONFIG_SMALL("Commodore CDXL video"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_XBM,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xbm",
+        .long_name = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-xbitmap"),
     },
     {
         .id        = AV_CODEC_ID_ZEROCODEC,
@@ -1063,6 +1216,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_VP9,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "vp9",
+        .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
+        .props     = AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
+    },
+    {
         .id        = AV_CODEC_ID_AIC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "aic",
@@ -1084,6 +1245,15 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_WEBP,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "webp",
+        .long_name = NULL_IF_CONFIG_SMALL("WebP"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+                     AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/webp"),
+    },
+    {
         .id        = AV_CODEC_ID_HNM4_VIDEO,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hnm4video",
@@ -1094,7 +1264,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_HEVC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hevc",
-        .long_name = NULL_IF_CONFIG_SMALL("HEVC (High Efficiency Video Coding)"),
+        .long_name = NULL_IF_CONFIG_SMALL("H.265 / HEVC (High Efficiency Video Coding)"),
         .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER,
         .profiles  = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
     },
@@ -1106,6 +1276,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_ALIAS_PIX,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "alias_pix",
+        .long_name = NULL_IF_CONFIG_SMALL("Alias/Wavefront PIX image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_BRENDER_PIX,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "brender_pix",
+        .long_name = NULL_IF_CONFIG_SMALL("BRender PIX image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_PAF_VIDEO,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "paf_video",
@@ -1113,6 +1297,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_EXR,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "exr",
+        .long_name = NULL_IF_CONFIG_SMALL("OpenEXR image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+                     AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_VP7,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "vp7",
@@ -1123,7 +1315,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_SANM,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "sanm",
-        .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM video"),
+        .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM/SMUSH video"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -1134,6 +1326,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
+        .id        = AV_CODEC_ID_MVC1,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "mvc1",
+        .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 1"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_MVC2,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "mvc2",
+        .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 2"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_HQX,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hqx",
@@ -1141,6 +1347,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_TDSC,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "tdsc",
+        .long_name = NULL_IF_CONFIG_SMALL("TDSC"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_HQ_HQA,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hq_hqa",
@@ -1151,10 +1364,18 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_HAP,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hap",
-        .long_name = NULL_IF_CONFIG_SMALL("Vidvox Hap decoder"),
+        .long_name = NULL_IF_CONFIG_SMALL("Vidvox Hap"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_DDS,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "dds",
+        .long_name = NULL_IF_CONFIG_SMALL("DirectDraw Surface image decoder"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+                     AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_DXV,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "dxv",
@@ -1169,13 +1390,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_WRAPPED_AVFRAME,
-        .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "wrapped_avframe",
-        .long_name = NULL_IF_CONFIG_SMALL("AVFrame to AVPacket passthrough"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
-    },
-    {
         .id        = AV_CODEC_ID_RSCC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "rscc",
@@ -1183,261 +1397,312 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_MAGICYUV,
+        .id        = AV_CODEC_ID_AVS2,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "magicyuv",
-        .long_name = NULL_IF_CONFIG_SMALL("MagicYUV video"),
+        .name      = "avs2",
+        .long_name = NULL_IF_CONFIG_SMALL("AVS2-P2/IEEE1857.4"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_Y41P,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "y41p",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_TRUEMOTION2RT,
+        .id        = AV_CODEC_ID_AVRP,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "truemotion2rt",
-        .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .name      = "avrp",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_AV1,
+        .id        = AV_CODEC_ID_012V,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "av1",
-        .long_name = NULL_IF_CONFIG_SMALL("Alliance for Open Media AV1"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .name      = "012v",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PIXLET,
+        .id        = AV_CODEC_ID_AVUI,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "pixlet",
-        .long_name = NULL_IF_CONFIG_SMALL("Apple Pixlet"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .name      = "avui",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_FMVC,
+        .id        = AV_CODEC_ID_AYUV,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "fmvc",
-        .long_name = NULL_IF_CONFIG_SMALL("FM Screen Capture Codec"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
+        .name      = "ayuv",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_CLEARVIDEO,
+        .id        = AV_CODEC_ID_TARGA_Y216,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "clearvideo",
-        .long_name = NULL_IF_CONFIG_SMALL("Iterated Systems ClearVideo"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .name      = "targa_y216",
+        .long_name = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
-
-    /* image codecs */
     {
-        .id        = AV_CODEC_ID_ALIAS_PIX,
+        .id        = AV_CODEC_ID_V308,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "alias_pix",
-        .long_name = NULL_IF_CONFIG_SMALL("Alias/Wavefront PIX image"),
+        .name      = "v308",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_ANSI,
+        .id        = AV_CODEC_ID_V408,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "ansi",
-        .long_name = NULL_IF_CONFIG_SMALL("ASCII/ANSI art"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .name      = "v408",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_BRENDER_PIX,
+        .id        = AV_CODEC_ID_YUV4,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "brender_pix",
-        .long_name = NULL_IF_CONFIG_SMALL("BRender PIX image"),
+        .name      = "yuv4",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_DDS,
+        .id        = AV_CODEC_ID_AVRN,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "dds",
-        .long_name = NULL_IF_CONFIG_SMALL("DirectDraw Surface image decoder"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
-                     AV_CODEC_PROP_LOSSLESS,
+        .name      = "avrn",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
     },
     {
-        .id        = AV_CODEC_ID_DPX,
+        .id        = AV_CODEC_ID_CPIA,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "dpx",
-        .long_name = NULL_IF_CONFIG_SMALL("DPX image"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .name      = "cpia",
+        .long_name = NULL_IF_CONFIG_SMALL("CPiA video format"),
     },
     {
-        .id        = AV_CODEC_ID_EXR,
+        .id        = AV_CODEC_ID_XFACE,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "exr",
-        .long_name = NULL_IF_CONFIG_SMALL("OpenEXR image"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
-                     AV_CODEC_PROP_LOSSLESS,
+        .name      = "xface",
+        .long_name = NULL_IF_CONFIG_SMALL("X-face image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_GIF,
+        .id        = AV_CODEC_ID_SNOW,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "gif",
-        .long_name = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
+        .name      = "snow",
+        .long_name = NULL_IF_CONFIG_SMALL("Snow"),
+        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_SMVJPEG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "smvjpeg",
+        .long_name = NULL_IF_CONFIG_SMALL("Sigmatel Motion Video"),
+    },
+    {
+        .id        = AV_CODEC_ID_APNG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "apng",
+        .long_name = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
         .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/png"),
     },
     {
-        .id        = AV_CODEC_ID_JPEGLS,
+        .id        = AV_CODEC_ID_DAALA,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "jpegls",
-        .long_name = NULL_IF_CONFIG_SMALL("JPEG-LS"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
-                     AV_CODEC_PROP_LOSSLESS,
+        .name      = "daala",
+        .long_name = NULL_IF_CONFIG_SMALL("Daala"),
+        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_LJPEG,
+        .id        = AV_CODEC_ID_CFHD,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "ljpeg",
-        .long_name = NULL_IF_CONFIG_SMALL("Lossless JPEG"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .name      = "cfhd",
+        .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_PAM,
+        .id        = AV_CODEC_ID_TRUEMOTION2RT,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "pam",
-        .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"),
+        .name      = "truemotion2rt",
+        .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_M101,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "m101",
+        .long_name = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PBM,
+        .id        = AV_CODEC_ID_MAGICYUV,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "pbm",
-        .long_name = NULL_IF_CONFIG_SMALL("PBM (Portable BitMap) image"),
+        .name      = "magicyuv",
+        .long_name = NULL_IF_CONFIG_SMALL("MagicYUV video"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PCX,
+        .id        = AV_CODEC_ID_SHEERVIDEO,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "pcx",
-        .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"),
+        .name      = "sheervideo",
+        .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PGM,
+        .id        = AV_CODEC_ID_YLC,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "pgm",
-        .long_name = NULL_IF_CONFIG_SMALL("PGM (Portable GrayMap) image"),
+        .name      = "ylc",
+        .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PGMYUV,
+        .id        = AV_CODEC_ID_PSD,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "pgmyuv",
-        .long_name = NULL_IF_CONFIG_SMALL("PGMYUV (Portable GrayMap YUV) image"),
+        .name      = "psd",
+        .long_name = NULL_IF_CONFIG_SMALL("Photoshop PSD file"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PNG,
+        .id        = AV_CODEC_ID_PIXLET,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "png",
-        .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
+        .name      = "pixlet",
+        .long_name = NULL_IF_CONFIG_SMALL("Apple Pixlet"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SPEEDHQ,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "speedhq",
+        .long_name = NULL_IF_CONFIG_SMALL("NewTek SpeedHQ"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_FMVC,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "fmvc",
+        .long_name = NULL_IF_CONFIG_SMALL("FM Screen Capture Codec"),
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_PPM,
+        .id        = AV_CODEC_ID_SCPR,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "ppm",
-        .long_name = NULL_IF_CONFIG_SMALL("PPM (Portable PixelMap) image"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .name      = "scpr",
+        .long_name = NULL_IF_CONFIG_SMALL("ScreenPressor"),
+        .props     = AV_CODEC_PROP_LOSSLESS | AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_PTX,
+        .id        = AV_CODEC_ID_CLEARVIDEO,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "ptx",
-        .long_name = NULL_IF_CONFIG_SMALL("V.Flash PTX image"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .name      = "clearvideo",
+        .long_name = NULL_IF_CONFIG_SMALL("Iterated Systems ClearVideo"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_SGI,
+        .id        = AV_CODEC_ID_XPM,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "sgi",
-        .long_name = NULL_IF_CONFIG_SMALL("SGI image"),
+        .name      = "xpm",
+        .long_name = NULL_IF_CONFIG_SMALL("XPM (X PixMap) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-xpixmap"),
     },
     {
-        .id        = AV_CODEC_ID_SP5X,
+        .id        = AV_CODEC_ID_AV1,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "sp5x",
-        .long_name = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .name      = "av1",
+        .long_name = NULL_IF_CONFIG_SMALL("Alliance for Open Media AV1"),
+        .props     = AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
     },
     {
-        .id        = AV_CODEC_ID_SUNRAST,
+        .id        = AV_CODEC_ID_BITPACKED,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "sunrast",
-        .long_name = NULL_IF_CONFIG_SMALL("Sun Rasterfile image"),
+        .name      = "bitpacked",
+        .long_name = NULL_IF_CONFIG_SMALL("Bitpacked"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_TARGA,
+        .id        = AV_CODEC_ID_MSCC,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "targa",
-        .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"),
+        .name      = "mscc",
+        .long_name = NULL_IF_CONFIG_SMALL("Mandsoft Screen Capture Codec"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_TDSC,
+        .id        = AV_CODEC_ID_SRGC,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "tdsc",
-        .long_name = NULL_IF_CONFIG_SMALL("TDSC"),
+        .name      = "srgc",
+        .long_name = NULL_IF_CONFIG_SMALL("Screen Recorder Gold Codec"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_SVG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "svg",
+        .long_name = NULL_IF_CONFIG_SMALL("Scalable Vector Graphics"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/svg+xml"),
+    },
+    {
+        .id        = AV_CODEC_ID_GDV,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "gdv",
+        .long_name = NULL_IF_CONFIG_SMALL("Gremlin Digital Video"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_TIFF,
+        .id        = AV_CODEC_ID_FITS,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "tiff",
-        .long_name = NULL_IF_CONFIG_SMALL("TIFF image"),
+        .name      = "fits",
+        .long_name = NULL_IF_CONFIG_SMALL("FITS (Flexible Image Transport System)"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_TXD,
+        .id        = AV_CODEC_ID_IMM4,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "txd",
-        .long_name = NULL_IF_CONFIG_SMALL("Renderware TXD (TeXture Dictionary) image"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .name      = "imm4",
+        .long_name = NULL_IF_CONFIG_SMALL("Infinity IMM4"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_VC1IMAGE,
+        .id        = AV_CODEC_ID_PROSUMER,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "vc1image",
-        .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image v2"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .name      = "prosumer",
+        .long_name = NULL_IF_CONFIG_SMALL("Brooktree ProSumer Video"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_WEBP,
+        .id        = AV_CODEC_ID_MWSC,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "webp",
-        .long_name = NULL_IF_CONFIG_SMALL("WebP"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
-                     AV_CODEC_PROP_LOSSLESS,
+        .name      = "mwsc",
+        .long_name = NULL_IF_CONFIG_SMALL("MatchWare Screen Capture Codec"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_WMV3IMAGE,
+        .id        = AV_CODEC_ID_WCMV,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "wmv3image",
-        .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .name      = "wcmv",
+        .long_name = NULL_IF_CONFIG_SMALL("WinCAM Motion Video"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_XBM,
+        .id        = AV_CODEC_ID_RASC,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "xbm",
-        .long_name = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
-        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .name      = "rasc",
+        .long_name = NULL_IF_CONFIG_SMALL("RemotelyAnywhere Screen Capture"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_XWD,
+        .id        = AV_CODEC_ID_HYMT,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "xwd",
-        .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"),
+        .name      = "hymt",
+        .long_name = NULL_IF_CONFIG_SMALL("HuffYUV MT"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
-        .id        = AV_CODEC_ID_CFHD,
+        .id        = AV_CODEC_ID_ARBC,
         .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "cfhd",
-        .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
+        .name      = "arbc",
+        .long_name = NULL_IF_CONFIG_SMALL("Gryphon's Anim Compressor"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
 
@@ -1488,13 +1753,15 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_PCM_MULAW,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_mulaw",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law / G.711 mu-law"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_PCM_ALAW,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_alaw",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM A-law"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM A-law / G.711 A-law"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_PCM_S32LE,
@@ -1567,31 +1834,10 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_PCM_S16BE_PLANAR,
-        .type      = AVMEDIA_TYPE_AUDIO,
-        .name      = "pcm_s16be_planar",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit big-endian planar"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
-    },
-    {
         .id        = AV_CODEC_ID_PCM_S16LE_PLANAR,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_s16le_planar",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM 16-bit little-endian planar"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
-    },
-    {
-        .id        = AV_CODEC_ID_PCM_S24LE_PLANAR,
-        .type      = AVMEDIA_TYPE_AUDIO,
-        .name      = "pcm_s24le_planar",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit little-endian planar"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
-    },
-    {
-        .id        = AV_CODEC_ID_PCM_S32LE_PLANAR,
-        .type      = AVMEDIA_TYPE_AUDIO,
-        .name      = "pcm_s32le_planar",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit little-endian planar"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit little-endian planar"),
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
@@ -1648,7 +1894,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "s302m",
         .long_name = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_PCM_S8_PLANAR,
@@ -1657,6 +1903,62 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("PCM signed 8-bit planar"),
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
+    {
+        .id        = AV_CODEC_ID_PCM_S24LE_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_s24le_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit little-endian planar"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_S32LE_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_s32le_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit little-endian planar"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_S16BE_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_s16be_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit big-endian planar"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_S64LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_s64le",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit little-endian"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_S64BE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_s64be",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit big-endian"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_F16LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_f16le",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM 16.8 floating point little-endian"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_F24LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_f24le",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM 24.0 floating point little-endian"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_PCM_VIDC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "pcm_vidc",
+        .long_name = NULL_IF_CONFIG_SMALL("PCM Archimedes VIDC"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* various ADPCM codecs */
     {
@@ -1789,7 +2091,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_ADPCM_THP,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "adpcm_thp",
-        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube THP"),
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -1876,6 +2178,76 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("LucasArts VIMA audio"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_ADPCM_AFC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_afc",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube AFC"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_OKI,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_oki",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Dialogic OKI"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_DTK,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_dtk",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube DTK"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_RAD,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_rad",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Radical"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_G726LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_g726le",
+        .long_name = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_THP_LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_thp_le",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP (Little-Endian)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_PSX,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_psx",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Playstation"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_AICA,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_aica",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Yamaha AICA"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_DAT4,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_dat4",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Eurocom DAT4"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_MTAF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_mtaf",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* AMR */
     {
@@ -1938,6 +2310,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("DPCM Sol"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_SDX2_DPCM,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sdx2_dpcm",
+        .long_name = NULL_IF_CONFIG_SMALL("DPCM Squareroot-Delta-Exact"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_GREMLIN_DPCM,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "gremlin_dpcm",
+        .long_name = NULL_IF_CONFIG_SMALL("DPCM Gremlin"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* audio codecs */
     {
@@ -2031,7 +2417,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "flac",
         .long_name = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_MP3ADU,
@@ -2059,7 +2445,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "alac",
         .long_name = NULL_IF_CONFIG_SMALL("ALAC (Apple Lossless Audio Codec)"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_WESTWOOD_SND1,
@@ -2101,7 +2487,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "tta",
         .long_name = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_SMACKAUDIO,
@@ -2122,7 +2508,8 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "wavpack",
         .long_name = NULL_IF_CONFIG_SMALL("WavPack"),
-        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_INTRA_ONLY |
+                     AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_DSICINAUDIO,
@@ -2291,6 +2678,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "aac_latm",
         .long_name = NULL_IF_CONFIG_SMALL("AAC LATM (Advanced Audio Coding LATM syntax)"),
         .props     = AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
     },
     {
         .id        = AV_CODEC_ID_QDMC,
@@ -2314,13 +2702,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
-        .id        = AV_CODEC_ID_DSS_SP,
-        .type      = AVMEDIA_TYPE_AUDIO,
-        .name      = "dss_sp",
-        .long_name = NULL_IF_CONFIG_SMALL("Digital Speech Standard - Standard Play mode (DSS SP)"),
-        .props     = AV_CODEC_PROP_LOSSY,
-    },
-    {
         .id        = AV_CODEC_ID_G729,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "g729",
@@ -2388,7 +2769,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "tak",
         .long_name = NULL_IF_CONFIG_SMALL("TAK (Tom's lossless Audio Kompressor)"),
-        .props     = AV_CODEC_PROP_LOSSLESS,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_METASOUND,
@@ -2411,6 +2792,171 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("On2 Audio for Video Codec"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_DSS_SP,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dss_sp",
+        .long_name = NULL_IF_CONFIG_SMALL("Digital Speech Standard - Standard Play mode (DSS SP)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_CODEC2,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "codec2",
+        .long_name = NULL_IF_CONFIG_SMALL("codec2 (very low bitrate speech codec)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_FFWAVESYNTH,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "wavesynth",
+        .long_name = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+    },
+    {
+        .id        = AV_CODEC_ID_SONIC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sonic",
+        .long_name = NULL_IF_CONFIG_SMALL("Sonic"),
+    },
+    {
+        .id        = AV_CODEC_ID_SONIC_LS,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sonicls",
+        .long_name = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+    },
+    {
+        .id        = AV_CODEC_ID_EVRC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "evrc",
+        .long_name = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SMV,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "smv",
+        .long_name = NULL_IF_CONFIG_SMALL("SMV (Selectable Mode Vocoder)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_LSBF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_lsbf",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_MSBF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_msbf",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_LSBF_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_lsbf_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first, planar"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_MSBF_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_msbf_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first, planar"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_4GV,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "4gv",
+        .long_name = NULL_IF_CONFIG_SMALL("4GV (Fourth Generation Vocoder)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_INTERPLAY_ACM,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "interplayacm",
+        .long_name = NULL_IF_CONFIG_SMALL("Interplay ACM"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_XMA1,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "xma1",
+        .long_name = NULL_IF_CONFIG_SMALL("Xbox Media Audio 1"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_XMA2,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "xma2",
+        .long_name = NULL_IF_CONFIG_SMALL("Xbox Media Audio 2"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DST,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dst",
+        .long_name = NULL_IF_CONFIG_SMALL("DST (Direct Stream Transfer)"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_ATRAC3AL,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "atrac3al",
+        .long_name = NULL_IF_CONFIG_SMALL("ATRAC3 AL (Adaptive TRansform Acoustic Coding 3 Advanced Lossless)"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_ATRAC3PAL,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "atrac3pal",
+        .long_name = NULL_IF_CONFIG_SMALL("ATRAC3+ AL (Adaptive TRansform Acoustic Coding 3+ Advanced Lossless)"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_DOLBY_E,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dolby_e",
+        .long_name = NULL_IF_CONFIG_SMALL("Dolby E"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_APTX,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "aptx",
+        .long_name = NULL_IF_CONFIG_SMALL("aptX (Audio Processing Technology for Bluetooth)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_APTX_HD,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "aptx_hd",
+        .long_name = NULL_IF_CONFIG_SMALL("aptX HD (Audio Processing Technology for Bluetooth)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SBC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sbc",
+        .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband codec)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ATRAC9,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "atrac9",
+        .long_name = NULL_IF_CONFIG_SMALL("ATRAC9 (Adaptive TRansform Acoustic Coding 9)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_HCOM,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "hcom",
+        .long_name = NULL_IF_CONFIG_SMALL("HCOM Audio"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* subtitle codecs */
     {
@@ -2418,42 +2964,49 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "dvd_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("DVD subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_DVB_SUBTITLE,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "dvb_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("DVB subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_TEXT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "text",
         .long_name = NULL_IF_CONFIG_SMALL("raw UTF-8 text"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_XSUB,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "xsub",
         .long_name = NULL_IF_CONFIG_SMALL("XSUB"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_SSA,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "ssa",
-        .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) / ASS (Advanced SSA) subtitle"),
+        .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_MOV_TEXT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "mov_text",
         .long_name = NULL_IF_CONFIG_SMALL("MOV text"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_HDMV_PGS_SUBTITLE,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "hdmv_pgs_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_DVB_TELETEXT,
@@ -2465,18 +3018,218 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_SRT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "srt",
-        .long_name = NULL_IF_CONFIG_SMALL("SubRip Text"),
+        .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle with embedded timing"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_MICRODVD,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "microdvd",
+        .long_name = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_EIA_608,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "eia_608",
+        .long_name = NULL_IF_CONFIG_SMALL("EIA-608 closed captions"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_JACOSUB,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "jacosub",
+        .long_name = NULL_IF_CONFIG_SMALL("JACOsub subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SAMI,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "sami",
+        .long_name = NULL_IF_CONFIG_SMALL("SAMI subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_REALTEXT,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "realtext",
+        .long_name = NULL_IF_CONFIG_SMALL("RealText subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_STL,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "stl",
+        .long_name = NULL_IF_CONFIG_SMALL("Spruce subtitle format"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBVIEWER1,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subviewer1",
+        .long_name = NULL_IF_CONFIG_SMALL("SubViewer v1 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBVIEWER,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subviewer",
+        .long_name = NULL_IF_CONFIG_SMALL("SubViewer subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBRIP,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subrip",
+        .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_WEBVTT,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "webvtt",
+        .long_name = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_MPL2,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "mpl2",
+        .long_name = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_VPLAYER,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "vplayer",
+        .long_name = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_PJS,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "pjs",
+        .long_name = NULL_IF_CONFIG_SMALL("PJS (Phoenix Japanimation Society) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_ASS,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "ass",
+        .long_name = NULL_IF_CONFIG_SMALL("ASS (Advanced SSA) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "hdmv_text_subtitle",
+        .long_name = NULL_IF_CONFIG_SMALL("HDMV Text subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_TTML,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "ttml",
+        .long_name = NULL_IF_CONFIG_SMALL("Timed Text Markup Language"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_ARIB_CAPTION,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "arib_caption",
+        .long_name = NULL_IF_CONFIG_SMALL("ARIB STD-B24 caption"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_arib_caption_profiles),
+    },
+
+    /* other kind of codecs and pseudo-codecs */
+    {
+        .id        = AV_CODEC_ID_TTF,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "ttf",
+        .long_name = NULL_IF_CONFIG_SMALL("TrueType font"),
+        .mime_types= MT("application/x-truetype-font", "application/x-font"),
+    },
+    {
+        .id        = AV_CODEC_ID_SCTE_35,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "scte_35",
+        .long_name = NULL_IF_CONFIG_SMALL("SCTE 35 Message Queue"),
+    },
+    {
+        .id        = AV_CODEC_ID_BINTEXT,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "bintext",
+        .long_name = NULL_IF_CONFIG_SMALL("Binary text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_XBIN,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xbin",
+        .long_name = NULL_IF_CONFIG_SMALL("eXtended BINary text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_IDF,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "idf",
+        .long_name = NULL_IF_CONFIG_SMALL("iCEDraw text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_OTF,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "otf",
+        .long_name = NULL_IF_CONFIG_SMALL("OpenType font"),
+        .mime_types= MT("application/vnd.ms-opentype"),
+    },
+    {
+        .id        = AV_CODEC_ID_SMPTE_KLV,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "klv",
+        .long_name = NULL_IF_CONFIG_SMALL("SMPTE 336M Key-Length-Value (KLV) metadata"),
+    },
+    {
+        .id        = AV_CODEC_ID_DVD_NAV,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "dvd_nav_packet",
+        .long_name = NULL_IF_CONFIG_SMALL("DVD Nav packet"),
+    },
+    {
+        .id        = AV_CODEC_ID_TIMED_ID3,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "timed_id3",
+        .long_name = NULL_IF_CONFIG_SMALL("timed ID3 metadata"),
+    },
+    {
+        .id        = AV_CODEC_ID_BIN_DATA,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "bin_data",
+        .long_name = NULL_IF_CONFIG_SMALL("binary data"),
+        .mime_types= MT("application/octet-stream"),
+    },
+    {
+        .id        = AV_CODEC_ID_WRAPPED_AVFRAME,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "wrapped_avframe",
+        .long_name = NULL_IF_CONFIG_SMALL("AVFrame to AVPacket passthrough"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
     },
 };
 
-const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id)
+static int descriptor_compare(const void *key, const void *member)
 {
-    int i;
+    enum AVCodecID id = *(const enum AVCodecID *) key;
+    const AVCodecDescriptor *desc = member;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(codec_descriptors); i++)
-        if (codec_descriptors[i].id == id)
-            return &codec_descriptors[i];
-    return NULL;
+    return id - desc->id;
+}
+
+const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id)
+{
+    return bsearch(&id, codec_descriptors, FF_ARRAY_ELEMS(codec_descriptors),
+                   sizeof(codec_descriptors[0]), descriptor_compare);
 }
 
 const AVCodecDescriptor *avcodec_descriptor_next(const AVCodecDescriptor *prev)
diff --git a/libavcodec/cook.c b/libavcodec/cook.c
index bd8ad1d..c5f68c9 100644
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Sascha Sommer
  * Copyright (c) 2005 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,12 +47,12 @@
 
 #include "audiodsp.h"
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "bytestream.h"
 #include "fft.h"
 #include "internal.h"
 #include "sinewin.h"
-#include "vlc.h"
+#include "unary.h"
 
 #include "cookdata.h"
 
@@ -125,7 +125,7 @@ typedef struct cook {
 
     AVCodecContext*     avctx;
     AudioDSPContext     adsp;
-    BitstreamContext    bc;
+    GetBitContext       gb;
     /* stream data */
     int                 num_vectors;
     int                 samples_per_channel;
@@ -166,10 +166,17 @@ static float rootpow2tab[127];
 /* table generator */
 static av_cold void init_pow2table(void)
 {
+    /* fast way of computing 2^i and 2^(0.5*i) for -63 <= i < 64 */
     int i;
+    static const float exp2_tab[2] = {1, M_SQRT2};
+    float exp2_val = powf(2, -63);
+    float root_val = powf(2, -32);
     for (i = -63; i < 64; i++) {
-        pow2tab[63 + i] = pow(2, i);
-        rootpow2tab[63 + i] = sqrt(pow(2, i));
+        if (!(i & 1))
+            root_val *= 2;
+        pow2tab[63 + i] = exp2_val;
+        rootpow2tab[63 + i] = root_val * exp2_tab[i & 1];
+        exp2_val *= 2;
     }
 }
 
@@ -220,7 +227,7 @@ static av_cold int init_cook_mlt(COOKContext *q)
     int j, ret;
     int mlt_size = q->samples_per_channel;
 
-    if ((q->mlt_window = av_malloc(mlt_size * sizeof(*q->mlt_window))) == 0)
+    if ((q->mlt_window = av_malloc_array(mlt_size, sizeof(*q->mlt_window))) == 0)
         return AVERROR(ENOMEM);
 
     /* Initialize the MLT window: simple sine window. */
@@ -230,7 +237,7 @@ static av_cold int init_cook_mlt(COOKContext *q)
 
     /* Initialize the MDCT. */
     if ((ret = ff_mdct_init(&q->mdct_ctx, av_log2(mlt_size) + 1, 1, 1.0 / 32768.0))) {
-        av_free(q->mlt_window);
+        av_freep(&q->mlt_window);
         return ret;
     }
     av_log(q->avctx, AV_LOG_DEBUG, "MDCT initialized, order = %d.\n",
@@ -304,8 +311,8 @@ static av_cold int cook_decode_close(AVCodecContext *avctx)
     av_log(avctx, AV_LOG_DEBUG, "Deallocating memory.\n");
 
     /* Free allocated memory buffers. */
-    av_free(q->mlt_window);
-    av_free(q->decoded_bytes_buffer);
+    av_freep(&q->mlt_window);
+    av_freep(&q->decoded_bytes_buffer);
 
     /* Free the transform. */
     ff_mdct_end(&q->mdct_ctx);
@@ -326,23 +333,19 @@ static av_cold int cook_decode_close(AVCodecContext *avctx)
 /**
  * Fill the gain array for the timedomain quantization.
  *
- * @param bc          pointer to the BitstreamContext
+ * @param gb          pointer to the GetBitContext
  * @param gaininfo    array[9] of gain indexes
  */
-static void decode_gain_info(BitstreamContext *bc, int *gaininfo)
+static void decode_gain_info(GetBitContext *gb, int *gaininfo)
 {
     int i, n;
 
-    while (bitstream_read_bit(bc)) {
-        /* NOTHING */
-    }
-
-    n = bitstream_tell(bc) - 1;     // amount of elements * 2 to update
+    n = get_unary(gb, 0, get_bits_left(gb));     // amount of elements*2 to update
 
     i = 0;
     while (n--) {
-        int index = bitstream_read(bc, 3);
-        int gain = bitstream_read_bit(bc) ? bitstream_read(bc, 4) - 7 : -1;
+        int index = get_bits(gb, 3);
+        int gain = get_bits1(gb) ? get_bits(gb, 4) - 7 : -1;
 
         while (i <= index)
             gaininfo[i++] = gain;
@@ -362,7 +365,7 @@ static int decode_envelope(COOKContext *q, COOKSubpacket *p,
 {
     int i, j, vlc_index;
 
-    quant_index_table[0] = bitstream_read(&q->bc, 6) - 6; // This is used later in categorize
+    quant_index_table[0] = get_bits(&q->gb, 6) - 6; // This is used later in categorize
 
     for (i = 1; i < p->total_subbands; i++) {
         vlc_index = i;
@@ -376,8 +379,8 @@ static int decode_envelope(COOKContext *q, COOKSubpacket *p,
         if (vlc_index > 13)
             vlc_index = 13; // the VLC tables >13 are identical to No. 13
 
-        j = bitstream_read_vlc(&q->bc, q->envelope_quant_index[vlc_index - 1].table,
-                               q->envelope_quant_index[vlc_index - 1].bits, 2);
+        j = get_vlc2(&q->gb, q->envelope_quant_index[vlc_index - 1].table,
+                     q->envelope_quant_index[vlc_index - 1].bits, 2);
         quant_index_table[i] = quant_index_table[i - 1] + j - 12; // differential encoding
         if (quant_index_table[i] > 63 || quant_index_table[i] < -63) {
             av_log(q->avctx, AV_LOG_ERROR,
@@ -398,7 +401,7 @@ static int decode_envelope(COOKContext *q, COOKSubpacket *p,
  * @param category              pointer to the category array
  * @param category_index        pointer to the category_index array
  */
-static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
+static void categorize(COOKContext *q, COOKSubpacket *p, const int *quant_index_table,
                        int *category, int *category_index)
 {
     int exp_idx, bias, tmpbias1, tmpbias2, bits_left, num_bits, index, v, i, j;
@@ -409,7 +412,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
     int tmp_categorize_array1_idx = p->numvector_size;
     int tmp_categorize_array2_idx = p->numvector_size;
 
-    bits_left = p->bits_per_subpacket - bitstream_tell(&q->bc);
+    bits_left = p->bits_per_subpacket - get_bits_count(&q->gb);
 
     if (bits_left > q->samples_per_channel)
         bits_left = q->samples_per_channel +
@@ -422,7 +425,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
         num_bits = 0;
         index    = 0;
         for (j = p->total_subbands; j > 0; j--) {
-            exp_idx = av_clip((i - quant_index_table[index] + bias) / 2, 0, 7);
+            exp_idx = av_clip_uintp2((i - quant_index_table[index] + bias) / 2, 3);
             index++;
             num_bits += expbits_tab[exp_idx];
         }
@@ -433,7 +436,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
     /* Calculate total number of bits. */
     num_bits = 0;
     for (i = 0; i < p->total_subbands; i++) {
-        exp_idx = av_clip((bias - quant_index_table[i]) / 2, 0, 7);
+        exp_idx = av_clip_uintp2((bias - quant_index_table[i]) / 2, 3);
         num_bits += expbits_tab[exp_idx];
         exp_index1[i] = exp_idx;
         exp_index2[i] = exp_idx;
@@ -555,8 +558,8 @@ static int unpack_SQVH(COOKContext *q, COOKSubpacket *p, int category,
     vd = vd_tab[category];
     result = 0;
     for (i = 0; i < vpr_tab[category]; i++) {
-        vlc = bitstream_read_vlc(&q->bc, q->sqvh[category].table, q->sqvh[category].bits, 3);
-        if (p->bits_per_subpacket < bitstream_tell(&q->bc)) {
+        vlc = get_vlc2(&q->gb, q->sqvh[category].table, q->sqvh[category].bits, 3);
+        if (p->bits_per_subpacket < get_bits_count(&q->gb)) {
             vlc = 0;
             result = 1;
         }
@@ -567,8 +570,8 @@ static int unpack_SQVH(COOKContext *q, COOKSubpacket *p, int category,
         }
         for (j = 0; j < vd; j++) {
             if (subband_coef_index[i * vd + j]) {
-                if (bitstream_tell(&q->bc) < p->bits_per_subpacket) {
-                    subband_coef_sign[i * vd + j] = bitstream_read_bit(&q->bc);
+                if (get_bits_count(&q->gb) < p->bits_per_subpacket) {
+                    subband_coef_sign[i * vd + j] = get_bits1(&q->gb);
                 } else {
                     result = 1;
                     subband_coef_sign[i * vd + j] = 0;
@@ -631,13 +634,17 @@ static int mono_decode(COOKContext *q, COOKSubpacket *p, float *mlt_buffer)
     int category_index[128] = { 0 };
     int category[128]       = { 0 };
     int quant_index_table[102];
-    int res;
+    int res, i;
 
     if ((res = decode_envelope(q, p, quant_index_table)) < 0)
         return res;
-    q->num_vectors = bitstream_read(&q->bc, p->log2_numvector_size);
+    q->num_vectors = get_bits(&q->gb, p->log2_numvector_size);
     categorize(q, p, quant_index_table, category, category_index);
     expand_category(q, category, category_index);
+    for (i=0; i<p->total_subbands; i++) {
+        if (category[i] > 7)
+            return AVERROR_INVALIDDATA;
+    }
     decode_vectors(q, p, category, quant_index_table, mlt_buffer);
 
     return 0;
@@ -737,29 +744,35 @@ static void imlt_gain(COOKContext *q, float *inbuffer,
  * @param q                 pointer to the COOKContext
  * @param decouple_tab      decoupling array
  */
-static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
+static int decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
 {
     int i;
-    int vlc    = bitstream_read_bit(&q->bc);
+    int vlc    = get_bits1(&q->gb);
     int start  = cplband[p->js_subband_start];
     int end    = cplband[p->subbands - 1];
     int length = end - start + 1;
 
     if (start > end)
-        return;
+        return 0;
 
     if (vlc)
         for (i = 0; i < length; i++)
-            decouple_tab[start + i] =
-                bitstream_read_vlc(&q->bc,
-                                   p->channel_coupling.table,
-                                   p->channel_coupling.bits, 2);
+            decouple_tab[start + i] = get_vlc2(&q->gb,
+                                               p->channel_coupling.table,
+                                               p->channel_coupling.bits, 2);
     else
-        for (i = 0; i < length; i++)
-            decouple_tab[start + i] = bitstream_read(&q->bc, p->js_vlc_bits);
+        for (i = 0; i < length; i++) {
+            int v = get_bits(&q->gb, p->js_vlc_bits);
+            if (v == (1<<p->js_vlc_bits)-1) {
+                av_log(q->avctx, AV_LOG_ERROR, "decouple value too large\n");
+                return AVERROR_INVALIDDATA;
+            }
+            decouple_tab[start + i] = v;
+        }
+    return 0;
 }
 
-/*
+/**
  * function decouples a pair of signals from a single signal via multiplication.
  *
  * @param q                 pointer to the COOKContext
@@ -807,10 +820,10 @@ static int joint_decode(COOKContext *q, COOKSubpacket *p,
     /* Make sure the buffers are zeroed out. */
     memset(mlt_buffer_left,  0, 1024 * sizeof(*mlt_buffer_left));
     memset(mlt_buffer_right, 0, 1024 * sizeof(*mlt_buffer_right));
-    decouple_info(q, p, decouple_tab);
+    if ((res = decouple_info(q, p, decouple_tab)) < 0)
+        return res;
     if ((res = mono_decode(q, p, decode_buffer)) < 0)
         return res;
-
     /* The two channels are stored interleaved in decode_buffer. */
     for (i = 0; i < p->js_subband_start; i++) {
         for (j = 0; j < SUBBAND_SIZE; j++) {
@@ -852,9 +865,9 @@ static inline void decode_bytes_and_gain(COOKContext *q, COOKSubpacket *p,
 
     offset = decode_bytes(inbuffer, q->decoded_bytes_buffer,
                           p->bits_per_subpacket / 8);
-    bitstream_init(&q->bc, q->decoded_bytes_buffer + offset,
-                   p->bits_per_subpacket);
-    decode_gain_info(&q->bc, gains_ptr->now);
+    init_get_bits(&q->gb, q->decoded_bytes_buffer + offset,
+                  p->bits_per_subpacket);
+    decode_gain_info(&q->gb, gains_ptr->now);
 
     /* Swap current and previous gains */
     FFSWAP(int *, gains_ptr->now, gains_ptr->previous);
@@ -929,7 +942,7 @@ static int decode_subpacket(COOKContext *q, COOKSubpacket *p,
                           p->mono_previous_buffer1,
                           outbuffer ? outbuffer[p->ch_idx] : NULL);
 
-    if (p->num_channels == 2)
+    if (p->num_channels == 2) {
         if (p->joint_stereo)
             mlt_compensate_output(q, q->decode_buffer_2, &p->gains1,
                                   p->mono_previous_buffer2,
@@ -938,6 +951,7 @@ static int decode_subpacket(COOKContext *q, COOKSubpacket *p,
             mlt_compensate_output(q, q->decode_buffer_2, &p->gains2,
                                   p->mono_previous_buffer2,
                                   outbuffer ? outbuffer[p->ch_idx + 1] : NULL);
+    }
 
     return 0;
 }
@@ -961,10 +975,8 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     if (q->discarded_packets >= 2) {
         frame->nb_samples = q->samples_per_channel;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
         samples = (float **)frame->extended_data;
     }
 
@@ -996,7 +1008,7 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
         offset += q->subpacket[i].size;
         chidx += q->subpacket[i].num_channels;
         av_log(avctx, AV_LOG_DEBUG, "subpacket[%i] %i %i\n",
-               i, q->subpacket[i].size * 8, bitstream_tell(&q->bc));
+               i, q->subpacket[i].size * 8, get_bits_count(&q->gb));
     }
 
     /* Discard the first two frames: no valid audio. */
@@ -1011,7 +1023,6 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
     return avctx->block_align;
 }
 
-#ifdef DEBUG
 static void dump_cook_context(COOKContext *q)
 {
     //int i=0;
@@ -1024,7 +1035,7 @@ static void dump_cook_context(COOKContext *q)
     }
     ff_dlog(q->avctx, "COOKContext\n");
     PRINT("nb_channels", q->avctx->channels);
-    PRINT("bit_rate", q->avctx->bit_rate);
+    PRINT("bit_rate", (int)q->avctx->bit_rate);
     PRINT("sample_rate", q->avctx->sample_rate);
     PRINT("samples_per_channel", q->subpacket[0].samples_per_channel);
     PRINT("subbands", q->subpacket[0].subbands);
@@ -1033,7 +1044,6 @@ static void dump_cook_context(COOKContext *q)
     PRINT("numvector_size", q->subpacket[0].numvector_size);
     PRINT("total_subbands", q->subpacket[0].total_subbands);
 }
-#endif
 
 /**
  * Cook initialization
@@ -1046,7 +1056,7 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
     GetByteContext gb;
     int s = 0;
     unsigned int channel_mask = 0;
-    int samples_per_frame;
+    int samples_per_frame = 0;
     int ret;
     q->avctx = avctx;
 
@@ -1078,6 +1088,10 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
         q->subpacket[s].subbands         = bytestream2_get_be16(&gb);
         bytestream2_get_be32(&gb);    // Unknown unused
         q->subpacket[s].js_subband_start = bytestream2_get_be16(&gb);
+        if (q->subpacket[s].js_subband_start >= 51) {
+            av_log(avctx, AV_LOG_ERROR, "js_subband_start %d is too large\n", q->subpacket[s].js_subband_start);
+            return AVERROR_INVALIDDATA;
+        }
         q->subpacket[s].js_vlc_bits      = bytestream2_get_be16(&gb);
 
         /* Initialize extradata related variables. */
@@ -1182,15 +1196,24 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
             avpriv_request_sample(avctx, "subbands > 50");
             return AVERROR_PATCHWELCOME;
         }
+        if (q->subpacket[s].subbands == 0) {
+            avpriv_request_sample(avctx, "subbands = 0");
+            return AVERROR_PATCHWELCOME;
+        }
         q->subpacket[s].gains1.now      = q->subpacket[s].gain_1;
         q->subpacket[s].gains1.previous = q->subpacket[s].gain_2;
         q->subpacket[s].gains2.now      = q->subpacket[s].gain_3;
         q->subpacket[s].gains2.previous = q->subpacket[s].gain_4;
 
+        if (q->num_subpackets + q->subpacket[s].num_channels > q->avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "Too many subpackets %d for channels %d\n", q->num_subpackets, q->avctx->channels);
+            return AVERROR_INVALIDDATA;
+        }
+
         q->num_subpackets++;
         s++;
-        if (s > MAX_SUBPACKETS) {
-            avpriv_request_sample(avctx, "subpackets > %d", MAX_SUBPACKETS);
+        if (s > FFMIN(MAX_SUBPACKETS, avctx->block_align)) {
+            avpriv_request_sample(avctx, "subpackets > %d", FFMIN(MAX_SUBPACKETS, avctx->block_align));
             return AVERROR_PATCHWELCOME;
         }
     }
@@ -1243,9 +1266,9 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
     else
         avctx->channel_layout = (avctx->channels == 2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
-#ifdef DEBUG
+
     dump_cook_context(q);
-#endif
+
     return 0;
 }
 
@@ -1259,6 +1282,7 @@ AVCodec ff_cook_decoder = {
     .close          = cook_decode_close,
     .decode         = cook_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
diff --git a/libavcodec/cook_parser.c b/libavcodec/cook_parser.c
index f140e90..6dbbfd8 100644
--- a/libavcodec/cook_parser.c
+++ b/libavcodec/cook_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,11 +40,12 @@ static int cook_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
 {
     CookParseContext *s = s1->priv_data;
 
-    if (s->duration)
-        s1->duration = s->duration;
-    else if (avctx->extradata && avctx->extradata_size >= 8 && avctx->channels)
+    if (!s->duration &&
+                avctx->extradata && avctx->extradata_size >= 8 && avctx->channels)
         s->duration = AV_RB16(avctx->extradata + 4) / avctx->channels;
 
+    s1->duration = s->duration;
+
     /* always return the full packet. this parser isn't doing any splitting or
        combining, only setting packet duration */
     *poutbuf      = buf;
diff --git a/libavcodec/cookdata.h b/libavcodec/cookdata.h
index 614b8de..efb8a53 100644
--- a/libavcodec/cookdata.h
+++ b/libavcodec/cookdata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Sascha Sommer
  * Copyright (c) 2005 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/copy_block.h b/libavcodec/copy_block.h
index 0b69622..393d455 100644
--- a/libavcodec/copy_block.h
+++ b/libavcodec/copy_block.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,16 @@
 
 #include "libavutil/intreadwrite.h"
 
+static inline void copy_block2(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        AV_COPY16U(dst, src);
+        dst += dstStride;
+        src += srcStride;
+    }
+}
+
 static inline void copy_block4(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)
 {
     int i;
diff --git a/libavcodec/cos_tablegen.c b/libavcodec/cos_tablegen.c
index 92b8295..7206aad 100644
--- a/libavcodec/cos_tablegen.c
+++ b/libavcodec/cos_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,9 @@
 #include <string.h>
 #include <math.h>
 
-#define BITS 16
+#include "libavutil/mathematics.h"
+
+#define BITS 17
 #define FLOATFMT "%.18e"
 #define FIXEDFMT "%6d"
 
diff --git a/libavcodec/cpia.c b/libavcodec/cpia.c
new file mode 100644
index 0000000..58833b2
--- /dev/null
+++ b/libavcodec/cpia.c
@@ -0,0 +1,233 @@
+/*
+ * CPiA video decoder.
+ * Copyright (c) 2010 Hans de Goede <hdegoede@redhat.com>
+ *
+ * This decoder is based on the LGPL code available at
+ * https://v4l4j.googlecode.com/svn/v4l4j/trunk/libvideo/libv4lconvert/cpia1.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+
+
+#define FRAME_HEADER_SIZE 64
+#define MAGIC_0         0x19    /**< First header byte */
+#define MAGIC_1         0x68    /**< Second header byte */
+#define SUBSAMPLE_420      0
+#define SUBSAMPLE_422      1
+#define YUVORDER_YUYV      0
+#define YUVORDER_UYVY      1
+#define NOT_COMPRESSED     0
+#define COMPRESSED         1
+#define NO_DECIMATION      0
+#define DECIMATION_ENAB    1
+#define EOL             0xfd    /**< End Of Line marker */
+#define EOI             0xff    /**< End Of Image marker */
+
+
+typedef struct {
+    AVFrame *frame;
+} CpiaContext;
+
+
+static int cpia_decode_frame(AVCodecContext *avctx,
+                             void *data, int *got_frame, AVPacket* avpkt)
+{
+    CpiaContext* const cpia = avctx->priv_data;
+    int i,j,ret;
+
+    uint8_t* const header = avpkt->data;
+    uint8_t* src;
+    int src_size;
+    uint16_t linelength;
+    uint8_t skip;
+
+    AVFrame *frame = cpia->frame;
+    uint8_t *y, *u, *v, *y_end, *u_end, *v_end;
+
+    // Check header
+    if ( avpkt->size < FRAME_HEADER_SIZE
+      || header[0] != MAGIC_0 || header[1] != MAGIC_1
+      || (header[17] != SUBSAMPLE_420 && header[17] != SUBSAMPLE_422)
+      || (header[18] != YUVORDER_YUYV && header[18] != YUVORDER_UYVY)
+      || (header[28] != NOT_COMPRESSED && header[28] != COMPRESSED)
+      || (header[29] != NO_DECIMATION && header[29] != DECIMATION_ENAB)
+    ) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid header!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // currently unsupported properties
+    if (header[17] == SUBSAMPLE_422) {
+        avpriv_report_missing_feature(avctx, "4:2:2 subsampling");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (header[18] == YUVORDER_UYVY) {
+        avpriv_report_missing_feature(avctx, "YUV byte order UYVY");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (header[29] == DECIMATION_ENAB) {
+        avpriv_report_missing_feature(avctx, "Decimation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    src = header + FRAME_HEADER_SIZE;
+    src_size = avpkt->size - FRAME_HEADER_SIZE;
+
+    if (header[28] == NOT_COMPRESSED) {
+        frame->pict_type = AV_PICTURE_TYPE_I;
+        frame->key_frame = 1;
+    } else {
+        frame->pict_type = AV_PICTURE_TYPE_P;
+        frame->key_frame = 0;
+    }
+
+    // Get buffer filled with previous frame
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
+        return ret;
+
+
+    for ( i = 0;
+          i < frame->height;
+          i++, src += linelength, src_size -= linelength
+    ) {
+        // Read line length, two byte little endian
+        linelength = AV_RL16(src);
+        src += 2;
+
+        if (src_size < linelength) {
+            frame->decode_error_flags = FF_DECODE_ERROR_INVALID_BITSTREAM;
+            av_log(avctx, AV_LOG_WARNING, "Frame ended unexpectedly!\n");
+            break;
+        }
+        if (src[linelength - 1] != EOL) {
+            frame->decode_error_flags = FF_DECODE_ERROR_INVALID_BITSTREAM;
+            av_log(avctx, AV_LOG_WARNING, "Wrong line length %d or line not terminated properly (found 0x%02x)!\n", linelength, src[linelength - 1]);
+            break;
+        }
+
+        /* Update the data pointers. Y data is on every line.
+         * U and V data on every second line
+         */
+        y = &frame->data[0][i * frame->linesize[0]];
+        u = &frame->data[1][(i >> 1) * frame->linesize[1]];
+        v = &frame->data[2][(i >> 1) * frame->linesize[2]];
+        y_end = y + frame->linesize[0] - 1;
+        u_end = u + frame->linesize[1] - 1;
+        v_end = v + frame->linesize[2] - 1;
+
+        if ((i & 1) && header[17] == SUBSAMPLE_420) {
+            /* We are on an odd line and 420 subsample is used.
+             * On this line only Y values are specified, one per pixel.
+             */
+            for (j = 0; j < linelength - 1; j++) {
+                if (y > y_end) {
+                    frame->decode_error_flags = FF_DECODE_ERROR_INVALID_BITSTREAM;
+                    av_log(avctx, AV_LOG_WARNING, "Decoded data exceeded linesize!\n");
+                    break;
+                }
+                if ((src[j] & 1) && header[28] == COMPRESSED) {
+                    /* It seems that odd lines are always uncompressed, but
+                     * we do it according to specification anyways.
+                     */
+                    skip = src[j] >> 1;
+                    y += skip;
+                } else {
+                    *(y++) = src[j];
+                }
+            }
+        } else if (header[17] == SUBSAMPLE_420) {
+            /* We are on an even line and 420 subsample is used.
+             * On this line each pair of pixels is described by four bytes.
+             */
+            for (j = 0; j < linelength - 4; ) {
+                if (y + 1 > y_end || u > u_end || v > v_end) {
+                    frame->decode_error_flags = FF_DECODE_ERROR_INVALID_BITSTREAM;
+                    av_log(avctx, AV_LOG_WARNING, "Decoded data exceeded linesize!\n");
+                    break;
+                }
+                if ((src[j] & 1) && header[28] == COMPRESSED) {
+                    // Skip amount of pixels and move forward one byte
+                    skip = src[j] >> 1;
+                    y += skip;
+                    u += skip >> 1;
+                    v += skip >> 1;
+                    j++;
+                } else {
+                    // Set image data as specified and move forward 4 bytes
+                    *(y++) = src[j];
+                    *(u++) = src[j+1];
+                    *(y++) = src[j+2];
+                    *(v++) = src[j+3];
+                    j += 4;
+                }
+            }
+        }
+    }
+
+    *got_frame = 1;
+    if ((ret = av_frame_ref(data, cpia->frame)) < 0)
+        return ret;
+
+    return avpkt->size;
+}
+
+static av_cold int cpia_decode_init(AVCodecContext *avctx)
+{
+    CpiaContext *s = avctx->priv_data;
+
+    // output pixel format
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+
+    /* The default timebase set by the v4l2 demuxer leads to probing which is buggy.
+     * Set some reasonable time_base to skip this.
+     */
+    if (avctx->time_base.num == 1 && avctx->time_base.den == 1000000) {
+        avctx->time_base.num = 1;
+        avctx->time_base.den = 60;
+    }
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int cpia_decode_end(AVCodecContext *avctx)
+{
+    CpiaContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
+AVCodec ff_cpia_decoder = {
+    .name           = "cpia",
+    .long_name      = NULL_IF_CONFIG_SMALL("CPiA video format"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_CPIA,
+    .priv_data_size = sizeof(CpiaContext),
+    .init           = cpia_decode_init,
+    .close          = cpia_decode_end,
+    .decode         = cpia_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/crystalhd.c b/libavcodec/crystalhd.c
new file mode 100644
index 0000000..e3c5955
--- /dev/null
+++ b/libavcodec/crystalhd.c
@@ -0,0 +1,816 @@
+/*
+ * - CrystalHD decoder module -
+ *
+ * Copyright(C) 2010,2011 Philip Langdale <ffmpeg.philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * - Principles of Operation -
+ *
+ * The CrystalHD decoder operates at the bitstream level - which is an even
+ * higher level than the decoding hardware you typically see in modern GPUs.
+ * This means it has a very simple interface, in principle. You feed demuxed
+ * packets in one end and get decoded picture (fields/frames) out the other.
+ *
+ * Of course, nothing is ever that simple. Due, at the very least, to b-frame
+ * dependencies in the supported formats, the hardware has a delay between
+ * when a packet goes in, and when a picture comes out. Furthermore, this delay
+ * is not just a function of time, but also one of the dependency on additional
+ * frames being fed into the decoder to satisfy the b-frame dependencies.
+ *
+ * As such, the hardware can only be used effectively with a decode API that
+ * doesn't assume a 1:1 relationship between input packets and output frames.
+ * The new avcodec decode API is such an API (an m:n API) while the old one is
+ * 1:1. Consequently, we no longer support the old API, which allows us to avoid
+ * the vicious hacks that are required to approximate 1:1 operation.
+ */
+
+/*****************************************************************************
+ * Includes
+ ****************************************************************************/
+
+#define _XOPEN_SOURCE 600
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <libcrystalhd/bc_dts_types.h>
+#include <libcrystalhd/bc_dts_defs.h>
+#include <libcrystalhd/libcrystalhd_if.h>
+
+#include "avcodec.h"
+#include "decode.h"
+#include "internal.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+/** Timeout parameter passed to DtsProcOutput() in us */
+#define OUTPUT_PROC_TIMEOUT 50
+/** Step between fake timestamps passed to hardware in units of 100ns */
+#define TIMESTAMP_UNIT 100000
+
+
+/*****************************************************************************
+ * Module private data
+ ****************************************************************************/
+
+typedef enum {
+    RET_ERROR           = -1,
+    RET_OK              = 0,
+    RET_COPY_AGAIN      = 1,
+} CopyRet;
+
+typedef struct OpaqueList {
+    struct OpaqueList *next;
+    uint64_t fake_timestamp;
+    uint64_t reordered_opaque;
+} OpaqueList;
+
+typedef struct {
+    AVClass *av_class;
+    AVCodecContext *avctx;
+    HANDLE dev;
+
+    uint8_t is_70012;
+    uint8_t need_second_field;
+    uint8_t draining;
+
+    OpaqueList *head;
+    OpaqueList *tail;
+
+    /* Options */
+    uint32_t sWidth;
+} CHDContext;
+
+static const AVOption options[] = {
+    { "crystalhd_downscale_width",
+      "Turn on downscaling to the specified width",
+      offsetof(CHDContext, sWidth),
+      AV_OPT_TYPE_INT, {.i64 = 0}, 0, UINT32_MAX,
+      AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM, },
+    { NULL, },
+};
+
+
+/*****************************************************************************
+ * Helper functions
+ ****************************************************************************/
+
+static inline BC_MEDIA_SUBTYPE id2subtype(CHDContext *priv, enum AVCodecID id)
+{
+    switch (id) {
+    case AV_CODEC_ID_MPEG4:
+        return BC_MSUBTYPE_DIVX;
+    case AV_CODEC_ID_MSMPEG4V3:
+        return BC_MSUBTYPE_DIVX311;
+    case AV_CODEC_ID_MPEG2VIDEO:
+        return BC_MSUBTYPE_MPEG2VIDEO;
+    case AV_CODEC_ID_VC1:
+        return BC_MSUBTYPE_VC1;
+    case AV_CODEC_ID_WMV3:
+        return BC_MSUBTYPE_WMV3;
+    case AV_CODEC_ID_H264:
+        return BC_MSUBTYPE_H264;
+    default:
+        return BC_MSUBTYPE_INVALID;
+    }
+}
+
+static inline void print_frame_info(CHDContext *priv, BC_DTS_PROC_OUT *output)
+{
+    av_log(priv->avctx, AV_LOG_TRACE, "\tYBuffSz: %u\n", output->YbuffSz);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tYBuffDoneSz: %u\n",
+           output->YBuffDoneSz);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tUVBuffDoneSz: %u\n",
+           output->UVBuffDoneSz);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tTimestamp: %"PRIu64"\n",
+           output->PicInfo.timeStamp);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tPicture Number: %u\n",
+           output->PicInfo.picture_number);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tWidth: %u\n",
+           output->PicInfo.width);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tHeight: %u\n",
+           output->PicInfo.height);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tChroma: 0x%03x\n",
+           output->PicInfo.chroma_format);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tPulldown: %u\n",
+           output->PicInfo.pulldown);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tFlags: 0x%08x\n",
+           output->PicInfo.flags);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tFrame Rate/Res: %u\n",
+           output->PicInfo.frame_rate);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tAspect Ratio: %u\n",
+           output->PicInfo.aspect_ratio);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tColor Primaries: %u\n",
+           output->PicInfo.colour_primaries);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tMetaData: %u\n",
+           output->PicInfo.picture_meta_payload);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tSession Number: %u\n",
+           output->PicInfo.sess_num);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tycom: %u\n",
+           output->PicInfo.ycom);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tCustom Aspect: %u\n",
+           output->PicInfo.custom_aspect_ratio_width_height);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tFrames to Drop: %u\n",
+           output->PicInfo.n_drop);
+    av_log(priv->avctx, AV_LOG_TRACE, "\tH264 Valid Fields: 0x%08x\n",
+           output->PicInfo.other.h264.valid);
+}
+
+
+/*****************************************************************************
+ * OpaqueList functions
+ ****************************************************************************/
+
+static uint64_t opaque_list_push(CHDContext *priv, uint64_t reordered_opaque)
+{
+    OpaqueList *newNode = av_mallocz(sizeof (OpaqueList));
+    if (!newNode) {
+        av_log(priv->avctx, AV_LOG_ERROR,
+               "Unable to allocate new node in OpaqueList.\n");
+        return 0;
+    }
+    if (!priv->head) {
+        newNode->fake_timestamp = TIMESTAMP_UNIT;
+        priv->head              = newNode;
+    } else {
+        newNode->fake_timestamp = priv->tail->fake_timestamp + TIMESTAMP_UNIT;
+        priv->tail->next        = newNode;
+    }
+    priv->tail = newNode;
+    newNode->reordered_opaque = reordered_opaque;
+
+    return newNode->fake_timestamp;
+}
+
+/*
+ * The OpaqueList is built in decode order, while elements will be removed
+ * in presentation order. If frames are reordered, this means we must be
+ * able to remove elements that are not the first element.
+ *
+ * Returned node must be freed by caller.
+ */
+static OpaqueList *opaque_list_pop(CHDContext *priv, uint64_t fake_timestamp)
+{
+    OpaqueList *node = priv->head;
+
+    if (!priv->head) {
+        av_log(priv->avctx, AV_LOG_ERROR,
+               "CrystalHD: Attempted to query non-existent timestamps.\n");
+        return NULL;
+    }
+
+    /*
+     * The first element is special-cased because we have to manipulate
+     * the head pointer rather than the previous element in the list.
+     */
+    if (priv->head->fake_timestamp == fake_timestamp) {
+        priv->head = node->next;
+
+        if (!priv->head->next)
+            priv->tail = priv->head;
+
+        node->next = NULL;
+        return node;
+    }
+
+    /*
+     * The list is processed at arm's length so that we have the
+     * previous element available to rewrite its next pointer.
+     */
+    while (node->next) {
+        OpaqueList *current = node->next;
+        if (current->fake_timestamp == fake_timestamp) {
+            node->next = current->next;
+
+            if (!node->next)
+               priv->tail = node;
+
+            current->next = NULL;
+            return current;
+        } else {
+            node = current;
+        }
+    }
+
+    av_log(priv->avctx, AV_LOG_VERBOSE,
+           "CrystalHD: Couldn't match fake_timestamp.\n");
+    return NULL;
+}
+
+
+/*****************************************************************************
+ * Video decoder API function definitions
+ ****************************************************************************/
+
+static void flush(AVCodecContext *avctx)
+{
+    CHDContext *priv = avctx->priv_data;
+
+    priv->need_second_field = 0;
+    priv->draining          = 0;
+
+    /* Flush mode 4 flushes all software and hardware buffers. */
+    DtsFlushInput(priv->dev, 4);
+}
+
+
+static av_cold int uninit(AVCodecContext *avctx)
+{
+    CHDContext *priv = avctx->priv_data;
+    HANDLE device;
+
+    device = priv->dev;
+    DtsStopDecoder(device);
+    DtsCloseDecoder(device);
+    DtsDeviceClose(device);
+
+    if (priv->head) {
+       OpaqueList *node = priv->head;
+       while (node) {
+          OpaqueList *next = node->next;
+          av_free(node);
+          node = next;
+       }
+    }
+
+    return 0;
+}
+
+static av_cold int init(AVCodecContext *avctx)
+{
+    CHDContext* priv;
+    BC_STATUS ret;
+    BC_INFO_CRYSTAL version;
+    BC_INPUT_FORMAT format = {
+        .FGTEnable   = FALSE,
+        .Progressive = TRUE,
+        .OptFlags    = 0x80000000 | vdecFrameRate59_94 | 0x40,
+        .width       = avctx->width,
+        .height      = avctx->height,
+    };
+
+    BC_MEDIA_SUBTYPE subtype;
+
+    uint32_t mode = DTS_PLAYBACK_MODE |
+                    DTS_LOAD_FILE_PLAY_FW |
+                    DTS_SKIP_TX_CHK_CPB |
+                    DTS_PLAYBACK_DROP_RPT_MODE |
+                    DTS_SINGLE_THREADED_MODE |
+                    DTS_DFLT_RESOLUTION(vdecRESOLUTION_1080p23_976);
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD Init for %s\n",
+           avctx->codec->name);
+
+    avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+
+    /* Initialize the library */
+    priv               = avctx->priv_data;
+    priv->avctx        = avctx;
+    priv->draining     = 0;
+
+    subtype = id2subtype(priv, avctx->codec->id);
+    switch (subtype) {
+    case BC_MSUBTYPE_H264:
+        format.startCodeSz = 4;
+        // Fall-through
+    case BC_MSUBTYPE_VC1:
+    case BC_MSUBTYPE_WVC1:
+    case BC_MSUBTYPE_WMV3:
+    case BC_MSUBTYPE_WMVA:
+    case BC_MSUBTYPE_MPEG2VIDEO:
+    case BC_MSUBTYPE_DIVX:
+    case BC_MSUBTYPE_DIVX311:
+        format.pMetaData  = avctx->extradata;
+        format.metaDataSz = avctx->extradata_size;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: Unknown codec name\n");
+        return AVERROR(EINVAL);
+    }
+    format.mSubtype = subtype;
+
+    if (priv->sWidth) {
+        format.bEnableScaling = 1;
+        format.ScalingParams.sWidth = priv->sWidth;
+    }
+
+    /* Get a decoder instance */
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: starting up\n");
+    // Initialize the Link and Decoder devices
+    ret = DtsDeviceOpen(&priv->dev, mode);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: DtsDeviceOpen failed\n");
+        goto fail;
+    }
+
+    ret = DtsCrystalHDVersion(priv->dev, &version);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "CrystalHD: DtsCrystalHDVersion failed\n");
+        goto fail;
+    }
+    priv->is_70012 = version.device == 0;
+
+    if (priv->is_70012 &&
+        (subtype == BC_MSUBTYPE_DIVX || subtype == BC_MSUBTYPE_DIVX311)) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "CrystalHD: BCM70012 doesn't support MPEG4-ASP/DivX/Xvid\n");
+        goto fail;
+    }
+
+    ret = DtsSetInputFormat(priv->dev, &format);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: SetInputFormat failed\n");
+        goto fail;
+    }
+
+    ret = DtsOpenDecoder(priv->dev, BC_STREAM_TYPE_ES);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsOpenDecoder failed\n");
+        goto fail;
+    }
+
+    ret = DtsSetColorSpace(priv->dev, OUTPUT_MODE422_YUY2);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsSetColorSpace failed\n");
+        goto fail;
+    }
+    ret = DtsStartDecoder(priv->dev);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsStartDecoder failed\n");
+        goto fail;
+    }
+    ret = DtsStartCapture(priv->dev);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsStartCapture failed\n");
+        goto fail;
+    }
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Init complete.\n");
+
+    return 0;
+
+ fail:
+    uninit(avctx);
+    return -1;
+}
+
+
+static inline CopyRet copy_frame(AVCodecContext *avctx,
+                                 BC_DTS_PROC_OUT *output,
+                                 AVFrame *frame, int *got_frame)
+{
+    BC_STATUS ret;
+    BC_DTS_STATUS decoder_status = { 0, };
+    uint8_t interlaced;
+
+    CHDContext *priv = avctx->priv_data;
+    int64_t pkt_pts  = AV_NOPTS_VALUE;
+
+    uint8_t bottom_field = (output->PicInfo.flags & VDEC_FLAG_BOTTOMFIELD) ==
+                           VDEC_FLAG_BOTTOMFIELD;
+    uint8_t bottom_first = !!(output->PicInfo.flags & VDEC_FLAG_BOTTOM_FIRST);
+
+    int width    = output->PicInfo.width;
+    int height   = output->PicInfo.height;
+    int bwidth;
+    uint8_t *src = output->Ybuff;
+    int sStride;
+    uint8_t *dst;
+    int dStride;
+
+    if (output->PicInfo.timeStamp != 0) {
+        OpaqueList *node = opaque_list_pop(priv, output->PicInfo.timeStamp);
+        if (node) {
+            pkt_pts = node->reordered_opaque;
+            av_free(node);
+        } else {
+            /*
+             * We will encounter a situation where a timestamp cannot be
+             * popped if a second field is being returned. In this case,
+             * each field has the same timestamp and the first one will
+             * cause it to be popped. We'll avoid overwriting the valid
+             * timestamp below.
+             */
+        }
+        av_log(avctx, AV_LOG_VERBOSE, "output \"pts\": %"PRIu64"\n",
+               output->PicInfo.timeStamp);
+    }
+
+    ret = DtsGetDriverStatus(priv->dev, &decoder_status);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR,
+               "CrystalHD: GetDriverStatus failed: %u\n", ret);
+       return RET_ERROR;
+    }
+
+    interlaced = output->PicInfo.flags & VDEC_FLAG_INTERLACED_SRC;
+
+    av_log(avctx, AV_LOG_VERBOSE, "Interlaced state: %d\n",
+           interlaced);
+
+    priv->need_second_field = interlaced && !priv->need_second_field;
+
+    if (!frame->data[0]) {
+        if (ff_get_buffer(avctx, frame, 0) < 0)
+            return RET_ERROR;
+    }
+
+    bwidth = av_image_get_linesize(avctx->pix_fmt, width, 0);
+    if (bwidth < 0)
+       return RET_ERROR;
+
+    if (priv->is_70012) {
+        int pStride;
+
+        if (width <= 720)
+            pStride = 720;
+        else if (width <= 1280)
+            pStride = 1280;
+        else pStride = 1920;
+        sStride = av_image_get_linesize(avctx->pix_fmt, pStride, 0);
+        if (sStride < 0)
+            return RET_ERROR;
+    } else {
+        sStride = bwidth;
+    }
+
+    dStride = frame->linesize[0];
+    dst     = frame->data[0];
+
+    av_log(priv->avctx, AV_LOG_VERBOSE, "CrystalHD: Copying out frame\n");
+
+    /*
+     * The hardware doesn't return the first sample of a picture.
+     * Ignoring why it behaves this way, it's better to copy the sample from
+     * the second line, rather than the next sample across because the chroma
+     * values should be correct (assuming the decoded video was 4:2:0, which
+     * it was).
+     */
+    *((uint32_t *)src) = *((uint32_t *)(src + sStride));
+
+    if (interlaced) {
+        int dY = 0;
+        int sY = 0;
+
+        height /= 2;
+        if (bottom_field) {
+            av_log(priv->avctx, AV_LOG_VERBOSE, "Interlaced: bottom field\n");
+            dY = 1;
+        } else {
+            av_log(priv->avctx, AV_LOG_VERBOSE, "Interlaced: top field\n");
+            dY = 0;
+        }
+
+        for (sY = 0; sY < height; dY++, sY++) {
+            memcpy(&(dst[dY * dStride]), &(src[sY * sStride]), bwidth);
+            dY++;
+        }
+    } else {
+        av_image_copy_plane(dst, dStride, src, sStride, bwidth, height);
+    }
+
+    frame->interlaced_frame = interlaced;
+    if (interlaced)
+        frame->top_field_first = !bottom_first;
+
+    frame->pts = pkt_pts;
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+    frame->pkt_pts = pkt_pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    frame->pkt_pos = -1;
+    frame->pkt_duration = 0;
+    frame->pkt_size = -1;
+
+    if (!priv->need_second_field) {
+        *got_frame       = 1;
+    } else {
+        return RET_COPY_AGAIN;
+    }
+
+    return RET_OK;
+}
+
+
+static inline CopyRet receive_frame(AVCodecContext *avctx,
+                                    AVFrame *frame, int *got_frame)
+{
+    BC_STATUS ret;
+    BC_DTS_PROC_OUT output = {
+        .PicInfo.width  = avctx->width,
+        .PicInfo.height = avctx->height,
+    };
+    CHDContext *priv = avctx->priv_data;
+    HANDLE dev       = priv->dev;
+
+    *got_frame = 0;
+
+    // Request decoded data from the driver
+    ret = DtsProcOutputNoCopy(dev, OUTPUT_PROC_TIMEOUT, &output);
+    if (ret == BC_STS_FMT_CHANGE) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Initial format change\n");
+        avctx->width  = output.PicInfo.width;
+        avctx->height = output.PicInfo.height;
+        switch ( output.PicInfo.aspect_ratio ) {
+        case vdecAspectRatioSquare:
+            avctx->sample_aspect_ratio = (AVRational) {  1,  1};
+            break;
+        case vdecAspectRatio12_11:
+            avctx->sample_aspect_ratio = (AVRational) { 12, 11};
+            break;
+        case vdecAspectRatio10_11:
+            avctx->sample_aspect_ratio = (AVRational) { 10, 11};
+            break;
+        case vdecAspectRatio16_11:
+            avctx->sample_aspect_ratio = (AVRational) { 16, 11};
+            break;
+        case vdecAspectRatio40_33:
+            avctx->sample_aspect_ratio = (AVRational) { 40, 33};
+            break;
+        case vdecAspectRatio24_11:
+            avctx->sample_aspect_ratio = (AVRational) { 24, 11};
+            break;
+        case vdecAspectRatio20_11:
+            avctx->sample_aspect_ratio = (AVRational) { 20, 11};
+            break;
+        case vdecAspectRatio32_11:
+            avctx->sample_aspect_ratio = (AVRational) { 32, 11};
+            break;
+        case vdecAspectRatio80_33:
+            avctx->sample_aspect_ratio = (AVRational) { 80, 33};
+            break;
+        case vdecAspectRatio18_11:
+            avctx->sample_aspect_ratio = (AVRational) { 18, 11};
+            break;
+        case vdecAspectRatio15_11:
+            avctx->sample_aspect_ratio = (AVRational) { 15, 11};
+            break;
+        case vdecAspectRatio64_33:
+            avctx->sample_aspect_ratio = (AVRational) { 64, 33};
+            break;
+        case vdecAspectRatio160_99:
+            avctx->sample_aspect_ratio = (AVRational) {160, 99};
+            break;
+        case vdecAspectRatio4_3:
+            avctx->sample_aspect_ratio = (AVRational) {  4,  3};
+            break;
+        case vdecAspectRatio16_9:
+            avctx->sample_aspect_ratio = (AVRational) { 16,  9};
+            break;
+        case vdecAspectRatio221_1:
+            avctx->sample_aspect_ratio = (AVRational) {221,  1};
+            break;
+        }
+        return RET_COPY_AGAIN;
+    } else if (ret == BC_STS_SUCCESS) {
+        int copy_ret = -1;
+        if (output.PoutFlags & BC_POUT_FLAGS_PIB_VALID) {
+            print_frame_info(priv, &output);
+
+            copy_ret = copy_frame(avctx, &output, frame, got_frame);
+        } else {
+            /*
+             * An invalid frame has been consumed.
+             */
+            av_log(avctx, AV_LOG_ERROR, "CrystalHD: ProcOutput succeeded with "
+                                        "invalid PIB\n");
+            copy_ret = RET_COPY_AGAIN;
+        }
+        DtsReleaseOutputBuffs(dev, NULL, FALSE);
+
+        return copy_ret;
+    } else if (ret == BC_STS_BUSY) {
+        return RET_COPY_AGAIN;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: ProcOutput failed %d\n", ret);
+        return RET_ERROR;
+    }
+}
+
+static int crystalhd_decode_packet(AVCodecContext *avctx, const AVPacket *avpkt)
+{
+    BC_STATUS bc_ret;
+    CHDContext *priv   = avctx->priv_data;
+    HANDLE dev         = priv->dev;
+    AVPacket filtered_packet = { 0 };
+    int ret = 0;
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: decode_packet\n");
+
+    if (avpkt && avpkt->size) {
+        uint64_t pts;
+
+        /*
+         * Despite being notionally opaque, either libcrystalhd or
+         * the hardware itself will mangle pts values that are too
+         * small or too large. The docs claim it should be in units
+         * of 100ns. Given that we're nominally dealing with a black
+         * box on both sides, any transform we do has no guarantee of
+         * avoiding mangling so we need to build a mapping to values
+         * we know will not be mangled.
+         */
+        pts = opaque_list_push(priv, avpkt->pts);
+        if (!pts) {
+            ret = AVERROR(ENOMEM);
+            goto exit;
+        }
+        av_log(priv->avctx, AV_LOG_VERBOSE,
+               "input \"pts\": %"PRIu64"\n", pts);
+        bc_ret = DtsProcInput(dev, avpkt->data, avpkt->size, pts, 0);
+        if (bc_ret == BC_STS_BUSY) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "CrystalHD: ProcInput returned busy\n");
+            ret = AVERROR(EAGAIN);
+            goto exit;
+        } else if (bc_ret != BC_STS_SUCCESS) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "CrystalHD: ProcInput failed: %u\n", ret);
+            ret = -1;
+            goto exit;
+        }
+    } else {
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: No more input data\n");
+        priv->draining = 1;
+        ret = AVERROR_EOF;
+        goto exit;
+    }
+ exit:
+    av_packet_unref(&filtered_packet);
+    return ret;
+}
+
+static int crystalhd_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    BC_STATUS bc_ret;
+    BC_DTS_STATUS decoder_status = { 0, };
+    CopyRet rec_ret;
+    CHDContext *priv   = avctx->priv_data;
+    HANDLE dev         = priv->dev;
+    int got_frame = 0;
+    int ret = 0;
+    AVPacket pkt = {0};
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: receive_frame\n");
+
+    ret = ff_decode_get_packet(avctx, &pkt);
+    if (ret < 0 && ret != AVERROR_EOF) {
+        return ret;
+    }
+
+    while (pkt.size > DtsTxFreeSize(dev)) {
+        /*
+         * Block until there is space in the buffer for the next packet.
+         * We assume that the hardware will make forward progress at this
+         * point, although in pathological cases that may not happen.
+         */
+        av_log(avctx, AV_LOG_TRACE, "CrystalHD: Waiting for space in input buffer\n");
+    }
+
+    ret = crystalhd_decode_packet(avctx, &pkt);
+    av_packet_unref(&pkt);
+    // crystalhd_is_buffer_full() should avoid this.
+    if (ret == AVERROR(EAGAIN)) {
+        ret = AVERROR_EXTERNAL;
+    }
+    if (ret < 0 && ret != AVERROR_EOF) {
+        return ret;
+    }
+
+    do {
+        bc_ret = DtsGetDriverStatus(dev, &decoder_status);
+        if (bc_ret != BC_STS_SUCCESS) {
+            av_log(avctx, AV_LOG_ERROR, "CrystalHD: GetDriverStatus failed\n");
+            return -1;
+        }
+
+        if (decoder_status.ReadyListCount == 0) {
+            av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Insufficient frames ready. Returning\n");
+            got_frame = 0;
+            rec_ret = RET_OK;
+            break;
+        }
+
+        rec_ret = receive_frame(avctx, frame, &got_frame);
+    } while (rec_ret == RET_COPY_AGAIN);
+
+    if (rec_ret == RET_ERROR) {
+        return -1;
+    } else if (got_frame == 0) {
+        return priv->draining ? AVERROR_EOF : AVERROR(EAGAIN);
+    } else {
+        return 0;
+    }
+}
+
+#define DEFINE_CRYSTALHD_DECODER(x, X, bsf_name) \
+    static const AVClass x##_crystalhd_class = { \
+        .class_name = #x "_crystalhd", \
+        .item_name = av_default_item_name, \
+        .option = options, \
+        .version = LIBAVUTIL_VERSION_INT, \
+    }; \
+    AVCodec ff_##x##_crystalhd_decoder = { \
+        .name           = #x "_crystalhd", \
+        .long_name      = NULL_IF_CONFIG_SMALL("CrystalHD " #X " decoder"), \
+        .type           = AVMEDIA_TYPE_VIDEO, \
+        .id             = AV_CODEC_ID_##X, \
+        .priv_data_size = sizeof(CHDContext), \
+        .priv_class     = &x##_crystalhd_class, \
+        .init           = init, \
+        .close          = uninit, \
+        .receive_frame  = crystalhd_receive_frame, \
+        .flush          = flush, \
+        .bsfs           = bsf_name, \
+        .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HARDWARE, \
+        .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE}, \
+        .wrapper_name   = "crystalhd", \
+    };
+
+#if CONFIG_H264_CRYSTALHD_DECODER
+DEFINE_CRYSTALHD_DECODER(h264, H264, "h264_mp4toannexb")
+#endif
+
+#if CONFIG_MPEG2_CRYSTALHD_DECODER
+DEFINE_CRYSTALHD_DECODER(mpeg2, MPEG2VIDEO, NULL)
+#endif
+
+#if CONFIG_MPEG4_CRYSTALHD_DECODER
+DEFINE_CRYSTALHD_DECODER(mpeg4, MPEG4, "mpeg4_unpack_bframes")
+#endif
+
+#if CONFIG_MSMPEG4_CRYSTALHD_DECODER
+DEFINE_CRYSTALHD_DECODER(msmpeg4, MSMPEG4V3, NULL)
+#endif
+
+#if CONFIG_VC1_CRYSTALHD_DECODER
+DEFINE_CRYSTALHD_DECODER(vc1, VC1, NULL)
+#endif
+
+#if CONFIG_WMV3_CRYSTALHD_DECODER
+DEFINE_CRYSTALHD_DECODER(wmv3, WMV3, NULL)
+#endif
diff --git a/libavcodec/cscd.c b/libavcodec/cscd.c
index 0cb375b..8781df1 100644
--- a/libavcodec/cscd.c
+++ b/libavcodec/cscd.c
@@ -2,20 +2,20 @@
  * CamStudio decoder
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <stdio.h>
@@ -31,14 +31,16 @@
 #include "libavutil/lzo.h"
 
 typedef struct CamStudioContext {
+    AVFrame *pic;
     int linelen, height, bpp;
     unsigned int decomp_size;
     unsigned char* decomp_buf;
 } CamStudioContext;
 
-static void copy_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
-                               int linelen, int height) {
-    int i;
+static void copy_frame_default(AVFrame *f, const uint8_t *src,
+                               int linelen, int height)
+{
+    int i, src_stride = FFALIGN(linelen, 4);
     uint8_t *dst = f->data[0];
     dst += (height - 1) * f->linesize[0];
     for (i = height; i; i--) {
@@ -48,9 +50,10 @@ static void copy_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
     }
 }
 
-static void add_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
-                              int linelen, int height) {
-    int i, j;
+static void add_frame_default(AVFrame *f, const uint8_t *src,
+                              int linelen, int height)
+{
+    int i, j, src_stride = FFALIGN(linelen, 4);
     uint8_t *dst = f->data[0];
     dst += (height - 1) * f->linesize[0];
     for (i = height; i; i--) {
@@ -61,87 +64,12 @@ static void add_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
     }
 }
 
-#if !HAVE_BIGENDIAN
-#define copy_frame_16(f, s, l, h) copy_frame_default(f, s, l, l, h)
-#define copy_frame_32(f, s, l, h) copy_frame_default(f, s, l, l, h)
-#define add_frame_16(f, s, l, h) add_frame_default(f, s, l, l, h)
-#define add_frame_32(f, s, l, h) add_frame_default(f, s, l, l, h)
-#else
-static void copy_frame_16(AVFrame *f, const uint8_t *src,
-                          int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 2; j; j--) {
-          dst[0] = src[1];
-          dst[1] = src[0];
-          src += 2;
-          dst += 2;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void copy_frame_32(AVFrame *f, const uint8_t *src,
-                          int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 4; j; j--) {
-          dst[0] = src[3];
-          dst[1] = src[2];
-          dst[2] = src[1];
-          dst[3] = src[0];
-          src += 4;
-          dst += 4;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void add_frame_16(AVFrame *f, const uint8_t *src,
-                         int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 2; j; j--) {
-          dst[0] += src[1];
-          dst[1] += src[0];
-          src += 2;
-          dst += 2;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void add_frame_32(AVFrame *f, const uint8_t *src,
-                         int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 4; j; j--) {
-          dst[0] += src[3];
-          dst[1] += src[2];
-          dst[2] += src[1];
-          dst[3] += src[0];
-          src += 4;
-          dst += 4;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-#endif
-
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                        AVPacket *avpkt) {
+                        AVPacket *avpkt)
+{
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     CamStudioContext *c = avctx->priv_data;
-    AVFrame *picture = data;
     int ret;
 
     if (buf_size < 2) {
@@ -149,101 +77,92 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
 
     // decompress data
     switch ((buf[0] >> 1) & 7) {
-        case 0: { // lzo compression
-            int outlen = c->decomp_size, inlen = buf_size - 2;
-            if (av_lzo1x_decode(c->decomp_buf, &outlen, &buf[2], &inlen))
-                av_log(avctx, AV_LOG_ERROR, "error during lzo decompression\n");
-            break;
+    case 0: { // lzo compression
+        int outlen = c->decomp_size, inlen = buf_size - 2;
+        if (av_lzo1x_decode(c->decomp_buf, &outlen, &buf[2], &inlen) || outlen) {
+            av_log(avctx, AV_LOG_ERROR, "error during lzo decompression\n");
+            return AVERROR_INVALIDDATA;
         }
-        case 1: { // zlib compression
+        break;
+    }
+    case 1: { // zlib compression
 #if CONFIG_ZLIB
-            unsigned long dlen = c->decomp_size;
-            if (uncompress(c->decomp_buf, &dlen, &buf[2], buf_size - 2) != Z_OK)
-                av_log(avctx, AV_LOG_ERROR, "error during zlib decompression\n");
-            break;
+        unsigned long dlen = c->decomp_size;
+        if (uncompress(c->decomp_buf, &dlen, &buf[2], buf_size - 2) != Z_OK) {
+            av_log(avctx, AV_LOG_ERROR, "error during zlib decompression\n");
+            return AVERROR_INVALIDDATA;
+        }
+        break;
 #else
-            av_log(avctx, AV_LOG_ERROR, "compiled without zlib support\n");
-            return AVERROR(ENOSYS);
+        av_log(avctx, AV_LOG_ERROR, "compiled without zlib support\n");
+        return AVERROR(ENOSYS);
 #endif
-        }
-        default:
-            av_log(avctx, AV_LOG_ERROR, "unknown compression\n");
-            return AVERROR_INVALIDDATA;
+    }
+    default:
+        av_log(avctx, AV_LOG_ERROR, "unknown compression\n");
+        return AVERROR_INVALIDDATA;
     }
 
     // flip upside down, add difference frame
     if (buf[0] & 1) { // keyframe
-        picture->pict_type = AV_PICTURE_TYPE_I;
-        picture->key_frame = 1;
-        switch (c->bpp) {
-          case 16:
-              copy_frame_16(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          case 32:
-              copy_frame_32(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          default:
-              copy_frame_default(picture, c->decomp_buf, FFALIGN(c->linelen, 4),
+        c->pic->pict_type = AV_PICTURE_TYPE_I;
+        c->pic->key_frame = 1;
+              copy_frame_default(c->pic, c->decomp_buf,
                                  c->linelen, c->height);
-        }
     } else {
-        picture->pict_type = AV_PICTURE_TYPE_P;
-        picture->key_frame = 0;
-        switch (c->bpp) {
-          case 16:
-              add_frame_16(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          case 32:
-              add_frame_32(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          default:
-              add_frame_default(picture, c->decomp_buf, FFALIGN(c->linelen, 4),
+        c->pic->pict_type = AV_PICTURE_TYPE_P;
+        c->pic->key_frame = 0;
+              add_frame_default(c->pic, c->decomp_buf,
                                 c->linelen, c->height);
-        }
     }
 
     *got_frame = 1;
+    if ((ret = av_frame_ref(data, c->pic)) < 0)
+        return ret;
+
     return buf_size;
 }
 
-static av_cold int decode_init(AVCodecContext *avctx) {
+static av_cold int decode_init(AVCodecContext *avctx)
+{
     CamStudioContext *c = avctx->priv_data;
     int stride;
     switch (avctx->bits_per_coded_sample) {
-        case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
-        case 24: avctx->pix_fmt = AV_PIX_FMT_BGR24; break;
-        case 32: avctx->pix_fmt = AV_PIX_FMT_RGB32; break;
-        default:
-            av_log(avctx, AV_LOG_ERROR,
-                   "CamStudio codec error: invalid depth %i bpp\n",
-                   avctx->bits_per_coded_sample);
-            return AVERROR_INVALIDDATA;
+    case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555LE; break;
+    case 24: avctx->pix_fmt = AV_PIX_FMT_BGR24; break;
+    case 32: avctx->pix_fmt = AV_PIX_FMT_BGR0; break;
+    default:
+        av_log(avctx, AV_LOG_ERROR,
+               "CamStudio codec error: invalid depth %i bpp\n",
+               avctx->bits_per_coded_sample);
+        return AVERROR_INVALIDDATA;
     }
     c->bpp = avctx->bits_per_coded_sample;
     c->linelen = avctx->width * avctx->bits_per_coded_sample / 8;
     c->height = avctx->height;
-    stride = c->linelen;
-    if (avctx->bits_per_coded_sample == 24)
-        stride = FFALIGN(stride, 4);
+    stride = FFALIGN(c->linelen, 4);
     c->decomp_size = c->height * stride;
     c->decomp_buf = av_malloc(c->decomp_size + AV_LZO_OUTPUT_PADDING);
     if (!c->decomp_buf) {
         av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
         return AVERROR(ENOMEM);
     }
+    c->pic = av_frame_alloc();
+    if (!c->pic)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
-static av_cold int decode_end(AVCodecContext *avctx) {
+static av_cold int decode_end(AVCodecContext *avctx)
+{
     CamStudioContext *c = avctx->priv_data;
     av_freep(&c->decomp_buf);
+    av_frame_free(&c->pic);
     return 0;
 }
 
@@ -256,5 +175,6 @@ AVCodec ff_cscd_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/cuvid.c b/libavcodec/cuvid.c
deleted file mode 100644
index 2d35e92..0000000
--- a/libavcodec/cuvid.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * HW decode acceleration through CUVID
- *
- * Copyright (c) 2016 Anton Khirnov
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <cuda.h>
-#include <cuviddec.h>
-
-#include "config.h"
-
-#include "libavutil/common.h"
-#include "libavutil/error.h"
-#include "libavutil/hwcontext.h"
-#include "libavutil/hwcontext_cuda.h"
-#include "libavutil/pixdesc.h"
-#include "libavutil/pixfmt.h"
-
-#include "avcodec.h"
-#include "decode.h"
-#include "cuvid.h"
-#include "internal.h"
-
-typedef struct CUVIDDecoder {
-    CUvideodecoder decoder;
-
-    AVBufferRef *hw_device_ref;
-    CUcontext    cuda_ctx;
-} CUVIDDecoder;
-
-typedef struct CUVIDFramePool {
-    unsigned int dpb_size;
-    unsigned int nb_allocated;
-} CUVIDFramePool;
-
-static int map_avcodec_id(enum AVCodecID id)
-{
-    switch (id) {
-    case AV_CODEC_ID_H264: return cudaVideoCodec_H264;
-    case AV_CODEC_ID_HEVC: return cudaVideoCodec_HEVC;
-    }
-    return -1;
-}
-
-static int map_chroma_format(enum AVPixelFormat pix_fmt)
-{
-    int shift_h = 0, shift_v = 0;
-
-    av_pix_fmt_get_chroma_sub_sample(pix_fmt, &shift_h, &shift_v);
-
-    if (shift_h == 1 && shift_v == 1)
-        return cudaVideoChromaFormat_420;
-    else if (shift_h == 1 && shift_v == 0)
-        return cudaVideoChromaFormat_422;
-    else if (shift_h == 0 && shift_v == 0)
-        return cudaVideoChromaFormat_444;
-
-    return -1;
-}
-
-static void cuvid_decoder_free(void *opaque, uint8_t *data)
-{
-    CUVIDDecoder *decoder = (CUVIDDecoder*)data;
-
-    if (decoder->decoder)
-        cuvidDestroyDecoder(decoder->decoder);
-
-    av_buffer_unref(&decoder->hw_device_ref);
-
-    av_freep(&decoder);
-}
-
-static int cuvid_decoder_create(AVBufferRef **out, AVBufferRef *hw_device_ref,
-                                CUVIDDECODECREATEINFO *params, void *logctx)
-{
-    AVHWDeviceContext  *hw_device_ctx = (AVHWDeviceContext*)hw_device_ref->data;
-    AVCUDADeviceContext *device_hwctx = hw_device_ctx->hwctx;
-
-    AVBufferRef *decoder_ref;
-    CUVIDDecoder *decoder;
-
-    CUcontext dummy;
-    CUresult err;
-    int ret;
-
-    decoder = av_mallocz(sizeof(*decoder));
-    if (!decoder)
-        return AVERROR(ENOMEM);
-
-    decoder_ref = av_buffer_create((uint8_t*)decoder, sizeof(*decoder),
-                                   cuvid_decoder_free, NULL, AV_BUFFER_FLAG_READONLY);
-    if (!decoder_ref) {
-        av_freep(&decoder);
-        return AVERROR(ENOMEM);
-    }
-
-    decoder->hw_device_ref = av_buffer_ref(hw_device_ref);
-    if (!decoder->hw_device_ref) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-    decoder->cuda_ctx = device_hwctx->cuda_ctx;
-
-    err = cuCtxPushCurrent(decoder->cuda_ctx);
-    if (err != CUDA_SUCCESS) {
-        ret = AVERROR_UNKNOWN;
-        goto fail;
-    }
-
-    err = cuvidCreateDecoder(&decoder->decoder, params);
-
-    cuCtxPopCurrent(&dummy);
-
-    if (err != CUDA_SUCCESS) {
-        av_log(logctx, AV_LOG_ERROR, "Error creating a CUVID decoder: %d\n", err);
-        ret = AVERROR_UNKNOWN;
-        goto fail;
-    }
-
-    *out = decoder_ref;
-
-    return 0;
-fail:
-    av_buffer_unref(&decoder_ref);
-    return ret;
-}
-
-static AVBufferRef *cuvid_decoder_frame_alloc(void *opaque, int size)
-{
-    CUVIDFramePool *pool = opaque;
-    AVBufferRef *ret;
-
-    if (pool->nb_allocated >= pool->dpb_size)
-        return NULL;
-
-    ret = av_buffer_alloc(sizeof(unsigned int));
-    if (!ret)
-        return NULL;
-
-    *(unsigned int*)ret->data = pool->nb_allocated++;
-
-    return ret;
-}
-
-int ff_cuvid_decode_uninit(AVCodecContext *avctx)
-{
-    CUVIDContext *ctx = avctx->internal->hwaccel_priv_data;
-
-    av_freep(&ctx->bitstream);
-    ctx->bitstream_len       = 0;
-    ctx->bitstream_allocated = 0;
-
-    av_freep(&ctx->slice_offsets);
-    ctx->nb_slices               = 0;
-    ctx->slice_offsets_allocated = 0;
-
-    av_buffer_unref(&ctx->decoder_ref);
-    av_buffer_pool_uninit(&ctx->decoder_pool);
-
-    return 0;
-}
-
-int ff_cuvid_decode_init(AVCodecContext *avctx, unsigned int dpb_size)
-{
-    CUVIDContext *ctx = avctx->internal->hwaccel_priv_data;
-
-    CUVIDFramePool      *pool;
-    AVHWFramesContext   *frames_ctx;
-    const AVPixFmtDescriptor *sw_desc;
-
-    CUVIDDECODECREATEINFO params = { 0 };
-
-    int cuvid_codec_type, cuvid_chroma_format;
-    int ret = 0;
-
-    sw_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
-    if (!sw_desc)
-        return AVERROR_BUG;
-
-    cuvid_codec_type = map_avcodec_id(avctx->codec_id);
-    if (cuvid_codec_type < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported codec ID\n");
-        return AVERROR_BUG;
-    }
-
-    cuvid_chroma_format = map_chroma_format(avctx->sw_pix_fmt);
-    if (cuvid_chroma_format < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported chroma format\n");
-        return AVERROR(ENOSYS);
-    }
-
-    if (avctx->thread_type & FF_THREAD_FRAME)
-        dpb_size += avctx->thread_count;
-
-    if (!avctx->hw_frames_ctx) {
-        AVHWFramesContext *frames_ctx;
-
-        if (!avctx->hw_device_ctx) {
-            av_log(avctx, AV_LOG_ERROR, "A hardware device or frames context "
-                   "is required for CUVID decoding.\n");
-            return AVERROR(EINVAL);
-        }
-
-        avctx->hw_frames_ctx = av_hwframe_ctx_alloc(avctx->hw_device_ctx);
-        if (!avctx->hw_frames_ctx)
-            return AVERROR(ENOMEM);
-        frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-
-        frames_ctx->format            = AV_PIX_FMT_CUDA;
-        frames_ctx->width             = avctx->coded_width;
-        frames_ctx->height            = avctx->coded_height;
-        frames_ctx->sw_format         = AV_PIX_FMT_NV12;
-        frames_ctx->sw_format         = sw_desc->comp[0].depth > 8 && HAVE_CUVIDDECODECREATEINFO_BITDEPTHMINUS8 ?
-                                        AV_PIX_FMT_P010 : AV_PIX_FMT_NV12;
-        frames_ctx->initial_pool_size = dpb_size;
-
-        ret = av_hwframe_ctx_init(avctx->hw_frames_ctx);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error initializing internal frames context\n");
-            return ret;
-        }
-    }
-    frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-
-    params.ulWidth             = avctx->coded_width;
-    params.ulHeight            = avctx->coded_height;
-    params.ulTargetWidth       = avctx->coded_width;
-    params.ulTargetHeight      = avctx->coded_height;
-#if HAVE_CUVIDDECODECREATEINFO_BITDEPTHMINUS8
-    params.bitDepthMinus8      = sw_desc->comp[0].depth - 8;
-    params.OutputFormat        = params.bitDepthMinus8 ?
-                                 cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12;
-#else
-    params.OutputFormat        = cudaVideoSurfaceFormat_NV12;
-#endif
-    params.CodecType           = cuvid_codec_type;
-    params.ChromaFormat        = cuvid_chroma_format;
-    params.ulNumDecodeSurfaces = dpb_size;
-    params.ulNumOutputSurfaces = 1;
-
-    ret = cuvid_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, &params, avctx);
-    if (ret < 0)
-        return ret;
-
-    pool = av_mallocz(sizeof(*pool));
-    if (!pool) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-    pool->dpb_size = dpb_size;
-
-    ctx->decoder_pool = av_buffer_pool_init2(sizeof(int), pool,
-                                             cuvid_decoder_frame_alloc, av_free);
-    if (!ctx->decoder_pool) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-
-    return 0;
-fail:
-    ff_cuvid_decode_uninit(avctx);
-    return ret;
-}
-
-static void cuvid_fdd_priv_free(void *priv)
-{
-    CUVIDFrame *cf = priv;
-
-    if (!cf)
-        return;
-
-    av_buffer_unref(&cf->idx_ref);
-    av_buffer_unref(&cf->decoder_ref);
-
-    av_freep(&priv);
-}
-
-static int cuvid_retrieve_data(void *logctx, AVFrame *frame)
-{
-    FrameDecodeData  *fdd = (FrameDecodeData*)frame->opaque_ref->data;
-    CUVIDFrame        *cf = (CUVIDFrame*)fdd->hwaccel_priv;
-    CUVIDDecoder *decoder = (CUVIDDecoder*)cf->decoder_ref->data;
-
-    CUVIDPROCPARAMS vpp = { .progressive_frame = 1 };
-
-    CUresult err;
-    CUcontext dummy;
-    CUdeviceptr devptr;
-
-    unsigned int pitch, i;
-    unsigned int offset = 0;
-    int ret = 0;
-
-    err = cuCtxPushCurrent(decoder->cuda_ctx);
-    if (err != CUDA_SUCCESS)
-        return AVERROR_UNKNOWN;
-
-    err = cuvidMapVideoFrame(decoder->decoder, cf->idx, &devptr, &pitch, &vpp);
-    if (err != CUDA_SUCCESS) {
-        av_log(logctx, AV_LOG_ERROR, "Error mapping a picture with CUVID: %d\n",
-               err);
-        ret = AVERROR_UNKNOWN;
-        goto finish;
-    }
-
-    for (i = 0; frame->data[i]; i++) {
-        CUDA_MEMCPY2D cpy = {
-            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
-            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
-            .srcDevice     = devptr,
-            .dstDevice     = (CUdeviceptr)frame->data[i],
-            .srcPitch      = pitch,
-            .dstPitch      = frame->linesize[i],
-            .srcY          = offset,
-            .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
-            .Height        = frame->height >> (i ? 1 : 0),
-        };
-
-        err = cuMemcpy2D(&cpy);
-        if (err != CUDA_SUCCESS) {
-            av_log(logctx, AV_LOG_ERROR, "Error copying decoded frame: %d\n",
-                   err);
-            ret = AVERROR_UNKNOWN;
-            goto copy_fail;
-        }
-
-        offset += cpy.Height;
-    }
-
-copy_fail:
-    cuvidUnmapVideoFrame(decoder->decoder, devptr);
-
-finish:
-    cuCtxPopCurrent(&dummy);
-    return ret;
-}
-
-int ff_cuvid_start_frame(AVCodecContext *avctx, AVFrame *frame)
-{
-    CUVIDContext *ctx = avctx->internal->hwaccel_priv_data;
-    FrameDecodeData *fdd = (FrameDecodeData*)frame->opaque_ref->data;
-    CUVIDFrame *cf = NULL;
-    int ret;
-
-    ctx->bitstream_len = 0;
-    ctx->nb_slices     = 0;
-
-    if (fdd->hwaccel_priv)
-        return 0;
-
-    cf = av_mallocz(sizeof(*cf));
-    if (!cf)
-        return AVERROR(ENOMEM);
-
-    cf->decoder_ref = av_buffer_ref(ctx->decoder_ref);
-    if (!cf->decoder_ref)
-        goto fail;
-
-    cf->idx_ref = av_buffer_pool_get(ctx->decoder_pool);
-    if (!cf->idx_ref) {
-        av_log(avctx, AV_LOG_ERROR, "No decoder surfaces left\n");
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-    cf->idx = *(unsigned int*)cf->idx_ref->data;
-
-    fdd->hwaccel_priv      = cf;
-    fdd->hwaccel_priv_free = cuvid_fdd_priv_free;
-    fdd->post_process      = cuvid_retrieve_data;
-
-    return 0;
-fail:
-    cuvid_fdd_priv_free(cf);
-    return ret;
-
-}
-
-int ff_cuvid_end_frame(AVCodecContext *avctx)
-{
-    CUVIDContext     *ctx = avctx->internal->hwaccel_priv_data;
-    CUVIDDecoder *decoder = (CUVIDDecoder*)ctx->decoder_ref->data;
-    CUVIDPICPARAMS    *pp = &ctx->pic_params;
-
-    CUresult err;
-    CUcontext dummy;
-
-    int ret = 0;
-
-    pp->nBitstreamDataLen = ctx->bitstream_len;
-    pp->pBitstreamData    = ctx->bitstream;
-    pp->nNumSlices        = ctx->nb_slices;
-    pp->pSliceDataOffsets = ctx->slice_offsets;
-
-    err = cuCtxPushCurrent(decoder->cuda_ctx);
-    if (err != CUDA_SUCCESS)
-        return AVERROR_UNKNOWN;
-
-    err = cuvidDecodePicture(decoder->decoder, &ctx->pic_params);
-    if (err != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Error decoding a picture with CUVID: %d\n",
-               err);
-        ret = AVERROR_UNKNOWN;
-        goto finish;
-    }
-
-finish:
-    cuCtxPopCurrent(&dummy);
-
-    return ret;
-}
diff --git a/libavcodec/cuvid.h b/libavcodec/cuvid.h
deleted file mode 100644
index 62e376b..0000000
--- a/libavcodec/cuvid.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * HW decode acceleration through CUVID
- *
- * Copyright (c) 2016 Anton Khirnov
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_CUVID_H
-#define AVCODEC_CUVID_H
-
-#include <cuviddec.h>
-#include <stdint.h>
-
-#include "libavutil/buffer.h"
-#include "libavutil/frame.h"
-
-#include "avcodec.h"
-
-typedef struct CUVIDFrame {
-    unsigned int idx;
-    AVBufferRef *idx_ref;
-    AVBufferRef *decoder_ref;
-} CUVIDFrame;
-
-typedef struct CUVIDContext {
-    CUVIDPICPARAMS pic_params;
-
-    AVBufferPool *decoder_pool;
-
-    AVBufferRef  *decoder_ref;
-
-    uint8_t      *bitstream;
-    int           bitstream_len;
-    unsigned int  bitstream_allocated;
-
-    unsigned     *slice_offsets;
-    int           nb_slices;
-    unsigned int  slice_offsets_allocated;
-} CUVIDContext;
-
-int ff_cuvid_decode_init(AVCodecContext *avctx, unsigned int dpb_size);
-int ff_cuvid_decode_uninit(AVCodecContext *avctx);
-int ff_cuvid_start_frame(AVCodecContext *avctx, AVFrame *frame);
-int ff_cuvid_end_frame(AVCodecContext *avctx);
-
-#endif /* AVCODEC_CUVID_H */
diff --git a/libavcodec/cuviddec.c b/libavcodec/cuviddec.c
new file mode 100644
index 0000000..291bb93
--- /dev/null
+++ b/libavcodec/cuviddec.c
@@ -0,0 +1,1200 @@
+/*
+ * Nvidia CUVID decoder
+ * Copyright (c) 2016 Timo Rothenpieler <timo@rothenpieler.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "compat/cuda/dynlink_loader.h"
+
+#include "libavutil/buffer.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/fifo.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avcodec.h"
+#include "decode.h"
+#include "hwaccel.h"
+#include "nvdec.h"
+#include "internal.h"
+
+#if !NVDECAPI_CHECK_VERSION(9, 0)
+#define cudaVideoSurfaceFormat_YUV444 2
+#define cudaVideoSurfaceFormat_YUV444_16Bit 3
+#endif
+
+typedef struct CuvidContext
+{
+    AVClass *avclass;
+
+    CUvideodecoder cudecoder;
+    CUvideoparser cuparser;
+
+    char *cu_gpu;
+    int nb_surfaces;
+    int drop_second_field;
+    char *crop_expr;
+    char *resize_expr;
+
+    struct {
+        int left;
+        int top;
+        int right;
+        int bottom;
+    } crop;
+
+    struct {
+        int width;
+        int height;
+    } resize;
+
+    AVBufferRef *hwdevice;
+    AVBufferRef *hwframe;
+
+    AVBSFContext *bsf;
+
+    AVFifoBuffer *frame_queue;
+
+    int deint_mode;
+    int deint_mode_current;
+    int64_t prev_pts;
+
+    int internal_error;
+    int decoder_flushing;
+
+    int *key_frame;
+
+    cudaVideoCodec codec_type;
+    cudaVideoChromaFormat chroma_format;
+
+    CUVIDDECODECAPS caps8, caps10, caps12;
+
+    CUVIDPARSERPARAMS cuparseinfo;
+    CUVIDEOFORMATEX cuparse_ext;
+
+    CudaFunctions *cudl;
+    CuvidFunctions *cvdl;
+} CuvidContext;
+
+typedef struct CuvidParsedFrame
+{
+    CUVIDPARSERDISPINFO dispinfo;
+    int second_field;
+    int is_deinterlacing;
+} CuvidParsedFrame;
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(avctx, ctx->cudl, x)
+
+static int CUDAAPI cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* format)
+{
+    AVCodecContext *avctx = opaque;
+    CuvidContext *ctx = avctx->priv_data;
+    AVHWFramesContext *hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
+    CUVIDDECODECAPS *caps = NULL;
+    CUVIDDECODECREATEINFO cuinfo;
+    int surface_fmt;
+    int chroma_444;
+
+    int old_width = avctx->width;
+    int old_height = avctx->height;
+
+    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_CUDA,
+                                       AV_PIX_FMT_NONE,  // Will be updated below
+                                       AV_PIX_FMT_NONE };
+
+    av_log(avctx, AV_LOG_TRACE, "pfnSequenceCallback, progressive_sequence=%d\n", format->progressive_sequence);
+
+    memset(&cuinfo, 0, sizeof(cuinfo));
+
+    ctx->internal_error = 0;
+
+    avctx->coded_width = cuinfo.ulWidth = format->coded_width;
+    avctx->coded_height = cuinfo.ulHeight = format->coded_height;
+
+    // apply cropping
+    cuinfo.display_area.left = format->display_area.left + ctx->crop.left;
+    cuinfo.display_area.top = format->display_area.top + ctx->crop.top;
+    cuinfo.display_area.right = format->display_area.right - ctx->crop.right;
+    cuinfo.display_area.bottom = format->display_area.bottom - ctx->crop.bottom;
+
+    // width and height need to be set before calling ff_get_format
+    if (ctx->resize_expr) {
+        avctx->width = ctx->resize.width;
+        avctx->height = ctx->resize.height;
+    } else {
+        avctx->width = cuinfo.display_area.right - cuinfo.display_area.left;
+        avctx->height = cuinfo.display_area.bottom - cuinfo.display_area.top;
+    }
+
+    // target width/height need to be multiples of two
+    cuinfo.ulTargetWidth = avctx->width = (avctx->width + 1) & ~1;
+    cuinfo.ulTargetHeight = avctx->height = (avctx->height + 1) & ~1;
+
+    // aspect ratio conversion, 1:1, depends on scaled resolution
+    cuinfo.target_rect.left = 0;
+    cuinfo.target_rect.top = 0;
+    cuinfo.target_rect.right = cuinfo.ulTargetWidth;
+    cuinfo.target_rect.bottom = cuinfo.ulTargetHeight;
+
+    chroma_444 = format->chroma_format == cudaVideoChromaFormat_444;
+
+    switch (format->bit_depth_luma_minus8) {
+    case 0: // 8-bit
+        pix_fmts[1] = chroma_444 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_NV12;
+        caps = &ctx->caps8;
+        break;
+    case 2: // 10-bit
+        pix_fmts[1] = chroma_444 ? AV_PIX_FMT_YUV444P16 : AV_PIX_FMT_P010;
+        caps = &ctx->caps10;
+        break;
+    case 4: // 12-bit
+        pix_fmts[1] = chroma_444 ? AV_PIX_FMT_YUV444P16 : AV_PIX_FMT_P016;
+        caps = &ctx->caps12;
+        break;
+    default:
+        break;
+    }
+
+    if (!caps || !caps->bIsSupported) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported bit depth: %d\n",
+               format->bit_depth_luma_minus8 + 8);
+        ctx->internal_error = AVERROR(EINVAL);
+        return 0;
+    }
+
+    surface_fmt = ff_get_format(avctx, pix_fmts);
+    if (surface_fmt < 0) {
+        av_log(avctx, AV_LOG_ERROR, "ff_get_format failed: %d\n", surface_fmt);
+        ctx->internal_error = AVERROR(EINVAL);
+        return 0;
+    }
+
+    av_log(avctx, AV_LOG_VERBOSE, "Formats: Original: %s | HW: %s | SW: %s\n",
+           av_get_pix_fmt_name(avctx->pix_fmt),
+           av_get_pix_fmt_name(surface_fmt),
+           av_get_pix_fmt_name(avctx->sw_pix_fmt));
+
+    avctx->pix_fmt = surface_fmt;
+
+    // Update our hwframe ctx, as the get_format callback might have refreshed it!
+    if (avctx->hw_frames_ctx) {
+        av_buffer_unref(&ctx->hwframe);
+
+        ctx->hwframe = av_buffer_ref(avctx->hw_frames_ctx);
+        if (!ctx->hwframe) {
+            ctx->internal_error = AVERROR(ENOMEM);
+            return 0;
+        }
+
+        hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
+    }
+
+    ff_set_sar(avctx, av_div_q(
+        (AVRational){ format->display_aspect_ratio.x, format->display_aspect_ratio.y },
+        (AVRational){ avctx->width, avctx->height }));
+
+    ctx->deint_mode_current = format->progressive_sequence
+                              ? cudaVideoDeinterlaceMode_Weave
+                              : ctx->deint_mode;
+
+    if (!format->progressive_sequence && ctx->deint_mode_current == cudaVideoDeinterlaceMode_Weave)
+        avctx->flags |= AV_CODEC_FLAG_INTERLACED_DCT;
+    else
+        avctx->flags &= ~AV_CODEC_FLAG_INTERLACED_DCT;
+
+    if (format->video_signal_description.video_full_range_flag)
+        avctx->color_range = AVCOL_RANGE_JPEG;
+    else
+        avctx->color_range = AVCOL_RANGE_MPEG;
+
+    avctx->color_primaries = format->video_signal_description.color_primaries;
+    avctx->color_trc = format->video_signal_description.transfer_characteristics;
+    avctx->colorspace = format->video_signal_description.matrix_coefficients;
+
+    if (format->bitrate)
+        avctx->bit_rate = format->bitrate;
+
+    if (format->frame_rate.numerator && format->frame_rate.denominator) {
+        avctx->framerate.num = format->frame_rate.numerator;
+        avctx->framerate.den = format->frame_rate.denominator;
+    }
+
+    if (ctx->cudecoder
+            && avctx->coded_width == format->coded_width
+            && avctx->coded_height == format->coded_height
+            && avctx->width == old_width
+            && avctx->height == old_height
+            && ctx->chroma_format == format->chroma_format
+            && ctx->codec_type == format->codec)
+        return 1;
+
+    if (ctx->cudecoder) {
+        av_log(avctx, AV_LOG_TRACE, "Re-initializing decoder\n");
+        ctx->internal_error = CHECK_CU(ctx->cvdl->cuvidDestroyDecoder(ctx->cudecoder));
+        if (ctx->internal_error < 0)
+            return 0;
+        ctx->cudecoder = NULL;
+    }
+
+    if (hwframe_ctx->pool && (
+            hwframe_ctx->width < avctx->width ||
+            hwframe_ctx->height < avctx->height ||
+            hwframe_ctx->format != AV_PIX_FMT_CUDA ||
+            hwframe_ctx->sw_format != avctx->sw_pix_fmt)) {
+        av_log(avctx, AV_LOG_ERROR, "AVHWFramesContext is already initialized with incompatible parameters\n");
+        av_log(avctx, AV_LOG_DEBUG, "width: %d <-> %d\n", hwframe_ctx->width, avctx->width);
+        av_log(avctx, AV_LOG_DEBUG, "height: %d <-> %d\n", hwframe_ctx->height, avctx->height);
+        av_log(avctx, AV_LOG_DEBUG, "format: %s <-> cuda\n", av_get_pix_fmt_name(hwframe_ctx->format));
+        av_log(avctx, AV_LOG_DEBUG, "sw_format: %s <-> %s\n",
+               av_get_pix_fmt_name(hwframe_ctx->sw_format), av_get_pix_fmt_name(avctx->sw_pix_fmt));
+        ctx->internal_error = AVERROR(EINVAL);
+        return 0;
+    }
+
+    ctx->chroma_format = format->chroma_format;
+
+    cuinfo.CodecType = ctx->codec_type = format->codec;
+    cuinfo.ChromaFormat = format->chroma_format;
+
+    switch (avctx->sw_pix_fmt) {
+    case AV_PIX_FMT_NV12:
+        cuinfo.OutputFormat = cudaVideoSurfaceFormat_NV12;
+        break;
+    case AV_PIX_FMT_P010:
+    case AV_PIX_FMT_P016:
+        cuinfo.OutputFormat = cudaVideoSurfaceFormat_P016;
+        break;
+    case AV_PIX_FMT_YUV444P:
+        cuinfo.OutputFormat = cudaVideoSurfaceFormat_YUV444;
+        break;
+    case AV_PIX_FMT_YUV444P16:
+        cuinfo.OutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
+               av_get_pix_fmt_name(avctx->sw_pix_fmt));
+        ctx->internal_error = AVERROR(EINVAL);
+        return 0;
+    }
+
+    cuinfo.ulNumDecodeSurfaces = ctx->nb_surfaces;
+    cuinfo.ulNumOutputSurfaces = 1;
+    cuinfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    cuinfo.bitDepthMinus8 = format->bit_depth_luma_minus8;
+    cuinfo.DeinterlaceMode = ctx->deint_mode_current;
+
+    if (ctx->deint_mode_current != cudaVideoDeinterlaceMode_Weave && !ctx->drop_second_field)
+        avctx->framerate = av_mul_q(avctx->framerate, (AVRational){2, 1});
+
+    ctx->internal_error = CHECK_CU(ctx->cvdl->cuvidCreateDecoder(&ctx->cudecoder, &cuinfo));
+    if (ctx->internal_error < 0)
+        return 0;
+
+    if (!hwframe_ctx->pool) {
+        hwframe_ctx->format = AV_PIX_FMT_CUDA;
+        hwframe_ctx->sw_format = avctx->sw_pix_fmt;
+        hwframe_ctx->width = avctx->width;
+        hwframe_ctx->height = avctx->height;
+
+        if ((ctx->internal_error = av_hwframe_ctx_init(ctx->hwframe)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_hwframe_ctx_init failed\n");
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+static int CUDAAPI cuvid_handle_picture_decode(void *opaque, CUVIDPICPARAMS* picparams)
+{
+    AVCodecContext *avctx = opaque;
+    CuvidContext *ctx = avctx->priv_data;
+
+    av_log(avctx, AV_LOG_TRACE, "pfnDecodePicture\n");
+
+    ctx->key_frame[picparams->CurrPicIdx] = picparams->intra_pic_flag;
+
+    ctx->internal_error = CHECK_CU(ctx->cvdl->cuvidDecodePicture(ctx->cudecoder, picparams));
+    if (ctx->internal_error < 0)
+        return 0;
+
+    return 1;
+}
+
+static int CUDAAPI cuvid_handle_picture_display(void *opaque, CUVIDPARSERDISPINFO* dispinfo)
+{
+    AVCodecContext *avctx = opaque;
+    CuvidContext *ctx = avctx->priv_data;
+    CuvidParsedFrame parsed_frame = { { 0 } };
+
+    parsed_frame.dispinfo = *dispinfo;
+    ctx->internal_error = 0;
+
+    if (ctx->deint_mode_current == cudaVideoDeinterlaceMode_Weave) {
+        av_fifo_generic_write(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL);
+    } else {
+        parsed_frame.is_deinterlacing = 1;
+        av_fifo_generic_write(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL);
+        if (!ctx->drop_second_field) {
+            parsed_frame.second_field = 1;
+            av_fifo_generic_write(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL);
+        }
+    }
+
+    return 1;
+}
+
+static int cuvid_is_buffer_full(AVCodecContext *avctx)
+{
+    CuvidContext *ctx = avctx->priv_data;
+
+    int delay = ctx->cuparseinfo.ulMaxDisplayDelay;
+    if (ctx->deint_mode != cudaVideoDeinterlaceMode_Weave && !ctx->drop_second_field)
+        delay *= 2;
+
+    return (av_fifo_size(ctx->frame_queue) / sizeof(CuvidParsedFrame)) + delay >= ctx->nb_surfaces;
+}
+
+static int cuvid_decode_packet(AVCodecContext *avctx, const AVPacket *avpkt)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
+    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CUVIDSOURCEDATAPACKET cupkt;
+    AVPacket filter_packet = { 0 };
+    AVPacket filtered_packet = { 0 };
+    int ret = 0, eret = 0, is_flush = ctx->decoder_flushing;
+
+    av_log(avctx, AV_LOG_TRACE, "cuvid_decode_packet\n");
+
+    if (is_flush && avpkt && avpkt->size)
+        return AVERROR_EOF;
+
+    if (cuvid_is_buffer_full(avctx) && avpkt && avpkt->size)
+        return AVERROR(EAGAIN);
+
+    if (ctx->bsf && avpkt && avpkt->size) {
+        if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n");
+            return ret;
+        }
+
+        if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n");
+            av_packet_unref(&filter_packet);
+            return ret;
+        }
+
+        if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n");
+            return ret;
+        }
+
+        avpkt = &filtered_packet;
+    }
+
+    ret = CHECK_CU(ctx->cudl->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0) {
+        av_packet_unref(&filtered_packet);
+        return ret;
+    }
+
+    memset(&cupkt, 0, sizeof(cupkt));
+
+    if (avpkt && avpkt->size) {
+        cupkt.payload_size = avpkt->size;
+        cupkt.payload = avpkt->data;
+
+        if (avpkt->pts != AV_NOPTS_VALUE) {
+            cupkt.flags = CUVID_PKT_TIMESTAMP;
+            if (avctx->pkt_timebase.num && avctx->pkt_timebase.den)
+                cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->pkt_timebase, (AVRational){1, 10000000});
+            else
+                cupkt.timestamp = avpkt->pts;
+        }
+    } else {
+        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
+        ctx->decoder_flushing = 1;
+    }
+
+    ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &cupkt));
+
+    av_packet_unref(&filtered_packet);
+
+    if (ret < 0)
+        goto error;
+
+    // cuvidParseVideoData doesn't return an error just because stuff failed...
+    if (ctx->internal_error) {
+        av_log(avctx, AV_LOG_ERROR, "cuvid decode callback error\n");
+        ret = ctx->internal_error;
+        goto error;
+    }
+
+error:
+    eret = CHECK_CU(ctx->cudl->cuCtxPopCurrent(&dummy));
+
+    if (eret < 0)
+        return eret;
+    else if (ret < 0)
+        return ret;
+    else if (is_flush)
+        return AVERROR_EOF;
+    else
+        return 0;
+}
+
+static int cuvid_output_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
+    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CUdeviceptr mapped_frame = 0;
+    int ret = 0, eret = 0;
+
+    av_log(avctx, AV_LOG_TRACE, "cuvid_output_frame\n");
+
+    if (ctx->decoder_flushing) {
+        ret = cuvid_decode_packet(avctx, NULL);
+        if (ret < 0 && ret != AVERROR_EOF)
+            return ret;
+    }
+
+    if (!cuvid_is_buffer_full(avctx)) {
+        AVPacket pkt = {0};
+        ret = ff_decode_get_packet(avctx, &pkt);
+        if (ret < 0 && ret != AVERROR_EOF)
+            return ret;
+        ret = cuvid_decode_packet(avctx, &pkt);
+        av_packet_unref(&pkt);
+        // cuvid_is_buffer_full() should avoid this.
+        if (ret == AVERROR(EAGAIN))
+            ret = AVERROR_EXTERNAL;
+        if (ret < 0 && ret != AVERROR_EOF)
+            return ret;
+    }
+
+    ret = CHECK_CU(ctx->cudl->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    if (av_fifo_size(ctx->frame_queue)) {
+        const AVPixFmtDescriptor *pixdesc;
+        CuvidParsedFrame parsed_frame;
+        CUVIDPROCPARAMS params;
+        unsigned int pitch = 0;
+        int offset = 0;
+        int i;
+
+        av_fifo_generic_read(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL);
+
+        memset(&params, 0, sizeof(params));
+        params.progressive_frame = parsed_frame.dispinfo.progressive_frame;
+        params.second_field = parsed_frame.second_field;
+        params.top_field_first = parsed_frame.dispinfo.top_field_first;
+
+        ret = CHECK_CU(ctx->cvdl->cuvidMapVideoFrame(ctx->cudecoder, parsed_frame.dispinfo.picture_index, &mapped_frame, &pitch, &params));
+        if (ret < 0)
+            goto error;
+
+        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
+                goto error;
+            }
+
+            ret = ff_decode_frame_props(avctx, frame);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
+                goto error;
+            }
+
+            pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+
+            for (i = 0; i < pixdesc->nb_components; i++) {
+                int height = avctx->height >> (i ? pixdesc->log2_chroma_h : 0);
+                CUDA_MEMCPY2D cpy = {
+                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
+                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+                    .srcDevice     = mapped_frame,
+                    .dstDevice     = (CUdeviceptr)frame->data[i],
+                    .srcPitch      = pitch,
+                    .dstPitch      = frame->linesize[i],
+                    .srcY          = offset,
+                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
+                    .Height        = height,
+                };
+
+                ret = CHECK_CU(ctx->cudl->cuMemcpy2DAsync(&cpy, device_hwctx->stream));
+                if (ret < 0)
+                    goto error;
+
+                offset += height;
+            }
+
+            ret = CHECK_CU(ctx->cudl->cuStreamSynchronize(device_hwctx->stream));
+            if (ret < 0)
+                goto error;
+        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12      ||
+                   avctx->pix_fmt == AV_PIX_FMT_P010      ||
+                   avctx->pix_fmt == AV_PIX_FMT_P016      ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P   ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P16) {
+            unsigned int offset = 0;
+            AVFrame *tmp_frame = av_frame_alloc();
+            if (!tmp_frame) {
+                av_log(avctx, AV_LOG_ERROR, "av_frame_alloc failed\n");
+                ret = AVERROR(ENOMEM);
+                goto error;
+            }
+
+            pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+
+            tmp_frame->format        = AV_PIX_FMT_CUDA;
+            tmp_frame->hw_frames_ctx = av_buffer_ref(ctx->hwframe);
+            tmp_frame->width         = avctx->width;
+            tmp_frame->height        = avctx->height;
+
+            /*
+             * Note that the following logic would not work for three plane
+             * YUV420 because the pitch value is different for the chroma
+             * planes.
+             */
+            for (i = 0; i < pixdesc->nb_components; i++) {
+                tmp_frame->data[i]     = (uint8_t*)mapped_frame + offset;
+                tmp_frame->linesize[i] = pitch;
+                offset += pitch * (avctx->height >> (i ? pixdesc->log2_chroma_h : 0));
+            }
+
+            ret = ff_get_buffer(avctx, frame, 0);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "ff_get_buffer failed\n");
+                av_frame_free(&tmp_frame);
+                goto error;
+            }
+
+            ret = av_hwframe_transfer_data(frame, tmp_frame, 0);
+            if (ret) {
+                av_log(avctx, AV_LOG_ERROR, "av_hwframe_transfer_data failed\n");
+                av_frame_free(&tmp_frame);
+                goto error;
+            }
+            av_frame_free(&tmp_frame);
+        } else {
+            ret = AVERROR_BUG;
+            goto error;
+        }
+
+        frame->key_frame = ctx->key_frame[parsed_frame.dispinfo.picture_index];
+        frame->width = avctx->width;
+        frame->height = avctx->height;
+        if (avctx->pkt_timebase.num && avctx->pkt_timebase.den)
+            frame->pts = av_rescale_q(parsed_frame.dispinfo.timestamp, (AVRational){1, 10000000}, avctx->pkt_timebase);
+        else
+            frame->pts = parsed_frame.dispinfo.timestamp;
+
+        if (parsed_frame.second_field) {
+            if (ctx->prev_pts == INT64_MIN) {
+                ctx->prev_pts = frame->pts;
+                frame->pts += (avctx->pkt_timebase.den * avctx->framerate.den) / (avctx->pkt_timebase.num * avctx->framerate.num);
+            } else {
+                int pts_diff = (frame->pts - ctx->prev_pts) / 2;
+                ctx->prev_pts = frame->pts;
+                frame->pts += pts_diff;
+            }
+        }
+
+        /* CUVIDs opaque reordering breaks the internal pkt logic.
+         * So set pkt_pts and clear all the other pkt_ fields.
+         */
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+        frame->pkt_pts = frame->pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+        frame->pkt_pos = -1;
+        frame->pkt_duration = 0;
+        frame->pkt_size = -1;
+
+        frame->interlaced_frame = !parsed_frame.is_deinterlacing && !parsed_frame.dispinfo.progressive_frame;
+
+        if (frame->interlaced_frame)
+            frame->top_field_first = parsed_frame.dispinfo.top_field_first;
+    } else if (ctx->decoder_flushing) {
+        ret = AVERROR_EOF;
+    } else {
+        ret = AVERROR(EAGAIN);
+    }
+
+error:
+    if (mapped_frame)
+        eret = CHECK_CU(ctx->cvdl->cuvidUnmapVideoFrame(ctx->cudecoder, mapped_frame));
+
+    eret = CHECK_CU(ctx->cudl->cuCtxPopCurrent(&dummy));
+
+    if (eret < 0)
+        return eret;
+    else
+        return ret;
+}
+
+static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret = 0;
+
+    av_log(avctx, AV_LOG_TRACE, "cuvid_decode_frame\n");
+
+    if (ctx->deint_mode_current != cudaVideoDeinterlaceMode_Weave) {
+        av_log(avctx, AV_LOG_ERROR, "Deinterlacing is not supported via the old API\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!ctx->decoder_flushing) {
+        ret = cuvid_decode_packet(avctx, avpkt);
+        if (ret < 0)
+            return ret;
+    }
+
+    ret = cuvid_output_frame(avctx, frame);
+    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
+        *got_frame = 0;
+    } else if (ret < 0) {
+        return ret;
+    } else {
+        *got_frame = 1;
+    }
+
+    return 0;
+}
+
+static av_cold int cuvid_decode_end(AVCodecContext *avctx)
+{
+    CuvidContext *ctx = avctx->priv_data;
+
+    av_fifo_freep(&ctx->frame_queue);
+
+    if (ctx->bsf)
+        av_bsf_free(&ctx->bsf);
+
+    if (ctx->cuparser)
+        ctx->cvdl->cuvidDestroyVideoParser(ctx->cuparser);
+
+    if (ctx->cudecoder)
+        ctx->cvdl->cuvidDestroyDecoder(ctx->cudecoder);
+
+    ctx->cudl = NULL;
+
+    av_buffer_unref(&ctx->hwframe);
+    av_buffer_unref(&ctx->hwdevice);
+
+    av_freep(&ctx->key_frame);
+
+    cuvid_free_functions(&ctx->cvdl);
+
+    return 0;
+}
+
+static int cuvid_test_capabilities(AVCodecContext *avctx,
+                                   const CUVIDPARSERPARAMS *cuparseinfo,
+                                   int probed_width,
+                                   int probed_height,
+                                   int bit_depth)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    CUVIDDECODECAPS *caps;
+    int res8 = 0, res10 = 0, res12 = 0;
+
+    if (!ctx->cvdl->cuvidGetDecoderCaps) {
+        av_log(avctx, AV_LOG_WARNING, "Used Nvidia driver is too old to perform a capability check.\n");
+        av_log(avctx, AV_LOG_WARNING, "The minimum required version is "
+#if defined(_WIN32) || defined(__CYGWIN__)
+            "378.66"
+#else
+            "378.13"
+#endif
+            ". Continuing blind.\n");
+        ctx->caps8.bIsSupported = ctx->caps10.bIsSupported = 1;
+        // 12 bit was not supported before the capability check was introduced, so disable it.
+        ctx->caps12.bIsSupported = 0;
+        return 0;
+    }
+
+    ctx->caps8.eCodecType = ctx->caps10.eCodecType = ctx->caps12.eCodecType
+        = cuparseinfo->CodecType;
+    ctx->caps8.eChromaFormat = ctx->caps10.eChromaFormat = ctx->caps12.eChromaFormat
+        = cudaVideoChromaFormat_420;
+
+    ctx->caps8.nBitDepthMinus8 = 0;
+    ctx->caps10.nBitDepthMinus8 = 2;
+    ctx->caps12.nBitDepthMinus8 = 4;
+
+    res8 = CHECK_CU(ctx->cvdl->cuvidGetDecoderCaps(&ctx->caps8));
+    res10 = CHECK_CU(ctx->cvdl->cuvidGetDecoderCaps(&ctx->caps10));
+    res12 = CHECK_CU(ctx->cvdl->cuvidGetDecoderCaps(&ctx->caps12));
+
+    av_log(avctx, AV_LOG_VERBOSE, "CUVID capabilities for %s:\n", avctx->codec->name);
+    av_log(avctx, AV_LOG_VERBOSE, "8 bit: supported: %d, min_width: %d, max_width: %d, min_height: %d, max_height: %d\n",
+           ctx->caps8.bIsSupported, ctx->caps8.nMinWidth, ctx->caps8.nMaxWidth, ctx->caps8.nMinHeight, ctx->caps8.nMaxHeight);
+    av_log(avctx, AV_LOG_VERBOSE, "10 bit: supported: %d, min_width: %d, max_width: %d, min_height: %d, max_height: %d\n",
+           ctx->caps10.bIsSupported, ctx->caps10.nMinWidth, ctx->caps10.nMaxWidth, ctx->caps10.nMinHeight, ctx->caps10.nMaxHeight);
+    av_log(avctx, AV_LOG_VERBOSE, "12 bit: supported: %d, min_width: %d, max_width: %d, min_height: %d, max_height: %d\n",
+           ctx->caps12.bIsSupported, ctx->caps12.nMinWidth, ctx->caps12.nMaxWidth, ctx->caps12.nMinHeight, ctx->caps12.nMaxHeight);
+
+    switch (bit_depth) {
+    case 10:
+        caps = &ctx->caps10;
+        if (res10 < 0)
+            return res10;
+        break;
+    case 12:
+        caps = &ctx->caps12;
+        if (res12 < 0)
+            return res12;
+        break;
+    default:
+        caps = &ctx->caps8;
+        if (res8 < 0)
+            return res8;
+    }
+
+    if (!ctx->caps8.bIsSupported) {
+        av_log(avctx, AV_LOG_ERROR, "Codec %s is not supported.\n", avctx->codec->name);
+        return AVERROR(EINVAL);
+    }
+
+    if (!caps->bIsSupported) {
+        av_log(avctx, AV_LOG_ERROR, "Bit depth %d is not supported.\n", bit_depth);
+        return AVERROR(EINVAL);
+    }
+
+    if (probed_width > caps->nMaxWidth || probed_width < caps->nMinWidth) {
+        av_log(avctx, AV_LOG_ERROR, "Video width %d not within range from %d to %d\n",
+               probed_width, caps->nMinWidth, caps->nMaxWidth);
+        return AVERROR(EINVAL);
+    }
+
+    if (probed_height > caps->nMaxHeight || probed_height < caps->nMinHeight) {
+        av_log(avctx, AV_LOG_ERROR, "Video height %d not within range from %d to %d\n",
+               probed_height, caps->nMinHeight, caps->nMaxHeight);
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static av_cold int cuvid_decode_init(AVCodecContext *avctx)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVCUDADeviceContext *device_hwctx;
+    AVHWDeviceContext *device_ctx;
+    AVHWFramesContext *hwframe_ctx;
+    CUVIDSOURCEDATAPACKET seq_pkt;
+    CUcontext cuda_ctx = NULL;
+    CUcontext dummy;
+    const AVBitStreamFilter *bsf;
+    int ret = 0;
+
+    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_CUDA,
+                                       AV_PIX_FMT_NV12,
+                                       AV_PIX_FMT_NONE };
+
+    int probed_width = avctx->coded_width ? avctx->coded_width : 1280;
+    int probed_height = avctx->coded_height ? avctx->coded_height : 720;
+    int probed_bit_depth = 8;
+
+    const AVPixFmtDescriptor *probe_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+    if (probe_desc && probe_desc->nb_components)
+        probed_bit_depth = probe_desc->comp[0].depth;
+
+    // Accelerated transcoding scenarios with 'ffmpeg' require that the
+    // pix_fmt be set to AV_PIX_FMT_CUDA early. The sw_pix_fmt, and the
+    // pix_fmt for non-accelerated transcoding, do not need to be correct
+    // but need to be set to something. We arbitrarily pick NV12.
+    ret = ff_get_format(avctx, pix_fmts);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "ff_get_format failed: %d\n", ret);
+        return ret;
+    }
+    avctx->pix_fmt = ret;
+
+    if (ctx->resize_expr && sscanf(ctx->resize_expr, "%dx%d",
+                                   &ctx->resize.width, &ctx->resize.height) != 2) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid resize expressions\n");
+        ret = AVERROR(EINVAL);
+        goto error;
+    }
+
+    if (ctx->crop_expr && sscanf(ctx->crop_expr, "%dx%dx%dx%d",
+                                 &ctx->crop.top, &ctx->crop.bottom,
+                                 &ctx->crop.left, &ctx->crop.right) != 4) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid cropping expressions\n");
+        ret = AVERROR(EINVAL);
+        goto error;
+    }
+
+    ret = cuvid_load_functions(&ctx->cvdl, avctx);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed loading nvcuvid.\n");
+        goto error;
+    }
+
+    ctx->frame_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(CuvidParsedFrame));
+    if (!ctx->frame_queue) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
+    if (avctx->hw_frames_ctx) {
+        ctx->hwframe = av_buffer_ref(avctx->hw_frames_ctx);
+        if (!ctx->hwframe) {
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+
+        hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
+
+        ctx->hwdevice = av_buffer_ref(hwframe_ctx->device_ref);
+        if (!ctx->hwdevice) {
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+    } else {
+        if (avctx->hw_device_ctx) {
+            ctx->hwdevice = av_buffer_ref(avctx->hw_device_ctx);
+            if (!ctx->hwdevice) {
+                ret = AVERROR(ENOMEM);
+                goto error;
+            }
+        } else {
+            ret = av_hwdevice_ctx_create(&ctx->hwdevice, AV_HWDEVICE_TYPE_CUDA, ctx->cu_gpu, NULL, 0);
+            if (ret < 0)
+                goto error;
+        }
+
+        ctx->hwframe = av_hwframe_ctx_alloc(ctx->hwdevice);
+        if (!ctx->hwframe) {
+            av_log(avctx, AV_LOG_ERROR, "av_hwframe_ctx_alloc failed\n");
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+
+        hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
+    }
+
+    device_ctx = hwframe_ctx->device_ctx;
+    device_hwctx = device_ctx->hwctx;
+
+    cuda_ctx = device_hwctx->cuda_ctx;
+    ctx->cudl = device_hwctx->internal->cuda_dl;
+
+    memset(&ctx->cuparseinfo, 0, sizeof(ctx->cuparseinfo));
+    memset(&ctx->cuparse_ext, 0, sizeof(ctx->cuparse_ext));
+    memset(&seq_pkt, 0, sizeof(seq_pkt));
+
+    ctx->cuparseinfo.pExtVideoInfo = &ctx->cuparse_ext;
+
+    switch (avctx->codec->id) {
+#if CONFIG_H264_CUVID_DECODER
+    case AV_CODEC_ID_H264:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_H264;
+        break;
+#endif
+#if CONFIG_HEVC_CUVID_DECODER
+    case AV_CODEC_ID_HEVC:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_HEVC;
+        break;
+#endif
+#if CONFIG_MJPEG_CUVID_DECODER
+    case AV_CODEC_ID_MJPEG:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_JPEG;
+        break;
+#endif
+#if CONFIG_MPEG1_CUVID_DECODER
+    case AV_CODEC_ID_MPEG1VIDEO:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_MPEG1;
+        break;
+#endif
+#if CONFIG_MPEG2_CUVID_DECODER
+    case AV_CODEC_ID_MPEG2VIDEO:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_MPEG2;
+        break;
+#endif
+#if CONFIG_MPEG4_CUVID_DECODER
+    case AV_CODEC_ID_MPEG4:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_MPEG4;
+        break;
+#endif
+#if CONFIG_VP8_CUVID_DECODER
+    case AV_CODEC_ID_VP8:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_VP8;
+        break;
+#endif
+#if CONFIG_VP9_CUVID_DECODER
+    case AV_CODEC_ID_VP9:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_VP9;
+        break;
+#endif
+#if CONFIG_VC1_CUVID_DECODER
+    case AV_CODEC_ID_VC1:
+        ctx->cuparseinfo.CodecType = cudaVideoCodec_VC1;
+        break;
+#endif
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid CUVID codec!\n");
+        return AVERROR_BUG;
+    }
+
+    if (avctx->codec->id == AV_CODEC_ID_H264 || avctx->codec->id == AV_CODEC_ID_HEVC) {
+        if (avctx->codec->id == AV_CODEC_ID_H264)
+            bsf = av_bsf_get_by_name("h264_mp4toannexb");
+        else
+            bsf = av_bsf_get_by_name("hevc_mp4toannexb");
+
+        if (!bsf) {
+            ret = AVERROR_BSF_NOT_FOUND;
+            goto error;
+        }
+        if (ret = av_bsf_alloc(bsf, &ctx->bsf)) {
+            goto error;
+        }
+        if (((ret = avcodec_parameters_from_context(ctx->bsf->par_in, avctx)) < 0) || ((ret = av_bsf_init(ctx->bsf)) < 0)) {
+            av_bsf_free(&ctx->bsf);
+            goto error;
+        }
+
+        ctx->cuparse_ext.format.seqhdr_data_length = ctx->bsf->par_out->extradata_size;
+        memcpy(ctx->cuparse_ext.raw_seqhdr_data,
+               ctx->bsf->par_out->extradata,
+               FFMIN(sizeof(ctx->cuparse_ext.raw_seqhdr_data), ctx->bsf->par_out->extradata_size));
+    } else if (avctx->extradata_size > 0) {
+        ctx->cuparse_ext.format.seqhdr_data_length = avctx->extradata_size;
+        memcpy(ctx->cuparse_ext.raw_seqhdr_data,
+               avctx->extradata,
+               FFMIN(sizeof(ctx->cuparse_ext.raw_seqhdr_data), avctx->extradata_size));
+    }
+
+    ctx->key_frame = av_mallocz(ctx->nb_surfaces * sizeof(int));
+    if (!ctx->key_frame) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
+    ctx->cuparseinfo.ulMaxNumDecodeSurfaces = ctx->nb_surfaces;
+    ctx->cuparseinfo.ulMaxDisplayDelay = 4;
+    ctx->cuparseinfo.pUserData = avctx;
+    ctx->cuparseinfo.pfnSequenceCallback = cuvid_handle_video_sequence;
+    ctx->cuparseinfo.pfnDecodePicture = cuvid_handle_picture_decode;
+    ctx->cuparseinfo.pfnDisplayPicture = cuvid_handle_picture_display;
+
+    ret = CHECK_CU(ctx->cudl->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        goto error;
+
+    ret = cuvid_test_capabilities(avctx, &ctx->cuparseinfo,
+                                  probed_width,
+                                  probed_height,
+                                  probed_bit_depth);
+    if (ret < 0)
+        goto error;
+
+    ret = CHECK_CU(ctx->cvdl->cuvidCreateVideoParser(&ctx->cuparser, &ctx->cuparseinfo));
+    if (ret < 0)
+        goto error;
+
+    seq_pkt.payload = ctx->cuparse_ext.raw_seqhdr_data;
+    seq_pkt.payload_size = ctx->cuparse_ext.format.seqhdr_data_length;
+
+    if (seq_pkt.payload && seq_pkt.payload_size) {
+        ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &seq_pkt));
+        if (ret < 0)
+            goto error;
+    }
+
+    ret = CHECK_CU(ctx->cudl->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto error;
+
+    ctx->prev_pts = INT64_MIN;
+
+    if (!avctx->pkt_timebase.num || !avctx->pkt_timebase.den)
+        av_log(avctx, AV_LOG_WARNING, "Invalid pkt_timebase, passing timestamps as-is.\n");
+
+    return 0;
+
+error:
+    cuvid_decode_end(avctx);
+    return ret;
+}
+
+static void cuvid_flush(AVCodecContext *avctx)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
+    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    CUVIDSOURCEDATAPACKET seq_pkt = { 0 };
+    int ret;
+
+    ret = CHECK_CU(ctx->cudl->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        goto error;
+
+    av_fifo_freep(&ctx->frame_queue);
+
+    ctx->frame_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(CuvidParsedFrame));
+    if (!ctx->frame_queue) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to recreate frame queue on flush\n");
+        return;
+    }
+
+    if (ctx->cudecoder) {
+        ctx->cvdl->cuvidDestroyDecoder(ctx->cudecoder);
+        ctx->cudecoder = NULL;
+    }
+
+    if (ctx->cuparser) {
+        ctx->cvdl->cuvidDestroyVideoParser(ctx->cuparser);
+        ctx->cuparser = NULL;
+    }
+
+    ret = CHECK_CU(ctx->cvdl->cuvidCreateVideoParser(&ctx->cuparser, &ctx->cuparseinfo));
+    if (ret < 0)
+        goto error;
+
+    seq_pkt.payload = ctx->cuparse_ext.raw_seqhdr_data;
+    seq_pkt.payload_size = ctx->cuparse_ext.format.seqhdr_data_length;
+
+    if (seq_pkt.payload && seq_pkt.payload_size) {
+        ret = CHECK_CU(ctx->cvdl->cuvidParseVideoData(ctx->cuparser, &seq_pkt));
+        if (ret < 0)
+            goto error;
+    }
+
+    ret = CHECK_CU(ctx->cudl->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto error;
+
+    ctx->prev_pts = INT64_MIN;
+    ctx->decoder_flushing = 0;
+
+    return;
+ error:
+    av_log(avctx, AV_LOG_ERROR, "CUDA reinit on flush failed\n");
+}
+
+#define OFFSET(x) offsetof(CuvidContext, x)
+#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "deint",    "Set deinterlacing mode", OFFSET(deint_mode), AV_OPT_TYPE_INT,   { .i64 = cudaVideoDeinterlaceMode_Weave    }, cudaVideoDeinterlaceMode_Weave, cudaVideoDeinterlaceMode_Adaptive, VD, "deint" },
+    { "weave",    "Weave deinterlacing (do nothing)",        0, AV_OPT_TYPE_CONST, { .i64 = cudaVideoDeinterlaceMode_Weave    }, 0, 0, VD, "deint" },
+    { "bob",      "Bob deinterlacing",                       0, AV_OPT_TYPE_CONST, { .i64 = cudaVideoDeinterlaceMode_Bob      }, 0, 0, VD, "deint" },
+    { "adaptive", "Adaptive deinterlacing",                  0, AV_OPT_TYPE_CONST, { .i64 = cudaVideoDeinterlaceMode_Adaptive }, 0, 0, VD, "deint" },
+    { "gpu",      "GPU to be used for decoding", OFFSET(cu_gpu), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VD },
+    { "surfaces", "Maximum surfaces to be used for decoding", OFFSET(nb_surfaces), AV_OPT_TYPE_INT, { .i64 = 25 }, 0, INT_MAX, VD },
+    { "drop_second_field", "Drop second field when deinterlacing", OFFSET(drop_second_field), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VD },
+    { "crop",     "Crop (top)x(bottom)x(left)x(right)", OFFSET(crop_expr), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VD },
+    { "resize",   "Resize (width)x(height)", OFFSET(resize_expr), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VD },
+    { NULL }
+};
+
+static const AVCodecHWConfigInternal *cuvid_hw_configs[] = {
+    &(const AVCodecHWConfigInternal) {
+        .public = {
+            .pix_fmt     = AV_PIX_FMT_CUDA,
+            .methods     = AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX |
+                           AV_CODEC_HW_CONFIG_METHOD_INTERNAL,
+            .device_type = AV_HWDEVICE_TYPE_CUDA
+        },
+        .hwaccel = NULL,
+    },
+    NULL
+};
+
+#define DEFINE_CUVID_CODEC(x, X) \
+    static const AVClass x##_cuvid_class = { \
+        .class_name = #x "_cuvid", \
+        .item_name = av_default_item_name, \
+        .option = options, \
+        .version = LIBAVUTIL_VERSION_INT, \
+    }; \
+    AVCodec ff_##x##_cuvid_decoder = { \
+        .name           = #x "_cuvid", \
+        .long_name      = NULL_IF_CONFIG_SMALL("Nvidia CUVID " #X " decoder"), \
+        .type           = AVMEDIA_TYPE_VIDEO, \
+        .id             = AV_CODEC_ID_##X, \
+        .priv_data_size = sizeof(CuvidContext), \
+        .priv_class     = &x##_cuvid_class, \
+        .init           = cuvid_decode_init, \
+        .close          = cuvid_decode_end, \
+        .decode         = cuvid_decode_frame, \
+        .receive_frame  = cuvid_output_frame, \
+        .flush          = cuvid_flush, \
+        .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HARDWARE, \
+        .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, \
+                                                        AV_PIX_FMT_NV12, \
+                                                        AV_PIX_FMT_P010, \
+                                                        AV_PIX_FMT_P016, \
+                                                        AV_PIX_FMT_NONE }, \
+        .hw_configs     = cuvid_hw_configs, \
+        .wrapper_name   = "cuvid", \
+    };
+
+#if CONFIG_HEVC_CUVID_DECODER
+DEFINE_CUVID_CODEC(hevc, HEVC)
+#endif
+
+#if CONFIG_H264_CUVID_DECODER
+DEFINE_CUVID_CODEC(h264, H264)
+#endif
+
+#if CONFIG_MJPEG_CUVID_DECODER
+DEFINE_CUVID_CODEC(mjpeg, MJPEG)
+#endif
+
+#if CONFIG_MPEG1_CUVID_DECODER
+DEFINE_CUVID_CODEC(mpeg1, MPEG1VIDEO)
+#endif
+
+#if CONFIG_MPEG2_CUVID_DECODER
+DEFINE_CUVID_CODEC(mpeg2, MPEG2VIDEO)
+#endif
+
+#if CONFIG_MPEG4_CUVID_DECODER
+DEFINE_CUVID_CODEC(mpeg4, MPEG4)
+#endif
+
+#if CONFIG_VP8_CUVID_DECODER
+DEFINE_CUVID_CODEC(vp8, VP8)
+#endif
+
+#if CONFIG_VP9_CUVID_DECODER
+DEFINE_CUVID_CODEC(vp9, VP9)
+#endif
+
+#if CONFIG_VC1_CUVID_DECODER
+DEFINE_CUVID_CODEC(vc1, VC1)
+#endif
diff --git a/libavcodec/cyuv.c b/libavcodec/cyuv.c
index 2c4f98d..f2b0a7c 100644
--- a/libavcodec/cyuv.c
+++ b/libavcodec/cyuv.c
@@ -6,20 +6,20 @@
  *
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,7 +52,6 @@ static av_cold int cyuv_decode_init(AVCodecContext *avctx)
     if (s->width & 0x3)
         return AVERROR_INVALIDDATA;
     s->height = avctx->height;
-    avctx->pix_fmt = AV_PIX_FMT_YUV411P;
 
     return 0;
 }
@@ -82,6 +81,7 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
     int stream_ptr;
     unsigned char cur_byte;
     int pixel_groups;
+    int rawsize = s->height * FFALIGN(s->width,2) * 2;
     int ret;
 
     if (avctx->codec_id == AV_CODEC_ID_AURA) {
@@ -92,7 +92,11 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
      * followed by (height) lines each with 3 bytes to represent groups
      * of 4 pixels. Thus, the total size of the buffer ought to be:
      *    (3 * 16) + height * (width * 3 / 4) */
-    if (buf_size != 48 + s->height * (s->width * 3 / 4)) {
+    if (buf_size == 48 + s->height * (s->width * 3 / 4)) {
+        avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+    } else if(buf_size == rawsize ) {
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+    } else {
         av_log(avctx, AV_LOG_ERROR, "got a buffer with %d bytes when %d were expected\n",
                buf_size, 48 + s->height * (s->width * 3 / 4));
         return AVERROR_INVALIDDATA;
@@ -101,15 +105,22 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
     /* pixel data starts 48 bytes in, after 3x16-byte tables */
     stream_ptr = 48;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     y_plane = frame->data[0];
     u_plane = frame->data[1];
     v_plane = frame->data[2];
 
+    if (buf_size == rawsize) {
+        int linesize = FFALIGN(s->width,2) * 2;
+        y_plane += frame->linesize[0] * s->height;
+        for (stream_ptr = 0; stream_ptr < rawsize; stream_ptr += linesize) {
+            y_plane -= frame->linesize[0];
+            memcpy(y_plane, buf+stream_ptr, linesize);
+        }
+    } else {
+
     /* iterate through each line in the height */
     for (y_ptr = 0, u_ptr = 0, v_ptr = 0;
          y_ptr < (s->height * frame->linesize[0]);
@@ -157,6 +168,7 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
 
         }
     }
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/d3d11va.c b/libavcodec/d3d11va.c
index 946de06..9967f32 100644
--- a/libavcodec/d3d11va.c
+++ b/libavcodec/d3d11va.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2015 Steve Lhomme
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/d3d11va.h b/libavcodec/d3d11va.h
index 9264ec6..6816b6c 100644
--- a/libavcodec/d3d11va.h
+++ b/libavcodec/d3d11va.h
@@ -4,20 +4,20 @@
  * copyright (c) 2009 Laurent Aimar
  * copyright (c) 2015 Steve Lhomme
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 
 /**
  * This structure is used to provides the necessary configurations and data
- * to the Direct3D11 Libav HWAccel implementation.
+ * to the Direct3D11 FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  *
@@ -88,7 +88,7 @@ typedef struct AVD3D11VAContext {
     uint64_t workaround;
 
     /**
-     * Private to the Libav AVHWAccel implementation
+     * Private to the FFmpeg AVHWAccel implementation
      */
     unsigned report_id;
 
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index c5daf07..a0729e6 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -1,20 +1,24 @@
 /*
  * DCA compatible decoder data
+ * Copyright (C) 2004 Gildas Bazin
+ * Copyright (C) 2004 Benjamin Zores
+ * Copyright (C) 2006 Benjamin Larsson
+ * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +28,9 @@
 #include "libavutil/error.h"
 
 #include "dca.h"
+#include "dca_core.h"
 #include "dca_syncwords.h"
+#include "get_bits.h"
 #include "put_bits.h"
 
 const uint32_t avpriv_dca_sample_rates[16] = {
@@ -32,7 +38,20 @@ const uint32_t avpriv_dca_sample_rates[16] = {
     12000, 24000, 48000, 96000, 192000
 };
 
-int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
+const uint32_t ff_dca_sampling_freqs[16] = {
+      8000,  16000, 32000, 64000, 128000, 22050,  44100,  88200,
+    176400, 352800, 12000, 24000,  48000, 96000, 192000, 384000,
+};
+
+const uint8_t ff_dca_freq_ranges[16] = {
+    0, 1, 2, 3, 4, 1, 2, 3, 4, 4, 0, 1, 2, 3, 4, 4
+};
+
+const uint8_t ff_dca_bits_per_sample[8] = {
+    16, 16, 20, 20, 0, 24, 24, 0
+};
+
+int avpriv_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
                              int max_size)
 {
     uint32_t mrk;
@@ -45,6 +64,7 @@ int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
     mrk = AV_RB32(src);
     switch (mrk) {
     case DCA_SYNCWORD_CORE_BE:
+    case DCA_SYNCWORD_SUBSTREAM:
         memcpy(dst, src, src_size);
         return src_size;
     case DCA_SYNCWORD_CORE_LE:
@@ -67,3 +87,76 @@ int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
         return AVERROR_INVALIDDATA;
     }
 }
+
+int ff_dca_parse_core_frame_header(DCACoreFrameHeader *h, GetBitContext *gb)
+{
+    if (get_bits_long(gb, 32) != DCA_SYNCWORD_CORE_BE)
+        return DCA_PARSE_ERROR_SYNC_WORD;
+
+    h->normal_frame = get_bits1(gb);
+    h->deficit_samples = get_bits(gb, 5) + 1;
+    if (h->deficit_samples != DCA_PCMBLOCK_SAMPLES)
+        return DCA_PARSE_ERROR_DEFICIT_SAMPLES;
+
+    h->crc_present = get_bits1(gb);
+    h->npcmblocks = get_bits(gb, 7) + 1;
+    if (h->npcmblocks & (DCA_SUBBAND_SAMPLES - 1))
+        return DCA_PARSE_ERROR_PCM_BLOCKS;
+
+    h->frame_size = get_bits(gb, 14) + 1;
+    if (h->frame_size < 96)
+        return DCA_PARSE_ERROR_FRAME_SIZE;
+
+    h->audio_mode = get_bits(gb, 6);
+    if (h->audio_mode >= DCA_AMODE_COUNT)
+        return DCA_PARSE_ERROR_AMODE;
+
+    h->sr_code = get_bits(gb, 4);
+    if (!avpriv_dca_sample_rates[h->sr_code])
+        return DCA_PARSE_ERROR_SAMPLE_RATE;
+
+    h->br_code = get_bits(gb, 5);
+    if (get_bits1(gb))
+        return DCA_PARSE_ERROR_RESERVED_BIT;
+
+    h->drc_present = get_bits1(gb);
+    h->ts_present = get_bits1(gb);
+    h->aux_present = get_bits1(gb);
+    h->hdcd_master = get_bits1(gb);
+    h->ext_audio_type = get_bits(gb, 3);
+    h->ext_audio_present = get_bits1(gb);
+    h->sync_ssf = get_bits1(gb);
+    h->lfe_present = get_bits(gb, 2);
+    if (h->lfe_present == DCA_LFE_FLAG_INVALID)
+        return DCA_PARSE_ERROR_LFE_FLAG;
+
+    h->predictor_history = get_bits1(gb);
+    if (h->crc_present)
+        skip_bits(gb, 16);
+    h->filter_perfect = get_bits1(gb);
+    h->encoder_rev = get_bits(gb, 4);
+    h->copy_hist = get_bits(gb, 2);
+    h->pcmr_code = get_bits(gb, 3);
+    if (!ff_dca_bits_per_sample[h->pcmr_code])
+        return DCA_PARSE_ERROR_PCM_RES;
+
+    h->sumdiff_front = get_bits1(gb);
+    h->sumdiff_surround = get_bits1(gb);
+    h->dn_code = get_bits(gb, 4);
+    return 0;
+}
+
+int avpriv_dca_parse_core_frame_header(DCACoreFrameHeader *h, const uint8_t *buf, int size)
+{
+    GetBitContext gb;
+    int ret;
+
+    ret = init_get_bits8(&gb, buf, size);
+    if (ret < 0)
+        return ret;
+
+    if (ff_dca_parse_core_frame_header(h, &gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
diff --git a/libavcodec/dca.h b/libavcodec/dca.h
index b2f5cba..e96c589 100644
--- a/libavcodec/dca.h
+++ b/libavcodec/dca.h
@@ -4,21 +4,22 @@
  * Copyright (C) 2004 Benjamin Zores
  * Copyright (C) 2006 Benjamin Larsson
  * Copyright (C) 2007 Konstantin Shishkov
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,279 +28,201 @@
 
 #include <stdint.h>
 
-#include "libavutil/float_dsp.h"
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
-#include "avcodec.h"
-#include "dcadsp.h"
-#include "fmtconvert.h"
 #include "get_bits.h"
 #include "internal.h"
 
-#define DCA_PRIM_CHANNELS_MAX  (7)
-#define DCA_ABITS_MAX         (32)      /* Should be 28 */
-#define DCA_SUBSUBFRAMES_MAX   (4)
-#define DCA_SUBFRAMES_MAX     (16)
-#define DCA_BLOCKS_MAX        (16)
-#define DCA_LFE_MAX            (3)
+#define DCA_CORE_FRAME_HEADER_SIZE      18
+
+enum DCAParseError {
+    DCA_PARSE_ERROR_SYNC_WORD       = -1,
+    DCA_PARSE_ERROR_DEFICIT_SAMPLES = -2,
+    DCA_PARSE_ERROR_PCM_BLOCKS      = -3,
+    DCA_PARSE_ERROR_FRAME_SIZE      = -4,
+    DCA_PARSE_ERROR_AMODE           = -5,
+    DCA_PARSE_ERROR_SAMPLE_RATE     = -6,
+    DCA_PARSE_ERROR_RESERVED_BIT    = -7,
+    DCA_PARSE_ERROR_LFE_FLAG        = -8,
+    DCA_PARSE_ERROR_PCM_RES         = -9,
+};
 
-#define DCA_PRIM_CHANNELS_MAX  (7)
-#define DCA_ABITS_MAX         (32)      /* Should be 28 */
-#define DCA_SUBSUBFRAMES_MAX   (4)
-#define DCA_SUBFRAMES_MAX     (16)
-#define DCA_BLOCKS_MAX        (16)
-#define DCA_LFE_MAX            (3)
-#define DCA_XLL_FBANDS_MAX     (4)
-#define DCA_XLL_SEGMENTS_MAX  (16)
-#define DCA_XLL_CHSETS_MAX    (16)
-#define DCA_XLL_CHANNELS_MAX  (16)
-#define DCA_XLL_AORDER_MAX    (15)
+typedef struct DCACoreFrameHeader {
+    uint8_t     normal_frame;       ///< Frame type
+    uint8_t     deficit_samples;    ///< Deficit sample count
+    uint8_t     crc_present;        ///< CRC present flag
+    uint8_t     npcmblocks;         ///< Number of PCM sample blocks
+    uint16_t    frame_size;         ///< Primary frame byte size
+    uint8_t     audio_mode;         ///< Audio channel arrangement
+    uint8_t     sr_code;            ///< Core audio sampling frequency
+    uint8_t     br_code;            ///< Transmission bit rate
+    uint8_t     drc_present;        ///< Embedded dynamic range flag
+    uint8_t     ts_present;         ///< Embedded time stamp flag
+    uint8_t     aux_present;        ///< Auxiliary data flag
+    uint8_t     hdcd_master;        ///< HDCD mastering flag
+    uint8_t     ext_audio_type;     ///< Extension audio descriptor flag
+    uint8_t     ext_audio_present;  ///< Extended coding flag
+    uint8_t     sync_ssf;           ///< Audio sync word insertion flag
+    uint8_t     lfe_present;        ///< Low frequency effects flag
+    uint8_t     predictor_history;  ///< Predictor history flag switch
+    uint8_t     filter_perfect;     ///< Multirate interpolator switch
+    uint8_t     encoder_rev;        ///< Encoder software revision
+    uint8_t     copy_hist;          ///< Copy history
+    uint8_t     pcmr_code;          ///< Source PCM resolution
+    uint8_t     sumdiff_front;      ///< Front sum/difference flag
+    uint8_t     sumdiff_surround;   ///< Surround sum/difference flag
+    uint8_t     dn_code;            ///< Dialog normalization / unspecified
+} DCACoreFrameHeader;
+
+enum DCASpeaker {
+    DCA_SPEAKER_C,    DCA_SPEAKER_L,    DCA_SPEAKER_R,    DCA_SPEAKER_Ls,
+    DCA_SPEAKER_Rs,   DCA_SPEAKER_LFE1, DCA_SPEAKER_Cs,   DCA_SPEAKER_Lsr,
+    DCA_SPEAKER_Rsr,  DCA_SPEAKER_Lss,  DCA_SPEAKER_Rss,  DCA_SPEAKER_Lc,
+    DCA_SPEAKER_Rc,   DCA_SPEAKER_Lh,   DCA_SPEAKER_Ch,   DCA_SPEAKER_Rh,
+    DCA_SPEAKER_LFE2, DCA_SPEAKER_Lw,   DCA_SPEAKER_Rw,   DCA_SPEAKER_Oh,
+    DCA_SPEAKER_Lhs,  DCA_SPEAKER_Rhs,  DCA_SPEAKER_Chr,  DCA_SPEAKER_Lhr,
+    DCA_SPEAKER_Rhr,  DCA_SPEAKER_Cl,   DCA_SPEAKER_Ll,   DCA_SPEAKER_Rl,
+    DCA_SPEAKER_RSV1, DCA_SPEAKER_RSV2, DCA_SPEAKER_RSV3, DCA_SPEAKER_RSV4,
+
+    DCA_SPEAKER_COUNT
+};
 
-/* Arbitrary limit; not sure what the maximum really is, but much larger. */
-#define DCA_XLL_DMIX_NCOEFFS_MAX (18)
+enum DCASpeakerMask {
+    DCA_SPEAKER_MASK_C     = 0x00000001,
+    DCA_SPEAKER_MASK_L     = 0x00000002,
+    DCA_SPEAKER_MASK_R     = 0x00000004,
+    DCA_SPEAKER_MASK_Ls    = 0x00000008,
+    DCA_SPEAKER_MASK_Rs    = 0x00000010,
+    DCA_SPEAKER_MASK_LFE1  = 0x00000020,
+    DCA_SPEAKER_MASK_Cs    = 0x00000040,
+    DCA_SPEAKER_MASK_Lsr   = 0x00000080,
+    DCA_SPEAKER_MASK_Rsr   = 0x00000100,
+    DCA_SPEAKER_MASK_Lss   = 0x00000200,
+    DCA_SPEAKER_MASK_Rss   = 0x00000400,
+    DCA_SPEAKER_MASK_Lc    = 0x00000800,
+    DCA_SPEAKER_MASK_Rc    = 0x00001000,
+    DCA_SPEAKER_MASK_Lh    = 0x00002000,
+    DCA_SPEAKER_MASK_Ch    = 0x00004000,
+    DCA_SPEAKER_MASK_Rh    = 0x00008000,
+    DCA_SPEAKER_MASK_LFE2  = 0x00010000,
+    DCA_SPEAKER_MASK_Lw    = 0x00020000,
+    DCA_SPEAKER_MASK_Rw    = 0x00040000,
+    DCA_SPEAKER_MASK_Oh    = 0x00080000,
+    DCA_SPEAKER_MASK_Lhs   = 0x00100000,
+    DCA_SPEAKER_MASK_Rhs   = 0x00200000,
+    DCA_SPEAKER_MASK_Chr   = 0x00400000,
+    DCA_SPEAKER_MASK_Lhr   = 0x00800000,
+    DCA_SPEAKER_MASK_Rhr   = 0x01000000,
+    DCA_SPEAKER_MASK_Cl    = 0x02000000,
+    DCA_SPEAKER_MASK_Ll    = 0x04000000,
+    DCA_SPEAKER_MASK_Rl    = 0x08000000,
+};
 
-#define DCA_MAX_FRAME_SIZE       16384
-#define DCA_MAX_EXSS_HEADER_SIZE  4096
+#define DCA_SPEAKER_LAYOUT_MONO         (DCA_SPEAKER_MASK_C)
+#define DCA_SPEAKER_LAYOUT_STEREO       (DCA_SPEAKER_MASK_L | DCA_SPEAKER_MASK_R)
+#define DCA_SPEAKER_LAYOUT_2POINT1      (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_LFE1)
+#define DCA_SPEAKER_LAYOUT_3_0          (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_C)
+#define DCA_SPEAKER_LAYOUT_2_1          (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_Cs)
+#define DCA_SPEAKER_LAYOUT_3_1          (DCA_SPEAKER_LAYOUT_3_0 | DCA_SPEAKER_MASK_Cs)
+#define DCA_SPEAKER_LAYOUT_2_2          (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_Ls | DCA_SPEAKER_MASK_Rs)
+#define DCA_SPEAKER_LAYOUT_5POINT0      (DCA_SPEAKER_LAYOUT_3_0 | DCA_SPEAKER_MASK_Ls | DCA_SPEAKER_MASK_Rs)
+#define DCA_SPEAKER_LAYOUT_5POINT1      (DCA_SPEAKER_LAYOUT_5POINT0 | DCA_SPEAKER_MASK_LFE1)
+#define DCA_SPEAKER_LAYOUT_7POINT0_WIDE (DCA_SPEAKER_LAYOUT_5POINT0 | DCA_SPEAKER_MASK_Lw | DCA_SPEAKER_MASK_Rw)
+#define DCA_SPEAKER_LAYOUT_7POINT1_WIDE (DCA_SPEAKER_LAYOUT_7POINT0_WIDE | DCA_SPEAKER_MASK_LFE1)
+
+#define DCA_HAS_STEREO(mask) \
+    ((mask & DCA_SPEAKER_LAYOUT_STEREO) == DCA_SPEAKER_LAYOUT_STEREO)
+
+enum DCASpeakerPair {
+    DCA_SPEAKER_PAIR_C      = 0x0001,
+    DCA_SPEAKER_PAIR_LR     = 0x0002,
+    DCA_SPEAKER_PAIR_LsRs   = 0x0004,
+    DCA_SPEAKER_PAIR_LFE1   = 0x0008,
+    DCA_SPEAKER_PAIR_Cs     = 0x0010,
+    DCA_SPEAKER_PAIR_LhRh   = 0x0020,
+    DCA_SPEAKER_PAIR_LsrRsr = 0x0040,
+    DCA_SPEAKER_PAIR_Ch     = 0x0080,
+    DCA_SPEAKER_PAIR_Oh     = 0x0100,
+    DCA_SPEAKER_PAIR_LcRc   = 0x0200,
+    DCA_SPEAKER_PAIR_LwRw   = 0x0400,
+    DCA_SPEAKER_PAIR_LssRss = 0x0800,
+    DCA_SPEAKER_PAIR_LFE2   = 0x1000,
+    DCA_SPEAKER_PAIR_LhsRhs = 0x2000,
+    DCA_SPEAKER_PAIR_Chr    = 0x4000,
+    DCA_SPEAKER_PAIR_LhrRhr = 0x8000
+};
 
-#define DCA_BUFFER_PADDING_SIZE   1024
+/**
+ * Return number of individual channels in DCASpeakerPair mask
+ */
+static inline int ff_dca_count_chs_for_mask(unsigned int mask)
+{
+    return av_popcount((mask & 0xffff) | ((mask & 0xae66) << 16));
+}
+
+enum DCARepresentationType {
+    DCA_REPR_TYPE_LtRt = 2,
+    DCA_REPR_TYPE_LhRh = 3
+};
 
 enum DCAExtensionMask {
-    DCA_EXT_CORE       = 0x001, ///< core in core substream
-    DCA_EXT_XXCH       = 0x002, ///< XXCh channels extension in core substream
-    DCA_EXT_X96        = 0x004, ///< 96/24 extension in core substream
-    DCA_EXT_XCH        = 0x008, ///< XCh channel extension in core substream
-    DCA_EXT_EXSS_CORE  = 0x010, ///< core in ExSS (extension substream)
-    DCA_EXT_EXSS_XBR   = 0x020, ///< extended bitrate extension in ExSS
-    DCA_EXT_EXSS_XXCH  = 0x040, ///< XXCh channels extension in ExSS
-    DCA_EXT_EXSS_X96   = 0x080, ///< 96/24 extension in ExSS
-    DCA_EXT_EXSS_LBR   = 0x100, ///< low bitrate component in ExSS
-    DCA_EXT_EXSS_XLL   = 0x200, ///< lossless extension in ExSS
+    DCA_CSS_CORE   = 0x001,
+    DCA_CSS_XXCH   = 0x002,
+    DCA_CSS_X96    = 0x004,
+    DCA_CSS_XCH    = 0x008,
+    DCA_CSS_MASK   = 0x00f,
+    DCA_EXSS_CORE  = 0x010,
+    DCA_EXSS_XBR   = 0x020,
+    DCA_EXSS_XXCH  = 0x040,
+    DCA_EXSS_X96   = 0x080,
+    DCA_EXSS_LBR   = 0x100,
+    DCA_EXSS_XLL   = 0x200,
+    DCA_EXSS_RSV1  = 0x400,
+    DCA_EXSS_RSV2  = 0x800,
+    DCA_EXSS_MASK  = 0xff0,
 };
 
-typedef struct XllChSetSubHeader {
-    int channels;               ///< number of channels in channel set, at most 16
-    int residual_encode;        ///< residual channel encoding
-    int bit_resolution;         ///< input sample bit-width
-    int bit_width;              ///< original input sample bit-width
-    int sampling_frequency;     ///< sampling frequency
-    int samp_freq_interp;       ///< sampling frequency interpolation multiplier
-    int replacement_set;        ///< replacement channel set group
-    int active_replace_set;     ///< current channel set is active channel set
-    int primary_ch_set;
-    int downmix_coeff_code_embedded;
-    int downmix_embedded;
-    int downmix_type;
-    int hier_chset;             ///< hierarchical channel set
-    int downmix_ncoeffs;
-    int downmix_coeffs[DCA_XLL_DMIX_NCOEFFS_MAX];
-    int ch_mask_enabled;
-    int ch_mask;
-    int mapping_coeffs_present;
-    int num_freq_bands;
-
-    /* m_nOrigChanOrder */
-    uint8_t orig_chan_order[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    uint8_t orig_chan_order_inv[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    /* Coefficients for channel pairs (at most 8), m_anPWChPairsCoeffs */
-    int8_t pw_ch_pairs_coeffs[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX/2];
-    /* m_nCurrHighestLPCOrder */
-    uint8_t adapt_order_max[DCA_XLL_FBANDS_MAX];
-    /* m_pnAdaptPredOrder */
-    uint8_t adapt_order[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    /* m_pnFixedPredOrder */
-    uint8_t fixed_order[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    /* m_pnLPCReflCoeffsQInd, unsigned version */
-    uint8_t lpc_refl_coeffs_q_ind[DCA_XLL_FBANDS_MAX]
-                                 [DCA_XLL_CHANNELS_MAX][DCA_XLL_AORDER_MAX];
-
-    int lsb_fsize[DCA_XLL_FBANDS_MAX];
-    int8_t scalable_lsbs[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    int8_t bit_width_adj_per_ch[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-} XllChSetSubHeader;
-
-typedef struct XllNavi {
-    GetBitContext gb;  // Context for parsing the data segments
-    unsigned band_size[DCA_XLL_FBANDS_MAX];
-    unsigned segment_size[DCA_XLL_FBANDS_MAX][DCA_XLL_SEGMENTS_MAX];
-    unsigned chset_size[DCA_XLL_FBANDS_MAX][DCA_XLL_SEGMENTS_MAX][DCA_XLL_CHSETS_MAX];
-} XllNavi;
-
-typedef struct QMF64_table {
-    float dct4_coeff[32][32];
-    float dct2_coeff[32][32];
-    float rcos[32];
-    float rsin[32];
-} QMF64_table;
-
-/* Primary audio coding header */
-typedef struct DCAAudioHeader {
-    int subband_activity[DCA_PRIM_CHANNELS_MAX];    ///< subband activity count
-    int vq_start_subband[DCA_PRIM_CHANNELS_MAX];    ///< high frequency vq start subband
-    int joint_intensity[DCA_PRIM_CHANNELS_MAX];     ///< joint intensity coding index
-    int transient_huffman[DCA_PRIM_CHANNELS_MAX];   ///< transient mode code book
-    int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code book
-    int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX];    ///< bit allocation quantizer select
-    int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];  ///< quantization index codebook select
-    uint32_t scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///< scale factor adjustment
-
-    int subframes;              ///< number of subframes
-    int total_channels;         ///< number of channels including extensions
-    int prim_channels;          ///< number of primary audio channels
-} DCAAudioHeader;
-
-typedef struct DCAChan {
-    DECLARE_ALIGNED(32, int32_t, subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][SAMPLES_PER_SUBBAND];
-
-    /* Subband samples history (for ADPCM) */
-    DECLARE_ALIGNED(32, int32_t, subband_samples_hist)[DCA_SUBBANDS][4];
-    int hist_index;
-
-    /* Half size is sufficient for core decoding, but for 96 kHz data
-     * we need QMF with 64 subbands and 1024 samples. */
-    DECLARE_ALIGNED(32, float, subband_fir_hist)[1024];
-    DECLARE_ALIGNED(32, float, subband_fir_noidea)[64];
-
-    /* Primary audio coding side information */
-    int prediction_mode[DCA_SUBBANDS];    ///< prediction mode (ADPCM used or not)
-    int prediction_vq[DCA_SUBBANDS];      ///< prediction VQ coefs
-    int bitalloc[DCA_SUBBANDS];           ///< bit allocation index
-    int transition_mode[DCA_SUBBANDS];    ///< transition mode (transients)
-    int32_t scale_factor[DCA_SUBBANDS][2];///< scale factors (2 if transient)
-    int joint_huff;                       ///< joint subband scale factors codebook
-    int joint_scale_factor[DCA_SUBBANDS]; ///< joint subband scale factors
-
-    int32_t  high_freq_vq[DCA_SUBBANDS];  ///< VQ encoded high frequency subbands
-} DCAChan;
-
-
-typedef struct DCAContext {
-    AVClass *class;             ///< class for AVOptions
-    AVCodecContext *avctx;
-    /* Frame header */
-    int frame_type;             ///< type of the current frame
-    int samples_deficit;        ///< deficit sample count
-    int crc_present;            ///< crc is present in the bitstream
-    int sample_blocks;          ///< number of PCM sample blocks
-    int frame_size;             ///< primary frame byte size
-    int amode;                  ///< audio channels arrangement
-    int sample_rate;            ///< audio sampling rate
-    int bit_rate;               ///< transmission bit rate
-    int bit_rate_index;         ///< transmission bit rate index
-
-    int dynrange;               ///< embedded dynamic range flag
-    int timestamp;              ///< embedded time stamp flag
-    int aux_data;               ///< auxiliary data flag
-    int hdcd;                   ///< source material is mastered in HDCD
-    int ext_descr;              ///< extension audio descriptor flag
-    int ext_coding;             ///< extended coding flag
-    int aspf;                   ///< audio sync word insertion flag
-    int lfe;                    ///< low frequency effects flag
-    int predictor_history;      ///< predictor history flag
-    int header_crc;             ///< header crc check bytes
-    int multirate_inter;        ///< multirate interpolator switch
-    int version;                ///< encoder software revision
-    int copy_history;           ///< copy history
-    int source_pcm_res;         ///< source pcm resolution
-    int front_sum;              ///< front sum/difference flag
-    int surround_sum;           ///< surround sum/difference flag
-    int dialog_norm;            ///< dialog normalisation parameter
+enum DCADownMixType {
+    DCA_DMIX_TYPE_1_0,
+    DCA_DMIX_TYPE_LoRo,
+    DCA_DMIX_TYPE_LtRt,
+    DCA_DMIX_TYPE_3_0,
+    DCA_DMIX_TYPE_2_1,
+    DCA_DMIX_TYPE_2_2,
+    DCA_DMIX_TYPE_3_1,
 
-    /* Primary audio coding header */
-    DCAAudioHeader audio_header;
-
-    /* Primary audio coding side information */
-    int subsubframes[DCA_SUBFRAMES_MAX];                         ///< number of subsubframes
-    int partial_samples[DCA_SUBFRAMES_MAX];                      ///< partial subsubframe samples count
-    float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2];            ///< stereo downmix coefficients
-    int dynrange_coef;                                           ///< dynamic range coefficient
-
-    /* Core substream's embedded downmix coefficients (cf. ETSI TS 102 114 V1.4.1)
-     * Input:  primary audio channels (incl. LFE if present)
-     * Output: downmix audio channels (up to 4, no LFE) */
-    uint8_t  core_downmix;                                       ///< embedded downmix coefficients available
-    uint8_t  core_downmix_amode;                                 ///< audio channel arrangement of embedded downmix
-    uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4];   ///< embedded downmix coefficients (9-bit codes)
-
-
-    float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)];      ///< Low frequency effect data
-    int lfe_scale_factor;
-
-    /* Subband samples history (for ADPCM) */
-    DECLARE_ALIGNED(32, float, raXin)[32];
-
-    DCAChan dca_chan[DCA_PRIM_CHANNELS_MAX];
-
-    int output;                 ///< type of output
-
-    float *samples_chanptr[DCA_PRIM_CHANNELS_MAX + 1];
-    float *extra_channels[DCA_PRIM_CHANNELS_MAX + 1];
-    uint8_t *extra_channels_buffer;
-    unsigned int extra_channels_buffer_size;
-
-    uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
-    int dca_buffer_size;        ///< how much data is in the dca_buffer
-
-    const int8_t *channel_order_tab;  ///< channel reordering table, lfe and non lfe
-    GetBitContext gb;
-    /* Current position in DCA frame */
-    int current_subframe;
-    int current_subsubframe;
-
-    int core_ext_mask;          ///< present extensions in the core substream
-    int exss_ext_mask;          ///< Non-core extensions
-
-    /* XCh extension information */
-    int xch_present;            ///< XCh extension present and valid
-    int xch_base_channel;       ///< index of first (only) channel containing XCH data
-    int xch_disable;            ///< whether the XCh extension should be decoded or not
-
-    /* XLL extension information */
-    int xll_disable;
-    int xll_nch_sets;           ///< number of channel sets per frame
-    int xll_channels;           ///< total number of channels (in all channel sets)
-    int xll_residual_channels;  ///< number of residual channels
-    int xll_segments;           ///< number of segments per frame
-    int xll_log_smpl_in_seg;    ///< supposedly this is "nBits4SamplLoci"
-    int xll_smpl_in_seg;        ///< samples in segment per one frequency band for the first channel set
-    int xll_bits4seg_size;      ///< number of bits used to read segment size
-    int xll_banddata_crc;       ///< presence of CRC16 within each frequency band
-    int xll_scalable_lsb;
-    int xll_bits4ch_mask;       ///< channel position mask
-    int xll_fixed_lsb_width;
-    XllChSetSubHeader xll_chsets[DCA_XLL_CHSETS_MAX];
-    XllNavi xll_navi;
-    int *xll_sample_buf;
-    unsigned int xll_sample_buf_size;
-
-    /* ExSS header parser */
-    int static_fields;          ///< static fields present
-    int mix_metadata;           ///< mixing metadata present
-    int num_mix_configs;        ///< number of mix out configurations
-    int mix_config_num_ch[4];   ///< number of channels in each mix out configuration
+    DCA_DMIX_TYPE_COUNT
+};
 
-    int profile;
-    int one2one_map_chtospkr;
+extern av_export_avcodec const uint32_t avpriv_dca_sample_rates[16];
 
-    int debug_flag;             ///< used for suppressing repeated error messages output
-    AVFloatDSPContext fdsp;
-    FFTContext imdct;
-    SynthFilterContext synth;
-    DCADSPContext dcadsp;
-    QMF64_table *qmf64_table;
-    FmtConvertContext fmt_conv;
-} DCAContext;
+extern const uint32_t ff_dca_sampling_freqs[16];
+extern const uint8_t ff_dca_freq_ranges[16];
+extern const uint8_t ff_dca_bits_per_sample[8];
 
-extern av_export_avcodec const uint32_t avpriv_dca_sample_rates[16];
 
 /**
  * Convert bitstream to one representation based on sync marker
  */
-int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
-                             int max_size);
+int avpriv_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
+                                 int max_size);
 
-void ff_dca_exss_parse_header(DCAContext *s);
+/**
+ * Parse and validate core frame header
+ * @param[out] h    Pointer to struct where header info is written.
+ * @param[in]  buf  Pointer to the data buffer
+ * @param[in]  size Size of the data buffer
+ * @return 0 on success, negative AVERROR code on failure
+ */
+int avpriv_dca_parse_core_frame_header(DCACoreFrameHeader *h, const uint8_t *buf, int size);
 
-int ff_dca_xll_decode_header(DCAContext *s);
-int ff_dca_xll_decode_navi(DCAContext *s, int asset_end);
-int ff_dca_xll_decode_audio(DCAContext *s, AVFrame *frame);
+/**
+ * Parse and validate core frame header
+ * @param[out] h   Pointer to struct where header info is written.
+ * @param[in]  gbc BitContext containing the first 120 bits of the frame.
+ * @return 0 on success, negative DCA_PARSE_ERROR_ code on failure
+ */
+int ff_dca_parse_core_frame_header(DCACoreFrameHeader *h, GetBitContext *gb);
 
 #endif /* AVCODEC_DCA_H */
diff --git a/libavcodec/dca_core.c b/libavcodec/dca_core.c
new file mode 100644
index 0000000..accc5ef
--- /dev/null
+++ b/libavcodec/dca_core.c
@@ -0,0 +1,2446 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dcaadpcm.h"
+#include "dcadec.h"
+#include "dcadata.h"
+#include "dcahuff.h"
+#include "dcamath.h"
+#include "dca_syncwords.h"
+
+#if ARCH_ARM
+#include "arm/dca.h"
+#endif
+
+enum HeaderType {
+    HEADER_CORE,
+    HEADER_XCH,
+    HEADER_XXCH
+};
+
+static const int8_t prm_ch_to_spkr_map[DCA_AMODE_COUNT][5] = {
+    { DCA_SPEAKER_C,            -1,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_C, DCA_SPEAKER_L, DCA_SPEAKER_R ,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R, DCA_SPEAKER_Cs,             -1,             -1 },
+    { DCA_SPEAKER_C, DCA_SPEAKER_L, DCA_SPEAKER_R , DCA_SPEAKER_Cs,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R, DCA_SPEAKER_Ls, DCA_SPEAKER_Rs,             -1 },
+    { DCA_SPEAKER_C, DCA_SPEAKER_L, DCA_SPEAKER_R,  DCA_SPEAKER_Ls, DCA_SPEAKER_Rs }
+};
+
+static const uint8_t audio_mode_ch_mask[DCA_AMODE_COUNT] = {
+    DCA_SPEAKER_LAYOUT_MONO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_3_0,
+    DCA_SPEAKER_LAYOUT_2_1,
+    DCA_SPEAKER_LAYOUT_3_1,
+    DCA_SPEAKER_LAYOUT_2_2,
+    DCA_SPEAKER_LAYOUT_5POINT0
+};
+
+static const uint8_t block_code_nbits[7] = {
+    7, 10, 12, 13, 15, 17, 19
+};
+
+static int dca_get_vlc(GetBitContext *s, DCAVLC *v, int i)
+{
+    return get_vlc2(s, v->vlc[i].table, v->vlc[i].bits, v->max_depth) + v->offset;
+}
+
+static void get_array(GetBitContext *s, int32_t *array, int size, int n)
+{
+    int i;
+
+    for (i = 0; i < size; i++)
+        array[i] = get_sbits(s, n);
+}
+
+// 5.3.1 - Bit stream header
+static int parse_frame_header(DCACoreDecoder *s)
+{
+    DCACoreFrameHeader h = { 0 };
+    int err = ff_dca_parse_core_frame_header(&h, &s->gb);
+
+    if (err < 0) {
+        switch (err) {
+        case DCA_PARSE_ERROR_DEFICIT_SAMPLES:
+            av_log(s->avctx, AV_LOG_ERROR, "Deficit samples are not supported\n");
+            return h.normal_frame ? AVERROR_INVALIDDATA : AVERROR_PATCHWELCOME;
+
+        case DCA_PARSE_ERROR_PCM_BLOCKS:
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported number of PCM sample blocks (%d)\n", h.npcmblocks);
+            return (h.npcmblocks < 6 || h.normal_frame) ? AVERROR_INVALIDDATA : AVERROR_PATCHWELCOME;
+
+        case DCA_PARSE_ERROR_FRAME_SIZE:
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid core frame size (%d bytes)\n", h.frame_size);
+            return AVERROR_INVALIDDATA;
+
+        case DCA_PARSE_ERROR_AMODE:
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported audio channel arrangement (%d)\n", h.audio_mode);
+            return AVERROR_PATCHWELCOME;
+
+        case DCA_PARSE_ERROR_SAMPLE_RATE:
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid core audio sampling frequency\n");
+            return AVERROR_INVALIDDATA;
+
+        case DCA_PARSE_ERROR_RESERVED_BIT:
+            av_log(s->avctx, AV_LOG_ERROR, "Reserved bit set\n");
+            return AVERROR_INVALIDDATA;
+
+        case DCA_PARSE_ERROR_LFE_FLAG:
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid low frequency effects flag\n");
+            return AVERROR_INVALIDDATA;
+
+        case DCA_PARSE_ERROR_PCM_RES:
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid source PCM resolution\n");
+            return AVERROR_INVALIDDATA;
+
+        default:
+            av_log(s->avctx, AV_LOG_ERROR, "Unknown core frame header error\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    s->crc_present          = h.crc_present;
+    s->npcmblocks           = h.npcmblocks;
+    s->frame_size           = h.frame_size;
+    s->audio_mode           = h.audio_mode;
+    s->sample_rate          = avpriv_dca_sample_rates[h.sr_code];
+    s->bit_rate             = ff_dca_bit_rates[h.br_code];
+    s->drc_present          = h.drc_present;
+    s->ts_present           = h.ts_present;
+    s->aux_present          = h.aux_present;
+    s->ext_audio_type       = h.ext_audio_type;
+    s->ext_audio_present    = h.ext_audio_present;
+    s->sync_ssf             = h.sync_ssf;
+    s->lfe_present          = h.lfe_present;
+    s->predictor_history    = h.predictor_history;
+    s->filter_perfect       = h.filter_perfect;
+    s->source_pcm_res       = ff_dca_bits_per_sample[h.pcmr_code];
+    s->es_format            = h.pcmr_code & 1;
+    s->sumdiff_front        = h.sumdiff_front;
+    s->sumdiff_surround     = h.sumdiff_surround;
+
+    return 0;
+}
+
+// 5.3.2 - Primary audio coding header
+static int parse_coding_header(DCACoreDecoder *s, enum HeaderType header, int xch_base)
+{
+    int n, ch, nchannels, header_size = 0, header_pos = get_bits_count(&s->gb);
+    unsigned int mask, index;
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    switch (header) {
+    case HEADER_CORE:
+        // Number of subframes
+        s->nsubframes = get_bits(&s->gb, 4) + 1;
+
+        // Number of primary audio channels
+        s->nchannels = get_bits(&s->gb, 3) + 1;
+        if (s->nchannels != ff_dca_channels[s->audio_mode]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of primary audio channels (%d) for audio channel arrangement (%d)\n", s->nchannels, s->audio_mode);
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert1(s->nchannels <= DCA_CHANNELS - 2);
+
+        s->ch_mask = audio_mode_ch_mask[s->audio_mode];
+
+        // Add LFE channel if present
+        if (s->lfe_present)
+            s->ch_mask |= DCA_SPEAKER_MASK_LFE1;
+        break;
+
+    case HEADER_XCH:
+        s->nchannels = ff_dca_channels[s->audio_mode] + 1;
+        av_assert1(s->nchannels <= DCA_CHANNELS - 1);
+        s->ch_mask |= DCA_SPEAKER_MASK_Cs;
+        break;
+
+    case HEADER_XXCH:
+        // Channel set header length
+        header_size = get_bits(&s->gb, 7) + 1;
+
+        // Check CRC
+        if (s->xxch_crc_present
+            && ff_dca_check_crc(s->avctx, &s->gb, header_pos, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH channel set header checksum\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Number of channels in a channel set
+        nchannels = get_bits(&s->gb, 3) + 1;
+        if (nchannels > DCA_XXCH_CHANNELS_MAX) {
+            avpriv_request_sample(s->avctx, "%d XXCH channels", nchannels);
+            return AVERROR_PATCHWELCOME;
+        }
+        s->nchannels = ff_dca_channels[s->audio_mode] + nchannels;
+        av_assert1(s->nchannels <= DCA_CHANNELS);
+
+        // Loudspeaker layout mask
+        mask = get_bits_long(&s->gb, s->xxch_mask_nbits - DCA_SPEAKER_Cs);
+        s->xxch_spkr_mask = mask << DCA_SPEAKER_Cs;
+
+        if (av_popcount(s->xxch_spkr_mask) != nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH speaker layout mask (%#x)\n", s->xxch_spkr_mask);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (s->xxch_core_mask & s->xxch_spkr_mask) {
+            av_log(s->avctx, AV_LOG_ERROR, "XXCH speaker layout mask (%#x) overlaps with core (%#x)\n", s->xxch_spkr_mask, s->xxch_core_mask);
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Combine core and XXCH masks together
+        s->ch_mask = s->xxch_core_mask | s->xxch_spkr_mask;
+
+        // Downmix coefficients present in stream
+        if (get_bits1(&s->gb)) {
+            int *coeff_ptr = s->xxch_dmix_coeff;
+
+            // Downmix already performed by encoder
+            s->xxch_dmix_embedded = get_bits1(&s->gb);
+
+            // Downmix scale factor
+            index = get_bits(&s->gb, 6) * 4 - FF_DCA_DMIXTABLE_OFFSET - 3;
+            if (index >= FF_DCA_INV_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH downmix scale index (%d)\n", index);
+                return AVERROR_INVALIDDATA;
+            }
+            s->xxch_dmix_scale_inv = ff_dca_inv_dmixtable[index];
+
+            // Downmix channel mapping mask
+            for (ch = 0; ch < nchannels; ch++) {
+                mask = get_bits_long(&s->gb, s->xxch_mask_nbits);
+                if ((mask & s->xxch_core_mask) != mask) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH downmix channel mapping mask (%#x)\n", mask);
+                    return AVERROR_INVALIDDATA;
+                }
+                s->xxch_dmix_mask[ch] = mask;
+            }
+
+            // Downmix coefficients
+            for (ch = 0; ch < nchannels; ch++) {
+                for (n = 0; n < s->xxch_mask_nbits; n++) {
+                    if (s->xxch_dmix_mask[ch] & (1U << n)) {
+                        int code = get_bits(&s->gb, 7);
+                        int sign = (code >> 6) - 1;
+                        if (code &= 63) {
+                            index = code * 4 - 3;
+                            if (index >= FF_DCA_DMIXTABLE_SIZE) {
+                                av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH downmix coefficient index (%d)\n", index);
+                                return AVERROR_INVALIDDATA;
+                            }
+                            *coeff_ptr++ = (ff_dca_dmixtable[index] ^ sign) - sign;
+                        } else {
+                            *coeff_ptr++ = 0;
+                        }
+                    }
+                }
+            }
+        } else {
+            s->xxch_dmix_embedded = 0;
+        }
+
+        break;
+    }
+
+    // Subband activity count
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        s->nsubbands[ch] = get_bits(&s->gb, 5) + 2;
+        if (s->nsubbands[ch] > DCA_SUBBANDS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid subband activity count\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // High frequency VQ start subband
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        s->subband_vq_start[ch] = get_bits(&s->gb, 5) + 1;
+
+    // Joint intensity coding index
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        if ((n = get_bits(&s->gb, 3)) && header == HEADER_XXCH)
+            n += xch_base - 1;
+        if (n > s->nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid joint intensity coding index\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->joint_intensity_index[ch] = n;
+    }
+
+    // Transient mode code book
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        s->transition_mode_sel[ch] = get_bits(&s->gb, 2);
+
+    // Scale factor code book
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        s->scale_factor_sel[ch] = get_bits(&s->gb, 3);
+        if (s->scale_factor_sel[ch] == 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid scale factor code book\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Bit allocation quantizer select
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        s->bit_allocation_sel[ch] = get_bits(&s->gb, 3);
+        if (s->bit_allocation_sel[ch] == 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid bit allocation quantizer select\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Quantization index codebook select
+    for (n = 0; n < DCA_CODE_BOOKS; n++)
+        for (ch = xch_base; ch < s->nchannels; ch++)
+            s->quant_index_sel[ch][n] = get_bits(&s->gb, ff_dca_quant_index_sel_nbits[n]);
+
+    // Scale factor adjustment index
+    for (n = 0; n < DCA_CODE_BOOKS; n++)
+        for (ch = xch_base; ch < s->nchannels; ch++)
+            if (s->quant_index_sel[ch][n] < ff_dca_quant_index_group_size[n])
+                s->scale_factor_adj[ch][n] = ff_dca_scale_factor_adj[get_bits(&s->gb, 2)];
+
+    if (header == HEADER_XXCH) {
+        // Reserved
+        // Byte align
+        // CRC16 of channel set header
+        if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of XXCH channel set header\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        // Audio header CRC check word
+        if (s->crc_present)
+            skip_bits(&s->gb, 16);
+    }
+
+    return 0;
+}
+
+static inline int parse_scale(DCACoreDecoder *s, int *scale_index, int sel)
+{
+    const uint32_t *scale_table;
+    unsigned int scale_size;
+
+    // Select the root square table
+    if (sel > 5) {
+        scale_table = ff_dca_scale_factor_quant7;
+        scale_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7);
+    } else {
+        scale_table = ff_dca_scale_factor_quant6;
+        scale_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant6);
+    }
+
+    // If Huffman code was used, the difference of scales was encoded
+    if (sel < 5)
+        *scale_index += dca_get_vlc(&s->gb, &ff_dca_vlc_scale_factor, sel);
+    else
+        *scale_index = get_bits(&s->gb, sel + 1);
+
+    // Look up scale factor from the root square table
+    if ((unsigned int)*scale_index >= scale_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid scale factor index\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return scale_table[*scale_index];
+}
+
+static inline int parse_joint_scale(DCACoreDecoder *s, int sel)
+{
+    int scale_index;
+
+    // Absolute value was encoded even when Huffman code was used
+    if (sel < 5)
+        scale_index = dca_get_vlc(&s->gb, &ff_dca_vlc_scale_factor, sel);
+    else
+        scale_index = get_bits(&s->gb, sel + 1);
+
+    // Bias by 64
+    scale_index += 64;
+
+    // Look up joint scale factor
+    if ((unsigned int)scale_index >= FF_ARRAY_ELEMS(ff_dca_joint_scale_factors)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid joint scale factor index\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return ff_dca_joint_scale_factors[scale_index];
+}
+
+// 5.4.1 - Primary audio coding side information
+static int parse_subframe_header(DCACoreDecoder *s, int sf,
+                                 enum HeaderType header, int xch_base)
+{
+    int ch, band, ret;
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    if (header == HEADER_CORE) {
+        // Subsubframe count
+        s->nsubsubframes[sf] = get_bits(&s->gb, 2) + 1;
+
+        // Partial subsubframe sample count
+        skip_bits(&s->gb, 3);
+    }
+
+    // Prediction mode
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        for (band = 0; band < s->nsubbands[ch]; band++)
+            s->prediction_mode[ch][band] = get_bits1(&s->gb);
+
+    // Prediction coefficients VQ address
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        for (band = 0; band < s->nsubbands[ch]; band++)
+            if (s->prediction_mode[ch][band])
+                s->prediction_vq_index[ch][band] = get_bits(&s->gb, 12);
+
+    // Bit allocation index
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int sel = s->bit_allocation_sel[ch];
+
+        for (band = 0; band < s->subband_vq_start[ch]; band++) {
+            int abits;
+
+            if (sel < 5)
+                abits = dca_get_vlc(&s->gb, &ff_dca_vlc_bit_allocation, sel);
+            else
+                abits = get_bits(&s->gb, sel - 1);
+
+            if (abits > DCA_ABITS_MAX) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid bit allocation index\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            s->bit_allocation[ch][band] = abits;
+        }
+    }
+
+    // Transition mode
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        // Clear transition mode for all subbands
+        memset(s->transition_mode[sf][ch], 0, sizeof(s->transition_mode[0][0]));
+
+        // Transient possible only if more than one subsubframe
+        if (s->nsubsubframes[sf] > 1) {
+            int sel = s->transition_mode_sel[ch];
+            for (band = 0; band < s->subband_vq_start[ch]; band++)
+                if (s->bit_allocation[ch][band])
+                    s->transition_mode[sf][ch][band] = dca_get_vlc(&s->gb, &ff_dca_vlc_transition_mode, sel);
+        }
+    }
+
+    // Scale factors
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int sel = s->scale_factor_sel[ch];
+        int scale_index = 0;
+
+        // Extract scales for subbands up to VQ
+        for (band = 0; band < s->subband_vq_start[ch]; band++) {
+            if (s->bit_allocation[ch][band]) {
+                if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                    return ret;
+                s->scale_factors[ch][band][0] = ret;
+                if (s->transition_mode[sf][ch][band]) {
+                    if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                        return ret;
+                    s->scale_factors[ch][band][1] = ret;
+                }
+            } else {
+                s->scale_factors[ch][band][0] = 0;
+            }
+        }
+
+        // High frequency VQ subbands
+        for (band = s->subband_vq_start[ch]; band < s->nsubbands[ch]; band++) {
+            if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                return ret;
+            s->scale_factors[ch][band][0] = ret;
+        }
+    }
+
+    // Joint subband codebook select
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        if (s->joint_intensity_index[ch]) {
+            s->joint_scale_sel[ch] = get_bits(&s->gb, 3);
+            if (s->joint_scale_sel[ch] == 7) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid joint scale factor code book\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Scale factors for joint subband coding
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            int sel = s->joint_scale_sel[ch];
+            for (band = s->nsubbands[ch]; band < s->nsubbands[src_ch]; band++) {
+                if ((ret = parse_joint_scale(s, sel)) < 0)
+                    return ret;
+                s->joint_scale_factors[ch][band] = ret;
+            }
+        }
+    }
+
+    // Dynamic range coefficient
+    if (s->drc_present && header == HEADER_CORE)
+        skip_bits(&s->gb, 8);
+
+    // Side information CRC check word
+    if (s->crc_present)
+        skip_bits(&s->gb, 16);
+
+    return 0;
+}
+
+#ifndef decode_blockcodes
+static inline int decode_blockcodes(int code1, int code2, int levels, int32_t *audio)
+{
+    int offset = (levels - 1) / 2;
+    int n, div;
+
+    for (n = 0; n < DCA_SUBBAND_SAMPLES / 2; n++) {
+        div = FASTDIV(code1, levels);
+        audio[n] = code1 - div * levels - offset;
+        code1 = div;
+    }
+    for (; n < DCA_SUBBAND_SAMPLES; n++) {
+        div = FASTDIV(code2, levels);
+        audio[n] = code2 - div * levels - offset;
+        code2 = div;
+    }
+
+    return code1 | code2;
+}
+#endif
+
+static inline int parse_block_codes(DCACoreDecoder *s, int32_t *audio, int abits)
+{
+    // Extract block code indices from the bit stream
+    int code1 = get_bits(&s->gb, block_code_nbits[abits - 1]);
+    int code2 = get_bits(&s->gb, block_code_nbits[abits - 1]);
+    int levels = ff_dca_quant_levels[abits];
+
+    // Look up samples from the block code book
+    if (decode_blockcodes(code1, code2, levels, audio)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Failed to decode block code(s)\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static inline int parse_huffman_codes(DCACoreDecoder *s, int32_t *audio, int abits, int sel)
+{
+    int i;
+
+    // Extract Huffman codes from the bit stream
+    for (i = 0; i < DCA_SUBBAND_SAMPLES; i++)
+        audio[i] = dca_get_vlc(&s->gb, &ff_dca_vlc_quant_index[abits - 1], sel);
+
+    return 1;
+}
+
+static inline int extract_audio(DCACoreDecoder *s, int32_t *audio, int abits, int ch)
+{
+    av_assert1(abits >= 0 && abits <= DCA_ABITS_MAX);
+
+    if (abits == 0) {
+        // No bits allocated
+        memset(audio, 0, DCA_SUBBAND_SAMPLES * sizeof(*audio));
+        return 0;
+    }
+
+    if (abits <= DCA_CODE_BOOKS) {
+        int sel = s->quant_index_sel[ch][abits - 1];
+        if (sel < ff_dca_quant_index_group_size[abits - 1]) {
+            // Huffman codes
+            return parse_huffman_codes(s, audio, abits, sel);
+        }
+        if (abits <= 7) {
+            // Block codes
+            return parse_block_codes(s, audio, abits);
+        }
+    }
+
+    // No further encoding
+    get_array(&s->gb, audio, DCA_SUBBAND_SAMPLES, abits - 3);
+    return 0;
+}
+
+static inline void inverse_adpcm(int32_t **subband_samples,
+                                 const int16_t *vq_index,
+                                 const int8_t *prediction_mode,
+                                 int sb_start, int sb_end,
+                                 int ofs, int len)
+{
+    int i, j;
+
+    for (i = sb_start; i < sb_end; i++) {
+        if (prediction_mode[i]) {
+            const int pred_id = vq_index[i];
+            int32_t *ptr = subband_samples[i] + ofs;
+            for (j = 0; j < len; j++) {
+                int32_t x = ff_dcaadpcm_predict(pred_id, ptr + j - DCA_ADPCM_COEFFS);
+                ptr[j] = clip23(ptr[j] + x);
+            }
+        }
+    }
+}
+
+// 5.5 - Primary audio data arrays
+static int parse_subframe_audio(DCACoreDecoder *s, int sf, enum HeaderType header,
+                                int xch_base, int *sub_pos, int *lfe_pos)
+{
+    int32_t audio[16], scale;
+    int n, ssf, ofs, ch, band;
+
+    // Check number of subband samples in this subframe
+    int nsamples = s->nsubsubframes[sf] * DCA_SUBBAND_SAMPLES;
+    if (*sub_pos + nsamples > s->npcmblocks) {
+        av_log(s->avctx, AV_LOG_ERROR, "Subband sample buffer overflow\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // VQ encoded subbands
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int32_t vq_index[DCA_SUBBANDS];
+
+        for (band = s->subband_vq_start[ch]; band < s->nsubbands[ch]; band++)
+            // Extract the VQ address from the bit stream
+            vq_index[band] = get_bits(&s->gb, 10);
+
+        if (s->subband_vq_start[ch] < s->nsubbands[ch]) {
+            s->dcadsp->decode_hf(s->subband_samples[ch], vq_index,
+                                 ff_dca_high_freq_vq, s->scale_factors[ch],
+                                 s->subband_vq_start[ch], s->nsubbands[ch],
+                                 *sub_pos, nsamples);
+        }
+    }
+
+    // Low frequency effect data
+    if (s->lfe_present && header == HEADER_CORE) {
+        unsigned int index;
+
+        // Determine number of LFE samples in this subframe
+        int nlfesamples = 2 * s->lfe_present * s->nsubsubframes[sf];
+        av_assert1((unsigned int)nlfesamples <= FF_ARRAY_ELEMS(audio));
+
+        // Extract LFE samples from the bit stream
+        get_array(&s->gb, audio, nlfesamples, 8);
+
+        // Extract scale factor index from the bit stream
+        index = get_bits(&s->gb, 8);
+        if (index >= FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE scale factor index\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Look up the 7-bit root square quantization table
+        scale = ff_dca_scale_factor_quant7[index];
+
+        // Account for quantizer step size which is 0.035
+        scale = mul23(4697620 /* 0.035 * (1 << 27) */, scale);
+
+        // Scale and take the LFE samples
+        for (n = 0, ofs = *lfe_pos; n < nlfesamples; n++, ofs++)
+            s->lfe_samples[ofs] = clip23(audio[n] * scale >> 4);
+
+        // Advance LFE sample pointer for the next subframe
+        *lfe_pos = ofs;
+    }
+
+    // Audio data
+    for (ssf = 0, ofs = *sub_pos; ssf < s->nsubsubframes[sf]; ssf++) {
+        for (ch = xch_base; ch < s->nchannels; ch++) {
+            if (get_bits_left(&s->gb) < 0)
+                return AVERROR_INVALIDDATA;
+
+            // Not high frequency VQ subbands
+            for (band = 0; band < s->subband_vq_start[ch]; band++) {
+                int ret, trans_ssf, abits = s->bit_allocation[ch][band];
+                int32_t step_size;
+
+                // Extract bits from the bit stream
+                if ((ret = extract_audio(s, audio, abits, ch)) < 0)
+                    return ret;
+
+                // Select quantization step size table and look up
+                // quantization step size
+                if (s->bit_rate == 3)
+                    step_size = ff_dca_lossless_quant[abits];
+                else
+                    step_size = ff_dca_lossy_quant[abits];
+
+                // Identify transient location
+                trans_ssf = s->transition_mode[sf][ch][band];
+
+                // Determine proper scale factor
+                if (trans_ssf == 0 || ssf < trans_ssf)
+                    scale = s->scale_factors[ch][band][0];
+                else
+                    scale = s->scale_factors[ch][band][1];
+
+                // Adjust scale factor when SEL indicates Huffman code
+                if (ret > 0) {
+                    int64_t adj = s->scale_factor_adj[ch][abits - 1];
+                    scale = clip23(adj * scale >> 22);
+                }
+
+                ff_dca_core_dequantize(s->subband_samples[ch][band] + ofs,
+                           audio, step_size, scale, 0, DCA_SUBBAND_SAMPLES);
+            }
+        }
+
+        // DSYNC
+        if ((ssf == s->nsubsubframes[sf] - 1 || s->sync_ssf) && get_bits(&s->gb, 16) != 0xffff) {
+            av_log(s->avctx, AV_LOG_ERROR, "DSYNC check failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ofs += DCA_SUBBAND_SAMPLES;
+    }
+
+    // Inverse ADPCM
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        inverse_adpcm(s->subband_samples[ch], s->prediction_vq_index[ch],
+                      s->prediction_mode[ch], 0, s->nsubbands[ch],
+                      *sub_pos, nsamples);
+    }
+
+    // Joint subband coding
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            s->dcadsp->decode_joint(s->subband_samples[ch], s->subband_samples[src_ch],
+                                    s->joint_scale_factors[ch], s->nsubbands[ch],
+                                    s->nsubbands[src_ch], *sub_pos, nsamples);
+        }
+    }
+
+    // Advance subband sample pointer for the next subframe
+    *sub_pos = ofs;
+    return 0;
+}
+
+static void erase_adpcm_history(DCACoreDecoder *s)
+{
+    int ch, band;
+
+    // Erase ADPCM history from previous frame if
+    // predictor history switch was disabled
+    for (ch = 0; ch < DCA_CHANNELS; ch++)
+        for (band = 0; band < DCA_SUBBANDS; band++)
+            AV_ZERO128(s->subband_samples[ch][band] - DCA_ADPCM_COEFFS);
+
+    emms_c();
+}
+
+static int alloc_sample_buffer(DCACoreDecoder *s)
+{
+    int nchsamples = DCA_ADPCM_COEFFS + s->npcmblocks;
+    int nframesamples = nchsamples * DCA_CHANNELS * DCA_SUBBANDS;
+    int nlfesamples = DCA_LFE_HISTORY + s->npcmblocks / 2;
+    unsigned int size = s->subband_size;
+    int ch, band;
+
+    // Reallocate subband sample buffer
+    av_fast_mallocz(&s->subband_buffer, &s->subband_size,
+                    (nframesamples + nlfesamples) * sizeof(int32_t));
+    if (!s->subband_buffer)
+        return AVERROR(ENOMEM);
+
+    if (size != s->subband_size) {
+        for (ch = 0; ch < DCA_CHANNELS; ch++)
+            for (band = 0; band < DCA_SUBBANDS; band++)
+                s->subband_samples[ch][band] = s->subband_buffer +
+                    (ch * DCA_SUBBANDS + band) * nchsamples + DCA_ADPCM_COEFFS;
+        s->lfe_samples = s->subband_buffer + nframesamples;
+    }
+
+    if (!s->predictor_history)
+        erase_adpcm_history(s);
+
+    return 0;
+}
+
+static int parse_frame_data(DCACoreDecoder *s, enum HeaderType header, int xch_base)
+{
+    int sf, ch, ret, band, sub_pos, lfe_pos;
+
+    if ((ret = parse_coding_header(s, header, xch_base)) < 0)
+        return ret;
+
+    for (sf = 0, sub_pos = 0, lfe_pos = DCA_LFE_HISTORY; sf < s->nsubframes; sf++) {
+        if ((ret = parse_subframe_header(s, sf, header, xch_base)) < 0)
+            return ret;
+        if ((ret = parse_subframe_audio(s, sf, header, xch_base, &sub_pos, &lfe_pos)) < 0)
+            return ret;
+    }
+
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        // Determine number of active subbands for this channel
+        int nsubbands = s->nsubbands[ch];
+        if (s->joint_intensity_index[ch])
+            nsubbands = FFMAX(nsubbands, s->nsubbands[s->joint_intensity_index[ch] - 1]);
+
+        // Update history for ADPCM
+        for (band = 0; band < nsubbands; band++) {
+            int32_t *samples = s->subband_samples[ch][band] - DCA_ADPCM_COEFFS;
+            AV_COPY128(samples, samples + s->npcmblocks);
+        }
+
+        // Clear inactive subbands
+        for (; band < DCA_SUBBANDS; band++) {
+            int32_t *samples = s->subband_samples[ch][band] - DCA_ADPCM_COEFFS;
+            memset(samples, 0, (DCA_ADPCM_COEFFS + s->npcmblocks) * sizeof(int32_t));
+        }
+    }
+
+    emms_c();
+
+    return 0;
+}
+
+static int parse_xch_frame(DCACoreDecoder *s)
+{
+    int ret;
+
+    if (s->ch_mask & DCA_SPEAKER_MASK_Cs) {
+        av_log(s->avctx, AV_LOG_ERROR, "XCH with Cs speaker already present\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = parse_frame_data(s, HEADER_XCH, s->nchannels)) < 0)
+        return ret;
+
+    // Seek to the end of core frame, don't trust XCH frame size
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XCH frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_xxch_frame(DCACoreDecoder *s)
+{
+    int xxch_nchsets, xxch_frame_size;
+    int ret, mask, header_size, header_pos = get_bits_count(&s->gb);
+
+    // XXCH sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_XXCH) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // XXCH frame header length
+    header_size = get_bits(&s->gb, 6) + 1;
+
+    // Check XXCH frame header CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos + 32, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH frame header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // CRC presence flag for channel set header
+    s->xxch_crc_present = get_bits1(&s->gb);
+
+    // Number of bits for loudspeaker mask
+    s->xxch_mask_nbits = get_bits(&s->gb, 5) + 1;
+    if (s->xxch_mask_nbits <= DCA_SPEAKER_Cs) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid number of bits for XXCH speaker mask (%d)\n", s->xxch_mask_nbits);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of channel sets
+    xxch_nchsets = get_bits(&s->gb, 2) + 1;
+    if (xxch_nchsets > 1) {
+        avpriv_request_sample(s->avctx, "%d XXCH channel sets", xxch_nchsets);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Channel set 0 data byte size
+    xxch_frame_size = get_bits(&s->gb, 14) + 1;
+
+    // Core loudspeaker activity mask
+    s->xxch_core_mask = get_bits_long(&s->gb, s->xxch_mask_nbits);
+
+    // Validate the core mask
+    mask = s->ch_mask;
+
+    if ((mask & DCA_SPEAKER_MASK_Ls) && (s->xxch_core_mask & DCA_SPEAKER_MASK_Lss))
+        mask = (mask & ~DCA_SPEAKER_MASK_Ls) | DCA_SPEAKER_MASK_Lss;
+
+    if ((mask & DCA_SPEAKER_MASK_Rs) && (s->xxch_core_mask & DCA_SPEAKER_MASK_Rss))
+        mask = (mask & ~DCA_SPEAKER_MASK_Rs) | DCA_SPEAKER_MASK_Rss;
+
+    if (mask != s->xxch_core_mask) {
+        av_log(s->avctx, AV_LOG_ERROR, "XXCH core speaker activity mask (%#x) disagrees with core (%#x)\n", s->xxch_core_mask, mask);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Reserved
+    // Byte align
+    // CRC16 of XXCH frame header
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XXCH frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Parse XXCH channel set 0
+    if ((ret = parse_frame_data(s, HEADER_XXCH, s->nchannels)) < 0)
+        return ret;
+
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8 + xxch_frame_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XXCH channel set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_xbr_subframe(DCACoreDecoder *s, int xbr_base_ch, int xbr_nchannels,
+                              int *xbr_nsubbands, int xbr_transition_mode, int sf, int *sub_pos)
+{
+    int     xbr_nabits[DCA_CHANNELS];
+    int     xbr_bit_allocation[DCA_CHANNELS][DCA_SUBBANDS];
+    int     xbr_scale_nbits[DCA_CHANNELS];
+    int32_t xbr_scale_factors[DCA_CHANNELS][DCA_SUBBANDS][2];
+    int     ssf, ch, band, ofs;
+
+    // Check number of subband samples in this subframe
+    if (*sub_pos + s->nsubsubframes[sf] * DCA_SUBBAND_SAMPLES > s->npcmblocks) {
+        av_log(s->avctx, AV_LOG_ERROR, "Subband sample buffer overflow\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // Number of bits for XBR bit allocation index
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++)
+        xbr_nabits[ch] = get_bits(&s->gb, 2) + 2;
+
+    // XBR bit allocation index
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+        for (band = 0; band < xbr_nsubbands[ch]; band++) {
+            xbr_bit_allocation[ch][band] = get_bits(&s->gb, xbr_nabits[ch]);
+            if (xbr_bit_allocation[ch][band] > DCA_ABITS_MAX) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR bit allocation index\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Number of bits for scale indices
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+        xbr_scale_nbits[ch] = get_bits(&s->gb, 3);
+        if (!xbr_scale_nbits[ch]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of bits for XBR scale factor index\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // XBR scale factors
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+        const uint32_t *scale_table;
+        int scale_size;
+
+        // Select the root square table
+        if (s->scale_factor_sel[ch] > 5) {
+            scale_table = ff_dca_scale_factor_quant7;
+            scale_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7);
+        } else {
+            scale_table = ff_dca_scale_factor_quant6;
+            scale_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant6);
+        }
+
+        // Parse scale factor indices and look up scale factors from the root
+        // square table
+        for (band = 0; band < xbr_nsubbands[ch]; band++) {
+            if (xbr_bit_allocation[ch][band]) {
+                int scale_index = get_bits(&s->gb, xbr_scale_nbits[ch]);
+                if (scale_index >= scale_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR scale factor index\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                xbr_scale_factors[ch][band][0] = scale_table[scale_index];
+                if (xbr_transition_mode && s->transition_mode[sf][ch][band]) {
+                    scale_index = get_bits(&s->gb, xbr_scale_nbits[ch]);
+                    if (scale_index >= scale_size) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR scale factor index\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    xbr_scale_factors[ch][band][1] = scale_table[scale_index];
+                }
+            }
+        }
+    }
+
+    // Audio data
+    for (ssf = 0, ofs = *sub_pos; ssf < s->nsubsubframes[sf]; ssf++) {
+        for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+            if (get_bits_left(&s->gb) < 0)
+                return AVERROR_INVALIDDATA;
+
+            for (band = 0; band < xbr_nsubbands[ch]; band++) {
+                int ret, trans_ssf, abits = xbr_bit_allocation[ch][band];
+                int32_t audio[DCA_SUBBAND_SAMPLES], step_size, scale;
+
+                // Extract bits from the bit stream
+                if (abits > 7) {
+                    // No further encoding
+                    get_array(&s->gb, audio, DCA_SUBBAND_SAMPLES, abits - 3);
+                } else if (abits > 0) {
+                    // Block codes
+                    if ((ret = parse_block_codes(s, audio, abits)) < 0)
+                        return ret;
+                } else {
+                    // No bits allocated
+                    continue;
+                }
+
+                // Look up quantization step size
+                step_size = ff_dca_lossless_quant[abits];
+
+                // Identify transient location
+                if (xbr_transition_mode)
+                    trans_ssf = s->transition_mode[sf][ch][band];
+                else
+                    trans_ssf = 0;
+
+                // Determine proper scale factor
+                if (trans_ssf == 0 || ssf < trans_ssf)
+                    scale = xbr_scale_factors[ch][band][0];
+                else
+                    scale = xbr_scale_factors[ch][band][1];
+
+                ff_dca_core_dequantize(s->subband_samples[ch][band] + ofs,
+                           audio, step_size, scale, 1, DCA_SUBBAND_SAMPLES);
+            }
+        }
+
+        // DSYNC
+        if ((ssf == s->nsubsubframes[sf] - 1 || s->sync_ssf) && get_bits(&s->gb, 16) != 0xffff) {
+            av_log(s->avctx, AV_LOG_ERROR, "XBR-DSYNC check failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ofs += DCA_SUBBAND_SAMPLES;
+    }
+
+    // Advance subband sample pointer for the next subframe
+    *sub_pos = ofs;
+    return 0;
+}
+
+static int parse_xbr_frame(DCACoreDecoder *s)
+{
+    int     xbr_frame_size[DCA_EXSS_CHSETS_MAX];
+    int     xbr_nchannels[DCA_EXSS_CHSETS_MAX];
+    int     xbr_nsubbands[DCA_EXSS_CHSETS_MAX * DCA_EXSS_CHANNELS_MAX];
+    int     xbr_nchsets, xbr_transition_mode, xbr_band_nbits, xbr_base_ch;
+    int     i, ch1, ch2, ret, header_size, header_pos = get_bits_count(&s->gb);
+
+    // XBR sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_XBR) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // XBR frame header length
+    header_size = get_bits(&s->gb, 6) + 1;
+
+    // Check XBR frame header CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos + 32, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR frame header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of channel sets
+    xbr_nchsets = get_bits(&s->gb, 2) + 1;
+
+    // Channel set data byte size
+    for (i = 0; i < xbr_nchsets; i++)
+        xbr_frame_size[i] = get_bits(&s->gb, 14) + 1;
+
+    // Transition mode flag
+    xbr_transition_mode = get_bits1(&s->gb);
+
+    // Channel set headers
+    for (i = 0, ch2 = 0; i < xbr_nchsets; i++) {
+        xbr_nchannels[i] = get_bits(&s->gb, 3) + 1;
+        xbr_band_nbits = get_bits(&s->gb, 2) + 5;
+        for (ch1 = 0; ch1 < xbr_nchannels[i]; ch1++, ch2++) {
+            xbr_nsubbands[ch2] = get_bits(&s->gb, xbr_band_nbits) + 1;
+            if (xbr_nsubbands[ch2] > DCA_SUBBANDS) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid number of active XBR subbands (%d)\n", xbr_nsubbands[ch2]);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Reserved
+    // Byte align
+    // CRC16 of XBR frame header
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XBR frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Channel set data
+    for (i = 0, xbr_base_ch = 0; i < xbr_nchsets; i++) {
+        header_pos = get_bits_count(&s->gb);
+
+        if (xbr_base_ch + xbr_nchannels[i] <= s->nchannels) {
+            int sf, sub_pos;
+
+            for (sf = 0, sub_pos = 0; sf < s->nsubframes; sf++) {
+                if ((ret = parse_xbr_subframe(s, xbr_base_ch,
+                                              xbr_base_ch + xbr_nchannels[i],
+                                              xbr_nsubbands, xbr_transition_mode,
+                                              sf, &sub_pos)) < 0)
+                    return ret;
+            }
+        }
+
+        xbr_base_ch += xbr_nchannels[i];
+
+        if (ff_dca_seek_bits(&s->gb, header_pos + xbr_frame_size[i] * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of XBR channel set\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    return 0;
+}
+
+// Modified ISO/IEC 9899 linear congruential generator
+// Returns pseudorandom integer in range [-2^30, 2^30 - 1]
+static int rand_x96(DCACoreDecoder *s)
+{
+    s->x96_rand = 1103515245U * s->x96_rand + 12345U;
+    return (s->x96_rand & 0x7fffffff) - 0x40000000;
+}
+
+static int parse_x96_subframe_audio(DCACoreDecoder *s, int sf, int xch_base, int *sub_pos)
+{
+    int n, ssf, ch, band, ofs;
+
+    // Check number of subband samples in this subframe
+    int nsamples = s->nsubsubframes[sf] * DCA_SUBBAND_SAMPLES;
+    if (*sub_pos + nsamples > s->npcmblocks) {
+        av_log(s->avctx, AV_LOG_ERROR, "Subband sample buffer overflow\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // VQ encoded or unallocated subbands
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+            // Get the sample pointer and scale factor
+            int32_t *samples = s->x96_subband_samples[ch][band] + *sub_pos;
+            int32_t scale    = s->scale_factors[ch][band >> 1][band & 1];
+
+            switch (s->bit_allocation[ch][band]) {
+            case 0: // No bits allocated for subband
+                if (scale <= 1)
+                    memset(samples, 0, nsamples * sizeof(int32_t));
+                else for (n = 0; n < nsamples; n++)
+                    // Generate scaled random samples
+                    samples[n] = mul31(rand_x96(s), scale);
+                break;
+
+            case 1: // VQ encoded subband
+                for (ssf = 0; ssf < (s->nsubsubframes[sf] + 1) / 2; ssf++) {
+                    // Extract the VQ address from the bit stream and look up
+                    // the VQ code book for up to 16 subband samples
+                    const int8_t *vq_samples = ff_dca_high_freq_vq[get_bits(&s->gb, 10)];
+                    // Scale and take the samples
+                    for (n = 0; n < FFMIN(nsamples - ssf * 16, 16); n++)
+                        *samples++ = clip23(vq_samples[n] * scale + (1 << 3) >> 4);
+                }
+                break;
+            }
+        }
+    }
+
+    // Audio data
+    for (ssf = 0, ofs = *sub_pos; ssf < s->nsubsubframes[sf]; ssf++) {
+        for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+            if (get_bits_left(&s->gb) < 0)
+                return AVERROR_INVALIDDATA;
+
+            for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+                int ret, abits = s->bit_allocation[ch][band] - 1;
+                int32_t audio[DCA_SUBBAND_SAMPLES], step_size, scale;
+
+                // Not VQ encoded or unallocated subbands
+                if (abits < 1)
+                    continue;
+
+                // Extract bits from the bit stream
+                if ((ret = extract_audio(s, audio, abits, ch)) < 0)
+                    return ret;
+
+                // Select quantization step size table and look up quantization
+                // step size
+                if (s->bit_rate == 3)
+                    step_size = ff_dca_lossless_quant[abits];
+                else
+                    step_size = ff_dca_lossy_quant[abits];
+
+                // Get the scale factor
+                scale = s->scale_factors[ch][band >> 1][band & 1];
+
+                ff_dca_core_dequantize(s->x96_subband_samples[ch][band] + ofs,
+                           audio, step_size, scale, 0, DCA_SUBBAND_SAMPLES);
+            }
+        }
+
+        // DSYNC
+        if ((ssf == s->nsubsubframes[sf] - 1 || s->sync_ssf) && get_bits(&s->gb, 16) != 0xffff) {
+            av_log(s->avctx, AV_LOG_ERROR, "X96-DSYNC check failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ofs += DCA_SUBBAND_SAMPLES;
+    }
+
+    // Inverse ADPCM
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        inverse_adpcm(s->x96_subband_samples[ch], s->prediction_vq_index[ch],
+                      s->prediction_mode[ch], s->x96_subband_start, s->nsubbands[ch],
+                      *sub_pos, nsamples);
+    }
+
+    // Joint subband coding
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            s->dcadsp->decode_joint(s->x96_subband_samples[ch], s->x96_subband_samples[src_ch],
+                                    s->joint_scale_factors[ch], s->nsubbands[ch],
+                                    s->nsubbands[src_ch], *sub_pos, nsamples);
+        }
+    }
+
+    // Advance subband sample pointer for the next subframe
+    *sub_pos = ofs;
+    return 0;
+}
+
+static void erase_x96_adpcm_history(DCACoreDecoder *s)
+{
+    int ch, band;
+
+    // Erase ADPCM history from previous frame if
+    // predictor history switch was disabled
+    for (ch = 0; ch < DCA_CHANNELS; ch++)
+        for (band = 0; band < DCA_SUBBANDS_X96; band++)
+            AV_ZERO128(s->x96_subband_samples[ch][band] - DCA_ADPCM_COEFFS);
+
+    emms_c();
+}
+
+static int alloc_x96_sample_buffer(DCACoreDecoder *s)
+{
+    int nchsamples = DCA_ADPCM_COEFFS + s->npcmblocks;
+    int nframesamples = nchsamples * DCA_CHANNELS * DCA_SUBBANDS_X96;
+    unsigned int size = s->x96_subband_size;
+    int ch, band;
+
+    // Reallocate subband sample buffer
+    av_fast_mallocz(&s->x96_subband_buffer, &s->x96_subband_size,
+                    nframesamples * sizeof(int32_t));
+    if (!s->x96_subband_buffer)
+        return AVERROR(ENOMEM);
+
+    if (size != s->x96_subband_size) {
+        for (ch = 0; ch < DCA_CHANNELS; ch++)
+            for (band = 0; band < DCA_SUBBANDS_X96; band++)
+                s->x96_subband_samples[ch][band] = s->x96_subband_buffer +
+                    (ch * DCA_SUBBANDS_X96 + band) * nchsamples + DCA_ADPCM_COEFFS;
+    }
+
+    if (!s->predictor_history)
+        erase_x96_adpcm_history(s);
+
+    return 0;
+}
+
+static int parse_x96_subframe_header(DCACoreDecoder *s, int xch_base)
+{
+    int ch, band, ret;
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // Prediction mode
+    for (ch = xch_base; ch < s->x96_nchannels; ch++)
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++)
+            s->prediction_mode[ch][band] = get_bits1(&s->gb);
+
+    // Prediction coefficients VQ address
+    for (ch = xch_base; ch < s->x96_nchannels; ch++)
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++)
+            if (s->prediction_mode[ch][band])
+                s->prediction_vq_index[ch][band] = get_bits(&s->gb, 12);
+
+    // Bit allocation index
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int sel = s->bit_allocation_sel[ch];
+        int abits = 0;
+
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+            // If Huffman code was used, the difference of abits was encoded
+            if (sel < 7)
+                abits += dca_get_vlc(&s->gb, &ff_dca_vlc_quant_index[5 + 2 * s->x96_high_res], sel);
+            else
+                abits = get_bits(&s->gb, 3 + s->x96_high_res);
+
+            if (abits < 0 || abits > 7 + 8 * s->x96_high_res) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 bit allocation index\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            s->bit_allocation[ch][band] = abits;
+        }
+    }
+
+    // Scale factors
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int sel = s->scale_factor_sel[ch];
+        int scale_index = 0;
+
+        // Extract scales for subbands which are transmitted even for
+        // unallocated subbands
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+            if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                return ret;
+            s->scale_factors[ch][band >> 1][band & 1] = ret;
+        }
+    }
+
+    // Joint subband codebook select
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        if (s->joint_intensity_index[ch]) {
+            s->joint_scale_sel[ch] = get_bits(&s->gb, 3);
+            if (s->joint_scale_sel[ch] == 7) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 joint scale factor code book\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Scale factors for joint subband coding
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            int sel = s->joint_scale_sel[ch];
+            for (band = s->nsubbands[ch]; band < s->nsubbands[src_ch]; band++) {
+                if ((ret = parse_joint_scale(s, sel)) < 0)
+                    return ret;
+                s->joint_scale_factors[ch][band] = ret;
+            }
+        }
+    }
+
+    // Side information CRC check word
+    if (s->crc_present)
+        skip_bits(&s->gb, 16);
+
+    return 0;
+}
+
+static int parse_x96_coding_header(DCACoreDecoder *s, int exss, int xch_base)
+{
+    int n, ch, header_size = 0, header_pos = get_bits_count(&s->gb);
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    if (exss) {
+        // Channel set header length
+        header_size = get_bits(&s->gb, 7) + 1;
+
+        // Check CRC
+        if (s->x96_crc_present
+            && ff_dca_check_crc(s->avctx, &s->gb, header_pos, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 channel set header checksum\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // High resolution flag
+    s->x96_high_res = get_bits1(&s->gb);
+
+    // First encoded subband
+    if (s->x96_rev_no < 8) {
+        s->x96_subband_start = get_bits(&s->gb, 5);
+        if (s->x96_subband_start > 27) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 subband start index (%d)\n", s->x96_subband_start);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        s->x96_subband_start = DCA_SUBBANDS;
+    }
+
+    // Subband activity count
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        s->nsubbands[ch] = get_bits(&s->gb, 6) + 1;
+        if (s->nsubbands[ch] < DCA_SUBBANDS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 subband activity count (%d)\n", s->nsubbands[ch]);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Joint intensity coding index
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        if ((n = get_bits(&s->gb, 3)) && xch_base)
+            n += xch_base - 1;
+        if (n > s->x96_nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 joint intensity coding index\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->joint_intensity_index[ch] = n;
+    }
+
+    // Scale factor code book
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        s->scale_factor_sel[ch] = get_bits(&s->gb, 3);
+        if (s->scale_factor_sel[ch] >= 6) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 scale factor code book\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Bit allocation quantizer select
+    for (ch = xch_base; ch < s->x96_nchannels; ch++)
+        s->bit_allocation_sel[ch] = get_bits(&s->gb, 3);
+
+    // Quantization index codebook select
+    for (n = 0; n < 6 + 4 * s->x96_high_res; n++)
+        for (ch = xch_base; ch < s->x96_nchannels; ch++)
+            s->quant_index_sel[ch][n] = get_bits(&s->gb, ff_dca_quant_index_sel_nbits[n]);
+
+    if (exss) {
+        // Reserved
+        // Byte align
+        // CRC16 of channel set header
+        if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 channel set header\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        if (s->crc_present)
+            skip_bits(&s->gb, 16);
+    }
+
+    return 0;
+}
+
+static int parse_x96_frame_data(DCACoreDecoder *s, int exss, int xch_base)
+{
+    int sf, ch, ret, band, sub_pos;
+
+    if ((ret = parse_x96_coding_header(s, exss, xch_base)) < 0)
+        return ret;
+
+    for (sf = 0, sub_pos = 0; sf < s->nsubframes; sf++) {
+        if ((ret = parse_x96_subframe_header(s, xch_base)) < 0)
+            return ret;
+        if ((ret = parse_x96_subframe_audio(s, sf, xch_base, &sub_pos)) < 0)
+            return ret;
+    }
+
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        // Determine number of active subbands for this channel
+        int nsubbands = s->nsubbands[ch];
+        if (s->joint_intensity_index[ch])
+            nsubbands = FFMAX(nsubbands, s->nsubbands[s->joint_intensity_index[ch] - 1]);
+
+        // Update history for ADPCM and clear inactive subbands
+        for (band = 0; band < DCA_SUBBANDS_X96; band++) {
+            int32_t *samples = s->x96_subband_samples[ch][band] - DCA_ADPCM_COEFFS;
+            if (band >= s->x96_subband_start && band < nsubbands)
+                AV_COPY128(samples, samples + s->npcmblocks);
+            else
+                memset(samples, 0, (DCA_ADPCM_COEFFS + s->npcmblocks) * sizeof(int32_t));
+        }
+    }
+
+    emms_c();
+
+    return 0;
+}
+
+static int parse_x96_frame(DCACoreDecoder *s)
+{
+    int ret;
+
+    // Revision number
+    s->x96_rev_no = get_bits(&s->gb, 4);
+    if (s->x96_rev_no < 1 || s->x96_rev_no > 8) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 revision (%d)\n", s->x96_rev_no);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->x96_crc_present = 0;
+    s->x96_nchannels = s->nchannels;
+
+    if ((ret = alloc_x96_sample_buffer(s)) < 0)
+        return ret;
+
+    if ((ret = parse_x96_frame_data(s, 0, 0)) < 0)
+        return ret;
+
+    // Seek to the end of core frame
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_x96_frame_exss(DCACoreDecoder *s)
+{
+    int     x96_frame_size[DCA_EXSS_CHSETS_MAX];
+    int     x96_nchannels[DCA_EXSS_CHSETS_MAX];
+    int     x96_nchsets, x96_base_ch;
+    int     i, ret, header_size, header_pos = get_bits_count(&s->gb);
+
+    // X96 sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_X96) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // X96 frame header length
+    header_size = get_bits(&s->gb, 6) + 1;
+
+    // Check X96 frame header CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos + 32, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 frame header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Revision number
+    s->x96_rev_no = get_bits(&s->gb, 4);
+    if (s->x96_rev_no < 1 || s->x96_rev_no > 8) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 revision (%d)\n", s->x96_rev_no);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // CRC presence flag for channel set header
+    s->x96_crc_present = get_bits1(&s->gb);
+
+    // Number of channel sets
+    x96_nchsets = get_bits(&s->gb, 2) + 1;
+
+    // Channel set data byte size
+    for (i = 0; i < x96_nchsets; i++)
+        x96_frame_size[i] = get_bits(&s->gb, 12) + 1;
+
+    // Number of channels in channel set
+    for (i = 0; i < x96_nchsets; i++)
+        x96_nchannels[i] = get_bits(&s->gb, 3) + 1;
+
+    // Reserved
+    // Byte align
+    // CRC16 of X96 frame header
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = alloc_x96_sample_buffer(s)) < 0)
+        return ret;
+
+    // Channel set data
+    s->x96_nchannels = 0;
+    for (i = 0, x96_base_ch = 0; i < x96_nchsets; i++) {
+        header_pos = get_bits_count(&s->gb);
+
+        if (x96_base_ch + x96_nchannels[i] <= s->nchannels) {
+            s->x96_nchannels = x96_base_ch + x96_nchannels[i];
+            if ((ret = parse_x96_frame_data(s, 1, x96_base_ch)) < 0)
+                return ret;
+        }
+
+        x96_base_ch += x96_nchannels[i];
+
+        if (ff_dca_seek_bits(&s->gb, header_pos + x96_frame_size[i] * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 channel set\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    return 0;
+}
+
+static int parse_aux_data(DCACoreDecoder *s)
+{
+    int aux_pos;
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // Auxiliary data byte count (can't be trusted)
+    skip_bits(&s->gb, 6);
+
+    // 4-byte align
+    skip_bits_long(&s->gb, -get_bits_count(&s->gb) & 31);
+
+    // Auxiliary data sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_REV1AUX) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid auxiliary data sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    aux_pos = get_bits_count(&s->gb);
+
+    // Auxiliary decode time stamp flag
+    if (get_bits1(&s->gb))
+        skip_bits_long(&s->gb, 47);
+
+    // Auxiliary dynamic downmix flag
+    if (s->prim_dmix_embedded = get_bits1(&s->gb)) {
+        int i, m, n;
+
+        // Auxiliary primary channel downmix type
+        s->prim_dmix_type = get_bits(&s->gb, 3);
+        if (s->prim_dmix_type >= DCA_DMIX_TYPE_COUNT) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid primary channel set downmix type\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Size of downmix coefficients matrix
+        m = ff_dca_dmix_primary_nch[s->prim_dmix_type];
+        n = ff_dca_channels[s->audio_mode] + !!s->lfe_present;
+
+        // Dynamic downmix code coefficients
+        for (i = 0; i < m * n; i++) {
+            int code = get_bits(&s->gb, 9);
+            int sign = (code >> 8) - 1;
+            unsigned int index = code & 0xff;
+            if (index >= FF_DCA_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid downmix coefficient index\n");
+                return AVERROR_INVALIDDATA;
+            }
+            s->prim_dmix_coeff[i] = (ff_dca_dmixtable[index] ^ sign) - sign;
+        }
+    }
+
+    // Byte align
+    skip_bits(&s->gb, -get_bits_count(&s->gb) & 7);
+
+    // CRC16 of auxiliary data
+    skip_bits(&s->gb, 16);
+
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, aux_pos, get_bits_count(&s->gb))) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid auxiliary data checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_optional_info(DCACoreDecoder *s)
+{
+    DCAContext *dca = s->avctx->priv_data;
+    int ret = -1;
+
+    // Time code stamp
+    if (s->ts_present)
+        skip_bits_long(&s->gb, 32);
+
+    // Auxiliary data
+    if (s->aux_present && (ret = parse_aux_data(s)) < 0
+        && (s->avctx->err_recognition & AV_EF_EXPLODE))
+        return ret;
+
+    if (ret < 0)
+        s->prim_dmix_embedded = 0;
+
+    // Core extensions
+    if (s->ext_audio_present && !dca->core_only) {
+        int sync_pos = FFMIN(s->frame_size / 4, s->gb.size_in_bits / 32) - 1;
+        int last_pos = get_bits_count(&s->gb) / 32;
+        int size, dist;
+        uint32_t w1, w2 = 0;
+
+        // Search for extension sync words aligned on 4-byte boundary. Search
+        // must be done backwards from the end of core frame to work around
+        // sync word aliasing issues.
+        switch (s->ext_audio_type) {
+        case DCA_EXT_AUDIO_XCH:
+            if (dca->request_channel_layout)
+                break;
+
+            // The distance between XCH sync word and end of the core frame
+            // must be equal to XCH frame size. Off by one error is allowed for
+            // compatibility with legacy bitstreams. Minimum XCH frame size is
+            // 96 bytes. AMODE and PCHS are further checked to reduce
+            // probability of alias sync detection.
+            for (; sync_pos >= last_pos; sync_pos--, w2 = w1) {
+                w1 = AV_RB32(s->gb.buffer + sync_pos * 4);
+                if (w1 == DCA_SYNCWORD_XCH) {
+                    size = (w2 >> 22) + 1;
+                    dist = s->frame_size - sync_pos * 4;
+                    if (size >= 96
+                        && (size == dist || size - 1 == dist)
+                        && (w2 >> 15 & 0x7f) == 0x08) {
+                        s->xch_pos = sync_pos * 32 + 49;
+                        break;
+                    }
+                }
+            }
+
+            if (!s->xch_pos) {
+                av_log(s->avctx, AV_LOG_ERROR, "XCH sync word not found\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+            break;
+
+        case DCA_EXT_AUDIO_X96:
+            // The distance between X96 sync word and end of the core frame
+            // must be equal to X96 frame size. Minimum X96 frame size is 96
+            // bytes.
+            for (; sync_pos >= last_pos; sync_pos--, w2 = w1) {
+                w1 = AV_RB32(s->gb.buffer + sync_pos * 4);
+                if (w1 == DCA_SYNCWORD_X96) {
+                    size = (w2 >> 20) + 1;
+                    dist = s->frame_size - sync_pos * 4;
+                    if (size >= 96 && size == dist) {
+                        s->x96_pos = sync_pos * 32 + 44;
+                        break;
+                    }
+                }
+            }
+
+            if (!s->x96_pos) {
+                av_log(s->avctx, AV_LOG_ERROR, "X96 sync word not found\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+            break;
+
+        case DCA_EXT_AUDIO_XXCH:
+            if (dca->request_channel_layout)
+                break;
+
+            // XXCH frame header CRC must be valid. Minimum XXCH frame header
+            // size is 11 bytes.
+            for (; sync_pos >= last_pos; sync_pos--, w2 = w1) {
+                w1 = AV_RB32(s->gb.buffer + sync_pos * 4);
+                if (w1 == DCA_SYNCWORD_XXCH) {
+                    size = (w2 >> 26) + 1;
+                    dist = s->gb.size_in_bits / 8 - sync_pos * 4;
+                    if (size >= 11 && size <= dist &&
+                        !av_crc(dca->crctab, 0xffff, s->gb.buffer +
+                                (sync_pos + 1) * 4, size - 4)) {
+                        s->xxch_pos = sync_pos * 32;
+                        break;
+                    }
+                }
+            }
+
+            if (!s->xxch_pos) {
+                av_log(s->avctx, AV_LOG_ERROR, "XXCH sync word not found\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+            break;
+        }
+    }
+
+    return 0;
+}
+
+int ff_dca_core_parse(DCACoreDecoder *s, uint8_t *data, int size)
+{
+    int ret;
+
+    s->ext_audio_mask = 0;
+    s->xch_pos = s->xxch_pos = s->x96_pos = 0;
+
+    if ((ret = init_get_bits8(&s->gb, data, size)) < 0)
+        return ret;
+    s->gb_in = s->gb;
+
+    if ((ret = parse_frame_header(s)) < 0)
+        return ret;
+    if ((ret = alloc_sample_buffer(s)) < 0)
+        return ret;
+    if ((ret = parse_frame_data(s, HEADER_CORE, 0)) < 0)
+        return ret;
+    if ((ret = parse_optional_info(s)) < 0)
+        return ret;
+
+    // Workaround for DTS in WAV
+    if (s->frame_size > size)
+        s->frame_size = size;
+
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of core frame\n");
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+int ff_dca_core_parse_exss(DCACoreDecoder *s, uint8_t *data, DCAExssAsset *asset)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    int exss_mask = asset ? asset->extension_mask : 0;
+    int ret = 0, ext = 0;
+
+    // Parse (X)XCH unless downmixing
+    if (!dca->request_channel_layout) {
+        if (exss_mask & DCA_EXSS_XXCH) {
+            if ((ret = init_get_bits8(&s->gb, data + asset->xxch_offset, asset->xxch_size)) < 0)
+                return ret;
+            ret = parse_xxch_frame(s);
+            ext = DCA_EXSS_XXCH;
+        } else if (s->xxch_pos) {
+            s->gb = s->gb_in;
+            skip_bits_long(&s->gb, s->xxch_pos);
+            ret = parse_xxch_frame(s);
+            ext = DCA_CSS_XXCH;
+        } else if (s->xch_pos) {
+            s->gb = s->gb_in;
+            skip_bits_long(&s->gb, s->xch_pos);
+            ret = parse_xch_frame(s);
+            ext = DCA_CSS_XCH;
+        }
+
+        // Revert to primary channel set in case (X)XCH parsing fails
+        if (ret < 0) {
+            if (avctx->err_recognition & AV_EF_EXPLODE)
+                return ret;
+            s->nchannels = ff_dca_channels[s->audio_mode];
+            s->ch_mask = audio_mode_ch_mask[s->audio_mode];
+            if (s->lfe_present)
+                s->ch_mask |= DCA_SPEAKER_MASK_LFE1;
+        } else {
+            s->ext_audio_mask |= ext;
+        }
+    }
+
+    // Parse XBR
+    if (exss_mask & DCA_EXSS_XBR) {
+        if ((ret = init_get_bits8(&s->gb, data + asset->xbr_offset, asset->xbr_size)) < 0)
+            return ret;
+        if ((ret = parse_xbr_frame(s)) < 0) {
+            if (avctx->err_recognition & AV_EF_EXPLODE)
+                return ret;
+        } else {
+            s->ext_audio_mask |= DCA_EXSS_XBR;
+        }
+    }
+
+    // Parse X96 unless decoding XLL
+    if (!(dca->packet & DCA_PACKET_XLL)) {
+        if (exss_mask & DCA_EXSS_X96) {
+            if ((ret = init_get_bits8(&s->gb, data + asset->x96_offset, asset->x96_size)) < 0)
+                return ret;
+            if ((ret = parse_x96_frame_exss(s)) < 0) {
+                if (ret == AVERROR(ENOMEM) || (avctx->err_recognition & AV_EF_EXPLODE))
+                    return ret;
+            } else {
+                s->ext_audio_mask |= DCA_EXSS_X96;
+            }
+        } else if (s->x96_pos) {
+            s->gb = s->gb_in;
+            skip_bits_long(&s->gb, s->x96_pos);
+            if ((ret = parse_x96_frame(s)) < 0) {
+                if (ret == AVERROR(ENOMEM) || (avctx->err_recognition & AV_EF_EXPLODE))
+                    return ret;
+            } else {
+                s->ext_audio_mask |= DCA_CSS_X96;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int map_prm_ch_to_spkr(DCACoreDecoder *s, int ch)
+{
+    int pos, spkr;
+
+    // Try to map this channel to core first
+    pos = ff_dca_channels[s->audio_mode];
+    if (ch < pos) {
+        spkr = prm_ch_to_spkr_map[s->audio_mode][ch];
+        if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH)) {
+            if (s->xxch_core_mask & (1U << spkr))
+                return spkr;
+            if (spkr == DCA_SPEAKER_Ls && (s->xxch_core_mask & DCA_SPEAKER_MASK_Lss))
+                return DCA_SPEAKER_Lss;
+            if (spkr == DCA_SPEAKER_Rs && (s->xxch_core_mask & DCA_SPEAKER_MASK_Rss))
+                return DCA_SPEAKER_Rss;
+            return -1;
+        }
+        return spkr;
+    }
+
+    // Then XCH
+    if ((s->ext_audio_mask & DCA_CSS_XCH) && ch == pos)
+        return DCA_SPEAKER_Cs;
+
+    // Then XXCH
+    if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH)) {
+        for (spkr = DCA_SPEAKER_Cs; spkr < s->xxch_mask_nbits; spkr++)
+            if (s->xxch_spkr_mask & (1U << spkr))
+                if (pos++ == ch)
+                    return spkr;
+    }
+
+    // No mapping
+    return -1;
+}
+
+static void erase_dsp_history(DCACoreDecoder *s)
+{
+    memset(s->dcadsp_data, 0, sizeof(s->dcadsp_data));
+    s->output_history_lfe_fixed = 0;
+    s->output_history_lfe_float = 0;
+}
+
+static void set_filter_mode(DCACoreDecoder *s, int mode)
+{
+    if (s->filter_mode != mode) {
+        erase_dsp_history(s);
+        s->filter_mode = mode;
+    }
+}
+
+int ff_dca_core_filter_fixed(DCACoreDecoder *s, int x96_synth)
+{
+    int n, ch, spkr, nsamples, x96_nchannels = 0;
+    const int32_t *filter_coeff;
+    int32_t *ptr;
+
+    // Externally set x96_synth flag implies that X96 synthesis should be
+    // enabled, yet actual X96 subband data should be discarded. This is a
+    // special case for lossless residual decoder that ignores X96 data if
+    // present.
+    if (!x96_synth && (s->ext_audio_mask & (DCA_CSS_X96 | DCA_EXSS_X96))) {
+        x96_nchannels = s->x96_nchannels;
+        x96_synth = 1;
+    }
+    if (x96_synth < 0)
+        x96_synth = 0;
+
+    s->output_rate = s->sample_rate << x96_synth;
+    s->npcmsamples = nsamples = (s->npcmblocks * DCA_PCMBLOCK_SAMPLES) << x96_synth;
+
+    // Reallocate PCM output buffer
+    av_fast_malloc(&s->output_buffer, &s->output_size,
+                   nsamples * av_popcount(s->ch_mask) * sizeof(int32_t));
+    if (!s->output_buffer)
+        return AVERROR(ENOMEM);
+
+    ptr = (int32_t *)s->output_buffer;
+    for (spkr = 0; spkr < DCA_SPEAKER_COUNT; spkr++) {
+        if (s->ch_mask & (1U << spkr)) {
+            s->output_samples[spkr] = ptr;
+            ptr += nsamples;
+        } else {
+            s->output_samples[spkr] = NULL;
+        }
+    }
+
+    // Handle change of filtering mode
+    set_filter_mode(s, x96_synth | DCA_FILTER_MODE_FIXED);
+
+    // Select filter
+    if (x96_synth)
+        filter_coeff = ff_dca_fir_64bands_fixed;
+    else if (s->filter_perfect)
+        filter_coeff = ff_dca_fir_32bands_perfect_fixed;
+    else
+        filter_coeff = ff_dca_fir_32bands_nonperfect_fixed;
+
+    // Filter primary channels
+    for (ch = 0; ch < s->nchannels; ch++) {
+        // Map this primary channel to speaker
+        spkr = map_prm_ch_to_spkr(s, ch);
+        if (spkr < 0)
+            return AVERROR(EINVAL);
+
+        // Filter bank reconstruction
+        s->dcadsp->sub_qmf_fixed[x96_synth](
+            &s->synth,
+            &s->dcadct,
+            s->output_samples[spkr],
+            s->subband_samples[ch],
+            ch < x96_nchannels ? s->x96_subband_samples[ch] : NULL,
+            s->dcadsp_data[ch].u.fix.hist1,
+            &s->dcadsp_data[ch].offset,
+            s->dcadsp_data[ch].u.fix.hist2,
+            filter_coeff,
+            s->npcmblocks);
+    }
+
+    // Filter LFE channel
+    if (s->lfe_present) {
+        int32_t *samples = s->output_samples[DCA_SPEAKER_LFE1];
+        int nlfesamples = s->npcmblocks >> 1;
+
+        // Check LFF
+        if (s->lfe_present == DCA_LFE_FLAG_128) {
+            av_log(s->avctx, AV_LOG_ERROR, "Fixed point mode doesn't support LFF=1\n");
+            return AVERROR(EINVAL);
+        }
+
+        // Offset intermediate buffer for X96
+        if (x96_synth)
+            samples += nsamples / 2;
+
+        // Interpolate LFE channel
+        s->dcadsp->lfe_fir_fixed(samples, s->lfe_samples + DCA_LFE_HISTORY,
+                                 ff_dca_lfe_fir_64_fixed, s->npcmblocks);
+
+        if (x96_synth) {
+            // Filter 96 kHz oversampled LFE PCM to attenuate high frequency
+            // (47.6 - 48.0 kHz) components of interpolation image
+            s->dcadsp->lfe_x96_fixed(s->output_samples[DCA_SPEAKER_LFE1],
+                                     samples, &s->output_history_lfe_fixed,
+                                     nsamples / 2);
+
+        }
+
+        // Update LFE history
+        for (n = DCA_LFE_HISTORY - 1; n >= 0; n--)
+            s->lfe_samples[n] = s->lfe_samples[nlfesamples + n];
+    }
+
+    return 0;
+}
+
+static int filter_frame_fixed(DCACoreDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    int i, n, ch, ret, spkr, nsamples;
+
+    // Don't filter twice when falling back from XLL
+    if (!(dca->packet & DCA_PACKET_XLL) && (ret = ff_dca_core_filter_fixed(s, 0)) < 0)
+        return ret;
+
+    avctx->sample_rate = s->output_rate;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+    avctx->bits_per_raw_sample = 24;
+
+    frame->nb_samples = nsamples = s->npcmsamples;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Undo embedded XCH downmix
+    if (s->es_format && (s->ext_audio_mask & DCA_CSS_XCH)
+        && s->audio_mode >= DCA_AMODE_2F2R) {
+        s->dcadsp->dmix_sub_xch(s->output_samples[DCA_SPEAKER_Ls],
+                                s->output_samples[DCA_SPEAKER_Rs],
+                                s->output_samples[DCA_SPEAKER_Cs],
+                                nsamples);
+
+    }
+
+    // Undo embedded XXCH downmix
+    if ((s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH))
+        && s->xxch_dmix_embedded) {
+        int scale_inv   = s->xxch_dmix_scale_inv;
+        int *coeff_ptr  = s->xxch_dmix_coeff;
+        int xch_base    = ff_dca_channels[s->audio_mode];
+        av_assert1(s->nchannels - xch_base <= DCA_XXCH_CHANNELS_MAX);
+
+        // Undo embedded core downmix pre-scaling
+        for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+            if (s->xxch_core_mask & (1U << spkr)) {
+                s->dcadsp->dmix_scale_inv(s->output_samples[spkr],
+                                          scale_inv, nsamples);
+            }
+        }
+
+        // Undo downmix
+        for (ch = xch_base; ch < s->nchannels; ch++) {
+            int src_spkr = map_prm_ch_to_spkr(s, ch);
+            if (src_spkr < 0)
+                return AVERROR(EINVAL);
+            for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+                if (s->xxch_dmix_mask[ch - xch_base] & (1U << spkr)) {
+                    int coeff = mul16(*coeff_ptr++, scale_inv);
+                    if (coeff) {
+                        s->dcadsp->dmix_sub(s->output_samples[spkr    ],
+                                            s->output_samples[src_spkr],
+                                            coeff, nsamples);
+                    }
+                }
+            }
+        }
+    }
+
+    if (!(s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH | DCA_EXSS_XXCH))) {
+        // Front sum/difference decoding
+        if ((s->sumdiff_front && s->audio_mode > DCA_AMODE_MONO)
+            || s->audio_mode == DCA_AMODE_STEREO_SUMDIFF) {
+            s->fixed_dsp->butterflies_fixed(s->output_samples[DCA_SPEAKER_L],
+                                            s->output_samples[DCA_SPEAKER_R],
+                                            nsamples);
+        }
+
+        // Surround sum/difference decoding
+        if (s->sumdiff_surround && s->audio_mode >= DCA_AMODE_2F2R) {
+            s->fixed_dsp->butterflies_fixed(s->output_samples[DCA_SPEAKER_Ls],
+                                            s->output_samples[DCA_SPEAKER_Rs],
+                                            nsamples);
+        }
+    }
+
+    // Downmix primary channel set to stereo
+    if (s->request_mask != s->ch_mask) {
+        ff_dca_downmix_to_stereo_fixed(s->dcadsp,
+                                       s->output_samples,
+                                       s->prim_dmix_coeff,
+                                       nsamples, s->ch_mask);
+    }
+
+    for (i = 0; i < avctx->channels; i++) {
+        int32_t *samples = s->output_samples[s->ch_remap[i]];
+        int32_t *plane = (int32_t *)frame->extended_data[i];
+        for (n = 0; n < nsamples; n++)
+            plane[n] = clip23(samples[n]) * (1 << 8);
+    }
+
+    return 0;
+}
+
+static int filter_frame_float(DCACoreDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    int x96_nchannels = 0, x96_synth = 0;
+    int i, n, ch, ret, spkr, nsamples, nchannels;
+    float *output_samples[DCA_SPEAKER_COUNT] = { NULL }, *ptr;
+    const float *filter_coeff;
+
+    if (s->ext_audio_mask & (DCA_CSS_X96 | DCA_EXSS_X96)) {
+        x96_nchannels = s->x96_nchannels;
+        x96_synth = 1;
+    }
+
+    avctx->sample_rate = s->sample_rate << x96_synth;
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->bits_per_raw_sample = 0;
+
+    frame->nb_samples = nsamples = (s->npcmblocks * DCA_PCMBLOCK_SAMPLES) << x96_synth;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Build reverse speaker to channel mapping
+    for (i = 0; i < avctx->channels; i++)
+        output_samples[s->ch_remap[i]] = (float *)frame->extended_data[i];
+
+    // Allocate space for extra channels
+    nchannels = av_popcount(s->ch_mask) - avctx->channels;
+    if (nchannels > 0) {
+        av_fast_malloc(&s->output_buffer, &s->output_size,
+                       nsamples * nchannels * sizeof(float));
+        if (!s->output_buffer)
+            return AVERROR(ENOMEM);
+
+        ptr = (float *)s->output_buffer;
+        for (spkr = 0; spkr < DCA_SPEAKER_COUNT; spkr++) {
+            if (!(s->ch_mask & (1U << spkr)))
+                continue;
+            if (output_samples[spkr])
+                continue;
+            output_samples[spkr] = ptr;
+            ptr += nsamples;
+        }
+    }
+
+    // Handle change of filtering mode
+    set_filter_mode(s, x96_synth);
+
+    // Select filter
+    if (x96_synth)
+        filter_coeff = ff_dca_fir_64bands;
+    else if (s->filter_perfect)
+        filter_coeff = ff_dca_fir_32bands_perfect;
+    else
+        filter_coeff = ff_dca_fir_32bands_nonperfect;
+
+    // Filter primary channels
+    for (ch = 0; ch < s->nchannels; ch++) {
+        // Map this primary channel to speaker
+        spkr = map_prm_ch_to_spkr(s, ch);
+        if (spkr < 0)
+            return AVERROR(EINVAL);
+
+        // Filter bank reconstruction
+        s->dcadsp->sub_qmf_float[x96_synth](
+            &s->synth,
+            &s->imdct[x96_synth],
+            output_samples[spkr],
+            s->subband_samples[ch],
+            ch < x96_nchannels ? s->x96_subband_samples[ch] : NULL,
+            s->dcadsp_data[ch].u.flt.hist1,
+            &s->dcadsp_data[ch].offset,
+            s->dcadsp_data[ch].u.flt.hist2,
+            filter_coeff,
+            s->npcmblocks,
+            1.0f / (1 << (17 - x96_synth)));
+    }
+
+    // Filter LFE channel
+    if (s->lfe_present) {
+        int dec_select = (s->lfe_present == DCA_LFE_FLAG_128);
+        float *samples = output_samples[DCA_SPEAKER_LFE1];
+        int nlfesamples = s->npcmblocks >> (dec_select + 1);
+
+        // Offset intermediate buffer for X96
+        if (x96_synth)
+            samples += nsamples / 2;
+
+        // Select filter
+        if (dec_select)
+            filter_coeff = ff_dca_lfe_fir_128;
+        else
+            filter_coeff = ff_dca_lfe_fir_64;
+
+        // Interpolate LFE channel
+        s->dcadsp->lfe_fir_float[dec_select](
+            samples, s->lfe_samples + DCA_LFE_HISTORY,
+            filter_coeff, s->npcmblocks);
+
+        if (x96_synth) {
+            // Filter 96 kHz oversampled LFE PCM to attenuate high frequency
+            // (47.6 - 48.0 kHz) components of interpolation image
+            s->dcadsp->lfe_x96_float(output_samples[DCA_SPEAKER_LFE1],
+                                     samples, &s->output_history_lfe_float,
+                                     nsamples / 2);
+        }
+
+        // Update LFE history
+        for (n = DCA_LFE_HISTORY - 1; n >= 0; n--)
+            s->lfe_samples[n] = s->lfe_samples[nlfesamples + n];
+    }
+
+    // Undo embedded XCH downmix
+    if (s->es_format && (s->ext_audio_mask & DCA_CSS_XCH)
+        && s->audio_mode >= DCA_AMODE_2F2R) {
+        s->float_dsp->vector_fmac_scalar(output_samples[DCA_SPEAKER_Ls],
+                                         output_samples[DCA_SPEAKER_Cs],
+                                         -M_SQRT1_2, nsamples);
+        s->float_dsp->vector_fmac_scalar(output_samples[DCA_SPEAKER_Rs],
+                                         output_samples[DCA_SPEAKER_Cs],
+                                         -M_SQRT1_2, nsamples);
+    }
+
+    // Undo embedded XXCH downmix
+    if ((s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH))
+        && s->xxch_dmix_embedded) {
+        float scale_inv = s->xxch_dmix_scale_inv * (1.0f / (1 << 16));
+        int *coeff_ptr  = s->xxch_dmix_coeff;
+        int xch_base    = ff_dca_channels[s->audio_mode];
+        av_assert1(s->nchannels - xch_base <= DCA_XXCH_CHANNELS_MAX);
+
+        // Undo downmix
+        for (ch = xch_base; ch < s->nchannels; ch++) {
+            int src_spkr = map_prm_ch_to_spkr(s, ch);
+            if (src_spkr < 0)
+                return AVERROR(EINVAL);
+            for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+                if (s->xxch_dmix_mask[ch - xch_base] & (1U << spkr)) {
+                    int coeff = *coeff_ptr++;
+                    if (coeff) {
+                        s->float_dsp->vector_fmac_scalar(output_samples[    spkr],
+                                                         output_samples[src_spkr],
+                                                         coeff * (-1.0f / (1 << 15)),
+                                                         nsamples);
+                    }
+                }
+            }
+        }
+
+        // Undo embedded core downmix pre-scaling
+        for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+            if (s->xxch_core_mask & (1U << spkr)) {
+                s->float_dsp->vector_fmul_scalar(output_samples[spkr],
+                                                 output_samples[spkr],
+                                                 scale_inv, nsamples);
+            }
+        }
+    }
+
+    if (!(s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH | DCA_EXSS_XXCH))) {
+        // Front sum/difference decoding
+        if ((s->sumdiff_front && s->audio_mode > DCA_AMODE_MONO)
+            || s->audio_mode == DCA_AMODE_STEREO_SUMDIFF) {
+            s->float_dsp->butterflies_float(output_samples[DCA_SPEAKER_L],
+                                            output_samples[DCA_SPEAKER_R],
+                                            nsamples);
+        }
+
+        // Surround sum/difference decoding
+        if (s->sumdiff_surround && s->audio_mode >= DCA_AMODE_2F2R) {
+            s->float_dsp->butterflies_float(output_samples[DCA_SPEAKER_Ls],
+                                            output_samples[DCA_SPEAKER_Rs],
+                                            nsamples);
+        }
+    }
+
+    // Downmix primary channel set to stereo
+    if (s->request_mask != s->ch_mask) {
+        ff_dca_downmix_to_stereo_float(s->float_dsp, output_samples,
+                                       s->prim_dmix_coeff,
+                                       nsamples, s->ch_mask);
+    }
+
+    return 0;
+}
+
+int ff_dca_core_filter_frame(DCACoreDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    DCAExssAsset *asset = &dca->exss.assets[0];
+    enum AVMatrixEncoding matrix_encoding;
+    int ret;
+
+    // Handle downmixing to stereo request
+    if (dca->request_channel_layout == DCA_SPEAKER_LAYOUT_STEREO
+        && s->audio_mode > DCA_AMODE_MONO && s->prim_dmix_embedded
+        && (s->prim_dmix_type == DCA_DMIX_TYPE_LoRo ||
+            s->prim_dmix_type == DCA_DMIX_TYPE_LtRt))
+        s->request_mask = DCA_SPEAKER_LAYOUT_STEREO;
+    else
+        s->request_mask = s->ch_mask;
+    if (!ff_dca_set_channel_layout(avctx, s->ch_remap, s->request_mask))
+        return AVERROR(EINVAL);
+
+    // Force fixed point mode when falling back from XLL
+    if ((avctx->flags & AV_CODEC_FLAG_BITEXACT) || ((dca->packet & DCA_PACKET_EXSS)
+                                                    && (asset->extension_mask & DCA_EXSS_XLL)))
+        ret = filter_frame_fixed(s, frame);
+    else
+        ret = filter_frame_float(s, frame);
+    if (ret < 0)
+        return ret;
+
+    // Set profile, bit rate, etc
+    if (s->ext_audio_mask & DCA_EXSS_MASK)
+        avctx->profile = FF_PROFILE_DTS_HD_HRA;
+    else if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH))
+        avctx->profile = FF_PROFILE_DTS_ES;
+    else if (s->ext_audio_mask & DCA_CSS_X96)
+        avctx->profile = FF_PROFILE_DTS_96_24;
+    else
+        avctx->profile = FF_PROFILE_DTS;
+
+    if (s->bit_rate > 3 && !(s->ext_audio_mask & DCA_EXSS_MASK))
+        avctx->bit_rate = s->bit_rate;
+    else
+        avctx->bit_rate = 0;
+
+    if (s->audio_mode == DCA_AMODE_STEREO_TOTAL || (s->request_mask != s->ch_mask &&
+                                                    s->prim_dmix_type == DCA_DMIX_TYPE_LtRt))
+        matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+    else
+        matrix_encoding = AV_MATRIX_ENCODING_NONE;
+    if ((ret = ff_side_data_update_matrix_encoding(frame, matrix_encoding)) < 0)
+        return ret;
+
+    return 0;
+}
+
+av_cold void ff_dca_core_flush(DCACoreDecoder *s)
+{
+    if (s->subband_buffer) {
+        erase_adpcm_history(s);
+        memset(s->lfe_samples, 0, DCA_LFE_HISTORY * sizeof(int32_t));
+    }
+
+    if (s->x96_subband_buffer)
+        erase_x96_adpcm_history(s);
+
+    erase_dsp_history(s);
+}
+
+av_cold int ff_dca_core_init(DCACoreDecoder *s)
+{
+    if (!(s->float_dsp = avpriv_float_dsp_alloc(0)))
+        return -1;
+    if (!(s->fixed_dsp = avpriv_alloc_fixed_dsp(0)))
+        return -1;
+
+    ff_dcadct_init(&s->dcadct);
+    if (ff_mdct_init(&s->imdct[0], 6, 1, 1.0) < 0)
+        return -1;
+    if (ff_mdct_init(&s->imdct[1], 7, 1, 1.0) < 0)
+        return -1;
+    ff_synth_filter_init(&s->synth);
+
+    s->x96_rand = 1;
+    return 0;
+}
+
+av_cold void ff_dca_core_close(DCACoreDecoder *s)
+{
+    av_freep(&s->float_dsp);
+    av_freep(&s->fixed_dsp);
+
+    ff_mdct_end(&s->imdct[0]);
+    ff_mdct_end(&s->imdct[1]);
+
+    av_freep(&s->subband_buffer);
+    s->subband_size = 0;
+
+    av_freep(&s->x96_subband_buffer);
+    s->x96_subband_size = 0;
+
+    av_freep(&s->output_buffer);
+    s->output_size = 0;
+}
diff --git a/libavcodec/dca_core.h b/libavcodec/dca_core.h
new file mode 100644
index 0000000..10128d1
--- /dev/null
+++ b/libavcodec/dca_core.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_CORE_H
+#define AVCODEC_DCA_CORE_H
+
+#include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dca_exss.h"
+#include "dcadsp.h"
+#include "dcadct.h"
+#include "dcamath.h"
+#include "dcahuff.h"
+#include "fft.h"
+#include "synth_filter.h"
+
+#define DCA_CHANNELS            7
+#define DCA_SUBBANDS            32
+#define DCA_SUBBANDS_X96        64
+#define DCA_SUBFRAMES           16
+#define DCA_SUBBAND_SAMPLES     8
+#define DCA_PCMBLOCK_SAMPLES    32
+#define DCA_LFE_HISTORY         8
+#define DCA_ABITS_MAX           26
+
+#define DCA_CORE_CHANNELS_MAX       6
+#define DCA_DMIX_CHANNELS_MAX       4
+#define DCA_XXCH_CHANNELS_MAX       2
+#define DCA_EXSS_CHANNELS_MAX       8
+#define DCA_EXSS_CHSETS_MAX         4
+
+#define DCA_FILTER_MODE_X96     0x01
+#define DCA_FILTER_MODE_FIXED   0x02
+
+enum DCACoreAudioMode {
+    DCA_AMODE_MONO,             // Mode 0: A (mono)
+    DCA_AMODE_MONO_DUAL,        // Mode 1: A + B (dual mono)
+    DCA_AMODE_STEREO,           // Mode 2: L + R (stereo)
+    DCA_AMODE_STEREO_SUMDIFF,   // Mode 3: (L+R) + (L-R) (sum-diff)
+    DCA_AMODE_STEREO_TOTAL,     // Mode 4: LT + RT (left and right total)
+    DCA_AMODE_3F,               // Mode 5: C + L + R
+    DCA_AMODE_2F1R,             // Mode 6: L + R + S
+    DCA_AMODE_3F1R,             // Mode 7: C + L + R + S
+    DCA_AMODE_2F2R,             // Mode 8: L + R + SL + SR
+    DCA_AMODE_3F2R,             // Mode 9: C + L + R + SL + SR
+
+    DCA_AMODE_COUNT
+};
+
+enum DCACoreExtAudioType {
+    DCA_EXT_AUDIO_XCH   = 0,
+    DCA_EXT_AUDIO_X96   = 2,
+    DCA_EXT_AUDIO_XXCH  = 6
+};
+
+enum DCACoreLFEFlag {
+    DCA_LFE_FLAG_NONE,
+    DCA_LFE_FLAG_128,
+    DCA_LFE_FLAG_64,
+    DCA_LFE_FLAG_INVALID
+};
+
+typedef struct DCADSPData {
+    union {
+        struct {
+            DECLARE_ALIGNED(32, float, hist1)[1024];
+            DECLARE_ALIGNED(32, float, hist2)[64];
+        } flt;
+        struct {
+            DECLARE_ALIGNED(32, int32_t, hist1)[1024];
+            DECLARE_ALIGNED(32, int32_t, hist2)[64];
+        } fix;
+    } u;
+    int offset;
+} DCADSPData;
+
+typedef struct DCACoreDecoder {
+    AVCodecContext  *avctx;
+    GetBitContext   gb;
+    GetBitContext   gb_in;
+
+    // Bit stream header
+    int     crc_present;        ///< CRC present flag
+    int     npcmblocks;         ///< Number of PCM sample blocks
+    int     frame_size;         ///< Primary frame byte size
+    int     audio_mode;         ///< Audio channel arrangement
+    int     sample_rate;        ///< Core audio sampling frequency
+    int     bit_rate;           ///< Transmission bit rate
+    int     drc_present;        ///< Embedded dynamic range flag
+    int     ts_present;         ///< Embedded time stamp flag
+    int     aux_present;        ///< Auxiliary data flag
+    int     ext_audio_type;     ///< Extension audio descriptor flag
+    int     ext_audio_present;  ///< Extended coding flag
+    int     sync_ssf;           ///< Audio sync word insertion flag
+    int     lfe_present;        ///< Low frequency effects flag
+    int     predictor_history;  ///< Predictor history flag switch
+    int     filter_perfect;     ///< Multirate interpolator switch
+    int     source_pcm_res;     ///< Source PCM resolution
+    int     es_format;          ///< Extended surround (ES) mastering flag
+    int     sumdiff_front;      ///< Front sum/difference flag
+    int     sumdiff_surround;   ///< Surround sum/difference flag
+
+    // Primary audio coding header
+    int         nsubframes;     ///< Number of subframes
+    int         nchannels;      ///< Number of primary audio channels (incl. extension channels)
+    int         ch_mask;        ///< Speaker layout mask (incl. LFE and extension channels)
+    int8_t      nsubbands[DCA_CHANNELS];                ///< Subband activity count
+    int8_t      subband_vq_start[DCA_CHANNELS];         ///< High frequency VQ start subband
+    int8_t      joint_intensity_index[DCA_CHANNELS];    ///< Joint intensity coding index
+    int8_t      transition_mode_sel[DCA_CHANNELS];      ///< Transient mode code book
+    int8_t      scale_factor_sel[DCA_CHANNELS];         ///< Scale factor code book
+    int8_t      bit_allocation_sel[DCA_CHANNELS];       ///< Bit allocation quantizer select
+    int8_t      quant_index_sel[DCA_CHANNELS][DCA_CODE_BOOKS];  ///< Quantization index codebook select
+    int32_t     scale_factor_adj[DCA_CHANNELS][DCA_CODE_BOOKS]; ///< Scale factor adjustment
+
+    // Primary audio coding side information
+    int8_t      nsubsubframes[DCA_SUBFRAMES];   ///< Subsubframe count for each subframe
+    int8_t      prediction_mode[DCA_CHANNELS][DCA_SUBBANDS_X96];            ///< Prediction mode
+    int16_t     prediction_vq_index[DCA_CHANNELS][DCA_SUBBANDS_X96];        ///< Prediction coefficients VQ address
+    int8_t      bit_allocation[DCA_CHANNELS][DCA_SUBBANDS_X96];             ///< Bit allocation index
+    int8_t      transition_mode[DCA_SUBFRAMES][DCA_CHANNELS][DCA_SUBBANDS]; ///< Transition mode
+    int32_t     scale_factors[DCA_CHANNELS][DCA_SUBBANDS][2];               ///< Scale factors (2x for transients and X96)
+    int8_t      joint_scale_sel[DCA_CHANNELS];                              ///< Joint subband codebook select
+    int32_t     joint_scale_factors[DCA_CHANNELS][DCA_SUBBANDS_X96];        ///< Scale factors for joint subband coding
+
+    // Auxiliary data
+    int     prim_dmix_embedded; ///< Auxiliary dynamic downmix flag
+    int     prim_dmix_type;     ///< Auxiliary primary channel downmix type
+    int     prim_dmix_coeff[DCA_DMIX_CHANNELS_MAX * DCA_CORE_CHANNELS_MAX]; ///< Dynamic downmix code coefficients
+
+    // Core extensions
+    int     ext_audio_mask;     ///< Bit mask of fully decoded core extensions
+
+    // XCH extension data
+    int     xch_pos;    ///< Bit position of XCH frame in core substream
+
+    // XXCH extension data
+    int     xxch_crc_present;       ///< CRC presence flag for XXCH channel set header
+    int     xxch_mask_nbits;        ///< Number of bits for loudspeaker mask
+    int     xxch_core_mask;         ///< Core loudspeaker activity mask
+    int     xxch_spkr_mask;         ///< Loudspeaker layout mask
+    int     xxch_dmix_embedded;     ///< Downmix already performed by encoder
+    int     xxch_dmix_scale_inv;    ///< Downmix scale factor
+    int     xxch_dmix_mask[DCA_XXCH_CHANNELS_MAX];  ///< Downmix channel mapping mask
+    int     xxch_dmix_coeff[DCA_XXCH_CHANNELS_MAX * DCA_CORE_CHANNELS_MAX];     ///< Downmix coefficients
+    int     xxch_pos;   ///< Bit position of XXCH frame in core substream
+
+    // X96 extension data
+    int     x96_rev_no;         ///< X96 revision number
+    int     x96_crc_present;    ///< CRC presence flag for X96 channel set header
+    int     x96_nchannels;      ///< Number of primary channels in X96 extension
+    int     x96_high_res;       ///< X96 high resolution flag
+    int     x96_subband_start;  ///< First encoded subband in X96 extension
+    int     x96_rand;           ///< Random seed for generating samples for unallocated X96 subbands
+    int     x96_pos;            ///< Bit position of X96 frame in core substream
+
+    // Sample buffers
+    unsigned int    x96_subband_size;
+    int32_t         *x96_subband_buffer;    ///< X96 subband sample buffer base
+    int32_t         *x96_subband_samples[DCA_CHANNELS][DCA_SUBBANDS_X96];   ///< X96 subband samples
+
+    unsigned int    subband_size;
+    int32_t         *subband_buffer;    ///< Subband sample buffer base
+    int32_t         *subband_samples[DCA_CHANNELS][DCA_SUBBANDS];   ///< Subband samples
+    int32_t         *lfe_samples;    ///< Decimated LFE samples
+
+    // DSP contexts
+    DCADSPData              dcadsp_data[DCA_CHANNELS];    ///< FIR history buffers
+    DCADSPContext           *dcadsp;
+    DCADCTContext           dcadct;
+    FFTContext              imdct[2];
+    SynthFilterContext      synth;
+    AVFloatDSPContext       *float_dsp;
+    AVFixedDSPContext       *fixed_dsp;
+
+    // PCM output data
+    unsigned int    output_size;
+    void            *output_buffer;                         ///< PCM output buffer base
+    int32_t         *output_samples[DCA_SPEAKER_COUNT];     ///< PCM output for fixed point mode
+    int32_t         output_history_lfe_fixed;               ///< LFE PCM history for X96 filter
+    float           output_history_lfe_float;               ///< LFE PCM history for X96 filter
+
+    int     ch_remap[DCA_SPEAKER_COUNT];   ///< Channel to speaker map
+    int     request_mask;   ///< Requested channel layout (for stereo downmix)
+
+    int     npcmsamples;    ///< Number of PCM samples per channel
+    int     output_rate;    ///< Output sample rate (1x or 2x header rate)
+
+    int     filter_mode;    ///< Previous filtering mode for detecting changes
+} DCACoreDecoder;
+
+static inline int ff_dca_core_map_spkr(DCACoreDecoder *core, int spkr)
+{
+    if (core->ch_mask & (1U << spkr))
+        return spkr;
+    if (spkr == DCA_SPEAKER_Lss && (core->ch_mask & DCA_SPEAKER_MASK_Ls))
+        return DCA_SPEAKER_Ls;
+    if (spkr == DCA_SPEAKER_Rss && (core->ch_mask & DCA_SPEAKER_MASK_Rs))
+        return DCA_SPEAKER_Rs;
+    return -1;
+}
+
+static inline void ff_dca_core_dequantize(int32_t *output, const int32_t *input,
+                                          int32_t step_size, int32_t scale, int residual, int len)
+{
+    // Account for quantizer step size
+    int64_t step_scale = (int64_t)step_size * scale;
+    int n, shift = 0;
+
+    // Limit scale factor resolution to 22 bits
+    if (step_scale > (1 << 23)) {
+        shift = av_log2(step_scale >> 23) + 1;
+        step_scale >>= shift;
+    }
+
+    // Scale the samples
+    if (residual) {
+        for (n = 0; n < len; n++)
+            output[n] += clip23(norm__(input[n] * step_scale, 22 - shift));
+    } else {
+        for (n = 0; n < len; n++)
+            output[n]  = clip23(norm__(input[n] * step_scale, 22 - shift));
+    }
+}
+
+int ff_dca_core_parse(DCACoreDecoder *s, uint8_t *data, int size);
+int ff_dca_core_parse_exss(DCACoreDecoder *s, uint8_t *data, DCAExssAsset *asset);
+int ff_dca_core_filter_fixed(DCACoreDecoder *s, int x96_synth);
+int ff_dca_core_filter_frame(DCACoreDecoder *s, AVFrame *frame);
+av_cold void ff_dca_core_flush(DCACoreDecoder *s);
+av_cold int ff_dca_core_init(DCACoreDecoder *s);
+av_cold void ff_dca_core_close(DCACoreDecoder *s);
+
+#endif
diff --git a/libavcodec/dca_core_bsf.c b/libavcodec/dca_core_bsf.c
new file mode 100644
index 0000000..8565796
--- /dev/null
+++ b/libavcodec/dca_core_bsf.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "bytestream.h"
+#include "dca_syncwords.h"
+#include "libavutil/mem.h"
+
+static int dca_core_filter(AVBSFContext *ctx, AVPacket *pkt)
+{
+    GetByteContext gb;
+    uint32_t syncword;
+    int core_size = 0, ret;
+
+    ret = ff_bsf_get_packet_ref(ctx, pkt);
+    if (ret < 0)
+        return ret;
+
+    bytestream2_init(&gb, pkt->data, pkt->size);
+    syncword = bytestream2_get_be32(&gb);
+    bytestream2_skip(&gb, 1);
+
+    switch (syncword) {
+    case DCA_SYNCWORD_CORE_BE:
+        core_size = ((bytestream2_get_be24(&gb) >> 4) & 0x3fff) + 1;
+        break;
+    }
+
+    if (core_size > 0 && core_size <= pkt->size) {
+        pkt->size = core_size;
+    }
+
+    return 0;
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_DTS, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_dca_core_bsf = {
+    .name      = "dca_core",
+    .filter    = dca_core_filter,
+    .codec_ids = codec_ids,
+};
diff --git a/libavcodec/dca_exss.c b/libavcodec/dca_exss.c
index 2895e20..e873088 100644
--- a/libavcodec/dca_exss.c
+++ b/libavcodec/dca_exss.c
@@ -1,368 +1,514 @@
 /*
- * DCA ExSS extension
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/common.h"
-#include "libavutil/log.h"
-
-#include "dca.h"
-#include "dca_syncwords.h"
-#include "get_bits.h"
-
-/* extensions that reside in core substream */
-#define DCA_CORE_EXTS (DCA_EXT_XCH | DCA_EXT_XXCH | DCA_EXT_X96)
-
-/* these are unconfirmed but should be mostly correct */
-enum DCAExSSSpeakerMask {
-    DCA_EXSS_FRONT_CENTER          = 0x0001,
-    DCA_EXSS_FRONT_LEFT_RIGHT      = 0x0002,
-    DCA_EXSS_SIDE_REAR_LEFT_RIGHT  = 0x0004,
-    DCA_EXSS_LFE                   = 0x0008,
-    DCA_EXSS_REAR_CENTER           = 0x0010,
-    DCA_EXSS_FRONT_HIGH_LEFT_RIGHT = 0x0020,
-    DCA_EXSS_REAR_LEFT_RIGHT       = 0x0040,
-    DCA_EXSS_FRONT_HIGH_CENTER     = 0x0080,
-    DCA_EXSS_OVERHEAD              = 0x0100,
-    DCA_EXSS_CENTER_LEFT_RIGHT     = 0x0200,
-    DCA_EXSS_WIDE_LEFT_RIGHT       = 0x0400,
-    DCA_EXSS_SIDE_LEFT_RIGHT       = 0x0800,
-    DCA_EXSS_LFE2                  = 0x1000,
-    DCA_EXSS_SIDE_HIGH_LEFT_RIGHT  = 0x2000,
-    DCA_EXSS_REAR_HIGH_CENTER      = 0x4000,
-    DCA_EXSS_REAR_HIGH_LEFT_RIGHT  = 0x8000,
-};
-
-/**
- * Return the number of channels in an ExSS speaker mask (HD)
- */
-static int dca_exss_mask2count(int mask)
+#include "dcadec.h"
+
+static void parse_xll_parameters(DCAExssParser *s, DCAExssAsset *asset)
 {
-    /* count bits that mean speaker pairs twice */
-    return av_popcount(mask) +
-           av_popcount(mask & (DCA_EXSS_CENTER_LEFT_RIGHT      |
-                               DCA_EXSS_FRONT_LEFT_RIGHT       |
-                               DCA_EXSS_FRONT_HIGH_LEFT_RIGHT  |
-                               DCA_EXSS_WIDE_LEFT_RIGHT        |
-                               DCA_EXSS_SIDE_LEFT_RIGHT        |
-                               DCA_EXSS_SIDE_HIGH_LEFT_RIGHT   |
-                               DCA_EXSS_SIDE_REAR_LEFT_RIGHT   |
-                               DCA_EXSS_REAR_LEFT_RIGHT        |
-                               DCA_EXSS_REAR_HIGH_LEFT_RIGHT));
+    // Size of XLL data in extension substream
+    asset->xll_size = get_bits(&s->gb, s->exss_size_nbits) + 1;
+
+    // XLL sync word present flag
+    if (asset->xll_sync_present = get_bits1(&s->gb)) {
+        int xll_delay_nbits;
+
+        // Peak bit rate smoothing buffer size
+        skip_bits(&s->gb, 4);
+
+        // Number of bits for XLL decoding delay
+        xll_delay_nbits = get_bits(&s->gb, 5) + 1;
+
+        // Initial XLL decoding delay in frames
+        asset->xll_delay_nframes = get_bits_long(&s->gb, xll_delay_nbits);
+
+        // Number of bytes offset to XLL sync
+        asset->xll_sync_offset = get_bits(&s->gb, s->exss_size_nbits);
+    } else {
+        asset->xll_delay_nframes = 0;
+        asset->xll_sync_offset = 0;
+    }
 }
 
-/**
- * Skip mixing coefficients of a single mix out configuration (HD)
- */
-static void dca_exss_skip_mix_coeffs(GetBitContext *gb, int channels, int out_ch)
+static void parse_lbr_parameters(DCAExssParser *s, DCAExssAsset *asset)
 {
-    int i;
+    // Size of LBR component in extension substream
+    asset->lbr_size = get_bits(&s->gb, 14) + 1;
 
-    for (i = 0; i < channels; i++) {
-        int mix_map_mask = get_bits(gb, out_ch);
-        int num_coeffs = av_popcount(mix_map_mask);
-        skip_bits_long(gb, num_coeffs * 6);
-    }
+    // LBR sync word present flag
+    if (get_bits1(&s->gb))
+        // LBR sync distance
+        skip_bits(&s->gb, 2);
 }
 
-/**
- * Parse extension substream asset header (HD)
- */
-static int dca_exss_parse_asset_header(DCAContext *s)
+static int parse_descriptor(DCAExssParser *s, DCAExssAsset *asset)
 {
-    int header_pos = get_bits_count(&s->gb);
-    int header_size;
-    int channels = 0;
-    int embedded_stereo = 0;
-    int embedded_6ch    = 0;
-    int drc_code_present;
-    int extensions_mask = 0;
-    int i, j;
-
-    if (get_bits_left(&s->gb) < 16)
-        return AVERROR_INVALIDDATA;
+    int i, j, drc_present, descr_size, descr_pos = get_bits_count(&s->gb);
+
+    // Size of audio asset descriptor in bytes
+    descr_size = get_bits(&s->gb, 9) + 1;
 
-    /* We will parse just enough to get to the extensions bitmask with which
-     * we can set the profile value. */
+    // Audio asset identifier
+    asset->asset_index = get_bits(&s->gb, 3);
 
-    header_size = get_bits(&s->gb, 9) + 1;
-    skip_bits(&s->gb, 3); // asset index
+    //
+    // Per stream static metadata
+    //
 
-    if (s->static_fields) {
+    if (s->static_fields_present) {
+        // Asset type descriptor presence
         if (get_bits1(&s->gb))
-            skip_bits(&s->gb, 4); // asset type descriptor
+            // Asset type descriptor
+            skip_bits(&s->gb, 4);
+
+        // Language descriptor presence
         if (get_bits1(&s->gb))
-            skip_bits_long(&s->gb, 24); // language descriptor
+            // Language descriptor
+            skip_bits(&s->gb, 24);
 
+        // Additional textual information presence
         if (get_bits1(&s->gb)) {
-            /* How can one fit 1024 bytes of text here if the maximum value
-             * for the asset header size field above was 512 bytes? */
-            int text_length = get_bits(&s->gb, 10) + 1;
-            if (get_bits_left(&s->gb) < text_length * 8)
+            // Byte size of additional text info
+            int text_size = get_bits(&s->gb, 10) + 1;
+
+            // Sanity check available size
+            if (get_bits_left(&s->gb) < text_size * 8)
                 return AVERROR_INVALIDDATA;
-            skip_bits_long(&s->gb, text_length * 8); // info text
+
+            // Additional textual information string
+            skip_bits_long(&s->gb, text_size * 8);
         }
 
-        skip_bits(&s->gb, 5); // bit resolution - 1
-        skip_bits(&s->gb, 4); // max sample rate code
-        channels = get_bits(&s->gb, 8) + 1;
+        // PCM bit resolution
+        asset->pcm_bit_res = get_bits(&s->gb, 5) + 1;
 
-        s->one2one_map_chtospkr = get_bits1(&s->gb);
-        if (s->one2one_map_chtospkr) {
-            int spkr_remap_sets;
-            int spkr_mask_size = 16;
-            int num_spkrs[7];
+        // Maximum sample rate
+        asset->max_sample_rate = ff_dca_sampling_freqs[get_bits(&s->gb, 4)];
 
-            if (channels > 2)
-                embedded_stereo = get_bits1(&s->gb);
-            if (channels > 6)
-                embedded_6ch = get_bits1(&s->gb);
+        // Total number of channels
+        asset->nchannels_total = get_bits(&s->gb, 8) + 1;
 
-            if (get_bits1(&s->gb)) {
-                spkr_mask_size = (get_bits(&s->gb, 2) + 1) << 2;
-                skip_bits(&s->gb, spkr_mask_size); // spkr activity mask
-            }
+        // One to one map channel to speakers
+        if (asset->one_to_one_map_ch_to_spkr = get_bits1(&s->gb)) {
+            int spkr_mask_nbits = 0;
+            int spkr_remap_nsets;
+            int nspeakers[8];
 
-            spkr_remap_sets = get_bits(&s->gb, 3);
+            // Embedded stereo flag
+            asset->embedded_stereo = asset->nchannels_total > 2 && get_bits1(&s->gb);
 
-            for (i = 0; i < spkr_remap_sets; i++) {
-                /* std layout mask for each remap set */
-                num_spkrs[i] = dca_exss_mask2count(get_bits(&s->gb, spkr_mask_size));
+            // Embedded 6 channels flag
+            asset->embedded_6ch = asset->nchannels_total > 6 && get_bits1(&s->gb);
+
+            // Speaker mask enabled flag
+            if (asset->spkr_mask_enabled = get_bits1(&s->gb)) {
+                // Number of bits for speaker activity mask
+                spkr_mask_nbits = (get_bits(&s->gb, 2) + 1) << 2;
+
+                // Loudspeaker activity mask
+                asset->spkr_mask = get_bits(&s->gb, spkr_mask_nbits);
+            }
+
+            // Number of speaker remapping sets
+            if ((spkr_remap_nsets = get_bits(&s->gb, 3)) && !spkr_mask_nbits) {
+                if (s->avctx)
+                    av_log(s->avctx, AV_LOG_ERROR, "Speaker mask disabled yet there are remapping sets\n");
+                return AVERROR_INVALIDDATA;
             }
 
-            for (i = 0; i < spkr_remap_sets; i++) {
-                int num_dec_ch_remaps = get_bits(&s->gb, 5) + 1;
-                if (get_bits_left(&s->gb) < 0)
-                    return AVERROR_INVALIDDATA;
+            // Standard loudspeaker layout mask
+            for (i = 0; i < spkr_remap_nsets; i++)
+                nspeakers[i] = ff_dca_count_chs_for_mask(get_bits(&s->gb, spkr_mask_nbits));
+
+            for (i = 0; i < spkr_remap_nsets; i++) {
+                // Number of channels to be decoded for speaker remapping
+                int nch_for_remaps = get_bits(&s->gb, 5) + 1;
 
-                for (j = 0; j < num_spkrs[i]; j++) {
-                    int remap_dec_ch_mask = get_bits_long(&s->gb, num_dec_ch_remaps);
-                    int num_dec_ch = av_popcount(remap_dec_ch_mask);
-                    skip_bits_long(&s->gb, num_dec_ch * 5); // remap codes
+                for (j = 0; j < nspeakers[i]; j++) {
+                    // Decoded channels to output speaker mapping mask
+                    int remap_ch_mask = get_bits_long(&s->gb, nch_for_remaps);
+
+                    // Loudspeaker remapping codes
+                    skip_bits_long(&s->gb, av_popcount(remap_ch_mask) * 5);
                 }
             }
         } else {
-            skip_bits(&s->gb, 3); // representation type
+            asset->embedded_stereo = 0;
+            asset->embedded_6ch = 0;
+            asset->spkr_mask_enabled = 0;
+            asset->spkr_mask = 0;
+
+            // Representation type
+            asset->representation_type = get_bits(&s->gb, 3);
         }
     }
 
-    drc_code_present = get_bits1(&s->gb);
-    if (drc_code_present)
-        get_bits(&s->gb, 8); // drc code
+    //
+    // DRC, DNC and mixing metadata
+    //
+
+    // Dynamic range coefficient presence flag
+    drc_present = get_bits1(&s->gb);
 
+    // Code for dynamic range coefficient
+    if (drc_present)
+        skip_bits(&s->gb, 8);
+
+    // Dialog normalization presence flag
     if (get_bits1(&s->gb))
-        skip_bits(&s->gb, 5); // dialog normalization code
+        // Dialog normalization code
+        skip_bits(&s->gb, 5);
 
-    if (drc_code_present && embedded_stereo)
-        get_bits(&s->gb, 8); // drc stereo code
+    // DRC for stereo downmix
+    if (drc_present && asset->embedded_stereo)
+        skip_bits(&s->gb, 8);
 
-    if (s->mix_metadata && get_bits1(&s->gb)) {
-        skip_bits(&s->gb, 1); // external mix
-        skip_bits(&s->gb, 6); // post mix gain code
+    // Mixing metadata presence flag
+    if (s->mix_metadata_enabled && get_bits1(&s->gb)) {
+        int nchannels_dmix;
 
-        if (get_bits(&s->gb, 2) != 3) // mixer drc code
-            skip_bits(&s->gb, 3); // drc limit
-        else
-            skip_bits(&s->gb, 8); // custom drc code
+        // External mixing flag
+        skip_bits1(&s->gb);
+
+        // Post mixing / replacement gain adjustment
+        skip_bits(&s->gb, 6);
 
-        if (get_bits1(&s->gb)) // channel specific scaling
-            for (i = 0; i < s->num_mix_configs; i++)
-                skip_bits_long(&s->gb, s->mix_config_num_ch[i] * 6); // scale codes
+        // DRC prior to mixing
+        if (get_bits(&s->gb, 2) == 3)
+            // Custom code for mixing DRC
+            skip_bits(&s->gb, 8);
         else
-            skip_bits_long(&s->gb, s->num_mix_configs * 6); // scale codes
+            // Limit for mixing DRC
+            skip_bits(&s->gb, 3);
 
-        for (i = 0; i < s->num_mix_configs; i++) {
-            if (get_bits_left(&s->gb) < 0)
+        // Scaling type for channels of main audio
+        // Scaling parameters of main audio
+        if (get_bits1(&s->gb))
+            for (i = 0; i < s->nmixoutconfigs; i++)
+                skip_bits_long(&s->gb, 6 * s->nmixoutchs[i]);
+        else
+            skip_bits_long(&s->gb, 6 * s->nmixoutconfigs);
+
+        nchannels_dmix = asset->nchannels_total;
+        if (asset->embedded_6ch)
+            nchannels_dmix += 6;
+        if (asset->embedded_stereo)
+            nchannels_dmix += 2;
+
+        for (i = 0; i < s->nmixoutconfigs; i++) {
+            if (!s->nmixoutchs[i]) {
+                if (s->avctx)
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid speaker layout mask for mixing configuration\n");
                 return AVERROR_INVALIDDATA;
-            dca_exss_skip_mix_coeffs(&s->gb, channels, s->mix_config_num_ch[i]);
-            if (embedded_6ch)
-                dca_exss_skip_mix_coeffs(&s->gb, 6, s->mix_config_num_ch[i]);
-            if (embedded_stereo)
-                dca_exss_skip_mix_coeffs(&s->gb, 2, s->mix_config_num_ch[i]);
+            }
+            for (j = 0; j < nchannels_dmix; j++) {
+                // Mix output mask
+                int mix_map_mask = get_bits(&s->gb, s->nmixoutchs[i]);
+
+                // Mixing coefficients
+                skip_bits_long(&s->gb, av_popcount(mix_map_mask) * 6);
+            }
         }
     }
 
-    switch (get_bits(&s->gb, 2)) {
-    case 0:
-        extensions_mask = get_bits(&s->gb, 12);
+    //
+    // Decoder navigation data
+    //
+
+    // Coding mode for the asset
+    asset->coding_mode = get_bits(&s->gb, 2);
+
+    // Coding components used in asset
+    switch (asset->coding_mode) {
+    case 0: // Coding mode that may contain multiple coding components
+        asset->extension_mask = get_bits(&s->gb, 12);
+
+        if (asset->extension_mask & DCA_EXSS_CORE) {
+            // Size of core component in extension substream
+            asset->core_size = get_bits(&s->gb, 14) + 1;
+            // Core sync word present flag
+            if (get_bits1(&s->gb))
+                // Core sync distance
+                skip_bits(&s->gb, 2);
+        }
+
+        if (asset->extension_mask & DCA_EXSS_XBR)
+            // Size of XBR extension in extension substream
+            asset->xbr_size = get_bits(&s->gb, 14) + 1;
+
+        if (asset->extension_mask & DCA_EXSS_XXCH)
+            // Size of XXCH extension in extension substream
+            asset->xxch_size = get_bits(&s->gb, 14) + 1;
+
+        if (asset->extension_mask & DCA_EXSS_X96)
+            // Size of X96 extension in extension substream
+            asset->x96_size = get_bits(&s->gb, 12) + 1;
+
+        if (asset->extension_mask & DCA_EXSS_LBR)
+            parse_lbr_parameters(s, asset);
+
+        if (asset->extension_mask & DCA_EXSS_XLL)
+            parse_xll_parameters(s, asset);
+
+        if (asset->extension_mask & DCA_EXSS_RSV1)
+            skip_bits(&s->gb, 16);
+
+        if (asset->extension_mask & DCA_EXSS_RSV2)
+            skip_bits(&s->gb, 16);
         break;
-    case 1:
-        extensions_mask = DCA_EXT_EXSS_XLL;
+
+    case 1: // Loss-less coding mode without CBR component
+        asset->extension_mask = DCA_EXSS_XLL;
+        parse_xll_parameters(s, asset);
         break;
-    case 2:
-        extensions_mask = DCA_EXT_EXSS_LBR;
+
+    case 2: // Low bit rate mode
+        asset->extension_mask = DCA_EXSS_LBR;
+        parse_lbr_parameters(s, asset);
         break;
-    case 3:
-        extensions_mask = 0; /* aux coding */
+
+    case 3: // Auxiliary coding mode
+        asset->extension_mask = 0;
+
+        // Size of auxiliary coded data
+        skip_bits(&s->gb, 14);
+
+        // Auxiliary codec identification
+        skip_bits(&s->gb, 8);
+
+        // Aux sync word present flag
+        if (get_bits1(&s->gb))
+            // Aux sync distance
+            skip_bits(&s->gb, 3);
         break;
     }
 
-    /* not parsed further, we were only interested in the extensions mask */
-
-    if (get_bits_left(&s->gb) < 0)
+    if (asset->extension_mask & DCA_EXSS_XLL)
+        // DTS-HD stream ID
+        asset->hd_stream_id = get_bits(&s->gb, 3);
+
+    // One to one mixing flag
+    // Per channel main audio scaling flag
+    // Main audio scaling codes
+    // Decode asset in secondary decoder flag
+    // Revision 2 DRC metadata
+    // Reserved
+    // Zero pad
+    if (ff_dca_seek_bits(&s->gb, descr_pos + descr_size * 8)) {
+        if (s->avctx)
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of EXSS asset descriptor\n");
         return AVERROR_INVALIDDATA;
+    }
 
-    if (get_bits_count(&s->gb) - header_pos > header_size * 8) {
-        av_log(s->avctx, AV_LOG_WARNING, "Asset header size mismatch.\n");
-        return AVERROR_INVALIDDATA;
+    return 0;
+}
+
+static int set_exss_offsets(DCAExssAsset *asset)
+{
+    int offs = asset->asset_offset;
+    int size = asset->asset_size;
+
+    if (asset->extension_mask & DCA_EXSS_CORE) {
+        asset->core_offset = offs;
+        if (asset->core_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->core_size;
+        size -= asset->core_size;
+    }
+
+    if (asset->extension_mask & DCA_EXSS_XBR) {
+        asset->xbr_offset = offs;
+        if (asset->xbr_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->xbr_size;
+        size -= asset->xbr_size;
+    }
+
+    if (asset->extension_mask & DCA_EXSS_XXCH) {
+        asset->xxch_offset = offs;
+        if (asset->xxch_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->xxch_size;
+        size -= asset->xxch_size;
+    }
+
+    if (asset->extension_mask & DCA_EXSS_X96) {
+        asset->x96_offset = offs;
+        if (asset->x96_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->x96_size;
+        size -= asset->x96_size;
     }
-    skip_bits_long(&s->gb, header_pos + header_size * 8 - get_bits_count(&s->gb));
 
-    if (extensions_mask & DCA_EXT_EXSS_XLL)
-        s->profile = FF_PROFILE_DTS_HD_MA;
-    else if (extensions_mask & (DCA_EXT_EXSS_XBR | DCA_EXT_EXSS_X96 |
-                                DCA_EXT_EXSS_XXCH))
-        s->profile = FF_PROFILE_DTS_HD_HRA;
+    if (asset->extension_mask & DCA_EXSS_LBR) {
+        asset->lbr_offset = offs;
+        if (asset->lbr_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->lbr_size;
+        size -= asset->lbr_size;
+    }
 
-    if (!(extensions_mask & DCA_EXT_CORE))
-        av_log(s->avctx, AV_LOG_WARNING, "DTS core detection mismatch.\n");
-    if ((extensions_mask & DCA_CORE_EXTS) != s->core_ext_mask)
-        av_log(s->avctx, AV_LOG_WARNING,
-               "DTS extensions detection mismatch (%d, %d)\n",
-               extensions_mask & DCA_CORE_EXTS, s->core_ext_mask);
+    if (asset->extension_mask & DCA_EXSS_XLL) {
+        asset->xll_offset = offs;
+        if (asset->xll_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->xll_size;
+        size -= asset->xll_size;
+    }
 
     return 0;
 }
 
-/**
- * Parse extension substream header (HD)
- */
-void ff_dca_exss_parse_header(DCAContext *s)
+int ff_dca_exss_parse(DCAExssParser *s, const uint8_t *data, int size)
 {
-    int asset_size[8];
-    int ss_index;
-    int blownup;
-    int num_audiop = 1;
-    int num_assets = 1;
-    int active_ss_mask[8];
-    int i, j;
-    int start_pos;
-    int hdrsize;
-    uint32_t mkr;
-
-    if (get_bits_left(&s->gb) < 52)
-        return;
-
-    start_pos = get_bits_count(&s->gb) - 32;
-
-    skip_bits(&s->gb, 8); // user data
-    ss_index = get_bits(&s->gb, 2);
-
-    blownup = get_bits1(&s->gb);
-    hdrsize = get_bits(&s->gb,  8 + 4 * blownup) + 1; // header_size
-    skip_bits(&s->gb, 16 + 4 * blownup); // hd_size
-
-    s->static_fields = get_bits1(&s->gb);
-    if (s->static_fields) {
-        skip_bits(&s->gb, 2); // reference clock code
-        skip_bits(&s->gb, 3); // frame duration code
+    int i, ret, offset, wide_hdr, header_size;
 
-        if (get_bits1(&s->gb))
-            skip_bits_long(&s->gb, 36); // timestamp
+    if ((ret = init_get_bits8(&s->gb, data, size)) < 0)
+        return ret;
+
+    // Extension substream sync word
+    skip_bits_long(&s->gb, 32);
+
+    // User defined bits
+    skip_bits(&s->gb, 8);
 
-        /* a single stream can contain multiple audio assets that can be
-         * combined to form multiple audio presentations */
+    // Extension substream index
+    s->exss_index = get_bits(&s->gb, 2);
 
-        num_audiop = get_bits(&s->gb, 3) + 1;
-        if (num_audiop > 1) {
-            avpriv_request_sample(s->avctx,
-                                  "Multiple DTS-HD audio presentations");
-            /* ignore such streams for now */
-            return;
+    // Flag indicating short or long header size
+    wide_hdr = get_bits1(&s->gb);
+
+    // Extension substream header length
+    header_size = get_bits(&s->gb, 8 + 4 * wide_hdr) + 1;
+
+    // Check CRC
+    if (s->avctx && ff_dca_check_crc(s->avctx, &s->gb, 32 + 8, header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid EXSS header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->exss_size_nbits = 16 + 4 * wide_hdr;
+
+    // Number of bytes of extension substream
+    s->exss_size = get_bits(&s->gb, s->exss_size_nbits) + 1;
+    if (s->exss_size > size) {
+        if (s->avctx)
+            av_log(s->avctx, AV_LOG_ERROR, "Packet too short for EXSS frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Per stream static fields presence flag
+    if (s->static_fields_present = get_bits1(&s->gb)) {
+        int active_exss_mask[8];
+
+        // Reference clock code
+        skip_bits(&s->gb, 2);
+
+        // Extension substream frame duration
+        skip_bits(&s->gb, 3);
+
+        // Timecode presence flag
+        if (get_bits1(&s->gb))
+            // Timecode data
+            skip_bits_long(&s->gb, 36);
+
+        // Number of defined audio presentations
+        s->npresents = get_bits(&s->gb, 3) + 1;
+        if (s->npresents > 1) {
+            if (s->avctx)
+                avpriv_request_sample(s->avctx, "%d audio presentations", s->npresents);
+            return AVERROR_PATCHWELCOME;
         }
 
-        num_assets = get_bits(&s->gb, 3) + 1;
-        if (num_assets > 1) {
-            avpriv_request_sample(s->avctx, "Multiple DTS-HD audio assets");
-            /* ignore such streams for now */
-            return;
+        // Number of audio assets in extension substream
+        s->nassets = get_bits(&s->gb, 3) + 1;
+        if (s->nassets > 1) {
+            if (s->avctx)
+                avpriv_request_sample(s->avctx, "%d audio assets", s->nassets);
+            return AVERROR_PATCHWELCOME;
         }
 
-        for (i = 0; i < num_audiop; i++)
-            active_ss_mask[i] = get_bits(&s->gb, ss_index + 1);
+        // Active extension substream mask for audio presentation
+        for (i = 0; i < s->npresents; i++)
+            active_exss_mask[i] = get_bits(&s->gb, s->exss_index + 1);
 
-        for (i = 0; i < num_audiop; i++)
-            for (j = 0; j <= ss_index; j++)
-                if (active_ss_mask[i] & (1 << j))
-                    skip_bits(&s->gb, 8); // active asset mask
+        // Active audio asset mask
+        for (i = 0; i < s->npresents; i++)
+            skip_bits_long(&s->gb, av_popcount(active_exss_mask[i]) * 8);
 
-        s->mix_metadata = get_bits1(&s->gb);
-        if (s->mix_metadata) {
-            int mix_out_mask_size;
+        // Mixing metadata enable flag
+        if (s->mix_metadata_enabled = get_bits1(&s->gb)) {
+            int spkr_mask_nbits;
 
-            skip_bits(&s->gb, 2); // adjustment level
-            mix_out_mask_size  = (get_bits(&s->gb, 2) + 1) << 2;
-            s->num_mix_configs =  get_bits(&s->gb, 2) + 1;
+            // Mixing metadata adjustment level
+            skip_bits(&s->gb, 2);
 
-            for (i = 0; i < s->num_mix_configs; i++) {
-                int mix_out_mask        = get_bits(&s->gb, mix_out_mask_size);
-                s->mix_config_num_ch[i] = dca_exss_mask2count(mix_out_mask);
-            }
-        }
-    }
+            // Number of bits for mixer output speaker activity mask
+            spkr_mask_nbits = (get_bits(&s->gb, 2) + 1) << 2;
 
-    for (i = 0; i < num_assets; i++)
-        asset_size[i] = get_bits_long(&s->gb, 16 + 4 * blownup) + 1;
+            // Number of mixing configurations
+            s->nmixoutconfigs = get_bits(&s->gb, 2) + 1;
 
-    for (i = 0; i < num_assets; i++) {
-        if (dca_exss_parse_asset_header(s))
-            return;
+            // Speaker layout mask for mixer output channels
+            for (i = 0; i < s->nmixoutconfigs; i++)
+                s->nmixoutchs[i] = ff_dca_count_chs_for_mask(get_bits(&s->gb, spkr_mask_nbits));
+        }
+    } else {
+        s->npresents = 1;
+        s->nassets = 1;
     }
 
-    if (num_assets > 0) {
-        j = get_bits_count(&s->gb);
-        if (start_pos + hdrsize * 8 > j)
-            skip_bits_long(&s->gb, start_pos + hdrsize * 8 - j);
-
-        for (i = 0; i < num_assets; i++) {
-            int end_pos;
-            start_pos = get_bits_count(&s->gb);
-            end_pos   = start_pos + asset_size[i] * 8;
-            mkr       = get_bits_long(&s->gb, 32);
-
-            /* parse extensions that we know about */
-            switch (mkr) {
-            case DCA_SYNCWORD_XLL:
-                if (s->xll_disable) {
-                    av_log(s->avctx, AV_LOG_DEBUG,
-                           "DTS-XLL: ignoring XLL extension\n");
-                    break;
-                }
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "DTS-XLL: decoding XLL extension\n");
-                if (ff_dca_xll_decode_header(s)        == 0 &&
-                    ff_dca_xll_decode_navi(s, end_pos) == 0)
-                    s->exss_ext_mask |= DCA_EXT_EXSS_XLL;
-                break;
-            case DCA_SYNCWORD_XBR:
-            case DCA_SYNCWORD_XXCH:
-            default:
-                av_log(s->avctx, AV_LOG_VERBOSE,
-                       "DTS-ExSS: unknown marker = 0x%08"PRIx32"\n", mkr);
-            }
+    // Size of encoded asset data in bytes
+    offset = header_size;
+    for (i = 0; i < s->nassets; i++) {
+        s->assets[i].asset_offset = offset;
+        s->assets[i].asset_size = get_bits(&s->gb, s->exss_size_nbits) + 1;
+        offset += s->assets[i].asset_size;
+        if (offset > s->exss_size) {
+            if (s->avctx)
+                av_log(s->avctx, AV_LOG_ERROR, "EXSS asset out of bounds\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
-            /* skip to end of block */
-            j = get_bits_count(&s->gb);
-            if (j > end_pos)
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "DTS-ExSS: Processed asset too long.\n");
-            if (j < end_pos)
-                skip_bits_long(&s->gb, end_pos - j);
+    // Audio asset descriptor
+    for (i = 0; i < s->nassets; i++) {
+        if ((ret = parse_descriptor(s, &s->assets[i])) < 0)
+            return ret;
+        if ((ret = set_exss_offsets(&s->assets[i])) < 0) {
+            if (s->avctx)
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid extension size in EXSS asset descriptor\n");
+            return ret;
         }
     }
+
+    // Backward compatible core present
+    // Backward compatible core substream index
+    // Backward compatible core asset index
+    // Reserved
+    // Byte align
+    // CRC16 of extension substream header
+    if (ff_dca_seek_bits(&s->gb, header_size * 8)) {
+        if (s->avctx)
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of EXSS header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
 }
diff --git a/libavcodec/dca_exss.h b/libavcodec/dca_exss.h
new file mode 100644
index 0000000..208fae1
--- /dev/null
+++ b/libavcodec/dca_exss.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_EXSS_H
+#define AVCODEC_DCA_EXSS_H
+
+#include "libavutil/common.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+
+typedef struct DCAExssAsset {
+    int     asset_offset;   ///< Offset to asset data from start of substream
+    int     asset_size;     ///< Size of encoded asset data
+    int     asset_index;    ///< Audio asset identifier
+
+    int     pcm_bit_res;                ///< PCM bit resolution
+    int     max_sample_rate;            ///< Maximum sample rate
+    int     nchannels_total;            ///< Total number of channels
+    int     one_to_one_map_ch_to_spkr;  ///< One to one channel to speaker mapping flag
+    int     embedded_stereo;            ///< Embedded stereo flag
+    int     embedded_6ch;               ///< Embedded 6 channels flag
+    int     spkr_mask_enabled;          ///< Speaker mask enabled flag
+    int     spkr_mask;                  ///< Loudspeaker activity mask
+    int     representation_type;        ///< Representation type
+
+    int     coding_mode;        ///< Coding mode for the asset
+    int     extension_mask;     ///< Coding components used in asset
+
+    int     core_offset;    ///< Offset to core component from start of substream
+    int     core_size;      ///< Size of core component in extension substream
+
+    int     xbr_offset;     ///< Offset to XBR extension from start of substream
+    int     xbr_size;       ///< Size of XBR extension in extension substream
+
+    int     xxch_offset;    ///< Offset to XXCH extension from start of substream
+    int     xxch_size;      ///< Size of XXCH extension in extension substream
+
+    int     x96_offset;     ///< Offset to X96 extension from start of substream
+    int     x96_size;       ///< Size of X96 extension in extension substream
+
+    int     lbr_offset;     ///< Offset to LBR component from start of substream
+    int     lbr_size;       ///< Size of LBR component in extension substream
+
+    int     xll_offset;         ///< Offset to XLL data from start of substream
+    int     xll_size;           ///< Size of XLL data in extension substream
+    int     xll_sync_present;   ///< XLL sync word present flag
+    int     xll_delay_nframes;  ///< Initial XLL decoding delay in frames
+    int     xll_sync_offset;    ///< Number of bytes offset to XLL sync
+
+    int     hd_stream_id;   ///< DTS-HD stream ID
+} DCAExssAsset;
+
+typedef struct DCAExssParser {
+    AVCodecContext  *avctx;
+    GetBitContext   gb;
+
+    int     exss_index;         ///< Extension substream index
+    int     exss_size_nbits;    ///< Number of bits for extension substream size
+    int     exss_size;          ///< Number of bytes of extension substream
+
+    int     static_fields_present;  ///< Per stream static fields presence flag
+    int     npresents;  ///< Number of defined audio presentations
+    int     nassets;    ///< Number of audio assets in extension substream
+
+    int     mix_metadata_enabled;   ///< Mixing metadata enable flag
+    int     nmixoutconfigs;         ///< Number of mixing configurations
+    int     nmixoutchs[4];          ///< Speaker layout mask for mixer output channels
+
+    DCAExssAsset   assets[1];    ///< Audio asset descriptors
+} DCAExssParser;
+
+int ff_dca_exss_parse(DCAExssParser *s, const uint8_t *data, int size);
+
+#endif
diff --git a/libavcodec/dca_lbr.c b/libavcodec/dca_lbr.c
new file mode 100644
index 0000000..3b50a99
--- /dev/null
+++ b/libavcodec/dca_lbr.c
@@ -0,0 +1,1814 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_READER_LE
+
+#include "libavutil/channel_layout.h"
+
+#include "dcadec.h"
+#include "dcadata.h"
+#include "dcahuff.h"
+#include "dca_syncwords.h"
+#include "bytestream.h"
+
+#define AMP_MAX     56
+
+enum LBRFlags {
+    LBR_FLAG_24_BIT             = 0x01,
+    LBR_FLAG_LFE_PRESENT        = 0x02,
+    LBR_FLAG_BAND_LIMIT_2_3     = 0x04,
+    LBR_FLAG_BAND_LIMIT_1_2     = 0x08,
+    LBR_FLAG_BAND_LIMIT_1_3     = 0x0c,
+    LBR_FLAG_BAND_LIMIT_1_4     = 0x10,
+    LBR_FLAG_BAND_LIMIT_1_8     = 0x18,
+    LBR_FLAG_BAND_LIMIT_NONE    = 0x14,
+    LBR_FLAG_BAND_LIMIT_MASK    = 0x1c,
+    LBR_FLAG_DMIX_STEREO        = 0x20,
+    LBR_FLAG_DMIX_MULTI_CH      = 0x40
+};
+
+enum LBRChunkTypes {
+    LBR_CHUNK_NULL              = 0x00,
+    LBR_CHUNK_PAD               = 0x01,
+    LBR_CHUNK_FRAME             = 0x04,
+    LBR_CHUNK_FRAME_NO_CSUM     = 0x06,
+    LBR_CHUNK_LFE               = 0x0a,
+    LBR_CHUNK_ECS               = 0x0b,
+    LBR_CHUNK_RESERVED_1        = 0x0c,
+    LBR_CHUNK_RESERVED_2        = 0x0d,
+    LBR_CHUNK_SCF               = 0x0e,
+    LBR_CHUNK_TONAL             = 0x10,
+    LBR_CHUNK_TONAL_GRP_1       = 0x11,
+    LBR_CHUNK_TONAL_GRP_2       = 0x12,
+    LBR_CHUNK_TONAL_GRP_3       = 0x13,
+    LBR_CHUNK_TONAL_GRP_4       = 0x14,
+    LBR_CHUNK_TONAL_GRP_5       = 0x15,
+    LBR_CHUNK_TONAL_SCF         = 0x16,
+    LBR_CHUNK_TONAL_SCF_GRP_1   = 0x17,
+    LBR_CHUNK_TONAL_SCF_GRP_2   = 0x18,
+    LBR_CHUNK_TONAL_SCF_GRP_3   = 0x19,
+    LBR_CHUNK_TONAL_SCF_GRP_4   = 0x1a,
+    LBR_CHUNK_TONAL_SCF_GRP_5   = 0x1b,
+    LBR_CHUNK_RES_GRID_LR       = 0x30,
+    LBR_CHUNK_RES_GRID_LR_LAST  = 0x3f,
+    LBR_CHUNK_RES_GRID_HR       = 0x40,
+    LBR_CHUNK_RES_GRID_HR_LAST  = 0x4f,
+    LBR_CHUNK_RES_TS_1          = 0x50,
+    LBR_CHUNK_RES_TS_1_LAST     = 0x5f,
+    LBR_CHUNK_RES_TS_2          = 0x60,
+    LBR_CHUNK_RES_TS_2_LAST     = 0x6f,
+    LBR_CHUNK_EXTENSION         = 0x7f
+};
+
+typedef struct LBRChunk {
+    int id, len;
+    const uint8_t *data;
+} LBRChunk;
+
+static const int8_t channel_reorder_nolfe[7][5] = {
+    { 0, -1, -1, -1, -1 },  // C
+    { 0,  1, -1, -1, -1 },  // LR
+    { 0,  1,  2, -1, -1 },  // LR C
+    { 0,  1, -1, -1, -1 },  // LsRs
+    { 1,  2,  0, -1, -1 },  // LsRs C
+    { 0,  1,  2,  3, -1 },  // LR LsRs
+    { 0,  1,  3,  4,  2 },  // LR LsRs C
+};
+
+static const int8_t channel_reorder_lfe[7][5] = {
+    { 0, -1, -1, -1, -1 },  // C
+    { 0,  1, -1, -1, -1 },  // LR
+    { 0,  1,  2, -1, -1 },  // LR C
+    { 1,  2, -1, -1, -1 },  // LsRs
+    { 2,  3,  0, -1, -1 },  // LsRs C
+    { 0,  1,  3,  4, -1 },  // LR LsRs
+    { 0,  1,  4,  5,  2 },  // LR LsRs C
+};
+
+static const uint8_t lfe_index[7] = {
+    1, 2, 3, 0, 1, 2, 3
+};
+
+static const uint8_t channel_counts[7] = {
+    1, 2, 3, 2, 3, 4, 5
+};
+
+static const uint16_t channel_layouts[7] = {
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,
+    AV_CH_FRONT_CENTER | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,
+    AV_CH_LAYOUT_2_2,
+    AV_CH_LAYOUT_5POINT0
+};
+
+static float    cos_tab[256];
+static float    lpc_tab[16];
+
+static av_cold void init_tables(void)
+{
+    static int initialized;
+    int i;
+
+    if (initialized)
+        return;
+
+    for (i = 0; i < 256; i++)
+        cos_tab[i] = cos(M_PI * i / 128);
+
+    for (i = 0; i < 16; i++)
+        lpc_tab[i] = sin((i - 8) * (M_PI / ((i < 8) ? 17 : 15)));
+
+    initialized = 1;
+}
+
+static int parse_lfe_24(DCALbrDecoder *s)
+{
+    int step_max = FF_ARRAY_ELEMS(ff_dca_lfe_step_size_24) - 1;
+    int i, ps, si, code, step_i;
+    float step, value, delta;
+
+    ps = get_bits(&s->gb, 24);
+    si = ps >> 23;
+
+    value = (((ps & 0x7fffff) ^ -si) + si) * (1.0f / 0x7fffff);
+
+    step_i = get_bits(&s->gb, 8);
+    if (step_i > step_max) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE step size index\n");
+        return -1;
+    }
+
+    step = ff_dca_lfe_step_size_24[step_i];
+
+    for (i = 0; i < 64; i++) {
+        code = get_bits(&s->gb, 6);
+
+        delta = step * 0.03125f;
+        if (code & 16)
+            delta += step;
+        if (code & 8)
+            delta += step * 0.5f;
+        if (code & 4)
+            delta += step * 0.25f;
+        if (code & 2)
+            delta += step * 0.125f;
+        if (code & 1)
+            delta += step * 0.0625f;
+
+        if (code & 32) {
+            value -= delta;
+            if (value < -3.0f)
+                value = -3.0f;
+        } else {
+            value += delta;
+            if (value > 3.0f)
+                value = 3.0f;
+        }
+
+        step_i += ff_dca_lfe_delta_index_24[code & 31];
+        step_i = av_clip(step_i, 0, step_max);
+
+        step = ff_dca_lfe_step_size_24[step_i];
+        s->lfe_data[i] = value * s->lfe_scale;
+    }
+
+    return 0;
+}
+
+static int parse_lfe_16(DCALbrDecoder *s)
+{
+    int step_max = FF_ARRAY_ELEMS(ff_dca_lfe_step_size_16) - 1;
+    int i, ps, si, code, step_i;
+    float step, value, delta;
+
+    ps = get_bits(&s->gb, 16);
+    si = ps >> 15;
+
+    value = (((ps & 0x7fff) ^ -si) + si) * (1.0f / 0x7fff);
+
+    step_i = get_bits(&s->gb, 8);
+    if (step_i > step_max) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE step size index\n");
+        return -1;
+    }
+
+    step = ff_dca_lfe_step_size_16[step_i];
+
+    for (i = 0; i < 64; i++) {
+        code = get_bits(&s->gb, 4);
+
+        delta = step * 0.125f;
+        if (code & 4)
+            delta += step;
+        if (code & 2)
+            delta += step * 0.5f;
+        if (code & 1)
+            delta += step * 0.25f;
+
+        if (code & 8) {
+            value -= delta;
+            if (value < -3.0f)
+                value = -3.0f;
+        } else {
+            value += delta;
+            if (value > 3.0f)
+                value = 3.0f;
+        }
+
+        step_i += ff_dca_lfe_delta_index_16[code & 7];
+        step_i = av_clip(step_i, 0, step_max);
+
+        step = ff_dca_lfe_step_size_16[step_i];
+        s->lfe_data[i] = value * s->lfe_scale;
+    }
+
+    return 0;
+}
+
+static int parse_lfe_chunk(DCALbrDecoder *s, LBRChunk *chunk)
+{
+    if (!(s->flags & LBR_FLAG_LFE_PRESENT))
+        return 0;
+
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Determine bit depth from chunk size
+    if (chunk->len >= 52)
+        return parse_lfe_24(s);
+    if (chunk->len >= 35)
+        return parse_lfe_16(s);
+
+    av_log(s->avctx, AV_LOG_ERROR, "LFE chunk too short\n");
+    return -1;
+}
+
+static inline int parse_vlc(GetBitContext *s, VLC *vlc, int max_depth)
+{
+    int v = get_vlc2(s, vlc->table, vlc->bits, max_depth);
+    if (v > 0)
+        return v - 1;
+    // Rare value
+    return get_bits(s, get_bits(s, 3) + 1);
+}
+
+static int parse_tonal(DCALbrDecoder *s, int group)
+{
+    unsigned int amp[DCA_LBR_CHANNELS_TOTAL];
+    unsigned int phs[DCA_LBR_CHANNELS_TOTAL];
+    unsigned int diff, main_amp, shift;
+    int sf, sf_idx, ch, main_ch, freq;
+    int ch_nbits = av_ceil_log2(s->nchannels_total);
+
+    // Parse subframes for this group
+    for (sf = 0; sf < 1 << group; sf += diff ? 8 : 1) {
+        sf_idx = ((s->framenum << group) + sf) & 31;
+        s->tonal_bounds[group][sf_idx][0] = s->ntones;
+
+        // Parse tones for this subframe
+        for (freq = 1;; freq++) {
+            if (get_bits_left(&s->gb) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "Tonal group chunk too short\n");
+                return -1;
+            }
+
+            diff = parse_vlc(&s->gb, &ff_dca_vlc_tnl_grp[group], 2);
+            if (diff >= FF_ARRAY_ELEMS(ff_dca_fst_amp)) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tonal frequency diff\n");
+                return -1;
+            }
+
+            diff = get_bitsz(&s->gb, diff >> 2) + ff_dca_fst_amp[diff];
+            if (diff <= 1)
+                break;  // End of subframe
+
+            freq += diff - 2;
+            if (freq >> (5 - group) > s->nsubbands * 4 - 6) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid spectral line offset\n");
+                return -1;
+            }
+
+            // Main channel
+            main_ch = get_bitsz(&s->gb, ch_nbits);
+            main_amp = parse_vlc(&s->gb, &ff_dca_vlc_tnl_scf, 2)
+                + s->tonal_scf[ff_dca_freq_to_sb[freq >> (7 - group)]]
+                + s->limited_range - 2;
+            amp[main_ch] = main_amp < AMP_MAX ? main_amp : 0;
+            phs[main_ch] = get_bits(&s->gb, 3);
+
+            // Secondary channels
+            for (ch = 0; ch < s->nchannels_total; ch++) {
+                if (ch == main_ch)
+                    continue;
+                if (get_bits1(&s->gb)) {
+                    amp[ch] = amp[main_ch] - parse_vlc(&s->gb, &ff_dca_vlc_damp, 1);
+                    phs[ch] = phs[main_ch] - parse_vlc(&s->gb, &ff_dca_vlc_dph,  1);
+                } else {
+                    amp[ch] = 0;
+                    phs[ch] = 0;
+                }
+            }
+
+            if (amp[main_ch]) {
+                // Allocate new tone
+                DCALbrTone *t = &s->tones[s->ntones];
+                s->ntones = (s->ntones + 1) & (DCA_LBR_TONES - 1);
+
+                t->x_freq = freq >> (5 - group);
+                t->f_delt = (freq & ((1 << (5 - group)) - 1)) << group;
+                t->ph_rot = 256 - (t->x_freq & 1) * 128 - t->f_delt * 4;
+
+                shift = ff_dca_ph0_shift[(t->x_freq & 3) * 2 + (freq & 1)]
+                    - ((t->ph_rot << (5 - group)) - t->ph_rot);
+
+                for (ch = 0; ch < s->nchannels; ch++) {
+                    t->amp[ch] = amp[ch] < AMP_MAX ? amp[ch] : 0;
+                    t->phs[ch] = 128 - phs[ch] * 32 + shift;
+                }
+            }
+        }
+
+        s->tonal_bounds[group][sf_idx][1] = s->ntones;
+    }
+
+    return 0;
+}
+
+static int parse_tonal_chunk(DCALbrDecoder *s, LBRChunk *chunk)
+{
+    int sb, group;
+
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Scale factors
+    if (chunk->id == LBR_CHUNK_SCF || chunk->id == LBR_CHUNK_TONAL_SCF) {
+        if (get_bits_left(&s->gb) < 36) {
+            av_log(s->avctx, AV_LOG_ERROR, "Tonal scale factor chunk too short\n");
+            return -1;
+        }
+        for (sb = 0; sb < 6; sb++)
+            s->tonal_scf[sb] = get_bits(&s->gb, 6);
+    }
+
+    // Tonal groups
+    if (chunk->id == LBR_CHUNK_TONAL || chunk->id == LBR_CHUNK_TONAL_SCF)
+        for (group = 0; group < 5; group++)
+            if (parse_tonal(s, group) < 0)
+                return -1;
+
+    return 0;
+}
+
+static int parse_tonal_group(DCALbrDecoder *s, LBRChunk *chunk)
+{
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    return parse_tonal(s, chunk->id);
+}
+
+/**
+ * Check point to ensure that enough bits are left. Aborts decoding
+ * by skipping to the end of chunk otherwise.
+ */
+static int ensure_bits(GetBitContext *s, int n)
+{
+    int left = get_bits_left(s);
+    if (left < 0)
+        return -1;
+    if (left < n) {
+        skip_bits_long(s, left);
+        return 1;
+    }
+    return 0;
+}
+
+static int parse_scale_factors(DCALbrDecoder *s, uint8_t *scf)
+{
+    int i, sf, prev, next, dist;
+
+    // Truncated scale factors remain zero
+    if (ensure_bits(&s->gb, 20))
+        return 0;
+
+    // Initial scale factor
+    prev = parse_vlc(&s->gb, &ff_dca_vlc_fst_rsd_amp, 2);
+
+    for (sf = 0; sf < 7; sf += dist) {
+        scf[sf] = prev; // Store previous value
+
+        if (ensure_bits(&s->gb, 20))
+            return 0;
+
+        // Interpolation distance
+        dist = parse_vlc(&s->gb, &ff_dca_vlc_rsd_apprx, 1) + 1;
+        if (dist > 7 - sf) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid scale factor distance\n");
+            return -1;
+        }
+
+        if (ensure_bits(&s->gb, 20))
+            return 0;
+
+        // Final interpolation point
+        next = parse_vlc(&s->gb, &ff_dca_vlc_rsd_amp, 2);
+
+        if (next & 1)
+            next = prev + ((next + 1) >> 1);
+        else
+            next = prev - ( next      >> 1);
+
+        // Interpolate
+        switch (dist) {
+        case 2:
+            if (next > prev)
+                scf[sf + 1] = prev + ((next - prev) >> 1);
+            else
+                scf[sf + 1] = prev - ((prev - next) >> 1);
+            break;
+
+        case 4:
+            if (next > prev) {
+                scf[sf + 1] = prev + ( (next - prev)      >> 2);
+                scf[sf + 2] = prev + ( (next - prev)      >> 1);
+                scf[sf + 3] = prev + (((next - prev) * 3) >> 2);
+            } else {
+                scf[sf + 1] = prev - ( (prev - next)      >> 2);
+                scf[sf + 2] = prev - ( (prev - next)      >> 1);
+                scf[sf + 3] = prev - (((prev - next) * 3) >> 2);
+            }
+            break;
+
+        default:
+            for (i = 1; i < dist; i++)
+                scf[sf + i] = prev + (next - prev) * i / dist;
+            break;
+        }
+
+        prev = next;
+    }
+
+    scf[sf] = next; // Store final value
+
+    return 0;
+}
+
+static int parse_st_code(GetBitContext *s, int min_v)
+{
+    unsigned int v = parse_vlc(s, &ff_dca_vlc_st_grid, 2) + min_v;
+
+    if (v & 1)
+        v = 16 + (v >> 1);
+    else
+        v = 16 - (v >> 1);
+
+    if (v >= FF_ARRAY_ELEMS(ff_dca_st_coeff))
+        v = 16;
+    return v;
+}
+
+static int parse_grid_1_chunk(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2)
+{
+    int ch, sb, sf, nsubbands;
+
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Scale factors
+    nsubbands = ff_dca_scf_to_grid_1[s->nsubbands - 1] + 1;
+    for (sb = 2; sb < nsubbands; sb++) {
+        if (parse_scale_factors(s, s->grid_1_scf[ch1][sb]) < 0)
+            return -1;
+        if (ch1 != ch2 && ff_dca_grid_1_to_scf[sb] < s->min_mono_subband
+            && parse_scale_factors(s, s->grid_1_scf[ch2][sb]) < 0)
+            return -1;
+    }
+
+    if (get_bits_left(&s->gb) < 1)
+        return 0;   // Should not happen, but a sample exists that proves otherwise
+
+    // Average values for third grid
+    for (sb = 0; sb < s->nsubbands - 4; sb++) {
+        s->grid_3_avg[ch1][sb] = parse_vlc(&s->gb, &ff_dca_vlc_avg_g3, 2) - 16;
+        if (ch1 != ch2) {
+            if (sb + 4 < s->min_mono_subband)
+                s->grid_3_avg[ch2][sb] = parse_vlc(&s->gb, &ff_dca_vlc_avg_g3, 2) - 16;
+            else
+                s->grid_3_avg[ch2][sb] = s->grid_3_avg[ch1][sb];
+        }
+    }
+
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "First grid chunk too short\n");
+        return -1;
+    }
+
+    // Stereo image for partial mono mode
+    if (ch1 != ch2) {
+        int min_v[2];
+
+        if (ensure_bits(&s->gb, 8))
+            return 0;
+
+        min_v[0] = get_bits(&s->gb, 4);
+        min_v[1] = get_bits(&s->gb, 4);
+
+        nsubbands = (s->nsubbands - s->min_mono_subband + 3) / 4;
+        for (sb = 0; sb < nsubbands; sb++)
+            for (ch = ch1; ch <= ch2; ch++)
+                for (sf = 1; sf <= 4; sf++)
+                    s->part_stereo[ch][sb][sf] = parse_st_code(&s->gb, min_v[ch - ch1]);
+
+        if (get_bits_left(&s->gb) >= 0)
+            s->part_stereo_pres |= 1 << ch1;
+    }
+
+    // Low resolution spatial information is not decoded
+
+    return 0;
+}
+
+static int parse_grid_1_sec_ch(DCALbrDecoder *s, int ch2)
+{
+    int sb, nsubbands;
+
+    // Scale factors
+    nsubbands = ff_dca_scf_to_grid_1[s->nsubbands - 1] + 1;
+    for (sb = 2; sb < nsubbands; sb++) {
+        if (ff_dca_grid_1_to_scf[sb] >= s->min_mono_subband
+            && parse_scale_factors(s, s->grid_1_scf[ch2][sb]) < 0)
+            return -1;
+    }
+
+    // Average values for third grid
+    for (sb = 0; sb < s->nsubbands - 4; sb++) {
+        if (sb + 4 >= s->min_mono_subband) {
+            if (ensure_bits(&s->gb, 20))
+                return 0;
+            s->grid_3_avg[ch2][sb] = parse_vlc(&s->gb, &ff_dca_vlc_avg_g3, 2) - 16;
+        }
+    }
+
+    return 0;
+}
+
+static void parse_grid_3(DCALbrDecoder *s, int ch1, int ch2, int sb, int flag)
+{
+    int i, ch;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        if ((ch != ch1 && sb + 4 >= s->min_mono_subband) != flag)
+            continue;
+
+        if (s->grid_3_pres[ch] & (1U << sb))
+            continue;   // Already parsed
+
+        for (i = 0; i < 8; i++) {
+            if (ensure_bits(&s->gb, 20))
+                return;
+            s->grid_3_scf[ch][sb][i] = parse_vlc(&s->gb, &ff_dca_vlc_grid_3, 2) - 16;
+        }
+
+        // Flag scale factors for this subband parsed
+        s->grid_3_pres[ch] |= 1U << sb;
+    }
+}
+
+static float lbr_rand(DCALbrDecoder *s, int sb)
+{
+    s->lbr_rand = 1103515245U * s->lbr_rand + 12345U;
+    return s->lbr_rand * s->sb_scf[sb];
+}
+
+/**
+ * Parse time samples for one subband, filling truncated samples with randomness
+ */
+static void parse_ch(DCALbrDecoder *s, int ch, int sb, int quant_level, int flag)
+{
+    float *samples = s->time_samples[ch][sb];
+    int i, j, code, nblocks, coding_method;
+
+    if (ensure_bits(&s->gb, 20))
+        return; // Too few bits left
+
+    coding_method = get_bits1(&s->gb);
+
+    switch (quant_level) {
+    case 1:
+        nblocks = FFMIN(get_bits_left(&s->gb) / 8, DCA_LBR_TIME_SAMPLES / 8);
+        for (i = 0; i < nblocks; i++, samples += 8) {
+            code = get_bits(&s->gb, 8);
+            for (j = 0; j < 8; j++)
+                samples[j] = ff_dca_rsd_level_2a[(code >> j) & 1];
+        }
+        i = nblocks * 8;
+        break;
+
+    case 2:
+        if (coding_method) {
+            for (i = 0; i < DCA_LBR_TIME_SAMPLES && get_bits_left(&s->gb) >= 2; i++) {
+                if (get_bits1(&s->gb))
+                    samples[i] = ff_dca_rsd_level_2b[get_bits1(&s->gb)];
+                else
+                    samples[i] = 0;
+            }
+        } else {
+            nblocks = FFMIN(get_bits_left(&s->gb) / 8, (DCA_LBR_TIME_SAMPLES + 4) / 5);
+            for (i = 0; i < nblocks; i++, samples += 5) {
+                code = ff_dca_rsd_pack_5_in_8[get_bits(&s->gb, 8)];
+                for (j = 0; j < 5; j++)
+                    samples[j] = ff_dca_rsd_level_3[(code >> j * 2) & 3];
+            }
+            i = nblocks * 5;
+        }
+        break;
+
+    case 3:
+        nblocks = FFMIN(get_bits_left(&s->gb) / 7, (DCA_LBR_TIME_SAMPLES + 2) / 3);
+        for (i = 0; i < nblocks; i++, samples += 3) {
+            code = get_bits(&s->gb, 7);
+            for (j = 0; j < 3; j++)
+                samples[j] = ff_dca_rsd_level_5[ff_dca_rsd_pack_3_in_7[code][j]];
+        }
+        i = nblocks * 3;
+        break;
+
+    case 4:
+        for (i = 0; i < DCA_LBR_TIME_SAMPLES && get_bits_left(&s->gb) >= 6; i++)
+            samples[i] = ff_dca_rsd_level_8[get_vlc2(&s->gb, ff_dca_vlc_rsd.table, 6, 1)];
+        break;
+
+    case 5:
+        nblocks = FFMIN(get_bits_left(&s->gb) / 4, DCA_LBR_TIME_SAMPLES);
+        for (i = 0; i < nblocks; i++)
+            samples[i] = ff_dca_rsd_level_16[get_bits(&s->gb, 4)];
+        break;
+
+    default:
+        av_assert0(0);
+    }
+
+    if (flag && get_bits_left(&s->gb) < 20)
+        return; // Skip incomplete mono subband
+
+    for (; i < DCA_LBR_TIME_SAMPLES; i++)
+        s->time_samples[ch][sb][i] = lbr_rand(s, sb);
+
+    s->ch_pres[ch] |= 1U << sb;
+}
+
+static int parse_ts(DCALbrDecoder *s, int ch1, int ch2,
+                    int start_sb, int end_sb, int flag)
+{
+    int sb, sb_g3, sb_reorder, quant_level;
+
+    for (sb = start_sb; sb < end_sb; sb++) {
+        // Subband number before reordering
+        if (sb < 6) {
+            sb_reorder = sb;
+        } else if (flag && sb < s->max_mono_subband) {
+            sb_reorder = s->sb_indices[sb];
+        } else {
+            if (ensure_bits(&s->gb, 28))
+                break;
+            sb_reorder = get_bits(&s->gb, s->limited_range + 3);
+            if (sb_reorder < 6)
+                sb_reorder = 6;
+            s->sb_indices[sb] = sb_reorder;
+        }
+        if (sb_reorder >= s->nsubbands)
+            return -1;
+
+        // Third grid scale factors
+        if (sb == 12) {
+            for (sb_g3 = 0; sb_g3 < s->g3_avg_only_start_sb - 4; sb_g3++)
+                parse_grid_3(s, ch1, ch2, sb_g3, flag);
+        } else if (sb < 12 && sb_reorder >= 4) {
+            parse_grid_3(s, ch1, ch2, sb_reorder - 4, flag);
+        }
+
+        // Secondary channel flags
+        if (ch1 != ch2) {
+            if (ensure_bits(&s->gb, 20))
+                break;
+            if (!flag || sb_reorder >= s->max_mono_subband)
+                s->sec_ch_sbms[ch1 / 2][sb_reorder] = get_bits(&s->gb, 8);
+            if (flag && sb_reorder >= s->min_mono_subband)
+                s->sec_ch_lrms[ch1 / 2][sb_reorder] = get_bits(&s->gb, 8);
+        }
+
+        quant_level = s->quant_levels[ch1 / 2][sb];
+        if (!quant_level)
+            return -1;
+
+        // Time samples for one or both channels
+        if (sb < s->max_mono_subband && sb_reorder >= s->min_mono_subband) {
+            if (!flag)
+                parse_ch(s, ch1, sb_reorder, quant_level, 0);
+            else if (ch1 != ch2)
+                parse_ch(s, ch2, sb_reorder, quant_level, 1);
+        } else {
+            parse_ch(s, ch1, sb_reorder, quant_level, 0);
+            if (ch1 != ch2)
+                parse_ch(s, ch2, sb_reorder, quant_level, 0);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Convert from reflection coefficients to direct form coefficients
+ */
+static void convert_lpc(float *coeff, const int *codes)
+{
+    int i, j;
+
+    for (i = 0; i < 8; i++) {
+        float rc = lpc_tab[codes[i]];
+        for (j = 0; j < (i + 1) / 2; j++) {
+            float tmp1 = coeff[    j    ];
+            float tmp2 = coeff[i - j - 1];
+            coeff[    j    ] = tmp1 + rc * tmp2;
+            coeff[i - j - 1] = tmp2 + rc * tmp1;
+        }
+        coeff[i] = rc;
+    }
+}
+
+static int parse_lpc(DCALbrDecoder *s, int ch1, int ch2, int start_sb, int end_sb)
+{
+    int f = s->framenum & 1;
+    int i, sb, ch, codes[16];
+
+    // First two subbands have two sets of coefficients, third subband has one
+    for (sb = start_sb; sb < end_sb; sb++) {
+        int ncodes = 8 * (1 + (sb < 2));
+        for (ch = ch1; ch <= ch2; ch++) {
+            if (ensure_bits(&s->gb, 4 * ncodes))
+                return 0;
+            for (i = 0; i < ncodes; i++)
+                codes[i] = get_bits(&s->gb, 4);
+            for (i = 0; i < ncodes / 8; i++)
+                convert_lpc(s->lpc_coeff[f][ch][sb][i], &codes[i * 8]);
+        }
+    }
+
+    return 0;
+}
+
+static int parse_high_res_grid(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2)
+{
+    int quant_levels[DCA_LBR_SUBBANDS];
+    int sb, ch, ol, st, max_sb, profile;
+
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Quantizer profile
+    profile = get_bits(&s->gb, 8);
+    // Overall level
+    ol = (profile >> 3) & 7;
+    // Steepness
+    st = profile >> 6;
+    // Max energy subband
+    max_sb = profile & 7;
+
+    // Calculate quantization levels
+    for (sb = 0; sb < s->nsubbands; sb++) {
+        int f = sb * s->limited_rate / s->nsubbands;
+        int a = 18000 / (12 * f / 1000 + 100 + 40 * st) + 20 * ol;
+        if (a <= 95)
+            quant_levels[sb] = 1;
+        else if (a <= 140)
+            quant_levels[sb] = 2;
+        else if (a <= 180)
+            quant_levels[sb] = 3;
+        else if (a <= 230)
+            quant_levels[sb] = 4;
+        else
+            quant_levels[sb] = 5;
+    }
+
+    // Reorder quantization levels for lower subbands
+    for (sb = 0; sb < 8; sb++)
+        s->quant_levels[ch1 / 2][sb] = quant_levels[ff_dca_sb_reorder[max_sb][sb]];
+    for (; sb < s->nsubbands; sb++)
+        s->quant_levels[ch1 / 2][sb] = quant_levels[sb];
+
+    // LPC for the first two subbands
+    if (parse_lpc(s, ch1, ch2, 0, 2) < 0)
+        return -1;
+
+    // Time-samples for the first two subbands of main channel
+    if (parse_ts(s, ch1, ch2, 0, 2, 0) < 0)
+        return -1;
+
+    // First two bands of the first grid
+    for (sb = 0; sb < 2; sb++)
+        for (ch = ch1; ch <= ch2; ch++)
+            if (parse_scale_factors(s, s->grid_1_scf[ch][sb]) < 0)
+                return -1;
+
+    return 0;
+}
+
+static int parse_grid_2(DCALbrDecoder *s, int ch1, int ch2,
+                        int start_sb, int end_sb, int flag)
+{
+    int i, j, sb, ch, nsubbands;
+
+    nsubbands = ff_dca_scf_to_grid_2[s->nsubbands - 1] + 1;
+    if (end_sb > nsubbands)
+        end_sb = nsubbands;
+
+    for (sb = start_sb; sb < end_sb; sb++) {
+        for (ch = ch1; ch <= ch2; ch++) {
+            uint8_t *g2_scf = s->grid_2_scf[ch][sb];
+
+            if ((ch != ch1 && ff_dca_grid_2_to_scf[sb] >= s->min_mono_subband) != flag) {
+                if (!flag)
+                    memcpy(g2_scf, s->grid_2_scf[ch1][sb], 64);
+                continue;
+            }
+
+            // Scale factors in groups of 8
+            for (i = 0; i < 8; i++, g2_scf += 8) {
+                if (get_bits_left(&s->gb) < 1) {
+                    memset(g2_scf, 0, 64 - i * 8);
+                    break;
+                }
+                // Bit indicating if whole group has zero values
+                if (get_bits1(&s->gb)) {
+                    for (j = 0; j < 8; j++) {
+                        if (ensure_bits(&s->gb, 20))
+                            break;
+                        g2_scf[j] = parse_vlc(&s->gb, &ff_dca_vlc_grid_2, 2);
+                    }
+                } else {
+                    memset(g2_scf, 0, 8);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int parse_ts1_chunk(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2)
+{
+    if (!chunk->len)
+        return 0;
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+    if (parse_lpc(s, ch1, ch2, 2, 3) < 0)
+        return -1;
+    if (parse_ts(s, ch1, ch2, 2, 4, 0) < 0)
+        return -1;
+    if (parse_grid_2(s, ch1, ch2, 0, 1, 0) < 0)
+        return -1;
+    if (parse_ts(s, ch1, ch2, 4, 6, 0) < 0)
+        return -1;
+    return 0;
+}
+
+static int parse_ts2_chunk(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2)
+{
+    if (!chunk->len)
+        return 0;
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+    if (parse_grid_2(s, ch1, ch2, 1, 3, 0) < 0)
+        return -1;
+    if (parse_ts(s, ch1, ch2, 6, s->max_mono_subband, 0) < 0)
+        return -1;
+    if (ch1 != ch2) {
+        if (parse_grid_1_sec_ch(s, ch2) < 0)
+            return -1;
+        if (parse_grid_2(s, ch1, ch2, 0, 3, 1) < 0)
+            return -1;
+    }
+    if (parse_ts(s, ch1, ch2, s->min_mono_subband, s->nsubbands, 1) < 0)
+        return -1;
+    return 0;
+}
+
+static int init_sample_rate(DCALbrDecoder *s)
+{
+    double scale = (-1.0 / (1 << 17)) * sqrt(1 << (2 - s->limited_range));
+    int i, br_per_ch = s->bit_rate_scaled / s->nchannels_total;
+
+    ff_mdct_end(&s->imdct);
+
+    if (ff_mdct_init(&s->imdct, s->freq_range + 6, 1, scale) < 0)
+        return -1;
+
+    for (i = 0; i < 32 << s->freq_range; i++)
+        s->window[i] = ff_dca_long_window[i << (2 - s->freq_range)];
+
+    if (br_per_ch < 14000)
+        scale = 0.85;
+    else if (br_per_ch < 32000)
+        scale = (br_per_ch - 14000) * (1.0 / 120000) + 0.85;
+    else
+        scale = 1.0;
+
+    scale *= 1.0 / INT_MAX;
+
+    for (i = 0; i < s->nsubbands; i++) {
+        if (i < 2)
+            s->sb_scf[i] = 0;   // The first two subbands are always zero
+        else if (i < 5)
+            s->sb_scf[i] = (i - 1) * 0.25 * 0.785 * scale;
+        else
+            s->sb_scf[i] = 0.785 * scale;
+    }
+
+    s->lfe_scale = (16 << s->freq_range) * 0.0000078265894;
+
+    return 0;
+}
+
+static int alloc_sample_buffer(DCALbrDecoder *s)
+{
+    // Reserve space for history and padding
+    int nchsamples = DCA_LBR_TIME_SAMPLES + DCA_LBR_TIME_HISTORY * 2;
+    int nsamples = nchsamples * s->nchannels * s->nsubbands;
+    int ch, sb;
+    float *ptr;
+
+    // Reallocate time sample buffer
+    av_fast_mallocz(&s->ts_buffer, &s->ts_size, nsamples * sizeof(float));
+    if (!s->ts_buffer)
+        return -1;
+
+    ptr = s->ts_buffer + DCA_LBR_TIME_HISTORY;
+    for (ch = 0; ch < s->nchannels; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            s->time_samples[ch][sb] = ptr;
+            ptr += nchsamples;
+        }
+    }
+
+    return 0;
+}
+
+static int parse_decoder_init(DCALbrDecoder *s, GetByteContext *gb)
+{
+    int old_rate = s->sample_rate;
+    int old_band_limit = s->band_limit;
+    int old_nchannels = s->nchannels;
+    int version, bit_rate_hi;
+    unsigned int sr_code;
+
+    // Sample rate of LBR audio
+    sr_code = bytestream2_get_byte(gb);
+    if (sr_code >= FF_ARRAY_ELEMS(ff_dca_sampling_freqs)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR sample rate\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->sample_rate = ff_dca_sampling_freqs[sr_code];
+    if (s->sample_rate > 48000) {
+        avpriv_report_missing_feature(s->avctx, "%d Hz LBR sample rate", s->sample_rate);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // LBR speaker mask
+    s->ch_mask = bytestream2_get_le16(gb);
+    if (!(s->ch_mask & 0x7)) {
+        avpriv_report_missing_feature(s->avctx, "LBR channel mask %#x", s->ch_mask);
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((s->ch_mask & 0xfff0) && !(s->warned & 1)) {
+        avpriv_report_missing_feature(s->avctx, "LBR channel mask %#x", s->ch_mask);
+        s->warned |= 1;
+    }
+
+    // LBR bitstream version
+    version = bytestream2_get_le16(gb);
+    if ((version & 0xff00) != 0x0800) {
+        avpriv_report_missing_feature(s->avctx, "LBR stream version %#x", version);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Flags for LBR decoder initialization
+    s->flags = bytestream2_get_byte(gb);
+    if (s->flags & LBR_FLAG_DMIX_MULTI_CH) {
+        avpriv_report_missing_feature(s->avctx, "LBR multi-channel downmix");
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((s->flags & LBR_FLAG_LFE_PRESENT) && s->sample_rate != 48000) {
+        if (!(s->warned & 2)) {
+            avpriv_report_missing_feature(s->avctx, "%d Hz LFE interpolation", s->sample_rate);
+            s->warned |= 2;
+        }
+        s->flags &= ~LBR_FLAG_LFE_PRESENT;
+    }
+
+    // Most significant bit rate nibbles
+    bit_rate_hi = bytestream2_get_byte(gb);
+
+    // Least significant original bit rate word
+    s->bit_rate_orig = bytestream2_get_le16(gb) | ((bit_rate_hi & 0x0F) << 16);
+
+    // Least significant scaled bit rate word
+    s->bit_rate_scaled = bytestream2_get_le16(gb) | ((bit_rate_hi & 0xF0) << 12);
+
+    // Setup number of fullband channels
+    s->nchannels_total = ff_dca_count_chs_for_mask(s->ch_mask & ~DCA_SPEAKER_PAIR_LFE1);
+    s->nchannels = FFMIN(s->nchannels_total, DCA_LBR_CHANNELS);
+
+    // Setup band limit
+    switch (s->flags & LBR_FLAG_BAND_LIMIT_MASK) {
+    case LBR_FLAG_BAND_LIMIT_NONE:
+        s->band_limit = 0;
+        break;
+    case LBR_FLAG_BAND_LIMIT_1_2:
+        s->band_limit = 1;
+        break;
+    case LBR_FLAG_BAND_LIMIT_1_4:
+        s->band_limit = 2;
+        break;
+    default:
+        avpriv_report_missing_feature(s->avctx, "LBR band limit %#x", s->flags & LBR_FLAG_BAND_LIMIT_MASK);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Setup frequency range
+    s->freq_range = ff_dca_freq_ranges[sr_code];
+
+    // Setup resolution profile
+    if (s->bit_rate_orig >= 44000 * (s->nchannels_total + 2))
+        s->res_profile = 2;
+    else if (s->bit_rate_orig >= 25000 * (s->nchannels_total + 2))
+        s->res_profile = 1;
+    else
+        s->res_profile = 0;
+
+    // Setup limited sample rate, number of subbands, etc
+    s->limited_rate = s->sample_rate >> s->band_limit;
+    s->limited_range = s->freq_range - s->band_limit;
+    if (s->limited_range < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR band limit for frequency range\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->nsubbands = 8 << s->limited_range;
+
+    s->g3_avg_only_start_sb = s->nsubbands * ff_dca_avg_g3_freqs[s->res_profile] / (s->limited_rate / 2);
+    if (s->g3_avg_only_start_sb > s->nsubbands)
+        s->g3_avg_only_start_sb = s->nsubbands;
+
+    s->min_mono_subband = s->nsubbands *  2000 / (s->limited_rate / 2);
+    if (s->min_mono_subband > s->nsubbands)
+        s->min_mono_subband = s->nsubbands;
+
+    s->max_mono_subband = s->nsubbands * 14000 / (s->limited_rate / 2);
+    if (s->max_mono_subband > s->nsubbands)
+        s->max_mono_subband = s->nsubbands;
+
+    // Handle change of sample rate
+    if ((old_rate != s->sample_rate || old_band_limit != s->band_limit) && init_sample_rate(s) < 0)
+        return AVERROR(ENOMEM);
+
+    // Setup stereo downmix
+    if (s->flags & LBR_FLAG_DMIX_STEREO) {
+        DCAContext *dca = s->avctx->priv_data;
+
+        if (s->nchannels_total < 3 || s->nchannels_total > DCA_LBR_CHANNELS_TOTAL - 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of channels for LBR stereo downmix\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // This decoder doesn't support ECS chunk
+        if (dca->request_channel_layout != DCA_SPEAKER_LAYOUT_STEREO && !(s->warned & 4)) {
+            avpriv_report_missing_feature(s->avctx, "Embedded LBR stereo downmix");
+            s->warned |= 4;
+        }
+
+        // Account for extra downmixed channel pair
+        s->nchannels_total += 2;
+        s->nchannels = 2;
+        s->ch_mask = DCA_SPEAKER_PAIR_LR;
+        s->flags &= ~LBR_FLAG_LFE_PRESENT;
+    }
+
+    // Handle change of sample rate or number of channels
+    if (old_rate != s->sample_rate
+        || old_band_limit != s->band_limit
+        || old_nchannels != s->nchannels) {
+        if (alloc_sample_buffer(s) < 0)
+            return AVERROR(ENOMEM);
+        ff_dca_lbr_flush(s);
+    }
+
+    return 0;
+}
+
+int ff_dca_lbr_parse(DCALbrDecoder *s, uint8_t *data, DCAExssAsset *asset)
+{
+    struct {
+        LBRChunk    lfe;
+        LBRChunk    tonal;
+        LBRChunk    tonal_grp[5];
+        LBRChunk    grid1[DCA_LBR_CHANNELS / 2];
+        LBRChunk    hr_grid[DCA_LBR_CHANNELS / 2];
+        LBRChunk    ts1[DCA_LBR_CHANNELS / 2];
+        LBRChunk    ts2[DCA_LBR_CHANNELS / 2];
+    } chunk = { {0} };
+
+    GetByteContext gb;
+
+    int i, ch, sb, sf, ret, group, chunk_id, chunk_len;
+
+    bytestream2_init(&gb, data + asset->lbr_offset, asset->lbr_size);
+
+    // LBR sync word
+    if (bytestream2_get_be32(&gb) != DCA_SYNCWORD_LBR) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // LBR header type
+    switch (bytestream2_get_byte(&gb)) {
+    case DCA_LBR_HEADER_SYNC_ONLY:
+        if (!s->sample_rate) {
+            av_log(s->avctx, AV_LOG_ERROR, "LBR decoder not initialized\n");
+            return AVERROR_INVALIDDATA;
+        }
+        break;
+    case DCA_LBR_HEADER_DECODER_INIT:
+        if ((ret = parse_decoder_init(s, &gb)) < 0) {
+            s->sample_rate = 0;
+            return ret;
+        }
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR header type\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // LBR frame chunk header
+    chunk_id = bytestream2_get_byte(&gb);
+    chunk_len = (chunk_id & 0x80) ? bytestream2_get_be16(&gb) : bytestream2_get_byte(&gb);
+
+    if (chunk_len > bytestream2_get_bytes_left(&gb)) {
+        chunk_len = bytestream2_get_bytes_left(&gb);
+        av_log(s->avctx, AV_LOG_WARNING, "LBR frame chunk was truncated\n");
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_init(&gb, gb.buffer, chunk_len);
+
+    switch (chunk_id & 0x7f) {
+    case LBR_CHUNK_FRAME:
+        if (s->avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_CAREFUL)) {
+            int checksum = bytestream2_get_be16(&gb);
+            uint16_t res = chunk_id;
+            res += (chunk_len >> 8) & 0xff;
+            res += chunk_len & 0xff;
+            for (i = 0; i < chunk_len - 2; i++)
+                res += gb.buffer[i];
+            if (checksum != res) {
+                av_log(s->avctx, AV_LOG_WARNING, "Invalid LBR checksum\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+        } else {
+            bytestream2_skip(&gb, 2);
+        }
+        break;
+    case LBR_CHUNK_FRAME_NO_CSUM:
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR frame chunk ID\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Clear current frame
+    memset(s->quant_levels, 0, sizeof(s->quant_levels));
+    memset(s->sb_indices, 0xff, sizeof(s->sb_indices));
+    memset(s->sec_ch_sbms, 0, sizeof(s->sec_ch_sbms));
+    memset(s->sec_ch_lrms, 0, sizeof(s->sec_ch_lrms));
+    memset(s->ch_pres, 0, sizeof(s->ch_pres));
+    memset(s->grid_1_scf, 0, sizeof(s->grid_1_scf));
+    memset(s->grid_2_scf, 0, sizeof(s->grid_2_scf));
+    memset(s->grid_3_avg, 0, sizeof(s->grid_3_avg));
+    memset(s->grid_3_scf, 0, sizeof(s->grid_3_scf));
+    memset(s->grid_3_pres, 0, sizeof(s->grid_3_pres));
+    memset(s->tonal_scf, 0, sizeof(s->tonal_scf));
+    memset(s->lfe_data, 0, sizeof(s->lfe_data));
+    s->part_stereo_pres = 0;
+    s->framenum = (s->framenum + 1) & 31;
+
+    for (ch = 0; ch < s->nchannels; ch++) {
+        for (sb = 0; sb < s->nsubbands / 4; sb++) {
+            s->part_stereo[ch][sb][0] = s->part_stereo[ch][sb][4];
+            s->part_stereo[ch][sb][4] = 16;
+        }
+    }
+
+    memset(s->lpc_coeff[s->framenum & 1], 0, sizeof(s->lpc_coeff[0]));
+
+    for (group = 0; group < 5; group++) {
+        for (sf = 0; sf < 1 << group; sf++) {
+            int sf_idx = ((s->framenum << group) + sf) & 31;
+            s->tonal_bounds[group][sf_idx][0] =
+            s->tonal_bounds[group][sf_idx][1] = s->ntones;
+        }
+    }
+
+    // Parse chunk headers
+    while (bytestream2_get_bytes_left(&gb) > 0) {
+        chunk_id = bytestream2_get_byte(&gb);
+        chunk_len = (chunk_id & 0x80) ? bytestream2_get_be16(&gb) : bytestream2_get_byte(&gb);
+        chunk_id &= 0x7f;
+
+        if (chunk_len > bytestream2_get_bytes_left(&gb)) {
+            chunk_len = bytestream2_get_bytes_left(&gb);
+            av_log(s->avctx, AV_LOG_WARNING, "LBR chunk %#x was truncated\n", chunk_id);
+            if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                return AVERROR_INVALIDDATA;
+        }
+
+        switch (chunk_id) {
+        case LBR_CHUNK_LFE:
+            chunk.lfe.len  = chunk_len;
+            chunk.lfe.data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_SCF:
+        case LBR_CHUNK_TONAL:
+        case LBR_CHUNK_TONAL_SCF:
+            chunk.tonal.id   = chunk_id;
+            chunk.tonal.len  = chunk_len;
+            chunk.tonal.data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_TONAL_GRP_1:
+        case LBR_CHUNK_TONAL_GRP_2:
+        case LBR_CHUNK_TONAL_GRP_3:
+        case LBR_CHUNK_TONAL_GRP_4:
+        case LBR_CHUNK_TONAL_GRP_5:
+            i = LBR_CHUNK_TONAL_GRP_5 - chunk_id;
+            chunk.tonal_grp[i].id   = i;
+            chunk.tonal_grp[i].len  = chunk_len;
+            chunk.tonal_grp[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_TONAL_SCF_GRP_1:
+        case LBR_CHUNK_TONAL_SCF_GRP_2:
+        case LBR_CHUNK_TONAL_SCF_GRP_3:
+        case LBR_CHUNK_TONAL_SCF_GRP_4:
+        case LBR_CHUNK_TONAL_SCF_GRP_5:
+            i = LBR_CHUNK_TONAL_SCF_GRP_5 - chunk_id;
+            chunk.tonal_grp[i].id   = i;
+            chunk.tonal_grp[i].len  = chunk_len;
+            chunk.tonal_grp[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_GRID_LR:
+        case LBR_CHUNK_RES_GRID_LR + 1:
+        case LBR_CHUNK_RES_GRID_LR + 2:
+            i = chunk_id - LBR_CHUNK_RES_GRID_LR;
+            chunk.grid1[i].len  = chunk_len;
+            chunk.grid1[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_GRID_HR:
+        case LBR_CHUNK_RES_GRID_HR + 1:
+        case LBR_CHUNK_RES_GRID_HR + 2:
+            i = chunk_id - LBR_CHUNK_RES_GRID_HR;
+            chunk.hr_grid[i].len  = chunk_len;
+            chunk.hr_grid[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_TS_1:
+        case LBR_CHUNK_RES_TS_1 + 1:
+        case LBR_CHUNK_RES_TS_1 + 2:
+            i = chunk_id - LBR_CHUNK_RES_TS_1;
+            chunk.ts1[i].len  = chunk_len;
+            chunk.ts1[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_TS_2:
+        case LBR_CHUNK_RES_TS_2 + 1:
+        case LBR_CHUNK_RES_TS_2 + 2:
+            i = chunk_id - LBR_CHUNK_RES_TS_2;
+            chunk.ts2[i].len  = chunk_len;
+            chunk.ts2[i].data = gb.buffer;
+            break;
+        }
+
+        bytestream2_skip(&gb, chunk_len);
+    }
+
+    // Parse the chunks
+    ret = parse_lfe_chunk(s, &chunk.lfe);
+
+    ret |= parse_tonal_chunk(s, &chunk.tonal);
+
+    for (i = 0; i < 5; i++)
+        ret |= parse_tonal_group(s, &chunk.tonal_grp[i]);
+
+    for (i = 0; i < (s->nchannels + 1) / 2; i++) {
+        int ch1 = i * 2;
+        int ch2 = FFMIN(ch1 + 1, s->nchannels - 1);
+
+        if (parse_grid_1_chunk (s, &chunk.grid1  [i], ch1, ch2) < 0 ||
+            parse_high_res_grid(s, &chunk.hr_grid[i], ch1, ch2) < 0) {
+            ret = -1;
+            continue;
+        }
+
+        // TS chunks depend on both grids. TS_2 depends on TS_1.
+        if (!chunk.grid1[i].len || !chunk.hr_grid[i].len || !chunk.ts1[i].len)
+            continue;
+
+        if (parse_ts1_chunk(s, &chunk.ts1[i], ch1, ch2) < 0 ||
+            parse_ts2_chunk(s, &chunk.ts2[i], ch1, ch2) < 0) {
+            ret = -1;
+            continue;
+        }
+    }
+
+    if (ret < 0 && (s->avctx->err_recognition & AV_EF_EXPLODE))
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+/**
+ * Reconstruct high-frequency resolution grid from first and third grids
+ */
+static void decode_grid(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, ch, sb;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            int g1_sb = ff_dca_scf_to_grid_1[sb];
+
+            uint8_t *g1_scf_a = s->grid_1_scf[ch][g1_sb    ];
+            uint8_t *g1_scf_b = s->grid_1_scf[ch][g1_sb + 1];
+
+            int w1 = ff_dca_grid_1_weights[g1_sb    ][sb];
+            int w2 = ff_dca_grid_1_weights[g1_sb + 1][sb];
+
+            uint8_t *hr_scf = s->high_res_scf[ch][sb];
+
+            if (sb < 4) {
+                for (i = 0; i < 8; i++) {
+                    int scf = w1 * g1_scf_a[i] + w2 * g1_scf_b[i];
+                    hr_scf[i] = scf >> 7;
+                }
+            } else {
+                int8_t *g3_scf = s->grid_3_scf[ch][sb - 4];
+                int g3_avg = s->grid_3_avg[ch][sb - 4];
+
+                for (i = 0; i < 8; i++) {
+                    int scf = w1 * g1_scf_a[i] + w2 * g1_scf_b[i];
+                    hr_scf[i] = (scf >> 7) - g3_avg - g3_scf[i];
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Fill unallocated subbands with randomness
+ */
+static void random_ts(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, j, k, ch, sb;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            float *samples = s->time_samples[ch][sb];
+
+            if (s->ch_pres[ch] & (1U << sb))
+                continue;   // Skip allocated subband
+
+            if (sb < 2) {
+                // The first two subbands are always zero
+                memset(samples, 0, DCA_LBR_TIME_SAMPLES * sizeof(float));
+            } else if (sb < 10) {
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES; i++)
+                    samples[i] = lbr_rand(s, sb);
+            } else {
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES / 8; i++, samples += 8) {
+                    float accum[8] = { 0 };
+
+                    // Modulate by subbands 2-5 in blocks of 8
+                    for (k = 2; k < 6; k++) {
+                        float *other = &s->time_samples[ch][k][i * 8];
+                        for (j = 0; j < 8; j++)
+                            accum[j] += fabs(other[j]);
+                    }
+
+                    for (j = 0; j < 8; j++)
+                        samples[j] = (accum[j] * 0.25f + 0.5f) * lbr_rand(s, sb);
+                }
+            }
+        }
+    }
+}
+
+static void predict(float *samples, const float *coeff, int nsamples)
+{
+    int i, j;
+
+    for (i = 0; i < nsamples; i++) {
+        float res = 0;
+        for (j = 0; j < 8; j++)
+            res += coeff[j] * samples[i - j - 1];
+        samples[i] -= res;
+    }
+}
+
+static void synth_lpc(DCALbrDecoder *s, int ch1, int ch2, int sb)
+{
+    int f = s->framenum & 1;
+    int ch;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        float *samples = s->time_samples[ch][sb];
+
+        if (!(s->ch_pres[ch] & (1U << sb)))
+            continue;
+
+        if (sb < 2) {
+            predict(samples,      s->lpc_coeff[f^1][ch][sb][1],  16);
+            predict(samples + 16, s->lpc_coeff[f  ][ch][sb][0],  64);
+            predict(samples + 80, s->lpc_coeff[f  ][ch][sb][1],  48);
+        } else {
+            predict(samples,      s->lpc_coeff[f^1][ch][sb][0],  16);
+            predict(samples + 16, s->lpc_coeff[f  ][ch][sb][0], 112);
+        }
+    }
+}
+
+static void filter_ts(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, j, sb, ch;
+
+    for (sb = 0; sb < s->nsubbands; sb++) {
+        // Scale factors
+        for (ch = ch1; ch <= ch2; ch++) {
+            float *samples = s->time_samples[ch][sb];
+            uint8_t *hr_scf = s->high_res_scf[ch][sb];
+            if (sb < 4) {
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES / 16; i++, samples += 16) {
+                    unsigned int scf = hr_scf[i];
+                    if (scf > AMP_MAX)
+                        scf = AMP_MAX;
+                    for (j = 0; j < 16; j++)
+                        samples[j] *= ff_dca_quant_amp[scf];
+                }
+            } else {
+                uint8_t *g2_scf = s->grid_2_scf[ch][ff_dca_scf_to_grid_2[sb]];
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES / 2; i++, samples += 2) {
+                    unsigned int scf = hr_scf[i / 8] - g2_scf[i];
+                    if (scf > AMP_MAX)
+                        scf = AMP_MAX;
+                    samples[0] *= ff_dca_quant_amp[scf];
+                    samples[1] *= ff_dca_quant_amp[scf];
+                }
+            }
+        }
+
+        // Mid-side stereo
+        if (ch1 != ch2) {
+            float *samples_l = s->time_samples[ch1][sb];
+            float *samples_r = s->time_samples[ch2][sb];
+            int ch2_pres = s->ch_pres[ch2] & (1U << sb);
+
+            for (i = 0; i < DCA_LBR_TIME_SAMPLES / 16; i++) {
+                int sbms = (s->sec_ch_sbms[ch1 / 2][sb] >> i) & 1;
+                int lrms = (s->sec_ch_lrms[ch1 / 2][sb] >> i) & 1;
+
+                if (sb >= s->min_mono_subband) {
+                    if (lrms && ch2_pres) {
+                        if (sbms) {
+                            for (j = 0; j < 16; j++) {
+                                float tmp = samples_l[j];
+                                samples_l[j] =  samples_r[j];
+                                samples_r[j] = -tmp;
+                            }
+                        } else {
+                            for (j = 0; j < 16; j++) {
+                                float tmp = samples_l[j];
+                                samples_l[j] =  samples_r[j];
+                                samples_r[j] =  tmp;
+                            }
+                        }
+                    } else if (!ch2_pres) {
+                        if (sbms && (s->part_stereo_pres & (1 << ch1))) {
+                            for (j = 0; j < 16; j++)
+                                samples_r[j] = -samples_l[j];
+                        } else {
+                            for (j = 0; j < 16; j++)
+                                samples_r[j] =  samples_l[j];
+                        }
+                    }
+                } else if (sbms && ch2_pres) {
+                    for (j = 0; j < 16; j++) {
+                        float tmp = samples_l[j];
+                        samples_l[j] = (tmp + samples_r[j]) * 0.5f;
+                        samples_r[j] = (tmp - samples_r[j]) * 0.5f;
+                    }
+                }
+
+                samples_l += 16;
+                samples_r += 16;
+            }
+        }
+
+        // Inverse prediction
+        if (sb < 3)
+            synth_lpc(s, ch1, ch2, sb);
+    }
+}
+
+/**
+ * Modulate by interpolated partial stereo coefficients
+ */
+static void decode_part_stereo(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, ch, sb, sf;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        for (sb = s->min_mono_subband; sb < s->nsubbands; sb++) {
+            uint8_t *pt_st = s->part_stereo[ch][(sb - s->min_mono_subband) / 4];
+            float *samples = s->time_samples[ch][sb];
+
+            if (s->ch_pres[ch2] & (1U << sb))
+                continue;
+
+            for (sf = 1; sf <= 4; sf++, samples += 32) {
+                float prev = ff_dca_st_coeff[pt_st[sf - 1]];
+                float next = ff_dca_st_coeff[pt_st[sf    ]];
+
+                for (i = 0; i < 32; i++)
+                    samples[i] *= (32 - i) * prev + i * next;
+            }
+        }
+    }
+}
+
+/**
+ * Synthesise tones in the given group for the given tonal subframe
+ */
+static void synth_tones(DCALbrDecoder *s, int ch, float *values,
+                        int group, int group_sf, int synth_idx)
+{
+    int i, start, count;
+
+    if (synth_idx < 0)
+        return;
+
+    start =  s->tonal_bounds[group][group_sf][0];
+    count = (s->tonal_bounds[group][group_sf][1] - start) & (DCA_LBR_TONES - 1);
+
+    for (i = 0; i < count; i++) {
+        DCALbrTone *t = &s->tones[(start + i) & (DCA_LBR_TONES - 1)];
+
+        if (t->amp[ch]) {
+            float amp = ff_dca_synth_env[synth_idx] * ff_dca_quant_amp[t->amp[ch]];
+            float c = amp * cos_tab[(t->phs[ch]     ) & 255];
+            float s = amp * cos_tab[(t->phs[ch] + 64) & 255];
+            const float *cf = ff_dca_corr_cf[t->f_delt];
+            int x_freq = t->x_freq;
+
+            switch (x_freq) {
+            case 0:
+                goto p0;
+            case 1:
+                values[3] += cf[0] * -s;
+                values[2] += cf[1] *  c;
+                values[1] += cf[2] *  s;
+                values[0] += cf[3] * -c;
+                goto p1;
+            case 2:
+                values[2] += cf[0] * -s;
+                values[1] += cf[1] *  c;
+                values[0] += cf[2] *  s;
+                goto p2;
+            case 3:
+                values[1] += cf[0] * -s;
+                values[0] += cf[1] *  c;
+                goto p3;
+            case 4:
+                values[0] += cf[0] * -s;
+                goto p4;
+            }
+
+            values[x_freq - 5] += cf[ 0] * -s;
+        p4: values[x_freq - 4] += cf[ 1] *  c;
+        p3: values[x_freq - 3] += cf[ 2] *  s;
+        p2: values[x_freq - 2] += cf[ 3] * -c;
+        p1: values[x_freq - 1] += cf[ 4] * -s;
+        p0: values[x_freq    ] += cf[ 5] *  c;
+            values[x_freq + 1] += cf[ 6] *  s;
+            values[x_freq + 2] += cf[ 7] * -c;
+            values[x_freq + 3] += cf[ 8] * -s;
+            values[x_freq + 4] += cf[ 9] *  c;
+            values[x_freq + 5] += cf[10] *  s;
+        }
+
+        t->phs[ch] += t->ph_rot;
+    }
+}
+
+/**
+ * Synthesise all tones in all groups for the given residual subframe
+ */
+static void base_func_synth(DCALbrDecoder *s, int ch, float *values, int sf)
+{
+    int group;
+
+    // Tonal vs residual shift is 22 subframes
+    for (group = 0; group < 5; group++) {
+        int group_sf = (s->framenum << group) + ((sf - 22) >> (5 - group));
+        int synth_idx = ((((sf - 22) & 31) << group) & 31) + (1 << group) - 1;
+
+        synth_tones(s, ch, values, group, (group_sf - 1) & 31, 30 - synth_idx);
+        synth_tones(s, ch, values, group, (group_sf    ) & 31,      synth_idx);
+    }
+}
+
+static void transform_channel(DCALbrDecoder *s, int ch, float *output)
+{
+    LOCAL_ALIGNED_32(float, values, [DCA_LBR_SUBBANDS    ], [4]);
+    LOCAL_ALIGNED_32(float, result, [DCA_LBR_SUBBANDS * 2], [4]);
+    int sf, sb, nsubbands = s->nsubbands, noutsubbands = 8 << s->freq_range;
+
+    // Clear inactive subbands
+    if (nsubbands < noutsubbands)
+        memset(values[nsubbands], 0, (noutsubbands - nsubbands) * sizeof(values[0]));
+
+    for (sf = 0; sf < DCA_LBR_TIME_SAMPLES / 4; sf++) {
+        // Hybrid filterbank
+        s->dcadsp->lbr_bank(values, s->time_samples[ch],
+                            ff_dca_bank_coeff, sf * 4, nsubbands);
+
+        base_func_synth(s, ch, values[0], sf);
+
+        s->imdct.imdct_calc(&s->imdct, result[0], values[0]);
+
+        // Long window and overlap-add
+        s->fdsp->vector_fmul_add(output, result[0], s->window,
+                                 s->history[ch], noutsubbands * 4);
+        s->fdsp->vector_fmul_reverse(s->history[ch], result[noutsubbands],
+                                     s->window, noutsubbands * 4);
+        output += noutsubbands * 4;
+    }
+
+    // Update history for LPC and forward MDCT
+    for (sb = 0; sb < nsubbands; sb++) {
+        float *samples = s->time_samples[ch][sb] - DCA_LBR_TIME_HISTORY;
+        memcpy(samples, samples + DCA_LBR_TIME_SAMPLES, DCA_LBR_TIME_HISTORY * sizeof(float));
+    }
+}
+
+int ff_dca_lbr_filter_frame(DCALbrDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    int i, ret, nchannels, ch_conf = (s->ch_mask & 0x7) - 1;
+    const int8_t *reorder;
+
+    avctx->channel_layout = channel_layouts[ch_conf];
+    avctx->channels = nchannels = channel_counts[ch_conf];
+    avctx->sample_rate = s->sample_rate;
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->bits_per_raw_sample = 0;
+    avctx->profile = FF_PROFILE_DTS_EXPRESS;
+    avctx->bit_rate = s->bit_rate_scaled;
+
+    if (s->flags & LBR_FLAG_LFE_PRESENT) {
+        avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
+        avctx->channels++;
+        reorder = channel_reorder_lfe[ch_conf];
+    } else {
+        reorder = channel_reorder_nolfe[ch_conf];
+    }
+
+    frame->nb_samples = 1024 << s->freq_range;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Filter fullband channels
+    for (i = 0; i < (s->nchannels + 1) / 2; i++) {
+        int ch1 = i * 2;
+        int ch2 = FFMIN(ch1 + 1, s->nchannels - 1);
+
+        decode_grid(s, ch1, ch2);
+
+        random_ts(s, ch1, ch2);
+
+        filter_ts(s, ch1, ch2);
+
+        if (ch1 != ch2 && (s->part_stereo_pres & (1 << ch1)))
+            decode_part_stereo(s, ch1, ch2);
+
+        if (ch1 < nchannels)
+            transform_channel(s, ch1, (float *)frame->extended_data[reorder[ch1]]);
+
+        if (ch1 != ch2 && ch2 < nchannels)
+            transform_channel(s, ch2, (float *)frame->extended_data[reorder[ch2]]);
+    }
+
+    // Interpolate LFE channel
+    if (s->flags & LBR_FLAG_LFE_PRESENT) {
+        s->dcadsp->lfe_iir((float *)frame->extended_data[lfe_index[ch_conf]],
+                           s->lfe_data, ff_dca_lfe_iir,
+                           s->lfe_history, 16 << s->freq_range);
+    }
+
+    if ((ret = ff_side_data_update_matrix_encoding(frame, AV_MATRIX_ENCODING_NONE)) < 0)
+        return ret;
+
+    return 0;
+}
+
+av_cold void ff_dca_lbr_flush(DCALbrDecoder *s)
+{
+    int ch, sb;
+
+    if (!s->sample_rate)
+        return;
+
+    // Clear history
+    memset(s->part_stereo, 16, sizeof(s->part_stereo));
+    memset(s->lpc_coeff, 0, sizeof(s->lpc_coeff));
+    memset(s->history, 0, sizeof(s->history));
+    memset(s->tonal_bounds, 0, sizeof(s->tonal_bounds));
+    memset(s->lfe_history, 0, sizeof(s->lfe_history));
+    s->framenum = 0;
+    s->ntones = 0;
+
+    for (ch = 0; ch < s->nchannels; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            float *samples = s->time_samples[ch][sb] - DCA_LBR_TIME_HISTORY;
+            memset(samples, 0, DCA_LBR_TIME_HISTORY * sizeof(float));
+        }
+    }
+}
+
+av_cold int ff_dca_lbr_init(DCALbrDecoder *s)
+{
+    init_tables();
+
+    if (!(s->fdsp = avpriv_float_dsp_alloc(0)))
+        return -1;
+
+    s->lbr_rand = 1;
+    return 0;
+}
+
+av_cold void ff_dca_lbr_close(DCALbrDecoder *s)
+{
+    s->sample_rate = 0;
+
+    av_freep(&s->ts_buffer);
+    s->ts_size = 0;
+
+    av_freep(&s->fdsp);
+    ff_mdct_end(&s->imdct);
+}
diff --git a/libavcodec/dca_lbr.h b/libavcodec/dca_lbr.h
new file mode 100644
index 0000000..6d4c0a8
--- /dev/null
+++ b/libavcodec/dca_lbr.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_LBR_H
+#define AVCODEC_DCA_LBR_H
+
+#include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dca_exss.h"
+#include "dcadsp.h"
+#include "fft.h"
+
+#define DCA_LBR_CHANNELS        6
+#define DCA_LBR_CHANNELS_TOTAL  32
+#define DCA_LBR_SUBBANDS        32
+#define DCA_LBR_TONES           512
+
+#define DCA_LBR_TIME_SAMPLES    128
+#define DCA_LBR_TIME_HISTORY    8
+
+enum DCALBRHeader {
+    DCA_LBR_HEADER_SYNC_ONLY    = 1,
+    DCA_LBR_HEADER_DECODER_INIT = 2
+};
+
+typedef struct DCALbrTone {
+    uint8_t     x_freq;     ///< Spectral line offset
+    uint8_t     f_delt;     ///< Difference between original and center frequency
+    uint8_t     ph_rot;     ///< Phase rotation
+    uint8_t     pad;        ///< Padding field
+    uint8_t     amp[DCA_LBR_CHANNELS];  ///< Per-channel amplitude
+    uint8_t     phs[DCA_LBR_CHANNELS];  ///< Per-channel phase
+} DCALbrTone;
+
+typedef struct DCALbrDecoder {
+    AVCodecContext  *avctx;
+    GetBitContext   gb;
+
+    int     sample_rate;        ///< Sample rate of LBR audio
+    int     ch_mask;            ///< LBR speaker mask
+    int     flags;              ///< Flags for LBR decoder initialization
+    int     bit_rate_orig;      ///< Original bit rate
+    int     bit_rate_scaled;    ///< Scaled bit rate
+
+    int     nchannels;          ///< Number of fullband channels to decode
+    int     nchannels_total;    ///< Total number of fullband channels
+    int     freq_range;         ///< Frequency range of LBR audio
+    int     band_limit;         ///< Band limit factor
+    int     limited_rate;       ///< Band limited sample rate
+    int     limited_range;      ///< Band limited frequency range
+    int     res_profile;        ///< Resolution profile
+    int     nsubbands;          ///< Number of encoded subbands
+    int     g3_avg_only_start_sb;   ///< Subband index where grid 3 scale factors end
+    int     min_mono_subband;   ///< Subband index where mono encoding starts
+    int     max_mono_subband;   ///< Subband index where mono encoding ends
+
+    int     framenum;   ///< Lower 5 bits of current frame number
+    int     lbr_rand;   ///< Seed for subband randomization
+    int     warned;     ///< Flags for warning suppression
+
+    uint8_t     quant_levels[DCA_LBR_CHANNELS / 2][DCA_LBR_SUBBANDS];   ///< Quantization levels
+    uint8_t     sb_indices[DCA_LBR_SUBBANDS];   ///< Subband reordering indices
+
+    uint8_t     sec_ch_sbms[DCA_LBR_CHANNELS / 2][DCA_LBR_SUBBANDS];    ///< Right channel inversion or mid/side decoding flags
+    uint8_t     sec_ch_lrms[DCA_LBR_CHANNELS / 2][DCA_LBR_SUBBANDS];    ///< Flags indicating if left/right channel are swapped
+    uint32_t    ch_pres[DCA_LBR_CHANNELS];  ///< Subband allocation flags
+
+    uint8_t     grid_1_scf[DCA_LBR_CHANNELS][12][8];    ///< Grid 1 scale factors
+    uint8_t     grid_2_scf[DCA_LBR_CHANNELS][3][64];    ///< Grid 2 scale factors
+
+    int8_t      grid_3_avg[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS - 4];     ///< Grid 3 average values
+    int8_t      grid_3_scf[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS - 4][8];  ///< Grid 3 scale factors
+    uint32_t    grid_3_pres[DCA_LBR_CHANNELS];  ///< Grid 3 scale factors presence flags
+
+    uint8_t     high_res_scf[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS][8];    ///< High-frequency resolution scale factors
+
+    uint8_t     part_stereo[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS / 4][5]; ///< Partial stereo coefficients
+    uint8_t     part_stereo_pres;   ///< Partial stereo coefficients presence flags
+
+    float       lpc_coeff[2][DCA_LBR_CHANNELS][3][2][8];    ///< Predictor coefficients
+
+    float       sb_scf[DCA_LBR_SUBBANDS];   ///< Subband randomization scale factors
+
+    float       *time_samples[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS]; ///< Time samples
+
+    float           *ts_buffer; ///< Time sample buffer base
+    unsigned int    ts_size;    ///< Time sample buffer size
+
+    DECLARE_ALIGNED(32, float, history)[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS * 4];    ///< IMDCT history
+    DECLARE_ALIGNED(32, float, window)[DCA_LBR_SUBBANDS * 4];   ///< Long window for IMDCT
+
+    DECLARE_ALIGNED(32, float, lfe_data)[64];       ///< Decimated LFE samples
+    DECLARE_ALIGNED(32, float, lfe_history)[5][2];  ///< LFE IIR filter history
+    float lfe_scale;    ///< Scale factor of LFE samples before IIR filter
+
+    uint8_t     tonal_scf[6];           ///< Tonal scale factors
+    uint16_t    tonal_bounds[5][32][2]; ///< Per-group per-subframe start/end positions of tones
+    DCALbrTone  tones[DCA_LBR_TONES];   ///< Circular buffer of tones
+    int         ntones;                 ///< Circular buffer head position
+
+    FFTContext          imdct;
+    AVFloatDSPContext   *fdsp;
+    DCADSPContext       *dcadsp;
+} DCALbrDecoder;
+
+int ff_dca_lbr_parse(DCALbrDecoder *s, uint8_t *data, DCAExssAsset *asset);
+int ff_dca_lbr_filter_frame(DCALbrDecoder *s, AVFrame *frame);
+av_cold void ff_dca_lbr_flush(DCALbrDecoder *s);
+av_cold int ff_dca_lbr_init(DCALbrDecoder *s);
+av_cold void ff_dca_lbr_close(DCALbrDecoder *s);
+
+#endif
diff --git a/libavcodec/dca_parser.c b/libavcodec/dca_parser.c
index c33cc9a..80d6659 100644
--- a/libavcodec/dca_parser.c
+++ b/libavcodec/dca_parser.c
@@ -5,24 +5,27 @@
  * Copyright (C) 2006 Benjamin Larsson
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "dca.h"
+#include "dca_core.h"
+#include "dca_exss.h"
+#include "dca_lbr.h"
 #include "dca_syncwords.h"
 #include "get_bits.h"
 #include "parser.h"
@@ -32,6 +35,9 @@ typedef struct DCAParseContext {
     uint32_t lastmarker;
     int size;
     int framesize;
+    unsigned int startpos;
+    DCAExssParser exss;
+    unsigned int sr_code;
 } DCAParseContext;
 
 #define IS_CORE_MARKER(state) \
@@ -47,6 +53,14 @@ typedef struct DCAParseContext {
 #define CORE_MARKER(state)      ((state >> 16) & 0xFFFFFFFF)
 #define EXSS_MARKER(state)      (state & 0xFFFFFFFF)
 
+#define STATE_LE(state)     (((state & 0xFF00FF00) >> 8) | ((state & 0x00FF00FF) << 8))
+#define STATE_14(state)     (((state & 0x3FFF0000) >> 8) | ((state & 0x00003FFF) >> 6))
+
+#define CORE_FRAMESIZE(state)   (((state >> 4) & 0x3FFF) + 1)
+#define EXSS_FRAMESIZE(state)   ((state & 0x2000000000) ? \
+                                 ((state >>  5) & 0xFFFFF) + 1 : \
+                                 ((state >> 13) & 0x0FFFF) + 1)
+
 /**
  * Find the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
@@ -54,41 +68,105 @@ typedef struct DCAParseContext {
 static int dca_find_frame_end(DCAParseContext *pc1, const uint8_t *buf,
                               int buf_size)
 {
-    int start_found, i;
+    int start_found, size, i;
     uint64_t state;
     ParseContext *pc = &pc1->pc;
 
     start_found = pc->frame_start_found;
     state       = pc->state64;
+    size        = pc1->size;
 
     i = 0;
     if (!start_found) {
-        for (i = 0; i < buf_size; i++) {
+        for (; i < buf_size; i++) {
+            size++;
             state = (state << 8) | buf[i];
-            if (IS_MARKER(state)) {
-                if (!pc1->lastmarker ||
-                    pc1->lastmarker == CORE_MARKER(state) ||
-                    pc1->lastmarker == DCA_SYNCWORD_SUBSTREAM) {
-                    start_found = 1;
-                    if (IS_EXSS_MARKER(state))
-                        pc1->lastmarker = EXSS_MARKER(state);
-                    else
-                        pc1->lastmarker = CORE_MARKER(state);
-                    i++;
-                    break;
-                }
+
+            if (IS_MARKER(state) &&
+                (!pc1->lastmarker ||
+                  pc1->lastmarker == CORE_MARKER(state) ||
+                  pc1->lastmarker == DCA_SYNCWORD_SUBSTREAM)) {
+                if (!pc1->lastmarker)
+                    pc1->startpos = IS_EXSS_MARKER(state) ? size - 4 : size - 6;
+
+                if (IS_EXSS_MARKER(state))
+                    pc1->lastmarker = EXSS_MARKER(state);
+                else
+                    pc1->lastmarker = CORE_MARKER(state);
+
+                start_found = 1;
+                size        = 0;
+
+                i++;
+                break;
             }
         }
     }
+
     if (start_found) {
         for (; i < buf_size; i++) {
-            pc1->size++;
+            size++;
             state = (state << 8) | buf[i];
+
+            if (start_found == 1) {
+                switch (pc1->lastmarker) {
+                case DCA_SYNCWORD_CORE_BE:
+                    if (size == 2) {
+                        pc1->framesize = CORE_FRAMESIZE(state);
+                        start_found    = 2;
+                    }
+                    break;
+                case DCA_SYNCWORD_CORE_LE:
+                    if (size == 2) {
+                        pc1->framesize = CORE_FRAMESIZE(STATE_LE(state));
+                        start_found    = 4;
+                    }
+                    break;
+                case DCA_SYNCWORD_CORE_14B_BE:
+                    if (size == 4) {
+                        pc1->framesize = CORE_FRAMESIZE(STATE_14(state));
+                        start_found    = 4;
+                    }
+                    break;
+                case DCA_SYNCWORD_CORE_14B_LE:
+                    if (size == 4) {
+                        pc1->framesize = CORE_FRAMESIZE(STATE_14(STATE_LE(state)));
+                        start_found    = 4;
+                    }
+                    break;
+                case DCA_SYNCWORD_SUBSTREAM:
+                    if (size == 6) {
+                        pc1->framesize = EXSS_FRAMESIZE(state);
+                        start_found    = 4;
+                    }
+                    break;
+                default:
+                    av_assert0(0);
+                }
+                continue;
+            }
+
+            if (start_found == 2 && IS_EXSS_MARKER(state) &&
+                pc1->framesize <= size + 2) {
+                pc1->framesize  = size + 2;
+                start_found     = 3;
+                continue;
+            }
+
+            if (start_found == 3) {
+                if (size == pc1->framesize + 4) {
+                    pc1->framesize += EXSS_FRAMESIZE(state);
+                    start_found     = 4;
+                }
+                continue;
+            }
+
+            if (pc1->framesize > size)
+                continue;
+
             if (IS_MARKER(state) &&
                 (pc1->lastmarker == CORE_MARKER(state) ||
                  pc1->lastmarker == DCA_SYNCWORD_SUBSTREAM)) {
-                if (pc1->framesize > pc1->size)
-                    continue;
                 pc->frame_start_found = 0;
                 pc->state64           = -1;
                 pc1->size             = 0;
@@ -96,8 +174,10 @@ static int dca_find_frame_end(DCAParseContext *pc1, const uint8_t *buf,
             }
         }
     }
+
     pc->frame_start_found = start_found;
     pc->state64           = state;
+    pc1->size             = size;
     return END_NOT_FOUND;
 }
 
@@ -106,40 +186,120 @@ static av_cold int dca_parse_init(AVCodecParserContext *s)
     DCAParseContext *pc1 = s->priv_data;
 
     pc1->lastmarker = 0;
+    pc1->sr_code = -1;
     return 0;
 }
 
-static int dca_parse_params(const uint8_t *buf, int buf_size, int *duration,
-                            int *sample_rate, int *framesize)
+static int dca_parse_params(DCAParseContext *pc1, const uint8_t *buf,
+                            int buf_size, int *duration, int *sample_rate,
+                            int *profile)
 {
+    DCAExssAsset *asset = &pc1->exss.assets[0];
     GetBitContext gb;
-    uint8_t hdr[12 + AV_INPUT_BUFFER_PADDING_SIZE] = { 0 };
-    int ret, sample_blocks, sr_code;
+    DCACoreFrameHeader h;
+    uint8_t hdr[DCA_CORE_FRAME_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE] = { 0 };
+    int ret, frame_size;
 
-    if (buf_size < 12)
+    if (buf_size < DCA_CORE_FRAME_HEADER_SIZE)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_dca_convert_bitstream(buf, 12, hdr, 12)) < 0)
-        return ret;
+    if (AV_RB32(buf) == DCA_SYNCWORD_SUBSTREAM) {
+        if ((ret = ff_dca_exss_parse(&pc1->exss, buf, buf_size)) < 0)
+            return ret;
 
-    init_get_bits(&gb, hdr, 96);
+        if (asset->extension_mask & DCA_EXSS_LBR) {
+            if ((ret = init_get_bits8(&gb, buf + asset->lbr_offset, asset->lbr_size)) < 0)
+                return ret;
 
-    skip_bits_long(&gb, 39);
-    sample_blocks = get_bits(&gb, 7) + 1;
-    if (sample_blocks < 8)
-        return AVERROR_INVALIDDATA;
-    *duration = 256 * (sample_blocks / 8);
+            if (get_bits_long(&gb, 32) != DCA_SYNCWORD_LBR)
+                return AVERROR_INVALIDDATA;
+
+            switch (get_bits(&gb, 8)) {
+            case DCA_LBR_HEADER_DECODER_INIT:
+                pc1->sr_code = get_bits(&gb, 8);
+            case DCA_LBR_HEADER_SYNC_ONLY:
+                break;
+            default:
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (pc1->sr_code >= FF_ARRAY_ELEMS(ff_dca_sampling_freqs))
+                return AVERROR_INVALIDDATA;
+
+            *sample_rate = ff_dca_sampling_freqs[pc1->sr_code];
+            *duration = 1024 << ff_dca_freq_ranges[pc1->sr_code];
+            *profile = FF_PROFILE_DTS_EXPRESS;
+            return 0;
+        }
+
+        if (asset->extension_mask & DCA_EXSS_XLL) {
+            int nsamples_log2;
+
+            if ((ret = init_get_bits8(&gb, buf + asset->xll_offset, asset->xll_size)) < 0)
+                return ret;
+
+            if (get_bits_long(&gb, 32) != DCA_SYNCWORD_XLL)
+                return AVERROR_INVALIDDATA;
+
+            if (get_bits(&gb, 4))
+                return AVERROR_INVALIDDATA;
+
+            skip_bits(&gb, 8);
+            skip_bits_long(&gb, get_bits(&gb, 5) + 1);
+            skip_bits(&gb, 4);
+            nsamples_log2 = get_bits(&gb, 4) + get_bits(&gb, 4);
+            if (nsamples_log2 > 24)
+                return AVERROR_INVALIDDATA;
+
+            *sample_rate = asset->max_sample_rate;
+            *duration = (1 + (*sample_rate > 96000)) << nsamples_log2;
+            *profile = FF_PROFILE_DTS_HD_MA;
+            return 0;
+        }
 
-    *framesize = get_bits(&gb, 14) + 1;
-    if (*framesize < 95)
         return AVERROR_INVALIDDATA;
+    }
 
-    skip_bits(&gb, 6);
-    sr_code      = get_bits(&gb, 4);
-    *sample_rate = avpriv_dca_sample_rates[sr_code];
-    if (*sample_rate == 0)
+    if ((ret = avpriv_dca_convert_bitstream(buf, DCA_CORE_FRAME_HEADER_SIZE,
+                                            hdr, DCA_CORE_FRAME_HEADER_SIZE)) < 0)
+        return ret;
+    if (avpriv_dca_parse_core_frame_header(&h, hdr, ret) < 0)
         return AVERROR_INVALIDDATA;
 
+    *duration = h.npcmblocks * DCA_PCMBLOCK_SAMPLES;
+    *sample_rate = avpriv_dca_sample_rates[h.sr_code];
+    if (*profile != FF_PROFILE_UNKNOWN)
+        return 0;
+
+    *profile = FF_PROFILE_DTS;
+    if (h.ext_audio_present) {
+        switch (h.ext_audio_type) {
+        case DCA_EXT_AUDIO_XCH:
+        case DCA_EXT_AUDIO_XXCH:
+            *profile = FF_PROFILE_DTS_ES;
+            break;
+        case DCA_EXT_AUDIO_X96:
+            *profile = FF_PROFILE_DTS_96_24;
+            break;
+        }
+    }
+
+    frame_size = FFALIGN(h.frame_size, 4);
+    if (buf_size - 4 < frame_size)
+        return 0;
+
+    buf      += frame_size;
+    buf_size -= frame_size;
+    if (AV_RB32(buf) != DCA_SYNCWORD_SUBSTREAM)
+        return 0;
+    if (ff_dca_exss_parse(&pc1->exss, buf, buf_size) < 0)
+        return 0;
+
+    if (asset->extension_mask & DCA_EXSS_XLL)
+        *profile = FF_PROFILE_DTS_HD_MA;
+    else if (asset->extension_mask & (DCA_EXSS_XBR | DCA_EXSS_XXCH | DCA_EXSS_X96))
+        *profile = FF_PROFILE_DTS_HD_HRA;
+
     return 0;
 }
 
@@ -161,12 +321,20 @@ static int dca_parse(AVCodecParserContext *s, AVCodecContext *avctx,
             *poutbuf_size = 0;
             return buf_size;
         }
+
+        /* skip initial padding */
+        if (buf_size  > pc1->startpos) {
+            buf      += pc1->startpos;
+            buf_size -= pc1->startpos;
+        }
+        pc1->startpos = 0;
     }
 
     /* read the duration and sample rate from the frame header */
-    if (!dca_parse_params(buf, buf_size, &duration, &sample_rate, &pc1->framesize)) {
-        s->duration        = duration;
-        avctx->sample_rate = sample_rate;
+    if (!dca_parse_params(pc1, buf, buf_size, &duration, &sample_rate, &avctx->profile)) {
+        if (!avctx->sample_rate)
+            avctx->sample_rate = sample_rate;
+        s->duration = av_rescale(duration, avctx->sample_rate, sample_rate);
     } else
         s->duration = 0;
 
diff --git a/libavcodec/dca_syncwords.h b/libavcodec/dca_syncwords.h
index 07b60e0..4d2cd5f 100644
--- a/libavcodec/dca_syncwords.h
+++ b/libavcodec/dca_syncwords.h
@@ -1,37 +1,36 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DCA_SYNCWORDS_H
 #define AVCODEC_DCA_SYNCWORDS_H
 
-enum DCASyncwords {
-    DCA_SYNCWORD_CORE_BE        = 0x7FFE8001U,
-    DCA_SYNCWORD_CORE_LE        = 0xFE7F0180U,
-    DCA_SYNCWORD_CORE_14B_BE    = 0x1FFFE800U,
-    DCA_SYNCWORD_CORE_14B_LE    = 0xFF1F00E8U,
-    DCA_SYNCWORD_XCH            = 0x5A5A5A5AU,
-    DCA_SYNCWORD_XXCH           = 0x47004A03U,
-    DCA_SYNCWORD_X96            = 0x1D95F262U,
-    DCA_SYNCWORD_XBR            = 0x655E315EU,
-    DCA_SYNCWORD_LBR            = 0x0A801921U,
-    DCA_SYNCWORD_XLL            = 0x41A29547U,
-    DCA_SYNCWORD_SUBSTREAM      = 0x64582025U,
-    DCA_SYNCWORD_SUBSTREAM_CORE = 0x02B09261U,
-};
+#define    DCA_SYNCWORD_CORE_BE              0x7FFE8001U
+#define    DCA_SYNCWORD_CORE_LE              0xFE7F0180U
+#define    DCA_SYNCWORD_CORE_14B_BE          0x1FFFE800U
+#define    DCA_SYNCWORD_CORE_14B_LE          0xFF1F00E8U
+#define    DCA_SYNCWORD_XCH                  0x5A5A5A5AU
+#define    DCA_SYNCWORD_XXCH                 0x47004A03U
+#define    DCA_SYNCWORD_X96                  0x1D95F262U
+#define    DCA_SYNCWORD_XBR                  0x655E315EU
+#define    DCA_SYNCWORD_LBR                  0x0A801921U
+#define    DCA_SYNCWORD_XLL                  0x41A29547U
+#define    DCA_SYNCWORD_SUBSTREAM            0x64582025U
+#define    DCA_SYNCWORD_SUBSTREAM_CORE       0x02B09261U
+#define    DCA_SYNCWORD_REV1AUX              0x9A1105A0U
 
 #endif /* AVCODEC_DCA_SYNCWORDS_H */
diff --git a/libavcodec/dca_xll.c b/libavcodec/dca_xll.c
index 5d76793..d265cab 100644
--- a/libavcodec/dca_xll.c
+++ b/libavcodec/dca_xll.c
@@ -1,747 +1,1493 @@
 /*
- * DCA XLL extension
+ * Copyright (C) 2016 foo86
  *
- * Copyright (C) 2012 Paul B Mahol
- * Copyright (C) 2014 Niels Möller
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/attributes.h"
-#include "libavutil/common.h"
-#include "libavutil/internal.h"
-
-#include "avcodec.h"
-#include "dca.h"
+#include "dcadec.h"
 #include "dcadata.h"
-#include "get_bits.h"
-#include "unary_legacy.h"
+#include "dcamath.h"
+#include "dca_syncwords.h"
+#include "unary.h"
 
-/* Sign as bit 0 */
-static inline int get_bits_sm(GetBitContext *s, unsigned n)
+static int get_linear(GetBitContext *gb, int n)
 {
-    int x = get_bits(s, n);
-    if (x & 1)
-        return -(x >> 1) - 1;
-    else
-        return x >> 1;
-}
-
-/* Return -1 on error. */
-static int32_t get_dmix_coeff(DCAContext *s, int inverse)
-{
-    unsigned code = get_bits(&s->gb, 9);
-    int32_t sign = (int32_t) (code >> 8) - 1;
-    unsigned idx = code & 0xff;
-    int inv_offset = FF_DCA_DMIXTABLE_SIZE -FF_DCA_INV_DMIXTABLE_SIZE;
-    if (idx >= FF_DCA_DMIXTABLE_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "XLL: Invalid channel set downmix code %x\n", code);
-        return -1;
-    } else if (!inverse) {
-        return (ff_dca_dmixtable[idx] ^ sign) - sign;
-    } else if (idx < inv_offset) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "XLL: Invalid channel set inverse downmix code %x\n", code);
-        return -1;
-    } else {
-        return (ff_dca_inv_dmixtable[idx - inv_offset] ^ sign) - sign;
-    }
+    unsigned int v = get_bits_long(gb, n);
+    return (v >> 1) ^ -(v & 1);
+}
+
+static int get_rice_un(GetBitContext *gb, int k)
+{
+    unsigned int v = get_unary(gb, 1, get_bits_left(gb));
+    return (v << k) | get_bits_long(gb, k);
 }
 
-static int32_t dca_get_dmix_coeff(DCAContext *s)
+static int get_rice(GetBitContext *gb, int k)
 {
-    return get_dmix_coeff(s, 0);
+    unsigned int v = get_rice_un(gb, k);
+    return (v >> 1) ^ -(v & 1);
 }
 
-static int32_t dca_get_inv_dmix_coeff(DCAContext *s)
+static void get_array(GetBitContext *gb, int32_t *array, int size, int n)
 {
-    return get_dmix_coeff(s, 1);
+    int i;
+
+    for (i = 0; i < size; i++)
+        array[i] = get_bits(gb, n);
 }
 
-/* parse XLL header */
-int ff_dca_xll_decode_header(DCAContext *s)
+static void get_linear_array(GetBitContext *gb, int32_t *array, int size, int n)
 {
-    int hdr_pos, hdr_size;
-    av_unused int version, frame_size;
-    int i, chset_index;
+    int i;
 
-    /* get bit position of sync header */
-    hdr_pos    = get_bits_count(&s->gb) - 32;
+    if (n == 0)
+        memset(array, 0, sizeof(*array) * size);
+    else for (i = 0; i < size; i++)
+        array[i] = get_linear(gb, n);
+}
 
-    version    = get_bits(&s->gb, 4) + 1;
-    hdr_size   = get_bits(&s->gb, 8) + 1;
+static void get_rice_array(GetBitContext *gb, int32_t *array, int size, int k)
+{
+    int i;
 
-    frame_size = get_bits_long(&s->gb, get_bits(&s->gb, 5) + 1) + 1;
+    for (i = 0; i < size; i++)
+        array[i] = get_rice(gb, k);
+}
 
-    s->xll_channels          =
-    s->xll_residual_channels = 0;
-    s->xll_nch_sets          = get_bits(&s->gb, 4) + 1;
-    s->xll_segments          = 1 << get_bits(&s->gb, 4);
-    s->xll_log_smpl_in_seg   = get_bits(&s->gb, 4);
-    s->xll_smpl_in_seg       = 1 << s->xll_log_smpl_in_seg;
-    s->xll_bits4seg_size     = get_bits(&s->gb, 5) + 1;
-    s->xll_banddata_crc      = get_bits(&s->gb, 2);
-    s->xll_scalable_lsb      = get_bits1(&s->gb);
-    s->xll_bits4ch_mask      = get_bits(&s->gb, 5) + 1;
+static int parse_dmix_coeffs(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    // Size of downmix coefficient matrix
+    int m = c->primary_chset ? ff_dca_dmix_primary_nch[c->dmix_type] : c->hier_ofs;
+    int i, j, *coeff_ptr = c->dmix_coeff;
+
+    for (i = 0; i < m; i++) {
+        int code, sign, coeff, scale, scale_inv = 0;
+        unsigned int index;
+
+        // Downmix scale (only for non-primary channel sets)
+        if (!c->primary_chset) {
+            code = get_bits(&s->gb, 9);
+            sign = (code >> 8) - 1;
+            index = (code & 0xff) - FF_DCA_DMIXTABLE_OFFSET;
+            if (index >= FF_DCA_INV_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL downmix scale index\n");
+                return AVERROR_INVALIDDATA;
+            }
+            scale = ff_dca_dmixtable[index + FF_DCA_DMIXTABLE_OFFSET];
+            scale_inv = ff_dca_inv_dmixtable[index];
+            c->dmix_scale[i] = (scale ^ sign) - sign;
+            c->dmix_scale_inv[i] = (scale_inv ^ sign) - sign;
+        }
 
-    if (s->xll_scalable_lsb) {
-        s->xll_fixed_lsb_width = get_bits(&s->gb, 4);
-        if (s->xll_fixed_lsb_width)
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "XLL: fixed lsb width = %d, non-zero not supported.\n",
-                   s->xll_fixed_lsb_width);
+        // Downmix coefficients
+        for (j = 0; j < c->nchannels; j++) {
+            code = get_bits(&s->gb, 9);
+            sign = (code >> 8) - 1;
+            index = code & 0xff;
+            if (index >= FF_DCA_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL downmix coefficient index\n");
+                return AVERROR_INVALIDDATA;
+            }
+            coeff = ff_dca_dmixtable[index];
+            if (!c->primary_chset)
+                // Multiply by |InvDmixScale| to get |UndoDmixScale|
+                coeff = mul16(scale_inv, coeff);
+            *coeff_ptr++ = (coeff ^ sign) - sign;
+        }
     }
-    /* skip to the end of the common header */
-    i = get_bits_count(&s->gb);
-    if (hdr_pos + hdr_size * 8 > i)
-        skip_bits_long(&s->gb, hdr_pos + hdr_size * 8 - i);
 
-    for (chset_index = 0; chset_index < s->xll_nch_sets; chset_index++) {
-        XllChSetSubHeader *chset = &s->xll_chsets[chset_index];
-        hdr_pos  = get_bits_count(&s->gb);
-        hdr_size = get_bits(&s->gb, 10) + 1;
+    return 0;
+}
+
+static int chs_parse_header(DCAXllDecoder *s, DCAXllChSet *c, DCAExssAsset *asset)
+{
+    int i, j, k, ret, band, header_size, header_pos = get_bits_count(&s->gb);
+    DCAXllChSet *p = &s->chset[0];
+    DCAXllBand *b;
 
-        chset->channels           = get_bits(&s->gb, 4) + 1;
-        chset->residual_encode    = get_bits(&s->gb, chset->channels);
-        chset->bit_resolution     = get_bits(&s->gb, 5) + 1;
-        chset->bit_width          = get_bits(&s->gb, 5) + 1;
-        chset->sampling_frequency = ff_dca_sampling_freqs[get_bits(&s->gb, 4)];
-        chset->samp_freq_interp   = get_bits(&s->gb, 2);
-        chset->replacement_set    = get_bits(&s->gb, 2);
-        if (chset->replacement_set)
-            chset->active_replace_set = get_bits(&s->gb, 1);
+    // Size of channel set sub-header
+    header_size = get_bits(&s->gb, 10) + 1;
 
-        if (s->one2one_map_chtospkr) {
-            chset->primary_ch_set              = get_bits(&s->gb, 1);
-            chset->downmix_coeff_code_embedded = get_bits(&s->gb, 1);
-            if (chset->downmix_coeff_code_embedded) {
-                chset->downmix_embedded = get_bits(&s->gb, 1);
-                if (chset->primary_ch_set) {
-                    chset->downmix_type = get_bits(&s->gb, 3);
-                    if (chset->downmix_type > 6) {
-                        av_log(s->avctx, AV_LOG_ERROR,
-                               "XLL: Invalid channel set downmix type\n");
-                        return AVERROR_INVALIDDATA;
-                    }
-                }
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL sub-header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of channels in the channel set
+    c->nchannels = get_bits(&s->gb, 4) + 1;
+    if (c->nchannels > DCA_XLL_CHANNELS_MAX) {
+        avpriv_request_sample(s->avctx, "%d XLL channels", c->nchannels);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Residual type
+    c->residual_encode = get_bits(&s->gb, c->nchannels);
+
+    // PCM bit resolution
+    c->pcm_bit_res = get_bits(&s->gb, 5) + 1;
+
+    // Storage unit width
+    c->storage_bit_res = get_bits(&s->gb, 5) + 1;
+    if (c->storage_bit_res != 16 && c->storage_bit_res != 20 && c->storage_bit_res != 24) {
+        avpriv_request_sample(s->avctx, "%d-bit XLL storage resolution", c->storage_bit_res);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (c->pcm_bit_res > c->storage_bit_res) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid PCM bit resolution for XLL channel set (%d > %d)\n", c->pcm_bit_res, c->storage_bit_res);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Original sampling frequency
+    c->freq = ff_dca_sampling_freqs[get_bits(&s->gb, 4)];
+    if (c->freq > 192000) {
+        avpriv_request_sample(s->avctx, "%d Hz XLL sampling frequency", c->freq);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Sampling frequency modifier
+    if (get_bits(&s->gb, 2)) {
+        avpriv_request_sample(s->avctx, "XLL sampling frequency modifier");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Which replacement set this channel set is member of
+    if (get_bits(&s->gb, 2)) {
+        avpriv_request_sample(s->avctx, "XLL replacement set");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (asset->one_to_one_map_ch_to_spkr) {
+        // Primary channel set flag
+        c->primary_chset = get_bits1(&s->gb);
+        if (c->primary_chset != (c == p)) {
+            av_log(s->avctx, AV_LOG_ERROR, "The first (and only) XLL channel set must be primary\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Downmix coefficients present in stream
+        c->dmix_coeffs_present = get_bits1(&s->gb);
+
+        // Downmix already performed by encoder
+        c->dmix_embedded = c->dmix_coeffs_present && get_bits1(&s->gb);
+
+        // Downmix type
+        if (c->dmix_coeffs_present && c->primary_chset) {
+            c->dmix_type = get_bits(&s->gb, 3);
+            if (c->dmix_type >= DCA_DMIX_TYPE_COUNT) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL primary channel set downmix type\n");
+                return AVERROR_INVALIDDATA;
             }
-            chset->hier_chset = get_bits(&s->gb, 1);
-
-            if (chset->downmix_coeff_code_embedded) {
-                /* nDownmixCoeffs is specified as N * M. For a primary
-                 * channel set, it appears that N = number of
-                 * channels, and M is the number of downmix channels.
-                 *
-                 * For a non-primary channel set, N is specified as
-                 * number of channels + 1, and M is derived from the
-                 * channel set hierarchy, and at least in simple cases
-                 * M is the number of channels in preceding channel
-                 * sets. */
-                if (chset->primary_ch_set) {
-                    static const char dmix_table[7] = { 1, 2, 2, 3, 3, 4, 4 };
-                    chset->downmix_ncoeffs = chset->channels * dmix_table[chset->downmix_type];
-                } else
-                    chset->downmix_ncoeffs = (chset->channels + 1) * s->xll_channels;
-
-                if (chset->downmix_ncoeffs > DCA_XLL_DMIX_NCOEFFS_MAX) {
-                    avpriv_request_sample(s->avctx,
-                                          "XLL: More than %d downmix coefficients",
-                                          DCA_XLL_DMIX_NCOEFFS_MAX);
-                    return AVERROR_PATCHWELCOME;
-                } else if (chset->primary_ch_set) {
-                    for (i = 0; i < chset->downmix_ncoeffs; i++)
-                        if ((chset->downmix_coeffs[i] = dca_get_dmix_coeff(s)) == -1)
-                            return AVERROR_INVALIDDATA;
-                } else {
-                    unsigned c, r;
-                    for (c = 0, i = 0; c < s->xll_channels; c++, i += chset->channels + 1) {
-                        if ((chset->downmix_coeffs[i] = dca_get_inv_dmix_coeff(s)) == -1)
-                            return AVERROR_INVALIDDATA;
-                        for (r = 1; r <= chset->channels; r++) {
-                            int32_t coeff = dca_get_dmix_coeff(s);
-                            if (coeff == -1)
-                                return AVERROR_INVALIDDATA;
-                            chset->downmix_coeffs[i + r] =
-                                (chset->downmix_coeffs[i] * (int64_t) coeff + (1 << 15)) >> 16;
-                        }
-                    }
+        }
+
+        // Whether the channel set is part of a hierarchy
+        c->hier_chset = get_bits1(&s->gb);
+        if (!c->hier_chset && s->nchsets != 1) {
+            avpriv_request_sample(s->avctx, "XLL channel set outside of hierarchy");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        // Downmix coefficients
+        if (c->dmix_coeffs_present && (ret = parse_dmix_coeffs(s, c)) < 0)
+            return ret;
+
+        // Channel mask enabled
+        if (!get_bits1(&s->gb)) {
+            avpriv_request_sample(s->avctx, "Disabled XLL channel mask");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        // Channel mask for set
+        c->ch_mask = get_bits_long(&s->gb, s->ch_mask_nbits);
+        if (av_popcount(c->ch_mask) != c->nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL channel mask\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Build the channel to speaker map
+        for (i = 0, j = 0; i < s->ch_mask_nbits; i++)
+            if (c->ch_mask & (1U << i))
+                c->ch_remap[j++] = i;
+    } else {
+        // Mapping coeffs present flag
+        if (c->nchannels != 2 || s->nchsets != 1 || get_bits1(&s->gb)) {
+            avpriv_request_sample(s->avctx, "Custom XLL channel to speaker mapping");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        // Setup for LtRt decoding
+        c->primary_chset = 1;
+        c->dmix_coeffs_present = 0;
+        c->dmix_embedded = 0;
+        c->hier_chset = 0;
+        c->ch_mask = DCA_SPEAKER_LAYOUT_STEREO;
+        c->ch_remap[0] = DCA_SPEAKER_L;
+        c->ch_remap[1] = DCA_SPEAKER_R;
+    }
+
+    if (c->freq > 96000) {
+        // Extra frequency bands flag
+        if (get_bits1(&s->gb)) {
+            avpriv_request_sample(s->avctx, "Extra XLL frequency bands");
+            return AVERROR_PATCHWELCOME;
+        }
+        c->nfreqbands = 2;
+    } else {
+        c->nfreqbands = 1;
+    }
+
+    // Set the sampling frequency to that of the first frequency band.
+    // Frequency will be doubled again after bands assembly.
+    c->freq >>= c->nfreqbands - 1;
+
+    // Verify that all channel sets have the same audio characteristics
+    if (c != p && (c->nfreqbands != p->nfreqbands || c->freq != p->freq
+                   || c->pcm_bit_res != p->pcm_bit_res
+                   || c->storage_bit_res != p->storage_bit_res)) {
+        avpriv_request_sample(s->avctx, "Different XLL audio characteristics");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Determine number of bits to read bit allocation coding parameter
+    if (c->storage_bit_res > 16)
+        c->nabits = 5;
+    else if (c->storage_bit_res > 8)
+        c->nabits = 4;
+    else
+        c->nabits = 3;
+
+    // Account for embedded downmix and decimator saturation
+    if ((s->nchsets > 1 || c->nfreqbands > 1) && c->nabits < 5)
+        c->nabits++;
+
+    for (band = 0, b = c->bands; band < c->nfreqbands; band++, b++) {
+        // Pairwise channel decorrelation
+        if ((b->decor_enabled = get_bits1(&s->gb)) && c->nchannels > 1) {
+            int ch_nbits = av_ceil_log2(c->nchannels);
+
+            // Original channel order
+            for (i = 0; i < c->nchannels; i++) {
+                b->orig_order[i] = get_bits(&s->gb, ch_nbits);
+                if (b->orig_order[i] >= c->nchannels) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL original channel order\n");
+                    return AVERROR_INVALIDDATA;
                 }
             }
-            chset->ch_mask_enabled = get_bits(&s->gb, 1);
-            if (chset->ch_mask_enabled)
-                chset->ch_mask = get_bits(&s->gb, s->xll_bits4ch_mask);
-            else
-                /* Skip speaker configuration bits */
-                skip_bits_long(&s->gb, 25 * chset->channels);
+
+            // Pairwise channel coefficients
+            for (i = 0; i < c->nchannels / 2; i++)
+                b->decor_coeff[i] = get_bits1(&s->gb) ? get_linear(&s->gb, 7) : 0;
         } else {
-            chset->primary_ch_set              = 1;
-            chset->downmix_coeff_code_embedded = 0;
-            /* Spec: NumChHierChSet = 0, NumDwnMixCodeCoeffs = 0, whatever that means. */
-            chset->mapping_coeffs_present = get_bits(&s->gb, 1);
-            if (chset->mapping_coeffs_present) {
-                avpriv_report_missing_feature(s->avctx, "XLL: mapping coefficients");
-                return AVERROR_PATCHWELCOME;
-            }
+            for (i = 0; i < c->nchannels; i++)
+                b->orig_order[i] = i;
+            for (i = 0; i < c->nchannels / 2; i++)
+                b->decor_coeff[i] = 0;
         }
-        if (chset->sampling_frequency > 96000)
-            chset->num_freq_bands = 2 * (1 + get_bits(&s->gb, 1));
-        else
-            chset->num_freq_bands = 1;
 
-        if (chset->num_freq_bands > 1) {
-            avpriv_report_missing_feature(s->avctx, "XLL: num_freq_bands > 1");
-            return AVERROR_PATCHWELCOME;
+        // Adaptive predictor order
+        b->highest_pred_order = 0;
+        for (i = 0; i < c->nchannels; i++) {
+            b->adapt_pred_order[i] = get_bits(&s->gb, 4);
+            if (b->adapt_pred_order[i] > b->highest_pred_order)
+                b->highest_pred_order = b->adapt_pred_order[i];
+        }
+        if (b->highest_pred_order > s->nsegsamples) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL adaptive predicition order\n");
+            return AVERROR_INVALIDDATA;
         }
 
-        if (get_bits(&s->gb, 1)) { /* pw_ch_decor_enabled */
-            int bits = av_ceil_log2(chset->channels);
-            for (i = 0; i < chset->channels; i++) {
-                unsigned j = get_bits(&s->gb, bits);
-                if (j >= chset->channels) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Original channel order value %u too large, only %d channels.\n",
-                           j, chset->channels);
+        // Fixed predictor order
+        for (i = 0; i < c->nchannels; i++)
+            b->fixed_pred_order[i] = b->adapt_pred_order[i] ? 0 : get_bits(&s->gb, 2);
+
+        // Adaptive predictor quantized reflection coefficients
+        for (i = 0; i < c->nchannels; i++) {
+            for (j = 0; j < b->adapt_pred_order[i]; j++) {
+                k = get_linear(&s->gb, 8);
+                if (k == -128) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL reflection coefficient index\n");
                     return AVERROR_INVALIDDATA;
                 }
-                chset->orig_chan_order[0][i]     = j;
-                chset->orig_chan_order_inv[0][j] = i;
-            }
-            for (i = 0; i < chset->channels / 2; i++) {
-                if (get_bits(&s->gb, 1)) /* bChPFlag */
-                    chset->pw_ch_pairs_coeffs[0][i] = get_bits_sm(&s->gb, 7);
+                if (k < 0)
+                    b->adapt_refl_coeff[i][j] = -(int)ff_dca_xll_refl_coeff[-k];
                 else
-                    chset->pw_ch_pairs_coeffs[0][i] = 0;
+                    b->adapt_refl_coeff[i][j] =  (int)ff_dca_xll_refl_coeff[ k];
             }
-        } else {
-            for (i = 0; i < chset->channels; i++)
-                chset->orig_chan_order[0][i]     =
-                chset->orig_chan_order_inv[0][i] = i;
-            for (i = 0; i < chset->channels / 2; i++)
-                chset->pw_ch_pairs_coeffs[0][i] = 0;
-        }
-        /* Adaptive prediction order */
-        chset->adapt_order_max[0] = 0;
-        for (i = 0; i < chset->channels; i++) {
-            chset->adapt_order[0][i] = get_bits(&s->gb, 4);
-            if (chset->adapt_order_max[0] < chset->adapt_order[0][i])
-                chset->adapt_order_max[0] = chset->adapt_order[0][i];
-        }
-        /* Fixed prediction order, used in case the adaptive order
-         * above is zero */
-        for (i = 0; i < chset->channels; i++)
-            chset->fixed_order[0][i] =
-                chset->adapt_order[0][i] ? 0 : get_bits(&s->gb, 2);
-
-        for (i = 0; i < chset->channels; i++) {
-            unsigned j;
-            for (j = 0; j < chset->adapt_order[0][i]; j++)
-                chset->lpc_refl_coeffs_q_ind[0][i][j] = get_bits(&s->gb, 8);
-        }
-
-        if (s->xll_scalable_lsb) {
-            chset->lsb_fsize[0] = get_bits(&s->gb, s->xll_bits4seg_size);
-
-            for (i = 0; i < chset->channels; i++)
-                chset->scalable_lsbs[0][i] = get_bits(&s->gb, 4);
-            for (i = 0; i < chset->channels; i++)
-                chset->bit_width_adj_per_ch[0][i] = get_bits(&s->gb, 4);
-        } else {
-            memset(chset->scalable_lsbs[0], 0,
-                   chset->channels * sizeof(chset->scalable_lsbs[0][0]));
-            memset(chset->bit_width_adj_per_ch[0], 0,
-                   chset->channels * sizeof(chset->bit_width_adj_per_ch[0][0]));
         }
 
-        s->xll_channels          += chset->channels;
-        s->xll_residual_channels += chset->channels -
-                                    av_popcount(chset->residual_encode);
+        // Downmix performed by encoder in extension frequency band
+        b->dmix_embedded = c->dmix_embedded && (band == 0 || get_bits1(&s->gb));
+
+        // MSB/LSB split flag in extension frequency band
+        if ((band == 0 && s->scalable_lsbs) || (band != 0 && get_bits1(&s->gb))) {
+            // Size of LSB section in any segment
+            b->lsb_section_size = get_bits_long(&s->gb, s->seg_size_nbits);
+            if (b->lsb_section_size < 0 || b->lsb_section_size > s->frame_size) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid LSB section size\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-        /* FIXME: Parse header data for extra frequency bands. */
+            // Account for optional CRC bytes after LSB section
+            if (b->lsb_section_size && (s->band_crc_present > 2 ||
+                                        (band == 0 && s->band_crc_present > 1)))
+                b->lsb_section_size += 2;
 
-        /* Skip to end of channel set sub header. */
-        i = get_bits_count(&s->gb);
-        if (hdr_pos + 8 * hdr_size < i) {
-            av_log(s->avctx, AV_LOG_ERROR,
-                   "chset header too large, %d bits, should be <= %d bits\n",
-                   i - hdr_pos, 8 * hdr_size);
-            return AVERROR_INVALIDDATA;
+            // Number of bits to represent the samples in LSB part
+            for (i = 0; i < c->nchannels; i++) {
+                b->nscalablelsbs[i] = get_bits(&s->gb, 4);
+                if (b->nscalablelsbs[i] && !b->lsb_section_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, "LSB section missing with non-zero LSB width\n");
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        } else {
+            b->lsb_section_size = 0;
+            for (i = 0; i < c->nchannels; i++)
+                b->nscalablelsbs[i] = 0;
+        }
+
+        // Scalable resolution flag in extension frequency band
+        if ((band == 0 && s->scalable_lsbs) || (band != 0 && get_bits1(&s->gb))) {
+            // Number of bits discarded by authoring
+            for (i = 0; i < c->nchannels; i++)
+                b->bit_width_adjust[i] = get_bits(&s->gb, 4);
+        } else {
+            for (i = 0; i < c->nchannels; i++)
+                b->bit_width_adjust[i] = 0;
         }
-        if (hdr_pos + 8 * hdr_size > i)
-            skip_bits_long(&s->gb, hdr_pos + 8 * hdr_size - i);
     }
+
+    // Reserved
+    // Byte align
+    // CRC16 of channel set sub-header
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL sub-header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
-/* parse XLL navigation table */
-int ff_dca_xll_decode_navi(DCAContext *s, int asset_end)
+static int chs_alloc_msb_band_data(DCAXllDecoder *s, DCAXllChSet *c)
 {
-    int nbands, band, chset, seg, data_start;
-
-    /* FIXME: Supports only a single frequency band */
-    nbands = 1;
-
-    for (band = 0; band < nbands; band++) {
-        s->xll_navi.band_size[band] = 0;
-        for (seg = 0; seg < s->xll_segments; seg++) {
-            /* Note: The spec, ETSI TS 102 114 V1.4.1 (2012-09), says
-             * we should read a base value for segment_size from the
-             * stream, before reading the sizes of the channel sets.
-             * But that's apparently incorrect. */
-            s->xll_navi.segment_size[band][seg] = 0;
+    int ndecisamples = c->nfreqbands > 1 ? DCA_XLL_DECI_HISTORY_MAX : 0;
+    int nchsamples = s->nframesamples + ndecisamples;
+    int i, j, nsamples = nchsamples * c->nchannels * c->nfreqbands;
+    int32_t *ptr;
+
+    // Reallocate MSB sample buffer
+    av_fast_malloc(&c->sample_buffer[0], &c->sample_size[0], nsamples * sizeof(int32_t));
+    if (!c->sample_buffer[0])
+        return AVERROR(ENOMEM);
 
-            for (chset = 0; chset < s->xll_nch_sets; chset++)
-                if (band < s->xll_chsets[chset].num_freq_bands) {
-                    s->xll_navi.chset_size[band][seg][chset] =
-                        get_bits(&s->gb, s->xll_bits4seg_size) + 1;
-                    s->xll_navi.segment_size[band][seg] +=
-                        s->xll_navi.chset_size[band][seg][chset];
-                }
-            s->xll_navi.band_size[band] += s->xll_navi.segment_size[band][seg];
+    ptr = c->sample_buffer[0] + ndecisamples;
+    for (i = 0; i < c->nfreqbands; i++) {
+        for (j = 0; j < c->nchannels; j++) {
+            c->bands[i].msb_sample_buffer[j] = ptr;
+            ptr += nchsamples;
         }
     }
-    /* Align to 8 bits and skip 16-bit CRC. */
-    skip_bits_long(&s->gb, 16 + ((-get_bits_count(&s->gb)) & 7));
 
-    data_start = get_bits_count(&s->gb);
-    if (data_start + 8 * s->xll_navi.band_size[0] > asset_end) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "XLL: Data in NAVI table exceeds containing asset\n"
-               "start: %d (bit), size %u (bytes), end %d (bit), error %u\n",
-               data_start, s->xll_navi.band_size[0], asset_end,
-               data_start + 8 * s->xll_navi.band_size[0] - asset_end);
-        return AVERROR_INVALIDDATA;
-    }
-    init_get_bits(&s->xll_navi.gb, s->gb.buffer + data_start / 8,
-                  8 * s->xll_navi.band_size[0]);
     return 0;
 }
 
-static void dca_xll_inv_adapt_pred(int *samples, int nsamples, unsigned order,
-                                   const int *prev, const uint8_t *q_ind)
-{
-    static const uint16_t table[0x81] = {
-            0,  3070,  5110,  7140,  9156, 11154, 13132, 15085,
-        17010, 18904, 20764, 22588, 24373, 26117, 27818, 29474,
-        31085, 32648, 34164, 35631, 37049, 38418, 39738, 41008,
-        42230, 43404, 44530, 45609, 46642, 47630, 48575, 49477,
-        50337, 51157, 51937, 52681, 53387, 54059, 54697, 55302,
-        55876, 56421, 56937, 57426, 57888, 58326, 58741, 59132,
-        59502, 59852, 60182, 60494, 60789, 61066, 61328, 61576,
-        61809, 62029, 62236, 62431, 62615, 62788, 62951, 63105,
-        63250, 63386, 63514, 63635, 63749, 63855, 63956, 64051,
-        64140, 64224, 64302, 64376, 64446, 64512, 64573, 64631,
-        64686, 64737, 64785, 64830, 64873, 64913, 64950, 64986,
-        65019, 65050, 65079, 65107, 65133, 65157, 65180, 65202,
-        65222, 65241, 65259, 65275, 65291, 65306, 65320, 65333,
-        65345, 65357, 65368, 65378, 65387, 65396, 65405, 65413,
-        65420, 65427, 65434, 65440, 65446, 65451, 65456, 65461,
-        65466, 65470, 65474, 65478, 65481, 65485, 65488, 65491,
-        65535, /* Final value is for the -128 corner case, see below. */
-    };
-    int c[DCA_XLL_AORDER_MAX];
-    int64_t s;
-    unsigned i, j;
-
-    for (i = 0; i < order; i++) {
-        if (q_ind[i] & 1)
-            /* The index value 0xff corresponds to a lookup of entry 0x80 in
-             * the table, and no value is provided in the specification. */
-            c[i] = -table[(q_ind[i] >> 1) + 1];
-        else
-            c[i] = table[q_ind[i] >> 1];
-    }
-    /* The description in the spec is a bit convoluted. We can convert
-     * the reflected values to direct values in place, using a
-     * sequence of reflections operating on two values. */
-    for (i = 1; i < order; i++) {
-        /* i = 1: scale c[0]
-         * i = 2: reflect c[0] <-> c[1]
-         * i = 3: scale c[1], reflect c[0] <-> c[2]
-         * i = 4: reflect c[0] <-> c[3] reflect c[1] <-> c[2]
-         * ... */
-        if (i & 1)
-            c[i / 2] += ((int64_t) c[i] * c[i / 2] + 0x8000) >> 16;
-        for (j = 0; j < i / 2; j++) {
-            int r0 = c[j];
-            int r1 = c[i - j - 1];
-            c[j]         += ((int64_t) c[i] * r1 + 0x8000) >> 16;
-            c[i - j - 1] += ((int64_t) c[i] * r0 + 0x8000) >> 16;
-        }
-    }
-    /* Apply predictor. */
-    /* NOTE: Processing samples in this order means that the
-     * predictor is applied to the newly reconstructed samples. */
-    if (prev) {
-        for (i = 0; i < order; i++) {
-            for (j = s = 0; j < i; j++)
-                s += (int64_t) c[j] * samples[i - 1 - j];
-            for (; j < order; j++)
-                s += (int64_t) c[j] * prev[DCA_XLL_AORDER_MAX + i - 1 - j];
-
-            samples[i] -= av_clip((s + 0x8000) >> 16, -0x1000000, 0xffffff);
-        }
-    }
-    for (i = order; i < nsamples; i++) {
-        for (j = s = 0; j < order; j++)
-            s += (int64_t) c[j] * samples[i - 1 - j];
-
-        /* NOTE: Equations seem to imply addition, while the
-         * pseudocode seems to use subtraction.*/
-        samples[i] -= av_clip((s + 0x8000) >> 16, -0x1000000, 0xffffff);
-    }
-}
-
-int ff_dca_xll_decode_audio(DCAContext *s, AVFrame *frame)
-{
-    /* FIXME: Decodes only the first frequency band. */
-    int seg, chset_i;
-
-    /* Coding parameters for each channel set. */
-    struct coding_params {
-        int seg_type;
-        int rice_code_flag[16];
-        int pancAuxABIT[16];
-        int pancABIT0[16];  /* Not sure what this is */
-        int pancABIT[16];   /* Not sure what this is */
-        int nSamplPart0[16];
-    } param_state[16];
-
-    GetBitContext *gb = &s->xll_navi.gb;
-    int *history;
-
-    /* Layout: First the sample buffer for one segment per channel,
-     * followed by history buffers of DCA_XLL_AORDER_MAX samples for
-     * each channel. */
-    av_fast_malloc(&s->xll_sample_buf, &s->xll_sample_buf_size,
-                   (s->xll_smpl_in_seg + DCA_XLL_AORDER_MAX) *
-                   s->xll_channels * sizeof(*s->xll_sample_buf));
-    if (!s->xll_sample_buf)
+static int chs_alloc_lsb_band_data(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    int i, j, nsamples = 0;
+    int32_t *ptr;
+
+    // Determine number of frequency bands that have MSB/LSB split
+    for (i = 0; i < c->nfreqbands; i++)
+        if (c->bands[i].lsb_section_size)
+            nsamples += s->nframesamples * c->nchannels;
+    if (!nsamples)
+        return 0;
+
+    // Reallocate LSB sample buffer
+    av_fast_malloc(&c->sample_buffer[1], &c->sample_size[1], nsamples * sizeof(int32_t));
+    if (!c->sample_buffer[1])
         return AVERROR(ENOMEM);
 
-    history = s->xll_sample_buf + s->xll_smpl_in_seg * s->xll_channels;
-
-    for (seg = 0; seg < s->xll_segments; seg++) {
-        unsigned in_channel;
-
-        for (chset_i = in_channel = 0; chset_i < s->xll_nch_sets; chset_i++) {
-            /* The spec isn't very explicit, but I think the NAVI sizes are in bytes. */
-            int end_pos = get_bits_count(gb) +
-                          8 * s->xll_navi.chset_size[0][seg][chset_i];
-            int i, j;
-            struct coding_params *params = &param_state[chset_i];
-            /* I think this flag means that we should keep seg_type and
-             * other parameters from the previous segment. */
-            int use_seg_state_code_param;
-            XllChSetSubHeader *chset = &s->xll_chsets[chset_i];
-            if (in_channel >= s->avctx->channels)
-                /* FIXME: Could go directly to next segment */
-                goto next_chset;
-
-            if (s->avctx->sample_rate != chset->sampling_frequency) {
-                av_log(s->avctx, AV_LOG_WARNING,
-                       "XLL: unexpected chset sample rate %d, expected %d\n",
-                       chset->sampling_frequency, s->avctx->sample_rate);
-                goto next_chset;
+    ptr = c->sample_buffer[1];
+    for (i = 0; i < c->nfreqbands; i++) {
+        if (c->bands[i].lsb_section_size) {
+            for (j = 0; j < c->nchannels; j++) {
+                c->bands[i].lsb_sample_buffer[j] = ptr;
+                ptr += s->nframesamples;
             }
-            if (seg != 0)
-                use_seg_state_code_param = get_bits(gb, 1);
+        } else {
+            for (j = 0; j < c->nchannels; j++)
+                c->bands[i].lsb_sample_buffer[j] = NULL;
+        }
+    }
+
+    return 0;
+}
+
+static int chs_parse_band_data(DCAXllDecoder *s, DCAXllChSet *c, int band, int seg, int band_data_end)
+{
+    DCAXllBand *b = &c->bands[band];
+    int i, j, k;
+
+    // Start unpacking MSB portion of the segment
+    if (!(seg && get_bits1(&s->gb))) {
+        // Unpack segment type
+        // 0 - distinct coding parameters for each channel
+        // 1 - common coding parameters for all channels
+        c->seg_common = get_bits1(&s->gb);
+
+        // Determine number of coding parameters encoded in segment
+        k = c->seg_common ? 1 : c->nchannels;
+
+        // Unpack Rice coding parameters
+        for (i = 0; i < k; i++) {
+            // Unpack Rice coding flag
+            // 0 - linear code, 1 - Rice code
+            c->rice_code_flag[i] = get_bits1(&s->gb);
+            // Unpack Hybrid Rice coding flag
+            // 0 - Rice code, 1 - Hybrid Rice code
+            if (!c->seg_common && c->rice_code_flag[i] && get_bits1(&s->gb))
+                // Unpack binary code length for isolated samples
+                c->bitalloc_hybrid_linear[i] = get_bits(&s->gb, c->nabits) + 1;
             else
-                use_seg_state_code_param = 0;
+                // 0 indicates no Hybrid Rice coding
+                c->bitalloc_hybrid_linear[i] = 0;
+        }
 
-            if (!use_seg_state_code_param) {
-                int num_param_sets, i;
-                unsigned bits4ABIT;
+        // Unpack coding parameters
+        for (i = 0; i < k; i++) {
+            if (seg == 0) {
+                // Unpack coding parameter for part A of segment 0
+                c->bitalloc_part_a[i] = get_bits(&s->gb, c->nabits);
 
-                params->seg_type = get_bits(gb, 1);
-                num_param_sets   = params->seg_type ? 1 : chset->channels;
+                // Adjust for the linear code
+                if (!c->rice_code_flag[i] && c->bitalloc_part_a[i])
+                    c->bitalloc_part_a[i]++;
 
-                if (chset->bit_width > 16) {
-                    bits4ABIT = 5;
-                } else {
-                    if (chset->bit_width > 8)
-                        bits4ABIT = 4;
-                    else
-                        bits4ABIT = 3;
-                    if (s->xll_nch_sets > 1)
-                        bits4ABIT++;
+                if (!c->seg_common)
+                    c->nsamples_part_a[i] = b->adapt_pred_order[i];
+                else
+                    c->nsamples_part_a[i] = b->highest_pred_order;
+            } else {
+                c->bitalloc_part_a[i] = 0;
+                c->nsamples_part_a[i] = 0;
+            }
+
+            // Unpack coding parameter for part B of segment
+            c->bitalloc_part_b[i] = get_bits(&s->gb, c->nabits);
+
+            // Adjust for the linear code
+            if (!c->rice_code_flag[i] && c->bitalloc_part_b[i])
+                c->bitalloc_part_b[i]++;
+        }
+    }
+
+    // Unpack entropy codes
+    for (i = 0; i < c->nchannels; i++) {
+        int32_t *part_a, *part_b;
+        int nsamples_part_b;
+
+        // Select index of coding parameters
+        k = c->seg_common ? 0 : i;
+
+        // Slice the segment into parts A and B
+        part_a = b->msb_sample_buffer[i] + seg * s->nsegsamples;
+        part_b = part_a + c->nsamples_part_a[k];
+        nsamples_part_b = s->nsegsamples - c->nsamples_part_a[k];
+
+        if (get_bits_left(&s->gb) < 0)
+            return AVERROR_INVALIDDATA;
+
+        if (!c->rice_code_flag[k]) {
+            // Linear codes
+            // Unpack all residuals of part A of segment 0
+            get_linear_array(&s->gb, part_a, c->nsamples_part_a[k],
+                             c->bitalloc_part_a[k]);
+
+            // Unpack all residuals of part B of segment 0 and others
+            get_linear_array(&s->gb, part_b, nsamples_part_b,
+                             c->bitalloc_part_b[k]);
+        } else {
+            // Rice codes
+            // Unpack all residuals of part A of segment 0
+            get_rice_array(&s->gb, part_a, c->nsamples_part_a[k],
+                           c->bitalloc_part_a[k]);
+
+            if (c->bitalloc_hybrid_linear[k]) {
+                // Hybrid Rice codes
+                // Unpack the number of isolated samples
+                int nisosamples = get_bits(&s->gb, s->nsegsamples_log2);
+
+                // Set all locations to 0
+                memset(part_b, 0, sizeof(*part_b) * nsamples_part_b);
+
+                // Extract the locations of isolated samples and flag by -1
+                for (j = 0; j < nisosamples; j++) {
+                    int loc = get_bits(&s->gb, s->nsegsamples_log2);
+                    if (loc >= nsamples_part_b) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid isolated sample location\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    part_b[loc] = -1;
                 }
 
-                for (i = 0; i < num_param_sets; i++) {
-                    params->rice_code_flag[i] = get_bits(gb, 1);
-                    if (!params->seg_type && params->rice_code_flag[i] && get_bits(gb, 1))
-                        params->pancAuxABIT[i] = get_bits(gb, bits4ABIT) + 1;
+                // Unpack all residuals of part B of segment 0 and others
+                for (j = 0; j < nsamples_part_b; j++) {
+                    if (part_b[j])
+                        part_b[j] = get_linear(&s->gb, c->bitalloc_hybrid_linear[k]);
                     else
-                        params->pancAuxABIT[i] = 0;
+                        part_b[j] = get_rice(&s->gb, c->bitalloc_part_b[k]);
                 }
+            } else {
+                // Rice codes
+                // Unpack all residuals of part B of segment 0 and others
+                get_rice_array(&s->gb, part_b, nsamples_part_b, c->bitalloc_part_b[k]);
+            }
+        }
+    }
 
-                for (i = 0; i < num_param_sets; i++) {
-                    if (!seg) {
-                        /* Parameters for part 1 */
-                        params->pancABIT0[i] = get_bits(gb, bits4ABIT);
-                        if (params->rice_code_flag[i] == 0 && params->pancABIT0[i] > 0)
-                            /* For linear code */
-                            params->pancABIT0[i]++;
-
-                        /* NOTE: In the spec, not indexed by band??? */
-                        if (params->seg_type == 0)
-                            params->nSamplPart0[i] = chset->adapt_order[0][i];
-                        else
-                            params->nSamplPart0[i] = chset->adapt_order_max[0];
-                    } else
-                        params->nSamplPart0[i] = 0;
-
-                    /* Parameters for part 2 */
-                    params->pancABIT[i] = get_bits(gb, bits4ABIT);
-                    if (params->rice_code_flag[i] == 0 && params->pancABIT[i] > 0)
-                        /* For linear code */
-                        params->pancABIT[i]++;
-                }
+    // Unpack decimator history for frequency band 1
+    if (seg == 0 && band == 1) {
+        int nbits = get_bits(&s->gb, 5) + 1;
+        for (i = 0; i < c->nchannels; i++)
+            for (j = 1; j < DCA_XLL_DECI_HISTORY_MAX; j++)
+                c->deci_history[i][j] = get_sbits_long(&s->gb, nbits);
+    }
+
+    // Start unpacking LSB portion of the segment
+    if (b->lsb_section_size) {
+        // Skip to the start of LSB portion
+        if (ff_dca_seek_bits(&s->gb, band_data_end - b->lsb_section_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL band data\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Unpack all LSB parts of residuals of this segment
+        for (i = 0; i < c->nchannels; i++) {
+            if (b->nscalablelsbs[i]) {
+                get_array(&s->gb,
+                          b->lsb_sample_buffer[i] + seg * s->nsegsamples,
+                          s->nsegsamples, b->nscalablelsbs[i]);
             }
-            for (i = 0; i < chset->channels; i++) {
-                int param_index = params->seg_type ? 0 : i;
-                int part0       = params->nSamplPart0[param_index];
-                int bits        = part0 ? params->pancABIT0[param_index] : 0;
-                int *sample_buf = s->xll_sample_buf +
-                                  (in_channel + i) * s->xll_smpl_in_seg;
-
-                if (!params->rice_code_flag[param_index]) {
-                    /* Linear code */
-                    if (bits)
-                        for (j = 0; j < part0; j++)
-                            sample_buf[j] = get_bits_sm(gb, bits);
-                    else
-                        memset(sample_buf, 0, part0 * sizeof(sample_buf[0]));
+        }
+    }
 
-                    /* Second part */
-                    bits = params->pancABIT[param_index];
-                    if (bits)
-                        for (j = part0; j < s->xll_smpl_in_seg; j++)
-                            sample_buf[j] = get_bits_sm(gb, bits);
-                    else
-                        memset(sample_buf + part0, 0,
-                               (s->xll_smpl_in_seg - part0) * sizeof(sample_buf[0]));
-                } else {
-                    int aux_bits = params->pancAuxABIT[param_index];
-
-                    for (j = 0; j < part0; j++) {
-                        /* FIXME: Is this identical to Golomb code? */
-                        int t = get_unary(gb, 1, 33) << bits;
-                        /* FIXME: Could move this test outside of the loop, for efficiency. */
-                        if (bits)
-                            t |= get_bits(gb, bits);
-                        sample_buf[j] = (t & 1) ? -(t >> 1) - 1 : (t >> 1);
-                    }
+    // Skip to the end of band data
+    if (ff_dca_seek_bits(&s->gb, band_data_end)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL band data\n");
+        return AVERROR_INVALIDDATA;
+    }
 
-                    /* Second part */
-                    bits = params->pancABIT[param_index];
-
-                    /* Follow the spec's suggestion of using the
-                     * buffer also to store the hybrid-rice flags. */
-                    memset(sample_buf + part0, 0,
-                           (s->xll_smpl_in_seg - part0) * sizeof(sample_buf[0]));
-
-                    if (aux_bits > 0) {
-                        /* For hybrid rice encoding, some samples are linearly
-                         * coded. According to the spec, "nBits4SamplLoci" bits
-                         * are used for each index, but this value is not
-                         * defined. I guess we should use log2(xll_smpl_in_seg)
-                         * bits. */
-                        int count = get_bits(gb, s->xll_log_smpl_in_seg);
-                        av_log(s->avctx, AV_LOG_DEBUG, "aux count %d (bits %d)\n",
-                               count, s->xll_log_smpl_in_seg);
-
-                        for (j = 0; j < count; j++)
-                            sample_buf[get_bits(gb, s->xll_log_smpl_in_seg)] = 1;
-                    }
-                    for (j = part0; j < s->xll_smpl_in_seg; j++) {
-                        if (!sample_buf[j]) {
-                            int t = get_unary(gb, 1, 33);
-                            if (bits)
-                                t = (t << bits) | get_bits(gb, bits);
-                            sample_buf[j] = (t & 1) ? -(t >> 1) - 1 : (t >> 1);
-                        } else
-                            sample_buf[j] = get_bits_sm(gb, aux_bits);
-                    }
+    return 0;
+}
+
+static av_cold void chs_clear_band_data(DCAXllDecoder *s, DCAXllChSet *c, int band, int seg)
+{
+    DCAXllBand *b = &c->bands[band];
+    int i, offset, nsamples;
+
+    if (seg < 0) {
+        offset = 0;
+        nsamples = s->nframesamples;
+    } else {
+        offset = seg * s->nsegsamples;
+        nsamples = s->nsegsamples;
+    }
+
+    for (i = 0; i < c->nchannels; i++) {
+        memset(b->msb_sample_buffer[i] + offset, 0, nsamples * sizeof(int32_t));
+        if (b->lsb_section_size)
+            memset(b->lsb_sample_buffer[i] + offset, 0, nsamples * sizeof(int32_t));
+    }
+
+    if (seg <= 0 && band)
+        memset(c->deci_history, 0, sizeof(c->deci_history));
+
+    if (seg < 0) {
+        memset(b->nscalablelsbs, 0, sizeof(b->nscalablelsbs));
+        memset(b->bit_width_adjust, 0, sizeof(b->bit_width_adjust));
+    }
+}
+
+static void chs_filter_band_data(DCAXllDecoder *s, DCAXllChSet *c, int band)
+{
+    DCAXllBand *b = &c->bands[band];
+    int nsamples = s->nframesamples;
+    int i, j, k;
+
+    // Inverse adaptive or fixed prediction
+    for (i = 0; i < c->nchannels; i++) {
+        int32_t *buf = b->msb_sample_buffer[i];
+        int order = b->adapt_pred_order[i];
+        if (order > 0) {
+            int coeff[DCA_XLL_ADAPT_PRED_ORDER_MAX];
+            // Conversion from reflection coefficients to direct form coefficients
+            for (j = 0; j < order; j++) {
+                int rc = b->adapt_refl_coeff[i][j];
+                for (k = 0; k < (j + 1) / 2; k++) {
+                    int tmp1 = coeff[    k    ];
+                    int tmp2 = coeff[j - k - 1];
+                    coeff[    k    ] = tmp1 + mul16(rc, tmp2);
+                    coeff[j - k - 1] = tmp2 + mul16(rc, tmp1);
                 }
+                coeff[j] = rc;
+            }
+            // Inverse adaptive prediction
+            for (j = 0; j < nsamples - order; j++) {
+                int64_t err = 0;
+                for (k = 0; k < order; k++)
+                    err += (int64_t)buf[j + k] * coeff[order - k - 1];
+                buf[j + k] -= (SUINT)clip23(norm16(err));
             }
+        } else {
+            // Inverse fixed coefficient prediction
+            for (j = 0; j < b->fixed_pred_order[i]; j++)
+                for (k = 1; k < nsamples; k++)
+                    buf[k] += (unsigned)buf[k - 1];
+        }
+    }
 
-            for (i = 0; i < chset->channels; i++) {
-                unsigned adapt_order = chset->adapt_order[0][i];
-                int *sample_buf = s->xll_sample_buf +
-                                  (in_channel + i) * s->xll_smpl_in_seg;
-                int *prev = history + (in_channel + i) * DCA_XLL_AORDER_MAX;
-
-                if (!adapt_order) {
-                    unsigned order;
-                    for (order = chset->fixed_order[0][i]; order > 0; order--) {
-                        unsigned j;
-                        for (j = 1; j < s->xll_smpl_in_seg; j++)
-                            sample_buf[j] += sample_buf[j - 1];
-                    }
-                } else
-                    /* Inverse adaptive prediction, in place. */
-                    dca_xll_inv_adapt_pred(sample_buf, s->xll_smpl_in_seg,
-                                           adapt_order, seg ? prev : NULL,
-                                           chset->lpc_refl_coeffs_q_ind[0][i]);
-                memcpy(prev, sample_buf + s->xll_smpl_in_seg - DCA_XLL_AORDER_MAX,
-                       DCA_XLL_AORDER_MAX * sizeof(*prev));
+    // Inverse pairwise channel decorrellation
+    if (b->decor_enabled) {
+        int32_t *tmp[DCA_XLL_CHANNELS_MAX];
+
+        for (i = 0; i < c->nchannels / 2; i++) {
+            int coeff = b->decor_coeff[i];
+            if (coeff) {
+                s->dcadsp->decor(b->msb_sample_buffer[i * 2 + 1],
+                                 b->msb_sample_buffer[i * 2    ],
+                                 coeff, nsamples);
             }
-            for (i = 1; i < chset->channels; i += 2) {
-                int coeff = chset->pw_ch_pairs_coeffs[0][i / 2];
-                if (coeff != 0) {
-                    int *sample_buf = s->xll_sample_buf +
-                                      (in_channel + i) * s->xll_smpl_in_seg;
-                    int *prev = sample_buf - s->xll_smpl_in_seg;
-                    unsigned j;
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        /* Shift is unspecified, but should apparently be 3. */
-                        sample_buf[j] += ((int64_t) coeff * prev[j] + 4) >> 3;
-                }
+        }
+
+        // Reorder channel pointers to the original order
+        for (i = 0; i < c->nchannels; i++)
+            tmp[i] = b->msb_sample_buffer[i];
+
+        for (i = 0; i < c->nchannels; i++)
+            b->msb_sample_buffer[b->orig_order[i]] = tmp[i];
+    }
+
+    // Map output channel pointers for frequency band 0
+    if (c->nfreqbands == 1)
+        for (i = 0; i < c->nchannels; i++)
+            s->output_samples[c->ch_remap[i]] = b->msb_sample_buffer[i];
+}
+
+static int chs_get_lsb_width(DCAXllDecoder *s, DCAXllChSet *c, int band, int ch)
+{
+    int adj = c->bands[band].bit_width_adjust[ch];
+    int shift = c->bands[band].nscalablelsbs[ch];
+
+    if (s->fixed_lsb_width)
+        shift = s->fixed_lsb_width;
+    else if (shift && adj)
+        shift += adj - 1;
+    else
+        shift += adj;
+
+    return shift;
+}
+
+static void chs_assemble_msbs_lsbs(DCAXllDecoder *s, DCAXllChSet *c, int band)
+{
+    DCAXllBand *b = &c->bands[band];
+    int n, ch, nsamples = s->nframesamples;
+
+    for (ch = 0; ch < c->nchannels; ch++) {
+        int shift = chs_get_lsb_width(s, c, band, ch);
+        if (shift) {
+            int32_t *msb = b->msb_sample_buffer[ch];
+            if (b->nscalablelsbs[ch]) {
+                int32_t *lsb = b->lsb_sample_buffer[ch];
+                int adj = b->bit_width_adjust[ch];
+                for (n = 0; n < nsamples; n++)
+                    msb[n] = msb[n] * (SUINT)(1 << shift) + (lsb[n] << adj);
+            } else {
+                for (n = 0; n < nsamples; n++)
+                    msb[n] = msb[n] * (SUINT)(1 << shift);
             }
+        }
+    }
+}
 
-            if (s->xll_scalable_lsb) {
-                int lsb_start = end_pos - 8 * chset->lsb_fsize[0] -
-                                8 * (s->xll_banddata_crc & 2);
-                int done;
-                i = get_bits_count(gb);
-                if (i > lsb_start) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "chset data lsb exceeds NAVI size, end_pos %d, lsb_start %d, pos %d\n",
-                           end_pos, lsb_start, i);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (i < lsb_start)
-                    skip_bits_long(gb, lsb_start - i);
-
-                for (i = done = 0; i < chset->channels; i++) {
-                    int bits = chset->scalable_lsbs[0][i];
-                    if (bits > 0) {
-                        /* The channel reordering is conceptually done
-                         * before adding the lsb:s, so we need to do
-                         * the inverse permutation here. */
-                        unsigned pi = chset->orig_chan_order_inv[0][i];
-                        int *sample_buf = s->xll_sample_buf +
-                                          (in_channel + pi) * s->xll_smpl_in_seg;
-                        int adj = chset->bit_width_adj_per_ch[0][i];
-                        int msb_shift = bits;
-                        unsigned j;
-
-                        if (adj > 0)
-                            msb_shift += adj - 1;
-
-                        for (j = 0; j < s->xll_smpl_in_seg; j++)
-                            sample_buf[j] = (sample_buf[j] << msb_shift) +
-                                            (get_bits(gb, bits) << adj);
-
-                        done += bits * s->xll_smpl_in_seg;
+static int chs_assemble_freq_bands(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    int ch, nsamples = s->nframesamples;
+    int32_t *ptr;
+
+    av_assert1(c->nfreqbands > 1);
+
+    // Reallocate frequency band assembly buffer
+    av_fast_malloc(&c->sample_buffer[2], &c->sample_size[2],
+                   2 * nsamples * c->nchannels * sizeof(int32_t));
+    if (!c->sample_buffer[2])
+        return AVERROR(ENOMEM);
+
+    // Assemble frequency bands 0 and 1
+    ptr = c->sample_buffer[2];
+    for (ch = 0; ch < c->nchannels; ch++) {
+        int32_t *band0 = c->bands[0].msb_sample_buffer[ch];
+        int32_t *band1 = c->bands[1].msb_sample_buffer[ch];
+
+        // Copy decimator history
+        memcpy(band0 - DCA_XLL_DECI_HISTORY_MAX,
+               c->deci_history[ch], sizeof(c->deci_history[0]));
+
+        // Filter
+        s->dcadsp->assemble_freq_bands(ptr, band0, band1,
+                                       ff_dca_xll_band_coeff,
+                                       nsamples);
+
+        // Remap output channel pointer to assembly buffer
+        s->output_samples[c->ch_remap[ch]] = ptr;
+        ptr += nsamples * 2;
+    }
+
+    return 0;
+}
+
+static int parse_common_header(DCAXllDecoder *s)
+{
+    int stream_ver, header_size, frame_size_nbits, nframesegs_log2;
+
+    // XLL extension sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_XLL) {
+        av_log(s->avctx, AV_LOG_VERBOSE, "Invalid XLL sync word\n");
+        return AVERROR(EAGAIN);
+    }
+
+    // Version number
+    stream_ver = get_bits(&s->gb, 4) + 1;
+    if (stream_ver > 1) {
+        avpriv_request_sample(s->avctx, "XLL stream version %d", stream_ver);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Lossless frame header length
+    header_size = get_bits(&s->gb, 8) + 1;
+
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, 32, header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL common header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of bits used to read frame size
+    frame_size_nbits = get_bits(&s->gb, 5) + 1;
+
+    // Number of bytes in a lossless frame
+    s->frame_size = get_bits_long(&s->gb, frame_size_nbits);
+    if (s->frame_size < 0 || s->frame_size >= DCA_XLL_PBR_BUFFER_MAX) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL frame size (%d bytes)\n", s->frame_size);
+        return AVERROR_INVALIDDATA;
+    }
+    s->frame_size++;
+
+    // Number of channels sets per frame
+    s->nchsets = get_bits(&s->gb, 4) + 1;
+    if (s->nchsets > DCA_XLL_CHSETS_MAX) {
+        avpriv_request_sample(s->avctx, "%d XLL channel sets", s->nchsets);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Number of segments per frame
+    nframesegs_log2 = get_bits(&s->gb, 4);
+    s->nframesegs = 1 << nframesegs_log2;
+    if (s->nframesegs > 1024) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many segments per XLL frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Samples in segment per one frequency band for the first channel set
+    // Maximum value is 256 for sampling frequencies <= 48 kHz
+    // Maximum value is 512 for sampling frequencies > 48 kHz
+    s->nsegsamples_log2 = get_bits(&s->gb, 4);
+    if (!s->nsegsamples_log2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too few samples per XLL segment\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->nsegsamples = 1 << s->nsegsamples_log2;
+    if (s->nsegsamples > 512) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many samples per XLL segment\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Samples in frame per one frequency band for the first channel set
+    s->nframesamples_log2 = s->nsegsamples_log2 + nframesegs_log2;
+    s->nframesamples = 1 << s->nframesamples_log2;
+    if (s->nframesamples > 65536) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many samples per XLL frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of bits used to read segment size
+    s->seg_size_nbits = get_bits(&s->gb, 5) + 1;
+
+    // Presence of CRC16 within each frequency band
+    // 0 - No CRC16 within band
+    // 1 - CRC16 placed at the end of MSB0
+    // 2 - CRC16 placed at the end of MSB0 and LSB0
+    // 3 - CRC16 placed at the end of MSB0 and LSB0 and other frequency bands
+    s->band_crc_present = get_bits(&s->gb, 2);
+
+    // MSB/LSB split flag
+    s->scalable_lsbs = get_bits1(&s->gb);
+
+    // Channel position mask
+    s->ch_mask_nbits = get_bits(&s->gb, 5) + 1;
+
+    // Fixed LSB width
+    if (s->scalable_lsbs)
+        s->fixed_lsb_width = get_bits(&s->gb, 4);
+    else
+        s->fixed_lsb_width = 0;
+
+    // Reserved
+    // Byte align
+    // Header CRC16 protection
+    if (ff_dca_seek_bits(&s->gb, header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL common header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int is_hier_dmix_chset(DCAXllChSet *c)
+{
+    return !c->primary_chset && c->dmix_embedded && c->hier_chset;
+}
+
+static DCAXllChSet *find_next_hier_dmix_chset(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    if (c->hier_chset)
+        while (++c < &s->chset[s->nchsets])
+            if (is_hier_dmix_chset(c))
+                return c;
+
+    return NULL;
+}
+
+static void prescale_down_mix(DCAXllChSet *c, DCAXllChSet *o)
+{
+    int i, j, *coeff_ptr = c->dmix_coeff;
+
+    for (i = 0; i < c->hier_ofs; i++) {
+        int scale = o->dmix_scale[i];
+        int scale_inv = o->dmix_scale_inv[i];
+        c->dmix_scale[i] = mul15(c->dmix_scale[i], scale);
+        c->dmix_scale_inv[i] = mul16(c->dmix_scale_inv[i], scale_inv);
+        for (j = 0; j < c->nchannels; j++) {
+            int coeff = mul16(*coeff_ptr, scale_inv);
+            *coeff_ptr++ = mul15(coeff, o->dmix_scale[c->hier_ofs + j]);
+        }
+    }
+}
+
+static int parse_sub_headers(DCAXllDecoder *s, DCAExssAsset *asset)
+{
+    DCAContext *dca = s->avctx->priv_data;
+    DCAXllChSet *c;
+    int i, ret;
+
+    // Parse channel set headers
+    s->nfreqbands = 0;
+    s->nchannels = 0;
+    s->nreschsets = 0;
+    for (i = 0, c = s->chset; i < s->nchsets; i++, c++) {
+        c->hier_ofs = s->nchannels;
+        if ((ret = chs_parse_header(s, c, asset)) < 0)
+            return ret;
+        if (c->nfreqbands > s->nfreqbands)
+            s->nfreqbands = c->nfreqbands;
+        if (c->hier_chset)
+            s->nchannels += c->nchannels;
+        if (c->residual_encode != (1 << c->nchannels) - 1)
+            s->nreschsets++;
+    }
+
+    // Pre-scale downmixing coefficients for all non-primary channel sets
+    for (i = s->nchsets - 1, c = &s->chset[i]; i > 0; i--, c--) {
+        if (is_hier_dmix_chset(c)) {
+            DCAXllChSet *o = find_next_hier_dmix_chset(s, c);
+            if (o)
+                prescale_down_mix(c, o);
+        }
+    }
+
+    // Determine number of active channel sets to decode
+    switch (dca->request_channel_layout) {
+    case DCA_SPEAKER_LAYOUT_STEREO:
+        s->nactivechsets = 1;
+        break;
+    case DCA_SPEAKER_LAYOUT_5POINT0:
+    case DCA_SPEAKER_LAYOUT_5POINT1:
+        s->nactivechsets = (s->chset[0].nchannels < 5 && s->nchsets > 1) ? 2 : 1;
+        break;
+    default:
+        s->nactivechsets = s->nchsets;
+        break;
+    }
+
+    return 0;
+}
+
+static int parse_navi_table(DCAXllDecoder *s)
+{
+    int chs, seg, band, navi_nb, navi_pos, *navi_ptr;
+    DCAXllChSet *c;
+
+    // Determine size of NAVI table
+    navi_nb = s->nfreqbands * s->nframesegs * s->nchsets;
+    if (navi_nb > 1024) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many NAVI entries (%d)\n", navi_nb);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Reallocate NAVI table
+    av_fast_malloc(&s->navi, &s->navi_size, navi_nb * sizeof(*s->navi));
+    if (!s->navi)
+        return AVERROR(ENOMEM);
+
+    // Parse NAVI
+    navi_pos = get_bits_count(&s->gb);
+    navi_ptr = s->navi;
+    for (band = 0; band < s->nfreqbands; band++) {
+        for (seg = 0; seg < s->nframesegs; seg++) {
+            for (chs = 0, c = s->chset; chs < s->nchsets; chs++, c++) {
+                int size = 0;
+                if (c->nfreqbands > band) {
+                    size = get_bits_long(&s->gb, s->seg_size_nbits);
+                    if (size < 0 || size >= s->frame_size) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAVI segment size (%d bytes)\n", size);
+                        return AVERROR_INVALIDDATA;
                     }
+                    size++;
                 }
-                if (done > 8 * chset->lsb_fsize[0]) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "chset lsb exceeds lsb_size\n");
-                    return AVERROR_INVALIDDATA;
-                }
+                *navi_ptr++ = size;
             }
+        }
+    }
+
+    // Byte align
+    // CRC16
+    skip_bits(&s->gb, -get_bits_count(&s->gb) & 7);
+    skip_bits(&s->gb, 16);
+
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, navi_pos, get_bits_count(&s->gb))) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAVI checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_band_data(DCAXllDecoder *s)
+{
+    int ret, chs, seg, band, navi_pos, *navi_ptr;
+    DCAXllChSet *c;
+
+    for (chs = 0, c = s->chset; chs < s->nactivechsets; chs++, c++) {
+        if ((ret = chs_alloc_msb_band_data(s, c)) < 0)
+            return ret;
+        if ((ret = chs_alloc_lsb_band_data(s, c)) < 0)
+            return ret;
+    }
 
-            /* Store output. */
-            for (i = 0; i < chset->channels; i++) {
-                int *sample_buf = s->xll_sample_buf +
-                                  (in_channel + i) * s->xll_smpl_in_seg;
-                int shift = 1 - chset->bit_resolution;
-                int out_channel = chset->orig_chan_order[0][i];
-                float *out;
-
-                /* XLL uses the channel order C, L, R, and we want L,
-                 * R, C. FIXME: Generalize. */
-                if (chset->ch_mask_enabled &&
-                    (chset->ch_mask & 7) == 7 && out_channel < 3)
-                    out_channel = out_channel ? out_channel - 1 : 2;
-
-                out_channel += in_channel;
-                if (out_channel >= s->avctx->channels)
-                    continue;
-
-                out  = (float *) frame->extended_data[out_channel];
-                out += seg * s->xll_smpl_in_seg;
-
-                /* NOTE: A one bit means residual encoding is *not* used. */
-                if ((chset->residual_encode >> i) & 1) {
-                    /* Replace channel samples.
-                     * FIXME: Most likely not the right thing to do. */
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        out[j] = ldexpf(sample_buf[j], shift);
-                } else {
-                    /* Add residual signal to core channel */
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        out[j] += ldexpf(sample_buf[j], shift);
+    navi_pos = get_bits_count(&s->gb);
+    navi_ptr = s->navi;
+    for (band = 0; band < s->nfreqbands; band++) {
+        for (seg = 0; seg < s->nframesegs; seg++) {
+            for (chs = 0, c = s->chset; chs < s->nchsets; chs++, c++) {
+                if (c->nfreqbands > band) {
+                    navi_pos += *navi_ptr * 8;
+                    if (navi_pos > s->gb.size_in_bits) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAVI position\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    if (chs < s->nactivechsets &&
+                        (ret = chs_parse_band_data(s, c, band, seg, navi_pos)) < 0) {
+                        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                            return ret;
+                        chs_clear_band_data(s, c, band, seg);
+                    }
+                    skip_bits_long(&s->gb, navi_pos - get_bits_count(&s->gb));
                 }
+                navi_ptr++;
             }
+        }
+    }
 
-            if (chset->downmix_coeff_code_embedded &&
-                !chset->primary_ch_set && chset->hier_chset) {
-                /* Undo hierarchical downmix of earlier channels. */
-                unsigned mix_channel;
-                for (mix_channel = 0; mix_channel < in_channel; mix_channel++) {
-                    float *mix_buf;
-                    const int *col;
-                    float coeff;
-                    unsigned row;
-                    /* Similar channel reorder C, L, R vs L, R, C reorder. */
-                    if (chset->ch_mask_enabled &&
-                        (chset->ch_mask & 7) == 7 && mix_channel < 3)
-                        mix_buf = (float *) frame->extended_data[mix_channel ? mix_channel - 1 : 2];
-                    else
-                        mix_buf = (float *) frame->extended_data[mix_channel];
-
-                    mix_buf += seg * s->xll_smpl_in_seg;
-                    col = &chset->downmix_coeffs[mix_channel * (chset->channels + 1)];
-
-                    /* Scale */
-                    coeff = ldexpf(col[0], -16);
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        mix_buf[j] *= coeff;
-
-                    for (row = 0;
-                         row < chset->channels && in_channel + row < s->avctx->channels;
-                         row++)
-                        if (col[row + 1]) {
-                            const float *new_channel =
-                                (const float *) frame->extended_data[in_channel + row];
-                            new_channel += seg * s->xll_smpl_in_seg;
-                            coeff        = ldexpf(col[row + 1], -15);
-                            for (j = 0; j < s->xll_smpl_in_seg; j++)
-                                mix_buf[j] -= coeff * new_channel[j];
-                        }
+    return 0;
+}
+
+static int parse_frame(DCAXllDecoder *s, uint8_t *data, int size, DCAExssAsset *asset)
+{
+    int ret;
+
+    if ((ret = init_get_bits8(&s->gb, data, size)) < 0)
+        return ret;
+    if ((ret = parse_common_header(s)) < 0)
+        return ret;
+    if ((ret = parse_sub_headers(s, asset)) < 0)
+        return ret;
+    if ((ret = parse_navi_table(s)) < 0)
+        return ret;
+    if ((ret = parse_band_data(s)) < 0)
+        return ret;
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+    return ret;
+}
+
+static void clear_pbr(DCAXllDecoder *s)
+{
+    s->pbr_length = 0;
+    s->pbr_delay = 0;
+}
+
+static int copy_to_pbr(DCAXllDecoder *s, uint8_t *data, int size, int delay)
+{
+    if (size > DCA_XLL_PBR_BUFFER_MAX)
+        return AVERROR(ENOSPC);
+
+    if (!s->pbr_buffer && !(s->pbr_buffer = av_malloc(DCA_XLL_PBR_BUFFER_MAX + AV_INPUT_BUFFER_PADDING_SIZE)))
+        return AVERROR(ENOMEM);
+
+    memcpy(s->pbr_buffer, data, size);
+    s->pbr_length = size;
+    s->pbr_delay = delay;
+    return 0;
+}
+
+static int parse_frame_no_pbr(DCAXllDecoder *s, uint8_t *data, int size, DCAExssAsset *asset)
+{
+    int ret = parse_frame(s, data, size, asset);
+
+    // If XLL packet data didn't start with a sync word, we must have jumped
+    // right into the middle of PBR smoothing period
+    if (ret == AVERROR(EAGAIN) && asset->xll_sync_present && asset->xll_sync_offset < size) {
+        // Skip to the next sync word in this packet
+        data += asset->xll_sync_offset;
+        size -= asset->xll_sync_offset;
+
+        // If decoding delay is set, put the frame into PBR buffer and return
+        // failure code. Higher level decoder is expected to switch to lossy
+        // core decoding or mute its output until decoding delay expires.
+        if (asset->xll_delay_nframes > 0) {
+            if ((ret = copy_to_pbr(s, data, size, asset->xll_delay_nframes)) < 0)
+                return ret;
+            return AVERROR(EAGAIN);
+        }
+
+        // No decoding delay, just parse the frame in place
+        ret = parse_frame(s, data, size, asset);
+    }
+
+    if (ret < 0)
+        return ret;
+
+    if (s->frame_size > size)
+        return AVERROR(EINVAL);
+
+    // If the XLL decoder didn't consume full packet, start PBR smoothing period
+    if (s->frame_size < size)
+        if ((ret = copy_to_pbr(s, data + s->frame_size, size - s->frame_size, 0)) < 0)
+            return ret;
+
+    return 0;
+}
+
+static int parse_frame_pbr(DCAXllDecoder *s, uint8_t *data, int size, DCAExssAsset *asset)
+{
+    int ret;
+
+    if (size > DCA_XLL_PBR_BUFFER_MAX - s->pbr_length) {
+        ret = AVERROR(ENOSPC);
+        goto fail;
+    }
+
+    memcpy(s->pbr_buffer + s->pbr_length, data, size);
+    s->pbr_length += size;
+
+    // Respect decoding delay after synchronization error
+    if (s->pbr_delay > 0 && --s->pbr_delay)
+        return AVERROR(EAGAIN);
+
+    if ((ret = parse_frame(s, s->pbr_buffer, s->pbr_length, asset)) < 0)
+        goto fail;
+
+    if (s->frame_size > s->pbr_length) {
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    if (s->frame_size == s->pbr_length) {
+        // End of PBR smoothing period
+        clear_pbr(s);
+    } else {
+        s->pbr_length -= s->frame_size;
+        memmove(s->pbr_buffer, s->pbr_buffer + s->frame_size, s->pbr_length);
+    }
+
+    return 0;
+
+fail:
+    // For now, throw out all PBR state on failure.
+    // Perhaps we can be smarter and try to resync somehow.
+    clear_pbr(s);
+    return ret;
+}
+
+int ff_dca_xll_parse(DCAXllDecoder *s, uint8_t *data, DCAExssAsset *asset)
+{
+    int ret;
+
+    if (s->hd_stream_id != asset->hd_stream_id) {
+        clear_pbr(s);
+        s->hd_stream_id = asset->hd_stream_id;
+    }
+
+    if (s->pbr_length)
+        ret = parse_frame_pbr(s, data + asset->xll_offset, asset->xll_size, asset);
+    else
+        ret = parse_frame_no_pbr(s, data + asset->xll_offset, asset->xll_size, asset);
+
+    return ret;
+}
+
+static void undo_down_mix(DCAXllDecoder *s, DCAXllChSet *o, int band)
+{
+    int i, j, k, nchannels = 0, *coeff_ptr = o->dmix_coeff;
+    DCAXllChSet *c;
+
+    for (i = 0, c = s->chset; i < s->nactivechsets; i++, c++) {
+        if (!c->hier_chset)
+            continue;
+
+        av_assert1(band < c->nfreqbands);
+        for (j = 0; j < c->nchannels; j++) {
+            for (k = 0; k < o->nchannels; k++) {
+                int coeff = *coeff_ptr++;
+                if (coeff) {
+                    s->dcadsp->dmix_sub(c->bands[band].msb_sample_buffer[j],
+                                        o->bands[band].msb_sample_buffer[k],
+                                        coeff, s->nframesamples);
+                    if (band)
+                        s->dcadsp->dmix_sub(c->deci_history[j],
+                                            o->deci_history[k],
+                                            coeff, DCA_XLL_DECI_HISTORY_MAX);
                 }
             }
+        }
 
-next_chset:
-            in_channel += chset->channels;
-            /* Skip to next channel set using the NAVI info. */
-            i = get_bits_count(gb);
-            if (i > end_pos) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "chset data exceeds NAVI size\n");
-                return AVERROR_INVALIDDATA;
+        nchannels += c->nchannels;
+        if (nchannels >= o->hier_ofs)
+            break;
+    }
+}
+
+static void scale_down_mix(DCAXllDecoder *s, DCAXllChSet *o, int band)
+{
+    int i, j, nchannels = 0;
+    DCAXllChSet *c;
+
+    for (i = 0, c = s->chset; i < s->nactivechsets; i++, c++) {
+        if (!c->hier_chset)
+            continue;
+
+        av_assert1(band < c->nfreqbands);
+        for (j = 0; j < c->nchannels; j++) {
+            int scale = o->dmix_scale[nchannels++];
+            if (scale != (1 << 15)) {
+                s->dcadsp->dmix_scale(c->bands[band].msb_sample_buffer[j],
+                                      scale, s->nframesamples);
+                if (band)
+                    s->dcadsp->dmix_scale(c->deci_history[j],
+                                          scale, DCA_XLL_DECI_HISTORY_MAX);
             }
-            if (i < end_pos)
-                skip_bits_long(gb, end_pos - i);
         }
+
+        if (nchannels >= o->hier_ofs)
+            break;
+    }
+}
+
+// Clear all band data and replace non-residual encoded channels with lossy
+// counterparts
+static av_cold void force_lossy_output(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    DCAContext *dca = s->avctx->priv_data;
+    int band, ch;
+
+    for (band = 0; band < c->nfreqbands; band++)
+        chs_clear_band_data(s, c, band, -1);
+
+    for (ch = 0; ch < c->nchannels; ch++) {
+        if (!(c->residual_encode & (1 << ch)))
+            continue;
+        if (ff_dca_core_map_spkr(&dca->core, c->ch_remap[ch]) < 0)
+            continue;
+        c->residual_encode &= ~(1 << ch);
+    }
+}
+
+static int combine_residual_frame(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    DCAContext *dca = s->avctx->priv_data;
+    int ch, nsamples = s->nframesamples;
+    DCAXllChSet *o;
+
+    // Verify that core is compatible
+    if (!(dca->packet & DCA_PACKET_CORE)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Residual encoded channels are present without core\n");
+        return AVERROR(EINVAL);
     }
+
+    if (c->freq != dca->core.output_rate) {
+        av_log(s->avctx, AV_LOG_WARNING, "Sample rate mismatch between core (%d Hz) and XLL (%d Hz)\n", dca->core.output_rate, c->freq);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (nsamples != dca->core.npcmsamples) {
+        av_log(s->avctx, AV_LOG_WARNING, "Number of samples per frame mismatch between core (%d) and XLL (%d)\n", dca->core.npcmsamples, nsamples);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // See if this channel set is downmixed and find the next channel set in
+    // hierarchy. If downmixed, undo core pre-scaling before combining with
+    // residual (residual is not scaled).
+    o = find_next_hier_dmix_chset(s, c);
+
+    // Reduce core bit width and combine with residual
+    for (ch = 0; ch < c->nchannels; ch++) {
+        int n, spkr, shift, round;
+        int32_t *src, *dst;
+
+        if (c->residual_encode & (1 << ch))
+            continue;
+
+        // Map this channel to core speaker
+        spkr = ff_dca_core_map_spkr(&dca->core, c->ch_remap[ch]);
+        if (spkr < 0) {
+            av_log(s->avctx, AV_LOG_WARNING, "Residual encoded channel (%d) references unavailable core channel\n", c->ch_remap[ch]);
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Account for LSB width
+        shift = 24 - c->pcm_bit_res + chs_get_lsb_width(s, c, 0, ch);
+        if (shift > 24) {
+            av_log(s->avctx, AV_LOG_WARNING, "Invalid core shift (%d bits)\n", shift);
+            return AVERROR_INVALIDDATA;
+        }
+
+        round = shift > 0 ? 1 << (shift - 1) : 0;
+
+        src = dca->core.output_samples[spkr];
+        dst = c->bands[0].msb_sample_buffer[ch];
+        if (o) {
+            // Undo embedded core downmix pre-scaling
+            int scale_inv = o->dmix_scale_inv[c->hier_ofs + ch];
+            for (n = 0; n < nsamples; n++)
+                dst[n] += (SUINT)clip23((mul16(src[n], scale_inv) + round) >> shift);
+        } else {
+            // No downmix scaling
+            for (n = 0; n < nsamples; n++)
+                dst[n] += (unsigned)((src[n] + round) >> shift);
+        }
+    }
+
+    return 0;
+}
+
+int ff_dca_xll_filter_frame(DCAXllDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    DCAExssAsset *asset = &dca->exss.assets[0];
+    DCAXllChSet *p = &s->chset[0], *c;
+    enum AVMatrixEncoding matrix_encoding = AV_MATRIX_ENCODING_NONE;
+    int i, j, k, ret, shift, nsamples, request_mask;
+    int ch_remap[DCA_SPEAKER_COUNT];
+
+    // Force lossy downmixed output during recovery
+    if (dca->packet & DCA_PACKET_RECOVERY) {
+        for (i = 0, c = s->chset; i < s->nchsets; i++, c++) {
+            if (i < s->nactivechsets)
+                force_lossy_output(s, c);
+
+            if (!c->primary_chset)
+                c->dmix_embedded = 0;
+        }
+
+        s->scalable_lsbs = 0;
+        s->fixed_lsb_width = 0;
+    }
+
+    // Filter frequency bands for active channel sets
+    s->output_mask = 0;
+    for (i = 0, c = s->chset; i < s->nactivechsets; i++, c++) {
+        chs_filter_band_data(s, c, 0);
+
+        if (c->residual_encode != (1 << c->nchannels) - 1
+            && (ret = combine_residual_frame(s, c)) < 0)
+            return ret;
+
+        if (s->scalable_lsbs)
+            chs_assemble_msbs_lsbs(s, c, 0);
+
+        if (c->nfreqbands > 1) {
+            chs_filter_band_data(s, c, 1);
+            chs_assemble_msbs_lsbs(s, c, 1);
+        }
+
+        s->output_mask |= c->ch_mask;
+    }
+
+    // Undo hierarchial downmix and/or apply scaling
+    for (i = 1, c = &s->chset[1]; i < s->nchsets; i++, c++) {
+        if (!is_hier_dmix_chset(c))
+            continue;
+
+        if (i >= s->nactivechsets) {
+            for (j = 0; j < c->nfreqbands; j++)
+                if (c->bands[j].dmix_embedded)
+                    scale_down_mix(s, c, j);
+            break;
+        }
+
+        for (j = 0; j < c->nfreqbands; j++)
+            if (c->bands[j].dmix_embedded)
+                undo_down_mix(s, c, j);
+    }
+
+    // Assemble frequency bands for active channel sets
+    if (s->nfreqbands > 1) {
+        for (i = 0; i < s->nactivechsets; i++)
+            if ((ret = chs_assemble_freq_bands(s, &s->chset[i])) < 0)
+                return ret;
+    }
+
+    // Normalize to regular 5.1 layout if downmixing
+    if (dca->request_channel_layout) {
+        if (s->output_mask & DCA_SPEAKER_MASK_Lss) {
+            s->output_samples[DCA_SPEAKER_Ls] = s->output_samples[DCA_SPEAKER_Lss];
+            s->output_mask = (s->output_mask & ~DCA_SPEAKER_MASK_Lss) | DCA_SPEAKER_MASK_Ls;
+        }
+        if (s->output_mask & DCA_SPEAKER_MASK_Rss) {
+            s->output_samples[DCA_SPEAKER_Rs] = s->output_samples[DCA_SPEAKER_Rss];
+            s->output_mask = (s->output_mask & ~DCA_SPEAKER_MASK_Rss) | DCA_SPEAKER_MASK_Rs;
+        }
+    }
+
+    // Handle downmixing to stereo request
+    if (dca->request_channel_layout == DCA_SPEAKER_LAYOUT_STEREO
+        && DCA_HAS_STEREO(s->output_mask) && p->dmix_embedded
+        && (p->dmix_type == DCA_DMIX_TYPE_LoRo ||
+            p->dmix_type == DCA_DMIX_TYPE_LtRt))
+        request_mask = DCA_SPEAKER_LAYOUT_STEREO;
+    else
+        request_mask = s->output_mask;
+    if (!ff_dca_set_channel_layout(avctx, ch_remap, request_mask))
+        return AVERROR(EINVAL);
+
+    avctx->sample_rate = p->freq << (s->nfreqbands - 1);
+
+    switch (p->storage_bit_res) {
+    case 16:
+        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+        shift = 16 - p->pcm_bit_res;
+        break;
+    case 20:
+    case 24:
+        avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+        shift = 24 - p->pcm_bit_res;
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+
+    avctx->bits_per_raw_sample = p->storage_bit_res;
+    avctx->profile = FF_PROFILE_DTS_HD_MA;
+    avctx->bit_rate = 0;
+
+    frame->nb_samples = nsamples = s->nframesamples << (s->nfreqbands - 1);
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Downmix primary channel set to stereo
+    if (request_mask != s->output_mask) {
+        ff_dca_downmix_to_stereo_fixed(s->dcadsp, s->output_samples,
+                                       p->dmix_coeff, nsamples,
+                                       s->output_mask);
+    }
+
+    for (i = 0; i < avctx->channels; i++) {
+        int32_t *samples = s->output_samples[ch_remap[i]];
+        if (frame->format == AV_SAMPLE_FMT_S16P) {
+            int16_t *plane = (int16_t *)frame->extended_data[i];
+            for (k = 0; k < nsamples; k++)
+                plane[k] = av_clip_int16(samples[k] * (SUINT)(1 << shift));
+        } else {
+            int32_t *plane = (int32_t *)frame->extended_data[i];
+            for (k = 0; k < nsamples; k++)
+                plane[k] = clip23(samples[k] * (SUINT)(1 << shift)) * (1 << 8);
+        }
+    }
+
+    if (!asset->one_to_one_map_ch_to_spkr) {
+        if (asset->representation_type == DCA_REPR_TYPE_LtRt)
+            matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+        else if (asset->representation_type == DCA_REPR_TYPE_LhRh)
+            matrix_encoding = AV_MATRIX_ENCODING_DOLBYHEADPHONE;
+    } else if (request_mask != s->output_mask && p->dmix_type == DCA_DMIX_TYPE_LtRt) {
+        matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+    }
+    if ((ret = ff_side_data_update_matrix_encoding(frame, matrix_encoding)) < 0)
+        return ret;
+
     return 0;
 }
+
+av_cold void ff_dca_xll_flush(DCAXllDecoder *s)
+{
+    clear_pbr(s);
+}
+
+av_cold void ff_dca_xll_close(DCAXllDecoder *s)
+{
+    DCAXllChSet *c;
+    int i, j;
+
+    for (i = 0, c = s->chset; i < DCA_XLL_CHSETS_MAX; i++, c++) {
+        for (j = 0; j < DCA_XLL_SAMPLE_BUFFERS_MAX; j++) {
+            av_freep(&c->sample_buffer[j]);
+            c->sample_size[j] = 0;
+        }
+    }
+
+    av_freep(&s->navi);
+    s->navi_size = 0;
+
+    av_freep(&s->pbr_buffer);
+    clear_pbr(s);
+}
diff --git a/libavcodec/dca_xll.h b/libavcodec/dca_xll.h
new file mode 100644
index 0000000..bc0aa65
--- /dev/null
+++ b/libavcodec/dca_xll.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_XLL_H
+#define AVCODEC_DCA_XLL_H
+
+#include "libavutil/common.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dcadsp.h"
+#include "dca_exss.h"
+
+#define DCA_XLL_CHSETS_MAX              3
+#define DCA_XLL_CHANNELS_MAX            8
+#define DCA_XLL_BANDS_MAX               2
+#define DCA_XLL_ADAPT_PRED_ORDER_MAX    16
+#define DCA_XLL_DECI_HISTORY_MAX        8
+#define DCA_XLL_DMIX_SCALES_MAX         ((DCA_XLL_CHSETS_MAX - 1) * DCA_XLL_CHANNELS_MAX)
+#define DCA_XLL_DMIX_COEFFS_MAX         (DCA_XLL_DMIX_SCALES_MAX * DCA_XLL_CHANNELS_MAX)
+#define DCA_XLL_PBR_BUFFER_MAX          (240 << 10)
+#define DCA_XLL_SAMPLE_BUFFERS_MAX      3
+
+typedef struct DCAXllBand {
+    int     decor_enabled;                          ///< Pairwise channel decorrelation flag
+    int     orig_order[DCA_XLL_CHANNELS_MAX];       ///< Original channel order
+    int     decor_coeff[DCA_XLL_CHANNELS_MAX / 2];  ///< Pairwise channel coefficients
+
+    int     adapt_pred_order[DCA_XLL_CHANNELS_MAX]; ///< Adaptive predictor order
+    int     highest_pred_order;                     ///< Highest adaptive predictor order
+    int     fixed_pred_order[DCA_XLL_CHANNELS_MAX]; ///< Fixed predictor order
+    int     adapt_refl_coeff[DCA_XLL_CHANNELS_MAX][DCA_XLL_ADAPT_PRED_ORDER_MAX];   ///< Adaptive predictor reflection coefficients
+
+    int     dmix_embedded;  ///< Downmix performed by encoder in frequency band
+
+    int     lsb_section_size;                       ///< Size of LSB section in any segment
+    int     nscalablelsbs[DCA_XLL_CHANNELS_MAX];    ///< Number of bits to represent the samples in LSB part
+    int     bit_width_adjust[DCA_XLL_CHANNELS_MAX]; ///< Number of bits discarded by authoring
+
+    int32_t *msb_sample_buffer[DCA_XLL_CHANNELS_MAX];   ///< MSB sample buffer pointers
+    int32_t *lsb_sample_buffer[DCA_XLL_CHANNELS_MAX];   ///< LSB sample buffer pointers or NULL
+} DCAXllBand;
+
+typedef struct DCAXllChSet {
+    // Channel set header
+    int     nchannels;          ///< Number of channels in the channel set (N)
+    int     residual_encode;    ///< Residual encoding mask (0 - residual, 1 - full channel)
+    int     pcm_bit_res;        ///< PCM bit resolution (variable)
+    int     storage_bit_res;    ///< Storage bit resolution (16 or 24)
+    int     freq;               ///< Original sampling frequency (max. 96000 Hz)
+
+    int     primary_chset;          ///< Primary channel set flag
+    int     dmix_coeffs_present;    ///< Downmix coefficients present in stream
+    int     dmix_embedded;          ///< Downmix already performed by encoder
+    int     dmix_type;              ///< Primary channel set downmix type
+    int     hier_chset;             ///< Whether the channel set is part of a hierarchy
+    int     hier_ofs;               ///< Number of preceding channels in a hierarchy (M)
+    int     dmix_coeff[DCA_XLL_DMIX_COEFFS_MAX];       ///< Downmixing coefficients
+    int     dmix_scale[DCA_XLL_DMIX_SCALES_MAX];       ///< Downmixing scales
+    int     dmix_scale_inv[DCA_XLL_DMIX_SCALES_MAX];   ///< Inverse downmixing scales
+    int     ch_mask;                ///< Channel mask for set
+    int     ch_remap[DCA_XLL_CHANNELS_MAX];    ///< Channel to speaker map
+
+    int     nfreqbands; ///< Number of frequency bands (1 or 2)
+    int     nabits;     ///< Number of bits to read bit allocation coding parameter
+
+    DCAXllBand     bands[DCA_XLL_BANDS_MAX];   ///< Frequency bands
+
+    // Frequency band coding parameters
+    int     seg_common;                                     ///< Segment type
+    int     rice_code_flag[DCA_XLL_CHANNELS_MAX];           ///< Rice coding flag
+    int     bitalloc_hybrid_linear[DCA_XLL_CHANNELS_MAX];   ///< Binary code length for isolated samples
+    int     bitalloc_part_a[DCA_XLL_CHANNELS_MAX];          ///< Coding parameter for part A of segment
+    int     bitalloc_part_b[DCA_XLL_CHANNELS_MAX];          ///< Coding parameter for part B of segment
+    int     nsamples_part_a[DCA_XLL_CHANNELS_MAX];          ///< Number of samples in part A of segment
+
+    // Decimator history
+    DECLARE_ALIGNED(32, int32_t, deci_history)[DCA_XLL_CHANNELS_MAX][DCA_XLL_DECI_HISTORY_MAX]; ///< Decimator history for frequency band 1
+
+    // Sample buffers
+    unsigned int    sample_size[DCA_XLL_SAMPLE_BUFFERS_MAX];
+    int32_t         *sample_buffer[DCA_XLL_SAMPLE_BUFFERS_MAX];
+} DCAXllChSet;
+
+typedef struct DCAXllDecoder {
+    AVCodecContext  *avctx;
+    GetBitContext   gb;
+
+    int     frame_size;             ///< Number of bytes in a lossless frame
+    int     nchsets;                ///< Number of channels sets per frame
+    int     nframesegs;             ///< Number of segments per frame
+    int     nsegsamples_log2;       ///< log2(nsegsamples)
+    int     nsegsamples;            ///< Samples in segment per one frequency band
+    int     nframesamples_log2;     ///< log2(nframesamples)
+    int     nframesamples;          ///< Samples in frame per one frequency band
+    int     seg_size_nbits;         ///< Number of bits used to read segment size
+    int     band_crc_present;       ///< Presence of CRC16 within each frequency band
+    int     scalable_lsbs;          ///< MSB/LSB split flag
+    int     ch_mask_nbits;          ///< Number of bits used to read channel mask
+    int     fixed_lsb_width;        ///< Fixed LSB width
+
+    DCAXllChSet    chset[DCA_XLL_CHSETS_MAX]; ///< Channel sets
+
+    int             *navi;          ///< NAVI table
+    unsigned int    navi_size;
+
+    int     nfreqbands;     ///< Highest number of frequency bands
+    int     nchannels;      ///< Total number of channels in a hierarchy
+    int     nreschsets;     ///< Number of channel sets that have residual encoded channels
+    int     nactivechsets;  ///< Number of active channel sets to decode
+
+    int     hd_stream_id;   ///< Previous DTS-HD stream ID for detecting changes
+
+    uint8_t     *pbr_buffer;        ///< Peak bit rate (PBR) smoothing buffer
+    int         pbr_length;         ///< Length in bytes of data currently buffered
+    int         pbr_delay;          ///< Delay in frames before decoding buffered data
+
+    DCADSPContext   *dcadsp;
+
+    int     output_mask;
+    int32_t *output_samples[DCA_SPEAKER_COUNT];
+} DCAXllDecoder;
+
+int ff_dca_xll_parse(DCAXllDecoder *s, uint8_t *data, DCAExssAsset *asset);
+int ff_dca_xll_filter_frame(DCAXllDecoder *s, AVFrame *frame);
+av_cold void ff_dca_xll_flush(DCAXllDecoder *s);
+av_cold void ff_dca_xll_close(DCAXllDecoder *s);
+
+#endif
diff --git a/libavcodec/dcaadpcm.c b/libavcodec/dcaadpcm.c
new file mode 100644
index 0000000..9f615e3
--- /dev/null
+++ b/libavcodec/dcaadpcm.c
@@ -0,0 +1,231 @@
+/*
+ * DCA ADPCM engine
+ * Copyright (C) 2017 Daniil Cherednik
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "dcaadpcm.h"
+#include "dcaenc.h"
+#include "dca_core.h"
+#include "mathops.h"
+
+typedef int32_t premultiplied_coeffs[10];
+
+//assume we have DCA_ADPCM_COEFFS values before x
+static inline int64_t calc_corr(const int32_t *x, int len, int j, int k)
+{
+    int n;
+    int64_t s = 0;
+    for (n = 0; n < len; n++)
+        s += MUL64(x[n-j], x[n-k]);
+    return s;
+}
+
+static inline int64_t apply_filter(const int16_t a[DCA_ADPCM_COEFFS], const int64_t corr[15], const int32_t aa[10])
+{
+    int64_t err = 0;
+    int64_t tmp = 0;
+
+    err = corr[0];
+
+    tmp += MUL64(a[0], corr[1]);
+    tmp += MUL64(a[1], corr[2]);
+    tmp += MUL64(a[2], corr[3]);
+    tmp += MUL64(a[3], corr[4]);
+
+    tmp = norm__(tmp, 13);
+    tmp += tmp;
+
+    err -= tmp;
+    tmp = 0;
+
+    tmp += MUL64(corr[5], aa[0]);
+    tmp += MUL64(corr[6], aa[1]);
+    tmp += MUL64(corr[7], aa[2]);
+    tmp += MUL64(corr[8], aa[3]);
+
+    tmp += MUL64(corr[9], aa[4]);
+    tmp += MUL64(corr[10], aa[5]);
+    tmp += MUL64(corr[11], aa[6]);
+
+    tmp += MUL64(corr[12], aa[7]);
+    tmp += MUL64(corr[13], aa[8]);
+
+    tmp += MUL64(corr[14], aa[9]);
+
+    tmp = norm__(tmp, 26);
+
+    err += tmp;
+
+    return llabs(err);
+}
+
+static int64_t find_best_filter(const DCAADPCMEncContext *s, const int32_t *in, int len)
+{
+    const premultiplied_coeffs *precalc_data = s->private_data;
+    int i, j, k = 0;
+    int vq = -1;
+    int64_t err;
+    int64_t min_err = 1ll << 62;
+    int64_t corr[15];
+
+    for (i = 0; i <= DCA_ADPCM_COEFFS; i++)
+        for (j = i; j <= DCA_ADPCM_COEFFS; j++)
+            corr[k++] = calc_corr(in+4, len, i, j);
+
+    for (i = 0; i < DCA_ADPCM_VQCODEBOOK_SZ; i++) {
+        err = apply_filter(ff_dca_adpcm_vb[i], corr, *precalc_data);
+        if (err < min_err) {
+            min_err = err;
+            vq = i;
+        }
+        precalc_data++;
+    }
+
+    return vq;
+}
+
+static inline int64_t calc_prediction_gain(int pred_vq, const int32_t *in, int32_t *out, int len)
+{
+    int i;
+    int32_t error;
+
+    int64_t signal_energy = 0;
+    int64_t error_energy = 0;
+
+    for (i = 0; i < len; i++) {
+        error = in[DCA_ADPCM_COEFFS + i] - ff_dcaadpcm_predict(pred_vq, in + i);
+        out[i] = error;
+        signal_energy += MUL64(in[DCA_ADPCM_COEFFS + i], in[DCA_ADPCM_COEFFS + i]);
+        error_energy += MUL64(error, error);
+    }
+
+    if (!error_energy)
+        return -1;
+
+    return signal_energy / error_energy;
+}
+
+int ff_dcaadpcm_subband_analysis(const DCAADPCMEncContext *s, const int32_t *in, int len, int *diff)
+{
+    int pred_vq, i;
+    int32_t input_buffer[16 + DCA_ADPCM_COEFFS];
+    int32_t input_buffer2[16 + DCA_ADPCM_COEFFS];
+
+    int32_t max = 0;
+    int shift_bits;
+    uint64_t pg = 0;
+
+    for (i = 0; i < len + DCA_ADPCM_COEFFS; i++)
+        max |= FFABS(in[i]);
+
+    // normalize input to simplify apply_filter
+    shift_bits = av_log2(max) - 11;
+
+    for (i = 0; i < len + DCA_ADPCM_COEFFS; i++) {
+        input_buffer[i] = norm__(in[i], 7);
+        input_buffer2[i] = norm__(in[i], shift_bits);
+    }
+
+    pred_vq = find_best_filter(s, input_buffer2, len);
+
+    if (pred_vq < 0)
+        return -1;
+
+    pg = calc_prediction_gain(pred_vq, input_buffer, diff, len);
+
+    // Greater than 10db (10*log(10)) prediction gain to use ADPCM.
+    // TODO: Tune it.
+    if (pg < 10)
+        return -1;
+
+    for (i = 0; i < len; i++)
+        diff[i] <<= 7;
+
+    return pred_vq;
+}
+
+static void precalc(premultiplied_coeffs *data)
+{
+    int i, j, k;
+
+    for (i = 0; i < DCA_ADPCM_VQCODEBOOK_SZ; i++) {
+        int id = 0;
+        int32_t t = 0;
+        for (j = 0; j < DCA_ADPCM_COEFFS; j++) {
+            for (k = j; k < DCA_ADPCM_COEFFS; k++) {
+                t = (int32_t)ff_dca_adpcm_vb[i][j] * (int32_t)ff_dca_adpcm_vb[i][k];
+                if (j != k)
+                    t *= 2;
+                (*data)[id++] = t;
+             }
+        }
+        data++;
+    }
+}
+
+int ff_dcaadpcm_do_real(int pred_vq_index,
+                        softfloat quant, int32_t scale_factor, int32_t step_size,
+                        const int32_t *prev_hist, const int32_t *in, int32_t *next_hist, int32_t *out,
+                        int len, int32_t peak)
+{
+    int i;
+    int64_t delta;
+    int32_t dequant_delta;
+    int32_t work_bufer[16 + DCA_ADPCM_COEFFS];
+
+    memcpy(work_bufer, prev_hist, sizeof(int32_t) * DCA_ADPCM_COEFFS);
+
+    for (i = 0; i < len; i++) {
+        work_bufer[DCA_ADPCM_COEFFS + i] = ff_dcaadpcm_predict(pred_vq_index, &work_bufer[i]);
+
+        delta = (int64_t)in[i] - ((int64_t)work_bufer[DCA_ADPCM_COEFFS + i] << 7);
+
+        out[i] = quantize_value(av_clip64(delta, -peak, peak), quant);
+
+        ff_dca_core_dequantize(&dequant_delta, &out[i], step_size, scale_factor, 0, 1);
+
+        work_bufer[DCA_ADPCM_COEFFS+i] += dequant_delta;
+    }
+
+    memcpy(next_hist, &work_bufer[len], sizeof(int32_t) * DCA_ADPCM_COEFFS);
+
+    return 0;
+}
+
+av_cold int ff_dcaadpcm_init(DCAADPCMEncContext *s)
+{
+    if (!s)
+        return -1;
+
+    s->private_data = av_malloc(sizeof(premultiplied_coeffs) * DCA_ADPCM_VQCODEBOOK_SZ);
+    if (!s->private_data)
+        return AVERROR(ENOMEM);
+
+    precalc(s->private_data);
+    return 0;
+}
+
+av_cold void ff_dcaadpcm_free(DCAADPCMEncContext *s)
+{
+    if (!s)
+        return;
+
+    av_freep(&s->private_data);
+}
diff --git a/libavcodec/dcaadpcm.h b/libavcodec/dcaadpcm.h
new file mode 100644
index 0000000..23bfa79
--- /dev/null
+++ b/libavcodec/dcaadpcm.h
@@ -0,0 +1,54 @@
+/*
+ * DCA ADPCM engine
+ * Copyright (C) 2017 Daniil Cherednik
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCAADPCM_H
+#define AVCODEC_DCAADPCM_H
+
+#include "dcamath.h"
+#include "dcadata.h"
+#include "dcaenc.h"
+
+typedef struct DCAADPCMEncContext {
+    void *private_data;
+} DCAADPCMEncContext;
+
+static inline int64_t ff_dcaadpcm_predict(int pred_vq_index, const int32_t *input)
+{
+    int i;
+    const int16_t *coeff = ff_dca_adpcm_vb[pred_vq_index];
+    int64_t pred = 0;
+    for (i = 0; i < DCA_ADPCM_COEFFS; i++)
+        pred += (int64_t)input[DCA_ADPCM_COEFFS - 1 - i] * coeff[i];
+
+    return clip23(norm13(pred));
+}
+
+int ff_dcaadpcm_subband_analysis(const DCAADPCMEncContext *s, const int32_t *input, int len, int *diff);
+
+int ff_dcaadpcm_do_real(int pred_vq_index,
+                        softfloat quant, int32_t scale_factor, int32_t step_size,
+                        const int32_t *prev_hist, const int32_t *in, int32_t *next_hist, int32_t *out,
+                        int len, int32_t peak);
+
+av_cold int ff_dcaadpcm_init(DCAADPCMEncContext *s);
+av_cold void ff_dcaadpcm_free(DCAADPCMEncContext *s);
+
+#endif /* AVCODEC_DCAADPCM_H */
diff --git a/libavcodec/dcadata.c b/libavcodec/dcadata.c
index 2369f55..1b646a7 100644
--- a/libavcodec/dcadata.c
+++ b/libavcodec/dcadata.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Gildas Bazin
  * Copyright (c) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "libavutil/mem.h"
 
+#include "dca.h"
 #include "dcadata.h"
 
 /* Generic tables */
@@ -41,14 +42,22 @@ const uint8_t ff_dca_channels[16] = {
     1, 2, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 7, 8, 8
 };
 
-const uint8_t ff_dca_bits_per_sample[7] = {
-    16, 16, 20, 20, 0, 24, 24
+const uint8_t ff_dca_dmix_primary_nch[8] = {
+    1, 2, 2, 3, 3, 4, 4, 0
+};
+
+const uint8_t ff_dca_quant_index_sel_nbits[DCA_CODE_BOOKS] = {
+    1, 2, 2, 2, 2, 3, 3, 3, 3, 3
+};
+
+const uint8_t ff_dca_quant_index_group_size[DCA_CODE_BOOKS] = {
+    1, 3, 3, 3, 3, 7, 7, 7, 7, 7
 };
 
 /* ADPCM data */
 
 /* 16 bits signed fractional Q13 binary codes */
-const int16_t ff_dca_adpcm_vb[4096][4] = {
+const int16_t ff_dca_adpcm_vb[DCA_ADPCM_VQCODEBOOK_SZ][DCA_ADPCM_COEFFS] = {
     {   9928,  -2618,  -1093, -1263 },
     {  11077,  -2876,  -1747,  -308 },
     {  10503,  -1082,  -1426, -1167 },
@@ -4179,6 +4188,37 @@ const uint32_t ff_dca_scale_factor_quant7[128] = {
     5011872, 5688529, 6456542, 7328245, 8317638,       0,       0,       0
 };
 
+const uint32_t ff_dca_joint_scale_factors[129] = {
+       3288,    3490,    3691,    3909,    4144,    4387,    4647,    4924,
+       5218,    5528,    5855,    6199,    6568,    6963,    7374,    7810,
+       8271,    8758,    9278,    9831,   10410,   11031,   11685,   12373,
+      13103,   13883,   14705,   15578,   16500,   17482,   18514,   19613,
+      20770,   22003,   23312,   24688,   26156,   27699,   29343,   31080,
+      32925,   34871,   36943,   39133,   41448,   43906,   46506,   49258,
+      52177,   55273,   58544,   62017,   65691,   69584,   73711,   78073,
+      82703,   87602,   92795,   98289,  104111,  110285,  116820,  123740,
+     131072,  138840,  147069,  155776,  165012,  174785,  185145,  196117,
+     207735,  220042,  233086,  246894,  261523,  277017,  293434,  310823,
+     329236,  348748,  369409,  391303,  414490,  439043,  465064,  492621,
+     521805,  552725,  585475,  620170,  656920,  695843,  737073,  780745,
+     827008,  876014,  927923,  982902, 1041144, 1102834, 1168181, 1237404,
+    1310720, 1388382, 1470649, 1557790, 1650098, 1747876, 1851441, 1961147,
+    2077355, 2200441, 2330825, 2468935, 2615232, 2770195, 2934335, 3108206,
+    3292378, 3487463, 3694108, 3913000, 4144862, 4390455, 4650611, 4926176,
+    5218066
+};
+
+const uint32_t ff_dca_scale_factor_adj[4] = {
+    4194304, 4718592, 5242880, 6029312
+};
+
+const uint32_t ff_dca_quant_levels[32] = {
+          1,       3,       5,     7,      9,     13,     17,      25,
+         32,      64,     128,   256,    512,   1024,   2048,    4096,
+       8192,   16384,   32768, 65536, 131072, 262144, 524288, 1048576,
+    2097152, 4194304, 8388608,     0,      0,      0,      0,       0
+};
+
 /* 20 bits unsigned fractional binary codes */
 const uint32_t ff_dca_lossy_quant[32] = {
          0, 6710886, 4194304, 3355443, 2474639, 2097152, 1761608, 1426063,
@@ -7507,76 +7547,6 @@ DECLARE_ALIGNED(16, const float, ff_dca_lfe_fir_128)[256] = {
 };
 #undef SCALE
 
-
-#define SCALE(c) ((float)(c) / (256.0f * 32768.0f * 8388608.0f))
-DECLARE_ALIGNED(16, const float, ff_dca_lfe_xll_fir_64)[256] = {
-    SCALE(   6103), SCALE(  52170), SCALE(-558064), SCALE(1592440),
-    SCALE(6290049), SCALE(1502534), SCALE(-546669), SCALE(  53047),
-    SCALE(   1930), SCALE(  51089), SCALE(-568920), SCALE(1683709),
-    SCALE(6286575), SCALE(1414057), SCALE(-534782), SCALE(  53729),
-    SCALE(   2228), SCALE(  49794), SCALE(-579194), SCALE(1776276),
-    SCALE(6279634), SCALE(1327070), SCALE(-522445), SCALE(  54228),
-    SCALE(   2552), SCALE(  48275), SCALE(-588839), SCALE(1870070),
-    SCALE(6269231), SCALE(1241632), SCALE(-509702), SCALE(  54550),
-    SCALE(   2904), SCALE(  46523), SCALE(-597808), SCALE(1965017),
-    SCALE(6255380), SCALE(1157798), SCALE(-496595), SCALE(  54708),
-    SCALE(   3287), SCALE(  44529), SCALE(-606054), SCALE(2061044),
-    SCALE(6238099), SCALE(1075621), SCALE(-483164), SCALE(  54710),
-    SCALE(   3704), SCALE(  42282), SCALE(-613529), SCALE(2158071),
-    SCALE(6217408), SCALE( 995149), SCALE(-469451), SCALE(  54566),
-    SCALE(   4152), SCALE(  39774), SCALE(-620186), SCALE(2256019),
-    SCALE(6193332), SCALE( 916430), SCALE(-455494), SCALE(  54285),
-    SCALE(   4631), SCALE(  36995), SCALE(-625976), SCALE(2354805),
-    SCALE(6165900), SCALE( 839507), SCALE(-441330), SCALE(  53876),
-    SCALE(   5139), SCALE(  33937), SCALE(-630850), SCALE(2454343),
-    SCALE(6135146), SCALE( 764419), SCALE(-426998), SCALE(  53348),
-    SCALE(   5682), SCALE(  30591), SCALE(-634759), SCALE(2554547),
-    SCALE(6101107), SCALE( 691203), SCALE(-412531), SCALE(  52711),
-    SCALE(   6264), SCALE(  26948), SCALE(-637655), SCALE(2655326),
-    SCALE(6063824), SCALE( 619894), SCALE(-397966), SCALE(  51972),
-    SCALE(   6886), SCALE(  23001), SCALE(-639488), SCALE(2756591),
-    SCALE(6023343), SCALE( 550521), SCALE(-383335), SCALE(  51140),
-    SCALE(   7531), SCALE(  18741), SCALE(-640210), SCALE(2858248),
-    SCALE(5979711), SCALE( 483113), SCALE(-368671), SCALE(  50224),
-    SCALE(   8230), SCALE(  14162), SCALE(-639772), SCALE(2960201),
-    SCALE(5932981), SCALE( 417692), SCALE(-354003), SCALE(  49231),
-    SCALE(   8959), SCALE(   9257), SCALE(-638125), SCALE(3062355),
-    SCALE(5883210), SCALE( 354281), SCALE(-339362), SCALE(  48168),
-    SCALE(   9727), SCALE(   4018), SCALE(-635222), SCALE(3164612),
-    SCALE(5830457), SCALE( 292897), SCALE(-324777), SCALE(  47044),
-    SCALE(  10535), SCALE(  -1558), SCALE(-631014), SCALE(3266872),
-    SCALE(5774785), SCALE( 233555), SCALE(-310273), SCALE(  45866),
-    SCALE(  11381), SCALE(  -7480), SCALE(-625455), SCALE(3369035),
-    SCALE(5716260), SCALE( 176267), SCALE(-295877), SCALE(  44640),
-    SCALE(  12267), SCALE( -13750), SCALE(-618499), SCALE(3471000),
-    SCALE(5654952), SCALE( 121042), SCALE(-281613), SCALE(  43373),
-    SCALE(  13190), SCALE( -20372), SCALE(-610098), SCALE(3572664),
-    SCALE(5590933), SCALE(  67886), SCALE(-267505), SCALE(  42072),
-    SCALE(  14152), SCALE( -27352), SCALE(-600209), SCALE(3673924),
-    SCALE(5524280), SCALE(  16800), SCALE(-253574), SCALE(  40743),
-    SCALE(  15153), SCALE( -34691), SCALE(-588788), SCALE(3774676),
-    SCALE(5455069), SCALE( -32214), SCALE(-239840), SCALE(  39391),
-    SCALE(  16192), SCALE( -42390), SCALE(-575791), SCALE(3874816),
-    SCALE(5383383), SCALE( -79159), SCALE(-226323), SCALE(  38022),
-    SCALE(  17267), SCALE( -50453), SCALE(-561178), SCALE(3974239),
-    SCALE(5309305), SCALE(-124041), SCALE(-213041), SCALE(  36642),
-    SCALE(  18377), SCALE( -58879), SCALE(-544906), SCALE(4072841),
-    SCALE(5232922), SCALE(-166869), SCALE(-200010), SCALE(  35256),
-    SCALE(  19525), SCALE( -67667), SCALE(-526937), SCALE(4170517),
-    SCALE(5154321), SCALE(-207653), SCALE(-187246), SCALE(  33866),
-    SCALE(  20704), SCALE( -76817), SCALE(-507233), SCALE(4267162),
-    SCALE(5073593), SCALE(-246406), SCALE(-174764), SCALE(  32480),
-    SCALE(  21915), SCALE( -86327), SCALE(-485757), SCALE(4362672),
-    SCALE(4990831), SCALE(-283146), SCALE(-162575), SCALE(  31101),
-    SCALE(  23157), SCALE( -96193), SCALE(-462476), SCALE(4456942),
-    SCALE(4906129), SCALE(-317890), SCALE(-150692), SCALE(  29732),
-    SCALE(  24426), SCALE(-106412), SCALE(-437356), SCALE(4549871),
-    SCALE(4819584), SCALE(-350658), SCALE(-139125), SCALE(  28376),
-    SCALE(  25721), SCALE(-116977), SCALE(-410365), SCALE(4641355),
-    SCALE(4731293), SCALE(-381475), SCALE(-127884), SCALE(  27038),
-};
-#undef SCALE
-
 DECLARE_ALIGNED(16, const float, ff_dca_fir_64bands)[1024] = {
     /* Bank 0 */
     -7.1279389866041690e-8, -7.0950903150874990e-8,
@@ -8101,6 +8071,562 @@ DECLARE_ALIGNED(16, const float, ff_dca_fir_64bands)[1024] = {
      7.0950903150874990e-8,  7.1279389866041690e-8,
 };
 
+DECLARE_ALIGNED(16, const int32_t, ff_dca_fir_32bands_perfect_fixed)[512] = {
+           0,        0,       -3,      -10,
+         -35,     -105,     -218,     -141,
+        -170,     -216,     -239,     -254,
+        -257,     -251,     -235,     -212,
+        -267,     -317,     -362,     -400,
+        -425,     -434,     -427,     -373,
+        -339,     -593,     -321,     -120,
+         -39,      -16,        0,        1,
+           1,        1,       -3,       -1,
+          -6,      -38,      -93,     -496,
+        -723,     -970,    -1235,    -1501,
+       -1753,    -1978,    -2163,    -2295,
+       -2891,    -2915,    -2860,    -2726,
+       -2517,    -2243,    -1915,    -1590,
+       -1192,     -252,     -117,      -22,
+          -6,      -13,       12,       14,
+          32,       25,      469,      942,
+        1403,     1421,     1239,     2838,
+        3539,     4259,     5002,     5716,
+        6365,     6908,     7311,     7545,
+       11680,    12355,    12785,    12951,
+       12841,    12453,    11803,    10864,
+        9762,     7099,     6725,     5954,
+        4284,     2584,      215,      379,
+         557,      701,      -29,     -687,
+       -1578,    -2749,    -4076,    -7933,
+      -10049,   -12133,   -14039,   -15752,
+      -17213,   -18400,   -19291,   -19878,
+       -1444,    -3005,    -4523,    -5927,
+       -7143,    -8093,    -8713,    -8939,
+       -8700,    -9481,    -7515,    -5279,
+       -2725,       61,     5763,     6113,
+        7571,     6735,    17126,    20165,
+       23328,    26775,    30310,    32639,
+       35464,    38064,    40423,    42512,
+       44261,    45632,    46578,    46974,
+      -45572,   -45008,   -43753,   -41661,
+      -38655,   -34660,   -29587,   -23375,
+      -15998,    -7631,     2472,    13757,
+       26188,    39942,    49789,    67293,
+       84699,   101701,   127325,   148404,
+      170391,   193280,   217044,   241451,
+      266537,   292144,   318161,   344417,
+      370786,   397082,   423133,   448757,
+      475085,   499136,   522007,   543516,
+      563424,   581467,   597422,   611005,
+      621975,   630083,   634996,   636457,
+      634311,   628147,   619871,   604524,
+      585077,   561301,   529204,   494129,
+      453552,   407189,   354920,   296502,
+      231916,   161012,    83700,      -86,
+      -90377,  -187193,  -290528,  -400329,
+      516487,   639054,   767835,   902710,
+     1043512,  1190048,  1342100,  1499418,
+     1661729,  1828700,  2000071,  2175433,
+     2354437,  2536630,  2721120,  2908704,
+     3098059,  3288764,  3480801,  3672922,
+     3864970,  4056432,  4246767,  4435454,
+     4621921,  4805642,  4986073,  5162677,
+     5334921,  5502279,  5664239,  5820300,
+     5969913,  6112723,  6248225,  6375985,
+     6495593,  6606663,  6708832,  6801769,
+     6885168,  6958762,  7022294,  7075566,
+     7118382,  7150633,  7172314,  7183082,
+     7183082,  7172314,  7150633,  7118382,
+     7075566,  7022294,  6958762,  6885168,
+     6801769,  6708832,  6606663,  6495593,
+     6375985,  6248225,  6112723,  5969913,
+    -5820300, -5664239, -5502279, -5334921,
+    -5162677, -4986073, -4805642, -4621921,
+    -4435454, -4246767, -4056432, -3864970,
+    -3672922, -3480801, -3288764, -3098059,
+    -2908704, -2721120, -2536630, -2354437,
+    -2175433, -2000071, -1828700, -1661729,
+    -1499418, -1342100, -1190048, -1043512,
+     -902710,  -767835,  -639054,  -516487,
+     -400329,  -290528,  -187193,   -90377,
+         -86,    83700,   161012,   231916,
+      296502,   354920,   407189,   453552,
+      494129,   529204,   561301,   585077,
+      604524,   619871,   628147,   634311,
+      636457,   634996,   630083,   621975,
+      611005,   597422,   581467,   563424,
+      543516,   522007,   499136,   475085,
+     -448757,  -423133,  -397082,  -370786,
+     -344417,  -318161,  -292144,  -266537,
+     -241451,  -217044,  -193280,  -170391,
+     -148404,  -127325,  -101701,   -84699,
+      -67293,   -49789,   -39942,   -26188,
+      -13757,    -2472,     7631,    15998,
+       23375,    29587,    34660,    38655,
+       41661,    43753,    45008,    45572,
+       46974,    46578,    45632,    44261,
+       42512,    40423,    38064,    35464,
+       32639,    30310,    26775,    23328,
+       20165,    17126,     6735,     7571,
+        6113,     5763,       61,    -2725,
+       -5279,    -7515,    -9481,    -8700,
+       -8939,    -8713,    -8093,    -7143,
+       -5927,    -4523,    -3005,    -1444,
+       19878,    19291,    18400,    17213,
+       15752,    14039,    12133,    10049,
+        7933,     4076,     2749,     1578,
+         687,       29,     -701,     -557,
+        -379,     -215,    -2584,    -4284,
+       -5954,    -6725,    -7099,    -9762,
+      -10864,   -11803,   -12453,   -12841,
+      -12951,   -12785,   -12355,   -11680,
+        7545,     7311,     6908,     6365,
+        5716,     5002,     4259,     3539,
+        2838,     1239,     1421,     1403,
+         942,      469,       25,       32,
+          14,       12,      -13,       -6,
+         -22,     -117,     -252,    -1192,
+       -1590,    -1915,    -2243,    -2517,
+       -2726,    -2860,    -2915,    -2891,
+        2295,     2163,     1978,     1753,
+        1501,     1235,      970,      723,
+         496,       93,       38,        6,
+           1,        3,       -1,       -1,
+          -1,        0,       16,       39,
+         120,      321,      593,      339,
+         373,      427,      434,      425,
+         400,      362,      317,      267,
+        -212,     -235,     -251,     -257,
+        -254,     -239,     -216,     -170,
+        -141,     -218,     -105,      -35,
+         -10,       -3,        0,        0
+};
+
+DECLARE_ALIGNED(16, const int32_t, ff_dca_fir_32bands_nonperfect_fixed)[512] = {
+         -53,      -64,      -77,      -91,
+        -107,     -124,     -144,     -165,
+        -189,     -215,     -244,     -277,
+        -313,     -353,     -397,     -447,
+         502,      563,      631,      706,
+         789,      881,      983,     1095,
+        1218,     1354,     1502,     1665,
+        1843,     2036,     2247,     2475,
+        2723,     2990,     3277,     3586,
+        3916,     4270,     4646,     5046,
+        5470,     5918,     6390,     6886,
+        7405,     7947,     8510,     9094,
+        9698,    10319,    10955,    11605,
+       12265,    12933,    13605,    14277,
+       14945,    15604,    16250,    16877,
+       17480,    18051,    18585,    19075,
+       19513,    19891,    20202,    20436,
+       20587,    20643,    20597,    20439,
+       20160,    19749,    19198,    18496,
+       17634,    16603,    15393,    13996,
+      -12403,   -10605,    -8595,    -6366,
+       -3911,    -1225,     1697,     4860,
+        8265,    11916,    15812,    19953,
+       24337,    28961,    33819,    38904,
+       44210,    49725,    55437,    61334,
+       67398,    73614,    79961,    86417,
+       92960,    99563,   106198,   112837,
+      119446,   125994,   132443,   138758,
+      144898,   150823,   156491,   161858,
+      166879,   171507,   175697,   179400,
+      182566,   185149,   187097,   188363,
+      188899,   188654,   187581,   185635,
+      182770,   178943,   174112,   168238,
+      161285,   153218,   144007,   133624,
+      122046,   109254,    95232,    79969,
+       63462,    45709,    26715,     6492,
+       14943,    37567,    61350,    86256,
+      112242,   139258,   167246,   196143,
+      225877,   256368,   287532,   319275,
+      351496,   384088,   416936,   449919,
+      482909,   515770,   548362,   580539,
+      612148,   643030,   673024,   701963,
+      729674,   755985,   780717,   803690,
+      824721,   843628,   860226,   874332,
+      885761,   894330,   899861,   902174,
+      901096,   896456,   888088,   875832,
+      859535,   839050,   814237,   784966,
+      751116,   712574,   669239,   621021,
+      567840,   509632,   446341,   377927,
+      304365,   225641,   141757,    52732,
+      -41403,  -140599,  -244793,  -353905,
+     -467840,  -586486,  -709716,  -837385,
+      969336,  1105393,  1245366,  1389049,
+     1536224,  1686655,  1840096,  1996285,
+     2154949,  2315802,  2478547,  2642877,
+     2808475,  2975015,  3142163,  3309579,
+     3476914,  3643818,  3809934,  3974901,
+     4138360,  4299948,  4459303,  4616064,
+     4769873,  4920374,  5067219,  5210063,
+     5348569,  5482406,  5611255,  5734805,
+     5852757,  5964823,  6070729,  6170216,
+     6263037,  6348961,  6427777,  6499286,
+     6563310,  6619688,  6668279,  6708963,
+     6741632,  6766206,  6782623,  6790843,
+     6790843,  6782623,  6766206,  6741632,
+     6708963,  6668279,  6619688,  6563310,
+     6499286,  6427777,  6348961,  6263037,
+     6170216,  6070729,  5964823,  5852757,
+    -5734805, -5611255, -5482406, -5348569,
+    -5210063, -5067219, -4920374, -4769873,
+    -4616064, -4459303, -4299948, -4138360,
+    -3974901, -3809934, -3643818, -3476914,
+    -3309579, -3142163, -2975015, -2808475,
+    -2642877, -2478547, -2315802, -2154949,
+    -1996285, -1840096, -1686655, -1536224,
+    -1389049, -1245366, -1105393,  -969336,
+     -837385,  -709716,  -586486,  -467840,
+     -353905,  -244793,  -140599,   -41403,
+       52732,   141757,   225641,   304365,
+      377927,   446341,   509632,   567840,
+      621021,   669239,   712574,   751116,
+      784966,   814237,   839050,   859535,
+      875832,   888088,   896456,   901096,
+      902174,   899861,   894330,   885761,
+     -874332,  -860226,  -843628,  -824721,
+     -803690,  -780717,  -755985,  -729674,
+     -701963,  -673024,  -643030,  -612148,
+     -580539,  -548362,  -515770,  -482909,
+     -449919,  -416936,  -384088,  -351496,
+     -319275,  -287532,  -256368,  -225877,
+     -196143,  -167246,  -139258,  -112242,
+      -86256,   -61350,   -37567,   -14943,
+        6492,    26715,    45709,    63462,
+       79969,    95232,   109254,   122046,
+      133624,   144007,   153218,   161285,
+      168238,   174112,   178943,   182770,
+      185635,   187581,   188654,   188899,
+      188363,   187097,   185149,   182566,
+      179400,   175697,   171507,   166879,
+      161858,   156491,   150823,   144898,
+     -138758,  -132443,  -125994,  -119446,
+     -112837,  -106198,   -99563,   -92960,
+      -86417,   -79961,   -73614,   -67398,
+      -61334,   -55437,   -49725,   -44210,
+      -38904,   -33819,   -28961,   -24337,
+      -19953,   -15812,   -11916,    -8265,
+       -4860,    -1697,     1225,     3911,
+        6366,     8595,    10605,    12403,
+       13996,    15393,    16603,    17634,
+       18496,    19198,    19749,    20160,
+       20439,    20597,    20643,    20587,
+       20436,    20202,    19891,    19513,
+       19075,    18585,    18051,    17480,
+       16877,    16250,    15604,    14945,
+       14277,    13605,    12933,    12265,
+       11605,    10955,    10319,     9698,
+       -9094,    -8510,    -7947,    -7405,
+       -6886,    -6390,    -5918,    -5470,
+       -5046,    -4646,    -4270,    -3916,
+       -3586,    -3277,    -2990,    -2723,
+       -2475,    -2247,    -2036,    -1843,
+       -1665,    -1502,    -1354,    -1218,
+       -1095,     -983,     -881,     -789,
+        -706,     -631,     -563,     -502,
+        -447,     -397,     -353,     -313,
+        -277,     -244,     -215,     -189,
+        -165,     -144,     -124,     -107,
+         -91,      -77,      -64,      -53
+};
+
+DECLARE_ALIGNED(16, const int32_t, ff_dca_lfe_fir_64_fixed)[256] = {
+     6103,   52170, -558064, 1592440, 6290049, 1502534, -546669, 53047,
+     1930,   51089, -568920, 1683709, 6286575, 1414057, -534782, 53729,
+     2228,   49794, -579194, 1776276, 6279634, 1327070, -522445, 54228,
+     2552,   48275, -588839, 1870070, 6269231, 1241632, -509702, 54550,
+     2904,   46523, -597808, 1965017, 6255380, 1157798, -496595, 54708,
+     3287,   44529, -606054, 2061044, 6238099, 1075621, -483164, 54710,
+     3704,   42282, -613529, 2158071, 6217408,  995149, -469451, 54566,
+     4152,   39774, -620186, 2256019, 6193332,  916430, -455494, 54285,
+     4631,   36995, -625976, 2354805, 6165900,  839507, -441330, 53876,
+     5139,   33937, -630850, 2454343, 6135146,  764419, -426998, 53348,
+     5682,   30591, -634759, 2554547, 6101107,  691203, -412531, 52711,
+     6264,   26948, -637655, 2655326, 6063824,  619894, -397966, 51972,
+     6886,   23001, -639488, 2756591, 6023343,  550521, -383335, 51140,
+     7531,   18741, -640210, 2858248, 5979711,  483113, -368671, 50224,
+     8230,   14162, -639772, 2960201, 5932981,  417692, -354003, 49231,
+     8959,    9257, -638125, 3062355, 5883210,  354281, -339362, 48168,
+     9727,    4018, -635222, 3164612, 5830457,  292897, -324777, 47044,
+    10535,   -1558, -631014, 3266872, 5774785,  233555, -310273, 45866,
+    11381,   -7480, -625455, 3369035, 5716260,  176267, -295877, 44640,
+    12267,  -13750, -618499, 3471000, 5654952,  121042, -281613, 43373,
+    13190,  -20372, -610098, 3572664, 5590933,   67886, -267505, 42072,
+    14152,  -27352, -600209, 3673924, 5524280,   16800, -253574, 40743,
+    15153,  -34691, -588788, 3774676, 5455069,  -32214, -239840, 39391,
+    16192,  -42390, -575791, 3874816, 5383383,  -79159, -226323, 38022,
+    17267,  -50453, -561178, 3974239, 5309305, -124041, -213041, 36642,
+    18377,  -58879, -544906, 4072841, 5232922, -166869, -200010, 35256,
+    19525,  -67667, -526937, 4170517, 5154321, -207653, -187246, 33866,
+    20704,  -76817, -507233, 4267162, 5073593, -246406, -174764, 32480,
+    21915,  -86327, -485757, 4362672, 4990831, -283146, -162575, 31101,
+    23157,  -96193, -462476, 4456942, 4906129, -317890, -150692, 29732,
+    24426, -106412, -437356, 4549871, 4819584, -350658, -139125, 28376,
+    25721, -116977, -410365, 4641355, 4731293, -381475, -127884, 27038
+};
+
+DECLARE_ALIGNED(16, const int32_t, ff_dca_fir_64bands_fixed)[1024] = {
+         -38,      -38,      -43,      -48,
+         -52,      -57,      -62,      -67,
+         -73,      -79,      -85,      -91,
+         -98,     -105,     -113,     -121,
+        -129,     -138,     -147,     -157,
+        -167,     -178,     -190,     -202,
+        -215,     -228,     -242,     -257,
+        -273,     -289,     -307,     -325,
+         345,      365,      387,      410,
+         433,      459,      485,      513,
+         543,      574,      606,      640,
+         676,      714,      753,      795,
+         839,      884,      932,      983,
+        1035,     1090,     1148,     1208,
+        1271,     1336,     1405,     1476,
+        1550,     1628,     1709,     1793,
+        1880,     1971,     2065,     2163,
+        2265,     2370,     2479,     2592,
+        2709,     2830,     2955,     3084,
+        3217,     3354,     3496,     3642,
+        3791,     3946,     4104,     4267,
+        4433,     4604,     4780,     4959,
+        5143,     5330,     5522,     5717,
+        5916,     6119,     6326,     6536,
+        6749,     6966,     7186,     7408,
+        7633,     7861,     8090,     8322,
+        8556,     8791,     9027,     9264,
+        9501,     9739,     9977,    10214,
+       10450,    10685,    10918,    11149,
+       11377,    11603,    11825,    12042,
+       12255,    12463,    12665,    12861,
+       13050,    13231,    13405,    13569,
+       13724,    13869,    14002,    14125,
+       14235,    14331,    14415,    14483,
+       14536,    14573,    14594,    14596,
+       14580,    14544,    14488,    14412,
+       14313,    14191,    14046,    13877,
+       13682,    13461,    13213,    12937,
+       12632,    12298,    11934,    11538,
+       11111,    10650,    10156,     9628,
+       -9065,    -8466,    -7830,    -7158,
+       -6447,    -5698,    -4910,    -4083,
+       -3215,    -2306,    -1357,     -366,
+         668,     1743,     2861,     4022,
+        5226,     6474,     7764,     9098,
+       10476,    11897,    13361,    14868,
+       16418,    18011,    19645,    21322,
+       23039,    24798,    26596,    28433,
+       30309,    32222,    34172,    36158,
+       38177,    40231,    42315,    44431,
+       46575,    48747,    50945,    53167,
+       55411,    57676,    59959,    62258,
+       64571,    66897,    69231,    71573,
+       73919,    76268,    78615,    80959,
+       83296,    85624,    87939,    90239,
+       92519,    94778,    97011,    99215,
+      101386,   103521,   105616,   107668,
+      109673,   111626,   113524,   115362,
+      117138,   118847,   120484,   122045,
+      123527,   124925,   126234,   127451,
+      128571,   129591,   130504,   131308,
+      131997,   132568,   133016,   133338,
+      133528,   133582,   133495,   133265,
+      132886,   132355,   131668,   130820,
+      129807,   128626,   127274,   125746,
+      124038,   122148,   120071,   117806,
+      115348,   112694,   109843,   106790,
+      103534,   100071,    96401,    92520,
+       88426,    84119,    79597,    74857,
+       69900,    64723,    59327,    53711,
+       47875,    41818,    35542,    29045,
+       22330,    15397,     8247,      881,
+        6697,    14487,    22487,    30692,
+       39101,    47711,    56517,    65516,
+       74704,    84076,    93628,   103355,
+      113251,   123311,   133528,   143897,
+      154410,   165061,   175843,   186747,
+      197766,   208892,   220116,   231429,
+      242822,   254285,   265810,   277384,
+      288999,   300644,   312306,   323976,
+      335641,   347289,   358909,   370488,
+      382013,   393471,   404848,   416133,
+      427310,   438366,   449286,   460057,
+      470663,   481090,   491323,   501347,
+      511147,   520707,   530011,   539044,
+      547790,   556233,   564357,   572146,
+      579584,   586654,   593341,   599627,
+      605498,   610936,   615925,   620449,
+      624491,   628037,   631069,   633571,
+      635529,   636925,   637745,   637972,
+      637593,   636592,   634953,   632662,
+      629705,   626068,   621737,   616698,
+      610938,   604443,   597202,   589202,
+      580431,   570877,   560530,   549378,
+      537411,   524620,   510994,   496525,
+      481205,   465026,   447979,   430058,
+      411256,   391569,   370989,   349514,
+      327137,   303857,   279670,   254573,
+      228564,   201644,   173811,   145065,
+      115408,    84840,    53365,    20984,
+      -12298,   -46478,   -81550,  -117508,
+     -154347,  -192060,  -230638,  -270073,
+     -310356,  -351478,  -393427,  -436192,
+     -479762,  -524124,  -569264,  -615168,
+      661821,   709209,   757314,   806121,
+      855611,   905766,   956569,  1007998,
+     1060035,  1112658,  1165846,  1219578,
+     1273830,  1328580,  1383805,  1439479,
+     1495578,  1552077,  1608950,  1666171,
+     1723714,  1781550,  1839653,  1897995,
+     1956546,  2015279,  2074163,  2133170,
+     2192270,  2251432,  2310626,  2369822,
+     2428988,  2488093,  2547106,  2605996,
+     2664731,  2723279,  2781607,  2839685,
+     2897481,  2954962,  3012096,  3068851,
+     3125195,  3181097,  3236524,  3291445,
+     3345829,  3399643,  3452858,  3505441,
+     3557362,  3608591,  3659098,  3708853,
+     3757825,  3805987,  3853309,  3899763,
+     3945322,  3989957,  4033642,  4076350,
+     4118055,  4158733,  4198357,  4236904,
+     4274351,  4310673,  4345850,  4379859,
+     4412678,  4444289,  4474670,  4503803,
+     4531671,  4558255,  4583539,  4607508,
+     4630146,  4651438,  4671373,  4689936,
+     4707117,  4722905,  4737290,  4750262,
+     4761813,  4771936,  4780625,  4787874,
+     4793679,  4798036,  4800943,  4802396,
+     4802396,  4800943,  4798036,  4793679,
+     4787874,  4780625,  4771936,  4761813,
+     4750262,  4737290,  4722905,  4707117,
+     4689936,  4671373,  4651438,  4630146,
+     4607508,  4583539,  4558255,  4531671,
+     4503803,  4474670,  4444289,  4412678,
+     4379859,  4345850,  4310673,  4274351,
+     4236904,  4198357,  4158733,  4118055,
+    -4076350, -4033642, -3989957, -3945322,
+    -3899763, -3853309, -3805987, -3757825,
+    -3708853, -3659098, -3608591, -3557362,
+    -3505441, -3452858, -3399643, -3345829,
+    -3291445, -3236524, -3181097, -3125195,
+    -3068851, -3012096, -2954962, -2897481,
+    -2839685, -2781607, -2723279, -2664731,
+    -2605996, -2547106, -2488093, -2428988,
+    -2369822, -2310626, -2251432, -2192270,
+    -2133170, -2074163, -2015279, -1956546,
+    -1897995, -1839653, -1781550, -1723714,
+    -1666171, -1608950, -1552077, -1495578,
+    -1439479, -1383805, -1328580, -1273830,
+    -1219578, -1165846, -1112658, -1060035,
+    -1007998,  -956569,  -905766,  -855611,
+     -806121,  -757314,  -709209,  -661821,
+     -615168,  -569264,  -524124,  -479762,
+     -436192,  -393427,  -351478,  -310356,
+     -270073,  -230638,  -192060,  -154347,
+     -117508,   -81550,   -46478,   -12298,
+       20984,    53365,    84840,   115408,
+      145065,   173811,   201644,   228564,
+      254573,   279670,   303857,   327137,
+      349514,   370989,   391569,   411256,
+      430058,   447979,   465026,   481205,
+      496525,   510994,   524620,   537411,
+      549378,   560530,   570877,   580431,
+      589202,   597202,   604443,   610938,
+      616698,   621737,   626068,   629705,
+      632662,   634953,   636592,   637593,
+      637972,   637745,   636925,   635529,
+      633571,   631069,   628037,   624491,
+     -620449,  -615925,  -610936,  -605498,
+     -599627,  -593341,  -586654,  -579584,
+     -572146,  -564357,  -556233,  -547790,
+     -539044,  -530011,  -520707,  -511147,
+     -501347,  -491323,  -481090,  -470663,
+     -460057,  -449286,  -438366,  -427310,
+     -416133,  -404848,  -393471,  -382013,
+     -370488,  -358909,  -347289,  -335641,
+     -323976,  -312306,  -300644,  -288999,
+     -277384,  -265810,  -254285,  -242822,
+     -231429,  -220116,  -208892,  -197766,
+     -186747,  -175843,  -165061,  -154410,
+     -143897,  -133528,  -123311,  -113251,
+     -103355,   -93628,   -84076,   -74704,
+      -65516,   -56517,   -47711,   -39101,
+      -30692,   -22487,   -14487,    -6697,
+         881,     8247,    15397,    22330,
+       29045,    35542,    41818,    47875,
+       53711,    59327,    64723,    69900,
+       74857,    79597,    84119,    88426,
+       92520,    96401,   100071,   103534,
+      106790,   109843,   112694,   115348,
+      117806,   120071,   122148,   124038,
+      125746,   127274,   128626,   129807,
+      130820,   131668,   132355,   132886,
+      133265,   133495,   133582,   133528,
+      133338,   133016,   132568,   131997,
+      131308,   130504,   129591,   128571,
+      127451,   126234,   124925,   123527,
+      122045,   120484,   118847,   117138,
+      115362,   113524,   111626,   109673,
+      107668,   105616,   103521,   101386,
+      -99215,   -97011,   -94778,   -92519,
+      -90239,   -87939,   -85624,   -83296,
+      -80959,   -78615,   -76268,   -73919,
+      -71573,   -69231,   -66897,   -64571,
+      -62258,   -59959,   -57676,   -55411,
+      -53167,   -50945,   -48747,   -46575,
+      -44431,   -42315,   -40231,   -38177,
+      -36158,   -34172,   -32222,   -30309,
+      -28433,   -26596,   -24798,   -23039,
+      -21322,   -19645,   -18011,   -16418,
+      -14868,   -13361,   -11897,   -10476,
+       -9098,    -7764,    -6474,    -5226,
+       -4022,    -2861,    -1743,     -668,
+         366,     1357,     2306,     3215,
+        4083,     4910,     5698,     6447,
+        7158,     7830,     8466,     9065,
+        9628,    10156,    10650,    11111,
+       11538,    11934,    12298,    12632,
+       12937,    13213,    13461,    13682,
+       13877,    14046,    14191,    14313,
+       14412,    14488,    14544,    14580,
+       14596,    14594,    14573,    14536,
+       14483,    14415,    14331,    14235,
+       14125,    14002,    13869,    13724,
+       13569,    13405,    13231,    13050,
+       12861,    12665,    12463,    12255,
+       12042,    11825,    11603,    11377,
+       11149,    10918,    10685,    10450,
+       10214,     9977,     9739,     9501,
+        9264,     9027,     8791,     8556,
+        8322,     8090,     7861,     7633,
+        7408,     7186,     6966,     6749,
+       -6536,    -6326,    -6119,    -5916,
+       -5717,    -5522,    -5330,    -5143,
+       -4959,    -4780,    -4604,    -4433,
+       -4267,    -4104,    -3946,    -3791,
+       -3642,    -3496,    -3354,    -3217,
+       -3084,    -2955,    -2830,    -2709,
+       -2592,    -2479,    -2370,    -2265,
+       -2163,    -2065,    -1971,    -1880,
+       -1793,    -1709,    -1628,    -1550,
+       -1476,    -1405,    -1336,    -1271,
+       -1208,    -1148,    -1090,    -1035,
+        -983,     -932,     -884,     -839,
+        -795,     -753,     -714,     -676,
+        -640,     -606,     -574,     -543,
+        -513,     -485,     -459,     -433,
+        -410,     -387,     -365,     -345,
+        -325,     -307,     -289,     -273,
+        -257,     -242,     -228,     -215,
+        -202,     -190,     -178,     -167,
+        -157,     -147,     -138,     -129,
+        -121,     -113,     -105,      -98,
+         -91,      -85,      -79,      -73,
+         -67,      -62,      -57,      -52,
+         -48,      -43,      -38,      -38
+};
+
 /*
  * D.11 Look-up Table for Downmix Scale Factors
  *
@@ -8176,216 +8702,495 @@ const uint32_t ff_dca_inv_dmixtable[FF_DCA_INV_DMIXTABLE_SIZE] = {
       65536,
 };
 
-const float ff_dca_default_coeffs[10][6][2] = {
-    { { 0.707107, 0.707107 }, { 0.000000, 0.000000 },                                                                                                 }, // A [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // A + B (dual mono) [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // L + R (stereo) [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // (L+R) + (L-R) (sum-difference) [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // LT + RT (left and right total) [LFE]
-    { { 0.501187, 0.501187 }, { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.000000, 0.000000 },                                                 }, // C + L + R [LFE]
-    { { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.501187 }, { 0.000000, 0.000000 },                                                 }, // L + R + S [LFE]
-    { { 0.501187, 0.501187 }, { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.501187 }, { 0.000000, 0.000000 },                         }, // C + L + R + S [LFE]
-    { { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.000000 }, { 0.000000, 0.501187 }, { 0.000000, 0.000000 },                         }, // L + R + SL + SR [LFE]
-    { { 0.501187, 0.501187 }, { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.000000 }, { 0.000000, 0.501187 }, { 0.000000, 0.000000 }, }, // C + L + R + SL + SR [LFE]
+const uint16_t ff_dca_xll_refl_coeff[128] = {
+        0,  3070,  5110,  7140,  9156, 11154, 13132, 15085,
+    17010, 18904, 20764, 22588, 24373, 26117, 27818, 29474,
+    31085, 32648, 34164, 35631, 37049, 38418, 39738, 41008,
+    42230, 43404, 44530, 45609, 46642, 47630, 48575, 49477,
+    50337, 51157, 51937, 52681, 53387, 54059, 54697, 55302,
+    55876, 56421, 56937, 57426, 57888, 58326, 58741, 59132,
+    59502, 59852, 60182, 60494, 60789, 61066, 61328, 61576,
+    61809, 62029, 62236, 62431, 62615, 62788, 62951, 63105,
+    63250, 63386, 63514, 63635, 63749, 63855, 63956, 64051,
+    64140, 64224, 64302, 64376, 64446, 64512, 64573, 64631,
+    64686, 64737, 64785, 64830, 64873, 64913, 64950, 64986,
+    65019, 65050, 65079, 65107, 65133, 65157, 65180, 65202,
+    65222, 65241, 65259, 65275, 65291, 65306, 65320, 65333,
+    65345, 65357, 65368, 65378, 65387, 65396, 65405, 65413,
+    65420, 65427, 65434, 65440, 65446, 65451, 65456, 65461,
+    65466, 65470, 65474, 65478, 65481, 65485, 65488, 65491
 };
 
-const int32_t ff_dca_sampling_freqs[16] = {
-      8000,  16000, 32000, 64000, 128000, 22050,  44100,  88200,
-    176400, 352800, 12000, 24000,  48000, 96000, 192000, 384000,
+const int32_t ff_dca_xll_band_coeff[20] = {
+      868669, -5931642, -1228483,  4194304,
+      -20577,   122631,  -393647,   904476,
+    -1696305,  2825313, -4430736,  6791313,
+       41153,  -245210,   785564, -1788164,
+     3259333, -5074941,  6928550, -8204883
 };
 
-/* downmix coeffs
- *
- * TABLE 9
- * ______________________________________
- * Down-mix coefficients for 8-channel source
- * audio (5 + 3 format)
- * lt
- * cen- rt lt ctr rt
- * lt ter ctr center
- * rt srd srd srd
- * ______________________________________
- * 1 0.71 0.74 1.0 0.71 0.71 0.58 0.58 0.58
- * 2 left 1.0 0.89 0.71 0.46 0.71 0.50
- * rt 0.45 0.71 0.89 1.0 0.50 0.71
- * 3 lt 1.0 0.89 0.71 0.45
- * rt 0.45 0.71 0.89 1.0
- * srd 0.71 0.71 0.71
- * 4 lt 1.0 0.89 0.71 0.45
- * rt 0.45 0.71 0.89 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 0.71
- * 4 lt 1.0 0.5
- * ctr 0.87 1.0 0.87
- * rt 0.5 1.0
- * srd 0.71 0.71 0.71
- * 5 lt 1.0 0.5
- * ctr 0.87 1.0 0.87
- * rt 0.5 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 1.0
- * 6 lt 1.0 0.5
- * lt ctr 0.87 0.71
- * rt ctr 0.71 0.87
- * rt 0.5 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 1.0
- * 6 lt 1.0 0.5
- * ctr 0.86 1.0 0.86
- * rt 0.5 1.0
- * lt srd 1.0
- * ctr srd 1.0
- * rt srd 1.0
- * 7 lt 1.0
- * lt ctr 1.0
- * ctr 1.0
- * rt ctr 1.0
- * rt 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 1.0
- * 7 lt 1.0 0.5
- * lt ctr 0.87 0.71
- * rt ctr 0.71 0.87
- * rt 0.5 1.0
- * lt srd 1.0
- * ctr srd 1.0
- * rt srd 1.0
- * 8 lt 1.0 0.5
- * lt ctr 0.87 0.71
- * rt ctr 0.71 0.87
- * rt 0.5 1.0
- * lt 1 srd 0.87 0.35
- * lt 2 srd 0.5 0.61
- * rt 2 srd 0.61 0.50
- * rt 2 srd 0.35 0.87
- *
- * Generation of Lt Rt
- *
- * In the case when the playback system has analog or digital surround
- * multi-channel capability, a down matrix from 5, 4, or 3 channel to
- * Lt Rt may be desirable. In the case when the number of decoded audio
- * channels exceeds 5, 4 or 3 respectively a first stage down mix to 5,
- * 4 or 3 chs should be used as described above.
- *
- * The down matrixing equations for 5-channel source audio to a
- * two-channel Lt Rt playback system are given by:
- *
- * Left  = left  + 0.7 * center - 0.7 * (lt surround + rt surround)
- *
- * Right = right + 0.7 * center + 0.7 * (lt surround + rt surround)
- *
- * Embedded mixing to 2-channel
- *
- * One concern arising from the proliferation of multi-channel audio
- * systems is that most home systems presently have only two channel
- * playback capability. To accommodate this a fixed 2-channel down
- * matrix processes is commonly used following the multi-channel
- * decoding stage. However, for music only applications the image
- * quality etc. of the down matrixed signal may not match that of an
- * equivalent stereo recording found on CD.
- *
- * The concept of embedded mixing is to allow the producer to
- * dynamically specify the matrixing coefficients within the audio
- * frame itself. In this way the stereo down mix at the decoder may be
- * better matched to a 2-channel playback environment.
- *
- * CHS*2, 7-bit down mix indexes (MCOEFFS) are transmitted along with
- * the multi-channel audio once in every frame. The indexes are
- * converted to attenuation factors using a 7 bit LUT. The 2-ch down
- * mix equations are as follows,
- *
- * Left Ch  = sum (MCOEFF[n]       * Ch[n]) for n=1, CHS
- *
- * Right Ch = sum (MCOEFF[n + CHS] * Ch[n]) for n=1, CHS
- *
- * where Ch(n) represents the subband samples in the (n)th audio channel.
- */
+const uint16_t ff_dca_avg_g3_freqs[3] = { 16000, 18000, 24000 };
+
+const uint16_t ff_dca_fst_amp[44] = {
+       0,    1,    2,    3,
+       4,    6,    8,   10,
+      12,   16,   20,   24,
+      28,   36,   44,   52,
+      60,   76,   92,  108,
+     124,  156,  188,  220,
+     252,  316,  380,  444,
+     508,  636,  764,  892,
+    1020, 1276, 1532, 1788,
+    2044, 2556, 3068, 3580,
+    4092, 5116, 6140, 7164
+};
+
+const uint8_t ff_dca_freq_to_sb[32] = {
+    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+};
+
+const int8_t ff_dca_ph0_shift[8] = {
+    -32, +96, -96, +32, +96, -32, +32, -96
+};
+
+const uint8_t ff_dca_grid_1_to_scf[11] = {
+    0, 1, 2, 3, 4, 6, 7, 10, 14, 19, 26
+};
+
+const uint8_t ff_dca_grid_2_to_scf[3] = {
+    4, 10, 18
+};
+
+const uint8_t ff_dca_scf_to_grid_1[32] = {
+    0, 1, 2, 3, 4, 4, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7,
+    7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
+const uint8_t ff_dca_scf_to_grid_2[32] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+    1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+const uint8_t ff_dca_grid_1_weights[12][32] = {
+    {
+        128,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0, 128,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0, 128,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0, 128,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0, 128, 128,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0, 128,  85,
+         43,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,  43,
+         85, 128,  96,  64,  32,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,  32,  64,  96, 128, 102,  77,
+         51,  26,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,  26,  51,
+         77, 102, 128, 107,  85,  64,  43,  21,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,  21,  43,  64,  85, 107,
+        128, 110,  91,  73,  55,  37,  18,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,  18,  37,  55,  73,  91, 110, 128,
+    }, {
+        0 /* empty */
+    }
+};
+
+const uint8_t ff_dca_sb_reorder[8][8] = {
+    { 0, 1, 2, 3, 4, 5, 6, 7 },
+    { 1, 0, 2, 3, 4, 5, 6, 7 },
+    { 3, 1, 0, 2, 4, 5, 6, 7 },
+    { 1, 2, 3, 0, 4, 5, 6, 7 },
+    { 1, 2, 5, 3, 0, 4, 6, 7 },
+    { 1, 2, 2, 5, 3, 0, 4, 6 },
+    { 1, 2, 2, 6, 5, 3, 0, 4 },
+    { 1, 2, 2, 6, 5, 4, 0, 3 }
+};
+
+const int8_t ff_dca_lfe_delta_index_16[8] = {
+    -4, -3, -2, -1, 2, 4, 6, 8
+};
+
+const int8_t ff_dca_lfe_delta_index_24[32] = {
+    -8, -8, -7, -7, -6, -6, -5, -5, -4, -4, -3, -3, -2, -2, -1, -1,
+     1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8
+};
+
+const uint16_t ff_dca_rsd_pack_5_in_8[256] = {
+    0x0000, 0x0100, 0x0200, 0x0040, 0x0140, 0x0240, 0x0080, 0x0180,
+    0x0280, 0x0010, 0x0110, 0x0210, 0x0050, 0x0150, 0x0250, 0x0090,
+    0x0190, 0x0290, 0x0020, 0x0120, 0x0220, 0x0060, 0x0160, 0x0260,
+    0x00a0, 0x01a0, 0x02a0, 0x0004, 0x0104, 0x0204, 0x0044, 0x0144,
+    0x0244, 0x0084, 0x0184, 0x0284, 0x0014, 0x0114, 0x0214, 0x0054,
+    0x0154, 0x0254, 0x0094, 0x0194, 0x0294, 0x0024, 0x0124, 0x0224,
+    0x0064, 0x0164, 0x0264, 0x00a4, 0x01a4, 0x02a4, 0x0008, 0x0108,
+    0x0208, 0x0048, 0x0148, 0x0248, 0x0088, 0x0188, 0x0288, 0x0018,
+    0x0118, 0x0218, 0x0058, 0x0158, 0x0258, 0x0098, 0x0198, 0x0298,
+    0x0028, 0x0128, 0x0228, 0x0068, 0x0168, 0x0268, 0x00a8, 0x01a8,
+    0x02a8, 0x0001, 0x0101, 0x0201, 0x0041, 0x0141, 0x0241, 0x0081,
+    0x0181, 0x0281, 0x0011, 0x0111, 0x0211, 0x0051, 0x0151, 0x0251,
+    0x0091, 0x0191, 0x0291, 0x0021, 0x0121, 0x0221, 0x0061, 0x0161,
+    0x0261, 0x00a1, 0x01a1, 0x02a1, 0x0005, 0x0105, 0x0205, 0x0045,
+    0x0145, 0x0245, 0x0085, 0x0185, 0x0285, 0x0015, 0x0115, 0x0215,
+    0x0055, 0x0155, 0x0255, 0x0095, 0x0195, 0x0295, 0x0025, 0x0125,
+    0x0225, 0x0065, 0x0165, 0x0265, 0x00a5, 0x01a5, 0x02a5, 0x0009,
+    0x0109, 0x0209, 0x0049, 0x0149, 0x0249, 0x0089, 0x0189, 0x0289,
+    0x0019, 0x0119, 0x0219, 0x0059, 0x0159, 0x0259, 0x0099, 0x0199,
+    0x0299, 0x0029, 0x0129, 0x0229, 0x0069, 0x0169, 0x0269, 0x00a9,
+    0x01a9, 0x02a9, 0x0002, 0x0102, 0x0202, 0x0042, 0x0142, 0x0242,
+    0x0082, 0x0182, 0x0282, 0x0012, 0x0112, 0x0212, 0x0052, 0x0152,
+    0x0252, 0x0092, 0x0192, 0x0292, 0x0022, 0x0122, 0x0222, 0x0062,
+    0x0162, 0x0262, 0x00a2, 0x01a2, 0x02a2, 0x0006, 0x0106, 0x0206,
+    0x0046, 0x0146, 0x0246, 0x0086, 0x0186, 0x0286, 0x0016, 0x0116,
+    0x0216, 0x0056, 0x0156, 0x0256, 0x0096, 0x0196, 0x0296, 0x0026,
+    0x0126, 0x0226, 0x0066, 0x0166, 0x0266, 0x00a6, 0x01a6, 0x02a6,
+    0x000a, 0x010a, 0x020a, 0x004a, 0x014a, 0x024a, 0x008a, 0x018a,
+    0x028a, 0x001a, 0x011a, 0x021a, 0x005a, 0x015a, 0x025a, 0x009a,
+    0x019a, 0x029a, 0x002a, 0x012a, 0x022a, 0x006a, 0x016a, 0x026a,
+    0x00aa, 0x01aa, 0x02aa, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155,
+    0x0155, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155
+};
+
+const uint8_t ff_dca_rsd_pack_3_in_7[128][3] = {
+    { 0, 0, 0 }, { 0, 0, 1 }, { 0, 0, 2 }, { 0, 0, 3 },
+    { 0, 0, 4 }, { 0, 1, 0 }, { 0, 1, 1 }, { 0, 1, 2 },
+    { 0, 1, 3 }, { 0, 1, 4 }, { 0, 2, 0 }, { 0, 2, 1 },
+    { 0, 2, 2 }, { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 0 },
+    { 0, 3, 1 }, { 0, 3, 2 }, { 0, 3, 3 }, { 0, 3, 4 },
+    { 0, 4, 0 }, { 0, 4, 1 }, { 0, 4, 2 }, { 0, 4, 3 },
+    { 0, 4, 4 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 0, 2 },
+    { 1, 0, 3 }, { 1, 0, 4 }, { 1, 1, 0 }, { 1, 1, 1 },
+    { 1, 1, 2 }, { 1, 1, 3 }, { 1, 1, 4 }, { 1, 2, 0 },
+    { 1, 2, 1 }, { 1, 2, 2 }, { 1, 2, 3 }, { 1, 2, 4 },
+    { 1, 3, 0 }, { 1, 3, 1 }, { 1, 3, 2 }, { 1, 3, 3 },
+    { 1, 3, 4 }, { 1, 4, 0 }, { 1, 4, 1 }, { 1, 4, 2 },
+    { 1, 4, 3 }, { 1, 4, 4 }, { 2, 0, 0 }, { 2, 0, 1 },
+    { 2, 0, 2 }, { 2, 0, 3 }, { 2, 0, 4 }, { 2, 1, 0 },
+    { 2, 1, 1 }, { 2, 1, 2 }, { 2, 1, 3 }, { 2, 1, 4 },
+    { 2, 2, 0 }, { 2, 2, 1 }, { 2, 2, 2 }, { 2, 2, 3 },
+    { 2, 2, 4 }, { 2, 3, 0 }, { 2, 3, 1 }, { 2, 3, 2 },
+    { 2, 3, 3 }, { 2, 3, 4 }, { 2, 4, 0 }, { 2, 4, 1 },
+    { 2, 4, 2 }, { 2, 4, 3 }, { 2, 4, 4 }, { 3, 0, 0 },
+    { 3, 0, 1 }, { 3, 0, 2 }, { 3, 0, 3 }, { 3, 0, 4 },
+    { 3, 1, 0 }, { 3, 1, 1 }, { 3, 1, 2 }, { 3, 1, 3 },
+    { 3, 1, 4 }, { 3, 2, 0 }, { 3, 2, 1 }, { 3, 2, 2 },
+    { 3, 2, 3 }, { 3, 2, 4 }, { 3, 3, 0 }, { 3, 3, 1 },
+    { 3, 3, 2 }, { 3, 3, 3 }, { 3, 3, 4 }, { 3, 4, 0 },
+    { 3, 4, 1 }, { 3, 4, 2 }, { 3, 4, 3 }, { 3, 4, 4 },
+    { 4, 0, 0 }, { 4, 0, 1 }, { 4, 0, 2 }, { 4, 0, 3 },
+    { 4, 0, 4 }, { 4, 1, 0 }, { 4, 1, 1 }, { 4, 1, 2 },
+    { 4, 1, 3 }, { 4, 1, 4 }, { 4, 2, 0 }, { 4, 2, 1 },
+    { 4, 2, 2 }, { 4, 2, 3 }, { 4, 2, 4 }, { 4, 3, 0 },
+    { 4, 3, 1 }, { 4, 3, 2 }, { 4, 3, 3 }, { 4, 3, 4 },
+    { 4, 4, 0 }, { 4, 4, 1 }, { 4, 4, 2 }, { 4, 4, 3 },
+    { 4, 4, 4 }, { 2, 2, 2 }, { 2, 2, 2 }, { 2, 2, 2 }
+};
+
+const float ff_dca_rsd_level_2a[2] = {
+    -0.47, 0.47
+};
+
+const float ff_dca_rsd_level_2b[2] = {
+    -0.645, 0.645
+};
+
+const float ff_dca_rsd_level_3[3] = {
+    -0.645, 0.0, 0.645
+};
+
+const float ff_dca_rsd_level_5[5] = {
+    -0.875, -0.375, 0.0, 0.375, 0.875
+};
+
+const float ff_dca_rsd_level_8[8] = {
+    -1.0, -0.625, -0.291666667, 0.0, 0.25, 0.5, 0.75, 1.0
+};
+
+const float ff_dca_rsd_level_16[16] = {
+    -1.3125, -1.1375, -0.9625, -0.7875,
+    -0.6125, -0.4375, -0.2625, -0.0875,
+     0.0875,  0.2625,  0.4375,  0.6125,
+     0.7875,  0.9625,  1.1375,  1.3125
+};
+
+const float ff_dca_synth_env[32] = {
+    0.00240763666390, 0.00960735979838, 0.02152983213390, 0.03806023374436,
+    0.05903936782582, 0.08426519384873, 0.11349477331863, 0.14644660940673,
+    0.18280335791818, 0.22221488349020, 0.26430163158700, 0.30865828381746,
+    0.35485766137277, 0.40245483899194, 0.45099142983522, 0.5,
+    0.54900857016478, 0.59754516100806, 0.64514233862723, 0.69134171618254,
+    0.73569836841300, 0.77778511650980, 0.81719664208182, 0.85355339059327,
+    0.88650522668137, 0.91573480615127, 0.94096063217418, 0.96193976625564,
+    0.97847016786610, 0.99039264020162, 0.99759236333610, 1.0
+};
+
+const float ff_dca_corr_cf[32][11] = {
+    {-0.01179, 0.04281, 0.46712, 0.46345,-3.94525, 3.94525,
+     -0.46345,-0.46712,-0.04281, 0.01179,-0.00299 },
+    {-0.00929, 0.04882, 0.45252, 0.37972,-3.85446, 4.03189,
+     -0.55069,-0.48040,-0.03599, 0.01445,-0.00229 },
+    {-0.00696, 0.05403, 0.43674, 0.29961,-3.75975, 4.11413,
+     -0.64135,-0.49221,-0.02834, 0.01726,-0.00156 },
+    {-0.00481, 0.05847, 0.41993, 0.22319,-3.66138, 4.19175,
+     -0.73529,-0.50241,-0.01983, 0.02021,-0.00080 },
+    {-0.00284, 0.06216, 0.40224, 0.15053,-3.55963, 4.26452,
+     -0.83239,-0.51085,-0.01047, 0.02328,-0.00003 },
+    {-0.00105, 0.06515, 0.38378, 0.08168,-3.45475, 4.33225,
+     -0.93249,-0.51738,-0.00024, 0.02646, 0.00074 },
+    { 0.00054, 0.06745, 0.36471, 0.01668,-3.34703, 4.39475,
+     -1.03543,-0.52184, 0.01085, 0.02973, 0.00152 },
+    { 0.00195, 0.06912, 0.34515,-0.04445,-3.23676, 4.45185,
+     -1.14105,-0.52410, 0.02280, 0.03306, 0.00228 },
+    { 0.00318, 0.07017, 0.32521,-0.10168,-3.12422, 4.50339,
+     -1.24914,-0.52400, 0.03561, 0.03643, 0.00302 },
+    { 0.00422, 0.07065, 0.30503,-0.15503,-3.00969, 4.54921,
+     -1.35952,-0.52141, 0.04925, 0.03981, 0.00373 },
+    { 0.00508, 0.07061, 0.28471,-0.20450,-2.89348, 4.58919,
+     -1.47197,-0.51618, 0.06370, 0.04319, 0.00440 },
+    { 0.00577, 0.07007, 0.26436,-0.25013,-2.77587, 4.62321,
+     -1.58627,-0.50818, 0.07895, 0.04652, 0.00501 },
+    { 0.00629, 0.06909, 0.24410,-0.29194,-2.65716, 4.65118,
+     -1.70219,-0.49727, 0.09494, 0.04979, 0.00556 },
+    { 0.00666, 0.06769, 0.22400,-0.33000,-2.53764, 4.67302,
+     -1.81949,-0.48335, 0.11166, 0.05295, 0.00604 },
+    { 0.00687, 0.06592, 0.20416,-0.36435,-2.41760, 4.68866,
+     -1.93791,-0.46627, 0.12904, 0.05597, 0.00642 },
+    { 0.00694, 0.06383, 0.18468,-0.39506,-2.29732, 4.69806,
+     -2.05720,-0.44593, 0.14705, 0.05881, 0.00671 },
+    { 0.00689, 0.06144, 0.16561,-0.42223,-2.17710, 4.70120,
+     -2.17710,-0.42223, 0.16561, 0.06144, 0.00689 },
+    { 0.00671, 0.05881, 0.14705,-0.44593,-2.05720, 4.69806,
+     -2.29732,-0.39506, 0.18468, 0.06383, 0.00694 },
+    { 0.00642, 0.05597, 0.12904,-0.46627,-1.93791, 4.68865,
+     -2.41759,-0.36435, 0.20416, 0.06592, 0.00687 },
+    { 0.00604, 0.05295, 0.11166,-0.48334,-1.81949, 4.67301,
+     -2.53763,-0.33000, 0.22400, 0.06769, 0.00666 },
+    { 0.00556, 0.04979, 0.09494,-0.49727,-1.70219, 4.65117,
+     -2.65715,-0.29194, 0.24409, 0.06909, 0.00629 },
+    { 0.00501, 0.04652, 0.07894,-0.50818,-1.58627, 4.62321,
+     -2.77587,-0.25013, 0.26436, 0.07007, 0.00577 },
+    { 0.00440, 0.04319, 0.06370,-0.51618,-1.47197, 4.58919,
+     -2.89348,-0.20450, 0.28471, 0.07061, 0.00508 },
+    { 0.00373, 0.03981, 0.04925,-0.52141,-1.35952, 4.54921,
+     -3.00970,-0.15503, 0.30503, 0.07065, 0.00422 },
+    { 0.00302, 0.03643, 0.03561,-0.52400,-1.24915, 4.50339,
+     -3.12422,-0.10168, 0.32521, 0.07017, 0.00318 },
+    { 0.00228, 0.03306, 0.02280,-0.52410,-1.14105, 4.45186,
+     -3.23677,-0.04445, 0.34515, 0.06912, 0.00195 },
+    { 0.00152, 0.02973, 0.01085,-0.52184,-1.03544, 4.39477,
+     -3.34704, 0.01668, 0.36471, 0.06745, 0.00054 },
+    { 0.00074, 0.02646,-0.00024,-0.51738,-0.93249, 4.33226,
+     -3.45476, 0.08168, 0.38378, 0.06515,-0.00105 },
+    {-0.00003, 0.02328,-0.01047,-0.51085,-0.83239, 4.26452,
+     -3.55963, 0.15053, 0.40224, 0.06216,-0.00284 },
+    {-0.00080, 0.02021,-0.01983,-0.50241,-0.73529, 4.19174,
+     -3.66138, 0.22319, 0.41993, 0.05847,-0.00481 },
+    {-0.00156, 0.01726,-0.02834,-0.49221,-0.64135, 4.11413,
+     -3.75974, 0.29961, 0.43674, 0.05403,-0.00696 },
+    {-0.00229, 0.01445,-0.03599,-0.48040,-0.55069, 4.03188,
+     -3.85445, 0.37972, 0.45251, 0.04882,-0.00929 },
+};
+
+const float ff_dca_quant_amp[57] = {
+    4.88281250E-04, 1.46484375E-03, 2.32267031E-03, 3.28475167E-03,
+    4.64534014E-03, 6.56950334E-03, 9.29068029E-03, 1.31390067E-02,
+    1.85813606E-02, 2.62780134E-02, 3.71627212E-02, 5.25560267E-02,
+    7.43254423E-02, 1.05112053E-01, 1.48650885E-01, 2.10224107E-01,
+    2.97301769E-01, 4.20448214E-01, 5.94603539E-01, 8.40896428E-01,
+    1.18920708E+00, 1.68179286E+00, 2.37841415E+00, 3.36358571E+00,
+    4.75682831E+00, 6.72717142E+00, 9.51365662E+00, 1.34543428E+01,
+    1.90273132E+01, 2.69086857E+01, 3.80546265E+01, 5.38173714E+01,
+    7.61092529E+01, 1.07634743E+02, 1.52218506E+02, 2.15269485E+02,
+    3.04437012E+02, 4.30538971E+02, 6.08874023E+02, 8.61077942E+02,
+    1.21774805E+03, 1.72215588E+03, 2.43549609E+03, 3.44431177E+03,
+    4.87099219E+03, 6.88862354E+03, 9.74198438E+03, 1.37772471E+04,
+    1.94839688E+04, 2.75544941E+04, 3.89679375E+04, 5.51089883E+04,
+    7.79358750E+04, 1.10217977E+05, 1.55871750E+05, 2.20435953E+05,
+    0.00000000E+00,
+};
 
-const int8_t ff_dca_lfe_index[16] = {
-    1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3
+const float ff_dca_st_coeff[34] = {
+    2.69086857E+01, 2.69086857E+01, 1.34543419E+01, 6.72717142E+00,
+    3.36358571E+00, 1.68179286E+00, 8.40896428E-01, 5.94603479E-01,
+    4.20448214E-01, 2.97301799E-01, 2.10224107E-01, 1.48650900E-01,
+    1.05112098E-01, 7.43253976E-02, 5.25560006E-02, 3.71626988E-02,
+    3.12500000E-02, 2.62780003E-02, 1.85813997E-02, 1.31390002E-02,
+    9.29069985E-03, 6.56950008E-03, 4.64530010E-03, 3.28480010E-03,
+    2.32270011E-03, 1.64240005E-03, 1.16130000E-03, 5.80699998E-04,
+    2.90299999E-04, 1.45200000E-04, 7.25999998E-05, 3.62999999E-05,
+    1.82000003E-05, 0.00000000E+00,
 };
 
-const int8_t ff_dca_channel_reorder_lfe[16][9] = {
-    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4, -1, -1, -1, -1, -1 },
-    { 0,  1,  3,  4, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4,  5, -1, -1, -1, -1 },
-    { 3,  4,  0,  1,  5,  6, -1, -1, -1 },
-    { 2,  0,  1,  4,  5,  6, -1, -1, -1 },
-    { 0,  6,  4,  5,  2,  3, -1, -1, -1 },
-    { 4,  2,  5,  0,  1,  6,  7, -1, -1 },
-    { 5,  6,  0,  1,  7,  3,  8,  4, -1 },
-    { 4,  2,  5,  0,  1,  6,  8,  7, -1 },
+const float ff_dca_long_window[128] = {
+    0.00000000E+00, 7.42882412E-06, 5.28020973E-05, 1.71007006E-04,
+    3.96653224E-04, 7.63946096E-04, 1.30655791E-03, 2.05750111E-03,
+    3.04900459E-03, 4.31239139E-03, 5.87796280E-03, 7.77488295E-03,
+    1.00310687E-02, 1.26730874E-02, 1.57260559E-02, 1.92135461E-02,
+    2.31574941E-02, 2.75781266E-02, 3.24938744E-02, 3.79213169E-02,
+    4.38751020E-02, 5.03679104E-02, 5.74104004E-02, 6.50111660E-02,
+    7.31767192E-02, 8.19114447E-02, 9.12176073E-02, 1.01095326E-01,
+    1.11542597E-01, 1.22555278E-01, 1.34127125E-01, 1.46249816E-01,
+    1.58912972E-01, 1.72104210E-01, 1.85809180E-01, 2.00011641E-01,
+    2.14693516E-01, 2.29834959E-01, 2.45414421E-01, 2.61408776E-01,
+    2.77793378E-01, 2.94542134E-01, 3.11627686E-01, 3.29021394E-01,
+    3.46693635E-01, 3.64613682E-01, 3.82750064E-01, 4.01070446E-01,
+    4.19541985E-01, 4.38131332E-01, 4.56804723E-01, 4.75528270E-01,
+    4.94267941E-01, 5.12989700E-01, 5.31659782E-01, 5.50244689E-01,
+    5.68711281E-01, 5.87027133E-01, 6.05160415E-01, 6.23080134E-01,
+    6.40756190E-01, 6.58159554E-01, 6.75262392E-01, 6.92038059E-01,
+    7.08461344E-01, 7.24508464E-01, 7.40157187E-01, 7.55386829E-01,
+    7.70178556E-01, 7.84515142E-01, 7.98381269E-01, 8.11763465E-01,
+    8.24650168E-01, 8.37031603E-01, 8.48900259E-01, 8.60250235E-01,
+    8.71077836E-01, 8.81381273E-01, 8.91160548E-01, 9.00417745E-01,
+    9.09156621E-01, 9.17382956E-01, 9.25104082E-01, 9.32328999E-01,
+    9.39068437E-01, 9.45334494E-01, 9.51140642E-01, 9.56501782E-01,
+    9.61433768E-01, 9.65953648E-01, 9.70079303E-01, 9.73829389E-01,
+    9.77223217E-01, 9.80280578E-01, 9.83021557E-01, 9.85466540E-01,
+    9.87635851E-01, 9.89549816E-01, 9.91228402E-01, 9.92691338E-01,
+    9.93957877E-01, 9.95046616E-01, 9.95975435E-01, 9.96761382E-01,
+    9.97420728E-01, 9.97968733E-01, 9.98419642E-01, 9.98786569E-01,
+    9.99081731E-01, 9.99315977E-01, 9.99499321E-01, 9.99640644E-01,
+    9.99747574E-01, 9.99826968E-01, 9.99884665E-01, 9.99925494E-01,
+    9.99953628E-01, 9.99972343E-01, 9.99984324E-01, 9.99991655E-01,
+    9.99995887E-01, 9.99998152E-01, 9.99999285E-01, 9.99999762E-01,
+    9.99999940E-01, 1.00000000E+00, 1.00000000E+00, 1.00000000E+00,
 };
 
-const int8_t ff_dca_channel_reorder_lfe_xch[16][9] = {
-    { 0,  2, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4, -1, -1, -1, -1, -1 },
-    { 0,  1,  3,  4, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4,  5, -1, -1, -1, -1 },
-    { 0,  1,  4,  5,  3, -1, -1, -1, -1 },
-    { 2,  0,  1,  5,  6,  4, -1, -1, -1 },
-    { 3,  4,  0,  1,  6,  7,  5, -1, -1 },
-    { 2,  0,  1,  4,  5,  6,  7, -1, -1 },
-    { 0,  6,  4,  5,  2,  3,  7, -1, -1 },
-    { 4,  2,  5,  0,  1,  7,  8,  6, -1 },
-    { 5,  6,  0,  1,  8,  3,  9,  4,  7 },
-    { 4,  2,  5,  0,  1,  6,  9,  8,  7 },
+const float ff_dca_lfe_step_size_16[101] = {
+    2.1362956633198035E-004, 2.4414807580797754E-004, 2.7466658528397473E-004,
+    2.7466658528397473E-004, 3.0518509475997192E-004, 3.3570360423596911E-004,
+    3.9674062318796350E-004, 4.2725913266396069E-004, 4.5777764213995788E-004,
+    5.1881466109195227E-004, 5.7985168004394665E-004, 6.1037018951994385E-004,
+    6.7140720847193823E-004, 7.6296273689992981E-004, 8.2399975585192419E-004,
+    9.1555528427991577E-004, 1.0071108127079073E-003, 1.0986663411358989E-003,
+    1.2207403790398877E-003, 1.3428144169438765E-003, 1.4648884548478652E-003,
+    1.6174810022278512E-003, 1.7700735496078372E-003, 1.9531846064638203E-003,
+    2.1362956633198035E-003, 2.3499252296517838E-003, 2.5940733054597613E-003,
+    2.8687398907437361E-003, 3.1434064760277108E-003, 3.4485915707876827E-003,
+    3.7842951750236518E-003, 4.1810357982116153E-003, 4.6082949308755760E-003,
+    5.0660725730155339E-003, 5.5543687246314890E-003, 6.1037018951994385E-003,
+    6.7445905941953795E-003, 7.4159978026673177E-003, 8.1484420300912512E-003,
+    8.9419232764671782E-003, 9.8574785607470940E-003, 1.0834070863979004E-002,
+    1.1932737205114903E-002, 1.3122959074678793E-002, 1.4435254982146673E-002,
+    1.5869624927518540E-002, 1.7456587420270394E-002, 1.9196142460402233E-002,
+    2.1118808557390057E-002, 2.3224585711233862E-002, 2.5543992431409649E-002,
+    2.8107547227393413E-002, 3.0915250099185155E-002, 3.4028138065736867E-002,
+    3.7415692617572556E-002, 4.1169469283120215E-002, 4.5258949552903834E-002,
+    4.9806207464827418E-002, 5.4780724509414958E-002, 6.0274056215094456E-002,
+    6.6286202581865905E-002, 7.2908719138157288E-002, 8.0202642902920618E-002,
+    8.8229010895107887E-002, 9.7048860133671075E-002, 1.0675374614703818E-001,
+    1.1743522446363720E-001, 1.2918485061189611E-001, 1.4209418012024294E-001,
+    1.5628528702658162E-001, 1.7191076387829218E-001, 1.8912320322275461E-001,
+    2.0804467909787286E-001, 2.2882778405102694E-001, 2.5171666615802485E-001,
+    2.7689443647572254E-001, 3.0457472457045198E-001, 3.3503219702749720E-001,
+    3.6854152043214211E-001, 4.0537736136967073E-001, 4.4593646046327096E-001,
+    4.9052400280770286E-001, 5.3956724753563035E-001, 5.9352397228919340E-001,
+    6.5288247322000792E-001, 7.1816156498916595E-001, 7.9000213629566329E-001,
+    8.6898403881954400E-001, 9.5590075380718409E-001, 1.0514847254860074E+000,
+    1.1566209906308176E+000, 1.2722861415448470E+000, 1.3995178075502792E+000,
+    1.5394756920072024E+000, 1.6934110538041323E+000, 1.8627582628864405E+000,
+    2.0490432447279274E+000, 2.2539445173497725E+000, 2.4793237098300120E+000,
+    2.7272865993224893E+000, 3.0000000000000000E+000
 };
 
-const int8_t ff_dca_channel_reorder_nolfe[16][9] = {
-    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3, -1, -1, -1, -1, -1 },
-    { 0,  1,  2,  3, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3,  4, -1, -1, -1, -1 },
-    { 2,  3,  0,  1,  4,  5, -1, -1, -1 },
-    { 2,  0,  1,  3,  4,  5, -1, -1, -1 },
-    { 0,  5,  3,  4,  1,  2, -1, -1, -1 },
-    { 3,  2,  4,  0,  1,  5,  6, -1, -1 },
-    { 4,  5,  0,  1,  6,  2,  7,  3, -1 },
-    { 3,  2,  4,  0,  1,  5,  7,  6, -1 },
+const float ff_dca_lfe_step_size_24[144] = {
+    3.5762791128491298E-006, 3.9339070241340428E-006, 4.4107442391805934E-006,
+    4.7683721504655064E-006, 5.2452093655120570E-006, 5.8412558843202453E-006,
+    6.4373024031284336E-006, 7.0333489219366219E-006, 7.7486047445064479E-006,
+    8.4638605670762738E-006, 9.4175349971693751E-006, 1.0252000123500839E-005,
+    1.1324883857355578E-005, 1.2516976894971954E-005, 1.3709069932588331E-005,
+    1.5139581577727983E-005, 1.6570093222867636E-005, 1.8239023475530564E-005,
+    2.0146372335716766E-005, 2.2053721195902969E-005, 2.4318697967374082E-005,
+    2.6702884042606836E-005, 2.9444698029124504E-005, 3.2305721319403807E-005,
+    3.5643581824729662E-005, 3.9100651633817152E-005, 4.3034558657951193E-005,
+    4.7326093593370149E-005, 5.2094465743835655E-005, 5.7339675109347712E-005,
+    6.3061721689906320E-005, 6.9379814789273121E-005, 7.6293954407448102E-005,
+    8.3923349848192912E-005, 9.2268001111507552E-005, 1.0156632680491529E-004,
+    1.1169911762465449E-004, 1.2290479217824841E-004, 1.3518335046569711E-004,
+    1.4865400179076216E-004, 1.6355516476096688E-004, 1.7988683937631122E-004,
+    1.9788744424431852E-004, 2.1767618866875036E-004, 2.3949149125713007E-004,
+    2.6345256131321922E-004, 2.8979781744454115E-004, 3.1876567825861912E-004,
+    3.5059456236297636E-004, 3.8564209766889782E-004, 4.2426591208766842E-004,
+    4.6670442422681142E-004, 5.1331526199761173E-004, 5.6469447191887759E-004,
+    6.2108047259813216E-004, 6.8318851985794547E-004, 7.5149545091336386E-004,
+    8.2671652158695713E-004, 9.0932856909377204E-004, 1.0002852678639017E-003,
+    1.1003018737199156E-003, 1.2103320610919071E-003, 1.3314487137137310E-003,
+    1.4646055060154803E-003, 1.6109945310347714E-003, 1.7721655097205054E-003,
+    1.9493105351102991E-003, 2.1442177467605765E-003, 2.3586752842277626E-003,
+    2.5945904963720436E-003, 2.8539899413573674E-003, 3.1393770145627278E-003,
+    3.4533743206708813E-003, 3.7987236736683454E-003, 4.1785245154529228E-003,
+    4.5963531251374630E-003, 5.0560242004423382E-003, 5.5617100669992049E-003,
+    6.1178214690472445E-003, 6.7296036159519689E-003, 7.4025401356864135E-003,
+    8.1428299120461841E-003, 8.9571486660419298E-003, 9.8527681652031147E-003,
+    1.0838033060793050E-002, 1.1921884050593860E-002, 1.3114096297513997E-002,
+    1.4425517848195773E-002, 1.5868069633015350E-002, 1.7454864675386508E-002,
+    1.9200327301064409E-002, 2.1120431556753107E-002, 2.3232462791498040E-002,
+    2.5555613703204836E-002, 2.8111222757246822E-002, 3.0922297349250002E-002,
+    3.4014586688826884E-002, 3.7415985753057691E-002, 4.1157608170224208E-002,
+    4.5273428591898514E-002, 4.9800759530157987E-002, 5.4780847404104160E-002,
+    6.0258872539862694E-002, 6.6284783635709721E-002, 7.2913297762071824E-002,
+    8.0204615617348624E-002, 8.8225017574431602E-002, 9.7047578936526643E-002,
+    1.0675228914645780E-001, 1.1742748229831246E-001, 1.2917031397465634E-001,
+    1.4208735729305236E-001, 1.5629603341770570E-001, 1.7192568444319778E-001,
+    1.8911816944100493E-001, 2.0803001022696618E-001, 2.2883310661710579E-001,
+    2.5171640535788598E-001, 2.7688804589367461E-001, 3.0457679087839018E-001,
+    3.3503452957088109E-001, 3.6853794676517804E-001, 4.0539174144169587E-001,
+    4.4593089174400469E-001, 4.9052399283933557E-001, 5.3957635636047796E-001,
+    5.9353406352210802E-001, 6.5288742219059737E-001, 7.1817609288407480E-001,
+    7.8999373793527339E-001, 8.6899314749159184E-001, 9.5589243839889027E-001,
+    1.0514817299225008E+000, 1.1566298194682383E+000, 1.2722928848615747E+000,
+    1.3995221137430804E+000, 1.5394743131964581E+000, 1.6934218041207556E+000,
+    1.8627639845328312E+000, 2.0490403233814627E+000, 2.2539444272451910E+000,
+    2.4793389414952922E+000, 2.7272728356448215E+000, 2.9999998807906962E+000
 };
 
-const int8_t ff_dca_channel_reorder_nolfe_xch[16][9] = {
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3, -1, -1, -1, -1, -1 },
-    { 0,  1,  2,  3, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3,  4, -1, -1, -1, -1 },
-    { 0,  1,  3,  4,  2, -1, -1, -1, -1 },
-    { 2,  0,  1,  4,  5,  3, -1, -1, -1 },
-    { 2,  3,  0,  1,  5,  6,  4, -1, -1 },
-    { 2,  0,  1,  3,  4,  5,  6, -1, -1 },
-    { 0,  5,  3,  4,  1,  2,  6, -1, -1 },
-    { 3,  2,  4,  0,  1,  6,  7,  5, -1 },
-    { 4,  5,  0,  1,  7,  2,  8,  3,  6 },
-    { 3,  2,  4,  0,  1,  5,  8,  7,  6 },
+const float ff_dca_bank_coeff[10] = {
+    0.022810893, 0.41799772, 0.90844810, 0.99973983,
+    0.068974845, 0.34675997, 0.29396889, 0.19642374,
+    0.308658270, 0.038060233
 };
 
-const uint16_t ff_dca_vlc_offs[63] = {
-        0,   512,   640,   768,  1282,  1794,  2436,  3080,  3770,  4454,  5364,
-     5372,  5380,  5388,  5392,  5396,  5412,  5420,  5428,  5460,  5492,  5508,
-     5572,  5604,  5668,  5796,  5860,  5892,  6412,  6668,  6796,  7308,  7564,
-     7820,  8076,  8620,  9132,  9388,  9910, 10166, 10680, 11196, 11726, 12240,
-    12752, 13298, 13810, 14326, 14840, 15500, 16022, 16540, 17158, 17678, 18264,
-    18796, 19352, 19926, 20468, 21472, 22398, 23014, 23622,
+const float ff_dca_lfe_iir[5][4] = {
+    { -0.98618466, 1.9861259, 1.0, -1.9840510 },
+    { -0.98883152, 1.9887193, 1.0, -1.9979848 },
+    { -0.99252087, 1.9923381, 1.0, -1.9990897 },
+    { -0.99591690, 1.9956781, 1.0, -1.9993745 },
+    { -0.99872285, 1.9984550, 1.0, -1.9994639 }
 };
diff --git a/libavcodec/dcadata.h b/libavcodec/dcadata.h
index 0a3139e..5aa85b3 100644
--- a/libavcodec/dcadata.h
+++ b/libavcodec/dcadata.h
@@ -1,20 +1,20 @@
 /*
  * DCA compatible decoder data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,17 +23,31 @@
 
 #include <stdint.h>
 
+#include "dcahuff.h"
+
+#define DCA_ADPCM_COEFFS        4
+#define DCA_ADPCM_VQCODEBOOK_SZ 4096
+
 extern const uint32_t ff_dca_bit_rates[32];
 
 extern const uint8_t ff_dca_channels[16];
 
-extern const uint8_t ff_dca_bits_per_sample[7];
+extern const uint8_t ff_dca_dmix_primary_nch[8];
+
+extern const uint8_t ff_dca_quant_index_sel_nbits[DCA_CODE_BOOKS];
+extern const uint8_t ff_dca_quant_index_group_size[DCA_CODE_BOOKS];
 
-extern const int16_t ff_dca_adpcm_vb[4096][4];
+extern const int16_t ff_dca_adpcm_vb[DCA_ADPCM_VQCODEBOOK_SZ][DCA_ADPCM_COEFFS];
 
 extern const uint32_t ff_dca_scale_factor_quant6[64];
 extern const uint32_t ff_dca_scale_factor_quant7[128];
 
+extern const uint32_t ff_dca_joint_scale_factors[129];
+
+extern const uint32_t ff_dca_scale_factor_adj[4];
+
+extern const uint32_t ff_dca_quant_levels[32];
+
 extern const uint32_t ff_dca_lossy_quant[32];
 
 extern const uint32_t ff_dca_lossless_quant[32];
@@ -45,26 +59,69 @@ extern const float ff_dca_fir_32bands_nonperfect[512];
 
 extern const float ff_dca_lfe_fir_64[256];
 extern const float ff_dca_lfe_fir_128[256];
-extern const float ff_dca_lfe_xll_fir_64[256];
 extern const float ff_dca_fir_64bands[1024];
 
-#define FF_DCA_DMIXTABLE_SIZE      242
-#define FF_DCA_INV_DMIXTABLE_SIZE  201
+extern const int32_t ff_dca_fir_32bands_perfect_fixed[512];
+extern const int32_t ff_dca_fir_32bands_nonperfect_fixed[512];
+extern const int32_t ff_dca_lfe_fir_64_fixed[256];
+extern const int32_t ff_dca_fir_64bands_fixed[1024];
+
+#define FF_DCA_DMIXTABLE_SIZE       242U
+#define FF_DCA_INV_DMIXTABLE_SIZE   201U
+#define FF_DCA_DMIXTABLE_OFFSET     (FF_DCA_DMIXTABLE_SIZE - FF_DCA_INV_DMIXTABLE_SIZE)
 
 extern const uint16_t ff_dca_dmixtable[FF_DCA_DMIXTABLE_SIZE];
 extern const uint32_t ff_dca_inv_dmixtable[FF_DCA_INV_DMIXTABLE_SIZE];
 
-extern const float ff_dca_default_coeffs[10][6][2];
+extern const uint16_t ff_dca_xll_refl_coeff[128];
+
+extern const int32_t ff_dca_xll_band_coeff[20];
+
+extern const uint16_t ff_dca_avg_g3_freqs[3];
+
+extern const uint16_t ff_dca_fst_amp[44];
+
+extern const uint8_t ff_dca_freq_to_sb[32];
+
+extern const int8_t ff_dca_ph0_shift[8];
+
+extern const uint8_t ff_dca_grid_1_to_scf[11];
+extern const uint8_t ff_dca_grid_2_to_scf[3];
+
+extern const uint8_t ff_dca_scf_to_grid_1[32];
+extern const uint8_t ff_dca_scf_to_grid_2[32];
+
+extern const uint8_t ff_dca_grid_1_weights[12][32];
+
+extern const uint8_t ff_dca_sb_reorder[8][8];
+
+extern const int8_t ff_dca_lfe_delta_index_16[8];
+extern const int8_t ff_dca_lfe_delta_index_24[32];
+
+extern const uint16_t ff_dca_rsd_pack_5_in_8[256];
+extern const uint8_t ff_dca_rsd_pack_3_in_7[128][3];
+
+extern const float ff_dca_rsd_level_2a[2];
+extern const float ff_dca_rsd_level_2b[2];
+extern const float ff_dca_rsd_level_3[3];
+extern const float ff_dca_rsd_level_5[5];
+extern const float ff_dca_rsd_level_8[8];
+extern const float ff_dca_rsd_level_16[16];
+
+extern const float ff_dca_synth_env[32];
+
+extern const float ff_dca_corr_cf[32][11];
+
+extern const float ff_dca_quant_amp[57];
 
-extern const int32_t ff_dca_sampling_freqs[16];
+extern const float ff_dca_st_coeff[34];
 
-extern const int8_t ff_dca_lfe_index[16];
+extern const float ff_dca_long_window[128];
 
-extern const int8_t ff_dca_channel_reorder_lfe[16][9];
-extern const int8_t ff_dca_channel_reorder_lfe_xch[16][9];
-extern const int8_t ff_dca_channel_reorder_nolfe[16][9];
-extern const int8_t ff_dca_channel_reorder_nolfe_xch[16][9];
+extern const float ff_dca_lfe_step_size_16[101];
+extern const float ff_dca_lfe_step_size_24[144];
 
-extern const uint16_t ff_dca_vlc_offs[63];
+extern const float ff_dca_bank_coeff[10];
+extern const float ff_dca_lfe_iir[5][4];
 
 #endif /* AVCODEC_DCADATA_H */
diff --git a/libavcodec/dcadct.c b/libavcodec/dcadct.c
new file mode 100644
index 0000000..1082aa8
--- /dev/null
+++ b/libavcodec/dcadct.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+
+#include "dcadct.h"
+#include "dcamath.h"
+
+static void sum_a(const int *input, int *output, int len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        output[i] = input[2 * i] + input[2 * i + 1];
+}
+
+static void sum_b(const int *input, int *output, int len)
+{
+    int i;
+
+    output[0] = input[0];
+    for (i = 1; i < len; i++)
+        output[i] = input[2 * i] + input[2 * i - 1];
+}
+
+static void sum_c(const int *input, int *output, int len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        output[i] = input[2 * i];
+}
+
+static void sum_d(const int *input, int *output, int len)
+{
+    int i;
+
+    output[0] = input[1];
+    for (i = 1; i < len; i++)
+        output[i] = input[2 * i - 1] + input[2 * i + 1];
+}
+
+static void dct_a(const int *input, int *output)
+{
+    static const int cos_mod[8][8] = {
+         { 8348215,  8027397,  7398092,  6484482,  5321677,  3954362,  2435084,   822227 },
+         { 8027397,  5321677,   822227, -3954362, -7398092, -8348215, -6484482, -2435084 },
+         { 7398092,   822227, -6484482, -8027397, -2435084,  5321677,  8348215,  3954362 },
+         { 6484482, -3954362, -8027397,   822227,  8348215,  2435084, -7398092, -5321677 },
+         { 5321677, -7398092, -2435084,  8348215,  -822227, -8027397,  3954362,  6484482 },
+         { 3954362, -8348215,  5321677,  2435084, -8027397,  6484482,   822227, -7398092 },
+         { 2435084, -6484482,  8348215, -7398092,  3954362,   822227, -5321677,  8027397 },
+         {  822227, -2435084,  3954362, -5321677,  6484482, -7398092,  8027397, -8348215 }
+    };
+
+    int i, j;
+
+    for (i = 0; i < 8; i++) {
+        int64_t res = 0;
+        for (j = 0; j < 8; j++)
+            res += (int64_t)cos_mod[i][j] * input[j];
+        output[i] = norm23(res);
+    }
+}
+
+static void dct_b(const int *input, int *output)
+{
+    static const int cos_mod[8][7] = {
+        {  8227423,  7750063,  6974873,  5931642,  4660461,  3210181,  1636536 },
+        {  6974873,  3210181, -1636536, -5931642, -8227423, -7750063, -4660461 },
+        {  4660461, -3210181, -8227423, -5931642,  1636536,  7750063,  6974873 },
+        {  1636536, -7750063, -4660461,  5931642,  6974873, -3210181, -8227423 },
+        { -1636536, -7750063,  4660461,  5931642, -6974873, -3210181,  8227423 },
+        { -4660461, -3210181,  8227423, -5931642, -1636536,  7750063, -6974873 },
+        { -6974873,  3210181,  1636536, -5931642,  8227423, -7750063,  4660461 },
+        { -8227423,  7750063, -6974873,  5931642, -4660461,  3210181, -1636536 }
+    };
+
+    int i, j;
+
+    for (i = 0; i < 8; i++) {
+        int64_t res = input[0] * (INT64_C(1) << 23);
+        for (j = 0; j < 7; j++)
+            res += (int64_t)cos_mod[i][j] * input[1 + j];
+        output[i] = norm23(res);
+    }
+}
+
+static void mod_a(const int *input, int *output)
+{
+    static const int cos_mod[16] = {
+          4199362,   4240198,   4323885,   4454708,
+          4639772,   4890013,   5221943,   5660703,
+         -6245623,  -7040975,  -8158494,  -9809974,
+        -12450076, -17261920, -28585092, -85479984
+    };
+
+    int i, k;
+
+    for (i = 0; i < 8; i++)
+        output[i] = mul23(cos_mod[i], input[i] + input[8 + i]);
+
+    for (i = 8, k = 7; i < 16; i++, k--)
+        output[i] = mul23(cos_mod[i], input[k] - input[8 + k]);
+}
+
+static void mod_b(int *input, int *output)
+{
+    static const int cos_mod[8] = {
+        4214598,  4383036,  4755871,  5425934,
+        6611520,  8897610, 14448934, 42791536
+    };
+
+    int i, k;
+
+    for (i = 0; i < 8; i++)
+        input[8 + i] = mul23(cos_mod[i], input[8 + i]);
+
+    for (i = 0; i < 8; i++)
+        output[i] = input[i] + input[8 + i];
+
+    for (i = 8, k = 7; i < 16; i++, k--)
+        output[i] = input[k] - input[8 + k];
+}
+
+static void mod_c(const int *input, int *output)
+{
+    static const int cos_mod[32] = {
+         1048892,  1051425,   1056522,   1064244,
+         1074689,  1087987,   1104313,   1123884,
+         1146975,  1173922,   1205139,   1241133,
+         1282529,  1330095,   1384791,   1447815,
+        -1520688, -1605358,  -1704360,  -1821051,
+        -1959964, -2127368,  -2332183,  -2587535,
+        -2913561, -3342802,  -3931480,  -4785806,
+        -6133390, -8566050, -14253820, -42727120
+    };
+
+    int i, k;
+
+    for (i = 0; i < 16; i++)
+        output[i] = mul23(cos_mod[i], input[i] + input[16 + i]);
+
+    for (i = 16, k = 15; i < 32; i++, k--)
+        output[i] = mul23(cos_mod[i], input[k] - input[16 + k]);
+}
+
+static void clp_v(int *input, int len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        input[i] = clip23(input[i]);
+}
+
+static void imdct_half_32(int32_t *output, const int32_t *input)
+{
+    int buf_a[32], buf_b[32];
+    int i, k, mag, shift, round;
+
+    mag = 0;
+    for (i = 0; i < 32; i++)
+        mag += abs(input[i]);
+
+    shift = mag > 0x400000 ? 2 : 0;
+    round = shift > 0 ? 1 << (shift - 1) : 0;
+
+    for (i = 0; i < 32; i++)
+        buf_a[i] = (input[i] + round) >> shift;
+
+    sum_a(buf_a, buf_b +  0, 16);
+    sum_b(buf_a, buf_b + 16, 16);
+    clp_v(buf_b, 32);
+
+    sum_a(buf_b +  0, buf_a +  0, 8);
+    sum_b(buf_b +  0, buf_a +  8, 8);
+    sum_c(buf_b + 16, buf_a + 16, 8);
+    sum_d(buf_b + 16, buf_a + 24, 8);
+    clp_v(buf_a, 32);
+
+    dct_a(buf_a +  0, buf_b +  0);
+    dct_b(buf_a +  8, buf_b +  8);
+    dct_b(buf_a + 16, buf_b + 16);
+    dct_b(buf_a + 24, buf_b + 24);
+    clp_v(buf_b, 32);
+
+    mod_a(buf_b +  0, buf_a +  0);
+    mod_b(buf_b + 16, buf_a + 16);
+    clp_v(buf_a, 32);
+
+    mod_c(buf_a, buf_b);
+
+    for (i = 0; i < 32; i++)
+        buf_b[i] = clip23(buf_b[i] * (1 << shift));
+
+    for (i = 0, k = 31; i < 16; i++, k--) {
+        output[     i] = clip23(buf_b[i] - buf_b[k]);
+        output[16 + i] = clip23(buf_b[i] + buf_b[k]);
+    }
+}
+
+static void mod64_a(const int *input, int *output)
+{
+    static const int cos_mod[32] = {
+          4195568,   4205700,   4226086,    4256977,
+          4298755,   4351949,   4417251,    4495537,
+          4587901,   4695690,   4820557,    4964534,
+          5130115,   5320382,   5539164,    5791261,
+         -6082752,  -6421430,  -6817439,   -7284203,
+         -7839855,  -8509474,  -9328732,  -10350140,
+        -11654242, -13371208, -15725922,  -19143224,
+        -24533560, -34264200, -57015280, -170908480
+    };
+
+    int i, k;
+
+    for (i = 0; i < 16; i++)
+        output[i] = mul23(cos_mod[i], input[i] + input[16 + i]);
+
+    for (i = 16, k = 15; i < 32; i++, k--)
+        output[i] = mul23(cos_mod[i], input[k] - input[16 + k]);
+}
+
+static void mod64_b(int *input, int *output)
+{
+    static const int cos_mod[16] = {
+         4199362,  4240198,  4323885,  4454708,
+         4639772,  4890013,  5221943,  5660703,
+         6245623,  7040975,  8158494,  9809974,
+        12450076, 17261920, 28585092, 85479984
+    };
+
+    int i, k;
+
+    for (i = 0; i < 16; i++)
+        input[16 + i] = mul23(cos_mod[i], input[16 + i]);
+
+    for (i = 0; i < 16; i++)
+        output[i] = input[i] + input[16 + i];
+
+    for (i = 16, k = 15; i < 32; i++, k--)
+        output[i] = input[k] - input[16 + k];
+}
+
+static void mod64_c(const int *input, int *output)
+{
+    static const int cos_mod[64] = {
+          741511,    741958,    742853,    744199,
+          746001,    748262,    750992,    754197,
+          757888,    762077,    766777,    772003,
+          777772,    784105,    791021,    798546,
+          806707,    815532,    825054,    835311,
+          846342,    858193,    870912,    884554,
+          899181,    914860,    931667,    949686,
+          969011,    989747,   1012012,   1035941,
+        -1061684,  -1089412,  -1119320,  -1151629,
+        -1186595,  -1224511,  -1265719,  -1310613,
+        -1359657,  -1413400,  -1472490,  -1537703,
+        -1609974,  -1690442,  -1780506,  -1881904,
+        -1996824,  -2128058,  -2279225,  -2455101,
+        -2662128,  -2909200,  -3208956,  -3579983,
+        -4050785,  -4667404,  -5509372,  -6726913,
+        -8641940, -12091426, -20144284, -60420720
+    };
+
+    int i, k;
+
+    for (i = 0; i < 32; i++)
+        output[i] = mul23(cos_mod[i], input[i] + input[32 + i]);
+
+    for (i = 32, k = 31; i < 64; i++, k--)
+        output[i] = mul23(cos_mod[i], input[k] - input[32 + k]);
+}
+
+static void imdct_half_64(int32_t *output, const int32_t *input)
+{
+    int buf_a[64], buf_b[64];
+    int i, k, mag, shift, round;
+
+    mag = 0;
+    for (i = 0; i < 64; i++)
+        mag += abs(input[i]);
+
+    shift = mag > 0x400000 ? 2 : 0;
+    round = shift > 0 ? 1 << (shift - 1) : 0;
+
+    for (i = 0; i < 64; i++)
+        buf_a[i] = (input[i] + round) >> shift;
+
+    sum_a(buf_a, buf_b +  0, 32);
+    sum_b(buf_a, buf_b + 32, 32);
+    clp_v(buf_b, 64);
+
+    sum_a(buf_b +  0, buf_a +  0, 16);
+    sum_b(buf_b +  0, buf_a + 16, 16);
+    sum_c(buf_b + 32, buf_a + 32, 16);
+    sum_d(buf_b + 32, buf_a + 48, 16);
+    clp_v(buf_a, 64);
+
+    sum_a(buf_a +  0, buf_b +  0, 8);
+    sum_b(buf_a +  0, buf_b +  8, 8);
+    sum_c(buf_a + 16, buf_b + 16, 8);
+    sum_d(buf_a + 16, buf_b + 24, 8);
+    sum_c(buf_a + 32, buf_b + 32, 8);
+    sum_d(buf_a + 32, buf_b + 40, 8);
+    sum_c(buf_a + 48, buf_b + 48, 8);
+    sum_d(buf_a + 48, buf_b + 56, 8);
+    clp_v(buf_b, 64);
+
+    dct_a(buf_b +  0, buf_a +  0);
+    dct_b(buf_b +  8, buf_a +  8);
+    dct_b(buf_b + 16, buf_a + 16);
+    dct_b(buf_b + 24, buf_a + 24);
+    dct_b(buf_b + 32, buf_a + 32);
+    dct_b(buf_b + 40, buf_a + 40);
+    dct_b(buf_b + 48, buf_a + 48);
+    dct_b(buf_b + 56, buf_a + 56);
+    clp_v(buf_a, 64);
+
+    mod_a(buf_a +  0, buf_b +  0);
+    mod_b(buf_a + 16, buf_b + 16);
+    mod_b(buf_a + 32, buf_b + 32);
+    mod_b(buf_a + 48, buf_b + 48);
+    clp_v(buf_b, 64);
+
+    mod64_a(buf_b +  0, buf_a +  0);
+    mod64_b(buf_b + 32, buf_a + 32);
+    clp_v(buf_a, 64);
+
+    mod64_c(buf_a, buf_b);
+
+    for (i = 0; i < 64; i++)
+        buf_b[i] = clip23(buf_b[i] * (1 << shift));
+
+    for (i = 0, k = 63; i < 32; i++, k--) {
+        output[     i] = clip23(buf_b[i] - buf_b[k]);
+        output[32 + i] = clip23(buf_b[i] + buf_b[k]);
+    }
+}
+
+av_cold void ff_dcadct_init(DCADCTContext *c)
+{
+    c->imdct_half[0] = imdct_half_32;
+    c->imdct_half[1] = imdct_half_64;
+}
diff --git a/libavcodec/dcadct.h b/libavcodec/dcadct.h
new file mode 100644
index 0000000..518c9f9
--- /dev/null
+++ b/libavcodec/dcadct.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCADCT_H
+#define AVCODEC_DCADCT_H
+
+#include "libavutil/common.h"
+
+typedef struct DCADCTContext {
+    void (*imdct_half[2])(int32_t *output, const int32_t *input);
+} DCADCTContext;
+
+av_cold void ff_dcadct_init(DCADCTContext *c);
+
+#endif
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 9c1f878..4146a85 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -1,1606 +1,400 @@
 /*
- * DCA compatible decoder
- * Copyright (C) 2004 Gildas Bazin
- * Copyright (C) 2004 Benjamin Zores
- * Copyright (C) 2006 Benjamin Larsson
- * Copyright (C) 2007 Konstantin Shishkov
- * Copyright (C) 2012 Paul B Mahol
- * Copyright (C) 2014 Niels Möller
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <math.h>
-#include <stddef.h>
-#include <stdio.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/channel_layout.h"
-#include "libavutil/common.h"
-#include "libavutil/float_dsp.h"
-#include "libavutil/internal.h"
-#include "libavutil/intreadwrite.h"
-#include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
-#include "libavutil/samplefmt.h"
+#include "libavutil/channel_layout.h"
 
-#include "avcodec.h"
-#include "dca.h"
-#include "dca_syncwords.h"
-#include "dcadata.h"
-#include "dcadsp.h"
+#include "dcadec.h"
 #include "dcahuff.h"
-#include "fft.h"
-#include "fmtconvert.h"
-#include "get_bits.h"
-#include "internal.h"
-#include "mathops.h"
+#include "dca_syncwords.h"
 #include "profiles.h"
-#include "put_bits.h"
-#include "synth_filter.h"
-
-#if ARCH_ARM
-#   include "arm/dca.h"
-#endif
-
-enum DCAMode {
-    DCA_MONO = 0,
-    DCA_CHANNEL,
-    DCA_STEREO,
-    DCA_STEREO_SUMDIFF,
-    DCA_STEREO_TOTAL,
-    DCA_3F,
-    DCA_2F1R,
-    DCA_3F1R,
-    DCA_2F2R,
-    DCA_3F2R,
-    DCA_4F2R
-};
-
-/* -1 are reserved or unknown */
-static const int dca_ext_audio_descr_mask[] = {
-    DCA_EXT_XCH,
-    -1,
-    DCA_EXT_X96,
-    DCA_EXT_XCH | DCA_EXT_X96,
-    -1,
-    -1,
-    DCA_EXT_XXCH,
-    -1,
-};
-
-/* Tables for mapping dts channel configurations to libavcodec multichannel api.
- * Some compromises have been made for special configurations. Most configurations
- * are never used so complete accuracy is not needed.
- *
- * L = left, R = right, C = center, S = surround, F = front, R = rear, T = total, OV = overhead.
- * S  -> side, when both rear and back are configured move one of them to the side channel
- * OV -> center back
- * All 2 channel configurations -> AV_CH_LAYOUT_STEREO
- */
-static const uint64_t dca_core_channel_layout[] = {
-    AV_CH_FRONT_CENTER,                                                     ///< 1, A
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, A + B (dual mono)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, L + R (stereo)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, (L + R) + (L - R) (sum-difference)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, LT + RT (left and right total)
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER,                               ///< 3, C + L + R
-    AV_CH_LAYOUT_STEREO | AV_CH_BACK_CENTER,                                ///< 3, L + R + S
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_BACK_CENTER,           ///< 4, C + L + R + S
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,               ///< 4, L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_SIDE_LEFT |
-    AV_CH_SIDE_RIGHT,                                                       ///< 5, C + L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER,               ///< 6, CL + CR + L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT |
-    AV_CH_FRONT_CENTER  | AV_CH_BACK_CENTER,                                ///< 6, C + L + R + LR + RR + OV
-
-    AV_CH_FRONT_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_BACK_CENTER   |
-    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 6, CF + CR + LF + RF + LR + RR
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
-    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
-    AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,                                     ///< 7, CL + C + CR + L + R + SL + SR
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
-    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 8, CL + CR + L + R + SL1 + SL2 + SR1 + SR2
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
-    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
-    AV_CH_SIDE_LEFT | AV_CH_BACK_CENTER | AV_CH_SIDE_RIGHT,                 ///< 8, CL + C + CR + L + R + SL + S + SR
-};
-
-#define DCA_DOLBY                  101           /* FIXME */
-
-#define DCA_CHANNEL_BITS             6
-#define DCA_CHANNEL_MASK          0x3F
-
-#define DCA_LFE                   0x80
 
-#define HEADER_SIZE                 14
+#define MIN_PACKET_SIZE     16
+#define MAX_PACKET_SIZE     0x104000
 
-#define DCA_NSYNCAUX        0x9A1105A0
-
-/** Bit allocation */
-typedef struct BitAlloc {
-    int offset;                 ///< code values offset
-    int maxbits[8];             ///< max bits in VLC
-    int wrap;                   ///< wrap for get_vlc2()
-    VLC vlc[8];                 ///< actual codes
-} BitAlloc;
-
-static BitAlloc dca_bitalloc_index;    ///< indexes for samples VLC select
-static BitAlloc dca_tmode;             ///< transition mode VLCs
-static BitAlloc dca_scalefactor;       ///< scalefactor VLCs
-static BitAlloc dca_smpl_bitalloc[11]; ///< samples VLCs
-
-static av_always_inline int get_bitalloc(GetBitContext *gb, BitAlloc *ba,
-                                         int idx)
+int ff_dca_set_channel_layout(AVCodecContext *avctx, int *ch_remap, int dca_mask)
 {
-    return get_vlc2(gb, ba->vlc[idx].table, ba->vlc[idx].bits, ba->wrap) +
-           ba->offset;
-}
-
-static av_cold void dca_init_vlcs(void)
-{
-    static int vlcs_initialized = 0;
-    int i, j, c = 14;
-    static VLC_TYPE dca_table[23622][2];
-
-    if (vlcs_initialized)
-        return;
-
-    dca_bitalloc_index.offset = 1;
-    dca_bitalloc_index.wrap   = 2;
-    for (i = 0; i < 5; i++) {
-        dca_bitalloc_index.vlc[i].table           = &dca_table[ff_dca_vlc_offs[i]];
-        dca_bitalloc_index.vlc[i].table_allocated = ff_dca_vlc_offs[i + 1] - ff_dca_vlc_offs[i];
-        init_vlc(&dca_bitalloc_index.vlc[i], bitalloc_12_vlc_bits[i], 12,
-                 bitalloc_12_bits[i], 1, 1,
-                 bitalloc_12_codes[i], 2, 2, INIT_VLC_USE_NEW_STATIC);
-    }
-    dca_scalefactor.offset = -64;
-    dca_scalefactor.wrap   = 2;
-    for (i = 0; i < 5; i++) {
-        dca_scalefactor.vlc[i].table           = &dca_table[ff_dca_vlc_offs[i + 5]];
-        dca_scalefactor.vlc[i].table_allocated = ff_dca_vlc_offs[i + 6] - ff_dca_vlc_offs[i + 5];
-        init_vlc(&dca_scalefactor.vlc[i], SCALES_VLC_BITS, 129,
-                 scales_bits[i], 1, 1,
-                 scales_codes[i], 2, 2, INIT_VLC_USE_NEW_STATIC);
-    }
-    dca_tmode.offset = 0;
-    dca_tmode.wrap   = 1;
-    for (i = 0; i < 4; i++) {
-        dca_tmode.vlc[i].table           = &dca_table[ff_dca_vlc_offs[i + 10]];
-        dca_tmode.vlc[i].table_allocated = ff_dca_vlc_offs[i + 11] - ff_dca_vlc_offs[i + 10];
-        init_vlc(&dca_tmode.vlc[i], tmode_vlc_bits[i], 4,
-                 tmode_bits[i], 1, 1,
-                 tmode_codes[i], 2, 2, INIT_VLC_USE_NEW_STATIC);
+    static const uint8_t dca2wav_norm[28] = {
+         2,  0, 1, 9, 10,  3,  8,  4,  5,  9, 10, 6, 7, 12,
+        13, 14, 3, 6,  7, 11, 12, 14, 16, 15, 17, 8, 4,  5,
+    };
+
+    static const uint8_t dca2wav_wide[28] = {
+         2,  0, 1, 4,  5,  3,  8,  4,  5,  9, 10, 6, 7, 12,
+        13, 14, 3, 9, 10, 11, 12, 14, 16, 15, 17, 8, 4,  5,
+    };
+
+    int dca_ch, wav_ch, nchannels = 0;
+
+    if (avctx->request_channel_layout & AV_CH_LAYOUT_NATIVE) {
+        for (dca_ch = 0; dca_ch < DCA_SPEAKER_COUNT; dca_ch++)
+            if (dca_mask & (1U << dca_ch))
+                ch_remap[nchannels++] = dca_ch;
+        avctx->channel_layout = dca_mask;
+    } else {
+        int wav_mask = 0;
+        int wav_map[18];
+        const uint8_t *dca2wav;
+        if (dca_mask == DCA_SPEAKER_LAYOUT_7POINT0_WIDE ||
+            dca_mask == DCA_SPEAKER_LAYOUT_7POINT1_WIDE)
+            dca2wav = dca2wav_wide;
+        else
+            dca2wav = dca2wav_norm;
+        for (dca_ch = 0; dca_ch < 28; dca_ch++) {
+            if (dca_mask & (1 << dca_ch)) {
+                wav_ch = dca2wav[dca_ch];
+                if (!(wav_mask & (1 << wav_ch))) {
+                    wav_map[wav_ch] = dca_ch;
+                    wav_mask |= 1 << wav_ch;
+                }
+            }
+        }
+        for (wav_ch = 0; wav_ch < 18; wav_ch++)
+            if (wav_mask & (1 << wav_ch))
+                ch_remap[nchannels++] = wav_map[wav_ch];
+        avctx->channel_layout = wav_mask;
     }
 
-    for (i = 0; i < 10; i++)
-        for (j = 0; j < 7; j++) {
-            if (!bitalloc_codes[i][j])
-                break;
-            dca_smpl_bitalloc[i + 1].offset                 = bitalloc_offsets[i];
-            dca_smpl_bitalloc[i + 1].wrap                   = 1 + (j > 4);
-            dca_smpl_bitalloc[i + 1].vlc[j].table           = &dca_table[ff_dca_vlc_offs[c]];
-            dca_smpl_bitalloc[i + 1].vlc[j].table_allocated = ff_dca_vlc_offs[c + 1] - ff_dca_vlc_offs[c];
-
-            init_vlc(&dca_smpl_bitalloc[i + 1].vlc[j], bitalloc_maxbits[i][j],
-                     bitalloc_sizes[i],
-                     bitalloc_bits[i][j], 1, 1,
-                     bitalloc_codes[i][j], 2, 2, INIT_VLC_USE_NEW_STATIC);
-            c++;
-        }
-    vlcs_initialized = 1;
+    avctx->channels = nchannels;
+    return nchannels;
 }
 
-static inline void get_array(GetBitContext *gb, int *dst, int len, int bits)
+void ff_dca_downmix_to_stereo_fixed(DCADSPContext *dcadsp, int32_t **samples,
+                                    int *coeff_l, int nsamples, int ch_mask)
 {
-    while (len--)
-        *dst++ = get_bits(gb, bits);
-}
+    int pos, spkr, max_spkr = av_log2(ch_mask);
+    int *coeff_r = coeff_l + av_popcount(ch_mask);
 
-static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
-{
-    int i, j;
-    static const uint8_t adj_table[4] = { 16, 18, 20, 23 };
-    static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
-    static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
+    av_assert0(DCA_HAS_STEREO(ch_mask));
 
-    s->audio_header.total_channels = get_bits(&s->gb, 3) + 1 + base_channel;
-    s->audio_header.prim_channels  = s->audio_header.total_channels;
+    // Scale left and right channels
+    pos = (ch_mask & DCA_SPEAKER_MASK_C);
+    dcadsp->dmix_scale(samples[DCA_SPEAKER_L], coeff_l[pos    ], nsamples);
+    dcadsp->dmix_scale(samples[DCA_SPEAKER_R], coeff_r[pos + 1], nsamples);
 
-    if (s->audio_header.prim_channels > DCA_PRIM_CHANNELS_MAX)
-        s->audio_header.prim_channels = DCA_PRIM_CHANNELS_MAX;
+    // Downmix remaining channels
+    for (spkr = 0; spkr <= max_spkr; spkr++) {
+        if (!(ch_mask & (1U << spkr)))
+            continue;
 
-    for (i = base_channel; i < s->audio_header.prim_channels; i++) {
-        s->audio_header.subband_activity[i] = get_bits(&s->gb, 5) + 2;
-        if (s->audio_header.subband_activity[i] > DCA_SUBBANDS)
-            s->audio_header.subband_activity[i] = DCA_SUBBANDS;
-    }
-    for (i = base_channel; i < s->audio_header.prim_channels; i++) {
-        s->audio_header.vq_start_subband[i] = get_bits(&s->gb, 5) + 1;
-        if (s->audio_header.vq_start_subband[i] > DCA_SUBBANDS)
-            s->audio_header.vq_start_subband[i] = DCA_SUBBANDS;
-    }
-    get_array(&s->gb, s->audio_header.joint_intensity + base_channel,
-              s->audio_header.prim_channels - base_channel, 3);
-    get_array(&s->gb, s->audio_header.transient_huffman + base_channel,
-              s->audio_header.prim_channels - base_channel, 2);
-    get_array(&s->gb, s->audio_header.scalefactor_huffman + base_channel,
-              s->audio_header.prim_channels - base_channel, 3);
-    get_array(&s->gb, s->audio_header.bitalloc_huffman + base_channel,
-              s->audio_header.prim_channels - base_channel, 3);
-
-    /* Get codebooks quantization indexes */
-    if (!base_channel)
-        memset(s->audio_header.quant_index_huffman, 0, sizeof(s->audio_header.quant_index_huffman));
-    for (j = 1; j < 11; j++)
-        for (i = base_channel; i < s->audio_header.prim_channels; i++)
-            s->audio_header.quant_index_huffman[i][j] = get_bits(&s->gb, bitlen[j]);
-
-    /* Get scale factor adjustment */
-    for (j = 0; j < 11; j++)
-        for (i = base_channel; i < s->audio_header.prim_channels; i++)
-            s->audio_header.scalefactor_adj[i][j] = 16;
-
-    for (j = 1; j < 11; j++)
-        for (i = base_channel; i < s->audio_header.prim_channels; i++)
-            if (s->audio_header.quant_index_huffman[i][j] < thr[j])
-                s->audio_header.scalefactor_adj[i][j] = adj_table[get_bits(&s->gb, 2)];
-
-    if (s->crc_present) {
-        /* Audio header CRC check */
-        get_bits(&s->gb, 16);
-    }
+        if (*coeff_l && spkr != DCA_SPEAKER_L)
+            dcadsp->dmix_add(samples[DCA_SPEAKER_L], samples[spkr],
+                             *coeff_l, nsamples);
 
-    s->current_subframe    = 0;
-    s->current_subsubframe = 0;
+        if (*coeff_r && spkr != DCA_SPEAKER_R)
+            dcadsp->dmix_add(samples[DCA_SPEAKER_R], samples[spkr],
+                             *coeff_r, nsamples);
 
-    return 0;
+        coeff_l++;
+        coeff_r++;
+    }
 }
 
-static int dca_parse_frame_header(DCAContext *s)
+void ff_dca_downmix_to_stereo_float(AVFloatDSPContext *fdsp, float **samples,
+                                    int *coeff_l, int nsamples, int ch_mask)
 {
-    init_get_bits(&s->gb, s->dca_buffer, s->dca_buffer_size * 8);
-
-    /* Sync code */
-    skip_bits_long(&s->gb, 32);
-
-    /* Frame header */
-    s->frame_type        = get_bits(&s->gb, 1);
-    s->samples_deficit   = get_bits(&s->gb, 5) + 1;
-    s->crc_present       = get_bits(&s->gb, 1);
-    s->sample_blocks     = get_bits(&s->gb, 7) + 1;
-    s->frame_size        = get_bits(&s->gb, 14) + 1;
-    if (s->frame_size < 95)
-        return AVERROR_INVALIDDATA;
-    s->amode             = get_bits(&s->gb, 6);
-    s->sample_rate       = avpriv_dca_sample_rates[get_bits(&s->gb, 4)];
-    if (!s->sample_rate)
-        return AVERROR_INVALIDDATA;
-    s->bit_rate_index    = get_bits(&s->gb, 5);
-    s->bit_rate          = ff_dca_bit_rates[s->bit_rate_index];
-    if (!s->bit_rate)
-        return AVERROR_INVALIDDATA;
+    int pos, spkr, max_spkr = av_log2(ch_mask);
+    int *coeff_r = coeff_l + av_popcount(ch_mask);
+    const float scale = 1.0f / (1 << 15);
 
-    skip_bits1(&s->gb); // always 0 (reserved, cf. ETSI TS 102 114 V1.4.1)
-    s->dynrange          = get_bits(&s->gb, 1);
-    s->timestamp         = get_bits(&s->gb, 1);
-    s->aux_data          = get_bits(&s->gb, 1);
-    s->hdcd              = get_bits(&s->gb, 1);
-    s->ext_descr         = get_bits(&s->gb, 3);
-    s->ext_coding        = get_bits(&s->gb, 1);
-    s->aspf              = get_bits(&s->gb, 1);
-    s->lfe               = get_bits(&s->gb, 2);
-    s->predictor_history = get_bits(&s->gb, 1);
-
-    if (s->lfe > 2) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE value: %d\n", s->lfe);
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(DCA_HAS_STEREO(ch_mask));
 
-    /* TODO: check CRC */
-    if (s->crc_present)
-        s->header_crc    = get_bits(&s->gb, 16);
+    // Scale left and right channels
+    pos = (ch_mask & DCA_SPEAKER_MASK_C);
+    fdsp->vector_fmul_scalar(samples[DCA_SPEAKER_L], samples[DCA_SPEAKER_L],
+                             coeff_l[pos    ] * scale, nsamples);
+    fdsp->vector_fmul_scalar(samples[DCA_SPEAKER_R], samples[DCA_SPEAKER_R],
+                             coeff_r[pos + 1] * scale, nsamples);
 
-    s->multirate_inter   = get_bits(&s->gb, 1);
-    s->version           = get_bits(&s->gb, 4);
-    s->copy_history      = get_bits(&s->gb, 2);
-    s->source_pcm_res    = get_bits(&s->gb, 3);
-    s->front_sum         = get_bits(&s->gb, 1);
-    s->surround_sum      = get_bits(&s->gb, 1);
-    s->dialog_norm       = get_bits(&s->gb, 4);
+    // Downmix remaining channels
+    for (spkr = 0; spkr <= max_spkr; spkr++) {
+        if (!(ch_mask & (1U << spkr)))
+            continue;
 
-    /* FIXME: channels mixing levels */
-    s->output = s->amode;
-    if (s->lfe)
-        s->output |= DCA_LFE;
+        if (*coeff_l && spkr != DCA_SPEAKER_L)
+            fdsp->vector_fmac_scalar(samples[DCA_SPEAKER_L], samples[spkr],
+                                     *coeff_l * scale, nsamples);
 
-    /* Primary audio coding header */
-    s->audio_header.subframes = get_bits(&s->gb, 4) + 1;
+        if (*coeff_r && spkr != DCA_SPEAKER_R)
+            fdsp->vector_fmac_scalar(samples[DCA_SPEAKER_R], samples[spkr],
+                                     *coeff_r * scale, nsamples);
 
-    return dca_parse_audio_coding_header(s, 0);
-}
-
-static inline int get_scale(GetBitContext *gb, int level, int value, int log2range)
-{
-    if (level < 5) {
-        /* huffman encoded */
-        value += get_bitalloc(gb, &dca_scalefactor, level);
-        value  = av_clip(value, 0, (1 << log2range) - 1);
-    } else if (level < 8) {
-        if (level + 1 > log2range) {
-            skip_bits(gb, level + 1 - log2range);
-            value = get_bits(gb, log2range);
-        } else {
-            value = get_bits(gb, level + 1);
-        }
+        coeff_l++;
+        coeff_r++;
     }
-    return value;
 }
 
-static int dca_subframe_header(DCAContext *s, int base_channel, int block_index)
+static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
+                               int *got_frame_ptr, AVPacket *avpkt)
 {
-    /* Primary audio coding side information */
-    int j, k;
-
-    if (get_bits_left(&s->gb) < 0)
+    DCAContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    uint8_t *input = avpkt->data;
+    int input_size = avpkt->size;
+    int i, ret, prev_packet = s->packet;
+    uint32_t mrk;
+
+    if (input_size < MIN_PACKET_SIZE || input_size > MAX_PACKET_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid packet size\n");
         return AVERROR_INVALIDDATA;
-
-    if (!base_channel) {
-        s->subsubframes[s->current_subframe]    = get_bits(&s->gb, 2) + 1;
-        s->partial_samples[s->current_subframe] = get_bits(&s->gb, 3);
-    }
-
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++)
-            s->dca_chan[j].prediction_mode[k] = get_bits(&s->gb, 1);
     }
 
-    /* Get prediction codebook */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++) {
-            if (s->dca_chan[j].prediction_mode[k] > 0) {
-                /* (Prediction coefficient VQ address) */
-                s->dca_chan[j].prediction_vq[k] = get_bits(&s->gb, 12);
-            }
-        }
-    }
+    // Convert input to BE format
+    mrk = AV_RB32(input);
+    if (mrk != DCA_SYNCWORD_CORE_BE && mrk != DCA_SYNCWORD_SUBSTREAM) {
+        av_fast_padded_malloc(&s->buffer, &s->buffer_size, input_size);
+        if (!s->buffer)
+            return AVERROR(ENOMEM);
 
-    /* Bit allocation index */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.vq_start_subband[j]; k++) {
-            if (s->audio_header.bitalloc_huffman[j] == 6)
-                s->dca_chan[j].bitalloc[k] = get_bits(&s->gb, 5);
-            else if (s->audio_header.bitalloc_huffman[j] == 5)
-                s->dca_chan[j].bitalloc[k] = get_bits(&s->gb, 4);
-            else if (s->audio_header.bitalloc_huffman[j] == 7) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "Invalid bit allocation index\n");
-                return AVERROR_INVALIDDATA;
-            } else {
-                s->dca_chan[j].bitalloc[k] =
-                    get_bitalloc(&s->gb, &dca_bitalloc_index, s->audio_header.bitalloc_huffman[j]);
-            }
+        for (i = 0, ret = AVERROR_INVALIDDATA; i < input_size - MIN_PACKET_SIZE + 1 && ret < 0; i++)
+            ret = avpriv_dca_convert_bitstream(input + i, input_size - i, s->buffer, s->buffer_size);
 
-            if (s->dca_chan[j].bitalloc[k] > 26) {
-                ff_dlog(s->avctx, "bitalloc index [%i][%i] too big (%i)\n",
-                        j, k, s->dca_chan[j].bitalloc[k]);
-                return AVERROR_INVALIDDATA;
-            }
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Not a valid DCA frame\n");
+            return ret;
         }
-    }
 
-    /* Transition mode */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++) {
-            s->dca_chan[j].transition_mode[k] = 0;
-            if (s->subsubframes[s->current_subframe] > 1 &&
-                k < s->audio_header.vq_start_subband[j] && s->dca_chan[j].bitalloc[k] > 0) {
-                s->dca_chan[j].transition_mode[k] =
-                    get_bitalloc(&s->gb, &dca_tmode, s->audio_header.transient_huffman[j]);
-            }
-        }
+        input      = s->buffer;
+        input_size = ret;
     }
 
-    if (get_bits_left(&s->gb) < 0)
-        return AVERROR_INVALIDDATA;
-
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        const uint32_t *scale_table;
-        int scale_sum, log_size;
-
-        memset(s->dca_chan[j].scale_factor, 0,
-               s->audio_header.subband_activity[j] * sizeof(s->dca_chan[j].scale_factor[0][0]) * 2);
+    s->packet = 0;
 
-        if (s->audio_header.scalefactor_huffman[j] == 6) {
-            scale_table = ff_dca_scale_factor_quant7;
-            log_size    = 7;
-        } else {
-            scale_table = ff_dca_scale_factor_quant6;
-            log_size    = 6;
-        }
+    // Parse backward compatible core sub-stream
+    if (AV_RB32(input) == DCA_SYNCWORD_CORE_BE) {
+        int frame_size;
 
-        /* When huffman coded, only the difference is encoded */
-        scale_sum = 0;
+        if ((ret = ff_dca_core_parse(&s->core, input, input_size)) < 0)
+            return ret;
 
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++) {
-            if (k >= s->audio_header.vq_start_subband[j] || s->dca_chan[j].bitalloc[k] > 0) {
-                scale_sum = get_scale(&s->gb, s->audio_header.scalefactor_huffman[j], scale_sum, log_size);
-                s->dca_chan[j].scale_factor[k][0] = scale_table[scale_sum];
-            }
+        s->packet |= DCA_PACKET_CORE;
 
-            if (k < s->audio_header.vq_start_subband[j] && s->dca_chan[j].transition_mode[k]) {
-                /* Get second scale factor */
-                scale_sum = get_scale(&s->gb, s->audio_header.scalefactor_huffman[j], scale_sum, log_size);
-                s->dca_chan[j].scale_factor[k][1] = scale_table[scale_sum];
-            }
+        // EXXS data must be aligned on 4-byte boundary
+        frame_size = FFALIGN(s->core.frame_size, 4);
+        if (input_size - 4 > frame_size) {
+            input      += frame_size;
+            input_size -= frame_size;
         }
     }
 
-    /* Joint subband scale factor codebook select */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        /* Transmitted only if joint subband coding enabled */
-        if (s->audio_header.joint_intensity[j] > 0)
-            s->dca_chan[j].joint_huff = get_bits(&s->gb, 3);
-    }
+    if (!s->core_only) {
+        DCAExssAsset *asset = NULL;
 
-    if (get_bits_left(&s->gb) < 0)
-        return AVERROR_INVALIDDATA;
-
-    /* Scale factors for joint subband coding */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        int source_channel;
-
-        /* Transmitted only if joint subband coding enabled */
-        if (s->audio_header.joint_intensity[j] > 0) {
-            int scale = 0;
-            source_channel = s->audio_header.joint_intensity[j] - 1;
-
-            /* When huffman coded, only the difference is encoded
-             * (is this valid as well for joint scales ???) */
-
-            for (k = s->audio_header.subband_activity[j];
-                 k < s->audio_header.subband_activity[source_channel]; k++) {
-                scale = get_scale(&s->gb, s->dca_chan[j].joint_huff, 64 /* bias */, 7);
-                s->dca_chan[j].joint_scale_factor[k] = scale;    /*joint_scale_table[scale]; */
-            }
-
-            if (!(s->debug_flag & 0x02)) {
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "Joint stereo coding not supported\n");
-                s->debug_flag |= 0x02;
+        // Parse extension sub-stream (EXSS)
+        if (AV_RB32(input) == DCA_SYNCWORD_SUBSTREAM) {
+            if ((ret = ff_dca_exss_parse(&s->exss, input, input_size)) < 0) {
+                if (avctx->err_recognition & AV_EF_EXPLODE)
+                    return ret;
+            } else {
+                s->packet |= DCA_PACKET_EXSS;
+                asset = &s->exss.assets[0];
             }
         }
-    }
-
-    /* Dynamic range coefficient */
-    if (!base_channel && s->dynrange)
-        s->dynrange_coef = get_bits(&s->gb, 8);
-
-    /* Side information CRC check word */
-    if (s->crc_present) {
-        get_bits(&s->gb, 16);
-    }
-
-    /*
-     * Primary audio data arrays
-     */
-
-    /* VQ encoded high frequency subbands */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++)
-        for (k = s->audio_header.vq_start_subband[j]; k < s->audio_header.subband_activity[j]; k++)
-            /* 1 vector -> 32 samples */
-            s->dca_chan[j].high_freq_vq[k] = get_bits(&s->gb, 10);
-
-    /* Low frequency effect data */
-    if (!base_channel && s->lfe) {
-        /* LFE samples */
-        int lfe_samples    = 2 * s->lfe * (4 + block_index);
-        int lfe_end_sample = 2 * s->lfe * (4 + block_index + s->subsubframes[s->current_subframe]);
-        float lfe_scale;
-
-        for (j = lfe_samples; j < lfe_end_sample; j++) {
-            /* Signed 8 bits int */
-            s->lfe_data[j] = get_sbits(&s->gb, 8);
-        }
-
-        /* Scale factor index */
-        skip_bits(&s->gb, 1);
-        s->lfe_scale_factor = ff_dca_scale_factor_quant7[get_bits(&s->gb, 7)];
-
-        /* Quantization step size * scale factor */
-        lfe_scale = 0.035 * s->lfe_scale_factor;
-
-        for (j = lfe_samples; j < lfe_end_sample; j++)
-            s->lfe_data[j] *= lfe_scale;
-    }
-
-    return 0;
-}
-
-static void qmf_32_subbands(DCAContext *s, int chans,
-                            float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], float *samples_out,
-                            float scale)
-{
-    const float *prCoeff;
-
-    int sb_act = s->audio_header.subband_activity[chans];
-
-    scale *= sqrt(1 / 8.0);
-
-    /* Select filter */
-    if (!s->multirate_inter)    /* Non-perfect reconstruction */
-        prCoeff = ff_dca_fir_32bands_nonperfect;
-    else                        /* Perfect reconstruction */
-        prCoeff = ff_dca_fir_32bands_perfect;
-
-    s->dcadsp.qmf_32_subbands(samples_in, sb_act, &s->synth, &s->imdct,
-                              s->dca_chan[chans].subband_fir_hist,
-                              &s->dca_chan[chans].hist_index,
-                              s->dca_chan[chans].subband_fir_noidea, prCoeff,
-                              samples_out, s->raXin, scale);
-}
-
-static QMF64_table *qmf64_precompute(void)
-{
-    unsigned i, j;
-    QMF64_table *table = av_malloc(sizeof(*table));
-    if (!table)
-        return NULL;
-
-    for (i = 0; i < 32; i++)
-        for (j = 0; j < 32; j++)
-            table->dct4_coeff[i][j] = cos((2 * i + 1) * (2 * j + 1) * M_PI / 128);
-    for (i = 0; i < 32; i++)
-        for (j = 0; j < 32; j++)
-            table->dct2_coeff[i][j] = cos((2 * i + 1) *      j      * M_PI /  64);
-
-    /* FIXME: Is the factor 0.125 = 1/8 right? */
-    for (i = 0; i < 32; i++)
-        table->rcos[i] =  0.125 / cos((2 * i + 1) * M_PI / 256);
-    for (i = 0; i < 32; i++)
-        table->rsin[i] = -0.125 / sin((2 * i + 1) * M_PI / 256);
-
-    return table;
-}
-
-/* FIXME: Totally unoptimized. Based on the reference code and
- * http://multimedia.cx/mirror/dca-transform.pdf, with guessed tweaks
- * for doubling the size. */
-static void qmf_64_subbands(DCAContext *s, int chans,
-                            float samples_in[DCA_SUBBANDS_X96K][SAMPLES_PER_SUBBAND],
-                            float *samples_out, float scale)
-{
-    float raXin[64];
-    float A[32], B[32];
-    float *raX = s->dca_chan[chans].subband_fir_hist;
-    float *raZ = s->dca_chan[chans].subband_fir_noidea;
-    unsigned i, j, k, subindex;
-
-    for (i = s->audio_header.subband_activity[chans]; i < DCA_SUBBANDS_X96K; i++)
-        raXin[i] = 0.0;
-    for (subindex = 0; subindex < SAMPLES_PER_SUBBAND; subindex++) {
-        for (i = 0; i < s->audio_header.subband_activity[chans]; i++)
-            raXin[i] = samples_in[i][subindex];
-
-        for (k = 0; k < 32; k++) {
-            A[k] = 0.0;
-            for (i = 0; i < 32; i++)
-                A[k] += (raXin[2 * i] + raXin[2 * i + 1]) * s->qmf64_table->dct4_coeff[k][i];
-        }
-        for (k = 0; k < 32; k++) {
-            B[k] = raXin[0] * s->qmf64_table->dct2_coeff[k][0];
-            for (i = 1; i < 32; i++)
-                B[k] += (raXin[2 * i] + raXin[2 * i - 1]) * s->qmf64_table->dct2_coeff[k][i];
-        }
-        for (k = 0; k < 32; k++) {
-            raX[k]      = s->qmf64_table->rcos[k] * (A[k] + B[k]);
-            raX[63 - k] = s->qmf64_table->rsin[k] * (A[k] - B[k]);
-        }
-
-        for (i = 0; i < DCA_SUBBANDS_X96K; i++) {
-            float out = raZ[i];
-            for (j = 0; j < 1024; j += 128)
-                out += ff_dca_fir_64bands[j + i] * (raX[j + i] - raX[j + 63 - i]);
-            *samples_out++ = out * scale;
-        }
-
-        for (i = 0; i < DCA_SUBBANDS_X96K; i++) {
-            float hist = 0.0;
-            for (j = 0; j < 1024; j += 128)
-                hist += ff_dca_fir_64bands[64 + j + i] * (-raX[i + j] - raX[j + 63 - i]);
-
-            raZ[i] = hist;
-        }
-
-        /* FIXME: Make buffer circular, to avoid this move. */
-        memmove(raX + 64, raX, (1024 - 64) * sizeof(*raX));
-    }
-}
-
-static void lfe_interpolation_fir(DCAContext *s, const float *samples_in,
-                                  float *samples_out)
-{
-    /* samples_in: An array holding decimated samples.
-     *   Samples in current subframe starts from samples_in[0],
-     *   while samples_in[-1], samples_in[-2], ..., stores samples
-     *   from last subframe as history.
-     *
-     * samples_out: An array holding interpolated samples
-     */
-
-    int idx;
-    const float *prCoeff;
-    int deciindex;
-
-    /* Select decimation filter */
-    if (s->lfe == 1) {
-        idx     = 1;
-        prCoeff = ff_dca_lfe_fir_128;
-    } else {
-        idx = 0;
-        if (s->exss_ext_mask & DCA_EXT_EXSS_XLL)
-            prCoeff = ff_dca_lfe_xll_fir_64;
-        else
-            prCoeff = ff_dca_lfe_fir_64;
-    }
-    /* Interpolation */
-    for (deciindex = 0; deciindex < 2 * s->lfe; deciindex++) {
-        s->dcadsp.lfe_fir[idx](samples_out, samples_in, prCoeff);
-        samples_in++;
-        samples_out += 2 * 32 * (1 + idx);
-    }
-}
-
-/* downmixing routines */
-#define MIX_REAR1(samples, s1, rs, coef)            \
-    samples[0][i] += samples[s1][i] * coef[rs][0];  \
-    samples[1][i] += samples[s1][i] * coef[rs][1];
-
-#define MIX_REAR2(samples, s1, s2, rs, coef)                                          \
-    samples[0][i] += samples[s1][i] * coef[rs][0] + samples[s2][i] * coef[rs + 1][0]; \
-    samples[1][i] += samples[s1][i] * coef[rs][1] + samples[s2][i] * coef[rs + 1][1];
-
-#define MIX_FRONT3(samples, coef)                                      \
-    t = samples[c][i];                                                 \
-    u = samples[l][i];                                                 \
-    v = samples[r][i];                                                 \
-    samples[0][i] = t * coef[0][0] + u * coef[1][0] + v * coef[2][0];  \
-    samples[1][i] = t * coef[0][1] + u * coef[1][1] + v * coef[2][1];
-
-#define DOWNMIX_TO_STEREO(op1, op2)             \
-    for (i = 0; i < 256; i++) {                 \
-        op1                                     \
-        op2                                     \
-    }
-
-static void dca_downmix(float **samples, int srcfmt, int lfe_present,
-                        float coef[DCA_PRIM_CHANNELS_MAX + 1][2],
-                        const int8_t *channel_mapping)
-{
-    int c, l, r, sl, sr, s;
-    int i;
-    float t, u, v;
-
-    switch (srcfmt) {
-    case DCA_MONO:
-    case DCA_4F2R:
-        av_log(NULL, 0, "Not implemented!\n");
-        break;
-    case DCA_CHANNEL:
-    case DCA_STEREO:
-    case DCA_STEREO_TOTAL:
-    case DCA_STEREO_SUMDIFF:
-        break;
-    case DCA_3F:
-        c = channel_mapping[0];
-        l = channel_mapping[1];
-        r = channel_mapping[2];
-        DOWNMIX_TO_STEREO(MIX_FRONT3(samples, coef), );
-        break;
-    case DCA_2F1R:
-        s = channel_mapping[2];
-        DOWNMIX_TO_STEREO(MIX_REAR1(samples, s, 2, coef), );
-        break;
-    case DCA_3F1R:
-        c = channel_mapping[0];
-        l = channel_mapping[1];
-        r = channel_mapping[2];
-        s = channel_mapping[3];
-        DOWNMIX_TO_STEREO(MIX_FRONT3(samples, coef),
-                          MIX_REAR1(samples, s, 3, coef));
-        break;
-    case DCA_2F2R:
-        sl = channel_mapping[2];
-        sr = channel_mapping[3];
-        DOWNMIX_TO_STEREO(MIX_REAR2(samples, sl, sr, 2, coef), );
-        break;
-    case DCA_3F2R:
-        c  = channel_mapping[0];
-        l  = channel_mapping[1];
-        r  = channel_mapping[2];
-        sl = channel_mapping[3];
-        sr = channel_mapping[4];
-        DOWNMIX_TO_STEREO(MIX_FRONT3(samples, coef),
-                          MIX_REAR2(samples, sl, sr, 3, coef));
-        break;
-    }
-    if (lfe_present) {
-        int lf_buf = ff_dca_lfe_index[srcfmt];
-        int lf_idx =  ff_dca_channels[srcfmt];
-        for (i = 0; i < 256; i++) {
-            samples[0][i] += samples[lf_buf][i] * coef[lf_idx][0];
-            samples[1][i] += samples[lf_buf][i] * coef[lf_idx][1];
-        }
-    }
-}
-
-#ifndef decode_blockcodes
-/* Very compact version of the block code decoder that does not use table
- * look-up but is slightly slower */
-static int decode_blockcode(int code, int levels, int32_t *values)
-{
-    int i;
-    int offset = (levels - 1) >> 1;
 
-    for (i = 0; i < 4; i++) {
-        int div = FASTDIV(code, levels);
-        values[i] = code - offset - div * levels;
-        code      = div;
-    }
-
-    return code;
-}
-
-static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
-{
-    return decode_blockcode(code1, levels, values) |
-           decode_blockcode(code2, levels, values + 4);
-}
-#endif
-
-static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
-static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
-
-static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
-{
-    int k, l;
-    int subsubframe = s->current_subsubframe;
-    const uint32_t *quant_step_table;
-
-    /*
-     * Audio data
-     */
-
-    /* Select quantization step size table */
-    if (s->bit_rate_index == 0x1f)
-        quant_step_table = ff_dca_lossless_quant;
-    else
-        quant_step_table = ff_dca_lossy_quant;
-
-    for (k = base_channel; k < s->audio_header.prim_channels; k++) {
-        int32_t (*subband_samples)[8] = s->dca_chan[k].subband_samples[block_index];
-
-        if (get_bits_left(&s->gb) < 0)
-            return AVERROR_INVALIDDATA;
-
-        for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
-            int m;
-
-            /* Select the mid-tread linear quantizer */
-            int abits = s->dca_chan[k].bitalloc[l];
-
-            uint32_t quant_step_size = quant_step_table[abits];
-
-            /*
-             * Extract bits from the bit stream
-             */
-            if (!abits)
-                memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
-                       sizeof(subband_samples[l][0]));
-            else {
-                uint32_t rscale;
-                /* Deal with transients */
-                int sfi = s->dca_chan[k].transition_mode[l] &&
-                    subsubframe >= s->dca_chan[k].transition_mode[l];
-                /* Determine quantization index code book and its type.
-                   Select quantization index code book */
-                int sel = s->audio_header.quant_index_huffman[k][abits];
-
-                rscale = (s->dca_chan[k].scale_factor[l][sfi] *
-                          s->audio_header.scalefactor_adj[k][sel] + 8) >> 4;
-
-                if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) {
-                    if (abits <= 7) {
-                        /* Block code */
-                        int block_code1, block_code2, size, levels, err;
-
-                        size   = abits_sizes[abits - 1];
-                        levels = abits_levels[abits - 1];
-
-                        block_code1 = get_bits(&s->gb, size);
-                        block_code2 = get_bits(&s->gb, size);
-                        err         = decode_blockcodes(block_code1, block_code2,
-                                                        levels, subband_samples[l]);
-                        if (err) {
-                            av_log(s->avctx, AV_LOG_ERROR,
-                                   "ERROR: block code look-up failed\n");
-                            return AVERROR_INVALIDDATA;
-                        }
-                    } else {
-                        /* no coding */
-                        for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
-                            subband_samples[l][m] = get_sbits(&s->gb, abits - 3);
-                    }
-                } else {
-                    /* Huffman coded */
-                    for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
-                        subband_samples[l][m] = get_bitalloc(&s->gb,
-                                                             &dca_smpl_bitalloc[abits], sel);
-                }
-                s->dcadsp.dequantize(subband_samples[l], quant_step_size, rscale);
+        // Parse XLL component in EXSS
+        if (asset && (asset->extension_mask & DCA_EXSS_XLL)) {
+            if ((ret = ff_dca_xll_parse(&s->xll, input, asset)) < 0) {
+                // Conceal XLL synchronization error
+                if (ret == AVERROR(EAGAIN)
+                    && (prev_packet & DCA_PACKET_XLL)
+                    && (s->packet & DCA_PACKET_CORE))
+                    s->packet |= DCA_PACKET_XLL | DCA_PACKET_RECOVERY;
+                else if (ret == AVERROR(ENOMEM) || (avctx->err_recognition & AV_EF_EXPLODE))
+                    return ret;
+            } else {
+                s->packet |= DCA_PACKET_XLL;
             }
         }
 
-        for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
-            int m;
-            /*
-             * Inverse ADPCM if in prediction mode
-             */
-            if (s->dca_chan[k].prediction_mode[l]) {
-                int n;
-                if (s->predictor_history)
-                    subband_samples[l][0] += (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
-                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
-                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
-                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
-                                              (1 << 12) >> 13;
-                for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
-                    int64_t sum = ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
-                                  (int64_t)subband_samples[l][m - 1];
-                    for (n = 2; n <= 4; n++)
-                        if (m >= n)
-                            sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
-                                   (int64_t)subband_samples[l][m - n];
-                        else if (s->predictor_history)
-                            sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
-                                   (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
-                    subband_samples[l][m] += (int32_t)(sum + (1 << 12) >> 13);
-                }
-            }
-
-        }
-        /* Backup predictor history for adpcm */
-        for (l = 0; l < DCA_SUBBANDS; l++)
-            AV_COPY128(s->dca_chan[k].subband_samples_hist[l], &subband_samples[l][4]);
-
-
-        /*
-         * Decode VQ encoded high frequencies
-         */
-        if (s->audio_header.subband_activity[k] > s->audio_header.vq_start_subband[k]) {
-            if (!s->debug_flag & 0x01) {
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "Stream with high frequencies VQ coding\n");
-                s->debug_flag |= 0x01;
+        // Parse LBR component in EXSS
+        if (asset && (asset->extension_mask & DCA_EXSS_LBR)) {
+            if ((ret = ff_dca_lbr_parse(&s->lbr, input, asset)) < 0) {
+                if (ret == AVERROR(ENOMEM) || (avctx->err_recognition & AV_EF_EXPLODE))
+                    return ret;
+            } else {
+                s->packet |= DCA_PACKET_LBR;
             }
-
-            s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
-                                ff_dca_high_freq_vq,
-                                subsubframe * SAMPLES_PER_SUBBAND,
-                                s->dca_chan[k].scale_factor,
-                                s->audio_header.vq_start_subband[k],
-                                s->audio_header.subband_activity[k]);
         }
-    }
 
-    /* Check for DSYNC after subsubframe */
-    if (s->aspf || subsubframe == s->subsubframes[s->current_subframe] - 1) {
-        if (get_bits(&s->gb, 16) != 0xFFFF) {
-            av_log(s->avctx, AV_LOG_ERROR, "Didn't get subframe DSYNC\n");
-            return AVERROR_INVALIDDATA;
-        }
+        // Parse core extensions in EXSS or backward compatible core sub-stream
+        if ((s->packet & DCA_PACKET_CORE)
+            && (ret = ff_dca_core_parse_exss(&s->core, input, asset)) < 0)
+            return ret;
     }
 
-    return 0;
-}
-
-static int dca_filter_channels(DCAContext *s, int block_index, int upsample, int downmix)
-{
-    int k;
-
-    if (upsample) {
-        LOCAL_ALIGNED(32, float, samples, [DCA_SUBBANDS_X96K], [SAMPLES_PER_SUBBAND]);
+    // Filter the frame
+    if (s->packet & DCA_PACKET_LBR) {
+        if ((ret = ff_dca_lbr_filter_frame(&s->lbr, frame)) < 0)
+            return ret;
+    } else if (s->packet & DCA_PACKET_XLL) {
+        if (s->packet & DCA_PACKET_CORE) {
+            int x96_synth = -1;
+
+            // Enable X96 synthesis if needed
+            if (s->xll.chset[0].freq == 96000 && s->core.sample_rate == 48000)
+                x96_synth = 1;
+
+            if ((ret = ff_dca_core_filter_fixed(&s->core, x96_synth)) < 0)
+                return ret;
+
+            // Force lossy downmixed output on the first core frame filtered.
+            // This prevents audible clicks when seeking and is consistent with
+            // what reference decoder does when there are multiple channel sets.
+            if (!(prev_packet & DCA_PACKET_RESIDUAL) && s->xll.nreschsets > 0
+                && s->xll.nchsets > 1) {
+                av_log(avctx, AV_LOG_VERBOSE, "Forcing XLL recovery mode\n");
+                s->packet |= DCA_PACKET_RECOVERY;
+            }
 
-        if (!s->qmf64_table) {
-            s->qmf64_table = qmf64_precompute();
-            if (!s->qmf64_table)
-                return AVERROR(ENOMEM);
+            // Set 'residual ok' flag for the next frame
+            s->packet |= DCA_PACKET_RESIDUAL;
         }
 
-        /* 64 subbands QMF */
-        for (k = 0; k < s->audio_header.prim_channels; k++) {
-            int channel = s->channel_order_tab[k];
-            int32_t (*subband_samples)[SAMPLES_PER_SUBBAND] =
-                     s->dca_chan[k].subband_samples[block_index];
-
-            s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
-                                       DCA_SUBBANDS_X96K * SAMPLES_PER_SUBBAND);
-
-            if (channel >= 0)
-                qmf_64_subbands(s, k, samples,
-                                s->samples_chanptr[channel],
-                                /* Upsampling needs a factor 2 here. */
-                                M_SQRT2 / 32768.0);
+        if ((ret = ff_dca_xll_filter_frame(&s->xll, frame)) < 0) {
+            // Fall back to core unless hard error
+            if (!(s->packet & DCA_PACKET_CORE))
+                return ret;
+            if (ret != AVERROR_INVALIDDATA || (avctx->err_recognition & AV_EF_EXPLODE))
+                return ret;
+            if ((ret = ff_dca_core_filter_frame(&s->core, frame)) < 0)
+                return ret;
         }
+    } else if (s->packet & DCA_PACKET_CORE) {
+        if ((ret = ff_dca_core_filter_frame(&s->core, frame)) < 0)
+            return ret;
+        if (s->core.filter_mode & DCA_FILTER_MODE_FIXED)
+            s->packet |= DCA_PACKET_RESIDUAL;
     } else {
-        /* 32 subbands QMF */
-        LOCAL_ALIGNED(32, float, samples, [DCA_SUBBANDS], [SAMPLES_PER_SUBBAND]);
-
-        for (k = 0; k < s->audio_header.prim_channels; k++) {
-            int channel = s->channel_order_tab[k];
-            int32_t (*subband_samples)[SAMPLES_PER_SUBBAND] =
-                     s->dca_chan[k].subband_samples[block_index];
-
-            s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
-                                       DCA_SUBBANDS * SAMPLES_PER_SUBBAND);
-
-            if (channel >= 0)
-                qmf_32_subbands(s, k, samples,
-                                s->samples_chanptr[channel],
-                                M_SQRT1_2 / 32768.0);
-        }
-    }
-
-    /* Generate LFE samples for this subsubframe FIXME!!! */
-    if (s->lfe) {
-        float *samples = s->samples_chanptr[ff_dca_lfe_index[s->amode]];
-        lfe_interpolation_fir(s,
-                              s->lfe_data + 2 * s->lfe * (block_index + 4),
-                              samples);
-        if (upsample) {
-            unsigned i;
-            /* Should apply the filter in Table 6-11 when upsampling. For
-             * now, just duplicate. */
-            for (i = 511; i > 0; i--) {
-                samples[2 * i]     =
-                samples[2 * i + 1] = samples[i];
-            }
-            samples[1] = samples[0];
-        }
-    }
-
-    /* FIXME: This downmixing is probably broken with upsample.
-     * Probably totally broken also with XLL in general. */
-    /* Downmixing to Stereo */
-    if (downmix) {
-        dca_downmix(s->samples_chanptr, s->amode, !!s->lfe, s->downmix_coef,
-                    s->channel_order_tab);
-    }
-
-    return 0;
-}
-
-static int dca_subframe_footer(DCAContext *s, int base_channel)
-{
-    int in, out, aux_data_count, aux_data_end, reserved;
-    uint32_t nsyncaux;
-
-    /*
-     * Unpack optional information
-     */
-
-    /* presumably optional information only appears in the core? */
-    if (!base_channel) {
-        if (s->timestamp)
-            skip_bits_long(&s->gb, 32);
-
-        if (s->aux_data) {
-            aux_data_count = get_bits(&s->gb, 6);
-
-            // align (32-bit)
-            skip_bits_long(&s->gb, (-get_bits_count(&s->gb)) & 31);
-
-            aux_data_end = 8 * aux_data_count + get_bits_count(&s->gb);
-
-            if ((nsyncaux = get_bits_long(&s->gb, 32)) != DCA_NSYNCAUX) {
-                av_log(s->avctx, AV_LOG_ERROR, "nSYNCAUX mismatch %#"PRIx32"\n",
-                       nsyncaux);
-                return AVERROR_INVALIDDATA;
-            }
-
-            if (get_bits1(&s->gb)) { // bAUXTimeStampFlag
-                avpriv_request_sample(s->avctx,
-                                      "Auxiliary Decode Time Stamp Flag");
-                // align (4-bit)
-                skip_bits(&s->gb, (-get_bits_count(&s->gb)) & 4);
-                // 44 bits: nMSByte (8), nMarker (4), nLSByte (28), nMarker (4)
-                skip_bits_long(&s->gb, 44);
-            }
-
-            if ((s->core_downmix = get_bits1(&s->gb))) {
-                int am = get_bits(&s->gb, 3);
-                switch (am) {
-                case 0:
-                    s->core_downmix_amode = DCA_MONO;
-                    break;
-                case 1:
-                    s->core_downmix_amode = DCA_STEREO;
-                    break;
-                case 2:
-                    s->core_downmix_amode = DCA_STEREO_TOTAL;
-                    break;
-                case 3:
-                    s->core_downmix_amode = DCA_3F;
-                    break;
-                case 4:
-                    s->core_downmix_amode = DCA_2F1R;
-                    break;
-                case 5:
-                    s->core_downmix_amode = DCA_2F2R;
-                    break;
-                case 6:
-                    s->core_downmix_amode = DCA_3F1R;
-                    break;
-                default:
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Invalid mode %d for embedded downmix coefficients\n",
-                           am);
-                    return AVERROR_INVALIDDATA;
-                }
-                for (out = 0; out < ff_dca_channels[s->core_downmix_amode]; out++) {
-                    for (in = 0; in < s->audio_header.prim_channels + !!s->lfe; in++) {
-                        uint16_t tmp = get_bits(&s->gb, 9);
-                        if ((tmp & 0xFF) > 241) {
-                            av_log(s->avctx, AV_LOG_ERROR,
-                                   "Invalid downmix coefficient code %"PRIu16"\n",
-                                   tmp);
-                            return AVERROR_INVALIDDATA;
-                        }
-                        s->core_downmix_codes[in][out] = tmp;
-                    }
-                }
-            }
-
-            align_get_bits(&s->gb); // byte align
-            skip_bits(&s->gb, 16);  // nAUXCRC16
-
-            /*
-             * additional data (reserved, cf. ETSI TS 102 114 V1.4.1)
-             *
-             * Note: don't check for overreads, aux_data_count can't be trusted.
-             */
-            if ((reserved = (aux_data_end - get_bits_count(&s->gb))) > 0) {
-                avpriv_request_sample(s->avctx,
-                                      "Core auxiliary data reserved content");
-                skip_bits_long(&s->gb, reserved);
-            }
-        }
-
-        if (s->crc_present && s->dynrange)
-            get_bits(&s->gb, 16);
-    }
-
-    return 0;
-}
-
-/**
- * Decode a dca frame block
- *
- * @param s     pointer to the DCAContext
- */
-
-static int dca_decode_block(DCAContext *s, int base_channel, int block_index)
-{
-    int ret;
-
-    /* Sanity check */
-    if (s->current_subframe >= s->audio_header.subframes) {
-        av_log(s->avctx, AV_LOG_DEBUG, "check failed: %i>%i",
-               s->current_subframe, s->audio_header.subframes);
+        av_log(avctx, AV_LOG_ERROR, "No valid DCA sub-stream found\n");
+        if (s->core_only)
+            av_log(avctx, AV_LOG_WARNING, "Consider disabling 'core_only' option\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (!s->current_subsubframe) {
-        /* Read subframe header */
-        if ((ret = dca_subframe_header(s, base_channel, block_index)))
-            return ret;
-    }
-
-    /* Read subsubframe */
-    if ((ret = dca_subsubframe(s, base_channel, block_index)))
-        return ret;
-
-    /* Update state */
-    s->current_subsubframe++;
-    if (s->current_subsubframe >= s->subsubframes[s->current_subframe]) {
-        s->current_subsubframe = 0;
-        s->current_subframe++;
-    }
-    if (s->current_subframe >= s->audio_header.subframes) {
-        /* Read subframe footer */
-        if ((ret = dca_subframe_footer(s, base_channel)))
-            return ret;
-    }
-
-    return 0;
-}
+    *got_frame_ptr = 1;
 
-static float dca_dmix_code(unsigned code)
-{
-    int sign = (code >> 8) - 1;
-    code &= 0xff;
-    return ((ff_dca_dmixtable[code] ^ sign) - sign) * (1.0 / (1U << 15));
+    return avpkt->size;
 }
 
-static int scan_for_extensions(AVCodecContext *avctx)
+static av_cold void dcadec_flush(AVCodecContext *avctx)
 {
     DCAContext *s = avctx->priv_data;
-    int core_ss_end, ret = 0;
-
-    core_ss_end = FFMIN(s->frame_size, s->dca_buffer_size) * 8;
-
-    /* only scan for extensions if ext_descr was unknown or indicated a
-     * supported XCh extension */
-    if (s->core_ext_mask < 0 || s->core_ext_mask & DCA_EXT_XCH) {
-        /* if ext_descr was unknown, clear s->core_ext_mask so that the
-         * extensions scan can fill it up */
-        s->core_ext_mask = FFMAX(s->core_ext_mask, 0);
-
-        /* extensions start at 32-bit boundaries into bitstream */
-        skip_bits_long(&s->gb, (-get_bits_count(&s->gb)) & 31);
-
-        while (core_ss_end - get_bits_count(&s->gb) >= 32) {
-            uint32_t bits = get_bits_long(&s->gb, 32);
-            int i;
-
-            switch (bits) {
-            case DCA_SYNCWORD_XCH: {
-                int ext_amode, xch_fsize;
-
-                s->xch_base_channel = s->audio_header.prim_channels;
-
-                /* validate sync word using XCHFSIZE field */
-                xch_fsize = show_bits(&s->gb, 10);
-                if ((s->frame_size != (get_bits_count(&s->gb) >> 3) - 4 + xch_fsize) &&
-                    (s->frame_size != (get_bits_count(&s->gb) >> 3) - 4 + xch_fsize + 1))
-                    continue;
-
-                /* skip length-to-end-of-frame field for the moment */
-                skip_bits(&s->gb, 10);
-
-                s->core_ext_mask |= DCA_EXT_XCH;
-
-                /* extension amode(number of channels in extension) should be 1 */
-                /* AFAIK XCh is not used for more channels */
-                if ((ext_amode = get_bits(&s->gb, 4)) != 1) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "XCh extension amode %d not supported!\n",
-                           ext_amode);
-                    continue;
-                }
-
-                /* much like core primary audio coding header */
-                dca_parse_audio_coding_header(s, s->xch_base_channel);
 
-                for (i = 0; i < (s->sample_blocks / 8); i++)
-                    if ((ret = dca_decode_block(s, s->xch_base_channel, i))) {
-                        av_log(avctx, AV_LOG_ERROR, "error decoding XCh extension\n");
-                        continue;
-                    }
+    ff_dca_core_flush(&s->core);
+    ff_dca_xll_flush(&s->xll);
+    ff_dca_lbr_flush(&s->lbr);
 
-                s->xch_present = 1;
-                break;
-            }
-            case DCA_SYNCWORD_XXCH:
-                /* XXCh: extended channels */
-                /* usually found either in core or HD part in DTS-HD HRA streams,
-                 * but not in DTS-ES which contains XCh extensions instead */
-                s->core_ext_mask |= DCA_EXT_XXCH;
-                break;
-
-            case 0x1d95f262: {
-                int fsize96 = show_bits(&s->gb, 12) + 1;
-                if (s->frame_size != (get_bits_count(&s->gb) >> 3) - 4 + fsize96)
-                    continue;
-
-                av_log(avctx, AV_LOG_DEBUG, "X96 extension found at %d bits\n",
-                       get_bits_count(&s->gb));
-                skip_bits(&s->gb, 12);
-                av_log(avctx, AV_LOG_DEBUG, "FSIZE96 = %d bytes\n", fsize96);
-                av_log(avctx, AV_LOG_DEBUG, "REVNO = %d\n", get_bits(&s->gb, 4));
-
-                s->core_ext_mask |= DCA_EXT_X96;
-                break;
-            }
-            }
-
-            skip_bits_long(&s->gb, (-get_bits_count(&s->gb)) & 31);
-        }
-    } else {
-        /* no supported extensions, skip the rest of the core substream */
-        skip_bits_long(&s->gb, core_ss_end - get_bits_count(&s->gb));
-    }
-
-    if (s->core_ext_mask & DCA_EXT_X96)
-        s->profile = FF_PROFILE_DTS_96_24;
-    else if (s->core_ext_mask & (DCA_EXT_XCH | DCA_EXT_XXCH))
-        s->profile = FF_PROFILE_DTS_ES;
-
-    /* check for ExSS (HD part) */
-    if (s->dca_buffer_size - s->frame_size > 32 &&
-        get_bits_long(&s->gb, 32) == DCA_SYNCWORD_SUBSTREAM)
-        ff_dca_exss_parse_header(s);
-
-    return ret;
+    s->packet &= DCA_PACKET_MASK;
 }
 
-static int set_channel_layout(AVCodecContext *avctx, int channels)
+static av_cold int dcadec_close(AVCodecContext *avctx)
 {
     DCAContext *s = avctx->priv_data;
-    int num_core_channels = s->audio_header.prim_channels;
-    int i;
-
-    if (s->amode < 16) {
-        avctx->channel_layout = dca_core_channel_layout[s->amode];
-
-        if (s->audio_header.prim_channels + !!s->lfe > 2 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            /*
-             * Neither the core's auxiliary data nor our default tables contain
-             * downmix coefficients for the additional channel coded in the XCh
-             * extension, so when we're doing a Stereo downmix, don't decode it.
-             */
-            s->xch_disable = 1;
-        }
 
-        if (s->xch_present && !s->xch_disable) {
-            avctx->channel_layout |= AV_CH_BACK_CENTER;
-            if (s->lfe) {
-                avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
-                s->channel_order_tab = ff_dca_channel_reorder_lfe_xch[s->amode];
-            } else {
-                s->channel_order_tab = ff_dca_channel_reorder_nolfe_xch[s->amode];
-            }
-        } else {
-            channels       = num_core_channels + !!s->lfe;
-            s->xch_present = 0; /* disable further xch processing */
-            if (s->lfe) {
-                avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
-                s->channel_order_tab = ff_dca_channel_reorder_lfe[s->amode];
-            } else
-                s->channel_order_tab = ff_dca_channel_reorder_nolfe[s->amode];
-        }
+    ff_dca_core_close(&s->core);
+    ff_dca_xll_close(&s->xll);
+    ff_dca_lbr_close(&s->lbr);
 
-        if (channels < ff_dca_channels[s->amode] + !!s->lfe)
-            return AVERROR_INVALIDDATA;
-
-        if (channels > !!s->lfe &&
-            s->channel_order_tab[channels - 1 - !!s->lfe] < 0)
-            return AVERROR_INVALIDDATA;
-
-        if (num_core_channels + !!s->lfe > 2 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            channels              = 2;
-            s->output             = s->audio_header.prim_channels == 2 ? s->amode : DCA_STEREO;
-            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
-
-            /* Stereo downmix coefficients
-             *
-             * The decoder can only downmix to 2-channel, so we need to ensure
-             * embedded downmix coefficients are actually targeting 2-channel.
-             */
-            if (s->core_downmix && (s->core_downmix_amode == DCA_STEREO ||
-                                    s->core_downmix_amode == DCA_STEREO_TOTAL)) {
-                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                    /* Range checked earlier */
-                    s->downmix_coef[i][0] = dca_dmix_code(s->core_downmix_codes[i][0]);
-                    s->downmix_coef[i][1] = dca_dmix_code(s->core_downmix_codes[i][1]);
-                }
-                s->output = s->core_downmix_amode;
-            } else {
-                int am = s->amode & DCA_CHANNEL_MASK;
-                if (am >= FF_ARRAY_ELEMS(ff_dca_default_coeffs)) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Invalid channel mode %d\n", am);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (num_core_channels + !!s->lfe >
-                    FF_ARRAY_ELEMS(ff_dca_default_coeffs[0])) {
-                    avpriv_request_sample(s->avctx, "Downmixing %d channels",
-                                          s->audio_header.prim_channels + !!s->lfe);
-                    return AVERROR_PATCHWELCOME;
-                }
-                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                    s->downmix_coef[i][0] = ff_dca_default_coeffs[am][i][0];
-                    s->downmix_coef[i][1] = ff_dca_default_coeffs[am][i][1];
-                }
-            }
-            ff_dlog(s->avctx, "Stereo downmix coeffs:\n");
-            for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                ff_dlog(s->avctx, "L, input channel %d = %f\n", i,
-                        s->downmix_coef[i][0]);
-                ff_dlog(s->avctx, "R, input channel %d = %f\n", i,
-                        s->downmix_coef[i][1]);
-            }
-            ff_dlog(s->avctx, "\n");
-        }
-    } else {
-        av_log(avctx, AV_LOG_ERROR, "Nonstandard configuration %d !\n", s->amode);
-        return AVERROR_INVALIDDATA;
-    }
+    av_freep(&s->buffer);
+    s->buffer_size = 0;
 
     return 0;
 }
 
-/**
- * Main frame decoding function
- * FIXME add arguments
- */
-static int dca_decode_frame(AVCodecContext *avctx, void *data,
-                            int *got_frame_ptr, AVPacket *avpkt)
+static av_cold int dcadec_init(AVCodecContext *avctx)
 {
-    AVFrame *frame     = data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-
-    int lfe_samples;
-    int i, ret;
-    float  **samples_flt;
     DCAContext *s = avctx->priv_data;
-    int channels, full_channels;
-    int upsample = 0;
-    int downmix;
-
-    s->exss_ext_mask = 0;
-    s->xch_present   = 0;
 
-    s->dca_buffer_size = ff_dca_convert_bitstream(buf, buf_size, s->dca_buffer,
-                                                  DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE);
-    if (s->dca_buffer_size == AVERROR_INVALIDDATA) {
-        av_log(avctx, AV_LOG_ERROR, "Not a valid DCA frame\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if ((ret = dca_parse_frame_header(s)) < 0) {
-        // seems like the frame is corrupt, try with the next one
-        return ret;
-    }
-    // set AVCodec values with parsed data
-    avctx->sample_rate = s->sample_rate;
-    avctx->bit_rate    = s->bit_rate;
-
-    s->profile = FF_PROFILE_DTS;
-
-    for (i = 0; i < (s->sample_blocks / SAMPLES_PER_SUBBAND); i++) {
-        if ((ret = dca_decode_block(s, 0, i))) {
-            av_log(avctx, AV_LOG_ERROR, "error decoding block\n");
-            return ret;
-        }
-    }
-
-    if (s->ext_coding)
-        s->core_ext_mask = dca_ext_audio_descr_mask[s->ext_descr];
-    else
-        s->core_ext_mask = 0;
-
-    ret = scan_for_extensions(avctx);
-
-    avctx->profile = s->profile;
-
-    full_channels = channels = s->audio_header.prim_channels + !!s->lfe;
-
-    ret = set_channel_layout(avctx, channels);
-    if (ret < 0)
-        return ret;
-    avctx->channels = channels;
-
-    /* get output buffer */
-    frame->nb_samples = 256 * (s->sample_blocks / SAMPLES_PER_SUBBAND);
-    if (s->exss_ext_mask & DCA_EXT_EXSS_XLL) {
-        int xll_nb_samples = s->xll_segments * s->xll_smpl_in_seg;
-        /* Check for invalid/unsupported conditions first */
-        if (s->xll_residual_channels > channels) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "DCA: too many residual channels (%d, core channels %d). Disabling XLL\n",
-                   s->xll_residual_channels, channels);
-            s->exss_ext_mask &= ~DCA_EXT_EXSS_XLL;
-        } else if (xll_nb_samples != frame->nb_samples &&
-                   2 * frame->nb_samples != xll_nb_samples) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "DCA: unsupported upsampling (%d XLL samples, %d core samples). Disabling XLL\n",
-                   xll_nb_samples, frame->nb_samples);
-            s->exss_ext_mask &= ~DCA_EXT_EXSS_XLL;
-        } else {
-            if (2 * frame->nb_samples == xll_nb_samples) {
-                av_log(s->avctx, AV_LOG_INFO,
-                       "XLL: upsampling core channels by a factor of 2\n");
-                upsample = 1;
-
-                frame->nb_samples = xll_nb_samples;
-                // FIXME: Is it good enough to copy from the first channel set?
-                avctx->sample_rate = s->xll_chsets[0].sampling_frequency;
-            }
-            /* If downmixing to stereo, don't decode additional channels.
-             * FIXME: Using the xch_disable flag for this doesn't seem right. */
-            if (!s->xch_disable)
-                avctx->channels += s->xll_channels - s->xll_residual_channels;
-        }
-    }
-
-    /* FIXME: This is an ugly hack, to just revert to the default
-     * layout if we have additional channels. Need to convert the XLL
-     * channel masks to libav channel_layout mask. */
-    if (av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels)
-        avctx->channel_layout = 0;
-
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
-    samples_flt = (float **) frame->extended_data;
-
-    /* allocate buffer for extra channels if downmixing */
-    if (avctx->channels < full_channels) {
-        ret = av_samples_get_buffer_size(NULL, full_channels - channels,
-                                         frame->nb_samples,
-                                         avctx->sample_fmt, 0);
-        if (ret < 0)
-            return ret;
-
-        av_fast_malloc(&s->extra_channels_buffer,
-                       &s->extra_channels_buffer_size, ret);
-        if (!s->extra_channels_buffer)
-            return AVERROR(ENOMEM);
-
-        ret = av_samples_fill_arrays((uint8_t **) s->extra_channels, NULL,
-                                     s->extra_channels_buffer,
-                                     full_channels - channels,
-                                     frame->nb_samples, avctx->sample_fmt, 0);
-        if (ret < 0)
-            return ret;
-    }
-
-    downmix = s->audio_header.prim_channels > 2 &&
-              avctx->request_channel_layout == AV_CH_LAYOUT_STEREO;
-
-    /* filter to get final output */
-    for (i = 0; i < (s->sample_blocks / SAMPLES_PER_SUBBAND); i++) {
-        int ch;
-        unsigned block = upsample ? 512 : 256;
-        for (ch = 0; ch < channels; ch++)
-            s->samples_chanptr[ch] = samples_flt[ch] + i * block;
-        for (; ch < full_channels; ch++)
-            s->samples_chanptr[ch] = s->extra_channels[ch - channels] + i * block;
-
-        dca_filter_channels(s, i, upsample, downmix);
-
-        /* If this was marked as a DTS-ES stream we need to subtract back- */
-        /* channel from SL & SR to remove matrixed back-channel signal */
-        if ((s->source_pcm_res & 1) && s->xch_present) {
-            float *back_chan = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel]];
-            float *lt_chan   = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel - 2]];
-            float *rt_chan   = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel - 1]];
-            s->fdsp.vector_fmac_scalar(lt_chan, back_chan, -M_SQRT1_2, 256);
-            s->fdsp.vector_fmac_scalar(rt_chan, back_chan, -M_SQRT1_2, 256);
-        }
-    }
-
-    /* update lfe history */
-    lfe_samples = 2 * s->lfe * (s->sample_blocks / SAMPLES_PER_SUBBAND);
-    for (i = 0; i < 2 * s->lfe * 4; i++)
-        s->lfe_data[i] = s->lfe_data[i + lfe_samples];
-
-    if (s->exss_ext_mask & DCA_EXT_EXSS_XLL) {
-        ret = ff_dca_xll_decode_audio(s, frame);
-        if (ret < 0)
-            return ret;
-    }
-    /* AVMatrixEncoding
-     *
-     * DCA_STEREO_TOTAL (Lt/Rt) is equivalent to Dolby Surround */
-    ret = ff_side_data_update_matrix_encoding(frame,
-                                              (s->output & ~DCA_LFE) == DCA_STEREO_TOTAL ?
-                                              AV_MATRIX_ENCODING_DOLBY : AV_MATRIX_ENCODING_NONE);
-    if (ret < 0)
-        return ret;
-
-    *got_frame_ptr = 1;
-
-    return buf_size;
-}
+    s->avctx = avctx;
+    s->core.avctx = avctx;
+    s->exss.avctx = avctx;
+    s->xll.avctx = avctx;
+    s->lbr.avctx = avctx;
 
-/**
- * DCA initialization
- *
- * @param avctx     pointer to the AVCodecContext
- */
+    ff_dca_init_vlcs();
 
-static av_cold int dca_decode_init(AVCodecContext *avctx)
-{
-    DCAContext *s = avctx->priv_data;
+    if (ff_dca_core_init(&s->core) < 0)
+        return AVERROR(ENOMEM);
 
-    s->avctx = avctx;
-    dca_init_vlcs();
+    if (ff_dca_lbr_init(&s->lbr) < 0)
+        return AVERROR(ENOMEM);
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-    ff_mdct_init(&s->imdct, 6, 1, 1.0);
-    ff_synth_filter_init(&s->synth);
     ff_dcadsp_init(&s->dcadsp);
-    ff_fmt_convert_init(&s->fmt_conv, avctx);
+    s->core.dcadsp = &s->dcadsp;
+    s->xll.dcadsp = &s->dcadsp;
+    s->lbr.dcadsp = &s->dcadsp;
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    s->crctab = av_crc_get_table(AV_CRC_16_CCITT);
 
-    /* allow downmixing to stereo */
-    if (avctx->channels > 2 &&
-        avctx->request_channel_layout == AV_CH_LAYOUT_STEREO)
-        avctx->channels = 2;
+    switch (avctx->request_channel_layout & ~AV_CH_LAYOUT_NATIVE) {
+    case 0:
+        s->request_channel_layout = 0;
+        break;
+    case AV_CH_LAYOUT_STEREO:
+    case AV_CH_LAYOUT_STEREO_DOWNMIX:
+        s->request_channel_layout = DCA_SPEAKER_LAYOUT_STEREO;
+        break;
+    case AV_CH_LAYOUT_5POINT0:
+        s->request_channel_layout = DCA_SPEAKER_LAYOUT_5POINT0;
+        break;
+    case AV_CH_LAYOUT_5POINT1:
+        s->request_channel_layout = DCA_SPEAKER_LAYOUT_5POINT1;
+        break;
+    default:
+        av_log(avctx, AV_LOG_WARNING, "Invalid request_channel_layout\n");
+        break;
+    }
 
     return 0;
 }
 
-static av_cold int dca_decode_end(AVCodecContext *avctx)
-{
-    DCAContext *s = avctx->priv_data;
-    ff_mdct_end(&s->imdct);
-    av_freep(&s->extra_channels_buffer);
-    av_freep(&s->xll_sample_buf);
-    av_freep(&s->qmf64_table);
-    return 0;
-}
+#define OFFSET(x) offsetof(DCAContext, x)
+#define PARAM AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
-static const AVOption options[] = {
-    { "disable_xch", "disable decoding of the XCh extension", offsetof(DCAContext, xch_disable), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
-    { "disable_xll", "disable decoding of the XLL extension", offsetof(DCAContext, xll_disable), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
-    { NULL },
+static const AVOption dcadec_options[] = {
+    { "core_only", "Decode core only without extensions", OFFSET(core_only), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, PARAM },
+    { NULL }
 };
 
-static const AVClass dca_decoder_class = {
+static const AVClass dcadec_class = {
     .class_name = "DCA decoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = dcadec_options,
     .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DECODER,
 };
 
 AVCodec ff_dca_decoder = {
-    .name            = "dca",
-    .long_name       = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
-    .type            = AVMEDIA_TYPE_AUDIO,
-    .id              = AV_CODEC_ID_DTS,
-    .priv_data_size  = sizeof(DCAContext),
-    .init            = dca_decode_init,
-    .decode          = dca_decode_frame,
-    .close           = dca_decode_end,
-    .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
-    .sample_fmts     = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                       AV_SAMPLE_FMT_NONE },
-    .profiles        = NULL_IF_CONFIG_SMALL(ff_dca_profiles),
-    .priv_class      = &dca_decoder_class,
+    .name           = "dca",
+    .long_name      = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DTS,
+    .priv_data_size = sizeof(DCAContext),
+    .init           = dcadec_init,
+    .decode         = dcadec_decode_frame,
+    .close          = dcadec_close,
+    .flush          = dcadec_flush,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_CHANNEL_CONF,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S32P,
+                                                      AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_NONE },
+    .priv_class     = &dcadec_class,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_dca_profiles),
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
diff --git a/libavcodec/dcadec.h b/libavcodec/dcadec.h
new file mode 100644
index 0000000..9da8d3b
--- /dev/null
+++ b/libavcodec/dcadec.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCADEC_H
+#define AVCODEC_DCADEC_H
+
+#include "libavutil/common.h"
+#include "libavutil/crc.h"
+#include "libavutil/float_dsp.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dcadsp.h"
+#include "dca_core.h"
+#include "dca_exss.h"
+#include "dca_xll.h"
+#include "dca_lbr.h"
+
+#define DCA_PACKET_CORE         0x01
+#define DCA_PACKET_EXSS         0x02
+#define DCA_PACKET_XLL          0x04
+#define DCA_PACKET_LBR          0x08
+#define DCA_PACKET_MASK         0x0f
+
+#define DCA_PACKET_RECOVERY     0x10    ///< Sync error recovery flag
+#define DCA_PACKET_RESIDUAL     0x20    ///< Core valid for residual decoding
+
+typedef struct DCAContext {
+    const AVClass   *class;       ///< class for AVOptions
+    AVCodecContext  *avctx;
+
+    DCACoreDecoder core;  ///< Core decoder context
+    DCAExssParser  exss;  ///< EXSS parser context
+    DCAXllDecoder  xll;   ///< XLL decoder context
+    DCALbrDecoder  lbr;   ///< LBR decoder context
+
+    DCADSPContext   dcadsp;
+
+    const AVCRC     *crctab;
+
+    uint8_t         *buffer;    ///< Packet buffer
+    unsigned int    buffer_size;
+
+    int     packet; ///< Packet flags
+
+    int     request_channel_layout; ///< Converted from avctx.request_channel_layout
+    int     core_only;              ///< Core only decoding flag
+} DCAContext;
+
+int ff_dca_set_channel_layout(AVCodecContext *avctx, int *ch_remap, int dca_mask);
+
+void ff_dca_downmix_to_stereo_fixed(DCADSPContext *dcadsp, int32_t **samples,
+                                    int *coeff_l, int nsamples, int ch_mask);
+void ff_dca_downmix_to_stereo_float(AVFloatDSPContext *fdsp, float **samples,
+                                    int *coeff_l, int nsamples, int ch_mask);
+
+static inline int ff_dca_check_crc(AVCodecContext *avctx, GetBitContext *s,
+                                   int p1, int p2)
+{
+    DCAContext *dca = avctx->priv_data;
+
+    if (!(avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_CAREFUL)))
+        return 0;
+    if (((p1 | p2) & 7) || p1 < 0 || p2 > s->size_in_bits || p2 - p1 < 16)
+        return -1;
+    if (av_crc(dca->crctab, 0xffff, s->buffer + p1 / 8, (p2 - p1) / 8))
+        return -1;
+    return 0;
+}
+
+static inline int ff_dca_seek_bits(GetBitContext *s, int p)
+{
+    if (p < get_bits_count(s) || p > s->size_in_bits)
+        return -1;
+    skip_bits_long(s, p - get_bits_count(s));
+    return 0;
+}
+
+#endif
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index beec200..fade1a6 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -1,134 +1,490 @@
 /*
- * Copyright (c) 2004 Gildas Bazin
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
 
 #include "dcadsp.h"
 #include "dcamath.h"
 
-static void decode_hf_c(int32_t dst[DCA_SUBBANDS][SAMPLES_PER_SUBBAND],
-                        const int32_t vq_num[DCA_SUBBANDS],
-                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
-                        int32_t scale[DCA_SUBBANDS][2],
-                        intptr_t start, intptr_t end)
+static void decode_hf_c(int32_t **dst,
+                        const int32_t *vq_index,
+                        const int8_t hf_vq[1024][32],
+                        int32_t scale_factors[32][2],
+                        ptrdiff_t sb_start, ptrdiff_t sb_end,
+                        ptrdiff_t ofs, ptrdiff_t len)
+{
+    int i, j;
+
+    for (i = sb_start; i < sb_end; i++) {
+        const int8_t *coeff = hf_vq[vq_index[i]];
+        int32_t scale = scale_factors[i][0];
+        for (j = 0; j < len; j++)
+            dst[i][j + ofs] = clip23(coeff[j] * scale + (1 << 3) >> 4);
+    }
+}
+
+static void decode_joint_c(int32_t **dst, int32_t **src,
+                           const int32_t *scale_factors,
+                           ptrdiff_t sb_start, ptrdiff_t sb_end,
+                           ptrdiff_t ofs, ptrdiff_t len)
+{
+    int i, j;
+
+    for (i = sb_start; i < sb_end; i++) {
+        int32_t scale = scale_factors[i];
+        for (j = 0; j < len; j++)
+            dst[i][j + ofs] = clip23(mul17(src[i][j + ofs], scale));
+    }
+}
+
+static void lfe_fir_float_c(float *pcm_samples, int32_t *lfe_samples,
+                            const float *filter_coeff, ptrdiff_t npcmblocks,
+                            int dec_select)
+{
+    // Select decimation factor
+    int factor = 64 << dec_select;
+    int ncoeffs = 8 >> dec_select;
+    int nlfesamples = npcmblocks >> (dec_select + 1);
+    int i, j, k;
+
+    for (i = 0; i < nlfesamples; i++) {
+        // One decimated sample generates 64 or 128 interpolated ones
+        for (j = 0; j < factor / 2; j++) {
+            float a = 0;
+            float b = 0;
+
+            for (k = 0; k < ncoeffs; k++) {
+                a += filter_coeff[      j * ncoeffs + k] * lfe_samples[-k];
+                b += filter_coeff[255 - j * ncoeffs - k] * lfe_samples[-k];
+            }
+
+            pcm_samples[             j] = a;
+            pcm_samples[factor / 2 + j] = b;
+        }
+
+        lfe_samples++;
+        pcm_samples += factor;
+    }
+}
+
+static void lfe_fir0_float_c(float *pcm_samples, int32_t *lfe_samples,
+                             const float *filter_coeff, ptrdiff_t npcmblocks)
+{
+    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks, 0);
+}
+
+static void lfe_fir1_float_c(float *pcm_samples, int32_t *lfe_samples,
+                             const float *filter_coeff, ptrdiff_t npcmblocks)
+{
+    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks, 1);
+}
+
+static void lfe_x96_float_c(float *dst, const float *src,
+                            float *hist, ptrdiff_t len)
+{
+    float prev = *hist;
+    int i;
+
+    for (i = 0; i < len; i++) {
+        float a = 0.25f * src[i] + 0.75f * prev;
+        float b = 0.75f * src[i] + 0.25f * prev;
+        prev = src[i];
+        *dst++ = a;
+        *dst++ = b;
+    }
+
+    *hist = prev;
+}
+
+static void sub_qmf32_float_c(SynthFilterContext *synth,
+                              FFTContext *imdct,
+                              float *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,
+                              float *hist1, int *offset, float *hist2,
+                              const float *filter_coeff, ptrdiff_t npcmblocks,
+                              float scale)
 {
+    LOCAL_ALIGNED_32(float, input, [32]);
     int i, j;
 
-    for (j = start; j < end; j++) {
-        const int8_t *ptr = &hf_vq[vq_num[j]][vq_offset];
-        for (i = 0; i < 8; i++)
-            dst[j][i] = ptr[i] * scale[j][0] + 8 >> 4;
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        for (i = 0; i < 32; i++) {
+            if ((i - 1) & 2)
+                input[i] = -subband_samples_lo[i][j];
+            else
+                input[i] =  subband_samples_lo[i][j];
+        }
+
+        // One subband sample generates 32 interpolated ones
+        synth->synth_filter_float(imdct, hist1, offset,
+                                  hist2, filter_coeff,
+                                  pcm_samples, input, scale);
+        pcm_samples += 32;
     }
 }
 
-static inline void dca_lfe_fir(float *out, const float *in, const float *coefs,
-                               int decifactor)
+static void sub_qmf64_float_c(SynthFilterContext *synth,
+                              FFTContext *imdct,
+                              float *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,
+                              float *hist1, int *offset, float *hist2,
+                              const float *filter_coeff, ptrdiff_t npcmblocks,
+                              float scale)
 {
-    float *out2    = out + 2 * decifactor - 1;
-    int num_coeffs = 256 / decifactor;
-    int j, k;
+    LOCAL_ALIGNED_32(float, input, [64]);
+    int i, j;
 
-    /* One decimated sample generates 2*decifactor interpolated ones */
-    for (k = 0; k < decifactor; k++) {
-        float v0 = 0.0;
-        float v1 = 0.0;
-        for (j = 0; j < num_coeffs; j++, coefs++) {
-            v0 += in[-j]                 * *coefs;
-            v1 += in[j + 1 - num_coeffs] * *coefs;
+    if (!subband_samples_hi)
+        memset(&input[32], 0, sizeof(input[0]) * 32);
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        if (subband_samples_hi) {
+            // Full 64 subbands, first 32 are residual coded
+            for (i =  0; i < 32; i++) {
+                if ((i - 1) & 2)
+                    input[i] = -subband_samples_lo[i][j] - subband_samples_hi[i][j];
+                else
+                    input[i] =  subband_samples_lo[i][j] + subband_samples_hi[i][j];
+            }
+            for (i = 32; i < 64; i++) {
+                if ((i - 1) & 2)
+                    input[i] = -subband_samples_hi[i][j];
+                else
+                    input[i] =  subband_samples_hi[i][j];
+            }
+        } else {
+            // Only first 32 subbands
+            for (i =  0; i < 32; i++) {
+                if ((i - 1) & 2)
+                    input[i] = -subband_samples_lo[i][j];
+                else
+                    input[i] =  subband_samples_lo[i][j];
+            }
         }
-        *out++  = v0;
-        *out2-- = v1;
+
+        // One subband sample generates 64 interpolated ones
+        synth->synth_filter_float_64(imdct, hist1, offset,
+                                     hist2, filter_coeff,
+                                     pcm_samples, input, scale);
+        pcm_samples += 64;
     }
 }
 
-static void dca_qmf_32_subbands(float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], int sb_act,
-                                SynthFilterContext *synth, FFTContext *imdct,
-                                float synth_buf_ptr[512],
-                                int *synth_buf_offset, float synth_buf2[32],
-                                const float window[512], float *samples_out,
-                                float raXin[32], float scale)
+static void lfe_fir_fixed_c(int32_t *pcm_samples, int32_t *lfe_samples,
+                            const int32_t *filter_coeff, ptrdiff_t npcmblocks)
 {
+    // Select decimation factor
+    int nlfesamples = npcmblocks >> 1;
+    int i, j, k;
+
+    for (i = 0; i < nlfesamples; i++) {
+        // One decimated sample generates 64 interpolated ones
+        for (j = 0; j < 32; j++) {
+            int64_t a = 0;
+            int64_t b = 0;
+
+            for (k = 0; k < 8; k++) {
+                a += (int64_t)filter_coeff[      j * 8 + k] * lfe_samples[-k];
+                b += (int64_t)filter_coeff[255 - j * 8 - k] * lfe_samples[-k];
+            }
+
+            pcm_samples[     j] = clip23(norm23(a));
+            pcm_samples[32 + j] = clip23(norm23(b));
+        }
+
+        lfe_samples++;
+        pcm_samples += 64;
+    }
+}
+
+static void lfe_x96_fixed_c(int32_t *dst, const int32_t *src,
+                            int32_t *hist, ptrdiff_t len)
+{
+    int32_t prev = *hist;
     int i;
-    int subindex;
-
-    for (i = sb_act; i < 32; i++)
-        raXin[i] = 0.0;
-
-    /* Reconstructed channel sample index */
-    for (subindex = 0; subindex < 8; subindex++) {
-        /* Load in one sample from each subband and clear inactive subbands */
-        for (i = 0; i < sb_act; i++) {
-            unsigned sign = (i - 1) & 2;
-            uint32_t v    = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
-            AV_WN32A(&raXin[i], v);
+
+    for (i = 0; i < len; i++) {
+        int64_t a = INT64_C(2097471) * src[i] + INT64_C(6291137) * prev;
+        int64_t b = INT64_C(6291137) * src[i] + INT64_C(2097471) * prev;
+        prev = src[i];
+        *dst++ = clip23(norm23(a));
+        *dst++ = clip23(norm23(b));
+    }
+
+    *hist = prev;
+}
+
+static void sub_qmf32_fixed_c(SynthFilterContext *synth,
+                              DCADCTContext *imdct,
+                              int32_t *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,
+                              int32_t *hist1, int *offset, int32_t *hist2,
+                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
+{
+    LOCAL_ALIGNED_32(int32_t, input, [32]);
+    int i, j;
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        for (i = 0; i < 32; i++)
+            input[i] = subband_samples_lo[i][j];
+
+        // One subband sample generates 32 interpolated ones
+        synth->synth_filter_fixed(imdct, hist1, offset,
+                                  hist2, filter_coeff,
+                                  pcm_samples, input);
+        pcm_samples += 32;
+    }
+}
+
+static void sub_qmf64_fixed_c(SynthFilterContext *synth,
+                              DCADCTContext *imdct,
+                              int32_t *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,
+                              int32_t *hist1, int *offset, int32_t *hist2,
+                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
+{
+    LOCAL_ALIGNED_32(int32_t, input, [64]);
+    int i, j;
+
+    if (!subband_samples_hi)
+        memset(&input[32], 0, sizeof(input[0]) * 32);
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        if (subband_samples_hi) {
+            // Full 64 subbands, first 32 are residual coded
+            for (i =  0; i < 32; i++)
+                input[i] = subband_samples_lo[i][j] + subband_samples_hi[i][j];
+            for (i = 32; i < 64; i++)
+                input[i] = subband_samples_hi[i][j];
+        } else {
+            // Only first 32 subbands
+            for (i =  0; i < 32; i++)
+                input[i] = subband_samples_lo[i][j];
         }
 
-        synth->synth_filter_float(imdct, synth_buf_ptr, synth_buf_offset,
-                                  synth_buf2, window, samples_out, raXin,
-                                  scale);
-        samples_out += 32;
+        // One subband sample generates 64 interpolated ones
+        synth->synth_filter_fixed_64(imdct, hist1, offset,
+                                     hist2, filter_coeff,
+                                     pcm_samples, input);
+        pcm_samples += 64;
+    }
+}
+
+static void decor_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] += (SUINT)((int)(src[i] * (SUINT)coeff + (1 << 2)) >> 3);
+}
+
+static void dmix_sub_xch_c(int32_t *dst1, int32_t *dst2,
+                           const int32_t *src, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++) {
+        int32_t cs = mul23(src[i], 5931520 /* M_SQRT1_2 * (1 << 23) */);
+        dst1[i] -= cs;
+        dst2[i] -= cs;
     }
 }
 
-static void dequantize_c(int32_t *samples, uint32_t step_size, uint32_t scale)
+static void dmix_sub_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= (unsigned)mul15(src[i], coeff);
+}
+
+static void dmix_add_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] += mul15(src[i], coeff);
+}
+
+static void dmix_scale_c(int32_t *dst, int scale, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] = mul15(dst[i], scale);
+}
+
+static void dmix_scale_inv_c(int32_t *dst, int scale_inv, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] = mul16(dst[i], scale_inv);
+}
+
+static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= mul22(src[i], coeff);
+}
+
+static void filter1(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
 {
-    int64_t step = (int64_t)step_size * scale;
-    int shift, i;
-    int32_t step_scale;
+    int i;
 
-    if (step > (1 << 23))
-        shift = av_log2(step >> 23) + 1;
-    else
-        shift = 0;
-    step_scale = (int32_t)(step >> shift);
+    for (i = 0; i < len; i++)
+        dst[i] -= mul23(src[i], coeff);
+}
 
-    for (i = 0; i < SAMPLES_PER_SUBBAND; i++)
-        samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale, 22 - shift));
+static void assemble_freq_bands_c(int32_t *dst, int32_t *src0, int32_t *src1,
+                                  const int32_t *coeff, ptrdiff_t len)
+{
+    int i;
+
+    filter0(src0, src1, coeff[0], len);
+    filter0(src1, src0, coeff[1], len);
+    filter0(src0, src1, coeff[2], len);
+    filter0(src1, src0, coeff[3], len);
+
+    for (i = 0; i < 8; i++, src0--) {
+        filter1(src0, src1, coeff[i +  4], len);
+        filter1(src1, src0, coeff[i + 12], len);
+        filter1(src0, src1, coeff[i +  4], len);
+    }
+
+    for (i = 0; i < len; i++) {
+        *dst++ = *src1++;
+        *dst++ = *++src0;
+    }
 }
 
-static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs)
+static void lbr_bank_c(float output[32][4], float **input,
+                       const float *coeff, ptrdiff_t ofs, ptrdiff_t len)
 {
-    dca_lfe_fir(out, in, coefs, 32);
+    float SW0 = coeff[0];
+    float SW1 = coeff[1];
+    float SW2 = coeff[2];
+    float SW3 = coeff[3];
+
+    float C1  = coeff[4];
+    float C2  = coeff[5];
+    float C3  = coeff[6];
+    float C4  = coeff[7];
+
+    float AL1 = coeff[8];
+    float AL2 = coeff[9];
+
+    int i;
+
+    // Short window and 8 point forward MDCT
+    for (i = 0; i < len; i++) {
+        float *src = input[i] + ofs;
+
+        float a = src[-4] * SW0 - src[-1] * SW3;
+        float b = src[-3] * SW1 - src[-2] * SW2;
+        float c = src[ 2] * SW1 + src[ 1] * SW2;
+        float d = src[ 3] * SW0 + src[ 0] * SW3;
+
+        output[i][0] = C1 * b - C2 * c + C4 * a - C3 * d;
+        output[i][1] = C1 * d - C2 * a - C4 * b - C3 * c;
+        output[i][2] = C3 * b + C2 * d - C4 * c + C1 * a;
+        output[i][3] = C3 * a - C2 * b + C4 * d - C1 * c;
+    }
+
+    // Aliasing cancellation for high frequencies
+    for (i = 12; i < len - 1; i++) {
+        float a = output[i  ][3] * AL1;
+        float b = output[i+1][0] * AL1;
+        output[i  ][3] += b - a;
+        output[i+1][0] -= b + a;
+        a = output[i  ][2] * AL2;
+        b = output[i+1][1] * AL2;
+        output[i  ][2] += b - a;
+        output[i+1][1] -= b + a;
+    }
 }
 
-static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs)
+static void lfe_iir_c(float *output, const float *input,
+                      const float iir[5][4], float hist[5][2],
+                      ptrdiff_t factor)
 {
-    dca_lfe_fir(out, in, coefs, 64);
+    float res, tmp;
+    int i, j, k;
+
+    for (i = 0; i < 64; i++) {
+        res = *input++;
+
+        for (j = 0; j < factor; j++) {
+            for (k = 0; k < 5; k++) {
+                tmp = hist[k][0] * iir[k][0] + hist[k][1] * iir[k][1] + res;
+                res = hist[k][0] * iir[k][2] + hist[k][1] * iir[k][3] + tmp;
+
+                hist[k][0] = hist[k][1];
+                hist[k][1] = tmp;
+            }
+
+            *output++ = res;
+            res = 0;
+        }
+    }
 }
 
 av_cold void ff_dcadsp_init(DCADSPContext *s)
 {
-    s->lfe_fir[0]      = dca_lfe_fir0_c;
-    s->lfe_fir[1]      = dca_lfe_fir1_c;
-    s->qmf_32_subbands = dca_qmf_32_subbands;
-    s->decode_hf       = decode_hf_c;
-    s->dequantize      = dequantize_c;
+    s->decode_hf     = decode_hf_c;
+    s->decode_joint  = decode_joint_c;
+
+    s->lfe_fir_float[0] = lfe_fir0_float_c;
+    s->lfe_fir_float[1] = lfe_fir1_float_c;
+    s->lfe_x96_float    = lfe_x96_float_c;
+    s->sub_qmf_float[0] = sub_qmf32_float_c;
+    s->sub_qmf_float[1] = sub_qmf64_float_c;
+
+    s->lfe_fir_fixed    = lfe_fir_fixed_c;
+    s->lfe_x96_fixed    = lfe_x96_fixed_c;
+    s->sub_qmf_fixed[0] = sub_qmf32_fixed_c;
+    s->sub_qmf_fixed[1] = sub_qmf64_fixed_c;
+
+    s->decor   = decor_c;
+
+    s->dmix_sub_xch   = dmix_sub_xch_c;
+    s->dmix_sub       = dmix_sub_c;
+    s->dmix_add       = dmix_add_c;
+    s->dmix_scale     = dmix_scale_c;
+    s->dmix_scale_inv = dmix_scale_inv_c;
+
+    s->assemble_freq_bands = assemble_freq_bands_c;
+
+    s->lbr_bank = lbr_bank_c;
+    s->lfe_iir = lfe_iir_c;
 
-    if (ARCH_AARCH64)
-        ff_dcadsp_init_aarch64(s);
-    if (ARCH_ARM)
-        ff_dcadsp_init_arm(s);
     if (ARCH_X86)
         ff_dcadsp_init_x86(s);
 }
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index 9ea89ea..8f2f467 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -1,51 +1,99 @@
 /*
- * This file is part of Libav.
+ * Copyright (C) 2016 foo86
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DCADSP_H
 #define AVCODEC_DCADSP_H
 
-#include "avfft.h"
+#include "libavutil/common.h"
+
+#include "fft.h"
+#include "dcadct.h"
 #include "synth_filter.h"
 
-#define DCA_SUBBANDS_X96K  64
-#define DCA_SUBBANDS       32
-#define SAMPLES_PER_SUBBAND 8 // number of samples per subband per subsubframe
+typedef struct DCADSPContext {
+    void (*decode_hf)(int32_t **dst,
+                      const int32_t *vq_index,
+                      const int8_t hf_vq[1024][32],
+                      int32_t scale_factors[32][2],
+                      ptrdiff_t sb_start, ptrdiff_t sb_end,
+                      ptrdiff_t ofs, ptrdiff_t len);
 
+    void (*decode_joint)(int32_t **dst, int32_t **src,
+                         const int32_t *scale_factors,
+                         ptrdiff_t sb_start, ptrdiff_t sb_end,
+                         ptrdiff_t ofs, ptrdiff_t len);
 
-typedef struct DCADSPContext {
-    void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
-    void (*qmf_32_subbands)(float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], int sb_act,
-                            SynthFilterContext *synth, FFTContext *imdct,
-                            float synth_buf_ptr[512],
-                            int *synth_buf_offset, float synth_buf2[32],
-                            const float window[512], float *samples_out,
-                            float raXin[32], float scale);
-    void (*decode_hf)(int32_t dst[DCA_SUBBANDS][SAMPLES_PER_SUBBAND],
-                      const int32_t vq_num[DCA_SUBBANDS],
-                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
-                      int32_t scale[DCA_SUBBANDS][2],
-                      intptr_t start, intptr_t end);
-    void (*dequantize)(int32_t *samples, uint32_t step_size, uint32_t scale);
+    void (*lfe_fir_float[2])(float *pcm_samples, int32_t *lfe_samples,
+                             const float *filter_coeff, ptrdiff_t npcmblocks);
+
+    void (*lfe_x96_float)(float *dst, const float *src,
+                          float *hist, ptrdiff_t len);
+
+    void (*sub_qmf_float[2])(SynthFilterContext *synth,
+                             FFTContext *imdct,
+                             float *pcm_samples,
+                             int32_t **subband_samples_lo,
+                             int32_t **subband_samples_hi,
+                             float *hist1, int *offset, float *hist2,
+                             const float *filter_coeff, ptrdiff_t npcmblocks,
+                             float scale);
+
+    void (*lfe_fir_fixed)(int32_t *pcm_samples, int32_t *lfe_samples,
+                          const int32_t *filter_coeff, ptrdiff_t npcmblocks);
+
+    void (*lfe_x96_fixed)(int32_t *dst, const int32_t *src,
+                          int32_t *hist, ptrdiff_t len);
+
+    void (*sub_qmf_fixed[2])(SynthFilterContext *synth,
+                             DCADCTContext *imdct,
+                             int32_t *pcm_samples,
+                             int32_t **subband_samples_lo,
+                             int32_t **subband_samples_hi,
+                             int32_t *hist1, int *offset, int32_t *hist2,
+                             const int32_t *filter_coeff, ptrdiff_t npcmblocks);
+
+    void (*decor)(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len);
+
+    void (*dmix_sub_xch)(int32_t *dst1, int32_t *dst2,
+                         const int32_t *src, ptrdiff_t len);
+
+    void (*dmix_sub)(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len);
+
+    void (*dmix_add)(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len);
+
+    void (*dmix_scale)(int32_t *dst, int scale, ptrdiff_t len);
+
+    void (*dmix_scale_inv)(int32_t *dst, int scale_inv, ptrdiff_t len);
+
+    void (*assemble_freq_bands)(int32_t *dst, int32_t *src0, int32_t *src1,
+                                const int32_t *coeff, ptrdiff_t len);
+
+    void (*lbr_bank)(float output[32][4], float **input,
+                     const float *coeff, ptrdiff_t ofs, ptrdiff_t len);
+
+    void (*lfe_iir)(float *output, const float *input,
+                    const float iir[5][4], float hist[5][2],
+                    ptrdiff_t factor);
 } DCADSPContext;
 
-void ff_dcadsp_init(DCADSPContext *s);
-void ff_dcadsp_init_aarch64(DCADSPContext *s);
-void ff_dcadsp_init_arm(DCADSPContext *s);
-void ff_dcadsp_init_x86(DCADSPContext *s);
+av_cold void ff_dcadsp_init(DCADSPContext *s);
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s);
 
-#endif /* AVCODEC_DCADSP_H */
+#endif
diff --git a/libavcodec/dcaenc.c b/libavcodec/dcaenc.c
new file mode 100644
index 0000000..34b3e94
--- /dev/null
+++ b/libavcodec/dcaenc.c
@@ -0,0 +1,1262 @@
+/*
+ * DCA encoder
+ * Copyright (C) 2008-2012 Alexander E. Patrakov
+ *               2010 Benjamin Larsson
+ *               2011 Xiang Wang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+
+#include "libavutil/avassert.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/common.h"
+#include "libavutil/ffmath.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "dca.h"
+#include "dcaadpcm.h"
+#include "dcamath.h"
+#include "dca_core.h"
+#include "dcadata.h"
+#include "dcaenc.h"
+#include "fft.h"
+#include "internal.h"
+#include "mathops.h"
+#include "put_bits.h"
+
+#define MAX_CHANNELS 6
+#define DCA_MAX_FRAME_SIZE 16384
+#define DCA_HEADER_SIZE 13
+#define DCA_LFE_SAMPLES 8
+
+#define DCAENC_SUBBANDS 32
+#define SUBFRAMES 1
+#define SUBSUBFRAMES 2
+#define SUBBAND_SAMPLES (SUBFRAMES * SUBSUBFRAMES * 8)
+#define AUBANDS 25
+
+#define COS_T(x) (c->cos_table[(x) & 2047])
+
+typedef struct CompressionOptions {
+    int adpcm_mode;
+} CompressionOptions;
+
+typedef struct DCAEncContext {
+    AVClass *class;
+    PutBitContext pb;
+    DCAADPCMEncContext adpcm_ctx;
+    FFTContext mdct;
+    CompressionOptions options;
+    int frame_size;
+    int frame_bits;
+    int fullband_channels;
+    int channels;
+    int lfe_channel;
+    int samplerate_index;
+    int bitrate_index;
+    int channel_config;
+    const int32_t *band_interpolation;
+    const int32_t *band_spectrum;
+    int lfe_scale_factor;
+    softfloat lfe_quant;
+    int32_t lfe_peak_cb;
+    const int8_t *channel_order_tab;  ///< channel reordering table, lfe and non lfe
+
+    int32_t prediction_mode[MAX_CHANNELS][DCAENC_SUBBANDS];
+    int32_t adpcm_history[MAX_CHANNELS][DCAENC_SUBBANDS][DCA_ADPCM_COEFFS * 2];
+    int32_t history[MAX_CHANNELS][512]; /* This is a circular buffer */
+    int32_t *subband[MAX_CHANNELS][DCAENC_SUBBANDS];
+    int32_t quantized[MAX_CHANNELS][DCAENC_SUBBANDS][SUBBAND_SAMPLES];
+    int32_t peak_cb[MAX_CHANNELS][DCAENC_SUBBANDS];
+    int32_t diff_peak_cb[MAX_CHANNELS][DCAENC_SUBBANDS]; ///< expected peak of residual signal
+    int32_t downsampled_lfe[DCA_LFE_SAMPLES];
+    int32_t masking_curve_cb[SUBSUBFRAMES][256];
+    int32_t bit_allocation_sel[MAX_CHANNELS];
+    int abits[MAX_CHANNELS][DCAENC_SUBBANDS];
+    int scale_factor[MAX_CHANNELS][DCAENC_SUBBANDS];
+    softfloat quant[MAX_CHANNELS][DCAENC_SUBBANDS];
+    int32_t quant_index_sel[MAX_CHANNELS][DCA_CODE_BOOKS];
+    int32_t eff_masking_curve_cb[256];
+    int32_t band_masking_cb[32];
+    int32_t worst_quantization_noise;
+    int32_t worst_noise_ever;
+    int consumed_bits;
+    int consumed_adpcm_bits; ///< Number of bits to transmit ADPCM related info
+
+    int32_t cos_table[2048];
+    int32_t band_interpolation_tab[2][512];
+    int32_t band_spectrum_tab[2][8];
+    int32_t auf[9][AUBANDS][256];
+    int32_t cb_to_add[256];
+    int32_t cb_to_level[2048];
+    int32_t lfe_fir_64i[512];
+} DCAEncContext;
+
+/* Transfer function of outer and middle ear, Hz -> dB */
+static double hom(double f)
+{
+    double f1 = f / 1000;
+
+    return -3.64 * pow(f1, -0.8)
+           + 6.8 * exp(-0.6 * (f1 - 3.4) * (f1 - 3.4))
+           - 6.0 * exp(-0.15 * (f1 - 8.7) * (f1 - 8.7))
+           - 0.0006 * (f1 * f1) * (f1 * f1);
+}
+
+static double gammafilter(int i, double f)
+{
+    double h = (f - fc[i]) / erb[i];
+
+    h = 1 + h * h;
+    h = 1 / (h * h);
+    return 20 * log10(h);
+}
+
+static int subband_bufer_alloc(DCAEncContext *c)
+{
+    int ch, band;
+    int32_t *bufer = av_calloc(MAX_CHANNELS * DCAENC_SUBBANDS *
+                               (SUBBAND_SAMPLES + DCA_ADPCM_COEFFS),
+                               sizeof(int32_t));
+    if (!bufer)
+        return AVERROR(ENOMEM);
+
+    /* we need a place for DCA_ADPCM_COEFF samples from previous frame
+     * to calc prediction coefficients for each subband */
+    for (ch = 0; ch < MAX_CHANNELS; ch++) {
+        for (band = 0; band < DCAENC_SUBBANDS; band++) {
+            c->subband[ch][band] = bufer +
+                                   ch * DCAENC_SUBBANDS * (SUBBAND_SAMPLES + DCA_ADPCM_COEFFS) +
+                                   band * (SUBBAND_SAMPLES + DCA_ADPCM_COEFFS) + DCA_ADPCM_COEFFS;
+        }
+    }
+    return 0;
+}
+
+static void subband_bufer_free(DCAEncContext *c)
+{
+    if (c->subband[0][0]) {
+        int32_t *bufer = c->subband[0][0] - DCA_ADPCM_COEFFS;
+        av_free(bufer);
+        c->subband[0][0] = NULL;
+    }
+}
+
+static int encode_init(AVCodecContext *avctx)
+{
+    DCAEncContext *c = avctx->priv_data;
+    uint64_t layout = avctx->channel_layout;
+    int i, j, k, min_frame_bits;
+    int ret;
+
+    if ((ret = subband_bufer_alloc(c)) < 0)
+        return ret;
+
+    c->fullband_channels = c->channels = avctx->channels;
+    c->lfe_channel = (avctx->channels == 3 || avctx->channels == 6);
+    c->band_interpolation = c->band_interpolation_tab[1];
+    c->band_spectrum = c->band_spectrum_tab[1];
+    c->worst_quantization_noise = -2047;
+    c->worst_noise_ever = -2047;
+    c->consumed_adpcm_bits = 0;
+
+    if (ff_dcaadpcm_init(&c->adpcm_ctx))
+        return AVERROR(ENOMEM);
+
+    if (!layout) {
+        av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The "
+                                      "encoder will guess the layout, but it "
+                                      "might be incorrect.\n");
+        layout = av_get_default_channel_layout(avctx->channels);
+    }
+    switch (layout) {
+    case AV_CH_LAYOUT_MONO:         c->channel_config = 0; break;
+    case AV_CH_LAYOUT_STEREO:       c->channel_config = 2; break;
+    case AV_CH_LAYOUT_2_2:          c->channel_config = 8; break;
+    case AV_CH_LAYOUT_5POINT0:      c->channel_config = 9; break;
+    case AV_CH_LAYOUT_5POINT1:      c->channel_config = 9; break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported channel layout!\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (c->lfe_channel) {
+        c->fullband_channels--;
+        c->channel_order_tab = channel_reorder_lfe[c->channel_config];
+    } else {
+        c->channel_order_tab = channel_reorder_nolfe[c->channel_config];
+    }
+
+    for (i = 0; i < MAX_CHANNELS; i++) {
+        for (j = 0; j < DCA_CODE_BOOKS; j++) {
+            c->quant_index_sel[i][j] = ff_dca_quant_index_group_size[j];
+        }
+        /* 6 - no Huffman */
+        c->bit_allocation_sel[i] = 6;
+
+        for (j = 0; j < DCAENC_SUBBANDS; j++) {
+            /* -1 - no ADPCM */
+            c->prediction_mode[i][j] = -1;
+            memset(c->adpcm_history[i][j], 0, sizeof(int32_t)*DCA_ADPCM_COEFFS);
+        }
+    }
+
+    for (i = 0; i < 9; i++) {
+        if (sample_rates[i] == avctx->sample_rate)
+            break;
+    }
+    if (i == 9)
+        return AVERROR(EINVAL);
+    c->samplerate_index = i;
+
+    if (avctx->bit_rate < 32000 || avctx->bit_rate > 3840000) {
+        av_log(avctx, AV_LOG_ERROR, "Bit rate %"PRId64" not supported.", avctx->bit_rate);
+        return AVERROR(EINVAL);
+    }
+    for (i = 0; ff_dca_bit_rates[i] < avctx->bit_rate; i++)
+        ;
+    c->bitrate_index = i;
+    c->frame_bits = FFALIGN((avctx->bit_rate * 512 + avctx->sample_rate - 1) / avctx->sample_rate, 32);
+    min_frame_bits = 132 + (493 + 28 * 32) * c->fullband_channels + c->lfe_channel * 72;
+    if (c->frame_bits < min_frame_bits || c->frame_bits > (DCA_MAX_FRAME_SIZE << 3))
+        return AVERROR(EINVAL);
+
+    c->frame_size = (c->frame_bits + 7) / 8;
+
+    avctx->frame_size = 32 * SUBBAND_SAMPLES;
+
+    if ((ret = ff_mdct_init(&c->mdct, 9, 0, 1.0)) < 0)
+        return ret;
+
+    /* Init all tables */
+    c->cos_table[0] = 0x7fffffff;
+    c->cos_table[512] = 0;
+    c->cos_table[1024] = -c->cos_table[0];
+    for (i = 1; i < 512; i++) {
+        c->cos_table[i]   = (int32_t)(0x7fffffff * cos(M_PI * i / 1024));
+        c->cos_table[1024-i] = -c->cos_table[i];
+        c->cos_table[1024+i] = -c->cos_table[i];
+        c->cos_table[2048-i] = +c->cos_table[i];
+    }
+
+    for (i = 0; i < 2048; i++)
+        c->cb_to_level[i] = (int32_t)(0x7fffffff * ff_exp10(-0.005 * i));
+
+    for (k = 0; k < 32; k++) {
+        for (j = 0; j < 8; j++) {
+            c->lfe_fir_64i[64 * j + k] = (int32_t)(0xffffff800000ULL * ff_dca_lfe_fir_64[8 * k + j]);
+            c->lfe_fir_64i[64 * (7-j) + (63 - k)] = (int32_t)(0xffffff800000ULL * ff_dca_lfe_fir_64[8 * k + j]);
+        }
+    }
+
+    for (i = 0; i < 512; i++) {
+        c->band_interpolation_tab[0][i] = (int32_t)(0x1000000000ULL * ff_dca_fir_32bands_perfect[i]);
+        c->band_interpolation_tab[1][i] = (int32_t)(0x1000000000ULL * ff_dca_fir_32bands_nonperfect[i]);
+    }
+
+    for (i = 0; i < 9; i++) {
+        for (j = 0; j < AUBANDS; j++) {
+            for (k = 0; k < 256; k++) {
+                double freq = sample_rates[i] * (k + 0.5) / 512;
+
+                c->auf[i][j][k] = (int32_t)(10 * (hom(freq) + gammafilter(j, freq)));
+            }
+        }
+    }
+
+    for (i = 0; i < 256; i++) {
+        double add = 1 + ff_exp10(-0.01 * i);
+        c->cb_to_add[i] = (int32_t)(100 * log10(add));
+    }
+    for (j = 0; j < 8; j++) {
+        double accum = 0;
+        for (i = 0; i < 512; i++) {
+            double reconst = ff_dca_fir_32bands_perfect[i] * ((i & 64) ? (-1) : 1);
+            accum += reconst * cos(2 * M_PI * (i + 0.5 - 256) * (j + 0.5) / 512);
+        }
+        c->band_spectrum_tab[0][j] = (int32_t)(200 * log10(accum));
+    }
+    for (j = 0; j < 8; j++) {
+        double accum = 0;
+        for (i = 0; i < 512; i++) {
+            double reconst = ff_dca_fir_32bands_nonperfect[i] * ((i & 64) ? (-1) : 1);
+            accum += reconst * cos(2 * M_PI * (i + 0.5 - 256) * (j + 0.5) / 512);
+        }
+        c->band_spectrum_tab[1][j] = (int32_t)(200 * log10(accum));
+    }
+
+    return 0;
+}
+
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+    DCAEncContext *c = avctx->priv_data;
+    ff_mdct_end(&c->mdct);
+    subband_bufer_free(c);
+    ff_dcaadpcm_free(&c->adpcm_ctx);
+
+    return 0;
+}
+
+static void subband_transform(DCAEncContext *c, const int32_t *input)
+{
+    int ch, subs, i, k, j;
+
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        /* History is copied because it is also needed for PSY */
+        int32_t hist[512];
+        int hist_start = 0;
+        const int chi = c->channel_order_tab[ch];
+
+        memcpy(hist, &c->history[ch][0], 512 * sizeof(int32_t));
+
+        for (subs = 0; subs < SUBBAND_SAMPLES; subs++) {
+            int32_t accum[64];
+            int32_t resp;
+            int band;
+
+            /* Calculate the convolutions at once */
+            memset(accum, 0, 64 * sizeof(int32_t));
+
+            for (k = 0, i = hist_start, j = 0;
+                    i < 512; k = (k + 1) & 63, i++, j++)
+                accum[k] += mul32(hist[i], c->band_interpolation[j]);
+            for (i = 0; i < hist_start; k = (k + 1) & 63, i++, j++)
+                accum[k] += mul32(hist[i], c->band_interpolation[j]);
+
+            for (k = 16; k < 32; k++)
+                accum[k] = accum[k] - accum[31 - k];
+            for (k = 32; k < 48; k++)
+                accum[k] = accum[k] + accum[95 - k];
+
+            for (band = 0; band < 32; band++) {
+                resp = 0;
+                for (i = 16; i < 48; i++) {
+                    int s = (2 * band + 1) * (2 * (i + 16) + 1);
+                    resp += mul32(accum[i], COS_T(s << 3)) >> 3;
+                }
+
+                c->subband[ch][band][subs] = ((band + 1) & 2) ? -resp : resp;
+            }
+
+            /* Copy in 32 new samples from input */
+            for (i = 0; i < 32; i++)
+                hist[i + hist_start] = input[(subs * 32 + i) * c->channels + chi];
+
+            hist_start = (hist_start + 32) & 511;
+        }
+    }
+}
+
+static void lfe_downsample(DCAEncContext *c, const int32_t *input)
+{
+    /* FIXME: make 128x LFE downsampling possible */
+    const int lfech = lfe_index[c->channel_config];
+    int i, j, lfes;
+    int32_t hist[512];
+    int32_t accum;
+    int hist_start = 0;
+
+    memcpy(hist, &c->history[c->channels - 1][0], 512 * sizeof(int32_t));
+
+    for (lfes = 0; lfes < DCA_LFE_SAMPLES; lfes++) {
+        /* Calculate the convolution */
+        accum = 0;
+
+        for (i = hist_start, j = 0; i < 512; i++, j++)
+            accum += mul32(hist[i], c->lfe_fir_64i[j]);
+        for (i = 0; i < hist_start; i++, j++)
+            accum += mul32(hist[i], c->lfe_fir_64i[j]);
+
+        c->downsampled_lfe[lfes] = accum;
+
+        /* Copy in 64 new samples from input */
+        for (i = 0; i < 64; i++)
+            hist[i + hist_start] = input[(lfes * 64 + i) * c->channels + lfech];
+
+        hist_start = (hist_start + 64) & 511;
+    }
+}
+
+static int32_t get_cb(DCAEncContext *c, int32_t in)
+{
+    int i, res = 0;
+    in = FFABS(in);
+
+    for (i = 1024; i > 0; i >>= 1) {
+        if (c->cb_to_level[i + res] >= in)
+            res += i;
+    }
+    return -res;
+}
+
+static int32_t add_cb(DCAEncContext *c, int32_t a, int32_t b)
+{
+    if (a < b)
+        FFSWAP(int32_t, a, b);
+
+    if (a - b >= 256)
+        return a;
+    return a + c->cb_to_add[a - b];
+}
+
+static void calc_power(DCAEncContext *c,
+                       const int32_t in[2 * 256], int32_t power[256])
+{
+    int i;
+    LOCAL_ALIGNED_32(int32_t, data,  [512]);
+    LOCAL_ALIGNED_32(int32_t, coeff, [256]);
+
+    for (i = 0; i < 512; i++)
+        data[i] = norm__(mul32(in[i], 0x3fffffff - (COS_T(4 * i + 2) >> 1)), 4);
+
+    c->mdct.mdct_calc(&c->mdct, coeff, data);
+    for (i = 0; i < 256; i++) {
+        const int32_t cb = get_cb(c, coeff[i]);
+        power[i] = add_cb(c, cb, cb);
+    }
+}
+
+static void adjust_jnd(DCAEncContext *c,
+                       const int32_t in[512], int32_t out_cb[256])
+{
+    int32_t power[256];
+    int32_t out_cb_unnorm[256];
+    int32_t denom;
+    const int32_t ca_cb = -1114;
+    const int32_t cs_cb = 928;
+    const int samplerate_index = c->samplerate_index;
+    int i, j;
+
+    calc_power(c, in, power);
+
+    for (j = 0; j < 256; j++)
+        out_cb_unnorm[j] = -2047; /* and can only grow */
+
+    for (i = 0; i < AUBANDS; i++) {
+        denom = ca_cb; /* and can only grow */
+        for (j = 0; j < 256; j++)
+            denom = add_cb(c, denom, power[j] + c->auf[samplerate_index][i][j]);
+        for (j = 0; j < 256; j++)
+            out_cb_unnorm[j] = add_cb(c, out_cb_unnorm[j],
+                                      -denom + c->auf[samplerate_index][i][j]);
+    }
+
+    for (j = 0; j < 256; j++)
+        out_cb[j] = add_cb(c, out_cb[j], -out_cb_unnorm[j] - ca_cb - cs_cb);
+}
+
+typedef void (*walk_band_t)(DCAEncContext *c, int band1, int band2, int f,
+                            int32_t spectrum1, int32_t spectrum2, int channel,
+                            int32_t * arg);
+
+static void walk_band_low(DCAEncContext *c, int band, int channel,
+                          walk_band_t walk, int32_t *arg)
+{
+    int f;
+
+    if (band == 0) {
+        for (f = 0; f < 4; f++)
+            walk(c, 0, 0, f, 0, -2047, channel, arg);
+    } else {
+        for (f = 0; f < 8; f++)
+            walk(c, band, band - 1, 8 * band - 4 + f,
+                    c->band_spectrum[7 - f], c->band_spectrum[f], channel, arg);
+    }
+}
+
+static void walk_band_high(DCAEncContext *c, int band, int channel,
+                           walk_band_t walk, int32_t *arg)
+{
+    int f;
+
+    if (band == 31) {
+        for (f = 0; f < 4; f++)
+            walk(c, 31, 31, 256 - 4 + f, 0, -2047, channel, arg);
+    } else {
+        for (f = 0; f < 8; f++)
+            walk(c, band, band + 1, 8 * band + 4 + f,
+                    c->band_spectrum[f], c->band_spectrum[7 - f], channel, arg);
+    }
+}
+
+static void update_band_masking(DCAEncContext *c, int band1, int band2,
+                                int f, int32_t spectrum1, int32_t spectrum2,
+                                int channel, int32_t * arg)
+{
+    int32_t value = c->eff_masking_curve_cb[f] - spectrum1;
+
+    if (value < c->band_masking_cb[band1])
+        c->band_masking_cb[band1] = value;
+}
+
+static void calc_masking(DCAEncContext *c, const int32_t *input)
+{
+    int i, k, band, ch, ssf;
+    int32_t data[512];
+
+    for (i = 0; i < 256; i++)
+        for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+            c->masking_curve_cb[ssf][i] = -2047;
+
+    for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            const int chi = c->channel_order_tab[ch];
+
+            for (i = 0, k = 128 + 256 * ssf; k < 512; i++, k++)
+                data[i] = c->history[ch][k];
+            for (k -= 512; i < 512; i++, k++)
+                data[i] = input[k * c->channels + chi];
+            adjust_jnd(c, data, c->masking_curve_cb[ssf]);
+        }
+    for (i = 0; i < 256; i++) {
+        int32_t m = 2048;
+
+        for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+            if (c->masking_curve_cb[ssf][i] < m)
+                m = c->masking_curve_cb[ssf][i];
+        c->eff_masking_curve_cb[i] = m;
+    }
+
+    for (band = 0; band < 32; band++) {
+        c->band_masking_cb[band] = 2048;
+        walk_band_low(c, band, 0, update_band_masking, NULL);
+        walk_band_high(c, band, 0, update_band_masking, NULL);
+    }
+}
+
+static inline int32_t find_peak(DCAEncContext *c, const int32_t *in, int len)
+{
+    int sample;
+    int32_t m = 0;
+    for (sample = 0; sample < len; sample++) {
+        int32_t s = abs(in[sample]);
+        if (m < s)
+            m = s;
+    }
+    return get_cb(c, m);
+}
+
+static void find_peaks(DCAEncContext *c)
+{
+    int band, ch;
+
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++)
+            c->peak_cb[ch][band] = find_peak(c, c->subband[ch][band],
+                                             SUBBAND_SAMPLES);
+    }
+
+    if (c->lfe_channel)
+        c->lfe_peak_cb = find_peak(c, c->downsampled_lfe, DCA_LFE_SAMPLES);
+}
+
+static void adpcm_analysis(DCAEncContext *c)
+{
+    int ch, band;
+    int pred_vq_id;
+    int32_t *samples;
+    int32_t estimated_diff[SUBBAND_SAMPLES];
+
+    c->consumed_adpcm_bits = 0;
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            samples = c->subband[ch][band] - DCA_ADPCM_COEFFS;
+            pred_vq_id = ff_dcaadpcm_subband_analysis(&c->adpcm_ctx, samples,
+                                                      SUBBAND_SAMPLES, estimated_diff);
+            if (pred_vq_id >= 0) {
+                c->prediction_mode[ch][band] = pred_vq_id;
+                c->consumed_adpcm_bits += 12; //12 bits to transmit prediction vq index
+                c->diff_peak_cb[ch][band] = find_peak(c, estimated_diff, 16);
+            } else {
+                c->prediction_mode[ch][band] = -1;
+            }
+        }
+    }
+}
+
+static const int snr_fudge = 128;
+#define USED_1ABITS 1
+#define USED_26ABITS 4
+
+static inline int32_t get_step_size(DCAEncContext *c, int ch, int band)
+{
+    int32_t step_size;
+
+    if (c->bitrate_index == 3)
+        step_size = ff_dca_lossless_quant[c->abits[ch][band]];
+    else
+        step_size = ff_dca_lossy_quant[c->abits[ch][band]];
+
+    return step_size;
+}
+
+static int calc_one_scale(DCAEncContext *c, int32_t peak_cb, int abits,
+                          softfloat *quant)
+{
+    int32_t peak;
+    int our_nscale, try_remove;
+    softfloat our_quant;
+
+    av_assert0(peak_cb <= 0);
+    av_assert0(peak_cb >= -2047);
+
+    our_nscale = 127;
+    peak = c->cb_to_level[-peak_cb];
+
+    for (try_remove = 64; try_remove > 0; try_remove >>= 1) {
+        if (scalefactor_inv[our_nscale - try_remove].e + stepsize_inv[abits].e <= 17)
+            continue;
+        our_quant.m = mul32(scalefactor_inv[our_nscale - try_remove].m, stepsize_inv[abits].m);
+        our_quant.e = scalefactor_inv[our_nscale - try_remove].e + stepsize_inv[abits].e - 17;
+        if ((ff_dca_quant_levels[abits] - 1) / 2 < quantize_value(peak, our_quant))
+            continue;
+        our_nscale -= try_remove;
+    }
+
+    if (our_nscale >= 125)
+        our_nscale = 124;
+
+    quant->m = mul32(scalefactor_inv[our_nscale].m, stepsize_inv[abits].m);
+    quant->e = scalefactor_inv[our_nscale].e + stepsize_inv[abits].e - 17;
+    av_assert0((ff_dca_quant_levels[abits] - 1) / 2 >= quantize_value(peak, *quant));
+
+    return our_nscale;
+}
+
+static inline void quantize_adpcm_subband(DCAEncContext *c, int ch, int band)
+{
+    int32_t step_size;
+    int32_t diff_peak_cb = c->diff_peak_cb[ch][band];
+    c->scale_factor[ch][band] = calc_one_scale(c, diff_peak_cb,
+                                               c->abits[ch][band],
+                                               &c->quant[ch][band]);
+
+    step_size = get_step_size(c, ch, band);
+    ff_dcaadpcm_do_real(c->prediction_mode[ch][band],
+                        c->quant[ch][band],
+                        ff_dca_scale_factor_quant7[c->scale_factor[ch][band]],
+                        step_size, c->adpcm_history[ch][band], c->subband[ch][band],
+                        c->adpcm_history[ch][band] + 4, c->quantized[ch][band],
+                        SUBBAND_SAMPLES, c->cb_to_level[-diff_peak_cb]);
+}
+
+static void quantize_adpcm(DCAEncContext *c)
+{
+    int band, ch;
+
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < 32; band++)
+            if (c->prediction_mode[ch][band] >= 0)
+                quantize_adpcm_subband(c, ch, band);
+}
+
+static void quantize_pcm(DCAEncContext *c)
+{
+    int sample, band, ch;
+
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            if (c->prediction_mode[ch][band] == -1) {
+                for (sample = 0; sample < SUBBAND_SAMPLES; sample++) {
+                    int32_t val = quantize_value(c->subband[ch][band][sample],
+                                                 c->quant[ch][band]);
+                    c->quantized[ch][band][sample] = val;
+                }
+            }
+        }
+    }
+}
+
+static void accumulate_huff_bit_consumption(int abits, int32_t *quantized,
+                                            uint32_t *result)
+{
+    uint8_t sel, id = abits - 1;
+    for (sel = 0; sel < ff_dca_quant_index_group_size[id]; sel++)
+        result[sel] += ff_dca_vlc_calc_quant_bits(quantized, SUBBAND_SAMPLES,
+                                                  sel, id);
+}
+
+static uint32_t set_best_code(uint32_t vlc_bits[DCA_CODE_BOOKS][7],
+                              uint32_t clc_bits[DCA_CODE_BOOKS],
+                              int32_t res[DCA_CODE_BOOKS])
+{
+    uint8_t i, sel;
+    uint32_t best_sel_bits[DCA_CODE_BOOKS];
+    int32_t best_sel_id[DCA_CODE_BOOKS];
+    uint32_t t, bits = 0;
+
+    for (i = 0; i < DCA_CODE_BOOKS; i++) {
+
+        av_assert0(!((!!vlc_bits[i][0]) ^ (!!clc_bits[i])));
+        if (vlc_bits[i][0] == 0) {
+            /* do not transmit adjustment index for empty codebooks */
+            res[i] = ff_dca_quant_index_group_size[i];
+            /* and skip it */
+            continue;
+        }
+
+        best_sel_bits[i] = vlc_bits[i][0];
+        best_sel_id[i] = 0;
+        for (sel = 0; sel < ff_dca_quant_index_group_size[i]; sel++) {
+            if (best_sel_bits[i] > vlc_bits[i][sel] && vlc_bits[i][sel]) {
+                best_sel_bits[i] = vlc_bits[i][sel];
+                best_sel_id[i] = sel;
+            }
+        }
+
+        /* 2 bits to transmit scale factor adjustment index */
+        t = best_sel_bits[i] + 2;
+        if (t < clc_bits[i]) {
+            res[i] = best_sel_id[i];
+            bits += t;
+        } else {
+            res[i] = ff_dca_quant_index_group_size[i];
+            bits += clc_bits[i];
+        }
+    }
+    return bits;
+}
+
+static uint32_t set_best_abits_code(int abits[DCAENC_SUBBANDS], int bands,
+                                    int32_t *res)
+{
+    uint8_t i;
+    uint32_t t;
+    int32_t best_sel = 6;
+    int32_t best_bits = bands * 5;
+
+    /* Check do we have subband which cannot be encoded by Huffman tables */
+    for (i = 0; i < bands; i++) {
+        if (abits[i] > 12 || abits[i] == 0) {
+            *res = best_sel;
+            return best_bits;
+        }
+    }
+
+    for (i = 0; i < DCA_BITALLOC_12_COUNT; i++) {
+        t = ff_dca_vlc_calc_alloc_bits(abits, bands, i);
+        if (t < best_bits) {
+            best_bits = t;
+            best_sel = i;
+        }
+    }
+
+    *res = best_sel;
+    return best_bits;
+}
+
+static int init_quantization_noise(DCAEncContext *c, int noise, int forbid_zero)
+{
+    int ch, band, ret = USED_26ABITS | USED_1ABITS;
+    uint32_t huff_bit_count_accum[MAX_CHANNELS][DCA_CODE_BOOKS][7];
+    uint32_t clc_bit_count_accum[MAX_CHANNELS][DCA_CODE_BOOKS];
+    uint32_t bits_counter = 0;
+
+    c->consumed_bits = 132 + 333 * c->fullband_channels;
+    c->consumed_bits += c->consumed_adpcm_bits;
+    if (c->lfe_channel)
+        c->consumed_bits += 72;
+
+    /* attempt to guess the bit distribution based on the prevoius frame */
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            int snr_cb = c->peak_cb[ch][band] - c->band_masking_cb[band] - noise;
+
+            if (snr_cb >= 1312) {
+                c->abits[ch][band] = 26;
+                ret &= ~USED_1ABITS;
+            } else if (snr_cb >= 222) {
+                c->abits[ch][band] = 8 + mul32(snr_cb - 222, 69000000);
+                ret &= ~(USED_26ABITS | USED_1ABITS);
+            } else if (snr_cb >= 0) {
+                c->abits[ch][band] = 2 + mul32(snr_cb, 106000000);
+                ret &= ~(USED_26ABITS | USED_1ABITS);
+            } else if (forbid_zero || snr_cb >= -140) {
+                c->abits[ch][band] = 1;
+                ret &= ~USED_26ABITS;
+            } else {
+                c->abits[ch][band] = 0;
+                ret &= ~(USED_26ABITS | USED_1ABITS);
+            }
+        }
+        c->consumed_bits += set_best_abits_code(c->abits[ch], 32,
+                                                &c->bit_allocation_sel[ch]);
+    }
+
+    /* Recalc scale_factor each time to get bits consumption in case of Huffman coding.
+       It is suboptimal solution */
+    /* TODO: May be cache scaled values */
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            if (c->prediction_mode[ch][band] == -1) {
+                c->scale_factor[ch][band] = calc_one_scale(c, c->peak_cb[ch][band],
+                                                           c->abits[ch][band],
+                                                           &c->quant[ch][band]);
+            }
+        }
+    }
+    quantize_adpcm(c);
+    quantize_pcm(c);
+
+    memset(huff_bit_count_accum, 0, MAX_CHANNELS * DCA_CODE_BOOKS * 7 * sizeof(uint32_t));
+    memset(clc_bit_count_accum, 0, MAX_CHANNELS * DCA_CODE_BOOKS * sizeof(uint32_t));
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            if (c->abits[ch][band] && c->abits[ch][band] <= DCA_CODE_BOOKS) {
+                accumulate_huff_bit_consumption(c->abits[ch][band],
+                                                c->quantized[ch][band],
+                                                huff_bit_count_accum[ch][c->abits[ch][band] - 1]);
+                clc_bit_count_accum[ch][c->abits[ch][band] - 1] += bit_consumption[c->abits[ch][band]];
+            } else {
+                bits_counter += bit_consumption[c->abits[ch][band]];
+            }
+        }
+    }
+
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        bits_counter += set_best_code(huff_bit_count_accum[ch],
+                                      clc_bit_count_accum[ch],
+                                      c->quant_index_sel[ch]);
+    }
+
+    c->consumed_bits += bits_counter;
+
+    return ret;
+}
+
+static void assign_bits(DCAEncContext *c)
+{
+    /* Find the bounds where the binary search should work */
+    int low, high, down;
+    int used_abits = 0;
+    int forbid_zero = 1;
+restart:
+    init_quantization_noise(c, c->worst_quantization_noise, forbid_zero);
+    low = high = c->worst_quantization_noise;
+    if (c->consumed_bits > c->frame_bits) {
+        while (c->consumed_bits > c->frame_bits) {
+            if (used_abits == USED_1ABITS && forbid_zero) {
+                forbid_zero = 0;
+                goto restart;
+            }
+            low = high;
+            high += snr_fudge;
+            used_abits = init_quantization_noise(c, high, forbid_zero);
+        }
+    } else {
+        while (c->consumed_bits <= c->frame_bits) {
+            high = low;
+            if (used_abits == USED_26ABITS)
+                goto out; /* The requested bitrate is too high, pad with zeros */
+            low -= snr_fudge;
+            used_abits = init_quantization_noise(c, low, forbid_zero);
+        }
+    }
+
+    /* Now do a binary search between low and high to see what fits */
+    for (down = snr_fudge >> 1; down; down >>= 1) {
+        init_quantization_noise(c, high - down, forbid_zero);
+        if (c->consumed_bits <= c->frame_bits)
+            high -= down;
+    }
+    init_quantization_noise(c, high, forbid_zero);
+out:
+    c->worst_quantization_noise = high;
+    if (high > c->worst_noise_ever)
+        c->worst_noise_ever = high;
+}
+
+static void shift_history(DCAEncContext *c, const int32_t *input)
+{
+    int k, ch;
+
+    for (k = 0; k < 512; k++)
+        for (ch = 0; ch < c->channels; ch++) {
+            const int chi = c->channel_order_tab[ch];
+
+            c->history[ch][k] = input[k * c->channels + chi];
+        }
+}
+
+static void fill_in_adpcm_bufer(DCAEncContext *c)
+{
+     int ch, band;
+     int32_t step_size;
+     /* We fill in ADPCM work buffer for subbands which hasn't been ADPCM coded
+      * in current frame - we need this data if subband of next frame is
+      * ADPCM
+      */
+     for (ch = 0; ch < c->channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            int32_t *samples = c->subband[ch][band] - DCA_ADPCM_COEFFS;
+            if (c->prediction_mode[ch][band] == -1) {
+                step_size = get_step_size(c, ch, band);
+
+                ff_dca_core_dequantize(c->adpcm_history[ch][band],
+                                       c->quantized[ch][band]+12, step_size,
+                                       ff_dca_scale_factor_quant7[c->scale_factor[ch][band]], 0, 4);
+            } else {
+                AV_COPY128U(c->adpcm_history[ch][band], c->adpcm_history[ch][band]+4);
+            }
+            /* Copy dequantized values for LPC analysis.
+             * It reduces artifacts in case of extreme quantization,
+             * example: in current frame abits is 1 and has no prediction flag,
+             * but end of this frame is sine like signal. In this case, if LPC analysis uses
+             * original values, likely LPC analysis returns good prediction gain, and sets prediction flag.
+             * But there are no proper value in decoder history, so likely result will be no good.
+             * Bitstream has "Predictor history flag switch", but this flag disables history for all subbands
+             */
+            samples[0] = c->adpcm_history[ch][band][0] << 7;
+            samples[1] = c->adpcm_history[ch][band][1] << 7;
+            samples[2] = c->adpcm_history[ch][band][2] << 7;
+            samples[3] = c->adpcm_history[ch][band][3] << 7;
+        }
+     }
+}
+
+static void calc_lfe_scales(DCAEncContext *c)
+{
+    if (c->lfe_channel)
+        c->lfe_scale_factor = calc_one_scale(c, c->lfe_peak_cb, 11, &c->lfe_quant);
+}
+
+static void put_frame_header(DCAEncContext *c)
+{
+    /* SYNC */
+    put_bits(&c->pb, 16, 0x7ffe);
+    put_bits(&c->pb, 16, 0x8001);
+
+    /* Frame type: normal */
+    put_bits(&c->pb, 1, 1);
+
+    /* Deficit sample count: none */
+    put_bits(&c->pb, 5, 31);
+
+    /* CRC is not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Number of PCM sample blocks */
+    put_bits(&c->pb, 7, SUBBAND_SAMPLES - 1);
+
+    /* Primary frame byte size */
+    put_bits(&c->pb, 14, c->frame_size - 1);
+
+    /* Audio channel arrangement */
+    put_bits(&c->pb, 6, c->channel_config);
+
+    /* Core audio sampling frequency */
+    put_bits(&c->pb, 4, bitstream_sfreq[c->samplerate_index]);
+
+    /* Transmission bit rate */
+    put_bits(&c->pb, 5, c->bitrate_index);
+
+    /* Embedded down mix: disabled */
+    put_bits(&c->pb, 1, 0);
+
+    /* Embedded dynamic range flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Embedded time stamp flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Auxiliary data flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* HDCD source: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Extension audio ID: N/A */
+    put_bits(&c->pb, 3, 0);
+
+    /* Extended audio data: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Audio sync word insertion flag: after each sub-frame */
+    put_bits(&c->pb, 1, 0);
+
+    /* Low frequency effects flag: not present or 64x subsampling */
+    put_bits(&c->pb, 2, c->lfe_channel ? 2 : 0);
+
+    /* Predictor history switch flag: on */
+    put_bits(&c->pb, 1, 1);
+
+    /* No CRC */
+    /* Multirate interpolator switch: non-perfect reconstruction */
+    put_bits(&c->pb, 1, 0);
+
+    /* Encoder software revision: 7 */
+    put_bits(&c->pb, 4, 7);
+
+    /* Copy history: 0 */
+    put_bits(&c->pb, 2, 0);
+
+    /* Source PCM resolution: 16 bits, not DTS ES */
+    put_bits(&c->pb, 3, 0);
+
+    /* Front sum/difference coding: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Surrounds sum/difference coding: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Dialog normalization: 0 dB */
+    put_bits(&c->pb, 4, 0);
+}
+
+static void put_primary_audio_header(DCAEncContext *c)
+{
+    int ch, i;
+    /* Number of subframes */
+    put_bits(&c->pb, 4, SUBFRAMES - 1);
+
+    /* Number of primary audio channels */
+    put_bits(&c->pb, 3, c->fullband_channels - 1);
+
+    /* Subband activity count */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 5, DCAENC_SUBBANDS - 2);
+
+    /* High frequency VQ start subband */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 5, DCAENC_SUBBANDS - 1);
+
+    /* Joint intensity coding index: 0, 0 */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 0);
+
+    /* Transient mode codebook: A4, A4 (arbitrary) */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 2, 0);
+
+    /* Scale factor code book: 7 bit linear, 7-bit sqrt table (for each channel) */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 6);
+
+    /* Bit allocation quantizer select: linear 5-bit */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, c->bit_allocation_sel[ch]);
+
+    /* Quantization index codebook select */
+    for (i = 0; i < DCA_CODE_BOOKS; i++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            put_bits(&c->pb, ff_dca_quant_index_sel_nbits[i], c->quant_index_sel[ch][i]);
+
+    /* Scale factor adjustment index: transmitted in case of Huffman coding */
+    for (i = 0; i < DCA_CODE_BOOKS; i++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            if (c->quant_index_sel[ch][i] < ff_dca_quant_index_group_size[i])
+                put_bits(&c->pb, 2, 0);
+
+    /* Audio header CRC check word: not transmitted */
+}
+
+static void put_subframe_samples(DCAEncContext *c, int ss, int band, int ch)
+{
+    int i, j, sum, bits, sel;
+    if (c->abits[ch][band] <= DCA_CODE_BOOKS) {
+        av_assert0(c->abits[ch][band] > 0);
+        sel = c->quant_index_sel[ch][c->abits[ch][band] - 1];
+        // Huffman codes
+        if (sel < ff_dca_quant_index_group_size[c->abits[ch][band] - 1]) {
+            ff_dca_vlc_enc_quant(&c->pb, &c->quantized[ch][band][ss * 8], 8,
+                                 sel, c->abits[ch][band] - 1);
+            return;
+        }
+
+        // Block codes
+        if (c->abits[ch][band] <= 7) {
+            for (i = 0; i < 8; i += 4) {
+                sum = 0;
+                for (j = 3; j >= 0; j--) {
+                    sum *= ff_dca_quant_levels[c->abits[ch][band]];
+                    sum += c->quantized[ch][band][ss * 8 + i + j];
+                    sum += (ff_dca_quant_levels[c->abits[ch][band]] - 1) / 2;
+                }
+                put_bits(&c->pb, bit_consumption[c->abits[ch][band]] / 4, sum);
+            }
+            return;
+        }
+    }
+
+    for (i = 0; i < 8; i++) {
+        bits = bit_consumption[c->abits[ch][band]] / 16;
+        put_sbits(&c->pb, bits, c->quantized[ch][band][ss * 8 + i]);
+    }
+}
+
+static void put_subframe(DCAEncContext *c, int subframe)
+{
+    int i, band, ss, ch;
+
+    /* Subsubframes count */
+    put_bits(&c->pb, 2, SUBSUBFRAMES -1);
+
+    /* Partial subsubframe sample count: dummy */
+    put_bits(&c->pb, 3, 0);
+
+    /* Prediction mode: no ADPCM, in each channel and subband */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < DCAENC_SUBBANDS; band++)
+            put_bits(&c->pb, 1, !(c->prediction_mode[ch][band] == -1));
+
+    /* Prediction VQ address */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < DCAENC_SUBBANDS; band++)
+            if (c->prediction_mode[ch][band] >= 0)
+                put_bits(&c->pb, 12, c->prediction_mode[ch][band]);
+
+    /* Bit allocation index */
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        if (c->bit_allocation_sel[ch] == 6) {
+            for (band = 0; band < DCAENC_SUBBANDS; band++) {
+                put_bits(&c->pb, 5, c->abits[ch][band]);
+            }
+        } else {
+            ff_dca_vlc_enc_alloc(&c->pb, c->abits[ch], DCAENC_SUBBANDS,
+                                 c->bit_allocation_sel[ch]);
+        }
+    }
+
+    if (SUBSUBFRAMES > 1) {
+        /* Transition mode: none for each channel and subband */
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            for (band = 0; band < DCAENC_SUBBANDS; band++)
+                if (c->abits[ch][band])
+                    put_bits(&c->pb, 1, 0); /* codebook A4 */
+    }
+
+    /* Scale factors */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < DCAENC_SUBBANDS; band++)
+            if (c->abits[ch][band])
+                put_bits(&c->pb, 7, c->scale_factor[ch][band]);
+
+    /* Joint subband scale factor codebook select: not transmitted */
+    /* Scale factors for joint subband coding: not transmitted */
+    /* Stereo down-mix coefficients: not transmitted */
+    /* Dynamic range coefficient: not transmitted */
+    /* Stde information CRC check word: not transmitted */
+    /* VQ encoded high frequency subbands: not transmitted */
+
+    /* LFE data: 8 samples and scalefactor */
+    if (c->lfe_channel) {
+        for (i = 0; i < DCA_LFE_SAMPLES; i++)
+            put_bits(&c->pb, 8, quantize_value(c->downsampled_lfe[i], c->lfe_quant) & 0xff);
+        put_bits(&c->pb, 8, c->lfe_scale_factor);
+    }
+
+    /* Audio data (subsubframes) */
+    for (ss = 0; ss < SUBSUBFRAMES ; ss++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            for (band = 0; band < DCAENC_SUBBANDS; band++)
+                if (c->abits[ch][band])
+                    put_subframe_samples(c, ss, band, ch);
+
+    /* DSYNC */
+    put_bits(&c->pb, 16, 0xffff);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                        const AVFrame *frame, int *got_packet_ptr)
+{
+    DCAEncContext *c = avctx->priv_data;
+    const int32_t *samples;
+    int ret, i;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, c->frame_size, 0)) < 0)
+        return ret;
+
+    samples = (const int32_t *)frame->data[0];
+
+    subband_transform(c, samples);
+    if (c->lfe_channel)
+        lfe_downsample(c, samples);
+
+    calc_masking(c, samples);
+    if (c->options.adpcm_mode)
+        adpcm_analysis(c);
+    find_peaks(c);
+    assign_bits(c);
+    calc_lfe_scales(c);
+    shift_history(c, samples);
+
+    init_put_bits(&c->pb, avpkt->data, avpkt->size);
+    fill_in_adpcm_bufer(c);
+    put_frame_header(c);
+    put_primary_audio_header(c);
+    for (i = 0; i < SUBFRAMES; i++)
+        put_subframe(c, i);
+
+
+    for (i = put_bits_count(&c->pb); i < 8*c->frame_size; i++)
+        put_bits(&c->pb, 1, 0);
+
+    flush_put_bits(&c->pb);
+
+    avpkt->pts      = frame->pts;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    avpkt->size     = put_bits_count(&c->pb) >> 3;
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+#define DCAENC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+
+static const AVOption options[] = {
+    { "dca_adpcm", "Use ADPCM encoding", offsetof(DCAEncContext, options.adpcm_mode), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, DCAENC_FLAGS },
+    { NULL },
+};
+
+static const AVClass dcaenc_class = {
+    .class_name = "DCA (DTS Coherent Acoustics)",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVCodecDefault defaults[] = {
+    { "b",          "1411200" },
+    { NULL },
+};
+
+AVCodec ff_dca_encoder = {
+    .name                  = "dca",
+    .long_name             = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_DTS,
+    .priv_data_size        = sizeof(DCAEncContext),
+    .init                  = encode_init,
+    .close                 = encode_close,
+    .encode2               = encode_frame,
+    .capabilities          = AV_CODEC_CAP_EXPERIMENTAL,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S32,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = sample_rates,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO,
+                                                  AV_CH_LAYOUT_2_2,
+                                                  AV_CH_LAYOUT_5POINT0,
+                                                  AV_CH_LAYOUT_5POINT1,
+                                                  0 },
+    .defaults              = defaults,
+    .priv_class            = &dcaenc_class,
+};
diff --git a/libavcodec/dcaenc.h b/libavcodec/dcaenc.h
new file mode 100644
index 0000000..63fdaf0
--- /dev/null
+++ b/libavcodec/dcaenc.h
@@ -0,0 +1,158 @@
+/*
+ * DCA encoder tables
+ * Copyright (C) 2008-2012 Alexander E. Patrakov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCAENC_H
+#define AVCODEC_DCAENC_H
+
+#include <stdint.h>
+
+#include "dcamath.h"
+
+typedef struct {
+    int32_t m;
+    int32_t e;
+} softfloat;
+
+static const int sample_rates[] = {
+    8000, 16000, 32000, 11025, 22050, 44100, 12000, 24000, 48000, 0,
+};
+
+static const uint8_t bitstream_sfreq[] = { 1, 2, 3, 6, 7, 8, 11, 12, 13 };
+
+/* Auditory filter center frequencies and bandwidths, in Hz.
+ * The last two are made up, because there is no scientific data.
+ */
+static const uint16_t fc[] = {
+    50, 150, 250, 350, 450, 570, 700, 840, 1000, 1170, 1370, 1600, 1850, 2150,
+    2500, 2900, 3400, 4000, 4800, 5800, 7000, 8500, 10500, 13500, 17000
+};
+
+static const uint16_t erb[] = {
+    80, 100, 100, 100, 110, 120, 140, 150, 160, 190, 210, 240, 280,
+    320, 380, 450, 550, 700, 900, 1100, 1300, 1800, 2500, 3500, 4500
+};
+
+static const softfloat stepsize_inv[27] = {
+    {0, 0}, {1342177360, 21}, {2147483647, 21}, {1342177360, 20},
+    {1819901661, 20}, {2147483647, 20}, {1278263843, 19}, {1579032492, 19},
+    {1412817763, 18}, {1220162327, 17}, {1118482133, 16}, {1917391412, 16},
+    {1766017772, 15}, {1525212826, 14}, {1290553940, 13}, {2097179000, 13},
+    {1677683200, 12}, {1497972244, 11}, {1310893147, 10}, {1165354136, 9},
+    {1748031204, 9}, {1542092044, 8}, {1636178017, 7}, {1636178017, 6},
+    {1636178017, 5}, {1636178017, 4}, {1636178017, 3},
+};
+
+static const softfloat scalefactor_inv[128] = {
+    {2147483647, 1}, {2147483647, 1}, {2147483647, 2}, {2147483647, 2},
+    {2147483647, 2}, {2147483647, 2}, {1431655765, 2}, {1431655765, 2},
+    {1431655765, 2}, {2147483647, 3}, {2147483647, 3}, {1717986918, 3},
+    {1431655765, 3}, {1227133513, 3}, {1227133513, 3}, {2147483647, 4},
+    {1717986918, 4}, {1561806289, 4}, {1431655765, 4}, {1227133513, 4},
+    {2147483647, 5}, {1908874353, 5}, {1717986918, 5}, {1493901668, 5},
+    {1321528398, 5}, {1145324612, 5}, {2021161080, 6}, {1808407282, 6},
+    {1561806289, 6}, {1374389534, 6}, {1227133513, 6}, {2147483647, 7},
+    {1908874353, 7}, {1676084798, 7}, {1477838209, 7}, {1296593900, 7},
+    {1145324612, 7}, {2021161080, 8}, {1773405851, 8}, {1561806289, 8},
+    {1374389534, 8}, {1216273924, 8}, {2139127680, 9}, {1882725390, 9},
+    {1660893697, 9}, {1462116526, 9}, {1287484341, 9}, {1135859119, 9},
+    {1999112050, 10}, {1762037865, 10}, {1552982525, 10}, {1367551775, 10},
+    {1205604855, 10}, {2124660150, 11}, {1871509153, 11}, {1648443220, 11},
+    {1452459217, 11}, {1279990253, 11}, {1127704233, 11}, {1987368509, 12},
+    {1750814693, 12}, {1542632939, 12}, {1359099663, 12}, {1197398995, 12},
+    {2109880792, 13}, {1858853132, 13}, {1638006149, 13}, {1443165385, 13},
+    {1271479187, 13}, {1120235993, 13}, {1973767086, 14}, {1739045674, 14},
+    {1532153461, 14}, {1349922194, 14}, {1189384493, 14}, {2095804865, 15},
+    {1846464029, 15}, {1626872524, 15}, {1433347133, 15}, {1262853884, 15},
+    {1112619678, 15}, {1960569045, 16}, {1727349015, 16}, {1521881227, 16},
+    {1340842289, 16}, {1181357555, 16}, {2081669156, 17}, {1834047752, 17},
+    {1615889229, 17}, {1423675973, 17}, {1254322457, 17}, {1105123583, 17},
+    {1947330755, 18}, {1715693602, 18}, {1511607799, 18}, {1331801790, 18},
+    {1173384427, 18}, {2067616532, 19}, {1821667648, 19}, {1604980024, 19},
+    {1414066955, 19}, {1245861410, 19}, {1097665748, 19}, {1934193616, 20},
+    {1704119624, 20}, {1501412075, 20}, {1322817107, 20}, {1165466323, 20},
+    {2053666205, 21}, {1809379407, 21}, {1594151671, 21}, {1404526328, 21},
+    {1237455941, 21}, {1090259329, 21}, {1921143210, 22}, {1692621231, 22},
+    {1491281857, 22}, {1313892269, 22}, {1157603482, 22}, {2039810470, 23},
+    {1797172644, 23}, {1583396912, 23}, {1395050052, 23}, {1229107276, 23},
+    {1082903494, 23}, {1082903494, 23}, {1082903494, 23}, {1082903494, 23},
+};
+
+/* manually derived from
+ * Table B.5: Selection of quantization levels and codebooks
+ */
+static const int bit_consumption[27] = {
+    -8, 28, 40, 48, 52, 60, 68, 76, 80, 96,
+    112, 128, 144, 160, 176, 192, 208, 224, 240, 256,
+    272, 288, 304, 320, 336, 352, 368,
+};
+
+static const int8_t lfe_index[16] = {
+    1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3
+};
+
+static const int8_t channel_reorder_lfe[16][9] = {
+    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  4, -1, -1, -1, -1, -1 },
+    { 0,  1,  3,  4, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  4,  5, -1, -1, -1, -1 },
+    { 3,  4,  0,  1,  5,  6, -1, -1, -1 },
+    { 2,  0,  1,  4,  5,  6, -1, -1, -1 },
+    { 0,  6,  4,  5,  2,  3, -1, -1, -1 },
+    { 4,  2,  5,  0,  1,  6,  7, -1, -1 },
+    { 5,  6,  0,  1,  7,  3,  8,  4, -1 },
+    { 4,  2,  5,  0,  1,  6,  8,  7, -1 },
+};
+
+static const int8_t channel_reorder_nolfe[16][9] = {
+    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  3, -1, -1, -1, -1, -1 },
+    { 0,  1,  2,  3, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  3,  4, -1, -1, -1, -1 },
+    { 2,  3,  0,  1,  4,  5, -1, -1, -1 },
+    { 2,  0,  1,  3,  4,  5, -1, -1, -1 },
+    { 0,  5,  3,  4,  1,  2, -1, -1, -1 },
+    { 3,  2,  4,  0,  1,  5,  6, -1, -1 },
+    { 4,  5,  0,  1,  6,  2,  7,  3, -1 },
+    { 3,  2,  4,  0,  1,  5,  7,  6, -1 },
+};
+
+static inline int32_t quantize_value(int32_t value, softfloat quant)
+{
+    int32_t offset = 1 << (quant.e - 1);
+
+    value = mul32(value, quant.m) + offset;
+    value = value >> quant.e;
+    return value;
+}
+
+#endif /* AVCODEC_DCAENC_H */
diff --git a/libavcodec/dcahuff.c b/libavcodec/dcahuff.c
new file mode 100644
index 0000000..0a3eeb4
--- /dev/null
+++ b/libavcodec/dcahuff.c
@@ -0,0 +1,1378 @@
+/*
+ * DCA compatible decoder - huffman tables
+ * Copyright (C) 2004 Gildas Bazin
+ * Copyright (C) 2007 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "dcahuff.h"
+
+#define TMODE_COUNT 4
+static const uint8_t tmode_vlc_bits[TMODE_COUNT] = { 3, 3, 3, 2 };
+static const uint16_t tmode_codes[TMODE_COUNT][4] = {
+    { 0x0000, 0x0002, 0x0006, 0x0007 },
+    { 0x0002, 0x0006, 0x0007, 0x0000 },
+    { 0x0006, 0x0007, 0x0000, 0x0002 },
+    { 0x0000, 0x0001, 0x0002, 0x0003 }
+};
+
+static const uint8_t tmode_bits[TMODE_COUNT][4] = {
+    { 1, 2, 3, 3 },
+    { 2, 3, 3, 1 },
+    { 3, 3, 1, 2 },
+    { 2, 2, 2, 2 }
+};
+
+#define BITALLOC_12_VLC_BITS 9
+static const uint8_t bitalloc_12_vlc_bits[DCA_BITALLOC_12_COUNT] = {
+    9, 7, 7, 9, 9
+};
+
+static const uint16_t bitalloc_12_codes[DCA_BITALLOC_12_COUNT][12] = {
+    { 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E, 0x00FF, 0x00FE,
+      0x01FB, 0x01FA, 0x01F9, 0x01F8, },
+    { 0x0001, 0x0000, 0x0002, 0x000F, 0x000C, 0x001D, 0x0039, 0x0038,
+      0x0037, 0x0036, 0x0035, 0x0034, },
+    { 0x0000, 0x0007, 0x0005, 0x0004, 0x0002, 0x000D, 0x000C, 0x0006,
+      0x000F, 0x001D, 0x0039, 0x0038, },
+    { 0x0003, 0x0002, 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E,
+      0x007E, 0x00FE, 0x01FF, 0x01FE, },
+    { 0x0001, 0x0000, 0x0002, 0x0006, 0x000E, 0x003F, 0x003D, 0x007C,
+      0x0079, 0x0078, 0x00FB, 0x00FA, }
+};
+
+static const uint8_t bitalloc_12_bits[DCA_BITALLOC_12_COUNT][12] = {
+    { 1, 2, 3, 4, 5, 6, 8, 8, 9, 9,  9,  9 },
+    { 1, 2, 3, 5, 5, 6, 7, 7, 7, 7,  7,  7 },
+    { 2, 3, 3, 3, 3, 4, 4, 4, 5, 6,  7,  7 },
+    { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10 },
+    { 1, 2, 3, 4, 5, 7, 7, 8, 8, 8,  9,  9 }
+};
+
+#define SCALES_COUNT    5
+#define SCALES_VLC_BITS 9
+static const uint16_t scales_codes[SCALES_COUNT][129] = {
+    { 0x3AB0, 0x3AB2, 0x3AB4, 0x3AB6, 0x3AB8, 0x3ABA, 0x3ABC, 0x3ABE,
+      0x3AC0, 0x3AC2, 0x3AC4, 0x3AC6, 0x3AC8, 0x3ACA, 0x3ACC, 0x3ACE,
+      0x3AD0, 0x3AD2, 0x3AD4, 0x3AD6, 0x3AD8, 0x3ADA, 0x3ADC, 0x3ADE,
+      0x3AE0, 0x3AE2, 0x3AE4, 0x3AE6, 0x3AE8, 0x3AEA, 0x3AEC, 0x3AEE,
+      0x3AF0, 0x3AF2, 0x3AF4, 0x3AF6, 0x3AF8, 0x3AFA, 0x3AFC, 0x3AFE,
+      0x0540, 0x0542, 0x0544, 0x0546, 0x0548, 0x054A, 0x054C, 0x054E,
+      0x0558, 0x055E, 0x02AD, 0x0154, 0x0754, 0x03A8, 0x0056, 0x0028,
+      0x00E8, 0x004A, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
+      0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x004B,
+      0x00E9, 0x0029, 0x0057, 0x03A9, 0x0755, 0x0155, 0x02AE, 0x055F,
+      0x0559, 0x054F, 0x054D, 0x054B, 0x0549, 0x0547, 0x0545, 0x0543,
+      0x0541, 0x3AFF, 0x3AFD, 0x3AFB, 0x3AF9, 0x3AF7, 0x3AF5, 0x3AF3,
+      0x3AF1, 0x3AEF, 0x3AED, 0x3AEB, 0x3AE9, 0x3AE7, 0x3AE5, 0x3AE3,
+      0x3AE1, 0x3ADF, 0x3ADD, 0x3ADB, 0x3AD9, 0x3AD7, 0x3AD5, 0x3AD3,
+      0x3AD1, 0x3ACF, 0x3ACD, 0x3ACB, 0x3AC9, 0x3AC7, 0x3AC5, 0x3AC3,
+      0x3AC1, 0x3ABF, 0x3ABD, 0x3ABB, 0x3AB9, 0x3AB7, 0x3AB5, 0x3AB3,
+      0x3AB1, },
+    { 0x0F60, 0x0F62, 0x0F64, 0x0F66, 0x0F68, 0x0F6A, 0x0F6C, 0x0F6E,
+      0x0F70, 0x0F72, 0x0F74, 0x0F76, 0x0F78, 0x0F7A, 0x0F7C, 0x0F7E,
+      0x0F80, 0x0F82, 0x0F84, 0x0F86, 0x0F88, 0x0F8A, 0x0F8C, 0x0F8E,
+      0x0F90, 0x0F92, 0x0F94, 0x0F96, 0x0F98, 0x0F9A, 0x0F9C, 0x0F9E,
+      0x0FA0, 0x0FA2, 0x0FA4, 0x0FA6, 0x0FA8, 0x0FAA, 0x0FAC, 0x0FAE,
+      0x0FB0, 0x0FB2, 0x0FB4, 0x0FB6, 0x0FB8, 0x0FBA, 0x0FBC, 0x0FBE,
+      0x07A0, 0x07A2, 0x03D2, 0x01EA, 0x00FC, 0x007F, 0x001C, 0x000C,
+      0x0004, 0x0034, 0x0010, 0x001B, 0x0009, 0x000B, 0x000E, 0x0001,
+      0x0003, 0x0002, 0x000F, 0x000C, 0x000A, 0x0000, 0x0011, 0x0035,
+      0x0005, 0x000D, 0x001D, 0x003C, 0x00FD, 0x01EB, 0x03D3, 0x07A3,
+      0x07A1, 0x0FBF, 0x0FBD, 0x0FBB, 0x0FB9, 0x0FB7, 0x0FB5, 0x0FB3,
+      0x0FB1, 0x0FAF, 0x0FAD, 0x0FAB, 0x0FA9, 0x0FA7, 0x0FA5, 0x0FA3,
+      0x0FA1, 0x0F9F, 0x0F9D, 0x0F9B, 0x0F99, 0x0F97, 0x0F95, 0x0F93,
+      0x0F91, 0x0F8F, 0x0F8D, 0x0F8B, 0x0F89, 0x0F87, 0x0F85, 0x0F83,
+      0x0F81, 0x0F7F, 0x0F7D, 0x0F7B, 0x0F79, 0x0F77, 0x0F75, 0x0F73,
+      0x0F71, 0x0F6F, 0x0F6D, 0x0F6B, 0x0F69, 0x0F67, 0x0F65, 0x0F63,
+      0x0F61, },
+    { 0x51D0, 0x51D2, 0x51D4, 0x51D6, 0x51D8, 0x51DA, 0x51DC, 0x51DE,
+      0x51E0, 0x51E2, 0x51E4, 0x51E6, 0x51E8, 0x51EA, 0x51EC, 0x51EE,
+      0x51F0, 0x51F2, 0x51F4, 0x51F6, 0x51F8, 0x51FA, 0x51FC, 0x51FE,
+      0x70C0, 0x70C2, 0x70C4, 0x70C6, 0x70C8, 0x70CA, 0x70CC, 0x70CE,
+      0x70EC, 0x10EA, 0x3868, 0x3877, 0x0876, 0x1C35, 0x0434, 0x0A34,
+      0x0E1B, 0x021B, 0x051B, 0x070F, 0x010F, 0x0380, 0x0080, 0x0140,
+      0x01C1, 0x0041, 0x00A1, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
+      0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
+      0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
+      0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A2, 0x0042,
+      0x01C2, 0x0141, 0x0081, 0x0381, 0x028C, 0x010C, 0x051C, 0x021C,
+      0x0E1C, 0x0A35, 0x0435, 0x1C3A, 0x0877, 0x0874, 0x3869, 0x10EB,
+      0x70ED, 0x70CF, 0x70CD, 0x70CB, 0x70C9, 0x70C7, 0x70C5, 0x70C3,
+      0x70C1, 0x51FF, 0x51FD, 0x51FB, 0x51F9, 0x51F7, 0x51F5, 0x51F3,
+      0x51F1, 0x51EF, 0x51ED, 0x51EB, 0x51E9, 0x51E7, 0x51E5, 0x51E3,
+      0x51E1, 0x51DF, 0x51DD, 0x51DB, 0x51D9, 0x51D7, 0x51D5, 0x51D3,
+      0x51D1, },
+    { 0x6F64, 0x6F66, 0x6F68, 0x6F6A, 0x6F6C, 0x6F6E, 0x6F70, 0x6F72,
+      0x6F74, 0x6F76, 0x6F78, 0x6F7A, 0x6F7C, 0x6F7E, 0x6F80, 0x6F82,
+      0x6F84, 0x6F86, 0x6F88, 0x6F8A, 0x6F8C, 0x6F8E, 0x6F90, 0x6F92,
+      0x6F94, 0x6F96, 0x6F98, 0x6F9A, 0x6F9C, 0x6F9E, 0x6FA0, 0x6FA2,
+      0x6FA4, 0x6FA6, 0x6FA8, 0x6FAA, 0x6FAC, 0x6FAE, 0x6FB0, 0x6FB2,
+      0x6FB4, 0x6FB6, 0x17B4, 0x37DC, 0x0BDB, 0x1BEF, 0x05EE, 0x0DF8,
+      0x02F8, 0x06FD, 0x017D, 0x037F, 0x00BF, 0x0040, 0x00C0, 0x0021,
+      0x0061, 0x0011, 0x0031, 0x0009, 0x0019, 0x0006, 0x000E, 0x0004,
+      0x0000, 0x0005, 0x000F, 0x0007, 0x001A, 0x000A, 0x0036, 0x0016,
+      0x006E, 0x002E, 0x00C1, 0x0041, 0x01BC, 0x00BC, 0x037A, 0x017A,
+      0x02F9, 0x0DF9, 0x05EF, 0x05EC, 0x1BD8, 0x37DD, 0x17B5, 0x6FB7,
+      0x6FB5, 0x6FB3, 0x6FB1, 0x6FAF, 0x6FAD, 0x6FAB, 0x6FA9, 0x6FA7,
+      0x6FA5, 0x6FA3, 0x6FA1, 0x6F9F, 0x6F9D, 0x6F9B, 0x6F99, 0x6F97,
+      0x6F95, 0x6F93, 0x6F91, 0x6F8F, 0x6F8D, 0x6F8B, 0x6F89, 0x6F87,
+      0x6F85, 0x6F83, 0x6F81, 0x6F7F, 0x6F7D, 0x6F7B, 0x6F79, 0x6F77,
+      0x6F75, 0x6F73, 0x6F71, 0x6F6F, 0x6F6D, 0x6F6B, 0x6F69, 0x6F67,
+      0x6F65, },
+    { 0xDF54, 0xDF56, 0xDFC8, 0xDFCA, 0xDFCC, 0xDFCE, 0xDFD0, 0xDFD2,
+      0xDFD4, 0xDFD6, 0xDFD8, 0xDFDA, 0xDFDC, 0xDFDE, 0xDFE0, 0xDFE2,
+      0x0FE8, 0x2FEA, 0x6FA8, 0x6FF6, 0x07F5, 0x07F7, 0x37D2, 0x37F9,
+      0x03F8, 0x0BF8, 0x0BFB, 0x1BEB, 0x01FA, 0x05FA, 0x09FA, 0x0DFA,
+      0x0DFF, 0x00FF, 0x02FF, 0x06FB, 0x007C, 0x017C, 0x027C, 0x027F,
+      0x003C, 0x00BC, 0x013C, 0x01BC, 0x001C, 0x005C, 0x009C, 0x00DC,
+      0x000C, 0x002C, 0x004C, 0x006C, 0x0004, 0x0014, 0x0024, 0x0034,
+      0x0000, 0x0008, 0x0010, 0x0018, 0x001E, 0x0002, 0x0006, 0x000A,
+      0x000E, 0x000B, 0x0007, 0x0003, 0x001F, 0x0019, 0x0011, 0x0009,
+      0x0001, 0x0035, 0x0025, 0x0015, 0x0005, 0x006D, 0x004D, 0x002D,
+      0x000D, 0x00DD, 0x009D, 0x005D, 0x001D, 0x01BD, 0x013D, 0x00BD,
+      0x003D, 0x037C, 0x027D, 0x017D, 0x007D, 0x06FC, 0x04FC, 0x02FC,
+      0x00FC, 0x0DFB, 0x09FB, 0x05FB, 0x01FB, 0x1BF8, 0x1BE8, 0x0BF9,
+      0x03F9, 0x37FA, 0x37D3, 0x17F4, 0x07F6, 0x6FF7, 0x6FA9, 0x2FEB,
+      0x0FE9, 0xDFE3, 0xDFE1, 0xDFDF, 0xDFDD, 0xDFDB, 0xDFD9, 0xDFD7,
+      0xDFD5, 0xDFD3, 0xDFD1, 0xDFCF, 0xDFCD, 0xDFCB, 0xDFC9, 0xDF57,
+      0xDF55, }
+};
+
+static const uint8_t scales_bits[SCALES_COUNT][129] = {
+    { 14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      13, 13, 13, 13, 13, 13, 13, 13,
+      13, 13, 12, 11, 11, 10,  9,  8,
+       8,  7,  6,  6,  5,  4,  4,  3,
+       2,  3,  3,  4,  5,  5,  6,  7,
+       8,  8,  9, 10, 11, 11, 12, 13,
+      13, 13, 13, 13, 13, 13, 13, 13,
+      13, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, },
+    { 15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      14, 14, 13, 12, 11, 10,  8,  7,
+       6,  6,  5,  5,  4,  4,  4,  3,
+       3,  3,  4,  4,  4,  4,  5,  6,
+       6,  7,  8,  9, 11, 12, 13, 14,
+      14, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, },
+    { 15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 14, 14, 14, 13, 13, 12, 12,
+      12, 11, 11, 11, 10, 10,  9,  9,
+       9,  8,  8,  8,  7,  7,  7,  6,
+       6,  6,  5,  5,  5,  4,  4,  3,
+       3,  3,  4,  4,  5,  5,  5,  6,
+       6,  6,  7,  7,  7,  8,  8,  8,
+       9,  9,  9, 10, 10, 10, 11, 11,
+      12, 12, 12, 13, 13, 13, 14, 14,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, },
+    { 15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 14, 14, 13, 13, 12, 12,
+      11, 11, 10, 10,  9,  8,  8,  7,
+       7,  6,  6,  5,  5,  4,  4,  3,
+       2,  3,  4,  4,  5,  5,  6,  6,
+       7,  7,  8,  8,  9,  9, 10, 10,
+      11, 12, 12, 12, 13, 14, 14, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, },
+    { 16, 16, 16, 16, 16, 16, 16, 16,
+      16, 16, 16, 16, 16, 16, 16, 16,
+      15, 15, 15, 15, 14, 14, 14, 14,
+      13, 13, 13, 13, 12, 12, 12, 12,
+      12, 11, 11, 11, 10, 10, 10, 10,
+       9,  9,  9,  9,  8,  8,  8,  8,
+       7,  7,  7,  7,  6,  6,  6,  6,
+       5,  5,  5,  5,  5,  4,  4,  4,
+       4,  4,  4,  4,  5,  5,  5,  5,
+       5,  6,  6,  6,  6,  7,  7,  7,
+       7,  8,  8,  8,  8,  9,  9,  9,
+       9, 10, 10, 10, 10, 11, 11, 11,
+      11, 12, 12, 12, 12, 13, 13, 13,
+      13, 14, 14, 14, 14, 15, 15, 15,
+      15, 16, 16, 16, 16, 16, 16, 16,
+      16, 16, 16, 16, 16, 16, 16, 16,
+      16,
+    }
+};
+
+static const uint16_t bitalloc_3_codes[3] = {
+    0x0003, 0x0000, 0x0002,
+};
+
+static const uint8_t bitalloc_3_bits[3] = {
+    2,  1,  2,
+};
+
+static const uint16_t bitalloc_5_codes_a[5] = {
+    0x000F, 0x0006, 0x0000, 0x0002, 0x000E,
+};
+
+static const uint16_t bitalloc_5_codes_b[5] = {
+    0x0007, 0x0001, 0x0002, 0x0000, 0x0006,
+};
+
+static const uint16_t bitalloc_5_codes_c[5] = {
+    0x0007, 0x0005, 0x0000, 0x0004, 0x0006,
+};
+
+static const uint8_t bitalloc_5_bits_a[5] = {
+    4,  3,  1,  2,  4,
+};
+
+static const uint8_t bitalloc_5_bits_b[5] = {
+    3,  2,  2,  2,  3,
+};
+
+static const uint8_t bitalloc_5_bits_c[5] = {
+    3,  3,  1,  3,  3,
+};
+
+static const uint16_t bitalloc_7_codes_a[7] = {
+    0x001E, 0x000E, 0x0005, 0x0000, 0x0006, 0x0004, 0x001F,
+};
+
+static const uint16_t bitalloc_7_codes_b[7] = {
+    0x0014, 0x000B, 0x0000, 0x0003, 0x0001, 0x0004, 0x0015,
+};
+
+static const uint16_t bitalloc_7_codes_c[7] = {
+    0x0000, 0x0002, 0x0001, 0x0003, 0x0002, 0x0003, 0x0001,
+};
+
+static const uint8_t bitalloc_7_bits_a[7] = {
+    5,  4,  3,  1,  3,  3,  5,
+};
+
+static const uint8_t bitalloc_7_bits_b[7] = {
+    5,  4,  2,  2,  2,  3,  5,
+};
+
+static const uint8_t bitalloc_7_bits_c[7] = {
+    4,  4,  2,  2,  2,  4,  4,
+};
+
+static const uint16_t bitalloc_9_codes_a[9] = {
+    0x0030, 0x0019, 0x0009, 0x0005, 0x0000, 0x0007, 0x000D, 0x0008,
+    0x0031,
+};
+
+static const uint16_t bitalloc_9_codes_b[9] = {
+    0x0018, 0x001A, 0x0002, 0x0007, 0x0002, 0x0000, 0x0003, 0x001B,
+    0x0019,
+};
+
+static const uint16_t bitalloc_9_codes_c[9] = {
+    0x001C, 0x000F, 0x0002, 0x0007, 0x0002, 0x0000, 0x0006, 0x0006,
+    0x001D,
+};
+
+static const uint8_t bitalloc_9_bits_a[9] = {
+    6,  5,  4,  3,  1,  3,  4,  4,  6,
+};
+
+static const uint8_t bitalloc_9_bits_b[9] = {
+    5,  5,  3,  3,  2,  2,  3,  5,  5,
+};
+
+static const uint8_t bitalloc_9_bits_c[9] = {
+    6,  5,  3,  3,  2,  2,  3,  4,  6,
+};
+
+static const uint16_t bitalloc_13_codes_a[13] = {
+    0x0070, 0x002E, 0x0039, 0x001D, 0x000C, 0x000F, 0x0000, 0x0004,
+    0x000D, 0x000A, 0x0016, 0x002F, 0x0071,
+};
+
+static const uint16_t bitalloc_13_codes_b[13] = {
+    0x0038, 0x0010, 0x001D, 0x0007, 0x000F, 0x0005, 0x0000, 0x0006,
+    0x0002, 0x0009, 0x0006, 0x0011, 0x0039,
+};
+
+static const uint16_t bitalloc_13_codes_c[13] = {
+    0x0004, 0x001A, 0x0003, 0x000E, 0x0000, 0x0003, 0x0005, 0x0004,
+    0x0002, 0x000F, 0x000C, 0x001B, 0x0005,
+};
+
+static const uint8_t bitalloc_13_bits_a[13] = {
+     7,  6,  6,  5,  4,  4,  1,  3,  4,  4,  5,  6,  7,
+};
+
+static const uint8_t bitalloc_13_bits_b[13] = {
+     6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  4,  5,  6,
+};
+
+static const uint8_t bitalloc_13_bits_c[13] = {
+     5,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  5,
+};
+
+static const uint16_t bitalloc_17_codes_a[17] = {
+    0x0154, 0x00AB, 0x002B, 0x000B, 0x0003, 0x000A, 0x0001, 0x0006,
+    0x0001, 0x0007, 0x0004, 0x000B, 0x0000, 0x0004, 0x0014, 0x0054,
+    0x0155,
+};
+
+static const uint16_t bitalloc_17_codes_b[17] = {
+    0x007C, 0x003F, 0x0019, 0x000D, 0x001C, 0x0008, 0x000F, 0x0005,
+    0x0000, 0x0006, 0x0002, 0x0009, 0x001D, 0x000E, 0x001E, 0x0018,
+    0x007D,
+};
+
+static const uint16_t bitalloc_17_codes_c[17] = {
+    0x002C, 0x0017, 0x0005, 0x001C, 0x0003, 0x000A, 0x000F, 0x0003,
+    0x0006, 0x0004, 0x0000, 0x000B, 0x0004, 0x001D, 0x000A, 0x0004,
+    0x002D,
+};
+
+static const uint16_t bitalloc_17_codes_d[17] = {
+    0x0100, 0x0102, 0x0082, 0x0042, 0x0022, 0x0012, 0x000A, 0x0006,
+    0x0000, 0x0007, 0x000B, 0x0013, 0x0023, 0x0043, 0x0083, 0x0103,
+    0x0101,
+};
+
+static const uint16_t bitalloc_17_codes_e[17] = {
+    0x00E8, 0x00F6, 0x0075, 0x0034, 0x003B, 0x001B, 0x001F, 0x0004,
+    0x0000, 0x0005, 0x000C, 0x001C, 0x003C, 0x0035, 0x007A, 0x00F7,
+    0x00E9,
+};
+
+static const uint16_t bitalloc_17_codes_f[17] = {
+    0x0004, 0x0003, 0x001E, 0x0001, 0x0001, 0x000E, 0x0001, 0x0004,
+    0x0006, 0x0005, 0x0002, 0x000F, 0x0006, 0x000E, 0x001F, 0x0000,
+    0x0005,
+};
+
+static const uint16_t bitalloc_17_codes_g[17] = {
+    0x0060, 0x007E, 0x0031, 0x0019, 0x000D, 0x0004, 0x0000, 0x0006,
+    0x0002, 0x0007, 0x0001, 0x0005, 0x000E, 0x001E, 0x003E, 0x007F,
+    0x0061,
+};
+
+static const uint8_t bitalloc_17_bits_a[17] = {
+    12, 11,  9,  7,  5,  4,  3,  3,  2,  3,  3,  4,  4,  6,  8, 10,
+    12,
+};
+
+static const uint8_t bitalloc_17_bits_b[17] = {
+    8,  7,  6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  5,  5,  6,  6,
+    8,
+};
+
+static const uint8_t bitalloc_17_bits_c[17] = {
+    7,  6,  5,  5,  4,  4,  4,  3,  3,  3,  3,  4,  4,  5,  5,  5,
+    7,
+};
+
+static const uint8_t bitalloc_17_bits_d[17] = {
+    9,  9,  8,  7,  6,  5,  4,  3,  1,  3,  4,  5,  6,  7,  8,  9,
+    9,
+};
+
+static const uint8_t bitalloc_17_bits_e[17] = {
+    8,  8,  7,  6,  6,  5,  5,  3,  1,  3,  4,  5,  6,  6,  7,  8,
+    8,
+};
+
+static const uint8_t bitalloc_17_bits_f[17] = {
+    8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  6,  6,
+    8,
+};
+
+static const uint8_t bitalloc_17_bits_g[17] = {
+    8,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,  5,  6,  7,  8,
+    8,
+};
+
+static const uint16_t bitalloc_25_codes_a[25] = {
+    0x2854, 0x142B, 0x050B, 0x0143, 0x00A2, 0x0052, 0x002E, 0x0015,
+    0x0004, 0x000E, 0x0000, 0x0003, 0x0006, 0x0004, 0x0001, 0x000F,
+    0x0005, 0x0016, 0x002F, 0x0053, 0x00A3, 0x00A0, 0x0284, 0x0A14,
+    0x2855,
+};
+
+static const uint16_t bitalloc_25_codes_b[25] = {
+    0x001C, 0x000F, 0x0005, 0x0000, 0x0030, 0x0036, 0x000E, 0x0019,
+    0x0001, 0x0008, 0x000E, 0x0001, 0x0005, 0x0002, 0x000F, 0x0009,
+    0x0006, 0x001A, 0x000F, 0x0037, 0x0031, 0x0001, 0x0006, 0x0004,
+    0x001D,
+};
+
+static const uint16_t bitalloc_25_codes_c[25] = {
+    0x004C, 0x0027, 0x006D, 0x0028, 0x0037, 0x000E, 0x0015, 0x0000,
+    0x0005, 0x0008, 0x000B, 0x000E, 0x0001, 0x000F, 0x000C, 0x0009,
+    0x0006, 0x0001, 0x001A, 0x000F, 0x0008, 0x0029, 0x0012, 0x006C,
+    0x004D,
+};
+
+static const uint16_t bitalloc_25_codes_d[25] = {
+    0x0780, 0x0782, 0x03C2, 0x01E2, 0x00FE, 0x0079, 0x003D, 0x001C,
+    0x000C, 0x0004, 0x0000, 0x0006, 0x0002, 0x0007, 0x0001, 0x0005,
+    0x000D, 0x001D, 0x003E, 0x007E, 0x00FF, 0x01E3, 0x03C3, 0x0783,
+    0x0781,
+};
+
+static const uint16_t bitalloc_25_codes_e[25] = {
+    0x003C, 0x0092, 0x0018, 0x001F, 0x004E, 0x000D, 0x0025, 0x0004,
+    0x0010, 0x0000, 0x000A, 0x0002, 0x0003, 0x0003, 0x000B, 0x0001,
+    0x0011, 0x0005, 0x0026, 0x000E, 0x004F, 0x0048, 0x0019, 0x0093,
+    0x003D,
+};
+
+static const uint16_t bitalloc_25_codes_f[25] = {
+    0x0324, 0x0193, 0x00CE, 0x0065, 0x0024, 0x000C, 0x0013, 0x0004,
+    0x0007, 0x000A, 0x000D, 0x000F, 0x0001, 0x0000, 0x000E, 0x000B,
+    0x0008, 0x0005, 0x0018, 0x000D, 0x0025, 0x0066, 0x00CF, 0x00C8,
+    0x0325,
+};
+
+static const uint16_t bitalloc_25_codes_g[25] = {
+    0x03A8, 0x03AE, 0x01D5, 0x0094, 0x0014, 0x004B, 0x000B, 0x003B,
+    0x0013, 0x0003, 0x000F, 0x0005, 0x0001, 0x0006, 0x0000, 0x0008,
+    0x001C, 0x0004, 0x0024, 0x0074, 0x0015, 0x0095, 0x01D6, 0x03AF,
+    0x03A9,
+};
+
+static const uint8_t bitalloc_25_bits_a[25] = {
+    14, 13, 11,  9,  8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,
+     4,  5,  6,  7,  8,  8, 10, 12, 14,
+};
+
+static const uint8_t bitalloc_25_bits_b[25] = {
+    9,  8,  7,  6,  6,  6,  5,  5,  4,  4,  4,  3,  3,  3,  4,  4,
+    4,  5,  5,  6,  6,  6,  7,  7,  9,
+};
+
+static const uint8_t bitalloc_25_bits_c[25] = {
+    8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  4,  4,  3,  4,  4,  4,
+    4,  4,  5,  5,  5,  6,  6,  7,  8,
+};
+
+static const uint8_t bitalloc_25_bits_d[25] = {
+    12, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,
+     5,  6,  7,  8,  9, 10, 11, 12, 12,
+};
+
+static const uint8_t bitalloc_25_bits_e[25] = {
+    8,  8,  7,  7,  7,  6,  6,  5,  5,  4,  4,  3,  2,  3,  4,  4,
+    5,  5,  6,  6,  7,  7,  7,  8,  8,
+};
+
+static const uint8_t bitalloc_25_bits_f[25] = {
+    10,  9,  8,  7,  6,  5,  5,  4,  4,  4,  4,  4,  3,  3,  4,  4,
+     4,  4,  5,  5,  6,  7,  8,  8, 10,
+};
+
+static const uint8_t bitalloc_25_bits_g[25] = {
+    10, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,  2,  3,  3,  4,
+     5,  5,  6,  7,  7,  8,  9, 10, 10,
+};
+
+static const uint16_t bitalloc_33_codes_a[33] = {
+    0x1580, 0x1582, 0x0AC2, 0x0562, 0x02B2, 0x015E, 0x00AD, 0x0054,
+    0x001C, 0x003C, 0x000F, 0x001F, 0x0008, 0x000B, 0x000D, 0x0000,
+    0x0002, 0x0001, 0x000E, 0x000C, 0x0009, 0x0006, 0x0014, 0x003D,
+    0x001D, 0x0055, 0x00AE, 0x015F, 0x02B3, 0x0563, 0x0AC3, 0x1583,
+    0x1581,
+};
+
+static const uint16_t bitalloc_33_codes_b[33] = {
+    0x030C, 0x0187, 0x006D, 0x0028, 0x0037, 0x0066, 0x0015, 0x0031,
+    0x0000, 0x000B, 0x0012, 0x001A, 0x0001, 0x0007, 0x000A, 0x000E,
+    0x0001, 0x000F, 0x000B, 0x0008, 0x0004, 0x001B, 0x0013, 0x000C,
+    0x0001, 0x0032, 0x001A, 0x0067, 0x0060, 0x0029, 0x00C2, 0x006C,
+    0x030D,
+};
+
+static const uint16_t bitalloc_33_codes_c[33] = {
+    0x00CC, 0x0067, 0x0005, 0x0070, 0x0003, 0x001A, 0x0039, 0x003F,
+    0x000A, 0x0012, 0x0018, 0x001D, 0x0001, 0x0003, 0x0007, 0x000A,
+    0x000D, 0x000B, 0x0008, 0x0004, 0x0002, 0x001E, 0x0019, 0x0013,
+    0x000B, 0x0000, 0x003E, 0x001B, 0x0018, 0x0071, 0x0032, 0x0004,
+    0x00CD,
+};
+
+static const uint16_t bitalloc_33_codes_d[33] = {
+    0x3AF8, 0x3AFA, 0x1D7E, 0x0EBC, 0x075C, 0x03AC, 0x01D4, 0x0094,
+    0x0014, 0x004B, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
+    0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x0074,
+    0x0015, 0x0095, 0x01D5, 0x03AD, 0x075D, 0x0EBD, 0x1D7F, 0x3AFB,
+    0x3AF9,
+};
+
+static const uint16_t bitalloc_33_codes_e[33] = {
+    0x01C8, 0x01E6, 0x0064, 0x00E2, 0x00E5, 0x0030, 0x0033, 0x0073,
+    0x007A, 0x001A, 0x003A, 0x0002, 0x001A, 0x001F, 0x0007, 0x0001,
+    0x0002, 0x0002, 0x000C, 0x0000, 0x001B, 0x0003, 0x003B, 0x001B,
+    0x007B, 0x0078, 0x0070, 0x0031, 0x00F2, 0x00E3, 0x0065, 0x01E7,
+    0x01C9,
+};
+
+static const uint16_t bitalloc_33_codes_f[33] = {
+    0x0724, 0x0393, 0x01CE, 0x00E5, 0x002C, 0x0008, 0x0017, 0x003E,
+    0x0005, 0x0014, 0x001D, 0x0000, 0x0003, 0x0006, 0x0008, 0x000B,
+    0x000D, 0x000C, 0x0009, 0x0007, 0x0004, 0x0001, 0x001E, 0x0015,
+    0x000A, 0x003F, 0x0038, 0x0009, 0x002D, 0x00E6, 0x01CF, 0x01C8,
+    0x0725,
+};
+
+static const uint16_t bitalloc_33_codes_g[33] = {
+    0x0284, 0x0042, 0x0140, 0x0143, 0x003E, 0x00BE, 0x0011, 0x0051,
+    0x0009, 0x0029, 0x0005, 0x0015, 0x0000, 0x0008, 0x000E, 0x0002,
+    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0016, 0x0006, 0x002E,
+    0x000E, 0x005E, 0x001E, 0x00BF, 0x003F, 0x0020, 0x0141, 0x0043,
+    0x0285,
+};
+
+static const uint8_t bitalloc_33_bits_a[33] = {
+    13, 13, 12, 11, 10,  9,  8,  7,  6,  6,  5,  5,  4,  4,  4,  3,
+     3,  3,  4,  4,  4,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12, 13,
+    13,
+};
+
+static const uint8_t bitalloc_33_bits_b[33] = {
+    10,  9,  8,  7,  7,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
+     3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  7,  7,  7,  8,  8,
+    10,
+};
+
+static const uint8_t bitalloc_33_bits_c[33] = {
+    9,  8,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,  7,
+    9,
+};
+
+static const uint8_t bitalloc_33_bits_d[33] = {
+    14, 14, 13, 12, 11, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,
+     2,  3,  3,  4,  5,  5,  6,  7,  7,  8,  9, 10, 11, 12, 13, 14,
+    14,
+};
+
+static const uint8_t bitalloc_33_bits_e[33] = {
+    9,  9,  8,  8,  8,  7,  7,  7,  7,  6,  6,  5,  5,  5,  4,  3,
+    2,  3,  4,  4,  5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,
+    9,
+};
+
+static const uint8_t bitalloc_33_bits_f[33] = {
+    11, 10,  9,  8,  7,  6,  6,  6,  5,  5,  5,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  8,  9,  9,
+    11,
+};
+
+static const uint8_t bitalloc_33_bits_g[33] = {
+    10,  9,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
+     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  9,  9,
+    10,
+};
+
+static const uint16_t bitalloc_65_codes_a[65] = {
+    0x9E5C, 0x9E5E, 0x4F2C, 0x2794, 0x13C4, 0x1E44, 0x09E3, 0x0F23,
+    0x04F3, 0x0792, 0x027E, 0x03CE, 0x013D, 0x01E5, 0x009C, 0x00CC,
+    0x0040, 0x0058, 0x0067, 0x001E, 0x0021, 0x002D, 0x003D, 0x0007,
+    0x0011, 0x0014, 0x0017, 0x001A, 0x001C, 0x001F, 0x0001, 0x0004,
+    0x0006, 0x0005, 0x0002, 0x0000, 0x001D, 0x001B, 0x0018, 0x0015,
+    0x0012, 0x000E, 0x0006, 0x0032, 0x0026, 0x001F, 0x0078, 0x0059,
+    0x0041, 0x00CD, 0x009D, 0x01E6, 0x013E, 0x03CF, 0x027F, 0x0793,
+    0x0790, 0x04F0, 0x09E4, 0x1E45, 0x13C5, 0x2795, 0x4F2D, 0x9E5F,
+    0x9E5D,
+};
+
+static const uint16_t bitalloc_65_codes_b[65] = {
+    0x0A8C, 0x0547, 0x01B5, 0x0008, 0x00DB, 0x0152, 0x0005, 0x000B,
+    0x008E, 0x00AE, 0x00E4, 0x0003, 0x0037, 0x0039, 0x0055, 0x006C,
+    0x0073, 0x0003, 0x0015, 0x001D, 0x0028, 0x0030, 0x0037, 0x003E,
+    0x0006, 0x000B, 0x000F, 0x0012, 0x0016, 0x0019, 0x001D, 0x0001,
+    0x0004, 0x0002, 0x001E, 0x001A, 0x0017, 0x0013, 0x0010, 0x000C,
+    0x0007, 0x003F, 0x0038, 0x0031, 0x0029, 0x0022, 0x001A, 0x0014,
+    0x0000, 0x006D, 0x0056, 0x0046, 0x0038, 0x0004, 0x00E5, 0x00AF,
+    0x008F, 0x006C, 0x000A, 0x0153, 0x0150, 0x0009, 0x02A2, 0x01B4,
+    0x0A8D,
+};
+
+static const uint16_t bitalloc_65_codes_c[65] = {
+    0x045C, 0x022F, 0x03F5, 0x01BC, 0x01FB, 0x0059, 0x00D0, 0x00DF,
+    0x000A, 0x002D, 0x002F, 0x0052, 0x0069, 0x0078, 0x007F, 0x000A,
+    0x0010, 0x001C, 0x0023, 0x002A, 0x0035, 0x003A, 0x003D, 0x0000,
+    0x0003, 0x0006, 0x0009, 0x000C, 0x000F, 0x0012, 0x0016, 0x0018,
+    0x001C, 0x0019, 0x0017, 0x0013, 0x0010, 0x000D, 0x000A, 0x0007,
+    0x0004, 0x0001, 0x003E, 0x003B, 0x0036, 0x002B, 0x0028, 0x001D,
+    0x0011, 0x000B, 0x0004, 0x0079, 0x006E, 0x0053, 0x0044, 0x002E,
+    0x000B, 0x00FC, 0x00D1, 0x008A, 0x0058, 0x01BD, 0x0116, 0x03F4,
+    0x045D,
+};
+
+static const uint16_t bitalloc_65_codes_d[65] = {
+    0x70B0, 0x70B2, 0x70B4, 0x2852, 0x385B, 0x142E, 0x1C2E, 0x0A15,
+    0x0E14, 0x0214, 0x0704, 0x0104, 0x010B, 0x0383, 0x0083, 0x0143,
+    0x01C3, 0x0043, 0x00A2, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
+    0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
+    0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
+    0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A3, 0x00A0,
+    0x0040, 0x01C0, 0x0084, 0x0384, 0x0284, 0x0105, 0x0705, 0x0215,
+    0x0E15, 0x0A16, 0x1C2F, 0x142F, 0x1428, 0x2853, 0x70B5, 0x70B3,
+    0x70B1,
+};
+
+static const uint16_t bitalloc_65_codes_e[65] = {
+    0x032C, 0x0332, 0x0378, 0x037E, 0x008C, 0x014A, 0x0188, 0x0197,
+    0x019E, 0x01BD, 0x0044, 0x0047, 0x00AA, 0x00C5, 0x00CD, 0x00DC,
+    0x001C, 0x002C, 0x0053, 0x0063, 0x0068, 0x0008, 0x000F, 0x0017,
+    0x002B, 0x0035, 0x0005, 0x0009, 0x0016, 0x001C, 0x0006, 0x000F,
+    0x0004, 0x0000, 0x0007, 0x001D, 0x0017, 0x000A, 0x0006, 0x0036,
+    0x0030, 0x0028, 0x0010, 0x0009, 0x0069, 0x0064, 0x0054, 0x002D,
+    0x001D, 0x00DD, 0x00CE, 0x00CA, 0x00AB, 0x00A4, 0x0045, 0x01BE,
+    0x019F, 0x0198, 0x0189, 0x014B, 0x008D, 0x037F, 0x0379, 0x0333,
+    0x032D,
+};
+
+static const uint16_t bitalloc_65_codes_f[65] = {
+    0x0FE0, 0x0FE2, 0x0FE8, 0x0FEA, 0x0FEC, 0x0FEE, 0x0FF0, 0x0FF2,
+    0x0FF4, 0x2FF2, 0x07F2, 0x07FB, 0x03F6, 0x0BFA, 0x0BFD, 0x01FF,
+    0x05FF, 0x02FC, 0x007C, 0x017C, 0x003C, 0x00BC, 0x001C, 0x005C,
+    0x000C, 0x002C, 0x0004, 0x0014, 0x0000, 0x0008, 0x000E, 0x0002,
+    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0015, 0x0005, 0x002D,
+    0x000D, 0x005D, 0x001D, 0x00BD, 0x003D, 0x017D, 0x007D, 0x02FD,
+    0x00FC, 0x05FC, 0x01FA, 0x0BFB, 0x03F7, 0x17F8, 0x07F3, 0x2FF3,
+    0x0FF5, 0x0FF3, 0x0FF1, 0x0FEF, 0x0FED, 0x0FEB, 0x0FE9, 0x0FE3,
+    0x0FE1,
+};
+
+static const uint16_t bitalloc_65_codes_g[65] = {
+    0x010C, 0x038A, 0x0608, 0x0786, 0x0084, 0x0087, 0x0302, 0x0305,
+    0x0040, 0x00E0, 0x00E3, 0x0183, 0x001E, 0x005E, 0x009E, 0x00DE,
+    0x00F1, 0x0011, 0x0039, 0x0061, 0x0079, 0x0009, 0x001D, 0x0031,
+    0x003D, 0x0005, 0x000F, 0x0019, 0x001F, 0x0003, 0x0006, 0x000A,
+    0x000E, 0x000B, 0x0008, 0x0004, 0x0000, 0x001A, 0x0012, 0x000A,
+    0x0002, 0x0036, 0x0026, 0x0016, 0x0006, 0x006E, 0x004E, 0x002E,
+    0x000E, 0x00DF, 0x009F, 0x005F, 0x001F, 0x01E0, 0x0180, 0x00E1,
+    0x0041, 0x03C2, 0x0303, 0x01C4, 0x0085, 0x0787, 0x0609, 0x038B,
+    0x010D,
+};
+
+static const uint8_t bitalloc_65_bits_a[65] = {
+    16, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  8,  8,
+     7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,  4,
+     4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,
+     7,  8,  8,  9,  9, 10, 10, 11, 11, 11, 12, 13, 13, 14, 15, 16,
+    16,
+};
+
+static const uint8_t bitalloc_65_bits_b[65] = {
+    12, 11, 10,  9,  9,  9,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
+     7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,
+     4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,
+     6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9, 10, 10,
+    12,
+};
+
+static const uint8_t bitalloc_65_bits_c[65] = {
+    11, 10, 10,  9,  9,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  6,
+     6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10,
+    11,
+};
+
+static const uint8_t bitalloc_65_bits_d[65] = {
+    15, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,  9,  9,
+     9,  8,  8,  8,  7,  7,  7,  6,  6,  6,  5,  5,  5,  4,  4,  3,
+     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,
+     8,  9,  9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 15, 15,
+    15,
+};
+
+static const uint8_t bitalloc_65_bits_e[65] = {
+    10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,
+     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,
+     3,  3,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+     7,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10,
+    10,
+};
+
+static const uint8_t bitalloc_65_bits_f[65] = {
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+    11, 10,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
+     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10,
+    10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14,
+};
+
+static const uint8_t bitalloc_65_bits_g[65] = {
+    11, 11, 11, 11, 10, 10, 10, 10,  9,  9,  9,  9,  8,  8,  8,  8,
+     8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,
+     4,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,
+     7,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
+    11,
+};
+
+static const uint16_t bitalloc_129_codes_a[129] = {
+    0x0660, 0x0666, 0x06EC, 0x0722, 0x0760, 0x076E, 0x004C, 0x004E,
+    0x00F4, 0x010A, 0x0148, 0x0156, 0x01D4, 0x01F2, 0x0331, 0x0370,
+    0x0377, 0x0396, 0x03B1, 0x0024, 0x0064, 0x007B, 0x008A, 0x00A5,
+    0x00D4, 0x00EB, 0x00FA, 0x019A, 0x01B9, 0x01C9, 0x01D9, 0x0010,
+    0x0030, 0x0033, 0x0043, 0x0053, 0x006B, 0x007A, 0x00CA, 0x00D2,
+    0x00DE, 0x00E6, 0x00F6, 0x000E, 0x001F, 0x0023, 0x002B, 0x003B,
+    0x003F, 0x0067, 0x0070, 0x0077, 0x0005, 0x000D, 0x0012, 0x001B,
+    0x002C, 0x0035, 0x003A, 0x0004, 0x000B, 0x0017, 0x001F, 0x0009,
+    0x0008, 0x000A, 0x0000, 0x0018, 0x000C, 0x0005, 0x003C, 0x0036,
+    0x002D, 0x001C, 0x0013, 0x000E, 0x0006, 0x007A, 0x0071, 0x0068,
+    0x0064, 0x003C, 0x0034, 0x0028, 0x0020, 0x000F, 0x00F7, 0x00E7,
+    0x00DF, 0x00D3, 0x00CB, 0x007B, 0x0074, 0x0054, 0x0044, 0x003C,
+    0x0031, 0x0011, 0x01DA, 0x01CA, 0x01BA, 0x019B, 0x00FB, 0x00F8,
+    0x00D5, 0x00AA, 0x008B, 0x0084, 0x0065, 0x0025, 0x03B6, 0x0397,
+    0x0390, 0x0371, 0x0332, 0x01F3, 0x01D5, 0x0157, 0x0149, 0x010B,
+    0x00F5, 0x004F, 0x004D, 0x076F, 0x0761, 0x0723, 0x06ED, 0x0667,
+    0x0661,
+};
+
+static const uint16_t bitalloc_129_codes_b[129] = {
+    0x29DC, 0x14EF, 0x0455, 0x0E9C, 0x022B, 0x0489, 0x0740, 0x074F,
+    0x0172, 0x0245, 0x0247, 0x030A, 0x03A1, 0x001C, 0x008B, 0x00D6,
+    0x010C, 0x0148, 0x014F, 0x0186, 0x01D1, 0x0008, 0x000F, 0x0046,
+    0x005D, 0x0078, 0x0087, 0x0096, 0x00A5, 0x00BC, 0x00D8, 0x00DE,
+    0x00F6, 0x0005, 0x0014, 0x0024, 0x002F, 0x003A, 0x003D, 0x0049,
+    0x0050, 0x0058, 0x005F, 0x0066, 0x006D, 0x0075, 0x007C, 0x0004,
+    0x000B, 0x0013, 0x0018, 0x001B, 0x001F, 0x0022, 0x0026, 0x002A,
+    0x002D, 0x0031, 0x0034, 0x0038, 0x003B, 0x003F, 0x0003, 0x0006,
+    0x000A, 0x0007, 0x0004, 0x0000, 0x003C, 0x0039, 0x0035, 0x0032,
+    0x002E, 0x002B, 0x0027, 0x0023, 0x0020, 0x001C, 0x0019, 0x0016,
+    0x0010, 0x0005, 0x007D, 0x007A, 0x006E, 0x0067, 0x0060, 0x0059,
+    0x0051, 0x004A, 0x0042, 0x003B, 0x0034, 0x0025, 0x0015, 0x0006,
+    0x00F7, 0x00DF, 0x00D9, 0x00BD, 0x00A6, 0x0097, 0x0090, 0x0079,
+    0x006A, 0x0047, 0x0044, 0x0009, 0x01D2, 0x0187, 0x0184, 0x0149,
+    0x010D, 0x00D7, 0x00B8, 0x001D, 0x03A6, 0x030B, 0x029C, 0x0246,
+    0x0173, 0x0114, 0x0741, 0x053A, 0x0488, 0x0E9D, 0x0A76, 0x0454,
+    0x29DD,
+};
+
+static const uint16_t bitalloc_129_codes_c[129] = {
+    0x0E5C, 0x072F, 0x001D, 0x0724, 0x000F, 0x010D, 0x0324, 0x0393,
+    0x03E9, 0x0080, 0x0087, 0x00FA, 0x0164, 0x0193, 0x01DE, 0x01F5,
+    0x0010, 0x002A, 0x0041, 0x0064, 0x0073, 0x008E, 0x00A4, 0x00B3,
+    0x00D6, 0x00E5, 0x00F4, 0x00FB, 0x0002, 0x0009, 0x0013, 0x001E,
+    0x0026, 0x002C, 0x0033, 0x003F, 0x0041, 0x004C, 0x0053, 0x005E,
+    0x0065, 0x0070, 0x0073, 0x0078, 0x007B, 0x007E, 0x0002, 0x0005,
+    0x0007, 0x000B, 0x000D, 0x0011, 0x0014, 0x0017, 0x001A, 0x001D,
+    0x0021, 0x0024, 0x0027, 0x002A, 0x002D, 0x0030, 0x0033, 0x0036,
+    0x003A, 0x0037, 0x0034, 0x0031, 0x002E, 0x002B, 0x0028, 0x0025,
+    0x0022, 0x001E, 0x001B, 0x0018, 0x0015, 0x0012, 0x000E, 0x000C,
+    0x0008, 0x0006, 0x0003, 0x007F, 0x007C, 0x0079, 0x0076, 0x0071,
+    0x006A, 0x005F, 0x0058, 0x004D, 0x0046, 0x0040, 0x0038, 0x002D,
+    0x0027, 0x001F, 0x0014, 0x0012, 0x0003, 0x0000, 0x00F5, 0x00EE,
+    0x00D7, 0x00C8, 0x00A5, 0x008F, 0x007C, 0x0065, 0x0042, 0x002B,
+    0x0011, 0x0002, 0x01DF, 0x01C8, 0x0165, 0x00FB, 0x00E4, 0x0081,
+    0x0006, 0x03E8, 0x0325, 0x01CA, 0x010C, 0x0725, 0x0396, 0x001C,
+    0x0E5D,
+};
+
+static const uint16_t bitalloc_129_codes_d[129] = {
+    0xA598, 0xA59A, 0xA59C, 0xA59E, 0xC598, 0xE586, 0x3ACC, 0x52CA,
+    0x62CD, 0x0D48, 0x1D67, 0x2978, 0x3167, 0x3966, 0x06A5, 0x0EBC,
+    0x14BD, 0x1CB1, 0x0350, 0x0353, 0x075F, 0x0A5F, 0x0C5E, 0x0E5E,
+    0x01AE, 0x03AD, 0x052D, 0x062D, 0x072D, 0x00D5, 0x01D4, 0x0294,
+    0x0314, 0x0394, 0x0014, 0x0094, 0x0114, 0x0174, 0x01B4, 0x01F4,
+    0x000B, 0x004B, 0x008B, 0x00BB, 0x00DB, 0x00FB, 0x001B, 0x003B,
+    0x0053, 0x0063, 0x0073, 0x0003, 0x0013, 0x0023, 0x002F, 0x0037,
+    0x003F, 0x0007, 0x000F, 0x0015, 0x0019, 0x001D, 0x0001, 0x0005,
+    0x0009, 0x0006, 0x0002, 0x001E, 0x001A, 0x0016, 0x0010, 0x0008,
+    0x0000, 0x0038, 0x0030, 0x0028, 0x001C, 0x000C, 0x007C, 0x006C,
+    0x005C, 0x0044, 0x0024, 0x0004, 0x00E4, 0x00C4, 0x00A4, 0x0074,
+    0x0034, 0x01F5, 0x01B5, 0x0175, 0x0115, 0x0095, 0x0015, 0x0395,
+    0x0315, 0x0295, 0x01D5, 0x00D6, 0x072E, 0x062E, 0x052E, 0x03AE,
+    0x01AF, 0x0E5F, 0x0C5F, 0x0C58, 0x0A58, 0x0758, 0x0351, 0x1CB2,
+    0x18B2, 0x0EBD, 0x0EB2, 0x3967, 0x3960, 0x2979, 0x2964, 0x0D49,
+    0x72C2, 0x52CB, 0x3ACD, 0xE587, 0xC599, 0xA59F, 0xA59D, 0xA59B,
+    0xA599,
+};
+
+static const uint16_t bitalloc_129_codes_e[129] = {
+    0xA13C, 0xC720, 0xA13F, 0xA13E, 0xA13D, 0xE722, 0x5090, 0x6393,
+    0x7392, 0x2849, 0x31CE, 0x39CE, 0x1425, 0x18E5, 0x1CE5, 0x0844,
+    0x0A1C, 0x0C7C, 0x036C, 0x0423, 0x050F, 0x063F, 0x01B7, 0x0216,
+    0x0285, 0x031D, 0x039D, 0x0109, 0x0140, 0x0180, 0x01C8, 0x01CF,
+    0x007A, 0x008A, 0x00A2, 0x00C1, 0x00E5, 0x0014, 0x0037, 0x0043,
+    0x004E, 0x0056, 0x0061, 0x006C, 0x007C, 0x000B, 0x001C, 0x001F,
+    0x0023, 0x0025, 0x0029, 0x002C, 0x002E, 0x0032, 0x0034, 0x0037,
+    0x003A, 0x003C, 0x003F, 0x0001, 0x0003, 0x0006, 0x0008, 0x000A,
+    0x000C, 0x000B, 0x0009, 0x0007, 0x0004, 0x0002, 0x0000, 0x003D,
+    0x003B, 0x0038, 0x0035, 0x0033, 0x002F, 0x002D, 0x002A, 0x0026,
+    0x0024, 0x0020, 0x001D, 0x001A, 0x007D, 0x006D, 0x0062, 0x0057,
+    0x004F, 0x0044, 0x003C, 0x0015, 0x00E6, 0x00C6, 0x00A3, 0x008B,
+    0x007B, 0x006C, 0x01C9, 0x0181, 0x0141, 0x010A, 0x00DA, 0x031E,
+    0x0286, 0x0217, 0x0210, 0x0738, 0x0638, 0x0508, 0x036D, 0x0C7D,
+    0x0A1D, 0x0845, 0x1CE6, 0x18E6, 0x1426, 0x39CF, 0x31CF, 0x284E,
+    0x7393, 0x7390, 0x5091, 0xE723, 0xC724, 0xC725, 0xC722, 0xC723,
+    0xC721,
+};
+
+static const uint16_t bitalloc_129_codes_f[129] = {
+    0x762C, 0x3B17, 0x1555, 0x0608, 0x0AAB, 0x0FF2, 0x0305, 0x0307,
+    0x0763, 0x0046, 0x010C, 0x01BC, 0x02AB, 0x03B6, 0x03FD, 0x0080,
+    0x0087, 0x00DF, 0x0156, 0x01D9, 0x01F8, 0x01FF, 0x002A, 0x0041,
+    0x0061, 0x0094, 0x00D4, 0x00EA, 0x00F2, 0x00FD, 0x0009, 0x000B,
+    0x001A, 0x0026, 0x0031, 0x0040, 0x004B, 0x006B, 0x0073, 0x0077,
+    0x007A, 0x007C, 0x0000, 0x0002, 0x0006, 0x0008, 0x000B, 0x000E,
+    0x0011, 0x0014, 0x0016, 0x0019, 0x001C, 0x001E, 0x0021, 0x0023,
+    0x0026, 0x0028, 0x002B, 0x002D, 0x002F, 0x0031, 0x0033, 0x0036,
+    0x0038, 0x0037, 0x0034, 0x0032, 0x0030, 0x002E, 0x002C, 0x0029,
+    0x0027, 0x0024, 0x0022, 0x001F, 0x001D, 0x001A, 0x0017, 0x0015,
+    0x0012, 0x000F, 0x000C, 0x0009, 0x0007, 0x0003, 0x0001, 0x007D,
+    0x007B, 0x0078, 0x0074, 0x0072, 0x0054, 0x0041, 0x0036, 0x0027,
+    0x001B, 0x0014, 0x000A, 0x00FE, 0x00F3, 0x00EB, 0x00D5, 0x0095,
+    0x006E, 0x0042, 0x002B, 0x0010, 0x01F9, 0x01DA, 0x0157, 0x0154,
+    0x00C0, 0x0081, 0x0022, 0x03B7, 0x03B0, 0x01BD, 0x010D, 0x0047,
+    0x07F8, 0x0554, 0x0306, 0x0FF3, 0x0EC4, 0x0609, 0x1D8A, 0x1554,
+    0x762D,
+};
+
+static const uint16_t bitalloc_129_codes_g[129] = {
+    0x1E20, 0x1E5E, 0x031C, 0x051A, 0x0718, 0x0916, 0x0B14, 0x0D12,
+    0x0F11, 0x0090, 0x018F, 0x028E, 0x038D, 0x048C, 0x058B, 0x068A,
+    0x0789, 0x0049, 0x00C8, 0x0148, 0x01C7, 0x0247, 0x02C6, 0x0346,
+    0x03C5, 0x0025, 0x0065, 0x00A5, 0x00E4, 0x0124, 0x0164, 0x01A4,
+    0x01E3, 0x0013, 0x0033, 0x0053, 0x0073, 0x0093, 0x00B3, 0x00D3,
+    0x00F3, 0x000A, 0x001A, 0x002A, 0x003A, 0x004A, 0x005A, 0x006A,
+    0x007A, 0x0006, 0x000E, 0x0016, 0x001E, 0x0026, 0x002E, 0x0036,
+    0x003E, 0x0004, 0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C,
+    0x0000, 0x001D, 0x0019, 0x0015, 0x0011, 0x000D, 0x0009, 0x0005,
+    0x003F, 0x0037, 0x002F, 0x0027, 0x001F, 0x0017, 0x000F, 0x0007,
+    0x007B, 0x006B, 0x005B, 0x004B, 0x003B, 0x002B, 0x001B, 0x000B,
+    0x0008, 0x00F0, 0x00D0, 0x00B0, 0x0090, 0x0070, 0x0050, 0x0030,
+    0x01E4, 0x01A5, 0x0165, 0x0125, 0x00E5, 0x00E2, 0x00A2, 0x0062,
+    0x03CA, 0x0347, 0x02C7, 0x02C4, 0x0244, 0x0149, 0x00C9, 0x00C6,
+    0x0796, 0x068B, 0x0688, 0x048D, 0x048A, 0x028F, 0x028C, 0x0091,
+    0x0F2E, 0x0D13, 0x0B15, 0x0917, 0x0719, 0x051B, 0x031D, 0x1E5F,
+    0x1E21,
+};
+
+static const uint8_t bitalloc_129_bits_a[129] = {
+    11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,
+     4,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+    11,
+};
+
+static const uint8_t bitalloc_129_bits_b[129] = {
+    14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,
+     9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+     5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12,
+    14,
+};
+
+static const uint8_t bitalloc_129_bits_c[129] = {
+    13, 12, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
+    13,
+};
+
+static const uint8_t bitalloc_129_bits_d[129] = {
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10,
+    10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  7,  7,
+     7,  7,  7,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
+     4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,
+     7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
+    16,
+};
+
+static const uint8_t bitalloc_129_bits_e[129] = {
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+    12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,
+     8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
+    12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
+    16,
+};
+
+static const uint8_t bitalloc_129_bits_f[129] = {
+    15, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,  9,
+     9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,
+     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+     7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13,
+    15,
+};
+
+static const uint8_t bitalloc_129_bits_g[129] = {
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
+     9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
+     7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
+     4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
+     7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,
+     9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
+    11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13,
+    13,
+};
+
+static const uint8_t bitalloc_sizes[DCA_CODE_BOOKS] = {
+    3, 5, 7, 9, 13, 17, 25, 33, 65, 129
+};
+
+static const int8_t bitalloc_offsets[DCA_CODE_BOOKS] = {
+    -1, -2, -3, -4, -6, -8, -12, -16, -32, -64
+};
+
+static const uint8_t bitalloc_maxbits[DCA_CODE_BOOKS][7] = {
+    { 2 },
+    { 4, 3, 3 },
+    { 5, 5, 4 },
+    { 6, 5, 6 },
+    { 7, 6, 5 },
+    { 9, 8, 7, 9, 8, 8, 8 },
+    { 9, 9, 8, 9, 8, 9, 9 },
+    { 9, 9, 9, 9, 9, 9, 9 },
+    { 9, 9, 9, 9, 9, 9, 9 },
+    { 9, 9, 9, 9, 9, 9, 9 }
+};
+
+static const uint16_t *const bitalloc_codes[DCA_CODE_BOOKS][8] = {
+    { bitalloc_3_codes,     NULL },
+    { bitalloc_5_codes_a,   bitalloc_5_codes_b,   bitalloc_5_codes_c,   NULL },
+    { bitalloc_7_codes_a,   bitalloc_7_codes_b,   bitalloc_7_codes_c,   NULL },
+    { bitalloc_9_codes_a,   bitalloc_9_codes_b,   bitalloc_9_codes_c,   NULL },
+    { bitalloc_13_codes_a,  bitalloc_13_codes_b,  bitalloc_13_codes_c,  NULL },
+    { bitalloc_17_codes_a,  bitalloc_17_codes_b,  bitalloc_17_codes_c,  bitalloc_17_codes_d,
+      bitalloc_17_codes_e,  bitalloc_17_codes_f,  bitalloc_17_codes_g,  NULL },
+    { bitalloc_25_codes_a,  bitalloc_25_codes_b,  bitalloc_25_codes_c,  bitalloc_25_codes_d,
+      bitalloc_25_codes_e,  bitalloc_25_codes_f,  bitalloc_25_codes_g,  NULL },
+    { bitalloc_33_codes_a,  bitalloc_33_codes_b,  bitalloc_33_codes_c,  bitalloc_33_codes_d,
+      bitalloc_33_codes_e,  bitalloc_33_codes_f,  bitalloc_33_codes_g,  NULL },
+    { bitalloc_65_codes_a,  bitalloc_65_codes_b,  bitalloc_65_codes_c,  bitalloc_65_codes_d,
+      bitalloc_65_codes_e,  bitalloc_65_codes_f,  bitalloc_65_codes_g,  NULL },
+    { bitalloc_129_codes_a, bitalloc_129_codes_b, bitalloc_129_codes_c, bitalloc_129_codes_d,
+      bitalloc_129_codes_e, bitalloc_129_codes_f, bitalloc_129_codes_g, NULL }
+};
+
+static const uint8_t *const bitalloc_bits[DCA_CODE_BOOKS][8] = {
+    { bitalloc_3_bits,     NULL },
+    { bitalloc_5_bits_a,   bitalloc_5_bits_b,   bitalloc_5_bits_c,   NULL },
+    { bitalloc_7_bits_a,   bitalloc_7_bits_b,   bitalloc_7_bits_c,   NULL },
+    { bitalloc_9_bits_a,   bitalloc_9_bits_b,   bitalloc_9_bits_c,   NULL },
+    { bitalloc_13_bits_a,  bitalloc_13_bits_b,  bitalloc_13_bits_c,  NULL },
+    { bitalloc_17_bits_a,  bitalloc_17_bits_b,  bitalloc_17_bits_c,  bitalloc_17_bits_d,
+      bitalloc_17_bits_e,  bitalloc_17_bits_f,  bitalloc_17_bits_g,  NULL },
+    { bitalloc_25_bits_a,  bitalloc_25_bits_b,  bitalloc_25_bits_c,  bitalloc_25_bits_d,
+      bitalloc_25_bits_e,  bitalloc_25_bits_f,  bitalloc_25_bits_g,  NULL },
+    { bitalloc_33_bits_a,  bitalloc_33_bits_b,  bitalloc_33_bits_c,  bitalloc_33_bits_d,
+      bitalloc_33_bits_e,  bitalloc_33_bits_f,  bitalloc_33_bits_g,  NULL },
+    { bitalloc_65_bits_a,  bitalloc_65_bits_b,  bitalloc_65_bits_c,  bitalloc_65_bits_d,
+      bitalloc_65_bits_e,  bitalloc_65_bits_f,  bitalloc_65_bits_g,  NULL },
+    { bitalloc_129_bits_a, bitalloc_129_bits_b, bitalloc_129_bits_c, bitalloc_129_bits_d,
+      bitalloc_129_bits_e, bitalloc_129_bits_f, bitalloc_129_bits_g, NULL }
+};
+
+static const uint16_t tnl_grp_0_codes[37] = {
+    0x0000, 0x0003, 0x0004, 0x0007, 0x0001, 0x0009, 0x000a, 0x000d,
+    0x000e, 0x0006, 0x0012, 0x0005, 0x0015, 0x0016, 0x0022, 0x0025,
+    0x0035, 0x0076, 0x0002, 0x0042, 0x00b6, 0x0036, 0x00c2, 0x0136,
+    0x0182, 0x01c2, 0x03c2, 0x0482, 0x0682, 0x0082, 0x0882, 0x0a82,
+    0x0282, 0x2282, 0x3282, 0x1282, 0x5282,
+};
+
+static const uint16_t tnl_grp_1_codes[34] = {
+    0x0001, 0x0003, 0x0006, 0x0000, 0x0002, 0x0004, 0x0005, 0x0007,
+    0x0008, 0x000f, 0x001a, 0x001c, 0x001d, 0x000a, 0x002c, 0x002d,
+    0x000d, 0x002a, 0x004c, 0x004d, 0x006a, 0x008c, 0x00cd, 0x00ea,
+    0x000c, 0x010c, 0x01ea, 0x020c, 0x030c, 0x07ea, 0x0bea, 0x03ea,
+    0x13ea, 0x33ea,
+};
+
+static const uint16_t tnl_grp_2_codes[31] = {
+    0x0001, 0x0003, 0x0006, 0x0007, 0x0004, 0x0008, 0x000c, 0x0010,
+    0x0012, 0x001a, 0x0022, 0x0000, 0x000a, 0x0020, 0x0040, 0x004a,
+    0x006a, 0x0002, 0x002a, 0x0042, 0x0082, 0x00aa, 0x00e0, 0x0060,
+    0x00c2, 0x01c2, 0x0160, 0x0360, 0x0f60, 0x0760, 0x1760,
+};
+
+static const uint16_t tnl_grp_3_codes[28] = {
+    0x0001, 0x0006, 0x0008, 0x0014, 0x001c, 0x0000, 0x0002, 0x0004,
+    0x000a, 0x000c, 0x0010, 0x0012, 0x001a, 0x0020, 0x002a, 0x002c,
+    0x0032, 0x003a, 0x0022, 0x0030, 0x0062, 0x0064, 0x0070, 0x0024,
+    0x00a4, 0x01a4, 0x03a4, 0x07a4,
+};
+
+static const uint16_t tnl_grp_4_codes[23] = {
+    0x0001, 0x0000, 0x000a, 0x0006, 0x0012, 0x001e, 0x0022, 0x002e,
+    0x0036, 0x003e, 0x0002, 0x0016, 0x0032, 0x004e, 0x0056, 0x000e,
+    0x0042, 0x0072, 0x00c2, 0x00f2, 0x008e, 0x018e, 0x038e,
+};
+
+static const uint16_t tnl_scf_codes[20] = {
+    0x0000, 0x0001, 0x0002, 0x0005, 0x0006, 0x0007, 0x000b, 0x000c,
+    0x0013, 0x0014, 0x0003, 0x0004, 0x0023, 0x0064, 0x00a4, 0x0024,
+    0x0124, 0x0324, 0x0724, 0x0f24,
+};
+
+static const uint16_t damp_codes[7] = {
+    0x0001, 0x0000, 0x0002, 0x0006, 0x000e, 0x001e, 0x003e,
+};
+
+static const uint16_t dph_codes[9] = {
+    0x0000, 0x0002, 0x0003, 0x0001, 0x0009, 0x000d, 0x0005, 0x0015,
+    0x0035,
+};
+
+static const uint16_t fst_rsd_amp_codes[24] = {
+    0x0003, 0x0005, 0x0006, 0x0007, 0x0000, 0x0001, 0x0002, 0x0008,
+    0x0009, 0x000a, 0x0014, 0x0004, 0x001a, 0x001c, 0x0024, 0x002c,
+    0x003a, 0x000c, 0x003c, 0x004c, 0x00fc, 0x007c, 0x017c, 0x037c,
+};
+
+static const uint16_t rsd_apprx_codes[6] = {
+    0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f,
+};
+
+static const uint16_t rsd_amp_codes[33] = {
+    0x0001, 0x0000, 0x0002, 0x0003, 0x0004, 0x000e, 0x000f, 0x0016,
+    0x0007, 0x0027, 0x0037, 0x0026, 0x0066, 0x0006, 0x0017, 0x0046,
+    0x0097, 0x00d7, 0x0086, 0x00c6, 0x01c6, 0x0157, 0x0186, 0x0257,
+    0x0357, 0x0057, 0x0786, 0x0386, 0x0b86, 0x0457, 0x0c57, 0x1457,
+    0x1c57,
+};
+
+static const uint16_t avg_g3_codes[18] = {
+    0x0001, 0x0002, 0x0003, 0x0000, 0x000c, 0x0014, 0x0018, 0x0004,
+    0x0008, 0x0028, 0x0068, 0x0024, 0x00a4, 0x00e4, 0x0164, 0x0064,
+    0x0264, 0x0664,
+};
+
+static const uint16_t st_grid_codes[22] = {
+    0x0001, 0x0002, 0x0000, 0x0004, 0x0008, 0x001c, 0x004c, 0x006c,
+    0x000c, 0x002c, 0x008c, 0x00ac, 0x012c, 0x018c, 0x01ac, 0x038c,
+    0x03ac, 0x032c, 0x072c, 0x0f2c, 0x172c, 0x1f2c,
+};
+
+static const uint16_t grid_2_codes[20] = {
+    0x0000, 0x0002, 0x0003, 0x0001, 0x0005, 0x000d, 0x003d, 0x005d,
+    0x009d, 0x011d, 0x001d, 0x061d, 0x041d, 0x0c1d, 0x0a1d, 0x121d,
+    0x021d, 0x1a1d, 0x221d, 0x3a1d,
+};
+
+static const uint16_t grid_3_codes[13] = {
+    0x0001, 0x0002, 0x0000, 0x0004, 0x000c, 0x001c, 0x007c, 0x003c,
+    0x01bc, 0x00bc, 0x06bc, 0x02bc, 0x0abc,
+};
+
+static const uint16_t rsd_codes[9] = {
+    0x0001, 0x0003, 0x0000, 0x0002, 0x0006, 0x0004, 0x000c, 0x001c,
+    0x003c,
+};
+
+static const uint8_t tnl_grp_0_bitvals[74] = {
+     3,  5,  3,  9,  3,  4,  3,  6,  4, 10,  4, 13,  4,  7,  4, 11,
+     4,  8,  5, 12,  5, 14,  6, 15,  6, 18,  6,  1,  6, 17,  6, 16,
+     6, 21,  7, 20,  8, 19,  8, 22,  8, 25,  9, 26,  9, 23,  9,  3,
+     9, 24, 10, 29, 10, 27, 11, 28, 11, 30, 12, 33, 12, 31, 12, 32,
+    14, 34, 14, 37, 14, 36, 15, 35, 15,  0,
+};
+
+static const uint8_t tnl_grp_1_bitvals[68] = {
+     3,  9,  3,  6,  3,  5,  4,  4,  4,  8,  4, 10,  4,  1,  4, 11,
+     4,  7,  4, 13,  5, 12,  5, 14,  5, 17,  6, 16,  6, 15,  6, 18,
+     7, 20,  7, 19,  7, 21,  8, 25,  8, 23,  8, 22,  8, 24,  9, 26,
+    10,  3, 10, 29, 10, 30, 10, 27, 10, 28, 11, 31, 12, 32, 13, 33,
+    14, 34, 14,  0,
+};
+
+static const uint8_t tnl_grp_2_bitvals[62] = {
+     2,  1,  3,  6,  3,  5,  3,  7,  4,  9,  4,  8,  4,  4,  5, 10,
+     5, 11,  5, 13,  6, 12,  7, 14,  7, 16,  7, 15,  7, 17,  7, 18,
+     7, 19,  8, 22,  8, 20,  8, 21,  8,  3,  8, 24,  8, 25,  9, 23,
+     9, 26,  9, 27, 10, 28, 11, 29, 12, 31, 13, 30, 13,  0,
+};
+
+static const uint8_t tnl_grp_3_bitvals[56] = {
+     1,  1,  3,  6,  4,  5,  5,  9,  5,  4,  6,  8,  6, 14,  6, 10,
+     6, 21,  6, 13,  6,  7,  6,  3,  6, 16,  6,  2,  6, 18,  6, 17,
+     6, 11,  6, 15,  7, 19,  7, 23,  7, 24,  7, 22,  7, 12,  8, 20,
+     9, 25, 10, 26, 11, 27, 11,  0,
+};
+
+static const uint8_t tnl_grp_4_bitvals[46] = {
+     1,  1,  2,  2,  4,  4,  5,  5,  6,  6,  6,  8,  6,  3,  6, 19,
+     6, 20,  6,  9,  7,  7,  7, 11,  7, 13,  7, 17,  7, 10,  8, 12,
+     8, 15,  8, 14,  8, 21,  8, 18,  9, 16, 10, 22, 10,  0,
+};
+
+static const uint8_t tnl_scf_bitvals[40] = {
+     3,  3,  3,  1,  3,  2,  3,  5,  3,  4,  3,  6,  4,  8,  4,  7,
+     5, 10,  5,  9,  6, 12,  6, 11,  6, 13,  7, 14,  8, 15,  9, 16,
+    10, 17, 11, 18, 12, 19, 12,  0,
+};
+
+static const uint8_t damp_bitvals[14] = {
+     1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  6,  0,
+};
+
+static const uint8_t dph_bitvals[18] = {
+     2,  2,  2,  1,  2,  8,  4,  3,  4,  7,  4,  4,  5,  6,  6,  5,
+     6,  0,
+};
+
+static const uint8_t fst_rsd_amp_bitvals[48] = {
+     3, 13,  3, 15,  3, 16,  3, 14,  4, 12,  4, 10,  4, 11,  4, 17,
+     4, 18,  5, 19,  5,  9,  6,  1,  6,  7,  6,  6,  6,  8,  6,  5,
+     6,  4,  7, 20,  7,  2,  7,  3,  8, 21,  9, 22, 10, 23, 10,  0,
+};
+
+static const uint8_t rsd_apprx_bitvals[12] = {
+     1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  5,  0,
+};
+
+static const uint8_t rsd_amp_bitvals[66] = {
+     2,  3,  3,  2,  3,  5,  3,  4,  3,  1,  4,  7,  4,  6,  5,  9,
+     6,  8,  6, 11,  6, 10,  7, 12,  7, 13,  8, 14,  8, 18,  8, 16,
+     8, 15,  8, 22,  9, 20,  9, 24,  9, 17, 10, 28, 10, 26, 10, 21,
+    10, 23, 11, 30, 11, 19, 12, 25, 12, 32, 13, 36, 13, 29, 13, 34,
+    13,  0,
+};
+
+static const uint8_t avg_g3_bitvals[36] = {
+     2, 15,  2, 16,  2, 17,  4, 14,  4, 18,  5, 12,  5, 13,  6, 10,
+     6, 11,  7, 19,  7,  9,  8, 20,  8,  8,  8,  7,  9, 21, 10,  6,
+    11, 23, 11,  0,
+};
+
+static const uint8_t st_grid_bitvals[44] = {
+     1,  6,  2,  1,  4,  4,  4,  8,  4,  3,  5, 10,  7, 12,  7,  5,
+     8, 14,  9, 16,  9,  7,  9, 18, 10, 11, 10,  9, 10, 20, 10, 22,
+    10,  2, 11, 13, 13, 17, 13, 24, 13, 15, 13,  0,
+};
+
+static const uint8_t grid_2_bitvals[40] = {
+     2,  3,  2,  2,  2,  1,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
+     8,  9,  9, 10, 11, 11, 11, 12, 12, 13, 12, 17, 13, 15, 13, 18,
+    14, 19, 14, 16, 14, 14, 14,  0,
+};
+
+static const uint8_t grid_3_bitvals[26] = {
+     1, 17,  2, 16,  3, 18,  4, 15,  5, 19,  6, 14,  7, 20,  8, 13,
+     9, 21, 10, 12, 11, 22, 12, 11, 12,  0,
+};
+
+static const uint8_t rsd_bitvals[18] = {
+     2,  2,  2,  3,  3,  1,  3,  4,  3,  0,  4,  5,  5,  6,  6,  7,
+     6,  4,
+};
+
+static const uint16_t vlc_offs[80] = {
+        0,   512,   640,   768,  1282,  1794,  2436,  3080,  3770,  4454,  5364,
+     5372,  5380,  5388,  5392,  5396,  5412,  5420,  5428,  5460,  5492,  5508,
+     5572,  5604,  5668,  5796,  5860,  5892,  6412,  6668,  6796,  7308,  7564,
+     7820,  8076,  8620,  9132,  9388,  9910, 10166, 10680, 11196, 11726, 12240,
+    12752, 13298, 13810, 14326, 14840, 15500, 16022, 16540, 17158, 17678, 18264,
+    18796, 19352, 19926, 20468, 21472, 22398, 23014, 23622, 24200, 24748, 25276,
+    25792, 26306, 26826, 26890, 26954, 27468, 27500, 28038, 28554, 29086, 29630,
+    30150, 30214
+};
+
+DCAVLC  ff_dca_vlc_bit_allocation;
+DCAVLC  ff_dca_vlc_transition_mode;
+DCAVLC  ff_dca_vlc_scale_factor;
+DCAVLC  ff_dca_vlc_quant_index[DCA_CODE_BOOKS];
+
+VLC     ff_dca_vlc_tnl_grp[5];
+VLC     ff_dca_vlc_tnl_scf;
+VLC     ff_dca_vlc_damp;
+VLC     ff_dca_vlc_dph;
+VLC     ff_dca_vlc_fst_rsd_amp;
+VLC     ff_dca_vlc_rsd_apprx;
+VLC     ff_dca_vlc_rsd_amp;
+VLC     ff_dca_vlc_avg_g3;
+VLC     ff_dca_vlc_st_grid;
+VLC     ff_dca_vlc_grid_2;
+VLC     ff_dca_vlc_grid_3;
+VLC     ff_dca_vlc_rsd;
+
+av_cold void ff_dca_init_vlcs(void)
+{
+    static VLC_TYPE dca_table[30214][2];
+    static int vlcs_initialized = 0;
+    int i, j, k = 0;
+
+    if (vlcs_initialized)
+        return;
+
+#define DCA_INIT_VLC(vlc, a, b, c, d)                                       \
+    do {                                                                    \
+        vlc.table           = &dca_table[vlc_offs[k]];                      \
+        vlc.table_allocated = vlc_offs[k + 1] - vlc_offs[k];                \
+        init_vlc(&vlc, a, b, c, 1, 1, d, 2, 2, INIT_VLC_USE_NEW_STATIC);    \
+        k++;                                                                \
+    } while (0)
+
+    ff_dca_vlc_bit_allocation.offset    = 1;
+    ff_dca_vlc_bit_allocation.max_depth = 2;
+    for (i = 0; i < 5; i++)
+        DCA_INIT_VLC(ff_dca_vlc_bit_allocation.vlc[i], bitalloc_12_vlc_bits[i], 12,
+                     bitalloc_12_bits[i], bitalloc_12_codes[i]);
+
+    ff_dca_vlc_scale_factor.offset    = -64;
+    ff_dca_vlc_scale_factor.max_depth = 2;
+    for (i = 0; i < 5; i++)
+        DCA_INIT_VLC(ff_dca_vlc_scale_factor.vlc[i], SCALES_VLC_BITS, 129,
+                     scales_bits[i], scales_codes[i]);
+
+    ff_dca_vlc_transition_mode.offset    = 0;
+    ff_dca_vlc_transition_mode.max_depth = 1;
+    for (i = 0; i < 4; i++)
+        DCA_INIT_VLC(ff_dca_vlc_transition_mode.vlc[i], tmode_vlc_bits[i], 4,
+                     tmode_bits[i], tmode_codes[i]);
+
+    for (i = 0; i < DCA_CODE_BOOKS; i++) {
+        ff_dca_vlc_quant_index[i].offset    = bitalloc_offsets[i];
+        ff_dca_vlc_quant_index[i].max_depth = 1 + (i > 4);
+        for (j = 0; bitalloc_codes[i][j]; j++)
+            DCA_INIT_VLC(ff_dca_vlc_quant_index[i].vlc[j], bitalloc_maxbits[i][j],
+                         bitalloc_sizes[i], bitalloc_bits[i][j], bitalloc_codes[i][j]);
+    }
+
+#define LBR_INIT_VLC(vlc, tab, nb_bits)                                 \
+    do {                                                                \
+        vlc.table           = &dca_table[vlc_offs[k]];                  \
+        vlc.table_allocated = vlc_offs[k + 1] - vlc_offs[k];            \
+        ff_init_vlc_sparse(&vlc, nb_bits, FF_ARRAY_ELEMS(tab##_codes),  \
+                           &tab##_bitvals[0], 2, 1,                     \
+                           tab##_codes, 2, 2,                           \
+                           &tab##_bitvals[1], 2, 1,                     \
+                           INIT_VLC_LE | INIT_VLC_USE_NEW_STATIC);      \
+        k++;                                                            \
+    } while (0)
+
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[0],  tnl_grp_0,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[1],  tnl_grp_1,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[2],  tnl_grp_2,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[3],  tnl_grp_3,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[4],  tnl_grp_4,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_scf,     tnl_scf,     9);
+    LBR_INIT_VLC(ff_dca_vlc_damp,        damp,        6);
+    LBR_INIT_VLC(ff_dca_vlc_dph,         dph,         6);
+    LBR_INIT_VLC(ff_dca_vlc_fst_rsd_amp, fst_rsd_amp, 9);
+    LBR_INIT_VLC(ff_dca_vlc_rsd_apprx,   rsd_apprx,   5);
+    LBR_INIT_VLC(ff_dca_vlc_rsd_amp,     rsd_amp,     9);
+    LBR_INIT_VLC(ff_dca_vlc_avg_g3,      avg_g3,      9);
+    LBR_INIT_VLC(ff_dca_vlc_st_grid,     st_grid,     9);
+    LBR_INIT_VLC(ff_dca_vlc_grid_2,      grid_2,      9);
+    LBR_INIT_VLC(ff_dca_vlc_grid_3,      grid_3,      9);
+    LBR_INIT_VLC(ff_dca_vlc_rsd,         rsd,         6);
+
+    vlcs_initialized = 1;
+}
+
+uint32_t ff_dca_vlc_calc_quant_bits(int *values, uint8_t n, uint8_t sel, uint8_t table)
+{
+    uint8_t i, id;
+    uint32_t sum = 0;
+    for (i = 0; i < n; i++) {
+        id = values[i] - bitalloc_offsets[table];
+        av_assert0(id < bitalloc_sizes[table]);
+        sum += bitalloc_bits[table][sel][id];
+    }
+    return sum;
+}
+
+void ff_dca_vlc_enc_quant(PutBitContext *pb, int *values, uint8_t n, uint8_t sel, uint8_t table)
+{
+    uint8_t i, id;
+    for (i = 0; i < n; i++) {
+        id = values[i] - bitalloc_offsets[table];
+        av_assert0(id < bitalloc_sizes[table]);
+        put_bits(pb, bitalloc_bits[table][sel][id], bitalloc_codes[table][sel][id]);
+    }
+}
+
+uint32_t ff_dca_vlc_calc_alloc_bits(int *values, uint8_t n, uint8_t sel)
+{
+    uint8_t i, id;
+    uint32_t sum = 0;
+    for (i = 0; i < n; i++) {
+        id = values[i] - 1;
+        sum += bitalloc_12_bits[sel][id];
+    }
+    return sum;
+}
+
+void ff_dca_vlc_enc_alloc(PutBitContext *pb, int *values, uint8_t n, uint8_t sel)
+{
+    uint8_t i, id;
+    for (i = 0; i < n; i++) {
+        id = values[i] - 1;
+        put_bits(pb, bitalloc_12_bits[sel][id], bitalloc_12_codes[sel][id]);
+    }
+}
diff --git a/libavcodec/dcahuff.h b/libavcodec/dcahuff.h
index 79be493..02b0e37 100644
--- a/libavcodec/dcahuff.h
+++ b/libavcodec/dcahuff.h
@@ -3,1039 +3,63 @@
  * Copyright (C) 2004 Gildas Bazin
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DCAHUFF_H
 #define AVCODEC_DCAHUFF_H
 
-#include <stdint.h>
-#include <stdlib.h>
-
-#define TMODE_COUNT 4
-static const uint8_t tmode_vlc_bits[TMODE_COUNT] = { 3, 3, 3, 2 };
-static const uint16_t tmode_codes[TMODE_COUNT][4] = {
-    { 0x0000, 0x0002, 0x0006, 0x0007 },
-    { 0x0002, 0x0006, 0x0007, 0x0000 },
-    { 0x0006, 0x0007, 0x0000, 0x0002 },
-    { 0x0000, 0x0001, 0x0002, 0x0003 }
-};
-
-static const uint8_t tmode_bits[TMODE_COUNT][4] = {
-    { 1, 2, 3, 3 },
-    { 2, 3, 3, 1 },
-    { 3, 3, 1, 2 },
-    { 2, 2, 2, 2 }
-};
-
-#define BITALLOC_12_COUNT    5
-#define BITALLOC_12_VLC_BITS 9
-static const uint8_t bitalloc_12_vlc_bits[BITALLOC_12_COUNT] = {
-    9, 7, 7, 9, 9
-};
-
-static const uint16_t bitalloc_12_codes[BITALLOC_12_COUNT][12] = {
-    { 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E, 0x00FF, 0x00FE,
-      0x01FB, 0x01FA, 0x01F9, 0x01F8, },
-    { 0x0001, 0x0000, 0x0002, 0x000F, 0x000C, 0x001D, 0x0039, 0x0038,
-      0x0037, 0x0036, 0x0035, 0x0034, },
-    { 0x0000, 0x0007, 0x0005, 0x0004, 0x0002, 0x000D, 0x000C, 0x0006,
-      0x000F, 0x001D, 0x0039, 0x0038, },
-    { 0x0003, 0x0002, 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E,
-      0x007E, 0x00FE, 0x01FF, 0x01FE, },
-    { 0x0001, 0x0000, 0x0002, 0x0006, 0x000E, 0x003F, 0x003D, 0x007C,
-      0x0079, 0x0078, 0x00FB, 0x00FA, }
-};
-
-static const uint8_t bitalloc_12_bits[BITALLOC_12_COUNT][12] = {
-    { 1, 2, 3, 4, 5, 6, 8, 8, 9, 9,  9,  9 },
-    { 1, 2, 3, 5, 5, 6, 7, 7, 7, 7,  7,  7 },
-    { 2, 3, 3, 3, 3, 4, 4, 4, 5, 6,  7,  7 },
-    { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10 },
-    { 1, 2, 3, 4, 5, 7, 7, 8, 8, 8,  9,  9 }
-};
-
-#define SCALES_COUNT    5
-#define SCALES_VLC_BITS 9
-static const uint16_t scales_codes[SCALES_COUNT][129] = {
-    { 0x3AB0, 0x3AB2, 0x3AB4, 0x3AB6, 0x3AB8, 0x3ABA, 0x3ABC, 0x3ABE,
-      0x3AC0, 0x3AC2, 0x3AC4, 0x3AC6, 0x3AC8, 0x3ACA, 0x3ACC, 0x3ACE,
-      0x3AD0, 0x3AD2, 0x3AD4, 0x3AD6, 0x3AD8, 0x3ADA, 0x3ADC, 0x3ADE,
-      0x3AE0, 0x3AE2, 0x3AE4, 0x3AE6, 0x3AE8, 0x3AEA, 0x3AEC, 0x3AEE,
-      0x3AF0, 0x3AF2, 0x3AF4, 0x3AF6, 0x3AF8, 0x3AFA, 0x3AFC, 0x3AFE,
-      0x0540, 0x0542, 0x0544, 0x0546, 0x0548, 0x054A, 0x054C, 0x054E,
-      0x0558, 0x055E, 0x02AD, 0x0154, 0x0754, 0x03A8, 0x0056, 0x0028,
-      0x00E8, 0x004A, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
-      0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x004B,
-      0x00E9, 0x0029, 0x0057, 0x03A9, 0x0755, 0x0155, 0x02AE, 0x055F,
-      0x0559, 0x054F, 0x054D, 0x054B, 0x0549, 0x0547, 0x0545, 0x0543,
-      0x0541, 0x3AFF, 0x3AFD, 0x3AFB, 0x3AF9, 0x3AF7, 0x3AF5, 0x3AF3,
-      0x3AF1, 0x3AEF, 0x3AED, 0x3AEB, 0x3AE9, 0x3AE7, 0x3AE5, 0x3AE3,
-      0x3AE1, 0x3ADF, 0x3ADD, 0x3ADB, 0x3AD9, 0x3AD7, 0x3AD5, 0x3AD3,
-      0x3AD1, 0x3ACF, 0x3ACD, 0x3ACB, 0x3AC9, 0x3AC7, 0x3AC5, 0x3AC3,
-      0x3AC1, 0x3ABF, 0x3ABD, 0x3ABB, 0x3AB9, 0x3AB7, 0x3AB5, 0x3AB3,
-      0x3AB1, },
-    { 0x0F60, 0x0F62, 0x0F64, 0x0F66, 0x0F68, 0x0F6A, 0x0F6C, 0x0F6E,
-      0x0F70, 0x0F72, 0x0F74, 0x0F76, 0x0F78, 0x0F7A, 0x0F7C, 0x0F7E,
-      0x0F80, 0x0F82, 0x0F84, 0x0F86, 0x0F88, 0x0F8A, 0x0F8C, 0x0F8E,
-      0x0F90, 0x0F92, 0x0F94, 0x0F96, 0x0F98, 0x0F9A, 0x0F9C, 0x0F9E,
-      0x0FA0, 0x0FA2, 0x0FA4, 0x0FA6, 0x0FA8, 0x0FAA, 0x0FAC, 0x0FAE,
-      0x0FB0, 0x0FB2, 0x0FB4, 0x0FB6, 0x0FB8, 0x0FBA, 0x0FBC, 0x0FBE,
-      0x07A0, 0x07A2, 0x03D2, 0x01EA, 0x00FC, 0x007F, 0x001C, 0x000C,
-      0x0004, 0x0034, 0x0010, 0x001B, 0x0009, 0x000B, 0x000E, 0x0001,
-      0x0003, 0x0002, 0x000F, 0x000C, 0x000A, 0x0000, 0x0011, 0x0035,
-      0x0005, 0x000D, 0x001D, 0x003C, 0x00FD, 0x01EB, 0x03D3, 0x07A3,
-      0x07A1, 0x0FBF, 0x0FBD, 0x0FBB, 0x0FB9, 0x0FB7, 0x0FB5, 0x0FB3,
-      0x0FB1, 0x0FAF, 0x0FAD, 0x0FAB, 0x0FA9, 0x0FA7, 0x0FA5, 0x0FA3,
-      0x0FA1, 0x0F9F, 0x0F9D, 0x0F9B, 0x0F99, 0x0F97, 0x0F95, 0x0F93,
-      0x0F91, 0x0F8F, 0x0F8D, 0x0F8B, 0x0F89, 0x0F87, 0x0F85, 0x0F83,
-      0x0F81, 0x0F7F, 0x0F7D, 0x0F7B, 0x0F79, 0x0F77, 0x0F75, 0x0F73,
-      0x0F71, 0x0F6F, 0x0F6D, 0x0F6B, 0x0F69, 0x0F67, 0x0F65, 0x0F63,
-      0x0F61, },
-    { 0x51D0, 0x51D2, 0x51D4, 0x51D6, 0x51D8, 0x51DA, 0x51DC, 0x51DE,
-      0x51E0, 0x51E2, 0x51E4, 0x51E6, 0x51E8, 0x51EA, 0x51EC, 0x51EE,
-      0x51F0, 0x51F2, 0x51F4, 0x51F6, 0x51F8, 0x51FA, 0x51FC, 0x51FE,
-      0x70C0, 0x70C2, 0x70C4, 0x70C6, 0x70C8, 0x70CA, 0x70CC, 0x70CE,
-      0x70EC, 0x10EA, 0x3868, 0x3877, 0x0876, 0x1C35, 0x0434, 0x0A34,
-      0x0E1B, 0x021B, 0x051B, 0x070F, 0x010F, 0x0380, 0x0080, 0x0140,
-      0x01C1, 0x0041, 0x00A1, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
-      0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
-      0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
-      0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A2, 0x0042,
-      0x01C2, 0x0141, 0x0081, 0x0381, 0x028C, 0x010C, 0x051C, 0x021C,
-      0x0E1C, 0x0A35, 0x0435, 0x1C3A, 0x0877, 0x0874, 0x3869, 0x10EB,
-      0x70ED, 0x70CF, 0x70CD, 0x70CB, 0x70C9, 0x70C7, 0x70C5, 0x70C3,
-      0x70C1, 0x51FF, 0x51FD, 0x51FB, 0x51F9, 0x51F7, 0x51F5, 0x51F3,
-      0x51F1, 0x51EF, 0x51ED, 0x51EB, 0x51E9, 0x51E7, 0x51E5, 0x51E3,
-      0x51E1, 0x51DF, 0x51DD, 0x51DB, 0x51D9, 0x51D7, 0x51D5, 0x51D3,
-      0x51D1, },
-    { 0x6F64, 0x6F66, 0x6F68, 0x6F6A, 0x6F6C, 0x6F6E, 0x6F70, 0x6F72,
-      0x6F74, 0x6F76, 0x6F78, 0x6F7A, 0x6F7C, 0x6F7E, 0x6F80, 0x6F82,
-      0x6F84, 0x6F86, 0x6F88, 0x6F8A, 0x6F8C, 0x6F8E, 0x6F90, 0x6F92,
-      0x6F94, 0x6F96, 0x6F98, 0x6F9A, 0x6F9C, 0x6F9E, 0x6FA0, 0x6FA2,
-      0x6FA4, 0x6FA6, 0x6FA8, 0x6FAA, 0x6FAC, 0x6FAE, 0x6FB0, 0x6FB2,
-      0x6FB4, 0x6FB6, 0x17B4, 0x37DC, 0x0BDB, 0x1BEF, 0x05EE, 0x0DF8,
-      0x02F8, 0x06FD, 0x017D, 0x037F, 0x00BF, 0x0040, 0x00C0, 0x0021,
-      0x0061, 0x0011, 0x0031, 0x0009, 0x0019, 0x0006, 0x000E, 0x0004,
-      0x0000, 0x0005, 0x000F, 0x0007, 0x001A, 0x000A, 0x0036, 0x0016,
-      0x006E, 0x002E, 0x00C1, 0x0041, 0x01BC, 0x00BC, 0x037A, 0x017A,
-      0x02F9, 0x0DF9, 0x05EF, 0x05EC, 0x1BD8, 0x37DD, 0x17B5, 0x6FB7,
-      0x6FB5, 0x6FB3, 0x6FB1, 0x6FAF, 0x6FAD, 0x6FAB, 0x6FA9, 0x6FA7,
-      0x6FA5, 0x6FA3, 0x6FA1, 0x6F9F, 0x6F9D, 0x6F9B, 0x6F99, 0x6F97,
-      0x6F95, 0x6F93, 0x6F91, 0x6F8F, 0x6F8D, 0x6F8B, 0x6F89, 0x6F87,
-      0x6F85, 0x6F83, 0x6F81, 0x6F7F, 0x6F7D, 0x6F7B, 0x6F79, 0x6F77,
-      0x6F75, 0x6F73, 0x6F71, 0x6F6F, 0x6F6D, 0x6F6B, 0x6F69, 0x6F67,
-      0x6F65, },
-    { 0xDF54, 0xDF56, 0xDFC8, 0xDFCA, 0xDFCC, 0xDFCE, 0xDFD0, 0xDFD2,
-      0xDFD4, 0xDFD6, 0xDFD8, 0xDFDA, 0xDFDC, 0xDFDE, 0xDFE0, 0xDFE2,
-      0x0FE8, 0x2FEA, 0x6FA8, 0x6FF6, 0x07F5, 0x07F7, 0x37D2, 0x37F9,
-      0x03F8, 0x0BF8, 0x0BFB, 0x1BEB, 0x01FA, 0x05FA, 0x09FA, 0x0DFA,
-      0x0DFF, 0x00FF, 0x02FF, 0x06FB, 0x007C, 0x017C, 0x027C, 0x027F,
-      0x003C, 0x00BC, 0x013C, 0x01BC, 0x001C, 0x005C, 0x009C, 0x00DC,
-      0x000C, 0x002C, 0x004C, 0x006C, 0x0004, 0x0014, 0x0024, 0x0034,
-      0x0000, 0x0008, 0x0010, 0x0018, 0x001E, 0x0002, 0x0006, 0x000A,
-      0x000E, 0x000B, 0x0007, 0x0003, 0x001F, 0x0019, 0x0011, 0x0009,
-      0x0001, 0x0035, 0x0025, 0x0015, 0x0005, 0x006D, 0x004D, 0x002D,
-      0x000D, 0x00DD, 0x009D, 0x005D, 0x001D, 0x01BD, 0x013D, 0x00BD,
-      0x003D, 0x037C, 0x027D, 0x017D, 0x007D, 0x06FC, 0x04FC, 0x02FC,
-      0x00FC, 0x0DFB, 0x09FB, 0x05FB, 0x01FB, 0x1BF8, 0x1BE8, 0x0BF9,
-      0x03F9, 0x37FA, 0x37D3, 0x17F4, 0x07F6, 0x6FF7, 0x6FA9, 0x2FEB,
-      0x0FE9, 0xDFE3, 0xDFE1, 0xDFDF, 0xDFDD, 0xDFDB, 0xDFD9, 0xDFD7,
-      0xDFD5, 0xDFD3, 0xDFD1, 0xDFCF, 0xDFCD, 0xDFCB, 0xDFC9, 0xDF57,
-      0xDF55, }
-};
-
-static const uint8_t scales_bits[SCALES_COUNT][129] = {
-    { 14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      13, 13, 13, 13, 13, 13, 13, 13,
-      13, 13, 12, 11, 11, 10,  9,  8,
-       8,  7,  6,  6,  5,  4,  4,  3,
-       2,  3,  3,  4,  5,  5,  6,  7,
-       8,  8,  9, 10, 11, 11, 12, 13,
-      13, 13, 13, 13, 13, 13, 13, 13,
-      13, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, },
-    { 15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      14, 14, 13, 12, 11, 10,  8,  7,
-       6,  6,  5,  5,  4,  4,  4,  3,
-       3,  3,  4,  4,  4,  4,  5,  6,
-       6,  7,  8,  9, 11, 12, 13, 14,
-      14, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, },
-    { 15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 14, 14, 14, 13, 13, 12, 12,
-      12, 11, 11, 11, 10, 10,  9,  9,
-       9,  8,  8,  8,  7,  7,  7,  6,
-       6,  6,  5,  5,  5,  4,  4,  3,
-       3,  3,  4,  4,  5,  5,  5,  6,
-       6,  6,  7,  7,  7,  8,  8,  8,
-       9,  9,  9, 10, 10, 10, 11, 11,
-      12, 12, 12, 13, 13, 13, 14, 14,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, },
-    { 15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 14, 14, 13, 13, 12, 12,
-      11, 11, 10, 10,  9,  8,  8,  7,
-       7,  6,  6,  5,  5,  4,  4,  3,
-       2,  3,  4,  4,  5,  5,  6,  6,
-       7,  7,  8,  8,  9,  9, 10, 10,
-      11, 12, 12, 12, 13, 14, 14, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, },
-    { 16, 16, 16, 16, 16, 16, 16, 16,
-      16, 16, 16, 16, 16, 16, 16, 16,
-      15, 15, 15, 15, 14, 14, 14, 14,
-      13, 13, 13, 13, 12, 12, 12, 12,
-      12, 11, 11, 11, 10, 10, 10, 10,
-       9,  9,  9,  9,  8,  8,  8,  8,
-       7,  7,  7,  7,  6,  6,  6,  6,
-       5,  5,  5,  5,  5,  4,  4,  4,
-       4,  4,  4,  4,  5,  5,  5,  5,
-       5,  6,  6,  6,  6,  7,  7,  7,
-       7,  8,  8,  8,  8,  9,  9,  9,
-       9, 10, 10, 10, 10, 11, 11, 11,
-      11, 12, 12, 12, 12, 13, 13, 13,
-      13, 14, 14, 14, 14, 15, 15, 15,
-      15, 16, 16, 16, 16, 16, 16, 16,
-      16, 16, 16, 16, 16, 16, 16, 16,
-      16,
-    }
-};
-
-static const uint16_t bitalloc_3_codes[3] = {
-    0x0003, 0x0000, 0x0002,
-};
-
-static const uint8_t bitalloc_3_bits[3] = {
-    2,  1,  2,
-};
-
-static const uint16_t bitalloc_5_codes_a[5] = {
-    0x000F, 0x0006, 0x0000, 0x0002, 0x000E,
-};
-
-static const uint16_t bitalloc_5_codes_b[5] = {
-    0x0007, 0x0001, 0x0002, 0x0000, 0x0006,
-};
-
-static const uint16_t bitalloc_5_codes_c[5] = {
-    0x0007, 0x0005, 0x0000, 0x0004, 0x0006,
-};
-
-static const uint8_t bitalloc_5_bits_a[5] = {
-    4,  3,  1,  2,  4,
-};
-
-static const uint8_t bitalloc_5_bits_b[5] = {
-    3,  2,  2,  2,  3,
-};
-
-static const uint8_t bitalloc_5_bits_c[5] = {
-    3,  3,  1,  3,  3,
-};
-
-static const uint16_t bitalloc_7_codes_a[7] = {
-    0x001E, 0x000E, 0x0005, 0x0000, 0x0006, 0x0004, 0x001F,
-};
-
-static const uint16_t bitalloc_7_codes_b[7] = {
-    0x0014, 0x000B, 0x0000, 0x0003, 0x0001, 0x0004, 0x0015,
-};
-
-static const uint16_t bitalloc_7_codes_c[7] = {
-    0x0000, 0x0002, 0x0001, 0x0003, 0x0002, 0x0003, 0x0001,
-};
-
-static const uint8_t bitalloc_7_bits_a[7] = {
-    5,  4,  3,  1,  3,  3,  5,
-};
-
-static const uint8_t bitalloc_7_bits_b[7] = {
-    5,  4,  2,  2,  2,  3,  5,
-};
-
-static const uint8_t bitalloc_7_bits_c[7] = {
-    4,  4,  2,  2,  2,  4,  4,
-};
-
-static const uint16_t bitalloc_9_codes_a[9] = {
-    0x0030, 0x0019, 0x0009, 0x0005, 0x0000, 0x0007, 0x000D, 0x0008,
-    0x0031,
-};
-
-static const uint16_t bitalloc_9_codes_b[9] = {
-    0x0018, 0x001A, 0x0002, 0x0007, 0x0002, 0x0000, 0x0003, 0x001B,
-    0x0019,
-};
-
-static const uint16_t bitalloc_9_codes_c[9] = {
-    0x001C, 0x000F, 0x0002, 0x0007, 0x0002, 0x0000, 0x0006, 0x0006,
-    0x001D,
-};
-
-static const uint8_t bitalloc_9_bits_a[9] = {
-    6,  5,  4,  3,  1,  3,  4,  4,  6,
-};
-
-static const uint8_t bitalloc_9_bits_b[9] = {
-    5,  5,  3,  3,  2,  2,  3,  5,  5,
-};
-
-static const uint8_t bitalloc_9_bits_c[9] = {
-    6,  5,  3,  3,  2,  2,  3,  4,  6,
-};
-
-static const uint16_t bitalloc_13_codes_a[13] = {
-    0x0070, 0x002E, 0x0039, 0x001D, 0x000C, 0x000F, 0x0000, 0x0004,
-    0x000D, 0x000A, 0x0016, 0x002F, 0x0071,
-};
-
-static const uint16_t bitalloc_13_codes_b[13] = {
-    0x0038, 0x0010, 0x001D, 0x0007, 0x000F, 0x0005, 0x0000, 0x0006,
-    0x0002, 0x0009, 0x0006, 0x0011, 0x0039,
-};
-
-static const uint16_t bitalloc_13_codes_c[13] = {
-    0x0004, 0x001A, 0x0003, 0x000E, 0x0000, 0x0003, 0x0005, 0x0004,
-    0x0002, 0x000F, 0x000C, 0x001B, 0x0005,
-};
-
-static const uint8_t bitalloc_13_bits_a[13] = {
-     7,  6,  6,  5,  4,  4,  1,  3,  4,  4,  5,  6,  7,
-};
-
-static const uint8_t bitalloc_13_bits_b[13] = {
-     6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  4,  5,  6,
-};
-
-static const uint8_t bitalloc_13_bits_c[13] = {
-     5,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  5,
-};
-
-static const uint16_t bitalloc_17_codes_a[17] = {
-    0x0154, 0x00AB, 0x002B, 0x000B, 0x0003, 0x000A, 0x0001, 0x0006,
-    0x0001, 0x0007, 0x0004, 0x000B, 0x0000, 0x0004, 0x0014, 0x0054,
-    0x0155,
-};
-
-static const uint16_t bitalloc_17_codes_b[17] = {
-    0x007C, 0x003F, 0x0019, 0x000D, 0x001C, 0x0008, 0x000F, 0x0005,
-    0x0000, 0x0006, 0x0002, 0x0009, 0x001D, 0x000E, 0x001E, 0x0018,
-    0x007D,
-};
-
-static const uint16_t bitalloc_17_codes_c[17] = {
-    0x002C, 0x0017, 0x0005, 0x001C, 0x0003, 0x000A, 0x000F, 0x0003,
-    0x0006, 0x0004, 0x0000, 0x000B, 0x0004, 0x001D, 0x000A, 0x0004,
-    0x002D,
-};
-
-static const uint16_t bitalloc_17_codes_d[17] = {
-    0x0100, 0x0102, 0x0082, 0x0042, 0x0022, 0x0012, 0x000A, 0x0006,
-    0x0000, 0x0007, 0x000B, 0x0013, 0x0023, 0x0043, 0x0083, 0x0103,
-    0x0101,
-};
-
-static const uint16_t bitalloc_17_codes_e[17] = {
-    0x00E8, 0x00F6, 0x0075, 0x0034, 0x003B, 0x001B, 0x001F, 0x0004,
-    0x0000, 0x0005, 0x000C, 0x001C, 0x003C, 0x0035, 0x007A, 0x00F7,
-    0x00E9,
-};
-
-static const uint16_t bitalloc_17_codes_f[17] = {
-    0x0004, 0x0003, 0x001E, 0x0001, 0x0001, 0x000E, 0x0001, 0x0004,
-    0x0006, 0x0005, 0x0002, 0x000F, 0x0006, 0x000E, 0x001F, 0x0000,
-    0x0005,
-};
-
-static const uint16_t bitalloc_17_codes_g[17] = {
-    0x0060, 0x007E, 0x0031, 0x0019, 0x000D, 0x0004, 0x0000, 0x0006,
-    0x0002, 0x0007, 0x0001, 0x0005, 0x000E, 0x001E, 0x003E, 0x007F,
-    0x0061,
-};
-
-static const uint8_t bitalloc_17_bits_a[17] = {
-    12, 11,  9,  7,  5,  4,  3,  3,  2,  3,  3,  4,  4,  6,  8, 10,
-    12,
-};
-
-static const uint8_t bitalloc_17_bits_b[17] = {
-    8,  7,  6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  5,  5,  6,  6,
-    8,
-};
-
-static const uint8_t bitalloc_17_bits_c[17] = {
-    7,  6,  5,  5,  4,  4,  4,  3,  3,  3,  3,  4,  4,  5,  5,  5,
-    7,
-};
-
-static const uint8_t bitalloc_17_bits_d[17] = {
-    9,  9,  8,  7,  6,  5,  4,  3,  1,  3,  4,  5,  6,  7,  8,  9,
-    9,
-};
-
-static const uint8_t bitalloc_17_bits_e[17] = {
-    8,  8,  7,  6,  6,  5,  5,  3,  1,  3,  4,  5,  6,  6,  7,  8,
-    8,
-};
-
-static const uint8_t bitalloc_17_bits_f[17] = {
-    8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  6,  6,
-    8,
-};
-
-static const uint8_t bitalloc_17_bits_g[17] = {
-    8,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,  5,  6,  7,  8,
-    8,
-};
-
-static const uint16_t bitalloc_25_codes_a[25] = {
-    0x2854, 0x142B, 0x050B, 0x0143, 0x00A2, 0x0052, 0x002E, 0x0015,
-    0x0004, 0x000E, 0x0000, 0x0003, 0x0006, 0x0004, 0x0001, 0x000F,
-    0x0005, 0x0016, 0x002F, 0x0053, 0x00A3, 0x00A0, 0x0284, 0x0A14,
-    0x2855,
-};
-
-static const uint16_t bitalloc_25_codes_b[25] = {
-    0x001C, 0x000F, 0x0005, 0x0000, 0x0030, 0x0036, 0x000E, 0x0019,
-    0x0001, 0x0008, 0x000E, 0x0001, 0x0005, 0x0002, 0x000F, 0x0009,
-    0x0006, 0x001A, 0x000F, 0x0037, 0x0031, 0x0001, 0x0006, 0x0004,
-    0x001D,
-};
-
-static const uint16_t bitalloc_25_codes_c[25] = {
-    0x004C, 0x0027, 0x006D, 0x0028, 0x0037, 0x000E, 0x0015, 0x0000,
-    0x0005, 0x0008, 0x000B, 0x000E, 0x0001, 0x000F, 0x000C, 0x0009,
-    0x0006, 0x0001, 0x001A, 0x000F, 0x0008, 0x0029, 0x0012, 0x006C,
-    0x004D,
-};
-
-static const uint16_t bitalloc_25_codes_d[25] = {
-    0x0780, 0x0782, 0x03C2, 0x01E2, 0x00FE, 0x0079, 0x003D, 0x001C,
-    0x000C, 0x0004, 0x0000, 0x0006, 0x0002, 0x0007, 0x0001, 0x0005,
-    0x000D, 0x001D, 0x003E, 0x007E, 0x00FF, 0x01E3, 0x03C3, 0x0783,
-    0x0781,
-};
-
-static const uint16_t bitalloc_25_codes_e[25] = {
-    0x003C, 0x0092, 0x0018, 0x001F, 0x004E, 0x000D, 0x0025, 0x0004,
-    0x0010, 0x0000, 0x000A, 0x0002, 0x0003, 0x0003, 0x000B, 0x0001,
-    0x0011, 0x0005, 0x0026, 0x000E, 0x004F, 0x0048, 0x0019, 0x0093,
-    0x003D,
-};
-
-static const uint16_t bitalloc_25_codes_f[25] = {
-    0x0324, 0x0193, 0x00CE, 0x0065, 0x0024, 0x000C, 0x0013, 0x0004,
-    0x0007, 0x000A, 0x000D, 0x000F, 0x0001, 0x0000, 0x000E, 0x000B,
-    0x0008, 0x0005, 0x0018, 0x000D, 0x0025, 0x0066, 0x00CF, 0x00C8,
-    0x0325,
-};
-
-static const uint16_t bitalloc_25_codes_g[25] = {
-    0x03A8, 0x03AE, 0x01D5, 0x0094, 0x0014, 0x004B, 0x000B, 0x003B,
-    0x0013, 0x0003, 0x000F, 0x0005, 0x0001, 0x0006, 0x0000, 0x0008,
-    0x001C, 0x0004, 0x0024, 0x0074, 0x0015, 0x0095, 0x01D6, 0x03AF,
-    0x03A9,
-};
-
-static const uint8_t bitalloc_25_bits_a[25] = {
-    14, 13, 11,  9,  8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,
-     4,  5,  6,  7,  8,  8, 10, 12, 14,
-};
-
-static const uint8_t bitalloc_25_bits_b[25] = {
-    9,  8,  7,  6,  6,  6,  5,  5,  4,  4,  4,  3,  3,  3,  4,  4,
-    4,  5,  5,  6,  6,  6,  7,  7,  9,
-};
-
-static const uint8_t bitalloc_25_bits_c[25] = {
-    8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  4,  4,  3,  4,  4,  4,
-    4,  4,  5,  5,  5,  6,  6,  7,  8,
-};
-
-static const uint8_t bitalloc_25_bits_d[25] = {
-    12, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,
-     5,  6,  7,  8,  9, 10, 11, 12, 12,
-};
-
-static const uint8_t bitalloc_25_bits_e[25] = {
-    8,  8,  7,  7,  7,  6,  6,  5,  5,  4,  4,  3,  2,  3,  4,  4,
-    5,  5,  6,  6,  7,  7,  7,  8,  8,
-};
-
-static const uint8_t bitalloc_25_bits_f[25] = {
-    10,  9,  8,  7,  6,  5,  5,  4,  4,  4,  4,  4,  3,  3,  4,  4,
-     4,  4,  5,  5,  6,  7,  8,  8, 10,
-};
-
-static const uint8_t bitalloc_25_bits_g[25] = {
-    10, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,  2,  3,  3,  4,
-     5,  5,  6,  7,  7,  8,  9, 10, 10,
-};
-
-static const uint16_t bitalloc_33_codes_a[33] = {
-    0x1580, 0x1582, 0x0AC2, 0x0562, 0x02B2, 0x015E, 0x00AD, 0x0054,
-    0x001C, 0x003C, 0x000F, 0x001F, 0x0008, 0x000B, 0x000D, 0x0000,
-    0x0002, 0x0001, 0x000E, 0x000C, 0x0009, 0x0006, 0x0014, 0x003D,
-    0x001D, 0x0055, 0x00AE, 0x015F, 0x02B3, 0x0563, 0x0AC3, 0x1583,
-    0x1581,
-};
-
-static const uint16_t bitalloc_33_codes_b[33] = {
-    0x030C, 0x0187, 0x006D, 0x0028, 0x0037, 0x0066, 0x0015, 0x0031,
-    0x0000, 0x000B, 0x0012, 0x001A, 0x0001, 0x0007, 0x000A, 0x000E,
-    0x0001, 0x000F, 0x000B, 0x0008, 0x0004, 0x001B, 0x0013, 0x000C,
-    0x0001, 0x0032, 0x001A, 0x0067, 0x0060, 0x0029, 0x00C2, 0x006C,
-    0x030D,
-};
-
-static const uint16_t bitalloc_33_codes_c[33] = {
-    0x00CC, 0x0067, 0x0005, 0x0070, 0x0003, 0x001A, 0x0039, 0x003F,
-    0x000A, 0x0012, 0x0018, 0x001D, 0x0001, 0x0003, 0x0007, 0x000A,
-    0x000D, 0x000B, 0x0008, 0x0004, 0x0002, 0x001E, 0x0019, 0x0013,
-    0x000B, 0x0000, 0x003E, 0x001B, 0x0018, 0x0071, 0x0032, 0x0004,
-    0x00CD,
-};
-
-static const uint16_t bitalloc_33_codes_d[33] = {
-    0x3AF8, 0x3AFA, 0x1D7E, 0x0EBC, 0x075C, 0x03AC, 0x01D4, 0x0094,
-    0x0014, 0x004B, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
-    0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x0074,
-    0x0015, 0x0095, 0x01D5, 0x03AD, 0x075D, 0x0EBD, 0x1D7F, 0x3AFB,
-    0x3AF9,
-};
-
-static const uint16_t bitalloc_33_codes_e[33] = {
-    0x01C8, 0x01E6, 0x0064, 0x00E2, 0x00E5, 0x0030, 0x0033, 0x0073,
-    0x007A, 0x001A, 0x003A, 0x0002, 0x001A, 0x001F, 0x0007, 0x0001,
-    0x0002, 0x0002, 0x000C, 0x0000, 0x001B, 0x0003, 0x003B, 0x001B,
-    0x007B, 0x0078, 0x0070, 0x0031, 0x00F2, 0x00E3, 0x0065, 0x01E7,
-    0x01C9,
-};
-
-static const uint16_t bitalloc_33_codes_f[33] = {
-    0x0724, 0x0393, 0x01CE, 0x00E5, 0x002C, 0x0008, 0x0017, 0x003E,
-    0x0005, 0x0014, 0x001D, 0x0000, 0x0003, 0x0006, 0x0008, 0x000B,
-    0x000D, 0x000C, 0x0009, 0x0007, 0x0004, 0x0001, 0x001E, 0x0015,
-    0x000A, 0x003F, 0x0038, 0x0009, 0x002D, 0x00E6, 0x01CF, 0x01C8,
-    0x0725,
-};
-
-static const uint16_t bitalloc_33_codes_g[33] = {
-    0x0284, 0x0042, 0x0140, 0x0143, 0x003E, 0x00BE, 0x0011, 0x0051,
-    0x0009, 0x0029, 0x0005, 0x0015, 0x0000, 0x0008, 0x000E, 0x0002,
-    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0016, 0x0006, 0x002E,
-    0x000E, 0x005E, 0x001E, 0x00BF, 0x003F, 0x0020, 0x0141, 0x0043,
-    0x0285,
-};
-
-static const uint8_t bitalloc_33_bits_a[33] = {
-    13, 13, 12, 11, 10,  9,  8,  7,  6,  6,  5,  5,  4,  4,  4,  3,
-     3,  3,  4,  4,  4,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12, 13,
-    13,
-};
-
-static const uint8_t bitalloc_33_bits_b[33] = {
-    10,  9,  8,  7,  7,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
-     3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  7,  7,  7,  8,  8,
-    10,
-};
-
-static const uint8_t bitalloc_33_bits_c[33] = {
-    9,  8,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
-    4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,  7,
-    9,
-};
-
-static const uint8_t bitalloc_33_bits_d[33] = {
-    14, 14, 13, 12, 11, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,
-     2,  3,  3,  4,  5,  5,  6,  7,  7,  8,  9, 10, 11, 12, 13, 14,
-    14,
-};
-
-static const uint8_t bitalloc_33_bits_e[33] = {
-    9,  9,  8,  8,  8,  7,  7,  7,  7,  6,  6,  5,  5,  5,  4,  3,
-    2,  3,  4,  4,  5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,
-    9,
-};
-
-static const uint8_t bitalloc_33_bits_f[33] = {
-    11, 10,  9,  8,  7,  6,  6,  6,  5,  5,  5,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  8,  9,  9,
-    11,
-};
-
-static const uint8_t bitalloc_33_bits_g[33] = {
-    10,  9,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
-     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  9,  9,
-    10,
-};
-
-static const uint16_t bitalloc_65_codes_a[65] = {
-    0x9E5C, 0x9E5E, 0x4F2C, 0x2794, 0x13C4, 0x1E44, 0x09E3, 0x0F23,
-    0x04F3, 0x0792, 0x027E, 0x03CE, 0x013D, 0x01E5, 0x009C, 0x00CC,
-    0x0040, 0x0058, 0x0067, 0x001E, 0x0021, 0x002D, 0x003D, 0x0007,
-    0x0011, 0x0014, 0x0017, 0x001A, 0x001C, 0x001F, 0x0001, 0x0004,
-    0x0006, 0x0005, 0x0002, 0x0000, 0x001D, 0x001B, 0x0018, 0x0015,
-    0x0012, 0x000E, 0x0006, 0x0032, 0x0026, 0x001F, 0x0078, 0x0059,
-    0x0041, 0x00CD, 0x009D, 0x01E6, 0x013E, 0x03CF, 0x027F, 0x0793,
-    0x0790, 0x04F0, 0x09E4, 0x1E45, 0x13C5, 0x2795, 0x4F2D, 0x9E5F,
-    0x9E5D,
-};
-
-static const uint16_t bitalloc_65_codes_b[65] = {
-    0x0A8C, 0x0547, 0x01B5, 0x0008, 0x00DB, 0x0152, 0x0005, 0x000B,
-    0x008E, 0x00AE, 0x00E4, 0x0003, 0x0037, 0x0039, 0x0055, 0x006C,
-    0x0073, 0x0003, 0x0015, 0x001D, 0x0028, 0x0030, 0x0037, 0x003E,
-    0x0006, 0x000B, 0x000F, 0x0012, 0x0016, 0x0019, 0x001D, 0x0001,
-    0x0004, 0x0002, 0x001E, 0x001A, 0x0017, 0x0013, 0x0010, 0x000C,
-    0x0007, 0x003F, 0x0038, 0x0031, 0x0029, 0x0022, 0x001A, 0x0014,
-    0x0000, 0x006D, 0x0056, 0x0046, 0x0038, 0x0004, 0x00E5, 0x00AF,
-    0x008F, 0x006C, 0x000A, 0x0153, 0x0150, 0x0009, 0x02A2, 0x01B4,
-    0x0A8D,
-};
-
-static const uint16_t bitalloc_65_codes_c[65] = {
-    0x045C, 0x022F, 0x03F5, 0x01BC, 0x01FB, 0x0059, 0x00D0, 0x00DF,
-    0x000A, 0x002D, 0x002F, 0x0052, 0x0069, 0x0078, 0x007F, 0x000A,
-    0x0010, 0x001C, 0x0023, 0x002A, 0x0035, 0x003A, 0x003D, 0x0000,
-    0x0003, 0x0006, 0x0009, 0x000C, 0x000F, 0x0012, 0x0016, 0x0018,
-    0x001C, 0x0019, 0x0017, 0x0013, 0x0010, 0x000D, 0x000A, 0x0007,
-    0x0004, 0x0001, 0x003E, 0x003B, 0x0036, 0x002B, 0x0028, 0x001D,
-    0x0011, 0x000B, 0x0004, 0x0079, 0x006E, 0x0053, 0x0044, 0x002E,
-    0x000B, 0x00FC, 0x00D1, 0x008A, 0x0058, 0x01BD, 0x0116, 0x03F4,
-    0x045D,
-};
-
-static const uint16_t bitalloc_65_codes_d[65] = {
-    0x70B0, 0x70B2, 0x70B4, 0x2852, 0x385B, 0x142E, 0x1C2E, 0x0A15,
-    0x0E14, 0x0214, 0x0704, 0x0104, 0x010B, 0x0383, 0x0083, 0x0143,
-    0x01C3, 0x0043, 0x00A2, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
-    0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
-    0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
-    0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A3, 0x00A0,
-    0x0040, 0x01C0, 0x0084, 0x0384, 0x0284, 0x0105, 0x0705, 0x0215,
-    0x0E15, 0x0A16, 0x1C2F, 0x142F, 0x1428, 0x2853, 0x70B5, 0x70B3,
-    0x70B1,
-};
-
-static const uint16_t bitalloc_65_codes_e[65] = {
-    0x032C, 0x0332, 0x0378, 0x037E, 0x008C, 0x014A, 0x0188, 0x0197,
-    0x019E, 0x01BD, 0x0044, 0x0047, 0x00AA, 0x00C5, 0x00CD, 0x00DC,
-    0x001C, 0x002C, 0x0053, 0x0063, 0x0068, 0x0008, 0x000F, 0x0017,
-    0x002B, 0x0035, 0x0005, 0x0009, 0x0016, 0x001C, 0x0006, 0x000F,
-    0x0004, 0x0000, 0x0007, 0x001D, 0x0017, 0x000A, 0x0006, 0x0036,
-    0x0030, 0x0028, 0x0010, 0x0009, 0x0069, 0x0064, 0x0054, 0x002D,
-    0x001D, 0x00DD, 0x00CE, 0x00CA, 0x00AB, 0x00A4, 0x0045, 0x01BE,
-    0x019F, 0x0198, 0x0189, 0x014B, 0x008D, 0x037F, 0x0379, 0x0333,
-    0x032D,
-};
-
-static const uint16_t bitalloc_65_codes_f[65] = {
-    0x0FE0, 0x0FE2, 0x0FE8, 0x0FEA, 0x0FEC, 0x0FEE, 0x0FF0, 0x0FF2,
-    0x0FF4, 0x2FF2, 0x07F2, 0x07FB, 0x03F6, 0x0BFA, 0x0BFD, 0x01FF,
-    0x05FF, 0x02FC, 0x007C, 0x017C, 0x003C, 0x00BC, 0x001C, 0x005C,
-    0x000C, 0x002C, 0x0004, 0x0014, 0x0000, 0x0008, 0x000E, 0x0002,
-    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0015, 0x0005, 0x002D,
-    0x000D, 0x005D, 0x001D, 0x00BD, 0x003D, 0x017D, 0x007D, 0x02FD,
-    0x00FC, 0x05FC, 0x01FA, 0x0BFB, 0x03F7, 0x17F8, 0x07F3, 0x2FF3,
-    0x0FF5, 0x0FF3, 0x0FF1, 0x0FEF, 0x0FED, 0x0FEB, 0x0FE9, 0x0FE3,
-    0x0FE1,
-};
-
-static const uint16_t bitalloc_65_codes_g[65] = {
-    0x010C, 0x038A, 0x0608, 0x0786, 0x0084, 0x0087, 0x0302, 0x0305,
-    0x0040, 0x00E0, 0x00E3, 0x0183, 0x001E, 0x005E, 0x009E, 0x00DE,
-    0x00F1, 0x0011, 0x0039, 0x0061, 0x0079, 0x0009, 0x001D, 0x0031,
-    0x003D, 0x0005, 0x000F, 0x0019, 0x001F, 0x0003, 0x0006, 0x000A,
-    0x000E, 0x000B, 0x0008, 0x0004, 0x0000, 0x001A, 0x0012, 0x000A,
-    0x0002, 0x0036, 0x0026, 0x0016, 0x0006, 0x006E, 0x004E, 0x002E,
-    0x000E, 0x00DF, 0x009F, 0x005F, 0x001F, 0x01E0, 0x0180, 0x00E1,
-    0x0041, 0x03C2, 0x0303, 0x01C4, 0x0085, 0x0787, 0x0609, 0x038B,
-    0x010D,
-};
-
-static const uint8_t bitalloc_65_bits_a[65] = {
-    16, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  8,  8,
-     7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,  4,
-     4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,
-     7,  8,  8,  9,  9, 10, 10, 11, 11, 11, 12, 13, 13, 14, 15, 16,
-    16,
-};
-
-static const uint8_t bitalloc_65_bits_b[65] = {
-    12, 11, 10,  9,  9,  9,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
-     7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,
-     4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,
-     6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9, 10, 10,
-    12,
-};
-
-static const uint8_t bitalloc_65_bits_c[65] = {
-    11, 10, 10,  9,  9,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  6,
-     6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10,
-    11,
-};
-
-static const uint8_t bitalloc_65_bits_d[65] = {
-    15, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,  9,  9,
-     9,  8,  8,  8,  7,  7,  7,  6,  6,  6,  5,  5,  5,  4,  4,  3,
-     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,
-     8,  9,  9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 15, 15,
-    15,
-};
-
-static const uint8_t bitalloc_65_bits_e[65] = {
-    10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,
-     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,
-     3,  3,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,  7,
-     7,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10,
-    10,
-};
-
-static const uint8_t bitalloc_65_bits_f[65] = {
-    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11,
-    11, 10,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
-     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10,
-    10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-    14,
-};
-
-static const uint8_t bitalloc_65_bits_g[65] = {
-    11, 11, 11, 11, 10, 10, 10, 10,  9,  9,  9,  9,  8,  8,  8,  8,
-     8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,
-     4,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,
-     7,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
-    11,
-};
-
-static const uint16_t bitalloc_129_codes_a[129] = {
-    0x0660, 0x0666, 0x06EC, 0x0722, 0x0760, 0x076E, 0x004C, 0x004E,
-    0x00F4, 0x010A, 0x0148, 0x0156, 0x01D4, 0x01F2, 0x0331, 0x0370,
-    0x0377, 0x0396, 0x03B1, 0x0024, 0x0064, 0x007B, 0x008A, 0x00A5,
-    0x00D4, 0x00EB, 0x00FA, 0x019A, 0x01B9, 0x01C9, 0x01D9, 0x0010,
-    0x0030, 0x0033, 0x0043, 0x0053, 0x006B, 0x007A, 0x00CA, 0x00D2,
-    0x00DE, 0x00E6, 0x00F6, 0x000E, 0x001F, 0x0023, 0x002B, 0x003B,
-    0x003F, 0x0067, 0x0070, 0x0077, 0x0005, 0x000D, 0x0012, 0x001B,
-    0x002C, 0x0035, 0x003A, 0x0004, 0x000B, 0x0017, 0x001F, 0x0009,
-    0x0008, 0x000A, 0x0000, 0x0018, 0x000C, 0x0005, 0x003C, 0x0036,
-    0x002D, 0x001C, 0x0013, 0x000E, 0x0006, 0x007A, 0x0071, 0x0068,
-    0x0064, 0x003C, 0x0034, 0x0028, 0x0020, 0x000F, 0x00F7, 0x00E7,
-    0x00DF, 0x00D3, 0x00CB, 0x007B, 0x0074, 0x0054, 0x0044, 0x003C,
-    0x0031, 0x0011, 0x01DA, 0x01CA, 0x01BA, 0x019B, 0x00FB, 0x00F8,
-    0x00D5, 0x00AA, 0x008B, 0x0084, 0x0065, 0x0025, 0x03B6, 0x0397,
-    0x0390, 0x0371, 0x0332, 0x01F3, 0x01D5, 0x0157, 0x0149, 0x010B,
-    0x00F5, 0x004F, 0x004D, 0x076F, 0x0761, 0x0723, 0x06ED, 0x0667,
-    0x0661,
-};
-
-static const uint16_t bitalloc_129_codes_b[129] = {
-    0x29DC, 0x14EF, 0x0455, 0x0E9C, 0x022B, 0x0489, 0x0740, 0x074F,
-    0x0172, 0x0245, 0x0247, 0x030A, 0x03A1, 0x001C, 0x008B, 0x00D6,
-    0x010C, 0x0148, 0x014F, 0x0186, 0x01D1, 0x0008, 0x000F, 0x0046,
-    0x005D, 0x0078, 0x0087, 0x0096, 0x00A5, 0x00BC, 0x00D8, 0x00DE,
-    0x00F6, 0x0005, 0x0014, 0x0024, 0x002F, 0x003A, 0x003D, 0x0049,
-    0x0050, 0x0058, 0x005F, 0x0066, 0x006D, 0x0075, 0x007C, 0x0004,
-    0x000B, 0x0013, 0x0018, 0x001B, 0x001F, 0x0022, 0x0026, 0x002A,
-    0x002D, 0x0031, 0x0034, 0x0038, 0x003B, 0x003F, 0x0003, 0x0006,
-    0x000A, 0x0007, 0x0004, 0x0000, 0x003C, 0x0039, 0x0035, 0x0032,
-    0x002E, 0x002B, 0x0027, 0x0023, 0x0020, 0x001C, 0x0019, 0x0016,
-    0x0010, 0x0005, 0x007D, 0x007A, 0x006E, 0x0067, 0x0060, 0x0059,
-    0x0051, 0x004A, 0x0042, 0x003B, 0x0034, 0x0025, 0x0015, 0x0006,
-    0x00F7, 0x00DF, 0x00D9, 0x00BD, 0x00A6, 0x0097, 0x0090, 0x0079,
-    0x006A, 0x0047, 0x0044, 0x0009, 0x01D2, 0x0187, 0x0184, 0x0149,
-    0x010D, 0x00D7, 0x00B8, 0x001D, 0x03A6, 0x030B, 0x029C, 0x0246,
-    0x0173, 0x0114, 0x0741, 0x053A, 0x0488, 0x0E9D, 0x0A76, 0x0454,
-    0x29DD,
-};
-
-static const uint16_t bitalloc_129_codes_c[129] = {
-    0x0E5C, 0x072F, 0x001D, 0x0724, 0x000F, 0x010D, 0x0324, 0x0393,
-    0x03E9, 0x0080, 0x0087, 0x00FA, 0x0164, 0x0193, 0x01DE, 0x01F5,
-    0x0010, 0x002A, 0x0041, 0x0064, 0x0073, 0x008E, 0x00A4, 0x00B3,
-    0x00D6, 0x00E5, 0x00F4, 0x00FB, 0x0002, 0x0009, 0x0013, 0x001E,
-    0x0026, 0x002C, 0x0033, 0x003F, 0x0041, 0x004C, 0x0053, 0x005E,
-    0x0065, 0x0070, 0x0073, 0x0078, 0x007B, 0x007E, 0x0002, 0x0005,
-    0x0007, 0x000B, 0x000D, 0x0011, 0x0014, 0x0017, 0x001A, 0x001D,
-    0x0021, 0x0024, 0x0027, 0x002A, 0x002D, 0x0030, 0x0033, 0x0036,
-    0x003A, 0x0037, 0x0034, 0x0031, 0x002E, 0x002B, 0x0028, 0x0025,
-    0x0022, 0x001E, 0x001B, 0x0018, 0x0015, 0x0012, 0x000E, 0x000C,
-    0x0008, 0x0006, 0x0003, 0x007F, 0x007C, 0x0079, 0x0076, 0x0071,
-    0x006A, 0x005F, 0x0058, 0x004D, 0x0046, 0x0040, 0x0038, 0x002D,
-    0x0027, 0x001F, 0x0014, 0x0012, 0x0003, 0x0000, 0x00F5, 0x00EE,
-    0x00D7, 0x00C8, 0x00A5, 0x008F, 0x007C, 0x0065, 0x0042, 0x002B,
-    0x0011, 0x0002, 0x01DF, 0x01C8, 0x0165, 0x00FB, 0x00E4, 0x0081,
-    0x0006, 0x03E8, 0x0325, 0x01CA, 0x010C, 0x0725, 0x0396, 0x001C,
-    0x0E5D,
-};
-
-static const uint16_t bitalloc_129_codes_d[129] = {
-    0xA598, 0xA59A, 0xA59C, 0xA59E, 0xC598, 0xE586, 0x3ACC, 0x52CA,
-    0x62CD, 0x0D48, 0x1D67, 0x2978, 0x3167, 0x3966, 0x06A5, 0x0EBC,
-    0x14BD, 0x1CB1, 0x0350, 0x0353, 0x075F, 0x0A5F, 0x0C5E, 0x0E5E,
-    0x01AE, 0x03AD, 0x052D, 0x062D, 0x072D, 0x00D5, 0x01D4, 0x0294,
-    0x0314, 0x0394, 0x0014, 0x0094, 0x0114, 0x0174, 0x01B4, 0x01F4,
-    0x000B, 0x004B, 0x008B, 0x00BB, 0x00DB, 0x00FB, 0x001B, 0x003B,
-    0x0053, 0x0063, 0x0073, 0x0003, 0x0013, 0x0023, 0x002F, 0x0037,
-    0x003F, 0x0007, 0x000F, 0x0015, 0x0019, 0x001D, 0x0001, 0x0005,
-    0x0009, 0x0006, 0x0002, 0x001E, 0x001A, 0x0016, 0x0010, 0x0008,
-    0x0000, 0x0038, 0x0030, 0x0028, 0x001C, 0x000C, 0x007C, 0x006C,
-    0x005C, 0x0044, 0x0024, 0x0004, 0x00E4, 0x00C4, 0x00A4, 0x0074,
-    0x0034, 0x01F5, 0x01B5, 0x0175, 0x0115, 0x0095, 0x0015, 0x0395,
-    0x0315, 0x0295, 0x01D5, 0x00D6, 0x072E, 0x062E, 0x052E, 0x03AE,
-    0x01AF, 0x0E5F, 0x0C5F, 0x0C58, 0x0A58, 0x0758, 0x0351, 0x1CB2,
-    0x18B2, 0x0EBD, 0x0EB2, 0x3967, 0x3960, 0x2979, 0x2964, 0x0D49,
-    0x72C2, 0x52CB, 0x3ACD, 0xE587, 0xC599, 0xA59F, 0xA59D, 0xA59B,
-    0xA599,
-};
-
-static const uint16_t bitalloc_129_codes_e[129] = {
-    0xA13C, 0xC720, 0xA13F, 0xA13E, 0xA13D, 0xE722, 0x5090, 0x6393,
-    0x7392, 0x2849, 0x31CE, 0x39CE, 0x1425, 0x18E5, 0x1CE5, 0x0844,
-    0x0A1C, 0x0C7C, 0x036C, 0x0423, 0x050F, 0x063F, 0x01B7, 0x0216,
-    0x0285, 0x031D, 0x039D, 0x0109, 0x0140, 0x0180, 0x01C8, 0x01CF,
-    0x007A, 0x008A, 0x00A2, 0x00C1, 0x00E5, 0x0014, 0x0037, 0x0043,
-    0x004E, 0x0056, 0x0061, 0x006C, 0x007C, 0x000B, 0x001C, 0x001F,
-    0x0023, 0x0025, 0x0029, 0x002C, 0x002E, 0x0032, 0x0034, 0x0037,
-    0x003A, 0x003C, 0x003F, 0x0001, 0x0003, 0x0006, 0x0008, 0x000A,
-    0x000C, 0x000B, 0x0009, 0x0007, 0x0004, 0x0002, 0x0000, 0x003D,
-    0x003B, 0x0038, 0x0035, 0x0033, 0x002F, 0x002D, 0x002A, 0x0026,
-    0x0024, 0x0020, 0x001D, 0x001A, 0x007D, 0x006D, 0x0062, 0x0057,
-    0x004F, 0x0044, 0x003C, 0x0015, 0x00E6, 0x00C6, 0x00A3, 0x008B,
-    0x007B, 0x006C, 0x01C9, 0x0181, 0x0141, 0x010A, 0x00DA, 0x031E,
-    0x0286, 0x0217, 0x0210, 0x0738, 0x0638, 0x0508, 0x036D, 0x0C7D,
-    0x0A1D, 0x0845, 0x1CE6, 0x18E6, 0x1426, 0x39CF, 0x31CF, 0x284E,
-    0x7393, 0x7390, 0x5091, 0xE723, 0xC724, 0xC725, 0xC722, 0xC723,
-    0xC721,
-};
-
-static const uint16_t bitalloc_129_codes_f[129] = {
-    0x762C, 0x3B17, 0x1555, 0x0608, 0x0AAB, 0x0FF2, 0x0305, 0x0307,
-    0x0763, 0x0046, 0x010C, 0x01BC, 0x02AB, 0x03B6, 0x03FD, 0x0080,
-    0x0087, 0x00DF, 0x0156, 0x01D9, 0x01F8, 0x01FF, 0x002A, 0x0041,
-    0x0061, 0x0094, 0x00D4, 0x00EA, 0x00F2, 0x00FD, 0x0009, 0x000B,
-    0x001A, 0x0026, 0x0031, 0x0040, 0x004B, 0x006B, 0x0073, 0x0077,
-    0x007A, 0x007C, 0x0000, 0x0002, 0x0006, 0x0008, 0x000B, 0x000E,
-    0x0011, 0x0014, 0x0016, 0x0019, 0x001C, 0x001E, 0x0021, 0x0023,
-    0x0026, 0x0028, 0x002B, 0x002D, 0x002F, 0x0031, 0x0033, 0x0036,
-    0x0038, 0x0037, 0x0034, 0x0032, 0x0030, 0x002E, 0x002C, 0x0029,
-    0x0027, 0x0024, 0x0022, 0x001F, 0x001D, 0x001A, 0x0017, 0x0015,
-    0x0012, 0x000F, 0x000C, 0x0009, 0x0007, 0x0003, 0x0001, 0x007D,
-    0x007B, 0x0078, 0x0074, 0x0072, 0x0054, 0x0041, 0x0036, 0x0027,
-    0x001B, 0x0014, 0x000A, 0x00FE, 0x00F3, 0x00EB, 0x00D5, 0x0095,
-    0x006E, 0x0042, 0x002B, 0x0010, 0x01F9, 0x01DA, 0x0157, 0x0154,
-    0x00C0, 0x0081, 0x0022, 0x03B7, 0x03B0, 0x01BD, 0x010D, 0x0047,
-    0x07F8, 0x0554, 0x0306, 0x0FF3, 0x0EC4, 0x0609, 0x1D8A, 0x1554,
-    0x762D,
-};
-
-static const uint16_t bitalloc_129_codes_g[129] = {
-    0x1E20, 0x1E5E, 0x031C, 0x051A, 0x0718, 0x0916, 0x0B14, 0x0D12,
-    0x0F11, 0x0090, 0x018F, 0x028E, 0x038D, 0x048C, 0x058B, 0x068A,
-    0x0789, 0x0049, 0x00C8, 0x0148, 0x01C7, 0x0247, 0x02C6, 0x0346,
-    0x03C5, 0x0025, 0x0065, 0x00A5, 0x00E4, 0x0124, 0x0164, 0x01A4,
-    0x01E3, 0x0013, 0x0033, 0x0053, 0x0073, 0x0093, 0x00B3, 0x00D3,
-    0x00F3, 0x000A, 0x001A, 0x002A, 0x003A, 0x004A, 0x005A, 0x006A,
-    0x007A, 0x0006, 0x000E, 0x0016, 0x001E, 0x0026, 0x002E, 0x0036,
-    0x003E, 0x0004, 0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C,
-    0x0000, 0x001D, 0x0019, 0x0015, 0x0011, 0x000D, 0x0009, 0x0005,
-    0x003F, 0x0037, 0x002F, 0x0027, 0x001F, 0x0017, 0x000F, 0x0007,
-    0x007B, 0x006B, 0x005B, 0x004B, 0x003B, 0x002B, 0x001B, 0x000B,
-    0x0008, 0x00F0, 0x00D0, 0x00B0, 0x0090, 0x0070, 0x0050, 0x0030,
-    0x01E4, 0x01A5, 0x0165, 0x0125, 0x00E5, 0x00E2, 0x00A2, 0x0062,
-    0x03CA, 0x0347, 0x02C7, 0x02C4, 0x0244, 0x0149, 0x00C9, 0x00C6,
-    0x0796, 0x068B, 0x0688, 0x048D, 0x048A, 0x028F, 0x028C, 0x0091,
-    0x0F2E, 0x0D13, 0x0B15, 0x0917, 0x0719, 0x051B, 0x031D, 0x1E5F,
-    0x1E21,
-};
-
-static const uint8_t bitalloc_129_bits_a[129] = {
-    11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-    10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,
-     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
-     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,
-     4,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
-     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-     8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
-    11,
-};
-
-static const uint8_t bitalloc_129_bits_b[129] = {
-    14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,
-     9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-     8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,
-     5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
-     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12,
-    14,
-};
-
-static const uint8_t bitalloc_129_bits_c[129] = {
-    13, 12, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
-     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
-     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-     8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
-    13,
-};
-
-static const uint8_t bitalloc_129_bits_d[129] = {
-    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13,
-    13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10,
-    10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  7,  7,
-     7,  7,  7,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
-     4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,
-     7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10,
-    10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13,
-    13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
-    16,
-};
-
-static const uint8_t bitalloc_129_bits_e[129] = {
-    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 12,
-    12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,
-     8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,
-     8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
-    12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
-    16,
-};
-
-static const uint8_t bitalloc_129_bits_f[129] = {
-    15, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,  9,
-     9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,
-     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-     7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
-     9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13,
-    15,
-};
-
-static const uint8_t bitalloc_129_bits_g[129] = {
-    13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
-    11, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
-     9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
-     7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
-     4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
-     7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,
-     9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
-    11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13,
-    13,
-};
-
-static const uint8_t bitalloc_sizes[10] = {
-    3, 5, 7, 9, 13, 17, 25, 33, 65, 129
-};
-
-static const int8_t bitalloc_offsets[10] = {
-    -1, -2, -3, -4, -6, -8, -12, -16, -32, -64
-};
-
-static const uint8_t bitalloc_maxbits[10][7] = {
-    { 2 },
-    { 4, 3, 3 },
-    { 5, 5, 4 },
-    { 6, 5, 6 },
-    { 7, 6, 5 },
-    { 9, 8, 7, 9, 8, 8, 8 },
-    { 9, 9, 8, 9, 8, 9, 9 },
-    { 9, 9, 9, 9, 9, 9, 9 },
-    { 9, 9, 9, 9, 9, 9, 9 },
-    { 9, 9, 9, 9, 9, 9, 9 }
-};
-
-static const uint16_t *const bitalloc_codes[10][8] = {
-    { bitalloc_3_codes,     NULL },
-    { bitalloc_5_codes_a,   bitalloc_5_codes_b,   bitalloc_5_codes_c,   NULL },
-    { bitalloc_7_codes_a,   bitalloc_7_codes_b,   bitalloc_7_codes_c,   NULL },
-    { bitalloc_9_codes_a,   bitalloc_9_codes_b,   bitalloc_9_codes_c,   NULL },
-    { bitalloc_13_codes_a,  bitalloc_13_codes_b,  bitalloc_13_codes_c,  NULL },
-    { bitalloc_17_codes_a,  bitalloc_17_codes_b,  bitalloc_17_codes_c,  bitalloc_17_codes_d,
-      bitalloc_17_codes_e,  bitalloc_17_codes_f,  bitalloc_17_codes_g,  NULL },
-    { bitalloc_25_codes_a,  bitalloc_25_codes_b,  bitalloc_25_codes_c,  bitalloc_25_codes_d,
-      bitalloc_25_codes_e,  bitalloc_25_codes_f,  bitalloc_25_codes_g,  NULL },
-    { bitalloc_33_codes_a,  bitalloc_33_codes_b,  bitalloc_33_codes_c,  bitalloc_33_codes_d,
-      bitalloc_33_codes_e,  bitalloc_33_codes_f,  bitalloc_33_codes_g,  NULL },
-    { bitalloc_65_codes_a,  bitalloc_65_codes_b,  bitalloc_65_codes_c,  bitalloc_65_codes_d,
-      bitalloc_65_codes_e,  bitalloc_65_codes_f,  bitalloc_65_codes_g,  NULL },
-    { bitalloc_129_codes_a, bitalloc_129_codes_b, bitalloc_129_codes_c, bitalloc_129_codes_d,
-      bitalloc_129_codes_e, bitalloc_129_codes_f, bitalloc_129_codes_g, NULL }
-};
-
-static const uint8_t *const bitalloc_bits[10][8] = {
-    { bitalloc_3_bits,     NULL },
-    { bitalloc_5_bits_a,   bitalloc_5_bits_b,   bitalloc_5_bits_c,   NULL },
-    { bitalloc_7_bits_a,   bitalloc_7_bits_b,   bitalloc_7_bits_c,   NULL },
-    { bitalloc_9_bits_a,   bitalloc_9_bits_b,   bitalloc_9_bits_c,   NULL },
-    { bitalloc_13_bits_a,  bitalloc_13_bits_b,  bitalloc_13_bits_c,  NULL },
-    { bitalloc_17_bits_a,  bitalloc_17_bits_b,  bitalloc_17_bits_c,  bitalloc_17_bits_d,
-      bitalloc_17_bits_e,  bitalloc_17_bits_f,  bitalloc_17_bits_g,  NULL },
-    { bitalloc_25_bits_a,  bitalloc_25_bits_b,  bitalloc_25_bits_c,  bitalloc_25_bits_d,
-      bitalloc_25_bits_e,  bitalloc_25_bits_f,  bitalloc_25_bits_g,  NULL },
-    { bitalloc_33_bits_a,  bitalloc_33_bits_b,  bitalloc_33_bits_c,  bitalloc_33_bits_d,
-      bitalloc_33_bits_e,  bitalloc_33_bits_f,  bitalloc_33_bits_g,  NULL },
-    { bitalloc_65_bits_a,  bitalloc_65_bits_b,  bitalloc_65_bits_c,  bitalloc_65_bits_d,
-      bitalloc_65_bits_e,  bitalloc_65_bits_f,  bitalloc_65_bits_g,  NULL },
-    { bitalloc_129_bits_a, bitalloc_129_bits_b, bitalloc_129_bits_c, bitalloc_129_bits_d,
-      bitalloc_129_bits_e, bitalloc_129_bits_f, bitalloc_129_bits_g, NULL }
-};
+#include "libavutil/common.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "put_bits.h"
+
+#define DCA_CODE_BOOKS      10
+#define DCA_BITALLOC_12_COUNT    5
+
+typedef struct DCAVLC {
+    int offset;         ///< Code values offset
+    int max_depth;      ///< Parameter for get_vlc2()
+    VLC vlc[7];         ///< Actual codes
+} DCAVLC;
+
+extern DCAVLC   ff_dca_vlc_bit_allocation;
+extern DCAVLC   ff_dca_vlc_transition_mode;
+extern DCAVLC   ff_dca_vlc_scale_factor;
+extern DCAVLC   ff_dca_vlc_quant_index[DCA_CODE_BOOKS];
+
+extern VLC  ff_dca_vlc_tnl_grp[5];
+extern VLC  ff_dca_vlc_tnl_scf;
+extern VLC  ff_dca_vlc_damp;
+extern VLC  ff_dca_vlc_dph;
+extern VLC  ff_dca_vlc_fst_rsd_amp;
+extern VLC  ff_dca_vlc_rsd_apprx;
+extern VLC  ff_dca_vlc_rsd_amp;
+extern VLC  ff_dca_vlc_avg_g3;
+extern VLC  ff_dca_vlc_st_grid;
+extern VLC  ff_dca_vlc_grid_2;
+extern VLC  ff_dca_vlc_grid_3;
+extern VLC  ff_dca_vlc_rsd;
+
+av_cold void ff_dca_init_vlcs(void);
+uint32_t ff_dca_vlc_calc_quant_bits(int *values, uint8_t n, uint8_t sel, uint8_t abits);
+void ff_dca_vlc_enc_quant(PutBitContext *pb, int *values, uint8_t n, uint8_t sel, uint8_t abits);
+uint32_t ff_dca_vlc_calc_alloc_bits(int *values, uint8_t n, uint8_t sel);
+void ff_dca_vlc_enc_alloc(PutBitContext *pb, int *values, uint8_t n, uint8_t sel);
 
 #endif /* AVCODEC_DCAHUFF_H */
diff --git a/libavcodec/dcamath.h b/libavcodec/dcamath.h
index e21eb07..38fa9a6 100644
--- a/libavcodec/dcamath.h
+++ b/libavcodec/dcamath.h
@@ -1,31 +1,30 @@
 /*
- * This file is part of Libav.
+ * Copyright (C) 2016 foo86
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/common.h"
-
+#ifndef AVCODEC_DCAMATH_H
+#define AVCODEC_DCAMATH_H
 
-// clip a signed integer into the (-2^23), (2^23-1) range
-static inline int dca_clip23(int a)
-{
-    return av_clip_intp2(a, 23);
-}
+#include "libavutil/common.h"
+#include "libavutil/intmath.h"
 
-static inline int32_t dca_norm(int64_t a, int bits)
+static inline int32_t norm__(int64_t a, int bits)
 {
     if (bits > 0)
         return (int32_t)((a + (INT64_C(1) << (bits - 1))) >> bits);
@@ -33,10 +32,25 @@ static inline int32_t dca_norm(int64_t a, int bits)
         return (int32_t)a;
 }
 
-static inline int64_t dca_round(int64_t a, int bits)
+static inline int32_t mul__(int32_t a, int32_t b, int bits)
 {
-    if (bits > 0)
-        return (a + (INT64_C(1) << (bits - 1))) & ~((INT64_C(1) << bits) - 1);
-    else
-        return a;
+    return norm__((int64_t)a * b, bits);
 }
+
+static inline int32_t norm13(int64_t a) { return norm__(a, 13); }
+static inline int32_t norm16(int64_t a) { return norm__(a, 16); }
+static inline int32_t norm20(int64_t a) { return norm__(a, 20); }
+static inline int32_t norm21(int64_t a) { return norm__(a, 21); }
+static inline int32_t norm23(int64_t a) { return norm__(a, 23); }
+
+static inline int32_t mul15(int32_t a, int32_t b) { return mul__(a, b, 15); }
+static inline int32_t mul16(int32_t a, int32_t b) { return mul__(a, b, 16); }
+static inline int32_t mul17(int32_t a, int32_t b) { return mul__(a, b, 17); }
+static inline int32_t mul22(int32_t a, int32_t b) { return mul__(a, b, 22); }
+static inline int32_t mul23(int32_t a, int32_t b) { return mul__(a, b, 23); }
+static inline int32_t mul31(int32_t a, int32_t b) { return mul__(a, b, 31); }
+static inline int32_t mul32(int32_t a, int32_t b) { return mul__(a, b, 32); }
+
+static inline int32_t clip23(int32_t a) { return av_clip_intp2(a, 23); }
+
+#endif
diff --git a/libavcodec/dct.c b/libavcodec/dct.c
index 180477e..52f082d 100644
--- a/libavcodec/dct.c
+++ b/libavcodec/dct.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -178,6 +178,7 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
 {
     int n = 1 << nbits;
     int i;
+    int ret;
 
     memset(s, 0, sizeof(*s));
 
@@ -190,13 +191,13 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
         ff_init_ff_cos_tabs(nbits + 2);
 
         s->costab = ff_cos_tabs[nbits + 2];
-        s->csc2   = av_malloc(n / 2 * sizeof(FFTSample));
+        s->csc2   = av_malloc_array(n / 2, sizeof(FFTSample));
         if (!s->csc2)
             return AVERROR(ENOMEM);
 
-        if (ff_rdft_init(&s->rdft, nbits, inverse == DCT_III) < 0) {
-            av_free(s->csc2);
-            return -1;
+        if ((ret = ff_rdft_init(&s->rdft, nbits, inverse == DCT_III)) < 0) {
+            av_freep(&s->csc2);
+            return ret;
         }
 
         for (i = 0; i < n / 2; i++)
@@ -220,5 +221,5 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
 av_cold void ff_dct_end(DCTContext *s)
 {
     ff_rdft_end(&s->rdft);
-    av_free(s->csc2);
+    av_freep(&s->csc2);
 }
diff --git a/libavcodec/dct.h b/libavcodec/dct.h
index 46893a6..0a03e25 100644
--- a/libavcodec/dct.h
+++ b/libavcodec/dct.h
@@ -4,24 +4,24 @@
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#ifndef AVCODEC_DCT_H
+#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_DCT_H
 
 #include <stddef.h>
@@ -60,6 +60,9 @@ void ff_fdct248_islow_8(int16_t *data);
 void ff_fdct248_islow_10(int16_t *data);
 
 void ff_j_rev_dct(int16_t *data);
+void ff_j_rev_dct4(int16_t *data);
+void ff_j_rev_dct2(int16_t *data);
+void ff_j_rev_dct1(int16_t *data);
 void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
diff --git a/libavcodec/dct32.h b/libavcodec/dct32.h
index 8bf6880..61bf223 100644
--- a/libavcodec/dct32.h
+++ b/libavcodec/dct32.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_fixed.c b/libavcodec/dct32_fixed.c
index 64efe8b..9025d5e 100644
--- a/libavcodec/dct32_fixed.c
+++ b/libavcodec/dct32_fixed.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_float.c b/libavcodec/dct32_float.c
index ef37ce9..597c9bb 100644
--- a/libavcodec/dct32_float.c
+++ b/libavcodec/dct32_float.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_template.c b/libavcodec/dct32_template.c
index 272e0db..51cebc0 100644
--- a/libavcodec/dct32_template.c
+++ b/libavcodec/dct32_template.c
@@ -2,36 +2,47 @@
  * Template for the Discrete Cosine Transform for 32 samples
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "dct32.h"
 #include "mathops.h"
+#include "libavutil/internal.h"
+
+#ifdef CHECKED
+#define SUINT   int
+#define SUINT32 int32_t
+#else
+#define SUINT   unsigned
+#define SUINT32 uint32_t
+#endif
 
 #if DCT32_FLOAT
 #   define dct32 ff_dct32_float
 #   define FIXHR(x)       ((float)(x))
 #   define MULH3(x, y, s) ((s)*(y)*(x))
 #   define INTFLOAT float
+#   define SUINTFLOAT float
 #else
 #   define dct32 ff_dct32_fixed
 #   define FIXHR(a)       ((int)((a) * (1LL<<32) + 0.5))
 #   define MULH3(x, y, s) MULH((s)*(x), y)
 #   define INTFLOAT int
+#   define SUINTFLOAT SUINT
 #endif
 
 
@@ -73,7 +84,7 @@
 #define COS3_0 FIXHR(0.54119610014619698439/2)
 #define COS3_1 FIXHR(1.30656296487637652785/4)
 
-#define COS4_0 FIXHR(0.70710678118654752439/2)
+#define COS4_0 FIXHR(M_SQRT1_2/2)
 
 /* butterfly operator */
 #define BF(a, b, c, s)\
@@ -112,11 +123,12 @@
 #define ADD(a, b) val##a += val##b
 
 /* DCT32 without 1/sqrt(2) coef zero scaling. */
-void dct32(INTFLOAT *out, const INTFLOAT *tab)
+void dct32(INTFLOAT *out, const INTFLOAT *tab_arg)
 {
-    INTFLOAT tmp0, tmp1;
+    const SUINTFLOAT *tab = tab_arg;
+    SUINTFLOAT tmp0, tmp1;
 
-    INTFLOAT val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7 ,
+    SUINTFLOAT val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7 ,
              val8 , val9 , val10, val11, val12, val13, val14, val15,
              val16, val17, val18, val19, val20, val21, val22, val23,
              val24, val25, val26, val27, val28, val29, val30, val31;
diff --git a/libavcodec/dctref.c b/libavcodec/dctref.c
index ae3dec5..851014b 100644
--- a/libavcodec/dctref.c
+++ b/libavcodec/dctref.c
@@ -2,20 +2,20 @@
  * reference discrete cosine transform (double precision)
  * Copyright (C) 2009 Dylan Yudaken
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dctref.h b/libavcodec/dctref.h
index a93b70d..f6fde88 100644
--- a/libavcodec/dctref.h
+++ b/libavcodec/dctref.h
@@ -2,20 +2,20 @@
  * reference discrete cosine transform (double precision)
  * Copyright (C) 2009 Dylan Yudaken
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dds.c b/libavcodec/dds.c
index 91e0c24..f026f9c 100644
--- a/libavcodec/dds.c
+++ b/libavcodec/dds.c
@@ -2,20 +2,20 @@
  * DirectDraw Surface image decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 
 #include <stdint.h>
 
+#include "libavutil/libm.h"
 #include "libavutil/imgutils.h"
 
 #include "avcodec.h"
@@ -38,13 +39,14 @@
 
 #define DDPF_FOURCC    (1 <<  2)
 #define DDPF_PALETTE   (1 <<  5)
-#define DDPF_NORMALMAP (1 << 31)
+#define DDPF_NORMALMAP (1U << 31)
 
 enum DDSPostProc {
     DDS_NONE = 0,
     DDS_ALPHA_EXP,
     DDS_NORMAL_MAP,
     DDS_RAW_YCOCG,
+    DDS_SWAP_ALPHA,
     DDS_SWIZZLE_A2XY,
     DDS_SWIZZLE_RBXG,
     DDS_SWIZZLE_RGXB,
@@ -100,6 +102,7 @@ typedef struct DDSContext {
 
     int compressed;
     int paletted;
+    int bpp;
     enum DDSPostProc postproc;
 
     const uint8_t *tex_data; // Compressed texture
@@ -114,7 +117,6 @@ static int parse_pixel_format(AVCodecContext *avctx)
 {
     DDSContext *ctx = avctx->priv_data;
     GetByteContext *gbc = &ctx->gbc;
-    char buf[32];
     uint32_t flags, fourcc, gimp_tag;
     enum DDSDXGIFormat dxgi;
     int size, bpp, r, g, b, a;
@@ -146,7 +148,7 @@ static int parse_pixel_format(AVCodecContext *avctx)
         ctx->paletted = 0;
     }
 
-    bpp = bytestream2_get_le32(gbc); // rgbbitcount
+    bpp = ctx->bpp = bytestream2_get_le32(gbc); // rgbbitcount
     r   = bytestream2_get_le32(gbc); // rbitmask
     g   = bytestream2_get_le32(gbc); // gbitmask
     b   = bytestream2_get_le32(gbc); // bbitmask
@@ -158,13 +160,10 @@ static int parse_pixel_format(AVCodecContext *avctx)
     bytestream2_skip(gbc, 4); // caps4
     bytestream2_skip(gbc, 4); // reserved2
 
-    av_get_codec_tag_string(buf, sizeof(buf), fourcc);
     av_log(avctx, AV_LOG_VERBOSE, "fourcc %s bpp %d "
-           "r 0x%x g 0x%x b 0x%x a 0x%x\n", buf, bpp, r, g, b, a);
-    if (gimp_tag) {
-        av_get_codec_tag_string(buf, sizeof(buf), gimp_tag);
-        av_log(avctx, AV_LOG_VERBOSE, "and GIMP-DDS tag %s\n", buf);
-    }
+           "r 0x%x g 0x%x b 0x%x a 0x%x\n", av_fourcc2str(fourcc), bpp, r, g, b, a);
+    if (gimp_tag)
+        av_log(avctx, AV_LOG_VERBOSE, "and GIMP-DDS tag %s\n", av_fourcc2str(gimp_tag));
 
     if (ctx->compressed)
         avctx->pix_fmt = AV_PIX_FMT_RGBA;
@@ -341,7 +340,7 @@ static int parse_pixel_format(AVCodecContext *avctx)
             }
             break;
         default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported %s fourcc.\n", buf);
+            av_log(avctx, AV_LOG_ERROR, "Unsupported %s fourcc.\n", av_fourcc2str(fourcc));
             return AVERROR_INVALIDDATA;
         }
     } else if (ctx->paletted) {
@@ -352,14 +351,21 @@ static int parse_pixel_format(AVCodecContext *avctx)
             return AVERROR_INVALIDDATA;
         }
     } else {
+        /*  4 bpp */
+        if (bpp == 4 && r == 0 && g == 0 && b == 0 && a == 0)
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
         /*  8 bpp */
-        if (bpp == 8 && r == 0xff && g == 0 && b == 0 && a == 0)
+        else if (bpp == 8 && r == 0xff && g == 0 && b == 0 && a == 0)
             avctx->pix_fmt = AV_PIX_FMT_GRAY8;
         else if (bpp == 8 && r == 0 && g == 0 && b == 0 && a == 0xff)
             avctx->pix_fmt = AV_PIX_FMT_GRAY8;
         /* 16 bpp */
         else if (bpp == 16 && r == 0xff && g == 0 && b == 0 && a == 0xff00)
             avctx->pix_fmt = AV_PIX_FMT_YA8;
+        else if (bpp == 16 && r == 0xff00 && g == 0 && b == 0 && a == 0xff) {
+            avctx->pix_fmt = AV_PIX_FMT_YA8;
+            ctx->postproc = DDS_SWAP_ALPHA;
+        }
         else if (bpp == 16 && r == 0xffff && g == 0 && b == 0 && a == 0)
             avctx->pix_fmt = AV_PIX_FMT_GRAY16LE;
         else if (bpp == 16 && r == 0x7c00 && g == 0x3e0 && b == 0x1f && a == 0)
@@ -373,9 +379,9 @@ static int parse_pixel_format(AVCodecContext *avctx)
             avctx->pix_fmt = AV_PIX_FMT_BGR24;
         /* 32 bpp */
         else if (bpp == 32 && r == 0xff0000 && g == 0xff00 && b == 0xff && a == 0)
-            avctx->pix_fmt = AV_PIX_FMT_BGRA; // opaque
+            avctx->pix_fmt = AV_PIX_FMT_BGR0; // opaque
         else if (bpp == 32 && r == 0xff && g == 0xff00 && b == 0xff0000 && a == 0)
-            avctx->pix_fmt = AV_PIX_FMT_RGBA; // opaque
+            avctx->pix_fmt = AV_PIX_FMT_RGB0; // opaque
         else if (bpp == 32 && r == 0xff0000 && g == 0xff00 && b == 0xff && a == 0xff000000)
             avctx->pix_fmt = AV_PIX_FMT_BGRA;
         else if (bpp == 32 && r == 0xff && g == 0xff00 && b == 0xff0000 && a == 0xff000000)
@@ -515,7 +521,7 @@ static void run_postproc(AVCodecContext *avctx, AVFrame *frame)
 
             int d = (255 * 255 - x * x - y * y) / 2;
             if (d > 0)
-                z = rint(sqrtf(d));
+                z = lrint(sqrtf(d));
 
             src[0] = x;
             src[1] = y;
@@ -541,6 +547,15 @@ static void run_postproc(AVCodecContext *avctx, AVFrame *frame)
             src[3] = a;
         }
         break;
+    case DDS_SWAP_ALPHA:
+        /* Alpha and Luma are stored swapped. */
+        av_log(avctx, AV_LOG_DEBUG, "Post-processing swapped Luma/Alpha.\n");
+
+        for (i = 0; i < frame->linesize[0] * frame->height; i += 2) {
+            uint8_t *src = frame->data[0] + i;
+            FFSWAP(uint8_t, src[0], src[1]);
+        }
+        break;
     case DDS_SWIZZLE_A2XY:
         /* Swap R and G, often used to restore a standard RGTC2. */
         av_log(avctx, AV_LOG_DEBUG, "Post-processing A2XY swizzle.\n");
@@ -661,22 +676,50 @@ static int dds_decode(AVCodecContext *avctx, void *data,
         /* Use the decompress function on the texture, one block per thread. */
         ctx->tex_data = gbc->buffer;
         avctx->execute2(avctx, decompress_texture_thread, frame, NULL, ctx->slice_count);
+    } else if (!ctx->paletted && ctx->bpp == 4 && avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        uint8_t *dst = frame->data[0];
+        int x, y, i;
+
+        /* Use the first 64 bytes as palette, then copy the rest. */
+        bytestream2_get_buffer(gbc, frame->data[1], 16 * 4);
+        for (i = 0; i < 16; i++) {
+            AV_WN32(frame->data[1] + i*4,
+                    (frame->data[1][2+i*4]<<0)+
+                    (frame->data[1][1+i*4]<<8)+
+                    (frame->data[1][0+i*4]<<16)+
+                    ((unsigned)frame->data[1][3+i*4]<<24)
+            );
+        }
+        frame->palette_has_changed = 1;
+
+        if (bytestream2_get_bytes_left(gbc) < frame->height * frame->width / 2) {
+            av_log(avctx, AV_LOG_ERROR, "Buffer is too small (%d < %d).\n",
+                   bytestream2_get_bytes_left(gbc), frame->height * frame->width / 2);
+            return AVERROR_INVALIDDATA;
+        }
+
+        for (y = 0; y < frame->height; y++) {
+            for (x = 0; x < frame->width; x += 2) {
+                uint8_t val = bytestream2_get_byte(gbc);
+                dst[x    ] = val & 0xF;
+                dst[x + 1] = val >> 4;
+            }
+            dst += frame->linesize[0];
+        }
     } else {
         int linesize = av_image_get_linesize(avctx->pix_fmt, frame->width, 0);
 
         if (ctx->paletted) {
             int i;
-            uint32_t *p = (uint32_t*) frame->data[1];
-
             /* Use the first 1024 bytes as palette, then copy the rest. */
-            for (i = 0; i < 256; i++) {
-                uint32_t rgba = 0;
-                rgba |= bytestream2_get_byte(gbc) << 16;
-                rgba |= bytestream2_get_byte(gbc) << 8;
-                rgba |= bytestream2_get_byte(gbc) << 0;
-                rgba |= bytestream2_get_byte(gbc) << 24;
-                p[i] = rgba;
-            }
+            bytestream2_get_buffer(gbc, frame->data[1], 256 * 4);
+            for (i = 0; i < 256; i++)
+                AV_WN32(frame->data[1] + i*4,
+                        (frame->data[1][2+i*4]<<0)+
+                        (frame->data[1][1+i*4]<<8)+
+                        (frame->data[1][0+i*4]<<16)+
+                        ((unsigned)frame->data[1][3+i*4]<<24)
+                );
 
             frame->palette_has_changed = 1;
         }
diff --git a/libavcodec/decode.c b/libavcodec/decode.c
index 2dab7f2..a32ff2f 100644
--- a/libavcodec/decode.c
+++ b/libavcodec/decode.c
@@ -1,20 +1,20 @@
 /*
  * generic decoding-related code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,13 +23,20 @@
 
 #include "config.h"
 
+#if CONFIG_ICONV
+# include <iconv.h>
+#endif
+
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/common.h"
 #include "libavutil/frame.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/internal.h"
 #include "libavutil/intmath.h"
+#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "bytestream.h"
@@ -38,11 +45,12 @@
 #include "internal.h"
 #include "thread.h"
 
-static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
+static int apply_param_change(AVCodecContext *avctx, const AVPacket *avpkt)
 {
     int size = 0, ret;
     const uint8_t *data;
     uint32_t flags;
+    int64_t val;
 
     data = av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, &size);
     if (!data)
@@ -64,7 +72,13 @@ static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT) {
         if (size < 4)
             goto fail;
-        avctx->channels = bytestream_get_le32(&data);
+        val = bytestream_get_le32(&data);
+        if (val <= 0 || val > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid channel count");
+            ret = AVERROR_INVALIDDATA;
+            goto fail2;
+        }
+        avctx->channels = val;
         size -= 4;
     }
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT) {
@@ -76,7 +90,13 @@ static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE) {
         if (size < 4)
             goto fail;
-        avctx->sample_rate = bytestream_get_le32(&data);
+        val = bytestream_get_le32(&data);
+        if (val <= 0 || val > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid sample rate");
+            ret = AVERROR_INVALIDDATA;
+            goto fail2;
+        }
+        avctx->sample_rate = val;
         size -= 4;
     }
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS) {
@@ -105,10 +125,15 @@ fail2:
 
 static int extract_packet_props(AVCodecInternal *avci, const AVPacket *pkt)
 {
+    int ret = 0;
+
     av_packet_unref(avci->last_pkt_props);
-    if (pkt)
-        return av_packet_copy_props(avci->last_pkt_props, pkt);
-    return 0;
+    if (pkt) {
+        ret = av_packet_copy_props(avci->last_pkt_props, pkt);
+        if (!ret)
+            avci->last_pkt_props->size = pkt->size; // HACK: Needed for ff_decode_frame_props().
+    }
+    return ret;
 }
 
 static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
@@ -129,7 +154,7 @@ static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
     memcpy(frame->data,     avci->to_free->data,     sizeof(frame->data));
     memcpy(frame->linesize, avci->to_free->linesize, sizeof(frame->linesize));
     if (avci->to_free->extended_data != avci->to_free->data) {
-        int planes = av_get_channel_layout_nb_channels(avci->to_free->channel_layout);
+        int planes = avci->to_free->channels;
         int size   = planes * sizeof(*frame->extended_data);
 
         if (!size) {
@@ -152,6 +177,7 @@ static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
     frame->height         = avci->to_free->height;
     frame->channel_layout = avci->to_free->channel_layout;
     frame->nb_samples     = avci->to_free->nb_samples;
+    frame->channels       = avci->to_free->channels;
 
     return 0;
 }
@@ -170,27 +196,33 @@ int ff_decode_bsfs_init(AVCodecContext *avctx)
     while (bsfs_str && *bsfs_str) {
         AVBSFContext **tmp;
         const AVBitStreamFilter *filter;
-        char *bsf;
+        char *bsf, *bsf_options_str, *bsf_name;
 
         bsf = av_get_token(&bsfs_str, ",");
         if (!bsf) {
             ret = AVERROR(ENOMEM);
             goto fail;
         }
+        bsf_name = av_strtok(bsf, "=", &bsf_options_str);
+        if (!bsf_name) {
+            av_freep(&bsf);
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
 
-        filter = av_bsf_get_by_name(bsf);
+        filter = av_bsf_get_by_name(bsf_name);
         if (!filter) {
             av_log(avctx, AV_LOG_ERROR, "A non-existing bitstream filter %s "
                    "requested by a decoder. This is a bug, please report it.\n",
-                   bsf);
-            ret = AVERROR_BUG;
+                   bsf_name);
             av_freep(&bsf);
+            ret = AVERROR_BUG;
             goto fail;
         }
-        av_freep(&bsf);
 
         tmp = av_realloc_array(s->bsfs, s->nb_bsfs + 1, sizeof(*s->bsfs));
         if (!tmp) {
+            av_freep(&bsf);
             ret = AVERROR(ENOMEM);
             goto fail;
         }
@@ -198,8 +230,10 @@ int ff_decode_bsfs_init(AVCodecContext *avctx)
         s->nb_bsfs++;
 
         ret = av_bsf_alloc(filter, &s->bsfs[s->nb_bsfs - 1]);
-        if (ret < 0)
+        if (ret < 0) {
+            av_freep(&bsf);
             goto fail;
+        }
 
         if (s->nb_bsfs == 1) {
             /* We do not currently have an API for passing the input timebase into decoders,
@@ -213,12 +247,38 @@ int ff_decode_bsfs_init(AVCodecContext *avctx)
             ret = avcodec_parameters_copy(s->bsfs[s->nb_bsfs - 1]->par_in,
                                           s->bsfs[s->nb_bsfs - 2]->par_out);
         }
-        if (ret < 0)
+        if (ret < 0) {
+            av_freep(&bsf);
             goto fail;
+        }
+
+        if (bsf_options_str && filter->priv_class) {
+            const AVOption *opt = av_opt_next(s->bsfs[s->nb_bsfs - 1]->priv_data, NULL);
+            const char * shorthand[2] = {NULL};
+
+            if (opt)
+                shorthand[0] = opt->name;
+
+            ret = av_opt_set_from_string(s->bsfs[s->nb_bsfs - 1]->priv_data, bsf_options_str, shorthand, "=", ":");
+            if (ret < 0) {
+                if (ret != AVERROR(ENOMEM)) {
+                    av_log(avctx, AV_LOG_ERROR, "Invalid options for bitstream filter %s "
+                           "requested by the decoder. This is a bug, please report it.\n",
+                           bsf_name);
+                    ret = AVERROR_BUG;
+                }
+                av_freep(&bsf);
+                goto fail;
+            }
+        }
+        av_freep(&bsf);
 
         ret = av_bsf_init(s->bsfs[s->nb_bsfs - 1]);
         if (ret < 0)
             goto fail;
+
+        if (*bsfs_str)
+            bsfs_str++;
     }
 
     return 0;
@@ -297,6 +357,42 @@ finish:
     return ret;
 }
 
+/**
+ * Attempt to guess proper monotonic timestamps for decoded video frames
+ * which might have incorrect times. Input timestamps may wrap around, in
+ * which case the output will as well.
+ *
+ * @param pts the pts field of the decoded AVPacket, as passed through
+ * AVFrame.pts
+ * @param dts the dts field of the decoded AVPacket
+ * @return one of the input values, may be AV_NOPTS_VALUE
+ */
+static int64_t guess_correct_pts(AVCodecContext *ctx,
+                                 int64_t reordered_pts, int64_t dts)
+{
+    int64_t pts = AV_NOPTS_VALUE;
+
+    if (dts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_dts += dts <= ctx->pts_correction_last_dts;
+        ctx->pts_correction_last_dts = dts;
+    } else if (reordered_pts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_dts = reordered_pts;
+
+    if (reordered_pts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_pts += reordered_pts <= ctx->pts_correction_last_pts;
+        ctx->pts_correction_last_pts = reordered_pts;
+    } else if(dts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_pts = dts;
+
+    if ((ctx->pts_correction_num_faulty_pts<=ctx->pts_correction_num_faulty_dts || dts == AV_NOPTS_VALUE)
+       && reordered_pts != AV_NOPTS_VALUE)
+        pts = reordered_pts;
+    else
+        pts = dts;
+
+    return pts;
+}
+
 /*
  * The core of the receive_frame_wrapper for the decoders implementing
  * the simple API. Certain decoders might consume partial packets without
@@ -308,7 +404,8 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame)
     AVCodecInternal   *avci = avctx->internal;
     DecodeSimpleContext *ds = &avci->ds;
     AVPacket           *pkt = ds->in_pkt;
-    int got_frame;
+    // copy to ensure we do not change pkt
+    int got_frame, actual_got_frame;
     int ret;
 
     if (!pkt->data && !avci->draining) {
@@ -337,26 +434,170 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame)
 
         if (!(avctx->codec->caps_internal & FF_CODEC_CAP_SETS_PKT_DTS))
             frame->pkt_dts = pkt->dts;
-        /* get_buffer is supposed to set frame parameters */
-        if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1)) {
-            frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
-            frame->width               = avctx->width;
-            frame->height              = avctx->height;
-            frame->format              = avctx->codec->type == AVMEDIA_TYPE_VIDEO ?
-                                         avctx->pix_fmt : avctx->sample_fmt;
+        if (avctx->codec->type == AVMEDIA_TYPE_VIDEO) {
+            if(!avctx->has_b_frames)
+                frame->pkt_pos = pkt->pos;
+            //FIXME these should be under if(!avctx->has_b_frames)
+            /* get_buffer is supposed to set frame parameters */
+            if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1)) {
+                if (!frame->sample_aspect_ratio.num)  frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
+                if (!frame->width)                    frame->width               = avctx->width;
+                if (!frame->height)                   frame->height              = avctx->height;
+                if (frame->format == AV_PIX_FMT_NONE) frame->format              = avctx->pix_fmt;
+            }
         }
     }
-
     emms_c();
+    actual_got_frame = got_frame;
+
+    if (avctx->codec->type == AVMEDIA_TYPE_VIDEO) {
+        if (frame->flags & AV_FRAME_FLAG_DISCARD)
+            got_frame = 0;
+        if (got_frame)
+            frame->best_effort_timestamp = guess_correct_pts(avctx,
+                                                             frame->pts,
+                                                             frame->pkt_dts);
+    } else if (avctx->codec->type == AVMEDIA_TYPE_AUDIO) {
+        uint8_t *side;
+        int side_size;
+        uint32_t discard_padding = 0;
+        uint8_t skip_reason = 0;
+        uint8_t discard_reason = 0;
+
+        if (ret >= 0 && got_frame) {
+            frame->best_effort_timestamp = guess_correct_pts(avctx,
+                                                             frame->pts,
+                                                             frame->pkt_dts);
+            if (frame->format == AV_SAMPLE_FMT_NONE)
+                frame->format = avctx->sample_fmt;
+            if (!frame->channel_layout)
+                frame->channel_layout = avctx->channel_layout;
+            if (!frame->channels)
+                frame->channels = avctx->channels;
+            if (!frame->sample_rate)
+                frame->sample_rate = avctx->sample_rate;
+        }
+
+        side= av_packet_get_side_data(avci->last_pkt_props, AV_PKT_DATA_SKIP_SAMPLES, &side_size);
+        if(side && side_size>=10) {
+            avctx->internal->skip_samples = AV_RL32(side) * avctx->internal->skip_samples_multiplier;
+            discard_padding = AV_RL32(side + 4);
+            av_log(avctx, AV_LOG_DEBUG, "skip %d / discard %d samples due to side data\n",
+                   avctx->internal->skip_samples, (int)discard_padding);
+            skip_reason = AV_RL8(side + 8);
+            discard_reason = AV_RL8(side + 9);
+        }
+
+        if ((frame->flags & AV_FRAME_FLAG_DISCARD) && got_frame &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            avctx->internal->skip_samples = FFMAX(0, avctx->internal->skip_samples - frame->nb_samples);
+            got_frame = 0;
+        }
+
+        if (avctx->internal->skip_samples > 0 && got_frame &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if(frame->nb_samples <= avctx->internal->skip_samples){
+                got_frame = 0;
+                avctx->internal->skip_samples -= frame->nb_samples;
+                av_log(avctx, AV_LOG_DEBUG, "skip whole frame, skip left: %d\n",
+                       avctx->internal->skip_samples);
+            } else {
+                av_samples_copy(frame->extended_data, frame->extended_data, 0, avctx->internal->skip_samples,
+                                frame->nb_samples - avctx->internal->skip_samples, avctx->channels, frame->format);
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(avctx->internal->skip_samples,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    if(frame->pts!=AV_NOPTS_VALUE)
+                        frame->pts += diff_ts;
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+                    if(frame->pkt_pts!=AV_NOPTS_VALUE)
+                        frame->pkt_pts += diff_ts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+                    if(frame->pkt_dts!=AV_NOPTS_VALUE)
+                        frame->pkt_dts += diff_ts;
+                    if (frame->pkt_duration >= diff_ts)
+                        frame->pkt_duration -= diff_ts;
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for skipped samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "skip %d/%d samples\n",
+                       avctx->internal->skip_samples, frame->nb_samples);
+                frame->nb_samples -= avctx->internal->skip_samples;
+                avctx->internal->skip_samples = 0;
+            }
+        }
+
+        if (discard_padding > 0 && discard_padding <= frame->nb_samples && got_frame &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if (discard_padding == frame->nb_samples) {
+                got_frame = 0;
+            } else {
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(frame->nb_samples - discard_padding,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    frame->pkt_duration = diff_ts;
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for discarded samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "discard %d/%d samples\n",
+                       (int)discard_padding, frame->nb_samples);
+                frame->nb_samples -= discard_padding;
+            }
+        }
+
+        if ((avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL) && got_frame) {
+            AVFrameSideData *fside = av_frame_new_side_data(frame, AV_FRAME_DATA_SKIP_SAMPLES, 10);
+            if (fside) {
+                AV_WL32(fside->data, avctx->internal->skip_samples);
+                AV_WL32(fside->data + 4, discard_padding);
+                AV_WL8(fside->data + 8, skip_reason);
+                AV_WL8(fside->data + 9, discard_reason);
+                avctx->internal->skip_samples = 0;
+            }
+        }
+    }
+
+    if (avctx->codec->type == AVMEDIA_TYPE_AUDIO &&
+        !avci->showed_multi_packet_warning &&
+        ret >= 0 && ret != pkt->size && !(avctx->codec->capabilities & AV_CODEC_CAP_SUBFRAMES)) {
+        av_log(avctx, AV_LOG_WARNING, "Multiple frames in a packet.\n");
+        avci->showed_multi_packet_warning = 1;
+    }
 
     if (!got_frame)
         av_frame_unref(frame);
 
-    if (ret >= 0 && avctx->codec->type == AVMEDIA_TYPE_VIDEO)
+    if (ret >= 0 && avctx->codec->type == AVMEDIA_TYPE_VIDEO && !(avctx->flags & AV_CODEC_FLAG_TRUNCATED))
         ret = pkt->size;
 
-    if (avctx->internal->draining && !got_frame)
-        avci->draining_done = 1;
+#if FF_API_AVCTX_TIMEBASE
+    if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+#endif
+
+    /* do not stop draining when actual_got_frame != 0 or ret < 0 */
+    /* got_frame == 0 but actual_got_frame != 0 when frame is discarded */
+    if (avctx->internal->draining && !actual_got_frame) {
+        if (ret < 0) {
+            /* prevent infinite loop if a decoder wrongly always return error on draining */
+            /* reasonable nb_errors_max = maximum b frames + thread count */
+            int nb_errors_max = 20 + (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME ?
+                                avctx->thread_count : 1);
+
+            if (avci->nb_draining_errors++ >= nb_errors_max) {
+                av_log(avctx, AV_LOG_ERROR, "Too many errors when draining, this is a bug. "
+                       "Stop draining and force EOF.\n");
+                avci->draining_done = 1;
+                ret = AVERROR_BUG;
+            }
+        } else {
+            avci->draining_done = 1;
+        }
+    }
 
     avci->compat_decode_consumed += ret;
 
@@ -367,6 +608,7 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame)
 
         pkt->data                += consumed;
         pkt->size                -= consumed;
+        avci->last_pkt_props->size -= consumed; // See extract_packet_props() comment.
         pkt->pts                  = AV_NOPTS_VALUE;
         pkt->dts                  = AV_NOPTS_VALUE;
         avci->last_pkt_props->pts = AV_NOPTS_VALUE;
@@ -407,18 +649,14 @@ static int decode_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)
     if (ret == AVERROR_EOF)
         avci->draining_done = 1;
 
-    /* unwrap the per-frame decode data and restore the original opaque_ref*/
     if (!ret) {
         /* the only case where decode data is not set should be decoders
          * that do not call ff_get_buffer() */
-        av_assert0((frame->opaque_ref && frame->opaque_ref->size == sizeof(FrameDecodeData)) ||
+        av_assert0((frame->private_ref && frame->private_ref->size == sizeof(FrameDecodeData)) ||
                    !(avctx->codec->capabilities & AV_CODEC_CAP_DR1));
 
-        if (frame->opaque_ref) {
-            FrameDecodeData *fdd;
-            AVBufferRef *user_opaque_ref;
-
-            fdd = (FrameDecodeData*)frame->opaque_ref->data;
+        if (frame->private_ref) {
+            FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
 
             if (fdd->post_process) {
                 ret = fdd->post_process(avctx, frame);
@@ -427,21 +665,19 @@ static int decode_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)
                     return ret;
                 }
             }
-
-            user_opaque_ref = fdd->user_opaque_ref;
-            fdd->user_opaque_ref = NULL;
-            av_buffer_unref(&frame->opaque_ref);
-            frame->opaque_ref = user_opaque_ref;
         }
     }
 
+    /* free the per-frame decode data */
+    av_buffer_unref(&frame->private_ref);
+
     return ret;
 }
 
 int attribute_align_arg avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt)
 {
     AVCodecInternal *avci = avctx->internal;
-    int ret = 0;
+    int ret;
 
     if (!avcodec_is_open(avctx) || !av_codec_is_decoder(avctx->codec))
         return AVERROR(EINVAL);
@@ -449,6 +685,9 @@ int attribute_align_arg avcodec_send_packet(AVCodecContext *avctx, const AVPacke
     if (avctx->internal->draining)
         return AVERROR_EOF;
 
+    if (avpkt && !avpkt->size && avpkt->data)
+        return AVERROR(EINVAL);
+
     av_packet_unref(avci->buffer_pkt);
     if (avpkt && (avpkt->data || avpkt->side_data_elems)) {
         ret = av_packet_ref(avci->buffer_pkt, avpkt);
@@ -479,7 +718,8 @@ static int apply_cropping(AVCodecContext *avctx, AVFrame *frame)
         (frame->crop_left + frame->crop_right) >= frame->width ||
         (frame->crop_top + frame->crop_bottom) >= frame->height) {
         av_log(avctx, AV_LOG_WARNING,
-               "Invalid cropping information set by a decoder: %zu/%zu/%zu/%zu "
+               "Invalid cropping information set by a decoder: "
+               "%"SIZE_SPECIFIER"/%"SIZE_SPECIFIER"/%"SIZE_SPECIFIER"/%"SIZE_SPECIFIER" "
                "(frame size %dx%d). This is a bug, please report it\n",
                frame->crop_left, frame->crop_right, frame->crop_top, frame->crop_bottom,
                frame->width, frame->height);
@@ -529,13 +769,18 @@ int attribute_align_arg avcodec_receive_frame(AVCodecContext *avctx, AVFrame *fr
 }
 
 static int compat_decode(AVCodecContext *avctx, AVFrame *frame,
-                         int *got_frame, AVPacket *pkt)
+                         int *got_frame, const AVPacket *pkt)
 {
     AVCodecInternal *avci = avctx->internal;
     int ret = 0;
 
     av_assert0(avci->compat_decode_consumed == 0);
 
+    if (avci->draining_done && pkt && pkt->size != 0) {
+        av_log(avctx, AV_LOG_WARNING, "Got unexpected packet after EOF\n");
+        avcodec_flush_buffers(avctx);
+    }
+
     *got_frame = 0;
     avci->compat_decode = 1;
 
@@ -607,7 +852,7 @@ finish:
 
 int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                                               int *got_picture_ptr,
-                                              AVPacket *avpkt)
+                                              const AVPacket *avpkt)
 {
     return compat_decode(avctx, picture, got_picture_ptr, avpkt);
 }
@@ -615,25 +860,254 @@ int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *pi
 int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
                                               AVFrame *frame,
                                               int *got_frame_ptr,
-                                              AVPacket *avpkt)
+                                              const AVPacket *avpkt)
 {
     return compat_decode(avctx, frame, got_frame_ptr, avpkt);
 }
 
+static void get_subtitle_defaults(AVSubtitle *sub)
+{
+    memset(sub, 0, sizeof(*sub));
+    sub->pts = AV_NOPTS_VALUE;
+}
+
+#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
+static int recode_subtitle(AVCodecContext *avctx,
+                           AVPacket *outpkt, const AVPacket *inpkt)
+{
+#if CONFIG_ICONV
+    iconv_t cd = (iconv_t)-1;
+    int ret = 0;
+    char *inb, *outb;
+    size_t inl, outl;
+    AVPacket tmp;
+#endif
+
+    if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_PRE_DECODER || inpkt->size == 0)
+        return 0;
+
+#if CONFIG_ICONV
+    cd = iconv_open("UTF-8", avctx->sub_charenc);
+    av_assert0(cd != (iconv_t)-1);
+
+    inb = inpkt->data;
+    inl = inpkt->size;
+
+    if (inl >= INT_MAX / UTF8_MAX_BYTES - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
+    if (ret < 0)
+        goto end;
+    outpkt->buf  = tmp.buf;
+    outpkt->data = tmp.data;
+    outpkt->size = tmp.size;
+    outb = outpkt->data;
+    outl = outpkt->size;
+
+    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
+        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
+        outl >= outpkt->size || inl != 0) {
+        ret = FFMIN(AVERROR(errno), -1);
+        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
+               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
+        av_packet_unref(&tmp);
+        goto end;
+    }
+    outpkt->size -= outl;
+    memset(outpkt->data + outpkt->size, 0, outl);
+
+end:
+    if (cd != (iconv_t)-1)
+        iconv_close(cd);
+    return ret;
+#else
+    av_log(avctx, AV_LOG_ERROR, "requesting subtitles recoding without iconv");
+    return AVERROR(EINVAL);
+#endif
+}
+
+static int utf8_check(const uint8_t *str)
+{
+    const uint8_t *byte;
+    uint32_t codepoint, min;
+
+    while (*str) {
+        byte = str;
+        GET_UTF8(codepoint, *(byte++), return 0;);
+        min = byte - str == 1 ? 0 : byte - str == 2 ? 0x80 :
+              1 << (5 * (byte - str) - 4);
+        if (codepoint < min || codepoint >= 0x110000 ||
+            codepoint == 0xFFFE /* BOM */ ||
+            codepoint >= 0xD800 && codepoint <= 0xDFFF /* surrogates */)
+            return 0;
+        str = byte;
+    }
+    return 1;
+}
+
+#if FF_API_ASS_TIMING
+static void insert_ts(AVBPrint *buf, int ts)
+{
+    if (ts == -1) {
+        av_bprintf(buf, "9:59:59.99,");
+    } else {
+        int h, m, s;
+
+        h = ts/360000;  ts -= 360000*h;
+        m = ts/  6000;  ts -=   6000*m;
+        s = ts/   100;  ts -=    100*s;
+        av_bprintf(buf, "%d:%02d:%02d.%02d,", h, m, s, ts);
+    }
+}
+
+static int convert_sub_to_old_ass_form(AVSubtitle *sub, const AVPacket *pkt, AVRational tb)
+{
+    int i;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < sub->num_rects; i++) {
+        char *final_dialog;
+        const char *dialog;
+        AVSubtitleRect *rect = sub->rects[i];
+        int ts_start, ts_duration = -1;
+        long int layer;
+
+        if (rect->type != SUBTITLE_ASS || !strncmp(rect->ass, "Dialogue: ", 10))
+            continue;
+
+        av_bprint_clear(&buf);
+
+        /* skip ReadOrder */
+        dialog = strchr(rect->ass, ',');
+        if (!dialog)
+            continue;
+        dialog++;
+
+        /* extract Layer or Marked */
+        layer = strtol(dialog, (char**)&dialog, 10);
+        if (*dialog != ',')
+            continue;
+        dialog++;
+
+        /* rescale timing to ASS time base (ms) */
+        ts_start = av_rescale_q(pkt->pts, tb, av_make_q(1, 100));
+        if (pkt->duration != -1)
+            ts_duration = av_rescale_q(pkt->duration, tb, av_make_q(1, 100));
+        sub->end_display_time = FFMAX(sub->end_display_time, 10 * ts_duration);
+
+        /* construct ASS (standalone file form with timestamps) string */
+        av_bprintf(&buf, "Dialogue: %ld,", layer);
+        insert_ts(&buf, ts_start);
+        insert_ts(&buf, ts_duration == -1 ? -1 : ts_start + ts_duration);
+        av_bprintf(&buf, "%s\r\n", dialog);
+
+        final_dialog = av_strdup(buf.str);
+        if (!av_bprint_is_complete(&buf) || !final_dialog) {
+            av_freep(&final_dialog);
+            av_bprint_finalize(&buf, NULL);
+            return AVERROR(ENOMEM);
+        }
+        av_freep(&rect->ass);
+        rect->ass = final_dialog;
+    }
+
+    av_bprint_finalize(&buf, NULL);
+    return 0;
+}
+#endif
+
 int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt)
 {
-    int ret;
+    int i, ret = 0;
 
-    ret = extract_packet_props(avctx->internal, avpkt);
-    if (ret < 0)
-        return ret;
+    if (!avpkt->data && avpkt->size) {
+        av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n");
+        return AVERROR(EINVAL);
+    }
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+    if (avctx->codec->type != AVMEDIA_TYPE_SUBTITLE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for subtitles\n");
+        return AVERROR(EINVAL);
+    }
 
     *got_sub_ptr = 0;
-    ret = avctx->codec->decode(avctx, sub, got_sub_ptr, avpkt);
-    if (*got_sub_ptr)
-        avctx->frame_number++;
+    get_subtitle_defaults(sub);
+
+    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) {
+        AVPacket pkt_recoded = *avpkt;
+
+        ret = recode_subtitle(avctx, &pkt_recoded, avpkt);
+        if (ret < 0) {
+            *got_sub_ptr = 0;
+        } else {
+             ret = extract_packet_props(avctx->internal, &pkt_recoded);
+             if (ret < 0)
+                return ret;
+
+            if (avctx->pkt_timebase.num && avpkt->pts != AV_NOPTS_VALUE)
+                sub->pts = av_rescale_q(avpkt->pts,
+                                        avctx->pkt_timebase, AV_TIME_BASE_Q);
+            ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
+            av_assert1((ret >= 0) >= !!*got_sub_ptr &&
+                       !!*got_sub_ptr >= !!sub->num_rects);
+
+#if FF_API_ASS_TIMING
+            if (avctx->sub_text_format == FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS
+                && *got_sub_ptr && sub->num_rects) {
+                const AVRational tb = avctx->pkt_timebase.num ? avctx->pkt_timebase
+                                                              : avctx->time_base;
+                int err = convert_sub_to_old_ass_form(sub, avpkt, tb);
+                if (err < 0)
+                    ret = err;
+            }
+#endif
+
+            if (sub->num_rects && !sub->end_display_time && avpkt->duration &&
+                avctx->pkt_timebase.num) {
+                AVRational ms = { 1, 1000 };
+                sub->end_display_time = av_rescale_q(avpkt->duration,
+                                                     avctx->pkt_timebase, ms);
+            }
+
+            if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB)
+                sub->format = 0;
+            else if (avctx->codec_descriptor->props & AV_CODEC_PROP_TEXT_SUB)
+                sub->format = 1;
+
+            for (i = 0; i < sub->num_rects; i++) {
+                if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_IGNORE &&
+                    sub->rects[i]->ass && !utf8_check(sub->rects[i]->ass)) {
+                    av_log(avctx, AV_LOG_ERROR,
+                           "Invalid UTF-8 in decoded subtitles text; "
+                           "maybe missing -sub_charenc option\n");
+                    avsubtitle_free(sub);
+                    ret = AVERROR_INVALIDDATA;
+                    break;
+                }
+            }
+
+            if (avpkt->data != pkt_recoded.data) { // did we recode?
+                /* prevent from destroying side data from original packet */
+                pkt_recoded.side_data = NULL;
+                pkt_recoded.side_data_elems = 0;
+
+                av_packet_unref(&pkt_recoded);
+            }
+        }
+
+        if (*got_sub_ptr)
+            avctx->frame_number++;
+    }
+
     return ret;
 }
 
@@ -807,6 +1281,13 @@ static int hwaccel_init(AVCodecContext *avctx,
     int err;
 
     hwaccel = hw_config->hwaccel;
+    if (hwaccel->capabilities & AV_HWACCEL_CODEC_CAP_EXPERIMENTAL &&
+        avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_WARNING, "Ignoring experimental hwaccel: %s\n",
+               hwaccel->name);
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (hwaccel->priv_data_size) {
         avctx->internal->hwaccel_priv_data =
             av_mallocz(hwaccel->priv_data_size);
@@ -815,14 +1296,16 @@ static int hwaccel_init(AVCodecContext *avctx,
     }
 
     avctx->hwaccel = hwaccel;
-    err = hwaccel->init(avctx);
-    if (err < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Failed setup for format %s: "
-               "hwaccel initialisation returned error.\n",
-               av_get_pix_fmt_name(hw_config->public.pix_fmt));
-        av_freep(&avctx->internal->hwaccel_priv_data);
-        avctx->hwaccel = NULL;
-        return err;
+    if (hwaccel->init) {
+        err = hwaccel->init(avctx);
+        if (err < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed setup for format %s: "
+                   "hwaccel initialisation returned error.\n",
+                   av_get_pix_fmt_name(hw_config->public.pix_fmt));
+            av_freep(&avctx->internal->hwaccel_priv_data);
+            avctx->hwaccel = NULL;
+            return err;
+        }
     }
 
     return 0;
@@ -895,6 +1378,7 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
         if (i == n) {
             av_log(avctx, AV_LOG_ERROR, "Invalid return from get_format(): "
                    "%s not in possible list.\n", desc->name);
+            ret = AV_PIX_FMT_NONE;
             break;
         }
 
@@ -999,7 +1483,9 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
         do {
             // NOTE: do not align linesizes individually, this breaks e.g. assumptions
             // that linesize[0] == 2*linesize[1] in the MPEG-encoder for 4:2:2
-            av_image_fill_linesizes(linesize, avctx->pix_fmt, w);
+            ret = av_image_fill_linesizes(linesize, avctx->pix_fmt, w);
+            if (ret < 0)
+                return ret;
             // increase alignment of w for next try (rhs gives the lowest bit set in w)
             w += w & ~(w - 1);
 
@@ -1011,7 +1497,7 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
         tmpsize = av_image_fill_pointers(data, avctx->pix_fmt, h,
                                          NULL, linesize);
         if (tmpsize < 0)
-            return -1;
+            return tmpsize;
 
         for (i = 0; i < 3 && data[i + 1]; i++)
             size[i] = data[i + 1] - data[i];
@@ -1021,7 +1507,10 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
             av_buffer_pool_uninit(&pool->pools[i]);
             pool->linesize[i] = linesize[i];
             if (size[i]) {
-                pool->pools[i] = av_buffer_pool_init(size[i] + 16, NULL);
+                pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                     CONFIG_MEMORY_POISONING ?
+                                                        NULL :
+                                                        av_buffer_allocz);
                 if (!pool->pools[i]) {
                     ret = AVERROR(ENOMEM);
                     goto fail;
@@ -1035,7 +1524,7 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
         break;
         }
     case AVMEDIA_TYPE_AUDIO: {
-        int ch     = av_get_channel_layout_nb_channels(frame->channel_layout);
+        int ch     = frame->channels; //av_get_channel_layout_nb_channels(frame->channel_layout);
         int planar = av_sample_fmt_is_planar(frame->format);
         int planes = planar ? ch : 1;
 
@@ -1082,17 +1571,19 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
     frame->linesize[0] = pool->linesize[0];
 
     if (planes > AV_NUM_DATA_POINTERS) {
-        frame->extended_data = av_mallocz(planes * sizeof(*frame->extended_data));
+        frame->extended_data = av_mallocz_array(planes, sizeof(*frame->extended_data));
         frame->nb_extended_buf = planes - AV_NUM_DATA_POINTERS;
-        frame->extended_buf  = av_mallocz(frame->nb_extended_buf *
+        frame->extended_buf  = av_mallocz_array(frame->nb_extended_buf,
                                           sizeof(*frame->extended_buf));
         if (!frame->extended_data || !frame->extended_buf) {
             av_freep(&frame->extended_data);
             av_freep(&frame->extended_buf);
             return AVERROR(ENOMEM);
         }
-    } else
+    } else {
         frame->extended_data = frame->data;
+        av_assert0(frame->nb_extended_buf == 0);
+    }
 
     for (i = 0; i < FFMIN(planes, AV_NUM_DATA_POINTERS); i++) {
         frame->buf[i] = av_buffer_pool_get(pool->pools[0]);
@@ -1119,13 +1610,21 @@ fail:
 static int video_get_buffer(AVCodecContext *s, AVFrame *pic)
 {
     FramePool *pool = s->internal->pool;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pic->format);
     int i;
 
-    if (pic->data[0]) {
-        av_log(s, AV_LOG_ERROR, "pic->data[0]!=NULL in avcodec_default_get_buffer\n");
+    if (pic->data[0] || pic->data[1] || pic->data[2] || pic->data[3]) {
+        av_log(s, AV_LOG_ERROR, "pic->data[*]!=NULL in avcodec_default_get_buffer\n");
         return -1;
     }
 
+    if (!desc) {
+        av_log(s, AV_LOG_ERROR,
+            "Unable to get pixel format descriptor for format %s\n",
+            av_get_pix_fmt_name(pic->format));
+        return AVERROR(EINVAL);
+    }
+
     memset(pic->data, 0, sizeof(pic->data));
     pic->extended_data = pic->data;
 
@@ -1142,8 +1641,9 @@ static int video_get_buffer(AVCodecContext *s, AVFrame *pic)
         pic->data[i] = NULL;
         pic->linesize[i] = 0;
     }
-    if (pic->data[1] && !pic->data[2])
-        avpriv_set_systematic_pal2((uint32_t *)pic->data[1], s->pix_fmt);
+    if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
+        ((desc->flags & FF_PSEUDOPAL) && pic->data[1]))
+        avpriv_set_systematic_pal2((uint32_t *)pic->data[1], pic->format);
 
     if (s->debug & FF_DEBUG_BUFFERS)
         av_log(s, AV_LOG_DEBUG, "default_get_buffer called on pic %p\n", pic);
@@ -1178,59 +1678,153 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
     }
 }
 
+static int add_metadata_from_side_data(const AVPacket *avpkt, AVFrame *frame)
+{
+    int size;
+    const uint8_t *side_metadata;
+
+    AVDictionary **frame_md = &frame->metadata;
+
+    side_metadata = av_packet_get_side_data(avpkt,
+                                            AV_PKT_DATA_STRINGS_METADATA, &size);
+    return av_packet_unpack_dictionary(side_metadata, size, frame_md);
+}
+
 int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
 {
-    AVPacket *pkt = avctx->internal->last_pkt_props;
+    const AVPacket *pkt = avctx->internal->last_pkt_props;
     int i;
-    struct {
+    static const struct {
         enum AVPacketSideDataType packet;
         enum AVFrameSideDataType frame;
     } sd[] = {
-        { AV_PKT_DATA_REPLAYGAIN ,   AV_FRAME_DATA_REPLAYGAIN },
-        { AV_PKT_DATA_DISPLAYMATRIX, AV_FRAME_DATA_DISPLAYMATRIX },
-        { AV_PKT_DATA_SPHERICAL,     AV_FRAME_DATA_SPHERICAL },
-        { AV_PKT_DATA_STEREO3D,      AV_FRAME_DATA_STEREO3D },
-        { AV_PKT_DATA_AUDIO_SERVICE_TYPE, AV_FRAME_DATA_AUDIO_SERVICE_TYPE },
+        { AV_PKT_DATA_REPLAYGAIN ,                AV_FRAME_DATA_REPLAYGAIN },
+        { AV_PKT_DATA_DISPLAYMATRIX,              AV_FRAME_DATA_DISPLAYMATRIX },
+        { AV_PKT_DATA_SPHERICAL,                  AV_FRAME_DATA_SPHERICAL },
+        { AV_PKT_DATA_STEREO3D,                   AV_FRAME_DATA_STEREO3D },
+        { AV_PKT_DATA_AUDIO_SERVICE_TYPE,         AV_FRAME_DATA_AUDIO_SERVICE_TYPE },
+        { AV_PKT_DATA_MASTERING_DISPLAY_METADATA, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA },
+        { AV_PKT_DATA_CONTENT_LIGHT_LEVEL,        AV_FRAME_DATA_CONTENT_LIGHT_LEVEL },
+        { AV_PKT_DATA_A53_CC,                     AV_FRAME_DATA_A53_CC },
     };
 
-    frame->color_primaries = avctx->color_primaries;
-    frame->color_trc       = avctx->color_trc;
-    frame->colorspace      = avctx->colorspace;
-    frame->color_range     = avctx->color_range;
-    frame->chroma_location = avctx->chroma_sample_location;
-
-    frame->reordered_opaque = avctx->reordered_opaque;
-
+    if (pkt) {
+        frame->pts = pkt->pts;
 #if FF_API_PKT_PTS
 FF_DISABLE_DEPRECATION_WARNINGS
-    frame->pkt_pts = pkt->pts;
+        frame->pkt_pts = pkt->pts;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-    frame->pts     = pkt->pts;
-
-    for (i = 0; i < FF_ARRAY_ELEMS(sd); i++) {
-        int size;
-        uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size);
-        if (packet_sd) {
-            AVFrameSideData *frame_sd = av_frame_new_side_data(frame,
-                                                               sd[i].frame,
-                                                               size);
-            if (!frame_sd)
-                return AVERROR(ENOMEM);
-
-            memcpy(frame_sd->data, packet_sd, size);
+        frame->pkt_pos      = pkt->pos;
+        frame->pkt_duration = pkt->duration;
+        frame->pkt_size     = pkt->size;
+
+        for (i = 0; i < FF_ARRAY_ELEMS(sd); i++) {
+            int size;
+            uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size);
+            if (packet_sd) {
+                AVFrameSideData *frame_sd = av_frame_new_side_data(frame,
+                                                                   sd[i].frame,
+                                                                   size);
+                if (!frame_sd)
+                    return AVERROR(ENOMEM);
+
+                memcpy(frame_sd->data, packet_sd, size);
+            }
+        }
+        add_metadata_from_side_data(pkt, frame);
+
+        if (pkt->flags & AV_PKT_FLAG_DISCARD) {
+            frame->flags |= AV_FRAME_FLAG_DISCARD;
+        } else {
+            frame->flags = (frame->flags & ~AV_FRAME_FLAG_DISCARD);
         }
     }
+    frame->reordered_opaque = avctx->reordered_opaque;
+
+    if (frame->color_primaries == AVCOL_PRI_UNSPECIFIED)
+        frame->color_primaries = avctx->color_primaries;
+    if (frame->color_trc == AVCOL_TRC_UNSPECIFIED)
+        frame->color_trc = avctx->color_trc;
+    if (frame->colorspace == AVCOL_SPC_UNSPECIFIED)
+        frame->colorspace = avctx->colorspace;
+    if (frame->color_range == AVCOL_RANGE_UNSPECIFIED)
+        frame->color_range = avctx->color_range;
+    if (frame->chroma_location == AVCHROMA_LOC_UNSPECIFIED)
+        frame->chroma_location = avctx->chroma_sample_location;
+
+    switch (avctx->codec->type) {
+    case AVMEDIA_TYPE_VIDEO:
+        frame->format              = avctx->pix_fmt;
+        if (!frame->sample_aspect_ratio.num)
+            frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
+
+        if (frame->width && frame->height &&
+            av_image_check_sar(frame->width, frame->height,
+                               frame->sample_aspect_ratio) < 0) {
+            av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
+                   frame->sample_aspect_ratio.num,
+                   frame->sample_aspect_ratio.den);
+            frame->sample_aspect_ratio = (AVRational){ 0, 1 };
+        }
+
+        break;
+    case AVMEDIA_TYPE_AUDIO:
+        if (!frame->sample_rate)
+            frame->sample_rate    = avctx->sample_rate;
+        if (frame->format < 0)
+            frame->format         = avctx->sample_fmt;
+        if (!frame->channel_layout) {
+            if (avctx->channel_layout) {
+                 if (av_get_channel_layout_nb_channels(avctx->channel_layout) !=
+                     avctx->channels) {
+                     av_log(avctx, AV_LOG_ERROR, "Inconsistent channel "
+                            "configuration.\n");
+                     return AVERROR(EINVAL);
+                 }
 
+                frame->channel_layout = avctx->channel_layout;
+            } else {
+                if (avctx->channels > FF_SANE_NB_CHANNELS) {
+                    av_log(avctx, AV_LOG_ERROR, "Too many channels: %d.\n",
+                           avctx->channels);
+                    return AVERROR(ENOSYS);
+                }
+            }
+        }
+        frame->channels = avctx->channels;
+        break;
+    }
     return 0;
 }
 
+static void validate_avframe_allocation(AVCodecContext *avctx, AVFrame *frame)
+{
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+        int i;
+        int num_planes = av_pix_fmt_count_planes(frame->format);
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+        int flags = desc ? desc->flags : 0;
+        if (num_planes == 1 && (flags & AV_PIX_FMT_FLAG_PAL))
+            num_planes = 2;
+        if ((flags & FF_PSEUDOPAL) && frame->data[1])
+            num_planes = 2;
+        for (i = 0; i < num_planes; i++) {
+            av_assert0(frame->data[i]);
+        }
+        // For formats without data like hwaccel allow unused pointers to be non-NULL.
+        for (i = num_planes; num_planes > 0 && i < FF_ARRAY_ELEMS(frame->data); i++) {
+            if (frame->data[i])
+                av_log(avctx, AV_LOG_ERROR, "Buffer returned by get_buffer2() did not zero unused plane pointers\n");
+            frame->data[i] = NULL;
+        }
+    }
+}
+
 static void decode_data_free(void *opaque, uint8_t *data)
 {
     FrameDecodeData *fdd = (FrameDecodeData*)data;
 
-    av_buffer_unref(&fdd->user_opaque_ref);
-
     if (fdd->post_process_opaque_free)
         fdd->post_process_opaque_free(fdd->post_process_opaque);
 
@@ -1240,11 +1834,14 @@ static void decode_data_free(void *opaque, uint8_t *data)
     av_freep(&fdd);
 }
 
-static int attach_decode_data(AVFrame *frame)
+int ff_attach_decode_data(AVFrame *frame)
 {
     AVBufferRef *fdd_buf;
     FrameDecodeData *fdd;
 
+    av_assert1(!frame->private_ref);
+    av_buffer_unref(&frame->private_ref);
+
     fdd = av_mallocz(sizeof(*fdd));
     if (!fdd)
         return AVERROR(ENOMEM);
@@ -1256,72 +1853,34 @@ static int attach_decode_data(AVFrame *frame)
         return AVERROR(ENOMEM);
     }
 
-    fdd->user_opaque_ref = frame->opaque_ref;
-    frame->opaque_ref    = fdd_buf;
+    frame->private_ref = fdd_buf;
 
     return 0;
 }
 
-int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+static int get_buffer_internal(AVCodecContext *avctx, AVFrame *frame, int flags)
 {
     const AVHWAccel *hwaccel = avctx->hwaccel;
     int override_dimensions = 1;
     int ret;
 
-    switch (avctx->codec_type) {
-    case AVMEDIA_TYPE_VIDEO:
-        if (frame->width <= 0 || frame->height <= 0) {
-            frame->width  = FFMAX(avctx->width, avctx->coded_width);
-            frame->height = FFMAX(avctx->height, avctx->coded_height);
-            override_dimensions = 0;
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+        if ((ret = av_image_check_size2(FFALIGN(avctx->width, STRIDE_ALIGN), avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx)) < 0 || avctx->pix_fmt<0) {
+            av_log(avctx, AV_LOG_ERROR, "video_get_buffer: image parameters invalid\n");
+            return AVERROR(EINVAL);
         }
-        if (frame->format < 0)
-            frame->format              = avctx->pix_fmt;
-        if (!frame->sample_aspect_ratio.num)
-            frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
 
-        if (av_image_check_sar(frame->width, frame->height,
-                               frame->sample_aspect_ratio) < 0) {
-            av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
-                   frame->sample_aspect_ratio.num,
-                   frame->sample_aspect_ratio.den);
-            frame->sample_aspect_ratio = (AVRational){ 0, 1 };
+        if (frame->width <= 0 || frame->height <= 0) {
+            frame->width  = FFMAX(avctx->width,  AV_CEIL_RSHIFT(avctx->coded_width,  avctx->lowres));
+            frame->height = FFMAX(avctx->height, AV_CEIL_RSHIFT(avctx->coded_height, avctx->lowres));
+            override_dimensions = 0;
         }
 
-        if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
-            return ret;
-        break;
-    case AVMEDIA_TYPE_AUDIO:
-        if (!frame->sample_rate)
-            frame->sample_rate    = avctx->sample_rate;
-        if (frame->format < 0)
-            frame->format         = avctx->sample_fmt;
-        if (!frame->channel_layout) {
-            if (avctx->channel_layout) {
-                 if (av_get_channel_layout_nb_channels(avctx->channel_layout) !=
-                     avctx->channels) {
-                     av_log(avctx, AV_LOG_ERROR, "Inconsistent channel "
-                            "configuration.\n");
-                     return AVERROR(EINVAL);
-                 }
-
-                frame->channel_layout = avctx->channel_layout;
-            } else {
-                if (avctx->channels > FF_SANE_NB_CHANNELS) {
-                    av_log(avctx, AV_LOG_ERROR, "Too many channels: %d.\n",
-                           avctx->channels);
-                    return AVERROR(ENOSYS);
-                }
-
-                frame->channel_layout = av_get_default_channel_layout(avctx->channels);
-                if (!frame->channel_layout)
-                    frame->channel_layout = (1ULL << avctx->channels) - 1;
-            }
+        if (frame->data[0] || frame->data[1] || frame->data[2] || frame->data[3]) {
+            av_log(avctx, AV_LOG_ERROR, "pic->data[*]!=NULL in get_buffer_internal\n");
+            return AVERROR(EINVAL);
         }
-        break;
-    default: return AVERROR(EINVAL);
     }
-
     ret = ff_decode_frame_props(avctx, frame);
     if (ret < 0)
         return ret;
@@ -1338,7 +1897,9 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
     if (ret < 0)
         goto end;
 
-    ret = attach_decode_data(frame);
+    validate_avframe_allocation(avctx, frame);
+
+    ret = ff_attach_decode_data(frame);
     if (ret < 0)
         goto end;
 
@@ -1355,13 +1916,29 @@ end:
     return ret;
 }
 
-int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
+int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+{
+    int ret = get_buffer_internal(avctx, frame, flags);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        frame->width = frame->height = 0;
+    }
+    return ret;
+}
+
+static int reget_buffer_internal(AVCodecContext *avctx, AVFrame *frame)
 {
     AVFrame *tmp;
     int ret;
 
     av_assert0(avctx->codec_type == AVMEDIA_TYPE_VIDEO);
 
+    if (frame->data[0] && (frame->width != avctx->width || frame->height != avctx->height || frame->format != avctx->pix_fmt)) {
+        av_log(avctx, AV_LOG_WARNING, "Picture changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s in reget buffer()\n",
+               frame->width, frame->height, av_get_pix_fmt_name(frame->format), avctx->width, avctx->height, av_get_pix_fmt_name(avctx->pix_fmt));
+        av_frame_unref(frame);
+    }
+
     if (!frame->data[0])
         return ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
 
@@ -1386,6 +1963,14 @@ int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
     return 0;
 }
 
+int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
+{
+    int ret = reget_buffer_internal(avctx, frame);
+    if (ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    return ret;
+}
+
 static void bsfs_flush(AVCodecContext *avctx)
 {
     DecodeFilterContext *s = &avctx->internal->filter;
@@ -1398,6 +1983,7 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
 {
     avctx->internal->draining      = 0;
     avctx->internal->draining_done = 0;
+    avctx->internal->nb_draining_errors = 0;
     av_frame_unref(avctx->internal->buffer_frame);
     av_frame_unref(avctx->internal->compat_decode_frame);
     av_packet_unref(avctx->internal->buffer_pkt);
@@ -1410,6 +1996,9 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
     else if (avctx->codec->flush)
         avctx->codec->flush(avctx);
 
+    avctx->pts_correction_last_pts =
+    avctx->pts_correction_last_dts = INT64_MIN;
+
     bsfs_flush(avctx);
 
     if (!avctx->refcounted_frames)
diff --git a/libavcodec/decode.h b/libavcodec/decode.h
index 4a76d7a..c3e0e82 100644
--- a/libavcodec/decode.h
+++ b/libavcodec/decode.h
@@ -1,20 +1,20 @@
 /*
  * generic decoding-related code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,15 +29,10 @@
 
 /**
  * This struct stores per-frame lavc-internal data and is attached to it via
- * opaque_ref.
+ * private_ref.
  */
 typedef struct FrameDecodeData {
     /**
-     * The original user-set opaque_ref.
-     */
-    AVBufferRef *user_opaque_ref;
-
-    /**
      * The callback to perform some delayed processing on the frame right
      * before it is returned to the caller.
      *
@@ -81,4 +76,6 @@ void ff_decode_bsfs_uninit(AVCodecContext *avctx);
 int ff_decode_get_hw_frames_ctx(AVCodecContext *avctx,
                                 enum AVHWDeviceType dev_type);
 
+int ff_attach_decode_data(AVFrame *frame);
+
 #endif /* AVCODEC_DECODE_H */
diff --git a/libavcodec/dfa.c b/libavcodec/dfa.c
index 1682eb0..970175f 100644
--- a/libavcodec/dfa.c
+++ b/libavcodec/dfa.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2011 Konstantin Shishkov
  * based on work by Vladimir "VAG" Gneushev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include "bytestream.h"
 #include "internal.h"
 
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 
@@ -37,12 +38,13 @@ typedef struct DfaContext {
 static av_cold int dfa_decode_init(AVCodecContext *avctx)
 {
     DfaContext *s = avctx->priv_data;
-    int ret;
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
-    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
-        return ret;
+    if (!avctx->width || !avctx->height || FFMAX(avctx->width, avctx->height) >= (1<<16))
+        return AVERROR_INVALIDDATA;
+
+    av_assert0(av_image_check_size(avctx->width, avctx->height, 0, avctx) >= 0);
 
     s->frame_buf = av_mallocz(avctx->width * avctx->height);
     if (!s->frame_buf)
@@ -65,11 +67,14 @@ static int decode_tsw1(GetByteContext *gb, uint8_t *frame, int width, int height
     const uint8_t *frame_start = frame;
     const uint8_t *frame_end   = frame + width * height;
     int mask = 0x10000, bitbuf = 0;
-    int v, count, segments;
+    int v, count;
+    unsigned segments;
     unsigned offset;
 
     segments = bytestream2_get_le32(gb);
     offset   = bytestream2_get_le32(gb);
+    if (segments == 0 && offset == frame_end - frame)
+        return 0; // skip frame
     if (frame_end - frame <= offset)
         return AVERROR_INVALIDDATA;
     frame += offset;
@@ -247,13 +252,16 @@ static int decode_wdlt(GetByteContext *gb, uint8_t *frame, int width, int height
         segments = bytestream2_get_le16u(gb);
         while ((segments & 0xC000) == 0xC000) {
             unsigned skip_lines = -(int16_t)segments;
-            unsigned delta = -((int16_t)segments * width);
+            int64_t delta = -((int16_t)segments * (int64_t)width);
             if (frame_end - frame <= delta || y + lines + skip_lines > height)
                 return AVERROR_INVALIDDATA;
             frame    += delta;
             y        += skip_lines;
             segments = bytestream2_get_le16(gb);
         }
+
+        if (frame_end <= frame)
+            return AVERROR_INVALIDDATA;
         if (segments & 0x8000) {
             frame[width - 1] = segments & 0xFF;
             segments = bytestream2_get_le16(gb);
@@ -291,7 +299,7 @@ static int decode_wdlt(GetByteContext *gb, uint8_t *frame, int width, int height
 static int decode_tdlt(GetByteContext *gb, uint8_t *frame, int width, int height)
 {
     const uint8_t *frame_end = frame + width * height;
-    int segments = bytestream2_get_le32(gb);
+    uint32_t segments = bytestream2_get_le32(gb);
     int skip, copy;
 
     while (segments--) {
@@ -340,11 +348,10 @@ static int dfa_decode_frame(AVCodecContext *avctx,
     uint8_t *dst;
     int ret;
     int i, pal_elems;
+    int version = avctx->extradata_size==2 ? AV_RL16(avctx->extradata) : 0;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0))) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
     while (bytestream2_get_bytes_left(&gb) > 0) {
@@ -357,7 +364,7 @@ static int dfa_decode_frame(AVCodecContext *avctx,
             pal_elems = FFMIN(chunk_size / 3, 256);
             for (i = 0; i < pal_elems; i++) {
                 s->pal[i] = bytestream2_get_be24(&gb) << 2;
-                s->pal[i] |= (s->pal[i] >> 6) & 0x333;
+                s->pal[i] |= 0xFFU << 24 | (s->pal[i] >> 6) & 0x30303;
             }
             frame->palette_has_changed = 1;
         } else if (chunk_type <= 9) {
@@ -377,9 +384,17 @@ static int dfa_decode_frame(AVCodecContext *avctx,
     buf = s->frame_buf;
     dst = frame->data[0];
     for (i = 0; i < avctx->height; i++) {
-        memcpy(dst, buf, avctx->width);
+        if(version == 0x100) {
+            int j;
+            for(j = 0; j < avctx->width; j++) {
+                dst[j] = buf[ (i&3)*(avctx->width /4) + (j/4) +
+                             ((j&3)*(avctx->height/4) + (i/4))*avctx->width];
+            }
+        } else {
+            memcpy(dst, buf, avctx->width);
+            buf += avctx->width;
+        }
         dst += frame->linesize[0];
-        buf += avctx->width;
     }
     memcpy(frame->data[1], s->pal, sizeof(s->pal));
 
diff --git a/libavcodec/dirac.c b/libavcodec/dirac.c
index 5faf0a3..d5870d6 100644
--- a/libavcodec/dirac.c
+++ b/libavcodec/dirac.c
@@ -1,34 +1,34 @@
 /*
  * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
  * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Dirac Decoder
- * @author Marco Gerards <marco@gnu.org>
+ * @author Marco Gerards <marco@gnu.org>, David Conrad, Jordi Ortiz <nenjordi@gmail.com>
  */
 
 #include "libavutil/imgutils.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "dirac.h"
 #include "golomb.h"
 #include "internal.h"
@@ -56,7 +56,7 @@ typedef struct dirac_source_params {
     uint8_t color_spec_index;       ///< index into dirac_color_spec_presets[]
 } dirac_source_params;
 
-// defaults for source parameters
+/* defaults for source parameters */
 static const dirac_source_params dirac_source_parameters_defaults[] = {
     {  640,  480, 2, 0, 0,  1, 1,  640,  480, 0, 0, 1, 0 },
     {  176,  120, 2, 0, 0,  9, 2,  176,  120, 0, 0, 1, 1 },
@@ -131,35 +131,38 @@ static const struct {
     { AVCOL_PRI_BT709,     AVCOL_SPC_BT709,   AVCOL_TRC_UNSPECIFIED /* DCinema */ },
 };
 
-/* [DIRAC_STD] Table 10.2 Supported chroma sampling formats + luma Offset */
-static const enum AVPixelFormat dirac_pix_fmt[2][3] = {
-    { AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV420P  },
-    { AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P },
+/* [DIRAC_STD] Table 10.2 Supported chroma sampling formats */
+static const enum AVPixelFormat dirac_pix_fmt[][3] = {
+    {AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12},
+    {AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12},
+    {AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12},
 };
 
 /* [DIRAC_STD] 10.3 Parse Source Parameters.
  * source_parameters(base_video_format) */
-static int parse_source_parameters(AVDiracSeqHeader *dsh, BitstreamContext *bc,
+static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
                                    void *log_ctx)
 {
     AVRational frame_rate = { 0, 0 };
     unsigned luma_depth = 8, luma_offset = 16;
     int idx;
+    int chroma_x_shift, chroma_y_shift;
+    int ret;
 
     /* [DIRAC_STD] 10.3.2 Frame size. frame_size(video_params) */
     /* [DIRAC_STD] custom_dimensions_flag */
-    if (bitstream_read_bit(bc)) {
-        dsh->width  = get_interleaved_ue_golomb(bc); /* [DIRAC_STD] FRAME_WIDTH  */
-        dsh->height = get_interleaved_ue_golomb(bc); /* [DIRAC_STD] FRAME_HEIGHT */
+    if (get_bits1(gb)) {
+        dsh->width  = get_interleaved_ue_golomb(gb); /* [DIRAC_STD] FRAME_WIDTH  */
+        dsh->height = get_interleaved_ue_golomb(gb); /* [DIRAC_STD] FRAME_HEIGHT */
     }
 
     /* [DIRAC_STD] 10.3.3 Chroma Sampling Format.
      *  chroma_sampling_format(video_params) */
     /* [DIRAC_STD] custom_chroma_format_flag */
-    if (bitstream_read_bit(bc))
+    if (get_bits1(gb))
         /* [DIRAC_STD] CHROMA_FORMAT_INDEX */
-        dsh->chroma_format = get_interleaved_ue_golomb(bc);
-    if (dsh->chroma_format > 2) {
+        dsh->chroma_format = get_interleaved_ue_golomb(gb);
+    if (dsh->chroma_format > 2U) {
         if (log_ctx)
             av_log(log_ctx, AV_LOG_ERROR, "Unknown chroma format %d\n",
                    dsh->chroma_format);
@@ -168,24 +171,24 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, BitstreamContext *bc,
 
     /* [DIRAC_STD] 10.3.4 Scan Format. scan_format(video_params) */
     /* [DIRAC_STD] custom_scan_format_flag */
-    if (bitstream_read_bit(bc))
+    if (get_bits1(gb))
         /* [DIRAC_STD] SOURCE_SAMPLING */
-        dsh->interlaced = get_interleaved_ue_golomb(bc);
-    if (dsh->interlaced > 1)
+        dsh->interlaced = get_interleaved_ue_golomb(gb);
+    if (dsh->interlaced > 1U)
         return AVERROR_INVALIDDATA;
 
     /* [DIRAC_STD] 10.3.5 Frame Rate. frame_rate(video_params) */
-    if (bitstream_read_bit(bc)) { /* [DIRAC_STD] custom_frame_rate_flag */
-        dsh->frame_rate_index = get_interleaved_ue_golomb(bc);
+    if (get_bits1(gb)) { /* [DIRAC_STD] custom_frame_rate_flag */
+        dsh->frame_rate_index = get_interleaved_ue_golomb(gb);
 
-        if (dsh->frame_rate_index > 10)
+        if (dsh->frame_rate_index > 10U)
             return AVERROR_INVALIDDATA;
 
         if (!dsh->frame_rate_index) {
             /* [DIRAC_STD] FRAME_RATE_NUMER */
-            frame_rate.num = get_interleaved_ue_golomb(bc);
+            frame_rate.num = get_interleaved_ue_golomb(gb);
             /* [DIRAC_STD] FRAME_RATE_DENOM */
-            frame_rate.den = get_interleaved_ue_golomb(bc);
+            frame_rate.den = get_interleaved_ue_golomb(gb);
         }
     }
     /* [DIRAC_STD] preset_frame_rate(video_params, index) */
@@ -200,16 +203,16 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, BitstreamContext *bc,
 
     /* [DIRAC_STD] 10.3.6 Pixel Aspect Ratio.
      * pixel_aspect_ratio(video_params) */
-    if (bitstream_read_bit(bc)) { /* [DIRAC_STD] custom_pixel_aspect_ratio_flag */
+    if (get_bits1(gb)) { /* [DIRAC_STD] custom_pixel_aspect_ratio_flag */
         /* [DIRAC_STD] index */
-        dsh->aspect_ratio_index = get_interleaved_ue_golomb(bc);
+        dsh->aspect_ratio_index = get_interleaved_ue_golomb(gb);
 
-        if (dsh->aspect_ratio_index > 6)
+        if (dsh->aspect_ratio_index > 6U)
             return AVERROR_INVALIDDATA;
 
         if (!dsh->aspect_ratio_index) {
-            dsh->sample_aspect_ratio.num = get_interleaved_ue_golomb(bc);
-            dsh->sample_aspect_ratio.den = get_interleaved_ue_golomb(bc);
+            dsh->sample_aspect_ratio.num = get_interleaved_ue_golomb(gb);
+            dsh->sample_aspect_ratio.den = get_interleaved_ue_golomb(gb);
         }
     }
     /* [DIRAC_STD] Take value from Table 10.4 Available preset pixel
@@ -219,33 +222,33 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, BitstreamContext *bc,
             dirac_preset_aspect_ratios[dsh->aspect_ratio_index - 1];
 
     /* [DIRAC_STD] 10.3.7 Clean area. clean_area(video_params) */
-    if (bitstream_read_bit(bc)) { /* [DIRAC_STD] custom_clean_area_flag */
+    if (get_bits1(gb)) { /* [DIRAC_STD] custom_clean_area_flag */
         /* [DIRAC_STD] CLEAN_WIDTH */
-        dsh->clean_width = get_interleaved_ue_golomb(bc);
+        dsh->clean_width = get_interleaved_ue_golomb(gb);
         /* [DIRAC_STD] CLEAN_HEIGHT */
-        dsh->clean_height = get_interleaved_ue_golomb(bc);
+        dsh->clean_height = get_interleaved_ue_golomb(gb);
         /* [DIRAC_STD] CLEAN_LEFT_OFFSET */
-        dsh->clean_left_offset = get_interleaved_ue_golomb(bc);
+        dsh->clean_left_offset = get_interleaved_ue_golomb(gb);
         /* [DIRAC_STD] CLEAN_RIGHT_OFFSET */
-        dsh->clean_right_offset = get_interleaved_ue_golomb(bc);
+        dsh->clean_right_offset = get_interleaved_ue_golomb(gb);
     }
 
     /* [DIRAC_STD] 10.3.8 Signal range. signal_range(video_params)
      * WARNING: Some adaptation seems to be done using the
      * AVCOL_RANGE_MPEG/JPEG values */
-    if (bitstream_read_bit(bc)) { /* [DIRAC_STD] custom_signal_range_flag */
+    if (get_bits1(gb)) { /* [DIRAC_STD] custom_signal_range_flag */
         /* [DIRAC_STD] index */
-        dsh->pixel_range_index = get_interleaved_ue_golomb(bc);
+        dsh->pixel_range_index = get_interleaved_ue_golomb(gb);
 
-        if (dsh->pixel_range_index > 4)
+        if (dsh->pixel_range_index > 4U)
             return AVERROR_INVALIDDATA;
 
-        // This assumes either fullrange or MPEG levels only
+        /* This assumes either fullrange or MPEG levels only */
         if (!dsh->pixel_range_index) {
-            luma_offset = get_interleaved_ue_golomb(bc);
-            luma_depth  = av_log2(get_interleaved_ue_golomb(bc)) + 1;
-            get_interleaved_ue_golomb(bc); /* chroma offset    */
-            get_interleaved_ue_golomb(bc); /* chroma excursion */
+            luma_offset = get_interleaved_ue_golomb(gb);
+            luma_depth  = av_log2(get_interleaved_ue_golomb(gb)) + 1;
+            get_interleaved_ue_golomb(gb); /* chroma offset    */
+            get_interleaved_ue_golomb(gb); /* chroma excursion */
             dsh->color_range = luma_offset ? AVCOL_RANGE_MPEG
                                            : AVCOL_RANGE_JPEG;
         }
@@ -258,17 +261,31 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, BitstreamContext *bc,
         dsh->color_range   = pixel_range_presets[idx].color_range;
     }
 
-    if (luma_depth > 8 && log_ctx)
-        av_log(log_ctx, AV_LOG_WARNING, "Bitdepth greater than 8");
+    dsh->bit_depth = luma_depth;
+
+    /* Full range 8 bts uses the same pix_fmts as limited range 8 bits */
+    dsh->pixel_range_index += dsh->pixel_range_index == 1;
 
-    dsh->pix_fmt = dirac_pix_fmt[!luma_offset][dsh->chroma_format];
+    if (dsh->pixel_range_index < 2U)
+        return AVERROR_INVALIDDATA;
+
+    dsh->pix_fmt = dirac_pix_fmt[dsh->chroma_format][dsh->pixel_range_index-2];
+    ret = av_pix_fmt_get_chroma_sub_sample(dsh->pix_fmt, &chroma_x_shift, &chroma_y_shift);
+    if (ret)
+        return ret;
+
+    if ((dsh->width % (1<<chroma_x_shift)) || (dsh->height % (1<<chroma_y_shift))) {
+        if (log_ctx)
+            av_log(log_ctx, AV_LOG_ERROR, "Dimensions must be an integer multiple of the chroma subsampling\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     /* [DIRAC_STD] 10.3.9 Colour specification. colour_spec(video_params) */
-    if (bitstream_read_bit(bc)) { /* [DIRAC_STD] custom_colour_spec_flag */
+    if (get_bits1(gb)) { /* [DIRAC_STD] custom_colour_spec_flag */
         /* [DIRAC_STD] index */
-        idx = dsh->color_spec_index = get_interleaved_ue_golomb(bc);
+        idx = dsh->color_spec_index = get_interleaved_ue_golomb(gb);
 
-        if (dsh->color_spec_index > 4)
+        if (dsh->color_spec_index > 4U)
             return AVERROR_INVALIDDATA;
 
         dsh->color_primaries = dirac_color_presets[idx].color_primaries;
@@ -277,21 +294,21 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, BitstreamContext *bc,
 
         if (!dsh->color_spec_index) {
             /* [DIRAC_STD] 10.3.9.1 Colour primaries */
-            if (bitstream_read_bit(bc)) {
-                idx = get_interleaved_ue_golomb(bc);
-                if (idx < 3)
+            if (get_bits1(gb)) {
+                idx = get_interleaved_ue_golomb(gb);
+                if (idx < 3U)
                     dsh->color_primaries = dirac_primaries[idx];
             }
             /* [DIRAC_STD] 10.3.9.2 Colour matrix */
-            if (bitstream_read_bit(bc)) {
-                idx = get_interleaved_ue_golomb(bc);
+            if (get_bits1(gb)) {
+                idx = get_interleaved_ue_golomb(gb);
                 if (!idx)
                     dsh->colorspace = AVCOL_SPC_BT709;
                 else if (idx == 1)
                     dsh->colorspace = AVCOL_SPC_BT470BG;
             }
             /* [DIRAC_STD] 10.3.9.3 Transfer function */
-            if (bitstream_read_bit(bc) && !get_interleaved_ue_golomb(bc))
+            if (get_bits1(gb) && !get_interleaved_ue_golomb(gb))
                 dsh->color_trc = AVCOL_TRC_BT709;
         }
     } else {
@@ -310,8 +327,7 @@ int av_dirac_parse_sequence_header(AVDiracSeqHeader **pdsh,
                                    void *log_ctx)
 {
     AVDiracSeqHeader *dsh;
-    BitstreamContext bc;
-    unsigned version_major;
+    GetBitContext gb;
     unsigned video_format, picture_coding_mode;
     int ret;
 
@@ -319,32 +335,30 @@ int av_dirac_parse_sequence_header(AVDiracSeqHeader **pdsh,
     if (!dsh)
         return AVERROR(ENOMEM);
 
-    ret = bitstream_init8(&bc, buf, buf_size);
+    ret = init_get_bits8(&gb, buf, buf_size);
     if (ret < 0)
         goto fail;
 
     /* [DIRAC_SPEC] 10.1 Parse Parameters. parse_parameters() */
-    version_major = get_interleaved_ue_golomb(&bc);
-    get_interleaved_ue_golomb(&bc); /* version_minor */
-    dsh->profile = get_interleaved_ue_golomb(&bc);
-    dsh->level   = get_interleaved_ue_golomb(&bc);
+    dsh->version.major = get_interleaved_ue_golomb(&gb);
+    dsh->version.minor = get_interleaved_ue_golomb(&gb);
+    dsh->profile   = get_interleaved_ue_golomb(&gb);
+    dsh->level     = get_interleaved_ue_golomb(&gb);
     /* [DIRAC_SPEC] sequence_header() -> base_video_format as defined in
      * 10.2 Base Video Format, table 10.1 Dirac predefined video formats */
-    video_format = get_interleaved_ue_golomb(&bc);
+    video_format = get_interleaved_ue_golomb(&gb);
 
-    if (log_ctx) {
-        if (version_major < 2)
-            av_log(log_ctx, AV_LOG_WARNING, "Stream is old and may not work\n");
-        else if (version_major > 2)
-            av_log(log_ctx, AV_LOG_WARNING, "Stream may have unhandled features\n");
-    }
+    if (dsh->version.major < 2 && log_ctx)
+        av_log(log_ctx, AV_LOG_WARNING, "Stream is old and may not work\n");
+    else if (dsh->version.major > 2 && log_ctx)
+        av_log(log_ctx, AV_LOG_WARNING, "Stream may have unhandled features\n");
 
-    if (video_format > 20) {
+    if (video_format > 20U) {
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
 
-    // Fill in defaults for the source parameters.
+    /* Fill in defaults for the source parameters. */
     dsh->width              = dirac_source_parameters_defaults[video_format].width;
     dsh->height             = dirac_source_parameters_defaults[video_format].height;
     dsh->chroma_format      = dirac_source_parameters_defaults[video_format].chroma_format;
@@ -361,13 +375,13 @@ int av_dirac_parse_sequence_header(AVDiracSeqHeader **pdsh,
 
     /* [DIRAC_STD] 10.3 Source Parameters
      * Override the defaults. */
-    ret = parse_source_parameters(dsh, &bc, log_ctx);
+    ret = parse_source_parameters(dsh, &gb, log_ctx);
     if (ret < 0)
         goto fail;
 
     /* [DIRAC_STD] picture_coding_mode shall be 0 for fields and 1 for frames
      * currently only used to signal field coding */
-    picture_coding_mode = get_interleaved_ue_golomb(&bc);
+    picture_coding_mode = get_interleaved_ue_golomb(&gb);
     if (picture_coding_mode != 0) {
         if (log_ctx) {
             av_log(log_ctx, AV_LOG_ERROR, "Unsupported picture coding mode %d",
diff --git a/libavcodec/dirac.h b/libavcodec/dirac.h
index 25cefdb..e6d9d34 100644
--- a/libavcodec/dirac.h
+++ b/libavcodec/dirac.h
@@ -1,21 +1,22 @@
 /*
  * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
  * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,10 +27,57 @@
  * @file
  * Interface to Dirac Decoder/Encoder
  * @author Marco Gerards <marco@gnu.org>
+ * @author David Conrad
+ * @author Jordi Ortiz
  */
 
 #include "avcodec.h"
 
+/**
+ * The spec limits the number of wavelet decompositions to 4 for both
+ * level 1 (VC-2) and 128 (long-gop default).
+ * 5 decompositions is the maximum before >16-bit buffers are needed.
+ * Schroedinger allows this for DD 9,7 and 13,7 wavelets only, limiting
+ * the others to 4 decompositions (or 3 for the fidelity filter).
+ *
+ * We use this instead of MAX_DECOMPOSITIONS to save some memory.
+ */
+#define MAX_DWT_LEVELS 5
+
+/**
+ * Parse code values:
+ *
+ * Dirac Specification ->
+ * 9.6.1  Table 9.1
+ *
+ * VC-2 Specification  ->
+ * 10.4.1 Table 10.1
+ */
+
+enum DiracParseCodes {
+    DIRAC_PCODE_SEQ_HEADER      = 0x00,
+    DIRAC_PCODE_END_SEQ         = 0x10,
+    DIRAC_PCODE_AUX             = 0x20,
+    DIRAC_PCODE_PAD             = 0x30,
+    DIRAC_PCODE_PICTURE_CODED   = 0x08,
+    DIRAC_PCODE_PICTURE_RAW     = 0x48,
+    DIRAC_PCODE_PICTURE_LOW_DEL = 0xC8,
+    DIRAC_PCODE_PICTURE_HQ      = 0xE8,
+    DIRAC_PCODE_INTER_NOREF_CO1 = 0x0A,
+    DIRAC_PCODE_INTER_NOREF_CO2 = 0x09,
+    DIRAC_PCODE_INTER_REF_CO1   = 0x0D,
+    DIRAC_PCODE_INTER_REF_CO2   = 0x0E,
+    DIRAC_PCODE_INTRA_REF_CO    = 0x0C,
+    DIRAC_PCODE_INTRA_REF_RAW   = 0x4C,
+    DIRAC_PCODE_INTRA_REF_PICT  = 0xCC,
+    DIRAC_PCODE_MAGIC           = 0x42424344,
+};
+
+typedef struct DiracVersionInfo {
+    int major;
+    int minor;
+} DiracVersionInfo;
+
 typedef struct AVDiracSeqHeader {
     unsigned width;
     unsigned height;
@@ -60,6 +108,9 @@ typedef struct AVDiracSeqHeader {
     enum AVColorPrimaries color_primaries;
     enum AVColorTransferCharacteristic color_trc;
     enum AVColorSpace colorspace;
+
+    DiracVersionInfo version;
+    int bit_depth;
 } AVDiracSeqHeader;
 
 /**
diff --git a/libavcodec/dirac_arith.c b/libavcodec/dirac_arith.c
new file mode 100644
index 0000000..36142fe
--- /dev/null
+++ b/libavcodec/dirac_arith.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Arithmetic decoder for Dirac
+ * @author Marco Gerards <marco@gnu.org>
+ */
+
+#include "dirac_arith.h"
+
+
+const uint16_t ff_dirac_prob[256] = {
+    0,    2,    5,    8,    11,   15,   20,   24,
+    29,   35,   41,   47,   53,   60,   67,   74,
+    82,   89,   97,   106,  114,  123,  132,  141,
+    150,  160,  170,  180,  190,  201,  211,  222,
+    233,  244,  256,  267,  279,  291,  303,  315,
+    327,  340,  353,  366,  379,  392,  405,  419,
+    433,  447,  461,  475,  489,  504,  518,  533,
+    548,  563,  578,  593,  609,  624,  640,  656,
+    672,  688,  705,  721,  738,  754,  771,  788,
+    805,  822,  840,  857,  875,  892,  910,  928,
+    946,  964,  983,  1001, 1020, 1038, 1057, 1076,
+    1095, 1114, 1133, 1153, 1172, 1192, 1211, 1231,
+    1251, 1271, 1291, 1311, 1332, 1352, 1373, 1393,
+    1414, 1435, 1456, 1477, 1498, 1520, 1541, 1562,
+    1584, 1606, 1628, 1649, 1671, 1694, 1716, 1738,
+    1760, 1783, 1806, 1828, 1851, 1874, 1897, 1920,
+    1935, 1942, 1949, 1955, 1961, 1968, 1974, 1980,
+    1985, 1991, 1996, 2001, 2006, 2011, 2016, 2021,
+    2025, 2029, 2033, 2037, 2040, 2044, 2047, 2050,
+    2053, 2056, 2058, 2061, 2063, 2065, 2066, 2068,
+    2069, 2070, 2071, 2072, 2072, 2072, 2072, 2072,
+    2072, 2071, 2070, 2069, 2068, 2066, 2065, 2063,
+    2060, 2058, 2055, 2052, 2049, 2045, 2042, 2038,
+    2033, 2029, 2024, 2019, 2013, 2008, 2002, 1996,
+    1989, 1982, 1975, 1968, 1960, 1952, 1943, 1934,
+    1925, 1916, 1906, 1896, 1885, 1874, 1863, 1851,
+    1839, 1827, 1814, 1800, 1786, 1772, 1757, 1742,
+    1727, 1710, 1694, 1676, 1659, 1640, 1622, 1602,
+    1582, 1561, 1540, 1518, 1495, 1471, 1447, 1422,
+    1396, 1369, 1341, 1312, 1282, 1251, 1219, 1186,
+    1151, 1114, 1077, 1037, 995,  952,  906,  857,
+    805,  750,  690,  625,  553,  471,  376,  255
+};
+
+const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT] = {
+    [CTX_ZPZN_F1]   = CTX_ZP_F2,
+    [CTX_ZPNN_F1]   = CTX_ZP_F2,
+    [CTX_ZP_F2]     = CTX_ZP_F3,
+    [CTX_ZP_F3]     = CTX_ZP_F4,
+    [CTX_ZP_F4]     = CTX_ZP_F5,
+    [CTX_ZP_F5]     = CTX_ZP_F6,
+    [CTX_ZP_F6]     = CTX_ZP_F6,
+    [CTX_NPZN_F1]   = CTX_NP_F2,
+    [CTX_NPNN_F1]   = CTX_NP_F2,
+    [CTX_NP_F2]     = CTX_NP_F3,
+    [CTX_NP_F3]     = CTX_NP_F4,
+    [CTX_NP_F4]     = CTX_NP_F5,
+    [CTX_NP_F5]     = CTX_NP_F6,
+    [CTX_NP_F6]     = CTX_NP_F6,
+    [CTX_DELTA_Q_F] = CTX_DELTA_Q_F,
+};
+
+int16_t ff_dirac_prob_branchless[256][2];
+
+av_cold void ff_dirac_init_arith_tables(void)
+{
+    int i;
+
+    for (i = 0; i < 256; i++) {
+        ff_dirac_prob_branchless[i][0] =  ff_dirac_prob[255-i];
+        ff_dirac_prob_branchless[i][1] = -ff_dirac_prob[i];
+    }
+}
+
+void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length)
+{
+    int i;
+    align_get_bits(gb);
+
+    length = FFMIN(length, get_bits_left(gb)/8);
+
+    c->bytestream     = gb->buffer + get_bits_count(gb)/8;
+    c->bytestream_end = c->bytestream + length;
+    skip_bits_long(gb, length*8);
+
+    c->low = 0;
+    for (i = 0; i < 4; i++) {
+        c->low <<= 8;
+        if (c->bytestream < c->bytestream_end)
+            c->low |= *c->bytestream++;
+        else
+            c->low |= 0xff;
+    }
+
+    c->counter = -16;
+    c->range   = 0xffff;
+    c->error   = 0;
+    c->overread= 0;
+
+    for (i = 0; i < DIRAC_CTX_COUNT; i++)
+        c->contexts[i] = 0x8000;
+}
diff --git a/libavcodec/dirac_arith.h b/libavcodec/dirac_arith.h
new file mode 100644
index 0000000..79526a7
--- /dev/null
+++ b/libavcodec/dirac_arith.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Arithmetic decoder for Dirac
+ * @author Marco Gerards <marco@gnu.org>
+ */
+
+#ifndef AVCODEC_DIRAC_ARITH_H
+#define AVCODEC_DIRAC_ARITH_H
+
+#include "libavutil/x86/asm.h"
+#include "bytestream.h"
+#include "get_bits.h"
+
+enum dirac_arith_contexts {
+    CTX_ZPZN_F1,
+    CTX_ZPNN_F1,
+    CTX_NPZN_F1,
+    CTX_NPNN_F1,
+    CTX_ZP_F2,
+    CTX_ZP_F3,
+    CTX_ZP_F4,
+    CTX_ZP_F5,
+    CTX_ZP_F6,
+    CTX_NP_F2,
+    CTX_NP_F3,
+    CTX_NP_F4,
+    CTX_NP_F5,
+    CTX_NP_F6,
+    CTX_COEFF_DATA,
+    CTX_SIGN_NEG,
+    CTX_SIGN_ZERO,
+    CTX_SIGN_POS,
+    CTX_ZERO_BLOCK,
+    CTX_DELTA_Q_F,
+    CTX_DELTA_Q_DATA,
+    CTX_DELTA_Q_SIGN,
+
+    DIRAC_CTX_COUNT
+};
+
+// Dirac resets the arith decoder between decoding various types of data,
+// so many contexts are never used simultaneously. Thus, we can reduce
+// the number of contexts needed by reusing them.
+#define CTX_SB_F1        CTX_ZP_F5
+#define CTX_SB_DATA      0
+#define CTX_PMODE_REF1   0
+#define CTX_PMODE_REF2   1
+#define CTX_GLOBAL_BLOCK 2
+#define CTX_MV_F1        CTX_ZP_F2
+#define CTX_MV_DATA      0
+#define CTX_DC_F1        CTX_ZP_F5
+#define CTX_DC_DATA      0
+
+typedef struct {
+    unsigned low;
+    uint16_t range;
+    int16_t  counter;
+
+    const uint8_t *bytestream;
+    const uint8_t *bytestream_end;
+
+    uint16_t contexts[DIRAC_CTX_COUNT];
+    int error;
+    int overread;
+} DiracArith;
+
+extern const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT];
+extern const uint16_t ff_dirac_prob[256];
+extern int16_t ff_dirac_prob_branchless[256][2];
+
+static inline void renorm(DiracArith *c)
+{
+#if HAVE_FAST_CLZ
+    int shift = 14 - av_log2_16bit(c->range-1) + ((c->range-1)>>15);
+
+    c->low    <<= shift;
+    c->range  <<= shift;
+    c->counter += shift;
+#else
+    while (c->range <= 0x4000) {
+        c->low   <<= 1;
+        c->range <<= 1;
+        c->counter++;
+    }
+#endif
+}
+
+static inline void refill(DiracArith *c)
+{
+    int counter = c->counter;
+
+    if (counter >= 0) {
+        int new = bytestream_get_be16(&c->bytestream);
+
+        // the spec defines overread bits to be 1, and streams rely on this
+        if (c->bytestream > c->bytestream_end) {
+            new |= 0xff;
+            if (c->bytestream > c->bytestream_end+1)
+                new |= 0xff00;
+
+            c->bytestream = c->bytestream_end;
+            c->overread ++;
+            if (c->overread > 4)
+                c->error = AVERROR_INVALIDDATA;
+        }
+
+        c->low += new << counter;
+        counter -= 16;
+    }
+    c->counter = counter;
+}
+
+static inline int dirac_get_arith_bit(DiracArith *c, int ctx)
+{
+    int prob_zero = c->contexts[ctx];
+    int range_times_prob, bit;
+    unsigned low = c->low;
+    int    range = c->range;
+
+    range_times_prob = (c->range * prob_zero) >> 16;
+
+#if ARCH_X86 && HAVE_FAST_CMOV && HAVE_INLINE_ASM && HAVE_6REGS
+    low   -= range_times_prob << 16;
+    range -= range_times_prob;
+    bit = 0;
+    __asm__(
+        "cmpl   %5, %4 \n\t"
+        "setae  %b0    \n\t"
+        "cmovb  %3, %2 \n\t"
+        "cmovb  %5, %1 \n\t"
+        : "+q"(bit), "+r"(range), "+r"(low)
+        : "r"(c->low), "r"(c->low>>16),
+          "r"(range_times_prob)
+    );
+#else
+    bit = (low >> 16) >= range_times_prob;
+    if (bit) {
+        low   -= range_times_prob << 16;
+        range -= range_times_prob;
+    } else {
+        range  = range_times_prob;
+    }
+#endif
+
+    c->contexts[ctx] += ff_dirac_prob_branchless[prob_zero>>8][bit];
+    c->low   = low;
+    c->range = range;
+
+    renorm(c);
+    refill(c);
+    return bit;
+}
+
+static inline int dirac_get_arith_uint(DiracArith *c, int follow_ctx, int data_ctx)
+{
+    int ret = 1;
+    while (!dirac_get_arith_bit(c, follow_ctx)) {
+        if (ret >= 0x40000000) {
+            av_log(NULL, AV_LOG_ERROR, "dirac_get_arith_uint overflow\n");
+            c->error = AVERROR_INVALIDDATA;
+            return -1;
+        }
+        ret <<= 1;
+        ret += dirac_get_arith_bit(c, data_ctx);
+        follow_ctx = ff_dirac_next_ctx[follow_ctx];
+    }
+    return ret-1;
+}
+
+static inline int dirac_get_arith_int(DiracArith *c, int follow_ctx, int data_ctx)
+{
+    int ret = dirac_get_arith_uint(c, follow_ctx, data_ctx);
+    if (ret && dirac_get_arith_bit(c, data_ctx+1))
+        ret = -ret;
+    return ret;
+}
+
+void ff_dirac_init_arith_tables(void);
+void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length);
+
+#endif /* AVCODEC_DIRAC_ARITH_H */
diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
new file mode 100644
index 0000000..cc08f88
--- /dev/null
+++ b/libavcodec/dirac_dwt.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "dirac_dwt.h"
+
+#define TEMPLATE_8bit
+#include "dirac_dwt_template.c"
+
+#define TEMPLATE_10bit
+#include "dirac_dwt_template.c"
+
+#define TEMPLATE_12bit
+#include "dirac_dwt_template.c"
+
+int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
+                         int decomposition_count, int bit_depth)
+{
+    int ret = 0;
+
+    d->buffer = p->buf;
+    d->width  = p->width;
+    d->height = p->height;
+    d->stride = p->stride;
+    d->temp   = p->tmp;
+    d->decomposition_count = decomposition_count;
+
+    if (bit_depth == 8)
+        ret = ff_spatial_idwt_init_8bit(d, type);
+    else if (bit_depth == 10)
+        ret = ff_spatial_idwt_init_10bit(d, type);
+    else if (bit_depth == 12)
+        ret = ff_spatial_idwt_init_12bit(d, type);
+    else
+        av_log(NULL, AV_LOG_WARNING, "Unsupported bit depth = %i\n", bit_depth);
+
+    if (ret) {
+        av_log(NULL, AV_LOG_ERROR, "Unknown wavelet type %d\n", type);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ARCH_X86 && bit_depth == 8)
+        ff_spatial_idwt_init_x86(d, type);
+    return 0;
+}
+
+void ff_spatial_idwt_slice2(DWTContext *d, int y)
+{
+    int level, support = d->support;
+
+    for (level = d->decomposition_count-1; level >= 0; level--) {
+        int wl = d->width  >> level;
+        int hl = d->height >> level;
+        int stride_l = d->stride << level;
+
+        while (d->cs[level].y <= FFMIN((y>>level)+support, hl))
+            d->spatial_compose(d, level, wl, hl, stride_l);
+    }
+}
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
new file mode 100644
index 0000000..994dc21
--- /dev/null
+++ b/libavcodec/dirac_dwt.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRAC_DWT_H
+#define AVCODEC_DIRAC_DWT_H
+
+#include <stdint.h>
+
+typedef int DWTELEM;
+typedef short IDWTELEM;
+
+#define MAX_DWT_SUPPORT 8
+#define MAX_DECOMPOSITIONS 8
+
+typedef struct DWTCompose {
+    uint8_t *b[MAX_DWT_SUPPORT];
+    int y;
+} DWTCompose;
+
+typedef struct DWTPlane {
+    int width;
+    int height;
+    int stride;
+    uint8_t *buf;
+    uint8_t *buf_base;
+    uint8_t *tmp;
+} DWTPlane;
+
+struct DWTContext;
+
+// Possible prototypes for vertical_compose functions
+typedef void (*vertical_compose_2tap)(uint8_t *b0, uint8_t *b1, int width);
+typedef void (*vertical_compose_3tap)(uint8_t *b0, uint8_t *b1, uint8_t *b2, int width);
+typedef void (*vertical_compose_5tap)(uint8_t *b0, uint8_t *b1, uint8_t *b2, uint8_t *b3, uint8_t *b4, int width);
+typedef void (*vertical_compose_9tap)(uint8_t *dst, uint8_t *b[8], int width);
+
+typedef struct DWTContext {
+    uint8_t *buffer;
+    uint8_t *temp;
+    int width;
+    int height;
+    int stride;
+    int decomposition_count;
+    int support;
+
+    void (*spatial_compose)(struct DWTContext *cs, int level, int width, int height, int stride);
+    void (*vertical_compose_l0)(void);
+    void (*vertical_compose_h0)(void);
+    void (*vertical_compose_l1)(void);
+    void (*vertical_compose_h1)(void);
+    void (*vertical_compose)(void);     ///< one set of lowpass and highpass combined
+    void (*horizontal_compose)(uint8_t *b, uint8_t *tmp, int width);
+
+    DWTCompose cs[MAX_DECOMPOSITIONS];
+} DWTContext;
+
+enum dwt_type {
+    DWT_SNOW_DAUB9_7,
+    DWT_SNOW_LEGALL5_3,
+    DWT_DIRAC_DD9_7,
+    DWT_DIRAC_LEGALL5_3,
+    DWT_DIRAC_DD13_7,
+    DWT_DIRAC_HAAR0,
+    DWT_DIRAC_HAAR1,
+    DWT_DIRAC_FIDELITY,
+    DWT_DIRAC_DAUB9_7,
+    DWT_NUM_TYPES
+};
+
+// -1 if an error occurred, e.g. the dwt_type isn't recognized
+int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
+                         int decomposition_count, int bit_depth);
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
+
+void ff_spatial_idwt_slice2(DWTContext *d, int y);
+
+// shared stuff for simd optimizations
+#define COMPOSE_53iL0(b0, b1, b2)\
+    (b1 - (unsigned)((int)(b0 + (unsigned)(b2) + 2) >> 2))
+
+#define COMPOSE_DIRAC53iH0(b0, b1, b2)\
+    (b1 + (unsigned)((int)(b0 + (unsigned)(b2) + 1) >> 1))
+
+#define COMPOSE_DD97iH0(b0, b1, b2, b3, b4)\
+    (int)(((unsigned)(b2) + ((int)(9U*b1 + 9U*b3 - b4 - b0 +  8) >> 4)))
+
+#define COMPOSE_DD137iL0(b0, b1, b2, b3, b4)\
+    (int)(((unsigned)(b2) - ((int)(9U*b1 + 9U*b3 - b4 - b0 + 16) >> 5)))
+
+#define COMPOSE_HAARiL0(b0, b1)\
+    ((int)(b0 - (unsigned)((int)(b1 + 1U) >> 1)))
+
+#define COMPOSE_HAARiH0(b0, b1)\
+    ((int)(b0 + (unsigned)(b1)))
+
+#define COMPOSE_FIDELITYiL0(b0, b1, b2, b3, b4, b5, b6, b7, b8)\
+    ((unsigned)b4 - ((int)(-8*(b0+(unsigned)b8) + 21*(b1+(unsigned)b7) - 46*(b2+(unsigned)b6) + 161*(b3+(unsigned)b5) + 128) >> 8))
+
+#define COMPOSE_FIDELITYiH0(b0, b1, b2, b3, b4, b5, b6, b7, b8)\
+    ((unsigned)b4 + ((int)(-2*(b0+(unsigned)b8) + 10*(b1+(unsigned)b7) - 25*(b2+(unsigned)b6) +  81*(b3+(unsigned)b5) + 128) >> 8))
+
+#define COMPOSE_DAUB97iL1(b0, b1, b2)\
+    ((unsigned)(b1) - ((int)(1817*(b0 + (unsigned)b2) + 2048) >> 12))
+
+#define COMPOSE_DAUB97iH1(b0, b1, b2)\
+    ((unsigned)(b1) - ((int)( 113*(b0 + (unsigned)b2) + 64) >> 7))
+
+#define COMPOSE_DAUB97iL0(b0, b1, b2)\
+    ((unsigned)(b1) + ((int)( 217*(b0 + (unsigned)b2) + 2048) >> 12))
+
+#define COMPOSE_DAUB97iH0(b0, b1, b2)\
+    ((unsigned)(b1) + ((int)(6497*(b0 + (unsigned)b2) + 2048) >> 12))
+
+
+#endif /* AVCODEC_DWT_H */
diff --git a/libavcodec/dirac_dwt_template.c b/libavcodec/dirac_dwt_template.c
new file mode 100644
index 0000000..5d55d93
--- /dev/null
+++ b/libavcodec/dirac_dwt_template.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ * Copyright (C) 2015 Open Broadcast Systems Ltd.
+ * Author    (C) 2015 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#if defined(TEMPLATE_8bit)
+
+#    define RENAME(N)   N ## _8bit
+#    define TYPE        int16_t
+#    undef  TEMPLATE_8bit
+
+#elif defined(TEMPLATE_10bit)
+
+#    define RENAME(N)   N ## _10bit
+#    define TYPE        int32_t
+#    undef  TEMPLATE_10bit
+
+#elif defined(TEMPLATE_12bit)
+
+#    define RENAME(N)   N ## _12bit
+#    define TYPE        int32_t
+#    undef  TEMPLATE_12bit
+
+#endif
+
+static void RENAME(vertical_compose53iL0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                          int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    for (i = 0; i < width; i++)
+        b1[i] -= (unsigned)((int)(b0[i] + (unsigned)b2[i] + 2) >> 2);
+}
+
+static av_always_inline void RENAME(interleave)(TYPE *dst, TYPE *src0, TYPE *src1, int w2,
+                                                int add, int shift)
+{
+    int i;
+    for (i = 0; i < w2; i++) {
+        dst[2*i  ] = ((int)(src0[i] + (unsigned)add)) >> shift;
+        dst[2*i+1] = ((int)(src1[i] + (unsigned)add)) >> shift;
+    }
+}
+
+static void RENAME(horizontal_compose_dirac53i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    int x;
+    const int w2 = w >> 1;
+    TYPE *b     = (TYPE *)_b;
+    TYPE *temp  = (TYPE *)_temp;
+
+    temp[0] = COMPOSE_53iL0(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++) {
+        temp[x     ] = COMPOSE_53iL0     (b[x+w2-1], b[x     ], b[x+w2]);
+        temp[x+w2-1] = COMPOSE_DIRAC53iH0(temp[x-1], b[x+w2-1], temp[x]);
+    }
+    temp[w-1] = COMPOSE_DIRAC53iH0(temp[w2-1], b[w-1], temp[w2-1]);
+
+    RENAME(interleave)(b, temp, temp+w2, w2, 1, 1);
+}
+
+static void RENAME(horizontal_compose_dd97i)(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    int x;
+    const int w2 = w >> 1;
+    TYPE *b   = (TYPE *)_b;
+    TYPE *tmp = (TYPE *)_tmp;
+
+    tmp[0] = COMPOSE_53iL0(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++)
+        tmp[x] = COMPOSE_53iL0(b[x+w2-1], b[x], b[x+w2]);
+
+    // extend the edges
+    tmp[-1]   = tmp[0];
+    tmp[w2+1] = tmp[w2] = tmp[w2-1];
+
+    for (x = 0; x < w2; x++) {
+        b[2*x  ] = ((int)(tmp[x] + 1U))>>1;
+        b[2*x+1] = ((int)(COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1U))>>1;
+    }
+}
+
+static void RENAME(horizontal_compose_dd137i)(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    const int w2 = w >> 1;
+    int x;
+    TYPE *b   = (TYPE *)_b;
+    TYPE *tmp = (TYPE *)_tmp;
+
+    tmp[0] = COMPOSE_DD137iL0(b[w2], b[w2], b[0], b[w2  ], b[w2+1]);
+    tmp[1] = COMPOSE_DD137iL0(b[w2], b[w2], b[1], b[w2+1], b[w2+2]);
+    for (x = 2; x < w2-1; x++)
+        tmp[x] = COMPOSE_DD137iL0(b[x+w2-2], b[x+w2-1], b[x], b[x+w2], b[x+w2+1]);
+    tmp[w2-1] = COMPOSE_DD137iL0(b[w-3], b[w-2], b[w2-1], b[w-1], b[w-1]);
+
+    // extend the edges
+    tmp[-1]   = tmp[0];
+    tmp[w2+1] = tmp[w2] = tmp[w2-1];
+
+    for (x = 0; x < w2; x++) {
+        b[2*x  ] = ((int)(tmp[x] + 1U))>>1;
+        b[2*x+1] = ((int)(COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1U))>>1;
+    }
+}
+
+static av_always_inline void RENAME(horizontal_compose_haari)(TYPE *b, TYPE *temp,
+                                                              int w, int shift)
+{
+    const int w2 = w >> 1;
+    int x;
+
+    for (x = 0; x < w2; x++) {
+        temp[x   ] = COMPOSE_HAARiL0(b[x   ], b[x+w2]);
+        temp[x+w2] = COMPOSE_HAARiH0(b[x+w2], temp[x]);
+    }
+
+    RENAME(interleave)(b, temp, temp+w2, w2, shift, shift);
+}
+
+static void RENAME(horizontal_compose_haar0i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    TYPE *b    = (TYPE *)_b;
+    TYPE *temp = (TYPE *)_temp;
+    RENAME(horizontal_compose_haari)(b, temp, w, 0);
+}
+
+static void RENAME(horizontal_compose_haar1i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    TYPE *b    = (TYPE *)_b;
+    TYPE *temp = (TYPE *)_temp;
+    RENAME(horizontal_compose_haari)(b, temp, w, 1);
+}
+
+static void RENAME(horizontal_compose_fidelityi)(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    const int w2 = w >> 1;
+    int i, x;
+    TYPE v[8];
+    TYPE *b   = (TYPE *)_b;
+    TYPE *tmp = (TYPE *)_tmp;
+
+    for (x = 0; x < w2; x++) {
+        for (i = 0; i < 8; i++)
+            v[i] = b[av_clip(x-3+i, 0, w2-1)];
+        tmp[x] = COMPOSE_FIDELITYiH0(v[0], v[1], v[2], v[3], b[x+w2], v[4], v[5], v[6], v[7]);
+    }
+
+    for (x = 0; x < w2; x++) {
+        for (i = 0; i < 8; i++)
+            v[i] = tmp[av_clip(x-4+i, 0, w2-1)];
+        tmp[x+w2] = COMPOSE_FIDELITYiL0(v[0], v[1], v[2], v[3], b[x], v[4], v[5], v[6], v[7]);
+    }
+
+    RENAME(interleave)(b, tmp+w2, tmp, w2, 0, 0);
+}
+
+static void RENAME(horizontal_compose_daub97i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    const int w2 = w >> 1;
+    int x, b0, b1, b2;
+    TYPE *b    = (TYPE *)_b;
+    TYPE *temp = (TYPE *)_temp;
+
+    temp[0] = COMPOSE_DAUB97iL1(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++) {
+        temp[x     ] = COMPOSE_DAUB97iL1(b[x+w2-1], b[x     ], b[x+w2]);
+        temp[x+w2-1] = COMPOSE_DAUB97iH1(temp[x-1], b[x+w2-1], temp[x]);
+    }
+    temp[w-1] = COMPOSE_DAUB97iH1(temp[w2-1], b[w-1], temp[w2-1]);
+
+    // second stage combined with interleave and shift
+    b0 = b2 = COMPOSE_DAUB97iL0(temp[w2], temp[0], temp[w2]);
+    b[0] = ~((~b0) >> 1);
+    for (x = 1; x < w2; x++) {
+        b2 = COMPOSE_DAUB97iL0(temp[x+w2-1], temp[x     ], temp[x+w2]);
+        b1 = COMPOSE_DAUB97iH0(          b0, temp[x+w2-1], b2        );
+        b[2*x-1] = ~((~b1) >> 1);
+        b[2*x  ] = ~((~b2) >> 1);
+        b0 = b2;
+    }
+    b[w-1] = ~((~COMPOSE_DAUB97iH0(b2, temp[w-1], b2)) >> 1);
+}
+
+static void RENAME(vertical_compose_dirac53iH0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                                int width)
+{
+    int i;
+    TYPE *b0  = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_dd97iH0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                             uint8_t *_b3, uint8_t *_b4, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    TYPE *b3 = (TYPE *)_b3;
+    TYPE *b4 = (TYPE *)_b4;
+    for(i=0; i<width; i++){
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+    }
+}
+
+static void RENAME(vertical_compose_dd137iL0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                              uint8_t *_b3, uint8_t *_b4, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    TYPE *b3 = (TYPE *)_b3;
+    TYPE *b4 = (TYPE *)_b4;
+    for(i=0; i<width; i++){
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+    }
+}
+
+static void RENAME(vertical_compose_haar)(uint8_t *_b0, uint8_t *_b1, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+
+    for (i = 0; i < width; i++) {
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]);
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]);
+    }
+}
+
+static void RENAME(vertical_compose_fidelityiH0)(uint8_t *_dst, uint8_t *_b[8], int width)
+{
+    int i;
+    TYPE *dst = (TYPE *)_dst;
+    TYPE *b0  = (TYPE *)_b[0];
+    TYPE *b1  = (TYPE *)_b[1];
+    TYPE *b2  = (TYPE *)_b[2];
+    TYPE *b3  = (TYPE *)_b[3];
+    TYPE *b4  = (TYPE *)_b[4];
+    TYPE *b5  = (TYPE *)_b[5];
+    TYPE *b6  = (TYPE *)_b[6];
+    TYPE *b7  = (TYPE *)_b[7];
+    for(i=0; i<width; i++){
+        dst[i] = COMPOSE_FIDELITYiH0(b0[i], b1[i], b2[i], b3[i], dst[i], b4[i], b5[i], b6[i], b7[i]);
+    }
+}
+
+static void RENAME(vertical_compose_fidelityiL0)(uint8_t *_dst, uint8_t *_b[8], int width)
+{
+    int i;
+    TYPE *dst = (TYPE *)_dst;
+    TYPE *b0  = (TYPE *)_b[0];
+    TYPE *b1  = (TYPE *)_b[1];
+    TYPE *b2  = (TYPE *)_b[2];
+    TYPE *b3  = (TYPE *)_b[3];
+    TYPE *b4  = (TYPE *)_b[4];
+    TYPE *b5  = (TYPE *)_b[5];
+    TYPE *b6  = (TYPE *)_b[6];
+    TYPE *b7  = (TYPE *)_b[7];
+
+    for(i=0; i<width; i++){
+        dst[i] = COMPOSE_FIDELITYiL0(b0[i], b1[i], b2[i], b3[i], dst[i], b4[i], b5[i], b6[i], b7[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iH0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iH1)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iH1(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iL0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iL0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iL1)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iL1(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(spatial_compose_dd97i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_5tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+
+    int i, y = cs->y;
+    uint8_t *b[8];
+    for (i = 0; i < 6; i++)
+        b[i] = cs->b[i];
+    b[6] = d->buffer + av_clip(y+5, 0, height-2)*stride;
+    b[7] = d->buffer + av_clip(y+6, 1, height-1)*stride;
+
+    if(y+5<(unsigned)height) vertical_compose_l0(      b[5], b[6], b[7],       width);
+    if(y+1<(unsigned)height) vertical_compose_h0(b[0], b[2], b[3], b[4], b[6], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    for (i = 0; i < 6; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+static void RENAME(spatial_compose_dirac53i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_3tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+
+    int y= cs->y;
+    uint8_t *b[4] = { cs->b[0], cs->b[1] };
+    b[2] = d->buffer + avpriv_mirror(y+1, height-1)*stride;
+    b[3] = d->buffer + avpriv_mirror(y+2, height-1)*stride;
+
+    if(y+1<(unsigned)height) vertical_compose_l0(b[1], b[2], b[3], width);
+    if(y+0<(unsigned)height) vertical_compose_h0(b[0], b[1], b[2], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    cs->b[0] = b[2];
+    cs->b[1] = b[3];
+    cs->y += 2;
+}
+
+static void RENAME(spatial_compose_dd137i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_5tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_5tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+
+    int i, y = cs->y;
+    uint8_t *b[10];
+    for (i = 0; i < 8; i++)
+        b[i] = cs->b[i];
+    b[8] = d->buffer + av_clip(y+7, 0, height-2)*stride;
+    b[9] = d->buffer + av_clip(y+8, 1, height-1)*stride;
+
+    if(y+5<(unsigned)height) vertical_compose_l0(b[3], b[5], b[6], b[7], b[9], width);
+    if(y+1<(unsigned)height) vertical_compose_h0(b[0], b[2], b[3], b[4], b[6], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    for (i = 0; i < 8; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+// haar makes the assumption that height is even (always true for dirac)
+static void RENAME(spatial_compose_haari_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_2tap vertical_compose = (void*)d->vertical_compose;
+    int y = d->cs[level].y;
+    uint8_t *b0 = d->buffer + (y-1)*stride;
+    uint8_t *b1 = d->buffer + (y  )*stride;
+
+    vertical_compose(b0, b1, width);
+    d->horizontal_compose(b0, d->temp, width);
+    d->horizontal_compose(b1, d->temp, width);
+
+    d->cs[level].y += 2;
+}
+
+// Don't do sliced idwt for fidelity; the 9 tap filter makes it a bit annoying
+// Fortunately, this filter isn't used in practice.
+static void RENAME(spatial_compose_fidelity)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_9tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_9tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    int i, y;
+    uint8_t *b[8];
+
+    for (y = 1; y < height; y += 2) {
+        for (i = 0; i < 8; i++)
+            b[i] = d->buffer + av_clip((y-7 + 2*i), 0, height-2)*stride;
+        vertical_compose_h0(d->buffer + y*stride, b, width);
+    }
+
+    for (y = 0; y < height; y += 2) {
+        for (i = 0; i < 8; i++)
+            b[i] = d->buffer + av_clip((y-7 + 2*i), 1, height-1)*stride;
+        vertical_compose_l0(d->buffer + y*stride, b, width);
+    }
+
+    for (y = 0; y < height; y++)
+        d->horizontal_compose(d->buffer + y*stride, d->temp, width);
+
+    d->cs[level].y = height+1;
+}
+
+static void RENAME(spatial_compose_daub97i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_3tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    vertical_compose_3tap vertical_compose_l1 = (void*)d->vertical_compose_l1;
+    vertical_compose_3tap vertical_compose_h1 = (void*)d->vertical_compose_h1;
+    DWTCompose *cs = d->cs + level;
+
+    int i, y = cs->y;
+    uint8_t *b[6];
+    for (i = 0; i < 4; i++)
+        b[i] = cs->b[i];
+    b[4] = d->buffer + avpriv_mirror(y+3, height-1)*stride;
+    b[5] = d->buffer + avpriv_mirror(y+4, height-1)*stride;
+
+    if(y+3<(unsigned)height) vertical_compose_l1(b[3], b[4], b[5], width);
+    if(y+2<(unsigned)height) vertical_compose_h1(b[2], b[3], b[4], width);
+    if(y+1<(unsigned)height) vertical_compose_l0(b[1], b[2], b[3], width);
+    if(y+0<(unsigned)height) vertical_compose_h0(b[0], b[1], b[2], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    for (i = 0; i < 4; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+static void RENAME(spatial_compose97i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + avpriv_mirror(-3-1, height-1)*stride;
+    cs->b[1] = buffer + avpriv_mirror(-3  , height-1)*stride;
+    cs->b[2] = buffer + avpriv_mirror(-3+1, height-1)*stride;
+    cs->b[3] = buffer + avpriv_mirror(-3+2, height-1)*stride;
+    cs->y = -3;
+}
+
+static void RENAME(spatial_compose53i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + avpriv_mirror(-1-1, height-1)*stride;
+    cs->b[1] = buffer + avpriv_mirror(-1  , height-1)*stride;
+    cs->y = -1;
+}
+
+static void RENAME(spatial_compose_dd97i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + av_clip(-5-1, 0, height-2)*stride;
+    cs->b[1] = buffer + av_clip(-5  , 1, height-1)*stride;
+    cs->b[2] = buffer + av_clip(-5+1, 0, height-2)*stride;
+    cs->b[3] = buffer + av_clip(-5+2, 1, height-1)*stride;
+    cs->b[4] = buffer + av_clip(-5+3, 0, height-2)*stride;
+    cs->b[5] = buffer + av_clip(-5+4, 1, height-1)*stride;
+    cs->y = -5;
+}
+
+static void RENAME(spatial_compose_dd137i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + av_clip(-5-1, 0, height-2)*stride;
+    cs->b[1] = buffer + av_clip(-5  , 1, height-1)*stride;
+    cs->b[2] = buffer + av_clip(-5+1, 0, height-2)*stride;
+    cs->b[3] = buffer + av_clip(-5+2, 1, height-1)*stride;
+    cs->b[4] = buffer + av_clip(-5+3, 0, height-2)*stride;
+    cs->b[5] = buffer + av_clip(-5+4, 1, height-1)*stride;
+    cs->b[6] = buffer + av_clip(-5+5, 0, height-2)*stride;
+    cs->b[7] = buffer + av_clip(-5+6, 1, height-1)*stride;
+    cs->y = -5;
+}
+
+static int RENAME(ff_spatial_idwt_init)(DWTContext *d, enum dwt_type type)
+{
+    int level;
+
+    d->temp = (uint8_t *)(((TYPE *)d->temp) + 8);
+
+    for (level = d->decomposition_count - 1; level >= 0; level--){
+        int hl = d->height >> level;
+        int stride_l = d->stride << level;
+
+        switch(type){
+            case DWT_DIRAC_DD9_7:
+                RENAME(spatial_compose_dd97i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            case DWT_DIRAC_LEGALL5_3:
+                RENAME(spatial_compose53i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            case DWT_DIRAC_DD13_7:
+                RENAME(spatial_compose_dd137i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            case DWT_DIRAC_HAAR0:
+            case DWT_DIRAC_HAAR1:
+                d->cs[level].y = 1;
+                break;
+            case DWT_DIRAC_DAUB9_7:
+                RENAME(spatial_compose97i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            default:
+                d->cs[level].y = 0;
+                break;
+        }
+    }
+
+    switch (type) {
+        case DWT_DIRAC_DD9_7:
+            d->spatial_compose = RENAME(spatial_compose_dd97i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose53iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_dd97iH0);
+            d->horizontal_compose = RENAME(horizontal_compose_dd97i);
+            d->support = 7;
+            break;
+        case DWT_DIRAC_LEGALL5_3:
+            d->spatial_compose = RENAME(spatial_compose_dirac53i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose53iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_dirac53iH0);
+            d->horizontal_compose = RENAME(horizontal_compose_dirac53i);
+            d->support = 3;
+            break;
+        case DWT_DIRAC_DD13_7:
+            d->spatial_compose = RENAME(spatial_compose_dd137i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose_dd137iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_dd97iH0);
+            d->horizontal_compose = RENAME(horizontal_compose_dd137i);
+            d->support = 7;
+            break;
+        case DWT_DIRAC_HAAR0:
+        case DWT_DIRAC_HAAR1:
+            d->spatial_compose = RENAME(spatial_compose_haari_dy);
+            d->vertical_compose = (void*)RENAME(vertical_compose_haar);
+            if (type == DWT_DIRAC_HAAR0)
+                d->horizontal_compose = RENAME(horizontal_compose_haar0i);
+            else
+                d->horizontal_compose = RENAME(horizontal_compose_haar1i);
+            d->support = 1;
+            break;
+        case DWT_DIRAC_FIDELITY:
+            d->spatial_compose = RENAME(spatial_compose_fidelity);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose_fidelityiL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_fidelityiH0);
+            d->horizontal_compose = RENAME(horizontal_compose_fidelityi);
+            d->support = 0; // not really used
+            break;
+        case DWT_DIRAC_DAUB9_7:
+            d->spatial_compose = RENAME(spatial_compose_daub97i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose_daub97iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_daub97iH0);
+            d->vertical_compose_l1 = (void*)RENAME(vertical_compose_daub97iL1);
+            d->vertical_compose_h1 = (void*)RENAME(vertical_compose_daub97iH1);
+            d->horizontal_compose = RENAME(horizontal_compose_daub97i);
+            d->support = 5;
+            break;
+        default:
+            return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+#undef RENAME
+#undef TYPE
diff --git a/libavcodec/dirac_parser.c b/libavcodec/dirac_parser.c
index 5c9d266..1ade44a 100644
--- a/libavcodec/dirac_parser.c
+++ b/libavcodec/dirac_parser.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2007-2008 Marco Gerards <marco@gnu.org>
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -100,17 +100,36 @@ typedef struct DiracParseUnit {
 static int unpack_parse_unit(DiracParseUnit *pu, DiracParseContext *pc,
                              int offset)
 {
-    uint8_t *start = pc->buffer + offset;
-    uint8_t *end   = pc->buffer + pc->index;
-    if (start < pc->buffer || (start + 13 > end))
+    int i;
+    int8_t *start;
+    static const uint8_t valid_pu_types[] = {
+        0x00, 0x10, 0x20, 0x30, 0x08, 0x48, 0xC8, 0xE8, 0x0A, 0x0C, 0x0D, 0x0E,
+        0x4C, 0x09, 0xCC, 0x88, 0xCB
+    };
+
+    if (offset < 0 || pc->index - 13 < offset)
         return 0;
+
+    start = pc->buffer + offset;
     pu->pu_type = start[4];
 
     pu->next_pu_offset = AV_RB32(start + 5);
     pu->prev_pu_offset = AV_RB32(start + 9);
 
-    if (pu->pu_type == 0x10 && pu->next_pu_offset == 0)
-        pu->next_pu_offset = 13;
+    /* Check for valid parse code */
+    for (i = 0; i < 17; i++)
+        if (valid_pu_types[i] == pu->pu_type)
+            break;
+    if (i == 17)
+        return 0;
+
+    if (pu->pu_type == 0x10 && pu->next_pu_offset == 0x00)
+        pu->next_pu_offset = 13; /* The length of a parse info header */
+
+    /* Check if the parse offsets are somewhat sane */
+    if ((pu->next_pu_offset && pu->next_pu_offset < 13) ||
+        (pu->prev_pu_offset && pu->prev_pu_offset < 13))
+        return 0;
 
     return 1;
 }
@@ -123,7 +142,7 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
     DiracParseContext *pc = s->priv_data;
 
     if (pc->overread_index) {
-        memcpy(pc->buffer, pc->buffer + pc->overread_index,
+        memmove(pc->buffer, pc->buffer + pc->overread_index,
                pc->index - pc->overread_index);
         pc->index         -= pc->overread_index;
         pc->overread_index = 0;
@@ -139,6 +158,8 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         void *new_buffer =
             av_fast_realloc(pc->buffer, &pc->buffer_size,
                             pc->index + (*buf_size - pc->sync_offset));
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
         pc->buffer = new_buffer;
         memcpy(pc->buffer + pc->index, (*buf + pc->sync_offset),
                *buf_size - pc->sync_offset);
@@ -149,6 +170,8 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         DiracParseUnit pu1, pu;
         void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
                                            pc->index + next);
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
         pc->buffer = new_buffer;
         memcpy(pc->buffer + pc->index, *buf, next);
         pc->index += next;
@@ -161,7 +184,9 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
          * we can be pretty sure that we have a valid parse unit */
         if (!unpack_parse_unit(&pu1, pc, pc->index - 13)                     ||
             !unpack_parse_unit(&pu, pc, pc->index - 13 - pu1.prev_pu_offset) ||
-            pu.next_pu_offset != pu1.prev_pu_offset) {
+            pu.next_pu_offset != pu1.prev_pu_offset                          ||
+            pc->index < pc->dirac_unit_size + 13LL + pu1.prev_pu_offset
+        ) {
             pc->index              -= 9;
             *buf_size               = next - 9;
             pc->header_bytes_needed = 9;
@@ -184,7 +209,7 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         }
 
         /* Get the picture number to set the pts and dts*/
-        if (parse_timing_info) {
+        if (parse_timing_info && pu1.prev_pu_offset >= 13) {
             uint8_t *cur_pu = pc->buffer +
                               pc->index - 13 - pu1.prev_pu_offset;
             int pts = AV_RB32(cur_pu + 13);
@@ -245,7 +270,7 @@ static void dirac_parse_close(AVCodecParserContext *s)
     DiracParseContext *pc = s->priv_data;
 
     if (pc->buffer_size > 0)
-        av_free(pc->buffer);
+        av_freep(&pc->buffer);
 }
 
 AVCodecParser ff_dirac_parser = {
diff --git a/libavcodec/dirac_vlc.c b/libavcodec/dirac_vlc.c
new file mode 100644
index 0000000..496d817
--- /dev/null
+++ b/libavcodec/dirac_vlc.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dirac_vlc.h"
+
+#define LUT_SIZE   (1 << LUT_BITS)
+#define RSIZE_BITS (CHAR_BIT*sizeof(residual))
+
+#define CONVERT_TO_RESIDUE(a, b)                                               \
+    (((residual)(a)) << (RSIZE_BITS - (b)))
+
+#define INIT_RESIDUE(N)                                                        \
+    residual N = 0;                                                            \
+    av_unused int32_t N ## _bits  = 0
+
+#define SET_RESIDUE(N, I, B)                                                   \
+    N          = CONVERT_TO_RESIDUE(I, B);                                     \
+    N ## _bits = B
+
+#define APPEND_RESIDUE(N, M)                                                   \
+    N          |= M >> (N ## _bits);                                           \
+    N ## _bits  = (N ## _bits + (M ## _bits)) & 0x3F
+
+int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *_dst, int coeffs)
+{
+    int i, b, c_idx = 0;
+    int32_t *dst = (int32_t *)_dst;
+    DiracGolombLUT *future[4], *l = &lut_ctx[2*LUT_SIZE + buf[0]];
+    INIT_RESIDUE(res);
+
+    for (b = 1; b <= bytes; b++) {
+        future[0] = &lut_ctx[buf[b]];
+        future[1] = future[0] + 1*LUT_SIZE;
+        future[2] = future[0] + 2*LUT_SIZE;
+        future[3] = future[0] + 3*LUT_SIZE;
+
+        if ((c_idx + 1) > coeffs)
+            return c_idx;
+
+        /* res_bits is a hint for better branch prediction */
+        if (res_bits && l->sign) {
+            int32_t coeff = 1;
+            APPEND_RESIDUE(res, l->preamble);
+            for (i = 0; i < (res_bits >> 1) - 1; i++) {
+                coeff <<= 1;
+                coeff |= (res >> (RSIZE_BITS - 2*i - 2)) & 1;
+            }
+            dst[c_idx++] = l->sign * (coeff - 1);
+            res_bits = res = 0;
+        }
+
+        memcpy(&dst[c_idx], l->ready, LUT_BITS*sizeof(int32_t));
+        c_idx += l->ready_num;
+
+        APPEND_RESIDUE(res, l->leftover);
+
+        l = future[l->need_s ? 3 : !res_bits ? 2 : res_bits & 1];
+    }
+
+    return c_idx;
+}
+
+int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *_dst, int coeffs)
+{
+    int i, b, c_idx = 0;
+    int16_t *dst = (int16_t *)_dst;
+    DiracGolombLUT *future[4], *l = &lut_ctx[2*LUT_SIZE + buf[0]];
+    INIT_RESIDUE(res);
+
+    for (b = 1; b <= bytes; b++) {
+        future[0] = &lut_ctx[buf[b]];
+        future[1] = future[0] + 1*LUT_SIZE;
+        future[2] = future[0] + 2*LUT_SIZE;
+        future[3] = future[0] + 3*LUT_SIZE;
+
+        if ((c_idx + 1) > coeffs)
+            return c_idx;
+
+        if (res_bits && l->sign) {
+            int32_t coeff = 1;
+            APPEND_RESIDUE(res, l->preamble);
+            for (i = 0; i < (res_bits >> 1) - 1; i++) {
+                coeff <<= 1;
+                coeff |= (res >> (RSIZE_BITS - 2*i - 2)) & 1;
+            }
+            dst[c_idx++] = l->sign * (coeff - 1);
+            res_bits = res = 0;
+        }
+
+        for (i = 0; i < LUT_BITS; i++)
+            dst[c_idx + i] = l->ready[i];
+        c_idx += l->ready_num;
+
+        APPEND_RESIDUE(res, l->leftover);
+
+        l = future[l->need_s ? 3 : !res_bits ? 2 : res_bits & 1];
+    }
+
+    return c_idx;
+}
+
+/* Searches for golomb codes in a residue */
+static inline void search_for_golomb(DiracGolombLUT *l, residual r, int bits)
+{
+    int r_count = RSIZE_BITS - 1;
+    int bits_start, bits_tot = bits, need_sign = 0;
+
+#define READ_BIT(N) (((N) >> (N ## _count--)) & 1)
+
+    while (1) {
+        int32_t coef = 1;
+        bits_start = (RSIZE_BITS - 1) - r_count;
+
+        while (1) {
+            if (!bits--)
+                goto leftover;
+            if (READ_BIT(r))
+                break;
+
+            coef <<= 1;
+
+            if (!bits--)
+                goto leftover;
+            coef |= READ_BIT(r);
+        }
+
+        l->ready[l->ready_num] = coef - 1;
+        if (l->ready[l->ready_num]) {
+            if (!bits--) {
+                need_sign = 1;
+                goto leftover;
+            }
+            l->ready[l->ready_num] *= READ_BIT(r) ? -1 : +1;
+        }
+        l->ready_num++;
+
+        if (!bits)
+            return;
+    }
+
+    leftover:
+        l->leftover      = r << bits_start;
+        l->leftover_bits = bits_tot - bits_start;
+        l->need_s        = need_sign;
+}
+
+/* Parity LUTs - even and odd bit end positions */
+static void generate_parity_lut(DiracGolombLUT *lut, int even)
+{
+    int idx;
+    for (idx = 0; idx < LUT_SIZE; idx++) {
+        DiracGolombLUT *l = &lut[idx];
+        int symbol_end_loc = -1;
+        uint32_t code;
+        int i;
+
+        INIT_RESIDUE(res);
+        SET_RESIDUE(res, idx, LUT_BITS);
+
+        for (i = 0; i < LUT_BITS; i++) {
+            const int cond = even ? (i & 1) : !(i & 1);
+            if (((res >> (RSIZE_BITS - i - 1)) & 1) && cond) {
+                symbol_end_loc = i + 2;
+                break;
+            }
+        }
+
+        if (symbol_end_loc < 0 || symbol_end_loc > LUT_BITS) {
+            l->preamble      = 0;
+            l->preamble_bits = 0;
+            l->leftover_bits = LUT_BITS;
+            l->leftover      = CONVERT_TO_RESIDUE(idx, l->leftover_bits);
+            if (even)
+                l->need_s    = idx & 1;
+            continue;
+        }
+
+        /* Gets bits 0 through to (symbol_end_loc - 1) inclusive */
+        code  = idx >> ((LUT_BITS - 1) - (symbol_end_loc - 1));
+        code &= ((1 << LUT_BITS) - 1) >> (LUT_BITS - symbol_end_loc);
+        l->preamble_bits = symbol_end_loc;
+        l->preamble      = CONVERT_TO_RESIDUE(code, l->preamble_bits);
+        l->sign = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? -1 : +1;
+
+        search_for_golomb(l, res << symbol_end_loc, LUT_BITS - symbol_end_loc);
+    }
+}
+
+/* Reset (off == 0) and needs-one-more-bit (off == 1) LUTs */
+static void generate_offset_lut(DiracGolombLUT *lut, int off)
+{
+    int idx;
+    for (idx = 0; idx < LUT_SIZE; idx++) {
+        DiracGolombLUT *l = &lut[idx];
+
+        INIT_RESIDUE(res);
+        SET_RESIDUE(res, idx, LUT_BITS);
+
+        l->preamble_bits = off;
+        if (off) {
+            l->preamble  = CONVERT_TO_RESIDUE(res >> (RSIZE_BITS - off), off);
+            l->sign      = ((l->preamble >> (RSIZE_BITS - l->preamble_bits)) & 1) ? -1 : +1;
+        } else {
+            l->preamble  = 0;
+            l->sign = 1;
+        }
+
+        search_for_golomb(l, res << off, LUT_BITS - off);
+    }
+}
+
+av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx)
+{
+    DiracGolombLUT *lut;
+
+    if (!(lut = av_calloc(4*LUT_SIZE, sizeof(DiracGolombLUT))))
+        return AVERROR(ENOMEM);
+
+    generate_parity_lut(&lut[0*LUT_SIZE], 0);
+    generate_parity_lut(&lut[1*LUT_SIZE], 1);
+    generate_offset_lut(&lut[2*LUT_SIZE], 0);
+    generate_offset_lut(&lut[3*LUT_SIZE], 1);
+
+    *lut_ctx = lut;
+
+    return 0;
+}
+
+av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx)
+{
+    av_freep(lut_ctx);
+}
diff --git a/libavcodec/dirac_vlc.h b/libavcodec/dirac_vlc.h
new file mode 100644
index 0000000..42ae41b
--- /dev/null
+++ b/libavcodec/dirac_vlc.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <rpehlivanov@obe.tv>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRAC_VLC_H
+#define AVCODEC_DIRAC_VLC_H
+
+#include "libavutil/avutil.h"
+
+/* Can be 32 bits wide for some performance gain on some machines, but it will
+ * incorrectly decode very long coefficients (usually only 1 or 2 per frame) */
+typedef uint64_t residual;
+
+#define LUT_BITS 8
+
+/* Exactly 64 bytes */
+typedef struct DiracGolombLUT {
+    residual preamble, leftover;
+    int32_t  ready[LUT_BITS];
+    int32_t  preamble_bits, leftover_bits, ready_num;
+    int8_t   need_s, sign;
+} DiracGolombLUT;
+
+av_cold int ff_dirac_golomb_reader_init(DiracGolombLUT **lut_ctx);
+
+int ff_dirac_golomb_read_32bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *dst, int coeffs);
+
+int ff_dirac_golomb_read_16bit(DiracGolombLUT *lut_ctx, const uint8_t *buf,
+                               int bytes, uint8_t *_dst, int coeffs);
+
+av_cold void ff_dirac_golomb_reader_end(DiracGolombLUT **lut_ctx);
+
+#endif /* AVCODEC_DIRAC_VLC_H */
diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
new file mode 100644
index 0000000..30b4bfa
--- /dev/null
+++ b/libavcodec/diracdec.c
@@ -0,0 +1,2363 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Dirac Decoder
+ * @author Marco Gerards <marco@gnu.org>, David Conrad, Jordi Ortiz <nenjordi@gmail.com>
+ */
+
+#include "libavutil/pixdesc.h"
+#include "libavutil/thread.h"
+#include "avcodec.h"
+#include "get_bits.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "golomb.h"
+#include "dirac_arith.h"
+#include "dirac_vlc.h"
+#include "mpeg12data.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideoencdsp.h"
+#include "dirac_dwt.h"
+#include "dirac.h"
+#include "diractab.h"
+#include "diracdsp.h"
+#include "videodsp.h"
+
+/**
+ * The spec limits this to 3 for frame coding, but in practice can be as high as 6
+ */
+#define MAX_REFERENCE_FRAMES 8
+#define MAX_DELAY 5         /* limit for main profile for frame coding (TODO: field coding) */
+#define MAX_FRAMES (MAX_REFERENCE_FRAMES + MAX_DELAY + 1)
+#define MAX_QUANT 255        /* max quant for VC-2 */
+#define MAX_BLOCKSIZE 32    /* maximum xblen/yblen we support */
+
+/**
+ * DiracBlock->ref flags, if set then the block does MC from the given ref
+ */
+#define DIRAC_REF_MASK_REF1   1
+#define DIRAC_REF_MASK_REF2   2
+#define DIRAC_REF_MASK_GLOBAL 4
+
+/**
+ * Value of Picture.reference when Picture is not a reference picture, but
+ * is held for delayed output.
+ */
+#define DELAYED_PIC_REF 4
+
+#define CALC_PADDING(size, depth)                       \
+    (((size + (1 << depth) - 1) >> depth) << depth)
+
+#define DIVRNDUP(a, b) (((a) + (b) - 1) / (b))
+
+typedef struct {
+    AVFrame *avframe;
+    int interpolated[3];    /* 1 if hpel[] is valid */
+    uint8_t *hpel[3][4];
+    uint8_t *hpel_base[3][4];
+    int reference;
+} DiracFrame;
+
+typedef struct {
+    union {
+        int16_t mv[2][2];
+        int16_t dc[3];
+    } u; /* anonymous unions aren't in C99 :( */
+    uint8_t ref;
+} DiracBlock;
+
+typedef struct SubBand {
+    int level;
+    int orientation;
+    int stride; /* in bytes */
+    int width;
+    int height;
+    int pshift;
+    int quant;
+    uint8_t *ibuf;
+    struct SubBand *parent;
+
+    /* for low delay */
+    unsigned length;
+    const uint8_t *coeff_data;
+} SubBand;
+
+typedef struct Plane {
+    DWTPlane idwt;
+
+    int width;
+    int height;
+    ptrdiff_t stride;
+
+    /* block length */
+    uint8_t xblen;
+    uint8_t yblen;
+    /* block separation (block n+1 starts after this many pixels in block n) */
+    uint8_t xbsep;
+    uint8_t ybsep;
+    /* amount of overspill on each edge (half of the overlap between blocks) */
+    uint8_t xoffset;
+    uint8_t yoffset;
+
+    SubBand band[MAX_DWT_LEVELS][4];
+} Plane;
+
+/* Used by Low Delay and High Quality profiles */
+typedef struct DiracSlice {
+    GetBitContext gb;
+    int slice_x;
+    int slice_y;
+    int bytes;
+} DiracSlice;
+
+typedef struct DiracContext {
+    AVCodecContext *avctx;
+    MpegvideoEncDSPContext mpvencdsp;
+    VideoDSPContext vdsp;
+    DiracDSPContext diracdsp;
+    DiracGolombLUT *reader_ctx;
+    DiracVersionInfo version;
+    GetBitContext gb;
+    AVDiracSeqHeader seq;
+    int seen_sequence_header;
+    int64_t frame_number;       /* number of the next frame to display       */
+    Plane plane[3];
+    int chroma_x_shift;
+    int chroma_y_shift;
+
+    int bit_depth;              /* bit depth                                 */
+    int pshift;                 /* pixel shift = bit_depth > 8               */
+
+    int zero_res;               /* zero residue flag                         */
+    int is_arith;               /* whether coeffs use arith or golomb coding */
+    int core_syntax;            /* use core syntax only                      */
+    int low_delay;              /* use the low delay syntax                  */
+    int hq_picture;             /* high quality picture, enables low_delay   */
+    int ld_picture;             /* use low delay picture, turns on low_delay */
+    int dc_prediction;          /* has dc prediction                         */
+    int globalmc_flag;          /* use global motion compensation            */
+    int num_refs;               /* number of reference pictures              */
+
+    /* wavelet decoding */
+    unsigned wavelet_depth;     /* depth of the IDWT                         */
+    unsigned wavelet_idx;
+
+    /**
+     * schroedinger older than 1.0.8 doesn't store
+     * quant delta if only one codebook exists in a band
+     */
+    unsigned old_delta_quant;
+    unsigned codeblock_mode;
+
+    unsigned num_x;              /* number of horizontal slices               */
+    unsigned num_y;              /* number of vertical slices                 */
+
+    uint8_t *thread_buf;         /* Per-thread buffer for coefficient storage */
+    int threads_num_buf;         /* Current # of buffers allocated            */
+    int thread_buf_size;         /* Each thread has a buffer this size        */
+
+    DiracSlice *slice_params_buf;
+    int slice_params_num_buf;
+
+    struct {
+        unsigned width;
+        unsigned height;
+    } codeblock[MAX_DWT_LEVELS+1];
+
+    struct {
+        AVRational bytes;       /* average bytes per slice                   */
+        uint8_t quant[MAX_DWT_LEVELS][4]; /* [DIRAC_STD] E.1 */
+    } lowdelay;
+
+    struct {
+        unsigned prefix_bytes;
+        uint64_t size_scaler;
+    } highquality;
+
+    struct {
+        int pan_tilt[2];        /* pan/tilt vector                           */
+        int zrs[2][2];          /* zoom/rotate/shear matrix                  */
+        int perspective[2];     /* perspective vector                        */
+        unsigned zrs_exp;
+        unsigned perspective_exp;
+    } globalmc[2];
+
+    /* motion compensation */
+    uint8_t mv_precision;       /* [DIRAC_STD] REFS_WT_PRECISION             */
+    int16_t weight[2];          /* [DIRAC_STD] REF1_WT and REF2_WT           */
+    unsigned weight_log2denom;  /* [DIRAC_STD] REFS_WT_PRECISION             */
+
+    int blwidth;                /* number of blocks (horizontally)           */
+    int blheight;               /* number of blocks (vertically)             */
+    int sbwidth;                /* number of superblocks (horizontally)      */
+    int sbheight;               /* number of superblocks (vertically)        */
+
+    uint8_t *sbsplit;
+    DiracBlock *blmotion;
+
+    uint8_t *edge_emu_buffer[4];
+    uint8_t *edge_emu_buffer_base;
+
+    uint16_t *mctmp;            /* buffer holding the MC data multiplied by OBMC weights */
+    uint8_t *mcscratch;
+    int buffer_stride;
+
+    DECLARE_ALIGNED(16, uint8_t, obmc_weight)[3][MAX_BLOCKSIZE*MAX_BLOCKSIZE];
+
+    void (*put_pixels_tab[4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*avg_pixels_tab[4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*add_obmc)(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+    dirac_weight_func weight_func;
+    dirac_biweight_func biweight_func;
+
+    DiracFrame *current_picture;
+    DiracFrame *ref_pics[2];
+
+    DiracFrame *ref_frames[MAX_REFERENCE_FRAMES+1];
+    DiracFrame *delay_frames[MAX_DELAY+1];
+    DiracFrame all_frames[MAX_FRAMES];
+} DiracContext;
+
+enum dirac_subband {
+    subband_ll = 0,
+    subband_hl = 1,
+    subband_lh = 2,
+    subband_hh = 3,
+    subband_nb,
+};
+
+/* magic number division by 3 from schroedinger */
+static inline int divide3(int x)
+{
+    return (int)((x+1U)*21845 + 10922) >> 16;
+}
+
+static DiracFrame *remove_frame(DiracFrame *framelist[], int picnum)
+{
+    DiracFrame *remove_pic = NULL;
+    int i, remove_idx = -1;
+
+    for (i = 0; framelist[i]; i++)
+        if (framelist[i]->avframe->display_picture_number == picnum) {
+            remove_pic = framelist[i];
+            remove_idx = i;
+        }
+
+    if (remove_pic)
+        for (i = remove_idx; framelist[i]; i++)
+            framelist[i] = framelist[i+1];
+
+    return remove_pic;
+}
+
+static int add_frame(DiracFrame *framelist[], int maxframes, DiracFrame *frame)
+{
+    int i;
+    for (i = 0; i < maxframes; i++)
+        if (!framelist[i]) {
+            framelist[i] = frame;
+            return 0;
+        }
+    return -1;
+}
+
+static int alloc_sequence_buffers(DiracContext *s)
+{
+    int sbwidth  = DIVRNDUP(s->seq.width,  4);
+    int sbheight = DIVRNDUP(s->seq.height, 4);
+    int i, w, h, top_padding;
+
+    /* todo: think more about this / use or set Plane here */
+    for (i = 0; i < 3; i++) {
+        int max_xblen = MAX_BLOCKSIZE >> (i ? s->chroma_x_shift : 0);
+        int max_yblen = MAX_BLOCKSIZE >> (i ? s->chroma_y_shift : 0);
+        w = s->seq.width  >> (i ? s->chroma_x_shift : 0);
+        h = s->seq.height >> (i ? s->chroma_y_shift : 0);
+
+        /* we allocate the max we support here since num decompositions can
+         * change from frame to frame. Stride is aligned to 16 for SIMD, and
+         * 1<<MAX_DWT_LEVELS top padding to avoid if(y>0) in arith decoding
+         * MAX_BLOCKSIZE padding for MC: blocks can spill up to half of that
+         * on each side */
+        top_padding = FFMAX(1<<MAX_DWT_LEVELS, max_yblen/2);
+        w = FFALIGN(CALC_PADDING(w, MAX_DWT_LEVELS), 8); /* FIXME: Should this be 16 for SSE??? */
+        h = top_padding + CALC_PADDING(h, MAX_DWT_LEVELS) + max_yblen/2;
+
+        s->plane[i].idwt.buf_base = av_mallocz_array((w+max_xblen), h * (2 << s->pshift));
+        s->plane[i].idwt.tmp      = av_malloc_array((w+16), 2 << s->pshift);
+        s->plane[i].idwt.buf      = s->plane[i].idwt.buf_base + (top_padding*w)*(2 << s->pshift);
+        if (!s->plane[i].idwt.buf_base || !s->plane[i].idwt.tmp)
+            return AVERROR(ENOMEM);
+    }
+
+    /* fixme: allocate using real stride here */
+    s->sbsplit  = av_malloc_array(sbwidth, sbheight);
+    s->blmotion = av_malloc_array(sbwidth, sbheight * 16 * sizeof(*s->blmotion));
+
+    if (!s->sbsplit || !s->blmotion)
+        return AVERROR(ENOMEM);
+    return 0;
+}
+
+static int alloc_buffers(DiracContext *s, int stride)
+{
+    int w = s->seq.width;
+    int h = s->seq.height;
+
+    av_assert0(stride >= w);
+    stride += 64;
+
+    if (s->buffer_stride >= stride)
+        return 0;
+    s->buffer_stride = 0;
+
+    av_freep(&s->edge_emu_buffer_base);
+    memset(s->edge_emu_buffer, 0, sizeof(s->edge_emu_buffer));
+    av_freep(&s->mctmp);
+    av_freep(&s->mcscratch);
+
+    s->edge_emu_buffer_base = av_malloc_array(stride, MAX_BLOCKSIZE);
+
+    s->mctmp     = av_malloc_array((stride+MAX_BLOCKSIZE), (h+MAX_BLOCKSIZE) * sizeof(*s->mctmp));
+    s->mcscratch = av_malloc_array(stride, MAX_BLOCKSIZE);
+
+    if (!s->edge_emu_buffer_base || !s->mctmp || !s->mcscratch)
+        return AVERROR(ENOMEM);
+
+    s->buffer_stride = stride;
+    return 0;
+}
+
+static void free_sequence_buffers(DiracContext *s)
+{
+    int i, j, k;
+
+    for (i = 0; i < MAX_FRAMES; i++) {
+        if (s->all_frames[i].avframe->data[0]) {
+            av_frame_unref(s->all_frames[i].avframe);
+            memset(s->all_frames[i].interpolated, 0, sizeof(s->all_frames[i].interpolated));
+        }
+
+        for (j = 0; j < 3; j++)
+            for (k = 1; k < 4; k++)
+                av_freep(&s->all_frames[i].hpel_base[j][k]);
+    }
+
+    memset(s->ref_frames, 0, sizeof(s->ref_frames));
+    memset(s->delay_frames, 0, sizeof(s->delay_frames));
+
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->plane[i].idwt.buf_base);
+        av_freep(&s->plane[i].idwt.tmp);
+    }
+
+    s->buffer_stride = 0;
+    av_freep(&s->sbsplit);
+    av_freep(&s->blmotion);
+    av_freep(&s->edge_emu_buffer_base);
+
+    av_freep(&s->mctmp);
+    av_freep(&s->mcscratch);
+}
+
+static AVOnce dirac_arith_init = AV_ONCE_INIT;
+
+static av_cold int dirac_decode_init(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    int i, ret;
+
+    s->avctx = avctx;
+    s->frame_number = -1;
+
+    s->thread_buf = NULL;
+    s->threads_num_buf = -1;
+    s->thread_buf_size = -1;
+
+    ff_dirac_golomb_reader_init(&s->reader_ctx);
+    ff_diracdsp_init(&s->diracdsp);
+    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+    ff_videodsp_init(&s->vdsp, 8);
+
+    for (i = 0; i < MAX_FRAMES; i++) {
+        s->all_frames[i].avframe = av_frame_alloc();
+        if (!s->all_frames[i].avframe) {
+            while (i > 0)
+                av_frame_free(&s->all_frames[--i].avframe);
+            return AVERROR(ENOMEM);
+        }
+    }
+    ret = ff_thread_once(&dirac_arith_init, ff_dirac_init_arith_tables);
+    if (ret != 0)
+        return AVERROR_UNKNOWN;
+
+    return 0;
+}
+
+static void dirac_decode_flush(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    free_sequence_buffers(s);
+    s->seen_sequence_header = 0;
+    s->frame_number = -1;
+}
+
+static av_cold int dirac_decode_end(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    int i;
+
+    ff_dirac_golomb_reader_end(&s->reader_ctx);
+
+    dirac_decode_flush(avctx);
+    for (i = 0; i < MAX_FRAMES; i++)
+        av_frame_free(&s->all_frames[i].avframe);
+
+    av_freep(&s->thread_buf);
+    av_freep(&s->slice_params_buf);
+
+    return 0;
+}
+
+static inline int coeff_unpack_golomb(GetBitContext *gb, int qfactor, int qoffset)
+{
+    int coeff = dirac_get_se_golomb(gb);
+    const unsigned sign = FFSIGN(coeff);
+    if (coeff)
+        coeff = sign*((sign * coeff * qfactor + qoffset) >> 2);
+    return coeff;
+}
+
+#define SIGN_CTX(x) (CTX_SIGN_ZERO + ((x) > 0) - ((x) < 0))
+
+#define UNPACK_ARITH(n, type) \
+    static inline void coeff_unpack_arith_##n(DiracArith *c, int qfactor, int qoffset, \
+                                              SubBand *b, type *buf, int x, int y) \
+    { \
+        int sign, sign_pred = 0, pred_ctx = CTX_ZPZN_F1; \
+        unsigned coeff; \
+        const int mstride = -(b->stride >> (1+b->pshift)); \
+        if (b->parent) { \
+            const type *pbuf = (type *)b->parent->ibuf; \
+            const int stride = b->parent->stride >> (1+b->parent->pshift); \
+            pred_ctx += !!pbuf[stride * (y>>1) + (x>>1)] << 1; \
+        } \
+        if (b->orientation == subband_hl) \
+            sign_pred = buf[mstride]; \
+        if (x) { \
+            pred_ctx += !(buf[-1] | buf[mstride] | buf[-1 + mstride]); \
+            if (b->orientation == subband_lh) \
+                sign_pred = buf[-1]; \
+        } else { \
+            pred_ctx += !buf[mstride]; \
+        } \
+        coeff = dirac_get_arith_uint(c, pred_ctx, CTX_COEFF_DATA); \
+        if (coeff) { \
+            coeff = (coeff * qfactor + qoffset) >> 2; \
+            sign  = dirac_get_arith_bit(c, SIGN_CTX(sign_pred)); \
+            coeff = (coeff ^ -sign) + sign; \
+        } \
+        *buf = coeff; \
+    } \
+
+UNPACK_ARITH(8, int16_t)
+UNPACK_ARITH(10, int32_t)
+
+/**
+ * Decode the coeffs in the rectangle defined by left, right, top, bottom
+ * [DIRAC_STD] 13.4.3.2 Codeblock unpacking loop. codeblock()
+ */
+static inline int codeblock(DiracContext *s, SubBand *b,
+                             GetBitContext *gb, DiracArith *c,
+                             int left, int right, int top, int bottom,
+                             int blockcnt_one, int is_arith)
+{
+    int x, y, zero_block;
+    int qoffset, qfactor;
+    uint8_t *buf;
+
+    /* check for any coded coefficients in this codeblock */
+    if (!blockcnt_one) {
+        if (is_arith)
+            zero_block = dirac_get_arith_bit(c, CTX_ZERO_BLOCK);
+        else
+            zero_block = get_bits1(gb);
+
+        if (zero_block)
+            return 0;
+    }
+
+    if (s->codeblock_mode && !(s->old_delta_quant && blockcnt_one)) {
+        int quant;
+        if (is_arith)
+            quant = dirac_get_arith_int(c, CTX_DELTA_Q_F, CTX_DELTA_Q_DATA);
+        else
+            quant = dirac_get_se_golomb(gb);
+        if (quant > INT_MAX - b->quant || b->quant + quant < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid quant\n");
+            return AVERROR_INVALIDDATA;
+        }
+        b->quant += quant;
+    }
+
+    if (b->quant > (DIRAC_MAX_QUANT_INDEX - 1)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported quant %d\n", b->quant);
+        b->quant = 0;
+        return AVERROR_INVALIDDATA;
+    }
+
+    qfactor = ff_dirac_qscale_tab[b->quant];
+    /* TODO: context pointer? */
+    if (!s->num_refs)
+        qoffset = ff_dirac_qoffset_intra_tab[b->quant] + 2;
+    else
+        qoffset = ff_dirac_qoffset_inter_tab[b->quant] + 2;
+
+    buf = b->ibuf + top * b->stride;
+    if (is_arith) {
+        for (y = top; y < bottom; y++) {
+            if (c->error)
+                return c->error;
+            for (x = left; x < right; x++) {
+                if (b->pshift) {
+                    coeff_unpack_arith_10(c, qfactor, qoffset, b, (int32_t*)(buf)+x, x, y);
+                } else {
+                    coeff_unpack_arith_8(c, qfactor, qoffset, b, (int16_t*)(buf)+x, x, y);
+                }
+            }
+            buf += b->stride;
+        }
+    } else {
+        for (y = top; y < bottom; y++) {
+            if (get_bits_left(gb) < 1)
+                return AVERROR_INVALIDDATA;
+            for (x = left; x < right; x++) {
+                int val = coeff_unpack_golomb(gb, qfactor, qoffset);
+                if (b->pshift) {
+                    AV_WN32(&buf[4*x], val);
+                } else {
+                    AV_WN16(&buf[2*x], val);
+                }
+            }
+            buf += b->stride;
+         }
+     }
+     return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 13.3 intra_dc_prediction(band)
+ */
+#define INTRA_DC_PRED(n, type) \
+    static inline void intra_dc_prediction_##n(SubBand *b) \
+    { \
+        type *buf = (type*)b->ibuf; \
+        int x, y; \
+        \
+        for (x = 1; x < b->width; x++) \
+            buf[x] += buf[x-1]; \
+        buf += (b->stride >> (1+b->pshift)); \
+        \
+        for (y = 1; y < b->height; y++) { \
+            buf[0] += buf[-(b->stride >> (1+b->pshift))]; \
+            \
+            for (x = 1; x < b->width; x++) { \
+                int pred = buf[x - 1] + buf[x - (b->stride >> (1+b->pshift))] + buf[x - (b->stride >> (1+b->pshift))-1]; \
+                buf[x]  += divide3(pred); \
+            } \
+            buf += (b->stride >> (1+b->pshift)); \
+        } \
+    } \
+
+INTRA_DC_PRED(8, int16_t)
+INTRA_DC_PRED(10, uint32_t)
+
+/**
+ * Dirac Specification ->
+ * 13.4.2 Non-skipped subbands.  subband_coeffs()
+ */
+static av_always_inline int decode_subband_internal(DiracContext *s, SubBand *b, int is_arith)
+{
+    int cb_x, cb_y, left, right, top, bottom;
+    DiracArith c;
+    GetBitContext gb;
+    int cb_width  = s->codeblock[b->level + (b->orientation != subband_ll)].width;
+    int cb_height = s->codeblock[b->level + (b->orientation != subband_ll)].height;
+    int blockcnt_one = (cb_width + cb_height) == 2;
+    int ret;
+
+    if (!b->length)
+        return 0;
+
+    init_get_bits8(&gb, b->coeff_data, b->length);
+
+    if (is_arith)
+        ff_dirac_init_arith_decoder(&c, &gb, b->length);
+
+    top = 0;
+    for (cb_y = 0; cb_y < cb_height; cb_y++) {
+        bottom = (b->height * (cb_y+1LL)) / cb_height;
+        left = 0;
+        for (cb_x = 0; cb_x < cb_width; cb_x++) {
+            right = (b->width * (cb_x+1LL)) / cb_width;
+            ret = codeblock(s, b, &gb, &c, left, right, top, bottom, blockcnt_one, is_arith);
+            if (ret < 0)
+                return ret;
+            left = right;
+        }
+        top = bottom;
+    }
+
+    if (b->orientation == subband_ll && s->num_refs == 0) {
+        if (s->pshift) {
+            intra_dc_prediction_10(b);
+        } else {
+            intra_dc_prediction_8(b);
+        }
+    }
+    return 0;
+}
+
+static int decode_subband_arith(AVCodecContext *avctx, void *b)
+{
+    DiracContext *s = avctx->priv_data;
+    return decode_subband_internal(s, b, 1);
+}
+
+static int decode_subband_golomb(AVCodecContext *avctx, void *arg)
+{
+    DiracContext *s = avctx->priv_data;
+    SubBand **b     = arg;
+    return decode_subband_internal(s, *b, 0);
+}
+
+/**
+ * Dirac Specification ->
+ * [DIRAC_STD] 13.4.1 core_transform_data()
+ */
+static int decode_component(DiracContext *s, int comp)
+{
+    AVCodecContext *avctx = s->avctx;
+    SubBand *bands[3*MAX_DWT_LEVELS+1];
+    enum dirac_subband orientation;
+    int level, num_bands = 0;
+    int ret[3*MAX_DWT_LEVELS+1];
+    int i;
+    int damaged_count = 0;
+
+    /* Unpack all subbands at all levels. */
+    for (level = 0; level < s->wavelet_depth; level++) {
+        for (orientation = !!level; orientation < 4; orientation++) {
+            SubBand *b = &s->plane[comp].band[level][orientation];
+            bands[num_bands++] = b;
+
+            align_get_bits(&s->gb);
+            /* [DIRAC_STD] 13.4.2 subband() */
+            b->length = get_interleaved_ue_golomb(&s->gb);
+            if (b->length) {
+                b->quant = get_interleaved_ue_golomb(&s->gb);
+                if (b->quant > (DIRAC_MAX_QUANT_INDEX - 1)) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Unsupported quant %d\n", b->quant);
+                    b->quant = 0;
+                    return AVERROR_INVALIDDATA;
+                }
+                align_get_bits(&s->gb);
+                b->coeff_data = s->gb.buffer + get_bits_count(&s->gb)/8;
+                b->length = FFMIN(b->length, FFMAX(get_bits_left(&s->gb)/8, 0));
+                skip_bits_long(&s->gb, b->length*8);
+            }
+        }
+        /* arithmetic coding has inter-level dependencies, so we can only execute one level at a time */
+        if (s->is_arith)
+            avctx->execute(avctx, decode_subband_arith, &s->plane[comp].band[level][!!level],
+                           ret + 3*level + !!level, 4-!!level, sizeof(SubBand));
+    }
+    /* golomb coding has no inter-level dependencies, so we can execute all subbands in parallel */
+    if (!s->is_arith)
+        avctx->execute(avctx, decode_subband_golomb, bands, ret, num_bands, sizeof(SubBand*));
+
+    for (i = 0; i < s->wavelet_depth * 3 + 1; i++) {
+        if (ret[i] < 0)
+            damaged_count++;
+    }
+    if (damaged_count > (s->wavelet_depth * 3 + 1) /2)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+#define PARSE_VALUES(type, x, gb, ebits, buf1, buf2) \
+    type *buf = (type *)buf1; \
+    buf[x] = coeff_unpack_golomb(gb, qfactor, qoffset); \
+    if (get_bits_count(gb) >= ebits) \
+        return; \
+    if (buf2) { \
+        buf = (type *)buf2; \
+        buf[x] = coeff_unpack_golomb(gb, qfactor, qoffset); \
+        if (get_bits_count(gb) >= ebits) \
+            return; \
+    } \
+
+static void decode_subband(DiracContext *s, GetBitContext *gb, int quant,
+                           int slice_x, int slice_y, int bits_end,
+                           SubBand *b1, SubBand *b2)
+{
+    int left   = b1->width  * slice_x    / s->num_x;
+    int right  = b1->width  *(slice_x+1) / s->num_x;
+    int top    = b1->height * slice_y    / s->num_y;
+    int bottom = b1->height *(slice_y+1) / s->num_y;
+
+    int qfactor, qoffset;
+
+    uint8_t *buf1 =      b1->ibuf + top * b1->stride;
+    uint8_t *buf2 = b2 ? b2->ibuf + top * b2->stride: NULL;
+    int x, y;
+
+    if (quant > (DIRAC_MAX_QUANT_INDEX - 1)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported quant %d\n", quant);
+        return;
+    }
+    qfactor = ff_dirac_qscale_tab[quant];
+    qoffset = ff_dirac_qoffset_intra_tab[quant] + 2;
+    /* we have to constantly check for overread since the spec explicitly
+       requires this, with the meaning that all remaining coeffs are set to 0 */
+    if (get_bits_count(gb) >= bits_end)
+        return;
+
+    if (s->pshift) {
+        for (y = top; y < bottom; y++) {
+            for (x = left; x < right; x++) {
+                PARSE_VALUES(int32_t, x, gb, bits_end, buf1, buf2);
+            }
+            buf1 += b1->stride;
+            if (buf2)
+                buf2 += b2->stride;
+        }
+    }
+    else {
+        for (y = top; y < bottom; y++) {
+            for (x = left; x < right; x++) {
+                PARSE_VALUES(int16_t, x, gb, bits_end, buf1, buf2);
+            }
+            buf1 += b1->stride;
+            if (buf2)
+                buf2 += b2->stride;
+        }
+    }
+}
+
+/**
+ * Dirac Specification ->
+ * 13.5.2 Slices. slice(sx,sy)
+ */
+static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
+{
+    DiracContext *s = avctx->priv_data;
+    DiracSlice *slice = arg;
+    GetBitContext *gb = &slice->gb;
+    enum dirac_subband orientation;
+    int level, quant, chroma_bits, chroma_end;
+
+    int quant_base  = get_bits(gb, 7); /*[DIRAC_STD] qindex */
+    int length_bits = av_log2(8 * slice->bytes)+1;
+    int luma_bits   = get_bits_long(gb, length_bits);
+    int luma_end    = get_bits_count(gb) + FFMIN(luma_bits, get_bits_left(gb));
+
+    /* [DIRAC_STD] 13.5.5.2 luma_slice_band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++) {
+            quant = FFMAX(quant_base - s->lowdelay.quant[level][orientation], 0);
+            decode_subband(s, gb, quant, slice->slice_x, slice->slice_y, luma_end,
+                           &s->plane[0].band[level][orientation], NULL);
+        }
+
+    /* consume any unused bits from luma */
+    skip_bits_long(gb, get_bits_count(gb) - luma_end);
+
+    chroma_bits = 8*slice->bytes - 7 - length_bits - luma_bits;
+    chroma_end  = get_bits_count(gb) + FFMIN(chroma_bits, get_bits_left(gb));
+    /* [DIRAC_STD] 13.5.5.3 chroma_slice_band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++) {
+            quant = FFMAX(quant_base - s->lowdelay.quant[level][orientation], 0);
+            decode_subband(s, gb, quant, slice->slice_x, slice->slice_y, chroma_end,
+                           &s->plane[1].band[level][orientation],
+                           &s->plane[2].band[level][orientation]);
+        }
+
+    return 0;
+}
+
+typedef struct SliceCoeffs {
+    int left;
+    int top;
+    int tot_h;
+    int tot_v;
+    int tot;
+} SliceCoeffs;
+
+static int subband_coeffs(DiracContext *s, int x, int y, int p,
+                          SliceCoeffs c[MAX_DWT_LEVELS])
+{
+    int level, coef = 0;
+    for (level = 0; level < s->wavelet_depth; level++) {
+        SliceCoeffs *o = &c[level];
+        SubBand *b = &s->plane[p].band[level][3]; /* orientation doens't matter */
+        o->top   = b->height * y / s->num_y;
+        o->left  = b->width  * x / s->num_x;
+        o->tot_h = ((b->width  * (x + 1)) / s->num_x) - o->left;
+        o->tot_v = ((b->height * (y + 1)) / s->num_y) - o->top;
+        o->tot   = o->tot_h*o->tot_v;
+        coef    += o->tot * (4 - !!level);
+    }
+    return coef;
+}
+
+/**
+ * VC-2 Specification ->
+ * 13.5.3 hq_slice(sx,sy)
+ */
+static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
+{
+    int i, level, orientation, quant_idx;
+    int qfactor[MAX_DWT_LEVELS][4], qoffset[MAX_DWT_LEVELS][4];
+    GetBitContext *gb = &slice->gb;
+    SliceCoeffs coeffs_num[MAX_DWT_LEVELS];
+
+    skip_bits_long(gb, 8*s->highquality.prefix_bytes);
+    quant_idx = get_bits(gb, 8);
+
+    if (quant_idx > DIRAC_MAX_QUANT_INDEX - 1) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid quantization index - %i\n", quant_idx);
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Slice quantization (slice_quantizers() in the specs) */
+    for (level = 0; level < s->wavelet_depth; level++) {
+        for (orientation = !!level; orientation < 4; orientation++) {
+            const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
+            qfactor[level][orientation] = ff_dirac_qscale_tab[quant];
+            qoffset[level][orientation] = ff_dirac_qoffset_intra_tab[quant] + 2;
+        }
+    }
+
+    /* Luma + 2 Chroma planes */
+    for (i = 0; i < 3; i++) {
+        int coef_num, coef_par, off = 0;
+        int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
+        int64_t bits_end = get_bits_count(gb) + 8*length;
+        const uint8_t *addr = align_get_bits(gb);
+
+        if (length*8 > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
+
+        if (s->pshift)
+            coef_par = ff_dirac_golomb_read_32bit(s->reader_ctx, addr,
+                                                  length, tmp_buf, coef_num);
+        else
+            coef_par = ff_dirac_golomb_read_16bit(s->reader_ctx, addr,
+                                                  length, tmp_buf, coef_num);
+
+        if (coef_num > coef_par) {
+            const int start_b = coef_par * (1 << (s->pshift + 1));
+            const int end_b   = coef_num * (1 << (s->pshift + 1));
+            memset(&tmp_buf[start_b], 0, end_b - start_b);
+        }
+
+        for (level = 0; level < s->wavelet_depth; level++) {
+            const SliceCoeffs *c = &coeffs_num[level];
+            for (orientation = !!level; orientation < 4; orientation++) {
+                const SubBand *b1 = &s->plane[i].band[level][orientation];
+                uint8_t *buf = b1->ibuf + c->top * b1->stride + (c->left << (s->pshift + 1));
+
+                /* Change to c->tot_h <= 4 for AVX2 dequantization */
+                const int qfunc = s->pshift + 2*(c->tot_h <= 2);
+                s->diracdsp.dequant_subband[qfunc](&tmp_buf[off], buf, b1->stride,
+                                                   qfactor[level][orientation],
+                                                   qoffset[level][orientation],
+                                                   c->tot_v, c->tot_h);
+
+                off += c->tot << (s->pshift + 1);
+            }
+        }
+
+        skip_bits_long(gb, bits_end - get_bits_count(gb));
+    }
+
+    return 0;
+}
+
+static int decode_hq_slice_row(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    int i;
+    DiracContext *s = avctx->priv_data;
+    DiracSlice *slices = ((DiracSlice *)arg) + s->num_x*jobnr;
+    uint8_t *thread_buf = &s->thread_buf[s->thread_buf_size*threadnr];
+    for (i = 0; i < s->num_x; i++)
+        decode_hq_slice(s, &slices[i], thread_buf);
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 13.5.1 low_delay_transform_data()
+ */
+static int decode_lowdelay(DiracContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    int slice_x, slice_y, bufsize;
+    int64_t coef_buf_size, bytes = 0;
+    const uint8_t *buf;
+    DiracSlice *slices;
+    SliceCoeffs tmp[MAX_DWT_LEVELS];
+    int slice_num = 0;
+
+    if (s->slice_params_num_buf != (s->num_x * s->num_y)) {
+        s->slice_params_buf = av_realloc_f(s->slice_params_buf, s->num_x * s->num_y, sizeof(DiracSlice));
+        if (!s->slice_params_buf) {
+            av_log(s->avctx, AV_LOG_ERROR, "slice params buffer allocation failure\n");
+            s->slice_params_num_buf = 0;
+            return AVERROR(ENOMEM);
+        }
+        s->slice_params_num_buf = s->num_x * s->num_y;
+    }
+    slices = s->slice_params_buf;
+
+    /* 8 becacuse that's how much the golomb reader could overread junk data
+     * from another plane/slice at most, and 512 because SIMD */
+    coef_buf_size = subband_coeffs(s, s->num_x - 1, s->num_y - 1, 0, tmp) + 8;
+    coef_buf_size = (coef_buf_size << (1 + s->pshift)) + 512;
+
+    if (s->threads_num_buf != avctx->thread_count ||
+        s->thread_buf_size != coef_buf_size) {
+        s->threads_num_buf  = avctx->thread_count;
+        s->thread_buf_size  = coef_buf_size;
+        s->thread_buf       = av_realloc_f(s->thread_buf, avctx->thread_count, s->thread_buf_size);
+        if (!s->thread_buf) {
+            av_log(s->avctx, AV_LOG_ERROR, "thread buffer allocation failure\n");
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    align_get_bits(&s->gb);
+    /*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
+    buf = s->gb.buffer + get_bits_count(&s->gb)/8;
+    bufsize = get_bits_left(&s->gb);
+
+    if (s->hq_picture) {
+        int i;
+
+        for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {
+            for (slice_x = 0; bufsize > 0 && slice_x < s->num_x; slice_x++) {
+                bytes = s->highquality.prefix_bytes + 1;
+                for (i = 0; i < 3; i++) {
+                    if (bytes <= bufsize/8)
+                        bytes += buf[bytes] * s->highquality.size_scaler + 1;
+                }
+                if (bytes >= INT_MAX || bytes*8 > bufsize) {
+                    av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                slices[slice_num].bytes   = bytes;
+                slices[slice_num].slice_x = slice_x;
+                slices[slice_num].slice_y = slice_y;
+                init_get_bits(&slices[slice_num].gb, buf, bufsize);
+                slice_num++;
+
+                buf     += bytes;
+                if (bufsize/8 >= bytes)
+                    bufsize -= bytes*8;
+                else
+                    bufsize = 0;
+            }
+        }
+
+        if (s->num_x*s->num_y != slice_num) {
+            av_log(s->avctx, AV_LOG_ERROR, "too few slices\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        avctx->execute2(avctx, decode_hq_slice_row, slices, NULL, s->num_y);
+    } else {
+        for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {
+            for (slice_x = 0; bufsize > 0 && slice_x < s->num_x; slice_x++) {
+                bytes = (slice_num+1) * (int64_t)s->lowdelay.bytes.num / s->lowdelay.bytes.den
+                       - slice_num    * (int64_t)s->lowdelay.bytes.num / s->lowdelay.bytes.den;
+                if (bytes >= INT_MAX || bytes*8 > bufsize) {
+                    av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                slices[slice_num].bytes   = bytes;
+                slices[slice_num].slice_x = slice_x;
+                slices[slice_num].slice_y = slice_y;
+                init_get_bits(&slices[slice_num].gb, buf, bufsize);
+                slice_num++;
+
+                buf     += bytes;
+                if (bufsize/8 >= bytes)
+                    bufsize -= bytes*8;
+                else
+                    bufsize = 0;
+            }
+        }
+        avctx->execute(avctx, decode_lowdelay_slice, slices, NULL, slice_num,
+                       sizeof(DiracSlice)); /* [DIRAC_STD] 13.5.2 Slices */
+    }
+
+    if (s->dc_prediction) {
+        if (s->pshift) {
+            intra_dc_prediction_10(&s->plane[0].band[0][0]); /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+            intra_dc_prediction_10(&s->plane[1].band[0][0]); /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+            intra_dc_prediction_10(&s->plane[2].band[0][0]); /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+        } else {
+            intra_dc_prediction_8(&s->plane[0].band[0][0]);
+            intra_dc_prediction_8(&s->plane[1].band[0][0]);
+            intra_dc_prediction_8(&s->plane[2].band[0][0]);
+        }
+    }
+
+    return 0;
+}
+
+static void init_planes(DiracContext *s)
+{
+    int i, w, h, level, orientation;
+
+    for (i = 0; i < 3; i++) {
+        Plane *p = &s->plane[i];
+
+        p->width       = s->seq.width  >> (i ? s->chroma_x_shift : 0);
+        p->height      = s->seq.height >> (i ? s->chroma_y_shift : 0);
+        p->idwt.width  = w = CALC_PADDING(p->width , s->wavelet_depth);
+        p->idwt.height = h = CALC_PADDING(p->height, s->wavelet_depth);
+        p->idwt.stride = FFALIGN(p->idwt.width, 8) << (1 + s->pshift);
+
+        for (level = s->wavelet_depth-1; level >= 0; level--) {
+            w = w>>1;
+            h = h>>1;
+            for (orientation = !!level; orientation < 4; orientation++) {
+                SubBand *b = &p->band[level][orientation];
+
+                b->pshift = s->pshift;
+                b->ibuf   = p->idwt.buf;
+                b->level  = level;
+                b->stride = p->idwt.stride << (s->wavelet_depth - level);
+                b->width  = w;
+                b->height = h;
+                b->orientation = orientation;
+
+                if (orientation & 1)
+                    b->ibuf += w << (1+b->pshift);
+                if (orientation > 1)
+                    b->ibuf += (b->stride>>1);
+
+                if (level)
+                    b->parent = &p->band[level-1][orientation];
+            }
+        }
+
+        if (i > 0) {
+            p->xblen = s->plane[0].xblen >> s->chroma_x_shift;
+            p->yblen = s->plane[0].yblen >> s->chroma_y_shift;
+            p->xbsep = s->plane[0].xbsep >> s->chroma_x_shift;
+            p->ybsep = s->plane[0].ybsep >> s->chroma_y_shift;
+        }
+
+        p->xoffset = (p->xblen - p->xbsep)/2;
+        p->yoffset = (p->yblen - p->ybsep)/2;
+    }
+}
+
+/**
+ * Unpack the motion compensation parameters
+ * Dirac Specification ->
+ * 11.2 Picture prediction data. picture_prediction()
+ */
+static int dirac_unpack_prediction_parameters(DiracContext *s)
+{
+    static const uint8_t default_blen[] = { 4, 12, 16, 24 };
+
+    GetBitContext *gb = &s->gb;
+    unsigned idx, ref;
+
+    align_get_bits(gb);
+    /* [DIRAC_STD] 11.2.2 Block parameters. block_parameters() */
+    /* Luma and Chroma are equal. 11.2.3 */
+    idx = get_interleaved_ue_golomb(gb); /* [DIRAC_STD] index */
+
+    if (idx > 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block prediction index too high\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (idx == 0) {
+        s->plane[0].xblen = get_interleaved_ue_golomb(gb);
+        s->plane[0].yblen = get_interleaved_ue_golomb(gb);
+        s->plane[0].xbsep = get_interleaved_ue_golomb(gb);
+        s->plane[0].ybsep = get_interleaved_ue_golomb(gb);
+    } else {
+        /*[DIRAC_STD] preset_block_params(index). Table 11.1 */
+        s->plane[0].xblen = default_blen[idx-1];
+        s->plane[0].yblen = default_blen[idx-1];
+        s->plane[0].xbsep = 4 * idx;
+        s->plane[0].ybsep = 4 * idx;
+    }
+    /*[DIRAC_STD] 11.2.4 motion_data_dimensions()
+      Calculated in function dirac_unpack_block_motion_data */
+
+    if (s->plane[0].xblen % (1 << s->chroma_x_shift) != 0 ||
+        s->plane[0].yblen % (1 << s->chroma_y_shift) != 0 ||
+        !s->plane[0].xblen || !s->plane[0].yblen) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "invalid x/y block length (%d/%d) for x/y chroma shift (%d/%d)\n",
+               s->plane[0].xblen, s->plane[0].yblen, s->chroma_x_shift, s->chroma_y_shift);
+        return AVERROR_INVALIDDATA;
+    }
+    if (!s->plane[0].xbsep || !s->plane[0].ybsep || s->plane[0].xbsep < s->plane[0].xblen/2 || s->plane[0].ybsep < s->plane[0].yblen/2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block separation too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (s->plane[0].xbsep > s->plane[0].xblen || s->plane[0].ybsep > s->plane[0].yblen) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block separation greater than size\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (FFMAX(s->plane[0].xblen, s->plane[0].yblen) > MAX_BLOCKSIZE) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported large block size\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    /*[DIRAC_STD] 11.2.5 Motion vector precision. motion_vector_precision()
+      Read motion vector precision */
+    s->mv_precision = get_interleaved_ue_golomb(gb);
+    if (s->mv_precision > 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "MV precision finer than eighth-pel\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*[DIRAC_STD] 11.2.6 Global motion. global_motion()
+      Read the global motion compensation parameters */
+    s->globalmc_flag = get_bits1(gb);
+    if (s->globalmc_flag) {
+        memset(s->globalmc, 0, sizeof(s->globalmc));
+        /* [DIRAC_STD] pan_tilt(gparams) */
+        for (ref = 0; ref < s->num_refs; ref++) {
+            if (get_bits1(gb)) {
+                s->globalmc[ref].pan_tilt[0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].pan_tilt[1] = dirac_get_se_golomb(gb);
+            }
+            /* [DIRAC_STD] zoom_rotate_shear(gparams)
+               zoom/rotation/shear parameters */
+            if (get_bits1(gb)) {
+                s->globalmc[ref].zrs_exp   = get_interleaved_ue_golomb(gb);
+                s->globalmc[ref].zrs[0][0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[0][1] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[1][0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[1][1] = dirac_get_se_golomb(gb);
+            } else {
+                s->globalmc[ref].zrs[0][0] = 1;
+                s->globalmc[ref].zrs[1][1] = 1;
+            }
+            /* [DIRAC_STD] perspective(gparams) */
+            if (get_bits1(gb)) {
+                s->globalmc[ref].perspective_exp = get_interleaved_ue_golomb(gb);
+                s->globalmc[ref].perspective[0]  = dirac_get_se_golomb(gb);
+                s->globalmc[ref].perspective[1]  = dirac_get_se_golomb(gb);
+            }
+            if (s->globalmc[ref].perspective_exp + (uint64_t)s->globalmc[ref].zrs_exp > 30) {
+                return AVERROR_INVALIDDATA;
+            }
+
+        }
+    }
+
+    /*[DIRAC_STD] 11.2.7 Picture prediction mode. prediction_mode()
+      Picture prediction mode, not currently used. */
+    if (get_interleaved_ue_golomb(gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unknown picture prediction mode\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* [DIRAC_STD] 11.2.8 Reference picture weight. reference_picture_weights()
+       just data read, weight calculation will be done later on. */
+    s->weight_log2denom = 1;
+    s->weight[0]        = 1;
+    s->weight[1]        = 1;
+
+    if (get_bits1(gb)) {
+        s->weight_log2denom = get_interleaved_ue_golomb(gb);
+        if (s->weight_log2denom < 1 || s->weight_log2denom > 8) {
+            av_log(s->avctx, AV_LOG_ERROR, "weight_log2denom unsupported or invalid\n");
+            s->weight_log2denom = 1;
+            return AVERROR_INVALIDDATA;
+        }
+        s->weight[0] = dirac_get_se_golomb(gb);
+        if (s->num_refs == 2)
+            s->weight[1] = dirac_get_se_golomb(gb);
+    }
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 11.3 Wavelet transform data. wavelet_transform()
+ */
+static int dirac_unpack_idwt_params(DiracContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    int i, level;
+    unsigned tmp;
+
+#define CHECKEDREAD(dst, cond, errmsg) \
+    tmp = get_interleaved_ue_golomb(gb); \
+    if (cond) { \
+        av_log(s->avctx, AV_LOG_ERROR, errmsg); \
+        return AVERROR_INVALIDDATA; \
+    }\
+    dst = tmp;
+
+    align_get_bits(gb);
+
+    s->zero_res = s->num_refs ? get_bits1(gb) : 0;
+    if (s->zero_res)
+        return 0;
+
+    /*[DIRAC_STD] 11.3.1 Transform parameters. transform_parameters() */
+    CHECKEDREAD(s->wavelet_idx, tmp > 6, "wavelet_idx is too big\n")
+
+    CHECKEDREAD(s->wavelet_depth, tmp > MAX_DWT_LEVELS || tmp < 1, "invalid number of DWT decompositions\n")
+
+    if (!s->low_delay) {
+        /* Codeblock parameters (core syntax only) */
+        if (get_bits1(gb)) {
+            for (i = 0; i <= s->wavelet_depth; i++) {
+                CHECKEDREAD(s->codeblock[i].width , tmp < 1 || tmp > (s->avctx->width >>s->wavelet_depth-i), "codeblock width invalid\n")
+                CHECKEDREAD(s->codeblock[i].height, tmp < 1 || tmp > (s->avctx->height>>s->wavelet_depth-i), "codeblock height invalid\n")
+            }
+
+            CHECKEDREAD(s->codeblock_mode, tmp > 1, "unknown codeblock mode\n")
+        }
+        else {
+            for (i = 0; i <= s->wavelet_depth; i++)
+                s->codeblock[i].width = s->codeblock[i].height = 1;
+        }
+    }
+    else {
+        s->num_x        = get_interleaved_ue_golomb(gb);
+        s->num_y        = get_interleaved_ue_golomb(gb);
+        if (s->num_x * s->num_y == 0 || s->num_x * (uint64_t)s->num_y > INT_MAX ||
+            s->num_x * (uint64_t)s->avctx->width  > INT_MAX ||
+            s->num_y * (uint64_t)s->avctx->height > INT_MAX
+        ) {
+            av_log(s->avctx,AV_LOG_ERROR,"Invalid numx/y\n");
+            s->num_x = s->num_y = 0;
+            return AVERROR_INVALIDDATA;
+        }
+        if (s->ld_picture) {
+            s->lowdelay.bytes.num = get_interleaved_ue_golomb(gb);
+            s->lowdelay.bytes.den = get_interleaved_ue_golomb(gb);
+            if (s->lowdelay.bytes.den <= 0) {
+                av_log(s->avctx,AV_LOG_ERROR,"Invalid lowdelay.bytes.den\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else if (s->hq_picture) {
+            s->highquality.prefix_bytes = get_interleaved_ue_golomb(gb);
+            s->highquality.size_scaler  = get_interleaved_ue_golomb(gb);
+            if (s->highquality.prefix_bytes >= INT_MAX / 8) {
+                av_log(s->avctx,AV_LOG_ERROR,"too many prefix bytes\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+
+        /* [DIRAC_STD] 11.3.5 Quantisation matrices (low-delay syntax). quant_matrix() */
+        if (get_bits1(gb)) {
+            av_log(s->avctx,AV_LOG_DEBUG,"Low Delay: Has Custom Quantization Matrix!\n");
+            /* custom quantization matrix */
+            for (level = 0; level < s->wavelet_depth; level++) {
+                for (i = !!level; i < 4; i++) {
+                    s->lowdelay.quant[level][i] = get_interleaved_ue_golomb(gb);
+                }
+            }
+        } else {
+            if (s->wavelet_depth > 4) {
+                av_log(s->avctx,AV_LOG_ERROR,"Mandatory custom low delay matrix missing for depth %d\n", s->wavelet_depth);
+                return AVERROR_INVALIDDATA;
+            }
+            /* default quantization matrix */
+            for (level = 0; level < s->wavelet_depth; level++)
+                for (i = 0; i < 4; i++) {
+                    s->lowdelay.quant[level][i] = ff_dirac_default_qmat[s->wavelet_idx][level][i];
+                    /* haar with no shift differs for different depths */
+                    if (s->wavelet_idx == 3)
+                        s->lowdelay.quant[level][i] += 4*(s->wavelet_depth-1 - level);
+                }
+        }
+    }
+    return 0;
+}
+
+static inline int pred_sbsplit(uint8_t *sbsplit, int stride, int x, int y)
+{
+    static const uint8_t avgsplit[7] = { 0, 0, 1, 1, 1, 2, 2 };
+
+    if (!(x|y))
+        return 0;
+    else if (!y)
+        return sbsplit[-1];
+    else if (!x)
+        return sbsplit[-stride];
+
+    return avgsplit[sbsplit[-1] + sbsplit[-stride] + sbsplit[-stride-1]];
+}
+
+static inline int pred_block_mode(DiracBlock *block, int stride, int x, int y, int refmask)
+{
+    int pred;
+
+    if (!(x|y))
+        return 0;
+    else if (!y)
+        return block[-1].ref & refmask;
+    else if (!x)
+        return block[-stride].ref & refmask;
+
+    /* return the majority */
+    pred = (block[-1].ref & refmask) + (block[-stride].ref & refmask) + (block[-stride-1].ref & refmask);
+    return (pred >> 1) & refmask;
+}
+
+static inline void pred_block_dc(DiracBlock *block, int stride, int x, int y)
+{
+    int i, n = 0;
+
+    memset(block->u.dc, 0, sizeof(block->u.dc));
+
+    if (x && !(block[-1].ref & 3)) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += block[-1].u.dc[i];
+        n++;
+    }
+
+    if (y && !(block[-stride].ref & 3)) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += block[-stride].u.dc[i];
+        n++;
+    }
+
+    if (x && y && !(block[-1-stride].ref & 3)) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += block[-1-stride].u.dc[i];
+        n++;
+    }
+
+    if (n == 2) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] = (block->u.dc[i]+1)>>1;
+    } else if (n == 3) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] = divide3(block->u.dc[i]);
+    }
+}
+
+static inline void pred_mv(DiracBlock *block, int stride, int x, int y, int ref)
+{
+    int16_t *pred[3];
+    int refmask = ref+1;
+    int mask = refmask | DIRAC_REF_MASK_GLOBAL; /*  exclude gmc blocks */
+    int n = 0;
+
+    if (x && (block[-1].ref & mask) == refmask)
+        pred[n++] = block[-1].u.mv[ref];
+
+    if (y && (block[-stride].ref & mask) == refmask)
+        pred[n++] = block[-stride].u.mv[ref];
+
+    if (x && y && (block[-stride-1].ref & mask) == refmask)
+        pred[n++] = block[-stride-1].u.mv[ref];
+
+    switch (n) {
+    case 0:
+        block->u.mv[ref][0] = 0;
+        block->u.mv[ref][1] = 0;
+        break;
+    case 1:
+        block->u.mv[ref][0] = pred[0][0];
+        block->u.mv[ref][1] = pred[0][1];
+        break;
+    case 2:
+        block->u.mv[ref][0] = (pred[0][0] + pred[1][0] + 1) >> 1;
+        block->u.mv[ref][1] = (pred[0][1] + pred[1][1] + 1) >> 1;
+        break;
+    case 3:
+        block->u.mv[ref][0] = mid_pred(pred[0][0], pred[1][0], pred[2][0]);
+        block->u.mv[ref][1] = mid_pred(pred[0][1], pred[1][1], pred[2][1]);
+        break;
+    }
+}
+
+static void global_mv(DiracContext *s, DiracBlock *block, int x, int y, int ref)
+{
+    int ez      = s->globalmc[ref].zrs_exp;
+    int ep      = s->globalmc[ref].perspective_exp;
+    int (*A)[2] = s->globalmc[ref].zrs;
+    int *b      = s->globalmc[ref].pan_tilt;
+    int *c      = s->globalmc[ref].perspective;
+
+    int m       = (1<<ep) - (c[0]*x + c[1]*y);
+    int64_t mx  = m * (int64_t)((A[0][0] * (int64_t)x + A[0][1]*(int64_t)y) + (1<<ez) * b[0]);
+    int64_t my  = m * (int64_t)((A[1][0] * (int64_t)x + A[1][1]*(int64_t)y) + (1<<ez) * b[1]);
+
+    block->u.mv[ref][0] = (mx + (1<<(ez+ep))) >> (ez+ep);
+    block->u.mv[ref][1] = (my + (1<<(ez+ep))) >> (ez+ep);
+}
+
+static void decode_block_params(DiracContext *s, DiracArith arith[8], DiracBlock *block,
+                                int stride, int x, int y)
+{
+    int i;
+
+    block->ref  = pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_REF1);
+    block->ref ^= dirac_get_arith_bit(arith, CTX_PMODE_REF1);
+
+    if (s->num_refs == 2) {
+        block->ref |= pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_REF2);
+        block->ref ^= dirac_get_arith_bit(arith, CTX_PMODE_REF2) << 1;
+    }
+
+    if (!block->ref) {
+        pred_block_dc(block, stride, x, y);
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += (unsigned)dirac_get_arith_int(arith+1+i, CTX_DC_F1, CTX_DC_DATA);
+        return;
+    }
+
+    if (s->globalmc_flag) {
+        block->ref |= pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_GLOBAL);
+        block->ref ^= dirac_get_arith_bit(arith, CTX_GLOBAL_BLOCK) << 2;
+    }
+
+    for (i = 0; i < s->num_refs; i++)
+        if (block->ref & (i+1)) {
+            if (block->ref & DIRAC_REF_MASK_GLOBAL) {
+                global_mv(s, block, x, y, i);
+            } else {
+                pred_mv(block, stride, x, y, i);
+                block->u.mv[i][0] += (unsigned)dirac_get_arith_int(arith + 4 + 2 * i, CTX_MV_F1, CTX_MV_DATA);
+                block->u.mv[i][1] += (unsigned)dirac_get_arith_int(arith + 5 + 2 * i, CTX_MV_F1, CTX_MV_DATA);
+            }
+        }
+}
+
+/**
+ * Copies the current block to the other blocks covered by the current superblock split mode
+ */
+static void propagate_block_data(DiracBlock *block, int stride, int size)
+{
+    int x, y;
+    DiracBlock *dst = block;
+
+    for (x = 1; x < size; x++)
+        dst[x] = *block;
+
+    for (y = 1; y < size; y++) {
+        dst += stride;
+        for (x = 0; x < size; x++)
+            dst[x] = *block;
+    }
+}
+
+/**
+ * Dirac Specification ->
+ * 12. Block motion data syntax
+ */
+static int dirac_unpack_block_motion_data(DiracContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    uint8_t *sbsplit = s->sbsplit;
+    int i, x, y, q, p;
+    DiracArith arith[8];
+
+    align_get_bits(gb);
+
+    /* [DIRAC_STD] 11.2.4 and 12.2.1 Number of blocks and superblocks */
+    s->sbwidth  = DIVRNDUP(s->seq.width,  4*s->plane[0].xbsep);
+    s->sbheight = DIVRNDUP(s->seq.height, 4*s->plane[0].ybsep);
+    s->blwidth  = 4 * s->sbwidth;
+    s->blheight = 4 * s->sbheight;
+
+    /* [DIRAC_STD] 12.3.1 Superblock splitting modes. superblock_split_modes()
+       decode superblock split modes */
+    ff_dirac_init_arith_decoder(arith, gb, get_interleaved_ue_golomb(gb));     /* get_interleaved_ue_golomb(gb) is the length */
+    for (y = 0; y < s->sbheight; y++) {
+        for (x = 0; x < s->sbwidth; x++) {
+            unsigned int split  = dirac_get_arith_uint(arith, CTX_SB_F1, CTX_SB_DATA);
+            if (split > 2)
+                return AVERROR_INVALIDDATA;
+            sbsplit[x] = (split + pred_sbsplit(sbsplit+x, s->sbwidth, x, y)) % 3;
+        }
+        sbsplit += s->sbwidth;
+    }
+
+    /* setup arith decoding */
+    ff_dirac_init_arith_decoder(arith, gb, get_interleaved_ue_golomb(gb));
+    for (i = 0; i < s->num_refs; i++) {
+        ff_dirac_init_arith_decoder(arith + 4 + 2 * i, gb, get_interleaved_ue_golomb(gb));
+        ff_dirac_init_arith_decoder(arith + 5 + 2 * i, gb, get_interleaved_ue_golomb(gb));
+    }
+    for (i = 0; i < 3; i++)
+        ff_dirac_init_arith_decoder(arith+1+i, gb, get_interleaved_ue_golomb(gb));
+
+    for (y = 0; y < s->sbheight; y++)
+        for (x = 0; x < s->sbwidth; x++) {
+            int blkcnt = 1 << s->sbsplit[y * s->sbwidth + x];
+            int step   = 4 >> s->sbsplit[y * s->sbwidth + x];
+
+            for (q = 0; q < blkcnt; q++)
+                for (p = 0; p < blkcnt; p++) {
+                    int bx = 4 * x + p*step;
+                    int by = 4 * y + q*step;
+                    DiracBlock *block = &s->blmotion[by*s->blwidth + bx];
+                    decode_block_params(s, arith, block, s->blwidth, bx, by);
+                    propagate_block_data(block, s->blwidth, step);
+                }
+        }
+
+    return 0;
+}
+
+static int weight(int i, int blen, int offset)
+{
+#define ROLLOFF(i) offset == 1 ? ((i) ? 5 : 3) :        \
+    (1 + (6*(i) + offset - 1) / (2*offset - 1))
+
+    if (i < 2*offset)
+        return ROLLOFF(i);
+    else if (i > blen-1 - 2*offset)
+        return ROLLOFF(blen-1 - i);
+    return 8;
+}
+
+static void init_obmc_weight_row(Plane *p, uint8_t *obmc_weight, int stride,
+                                 int left, int right, int wy)
+{
+    int x;
+    for (x = 0; left && x < p->xblen >> 1; x++)
+        obmc_weight[x] = wy*8;
+    for (; x < p->xblen >> right; x++)
+        obmc_weight[x] = wy*weight(x, p->xblen, p->xoffset);
+    for (; x < p->xblen; x++)
+        obmc_weight[x] = wy*8;
+    for (; x < stride; x++)
+        obmc_weight[x] = 0;
+}
+
+static void init_obmc_weight(Plane *p, uint8_t *obmc_weight, int stride,
+                             int left, int right, int top, int bottom)
+{
+    int y;
+    for (y = 0; top && y < p->yblen >> 1; y++) {
+        init_obmc_weight_row(p, obmc_weight, stride, left, right, 8);
+        obmc_weight += stride;
+    }
+    for (; y < p->yblen >> bottom; y++) {
+        int wy = weight(y, p->yblen, p->yoffset);
+        init_obmc_weight_row(p, obmc_weight, stride, left, right, wy);
+        obmc_weight += stride;
+    }
+    for (; y < p->yblen; y++) {
+        init_obmc_weight_row(p, obmc_weight, stride, left, right, 8);
+        obmc_weight += stride;
+    }
+}
+
+static void init_obmc_weights(DiracContext *s, Plane *p, int by)
+{
+    int top = !by;
+    int bottom = by == s->blheight-1;
+
+    /* don't bother re-initing for rows 2 to blheight-2, the weights don't change */
+    if (top || bottom || by == 1) {
+        init_obmc_weight(p, s->obmc_weight[0], MAX_BLOCKSIZE, 1, 0, top, bottom);
+        init_obmc_weight(p, s->obmc_weight[1], MAX_BLOCKSIZE, 0, 0, top, bottom);
+        init_obmc_weight(p, s->obmc_weight[2], MAX_BLOCKSIZE, 0, 1, top, bottom);
+    }
+}
+
+static const uint8_t epel_weights[4][4][4] = {
+    {{ 16,  0,  0,  0 },
+     { 12,  4,  0,  0 },
+     {  8,  8,  0,  0 },
+     {  4, 12,  0,  0 }},
+    {{ 12,  0,  4,  0 },
+     {  9,  3,  3,  1 },
+     {  6,  6,  2,  2 },
+     {  3,  9,  1,  3 }},
+    {{  8,  0,  8,  0 },
+     {  6,  2,  6,  2 },
+     {  4,  4,  4,  4 },
+     {  2,  6,  2,  6 }},
+    {{  4,  0, 12,  0 },
+     {  3,  1,  9,  3 },
+     {  2,  2,  6,  6 },
+     {  1,  3,  3,  9 }}
+};
+
+/**
+ * For block x,y, determine which of the hpel planes to do bilinear
+ * interpolation from and set src[] to the location in each hpel plane
+ * to MC from.
+ *
+ * @return the index of the put_dirac_pixels_tab function to use
+ *  0 for 1 plane (fpel,hpel), 1 for 2 planes (qpel), 2 for 4 planes (qpel), and 3 for epel
+ */
+static int mc_subpel(DiracContext *s, DiracBlock *block, const uint8_t *src[5],
+                     int x, int y, int ref, int plane)
+{
+    Plane *p = &s->plane[plane];
+    uint8_t **ref_hpel = s->ref_pics[ref]->hpel[plane];
+    int motion_x = block->u.mv[ref][0];
+    int motion_y = block->u.mv[ref][1];
+    int mx, my, i, epel, nplanes = 0;
+
+    if (plane) {
+        motion_x >>= s->chroma_x_shift;
+        motion_y >>= s->chroma_y_shift;
+    }
+
+    mx         = motion_x & ~(-1U << s->mv_precision);
+    my         = motion_y & ~(-1U << s->mv_precision);
+    motion_x >>= s->mv_precision;
+    motion_y >>= s->mv_precision;
+    /* normalize subpel coordinates to epel */
+    /* TODO: template this function? */
+    mx      <<= 3 - s->mv_precision;
+    my      <<= 3 - s->mv_precision;
+
+    x += motion_x;
+    y += motion_y;
+    epel = (mx|my)&1;
+
+    /* hpel position */
+    if (!((mx|my)&3)) {
+        nplanes = 1;
+        src[0] = ref_hpel[(my>>1)+(mx>>2)] + y*p->stride + x;
+    } else {
+        /* qpel or epel */
+        nplanes = 4;
+        for (i = 0; i < 4; i++)
+            src[i] = ref_hpel[i] + y*p->stride + x;
+
+        /* if we're interpolating in the right/bottom halves, adjust the planes as needed
+           we increment x/y because the edge changes for half of the pixels */
+        if (mx > 4) {
+            src[0] += 1;
+            src[2] += 1;
+            x++;
+        }
+        if (my > 4) {
+            src[0] += p->stride;
+            src[1] += p->stride;
+            y++;
+        }
+
+        /* hpel planes are:
+           [0]: F  [1]: H
+           [2]: V  [3]: C */
+        if (!epel) {
+            /* check if we really only need 2 planes since either mx or my is
+               a hpel position. (epel weights of 0 handle this there) */
+            if (!(mx&3)) {
+                /* mx == 0: average [0] and [2]
+                   mx == 4: average [1] and [3] */
+                src[!mx] = src[2 + !!mx];
+                nplanes = 2;
+            } else if (!(my&3)) {
+                src[0] = src[(my>>1)  ];
+                src[1] = src[(my>>1)+1];
+                nplanes = 2;
+            }
+        } else {
+            /* adjust the ordering if needed so the weights work */
+            if (mx > 4) {
+                FFSWAP(const uint8_t *, src[0], src[1]);
+                FFSWAP(const uint8_t *, src[2], src[3]);
+            }
+            if (my > 4) {
+                FFSWAP(const uint8_t *, src[0], src[2]);
+                FFSWAP(const uint8_t *, src[1], src[3]);
+            }
+            src[4] = epel_weights[my&3][mx&3];
+        }
+    }
+
+    /* fixme: v/h _edge_pos */
+    if (x + p->xblen > p->width +EDGE_WIDTH/2 ||
+        y + p->yblen > p->height+EDGE_WIDTH/2 ||
+        x < 0 || y < 0) {
+        for (i = 0; i < nplanes; i++) {
+            s->vdsp.emulated_edge_mc(s->edge_emu_buffer[i], src[i],
+                                     p->stride, p->stride,
+                                     p->xblen, p->yblen, x, y,
+                                     p->width+EDGE_WIDTH/2, p->height+EDGE_WIDTH/2);
+            src[i] = s->edge_emu_buffer[i];
+        }
+    }
+    return (nplanes>>1) + epel;
+}
+
+static void add_dc(uint16_t *dst, int dc, int stride,
+                   uint8_t *obmc_weight, int xblen, int yblen)
+{
+    int x, y;
+    dc += 128;
+
+    for (y = 0; y < yblen; y++) {
+        for (x = 0; x < xblen; x += 2) {
+            dst[x  ] += dc * obmc_weight[x  ];
+            dst[x+1] += dc * obmc_weight[x+1];
+        }
+        dst          += stride;
+        obmc_weight  += MAX_BLOCKSIZE;
+    }
+}
+
+static void block_mc(DiracContext *s, DiracBlock *block,
+                     uint16_t *mctmp, uint8_t *obmc_weight,
+                     int plane, int dstx, int dsty)
+{
+    Plane *p = &s->plane[plane];
+    const uint8_t *src[5];
+    int idx;
+
+    switch (block->ref&3) {
+    case 0: /* DC */
+        add_dc(mctmp, block->u.dc[plane], p->stride, obmc_weight, p->xblen, p->yblen);
+        return;
+    case 1:
+    case 2:
+        idx = mc_subpel(s, block, src, dstx, dsty, (block->ref&3)-1, plane);
+        s->put_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        if (s->weight_func)
+            s->weight_func(s->mcscratch, p->stride, s->weight_log2denom,
+                           s->weight[0] + s->weight[1], p->yblen);
+        break;
+    case 3:
+        idx = mc_subpel(s, block, src, dstx, dsty, 0, plane);
+        s->put_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        idx = mc_subpel(s, block, src, dstx, dsty, 1, plane);
+        if (s->biweight_func) {
+            /* fixme: +32 is a quick hack */
+            s->put_pixels_tab[idx](s->mcscratch + 32, src, p->stride, p->yblen);
+            s->biweight_func(s->mcscratch, s->mcscratch+32, p->stride, s->weight_log2denom,
+                             s->weight[0], s->weight[1], p->yblen);
+        } else
+            s->avg_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        break;
+    }
+    s->add_obmc(mctmp, s->mcscratch, p->stride, obmc_weight, p->yblen);
+}
+
+static void mc_row(DiracContext *s, DiracBlock *block, uint16_t *mctmp, int plane, int dsty)
+{
+    Plane *p = &s->plane[plane];
+    int x, dstx = p->xbsep - p->xoffset;
+
+    block_mc(s, block, mctmp, s->obmc_weight[0], plane, -p->xoffset, dsty);
+    mctmp += p->xbsep;
+
+    for (x = 1; x < s->blwidth-1; x++) {
+        block_mc(s, block+x, mctmp, s->obmc_weight[1], plane, dstx, dsty);
+        dstx  += p->xbsep;
+        mctmp += p->xbsep;
+    }
+    block_mc(s, block+x, mctmp, s->obmc_weight[2], plane, dstx, dsty);
+}
+
+static void select_dsp_funcs(DiracContext *s, int width, int height, int xblen, int yblen)
+{
+    int idx = 0;
+    if (xblen > 8)
+        idx = 1;
+    if (xblen > 16)
+        idx = 2;
+
+    memcpy(s->put_pixels_tab, s->diracdsp.put_dirac_pixels_tab[idx], sizeof(s->put_pixels_tab));
+    memcpy(s->avg_pixels_tab, s->diracdsp.avg_dirac_pixels_tab[idx], sizeof(s->avg_pixels_tab));
+    s->add_obmc = s->diracdsp.add_dirac_obmc[idx];
+    if (s->weight_log2denom > 1 || s->weight[0] != 1 || s->weight[1] != 1) {
+        s->weight_func   = s->diracdsp.weight_dirac_pixels_tab[idx];
+        s->biweight_func = s->diracdsp.biweight_dirac_pixels_tab[idx];
+    } else {
+        s->weight_func   = NULL;
+        s->biweight_func = NULL;
+    }
+}
+
+static int interpolate_refplane(DiracContext *s, DiracFrame *ref, int plane, int width, int height)
+{
+    /* chroma allocates an edge of 8 when subsampled
+       which for 4:2:2 means an h edge of 16 and v edge of 8
+       just use 8 for everything for the moment */
+    int i, edge = EDGE_WIDTH/2;
+
+    ref->hpel[plane][0] = ref->avframe->data[plane];
+    s->mpvencdsp.draw_edges(ref->hpel[plane][0], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM); /* EDGE_TOP | EDGE_BOTTOM values just copied to make it build, this needs to be ensured */
+
+    /* no need for hpel if we only have fpel vectors */
+    if (!s->mv_precision)
+        return 0;
+
+    for (i = 1; i < 4; i++) {
+        if (!ref->hpel_base[plane][i])
+            ref->hpel_base[plane][i] = av_malloc((height+2*edge) * ref->avframe->linesize[plane] + 32);
+        if (!ref->hpel_base[plane][i]) {
+            return AVERROR(ENOMEM);
+        }
+        /* we need to be 16-byte aligned even for chroma */
+        ref->hpel[plane][i] = ref->hpel_base[plane][i] + edge*ref->avframe->linesize[plane] + 16;
+    }
+
+    if (!ref->interpolated[plane]) {
+        s->diracdsp.dirac_hpel_filter(ref->hpel[plane][1], ref->hpel[plane][2],
+                                      ref->hpel[plane][3], ref->hpel[plane][0],
+                                      ref->avframe->linesize[plane], width, height);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][1], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][2], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][3], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+    }
+    ref->interpolated[plane] = 1;
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 13.0 Transform data syntax. transform_data()
+ */
+static int dirac_decode_frame_internal(DiracContext *s)
+{
+    DWTContext d;
+    int y, i, comp, dsty;
+    int ret;
+
+    if (s->low_delay) {
+        /* [DIRAC_STD] 13.5.1 low_delay_transform_data() */
+        if (!s->hq_picture) {
+            for (comp = 0; comp < 3; comp++) {
+                Plane *p = &s->plane[comp];
+                memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
+            }
+        }
+        if (!s->zero_res) {
+            if ((ret = decode_lowdelay(s)) < 0)
+                return ret;
+        }
+    }
+
+    for (comp = 0; comp < 3; comp++) {
+        Plane *p       = &s->plane[comp];
+        uint8_t *frame = s->current_picture->avframe->data[comp];
+
+        /* FIXME: small resolutions */
+        for (i = 0; i < 4; i++)
+            s->edge_emu_buffer[i] = s->edge_emu_buffer_base + i*FFALIGN(p->width, 16);
+
+        if (!s->zero_res && !s->low_delay)
+        {
+            memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
+            ret = decode_component(s, comp); /* [DIRAC_STD] 13.4.1 core_transform_data() */
+            if (ret < 0)
+                return ret;
+        }
+        ret = ff_spatial_idwt_init(&d, &p->idwt, s->wavelet_idx+2,
+                                   s->wavelet_depth, s->bit_depth);
+        if (ret < 0)
+            return ret;
+
+        if (!s->num_refs) { /* intra */
+            for (y = 0; y < p->height; y += 16) {
+                int idx = (s->bit_depth - 8) >> 1;
+                ff_spatial_idwt_slice2(&d, y+16); /* decode */
+                s->diracdsp.put_signed_rect_clamped[idx](frame + y*p->stride,
+                                                         p->stride,
+                                                         p->idwt.buf + y*p->idwt.stride,
+                                                         p->idwt.stride, p->width, 16);
+            }
+        } else { /* inter */
+            int rowheight = p->ybsep*p->stride;
+
+            select_dsp_funcs(s, p->width, p->height, p->xblen, p->yblen);
+
+            for (i = 0; i < s->num_refs; i++) {
+                int ret = interpolate_refplane(s, s->ref_pics[i], comp, p->width, p->height);
+                if (ret < 0)
+                    return ret;
+            }
+
+            memset(s->mctmp, 0, 4*p->yoffset*p->stride);
+
+            dsty = -p->yoffset;
+            for (y = 0; y < s->blheight; y++) {
+                int h     = 0,
+                    start = FFMAX(dsty, 0);
+                uint16_t *mctmp    = s->mctmp + y*rowheight;
+                DiracBlock *blocks = s->blmotion + y*s->blwidth;
+
+                init_obmc_weights(s, p, y);
+
+                if (y == s->blheight-1 || start+p->ybsep > p->height)
+                    h = p->height - start;
+                else
+                    h = p->ybsep - (start - dsty);
+                if (h < 0)
+                    break;
+
+                memset(mctmp+2*p->yoffset*p->stride, 0, 2*rowheight);
+                mc_row(s, blocks, mctmp, comp, dsty);
+
+                mctmp += (start - dsty)*p->stride + p->xoffset;
+                ff_spatial_idwt_slice2(&d, start + h); /* decode */
+                /* NOTE: add_rect_clamped hasn't been templated hence the shifts.
+                 * idwt.stride is passed as pixels, not in bytes as in the rest of the decoder */
+                s->diracdsp.add_rect_clamped(frame + start*p->stride, mctmp, p->stride,
+                                             (int16_t*)(p->idwt.buf) + start*(p->idwt.stride >> 1), (p->idwt.stride >> 1), p->width, h);
+
+                dsty += p->ybsep;
+            }
+        }
+    }
+
+
+    return 0;
+}
+
+static int get_buffer_with_edge(AVCodecContext *avctx, AVFrame *f, int flags)
+{
+    int ret, i;
+    int chroma_x_shift, chroma_y_shift;
+    ret = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_x_shift,
+                                           &chroma_y_shift);
+    if (ret < 0)
+        return ret;
+
+    f->width  = avctx->width  + 2 * EDGE_WIDTH;
+    f->height = avctx->height + 2 * EDGE_WIDTH + 2;
+    ret = ff_get_buffer(avctx, f, flags);
+    if (ret < 0)
+        return ret;
+
+    for (i = 0; f->data[i]; i++) {
+        int offset = (EDGE_WIDTH >> (i && i<3 ? chroma_y_shift : 0)) *
+                     f->linesize[i] + 32;
+        f->data[i] += offset;
+    }
+    f->width  = avctx->width;
+    f->height = avctx->height;
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 11.1.1 Picture Header. picture_header()
+ */
+static int dirac_decode_picture_header(DiracContext *s)
+{
+    unsigned retire, picnum;
+    int i, j, ret;
+    int64_t refdist, refnum;
+    GetBitContext *gb = &s->gb;
+
+    /* [DIRAC_STD] 11.1.1 Picture Header. picture_header() PICTURE_NUM */
+    picnum = s->current_picture->avframe->display_picture_number = get_bits_long(gb, 32);
+
+
+    av_log(s->avctx,AV_LOG_DEBUG,"PICTURE_NUM: %d\n",picnum);
+
+    /* if this is the first keyframe after a sequence header, start our
+       reordering from here */
+    if (s->frame_number < 0)
+        s->frame_number = picnum;
+
+    s->ref_pics[0] = s->ref_pics[1] = NULL;
+    for (i = 0; i < s->num_refs; i++) {
+        refnum = (picnum + dirac_get_se_golomb(gb)) & 0xFFFFFFFF;
+        refdist = INT64_MAX;
+
+        /* find the closest reference to the one we want */
+        /* Jordi: this is needed if the referenced picture hasn't yet arrived */
+        for (j = 0; j < MAX_REFERENCE_FRAMES && refdist; j++)
+            if (s->ref_frames[j]
+                && FFABS(s->ref_frames[j]->avframe->display_picture_number - refnum) < refdist) {
+                s->ref_pics[i] = s->ref_frames[j];
+                refdist = FFABS(s->ref_frames[j]->avframe->display_picture_number - refnum);
+            }
+
+        if (!s->ref_pics[i] || refdist)
+            av_log(s->avctx, AV_LOG_DEBUG, "Reference not found\n");
+
+        /* if there were no references at all, allocate one */
+        if (!s->ref_pics[i])
+            for (j = 0; j < MAX_FRAMES; j++)
+                if (!s->all_frames[j].avframe->data[0]) {
+                    s->ref_pics[i] = &s->all_frames[j];
+                    ret = get_buffer_with_edge(s->avctx, s->ref_pics[i]->avframe, AV_GET_BUFFER_FLAG_REF);
+                    if (ret < 0)
+                        return ret;
+                    break;
+                }
+
+        if (!s->ref_pics[i]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference could not be allocated\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+    }
+
+    /* retire the reference frames that are not used anymore */
+    if (s->current_picture->reference) {
+        retire = (picnum + dirac_get_se_golomb(gb)) & 0xFFFFFFFF;
+        if (retire != picnum) {
+            DiracFrame *retire_pic = remove_frame(s->ref_frames, retire);
+
+            if (retire_pic)
+                retire_pic->reference &= DELAYED_PIC_REF;
+            else
+                av_log(s->avctx, AV_LOG_DEBUG, "Frame to retire not found\n");
+        }
+
+        /* if reference array is full, remove the oldest as per the spec */
+        while (add_frame(s->ref_frames, MAX_REFERENCE_FRAMES, s->current_picture)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference frame overflow\n");
+            remove_frame(s->ref_frames, s->ref_frames[0]->avframe->display_picture_number)->reference &= DELAYED_PIC_REF;
+        }
+    }
+
+    if (s->num_refs) {
+        ret = dirac_unpack_prediction_parameters(s);  /* [DIRAC_STD] 11.2 Picture Prediction Data. picture_prediction() */
+        if (ret < 0)
+            return ret;
+        ret = dirac_unpack_block_motion_data(s);      /* [DIRAC_STD] 12. Block motion data syntax                       */
+        if (ret < 0)
+            return ret;
+    }
+    ret = dirac_unpack_idwt_params(s);                /* [DIRAC_STD] 11.3 Wavelet transform data                        */
+    if (ret < 0)
+        return ret;
+
+    init_planes(s);
+    return 0;
+}
+
+static int get_delayed_pic(DiracContext *s, AVFrame *picture, int *got_frame)
+{
+    DiracFrame *out = s->delay_frames[0];
+    int i, out_idx  = 0;
+    int ret;
+
+    /* find frame with lowest picture number */
+    for (i = 1; s->delay_frames[i]; i++)
+        if (s->delay_frames[i]->avframe->display_picture_number < out->avframe->display_picture_number) {
+            out     = s->delay_frames[i];
+            out_idx = i;
+        }
+
+    for (i = out_idx; s->delay_frames[i]; i++)
+        s->delay_frames[i] = s->delay_frames[i+1];
+
+    if (out) {
+        out->reference ^= DELAYED_PIC_REF;
+        if((ret = av_frame_ref(picture, out->avframe)) < 0)
+            return ret;
+        *got_frame = 1;
+    }
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 9.6 Parse Info Header Syntax. parse_info()
+ * 4 byte start code + byte parse code + 4 byte size + 4 byte previous size
+ */
+#define DATA_UNIT_HEADER_SIZE 13
+
+/* [DIRAC_STD] dirac_decode_data_unit makes reference to the while defined in 9.3
+   inside the function parse_sequence() */
+static int dirac_decode_data_unit(AVCodecContext *avctx, const uint8_t *buf, int size)
+{
+    DiracContext *s   = avctx->priv_data;
+    DiracFrame *pic   = NULL;
+    AVDiracSeqHeader *dsh;
+    int ret, i;
+    uint8_t parse_code;
+    unsigned tmp;
+
+    if (size < DATA_UNIT_HEADER_SIZE)
+        return AVERROR_INVALIDDATA;
+
+    parse_code = buf[4];
+
+    init_get_bits(&s->gb, &buf[13], 8*(size - DATA_UNIT_HEADER_SIZE));
+
+    if (parse_code == DIRAC_PCODE_SEQ_HEADER) {
+        if (s->seen_sequence_header)
+            return 0;
+
+        /* [DIRAC_STD] 10. Sequence header */
+        ret = av_dirac_parse_sequence_header(&dsh, buf + DATA_UNIT_HEADER_SIZE, size - DATA_UNIT_HEADER_SIZE, avctx);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "error parsing sequence header");
+            return ret;
+        }
+
+        if (CALC_PADDING((int64_t)dsh->width, MAX_DWT_LEVELS) * CALC_PADDING((int64_t)dsh->height, MAX_DWT_LEVELS) * 5LL > avctx->max_pixels)
+            ret = AVERROR(ERANGE);
+        if (ret >= 0)
+            ret = ff_set_dimensions(avctx, dsh->width, dsh->height);
+        if (ret < 0) {
+            av_freep(&dsh);
+            return ret;
+        }
+
+        ff_set_sar(avctx, dsh->sample_aspect_ratio);
+        avctx->pix_fmt         = dsh->pix_fmt;
+        avctx->color_range     = dsh->color_range;
+        avctx->color_trc       = dsh->color_trc;
+        avctx->color_primaries = dsh->color_primaries;
+        avctx->colorspace      = dsh->colorspace;
+        avctx->profile         = dsh->profile;
+        avctx->level           = dsh->level;
+        avctx->framerate       = dsh->framerate;
+        s->bit_depth           = dsh->bit_depth;
+        s->version.major       = dsh->version.major;
+        s->version.minor       = dsh->version.minor;
+        s->seq                 = *dsh;
+        av_freep(&dsh);
+
+        s->pshift = s->bit_depth > 8;
+
+        ret = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                               &s->chroma_x_shift,
+                                               &s->chroma_y_shift);
+        if (ret < 0)
+            return ret;
+
+        ret = alloc_sequence_buffers(s);
+        if (ret < 0)
+            return ret;
+
+        s->seen_sequence_header = 1;
+    } else if (parse_code == DIRAC_PCODE_END_SEQ) { /* [DIRAC_STD] End of Sequence */
+        free_sequence_buffers(s);
+        s->seen_sequence_header = 0;
+    } else if (parse_code == DIRAC_PCODE_AUX) {
+        if (buf[13] == 1) {     /* encoder implementation/version */
+            int ver[3];
+            /* versions older than 1.0.8 don't store quant delta for
+               subbands with only one codeblock */
+            if (sscanf(buf+14, "Schroedinger %d.%d.%d", ver, ver+1, ver+2) == 3)
+                if (ver[0] == 1 && ver[1] == 0 && ver[2] <= 7)
+                    s->old_delta_quant = 1;
+        }
+    } else if (parse_code & 0x8) {  /* picture data unit */
+        if (!s->seen_sequence_header) {
+            av_log(avctx, AV_LOG_DEBUG, "Dropping frame without sequence header\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        /* find an unused frame */
+        for (i = 0; i < MAX_FRAMES; i++)
+            if (s->all_frames[i].avframe->data[0] == NULL)
+                pic = &s->all_frames[i];
+        if (!pic) {
+            av_log(avctx, AV_LOG_ERROR, "framelist full\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_frame_unref(pic->avframe);
+
+        /* [DIRAC_STD] Defined in 9.6.1 ... */
+        tmp            =  parse_code & 0x03;                   /* [DIRAC_STD] num_refs()      */
+        if (tmp > 2) {
+            av_log(avctx, AV_LOG_ERROR, "num_refs of 3\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->num_refs      = tmp;
+        s->is_arith      = (parse_code & 0x48) == 0x08;          /* [DIRAC_STD] using_ac()            */
+        s->low_delay     = (parse_code & 0x88) == 0x88;          /* [DIRAC_STD] is_low_delay()        */
+        s->core_syntax   = (parse_code & 0x88) == 0x08;          /* [DIRAC_STD] is_core_syntax()      */
+        s->ld_picture    = (parse_code & 0xF8) == 0xC8;          /* [DIRAC_STD] is_ld_picture()       */
+        s->hq_picture    = (parse_code & 0xF8) == 0xE8;          /* [DIRAC_STD] is_hq_picture()       */
+        s->dc_prediction = (parse_code & 0x28) == 0x08;          /* [DIRAC_STD] using_dc_prediction() */
+        pic->reference   = (parse_code & 0x0C) == 0x0C;          /* [DIRAC_STD] is_reference()        */
+        pic->avframe->key_frame = s->num_refs == 0;              /* [DIRAC_STD] is_intra()            */
+        pic->avframe->pict_type = s->num_refs + 1;               /* Definition of AVPictureType in avutil.h */
+
+        /* VC-2 Low Delay has a different parse code than the Dirac Low Delay */
+        if (s->version.minor == 2 && parse_code == 0x88)
+            s->ld_picture = 1;
+
+        if (s->low_delay && !(s->ld_picture || s->hq_picture) ) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid low delay flag\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if ((ret = get_buffer_with_edge(avctx, pic->avframe, (parse_code & 0x0C) == 0x0C ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
+            return ret;
+        s->current_picture = pic;
+        s->plane[0].stride = pic->avframe->linesize[0];
+        s->plane[1].stride = pic->avframe->linesize[1];
+        s->plane[2].stride = pic->avframe->linesize[2];
+
+        if (alloc_buffers(s, FFMAX3(FFABS(s->plane[0].stride), FFABS(s->plane[1].stride), FFABS(s->plane[2].stride))) < 0)
+            return AVERROR(ENOMEM);
+
+        /* [DIRAC_STD] 11.1 Picture parse. picture_parse() */
+        ret = dirac_decode_picture_header(s);
+        if (ret < 0)
+            return ret;
+
+        /* [DIRAC_STD] 13.0 Transform data syntax. transform_data() */
+        ret = dirac_decode_frame_internal(s);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
+}
+
+static int dirac_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *pkt)
+{
+    DiracContext *s     = avctx->priv_data;
+    AVFrame *picture    = data;
+    uint8_t *buf        = pkt->data;
+    int buf_size        = pkt->size;
+    int i, buf_idx      = 0;
+    int ret;
+    unsigned data_unit_size;
+
+    /* release unused frames */
+    for (i = 0; i < MAX_FRAMES; i++)
+        if (s->all_frames[i].avframe->data[0] && !s->all_frames[i].reference) {
+            av_frame_unref(s->all_frames[i].avframe);
+            memset(s->all_frames[i].interpolated, 0, sizeof(s->all_frames[i].interpolated));
+        }
+
+    s->current_picture = NULL;
+    *got_frame = 0;
+
+    /* end of stream, so flush delayed pics */
+    if (buf_size == 0)
+        return get_delayed_pic(s, (AVFrame *)data, got_frame);
+
+    for (;;) {
+        /*[DIRAC_STD] Here starts the code from parse_info() defined in 9.6
+          [DIRAC_STD] PARSE_INFO_PREFIX = "BBCD" as defined in ISO/IEC 646
+          BBCD start code search */
+        for (; buf_idx + DATA_UNIT_HEADER_SIZE < buf_size; buf_idx++) {
+            if (buf[buf_idx  ] == 'B' && buf[buf_idx+1] == 'B' &&
+                buf[buf_idx+2] == 'C' && buf[buf_idx+3] == 'D')
+                break;
+        }
+        /* BBCD found or end of data */
+        if (buf_idx + DATA_UNIT_HEADER_SIZE >= buf_size)
+            break;
+
+        data_unit_size = AV_RB32(buf+buf_idx+5);
+        if (data_unit_size > buf_size - buf_idx || !data_unit_size) {
+            if(data_unit_size > buf_size - buf_idx)
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "Data unit with size %d is larger than input buffer, discarding\n",
+                   data_unit_size);
+            buf_idx += 4;
+            continue;
+        }
+        /* [DIRAC_STD] dirac_decode_data_unit makes reference to the while defined in 9.3 inside the function parse_sequence() */
+        ret = dirac_decode_data_unit(avctx, buf+buf_idx, data_unit_size);
+        if (ret < 0)
+        {
+            av_log(s->avctx, AV_LOG_ERROR,"Error in dirac_decode_data_unit\n");
+            return ret;
+        }
+        buf_idx += data_unit_size;
+    }
+
+    if (!s->current_picture)
+        return buf_size;
+
+    if (s->current_picture->avframe->display_picture_number > s->frame_number) {
+        DiracFrame *delayed_frame = remove_frame(s->delay_frames, s->frame_number);
+
+        s->current_picture->reference |= DELAYED_PIC_REF;
+
+        if (add_frame(s->delay_frames, MAX_DELAY, s->current_picture)) {
+            int min_num = s->delay_frames[0]->avframe->display_picture_number;
+            /* Too many delayed frames, so we display the frame with the lowest pts */
+            av_log(avctx, AV_LOG_ERROR, "Delay frame overflow\n");
+
+            for (i = 1; s->delay_frames[i]; i++)
+                if (s->delay_frames[i]->avframe->display_picture_number < min_num)
+                    min_num = s->delay_frames[i]->avframe->display_picture_number;
+
+            delayed_frame = remove_frame(s->delay_frames, min_num);
+            add_frame(s->delay_frames, MAX_DELAY, s->current_picture);
+        }
+
+        if (delayed_frame) {
+            delayed_frame->reference ^= DELAYED_PIC_REF;
+            if((ret=av_frame_ref(data, delayed_frame->avframe)) < 0)
+                return ret;
+            *got_frame = 1;
+        }
+    } else if (s->current_picture->avframe->display_picture_number == s->frame_number) {
+        /* The right frame at the right time :-) */
+        if((ret=av_frame_ref(data, s->current_picture->avframe)) < 0)
+            return ret;
+        *got_frame = 1;
+    }
+
+    if (*got_frame)
+        s->frame_number = picture->display_picture_number + 1LL;
+
+    return buf_idx;
+}
+
+AVCodec ff_dirac_decoder = {
+    .name           = "dirac",
+    .long_name      = NULL_IF_CONFIG_SMALL("BBC Dirac VC-2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DIRAC,
+    .priv_data_size = sizeof(DiracContext),
+    .init           = dirac_decode_init,
+    .close          = dirac_decode_end,
+    .decode         = dirac_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .flush          = dirac_decode_flush,
+};
diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
new file mode 100644
index 0000000..2dd56f8
--- /dev/null
+++ b/libavcodec/diracdsp.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "diracdsp.h"
+
+#define FILTER(src, stride)                                     \
+    ((21*((src)[ 0*stride] + (src)[1*stride])                   \
+      -7*((src)[-1*stride] + (src)[2*stride])                   \
+      +3*((src)[-2*stride] + (src)[3*stride])                   \
+      -1*((src)[-3*stride] + (src)[4*stride]) + 16) >> 5)
+
+static void dirac_hpel_filter(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, const uint8_t *src,
+                              int stride, int width, int height)
+{
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = -3; x < width+5; x++)
+            dstv[x] = av_clip_uint8(FILTER(src+x, stride));
+
+        for (x = 0; x < width; x++)
+            dstc[x] = av_clip_uint8(FILTER(dstv+x, 1));
+
+        for (x = 0; x < width; x++)
+            dsth[x] = av_clip_uint8(FILTER(src+x, 1));
+
+        src  += stride;
+        dsth += stride;
+        dstv += stride;
+        dstc += stride;
+    }
+}
+
+#define PIXOP_BILINEAR(PFX, OP, WIDTH)                                  \
+    static void ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
+    {                                                                   \
+        int x;                                                          \
+        const uint8_t *s0 = src[0];                                     \
+        const uint8_t *s1 = src[1];                                     \
+        const uint8_t *s2 = src[2];                                     \
+        const uint8_t *s3 = src[3];                                     \
+        const uint8_t *w  = src[4];                                     \
+                                                                        \
+        while (h--) {                                                   \
+            for (x = 0; x < WIDTH; x++) {                               \
+                OP(dst[x], (s0[x]*w[0] + s1[x]*w[1] + s2[x]*w[2] + s3[x]*w[3] + 8) >> 4); \
+            }                                                           \
+                                                                        \
+            dst += stride;                                              \
+            s0 += stride;                                               \
+            s1 += stride;                                               \
+            s2 += stride;                                               \
+            s3 += stride;                                               \
+        }                                                               \
+    }
+
+#define OP_PUT(dst, val) (dst) = (val)
+#define OP_AVG(dst, val) (dst) = (((dst) + (val) + 1)>>1)
+
+PIXOP_BILINEAR(put, OP_PUT, 8)
+PIXOP_BILINEAR(put, OP_PUT, 16)
+PIXOP_BILINEAR(put, OP_PUT, 32)
+PIXOP_BILINEAR(avg, OP_AVG, 8)
+PIXOP_BILINEAR(avg, OP_AVG, 16)
+PIXOP_BILINEAR(avg, OP_AVG, 32)
+
+#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + (1<<(log2_denom-1))) >> log2_denom)
+#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + (1<<(log2_denom-1))) >> log2_denom)
+
+#define DIRAC_WEIGHT(W)                                                 \
+    static void weight_dirac_pixels ## W ## _c(uint8_t *block, int stride, int log2_denom, \
+                                               int weight, int h) {     \
+        int x;                                                          \
+        while (h--) {                                                   \
+            for (x = 0; x < W; x++) {                                   \
+                op_scale1(x);                                           \
+                op_scale1(x+1);                                         \
+            }                                                           \
+            block += stride;                                            \
+        }                                                               \
+    }                                                                   \
+    static void biweight_dirac_pixels ## W ## _c(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, \
+                                                 int weightd, int weights, int h) { \
+        int x;                                                          \
+        while (h--) {                                                   \
+            for (x = 0; x < W; x++) {                                   \
+                op_scale2(x);                                           \
+                op_scale2(x+1);                                         \
+            }                                                           \
+            dst += stride;                                              \
+            src += stride;                                              \
+        }                                                               \
+    }
+
+DIRAC_WEIGHT(8)
+DIRAC_WEIGHT(16)
+DIRAC_WEIGHT(32)
+
+#define ADD_OBMC(xblen)                                                 \
+    static void add_obmc ## xblen ## _c(uint16_t *dst, const uint8_t *src, int stride, \
+                                        const uint8_t *obmc_weight, int yblen) \
+    {                                                                   \
+        int x;                                                          \
+        while (yblen--) {                                               \
+            for (x = 0; x < xblen; x += 2) {                            \
+                dst[x  ] += src[x  ] * obmc_weight[x  ];                \
+                dst[x+1] += src[x+1] * obmc_weight[x+1];                \
+            }                                                           \
+            dst += stride;                                              \
+            src += stride;                                              \
+            obmc_weight += 32;                                          \
+        }                                                               \
+    }
+
+ADD_OBMC(8)
+ADD_OBMC(16)
+ADD_OBMC(32)
+
+static void put_signed_rect_clamped_8bit_c(uint8_t *dst, int dst_stride, const uint8_t *_src, int src_stride, int width, int height)
+{
+    int x, y;
+    int16_t *src = (int16_t *)_src;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x+=4) {
+            dst[x  ] = av_clip_uint8(src[x  ] + 128);
+            dst[x+1] = av_clip_uint8(src[x+1] + 128);
+            dst[x+2] = av_clip_uint8(src[x+2] + 128);
+            dst[x+3] = av_clip_uint8(src[x+3] + 128);
+        }
+        dst += dst_stride;
+        src += src_stride >> 1;
+    }
+}
+
+#define PUT_SIGNED_RECT_CLAMPED(PX)                                                                     \
+static void put_signed_rect_clamped_ ## PX ## bit_c(uint8_t *_dst, int dst_stride, const uint8_t *_src, \
+                                                  int src_stride, int width, int height)                \
+{                                                                                                       \
+    int x, y;                                                                                           \
+    uint16_t *dst = (uint16_t *)_dst;                                                                   \
+    int32_t *src = (int32_t *)_src;                                                                     \
+    for (y = 0; y < height; y++) {                                                                      \
+        for (x = 0; x < width; x+=4) {                                                                  \
+            dst[x  ] = av_clip_uintp2(src[x  ] + (1U << (PX - 1)), PX);                                  \
+            dst[x+1] = av_clip_uintp2(src[x+1] + (1U << (PX - 1)), PX);                                  \
+            dst[x+2] = av_clip_uintp2(src[x+2] + (1U << (PX - 1)), PX);                                  \
+            dst[x+3] = av_clip_uintp2(src[x+3] + (1U << (PX - 1)), PX);                                  \
+        }                                                                                               \
+        dst += dst_stride >> 1;                                                                         \
+        src += src_stride >> 2;                                                                         \
+    }                                                                                                   \
+}
+
+PUT_SIGNED_RECT_CLAMPED(10)
+PUT_SIGNED_RECT_CLAMPED(12)
+
+static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
+                               const int16_t *idwt, int idwt_stride,
+                               int width, int height)
+{
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x+=2) {
+            dst[x  ] = av_clip_uint8(((src[x  ]+32)>>6) + idwt[x  ]);
+            dst[x+1] = av_clip_uint8(((src[x+1]+32)>>6) + idwt[x+1]);
+        }
+        dst += stride;
+        src += stride;
+        idwt += idwt_stride;
+    }
+}
+
+#define DEQUANT_SUBBAND(PX)                                                                \
+static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride,     \
+                                         const int qf, const int qs, int tot_v, int tot_h) \
+{                                                                                          \
+    int i, y;                                                                              \
+    for (y = 0; y < tot_v; y++) {                                                          \
+        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;                                \
+        for (i = 0; i < tot_h; i++) {                                                      \
+            c = *src_r++;                                                                  \
+            sign = FFSIGN(c)*(!!c);                                                        \
+            c = (FFABS(c)*(unsigned)qf + qs) >> 2;                                                   \
+            *dst_r++ = c*sign;                                                             \
+        }                                                                                  \
+        src += tot_h << (sizeof(PX) >> 1);                                                 \
+        dst += stride;                                                                     \
+    }                                                                                      \
+}
+
+DEQUANT_SUBBAND(int16_t)
+DEQUANT_SUBBAND(int32_t)
+
+#define PIXFUNC(PFX, WIDTH)                                             \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][2] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l4_c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][3] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c
+
+av_cold void ff_diracdsp_init(DiracDSPContext *c)
+{
+    c->dirac_hpel_filter = dirac_hpel_filter;
+    c->add_rect_clamped = add_rect_clamped_c;
+    c->put_signed_rect_clamped[0] = put_signed_rect_clamped_8bit_c;
+    c->put_signed_rect_clamped[1] = put_signed_rect_clamped_10bit_c;
+    c->put_signed_rect_clamped[2] = put_signed_rect_clamped_12bit_c;
+
+    c->add_dirac_obmc[0] = add_obmc8_c;
+    c->add_dirac_obmc[1] = add_obmc16_c;
+    c->add_dirac_obmc[2] = add_obmc32_c;
+
+    c->weight_dirac_pixels_tab[0] = weight_dirac_pixels8_c;
+    c->weight_dirac_pixels_tab[1] = weight_dirac_pixels16_c;
+    c->weight_dirac_pixels_tab[2] = weight_dirac_pixels32_c;
+    c->biweight_dirac_pixels_tab[0] = biweight_dirac_pixels8_c;
+    c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
+    c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
+
+    c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c;
+    c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c;
+
+    PIXFUNC(put, 8);
+    PIXFUNC(put, 16);
+    PIXFUNC(put, 32);
+    PIXFUNC(avg, 8);
+    PIXFUNC(avg, 16);
+    PIXFUNC(avg, 32);
+
+    if (ARCH_X86)
+        ff_diracdsp_init_x86(c);
+}
diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
new file mode 100644
index 0000000..224828d
--- /dev/null
+++ b/libavcodec/diracdsp.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRACDSP_H
+#define AVCODEC_DIRACDSP_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h);
+typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h);
+
+typedef struct {
+    void (*dirac_hpel_filter)(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, const uint8_t *src, int stride, int width, int height);
+    /**
+     * dirac_pixels_tab[width][subpel]
+     * width is 2 for 32, 1 for 16, 0 for 8
+     * subpel is 0 for fpel and hpel (only need to copy from the first plane in src)
+     *           1 if an average of the first 2 planes is needed (TODO: worth it?)
+     *           2 for general qpel (avg of 4)
+     *           3 for general epel (biweight of 4 using the weights in src[4])
+     * src[0-3] is each of the hpel planes
+     * src[4] is the 1/8 pel weights if needed
+     */
+    void (*put_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*avg_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+    void (*put_signed_rect_clamped[3])(uint8_t *dst/*align 16*/, int dst_stride, const uint8_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/);
+    void (*put_rect_clamped)(uint8_t *dst/*align 16*/, int dst_stride, const uint8_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/);
+    void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/);
+    void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
+    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
+    dirac_weight_func weight_dirac_pixels_tab[3];
+    dirac_biweight_func biweight_dirac_pixels_tab[3];
+} DiracDSPContext;
+
+#define DECL_DIRAC_PIXOP(PFX, EXT)                                      \
+    void ff_ ## PFX ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
+    void ff_ ## PFX ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
+    void ff_ ## PFX ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+
+DECL_DIRAC_PIXOP(put, c);
+DECL_DIRAC_PIXOP(avg, c);
+DECL_DIRAC_PIXOP(put, l2_c);
+DECL_DIRAC_PIXOP(avg, l2_c);
+DECL_DIRAC_PIXOP(put, l4_c);
+DECL_DIRAC_PIXOP(avg, l4_c);
+
+void ff_diracdsp_init(DiracDSPContext *c);
+void ff_diracdsp_init_x86(DiracDSPContext* c);
+
+#endif /* AVCODEC_DIRACDSP_H */
diff --git a/libavcodec/diractab.c b/libavcodec/diractab.c
new file mode 100644
index 0000000..816b939
--- /dev/null
+++ b/libavcodec/diractab.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author    (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "diractab.h"
+
+const uint8_t ff_dirac_default_qmat[7][4][4] = {
+    { { 5,  3,  3,  0}, { 0,  4,  4,  1}, { 0,  5,  5,  2}, { 0,  6,  6,  3} },
+    { { 4,  2,  2,  0}, { 0,  4,  4,  2}, { 0,  5,  5,  3}, { 0,  7,  7,  5} },
+    { { 5,  3,  3,  0}, { 0,  4,  4,  1}, { 0,  5,  5,  2}, { 0,  6,  6,  3} },
+    { { 8,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0} },
+    { { 8,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0} },
+    { { 0,  4,  4,  8}, { 0,  8,  8, 12}, { 0, 13, 13, 17}, { 0, 17, 17, 21} },
+    { { 3,  1,  1,  0}, { 0,  4,  4,  2}, { 0,  6,  6,  5}, { 0,  9,  9,  7} },
+};
+
+const int32_t ff_dirac_qscale_tab[116] = {
+    4,         5,         6,         7,         8,        10,        11,        13,
+    16,        19,        23,        27,        32,        38,        45,        54,
+    64,        76,        91,       108,       128,       152,       181,       215,
+    256,       304,       362,       431,       512,       609,       724,       861,
+    1024,      1218,      1448,      1722,      2048,      2435,      2896,      3444,
+    4096,      4871,      5793,      6889,      8192,      9742,     11585,     13777,
+    16384,     19484,     23170,     27554,     32768,     38968,     46341,     55109,
+    65536,     77936,     92682,    110218,    131072,    155872,    185364,    220436,
+    262144,    311744,    370728,    440872,    524288,    623487,    741455,    881744,
+    1048576,   1246974,   1482910,   1763488,   2097152,   2493948,   2965821,   3526975,
+    4194304,   4987896,   5931642,   7053950,   8388608,   9975792,  11863283,  14107901,
+    16777216,  19951585,  23726566,  28215802,  33554432,  39903169,  47453133,  56431603,
+    67108864,  79806339,  94906266, 112863206, 134217728, 159612677, 189812531, 225726413,
+    268435456, 319225354, 379625062, 451452825, 536870912, 638450708, 759250125, 902905651,
+    1073741824,1276901417,1518500250,1805811301,/*2147483648,2553802834,3037000500,3611622603,
+    4294967296*/
+};
+
+const int32_t ff_dirac_qoffset_intra_tab[120] = {
+    1,         2,         3,         4,         4,         5,         6,         7,
+    8,        10,        12,        14,        16,        19,        23,        27,
+    32,        38,        46,        54,        64,        76,        91,       108,
+    128,       152,       181,       216,       256,       305,       362,       431,
+    512,       609,       724,       861,      1024,      1218,      1448,      1722,
+    2048,      2436,      2897,      3445,      4096,      4871,      5793,      6889,
+    8192,      9742,     11585,     13777,     16384,     19484,     23171,     27555,
+    32768,     38968,     46341,     55109,     65536,     77936,     92682,    110218,
+    131072,    155872,    185364,    220436,    262144,    311744,    370728,    440872,
+    524288,    623487,    741455,    881744,   1048576,   1246974,   1482911,   1763488,
+    2097152,   2493948,   2965821,   3526975,   4194304,   4987896,   5931642,   7053951,
+    8388608,   9975793,  11863283,  14107901,  16777216,  19951585,  23726567,  28215802,
+    33554432,  39903170,  47453133,  56431603,  67108864,  79806339,  94906266, 112863207,
+    134217728, 159612677, 189812531, 225726413, 268435456, 319225354, 379625063, 451452826,
+    536870912, 638450709, 759250125, 902905651,1073741824,1276901417,1518500250,1805811302,
+    /*2147483648, 2553802834, 3037000500, 3611622603, 4294967296,*/
+};
+
+const int ff_dirac_qoffset_inter_tab[122] = {
+    1,         2,         2,         3,         3,         4,         4,         5,
+    6,         7,         9,        10,        12,        14,        17,        20,
+    24,        29,        34,        41,        48,        57,        68,        81,
+    96,       114,       136,       162,       192,       228,       272,       323,
+    384,       457,       543,       646,       768,       913,      1086,      1292,
+    1536,      1827,      2172,      2583,      3072,      3653,      4344,      5166,
+    6144,      7307,      8689,     10333,     12288,     14613,     17378,     20666,
+    24576,     29226,     34756,     41332,     49152,     58452,     69512,     82664,
+    98304,    116904,    139023,    165327,    196608,    233808,    278046,    330654,
+    393216,    467615,    556091,    661308,    786432,    935231,   1112183,   1322616,
+    1572864,   1870461,   2224366,   2645231,   3145728,   3740922,   4448731,   5290463,
+    6291456,   7481844,   8897462,  10580926,  12582912,  14963688,  17794925,  21161851,
+    25165824,  29927377,  35589850,  42323702,  50331648,  59854754,  71179699,  84647405,
+    100663296, 119709508, 142359398, 169294809, 201326592, 239419016, 284718797, 338589619,
+    402653184, 478838031, 569437594, 677179238, 805306368, 957676063,1138875188,1354358476,
+    1610612736, 1915352125, /*2277750375, 2708716952, 3221225472, 3830704250,*/
+};
diff --git a/libavcodec/diractab.h b/libavcodec/diractab.h
new file mode 100644
index 0000000..2423b07
--- /dev/null
+++ b/libavcodec/diractab.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author    (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRACTAB_H
+#define AVCODEC_DIRACTAB_H
+
+#include <stdint.h>
+
+/* Tables here are shared between the Dirac/VC-2 decoder and the VC-2 encoder */
+
+/* Default quantization tables for each wavelet transform */
+extern const uint8_t ff_dirac_default_qmat[7][4][4];
+
+/* Scaling factors needed for quantization/dequantization */
+extern const int32_t ff_dirac_qscale_tab[116];
+
+/* Scaling offsets needed for quantization/dequantization, for intra frames */
+extern const int32_t ff_dirac_qoffset_intra_tab[120];
+
+/* Scaling offsets needed for quantization/dequantization, for inter frames */
+extern const int ff_dirac_qoffset_inter_tab[122];
+
+#define DIRAC_MAX_QUANT_INDEX (FF_ARRAY_ELEMS(ff_dirac_qscale_tab))
+
+#endif /* AVCODEC_DIRACTAB_H */
diff --git a/libavcodec/dnxhd_parser.c b/libavcodec/dnxhd_parser.c
index 0de3561..7c16e25 100644
--- a/libavcodec/dnxhd_parser.c
+++ b/libavcodec/dnxhd_parser.c
@@ -2,20 +2,20 @@
  * DNxHD/VC-3 parser
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,19 @@
  */
 
 #include "parser.h"
+#include "dnxhddata.h"
 
-#define DNXHD_HEADER_PREFIX 0x0000028001
+typedef struct {
+    ParseContext pc;
+    int cur_byte;
+    int remaining;
+    int w, h;
+} DNXHDParserContext;
 
-static int dnxhd_find_frame_end(ParseContext *pc,
+static int dnxhd_find_frame_end(DNXHDParserContext *dctx,
                                 const uint8_t *buf, int buf_size)
 {
+    ParseContext *pc = &dctx->pc;
     uint64_t state = pc->state64;
     int pic_found = pc->frame_start_found;
     int i = 0;
@@ -38,25 +45,66 @@ static int dnxhd_find_frame_end(ParseContext *pc,
     if (!pic_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state & 0xffffffffffLL) == DNXHD_HEADER_PREFIX) {
+            if (ff_dnxhd_check_header_prefix(state & 0xffffffffff00LL) != 0) {
                 i++;
                 pic_found = 1;
+                dctx->cur_byte = 0;
+                dctx->remaining = 0;
                 break;
             }
         }
     }
 
-    if (pic_found) {
+    if (pic_found && !dctx->remaining) {
         if (!buf_size) /* EOF considered as end of frame */
             return 0;
         for (; i < buf_size; i++) {
+            dctx->cur_byte++;
             state = (state << 8) | buf[i];
-            if ((state & 0xffffffffffLL) == DNXHD_HEADER_PREFIX) {
-                pc->frame_start_found = 0;
-                pc->state64 = -1;
-                return i - 4;
+
+            if (dctx->cur_byte == 24) {
+                dctx->h = (state >> 32) & 0xFFFF;
+            } else if (dctx->cur_byte == 26) {
+                dctx->w = (state >> 32) & 0xFFFF;
+            } else if (dctx->cur_byte == 42) {
+                int cid = (state >> 32) & 0xFFFFFFFF;
+                int remaining;
+
+                if (cid <= 0)
+                    continue;
+
+                remaining = avpriv_dnxhd_get_frame_size(cid);
+                if (remaining <= 0) {
+                    remaining = avpriv_dnxhd_get_hr_frame_size(cid, dctx->w, dctx->h);
+                    if (remaining <= 0)
+                        continue;
+                }
+                dctx->remaining = remaining;
+                if (buf_size - i + 47 >= dctx->remaining) {
+                    int remaining = dctx->remaining;
+
+                    pc->frame_start_found = 0;
+                    pc->state64 = -1;
+                    dctx->cur_byte = 0;
+                    dctx->remaining = 0;
+                    return remaining;
+                } else {
+                    dctx->remaining -= buf_size;
+                }
             }
         }
+    } else if (pic_found) {
+        if (dctx->remaining > buf_size) {
+            dctx->remaining -= buf_size;
+        } else {
+            int remaining = dctx->remaining;
+
+            pc->frame_start_found = 0;
+            pc->state64 = -1;
+            dctx->cur_byte = 0;
+            dctx->remaining = 0;
+            return remaining;
+        }
     }
     pc->frame_start_found = pic_found;
     pc->state64 = state;
@@ -68,13 +116,14 @@ static int dnxhd_parse(AVCodecParserContext *s,
                        const uint8_t **poutbuf, int *poutbuf_size,
                        const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    DNXHDParserContext *dctx = s->priv_data;
+    ParseContext *pc = &dctx->pc;
     int next;
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         next = buf_size;
     } else {
-        next = dnxhd_find_frame_end(pc, buf, buf_size);
+        next = dnxhd_find_frame_end(dctx, buf, buf_size);
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf      = NULL;
             *poutbuf_size = 0;
@@ -88,7 +137,7 @@ static int dnxhd_parse(AVCodecParserContext *s,
 
 AVCodecParser ff_dnxhd_parser = {
     .codec_ids      = { AV_CODEC_ID_DNXHD },
-    .priv_data_size = sizeof(ParseContext),
+    .priv_data_size = sizeof(DNXHDParserContext),
     .parser_parse   = dnxhd_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/dnxhddata.c b/libavcodec/dnxhddata.c
index 55272e9..154be89 100644
--- a/libavcodec/dnxhddata.c
+++ b/libavcodec/dnxhddata.c
@@ -2,20 +2,20 @@
  * VC3/DNxHD data.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 
 /* The quantization tables below are in zigzag order! */
 
-/* Used in CID 1235, 1256 */
+/* Used in CID 1235, 1256, 1270 */
 static const uint8_t dnxhd_1235_luma_weight[] = {
      0, 32, 32, 32, 33, 32, 32, 32,
     32, 31, 32, 33, 33, 33, 33, 35,
@@ -49,7 +49,7 @@ static const uint8_t dnxhd_1235_chroma_weight[] = {
     90, 90, 85, 79, 73, 73, 73, 73,
 };
 
-/* Used in CID 1237, 1253, 1259 */
+/* Used in CID 1237, 1253, 1259, 1273, 1274 */
 static const uint8_t dnxhd_1237_luma_weight[] = {
      0,  32,  33,  34, 34, 36, 37, 36,
     36,  37,  38,  38, 38, 39, 41, 44,
@@ -61,7 +61,7 @@ static const uint8_t dnxhd_1237_luma_weight[] = {
     97, 100, 104, 102, 98, 98, 99, 99,
 };
 
-/* Used in CID 1237, 1253, 1259 */
+/* Used in CID 1237, 1253, 1259, 1273, 1274 */
 static const uint8_t dnxhd_1237_chroma_weight[] = {
      0,  32,  36,  39, 39, 38, 39,  41,
     45,  51,  57,  58, 53, 48, 47,  51,
@@ -73,6 +73,7 @@ static const uint8_t dnxhd_1237_chroma_weight[] = {
     97, 100, 104, 102, 98, 98, 99,  99,
 };
 
+/* Used in CID 1238, 1272 */
 static const uint8_t dnxhd_1238_luma_weight[] = {
      0, 32, 32, 33, 34, 33, 33, 33,
     33, 33, 33, 33, 33, 35, 37, 37,
@@ -84,6 +85,7 @@ static const uint8_t dnxhd_1238_luma_weight[] = {
     51, 53, 55, 57, 58, 59, 57, 57,
 };
 
+/* Used in CID 1238, 1272 */
 static const uint8_t dnxhd_1238_chroma_weight[] = {
      0, 32, 35, 35, 35, 34, 34, 35,
     39, 43, 45, 45, 41, 39, 40, 41,
@@ -95,6 +97,7 @@ static const uint8_t dnxhd_1238_chroma_weight[] = {
     82, 77, 80, 86, 84, 82, 82, 82,
 };
 
+/* Used in CID 1241, 1271 */
 static const uint8_t dnxhd_1241_luma_weight[] = {
      0, 32, 33, 34, 34, 35, 36, 37,
     36, 37, 38, 38, 38, 39, 39, 40,
@@ -106,6 +109,7 @@ static const uint8_t dnxhd_1241_luma_weight[] = {
     48, 46, 47, 48, 48, 49, 49, 49,
 };
 
+/* Used in CID 1241, 1271 */
 static const uint8_t dnxhd_1241_chroma_weight[] = {
      0, 32, 36, 38, 37, 37, 40, 41,
     40, 40, 42, 42, 41, 41, 41, 41,
@@ -229,6 +233,7 @@ static const uint8_t dnxhd_1252_chroma_weight[] = {
     114, 128, 125, 129, 134, 125, 116, 116,
 };
 
+/* Used in CID 1244, 1260 */
 static const uint8_t dnxhd_1260_luma_weight[] = {
      0, 32, 33, 34, 36, 37, 37, 36,
     34, 33, 34, 35, 37, 38, 40, 41,
@@ -240,6 +245,7 @@ static const uint8_t dnxhd_1260_luma_weight[] = {
     52, 53, 53, 50, 50, 54, 54, 54,
 };
 
+/* Used in CID 1244, 1260 */
 static const uint8_t dnxhd_1260_chroma_weight[] = {
      0, 32, 34, 38, 42, 40, 38, 36,
     35, 35, 38, 42, 43, 43, 42, 40,
@@ -251,27 +257,27 @@ static const uint8_t dnxhd_1260_chroma_weight[] = {
     56, 56, 53, 53, 53, 54, 58, 58,
 };
 
-/* Used in CID 1235, 1241, 1250, 1256 */
+/* Used in CID 1235, 1236, 1241, 1250, 1256, 1257, 1270, 1271 */
 static const uint8_t dnxhd_1235_dc_codes[14] = {
     10, 62, 11, 12, 13, 0, 1, 2, 3, 4, 14, 30, 126, 127,
 };
 
-/* Used in CID 1235, 1241, 1250, 1256 */
+/* Used in CID 1235, 1236, 1241, 1250, 1256, 1257, 1270, 1271 */
 static const uint8_t dnxhd_1235_dc_bits[14] = {
     4, 6, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 7, 7,
 };
 
-/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260 */
+/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260, 1272, 1273, 1274 */
 static const uint8_t dnxhd_1237_dc_codes[12] = {
     0, 12, 13, 1, 2, 3, 4, 5, 14, 30, 62, 63,
 };
 
-/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260 */
+/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260, 1272, 1273, 1274 */
 static const uint8_t dnxhd_1237_dc_bits[12] = {
     3, 4, 4, 3, 3, 3, 3, 3, 4, 5, 6, 6,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint16_t dnxhd_1237_ac_codes[257] = {
         0,     1,     4,     5,    12,    26,    27,    56,
        57,    58,    59,   120,   121,   244,   245,   246,
@@ -308,7 +314,7 @@ static const uint16_t dnxhd_1237_ac_codes[257] = {
     65535,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint8_t dnxhd_1237_ac_bits[257] = {
      2,  2,  3,  3,  4,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,
      8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11,
@@ -329,70 +335,44 @@ static const uint8_t dnxhd_1237_ac_bits[257] = {
     16,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_level[257] = {
-     1,  1,  2,  0,  3,  4,  2,  5,  6,  7,  3,  8,  9, 10, 11, 12,
-     4,  5, 13, 14, 15, 16,  6, 17, 18, 19, 20, 21,  7, 22, 23, 24,
-    25, 26, 27,  8,  9, 28, 29, 30, 31, 32, 33, 34, 10, 11, 12, 35,
-    36, 37, 38, 39, 40, 41, 13, 14, 15, 16, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 17, 18, 19, 20, 21, 53, 54, 55, 56, 57, 58,
-    59, 60, 61, 64,  1, 22, 23, 24, 25, 26, 27, 62, 63,  2,  3,  4,
-     5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-    21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
-    37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
-    53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-    0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1238, 1243 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
+static const uint8_t dnxhd_1237_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   0, 0,   7, 0,   9, 0,   5, 2,  11, 0,
+     13, 0,  15, 0,   7, 2,  17, 0,  19, 0,  21, 0,  23, 0,  25, 0,
+      9, 2,  11, 2,  27, 0,  29, 0,  31, 0,  33, 0,  13, 2,  35, 0,
+     37, 0,  39, 0,  41, 0,  43, 0,  15, 2,  45, 0,  47, 0,  49, 0,
+     51, 0,  53, 0,  55, 0,  17, 2,  19, 2,  57, 0,  59, 0,  61, 0,
+     63, 0,  65, 0,  67, 0,  69, 0,  21, 2,  23, 2,  25, 2,  71, 0,
+     73, 0,  75, 0,  77, 0,  79, 0,  81, 0,  83, 0,  27, 2,  29, 2,
+     31, 2,  33, 2,  85, 0,  87, 0,  89, 0,  91, 0,  93, 0,  95, 0,
+     97, 0,  99, 0, 101, 0, 103, 0, 105, 0,  35, 2,  37, 2,  39, 2,
+     41, 2,  43, 2, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0, 117, 0,
+    119, 0, 121, 0, 123, 0, 129, 0,   3, 1,  45, 2,  47, 2,  49, 2,
+     51, 2,  53, 2,  55, 2, 125, 0, 127, 0,   5, 1,   7, 1,   9, 1,
+     11, 1,  13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,
+     27, 1,  29, 1,  31, 1,  33, 1,  35, 1,  37, 1,  39, 1,  41, 1,
+     43, 1,  45, 1,  47, 1,  49, 1,  51, 1,  53, 1,  55, 1,  57, 1,
+     59, 1,  61, 1,  63, 1,  65, 1,  67, 1,  69, 1,  71, 1,  73, 1,
+     75, 1,  77, 1,  79, 1,  81, 1,  83, 1,  85, 1,  87, 1,  89, 1,
+     91, 1,  93, 1,  95, 1,  97, 1,  99, 1, 101, 1, 103, 1, 105, 1,
+    107, 1, 109, 1, 111, 1, 113, 1, 115, 1, 117, 1, 119, 1, 121, 1,
+    123, 1, 125, 1, 127, 1, 129, 1,  57, 2,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
+};
+
+/* Used in CID 1238, 1240, 1243, 1272 */
 static const uint16_t dnxhd_1238_ac_codes[257] = {
         0,     1,     4,    10,    11,    24,    25,    26,
        54,    55,    56,    57,   116,   117,   118,   119,
@@ -429,7 +409,7 @@ static const uint16_t dnxhd_1238_ac_codes[257] = {
     65535,
 };
 
-/* Used in CID 1238, 1243 */
+/* Used in CID 1238, 1240, 1243, 1272 */
 static const uint8_t dnxhd_1238_ac_bits[257] = {
      2,  2,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
      8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
@@ -450,70 +430,44 @@ static const uint8_t dnxhd_1238_ac_bits[257] = {
     16,
 };
 
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6,  7, 23, 24,
-    25, 26, 27, 28, 29,  8,  9, 30, 31, 32, 33, 34, 35, 36, 37, 10,
-    11, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 12, 13, 14, 49,
-    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 15, 16, 17, 18,
-    62, 63, 64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
-    14, 15, 16, 19, 20, 21, 22, 23, 24, 17, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 25,
-    26, 27, 28, 29, 30, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49,
-    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
+/* Used in CID 1238, 1240, 1243, 1272 */
+static const uint8_t dnxhd_1238_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  45, 0,  13, 2,  15, 2,  47, 0,  49, 0,
+     51, 0,  53, 0,  55, 0,  57, 0,  59, 0,  17, 2,  19, 2,  61, 0,
+     63, 0,  65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  21, 2,
+     23, 2,  77, 0,  79, 0,  81, 0,  83, 0,  85, 0,  87, 0,  89, 0,
+     91, 0,  93, 0,  95, 0,  97, 0,  25, 2,  27, 2,  29, 2,  99, 0,
+    101, 0, 103, 0, 105, 0, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0,
+    117, 0, 119, 0, 121, 0, 123, 0,  31, 2,  33, 2,  35, 2,  37, 2,
+    125, 0, 127, 0, 129, 0,   3, 1,   5, 1,   7, 1,   9, 1,  11, 1,
+     13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,  27, 1,
+     29, 1,  31, 1,  33, 1,  39, 2,  41, 2,  43, 2,  45, 2,  47, 2,
+     49, 2,  35, 1,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,  47, 1,
+     49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  61, 1,  63, 1,
+     65, 1,  67, 1,  69, 1,  71, 1,  73, 1,  75, 1,  81, 1,  51, 2,
+     53, 2,  55, 2,  57, 2,  59, 2,  61, 2,  77, 1,  79, 1,  83, 1,
+     85, 1,  87, 1,  89, 1,  91, 1,  93, 1,  95, 1,  97, 1,  99, 1,
+    101, 1, 103, 1, 105, 1, 107, 1, 109, 1, 111, 1, 113, 1, 115, 1,
+    117, 1, 119, 1, 121, 1, 123, 1, 125, 1, 127, 1, 129, 1,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 }; /* 0 is EOB */
 
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1241, 1256 */
+/* Used in CID 1235, 1236, 1241, 1256, 1257, 1270, 1271 */
 static const uint16_t dnxhd_1235_ac_codes[257] = {
         0,     1,     4,    10,    11,    24,    25,    26,
        54,    55,    56,    57,   116,   117,   118,   119,
@@ -550,7 +504,7 @@ static const uint16_t dnxhd_1235_ac_codes[257] = {
     65535,
 };
 
-/* Used in CID 1235, 1241, 1256 */
+/* Used in CID 1235, 1236, 1241, 1256, 1257, 1270, 1271 */
 static const uint8_t dnxhd_1235_ac_bits[257] = {
      2,  2,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
      8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10,
@@ -571,67 +525,41 @@ static const uint8_t dnxhd_1235_ac_bits[257] = {
     16,
 };
 
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21,  6,  7, 22, 23, 24,
-    25, 26, 27, 28, 29,  8,  9, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-    10, 11, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 12, 13,
-    14, 15, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,  1,
-    16, 17, 18, 19, 64,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
-    13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-    40, 41, 42, 25, 26, 27, 28, 29, 30, 31, 32, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+/* Used in CID 1235, 1241, 1256, 1270, 1271 */
+static const uint8_t dnxhd_1235_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  13, 2,  15, 2,  45, 0,  47, 0,  49, 0,
+     51, 0,  53, 0,  55, 0,  57, 0,  59, 0,  17, 2,  19, 2,  61, 0,
+     63, 0,  65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  77, 0,
+     21, 2,  23, 2,  79, 0,  81, 0,  83, 0,  85, 0,  87, 0,  89, 0,
+     91, 0,  93, 0,  95, 0,  97, 0,  99, 0, 101, 0,  25, 2,  27, 2,
+     29, 2,  31, 2, 103, 0, 105, 0, 107, 0, 109, 0, 111, 0, 113, 0,
+    115, 0, 117, 0, 119, 0, 121, 0, 123, 0, 125, 0, 127, 0,   3, 1,
+     33, 2,  35, 2,  37, 2,  39, 2, 129, 0,   5, 1,   7, 1,   9, 1,
+     11, 1,  13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,
+     27, 1,  29, 1,  31, 1,  33, 1,  35, 1,  41, 2,  43, 2,  45, 2,
+     47, 2,  49, 2,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,  47, 1,
+     49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  61, 1,  63, 1,
+     65, 1,  67, 1,  69, 1,  71, 1,  73, 1,  75, 1,  77, 1,  79, 1,
+     81, 1,  83, 1,  85, 1,  51, 2,  53, 2,  55, 2,  57, 2,  59, 2,
+     61, 2,  63, 2,  65, 2,  87, 1,  89, 1,  91, 1,  93, 1,  95, 1,
+     97, 1,  99, 1, 101, 1, 103, 1, 105, 1, 107, 1, 109, 1, 111, 1,
+    113, 1, 115, 1, 117, 1, 119, 1, 121, 1, 123, 1, 125, 1, 127, 1,
+    129, 1,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 };
 
 static const uint16_t dnxhd_1250_ac_codes[257] = {
@@ -688,62 +616,41 @@ static const uint8_t dnxhd_1250_ac_bits[257] = {
     16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
     16
 };
-static const uint8_t dnxhd_1250_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6, 23, 24, 25,
-    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-     9, 10, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 11,
-    12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,  2,
-     3,  4,  5, 14, 15, 16, 17,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 18, 19, 20, 21,
-    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-    43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 22, 23, 24,
-    25, 26, 27, 54, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64
-};
-static const uint8_t dnxhd_1250_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1
-};
-static const uint8_t dnxhd_1250_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1
+
+static const uint8_t dnxhd_1250_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  45, 0,  13, 2,  47, 0,  49, 0,  51, 0,
+     53, 0,  55, 0,  57, 0,  59, 0,  15, 2,  17, 2,  61, 0,  63, 0,
+     65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  77, 0,  79, 0,
+     19, 2,  21, 2,  81, 0,  83, 0,  85, 0,  87, 0,  89, 0,  91, 0,
+     93, 0,  95, 0,  97, 0,  99, 0, 101, 0, 103, 0, 105, 0,  23, 2,
+     25, 2,  27, 2, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0, 117, 0,
+    119, 0, 121, 0, 123, 0, 125, 0, 127, 0, 129, 0,   3, 1,   5, 1,
+      7, 1,   9, 1,  11, 1,  29, 2,  31, 2,  33, 2,  35, 2,  13, 1,
+     15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,  27, 1,  29, 1,
+     31, 1,  33, 1,  35, 1,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,
+     47, 1,  49, 1,  51, 1,  53, 1,  37, 2,  39, 2,  41, 2,  43, 2,
+     55, 1,  57, 1,  59, 1,  61, 1,  63, 1,  65, 1,  67, 1,  69, 1,
+     71, 1,  73, 1,  75, 1,  77, 1,  79, 1,  81, 1,  83, 1,  85, 1,
+     87, 1,  89, 1,  91, 1,  93, 1,  95, 1,  97, 1,  99, 1, 101, 1,
+    103, 1, 105, 1, 107, 1, 111, 1, 113, 1,  45, 2,  47, 2,  49, 2,
+     51, 2,  53, 2,  55, 2, 109, 1, 115, 1, 117, 1, 119, 1, 121, 1,
+    123, 1, 125, 1, 127, 1, 129, 1,  57, 2,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 };
 
 static const uint16_t dnxhd_1251_ac_codes[257] = {
@@ -802,64 +709,40 @@ static const uint8_t dnxhd_1251_ac_bits[257] = {
     16,
 };
 
-static const uint8_t dnxhd_1251_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21,  6, 22, 23, 24, 25,
-    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-    40,  9, 10, 11, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
-    12, 13, 14, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,
-     2,  3,  4,  5,  6,  7,  8, 15, 16, 17,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 18,
-    19, 20, 21, 22, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
-    42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
-    58, 23, 24, 25, 26, 27, 28, 59, 60, 61, 62, 63, 64, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-static const uint8_t dnxhd_1251_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-static const uint8_t dnxhd_1251_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+static const uint8_t dnxhd_1251_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  13, 2,  45, 0,  47, 0,  49, 0,  51, 0,
+     53, 0,  55, 0,  57, 0,  59, 0,  15, 2,  17, 2,  61, 0,  63, 0,
+     65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  77, 0,  79, 0,
+     81, 0,  19, 2,  21, 2,  23, 2,  83, 0,  85, 0,  87, 0,  89, 0,
+     91, 0,  93, 0,  95, 0,  97, 0,  99, 0, 101, 0, 103, 0, 105, 0,
+     25, 2,  27, 2,  29, 2, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0,
+    117, 0, 119, 0, 121, 0, 123, 0, 125, 0, 127, 0, 129, 0,   3, 1,
+      5, 1,   7, 1,   9, 1,  11, 1,  13, 1,  15, 1,  17, 1,  31, 2,
+     33, 2,  35, 2,  19, 1,  21, 1,  23, 1,  25, 1,  27, 1,  29, 1,
+     31, 1,  33, 1,  35, 1,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,
+     47, 1,  49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  37, 2,
+     39, 2,  41, 2,  43, 2,  45, 2,  61, 1,  63, 1,  65, 1,  67, 1,
+     69, 1,  71, 1,  73, 1,  75, 1,  77, 1,  79, 1,  81, 1,  83, 1,
+     85, 1,  87, 1,  89, 1,  91, 1,  93, 1,  95, 1,  97, 1,  99, 1,
+    101, 1, 103, 1, 105, 1, 107, 1, 109, 1, 111, 1, 113, 1, 115, 1,
+    117, 1,  47, 2,  49, 2,  51, 2,  53, 2,  55, 2,  57, 2, 119, 1,
+    121, 1, 123, 1, 125, 1, 127, 1, 129, 1,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 };
 
 /* Used in CID 1252, 1258 */
@@ -921,69 +804,43 @@ static const uint8_t dnxhd_1252_ac_bits[257] = {
 };
 
 /* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_level[257] = {
-     1,  1,  2,  3,  2,  0,  4,  5,  6,  7,  3,  8,  9, 10, 11, 12,
-    13, 14,  4,  5, 15, 16, 17, 18,  6, 19, 20, 21, 22, 23, 24,  7,
-     8, 25, 26, 27, 28, 29, 30, 31, 32,  9, 10, 33, 34, 35, 36, 37,
-    38, 39, 40, 41, 11, 12, 13, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-    51, 52, 53, 14, 15, 16, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3, 17, 18, 19, 20,  4,  5,  6,  7,  8,  9, 10, 11,
-    12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 22,
-    23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-    39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-    55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_run_flag[257] = {
-    0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-    0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1238, 1241, 1256 */
+static const uint8_t dnxhd_1252_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   7, 0,   5, 2,   0, 0,   9, 0,  11, 0,
+     13, 0,  15, 0,   7, 2,  17, 0,  19, 0,  21, 0,  23, 0,  25, 0,
+     27, 0,  29, 0,   9, 2,  11, 2,  31, 0,  33, 0,  35, 0,  37, 0,
+     13, 2,  39, 0,  41, 0,  43, 0,  45, 0,  47, 0,  49, 0,  15, 2,
+     17, 2,  51, 0,  53, 0,  55, 0,  57, 0,  59, 0,  61, 0,  63, 0,
+     65, 0,  19, 2,  21, 2,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,
+     77, 0,  79, 0,  81, 0,  83, 0,  23, 2,  25, 2,  27, 2,  85, 0,
+     87, 0,  89, 0,  91, 0,  93, 0,  95, 0,  97, 0,  99, 0, 101, 0,
+    103, 0, 105, 0, 107, 0,  29, 2,  31, 2,  33, 2, 109, 0, 111, 0,
+    113, 0, 115, 0, 117, 0, 119, 0, 121, 0, 123, 0, 125, 0, 127, 0,
+    129, 0,   3, 1,   5, 1,   7, 1,  35, 2,  37, 2,  39, 2,  41, 2,
+      9, 1,  11, 1,  13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,
+     25, 1,  27, 1,  29, 1,  31, 1,  33, 1,  35, 1,  37, 1,  39, 1,
+     41, 1,  43, 1,  43, 2,  45, 2,  47, 2,  49, 2,  51, 2,  45, 1,
+     47, 1,  49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  61, 1,
+     63, 1,  65, 1,  67, 1,  69, 1,  71, 1,  73, 1,  75, 1,  77, 1,
+     79, 1,  81, 1,  83, 1,  85, 1,  87, 1,  89, 1,  91, 1,  93, 1,
+     95, 1,  97, 1,  99, 1, 101, 1, 103, 1, 105, 1, 107, 1, 109, 1,
+    111, 1, 113, 1, 115, 1, 117, 1, 119, 1, 121, 1, 123, 1, 125, 1,
+    127, 1, 129, 1,  53, 2,  55, 2,  57, 2,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
+};
+
+/* Used in CID 1235, 1238, 1241, 1243, 1256, 1270, 1271, 1272 */
 static const uint16_t dnxhd_1235_run_codes[62] = {
        0,    4,   10,   11,   24,   25,   26,   27,
       56,   57,   58,   59,  120,  242,  486,  487,
@@ -995,7 +852,7 @@ static const uint16_t dnxhd_1235_run_codes[62] = {
     1018, 1019, 1020, 1021, 1022, 1023,
 };
 
-/* Used in CID 1235, 1238, 1241, 1243, 1256 */
+/* Used in CID 1235, 1238, 1241, 1243, 1256, 1270, 1271, 1272 */
 static const uint8_t dnxhd_1235_run_bits[62] = {
      1,  3,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  8,  9,  9,
      9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
@@ -1003,7 +860,7 @@ static const uint8_t dnxhd_1235_run_bits[62] = {
     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
 };
 
-/* Used in CID 1235, 1241, 1256 */
+/* Used in CID 1235, 1241, 1256, 1270, 1271 */
 static const uint8_t dnxhd_1235_run[62] = {
      1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
     18, 20, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
@@ -1011,7 +868,7 @@ static const uint8_t dnxhd_1235_run[62] = {
     49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint16_t dnxhd_1237_run_codes[62] = {
        0,    4,   10,   11,   24,   25,   26,   54,
       55,   56,   57,   58,  118,  119,  240,  482,
@@ -1023,7 +880,7 @@ static const uint16_t dnxhd_1237_run_codes[62] = {
     1018, 1019, 1020, 1021, 1022, 1023,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint8_t dnxhd_1237_run_bits[62] = {
      1,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  8,  9,
      9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
@@ -1031,7 +888,7 @@ static const uint8_t dnxhd_1237_run_bits[62] = {
     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint8_t dnxhd_1237_run[62] = {
      1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
     17, 18, 19, 20, 21, 53, 57, 58, 59, 60, 61, 62, 22, 23, 24, 25,
@@ -1039,6 +896,7 @@ static const uint8_t dnxhd_1237_run[62] = {
     42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56,
 };
 
+/* Used in CID 1238, 1243, 1272 */
 static const uint8_t dnxhd_1238_run[62] = {
      1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
     20, 21, 17, 18, 19, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
@@ -1075,104 +933,146 @@ static const uint8_t dnxhd_1250_run[62] = {
 };
 
 const CIDEntry ff_dnxhd_cid_table[] = {
-    { 1235, 1920, 1080, 0, 917504, 917504, 6, 10,
+    { 1235, 1920, 1080, 917504, 917504,
+      0, 6, 10, 4,
       dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
       { 175, 185, 365, 440 } },
-    { 1237, 1920, 1080, 0, 606208, 606208, 4, 8,
+    { 1237, 1920, 1080, 606208, 606208,
+      0, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 115, 120, 145, 240, 290 } },
-    { 1238, 1920, 1080, 0, 917504, 917504, 4, 8,
+    { 1238, 1920, 1080, 917504, 917504,
+      0, 4, 8, 4,
       dnxhd_1238_luma_weight, dnxhd_1238_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
-      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
+      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
       { 175, 185, 220, 365, 440 } },
-    { 1241, 1920, 1080, 1, 917504, 458752, 6, 10,
+    { 1241, 1920, 1080, 917504, 458752,
+      DNXHD_INTERLACED, 6, 10, 4,
       dnxhd_1241_luma_weight, dnxhd_1241_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
       { 185, 220 } },
-    { 1242, 1920, 1080, 1, 606208, 303104, 4, 8,
+    { 1242, 1920, 1080, 606208, 303104,
+      DNXHD_INTERLACED, 4, 8, 3,
       dnxhd_1242_luma_weight, dnxhd_1242_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 120, 145 } },
-    { 1243, 1920, 1080, 1, 917504, 458752, 4, 8,
+    { 1243, 1920, 1080, 917504, 458752,
+      DNXHD_INTERLACED, 4, 8, 4,
       dnxhd_1243_luma_weight, dnxhd_1243_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
-      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
+      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
       { 185, 220 } },
-    { 1250, 1280,  720, 0, 458752, 458752, 6, 10,
+    { 1244, 1440, 1080, 606208, 303104,
+      DNXHD_INTERLACED, 4, 8, 3,
+      dnxhd_1260_luma_weight, dnxhd_1260_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
+      dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
+      { 120, 145 } },
+    { 1250, 1280,  720, 458752, 458752,
+      0, 6, 10, 4,
       dnxhd_1250_luma_weight, dnxhd_1250_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1250_ac_codes, dnxhd_1250_ac_bits, dnxhd_1250_ac_level,
-      dnxhd_1250_ac_run_flag, dnxhd_1250_ac_index_flag,
+      dnxhd_1250_ac_codes, dnxhd_1250_ac_bits, dnxhd_1250_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
       { 90, 180, 220 } },
-    { 1251, 1280,  720, 0, 458752, 458752, 4, 8,
+    { 1251, 1280,  720, 458752, 458752,
+      0, 4, 8, 4,
       dnxhd_1251_luma_weight, dnxhd_1251_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1251_ac_codes, dnxhd_1251_ac_bits, dnxhd_1251_ac_level,
-      dnxhd_1251_ac_run_flag, dnxhd_1251_ac_index_flag,
+      dnxhd_1251_ac_codes, dnxhd_1251_ac_bits, dnxhd_1251_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 90, 110, 175, 220 } },
-    { 1252, 1280,  720, 0, 303104, 303104, 4, 8,
+      { 90, 110, 180, 220 } },
+    { 1252, 1280,  720, 303104, 303104,
+      0, 4, 8, 5,
       dnxhd_1252_luma_weight, dnxhd_1252_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_level,
-      dnxhd_1252_ac_run_flag, dnxhd_1252_ac_index_flag,
+      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 60, 75, 115, 145 } },
-    { 1253, 1920, 1080, 0, 188416, 188416, 4, 8,
+      { 60, 75, 120, 145 } },
+    { 1253, 1920, 1080, 188416, 188416,
+      0, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 36, 45, 75, 90 } },
-    { 1256, 1920, 1080, 0, 1835008, 1835008, 6, 10,
-      dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
+    { 1256, 1920, 1080, 1835008, 1835008,
+      DNXHD_444, 6, 10, 4,
+      dnxhd_1235_luma_weight, dnxhd_1235_luma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
       { 350, 390, 440, 730, 880 } },
-    { 1258, 960, 720, 0, 212992, 212992, 4, 8,
+    { 1258, 960, 720, 212992, 212992,
+      0, 4, 8, 5,
       dnxhd_1252_luma_weight, dnxhd_1252_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_level,
-      dnxhd_1252_ac_run_flag, dnxhd_1252_ac_index_flag,
+      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
       { 42, 60, 75, 115 } },
-    { 1259, 1440, 1080, 0, 417792, 417792, 4, 8,
+    { 1259, 1440, 1080, 417792, 417792,
+      0, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 63, 84, 100, 110 } },
-    { 1260, 1440, 1080, 1, 835584, 417792, 4, 8,
+    { 1260, 1440, 1080, 835584, 417792,
+      DNXHD_INTERLACED | DNXHD_MBAFF, 4, 8, 3,
       dnxhd_1260_luma_weight, dnxhd_1260_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 80, 90, 100, 110 } },
+    { 1270, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      DNXHD_444, 6, DNXHD_VARIABLE, 4,
+      dnxhd_1235_luma_weight, dnxhd_1235_luma_weight,
+      dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
+      dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
+      { 0 }, { 57344, 255} },
+    { 1271, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 6, DNXHD_VARIABLE, 4,
+      dnxhd_1241_luma_weight, dnxhd_1241_chroma_weight,
+      dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
+      dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
+      { 0 }, { 28672, 255} },
+    { 1272, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 4, 8, 4,
+      dnxhd_1238_luma_weight, dnxhd_1238_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_info,
+      dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
+      { 0 }, { 28672, 255} },
+    { 1273, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 4, 8, 3,
+      dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
+      dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
+      { 0 }, { 18944, 255} },
+    { 1274, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 4, 8, 3,
+      dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
+      dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
+      { 0 }, { 5888, 255} },
 };
 
 int ff_dnxhd_get_cid_table(int cid)
@@ -1184,18 +1084,74 @@ int ff_dnxhd_get_cid_table(int cid)
     return -1;
 }
 
+int avpriv_dnxhd_get_frame_size(int cid)
+{
+    int i = ff_dnxhd_get_cid_table(cid);
+    if (i<0)
+        return i;
+    return ff_dnxhd_cid_table[i].frame_size;
+}
+
+int avpriv_dnxhd_get_hr_frame_size(int cid, int w, int h)
+{
+    int result, i = ff_dnxhd_get_cid_table(cid);
+
+    if (i < 0)
+        return i;
+
+    result = ((h + 15) / 16) * ((w + 15) / 16) * (int64_t)ff_dnxhd_cid_table[i].packet_scale.num / ff_dnxhd_cid_table[i].packet_scale.den;
+    result = (result + 2048) / 4096 * 4096;
+
+    return FFMAX(result, 8192);
+}
+
+int avpriv_dnxhd_get_interlaced(int cid)
+{
+    int i = ff_dnxhd_get_cid_table(cid);
+    if (i < 0)
+        return i;
+    return ff_dnxhd_cid_table[i].flags & DNXHD_INTERLACED ? 1 : 0;
+}
+
+static int dnxhd_find_hr_cid(AVCodecContext *avctx)
+{
+    switch (avctx->profile) {
+    case FF_PROFILE_DNXHR_444:
+        return 1270;
+    case FF_PROFILE_DNXHR_HQX:
+        return 1271;
+    case FF_PROFILE_DNXHR_HQ:
+        return 1272;
+    case FF_PROFILE_DNXHR_SQ:
+        return 1273;
+    case FF_PROFILE_DNXHR_LB:
+        return 1274;
+    }
+    return 0;
+}
+
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
 {
     int i, j;
     int mbs = avctx->bit_rate / 1000000;
+
+    if (avctx->profile != FF_PROFILE_DNXHD)
+        return dnxhd_find_hr_cid(avctx);
+
     if (!mbs)
         return 0;
     for (i = 0; i < FF_ARRAY_ELEMS(ff_dnxhd_cid_table); i++) {
         const CIDEntry *cid = &ff_dnxhd_cid_table[i];
+        int interlaced = cid->flags & DNXHD_INTERLACED ? 1 : 0;
         if (cid->width == avctx->width && cid->height == avctx->height &&
-            cid->interlaced == !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) &&
-            cid->bit_depth == bit_depth) {
-            for (j = 0; j < sizeof(cid->bit_rates); j++) {
+            interlaced == !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) &&
+            !(cid->flags & DNXHD_444) && cid->bit_depth == bit_depth) {
+            if (avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL &&
+                cid->flags & DNXHD_MBAFF) {
+                av_log(avctx, AV_LOG_WARNING, "Profile selected is experimental\n");
+                continue;
+            }
+            for (j = 0; j < FF_ARRAY_ELEMS(cid->bit_rates); j++) {
                 if (cid->bit_rates[j] == mbs)
                     return cid->cid;
             }
@@ -1204,25 +1160,18 @@ int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
     return 0;
 }
 
-void ff_dnxhd_list_cid(AVCodecContext *avctx)
+void ff_dnxhd_print_profiles(AVCodecContext *avctx, int loglevel)
 {
     int i, j;
-
     for (i = 0; i < FF_ARRAY_ELEMS(ff_dnxhd_cid_table); i++) {
         const CIDEntry *cid = &ff_dnxhd_cid_table[i];
-        av_log(avctx, AV_LOG_INFO,
-               "cid %d %ux%u %dbits %s bit rates",
-               cid->cid,
-               cid->width, cid->height,
-               cid->bit_depth,
-               cid->interlaced ? "interlaced " :
-                                 "progressive");
         for (j = 0; j < FF_ARRAY_ELEMS(cid->bit_rates); j++) {
             if (!cid->bit_rates[j])
                 break;
-            av_log(avctx, AV_LOG_INFO, " %dM",
-                   cid->bit_rates[j]);
+
+            av_log(avctx, loglevel, "Frame size: %dx%d%c; bitrate: %dMbps; pixel format: %s\n",
+                   cid->width, cid->height, cid->flags & DNXHD_INTERLACED ? 'i' : 'p', cid->bit_rates[j],
+                   cid->flags & DNXHD_444 ? "yuv444p10, gbrp10" : cid->bit_depth == 10 ? "yuv422p10" : "yuv422p");
         }
-        av_log(avctx, AV_LOG_INFO, "\n");
     }
 }
diff --git a/libavcodec/dnxhddata.h b/libavcodec/dnxhddata.h
index cc877b6..cfa6b0c 100644
--- a/libavcodec/dnxhddata.h
+++ b/libavcodec/dnxhddata.h
@@ -2,20 +2,20 @@
  * VC3/DNxHD decoder.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,29 +24,74 @@
 
 #include <stdint.h>
 #include "avcodec.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+
+/** Additional profile info flags */
+#define DNXHD_INTERLACED   (1<<0)
+#define DNXHD_MBAFF        (1<<1)
+#define DNXHD_444          (1<<2)
+
+/** Frame headers, extra 0x00 added to end for parser */
+#define DNXHD_HEADER_INITIAL 0x000002800100
+#define DNXHD_HEADER_444     0x000002800200
+
+/** Indicate that a CIDEntry value must be read in the bitstream */
+#define DNXHD_VARIABLE 0
 
 typedef struct CIDEntry {
     int cid;
     unsigned int width, height;
-    int interlaced;
     unsigned int frame_size;
     unsigned int coding_unit_size;
+    uint16_t flags;
     int index_bits;
     int bit_depth;
+    int eob_index;
     const uint8_t *luma_weight, *chroma_weight;
     const uint8_t *dc_codes, *dc_bits;
     const uint16_t *ac_codes;
-    const uint8_t *ac_bits, *ac_level;
-    const uint8_t *ac_run_flag, *ac_index_flag;
+    const uint8_t *ac_bits, *ac_info;
     const uint16_t *run_codes;
     const uint8_t *run_bits, *run;
     int bit_rates[5]; ///< Helper to choose variants, rounded to nearest 5Mb/s
+    AVRational packet_scale;
 } CIDEntry;
 
 extern const CIDEntry ff_dnxhd_cid_table[];
 
 int ff_dnxhd_get_cid_table(int cid);
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth);
-void ff_dnxhd_list_cid(AVCodecContext *avctx);
+void ff_dnxhd_print_profiles(AVCodecContext *avctx, int loglevel);
+
+static av_always_inline uint64_t ff_dnxhd_check_header_prefix_hr(uint64_t prefix)
+{
+    uint64_t data_offset = prefix >> 16;
+    if ((prefix & 0xFFFF0000FFFFLL) == 0x0300 &&
+         data_offset >= 0x0280 && data_offset <= 0x2170 &&
+         (data_offset & 3) == 0)
+        return prefix;
+    return 0;
+}
+
+static av_always_inline uint64_t ff_dnxhd_check_header_prefix(uint64_t prefix)
+{
+    if (prefix == DNXHD_HEADER_INITIAL ||
+        prefix == DNXHD_HEADER_444     ||
+        ff_dnxhd_check_header_prefix_hr(prefix))
+        return prefix;
+    return 0;
+}
+
+static av_always_inline uint64_t ff_dnxhd_parse_header_prefix(const uint8_t *buf)
+{
+    uint64_t prefix = AV_RB32(buf);
+    prefix = (prefix << 16) | buf[4] << 8;
+    return ff_dnxhd_check_header_prefix(prefix);
+}
+
+int avpriv_dnxhd_get_frame_size(int cid);
+int avpriv_dnxhd_get_hr_frame_size(int cid, int w, int h);
+int avpriv_dnxhd_get_interlaced(int cid);
 
 #endif /* AVCODEC_DNXHDDATA_H */
diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c
index fd58795..1e95086 100644
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -2,23 +2,25 @@
  * VC3/DNxHD decoder.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  * Copyright (c) 2011 MirriAd Ltd
+ * Copyright (c) 2015 Christophe Gisquet
  *
  * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
+ * Slice multithreading and MB interlaced support added by Christophe Gisquet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,63 +28,103 @@
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#define  UNCHECKED_BITSTREAM_READER 1
 #include "get_bits.h"
 #include "dnxhddata.h"
 #include "idctdsp.h"
 #include "internal.h"
+#include "profiles.h"
 #include "thread.h"
 
+typedef struct RowContext {
+    DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
+    int luma_scale[64];
+    int chroma_scale[64];
+    GetBitContext gb;
+    int last_dc[3];
+    int last_qscale;
+    int errors;
+    /** -1:not set yet  0:off=RGB  1:on=YUV  2:variable */
+    int format;
+} RowContext;
+
 typedef struct DNXHDContext {
     AVCodecContext *avctx;
-    GetBitContext gb;
+    RowContext *rows;
     BlockDSPContext bdsp;
-    int cid;                            ///< compression id
+    const uint8_t* buf;
+    int buf_size;
+    int64_t cid;                        ///< compression id
     unsigned int width, height;
+    enum AVPixelFormat pix_fmt;
     unsigned int mb_width, mb_height;
-    uint32_t mb_scan_index[68];         /* max for 1080p */
+    uint32_t mb_scan_index[512];
+    int data_offset;                    // End of mb_scan_index, where macroblocks start
     int cur_field;                      ///< current interlaced field
     VLC ac_vlc, dc_vlc, run_vlc;
-    int last_dc[3];
     IDCTDSPContext idsp;
-    DECLARE_ALIGNED(16, int16_t, blocks)[12][64];
     ScanTable scantable;
     const CIDEntry *cid_table;
-    int bit_depth; // 8, 10 or 0 if not initialized at all.
+    int bit_depth; // 8, 10, 12 or 0 if not initialized at all.
     int is_444;
+    int alpha;
+    int lla;
     int mbaff;
-    void (*decode_dct_block)(struct DNXHDContext *ctx, int16_t *block,
-                             int n, int qscale);
+    int act;
+    int (*decode_dct_block)(const struct DNXHDContext *ctx,
+                            RowContext *row, int n);
 } DNXHDContext;
 
 #define DNXHD_VLC_BITS 9
 #define DNXHD_DC_VLC_BITS 7
 
-static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block,
-                                     int n, int qscale);
-static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block,
-                                      int n, int qscale);
-static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block,
-                                          int n, int qscale);
+static int dnxhd_decode_dct_block_8(const DNXHDContext *ctx,
+                                    RowContext *row, int n);
+static int dnxhd_decode_dct_block_10(const DNXHDContext *ctx,
+                                     RowContext *row, int n);
+static int dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n);
+static int dnxhd_decode_dct_block_12(const DNXHDContext *ctx,
+                                     RowContext *row, int n);
+static int dnxhd_decode_dct_block_12_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n);
 
 static av_cold int dnxhd_decode_init(AVCodecContext *avctx)
 {
     DNXHDContext *ctx = avctx->priv_data;
 
     ctx->avctx = avctx;
+    ctx->cid = -1;
+    if (avctx->colorspace == AVCOL_SPC_UNSPECIFIED) {
+        avctx->colorspace = AVCOL_SPC_BT709;
+    }
+
+    avctx->coded_width  = FFALIGN(avctx->width,  16);
+    avctx->coded_height = FFALIGN(avctx->height, 16);
+
+    ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext));
+    if (!ctx->rows)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
-static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
+static int dnxhd_init_vlc(DNXHDContext *ctx, uint32_t cid, int bitdepth)
 {
     if (cid != ctx->cid) {
         int index;
 
         if ((index = ff_dnxhd_get_cid_table(cid)) < 0) {
-            av_log(ctx->avctx, AV_LOG_ERROR, "unsupported cid %d\n", cid);
+            av_log(ctx->avctx, AV_LOG_ERROR, "unsupported cid %"PRIu32"\n", cid);
             return AVERROR(ENOSYS);
         }
+        if (ff_dnxhd_cid_table[index].bit_depth != bitdepth &&
+            ff_dnxhd_cid_table[index].bit_depth != DNXHD_VARIABLE) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "bit depth mismatches %d %d\n", ff_dnxhd_cid_table[index].bit_depth, bitdepth);
+            return AVERROR_INVALIDDATA;
+        }
         ctx->cid_table = &ff_dnxhd_cid_table[index];
-        av_log(ctx->avctx, AV_LOG_VERBOSE, "Profile cid %d.\n", cid);
+        av_log(ctx->avctx, AV_LOG_VERBOSE, "Profile cid %"PRIu32".\n", cid);
 
         ff_free_vlc(&ctx->ac_vlc);
         ff_free_vlc(&ctx->dc_vlc);
@@ -91,38 +133,67 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
         init_vlc(&ctx->ac_vlc, DNXHD_VLC_BITS, 257,
                  ctx->cid_table->ac_bits, 1, 1,
                  ctx->cid_table->ac_codes, 2, 2, 0);
-        init_vlc(&ctx->dc_vlc, DNXHD_DC_VLC_BITS, ctx->bit_depth + 4,
+        init_vlc(&ctx->dc_vlc, DNXHD_DC_VLC_BITS, bitdepth > 8 ? 14 : 12,
                  ctx->cid_table->dc_bits, 1, 1,
                  ctx->cid_table->dc_codes, 1, 1, 0);
         init_vlc(&ctx->run_vlc, DNXHD_VLC_BITS, 62,
                  ctx->cid_table->run_bits, 1, 1,
                  ctx->cid_table->run_codes, 2, 2, 0);
 
-        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
-                          ff_zigzag_direct);
         ctx->cid = cid;
     }
     return 0;
 }
 
+static av_cold int dnxhd_decode_init_thread_copy(AVCodecContext *avctx)
+{
+    DNXHDContext *ctx = avctx->priv_data;
+
+    ctx->avctx = avctx;
+    // make sure VLC tables will be loaded when cid is parsed
+    ctx->cid = -1;
+
+    ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext));
+    if (!ctx->rows)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static int dnxhd_get_profile(int cid)
+{
+    switch(cid) {
+    case 1270:
+        return FF_PROFILE_DNXHR_444;
+    case 1271:
+        return FF_PROFILE_DNXHR_HQX;
+    case 1272:
+        return FF_PROFILE_DNXHR_HQ;
+    case 1273:
+        return FF_PROFILE_DNXHR_SQ;
+    case 1274:
+        return FF_PROFILE_DNXHR_LB;
+    }
+    return FF_PROFILE_DNXHD;
+}
+
 static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
                                const uint8_t *buf, int buf_size,
                                int first_field)
 {
-    static const uint8_t header_prefix[]    = { 0x00, 0x00, 0x02, 0x80, 0x01 };
-    static const uint8_t header_prefix444[] = { 0x00, 0x00, 0x02, 0x80, 0x02 };
     int i, cid, ret;
-    int old_bit_depth = ctx->bit_depth;
-
+    int old_bit_depth = ctx->bit_depth, bitdepth;
+    uint64_t header_prefix;
     if (buf_size < 0x280) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "buffer too small (%d < 640).\n",
-               buf_size);
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "buffer too small (%d < 640).\n", buf_size);
         return AVERROR_INVALIDDATA;
     }
 
-    if (memcmp(buf, header_prefix, 5) && memcmp(buf, header_prefix444, 5)) {
+    header_prefix = ff_dnxhd_parse_header_prefix(buf);
+    if (header_prefix == 0) {
         av_log(ctx->avctx, AV_LOG_ERROR,
-               "unknown header 0x%02"PRIX8" 0x%02"PRIX8" 0x%02"PRIX8" 0x%02"PRIX8" 0x%02"PRIX8"\n",
+               "unknown header 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X\n",
                buf[0], buf[1], buf[2], buf[3], buf[4]);
         return AVERROR_INVALIDDATA;
     }
@@ -131,53 +202,84 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         frame->interlaced_frame = 1;
         frame->top_field_first  = first_field ^ ctx->cur_field;
         av_log(ctx->avctx, AV_LOG_DEBUG,
-               "interlaced %"PRId8", cur field %d\n", buf[5] & 3, ctx->cur_field);
+               "interlaced %d, cur field %d\n", buf[5] & 3, ctx->cur_field);
+    } else {
+        ctx->cur_field = 0;
     }
-    ctx->mbaff = buf[0x6] & 32;
+    ctx->mbaff = (buf[0x6] >> 5) & 1;
+    ctx->alpha = buf[0x7] & 1;
+    ctx->lla   = (buf[0x7] >> 1) & 1;
+    if (ctx->alpha)
+        avpriv_request_sample(ctx->avctx, "alpha");
 
     ctx->height = AV_RB16(buf + 0x18);
     ctx->width  = AV_RB16(buf + 0x1a);
 
-    ff_dlog(ctx->avctx, "width %u, height %u\n", ctx->width, ctx->height);
+    switch(buf[0x21] >> 5) {
+    case 1: bitdepth = 8; break;
+    case 2: bitdepth = 10; break;
+    case 3: bitdepth = 12; break;
+    default:
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "Unknown bitdepth indicator (%d)\n", buf[0x21] >> 5);
+        return AVERROR_INVALIDDATA;
+    }
+
+    cid = AV_RB32(buf + 0x28);
 
-    if (buf[0x21] == 0x58) { /* 10 bit */
-        ctx->bit_depth = ctx->avctx->bits_per_raw_sample = 10;
+    ctx->avctx->profile = dnxhd_get_profile(cid);
 
-        if (buf[0x4] == 0x2) {
+    if ((ret = dnxhd_init_vlc(ctx, cid, bitdepth)) < 0)
+        return ret;
+    if (ctx->mbaff && ctx->cid_table->cid != 1260)
+        av_log(ctx->avctx, AV_LOG_WARNING,
+               "Adaptive MB interlace flag in an unsupported profile.\n");
+
+    ctx->act = buf[0x2C] & 7;
+    if (ctx->act && ctx->cid_table->cid != 1256 && ctx->cid_table->cid != 1270)
+        av_log(ctx->avctx, AV_LOG_WARNING,
+               "Adaptive color transform in an unsupported profile.\n");
+
+    ctx->is_444 = (buf[0x2C] >> 6) & 1;
+    if (ctx->is_444) {
+        if (bitdepth == 8) {
+            avpriv_request_sample(ctx->avctx, "4:4:4 8 bits");
+            return AVERROR_INVALIDDATA;
+        } else if (bitdepth == 10) {
             ctx->decode_dct_block = dnxhd_decode_dct_block_10_444;
-            ctx->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
-            ctx->is_444 = 1;
+            ctx->pix_fmt = ctx->act ? AV_PIX_FMT_YUV444P10
+                                    : AV_PIX_FMT_GBRP10;
         } else {
-            ctx->decode_dct_block = dnxhd_decode_dct_block_10;
-            ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            ctx->decode_dct_block = dnxhd_decode_dct_block_12_444;
+            ctx->pix_fmt = ctx->act ? AV_PIX_FMT_YUV444P12
+                                    : AV_PIX_FMT_GBRP12;
         }
-    } else if (buf[0x21] == 0x38) { /* 8 bit */
-        ctx->bit_depth = ctx->avctx->bits_per_raw_sample = 8;
-
-        ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-        ctx->decode_dct_block = dnxhd_decode_dct_block_8;
+    } else if (bitdepth == 12) {
+        ctx->decode_dct_block = dnxhd_decode_dct_block_12;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+    } else if (bitdepth == 10) {
+        if (ctx->avctx->profile == FF_PROFILE_DNXHR_HQX)
+            ctx->decode_dct_block = dnxhd_decode_dct_block_10_444;
+        else
+            ctx->decode_dct_block = dnxhd_decode_dct_block_10;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P10;
     } else {
-        av_log(ctx->avctx, AV_LOG_ERROR, "invalid bit depth value (%"PRId8").\n",
-               buf[0x21]);
-        return AVERROR_INVALIDDATA;
+        ctx->decode_dct_block = dnxhd_decode_dct_block_8;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P;
     }
+
+    ctx->avctx->bits_per_raw_sample = ctx->bit_depth = bitdepth;
     if (ctx->bit_depth != old_bit_depth) {
-        ff_blockdsp_init(&ctx->bdsp);
+        ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
         ff_idctdsp_init(&ctx->idsp, ctx->avctx);
+        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
+                          ff_zigzag_direct);
     }
 
-    cid = AV_RB32(buf + 0x28);
-    ff_dlog(ctx->avctx, "compression id %d\n", cid);
-
-    if ((ret = dnxhd_init_vlc(ctx, cid)) < 0)
-        return ret;
-    if (ctx->mbaff && ctx->cid_table->cid != 1260)
-        av_log(ctx->avctx, AV_LOG_WARNING,
-               "Adaptive MB interlace flag in an unsupported profile.\n");
-
     // make sure profile size constraints are respected
     // DNx100 allows 1920->1440 and 1280->960 subsampling
-    if (ctx->width != ctx->cid_table->width) {
+    if (ctx->width != ctx->cid_table->width &&
+        ctx->cid_table->width != DNXHD_VARIABLE) {
         av_reduce(&ctx->avctx->sample_aspect_ratio.num,
                   &ctx->avctx->sample_aspect_ratio.den,
                   ctx->width, ctx->cid_table->width, 255);
@@ -190,29 +292,53 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         return AVERROR_INVALIDDATA;
     }
 
-    ctx->mb_width  = ctx->width >> 4;
-    ctx->mb_height = buf[0x16d];
-
-    ff_dlog(ctx->avctx,
-            "mb width %u, mb height %u\n", ctx->mb_width, ctx->mb_height);
+    ctx->mb_width  = (ctx->width + 15)>> 4;
+    ctx->mb_height = AV_RB16(buf + 0x16c);
 
     if ((ctx->height + 15) >> 4 == ctx->mb_height && frame->interlaced_frame)
         ctx->height <<= 1;
 
-    if (ctx->mb_height > 68 ||
-        (ctx->mb_height << frame->interlaced_frame) > (ctx->height + 15) >> 4) {
+    av_log(ctx->avctx, AV_LOG_VERBOSE, "%dx%d, 4:%s %d bits, MBAFF=%d ACT=%d\n",
+           ctx->width, ctx->height, ctx->is_444 ? "4:4" : "2:2",
+           ctx->bit_depth, ctx->mbaff, ctx->act);
+
+    // Newer format supports variable mb_scan_index sizes
+    if (ctx->mb_height > 68 && ff_dnxhd_check_header_prefix_hr(header_prefix)) {
+        ctx->data_offset = 0x170 + (ctx->mb_height << 2);
+    } else {
+        if (ctx->mb_height > 68) {
+            av_log(ctx->avctx, AV_LOG_ERROR,
+                   "mb height too big: %d\n", ctx->mb_height);
+            return AVERROR_INVALIDDATA;
+        }
+        ctx->data_offset = 0x280;
+    }
+    if ((ctx->mb_height << frame->interlaced_frame) > (ctx->height + 15) >> 4) {
         av_log(ctx->avctx, AV_LOG_ERROR,
-               "mb height too big: %u\n", ctx->mb_height);
+                "mb height too big: %d\n", ctx->mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (buf_size < ctx->data_offset) {
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "buffer too small (%d < %d).\n", buf_size, ctx->data_offset);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ctx->mb_height > FF_ARRAY_ELEMS(ctx->mb_scan_index)) {
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "mb_height too big (%d > %"SIZE_SPECIFIER").\n", ctx->mb_height, FF_ARRAY_ELEMS(ctx->mb_scan_index));
         return AVERROR_INVALIDDATA;
     }
 
     for (i = 0; i < ctx->mb_height; i++) {
         ctx->mb_scan_index[i] = AV_RB32(buf + 0x170 + (i << 2));
-        ff_dlog(ctx->avctx, "mb scan index %"PRIu32"\n", ctx->mb_scan_index[i]);
-        if (buf_size < ctx->mb_scan_index[i] + 0x280) {
+        ff_dlog(ctx->avctx, "mb scan index %d, pos %d: %"PRIu32"\n",
+                i, 0x170 + (i << 2), ctx->mb_scan_index[i]);
+        if (buf_size - ctx->data_offset < ctx->mb_scan_index[i]) {
             av_log(ctx->avctx, AV_LOG_ERROR,
-                   "invalid mb scan index (%d < %"PRIu32").\n",
-                   buf_size, ctx->mb_scan_index[i] + 0x280);
+                   "invalid mb scan index (%"PRIu32" vs %u).\n",
+                   ctx->mb_scan_index[i], buf_size - ctx->data_offset);
             return AVERROR_INVALIDDATA;
         }
     }
@@ -220,132 +346,185 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
     return 0;
 }
 
-static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx,
-                                                    int16_t *block, int n,
-                                                    int qscale,
-                                                    int index_bits,
-                                                    int level_bias,
-                                                    int level_shift)
+static av_always_inline int dnxhd_decode_dct_block(const DNXHDContext *ctx,
+                                                   RowContext *row,
+                                                   int n,
+                                                   int index_bits,
+                                                   int level_bias,
+                                                   int level_shift,
+                                                   int dc_shift)
 {
-    int i, j, index1, index2, len;
+    int i, j, index1, index2, len, flags;
     int level, component, sign;
+    const int *scale;
     const uint8_t *weight_matrix;
-    OPEN_READER(bs, &ctx->gb);
+    const uint8_t *ac_info = ctx->cid_table->ac_info;
+    int16_t *block = row->blocks[n];
+    const int eob_index     = ctx->cid_table->eob_index;
+    int ret = 0;
+    OPEN_READER(bs, &row->gb);
+
+    ctx->bdsp.clear_block(block);
 
     if (!ctx->is_444) {
         if (n & 2) {
             component     = 1 + (n & 1);
+            scale = row->chroma_scale;
             weight_matrix = ctx->cid_table->chroma_weight;
         } else {
             component     = 0;
+            scale = row->luma_scale;
             weight_matrix = ctx->cid_table->luma_weight;
         }
     } else {
         component = (n >> 1) % 3;
         if (component) {
+            scale = row->chroma_scale;
             weight_matrix = ctx->cid_table->chroma_weight;
         } else {
+            scale = row->luma_scale;
             weight_matrix = ctx->cid_table->luma_weight;
         }
     }
 
-    UPDATE_CACHE(bs, &ctx->gb);
-    GET_VLC(len, bs, &ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
+    UPDATE_CACHE(bs, &row->gb);
+    GET_VLC(len, bs, &row->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
+    if (len < 0) {
+        ret = len;
+        goto error;
+    }
     if (len) {
-        level = GET_CACHE(bs, &ctx->gb);
-        LAST_SKIP_BITS(bs, &ctx->gb, len);
+        level = GET_CACHE(bs, &row->gb);
+        LAST_SKIP_BITS(bs, &row->gb, len);
         sign  = ~level >> 31;
         level = (NEG_USR32(sign ^ level, len) ^ sign) - sign;
-        ctx->last_dc[component] += level;
+        row->last_dc[component] += level * (1 << dc_shift);
     }
-    block[0] = ctx->last_dc[component];
+    block[0] = row->last_dc[component];
 
-    for (i = 1; ; i++) {
-        UPDATE_CACHE(bs, &ctx->gb);
-        GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table,
-                DNXHD_VLC_BITS, 2);
-        level = ctx->cid_table->ac_level[index1];
-        if (!level) /* EOB */
-            break;
+    i = 0;
+
+    UPDATE_CACHE(bs, &row->gb);
+    GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table,
+            DNXHD_VLC_BITS, 2);
+
+    while (index1 != eob_index) {
+        level = ac_info[2*index1+0];
+        flags = ac_info[2*index1+1];
 
-        sign = SHOW_SBITS(bs, &ctx->gb, 1);
-        SKIP_BITS(bs, &ctx->gb, 1);
+        sign = SHOW_SBITS(bs, &row->gb, 1);
+        SKIP_BITS(bs, &row->gb, 1);
 
-        if (ctx->cid_table->ac_index_flag[index1]) {
-            level += SHOW_UBITS(bs, &ctx->gb, index_bits) << 6;
-            SKIP_BITS(bs, &ctx->gb, index_bits);
+        if (flags & 1) {
+            level += SHOW_UBITS(bs, &row->gb, index_bits) << 7;
+            SKIP_BITS(bs, &row->gb, index_bits);
         }
 
-        if (ctx->cid_table->ac_run_flag[index1]) {
-            UPDATE_CACHE(bs, &ctx->gb);
-            GET_VLC(index2, bs, &ctx->gb, ctx->run_vlc.table,
+        if (flags & 2) {
+            UPDATE_CACHE(bs, &row->gb);
+            GET_VLC(index2, bs, &row->gb, ctx->run_vlc.table,
                     DNXHD_VLC_BITS, 2);
             i += ctx->cid_table->run[index2];
         }
 
-        if (i > 63) {
+        if (++i > 63) {
             av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i);
+            ret = -1;
             break;
         }
 
         j     = ctx->scantable.permutated[i];
-        level = (2 * level + 1) * qscale * weight_matrix[i];
+        level *= scale[i];
+        level += scale[i] >> 1;
         if (level_bias < 32 || weight_matrix[i] != level_bias)
-            level += level_bias;
+            level += level_bias; // 1<<(level_shift-1)
         level >>= level_shift;
 
         block[j] = (level ^ sign) - sign;
+
+        UPDATE_CACHE(bs, &row->gb);
+        GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table,
+                DNXHD_VLC_BITS, 2);
     }
+error:
+    CLOSE_READER(bs, &row->gb);
+    return ret;
+}
 
-    CLOSE_READER(bs, &ctx->gb);
+static int dnxhd_decode_dct_block_8(const DNXHDContext *ctx,
+                                    RowContext *row, int n)
+{
+    return dnxhd_decode_dct_block(ctx, row, n, 4, 32, 6, 0);
+}
+
+static int dnxhd_decode_dct_block_10(const DNXHDContext *ctx,
+                                     RowContext *row, int n)
+{
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 8, 4, 0);
 }
 
-static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block,
-                                     int n, int qscale)
+static int dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 4, 32, 6);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 32, 6, 0);
 }
 
-static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block,
-                                      int n, int qscale)
+static int dnxhd_decode_dct_block_12(const DNXHDContext *ctx,
+                                     RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 8, 4);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 8, 4, 2);
 }
 
-static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block,
-                                          int n, int qscale)
+static int dnxhd_decode_dct_block_12_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 32, 6);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 32, 4, 2);
 }
 
-static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
-                                   int x, int y)
+static int dnxhd_decode_macroblock(const DNXHDContext *ctx, RowContext *row,
+                                   AVFrame *frame, int x, int y)
 {
-    int shift1 = ctx->bit_depth == 10;
+    int shift1 = ctx->bit_depth >= 10;
     int dct_linesize_luma   = frame->linesize[0];
     int dct_linesize_chroma = frame->linesize[1];
     uint8_t *dest_y, *dest_u, *dest_v;
     int dct_y_offset, dct_x_offset;
-    int qscale, i;
+    int qscale, i, act;
     int interlaced_mb = 0;
 
     if (ctx->mbaff) {
-        interlaced_mb = get_bits1(&ctx->gb);
-        qscale = get_bits(&ctx->gb, 10);
+        interlaced_mb = get_bits1(&row->gb);
+        qscale = get_bits(&row->gb, 10);
     } else {
-        qscale = get_bits(&ctx->gb, 11);
+        qscale = get_bits(&row->gb, 11);
     }
-    skip_bits1(&ctx->gb);
-
-    for (i = 0; i < 8; i++) {
-        ctx->bdsp.clear_block(ctx->blocks[i]);
-        ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
+    act = get_bits1(&row->gb);
+    if (act) {
+        if (!ctx->act) {
+            static int act_warned;
+            if (!act_warned) {
+                act_warned = 1;
+                av_log(ctx->avctx, AV_LOG_ERROR,
+                       "ACT flag set, in violation of frame header.\n");
+            }
+        } else if (row->format == -1) {
+            row->format = act;
+        } else if (row->format != act) {
+            row->format = 2; // Variable
+        }
     }
-    if (ctx->is_444) {
-        for (; i < 12; i++) {
-            ctx->bdsp.clear_block(ctx->blocks[i]);
-            ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
+
+    if (qscale != row->last_qscale) {
+        for (i = 0; i < 64; i++) {
+            row->luma_scale[i]   = qscale * ctx->cid_table->luma_weight[i];
+            row->chroma_scale[i] = qscale * ctx->cid_table->chroma_weight[i];
         }
+        row->last_qscale = qscale;
+    }
+
+    for (i = 0; i < 8 + 4 * ctx->is_444; i++) {
+        if (ctx->decode_dct_block(ctx, row, i) < 0)
+            return AVERROR_INVALIDDATA;
     }
 
     if (frame->interlaced_frame) {
@@ -357,7 +536,7 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dest_u = frame->data[1] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1 + ctx->is_444));
     dest_v = frame->data[2] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1 + ctx->is_444));
 
-    if (ctx->cur_field) {
+    if (frame->interlaced_frame && ctx->cur_field) {
         dest_y += frame->linesize[0];
         dest_u += frame->linesize[1];
         dest_v += frame->linesize[2];
@@ -370,55 +549,66 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dct_y_offset = interlaced_mb ? frame->linesize[0] : (dct_linesize_luma << 3);
     dct_x_offset = 8 << shift1;
     if (!ctx->is_444) {
-        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[4]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, row->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, row->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, row->blocks[4]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[5]);
 
         if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3);
-            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, ctx->blocks[2]);
-            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]);
+            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, row->blocks[2]);
+            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, row->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, row->blocks[6]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, row->blocks[7]);
         }
     } else {
-        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[6]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, row->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, row->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, row->blocks[6]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[7]);
 
         if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3);
-            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, ctx->blocks[2]);
-            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, ctx->blocks[8]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]);
-            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, ctx->blocks[4]);
-            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, ctx->blocks[5]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, ctx->blocks[10]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]);
+            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, row->blocks[2]);
+            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, row->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, row->blocks[8]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[9]);
+            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, row->blocks[4]);
+            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, row->blocks[5]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, row->blocks[10]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[11]);
         }
     }
 
     return 0;
 }
 
-static int dnxhd_decode_macroblocks(DNXHDContext *ctx, AVFrame *frame,
-                                    const uint8_t *buf, int buf_size)
+static int dnxhd_decode_row(AVCodecContext *avctx, void *data,
+                            int rownb, int threadnb)
 {
-    int x, y;
-    for (y = 0; y < ctx->mb_height; y++) {
-        ctx->last_dc[0] =
-        ctx->last_dc[1] =
-        ctx->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
-        init_get_bits(&ctx->gb, buf + ctx->mb_scan_index[y], (buf_size - ctx->mb_scan_index[y]) << 3);
-        for (x = 0; x < ctx->mb_width; x++) {
-            //START_TIMER;
-            dnxhd_decode_macroblock(ctx, frame, x, y);
-            //STOP_TIMER("decode macroblock");
+    const DNXHDContext *ctx = avctx->priv_data;
+    uint32_t offset = ctx->mb_scan_index[rownb];
+    RowContext *row = ctx->rows + threadnb;
+    int x, ret;
+
+    row->last_dc[0] =
+    row->last_dc[1] =
+    row->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
+    ret = init_get_bits8(&row->gb, ctx->buf + offset, ctx->buf_size - offset);
+    if (ret < 0) {
+        row->errors++;
+        return ret;
+    }
+    for (x = 0; x < ctx->mb_width; x++) {
+        //START_TIMER;
+        int ret = dnxhd_decode_macroblock(ctx, row, data, x, rownb);
+        if (ret < 0) {
+            row->errors++;
+            return ret;
         }
+        //STOP_TIMER("decode macroblock");
     }
+
     return 0;
 }
 
@@ -428,16 +618,18 @@ static int dnxhd_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     DNXHDContext *ctx = avctx->priv_data;
-    ThreadFrame tf;
+    ThreadFrame frame = { .f = data };
+    AVFrame *picture = data;
     int first_field = 1;
-    int ret;
-
-    tf.f = data;
+    int ret, i;
 
     ff_dlog(avctx, "frame size %d\n", buf_size);
 
+    for (i = 0; i < avctx->thread_count; i++)
+        ctx->rows[i].format = -1;
+
 decode_coding_unit:
-    if ((ret = dnxhd_decode_header(ctx, tf.f, buf, buf_size, first_field)) < 0)
+    if ((ret = dnxhd_decode_header(ctx, picture, buf, buf_size, first_field)) < 0)
         return ret;
 
     if ((avctx->width || avctx->height) &&
@@ -446,31 +638,78 @@ decode_coding_unit:
                avctx->width, avctx->height, ctx->width, ctx->height);
         first_field = 1;
     }
+    if (avctx->pix_fmt != AV_PIX_FMT_NONE && avctx->pix_fmt != ctx->pix_fmt) {
+        av_log(avctx, AV_LOG_WARNING, "pix_fmt changed: %s -> %s\n",
+               av_get_pix_fmt_name(avctx->pix_fmt), av_get_pix_fmt_name(ctx->pix_fmt));
+        first_field = 1;
+    }
 
+    avctx->pix_fmt = ctx->pix_fmt;
     ret = ff_set_dimensions(avctx, ctx->width, ctx->height);
     if (ret < 0)
         return ret;
 
     if (first_field) {
-        if ((ret = ff_thread_get_buffer(avctx, &tf, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
             return ret;
-        }
-        tf.f->pict_type = AV_PICTURE_TYPE_I;
-        tf.f->key_frame = 1;
+        picture->pict_type = AV_PICTURE_TYPE_I;
+        picture->key_frame = 1;
     }
 
-    dnxhd_decode_macroblocks(ctx, tf.f, buf + 0x280, buf_size - 0x280);
+    ctx->buf_size = buf_size - ctx->data_offset;
+    ctx->buf = buf + ctx->data_offset;
+    avctx->execute2(avctx, dnxhd_decode_row, picture, NULL, ctx->mb_height);
 
-    if (first_field && tf.f->interlaced_frame) {
+    if (first_field && picture->interlaced_frame) {
         buf      += ctx->cid_table->coding_unit_size;
         buf_size -= ctx->cid_table->coding_unit_size;
         first_field = 0;
         goto decode_coding_unit;
     }
 
+    ret = 0;
+    for (i = 0; i < avctx->thread_count; i++) {
+        ret += ctx->rows[i].errors;
+        ctx->rows[i].errors = 0;
+    }
+
+    if (ctx->act) {
+        static int act_warned;
+        int format = ctx->rows[0].format;
+        for (i = 1; i < avctx->thread_count; i++) {
+            if (ctx->rows[i].format != format &&
+                ctx->rows[i].format != -1 /* not run */) {
+                format = 2;
+                break;
+            }
+        }
+        switch (format) {
+        case -1:
+        case 2:
+            if (!act_warned) {
+                act_warned = 1;
+                av_log(ctx->avctx, AV_LOG_ERROR,
+                       "Unsupported: variable ACT flag.\n");
+            }
+            break;
+        case 0:
+            ctx->pix_fmt = ctx->bit_depth==10
+                         ? AV_PIX_FMT_GBRP10 : AV_PIX_FMT_GBRP12;
+            break;
+        case 1:
+            ctx->pix_fmt = ctx->bit_depth==10
+                         ? AV_PIX_FMT_YUV444P10 : AV_PIX_FMT_YUV444P12;
+            break;
+        }
+    }
+    avctx->pix_fmt = ctx->pix_fmt;
+    if (ret) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "%d lines with errors\n", ret);
+        return AVERROR_INVALIDDATA;
+    }
+
     *got_frame = 1;
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int dnxhd_decode_close(AVCodecContext *avctx)
@@ -480,6 +719,9 @@ static av_cold int dnxhd_decode_close(AVCodecContext *avctx)
     ff_free_vlc(&ctx->ac_vlc);
     ff_free_vlc(&ctx->dc_vlc);
     ff_free_vlc(&ctx->run_vlc);
+
+    av_freep(&ctx->rows);
+
     return 0;
 }
 
@@ -492,5 +734,8 @@ AVCodec ff_dnxhd_decoder = {
     .init           = dnxhd_decode_init,
     .close          = dnxhd_decode_close,
     .decode         = dnxhd_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
+                      AV_CODEC_CAP_SLICE_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(dnxhd_decode_init_thread_copy),
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_dnxhd_profiles),
 };
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index 639debf..41b8079 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -6,20 +6,20 @@
  * VC-3 encoder funded by the British Broadcasting Corporation
  * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 #include "internal.h"
 #include "mpegvideo.h"
 #include "pixblockdsp.h"
+#include "profiles.h"
 #include "dnxhdenc.h"
 
 // The largest value that will not lead to overflow for 10-bit samples.
@@ -44,21 +45,36 @@
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
     { "nitris_compat", "encode with Avid Nitris compatibility",
-        offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+        offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "ibias", "intra quant bias",
         offsetof(DNXHDEncContext, intra_quant_bias), AV_OPT_TYPE_INT,
-        { .i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, VE },
+        { .i64 = 0 }, INT_MIN, INT_MAX, VE },
+    { "profile",       NULL, offsetof(DNXHDEncContext, profile), AV_OPT_TYPE_INT,
+        { .i64 = FF_PROFILE_DNXHD },
+        FF_PROFILE_DNXHD, FF_PROFILE_DNXHR_444, VE, "profile" },
+    { "dnxhd",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_DNXHD },
+        0, 0, VE, "profile" },
+    { "dnxhr_444", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_DNXHR_444 },
+        0, 0, VE, "profile" },
+    { "dnxhr_hqx", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_DNXHR_HQX },
+        0, 0, VE, "profile" },
+    { "dnxhr_hq",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_DNXHR_HQ },
+        0, 0, VE, "profile" },
+    { "dnxhr_sq",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_DNXHR_SQ },
+        0, 0, VE, "profile" },
+    { "dnxhr_lb",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_DNXHR_LB },
+        0, 0, VE, "profile" },
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass dnxhd_class = {
     .class_name = "dnxhd",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *restrict block,
+static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size)
 {
@@ -82,25 +98,84 @@ static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *restrict block,
 }
 
 static av_always_inline
-void dnxhd_10bit_get_pixels_8x4_sym(int16_t *restrict block,
+void dnxhd_10bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                     const uint8_t *pixels,
                                     ptrdiff_t line_size)
 {
-    int i;
+    memcpy(block + 0 * 8, pixels + 0 * line_size, 8 * sizeof(*block));
+    memcpy(block + 7 * 8, pixels + 0 * line_size, 8 * sizeof(*block));
+    memcpy(block + 1 * 8, pixels + 1 * line_size, 8 * sizeof(*block));
+    memcpy(block + 6 * 8, pixels + 1 * line_size, 8 * sizeof(*block));
+    memcpy(block + 2 * 8, pixels + 2 * line_size, 8 * sizeof(*block));
+    memcpy(block + 5 * 8, pixels + 2 * line_size, 8 * sizeof(*block));
+    memcpy(block + 3 * 8, pixels + 3 * line_size, 8 * sizeof(*block));
+    memcpy(block + 4 * 8, pixels + 3 * line_size, 8 * sizeof(*block));
+}
 
-    block += 32;
+static int dnxhd_10bit_dct_quantize_444(MpegEncContext *ctx, int16_t *block,
+                                        int n, int qscale, int *overflow)
+{
+    int i, j, level, last_non_zero, start_i;
+    const int *qmat;
+    const uint8_t *scantable= ctx->intra_scantable.scantable;
+    int bias;
+    int max = 0;
+    unsigned int threshold1, threshold2;
 
-    for (i = 0; i < 4; i++) {
-        memcpy(block + i * 8, pixels + i * line_size, 8 * sizeof(*block));
-        memcpy(block - (i + 1) * 8, pixels + i * line_size, 8 * sizeof(*block));
+    ctx->fdsp.fdct(block);
+
+    block[0] = (block[0] + 2) >> 2;
+    start_i = 1;
+    last_non_zero = 0;
+    qmat = n < 4 ? ctx->q_intra_matrix[qscale] : ctx->q_chroma_intra_matrix[qscale];
+    bias= ctx->intra_quant_bias * (1 << (16 - 8));
+    threshold1 = (1 << 16) - bias - 1;
+    threshold2 = (threshold1 << 1);
+
+    for (i = 63; i >= start_i; i--) {
+        j = scantable[i];
+        level = block[j] * qmat[j];
+
+        if (((unsigned)(level + threshold1)) > threshold2) {
+            last_non_zero = i;
+            break;
+        } else{
+            block[j]=0;
+        }
     }
+
+    for (i = start_i; i <= last_non_zero; i++) {
+        j = scantable[i];
+        level = block[j] * qmat[j];
+
+        if (((unsigned)(level + threshold1)) > threshold2) {
+            if (level > 0) {
+                level = (bias + level) >> 16;
+                block[j] = level;
+            } else{
+                level = (bias - level) >> 16;
+                block[j] = -level;
+            }
+            max |= level;
+        } else {
+            block[j] = 0;
+        }
+    }
+    *overflow = ctx->max_qcoeff < max; //overflow might have happened
+
+    /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
+    if (ctx->idsp.perm_type != FF_IDCT_PERM_NONE)
+        ff_block_permute(block, ctx->idsp.idct_permutation,
+                         scantable, last_non_zero);
+
+    return last_non_zero;
 }
 
 static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, int16_t *block,
                                     int n, int qscale, int *overflow)
 {
     const uint8_t *scantable= ctx->intra_scantable.scantable;
-    const int *qmat = ctx->q_intra_matrix[qscale];
+    const int *qmat = n<4 ? ctx->q_intra_matrix[qscale] : ctx->q_chroma_intra_matrix[qscale];
     int last_non_zero = 0;
     int i;
 
@@ -119,25 +194,30 @@ static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, int16_t *block,
             last_non_zero = i;
     }
 
+    /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
+    if (ctx->idsp.perm_type != FF_IDCT_PERM_NONE)
+        ff_block_permute(block, ctx->idsp.idct_permutation,
+                         scantable, last_non_zero);
+
     return last_non_zero;
 }
 
 static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
 {
     int i, j, level, run;
-    int max_level = 1 << (ctx->cid_table->bit_depth + 2);
+    int max_level = 1 << (ctx->bit_depth + 2);
 
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes,
-                      max_level * 4 * sizeof(*ctx->vlc_codes), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,
-                      max_level * 4 * sizeof(*ctx->vlc_bits), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->orig_vlc_codes,
+                      max_level, 4 * sizeof(*ctx->orig_vlc_codes), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->orig_vlc_bits,
+                      max_level, 4 * sizeof(*ctx->orig_vlc_bits), fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes,
                       63 * 2, fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits,
                       63, fail);
 
-    ctx->vlc_codes += max_level * 2;
-    ctx->vlc_bits  += max_level * 2;
+    ctx->vlc_codes = ctx->orig_vlc_codes + max_level * 2;
+    ctx->vlc_bits  = ctx->orig_vlc_bits + max_level * 2;
     for (level = -max_level; level < max_level; level++) {
         for (run = 0; run < 2; run++) {
             int index = (level << 1) | run;
@@ -149,10 +229,10 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
                 alevel -= offset << 6;
             }
             for (j = 0; j < 257; j++) {
-                if (ctx->cid_table->ac_level[j] == alevel &&
-                    (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) &&
-                    (!run    || (ctx->cid_table->ac_run_flag  [j] && run))) {
-                    assert(!ctx->vlc_codes[index]);
+                if (ctx->cid_table->ac_info[2*j+0] >> 1 == alevel &&
+                    (!offset || (ctx->cid_table->ac_info[2*j+1] & 1) && offset) &&
+                    (!run    || (ctx->cid_table->ac_info[2*j+1] & 2) && run)) {
+                    av_assert1(!ctx->vlc_codes[index]);
                     if (alevel) {
                         ctx->vlc_codes[index] =
                             (ctx->cid_table->ac_codes[j] << 1) | (sign & 1);
@@ -164,7 +244,7 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
                     break;
                 }
             }
-            assert(!alevel || j < 257);
+            av_assert0(!alevel || j < 257);
             if (offset) {
                 ctx->vlc_codes[index] =
                     (ctx->vlc_codes[index] << ctx->cid_table->index_bits) | offset;
@@ -174,7 +254,7 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
     }
     for (i = 0; i < 62; i++) {
         int run = ctx->cid_table->run[i];
-        assert(run < 63);
+        av_assert0(run < 63);
         ctx->run_codes[run] = ctx->cid_table->run_codes[i];
         ctx->run_bits[run]  = ctx->cid_table->run_bits[i];
     }
@@ -191,18 +271,18 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
     const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
     const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;
 
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
-                      (ctx->m.avctx->qmax + 1) * 64 * sizeof(int), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
-                      (ctx->m.avctx->qmax + 1) * 64 * sizeof(int), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
-                      (ctx->m.avctx->qmax + 1) * 64 * 2 * sizeof(uint16_t),
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
+                      (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
+                      (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
+                      (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                       fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
-                      (ctx->m.avctx->qmax + 1) * 64 * 2 * sizeof(uint16_t),
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
+                      (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                       fail);
 
-    if (ctx->cid_table->bit_depth == 8) {
+    if (ctx->bit_depth == 8) {
         for (i = 1; i < 64; i++) {
             int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]];
             weight_matrix[j] = ctx->cid_table->luma_weight[i];
@@ -232,7 +312,7 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
         // 10-bit
         for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
             for (i = 1; i < 64; i++) {
-                int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]];
+                int j = ff_zigzag_direct[i];
 
                 /* The quantization formula from the VC-3 standard is:
                  * quantized = sign(block[i]) * floor(abs(block[i]/s) * p /
@@ -253,6 +333,11 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
         }
     }
 
+    ctx->m.q_chroma_intra_matrix16 = ctx->qmatrix_c16;
+    ctx->m.q_chroma_intra_matrix   = ctx->qmatrix_c;
+    ctx->m.q_intra_matrix16        = ctx->qmatrix_l16;
+    ctx->m.q_intra_matrix          = ctx->qmatrix_l;
+
     return 0;
 fail:
     return AVERROR(ENOMEM);
@@ -260,14 +345,16 @@ fail:
 
 static av_cold int dnxhd_init_rc(DNXHDEncContext *ctx)
 {
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc,
-                      8160 * ctx->m.avctx->qmax * sizeof(RCEntry), fail);
-    if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
-        FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
-                          ctx->m.mb_num * sizeof(RCCMPEntry), fail);
-
-    ctx->frame_bits = (ctx->cid_table->coding_unit_size -
-                       640 - 4 - ctx->min_padding) * 8;
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_rc, (ctx->m.avctx->qmax + 1),
+                          ctx->m.mb_num * sizeof(RCEntry), fail);
+    if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD) {
+        FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
+                          ctx->m.mb_num, sizeof(RCCMPEntry), fail);
+        FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_cmp_tmp,
+                          ctx->m.mb_num, sizeof(RCCMPEntry), fail);
+    }
+    ctx->frame_bits = (ctx->coding_unit_size -
+                       ctx->data_offset - 4 - ctx->min_padding) * 8;
     ctx->qscale = 1;
     ctx->lambda = 2 << LAMBDA_FRAC_BITS; // qscale 2
     return 0;
@@ -278,50 +365,95 @@ fail:
 static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
 {
     DNXHDEncContext *ctx = avctx->priv_data;
-    int i, index, bit_depth, ret;
+    int i, index, ret;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_YUV422P:
-        bit_depth = 8;
+        ctx->bit_depth = 8;
         break;
     case AV_PIX_FMT_YUV422P10:
-        bit_depth = 10;
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_GBRP10:
+        ctx->bit_depth = 10;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
-               "Pixel format is incompatible with DNxHD, use yuv422p or yuv422p10.\n");
+               "pixel format is incompatible with DNxHD\n");
         return AVERROR(EINVAL);
     }
 
-    ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
+    if ((ctx->profile == FF_PROFILE_DNXHR_444 && (avctx->pix_fmt != AV_PIX_FMT_YUV444P10 &&
+                                                  avctx->pix_fmt != AV_PIX_FMT_GBRP10)) ||
+        (ctx->profile != FF_PROFILE_DNXHR_444 && (avctx->pix_fmt == AV_PIX_FMT_YUV444P10 ||
+                                                  avctx->pix_fmt == AV_PIX_FMT_GBRP10))) {
+        av_log(avctx, AV_LOG_ERROR,
+               "pixel format is incompatible with DNxHD profile\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (ctx->profile == FF_PROFILE_DNXHR_HQX && avctx->pix_fmt != AV_PIX_FMT_YUV422P10) {
+        av_log(avctx, AV_LOG_ERROR,
+               "pixel format is incompatible with DNxHR HQX profile\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ctx->profile == FF_PROFILE_DNXHR_LB ||
+         ctx->profile == FF_PROFILE_DNXHR_SQ ||
+         ctx->profile == FF_PROFILE_DNXHR_HQ) && avctx->pix_fmt != AV_PIX_FMT_YUV422P) {
+        av_log(avctx, AV_LOG_ERROR,
+               "pixel format is incompatible with DNxHR LB/SQ/HQ profile\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->is_444 = ctx->profile == FF_PROFILE_DNXHR_444;
+    avctx->profile = ctx->profile;
+    ctx->cid = ff_dnxhd_find_cid(avctx, ctx->bit_depth);
     if (!ctx->cid) {
         av_log(avctx, AV_LOG_ERROR,
-               "Video parameters incompatible with DNxHD, available CIDs:\n");
-        ff_dnxhd_list_cid(avctx);
+               "video parameters incompatible with DNxHD. Valid DNxHD profiles:\n");
+        ff_dnxhd_print_profiles(avctx, AV_LOG_ERROR);
         return AVERROR(EINVAL);
     }
     av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid);
 
+    if (ctx->cid >= 1270 && ctx->cid <= 1274)
+        avctx->codec_tag = MKTAG('A','V','d','h');
+
+    if (avctx->width < 256 || avctx->height < 120) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too small, input must be at least 256x120\n");
+        return AVERROR(EINVAL);
+    }
+
     index = ff_dnxhd_get_cid_table(ctx->cid);
-    if (index < 0)
-        return index;
+    av_assert0(index >= 0);
+
     ctx->cid_table = &ff_dnxhd_cid_table[index];
 
     ctx->m.avctx    = avctx;
     ctx->m.mb_intra = 1;
     ctx->m.h263_aic = 1;
 
-    avctx->bits_per_raw_sample = ctx->cid_table->bit_depth;
+    avctx->bits_per_raw_sample = ctx->bit_depth;
 
-    ff_blockdsp_init(&ctx->bdsp);
+    ff_blockdsp_init(&ctx->bdsp, avctx);
     ff_fdctdsp_init(&ctx->m.fdsp, avctx);
     ff_mpv_idct_init(&ctx->m);
     ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
     ff_pixblockdsp_init(&ctx->m.pdsp, avctx);
+    ff_dct_encode_init(&ctx->m);
+
+    if (ctx->profile != FF_PROFILE_DNXHD)
+        ff_videodsp_init(&ctx->m.vdsp, ctx->bit_depth);
+
     if (!ctx->m.dct_quantize)
         ctx->m.dct_quantize = ff_dct_quantize_c;
 
-    if (ctx->cid_table->bit_depth == 10) {
+    if (ctx->is_444 || ctx->profile == FF_PROFILE_DNXHR_HQX) {
+        ctx->m.dct_quantize     = dnxhd_10bit_dct_quantize_444;
+        ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
+        ctx->block_width_l2     = 4;
+    } else if (ctx->bit_depth == 10) {
         ctx->m.dct_quantize     = dnxhd_10bit_dct_quantize;
         ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
         ctx->block_width_l2     = 4;
@@ -341,8 +473,29 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
         ctx->m.mb_height /= 2;
     }
 
+    if (ctx->interlaced && ctx->profile != FF_PROFILE_DNXHD) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Interlaced encoding is not supported for DNxHR profiles.\n");
+        return AVERROR(EINVAL);
+    }
+
     ctx->m.mb_num = ctx->m.mb_height * ctx->m.mb_width;
 
+    if (ctx->cid_table->frame_size == DNXHD_VARIABLE) {
+        ctx->frame_size = avpriv_dnxhd_get_hr_frame_size(ctx->cid,
+                                                     avctx->width, avctx->height);
+        av_assert0(ctx->frame_size >= 0);
+        ctx->coding_unit_size = ctx->frame_size;
+    } else {
+        ctx->frame_size = ctx->cid_table->frame_size;
+        ctx->coding_unit_size = ctx->cid_table->coding_unit_size;
+    }
+
+    if (ctx->m.mb_height > 68)
+        ctx->data_offset = 0x170 + (ctx->m.mb_height << 2);
+    else
+        ctx->data_offset = 0x280;
+
     // XXX tune lbias/cbias
     if ((ret = dnxhd_init_qmat(ctx, ctx->intra_quant_bias, 0)) < 0)
         return ret;
@@ -373,15 +526,24 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    if (avctx->thread_count > MAX_THREADS) {
-        av_log(avctx, AV_LOG_ERROR, "too many threads\n");
+    if (avctx->active_thread_type == FF_THREAD_SLICE) {
+        if (avctx->thread_count > MAX_THREADS) {
+            av_log(avctx, AV_LOG_ERROR, "too many threads\n");
+            return AVERROR(EINVAL);
+        }
+    }
+
+    if (avctx->qmax <= 1) {
+        av_log(avctx, AV_LOG_ERROR, "qmax must be at least 2\n");
         return AVERROR(EINVAL);
     }
 
     ctx->thread[0] = ctx;
-    for (i = 1; i < avctx->thread_count; i++) {
-        ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext));
-        memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
+    if (avctx->active_thread_type == FF_THREAD_SLICE) {
+        for (i = 1; i < avctx->thread_count; i++) {
+            ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext));
+            memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
+        }
     }
 
     return 0;
@@ -392,11 +554,16 @@ fail:  // for FF_ALLOCZ_OR_GOTO
 static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
 {
     DNXHDEncContext *ctx = avctx->priv_data;
-    static const uint8_t header_prefix[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };
 
-    memset(buf, 0, 640);
+    memset(buf, 0, ctx->data_offset);
+
+    // * write prefix */
+    AV_WB16(buf + 0x02, ctx->data_offset);
+    if (ctx->cid >= 1270 && ctx->cid <= 1274)
+        buf[4] = 0x03;
+    else
+        buf[4] = 0x01;
 
-    memcpy(buf, header_prefix, 5);
     buf[5] = ctx->interlaced ? ctx->cur_field + 2 : 0x01;
     buf[6] = 0x80; // crc flag off
     buf[7] = 0xa0; // reserved
@@ -404,16 +571,16 @@ static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
     AV_WB16(buf + 0x1a, avctx->width);  // SPL
     AV_WB16(buf + 0x1d, avctx->height >> ctx->interlaced); // NAL
 
-    buf[0x21] = ctx->cid_table->bit_depth == 10 ? 0x58 : 0x38;
+    buf[0x21] = ctx->bit_depth == 10 ? 0x58 : 0x38;
     buf[0x22] = 0x88 + (ctx->interlaced << 2);
     AV_WB32(buf + 0x28, ctx->cid); // CID
-    buf[0x2c] = ctx->interlaced ? 0 : 0x80;
+    buf[0x2c] = (!ctx->interlaced << 7) | (ctx->is_444 << 6) | (avctx->pix_fmt == AV_PIX_FMT_YUV444P10);
 
     buf[0x5f] = 0x01; // UDL
 
     buf[0x167] = 0x02; // reserved
     AV_WB16(buf + 0x16a, ctx->m.mb_height * 4 + 4); // MSIPS
-    buf[0x16d] = ctx->m.mb_height; // Ns
+    AV_WB16(buf + 0x16c, ctx->m.mb_height); // Ns
     buf[0x16f] = 0x10; // reserved
 
     ctx->msip = buf + 0x170;
@@ -431,7 +598,7 @@ static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
     }
     put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
              (ctx->cid_table->dc_codes[nbits] << nbits) +
-             (diff & ((1 << nbits) - 1)));
+             av_mod_uintp2(diff, nbits));
 }
 
 static av_always_inline
@@ -468,8 +635,13 @@ void dnxhd_unquantize_c(DNXHDEncContext *ctx, int16_t *block, int n,
     int level;
     int i;
 
-    weight_matrix = (n & 2) ? ctx->cid_table->chroma_weight
-                            : ctx->cid_table->luma_weight;
+    if (ctx->is_444) {
+        weight_matrix = ((n % 6) < 2) ? ctx->cid_table->luma_weight
+                                      : ctx->cid_table->chroma_weight;
+    } else {
+        weight_matrix = (n & 2) ? ctx->cid_table->chroma_weight
+                                : ctx->cid_table->luma_weight;
+    }
 
     for (i = 1; i <= last_index; i++) {
         int j = ctx->m.intra_scantable.permutated[i];
@@ -477,7 +649,7 @@ void dnxhd_unquantize_c(DNXHDEncContext *ctx, int16_t *block, int n,
         if (level) {
             if (level < 0) {
                 level = (1 - 2 * level) * qscale * weight_matrix[i];
-                if (ctx->cid_table->bit_depth == 10) {
+                if (ctx->bit_depth == 10) {
                     if (weight_matrix[i] != 8)
                         level += 8;
                     level >>= 4;
@@ -489,7 +661,7 @@ void dnxhd_unquantize_c(DNXHDEncContext *ctx, int16_t *block, int n,
                 level = -level;
             } else {
                 level = (2 * level + 1) * qscale * weight_matrix[i];
-                if (ctx->cid_table->bit_depth == 10) {
+                if (ctx->bit_depth == 10) {
                     if (weight_matrix[i] != 8)
                         level += 8;
                     level >>= 4;
@@ -537,63 +709,141 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
 {
     const int bs = ctx->block_width_l2;
     const int bw = 1 << bs;
+    int dct_y_offset = ctx->dct_y_offset;
+    int dct_uv_offset = ctx->dct_uv_offset;
+    int linesize = ctx->m.linesize;
+    int uvlinesize = ctx->m.uvlinesize;
     const uint8_t *ptr_y = ctx->thread[0]->src[0] +
                            ((mb_y << 4) * ctx->m.linesize) + (mb_x << bs + 1);
     const uint8_t *ptr_u = ctx->thread[0]->src[1] +
-                           ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
+                           ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs + ctx->is_444);
     const uint8_t *ptr_v = ctx->thread[0]->src[2] +
-                           ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
+                           ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs + ctx->is_444);
     PixblockDSPContext *pdsp = &ctx->m.pdsp;
+    VideoDSPContext *vdsp = &ctx->m.vdsp;
+
+    if (ctx->bit_depth != 10 && vdsp->emulated_edge_mc && ((mb_x << 4) + 16 > ctx->m.avctx->width ||
+                                                           (mb_y << 4) + 16 > ctx->m.avctx->height)) {
+        int y_w = ctx->m.avctx->width  - (mb_x << 4);
+        int y_h = ctx->m.avctx->height - (mb_y << 4);
+        int uv_w = (y_w + 1) / 2;
+        int uv_h = y_h;
+        linesize = 16;
+        uvlinesize = 8;
+
+        vdsp->emulated_edge_mc(&ctx->edge_buf_y[0], ptr_y,
+                               linesize, ctx->m.linesize,
+                               linesize, 16,
+                               0, 0, y_w, y_h);
+        vdsp->emulated_edge_mc(&ctx->edge_buf_uv[0][0], ptr_u,
+                               uvlinesize, ctx->m.uvlinesize,
+                               uvlinesize, 16,
+                               0, 0, uv_w, uv_h);
+        vdsp->emulated_edge_mc(&ctx->edge_buf_uv[1][0], ptr_v,
+                               uvlinesize, ctx->m.uvlinesize,
+                               uvlinesize, 16,
+                               0, 0, uv_w, uv_h);
+
+        dct_y_offset =  bw * linesize;
+        dct_uv_offset = bw * uvlinesize;
+        ptr_y = &ctx->edge_buf_y[0];
+        ptr_u = &ctx->edge_buf_uv[0][0];
+        ptr_v = &ctx->edge_buf_uv[1][0];
+    } else if (ctx->bit_depth == 10 && vdsp->emulated_edge_mc && ((mb_x << 4) + 16 > ctx->m.avctx->width ||
+                                                                  (mb_y << 4) + 16 > ctx->m.avctx->height)) {
+        int y_w = ctx->m.avctx->width  - (mb_x << 4);
+        int y_h = ctx->m.avctx->height - (mb_y << 4);
+        int uv_w = ctx->is_444 ? y_w : (y_w + 1) / 2;
+        int uv_h = y_h;
+        linesize = 32;
+        uvlinesize = 16 + 16 * ctx->is_444;
+
+        vdsp->emulated_edge_mc(&ctx->edge_buf_y[0], ptr_y,
+                               linesize, ctx->m.linesize,
+                               linesize / 2, 16,
+                               0, 0, y_w, y_h);
+        vdsp->emulated_edge_mc(&ctx->edge_buf_uv[0][0], ptr_u,
+                               uvlinesize, ctx->m.uvlinesize,
+                               uvlinesize / 2, 16,
+                               0, 0, uv_w, uv_h);
+        vdsp->emulated_edge_mc(&ctx->edge_buf_uv[1][0], ptr_v,
+                               uvlinesize, ctx->m.uvlinesize,
+                               uvlinesize / 2, 16,
+                               0, 0, uv_w, uv_h);
+
+        dct_y_offset =  bw * linesize / 2;
+        dct_uv_offset = bw * uvlinesize / 2;
+        ptr_y = &ctx->edge_buf_y[0];
+        ptr_u = &ctx->edge_buf_uv[0][0];
+        ptr_v = &ctx->edge_buf_uv[1][0];
+    }
 
-    pdsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
-    pdsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
-    pdsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
-    pdsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);
-
-    if (mb_y + 1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
-        if (ctx->interlaced) {
-            ctx->get_pixels_8x4_sym(ctx->blocks[4],
-                                    ptr_y + ctx->dct_y_offset,
-                                    ctx->m.linesize);
-            ctx->get_pixels_8x4_sym(ctx->blocks[5],
-                                    ptr_y + ctx->dct_y_offset + bw,
-                                    ctx->m.linesize);
-            ctx->get_pixels_8x4_sym(ctx->blocks[6],
-                                    ptr_u + ctx->dct_uv_offset,
-                                    ctx->m.uvlinesize);
-            ctx->get_pixels_8x4_sym(ctx->blocks[7],
-                                    ptr_v + ctx->dct_uv_offset,
-                                    ctx->m.uvlinesize);
+    if (!ctx->is_444) {
+        pdsp->get_pixels(ctx->blocks[0], ptr_y,      linesize);
+        pdsp->get_pixels(ctx->blocks[1], ptr_y + bw, linesize);
+        pdsp->get_pixels(ctx->blocks[2], ptr_u,      uvlinesize);
+        pdsp->get_pixels(ctx->blocks[3], ptr_v,      uvlinesize);
+
+        if (mb_y + 1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
+            if (ctx->interlaced) {
+                ctx->get_pixels_8x4_sym(ctx->blocks[4],
+                                        ptr_y + dct_y_offset,
+                                        linesize);
+                ctx->get_pixels_8x4_sym(ctx->blocks[5],
+                                        ptr_y + dct_y_offset + bw,
+                                        linesize);
+                ctx->get_pixels_8x4_sym(ctx->blocks[6],
+                                        ptr_u + dct_uv_offset,
+                                        uvlinesize);
+                ctx->get_pixels_8x4_sym(ctx->blocks[7],
+                                        ptr_v + dct_uv_offset,
+                                        uvlinesize);
+            } else {
+                ctx->bdsp.clear_block(ctx->blocks[4]);
+                ctx->bdsp.clear_block(ctx->blocks[5]);
+                ctx->bdsp.clear_block(ctx->blocks[6]);
+                ctx->bdsp.clear_block(ctx->blocks[7]);
+            }
         } else {
-            ctx->bdsp.clear_block(ctx->blocks[4]);
-            ctx->bdsp.clear_block(ctx->blocks[5]);
-            ctx->bdsp.clear_block(ctx->blocks[6]);
-            ctx->bdsp.clear_block(ctx->blocks[7]);
+            pdsp->get_pixels(ctx->blocks[4],
+                             ptr_y + dct_y_offset, linesize);
+            pdsp->get_pixels(ctx->blocks[5],
+                             ptr_y + dct_y_offset + bw, linesize);
+            pdsp->get_pixels(ctx->blocks[6],
+                             ptr_u + dct_uv_offset, uvlinesize);
+            pdsp->get_pixels(ctx->blocks[7],
+                             ptr_v + dct_uv_offset, uvlinesize);
         }
     } else {
-        pdsp->get_pixels(ctx->blocks[4],
-                         ptr_y + ctx->dct_y_offset, ctx->m.linesize);
-        pdsp->get_pixels(ctx->blocks[5],
-                         ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
-        pdsp->get_pixels(ctx->blocks[6],
-                         ptr_u + ctx->dct_uv_offset, ctx->m.uvlinesize);
-        pdsp->get_pixels(ctx->blocks[7],
-                         ptr_v + ctx->dct_uv_offset, ctx->m.uvlinesize);
+        pdsp->get_pixels(ctx->blocks[0], ptr_y,      linesize);
+        pdsp->get_pixels(ctx->blocks[1], ptr_y + bw, linesize);
+        pdsp->get_pixels(ctx->blocks[6], ptr_y + dct_y_offset, linesize);
+        pdsp->get_pixels(ctx->blocks[7], ptr_y + dct_y_offset + bw, linesize);
+
+        pdsp->get_pixels(ctx->blocks[2], ptr_u,      uvlinesize);
+        pdsp->get_pixels(ctx->blocks[3], ptr_u + bw, uvlinesize);
+        pdsp->get_pixels(ctx->blocks[8], ptr_u + dct_uv_offset, uvlinesize);
+        pdsp->get_pixels(ctx->blocks[9], ptr_u + dct_uv_offset + bw, uvlinesize);
+
+        pdsp->get_pixels(ctx->blocks[4], ptr_v,      uvlinesize);
+        pdsp->get_pixels(ctx->blocks[5], ptr_v + bw, uvlinesize);
+        pdsp->get_pixels(ctx->blocks[10], ptr_v + dct_uv_offset, uvlinesize);
+        pdsp->get_pixels(ctx->blocks[11], ptr_v + dct_uv_offset + bw, uvlinesize);
     }
 }
 
 static av_always_inline
 int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
 {
-    if (i & 2) {
-        ctx->m.q_intra_matrix16 = ctx->qmatrix_c16;
-        ctx->m.q_intra_matrix   = ctx->qmatrix_c;
-        return 1 + (i & 1);
+    int x;
+
+    if (ctx->is_444) {
+        x = (i >> 1) % 3;
     } else {
-        ctx->m.q_intra_matrix16 = ctx->qmatrix_l16;
-        ctx->m.q_intra_matrix   = ctx->qmatrix_l;
-        return 0;
+        const static uint8_t component[8]={0,0,1,2,0,0,1,2};
+        x = component[i];
     }
+    return x;
 }
 
 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
@@ -607,7 +857,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
 
     ctx->m.last_dc[0] =
     ctx->m.last_dc[1] =
-    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
+    ctx->m.last_dc[2] = 1 << (ctx->bit_depth + 2);
 
     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
         unsigned mb = mb_y * ctx->m.mb_width + mb_x;
@@ -618,13 +868,14 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
 
         dnxhd_get_blocks(ctx, mb_x, mb_y);
 
-        for (i = 0; i < 8; i++) {
+        for (i = 0; i < 8 + 4 * ctx->is_444; i++) {
             int16_t *src_block = ctx->blocks[i];
             int overflow, nbits, diff, last_index;
             int n = dnxhd_switch_matrix(ctx, i);
 
             memcpy(block, src_block, 64 * sizeof(*block));
-            last_index = ctx->m.dct_quantize(&ctx->m, block, i,
+            last_index = ctx->m.dct_quantize(&ctx->m, block,
+                                             ctx->is_444 ? 4 * (n > 0): 4 & (2*i),
                                              qscale, &overflow);
             ac_bits   += dnxhd_calc_ac_bits(ctx, block, last_index);
 
@@ -634,7 +885,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
             else
                 nbits = av_log2_16bit(2 * diff);
 
-            assert(nbits < ctx->cid_table->bit_depth + 4);
+            av_assert1(nbits < ctx->bit_depth + 4);
             dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;
 
             ctx->m.last_dc[n] = block[0];
@@ -645,9 +896,9 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
                 ssd += dnxhd_ssd_block(block, src_block);
             }
         }
-        ctx->mb_rc[qscale][mb].ssd  = ssd;
-        ctx->mb_rc[qscale][mb].bits = ac_bits + dc_bits + 12 +
-                                      8 * ctx->vlc_bits[0];
+        ctx->mb_rc[(qscale * ctx->m.mb_num) + mb].ssd  = ssd;
+        ctx->mb_rc[(qscale * ctx->m.mb_num) + mb].bits = ac_bits + dc_bits + 12 +
+                                     (1 + ctx->is_444) * 8 * ctx->vlc_bits[0];
     }
     return 0;
 }
@@ -658,25 +909,27 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg,
     DNXHDEncContext *ctx = avctx->priv_data;
     int mb_y = jobnr, mb_x;
     ctx = ctx->thread[threadnr];
-    init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr],
+    init_put_bits(&ctx->m.pb, (uint8_t *)arg + ctx->data_offset + ctx->slice_offs[jobnr],
                   ctx->slice_size[jobnr]);
 
     ctx->m.last_dc[0] =
     ctx->m.last_dc[1] =
-    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
+    ctx->m.last_dc[2] = 1 << (ctx->bit_depth + 2);
     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
         unsigned mb = mb_y * ctx->m.mb_width + mb_x;
         int qscale = ctx->mb_qscale[mb];
         int i;
 
-        put_bits(&ctx->m.pb, 12, qscale << 1);
+        put_bits(&ctx->m.pb, 11, qscale);
+        put_bits(&ctx->m.pb, 1, avctx->pix_fmt == AV_PIX_FMT_YUV444P10);
 
         dnxhd_get_blocks(ctx, mb_x, mb_y);
 
-        for (i = 0; i < 8; i++) {
+        for (i = 0; i < 8 + 4 * ctx->is_444; i++) {
             int16_t *block = ctx->blocks[i];
             int overflow, n = dnxhd_switch_matrix(ctx, i);
-            int last_index = ctx->m.dct_quantize(&ctx->m, block, i,
+            int last_index = ctx->m.dct_quantize(&ctx->m, block,
+                                                 ctx->is_444 ? (((i >> 1) % 3) < 1 ? 0 : 4): 4 & (2*i),
                                                  qscale, &overflow);
             // START_TIMER;
             dnxhd_encode_block(ctx, block, last_index, n);
@@ -717,14 +970,14 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
                            ((avctx->height >> ctx->interlaced) & 0xF);
 
     ctx = ctx->thread[threadnr];
-    if (ctx->cid_table->bit_depth == 8) {
+    if (ctx->bit_depth == 8) {
         uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize);
         for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x, pix += 16) {
             unsigned mb = mb_y * ctx->m.mb_width + mb_x;
             int sum;
             int varc;
 
-            if (!partial_last_row && mb_x * 16 <= avctx->width - 16) {
+            if (!partial_last_row && mb_x * 16 <= avctx->width - 16 && (avctx->width % 16) == 0) {
                 sum  = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize);
                 varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize);
             } else {
@@ -752,11 +1005,13 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
             unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
             int sum = 0;
             int sqsum = 0;
+            int bw = FFMIN(avctx->width - 16 * mb_x, 16);
+            int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
             int mean, sqmean;
             int i, j;
             // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
-            for (i = 0; i < 16; ++i) {
-                for (j = 0; j < 16; ++j) {
+            for (i = 0; i < bh; ++i) {
+                for (j = 0; j < bw; ++j) {
                     // Turn 16-bit pixels into 10-bit ones.
                     const int sample = (unsigned) pix[j] >> 6;
                     sum   += sample;
@@ -800,17 +1055,20 @@ static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
                 unsigned min = UINT_MAX;
                 int qscale = 1;
                 int mb     = y * ctx->m.mb_width + x;
+                int rc = 0;
                 for (q = 1; q < avctx->qmax; q++) {
-                    unsigned score = ctx->mb_rc[q][mb].bits * lambda +
-                                     ((unsigned) ctx->mb_rc[q][mb].ssd << LAMBDA_FRAC_BITS);
+                    int i = (q*ctx->m.mb_num) + mb;
+                    unsigned score = ctx->mb_rc[i].bits * lambda +
+                                     ((unsigned) ctx->mb_rc[i].ssd << LAMBDA_FRAC_BITS);
                     if (score < min) {
                         min    = score;
                         qscale = q;
+                        rc = i;
                     }
                 }
-                bits += ctx->mb_rc[qscale][mb].bits;
+                bits += ctx->mb_rc[rc].bits;
                 ctx->mb_qscale[mb] = qscale;
-                ctx->mb_bits[mb]   = ctx->mb_rc[qscale][mb].bits;
+                ctx->mb_bits[mb]   = ctx->mb_rc[rc].bits;
             }
             bits = (bits + 31) & ~31; // padding
             if (bits > ctx->frame_bits)
@@ -867,7 +1125,7 @@ static int dnxhd_find_qscale(DNXHDEncContext *ctx)
                                NULL, NULL, ctx->m.mb_height);
         for (y = 0; y < ctx->m.mb_height; y++) {
             for (x = 0; x < ctx->m.mb_width; x++)
-                bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
+                bits += ctx->mb_rc[(qscale*ctx->m.mb_num) + (y*ctx->m.mb_width+x)].bits;
             bits = (bits+31)&~31; // padding
             if (bits > ctx->frame_bits)
                 break;
@@ -926,13 +1184,13 @@ static void radix_count(const RCCMPEntry *data, int size,
             buckets[j][get_bucket(v, 0)]++;
             v >>= BUCKET_BITS;
         }
-        assert(!v);
+        av_assert1(!v);
     }
     for (j = 0; j < RADIX_PASSES; j++) {
         int offset = size;
         for (i = NBUCKETS - 1; i >= 0; i--)
             buckets[j][i] = offset -= buckets[j][i];
-        assert(!buckets[j][0]);
+        av_assert1(!buckets[j][0]);
     }
 }
 
@@ -948,10 +1206,9 @@ static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data,
     }
 }
 
-static void radix_sort(RCCMPEntry *data, int size)
+static void radix_sort(RCCMPEntry *data, RCCMPEntry *tmp, int size)
 {
     int buckets[RADIX_PASSES][NBUCKETS];
-    RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size);
     radix_count(data, size, buckets);
     radix_sort_pass(tmp, data, size, buckets[0], 0);
     radix_sort_pass(data, tmp, size, buckets[1], 1);
@@ -959,7 +1216,6 @@ static void radix_sort(RCCMPEntry *data, int size)
         radix_sort_pass(tmp, data, size, buckets[2], 2);
         radix_sort_pass(data, tmp, size, buckets[3], 3);
     }
-    av_free(tmp);
 }
 
 static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
@@ -971,17 +1227,18 @@ static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
     for (y = 0; y < ctx->m.mb_height; y++) {
         for (x = 0; x < ctx->m.mb_width; x++) {
             int mb = y * ctx->m.mb_width + x;
+            int rc = (ctx->qscale * ctx->m.mb_num ) + mb;
             int delta_bits;
             ctx->mb_qscale[mb] = ctx->qscale;
-            ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale][mb].bits;
-            max_bits += ctx->mb_rc[ctx->qscale][mb].bits;
+            ctx->mb_bits[mb] = ctx->mb_rc[rc].bits;
+            max_bits += ctx->mb_rc[rc].bits;
             if (!RC_VARIANCE) {
-                delta_bits = ctx->mb_rc[ctx->qscale][mb].bits -
-                             ctx->mb_rc[ctx->qscale + 1][mb].bits;
+                delta_bits = ctx->mb_rc[rc].bits -
+                             ctx->mb_rc[rc + ctx->m.mb_num].bits;
                 ctx->mb_cmp[mb].mb = mb;
                 ctx->mb_cmp[mb].value =
-                    delta_bits ? ((ctx->mb_rc[ctx->qscale][mb].ssd -
-                                   ctx->mb_rc[ctx->qscale + 1][mb].ssd) * 100) /
+                    delta_bits ? ((ctx->mb_rc[rc].ssd -
+                                   ctx->mb_rc[rc + ctx->m.mb_num].ssd) * 100) /
                                   delta_bits
                                : INT_MIN; // avoid increasing qscale
             }
@@ -992,13 +1249,14 @@ static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
         if (RC_VARIANCE)
             avctx->execute2(avctx, dnxhd_mb_var_thread,
                             NULL, NULL, ctx->m.mb_height);
-        radix_sort(ctx->mb_cmp, ctx->m.mb_num);
+        radix_sort(ctx->mb_cmp, ctx->mb_cmp_tmp, ctx->m.mb_num);
         for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
             int mb = ctx->mb_cmp[x].mb;
-            max_bits -= ctx->mb_rc[ctx->qscale][mb].bits -
-                        ctx->mb_rc[ctx->qscale + 1][mb].bits;
+            int rc = (ctx->qscale * ctx->m.mb_num ) + mb;
+            max_bits -= ctx->mb_rc[rc].bits -
+                        ctx->mb_rc[rc + ctx->m.mb_num].bits;
             ctx->mb_qscale[mb] = ctx->qscale + 1;
-            ctx->mb_bits[mb]   = ctx->mb_rc[ctx->qscale + 1][mb].bits;
+            ctx->mb_bits[mb]   = ctx->mb_rc[rc + ctx->m.mb_num].bits;
         }
     }
     return 0;
@@ -1029,13 +1287,10 @@ static int dnxhd_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
     DNXHDEncContext *ctx = avctx->priv_data;
     int first_field = 1;
     int offset, i, ret;
-    uint8_t *buf, *sd;
+    uint8_t *buf;
 
-    if ((ret = ff_alloc_packet(pkt, ctx->cid_table->frame_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "output buffer is too small to compress picture\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, ctx->frame_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
     dnxhd_load_picture(ctx, frame);
@@ -1065,21 +1320,21 @@ encode_coding_unit:
     for (i = 0; i < ctx->m.mb_height; i++) {
         AV_WB32(ctx->msip + i * 4, offset);
         offset += ctx->slice_size[i];
-        assert(!(ctx->slice_size[i] & 3));
+        av_assert1(!(ctx->slice_size[i] & 3));
     }
 
     avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
 
-    assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
-    memset(buf + 640 + offset, 0,
-           ctx->cid_table->coding_unit_size - 4 - offset - 640);
+    av_assert1(ctx->data_offset + offset + 4 <= ctx->coding_unit_size);
+    memset(buf + ctx->data_offset + offset, 0,
+           ctx->coding_unit_size - 4 - offset - ctx->data_offset);
 
-    AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF
+    AV_WB32(buf + ctx->coding_unit_size - 4, 0x600DC0DE); // EOF
 
     if (ctx->interlaced && first_field) {
         first_field     = 0;
         ctx->cur_field ^= 1;
-        buf            += ctx->cid_table->coding_unit_size;
+        buf            += ctx->coding_unit_size;
         goto encode_coding_unit;
     }
 
@@ -1089,10 +1344,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = ctx->qscale * FF_QP2LAMBDA;
+    ff_side_data_set_encoder_stats(pkt, ctx->qscale * FF_QP2LAMBDA, NULL, 0, AV_PICTURE_TYPE_I);
 
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
@@ -1102,11 +1354,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
 {
     DNXHDEncContext *ctx = avctx->priv_data;
-    int max_level        = 1 << (ctx->cid_table->bit_depth + 2);
     int i;
 
-    av_free(ctx->vlc_codes - max_level * 2);
-    av_free(ctx->vlc_bits - max_level * 2);
+    av_freep(&ctx->orig_vlc_codes);
+    av_freep(&ctx->orig_vlc_bits);
     av_freep(&ctx->run_codes);
     av_freep(&ctx->run_bits);
 
@@ -1114,6 +1365,7 @@ static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
     av_freep(&ctx->mb_qscale);
     av_freep(&ctx->mb_rc);
     av_freep(&ctx->mb_cmp);
+    av_freep(&ctx->mb_cmp_tmp);
     av_freep(&ctx->slice_size);
     av_freep(&ctx->slice_offs);
 
@@ -1122,12 +1374,19 @@ static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
     av_freep(&ctx->qmatrix_c16);
     av_freep(&ctx->qmatrix_l16);
 
-    for (i = 1; i < avctx->thread_count; i++)
-        av_freep(&ctx->thread[i]);
+    if (avctx->active_thread_type == FF_THREAD_SLICE) {
+        for (i = 1; i < avctx->thread_count; i++)
+            av_freep(&ctx->thread[i]);
+    }
 
     return 0;
 }
 
+static const AVCodecDefault dnxhd_defaults[] = {
+    { "qmax", "1024" }, /* Maximum quantization scale factor allowed for VC-3 */
+    { NULL },
+};
+
 AVCodec ff_dnxhd_encoder = {
     .name           = "dnxhd",
     .long_name      = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
@@ -1137,11 +1396,16 @@ AVCodec ff_dnxhd_encoder = {
     .init           = dnxhd_encode_init,
     .encode2        = dnxhd_encode_picture,
     .close          = dnxhd_encode_end,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV422P,
         AV_PIX_FMT_YUV422P10,
+        AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_GBRP10,
         AV_PIX_FMT_NONE
     },
-    .priv_class     = &class,
+    .priv_class     = &dnxhd_class,
+    .defaults       = dnxhd_defaults,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_dnxhd_profiles),
 };
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index c6755f7..7b0d862 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,9 @@ typedef struct DNXHDEncContext {
     MpegEncContext m; ///< Used for quantization dsp functions
 
     int cid;
+    int profile;
+    int bit_depth;
+    int is_444;
     const CIDEntry *cid_table;
     uint8_t *msip; ///< Macroblock Scan Indexes Payload
     uint32_t *slice_size;
@@ -60,6 +63,10 @@ typedef struct DNXHDEncContext {
     unsigned dct_uv_offset;
     unsigned block_width_l2;
 
+    int frame_size;
+    int coding_unit_size;
+    int data_offset;
+
     int interlaced;
     int cur_field;
 
@@ -67,7 +74,9 @@ typedef struct DNXHDEncContext {
     unsigned min_padding;
     int intra_quant_bias;
 
-    DECLARE_ALIGNED(16, int16_t, blocks)[8][64];
+    DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
+    DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10
+    DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10
 
     int      (*qmatrix_c)     [64];
     int      (*qmatrix_l)     [64];
@@ -77,6 +86,8 @@ typedef struct DNXHDEncContext {
     unsigned frame_bits;
     uint8_t *src[3];
 
+    uint32_t *orig_vlc_codes;
+    uint8_t  *orig_vlc_bits;
     uint32_t *vlc_codes;
     uint8_t  *vlc_bits;
     uint16_t *run_codes;
@@ -87,15 +98,14 @@ typedef struct DNXHDEncContext {
     unsigned qscale;
     unsigned lambda;
 
-    unsigned thread_size;
-
     uint16_t *mb_bits;
     uint8_t  *mb_qscale;
 
     RCCMPEntry *mb_cmp;
-    RCEntry   (*mb_rc)[8160];
+    RCCMPEntry *mb_cmp_tmp;
+    RCEntry    *mb_rc;
 
-    void (*get_pixels_8x4_sym)(int16_t *restrict /* align 16 */ block,
+    void (*get_pixels_8x4_sym)(int16_t *av_restrict /* align 16 */ block,
                                const uint8_t *pixels, ptrdiff_t line_size);
 } DNXHDEncContext;
 
diff --git a/libavcodec/dolby_e.c b/libavcodec/dolby_e.c
new file mode 100644
index 0000000..429612e
--- /dev/null
+++ b/libavcodec/dolby_e.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (C) 2017 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/thread.h"
+#include "libavutil/mem.h"
+
+#include "internal.h"
+#include "get_bits.h"
+#include "put_bits.h"
+#include "dolby_e.h"
+#include "fft.h"
+
+static int skip_input(DBEContext *s, int nb_words)
+{
+    if (nb_words > s->input_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Packet too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->input      += nb_words * s->word_bytes;
+    s->input_size -= nb_words;
+    return 0;
+}
+
+static int parse_key(DBEContext *s)
+{
+    if (s->key_present) {
+        uint8_t *key = s->input;
+        int      ret = skip_input(s, 1);
+        if (ret < 0)
+            return ret;
+        return AV_RB24(key) >> 24 - s->word_bits;
+    }
+    return 0;
+}
+
+static int convert_input(DBEContext *s, int nb_words, int key)
+{
+    uint8_t *src = s->input;
+    uint8_t *dst = s->buffer;
+    PutBitContext pb;
+    int i;
+
+    av_assert0(nb_words <= 1024u);
+
+    if (nb_words > s->input_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Packet too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    switch (s->word_bits) {
+    case 16:
+        for (i = 0; i < nb_words; i++, src += 2, dst += 2)
+            AV_WB16(dst, AV_RB16(src) ^ key);
+        break;
+    case 20:
+        init_put_bits(&pb, s->buffer, sizeof(s->buffer));
+        for (i = 0; i < nb_words; i++, src += 3)
+            put_bits(&pb, 20, AV_RB24(src) >> 4 ^ key);
+        flush_put_bits(&pb);
+        break;
+    case 24:
+        for (i = 0; i < nb_words; i++, src += 3, dst += 3)
+            AV_WB24(dst, AV_RB24(src) ^ key);
+        break;
+    default:
+        av_assert0(0);
+    }
+
+    return init_get_bits(&s->gb, s->buffer, nb_words * s->word_bits);
+}
+
+static int parse_metadata(DBEContext *s)
+{
+    int i, ret, key, mtd_size;
+
+    if ((key = parse_key(s)) < 0)
+        return key;
+    if ((ret = convert_input(s, 1, key)) < 0)
+        return ret;
+
+    skip_bits(&s->gb, 4);
+    mtd_size = get_bits(&s->gb, 10);
+    if (!mtd_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid metadata size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = convert_input(s, mtd_size, key)) < 0)
+        return ret;
+
+    skip_bits(&s->gb, 14);
+    s->prog_conf = get_bits(&s->gb, 6);
+    if (s->prog_conf > MAX_PROG_CONF) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid program configuration\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->nb_channels = nb_channels_tab[s->prog_conf];
+    s->nb_programs = nb_programs_tab[s->prog_conf];
+
+    s->fr_code      = get_bits(&s->gb, 4);
+    s->fr_code_orig = get_bits(&s->gb, 4);
+    if (!sample_rate_tab[s->fr_code] ||
+        !sample_rate_tab[s->fr_code_orig]) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid frame rate code\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    skip_bits_long(&s->gb, 88);
+    for (i = 0; i < s->nb_channels; i++)
+        s->ch_size[i] = get_bits(&s->gb, 10);
+    s->mtd_ext_size = get_bits(&s->gb, 8);
+    s->meter_size   = get_bits(&s->gb, 8);
+
+    skip_bits_long(&s->gb, 10 * s->nb_programs);
+    for (i = 0; i < s->nb_channels; i++) {
+        s->rev_id[i]     = get_bits(&s->gb,  4);
+        skip_bits1(&s->gb);
+        s->begin_gain[i] = get_bits(&s->gb, 10);
+        s->end_gain[i]   = get_bits(&s->gb, 10);
+    }
+
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of metadata\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return skip_input(s, mtd_size + 1);
+}
+
+static int parse_metadata_ext(DBEContext *s)
+{
+    if (s->mtd_ext_size)
+        return skip_input(s, s->key_present + s->mtd_ext_size + 1);
+    return 0;
+}
+
+static void unbias_exponents(DBEContext *s, DBEChannel *c, DBEGroup *g)
+{
+    int mstr_exp[MAX_MSTR_EXP];
+    int bias_exp[MAX_BIAS_EXP];
+    int i, j, k;
+
+    for (i = 0; i < c->nb_mstr_exp; i++)
+        mstr_exp[i] = get_bits(&s->gb, 2) * 6;
+
+    for (i = 0; i < g->nb_exponent; i++)
+        bias_exp[i] = get_bits(&s->gb, 5);
+
+    for (i = k = 0; i < c->nb_mstr_exp; i++)
+        for (j = 0; j < g->nb_bias_exp[i]; j++, k++)
+            c->exponents[g->exp_ofs + k] = mstr_exp[i] + bias_exp[k];
+}
+
+static int parse_exponents(DBEContext *s, DBEChannel *c)
+{
+    DBEGroup *p, *g;
+    int i;
+
+    for (i = 0, p = NULL, g = c->groups; i < c->nb_groups; i++, p = g, g++) {
+        c->exp_strategy[i] = !i || g->nb_exponent != p->nb_exponent || get_bits1(&s->gb);
+        if (c->exp_strategy[i]) {
+            unbias_exponents(s, c, g);
+        } else {
+            memcpy(c->exponents + g->exp_ofs,
+                   c->exponents + p->exp_ofs,
+                   g->nb_exponent * sizeof(c->exponents[0]));
+        }
+    }
+
+    return 0;
+}
+
+static inline int log_add(int a, int b)
+{
+    int c = FFABS(a - b) >> 1;
+    return FFMAX(a, b) + log_add_tab[FFMIN(c, 211)];
+}
+
+static void calc_lowcomp(int *msk_val)
+{
+    int lwc_val[17] = { 0 };
+    int i, j, k;
+
+    for (i = 0; i < 11; i++) {
+        int max_j = 0;
+        int max_v = INT_MIN;
+        int thr   = 0;
+
+        for (j = FFMAX(i - 3, 0), k = 0; j <= i + 3; j++, k++) {
+            int v = msk_val[j] + lwc_gain_tab[i][k];
+            if (v > max_v) {
+                max_j = j;
+                max_v = v;
+            }
+            thr = log_add(thr, v);
+        }
+
+        if (msk_val[i] < thr) {
+            for (j = FFMAX(max_j - 3, 0),
+                 k = FFMAX(3 - max_j, 0);
+                 j <= max_j + 3; j++, k++)
+                lwc_val[j] += lwc_adj_tab[k];
+        }
+    }
+
+    for (i = 0; i < 16; i++) {
+        int v = FFMAX(lwc_val[i], -512);
+        msk_val[i] = FFMAX(msk_val[i] + v, 0);
+    }
+}
+
+static void bit_allocate(int nb_exponent, int nb_code, int fr_code,
+                         int *exp, int *bap,
+                         int fg_spc, int fg_ofs, int msk_mod, int snr_ofs)
+{
+    int msk_val[MAX_BIAS_EXP];
+    int psd_val[MAX_BIAS_EXP];
+    int fast_leak  = 0;
+    int slow_leak  = 0;
+    int dc_code    = dc_code_tab[fr_code - 1];
+    int ht_code    = ht_code_tab[fr_code - 1];
+    int fast_gain  = fast_gain_tab[fg_ofs];
+    int slow_decay = slow_decay_tab[dc_code][msk_mod];
+    int misc_decay = misc_decay_tab[nb_code][dc_code][msk_mod];
+    const uint16_t *slow_gain      = slow_gain_tab[nb_code][msk_mod];
+    const uint16_t *fast_decay     = fast_decay_tab[nb_code][dc_code][msk_mod];
+    const uint16_t *fast_gain_adj  = fast_gain_adj_tab[nb_code][dc_code];
+    const uint16_t *hearing_thresh = hearing_thresh_tab[nb_code][ht_code];
+    int i;
+
+    for (i = 0; i < nb_exponent; i++)
+        psd_val[i] = (48 - exp[i]) * 64;
+
+    fast_gain_adj += band_ofs_tab[nb_code][fg_spc];
+    for (i = 0; i < nb_exponent; i++) {
+        fast_leak = log_add(fast_leak  - fast_decay[i],
+                            psd_val[i] - fast_gain + fast_gain_adj[i]);
+        slow_leak = log_add(slow_leak  - slow_decay,
+                            psd_val[i] - slow_gain[i]);
+        msk_val[i] = FFMAX(fast_leak, slow_leak);
+    }
+
+    fast_leak = 0;
+    for (i = nb_exponent - 1; i > band_low_tab[nb_code]; i--) {
+        fast_leak = log_add(fast_leak - misc_decay, psd_val[i] - fast_gain);
+        msk_val[i] = FFMAX(msk_val[i], fast_leak);
+    }
+
+    for (i = 0; i < nb_exponent; i++)
+        msk_val[i] = FFMAX(msk_val[i], hearing_thresh[i]);
+
+    if (!nb_code)
+        calc_lowcomp(msk_val);
+
+    for (i = 0; i < nb_exponent; i++) {
+        int v = 16 * (snr_ofs - 64) + psd_val[i] - msk_val[i] >> 5;
+        bap[i] = bap_tab[av_clip_uintp2(v, 6)];
+    }
+}
+
+static int parse_bit_alloc(DBEContext *s, DBEChannel *c)
+{
+    DBEGroup *p, *g;
+    int bap_strategy[MAX_GROUPS], fg_spc[MAX_GROUPS];
+    int fg_ofs[MAX_GROUPS], msk_mod[MAX_GROUPS];
+    int i, snr_ofs;
+
+    for (i = 0; i < c->nb_groups; i++) {
+        bap_strategy[i] = !i || get_bits1(&s->gb);
+        if (bap_strategy[i]) {
+             fg_spc[i] = get_bits(&s->gb, 2);
+             fg_ofs[i] = get_bits(&s->gb, 3);
+            msk_mod[i] = get_bits1(&s->gb);
+        } else {
+             fg_spc[i] =  fg_spc[i - 1];
+             fg_ofs[i] =  fg_ofs[i - 1];
+            msk_mod[i] = msk_mod[i - 1];
+        }
+    }
+
+    if (get_bits1(&s->gb)) {
+        avpriv_report_missing_feature(s->avctx, "Delta bit allocation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    snr_ofs = get_bits(&s->gb, 8);
+    if (!snr_ofs) {
+        memset(c->bap, 0, sizeof(c->bap));
+        return 0;
+    }
+
+    for (i = 0, p = NULL, g = c->groups; i < c->nb_groups; i++, p = g, g++) {
+        if (c->exp_strategy[i] || bap_strategy[i]) {
+            bit_allocate(g->nb_exponent, g->imdct_idx, s->fr_code,
+                         c->exponents + g->exp_ofs, c->bap + g->exp_ofs,
+                         fg_spc[i], fg_ofs[i], msk_mod[i], snr_ofs);
+        } else {
+            memcpy(c->bap + g->exp_ofs,
+                   c->bap + p->exp_ofs,
+                   g->nb_exponent * sizeof(c->bap[0]));
+        }
+    }
+
+    return 0;
+}
+
+static int parse_indices(DBEContext *s, DBEChannel *c)
+{
+    DBEGroup *p, *g;
+    int i, j;
+
+    for (i = 0, p = NULL, g = c->groups; i < c->nb_groups; i++, p = g, g++) {
+        if (get_bits1(&s->gb)) {
+            int start = get_bits(&s->gb, 6);
+
+            if (start > g->nb_exponent) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid start index\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            for (j = 0; j < start; j++)
+                c->idx[g->exp_ofs + j] = 0;
+
+            for (; j < g->nb_exponent; j++)
+                c->idx[g->exp_ofs + j] = get_bits(&s->gb, 2);
+        } else if (i && g->nb_exponent == p->nb_exponent) {
+            memcpy(c->idx + g->exp_ofs,
+                   c->idx + p->exp_ofs,
+                   g->nb_exponent * sizeof(c->idx[0]));
+        } else {
+            memset(c->idx + g->exp_ofs, 0, g->nb_exponent * sizeof(c->idx[0]));
+        }
+    }
+
+    return 0;
+}
+
+static int parse_mantissas(DBEContext *s, DBEChannel *c)
+{
+    DBEGroup *g;
+    int i, j, k;
+
+    for (i = 0, g = c->groups; i < c->nb_groups; i++, g++) {
+        float *mnt = c->mantissas + g->mnt_ofs;
+
+        for (j = 0; j < g->nb_exponent; j++) {
+            int bap     = c->bap[g->exp_ofs + j];
+            int idx     = c->idx[g->exp_ofs + j];
+            int size1   = mantissa_size1[bap][idx];
+            int count   = g->nb_mantissa[j];
+            float exp   = exponent_tab[c->exponents[g->exp_ofs + j]];
+            float scale = mantissa_tab1[size1][idx] * exp;
+
+            if (!size1) {
+                memset(mnt, 0, count * sizeof(*mnt));
+            } else if (idx) {
+                int values[100];
+                int escape = -(1 << size1 - 1);
+
+                for (k = 0; k < count; k++)
+                    values[k] = get_sbits(&s->gb, size1);
+
+                for (k = 0; k < count; k++) {
+                    if (values[k] != escape) {
+                        mnt[k] = values[k] * scale;
+                    } else {
+                        int size2 = mantissa_size2[bap][idx];
+                        int value = get_sbits(&s->gb, size2);
+                        float a = mantissa_tab2[size2][idx];
+                        float b = mantissa_tab3[size2][idx];
+                        if (value < 0)
+                            mnt[k] = ((value + 1) * a - b) * exp;
+                        else
+                            mnt[k] = (value * a + b) * exp;
+                    }
+                }
+            } else {
+                for (k = 0; k < count; k++)
+                    mnt[k] = get_sbits(&s->gb, size1) * scale;
+            }
+
+            mnt += count;
+        }
+
+        for (; j < g->nb_exponent + c->bw_code; j++) {
+            memset(mnt, 0, g->nb_mantissa[j] * sizeof(*mnt));
+            mnt += g->nb_mantissa[j];
+        }
+    }
+
+    return 0;
+}
+
+static int parse_channel(DBEContext *s, int ch, int seg_id)
+{
+    DBEChannel *c = &s->channels[seg_id][ch];
+    int i, ret;
+
+    if (s->rev_id[ch] > 1) {
+        avpriv_report_missing_feature(s->avctx, "Encoder revision %d", s->rev_id[ch]);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (ch == lfe_channel_tab[s->prog_conf]) {
+        c->gr_code = 3;
+        c->bw_code = 29;
+    } else {
+        c->gr_code = get_bits(&s->gb, 2);
+        c->bw_code = get_bits(&s->gb, 3);
+        if (c->gr_code == 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid group type code\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    c->nb_groups   = nb_groups_tab[c->gr_code];
+    c->nb_mstr_exp = nb_mstr_exp_tab[c->gr_code];
+
+    for (i = 0; i < c->nb_groups; i++) {
+        c->groups[i] = frm_ofs_tab[seg_id][c->gr_code][i];
+        if (c->nb_mstr_exp == 2) {
+            c->groups[i].nb_exponent    -= c->bw_code;
+            c->groups[i].nb_bias_exp[1] -= c->bw_code;
+        }
+    }
+
+    if ((ret = parse_exponents(s, c)) < 0)
+        return ret;
+    if ((ret = parse_bit_alloc(s, c)) < 0)
+        return ret;
+    if ((ret = parse_indices(s, c)) < 0)
+        return ret;
+    if ((ret = parse_mantissas(s, c)) < 0)
+        return ret;
+
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of channel %d\n", ch);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_audio(DBEContext *s, int start, int end, int seg_id)
+{
+    int ch, ret, key;
+
+    if ((key = parse_key(s)) < 0)
+        return key;
+
+    for (ch = start; ch < end; ch++) {
+        if (!s->ch_size[ch]) {
+            s->channels[seg_id][ch].nb_groups = 0;
+            continue;
+        }
+        if ((ret = convert_input(s, s->ch_size[ch], key)) < 0)
+            return ret;
+        if ((ret = parse_channel(s, ch, seg_id)) < 0) {
+            if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                return ret;
+            s->channels[seg_id][ch].nb_groups = 0;
+        }
+        if ((ret = skip_input(s, s->ch_size[ch])) < 0)
+            return ret;
+    }
+
+    return skip_input(s, 1);
+}
+
+static int parse_meter(DBEContext *s)
+{
+    if (s->meter_size)
+        return skip_input(s, s->key_present + s->meter_size + 1);
+    return 0;
+}
+
+static void imdct_calc(DBEContext *s, DBEGroup *g, float *result, float *values)
+{
+    FFTContext *imdct = &s->imdct[g->imdct_idx];
+    int n   = 1 << imdct_bits_tab[g->imdct_idx];
+    int n2  = n >> 1;
+    int i;
+
+    switch (g->imdct_phs) {
+    case 0:
+        imdct->imdct_half(imdct, result, values);
+        for (i = 0; i < n2; i++)
+            result[n2 + i] = result[n2 - i - 1];
+        break;
+    case 1:
+        imdct->imdct_calc(imdct, result, values);
+        break;
+    case 2:
+        imdct->imdct_half(imdct, result + n2, values);
+        for (i = 0; i < n2; i++)
+            result[i] = -result[n - i - 1];
+        break;
+    default:
+        av_assert0(0);
+    }
+}
+
+static void transform(DBEContext *s, DBEChannel *c, float *history, float *output)
+{
+    LOCAL_ALIGNED_32(float, buffer, [2048]);
+    LOCAL_ALIGNED_32(float, result, [1152]);
+    DBEGroup *g;
+    int i;
+
+    memset(result, 0, 1152 * sizeof(float));
+    for (i = 0, g = c->groups; i < c->nb_groups; i++, g++) {
+        float *src = buffer + g->src_ofs;
+        float *dst = result + g->dst_ofs;
+        float *win = window + g->win_ofs;
+
+        imdct_calc(s, g, buffer, c->mantissas + g->mnt_ofs);
+        s->fdsp->vector_fmul_add(dst, src, win, dst, g->win_len);
+    }
+
+    for (i = 0; i < 256; i++)
+        output[i] = history[i] + result[i];
+    for (i = 256; i < 896; i++)
+        output[i] = result[i];
+    for (i = 0; i < 256; i++)
+        history[i] = result[896 + i];
+}
+
+static void apply_gain(DBEContext *s, int begin, int end, float *output)
+{
+    if (begin == 960 && end == 960)
+        return;
+
+    if (begin == end) {
+        s->fdsp->vector_fmul_scalar(output, output, gain_tab[end], FRAME_SAMPLES);
+    } else {
+        float a = gain_tab[begin] * (1.0f / (FRAME_SAMPLES - 1));
+        float b = gain_tab[end  ] * (1.0f / (FRAME_SAMPLES - 1));
+        int i;
+
+        for (i = 0; i < FRAME_SAMPLES; i++)
+            output[i] *= a * (FRAME_SAMPLES - i - 1) + b * i;
+    }
+}
+
+static int filter_frame(DBEContext *s, AVFrame *frame)
+{
+    const uint8_t *reorder;
+    int ch, ret;
+
+    if (s->nb_channels == 4)
+        reorder = ch_reorder_4;
+    else if (s->nb_channels == 6)
+        reorder = ch_reorder_6;
+    else if (s->nb_programs == 1 && !(s->avctx->request_channel_layout & AV_CH_LAYOUT_NATIVE))
+        reorder = ch_reorder_8;
+    else
+        reorder = ch_reorder_n;
+
+    frame->nb_samples = FRAME_SAMPLES;
+    if ((ret = ff_get_buffer(s->avctx, frame, 0)) < 0)
+        return ret;
+
+    for (ch = 0; ch < s->nb_channels; ch++) {
+        float *output = (float *)frame->extended_data[reorder[ch]];
+        transform(s, &s->channels[0][ch], s->history[ch], output);
+        transform(s, &s->channels[1][ch], s->history[ch], output + FRAME_SAMPLES / 2);
+        apply_gain(s, s->begin_gain[ch], s->end_gain[ch], output);
+    }
+
+    return 0;
+}
+
+static int dolby_e_decode_frame(AVCodecContext *avctx, void *data,
+                                int *got_frame_ptr, AVPacket *avpkt)
+{
+    DBEContext *s = avctx->priv_data;
+    int i, j, hdr, ret;
+
+    if (avpkt->size < 3)
+        return AVERROR_INVALIDDATA;
+
+    hdr = AV_RB24(avpkt->data);
+    if ((hdr & 0xfffffe) == 0x7888e) {
+        s->word_bits = 24;
+    } else if ((hdr & 0xffffe0) == 0x788e0) {
+        s->word_bits = 20;
+    } else if ((hdr & 0xfffe00) == 0x78e00) {
+        s->word_bits = 16;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Invalid frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->word_bytes  = s->word_bits + 7 >> 3;
+    s->input       = avpkt->data + s->word_bytes;
+    s->input_size  = avpkt->size / s->word_bytes - 1;
+    s->key_present = hdr >> 24 - s->word_bits & 1;
+
+    if ((ret = parse_metadata(s)) < 0)
+        return ret;
+
+    if (s->nb_programs > 1 && !s->multi_prog_warned) {
+        av_log(avctx, AV_LOG_WARNING, "Stream has %d programs (configuration %d), "
+               "channels will be output in native order.\n", s->nb_programs, s->prog_conf);
+        s->multi_prog_warned = 1;
+    }
+
+    switch (s->nb_channels) {
+    case 4:
+        avctx->channel_layout = AV_CH_LAYOUT_4POINT0;
+        break;
+    case 6:
+        avctx->channel_layout = AV_CH_LAYOUT_5POINT1;
+        break;
+    case 8:
+        avctx->channel_layout = AV_CH_LAYOUT_7POINT1;
+        break;
+    }
+
+    avctx->channels    = s->nb_channels;
+    avctx->sample_rate = sample_rate_tab[s->fr_code];
+    avctx->sample_fmt  = AV_SAMPLE_FMT_FLTP;
+
+    i = s->nb_channels / 2;
+    j = s->nb_channels;
+    if ((ret = parse_audio(s, 0, i, 0)) < 0)
+        return ret;
+    if ((ret = parse_audio(s, i, j, 0)) < 0)
+        return ret;
+    if ((ret = parse_metadata_ext(s)) < 0)
+        return ret;
+    if ((ret = parse_audio(s, 0, i, 1)) < 0)
+        return ret;
+    if ((ret = parse_audio(s, i, j, 1)) < 0)
+        return ret;
+    if ((ret = parse_meter(s)) < 0)
+        return ret;
+    if ((ret = filter_frame(s, data)) < 0)
+        return ret;
+
+    *got_frame_ptr = 1;
+    return avpkt->size;
+}
+
+static av_cold void dolby_e_flush(AVCodecContext *avctx)
+{
+    DBEContext *s = avctx->priv_data;
+
+    memset(s->history, 0, sizeof(s->history));
+}
+
+static av_cold int dolby_e_close(AVCodecContext *avctx)
+{
+    DBEContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < 3; i++)
+        ff_mdct_end(&s->imdct[i]);
+
+    av_freep(&s->fdsp);
+    return 0;
+}
+
+
+static av_cold void init_tables(void)
+{
+    int i, j;
+
+    for (i = 1; i < 17; i++)
+        mantissa_tab1[i][0] = 1.0f / (1 << i - 1);
+
+    for (i = 2; i < 16; i++) {
+        mantissa_tab1[i][1] = 1.0f  / ((1 << i) - 1);
+        mantissa_tab1[i][2] = 0.5f  / ((1 << i) - 1);
+        mantissa_tab1[i][3] = 0.25f / ((1 << i) - 1);
+    }
+
+    mantissa_tab1[i][1] = 0.5f   / (1 << 15);
+    mantissa_tab1[i][2] = 0.75f  / (1 << 15);
+    mantissa_tab1[i][3] = 0.875f / (1 << 15);
+
+    for (i = 1; i < 17; i++) {
+        mantissa_tab2[i][1] = mantissa_tab1[i][0] * 0.5f;
+        mantissa_tab2[i][2] = mantissa_tab1[i][0] * 0.75f;
+        mantissa_tab2[i][3] = mantissa_tab1[i][0] * 0.875f;
+        for (j = 1; j < 4; j++)
+            mantissa_tab3[i][j] = 1.0f / (1 << i) + 1.0f / (1 << j) - 1.0f / (1 << i + j);
+    }
+
+    mantissa_tab3[1][3] = 0.6875f;
+
+    for (i = 0; i < 25; i++) {
+        exponent_tab[i * 2    ] = 1.0f      / (1 << i);
+        exponent_tab[i * 2 + 1] = M_SQRT1_2 / (1 << i);
+    }
+
+    for (i = 1; i < 1024; i++)
+        gain_tab[i] = exp2f((i - 960) / 64.0f);
+
+    // short 1
+    ff_kbd_window_init(window, 3.0f, 128);
+    for (i = 0; i < 128; i++)
+        window[128 + i] = window[127 - i];
+
+    // start
+    for (i = 0; i < 192; i++)
+        window[256 + i] = start_window[i];
+
+    // short 2
+    for (i = 0; i < 192; i++)
+        window[448 + i] = short_window2[i];
+    for (i = 0; i < 64; i++)
+        window[640 + i] = window[63 - i];
+
+    // short 3
+    for (i = 0; i < 64; i++)
+        window[704 + i] = short_window3[i];
+    for (i = 0; i < 192; i++)
+        window[768 + i] = window[64 + i];
+
+    // bridge
+    for (i = 0; i < 128; i++)
+        window[960 + i] = window[i];
+    for (i = 0; i < 64; i++)
+        window[1088 + i] = 1.0f;
+
+    // long
+    ff_kbd_window_init(window + 1408, 3.0f, 256);
+    for (i = 0; i < 640; i++)
+        window[1664 + i] = 1.0f;
+    for (i = 0; i < 256; i++)
+        window[2304 + i] = window[1152 + i] = window[1663 - i];
+
+    // reverse start
+    for (i = 0; i < 192; i++)
+        window[2560 + i] = window[447 - i];
+
+    // reverse short 2
+    for (i = 0; i < 256; i++)
+        window[2752 + i] = window[703 - i];
+
+    // reverse short 3
+    for (i = 0; i < 256; i++)
+        window[3008 + i] = window[959 - i];
+
+    // reverse bridge
+    for (i = 0; i < 448; i++)
+        window[3264 + i] = window[1407 - i];
+}
+
+static av_cold int dolby_e_init(AVCodecContext *avctx)
+{
+    static AVOnce init_once = AV_ONCE_INIT;
+    DBEContext *s = avctx->priv_data;
+    int i;
+
+    if (ff_thread_once(&init_once, init_tables))
+        return AVERROR_UNKNOWN;
+
+    for (i = 0; i < 3; i++)
+        if (ff_mdct_init(&s->imdct[i], imdct_bits_tab[i], 1, 2.0) < 0)
+            return AVERROR(ENOMEM);
+
+    if (!(s->fdsp = avpriv_float_dsp_alloc(0)))
+        return AVERROR(ENOMEM);
+
+    s->multi_prog_warned = !!(avctx->request_channel_layout & AV_CH_LAYOUT_NATIVE);
+    s->avctx = avctx;
+    return 0;
+}
+
+AVCodec ff_dolby_e_decoder = {
+    .name           = "dolby_e",
+    .long_name      = NULL_IF_CONFIG_SMALL("Dolby E"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DOLBY_E,
+    .priv_data_size = sizeof(DBEContext),
+    .init           = dolby_e_init,
+    .decode         = dolby_e_decode_frame,
+    .close          = dolby_e_close,
+    .flush          = dolby_e_flush,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_CHANNEL_CONF,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_NONE },
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/dolby_e.h b/libavcodec/dolby_e.h
new file mode 100644
index 0000000..ae04bf6
--- /dev/null
+++ b/libavcodec/dolby_e.h
@@ -0,0 +1,647 @@
+/*
+ * Copyright (C) 2017 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DOLBY_E_H
+#define AVCODEC_DOLBY_E_H
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
+#include "libavutil/mem.h"
+
+#include "internal.h"
+#include "get_bits.h"
+#include "kbdwin.h"
+#include "fft.h"
+
+#define FRAME_SAMPLES   1792
+
+#define MAX_PROG_CONF   23
+#define MAX_PROGRAMS    8
+#define MAX_CHANNELS    8
+#define MAX_SEGMENTS    2
+
+#define MAX_GROUPS      8
+#define MAX_EXPONENTS   304
+#define MAX_MANTISSAS   1024
+
+#define MAX_MSTR_EXP    2
+#define MAX_BIAS_EXP    50
+
+typedef struct DBEGroup {
+    uint8_t         nb_exponent;
+    uint8_t         nb_bias_exp[MAX_MSTR_EXP];
+    uint16_t        exp_ofs;
+    uint16_t        mnt_ofs;
+    const uint8_t   *nb_mantissa;
+    uint8_t         imdct_idx;
+    uint8_t         imdct_phs;
+    uint16_t        win_len;
+    uint16_t        dst_ofs;
+    uint16_t        win_ofs;
+    uint16_t        src_ofs;
+} DBEGroup;
+
+typedef struct DBEChannel {
+    int     gr_code;
+    int     bw_code;
+
+    int         nb_groups;
+    int         nb_mstr_exp;
+    DBEGroup    groups[MAX_GROUPS];
+
+    int     exp_strategy[MAX_GROUPS];
+    int     exponents[MAX_EXPONENTS];
+    int     bap[MAX_EXPONENTS];
+    int     idx[MAX_EXPONENTS];
+
+    DECLARE_ALIGNED(32, float, mantissas)[MAX_MANTISSAS];
+} DBEChannel;
+
+typedef struct DBEContext {
+    AVCodecContext  *avctx;
+    GetBitContext   gb;
+
+    uint8_t     *input;
+    int         input_size;
+
+    int         word_bits;
+    int         word_bytes;
+    int         key_present;
+
+    int         prog_conf;
+    int         nb_channels;
+    int         nb_programs;
+
+    int         fr_code;
+    int         fr_code_orig;
+
+    int         ch_size[MAX_CHANNELS];
+    int         mtd_ext_size;
+    int         meter_size;
+
+    int         rev_id[MAX_CHANNELS];
+    int         begin_gain[MAX_CHANNELS];
+    int         end_gain[MAX_CHANNELS];
+
+    int         multi_prog_warned;
+
+    DBEChannel  channels[MAX_SEGMENTS][MAX_CHANNELS];
+
+    DECLARE_ALIGNED(32, float, history)[MAX_CHANNELS][256];
+
+    FFTContext          imdct[3];
+    AVFloatDSPContext   *fdsp;
+
+    uint8_t     buffer[1024 * 3 + AV_INPUT_BUFFER_PADDING_SIZE];
+} DBEContext;
+
+static const uint8_t nb_programs_tab[MAX_PROG_CONF + 1] = {
+    2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 8, 1, 2, 3, 3, 4, 5, 6, 1, 2, 3, 4, 1, 1
+};
+
+static const uint8_t nb_channels_tab[MAX_PROG_CONF + 1] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 8, 8
+};
+
+static const int8_t lfe_channel_tab[MAX_PROG_CONF + 1] = {
+     5,  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  5, 5
+};
+
+static const uint8_t ch_reorder_4[4] = { 0, 2, 1, 3 };
+static const uint8_t ch_reorder_6[6] = { 0, 2, 4, 1, 3, 5 };
+static const uint8_t ch_reorder_8[8] = { 0, 2, 6, 4, 1, 3, 7, 5 };
+static const uint8_t ch_reorder_n[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
+
+static const uint16_t sample_rate_tab[16] = {
+    0, 42965, 43008, 44800, 53706, 53760
+};
+
+static const uint8_t nb_groups_tab[4] = { 1, 8, 7, 1 };
+
+static const uint8_t nb_mstr_exp_tab[4] = { 2, 2, 2, 1 };
+
+static const uint8_t nb_mantissa_38[38] = {
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     2,  2,  2,  2,  2,  2,  3,  3,  3,  4,  4,  4,  5,  5,  6,  6,
+     7,  8,  9, 10, 11, 12,
+};
+
+static const uint8_t nb_mantissa_44[44] = {
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
+     2,  2,  2,  2,  2,  3,  3,  3,  3,  4,  4,  5,  5,  6,  7,  7,
+     8,  9, 10, 11, 12, 13, 15, 16, 18, 20, 22, 25,
+};
+
+static const uint8_t nb_mantissa_50[50] = {
+     1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,
+     3,  4,  4,  5,  5,  6,  6,  7,  8,  9,  9, 10, 12, 13, 14, 16,
+    18, 19, 22, 24, 27, 29, 32, 36, 40, 44, 49, 54, 60, 66, 74, 82,
+    90, 100,
+};
+
+static const uint8_t imdct_bits_tab[3] = { 8, 9, 11 };
+
+static const DBEGroup grp_tab_0[1] = {
+    { 50, { 27, 23 }, 0, 0, nb_mantissa_50, 2, 0, 1152, 0, 1408, 0 },
+};
+
+static const DBEGroup grp_tab_1[8] = {
+    { 38, { 12, 26 }, 0, 0, nb_mantissa_38, 0, 0, 192, 0, 256, 0 },
+    { 38, { 12, 26 }, 38, 128, nb_mantissa_38, 0, 1, 256, 64, 448, 0 },
+    { 38, { 12, 26 }, 76, 256, nb_mantissa_38, 0, 1, 256, 192, 704, 0 },
+    { 38, { 12, 26 }, 114, 384, nb_mantissa_38, 0, 1, 256, 320, 0, 0 },
+    { 38, { 12, 26 }, 152, 512, nb_mantissa_38, 0, 1, 256, 448, 0, 0 },
+    { 38, { 12, 26 }, 190, 640, nb_mantissa_38, 0, 1, 256, 576, 0, 0 },
+    { 38, { 12, 26 }, 228, 768, nb_mantissa_38, 0, 1, 256, 704, 0, 0 },
+    { 38, { 12, 26 }, 266, 896, nb_mantissa_38, 0, 1, 256, 832, 0, 0 },
+};
+
+static const DBEGroup grp_tab_2[7] = {
+    { 38, { 12, 26 }, 0, 0, nb_mantissa_38, 0, 0, 192, 0, 256, 0 },
+    { 38, { 12, 26 }, 38, 128, nb_mantissa_38, 0, 1, 256, 64, 448, 0 },
+    { 38, { 12, 26 }, 76, 256, nb_mantissa_38, 0, 1, 256, 192, 704, 0 },
+    { 38, { 12, 26 }, 114, 384, nb_mantissa_38, 0, 1, 256, 320, 0, 0 },
+    { 38, { 12, 26 }, 152, 512, nb_mantissa_38, 0, 1, 256, 448, 0, 0 },
+    { 38, { 12, 26 }, 190, 640, nb_mantissa_38, 0, 1, 256, 576, 0, 0 },
+    { 44, { 19, 25 }, 228, 768, nb_mantissa_44, 1, 1, 448, 704, 960, 64 },
+};
+
+static const DBEGroup grp_tab_3[1] = {
+    { 21, { 21 }, 0, 0, nb_mantissa_50, 2, 0, 1152, 0, 1408, 0 },
+};
+
+static const DBEGroup grp_tab_4[1] = {
+    { 50, { 27, 23 }, 0, 0, nb_mantissa_50, 2, 2, 1152, 0, 1408, 896 },
+};
+
+static const DBEGroup grp_tab_5[8] = {
+    { 38, { 12, 26 }, 0, 0, nb_mantissa_38, 0, 1, 256, 64, 0, 0 },
+    { 38, { 12, 26 }, 38, 128, nb_mantissa_38, 0, 1, 256, 192, 0, 0 },
+    { 38, { 12, 26 }, 76, 256, nb_mantissa_38, 0, 1, 256, 320, 0, 0 },
+    { 38, { 12, 26 }, 114, 384, nb_mantissa_38, 0, 1, 256, 448, 0, 0 },
+    { 38, { 12, 26 }, 152, 512, nb_mantissa_38, 0, 1, 256, 576, 0, 0 },
+    { 38, { 12, 26 }, 190, 640, nb_mantissa_38, 0, 1, 256, 704, 3008, 0 },
+    { 38, { 12, 26 }, 228, 768, nb_mantissa_38, 0, 1, 256, 832, 2752, 0 },
+    { 38, { 12, 26 }, 266, 896, nb_mantissa_38, 0, 2, 192, 960, 2560, 64 },
+};
+
+static const DBEGroup grp_tab_6[7] = {
+    { 44, { 19, 25 }, 0, 0, nb_mantissa_44, 1, 1, 448, 0, 3264, 0 },
+    { 38, { 12, 26 }, 44, 256, nb_mantissa_38, 0, 1, 256, 320, 0, 0 },
+    { 38, { 12, 26 }, 82, 384, nb_mantissa_38, 0, 1, 256, 448, 0, 0 },
+    { 38, { 12, 26 }, 120, 512, nb_mantissa_38, 0, 1, 256, 576, 0, 0 },
+    { 38, { 12, 26 }, 158, 640, nb_mantissa_38, 0, 1, 256, 704, 3008, 0 },
+    { 38, { 12, 26 }, 196, 768, nb_mantissa_38, 0, 1, 256, 832, 2752, 0 },
+    { 38, { 12, 26 }, 234, 896, nb_mantissa_38, 0, 2, 192, 960, 2560, 64 },
+};
+
+static const DBEGroup grp_tab_7[1] = {
+    { 21, { 21 }, 0, 0, nb_mantissa_50, 2, 2, 1152, 0, 1408, 896 },
+};
+
+static const DBEGroup *const frm_ofs_tab[2][4] = {
+    { grp_tab_0, grp_tab_1, grp_tab_2, grp_tab_3 },
+    { grp_tab_4, grp_tab_5, grp_tab_6, grp_tab_7 }
+};
+
+static const uint8_t mantissa_size1[16][4] = {
+    {  0,  0,  0,  0 }, {  2,  1,  1,  1 }, {  3,  2,  1,  1 }, {  4,  3,  2,  1 },
+    {  5,  4,  3,  2 }, {  6,  5,  4,  3 }, {  7,  6,  5,  4 }, {  8,  7,  6,  5 },
+    {  9,  8,  7,  6 }, { 10,  9,  8,  7 }, { 11, 10,  9,  8 }, { 12, 11, 10,  9 },
+    { 13, 12, 11, 10 }, { 14, 13, 12, 11 }, { 15, 14, 13, 12 }, { 16, 15, 14, 13 },
+};
+
+static const uint8_t mantissa_size2[16][4] = {
+    {  0,  0,  0,  0 }, {  2,  1,  2,  2 }, {  3,  2,  3,  3 }, {  4,  3,  4,  4 },
+    {  5,  4,  5,  5 }, {  6,  5,  6,  6 }, {  7,  6,  7,  7 }, {  8,  7,  8,  8 },
+    {  9,  8,  9,  9 }, { 10,  9, 10, 10 }, { 11, 10, 11, 11 }, { 12, 11, 12, 12 },
+    { 13, 12, 13, 13 }, { 14, 13, 14, 14 }, { 15, 14, 15, 15 }, { 16, 15, 16, 16 },
+};
+
+static const float start_window[192] = {
+    0.00161569379826, 0.00185748233347, 0.00198562758548, 0.00207834078104,
+    0.00215717748523, 0.00223067096393, 0.00230299213147, 0.00237651215396,
+    0.00245275561606, 0.00253281402069, 0.00261754673613, 0.00270768786168,
+    0.00280390761895, 0.00290684998656, 0.00301715751161, 0.00313548872798,
+    0.00326253122934, 0.00339901215995, 0.00354570716636, 0.00370344845023,
+    0.00387313232586, 0.00405572653911, 0.00425227750970, 0.00446391759265,
+    0.00469187240551, 0.00493746822816, 0.00520213944619, 0.00548743597507,
+    0.00579503056737, 0.00612672586953, 0.00648446105606, 0.00687031782873,
+    0.00728652552677, 0.00773546505205, 0.00821967127415, 0.00874183354619,
+    0.00930479393832, 0.00991154278653, 0.01056521116692, 0.01126905994567,
+    0.01202646513050, 0.01284089936559, 0.01371590957417, 0.01465509096066,
+    0.01566205783408, 0.01674041199523, 0.01789370972358, 0.01912542867865,
+    0.02043893626265, 0.02183746113793, 0.02332406961796, 0.02490164852364,
+    0.02657289580178, 0.02834031974193, 0.03020624702903, 0.03217283918354,
+    0.03424211623810, 0.03641598586180, 0.03869627565015, 0.04108476601498,
+    0.04358322107390, 0.04619341515939, 0.04891715301882, 0.05175628239149,
+
+    0.05471237327267, 0.05778734733755, 0.06098291402413, 0.06430101352084,
+    0.06774345212186, 0.07131188644726, 0.07500780649199, 0.07883251748595,
+    0.08278712056651, 0.08687249228061, 0.09108926295730, 0.09543779401074,
+    0.09991815425851, 0.10453009536427, 0.10927302653894, 0.11414598865987,
+    0.11914762799220, 0.12427616972097, 0.12952939152560, 0.13490459744934,
+    0.14039859233595, 0.14600765712201, 0.15172752528722, 0.15755336077528,
+    0.16347973770491, 0.16950062219342, 0.17560935661442, 0.18179864660619,
+    0.18806055113821, 0.19438647593012, 0.20076717050010, 0.20719272909882,
+    0.21365259576030, 0.22013557367283, 0.22662983904194, 0.23312295958328,
+    0.23960191774666, 0.24605313873388, 0.25246252333253, 0.25881548554631,
+    0.26509699495987, 0.27129162373316, 0.27738359807707, 0.28335685401987,
+    0.28919509723179, 0.29488186663467, 0.30040060148455, 0.30573471157819,
+    0.31086765019993, 0.31578298939317, 0.32046449711227, 0.32489621578468,
+    0.32906254179156, 0.33294830535654, 0.33653885031840, 0.33982011325336,
+    0.34277870140679, 0.34540196889300, 0.34767809062480, 0.34959613344194,
+    0.35114612391958, 0.35231911235422, 0.35310723244504, 0.35350375621308,
+
+    0.35350314372945, 0.35310108725579, 0.35229454943591, 0.35108179521634,
+    0.34946241721522, 0.34743735430290, 0.34500890320420, 0.34218072298001,
+    0.33895783229541, 0.33534659943168, 0.33135472505060, 0.32699121776996,
+    0.32226636266000, 0.31719168282019, 0.31177989424432, 0.30604485422875,
+    0.30000150362379, 0.29366580327088, 0.28705466500775, 0.28018587766131,
+    0.27307802848095, 0.26575042049535, 0.25822298630189, 0.25051619882000,
+    0.24265097955783, 0.23464860495522, 0.22653061137548, 0.21831869932335,
+    0.21003463746705, 0.20170016703857, 0.19333690717811, 0.18496626177620,
+    0.17660932835062, 0.16828680947474, 0.16001892724986, 0.15182534128597,
+    0.14372507062477, 0.13573642000364, 0.12787691082233, 0.12016321713317,
+    0.11261110693234, 0.10523538898282, 0.09804986534955, 0.09106728977263,
+    0.08429933194438, 0.07775654768810, 0.07144835495683, 0.06538301547324,
+    0.05956762170687, 0.05400808871425, 0.04870915012107, 0.04367435714993,
+    0.03890607899172, 0.03440550179663, 0.03017262174627, 0.02620622428513,
+    0.02250383492507, 0.01906161305732, 0.01587412848221, 0.01293388032354,
+    0.01023019677288, 0.00774641320626, 0.00545109736891, 0.00325868651263,
+};
+
+static const float short_window2[192] = {
+    0.00018861094606, 0.00033433010202, 0.00050309624485, 0.00070306161748,
+    0.00093995174533, 0.00121913067128, 0.00154606505568, 0.00192647806126,
+    0.00236641248692, 0.00287225985240, 0.00345077377440, 0.00410907465023,
+    0.00485464855241, 0.00569534163219, 0.00663935063508, 0.00769520981249,
+    0.00887177436246, 0.01017820046395, 0.01162392194150, 0.01321862359335,
+    0.01497221122468, 0.01689477844427, 0.01899657030441, 0.02128794388846,
+    0.02377932597692, 0.02648116795039, 0.02940389811590, 0.03255787167130,
+    0.03595331854986, 0.03960028941437, 0.04350860009563, 0.04768777479454,
+    0.05214698838949, 0.05689500821121, 0.06194013566525, 0.06729014809766,
+    0.07295224131210, 0.07893297315602, 0.08523820859989, 0.09187306673620,
+    0.09884187012422, 0.10614809690222, 0.11379433608064, 0.12178224641797,
+    0.13011251926531, 0.13878484574660, 0.14779788861830, 0.15714925912610,
+    0.16683549914631, 0.17685206886673, 0.18719334022589, 0.19785259629099,
+    0.20882203671372, 0.22009278936030, 0.23165492816694, 0.24349749722585,
+    0.25560854105961, 0.26797514099368, 0.28058345748882, 0.29341877824732,
+    0.30646557185942, 0.31970754671026, 0.33312771482295, 0.34670846027024,
+
+    0.36043161174692, 0.37427851885723, 0.38823013163645, 0.40226708279486,
+    0.41636977214436, 0.43051845264462, 0.44469331748632, 0.45887458761470,
+    0.47304259908636, 0.48717788964798, 0.50126128392546, 0.51527397661778,
+    0.52919761310050, 0.54301436685998, 0.55670701320069, 0.57025899869448,
+    0.58365450587230, 0.59687851269542, 0.60991684638414, 0.62275623122793,
+    0.63538433005035, 0.64778977905593, 0.65996221584264, 0.67189230042379,
+    0.68357172916486, 0.69499324160511, 0.70615062019861, 0.71703868307548,
+    0.72765326998919, 0.73799122168099, 0.74805035295521, 0.75782941981995,
+    0.76732808110520, 0.77654685502339, 0.78548707118622, 0.79415081863423,
+    0.80254089047207, 0.81066072573188, 0.81851434910893, 0.82610630922734,
+    0.83344161609862, 0.84052567843230, 0.84736424144524, 0.85396332579459,
+    0.86032916822973, 0.86646816451999, 0.87238681516918, 0.87809167437532,
+    0.88358930263537, 0.88888622333073, 0.89398888356256, 0.89890361943564,
+    0.90363662591861, 0.90819393133744, 0.91258137648979, 0.91680459830070,
+    0.92086901787718, 0.92477983276087, 0.92854201312583, 0.93216030163834,
+    0.93563921662343, 0.93898305819384, 0.94219591693690, 0.94528168477979,
+
+    0.94823843319821, 0.95106834367330, 0.95377776558539, 0.95636718335775,
+    0.95883679961479, 0.96118650212341, 0.96341583179195, 0.96552395212906,
+    0.96750962060547, 0.96937116231768, 0.97110644638309, 0.97271286544154,
+    0.97418731862798, 0.97552619834964, 0.97672538116257, 0.97778022299974,
+    0.97868555895586, 0.97943570778357, 0.98002448120255, 0.98044519806866,
+    0.98069070339493, 0.98075339216123, 0.98062523779637, 0.98029782516478,
+    0.97976238784222, 0.97900984942031, 0.97803086854002, 0.97681588731895,
+    0.97535518280755, 0.97363892108474, 0.97165721358452, 0.96940017523145,
+    0.96685798395452, 0.96402094114589, 0.96087953263194, 0.95742448973047,
+    0.95364684997699, 0.94953801711660, 0.94508981997396, 0.94029456983253,
+    0.93514511597504, 0.92963489905951, 0.92375800202883, 0.91750919827624,
+    0.91088399681406, 0.90387868421832, 0.89649036314692, 0.88871698725397,
+    0.88055739234735, 0.87201132366062, 0.86307945913336, 0.85376342861693,
+    0.84406582894455, 0.83399023482637, 0.82354120554757, 0.81272428745995,
+    0.80154601230457, 0.79001389138101, 0.77813640562199, 0.76592299164227,
+    0.75338402384395, 0.74053079267526, 0.72737547915460, 0.71393112578527,
+};
+
+static const float short_window3[64] = {
+    0.00326887936450, 0.00550242900936, 0.00786846643791, 0.01045683453520,
+    0.01330402120132, 0.01643221072863, 0.01985798040609, 0.02359509464766,
+    0.02765559221954, 0.03205025893128, 0.03678884369614, 0.04188015679495,
+    0.04733210987781, 0.05315172583924, 0.05934513287609, 0.06591755045290,
+    0.07287327156378, 0.08021564389822, 0.08794705152307, 0.09606889811179,
+    0.10458159240070, 0.11348453632940, 0.12277611617809, 0.13245369691511,
+    0.14251361989876, 0.15295120402567, 0.16376075037904, 0.17493555039885,
+    0.18646789757072, 0.19834910260891, 0.21056951208995, 0.22311853047787,
+    0.23598464546683, 0.24915545655419, 0.26261770674500, 0.27635731727778,
+    0.29035942525136, 0.30460842402318, 0.31908800624032, 0.33378120935681,
+    0.34867046348260, 0.36373764140285, 0.37896411059909, 0.39433078709788,
+    0.40981819096657, 0.42540650327031, 0.44107562429959, 0.45680523287270,
+    0.47257484651351, 0.48836388230077, 0.50415171818214, 0.51991775454258,
+    0.53564147581496, 0.55130251191887, 0.56688069931047, 0.58235614142007,
+    0.59770926827271, 0.61292089506118, 0.62797227945823, 0.64284517745255,
+    0.65752189749349, 0.67198535273209, 0.68621911114984, 0.70020744337099,
+};
+
+static const uint8_t dc_code_tab[5] = { 0, 0, 0, 1, 1 };
+
+static const uint8_t ht_code_tab[5] = { 0, 0, 1, 2, 2 };
+
+static const uint8_t band_ofs_tab[3][4] = {
+    { 12, 8, 4, 0 }, { 14, 10, 6, 0 }, { 12, 8, 4, 0 }
+};
+
+static const uint8_t band_low_tab[3] = { 9, 17, 24 };
+
+static const uint16_t fast_gain_tab[8] = {
+    128, 256, 384, 512, 640, 768, 896, 1024
+};
+
+static const uint16_t slow_decay_tab[2][2] = { { 27, -1 }, { 32, 21 } };
+
+static const uint16_t misc_decay_tab[3][2][2] = {
+    { { 354, -1 }, { 425, 425 } },
+    { { 266, -1 }, { 320,  -1 } },
+    { { 213, -1 }, { 256,  -1 } }
+};
+
+static const uint16_t fast_decay_tab[3][2][2][50] = {
+    {{{
+        142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+        142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+        142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+        142, 142, 142, 142, 142, 142, 142, 142,
+    }, {
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+    }}, {{
+        170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
+        170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
+        170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
+        170, 170, 170, 170, 170, 170, 170, 170,
+    }, {
+         64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
+         64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
+         64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
+         64,  64,  64,  64,  64,  64,  64,  64,
+    }}}, {{{
+        266, 266, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106,
+    }, {
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,
+    }}, {{
+        319, 319, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128,
+    }, {
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,
+    }}}, {{{
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+        106, 106, 106, 106, 106, 106, 106, 106, 106, 106,
+    }, {
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+    }}, {{
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    }, {
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
+    }}}
+};
+
+static const uint16_t fast_gain_adj_tab[3][2][62] = {
+    {{
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   1,   2,   4,   7,  11,  16,  29,  44,  59,
+         76,  94, 116, 142, 179, 221, 252, 285, 312, 334,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          2,   5,   8,  10,  15,  28,  42,  57,  75,  93,
+        115, 140, 177, 219, 247, 280, 308, 330, 427, 533,
+    }}, {{
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   2,   5,   8,  12,  21,  35,  51,  69,  89,
+        111, 138, 176, 220, 251, 284, 312, 334,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   2,
+          5,   8,  11,  18,  33,  49,  65,  84, 106, 132,
+        168, 214, 245, 279, 308, 329, 427, 533,
+    }}, {{
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   1,   4,   7,  10,  17,
+         31,  47,  65,  84, 107, 134, 171, 215, 250, 283,
+        312, 334,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   3,   6,   9,  13,  27,  43,
+         60,  79, 100, 126, 160, 207, 242, 276, 307, 329,
+        427, 533,
+    }}
+};
+
+static const uint16_t slow_gain_tab[3][2][50] = {
+    {{
+        3072, 3072, 3072, 3072, 3072, 3072, 1063, 1063, 1063, 1063,
+        1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063,
+        1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063,
+        1063, 1063, 1063, 1063, 1063, 1063, 1063, 1063,
+    }, {
+        3072, 3072, 3072, 3072, 3072, 3072,  850,  850,  850,  850,
+         850,  850,  850,  850,  850,  850,  850,  850,  850,  850,
+         850,  850,  850,  850,  850,  850,  850,  850,  850,  850,
+         850,  850,  850,  850,  850,  850,  850,  850,
+    }}, {{
+        3072, 1212, 1212, 1212,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,
+    }, {
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,
+    }}, {{
+        3072, 3072, 3072, 3072, 3072, 3072, 3072, 3072, 3072, 3072,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+         999,  999,  999,  999,  999,  999,  999,  999,  999,  999,
+    }, {
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
+    }}
+};
+
+static const uint16_t hearing_thresh_tab[3][3][50] = {
+    {{
+        1403, 1141, 1000,  959,  948,  957,  946,  925,  899,  871,
+         843,  815,  789,  766,  745,  727,  705,  687,  681,  686,
+         701,  725,  768,  854,  940, 1018, 1075, 1103, 1111, 1106,
+        1098, 1105, 1142, 1237, 1419, 1721, 2169, 2805,
+    }, {
+        1401, 1130,  995,  957,  947,  955,  941,  918,  890,  861,
+         831,  803,  777,  754,  734,  717,  698,  684,  682,  692,
+         712,  743,  798,  894,  976, 1045, 1091, 1109, 1110, 1102,
+        1098, 1116, 1174, 1300, 1526, 1884, 2401, 3072,
+    }, {
+        1393, 1086,  974,  949,  957,  941,  913,  878,  843,  808,
+         777,  750,  727,  708,  695,  686,  681,  689,  714,  752,
+         811,  888,  971, 1044, 1087, 1108, 1110, 1102, 1098, 1115,
+        1172, 1290, 1489, 1812, 2293, 2964, 3072, 3072,
+    }}, {{
+        1412, 1343, 1141, 1047, 1000,  974,  959,  951,  948,  947,
+         957,  953,  946,  936,  925,  906,  878,  850,  822,  795,
+         771,  745,  719,  700,  687,  681,  685,  701,  733,  784,
+         885,  977, 1047, 1092, 1110, 1108, 1099, 1102, 1138, 1233,
+        1413, 1711, 2157, 2797,
+    }, {
+        1412, 1336, 1130, 1040,  995,  970,  957,  950,  947,  947,
+         955,  950,  941,  930,  918,  897,  868,  838,  810,  783,
+         759,  734,  710,  693,  684,  681,  690,  712,  752,  823,
+         924, 1009, 1069, 1102, 1111, 1104, 1098, 1111, 1168, 1295,
+        1518, 1873, 2388, 3072,
+    }, {
+        1411, 1293, 1086, 1009,  974,  957,  949,  947,  957,  951,
+         941,  928,  913,  896,  878,  852,  817,  785,  756,  732,
+         713,  695,  683,  682,  689,  710,  746,  811,  906,  992,
+        1061, 1099, 1111, 1106, 1098, 1107, 1155, 1266, 1471, 1799,
+        2277, 2945, 3072, 3072,
+    }}, {{
+        1431, 1412, 1403, 1379, 1343, 1293, 1229, 1180, 1125, 1075,
+        1040, 1014,  996,  979,  965,  957,  951,  948,  947,  957,
+         951,  940,  924,  903,  877,  846,  815,  785,  753,  725,
+         702,  686,  681,  689,  714,  760,  847,  947, 1028, 1083,
+        1108, 1109, 1101, 1100, 1132, 1222, 1402, 1705, 2160, 2803,
+    }, {
+        1431, 1412, 1401, 1375, 1336, 1278, 1215, 1168, 1115, 1066,
+        1032, 1008,  991,  975,  962,  954,  950,  947,  947,  955,
+         948,  935,  916,  894,  866,  835,  803,  772,  742,  715,
+         695,  683,  683,  697,  729,  784,  887,  982, 1054, 1096,
+        1111, 1106, 1098, 1107, 1159, 1281, 1505, 1865, 2391, 3072,
+    }, {
+        1427, 1411, 1393, 1353, 1293, 1215, 1160, 1118, 1072, 1031,
+        1003,  984,  971,  960,  952,  948,  947,  957,  952,  941,
+         924,  902,  876,  847,  815,  781,  750,  723,  700,  685,
+         681,  691,  719,  766,  858,  958, 1039, 1089, 1109, 1108,
+        1099, 1102, 1141, 1245, 1442, 1766, 2250, 2930, 3072, 3072,
+    }}
+};
+
+static const int16_t lwc_gain_tab[11][7] = {
+    {   -21,  -197,  -271,  -466, 32767, 32767, 32767 },
+    {  -197,   -29,  -244,  -271,  -540, 32767, 32767 },
+    {  -271,  -244,   -29,  -249,  -271,  -593, 32767 },
+    {  -466,  -271,  -249,   -29,  -251,  -271,  -632 },
+    {  -540,  -271,  -251,   -29,  -251,  -271,  -664 },
+    {  -593,  -271,  -251,   -29,  -252,  -271,  -690 },
+    {  -632,  -271,  -252,   -29,  -252,  -271,  -711 },
+    {  -664,  -271,  -252,   -29,  -252,  -271,  -730 },
+    {  -690,  -271,  -252,   -29,  -252,  -271,  -745 },
+    {  -711,  -271,  -252,   -29,  -253,  -271,  -759 },
+    {  -730,  -271,  -253,   -29,  -253,  -271,  -771 },
+};
+
+static const int16_t lwc_adj_tab[7] = {
+    -192, -320, -448, -512, -448, -320, -192,
+};
+
+static const uint8_t log_add_tab[212] = {
+    64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 52, 51, 50,
+    49, 48, 47, 47, 46, 45, 44, 44, 43, 42, 41, 41, 40, 39, 38, 38,
+    37, 36, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 29, 28,
+    28, 27, 27, 26, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 21,
+    20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+    15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+    10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,
+     7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+     5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  0,  0,
+};
+
+static const uint8_t bap_tab[64] = {
+     0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,  3,  4,
+     4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,
+     8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
+    12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 15,
+};
+
+static float mantissa_tab1[17][4];
+static float mantissa_tab2[17][4];
+static float mantissa_tab3[17][4];
+static float exponent_tab[50];
+static float gain_tab[1024];
+
+DECLARE_ALIGNED(32, static float, window)[3712];
+
+#endif
diff --git a/libavcodec/dpcm.c b/libavcodec/dpcm.c
index 7567643..7d3934e 100644
--- a/libavcodec/dpcm.c
+++ b/libavcodec/dpcm.c
@@ -2,20 +2,20 @@
  * Assorted DPCM codecs
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,7 +44,7 @@
 #include "mathops.h"
 
 typedef struct DPCMContext {
-    int16_t roq_square_array[256];
+    int16_t array[256];
     int sample[2];                  ///< previous sample (for SOL_DPCM)
     const int8_t *sol_table;        ///< delta table for SOL_DPCM
 } DPCMContext;
@@ -118,7 +118,7 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
     int i;
 
     if (avctx->channels < 1 || avctx->channels > 2) {
-        av_log(avctx, AV_LOG_INFO, "invalid number of channels\n");
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
         return AVERROR(EINVAL);
     }
 
@@ -130,8 +130,8 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
         /* initialize square table */
         for (i = 0; i < 128; i++) {
             int16_t square = i * i;
-            s->roq_square_array[i      ] =  square;
-            s->roq_square_array[i + 128] = -square;
+            s->array[i      ] =  square;
+            s->array[i + 128] = -square;
         }
         break;
 
@@ -153,6 +153,31 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
         }
         break;
 
+    case AV_CODEC_ID_SDX2_DPCM:
+        for (i = -128; i < 128; i++) {
+            int16_t square = i * i * 2;
+            s->array[i+128] = i < 0 ? -square: square;
+        }
+        break;
+
+    case AV_CODEC_ID_GREMLIN_DPCM: {
+        int delta = 0;
+        int code = 64;
+        int step = 45;
+
+        s->array[0] = 0;
+        for (i = 0; i < 127; i++) {
+            delta += (code >> 5);
+            code  += step;
+            step  += 2;
+
+            s->array[i*2 + 1] =  delta;
+            s->array[i*2 + 2] = -delta;
+        }
+        s->array[255] = delta + (code >> 5);
+        }
+        break;
+
     default:
         break;
     }
@@ -200,18 +225,23 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
         else
             out = buf_size;
         break;
+    case AV_CODEC_ID_GREMLIN_DPCM:
+    case AV_CODEC_ID_SDX2_DPCM:
+        out = buf_size;
+        break;
     }
     if (out <= 0) {
         av_log(avctx, AV_LOG_ERROR, "packet is too small\n");
         return AVERROR(EINVAL);
     }
+    if (out % avctx->channels) {
+        av_log(avctx, AV_LOG_WARNING, "channels have differing number of samples\n");
+    }
 
     /* get output buffer */
-    frame->nb_samples = out / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = (out + avctx->channels - 1) / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples = (int16_t *)frame->data[0];
     samples_end = output_samples + out;
 
@@ -229,7 +259,7 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
 
         /* decode the samples */
         while (output_samples < samples_end) {
-            predictor[ch] += s->roq_square_array[bytestream2_get_byteu(&gb)];
+            predictor[ch] += s->array[bytestream2_get_byteu(&gb)];
             predictor[ch]  = av_clip_int16(predictor[ch]);
             *output_samples++ = predictor[ch];
 
@@ -317,6 +347,31 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
         break;
+
+    case AV_CODEC_ID_SDX2_DPCM:
+        while (output_samples < samples_end) {
+            int8_t n = bytestream2_get_byteu(&gb);
+
+            if (!(n & 1))
+                s->sample[ch] = 0;
+            s->sample[ch] += s->array[n + 128];
+            s->sample[ch]  = av_clip_int16(s->sample[ch]);
+            *output_samples++ = s->sample[ch];
+            ch ^= stereo;
+        }
+        break;
+
+    case AV_CODEC_ID_GREMLIN_DPCM: {
+        int idx = 0;
+
+        while (output_samples < samples_end) {
+            uint8_t n = bytestream2_get_byteu(&gb);
+
+            *output_samples++ = s->sample[idx] += s->array[n];
+            idx ^= 1;
+        }
+        }
+        break;
     }
 
     *got_frame_ptr = 1;
@@ -336,7 +391,9 @@ AVCodec ff_ ## name_ ## _decoder = {                        \
     .capabilities   = AV_CODEC_CAP_DR1,                     \
 }
 
+DPCM_DECODER(AV_CODEC_ID_GREMLIN_DPCM,   gremlin_dpcm,   "DPCM Gremlin");
 DPCM_DECODER(AV_CODEC_ID_INTERPLAY_DPCM, interplay_dpcm, "DPCM Interplay");
 DPCM_DECODER(AV_CODEC_ID_ROQ_DPCM,       roq_dpcm,       "DPCM id RoQ");
+DPCM_DECODER(AV_CODEC_ID_SDX2_DPCM,      sdx2_dpcm,      "DPCM Squareroot-Delta-Exact");
 DPCM_DECODER(AV_CODEC_ID_SOL_DPCM,       sol_dpcm,       "DPCM Sol");
 DPCM_DECODER(AV_CODEC_ID_XAN_DPCM,       xan_dpcm,       "DPCM Xan");
diff --git a/libavcodec/dpx.c b/libavcodec/dpx.c
index d4effa4..b1833ed 100644
--- a/libavcodec/dpx.c
+++ b/libavcodec/dpx.c
@@ -2,29 +2,43 @@
  * DPX (.dpx) image decoder
  * Copyright (c) 2009 Jimmy Christensen
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avstring.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/intfloat.h"
 #include "libavutil/imgutils.h"
 #include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
 
+static unsigned int read16(const uint8_t **ptr, int is_big)
+{
+    unsigned int temp;
+    if (is_big) {
+        temp = AV_RB16(*ptr);
+    } else {
+        temp = AV_RL16(*ptr);
+    }
+    *ptr += 2;
+    return temp;
+}
+
 static unsigned int read32(const uint8_t **ptr, int is_big)
 {
     unsigned int temp;
@@ -37,12 +51,69 @@ static unsigned int read32(const uint8_t **ptr, int is_big)
     return temp;
 }
 
-static inline unsigned make_16bit(unsigned value)
+static uint16_t read10in32_gray(const uint8_t **ptr, uint32_t *lbuf,
+                                int *n_datum, int is_big, int shift)
+{
+    uint16_t temp;
+
+    if (*n_datum)
+        (*n_datum)--;
+    else {
+        *lbuf = read32(ptr, is_big);
+        *n_datum = 2;
+    }
+
+    temp = *lbuf >> shift & 0x3FF;
+    *lbuf = *lbuf >> 10;
+
+    return temp;
+}
+
+static uint16_t read10in32(const uint8_t **ptr, uint32_t *lbuf,
+                           int *n_datum, int is_big, int shift)
+{
+    if (*n_datum)
+        (*n_datum)--;
+    else {
+        *lbuf = read32(ptr, is_big);
+        *n_datum = 2;
+    }
+
+    *lbuf = *lbuf << 10 | *lbuf >> shift & 0x3FFFFF;
+
+    return *lbuf & 0x3FF;
+}
+
+static uint16_t read12in32(const uint8_t **ptr, uint32_t *lbuf,
+                           int *n_datum, int is_big)
 {
-    // mask away invalid bits
-    value &= 0xFFC0;
-    // correctly expand to 16 bits
-    return value + (value >> 10);
+    if (*n_datum)
+        (*n_datum)--;
+    else {
+        *lbuf = read32(ptr, is_big);
+        *n_datum = 7;
+    }
+
+    switch (*n_datum){
+    case 7: return *lbuf & 0xFFF;
+    case 6: return (*lbuf >> 12) & 0xFFF;
+    case 5: {
+            uint32_t c = *lbuf >> 24;
+            *lbuf = read32(ptr, is_big);
+            c |= *lbuf << 8;
+            return c & 0xFFF;
+            }
+    case 4: return (*lbuf >> 4) & 0xFFF;
+    case 3: return (*lbuf >> 16) & 0xFFF;
+    case 2: {
+            uint32_t c = *lbuf >> 28;
+            *lbuf = read32(ptr, is_big);
+            c |= *lbuf << 4;
+            return c & 0xFFF;
+            }
+    case 1: return (*lbuf >> 8) & 0xFFF;
+    default: return *lbuf >> 20;
+    }
 }
 
 static int decode_frame(AVCodecContext *avctx,
@@ -51,17 +122,21 @@ static int decode_frame(AVCodecContext *avctx,
                         AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = avpkt->data + avpkt->size;
     int buf_size       = avpkt->size;
     AVFrame *const p = data;
-    uint8_t *ptr;
+    uint8_t *ptr[AV_NUM_DATA_POINTERS];
+    uint32_t header_version, version = 0;
+    char creator[101];
+    char input_device[33];
 
     unsigned int offset;
     int magic_num, endian;
-    int x, y, ret;
-    int w, h, stride, bits_per_color, descriptor, elements, target_packet_size, source_packet_size;
+    int x, y, stride, i, ret;
+    int w, h, bits_per_color, descriptor, elements, packing;
+    int encoding, need_align = 0;
 
-    unsigned int rgbBuffer;
+    unsigned int rgbBuffer = 0;
+    int n_datum = 0;
 
     if (avpkt->size <= 1634) {
         av_log(avctx, AV_LOG_ERROR, "Packet too small for DPX header\n");
@@ -87,11 +162,33 @@ static int decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Invalid data start offset\n");
         return AVERROR_INVALIDDATA;
     }
+
+    header_version = read32(&buf, 0);
+    if (header_version == MKTAG('V','1','.','0'))
+        version = 1;
+    if (header_version == MKTAG('V','2','.','0'))
+        version = 2;
+    if (!version)
+        av_log(avctx, AV_LOG_WARNING, "Unknown header format version %s.\n",
+               av_fourcc2str(header_version));
+
+    // Check encryption
+    buf = avpkt->data + 660;
+    ret = read32(&buf, endian);
+    if (ret != 0xFFFFFFFF) {
+        avpriv_report_missing_feature(avctx, "Encryption");
+        av_log(avctx, AV_LOG_WARNING, "The image is encrypted and may "
+               "not properly decode.\n");
+    }
+
     // Need to end in 0x304 offset from start of file
     buf = avpkt->data + 0x304;
     w = read32(&buf, endian);
     h = read32(&buf, endian);
 
+    if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
+        return ret;
+
     // Need to end in 0x320 to read the descriptor
     buf += 20;
     descriptor = buf[0];
@@ -100,108 +197,298 @@ static int decode_frame(AVCodecContext *avctx,
     buf += 3;
     avctx->bits_per_raw_sample =
     bits_per_color = buf[0];
+    buf++;
+    packing = read16(&buf, endian);
+    encoding = read16(&buf, endian);
+
+    if (encoding) {
+        avpriv_report_missing_feature(avctx, "Encoding %d", encoding);
+        return AVERROR_PATCHWELCOME;
+    }
 
-    buf += 825;
+    buf += 820;
     avctx->sample_aspect_ratio.num = read32(&buf, endian);
     avctx->sample_aspect_ratio.den = read32(&buf, endian);
+    if (avctx->sample_aspect_ratio.num > 0 && avctx->sample_aspect_ratio.den > 0)
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den,
+                  0x10000);
+    else
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+
+    if (offset >= 1724 + 4) {
+        buf = avpkt->data + 1724;
+        i = read32(&buf, endian);
+        if(i) {
+            AVRational q = av_d2q(av_int2float(i), 4096);
+            if (q.num > 0 && q.den > 0)
+                avctx->framerate = q;
+        }
+    }
 
     switch (descriptor) {
-        case 51: // RGBA
-            elements = 4;
-            break;
-        case 50: // RGB
-            elements = 3;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported descriptor %d\n", descriptor);
-            return AVERROR_INVALIDDATA;
+    case 6:  // Y
+        elements = 1;
+        break;
+    case 52: // ABGR
+    case 51: // RGBA
+    case 103: // UYVA4444
+        elements = 4;
+        break;
+    case 50: // RGB
+    case 102: // UYV444
+        elements = 3;
+        break;
+    case 100: // UYVY422
+        elements = 2;
+        break;
+    default:
+        avpriv_report_missing_feature(avctx, "Descriptor %d", descriptor);
+        return AVERROR_PATCHWELCOME;
     }
 
     switch (bits_per_color) {
-        case 8:
-            if (elements == 4) {
-                avctx->pix_fmt = AV_PIX_FMT_RGBA;
-            } else {
-                avctx->pix_fmt = AV_PIX_FMT_RGB24;
+    case 8:
+        stride = avctx->width * elements;
+        break;
+    case 10:
+        if (!packing) {
+            av_log(avctx, AV_LOG_ERROR, "Packing to 32bit required\n");
+            return -1;
+        }
+        stride = (avctx->width * elements + 2) / 3 * 4;
+        break;
+    case 12:
+        stride = avctx->width * elements;
+        if (packing) {
+            stride *= 2;
+        } else {
+            stride *= 3;
+            if (stride % 8) {
+                stride /= 8;
+                stride++;
+                stride *= 8;
             }
-            source_packet_size = elements;
-            target_packet_size = elements;
-            break;
-        case 10:
-            avctx->pix_fmt = AV_PIX_FMT_RGB48;
-            target_packet_size = 6;
-            source_packet_size = 4;
-            break;
-        case 12:
-        case 16:
-            if (endian) {
-                avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
-            } else {
-                avctx->pix_fmt = AV_PIX_FMT_RGB48LE;
-            }
-            target_packet_size = 6;
-            source_packet_size = elements * 2;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported color depth : %d\n", bits_per_color);
+            stride /= 2;
+        }
+        break;
+    case 16:
+        stride = 2 * avctx->width * elements;
+        break;
+    case 1:
+    case 32:
+    case 64:
+        avpriv_report_missing_feature(avctx, "Depth %d", bits_per_color);
+        return AVERROR_PATCHWELCOME;
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Table 3c: Runs will always break at scan line boundaries. Packing
+    // will always break to the next 32-bit word at scan-line boundaries.
+    // Unfortunately, the encoder produced invalid files, so attempt
+    // to detect it
+    need_align = FFALIGN(stride, 4);
+    if (need_align*avctx->height + (int64_t)offset > avpkt->size) {
+        // Alignment seems unappliable, try without
+        if (stride*avctx->height + (int64_t)offset > avpkt->size) {
+            av_log(avctx, AV_LOG_ERROR, "Overread buffer. Invalid header?\n");
             return AVERROR_INVALIDDATA;
+        } else {
+            av_log(avctx, AV_LOG_INFO, "Decoding DPX without scanline "
+                   "alignment.\n");
+            need_align = 0;
+        }
+    } else {
+        need_align -= stride;
+        stride = FFALIGN(stride, 4);
     }
 
-    if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
-        return ret;
+    switch (1000 * descriptor + 10 * bits_per_color + endian) {
+    case 6081:
+    case 6080:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        break;
+    case 6121:
+    case 6120:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY12;
+        break;
+    case 50081:
+    case 50080:
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        break;
+    case 52081:
+    case 52080:
+        avctx->pix_fmt = AV_PIX_FMT_ABGR;
+        break;
+    case 51081:
+    case 51080:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        break;
+    case 50100:
+    case 50101:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        break;
+    case 51100:
+    case 51101:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        break;
+    case 50120:
+    case 50121:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        break;
+    case 51120:
+    case 51121:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP12;
+        break;
+    case 6100:
+    case 6101:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY10;
+        break;
+    case 6161:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+        break;
+    case 6160:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY16LE;
+        break;
+    case 50161:
+        avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+        break;
+    case 50160:
+        avctx->pix_fmt = AV_PIX_FMT_RGB48LE;
+        break;
+    case 51161:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+        break;
+    case 51160:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA64LE;
+        break;
+    case 100081:
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+        break;
+    case 102081:
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        break;
+    case 103081:
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported format\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     ff_set_sar(avctx, avctx->sample_aspect_ratio);
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
+
+    av_strlcpy(creator, avpkt->data + 160, 100);
+    creator[100] = '\0';
+    av_dict_set(&p->metadata, "Creator", creator, 0);
+
+    av_strlcpy(input_device, avpkt->data + 1556, 32);
+    input_device[32] = '\0';
+    av_dict_set(&p->metadata, "Input Device", input_device, 0);
 
     // Move pointer to offset from start of file
     buf =  avpkt->data + offset;
 
-    ptr    = p->data[0];
-    stride = p->linesize[0];
+    for (i=0; i<AV_NUM_DATA_POINTERS; i++)
+        ptr[i] = p->data[i];
 
-    if (source_packet_size*avctx->width*avctx->height > buf_end - buf) {
-        av_log(avctx, AV_LOG_ERROR, "Overread buffer. Invalid header?\n");
-        return AVERROR_INVALIDDATA;
-    }
     switch (bits_per_color) {
-        case 10:
-            for (x = 0; x < avctx->height; x++) {
-               uint16_t *dst = (uint16_t*)ptr;
-               for (y = 0; y < avctx->width; y++) {
-                   rgbBuffer = read32(&buf, endian);
-                   // Read out the 10-bit colors and convert to 16-bit
-                   *dst++ = make_16bit(rgbBuffer >> 16);
-                   *dst++ = make_16bit(rgbBuffer >>  6);
-                   *dst++ = make_16bit(rgbBuffer <<  4);
-               }
-               ptr += stride;
+    case 10:
+        for (x = 0; x < avctx->height; x++) {
+            uint16_t *dst[4] = {(uint16_t*)ptr[0],
+                                (uint16_t*)ptr[1],
+                                (uint16_t*)ptr[2],
+                                (uint16_t*)ptr[3]};
+            int shift = elements > 1 ? packing == 1 ? 22 : 20 : packing == 1 ? 2 : 0;
+            for (y = 0; y < avctx->width; y++) {
+                if (elements >= 3)
+                    *dst[2]++ = read10in32(&buf, &rgbBuffer,
+                                           &n_datum, endian, shift);
+                if (elements == 1)
+                    *dst[0]++ = read10in32_gray(&buf, &rgbBuffer,
+                                                &n_datum, endian, shift);
+                else
+                    *dst[0]++ = read10in32(&buf, &rgbBuffer,
+                                           &n_datum, endian, shift);
+                if (elements >= 2)
+                    *dst[1]++ = read10in32(&buf, &rgbBuffer,
+                                           &n_datum, endian, shift);
+                if (elements == 4)
+                    *dst[3]++ =
+                    read10in32(&buf, &rgbBuffer,
+                               &n_datum, endian, shift);
             }
-            break;
-        case 8:
-        case 12: // Treat 12-bit as 16-bit
-        case 16:
-            if (source_packet_size == target_packet_size) {
-                for (x = 0; x < avctx->height; x++) {
-                    memcpy(ptr, buf, target_packet_size*avctx->width);
-                    ptr += stride;
-                    buf += source_packet_size*avctx->width;
+            if (memcmp(input_device, "Scanity", 7))
+                n_datum = 0;
+            for (i = 0; i < elements; i++)
+                ptr[i] += p->linesize[i];
+        }
+        break;
+    case 12:
+        for (x = 0; x < avctx->height; x++) {
+            uint16_t *dst[4] = {(uint16_t*)ptr[0],
+                                (uint16_t*)ptr[1],
+                                (uint16_t*)ptr[2],
+                                (uint16_t*)ptr[3]};
+            int shift = packing == 1 ? 4 : 0;
+            for (y = 0; y < avctx->width; y++) {
+                if (packing) {
+                    if (elements >= 3)
+                        *dst[2]++ = read16(&buf, endian) >> shift & 0xFFF;
+                    *dst[0]++ = read16(&buf, endian) >> shift & 0xFFF;
+                    if (elements >= 2)
+                        *dst[1]++ = read16(&buf, endian) >> shift & 0xFFF;
+                    if (elements == 4)
+                        *dst[3]++ = read16(&buf, endian) >> shift & 0xFFF;
+                } else {
+                    if (elements >= 3)
+                        *dst[2]++ = read12in32(&buf, &rgbBuffer,
+                                               &n_datum, endian);
+                    *dst[0]++ = read12in32(&buf, &rgbBuffer,
+                                           &n_datum, endian);
+                    if (elements >= 2)
+                        *dst[1]++ = read12in32(&buf, &rgbBuffer,
+                                               &n_datum, endian);
+                    if (elements == 4)
+                        *dst[3]++ = read12in32(&buf, &rgbBuffer,
+                                               &n_datum, endian);
                 }
-            } else {
-                for (x = 0; x < avctx->height; x++) {
-                    uint8_t *dst = ptr;
-                    for (y = 0; y < avctx->width; y++) {
-                        memcpy(dst, buf, target_packet_size);
-                        dst += target_packet_size;
-                        buf += source_packet_size;
-                    }
-                    ptr += stride;
+            }
+            n_datum = 0;
+            for (i = 0; i < elements; i++)
+                ptr[i] += p->linesize[i];
+            // Jump to next aligned position
+            buf += need_align;
+        }
+        break;
+    case 16:
+        elements *= 2;
+    case 8:
+        if (   avctx->pix_fmt == AV_PIX_FMT_YUVA444P
+            || avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+            for (x = 0; x < avctx->height; x++) {
+                ptr[0] = p->data[0] + x * p->linesize[0];
+                ptr[1] = p->data[1] + x * p->linesize[1];
+                ptr[2] = p->data[2] + x * p->linesize[2];
+                ptr[3] = p->data[3] + x * p->linesize[3];
+                for (y = 0; y < avctx->width; y++) {
+                    *ptr[1]++ = *buf++;
+                    *ptr[0]++ = *buf++;
+                    *ptr[2]++ = *buf++;
+                    if (avctx->pix_fmt == AV_PIX_FMT_YUVA444P)
+                        *ptr[3]++ = *buf++;
                 }
             }
-            break;
+        } else {
+        av_image_copy_plane(ptr[0], p->linesize[0],
+                            buf, stride,
+                            elements * avctx->width, avctx->height);
+        }
+        break;
     }
 
     *got_frame = 1;
@@ -211,7 +498,7 @@ static int decode_frame(AVCodecContext *avctx,
 
 AVCodec ff_dpx_decoder = {
     .name           = "dpx",
-    .long_name      = NULL_IF_CONFIG_SMALL("DPX image"),
+    .long_name      = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_DPX,
     .decode         = decode_frame,
diff --git a/libavcodec/dpx_parser.c b/libavcodec/dpx_parser.c
index e3a7ac5..8e4a01e 100644
--- a/libavcodec/dpx_parser.c
+++ b/libavcodec/dpx_parser.c
@@ -2,20 +2,20 @@
  * DPX parser
  * Copyright (c) 2013 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dpxenc.c b/libavcodec/dpxenc.c
index adcb529..a596033 100644
--- a/libavcodec/dpxenc.c
+++ b/libavcodec/dpxenc.c
@@ -2,20 +2,20 @@
  * DPX (.dpx) image encoder
  * Copyright (c) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,35 +28,44 @@
 typedef struct DPXContext {
     int big_endian;
     int bits_per_component;
+    int num_components;
     int descriptor;
+    int planar;
 } DPXContext;
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     DPXContext *s = avctx->priv_data;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    s->big_endian         = 1;
-    s->bits_per_component = 8;
-    s->descriptor         = 50; /* RGB */
+    s->big_endian         = !!(desc->flags & AV_PIX_FMT_FLAG_BE);
+    s->bits_per_component = desc->comp[0].depth;
+    s->num_components     = desc->nb_components;
+    s->descriptor         = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) ? 51 : 50;
+    s->planar             = !!(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
 
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_ABGR:
+        s->descriptor = 52;
+        break;
+    case AV_PIX_FMT_GRAY16BE:
+    case AV_PIX_FMT_GRAY16LE:
+    case AV_PIX_FMT_GRAY8:
+        s->descriptor = 6;
         break;
+    case AV_PIX_FMT_GBRP10BE:
+    case AV_PIX_FMT_GBRP10LE:
+    case AV_PIX_FMT_GBRP12BE:
+    case AV_PIX_FMT_GBRP12LE:
+    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_RGBA64BE:
+    case AV_PIX_FMT_RGBA64LE:
     case AV_PIX_FMT_RGBA:
-        s->descriptor = 51; /* RGBA */
         break;
     case AV_PIX_FMT_RGB48LE:
-        s->big_endian = 0;
-        /* fall-through */
     case AV_PIX_FMT_RGB48BE:
-        s->bits_per_component = avctx->bits_per_raw_sample ? avctx->bits_per_raw_sample : 16;
+        if (avctx->bits_per_raw_sample)
+            s->bits_per_component = avctx->bits_per_raw_sample;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
@@ -66,17 +75,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
-#define write16(p, value) \
-do { \
-    if (s->big_endian) AV_WB16(p, value); \
-    else               AV_WL16(p, value); \
-} while(0)
+static av_always_inline void write16_internal(int big_endian, void *p, int value)
+{
+    if (big_endian) AV_WB16(p, value);
+    else            AV_WL16(p, value);
+}
+
+static av_always_inline void write32_internal(int big_endian, void *p, int value)
+{
+    if (big_endian) AV_WB32(p, value);
+    else            AV_WL32(p, value);
+}
 
-#define write32(p, value) \
-do { \
-    if (s->big_endian) AV_WB32(p, value); \
-    else               AV_WL32(p, value); \
-} while(0)
+#define write16(p, value) write16_internal(s->big_endian, p, value)
+#define write32(p, value) write32_internal(s->big_endian, p, value)
 
 static void encode_rgb48_10bit(AVCodecContext *avctx, const AVFrame *pic,
                                uint8_t *dst)
@@ -88,14 +100,14 @@ static void encode_rgb48_10bit(AVCodecContext *avctx, const AVFrame *pic,
     for (y = 0; y < avctx->height; y++) {
         for (x = 0; x < avctx->width; x++) {
             int value;
-            if ((avctx->pix_fmt & 1)) {
-                value = ((AV_RB16(src + 6*x + 4) & 0xFFC0) >> 4)
-                      | ((AV_RB16(src + 6*x + 2) & 0xFFC0) << 6)
-                      | ((AV_RB16(src + 6*x + 0) & 0xFFC0) << 16);
+            if (s->big_endian) {
+                value = ((AV_RB16(src + 6*x + 4) & 0xFFC0U) >> 4)
+                      | ((AV_RB16(src + 6*x + 2) & 0xFFC0U) << 6)
+                      | ((AV_RB16(src + 6*x + 0) & 0xFFC0U) << 16);
             } else {
-                value = ((AV_RL16(src + 6*x + 4) & 0xFFC0) >> 4)
-                      | ((AV_RL16(src + 6*x + 2) & 0xFFC0) << 6)
-                      | ((AV_RL16(src + 6*x + 0) & 0xFFC0) << 16);
+                value = ((AV_RL16(src + 6*x + 4) & 0xFFC0U) >> 4)
+                      | ((AV_RL16(src + 6*x + 2) & 0xFFC0U) << 6)
+                      | ((AV_RL16(src + 6*x + 0) & 0xFFC0U) << 16);
             }
             write32(dst, value);
             dst += 4;
@@ -104,23 +116,88 @@ static void encode_rgb48_10bit(AVCodecContext *avctx, const AVFrame *pic,
     }
 }
 
+static void encode_gbrp10(AVCodecContext *avctx, const AVFrame *pic, uint8_t *dst)
+{
+    DPXContext *s = avctx->priv_data;
+    const uint8_t *src[3] = {pic->data[0], pic->data[1], pic->data[2]};
+    int x, y, i;
+
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++) {
+            int value;
+            if (s->big_endian) {
+                value = (AV_RB16(src[0] + 2*x) << 12)
+                      | (AV_RB16(src[1] + 2*x) << 2)
+                      | ((unsigned)AV_RB16(src[2] + 2*x) << 22);
+            } else {
+                value = (AV_RL16(src[0] + 2*x) << 12)
+                      | (AV_RL16(src[1] + 2*x) << 2)
+                      | ((unsigned)AV_RL16(src[2] + 2*x) << 22);
+            }
+            write32(dst, value);
+            dst += 4;
+        }
+        for (i = 0; i < 3; i++)
+            src[i] += pic->linesize[i];
+    }
+}
+
+static void encode_gbrp12(AVCodecContext *avctx, const AVFrame *pic, uint16_t *dst)
+{
+    DPXContext *s = avctx->priv_data;
+    const uint16_t *src[3] = {(uint16_t*)pic->data[0],
+                              (uint16_t*)pic->data[1],
+                              (uint16_t*)pic->data[2]};
+    int x, y, i, pad;
+    pad = avctx->width*6;
+    pad = (FFALIGN(pad, 4) - pad) >> 1;
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++) {
+            uint16_t value[3];
+            if (s->big_endian) {
+                value[1] = AV_RB16(src[0] + x) << 4;
+                value[2] = AV_RB16(src[1] + x) << 4;
+                value[0] = AV_RB16(src[2] + x) << 4;
+            } else {
+                value[1] = AV_RL16(src[0] + x) << 4;
+                value[2] = AV_RL16(src[1] + x) << 4;
+                value[0] = AV_RL16(src[2] + x) << 4;
+            }
+            for (i = 0; i < 3; i++)
+                write16(dst++, value[i]);
+        }
+        for (i = 0; i < pad; i++)
+            *dst++ = 0;
+        for (i = 0; i < 3; i++)
+            src[i] += pic->linesize[i]/2;
+    }
+}
+
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *frame, int *got_packet)
 {
     DPXContext *s = avctx->priv_data;
-    int size, ret;
+    int size, ret, need_align, len;
     uint8_t *buf;
 
 #define HEADER_SIZE 1664  /* DPX Generic header */
     if (s->bits_per_component == 10)
         size = avctx->height * avctx->width * 4;
-    else
-        size = av_image_get_buffer_size(avctx->pix_fmt,
-                                        avctx->width, avctx->height, 1);
-    if ((ret = ff_alloc_packet(pkt, size + HEADER_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+    else if (s->bits_per_component == 12) {
+        // 3 components, 12 bits put on 16 bits
+        len  = avctx->width*6;
+        size = FFALIGN(len, 4);
+        need_align = size - len;
+        size *= avctx->height;
+    } else {
+        // N components, M bits
+        len = avctx->width * s->num_components * s->bits_per_component >> 3;
+        size = FFALIGN(len, 4);
+        need_align = size - len;
+        size *= avctx->height;
     }
+    if ((ret = ff_alloc_packet2(avctx, pkt, size + HEADER_SIZE, 0)) < 0)
+        return ret;
     buf = pkt->data;
 
     memset(buf, 0, HEADER_SIZE);
@@ -144,26 +221,45 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     buf[801] = 2; /* linear transfer */
     buf[802] = 2; /* linear colorimetric */
     buf[803] = s->bits_per_component;
-    write16(buf + 804, s->bits_per_component == 10 ? 1 : 0); /* packing method */
+    write16(buf + 804, (s->bits_per_component == 10 || s->bits_per_component == 12) ?
+                       1 : 0); /* packing method */
     write32(buf + 808, HEADER_SIZE); /* data offset */
 
     /* Image source information header */
     write32(buf + 1628, avctx->sample_aspect_ratio.num);
     write32(buf + 1632, avctx->sample_aspect_ratio.den);
 
-    switch (s->bits_per_component) {
+    switch(s->bits_per_component) {
     case 8:
     case 16:
-        size = av_image_copy_to_buffer(buf + HEADER_SIZE,
-                                       pkt->size - HEADER_SIZE,
-                                       frame->data, frame->linesize,
-                                       avctx->pix_fmt,
-                                       avctx->width, avctx->height, 1);
+        if (need_align) {
+            int j;
+            const uint8_t *src = frame->data[0];
+            uint8_t *dst = pkt->data + HEADER_SIZE;
+            size = (len + need_align) * avctx->height;
+            for (j=0; j<avctx->height; j++) {
+                memcpy(dst, src, len);
+                memset(dst + len, 0, need_align);
+                dst += len + need_align;
+                src += frame->linesize[0];
+            }
+        } else {
+            size = av_image_copy_to_buffer(buf + HEADER_SIZE, pkt->size - HEADER_SIZE,
+                                           (const uint8_t**)frame->data, frame->linesize,
+                                           avctx->pix_fmt,
+                                           avctx->width, avctx->height, 1);
+        }
         if (size < 0)
             return size;
         break;
     case 10:
-        encode_rgb48_10bit(avctx, frame, buf + HEADER_SIZE);
+        if (s->planar)
+            encode_gbrp10(avctx, frame, buf + HEADER_SIZE);
+        else
+            encode_rgb48_10bit(avctx, frame, buf + HEADER_SIZE);
+        break;
+    case 12:
+        encode_gbrp12(avctx, frame, (uint16_t*)(buf + HEADER_SIZE));
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", s->bits_per_component);
@@ -181,17 +277,20 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 }
 
 AVCodec ff_dpx_encoder = {
-    .name = "dpx",
-    .long_name = NULL_IF_CONFIG_SMALL("DPX image"),
-    .type = AVMEDIA_TYPE_VIDEO,
-    .id   = AV_CODEC_ID_DPX,
+    .name           = "dpx",
+    .long_name      = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DPX,
     .priv_data_size = sizeof(DPXContext),
-    .init   = encode_init,
-    .encode2 = encode_frame,
-    .pix_fmts = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24,
-        AV_PIX_FMT_RGBA,
-        AV_PIX_FMT_RGB48LE,
-        AV_PIX_FMT_RGB48BE,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_RGB24,    AV_PIX_FMT_RGBA, AV_PIX_FMT_ABGR,
+        AV_PIX_FMT_GRAY16LE, AV_PIX_FMT_GRAY16BE,
+        AV_PIX_FMT_RGB48LE,  AV_PIX_FMT_RGB48BE,
+        AV_PIX_FMT_RGBA64LE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_GBRP10LE, AV_PIX_FMT_GBRP10BE,
+        AV_PIX_FMT_GBRP12LE, AV_PIX_FMT_GBRP12BE,
         AV_PIX_FMT_NONE},
 };
diff --git a/libavcodec/dsd.c b/libavcodec/dsd.c
new file mode 100644
index 0000000..9104f38
--- /dev/null
+++ b/libavcodec/dsd.c
@@ -0,0 +1,86 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+#include "dsd_tablegen.h"
+#include "dsd.h"
+
+static av_cold void dsd_ctables_tableinit(void)
+{
+    int t, e, m, sign;
+    double acc[CTABLES];
+    for (e = 0; e < 256; ++e) {
+        memset(acc, 0, sizeof(acc));
+        for (m = 0; m < 8; ++m) {
+            sign = (((e >> (7 - m)) & 1) * 2 - 1);
+            for (t = 0; t < CTABLES; ++t)
+                acc[t] += sign * htaps[t * 8 + m];
+        }
+        for (t = 0; t < CTABLES; ++t)
+            ctables[CTABLES - 1 - t][e] = acc[t];
+    }
+}
+
+av_cold void ff_init_dsd_data(void)
+{
+    static int done = 0;
+    if (done)
+        return;
+    dsd_ctables_tableinit();
+    done = 1;
+}
+
+void ff_dsd2pcm_translate(DSDContext* s, size_t samples, int lsbf,
+                          const unsigned char *src, ptrdiff_t src_stride,
+                          float *dst, ptrdiff_t dst_stride)
+{
+    unsigned pos, i;
+    unsigned char* p;
+    double sum;
+
+    pos = s->pos;
+
+    while (samples-- > 0) {
+        s->buf[pos] = lsbf ? ff_reverse[*src] : *src;
+        src += src_stride;
+
+        p = s->buf + ((pos - CTABLES) & FIFOMASK);
+        *p = ff_reverse[*p];
+
+        sum = 0.0;
+        for (i = 0; i < CTABLES; i++) {
+            unsigned char a = s->buf[(pos                   - i) & FIFOMASK];
+            unsigned char b = s->buf[(pos - (CTABLES*2 - 1) + i) & FIFOMASK];
+            sum += ctables[i][a] + ctables[i][b];
+        }
+
+        *dst = (float)sum;
+        dst += dst_stride;
+
+        pos = (pos + 1) & FIFOMASK;
+    }
+
+    s->pos = pos;
+}
diff --git a/libavcodec/dsd.h b/libavcodec/dsd.h
new file mode 100644
index 0000000..5ca4574
--- /dev/null
+++ b/libavcodec/dsd.h
@@ -0,0 +1,52 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DSD_H
+#define AVCODEC_DSD_H
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+
+#define HTAPS   48               /** number of FIR constants */
+#define FIFOSIZE 16              /** must be a power of two */
+#define FIFOMASK (FIFOSIZE - 1)  /** bit mask for FIFO offsets */
+
+#if FIFOSIZE * 8 < HTAPS * 2
+#error "FIFOSIZE too small"
+#endif
+
+/**
+ * Per-channel buffer
+ */
+typedef struct DSDContext {
+    unsigned char buf[FIFOSIZE];
+    unsigned pos;
+} DSDContext;
+
+void ff_init_dsd_data(void);
+
+void ff_dsd2pcm_translate(DSDContext* s, size_t samples, int lsbf,
+                          const unsigned char *src, ptrdiff_t src_stride,
+                          float *dst, ptrdiff_t dst_stride);
+#endif /* AVCODEC_DSD_H */
diff --git a/libavcodec/dsd_tablegen.h b/libavcodec/dsd_tablegen.h
new file mode 100644
index 0000000..e5da86a
--- /dev/null
+++ b/libavcodec/dsd_tablegen.h
@@ -0,0 +1,75 @@
+/*
+ * Header file for hardcoded DSD tables
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DSD_TABLEGEN_H
+#define AVCODEC_DSD_TABLEGEN_H
+
+#include <stdint.h>
+#include "libavutil/attributes.h"
+#include "dsd.h"
+
+#define HTAPS   48                /** number of FIR constants */
+#define CTABLES ((HTAPS + 7) / 8) /** number of "8 MACs" lookup tables */
+
+#include "libavutil/common.h"
+
+/*
+ * Properties of this 96-tap lowpass filter when applied on a signal
+ * with sampling rate of 44100*64 Hz:
+ *
+ * () has a delay of 17 microseconds.
+ *
+ * () flat response up to 48 kHz
+ *
+ * () if you downsample afterwards by a factor of 8, the
+ *    spectrum below 70 kHz is practically alias-free.
+ *
+ * () stopband rejection is about 160 dB
+ *
+ * The coefficient tables ("ctables") take only 6 Kibi Bytes and
+ * should fit into a modern processor's fast cache.
+ */
+
+/**
+ * The 2nd half (48 coeffs) of a 96-tap symmetric lowpass filter
+ */
+static const double htaps[HTAPS] = {
+     0.09950731974056658,    0.09562845727714668,    0.08819647126516944,
+     0.07782552527068175,    0.06534876523171299,    0.05172629311427257,
+     0.0379429484910187,     0.02490921351762261,    0.0133774746265897,
+     0.003883043418804416,  -0.003284703416210726,  -0.008080250212687497,
+    -0.01067241812471033,   -0.01139427235000863,   -0.0106813877974587,
+    -0.009007905078766049,  -0.006828859761015335,  -0.004535184322001496,
+    -0.002425035959059578,  -0.0006922187080790708,  0.0005700762133516592,
+     0.001353838005269448,   0.001713709169690937,   0.001742046839472948,
+     0.001545601648013235,   0.001226696225277855,   0.0008704322683580222,
+     0.0005381636200535649,  0.000266446345425276,   7.002968738383528e-05,
+    -5.279407053811266e-05, -0.0001140625650874684, -0.0001304796361231895,
+    -0.0001189970287491285, -9.396247155265073e-05, -6.577634378272832e-05,
+    -4.07492895872535e-05,  -2.17407957554587e-05,  -9.163058931391722e-06,
+    -2.017460145032201e-06,  1.249721855219005e-06,  2.166655190537392e-06,
+     1.930520892991082e-06,  1.319400334374195e-06,  7.410039764949091e-07,
+     3.423230509967409e-07,  1.244182214744588e-07,  3.130441005359396e-08
+};
+
+static float ctables[CTABLES][256];
+#endif /* AVCODEC_DSD_TABLEGEN_H */
diff --git a/libavcodec/dsddec.c b/libavcodec/dsddec.c
new file mode 100644
index 0000000..2c5c357
--- /dev/null
+++ b/libavcodec/dsddec.c
@@ -0,0 +1,113 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Direct Stream Digital (DSD) decoder
+ */
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+#include "dsd.h"
+
+#define DSD_SILENCE 0x69
+/* 0x69 = 01101001
+ * This pattern "on repeat" makes a low energy 352.8 kHz tone
+ * and a high energy 1.0584 MHz tone which should be filtered
+ * out completely by any playback system --> silence
+ */
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DSDContext * s;
+    int i;
+    uint8_t silence;
+
+    ff_init_dsd_data();
+
+    s = av_malloc_array(sizeof(DSDContext), avctx->channels);
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    silence = avctx->codec_id == AV_CODEC_ID_DSD_LSBF || avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR ? ff_reverse[DSD_SILENCE] : DSD_SILENCE;
+    for (i = 0; i < avctx->channels; i++) {
+        s[i].pos = 0;
+        memset(s[i].buf, silence, sizeof(s[i].buf));
+    }
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->priv_data  = s;
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    DSDContext * s = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret, i;
+    int lsbf = avctx->codec_id == AV_CODEC_ID_DSD_LSBF || avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR;
+    int src_next;
+    int src_stride;
+
+    frame->nb_samples = avpkt->size / avctx->channels;
+
+    if (avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR || avctx->codec_id == AV_CODEC_ID_DSD_MSBF_PLANAR) {
+        src_next   = frame->nb_samples;
+        src_stride = 1;
+    } else {
+        src_next   = 1;
+        src_stride = avctx->channels;
+    }
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (i = 0; i < avctx->channels; i++) {
+        float * dst = ((float **)frame->extended_data)[i];
+        ff_dsd2pcm_translate(&s[i], frame->nb_samples, lsbf,
+            avpkt->data + i * src_next, src_stride,
+            dst, 1);
+    }
+
+    *got_frame_ptr = 1;
+    return frame->nb_samples * avctx->channels;
+}
+
+#define DSD_DECODER(id_, name_, long_name_) \
+AVCodec ff_##name_##_decoder = { \
+    .name         = #name_, \
+    .long_name    = NULL_IF_CONFIG_SMALL(long_name_), \
+    .type         = AVMEDIA_TYPE_AUDIO, \
+    .id           = AV_CODEC_ID_##id_, \
+    .init         = decode_init, \
+    .decode       = decode_frame, \
+    .sample_fmts  = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP, \
+                                                   AV_SAMPLE_FMT_NONE }, \
+};
+
+DSD_DECODER(DSD_LSBF, dsd_lsbf, "DSD (Direct Stream Digital), least significant bit first")
+DSD_DECODER(DSD_MSBF, dsd_msbf, "DSD (Direct Stream Digital), most significant bit first")
+DSD_DECODER(DSD_MSBF_PLANAR, dsd_msbf_planar, "DSD (Direct Stream Digital), most significant bit first, planar")
+DSD_DECODER(DSD_LSBF_PLANAR, dsd_lsbf_planar, "DSD (Direct Stream Digital), least significant bit first, planar")
diff --git a/libavcodec/dsicinaudio.c b/libavcodec/dsicinaudio.c
index e0fecbe..290dab4 100644
--- a/libavcodec/dsicinaudio.c
+++ b/libavcodec/dsicinaudio.c
@@ -2,20 +2,20 @@
  * Delphine Software International CIN audio decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,10 +98,8 @@ static int cinaudio_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avpkt->size - cin->initial_decode_frame;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     delta = cin->delta;
diff --git a/libavcodec/dsicinvideo.c b/libavcodec/dsicinvideo.c
index 7c62dcf..37175d6 100644
--- a/libavcodec/dsicinvideo.c
+++ b/libavcodec/dsicinvideo.c
@@ -2,20 +2,20 @@
  * Delphine Software International CIN video decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,10 +42,33 @@ typedef struct CinVideoContext {
     uint8_t *bitmap_table[3];
 } CinVideoContext;
 
+static av_cold void destroy_buffers(CinVideoContext *cin)
+{
+    int i;
+
+    for (i = 0; i < 3; ++i)
+        av_freep(&cin->bitmap_table[i]);
+}
+
+static av_cold int allocate_buffers(CinVideoContext *cin)
+{
+    int i;
+
+    for (i = 0; i < 3; ++i) {
+        cin->bitmap_table[i] = av_mallocz(cin->bitmap_size);
+        if (!cin->bitmap_table[i]) {
+            av_log(cin->avctx, AV_LOG_ERROR, "Can't allocate bitmap buffers.\n");
+            destroy_buffers(cin);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
 static av_cold int cinvideo_decode_init(AVCodecContext *avctx)
 {
     CinVideoContext *cin = avctx->priv_data;
-    unsigned int i;
 
     cin->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -55,11 +78,8 @@ static av_cold int cinvideo_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     cin->bitmap_size = avctx->width * avctx->height;
-    for (i = 0; i < 3; ++i) {
-        cin->bitmap_table[i] = av_mallocz(cin->bitmap_size);
-        if (!cin->bitmap_table[i])
-            av_log(avctx, AV_LOG_ERROR, "Can't allocate bitmap buffers.\n");
-    }
+    if (allocate_buffers(cin))
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -138,30 +158,40 @@ static int cin_decode_lzss(const unsigned char *src, int src_size,
         }
     }
 
+    if (dst_end - dst > dst_size - dst_size/10)
+        return AVERROR_INVALIDDATA;
+
     return 0;
 }
 
-static void cin_decode_rle(const unsigned char *src, int src_size,
+static int cin_decode_rle(const unsigned char *src, int src_size,
                            unsigned char *dst, int dst_size)
 {
     int len, code;
     unsigned char *dst_end       = dst + dst_size;
     const unsigned char *src_end = src + src_size;
 
-    while (src < src_end && dst < dst_end) {
+    while (src + 1 < src_end && dst < dst_end) {
         code = *src++;
         if (code & 0x80) {
-            if (src >= src_end)
-                break;
             len = code - 0x7F;
             memset(dst, *src++, FFMIN(len, dst_end - dst));
         } else {
             len = code + 1;
+            if (len > src_end-src) {
+                av_log(NULL, AV_LOG_ERROR, "RLE overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             memcpy(dst, src, FFMIN3(len, dst_end - dst, src_end - src));
             src += len;
         }
         dst += len;
     }
+
+    if (dst_end - dst > dst_size - dst_size/10)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
 }
 
 static int cinvideo_decode_frame(AVCodecContext *avctx,
@@ -188,44 +218,50 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
         if (palette_colors_count > 256)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < palette_colors_count; ++i) {
-            cin->palette[i]    = bytestream_get_le24(&buf);
+            cin->palette[i]    = 0xFFU << 24 | bytestream_get_le24(&buf);
             bitmap_frame_size -= 3;
         }
     } else {
         for (i = 0; i < palette_colors_count; ++i) {
-            cin->palette[buf[0]] = AV_RL24(buf + 1);
+            cin->palette[buf[0]] = 0xFFU << 24 | AV_RL24(buf + 1);
             buf                 += 4;
             bitmap_frame_size   -= 4;
         }
     }
 
-    bitmap_frame_size = FFMIN(cin->bitmap_size, bitmap_frame_size);
-
     /* note: the decoding routines below assumes that
      * surface.width = surface.pitch */
     switch (bitmap_frame_type) {
     case 9:
-        cin_decode_rle(buf, bitmap_frame_size,
+        res =  cin_decode_rle(buf, bitmap_frame_size,
                        cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
+        if (res < 0)
+            return res;
         break;
     case 34:
-        cin_decode_rle(buf, bitmap_frame_size,
+        res =  cin_decode_rle(buf, bitmap_frame_size,
                        cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
+        if (res < 0)
+            return res;
         cin_apply_delta_data(cin->bitmap_table[CIN_PRE_BMP],
                              cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
         break;
     case 35:
-        cin_decode_huffman(buf, bitmap_frame_size,
+        bitmap_frame_size = cin_decode_huffman(buf, bitmap_frame_size,
                            cin->bitmap_table[CIN_INT_BMP], cin->bitmap_size);
-        cin_decode_rle(cin->bitmap_table[CIN_INT_BMP], bitmap_frame_size,
+        res =  cin_decode_rle(cin->bitmap_table[CIN_INT_BMP], bitmap_frame_size,
                        cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
+        if (res < 0)
+            return res;
         break;
     case 36:
         bitmap_frame_size = cin_decode_huffman(buf, bitmap_frame_size,
                                                cin->bitmap_table[CIN_INT_BMP],
                                                cin->bitmap_size);
-        cin_decode_rle(cin->bitmap_table[CIN_INT_BMP], bitmap_frame_size,
+        res = cin_decode_rle(cin->bitmap_table[CIN_INT_BMP], bitmap_frame_size,
                        cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
+        if (res < 0)
+            return res;
         cin_apply_delta_data(cin->bitmap_table[CIN_PRE_BMP],
                              cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
         break;
@@ -251,11 +287,8 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
         break;
     }
 
-    if ((res = ff_reget_buffer(avctx, cin->frame)) < 0) {
-        av_log(cin->avctx, AV_LOG_ERROR,
-               "delphinecinvideo: reget_buffer() failed to allocate a frame\n");
+    if ((res = ff_reget_buffer(avctx, cin->frame)) < 0)
         return res;
-    }
 
     memcpy(cin->frame->data[1], cin->palette, sizeof(cin->palette));
     cin->frame->palette_has_changed = 1;
@@ -278,12 +311,10 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
 static av_cold int cinvideo_decode_end(AVCodecContext *avctx)
 {
     CinVideoContext *cin = avctx->priv_data;
-    int i;
 
     av_frame_free(&cin->frame);
 
-    for (i = 0; i < 3; ++i)
-        av_free(cin->bitmap_table[i]);
+    destroy_buffers(cin);
 
     return 0;
 }
@@ -297,5 +328,6 @@ AVCodec ff_dsicinvideo_decoder = {
     .init           = cinvideo_decode_init,
     .close          = cinvideo_decode_end,
     .decode         = cinvideo_decode_frame,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/dss_sp.c b/libavcodec/dss_sp.c
index 4fe784c..14025fc 100644
--- a/libavcodec/dss_sp.c
+++ b/libavcodec/dss_sp.c
@@ -2,20 +2,20 @@
  * Digital Speech Standard - Standard Play mode (DSS SP) audio decoder.
  * Copyright (C) 2014 Oleksij Rempel <linux@rempel-privat.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #define SUBFRAMES 4
@@ -33,7 +33,7 @@
 
 #define DSS_SP_FRAME_SIZE        42
 #define DSS_SP_SAMPLE_COUNT     (66 * SUBFRAMES)
-#define DSS_SP_FORMULA(a, b, c) ((((a) << 15) + (b) * (c)) + 0x4000) >> 15
+#define DSS_SP_FORMULA(a, b, c) ((int)((((a) * (1 << 15)) + (b) * (unsigned)(c)) + 0x4000) >> 15)
 
 typedef struct DssSpSubframe {
     int16_t gain;
@@ -50,6 +50,7 @@ typedef struct DssSpFrame {
 } DssSpFrame;
 
 typedef struct DssSpContext {
+    AVCodecContext *avctx;
     int32_t excitation[288 + 6];
     int32_t history[187];
     DssSpFrame fparam;
@@ -296,13 +297,14 @@ static av_cold int dss_sp_decode_init(AVCodecContext *avctx)
 
     memset(p->history, 0, sizeof(p->history));
     p->pulse_dec_mode = 1;
+    p->avctx          = avctx;
 
     return 0;
 }
 
 static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     DssSpFrame *fparam = &p->fparam;
     int i;
     int subframe_idx;
@@ -315,24 +317,24 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
         p->bits[i + 1] = src[i];
     }
 
-    bitstream_init8(&bc, p->bits, DSS_SP_FRAME_SIZE);
+    init_get_bits(&gb, p->bits, DSS_SP_FRAME_SIZE * 8);
 
     for (i = 0; i < 2; i++)
-        fparam->filter_idx[i] = bitstream_read(&bc, 5);
+        fparam->filter_idx[i] = get_bits(&gb, 5);
     for (; i < 8; i++)
-        fparam->filter_idx[i] = bitstream_read(&bc, 4);
+        fparam->filter_idx[i] = get_bits(&gb, 4);
     for (; i < 14; i++)
-        fparam->filter_idx[i] = bitstream_read(&bc, 3);
+        fparam->filter_idx[i] = get_bits(&gb, 3);
 
     for (subframe_idx = 0; subframe_idx < 4; subframe_idx++) {
-        fparam->sf_adaptive_gain[subframe_idx] = bitstream_read(&bc, 5);
+        fparam->sf_adaptive_gain[subframe_idx] = get_bits(&gb, 5);
 
-        fparam->sf[subframe_idx].combined_pulse_pos = bitstream_read(&bc, 31);
+        fparam->sf[subframe_idx].combined_pulse_pos = get_bits_long(&gb, 31);
 
-        fparam->sf[subframe_idx].gain = bitstream_read(&bc, 6);
+        fparam->sf[subframe_idx].gain = get_bits(&gb, 6);
 
         for (i = 0; i < 7; i++)
-            fparam->sf[subframe_idx].pulse_val[i] = bitstream_read(&bc, 3);
+            fparam->sf[subframe_idx].pulse_val[i] = get_bits(&gb, 3);
     }
 
     for (subframe_idx = 0; subframe_idx < 4; subframe_idx++) {
@@ -378,7 +380,7 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
                 if (C72_binomials[index] <= combined_pulse_pos) {
                     combined_pulse_pos -= C72_binomials[index];
 
-                    fparam->sf[subframe_idx].pulse_pos[(index ^ 7) - 1] = i;
+                    fparam->sf[subframe_idx].pulse_pos[6 - index] = i;
 
                     if (!index)
                         break;
@@ -394,16 +396,21 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
         }
     }
 
-    combined_pitch = bitstream_read(&bc, 24);
+    combined_pitch = get_bits(&gb, 24);
 
     fparam->pitch_lag[0] = (combined_pitch % 151) + 36;
 
     combined_pitch /= 151;
 
-    for (i = 1; i < SUBFRAMES; i++) {
+    for (i = 1; i < SUBFRAMES - 1; i++) {
         fparam->pitch_lag[i] = combined_pitch % 48;
         combined_pitch      /= 48;
     }
+    if (combined_pitch > 47) {
+        av_log (p->avctx, AV_LOG_WARNING, "combined_pitch was too large\n");
+        combined_pitch = 0;
+    }
+    fparam->pitch_lag[i] = combined_pitch;
 
     pitch_lag = fparam->pitch_lag[0];
     for (i = 1; i < SUBFRAMES; i++) {
@@ -492,7 +499,7 @@ static void dss_sp_scale_vector(int32_t *vec, int bits, int size)
             vec[i] = vec[i] >> -bits;
     else
         for (i = 0; i < size; i++)
-            vec[i] = vec[i] << bits;
+            vec[i] = vec[i] * (1 << bits);
 }
 
 static void dss_sp_update_buf(int32_t *hist, int32_t *vector)
@@ -517,12 +524,12 @@ static void dss_sp_shift_sq_sub(const int32_t *filter_buf,
         tmp = dst[a] * filter_buf[0];
 
         for (i = 14; i > 0; i--)
-            tmp -= error_buf[i] * filter_buf[i];
+            tmp -= error_buf[i] * (unsigned)filter_buf[i];
 
         for (i = 14; i > 0; i--)
             error_buf[i] = error_buf[i - 1];
 
-        tmp = (tmp + 4096) >> 13;
+        tmp = (int)(tmp + 4096U) >> 13;
 
         error_buf[1] = tmp;
 
@@ -754,10 +761,8 @@ static int dss_sp_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = DSS_SP_SAMPLE_COUNT;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed.\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     out = (int16_t *)frame->data[0];
 
diff --git a/libavcodec/dstdec.c b/libavcodec/dstdec.c
new file mode 100644
index 0000000..0614c99
--- /dev/null
+++ b/libavcodec/dstdec.c
@@ -0,0 +1,374 @@
+/*
+ * Direct Stream Transfer (DST) decoder
+ * Copyright (c) 2014 Peter Ross <pross@xvid.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Direct Stream Transfer (DST) decoder
+ * ISO/IEC 14496-3 Part 3 Subpart 10: Technical description of lossless coding of oversampled audio
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "avcodec.h"
+#include "golomb.h"
+#include "mathops.h"
+#include "dsd.h"
+
+#define DST_MAX_CHANNELS 6
+#define DST_MAX_ELEMENTS (2 * DST_MAX_CHANNELS)
+
+#define DSD_FS44(sample_rate) (sample_rate * 8 / 44100)
+
+#define DST_SAMPLES_PER_FRAME(sample_rate) (588 * DSD_FS44(sample_rate))
+
+static const int8_t fsets_code_pred_coeff[3][3] = {
+    {  -8 },
+    { -16,  8 },
+    {  -9, -5, 6 },
+};
+
+static const int8_t probs_code_pred_coeff[3][3] = {
+    {  -8 },
+    { -16,  8 },
+    { -24, 24, -8 },
+};
+
+typedef struct ArithCoder {
+    unsigned int a;
+    unsigned int c;
+} ArithCoder;
+
+typedef struct Table {
+    unsigned int elements;
+    unsigned int length[DST_MAX_ELEMENTS];
+    int coeff[DST_MAX_ELEMENTS][128];
+} Table;
+
+typedef struct DSTContext {
+    AVClass *class;
+
+    GetBitContext gb;
+    ArithCoder ac;
+    Table fsets, probs;
+    DECLARE_ALIGNED(16, uint8_t, status)[DST_MAX_CHANNELS][16];
+    DECLARE_ALIGNED(16, int16_t, filter)[DST_MAX_ELEMENTS][16][256];
+    DSDContext dsdctx[DST_MAX_CHANNELS];
+} DSTContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DSTContext *s = avctx->priv_data;
+    int i;
+
+    if (avctx->channels > DST_MAX_CHANNELS) {
+        avpriv_request_sample(avctx, "Channel count %d", avctx->channels);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
+
+    for (i = 0; i < avctx->channels; i++)
+        memset(s->dsdctx[i].buf, 0x69, sizeof(s->dsdctx[i].buf));
+
+    ff_init_dsd_data();
+
+    return 0;
+}
+
+static int read_map(GetBitContext *gb, Table *t, unsigned int map[DST_MAX_CHANNELS], int channels)
+{
+    int ch;
+    t->elements = 1;
+    map[0] = 0;
+    if (!get_bits1(gb)) {
+        for (ch = 1; ch < channels; ch++) {
+            int bits = av_log2(t->elements) + 1;
+            map[ch] = get_bits(gb, bits);
+            if (map[ch] == t->elements) {
+                t->elements++;
+                if (t->elements >= DST_MAX_ELEMENTS)
+                    return AVERROR_INVALIDDATA;
+            } else if (map[ch] > t->elements) {
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    } else {
+        memset(map, 0, sizeof(*map) * DST_MAX_CHANNELS);
+    }
+    return 0;
+}
+
+static av_always_inline int get_sr_golomb_dst(GetBitContext *gb, unsigned int k)
+{
+    int v = get_ur_golomb(gb, k, get_bits_left(gb), 0);
+    if (v && get_bits1(gb))
+        v = -v;
+    return v;
+}
+
+static void read_uncoded_coeff(GetBitContext *gb, int *dst, unsigned int elements,
+                               int coeff_bits, int is_signed, int offset)
+{
+    int i;
+
+    for (i = 0; i < elements; i++) {
+        dst[i] = (is_signed ? get_sbits(gb, coeff_bits) : get_bits(gb, coeff_bits)) + offset;
+    }
+}
+
+static int read_table(GetBitContext *gb, Table *t, const int8_t code_pred_coeff[3][3],
+                      int length_bits, int coeff_bits, int is_signed, int offset)
+{
+    unsigned int i, j, k;
+    for (i = 0; i < t->elements; i++) {
+        t->length[i] = get_bits(gb, length_bits) + 1;
+        if (!get_bits1(gb)) {
+            read_uncoded_coeff(gb, t->coeff[i], t->length[i], coeff_bits, is_signed, offset);
+        } else {
+            int method = get_bits(gb, 2), lsb_size;
+            if (method == 3)
+                return AVERROR_INVALIDDATA;
+
+            read_uncoded_coeff(gb, t->coeff[i], method + 1, coeff_bits, is_signed, offset);
+
+            lsb_size  = get_bits(gb, 3);
+            for (j = method + 1; j < t->length[i]; j++) {
+                int c, x = 0;
+                for (k = 0; k < method + 1; k++)
+                    x += code_pred_coeff[method][k] * t->coeff[i][j - k - 1];
+                c = get_sr_golomb_dst(gb, lsb_size);
+                if (x >= 0)
+                    c -= (x + 4) / 8;
+                else
+                    c += (-x + 3) / 8;
+                t->coeff[i][j] = c;
+            }
+        }
+    }
+    return 0;
+}
+
+static void ac_init(ArithCoder *ac, GetBitContext *gb)
+{
+    ac->a = 4095;
+    ac->c = get_bits(gb, 12);
+}
+
+static av_always_inline void ac_get(ArithCoder *ac, GetBitContext *gb, int p, int *e)
+{
+    unsigned int k = (ac->a >> 8) | ((ac->a >> 7) & 1);
+    unsigned int q = k * p;
+    unsigned int a_q = ac->a - q;
+
+    *e = ac->c < a_q;
+    if (*e) {
+        ac->a  = a_q;
+    } else {
+        ac->a  = q;
+        ac->c -= a_q;
+    }
+
+    if (ac->a < 2048) {
+        int n = 11 - av_log2(ac->a);
+        ac->a <<= n;
+        ac->c = (ac->c << n) | get_bits(gb, n);
+    }
+}
+
+static uint8_t prob_dst_x_bit(int c)
+{
+    return (ff_reverse[c & 127] >> 1) + 1;
+}
+
+static void build_filter(int16_t table[DST_MAX_ELEMENTS][16][256], const Table *fsets)
+{
+    int i, j, k, l;
+
+    for (i = 0; i < fsets->elements; i++) {
+        int length = fsets->length[i];
+
+        for (j = 0; j < 16; j++) {
+            int total = av_clip(length - j * 8, 0, 8);
+
+            for (k = 0; k < 256; k++) {
+                int v = 0;
+
+                for (l = 0; l < total; l++)
+                    v += (((k >> l) & 1) * 2 - 1) * fsets->coeff[i][j * 8 + l];
+                table[i][j][k] = v;
+            }
+        }
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    unsigned samples_per_frame = DST_SAMPLES_PER_FRAME(avctx->sample_rate);
+    unsigned map_ch_to_felem[DST_MAX_CHANNELS];
+    unsigned map_ch_to_pelem[DST_MAX_CHANNELS];
+    unsigned i, ch, same_map, dst_x_bit;
+    unsigned half_prob[DST_MAX_CHANNELS];
+    const int channels = avctx->channels;
+    DSTContext *s = avctx->priv_data;
+    GetBitContext *gb = &s->gb;
+    ArithCoder *ac = &s->ac;
+    AVFrame *frame = data;
+    uint8_t *dsd;
+    float *pcm;
+    int ret;
+
+    if (avpkt->size <= 1)
+        return AVERROR_INVALIDDATA;
+
+    frame->nb_samples = samples_per_frame / 8;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    dsd = frame->data[0];
+    pcm = (float *)frame->data[0];
+
+    if ((ret = init_get_bits8(gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+
+    if (!get_bits1(gb)) {
+        skip_bits1(gb);
+        if (get_bits(gb, 6))
+            return AVERROR_INVALIDDATA;
+        memcpy(frame->data[0], avpkt->data + 1, FFMIN(avpkt->size - 1, frame->nb_samples * avctx->channels));
+        goto dsd;
+    }
+
+    /* Segmentation (10.4, 10.5, 10.6) */
+
+    if (!get_bits1(gb)) {
+        avpriv_request_sample(avctx, "Not Same Segmentation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (!get_bits1(gb)) {
+        avpriv_request_sample(avctx, "Not Same Segmentation For All Channels");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (!get_bits1(gb)) {
+        avpriv_request_sample(avctx, "Not End Of Channel Segmentation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    /* Mapping (10.7, 10.8, 10.9) */
+
+    same_map = get_bits1(gb);
+
+    if ((ret = read_map(gb, &s->fsets, map_ch_to_felem, avctx->channels)) < 0)
+        return ret;
+
+    if (same_map) {
+        s->probs.elements = s->fsets.elements;
+        memcpy(map_ch_to_pelem, map_ch_to_felem, sizeof(map_ch_to_felem));
+    } else {
+        avpriv_request_sample(avctx, "Not Same Mapping");
+        if ((ret = read_map(gb, &s->probs, map_ch_to_pelem, avctx->channels)) < 0)
+            return ret;
+    }
+
+    /* Half Probability (10.10) */
+
+    for (ch = 0; ch < avctx->channels; ch++)
+        half_prob[ch] = get_bits1(gb);
+
+    /* Filter Coef Sets (10.12) */
+
+    read_table(gb, &s->fsets, fsets_code_pred_coeff, 7, 9, 1, 0);
+
+    /* Probability Tables (10.13) */
+
+    read_table(gb, &s->probs, probs_code_pred_coeff, 6, 7, 0, 1);
+
+    /* Arithmetic Coded Data (10.11) */
+
+    if (get_bits1(gb))
+        return AVERROR_INVALIDDATA;
+    ac_init(ac, gb);
+
+    build_filter(s->filter, &s->fsets);
+
+    memset(s->status, 0xAA, sizeof(s->status));
+    memset(dsd, 0, frame->nb_samples * 4 * avctx->channels);
+
+    ac_get(ac, gb, prob_dst_x_bit(s->fsets.coeff[0][0]), &dst_x_bit);
+
+    for (i = 0; i < samples_per_frame; i++) {
+        for (ch = 0; ch < channels; ch++) {
+            const unsigned felem = map_ch_to_felem[ch];
+            int16_t (*filter)[256] = s->filter[felem];
+            uint8_t *status = s->status[ch];
+            int prob, residual, v;
+
+#define F(x) filter[(x)][status[(x)]]
+            const int16_t predict = F( 0) + F( 1) + F( 2) + F( 3) +
+                                    F( 4) + F( 5) + F( 6) + F( 7) +
+                                    F( 8) + F( 9) + F(10) + F(11) +
+                                    F(12) + F(13) + F(14) + F(15);
+#undef F
+
+            if (!half_prob[ch] || i >= s->fsets.length[felem]) {
+                unsigned pelem = map_ch_to_pelem[ch];
+                unsigned index = FFABS(predict) >> 3;
+                prob = s->probs.coeff[pelem][FFMIN(index, s->probs.length[pelem] - 1)];
+            } else {
+                prob = 128;
+            }
+
+            ac_get(ac, gb, prob, &residual);
+            v = ((predict >> 15) ^ residual) & 1;
+            dsd[((i >> 3) * channels + ch) << 2] |= v << (7 - (i & 0x7 ));
+
+            AV_WL64A(status + 8, (AV_RL64A(status + 8) << 1) | ((AV_RL64A(status) >> 63) & 1));
+            AV_WL64A(status, (AV_RL64A(status) << 1) | v);
+        }
+    }
+
+dsd:
+    for (i = 0; i < avctx->channels; i++) {
+        ff_dsd2pcm_translate(&s->dsdctx[i], frame->nb_samples, 0,
+                             frame->data[0] + i * 4,
+                             avctx->channels * 4, pcm + i, avctx->channels);
+    }
+
+    *got_frame_ptr = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_dst_decoder = {
+    .name           = "dst",
+    .long_name      = NULL_IF_CONFIG_SMALL("DST (Digital Stream Transfer)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DST,
+    .priv_data_size = sizeof(DSTContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLT,
+                                                      AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/dump_extradata_bsf.c b/libavcodec/dump_extradata_bsf.c
index c960d6a..188a1c6 100644
--- a/libavcodec/dump_extradata_bsf.c
+++ b/libavcodec/dump_extradata_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,16 +34,17 @@ enum DumpFreq {
 
 typedef struct DumpExtradataContext {
     const AVClass *class;
+    AVPacket pkt;
     int freq;
 } DumpExtradataContext;
 
 static int dump_extradata(AVBSFContext *ctx, AVPacket *out)
 {
     DumpExtradataContext *s = ctx->priv_data;
-    AVPacket *in;
+    AVPacket *in = &s->pkt;
     int ret = 0;
 
-    ret = ff_bsf_get_packet(ctx, &in);
+    ret = ff_bsf_get_packet_ref(ctx, in);
     if (ret < 0)
         return ret;
 
@@ -72,17 +73,20 @@ static int dump_extradata(AVBSFContext *ctx, AVPacket *out)
     }
 
 fail:
-    av_packet_free(&in);
+    av_packet_unref(in);
 
     return ret;
 }
 
 #define OFFSET(x) offsetof(DumpExtradataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption options[] = {
     { "freq", "When do dump extradata", OFFSET(freq), AV_OPT_TYPE_INT,
-        { .i64 = DUMP_FREQ_KEYFRAME }, DUMP_FREQ_KEYFRAME, DUMP_FREQ_ALL, 0, "freq" },
-        { "keyframe", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_KEYFRAME }, .unit = "freq" },
-        { "all",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_ALL      }, .unit = "freq" },
+        { .i64 = DUMP_FREQ_KEYFRAME }, DUMP_FREQ_KEYFRAME, DUMP_FREQ_ALL, FLAGS, "freq" },
+        { "k",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_KEYFRAME }, .flags = FLAGS, .unit = "freq" },
+        { "keyframe", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_KEYFRAME }, .flags = FLAGS, .unit = "freq" },
+        { "e",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_ALL      }, .flags = FLAGS, .unit = "freq" },
+        { "all",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_ALL      }, .flags = FLAGS, .unit = "freq" },
     { NULL },
 };
 
@@ -90,7 +94,7 @@ static const AVClass dump_extradata_class = {
     .class_name = "dump_extradata bsf",
     .item_name  = av_default_item_name,
     .option     = options,
-    .version    = LIBAVUTIL_VERSION_MAJOR,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 const AVBitStreamFilter ff_dump_extradata_bsf = {
diff --git a/libavcodec/dv.c b/libavcodec/dv.c
index 52a015e..9c75cfd 100644
--- a/libavcodec/dv.c
+++ b/libavcodec/dv.c
@@ -16,20 +16,20 @@
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,7 @@
 #include "simple_idct.h"
 
 /* XXX: also include quantization */
-RL_VLC_ELEM ff_dv_rl_vlc[1184];
+RL_VLC_ELEM ff_dv_rl_vlc[1664];
 
 static inline void dv_calc_mb_coordinates(const AVDVProfile *d, int chan,
                                           int seq, int slot, uint16_t *tbl)
@@ -172,20 +172,9 @@ static inline void dv_calc_mb_coordinates(const AVDVProfile *d, int chan,
     }
 }
 
-/* quantization quanta by QNO for DV100 */
-static const uint8_t dv100_qstep[16] = {
-    1, /* QNO = 0 and 1 both have no quantization */
-    1,
-    2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
-};
-
-static const uint8_t dv_quant_areas[4] = { 6, 21, 43, 64 };
-
 int ff_dv_init_dynamic_tables(DVVideoContext *ctx, const AVDVProfile *d)
 {
     int j, i, c, s, p;
-    uint32_t *factor1, *factor2;
-    const int *iweight1, *iweight2;
 
     p = i = 0;
     for (c = 0; c < d->n_difchan; c++) {
@@ -203,38 +192,6 @@ int ff_dv_init_dynamic_tables(DVVideoContext *ctx, const AVDVProfile *d)
         }
     }
 
-    factor1 = &ctx->idct_factor[0];
-    factor2 = &ctx->idct_factor[DV_PROFILE_IS_HD(d) ? 4096 : 2816];
-    if (d->height == 720) {
-        iweight1 = &ff_dv_iweight_720_y[0];
-        iweight2 = &ff_dv_iweight_720_c[0];
-    } else {
-        iweight1 = &ff_dv_iweight_1080_y[0];
-        iweight2 = &ff_dv_iweight_1080_c[0];
-    }
-    if (DV_PROFILE_IS_HD(d)) {
-        for (c = 0; c < 4; c++) {
-            for (s = 0; s < 16; s++) {
-                for (i = 0; i < 64; i++) {
-                    *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
-                    *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
-                }
-            }
-        }
-    } else {
-        iweight1 = &ff_dv_iweight_88[0];
-        for (j = 0; j < 2; j++, iweight1 = &ff_dv_iweight_248[0]) {
-            for (s = 0; s < 22; s++) {
-                for (i = c = 0; c < 4; c++) {
-                    for (; i < dv_quant_areas[c]; i++) {
-                        *factor1   = iweight1[i] << (ff_dv_quant_shifts[s][c] + 1);
-                        *factor2++ = (*factor1++) << 1;
-                    }
-                }
-            }
-        }
-    }
-
     return 0;
 }
 
@@ -276,7 +233,7 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
          * to accelerate the parsing of partial codes */
         init_vlc(&dv_vlc, TEX_VLC_BITS, j, new_dv_vlc_len,
                  1, 1, new_dv_vlc_bits, 2, 2, 0);
-        assert(dv_vlc.table_size == 1184);
+        av_assert1(dv_vlc.table_size == 1664);
 
         for (i = 0; i < dv_vlc.table_size; i++) {
             int code = dv_vlc.table[i][0];
@@ -302,3 +259,4 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
 
     return 0;
 }
+
diff --git a/libavcodec/dv.h b/libavcodec/dv.h
index 9a9ebf1..0e97bb2 100644
--- a/libavcodec/dv.h
+++ b/libavcodec/dv.h
@@ -2,20 +2,20 @@
  * Constants for DV codec
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,7 @@ typedef struct DVwork_chunk {
 } DVwork_chunk;
 
 typedef struct DVVideoContext {
+    AVClass *avclass;
     const AVDVProfile *sys;
     const AVFrame   *frame;
     AVCodecContext  *avctx;
@@ -51,6 +52,8 @@ typedef struct DVVideoContext {
     me_cmp_func ildct_cmp;
     DVwork_chunk work_chunks[4 * 12 * 27];
     uint32_t idct_factor[2 * 4 * 16 * 64];
+
+    int quant_deadzone;
 } DVVideoContext;
 
 enum dv_section_type {
@@ -80,10 +83,6 @@ enum dv_pack_type {
 #define DV_PROFILE_IS_1080i50(p) (((p)->video_stype == 0x14) && ((p)->dsf == 1))
 #define DV_PROFILE_IS_720p50(p)  (((p)->video_stype == 0x18) && ((p)->dsf == 1))
 
-/* minimum number of bytes to read from a DV stream in order to
- * determine the profile */
-#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
-
 /**
  * largest possible DV frame, in bytes (1080i50)
  */
@@ -94,11 +93,12 @@ enum dv_pack_type {
  */
 #define DV_MAX_BPM 8
 
-#define TEX_VLC_BITS 9
+#define TEX_VLC_BITS 10
 
-extern RL_VLC_ELEM ff_dv_rl_vlc[1184];
+extern RL_VLC_ELEM ff_dv_rl_vlc[1664];
 
 int ff_dv_init_dynamic_tables(DVVideoContext *s, const AVDVProfile *d);
+
 int ff_dvvideo_init(AVCodecContext *avctx);
 
 static inline int dv_work_pool_size(const AVDVProfile *d)
diff --git a/libavcodec/dv_profile.c b/libavcodec/dv_profile.c
index 74c529d..66505c8 100644
--- a/libavcodec/dv_profile.c
+++ b/libavcodec/dv_profile.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include <stdint.h>
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
 #include "libavutil/pixdesc.h"
 
@@ -256,22 +257,30 @@ void ff_dv_print_profiles(void *logctx, int loglevel)
 
 #endif /* CONFIG_DVPROFILE */
 
-const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+const AVDVProfile* ff_dv_frame_profile(AVCodecContext* codec, const AVDVProfile *sys,
                                        const uint8_t *frame, unsigned buf_size)
 {
 #if CONFIG_DVPROFILE
     int i, dsf, stype;
 
-    if (buf_size < 80 * 5 + 48 + 4)
+    if(buf_size < DV_PROFILE_BYTES)
         return NULL;
 
     dsf   = (frame[3] & 0x80) >> 7;
     stype = frame[80 * 5 + 48 + 3] & 0x1f;
 
     /* 576i50 25Mbps 4:1:1 is a special case */
-    if (dsf == 1 && stype == 0 && frame[4] & 0x07 /* the APT field */)
+    if ((dsf == 1 && stype == 0 && frame[4] & 0x07 /* the APT field */) ||
+        (stype == 31 && codec && codec->codec_tag==AV_RL32("SL25") && codec->coded_width==720 && codec->coded_height==576))
         return &dv_profiles[2];
 
+    if(   stype == 0
+       && codec
+       && (codec->codec_tag==AV_RL32("dvsd") || codec->codec_tag==AV_RL32("CDVC"))
+       && codec->coded_width ==720
+       && codec->coded_height==576)
+        return &dv_profiles[1];
+
     for (i = 0; i < FF_ARRAY_ELEMS(dv_profiles); i++)
         if (dsf == dv_profiles[i].dsf && stype == dv_profiles[i].video_stype)
             return &dv_profiles[i];
@@ -279,23 +288,54 @@ const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
     /* check if old sys matches and assumes corrupted input */
     if (sys && buf_size == sys->frame_size)
         return sys;
+
+    /* hack for trac issue #217, dv files created with QuickTime 3 */
+    if ((frame[3] & 0x7f) == 0x3f && frame[80 * 5 + 48 + 3] == 0xff)
+        return &dv_profiles[dsf];
 #endif
 
     return NULL;
 }
 
+const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size)
+{
+    return ff_dv_frame_profile(NULL, sys, frame, buf_size);
+}
+
 const AVDVProfile *av_dv_codec_profile(int width, int height,
                                        enum AVPixelFormat pix_fmt)
 {
 #if CONFIG_DVPROFILE
+    return av_dv_codec_profile2(width, height, pix_fmt, (AVRational){0, 0});
+#endif
+
+    return NULL;
+}
+
+const AVDVProfile *av_dv_codec_profile2(int width, int height,
+                                       enum AVPixelFormat pix_fmt,
+                                       AVRational frame_rate)
+{
+    const AVDVProfile *p = NULL;
+#if CONFIG_DVPROFILE
     int i;
+    /* frame rate is necessary to select between 720p50 and 720p60 profiles */
+    int invalid_framerate = frame_rate.num == 0 || frame_rate.den == 0;
 
     for (i = 0; i < FF_ARRAY_ELEMS(dv_profiles); i++)
         if (height  == dv_profiles[i].height  &&
             pix_fmt == dv_profiles[i].pix_fmt &&
             width   == dv_profiles[i].width)
-            return &dv_profiles[i];
+        {
+            if( invalid_framerate || av_div_q(dv_profiles[i].time_base, frame_rate).num == 1 )
+                return &dv_profiles[i];
+
+            if(!p)
+                p = &dv_profiles[i];
+        }
 #endif
 
-    return NULL;
+    return p;
 }
+
diff --git a/libavcodec/dv_profile.h b/libavcodec/dv_profile.h
index 5ad7b4f..9380a66 100644
--- a/libavcodec/dv_profile.h
+++ b/libavcodec/dv_profile.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,11 @@
 #include "libavutil/rational.h"
 #include "avcodec.h"
 
+/* minimum number of bytes to read from a DV stream in order to
+ * determine the profile */
+#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
+
+
 /*
  * AVDVProfile is used to express the differences between various
  * DV flavors. For now it's primarily used for differentiating
@@ -69,4 +74,10 @@ const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
  */
 const AVDVProfile *av_dv_codec_profile(int width, int height, enum AVPixelFormat pix_fmt);
 
+/**
+ * Get a DV profile for the provided stream parameters.
+ * The frame rate is used as a best-effort parameter.
+ */
+const AVDVProfile *av_dv_codec_profile2(int width, int height, enum AVPixelFormat pix_fmt, AVRational frame_rate);
+
 #endif /* AVCODEC_DV_PROFILE_H */
diff --git a/libavcodec/dv_profile_internal.h b/libavcodec/dv_profile_internal.h
index f93e7ca..67d3a2b 100644
--- a/libavcodec/dv_profile_internal.h
+++ b/libavcodec/dv_profile_internal.h
@@ -1,27 +1,35 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DV_PROFILE_INTERNAL_H
 #define AVCODEC_DV_PROFILE_INTERNAL_H
 
+#include "dv_profile.h"
+
 /**
  *  Print all allowed DV profiles into logctx at specified logging level.
  */
 void ff_dv_print_profiles(void *logctx, int loglevel);
 
+/**
+ * Get a DV profile for the provided compressed frame.
+ */
+const AVDVProfile* ff_dv_frame_profile(AVCodecContext* codec, const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size);
+
 #endif /* AVCODEC_DV_PROFILE_INTERNAL_H */
diff --git a/libavcodec/dv_tablegen.c b/libavcodec/dv_tablegen.c
index 9b2b954..d032101 100644
--- a/libavcodec/dv_tablegen.c
+++ b/libavcodec/dv_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include <inttypes.h>
 
 WRITE_1D_FUNC_ARGV(dv_vlc_pair, 7,
-                   "{0x%"PRIx32", %"PRId8"}", data[i].vlc, data[i].size)
+                   "{0x%"PRIx32", %"PRIu32"}", data[i].vlc, data[i].size)
 WRITE_2D_FUNC(dv_vlc_pair)
 
 int main(void)
diff --git a/libavcodec/dv_tablegen.h b/libavcodec/dv_tablegen.h
index b69721b..941b557 100644
--- a/libavcodec/dv_tablegen.h
+++ b/libavcodec/dv_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #define AVCODEC_DV_TABLEGEN_H
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
 
 #include "dvdata.h"
 
@@ -47,7 +48,7 @@ typedef struct dv_vlc_pair {
 #else
 static struct dv_vlc_pair dv_vlc_map[DV_VLC_MAP_RUN_SIZE][DV_VLC_MAP_LEV_SIZE];
 
-static void dv_vlc_map_tableinit(void)
+static av_cold void dv_vlc_map_tableinit(void)
 {
     int i, j;
     for (i = 0; i < NB_DV_VLC - 1; i++) {
diff --git a/libavcodec/dvaudio.h b/libavcodec/dvaudio.h
new file mode 100644
index 0000000..e7f70c5
--- /dev/null
+++ b/libavcodec/dvaudio.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DVAUDIO_H
+#define AVCODEC_DVAUDIO_H
+
+#include <stdint.h>
+
+static inline int dv_get_audio_sample_count(const uint8_t *buffer, int dsf)
+{
+    int samples = buffer[0] & 0x3f; /* samples in this frame - min samples */
+
+    switch ((buffer[3] >> 3) & 0x07) {
+    case 0:
+        return samples + (dsf ? 1896 : 1580);
+    case 1:
+        return samples + (dsf ? 1742 : 1452);
+    case 2:
+    default:
+        return samples + (dsf ? 1264 : 1053);
+    }
+}
+
+#endif /* AVCODEC_DVAUDIO_H */
diff --git a/libavcodec/dvaudio_parser.c b/libavcodec/dvaudio_parser.c
new file mode 100644
index 0000000..160faaf
--- /dev/null
+++ b/libavcodec/dvaudio_parser.c
@@ -0,0 +1,46 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Ulead DV audio parser
+ *
+ * Determines the duration for each packet.
+ */
+
+#include "parser.h"
+#include "dvaudio.h"
+
+static int dvaudio_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                        const uint8_t **poutbuf, int *poutbuf_size,
+                        const uint8_t *buf, int buf_size)
+{
+    if (buf_size >= 248)
+        s1->duration = dv_get_audio_sample_count(buf + 244, avctx->block_align == 8640);
+
+    /* always return the full packet. this parser isn't doing any splitting or
+       combining, only packet analysis */
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return buf_size;
+}
+
+AVCodecParser ff_dvaudio_parser = {
+    .codec_ids      = { AV_CODEC_ID_DVAUDIO },
+    .parser_parse   = dvaudio_parse,
+};
diff --git a/libavcodec/dvaudiodec.c b/libavcodec/dvaudiodec.c
new file mode 100644
index 0000000..5aa2a95
--- /dev/null
+++ b/libavcodec/dvaudiodec.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012 Laurent Aimar
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "dvaudio.h"
+
+typedef struct DVAudioContext {
+    int block_size;
+    int is_12bit;
+    int is_pal;
+    int16_t shuffle[2000];
+} DVAudioContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DVAudioContext *s = avctx->priv_data;
+    int i;
+
+    if (avctx->channels != 2) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->codec_tag == 0x0215) {
+        s->block_size = 7200;
+    } else if (avctx->codec_tag == 0x0216) {
+        s->block_size = 8640;
+    } else if (avctx->block_align == 7200 ||
+               avctx->block_align == 8640) {
+        s->block_size = avctx->block_align;
+    } else {
+        return AVERROR(EINVAL);
+    }
+
+    s->is_pal = s->block_size == 8640;
+    s->is_12bit = avctx->bits_per_coded_sample == 12;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->shuffle); i++) {
+        const unsigned a = s->is_pal ? 18 : 15;
+        const unsigned b = 3 * a;
+
+        s->shuffle[i] = 80 * ((21 * (i % 3) + 9 * (i / 3) + ((i / a) % 3)) % b) +
+                         (2 + s->is_12bit) * (i / b) + 8;
+    }
+
+    return 0;
+}
+
+static inline uint16_t dv_audio_12to16(uint16_t sample)
+{
+    uint16_t shift, result;
+
+    sample = (sample < 0x800) ? sample : sample | 0xf000;
+    shift  = (sample & 0xf00) >> 8;
+
+    if (shift < 0x2 || shift > 0xd) {
+        result = sample;
+    } else if (shift < 0x8) {
+        shift--;
+        result = (sample - (256 * shift)) << shift;
+    } else {
+        shift  = 0xe - shift;
+        result = ((sample + ((256 * shift) + 1)) << shift) - 1;
+    }
+
+    return result;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *pkt)
+{
+    DVAudioContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    const uint8_t *src = pkt->data;
+    int16_t *dst;
+    int ret, i;
+
+    if (pkt->size < s->block_size)
+        return AVERROR_INVALIDDATA;
+
+    frame->nb_samples = dv_get_audio_sample_count(pkt->data + 244, s->is_pal);
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    dst = (int16_t *)frame->data[0];
+
+    for (i = 0; i < frame->nb_samples; i++) {
+       const uint8_t *v = &src[s->shuffle[i]];
+
+       if (s->is_12bit) {
+           *dst++ = dv_audio_12to16((v[0] << 4) | ((v[2] >> 4) & 0x0f));
+           *dst++ = dv_audio_12to16((v[1] << 4) | ((v[2] >> 0) & 0x0f));
+       } else {
+           *dst++ = AV_RB16(&v[0]);
+           *dst++ = AV_RB16(&v[s->is_pal ? 4320 : 3600]);
+       }
+    }
+
+    *got_frame_ptr = 1;
+
+    return s->block_size;
+}
+
+AVCodec ff_dvaudio_decoder = {
+    .name           = "dvaudio",
+    .long_name      = NULL_IF_CONFIG_SMALL("Ulead DV Audio"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DVAUDIO,
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(DVAudioContext),
+};
diff --git a/libavcodec/dvbsub.c b/libavcodec/dvbsub.c
index 548bae1..8cce702 100644
--- a/libavcodec/dvbsub.c
+++ b/libavcodec/dvbsub.c
@@ -2,20 +2,20 @@
  * DVB subtitle encoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
@@ -23,7 +23,6 @@
 #include "libavutil/colorspace.h"
 
 typedef struct DVBSubtitleContext {
-    int hide_state;
     int object_version;
 } DVBSubtitleContext;
 
@@ -194,6 +193,60 @@ static void dvb_encode_rle4(uint8_t **pq,
     *pq = q;
 }
 
+static void dvb_encode_rle8(uint8_t **pq,
+                            const uint8_t *bitmap, int linesize,
+                            int w, int h)
+{
+    uint8_t *q;
+    int x, y, len, x1, color;
+
+    q = *pq;
+
+    for (y = 0; y < h; y++) {
+        *q++ = 0x12;
+
+        x = 0;
+        while (x < w) {
+            x1 = x;
+            color = bitmap[x1++];
+            while (x1 < w && bitmap[x1] == color)
+                x1++;
+            len = x1 - x;
+            if (len == 1 && color) {
+                // 00000001 to 11111111           1 pixel in colour x
+                *q++ = color;
+            } else {
+                if (color == 0x00) {
+                    // 00000000 0LLLLLLL          L pixels (1-127) in colour 0 (L > 0)
+                    len = FFMIN(len, 127);
+                    *q++ = 0x00;
+                    *q++ = len;
+                } else if (len > 2) {
+                    // 00000000 1LLLLLLL CCCCCCCC L pixels (3-127) in colour C (L > 2)
+                    len = FFMIN(len, 127);
+                    *q++ = 0x00;
+                    *q++ = 0x80+len;
+                    *q++ = color;
+                }
+                else if (len == 2) {
+                    *q++ = color;
+                    *q++ = color;
+                } else {
+                    *q++ = color;
+                    len = 1;
+                }
+            }
+            x += len;
+        }
+        /* end of line */
+        // 00000000 end of 8-bit/pixel_code_string
+        *q++ = 0x00;
+        *q++ = 0xf0;
+        bitmap += linesize;
+    }
+    *pq = q;
+}
+
 static int encode_dvb_subtitles(DVBSubtitleContext *s,
                                 uint8_t *outbuf, const AVSubtitle *h)
 {
@@ -205,11 +258,9 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     page_id = 1;
 
-    if (h->num_rects == 0 || !h->rects)
+    if (h->num_rects && !h->rects)
         return -1;
 
-    *q++ = 0x00; /* subtitle_stream_id */
-
     /* page composition segment */
 
     *q++ = 0x0f; /* sync_byte */
@@ -218,10 +269,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
     pseg_len = q;
     q += 2; /* segment length */
     *q++ = 30; /* page_timeout (seconds) */
-    if (s->hide_state)
-        page_state = 0; /* normal case */
-    else
-        page_state = 2; /* mode change */
+    page_state = 2; /* mode change */
     /* page_version = 0 + page_state */
     *q++ = (s->object_version << 4) | (page_state << 2) | 3;
 
@@ -234,7 +282,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     bytestream_put_be16(&pseg_len, q - pseg_len - 2);
 
-    if (!s->hide_state) {
+    if (h->num_rects) {
         for (clut_id = 0; clut_id < h->num_rects; clut_id++) {
 
             /* CLUT segment */
@@ -245,10 +293,15 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
             } else if (h->rects[clut_id]->nb_colors <= 16) {
                 /* 4 bpp, standard encoding */
                 bpp_index = 1;
+            } else if (h->rects[clut_id]->nb_colors <= 256) {
+                /* 8 bpp, standard encoding */
+                bpp_index = 2;
             } else {
                 return -1;
             }
 
+
+            /* CLUT segment */
             *q++ = 0x0f; /* sync byte */
             *q++ = 0x12; /* CLUT definition segment */
             bytestream_put_be16(&q, page_id);
@@ -289,6 +342,9 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
         } else if (h->rects[region_id]->nb_colors <= 16) {
             /* 4 bpp, standard encoding */
             bpp_index = 1;
+        } else if (h->rects[region_id]->nb_colors <= 256) {
+            /* 8 bpp, standard encoding */
+            bpp_index = 2;
         } else {
             return -1;
         }
@@ -307,32 +363,37 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
         *q++ = 0; /* 8 bit fill colors */
         *q++ = 0x03; /* 4 bit and 2 bit fill colors */
 
-        if (!s->hide_state) {
-            bytestream_put_be16(&q, region_id); /* object_id == region_id */
-            *q++ = (0 << 6) | (0 << 4);
-            *q++ = 0;
-            *q++ = 0xf0;
-            *q++ = 0;
-        }
+        bytestream_put_be16(&q, region_id); /* object_id == region_id */
+        *q++ = (0 << 6) | (0 << 4);
+        *q++ = 0;
+        *q++ = 0xf0;
+        *q++ = 0;
 
         bytestream_put_be16(&pseg_len, q - pseg_len - 2);
     }
 
-    if (!s->hide_state) {
+    if (h->num_rects) {
 
         for (object_id = 0; object_id < h->num_rects; object_id++) {
-            /* Object Data segment */
+            void (*dvb_encode_rle)(uint8_t **pq,
+                                    const uint8_t *bitmap, int linesize,
+                                    int w, int h);
 
+            /* bpp_index maths */
             if (h->rects[object_id]->nb_colors <= 4) {
                 /* 2 bpp, some decoders do not support it correctly */
-                bpp_index = 0;
+                dvb_encode_rle = dvb_encode_rle2;
             } else if (h->rects[object_id]->nb_colors <= 16) {
                 /* 4 bpp, standard encoding */
-                bpp_index = 1;
+                dvb_encode_rle = dvb_encode_rle4;
+            } else if (h->rects[object_id]->nb_colors <= 256) {
+                /* 8 bpp, standard encoding */
+                dvb_encode_rle = dvb_encode_rle8;
             } else {
                 return -1;
             }
 
+            /* Object Data segment */
             *q++ = 0x0f; /* sync byte */
             *q++ = 0x13;
             bytestream_put_be16(&q, page_id);
@@ -345,19 +406,12 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
                                                                        non_modifying_color_flag */
             {
                 uint8_t *ptop_field_len, *pbottom_field_len, *top_ptr, *bottom_ptr;
-                void (*dvb_encode_rle)(uint8_t **pq,
-                                        const uint8_t *bitmap, int linesize,
-                                        int w, int h);
+
                 ptop_field_len = q;
                 q += 2;
                 pbottom_field_len = q;
                 q += 2;
 
-                if (bpp_index == 0)
-                    dvb_encode_rle = dvb_encode_rle2;
-                else
-                    dvb_encode_rle = dvb_encode_rle4;
-
                 top_ptr = q;
                 dvb_encode_rle(&q, h->rects[object_id]->data[0], h->rects[object_id]->w * 2,
                                     h->rects[object_id]->w, h->rects[object_id]->h >> 1);
@@ -384,10 +438,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     bytestream_put_be16(&pseg_len, q - pseg_len - 2);
 
-    *q++ = 0xff; /* end of PES data */
-
     s->object_version = (s->object_version + 1) & 0xf;
-    s->hide_state = !s->hide_state;
     return q - outbuf;
 }
 
diff --git a/libavcodec/dvbsub_parser.c b/libavcodec/dvbsub_parser.c
index 5afdc4b..8ced3c4 100644
--- a/libavcodec/dvbsub_parser.c
+++ b/libavcodec/dvbsub_parser.c
@@ -1,21 +1,21 @@
 /*
- * DVB subtitle parser for Libav
+ * DVB subtitle parser for FFmpeg
  * Copyright (c) 2005 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -57,6 +57,7 @@ static int dvbsub_parse(AVCodecParserContext *s,
     DVBSubParseContext *pc = s->priv_data;
     uint8_t *p, *p_end;
     int i, len, buf_pos = 0;
+    int out_size = 0;
 
     ff_dlog(avctx, "DVB parse packet pts=%"PRIx64", lpts=%"PRIx64", cpts=%"PRIx64":\n",
             s->pts, s->last_pts, s->cur_frame_pts[s->cur_frame_start_index]);
@@ -71,8 +72,8 @@ static int dvbsub_parse(AVCodecParserContext *s,
     if (i % 16 != 0)
         ff_dlog(avctx, "\n");
 
-    *poutbuf = NULL;
-    *poutbuf_size = 0;
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
 
     s->fetch_timestamp = 1;
 
@@ -89,7 +90,7 @@ static int dvbsub_parse(AVCodecParserContext *s,
 
         if (buf_size < 2 || buf[0] != 0x20 || buf[1] != 0x00) {
             ff_dlog(avctx, "Bad packet header\n");
-            return -1;
+            return buf_size;
         }
 
         buf_pos = 2;
@@ -113,9 +114,9 @@ static int dvbsub_parse(AVCodecParserContext *s,
     }
 
     if (buf_size - buf_pos + pc->packet_index > PARSE_BUF_SIZE)
-        return -1;
+        return buf_size;
 
-/* if not currently in a packet, discard data */
+/* if not currently in a packet, pass data */
     if (pc->in_packet == 0)
         return buf_size;
 
@@ -129,13 +130,13 @@ static int dvbsub_parse(AVCodecParserContext *s,
     {
         if (*p == 0x0f)
         {
-            if (p + 6 <= p_end)
+            if (6 <= p_end - p)
             {
                 len = AV_RB16(p + 4);
 
-                if (p + len + 6 <= p_end)
+                if (len + 6 <= p_end - p)
                 {
-                    *poutbuf_size += len + 6;
+                    out_size += len + 6;
 
                     p += len + 6;
                 } else
@@ -143,7 +144,7 @@ static int dvbsub_parse(AVCodecParserContext *s,
             } else
                 break;
         } else if (*p == 0xff) {
-            if (p + 1 < p_end)
+            if (1 < p_end - p)
             {
                 ff_dlog(avctx, "Junk at end of packet\n");
             }
@@ -159,9 +160,10 @@ static int dvbsub_parse(AVCodecParserContext *s,
         }
     }
 
-    if (*poutbuf_size > 0)
+    if (out_size > 0)
     {
         *poutbuf = pc->packet_buf;
+        *poutbuf_size = out_size;
         pc->packet_start = *poutbuf_size;
     }
 
diff --git a/libavcodec/dvbsubdec.c b/libavcodec/dvbsubdec.c
index b97ff80..bc4a17b 100644
--- a/libavcodec/dvbsubdec.c
+++ b/libavcodec/dvbsubdec.c
@@ -2,28 +2,30 @@
  * DVB subtitle decoding
  * Copyright (c) 2005 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "libavutil/colorspace.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
 #define DVBSUB_PAGE_SEGMENT     0x10
 #define DVBSUB_REGION_SEGMENT   0x11
@@ -38,6 +40,7 @@
 
 typedef struct DVBSubCLUT {
     int id;
+    int version;
 
     uint32_t clut4[4];
     uint32_t clut16[16];
@@ -64,6 +67,7 @@ typedef struct DVBSubObjectDisplay {
 
 typedef struct DVBSubObject {
     int id;
+    int version;
 
     int type;
 
@@ -83,6 +87,7 @@ typedef struct DVBSubRegionDisplay {
 
 typedef struct DVBSubRegion {
     int id;
+    int version;
 
     int width;
     int height;
@@ -91,8 +96,12 @@ typedef struct DVBSubRegion {
     int clut;
     int bgcolor;
 
+    uint8_t computed_clut[4*256];
+    int has_computed_clut;
+
     uint8_t *pbuf;
     int buf_size;
+    int dirty;
 
     DVBSubObjectDisplay *display_list;
 
@@ -109,15 +118,22 @@ typedef struct DVBSubDisplayDefinition {
 } DVBSubDisplayDefinition;
 
 typedef struct DVBSubContext {
+    AVClass *class;
     int composition_id;
     int ancillary_id;
 
+    int version;
     int time_out;
+    int compute_edt; /**< if 1 end display time calculated using pts
+                          if 0 (Default) calculated using time out */
+    int compute_clut;
+    int clut_count2[257][256];
+    int substream;
+    int64_t prev_start;
     DVBSubRegion *region_list;
     DVBSubCLUT   *clut_list;
     DVBSubObject *object_list;
 
-    int display_list_size;
     DVBSubRegionDisplay *display_list;
     DVBSubDisplayDefinition *display_definition;
 } DVBSubContext;
@@ -183,53 +199,59 @@ static void delete_region_display_list(DVBSubContext *ctx, DVBSubRegion *region)
                     obj2 = *obj2_ptr;
 
                     while (obj2 != object) {
-                        assert(obj2);
+                        av_assert0(obj2);
                         obj2_ptr = &obj2->next;
                         obj2 = *obj2_ptr;
                     }
 
                     *obj2_ptr = obj2->next;
 
-                    av_free(obj2);
+                    av_freep(&obj2);
                 }
             }
         }
 
         region->display_list = display->region_list_next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
 }
 
-static void delete_state(DVBSubContext *ctx)
+static void delete_cluts(DVBSubContext *ctx)
 {
-    DVBSubRegion *region;
-    DVBSubCLUT *clut;
-
-    while (ctx->region_list) {
-        region = ctx->region_list;
+    while (ctx->clut_list) {
+        DVBSubCLUT *clut = ctx->clut_list;
 
-        ctx->region_list = region->next;
+        ctx->clut_list = clut->next;
 
-        delete_region_display_list(ctx, region);
-        av_free(region->pbuf);
-        av_free(region);
+        av_freep(&clut);
     }
+}
 
-    while (ctx->clut_list) {
-        clut = ctx->clut_list;
+static void delete_objects(DVBSubContext *ctx)
+{
+    while (ctx->object_list) {
+        DVBSubObject *object = ctx->object_list;
 
-        ctx->clut_list = clut->next;
+        ctx->object_list = object->next;
 
-        av_free(clut);
+        av_freep(&object);
     }
+}
 
-    av_freep(&ctx->display_definition);
+static void delete_regions(DVBSubContext *ctx)
+{
+    while (ctx->region_list) {
+        DVBSubRegion *region = ctx->region_list;
+
+        ctx->region_list = region->next;
+
+        delete_region_display_list(ctx, region);
 
-    /* Should already be null */
-    if (ctx->object_list)
-        av_log(NULL, AV_LOG_ERROR, "Memory deallocation error!\n");
+        av_freep(&region->pbuf);
+        av_freep(&region);
+    }
 }
 
 static av_cold int dvbsub_init_decoder(AVCodecContext *avctx)
@@ -237,15 +259,27 @@ static av_cold int dvbsub_init_decoder(AVCodecContext *avctx)
     int i, r, g, b, a = 0;
     DVBSubContext *ctx = avctx->priv_data;
 
-    if (!avctx->extradata || avctx->extradata_size != 4) {
-        av_log(avctx, AV_LOG_WARNING, "Invalid extradata, subtitle streams may be combined!\n");
+    if (ctx->substream < 0) {
+        ctx->composition_id = -1;
+        ctx->ancillary_id   = -1;
+    } else if (!avctx->extradata || (avctx->extradata_size < 4) || ((avctx->extradata_size % 5 != 0) && (avctx->extradata_size != 4))) {
+        av_log(avctx, AV_LOG_WARNING, "Invalid DVB subtitles stream extradata!\n");
         ctx->composition_id = -1;
         ctx->ancillary_id   = -1;
     } else {
-        ctx->composition_id = AV_RB16(avctx->extradata);
-        ctx->ancillary_id   = AV_RB16(avctx->extradata + 2);
+        if (avctx->extradata_size > 5*ctx->substream + 2) {
+            ctx->composition_id = AV_RB16(avctx->extradata + 5*ctx->substream);
+            ctx->ancillary_id   = AV_RB16(avctx->extradata + 5*ctx->substream + 2);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "Selected DVB subtitles sub-stream %d is not available\n", ctx->substream);
+            ctx->composition_id = AV_RB16(avctx->extradata);
+            ctx->ancillary_id   = AV_RB16(avctx->extradata + 2);
+        }
     }
 
+    ctx->version = -1;
+    ctx->prev_start = AV_NOPTS_VALUE;
+
     default_clut.id = -1;
     default_clut.next = NULL;
 
@@ -314,32 +348,41 @@ static av_cold int dvbsub_close_decoder(AVCodecContext *avctx)
     DVBSubContext *ctx = avctx->priv_data;
     DVBSubRegionDisplay *display;
 
-    delete_state(ctx);
+    delete_regions(ctx);
+
+    delete_objects(ctx);
+
+    delete_cluts(ctx);
+
+    av_freep(&ctx->display_definition);
 
     while (ctx->display_list) {
         display = ctx->display_list;
         ctx->display_list = display->next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
     return 0;
 }
 
-static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_2bit_string(AVCodecContext *avctx,
+                                   uint8_t *destbuf, int dbuf_len,
                                    const uint8_t **srcbuf, int buf_size,
-                                   int non_mod, uint8_t *map_table)
+                                   int non_mod, uint8_t *map_table, int x_pos)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
 
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
+
+    init_get_bits(&gb, *srcbuf, buf_size << 3);
 
-    bitstream_init8(&bc, *srcbuf, buf_size);
+    destbuf += x_pos;
 
-    while (bitstream_tell(&bc) < buf_size << 3 && pixels_read < dbuf_len) {
-        bits = bitstream_read(&bc, 2);
+    while (get_bits_count(&gb) < buf_size << 3 && pixels_read < dbuf_len) {
+        bits = get_bits(&gb, 2);
 
         if (bits) {
             if (non_mod != 1 || bits != 1) {
@@ -350,10 +393,10 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
             }
             pixels_read++;
         } else {
-            bits = bitstream_read_bit(&bc);
+            bits = get_bits1(&gb);
             if (bits == 1) {
-                run_length = bitstream_read(&bc, 3) + 3;
-                bits       = bitstream_read(&bc, 2);
+                run_length = get_bits(&gb, 3) + 3;
+                bits = get_bits(&gb, 2);
 
                 if (non_mod == 1 && bits == 1)
                     pixels_read += run_length;
@@ -366,12 +409,12 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
                     }
                 }
             } else {
-                bits = bitstream_read_bit(&bc);
+                bits = get_bits1(&gb);
                 if (bits == 0) {
-                    bits = bitstream_read(&bc, 2);
+                    bits = get_bits(&gb, 2);
                     if (bits == 2) {
-                        run_length = bitstream_read(&bc, 4) + 12;
-                        bits       = bitstream_read(&bc, 2);
+                        run_length = get_bits(&gb, 4) + 12;
+                        bits = get_bits(&gb, 2);
 
                         if (non_mod == 1 && bits == 1)
                             pixels_read += run_length;
@@ -384,8 +427,8 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 3) {
-                        run_length = bitstream_read(&bc, 8) + 29;
-                        bits = bitstream_read(&bc, 2);
+                        run_length = get_bits(&gb, 8) + 29;
+                        bits = get_bits(&gb, 2);
 
                         if (non_mod == 1 && bits == 1)
                             pixels_read += run_length;
@@ -398,17 +441,17 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 1) {
-                        pixels_read += 2;
                         if (map_table)
                             bits = map_table[0];
                         else
                             bits = 0;
-                        if (pixels_read <= dbuf_len) {
-                            *destbuf++ = bits;
+                        run_length = 2;
+                        while (run_length-- > 0 && pixels_read < dbuf_len) {
                             *destbuf++ = bits;
+                            pixels_read++;
                         }
                     } else {
-                        *srcbuf += (bitstream_tell(&bc) + 7) >> 3;
+                        (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
                         return pixels_read;
                     }
                 } else {
@@ -423,28 +466,30 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
         }
     }
 
-    if (bitstream_read(&bc, 6))
-        av_log(NULL, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+    if (get_bits(&gb, 6))
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
-    *srcbuf += (bitstream_tell(&bc) + 7) >> 3;
+    (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
 
     return pixels_read;
 }
 
-static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_4bit_string(AVCodecContext *avctx, uint8_t *destbuf, int dbuf_len,
                                    const uint8_t **srcbuf, int buf_size,
-                                   int non_mod, uint8_t *map_table)
+                                   int non_mod, uint8_t *map_table, int x_pos)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
 
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
+
+    init_get_bits(&gb, *srcbuf, buf_size << 3);
 
-    bitstream_init8(&bc, *srcbuf, buf_size);
+    destbuf += x_pos;
 
-    while (bitstream_tell(&bc) < buf_size << 3 && pixels_read < dbuf_len) {
-        bits = bitstream_read(&bc, 4);
+    while (get_bits_count(&gb) < buf_size << 3 && pixels_read < dbuf_len) {
+        bits = get_bits(&gb, 4);
 
         if (bits) {
             if (non_mod != 1 || bits != 1) {
@@ -455,12 +500,12 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
             }
             pixels_read++;
         } else {
-            bits = bitstream_read_bit(&bc);
+            bits = get_bits1(&gb);
             if (bits == 0) {
-                run_length = bitstream_read(&bc, 3);
+                run_length = get_bits(&gb, 3);
 
                 if (run_length == 0) {
-                    *srcbuf += (bitstream_tell(&bc) + 7) >> 3;
+                    (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
                     return pixels_read;
                 }
 
@@ -476,10 +521,10 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
                     pixels_read++;
                 }
             } else {
-                bits = bitstream_read_bit(&bc);
+                bits = get_bits1(&gb);
                 if (bits == 0) {
-                    run_length = bitstream_read(&bc, 2) + 4;
-                    bits       = bitstream_read(&bc, 4);
+                    run_length = get_bits(&gb, 2) + 4;
+                    bits = get_bits(&gb, 4);
 
                     if (non_mod == 1 && bits == 1)
                         pixels_read += run_length;
@@ -492,10 +537,10 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
                         }
                     }
                 } else {
-                    bits = bitstream_read(&bc, 2);
+                    bits = get_bits(&gb, 2);
                     if (bits == 2) {
-                        run_length = bitstream_read(&bc, 4) + 9;
-                        bits       = bitstream_read(&bc, 4);
+                        run_length = get_bits(&gb, 4) + 9;
+                        bits = get_bits(&gb, 4);
 
                         if (non_mod == 1 && bits == 1)
                             pixels_read += run_length;
@@ -508,8 +553,8 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 3) {
-                        run_length = bitstream_read(&bc, 8) + 25;
-                        bits = bitstream_read(&bc, 4);
+                        run_length = get_bits(&gb, 8) + 25;
+                        bits = get_bits(&gb, 4);
 
                         if (non_mod == 1 && bits == 1)
                             pixels_read += run_length;
@@ -522,14 +567,14 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 1) {
-                        pixels_read += 2;
                         if (map_table)
                             bits = map_table[0];
                         else
                             bits = 0;
-                        if (pixels_read <= dbuf_len) {
-                            *destbuf++ = bits;
+                        run_length = 2;
+                        while (run_length-- > 0 && pixels_read < dbuf_len) {
                             *destbuf++ = bits;
+                            pixels_read++;
                         }
                     } else {
                         if (map_table)
@@ -544,22 +589,25 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
         }
     }
 
-    if (bitstream_read(&bc, 8))
-        av_log(NULL, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+    if (get_bits(&gb, 8))
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
-    *srcbuf += (bitstream_tell(&bc) + 7) >> 3;
+    (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
 
     return pixels_read;
 }
 
-static int dvbsub_read_8bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_8bit_string(AVCodecContext *avctx,
+                                   uint8_t *destbuf, int dbuf_len,
                                     const uint8_t **srcbuf, int buf_size,
-                                    int non_mod, uint8_t *map_table)
+                                    int non_mod, uint8_t *map_table, int x_pos)
 {
     const uint8_t *sbuf_end = (*srcbuf) + buf_size;
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
+
+    destbuf += x_pos;
 
     while (*srcbuf < sbuf_end && pixels_read < dbuf_len) {
         bits = *(*srcbuf)++;
@@ -579,30 +627,243 @@ static int dvbsub_read_8bit_string(uint8_t *destbuf, int dbuf_len,
                 if (run_length == 0) {
                     return pixels_read;
                 }
+
+                bits = 0;
             } else {
                 bits = *(*srcbuf)++;
-
-                if (non_mod == 1 && bits == 1)
-                    pixels_read += run_length;
             }
-            if (map_table)
-                bits = map_table[0];
-            else
-                bits = 0;
-            while (run_length-- > 0 && pixels_read < dbuf_len) {
-                *destbuf++ = bits;
-                pixels_read++;
+            if (non_mod == 1 && bits == 1)
+                pixels_read += run_length;
+            else {
+                if (map_table)
+                    bits = map_table[bits];
+                while (run_length-- > 0 && pixels_read < dbuf_len) {
+                    *destbuf++ = bits;
+                    pixels_read++;
+                }
             }
         }
     }
 
     if (*(*srcbuf)++)
-        av_log(NULL, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     return pixels_read;
 }
 
+static void compute_default_clut(DVBSubContext *ctx, uint8_t *clut, AVSubtitleRect *rect, int w, int h)
+{
+    uint8_t list[256] = {0};
+    uint8_t list_inv[256];
+    int counttab[256] = {0};
+    int (*counttab2)[256] = ctx->clut_count2;
+    int count, i, x, y;
+    ptrdiff_t stride = rect->linesize[0];
+
+    memset(ctx->clut_count2, 0 , sizeof(ctx->clut_count2));
+
+#define V(x,y) rect->data[0][(x) + (y)*stride]
+    for (y = 0; y<h; y++) {
+        for (x = 0; x<w; x++) {
+            int v = V(x,y) + 1;
+            int vl = x     ? V(x-1,y) + 1 : 0;
+            int vr = x+1<w ? V(x+1,y) + 1 : 0;
+            int vt = y     ? V(x,y-1) + 1 : 0;
+            int vb = y+1<h ? V(x,y+1) + 1 : 0;
+            counttab[v-1] += !!((v!=vl) + (v!=vr) + (v!=vt) + (v!=vb));
+            counttab2[vl][v-1] ++;
+            counttab2[vr][v-1] ++;
+            counttab2[vt][v-1] ++;
+            counttab2[vb][v-1] ++;
+        }
+    }
+#define L(x,y) list[d[(x) + (y)*stride]]
+
+    for (i = 0; i<256; i++) {
+        counttab2[i+1][i] = 0;
+    }
+    for (i = 0; i<256; i++) {
+        int bestscore = 0;
+        int bestv = 0;
+
+        for (x = 0; x < 256; x++) {
+            int scorev = 0;
+            if (list[x])
+                continue;
+            scorev += counttab2[0][x];
+            for (y = 0; y < 256; y++) {
+                scorev += list[y] * counttab2[y+1][x];
+            }
+
+            if (scorev) {
+                int score = 1024LL*scorev / counttab[x];
+                if (score > bestscore) {
+                    bestscore = score;
+                    bestv = x;
+                }
+            }
+        }
+        if (!bestscore)
+            break;
+        list    [ bestv ] = 1;
+        list_inv[     i ] = bestv;
+    }
+
+    count = FFMAX(i - 1, 1);
+    for (i--; i>=0; i--) {
+        int v = i*255/count;
+        AV_WN32(clut + 4*list_inv[i], RGBA(v/2,v,v/2,v));
+    }
+}
+
+
+static int save_subtitle_set(AVCodecContext *avctx, AVSubtitle *sub, int *got_output)
+{
+    DVBSubContext *ctx = avctx->priv_data;
+    DVBSubRegionDisplay *display;
+    DVBSubDisplayDefinition *display_def = ctx->display_definition;
+    DVBSubRegion *region;
+    AVSubtitleRect *rect;
+    DVBSubCLUT *clut;
+    uint32_t *clut_table;
+    int i;
+    int offset_x=0, offset_y=0;
+    int ret = 0;
+
+
+    if (display_def) {
+        offset_x = display_def->x;
+        offset_y = display_def->y;
+    }
+
+    /* Not touching AVSubtitles again*/
+    if(sub->num_rects) {
+        avpriv_request_sample(ctx, "Different Version of Segment asked Twice");
+        return AVERROR_PATCHWELCOME;
+    }
+    for (display = ctx->display_list; display; display = display->next) {
+        region = get_region(ctx, display->region_id);
+        if (region && region->dirty)
+            sub->num_rects++;
+    }
+
+    if(ctx->compute_edt == 0) {
+        sub->end_display_time = ctx->time_out * 1000;
+        *got_output = 1;
+    } else if (ctx->prev_start != AV_NOPTS_VALUE) {
+        sub->end_display_time = av_rescale_q((sub->pts - ctx->prev_start ), AV_TIME_BASE_Q, (AVRational){ 1, 1000 }) - 1;
+        *got_output = 1;
+    }
+    if (sub->num_rects > 0) {
+
+        sub->rects = av_mallocz_array(sizeof(*sub->rects), sub->num_rects);
+        if (!sub->rects) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for (i = 0; i < sub->num_rects; i++) {
+            sub->rects[i] = av_mallocz(sizeof(*sub->rects[i]));
+            if (!sub->rects[i]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
+
+        i = 0;
+
+        for (display = ctx->display_list; display; display = display->next) {
+            region = get_region(ctx, display->region_id);
+
+            if (!region)
+                continue;
+
+            if (!region->dirty)
+                continue;
+
+            rect = sub->rects[i];
+            rect->x = display->x_pos + offset_x;
+            rect->y = display->y_pos + offset_y;
+            rect->w = region->width;
+            rect->h = region->height;
+            rect->nb_colors = (1 << region->depth);
+            rect->type      = SUBTITLE_BITMAP;
+            rect->linesize[0] = region->width;
+
+            clut = get_clut(ctx, region->clut);
+
+            if (!clut)
+                clut = &default_clut;
+
+            switch (region->depth) {
+            case 2:
+                clut_table = clut->clut4;
+                break;
+            case 8:
+                clut_table = clut->clut256;
+                break;
+            case 4:
+            default:
+                clut_table = clut->clut16;
+                break;
+            }
+
+            rect->data[1] = av_mallocz(AVPALETTE_SIZE);
+            if (!rect->data[1]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+            memcpy(rect->data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
+
+            rect->data[0] = av_malloc(region->buf_size);
+            if (!rect->data[0]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
 
+            memcpy(rect->data[0], region->pbuf, region->buf_size);
+
+            if ((clut == &default_clut && ctx->compute_clut == -1) || ctx->compute_clut == 1) {
+                if (!region->has_computed_clut) {
+                    compute_default_clut(ctx, region->computed_clut, rect, rect->w, rect->h);
+                    region->has_computed_clut = 1;
+                }
+
+                memcpy(rect->data[1], region->computed_clut, sizeof(region->computed_clut));
+            }
+
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+{
+            int j;
+            for (j = 0; j < 4; j++) {
+                rect->pict.data[j] = rect->data[j];
+                rect->pict.linesize[j] = rect->linesize[j];
+            }
+}
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+            i++;
+        }
+    }
+
+    return 0;
+fail:
+    if (sub->rects) {
+        for(i=0; i<sub->num_rects; i++) {
+            rect = sub->rects[i];
+            if (rect) {
+                av_freep(&rect->data[0]);
+                av_freep(&rect->data[1]);
+            }
+            av_freep(&sub->rects[i]);
+        }
+        av_freep(&sub->rects);
+    }
+    sub->num_rects = 0;
+    return ret;
+}
 
 static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDisplay *display,
                                           const uint8_t *buf, int buf_size, int top_bottom, int non_mod)
@@ -621,6 +882,7 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
                          0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff};
     uint8_t *map_table;
 
+#if 0
     ff_dlog(avctx, "DVB pixel block size %d, %s field:\n", buf_size,
             top_bottom ? "bottom" : "top");
 
@@ -635,21 +897,22 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
 
     if (i % 16)
         ff_dlog(avctx, "\n");
+#endif
 
-    if (region == 0)
+    if (!region)
         return;
 
     pbuf = region->pbuf;
+    region->dirty = 1;
 
     x_pos = display->x_pos;
     y_pos = display->y_pos;
 
-    if ((y_pos & 1) != top_bottom)
-        y_pos++;
+    y_pos += top_bottom;
 
     while (buf < buf_end) {
-        if (x_pos > region->width || y_pos > region->height) {
-            av_log(avctx, AV_LOG_ERROR, "Invalid object location!\n");
+        if ((*buf!=0xf0 && x_pos >= region->width) || y_pos >= region->height) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid object location! %d-%d %d-%d %02x\n", x_pos, region->width, y_pos, region->height, *buf);
             return;
         }
 
@@ -662,9 +925,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
             else
                 map_table = NULL;
 
-            x_pos += dvbsub_read_2bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, map_table);
+            x_pos = dvbsub_read_2bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, map_table, x_pos);
             break;
         case 0x11:
             if (region->depth < 4) {
@@ -677,9 +940,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
             else
                 map_table = NULL;
 
-            x_pos += dvbsub_read_4bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, map_table);
+            x_pos = dvbsub_read_4bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, map_table, x_pos);
             break;
         case 0x12:
             if (region->depth < 8) {
@@ -687,9 +950,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
                 return;
             }
 
-            x_pos += dvbsub_read_8bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, NULL);
+            x_pos = dvbsub_read_8bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, NULL, x_pos);
             break;
 
         case 0x20:
@@ -716,6 +979,7 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
         }
     }
 
+    region->has_computed_clut = 0;
 }
 
 static int dvbsub_parse_object_segment(AVCodecContext *avctx,
@@ -724,7 +988,6 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
     DVBSubContext *ctx = avctx->priv_data;
 
     const uint8_t *buf_end = buf + buf_size;
-    const uint8_t *block;
     int object_id;
     DVBSubObject *object;
     DVBSubObjectDisplay *display;
@@ -750,12 +1013,13 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
         buf += 2;
 
         if (buf + top_field_len + bottom_field_len > buf_end) {
-            av_log(avctx, AV_LOG_ERROR, "Field data size too large\n");
+            av_log(avctx, AV_LOG_ERROR, "Field data size %d+%d too large\n", top_field_len, bottom_field_len);
             return AVERROR_INVALIDDATA;
         }
 
         for (display = object->display_list; display; display = display->object_list_next) {
-            block = buf;
+            const uint8_t *block = buf;
+            int bfl = bottom_field_len;
 
             dvbsub_parse_pixel_data_block(avctx, display, block, top_field_len, 0,
                                             non_modifying_color);
@@ -763,9 +1027,9 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
             if (bottom_field_len > 0)
                 block = buf + top_field_len;
             else
-                bottom_field_len = top_field_len;
+                bfl = top_field_len;
 
-            dvbsub_parse_pixel_data_block(avctx, display, block, bottom_field_len, 1,
+            dvbsub_parse_pixel_data_block(avctx, display, block, bfl, 1,
                                             non_modifying_color);
         }
 
@@ -785,6 +1049,7 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
 
     const uint8_t *buf_end = buf + buf_size;
     int i, clut_id;
+    int version;
     DVBSubCLUT *clut;
     int entry_id, depth , full_range;
     int y, cr, cb, alpha;
@@ -802,6 +1067,7 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         ff_dlog(avctx, "\n");
 
     clut_id = *buf++;
+    version = ((*buf)>>4)&15;
     buf += 1;
 
     clut = get_clut(ctx, clut_id);
@@ -814,11 +1080,16 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         memcpy(clut, &default_clut, sizeof(DVBSubCLUT));
 
         clut->id = clut_id;
+        clut->version = -1;
 
         clut->next = ctx->clut_list;
         ctx->clut_list = clut;
     }
 
+    if (clut->version != version) {
+
+    clut->version = version;
+
     while (buf + 4 < buf_end) {
         entry_id = *buf++;
 
@@ -826,7 +1097,6 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
 
         if (depth == 0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid clut depth 0x%x!\n", *buf);
-            return AVERROR_INVALIDDATA;
         }
 
         full_range = (*buf++) & 1;
@@ -852,14 +1122,20 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         YUV_TO_RGB2_CCIR(r, g, b, y);
 
         ff_dlog(avctx, "clut %d := (%d,%d,%d,%d)\n", entry_id, r, g, b, alpha);
+        if (!!(depth & 0x80) + !!(depth & 0x40) + !!(depth & 0x20) > 1) {
+            ff_dlog(avctx, "More than one bit level marked: %x\n", depth);
+            if (avctx->strict_std_compliance > FF_COMPLIANCE_NORMAL)
+                return AVERROR_INVALIDDATA;
+        }
 
-        if (depth & 0x80)
+        if (depth & 0x80 && entry_id < 4)
             clut->clut4[entry_id] = RGBA(r,g,b,255 - alpha);
-        if (depth & 0x40)
+        else if (depth & 0x40 && entry_id < 16)
             clut->clut16[entry_id] = RGBA(r,g,b,255 - alpha);
-        if (depth & 0x20)
+        else if (depth & 0x20)
             clut->clut256[entry_id] = RGBA(r,g,b,255 - alpha);
     }
+    }
 
     return 0;
 }
@@ -872,10 +1148,12 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
 
     const uint8_t *buf_end = buf + buf_size;
     int region_id, object_id;
+    int av_unused version;
     DVBSubRegion *region;
     DVBSubObject *object;
     DVBSubObjectDisplay *display;
     int fill;
+    int ret;
 
     if (buf_size < 10)
         return AVERROR_INVALIDDATA;
@@ -890,11 +1168,13 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
             return AVERROR(ENOMEM);
 
         region->id = region_id;
+        region->version = -1;
 
         region->next = ctx->region_list;
         ctx->region_list = region;
     }
 
+    version = ((*buf)>>4) & 15;
     fill = ((*buf++) >> 3) & 1;
 
     region->width = AV_RB16(buf);
@@ -902,16 +1182,31 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
     region->height = AV_RB16(buf);
     buf += 2;
 
+    ret = av_image_check_size2(region->width, region->height, avctx->max_pixels, AV_PIX_FMT_PAL8, 0, avctx);
+    if (ret >= 0 && region->width * region->height * 2 > 320 * 1024 * 8) {
+        ret = AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_ERROR, "Pixel buffer memory constraint violated\n");
+    }
+    if (ret < 0) {
+        region->width= region->height= 0;
+        return ret;
+    }
+
     if (region->width * region->height != region->buf_size) {
         av_free(region->pbuf);
 
         region->buf_size = region->width * region->height;
 
         region->pbuf = av_malloc(region->buf_size);
-        if (!region->pbuf)
+        if (!region->pbuf) {
+            region->buf_size =
+            region->width =
+            region->height = 0;
             return AVERROR(ENOMEM);
+        }
 
         fill = 1;
+        region->dirty = 0;
     }
 
     region->depth = 1 << (((*buf++) >> 2) & 7);
@@ -921,9 +1216,10 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
     }
     region->clut = *buf++;
 
-    if (region->depth == 8)
+    if (region->depth == 8) {
         region->bgcolor = *buf++;
-    else {
+        buf += 1;
+    } else {
         buf += 1;
 
         if (region->depth == 4)
@@ -971,6 +1267,13 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
         display->y_pos = AV_RB16(buf) & 0xfff;
         buf += 2;
 
+        if (display->x_pos >= region->width ||
+            display->y_pos >= region->height) {
+            av_log(avctx, AV_LOG_ERROR, "Object outside region\n");
+            av_free(display);
+            return AVERROR_INVALIDDATA;
+        }
+
         if ((object->type == 1 || object->type == 2) && buf+1 < buf_end) {
             display->fgcolor = *buf++;
             display->bgcolor = *buf++;
@@ -987,7 +1290,7 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
 }
 
 static int dvbsub_parse_page_segment(AVCodecContext *avctx,
-                                     const uint8_t *buf, int buf_size)
+                                     const uint8_t *buf, int buf_size, AVSubtitle *sub, int *got_output)
 {
     DVBSubContext *ctx = avctx->priv_data;
     DVBSubRegionDisplay *display;
@@ -996,27 +1299,50 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
     const uint8_t *buf_end = buf + buf_size;
     int region_id;
     int page_state;
+    int timeout;
+    int version;
 
     if (buf_size < 1)
         return AVERROR_INVALIDDATA;
 
-    ctx->time_out = *buf++;
+    timeout = *buf++;
+    version = ((*buf)>>4) & 15;
     page_state = ((*buf++) >> 2) & 3;
 
+    if (ctx->version == version) {
+        return 0;
+    }
+
+    ctx->time_out = timeout;
+    ctx->version = version;
+
     ff_dlog(avctx, "Page time out %ds, state %d\n", ctx->time_out, page_state);
 
-    if (page_state == 2) {
-        delete_state(ctx);
+    if(ctx->compute_edt == 1)
+        save_subtitle_set(avctx, sub, got_output);
+
+    if (page_state == 1 || page_state == 2) {
+        delete_regions(ctx);
+        delete_objects(ctx);
+        delete_cluts(ctx);
     }
 
     tmp_display_list = ctx->display_list;
     ctx->display_list = NULL;
-    ctx->display_list_size = 0;
 
     while (buf + 5 < buf_end) {
         region_id = *buf++;
         buf += 1;
 
+        display = ctx->display_list;
+        while (display && display->region_id != region_id) {
+            display = display->next;
+        }
+        if (display) {
+            av_log(avctx, AV_LOG_ERROR, "duplicate region\n");
+            break;
+        }
+
         display = tmp_display_list;
         tmp_ptr = &tmp_display_list;
 
@@ -1042,7 +1368,6 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
         display->next = ctx->display_list;
         ctx->display_list = display;
-        ctx->display_list_size++;
 
         ff_dlog(avctx, "Region %d, (%d,%d)\n", region_id, display->x_pos, display->y_pos);
     }
@@ -1052,7 +1377,7 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
         tmp_display_list = display->next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
     return 0;
@@ -1060,7 +1385,7 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
 
 #ifdef DEBUG
-static void png_save(const char *filename, uint32_t *bitmap, int w, int h)
+static void png_save(DVBSubContext *ctx, const char *filename, uint32_t *bitmap, int w, int h)
 {
     int x, y, v;
     FILE *f;
@@ -1110,13 +1435,13 @@ static void png_save(const char *filename, uint32_t *bitmap, int w, int h)
 
     snprintf(command, sizeof(command), "pnmtopng -alpha %s %s > %s.png 2> /dev/null", fname2, fname, filename);
     if (system(command) != 0) {
-        printf("Error running pnmtopng\n");
+        av_log(ctx, AV_LOG_ERROR, "Error running pnmtopng\n");
         return;
     }
 
     snprintf(command, sizeof(command), "rm %s %s", fname, fname2);
     if (system(command) != 0) {
-        printf("Error removing %s and %s\n", fname, fname2);
+        av_log(ctx, AV_LOG_ERROR, "Error removing %s and %s\n", fname, fname2);
         return;
     }
 }
@@ -1141,6 +1466,9 @@ static int save_display_set(DVBSubContext *ctx)
     for (display = ctx->display_list; display; display = display->next) {
         region = get_region(ctx, display->region_id);
 
+        if (!region)
+            return -1;
+
         if (x_pos == -1) {
             x_pos = display->x_pos;
             y_pos = display->y_pos;
@@ -1171,17 +1499,20 @@ static int save_display_set(DVBSubContext *ctx)
 
         pbuf = av_malloc(width * height * 4);
         if (!pbuf)
-            return AVERROR(ENOMEM);
+            return -1;
 
         for (display = ctx->display_list; display; display = display->next) {
             region = get_region(ctx, display->region_id);
 
+            if (!region)
+                return -1;
+
             x_off = display->x_pos - x_pos;
             y_off = display->y_pos - y_pos;
 
             clut = get_clut(ctx, region->clut);
 
-            if (clut == 0)
+            if (!clut)
                 clut = &default_clut;
 
             switch (region->depth) {
@@ -1208,9 +1539,9 @@ static int save_display_set(DVBSubContext *ctx)
 
         snprintf(filename, sizeof(filename), "dvbs.%d", fileno_index);
 
-        png_save(filename, pbuf, width, height);
+        png_save(ctx, filename, pbuf, width, height);
 
-        av_free(pbuf);
+        av_freep(&pbuf);
     }
 
     fileno_index++;
@@ -1246,14 +1577,18 @@ static int dvbsub_parse_display_definition_segment(AVCodecContext *avctx,
     display_def->y       = 0;
     display_def->width   = bytestream_get_be16(&buf) + 1;
     display_def->height  = bytestream_get_be16(&buf) + 1;
-
-    if (buf_size < 13)
-        return AVERROR_INVALIDDATA;
+    if (!avctx->width || !avctx->height) {
+        avctx->width  = display_def->width;
+        avctx->height = display_def->height;
+    }
 
     if (info_byte & 1<<3) { // display_window_flag
+        if (buf_size < 13)
+            return AVERROR_INVALIDDATA;
+
         display_def->x = bytestream_get_be16(&buf);
-        display_def->y = bytestream_get_be16(&buf);
         display_def->width  = bytestream_get_be16(&buf) - display_def->x + 1;
+        display_def->y = bytestream_get_be16(&buf);
         display_def->height = bytestream_get_be16(&buf) - display_def->y + 1;
     }
 
@@ -1261,123 +1596,16 @@ static int dvbsub_parse_display_definition_segment(AVCodecContext *avctx,
 }
 
 static int dvbsub_display_end_segment(AVCodecContext *avctx, const uint8_t *buf,
-                                      int buf_size, AVSubtitle *sub)
+                                      int buf_size, AVSubtitle *sub,int *got_output)
 {
     DVBSubContext *ctx = avctx->priv_data;
-    DVBSubDisplayDefinition *display_def = ctx->display_definition;
-
-    DVBSubRegion *region;
-    DVBSubRegionDisplay *display;
-    AVSubtitleRect *rect;
-    DVBSubCLUT *clut;
-    uint32_t *clut_table;
-    int i;
-    int offset_x=0, offset_y=0;
-
-    sub->rects = NULL;
-    sub->start_display_time = 0;
-    sub->end_display_time = ctx->time_out * 1000;
-    sub->format = 0;
-
-    if (display_def) {
-        offset_x = display_def->x;
-        offset_y = display_def->y;
-    }
-
-    sub->num_rects = ctx->display_list_size;
-
-    if (sub->num_rects > 0) {
-        sub->rects = av_mallocz(sizeof(*sub->rects) * sub->num_rects);
-        if (!sub->rects)
-            return AVERROR(ENOMEM);
-        for (i = 0; i < sub->num_rects; i++) {
-            sub->rects[i] = av_mallocz(sizeof(*sub->rects[i]));
-            if (!sub->rects[i]) {
-                int j;
-                for (j = 0; j < i; j ++)
-                    av_free(sub->rects[j]);
-                av_free(sub->rects);
-                return AVERROR(ENOMEM);
-            }
-        }
-    }
-
-    i = 0;
-
-    for (display = ctx->display_list; display; display = display->next) {
-        region = get_region(ctx, display->region_id);
-        rect = sub->rects[i];
-
-        if (!region)
-            continue;
-
-        rect->x = display->x_pos + offset_x;
-        rect->y = display->y_pos + offset_y;
-        rect->w = region->width;
-        rect->h = region->height;
-        rect->nb_colors = 16;
-        rect->type      = SUBTITLE_BITMAP;
-        rect->linesize[0] = region->width;
-
-        clut = get_clut(ctx, region->clut);
-
-        if (!clut)
-            clut = &default_clut;
-
-        switch (region->depth) {
-        case 2:
-            clut_table = clut->clut4;
-            break;
-        case 8:
-            clut_table = clut->clut256;
-            break;
-        case 4:
-        default:
-            clut_table = clut->clut16;
-            break;
-        }
-
-        rect->data[1] = av_mallocz(AVPALETTE_SIZE);
-        if (!rect->data[1]) {
-            for (i = 0; i < sub->num_rects; i++)
-                av_free(sub->rects[i]);
-            av_free(sub->rects);
-            return AVERROR(ENOMEM);
-        }
-        memcpy(rect->data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
-
-        rect->data[0] = av_malloc(region->buf_size);
-        if (!rect->data[0]) {
-            av_free(rect->data[1]);
-            for (i = 0; i < sub->num_rects; i++)
-                av_free(sub->rects[i]);
-            av_free(sub->rects);
-            return AVERROR(ENOMEM);
-        }
-        memcpy(rect->data[0], region->pbuf, region->buf_size);
-
-#if FF_API_AVPICTURE
-FF_DISABLE_DEPRECATION_WARNINGS
-{
-        int j;
-        for (j = 0; j < 4; j++) {
-            rect->pict.data[j] = rect->data[j];
-            rect->pict.linesize[j] = rect->linesize[j];
-        }
-}
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-        i++;
-    }
-
-    sub->num_rects = i;
 
+    if(ctx->compute_edt == 0)
+        save_subtitle_set(avctx, sub, got_output);
 #ifdef DEBUG
     save_display_set(ctx);
 #endif
-
-    return 1;
+    return 0;
 }
 
 static int dvbsub_decode(AVCodecContext *avctx,
@@ -1393,6 +1621,9 @@ static int dvbsub_decode(AVCodecContext *avctx,
     int page_id;
     int segment_length;
     int i;
+    int ret = 0;
+    int got_segment = 0;
+    int got_dds = 0;
 
     ff_dlog(avctx, "DVB sub packet:\n");
 
@@ -1421,9 +1652,14 @@ static int dvbsub_decode(AVCodecContext *avctx,
         segment_length = AV_RB16(p);
         p += 2;
 
+        if (avctx->debug & FF_DEBUG_STARTCODE) {
+            av_log(avctx, AV_LOG_DEBUG, "segment_type:%d page_id:%d segment_length:%d\n", segment_type, page_id, segment_length);
+        }
+
         if (p_end - p < segment_length) {
             ff_dlog(avctx, "incomplete or broken packet");
-            return -1;
+            ret = -1;
+            goto end;
         }
 
         if (page_id == ctx->composition_id || page_id == ctx->ancillary_id ||
@@ -1431,24 +1667,35 @@ static int dvbsub_decode(AVCodecContext *avctx,
             int ret = 0;
             switch (segment_type) {
             case DVBSUB_PAGE_SEGMENT:
-                ret = dvbsub_parse_page_segment(avctx, p, segment_length);
+                ret = dvbsub_parse_page_segment(avctx, p, segment_length, sub, data_size);
+                got_segment |= 1;
                 break;
             case DVBSUB_REGION_SEGMENT:
                 ret = dvbsub_parse_region_segment(avctx, p, segment_length);
+                got_segment |= 2;
                 break;
             case DVBSUB_CLUT_SEGMENT:
                 ret = dvbsub_parse_clut_segment(avctx, p, segment_length);
+                if (ret < 0) goto end;
+                got_segment |= 4;
                 break;
             case DVBSUB_OBJECT_SEGMENT:
                 ret = dvbsub_parse_object_segment(avctx, p, segment_length);
+                got_segment |= 8;
                 break;
             case DVBSUB_DISPLAYDEFINITION_SEGMENT:
                 ret = dvbsub_parse_display_definition_segment(avctx, p,
                                                               segment_length);
+                got_dds = 1;
                 break;
             case DVBSUB_DISPLAY_SEGMENT:
-                ret = dvbsub_display_end_segment(avctx, p, segment_length, sub);
-                *data_size = ret;
+                ret = dvbsub_display_end_segment(avctx, p, segment_length, sub, data_size);
+                if (got_segment == 15 && !got_dds && !avctx->width && !avctx->height) {
+                    // Default from ETSI EN 300 743 V1.3.1 (7.2.1)
+                    avctx->width  = 720;
+                    avctx->height = 576;
+                }
+                got_segment |= 16;
                 break;
             default:
                 ff_dlog(avctx, "Subtitling segment type 0x%x, page id %d, length %d\n",
@@ -1456,15 +1703,44 @@ static int dvbsub_decode(AVCodecContext *avctx,
                 break;
             }
             if (ret < 0)
-                return ret;
+                goto end;
         }
 
         p += segment_length;
     }
+    // Some streams do not send a display segment but if we have all the other
+    // segments then we need no further data.
+    if (got_segment == 15) {
+        av_log(avctx, AV_LOG_DEBUG, "Missing display_end_segment, emulating\n");
+        dvbsub_display_end_segment(avctx, p, 0, sub, data_size);
+    }
+
+end:
+    if(ret < 0) {
+        *data_size = 0;
+        avsubtitle_free(sub);
+        return ret;
+    } else {
+        if(ctx->compute_edt == 1 )
+            FFSWAP(int64_t, ctx->prev_start, sub->pts);
+    }
 
     return p - buf;
 }
 
+#define DS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
+static const AVOption options[] = {
+    {"compute_edt", "compute end of time using pts or timeout", offsetof(DVBSubContext, compute_edt), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, DS},
+    {"compute_clut", "compute clut when not available(-1) or always(1) or never(0)", offsetof(DVBSubContext, compute_clut), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, DS},
+    {"dvb_substream", "", offsetof(DVBSubContext, substream), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, DS},
+    {NULL}
+};
+static const AVClass dvbsubdec_class = {
+    .class_name = "DVB Sub Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 AVCodec ff_dvbsub_decoder = {
     .name           = "dvbsub",
@@ -1475,4 +1751,5 @@ AVCodec ff_dvbsub_decoder = {
     .init           = dvbsub_init_decoder,
     .close          = dvbsub_close_decoder,
     .decode         = dvbsub_decode,
+    .priv_class     = &dvbsubdec_class,
 };
diff --git a/libavcodec/dvbtxt.h b/libavcodec/dvbtxt.h
new file mode 100644
index 0000000..ff88fcf
--- /dev/null
+++ b/libavcodec/dvbtxt.h
@@ -0,0 +1,41 @@
+/*
+ * DVB teletext common functions.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DVBTXT_H
+#define AVCODEC_DVBTXT_H
+
+#include "libavutil/attributes.h"
+
+/* Returns true if data identifier matches a teletext stream according to EN
+ * 301 775 section 4.4.2 */
+static av_always_inline int ff_data_identifier_is_teletext(int data_identifier)
+{
+    return (data_identifier >= 0x10 && data_identifier <= 0x1F ||
+            data_identifier >= 0x99 && data_identifier <= 0x9B);
+}
+
+/* Returns true if data unit id matches EBU teletext data according to
+ * EN 301 775 section 4.4.2 */
+static av_always_inline int ff_data_unit_id_is_teletext(int data_unit_id)
+{
+    return (data_unit_id == 0x02 || data_unit_id == 0x03);
+}
+
+#endif /* AVCODEC_DVBTXT_H */
diff --git a/libavcodec/dvd_nav_parser.c b/libavcodec/dvd_nav_parser.c
new file mode 100644
index 0000000..6e2352d
--- /dev/null
+++ b/libavcodec/dvd_nav_parser.c
@@ -0,0 +1,115 @@
+/*
+ * DVD navigation block parser for FFmpeg
+ * Copyright (c) 2013 The FFmpeg Project
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "get_bits.h"
+#include "parser.h"
+
+#define PCI_SIZE  980
+#define DSI_SIZE 1018
+
+/* parser definition */
+typedef struct DVDNavParseContext {
+    uint32_t     lba;
+    uint8_t      buffer[PCI_SIZE+DSI_SIZE];
+    int          copied;
+} DVDNavParseContext;
+
+static av_cold int dvd_nav_parse_init(AVCodecParserContext *s)
+{
+    DVDNavParseContext *pc = s->priv_data;
+
+    pc->lba    = 0xFFFFFFFF;
+    pc->copied = 0;
+    return 0;
+}
+
+static int dvd_nav_parse(AVCodecParserContext *s,
+                         AVCodecContext *avctx,
+                         const uint8_t **poutbuf, int *poutbuf_size,
+                         const uint8_t *buf, int buf_size)
+{
+    DVDNavParseContext *pc1 = s->priv_data;
+    int lastPacket          = 0;
+    int valid               = 0;
+
+    s->pict_type = AV_PICTURE_TYPE_NONE;
+
+    avctx->time_base.num = 1;
+    avctx->time_base.den = 90000;
+
+    if (buf && buf_size) {
+        switch(buf[0]) {
+            case 0x00:
+                if (buf_size == PCI_SIZE) {
+                    /* PCI */
+                    uint32_t lba      = AV_RB32(&buf[0x01]);
+                    uint32_t startpts = AV_RB32(&buf[0x0D]);
+                    uint32_t endpts   = AV_RB32(&buf[0x11]);
+
+                    if (endpts > startpts) {
+                        pc1->lba    = lba;
+                        s->pts      = (int64_t)startpts;
+                        s->duration = endpts - startpts;
+
+                        memcpy(pc1->buffer, buf, PCI_SIZE);
+                        pc1->copied = PCI_SIZE;
+                        valid       = 1;
+                    }
+                }
+                break;
+
+            case 0x01:
+                if ((buf_size == DSI_SIZE) && (pc1->copied == PCI_SIZE)) {
+                    /* DSI */
+                    uint32_t lba = AV_RB32(&buf[0x05]);
+
+                    if (lba == pc1->lba) {
+                        memcpy(pc1->buffer + pc1->copied, buf, DSI_SIZE);
+                        lastPacket  = 1;
+                        valid       = 1;
+                    }
+                }
+                break;
+        }
+    }
+
+    if (!valid || lastPacket) {
+        pc1->copied = 0;
+        pc1->lba    = 0xFFFFFFFF;
+    }
+
+    if (lastPacket) {
+        *poutbuf      = pc1->buffer;
+        *poutbuf_size = sizeof(pc1->buffer);
+    } else {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+    }
+
+    return buf_size;
+}
+
+AVCodecParser ff_dvd_nav_parser = {
+    .codec_ids      = { AV_CODEC_ID_DVD_NAV },
+    .priv_data_size = sizeof(DVDNavParseContext),
+    .parser_init    = dvd_nav_parse_init,
+    .parser_parse   = dvd_nav_parse,
+};
diff --git a/libavcodec/dvdata.c b/libavcodec/dvdata.c
index 985cda7..231569a 100644
--- a/libavcodec/dvdata.c
+++ b/libavcodec/dvdata.c
@@ -2,20 +2,20 @@
  * Constants for DV codec
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,71 +69,6 @@ const uint8_t ff_dv_quant_shifts[22][4] = {
 
 const uint8_t ff_dv_quant_offset[4] = { 6, 3, 0, 1 };
 
-const int ff_dv_iweight_88[64] = {
-    32768, 16710, 16710, 17735, 17015, 17735, 18197, 18079,
-    18079, 18197, 18725, 18559, 19196, 18559, 18725, 19284,
-    19108, 19692, 19692, 19108, 19284, 21400, 19645, 20262,
-    20214, 20262, 19645, 21400, 22733, 21845, 20867, 20815,
-    20815, 20867, 21845, 22733, 23173, 23173, 21400, 21400,
-    21400, 23173, 23173, 24600, 23764, 22017, 22017, 23764,
-    24600, 25267, 24457, 22672, 24457, 25267, 25971, 25191,
-    25191, 25971, 26715, 27962, 26715, 29642, 29642, 31536,
-};
-const int ff_dv_iweight_248[64] = {
-    32768, 17735, 16710, 18079, 18725, 21400, 17735, 19196,
-    19108, 21845, 16384, 17735, 18725, 21400, 16710, 18079,
-    20262, 23173, 18197, 19692, 18725, 20262, 20815, 23764,
-    17735, 19196, 19108, 21845, 20262, 23173, 18197, 19692,
-    21400, 24457, 19284, 20867, 21400, 23173, 22017, 25191,
-    18725, 20262, 20815, 23764, 21400, 24457, 19284, 20867,
-    24457, 27962, 22733, 24600, 25971, 29642, 21400, 23173,
-    22017, 25191, 24457, 27962, 22733, 24600, 25971, 29642,
-};
-
-/**
- * The "inverse" DV100 weights are actually just the spec weights (zig-zagged).
- */
-const int ff_dv_iweight_1080_y[64] = {
-    128,  16,  16,  17,  17,  17,  18,  18,
-     18,  18,  18,  18,  19,  18,  18,  19,
-     19,  19,  19,  19,  19,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  45,  45,  42,  42,
-     42,  45,  45,  48,  46,  43,  43,  46,
-     48,  49,  48,  44,  48,  49, 101,  98,
-     98, 101, 104, 109, 104, 116, 116, 123,
-};
-const int ff_dv_iweight_1080_c[64] = {
-    128,  16,  16,  17,  17,  17,  25,  25,
-     25,  25,  26,  25,  26,  25,  26,  26,
-     26,  27,  27,  26,  26,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  91,  91,  84,  84,
-     84,  91,  91,  96,  93,  86,  86,  93,
-     96, 197, 191, 177, 191, 197, 203, 197,
-    197, 203, 209, 219, 209, 232, 232, 246,
-};
-const int ff_dv_iweight_720_y[64] = {
-    128,  16,  16,  17,  17,  17,  18,  18,
-     18,  18,  18,  18,  19,  18,  18,  19,
-     19,  19,  19,  19,  19,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  68,  68,  63,  63,
-     63,  68,  68,  96,  92,  86,  86,  92,
-     96,  98,  96,  88,  96,  98, 202, 196,
-    196, 202, 208, 218, 208, 232, 232, 246,
-};
-const int ff_dv_iweight_720_c[64] = {
-    128,  24,  24,  26,  26,  26,  36,  36,
-     36,  36,  36,  36,  38,  36,  36,  38,
-     38,  38,  38,  38,  38,  84,  76,  80,
-     80,  80,  76,  84,  88,  86,  82,  82,
-     82,  82,  86,  88, 182, 182, 168, 168,
-    168, 182, 182, 192, 186, 192, 172, 186,
-    192, 394, 382, 354, 382, 394, 406, 394,
-    394, 406, 418, 438, 418, 464, 464, 492,
-};
-
 /*
  * There's a catch about the following three tables: the mapping they establish
  * between (run, level) and vlc is not 1-1. So you have to watch out for that
diff --git a/libavcodec/dvdata.h b/libavcodec/dvdata.h
index 8e7c0fb..e0ed043 100644
--- a/libavcodec/dvdata.h
+++ b/libavcodec/dvdata.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,13 +26,6 @@ extern const uint8_t ff_dv_zigzag248_direct[64];
 extern const uint8_t ff_dv_quant_shifts[22][4];
 extern const uint8_t ff_dv_quant_offset[4];
 
-extern const int ff_dv_iweight_88[64];
-extern const int ff_dv_iweight_248[64];
-extern const int ff_dv_iweight_1080_y[64];
-extern const int ff_dv_iweight_1080_c[64];
-extern const int ff_dv_iweight_720_y[64];
-extern const int ff_dv_iweight_720_c[64];
-
 #define NB_DV_VLC 409
 
 extern const uint16_t ff_dv_vlc_bits[NB_DV_VLC];
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index a2f0171..7b16787 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -13,20 +13,20 @@
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,14 +35,16 @@
  * DV decoder
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "dv.h"
+#include "dv_profile_internal.h"
 #include "dvdata.h"
+#include "get_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "put_bits.h"
@@ -60,18 +62,136 @@ typedef struct BlockInfo {
 
 static const int dv_iweight_bits = 14;
 
+static const uint16_t dv_iweight_88[64] = {
+    32768, 16705, 16705, 17734, 17032, 17734, 18205, 18081,
+    18081, 18205, 18725, 18562, 19195, 18562, 18725, 19266,
+    19091, 19705, 19705, 19091, 19266, 21407, 19643, 20267,
+    20228, 20267, 19643, 21407, 22725, 21826, 20853, 20806,
+    20806, 20853, 21826, 22725, 23170, 23170, 21407, 21400,
+    21407, 23170, 23170, 24598, 23786, 22018, 22018, 23786,
+    24598, 25251, 24465, 22654, 24465, 25251, 25972, 25172,
+    25172, 25972, 26722, 27969, 26722, 29692, 29692, 31521,
+};
+static const uint16_t dv_iweight_248[64] = {
+    32768, 16384, 16705, 16705, 17734, 17734, 17734, 17734,
+    18081, 18081, 18725, 18725, 21407, 21407, 19091, 19091,
+    19195, 19195, 18205, 18205, 18725, 18725, 19705, 19705,
+    20267, 20267, 21826, 21826, 23170, 23170, 20806, 20806,
+    20267, 20267, 19266, 19266, 21407, 21407, 20853, 20853,
+    21400, 21400, 23786, 23786, 24465, 24465, 22018, 22018,
+    23170, 23170, 22725, 22725, 24598, 24598, 24465, 24465,
+    25172, 25172, 27969, 27969, 25972, 25972, 29692, 29692
+};
+
+/**
+ * The "inverse" DV100 weights are actually just the spec weights (zig-zagged).
+ */
+static const uint16_t dv_iweight_1080_y[64] = {
+    128,  16,  16,  17,  17,  17,  18,  18,
+     18,  18,  18,  18,  19,  18,  18,  19,
+     19,  19,  19,  19,  19,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  45,  45,  42,  42,
+     42,  45,  45,  48,  46,  43,  43,  46,
+     48,  49,  48,  44,  48,  49, 101,  98,
+     98, 101, 104, 109, 104, 116, 116, 123,
+};
+static const uint16_t dv_iweight_1080_c[64] = {
+    128,  16,  16,  17,  17,  17,  25,  25,
+     25,  25,  26,  25,  26,  25,  26,  26,
+     26,  27,  27,  26,  26,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  91,  91,  84,  84,
+     84,  91,  91,  96,  93,  86,  86,  93,
+     96, 197, 191, 177, 191, 197, 203, 197,
+    197, 203, 209, 219, 209, 232, 232, 246,
+};
+static const uint16_t dv_iweight_720_y[64] = {
+    128,  16,  16,  17,  17,  17,  18,  18,
+     18,  18,  18,  18,  19,  18,  18,  19,
+     19,  19,  19,  19,  19,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  68,  68,  63,  63,
+     63,  68,  68,  96,  92,  86,  86,  92,
+     96,  98,  96,  88,  96,  98, 202, 196,
+    196, 202, 208, 218, 208, 232, 232, 246,
+};
+static const uint16_t dv_iweight_720_c[64] = {
+    128,  24,  24,  26,  26,  26,  36,  36,
+     36,  36,  36,  36,  38,  36,  36,  38,
+     38,  38,  38,  38,  38,  84,  76,  80,
+     80,  80,  76,  84,  88,  86,  82,  82,
+     82,  82,  86,  88, 182, 182, 168, 168,
+    168, 182, 182, 192, 186, 192, 172, 186,
+    192, 394, 382, 354, 382, 394, 406, 394,
+    394, 406, 418, 438, 418, 464, 464, 492,
+};
+
+static void dv_init_weight_tables(DVVideoContext *ctx, const AVDVProfile *d)
+{
+    int j, i, c, s;
+    uint32_t *factor1 = &ctx->idct_factor[0],
+             *factor2 = &ctx->idct_factor[DV_PROFILE_IS_HD(d) ? 4096 : 2816];
+
+    if (DV_PROFILE_IS_HD(d)) {
+        /* quantization quanta by QNO for DV100 */
+        static const uint8_t dv100_qstep[16] = {
+            1, /* QNO = 0 and 1 both have no quantization */
+            1,
+            2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
+        };
+        const uint16_t *iweight1, *iweight2;
+
+        if (d->height == 720) {
+            iweight1 = &dv_iweight_720_y[0];
+            iweight2 = &dv_iweight_720_c[0];
+        } else {
+            iweight1 = &dv_iweight_1080_y[0];
+            iweight2 = &dv_iweight_1080_c[0];
+        }
+        for (c = 0; c < 4; c++) {
+            for (s = 0; s < 16; s++) {
+                for (i = 0; i < 64; i++) {
+                    *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
+                    *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
+                }
+            }
+        }
+    } else {
+        static const uint8_t dv_quant_areas[4] = { 6, 21, 43, 64 };
+        const uint16_t *iweight1 = &dv_iweight_88[0];
+        for (j = 0; j < 2; j++, iweight1 = &dv_iweight_248[0]) {
+            for (s = 0; s < 22; s++) {
+                for (i = c = 0; c < 4; c++) {
+                    for (; i < dv_quant_areas[c]; i++) {
+                        *factor1   = iweight1[i] << (ff_dv_quant_shifts[s][c] + 1);
+                        *factor2++ = (*factor1++) << 1;
+                    }
+                }
+            }
+        }
+    }
+}
+
 static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
 {
     DVVideoContext *s = avctx->priv_data;
     IDCTDSPContext idsp;
     int i;
 
+    memset(&idsp,0, sizeof(idsp));
     ff_idctdsp_init(&idsp, avctx);
 
     for (i = 0; i < 64; i++)
         s->dv_zigzag[0][i] = idsp.idct_permutation[ff_zigzag_direct[i]];
 
-    memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
+    if (avctx->lowres){
+        for (i = 0; i < 64; i++){
+            int j = ff_dv_zigzag248_direct[i];
+            s->dv_zigzag[1][i] = idsp.idct_permutation[(j & 7) + (j & 8) * 4 + (j & 48) / 2];
+        }
+    }else
+        memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
 
     s->idct_put[0] = idsp.idct_put;
     s->idct_put[1] = ff_simple_idct248_put;
@@ -80,34 +200,51 @@ static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
 }
 
 /* decode AC coefficients */
-static void dv_decode_ac(BitstreamContext *bc, BlockInfo *mb, int16_t *block)
+static void dv_decode_ac(GetBitContext *gb, BlockInfo *mb, int16_t *block)
 {
+    int last_index = gb->size_in_bits;
     const uint8_t  *scan_table   = mb->scan_table;
     const uint32_t *factor_table = mb->factor_table;
     int pos                      = mb->pos;
     int partial_bit_count        = mb->partial_bit_count;
-    int level, run;
+    int level, run, vlc_len, index;
+
+    OPEN_READER_NOSIZE(re, gb);
+    UPDATE_CACHE(re, gb);
 
     /* if we must parse a partial VLC, we do it here */
     if (partial_bit_count > 0) {
-        bitstream_unget(bc, mb->partial_bit_buffer, partial_bit_count);
+        re_cache              = re_cache >> partial_bit_count |
+                                mb->partial_bit_buffer;
+        re_index             -= partial_bit_count;
         mb->partial_bit_count = 0;
     }
 
     /* get the AC coefficients until last_index is reached */
     for (;;) {
-        BitstreamContext tmp = *bc;
-
-        ff_dlog(NULL, "%2d: bits=%04x index=%d\n",
-                pos, bitstream_peek(bc, 16), bitstream_tell(bc));
-
-        BITSTREAM_RL_VLC(level, run, bc, ff_dv_rl_vlc, TEX_VLC_BITS, 2);
-
-        if (bitstream_bits_left(bc) < 0) {
-            mb->partial_bit_count  = bitstream_bits_left(&tmp);
-            mb->partial_bit_buffer = bitstream_peek(&tmp, mb->partial_bit_count);
+        ff_dlog(NULL, "%2d: bits=%04"PRIx32" index=%u\n",
+                pos, SHOW_UBITS(re, gb, 16), re_index);
+        /* our own optimized GET_RL_VLC */
+        index   = NEG_USR32(re_cache, TEX_VLC_BITS);
+        vlc_len = ff_dv_rl_vlc[index].len;
+        if (vlc_len < 0) {
+            index = NEG_USR32((unsigned) re_cache << TEX_VLC_BITS, -vlc_len) +
+                    ff_dv_rl_vlc[index].level;
+            vlc_len = TEX_VLC_BITS - vlc_len;
+        }
+        level = ff_dv_rl_vlc[index].level;
+        run   = ff_dv_rl_vlc[index].run;
+
+        /* gotta check if we're still within gb boundaries */
+        if (re_index + vlc_len > last_index) {
+            /* should be < 16 bits otherwise a codeword could have been parsed */
+            mb->partial_bit_count  = last_index - re_index;
+            mb->partial_bit_buffer = re_cache & ~(-1u >> mb->partial_bit_count);
+            re_index               = last_index;
             break;
         }
+        re_index += vlc_len;
+
         ff_dlog(NULL, "run=%d level=%d\n", run, level);
         pos += run;
         if (pos >= 64)
@@ -116,22 +253,22 @@ static void dv_decode_ac(BitstreamContext *bc, BlockInfo *mb, int16_t *block)
         level = (level * factor_table[pos] + (1 << (dv_iweight_bits - 1))) >>
                 dv_iweight_bits;
         block[scan_table[pos]] = level;
+
+        UPDATE_CACHE(re, gb);
     }
+    CLOSE_READER(re, gb);
     mb->pos = pos;
 }
 
-static inline void bit_copy(PutBitContext *pb, BitstreamContext *bc)
+static inline void bit_copy(PutBitContext *pb, GetBitContext *gb)
 {
-    int bits_left = bitstream_bits_left(bc);
-
-    while (bits_left >= 32) {
-        int read = bitstream_read(bc, 32);
-        put_bits32(pb, read);
-        bits_left -= 32;
+    int bits_left = get_bits_left(gb);
+    while (bits_left >= MIN_CACHE_BITS) {
+        put_bits(pb, MIN_CACHE_BITS, get_bits(gb, MIN_CACHE_BITS));
+        bits_left -= MIN_CACHE_BITS;
     }
-
     if (bits_left > 0)
-        put_bits(pb, bits_left, bitstream_read(bc, bits_left));
+        put_bits(pb, bits_left, get_bits(gb, bits_left));
 }
 
 /* mb_x and mb_y are in units of 8 pixels */
@@ -147,17 +284,22 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     uint8_t *y_ptr;
     const uint8_t *buf_ptr;
     PutBitContext pb, vs_pb;
-    BitstreamContext bc;
+    GetBitContext gb;
     BlockInfo mb_data[5 * DV_MAX_BPM], *mb, *mb1;
     LOCAL_ALIGNED_16(int16_t, sblock, [5 * DV_MAX_BPM], [64]);
     LOCAL_ALIGNED_16(uint8_t, mb_bit_buffer, [80     + AV_INPUT_BUFFER_PADDING_SIZE]); /* allow some slack */
     LOCAL_ALIGNED_16(uint8_t, vs_bit_buffer, [80 * 5 + AV_INPUT_BUFFER_PADDING_SIZE]); /* allow some slack */
-    const int log2_blocksize = 3;
+    const int log2_blocksize = 3-s->avctx->lowres;
     int is_field_mode[5];
-    int mb_bits;
+    int vs_bit_buffer_damaged = 0;
+    int mb_bit_buffer_damaged[5] = {0};
+    int retried = 0;
+    int sta;
 
-    assert((((int) mb_bit_buffer) & 7) == 0);
-    assert((((int) vs_bit_buffer) & 7) == 0);
+    av_assert1((((int) mb_bit_buffer) & 7) == 0);
+    av_assert1((((int) vs_bit_buffer) & 7) == 0);
+
+retry:
 
     memset(sblock, 0, 5 * DV_MAX_BPM * sizeof(*sblock));
 
@@ -169,6 +311,14 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     for (mb_index = 0; mb_index < 5; mb_index++, mb1 += s->sys->bpm, block1 += s->sys->bpm * 64) {
         /* skip header */
         quant    = buf_ptr[3] & 0x0f;
+        if (avctx->error_concealment) {
+            if ((buf_ptr[3] >> 4) == 0x0E)
+                vs_bit_buffer_damaged = 1;
+            if (!mb_index) {
+                sta = buf_ptr[3] >> 4;
+            } else if (sta != (buf_ptr[3] >> 4))
+                vs_bit_buffer_damaged = 1;
+        }
         buf_ptr += 4;
         init_put_bits(&pb, mb_bit_buffer, 80);
         mb    = mb1;
@@ -176,12 +326,12 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
         is_field_mode[mb_index] = 0;
         for (j = 0; j < s->sys->bpm; j++) {
             last_index = s->sys->block_sizes[j];
-            bitstream_init(&bc, buf_ptr, last_index);
+            init_get_bits(&gb, buf_ptr, last_index);
 
             /* get the DC */
-            dc       = bitstream_read_signed(&bc, 9);
-            dct_mode = bitstream_read_bit(&bc);
-            class1   = bitstream_read(&bc, 2);
+            dc       = get_sbits(&gb, 9);
+            dct_mode = get_bits1(&gb);
+            class1   = get_bits(&gb, 2);
             if (DV_PROFILE_IS_HD(s->sys)) {
                 mb->idct_put     = s->idct_put[0];
                 mb->scan_table   = s->dv_zigzag[0];
@@ -197,7 +347,7 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                                     dct_mode                        * 22 * 64 +
                                     (quant + ff_dv_quant_offset[class1]) * 64];
             }
-            dc = dc << 2;
+            dc = dc * 4;
             /* convert to unsigned because 128 is not added in the
              * standard IDCT */
             dc                   += 1024;
@@ -207,62 +357,73 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
             mb->partial_bit_count = 0;
 
             ff_dlog(avctx, "MB block: %d, %d ", mb_index, j);
-            dv_decode_ac(&bc, mb, block);
+            dv_decode_ac(&gb, mb, block);
 
             /* write the remaining bits in a new buffer only if the
              * block is finished */
             if (mb->pos >= 64)
-                bit_copy(&pb, &bc);
+                bit_copy(&pb, &gb);
+            if (mb->pos >= 64 && mb->pos < 127)
+                vs_bit_buffer_damaged = mb_bit_buffer_damaged[mb_index] = 1;
 
             block += 64;
             mb++;
         }
 
+        if (mb_bit_buffer_damaged[mb_index] > 0)
+            continue;
+
         /* pass 2: we can do it just after */
         ff_dlog(avctx, "***pass 2 size=%d MB#=%d\n", put_bits_count(&pb), mb_index);
         block = block1;
         mb    = mb1;
-
-        mb_bits = put_bits_count(&pb);
+        init_get_bits(&gb, mb_bit_buffer, put_bits_count(&pb));
         put_bits32(&pb, 0); // padding must be zeroed
         flush_put_bits(&pb);
-        bitstream_init(&bc, mb_bit_buffer, mb_bits);
-
         for (j = 0; j < s->sys->bpm; j++, block += 64, mb++) {
-            if (mb->pos < 64 && bitstream_bits_left(&bc) > 0) {
-                dv_decode_ac(&bc, mb, block);
+            if (mb->pos < 64 && get_bits_left(&gb) > 0) {
+                dv_decode_ac(&gb, mb, block);
                 /* if still not finished, no need to parse other blocks */
                 if (mb->pos < 64)
                     break;
+                if (mb->pos < 127)
+                    vs_bit_buffer_damaged = mb_bit_buffer_damaged[mb_index] = 1;
             }
         }
         /* all blocks are finished, so the extra bytes can be used at
          * the video segment level */
         if (j >= s->sys->bpm)
-            bit_copy(&vs_pb, &bc);
+            bit_copy(&vs_pb, &gb);
     }
 
     /* we need a pass over the whole video segment */
     ff_dlog(avctx, "***pass 3 size=%d\n", put_bits_count(&vs_pb));
     block = &sblock[0][0];
     mb    = mb_data;
-    mb_bits = put_bits_count(&vs_pb);
+    init_get_bits(&gb, vs_bit_buffer, put_bits_count(&vs_pb));
     put_bits32(&vs_pb, 0); // padding must be zeroed
     flush_put_bits(&vs_pb);
-    bitstream_init(&bc, vs_bit_buffer, mb_bits);
     for (mb_index = 0; mb_index < 5; mb_index++) {
         for (j = 0; j < s->sys->bpm; j++) {
-            if (mb->pos < 64) {
+            if (mb->pos < 64 && get_bits_left(&gb) > 0 && !vs_bit_buffer_damaged) {
                 ff_dlog(avctx, "start %d:%d\n", mb_index, j);
-                dv_decode_ac(&bc, mb, block);
+                dv_decode_ac(&gb, mb, block);
             }
-            if (mb->pos >= 64 && mb->pos < 127)
+
+            if (mb->pos >= 64 && mb->pos < 127) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "AC EOB marker is absent pos=%"PRIu8"\n", mb->pos);
+                       "AC EOB marker is absent pos=%d\n", mb->pos);
+                vs_bit_buffer_damaged = 1;
+            }
             block += 64;
             mb++;
         }
     }
+    if (vs_bit_buffer_damaged && !retried) {
+        av_log(avctx, AV_LOG_ERROR, "Concealing bitstream errors\n");
+        retried = 1;
+        goto retry;
+    }
 
     /* compute idct and place blocks */
     block = &sblock[0][0];
@@ -305,9 +466,9 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                 int x, y;
                 mb->idct_put(pixels, 8, block);
                 for (y = 0; y < (1 << log2_blocksize); y++, c_ptr += s->frame->linesize[j], pixels += 8) {
-                    ptr1   = pixels + (1 << (log2_blocksize - 1));
+                    ptr1   = pixels + ((1 << (log2_blocksize))>>1);
                     c_ptr1 = c_ptr + (s->frame->linesize[j] << log2_blocksize);
-                    for (x = 0; x < (1 << (log2_blocksize - 1)); x++) {
+                    for (x = 0; x < (1 << FFMAX(log2_blocksize - 1, 0)); x++) {
                         c_ptr[x]  = pixels[x];
                         c_ptr1[x] = ptr1[x];
                     }
@@ -343,7 +504,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
     int apt, is16_9, ret;
     const AVDVProfile *sys;
 
-    sys = av_dv_frame_profile(s->sys, buf, buf_size);
+    sys = ff_dv_frame_profile(avctx, s->sys, buf, buf_size);
     if (!sys || buf_size < sys->frame_size) {
         av_log(avctx, AV_LOG_ERROR, "could not find dv frame profile\n");
         return -1; /* NOTE: we only accept several full frames */
@@ -355,6 +516,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
             av_log(avctx, AV_LOG_ERROR, "Error initializing the work tables.\n");
             return ret;
         }
+        dv_init_weight_tables(s, sys);
         s->sys = sys;
     }
 
@@ -377,13 +539,16 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
         ff_set_sar(avctx, s->sys->sar[is16_9]);
     }
 
-    if (ff_get_buffer(avctx, frame, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
     frame->interlaced_frame = 1;
     frame->top_field_first  = 0;
 
+    /* Determine the codec's field order from the packet */
+    if ( *vsc_pack == dv_video_control ) {
+        frame->top_field_first = !(vsc_pack[3] & 0x40);
+    }
+
     s->buf = buf;
     avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
                    dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
@@ -405,4 +570,5 @@ AVCodec ff_dvvideo_decoder = {
     .init           = dvvideo_decode_init,
     .decode         = dvvideo_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .max_lowres     = 3,
 };
diff --git a/libavcodec/dvdsub_parser.c b/libavcodec/dvdsub_parser.c
index 2ad3b33..698ccb6 100644
--- a/libavcodec/dvdsub_parser.c
+++ b/libavcodec/dvdsub_parser.c
@@ -2,20 +2,20 @@
  * DVD subtitle decoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,14 +44,24 @@ static int dvdsub_parse(AVCodecParserContext *s,
 {
     DVDSubParseContext *pc = s->priv_data;
 
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+
     if (pc->packet_index == 0) {
-        if (buf_size < 2)
-            return 0;
+        if (buf_size < 2 || AV_RB16(buf) && buf_size < 6) {
+            if (buf_size)
+                av_log(avctx, AV_LOG_DEBUG, "Parser input %d too small\n", buf_size);
+            return buf_size;
+        }
         pc->packet_len = AV_RB16(buf);
         if (pc->packet_len == 0) /* HD-DVD subpicture packet */
             pc->packet_len = AV_RB32(buf+2);
         av_freep(&pc->packet);
-        pc->packet = av_malloc(pc->packet_len);
+        if ((unsigned)pc->packet_len > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "packet length %d is invalid\n", pc->packet_len);
+            return buf_size;
+        }
+        pc->packet = av_malloc(pc->packet_len + AV_INPUT_BUFFER_PADDING_SIZE);
     }
     if (pc->packet) {
         if (pc->packet_index + buf_size <= pc->packet_len) {
diff --git a/libavcodec/dvdsubdec.c b/libavcodec/dvdsubdec.c
index b02bb6b..741ea9f 100644
--- a/libavcodec/dvdsubdec.c
+++ b/libavcodec/dvdsubdec.c
@@ -2,35 +2,50 @@
  * DVD subtitle decoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #include "libavutil/attributes.h"
 #include "libavutil/colorspace.h"
+#include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bswap.h"
 
-typedef struct DVDSubContext {
-    uint32_t palette[16];
-    int      has_palette;
+typedef struct DVDSubContext
+{
+  AVClass *class;
+  uint32_t palette[16];
+  char    *palette_str;
+  char    *ifo_str;
+  int      has_palette;
+  uint8_t  colormap[4];
+  uint8_t  alpha[256];
+  uint8_t  buf[0x10000];
+  int      buf_size;
+  int      forced_subs_only;
+  uint8_t  used_color[256];
+#ifdef DEBUG
+  int sub_id;
+#endif
 } DVDSubContext;
 
 static void yuv_a_to_rgba(const uint8_t *ycbcr, const uint8_t *alpha, uint32_t *rgba, int num_values)
@@ -46,17 +61,17 @@ static void yuv_a_to_rgba(const uint8_t *ycbcr, const uint8_t *alpha, uint32_t *
         cb = *ycbcr++;
         YUV_TO_RGB1_CCIR(cb, cr);
         YUV_TO_RGB2_CCIR(r, g, b, y);
-        *rgba++ = (*alpha++ << 24) | (r << 16) | (g << 8) | b;
+        *rgba++ = ((unsigned)*alpha++ << 24) | (r << 16) | (g << 8) | b;
     }
 }
 
-static int decode_run_2bit(BitstreamContext *bc, int *color)
+static int decode_run_2bit(GetBitContext *gb, int *color)
 {
     unsigned int v, t;
 
     v = 0;
     for (t = 1; v < t && t <= 0x40; t <<= 2)
-        v = (v << 4) | bitstream_read(bc, 4);
+        v = (v << 4) | get_bits(gb, 4);
     *color = v & 3;
     if (v < 4) { /* Code for fill rest of line */
         return INT_MAX;
@@ -64,51 +79,57 @@ static int decode_run_2bit(BitstreamContext *bc, int *color)
     return v >> 2;
 }
 
-static int decode_run_8bit(BitstreamContext *bc, int *color)
+static int decode_run_8bit(GetBitContext *gb, int *color)
 {
     int len;
-    int has_run = bitstream_read_bit(bc);
-    if (bitstream_read_bit(bc))
-        *color = bitstream_read(bc, 8);
-    else
-        *color = bitstream_read(bc, 2);
+    int has_run = get_bits1(gb);
+    *color = get_bits(gb, 2 + 6*get_bits1(gb));
     if (has_run) {
-        if (bitstream_read_bit(bc)) {
-            len = bitstream_read(bc, 7);
+        if (get_bits1(gb)) {
+            len = get_bits(gb, 7);
             if (len == 0)
                 len = INT_MAX;
             else
                 len += 9;
         } else
-            len = bitstream_read(bc, 3) + 2;
+            len = get_bits(gb, 3) + 2;
     } else
         len = 1;
     return len;
 }
 
-static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
+static int decode_rle(uint8_t *bitmap, int linesize, int w, int h, uint8_t used_color[256],
                       const uint8_t *buf, int start, int buf_size, int is_8bit)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int bit_len;
     int x, y, len, color;
     uint8_t *d;
 
+    if (start >= buf_size)
+        return -1;
+
+    if (w <= 0 || h <= 0)
+        return -1;
+
     bit_len = (buf_size - start) * 8;
-    bitstream_init(&bc, buf + start, bit_len);
+    init_get_bits(&gb, buf + start, bit_len);
 
     x = 0;
     y = 0;
     d = bitmap;
     for(;;) {
-        if (bitstream_tell(&bc) > bit_len)
+        if (get_bits_count(&gb) > bit_len)
             return -1;
         if (is_8bit)
-            len = decode_run_8bit(&bc, &color);
+            len = decode_run_8bit(&gb, &color);
         else
-            len = decode_run_2bit(&bc, &color);
+            len = decode_run_2bit(&gb, &color);
+        if (len != INT_MAX && len > w - x)
+            return AVERROR_INVALIDDATA;
         len = FFMIN(len, w - x);
         memset(d + x, color, len);
+        used_color[color] = 1;
         x += len;
         if (x >= w) {
             y++;
@@ -117,7 +138,7 @@ static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
             d += linesize;
             x = 0;
             /* byte align */
-            bitstream_align(&bc);
+            align_get_bits(&gb);
         }
     }
     return 0;
@@ -125,17 +146,24 @@ static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
 
 static void guess_palette(DVDSubContext* ctx,
                           uint32_t *rgba_palette,
-                          uint8_t *colormap,
-                          uint8_t *alpha,
                           uint32_t subtitle_color)
 {
+    static const uint8_t level_map[4][4] = {
+        // this configuration (full range, lowest to highest) in tests
+        // seemed most common, so assume this
+        {0xff},
+        {0x00, 0xff},
+        {0x00, 0x80, 0xff},
+        {0x00, 0x55, 0xaa, 0xff},
+    };
     uint8_t color_used[16] = { 0 };
     int nb_opaque_colors, i, level, j, r, g, b;
+    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
 
-    if (ctx->has_palette) {
-        for (i = 0; i < 4; i++)
+    if(ctx->has_palette) {
+        for(i = 0; i < 4; i++)
             rgba_palette[i] = (ctx->palette[colormap[i]] & 0x00ffffff)
-                              | ((alpha[i] * 17) << 24);
+                              | ((alpha[i] * 17U) << 24);
         return;
     }
 
@@ -153,26 +181,41 @@ static void guess_palette(DVDSubContext* ctx,
     if (nb_opaque_colors == 0)
         return;
 
-    j = nb_opaque_colors;
+    j = 0;
     memset(color_used, 0, 16);
     for(i = 0; i < 4; i++) {
         if (alpha[i] != 0) {
             if (!color_used[colormap[i]])  {
-                level = (0xff * j) / nb_opaque_colors;
+                level = level_map[nb_opaque_colors - 1][j];
                 r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
                 g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
                 b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
-                rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24);
+                rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24);
                 color_used[colormap[i]] = (i + 1);
-                j--;
+                j++;
             } else {
                 rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) |
-                                    ((alpha[i] * 17) << 24);
+                                    ((alpha[i] * 17U) << 24);
             }
         }
     }
 }
 
+static void reset_rects(AVSubtitle *sub_header)
+{
+    int i;
+
+    if (sub_header->rects) {
+        for (i = 0; i < sub_header->num_rects; i++) {
+            av_freep(&sub_header->rects[i]->data[0]);
+            av_freep(&sub_header->rects[i]->data[1]);
+            av_freep(&sub_header->rects[i]);
+        }
+        av_freep(&sub_header->rects);
+        sub_header->num_rects = 0;
+    }
+}
+
 #define READ_OFFSET(a) (big_offsets ? AV_RB32(a) : AV_RB16(a))
 
 static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
@@ -180,16 +223,16 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
 {
     int cmd_pos, pos, cmd, x1, y1, x2, y2, next_cmd_pos;
     int big_offsets, offset_size, is_8bit = 0;
-    const uint8_t *yuv_palette = 0;
-    uint8_t colormap[4] = { 0 }, alpha[256] = { 0 };
+    const uint8_t *yuv_palette = NULL;
+    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
     int date;
     int i;
     int is_menu = 0;
+    uint32_t size;
     int64_t offset1, offset2;
 
     if (buf_size < 10)
         return -1;
-    memset(sub_header, 0, sizeof(*sub_header));
 
     if (AV_RB16(buf) == 0) {   /* HD subpicture with 4-byte offsets */
         big_offsets = 1;
@@ -201,8 +244,17 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
         cmd_pos = 2;
     }
 
+    size = READ_OFFSET(buf + (big_offsets ? 2 : 0));
     cmd_pos = READ_OFFSET(buf + cmd_pos);
 
+    if (cmd_pos < 0 || cmd_pos > buf_size - 2 - offset_size) {
+        if (cmd_pos > size) {
+            av_log(ctx, AV_LOG_ERROR, "Discarding invalid packet\n");
+            return 0;
+        }
+        return AVERROR(EAGAIN);
+    }
+
     while (cmd_pos > 0 && cmd_pos < buf_size - 2 - offset_size) {
         date = AV_RB16(buf + cmd_pos);
         next_cmd_pos = READ_OFFSET(buf + cmd_pos + 2);
@@ -247,8 +299,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 alpha[1] = buf[pos + 1] >> 4;
                 alpha[0] = buf[pos + 1] & 0x0f;
                 pos += 2;
-                ff_dlog(NULL, "alpha=%"PRIx8"%"PRIx8"%"PRIx8"%"PRIx8"\n",
-                        alpha[0], alpha[1], alpha[2], alpha[3]);
+                ff_dlog(NULL, "alpha=%x%x%x%x\n", alpha[0],alpha[1],alpha[2],alpha[3]);
                 break;
             case 0x05:
             case 0x85:
@@ -307,7 +358,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
         if (offset1 >= buf_size || offset2 >= buf_size)
             goto fail;
 
-        if (offset1 >= 0) {
+        if (offset1 >= 0 && offset2 >= 0) {
             int w, h;
             uint8_t *bitmap;
 
@@ -315,20 +366,12 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
             w = x2 - x1 + 1;
             if (w < 0)
                 w = 0;
-            h = y2 - y1;
+            h = y2 - y1 + 1;
             if (h < 0)
                 h = 0;
-            if (w > 0 && h > 0) {
-                if (sub_header->rects) {
-                    for (i = 0; i < sub_header->num_rects; i++) {
-                        av_freep(&sub_header->rects[i]->data[0]);
-                        av_freep(&sub_header->rects[i]->data[1]);
-                        av_freep(&sub_header->rects[i]);
-                    }
-                    av_freep(&sub_header->rects);
-                    sub_header->num_rects = 0;
-                }
-
+            if (w > 0 && h > 1) {
+                reset_rects(sub_header);
+                memset(ctx->used_color, 0, sizeof(ctx->used_color));
                 sub_header->rects = av_mallocz(sizeof(*sub_header->rects));
                 if (!sub_header->rects)
                     goto fail;
@@ -339,15 +382,17 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 bitmap = sub_header->rects[0]->data[0] = av_malloc(w * h);
                 if (!bitmap)
                     goto fail;
-                decode_rle(bitmap, w * 2, w, (h + 1) / 2,
-                           buf, offset1, buf_size, is_8bit);
-                decode_rle(bitmap + w, w * 2, w, h / 2,
-                           buf, offset2, buf_size, is_8bit);
+                if (decode_rle(bitmap, w * 2, w, (h + 1) / 2, ctx->used_color,
+                               buf, offset1, buf_size, is_8bit) < 0)
+                    goto fail;
+                if (decode_rle(bitmap + w, w * 2, w, h / 2, ctx->used_color,
+                               buf, offset2, buf_size, is_8bit) < 0)
+                    goto fail;
                 sub_header->rects[0]->data[1] = av_mallocz(AVPALETTE_SIZE);
                 if (!sub_header->rects[0]->data[1])
                     goto fail;
                 if (is_8bit) {
-                    if (yuv_palette == 0)
+                    if (!yuv_palette)
                         goto fail;
                     sub_header->rects[0]->nb_colors = 256;
                     yuv_a_to_rgba(yuv_palette, alpha,
@@ -355,9 +400,8 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                                   256);
                 } else {
                     sub_header->rects[0]->nb_colors = 4;
-                    guess_palette(ctx,
-                                  (uint32_t *)sub_header->rects[0]->data[1],
-                                  colormap, alpha, 0xffff00);
+                    guess_palette(ctx, (uint32_t*)sub_header->rects[0]->data[1],
+                                  0xffff00);
                 }
                 sub_header->rects[0]->x = x1;
                 sub_header->rects[0]->y = y1;
@@ -365,22 +409,22 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 sub_header->rects[0]->h = h;
                 sub_header->rects[0]->type = SUBTITLE_BITMAP;
                 sub_header->rects[0]->linesize[0] = w;
+                sub_header->rects[0]->flags = is_menu ? AV_SUBTITLE_FLAG_FORCED : 0;
 
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
-{
-                int j;
-                AVSubtitleRect *rect;
-                rect = sub_header->rects[0];
-                for (j = 0; j < 4; j++) {
-                    rect->pict.data[j] = rect->data[j];
-                    rect->pict.linesize[j] = rect->linesize[j];
+                for (i = 0; i < 4; i++) {
+                    sub_header->rects[0]->pict.data[i] = sub_header->rects[0]->data[i];
+                    sub_header->rects[0]->pict.linesize[i] = sub_header->rects[0]->linesize[i];
                 }
-}
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
             }
         }
+        if (next_cmd_pos < cmd_pos) {
+            av_log(ctx, AV_LOG_ERROR, "Invalid command offset\n");
+            break;
+        }
         if (next_cmd_pos == cmd_pos)
             break;
         cmd_pos = next_cmd_pos;
@@ -388,15 +432,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (sub_header->num_rects > 0)
         return is_menu;
  fail:
-    if (!sub_header->rects) {
-        for (i = 0; i < sub_header->num_rects; i++) {
-            av_freep(&sub_header->rects[i]->data[0]);
-            av_freep(&sub_header->rects[i]->data[1]);
-            av_freep(&sub_header->rects[i]);
-        }
-        av_freep(&sub_header->rects);
-        sub_header->num_rects = 0;
-    }
+    reset_rects(sub_header);
     return -1;
 }
 
@@ -413,19 +449,24 @@ static int is_transp(const uint8_t *buf, int pitch, int n,
 }
 
 /* return 0 if empty rectangle, 1 if non empty */
-static int find_smallest_bounding_rectangle(AVSubtitle *s)
+static int find_smallest_bounding_rectangle(DVDSubContext *ctx, AVSubtitle *s)
 {
     uint8_t transp_color[256] = { 0 };
     int y1, y2, x1, x2, y, w, h, i;
     uint8_t *bitmap;
+    int transparent = 1;
 
     if (s->num_rects == 0 || !s->rects || s->rects[0]->w <= 0 || s->rects[0]->h <= 0)
         return 0;
 
     for(i = 0; i < s->rects[0]->nb_colors; i++) {
-        if ((((uint32_t *)s->rects[0]->data[1])[i] >> 24) == 0)
+        if ((((uint32_t *)s->rects[0]->data[1])[i] >> 24) == 0) {
             transp_color[i] = 1;
+        } else if (ctx->used_color[i])
+            transparent = 0;
     }
+    if (transparent)
+        return 0;
     y1 = 0;
     while (y1 < s->rects[0]->h && is_transp(s->rects[0]->data[0] + y1 * s->rects[0]->linesize[0],
                                   1, s->rects[0]->w, transp_color))
@@ -463,20 +504,33 @@ static int find_smallest_bounding_rectangle(AVSubtitle *s)
     s->rects[0]->h = h;
     s->rects[0]->x += x1;
     s->rects[0]->y += y1;
+
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+    for (i = 0; i < 4; i++) {
+        s->rects[0]->pict.data[i] = s->rects[0]->data[i];
+        s->rects[0]->pict.linesize[i] = s->rects[0]->linesize[i];
+    }
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
     return 1;
 }
 
 #ifdef DEBUG
+#define ALPHA_MIX(A,BACK,FORE) (((255-(A)) * (BACK) + (A) * (FORE)) / 255)
 static void ppm_save(const char *filename, uint8_t *bitmap, int w, int h,
                      uint32_t *rgba_palette)
 {
-    int x, y, v;
+    int x, y, alpha;
+    uint32_t v;
+    int back[3] = {0, 255, 0};  /* green background */
     FILE *f;
 
     f = fopen(filename, "w");
     if (!f) {
         perror(filename);
-        exit(1);
+        return;
     }
     fprintf(f, "P6\n"
             "%d %d\n"
@@ -485,15 +539,33 @@ static void ppm_save(const char *filename, uint8_t *bitmap, int w, int h,
     for(y = 0; y < h; y++) {
         for(x = 0; x < w; x++) {
             v = rgba_palette[bitmap[y * w + x]];
-            putc((v >> 16) & 0xff, f);
-            putc((v >> 8) & 0xff, f);
-            putc((v >> 0) & 0xff, f);
+            alpha = v >> 24;
+            putc(ALPHA_MIX(alpha, back[0], (v >> 16) & 0xff), f);
+            putc(ALPHA_MIX(alpha, back[1], (v >> 8) & 0xff), f);
+            putc(ALPHA_MIX(alpha, back[2], (v >> 0) & 0xff), f);
         }
     }
     fclose(f);
 }
 #endif
 
+static int append_to_cached_buf(AVCodecContext *avctx,
+                                const uint8_t *buf, int buf_size)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+
+    av_assert0(buf_size >= 0 && ctx->buf_size <= sizeof(ctx->buf));
+    if (buf_size >= sizeof(ctx->buf) - ctx->buf_size) {
+        av_log(avctx, AV_LOG_WARNING, "Attempt to reconstruct "
+               "too large SPU packets aborted.\n");
+        ctx->buf_size = 0;
+        return AVERROR_INVALIDDATA;
+    }
+    memcpy(ctx->buf + ctx->buf_size, buf, buf_size);
+    ctx->buf_size += buf_size;
+    return 0;
+}
+
 static int dvdsub_decode(AVCodecContext *avctx,
                          void *data, int *data_size,
                          AVPacket *avpkt)
@@ -502,74 +574,220 @@ static int dvdsub_decode(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     AVSubtitle *sub = data;
+    int appended = 0;
     int is_menu;
 
+    if (ctx->buf_size) {
+        int ret = append_to_cached_buf(avctx, buf, buf_size);
+        if (ret < 0) {
+            *data_size = 0;
+            return ret;
+        }
+        buf = ctx->buf;
+        buf_size = ctx->buf_size;
+        appended = 1;
+    }
+
     is_menu = decode_dvd_subtitles(ctx, sub, buf, buf_size);
+    if (is_menu == AVERROR(EAGAIN)) {
+        *data_size = 0;
+        return appended ? 0 : append_to_cached_buf(avctx, buf, buf_size);
+    }
 
     if (is_menu < 0) {
+        ctx->buf_size = 0;
     no_subtitle:
+        reset_rects(sub);
         *data_size = 0;
 
         return buf_size;
     }
-    if (!is_menu && find_smallest_bounding_rectangle(sub) == 0)
+    if (!is_menu && find_smallest_bounding_rectangle(ctx, sub) == 0)
+        goto no_subtitle;
+
+    if (ctx->forced_subs_only && !(sub->rects[0]->flags & AV_SUBTITLE_FLAG_FORCED))
         goto no_subtitle;
 
 #if defined(DEBUG)
-    ff_dlog(NULL, "start=%"PRIu32" ms end =%"PRIu32" ms\n",
+    {
+    char ppm_name[32];
+
+    snprintf(ppm_name, sizeof(ppm_name), "/tmp/%05d.ppm", ctx->sub_id++);
+    ff_dlog(NULL, "start=%d ms end =%d ms\n",
             sub->start_display_time,
             sub->end_display_time);
-    ppm_save("/tmp/a.ppm", sub->rects[0]->data[0],
-             sub->rects[0]->w, sub->rects[0]->h, sub->rects[0]->data[1]);
+    ppm_save(ppm_name, sub->rects[0]->data[0],
+             sub->rects[0]->w, sub->rects[0]->h, (uint32_t*) sub->rects[0]->data[1]);
+    }
 #endif
 
+    ctx->buf_size = 0;
     *data_size = 1;
     return buf_size;
 }
 
-static av_cold int dvdsub_init(AVCodecContext *avctx)
+static void parse_palette(DVDSubContext *ctx, char *p)
 {
-    DVDSubContext *ctx = avctx->priv_data;
-    char *data, *cur;
+    int i;
+
+    ctx->has_palette = 1;
+    for(i=0;i<16;i++) {
+        ctx->palette[i] = strtoul(p, &p, 16);
+        while(*p == ',' || av_isspace(*p))
+            p++;
+    }
+}
+
+static int parse_ifo_palette(DVDSubContext *ctx, char *p)
+{
+    FILE *ifo;
+    char ifostr[12];
+    uint32_t sp_pgci, pgci, off_pgc, pgc;
+    uint8_t r, g, b, yuv[65], *buf;
+    int i, y, cb, cr, r_add, g_add, b_add;
     int ret = 0;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+
+    ctx->has_palette = 0;
+    if ((ifo = fopen(p, "r")) == NULL) {
+        av_log(ctx, AV_LOG_WARNING, "Unable to open IFO file \"%s\": %s\n", p, av_err2str(AVERROR(errno)));
+        return AVERROR_EOF;
+    }
+    if (fread(ifostr, 12, 1, ifo) != 1 || memcmp(ifostr, "DVDVIDEO-VTS", 12)) {
+        av_log(ctx, AV_LOG_WARNING, "\"%s\" is not a proper IFO file\n", p);
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    if (fseek(ifo, 0xCC, SEEK_SET) == -1) {
+        ret = AVERROR(errno);
+        goto end;
+    }
+    if (fread(&sp_pgci, 4, 1, ifo) == 1) {
+        pgci = av_be2ne32(sp_pgci) * 2048;
+        if (fseek(ifo, pgci + 0x0C, SEEK_SET) == -1) {
+            ret = AVERROR(errno);
+            goto end;
+        }
+        if (fread(&off_pgc, 4, 1, ifo) == 1) {
+            pgc = pgci + av_be2ne32(off_pgc);
+            if (fseek(ifo, pgc + 0xA4, SEEK_SET) == -1) {
+                ret = AVERROR(errno);
+                goto end;
+            }
+            if (fread(yuv, 64, 1, ifo) == 1) {
+                buf = yuv;
+                for(i=0; i<16; i++) {
+                    y  = *++buf;
+                    cr = *++buf;
+                    cb = *++buf;
+                    YUV_TO_RGB1_CCIR(cb, cr);
+                    YUV_TO_RGB2_CCIR(r, g, b, y);
+                    ctx->palette[i] = (r << 16) + (g << 8) + b;
+                    buf++;
+                }
+                ctx->has_palette = 1;
+            }
+        }
+    }
+    if (ctx->has_palette == 0) {
+        av_log(ctx, AV_LOG_WARNING, "Failed to read palette from IFO file \"%s\"\n", p);
+        ret = AVERROR_INVALIDDATA;
+    }
+end:
+    fclose(ifo);
+    return ret;
+}
+
+static int dvdsub_parse_extradata(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = (DVDSubContext*) avctx->priv_data;
+    char *dataorig, *data;
+    int ret = 1;
 
     if (!avctx->extradata || !avctx->extradata_size)
-        return 0;
+        return 1;
 
-    data = av_malloc(avctx->extradata_size + 1);
+    dataorig = data = av_malloc(avctx->extradata_size+1);
     if (!data)
         return AVERROR(ENOMEM);
     memcpy(data, avctx->extradata, avctx->extradata_size);
     data[avctx->extradata_size] = '\0';
-    cur = data;
-
-    while (*cur) {
-        if (strncmp("palette:", cur, 8) == 0) {
-            int i;
-            char *p = cur + 8;
-            ctx->has_palette = 1;
-            for (i = 0; i < 16; i++) {
-                ctx->palette[i] = strtoul(p, &p, 16);
-                while (*p == ',' || av_isspace(*p))
-                    p++;
-            }
-        } else if (!strncmp("size:", cur, 5)) {
+
+    for(;;) {
+        int pos = strcspn(data, "\n\r");
+        if (pos==0 && *data==0)
+            break;
+
+        if (strncmp("palette:", data, 8) == 0) {
+            parse_palette(ctx, data + 8);
+        } else if (strncmp("size:", data, 5) == 0) {
             int w, h;
-            if (sscanf(cur + 5, "%dx%d", &w, &h) == 2) {
+            if (sscanf(data + 5, "%dx%d", &w, &h) == 2) {
                ret = ff_set_dimensions(avctx, w, h);
                if (ret < 0)
                    goto fail;
             }
         }
-        cur += strcspn(cur, "\n\r");
-        cur += strspn(cur, "\n\r");
+
+        data += pos;
+        data += strspn(data, "\n\r");
     }
 
 fail:
-    av_free(data);
+    av_free(dataorig);
     return ret;
 }
 
+static av_cold int dvdsub_init(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    int ret;
+
+    if ((ret = dvdsub_parse_extradata(avctx)) < 0)
+        return ret;
+
+    if (ctx->ifo_str)
+        parse_ifo_palette(ctx, ctx->ifo_str);
+    if (ctx->palette_str)
+        parse_palette(ctx, ctx->palette_str);
+    if (ctx->has_palette) {
+        int i;
+        av_log(avctx, AV_LOG_DEBUG, "palette:");
+        for(i=0;i<16;i++)
+            av_log(avctx, AV_LOG_DEBUG, " 0x%06"PRIx32, ctx->palette[i]);
+        av_log(avctx, AV_LOG_DEBUG, "\n");
+    }
+
+    return 1;
+}
+
+static void dvdsub_flush(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    ctx->buf_size = 0;
+}
+
+static av_cold int dvdsub_close(AVCodecContext *avctx)
+{
+    dvdsub_flush(avctx);
+    return 0;
+}
+
+#define OFFSET(field) offsetof(DVDSubContext, field)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "palette", "set the global palette", OFFSET(palette_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "ifo_palette", "obtain the global palette from .IFO file", OFFSET(ifo_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "forced_subs_only", "Only show forced subtitles", OFFSET(forced_subs_only), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SD},
+    { NULL }
+};
+static const AVClass dvdsub_class = {
+    .class_name = "dvdsubdec",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvdsub_decoder = {
     .name           = "dvdsub",
     .long_name      = NULL_IF_CONFIG_SMALL("DVD subtitles"),
@@ -578,4 +796,7 @@ AVCodec ff_dvdsub_decoder = {
     .priv_data_size = sizeof(DVDSubContext),
     .init           = dvdsub_init,
     .decode         = dvdsub_decode,
+    .flush          = dvdsub_flush,
+    .close          = dvdsub_close,
+    .priv_class     = &dvdsub_class,
 };
diff --git a/libavcodec/dvdsubenc.c b/libavcodec/dvdsubenc.c
index b0c2b63..ff95ed2 100644
--- a/libavcodec/dvdsubenc.c
+++ b/libavcodec/dvdsubenc.c
@@ -2,27 +2,35 @@
  * DVD subtitle encoding
  * Copyright (c) 2005 Wolfram Gloger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
+#include "libavutil/bprint.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
-#undef NDEBUG
-#include <assert.h>
+typedef struct {
+    AVClass *class;
+    uint32_t global_palette[16];
+    int even_rows_fix;
+} DVDSubtitleContext;
 
 // ncnt is the nibble counter
 #define PUTNIBBLE(val)\
@@ -53,7 +61,7 @@ static void dvd_encode_rle(uint8_t **pq,
                 if (bitmap[x+len] != color)
                     break;
             color = cmap[color];
-            assert(color < 4);
+            av_assert0(color < 4);
             if (len < 0x04) {
                 PUTNIBBLE((len << 2)|color);
             } else if (len < 0x10) {
@@ -86,32 +94,195 @@ static void dvd_encode_rle(uint8_t **pq,
     *pq = q;
 }
 
-static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
+static int color_distance(uint32_t a, uint32_t b)
+{
+    int r = 0, d, i;
+    int alpha_a = 8, alpha_b = 8;
+
+    for (i = 24; i >= 0; i -= 8) {
+        d = alpha_a * (int)((a >> i) & 0xFF) -
+            alpha_b * (int)((b >> i) & 0xFF);
+        r += d * d;
+        alpha_a = a >> 28;
+        alpha_b = b >> 28;
+    }
+    return r;
+}
+
+/**
+ * Count colors used in a rectangle, quantizing alpha and grouping by
+ * nearest global palette entry.
+ */
+static void count_colors(AVCodecContext *avctx, unsigned hits[33],
+                         const AVSubtitleRect *r)
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    unsigned count[256] = { 0 };
+    uint32_t *palette = (uint32_t *)r->data[1];
+    uint32_t color;
+    int x, y, i, j, match, d, best_d, av_uninit(best_j);
+    uint8_t *p = r->data[0];
+
+    for (y = 0; y < r->h; y++) {
+        for (x = 0; x < r->w; x++)
+            count[*(p++)]++;
+        p += r->linesize[0] - r->w;
+    }
+    for (i = 0; i < 256; i++) {
+        if (!count[i]) /* avoid useless search */
+            continue;
+        color = palette[i];
+        /* 0: transparent, 1-16: semi-transparent, 17-33 opaque */
+        match = color < 0x33000000 ? 0 : color < 0xCC000000 ? 1 : 17;
+        if (match) {
+            best_d = INT_MAX;
+            for (j = 0; j < 16; j++) {
+                d = color_distance(0xFF000000 | color,
+                                   0xFF000000 | dvdc->global_palette[j]);
+                if (d < best_d) {
+                    best_d = d;
+                    best_j = j;
+                }
+            }
+            match += best_j;
+        }
+        hits[match] += count[i];
+    }
+}
+
+static void select_palette(AVCodecContext *avctx, int out_palette[4],
+                           int out_alpha[4], unsigned hits[33])
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    int i, j, bright, mult;
+    uint32_t color;
+    int selected[4] = { 0 };
+    uint32_t pseudopal[33] = { 0 };
+    uint32_t refcolor[3] = { 0x00000000, 0xFFFFFFFF, 0xFF000000 };
+
+    /* Bonus for transparent: if the rectangle fits tightly the text, the
+       background color can be quite rare, but it would be ugly without it */
+    hits[0] *= 16;
+    /* Bonus for bright colors */
+    for (i = 0; i < 16; i++) {
+        if (!(hits[1 + i] + hits[17 + i]))
+            continue; /* skip unused colors to gain time */
+        color = dvdc->global_palette[i];
+        bright = 0;
+        for (j = 0; j < 3; j++, color >>= 8)
+            bright += (color & 0xFF) < 0x40 || (color & 0xFF) >= 0xC0;
+        mult = 2 + FFMIN(bright, 2);
+        hits[ 1 + i] *= mult;
+        hits[17 + i] *= mult;
+    }
+
+    /* Select four most frequent colors */
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < 33; j++)
+            if (hits[j] > hits[selected[i]])
+                selected[i] = j;
+        hits[selected[i]] = 0;
+    }
+
+    /* Order the colors like in most DVDs:
+       0: background, 1: foreground, 2: outline */
+    for (i = 0; i < 16; i++) {
+        pseudopal[ 1 + i] = 0x80000000 | dvdc->global_palette[i];
+        pseudopal[17 + i] = 0xFF000000 | dvdc->global_palette[i];
+    }
+    for (i = 0; i < 3; i++) {
+        int best_d = color_distance(refcolor[i], pseudopal[selected[i]]);
+        for (j = i + 1; j < 4; j++) {
+            int d = color_distance(refcolor[i], pseudopal[selected[j]]);
+            if (d < best_d) {
+                FFSWAP(int, selected[i], selected[j]);
+                best_d = d;
+            }
+        }
+    }
+
+    /* Output */
+    for (i = 0; i < 4; i++) {
+        out_palette[i] = selected[i] ? (selected[i] - 1) & 0xF : 0;
+        out_alpha  [i] = !selected[i] ? 0 : selected[i] < 17 ? 0x80 : 0xFF;
+    }
+}
+
+static void build_color_map(AVCodecContext *avctx, int cmap[],
+                            const uint32_t palette[],
+                            const int out_palette[], unsigned int const out_alpha[])
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    int i, j, d, best_d;
+    uint32_t pseudopal[4];
+
+    for (i = 0; i < 4; i++)
+        pseudopal[i] = (out_alpha[i] << 24) |
+                       dvdc->global_palette[out_palette[i]];
+    for (i = 0; i < 256; i++) {
+        best_d = INT_MAX;
+        for (j = 0; j < 4; j++) {
+            d = color_distance(pseudopal[j], palette[i]);
+            if (d < best_d) {
+                cmap[i] = j;
+                best_d = d;
+            }
+        }
+    }
+}
+
+static void copy_rectangle(AVSubtitleRect *dst, AVSubtitleRect *src, int cmap[])
+{
+    int x, y;
+    uint8_t *p, *q;
+
+    p = src->data[0];
+    q = dst->data[0] + (src->x - dst->x) +
+                            (src->y - dst->y) * dst->linesize[0];
+    for (y = 0; y < src->h; y++) {
+        for (x = 0; x < src->w; x++)
+            *(q++) = cmap[*(p++)];
+        p += src->linesize[0] - src->w;
+        q += dst->linesize[0] - src->w;
+    }
+}
+
+static int encode_dvd_subtitles(AVCodecContext *avctx,
+                                uint8_t *outbuf, int outbuf_size,
                                 const AVSubtitle *h)
 {
+    DVDSubtitleContext *dvdc = avctx->priv_data;
     uint8_t *q, *qq;
-    int object_id;
-    int offset1[20], offset2[20];
-    int i, imax, color, alpha, rects = h->num_rects;
-    unsigned long hmax;
-    unsigned long hist[256];
-    int           cmap[256];
+    int offset1, offset2;
+    int i, rects = h->num_rects, ret;
+    unsigned global_palette_hits[33] = { 0 };
+    int cmap[256];
+    int out_palette[4];
+    int out_alpha[4];
+    AVSubtitleRect vrect;
+    uint8_t *vrect_data = NULL;
+    int x2, y2;
+    int forced = 0;
 
     if (rects == 0 || !h->rects)
-        return -1;
-    if (rects > 20)
-        rects = 20;
-
-    // analyze bitmaps, compress to 4 colors
-    for (i=0; i<256; ++i) {
-        hist[i] = 0;
-        cmap[i] = 0;
-    }
-    for (object_id = 0; object_id < rects; object_id++) {
+        return AVERROR(EINVAL);
+    for (i = 0; i < rects; i++)
+        if (h->rects[i]->type != SUBTITLE_BITMAP) {
+            av_log(avctx, AV_LOG_ERROR, "Bitmap subtitle required\n");
+            return AVERROR(EINVAL);
+        }
+    /* Mark this subtitle forced if any of the rectangles is forced. */
+    for (i = 0; i < rects; i++)
+        if ((h->rects[i]->flags & AV_SUBTITLE_FLAG_FORCED) != 0) {
+            forced = 1;
+            break;
+        }
+
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
-        if (!h->rects[object_id]->data[0]) {
-            AVSubtitleRect *rect = h->rects[object_id];
+    for (i = 0; i < rects; i++)
+        if (!h->rects[i]->data[0]) {
+            AVSubtitleRect *rect = h->rects[i];
             int j;
             for (j = 0; j < 4; j++) {
                 rect->data[j] = rect->pict.data[j];
@@ -121,51 +292,82 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-        for (i=0; i<h->rects[object_id]->w*h->rects[object_id]->h; ++i) {
-            color = h->rects[object_id]->data[0][i];
-            // only count non-transparent pixels
-            alpha = ((uint32_t *)h->rects[object_id]->data[1])[color] >> 24;
-            hist[color] += alpha;
+    vrect = *h->rects[0];
+
+    if (rects > 1) {
+        /* DVD subtitles can have only one rectangle: build a virtual
+           rectangle containing all actual rectangles.
+           The data of the rectangles will be copied later, when the palette
+           is decided, because the rectangles may have different palettes. */
+        int xmin = h->rects[0]->x, xmax = xmin + h->rects[0]->w;
+        int ymin = h->rects[0]->y, ymax = ymin + h->rects[0]->h;
+        for (i = 1; i < rects; i++) {
+            xmin = FFMIN(xmin, h->rects[i]->x);
+            ymin = FFMIN(ymin, h->rects[i]->y);
+            xmax = FFMAX(xmax, h->rects[i]->x + h->rects[i]->w);
+            ymax = FFMAX(ymax, h->rects[i]->y + h->rects[i]->h);
         }
+        vrect.x = xmin;
+        vrect.y = ymin;
+        vrect.w = xmax - xmin;
+        vrect.h = ymax - ymin;
+        if ((ret = av_image_check_size(vrect.w, vrect.h, 0, avctx)) < 0)
+            return ret;
+
+        /* Count pixels outside the virtual rectangle as transparent */
+        global_palette_hits[0] = vrect.w * vrect.h;
+        for (i = 0; i < rects; i++)
+            global_palette_hits[0] -= h->rects[i]->w * h->rects[i]->h;
     }
-    for (color=3;; --color) {
-        hmax = 0;
-        imax = 0;
-        for (i=0; i<256; ++i)
-            if (hist[i] > hmax) {
-                imax = i;
-                hmax = hist[i];
-            }
-        if (hmax == 0)
-            break;
-        if (color == 0)
-            color = 3;
-        av_log(NULL, AV_LOG_DEBUG, "dvd_subtitle hist[%d]=%ld -> col %d\n",
-               imax, hist[imax], color);
-        cmap[imax] = color;
-        hist[imax] = 0;
+
+    for (i = 0; i < rects; i++)
+        count_colors(avctx, global_palette_hits, h->rects[i]);
+    select_palette(avctx, out_palette, out_alpha, global_palette_hits);
+
+    if (rects > 1) {
+        if (!(vrect_data = av_calloc(vrect.w, vrect.h)))
+            return AVERROR(ENOMEM);
+        vrect.data    [0] = vrect_data;
+        vrect.linesize[0] = vrect.w;
+        for (i = 0; i < rects; i++) {
+            build_color_map(avctx, cmap, (uint32_t *)h->rects[i]->data[1],
+                            out_palette, out_alpha);
+            copy_rectangle(&vrect, h->rects[i], cmap);
+        }
+        for (i = 0; i < 4; i++)
+            cmap[i] = i;
+    } else {
+        build_color_map(avctx, cmap, (uint32_t *)h->rects[0]->data[1],
+                        out_palette, out_alpha);
     }
 
+    av_log(avctx, AV_LOG_DEBUG, "Selected palette:");
+    for (i = 0; i < 4; i++)
+        av_log(avctx, AV_LOG_DEBUG, " 0x%06"PRIx32"@@%02x (0x%x,0x%x)",
+               dvdc->global_palette[out_palette[i]], out_alpha[i],
+               out_palette[i], out_alpha[i] >> 4);
+    av_log(avctx, AV_LOG_DEBUG, "\n");
 
     // encode data block
     q = outbuf + 4;
-    for (object_id = 0; object_id < rects; object_id++) {
-        offset1[object_id] = q - outbuf;
-        // worst case memory requirement: 1 nibble per pixel..
-        if ((q - outbuf) + h->rects[object_id]->w*h->rects[object_id]->h/2
-            + 17*rects + 21 > outbuf_size) {
-            av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
-            return -1;
-        }
-        dvd_encode_rle(&q, h->rects[object_id]->data[0],
-                       h->rects[object_id]->w*2,
-                       h->rects[object_id]->w, h->rects[object_id]->h >> 1,
-                       cmap);
-        offset2[object_id] = q - outbuf;
-        dvd_encode_rle(&q, h->rects[object_id]->data[0] + h->rects[object_id]->w,
-                       h->rects[object_id]->w*2,
-                       h->rects[object_id]->w, h->rects[object_id]->h >> 1,
-                       cmap);
+    offset1 = q - outbuf;
+    // worst case memory requirement: 1 nibble per pixel..
+    if ((q - outbuf) + vrect.w * vrect.h / 2 + 17 + 21 > outbuf_size) {
+        av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
+        ret = AVERROR_BUFFER_TOO_SMALL;
+        goto fail;
+    }
+    dvd_encode_rle(&q, vrect.data[0], vrect.w * 2,
+                   vrect.w, (vrect.h + 1) >> 1, cmap);
+    offset2 = q - outbuf;
+    dvd_encode_rle(&q, vrect.data[0] + vrect.w, vrect.w * 2,
+                   vrect.w, vrect.h >> 1, cmap);
+
+    if (dvdc->even_rows_fix && (vrect.h & 1)) {
+        // Work-around for some players that want the height to be even.
+        vrect.h++;
+        *q++ = 0x00; // 0x00 0x00 == empty row, i.e. fully transparent
+        *q++ = 0x00;
     }
 
     // set data packet size
@@ -174,35 +376,34 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     // send start display command
     bytestream_put_be16(&q, (h->start_display_time*90) >> 10);
-    bytestream_put_be16(&q, (q - outbuf) /*- 2 */ + 8 + 12*rects + 2);
+    bytestream_put_be16(&q, (q - outbuf) /*- 2 */ + 8 + 12 + 2);
     *q++ = 0x03; // palette - 4 nibbles
-    *q++ = 0x03; *q++ = 0x7f;
+    *q++ = (out_palette[3] << 4) | out_palette[2];
+    *q++ = (out_palette[1] << 4) | out_palette[0];
     *q++ = 0x04; // alpha - 4 nibbles
-    *q++ = 0xf0; *q++ = 0x00;
-    //*q++ = 0x0f; *q++ = 0xff;
+    *q++ = (out_alpha[3] & 0xF0) | (out_alpha[2] >> 4);
+    *q++ = (out_alpha[1] & 0xF0) | (out_alpha[0] >> 4);
 
-    // XXX not sure if more than one rect can really be encoded..
     // 12 bytes per rect
-    for (object_id = 0; object_id < rects; object_id++) {
-        int x2 = h->rects[object_id]->x + h->rects[object_id]->w - 1;
-        int y2 = h->rects[object_id]->y + h->rects[object_id]->h - 1;
-
-        *q++ = 0x05;
-        // x1 x2 -> 6 nibbles
-        *q++ = h->rects[object_id]->x >> 4;
-        *q++ = (h->rects[object_id]->x << 4) | ((x2 >> 8) & 0xf);
-        *q++ = x2;
-        // y1 y2 -> 6 nibbles
-        *q++ = h->rects[object_id]->y >> 4;
-        *q++ = (h->rects[object_id]->y << 4) | ((y2 >> 8) & 0xf);
-        *q++ = y2;
-
-        *q++ = 0x06;
-        // offset1, offset2
-        bytestream_put_be16(&q, offset1[object_id]);
-        bytestream_put_be16(&q, offset2[object_id]);
-    }
-    *q++ = 0x01; // start command
+    x2 = vrect.x + vrect.w - 1;
+    y2 = vrect.y + vrect.h - 1;
+
+    *q++ = 0x05;
+    // x1 x2 -> 6 nibbles
+    *q++ = vrect.x >> 4;
+    *q++ = (vrect.x << 4) | ((x2 >> 8) & 0xf);
+    *q++ = x2;
+    // y1 y2 -> 6 nibbles
+    *q++ = vrect.y >> 4;
+    *q++ = (vrect.y << 4) | ((y2 >> 8) & 0xf);
+    *q++ = y2;
+
+    *q++ = 0x06;
+    // offset1, offset2
+    bytestream_put_be16(&q, offset1);
+    bytestream_put_be16(&q, offset2);
+
+    *q++ = forced ? 0x00 : 0x01; // start command
     *q++ = 0xff; // terminating command
 
     // send stop display command last
@@ -214,8 +415,42 @@ FF_ENABLE_DEPRECATION_WARNINGS
     qq = outbuf;
     bytestream_put_be16(&qq, q - outbuf);
 
-    av_log(NULL, AV_LOG_DEBUG, "subtitle_packet size=%td\n", q - outbuf);
-    return q - outbuf;
+    av_log(NULL, AV_LOG_DEBUG, "subtitle_packet size=%"PTRDIFF_SPECIFIER"\n", q - outbuf);
+    ret = q - outbuf;
+
+fail:
+    av_free(vrect_data);
+    return ret;
+}
+
+static int dvdsub_init(AVCodecContext *avctx)
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    static const uint32_t default_palette[16] = {
+        0x000000, 0x0000FF, 0x00FF00, 0xFF0000,
+        0xFFFF00, 0xFF00FF, 0x00FFFF, 0xFFFFFF,
+        0x808000, 0x8080FF, 0x800080, 0x80FF80,
+        0x008080, 0xFF8080, 0x555555, 0xAAAAAA,
+    };
+    AVBPrint extradata;
+    int i, ret;
+
+    av_assert0(sizeof(dvdc->global_palette) == sizeof(default_palette));
+    memcpy(dvdc->global_palette, default_palette, sizeof(dvdc->global_palette));
+
+    av_bprint_init(&extradata, 0, AV_BPRINT_SIZE_AUTOMATIC);
+    if (avctx->width && avctx->height)
+        av_bprintf(&extradata, "size: %dx%d\n", avctx->width, avctx->height);
+    av_bprintf(&extradata, "palette:");
+    for (i = 0; i < 16; i++)
+        av_bprintf(&extradata, " %06"PRIx32"%c",
+                   dvdc->global_palette[i] & 0xFFFFFF, i < 15 ? ',' : '\n');
+
+    ret = avpriv_bprint_to_extradata(avctx, &extradata);
+    if (ret < 0)
+        return ret;
+
+    return 0;
 }
 
 static int dvdsub_encode(AVCodecContext *avctx,
@@ -225,14 +460,31 @@ static int dvdsub_encode(AVCodecContext *avctx,
     //DVDSubtitleContext *s = avctx->priv_data;
     int ret;
 
-    ret = encode_dvd_subtitles(buf, buf_size, sub);
+    ret = encode_dvd_subtitles(avctx, buf, buf_size, sub);
     return ret;
 }
 
+#define OFFSET(x) offsetof(DVDSubtitleContext, x)
+#define SE AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    {"even_rows_fix", "Make number of rows even (workaround for some players)", OFFSET(even_rows_fix), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SE},
+    { NULL },
+};
+
+static const AVClass dvdsubenc_class = {
+    .class_name = "VOBSUB subtitle encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvdsub_encoder = {
     .name           = "dvdsub",
     .long_name      = NULL_IF_CONFIG_SMALL("DVD subtitles"),
     .type           = AVMEDIA_TYPE_SUBTITLE,
     .id             = AV_CODEC_ID_DVD_SUBTITLE,
+    .init           = dvdsub_init,
     .encode_sub     = dvdsub_encode,
+    .priv_class     = &dvdsubenc_class,
+    .priv_data_size = sizeof(DVDSubtitleContext),
 };
diff --git a/libavcodec/dvenc.c b/libavcodec/dvenc.c
index 399c434..ce2fc75 100644
--- a/libavcodec/dvenc.c
+++ b/libavcodec/dvenc.c
@@ -2,21 +2,23 @@
  * DV encoder
  * Copyright (c) 2003 Roman Shaposhnik
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * quant_deadzone code and fixes sponsored by NOA GmbH
  */
 
 /**
@@ -28,6 +30,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"
+#include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
@@ -49,7 +52,7 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
     PixblockDSPContext pdsp;
     int ret;
 
-    s->sys = av_dv_codec_profile(avctx->width, avctx->height, avctx->pix_fmt);
+    s->sys = av_dv_codec_profile2(avctx->width, avctx->height, avctx->pix_fmt, avctx->time_base);
     if (!s->sys) {
         av_log(avctx, AV_LOG_ERROR, "Found no DV profile for %ix%i %s video. "
                                     "Valid DV profiles are:\n",
@@ -57,6 +60,10 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
         ff_dv_print_profiles(avctx, AV_LOG_ERROR);
         return AVERROR(EINVAL);
     }
+    if (avctx->height > 576) {
+        av_log(avctx, AV_LOG_ERROR, "DVCPRO HD encoding is not supported.\n");
+        return AVERROR_PATCHWELCOME;
+    }
     ret = ff_dv_init_dynamic_tables(s, s->sys);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing work tables.\n");
@@ -65,6 +72,9 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
 
     dv_vlc_map_tableinit();
 
+    memset(&fdsp,0, sizeof(fdsp));
+    memset(&mecc,0, sizeof(mecc));
+    memset(&pdsp,0, sizeof(pdsp));
     ff_fdctdsp_init(&fdsp, avctx);
     ff_me_cmp_init(&mecc, avctx);
     ff_pixblockdsp_init(&pdsp, avctx);
@@ -167,7 +177,7 @@ static av_always_inline PutBitContext *dv_encode_ac(EncBlockInfo *bi,
             if (bits_left) {
                 size -= bits_left;
                 put_bits(pb, bits_left, vlc >> size);
-                vlc = vlc & ((1 << size) - 1);
+                vlc = av_mod_uintp2(vlc, size);
             }
             if (pb + 1 >= pb_end) {
                 bi->partial_bit_count  = size;
@@ -223,14 +233,14 @@ static const int dv_weight_88[64] = {
     170627, 165371, 160727, 153560, 160727, 144651, 144651, 136258,
 };
 static const int dv_weight_248[64] = {
-    131072, 242189, 257107, 237536, 229376, 200636, 242189, 223754,
-    224969, 196781, 262144, 242189, 229376, 200636, 257107, 237536,
-    211916, 185364, 235923, 217965, 229376, 211916, 206433, 180568,
-    242189, 223754, 224969, 196781, 211916, 185364, 235923, 217965,
-    200704, 175557, 222935, 205965, 200636, 185364, 195068, 170627,
-    229376, 211916, 206433, 180568, 200704, 175557, 222935, 205965,
-    175557, 153560, 188995, 174609, 165371, 144651, 200636, 185364,
-    195068, 170627, 175557, 153560, 188995, 174609, 165371, 144651,
+    131072, 262144, 257107, 257107, 242189, 242189, 242189, 242189,
+    237536, 237536, 229376, 229376, 200636, 200636, 224973, 224973,
+    223754, 223754, 235923, 235923, 229376, 229376, 217965, 217965,
+    211916, 211916, 196781, 196781, 185364, 185364, 206433, 206433,
+    211916, 211916, 222935, 222935, 200636, 200636, 205964, 205964,
+    200704, 200704, 180568, 180568, 175557, 175557, 195068, 195068,
+    185364, 185364, 188995, 188995, 174606, 174606, 175557, 175557,
+    170627, 170627, 153560, 153560, 165371, 165371, 144651, 144651,
 };
 
 static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
@@ -245,7 +255,7 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
      * method suggested in SMPTE 314M Table 22, and an improved
      * method. The SMPTE method is very conservative; it assigns class
      * 3 (i.e. severe quantization) to any block where the largest AC
-     * component is greater than 36. Libav's DV encoder tracks AC bit
+     * component is greater than 36. FFmpeg's DV encoder tracks AC bit
      * consumption precisely, so there is no need to bias most blocks
      * towards strongly lossy compression. Instead, we assign class 2
      * to most blocks, and use class 3 only when strictly necessary
@@ -253,13 +263,15 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
 
 #if 0 /* SMPTE spec method */
     static const int classes[] = { 12, 24, 36, 0xffff };
-#else /* improved Libav method */
+#else /* improved FFmpeg method */
     static const int classes[] = { -1, -1, 255, 0xffff };
 #endif
     int max  = classes[0];
     int prev = 0;
+    const unsigned deadzone = s->quant_deadzone;
+    const unsigned threshold = 2 * deadzone;
 
-    assert((((int) blk) & 15) == 0);
+    av_assert2((((int) blk) & 15) == 0);
 
     bi->area_q[0]          =
     bi->area_q[1]          =
@@ -290,13 +302,15 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
         for (i = mb_area_start[area]; i < mb_area_start[area + 1]; i++) {
             int level = blk[zigzag_scan[i]];
 
-            if (level + 15 > 30U) {
+            if (level + deadzone > threshold) {
                 bi->sign[i] = (level >> 31) & 1;
-                /* Weight it and and shift down into range, adding for rounding.
+                /* Weight it and shift down into range, adding for rounding.
                  * The extra division by a factor of 2^4 reverses the 8x
                  * expansion of the DCT AND the 2x doubling of the weights. */
                 level     = (FFABS(level) * weight[i] + (1 << (dv_weight_bits + 3))) >>
                             (dv_weight_bits + 4);
+                if (!level)
+                    continue;
                 bi->mb[i] = level;
                 if (level > max)
                     max = level;
@@ -361,7 +375,7 @@ static inline void dv_guess_qnos(EncBlockInfo *blks, int *qnos)
                         b->bit_size[a] = 1; // 4 areas 4 bits for EOB :)
                         b->area_q[a]++;
                         prev = b->prev[a];
-                        assert(b->next[prev] >= mb_area_start[a + 1] || b->mb[prev]);
+                        av_assert2(b->next[prev] >= mb_area_start[a + 1] || b->mb[prev]);
                         for (k = b->next[prev]; k < mb_area_start[a + 1]; k = b->next[k]) {
                             b->mb[k] >>= 1;
                             if (b->mb[k]) {
@@ -371,11 +385,11 @@ static inline void dv_guess_qnos(EncBlockInfo *blks, int *qnos)
                                 if (b->next[k] >= mb_area_start[a + 1] && b->next[k] < 64) {
                                     for (a2 = a + 1; b->next[k] >= mb_area_start[a2 + 1]; a2++)
                                         b->prev[a2] = prev;
-                                    assert(a2 < 4);
-                                    assert(b->mb[b->next[k]]);
+                                    av_assert2(a2 < 4);
+                                    av_assert2(b->mb[b->next[k]]);
                                     b->bit_size[a2] += dv_rl2vlc_size(b->next[k] - prev - 1, b->mb[b->next[k]]) -
                                                        dv_rl2vlc_size(b->next[k] - k    - 1, b->mb[b->next[k]]);
-                                    assert(b->prev[a2] == k && (a2 + 1 >= 4 || b->prev[a2 + 1] != k));
+                                    av_assert2(b->prev[a2] == k && (a2 + 1 >= 4 || b->prev[a2 + 1] != k));
                                     b->prev[a2] = prev;
                                 }
                                 b->next[prev] = b->next[k];
@@ -570,6 +584,7 @@ static inline int dv_write_pack(enum dv_pack_type pack_id, DVVideoContext *c,
      *      compression scheme (if any).
      */
     int apt = (c->sys->pix_fmt == AV_PIX_FMT_YUV420P ? 0 : 1);
+    int fs  = c->frame->top_field_first ? 0x00 : 0x40;
 
     uint8_t aspect = 0;
     if ((int) (av_q2d(c->avctx->sample_aspect_ratio) *
@@ -609,7 +624,7 @@ static inline int dv_write_pack(enum dv_pack_type pack_id, DVVideoContext *c,
         buf[2] = 0xc8 |        /* reserved -- always b11001xxx */
                  aspect;
         buf[3] = (1 << 7) |    /* frame/field flag 1 -- frame, 0 -- field */
-                 (1 << 6) |    /* first/second field flag 0 -- field 2, 1 -- field 1 */
+                 fs       |    /* first/second field flag 0 -- field 2, 1 -- field 1 */
                  (1 << 5) |    /* frame change flag 0 -- same picture as before, 1 -- different */
                  (1 << 4) |    /* 1 - interlaced, 0 - noninterlaced */
                  0xc;          /* reserved -- always b1100 */
@@ -712,10 +727,8 @@ static int dvvideo_encode_frame(AVCodecContext *c, AVPacket *pkt,
     DVVideoContext *s = c->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, s->sys->frame_size)) < 0) {
-        av_log(c, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(c, pkt, s->sys->frame_size, 0)) < 0)
         return ret;
-    }
 
     c->pix_fmt                = s->sys->pix_fmt;
     s->frame                  = frame;
@@ -740,6 +753,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+#define OFFSET(x) offsetof(DVVideoContext, x)
+static const AVOption dv_options[] = {
+    { "quant_deadzone",        "Quantizer dead zone",    OFFSET(quant_deadzone),       AV_OPT_TYPE_INT, { .i64 = 7 }, 0, 1024, VE },
+    { NULL },
+};
+
+static const AVClass dvvideo_encode_class = {
+    .class_name = "dvvideo encoder",
+    .item_name  = av_default_item_name,
+    .option     = dv_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvvideo_encoder = {
     .name           = "dvvideo",
     .long_name      = NULL_IF_CONFIG_SMALL("DV (Digital Video)"),
@@ -748,9 +775,10 @@ AVCodec ff_dvvideo_encoder = {
     .priv_data_size = sizeof(DVVideoContext),
     .init           = dvvideo_encode_init,
     .encode2        = dvvideo_encode_frame,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV422P,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE
     },
+    .priv_class     = &dvvideo_encode_class,
 };
diff --git a/libavcodec/dxa.c b/libavcodec/dxa.c
index b804935..f6edc03 100644
--- a/libavcodec/dxa.c
+++ b/libavcodec/dxa.c
@@ -2,20 +2,20 @@
  * Feeble Files/ScummVM DXA decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
 
@@ -41,6 +42,7 @@ typedef struct DxaDecContext {
     AVFrame *prev;
 
     int dsize;
+#define DECOMP_BUF_PADDING 16
     uint8_t *decomp_buf;
     uint32_t pal[256];
 } DxaDecContext;
@@ -49,13 +51,17 @@ static const int shift1[6] = { 0, 8, 8, 8, 4, 4 };
 static const int shift2[6] = { 0, 0, 8, 4, 0, 4 };
 
 static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
-                     int stride, uint8_t *src, uint8_t *ref)
+                     int stride, uint8_t *src, int srcsize, uint8_t *ref)
 {
     uint8_t *code, *data, *mv, *msk, *tmp, *tmp2;
+    uint8_t *src_end = src + srcsize;
     int i, j, k;
     int type, x, y, d, d2;
     uint32_t mask;
 
+    if (12ULL  + ((avctx->width * avctx->height) >> 4) + AV_RB32(src + 0) + AV_RB32(src + 4) > srcsize)
+        return AVERROR_INVALIDDATA;
+
     code = src  + 12;
     data = code + ((avctx->width * avctx->height) >> 4);
     mv   = data + AV_RB32(src + 0);
@@ -63,6 +69,8 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
 
     for(j = 0; j < avctx->height; j += 4){
         for(i = 0; i < avctx->width; i += 4){
+            if (data > src_end || mv > src_end || msk > src_end)
+                return AVERROR_INVALIDDATA;
             tmp  = dst + i;
             tmp2 = ref + i;
             type = *code++;
@@ -70,6 +78,11 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
             case 4: // motion compensation
                 x = (*mv) >> 4;    if(x & 8) x = 8 - x;
                 y = (*mv++) & 0xF; if(y & 8) y = 8 - y;
+                if (i < -x || avctx->width  - i - 4 < x ||
+                    j < -y || avctx->height - j - 4 < y) {
+                    av_log(avctx, AV_LOG_ERROR, "MV %d %d out of bounds\n", x,y);
+                    return AVERROR_INVALIDDATA;
+                }
                 tmp2 += x + y*stride;
             case 0: // skip
             case 5: // skip in method 12
@@ -127,6 +140,11 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
                     case 0x80: // motion compensation
                         x = (*mv) >> 4;    if(x & 8) x = 8 - x;
                         y = (*mv++) & 0xF; if(y & 8) y = 8 - y;
+                        if (i + 2*(k & 1) < -x || avctx->width  - i - 2*(k & 1) - 2 < x ||
+                            j +   (k & 2) < -y || avctx->height - j -   (k & 2) - 2 < y) {
+                            av_log(avctx, AV_LOG_ERROR, "MV %d %d out of bounds\n", x,y);
+                            return AVERROR_INVALIDDATA;
+                        }
                         tmp2 += x + y*stride;
                     case 0x00: // skip
                         tmp[d + 0         ] = tmp2[0];
@@ -192,35 +210,27 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     AVFrame *frame = data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     DxaDecContext * const c = avctx->priv_data;
     uint8_t *outptr, *srcptr, *tmpptr;
     unsigned long dsize;
     int i, j, compr, ret;
     int stride;
-    int orig_buf_size = buf_size;
     int pc = 0;
+    GetByteContext gb;
 
-    /* make the palette available on the way out */
-    if(buf[0]=='C' && buf[1]=='M' && buf[2]=='A' && buf[3]=='P'){
-        int r, g, b;
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-        buf += 4;
+    /* make the palette available on the way out */
+    if (bytestream2_peek_le32(&gb) == MKTAG('C','M','A','P')) {
+        bytestream2_skip(&gb, 4);
         for(i = 0; i < 256; i++){
-            r = *buf++;
-            g = *buf++;
-            b = *buf++;
-            c->pal[i] = (r << 16) | (g << 8) | b;
+            c->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&gb);
         }
         pc = 1;
-        buf_size -= 768+4;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
     memcpy(frame->data[1], c->pal, AVPALETTE_SIZE);
     frame->palette_has_changed = pc;
 
@@ -229,16 +239,25 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     tmpptr = c->prev->data[0];
     stride = frame->linesize[0];
 
-    if(buf[0]=='N' && buf[1]=='U' && buf[2]=='L' && buf[3]=='L')
+    if (bytestream2_get_le32(&gb) == MKTAG('N','U','L','L'))
         compr = -1;
     else
-        compr = buf[4];
+        compr = bytestream2_get_byte(&gb);
 
     dsize = c->dsize;
-    if((compr != 4 && compr != -1) && uncompress(c->decomp_buf, &dsize, buf + 9, buf_size - 9) != Z_OK){
-        av_log(avctx, AV_LOG_ERROR, "Uncompress failed!\n");
-        return AVERROR_UNKNOWN;
+    if (compr != 4 && compr != -1) {
+        bytestream2_skip(&gb, 4);
+        if (uncompress(c->decomp_buf, &dsize, avpkt->data + bytestream2_tell(&gb),
+                       bytestream2_get_bytes_left(&gb)) != Z_OK) {
+            av_log(avctx, AV_LOG_ERROR, "Uncompress failed!\n");
+            return AVERROR_UNKNOWN;
+        }
+        memset(c->decomp_buf + dsize, 0, DECOMP_BUF_PADDING);
     }
+
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "compr:%2d, dsize:%d\n", compr, (int)dsize);
+
     switch(compr){
     case -1:
         frame->key_frame = 0;
@@ -265,14 +284,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case 5:
         if (!tmpptr) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
-            return AVERROR_INVALIDDATA;
+            if (!(avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
+                return AVERROR_INVALIDDATA;
         }
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
         for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++)
-                outptr[i] = srcptr[i] ^ tmpptr[i];
-            tmpptr += stride;
+            if(tmpptr){
+                for(i = 0; i < avctx->width; i++)
+                    outptr[i] = srcptr[i] ^ tmpptr[i];
+                tmpptr += stride;
+            }else
+                memcpy(outptr, srcptr, avctx->width);
             outptr += stride;
             srcptr += avctx->width;
         }
@@ -281,10 +304,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case 13:
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
-        decode_13(avctx, c, frame->data[0], frame->linesize[0], srcptr, c->prev->data[0]);
+        if (!c->prev->data[0]) {
+            av_log(avctx, AV_LOG_ERROR, "Missing reference frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+        decode_13(avctx, c, frame->data[0], frame->linesize[0], srcptr, dsize, c->prev->data[0]);
         break;
     default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown/unsupported compression type %d\n", buf[4]);
+        av_log(avctx, AV_LOG_ERROR, "Unknown/unsupported compression type %d\n", compr);
         return AVERROR_INVALIDDATA;
     }
 
@@ -295,13 +322,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     *got_frame = 1;
 
     /* always report that the buffer was completely consumed */
-    return orig_buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     DxaDecContext * const c = avctx->priv_data;
 
+    if (avctx->width%4 || avctx->height%4) {
+        avpriv_request_sample(avctx, "dimensions are not a multiple of 4");
+        return AVERROR_INVALIDDATA;
+    }
+
     c->prev = av_frame_alloc();
     if (!c->prev)
         return AVERROR(ENOMEM);
@@ -309,7 +341,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
     c->dsize = avctx->width * avctx->height * 2;
-    if (!(c->decomp_buf = av_malloc(c->dsize))) {
+    c->decomp_buf = av_malloc(c->dsize + DECOMP_BUF_PADDING);
+    if (!c->decomp_buf) {
+        av_frame_free(&c->prev);
         av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
         return AVERROR(ENOMEM);
     }
diff --git a/libavcodec/dxtory.c b/libavcodec/dxtory.c
index 05de4ac..285ca38 100644
--- a/libavcodec/dxtory.c
+++ b/libavcodec/dxtory.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,8 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "unary.h"
 
@@ -40,7 +40,7 @@ static int dxtory_decode_v1_rgb(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *dst;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * bpp) {
+    if (src_size < avctx->width * avctx->height * (int64_t)bpp) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -66,7 +66,7 @@ static int dxtory_decode_v1_410(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y1, *Y2, *Y3, *Y4, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 18 / 16) {
+    if (src_size < FFALIGN(avctx->width, 4) * FFALIGN(avctx->height, 4) * 9LL / 8) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -83,10 +83,10 @@ static int dxtory_decode_v1_410(AVCodecContext *avctx, AVFrame *pic,
     V  = pic->data[2];
     for (h = 0; h < avctx->height; h += 4) {
         for (w = 0; w < avctx->width; w += 4) {
-            AV_COPY32(Y1 + w, src);
-            AV_COPY32(Y2 + w, src + 4);
-            AV_COPY32(Y3 + w, src + 8);
-            AV_COPY32(Y4 + w, src + 12);
+            AV_COPY32U(Y1 + w, src);
+            AV_COPY32U(Y2 + w, src + 4);
+            AV_COPY32U(Y3 + w, src + 8);
+            AV_COPY32U(Y4 + w, src + 12);
             U[w >> 2] = src[16] + 0x80;
             V[w >> 2] = src[17] + 0x80;
             src += 18;
@@ -109,7 +109,7 @@ static int dxtory_decode_v1_420(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y1, *Y2, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 3 / 2) {
+    if (src_size < FFALIGN(avctx->width, 2) * FFALIGN(avctx->height, 2) * 3LL / 2) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -146,7 +146,7 @@ static int dxtory_decode_v1_444(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 3) {
+    if (src_size < avctx->width * avctx->height * 3LL) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -176,13 +176,13 @@ static const uint8_t def_lru[8] = { 0x00, 0x20, 0x40, 0x60, 0x80, 0xA0, 0xC0, 0x
 static const uint8_t def_lru_555[8] = { 0x00, 0x08, 0x10, 0x18, 0x1F };
 static const uint8_t def_lru_565[8] = { 0x00, 0x08, 0x10, 0x20, 0x30, 0x3F };
 
-static inline uint8_t decode_sym(BitstreamContext *bc, uint8_t lru[8])
+static inline uint8_t decode_sym(GetBitContext *gb, uint8_t lru[8])
 {
     uint8_t c, val;
 
-    c = get_unary(bc, 0, 8);
+    c = get_unary(gb, 0, 8);
     if (!c) {
-        val = bitstream_read(bc, 8);
+        val = get_bits(gb, 8);
         memmove(lru + 1, lru, sizeof(*lru) * (8 - 1));
     } else {
         val = lru[c - 1];
@@ -243,14 +243,14 @@ static int load_buffer(AVCodecContext *avctx,
     return 0;
 }
 
-static inline uint8_t decode_sym_565(BitstreamContext *bc, uint8_t lru[8],
+static inline uint8_t decode_sym_565(GetBitContext *gb, uint8_t lru[8],
                                      int bits)
 {
     uint8_t c, val;
 
-    c = get_unary(bc, 0, bits);
+    c = get_unary(gb, 0, bits);
     if (!c) {
-        val = bitstream_read(bc, bits);
+        val = get_bits(gb, bits);
         memmove(lru + 1, lru, sizeof(*lru) * (6 - 1));
     } else {
         val = lru[c - 1];
@@ -261,7 +261,7 @@ static inline uint8_t decode_sym_565(BitstreamContext *bc, uint8_t lru[8],
     return val;
 }
 
-typedef int (*decode_slice_func)(BitstreamContext *bc, AVFrame *frame,
+typedef int (*decode_slice_func)(GetBitContext *gb, AVFrame *frame,
                                  int line, int height, uint8_t lru[3][8]);
 
 typedef void (*setup_lru_func)(uint8_t lru[3][8]);
@@ -273,7 +273,7 @@ static int dxtory_decode_v2(AVCodecContext *avctx, AVFrame *pic,
                             enum AVPixelFormat fmt)
 {
     GetByteContext gb;
-    BitstreamContext bc;
+    GetBitContext  gb2;
     int nslices, slice, line = 0;
     uint32_t off, slice_size;
     uint8_t lru[3][8];
@@ -296,26 +296,23 @@ static int dxtory_decode_v2(AVCodecContext *avctx, AVFrame *pic,
         if (ret < 0)
             return ret;
 
-        bitstream_init8(&bc, src + off + 16, slice_size - 16);
+        if ((ret = init_get_bits8(&gb2, src + off + 16, slice_size - 16)) < 0)
+            return ret;
 
-        line += decode_slice(&bc, pic, line, avctx->height - line, lru);
+        line += decode_slice(&gb2, pic, line, avctx->height - line, lru);
 
         off += slice_size;
     }
 
     if (avctx->height - line) {
-        av_log(avctx, AV_LOG_VERBOSE,
-               "Not enough slice data available, "
-               "cropping the frame by %d pixels\n",
-                avctx->height - line);
-        avctx->height = line;
+        avpriv_request_sample(avctx, "Not enough slice data available");
     }
 
     return 0;
 }
 
 av_always_inline
-static int dx2_decode_slice_5x5(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_5x5(GetBitContext *gb, AVFrame *frame,
                                 int line, int left, uint8_t lru[3][8],
                                 int is_565)
 {
@@ -325,11 +322,11 @@ static int dx2_decode_slice_5x5(BitstreamContext *bc, AVFrame *frame,
     int stride   = frame->linesize[0];
     uint8_t *dst = frame->data[0] + stride * line;
 
-    for (y = 0; y < left && bitstream_bits_left(bc) > 16; y++) {
+    for (y = 0; y < left && get_bits_left(gb) > 6 * width; y++) {
         for (x = 0; x < width; x++) {
-            b = decode_sym_565(bc, lru[0], 5);
-            g = decode_sym_565(bc, lru[1], is_565 ? 6 : 5);
-            r = decode_sym_565(bc, lru[2], 5);
+            b = decode_sym_565(gb, lru[0], 5);
+            g = decode_sym_565(gb, lru[1], is_565 ? 6 : 5);
+            r = decode_sym_565(gb, lru[2], 5);
             dst[x * 3 + 0] = (r << 3) | (r >> 2);
             dst[x * 3 + 1] = is_565 ? (g << 2) | (g >> 4) : (g << 3) | (g >> 2);
             dst[x * 3 + 2] = (b << 3) | (b >> 2);
@@ -355,16 +352,16 @@ static void setup_lru_565(uint8_t lru[3][8])
     memcpy(lru[2], def_lru_555, 8 * sizeof(*def_lru));
 }
 
-static int dx2_decode_slice_555(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_555(GetBitContext *gb, AVFrame *frame,
                                 int line, int left, uint8_t lru[3][8])
 {
-    return dx2_decode_slice_5x5(bc, frame, line, left, lru, 0);
+    return dx2_decode_slice_5x5(gb, frame, line, left, lru, 0);
 }
 
-static int dx2_decode_slice_565(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_565(GetBitContext *gb, AVFrame *frame,
                                 int line, int left, uint8_t lru[3][8])
 {
-    return dx2_decode_slice_5x5(bc, frame, line, left, lru, 1);
+    return dx2_decode_slice_5x5(gb, frame, line, left, lru, 1);
 }
 
 static int dxtory_decode_v2_565(AVCodecContext *avctx, AVFrame *pic,
@@ -383,7 +380,7 @@ static int dxtory_decode_v2_565(AVCodecContext *avctx, AVFrame *pic,
                                 fmt);
 }
 
-static int dx2_decode_slice_rgb(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_rgb(GetBitContext *gb, AVFrame *frame,
                                 int line, int left, uint8_t lru[3][8])
 {
     int x, y;
@@ -391,11 +388,11 @@ static int dx2_decode_slice_rgb(BitstreamContext *bc, AVFrame *frame,
     int stride   = frame->linesize[0];
     uint8_t *dst = frame->data[0] + stride * line;
 
-    for (y = 0; y < left && bitstream_bits_left(bc) > 16; y++) {
+    for (y = 0; y < left && get_bits_left(gb) > 6 * width; y++) {
         for (x = 0; x < width; x++) {
-            dst[x * 3 + 0] = decode_sym(bc, lru[0]);
-            dst[x * 3 + 1] = decode_sym(bc, lru[1]);
-            dst[x * 3 + 2] = decode_sym(bc, lru[2]);
+            dst[x * 3 + 0] = decode_sym(gb, lru[0]);
+            dst[x * 3 + 1] = decode_sym(gb, lru[1]);
+            dst[x * 3 + 2] = decode_sym(gb, lru[2]);
         }
 
         dst += stride;
@@ -421,7 +418,7 @@ static int dxtory_decode_v2_rgb(AVCodecContext *avctx, AVFrame *pic,
                             AV_PIX_FMT_BGR24);
 }
 
-static int dx2_decode_slice_410(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_410(GetBitContext *gb, AVFrame *frame,
                                 int line, int left,
                                 uint8_t lru[3][8])
 {
@@ -436,13 +433,13 @@ static int dx2_decode_slice_410(BitstreamContext *bc, AVFrame *frame,
     uint8_t *U  = frame->data[1] + (ustride >> 2) * line;
     uint8_t *V  = frame->data[2] + (vstride >> 2) * line;
 
-    for (y = 0; y < left - 3 && bitstream_bits_left(bc) > 16; y += 4) {
+    for (y = 0; y < left - 3 && get_bits_left(gb) > 9 * width; y += 4) {
         for (x = 0; x < width; x += 4) {
             for (j = 0; j < 4; j++)
                 for (i = 0; i < 4; i++)
-                    Y[x + i + j * ystride] = decode_sym(bc, lru[0]);
-            U[x >> 2] = decode_sym(bc, lru[1]) ^ 0x80;
-            V[x >> 2] = decode_sym(bc, lru[2]) ^ 0x80;
+                    Y[x + i + j * ystride] = decode_sym(gb, lru[0]);
+            U[x >> 2] = decode_sym(gb, lru[1]) ^ 0x80;
+            V[x >> 2] = decode_sym(gb, lru[2]) ^ 0x80;
         }
 
         Y += ystride << 2;
@@ -463,7 +460,7 @@ static int dxtory_decode_v2_410(AVCodecContext *avctx, AVFrame *pic,
                             AV_PIX_FMT_YUV410P);
 }
 
-static int dx2_decode_slice_420(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_420(GetBitContext *gb, AVFrame *frame,
                                 int line, int left,
                                 uint8_t lru[3][8])
 {
@@ -480,14 +477,14 @@ static int dx2_decode_slice_420(BitstreamContext *bc, AVFrame *frame,
     uint8_t *V  = frame->data[2] + (vstride >> 1) * line;
 
 
-    for (y = 0; y < left - 1 && bitstream_bits_left(bc) > 16; y += 2) {
+    for (y = 0; y < left - 1 && get_bits_left(gb) > 6 * width; y += 2) {
         for (x = 0; x < width; x += 2) {
-            Y[x + 0 + 0 * ystride] = decode_sym(bc, lru[0]);
-            Y[x + 1 + 0 * ystride] = decode_sym(bc, lru[0]);
-            Y[x + 0 + 1 * ystride] = decode_sym(bc, lru[0]);
-            Y[x + 1 + 1 * ystride] = decode_sym(bc, lru[0]);
-            U[x >> 1] = decode_sym(bc, lru[1]) ^ 0x80;
-            V[x >> 1] = decode_sym(bc, lru[2]) ^ 0x80;
+            Y[x + 0 + 0 * ystride] = decode_sym(gb, lru[0]);
+            Y[x + 1 + 0 * ystride] = decode_sym(gb, lru[0]);
+            Y[x + 0 + 1 * ystride] = decode_sym(gb, lru[0]);
+            Y[x + 1 + 1 * ystride] = decode_sym(gb, lru[0]);
+            U[x >> 1] = decode_sym(gb, lru[1]) ^ 0x80;
+            V[x >> 1] = decode_sym(gb, lru[2]) ^ 0x80;
         }
 
         Y += ystride << 1;
@@ -507,7 +504,7 @@ static int dxtory_decode_v2_420(AVCodecContext *avctx, AVFrame *pic,
                             AV_PIX_FMT_YUV420P);
 }
 
-static int dx2_decode_slice_444(BitstreamContext *bc, AVFrame *frame,
+static int dx2_decode_slice_444(GetBitContext *gb, AVFrame *frame,
                                 int line, int left,
                                 uint8_t lru[3][8])
 {
@@ -523,11 +520,11 @@ static int dx2_decode_slice_444(BitstreamContext *bc, AVFrame *frame,
     uint8_t *U  = frame->data[1] + ustride * line;
     uint8_t *V  = frame->data[2] + vstride * line;
 
-    for (y = 0; y < left && bitstream_bits_left(bc) > 16; y++) {
+    for (y = 0; y < left && get_bits_left(gb) > 6 * width; y++) {
         for (x = 0; x < width; x++) {
-            Y[x] = decode_sym(bc, lru[0]);
-            U[x] = decode_sym(bc, lru[1]) ^ 0x80;
-            V[x] = decode_sym(bc, lru[2]) ^ 0x80;
+            Y[x] = decode_sym(gb, lru[0]);
+            U[x] = decode_sym(gb, lru[1]) ^ 0x80;
+            V[x] = decode_sym(gb, lru[2]) ^ 0x80;
         }
 
         Y += ystride;
diff --git a/libavcodec/dxv.c b/libavcodec/dxv.c
index 41cac73..aef5ec1 100644
--- a/libavcodec/dxv.c
+++ b/libavcodec/dxv.c
@@ -1,21 +1,22 @@
 /*
  * Resolume DXV decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
+ * Copyright (C) 2018 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +24,7 @@
 
 #include "libavutil/imgutils.h"
 
+#include "mathops.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -34,50 +36,211 @@ typedef struct DXVContext {
     TextureDSPContext texdsp;
     GetByteContext gbc;
 
-    uint8_t *tex_data;  // Compressed texture
-    int tex_rat;        // Compression ratio
-    int tex_step;       // Distance between blocks
-    int64_t tex_size;   // Texture size
+    uint8_t *tex_data;   // Compressed texture
+    uint8_t *ctex_data;  // Compressed texture
+    int tex_rat;         // Compression ratio
+    int tex_step;        // Distance between blocks
+    int ctex_step;       // Distance between blocks
+    int64_t tex_size;    // Texture size
+    int64_t ctex_size;   // Texture size
 
     /* Optimal number of slices for parallel decoding */
     int slice_count;
 
+    uint8_t *op_data[4]; // Opcodes
+    int64_t op_size[4];  // Opcodes size
+
+    int texture_block_w;
+    int texture_block_h;
+
+    int ctexture_block_w;
+    int ctexture_block_h;
+
     /* Pointer to the selected decompression function */
     int (*tex_funct)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*tex_funct_planar[2])(uint8_t *plane0, ptrdiff_t stride0,
+                               uint8_t *plane1, ptrdiff_t stride1,
+                               const uint8_t *block);
 } DXVContext;
 
+static void decompress_indices(uint8_t *dst, const uint8_t *src)
+{
+    int block, i;
+
+    for (block = 0; block < 2; block++) {
+        int tmp = AV_RL24(src);
+
+        /* Unpack 8x3 bit from last 3 byte block */
+        for (i = 0; i < 8; i++)
+            dst[i] = (tmp >> (i * 3)) & 0x7;
+
+        src += 3;
+        dst += 8;
+    }
+}
+
+static int extract_component(int yo0, int yo1, int code)
+{
+    int yo;
+
+    if (yo0 == yo1) {
+        yo = yo0;
+    } else if (code == 0) {
+        yo = yo0;
+    } else if (code == 1) {
+        yo = yo1;
+    } else {
+        if (yo0 > yo1) {
+            yo = (uint8_t) (((8 - code) * yo0 +
+                             (code - 1) * yo1) / 7);
+        } else {
+            if (code == 6) {
+                yo = 0;
+            } else if (code == 7) {
+                yo = 255;
+            } else {
+                yo = (uint8_t) (((6 - code) * yo0 +
+                                 (code - 1) * yo1) / 5);
+            }
+        }
+    }
+
+    return yo;
+}
+
+static int cocg_block(uint8_t *plane0, ptrdiff_t stride0,
+                      uint8_t *plane1, ptrdiff_t stride1,
+                      const uint8_t *block)
+{
+    uint8_t co_indices[16];
+    uint8_t cg_indices[16];
+    uint8_t co0 = *(block);
+    uint8_t co1 = *(block + 1);
+    uint8_t cg0 = *(block + 8);
+    uint8_t cg1 = *(block + 9);
+    int x, y;
+
+    decompress_indices(co_indices, block + 2);
+    decompress_indices(cg_indices, block + 10);
+
+    for (y = 0; y < 4; y++) {
+        for (x = 0; x < 4; x++) {
+            int co_code = co_indices[x + y * 4];
+            int cg_code = cg_indices[x + y * 4];
+
+            plane0[x] = extract_component(cg0, cg1, cg_code);
+            plane1[x] = extract_component(co0, co1, co_code);
+        }
+        plane0 += stride0;
+        plane1 += stride1;
+    }
+
+    return 16;
+}
+
+static void yao_subblock(uint8_t *dst, uint8_t *yo_indices,
+                        ptrdiff_t stride, const uint8_t *block)
+{
+    uint8_t yo0 = *(block);
+    uint8_t yo1 = *(block + 1);
+    int x, y;
+
+    decompress_indices(yo_indices, block + 2);
+
+    for (y = 0; y < 4; y++) {
+        for (x = 0; x < 4; x++) {
+            int yo_code = yo_indices[x + y * 4];
+
+            dst[x] = extract_component(yo0, yo1, yo_code);
+        }
+        dst += stride;
+    }
+}
+
+static int yo_block(uint8_t *dst, ptrdiff_t stride,
+                    uint8_t *unused0, ptrdiff_t unused1,
+                    const uint8_t *block)
+{
+    uint8_t yo_indices[16];
+
+    yao_subblock(dst,      yo_indices, stride, block);
+    yao_subblock(dst + 4,  yo_indices, stride, block + 8);
+    yao_subblock(dst + 8,  yo_indices, stride, block + 16);
+    yao_subblock(dst + 12, yo_indices, stride, block + 24);
+
+    return 32;
+}
+
+static int yao_block(uint8_t *plane0, ptrdiff_t stride0,
+                     uint8_t *plane3, ptrdiff_t stride1,
+                     const uint8_t *block)
+{
+    uint8_t yo_indices[16];
+    uint8_t a_indices[16];
+
+    yao_subblock(plane0,      yo_indices, stride0, block);
+    yao_subblock(plane3,      a_indices,  stride1, block + 8);
+    yao_subblock(plane0 + 4,  yo_indices, stride0, block + 16);
+    yao_subblock(plane3 + 4,  a_indices,  stride1, block + 24);
+    yao_subblock(plane0 + 8,  yo_indices, stride0, block + 32);
+    yao_subblock(plane3 + 8,  a_indices,  stride1, block + 40);
+    yao_subblock(plane0 + 12, yo_indices, stride0, block + 48);
+    yao_subblock(plane3 + 12, a_indices,  stride1, block + 56);
+
+    return 64;
+}
+
 static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
                                      int slice, int thread_nb)
 {
     DXVContext *ctx = avctx->priv_data;
     AVFrame *frame = arg;
     const uint8_t *d = ctx->tex_data;
-    int w_block = avctx->coded_width / TEXTURE_BLOCK_W;
-    int h_block = avctx->coded_height / TEXTURE_BLOCK_H;
+    int w_block = avctx->coded_width / ctx->texture_block_w;
+    int h_block = avctx->coded_height / ctx->texture_block_h;
     int x, y;
     int start_slice, end_slice;
-    int base_blocks_per_slice = h_block / ctx->slice_count;
-    int remainder_blocks = h_block % ctx->slice_count;
-
-    /* When the frame height (in blocks) doesn't divide evenly between the
-     * number of slices, spread the remaining blocks evenly between the first
-     * operations */
-    start_slice = slice * base_blocks_per_slice;
-    /* Add any extra blocks (one per slice) that have been added
-     * before this slice */
-    start_slice += FFMIN(slice, remainder_blocks);
-
-    end_slice = start_slice + base_blocks_per_slice;
-    /* Add an extra block if there are remainder blocks to be accounted for */
-    if (slice < remainder_blocks)
-        end_slice++;
-
-    for (y = start_slice; y < end_slice; y++) {
-        uint8_t *p = frame->data[0] + y * frame->linesize[0] * TEXTURE_BLOCK_H;
-        int off  = y * w_block;
-        for (x = 0; x < w_block; x++) {
-            ctx->tex_funct(p + x * 16, frame->linesize[0],
-                           d + (off + x) * ctx->tex_step);
+
+    start_slice = h_block * slice / ctx->slice_count;
+    end_slice = h_block * (slice + 1) / ctx->slice_count;
+
+    if (ctx->tex_funct) {
+        for (y = start_slice; y < end_slice; y++) {
+            uint8_t *p = frame->data[0] + y * frame->linesize[0] * ctx->texture_block_h;
+            int off = y * w_block;
+            for (x = 0; x < w_block; x++) {
+                ctx->tex_funct(p + x * 4 * ctx->texture_block_w, frame->linesize[0],
+                               d + (off + x) * ctx->tex_step);
+            }
+        }
+    } else {
+        const uint8_t *c = ctx->ctex_data;
+
+        for (y = start_slice; y < end_slice; y++) {
+            uint8_t *p0 = frame->data[0] + y * frame->linesize[0] * ctx->texture_block_h;
+            uint8_t *p3 = ctx->tex_step != 64 ? NULL : frame->data[3] + y * frame->linesize[3] * ctx->texture_block_h;
+            int off = y * w_block;
+            for (x = 0; x < w_block; x++) {
+                ctx->tex_funct_planar[0](p0 + x * ctx->texture_block_w, frame->linesize[0],
+                                         p3 != NULL ? p3 + x * ctx->texture_block_w : NULL, frame->linesize[3],
+                                         d + (off + x) * ctx->tex_step);
+            }
+        }
+
+        w_block = (avctx->coded_width / 2) / ctx->ctexture_block_w;
+        h_block = (avctx->coded_height / 2) / ctx->ctexture_block_h;
+        start_slice = h_block * slice / ctx->slice_count;
+        end_slice = h_block * (slice + 1) / ctx->slice_count;
+
+        for (y = start_slice; y < end_slice; y++) {
+            uint8_t *p0 = frame->data[1] + y * frame->linesize[1] * ctx->ctexture_block_h;
+            uint8_t *p1 = frame->data[2] + y * frame->linesize[2] * ctx->ctexture_block_h;
+            int off = y * w_block;
+            for (x = 0; x < w_block; x++) {
+                ctx->tex_funct_planar[1](p0 + x * ctx->ctexture_block_w, frame->linesize[1],
+                                         p1 + x * ctx->ctexture_block_w, frame->linesize[2],
+                                         c + (off + x) * ctx->ctex_step);
+            }
         }
     }
 
@@ -105,9 +268,17 @@ static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
             break;                                                            \
         case 2:                                                               \
             idx = (bytestream2_get_byte(gbc) + 2) * x;                        \
+            if (idx > pos) {                                                  \
+                av_log(avctx, AV_LOG_ERROR, "idx %d > %d\n", idx, pos);       \
+                return AVERROR_INVALIDDATA;                                   \
+            }                                                                 \
             break;                                                            \
         case 3:                                                               \
             idx = (bytestream2_get_le16(gbc) + 0x102) * x;                    \
+            if (idx > pos) {                                                  \
+                av_log(avctx, AV_LOG_ERROR, "idx %d > %d\n", idx, pos);       \
+                return AVERROR_INVALIDDATA;                                   \
+            }                                                                 \
             break;                                                            \
         }                                                                     \
     } while(0)
@@ -161,6 +332,530 @@ static int dxv_decompress_dxt1(AVCodecContext *avctx)
     return 0;
 }
 
+typedef struct OpcodeTable {
+    int16_t next;
+    uint8_t val1;
+    uint8_t val2;
+} OpcodeTable;
+
+static int fill_ltable(GetByteContext *gb, uint32_t *table, int *nb_elements)
+{
+    unsigned half = 512, bits = 1023, left = 1024, input, mask;
+    int value, counter = 0, rshift = 10, lshift = 30;
+
+    mask = bytestream2_get_le32(gb) >> 2;
+    while (left) {
+        if (counter >= 256)
+            return AVERROR_INVALIDDATA;
+        value = bits & mask;
+        left -= bits & mask;
+        mask >>= rshift;
+        lshift -= rshift;
+        table[counter++] = value;
+        if (lshift < 16) {
+            if (bytestream2_get_bytes_left(gb) <= 0)
+                return AVERROR_INVALIDDATA;
+
+            input = bytestream2_get_le16(gb);
+            mask += input << lshift;
+            lshift += 16;
+        }
+        if (left < half) {
+            half >>= 1;
+            bits >>= 1;
+            rshift--;
+        }
+    }
+
+    for (; !table[counter - 1]; counter--)
+        if (counter <= 0)
+            return AVERROR_INVALIDDATA;
+
+    *nb_elements = counter;
+
+    if (counter < 256)
+        memset(&table[counter], 0, 4 * (256 - counter));
+
+    if (lshift >= 16)
+        bytestream2_seek(gb, -2, SEEK_CUR);
+
+    return 0;
+}
+
+static int fill_optable(unsigned *table0, OpcodeTable *table1, int nb_elements)
+{
+    unsigned table2[256] = { 0 };
+    unsigned x = 0;
+    int val0, val1, i, j = 2, k = 0;
+
+    table2[0] = table0[0];
+    for (i = 0; i < nb_elements - 1; i++, table2[i] = val0) {
+        val0 = table0[i + 1] + table2[i];
+    }
+
+    if (!table2[0]) {
+        do {
+            k++;
+        } while (!table2[k]);
+    }
+
+    j = 2;
+    for (i = 1024; i > 0; i--) {
+        for (table1[x].val1 = k; k < 256 && j > table2[k]; k++);
+        x = (x - 383) & 0x3FF;
+        j++;
+    }
+
+    if (nb_elements > 0)
+        memcpy(&table2[0], table0, 4 * nb_elements);
+
+    for (i = 0; i < 1024; i++) {
+        val0 = table1[i].val1;
+        val1 = table2[val0];
+        table2[val0]++;
+        x = 31 - ff_clz(val1);
+        if (x > 10)
+            return AVERROR_INVALIDDATA;
+        table1[i].val2 = 10 - x;
+        table1[i].next = (val1 << table1[i].val2) - 1024;
+    }
+
+    return 0;
+}
+
+static int get_opcodes(GetByteContext *gb, uint32_t *table, uint8_t *dst, int op_size, int nb_elements)
+{
+    OpcodeTable optable[1024];
+    int sum, x, val, lshift, rshift, ret, i, idx;
+    int64_t size_in_bits;
+    unsigned endoffset, newoffset, offset;
+    unsigned next;
+    uint8_t *src = (uint8_t *)gb->buffer;
+
+    ret = fill_optable(table, optable, nb_elements);
+    if (ret < 0)
+        return ret;
+
+    size_in_bits = bytestream2_get_le32(gb);
+    endoffset = ((size_in_bits + 7) >> 3) - 4;
+    if (endoffset <= 0 || bytestream2_get_bytes_left(gb) < endoffset)
+        return AVERROR_INVALIDDATA;
+
+    offset = endoffset;
+    next = AV_RL32(src + endoffset);
+    rshift = (((size_in_bits & 0xFF) - 1) & 7) + 15;
+    lshift = 32 - rshift;
+    idx = (next >> rshift) & 0x3FF;
+    for (i = 0; i < op_size; i++) {
+        dst[i] = optable[idx].val1;
+        val = optable[idx].val2;
+        sum = val + lshift;
+        x = (next << lshift) >> 1 >> (31 - val);
+        newoffset = offset - (sum >> 3);
+        lshift = sum & 7;
+        idx = x + optable[idx].next;
+        offset = newoffset;
+        if (offset > endoffset)
+            return AVERROR_INVALIDDATA;
+        next = AV_RL32(src + offset);
+    }
+
+    bytestream2_skip(gb, (size_in_bits + 7 >> 3) - 4);
+
+    return 0;
+}
+
+static int dxv_decompress_opcodes(GetByteContext *gb, void *dstp, size_t op_size)
+{
+    int pos = bytestream2_tell(gb);
+    int flag = bytestream2_peek_byte(gb);
+
+    if ((flag & 3) == 0) {
+        bytestream2_skip(gb, 1);
+        bytestream2_get_buffer(gb, dstp, op_size);
+    } else if ((flag & 3) == 1) {
+        bytestream2_skip(gb, 1);
+        memset(dstp, bytestream2_get_byte(gb), op_size);
+    } else {
+        uint32_t table[256];
+        int ret, elements = 0;
+
+        ret = fill_ltable(gb, table, &elements);
+        if (ret < 0)
+            return ret;
+        ret = get_opcodes(gb, table, dstp, op_size, elements);
+        if (ret < 0)
+            return ret;
+    }
+    return bytestream2_tell(gb) - pos;
+}
+
+static int dxv_decompress_cgo(DXVContext *ctx, GetByteContext *gb,
+                              uint8_t *tex_data, int tex_size,
+                              uint8_t *op_data, int *oindex,
+                              int op_size,
+                              uint8_t **dstp, int *statep,
+                              uint8_t **tab0, uint8_t **tab1,
+                              int offset)
+{
+    uint8_t *dst = *dstp;
+    uint8_t *tptr0, *tptr1, *tptr3;
+    int oi = *oindex;
+    int state = *statep;
+    int opcode, v, vv;
+
+    if (state <= 0) {
+        if (oi >= op_size)
+            return AVERROR_INVALIDDATA;
+        opcode = op_data[oi++];
+        if (!opcode) {
+            v = bytestream2_get_byte(gb);
+            if (v == 255) {
+                do {
+                    if (bytestream2_get_bytes_left(gb) <= 0)
+                        return AVERROR_INVALIDDATA;
+                    opcode = bytestream2_get_le16(gb);
+                    v += opcode;
+                } while (opcode == 0xFFFF);
+            }
+            AV_WL32(dst, AV_RL32(dst - (8 + offset)));
+            AV_WL32(dst + 4, AV_RL32(dst - (4 + offset)));
+            state = v + 4;
+            goto done;
+        }
+
+        switch (opcode) {
+        case 1:
+            AV_WL32(dst, AV_RL32(dst - (8 + offset)));
+            AV_WL32(dst + 4, AV_RL32(dst - (4 + offset)));
+            break;
+        case 2:
+            vv = (8 + offset) * (bytestream2_get_le16(gb) + 1);
+            if (vv < 0 || vv > dst - tex_data)
+                return AVERROR_INVALIDDATA;
+            tptr0 = dst - vv;
+            v = AV_RL32(tptr0);
+            AV_WL32(dst, AV_RL32(tptr0));
+            AV_WL32(dst + 4, AV_RL32(tptr0 + 4));
+            tab0[0x9E3779B1 * (uint16_t)v >> 24] = dst;
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 3:
+            AV_WL32(dst, bytestream2_get_le32(gb));
+            AV_WL32(dst + 4, bytestream2_get_le32(gb));
+            tab0[0x9E3779B1 * AV_RL16(dst) >> 24] = dst;
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 4:
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, bytestream2_get_le16(gb));
+            AV_WL16(dst + 2, AV_RL16(tptr3));
+            dst[4] = tptr3[2];
+            AV_WL16(dst + 5, bytestream2_get_le16(gb));
+            dst[7] = bytestream2_get_byte(gb);
+            tab0[0x9E3779B1 * AV_RL16(dst) >> 24] = dst;
+            break;
+        case 5:
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, bytestream2_get_le16(gb));
+            AV_WL16(dst + 2, bytestream2_get_le16(gb));
+            dst[4] = bytestream2_get_byte(gb);
+            AV_WL16(dst + 5, AV_RL16(tptr3));
+            dst[7] = tptr3[2];
+            tab0[0x9E3779B1 * AV_RL16(dst) >> 24] = dst;
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 6:
+            tptr0 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr0)
+                return AVERROR_INVALIDDATA;
+            tptr1 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, bytestream2_get_le16(gb));
+            AV_WL16(dst + 2, AV_RL16(tptr0));
+            dst[4] = tptr0[2];
+            AV_WL16(dst + 5, AV_RL16(tptr1));
+            dst[7] = tptr1[2];
+            tab0[0x9E3779B1 * AV_RL16(dst) >> 24] = dst;
+            break;
+        case 7:
+            v = (8 + offset) * (bytestream2_get_le16(gb) + 1);
+            if (v < 0 || v > dst - tex_data)
+                return AVERROR_INVALIDDATA;
+            tptr0 = dst - v;
+            AV_WL16(dst, bytestream2_get_le16(gb));
+            AV_WL16(dst + 2, AV_RL16(tptr0 + 2));
+            AV_WL32(dst + 4, AV_RL32(tptr0 + 4));
+            tab0[0x9E3779B1 * AV_RL16(dst) >> 24] = dst;
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 8:
+            tptr1 = tab0[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(tptr1));
+            AV_WL16(dst + 2, bytestream2_get_le16(gb));
+            AV_WL32(dst + 4, bytestream2_get_le32(gb));
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 9:
+            tptr1 = tab0[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(tptr1));
+            AV_WL16(dst + 2, AV_RL16(tptr3));
+            dst[4] = tptr3[2];
+            AV_WL16(dst + 5, bytestream2_get_le16(gb));
+            dst[7] = bytestream2_get_byte(gb);
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 10:
+            tptr1 = tab0[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(tptr1));
+            AV_WL16(dst + 2, bytestream2_get_le16(gb));
+            dst[4] = bytestream2_get_byte(gb);
+            AV_WL16(dst + 5, AV_RL16(tptr3));
+            dst[7] = tptr3[2];
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 11:
+            tptr0 = tab0[bytestream2_get_byte(gb)];
+            if (!tptr0)
+                return AVERROR_INVALIDDATA;
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            tptr1 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(tptr0));
+            AV_WL16(dst + 2, AV_RL16(tptr3));
+            dst[4] = tptr3[2];
+            AV_WL16(dst + 5, AV_RL16(tptr1));
+            dst[7] = tptr1[2];
+            break;
+        case 12:
+            tptr1 = tab0[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            v = (8 + offset) * (bytestream2_get_le16(gb) + 1);
+            if (v < 0 || v > dst - tex_data)
+                return AVERROR_INVALIDDATA;
+            tptr0 = dst - v;
+            AV_WL16(dst, AV_RL16(tptr1));
+            AV_WL16(dst + 2, AV_RL16(tptr0 + 2));
+            AV_WL32(dst + 4, AV_RL32(tptr0 + 4));
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 13:
+            AV_WL16(dst, AV_RL16(dst - (8 + offset)));
+            AV_WL16(dst + 2, bytestream2_get_le16(gb));
+            AV_WL32(dst + 4, bytestream2_get_le32(gb));
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 14:
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(dst - (8 + offset)));
+            AV_WL16(dst + 2, AV_RL16(tptr3));
+            dst[4] = tptr3[2];
+            AV_WL16(dst + 5, bytestream2_get_le16(gb));
+            dst[7] = bytestream2_get_byte(gb);
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 15:
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(dst - (8 + offset)));
+            AV_WL16(dst + 2, bytestream2_get_le16(gb));
+            dst[4] = bytestream2_get_byte(gb);
+            AV_WL16(dst + 5, AV_RL16(tptr3));
+            dst[7] = tptr3[2];
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        case 16:
+            tptr3 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr3)
+                return AVERROR_INVALIDDATA;
+            tptr1 = tab1[bytestream2_get_byte(gb)];
+            if (!tptr1)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(dst - (8 + offset)));
+            AV_WL16(dst + 2, AV_RL16(tptr3));
+            dst[4] = tptr3[2];
+            AV_WL16(dst + 5, AV_RL16(tptr1));
+            dst[7] = tptr1[2];
+            break;
+        case 17:
+            v = (8 + offset) * (bytestream2_get_le16(gb) + 1);
+            if (v < 0 || v > dst - tex_data)
+                return AVERROR_INVALIDDATA;
+            AV_WL16(dst, AV_RL16(dst - (8 + offset)));
+            AV_WL16(dst + 2, AV_RL16(&dst[-v + 2]));
+            AV_WL32(dst + 4, AV_RL32(&dst[-v + 4]));
+            tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFFu) >> 24] = dst + 2;
+            break;
+        default:
+            break;
+        }
+    } else {
+done:
+        AV_WL32(dst, AV_RL32(dst - (8 + offset)));
+        AV_WL32(dst + 4, AV_RL32(dst - (4 + offset)));
+        state--;
+    }
+    if (dst - tex_data + 8 > tex_size)
+        return AVERROR_INVALIDDATA;
+    dst += 8;
+
+    *oindex = oi;
+    *dstp = dst;
+    *statep = state;
+
+    return 0;
+}
+
+static int dxv_decompress_cocg(DXVContext *ctx, GetByteContext *gb,
+                               uint8_t *tex_data, int tex_size,
+                               uint8_t *op_data0, uint8_t *op_data1,
+                               int max_op_size0, int max_op_size1)
+{
+    uint8_t *dst, *tab2[256] = { 0 }, *tab0[256] = { 0 }, *tab3[256] = { 0 }, *tab1[256] = { 0 };
+    int op_offset = bytestream2_get_le32(gb);
+    unsigned op_size0 = bytestream2_get_le32(gb);
+    unsigned op_size1 = bytestream2_get_le32(gb);
+    int data_start = bytestream2_tell(gb);
+    int skip0, skip1, oi0 = 0, oi1 = 0;
+    int ret, state0 = 0, state1 = 0;
+
+    dst = tex_data;
+    bytestream2_skip(gb, op_offset - 12);
+    if (op_size0 > max_op_size0)
+        return AVERROR_INVALIDDATA;
+    skip0 = dxv_decompress_opcodes(gb, op_data0, op_size0);
+    if (skip0 < 0)
+        return skip0;
+    bytestream2_seek(gb, data_start + op_offset + skip0 - 12, SEEK_SET);
+    if (op_size1 > max_op_size1)
+        return AVERROR_INVALIDDATA;
+    skip1 = dxv_decompress_opcodes(gb, op_data1, op_size1);
+    if (skip1 < 0)
+        return skip1;
+    bytestream2_seek(gb, data_start, SEEK_SET);
+
+    AV_WL32(dst, bytestream2_get_le32(gb));
+    AV_WL32(dst + 4, bytestream2_get_le32(gb));
+    AV_WL32(dst + 8, bytestream2_get_le32(gb));
+    AV_WL32(dst + 12, bytestream2_get_le32(gb));
+
+    tab0[0x9E3779B1 * AV_RL16(dst) >> 24] = dst;
+    tab1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFF) >> 24] = dst + 2;
+    tab2[0x9E3779B1 * AV_RL16(dst + 8) >> 24] = dst + 8;
+    tab3[0x9E3779B1 * (AV_RL32(dst + 10) & 0xFFFFFF) >> 24] = dst + 10;
+    dst += 16;
+    while (dst + 10 < tex_data + tex_size) {
+        ret = dxv_decompress_cgo(ctx, gb, tex_data, tex_size, op_data0, &oi0, op_size0,
+                                 &dst, &state0, tab0, tab1, 8);
+        if (ret < 0)
+            return ret;
+        ret = dxv_decompress_cgo(ctx, gb, tex_data, tex_size, op_data1, &oi1, op_size1,
+                                 &dst, &state1, tab2, tab3, 8);
+        if (ret < 0)
+            return ret;
+    }
+
+    bytestream2_seek(gb, data_start + op_offset + skip0 + skip1 - 12, SEEK_SET);
+
+    return 0;
+}
+
+static int dxv_decompress_yo(DXVContext *ctx, GetByteContext *gb,
+                             uint8_t *tex_data, int tex_size,
+                             uint8_t *op_data, int max_op_size)
+{
+    int op_offset = bytestream2_get_le32(gb);
+    unsigned op_size = bytestream2_get_le32(gb);
+    int data_start = bytestream2_tell(gb);
+    uint8_t *dst, *table0[256] = { 0 }, *table1[256] = { 0 };
+    int ret, state = 0, skip, oi = 0, v, vv;
+
+    dst = tex_data;
+    bytestream2_skip(gb, op_offset - 8);
+    if (op_size > max_op_size)
+        return AVERROR_INVALIDDATA;
+    skip = dxv_decompress_opcodes(gb, op_data, op_size);
+    if (skip < 0)
+        return skip;
+    bytestream2_seek(gb, data_start, SEEK_SET);
+
+    v = bytestream2_get_le32(gb);
+    AV_WL32(dst, v);
+    vv = bytestream2_get_le32(gb);
+    table0[0x9E3779B1 * (uint16_t)v >> 24] = dst;
+    AV_WL32(dst + 4, vv);
+    table1[0x9E3779B1 * (AV_RL32(dst + 2) & 0xFFFFFF) >> 24] = dst + 2;
+    dst += 8;
+
+    while (dst < tex_data + tex_size) {
+        ret = dxv_decompress_cgo(ctx, gb, tex_data, tex_size, op_data, &oi, op_size,
+                                 &dst, &state, table0, table1, 0);
+        if (ret < 0)
+            return ret;
+    }
+
+    bytestream2_seek(gb, data_start + op_offset + skip - 8, SEEK_SET);
+
+    return 0;
+}
+
+static int dxv_decompress_ycg6(AVCodecContext *avctx)
+{
+    DXVContext *ctx = avctx->priv_data;
+    GetByteContext *gb = &ctx->gbc;
+    int ret;
+
+    ret = dxv_decompress_yo(ctx, gb, ctx->tex_data, ctx->tex_size,
+                            ctx->op_data[0], ctx->op_size[0]);
+    if (ret < 0)
+        return ret;
+
+    return dxv_decompress_cocg(ctx, gb, ctx->ctex_data, ctx->ctex_size,
+                               ctx->op_data[1], ctx->op_data[2],
+                               ctx->op_size[1], ctx->op_size[2]);
+}
+
+static int dxv_decompress_yg10(AVCodecContext *avctx)
+{
+    DXVContext *ctx = avctx->priv_data;
+    GetByteContext *gb = &ctx->gbc;
+    int ret;
+
+    ret = dxv_decompress_cocg(ctx, gb, ctx->tex_data, ctx->tex_size,
+                              ctx->op_data[0], ctx->op_data[3],
+                              ctx->op_size[0], ctx->op_size[3]);
+    if (ret < 0)
+        return ret;
+
+    return dxv_decompress_cocg(ctx, gb, ctx->ctex_data, ctx->ctex_size,
+                               ctx->op_data[1], ctx->op_data[2],
+                               ctx->op_size[1], ctx->op_size[2]);
+}
+
 static int dxv_decompress_dxt5(AVCodecContext *avctx)
 {
     DXVContext *ctx = avctx->priv_data;
@@ -189,6 +884,8 @@ static int dxv_decompress_dxt5(AVCodecContext *avctx)
             AV_WL32(ctx->tex_data + 4 * pos, prev);
             pos++;
         } else {
+            if (bytestream2_get_bytes_left(gbc) < 1)
+                return AVERROR_INVALIDDATA;
             if (state == 0) {
                 value = bytestream2_get_le32(gbc);
                 state = 16;
@@ -327,6 +1024,9 @@ static int dxv_decompress_raw(AVCodecContext *avctx)
     DXVContext *ctx = avctx->priv_data;
     GetByteContext *gbc = &ctx->gbc;
 
+    if (bytestream2_get_bytes_left(gbc) < ctx->tex_size)
+        return AVERROR_INVALIDDATA;
+
     bytestream2_get_buffer(gbc, ctx->tex_data, ctx->tex_size);
     return 0;
 }
@@ -346,6 +1046,12 @@ static int dxv_decode(AVCodecContext *avctx, void *data,
 
     bytestream2_init(gbc, avpkt->data, avpkt->size);
 
+    ctx->texture_block_h = 4;
+    ctx->texture_block_w = 4;
+
+    avctx->pix_fmt = AV_PIX_FMT_RGBA;
+    avctx->colorspace = AVCOL_SPC_RGB;
+
     tag = bytestream2_get_le32(gbc);
     switch (tag) {
     case MKBETAG('D', 'X', 'T', '1'):
@@ -365,9 +1071,39 @@ static int dxv_decode(AVCodecContext *avctx, void *data,
         msgtext = "DXT5";
         break;
     case MKBETAG('Y', 'C', 'G', '6'):
+        decompress_tex = dxv_decompress_ycg6;
+        ctx->tex_funct_planar[0] = yo_block;
+        ctx->tex_funct_planar[1] = cocg_block;
+        ctx->tex_rat   = 8;
+        ctx->tex_step  = 32;
+        ctx->ctex_step = 16;
+        msgcomp = "YOCOCG6";
+        msgtext = "YCG6";
+        ctx->ctex_size = avctx->coded_width * avctx->coded_height / 4;
+        ctx->texture_block_h = 4;
+        ctx->texture_block_w = 16;
+        ctx->ctexture_block_h = 4;
+        ctx->ctexture_block_w = 4;
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        avctx->colorspace = AVCOL_SPC_YCOCG;
+        break;
     case MKBETAG('Y', 'G', '1', '0'):
-        avpriv_report_missing_feature(avctx, "Tag 0x%08"PRIX32"", tag);
-        return AVERROR_PATCHWELCOME;
+        decompress_tex = dxv_decompress_yg10;
+        ctx->tex_funct_planar[0] = yao_block;
+        ctx->tex_funct_planar[1] = cocg_block;
+        ctx->tex_rat   = 4;
+        ctx->tex_step  = 64;
+        ctx->ctex_step = 16;
+        msgcomp = "YAOCOCG10";
+        msgtext = "YG10";
+        ctx->ctex_size = avctx->coded_width * avctx->coded_height / 4;
+        ctx->texture_block_h = 4;
+        ctx->texture_block_w = 16;
+        ctx->ctexture_block_h = 4;
+        ctx->ctexture_block_w = 4;
+        avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+        avctx->colorspace = AVCOL_SPC_YCOCG;
+        break;
     default:
         /* Old version does not have a real header, just size and type. */
         size = tag & 0x00FFFFFF;
@@ -393,14 +1129,17 @@ static int dxv_decode(AVCodecContext *avctx, void *data,
             ctx->tex_funct = ctx->texdsp.dxt1_block;
             ctx->tex_step  = 8;
         } else {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Unsupported header (0x%08"PRIX32")\n.", tag);
+            av_log(avctx, AV_LOG_ERROR, "Unsupported header (0x%08"PRIX32")\n.", tag);
             return AVERROR_INVALIDDATA;
         }
         ctx->tex_rat = 1;
         break;
     }
 
+    ctx->slice_count = av_clip(avctx->thread_count, 1,
+                               avctx->coded_height / FFMAX(ctx->texture_block_h,
+                                                           ctx->ctexture_block_h));
+
     /* New header is 12 bytes long. */
     if (!old_type) {
         version_major = bytestream2_get_byte(gbc) - 1;
@@ -428,20 +1167,43 @@ static int dxv_decode(AVCodecContext *avctx, void *data,
     }
 
     ctx->tex_size = avctx->coded_width * avctx->coded_height * 4 / ctx->tex_rat;
-    ret = av_reallocp(&ctx->tex_data, ctx->tex_size);
+    ret = av_reallocp(&ctx->tex_data, ctx->tex_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (ret < 0)
         return ret;
 
+    if (ctx->ctex_size) {
+        int i;
+
+        ctx->op_size[0] = avctx->coded_width * avctx->coded_height / 16;
+        ctx->op_size[1] = avctx->coded_width * avctx->coded_height / 32;
+        ctx->op_size[2] = avctx->coded_width * avctx->coded_height / 32;
+        ctx->op_size[3] = avctx->coded_width * avctx->coded_height / 16;
+
+        ret = av_reallocp(&ctx->ctex_data, ctx->ctex_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (ret < 0)
+            return ret;
+        for (i = 0; i < 4; i++) {
+            ret = av_reallocp(&ctx->op_data[i], ctx->op_size[i]);
+            if (ret < 0)
+                return ret;
+        }
+    }
+
     /* Decompress texture out of the intermediate compression. */
     ret = decompress_tex(avctx);
     if (ret < 0)
         return ret;
+    {
+        int w_block = avctx->coded_width / ctx->texture_block_w;
+        int h_block = avctx->coded_height / ctx->texture_block_h;
+        if (w_block * h_block * ctx->tex_step > ctx->tex_size * 8LL)
+            return AVERROR_INVALIDDATA;
+    }
 
     tframe.f = data;
     ret = ff_thread_get_buffer(avctx, &tframe, 0);
     if (ret < 0)
         return ret;
-    ff_thread_finish_setup(avctx);
 
     /* Now decompress the texture with the standard functions. */
     avctx->execute2(avctx, decompress_texture_thread,
@@ -471,10 +1233,6 @@ static int dxv_init(AVCodecContext *avctx)
     avctx->coded_height = FFALIGN(avctx->height, 16);
 
     ff_texturedsp_init(&ctx->texdsp);
-    avctx->pix_fmt = AV_PIX_FMT_RGBA;
-
-    ctx->slice_count = av_clip(avctx->thread_count, 1,
-                               avctx->coded_height / TEXTURE_BLOCK_H);
 
     return 0;
 }
@@ -484,6 +1242,11 @@ static int dxv_close(AVCodecContext *avctx)
     DXVContext *ctx = avctx->priv_data;
 
     av_freep(&ctx->tex_data);
+    av_freep(&ctx->ctex_data);
+    av_freep(&ctx->op_data[0]);
+    av_freep(&ctx->op_data[1]);
+    av_freep(&ctx->op_data[2]);
+    av_freep(&ctx->op_data[3]);
 
     return 0;
 }
diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
index 939b45a..3241611 100644
--- a/libavcodec/dxva2.c
+++ b/libavcodec/dxva2.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,8 @@ DEFINE_GUID(ff_DXVA2_ModeVC1_D,          0x1b81beA3, 0xa0c7,0x11d3,0xb9,0x84,0x0
 DEFINE_GUID(ff_DXVA2_ModeVC1_D2010,      0x1b81beA4, 0xa0c7,0x11d3,0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5);
 DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main,  0x5b11d51b, 0x2f4c,0x4452,0xbc,0xc3,0x09,0xf2,0xa1,0x16,0x0c,0xc0);
 DEFINE_GUID(ff_DXVA2_ModeHEVC_VLD_Main10,0x107af0e0, 0xef1a,0x4d19,0xab,0xa8,0x67,0xa1,0x63,0x07,0x3d,0x13);
+DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_Profile0,0x463707f8,0xa1d0,0x4585,0x87,0x6d,0x83,0xaa,0x6d,0x60,0xb8,0x9e);
+DEFINE_GUID(ff_DXVA2_ModeVP9_VLD_10bit_Profile2,0xa4c749ef,0x6ecf,0x48aa,0x84,0x48,0x50,0xa7,0xa1,0x16,0x5f,0xf7);
 DEFINE_GUID(ff_DXVA2_NoEncrypt,          0x1b81beD0, 0xa0c7,0x11d3,0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5);
 DEFINE_GUID(ff_GUID_NULL,                0x00000000, 0x0000,0x0000,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
 DEFINE_GUID(ff_IID_IDirectXVideoDecoderService, 0xfc51a551,0xd5e7,0x11d9,0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02);
@@ -66,6 +68,10 @@ static const int prof_hevc_main[]    = {FF_PROFILE_HEVC_MAIN,
                                         FF_PROFILE_UNKNOWN};
 static const int prof_hevc_main10[]  = {FF_PROFILE_HEVC_MAIN_10,
                                         FF_PROFILE_UNKNOWN};
+static const int prof_vp9_profile0[] = {FF_PROFILE_VP9_0,
+                                        FF_PROFILE_UNKNOWN};
+static const int prof_vp9_profile2[] = {FF_PROFILE_VP9_2,
+                                        FF_PROFILE_UNKNOWN};
 
 static const dxva_mode dxva_modes[] = {
     /* MPEG-2 */
@@ -88,6 +94,10 @@ static const dxva_mode dxva_modes[] = {
     { &ff_DXVA2_ModeHEVC_VLD_Main10, AV_CODEC_ID_HEVC, prof_hevc_main10 },
     { &ff_DXVA2_ModeHEVC_VLD_Main,   AV_CODEC_ID_HEVC, prof_hevc_main },
 
+    /* VP8/9 */
+    { &ff_DXVA2_ModeVP9_VLD_Profile0,       AV_CODEC_ID_VP9, prof_vp9_profile0 },
+    { &ff_DXVA2_ModeVP9_VLD_10bit_Profile2, AV_CODEC_ID_VP9, prof_vp9_profile2 },
+
     { NULL,                          0 },
 };
 
@@ -142,7 +152,7 @@ static int dxva_get_decoder_configuration(AVCodecContext *avctx,
 }
 
 #if CONFIG_D3D11VA
-static int d3d11va_validate_output(void *service, GUID guid, void *surface_format)
+static int d3d11va_validate_output(void *service, GUID guid, const void *surface_format)
 {
     HRESULT hr;
     BOOL is_supported = FALSE;
@@ -155,7 +165,7 @@ static int d3d11va_validate_output(void *service, GUID guid, void *surface_forma
 #endif
 
 #if CONFIG_DXVA2
-static int dxva2_validate_output(void *decoder_service, GUID guid, void *surface_format)
+static int dxva2_validate_output(void *decoder_service, GUID guid, const void *surface_format)
 {
     HRESULT hr;
     int ret = 0;
@@ -605,6 +615,8 @@ int ff_dxva2_common_frame_params(AVCodecContext *avctx,
     /* add surfaces based on number of possible refs */
     if (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_HEVC)
         num_surfaces += 16;
+    else if (avctx->codec_id == AV_CODEC_ID_VP9)
+        num_surfaces += 8;
     else
         num_surfaces += 2;
 
@@ -785,7 +797,7 @@ int ff_dxva2_commit_buffer(AVCodecContext *avctx,
     void     *dxva_data;
     unsigned dxva_size;
     int      result;
-    HRESULT hr;
+    HRESULT hr = 0;
 
 #if CONFIG_D3D11VA
     if (ff_dxva2_is_d3d11(avctx))
@@ -880,7 +892,7 @@ int ff_dxva2_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
 #if CONFIG_DXVA2
     DXVA2_DecodeBufferDesc          buffer2[4];
 #endif
-    DECODER_BUFFER_DESC             *buffer,*buffer_slice;
+    DECODER_BUFFER_DESC             *buffer = NULL, *buffer_slice = NULL;
     int result, runs = 0;
     HRESULT hr;
     unsigned type;
diff --git a/libavcodec/dxva2.h b/libavcodec/dxva2.h
index d940b47..22c9399 100644
--- a/libavcodec/dxva2.h
+++ b/libavcodec/dxva2.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2009 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 
 /**
  * This structure is used to provides the necessary configurations and data
- * to the DXVA2 Libav HWAccel implementation.
+ * to the DXVA2 FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  */
@@ -81,7 +81,7 @@ struct dxva_context {
     uint64_t workaround;
 
     /**
-     * Private to the Libav AVHWAccel implementation
+     * Private to the FFmpeg AVHWAccel implementation
      */
     unsigned report_id;
 };
diff --git a/libavcodec/dxva2_h264.c b/libavcodec/dxva2_h264.c
index 790e4a2..5b23b28 100644
--- a/libavcodec/dxva2_h264.c
+++ b/libavcodec/dxva2_h264.c
@@ -3,23 +3,25 @@
  *
  * copyright (c) 2009 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
+
 #include "dxva2_internal.h"
 #include "h264dec.h"
 #include "h264data.h"
@@ -99,7 +101,7 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
                                         ((sps->mb_aff &&
                                         (h->picture_structure == PICT_FRAME)) <<  1) |
                                         (sps->residual_color_transform_flag   <<  2) |
-                                        /* sp_for_switch_flag (not implemented by Libav) */
+                                        /* sp_for_switch_flag (not implemented by FFmpeg) */
                                         (0                                    <<  3) |
                                         (sps->chroma_format_idc               <<  4) |
                                         ((h->nal_ref_idc != 0)                <<  6) |
@@ -155,14 +157,14 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
     pp->deblocking_filter_control_present_flag = pps->deblocking_filter_parameters_present;
     pp->redundant_pic_cnt_present_flag= pps->redundant_pic_cnt_present;
     pp->Reserved8BitsB                = 0;
-    pp->slice_group_change_rate_minus1= 0;  /* XXX not implemented by Libav */
-    //pp->SliceGroupMap[810];               /* XXX not implemented by Libav */
+    pp->slice_group_change_rate_minus1= 0;  /* XXX not implemented by FFmpeg */
+    //pp->SliceGroupMap[810];               /* XXX not implemented by FFmpeg */
 }
 
 static void fill_scaling_lists(const AVCodecContext *avctx, AVDXVAContext *ctx, const H264Context *h, DXVA_Qmatrix_H264 *qm)
 {
-    unsigned i, j;
     const PPS *pps = h->ps.pps;
+    unsigned i, j;
     memset(qm, 0, sizeof(*qm));
     if (DXVA_CONTEXT_WORKAROUND(avctx, ctx) & FF_DXVA2_WORKAROUND_SCALING_LIST_ZIGZAG) {
         for (i = 0; i < 6; i++)
@@ -226,7 +228,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
 
     slice->first_mb_in_slice     = (sl->mb_y >> FIELD_OR_MBAFF_PICTURE(h)) * h->mb_width + sl->mb_x;
     slice->NumMbsForSlice        = 0; /* XXX it is set once we have all slices */
-    slice->BitOffsetToSliceData  = get_bits_count(&sl->gb);
+    slice->BitOffsetToSliceData  = get_bits_count(&sl->gb) - 8;
     slice->slice_type            = ff_h264_get_slice_type(sl);
     if (sl->slice_type_fixed)
         slice->slice_type += 5;
@@ -252,7 +254,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
                 else
                     index = get_refpic_index(pp, ff_dxva2_get_surface_index(avctx, ctx, r->f));
                 fill_picture_entry(&slice->RefPicList[list][i], index,
-                                   r->reference == PICT_BOTTOM_FIELD);
+                                   sl->ref_list[list][i].reference == PICT_BOTTOM_FIELD);
                 for (plane = 0; plane < 3; plane++) {
                     int w, o;
                     if (plane == 0 && sl->pwt.luma_weight_flag[list]) {
@@ -279,7 +281,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
             }
         }
     }
-    slice->slice_qs_delta    = 0; /* XXX not implemented by Libav */
+    slice->slice_qs_delta    = 0; /* XXX not implemented by FFmpeg */
     slice->slice_qp_delta    = sl->qscale - h->ps.pps->init_qp;
     slice->redundant_pic_cnt = sl->redundant_pic_count;
     if (sl->slice_type == AV_PICTURE_TYPE_B)
@@ -302,9 +304,9 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
     const H264Picture *current_picture = h->cur_pic_ptr;
     struct dxva2_picture_context *ctx_pic = current_picture->hwaccel_picture_private;
     DXVA_Slice_H264_Short *slice = NULL;
-    void     *dxva_data_ptr;
+    void     *dxva_data_ptr = NULL;
     uint8_t  *dxva_data, *current, *end;
-    unsigned dxva_size;
+    unsigned dxva_size = 0;
     void     *slice_data;
     unsigned slice_size;
     unsigned padding;
@@ -405,6 +407,8 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         dsc11->NumMBsInBuffer       = mb_count;
 
         type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
+
+        av_assert0((dsc11->DataSize & 127) == 0);
     }
 #endif
 #if CONFIG_DXVA2
@@ -416,6 +420,8 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         dsc2->NumMBsInBuffer       = mb_count;
 
         type = DXVA2_SliceControlBufferType;
+
+        av_assert0((dsc2->DataSize & 127) == 0);
     }
 #endif
 
@@ -426,7 +432,6 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         slice_data = ctx_pic->slice_long;
         slice_size = ctx_pic->slice_count * sizeof(*ctx_pic->slice_long);
     }
-    assert((bs->DataSize & 127) == 0);
     return ff_dxva2_commit_buffer(avctx, ctx, sc,
                                   type,
                                   slice_data, slice_size, mb_count);
diff --git a/libavcodec/dxva2_hevc.c b/libavcodec/dxva2_hevc.c
index d2d8080..dbb701f 100644
--- a/libavcodec/dxva2_hevc.c
+++ b/libavcodec/dxva2_hevc.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2014 - 2015 Hendrik Leppkes
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dxva2_internal.h b/libavcodec/dxva2_internal.h
index 42ff346..b822af5 100644
--- a/libavcodec/dxva2_internal.h
+++ b/libavcodec/dxva2_internal.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -104,9 +104,9 @@ typedef struct FFDXVASharedContext {
 #if CONFIG_D3D11VA && CONFIG_DXVA2
 #define DXVA_CONTEXT_WORKAROUND(avctx, ctx)     (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.workaround : ctx->dxva2.workaround)
 #define DXVA_CONTEXT_COUNT(avctx, ctx)          (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.surface_count : ctx->dxva2.surface_count)
-#define DXVA_CONTEXT_DECODER(avctx, ctx)        (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.decoder : ctx->dxva2.decoder)
+#define DXVA_CONTEXT_DECODER(avctx, ctx)        (ff_dxva2_is_d3d11(avctx) ? (void *)ctx->d3d11va.decoder : (void *)ctx->dxva2.decoder)
 #define DXVA_CONTEXT_REPORT_ID(avctx, ctx)      (*(ff_dxva2_is_d3d11(avctx) ? &ctx->d3d11va.report_id : &ctx->dxva2.report_id))
-#define DXVA_CONTEXT_CFG(avctx, ctx)            (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.cfg : ctx->dxva2.cfg)
+#define DXVA_CONTEXT_CFG(avctx, ctx)            (ff_dxva2_is_d3d11(avctx) ? (void *)ctx->d3d11va.cfg : (void *)ctx->dxva2.cfg)
 #define DXVA_CONTEXT_CFG_BITSTREAM(avctx, ctx)  (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.cfg->ConfigBitstreamRaw : ctx->dxva2.cfg->ConfigBitstreamRaw)
 #define DXVA_CONTEXT_CFG_INTRARESID(avctx, ctx) (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.cfg->ConfigIntraResidUnsigned : ctx->dxva2.cfg->ConfigIntraResidUnsigned)
 #define DXVA_CONTEXT_CFG_RESIDACCEL(avctx, ctx) (ff_dxva2_is_d3d11(avctx) ? ctx->d3d11va.cfg->ConfigResidDiffAccelerator : ctx->dxva2.cfg->ConfigResidDiffAccelerator)
diff --git a/libavcodec/dxva2_mpeg2.c b/libavcodec/dxva2_mpeg2.c
index 5cc8128..8cc21bf 100644
--- a/libavcodec/dxva2_mpeg2.c
+++ b/libavcodec/dxva2_mpeg2.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -113,10 +113,10 @@ static void fill_quantization_matrices(AVCodecContext *avctx,
         qm->bNewQmatrix[i] = 1;
     for (i = 0; i < 64; i++) {
         int n = s->idsp.idct_permutation[ff_zigzag_direct[i]];
-        qm->Qmatrix[0][i] = s->intra_matrix[n];;
-        qm->Qmatrix[1][i] = s->inter_matrix[n];;
-        qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];;
-        qm->Qmatrix[3][i] = s->chroma_inter_matrix[n];;
+        qm->Qmatrix[0][i] = s->intra_matrix[n];
+        qm->Qmatrix[1][i] = s->inter_matrix[n];
+        qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];
+        qm->Qmatrix[3][i] = s->chroma_inter_matrix[n];
     }
 }
 
@@ -143,8 +143,7 @@ static void fill_slice(AVCodecContext *avctx,
     init_get_bits(&gb, &buffer[4], 8 * (size - 4));
 
     slice->wQuantizerScaleCode = get_bits(&gb, 5);
-    while (get_bits1(&gb))
-        skip_bits(&gb, 8);
+    skip_1stop_8data_bits(&gb);
 
     slice->wMBbitOffset        = 4 * 8 + get_bits_count(&gb);
 }
diff --git a/libavcodec/dxva2_vc1.c b/libavcodec/dxva2_vc1.c
index fc09f88..f08ac8b 100644
--- a/libavcodec/dxva2_vc1.c
+++ b/libavcodec/dxva2_vc1.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,12 @@
 #include "vc1.h"
 #include "vc1data.h"
 
+#define MAX_SLICES 1024
+
 struct dxva2_picture_context {
     DXVA_PictureParameters pp;
-    DXVA_SliceInfo         si;
+    unsigned               slice_count;
+    DXVA_SliceInfo         slice[MAX_SLICES];
 
     const uint8_t          *bitstream;
     unsigned               bitstream_size;
@@ -39,6 +42,15 @@ static void fill_picture_parameters(AVCodecContext *avctx,
 {
     const MpegEncContext *s = &v->s;
     const Picture *current_picture = s->current_picture_ptr;
+    int intcomp = 0;
+
+    // determine if intensity compensation is needed
+    if (s->pict_type == AV_PICTURE_TYPE_P) {
+      if ((v->fcm == ILACE_FRAME && v->intcomp) || (v->fcm != ILACE_FRAME && v->mv_mode == MV_PMODE_INTENSITY_COMP)) {
+        if (v->lumscale != 32 || v->lumshift != 0 || (s->picture_structure != PICT_FRAME && (v->lumscale2 != 32 || v->lumshift2 != 0)))
+          intcomp = 1;
+      }
+    }
 
     memset(pp, 0, sizeof(*pp));
     pp->wDecodedPictureIndex    =
@@ -69,13 +81,13 @@ static void fill_picture_parameters(AVCodecContext *avctx,
         pp->bPicStructure      |= 0x01;
     if (s->picture_structure & PICT_BOTTOM_FIELD)
         pp->bPicStructure      |= 0x02;
-    pp->bSecondField            = v->interlace && v->fcm != ILACE_FIELD && !s->first_field;
+    pp->bSecondField            = v->interlace && v->fcm == ILACE_FIELD && v->second_field;
     pp->bPicIntra               = s->pict_type == AV_PICTURE_TYPE_I || v->bi_type;
     pp->bPicBackwardPrediction  = s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type;
     pp->bBidirectionalAveragingMode = (1                                           << 7) |
                                       ((DXVA_CONTEXT_CFG_INTRARESID(avctx, ctx) != 0) << 6) |
                                       ((DXVA_CONTEXT_CFG_RESIDACCEL(avctx, ctx) != 0) << 5) |
-                                      ((v->lumscale != 32 || v->lumshift != 0)     << 4) |
+                                      (intcomp                                     << 4) |
                                       ((v->profile == PROFILE_ADVANCED)            << 3);
     pp->bMVprecisionAndChromaRelation = ((v->mv_mode == MV_PMODE_1MV_HPEL_BILIN) << 3) |
                                         (1                                       << 2) |
@@ -123,15 +135,25 @@ static void fill_picture_parameters(AVCodecContext *avctx,
                                   (v->range_mapuv_flag << 3) |
                                   (v->range_mapuv          );
     pp->bPicBinPB               = 0;
-    pp->bMV_RPS                 = 0;
-    pp->bReservedBits           = 0;
+    pp->bMV_RPS                 = (v->fcm == ILACE_FIELD && pp->bPicBackwardPrediction) ? v->refdist + 9 : 0;
+    pp->bReservedBits           = v->pq;
     if (s->picture_structure == PICT_FRAME) {
-        pp->wBitstreamFcodes        = v->lumscale;
-        pp->wBitstreamPCEelements   = v->lumshift;
+        if (intcomp) {
+            pp->wBitstreamFcodes      = v->lumscale;
+            pp->wBitstreamPCEelements = v->lumshift;
+        } else {
+            pp->wBitstreamFcodes      = 32;
+            pp->wBitstreamPCEelements = 0;
+        }
     } else {
         /* Syntax: (top_field_param << 8) | bottom_field_param */
-        pp->wBitstreamFcodes        = (v->lumscale << 8) | v->lumscale;
-        pp->wBitstreamPCEelements   = (v->lumshift << 8) | v->lumshift;
+        if (intcomp) {
+            pp->wBitstreamFcodes      = (v->lumscale << 8) | v->lumscale2;
+            pp->wBitstreamPCEelements = (v->lumshift << 8) | v->lumshift2;
+        } else {
+            pp->wBitstreamFcodes      = (32 << 8) | 32;
+            pp->wBitstreamPCEelements = 0;
+        }
     }
     pp->bBitstreamConcealmentNeed   = 0;
     pp->bBitstreamConcealmentMethod = 0;
@@ -149,9 +171,10 @@ static void fill_slice(AVCodecContext *avctx, DXVA_SliceInfo *slice,
     slice->dwSliceBitsInBuffer = 8 * size;
     slice->dwSliceDataLocation = position;
     slice->bStartCodeBitOffset = 0;
-    slice->bReservedBits       = 0;
-    slice->wMBbitOffset        = get_bits_count(&s->gb);
-    slice->wNumberMBsInSlice   = s->mb_width * s->mb_height; /* XXX We assume 1 slice */
+    slice->bReservedBits       = (s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type) ? v->bfraction_lut_index + 9 : 0;
+    slice->wMBbitOffset        = v->p_frame_skipped ? 0xffff : get_bits_count(&s->gb) + (avctx->codec_id == AV_CODEC_ID_VC1 ? 32 : 0);
+    /* XXX We store the index of the first MB and it will be fixed later */
+    slice->wNumberMBsInSlice   = (s->mb_y >> v->field_mode) * s->mb_width + s->mb_x;
     slice->wQuantizerScaleCode = v->pq;
     slice->wBadSliceChopping   = 0;
 }
@@ -165,18 +188,15 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
     const MpegEncContext *s = &v->s;
     struct dxva2_picture_context *ctx_pic = s->current_picture_ptr->hwaccel_picture_private;
 
-    DXVA_SliceInfo *slice = &ctx_pic->si;
-
     static const uint8_t start_code[] = { 0, 0, 1, 0x0d };
     const unsigned start_code_size = avctx->codec_id == AV_CODEC_ID_VC1 ? sizeof(start_code) : 0;
-    const unsigned slice_size = slice->dwSliceBitsInBuffer / 8;
-    const unsigned padding = 128 - ((start_code_size + slice_size) & 127);
-    const unsigned data_size = start_code_size + slice_size + padding;
-
+    const unsigned mb_count = s->mb_width * (s->mb_height >> v->field_mode);
+    DXVA_SliceInfo *slice = NULL;
     void     *dxva_data_ptr;
-    uint8_t  *dxva_data;
+    uint8_t  *dxva_data, *current, *end;
     unsigned dxva_size;
-    int result;
+    unsigned padding;
+    unsigned i;
     unsigned type;
 
 #if CONFIG_D3D11VA
@@ -200,16 +220,49 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
 #endif
 
     dxva_data = dxva_data_ptr;
-    result = data_size <= dxva_size ? 0 : -1;
-    if (!result) {
-        if (start_code_size > 0)
-            memcpy(dxva_data, start_code, start_code_size);
-        memcpy(dxva_data + start_code_size,
-               ctx_pic->bitstream + slice->dwSliceDataLocation, slice_size);
-        if (padding > 0)
-            memset(dxva_data + start_code_size + slice_size, 0, padding);
-        slice->dwSliceBitsInBuffer = 8 * data_size;
+    current = dxva_data;
+    end = dxva_data + dxva_size;
+
+    for (i = 0; i < ctx_pic->slice_count; i++) {
+        unsigned position, size;
+        slice    = &ctx_pic->slice[i];
+        position = slice->dwSliceDataLocation;
+        size     = slice->dwSliceBitsInBuffer / 8;
+        if (start_code_size + size > end - current) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to build bitstream");
+            break;
+        }
+        slice->dwSliceDataLocation = current - dxva_data;
+
+        if (i < ctx_pic->slice_count - 1)
+            slice->wNumberMBsInSlice =
+                slice[1].wNumberMBsInSlice - slice[0].wNumberMBsInSlice;
+        else
+            slice->wNumberMBsInSlice =
+                mb_count - slice[0].wNumberMBsInSlice;
+
+        /* write the appropriate frame, field or slice start code */
+        if (start_code_size) {
+            memcpy(current, start_code, start_code_size);
+            if (i == 0 && v->second_field)
+                current[3] = 0x0c;
+            else if (i > 0)
+                current[3] = 0x0b;
+
+            current += start_code_size;
+            slice->dwSliceBitsInBuffer += start_code_size * 8;
+        }
+
+        memcpy(current, &ctx_pic->bitstream[position], size);
+        current += size;
+    }
+    padding = FFMIN(128 - ((current - dxva_data) & 127), end - current);
+    if (slice && padding > 0) {
+        memset(current, 0, padding);
+        current += padding;
+        slice->dwSliceBitsInBuffer += padding * 8;
     }
+
 #if CONFIG_D3D11VA
     if (ff_dxva2_is_d3d11(avctx))
         if (FAILED(ID3D11VideoContext_ReleaseDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context, D3D11VA_CONTEXT(ctx)->decoder, type)))
@@ -220,16 +273,16 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         if (FAILED(IDirectXVideoDecoder_ReleaseBuffer(DXVA2_CONTEXT(ctx)->decoder, type)))
             return -1;
 #endif
-    if (result)
-        return result;
+    if (i < ctx_pic->slice_count)
+        return -1;
 
 #if CONFIG_D3D11VA
     if (ff_dxva2_is_d3d11(avctx)) {
         D3D11_VIDEO_DECODER_BUFFER_DESC *dsc11 = bs;
         memset(dsc11, 0, sizeof(*dsc11));
         dsc11->BufferType           = type;
-        dsc11->DataSize             = data_size;
-        dsc11->NumMBsInBuffer       = s->mb_width * s->mb_height;
+        dsc11->DataSize             = current - dxva_data;
+        dsc11->NumMBsInBuffer       = mb_count;
 
         type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
     }
@@ -239,17 +292,18 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         DXVA2_DecodeBufferDesc *dsc2 = bs;
         memset(dsc2, 0, sizeof(*dsc2));
         dsc2->CompressedBufferType = type;
-        dsc2->DataSize             = data_size;
-        dsc2->NumMBsInBuffer       = s->mb_width * s->mb_height;
+        dsc2->DataSize             = current - dxva_data;
+        dsc2->NumMBsInBuffer       = mb_count;
 
         type = DXVA2_SliceControlBufferType;
     }
 #endif
-    assert((data_size & 127) == 0);
 
     return ff_dxva2_commit_buffer(avctx, ctx, sc,
                                   type,
-                                  slice, sizeof(*slice), s->mb_width * s->mb_height);
+                                  ctx_pic->slice,
+                                  ctx_pic->slice_count * sizeof(*ctx_pic->slice),
+                                  mb_count);
 }
 
 static int dxva2_vc1_start_frame(AVCodecContext *avctx,
@@ -266,6 +320,7 @@ static int dxva2_vc1_start_frame(AVCodecContext *avctx,
 
     fill_picture_parameters(avctx, ctx, v, &ctx_pic->pp);
 
+    ctx_pic->slice_count    = 0;
     ctx_pic->bitstream_size = 0;
     ctx_pic->bitstream      = NULL;
     return 0;
@@ -278,9 +333,13 @@ static int dxva2_vc1_decode_slice(AVCodecContext *avctx,
     const VC1Context *v = avctx->priv_data;
     const Picture *current_picture = v->s.current_picture_ptr;
     struct dxva2_picture_context *ctx_pic = current_picture->hwaccel_picture_private;
+    unsigned position;
 
-    if (ctx_pic->bitstream_size > 0)
+    if (ctx_pic->slice_count >= MAX_SLICES) {
+        avpriv_request_sample(avctx, "%d slices in dxva2",
+                              ctx_pic->slice_count);
         return -1;
+    }
 
     if (avctx->codec_id == AV_CODEC_ID_VC1 &&
         size >= 4 && IS_MARKER(AV_RB32(buffer))) {
@@ -288,10 +347,12 @@ static int dxva2_vc1_decode_slice(AVCodecContext *avctx,
         size   -= 4;
     }
 
-    ctx_pic->bitstream_size = size;
-    ctx_pic->bitstream      = buffer;
+    if (!ctx_pic->bitstream)
+        ctx_pic->bitstream = buffer;
+    ctx_pic->bitstream_size += size;
 
-    fill_slice(avctx, &ctx_pic->si, 0, size);
+    position = buffer - ctx_pic->bitstream;
+    fill_slice(avctx, &ctx_pic->slice[ctx_pic->slice_count++], position, size);
     return 0;
 }
 
@@ -301,7 +362,7 @@ static int dxva2_vc1_end_frame(AVCodecContext *avctx)
     struct dxva2_picture_context *ctx_pic = v->s.current_picture_ptr->hwaccel_picture_private;
     int ret;
 
-    if (ctx_pic->bitstream_size <= 0)
+    if (ctx_pic->slice_count <= 0 || ctx_pic->bitstream_size <= 0)
         return -1;
 
     ret = ff_dxva2_common_end_frame(avctx, v->s.current_picture_ptr->f,
@@ -409,6 +470,7 @@ const AVHWAccel ff_vc1_d3d11va2_hwaccel = {
     .start_frame    = dxva2_vc1_start_frame,
     .decode_slice   = dxva2_vc1_decode_slice,
     .end_frame      = dxva2_vc1_end_frame,
+    .frame_params   = ff_dxva2_common_frame_params,
     .frame_priv_data_size = sizeof(struct dxva2_picture_context),
     .priv_data_size = sizeof(FFDXVASharedContext),
 };
diff --git a/libavcodec/dxva2_vp9.c b/libavcodec/dxva2_vp9.c
new file mode 100644
index 0000000..eaeab3a
--- /dev/null
+++ b/libavcodec/dxva2_vp9.c
@@ -0,0 +1,356 @@
+/*
+ * DXVA2 VP9 HW acceleration.
+ *
+ * copyright (c) 2015 Hendrik Leppkes
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+
+#include "dxva2_internal.h"
+#include "vp9shared.h"
+
+struct vp9_dxva2_picture_context {
+    DXVA_PicParams_VP9    pp;
+    DXVA_Slice_VPx_Short  slice;
+    const uint8_t         *bitstream;
+    unsigned              bitstream_size;
+};
+
+static void fill_picture_entry(DXVA_PicEntry_VPx *pic,
+                               unsigned index, unsigned flag)
+{
+    av_assert0((index & 0x7f) == index && (flag & 0x01) == flag);
+    pic->bPicEntry = index | (flag << 7);
+}
+
+static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *ctx, const VP9SharedContext *h,
+                                    DXVA_PicParams_VP9 *pp)
+{
+    int i;
+    const AVPixFmtDescriptor * pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+
+    if (!pixdesc)
+        return -1;
+
+    memset(pp, 0, sizeof(*pp));
+
+    fill_picture_entry(&pp->CurrPic, ff_dxva2_get_surface_index(avctx, ctx, h->frames[CUR_FRAME].tf.f), 0);
+
+    pp->profile = h->h.profile;
+    pp->wFormatAndPictureInfoFlags = ((h->h.keyframe == 0)   <<  0) |
+                                     ((h->h.invisible == 0)  <<  1) |
+                                     (h->h.errorres          <<  2) |
+                                     (pixdesc->log2_chroma_w <<  3) | /* subsampling_x */
+                                     (pixdesc->log2_chroma_h <<  4) | /* subsampling_y */
+                                     (0                      <<  5) | /* extra_plane */
+                                     (h->h.refreshctx        <<  6) |
+                                     (h->h.parallelmode      <<  7) |
+                                     (h->h.intraonly         <<  8) |
+                                     (h->h.framectxid        <<  9) |
+                                     (h->h.resetctx          << 11) |
+                                     ((h->h.keyframe ? 0 : h->h.highprecisionmvs) << 13) |
+                                     (0                      << 14);  /* ReservedFormatInfo2Bits */
+
+    pp->width  = avctx->width;
+    pp->height = avctx->height;
+    pp->BitDepthMinus8Luma   = pixdesc->comp[0].depth - 8;
+    pp->BitDepthMinus8Chroma = pixdesc->comp[1].depth - 8;
+    /* swap 0/1 to match the reference */
+    pp->interp_filter = h->h.filtermode ^ (h->h.filtermode <= 1);
+    pp->Reserved8Bits = 0;
+
+    for (i = 0; i < 8; i++) {
+        if (h->refs[i].f->buf[0]) {
+            fill_picture_entry(&pp->ref_frame_map[i], ff_dxva2_get_surface_index(avctx, ctx, h->refs[i].f), 0);
+            pp->ref_frame_coded_width[i]  = h->refs[i].f->width;
+            pp->ref_frame_coded_height[i] = h->refs[i].f->height;
+        } else
+            pp->ref_frame_map[i].bPicEntry = 0xFF;
+    }
+
+    for (i = 0; i < 3; i++) {
+        uint8_t refidx = h->h.refidx[i];
+        if (h->refs[refidx].f->buf[0])
+            fill_picture_entry(&pp->frame_refs[i], ff_dxva2_get_surface_index(avctx, ctx, h->refs[refidx].f), 0);
+        else
+            pp->frame_refs[i].bPicEntry = 0xFF;
+
+        pp->ref_frame_sign_bias[i + 1] = h->h.signbias[i];
+    }
+
+    pp->filter_level    = h->h.filter.level;
+    pp->sharpness_level = h->h.filter.sharpness;
+
+    pp->wControlInfoFlags = (h->h.lf_delta.enabled   << 0) |
+                            (h->h.lf_delta.updated   << 1) |
+                            (h->h.use_last_frame_mvs << 2) |
+                            (0                       << 3);  /* ReservedControlInfo5Bits */
+
+    for (i = 0; i < 4; i++)
+        pp->ref_deltas[i]  = h->h.lf_delta.ref[i];
+
+    for (i = 0; i < 2; i++)
+        pp->mode_deltas[i]  = h->h.lf_delta.mode[i];
+
+    pp->base_qindex   = h->h.yac_qi;
+    pp->y_dc_delta_q  = h->h.ydc_qdelta;
+    pp->uv_dc_delta_q = h->h.uvdc_qdelta;
+    pp->uv_ac_delta_q = h->h.uvac_qdelta;
+
+    /* segmentation data */
+    pp->stVP9Segments.wSegmentInfoFlags = (h->h.segmentation.enabled       << 0) |
+                                          (h->h.segmentation.update_map    << 1) |
+                                          (h->h.segmentation.temporal      << 2) |
+                                          (h->h.segmentation.absolute_vals << 3) |
+                                          (0                               << 4);  /* ReservedSegmentFlags4Bits */
+
+    for (i = 0; i < 7; i++)
+        pp->stVP9Segments.tree_probs[i] = h->h.segmentation.prob[i];
+
+    if (h->h.segmentation.temporal)
+        for (i = 0; i < 3; i++)
+            pp->stVP9Segments.pred_probs[i] = h->h.segmentation.pred_prob[i];
+    else
+        memset(pp->stVP9Segments.pred_probs, 255, sizeof(pp->stVP9Segments.pred_probs));
+
+    for (i = 0; i < 8; i++) {
+        pp->stVP9Segments.feature_mask[i] = (h->h.segmentation.feat[i].q_enabled    << 0) |
+                                            (h->h.segmentation.feat[i].lf_enabled   << 1) |
+                                            (h->h.segmentation.feat[i].ref_enabled  << 2) |
+                                            (h->h.segmentation.feat[i].skip_enabled << 3);
+
+        pp->stVP9Segments.feature_data[i][0] = h->h.segmentation.feat[i].q_val;
+        pp->stVP9Segments.feature_data[i][1] = h->h.segmentation.feat[i].lf_val;
+        pp->stVP9Segments.feature_data[i][2] = h->h.segmentation.feat[i].ref_val;
+        pp->stVP9Segments.feature_data[i][3] = 0; /* no data for skip */
+    }
+
+    pp->log2_tile_cols = h->h.tiling.log2_tile_cols;
+    pp->log2_tile_rows = h->h.tiling.log2_tile_rows;
+
+    pp->uncompressed_header_size_byte_aligned = h->h.uncompressed_header_size;
+    pp->first_partition_size = h->h.compressed_header_size;
+
+    pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++;
+    return 0;
+}
+
+static void fill_slice_short(DXVA_Slice_VPx_Short *slice,
+                             unsigned position, unsigned size)
+{
+    memset(slice, 0, sizeof(*slice));
+    slice->BSNALunitDataLocation = position;
+    slice->SliceBytesInBuffer    = size;
+    slice->wBadSliceChopping     = 0;
+}
+
+static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
+                                             DECODER_BUFFER_DESC *bs,
+                                             DECODER_BUFFER_DESC *sc)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
+    struct vp9_dxva2_picture_context *ctx_pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+    void     *dxva_data_ptr;
+    uint8_t  *dxva_data;
+    unsigned dxva_size;
+    unsigned padding;
+    unsigned type;
+
+#if CONFIG_D3D11VA
+    if (ff_dxva2_is_d3d11(avctx)) {
+        type = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
+        if (FAILED(ID3D11VideoContext_GetDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context,
+                                                       D3D11VA_CONTEXT(ctx)->decoder,
+                                                       type,
+                                                       &dxva_size, &dxva_data_ptr)))
+            return -1;
+    }
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
+        type = DXVA2_BitStreamDateBufferType;
+        if (FAILED(IDirectXVideoDecoder_GetBuffer(DXVA2_CONTEXT(ctx)->decoder,
+                                                  type,
+                                                  &dxva_data_ptr, &dxva_size)))
+            return -1;
+    }
+#endif
+
+    dxva_data = dxva_data_ptr;
+
+    if (ctx_pic->slice.SliceBytesInBuffer > dxva_size) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to build bitstream");
+        return -1;
+    }
+
+    memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->slice.SliceBytesInBuffer);
+
+    padding = FFMIN(128 - ((ctx_pic->slice.SliceBytesInBuffer) & 127), dxva_size - ctx_pic->slice.SliceBytesInBuffer);
+    if (padding > 0) {
+        memset(dxva_data + ctx_pic->slice.SliceBytesInBuffer, 0, padding);
+        ctx_pic->slice.SliceBytesInBuffer += padding;
+    }
+
+#if CONFIG_D3D11VA
+    if (ff_dxva2_is_d3d11(avctx))
+        if (FAILED(ID3D11VideoContext_ReleaseDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context, D3D11VA_CONTEXT(ctx)->decoder, type)))
+            return -1;
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD)
+        if (FAILED(IDirectXVideoDecoder_ReleaseBuffer(DXVA2_CONTEXT(ctx)->decoder, type)))
+            return -1;
+#endif
+
+#if CONFIG_D3D11VA
+    if (ff_dxva2_is_d3d11(avctx)) {
+        D3D11_VIDEO_DECODER_BUFFER_DESC *dsc11 = bs;
+        memset(dsc11, 0, sizeof(*dsc11));
+        dsc11->BufferType           = type;
+        dsc11->DataSize             = ctx_pic->slice.SliceBytesInBuffer;
+        dsc11->NumMBsInBuffer       = 0;
+
+        type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
+    }
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
+        DXVA2_DecodeBufferDesc *dsc2 = bs;
+        memset(dsc2, 0, sizeof(*dsc2));
+        dsc2->CompressedBufferType = type;
+        dsc2->DataSize             = ctx_pic->slice.SliceBytesInBuffer;
+        dsc2->NumMBsInBuffer       = 0;
+
+        type = DXVA2_SliceControlBufferType;
+    }
+#endif
+
+    return ff_dxva2_commit_buffer(avctx, ctx, sc,
+                                  type,
+                                  &ctx_pic->slice, sizeof(ctx_pic->slice), 0);
+}
+
+
+static int dxva2_vp9_start_frame(AVCodecContext *avctx,
+                                 av_unused const uint8_t *buffer,
+                                 av_unused uint32_t size)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
+    struct vp9_dxva2_picture_context *ctx_pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+
+    if (!DXVA_CONTEXT_VALID(avctx, ctx))
+        return -1;
+    av_assert0(ctx_pic);
+
+    /* Fill up DXVA_PicParams_VP9 */
+    if (fill_picture_parameters(avctx, ctx, h, &ctx_pic->pp) < 0)
+        return -1;
+
+    ctx_pic->bitstream_size = 0;
+    ctx_pic->bitstream      = NULL;
+    return 0;
+}
+
+static int dxva2_vp9_decode_slice(AVCodecContext *avctx,
+                                  const uint8_t *buffer,
+                                  uint32_t size)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    struct vp9_dxva2_picture_context *ctx_pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+    unsigned position;
+
+    if (!ctx_pic->bitstream)
+        ctx_pic->bitstream = buffer;
+    ctx_pic->bitstream_size += size;
+
+    position = buffer - ctx_pic->bitstream;
+    fill_slice_short(&ctx_pic->slice, position, size);
+
+    return 0;
+}
+
+static int dxva2_vp9_end_frame(AVCodecContext *avctx)
+{
+    VP9SharedContext *h = avctx->priv_data;
+    struct vp9_dxva2_picture_context *ctx_pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+    int ret;
+
+    if (ctx_pic->bitstream_size <= 0)
+        return -1;
+
+    ret = ff_dxva2_common_end_frame(avctx, h->frames[CUR_FRAME].tf.f,
+                                    &ctx_pic->pp, sizeof(ctx_pic->pp),
+                                    NULL, 0,
+                                    commit_bitstream_and_slice_buffer);
+    return ret;
+}
+
+#if CONFIG_VP9_DXVA2_HWACCEL
+const AVHWAccel ff_vp9_dxva2_hwaccel = {
+    .name           = "vp9_dxva2",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VP9,
+    .pix_fmt        = AV_PIX_FMT_DXVA2_VLD,
+    .init           = ff_dxva2_decode_init,
+    .uninit         = ff_dxva2_decode_uninit,
+    .start_frame    = dxva2_vp9_start_frame,
+    .decode_slice   = dxva2_vp9_decode_slice,
+    .end_frame      = dxva2_vp9_end_frame,
+    .frame_params   = ff_dxva2_common_frame_params,
+    .frame_priv_data_size = sizeof(struct vp9_dxva2_picture_context),
+    .priv_data_size = sizeof(FFDXVASharedContext),
+};
+#endif
+
+#if CONFIG_VP9_D3D11VA_HWACCEL
+const AVHWAccel ff_vp9_d3d11va_hwaccel = {
+    .name           = "vp9_d3d11va",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VP9,
+    .pix_fmt        = AV_PIX_FMT_D3D11VA_VLD,
+    .init           = ff_dxva2_decode_init,
+    .uninit         = ff_dxva2_decode_uninit,
+    .start_frame    = dxva2_vp9_start_frame,
+    .decode_slice   = dxva2_vp9_decode_slice,
+    .end_frame      = dxva2_vp9_end_frame,
+    .frame_params   = ff_dxva2_common_frame_params,
+    .frame_priv_data_size = sizeof(struct vp9_dxva2_picture_context),
+    .priv_data_size = sizeof(FFDXVASharedContext),
+};
+#endif
+
+#if CONFIG_VP9_D3D11VA2_HWACCEL
+const AVHWAccel ff_vp9_d3d11va2_hwaccel = {
+    .name           = "vp9_d3d11va2",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VP9,
+    .pix_fmt        = AV_PIX_FMT_D3D11,
+    .init           = ff_dxva2_decode_init,
+    .uninit         = ff_dxva2_decode_uninit,
+    .start_frame    = dxva2_vp9_start_frame,
+    .decode_slice   = dxva2_vp9_decode_slice,
+    .end_frame      = dxva2_vp9_end_frame,
+    .frame_params   = ff_dxva2_common_frame_params,
+    .frame_priv_data_size = sizeof(struct vp9_dxva2_picture_context),
+    .priv_data_size = sizeof(FFDXVASharedContext),
+};
+#endif
diff --git a/libavcodec/eac3_core_bsf.c b/libavcodec/eac3_core_bsf.c
new file mode 100644
index 0000000..3e4dc2e
--- /dev/null
+++ b/libavcodec/eac3_core_bsf.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "get_bits.h"
+#include "ac3_parser_internal.h"
+
+static int eac3_core_filter(AVBSFContext *ctx, AVPacket *pkt)
+{
+    AC3HeaderInfo hdr;
+    GetBitContext gbc;
+    int ret;
+
+    ret = ff_bsf_get_packet_ref(ctx, pkt);
+    if (ret < 0)
+        return ret;
+    ret = init_get_bits8(&gbc, pkt->data, pkt->size);
+    if (ret < 0)
+        goto fail;
+
+    ret = ff_ac3_parse_header(&gbc, &hdr);
+    if (ret < 0) {
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (hdr.frame_type == EAC3_FRAME_TYPE_INDEPENDENT ||
+        hdr.frame_type == EAC3_FRAME_TYPE_AC3_CONVERT) {
+        pkt->size = FFMIN(hdr.frame_size, pkt->size);
+    } else if (hdr.frame_type == EAC3_FRAME_TYPE_DEPENDENT && pkt->size > hdr.frame_size) {
+        AC3HeaderInfo hdr2;
+
+        ret = init_get_bits8(&gbc, pkt->data + hdr.frame_size, pkt->size - hdr.frame_size);
+        if (ret < 0)
+            goto fail;
+
+        ret = ff_ac3_parse_header(&gbc, &hdr2);
+        if (ret < 0) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        if (hdr2.frame_type == EAC3_FRAME_TYPE_INDEPENDENT ||
+            hdr2.frame_type == EAC3_FRAME_TYPE_AC3_CONVERT) {
+            pkt->size -= hdr.frame_size;
+            pkt->data += hdr.frame_size;
+        } else {
+            pkt->size = 0;
+        }
+    } else {
+        pkt->size = 0;
+    }
+
+    return 0;
+fail:
+    av_packet_unref(pkt);
+    return ret;
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_EAC3, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_eac3_core_bsf = {
+    .name      = "eac3_core",
+    .filter    = eac3_core_filter,
+    .codec_ids = codec_ids,
+};
diff --git a/libavcodec/eac3_data.c b/libavcodec/eac3_data.c
index b0416f3..b159e16 100644
--- a/libavcodec/eac3_data.c
+++ b/libavcodec/eac3_data.c
@@ -2,20 +2,20 @@
  * E-AC-3 tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eac3_data.h b/libavcodec/eac3_data.h
index 4d88ce0..10a67f1 100644
--- a/libavcodec/eac3_data.h
+++ b/libavcodec/eac3_data.h
@@ -2,20 +2,20 @@
  * E-AC-3 tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eac3dec.c b/libavcodec/eac3dec.c
index 89db7d3..3a5c798 100644
--- a/libavcodec/eac3dec.c
+++ b/libavcodec/eac3dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  * Copyright (c) 2008 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,12 +31,6 @@
  *     No known samples exist.  The spec also does not give clear information
  *     on how this is to be implemented.
  *
- * Dependent Streams
- *     Only the independent stream is currently decoded. Any dependent
- *     streams are skipped.  We have only come across two examples of this, and
- *     they are both just test streams, one for HD-DVD and the other for
- *     Blu-ray.
- *
  * Transient Pre-noise Processing
  *     This is side information which a decoder should use to reduce artifacts
  *     caused by transients.  There are samples which are known to have this
@@ -62,7 +56,7 @@ typedef enum {
 
 #define EAC3_SR_CODE_REDUCED  3
 
-void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
+static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
 {
     int bin, bnd, ch, i;
     uint8_t wrapflag[SPX_MAX_BANDS]={1,0,}, num_copy_sections, copy_sizes[SPX_MAX_BANDS];
@@ -100,7 +94,7 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
         for (i = 0; i < num_copy_sections; i++) {
             memcpy(&s->transform_coeffs[ch][bin],
                    &s->transform_coeffs[ch][s->spx_dst_start_freq],
-                   copy_sizes[i]*sizeof(float));
+                   copy_sizes[i]*sizeof(INTFLOAT));
             bin += copy_sizes[i];
         }
 
@@ -123,7 +117,7 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
             bin = s->spx_src_start_freq - 2;
             for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
                 if (wrapflag[bnd]) {
-                    float *coeffs = &s->transform_coeffs[ch][bin];
+                    INTFLOAT *coeffs = &s->transform_coeffs[ch][bin];
                     coeffs[0] *= atten_tab[0];
                     coeffs[1] *= atten_tab[1];
                     coeffs[2] *= atten_tab[2];
@@ -141,6 +135,11 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
         for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
             float nscale = s->spx_noise_blend[ch][bnd] * rms_energy[bnd] * (1.0f / INT32_MIN);
             float sscale = s->spx_signal_blend[ch][bnd];
+#if USE_FIXED
+            // spx_noise_blend and spx_signal_blend are both FP.23
+            nscale *= 1.0 / (1<<23);
+            sscale *= 1.0 / (1<<23);
+#endif
             for (i = 0; i < s->spx_band_sizes[bnd]; i++) {
                 float noise  = nscale * (int32_t)av_lfg_get(&s->dith_state);
                 s->transform_coeffs[ch][bin]   *= sscale;
@@ -194,7 +193,7 @@ static void idct6(int pre_mant[6])
     pre_mant[5] = even0 - odd0;
 }
 
-void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
+static void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
 {
     int bin, blk, gs;
     int end_bap, gaq_mode;
@@ -246,7 +245,7 @@ void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
             /* Vector Quantization */
             int v = get_bits(gbc, bits);
             for (blk = 0; blk < 6; blk++) {
-                s->pre_mantissa[ch][bin][blk] = ff_eac3_mantissa_vq[hebap][v][blk] << 8;
+                s->pre_mantissa[ch][bin][blk] = ff_eac3_mantissa_vq[hebap][v][blk] * (1 << 8);
             }
         } else {
             /* Gain Adaptive Quantization */
@@ -265,16 +264,16 @@ void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
                     int b;
                     int mbits = bits - (2 - log_gain);
                     mant = get_sbits(gbc, mbits);
-                    mant <<= (23 - (mbits - 1));
+                    mant = ((unsigned)mant) << (23 - (mbits - 1));
                     /* remap mantissa value to correct for asymmetric quantization */
                     if (mant >= 0)
                         b = 1 << (23 - log_gain);
                     else
-                        b = ff_eac3_gaq_remap_2_4_b[hebap-8][log_gain-1] << 8;
+                        b = ff_eac3_gaq_remap_2_4_b[hebap-8][log_gain-1] * (1 << 8);
                     mant += ((ff_eac3_gaq_remap_2_4_a[hebap-8][log_gain-1] * (int64_t)mant) >> 15) + b;
                 } else {
                     /* small mantissa, no GAQ, or Gk=1 */
-                    mant <<= 24 - bits;
+                    mant *= (1 << 24 - bits);
                     if (!log_gain) {
                         /* remap mantissa value for no GAQ or Gk=1 */
                         mant += (ff_eac3_gaq_remap_1[hebap-8] * (int64_t)mant) >> 15;
@@ -287,7 +286,7 @@ void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
     }
 }
 
-int ff_eac3_parse_header(AC3DecodeContext *s)
+static int ff_eac3_parse_header(AC3DecodeContext *s)
 {
     int i, blk, ch;
     int ac3_exponent_strategy, parse_aht_info, parse_spx_atten_data;
@@ -298,13 +297,7 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
     /* An E-AC-3 stream can have multiple independent streams which the
        application can select from. each independent stream can also contain
        dependent streams which are used to add or replace channels. */
-    if (s->frame_type == EAC3_FRAME_TYPE_DEPENDENT) {
-        if (!s->eac3_frame_dependent_found) {
-            s->eac3_frame_dependent_found = 1;
-            avpriv_request_sample(s->avctx, "Dependent substream decoding");
-        }
-        return AAC_AC3_PARSE_ERROR_FRAME_TYPE;
-    } else if (s->frame_type == EAC3_FRAME_TYPE_RESERVED) {
+    if (s->frame_type == EAC3_FRAME_TYPE_RESERVED) {
         av_log(s->avctx, AV_LOG_ERROR, "Reserved frame type\n");
         return AAC_AC3_PARSE_ERROR_FRAME_TYPE;
     }
@@ -333,16 +326,35 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
 
     /* volume control params */
     for (i = 0; i < (s->channel_mode ? 1 : 2); i++) {
-        skip_bits(gbc, 5); // skip dialog normalization
-        if (get_bits1(gbc)) {
-            skip_bits(gbc, 8); // skip compression gain word
+        s->dialog_normalization[i] = -get_bits(gbc, 5);
+        if (s->dialog_normalization[i] == 0) {
+            s->dialog_normalization[i] = -31;
+        }
+        if (s->target_level != 0) {
+            s->level_gain[i] = powf(2.0f,
+                (float)(s->target_level - s->dialog_normalization[i])/6.0f);
+        }
+        s->compression_exists[i] = get_bits1(gbc);
+        if (s->compression_exists[i]) {
+            s->heavy_dynamic_range[i] = AC3_HEAVY_RANGE(get_bits(gbc, 8));
         }
     }
 
     /* dependent stream channel map */
     if (s->frame_type == EAC3_FRAME_TYPE_DEPENDENT) {
         if (get_bits1(gbc)) {
-            skip_bits(gbc, 16); // skip custom channel map
+            int64_t channel_layout = 0;
+            int channel_map = get_bits(gbc, 16);
+            av_log(s->avctx, AV_LOG_DEBUG, "channel_map: %0X\n", channel_map);
+
+            for (i = 0; i < 16; i++)
+                if (channel_map & (1 << (EAC3_MAX_CHANNELS - i - 1)))
+                    channel_layout |= ff_eac3_custom_channel_map_locations[i][1];
+
+            if (av_popcount64(channel_layout) > EAC3_MAX_CHANNELS) {
+                return AVERROR_INVALIDDATA;
+            }
+            s->channel_map = channel_map;
         }
     }
 
diff --git a/libavcodec/eac3enc.c b/libavcodec/eac3enc.c
index 17757be..e1d61f6 100644
--- a/libavcodec/eac3enc.c
+++ b/libavcodec/eac3enc.c
@@ -2,20 +2,20 @@
  * E-AC-3 encoder
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,7 +42,6 @@ static const AVClass eac3enc_class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-
 /**
  * LUT for finding a matching frame exponent strategy index from a set of
  * exponent strategies for a single channel across all 6 blocks.
diff --git a/libavcodec/eac3enc.h b/libavcodec/eac3enc.h
index a92a24c..7d61559 100644
--- a/libavcodec/eac3enc.h
+++ b/libavcodec/eac3enc.h
@@ -2,20 +2,20 @@
  * E-AC-3 encoder
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eacmv.c b/libavcodec/eacmv.c
index 633c26a..6f39d72 100644
--- a/libavcodec/eacmv.c
+++ b/libavcodec/eacmv.c
@@ -2,20 +2,20 @@
  * Electronic Arts CMV Video Decoder
  * Copyright (c) 2007-2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -44,6 +44,7 @@ typedef struct CmvContext {
 
 static av_cold int cmv_decode_init(AVCodecContext *avctx){
     CmvContext *s = avctx->priv_data;
+
     s->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
@@ -160,7 +161,7 @@ static int cmv_process_header(CmvContext *s, const uint8_t *buf, const uint8_t *
 
     buf += 16;
     for (i=pal_start; i<pal_start+pal_count && i<AVPALETTE_COUNT && buf_end - buf >= 3; i++) {
-        s->palette[i] = AV_RB24(buf);
+        s->palette[i] = 0xFFU << 24 | AV_RB24(buf);
         buf += 3;
     }
 
@@ -185,19 +186,20 @@ static int cmv_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
 
     if (AV_RL32(buf)==MVIh_TAG||AV_RB32(buf)==MVIh_TAG) {
+        unsigned size = AV_RL32(buf + 4);
         ret = cmv_process_header(s, buf+EA_PREAMBLE_SIZE, buf_end);
         if (ret < 0)
             return ret;
-        return buf_size;
+        if (size > buf_end - buf - EA_PREAMBLE_SIZE)
+            return AVERROR_INVALIDDATA;
+        buf += size;
     }
 
-    if (av_image_check_size(s->width, s->height, 0, s->avctx))
-        return -1;
+    if ((ret = av_image_check_size(s->width, s->height, 0, s->avctx)) < 0)
+        return ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     memcpy(frame->data[1], s->palette, AVPALETTE_SIZE);
 
diff --git a/libavcodec/eaidct.c b/libavcodec/eaidct.c
index 271e28c..038ee2a 100644
--- a/libavcodec/eaidct.c
+++ b/libavcodec/eaidct.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGQ/TQI/MAD IDCT algorithm
  * Copyright (c) 2007-2008 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eaidct.h b/libavcodec/eaidct.h
index 395a8ae..a46dae9 100644
--- a/libavcodec/eaidct.h
+++ b/libavcodec/eaidct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 0f8a4d0..7f28abb 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -2,20 +2,20 @@
  * Electronic Arts Madcow Video Decoder
  * Copyright (c) 2007-2009 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -29,17 +29,16 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
 #include "bytestream.h"
 #include "bswapdsp.h"
+#include "get_bits.h"
 #include "aandcttab.h"
 #include "eaidct.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mpeg12data.h"
 #include "mpeg12vlc.h"
-#include "vlc.h"
 
 #define EA_PREAMBLE_SIZE    8
 #define MADk_TAG MKTAG('M', 'A', 'D', 'k')    /* MAD I-frame */
@@ -52,10 +51,10 @@ typedef struct MadContext {
     BswapDSPContext bbdsp;
     IDCTDSPContext idsp;
     AVFrame *last_frame;
-    BitstreamContext bc;
+    GetBitContext gb;
     void *bitstream_buf;
     unsigned int bitstream_buf_size;
-    DECLARE_ALIGNED(16, int16_t, block)[64];
+    DECLARE_ALIGNED(32, int16_t, block)[64];
     ScanTable scantable;
     uint16_t quant_matrix[64];
     int mb_x;
@@ -67,7 +66,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     MadContext *s = avctx->priv_data;
     s->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-    ff_blockdsp_init(&s->bdsp);
+    ff_blockdsp_init(&s->bdsp, avctx);
     ff_bswapdsp_init(&s->bbdsp);
     ff_idctdsp_init(&s->idsp, avctx);
     ff_init_scantable_permutation(s->idsp.idct_permutation, FF_IDCT_PERM_NONE);
@@ -95,15 +94,21 @@ static inline void comp_block(MadContext *t, AVFrame *frame,
                               int j, int mv_x, int mv_y, int add)
 {
     if (j < 4) {
+        unsigned offset = (mb_y*16 + ((j&2)<<2) + mv_y)*t->last_frame->linesize[0] + mb_x*16 + ((j&1)<<3) + mv_x;
+        if (offset >= (t->avctx->height - 7) * t->last_frame->linesize[0] - 7)
+            return;
         comp(frame->data[0] + (mb_y*16 + ((j&2)<<2))*frame->linesize[0] + mb_x*16 + ((j&1)<<3),
              frame->linesize[0],
-             t->last_frame->data[0] + (mb_y*16 + ((j&2)<<2) + mv_y)*t->last_frame->linesize[0] + mb_x*16 + ((j&1)<<3) + mv_x,
+             t->last_frame->data[0] + offset,
              t->last_frame->linesize[0], add);
     } else if (!(t->avctx->flags & AV_CODEC_FLAG_GRAY)) {
         int index = j - 3;
+        unsigned offset = (mb_y * 8 + (mv_y/2))*t->last_frame->linesize[index] + mb_x * 8 + (mv_x/2);
+        if (offset >= (t->avctx->height/2 - 7) * t->last_frame->linesize[index] - 7)
+            return;
         comp(frame->data[index] + (mb_y*8)*frame->linesize[index] + mb_x * 8,
              frame->linesize[index],
-             t->last_frame->data[index] + (mb_y * 8 + (mv_y/2))*t->last_frame->linesize[index] + mb_x * 8 + (mv_x/2),
+             t->last_frame->data[index] + offset,
              t->last_frame->linesize[index], add);
     }
 }
@@ -123,22 +128,24 @@ static inline void idct_put(MadContext *t, AVFrame *frame, int16_t *block,
     }
 }
 
-static inline void decode_block_intra(MadContext *s, int16_t * block)
+static inline int decode_block_intra(MadContext *s, int16_t * block)
 {
     int level, i, j, run;
     RLTable *rl = &ff_rl_mpeg1;
     const uint8_t *scantable = s->scantable.permutated;
     int16_t *quant_matrix = s->quant_matrix;
 
-    block[0] = (128 + bitstream_read_signed(&s->bc, 8)) * quant_matrix[0];
+    block[0] = (128 + get_sbits(&s->gb, 8)) * quant_matrix[0];
 
     /* The RL decoder is derived from mpeg1_decode_block_intra;
        Escaped level and run values a decoded differently */
     i = 0;
     {
+        OPEN_READER(re, &s->gb);
         /* now quantify & encode AC coefficients */
         for (;;) {
-            BITSTREAM_RL_VLC(level, run, &s->bc, rl->rl_vlc[0], TEX_VLC_BITS, 2);
+            UPDATE_CACHE(re, &s->gb);
+            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
 
             if (level == 127) {
                 break;
@@ -147,23 +154,26 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
                 if (i > 63) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return;
+                    return -1;
                 }
                 j = scantable[i];
                 level = (level*quant_matrix[j]) >> 4;
                 level = (level-1)|1;
-                level = bitstream_apply_sign(&s->bc, level);
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
+                LAST_SKIP_BITS(re, &s->gb, 1);
             } else {
                 /* escape */
-                level = bitstream_read_signed(&s->bc, 10);
+                UPDATE_CACHE(re, &s->gb);
+                level = SHOW_SBITS(re, &s->gb, 10); SKIP_BITS(re, &s->gb, 10);
 
-                run = bitstream_read(&s->bc, 6) + 1;
+                UPDATE_CACHE(re, &s->gb);
+                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
 
                 i += run;
                 if (i > 63) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return;
+                    return -1;
                 }
                 j = scantable[i];
                 if (level < 0) {
@@ -179,48 +189,50 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
 
             block[j] = level;
         }
+        CLOSE_READER(re, &s->gb);
     }
+    return 0;
 }
 
-static int decode_motion(BitstreamContext *bc)
+static int decode_motion(GetBitContext *gb)
 {
     int value = 0;
-
-    if (bitstream_read_bit(bc)) {
-        if (bitstream_read_bit(bc))
+    if (get_bits1(gb)) {
+        if (get_bits1(gb))
             value = -17;
-        value += bitstream_read(bc, 4) + 1;
+        value += get_bits(gb, 4) + 1;
     }
     return value;
 }
 
-static void decode_mb(MadContext *s, AVFrame *frame, int inter)
+static int decode_mb(MadContext *s, AVFrame *frame, int inter)
 {
     int mv_map = 0;
-    int mv_x, mv_y;
+    int av_uninit(mv_x), av_uninit(mv_y);
     int j;
 
     if (inter) {
-        int v = bitstream_decode210(&s->bc);
+        int v = decode210(&s->gb);
         if (v < 2) {
-            mv_map = v ? bitstream_read(&s->bc, 6) : 63;
-            mv_x = decode_motion(&s->bc);
-            mv_y = decode_motion(&s->bc);
-        } else {
-            mv_map = 0;
+            mv_map = v ? get_bits(&s->gb, 6) : 63;
+            mv_x = decode_motion(&s->gb);
+            mv_y = decode_motion(&s->gb);
         }
     }
 
     for (j=0; j<6; j++) {
         if (mv_map & (1<<j)) {  // mv_x and mv_y are guarded by mv_map
-            int add = 2 * decode_motion(&s->bc);
-            comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
+            int add = 2*decode_motion(&s->gb);
+            if (s->last_frame->data[0])
+                comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
         } else {
             s->bdsp.clear_block(s->block);
-            decode_block_intra(s, s->block);
+            if(decode_block_intra(s, s->block) < 0)
+                return -1;
             idct_put(s, frame, s->block, s->mb_x, s->mb_y, j);
         }
     }
+    return 0;
 }
 
 static void calc_quant_matrix(MadContext *s, int qscale)
@@ -265,16 +277,21 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    if (width < 16 || height < 16) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     if (avctx->width != width || avctx->height != height) {
         av_frame_unref(s->last_frame);
+        if((width * (int64_t)height)/2048*7 > bytestream2_get_bytes_left(&gb))
+            return AVERROR_INVALIDDATA;
         if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
             return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (inter && !s->last_frame->data[0]) {
         av_log(avctx, AV_LOG_WARNING, "Missing reference frame.\n");
@@ -295,11 +312,13 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR(ENOMEM);
     s->bbdsp.bswap16_buf(s->bitstream_buf, (const uint16_t *)(buf + bytestream2_tell(&gb)),
                          bytestream2_get_bytes_left(&gb) / 2);
-    bitstream_init8(&s->bc, s->bitstream_buf, bytestream2_get_bytes_left(&gb));
+    memset((uint8_t*)s->bitstream_buf + bytestream2_get_bytes_left(&gb), 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    init_get_bits(&s->gb, s->bitstream_buf, 8*(bytestream2_get_bytes_left(&gb)));
 
     for (s->mb_y=0; s->mb_y < (avctx->height+15)/16; s->mb_y++)
         for (s->mb_x=0; s->mb_x < (avctx->width +15)/16; s->mb_x++)
-            decode_mb(s, frame, inter);
+            if(decode_mb(s, frame, inter) < 0)
+                return AVERROR_INVALIDDATA;
 
     *got_frame = 1;
 
@@ -316,7 +335,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
 {
     MadContext *t = avctx->priv_data;
     av_frame_free(&t->last_frame);
-    av_free(t->bitstream_buf);
+    av_freep(&t->bitstream_buf);
     return 0;
 }
 
diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c
index 190f4c8..1308c07 100644
--- a/libavcodec/eatgq.c
+++ b/libavcodec/eatgq.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGQ Video Decoder
  * Copyright (c) 2007-2008 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -31,9 +31,9 @@
 #define BITSTREAM_READER_LE
 #include "aandcttab.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
 #include "eaidct.h"
+#include "get_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
 
@@ -58,44 +58,44 @@ static av_cold int tgq_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static void tgq_decode_block(TgqContext *s, int16_t block[64], BitstreamContext *bc)
+static void tgq_decode_block(TgqContext *s, int16_t block[64], GetBitContext *gb)
 {
     uint8_t *perm = s->scantable.permutated;
     int i, j, value;
-    block[0] = bitstream_read_signed(bc, 8) * s->qtable[0];
+    block[0] = get_sbits(gb, 8) * s->qtable[0];
     for (i = 1; i < 64;) {
-        switch (bitstream_peek(bc, 3)) {
+        switch (show_bits(gb, 3)) {
         case 4:
             block[perm[i++]] = 0;
         case 0:
             block[perm[i++]] = 0;
-            bitstream_skip(bc, 3);
+            skip_bits(gb, 3);
             break;
         case 5:
         case 1:
-            bitstream_skip(bc, 2);
-            value = bitstream_read(bc, 6);
+            skip_bits(gb, 2);
+            value = get_bits(gb, 6);
             for (j = 0; j < value; j++)
                 block[perm[i++]] = 0;
             break;
         case 6:
-            bitstream_skip(bc, 3);
+            skip_bits(gb, 3);
             block[perm[i]] = -s->qtable[perm[i]];
             i++;
             break;
         case 2:
-            bitstream_skip(bc, 3);
+            skip_bits(gb, 3);
             block[perm[i]] = s->qtable[perm[i]];
             i++;
             break;
         case 7: // 111b
         case 3: // 011b
-            bitstream_skip(bc, 2);
-            if (bitstream_peek(bc, 6) == 0x3F) {
-                bitstream_skip(bc, 6);
-                block[perm[i]] = bitstream_read_signed(bc, 8) * s->qtable[perm[i]];
+            skip_bits(gb, 2);
+            if (show_bits(gb, 6) == 0x3F) {
+                skip_bits(gb, 6);
+                block[perm[i]] = get_sbits(gb, 8) * s->qtable[perm[i]];
             } else {
-                block[perm[i]] = bitstream_read_signed(bc, 6) * s->qtable[perm[i]];
+                block[perm[i]] = get_sbits(gb, 6) * s->qtable[perm[i]];
             }
             i++;
             break;
@@ -148,7 +148,7 @@ static void tgq_idct_put_mb_dconly(TgqContext *s, AVFrame *frame,
     }
 }
 
-static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
+static int tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
 {
     int mode;
     int i;
@@ -156,10 +156,13 @@ static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
 
     mode = bytestream2_get_byte(&s->gb);
     if (mode > 12) {
-        BitstreamContext bc;
-        bitstream_init8(&bc, s->gb.buffer, FFMIN(s->gb.buffer_end - s->gb.buffer, mode));
+        GetBitContext gb;
+        int ret = init_get_bits8(&gb, s->gb.buffer, FFMIN(bytestream2_get_bytes_left(&s->gb), mode));
+        if (ret < 0)
+            return ret;
+
         for (i = 0; i < 6; i++)
-            tgq_decode_block(s, s->block[i], &bc);
+            tgq_decode_block(s, s->block[i], &gb);
         tgq_idct_put_mb(s, s->block, frame, mb_x, mb_y);
         bytestream2_skip(&s->gb, mode);
     } else {
@@ -176,9 +179,11 @@ static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
             }
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "unsupported mb mode %i\n", mode);
+            return -1;
         }
         tgq_idct_put_mb_dconly(s, frame, mb_x, mb_y, dc);
     }
+    return 0;
 }
 
 static void tgq_calculate_qtable(TgqContext *s, int quant)
@@ -201,12 +206,13 @@ static int tgq_decode_frame(AVCodecContext *avctx,
     TgqContext *s      = avctx->priv_data;
     AVFrame *frame     = data;
     int x, y, ret;
-    int big_endian = AV_RL32(&buf[4]) > 0x000FFFFF;
+    int big_endian;
 
     if (buf_size < 16) {
         av_log(avctx, AV_LOG_WARNING, "truncated header\n");
         return AVERROR_INVALIDDATA;
     }
+    big_endian = AV_RL32(&buf[4]) > 0x000FFFFF;
     bytestream2_init(&s->gb, buf + 8, buf_size - 8);
     if (big_endian) {
         s->width  = bytestream2_get_be16u(&s->gb);
@@ -223,16 +229,15 @@ static int tgq_decode_frame(AVCodecContext *avctx,
     tgq_calculate_qtable(s, bytestream2_get_byteu(&s->gb));
     bytestream2_skip(&s->gb, 3);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     frame->key_frame = 1;
     frame->pict_type = AV_PICTURE_TYPE_I;
 
     for (y = 0; y < FFALIGN(avctx->height, 16) >> 4; y++)
         for (x = 0; x < FFALIGN(avctx->width, 16) >> 4; x++)
-            tgq_decode_mb(s, frame, y, x);
+            if (tgq_decode_mb(s, frame, y, x) < 0)
+                return AVERROR_INVALIDDATA;
 
     *got_frame = 1;
 
diff --git a/libavcodec/eatgv.c b/libavcodec/eatgv.c
index 3894f2b..93e291f 100644
--- a/libavcodec/eatgv.c
+++ b/libavcodec/eatgv.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGV Video Decoder
  * Copyright (c) 2007-2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -33,7 +33,7 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #define EA_PREAMBLE_SIZE    8
@@ -82,7 +82,7 @@ static int unpack(const uint8_t *src, const uint8_t *src_end,
     else
         src += 2;
 
-    if (src + 3 > src_end)
+    if (src_end - src < 3)
         return AVERROR_INVALIDDATA;
     size = AV_RB24(src);
     src += 3;
@@ -153,11 +153,11 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     int num_blocks_packed;
     int vector_bits;
     int i,j,x,y;
-    BitstreamContext bc;
+    GetBitContext gb;
     int mvbits;
     const uint8_t *blocks_raw;
 
-    if (buf + 12 > buf_end)
+    if(buf_end - buf < 12)
         return AVERROR_INVALIDDATA;
 
     num_mvs           = AV_RL16(&buf[0]);
@@ -166,7 +166,7 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     vector_bits       = AV_RL16(&buf[6]);
     buf += 12;
 
-    if (vector_bits > 32 || !vector_bits) {
+    if (vector_bits > MIN_CACHE_BITS || !vector_bits) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Invalid value for motion vector bits: %d\n", vector_bits);
         return AVERROR_INVALIDDATA;
@@ -174,9 +174,11 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
 
     /* allocate codebook buffers as necessary */
     if (num_mvs > s->num_mvs) {
-        int err = av_reallocp(&s->mv_codebook, num_mvs * 2 * sizeof(int));
-        if (err < 0)
+        int err = av_reallocp_array(&s->mv_codebook, num_mvs, sizeof(*s->mv_codebook));
+        if (err < 0) {
+            s->num_mvs = 0;
             return err;
+        }
         s->num_mvs = num_mvs;
     }
 
@@ -192,13 +194,13 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     /* read motion vectors */
     mvbits = (num_mvs * 2 * 10 + 31) & ~31;
 
-    if (buf + (mvbits >> 3) + 16 * num_blocks_raw + 8 * num_blocks_packed > buf_end)
+    if (buf_end - buf < (mvbits>>3) + 16*num_blocks_raw + 8*num_blocks_packed)
         return AVERROR_INVALIDDATA;
 
-    bitstream_init(&bc, buf, mvbits);
+    init_get_bits(&gb, buf, mvbits);
     for (i = 0; i < num_mvs; i++) {
-        s->mv_codebook[i][0] = bitstream_read_signed(&bc, 10);
-        s->mv_codebook[i][1] = bitstream_read_signed(&bc, 10);
+        s->mv_codebook[i][0] = get_sbits(&gb, 10);
+        s->mv_codebook[i][1] = get_sbits(&gb, 10);
     }
     buf += mvbits >> 3;
 
@@ -207,23 +209,23 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     buf       += num_blocks_raw * 16;
 
     /* read compressed blocks */
-    bitstream_init8(&bc, buf, buf_end - buf);
+    init_get_bits(&gb, buf, (buf_end - buf) << 3);
     for (i = 0; i < num_blocks_packed; i++) {
         int tmp[4];
         for (j = 0; j < 4; j++)
-            tmp[j] = bitstream_read(&bc, 8);
+            tmp[j] = get_bits(&gb, 8);
         for (j = 0; j < 16; j++)
-            s->block_codebook[i][15-j] = tmp[bitstream_read(&bc, 2)];
+            s->block_codebook[i][15-j] = tmp[get_bits(&gb, 2)];
     }
 
-    if (bitstream_bits_left(&bc) < vector_bits *
+    if (get_bits_left(&gb) < vector_bits *
         (s->avctx->height / 4) * (s->avctx->width / 4))
         return AVERROR_INVALIDDATA;
 
     /* read vectors and build frame */
     for (y = 0; y < s->avctx->height / 4; y++)
         for (x = 0; x < s->avctx->width / 4; x++) {
-            unsigned int vector = bitstream_read(&bc, vector_bits);
+            unsigned int vector = get_bits(&gb, vector_bits);
             const uint8_t *src;
             ptrdiff_t src_stride;
 
@@ -232,8 +234,10 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
                 int my = y * 4 + s->mv_codebook[vector][1];
 
                 if (mx < 0 || mx + 4 > s->avctx->width ||
-                    my < 0 || my + 4 > s->avctx->height)
+                    my < 0 || my + 4 > s->avctx->height) {
+                    av_log(s->avctx, AV_LOG_ERROR, "MV %d %d out of picture\n", mx, my);
                     continue;
+                }
 
                 src = s->last_frame->data[0] + mx + my * s->last_frame->linesize[0];
                 src_stride = s->last_frame->linesize[0];
@@ -268,12 +272,15 @@ static int tgv_decode_frame(AVCodecContext *avctx,
     AVFrame *frame         = data;
     int chunk_type, ret;
 
+    if (buf_end - buf < EA_PREAMBLE_SIZE)
+        return AVERROR_INVALIDDATA;
+
     chunk_type = AV_RL32(&buf[0]);
     buf       += EA_PREAMBLE_SIZE;
 
     if (chunk_type == kVGT_TAG) {
         int pal_count, i;
-        if (buf + 12 > buf_end) {
+        if(buf_end - buf < 12) {
             av_log(avctx, AV_LOG_WARNING, "truncated header\n");
             return AVERROR_INVALIDDATA;
         }
@@ -289,8 +296,8 @@ static int tgv_decode_frame(AVCodecContext *avctx,
 
         pal_count = AV_RL16(&buf[6]);
         buf += 12;
-        for (i = 0; i < pal_count && i < AVPALETTE_COUNT && buf + 2 < buf_end; i++) {
-            s->palette[i] = AV_RB24(buf);
+        for(i = 0; i < pal_count && i < AVPALETTE_COUNT && buf_end - buf >= 3; i++) {
+            s->palette[i] = 0xFFU << 24 | AV_RB24(buf);
             buf += 3;
         }
     }
@@ -306,7 +313,7 @@ static int tgv_decode_frame(AVCodecContext *avctx,
         frame->pict_type = AV_PICTURE_TYPE_I;
 
         if (!s->frame_buffer &&
-            !(s->frame_buffer = av_malloc(s->width * s->height)))
+            !(s->frame_buffer = av_mallocz(s->width * s->height)))
             return AVERROR(ENOMEM);
 
         if (unpack(buf, buf_end, s->frame_buffer, s->avctx->width, s->avctx->height) < 0) {
@@ -344,8 +351,8 @@ static av_cold int tgv_decode_end(AVCodecContext *avctx)
     TgvContext *s = avctx->priv_data;
     av_frame_free(&s->last_frame);
     av_freep(&s->frame_buffer);
-    av_free(s->mv_codebook);
-    av_free(s->block_codebook);
+    av_freep(&s->mv_codebook);
+    av_freep(&s->block_codebook);
     return 0;
 }
 
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index 1ae4d3f..0002d45 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -2,20 +2,20 @@
  * Electronic Arts TQI Video Decoder
  * Copyright (c) 2007-2009 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -37,6 +37,7 @@
 #include "mpeg12.h"
 
 typedef struct TqiContext {
+    AVCodecContext *avctx;
     GetBitContext gb;
     BlockDSPContext bdsp;
     BswapDSPContext bsdsp;
@@ -50,14 +51,14 @@ typedef struct TqiContext {
     uint16_t intra_matrix[64];
     int last_dc[3];
 
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(32, int16_t, block)[6][64];
 } TqiContext;
 
 static av_cold int tqi_decode_init(AVCodecContext *avctx)
 {
     TqiContext *t = avctx->priv_data;
 
-    ff_blockdsp_init(&t->bdsp);
+    ff_blockdsp_init(&t->bdsp, avctx);
     ff_bswapdsp_init(&t->bsdsp);
     ff_idctdsp_init(&t->idsp, avctx);
     ff_init_scantable_permutation(t->idsp.idct_permutation, FF_IDCT_PERM_NONE);
@@ -79,8 +80,11 @@ static int tqi_decode_mb(TqiContext *t, int16_t (*block)[64])
                                               t->intra_matrix,
                                               t->intra_scantable.permutated,
                                               t->last_dc, block[n], n, 1);
-        if (ret < 0)
-            return -1;
+        if (ret < 0) {
+            av_log(t->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n",
+                   t->mb_x, t->mb_y);
+            return ret;
+        }
     }
 
     return 0;
@@ -108,7 +112,7 @@ static inline void tqi_idct_put(AVCodecContext *avctx, AVFrame *frame,
 
 static void tqi_calculate_qtable(TqiContext *t, int quant)
 {
-    const int qscale = (215 - 2*quant)*5;
+    const int64_t qscale = (215 - 2*quant)*5;
     int i;
 
     t->intra_matrix[0] = (ff_inv_aanscales[0] * ff_mpeg1_default_intra_matrix[0]) >> 11;
@@ -127,6 +131,8 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     int ret, w, h;
 
+    t->avctx = avctx;
+
     w = AV_RL16(&buf[0]);
     h = AV_RL16(&buf[2]);
     tqi_calculate_qtable(t, buf[4]);
@@ -136,10 +142,8 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     av_fast_padded_malloc(&t->bitstream_buf, &t->bitstream_buf_size,
                           buf_end - buf);
@@ -155,10 +159,11 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     for (t->mb_y = 0; t->mb_y < (h + 15) / 16; t->mb_y++) {
         for (t->mb_x = 0; t->mb_x < (w + 15) / 16; t->mb_x++) {
             if (tqi_decode_mb(t, t->block) < 0)
-                break;
+                goto end;
             tqi_idct_put(avctx, frame, t->block);
         }
     }
+    end:
 
     *got_frame = 1;
     return buf_size;
@@ -167,7 +172,7 @@ static int tqi_decode_frame(AVCodecContext *avctx,
 static av_cold int tqi_decode_end(AVCodecContext *avctx)
 {
     TqiContext *t = avctx->priv_data;
-    av_free(t->bitstream_buf);
+    av_freep(&t->bitstream_buf);
     return 0;
 }
 
diff --git a/libavcodec/elbg.c b/libavcodec/elbg.c
index 07bb2e3..b6049c9 100644
--- a/libavcodec/elbg.c
+++ b/libavcodec/elbg.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007 Vitor Sessak <vitor1001@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/lfg.h"
 #include "elbg.h"
@@ -50,7 +51,7 @@ typedef struct elbg_data {
     int *codebook;
     cell **cells;
     int *utility;
-    int *utility_inc;
+    int64_t *utility_inc;
     int *nearest_cb;
     int *points;
     AVLFG *rand_state;
@@ -107,11 +108,20 @@ static int get_high_utility_cell(elbg_data *elbg)
 {
     int i=0;
     /* Using linear search, do binary if it ever turns to be speed critical */
-    int r = av_lfg_get(elbg->rand_state)%elbg->utility_inc[elbg->numCB-1] + 1;
-    while (elbg->utility_inc[i] < r)
+    uint64_t r;
+
+    if (elbg->utility_inc[elbg->numCB-1] < INT_MAX) {
+        r = av_lfg_get(elbg->rand_state) % (unsigned int)elbg->utility_inc[elbg->numCB-1] + 1;
+    } else {
+        r = av_lfg_get(elbg->rand_state);
+        r = (av_lfg_get(elbg->rand_state) + (r<<32)) % elbg->utility_inc[elbg->numCB-1] + 1;
+    }
+
+    while (elbg->utility_inc[i] < r) {
         i++;
+    }
 
-    assert(elbg->cells[i]);
+    av_assert2(elbg->cells[i]);
 
     return i;
 }
@@ -226,7 +236,8 @@ static void shift_codebook(elbg_data *elbg, int *indexes,
 
 static void evaluate_utility_inc(elbg_data *elbg)
 {
-    int i, inc=0;
+    int i;
+    int64_t inc=0;
 
     for (i=0; i < elbg->numCB; i++) {
         if (elbg->numCB*elbg->utility[i] > elbg->error)
@@ -323,7 +334,7 @@ static void do_shiftings(elbg_data *elbg)
 
 #define BIG_PRIME 433494437LL
 
-int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_init_elbg(int *points, int dim, int numpoints, int *codebook,
                  int numCB, int max_steps, int *closest_cb,
                  AVLFG *rand_state)
 {
@@ -332,7 +343,7 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
     if (numpoints > 24*numCB) {
         /* ELBG is very costly for a big number of points. So if we have a lot
            of them, get a good initial codebook to save on iterations       */
-        int *temp_points = av_malloc(dim*(numpoints/8)*sizeof(int));
+        int *temp_points = av_malloc_array(dim, (numpoints/8)*sizeof(int));
         if (!temp_points)
             return AVERROR(ENOMEM);
         for (i=0; i<numpoints/8; i++) {
@@ -340,14 +351,14 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
             memcpy(temp_points + i*dim, points + k*dim, dim*sizeof(int));
         }
 
-        ret = ff_init_elbg(temp_points, dim, numpoints / 8, codebook,
-                           numCB, 2 * max_steps, closest_cb, rand_state);
+        ret = avpriv_init_elbg(temp_points, dim, numpoints / 8, codebook,
+                               numCB, 2 * max_steps, closest_cb, rand_state);
         if (ret < 0) {
             av_freep(&temp_points);
             return ret;
         }
-        ret = ff_do_elbg(temp_points, dim, numpoints / 8, codebook,
-                         numCB, 2 * max_steps, closest_cb, rand_state);
+        ret = avpriv_do_elbg(temp_points, dim, numpoints / 8, codebook,
+                             numCB, 2 * max_steps, closest_cb, rand_state);
         av_free(temp_points);
 
     } else  // If not, initialize the codebook with random positions
@@ -357,7 +368,7 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
     return ret;
 }
 
-int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_do_elbg(int *points, int dim, int numpoints, int *codebook,
                 int numCB, int max_steps, int *closest_cb,
                 AVLFG *rand_state)
 {
@@ -365,9 +376,9 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
     elbg_data elbg_d;
     elbg_data *elbg = &elbg_d;
     int i, j, k, last_error, steps = 0, ret = 0;
-    int *dist_cb = av_malloc(numpoints*sizeof(int));
-    int *size_part = av_malloc(numCB*sizeof(int));
-    cell *list_buffer = av_malloc(numpoints*sizeof(cell));
+    int *dist_cb = av_malloc_array(numpoints, sizeof(int));
+    int *size_part = av_malloc_array(numCB, sizeof(int));
+    cell *list_buffer = av_malloc_array(numpoints, sizeof(cell));
     cell *free_cells;
     int best_dist, best_idx = 0;
 
@@ -375,12 +386,12 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
     elbg->dim = dim;
     elbg->numCB = numCB;
     elbg->codebook = codebook;
-    elbg->cells = av_malloc(numCB*sizeof(cell *));
-    elbg->utility = av_malloc(numCB*sizeof(int));
+    elbg->cells = av_malloc_array(numCB, sizeof(cell *));
+    elbg->utility = av_malloc_array(numCB, sizeof(int));
     elbg->nearest_cb = closest_cb;
     elbg->points = points;
-    elbg->utility_inc = av_malloc(numCB*sizeof(int));
-    elbg->scratchbuf = av_malloc(5*dim*sizeof(int));
+    elbg->utility_inc = av_malloc_array(numCB, sizeof(*elbg->utility_inc));
+    elbg->scratchbuf = av_malloc_array(5*dim, sizeof(int));
 
     if (!dist_cb || !size_part || !list_buffer || !elbg->cells ||
         !elbg->utility || !elbg->utility_inc || !elbg->scratchbuf) {
diff --git a/libavcodec/elbg.h b/libavcodec/elbg.h
index 3b1587a..f48aa3b 100644
--- a/libavcodec/elbg.h
+++ b/libavcodec/elbg.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007 Vitor Sessak <vitor1001@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
  * @param rand_state A random number generator state. Should be already initialized by av_lfg_init().
  * @return < 0 in case of error, 0 otherwise
  */
-int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_do_elbg(int *points, int dim, int numpoints, int *codebook,
                int numCB, int num_steps, int *closest_cb,
                AVLFG *rand_state);
 
@@ -46,11 +46,11 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
  * Initialize the **codebook vector for the elbg algorithm. If you have already
  * a codebook and you want to refine it, you shouldn't call this function.
  * If numpoints < 8*numCB this function fills **codebook with random numbers.
- * If not, it calls ff_do_elbg for a (smaller) random sample of the points in
- * **points. Get the same parameters as ff_do_elbg.
+ * If not, it calls avpriv_do_elbg for a (smaller) random sample of the points in
+ * **points. Get the same parameters as avpriv_do_elbg.
  * @return < 0 in case of error, 0 otherwise
  */
-int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_init_elbg(int *points, int dim, int numpoints, int *codebook,
                  int numCB, int num_steps, int *closest_cb,
                  AVLFG *rand_state);
 
diff --git a/libavcodec/elsdec.c b/libavcodec/elsdec.c
index 10a1a9d..cb0e9c6 100644
--- a/libavcodec/elsdec.c
+++ b/libavcodec/elsdec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -271,7 +271,7 @@ void ff_els_decoder_init(ElsDecCtx *ctx, const uint8_t *in, size_t data_size)
 
 void ff_els_decoder_uninit(ElsUnsignedRung *rung)
 {
-    av_free(rung->rem_rung_list);
+    av_freep(&rung->rem_rung_list);
 }
 
 static int els_import_byte(ElsDecCtx *ctx)
@@ -391,12 +391,10 @@ unsigned ff_els_decode_unsigned(ElsDecCtx *ctx, ElsUnsignedRung *ur)
                 if (ur->rung_list_size <= (ur->avail_index + 2) * sizeof(ElsRungNode)) {
                     // remember rung_node position
                     ptrdiff_t pos     = rung_node - ur->rem_rung_list;
-                    ur->rem_rung_list = av_realloc(ur->rem_rung_list,
+                    ctx->err = av_reallocp(&ur->rem_rung_list,
                                                    ur->rung_list_size +
                                                    RUNG_SPACE);
-                    if (!ur->rem_rung_list) {
-                        av_free(ur->rem_rung_list);
-                        ctx->err = AVERROR(ENOMEM);
+                    if (ctx->err < 0) {
                         return 0;
                     }
                     memset((uint8_t *) ur->rem_rung_list + ur->rung_list_size, 0,
diff --git a/libavcodec/elsdec.h b/libavcodec/elsdec.h
index 515b49a..139a24a 100644
--- a/libavcodec/elsdec.h
+++ b/libavcodec/elsdec.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/encode.c b/libavcodec/encode.c
index 9bb7ae5..d12c425 100644
--- a/libavcodec/encode.c
+++ b/libavcodec/encode.c
@@ -1,20 +1,20 @@
 /*
  * generic encoding-related code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,28 +26,55 @@
 #include "libavutil/samplefmt.h"
 
 #include "avcodec.h"
+#include "frame_thread_encoder.h"
 #include "internal.h"
 
-int ff_alloc_packet(AVPacket *avpkt, int size)
+int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size)
 {
-    if (size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
+    if (avpkt->size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid negative user packet size %d\n", avpkt->size);
+        return AVERROR(EINVAL);
+    }
+    if (size < 0 || size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid minimum required packet size %"PRId64" (max allowed is %d)\n",
+               size, INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE);
         return AVERROR(EINVAL);
+    }
+
+    if (avctx && 2*min_size < size) { // FIXME The factor needs to be finetuned
+        av_assert0(!avpkt->data || avpkt->data != avctx->internal->byte_buffer);
+        if (!avpkt->data || avpkt->size < size) {
+            av_fast_padded_malloc(&avctx->internal->byte_buffer, &avctx->internal->byte_buffer_size, size);
+            avpkt->data = avctx->internal->byte_buffer;
+            avpkt->size = avctx->internal->byte_buffer_size;
+        }
+    }
 
     if (avpkt->data) {
         AVBufferRef *buf = avpkt->buf;
 
-        if (avpkt->size < size)
+        if (avpkt->size < size) {
+            av_log(avctx, AV_LOG_ERROR, "User packet is too small (%d < %"PRId64")\n", avpkt->size, size);
             return AVERROR(EINVAL);
+        }
 
         av_init_packet(avpkt);
         avpkt->buf      = buf;
         avpkt->size     = size;
         return 0;
     } else {
-        return av_new_packet(avpkt, size);
+        int ret = av_new_packet(avpkt, size);
+        if (ret < 0)
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate packet of size %"PRId64"\n", size);
+        return ret;
     }
 }
 
+int ff_alloc_packet(AVPacket *avpkt, int size)
+{
+    return ff_alloc_packet2(NULL, avpkt, size, 0);
+}
+
 /**
  * Pad last frame with silence.
  */
@@ -61,6 +88,7 @@ static int pad_last_frame(AVCodecContext *s, AVFrame **dst, const AVFrame *src)
 
     frame->format         = src->format;
     frame->channel_layout = src->channel_layout;
+    frame->channels       = src->channels;
     frame->nb_samples     = s->frame_size;
     ret = av_frame_get_buffer(frame, 32);
     if (ret < 0)
@@ -92,10 +120,11 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
                                               const AVFrame *frame,
                                               int *got_packet_ptr)
 {
-    AVFrame tmp;
+    AVFrame *extended_frame = NULL;
     AVFrame *padded_frame = NULL;
     int ret;
-    int user_packet = !!avpkt->data;
+    AVPacket user_pkt = *avpkt;
+    int needs_realloc = !user_pkt.data;
 
     *got_packet_ptr = 0;
 
@@ -106,7 +135,6 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
 
     if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY) && !frame) {
         av_packet_unref(avpkt);
-        av_init_packet(avpkt);
         return 0;
     }
 
@@ -121,9 +149,13 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
         }
         av_log(avctx, AV_LOG_WARNING, "extended_data is not set.\n");
 
-        tmp = *frame;
-        tmp.extended_data = tmp.data;
-        frame = &tmp;
+        extended_frame = av_frame_alloc();
+        if (!extended_frame)
+            return AVERROR(ENOMEM);
+
+        memcpy(extended_frame, frame, sizeof(AVFrame));
+        extended_frame->extended_data = extended_frame->data;
+        frame = extended_frame;
     }
 
     /* extract audio service type metadata */
@@ -136,26 +168,32 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
     /* check for valid frame size */
     if (frame) {
         if (avctx->codec->capabilities & AV_CODEC_CAP_SMALL_LAST_FRAME) {
-            if (frame->nb_samples > avctx->frame_size)
-                return AVERROR(EINVAL);
+            if (frame->nb_samples > avctx->frame_size) {
+                av_log(avctx, AV_LOG_ERROR, "more samples than frame size (avcodec_encode_audio2)\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
         } else if (!(avctx->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE)) {
             if (frame->nb_samples < avctx->frame_size &&
                 !avctx->internal->last_audio_frame) {
                 ret = pad_last_frame(avctx, &padded_frame, frame);
                 if (ret < 0)
-                    return ret;
+                    goto end;
 
                 frame = padded_frame;
                 avctx->internal->last_audio_frame = 1;
             }
 
             if (frame->nb_samples != avctx->frame_size) {
+                av_log(avctx, AV_LOG_ERROR, "nb_samples (%d) != frame_size (%d) (avcodec_encode_audio2)\n", frame->nb_samples, avctx->frame_size);
                 ret = AVERROR(EINVAL);
                 goto end;
             }
         }
     }
 
+    av_assert0(avctx->codec->encode2);
+
     ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
     if (!ret) {
         if (*got_packet_ptr) {
@@ -170,19 +208,38 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
         } else {
             avpkt->size = 0;
         }
+    }
+    if (avpkt->data && avpkt->data == avctx->internal->byte_buffer) {
+        needs_realloc = 0;
+        if (user_pkt.data) {
+            if (user_pkt.size >= avpkt->size) {
+                memcpy(user_pkt.data, avpkt->data, avpkt->size);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Provided packet is too small, needs to be %d\n", avpkt->size);
+                avpkt->size = user_pkt.size;
+                ret = -1;
+            }
+            avpkt->buf      = user_pkt.buf;
+            avpkt->data     = user_pkt.data;
+        } else if (!avpkt->buf) {
+            ret = av_packet_make_refcounted(avpkt);
+            if (ret < 0)
+                goto end;
+        }
+    }
 
-        if (!user_packet && avpkt->size) {
-            ret = av_buffer_realloc(&avpkt->buf, avpkt->size);
+    if (!ret) {
+        if (needs_realloc && avpkt->data) {
+            ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
             if (ret >= 0)
                 avpkt->data = avpkt->buf->data;
         }
-
-        avctx->frame_number++;
+        if (frame)
+            avctx->frame_number++;
     }
 
     if (ret < 0 || !*got_packet_ptr) {
         av_packet_unref(avpkt);
-        av_init_packet(avpkt);
         goto end;
     }
 
@@ -193,6 +250,7 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
 
 end:
     av_frame_free(&padded_frame);
+    av_free(extended_frame);
 
     return ret;
 }
@@ -203,7 +261,8 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
                                               int *got_packet_ptr)
 {
     int ret;
-    int user_packet = !!avpkt->data;
+    AVPacket user_pkt = *avpkt;
+    int needs_realloc = !user_pkt.data;
 
     *got_packet_ptr = 0;
 
@@ -212,38 +271,71 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
         return AVERROR(ENOSYS);
     }
 
+    if(CONFIG_FRAME_THREAD_ENCODER &&
+       avctx->internal->frame_thread_encoder && (avctx->active_thread_type&FF_THREAD_FRAME))
+        return ff_thread_video_encode_frame(avctx, avpkt, frame, got_packet_ptr);
+
+    if ((avctx->flags&AV_CODEC_FLAG_PASS1) && avctx->stats_out)
+        avctx->stats_out[0] = '\0';
+
     if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY) && !frame) {
         av_packet_unref(avpkt);
-        av_init_packet(avpkt);
-        avpkt->size = 0;
         return 0;
     }
 
-    if (av_image_check_size(avctx->width, avctx->height, 0, avctx))
+    if (av_image_check_size2(avctx->width, avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx))
         return AVERROR(EINVAL);
 
+    if (frame && frame->format == AV_PIX_FMT_NONE)
+        av_log(avctx, AV_LOG_WARNING, "AVFrame.format is not set\n");
+    if (frame && (frame->width == 0 || frame->height == 0))
+        av_log(avctx, AV_LOG_WARNING, "AVFrame.width or height is not set\n");
+
     av_assert0(avctx->codec->encode2);
 
     ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
+    av_assert0(ret <= 0);
+
+    emms_c();
+
+    if (avpkt->data && avpkt->data == avctx->internal->byte_buffer) {
+        needs_realloc = 0;
+        if (user_pkt.data) {
+            if (user_pkt.size >= avpkt->size) {
+                memcpy(user_pkt.data, avpkt->data, avpkt->size);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Provided packet is too small, needs to be %d\n", avpkt->size);
+                avpkt->size = user_pkt.size;
+                ret = -1;
+            }
+            avpkt->buf      = user_pkt.buf;
+            avpkt->data     = user_pkt.data;
+        } else if (!avpkt->buf) {
+            ret = av_packet_make_refcounted(avpkt);
+            if (ret < 0)
+                return ret;
+        }
+    }
+
     if (!ret) {
         if (!*got_packet_ptr)
             avpkt->size = 0;
         else if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY))
             avpkt->pts = avpkt->dts = frame->pts;
 
-        if (!user_packet && avpkt->size) {
-            ret = av_buffer_realloc(&avpkt->buf, avpkt->size);
+        if (needs_realloc && avpkt->data) {
+            ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
             if (ret >= 0)
                 avpkt->data = avpkt->buf->data;
         }
 
-        avctx->frame_number++;
+        if (frame)
+            avctx->frame_number++;
     }
 
     if (ret < 0 || !*got_packet_ptr)
         av_packet_unref(avpkt);
 
-    emms_c();
     return ret;
 }
 
@@ -255,8 +347,7 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
         av_log(avctx, AV_LOG_ERROR, "start_display_time must be 0.\n");
         return -1;
     }
-    if (sub->num_rects == 0 || !sub->rects)
-        return -1;
+
     ret = avctx->codec->encode_sub(avctx, buf, buf_size, sub);
     avctx->frame_number++;
     return ret;
diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c
index 54b7b3c..35d0c60 100644
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include "mpegvideo.h"
 #include "rectangle.h"
 #include "thread.h"
+#include "version.h"
 
 /**
  * @param stride the number of MVs to get to the next row
@@ -43,7 +44,7 @@
 static void set_mv_strides(ERContext *s, ptrdiff_t *mv_step, ptrdiff_t *stride)
 {
     if (s->avctx->codec_id == AV_CODEC_ID_H264) {
-        assert(s->quarter_sample);
+        av_assert0(s->quarter_sample);
         *mv_step = 4;
         *stride  = s->mb_width * 4;
     } else {
@@ -82,6 +83,8 @@ static void put_dc(ERContext *s, uint8_t *dest_y, uint8_t *dest_cb,
         dcv = 0;
     else if (dcv > 2040)
         dcv = 2040;
+
+    if (dest_cr)
     for (y = 0; y < 8; y++) {
         int x;
         for (x = 0; x < 8; x++) {
@@ -104,7 +107,7 @@ static void filter181(int16_t *data, int width, int height, ptrdiff_t stride)
             dc = -prev_dc +
                  data[x     + y * stride] * 8 -
                  data[x + 1 + y * stride];
-            dc = (dc * 10923 + 32768) >> 16;
+            dc = (av_clip(dc, INT_MIN/10923, INT_MAX/10923 - 32768) * 10923 + 32768) >> 16;
             prev_dc = data[x + y * stride];
             data[x + y * stride] = dc;
         }
@@ -120,7 +123,7 @@ static void filter181(int16_t *data, int width, int height, ptrdiff_t stride)
             dc = -prev_dc +
                  data[x +  y      * stride] * 8 -
                  data[x + (y + 1) * stride];
-            dc = (dc * 10923 + 32768) >> 16;
+            dc = (av_clip(dc, INT_MIN/10923, INT_MAX/10923 - 32768) * 10923 + 32768) >> 16;
             prev_dc = data[x + y * stride];
             data[x + y * stride] = dc;
         }
@@ -136,11 +139,73 @@ static void guess_dc(ERContext *s, int16_t *dc, int w,
                      int h, ptrdiff_t stride, int is_luma)
 {
     int b_x, b_y;
+    int16_t  (*col )[4] = av_malloc_array(stride, h*sizeof( int16_t)*4);
+    uint32_t (*dist)[4] = av_malloc_array(stride, h*sizeof(uint32_t)*4);
+
+    if(!col || !dist) {
+        av_log(s->avctx, AV_LOG_ERROR, "guess_dc() is out of memory\n");
+        goto fail;
+    }
+
+    for(b_y=0; b_y<h; b_y++){
+        int color= 1024;
+        int distance= -1;
+        for(b_x=0; b_x<w; b_x++){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_x;
+            }
+            col [b_x + b_y*stride][1]= color;
+            dist[b_x + b_y*stride][1]= distance >= 0 ? b_x-distance : 9999;
+        }
+        color= 1024;
+        distance= -1;
+        for(b_x=w-1; b_x>=0; b_x--){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_x;
+            }
+            col [b_x + b_y*stride][0]= color;
+            dist[b_x + b_y*stride][0]= distance >= 0 ? distance-b_x : 9999;
+        }
+    }
+    for(b_x=0; b_x<w; b_x++){
+        int color= 1024;
+        int distance= -1;
+        for(b_y=0; b_y<h; b_y++){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_y;
+            }
+            col [b_x + b_y*stride][3]= color;
+            dist[b_x + b_y*stride][3]= distance >= 0 ? b_y-distance : 9999;
+        }
+        color= 1024;
+        distance= -1;
+        for(b_y=h-1; b_y>=0; b_y--){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_y;
+            }
+            col [b_x + b_y*stride][2]= color;
+            dist[b_x + b_y*stride][2]= distance >= 0 ? distance-b_y : 9999;
+        }
+    }
 
     for (b_y = 0; b_y < h; b_y++) {
         for (b_x = 0; b_x < w; b_x++) {
-            int color[4]    = { 1024, 1024, 1024, 1024 };
-            int distance[4] = { 9999, 9999, 9999, 9999 };
             int mb_index, error, j;
             int64_t guess, weight_sum;
             mb_index = (b_x >> is_luma) + (b_y >> is_luma) * s->mb_stride;
@@ -151,66 +216,21 @@ static void guess_dc(ERContext *s, int16_t *dc, int w,
             if (!(error & ER_DC_ERROR))
                 continue; // dc-ok
 
-            /* right block */
-            for (j = b_x + 1; j < w; j++) {
-                int mb_index_j = (j >> is_luma) + (b_y >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[0]    = dc[j + b_y * stride];
-                    distance[0] = j - b_x;
-                    break;
-                }
-            }
-
-            /* left block */
-            for (j = b_x - 1; j >= 0; j--) {
-                int mb_index_j = (j >> is_luma) + (b_y >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[1]    = dc[j + b_y * stride];
-                    distance[1] = b_x - j;
-                    break;
-                }
-            }
-
-            /* bottom block */
-            for (j = b_y + 1; j < h; j++) {
-                int mb_index_j = (b_x >> is_luma) + (j >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[2]    = dc[b_x + j * stride];
-                    distance[2] = j - b_y;
-                    break;
-                }
-            }
-
-            /* top block */
-            for (j = b_y - 1; j >= 0; j--) {
-                int mb_index_j = (b_x >> is_luma) + (j >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[3]    = dc[b_x + j * stride];
-                    distance[3] = b_y - j;
-                    break;
-                }
-            }
-
             weight_sum = 0;
             guess      = 0;
             for (j = 0; j < 4; j++) {
-                int64_t weight  = 256 * 256 * 256 * 16 / distance[j];
-                guess          += weight * (int64_t) color[j];
+                int64_t weight  = 256 * 256 * 256 * 16 / FFMAX(dist[b_x + b_y*stride][j], 1);
+                guess          += weight*(int64_t)col[b_x + b_y*stride][j];
                 weight_sum     += weight;
             }
             guess = (guess + weight_sum / 2) / weight_sum;
             dc[b_x + b_y * stride] = guess;
         }
     }
+
+fail:
+    av_freep(&col);
+    av_freep(&dist);
 }
 
 /**
@@ -354,23 +374,46 @@ static void v_block_filter(ERContext *s, uint8_t *dst, int w, int h,
     }
 }
 
+#define MV_FROZEN    8
+#define MV_CHANGED   4
+#define MV_UNCHANGED 2
+#define MV_LISTED    1
+static av_always_inline void add_blocklist(int (*blocklist)[2], int *blocklist_length, uint8_t *fixed, int mb_x, int mb_y, int mb_xy)
+{
+    if (fixed[mb_xy])
+        return;
+    fixed[mb_xy] = MV_LISTED;
+    blocklist[ *blocklist_length   ][0] = mb_x;
+    blocklist[(*blocklist_length)++][1] = mb_y;
+}
+
 static void guess_mv(ERContext *s)
 {
-    uint8_t *fixed = s->er_temp_buffer;
-#define MV_FROZEN    3
-#define MV_CHANGED   2
-#define MV_UNCHANGED 1
+    int (*blocklist)[2], (*next_blocklist)[2];
+    uint8_t *fixed;
     const ptrdiff_t mb_stride = s->mb_stride;
     const int mb_width  = s->mb_width;
-    const int mb_height = s->mb_height;
+    int mb_height = s->mb_height;
     int i, depth, num_avail;
     int mb_x, mb_y;
     ptrdiff_t mot_step, mot_stride;
+    int blocklist_length, next_blocklist_length;
+
+    if (s->last_pic.f && s->last_pic.f->data[0])
+        mb_height = FFMIN(mb_height, (s->last_pic.f->height+15)>>4);
+    if (s->next_pic.f && s->next_pic.f->data[0])
+        mb_height = FFMIN(mb_height, (s->next_pic.f->height+15)>>4);
+
+    blocklist      = (int (*)[2])s->er_temp_buffer;
+    next_blocklist = blocklist + s->mb_stride * s->mb_height;
+    fixed          = (uint8_t *)(next_blocklist + s->mb_stride * s->mb_height);
 
     set_mv_strides(s, &mot_step, &mot_stride);
 
     num_avail = 0;
-    for (i = 0; i < s->mb_num; i++) {
+    if (s->last_pic.motion_val[0])
+        ff_thread_await_progress(s->last_pic.tf, mb_height-1, 0);
+    for (i = 0; i < mb_width * mb_height; i++) {
         const int mb_xy = s->mb_index2xy[i];
         int f = 0;
         int error = s->error_status_table[mb_xy];
@@ -383,11 +426,19 @@ static void guess_mv(ERContext *s)
         fixed[mb_xy] = f;
         if (f == MV_FROZEN)
             num_avail++;
+        else if(s->last_pic.f->data[0] && s->last_pic.motion_val[0]){
+            const int mb_y= mb_xy / s->mb_stride;
+            const int mb_x= mb_xy % s->mb_stride;
+            const int mot_index= (mb_x + mb_y*mot_stride) * mot_step;
+            s->cur_pic.motion_val[0][mot_index][0]= s->last_pic.motion_val[0][mot_index][0];
+            s->cur_pic.motion_val[0][mot_index][1]= s->last_pic.motion_val[0][mot_index][1];
+            s->cur_pic.ref_index[0][4*mb_xy]      = s->last_pic.ref_index[0][4*mb_xy];
+        }
     }
 
     if ((!(s->avctx->error_concealment&FF_EC_GUESS_MVS)) ||
-        num_avail <= mb_width / 2) {
-        for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
+        num_avail <= FFMAX(mb_width, mb_height) / 2) {
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
                 const int mb_xy = mb_x + mb_y * s->mb_stride;
                 int mv_dir = (s->last_pic.f && s->last_pic.f->data[0]) ? MV_DIR_FORWARD : MV_DIR_BACKWARD;
@@ -406,252 +457,265 @@ static void guess_mv(ERContext *s)
         return;
     }
 
+    blocklist_length = 0;
+    for (mb_y = 0; mb_y < mb_height; mb_y++) {
+        for (mb_x = 0; mb_x < mb_width; mb_x++) {
+            const int mb_xy = mb_x + mb_y * mb_stride;
+            if (fixed[mb_xy] == MV_FROZEN) {
+                if (mb_x)               add_blocklist(blocklist, &blocklist_length, fixed, mb_x - 1, mb_y, mb_xy - 1);
+                if (mb_y)               add_blocklist(blocklist, &blocklist_length, fixed, mb_x, mb_y - 1, mb_xy - mb_stride);
+                if (mb_x+1 < mb_width)  add_blocklist(blocklist, &blocklist_length, fixed, mb_x + 1, mb_y, mb_xy + 1);
+                if (mb_y+1 < mb_height) add_blocklist(blocklist, &blocklist_length, fixed, mb_x, mb_y + 1, mb_xy + mb_stride);
+            }
+        }
+    }
+
     for (depth = 0; ; depth++) {
         int changed, pass, none_left;
+        int blocklist_index;
 
         none_left = 1;
         changed   = 1;
         for (pass = 0; (changed || pass < 2) && pass < 10; pass++) {
-            int mb_x, mb_y;
             int score_sum = 0;
 
             changed = 0;
-            for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-                for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                    const int mb_xy        = mb_x + mb_y * s->mb_stride;
-                    int mv_predictor[8][2] = { { 0 } };
-                    int ref[8]             = { 0 };
-                    int pred_count         = 0;
-                    int j;
-                    int best_score         = 256 * 256 * 256 * 64;
-                    int best_pred          = 0;
-                    const int mot_index    = (mb_x + mb_y * mot_stride) * mot_step;
-                    int prev_x = 0, prev_y = 0, prev_ref = 0;
-
-                    if ((mb_x ^ mb_y ^ pass) & 1)
-                        continue;
+            for (blocklist_index = 0; blocklist_index < blocklist_length; blocklist_index++) {
+                const int mb_x = blocklist[blocklist_index][0];
+                const int mb_y = blocklist[blocklist_index][1];
+                const int mb_xy = mb_x + mb_y * mb_stride;
+                int mv_predictor[8][2];
+                int ref[8];
+                int pred_count;
+                int j;
+                int best_score;
+                int best_pred;
+                int mot_index;
+                int prev_x, prev_y, prev_ref;
 
-                    if (fixed[mb_xy] == MV_FROZEN)
-                        continue;
+                if ((mb_x ^ mb_y ^ pass) & 1)
+                    continue;
+                av_assert2(fixed[mb_xy] != MV_FROZEN);
 
-                    j = 0;
-                    if (mb_x > 0             && fixed[mb_xy - 1]         == MV_FROZEN)
-                        j = 1;
-                    if (mb_x + 1 < mb_width  && fixed[mb_xy + 1]         == MV_FROZEN)
-                        j = 1;
-                    if (mb_y > 0             && fixed[mb_xy - mb_stride] == MV_FROZEN)
-                        j = 1;
-                    if (mb_y + 1 < mb_height && fixed[mb_xy + mb_stride] == MV_FROZEN)
-                        j = 1;
-                    if (j == 0)
-                        continue;
 
-                    j = 0;
-                    if (mb_x > 0             && fixed[mb_xy - 1        ] == MV_CHANGED)
-                        j = 1;
-                    if (mb_x + 1 < mb_width  && fixed[mb_xy + 1        ] == MV_CHANGED)
-                        j = 1;
-                    if (mb_y > 0             && fixed[mb_xy - mb_stride] == MV_CHANGED)
-                        j = 1;
-                    if (mb_y + 1 < mb_height && fixed[mb_xy + mb_stride] == MV_CHANGED)
-                        j = 1;
-                    if (j == 0 && pass > 1)
-                        continue;
+                av_assert1(!IS_INTRA(s->cur_pic.mb_type[mb_xy]));
+                av_assert1(s->last_pic.f && s->last_pic.f->data[0]);
 
-                    none_left = 0;
+                j = 0;
+                if (mb_x > 0)
+                    j |= fixed[mb_xy - 1];
+                if (mb_x + 1 < mb_width)
+                    j |= fixed[mb_xy + 1];
+                if (mb_y > 0)
+                    j |= fixed[mb_xy - mb_stride];
+                if (mb_y + 1 < mb_height)
+                    j |= fixed[mb_xy + mb_stride];
 
-                    if (mb_x > 0 && fixed[mb_xy - 1]) {
-                        mv_predictor[pred_count][0] =
-                            s->cur_pic.motion_val[0][mot_index - mot_step][0];
-                        mv_predictor[pred_count][1] =
-                            s->cur_pic.motion_val[0][mot_index - mot_step][1];
-                        ref[pred_count] =
-                            s->cur_pic.ref_index[0][4 * (mb_xy - 1)];
-                        pred_count++;
-                    }
-                    if (mb_x + 1 < mb_width && fixed[mb_xy + 1]) {
-                        mv_predictor[pred_count][0] =
-                            s->cur_pic.motion_val[0][mot_index + mot_step][0];
-                        mv_predictor[pred_count][1] =
-                            s->cur_pic.motion_val[0][mot_index + mot_step][1];
-                        ref[pred_count] =
-                            s->cur_pic.ref_index[0][4 * (mb_xy + 1)];
-                        pred_count++;
-                    }
-                    if (mb_y > 0 && fixed[mb_xy - mb_stride]) {
-                        mv_predictor[pred_count][0] =
-                            s->cur_pic.motion_val[0][mot_index - mot_stride * mot_step][0];
-                        mv_predictor[pred_count][1] =
-                            s->cur_pic.motion_val[0][mot_index - mot_stride * mot_step][1];
-                        ref[pred_count] =
-                            s->cur_pic.ref_index[0][4 * (mb_xy - s->mb_stride)];
-                        pred_count++;
-                    }
-                    if (mb_y + 1<mb_height && fixed[mb_xy + mb_stride]) {
-                        mv_predictor[pred_count][0] =
-                            s->cur_pic.motion_val[0][mot_index + mot_stride * mot_step][0];
-                        mv_predictor[pred_count][1] =
-                            s->cur_pic.motion_val[0][mot_index + mot_stride * mot_step][1];
-                        ref[pred_count] =
-                            s->cur_pic.ref_index[0][4 * (mb_xy + s->mb_stride)];
-                        pred_count++;
-                    }
-                    if (pred_count == 0)
-                        continue;
+                av_assert2(j & MV_FROZEN);
 
-                    if (pred_count > 1) {
-                        int sum_x = 0, sum_y = 0, sum_r = 0;
-                        int max_x, max_y, min_x, min_y, max_r, min_r;
-
-                        for (j = 0; j < pred_count; j++) {
-                            sum_x += mv_predictor[j][0];
-                            sum_y += mv_predictor[j][1];
-                            sum_r += ref[j];
-                            if (j && ref[j] != ref[j - 1])
-                                goto skip_mean_and_median;
-                        }
-
-                        /* mean */
-                        mv_predictor[pred_count][0] = sum_x / j;
-                        mv_predictor[pred_count][1] = sum_y / j;
-                                 ref[pred_count]    = sum_r / j;
-
-                        /* median */
-                        if (pred_count >= 3) {
-                            min_y = min_x = min_r =  99999;
-                            max_y = max_x = max_r = -99999;
-                        } else {
-                            min_x = min_y = max_x = max_y = min_r = max_r = 0;
-                        }
-                        for (j = 0; j < pred_count; j++) {
-                            max_x = FFMAX(max_x, mv_predictor[j][0]);
-                            max_y = FFMAX(max_y, mv_predictor[j][1]);
-                            max_r = FFMAX(max_r, ref[j]);
-                            min_x = FFMIN(min_x, mv_predictor[j][0]);
-                            min_y = FFMIN(min_y, mv_predictor[j][1]);
-                            min_r = FFMIN(min_r, ref[j]);
-                        }
-                        mv_predictor[pred_count + 1][0] = sum_x - max_x - min_x;
-                        mv_predictor[pred_count + 1][1] = sum_y - max_y - min_y;
-                                 ref[pred_count + 1]    = sum_r - max_r - min_r;
-
-                        if (pred_count == 4) {
-                            mv_predictor[pred_count + 1][0] /= 2;
-                            mv_predictor[pred_count + 1][1] /= 2;
-                                     ref[pred_count + 1]    /= 2;
-                        }
-                        pred_count += 2;
-                    }
+                if (!(j & MV_CHANGED) && pass > 1)
+                    continue;
 
-skip_mean_and_median:
-                    /* zero MV */
+                none_left = 0;
+                pred_count = 0;
+                mot_index  = (mb_x + mb_y * mot_stride) * mot_step;
+
+                if (mb_x > 0 && fixed[mb_xy - 1] > 1) {
+                    mv_predictor[pred_count][0] =
+                        s->cur_pic.motion_val[0][mot_index - mot_step][0];
+                    mv_predictor[pred_count][1] =
+                        s->cur_pic.motion_val[0][mot_index - mot_step][1];
+                    ref[pred_count] =
+                        s->cur_pic.ref_index[0][4 * (mb_xy - 1)];
                     pred_count++;
-
-                    if (!fixed[mb_xy]) {
-                        if (s->avctx->codec_id == AV_CODEC_ID_H264) {
-                            // FIXME
-                        } else {
-                            ff_thread_await_progress(s->last_pic.tf,
-                                                     mb_y, 0);
-                        }
-                        if (!s->last_pic.motion_val[0] ||
-                            !s->last_pic.ref_index[0])
-                            goto skip_last_mv;
-                        prev_x   = s->last_pic.motion_val[0][mot_index][0];
-                        prev_y   = s->last_pic.motion_val[0][mot_index][1];
-                        prev_ref = s->last_pic.ref_index[0][4 * mb_xy];
-                    } else {
-                        prev_x   = s->cur_pic.motion_val[0][mot_index][0];
-                        prev_y   = s->cur_pic.motion_val[0][mot_index][1];
-                        prev_ref = s->cur_pic.ref_index[0][4 * mb_xy];
-                    }
-
-                    /* last MV */
-                    mv_predictor[pred_count][0] = prev_x;
-                    mv_predictor[pred_count][1] = prev_y;
-                             ref[pred_count]    = prev_ref;
+                }
+                if (mb_x + 1 < mb_width && fixed[mb_xy + 1] > 1) {
+                    mv_predictor[pred_count][0] =
+                        s->cur_pic.motion_val[0][mot_index + mot_step][0];
+                    mv_predictor[pred_count][1] =
+                        s->cur_pic.motion_val[0][mot_index + mot_step][1];
+                    ref[pred_count] =
+                        s->cur_pic.ref_index[0][4 * (mb_xy + 1)];
+                    pred_count++;
+                }
+                if (mb_y > 0 && fixed[mb_xy - mb_stride] > 1) {
+                    mv_predictor[pred_count][0] =
+                        s->cur_pic.motion_val[0][mot_index - mot_stride * mot_step][0];
+                    mv_predictor[pred_count][1] =
+                        s->cur_pic.motion_val[0][mot_index - mot_stride * mot_step][1];
+                    ref[pred_count] =
+                        s->cur_pic.ref_index[0][4 * (mb_xy - s->mb_stride)];
+                    pred_count++;
+                }
+                if (mb_y + 1<mb_height && fixed[mb_xy + mb_stride] > 1) {
+                    mv_predictor[pred_count][0] =
+                        s->cur_pic.motion_val[0][mot_index + mot_stride * mot_step][0];
+                    mv_predictor[pred_count][1] =
+                        s->cur_pic.motion_val[0][mot_index + mot_stride * mot_step][1];
+                    ref[pred_count] =
+                        s->cur_pic.ref_index[0][4 * (mb_xy + s->mb_stride)];
                     pred_count++;
+                }
+                if (pred_count == 0)
+                    continue;
 
-skip_last_mv:
+                if (pred_count > 1) {
+                    int sum_x = 0, sum_y = 0, sum_r = 0;
+                    int max_x, max_y, min_x, min_y, max_r, min_r;
 
                     for (j = 0; j < pred_count; j++) {
-                        int *linesize = s->cur_pic.f->linesize;
-                        int score = 0;
-                        uint8_t *src = s->cur_pic.f->data[0] +
-                                       mb_x * 16 + mb_y * 16 * linesize[0];
+                        sum_x += mv_predictor[j][0];
+                        sum_y += mv_predictor[j][1];
+                        sum_r += ref[j];
+                        if (j && ref[j] != ref[j - 1])
+                            goto skip_mean_and_median;
+                    }
 
-                        s->cur_pic.motion_val[0][mot_index][0] =
-                            s->mv[0][0][0] = mv_predictor[j][0];
-                        s->cur_pic.motion_val[0][mot_index][1] =
-                            s->mv[0][0][1] = mv_predictor[j][1];
-
-                        // predictor intra or otherwise not available
-                        if (ref[j] < 0)
-                            continue;
-
-                        s->decode_mb(s->opaque, ref[j], MV_DIR_FORWARD,
-                                     MV_TYPE_16X16, &s->mv, mb_x, mb_y, 0, 0);
-
-                        if (mb_x > 0 && fixed[mb_xy - 1]) {
-                            int k;
-                            for (k = 0; k < 16; k++)
-                                score += FFABS(src[k * linesize[0] - 1] -
-                                               src[k * linesize[0]]);
-                        }
-                        if (mb_x + 1 < mb_width && fixed[mb_xy + 1]) {
-                            int k;
-                            for (k = 0; k < 16; k++)
-                                score += FFABS(src[k * linesize[0] + 15] -
-                                               src[k * linesize[0] + 16]);
-                        }
-                        if (mb_y > 0 && fixed[mb_xy - mb_stride]) {
-                            int k;
-                            for (k = 0; k < 16; k++)
-                                score += FFABS(src[k - linesize[0]] - src[k]);
-                        }
-                        if (mb_y + 1 < mb_height && fixed[mb_xy + mb_stride]) {
-                            int k;
-                            for (k = 0; k < 16; k++)
-                                score += FFABS(src[k + linesize[0] * 15] -
-                                               src[k + linesize[0] * 16]);
-                        }
-
-                        if (score <= best_score) { // <= will favor the last MV
-                            best_score = score;
-                            best_pred  = j;
-                        }
+                    /* mean */
+                    mv_predictor[pred_count][0] = sum_x / j;
+                    mv_predictor[pred_count][1] = sum_y / j;
+                             ref[pred_count]    = sum_r / j;
+
+                    /* median */
+                    if (pred_count >= 3) {
+                        min_y = min_x = min_r =  99999;
+                        max_y = max_x = max_r = -99999;
+                    } else {
+                        min_x = min_y = max_x = max_y = min_r = max_r = 0;
                     }
-                    score_sum += best_score;
-                    s->mv[0][0][0] = mv_predictor[best_pred][0];
-                    s->mv[0][0][1] = mv_predictor[best_pred][1];
+                    for (j = 0; j < pred_count; j++) {
+                        max_x = FFMAX(max_x, mv_predictor[j][0]);
+                        max_y = FFMAX(max_y, mv_predictor[j][1]);
+                        max_r = FFMAX(max_r, ref[j]);
+                        min_x = FFMIN(min_x, mv_predictor[j][0]);
+                        min_y = FFMIN(min_y, mv_predictor[j][1]);
+                        min_r = FFMIN(min_r, ref[j]);
+                    }
+                    mv_predictor[pred_count + 1][0] = sum_x - max_x - min_x;
+                    mv_predictor[pred_count + 1][1] = sum_y - max_y - min_y;
+                             ref[pred_count + 1]    = sum_r - max_r - min_r;
+
+                    if (pred_count == 4) {
+                        mv_predictor[pred_count + 1][0] /= 2;
+                        mv_predictor[pred_count + 1][1] /= 2;
+                                 ref[pred_count + 1]    /= 2;
+                    }
+                    pred_count += 2;
+                }
 
-                    for (i = 0; i < mot_step; i++)
-                        for (j = 0; j < mot_step; j++) {
-                            s->cur_pic.motion_val[0][mot_index + i + j * mot_stride][0] = s->mv[0][0][0];
-                            s->cur_pic.motion_val[0][mot_index + i + j * mot_stride][1] = s->mv[0][0][1];
-                        }
+skip_mean_and_median:
+                /* zero MV */
+                mv_predictor[pred_count][0] =
+                mv_predictor[pred_count][1] =
+                         ref[pred_count]    = 0;
+                pred_count++;
+
+                prev_x   = s->cur_pic.motion_val[0][mot_index][0];
+                prev_y   = s->cur_pic.motion_val[0][mot_index][1];
+                prev_ref = s->cur_pic.ref_index[0][4 * mb_xy];
+
+                /* last MV */
+                mv_predictor[pred_count][0] = prev_x;
+                mv_predictor[pred_count][1] = prev_y;
+                         ref[pred_count]    = prev_ref;
+                pred_count++;
+
+                best_pred = 0;
+                best_score = 256 * 256 * 256 * 64;
+                for (j = 0; j < pred_count; j++) {
+                    int *linesize = s->cur_pic.f->linesize;
+                    int score = 0;
+                    uint8_t *src = s->cur_pic.f->data[0] +
+                                   mb_x * 16 + mb_y * 16 * linesize[0];
+
+                    s->cur_pic.motion_val[0][mot_index][0] =
+                        s->mv[0][0][0] = mv_predictor[j][0];
+                    s->cur_pic.motion_val[0][mot_index][1] =
+                        s->mv[0][0][1] = mv_predictor[j][1];
+
+                    // predictor intra or otherwise not available
+                    if (ref[j] < 0)
+                        continue;
 
-                    s->decode_mb(s->opaque, ref[best_pred], MV_DIR_FORWARD,
+                    s->decode_mb(s->opaque, ref[j], MV_DIR_FORWARD,
                                  MV_TYPE_16X16, &s->mv, mb_x, mb_y, 0, 0);
 
+                    if (mb_x > 0 && fixed[mb_xy - 1] > 1) {
+                        int k;
+                        for (k = 0; k < 16; k++)
+                            score += FFABS(src[k * linesize[0] - 1] -
+                                           src[k * linesize[0]]);
+                    }
+                    if (mb_x + 1 < mb_width && fixed[mb_xy + 1] > 1) {
+                        int k;
+                        for (k = 0; k < 16; k++)
+                            score += FFABS(src[k * linesize[0] + 15] -
+                                           src[k * linesize[0] + 16]);
+                    }
+                    if (mb_y > 0 && fixed[mb_xy - mb_stride] > 1) {
+                        int k;
+                        for (k = 0; k < 16; k++)
+                            score += FFABS(src[k - linesize[0]] - src[k]);
+                    }
+                    if (mb_y + 1 < mb_height && fixed[mb_xy + mb_stride] > 1) {
+                        int k;
+                        for (k = 0; k < 16; k++)
+                            score += FFABS(src[k + linesize[0] * 15] -
+                                           src[k + linesize[0] * 16]);
+                    }
 
-                    if (s->mv[0][0][0] != prev_x || s->mv[0][0][1] != prev_y) {
-                        fixed[mb_xy] = MV_CHANGED;
-                        changed++;
-                    } else
-                        fixed[mb_xy] = MV_UNCHANGED;
+                    if (score <= best_score) { // <= will favor the last MV
+                        best_score = score;
+                        best_pred  = j;
+                    }
                 }
+                score_sum += best_score;
+                s->mv[0][0][0] = mv_predictor[best_pred][0];
+                s->mv[0][0][1] = mv_predictor[best_pred][1];
+
+                for (i = 0; i < mot_step; i++)
+                    for (j = 0; j < mot_step; j++) {
+                        s->cur_pic.motion_val[0][mot_index + i + j * mot_stride][0] = s->mv[0][0][0];
+                        s->cur_pic.motion_val[0][mot_index + i + j * mot_stride][1] = s->mv[0][0][1];
+                    }
+
+                s->decode_mb(s->opaque, ref[best_pred], MV_DIR_FORWARD,
+                             MV_TYPE_16X16, &s->mv, mb_x, mb_y, 0, 0);
+
+
+                if (s->mv[0][0][0] != prev_x || s->mv[0][0][1] != prev_y) {
+                    fixed[mb_xy] = MV_CHANGED;
+                    changed++;
+                } else
+                    fixed[mb_xy] = MV_UNCHANGED;
             }
         }
 
         if (none_left)
             return;
 
-        for (i = 0; i < s->mb_num; i++) {
-            int mb_xy = s->mb_index2xy[i];
-            if (fixed[mb_xy])
+        next_blocklist_length = 0;
+
+        for (blocklist_index = 0; blocklist_index < blocklist_length; blocklist_index++) {
+            const int mb_x = blocklist[blocklist_index][0];
+            const int mb_y = blocklist[blocklist_index][1];
+            const int mb_xy = mb_x + mb_y * mb_stride;
+
+            if (fixed[mb_xy] & (MV_CHANGED|MV_UNCHANGED|MV_FROZEN)) {
                 fixed[mb_xy] = MV_FROZEN;
+                if (mb_x > 0)
+                    add_blocklist(next_blocklist, &next_blocklist_length, fixed, mb_x - 1, mb_y, mb_xy - 1);
+                if (mb_y > 0)
+                    add_blocklist(next_blocklist, &next_blocklist_length, fixed, mb_x, mb_y - 1, mb_xy - mb_stride);
+                if (mb_x + 1 < mb_width)
+                    add_blocklist(next_blocklist, &next_blocklist_length, fixed, mb_x + 1, mb_y, mb_xy + 1);
+                if (mb_y + 1 < mb_height)
+                    add_blocklist(next_blocklist, &next_blocklist_length, fixed, mb_x, mb_y + 1, mb_xy + mb_stride);
+            }
         }
+        av_assert0(next_blocklist_length <= mb_height * mb_width);
+        FFSWAP(int , blocklist_length, next_blocklist_length);
+        FFSWAP(void*, blocklist, next_blocklist);
     }
 }
 
@@ -662,6 +726,9 @@ static int is_intra_more_likely(ERContext *s)
     if (!s->last_pic.f || !s->last_pic.f->data[0])
         return 1; // no previous frame available -> use spatial prediction
 
+    if (s->avctx->error_concealment & FF_EC_FAVOR_INTER)
+        return 0;
+
     undamaged_count = 0;
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
@@ -670,12 +737,15 @@ static int is_intra_more_likely(ERContext *s)
             undamaged_count++;
     }
 
-    if (s->avctx->codec_id == AV_CODEC_ID_H264 && s->ref_count <= 0)
-        return 1;
-
     if (undamaged_count < 5)
         return 0; // almost all MBs damaged -> use temporal prediction
 
+    // prevent dsp.sad() check, that requires access to the image
+    if (CONFIG_XVMC    &&
+        s->avctx->hwaccel && s->avctx->hwaccel->decode_mb &&
+        s->cur_pic.f->pict_type == AV_PICTURE_TYPE_I)
+        return 1;
+
     skip_amount     = FFMAX(undamaged_count / 50, 1); // check only up to 50 MBs
     is_intra_likely = 0;
 
@@ -708,6 +778,7 @@ static int is_intra_more_likely(ERContext *s)
                 }
                 is_intra_likely += s->mecc.sad[0](NULL, last_mb_ptr, mb_ptr,
                                                   linesize[0], 16);
+                // FIXME need await_progress() here
                 is_intra_likely -= s->mecc.sad[0](NULL, last_mb_ptr,
                                                   last_mb_ptr + linesize[0] * 16,
                                                   linesize[0], 16);
@@ -719,6 +790,7 @@ static int is_intra_more_likely(ERContext *s)
             }
         }
     }
+//      av_log(NULL, AV_LOG_ERROR, "is_intra_likely: %d type:%d\n", is_intra_likely, s->pict_type);
     return is_intra_likely > 0;
 }
 
@@ -734,10 +806,20 @@ void ff_er_frame_start(ERContext *s)
 
     memset(s->error_status_table, ER_MB_ERROR | VP_START | ER_MB_END,
            s->mb_stride * s->mb_height * sizeof(uint8_t));
-    s->error_count    = 3 * s->mb_num;
+    atomic_init(&s->error_count, 3 * s->mb_num);
     s->error_occurred = 0;
 }
 
+static int er_supported(ERContext *s)
+{
+    if(s->avctx->hwaccel && s->avctx->hwaccel->decode_slice           ||
+       !s->cur_pic.f                                                  ||
+       s->cur_pic.field_picture
+    )
+        return 0;
+    return 1;
+}
+
 /**
  * Add a slice.
  * @param endx   x component of the last macroblock, can be -1
@@ -754,7 +836,7 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     const int end_xy   = s->mb_index2xy[end_i];
     int mask           = -1;
 
-    if (s->avctx->hwaccel)
+    if (s->avctx->hwaccel && s->avctx->hwaccel->decode_slice)
         return;
 
     if (start_i > end_i || start_xy > end_xy) {
@@ -769,20 +851,20 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     mask &= ~VP_START;
     if (status & (ER_AC_ERROR | ER_AC_END)) {
         mask           &= ~(ER_AC_ERROR | ER_AC_END);
-        s->error_count -= end_i - start_i + 1;
+        atomic_fetch_add(&s->error_count, start_i - end_i - 1);
     }
     if (status & (ER_DC_ERROR | ER_DC_END)) {
         mask           &= ~(ER_DC_ERROR | ER_DC_END);
-        s->error_count -= end_i - start_i + 1;
+        atomic_fetch_add(&s->error_count, start_i - end_i - 1);
     }
     if (status & (ER_MV_ERROR | ER_MV_END)) {
         mask           &= ~(ER_MV_ERROR | ER_MV_END);
-        s->error_count -= end_i - start_i + 1;
+        atomic_fetch_add(&s->error_count, start_i - end_i - 1);
     }
 
     if (status & ER_MB_ERROR) {
         s->error_occurred = 1;
-        s->error_count    = INT_MAX;
+        atomic_store(&s->error_count, INT_MAX);
     }
 
     if (mask == ~0x7F) {
@@ -795,7 +877,7 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     }
 
     if (end_i == s->mb_num)
-        s->error_count = INT_MAX;
+        atomic_store(&s->error_count, INT_MAX);
     else {
         s->error_status_table[end_xy] &= mask;
         s->error_status_table[end_xy] |= status;
@@ -803,41 +885,92 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
 
     s->error_status_table[start_xy] |= VP_START;
 
-    if (start_xy > 0 && s->avctx->thread_count <= 1 &&
-        s->avctx->skip_top * s->mb_width < start_i) {
+    if (start_xy > 0 && !(s->avctx->active_thread_type & FF_THREAD_SLICE) &&
+        er_supported(s) && s->avctx->skip_top * s->mb_width < start_i) {
         int prev_status = s->error_status_table[s->mb_index2xy[start_i - 1]];
 
         prev_status &= ~ VP_START;
-        if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END))
-            s->error_count = INT_MAX;
+        if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END)) {
+            s->error_occurred = 1;
+            atomic_store(&s->error_count, INT_MAX);
+        }
     }
 }
 
 void ff_er_frame_end(ERContext *s)
 {
-    int *linesize = s->cur_pic.f->linesize;
+    int *linesize = NULL;
     int i, mb_x, mb_y, error, error_type, dc_error, mv_error, ac_error;
     int distance;
     int threshold_part[4] = { 100, 100, 100 };
     int threshold = 50;
     int is_intra_likely;
+    int size = s->b8_stride * 2 * s->mb_height;
 
     /* We do not support ER of field pictures yet,
      * though it should not crash if enabled. */
-    if (!s->avctx->error_concealment || s->error_count == 0            ||
-        s->avctx->hwaccel                                              ||
-        !s->cur_pic.f                                                  ||
-        s->cur_pic.field_picture                                       ||
-        s->error_count == 3 * s->mb_width *
+    if (!s->avctx->error_concealment || !atomic_load(&s->error_count)  ||
+        s->avctx->lowres                                               ||
+        !er_supported(s)                                               ||
+        atomic_load(&s->error_count) == 3 * s->mb_width *
                           (s->avctx->skip_top + s->avctx->skip_bottom)) {
         return;
-    };
+    }
+    linesize = s->cur_pic.f->linesize;
+    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+        int status = s->error_status_table[mb_x + (s->mb_height - 1) * s->mb_stride];
+        if (status != 0x7F)
+            break;
+    }
 
-    if (!s->cur_pic.motion_val[0] || !s->cur_pic.ref_index[0]) {
-        av_log(s->avctx, AV_LOG_ERROR, "MVs not available, ER not possible.\n");
+    if (   mb_x == s->mb_width
+        && s->avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO
+        && (FFALIGN(s->avctx->height, 16)&16)
+        && atomic_load(&s->error_count) == 3 * s->mb_width * (s->avctx->skip_top + s->avctx->skip_bottom + 1)
+    ) {
+        av_log(s->avctx, AV_LOG_DEBUG, "ignoring last missing slice\n");
         return;
     }
 
+    if (s->last_pic.f) {
+        if (s->last_pic.f->width  != s->cur_pic.f->width  ||
+            s->last_pic.f->height != s->cur_pic.f->height ||
+            s->last_pic.f->format != s->cur_pic.f->format) {
+            av_log(s->avctx, AV_LOG_WARNING, "Cannot use previous picture in error concealment\n");
+            memset(&s->last_pic, 0, sizeof(s->last_pic));
+        }
+    }
+    if (s->next_pic.f) {
+        if (s->next_pic.f->width  != s->cur_pic.f->width  ||
+            s->next_pic.f->height != s->cur_pic.f->height ||
+            s->next_pic.f->format != s->cur_pic.f->format) {
+            av_log(s->avctx, AV_LOG_WARNING, "Cannot use next picture in error concealment\n");
+            memset(&s->next_pic, 0, sizeof(s->next_pic));
+        }
+    }
+
+    if (!s->cur_pic.motion_val[0] || !s->cur_pic.ref_index[0]) {
+        av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
+
+        for (i = 0; i < 2; i++) {
+            s->ref_index_buf[i]  = av_buffer_allocz(s->mb_stride * s->mb_height * 4 * sizeof(uint8_t));
+            s->motion_val_buf[i] = av_buffer_allocz((size + 4) * 2 * sizeof(uint16_t));
+            if (!s->ref_index_buf[i] || !s->motion_val_buf[i])
+                break;
+            s->cur_pic.ref_index[i]  = s->ref_index_buf[i]->data;
+            s->cur_pic.motion_val[i] = (int16_t (*)[2])s->motion_val_buf[i]->data + 4;
+        }
+        if (i < 2) {
+            for (i = 0; i < 2; i++) {
+                av_buffer_unref(&s->ref_index_buf[i]);
+                av_buffer_unref(&s->motion_val_buf[i]);
+                s->cur_pic.ref_index[i]  = NULL;
+                s->cur_pic.motion_val[i] = NULL;
+            }
+            return;
+        }
+    }
+
     if (s->avctx->debug & FF_DEBUG_ER) {
         for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -849,6 +982,7 @@ void ff_er_frame_end(ERContext *s)
         }
     }
 
+#if 1
     /* handle overlapping slices */
     for (error_type = 1; error_type <= 3; error_type++) {
         int end_ok = 0;
@@ -869,7 +1003,8 @@ void ff_er_frame_end(ERContext *s)
                 end_ok = 0;
         }
     }
-
+#endif
+#if 1
     /* handle slices with partitions of different length */
     if (s->partitioned_frame) {
         int end_ok = 0;
@@ -892,7 +1027,7 @@ void ff_er_frame_end(ERContext *s)
                 end_ok = 0;
         }
     }
-
+#endif
     /* handle missing slices */
     if (s->avctx->err_recognition & AV_EF_EXPLODE) {
         int end_ok = 1;
@@ -919,6 +1054,7 @@ void ff_er_frame_end(ERContext *s)
         }
     }
 
+#if 1
     /* backward mark errors */
     distance = 9999999;
     for (error_type = 1; error_type <= 3; error_type++) {
@@ -926,7 +1062,7 @@ void ff_er_frame_end(ERContext *s)
             const int mb_xy = s->mb_index2xy[i];
             int       error = s->error_status_table[mb_xy];
 
-            if (s->mbskip_table && !s->mbskip_table[mb_xy]) // FIXME partition specific
+            if (!s->mbskip_table || !s->mbskip_table[mb_xy]) // FIXME partition specific
                 distance++;
             if (error & (1 << error_type))
                 distance = 0;
@@ -943,6 +1079,7 @@ void ff_er_frame_end(ERContext *s)
                 distance = 9999999;
         }
     }
+#endif
 
     /* forward mark errors */
     error = 0;
@@ -957,22 +1094,23 @@ void ff_er_frame_end(ERContext *s)
             s->error_status_table[mb_xy] |= error;
         }
     }
-
+#if 1
     /* handle not partitioned case */
     if (!s->partitioned_frame) {
         for (i = 0; i < s->mb_num; i++) {
             const int mb_xy = s->mb_index2xy[i];
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
             if (error & ER_MB_ERROR)
                 error |= ER_MB_ERROR;
             s->error_status_table[mb_xy] = error;
         }
     }
+#endif
 
     dc_error = ac_error = mv_error = 0;
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
-        error = s->error_status_table[mb_xy];
+        int error = s->error_status_table[mb_xy];
         if (error & ER_DC_ERROR)
             dc_error++;
         if (error & ER_AC_ERROR)
@@ -980,15 +1118,15 @@ void ff_er_frame_end(ERContext *s)
         if (error & ER_MV_ERROR)
             mv_error++;
     }
-    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors\n",
-           dc_error, ac_error, mv_error);
+    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors in %c frame\n",
+           dc_error, ac_error, mv_error, av_get_picture_type_char(s->cur_pic.f->pict_type));
 
     is_intra_likely = is_intra_more_likely(s);
 
     /* set unknown mb-type to most likely */
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
-        error = s->error_status_table[mb_xy];
+        int error = s->error_status_table[mb_xy];
         if (!((error & ER_DC_ERROR) && (error & ER_MV_ERROR)))
             continue;
 
@@ -1016,7 +1154,7 @@ void ff_er_frame_end(ERContext *s)
             const int mv_dir  = dir ? MV_DIR_BACKWARD : MV_DIR_FORWARD;
             int mv_type;
 
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
 
             if (IS_INTRA(mb_type))
                 continue; // intra
@@ -1053,7 +1191,7 @@ void ff_er_frame_end(ERContext *s)
                 const int mb_type = s->cur_pic.mb_type[mb_xy];
                 int mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
 
-                error = s->error_status_table[mb_xy];
+                int error = s->error_status_table[mb_xy];
 
                 if (IS_INTRA(mb_type))
                     continue;
@@ -1071,6 +1209,7 @@ void ff_er_frame_end(ERContext *s)
                     int time_pp = s->pp_time;
                     int time_pb = s->pb_time;
 
+                    av_assert0(s->avctx->codec_id != AV_CODEC_ID_H264);
                     ff_thread_await_progress(s->next_pic.tf, mb_y, 0);
 
                     s->mv[0][0][0] = s->next_pic.motion_val[0][xy][0] *  time_pb            / time_pp;
@@ -1091,6 +1230,9 @@ void ff_er_frame_end(ERContext *s)
     } else
         guess_mv(s);
 
+    /* the filters below manipulate raw image, skip them */
+    if (CONFIG_XVMC && s->avctx->hwaccel && s->avctx->hwaccel->decode_mb)
+        goto ec_clean;
     /* fill DC for inter blocks */
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -1100,7 +1242,7 @@ void ff_er_frame_end(ERContext *s)
             const int mb_xy   = mb_x + mb_y * s->mb_stride;
             const int mb_type = s->cur_pic.mb_type[mb_xy];
 
-            error = s->error_status_table[mb_xy];
+            // error = s->error_status_table[mb_xy];
 
             if (IS_INTRA(mb_type) && s->partitioned_frame)
                 continue;
@@ -1123,6 +1265,9 @@ void ff_er_frame_end(ERContext *s)
                 dc_ptr[(n & 1) + (n >> 1) * s->b8_stride] = (dc + 4) >> 3;
             }
 
+            if (!s->cur_pic.f->data[2])
+                continue;
+
             dcu = dcv = 0;
             for (y = 0; y < 8; y++) {
                 int x;
@@ -1135,15 +1280,17 @@ void ff_er_frame_end(ERContext *s)
             s->dc_val[2][mb_x + mb_y * s->mb_stride] = (dcv + 4) >> 3;
         }
     }
-
+#if 1
     /* guess DC for damaged blocks */
-    guess_dc(s, s->dc_val[0], s->mb_width * 2, s->mb_height * 2, s->b8_stride, 1);
-    guess_dc(s, s->dc_val[1], s->mb_width, s->mb_height, s->mb_stride, 0);
-    guess_dc(s, s->dc_val[2], s->mb_width, s->mb_height, s->mb_stride, 0);
+    guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1);
+    guess_dc(s, s->dc_val[1], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+    guess_dc(s, s->dc_val[2], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+#endif
 
     /* filter luma DC */
     filter181(s->dc_val[0], s->mb_width * 2, s->mb_height * 2, s->b8_stride);
 
+#if 1
     /* render DC only intra */
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -1151,7 +1298,7 @@ void ff_er_frame_end(ERContext *s)
             const int mb_xy   = mb_x + mb_y * s->mb_stride;
             const int mb_type = s->cur_pic.mb_type[mb_xy];
 
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
 
             if (IS_INTER(mb_type))
                 continue;
@@ -1161,29 +1308,36 @@ void ff_er_frame_end(ERContext *s)
             dest_y  = s->cur_pic.f->data[0] + mb_x * 16 + mb_y * 16 * linesize[0];
             dest_cb = s->cur_pic.f->data[1] + mb_x *  8 + mb_y *  8 * linesize[1];
             dest_cr = s->cur_pic.f->data[2] + mb_x *  8 + mb_y *  8 * linesize[2];
+            if (!s->cur_pic.f->data[2])
+                dest_cb = dest_cr = NULL;
 
             put_dc(s, dest_y, dest_cb, dest_cr, mb_x, mb_y);
         }
     }
+#endif
 
     if (s->avctx->error_concealment & FF_EC_DEBLOCK) {
         /* filter horizontal block boundaries */
         h_block_filter(s, s->cur_pic.f->data[0], s->mb_width * 2,
                        s->mb_height * 2, linesize[0], 1);
-        h_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
-                       s->mb_height, linesize[1], 0);
-        h_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
-                       s->mb_height, linesize[2], 0);
 
         /* filter vertical block boundaries */
         v_block_filter(s, s->cur_pic.f->data[0], s->mb_width * 2,
                        s->mb_height * 2, linesize[0], 1);
-        v_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
-                       s->mb_height, linesize[1], 0);
-        v_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
-                       s->mb_height, linesize[2], 0);
+
+        if (s->cur_pic.f->data[2]) {
+            h_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
+                        s->mb_height, linesize[1], 0);
+            h_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
+                        s->mb_height, linesize[2], 0);
+            v_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
+                        s->mb_height, linesize[1], 0);
+            v_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
+                        s->mb_height, linesize[2], 0);
+        }
     }
 
+ec_clean:
     /* clean a few tables */
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
@@ -1197,6 +1351,13 @@ void ff_er_frame_end(ERContext *s)
             s->mbintra_table[mb_xy] = 1;
     }
 
+    for (i = 0; i < 2; i++) {
+        av_buffer_unref(&s->ref_index_buf[i]);
+        av_buffer_unref(&s->motion_val_buf[i]);
+        s->cur_pic.ref_index[i]  = NULL;
+        s->cur_pic.motion_val[i] = NULL;
+    }
+
     memset(&s->cur_pic, 0, sizeof(ERPicture));
     memset(&s->last_pic, 0, sizeof(ERPicture));
     memset(&s->next_pic, 0, sizeof(ERPicture));
diff --git a/libavcodec/error_resilience.h b/libavcodec/error_resilience.h
index 1045652..664a765 100644
--- a/libavcodec/error_resilience.h
+++ b/libavcodec/error_resilience.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,6 +20,7 @@
 #define AVCODEC_ERROR_RESILIENCE_H
 
 #include <stdint.h>
+#include <stdatomic.h>
 
 #include "avcodec.h"
 #include "me_cmp.h"
@@ -60,7 +61,8 @@ typedef struct ERContext {
     ptrdiff_t mb_stride;
     ptrdiff_t b8_stride;
 
-    int error_count, error_occurred;
+    atomic_int error_count;
+    int error_occurred;
     uint8_t *error_status_table;
     uint8_t *er_temp_buffer;
     int16_t *dc_val[3];
@@ -72,6 +74,9 @@ typedef struct ERContext {
     ERPicture last_pic;
     ERPicture next_pic;
 
+    AVBufferRef *ref_index_buf[2];
+    AVBufferRef *motion_val_buf[2];
+
     uint16_t pp_time;
     uint16_t pb_time;
     int quarter_sample;
diff --git a/libavcodec/escape124.c b/libavcodec/escape124.c
index 86a63a4..cffd3e1 100644
--- a/libavcodec/escape124.c
+++ b/libavcodec/escape124.c
@@ -2,26 +2,26 @@
  * Escape 124 Video Decoder
  * Copyright (C) 2008 Eli Friedman (eli.friedman@gmail.com)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 typedef union MacroBlock {
@@ -48,11 +48,6 @@ typedef struct Escape124Context {
     CodeBook codebooks[3];
 } Escape124Context;
 
-static int can_safely_read(BitstreamContext *bc, int bits)
-{
-    return bitstream_bits_left(bc) >= bits;
-}
-
 /**
  * Initialize the decoder
  * @param avctx decoder context
@@ -80,20 +75,20 @@ static av_cold int escape124_decode_close(AVCodecContext *avctx)
     Escape124Context *s = avctx->priv_data;
 
     for (i = 0; i < 3; i++)
-        av_free(s->codebooks[i].blocks);
+        av_freep(&s->codebooks[i].blocks);
 
     av_frame_free(&s->frame);
 
     return 0;
 }
 
-static CodeBook unpack_codebook(BitstreamContext *bc, unsigned depth,
+static CodeBook unpack_codebook(GetBitContext* gb, unsigned depth,
                                  unsigned size)
 {
     unsigned i, j;
     CodeBook cb = { 0 };
 
-    if (!can_safely_read(bc, size * 34))
+    if (size >= INT_MAX / 34 || get_bits_left(gb) < size * 34)
         return cb;
 
     if (size >= INT_MAX / sizeof(MacroBlock))
@@ -105,9 +100,9 @@ static CodeBook unpack_codebook(BitstreamContext *bc, unsigned depth,
     cb.depth = depth;
     cb.size = size;
     for (i = 0; i < size; i++) {
-        unsigned mask_bits = bitstream_read(bc,  4);
-        unsigned color0    = bitstream_read(bc, 15);
-        unsigned color1    = bitstream_read(bc, 15);
+        unsigned mask_bits = get_bits(gb, 4);
+        unsigned color0 = get_bits(gb, 15);
+        unsigned color1 = get_bits(gb, 15);
 
         for (j = 0; j < 4; j++) {
             if (mask_bits & (1 << j))
@@ -119,43 +114,47 @@ static CodeBook unpack_codebook(BitstreamContext *bc, unsigned depth,
     return cb;
 }
 
-static unsigned decode_skip_count(BitstreamContext *bc)
+static unsigned decode_skip_count(GetBitContext* gb)
 {
     unsigned value;
     // This function reads a maximum of 23 bits,
     // which is within the padding space
-    if (!can_safely_read(bc, 1))
+    if (get_bits_left(gb) < 1)
         return -1;
-    value = bitstream_read_bit(bc);
+    value = get_bits1(gb);
     if (!value)
         return value;
 
-    value += bitstream_read(bc, 3);
+    value += get_bits(gb, 3);
     if (value != (1 + ((1 << 3) - 1)))
         return value;
 
-    value += bitstream_read(bc, 7);
+    value += get_bits(gb, 7);
     if (value != (1 + ((1 << 3) - 1)) + ((1 << 7) - 1))
         return value;
 
-    return value + bitstream_read(bc, 12);
+    return value + get_bits(gb, 12);
 }
 
-static MacroBlock decode_macroblock(Escape124Context *s, BitstreamContext *bc,
-                                    int *codebook_index, int superblock_index)
+static MacroBlock decode_macroblock(Escape124Context* s, GetBitContext* gb,
+                                    int* codebook_index, int superblock_index)
 {
     // This function reads a maximum of 22 bits; the callers
     // guard this function appropriately
     unsigned block_index, depth;
-    int value = bitstream_read_bit(bc);
+    int value = get_bits1(gb);
     if (value) {
-        static const char transitions[3][2] = { {2, 1}, {0, 2}, {1, 0} };
-        value = bitstream_read_bit(bc);
+        static const int8_t transitions[3][2] = { {2, 1}, {0, 2}, {1, 0} };
+        value = get_bits1(gb);
         *codebook_index = transitions[*codebook_index][value];
     }
 
     depth = s->codebooks[*codebook_index].depth;
-    block_index = bitstream_read(bc, depth);
+
+    // depth = 0 means that this shouldn't read any bits;
+    // in theory, this is the same as get_bits(gb, 0), but
+    // that doesn't actually work.
+    block_index = get_bitsz(gb, depth);
 
     if (*codebook_index == 1) {
         block_index += superblock_index << s->codebooks[1].depth;
@@ -200,12 +199,11 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                                   void *data, int *got_frame,
                                   AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     Escape124Context *s = avctx->priv_data;
     AVFrame *frame = data;
 
-    BitstreamContext bc;
+    GetBitContext gb;
     unsigned frame_flags, frame_size;
     unsigned i;
 
@@ -215,17 +213,23 @@ static int escape124_decode_frame(AVCodecContext *avctx,
 
     uint16_t* old_frame_data, *new_frame_data;
     unsigned old_stride, new_stride;
+
     int ret;
 
-    bitstream_init8(&bc, buf, buf_size);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     // This call also guards the potential depth reads for the
     // codebook unpacking.
-    if (!can_safely_read(&bc, 64))
+    // Check if the amount we will read minimally is available on input.
+    // The 64 represent the immediately next 2 frame_* elements read, the 23/4320
+    // represent a lower bound of the space needed for skipped superblocks. Non
+    // skipped SBs need more space.
+    if (get_bits_left(&gb) < 64 + s->num_superblocks * 23LL / 4320)
         return -1;
 
-    frame_flags = bitstream_read(&bc, 32);
-    frame_size  = bitstream_read(&bc, 32);
+    frame_flags = get_bits_long(&gb, 32);
+    frame_size  = get_bits_long(&gb, 32);
 
     // Leave last frame unchanged
     // FIXME: Is this necessary?  I haven't seen it in any real samples
@@ -233,7 +237,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         if (!s->frame->data[0])
             return AVERROR_INVALIDDATA;
 
-        av_log(NULL, AV_LOG_DEBUG, "Skipping frame\n");
+        av_log(avctx, AV_LOG_DEBUG, "Skipping frame\n");
 
         *got_frame = 1;
         if ((ret = av_frame_ref(frame, s->frame)) < 0)
@@ -248,10 +252,14 @@ static int escape124_decode_frame(AVCodecContext *avctx,
             if (i == 2) {
                 // This codebook can be cut off at places other than
                 // powers of 2, leaving some of the entries undefined.
-                cb_size = bitstream_read(&bc, 20);
+                cb_size = get_bits_long(&gb, 20);
+                if (!cb_size) {
+                    av_log(avctx, AV_LOG_ERROR, "Invalid codebook size 0.\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 cb_depth = av_log2(cb_size - 1) + 1;
             } else {
-                cb_depth = bitstream_read(&bc, 4);
+                cb_depth = get_bits(&gb, 4);
                 if (i == 0) {
                     // This is the most basic codebook: pow(2,depth) entries
                     // for a depth-length key
@@ -263,17 +271,20 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                     cb_size = s->num_superblocks << cb_depth;
                 }
             }
-            av_free(s->codebooks[i].blocks);
-            s->codebooks[i] = unpack_codebook(&bc, cb_depth, cb_size);
+            if (s->num_superblocks >= INT_MAX >> cb_depth) {
+                av_log(avctx, AV_LOG_ERROR, "Depth or num_superblocks are too large\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            av_freep(&s->codebooks[i].blocks);
+            s->codebooks[i] = unpack_codebook(&gb, cb_depth, cb_size);
             if (!s->codebooks[i].blocks)
                 return -1;
         }
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     new_frame_data = (uint16_t*)frame->data[0];
     new_stride = frame->linesize[0] / 2;
@@ -289,7 +300,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         if (skip == -1) {
             // Note that this call will make us skip the rest of the blocks
             // if the frame prematurely ends
-            skip = decode_skip_count(&bc);
+            skip = decode_skip_count(&gb);
         }
 
         if (skip) {
@@ -299,10 +310,10 @@ static int escape124_decode_frame(AVCodecContext *avctx,
             copy_superblock(sb.pixels, 8,
                             old_frame_data, old_stride);
 
-            while (can_safely_read(&bc, 1) && !bitstream_read_bit(&bc)) {
+            while (get_bits_left(&gb) >= 1 && !get_bits1(&gb)) {
                 unsigned mask;
-                mb = decode_macroblock(s, &bc, &cb_index, superblock_index);
-                mask = bitstream_read(&bc, 16);
+                mb = decode_macroblock(s, &gb, &cb_index, superblock_index);
+                mask = get_bits(&gb, 16);
                 multi_mask |= mask;
                 for (i = 0; i < 16; i++) {
                     if (mask & mask_matrix[i]) {
@@ -311,29 +322,27 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                 }
             }
 
-            if (can_safely_read(&bc, 1) && !bitstream_read_bit(&bc)) {
-                unsigned inv_mask = bitstream_read(&bc, 4);
+            if (!get_bits1(&gb)) {
+                unsigned inv_mask = get_bits(&gb, 4);
                 for (i = 0; i < 4; i++) {
                     if (inv_mask & (1 << i)) {
                         multi_mask ^= 0xF << i*4;
                     } else {
-                        multi_mask ^= bitstream_read(&bc, 4) << i * 4;
+                        multi_mask ^= get_bits(&gb, 4) << i*4;
                     }
                 }
 
                 for (i = 0; i < 16; i++) {
                     if (multi_mask & mask_matrix[i]) {
-                        if (!can_safely_read(&bc, 1))
-                            break;
-                        mb = decode_macroblock(s, &bc, &cb_index,
+                        mb = decode_macroblock(s, &gb, &cb_index,
                                                superblock_index);
                         insert_mb_into_sb(&sb, mb, i);
                     }
                 }
             } else if (frame_flags & (1 << 16)) {
-                while (can_safely_read(&bc, 1) && !bitstream_read_bit(&bc)) {
-                    mb = decode_macroblock(s, &bc, &cb_index, superblock_index);
-                    insert_mb_into_sb(&sb, mb, bitstream_read(&bc, 4));
+                while (get_bits_left(&gb) >= 1 && !get_bits1(&gb)) {
+                    mb = decode_macroblock(s, &gb, &cb_index, superblock_index);
+                    insert_mb_into_sb(&sb, mb, get_bits(&gb, 4));
                 }
             }
 
@@ -353,9 +362,9 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         skip--;
     }
 
-    av_log(NULL, AV_LOG_DEBUG,
+    av_log(avctx, AV_LOG_DEBUG,
            "Escape sizes: %i, %i, %i\n",
-           frame_size, buf_size, bitstream_tell(&bc) / 8);
+           frame_size, buf_size, get_bits_count(&gb) / 8);
 
     av_frame_unref(s->frame);
     if ((ret = av_frame_ref(s->frame, frame)) < 0)
diff --git a/libavcodec/escape130.c b/libavcodec/escape130.c
index 0f2fcae..1dd7eed 100644
--- a/libavcodec/escape130.c
+++ b/libavcodec/escape130.c
@@ -2,20 +2,20 @@
  * Escape 130 video decoder
  * Copyright (C) 2008 Eli Friedman (eli.friedman <at> gmail.com)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,7 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 typedef struct Escape130Context {
@@ -163,23 +163,26 @@ static av_cold int escape130_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
-static int decode_skip_count(BitstreamContext *bc)
+static int decode_skip_count(GetBitContext* gb)
 {
     int value;
 
-    value = bitstream_read_bit(bc);
+    if (get_bits_left(gb) < 1+3)
+        return -1;
+
+    value = get_bits1(gb);
     if (value)
         return 0;
 
-    value = bitstream_read(bc, 3);
+    value = get_bits(gb, 3);
     if (value)
         return value;
 
-    value = bitstream_read(bc, 8);
+    value = get_bits(gb, 8);
     if (value)
         return value + 7;
 
-    value = bitstream_read(bc, 15);
+    value = get_bits(gb, 15);
     if (value)
         return value + 262;
 
@@ -189,11 +192,10 @@ static int decode_skip_count(BitstreamContext *bc)
 static int escape130_decode_frame(AVCodecContext *avctx, void *data,
                                   int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf  = avpkt->data;
     int buf_size        = avpkt->size;
     Escape130Context *s = avctx->priv_data;
     AVFrame *pic        = data;
-    BitstreamContext bc;
+    GetBitContext gb;
     int ret;
 
     uint8_t *old_y, *old_cb, *old_cr,
@@ -216,7 +218,9 @@ static int escape130_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
         return ret;
 
-    bitstream_init8(&bc, buf + 16, buf_size - 16);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+    skip_bits_long(&gb, 16 * 8);
 
     new_y  = s->new_y;
     new_cb = s->new_u;
@@ -235,7 +239,7 @@ static int escape130_decode_frame(AVCodecContext *avctx, void *data,
         // Note that this call will make us skip the rest of the blocks
         // if the frame ends prematurely.
         if (skip == -1)
-            skip = decode_skip_count(&bc);
+            skip = decode_skip_count(&gb);
         if (skip == -1) {
             av_log(avctx, AV_LOG_ERROR, "Error decoding skip value\n");
             return AVERROR_INVALIDDATA;
@@ -250,31 +254,31 @@ static int escape130_decode_frame(AVCodecContext *avctx, void *data,
             cb = old_cb[0];
             cr = old_cr[0];
         } else {
-            if (bitstream_read_bit(&bc)) {
-                unsigned sign_selector       = bitstream_read(&bc, 6);
-                unsigned difference_selector = bitstream_read(&bc, 2);
-                y_avg = 2 * bitstream_read(&bc, 5);
+            if (get_bits1(&gb)) {
+                unsigned sign_selector       = get_bits(&gb, 6);
+                unsigned difference_selector = get_bits(&gb, 2);
+                y_avg = 2 * get_bits(&gb, 5);
                 for (i = 0; i < 4; i++) {
                     y[i] = av_clip(y_avg + offset_table[difference_selector] *
                                    sign_table[sign_selector][i], 0, 63);
                 }
-            } else if (bitstream_read_bit(&bc)) {
-                if (bitstream_read_bit(&bc)) {
-                    y_avg = bitstream_read(&bc, 6);
+            } else if (get_bits1(&gb)) {
+                if (get_bits1(&gb)) {
+                    y_avg = get_bits(&gb, 6);
                 } else {
-                    unsigned adjust_index = bitstream_read(&bc, 3);
+                    unsigned adjust_index = get_bits(&gb, 3);
                     y_avg = (y_avg + luma_adjust[adjust_index]) & 63;
                 }
                 for (i = 0; i < 4; i++)
                     y[i] = y_avg;
             }
 
-            if (bitstream_read_bit(&bc)) {
-                if (bitstream_read_bit(&bc)) {
-                    cb = bitstream_read(&bc, 5);
-                    cr = bitstream_read(&bc, 5);
+            if (get_bits1(&gb)) {
+                if (get_bits1(&gb)) {
+                    cb = get_bits(&gb, 5);
+                    cr = get_bits(&gb, 5);
                 } else {
-                    unsigned adjust_index = bitstream_read(&bc, 3);
+                    unsigned adjust_index = get_bits(&gb, 3);
                     cb = (cb + chroma_adjust[0][adjust_index]) & 31;
                     cr = (cr + chroma_adjust[1][adjust_index]) & 31;
                 }
@@ -333,7 +337,7 @@ static int escape130_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     ff_dlog(avctx, "Frame data: provided %d bytes, used %d bytes\n",
-            buf_size, bitstream_tell(&bc) >> 3);
+            buf_size, get_bits_count(&gb) >> 3);
 
     FFSWAP(uint8_t*, s->old_y, s->new_y);
     FFSWAP(uint8_t*, s->old_u, s->new_u);
diff --git a/libavcodec/evrcdata.h b/libavcodec/evrcdata.h
new file mode 100644
index 0000000..8cfc202
--- /dev/null
+++ b/libavcodec/evrcdata.h
@@ -0,0 +1,1499 @@
+/*
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_EVRCDATA_H
+#define AVCODEC_EVRCDATA_H
+
+/**
+ * @file
+ * Data tables for the EVRC decoder
+ * @author Paul B Mahol
+ */
+
+#include "libavutil/common.h"
+
+/**
+ * Rate 1/8 frame energy quantization
+ *
+ * TIA/IS-127 table 8-18
+ */
+static const float evrc_energy_quant[][3] = {
+{-0.2464E-01,-0.4005E-02,-0.1107E+00 }, { 0.8734E+00, 0.1004E+01, 0.9930E+00 },
+{ 0.4222E+00, 0.3894E+00, 0.5020E+00 }, { 0.1450E+01, 0.1328E+01, 0.1278E+01 },
+{ 0.1957E+00, 0.2169E+00, 0.2735E+00 }, { 0.1142E+01, 0.1240E+01, 0.1157E+01 },
+{ 0.7881E+00, 0.6778E+00, 0.4185E+00 }, { 0.1504E+01, 0.1468E+01, 0.1534E+01 },
+{ 0.3173E+00, 0.2693E+00,-0.9526E-01 }, { 0.1141E+01, 0.1154E+01, 0.1044E+01 },
+{ 0.5147E+00, 0.5784E+00, 0.8802E+00 }, { 0.1502E+01, 0.1407E+01, 0.1409E+01 },
+{ 0.3163E+00, 0.3592E+00, 0.2830E+00 }, { 0.1217E+01, 0.1213E+01, 0.1216E+01 },
+{ 0.1023E+01, 0.1139E+01,-0.9526E-01 }, { 0.1619E+01, 0.1655E+01, 0.1642E+01 },
+{ 0.1437E+00, 0.1505E+00, 0.6838E-01 }, { 0.9794E+00, 0.1021E+01, 0.1117E+01 },
+{ 0.4701E+00, 0.6426E+00, 0.5519E+00 }, { 0.1366E+01, 0.1397E+01, 0.1406E+01 },
+{ 0.2918E+00, 0.3022E+00, 0.2420E+00 }, { 0.1309E+01, 0.1241E+01, 0.1220E+01 },
+{ 0.7989E+00, 0.7654E+00, 0.7391E+00 }, { 0.1612E+01, 0.1502E+01, 0.1447E+01 },
+{ 0.2594E+00, 0.1948E+00, 0.2555E+00 }, { 0.1091E+01, 0.1150E+01, 0.1272E+01 },
+{ 0.3423E+00, 0.4150E+00, 0.1294E+01 }, { 0.1729E+01, 0.1377E+01, 0.1065E+01 },
+{ 0.4103E+00, 0.3287E+00, 0.3228E+00 }, { 0.1144E+01, 0.1281E+01, 0.1416E+01 },
+{ 0.1047E+01, 0.1117E+01, 0.6188E+00 }, { 0.1914E+01, 0.1777E+01, 0.1516E+01 },
+{-0.2117E-01, 0.2159E+00, 0.2351E+00 }, { 0.1093E+01, 0.1088E+01, 0.1026E+01 },
+{ 0.5567E+00, 0.5092E+00, 0.4654E+00 }, { 0.1510E+01, 0.1449E+01, 0.1201E+01 },
+{ 0.2362E+00, 0.3426E+00, 0.2549E+00 }, { 0.1340E+01, 0.1225E+01, 0.1117E+01 },
+{ 0.1203E+01, 0.3819E+00, 0.2269E+00 }, { 0.1373E+01, 0.1404E+01, 0.1830E+01 },
+{ 0.2570E+00, 0.2668E+00, 0.1636E+00 }, { 0.1219E+01, 0.1098E+01, 0.1122E+01 },
+{ 0.6985E+00, 0.8456E+00, 0.1069E+01 }, { 0.1550E+01, 0.1501E+01, 0.1388E+01 },
+{ 0.2870E+00, 0.3060E+00, 0.3599E+00 }, { 0.1178E+01, 0.1345E+01, 0.1302E+01 },
+{ 0.1270E+01, 0.1215E+01, 0.1812E+00 }, { 0.1725E+01, 0.1777E+01, 0.1693E+01 },
+{ 0.2074E+00, 0.2104E+00, 0.1539E+00 }, { 0.1105E+01, 0.1034E+01, 0.1104E+01 },
+{ 0.6683E+00, 0.6646E+00, 0.6639E+00 }, { 0.1403E+01, 0.1462E+01, 0.1435E+01 },
+{ 0.3389E+00, 0.3754E+00, 0.2150E+00 }, { 0.1288E+01, 0.1325E+01, 0.1257E+01 },
+{ 0.8933E+00, 0.8253E+00, 0.8133E+00 }, { 0.1555E+01, 0.1579E+01, 0.1565E+01 },
+{ 0.3264E+00, 0.2434E+00, 0.2852E+00 }, { 0.1242E+01, 0.1180E+01, 0.1202E+01 },
+{ 0.1314E+00, 0.1698E+00, 0.1646E+01 }, { 0.1797E+01, 0.1597E+01, 0.1241E+01 },
+{ 0.4721E+00, 0.5346E+00, 0.3066E+00 }, { 0.1274E+01, 0.1401E+01, 0.1351E+01 },
+{ 0.1455E+01, 0.1386E+01, 0.6430E+00 }, { 0.1828E+01, 0.1867E+01, 0.1825E+01 },
+{-0.3265E+00,-0.2956E+00,-0.2462E+00 }, { 0.1035E+01, 0.1020E+01, 0.1003E+01 },
+{ 0.3702E+00, 0.4307E+00, 0.7072E+00 }, { 0.1424E+01, 0.1345E+01, 0.1352E+01 },
+{ 0.2267E+00, 0.2680E+00, 0.3037E+00 }, { 0.1235E+01, 0.1249E+01, 0.1146E+01 },
+{ 0.9944E+00, 0.6485E+00, 0.5248E+00 }, { 0.1539E+01, 0.1492E+01, 0.1612E+01 },
+{ 0.3815E+00, 0.3360E+00,-0.9526E-01 }, { 0.1163E+01, 0.1144E+01, 0.1117E+01 },
+{ 0.6734E+00, 0.7656E+00, 0.1014E+01 }, { 0.1568E+01, 0.1438E+01, 0.1455E+01 },
+{ 0.3409E+00, 0.3317E+00, 0.3856E+00 }, { 0.1180E+01, 0.1284E+01, 0.1284E+01 },
+{ 0.1244E+01, 0.1214E+01,-0.9526E-01 }, { 0.1753E+01, 0.1598E+01, 0.1744E+01 },
+{ 0.1548E+00, 0.1388E+00, 0.2020E+00 }, { 0.1027E+01, 0.1133E+01, 0.1093E+01 },
+{ 0.3906E+00, 0.7505E+00, 0.5705E+00 }, { 0.1420E+01, 0.1357E+01, 0.1543E+01 },
+{ 0.3252E+00, 0.3136E+00, 0.2804E+00 }, { 0.1351E+01, 0.1309E+01, 0.1224E+01 },
+{ 0.8781E+00, 0.8095E+00, 0.7109E+00 }, { 0.1614E+01, 0.1580E+01, 0.1433E+01 },
+{ 0.3222E+00, 0.2298E+00, 0.2157E+00 }, { 0.1216E+01, 0.1077E+01, 0.1247E+01 },
+{ 0.1363E+01, 0.1280E+01, 0.1317E+01 }, { 0.1751E+01, 0.1457E+01, 0.1182E+01 },
+{ 0.4428E+00, 0.4082E+00, 0.3181E+00 }, { 0.1157E+01, 0.1227E+01, 0.1604E+01 },
+{ 0.1286E+01, 0.1268E+01, 0.8167E+00 }, { 0.1994E+01, 0.2018E+01, 0.1307E+01 },
+{ 0.2671E-01, 0.2594E+00, 0.3397E+00 }, { 0.1164E+01, 0.1080E+01, 0.9321E+00 },
+{ 0.5998E+00, 0.6076E+00, 0.5081E+00 }, { 0.1442E+01, 0.1442E+01, 0.1375E+01 },
+{ 0.2390E+00, 0.3554E+00, 0.3426E+00 }, { 0.1287E+01, 0.1307E+01, 0.1144E+01 },
+{ 0.1200E+01, 0.7495E+00, 0.3967E+00 }, { 0.1561E+01, 0.1517E+01, 0.1898E+01 },
+{ 0.3598E+00, 0.3463E+00, 0.1200E+00 }, { 0.1298E+01, 0.1125E+01, 0.1062E+01 },
+{ 0.7577E+00, 0.1013E+01, 0.1194E+01 }, { 0.1537E+01, 0.1513E+01, 0.1464E+01 },
+{ 0.4041E+00, 0.4038E+00, 0.3897E+00 }, { 0.1293E+01, 0.1219E+01, 0.1378E+01 },
+{ 0.1250E+01, 0.1391E+01, 0.2451E+00 }, { 0.1558E+01, 0.1764E+01, 0.1728E+01 },
+{ 0.2700E+00, 0.1894E+00, 0.1924E+00 }, { 0.1111E+01, 0.1112E+01, 0.1173E+01 },
+{ 0.7579E+00, 0.8342E+00, 0.4781E+00 }, { 0.1464E+01, 0.1477E+01, 0.1469E+01 },
+{ 0.4001E+00, 0.3104E+00, 0.2217E+00 }, { 0.1346E+01, 0.1421E+01, 0.1312E+01 },
+{ 0.1071E+01, 0.8967E+00, 0.7511E+00 }, { 0.1616E+01, 0.1551E+01, 0.1574E+01 },
+{ 0.3329E+00, 0.2785E+00, 0.3140E+00 }, { 0.1281E+01, 0.1209E+01, 0.1239E+01 },
+{ 0.2805E+00, 0.2687E+00, 0.1646E+01 }, { 0.1814E+01, 0.1514E+01, 0.1510E+01 },
+{ 0.6231E+00, 0.4200E+00, 0.3701E+00 }, { 0.1255E+01, 0.1429E+01, 0.1454E+01 },
+{ 0.1642E+01, 0.1581E+01, 0.7112E+00 }, { 0.1844E+01, 0.1963E+01, 0.1895E+01 },
+{-0.4208E-01,-0.1491E+00,-0.7639E-01 }, { 0.1046E+01, 0.9598E+00, 0.9176E+00 },
+{ 0.4478E+00, 0.4605E+00, 0.5111E+00 }, { 0.1521E+01, 0.1292E+01, 0.1342E+01 },
+{ 0.2220E+00, 0.2549E+00, 0.2510E+00 }, { 0.1186E+01, 0.1254E+01, 0.1171E+01 },
+{ 0.8999E+00, 0.4960E+00, 0.4943E+00 }, { 0.1423E+01, 0.1484E+01, 0.1620E+01 },
+{ 0.2796E+00, 0.2778E+00,-0.2820E+00 }, { 0.1170E+01, 0.1181E+01, 0.1076E+01 },
+{ 0.4068E+00, 0.8541E+00, 0.9352E+00 }, { 0.1584E+01, 0.1416E+01, 0.1387E+01 },
+{ 0.3325E+00, 0.3655E+00, 0.3340E+00 }, { 0.1224E+01, 0.1257E+01, 0.1245E+01 },
+{ 0.1061E+01, 0.1138E+01,-0.9526E-01 }, { 0.1681E+01, 0.1704E+01, 0.1673E+01 },
+{ 0.1932E+00, 0.1489E+00, 0.1258E+00 }, { 0.1023E+01, 0.1088E+01, 0.1145E+01 },
+{ 0.5190E+00, 0.6873E+00, 0.5172E+00 }, { 0.1380E+01, 0.1405E+01, 0.1474E+01 },
+{ 0.3393E+00, 0.3100E+00, 0.2231E+00 }, { 0.1354E+01, 0.1249E+01, 0.1270E+01 },
+{ 0.7363E+00, 0.8508E+00, 0.8247E+00 }, { 0.1612E+01, 0.1537E+01, 0.1509E+01 },
+{ 0.2952E+00, 0.2053E+00, 0.2590E+00 }, { 0.1138E+01, 0.1219E+01, 0.1262E+01 },
+{ 0.1345E+01, 0.1289E+01, 0.1338E+01 }, { 0.1437E+01, 0.1360E+01, 0.1442E+01 },
+{ 0.4826E+00, 0.3298E+00, 0.3842E+00 }, { 0.1219E+01, 0.1311E+01, 0.1413E+01 },
+{ 0.1212E+01, 0.1186E+01, 0.6357E+00 }, { 0.1873E+01, 0.1939E+01, 0.1674E+01 },
+{ 0.1260E+01, 0.1306E+01, 0.1368E+01 }, { 0.1146E+01, 0.1077E+01, 0.1025E+01 },
+{ 0.6029E+00, 0.5039E+00, 0.5781E+00 }, { 0.1514E+01, 0.1420E+01, 0.1324E+01 },
+{ 0.2652E+00, 0.3192E+00, 0.3042E+00 }, { 0.1368E+01, 0.1198E+01, 0.1200E+01 },
+{ 0.1234E+01, 0.4910E+00, 0.3464E-01 }, { 0.1347E+01, 0.1560E+01, 0.1861E+01 },
+{ 0.2766E+00, 0.2887E+00, 0.2029E+00 }, { 0.1257E+01, 0.1105E+01, 0.1145E+01 },
+{ 0.1351E+01, 0.1353E+01, 0.1406E+01 }, { 0.1506E+01, 0.1580E+01, 0.1362E+01 },
+{ 0.2794E+00, 0.3868E+00, 0.4277E+00 }, { 0.1234E+01, 0.1334E+01, 0.1336E+01 },
+{ 0.1280E+01, 0.1252E+01, 0.1805E+00 }, { 0.1387E+01, 0.1396E+01, 0.1434E+01 },
+{ 0.2902E+00, 0.1170E+00, 0.1698E+00 }, { 0.1134E+01, 0.1077E+01, 0.1117E+01 },
+{ 0.6986E+00, 0.7177E+00, 0.7366E+00 }, { 0.1370E+01, 0.1491E+01, 0.1495E+01 },
+{ 0.4031E+00, 0.5144E+00, 0.1751E+00 }, { 0.1333E+01, 0.1377E+01, 0.1257E+01 },
+{ 0.9212E+00, 0.8934E+00, 0.8897E+00 }, { 0.1589E+01, 0.1614E+01, 0.1523E+01 },
+{ 0.3152E+00, 0.2164E+00, 0.3230E+00 }, { 0.1300E+01, 0.1145E+01, 0.1212E+01 },
+{ 0.1269E+01, 0.1245E+01, 0.1497E+01 }, { 0.1763E+01, 0.1716E+01, 0.1311E+01 },
+{ 0.4702E+00, 0.5422E+00, 0.4306E+00 }, { 0.1342E+01, 0.1433E+01, 0.1423E+01 },
+{ 0.1472E+01, 0.1404E+01, 0.8371E+00 }, { 0.1936E+01, 0.1883E+01, 0.1838E+01 },
+{ 0.1266E+01, 0.1295E+01, 0.1302E+01 }, { 0.1074E+01, 0.1002E+01, 0.1023E+01 },
+{ 0.5206E+00, 0.4045E+00, 0.6549E+00 }, { 0.1457E+01, 0.1378E+01, 0.1363E+01 },
+{ 0.2715E+00, 0.2629E+00, 0.2841E+00 }, { 0.1264E+01, 0.1271E+01, 0.1175E+01 },
+{ 0.1337E+01, 0.1305E+01, 0.1306E+01 }, { 0.1555E+01, 0.1571E+01, 0.1657E+01 },
+{ 0.3341E+00, 0.4147E+00,-0.3648E+00 }, { 0.1188E+01, 0.1185E+01, 0.1161E+01 },
+{ 0.6198E+00, 0.7208E+00, 0.1157E+01 }, { 0.1582E+01, 0.1465E+01, 0.1513E+01 },
+{ 0.3839E+00, 0.3651E+00, 0.3814E+00 }, { 0.1214E+01, 0.1256E+01, 0.1292E+01 },
+{ 0.1361E+01, 0.1363E+01, 0.1312E+01 }, { 0.1793E+01, 0.1693E+01, 0.1669E+01 },
+{ 0.1889E+00, 0.1275E+00, 0.2534E+00 }, { 0.1066E+01, 0.1174E+01, 0.1133E+01 },
+{ 0.4999E+00, 0.8207E+00, 0.5813E+00 }, { 0.1478E+01, 0.1416E+01, 0.1497E+01 },
+{ 0.3814E+00, 0.3138E+00, 0.2889E+00 }, { 0.1396E+01, 0.1265E+01, 0.1233E+01 },
+{ 0.9458E+00, 0.9161E+00, 0.5875E+00 }, { 0.1672E+01, 0.1632E+01, 0.1553E+01 },
+{ 0.3505E+00, 0.2525E+00, 0.2364E+00 }, { 0.1211E+01, 0.1138E+01, 0.1235E+01 },
+{ 0.1391E+01, 0.1231E+01, 0.1355E+01 }, { 0.1783E+01, 0.1510E+01, 0.1199E+01 },
+{ 0.4227E+00, 0.4548E+00, 0.3671E+00 }, { 0.1281E+01, 0.1254E+01, 0.1661E+01 },
+{ 0.1338E+01, 0.1379E+01, 0.9531E+00 }, { 0.2148E+01, 0.1965E+01, 0.1584E+01 },
+{ 0.9324E-01, 0.3575E+00, 0.3522E+00 }, { 0.1212E+01, 0.1086E+01, 0.1044E+01 },
+{ 0.6128E+00, 0.6136E+00, 0.6060E+00 }, { 0.1484E+01, 0.1507E+01, 0.1396E+01 },
+{ 0.2820E+00, 0.3848E+00, 0.3156E+00 }, { 0.1368E+01, 0.1287E+01, 0.1128E+01 },
+{ 0.1369E+01, 0.1352E+01, 0.1358E+01 }, { 0.1381E+01, 0.1765E+01, 0.2113E+01 },
+{ 0.1314E+01, 0.1345E+01, 0.1334E+01 }, { 0.1290E+01, 0.1172E+01, 0.1119E+01 },
+{ 0.1304E+01, 0.1377E+01, 0.1427E+01 }, { 0.1490E+01, 0.1540E+01, 0.1536E+01 },
+{ 0.3994E+00, 0.4402E+00, 0.4173E+00 }, { 0.1323E+01, 0.1307E+01, 0.1392E+01 },
+{ 0.1400E+01, 0.1388E+01, 0.1369E+01 }, { 0.1669E+01, 0.1818E+01, 0.1834E+01 },
+{ 0.2742E+00, 0.2235E+00, 0.1986E+00 }, { 0.1137E+01, 0.1139E+01, 0.1201E+01 },
+{ 0.1324E+01, 0.1385E+01, 0.1349E+01 }, { 0.1455E+01, 0.1574E+01, 0.1454E+01 },
+{ 0.5019E+00, 0.3255E+00, 0.2555E+00 }, { 0.1388E+01, 0.1438E+01, 0.1300E+01 },
+{ 0.1394E+01, 0.1349E+01, 0.1411E+01 }, { 0.1639E+01, 0.1580E+01, 0.1681E+01 },
+{ 0.3920E+00, 0.2498E+00, 0.3523E+00 }, { 0.1301E+01, 0.1221E+01, 0.1285E+01 },
+{ 0.1318E+01, 0.1342E+01, 0.1494E+01 }, { 0.1910E+01, 0.1680E+01, 0.1470E+01 },
+{ 0.6082E+00, 0.5270E+00, 0.4173E+00 }, { 0.1255E+01, 0.1477E+01, 0.1503E+01 },
+{ 0.1807E+01, 0.1742E+01, 0.6553E+00 }, { 0.2000E+01, 0.2072E+01, 0.2051E+01 }};
+
+/**
+ * LSP vector quantization tables
+ *
+ * TIA/IS-127 tables 8-1 through 8-9
+ */
+
+static const float evrc_lspq_full_codebook1[64][2] = {
+{1.42016308E-2, 1.93881616E-2}, {2.91667543E-2, 6.51749149E-2},
+{2.06693150E-2, 4.97564934E-2}, {3.94719802E-2, 9.55850929E-2},
+{2.27012448E-2, 3.96625809E-2}, {5.38789518E-2, 6.28347769E-2},
+{2.90525518E-2, 5.73435798E-2}, {4.48280610E-2, 1.15364626E-1},
+{1.94110647E-2, 3.46889682E-2}, {4.37502973E-2, 6.75228462E-2},
+{3.55497338E-2, 4.94086780E-2}, {6.99219853E-2, 8.67279768E-2},
+{2.77880151E-2, 4.65748496E-2}, {5.79111017E-2, 6.74542487E-2},
+{4.74664383E-2, 5.50271496E-2}, {7.88898915E-2, 1.22443043E-1},
+{2.21715886E-2, 3.02628800E-2}, {3.39134485E-2, 7.17703998E-2},
+{3.17989141E-2, 4.98996116E-2}, {6.11555986E-2, 8.73361230E-2},
+{2.67506503E-2, 3.96735854E-2}, {4.44100983E-2, 8.26731324E-2},
+{3.89172547E-2, 5.65788932E-2}, {6.04800619E-2, 1.04536951E-1},
+{2.69156620E-2, 3.57168876E-2}, {4.11117189E-2, 7.33322948E-2},
+{4.12660725E-2, 4.85165231E-2}, {7.18049556E-2, 1.06202349E-1},
+{3.38037871E-2, 4.24300395E-2}, {5.91818243E-2, 7.97467977E-2},
+{4.70107906E-2, 6.28563762E-2}, {9.42011923E-2, 1.30053163E-1},
+{1.94244273E-2, 2.72732340E-2}, {3.70831676E-2, 6.64898157E-2},
+{2.80136354E-2, 5.15984930E-2}, {5.34461029E-2, 9.25904214E-2},
+{2.54959203E-2, 4.32844795E-2}, {5.51860742E-2, 7.36182332E-2},
+{3.39851119E-2, 6.05329126E-2}, {6.18182123E-2, 1.34581268E-1},
+{2.35669166E-2, 3.55242006E-2}, {5.10804243E-2, 6.79562539E-2},
+{3.83464955E-2, 5.23469411E-2}, {7.44275749E-2, 9.66108292E-2},
+{3.18591148E-2, 4.62123118E-2}, {6.18909821E-2, 7.33231753E-2},
+{4.41718437E-2, 5.79240918E-2}, {7.93596208E-2, 1.41177371E-1},
+{2.47412287E-2, 3.23629379E-2}, {3.36563922E-2, 8.04650635E-2},
+{3.37943695E-2, 5.44977151E-2}, {6.53648973E-2, 9.52775925E-2},
+{2.93364152E-2, 4.28411029E-2}, {5.27870469E-2, 8.16159397E-2},
+{4.00724895E-2, 6.18144684E-2}, {6.75848573E-2, 1.17196076E-1},
+{3.03064957E-2, 3.86914052E-2}, {4.83106263E-2, 7.42383003E-2},
+{4.37548272E-2, 5.22842295E-2}, {8.32310021E-2, 1.09881967E-1},
+{3.75600643E-2, 4.53217216E-2}, {6.60113171E-2, 7.97580183E-2},
+{5.03225066E-2, 5.90176322E-2}, {8.77133310E-2, 1.63187444E-1}};
+
+static const float evrc_lspq_full_codebook2[64][2] = {
+{5.21959551E-2, 8.38445649E-2}, {1.05874076E-1, 1.28694162E-1},
+{5.48323877E-2, 1.33842856E-1}, {1.17768474E-1, 1.94037274E-1},
+{5.36086522E-2, 1.11398734E-1}, {1.19989693E-1, 1.47474691E-1},
+{8.00373554E-2, 1.42999724E-1}, {1.64086595E-1, 2.09821835E-1},
+{5.21059223E-2, 9.95229408E-2}, {8.67567956E-2, 1.85966507E-1},
+{7.77341127E-2, 1.31506845E-1}, {1.60545513E-1, 1.81930289E-1},
+{7.42243677E-2, 1.10437103E-1}, {1.18635088E-1, 1.75306752E-1},
+{6.61557764E-2, 1.64441928E-1}, {1.96810856E-1, 2.16682002E-1},
+{6.05317838E-2, 9.45408568E-2}, {1.06271386E-1, 1.48013934E-1},
+{5.87486550E-2, 1.47724584E-1}, {1.34816468E-1, 2.01517954E-1},
+{6.59698322E-2, 1.16447397E-1}, {1.32297173E-1, 1.53267249E-1},
+{9.26660746E-2, 1.46725491E-1}, {1.79285541E-1, 2.19705954E-1},
+{7.06458464E-2, 9.99924466E-2}, {1.06500491E-1, 1.79443434E-1},
+{8.79249722E-2, 1.25287697E-1}, {1.53640196E-1, 1.97852716E-1},
+{8.88430104E-2, 1.12465657E-1}, {1.48286715E-1, 1.67517021E-1},
+{8.16568136E-2, 1.69274017E-1}, {2.07810536E-1, 2.31033549E-1},
+{6.14927970E-2, 8.36263224E-2}, {1.14473253E-1, 1.36779979E-1},
+{6.87129870E-2, 1.38099059E-1}, {1.10511415E-1, 2.15352878E-1},
+{5.55652268E-2, 1.22242786E-1}, {1.20557591E-1, 1.61072448E-1},
+{8.32249671E-2, 1.55475482E-1}, {1.61638483E-1, 2.28268847E-1},
+{6.29152283E-2, 1.06229566E-1}, {8.29186887E-2, 2.06774518E-1},
+{8.84756893E-2, 1.35799959E-1}, {1.69772223E-1, 1.93773940E-1},
+{7.77297840E-2, 1.20287232E-1}, {1.30648017E-1, 1.84331819E-1},
+{6.91939592E-2, 1.84218004E-1}, {2.03904077E-1, 2.49715164E-1},
+{7.07671717E-2, 9.03186128E-2}, {1.08471557E-1, 1.61966518E-1},
+{7.16886371E-2, 1.51093170E-1}, {1.38779536E-1, 2.18801782E-1},
+{6.75907061E-2, 1.26740307E-1}, {1.33412346E-1, 1.68838874E-1},
+{9.61822569E-2, 1.58728704E-1}, {1.86485633E-1, 2.36560926E-1},
+{8.23447108E-2, 1.02126025E-1}, {1.00336641E-1, 1.94918498E-1},
+{9.95981991E-2, 1.36425093E-1}, {1.82448462E-1, 2.03655198E-1},
+{9.78890732E-2, 1.21145472E-1}, {1.45453140E-1, 1.83604524E-1},
+{9.58395451E-2, 1.72194853E-1}, {2.23295853E-1, 2.46418610E-1}};
+
+static const float evrc_lspq_full_codebook3[512][3] = {
+{1.36425778E-1, 1.68651849E-1, 2.04688221E-1},
+{1.85717627E-1, 2.28756160E-1, 2.51958042E-1},
+{1.22760192E-1, 1.85950696E-1, 2.79446691E-1},
+{1.96468458E-1, 2.64484435E-1, 2.89318889E-1},
+{1.25653744E-1, 1.50529265E-1, 2.76144296E-1},
+{1.96301565E-1, 2.41699994E-1, 2.88230687E-1},
+{1.40099391E-1, 2.22365588E-1, 2.74666578E-1},
+{2.59952307E-1, 2.75394946E-1, 3.10975939E-1},
+{1.58452198E-1, 1.88591003E-1, 2.07339197E-1},
+{1.95616230E-1, 2.21379519E-1, 2.87022918E-1},
+{1.69424579E-1, 2.01614648E-1, 2.75669187E-1},
+{2.12393746E-1, 2.64250666E-1, 3.17967504E-1},
+{1.82965085E-1, 1.99547559E-1, 2.29538843E-1},
+{2.15200707E-1, 2.62409419E-1, 2.82432705E-1},
+{1.46404549E-1, 2.36966729E-1, 2.90067106E-1},
+{2.45338634E-1, 3.03358108E-1, 3.42260152E-1},
+{1.37478963E-1, 1.58276558E-1, 2.39217222E-1},
+{2.01999024E-1, 2.20102608E-1, 2.69546896E-1},
+{1.18350029E-1, 2.30206400E-1, 2.83554822E-1},
+{2.25519255E-1, 2.72272140E-1, 3.06072980E-1},
+{1.35661438E-1, 1.91633970E-1, 2.65912026E-1},
+{1.95733085E-1, 2.31926173E-1, 3.14376086E-1},
+{1.67998984E-1, 2.27706313E-1, 2.76947826E-1},
+{2.50170559E-1, 3.01627070E-1, 3.21084231E-1},
+{1.33492306E-1, 2.01223105E-1, 2.33893991E-1},
+{2.06442133E-1, 2.38704175E-1, 2.77560145E-1},
+{1.79048792E-1, 1.95776582E-1, 2.80656606E-1},
+{2.06193641E-1, 2.64055401E-1, 3.33098441E-1},
+{1.75185278E-1, 1.91166341E-1, 2.57540315E-1},
+{2.28398636E-1, 2.45296657E-1, 3.08980793E-1},
+{1.80859819E-1, 2.43579060E-1, 2.96631068E-1},
+{2.76152968E-1, 3.08256060E-1, 3.46822590E-1},
+{1.37115732E-1, 1.80057764E-1, 2.20953465E-1},
+{1.81370094E-1, 2.26770103E-1, 2.70392686E-1},
+{1.25246510E-1, 1.79606944E-1, 3.10376436E-1},
+{1.90708354E-1, 2.87734240E-1, 3.13476235E-1},
+{1.30486086E-1, 1.60435289E-1, 3.00243706E-1},
+{1.97318628E-1, 2.56378502E-1, 2.78474301E-1},
+{1.58597067E-1, 2.37381399E-1, 2.62910336E-1},
+{2.61825919E-1, 2.77717203E-1, 3.31382245E-1},
+{1.64160743E-1, 1.85841531E-1, 2.35615849E-1},
+{2.09486142E-1, 2.21452802E-1, 2.92153865E-1},
+{1.66807845E-1, 2.13641763E-1, 2.70675927E-1},
+{2.29834273E-1, 2.88374633E-1, 3.06238323E-1},
+{1.82154253E-1, 2.00822473E-1, 2.40169376E-1},
+{2.24944726E-1, 2.69813925E-1, 2.91401237E-1},
+{1.63940564E-1, 2.50341147E-1, 2.78307766E-1},
+{2.56727993E-1, 2.95103759E-1, 3.53297085E-1},
+{1.40218839E-1, 1.76687688E-1, 2.46773273E-1},
+{2.15291306E-1, 2.29216009E-1, 2.64283627E-1},
+{1.21002659E-1, 2.18333840E-1, 3.22341293E-1},
+{2.54243195E-1, 2.73986191E-1, 2.96262473E-1},
+{1.60385415E-1, 1.83762908E-1, 2.81598717E-1},
+{1.87832162E-1, 2.37420350E-1, 3.29777509E-1},
+{1.77788362E-1, 2.26703495E-1, 3.02322537E-1},
+{2.75108218E-1, 2.93730587E-1, 3.12373787E-1},
+{1.70116410E-1, 1.85232103E-1, 2.46125028E-1},
+{2.21754774E-1, 2.39912242E-1, 2.86891907E-1},
+{1.95083722E-1, 2.08337873E-1, 2.88349718E-1},
+{2.37536535E-1, 2.75004476E-1, 3.39786023E-1},
+{1.88369319E-1, 2.04371840E-1, 2.57375032E-1},
+{2.47250155E-1, 2.60551840E-1, 3.02137524E-1},
+{1.66944191E-1, 2.46912360E-1, 3.18894416E-1},
+{2.78118610E-1, 3.13011140E-1, 3.65329295E-1},
+{1.45213529E-1, 1.63051456E-1, 2.24912614E-1},
+{2.05692515E-1, 2.20831484E-1, 2.52817810E-1},
+{1.21125661E-1, 1.96374118E-1, 3.00122708E-1},
+{2.15566799E-1, 2.65657336E-1, 2.99202889E-1},
+{1.09134212E-1, 1.78472102E-1, 2.88323194E-1},
+{2.03508541E-1, 2.40347922E-1, 2.96309739E-1},
+{1.53101787E-1, 2.25415319E-1, 2.84843713E-1},
+{2.50233442E-1, 2.77736932E-1, 3.24840695E-1},
+{1.66308925E-1, 1.94173396E-1, 2.11635381E-1},
+{2.01289460E-1, 2.26062179E-1, 2.93246478E-1},
+{1.49518773E-1, 2.14201719E-1, 2.83894747E-1},
+{2.21836135E-1, 2.85231501E-1, 3.20082635E-1},
+{1.89573213E-1, 2.06577629E-1, 2.30332345E-1},
+{2.31247649E-1, 2.46864259E-1, 2.89846569E-1},
+{1.39116928E-1, 2.59189934E-1, 2.98019558E-1},
+{2.44512573E-1, 2.82671362E-1, 3.61258298E-1},
+{1.22530967E-1, 1.68514788E-1, 2.70879298E-1},
+{2.04372838E-1, 2.30398357E-1, 2.71792918E-1},
+{1.42643943E-1, 2.22405583E-1, 2.92057186E-1},
+{2.42643669E-1, 2.77429372E-1, 2.97135502E-1},
+{1.52048603E-1, 1.96921080E-1, 2.61013240E-1},
+{2.17875019E-1, 2.45840371E-1, 3.08138579E-1},
+{1.90109268E-1, 2.31099129E-1, 2.80178159E-1},
+{2.54314184E-1, 2.94079810E-1, 3.39649171E-1},
+{1.56698599E-1, 2.08597451E-1, 2.28010774E-1},
+{2.25088730E-1, 2.50014484E-1, 2.76250154E-1},
+{1.78219035E-1, 1.98228240E-1, 3.04198891E-1},
+{2.08567217E-1, 2.92395383E-1, 3.46786886E-1},
+{1.71052113E-1, 2.03438759E-1, 2.62644321E-1},
+{2.30275467E-1, 2.58817524E-1, 3.11986536E-1},
+{1.85333565E-1, 2.45760202E-1, 3.10553998E-1},
+{2.89413869E-1, 3.11095625E-1, 3.46476167E-1},
+{1.50332406E-1, 1.67538226E-1, 2.40182847E-1},
+{1.79971650E-1, 2.37168610E-1, 2.60899693E-1},
+{1.49866179E-1, 1.97890073E-1, 3.07916552E-1},
+{2.10799649E-1, 2.88180083E-1, 3.29747230E-1},
+{1.31711140E-1, 1.65906459E-1, 3.22898000E-1},
+{2.14832023E-1, 2.52822131E-1, 2.97547072E-1},
+{1.83760419E-1, 2.37523615E-1, 2.74610013E-1},
+{2.55575180E-1, 2.75439233E-1, 3.46021861E-1},
+{1.82662204E-1, 1.99470907E-1, 2.16051653E-1},
+{2.09240332E-1, 2.22406715E-1, 3.02382857E-1},
+{1.84088245E-1, 2.11327791E-1, 2.82538086E-1},
+{2.41171077E-1, 2.97036022E-1, 3.15979272E-1},
+{1.96804658E-1, 2.11815894E-1, 2.41647676E-1},
+{2.42761984E-1, 2.58586556E-1, 2.93204397E-1},
+{1.58905461E-1, 2.65077025E-1, 2.89881319E-1},
+{2.58060575E-1, 3.18903178E-1, 3.47846836E-1},
+{1.48766384E-1, 1.66853935E-1, 2.66827434E-1},
+{2.15942249E-1, 2.29938298E-1, 2.76041597E-1},
+{1.38410494E-1, 2.39283442E-1, 3.27972382E-1},
+{2.43765280E-1, 2.88408488E-1, 3.06048721E-1},
+{1.70157120E-1, 1.89986289E-1, 2.81219155E-1},
+{2.19117031E-1, 2.58005291E-1, 3.26571971E-1},
+{1.92163572E-1, 2.23614186E-1, 2.98683077E-1},
+{2.73545444E-1, 3.12078089E-1, 3.30766588E-1},
+{1.62452087E-1, 2.04930902E-1, 2.53337711E-1},
+{2.23855302E-1, 2.37671077E-1, 3.03202003E-1},
+{1.93955287E-1, 2.12335557E-1, 3.07566851E-1},
+{2.29912683E-1, 2.97581047E-1, 3.37499231E-1},
+{1.89335391E-1, 2.04148144E-1, 2.78609782E-1},
+{2.42303565E-1, 2.73163110E-1, 3.15361649E-1},
+{1.55009672E-1, 2.88095146E-1, 3.35996419E-1},
+{2.73716152E-1, 3.31215471E-1, 3.62539083E-1},
+{1.52389362E-1, 1.72619134E-1, 1.90585673E-1},
+{1.96988270E-1, 2.26309747E-1, 2.46197492E-1},
+{1.20555148E-1, 2.06369758E-1, 2.81199783E-1},
+{1.93709418E-1, 2.71900505E-1, 3.01332921E-1},
+{1.36701152E-1, 1.54093146E-1, 2.82258362E-1},
+{1.97299168E-1, 2.53656298E-1, 2.90315062E-1},
+{1.43463776E-1, 2.43872911E-1, 2.75533706E-1},
+{2.58477271E-1, 2.73279876E-1, 3.21119100E-1},
+{1.54406175E-1, 1.93793535E-1, 2.15884149E-1},
+{2.05979452E-1, 2.24277020E-1, 2.85732359E-1},
+{1.74535319E-1, 2.08482355E-1, 2.79668540E-1},
+{2.18844578E-1, 2.72486299E-1, 3.27095598E-1},
+{1.77609727E-1, 2.12990195E-1, 2.39119649E-1},
+{2.29163751E-1, 2.59165913E-1, 2.83514649E-1},
+{1.57353148E-1, 2.39961296E-1, 3.04263145E-1},
+{2.45613828E-1, 3.16824526E-1, 3.42909366E-1},
+{1.42953232E-1, 1.61905348E-1, 2.53710240E-1},
+{2.10192814E-1, 2.22847700E-1, 2.71103770E-1},
+{1.26843944E-1, 2.16709048E-1, 2.97734648E-1},
+{2.31000140E-1, 2.80109137E-1, 2.99707443E-1},
+{1.52980462E-1, 1.93996876E-1, 2.72895664E-1},
+{2.12860718E-1, 2.41545349E-1, 3.16518754E-1},
+{1.71154693E-1, 2.22469687E-1, 2.93786496E-1},
+{2.51988232E-1, 3.04254979E-1, 3.31269950E-1},
+{1.33188918E-1, 2.07924992E-1, 2.55362093E-1},
+{2.12044910E-1, 2.42189646E-1, 2.88903743E-1},
+{1.84612468E-1, 2.01143622E-1, 2.86360770E-1},
+{2.18286708E-1, 2.76752442E-1, 3.44581515E-1},
+{1.83562174E-1, 1.99478507E-1, 2.62156576E-1},
+{2.33130530E-1, 2.49596909E-1, 3.15842837E-1},
+{1.89898983E-1, 2.46874869E-1, 2.97132462E-1},
+{2.75022447E-1, 3.22490305E-1, 3.46977681E-1},
+{1.42305329E-1, 1.92689180E-1, 2.16155857E-1},
+{1.95676163E-1, 2.22268641E-1, 2.76587397E-1},
+{1.33241490E-1, 1.97791785E-1, 3.22897941E-1},
+{1.84865132E-1, 2.97106177E-1, 3.26105148E-1},
+{1.50203660E-1, 1.76781267E-1, 2.91536182E-1},
+{2.03144446E-1, 2.59616166E-1, 2.99156040E-1},
+{1.65488973E-1, 2.38342047E-1, 2.87493914E-1},
+{2.71071255E-1, 2.89544493E-1, 3.19521040E-1},
+{1.68598369E-1, 1.98825568E-1, 2.30347604E-1},
+{2.13811651E-1, 2.34471768E-1, 2.90959626E-1},
+{1.74605444E-1, 2.17256010E-1, 2.85688072E-1},
+{2.28503481E-1, 2.96190292E-1, 3.16534668E-1},
+{1.87172607E-1, 2.20547438E-1, 2.39688724E-1},
+{2.28884771E-1, 2.63583153E-1, 3.01329464E-1},
+{1.77897051E-1, 2.58131474E-1, 2.81487674E-1},
+{2.59513617E-1, 3.07204396E-1, 3.48793596E-1},
+{1.45224437E-1, 1.78715974E-1, 2.59186983E-1},
+{2.19062313E-1, 2.38223523E-1, 2.60461539E-1},
+{1.43650874E-1, 2.09760785E-1, 3.15830201E-1},
+{2.50127465E-1, 2.79182345E-1, 3.05153579E-1},
+{1.48986444E-1, 2.01226771E-1, 2.82543689E-1},
+{2.08387777E-1, 2.35603899E-1, 3.45363885E-1},
+{1.85830340E-1, 2.21607298E-1, 3.10773641E-1},
+{2.80904710E-1, 2.95469791E-1, 3.25499445E-1},
+{1.72967300E-1, 1.97078109E-1, 2.45801106E-1},
+{2.19495699E-1, 2.44767100E-1, 2.93587774E-1},
+{1.83909580E-1, 2.15004295E-1, 3.00334543E-1},
+{2.45338634E-1, 2.68595248E-1, 3.48330349E-1},
+{1.92957386E-1, 2.06625074E-1, 2.67336398E-1},
+{2.54845560E-1, 2.68642277E-1, 3.03547889E-1},
+{1.76853105E-1, 2.59330958E-1, 3.16200763E-1},
+{2.90929139E-1, 3.15634757E-1, 3.68723541E-1},
+{1.57116994E-1, 1.73552901E-1, 2.28736520E-1},
+{2.12509260E-1, 2.30501205E-1, 2.52217978E-1},
+{1.42521843E-1, 2.01979935E-1, 2.93012232E-1},
+{2.14919671E-1, 2.78065056E-1, 3.14176053E-1},
+{1.35947272E-1, 1.81055903E-1, 2.75475413E-1},
+{1.98416695E-1, 2.41673797E-1, 3.05173427E-1},
+{1.59517333E-1, 2.31580108E-1, 2.95412451E-1},
+{2.58203626E-1, 2.87348121E-1, 3.20351988E-1},
+{1.74840674E-1, 1.92883253E-1, 2.11250007E-1},
+{2.02168509E-1, 2.27025688E-1, 3.04884046E-1},
+{1.69532105E-1, 2.11826235E-1, 2.97355384E-1},
+{2.30033740E-1, 2.91504353E-1, 3.26589435E-1},
+{1.95046112E-1, 2.11709172E-1, 2.27705747E-1},
+{2.37926885E-1, 2.52411634E-1, 2.97752172E-1},
+{1.53762922E-1, 2.46541560E-1, 3.14768940E-1},
+{2.36075714E-1, 3.03568929E-1, 3.70624453E-1},
+{1.38660327E-1, 1.67949975E-1, 2.73515254E-1},
+{2.13806167E-1, 2.27267206E-1, 2.86276251E-1},
+{1.25080630E-1, 2.44098395E-1, 3.02548796E-1},
+{2.35714868E-1, 2.81208843E-1, 3.08903724E-1},
+{1.51691392E-1, 2.10877746E-1, 2.63812989E-1},
+{2.20730439E-1, 2.52777904E-1, 3.16413730E-1},
+{1.84924737E-1, 2.39424765E-1, 2.85120815E-1},
+{2.59548545E-1, 3.09809893E-1, 3.26423734E-1},
+{1.62930742E-1, 2.19900876E-1, 2.36148626E-1},
+{2.34194234E-1, 2.49944329E-1, 2.77549058E-1},
+{1.70870200E-1, 1.98291600E-1, 3.21412593E-1},
+{2.31566861E-1, 2.75015086E-1, 3.69710356E-1},
+{1.80002406E-1, 2.06701040E-1, 2.71204919E-1},
+{2.38075271E-1, 2.54006237E-1, 3.23827595E-1},
+{1.99148253E-1, 2.54273921E-1, 3.07479709E-1},
+{2.87428617E-1, 3.25045079E-1, 3.48634571E-1},
+{1.45285025E-1, 1.91359162E-1, 2.49691397E-1},
+{1.94659308E-1, 2.40821242E-1, 2.77302653E-1},
+{1.53150991E-1, 1.94375664E-1, 3.27550441E-1},
+{2.04085842E-1, 2.98595697E-1, 3.21480066E-1},
+{1.56009689E-1, 1.81012720E-1, 3.00931662E-1},
+{2.10962430E-1, 2.55770296E-1, 3.08086127E-1},
+{1.85444072E-1, 2.49021322E-1, 2.74029821E-1},
+{2.74493456E-1, 2.89441973E-1, 3.38794917E-1},
+{1.76941887E-1, 1.94476932E-1, 2.22077265E-1},
+{2.16377512E-1, 2.30735779E-1, 3.03689271E-1},
+{1.89683452E-1, 2.14660764E-1, 2.88445383E-1},
+{2.40827337E-1, 2.98141748E-1, 3.27378422E-1},
+{2.01787844E-1, 2.19441772E-1, 2.39327446E-1},
+{2.48812512E-1, 2.65865892E-1, 2.93382376E-1},
+{1.82027832E-1, 2.68279046E-1, 2.93991417E-1},
+{2.56498635E-1, 3.19984466E-1, 3.62663239E-1},
+{1.58799276E-1, 1.75433666E-1, 2.67389864E-1},
+{2.24259302E-1, 2.36668259E-1, 2.77639121E-1},
+{1.49203405E-1, 2.26585329E-1, 3.45255584E-1},
+{2.50655770E-1, 2.92264849E-1, 3.13574284E-1},
+{1.58096299E-1, 2.02193201E-1, 2.98711687E-1},
+{2.28820905E-1, 2.48557344E-1, 3.44726473E-1},
+{1.87972054E-1, 2.34109432E-1, 3.04235607E-1},
+{2.85657108E-1, 3.14878136E-1, 3.36931497E-1},
+{1.62680015E-1, 2.17820048E-1, 2.57436782E-1},
+{2.24049792E-1, 2.46739820E-1, 3.00795883E-1},
+{2.01354548E-1, 2.18286663E-1, 3.13036293E-1},
+{2.38028511E-1, 2.98103482E-1, 3.53503793E-1},
+{1.98829994E-1, 2.12877125E-1, 2.72980839E-1},
+{2.50616491E-1, 2.67659992E-1, 3.20611864E-1},
+{1.70901820E-1, 2.69330353E-1, 3.34428221E-1},
+{3.04988861E-1, 3.36196691E-1, 3.65235358E-1},
+{1.47624031E-1, 1.81272805E-1, 2.04707921E-1},
+{1.93751350E-1, 2.20973969E-1, 2.61775166E-1},
+{1.32089809E-1, 1.94851607E-1, 2.83547610E-1},
+{2.07739428E-1, 2.70596832E-1, 2.92264789E-1},
+{1.27733424E-1, 1.66896015E-1, 2.83891350E-1},
+{2.05309406E-1, 2.47807533E-1, 2.83632785E-1},
+{1.54211894E-1, 2.25014091E-1, 2.70082027E-1},
+{2.67574131E-1, 2.84426898E-1, 3.09334785E-1},
+{1.68846920E-1, 1.87004536E-1, 2.02433169E-1},
+{2.02441111E-1, 2.16733068E-1, 2.93079227E-1},
+{1.63621262E-1, 2.15616465E-1, 2.82792896E-1},
+{2.25509301E-1, 2.66283005E-1, 3.17886561E-1},
+{1.89110294E-1, 2.05609441E-1, 2.22113580E-1},
+{2.21240178E-1, 2.60288864E-1, 2.92541057E-1},
+{1.55563369E-1, 2.46850818E-1, 2.89648801E-1},
+{2.48406157E-1, 3.05291861E-1, 3.55316669E-1},
+{1.27122149E-1, 1.58053726E-1, 2.54164368E-1},
+{2.04998836E-1, 2.19476849E-1, 2.78342038E-1},
+{1.33302316E-1, 2.29614019E-1, 2.86947161E-1},
+{2.36777052E-1, 2.67918199E-1, 3.08230907E-1},
+{1.40853569E-1, 2.03414679E-1, 2.73257107E-1},
+{2.07684264E-1, 2.34520018E-1, 3.24583262E-1},
+{1.77181646E-1, 2.29595393E-1, 2.83539146E-1},
+{2.61378348E-1, 3.01160187E-1, 3.21707100E-1},
+{1.48595735E-1, 2.07772017E-1, 2.46946126E-1},
+{2.14334831E-1, 2.48061299E-1, 2.72259146E-1},
+{1.76380262E-1, 1.96897894E-1, 2.92286903E-1},
+{1.98193476E-1, 2.75483340E-1, 3.49037558E-1},
+{1.76153168E-1, 1.93248957E-1, 2.69548506E-1},
+{2.36968622E-1, 2.50065804E-1, 3.06820840E-1},
+{1.76060721E-1, 2.54037619E-1, 3.03566784E-1},
+{2.82952905E-1, 3.01765054E-1, 3.53956312E-1},
+{1.45353720E-1, 1.83678836E-1, 2.34750062E-1},
+{1.93842635E-1, 2.30635554E-1, 2.67817765E-1},
+{1.38958976E-1, 1.86760783E-1, 3.13113242E-1},
+{1.99944481E-1, 2.77624756E-1, 3.25046331E-1},
+{1.42966077E-1, 1.71310842E-1, 3.03013414E-1},
+{2.07741663E-1, 2.58691758E-1, 2.88766950E-1},
+{1.71776935E-1, 2.40246087E-1, 2.73284525E-1},
+{2.71046638E-1, 2.85170943E-1, 3.27401131E-1},
+{1.69854626E-1, 1.87545776E-1, 2.24484712E-1},
+{2.15221986E-1, 2.27339745E-1, 2.95008808E-1},
+{1.75596640E-1, 2.17936546E-1, 2.74879605E-1},
+{2.34665439E-1, 2.89530903E-1, 3.16494375E-1},
+{1.89946994E-1, 2.04953820E-1, 2.46955171E-1},
+{2.37297818E-1, 2.68316716E-1, 2.90684313E-1},
+{1.69963166E-1, 2.53367484E-1, 2.92533010E-1},
+{2.70659864E-1, 2.97146112E-1, 3.56183976E-1},
+{1.52539685E-1, 1.70138955E-1, 2.52703935E-1},
+{2.19119206E-1, 2.35900700E-1, 2.69739121E-1},
+{1.42245665E-1, 2.18184620E-1, 3.28218073E-1},
+{2.61472821E-1, 2.78025657E-1, 3.02375883E-1},
+{1.53526023E-1, 1.90727741E-1, 2.92820841E-1},
+{2.09240988E-1, 2.49808684E-1, 3.24709088E-1},
+{1.75176397E-1, 2.38646746E-1, 3.06392699E-1},
+{2.73218870E-1, 3.03954989E-1, 3.20513874E-1},
+{1.63911596E-1, 1.89611584E-1, 2.56272525E-1},
+{2.26953760E-1, 2.40120232E-1, 2.92728513E-1},
+{1.95565715E-1, 2.11956203E-1, 2.97374696E-1},
+{2.41045550E-1, 2.88497001E-1, 3.36352319E-1},
+{1.94948331E-1, 2.09475279E-1, 2.56309658E-1},
+{2.47884631E-1, 2.63356417E-1, 3.11270863E-1},
+{1.69189706E-1, 2.35864580E-1, 3.36249381E-1},
+{2.86001563E-1, 3.25423747E-1, 3.59607369E-1},
+{1.56258598E-1, 1.76704943E-1, 2.14393437E-1},
+{2.08996847E-1, 2.23968685E-1, 2.60886759E-1},
+{1.35765389E-1, 2.03580052E-1, 3.05503219E-1},
+{2.18961373E-1, 2.79463500E-1, 2.99450845E-1},
+{1.34064749E-1, 1.78332120E-1, 2.90169626E-1},
+{2.13298395E-1, 2.40031511E-1, 3.00345927E-1},
+{1.64373413E-1, 2.26438701E-1, 2.87171155E-1},
+{2.50739604E-1, 2.80812472E-1, 3.35349351E-1},
+{1.63649514E-1, 1.97108001E-1, 2.21165180E-1},
+{2.08139613E-1, 2.30869800E-1, 2.96137065E-1},
+{1.59113124E-1, 2.18189180E-1, 2.95531958E-1},
+{2.39883497E-1, 2.81831235E-1, 3.26045603E-1},
+{1.89394727E-1, 2.08127141E-1, 2.38446414E-1},
+{2.32995704E-1, 2.59603471E-1, 2.93427974E-1},
+{1.60558835E-1, 2.55164832E-1, 3.02872926E-1},
+{2.53509283E-1, 2.96028465E-1, 3.67721587E-1},
+{1.30124375E-1, 1.74838990E-1, 2.60486037E-1},
+{2.10203990E-1, 2.33570784E-1, 2.83061892E-1},
+{1.52365491E-1, 2.25338757E-1, 3.03720981E-1},
+{2.40558609E-1, 2.77192205E-1, 3.05891901E-1},
+{1.63728818E-1, 1.94779396E-1, 2.69253582E-1},
+{2.25709423E-1, 2.40902692E-1, 3.18060607E-1},
+{1.92055091E-1, 2.29857832E-1, 2.89826721E-1},
+{2.62759686E-1, 3.04292172E-1, 3.35680574E-1},
+{1.66071162E-1, 2.06819177E-1, 2.39712462E-1},
+{2.23915562E-1, 2.50106871E-1, 2.85296232E-1},
+{1.88402340E-1, 2.03793734E-1, 3.03041130E-1},
+{2.30698988E-1, 2.87044138E-1, 3.49802762E-1},
+{1.82025358E-1, 2.14073509E-1, 2.63470024E-1},
+{2.37297758E-1, 2.65025407E-1, 3.17815512E-1},
+{1.89278707E-1, 2.58802205E-1, 3.04866165E-1},
+{2.97243059E-1, 3.17153066E-1, 3.56583923E-1},
+{1.58607468E-1, 1.78659767E-1, 2.41919369E-1},
+{1.94887385E-1, 2.41695851E-1, 2.62176663E-1},
+{1.58124432E-1, 2.11753070E-1, 3.11352164E-1},
+{2.16902718E-1, 2.98796803E-1, 3.20994049E-1},
+{1.49272785E-1, 1.74964130E-1, 3.15334409E-1},
+{2.21622273E-1, 2.56179065E-1, 3.03902954E-1},
+{1.75979599E-1, 2.43505448E-1, 2.85801739E-1},
+{2.64590383E-1, 2.85541564E-1, 3.45107764E-1},
+{1.80137083E-1, 2.05279350E-1, 2.22255990E-1},
+{2.10796222E-1, 2.26315439E-1, 3.14426929E-1},
+{1.79151163E-1, 2.09439725E-1, 2.93280870E-1},
+{2.49719024E-1, 2.91257650E-1, 3.27162296E-1},
+{1.98700234E-1, 2.15896755E-1, 2.49960214E-1},
+{2.40726396E-1, 2.64857739E-1, 2.99639553E-1},
+{1.71249732E-1, 2.68166155E-1, 3.03572744E-1},
+{2.69555569E-1, 3.16100627E-1, 3.56570691E-1},
+{1.50564745E-1, 1.84190869E-1, 2.68674821E-1},
+{2.16941193E-1, 2.40813971E-1, 2.78942198E-1},
+{1.35399476E-1, 2.60586530E-1, 3.32604855E-1},
+{2.56150961E-1, 2.87822872E-1, 3.06156367E-1},
+{1.66398838E-1, 1.88721806E-1, 2.93023735E-1},
+{2.29214087E-1, 2.61565417E-1, 3.27494055E-1},
+{1.98266640E-1, 2.32970506E-1, 2.99134284E-1},
+{2.87046254E-1, 3.07103783E-1, 3.27298075E-1},
+{1.75898686E-1, 2.11898595E-1, 2.51332909E-1},
+{2.32067421E-1, 2.44622201E-1, 2.99443692E-1},
+{1.90780059E-1, 2.12090015E-1, 3.25059265E-1},
+{2.31531218E-1, 3.14166099E-1, 3.42735857E-1},
+{1.95099846E-1, 2.09554315E-1, 2.79483467E-1},
+{2.40416065E-1, 2.69604772E-1, 3.28015476E-1},
+{1.71800867E-1, 2.82233089E-1, 3.14749271E-1},
+{2.69243777E-1, 3.38462502E-1, 3.79935652E-1},
+{1.59934625E-1, 1.77966774E-1, 2.00818628E-1},
+{2.01979712E-1, 2.30668545E-1, 2.56773323E-1},
+{1.34024277E-1, 2.10961610E-1, 2.84687728E-1},
+{2.03712896E-1, 2.83053070E-1, 3.03309411E-1},
+{1.44528881E-1, 1.64728075E-1, 2.85079390E-1},
+{2.06285611E-1, 2.48649031E-1, 2.96383053E-1},
+{1.58138171E-1, 2.34317720E-1, 2.79650003E-1},
+{2.64995635E-1, 2.79900700E-1, 3.18619400E-1},
+{1.66537479E-1, 1.84279412E-1, 2.14547485E-1},
+{2.03051880E-1, 2.35110492E-1, 2.88755983E-1},
+{1.68422714E-1, 2.03946173E-1, 2.87478894E-1},
+{2.31727019E-1, 2.74086386E-1, 3.24755162E-1},
+{1.85356215E-1, 2.14113116E-1, 2.29030401E-1},
+{2.42482558E-1, 2.60655493E-1, 2.83030301E-1},
+{1.67562261E-1, 2.42027491E-1, 2.99461991E-1},
+{2.38809898E-1, 3.19003850E-1, 3.58415872E-1},
+{1.37908265E-1, 1.54787809E-1, 2.65611202E-1},
+{2.11019263E-1, 2.24607319E-1, 2.79954702E-1},
+{1.37569889E-1, 2.25128531E-1, 3.09312850E-1},
+{2.29239866E-1, 2.76150972E-1, 3.15241843E-1},
+{1.60487458E-1, 1.95461214E-1, 2.83169478E-1},
+{2.18505666E-1, 2.38197207E-1, 3.30340117E-1},
+{1.81991324E-1, 2.33026952E-1, 2.93276042E-1},
+{2.54552305E-1, 3.14394146E-1, 3.36392254E-1},
+{1.44095764E-1, 2.26640165E-1, 2.50595063E-1},
+{2.15188012E-1, 2.51417249E-1, 2.85043985E-1},
+{1.87674388E-1, 2.04458863E-1, 2.94168979E-1},
+{2.30494842E-1, 2.68452436E-1, 3.52370054E-1},
+{1.85022101E-1, 1.99075252E-1, 2.71930546E-1},
+{2.42569372E-1, 2.55389154E-1, 3.11399311E-1},
+{1.95166096E-1, 2.49102056E-1, 2.98998445E-1},
+{2.83654153E-1, 3.14600259E-1, 3.55619401E-1},
+{1.51490018E-1, 1.97729796E-1, 2.32467473E-1},
+{2.00029895E-1, 2.30101258E-1, 2.81933933E-1},
+{1.38711318E-1, 1.91816628E-1, 3.45780402E-1},
+{1.96580395E-1, 3.04714769E-1, 3.40553433E-1},
+{1.38154253E-1, 1.88543141E-1, 2.99461216E-1},
+{2.05666468E-1, 2.68904895E-1, 3.05537194E-1},
+{1.72447845E-1, 2.33558387E-1, 2.93625206E-1},
+{2.70145416E-1, 2.98654765E-1, 3.28556389E-1},
+{1.75489411E-1, 1.91361547E-1, 2.35585332E-1},
+{2.20548794E-1, 2.34773993E-1, 2.95397669E-1},
+{1.85652360E-1, 2.22349137E-1, 2.79883891E-1},
+{2.29456946E-1, 3.04546326E-1, 3.24684292E-1},
+{1.86900780E-1, 2.15469390E-1, 2.51856804E-1},
+{2.34910533E-1, 2.71217376E-1, 2.99894661E-1},
+{1.85142443E-1, 2.56071001E-1, 2.93291301E-1},
+{2.63883710E-1, 3.07127446E-1, 3.62546653E-1},
+{1.60997644E-1, 1.78937852E-1, 2.55808324E-1},
+{2.25671068E-1, 2.43735075E-1, 2.68624991E-1},
+{1.55076161E-1, 2.30396181E-1, 3.21005553E-1},
+{2.51760483E-1, 2.79653400E-1, 3.14202160E-1},
+{1.56988814E-1, 2.07466930E-1, 2.89933950E-1},
+{2.17479482E-1, 2.59626418E-1, 3.40659052E-1},
+{1.76811531E-1, 2.31087089E-1, 3.17562491E-1},
+{2.82952607E-1, 2.99844354E-1, 3.36822897E-1},
+{1.82060316E-1, 1.98734730E-1, 2.51980305E-1},
+{2.25874200E-1, 2.52469152E-1, 2.93356389E-1},
+{2.00799957E-1, 2.17786849E-1, 3.02210063E-1},
+{2.47423753E-1, 2.86882848E-1, 3.47820610E-1},
+{2.01128140E-1, 2.14746892E-1, 2.62269646E-1},
+{2.53963351E-1, 2.69477993E-1, 3.12133819E-1},
+{1.91034868E-1, 2.55738169E-1, 3.32559615E-1},
+{2.91053712E-1, 3.31458420E-1, 3.68588477E-1},
+{1.57229915E-1, 1.85374141E-1, 2.25361317E-1},
+{2.08051339E-1, 2.38350868E-1, 2.64212936E-1},
+{1.46848336E-1, 2.13000089E-1, 3.00192565E-1},
+{2.18630567E-1, 2.90263802E-1, 3.09045762E-1},
+{1.43699184E-1, 1.87815160E-1, 2.83769876E-1},
+{2.07328036E-1, 2.45088696E-1, 3.08956414E-1},
+{1.64228097E-1, 2.27826655E-1, 3.08907896E-1},
+{2.61919737E-1, 2.91333705E-1, 3.31527978E-1},
+{1.70648888E-1, 2.02157527E-1, 2.17827827E-1},
+{2.07796112E-1, 2.34704822E-1, 3.06783766E-1},
+{1.72118798E-1, 2.14057386E-1, 3.10151786E-1},
+{2.29116157E-1, 2.80949861E-1, 3.33774298E-1},
+{1.96622208E-1, 2.16653049E-1, 2.33279720E-1},
+{2.37789229E-1, 2.58971304E-1, 3.04609209E-1},
+{1.55182019E-1, 2.63032585E-1, 3.18943053E-1},
+{2.49388829E-1, 3.16970855E-1, 3.77762467E-1},
+{1.51363596E-1, 1.75010651E-1, 2.78245836E-1},
+{2.19810233E-1, 2.32360214E-1, 2.85034925E-1},
+{1.42630622E-1, 2.40602851E-1, 3.04125100E-1},
+{2.42764875E-1, 2.83762127E-1, 3.15481216E-1},
+{1.57467470E-1, 2.07524061E-1, 2.75674909E-1},
+{2.28758618E-1, 2.49092206E-1, 3.28139395E-1},
+{1.90872714E-1, 2.38125205E-1, 2.94894546E-1},
+{2.66389251E-1, 3.14321429E-1, 3.38669509E-1},
+{1.70644209E-1, 2.25980043E-1, 2.47372389E-1},
+{2.36442789E-1, 2.53003448E-1, 2.88220435E-1},
+{1.85423777E-1, 2.04888850E-1, 3.14608842E-1},
+{2.17379019E-1, 2.94553548E-1, 3.67831022E-1},
+{1.88563988E-1, 2.15174288E-1, 2.72999734E-1},
+{2.45102122E-1, 2.59770364E-1, 3.21885556E-1},
+{1.98444173E-1, 2.61160702E-1, 3.17097872E-1},
+{2.99013853E-1, 3.28965336E-1, 3.56681198E-1},
+{1.58248767E-1, 1.92205697E-1, 2.46059090E-1},
+{2.02385351E-1, 2.47965842E-1, 2.71749645E-1},
+{1.61710784E-1, 2.13708103E-1, 3.27384740E-1},
+{2.14419708E-1, 3.05552453E-1, 3.33721548E-1},
+{1.61819980E-1, 1.89897299E-1, 3.10501546E-1},
+{2.19436333E-1, 2.65029579E-1, 3.09288830E-1},
+{1.88303933E-1, 2.49633163E-1, 2.85499543E-1},
+{2.69325376E-1, 2.99807042E-1, 3.41722459E-1},
+{1.72406003E-1, 2.10977256E-1, 2.27773219E-1},
+{2.20281526E-1, 2.34015763E-1, 3.12846094E-1},
+{1.83257267E-1, 2.22061962E-1, 2.91052371E-1},
+{2.42531225E-1, 3.09527606E-1, 3.30389649E-1},
+{2.07546696E-1, 2.24662632E-1, 2.44420141E-1},
+{2.45858207E-1, 2.70285994E-1, 3.05132121E-1},
+{1.84840545E-1, 2.72096783E-1, 3.12531084E-1},
+{2.74252594E-1, 3.21252435E-1, 3.74658197E-1},
+{1.66425839E-1, 1.84491634E-1, 2.68278092E-1},
+{2.28423670E-1, 2.43025422E-1, 2.81184882E-1},
+{1.60091296E-1, 2.52953321E-1, 3.35822314E-1},
+{2.62109995E-1, 2.95581907E-1, 3.13354105E-1},
+{1.67702749E-1, 2.01536924E-1, 3.01801592E-1},
+{2.37822965E-1, 2.59894758E-1, 3.38231117E-1},
+{1.97206214E-1, 2.45490909E-1, 3.17895442E-1},
+{2.98455298E-1, 3.19209784E-1, 3.40971738E-1},
+{1.71195343E-1, 2.24327832E-1, 2.62736112E-1},
+{2.30626896E-1, 2.53310233E-1, 3.01206797E-1},
+{2.04814211E-1, 2.21881568E-1, 3.25966567E-1},
+{2.22987518E-1, 3.06339115E-1, 3.50717157E-1},
+{2.00855389E-1, 2.15359926E-1, 2.84143478E-1},
+{2.50951648E-1, 2.66189247E-1, 3.33360583E-1},
+{1.75610259E-1, 2.93791324E-1, 3.40326935E-1},
+{2.91745067E-1, 3.40602487E-1, 3.81397158E-1}};
+
+static const float evrc_lspq_full_codebook4[128][3] = {
+{2.77461529E-1, 3.16972077E-1, 3.95498335E-1},
+{3.36560428E-1, 3.60156953E-1, 3.81473005E-1},
+{3.10509324E-1, 3.31732392E-1, 3.66864383E-1},
+{3.37470949E-1, 3.96795273E-1, 4.12356317E-1},
+{2.79660404E-1, 3.66520107E-1, 3.85313451E-1},
+{3.16038966E-1, 3.85609329E-1, 4.01304781E-1},
+{3.09960425E-1, 3.43410730E-1, 4.24745500E-1},
+{3.54243636E-1, 4.08699274E-1, 4.22167957E-1},
+{2.95587242E-1, 3.33741128E-1, 3.87421668E-1},
+{3.33446383E-1, 3.86974752E-1, 4.01353061E-1},
+{3.23412836E-1, 3.65269661E-1, 3.85193288E-1},
+{3.42731953E-1, 4.03192520E-1, 4.19920385E-1},
+{2.77681828E-1, 3.82494986E-1, 4.04274166E-1},
+{3.18247974E-1, 3.95985305E-1, 4.31353152E-1},
+{3.03711414E-1, 3.80319715E-1, 4.37173545E-1},
+{3.78288805E-1, 4.07077312E-1, 4.22679126E-1},
+{2.38116503E-1, 3.42454314E-1, 4.24624741E-1},
+{3.45615685E-1, 3.68681073E-1, 4.00817335E-1},
+{3.17688107E-1, 3.41902673E-1, 4.05601799E-1},
+{3.66368949E-1, 3.89039934E-1, 4.06154454E-1},
+{2.99398005E-1, 3.52021694E-1, 3.99955690E-1},
+{3.24991941E-1, 3.90028834E-1, 4.19478714E-1},
+{3.23025763E-1, 3.68114293E-1, 4.02087748E-1},
+{3.62326264E-1, 4.16927993E-1, 4.32773650E-1},
+{2.72696435E-1, 3.59205008E-1, 4.26880658E-1},
+{3.46539855E-1, 3.69616628E-1, 4.15621221E-1},
+{3.34109128E-1, 3.55736315E-1, 3.96749556E-1},
+{3.37468982E-1, 4.10392702E-1, 4.25986826E-1},
+{2.99468994E-1, 3.80648255E-1, 4.18284118E-1},
+{3.21378171E-1, 4.11198020E-1, 4.28792536E-1},
+{3.27841163E-1, 3.69345129E-1, 4.34395611E-1},
+{3.80669057E-1, 4.26086366E-1, 4.42754567E-1},
+{2.68943667E-1, 3.42942953E-1, 3.98681462E-1},
+{3.38102877E-1, 3.76338840E-1, 3.92043173E-1},
+{3.23593497E-1, 3.48742068E-1, 3.72551978E-1},
+{3.47550809E-1, 3.92885387E-1, 4.21169937E-1},
+{3.04182827E-1, 3.59816670E-1, 3.81633341E-1},
+{3.14221382E-1, 4.02108550E-1, 4.20085251E-1},
+{3.01306546E-1, 3.62662733E-1, 4.29262817E-1},
+{3.71770263E-1, 3.98696363E-1, 4.31438982E-1},
+{2.74591267E-1, 3.35595489E-1, 4.20079648E-1},
+{3.44540834E-1, 3.90451789E-1, 4.06412065E-1},
+{3.25239837E-1, 3.78344476E-1, 3.94673288E-1},
+{3.56683493E-1, 3.90574157E-1, 4.33851063E-1},
+{2.63501287E-1, 3.95260096E-1, 4.23116386E-1},
+{3.37520659E-1, 3.92563462E-1, 4.43415821E-1},
+{3.14522266E-1, 3.80968630E-1, 4.22676384E-1},
+{3.76235068E-1, 4.17298734E-1, 4.31451261E-1},
+{2.61855006E-1, 3.68646085E-1, 4.04260576E-1},
+{3.55580151E-1, 3.77994478E-1, 3.95868242E-1},
+{3.27742815E-1, 3.53872776E-1, 4.11040604E-1},
+{3.62960637E-1, 3.99466991E-1, 4.14690197E-1},
+{3.09410870E-1, 3.73796046E-1, 3.92672479E-1},
+{3.31016302E-1, 4.00801599E-1, 4.31759298E-1},
+{3.23573053E-1, 3.68619561E-1, 4.17455137E-1},
+{3.49115849E-1, 4.26840067E-1, 4.43913996E-1},
+{2.89738595E-1, 3.63759339E-1, 4.10511792E-1},
+{3.55286479E-1, 3.89331281E-1, 4.13432419E-1},
+{3.36565912E-1, 3.60222459E-1, 4.24179018E-1},
+{3.39932680E-1, 4.09228802E-1, 4.40184891E-1},
+{3.00889730E-1, 4.00081098E-1, 4.17955697E-1},
+{3.17052066E-1, 4.22288120E-1, 4.42229569E-1},
+{3.27336788E-1, 3.84311676E-1, 4.30288613E-1},
+{3.98990929E-1, 4.29498434E-1, 4.43475187E-1},
+{2.49110118E-1, 3.25696886E-1, 4.11728263E-1},
+{3.45929205E-1, 3.68577540E-1, 3.88473272E-1},
+{3.13219666E-1, 3.39229465E-1, 3.87597919E-1},
+{3.51453960E-1, 3.98730278E-1, 4.12656188E-1},
+{2.93487132E-1, 3.75763118E-1, 3.94488096E-1},
+{3.24470758E-1, 3.94202888E-1, 4.08882737E-1},
+{3.12710822E-1, 3.57720256E-1, 4.14061189E-1},
+{3.66507173E-1, 4.08171296E-1, 4.23891425E-1},
+{2.99965680E-1, 3.31993401E-1, 4.07860160E-1},
+{3.34925175E-1, 3.86143029E-1, 4.11538124E-1},
+{3.34788024E-1, 3.66196156E-1, 3.93347144E-1},
+{3.47847939E-1, 4.05926466E-1, 4.30507302E-1},
+{2.85952926E-1, 3.95283282E-1, 4.16119337E-1},
+{3.23867381E-1, 4.06476676E-1, 4.42482829E-1},
+{3.16716671E-1, 3.84451628E-1, 4.39411044E-1},
+{3.86772931E-1, 4.11824584E-1, 4.27831531E-1},
+{2.38072395E-1, 3.62342358E-1, 4.30931687E-1},
+{3.46450031E-1, 3.79082918E-1, 4.06567812E-1},
+{3.16576600E-1, 3.56468618E-1, 3.96218300E-1},
+{3.66539180E-1, 3.89590919E-1, 4.21055555E-1},
+{3.08291376E-1, 3.71324301E-1, 4.07867432E-1},
+{3.36435199E-1, 3.91514421E-1, 4.22977090E-1},
+{3.23035538E-1, 3.80447328E-1, 4.09550190E-1},
+{3.65228057E-1, 4.27910388E-1, 4.43691254E-1},
+{2.72038043E-1, 3.76596808E-1, 4.33685899E-1},
+{3.57665777E-1, 3.77761602E-1, 4.09178972E-1},
+{3.36498559E-1, 3.64215910E-1, 4.09255505E-1},
+{3.48082423E-1, 4.17631805E-1, 4.33284521E-1},
+{3.02754521E-1, 3.95974755E-1, 4.33717251E-1},
+{3.31676304E-1, 4.17587161E-1, 4.36239839E-1},
+{3.33287597E-1, 3.80799115E-1, 4.39620733E-1},
+{3.88112009E-1, 4.36933577E-1, 4.50829268E-1},
+{2.56026626E-1, 3.48015189E-1, 4.22922611E-1},
+{3.45773995E-1, 3.81725788E-1, 3.96794081E-1},
+{3.25623751E-1, 3.50391924E-1, 3.87330651E-1},
+{3.56868088E-1, 3.98574769E-1, 4.23177242E-1},
+{3.01226199E-1, 3.86906981E-1, 4.03335571E-1},
+{3.28178406E-1, 4.02090192E-1, 4.19389248E-1},
+{3.14385355E-1, 3.69043887E-1, 4.34375286E-1},
+{3.72321129E-1, 4.11672413E-1, 4.40518737E-1},
+{2.90479720E-1, 3.48121881E-1, 4.26216483E-1},
+{3.44438791E-1, 3.82666349E-1, 4.17321086E-1},
+{3.34866822E-1, 3.76235664E-1, 4.04475212E-1},
+{3.59025359E-1, 4.04721916E-1, 4.34838414E-1},
+{2.79127955E-1, 4.11106586E-1, 4.35360551E-1},
+{3.48125517E-1, 3.98732066E-1, 4.46927428E-1},
+{3.27018857E-1, 3.90107334E-1, 4.41707492E-1},
+{3.90858352E-1, 4.19813931E-1, 4.35153484E-1},
+{2.55319297E-1, 3.70405972E-1, 4.32188630E-1},
+{3.54651988E-1, 3.88332665E-1, 4.02956128E-1},
+{3.21608186E-1, 3.54489803E-1, 4.28299785E-1},
+{3.75163496E-1, 3.98833990E-1, 4.14177418E-1},
+{3.11953604E-1, 3.91430676E-1, 4.12552476E-1},
+{3.42528820E-1, 3.96365345E-1, 4.32497382E-1},
+{3.33744347E-1, 3.76422405E-1, 4.20536995E-1},
+{3.53529096E-1, 4.29231048E-1, 4.59699273E-1},
+{2.88017929E-1, 3.77999961E-1, 4.34011698E-1},
+{3.55683446E-1, 3.80780041E-1, 4.23145533E-1},
+{3.44358265E-1, 3.72184873E-1, 4.31265354E-1},
+{3.53966117E-1, 4.14166689E-1, 4.42941308E-1},
+{3.04770231E-1, 4.12517488E-1, 4.34183121E-1},
+{3.35913360E-1, 4.24590766E-1, 4.46378469E-1},
+{3.43738198E-1, 3.84766221E-1, 4.35271382E-1},
+{4.10941303E-1, 4.40662980E-1, 4.52113390E-1}};
+
+static const float evrc_lspq_half_codebook1[128][3] = {
+{1.35226343E-2, 1.82081293E-2, 3.93940695E-2},
+{2.29392890E-2, 3.57831158E-2, 1.05352886E-1},
+{2.09106486E-2, 3.04159056E-2, 8.93941075E-2},
+{1.88909005E-2, 3.82722206E-2, 1.37820408E-1},
+{2.05143820E-2, 2.85481159E-2, 7.39762187E-2},
+{4.69510332E-2, 6.84031919E-2, 1.09123811E-1},
+{3.15557197E-2, 5.69139980E-2, 8.57057571E-2},
+{3.81181911E-2, 7.77784660E-2, 1.92532852E-1},
+{2.16297153E-2, 2.92908940E-2, 6.25042021E-2},
+{3.11414022E-2, 5.99079318E-2, 1.02860682E-1},
+{3.02799307E-2, 5.35012372E-2, 7.80925751E-2},
+{6.50846213E-2, 9.06624720E-2, 1.42850950E-1},
+{3.27340364E-2, 5.04027791E-2, 6.26492277E-2},
+{5.27439862E-2, 6.22574277E-2, 1.22198336E-1},
+{3.48840356E-2, 6.42222390E-2, 9.16024595E-2},
+{4.88984436E-2, 1.05058022E-1, 1.68813452E-1},
+{2.35791076E-2, 3.21034677E-2, 5.60899563E-2},
+{2.77252812E-2, 4.87281792E-2, 1.01224191E-1},
+{2.74348017E-2, 4.04965915E-2, 9.34926122E-2},
+{4.38360050E-2, 6.03261292E-2, 1.52400866E-1},
+{2.68994924E-2, 4.52906378E-2, 6.49800375E-2},
+{5.16058952E-2, 6.08312152E-2, 1.08799636E-1},
+{4.20064926E-2, 6.11845106E-2, 8.54474008E-2},
+{7.13502690E-2, 1.01972111E-1, 1.74640998E-1},
+{2.88906675E-2, 4.13964354E-2, 5.25928028E-2},
+{3.16364467E-2, 6.63532093E-2, 1.24950245E-1},
+{4.30289507E-2, 5.14023267E-2, 7.96877742E-2},
+{5.70970774E-2, 1.08444504E-1, 1.44075617E-1},
+{3.38840261E-2, 5.04746847E-2, 7.29765445E-2},
+{6.54265657E-2, 7.90987685E-2, 1.15570590E-1},
+{3.85423526E-2, 7.33125433E-2, 1.02307513E-1},
+{6.57824501E-2, 1.02909811E-1, 2.11874440E-1},
+{1.54727865E-2, 2.04559695E-2, 5.46121262E-2},
+{2.27950197E-2, 3.90954204E-2, 1.19443826E-1},
+{3.06889173E-2, 4.54540215E-2, 8.20418894E-2},
+{2.25957241E-2, 4.79101725E-2, 1.71844408E-1},
+{2.71088015E-2, 4.01739590E-2, 7.01922849E-2},
+{4.95789349E-2, 7.92963281E-2, 1.04862511E-1},
+{3.06095853E-2, 5.64059429E-2, 9.49584097E-2},
+{6.34224564E-2, 9.11655501E-2, 1.84724405E-1},
+{2.43342388E-2, 3.91998328E-2, 6.31406233E-2},
+{3.38011980E-2, 6.60846457E-2, 1.11031540E-1},
+{3.51784080E-2, 5.79397269E-2, 7.20702857E-2},
+{6.49054050E-2, 8.65831897E-2, 1.54648736E-1},
+{2.91934665E-2, 5.16204573E-2, 6.94437325E-2},
+{5.94522804E-2, 7.19829276E-2, 1.27434507E-1},
+{5.31888530E-2, 6.38182089E-2, 9.88218486E-2},
+{8.68290961E-2, 1.41135350E-1, 1.91728458E-1},
+{2.49991138E-2, 3.62556018E-2, 5.03724031E-2},
+{2.82246377E-2, 5.44572286E-2, 1.12663500E-1},
+{3.62618119E-2, 4.59073223E-2, 9.43343639E-2},
+{5.70455343E-2, 7.46300444E-2, 1.59157172E-1},
+{2.72987466E-2, 4.56625856E-2, 7.52529651E-2},
+{5.12860194E-2, 8.51126984E-2, 1.23587973E-1},
+{4.91451994E-2, 5.93483113E-2, 9.22686011E-2},
+{7.06961900E-2, 1.05451979E-1, 1.92602143E-1},
+{2.80733760E-2, 4.18509208E-2, 5.87159805E-2},
+{4.64449003E-2, 7.06698820E-2, 1.26038432E-1},
+{4.18453738E-2, 6.30445331E-2, 7.66169876E-2},
+{8.42416435E-2, 1.13282882E-1, 1.43687114E-1},
+{4.17615622E-2, 5.59472926E-2, 7.09872842E-2},
+{5.55161387E-2, 9.50126722E-2, 1.27727196E-1},
+{5.90935498E-2, 7.36730024E-2, 9.65935886E-2},
+{7.84136653E-2, 1.41432360E-1, 2.17428640E-1},
+{2.10490543E-2, 2.91891042E-2, 4.60035764E-2},
+{3.64863276E-2, 4.62387018E-2, 1.07044168E-1},
+{2.68652122E-2, 3.92937548E-2, 8.41179937E-2},
+{2.72903945E-2, 5.53805046E-2, 1.41586170E-1},
+{2.48476695E-2, 3.63277681E-2, 7.62430876E-2},
+{5.25430813E-2, 7.75778666E-2, 1.14567965E-1},
+{4.07741442E-2, 5.39923795E-2, 9.07640457E-2},
+{5.73043302E-2, 7.65803084E-2, 1.79578975E-1},
+{2.46032421E-2, 3.41408364E-2, 6.78990781E-2},
+{4.08220068E-2, 6.29783794E-2, 9.95191261E-2},
+{3.83025035E-2, 5.52857481E-2, 7.90019333E-2},
+{7.24111274E-2, 1.01903863E-1, 1.46979645E-1},
+{3.73902172E-2, 4.70463894E-2, 6.54684529E-2},
+{5.27397543E-2, 6.72770366E-2, 1.39680430E-1},
+{4.05365378E-2, 7.05081299E-2, 9.25668627E-2},
+{4.43425253E-2, 1.10367171E-1, 1.99636266E-1},
+{2.54920740E-2, 3.47603969E-2, 6.05902039E-2},
+{4.35465500E-2, 5.32369502E-2, 1.08325966E-1},
+{2.79599819E-2, 4.91324775E-2, 8.84284526E-2},
+{4.98051867E-2, 8.81728902E-2, 1.52597323E-1},
+{3.19346264E-2, 4.62169312E-2, 6.85206428E-2},
+{5.80246300E-2, 6.84268698E-2, 1.15085281E-1},
+{4.33904678E-2, 6.90575615E-2, 8.44984353E-2},
+{7.39691556E-2, 1.19240515E-1, 1.77340195E-1},
+{3.18767503E-2, 4.59697433E-2, 5.72372638E-2},
+{4.50873822E-2, 5.66509366E-2, 1.32005826E-1},
+{4.59097028E-2, 5.45580424E-2, 8.61423314E-2},
+{7.44685754E-2, 1.13815404E-1, 1.61570594E-1},
+{3.97509560E-2, 4.95359488E-2, 7.22542256E-2},
+{6.76257759E-2, 8.31029043E-2, 1.27990112E-1},
+{5.76258078E-2, 6.95326403E-2, 1.05012968E-1},
+{6.85313493E-2, 1.21758826E-1, 2.20626548E-1},
+{2.18480472E-2, 2.99130920E-2, 5.16208000E-2},
+{3.64343151E-2, 4.91795056E-2, 1.23277210E-1},
+{3.89611274E-2, 4.76634987E-2, 8.61716568E-2},
+{4.14635167E-2, 6.88006952E-2, 1.69356152E-1},
+{3.35514620E-2, 4.17815186E-2, 7.37159401E-2},
+{5.80224693E-2, 8.70314166E-2, 1.12917498E-1},
+{4.80243117E-2, 5.69486506E-2, 1.00755706E-1},
+{5.98873124E-2, 8.57942328E-2, 2.01388851E-1},
+{2.99309995E-2, 3.94828431E-2, 6.46376088E-2},
+{3.88626605E-2, 8.07443634E-2, 1.15519784E-1},
+{3.49444002E-2, 6.28911033E-2, 8.04982036E-2},
+{6.88817874E-2, 9.92431119E-2, 1.60393253E-1},
+{3.64237651E-2, 5.34016453E-2, 6.70152009E-2},
+{5.83492741E-2, 7.85285756E-2, 1.41746715E-1},
+{4.86469641E-2, 7.26736858E-2, 9.48315859E-2},
+{5.85533604E-2, 1.36289746E-1, 1.98639736E-1},
+{2.60888506E-2, 3.73406820E-2, 5.57853170E-2},
+{4.58504409E-2, 5.60512505E-2, 1.17927872E-1},
+{4.28801328E-2, 5.14739119E-2, 9.75309014E-2},
+{6.37611598E-2, 8.73552933E-2, 1.68334916E-1},
+{3.76709923E-2, 4.58216034E-2, 7.86528140E-2},
+{6.75194561E-2, 8.98697898E-2, 1.19418114E-1},
+{5.46374246E-2, 6.66805878E-2, 8.93813819E-2},
+{7.73086548E-2, 1.21754415E-1, 1.99579224E-1},
+{3.15621309E-2, 4.51702215E-2, 6.25768527E-2},
+{3.78782675E-2, 8.03486481E-2, 1.38961688E-1},
+{5.08303270E-2, 6.18740581E-2, 8.31153840E-2},
+{8.96311402E-2, 1.28753766E-1, 1.64891586E-1},
+{4.73503470E-2, 5.75724356E-2, 7.65264630E-2},
+{7.16898590E-2, 9.89895687E-2, 1.30078360E-1},
+{6.29082546E-2, 7.90778771E-2, 1.05111063E-1},
+{8.80649835E-2, 1.65206164E-1, 2.13214174E-1}};
+
+static const float evrc_lspq_half_codebook2[128][3] = {
+{9.75915268E-2, 1.23701490E-1, 1.69437975E-1},
+{9.49536338E-2, 2.01081768E-1, 2.26855248E-1},
+{9.00496617E-2, 1.49164870E-1, 2.26532787E-1},
+{1.70302704E-1, 1.97222874E-1, 2.49974832E-1},
+{1.08773641E-1, 1.51972428E-1, 1.75123364E-1},
+{1.30278930E-1, 2.13229164E-1, 2.29646355E-1},
+{1.24917991E-1, 1.87347755E-1, 2.04712003E-1},
+{2.00670198E-1, 2.28963569E-1, 2.69420803E-1},
+{8.98375586E-2, 1.25332758E-1, 2.10539430E-1},
+{9.62376669E-2, 2.07185850E-1, 2.54174471E-1},
+{1.05694629E-1, 1.78856418E-1, 2.00121015E-1},
+{1.56048968E-1, 2.19573721E-1, 2.91079402E-1},
+{1.37392268E-1, 1.59993336E-1, 1.94698542E-1},
+{1.07262500E-1, 2.37790957E-1, 2.70740807E-1},
+{1.42976448E-1, 2.01550499E-1, 2.18468934E-1},
+{2.14270487E-1, 2.71881402E-1, 3.01200211E-1},
+{1.10729210E-1, 1.33688226E-1, 1.54877156E-1},
+{1.06667660E-1, 1.76678821E-1, 2.62798905E-1},
+{9.16352943E-2, 1.74592838E-1, 2.19329327E-1},
+{1.84038624E-1, 2.27964059E-1, 2.47762203E-1},
+{1.10572360E-1, 1.58207163E-1, 1.96013063E-1},
+{1.33543387E-1, 2.32269660E-1, 2.51828164E-1},
+{1.55922309E-1, 1.77941337E-1, 2.18096644E-1},
+{1.92260072E-1, 2.49512479E-1, 2.89911509E-1},
+{1.13708906E-1, 1.37872443E-1, 2.02929884E-1},
+{1.02557532E-1, 1.84820071E-1, 2.92164624E-1},
+{1.36595622E-1, 1.58687428E-1, 2.41399556E-1},
+{1.72813818E-1, 2.49303415E-1, 3.00458610E-1},
+{1.36871174E-1, 1.57249823E-1, 2.10913152E-1},
+{1.28974810E-1, 2.45167866E-1, 2.67653584E-1},
+{1.66812256E-1, 1.88998029E-1, 2.31345922E-1},
+{2.32248470E-1, 2.63196051E-1, 3.16754937E-1},
+{9.24560949E-2, 1.19977452E-1, 1.91262275E-1},
+{1.13085262E-1, 2.08461538E-1, 2.29368120E-1},
+{1.00716405E-1, 1.40670076E-1, 2.58062959E-1},
+{1.67010382E-1, 2.18105540E-1, 2.62592494E-1},
+{1.25487238E-1, 1.62686959E-1, 1.84409231E-1},
+{1.52406558E-1, 2.07131729E-1, 2.47582436E-1},
+{1.37441203E-1, 1.80262372E-1, 2.17698842E-1},
+{2.07853511E-1, 2.49209508E-1, 2.69830108E-1},
+{9.35257301E-2, 1.49197355E-1, 2.04652041E-1},
+{1.11997180E-1, 2.25233063E-1, 2.47003049E-1},
+{1.09315015E-1, 1.93811879E-1, 2.13802189E-1},
+{1.75118580E-1, 2.52520263E-1, 2.75082767E-1},
+{1.36918738E-1, 1.77440569E-1, 1.97931141E-1},
+{1.36811242E-1, 2.37426177E-1, 2.84737825E-1},
+{1.60759792E-1, 2.00833157E-1, 2.18084484E-1},
+{2.33710244E-1, 2.66372561E-1, 2.91802049E-1},
+{1.19171090E-1, 1.39703169E-1, 1.87723249E-1},
+{1.31049946E-1, 1.93696663E-1, 2.60426998E-1},
+{1.08267047E-1, 1.65194795E-1, 2.39523023E-1},
+{2.03195021E-1, 2.25942209E-1, 2.49403238E-1},
+{1.23842932E-1, 1.45794615E-1, 2.15635628E-1},
+{1.71226338E-1, 2.38054529E-1, 2.57975638E-1},
+{1.66923836E-1, 1.88604668E-1, 2.11124212E-1},
+{2.10620746E-1, 2.62442708E-1, 2.83127964E-1},
+{1.05748810E-1, 1.36286482E-1, 2.20050186E-1},
+{9.72945765E-2, 2.33471528E-1, 2.96113968E-1},
+{1.34298369E-1, 1.93955436E-1, 2.39148825E-1},
+{1.64229318E-1, 2.70067751E-1, 2.94142485E-1},
+{1.42760262E-1, 1.65033355E-1, 2.24100381E-1},
+{1.46414533E-1, 2.47942328E-1, 3.00708115E-1},
+{1.74778774E-1, 2.19349250E-1, 2.38162965E-1},
+{2.36311123E-1, 2.90669680E-1, 3.28010976E-1},
+{1.14076428E-1, 1.33071408E-1, 1.73181504E-1},
+{1.13575839E-1, 1.90307274E-1, 2.41681188E-1},
+{8.59165266E-2, 1.63920239E-1, 2.37934500E-1},
+{1.92916945E-1, 2.15082392E-1, 2.39128128E-1},
+{1.37291834E-1, 1.59423307E-1, 1.79722220E-1},
+{1.40435383E-1, 2.22092256E-1, 2.40960747E-1},
+{1.40387163E-1, 1.89601168E-1, 2.05635697E-1},
+{2.11695507E-1, 2.36578360E-1, 2.81248927E-1},
+{9.03010592E-2, 1.27157405E-1, 2.33567923E-1},
+{1.10118054E-1, 2.09328398E-1, 2.72836268E-1},
+{1.16710417E-1, 1.77853987E-1, 2.22808748E-1},
+{1.81691542E-1, 2.32265159E-1, 2.74991214E-1},
+{1.46553472E-1, 1.69474706E-1, 1.90245956E-1},
+{1.09213792E-1, 2.63291955E-1, 2.88490772E-1},
+{1.49815127E-1, 2.11342707E-1, 2.28899449E-1},
+{1.97645500E-1, 2.83229947E-1, 3.14882278E-1},
+{1.24495603E-1, 1.46097973E-1, 1.66125208E-1},
+{1.34878591E-1, 1.83030054E-1, 2.89288282E-1},
+{9.33032110E-2, 1.83962211E-1, 2.38543004E-1},
+{1.92844257E-1, 2.39588335E-1, 2.58421540E-1},
+{1.23796798E-1, 1.65556595E-1, 2.08408386E-1},
+{1.51144341E-1, 2.35801116E-1, 2.59280622E-1},
+{1.50657728E-1, 1.90052524E-1, 2.28362590E-1},
+{1.98180959E-1, 2.56794214E-1, 3.08975637E-1},
+{1.28490031E-1, 1.49084017E-1, 1.98376507E-1},
+{9.20595750E-2, 2.12231293E-1, 2.92948842E-1},
+{1.41698137E-1, 1.72356680E-1, 2.58454144E-1},
+{1.96733460E-1, 2.29709730E-1, 2.95780182E-1},
+{1.47062227E-1, 1.68918088E-1, 2.07363635E-1},
+{1.36309877E-1, 2.60373056E-1, 2.82607377E-1},
+{1.81041077E-1, 2.01826140E-1, 2.38867551E-1},
+{2.45326266E-1, 2.80183077E-1, 3.11954319E-1},
+{1.04131766E-1, 1.33040652E-1, 1.89834684E-1},
+{1.23298146E-1, 2.09621087E-1, 2.47813210E-1},
+{1.24040775E-1, 1.59827366E-1, 2.58856058E-1},
+{1.87048867E-1, 2.12488100E-1, 2.59629130E-1},
+{1.24255307E-1, 1.73768952E-1, 1.92850024E-1},
+{1.58917829E-1, 2.25389823E-1, 2.43284762E-1},
+{1.53421149E-1, 1.91807315E-1, 2.09249526E-1},
+{2.27154449E-1, 2.51181155E-1, 2.72600353E-1},
+{1.09922059E-1, 1.57100275E-1, 2.20024973E-1},
+{1.32782355E-1, 2.19485506E-1, 2.67028928E-1},
+{1.26857504E-1, 1.98836312E-1, 2.17928499E-1},
+{1.91415027E-1, 2.52424240E-1, 2.72652745E-1},
+{1.55277625E-1, 1.79573521E-1, 2.00773627E-1},
+{1.17547743E-1, 2.47869864E-1, 3.08279335E-1},
+{1.65706977E-1, 2.10339502E-1, 2.29199320E-1},
+{2.25694910E-1, 2.84438193E-1, 3.12106073E-1},
+{1.29503176E-1, 1.48420051E-1, 1.80180401E-1},
+{1.54752508E-1, 1.97748467E-1, 2.67275035E-1},
+{1.28590241E-1, 1.76178381E-1, 2.39905864E-1},
+{2.14926764E-1, 2.37634435E-1, 2.58794010E-1},
+{1.28322318E-1, 1.59338519E-1, 2.26626605E-1},
+{1.55747548E-1, 2.47740522E-1, 2.73726821E-1},
+{1.75741687E-1, 1.97952345E-1, 2.19115943E-1},
+{2.18626365E-1, 2.45809183E-1, 3.00479650E-1},
+{1.17709018E-1, 1.45512864E-1, 2.38044471E-1},
+{1.18006893E-1, 2.23775521E-1, 2.94175088E-1},
+{1.51349202E-1, 1.88157812E-1, 2.48743281E-1},
+{1.89312205E-1, 2.69580543E-1, 2.93785989E-1},
+{1.49895594E-1, 1.74537256E-1, 2.37430006E-1},
+{1.39775530E-1, 2.71709383E-1, 3.07839513E-1},
+{1.83945730E-1, 2.07717165E-1, 2.26722151E-1},
+{2.54552156E-1, 2.96640933E-1, 3.24801445E-1}};
+
+static const float evrc_lspq_half_codebook3[256][4] = {
+{2.36904725E-1, 2.56104350E-1, 3.16955745E-1, 4.07520533E-1},
+{2.97596931E-1, 3.23482454E-1, 3.47667515E-1, 3.74551237E-1},
+{2.73721159E-1, 2.98297524E-1, 3.29923928E-1, 3.83599102E-1},
+{3.07849586E-1, 3.32836270E-1, 3.89340341E-1, 4.05575991E-1},
+{2.33803615E-1, 2.60296524E-1, 3.67351949E-1, 4.04388249E-1},
+{2.97513664E-1, 3.15356553E-1, 3.85135233E-1, 4.02197123E-1},
+{2.85618782E-1, 3.10872793E-1, 3.65022361E-1, 3.84816766E-1},
+{3.35271597E-1, 3.55222225E-1, 3.81921113E-1, 3.98685753E-1},
+{2.00265601E-1, 2.50502288E-1, 3.70398223E-1, 4.32012677E-1},
+{3.07982087E-1, 3.33767712E-1, 3.58199060E-1, 3.78386796E-1},
+{2.60086119E-1, 3.25520277E-1, 3.56873333E-1, 3.84737790E-1},
+{3.01356375E-1, 3.41369390E-1, 4.00296748E-1, 4.17337179E-1},
+{2.67080963E-1, 2.97674358E-1, 3.69702041E-1, 3.89139235E-1},
+{2.72669852E-1, 3.49704087E-1, 3.91925275E-1, 4.06383276E-1},
+{2.52825916E-1, 3.49636555E-1, 3.84550989E-1, 4.05930996E-1},
+{3.42927098E-1, 3.74274015E-1, 4.05468166E-1, 4.20351923E-1},
+{2.52408743E-1, 2.80375838E-1, 3.21436584E-1, 3.88436913E-1},
+{2.96970189E-1, 3.17173600E-1, 3.65342557E-1, 4.02736843E-1},
+{2.81905174E-1, 3.01479161E-1, 3.34335625E-1, 4.07633483E-1},
+{3.26872945E-1, 3.47177684E-1, 3.75017703E-1, 4.05372381E-1},
+{2.36371145E-1, 3.16441059E-1, 3.48707020E-1, 3.82030427E-1},
+{2.87817597E-1, 3.13627005E-1, 4.05129731E-1, 4.23379660E-1},
+{2.77502477E-1, 3.01843822E-1, 3.72250855E-1, 4.19212818E-1},
+{3.28988850E-1, 3.61901104E-1, 4.02015507E-1, 4.19229805E-1},
+{2.24960461E-1, 2.74636388E-1, 3.77016127E-1, 3.94726515E-1},
+{3.01045477E-1, 3.40486169E-1, 3.74888122E-1, 4.02532160E-1},
+{2.59897947E-1, 3.30334961E-1, 3.57493818E-1, 4.08657968E-1},
+{3.00961852E-1, 3.56449068E-1, 4.04779494E-1, 4.22508955E-1},
+{2.20979586E-1, 3.16477656E-1, 4.01744068E-1, 4.20735776E-1},
+{2.79754996E-1, 3.30776095E-1, 4.11152899E-1, 4.32687044E-1},
+{2.64246881E-1, 3.16610634E-1, 3.83876741E-1, 4.36683774E-1},
+{3.44381154E-1, 3.85365665E-1, 4.24949467E-1, 4.41560209E-1},
+{2.19488308E-1, 2.36459881E-1, 3.42465997E-1, 4.24989998E-1},
+{2.91465104E-1, 3.22282016E-1, 3.72852802E-1, 3.91635895E-1},
+{2.74792433E-1, 3.16536307E-1, 3.45392585E-1, 3.74555230E-1},
+{3.10583472E-1, 3.35264921E-1, 3.87527227E-1, 4.23076212E-1},
+{2.23211512E-1, 2.98497617E-1, 3.68426204E-1, 3.90213728E-1},
+{2.89078832E-1, 3.26512754E-1, 3.76308680E-1, 4.09553707E-1},
+{2.63830125E-1, 3.08977246E-1, 3.81453037E-1, 4.04660761E-1},
+{3.47073615E-1, 3.64797831E-1, 3.86763453E-1, 4.04511690E-1},
+{2.18452707E-1, 2.75614083E-1, 3.62711072E-1, 4.18278992E-1},
+{3.15042794E-1, 3.40813220E-1, 3.78627181E-1, 3.96316767E-1},
+{2.79727697E-1, 3.31259727E-1, 3.60061288E-1, 3.81175518E-1},
+{3.18602443E-1, 3.38044286E-1, 4.09010768E-1, 4.30300415E-1},
+{2.64196932E-1, 2.90672481E-1, 3.68595004E-1, 4.31856751E-1},
+{2.72645593E-1, 3.63514841E-1, 3.96518826E-1, 4.20091212E-1},
+{2.26540968E-1, 3.50055099E-1, 3.93851519E-1, 4.12597001E-1},
+{3.53053868E-1, 3.69929552E-1, 4.09656048E-1, 4.26387310E-1},
+{2.60788381E-1, 2.85172462E-1, 3.45943332E-1, 3.97500694E-1},
+{3.01113129E-1, 3.28201890E-1, 3.56068015E-1, 4.10803795E-1},
+{2.88101614E-1, 3.09559643E-1, 3.43756795E-1, 4.24872875E-1},
+{3.10489357E-1, 3.51421893E-1, 3.93717408E-1, 4.15550530E-1},
+{2.22308263E-1, 3.26798201E-1, 3.77981663E-1, 3.98635030E-1},
+{3.02915514E-1, 3.22781920E-1, 3.98558855E-1, 4.25489604E-1},
+{2.77136803E-1, 3.19992602E-1, 3.77490878E-1, 4.29177463E-1},
+{3.38731766E-1, 3.58164370E-1, 4.08386350E-1, 4.25495386E-1},
+{2.18726233E-1, 2.84384966E-1, 3.94053698E-1, 4.16346967E-1},
+{3.01005960E-1, 3.44093680E-1, 3.69013667E-1, 4.15091276E-1},
+{2.80783713E-1, 3.33053648E-1, 3.76726151E-1, 3.97526860E-1},
+{3.14394057E-1, 3.62678826E-1, 4.23668981E-1, 4.41899240E-1},
+{2.66453624E-1, 3.08513761E-1, 3.97407174E-1, 4.17450190E-1},
+{2.94222653E-1, 3.41904402E-1, 4.12726879E-1, 4.34888899E-1},
+{2.87300706E-1, 3.32434595E-1, 3.78856659E-1, 4.38234031E-1},
+{3.57146621E-1, 3.98147047E-1, 4.29875731E-1, 4.44243908E-1},
+{2.29671344E-1, 2.51018614E-1, 3.41046572E-1, 4.04376328E-1},
+{2.94472575E-1, 3.34944606E-1, 3.60409737E-1, 3.83682847E-1},
+{2.88250983E-1, 3.11722696E-1, 3.31680059E-1, 3.65104675E-1},
+{3.24881613E-1, 3.45656693E-1, 3.88306379E-1, 4.05954897E-1},
+{2.50829220E-1, 2.77623534E-1, 3.70799541E-1, 3.90479207E-1},
+{2.93523371E-1, 3.28319192E-1, 3.92112255E-1, 4.09464061E-1},
+{2.83608794E-1, 3.03885639E-1, 3.78504395E-1, 3.97310555E-1},
+{3.34039807E-1, 3.52837384E-1, 3.97272944E-1, 4.14322019E-1},
+{2.21891895E-1, 2.51877457E-1, 3.71723533E-1, 4.31791008E-1},
+{3.13201427E-1, 3.41175437E-1, 3.65503550E-1, 3.88567209E-1},
+{2.71330535E-1, 3.39163721E-1, 3.62616420E-1, 3.95735979E-1},
+{3.07550132E-1, 3.47777665E-1, 4.01049614E-1, 4.32767451E-1},
+{2.59387434E-1, 2.87243843E-1, 3.86817336E-1, 4.06042695E-1},
+{2.85485208E-1, 3.44094992E-1, 4.02050495E-1, 4.19413745E-1},
+{2.65781403E-1, 3.40084374E-1, 3.69407654E-1, 4.27031696E-1},
+{3.53740931E-1, 3.84463251E-1, 4.11747813E-1, 4.26181793E-1},
+{2.43866488E-1, 2.68350184E-1, 3.42201948E-1, 3.98457229E-1},
+{2.93145239E-1, 3.34754169E-1, 3.61702800E-1, 3.98416638E-1},
+{2.91342974E-1, 3.13155174E-1, 3.36525917E-1, 3.87748599E-1},
+{3.05656791E-1, 3.62904549E-1, 3.88153434E-1, 4.05543149E-1},
+{2.17492327E-1, 3.11723530E-1, 3.75984788E-1, 4.28997755E-1},
+{2.91149259E-1, 3.29380929E-1, 4.03900385E-1, 4.22333181E-1},
+{2.90362060E-1, 3.09530973E-1, 3.78994226E-1, 4.13688362E-1},
+{3.29564869E-1, 3.77404690E-1, 4.06584859E-1, 4.24739718E-1},
+{2.46461585E-1, 2.71593273E-1, 3.66338253E-1, 4.30753767E-1},
+{3.14107716E-1, 3.37011874E-1, 3.80409718E-1, 4.11099434E-1},
+{2.76568413E-1, 3.27320695E-1, 3.58844280E-1, 4.28949475E-1},
+{3.17179084E-1, 3.58972430E-1, 4.04765844E-1, 4.40376341E-1},
+{2.42777750E-1, 3.34954798E-1, 3.96943450E-1, 4.13318396E-1},
+{2.88895488E-1, 3.25691164E-1, 4.22859550E-1, 4.43758667E-1},
+{2.77583301E-1, 3.25479031E-1, 3.89144659E-1, 4.41075861E-1},
+{3.59125674E-1, 3.90694141E-1, 4.21009541E-1, 4.35708523E-1},
+{2.20172390E-1, 2.47719273E-1, 3.54381859E-1, 4.25398111E-1},
+{3.06046784E-1, 3.27924728E-1, 3.66992772E-1, 3.93192589E-1},
+{2.70805597E-1, 3.16826642E-1, 3.45648706E-1, 4.11717594E-1},
+{3.23188901E-1, 3.45463097E-1, 3.89778793E-1, 4.21570778E-1},
+{2.46136114E-1, 3.12391996E-1, 3.72188628E-1, 3.95842731E-1},
+{3.03856730E-1, 3.24354768E-1, 3.85747254E-1, 4.14155006E-1},
+{2.81075418E-1, 3.18608463E-1, 3.85646880E-1, 4.02703643E-1},
+{3.53517115E-1, 3.72702539E-1, 3.96264613E-1, 4.13074911E-1},
+{2.09221140E-1, 2.95262218E-1, 3.80314291E-1, 4.31278229E-1},
+{3.25313628E-1, 3.46735477E-1, 3.70724022E-1, 3.91045630E-1},
+{2.86396503E-1, 3.43560040E-1, 3.69713604E-1, 3.89867842E-1},
+{3.27794671E-1, 3.47367823E-1, 4.05465066E-1, 4.24566150E-1},
+{2.53054976E-1, 3.02656293E-1, 3.82165134E-1, 4.29898322E-1},
+{2.94418454E-1, 3.70745420E-1, 3.95443261E-1, 4.19514775E-1},
+{2.62873113E-1, 3.45069230E-1, 4.04140890E-1, 4.21902061E-1},
+{3.65063488E-1, 3.82435143E-1, 4.13424790E-1, 4.31241691E-1},
+{2.48788506E-1, 2.82372773E-1, 3.65772307E-1, 4.10981059E-1},
+{3.07288766E-1, 3.27828944E-1, 3.77664983E-1, 4.36220944E-1},
+{2.98542321E-1, 3.20627332E-1, 3.50569665E-1, 4.27620232E-1},
+{3.16258013E-1, 3.62903833E-1, 3.88225138E-1, 4.25608873E-1},
+{2.39077866E-1, 3.31310451E-1, 3.70317876E-1, 4.15995896E-1},
+{3.03735793E-1, 3.32806051E-1, 4.10232842E-1, 4.27751064E-1},
+{2.96002507E-1, 3.19014788E-1, 3.81062448E-1, 4.26954985E-1},
+{3.32508922E-1, 3.62516999E-1, 4.23315108E-1, 4.40995157E-1},
+{2.35128701E-1, 2.74731100E-1, 4.12070572E-1, 4.35478806E-1},
+{2.98073769E-1, 3.55338752E-1, 3.79087746E-1, 4.15318787E-1},
+{2.83429801E-1, 3.45264912E-1, 3.70376289E-1, 4.09900844E-1},
+{3.23593080E-1, 3.65412831E-1, 4.12813127E-1, 4.31023479E-1},
+{2.76626348E-1, 3.00508440E-1, 4.02236879E-1, 4.26638782E-1},
+{2.94512928E-1, 3.61443222E-1, 4.19635236E-1, 4.36999202E-1},
+{2.90807247E-1, 3.41689348E-1, 3.92779291E-1, 4.43490267E-1},
+{3.59391451E-1, 4.03985143E-1, 4.40843761E-1, 4.53028619E-1},
+{2.23295465E-1, 2.39192486E-1, 3.23768020E-1, 4.21689451E-1},
+{2.94778049E-1, 3.18798721E-1, 3.53217840E-1, 3.91906381E-1},
+{2.59032130E-1, 3.10240507E-1, 3.43569040E-1, 3.95064235E-1},
+{3.16474676E-1, 3.38544369E-1, 3.93329024E-1, 4.12235558E-1},
+{2.40108207E-1, 2.84631193E-1, 3.60280991E-1, 3.79973769E-1},
+{2.96909094E-1, 3.15798342E-1, 3.94964337E-1, 4.15127575E-1},
+{2.85434067E-1, 3.04921508E-1, 3.61974716E-1, 4.05767262E-1},
+{3.37407053E-1, 3.56672168E-1, 3.85155082E-1, 4.11186695E-1},
+{2.24014923E-1, 2.60116160E-1, 3.94772530E-1, 4.19585884E-1},
+{3.00647914E-1, 3.41640651E-1, 3.70223522E-1, 3.89520049E-1},
+{2.65946031E-1, 3.25039148E-1, 3.74339938E-1, 3.92346144E-1},
+{3.16029310E-1, 3.40491295E-1, 4.02355313E-1, 4.20484245E-1},
+{2.69841492E-1, 2.94562399E-1, 3.62341762E-1, 4.06415462E-1},
+{2.78897285E-1, 3.59831035E-1, 3.82025838E-1, 4.10577476E-1},
+{2.60760844E-1, 3.31088543E-1, 3.88826251E-1, 4.05486643E-1},
+{3.43372285E-1, 3.82647038E-1, 4.14716601E-1, 4.31592941E-1},
+{2.47998103E-1, 2.73393154E-1, 3.31160426E-1, 4.18943226E-1},
+{3.03579569E-1, 3.25202465E-1, 3.70984435E-1, 4.14420485E-1},
+{2.76896894E-1, 3.00499499E-1, 3.54178190E-1, 4.28807020E-1},
+{3.23655546E-1, 3.59816968E-1, 3.89525414E-1, 4.09288704E-1},
+{2.38927796E-1, 3.09919238E-1, 3.53915572E-1, 4.16634321E-1},
+{2.81171739E-1, 3.07520270E-1, 4.16264892E-1, 4.38523829E-1},
+{2.88858652E-1, 3.09810817E-1, 3.67845178E-1, 4.36035573E-1},
+{3.38423491E-1, 3.70634377E-1, 4.15449977E-1, 4.31534529E-1},
+{2.41260394E-1, 2.73617864E-1, 3.89554620E-1, 4.12539542E-1},
+{2.98046708E-1, 3.40122104E-1, 3.86183739E-1, 4.13826346E-1},
+{2.82436430E-1, 3.31597507E-1, 3.57941389E-1, 4.12115216E-1},
+{3.03820193E-1, 3.70588601E-1, 4.05774951E-1, 4.31517065E-1},
+{2.39077732E-1, 3.11638474E-1, 4.13935781E-1, 4.35304046E-1},
+{2.67116845E-1, 3.41937900E-1, 4.17409420E-1, 4.39184844E-1},
+{2.67946839E-1, 3.33343923E-1, 3.86481404E-1, 4.37462509E-1},
+{3.40510964E-1, 3.90878022E-1, 4.35485125E-1, 4.49101925E-1},
+{2.10069850E-1, 2.32524484E-1, 3.61781418E-1, 4.31357861E-1},
+{2.94509888E-1, 3.33709776E-1, 3.82278621E-1, 3.98638904E-1},
+{2.80525148E-1, 3.25905204E-1, 3.50647032E-1, 3.92873943E-1},
+{3.19999635E-1, 3.43674660E-1, 3.91070545E-1, 4.37501073E-1},
+{2.20581010E-1, 3.03151906E-1, 3.81765544E-1, 4.04488146E-1},
+{2.86122739E-1, 3.29746544E-1, 3.88102829E-1, 4.24247742E-1},
+{2.69807100E-1, 3.25332284E-1, 3.79154503E-1, 4.15138245E-1},
+{3.34858894E-1, 3.69258404E-1, 3.94743145E-1, 4.11922157E-1},
+{2.07109794E-1, 2.72779524E-1, 3.78566444E-1, 4.34579968E-1},
+{3.06466222E-1, 3.46695721E-1, 3.87138307E-1, 4.03558314E-1},
+{2.70148575E-1, 3.46654534E-1, 3.77696693E-1, 3.96434486E-1},
+{3.18745911E-1, 3.40225697E-1, 4.14991558E-1, 4.41578746E-1},
+{2.58592844E-1, 3.14370096E-1, 3.65083754E-1, 4.21615183E-1},
+{2.82712996E-1, 3.54137123E-1, 4.06745970E-1, 4.29267883E-1},
+{2.52021760E-1, 3.59105110E-1, 3.95102918E-1, 4.18148398E-1},
+{3.54906201E-1, 3.74952912E-1, 4.18965995E-1, 4.36144412E-1},
+{2.64841139E-1, 2.92941809E-1, 3.27751458E-1, 4.08790469E-1},
+{3.07774246E-1, 3.35586190E-1, 3.62209618E-1, 4.25394237E-1},
+{2.88466334E-1, 3.16075742E-1, 3.60989630E-1, 4.19551432E-1},
+{3.17128420E-1, 3.55772197E-1, 4.05808747E-1, 4.23972964E-1},
+{2.47089684E-1, 3.38184595E-1, 3.71859610E-1, 3.95971477E-1},
+{3.07981730E-1, 3.32691789E-1, 4.00534213E-1, 4.38273668E-1},
+{2.79484808E-1, 3.16183507E-1, 3.97237718E-1, 4.34746623E-1},
+{3.44490469E-1, 3.66153181E-1, 4.10959423E-1, 4.41727102E-1},
+{2.35741779E-1, 2.94587255E-1, 3.98072541E-1, 4.16833401E-1},
+{3.14038455E-1, 3.52272034E-1, 3.79138887E-1, 4.10969079E-1},
+{2.83002496E-1, 3.38136256E-1, 3.88641894E-1, 4.06193316E-1},
+{3.23625326E-1, 3.50243390E-1, 4.28089559E-1, 4.46630359E-1},
+{2.61252105E-1, 3.24970961E-1, 4.00214493E-1, 4.25321758E-1},
+{3.05284500E-1, 3.42164159E-1, 4.24475133E-1, 4.43830967E-1},
+{2.87374794E-1, 3.32500637E-1, 3.94308269E-1, 4.42538500E-1},
+{3.74075353E-1, 4.02026355E-1, 4.30933535E-1, 4.44160044E-1},
+{2.34503999E-1, 2.56218612E-1, 3.41238797E-1, 4.23045278E-1},
+{3.05492580E-1, 3.29156995E-1, 3.52709830E-1, 3.92439067E-1},
+{2.81323552E-1, 3.03292334E-1, 3.48925412E-1, 3.93163860E-1},
+{3.21893454E-1, 3.50419939E-1, 3.97317469E-1, 4.14560318E-1},
+{2.39684582E-1, 2.92451501E-1, 3.78937423E-1, 3.96535456E-1},
+{3.07307243E-1, 3.29127908E-1, 3.98455560E-1, 4.16143298E-1},
+{2.85274565E-1, 3.08774531E-1, 3.92916501E-1, 4.14437652E-1},
+{3.44446361E-1, 3.62201869E-1, 3.97619784E-1, 4.17743623E-1},
+{2.32083067E-1, 2.67807961E-1, 3.78075659E-1, 4.34560895E-1},
+{3.04738700E-1, 3.51865292E-1, 3.75973165E-1, 3.95293653E-1},
+{2.61990905E-1, 3.46207321E-1, 3.71296942E-1, 4.12438929E-1},
+{3.11080933E-1, 3.51040900E-1, 4.16082799E-1, 4.34340119E-1},
+{2.74980426E-1, 2.96631455E-1, 3.87520492E-1, 4.09243762E-1},
+{2.90939093E-1, 3.54455590E-1, 3.93426955E-1, 4.08220291E-1},
+{2.71871865E-1, 3.45510781E-1, 3.87125313E-1, 4.22590613E-1},
+{3.63245904E-1, 3.81932199E-1, 4.04114902E-1, 4.18370664E-1},
+{2.45770738E-1, 2.72909343E-1, 3.48317921E-1, 4.25161839E-1},
+{3.14139009E-1, 3.37872326E-1, 3.65195215E-1, 4.04423416E-1},
+{2.94075787E-1, 3.16935539E-1, 3.43047202E-1, 4.06130373E-1},
+{3.14627469E-1, 3.72413397E-1, 4.00660694E-1, 4.17930841E-1},
+{2.34014243E-1, 3.14007223E-1, 3.83003533E-1, 4.34829175E-1},
+{2.93635666E-1, 3.20529997E-1, 4.10837352E-1, 4.36393142E-1},
+{2.89505839E-1, 3.11828852E-1, 3.86311471E-1, 4.38771248E-1},
+{3.26317430E-1, 3.80858183E-1, 4.19721425E-1, 4.38795507E-1},
+{2.50809520E-1, 2.83018053E-1, 3.82247388E-1, 4.34244394E-1},
+{3.18994045E-1, 3.44855130E-1, 3.72690141E-1, 4.23067033E-1},
+{2.88380086E-1, 3.36622238E-1, 3.69742334E-1, 4.25057590E-1},
+{3.06107700E-1, 3.81856918E-1, 4.18206155E-1, 4.32868361E-1},
+{2.33898312E-1, 3.44861805E-1, 4.12176549E-1, 4.29216206E-1},
+{2.85980880E-1, 3.42903793E-1, 4.25112903E-1, 4.44299698E-1},
+{2.79858828E-1, 3.38789344E-1, 3.92085373E-1, 4.40541029E-1},
+{3.64509344E-1, 3.82202744E-1, 4.29830611E-1, 4.45818365E-1},
+{2.34392300E-1, 2.57377386E-1, 3.59567046E-1, 4.30088580E-1},
+{3.05031896E-1, 3.27589393E-1, 3.78305554E-1, 4.01026130E-1},
+{2.77522624E-1, 3.18130314E-1, 3.67794275E-1, 4.01543021E-1},
+{3.33035767E-1, 3.55820954E-1, 3.87548923E-1, 4.24628675E-1},
+{2.45021001E-1, 3.12560678E-1, 3.91147614E-1, 4.08762813E-1},
+{2.97059119E-1, 3.40246916E-1, 3.92919302E-1, 4.28899705E-1},
+{2.77839303E-1, 3.25019777E-1, 3.97436380E-1, 4.15920913E-1},
+{3.49465251E-1, 3.70362461E-1, 3.95482540E-1, 4.31923389E-1},
+{2.31485590E-1, 2.91023374E-1, 3.77909541E-1, 4.32259738E-1},
+{3.19283485E-1, 3.53671074E-1, 3.80982876E-1, 3.97843361E-1},
+{2.89689243E-1, 3.50265682E-1, 3.80729675E-1, 3.97969365E-1},
+{3.28987300E-1, 3.52005422E-1, 4.12557244E-1, 4.37597930E-1},
+{2.76273251E-1, 3.02267194E-1, 3.81723404E-1, 4.34989095E-1},
+{2.79627264E-1, 3.73727322E-1, 4.12374616E-1, 4.30626333E-1},
+{2.53442764E-1, 3.65940034E-1, 4.14937019E-1, 4.32743609E-1},
+{3.76107216E-1, 3.95142019E-1, 4.16787744E-1, 4.33023572E-1},
+{2.62815833E-1, 2.88270533E-1, 3.47397208E-1, 4.24182594E-1},
+{3.01931322E-1, 3.43652546E-1, 3.77031326E-1, 4.34204459E-1},
+{2.97834277E-1, 3.23495388E-1, 3.64492416E-1, 4.33550835E-1},
+{3.31774473E-1, 3.64324927E-1, 3.98243546E-1, 4.35078323E-1},
+{2.49049723E-1, 3.27870786E-1, 3.83587003E-1, 4.35558081E-1},
+{3.04653406E-1, 3.27671230E-1, 4.18484688E-1, 4.41378772E-1},
+{2.96960890E-1, 3.23898911E-1, 3.90463710E-1, 4.39915955E-1},
+{3.43923748E-1, 3.67100477E-1, 4.29523230E-1, 4.45214987E-1},
+{2.59399652E-1, 2.91602671E-1, 4.04372454E-1, 4.31413233E-1},
+{2.97537506E-1, 3.57573807E-1, 3.88991833E-1, 4.30006981E-1},
+{2.84068942E-1, 3.49574566E-1, 3.81042838E-1, 4.29712772E-1},
+{3.25716257E-1, 3.74875903E-1, 4.31959271E-1, 4.47290838E-1},
+{2.65302956E-1, 3.14745963E-1, 4.16703463E-1, 4.37294722E-1},
+{3.00398588E-1, 3.54147255E-1, 4.28538084E-1, 4.60336387E-1},
+{2.98077166E-1, 3.49304914E-1, 4.00429249E-1, 4.48213518E-1},
+{3.75576198E-1, 4.16657329E-1, 4.42136765E-1, 4.52728629E-1}};
+
+static const float evrc_lspq_quant_codebook1[16][5] = {
+{0.42091064E-1, 0.69474973E-1, 0.11168948E+0, 0.14571965E+0, 0.20893581E+0},
+{0.54944664E-1, 0.98242261E-1, 0.11007882E+0, 0.15890779E+0, 0.20548241E+0},
+{0.45188572E-1, 0.75199433E-1, 0.11423391E+0, 0.15469728E+0, 0.19746706E+0},
+{0.49474996E-1, 0.79667501E-1, 0.12571351E+0, 0.16944779E+0, 0.20775315E+0},
+{0.41789379E-1, 0.63459560E-1, 0.12068028E+0, 0.15850765E+0, 0.20406815E+0},
+{0.47159236E-1, 0.79129547E-1, 0.12183110E+0, 0.15650047E+0, 0.22309226E+0},
+{0.54539919E-1, 0.80343045E-1, 0.12947764E+0, 0.15186153E+0, 0.20171718E+0},
+{0.55852082E-1, 0.94114847E-1, 0.14016025E+0, 0.17807084E+0, 0.22955489E+0},
+{0.45443531E-1, 0.73541410E-1, 0.11937657E+0, 0.15442030E+0, 0.21010752E+0},
+{0.63178010E-1, 0.95231488E-1, 0.12364983E+0, 0.17672543E+0, 0.21743731E+0},
+{0.52765369E-1, 0.84351443E-1, 0.11589085E+0, 0.15790924E+0, 0.20732352E+0},
+{0.51865745E-1, 0.81328541E-1, 0.13756232E+0, 0.18322878E+0, 0.21640070E+0},
+{0.44419531E-1, 0.68874463E-1, 0.13115251E+0, 0.16263582E+0, 0.21659100E+0},
+{0.49378436E-1, 0.81882551E-1, 0.13067168E+0, 0.16821896E+0, 0.23136081E+0},
+{0.55909779E-1, 0.90783298E-1, 0.13348848E+0, 0.16298474E+0, 0.20961523E+0},
+{0.61378211E-1, 0.98602772E-1, 0.14793332E+0, 0.19283190E+0, 0.23156509E+0}};
+
+static const float evrc_lspq_quant_codebook2[16][5] = {
+{0.26822963, 0.30585295, 0.31110349, 0.36823335, 0.40774474},
+{0.24418014, 0.28970167, 0.32573757, 0.39021483, 0.41345838},
+{0.23341830, 0.30078292, 0.32893899, 0.38557330, 0.41068462},
+{0.25905868, 0.29756859, 0.34196618, 0.38531172, 0.41295227},
+{0.24290450, 0.29223618, 0.32718554, 0.37788135, 0.40332928},
+{0.24674191, 0.29749370, 0.33631226, 0.39426059, 0.42258954},
+{0.21377595, 0.33140418, 0.34067687, 0.38222077, 0.40939021},
+{0.26673481, 0.30791649, 0.34419721, 0.39611506, 0.42387524},
+{0.26121426, 0.30492544, 0.32997236, 0.38486803, 0.42023736},
+{0.24954870, 0.29372856, 0.33382735, 0.37850669, 0.41714057},
+{0.24158891, 0.30173415, 0.34128246, 0.38428575, 0.41619650},
+{0.25818908, 0.31736413, 0.34904337, 0.38769925, 0.41551358},
+{0.24450587, 0.30673453, 0.33579323, 0.37844428, 0.40557048},
+{0.25164026, 0.31225079, 0.33847794, 0.39554194, 0.42396802},
+{0.22787990, 0.31779197, 0.33831909, 0.40044111, 0.41185561},
+{0.27896860, 0.32261974, 0.35658112, 0.40206763, 0.42370448}};
+
+static const float * const evrc_lspq_full_codebooks[] = {
+    evrc_lspq_full_codebook1[0], evrc_lspq_full_codebook2[0],
+    evrc_lspq_full_codebook3[0], evrc_lspq_full_codebook4[0],
+};
+
+static const float * const evrc_lspq_half_codebooks[] = {
+    evrc_lspq_half_codebook1[0], evrc_lspq_half_codebook2[0],
+    evrc_lspq_half_codebook3[0],
+};
+
+static const float * const evrc_lspq_quant_codebooks[] = {
+    evrc_lspq_quant_codebook1[0], evrc_lspq_quant_codebook2[0],
+};
+
+static const float * const * const evrc_lspq_codebooks[] = {
+    0,
+    evrc_lspq_quant_codebooks,
+    0,
+    evrc_lspq_half_codebooks,
+    evrc_lspq_full_codebooks,
+};
+
+static const uint8_t evrc_lspq_nb_codebooks[] = {
+    0,
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebooks),
+    0,
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebooks),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebooks),
+};
+
+static const uint8_t evrc_lspq_full_codebooks_row_sizes[] = {
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook2[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook3[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook4[0]),
+};
+
+static const uint8_t evrc_lspq_half_codebooks_row_sizes[] = {
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook2[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook3[0]),
+};
+
+static const uint8_t evrc_lspq_quant_codebooks_row_sizes[] = {
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebook2[0]),
+};
+
+static const uint8_t* const evrc_lspq_codebooks_row_sizes[] = {
+    NULL,
+    evrc_lspq_quant_codebooks_row_sizes,
+    NULL,
+    evrc_lspq_half_codebooks_row_sizes,
+    evrc_lspq_full_codebooks_row_sizes,
+};
+
+static const float pitch_gain_vq[] = { 0, 0.3, 0.55, 0.7, 0.8, 0.9, 1, 1.2 };
+static const float estimation_delay[] = { 55.0, 80.0, 39.0, 71.0, 33.0 }; // 5.2.3.4
+static const uint8_t subframe_sizes[] = { 53, 53, 54 };
+#endif /* AVCODEC_EVRCDATA_H */
diff --git a/libavcodec/evrcdec.c b/libavcodec/evrcdec.c
new file mode 100644
index 0000000..8728c02
--- /dev/null
+++ b/libavcodec/evrcdec.c
@@ -0,0 +1,941 @@
+/*
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * @author Paul B Mahol
+ */
+
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "evrcdata.h"
+#include "acelp_vectors.h"
+#include "lsp.h"
+
+#define MIN_LSP_SEP (0.05 / (2.0 * M_PI))
+#define MIN_DELAY      20
+#define MAX_DELAY     120
+#define NB_SUBFRAMES    3
+#define SUBFRAME_SIZE  54
+#define FILTER_ORDER   10
+#define ACB_SIZE      128
+
+typedef enum {
+    RATE_ERRS = -1,
+    SILENCE,
+    RATE_QUANT,
+    RATE_QUARTER,
+    RATE_HALF,
+    RATE_FULL,
+} evrc_packet_rate;
+
+/**
+ * EVRC-A unpacked data frame
+ */
+typedef struct EVRCAFrame {
+    uint8_t  lpc_flag;        ///< spectral change indicator
+    uint16_t lsp[4];          ///< index into LSP codebook
+    uint8_t  pitch_delay;     ///< pitch delay for entire frame
+    uint8_t  delay_diff;      ///< delay difference for entire frame
+    uint8_t  acb_gain[3];     ///< adaptive codebook gain
+    uint16_t fcb_shape[3][4]; ///< fixed codebook shape
+    uint8_t  fcb_gain[3];     ///< fixed codebook gain index
+    uint8_t  energy_gain;     ///< frame energy gain index
+    uint8_t  tty;             ///< tty baud rate bit
+} EVRCAFrame;
+
+typedef struct EVRCContext {
+    AVClass *class;
+
+    int              postfilter;
+
+    GetBitContext    gb;
+    evrc_packet_rate bitrate;
+    evrc_packet_rate last_valid_bitrate;
+    EVRCAFrame       frame;
+
+    float            lspf[FILTER_ORDER];
+    float            prev_lspf[FILTER_ORDER];
+    float            synthesis[FILTER_ORDER];
+    float            postfilter_fir[FILTER_ORDER];
+    float            postfilter_iir[FILTER_ORDER];
+    float            postfilter_residual[ACB_SIZE + SUBFRAME_SIZE];
+    float            pitch_delay;
+    float            prev_pitch_delay;
+    float            avg_acb_gain;  ///< average adaptive codebook gain
+    float            avg_fcb_gain;  ///< average fixed codebook gain
+    float            pitch[ACB_SIZE + FILTER_ORDER + SUBFRAME_SIZE];
+    float            pitch_back[ACB_SIZE];
+    float            interpolation_coeffs[136];
+    float            energy_vector[NB_SUBFRAMES];
+    float            fade_scale;
+    float            last;
+
+    uint8_t          prev_energy_gain;
+    uint8_t          prev_error_flag;
+    uint8_t          warned_buf_mismatch_bitrate;
+} EVRCContext;
+
+/**
+ * Frame unpacking for RATE_FULL, RATE_HALF and RATE_QUANT
+ *
+ * @param e the context
+ *
+ * TIA/IS-127 Table 4.21-1
+ */
+static void unpack_frame(EVRCContext *e)
+{
+    EVRCAFrame *frame = &e->frame;
+    GetBitContext *gb = &e->gb;
+
+    switch (e->bitrate) {
+    case RATE_FULL:
+        frame->lpc_flag        = get_bits1(gb);
+        frame->lsp[0]          = get_bits(gb,  6);
+        frame->lsp[1]          = get_bits(gb,  6);
+        frame->lsp[2]          = get_bits(gb,  9);
+        frame->lsp[3]          = get_bits(gb,  7);
+        frame->pitch_delay     = get_bits(gb,  7);
+        frame->delay_diff      = get_bits(gb,  5);
+        frame->acb_gain[0]     = get_bits(gb,  3);
+        frame->fcb_shape[0][0] = get_bits(gb,  8);
+        frame->fcb_shape[0][1] = get_bits(gb,  8);
+        frame->fcb_shape[0][2] = get_bits(gb,  8);
+        frame->fcb_shape[0][3] = get_bits(gb, 11);
+        frame->fcb_gain[0]     = get_bits(gb,  5);
+        frame->acb_gain[1]     = get_bits(gb,  3);
+        frame->fcb_shape[1][0] = get_bits(gb,  8);
+        frame->fcb_shape[1][1] = get_bits(gb,  8);
+        frame->fcb_shape[1][2] = get_bits(gb,  8);
+        frame->fcb_shape[1][3] = get_bits(gb, 11);
+        frame->fcb_gain    [1] = get_bits(gb,  5);
+        frame->acb_gain    [2] = get_bits(gb,  3);
+        frame->fcb_shape[2][0] = get_bits(gb,  8);
+        frame->fcb_shape[2][1] = get_bits(gb,  8);
+        frame->fcb_shape[2][2] = get_bits(gb,  8);
+        frame->fcb_shape[2][3] = get_bits(gb, 11);
+        frame->fcb_gain    [2] = get_bits(gb,  5);
+        frame->tty             = get_bits1(gb);
+        break;
+    case RATE_HALF:
+        frame->lsp         [0] = get_bits(gb,  7);
+        frame->lsp         [1] = get_bits(gb,  7);
+        frame->lsp         [2] = get_bits(gb,  8);
+        frame->pitch_delay     = get_bits(gb,  7);
+        frame->acb_gain    [0] = get_bits(gb,  3);
+        frame->fcb_shape[0][0] = get_bits(gb, 10);
+        frame->fcb_gain    [0] = get_bits(gb,  4);
+        frame->acb_gain    [1] = get_bits(gb,  3);
+        frame->fcb_shape[1][0] = get_bits(gb, 10);
+        frame->fcb_gain    [1] = get_bits(gb,  4);
+        frame->acb_gain    [2] = get_bits(gb,  3);
+        frame->fcb_shape[2][0] = get_bits(gb, 10);
+        frame->fcb_gain    [2] = get_bits(gb,  4);
+        break;
+    case RATE_QUANT:
+        frame->lsp         [0] = get_bits(gb, 4);
+        frame->lsp         [1] = get_bits(gb, 4);
+        frame->energy_gain     = get_bits(gb, 8);
+        break;
+    }
+}
+
+static evrc_packet_rate buf_size2bitrate(const int buf_size)
+{
+    switch (buf_size) {
+    case 23: return RATE_FULL;
+    case 11: return RATE_HALF;
+    case  6: return RATE_QUARTER;
+    case  3: return RATE_QUANT;
+    case  1: return SILENCE;
+    }
+
+    return RATE_ERRS;
+}
+
+/**
+ * Determine the bitrate from the frame size and/or the first byte of the frame.
+ *
+ * @param avctx the AV codec context
+ * @param buf_size length of the buffer
+ * @param buf the bufffer
+ *
+ * @return the bitrate on success,
+ *         RATE_ERRS  if the bitrate cannot be satisfactorily determined
+ */
+static evrc_packet_rate determine_bitrate(AVCodecContext *avctx,
+                                          int *buf_size,
+                                          const uint8_t **buf)
+{
+    evrc_packet_rate bitrate;
+
+    if ((bitrate = buf_size2bitrate(*buf_size)) >= 0) {
+        if (bitrate > **buf) {
+            EVRCContext *e = avctx->priv_data;
+            if (!e->warned_buf_mismatch_bitrate) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Claimed bitrate and buffer size mismatch.\n");
+                e->warned_buf_mismatch_bitrate = 1;
+            }
+            bitrate = **buf;
+        } else if (bitrate < **buf) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Buffer is too small for the claimed bitrate.\n");
+            return RATE_ERRS;
+        }
+        (*buf)++;
+        *buf_size -= 1;
+    } else if ((bitrate = buf_size2bitrate(*buf_size + 1)) >= 0) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+    } else
+        return RATE_ERRS;
+
+    return bitrate;
+}
+
+static void warn_insufficient_frame_quality(AVCodecContext *avctx,
+                                            const char *message)
+{
+    av_log(avctx, AV_LOG_WARNING, "Frame #%d, %s\n",
+           avctx->frame_number, message);
+}
+
+/**
+ * Initialize the speech codec according to the specification.
+ *
+ * TIA/IS-127 5.2
+ */
+static av_cold int evrc_decode_init(AVCodecContext *avctx)
+{
+    EVRCContext *e = avctx->priv_data;
+    int i, n, idx = 0;
+    float denom = 2.0 / (2.0 * 8.0 + 1.0);
+
+    avctx->channels       = 1;
+    avctx->channel_layout = AV_CH_LAYOUT_MONO;
+    avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        e->prev_lspf[i] = (i + 1) * 0.048;
+        e->synthesis[i] = 0.0;
+    }
+
+    for (i = 0; i < ACB_SIZE; i++)
+        e->pitch[i] = e->pitch_back[i] = 0.0;
+
+    e->last_valid_bitrate = RATE_QUANT;
+    e->prev_pitch_delay   = 40.0;
+    e->fade_scale         = 1.0;
+    e->prev_error_flag    = 0;
+    e->avg_acb_gain = e->avg_fcb_gain = 0.0;
+
+    for (i = 0; i < 8; i++) {
+        float tt = ((float)i - 8.0 / 2.0) / 8.0;
+
+        for (n = -8; n <= 8; n++, idx++) {
+            float arg1 = M_PI * 0.9 * (tt - n);
+            float arg2 = M_PI * (tt - n);
+
+            e->interpolation_coeffs[idx] = 0.9;
+            if (arg1)
+                e->interpolation_coeffs[idx] *= (0.54 + 0.46 * cos(arg2 * denom)) *
+                                                 sin(arg1) / arg1;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Decode the 10 vector quantized line spectral pair frequencies from the LSP
+ * transmission codes of any bitrate and check for badly received packets.
+ *
+ * @param e the context
+ *
+ * @return 0 on success, -1 if the packet is badly received
+ *
+ * TIA/IS-127 5.2.1, 5.7.1
+ */
+static int decode_lspf(EVRCContext *e)
+{
+    const float * const *codebooks = evrc_lspq_codebooks[e->bitrate];
+    int i, j, k = 0;
+
+    for (i = 0; i < evrc_lspq_nb_codebooks[e->bitrate]; i++) {
+        int row_size = evrc_lspq_codebooks_row_sizes[e->bitrate][i];
+        const float *codebook = codebooks[i];
+
+        for (j = 0; j < row_size; j++)
+            e->lspf[k++] = codebook[e->frame.lsp[i] * row_size + j];
+    }
+
+    // check for monotonic LSPs
+    for (i = 1; i < FILTER_ORDER; i++)
+        if (e->lspf[i] <= e->lspf[i - 1])
+            return -1;
+
+    // check for minimum separation of LSPs at the splits
+    for (i = 0, k = 0; i < evrc_lspq_nb_codebooks[e->bitrate] - 1; i++) {
+        k += evrc_lspq_codebooks_row_sizes[e->bitrate][i];
+        if (e->lspf[k] - e->lspf[k - 1] <= MIN_LSP_SEP)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Interpolation of LSP parameters.
+ *
+ * TIA/IS-127 5.2.3.1, 5.7.3.2
+ */
+static void interpolate_lsp(float *ilsp, const float *lsp,
+                            const float *prev, int index)
+{
+    static const float lsp_interpolation_factors[] = { 0.1667, 0.5, 0.8333 };
+    ff_weighted_vector_sumf(ilsp, prev, lsp,
+                            1.0 - lsp_interpolation_factors[index],
+                            lsp_interpolation_factors[index], FILTER_ORDER);
+}
+
+/*
+ * Reconstruction of the delay contour.
+ *
+ * TIA/IS-127 5.2.2.3.2
+ */
+static void interpolate_delay(float *dst, float current, float prev, int index)
+{
+    static const float d_interpolation_factors[] = { 0, 0.3313, 0.6625, 1, 1 };
+    dst[0] = (1.0 - d_interpolation_factors[index    ]) * prev
+                  + d_interpolation_factors[index    ]  * current;
+    dst[1] = (1.0 - d_interpolation_factors[index + 1]) * prev
+                  + d_interpolation_factors[index + 1]  * current;
+    dst[2] = (1.0 - d_interpolation_factors[index + 2]) * prev
+                  + d_interpolation_factors[index + 2]  * current;
+}
+
+/*
+ * Convert the quantized, interpolated line spectral frequencies,
+ * to prediction coefficients.
+ *
+ * TIA/IS-127 5.2.3.2, 4.7.2.2
+ */
+static void decode_predictor_coeffs(const float *ilspf, float *ilpc)
+{
+    double lsp[FILTER_ORDER];
+    float a[FILTER_ORDER / 2 + 1], b[FILTER_ORDER / 2 + 1];
+    float a1[FILTER_ORDER / 2] = { 0 };
+    float a2[FILTER_ORDER / 2] = { 0 };
+    float b1[FILTER_ORDER / 2] = { 0 };
+    float b2[FILTER_ORDER / 2] = { 0 };
+    int i, k;
+
+    ff_acelp_lsf2lspd(lsp, ilspf, FILTER_ORDER);
+
+    for (k = 0; k <= FILTER_ORDER; k++) {
+        a[0] = k < 2 ? 0.25 : 0;
+        b[0] = k < 2 ? k < 1 ? 0.25 : -0.25 : 0;
+
+        for (i = 0; i < FILTER_ORDER / 2; i++) {
+            a[i + 1] = a[i] - 2 * lsp[i * 2    ] * a1[i] + a2[i];
+            b[i + 1] = b[i] - 2 * lsp[i * 2 + 1] * b1[i] + b2[i];
+            a2[i] = a1[i];
+            a1[i] = a[i];
+            b2[i] = b1[i];
+            b1[i] = b[i];
+        }
+
+        if (k)
+            ilpc[k - 1] = 2.0 * (a[FILTER_ORDER / 2] + b[FILTER_ORDER / 2]);
+    }
+}
+
+static void bl_intrp(EVRCContext *e, float *ex, float delay)
+{
+    float *f;
+    int offset, i, coef_idx;
+    int16_t t;
+
+    offset = lrintf(delay);
+
+    t = (offset - delay + 0.5) * 8.0 + 0.5;
+    if (t == 8) {
+        t = 0;
+        offset--;
+    }
+
+    f = ex - offset - 8;
+
+    coef_idx = t * (2 * 8 + 1);
+
+    ex[0] = 0.0;
+    for (i = 0; i < 2 * 8 + 1; i++)
+        ex[0] += e->interpolation_coeffs[coef_idx + i] * f[i];
+}
+
+/*
+ * Adaptive codebook excitation.
+ *
+ * TIA/IS-127 5.2.2.3.3, 4.12.5.2
+ */
+static void acb_excitation(EVRCContext *e, float *excitation, float gain,
+                           const float delay[3], int length)
+{
+    float denom, locdelay, dpr, invl;
+    int i;
+
+    invl = 1.0 / ((float) length);
+    dpr = length;
+
+    /* first at-most extra samples */
+    denom = (delay[1] - delay[0]) * invl;
+    for (i = 0; i < dpr; i++) {
+        locdelay = delay[0] + i * denom;
+        bl_intrp(e, excitation + i, locdelay);
+    }
+
+    denom = (delay[2] - delay[1]) * invl;
+    /* interpolation */
+    for (i = dpr; i < dpr + 10; i++) {
+        locdelay = delay[1] + (i - dpr) * denom;
+        bl_intrp(e, excitation + i, locdelay);
+    }
+
+    for (i = 0; i < length; i++)
+        excitation[i] *= gain;
+}
+
+static void decode_8_pulses_35bits(const uint16_t *fixed_index, float *cod)
+{
+    int i, pos1, pos2, offset;
+
+    offset = (fixed_index[3] >> 9) & 3;
+
+    for (i = 0; i < 3; i++) {
+        pos1 = ((fixed_index[i] & 0x7f) / 11) * 5 + ((i + offset) % 5);
+        pos2 = ((fixed_index[i] & 0x7f) % 11) * 5 + ((i + offset) % 5);
+
+        cod[pos1] = (fixed_index[i] & 0x80) ? -1.0 : 1.0;
+
+        if (pos2 < pos1)
+            cod[pos2]  = -cod[pos1];
+        else
+            cod[pos2] +=  cod[pos1];
+    }
+
+    pos1 = ((fixed_index[3] & 0x7f) / 11) * 5 + ((3 + offset) % 5);
+    pos2 = ((fixed_index[3] & 0x7f) % 11) * 5 + ((4 + offset) % 5);
+
+    cod[pos1] = (fixed_index[3] & 0x100) ? -1.0 : 1.0;
+    cod[pos2] = (fixed_index[3] & 0x80 ) ? -1.0 : 1.0;
+}
+
+static void decode_3_pulses_10bits(uint16_t fixed_index, float *cod)
+{
+    float sign;
+    int pos;
+
+    sign = (fixed_index & 0x200) ? -1.0 : 1.0;
+
+    pos = ((fixed_index        & 0x7) * 7) + 4;
+    cod[pos] += sign;
+    pos = (((fixed_index >> 3) & 0x7) * 7) + 2;
+    cod[pos] -= sign;
+    pos = (((fixed_index >> 6) & 0x7) * 7);
+    cod[pos] += sign;
+}
+
+/*
+ * Reconstruction of ACELP fixed codebook excitation for full and half rate.
+ *
+ * TIA/IS-127 5.2.3.7
+ */
+static void fcb_excitation(EVRCContext *e, const uint16_t *codebook,
+                           float *excitation, float pitch_gain,
+                           int pitch_lag, int subframe_size)
+{
+    int i;
+
+    if (e->bitrate == RATE_FULL)
+        decode_8_pulses_35bits(codebook, excitation);
+    else
+        decode_3_pulses_10bits(*codebook, excitation);
+
+    pitch_gain = av_clipf(pitch_gain, 0.2, 0.9);
+
+    for (i = pitch_lag; i < subframe_size; i++)
+        excitation[i] += pitch_gain * excitation[i - pitch_lag];
+}
+
+/**
+ * Synthesis of the decoder output signal.
+ *
+ * param[in]     in              input signal
+ * param[in]     filter_coeffs   LPC coefficients
+ * param[in/out] memory          synthesis filter memory
+ * param         buffer_length   amount of data to process
+ * param[out]    samples         output samples
+ *
+ * TIA/IS-127 5.2.3.15, 5.7.3.4
+ */
+static void synthesis_filter(const float *in, const float *filter_coeffs,
+                             float *memory, int buffer_length, float *samples)
+{
+    int i, j;
+
+    for (i = 0; i < buffer_length; i++) {
+        samples[i] = in[i];
+        for (j = FILTER_ORDER - 1; j > 0; j--) {
+            samples[i] -= filter_coeffs[j] * memory[j];
+            memory[j]   = memory[j - 1];
+        }
+        samples[i] -= filter_coeffs[0] * memory[0];
+        memory[0]   = samples[i];
+    }
+}
+
+static void bandwidth_expansion(float *coeff, const float *inbuf, float gamma)
+{
+    double fac = gamma;
+    int i;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        coeff[i] = inbuf[i] * fac;
+        fac *= gamma;
+    }
+}
+
+static void residual_filter(float *output, const float *input,
+                            const float *coef, float *memory, int length)
+{
+    float sum;
+    int i, j;
+
+    for (i = 0; i < length; i++) {
+        sum = input[i];
+
+        for (j = FILTER_ORDER - 1; j > 0; j--) {
+            sum      += coef[j] * memory[j];
+            memory[j] = memory[j - 1];
+        }
+        sum += coef[0] * memory[0];
+        memory[0] = input[i];
+        output[i] = sum;
+    }
+}
+
+/*
+ * TIA/IS-127 Table 5.9.1-1.
+ */
+static const struct PfCoeff {
+    float tilt;
+    float ltgain;
+    float p1;
+    float p2;
+} postfilter_coeffs[5] = {
+    { 0.0 , 0.0 , 0.0 , 0.0  },
+    { 0.0 , 0.0 , 0.57, 0.57 },
+    { 0.0 , 0.0 , 0.0 , 0.0  },
+    { 0.35, 0.50, 0.50, 0.75 },
+    { 0.20, 0.50, 0.57, 0.75 },
+};
+
+/*
+ * Adaptive postfilter.
+ *
+ * TIA/IS-127 5.9
+ */
+static void postfilter(EVRCContext *e, float *in, const float *coeff,
+                       float *out, int idx, const struct PfCoeff *pfc,
+                       int length)
+{
+    float wcoef1[FILTER_ORDER], wcoef2[FILTER_ORDER],
+          scratch[SUBFRAME_SIZE], temp[SUBFRAME_SIZE],
+          mem[SUBFRAME_SIZE];
+    float sum1 = 0.0, sum2 = 0.0, gamma, gain;
+    float tilt = pfc->tilt;
+    int i, n, best;
+
+    bandwidth_expansion(wcoef1, coeff, pfc->p1);
+    bandwidth_expansion(wcoef2, coeff, pfc->p2);
+
+    /* Tilt compensation filter, TIA/IS-127 5.9.1 */
+    for (i = 0; i < length - 1; i++)
+        sum2 += in[i] * in[i + 1];
+    if (sum2 < 0.0)
+        tilt = 0.0;
+
+    for (i = 0; i < length; i++) {
+        scratch[i] = in[i] - tilt * e->last;
+        e->last = in[i];
+    }
+
+    /* Short term residual filter, TIA/IS-127 5.9.2 */
+    residual_filter(&e->postfilter_residual[ACB_SIZE], scratch, wcoef1, e->postfilter_fir, length);
+
+    /* Long term postfilter */
+    best = idx;
+    for (i = FFMIN(MIN_DELAY, idx - 3); i <= FFMAX(MAX_DELAY, idx + 3); i++) {
+        for (n = ACB_SIZE, sum2 = 0; n < ACB_SIZE + length; n++)
+            sum2 += e->postfilter_residual[n] * e->postfilter_residual[n - i];
+        if (sum2 > sum1) {
+            sum1 = sum2;
+            best = i;
+        }
+    }
+
+    for (i = ACB_SIZE, sum1 = 0; i < ACB_SIZE + length; i++)
+        sum1 += e->postfilter_residual[i - best] * e->postfilter_residual[i - best];
+    for (i = ACB_SIZE, sum2 = 0; i < ACB_SIZE + length; i++)
+        sum2 += e->postfilter_residual[i] * e->postfilter_residual[i - best];
+
+    if (sum2 * sum1 == 0 || e->bitrate == RATE_QUANT) {
+        memcpy(temp, e->postfilter_residual + ACB_SIZE, length * sizeof(float));
+    } else {
+        gamma = sum2 / sum1;
+        if (gamma < 0.5)
+            memcpy(temp, e->postfilter_residual + ACB_SIZE, length * sizeof(float));
+        else {
+            gamma = FFMIN(gamma, 1.0);
+
+            for (i = 0; i < length; i++) {
+                temp[i] = e->postfilter_residual[ACB_SIZE + i] + gamma *
+                    pfc->ltgain * e->postfilter_residual[ACB_SIZE + i - best];
+            }
+        }
+    }
+
+    memcpy(scratch, temp, length * sizeof(float));
+    memcpy(mem, e->postfilter_iir, FILTER_ORDER * sizeof(float));
+    synthesis_filter(scratch, wcoef2, mem, length, scratch);
+
+    /* Gain computation, TIA/IS-127 5.9.4-2 */
+    for (i = 0, sum1 = 0, sum2 = 0; i < length; i++) {
+        sum1 += in[i] * in[i];
+        sum2 += scratch[i] * scratch[i];
+    }
+    gain = sum2 ? sqrt(sum1 / sum2) : 1.0;
+
+    for (i = 0; i < length; i++)
+        temp[i] *= gain;
+
+    /* Short term postfilter */
+    synthesis_filter(temp, wcoef2, e->postfilter_iir, length, out);
+
+    memmove(e->postfilter_residual,
+           e->postfilter_residual + length, ACB_SIZE * sizeof(float));
+}
+
+static void frame_erasure(EVRCContext *e, float *samples)
+{
+    float ilspf[FILTER_ORDER], ilpc[FILTER_ORDER], idelay[NB_SUBFRAMES],
+          tmp[SUBFRAME_SIZE + 6], f;
+    int i, j;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        if (e->bitrate != RATE_QUANT)
+            e->lspf[i] = e->prev_lspf[i] * 0.875 + 0.125 * (i + 1) * 0.048;
+        else
+            e->lspf[i] = e->prev_lspf[i];
+    }
+
+    if (e->prev_error_flag)
+        e->avg_acb_gain *= 0.75;
+    if (e->bitrate == RATE_FULL)
+        memcpy(e->pitch_back, e->pitch, ACB_SIZE * sizeof(float));
+    if (e->last_valid_bitrate == RATE_QUANT)
+        e->bitrate = RATE_QUANT;
+    else
+        e->bitrate = RATE_FULL;
+
+    if (e->bitrate == RATE_FULL || e->bitrate == RATE_HALF) {
+        e->pitch_delay = e->prev_pitch_delay;
+    } else {
+        float sum = 0;
+
+        idelay[0] = idelay[1] = idelay[2] = MIN_DELAY;
+
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            sum += evrc_energy_quant[e->prev_energy_gain][i];
+        sum /= (float) NB_SUBFRAMES;
+        sum  = pow(10, sum);
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            e->energy_vector[i] = sum;
+    }
+
+    if (fabs(e->pitch_delay - e->prev_pitch_delay) > 15)
+        e->prev_pitch_delay = e->pitch_delay;
+
+    for (i = 0; i < NB_SUBFRAMES; i++) {
+        int subframe_size = subframe_sizes[i];
+        int pitch_lag;
+
+        interpolate_lsp(ilspf, e->lspf, e->prev_lspf, i);
+
+        if (e->bitrate != RATE_QUANT) {
+            if (e->avg_acb_gain < 0.3) {
+                idelay[0] = estimation_delay[i];
+                idelay[1] = estimation_delay[i + 1];
+                idelay[2] = estimation_delay[i + 2];
+            } else {
+                interpolate_delay(idelay, e->pitch_delay, e->prev_pitch_delay, i);
+            }
+        }
+
+        pitch_lag = lrintf((idelay[1] + idelay[0]) / 2.0);
+        decode_predictor_coeffs(ilspf, ilpc);
+
+        if (e->bitrate != RATE_QUANT) {
+            acb_excitation(e, e->pitch + ACB_SIZE,
+                           e->avg_acb_gain, idelay, subframe_size);
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] *= e->fade_scale;
+            e->fade_scale = FFMAX(e->fade_scale - 0.05, 0.0);
+        } else {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+
+        if (e->bitrate != RATE_QUANT && e->avg_acb_gain < 0.4) {
+            f = 0.1 * e->avg_fcb_gain;
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] += f;
+        } else if (e->bitrate == RATE_QUANT) {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        synthesis_filter(e->pitch + ACB_SIZE, ilpc,
+                         e->synthesis, subframe_size, tmp);
+        postfilter(e, tmp, ilpc, samples, pitch_lag,
+                   &postfilter_coeffs[e->bitrate], subframe_size);
+
+        samples += subframe_size;
+    }
+}
+
+static int evrc_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    AVFrame *frame     = data;
+    EVRCContext *e     = avctx->priv_data;
+    int buf_size       = avpkt->size;
+    float ilspf[FILTER_ORDER], ilpc[FILTER_ORDER], idelay[NB_SUBFRAMES];
+    float *samples;
+    int   i, j, ret, error_flag = 0;
+
+    frame->nb_samples = 160;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    samples = (float *)frame->data[0];
+
+    if ((e->bitrate = determine_bitrate(avctx, &buf_size, &buf)) == RATE_ERRS) {
+        warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+        goto erasure;
+    }
+    if (e->bitrate <= SILENCE || e->bitrate == RATE_QUARTER)
+        goto erasure;
+    if (e->bitrate == RATE_QUANT && e->last_valid_bitrate == RATE_FULL
+                                 && !e->prev_error_flag)
+        goto erasure;
+
+    if ((ret = init_get_bits8(&e->gb, buf, buf_size)) < 0)
+        return ret;
+    memset(&e->frame, 0, sizeof(EVRCAFrame));
+
+    unpack_frame(e);
+
+    if (e->bitrate != RATE_QUANT) {
+        uint8_t *p = (uint8_t *) &e->frame;
+        for (i = 0; i < sizeof(EVRCAFrame); i++) {
+            if (p[i])
+                break;
+        }
+        if (i == sizeof(EVRCAFrame))
+            goto erasure;
+    } else if (e->frame.lsp[0] == 0xf &&
+               e->frame.lsp[1] == 0xf &&
+               e->frame.energy_gain == 0xff) {
+        goto erasure;
+    }
+
+    if (decode_lspf(e) < 0)
+        goto erasure;
+
+    if (e->bitrate == RATE_FULL || e->bitrate == RATE_HALF) {
+        /* Pitch delay parameter checking as per TIA/IS-127 5.1.5.1 */
+        if (e->frame.pitch_delay > MAX_DELAY - MIN_DELAY)
+            goto erasure;
+
+        e->pitch_delay = e->frame.pitch_delay + MIN_DELAY;
+
+        /* Delay diff parameter checking as per TIA/IS-127 5.1.5.2 */
+        if (e->frame.delay_diff) {
+            int p = e->pitch_delay - e->frame.delay_diff + 16;
+            if (p < MIN_DELAY || p > MAX_DELAY)
+                goto erasure;
+        }
+
+        /* Delay contour reconstruction as per TIA/IS-127 5.2.2.2 */
+        if (e->frame.delay_diff &&
+            e->bitrate == RATE_FULL && e->prev_error_flag) {
+            float delay;
+
+            memcpy(e->pitch, e->pitch_back, ACB_SIZE * sizeof(float));
+
+            delay = e->prev_pitch_delay;
+            e->prev_pitch_delay = delay - e->frame.delay_diff + 16.0;
+
+            if (fabs(e->pitch_delay - delay) > 15)
+                delay = e->pitch_delay;
+
+            for (i = 0; i < NB_SUBFRAMES; i++) {
+                int subframe_size = subframe_sizes[i];
+
+                interpolate_delay(idelay, delay, e->prev_pitch_delay, i);
+                acb_excitation(e, e->pitch + ACB_SIZE, e->avg_acb_gain, idelay, subframe_size);
+                memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+            }
+        }
+
+        /* Smoothing of the decoded delay as per TIA/IS-127 5.2.2.5 */
+        if (fabs(e->pitch_delay - e->prev_pitch_delay) > 15)
+            e->prev_pitch_delay = e->pitch_delay;
+
+        e->avg_acb_gain = e->avg_fcb_gain = 0.0;
+    } else {
+        idelay[0] = idelay[1] = idelay[2] = MIN_DELAY;
+
+        /* Decode frame energy vectors as per TIA/IS-127 5.7.2 */
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            e->energy_vector[i] = pow(10, evrc_energy_quant[e->frame.energy_gain][i]);
+        e->prev_energy_gain = e->frame.energy_gain;
+    }
+
+    for (i = 0; i < NB_SUBFRAMES; i++) {
+        float tmp[SUBFRAME_SIZE + 6] = { 0 };
+        int subframe_size = subframe_sizes[i];
+        int pitch_lag;
+
+        interpolate_lsp(ilspf, e->lspf, e->prev_lspf, i);
+
+        if (e->bitrate != RATE_QUANT)
+            interpolate_delay(idelay, e->pitch_delay, e->prev_pitch_delay, i);
+
+        pitch_lag = lrintf((idelay[1] + idelay[0]) / 2.0);
+        decode_predictor_coeffs(ilspf, ilpc);
+
+        /* Bandwidth expansion as per TIA/IS-127 5.2.3.3 */
+        if (e->frame.lpc_flag && e->prev_error_flag)
+            bandwidth_expansion(ilpc, ilpc, 0.75);
+
+        if (e->bitrate != RATE_QUANT) {
+            float acb_sum, f;
+
+            f = exp((e->bitrate == RATE_HALF ? 0.5 : 0.25)
+                         * (e->frame.fcb_gain[i] + 1));
+            acb_sum = pitch_gain_vq[e->frame.acb_gain[i]];
+            e->avg_acb_gain += acb_sum / NB_SUBFRAMES;
+            e->avg_fcb_gain += f / NB_SUBFRAMES;
+
+            acb_excitation(e, e->pitch + ACB_SIZE,
+                           acb_sum, idelay, subframe_size);
+            fcb_excitation(e, e->frame.fcb_shape[i], tmp,
+                           acb_sum, pitch_lag, subframe_size);
+
+            /* Total excitation generation as per TIA/IS-127 5.2.3.9 */
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] += f * tmp[j];
+            e->fade_scale = FFMIN(e->fade_scale + 0.2, 1.0);
+        } else {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+
+        synthesis_filter(e->pitch + ACB_SIZE, ilpc,
+                         e->synthesis, subframe_size,
+                         e->postfilter ? tmp : samples);
+        if (e->postfilter)
+            postfilter(e, tmp, ilpc, samples, pitch_lag,
+                       &postfilter_coeffs[e->bitrate], subframe_size);
+
+        samples += subframe_size;
+    }
+
+    if (error_flag) {
+erasure:
+        error_flag = 1;
+        av_log(avctx, AV_LOG_WARNING, "frame erasure\n");
+        frame_erasure(e, samples);
+    }
+
+    memcpy(e->prev_lspf, e->lspf, sizeof(e->prev_lspf));
+    e->prev_error_flag    = error_flag;
+    e->last_valid_bitrate = e->bitrate;
+
+    if (e->bitrate != RATE_QUANT)
+        e->prev_pitch_delay = e->pitch_delay;
+
+    samples = (float *)frame->data[0];
+    for (i = 0; i < 160; i++)
+        samples[i] /= 32768;
+
+    *got_frame_ptr   = 1;
+
+    return avpkt->size;
+}
+
+#define OFFSET(x) offsetof(EVRCContext, x)
+#define AD AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+
+static const AVOption options[] = {
+    { "postfilter", "enable postfilter", OFFSET(postfilter), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, AD },
+    { NULL }
+};
+
+static const AVClass evrcdec_class = {
+    .class_name = "evrc",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_evrc_decoder = {
+    .name           = "evrc",
+    .long_name      = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_EVRC,
+    .init           = evrc_decode_init,
+    .decode         = evrc_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(EVRCContext),
+    .priv_class     = &evrcdec_class,
+};
diff --git a/libavcodec/exif.c b/libavcodec/exif.c
new file mode 100644
index 0000000..2874772
--- /dev/null
+++ b/libavcodec/exif.c
@@ -0,0 +1,152 @@
+/*
+ * EXIF metadata parser
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * EXIF metadata parser
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#include "exif.h"
+
+
+static const char *exif_get_tag_name(uint16_t id)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(tag_list); i++) {
+        if (tag_list[i].id == id)
+            return tag_list[i].name;
+    }
+
+    return NULL;
+}
+
+
+static int exif_add_metadata(void *logctx, int count, int type,
+                             const char *name, const char *sep,
+                             GetByteContext *gb, int le,
+                             AVDictionary **metadata)
+{
+    switch(type) {
+    case 0:
+        av_log(logctx, AV_LOG_WARNING,
+               "Invalid TIFF tag type 0 found for %s with size %d\n",
+               name, count);
+        return 0;
+    case TIFF_DOUBLE   : return ff_tadd_doubles_metadata(count, name, sep, gb, le, metadata);
+    case TIFF_SSHORT   : return ff_tadd_shorts_metadata(count, name, sep, gb, le, 1, metadata);
+    case TIFF_SHORT    : return ff_tadd_shorts_metadata(count, name, sep, gb, le, 0, metadata);
+    case TIFF_SBYTE    : return ff_tadd_bytes_metadata(count, name, sep, gb, le, 1, metadata);
+    case TIFF_BYTE     :
+    case TIFF_UNDEFINED: return ff_tadd_bytes_metadata(count, name, sep, gb, le, 0, metadata);
+    case TIFF_STRING   : return ff_tadd_string_metadata(count, name, gb, le, metadata);
+    case TIFF_SRATIONAL:
+    case TIFF_RATIONAL : return ff_tadd_rational_metadata(count, name, sep, gb, le, metadata);
+    case TIFF_SLONG    :
+    case TIFF_LONG     : return ff_tadd_long_metadata(count, name, sep, gb, le, metadata);
+    default:
+        avpriv_request_sample(logctx, "TIFF tag type (%u)", type);
+        return 0;
+    };
+}
+
+
+static int exif_decode_tag(void *logctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata)
+{
+    int ret, cur_pos;
+    unsigned id, count;
+    enum TiffTypes type;
+
+    if (depth > 2) {
+        return 0;
+    }
+
+    ff_tread_tag(gbytes, le, &id, &type, &count, &cur_pos);
+
+    if (!bytestream2_tell(gbytes)) {
+        bytestream2_seek(gbytes, cur_pos, SEEK_SET);
+        return 0;
+    }
+
+    // read count values and add it metadata
+    // store metadata or proceed with next IFD
+    ret = ff_tis_ifd(id);
+    if (ret) {
+        ret = ff_exif_decode_ifd(logctx, gbytes, le, depth + 1, metadata);
+    } else {
+        const char *name = exif_get_tag_name(id);
+        char *use_name   = (char*) name;
+
+        if (!use_name) {
+            use_name = av_malloc(7);
+            if (!use_name) {
+                return AVERROR(ENOMEM);
+            }
+            snprintf(use_name, 7, "0x%04X", id);
+        }
+
+        ret = exif_add_metadata(logctx, count, type, use_name, NULL,
+                                gbytes, le, metadata);
+
+        if (!name) {
+            av_freep(&use_name);
+        }
+    }
+
+    bytestream2_seek(gbytes, cur_pos, SEEK_SET);
+
+    return ret;
+}
+
+
+int ff_exif_decode_ifd(void *logctx, GetByteContext *gbytes,
+                       int le, int depth, AVDictionary **metadata)
+{
+    int i, ret;
+    int entries;
+
+    entries = ff_tget_short(gbytes, le);
+
+    if (bytestream2_get_bytes_left(gbytes) < entries * 12) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i < entries; i++) {
+        if ((ret = exif_decode_tag(logctx, gbytes, le, depth, metadata)) < 0) {
+            return ret;
+        }
+    }
+
+    // return next IDF offset or 0x000000000 or a value < 0 for failure
+    return ff_tget_long(gbytes, le);
+}
+
+int avpriv_exif_decode_ifd(void *logctx, const uint8_t *buf, int size,
+                           int le, int depth, AVDictionary **metadata)
+{
+    GetByteContext gb;
+
+    bytestream2_init(&gb, buf, size);
+
+    return ff_exif_decode_ifd(logctx, &gb, le, depth, metadata);
+}
diff --git a/libavcodec/exif.h b/libavcodec/exif.h
new file mode 100644
index 0000000..05af756
--- /dev/null
+++ b/libavcodec/exif.h
@@ -0,0 +1,173 @@
+/*
+ * EXIF metadata parser
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * EXIF metadata parser
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#ifndef AVCODEC_EXIF_H
+#define AVCODEC_EXIF_H
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "tiff.h"
+
+#define EXIF_MAX_IFD_RECURSION 2
+#define EXIF_TAG_NAME_LENGTH   32
+
+struct exif_tag {
+    char      name[EXIF_TAG_NAME_LENGTH];
+    uint16_t  id;
+};
+
+static const struct exif_tag tag_list[] = { // JEITA CP-3451 EXIF specification:
+    {"GPSVersionID",               0x00}, // <- Table 12 GPS Attribute Information
+    {"GPSLatitudeRef",             0x01},
+    {"GPSLatitude",                0x02},
+    {"GPSLongitudeRef",            0x03},
+    {"GPSLongitude",               0x04},
+    {"GPSAltitudeRef",             0x05},
+    {"GPSAltitude",                0x06},
+    {"GPSTimeStamp",               0x07},
+    {"GPSSatellites",              0x08},
+    {"GPSStatus",                  0x09},
+    {"GPSMeasureMode",             0x0A},
+    {"GPSDOP",                     0x0B},
+    {"GPSSpeedRef",                0x0C},
+    {"GPSSpeed",                   0x0D},
+    {"GPSTrackRef",                0x0E},
+    {"GPSTrack",                   0x0F},
+    {"GPSImgDirectionRef",         0x10},
+    {"GPSImgDirection",            0x11},
+    {"GPSMapDatum",                0x12},
+    {"GPSDestLatitudeRef",         0x13},
+    {"GPSDestLatitude",            0x14},
+    {"GPSDestLongitudeRef",        0x15},
+    {"GPSDestLongitude",           0x16},
+    {"GPSDestBearingRef",          0x17},
+    {"GPSDestBearing",             0x18},
+    {"GPSDestDistanceRef",         0x19},
+    {"GPSDestDistance",            0x1A},
+    {"GPSProcessingMethod",        0x1B},
+    {"GPSAreaInformation",         0x1C},
+    {"GPSDateStamp",               0x1D},
+    {"GPSDifferential",            0x1E},
+    {"ImageWidth",                 0x100}, // <- Table 3 TIFF Rev. 6.0 Attribute Information Used in Exif
+    {"ImageLength",                0x101},
+    {"BitsPerSample",              0x102},
+    {"Compression",                0x103},
+    {"PhotometricInterpretation",  0x106},
+    {"Orientation",                0x112},
+    {"SamplesPerPixel",            0x115},
+    {"PlanarConfiguration",        0x11C},
+    {"YCbCrSubSampling",           0x212},
+    {"YCbCrPositioning",           0x213},
+    {"XResolution",                0x11A},
+    {"YResolution",                0x11B},
+    {"ResolutionUnit",             0x128},
+    {"StripOffsets",               0x111},
+    {"RowsPerStrip",               0x116},
+    {"StripByteCounts",            0x117},
+    {"JPEGInterchangeFormat",      0x201},
+    {"JPEGInterchangeFormatLength",0x202},
+    {"TransferFunction",           0x12D},
+    {"WhitePoint",                 0x13E},
+    {"PrimaryChromaticities",      0x13F},
+    {"YCbCrCoefficients",          0x211},
+    {"ReferenceBlackWhite",        0x214},
+    {"DateTime",                   0x132},
+    {"ImageDescription",           0x10E},
+    {"Make",                       0x10F},
+    {"Model",                      0x110},
+    {"Software",                   0x131},
+    {"Artist",                     0x13B},
+    {"Copyright",                  0x8298},
+    {"ExifVersion",                0x9000}, // <- Table 4 Exif IFD Attribute Information (1)
+    {"FlashpixVersion",            0xA000},
+    {"ColorSpace",                 0xA001},
+    {"ComponentsConfiguration",    0x9101},
+    {"CompressedBitsPerPixel",     0x9102},
+    {"PixelXDimension",            0xA002},
+    {"PixelYDimension",            0xA003},
+    {"MakerNote",                  0x927C},
+    {"UserComment",                0x9286},
+    {"RelatedSoundFile",           0xA004},
+    {"DateTimeOriginal",           0x9003},
+    {"DateTimeDigitized",          0x9004},
+    {"SubSecTime",                 0x9290},
+    {"SubSecTimeOriginal",         0x9291},
+    {"SubSecTimeDigitized",        0x9292},
+    {"ImageUniqueID",              0xA420},
+    {"ExposureTime",               0x829A}, // <- Table 5 Exif IFD Attribute Information (2)
+    {"FNumber",                    0x829D},
+    {"ExposureProgram",            0x8822},
+    {"SpectralSensitivity",        0x8824},
+    {"ISOSpeedRatings",            0x8827},
+    {"OECF",                       0x8828},
+    {"ShutterSpeedValue",          0x9201},
+    {"ApertureValue",              0x9202},
+    {"BrightnessValue",            0x9203},
+    {"ExposureBiasValue",          0x9204},
+    {"MaxApertureValue",           0x9205},
+    {"SubjectDistance",            0x9206},
+    {"MeteringMode",               0x9207},
+    {"LightSource",                0x9208},
+    {"Flash",                      0x9209},
+    {"FocalLength",                0x920A},
+    {"SubjectArea",                0x9214},
+    {"FlashEnergy",                0xA20B},
+    {"SpatialFrequencyResponse",   0xA20C},
+    {"FocalPlaneXResolution",      0xA20E},
+    {"FocalPlaneYResolution",      0xA20F},
+    {"FocalPlaneResolutionUnit",   0xA210},
+    {"SubjectLocation",            0xA214},
+    {"ExposureIndex",              0xA215},
+    {"SensingMethod",              0xA217},
+    {"FileSource",                 0xA300},
+    {"SceneType",                  0xA301},
+    {"CFAPattern",                 0xA302},
+    {"CustomRendered",             0xA401},
+    {"ExposureMode",               0xA402},
+    {"WhiteBalance",               0xA403},
+    {"DigitalZoomRatio",           0xA404},
+    {"FocalLengthIn35mmFilm",      0xA405},
+    {"SceneCaptureType",           0xA406},
+    {"GainControl",                0xA407},
+    {"Contrast",                   0xA408},
+    {"Saturation",                 0xA409},
+    {"Sharpness",                  0xA40A},
+    {"DeviceSettingDescription",   0xA40B},
+    {"SubjectDistanceRange",       0xA40C}
+//    {"InteroperabilityIndex",      0x1}, // <- Table 13 Interoperability IFD Attribute Information
+//    {"",                           0x0}
+};
+
+/** Recursively decodes all IFD's and
+ *  adds included TAGS into the metadata dictionary. */
+int avpriv_exif_decode_ifd(void *logctx, const uint8_t *buf, int size,
+                           int le, int depth, AVDictionary **metadata);
+
+int ff_exif_decode_ifd(void *logctx, GetByteContext *gbytes, int le,
+                       int depth, AVDictionary **metadata);
+
+#endif /* AVCODEC_EXIF_H */
diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 28cee84..0f8b0fd 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -1,21 +1,24 @@
 /*
  * OpenEXR (.exr) image decoder
+ * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
  * Copyright (c) 2009 Jimmy Christensen
  *
- * This file is part of Libav
+ * B44/B44A, Tile, UINT32 added by Jokyo Images support by CNC - French National Center for Cinema
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,13 +37,22 @@
 #include <float.h>
 #include <zlib.h>
 
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/opt.h"
+#include "libavutil/color_utils.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+
+#if HAVE_BIGENDIAN
+#include "bswapdsp.h"
+#endif
+
+#include "exrdsp.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
 #include "thread.h"
@@ -54,6 +66,8 @@ enum ExrCompr {
     EXR_PXR24,
     EXR_B44,
     EXR_B44A,
+    EXR_DWA,
+    EXR_DWB,
     EXR_UNKN,
 };
 
@@ -64,11 +78,31 @@ enum ExrPixelType {
     EXR_UNKNOWN,
 };
 
+enum ExrTileLevelMode {
+    EXR_TILE_LEVEL_ONE,
+    EXR_TILE_LEVEL_MIPMAP,
+    EXR_TILE_LEVEL_RIPMAP,
+    EXR_TILE_LEVEL_UNKNOWN,
+};
+
+enum ExrTileLevelRound {
+    EXR_TILE_ROUND_UP,
+    EXR_TILE_ROUND_DOWN,
+    EXR_TILE_ROUND_UNKNOWN,
+};
+
 typedef struct EXRChannel {
     int xsub, ysub;
     enum ExrPixelType pixel_type;
 } EXRChannel;
 
+typedef struct EXRTileAttribute {
+    int32_t xSize;
+    int32_t ySize;
+    enum ExrTileLevelMode level_mode;
+    enum ExrTileLevelRound level_round;
+} EXRTileAttribute;
+
 typedef struct EXRThreadData {
     uint8_t *uncompressed_data;
     int uncompressed_size;
@@ -78,12 +112,21 @@ typedef struct EXRThreadData {
 
     uint8_t *bitmap;
     uint16_t *lut;
+
+    int ysize, xsize;
+
+    int channel_line_size;
 } EXRThreadData;
 
 typedef struct EXRContext {
     AVClass *class;
     AVFrame *picture;
     AVCodecContext *avctx;
+    ExrDSPContext dsp;
+
+#if HAVE_BIGENDIAN
+    BswapDSPContext bbdsp;
+#endif
 
     enum ExrCompr compression;
     enum ExrPixelType pixel_type;
@@ -94,22 +137,27 @@ typedef struct EXRContext {
     uint32_t xmax, xmin;
     uint32_t ymax, ymin;
     uint32_t xdelta, ydelta;
-    int ysize;
 
-    uint64_t scan_line_size;
     int scan_lines_per_block;
 
+    EXRTileAttribute tile_attr; /* header data attribute of tile */
+    int is_tile; /* 0 if scanline, 1 if tile */
+
+    int is_luma;/* 1 if there is an Y plane */
+
     GetByteContext gb;
     const uint8_t *buf;
     int buf_size;
 
     EXRChannel *channels;
     int nb_channels;
+    int current_channel_offset;
 
     EXRThreadData *thread_data;
 
     const char *layer;
 
+    enum AVColorTransferCharacteristic apply_trc_type;
     float gamma;
     uint16_t gamma_table[65536];
 } EXRContext;
@@ -184,9 +232,9 @@ static union av_intfloat32 exr_half2float(uint16_t hf)
  *
  * @return normalized 16-bit unsigned int
  */
-static inline uint16_t exr_flt2uint(uint32_t v)
+static inline uint16_t exr_flt2uint(int32_t v)
 {
-    unsigned int exp = v >> 23;
+    int32_t exp = v >> 23;
     // "HACK": negative values result in exp<  0, so clipping them to 0
     // is also handled by this condition, avoids explicit check for sign bit.
     if (exp <= 127 + 7 - 24) // we would shift out all bits anyway
@@ -217,39 +265,7 @@ static inline uint16_t exr_halflt2uint(uint16_t v)
     return (v + (1 << 16)) >> (exp + 1);
 }
 
-static void predictor(uint8_t *src, int size)
-{
-    uint8_t *t    = src + 1;
-    uint8_t *stop = src + size;
-
-    while (t < stop) {
-        int d = (int) t[-1] + (int) t[0] - 128;
-        t[0] = d;
-        ++t;
-    }
-}
-
-static void reorder_pixels(uint8_t *src, uint8_t *dst, int size)
-{
-    const int8_t *t1 = src;
-    const int8_t *t2 = src + (size + 1) / 2;
-    int8_t *s        = dst;
-    int8_t *stop     = s + size;
-
-    while (1) {
-        if (s < stop)
-            *(s++) = *(t1++);
-        else
-            break;
-
-        if (s < stop)
-            *(s++) = *(t2++);
-        else
-            break;
-    }
-}
-
-static int zip_uncompress(const uint8_t *src, int compressed_size,
+static int zip_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
                           int uncompressed_size, EXRThreadData *td)
 {
     unsigned long dest_len = uncompressed_size;
@@ -258,13 +274,15 @@ static int zip_uncompress(const uint8_t *src, int compressed_size,
         dest_len != uncompressed_size)
         return AVERROR_INVALIDDATA;
 
-    predictor(td->tmp, uncompressed_size);
-    reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
+    av_assert1(uncompressed_size % 2 == 0);
+
+    s->dsp.predictor(td->tmp, uncompressed_size);
+    s->dsp.reorder_pixels(td->uncompressed_data, td->tmp, uncompressed_size);
 
     return 0;
 }
 
-static int rle_uncompress(const uint8_t *src, int compressed_size,
+static int rle_uncompress(EXRContext *ctx, const uint8_t *src, int compressed_size,
                           int uncompressed_size, EXRThreadData *td)
 {
     uint8_t *d      = td->tmp;
@@ -303,8 +321,10 @@ static int rle_uncompress(const uint8_t *src, int compressed_size,
     if (dend != d)
         return AVERROR_INVALIDDATA;
 
-    predictor(td->tmp, uncompressed_size);
-    reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
+    av_assert1(uncompressed_size % 2 == 0);
+
+    ctx->dsp.predictor(td->tmp, uncompressed_size);
+    ctx->dsp.reorder_pixels(td->uncompressed_data, td->tmp, uncompressed_size);
 
     return 0;
 }
@@ -379,16 +399,16 @@ static void huf_canonical_code_table(uint64_t *hcode)
 static int huf_unpack_enc_table(GetByteContext *gb,
                                 int32_t im, int32_t iM, uint64_t *hcode)
 {
-    BitstreamContext bc;
-    int ret = bitstream_init8(&bc, gb->buffer, bytestream2_get_bytes_left(gb));
+    GetBitContext gbit;
+    int ret = init_get_bits8(&gbit, gb->buffer, bytestream2_get_bytes_left(gb));
     if (ret < 0)
         return ret;
 
     for (; im <= iM; im++) {
-        uint64_t l = hcode[im] = bitstream_read(&bc, 6);
+        uint64_t l = hcode[im] = get_bits(&gbit, 6);
 
         if (l == LONG_ZEROCODE_RUN) {
-            int zerun = bitstream_read(&bc, 8) + SHORTEST_LONG_RUN;
+            int zerun = get_bits(&gbit, 8) + SHORTEST_LONG_RUN;
 
             if (im + zerun > iM + 1)
                 return AVERROR_INVALIDDATA;
@@ -410,7 +430,7 @@ static int huf_unpack_enc_table(GetByteContext *gb,
         }
     }
 
-    bytestream2_skip(gb, (bitstream_tell(&bc) + 7) / 8);
+    bytestream2_skip(gb, (get_bits_count(&gbit) + 7) / 8);
     huf_canonical_code_table(hcode);
 
     return 0;
@@ -490,7 +510,8 @@ static int huf_decode(const uint64_t *hcode, const HufDec *hdecod,
     uint16_t *outb    = out;
     uint16_t *oe      = out + no;
     const uint8_t *ie = gb->buffer + (nbits + 7) / 8; // input byte size
-    uint8_t cs, s;
+    uint8_t cs;
+    uint16_t s;
     int i, lc = 0;
 
     while (gb->buffer < ie) {
@@ -537,7 +558,7 @@ static int huf_decode(const uint64_t *hcode, const HufDec *hdecod,
     while (lc > 0) {
         const HufDec pl = hdecod[(c << (HUF_DECBITS - lc)) & HUF_DECMASK];
 
-        if (pl.len) {
+        if (pl.len && lc >= pl.len) {
             lc -= pl.len;
             get_code(pl.lit, rlc, c, lc, gb, out, oe, outb);
         } else {
@@ -714,16 +735,20 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
     uint16_t maxval, min_non_zero, max_non_zero;
     uint16_t *ptr;
     uint16_t *tmp = (uint16_t *)td->tmp;
-    uint8_t *out;
+    uint16_t *out;
+    uint16_t *in;
     int ret, i, j;
+    int pixel_half_size;/* 1 for half, 2 for float and uint32 */
+    EXRChannel *channel;
+    int tmp_offset;
 
     if (!td->bitmap)
         td->bitmap = av_malloc(BITMAP_SIZE);
     if (!td->lut)
         td->lut = av_malloc(1 << 17);
     if (!td->bitmap || !td->lut) {
-        av_free(td->bitmap);
-        av_free(td->lut);
+        av_freep(&td->bitmap);
+        av_freep(&td->lut);
         return AVERROR(ENOMEM);
     }
 
@@ -738,7 +763,7 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
     if (min_non_zero <= max_non_zero)
         bytestream2_get_buffer(&gb, td->bitmap + min_non_zero,
                                max_non_zero - min_non_zero + 1);
-    memset(td->bitmap + max_non_zero, 0, BITMAP_SIZE - max_non_zero);
+    memset(td->bitmap + max_non_zero + 1, 0, BITMAP_SIZE - max_non_zero - 1);
 
     maxval = reverse_lut(td->bitmap, td->lut);
 
@@ -748,24 +773,42 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
 
     ptr = tmp;
     for (i = 0; i < s->nb_channels; i++) {
-        EXRChannel *channel = &s->channels[i];
-        int size = channel->pixel_type;
+        channel = &s->channels[i];
 
-        for (j = 0; j < size; j++)
-            wav_decode(ptr + j, s->xdelta, size, s->ysize,
-                       s->xdelta * size, maxval);
-        ptr += s->xdelta * s->ysize * size;
+        if (channel->pixel_type == EXR_HALF)
+            pixel_half_size = 1;
+        else
+            pixel_half_size = 2;
+
+        for (j = 0; j < pixel_half_size; j++)
+            wav_decode(ptr + j, td->xsize, pixel_half_size, td->ysize,
+                       td->xsize * pixel_half_size, maxval);
+        ptr += td->xsize * td->ysize * pixel_half_size;
     }
 
     apply_lut(td->lut, tmp, dsize / sizeof(uint16_t));
 
-    out = td->uncompressed_data;
-    for (i = 0; i < s->ysize; i++)
+    out = (uint16_t *)td->uncompressed_data;
+    for (i = 0; i < td->ysize; i++) {
+        tmp_offset = 0;
         for (j = 0; j < s->nb_channels; j++) {
-            uint16_t *in = tmp + j * s->xdelta * s->ysize + i * s->xdelta;
-            memcpy(out, in, s->xdelta * 2);
-            out += s->xdelta * 2;
+            channel = &s->channels[j];
+            if (channel->pixel_type == EXR_HALF)
+                pixel_half_size = 1;
+            else
+                pixel_half_size = 2;
+
+            in = tmp + tmp_offset * td->xsize * td->ysize + i * td->xsize * pixel_half_size;
+            tmp_offset += pixel_half_size;
+
+#if HAVE_BIGENDIAN
+            s->bbdsp.bswap16_buf(out, in, td->xsize * pixel_half_size);
+#else
+            memcpy(out, in, td->xsize * 2 * pixel_half_size);
+#endif
+            out += td->xsize * pixel_half_size;
         }
+    }
 
     return 0;
 }
@@ -774,17 +817,31 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
                             int compressed_size, int uncompressed_size,
                             EXRThreadData *td)
 {
-    unsigned long dest_len = uncompressed_size;
+    unsigned long dest_len, expected_len = 0;
     const uint8_t *in = td->tmp;
     uint8_t *out;
     int c, i, j;
 
-    if (uncompress(td->tmp, &dest_len, src, compressed_size) != Z_OK ||
-        dest_len != uncompressed_size)
+    for (i = 0; i < s->nb_channels; i++) {
+        if (s->channels[i].pixel_type == EXR_FLOAT) {
+            expected_len += (td->xsize * td->ysize * 3);/* PRX 24 store float in 24 bit instead of 32 */
+        } else if (s->channels[i].pixel_type == EXR_HALF) {
+            expected_len += (td->xsize * td->ysize * 2);
+        } else {//UINT 32
+            expected_len += (td->xsize * td->ysize * 4);
+        }
+    }
+
+    dest_len = expected_len;
+
+    if (uncompress(td->tmp, &dest_len, src, compressed_size) != Z_OK) {
+        return AVERROR_INVALIDDATA;
+    } else if (dest_len != expected_len) {
         return AVERROR_INVALIDDATA;
+    }
 
     out = td->uncompressed_data;
-    for (i = 0; i < s->ysize; i++)
+    for (i = 0; i < td->ysize; i++)
         for (c = 0; c < s->nb_channels; c++) {
             EXRChannel *channel = &s->channels[c];
             const uint8_t *ptr[4];
@@ -793,12 +850,12 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
             switch (channel->pixel_type) {
             case EXR_FLOAT:
                 ptr[0] = in;
-                ptr[1] = ptr[0] + s->xdelta;
-                ptr[2] = ptr[1] + s->xdelta;
-                in     = ptr[2] + s->xdelta;
+                ptr[1] = ptr[0] + td->xsize;
+                ptr[2] = ptr[1] + td->xsize;
+                in     = ptr[2] + td->xsize;
 
-                for (j = 0; j < s->xdelta; ++j) {
-                    uint32_t diff = (*(ptr[0]++) << 24) |
+                for (j = 0; j < td->xsize; ++j) {
+                    uint32_t diff = ((unsigned)*(ptr[0]++) << 24) |
                                     (*(ptr[1]++) << 16) |
                                     (*(ptr[2]++) << 8);
                     pixel += diff;
@@ -807,15 +864,31 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
                 break;
             case EXR_HALF:
                 ptr[0] = in;
-                ptr[1] = ptr[0] + s->xdelta;
-                in     = ptr[1] + s->xdelta;
-                for (j = 0; j < s->xdelta; j++) {
+                ptr[1] = ptr[0] + td->xsize;
+                in     = ptr[1] + td->xsize;
+                for (j = 0; j < td->xsize; j++) {
                     uint32_t diff = (*(ptr[0]++) << 8) | *(ptr[1]++);
 
                     pixel += diff;
                     bytestream_put_le16(&out, pixel);
                 }
                 break;
+            case EXR_UINT:
+                ptr[0] = in;
+                ptr[1] = ptr[0] + s->xdelta;
+                ptr[2] = ptr[1] + s->xdelta;
+                ptr[3] = ptr[2] + s->xdelta;
+                in     = ptr[3] + s->xdelta;
+
+                for (j = 0; j < s->xdelta; ++j) {
+                    uint32_t diff = (*(ptr[0]++) << 24) |
+                    (*(ptr[1]++) << 16) |
+                    (*(ptr[2]++) << 8 ) |
+                    (*(ptr[3]++));
+                    pixel += diff;
+                    bytestream_put_le32(&out, pixel);
+                }
+                break;
             default:
                 return AVERROR_INVALIDDATA;
             }
@@ -824,6 +897,134 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
     return 0;
 }
 
+static void unpack_14(const uint8_t b[14], uint16_t s[16])
+{
+    unsigned short shift = (b[ 2] >> 2) & 15;
+    unsigned short bias = (0x20 << shift);
+    int i;
+
+    s[ 0] = (b[0] << 8) | b[1];
+
+    s[ 4] = s[ 0] + ((((b[ 2] << 4) | (b[ 3] >> 4)) & 0x3f) << shift) - bias;
+    s[ 8] = s[ 4] + ((((b[ 3] << 2) | (b[ 4] >> 6)) & 0x3f) << shift) - bias;
+    s[12] = s[ 8] +   ((b[ 4]                       & 0x3f) << shift) - bias;
+
+    s[ 1] = s[ 0] +   ((b[ 5] >> 2)                         << shift) - bias;
+    s[ 5] = s[ 4] + ((((b[ 5] << 4) | (b[ 6] >> 4)) & 0x3f) << shift) - bias;
+    s[ 9] = s[ 8] + ((((b[ 6] << 2) | (b[ 7] >> 6)) & 0x3f) << shift) - bias;
+    s[13] = s[12] +   ((b[ 7]                       & 0x3f) << shift) - bias;
+
+    s[ 2] = s[ 1] +   ((b[ 8] >> 2)                         << shift) - bias;
+    s[ 6] = s[ 5] + ((((b[ 8] << 4) | (b[ 9] >> 4)) & 0x3f) << shift) - bias;
+    s[10] = s[ 9] + ((((b[ 9] << 2) | (b[10] >> 6)) & 0x3f) << shift) - bias;
+    s[14] = s[13] +   ((b[10]                       & 0x3f) << shift) - bias;
+
+    s[ 3] = s[ 2] +   ((b[11] >> 2)                         << shift) - bias;
+    s[ 7] = s[ 6] + ((((b[11] << 4) | (b[12] >> 4)) & 0x3f) << shift) - bias;
+    s[11] = s[10] + ((((b[12] << 2) | (b[13] >> 6)) & 0x3f) << shift) - bias;
+    s[15] = s[14] +   ((b[13]                       & 0x3f) << shift) - bias;
+
+    for (i = 0; i < 16; ++i) {
+        if (s[i] & 0x8000)
+            s[i] &= 0x7fff;
+        else
+            s[i] = ~s[i];
+    }
+}
+
+static void unpack_3(const uint8_t b[3], uint16_t s[16])
+{
+    int i;
+
+    s[0] = (b[0] << 8) | b[1];
+
+    if (s[0] & 0x8000)
+        s[0] &= 0x7fff;
+    else
+        s[0] = ~s[0];
+
+    for (i = 1; i < 16; i++)
+        s[i] = s[0];
+}
+
+
+static int b44_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
+                          int uncompressed_size, EXRThreadData *td) {
+    const int8_t *sr = src;
+    int stay_to_uncompress = compressed_size;
+    int nb_b44_block_w, nb_b44_block_h;
+    int index_tl_x, index_tl_y, index_out, index_tmp;
+    uint16_t tmp_buffer[16]; /* B44 use 4x4 half float pixel */
+    int c, iY, iX, y, x;
+    int target_channel_offset = 0;
+
+    /* calc B44 block count */
+    nb_b44_block_w = td->xsize / 4;
+    if ((td->xsize % 4) != 0)
+        nb_b44_block_w++;
+
+    nb_b44_block_h = td->ysize / 4;
+    if ((td->ysize % 4) != 0)
+        nb_b44_block_h++;
+
+    for (c = 0; c < s->nb_channels; c++) {
+        if (s->channels[c].pixel_type == EXR_HALF) {/* B44 only compress half float data */
+            for (iY = 0; iY < nb_b44_block_h; iY++) {
+                for (iX = 0; iX < nb_b44_block_w; iX++) {/* For each B44 block */
+                    if (stay_to_uncompress < 3) {
+                        av_log(s, AV_LOG_ERROR, "Not enough data for B44A block: %d", stay_to_uncompress);
+                        return AVERROR_INVALIDDATA;
+                    }
+
+                    if (src[compressed_size - stay_to_uncompress + 2] == 0xfc) { /* B44A block */
+                        unpack_3(sr, tmp_buffer);
+                        sr += 3;
+                        stay_to_uncompress -= 3;
+                    }  else {/* B44 Block */
+                        if (stay_to_uncompress < 14) {
+                            av_log(s, AV_LOG_ERROR, "Not enough data for B44 block: %d", stay_to_uncompress);
+                            return AVERROR_INVALIDDATA;
+                        }
+                        unpack_14(sr, tmp_buffer);
+                        sr += 14;
+                        stay_to_uncompress -= 14;
+                    }
+
+                    /* copy data to uncompress buffer (B44 block can exceed target resolution)*/
+                    index_tl_x = iX * 4;
+                    index_tl_y = iY * 4;
+
+                    for (y = index_tl_y; y < FFMIN(index_tl_y + 4, td->ysize); y++) {
+                        for (x = index_tl_x; x < FFMIN(index_tl_x + 4, td->xsize); x++) {
+                            index_out = target_channel_offset * td->xsize + y * td->channel_line_size + 2 * x;
+                            index_tmp = (y-index_tl_y) * 4 + (x-index_tl_x);
+                            td->uncompressed_data[index_out] = tmp_buffer[index_tmp] & 0xff;
+                            td->uncompressed_data[index_out + 1] = tmp_buffer[index_tmp] >> 8;
+                        }
+                    }
+                }
+            }
+            target_channel_offset += 2;
+        } else {/* Float or UINT 32 channel */
+            if (stay_to_uncompress < td->ysize * td->xsize * 4) {
+                av_log(s, AV_LOG_ERROR, "Not enough data for uncompress channel: %d", stay_to_uncompress);
+                return AVERROR_INVALIDDATA;
+            }
+
+            for (y = 0; y < td->ysize; y++) {
+                index_out = target_channel_offset * td->xsize + y * td->channel_line_size;
+                memcpy(&td->uncompressed_data[index_out], sr, td->xsize * 4);
+                sr += td->xsize * 4;
+            }
+            target_channel_offset += 4;
+
+            stay_to_uncompress -= td->ysize * td->xsize * 4;
+        }
+    }
+
+    return 0;
+}
+
 static int decode_block(AVCodecContext *avctx, void *tdata,
                         int jobnr, int threadnr)
 {
@@ -833,52 +1034,112 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
     const uint8_t *channel_buffer[4] = { 0 };
     const uint8_t *buf = s->buf;
     uint64_t line_offset, uncompressed_size;
-    uint32_t xdelta = s->xdelta;
     uint16_t *ptr_x;
     uint8_t *ptr;
-    uint32_t data_size, line;
+    uint32_t data_size;
+    uint64_t line, col = 0;
+    uint64_t tile_x, tile_y, tile_level_x, tile_level_y;
     const uint8_t *src;
-    int axmax = (avctx->width - (s->xmax + 1)) * 2 * s->desc->nb_components;
-    int bxmin = s->xmin * 2 * s->desc->nb_components;
+    int axmax = (avctx->width - (s->xmax + 1)) * 2 * s->desc->nb_components; /* nb pixel to add at the right of the datawindow */
+    int bxmin = s->xmin * 2 * s->desc->nb_components; /* nb pixel to add at the left of the datawindow */
     int i, x, buf_size = s->buf_size;
+    int c, rgb_channel_count;
     float one_gamma = 1.0f / s->gamma;
+    avpriv_trc_function trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
     int ret;
 
     line_offset = AV_RL64(s->gb.buffer + jobnr * 8);
-    // Check if the buffer has the required bytes needed from the offset
-    if (line_offset > buf_size - 8)
-        return AVERROR_INVALIDDATA;
 
-    src  = buf + line_offset + 8;
-    line = AV_RL32(src - 8);
-    if (line < s->ymin || line > s->ymax)
-        return AVERROR_INVALIDDATA;
+    if (s->is_tile) {
+        if (buf_size < 20 || line_offset > buf_size - 20)
+            return AVERROR_INVALIDDATA;
 
-    data_size = AV_RL32(src - 4);
-    if (data_size <= 0 || data_size > buf_size)
-        return AVERROR_INVALIDDATA;
+        src  = buf + line_offset + 20;
 
-    s->ysize          = FFMIN(s->scan_lines_per_block, s->ymax - line + 1);
-    uncompressed_size = s->scan_line_size * s->ysize;
-    if ((s->compression == EXR_RAW && (data_size != uncompressed_size ||
-                                 line_offset > buf_size - uncompressed_size)) ||
-        (s->compression != EXR_RAW && (data_size > uncompressed_size ||
-                                 line_offset > buf_size - data_size))) {
-        return AVERROR_INVALIDDATA;
+        tile_x = AV_RL32(src - 20);
+        tile_y = AV_RL32(src - 16);
+        tile_level_x = AV_RL32(src - 12);
+        tile_level_y = AV_RL32(src - 8);
+
+        data_size = AV_RL32(src - 4);
+        if (data_size <= 0 || data_size > buf_size - line_offset - 20)
+            return AVERROR_INVALIDDATA;
+
+        if (tile_level_x || tile_level_y) { /* tile level, is not the full res level */
+            avpriv_report_missing_feature(s->avctx, "Subres tile before full res tile");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        if (s->xmin || s->ymin) {
+            avpriv_report_missing_feature(s->avctx, "Tiles with xmin/ymin");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        line = s->tile_attr.ySize * tile_y;
+        col = s->tile_attr.xSize * tile_x;
+
+        if (line < s->ymin || line > s->ymax ||
+            col  < s->xmin || col  > s->xmax)
+            return AVERROR_INVALIDDATA;
+
+        td->ysize = FFMIN(s->tile_attr.ySize, s->ydelta - tile_y * s->tile_attr.ySize);
+        td->xsize = FFMIN(s->tile_attr.xSize, s->xdelta - tile_x * s->tile_attr.xSize);
+
+        if (col) { /* not the first tile of the line */
+            bxmin = 0; /* doesn't add pixel at the left of the datawindow */
+        }
+
+        if ((col + td->xsize) != s->xdelta)/* not the last tile of the line */
+            axmax = 0; /* doesn't add pixel at the right of the datawindow */
+
+        td->channel_line_size = td->xsize * s->current_channel_offset;/* uncompress size of one line */
+        uncompressed_size = td->channel_line_size * (uint64_t)td->ysize;/* uncompress size of the block */
+    } else {
+        if (buf_size < 8 || line_offset > buf_size - 8)
+            return AVERROR_INVALIDDATA;
+
+        src  = buf + line_offset + 8;
+        line = AV_RL32(src - 8);
+
+        if (line < s->ymin || line > s->ymax)
+            return AVERROR_INVALIDDATA;
+
+        data_size = AV_RL32(src - 4);
+        if (data_size <= 0 || data_size > buf_size - line_offset - 8)
+            return AVERROR_INVALIDDATA;
+
+        td->ysize          = FFMIN(s->scan_lines_per_block, s->ymax - line + 1); /* s->ydelta - line ?? */
+        td->xsize          = s->xdelta;
+
+        td->channel_line_size = td->xsize * s->current_channel_offset;/* uncompress size of one line */
+        uncompressed_size = td->channel_line_size * (uint64_t)td->ysize;/* uncompress size of the block */
+
+        if ((s->compression == EXR_RAW && (data_size != uncompressed_size ||
+                                           line_offset > buf_size - uncompressed_size)) ||
+            (s->compression != EXR_RAW && (data_size > uncompressed_size ||
+                                           line_offset > buf_size - data_size))) {
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (data_size < uncompressed_size || s->is_tile) { /* td->tmp is use for tile reorganization */
+        av_fast_padded_malloc(&td->tmp, &td->tmp_size, uncompressed_size);
+        if (!td->tmp)
+            return AVERROR(ENOMEM);
     }
 
     if (data_size < uncompressed_size) {
         av_fast_padded_malloc(&td->uncompressed_data,
-                              &td->uncompressed_size, uncompressed_size);
-        av_fast_padded_malloc(&td->tmp, &td->tmp_size, uncompressed_size);
-        if (!td->uncompressed_data || !td->tmp)
+                              &td->uncompressed_size, uncompressed_size + 64);/* Force 64 padding for AVX2 reorder_pixels dst */
+
+        if (!td->uncompressed_data)
             return AVERROR(ENOMEM);
 
         ret = AVERROR_INVALIDDATA;
         switch (s->compression) {
         case EXR_ZIP1:
         case EXR_ZIP16:
-            ret = zip_uncompress(src, data_size, uncompressed_size, td);
+            ret = zip_uncompress(s, src, data_size, uncompressed_size, td);
             break;
         case EXR_PIZ:
             ret = piz_uncompress(s, src, data_size, uncompressed_size, td);
@@ -887,7 +1148,12 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
             ret = pxr24_uncompress(s, src, data_size, uncompressed_size, td);
             break;
         case EXR_RLE:
-            ret = rle_uncompress(src, data_size, uncompressed_size, td);
+            ret = rle_uncompress(s, src, data_size, uncompressed_size, td);
+            break;
+        case EXR_B44:
+        case EXR_B44A:
+            ret = b44_uncompress(s, src, data_size, uncompressed_size, td);
+            break;
         }
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "decode_block() failed.\n");
@@ -896,21 +1162,30 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         src = td->uncompressed_data;
     }
 
-    channel_buffer[0] = src + xdelta * s->channel_offsets[0];
-    channel_buffer[1] = src + xdelta * s->channel_offsets[1];
-    channel_buffer[2] = src + xdelta * s->channel_offsets[2];
+    if (!s->is_luma) {
+        channel_buffer[0] = src + td->xsize * s->channel_offsets[0];
+        channel_buffer[1] = src + td->xsize * s->channel_offsets[1];
+        channel_buffer[2] = src + td->xsize * s->channel_offsets[2];
+        rgb_channel_count = 3;
+    } else { /* put y data in the first channel_buffer */
+        channel_buffer[0] = src + td->xsize * s->channel_offsets[1];
+        rgb_channel_count = 1;
+    }
     if (s->channel_offsets[3] >= 0)
-        channel_buffer[3] = src + xdelta * s->channel_offsets[3];
+        channel_buffer[3] = src + td->xsize * s->channel_offsets[3];
+
+    ptr = p->data[0] + line * p->linesize[0] + (col * s->desc->nb_components * 2);
 
-    ptr = p->data[0] + line * p->linesize[0];
     for (i = 0;
-         i < s->scan_lines_per_block && line + i <= s->ymax;
-         i++, ptr += p->linesize[0]) {
-        const uint8_t *r, *g, *b, *a;
+         i < td->ysize; i++, ptr += p->linesize[0]) {
+
+        const uint8_t * a;
+        const uint8_t *rgb[3];
+
+        for (c = 0; c < rgb_channel_count; c++){
+            rgb[c] = channel_buffer[c];
+        }
 
-        r = channel_buffer[0];
-        g = channel_buffer[1];
-        b = channel_buffer[2];
         if (channel_buffer[3])
             a = channel_buffer[3];
 
@@ -919,46 +1194,67 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         // Zero out the start if xmin is not 0
         memset(ptr_x, 0, bxmin);
         ptr_x += s->xmin * s->desc->nb_components;
+
         if (s->pixel_type == EXR_FLOAT) {
             // 32-bit
-            for (x = 0; x < xdelta; x++) {
-                union av_intfloat32 t;
-                t.i = bytestream_get_le32(&r);
-                if (t.f > 0.0f)  /* avoid negative values */
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-
-                t.i = bytestream_get_le32(&g);
-                if (t.f > 0.0f)
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
+            if (trc_func) {
+                for (x = 0; x < td->xsize; x++) {
+                    union av_intfloat32 t;
+
+                    for (c = 0; c < rgb_channel_count; c++) {
+                        t.i = bytestream_get_le32(&rgb[c]);
+                        t.f = trc_func(t.f);
+                        *ptr_x++ = exr_flt2uint(t.i);
+                    }
+                    if (channel_buffer[3])
+                        *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                }
+            } else {
+                for (x = 0; x < td->xsize; x++) {
+                    union av_intfloat32 t;
+                    int c;
+
+                    for (c = 0; c < rgb_channel_count; c++) {
+                        t.i = bytestream_get_le32(&rgb[c]);
+                        if (t.f > 0.0f)  /* avoid negative values */
+                            t.f = powf(t.f, one_gamma);
+                        *ptr_x++ = exr_flt2uint(t.i);
+                    }
 
-                t.i = bytestream_get_le32(&b);
-                if (t.f > 0.0f)
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-                if (channel_buffer[3])
-                    *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                    if (channel_buffer[3])
+                        *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                }
             }
-        } else {
+        } else if (s->pixel_type == EXR_HALF) {
             // 16-bit
-            for (x = 0; x < xdelta; x++) {
-                *ptr_x++ = s->gamma_table[bytestream_get_le16(&r)];
-                *ptr_x++ = s->gamma_table[bytestream_get_le16(&g)];
-                *ptr_x++ = s->gamma_table[bytestream_get_le16(&b)];
+            for (x = 0; x < td->xsize; x++) {
+                int c;
+                for (c = 0; c < rgb_channel_count; c++) {
+                    *ptr_x++ = s->gamma_table[bytestream_get_le16(&rgb[c])];
+                }
+
                 if (channel_buffer[3])
                     *ptr_x++ = exr_halflt2uint(bytestream_get_le16(&a));
             }
+        } else if (s->pixel_type == EXR_UINT) {
+            for (x = 0; x < td->xsize; x++) {
+                for (c = 0; c < rgb_channel_count; c++) {
+                    *ptr_x++ = bytestream_get_le32(&rgb[c]) >> 16;
+                }
+
+                if (channel_buffer[3])
+                    *ptr_x++ = bytestream_get_le32(&a) >> 16;
+            }
         }
 
         // Zero out the end if xmax+1 is not w
         memset(ptr_x, 0, axmax);
 
-        channel_buffer[0] += s->scan_line_size;
-        channel_buffer[1] += s->scan_line_size;
-        channel_buffer[2] += s->scan_line_size;
+        channel_buffer[0] += td->channel_line_size;
+        channel_buffer[1] += td->channel_line_size;
+        channel_buffer[2] += td->channel_line_size;
         if (channel_buffer[3])
-            channel_buffer[3] += s->scan_line_size;
+            channel_buffer[3] += td->channel_line_size;
     }
 
     return 0;
@@ -1005,10 +1301,33 @@ static int check_header_variable(EXRContext *s,
     return var_size;
 }
 
-static int decode_header(EXRContext *s)
+static int decode_header(EXRContext *s, AVFrame *frame)
 {
-    int current_channel_offset = 0;
-    int magic_number, version, flags, i;
+    AVDictionary *metadata = NULL;
+    int magic_number, version, i, flags, sar = 0;
+    int layer_match = 0;
+    int ret;
+
+    s->current_channel_offset = 0;
+    s->xmin               = ~0;
+    s->xmax               = ~0;
+    s->ymin               = ~0;
+    s->ymax               = ~0;
+    s->xdelta             = ~0;
+    s->ydelta             = ~0;
+    s->channel_offsets[0] = -1;
+    s->channel_offsets[1] = -1;
+    s->channel_offsets[2] = -1;
+    s->channel_offsets[3] = -1;
+    s->pixel_type         = EXR_UNKNOWN;
+    s->compression        = EXR_UNKN;
+    s->nb_channels        = 0;
+    s->w                  = 0;
+    s->h                  = 0;
+    s->tile_attr.xSize    = -1;
+    s->tile_attr.ySize    = -1;
+    s->is_tile            = 0;
+    s->is_luma            = 0;
 
     if (bytestream2_get_bytes_left(&s->gb) < 10) {
         av_log(s->avctx, AV_LOG_ERROR, "Header too short to parse.\n");
@@ -1030,8 +1349,15 @@ static int decode_header(EXRContext *s)
     }
 
     flags = bytestream2_get_le24(&s->gb);
-    if (flags & 0x02) {
-        avpriv_report_missing_feature(s->avctx, "Tile support");
+
+    if (flags & 0x02)
+        s->is_tile = 1;
+    if (flags & 0x08) {
+        avpriv_report_missing_feature(s->avctx, "deep data");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (flags & 0x10) {
+        avpriv_report_missing_feature(s->avctx, "multipart");
         return AVERROR_PATCHWELCOME;
     }
 
@@ -1041,8 +1367,10 @@ static int decode_header(EXRContext *s)
         if ((var_size = check_header_variable(s, "channels",
                                               "chlist", 38)) >= 0) {
             GetByteContext ch_gb;
-            if (!var_size)
-                return AVERROR_INVALIDDATA;
+            if (!var_size) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
 
             bytestream2_init(&ch_gb, s->gb.buffer, var_size);
 
@@ -1054,31 +1382,46 @@ static int decode_header(EXRContext *s)
 
                 if (strcmp(s->layer, "") != 0) {
                     if (strncmp(ch_gb.buffer, s->layer, strlen(s->layer)) == 0) {
+                        layer_match = 1;
+                        av_log(s->avctx, AV_LOG_INFO,
+                               "Channel match layer : %s.\n", ch_gb.buffer);
                         ch_gb.buffer += strlen(s->layer);
                         if (*ch_gb.buffer == '.')
                             ch_gb.buffer++;         /* skip dot if not given */
+                    } else {
+                        layer_match = 0;
                         av_log(s->avctx, AV_LOG_INFO,
-                               "Layer %s.%s matched.\n", s->layer, ch_gb.buffer);
+                               "Channel doesn't match layer : %s.\n", ch_gb.buffer);
                     }
+                } else {
+                    layer_match = 1;
                 }
 
-                if (!strcmp(ch_gb.buffer, "R") ||
-                    !strcmp(ch_gb.buffer, "X") ||
-                    !strcmp(ch_gb.buffer, "U"))
-                    channel_index = 0;
-                else if (!strcmp(ch_gb.buffer, "G") ||
-                         !strcmp(ch_gb.buffer, "Y") ||
-                         !strcmp(ch_gb.buffer, "V"))
-                    channel_index = 1;
-                else if (!strcmp(ch_gb.buffer, "B") ||
-                         !strcmp(ch_gb.buffer, "Z") ||
-                         !strcmp(ch_gb.buffer, "W"))
-                    channel_index = 2;
-                else if (!strcmp(ch_gb.buffer, "A"))
-                    channel_index = 3;
-                else
-                    av_log(s->avctx, AV_LOG_WARNING,
-                           "Unsupported channel %.256s.\n", ch_gb.buffer);
+                if (layer_match) { /* only search channel if the layer match is valid */
+                    if (!strcmp(ch_gb.buffer, "R") ||
+                        !strcmp(ch_gb.buffer, "X") ||
+                        !strcmp(ch_gb.buffer, "U")) {
+                        channel_index = 0;
+                        s->is_luma = 0;
+                    } else if (!strcmp(ch_gb.buffer, "G") ||
+                               !strcmp(ch_gb.buffer, "V")) {
+                        channel_index = 1;
+                        s->is_luma = 0;
+                    } else if (!strcmp(ch_gb.buffer, "Y")) {
+                        channel_index = 1;
+                        s->is_luma = 1;
+                    } else if (!strcmp(ch_gb.buffer, "B") ||
+                               !strcmp(ch_gb.buffer, "Z") ||
+                               !strcmp(ch_gb.buffer, "W")){
+                               channel_index = 2;
+                        s->is_luma = 0;
+                    } else if (!strcmp(ch_gb.buffer, "A")) {
+                        channel_index = 3;
+                    } else {
+                        av_log(s->avctx, AV_LOG_WARNING,
+                               "Unsupported channel %.256s.\n", ch_gb.buffer);
+                    }
+                }
 
                 /* skip until you get a 0 */
                 while (bytestream2_get_bytes_left(&ch_gb) > 0 &&
@@ -1087,61 +1430,80 @@ static int decode_header(EXRContext *s)
 
                 if (bytestream2_get_bytes_left(&ch_gb) < 4) {
                     av_log(s->avctx, AV_LOG_ERROR, "Incomplete header.\n");
-                    return AVERROR_INVALIDDATA;
+                    ret = AVERROR_INVALIDDATA;
+                    goto fail;
                 }
 
                 current_pixel_type = bytestream2_get_le32(&ch_gb);
                 if (current_pixel_type >= EXR_UNKNOWN) {
                     avpriv_report_missing_feature(s->avctx, "Pixel type %d",
                                                   current_pixel_type);
-                    return AVERROR_PATCHWELCOME;
+                    ret = AVERROR_PATCHWELCOME;
+                    goto fail;
                 }
 
                 bytestream2_skip(&ch_gb, 4);
                 xsub = bytestream2_get_le32(&ch_gb);
                 ysub = bytestream2_get_le32(&ch_gb);
+
                 if (xsub != 1 || ysub != 1) {
                     avpriv_report_missing_feature(s->avctx,
                                                   "Subsampling %dx%d",
                                                   xsub, ysub);
-                    return AVERROR_PATCHWELCOME;
+                    ret = AVERROR_PATCHWELCOME;
+                    goto fail;
                 }
 
-                if (channel_index >= 0) {
+                if (channel_index >= 0 && s->channel_offsets[channel_index] == -1) { /* channel has not been previously assigned */
                     if (s->pixel_type != EXR_UNKNOWN &&
                         s->pixel_type != current_pixel_type) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "RGB channels not of the same depth.\n");
-                        return AVERROR_INVALIDDATA;
+                        ret = AVERROR_INVALIDDATA;
+                        goto fail;
                     }
                     s->pixel_type                     = current_pixel_type;
-                    s->channel_offsets[channel_index] = current_channel_offset;
+                    s->channel_offsets[channel_index] = s->current_channel_offset;
+                } else if (channel_index >= 0) {
+                    av_log(s->avctx, AV_LOG_ERROR,
+                            "Multiple channels with index %d.\n", channel_index);
+                    ret = AVERROR_INVALIDDATA;
+                    goto fail;
                 }
 
                 s->channels = av_realloc(s->channels,
                                          ++s->nb_channels * sizeof(EXRChannel));
-                if (!s->channels)
-                    return AVERROR(ENOMEM);
+                if (!s->channels) {
+                    ret = AVERROR(ENOMEM);
+                    goto fail;
+                }
                 channel             = &s->channels[s->nb_channels - 1];
                 channel->pixel_type = current_pixel_type;
                 channel->xsub       = xsub;
                 channel->ysub       = ysub;
 
-                current_channel_offset += 1 << current_pixel_type;
+                if (current_pixel_type == EXR_HALF) {
+                    s->current_channel_offset += 2;
+                } else {/* Float or UINT32 */
+                    s->current_channel_offset += 4;
+                }
             }
 
             /* Check if all channels are set with an offset or if the channels
              * are causing an overflow  */
-            if (FFMIN3(s->channel_offsets[0],
-                       s->channel_offsets[1],
-                       s->channel_offsets[2]) < 0) {
-                if (s->channel_offsets[0] < 0)
-                    av_log(s->avctx, AV_LOG_ERROR, "Missing red channel.\n");
-                if (s->channel_offsets[1] < 0)
-                    av_log(s->avctx, AV_LOG_ERROR, "Missing green channel.\n");
-                if (s->channel_offsets[2] < 0)
-                    av_log(s->avctx, AV_LOG_ERROR, "Missing blue channel.\n");
-                return AVERROR_INVALIDDATA;
+            if (!s->is_luma){/* if we expected to have at least 3 channels */
+                if (FFMIN3(s->channel_offsets[0],
+                           s->channel_offsets[1],
+                           s->channel_offsets[2]) < 0) {
+                    if (s->channel_offsets[0] < 0)
+                        av_log(s->avctx, AV_LOG_ERROR, "Missing red channel.\n");
+                    if (s->channel_offsets[1] < 0)
+                        av_log(s->avctx, AV_LOG_ERROR, "Missing green channel.\n");
+                    if (s->channel_offsets[2] < 0)
+                        av_log(s->avctx, AV_LOG_ERROR, "Missing blue channel.\n");
+                    ret = AVERROR_INVALIDDATA;
+                    goto fail;
+                }
             }
 
             // skip one last byte and update main gb
@@ -1149,8 +1511,10 @@ static int decode_header(EXRContext *s)
             continue;
         } else if ((var_size = check_header_variable(s, "dataWindow", "box2i",
                                                      31)) >= 0) {
-            if (!var_size)
-                return AVERROR_INVALIDDATA;
+            if (!var_size) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
 
             s->xmin   = bytestream2_get_le32(&s->gb);
             s->ymin   = bytestream2_get_le32(&s->gb);
@@ -1162,8 +1526,10 @@ static int decode_header(EXRContext *s)
             continue;
         } else if ((var_size = check_header_variable(s, "displayWindow",
                                                      "box2i", 34)) >= 0) {
-            if (!var_size)
-                return AVERROR_INVALIDDATA;
+            if (!var_size) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
 
             bytestream2_skip(&s->gb, 8);
             s->w = bytestream2_get_le32(&s->gb) + 1;
@@ -1173,30 +1539,36 @@ static int decode_header(EXRContext *s)
         } else if ((var_size = check_header_variable(s, "lineOrder",
                                                      "lineOrder", 25)) >= 0) {
             int line_order;
-            if (!var_size)
-                return AVERROR_INVALIDDATA;
+            if (!var_size) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
 
             line_order = bytestream2_get_byte(&s->gb);
             av_log(s->avctx, AV_LOG_DEBUG, "line order: %d.\n", line_order);
             if (line_order > 2) {
                 av_log(s->avctx, AV_LOG_ERROR, "Unknown line order.\n");
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
             }
 
             continue;
         } else if ((var_size = check_header_variable(s, "pixelAspectRatio",
                                                      "float", 31)) >= 0) {
-            if (!var_size)
-                return AVERROR_INVALIDDATA;
+            if (!var_size) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
 
-            ff_set_sar(s->avctx,
-                       av_d2q(av_int2float(bytestream2_get_le32(&s->gb)), 255));
+            sar = bytestream2_get_le32(&s->gb);
 
             continue;
         } else if ((var_size = check_header_variable(s, "compression",
                                                      "compression", 29)) >= 0) {
-            if (!var_size)
-                return AVERROR_INVALIDDATA;
+            if (!var_size) {
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
 
             if (s->compression == EXR_UNKN)
                 s->compression = bytestream2_get_byte(&s->gb);
@@ -1205,12 +1577,51 @@ static int decode_header(EXRContext *s)
                        "Found more than one compression attribute.\n");
 
             continue;
+        } else if ((var_size = check_header_variable(s, "tiles",
+                                                     "tiledesc", 22)) >= 0) {
+            char tileLevel;
+
+            if (!s->is_tile)
+                av_log(s->avctx, AV_LOG_WARNING,
+                       "Found tile attribute and scanline flags. Exr will be interpreted as scanline.\n");
+
+            s->tile_attr.xSize = bytestream2_get_le32(&s->gb);
+            s->tile_attr.ySize = bytestream2_get_le32(&s->gb);
+
+            tileLevel = bytestream2_get_byte(&s->gb);
+            s->tile_attr.level_mode = tileLevel & 0x0f;
+            s->tile_attr.level_round = (tileLevel >> 4) & 0x0f;
+
+            if (s->tile_attr.level_mode >= EXR_TILE_LEVEL_UNKNOWN){
+                avpriv_report_missing_feature(s->avctx, "Tile level mode %d",
+                                              s->tile_attr.level_mode);
+                ret = AVERROR_PATCHWELCOME;
+                goto fail;
+            }
+
+            if (s->tile_attr.level_round >= EXR_TILE_ROUND_UNKNOWN) {
+                avpriv_report_missing_feature(s->avctx, "Tile level round %d",
+                                              s->tile_attr.level_round);
+                ret = AVERROR_PATCHWELCOME;
+                goto fail;
+            }
+
+            continue;
+        } else if ((var_size = check_header_variable(s, "writer",
+                                                     "string", 1)) >= 0) {
+            uint8_t key[256] = { 0 };
+
+            bytestream2_get_buffer(&s->gb, key, FFMIN(sizeof(key) - 1, var_size));
+            av_dict_set(&metadata, "writer", key, 0);
+
+            continue;
         }
 
         // Check if there are enough bytes for a header
         if (bytestream2_get_bytes_left(&s->gb) <= 9) {
             av_log(s->avctx, AV_LOG_ERROR, "Incomplete header\n");
-            return AVERROR_INVALIDDATA;
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
         }
 
         // Process unknown variables
@@ -1221,20 +1632,36 @@ static int decode_header(EXRContext *s)
         bytestream2_skip(&s->gb, bytestream2_get_le32(&s->gb));
     }
 
+    ff_set_sar(s->avctx, av_d2q(av_int2float(sar), 255));
+
     if (s->compression == EXR_UNKN) {
         av_log(s->avctx, AV_LOG_ERROR, "Missing compression attribute.\n");
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (s->is_tile) {
+        if (s->tile_attr.xSize < 1 || s->tile_attr.ySize < 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid tile attribute.\n");
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
     }
-    s->scan_line_size = s->xdelta * current_channel_offset;
 
     if (bytestream2_get_bytes_left(&s->gb) <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Incomplete frame.\n");
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
     }
 
+    frame->metadata = metadata;
+
     // aaand we are done
     bytestream2_skip(&s->gb, 1);
     return 0;
+fail:
+    av_dict_free(&metadata);
+    return ret;
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data,
@@ -1247,29 +1674,42 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
     int y, ret;
     int out_line_size;
-    int scan_line_blocks;
+    int nb_blocks;   /* nb scanline or nb tile */
+    uint64_t start_offset_table;
+    uint64_t start_next_scanline;
+    PutByteContext offset_table_writer;
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    if ((ret = decode_header(s)) < 0)
+    if ((ret = decode_header(s, picture)) < 0)
         return ret;
 
     switch (s->pixel_type) {
     case EXR_FLOAT:
     case EXR_HALF:
-        if (s->channel_offsets[3] >= 0)
-            avctx->pix_fmt = AV_PIX_FMT_RGBA64;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_RGB48;
-        break;
     case EXR_UINT:
-        avpriv_request_sample(avctx, "32-bit unsigned int");
-        return AVERROR_PATCHWELCOME;
+        if (s->channel_offsets[3] >= 0) {
+            if (!s->is_luma) {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_YA16;
+            }
+        } else {
+            if (!s->is_luma) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            }
+        }
+        break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Missing channel list.\n");
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->apply_trc_type != AVCOL_TRC_UNSPECIFIED)
+        avctx->color_trc = s->apply_trc_type;
+
     switch (s->compression) {
     case EXR_RAW:
     case EXR_RLE:
@@ -1281,6 +1721,8 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         s->scan_lines_per_block = 16;
         break;
     case EXR_PIZ:
+    case EXR_B44:
+    case EXR_B44A:
         s->scan_lines_per_block = 32;
         break;
     default:
@@ -1306,15 +1748,40 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (!s->desc)
         return AVERROR_INVALIDDATA;
     out_line_size    = avctx->width * 2 * s->desc->nb_components;
-    scan_line_blocks = (s->ydelta + s->scan_lines_per_block - 1) /
-                       s->scan_lines_per_block;
+
+    if (s->is_tile) {
+        nb_blocks = ((s->xdelta + s->tile_attr.xSize - 1) / s->tile_attr.xSize) *
+        ((s->ydelta + s->tile_attr.ySize - 1) / s->tile_attr.ySize);
+    } else { /* scanline */
+        nb_blocks = (s->ydelta + s->scan_lines_per_block - 1) /
+        s->scan_lines_per_block;
+    }
 
     if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
 
-    if (bytestream2_get_bytes_left(&s->gb) < scan_line_blocks * 8)
+    if (bytestream2_get_bytes_left(&s->gb) < nb_blocks * 8)
         return AVERROR_INVALIDDATA;
 
+    // check offset table and recreate it if need
+    if (!s->is_tile && bytestream2_peek_le64(&s->gb) == 0) {
+        av_log(s->avctx, AV_LOG_DEBUG, "recreating invalid scanline offset table\n");
+
+        start_offset_table = bytestream2_tell(&s->gb);
+        start_next_scanline = start_offset_table + nb_blocks * 8;
+        bytestream2_init_writer(&offset_table_writer, &avpkt->data[start_offset_table], nb_blocks * 8);
+
+        for (y = 0; y < nb_blocks; y++) {
+            /* write offset of prev scanline in offset table */
+            bytestream2_put_le64(&offset_table_writer, start_next_scanline);
+
+            /* get len of next scanline */
+            bytestream2_seek(&s->gb, start_next_scanline + 4, SEEK_SET);/* skip line number */
+            start_next_scanline += (bytestream2_get_le32(&s->gb) + 8);
+        }
+        bytestream2_seek(&s->gb, start_offset_table, SEEK_SET);
+    }
+
     // save pointer we are going to use in decode_block
     s->buf      = avpkt->data;
     s->buf_size = avpkt->size;
@@ -1327,9 +1794,11 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     }
 
     s->picture = picture;
-    avctx->execute2(avctx, decode_block, s->thread_data, NULL, scan_line_blocks);
+
+    avctx->execute2(avctx, decode_block, s->thread_data, NULL, nb_blocks);
 
     // Zero out the end if ymax+1 is not h
+    ptr = picture->data[0] + ((s->ymax+1) * picture->linesize[0]);
     for (y = s->ymax + 1; y < avctx->height; y++) {
         memset(ptr, 0, out_line_size);
         ptr += picture->linesize[0];
@@ -1347,36 +1816,37 @@ static av_cold int decode_init(AVCodecContext *avctx)
     uint32_t i;
     union av_intfloat32 t;
     float one_gamma = 1.0f / s->gamma;
+    avpriv_trc_function trc_func = NULL;
 
     s->avctx              = avctx;
-    s->xmin               = ~0;
-    s->xmax               = ~0;
-    s->ymin               = ~0;
-    s->ymax               = ~0;
-    s->xdelta             = ~0;
-    s->ydelta             = ~0;
-    s->channel_offsets[0] = -1;
-    s->channel_offsets[1] = -1;
-    s->channel_offsets[2] = -1;
-    s->channel_offsets[3] = -1;
-    s->pixel_type         = EXR_UNKNOWN;
-    s->compression        = EXR_UNKN;
-    s->nb_channels        = 0;
-    s->w                  = 0;
-    s->h                  = 0;
 
-    if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
-        for (i = 0; i < 65536; ++i)
-            s->gamma_table[i] = exr_halflt2uint(i);
-    } else {
+    ff_exrdsp_init(&s->dsp);
+
+#if HAVE_BIGENDIAN
+    ff_bswapdsp_init(&s->bbdsp);
+#endif
+
+    trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
+    if (trc_func) {
         for (i = 0; i < 65536; ++i) {
             t = exr_half2float(i);
-            /* If negative value we reuse half value */
-            if (t.f <= 0.0f) {
+            t.f = trc_func(t.f);
+            s->gamma_table[i] = exr_flt2uint(t.i);
+        }
+    } else {
+        if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
+            for (i = 0; i < 65536; ++i)
                 s->gamma_table[i] = exr_halflt2uint(i);
-            } else {
-                t.f = powf(t.f, one_gamma);
-                s->gamma_table[i] = exr_flt2uint(t.i);
+        } else {
+            for (i = 0; i < 65536; ++i) {
+                t = exr_half2float(i);
+                /* If negative value we reuse half value */
+                if (t.f <= 0.0f) {
+                    s->gamma_table[i] = exr_halflt2uint(i);
+                } else {
+                    t.f = powf(t.f, one_gamma);
+                    s->gamma_table[i] = exr_flt2uint(t.i);
+                }
             }
         }
     }
@@ -1389,6 +1859,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static int decode_init_thread_copy(AVCodecContext *avctx)
 {    EXRContext *s = avctx->priv_data;
 
@@ -1399,6 +1870,7 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
@@ -1425,6 +1897,43 @@ static const AVOption options[] = {
         AV_OPT_TYPE_STRING, { .str = "" }, 0, 0, VD },
     { "gamma", "Set the float gamma value when decoding", OFFSET(gamma),
         AV_OPT_TYPE_FLOAT, { .dbl = 1.0f }, 0.001, FLT_MAX, VD },
+
+    // XXX: Note the abuse of the enum using AVCOL_TRC_UNSPECIFIED to subsume the existing gamma option
+    { "apply_trc", "color transfer characteristics to apply to EXR linear input", OFFSET(apply_trc_type),
+        AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, AVCOL_TRC_NB-1, VD, "apply_trc_type"},
+    { "bt709",        "BT.709",           0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 },        INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma",        "gamma",            0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED },  INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma22",      "BT.470 M",         0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA22 },      INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma28",      "BT.470 BG",        0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA28 },      INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte170m",    "SMPTE 170 M",      0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE170M },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte240m",    "SMPTE 240 M",      0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE240M },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "linear",       "Linear",           0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LINEAR },       INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "log",          "Log",              0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG },          INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "log_sqrt",     "Log square root",  0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG_SQRT },     INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "iec61966_2_4", "IEC 61966-2-4",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_4 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt1361",       "BT.1361",          0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT1361_ECG },   INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "iec61966_2_1", "IEC 61966-2-1",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt2020_10bit", "BT.2020 - 10 bit", 0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt2020_12bit", "BT.2020 - 12 bit", 0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte2084",    "SMPTE ST 2084",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte428_1",   "SMPTE ST 428-1",   0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+
     { NULL },
 };
 
diff --git a/libavcodec/exrdsp.c b/libavcodec/exrdsp.c
new file mode 100644
index 0000000..42dbf1f
--- /dev/null
+++ b/libavcodec/exrdsp.c
@@ -0,0 +1,56 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "exrdsp.h"
+#include "config.h"
+
+static void reorder_pixels_scalar(uint8_t *dst, const uint8_t *src, ptrdiff_t size)
+{
+    const uint8_t *t1 = src;
+    int half_size     = size / 2;
+    const uint8_t *t2 = src + half_size;
+    uint8_t *s        = dst;
+    int i;
+
+    for (i = 0; i < half_size; i++) {
+        *(s++) = *(t1++);
+        *(s++) = *(t2++);
+    }
+}
+
+static void predictor_scalar(uint8_t *src, ptrdiff_t size)
+{
+    ptrdiff_t i;
+
+    for (i = 1; i < size; i++)
+        src[i] += src[i-1] - 128;
+}
+
+av_cold void ff_exrdsp_init(ExrDSPContext *c)
+{
+    c->reorder_pixels   = reorder_pixels_scalar;
+    c->predictor        = predictor_scalar;
+
+    if (ARCH_X86)
+        ff_exrdsp_init_x86(c);
+}
diff --git a/libavcodec/exrdsp.h b/libavcodec/exrdsp.h
new file mode 100644
index 0000000..2c4dc3a
--- /dev/null
+++ b/libavcodec/exrdsp.h
@@ -0,0 +1,33 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_EXRDSP_H
+#define AVCODEC_EXRDSP_H
+
+#include <stdint.h>
+#include "libavutil/common.h"
+
+typedef struct ExrDSPContext {
+    void (*reorder_pixels)(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+    void (*predictor)(uint8_t *src, ptrdiff_t size);
+} ExrDSPContext;
+
+void ff_exrdsp_init(ExrDSPContext *c);
+void ff_exrdsp_init_x86(ExrDSPContext *c);
+
+#endif /* AVCODEC_EXRDSP_H */
diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c
index f631331..17e5deb 100644
--- a/libavcodec/extract_extradata_bsf.c
+++ b/libavcodec/extract_extradata_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,8 @@
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
+#include "av1.h"
+#include "av1_parse.h"
 #include "bsf.h"
 #include "h2645_parse.h"
 #include "h264.h"
@@ -36,6 +38,9 @@ typedef struct ExtractExtradataContext {
     int (*extract)(AVBSFContext *ctx, AVPacket *pkt,
                    uint8_t **data, int *size);
 
+    /* AV1 specifc fields */
+    AV1Packet av1_pkt;
+
     /* H264/HEVC specifc fields */
     H2645Packet h2645_pkt;
 
@@ -52,6 +57,80 @@ static int val_in_array(const int *arr, int len, int val)
     return 0;
 }
 
+static int extract_extradata_av1(AVBSFContext *ctx, AVPacket *pkt,
+                                 uint8_t **data, int *size)
+{
+    static const int extradata_obu_types[] = {
+        AV1_OBU_SEQUENCE_HEADER, AV1_OBU_METADATA,
+    };
+    ExtractExtradataContext *s = ctx->priv_data;
+
+    int extradata_size = 0, filtered_size = 0;
+    int nb_extradata_obu_types = FF_ARRAY_ELEMS(extradata_obu_types);
+    int i, has_seq = 0, ret = 0;
+
+    ret = ff_av1_packet_split(&s->av1_pkt, pkt->data, pkt->size, ctx);
+    if (ret < 0)
+        return ret;
+
+    for (i = 0; i < s->av1_pkt.nb_obus; i++) {
+        AV1OBU *obu = &s->av1_pkt.obus[i];
+        if (val_in_array(extradata_obu_types, nb_extradata_obu_types, obu->type)) {
+            extradata_size += obu->raw_size;
+            if (obu->type == AV1_OBU_SEQUENCE_HEADER)
+                has_seq = 1;
+        } else if (s->remove) {
+            filtered_size += obu->raw_size;
+        }
+    }
+
+    if (extradata_size && has_seq) {
+        AVBufferRef *filtered_buf;
+        uint8_t *extradata, *filtered_data;
+
+        if (s->remove) {
+            filtered_buf = av_buffer_alloc(filtered_size + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!filtered_buf) {
+                return AVERROR(ENOMEM);
+            }
+            memset(filtered_buf->data + filtered_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+            filtered_data = filtered_buf->data;
+        }
+
+        extradata = av_malloc(extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!extradata) {
+            av_buffer_unref(&filtered_buf);
+            return AVERROR(ENOMEM);
+        }
+        memset(extradata + extradata_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+        *data = extradata;
+        *size = extradata_size;
+
+        for (i = 0; i < s->av1_pkt.nb_obus; i++) {
+            AV1OBU *obu = &s->av1_pkt.obus[i];
+            if (val_in_array(extradata_obu_types, nb_extradata_obu_types,
+                             obu->type)) {
+                memcpy(extradata, obu->raw_data, obu->raw_size);
+                extradata += obu->raw_size;
+            } else if (s->remove) {
+                memcpy(filtered_data, obu->raw_data, obu->raw_size);
+                filtered_data += obu->raw_size;
+            }
+        }
+
+        if (s->remove) {
+            av_buffer_unref(&pkt->buf);
+            pkt->buf  = filtered_buf;
+            pkt->data = filtered_buf->data;
+            pkt->size = filtered_size;
+        }
+    }
+
+    return 0;
+}
+
 static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
                                    uint8_t **data, int *size)
 {
@@ -78,7 +157,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
     }
 
     ret = ff_h2645_packet_split(&s->h2645_pkt, pkt->data, pkt->size,
-                                ctx, 0, 0, ctx->par_in->codec_id);
+                                ctx, 0, 0, ctx->par_in->codec_id, 1, 0);
     if (ret < 0)
         return ret;
 
@@ -152,19 +231,17 @@ static int extract_extradata_vc1(AVBSFContext *ctx, AVPacket *pkt,
                                  uint8_t **data, int *size)
 {
     ExtractExtradataContext *s = ctx->priv_data;
+    const uint8_t *ptr = pkt->data, *end = pkt->data + pkt->size;
     uint32_t state = UINT32_MAX;
     int has_extradata = 0, extradata_size = 0;
-    int i;
 
-    for (i = 0; i < pkt->size; i++) {
-        state = (state << 8) | pkt->data[i];
-        if (IS_MARKER(state)) {
-            if (state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT) {
-                has_extradata = 1;
-            } else if (has_extradata) {
-                extradata_size = i - 3;
-                break;
-            }
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if (state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT) {
+            has_extradata = 1;
+        } else if (has_extradata && IS_MARKER(state)) {
+            extradata_size = ptr - 4 - pkt->data;
+            break;
         }
     }
 
@@ -186,19 +263,18 @@ static int extract_extradata_vc1(AVBSFContext *ctx, AVPacket *pkt,
     return 0;
 }
 
-static int extract_extradata_mpeg124(AVBSFContext *ctx, AVPacket *pkt,
+static int extract_extradata_mpeg12(AVBSFContext *ctx, AVPacket *pkt,
                                      uint8_t **data, int *size)
 {
     ExtractExtradataContext *s = ctx->priv_data;
-    int is_mpeg12 = ctx->par_in->codec_id == AV_CODEC_ID_MPEG1VIDEO ||
-                    ctx->par_in->codec_id == AV_CODEC_ID_MPEG2VIDEO;
     uint32_t state = UINT32_MAX;
-    int i;
+    int i, found = 0;
 
     for (i = 0; i < pkt->size; i++) {
         state = (state << 8) | pkt->data[i];
-        if ((is_mpeg12 && state != 0x1B3 && state != 0x1B5 && state < 0x200 && state >= 0x100) ||
-            (!is_mpeg12 && (state == 0x1B3 || state == 0x1B6))) {
+        if (state == 0x1B3)
+            found = 1;
+        else if (found && state != 0x1B5 && state < 0x200 && state >= 0x100) {
             if (i > 3) {
                 *size = i - 3;
                 *data = av_malloc(*size + AV_INPUT_BUFFER_PADDING_SIZE);
@@ -219,17 +295,49 @@ static int extract_extradata_mpeg124(AVBSFContext *ctx, AVPacket *pkt,
     return 0;
 }
 
+static int extract_extradata_mpeg4(AVBSFContext *ctx, AVPacket *pkt,
+                                   uint8_t **data, int *size)
+{
+    ExtractExtradataContext *s = ctx->priv_data;
+    const uint8_t *ptr = pkt->data, *end = pkt->data + pkt->size;
+    uint32_t state = UINT32_MAX;
+
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if (state == 0x1B3 || state == 0x1B6) {
+            if (ptr - pkt->data > 4) {
+                *size = ptr - 4 - pkt->data;
+                *data = av_malloc(*size + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!*data)
+                    return AVERROR(ENOMEM);
+
+                memcpy(*data, pkt->data, *size);
+                memset(*data + *size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+                if (s->remove) {
+                    pkt->data += *size;
+                    pkt->size -= *size;
+                }
+            }
+            break;
+        }
+    }
+    return 0;
+}
+
 static const struct {
     enum AVCodecID id;
     int (*extract)(AVBSFContext *ctx, AVPacket *pkt,
                    uint8_t **data, int *size);
 } extract_tab[] = {
-    { AV_CODEC_ID_CAVS,       extract_extradata_mpeg124 },
+    { AV_CODEC_ID_AV1,        extract_extradata_av1     },
+    { AV_CODEC_ID_AVS2,       extract_extradata_mpeg4   },
+    { AV_CODEC_ID_CAVS,       extract_extradata_mpeg4   },
     { AV_CODEC_ID_H264,       extract_extradata_h2645   },
     { AV_CODEC_ID_HEVC,       extract_extradata_h2645   },
-    { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg124 },
-    { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg124 },
-    { AV_CODEC_ID_MPEG4,      extract_extradata_mpeg124 },
+    { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12  },
+    { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12  },
+    { AV_CODEC_ID_MPEG4,      extract_extradata_mpeg4   },
     { AV_CODEC_ID_VC1,        extract_extradata_vc1     },
 };
 
@@ -284,10 +392,13 @@ fail:
 static void extract_extradata_close(AVBSFContext *ctx)
 {
     ExtractExtradataContext *s = ctx->priv_data;
+    ff_av1_packet_uninit(&s->av1_pkt);
     ff_h2645_packet_uninit(&s->h2645_pkt);
 }
 
 static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_AV1,
+    AV_CODEC_ID_AVS2,
     AV_CODEC_ID_CAVS,
     AV_CODEC_ID_H264,
     AV_CODEC_ID_HEVC,
@@ -299,9 +410,10 @@ static const enum AVCodecID codec_ids[] = {
 };
 
 #define OFFSET(x) offsetof(ExtractExtradataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption options[] = {
     { "remove", "remove the extradata from the bitstream", OFFSET(remove), AV_OPT_TYPE_INT,
-        { .i64 = 0 }, 0, 1 },
+        { .i64 = 0 }, 0, 1, FLAGS },
     { NULL },
 };
 
diff --git a/libavcodec/faandct.c b/libavcodec/faandct.c
index 7888633..38c392b 100644
--- a/libavcodec/faandct.c
+++ b/libavcodec/faandct.c
@@ -29,25 +29,24 @@
 #include "libavutil/internal.h"
 #include "libavutil/libm.h"
 
-#define FLOAT float
+typedef float FLOAT;
 
-//numbers generated by simple c code (not as accurate as they could be)
-/*
-for(i=0; i<8; i++){
-    printf("#define B%d %1.20llf\n", i, (long double)1.0/(cosl(i*acosl(-1.0)/(long double)16.0)*sqrtl(2)));
-}
+/* numbers generated by arbitrary precision arithmetic followed by truncation
+to 36 fractional digits (enough for a 128-bit IEEE quad, see /usr/include/math.h
+for this approach). Unfortunately, long double is not always available correctly,
+e.g ppc has issues.
+TODO: add L suffixes when ppc and toolchains sort out their stuff.
 */
-#define B0 1.00000000000000000000
-#define B1 0.72095982200694791383 // (cos(pi*1/16)sqrt(2))^-1
-#define B2 0.76536686473017954350 // (cos(pi*2/16)sqrt(2))^-1
-#define B3 0.85043009476725644878 // (cos(pi*3/16)sqrt(2))^-1
-#define B4 1.00000000000000000000 // (cos(pi*4/16)sqrt(2))^-1
-#define B5 1.27275858057283393842 // (cos(pi*5/16)sqrt(2))^-1
-#define B6 1.84775906502257351242 // (cos(pi*6/16)sqrt(2))^-1
-#define B7 3.62450978541155137218 // (cos(pi*7/16)sqrt(2))^-1
-
-
-#define A1 0.70710678118654752438 // cos(pi*4/16)
+#define B0 1.000000000000000000000000000000000000
+#define B1 0.720959822006947913789091890943021267 // (cos(pi*1/16)sqrt(2))^-1
+#define B2 0.765366864730179543456919968060797734 // (cos(pi*2/16)sqrt(2))^-1
+#define B3 0.850430094767256448766702844371412325 // (cos(pi*3/16)sqrt(2))^-1
+#define B4 1.000000000000000000000000000000000000 // (cos(pi*4/16)sqrt(2))^-1
+#define B5 1.272758580572833938461007018281767032 // (cos(pi*5/16)sqrt(2))^-1
+#define B6 1.847759065022573512256366378793576574 // (cos(pi*6/16)sqrt(2))^-1
+#define B7 3.624509785411551372409941227504289587 // (cos(pi*7/16)sqrt(2))^-1
+
+#define A1 M_SQRT1_2              // cos(pi*4/16)
 #define A2 0.54119610014619698435 // cos(pi*6/16)sqrt(2)
 #define A5 0.38268343236508977170 // cos(pi*6/16)
 #define A4 1.30656296487637652774 // cos(pi*2/16)sqrt(2)
diff --git a/libavcodec/faandct.h b/libavcodec/faandct.h
index 59d5ff3..c5ef96d 100644
--- a/libavcodec/faandct.h
+++ b/libavcodec/faandct.h
@@ -2,20 +2,20 @@
  * Floating point AAN DCT
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/faanidct.c b/libavcodec/faanidct.c
index 57e52a5..3921f82 100644
--- a/libavcodec/faanidct.c
+++ b/libavcodec/faanidct.c
@@ -2,27 +2,27 @@
  * Floating point AAN IDCT
  * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "faanidct.h"
 #include "libavutil/common.h"
 
 /* To allow switching to double. */
-#define FLOAT float
+typedef float FLOAT;
 
 #define B0 1.0000000000000000000000
 #define B1 1.3870398453221474618216 // cos(pi*1/16)sqrt(2)
diff --git a/libavcodec/faanidct.h b/libavcodec/faanidct.h
index 4eedba8..6f4da67 100644
--- a/libavcodec/faanidct.h
+++ b/libavcodec/faanidct.h
@@ -2,20 +2,20 @@
  * Floating point AAN IDCT
  * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/faxcompr.c b/libavcodec/faxcompr.c
index 0a079fc..2a1d2bc 100644
--- a/libavcodec/faxcompr.c
+++ b/libavcodec/faxcompr.c
@@ -2,20 +2,20 @@
  * CCITT Fax Group 3 and 4 decompression
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,8 @@
  * @author Konstantin Shishkov
  */
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "put_bits.h"
-#include "vlc.h"
 #include "faxcompr.h"
 
 #define CCITT_SYMS 104
@@ -123,8 +122,83 @@ av_cold void ff_ccitt_unpack_init(void)
     initialized = 1;
 }
 
+static int decode_uncompressed(AVCodecContext *avctx, GetBitContext *gb,
+                               unsigned int *pix_left, int **runs,
+                               const int *runend, int *mode)
+{
+    int eob = 0;
+    int newmode;
+    int saved_run = 0;
+
+    do {
+        int cwi, k;
+        int cw = 0;
+        int codes[2];
+        do {
+            cwi = show_bits(gb, 11);
+            if (!cwi) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid uncompressed codeword\n");
+                return AVERROR_INVALIDDATA;
+            }
+            cwi = 10 - av_log2(cwi);
+            skip_bits(gb, cwi + 1);
+            if (cwi > 5) {
+                newmode = get_bits1(gb);
+                eob = 1;
+                cwi -= 6;
+            }
+            cw += cwi;
+        } while(cwi == 5);
+
+        codes[0] = cw;
+        codes[1] = !eob;
+
+        for (k = 0; k < 2; k++) {
+            if (codes[k]) {
+                if (*mode == !k) {
+                    *(*runs)++ = saved_run;
+                    if (*runs >= runend) {
+                        av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    if (*pix_left <= saved_run) {
+                        av_log(avctx, AV_LOG_ERROR, "uncompressed run went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    *pix_left -= saved_run;
+                    saved_run = 0;
+                    *mode = !*mode;
+                }
+                saved_run += codes[k];
+            }
+        }
+    } while (!eob);
+    *(*runs)++ = saved_run;
+    if (*runs >= runend) {
+        av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (*pix_left <= saved_run) {
+        if (*pix_left == saved_run)
+            return 1;
+        av_log(avctx, AV_LOG_ERROR, "uncompressed run went out of boundsE\n");
+        return AVERROR_INVALIDDATA;
+    }
+    *pix_left -= saved_run;
+    saved_run = 0;
+    *mode = !*mode;
+    if (newmode != *mode) { //FIXME CHECK
+        *(*runs)++ = 0;
+        if (*runs >= runend) {
+            av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+            return AVERROR_INVALIDDATA;
+        }
+        *mode = newmode;
+    }
+    return 0;
+}
 
-static int decode_group3_1d_line(AVCodecContext *avctx, BitstreamContext *bc,
+static int decode_group3_1d_line(AVCodecContext *avctx, GetBitContext *gb,
                                  unsigned int pix_left, int *runs,
                                  const int *runend)
 {
@@ -132,7 +206,7 @@ static int decode_group3_1d_line(AVCodecContext *avctx, BitstreamContext *bc,
     unsigned int run = 0;
     unsigned int t;
     for (;;) {
-        t    = bitstream_read_vlc(bc, ccitt_vlc[mode].table, 9, 2);
+        t    = get_vlc2(gb, ccitt_vlc[mode].table, 9, 2);
         run += t;
         if (t < 64) {
             *runs++ = run;
@@ -150,15 +224,25 @@ static int decode_group3_1d_line(AVCodecContext *avctx, BitstreamContext *bc,
             run       = 0;
             mode      = !mode;
         } else if ((int)t == -1) {
-            av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
-            return AVERROR_INVALIDDATA;
+            if (show_bits(gb, 12) == 15) {
+                int ret;
+                skip_bits(gb, 12);
+                ret = decode_uncompressed(avctx, gb, &pix_left, &runs, runend, &mode);
+                if (ret < 0) {
+                    return ret;
+                } else if (ret)
+                    break;
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
     }
     *runs++ = 0;
     return 0;
 }
 
-static int decode_group3_2d_line(AVCodecContext *avctx, BitstreamContext *bc,
+static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
                                  unsigned int width, int *runs,
                                  const int *runend, const int *ref)
 {
@@ -166,19 +250,19 @@ static int decode_group3_2d_line(AVCodecContext *avctx, BitstreamContext *bc,
     int run_off       = *ref++;
     unsigned int offs = 0, run = 0;
 
-    runend--; // for the last written 0
-
     while (offs < width) {
-        int cmode = bitstream_read_vlc(bc, ccitt_group3_2d_vlc.table, 9, 1);
+        int cmode = get_vlc2(gb, ccitt_group3_2d_vlc.table, 9, 1);
         if (cmode == -1) {
             av_log(avctx, AV_LOG_ERROR, "Incorrect mode VLC\n");
             return AVERROR_INVALIDDATA;
         }
         if (!cmode) { //pass mode
-            run_off += *ref++;
+            if (run_off < width)
+                run_off += *ref++;
             run      = run_off - offs;
             offs     = run_off;
-            run_off += *ref++;
+            if (run_off < width)
+                run_off += *ref++;
             if (offs > width) {
                 av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
                 return AVERROR_INVALIDDATA;
@@ -189,7 +273,7 @@ static int decode_group3_2d_line(AVCodecContext *avctx, BitstreamContext *bc,
             for (k = 0; k < 2; k++) {
                 run = 0;
                 for (;;) {
-                    t = bitstream_read_vlc(bc, ccitt_vlc[mode].table, 9, 2);
+                    t = get_vlc2(gb, ccitt_vlc[mode].table, 9, 2);
                     if (t == -1) {
                         av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
                         return AVERROR_INVALIDDATA;
@@ -212,8 +296,25 @@ static int decode_group3_2d_line(AVCodecContext *avctx, BitstreamContext *bc,
                 mode = !mode;
             }
         } else if (cmode == 9 || cmode == 10) {
-            avpriv_report_missing_feature(avctx, "Special modes support");
-            return AVERROR_PATCHWELCOME;
+            int xxx = get_bits(gb, 3);
+            if (cmode == 9 && xxx == 7) {
+                int ret;
+                int pix_left = width - offs;
+
+                if (saved_run) {
+                    av_log(avctx, AV_LOG_ERROR, "saved run %d on entering uncompressed mode\n", saved_run);
+                    return AVERROR_INVALIDDATA;
+                }
+                ret = decode_uncompressed(avctx, gb, &pix_left, &runs, runend, &mode);
+                offs = width - pix_left;
+                if (ret < 0) {
+                    return ret;
+                } else if (ret)
+                    break;
+            } else {
+                avpriv_report_missing_feature(avctx, "Special mode %d xxx=%d support", cmode, xxx);
+                return AVERROR_PATCHWELCOME;
+            }
         } else { //vertical mode
             run      = run_off - offs + (cmode - 5);
             run_off -= *--ref;
@@ -231,13 +332,19 @@ static int decode_group3_2d_line(AVCodecContext *avctx, BitstreamContext *bc,
             mode      = !mode;
         }
         //sync line pointers
-        while (run_off <= offs) {
+        while (offs < width && run_off <= offs) {
             run_off += *ref++;
             run_off += *ref++;
         }
     }
     *runs++ = saved_run;
-    *runs++ = 0;
+    if (saved_run) {
+        if (runs >= runend) {
+            av_log(avctx, AV_LOG_ERROR, "Run overrun\n");
+            return -1;
+        }
+        *runs++ = 0;
+    }
     return 0;
 }
 
@@ -246,7 +353,7 @@ static void put_line(uint8_t *dst, int size, int width, const int *runs)
     PutBitContext pb;
     int run, mode = ~0, pix_left = width, run_idx = 0;
 
-    init_put_bits(&pb, dst, size * 8);
+    init_put_bits(&pb, dst, size);
     while (pix_left > 0) {
         run       = runs[run_idx++];
         mode      = ~mode;
@@ -259,12 +366,12 @@ static void put_line(uint8_t *dst, int size, int width, const int *runs)
     flush_put_bits(&pb);
 }
 
-static int find_group3_syncmarker(BitstreamContext *bc, int srcsize)
+static int find_group3_syncmarker(GetBitContext *gb, int srcsize)
 {
     unsigned int state = -1;
-    srcsize -= bitstream_tell(bc);
+    srcsize -= get_bits_count(gb);
     while (srcsize-- > 0) {
-        state += state + bitstream_read_bit(bc);
+        state += state + get_bits1(gb);
         if ((state & 0xFFF) == 1)
             return 0;
     }
@@ -276,13 +383,14 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
                     enum TiffCompr compr, int opts)
 {
     int j;
-    BitstreamContext bc;
+    GetBitContext gb;
     int *runs, *ref = NULL, *runend;
     int ret;
     int runsize = avctx->width + 2;
+    int has_eol;
 
-    runs = av_malloc(runsize * sizeof(runs[0]));
-    ref  = av_malloc(runsize * sizeof(ref[0]));
+    runs = av_malloc_array(runsize, sizeof(runs[0]));
+    ref  = av_malloc_array(runsize, sizeof(ref[0]));
     if (!runs || !ref) {
         ret = AVERROR(ENOMEM);
         goto fail;
@@ -290,27 +398,31 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
     ref[0] = avctx->width;
     ref[1] = 0;
     ref[2] = 0;
-    bitstream_init8(&bc, src, srcsize);
+    if ((ret = init_get_bits8(&gb, src, srcsize)) < 0)
+        goto fail;
+    has_eol = show_bits(&gb, 12) == 1 || show_bits(&gb, 16) == 1;
+
     for (j = 0; j < height; j++) {
         runend = runs + runsize;
         if (compr == TIFF_G4) {
-            ret = decode_group3_2d_line(avctx, &bc, avctx->width, runs, runend,
+            ret = decode_group3_2d_line(avctx, &gb, avctx->width, runs, runend,
                                         ref);
             if (ret < 0)
                 goto fail;
         } else {
             int g3d1 = (compr == TIFF_G3) && !(opts & 1);
             if (compr != TIFF_CCITT_RLE &&
-                find_group3_syncmarker(&bc, srcsize * 8) < 0)
+                has_eol &&
+                find_group3_syncmarker(&gb, srcsize * 8) < 0)
                 break;
-            if (compr == TIFF_CCITT_RLE || g3d1 || bitstream_read_bit(&bc))
-                ret = decode_group3_1d_line(avctx, &bc, avctx->width, runs,
+            if (compr == TIFF_CCITT_RLE || g3d1 || get_bits1(&gb))
+                ret = decode_group3_1d_line(avctx, &gb, avctx->width, runs,
                                             runend);
             else
-                ret = decode_group3_2d_line(avctx, &bc, avctx->width, runs,
+                ret = decode_group3_2d_line(avctx, &gb, avctx->width, runs,
                                             runend, ref);
             if (compr == TIFF_CCITT_RLE)
-                bitstream_align(&bc);
+                align_get_bits(&gb);
         }
         if (avctx->err_recognition & AV_EF_EXPLODE && ret < 0)
             goto fail;
diff --git a/libavcodec/faxcompr.h b/libavcodec/faxcompr.h
index 0a8b64d..aa29a7b 100644
--- a/libavcodec/faxcompr.h
+++ b/libavcodec/faxcompr.h
@@ -2,20 +2,20 @@
  * CCITT Fax Group 3 and 4 decompression
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c
index f299eae..b9c2c86 100644
--- a/libavcodec/fdctdsp.c
+++ b/libavcodec/fdctdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (avctx->bits_per_raw_sample == 10) {
+    if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
         c->fdct    = ff_jpeg_fdct_islow_10;
         c->fdct248 = ff_fdct248_islow_10;
     } else if (avctx->dct_algo == FF_DCT_FASTINT) {
diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h
index 944dc6d..3e1f683 100644
--- a/libavcodec/fdctdsp.h
+++ b/libavcodec/fdctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ffjni.c b/libavcodec/ffjni.c
new file mode 100644
index 0000000..f5b581f
--- /dev/null
+++ b/libavcodec/ffjni.c
@@ -0,0 +1,418 @@
+/*
+ * JNI utility functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <jni.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "libavutil/bprint.h"
+#include "libavutil/log.h"
+
+#include "config.h"
+#include "jni.h"
+#include "ffjni.h"
+
+static JavaVM *java_vm;
+static pthread_key_t current_env;
+static pthread_once_t once = PTHREAD_ONCE_INIT;
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void jni_detach_env(void *data)
+{
+    if (java_vm) {
+        (*java_vm)->DetachCurrentThread(java_vm);
+    }
+}
+
+static void jni_create_pthread_key(void)
+{
+    pthread_key_create(&current_env, jni_detach_env);
+}
+
+JNIEnv *ff_jni_get_env(void *log_ctx)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    pthread_mutex_lock(&lock);
+    if (java_vm == NULL) {
+        java_vm = av_jni_get_java_vm(log_ctx);
+    }
+
+    if (!java_vm) {
+        av_log(log_ctx, AV_LOG_ERROR, "No Java virtual machine has been registered\n");
+        goto done;
+    }
+
+    pthread_once(&once, jni_create_pthread_key);
+
+    if ((env = pthread_getspecific(current_env)) != NULL) {
+        goto done;
+    }
+
+    ret = (*java_vm)->GetEnv(java_vm, (void **)&env, JNI_VERSION_1_6);
+    switch(ret) {
+    case JNI_EDETACHED:
+        if ((*java_vm)->AttachCurrentThread(java_vm, &env, NULL) != 0) {
+            av_log(log_ctx, AV_LOG_ERROR, "Failed to attach the JNI environment to the current thread\n");
+            env = NULL;
+        } else {
+            pthread_setspecific(current_env, env);
+        }
+        break;
+    case JNI_OK:
+        break;
+    case JNI_EVERSION:
+        av_log(log_ctx, AV_LOG_ERROR, "The specified JNI version is not supported\n");
+        break;
+    default:
+        av_log(log_ctx, AV_LOG_ERROR, "Failed to get the JNI environment attached to this thread\n");
+        break;
+    }
+
+done:
+    pthread_mutex_unlock(&lock);
+    return env;
+}
+
+char *ff_jni_jstring_to_utf_chars(JNIEnv *env, jstring string, void *log_ctx)
+{
+    char *ret = NULL;
+    const char *utf_chars = NULL;
+
+    jboolean copy = 0;
+
+    if (!string) {
+        return NULL;
+    }
+
+    utf_chars = (*env)->GetStringUTFChars(env, string, &copy);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "String.getStringUTFChars() threw an exception\n");
+        return NULL;
+    }
+
+    ret = av_strdup(utf_chars);
+
+    (*env)->ReleaseStringUTFChars(env, string, utf_chars);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "String.releaseStringUTFChars() threw an exception\n");
+        return NULL;
+    }
+
+    return ret;
+}
+
+jstring ff_jni_utf_chars_to_jstring(JNIEnv *env, const char *utf_chars, void *log_ctx)
+{
+    jstring ret;
+
+    ret = (*env)->NewStringUTF(env, utf_chars);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "NewStringUTF() threw an exception\n");
+        return NULL;
+    }
+
+    return ret;
+}
+
+int ff_jni_exception_get_summary(JNIEnv *env, jthrowable exception, char **error, void *log_ctx)
+{
+    int ret = 0;
+
+    AVBPrint bp;
+
+    char *name = NULL;
+    char *message = NULL;
+
+    jclass class_class = NULL;
+    jmethodID get_name_id = NULL;
+
+    jclass exception_class = NULL;
+    jmethodID get_message_id = NULL;
+
+    jstring string = NULL;
+
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
+
+    exception_class = (*env)->GetObjectClass(env, exception);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find Throwable class\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    class_class = (*env)->GetObjectClass(env, exception_class);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find Throwable class's class\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    get_name_id = (*env)->GetMethodID(env, class_class, "getName", "()Ljava/lang/String;");
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find method Class.getName()\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    string = (*env)->CallObjectMethod(env, exception_class, get_name_id);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Class.getName() threw an exception\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (string) {
+        name = ff_jni_jstring_to_utf_chars(env, string, log_ctx);
+        (*env)->DeleteLocalRef(env, string);
+        string = NULL;
+    }
+
+    get_message_id = (*env)->GetMethodID(env, exception_class, "getMessage", "()Ljava/lang/String;");
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find method java/lang/Throwable.getMessage()\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    string = (*env)->CallObjectMethod(env, exception, get_message_id);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Throwable.getMessage() threw an exception\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (string) {
+        message = ff_jni_jstring_to_utf_chars(env, string, log_ctx);
+        (*env)->DeleteLocalRef(env, string);
+        string = NULL;
+    }
+
+    if (name && message) {
+        av_bprintf(&bp, "%s: %s", name, message);
+    } else if (name && !message) {
+        av_bprintf(&bp, "%s occurred", name);
+    } else if (!name && message) {
+        av_bprintf(&bp, "Exception: %s", message);
+    } else {
+        av_log(log_ctx, AV_LOG_WARNING, "Could not retrieve exception name and message\n");
+        av_bprintf(&bp, "Exception occurred");
+    }
+
+    ret = av_bprint_finalize(&bp, error);
+done:
+
+    av_free(name);
+    av_free(message);
+
+    if (class_class) {
+        (*env)->DeleteLocalRef(env, class_class);
+    }
+
+    if (exception_class) {
+        (*env)->DeleteLocalRef(env, exception_class);
+    }
+
+    if (string) {
+        (*env)->DeleteLocalRef(env, string);
+    }
+
+    return ret;
+}
+
+int ff_jni_exception_check(JNIEnv *env, int log, void *log_ctx)
+{
+    int ret;
+
+    jthrowable exception;
+
+    char *message = NULL;
+
+    if (!(*(env))->ExceptionCheck((env))) {
+        return 0;
+    }
+
+    if (!log) {
+        (*(env))->ExceptionClear((env));
+        return -1;
+    }
+
+    exception = (*env)->ExceptionOccurred(env);
+    (*(env))->ExceptionClear((env));
+
+    if ((ret = ff_jni_exception_get_summary(env, exception, &message, log_ctx)) < 0) {
+        (*env)->DeleteLocalRef(env, exception);
+        return ret;
+    }
+
+    (*env)->DeleteLocalRef(env, exception);
+
+    av_log(log_ctx, AV_LOG_ERROR, "%s\n", message);
+    av_free(message);
+
+    return -1;
+}
+
+int ff_jni_init_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx)
+{
+    int i, ret = 0;
+    jclass last_clazz = NULL;
+
+    for (i = 0; jfields_mapping[i].name; i++) {
+        int mandatory = jfields_mapping[i].mandatory;
+        enum FFJniFieldType type = jfields_mapping[i].type;
+
+        if (type == FF_JNI_CLASS) {
+            jclass clazz;
+
+            last_clazz = NULL;
+
+            clazz = (*env)->FindClass(env, jfields_mapping[i].name);
+            if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                goto done;
+            }
+
+            last_clazz = *(jclass*)((uint8_t*)jfields + jfields_mapping[i].offset) =
+                    global ? (*env)->NewGlobalRef(env, clazz) : clazz;
+
+            if (global) {
+                (*env)->DeleteLocalRef(env, clazz);
+            }
+
+        } else {
+
+            if (!last_clazz) {
+                ret = AVERROR_EXTERNAL;
+                break;
+            }
+
+            switch(type) {
+            case FF_JNI_FIELD: {
+                jfieldID field_id = (*env)->GetFieldID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+
+                *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = field_id;
+                break;
+            }
+            case FF_JNI_STATIC_FIELD: {
+                jfieldID field_id = (*env)->GetStaticFieldID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+
+                *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = field_id;
+                break;
+            }
+            case FF_JNI_METHOD: {
+                jmethodID method_id = (*env)->GetMethodID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+
+                *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = method_id;
+                break;
+            }
+            case FF_JNI_STATIC_METHOD: {
+                jmethodID method_id = (*env)->GetStaticMethodID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+
+                *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = method_id;
+                break;
+            }
+            default:
+                av_log(log_ctx, AV_LOG_ERROR, "Unknown JNI field type\n");
+                ret = AVERROR(EINVAL);
+                goto done;
+            }
+
+            ret = 0;
+        }
+    }
+
+done:
+    if (ret < 0) {
+        /* reset jfields in case of failure so it does not leak references */
+        ff_jni_reset_jfields(env, jfields, jfields_mapping, global, log_ctx);
+    }
+
+    return ret;
+}
+
+int ff_jni_reset_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx)
+{
+    int i;
+
+    for (i = 0; jfields_mapping[i].name; i++) {
+        enum FFJniFieldType type = jfields_mapping[i].type;
+
+        switch(type) {
+        case FF_JNI_CLASS: {
+            jclass clazz = *(jclass*)((uint8_t*)jfields + jfields_mapping[i].offset);
+            if (!clazz)
+                continue;
+
+            if (global) {
+                (*env)->DeleteGlobalRef(env, clazz);
+            } else {
+                (*env)->DeleteLocalRef(env, clazz);
+            }
+
+            *(jclass*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_FIELD: {
+            *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_STATIC_FIELD: {
+            *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_METHOD: {
+            *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_STATIC_METHOD: {
+            *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        default:
+            av_log(log_ctx, AV_LOG_ERROR, "Unknown JNI field type\n");
+        }
+    }
+
+    return 0;
+}
diff --git a/libavcodec/ffjni.h b/libavcodec/ffjni.h
new file mode 100644
index 0000000..6027bac
--- /dev/null
+++ b/libavcodec/ffjni.h
@@ -0,0 +1,145 @@
+/*
+ * JNI utility functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FFJNI_H
+#define AVCODEC_FFJNI_H
+
+#include <jni.h>
+
+/*
+ * Attach permanently a JNI environment to the current thread and retrieve it.
+ *
+ * If successfully attached, the JNI environment will automatically be detached
+ * at thread destruction.
+ *
+ * @param attached pointer to an integer that will be set to 1 if the
+ * environment has been attached to the current thread or 0 if it is
+ * already attached.
+ * @param log_ctx context used for logging, can be NULL
+ * @return the JNI environment on success, NULL otherwise
+ */
+JNIEnv *ff_jni_get_env(void *log_ctx);
+
+/*
+ * Convert a jstring to its utf characters equivalent.
+ *
+ * @param env JNI environment
+ * @param string Java string to convert
+ * @param log_ctx context used for logging, can be NULL
+ * @return a pointer to an array of unicode characters on success, NULL
+ * otherwise
+ */
+char *ff_jni_jstring_to_utf_chars(JNIEnv *env, jstring string, void *log_ctx);
+
+/*
+ * Convert utf chars to its jstring equivalent.
+ *
+ * @param env JNI environment
+ * @param utf_chars a pointer to an array of unicode characters
+ * @param log_ctx context used for logging, can be NULL
+ * @return a Java string object on success, NULL otherwise
+ */
+jstring ff_jni_utf_chars_to_jstring(JNIEnv *env, const char *utf_chars, void *log_ctx);
+
+/*
+ * Extract the error summary from a jthrowable in the form of "className: errorMessage"
+ *
+ * @param env JNI environment
+ * @param exception exception to get the summary from
+ * @param error address pointing to the error, the value is updated if a
+ * summary can be extracted
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_exception_get_summary(JNIEnv *env, jthrowable exception, char **error, void *log_ctx);
+
+/*
+ * Check if an exception has occurred,log it using av_log and clear it.
+ *
+ * @param env JNI environment
+ * @param log value used to enable logging if an exception has occurred,
+ * 0 disables logging, != 0 enables logging
+ * @param log_ctx context used for logging, can be NULL
+ */
+int ff_jni_exception_check(JNIEnv *env, int log, void *log_ctx);
+
+/*
+ * Jni field type.
+ */
+enum FFJniFieldType {
+
+    FF_JNI_CLASS,
+    FF_JNI_FIELD,
+    FF_JNI_STATIC_FIELD,
+    FF_JNI_METHOD,
+    FF_JNI_STATIC_METHOD
+
+};
+
+/*
+ * Jni field describing a class, a field or a method to be retrieved using
+ * the ff_jni_init_jfields method.
+ */
+struct FFJniField {
+
+    const char *name;
+    const char *method;
+    const char *signature;
+    enum FFJniFieldType type;
+    int offset;
+    int mandatory;
+
+};
+
+/*
+ * Retrieve class references, field ids and method ids to an arbitrary structure.
+ *
+ * @param env JNI environment
+ * @param jfields a pointer to an arbitrary structure where the different
+ * fields are declared and where the FFJNIField mapping table offsets are
+ * pointing to
+ * @param jfields_mapping null terminated array of FFJNIFields describing
+ * the class/field/method to be retrieved
+ * @param global make the classes references global. It is the caller
+ * responsibility to properly release global references.
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_init_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx);
+
+/*
+ * Delete class references, field ids and method ids of an arbitrary structure.
+ *
+ * @param env JNI environment
+ * @param jfields a pointer to an arbitrary structure where the different
+ * fields are declared and where the FFJNIField mapping table offsets are
+ * pointing to
+ * @param jfields_mapping null terminated array of FFJNIFields describing
+ * the class/field/method to be deleted
+ * @param global threat the classes references as global and delete them
+ * accordingly
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_reset_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx);
+
+#endif /* AVCODEC_FFJNI_H */
diff --git a/libavcodec/fft-internal.h b/libavcodec/fft-internal.h
index a449ec0..0a8f7d0 100644
--- a/libavcodec/fft-internal.h
+++ b/libavcodec/fft-internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,12 +36,29 @@
 
 #else
 
+#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits)))
+
+#if FFT_FIXED_32
+
+#define CMUL(dre, dim, are, aim, bre, bim) do {             \
+        int64_t accu;                                     \
+        (accu)  = (int64_t)(bre) * (are);                 \
+        (accu) -= (int64_t)(bim) * (aim);                 \
+        (dre)   = (int)(((accu) + 0x40000000) >> 31);       \
+        (accu)  = (int64_t)(bre) * (aim);                 \
+        (accu) += (int64_t)(bim) * (are);                 \
+        (dim)   = (int)(((accu) + 0x40000000) >> 31);       \
+    } while (0)
+
+#define FIX15(a) av_clip(SCALE_FLOAT(a, 31), -2147483647, 2147483647)
+
+#else /* FFT_FIXED_32 */
+
 #include "fft.h"
 #include "mathops.h"
 
 void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input);
 
-#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits)))
 #define FIX15(a) av_clip(SCALE_FLOAT(a, 15), -32767, 32767)
 
 #define sqrthalf ((int16_t)((1<<15)*M_SQRT1_2))
@@ -62,6 +79,8 @@ void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input);
 #define CMULL(dre, dim, are, aim, bre, bim)     \
     CMULS(dre, dim, are, aim, bre, bim, 0)
 
+#endif /* FFT_FIXED_32 */
+
 #endif /* FFT_FLOAT */
 
 #define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c)
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index 57dc17f..c858570 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -2,20 +2,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,10 @@
 #define FFT_FLOAT 1
 #endif
 
+#ifndef FFT_FIXED_32
+#define FFT_FIXED_32 0
+#endif
+
 #include <stdint.h>
 #include "config.h"
 #include "libavutil/mem.h"
@@ -40,15 +44,26 @@ typedef float FFTDouble;
 
 #else
 
+#if FFT_FIXED_32
+
+#define Q31(x) (int)((x)*2147483648.0 + 0.5)
+#define FFT_NAME(x) x ## _fixed_32
+
+typedef int32_t FFTSample;
+
+#else /* FFT_FIXED_32 */
+
 #define FFT_NAME(x) x ## _fixed
 
 typedef int16_t FFTSample;
-typedef int     FFTDouble;
+
+#endif /* FFT_FIXED_32 */
 
 typedef struct FFTComplex {
-    int16_t re, im;
+    FFTSample re, im;
 } FFTComplex;
 
+typedef int    FFTDouble;
 typedef struct FFTContext FFTContext;
 
 #endif /* FFT_FLOAT */
@@ -95,6 +110,7 @@ struct FFTContext {
     void (*mdct_calcw)(struct FFTContext *s, FFTDouble *output, const FFTSample *input);
     enum fft_permutation_type fft_permutation;
     enum mdct_permutation_type mdct_permutation;
+    uint32_t *revtab32;
 };
 
 #if CONFIG_HARDCODED_TABLES
@@ -119,7 +135,8 @@ extern COSTABLE(8192);
 extern COSTABLE(16384);
 extern COSTABLE(32768);
 extern COSTABLE(65536);
-extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[17];
+extern COSTABLE(131072);
+extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[18];
 
 #define ff_init_ff_cos_tabs FFT_NAME(ff_init_ff_cos_tabs)
 
@@ -142,6 +159,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse);
 void ff_fft_init_aarch64(FFTContext *s);
 void ff_fft_init_x86(FFTContext *s);
 void ff_fft_init_arm(FFTContext *s);
+void ff_fft_init_mips(FFTContext *s);
 void ff_fft_init_ppc(FFTContext *s);
 
 void ff_fft_fixed_init_arm(FFTContext *s);
@@ -154,11 +172,4 @@ void ff_fft_end(FFTContext *s);
 int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
 void ff_mdct_end(FFTContext *s);
 
-void ff_mdct_init_aarch64(FFTContext *s);
-void ff_mdct_init_arm(FFTContext *s);
-void ff_mdct_init_ppc(FFTContext *s);
-void ff_mdct_init_x86(FFTContext *s);
-
-void ff_mdct_fixed_init_arm(FFTContext *s);
-
 #endif /* AVCODEC_FFT_H */
diff --git a/libavcodec/fft_fixed.c b/libavcodec/fft_fixed.c
index bad4821..3d3bd2f 100644
--- a/libavcodec/fft_fixed.c
+++ b/libavcodec/fft_fixed.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define FFT_FIXED_32 0
 #include "fft_template.c"
diff --git a/libavcodec/fft_fixed_32.c b/libavcodec/fft_fixed_32.c
new file mode 100644
index 0000000..fbdbf84
--- /dev/null
+++ b/libavcodec/fft_fixed_32.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "fft_template.c"
diff --git a/libavcodec/fft_float.c b/libavcodec/fft_float.c
index ed4cffa..73cc98d 100644
--- a/libavcodec/fft_float.c
+++ b/libavcodec/fft_float.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 1
+#define FFT_FIXED_32 0
 #include "fft_template.c"
diff --git a/libavcodec/fft_init_table.c b/libavcodec/fft_init_table.c
new file mode 100644
index 0000000..c488018
--- /dev/null
+++ b/libavcodec/fft_init_table.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and initialization of LUT table for FFT
+ */
+#include "libavcodec/fft_table.h"
+
+const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = {
+2147483647, 2147483016, 2147481121, 2147477963, 2147473542, 2147467857, 2147460908, 2147452697,
+2147443222, 2147432484, 2147420483, 2147407218, 2147392690, 2147376899, 2147359845, 2147341527,
+2147321946, 2147301102, 2147278995, 2147255625, 2147230991, 2147205094, 2147177934, 2147149511,
+2147119825, 2147088876, 2147056664, 2147023188, 2146988450, 2146952448, 2146915184, 2146876656,
+2146836866, 2146795813, 2146753497, 2146709917, 2146665076, 2146618971, 2146571603, 2146522973,
+2146473080, 2146421924, 2146369505, 2146315824, 2146260881, 2146204674, 2146147205, 2146088474,
+2146028480, 2145967224, 2145904705, 2145840924, 2145775880, 2145709574, 2145642006, 2145573176,
+2145503083, 2145431729, 2145359112, 2145285233, 2145210092, 2145133690, 2145056025, 2144977098,
+2144896910, 2144815460, 2144732748, 2144648774, 2144563539, 2144477042, 2144389283, 2144300264,
+2144209982, 2144118439, 2144025635, 2143931570, 2143836244, 2143739656, 2143641807, 2143542697,
+2143442326, 2143340694, 2143237802, 2143133648, 2143028234, 2142921559, 2142813624, 2142704427,
+2142593971, 2142482254, 2142369276, 2142255039, 2142139541, 2142022783, 2141904764, 2141785486,
+2141664948, 2141543150, 2141420092, 2141295774, 2141170197, 2141043360, 2140915264, 2140785908,
+2140655293, 2140523418, 2140390284, 2140255892, 2140120240, 2139983329, 2139845159, 2139705730,
+2139565043, 2139423097, 2139279892, 2139135429, 2138989708, 2138842728, 2138694490, 2138544994,
+2138394240, 2138242228, 2138088958, 2137934430, 2137778644, 2137621601, 2137463301, 2137303743,
+2137142927, 2136980855, 2136817525, 2136652938, 2136487095, 2136319994, 2136151637, 2135982023,
+2135811153, 2135639026, 2135465642, 2135291003, 2135115107, 2134937956, 2134759548, 2134579885,
+2134398966, 2134216791, 2134033361, 2133848675, 2133662734, 2133475538, 2133287087, 2133097381,
+2132906420, 2132714204, 2132520734, 2132326009, 2132130030, 2131932796, 2131734309, 2131534567,
+2131333572, 2131131322, 2130927819, 2130723062, 2130517052, 2130309789, 2130101272, 2129891502,
+2129680480, 2129468204, 2129254676, 2129039895, 2128823862, 2128606576, 2128388038, 2128168248,
+2127947206, 2127724913, 2127501367, 2127276570, 2127050522, 2126823222, 2126594672, 2126364870,
+2126133817, 2125901514, 2125667960, 2125433155, 2125197100, 2124959795, 2124721240, 2124481435,
+2124240380, 2123998076, 2123754522, 2123509718, 2123263666, 2123016364, 2122767814, 2122518015,
+2122266967, 2122014670, 2121761126, 2121506333, 2121250292, 2120993003, 2120734467, 2120474683,
+2120213651, 2119951372, 2119687847, 2119423074, 2119157054, 2118889788, 2118621275, 2118351516,
+2118080511, 2117808259, 2117534762, 2117260020, 2116984031, 2116706797, 2116428319, 2116148595,
+2115867626, 2115585412, 2115301954, 2115017252, 2114731305, 2114444114, 2114155680, 2113866001,
+2113575080, 2113282914, 2112989506, 2112694855, 2112398960, 2112101824, 2111803444, 2111503822,
+2111202959, 2110900853, 2110597505, 2110292916, 2109987085, 2109680013, 2109371700, 2109062146,
+2108751352, 2108439317, 2108126041, 2107811526, 2107495770, 2107178775, 2106860540, 2106541065,
+2106220352, 2105898399, 2105575208, 2105250778, 2104925109, 2104598202, 2104270057, 2103940674,
+2103610054, 2103278196, 2102945101, 2102610768, 2102275199, 2101938393, 2101600350, 2101261071,
+2100920556, 2100578805, 2100235819, 2099891596, 2099546139, 2099199446, 2098851519, 2098502357,
+2098151960, 2097800329, 2097447464, 2097093365, 2096738032, 2096381466, 2096023667, 2095664635,
+2095304370, 2094942872, 2094580142, 2094216179, 2093850985, 2093484559, 2093116901, 2092748012,
+2092377892, 2092006541, 2091633960, 2091260147, 2090885105, 2090508833, 2090131331, 2089752599,
+2089372638, 2088991448, 2088609029, 2088225381, 2087840505, 2087454400, 2087067068, 2086678508,
+2086288720, 2085897705, 2085505463, 2085111994, 2084717298, 2084321376, 2083924228, 2083525854,
+2083126254, 2082725429, 2082323379, 2081920103, 2081515603, 2081109879, 2080702930, 2080294757,
+2079885360, 2079474740, 2079062896, 2078649830, 2078235540, 2077820028, 2077403294, 2076985338,
+2076566160, 2076145760, 2075724139, 2075301296, 2074877233, 2074451950, 2074025446, 2073597721,
+2073168777, 2072738614, 2072307231, 2071874629, 2071440808, 2071005769, 2070569511, 2070132035,
+2069693342, 2069253430, 2068812302, 2068369957, 2067926394, 2067481616, 2067035621, 2066588410,
+2066139983, 2065690341, 2065239484, 2064787411, 2064334124, 2063879623, 2063423908, 2062966978,
+2062508835, 2062049479, 2061588910, 2061127128, 2060664133, 2060199927, 2059734508, 2059267877,
+2058800036, 2058330983, 2057860719, 2057389244, 2056916560, 2056442665, 2055967560, 2055491246,
+2055013723, 2054534991, 2054055050, 2053573901, 2053091544, 2052607979, 2052123207, 2051637227,
+2051150040, 2050661647, 2050172048, 2049681242, 2049189231, 2048696014, 2048201592, 2047705965,
+2047209133, 2046711097, 2046211857, 2045711414, 2045209767, 2044706916, 2044202863, 2043697608,
+2043191150, 2042683490, 2042174628, 2041664565, 2041153301, 2040640837, 2040127172, 2039612306,
+2039096241, 2038578976, 2038060512, 2037540850, 2037019988, 2036497928, 2035974670, 2035450215,
+2034924562, 2034397712, 2033869665, 2033340422, 2032809982, 2032278347, 2031745516, 2031211490,
+2030676269, 2030139853, 2029602243, 2029063439, 2028523442, 2027982251, 2027439867, 2026896291,
+2026351522, 2025805561, 2025258408, 2024710064, 2024160529, 2023609803, 2023057887, 2022504780,
+2021950484, 2021394998, 2020838323, 2020280460, 2019721407, 2019161167, 2018599739, 2018037123,
+2017473321, 2016908331, 2016342155, 2015774793, 2015206245, 2014636511, 2014065592, 2013493489,
+2012920201, 2012345729, 2011770073, 2011193233, 2010615210, 2010036005, 2009455617, 2008874047,
+2008291295, 2007707362, 2007122248, 2006535953, 2005948478, 2005359822, 2004769987, 2004178973,
+2003586779, 2002993407, 2002398857, 2001803128, 2001206222, 2000608139, 2000008879, 1999408442,
+1998806829, 1998204040, 1997600076, 1996994937, 1996388622, 1995781134, 1995172471, 1994562635,
+1993951625, 1993339442, 1992726087, 1992111559, 1991495860, 1990878989, 1990260946, 1989641733,
+1989021350, 1988399796, 1987777073, 1987153180, 1986528118, 1985901888, 1985274489, 1984645923,
+1984016189, 1983385288, 1982753220, 1982119985, 1981485585, 1980850019, 1980213288, 1979575392,
+1978936331, 1978296106, 1977654717, 1977012165, 1976368450, 1975723572, 1975077532, 1974430331,
+1973781967, 1973132443, 1972481757, 1971829912, 1971176906, 1970522741, 1969867417, 1969210933,
+1968553292, 1967894492, 1967234535, 1966573420, 1965911148, 1965247720, 1964583136, 1963917396,
+1963250501, 1962582451, 1961913246, 1961242888, 1960571375, 1959898709, 1959224890, 1958549919,
+1957873796, 1957196520, 1956518093, 1955838516, 1955157788, 1954475909, 1953792881, 1953108703,
+1952423377, 1951736902, 1951049279, 1950360508, 1949670589, 1948979524, 1948287312, 1947593954,
+1946899451, 1946203802, 1945507008, 1944809070, 1944109987, 1943409761, 1942708392, 1942005880,
+1941302225, 1940597428, 1939891490, 1939184411, 1938476190, 1937766830, 1937056329, 1936344689,
+1935631910, 1934917992, 1934202936, 1933486742, 1932769411, 1932050943, 1931331338, 1930610597,
+1929888720, 1929165708, 1928441561, 1927716279, 1926989864, 1926262315, 1925533633, 1924803818,
+1924072871, 1923340791, 1922607581, 1921873239, 1921137767, 1920401165, 1919663432, 1918924571,
+1918184581, 1917443462, 1916701216, 1915957841, 1915213340, 1914467712, 1913720958, 1912973078,
+1912224073, 1911473942, 1910722688, 1909970309, 1909216806, 1908462181, 1907706433, 1906949562,
+1906191570, 1905432457, 1904672222, 1903910867, 1903148392, 1902384797, 1901620084, 1900854251,
+1900087301, 1899319232, 1898550047, 1897779744, 1897008325, 1896235790, 1895462140, 1894687374,
+1893911494, 1893134500, 1892356392, 1891577171, 1890796837, 1890015391, 1889232832, 1888449163,
+1887664383, 1886878492, 1886091491, 1885303381, 1884514161, 1883723833, 1882932397, 1882139853,
+1881346202, 1880551444, 1879755580, 1878958610, 1878160535, 1877361354, 1876561070, 1875759681,
+1874957189, 1874153594, 1873348897, 1872543097, 1871736196, 1870928194, 1870119091, 1869308888,
+1868497586, 1867685184, 1866871683, 1866057085, 1865241388, 1864424594, 1863606704, 1862787717,
+1861967634, 1861146456, 1860324183, 1859500816, 1858676355, 1857850800, 1857024153, 1856196413,
+1855367581, 1854537657, 1853706643, 1852874538, 1852041343, 1851207059, 1850371686, 1849535224,
+1848697674, 1847859036, 1847019312, 1846178501, 1845336604, 1844493621, 1843649553, 1842804401,
+1841958164, 1841110844, 1840262441, 1839412956, 1838562388, 1837710739, 1836858008, 1836004197,
+1835149306, 1834293336, 1833436286, 1832578158, 1831718951, 1830858668, 1829997307, 1829134869,
+1828271356, 1827406767, 1826541103, 1825674364, 1824806552, 1823937666, 1823067707, 1822196675,
+1821324572, 1820451397, 1819577151, 1818701835, 1817825449, 1816947994, 1816069469, 1815189877,
+1814309216, 1813427489, 1812544694, 1811660833, 1810775906, 1809889915, 1809002858, 1808114737,
+1807225553, 1806335305, 1805443995, 1804551623, 1803658189, 1802763694, 1801868139, 1800971523,
+1800073849, 1799175115, 1798275323, 1797374472, 1796472565, 1795569601, 1794665580, 1793760504,
+1792854372, 1791947186, 1791038946, 1790129652, 1789219305, 1788307905, 1787395453, 1786481950,
+1785567396, 1784651792, 1783735137, 1782817434, 1781898681, 1780978881, 1780058032, 1779136137,
+1778213194, 1777289206, 1776364172, 1775438094, 1774510970, 1773582803, 1772653593, 1771723340,
+1770792044, 1769859707, 1768926328, 1767991909, 1767056450, 1766119952, 1765182414, 1764243838,
+1763304224, 1762363573, 1761421885, 1760479161, 1759535401, 1758590607, 1757644777, 1756697914,
+1755750017, 1754801087, 1753851126, 1752900132, 1751948107, 1750995052, 1750040966, 1749085851,
+1748129707, 1747172535, 1746214334, 1745255107, 1744294853, 1743333573, 1742371267, 1741407936,
+1740443581, 1739478202, 1738511799, 1737544374, 1736575927, 1735606458, 1734635968, 1733664458,
+1732691928, 1731718378, 1730743810, 1729768224, 1728791620, 1727813999, 1726835361, 1725855708,
+1724875040, 1723893357, 1722910659, 1721926948, 1720942225, 1719956488, 1718969740, 1717981981,
+1716993211, 1716003431, 1715012642, 1714020844, 1713028037, 1712034223, 1711039401, 1710043573,
+1709046739, 1708048900, 1707050055, 1706050207, 1705049355, 1704047500, 1703044642, 1702040783,
+1701035922, 1700030061, 1699023199, 1698015339, 1697006479, 1695996621, 1694985765, 1693973912,
+1692961062, 1691947217, 1690932376, 1689916541, 1688899711, 1687881888, 1686863072, 1685843263,
+1684822463, 1683800672, 1682777890, 1681754118, 1680729357, 1679703608, 1678676870, 1677649144,
+1676620432, 1675590733, 1674560049, 1673528379, 1672495725, 1671462087, 1670427466, 1669391862,
+1668355276, 1667317709, 1666279161, 1665239632, 1664199124, 1663157637, 1662115172, 1661071729,
+1660027308, 1658981911, 1657935539, 1656888190, 1655839867, 1654790570, 1653740300, 1652689057,
+1651636841, 1650583654, 1649529496, 1648474367, 1647418269, 1646361202, 1645303166, 1644244162,
+1643184191, 1642123253, 1641061349, 1639998480, 1638934646, 1637869848, 1636804087, 1635737362,
+1634669676, 1633601027, 1632531418, 1631460848, 1630389319, 1629316830, 1628243383, 1627168978,
+1626093616, 1625017297, 1623940023, 1622861793, 1621782608, 1620702469, 1619621377, 1618539332,
+1617456335, 1616372386, 1615287487, 1614201637, 1613114838, 1612027089, 1610938393, 1609848749,
+1608758157, 1607666620, 1606574136, 1605480708, 1604386335, 1603291018, 1602194758, 1601097555,
+1599999411, 1598900325, 1597800299, 1596699333, 1595597428, 1594494583, 1593390801, 1592286082,
+1591180426, 1590073833, 1588966306, 1587857843, 1586748447, 1585638117, 1584526854, 1583414660,
+1582301533, 1581187476, 1580072489, 1578956572, 1577839726, 1576721952, 1575603251, 1574483623,
+1573363068, 1572241588, 1571119183, 1569995854, 1568871601, 1567746425, 1566620327, 1565493307,
+1564365367, 1563236506, 1562106725, 1560976026, 1559844408, 1558711873, 1557578421, 1556444052,
+1555308768, 1554172569, 1553035455, 1551897428, 1550758488, 1549618636, 1548477872, 1547336197,
+1546193612, 1545050118, 1543905714, 1542760402, 1541614183, 1540467057, 1539319024, 1538170087,
+1537020244, 1535869497, 1534717846, 1533565293, 1532411837, 1531257480, 1530102222, 1528946064,
+1527789007, 1526631051, 1525472197, 1524312445, 1523151797, 1521990252, 1520827813, 1519664478,
+1518500250, 1517335128, 1516169114, 1515002208, 1513834411, 1512665723, 1511496145, 1510325678,
+1509154322, 1507982079, 1506808949, 1505634932, 1504460029, 1503284242, 1502107570, 1500930014,
+1499751576, 1498572255, 1497392053, 1496210969, 1495029006, 1493846163, 1492662441, 1491477842,
+1490292364, 1489106011, 1487918781, 1486730675, 1485541696, 1484351842, 1483161115, 1481969516,
+1480777044, 1479583702, 1478389489, 1477194407, 1475998456, 1474801636, 1473603949, 1472405394,
+1471205974, 1470005688, 1468804538, 1467602523, 1466399645, 1465195904, 1463991302, 1462785838,
+1461579514, 1460372329, 1459164286, 1457955385, 1456745625, 1455535009, 1454323536, 1453111208,
+1451898025, 1450683988, 1449469098, 1448253355, 1447036760, 1445819314, 1444601017, 1443381870,
+1442161874, 1440941030, 1439719338, 1438496799, 1437273414, 1436049184, 1434824109, 1433598189,
+1432371426, 1431143821, 1429915374, 1428686085, 1427455956, 1426224988, 1424993180, 1423760534,
+1422527051, 1421292730, 1420057574, 1418821582, 1417584755, 1416347095, 1415108601, 1413869275,
+1412629117, 1411388129, 1410146309, 1408903661, 1407660183, 1406415878, 1405170745, 1403924785,
+1402678000, 1401430389, 1400181954, 1398932695, 1397682613, 1396431709, 1395179984, 1393927438,
+1392674072, 1391419886, 1390164882, 1388909060, 1387652422, 1386394966, 1385136696, 1383877610,
+1382617710, 1381356997, 1380095472, 1378833134, 1377569986, 1376306026, 1375041258, 1373775680,
+1372509294, 1371242101, 1369974101, 1368705296, 1367435685, 1366165269, 1364894050, 1363622028,
+1362349204, 1361075579, 1359801152, 1358525926, 1357249901, 1355973077, 1354695455, 1353417037,
+1352137822, 1350857812, 1349577007, 1348295409, 1347013017, 1345729833, 1344445857, 1343161090,
+1341875533, 1340589187, 1339302052, 1338014129, 1336725419, 1335435923, 1334145641, 1332854574,
+1331562723, 1330270089, 1328976672, 1327682474, 1326387494, 1325091734, 1323795195, 1322497877,
+1321199781, 1319900907, 1318601257, 1317300832, 1315999631, 1314697657, 1313394909, 1312091388,
+1310787095, 1309482032, 1308176198, 1306869594, 1305562222, 1304254082, 1302945174, 1301635500,
+1300325060, 1299013855, 1297701886, 1296389154, 1295075659, 1293761402, 1292446384, 1291130606,
+1289814068, 1288496772, 1287178717, 1285859905, 1284540337, 1283220013, 1281898935, 1280577102,
+1279254516, 1277931177, 1276607086, 1275282245, 1273956653, 1272630312, 1271303222, 1269975384,
+1268646800, 1267317469, 1265987392, 1264656571, 1263325005, 1261992697, 1260659646, 1259325853,
+1257991320, 1256656047, 1255320034, 1253983283, 1252645794, 1251307568, 1249968606, 1248628909,
+1247288478, 1245947312, 1244605414, 1243262783, 1241919421, 1240575329, 1239230506, 1237884955,
+1236538675, 1235191668, 1233843935, 1232495475, 1231146291, 1229796382, 1228445750, 1227094395,
+1225742318, 1224389521, 1223036002, 1221681765, 1220326809, 1218971135, 1217614743, 1216257636,
+1214899813, 1213541275, 1212182024, 1210822059, 1209461382, 1208099993, 1206737894, 1205375085,
+1204011567, 1202647340, 1201282407, 1199916766, 1198550419, 1197183368, 1195815612, 1194447153,
+1193077991, 1191708127, 1190337562, 1188966297, 1187594332, 1186221669, 1184848308, 1183474250,
+1182099496, 1180724046, 1179347902, 1177971064, 1176593533, 1175215310, 1173836395, 1172456790,
+1171076495, 1169695512, 1168313840, 1166931481, 1165548435, 1164164704, 1162780288, 1161395188,
+1160009405, 1158622939, 1157235792, 1155847964, 1154459456, 1153070269, 1151680403, 1150289860,
+1148898640, 1147506745, 1146114174, 1144720929, 1143327011, 1141932420, 1140537158, 1139141224,
+1137744621, 1136347348, 1134949406, 1133550797, 1132151521, 1130751579, 1129350972, 1127949701,
+1126547765, 1125145168, 1123741908, 1122337987, 1120933406, 1119528166, 1118122267, 1116715710,
+1115308496, 1113900627, 1112492101, 1111082922, 1109673089, 1108262603, 1106851465, 1105439676,
+1104027237, 1102614148, 1101200410, 1099786025, 1098370993, 1096955314, 1095538991, 1094122023,
+1092704411, 1091286156, 1089867259, 1088447722, 1087027544, 1085606726, 1084185270, 1082763176,
+1081340445, 1079917078, 1078493076, 1077068439, 1075643169, 1074217266, 1072790730, 1071363564,
+1069935768, 1068507342, 1067078288, 1065648605, 1064218296, 1062787361, 1061355801, 1059923616,
+1058490808, 1057057377, 1055623324, 1054188651, 1052753357, 1051317443, 1049880912, 1048443763,
+1047005996, 1045567615, 1044128617, 1042689006, 1041248781, 1039807944, 1038366495, 1036924436,
+1035481766, 1034038487, 1032594600, 1031150105, 1029705004, 1028259297, 1026812985, 1025366069,
+1023918550, 1022470428, 1021021705, 1019572382, 1018122458, 1016671936, 1015220816, 1013769098,
+1012316784, 1010863875, 1009410370, 1007956272, 1006501581, 1005046298, 1003590424, 1002133959,
+1000676905, 999219262, 997761031, 996302214, 994842810, 993382821, 991922248, 990461091,
+988999351, 987537030, 986074127, 984610645, 983146583, 981681943, 980216726, 978750932,
+977284562, 975817617, 974350098, 972882006, 971413342, 969944106, 968474300, 967003923,
+965532978, 964061465, 962589385, 961116739, 959643527, 958169751, 956695411, 955220508,
+953745043, 952269017, 950792431, 949315286, 947837582, 946359321, 944880503, 943401129,
+941921200, 940440717, 938959681, 937478092, 935995952, 934513261, 933030021, 931546231,
+930061894, 928577010, 927091579, 925605603, 924119082, 922632018, 921144411, 919656262,
+918167572, 916678342, 915188572, 913698265, 912207419, 910716038, 909224120, 907731667,
+906238681, 904745161, 903251110, 901756526, 900261413, 898765769, 897269597, 895772898,
+894275671, 892777918, 891279640, 889780838, 888281512, 886781663, 885281293, 883780402,
+882278992, 880777062, 879274614, 877771649, 876268167, 874764170, 873259659, 871754633,
+870249095, 868743045, 867236484, 865729413, 864221832, 862713743, 861205147, 859696043,
+858186435, 856676321, 855165703, 853654582, 852142959, 850630835, 849118210, 847605086,
+846091463, 844577343, 843062726, 841547612, 840032004, 838515901, 836999305, 835482217,
+833964638, 832446567, 830928007, 829408958, 827889422, 826369398, 824848888, 823327893,
+821806413, 820284450, 818762005, 817239078, 815715670, 814191782, 812667415, 811142571,
+809617249, 808091450, 806565177, 805038429, 803511207, 801983513, 800455346, 798926709,
+797397602, 795868026, 794337982, 792807470, 791276492, 789745049, 788213141, 786680769,
+785147934, 783614638, 782080880, 780546663, 779011986, 777476851, 775941259, 774405210,
+772868706, 771331747, 769794334, 768256469, 766718151, 765179382, 763640164, 762100496,
+760560380, 759019816, 757478806, 755937350, 754395449, 752853105, 751310318, 749767089,
+748223418, 746679308, 745134758, 743589770, 742044345, 740498483, 738952186, 737405453,
+735858287, 734310688, 732762657, 731214195, 729665303, 728115982, 726566232, 725016055,
+723465451, 721914422, 720362968, 718811090, 717258790, 715706067, 714152924, 712599360,
+711045377, 709490976, 707936158, 706380923, 704825272, 703269207, 701712728, 700155836,
+698598533, 697040818, 695482694, 693924160, 692365218, 690805869, 689246113, 687685952,
+686125387, 684564417, 683003045, 681441272, 679879097, 678316522, 676753549, 675190177,
+673626408, 672062243, 670497682, 668932727, 667367379, 665801638, 664235505, 662668981,
+661102068, 659534766, 657967075, 656398998, 654830535, 653261686, 651692453, 650122837,
+648552838, 646982457, 645411696, 643840556, 642269036, 640697139, 639124865, 637552215,
+635979190, 634405791, 632832018, 631257873, 629683357, 628108471, 626533215, 624957590,
+623381598, 621805239, 620228514, 618651424, 617073971, 615496154, 613917975, 612339436,
+610760536, 609181276, 607601658, 606021683, 604441352, 602860664, 601279623, 599698227,
+598116479, 596534378, 594951927, 593369126, 591785976, 590202477, 588618632, 587034440,
+585449903, 583865021, 582279796, 580694229, 579108320, 577522070, 575935480, 574348552,
+572761285, 571173682, 569585743, 567997469, 566408860, 564819919, 563230645, 561641039,
+560051104, 558460839, 556870245, 555279324, 553688076, 552096502, 550504604, 548912382,
+547319836, 545726969, 544133781, 542540273, 540946445, 539352300, 537757837, 536163058,
+534567963, 532972554, 531376831, 529780796, 528184449, 526587791, 524990824, 523393547,
+521795963, 520198072, 518599875, 517001373, 515402566, 513803457, 512204045, 510604332,
+509004318, 507404005, 505803394, 504202485, 502601279, 500999778, 499397982, 497795892,
+496193509, 494590835, 492987869, 491384614, 489781069, 488177236, 486573117, 484968710,
+483364019, 481759043, 480153784, 478548243, 476942419, 475336316, 473729932, 472123270,
+470516330, 468909114, 467301622, 465693854, 464085813, 462477499, 460868912, 459260055,
+457650927, 456041530, 454431865, 452821933, 451211734, 449601270, 447990541, 446379549,
+444768294, 443156777, 441545000, 439932963, 438320667, 436708113, 435095303, 433482236,
+431868915, 430255339, 428641511, 427027430, 425413098, 423798515, 422183684, 420568604,
+418953276, 417337703, 415721883, 414105819, 412489512, 410872962, 409256170, 407639137,
+406021865, 404404353, 402786604, 401168618, 399550396, 397931939, 396313247, 394694323,
+393075166, 391455778, 389836160, 388216313, 386596237, 384975934, 383355404, 381734649,
+380113669, 378492466, 376871039, 375249392, 373627523, 372005435, 370383128, 368760603,
+367137861, 365514903, 363891730, 362268343, 360644742, 359020930, 357396906, 355772673,
+354148230, 352523578, 350898719, 349273654, 347648383, 346022908, 344397230, 342771348,
+341145265, 339518981, 337892498, 336265816, 334638936, 333011859, 331384586, 329757119,
+328129457, 326501602, 324873555, 323245317, 321616889, 319988272, 318359466, 316730474,
+315101295, 313471930, 311842381, 310212649, 308582734, 306952638, 305322361, 303691904,
+302061269, 300430456, 298799466, 297168301, 295536961, 293905447, 292273760, 290641901,
+289009871, 287377671, 285745302, 284112765, 282480061, 280847190, 279214155, 277580955,
+275947592, 274314066, 272680379, 271046532, 269412525, 267778360, 266144038, 264509558,
+262874923, 261240134, 259605191, 257970095, 256334847, 254699448, 253063900, 251428203,
+249792358, 248156366, 246520228, 244883945, 243247518, 241610947, 239974235, 238337382,
+236700388, 235063255, 233425984, 231788575, 230151030, 228513350, 226875535, 225237587,
+223599506, 221961294, 220322951, 218684479, 217045878, 215407149, 213768293, 212129312,
+210490206, 208850976, 207211624, 205572149, 203932553, 202292838, 200653003, 199013051,
+197372981, 195732795, 194092495, 192452080, 190811551, 189170911, 187530159, 185889297,
+184248325, 182607245, 180966058, 179324764, 177683365, 176041861, 174400254, 172758544,
+171116733, 169474820, 167832808, 166190698, 164548489, 162906184, 161263783, 159621287,
+157978697, 156336015, 154693240, 153050374, 151407418, 149764374, 148121241, 146478021,
+144834714, 143191323, 141547847, 139904288, 138260647, 136616925, 134973122, 133329239,
+131685278, 130041240, 128397125, 126752935, 125108670, 123464332, 121819921, 120175438,
+118530885, 116886262, 115241570, 113596810, 111951983, 110307091, 108662134, 107017112,
+105372028, 103726882, 102081675, 100436408,  98791081,  97145697,  95500255,  93854758,
+ 92209205,  90563597,  88917937,  87272224,  85626460,  83980645,  82334782,  80688869,
+ 79042909,  77396903,  75750851,  74104755,  72458615,  70812432,  69166208,  67519943,
+ 65873638,  64227295,  62580914,  60934496,  59288042,  57641553,  55995030,  54348475,
+ 52701887,  51055268,  49408620,  47761942,  46115236,  44468503,  42821744,  41174960,
+ 39528151,  37881320,  36234466,  34587590,  32940695,  31293780,  29646846,  27999895,
+ 26352928,  24705945,  23058947,  21411936,  19764913,  18117878,  16470832,  14823776,
+ 13176712,  11529640,   9882561,   8235476,   6588387,   4941294,   3294197,   1647099
+};
+
+uint16_t ff_fft_offsets_lut[21845];
+
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
+{
+    if (size < 16) {
+        table[*index] = off >> 2;
+        (*index)++;
+    }
+    else {
+        ff_fft_lut_init(table, off, size>>1, index);
+        ff_fft_lut_init(table, off+(size>>1), size>>2, index);
+        ff_fft_lut_init(table, off+3*(size>>2), size>>2, index);
+    }
+}
diff --git a/libavcodec/fft_table.h b/libavcodec/fft_table.h
new file mode 100644
index 0000000..ed0a658
--- /dev/null
+++ b/libavcodec/fft_table.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and tables for FFT
+ */
+#ifndef AVCODEC_FFT_TABLE_H
+#define AVCODEC_FFT_TABLE_H
+
+#include "libavcodec/fft.h"
+
+#define MAX_LOG2_NFFT 17 //!< Specifies maximum allowed fft size
+#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)
+
+extern const int32_t ff_w_tab_sr[];
+extern uint16_t ff_fft_offsets_lut[];
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index);
+
+#endif /* AVCODEC_FFT_TABLE_H */
diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c
index 3642b43..20a62e4 100644
--- a/libavcodec/fft_template.c
+++ b/libavcodec/fft_template.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2002 Fabrice Bellard
  * Partly based on libdjbfft by D. J. Bernstein
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,9 +29,21 @@
 #include <stdlib.h>
 #include <string.h>
 #include "libavutil/mathematics.h"
+#include "libavutil/thread.h"
 #include "fft.h"
 #include "fft-internal.h"
 
+#if FFT_FIXED_32
+#include "fft_table.h"
+
+static void av_cold fft_lut_init(void)
+{
+    int n = 0;
+    ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n);
+}
+
+#else /* FFT_FIXED_32 */
+
 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 #if !CONFIG_HARDCODED_TABLES
 COSTABLE(16);
@@ -47,6 +59,67 @@ COSTABLE(8192);
 COSTABLE(16384);
 COSTABLE(32768);
 COSTABLE(65536);
+COSTABLE(131072);
+
+static av_cold void init_ff_cos_tabs(int index)
+{
+    int i;
+    int m = 1<<index;
+    double freq = 2*M_PI/m;
+    FFTSample *tab = FFT_NAME(ff_cos_tabs)[index];
+    for(i=0; i<=m/4; i++)
+        tab[i] = FIX15(cos(i*freq));
+    for(i=1; i<m/4; i++)
+        tab[m/2-i] = tab[i];
+}
+
+typedef struct CosTabsInitOnce {
+    void (*func)(void);
+    AVOnce control;
+} CosTabsInitOnce;
+
+#define INIT_FF_COS_TABS_FUNC(index, size)          \
+static av_cold void init_ff_cos_tabs_ ## size (void)\
+{                                                   \
+    init_ff_cos_tabs(index);                        \
+}
+
+INIT_FF_COS_TABS_FUNC(4, 16)
+INIT_FF_COS_TABS_FUNC(5, 32)
+INIT_FF_COS_TABS_FUNC(6, 64)
+INIT_FF_COS_TABS_FUNC(7, 128)
+INIT_FF_COS_TABS_FUNC(8, 256)
+INIT_FF_COS_TABS_FUNC(9, 512)
+INIT_FF_COS_TABS_FUNC(10, 1024)
+INIT_FF_COS_TABS_FUNC(11, 2048)
+INIT_FF_COS_TABS_FUNC(12, 4096)
+INIT_FF_COS_TABS_FUNC(13, 8192)
+INIT_FF_COS_TABS_FUNC(14, 16384)
+INIT_FF_COS_TABS_FUNC(15, 32768)
+INIT_FF_COS_TABS_FUNC(16, 65536)
+INIT_FF_COS_TABS_FUNC(17, 131072)
+
+static CosTabsInitOnce cos_tabs_init_once[] = {
+    { NULL },
+    { NULL },
+    { NULL },
+    { NULL },
+    { init_ff_cos_tabs_16, AV_ONCE_INIT },
+    { init_ff_cos_tabs_32, AV_ONCE_INIT },
+    { init_ff_cos_tabs_64, AV_ONCE_INIT },
+    { init_ff_cos_tabs_128, AV_ONCE_INIT },
+    { init_ff_cos_tabs_256, AV_ONCE_INIT },
+    { init_ff_cos_tabs_512, AV_ONCE_INIT },
+    { init_ff_cos_tabs_1024, AV_ONCE_INIT },
+    { init_ff_cos_tabs_2048, AV_ONCE_INIT },
+    { init_ff_cos_tabs_4096, AV_ONCE_INIT },
+    { init_ff_cos_tabs_8192, AV_ONCE_INIT },
+    { init_ff_cos_tabs_16384, AV_ONCE_INIT },
+    { init_ff_cos_tabs_32768, AV_ONCE_INIT },
+    { init_ff_cos_tabs_65536, AV_ONCE_INIT },
+    { init_ff_cos_tabs_131072, AV_ONCE_INIT },
+};
+
 #endif
 COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
     NULL, NULL, NULL, NULL,
@@ -63,8 +136,11 @@ COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
     FFT_NAME(ff_cos_16384),
     FFT_NAME(ff_cos_32768),
     FFT_NAME(ff_cos_65536),
+    FFT_NAME(ff_cos_131072),
 };
 
+#endif /* FFT_FIXED_32 */
+
 static void fft_permute_c(FFTContext *s, FFTComplex *z);
 static void fft_calc_c(FFTContext *s, FFTComplex *z);
 
@@ -81,15 +157,8 @@ static int split_radix_permutation(int i, int n, int inverse)
 
 av_cold void ff_init_ff_cos_tabs(int index)
 {
-#if !CONFIG_HARDCODED_TABLES
-    int i;
-    int m = 1<<index;
-    double freq = 2*M_PI/m;
-    FFTSample *tab = FFT_NAME(ff_cos_tabs)[index];
-    for(i=0; i<=m/4; i++)
-        tab[i] = FIX15(cos(i*freq));
-    for(i=1; i<m/4; i++)
-        tab[m/2-i] = tab[i];
+#if (!CONFIG_HARDCODED_TABLES) && (!FFT_FIXED_32)
+    ff_thread_once(&cos_tabs_init_once[index].control, cos_tabs_init_once[index].func);
 #endif
 }
 
@@ -135,14 +204,23 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
     int i, j, n;
 
-    if (nbits < 2 || nbits > 16)
+    s->revtab = NULL;
+    s->revtab32 = NULL;
+
+    if (nbits < 2 || nbits > 17)
         goto fail;
     s->nbits = nbits;
     n = 1 << nbits;
 
-    s->revtab = av_malloc(n * sizeof(uint16_t));
-    if (!s->revtab)
-        goto fail;
+    if (nbits <= 16) {
+        s->revtab = av_malloc(n * sizeof(uint16_t));
+        if (!s->revtab)
+            goto fail;
+    } else {
+        s->revtab32 = av_malloc(n * sizeof(uint32_t));
+        if (!s->revtab32)
+            goto fail;
+    }
     s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
     if (!s->tmp_buf)
         goto fail;
@@ -151,34 +229,79 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
     s->fft_permute = fft_permute_c;
     s->fft_calc    = fft_calc_c;
+#if CONFIG_MDCT
+    s->imdct_calc  = ff_imdct_calc_c;
+    s->imdct_half  = ff_imdct_half_c;
+    s->mdct_calc   = ff_mdct_calc_c;
+#endif
 
+#if FFT_FIXED_32
+    {
+        static AVOnce control = AV_ONCE_INIT;
+        ff_thread_once(&control, fft_lut_init);
+    }
+#else /* FFT_FIXED_32 */
 #if FFT_FLOAT
     if (ARCH_AARCH64) ff_fft_init_aarch64(s);
     if (ARCH_ARM)     ff_fft_init_arm(s);
     if (ARCH_PPC)     ff_fft_init_ppc(s);
     if (ARCH_X86)     ff_fft_init_x86(s);
+    if (CONFIG_MDCT)  s->mdct_calcw = s->mdct_calc;
+    if (HAVE_MIPSFPU) ff_fft_init_mips(s);
 #else
+    if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
     if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
 #endif
-
     for(j=4; j<=nbits; j++) {
         ff_init_ff_cos_tabs(j);
     }
+#endif /* FFT_FIXED_32 */
+
 
     if (s->fft_permutation == FF_FFT_PERM_AVX) {
         fft_perm_avx(s);
     } else {
-        for(i=0; i<n; i++) {
-            int j = i;
-            if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
-                j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
-            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
-        }
+#define PROCESS_FFT_PERM_SWAP_LSBS(num) do {\
+    for(i = 0; i < n; i++) {\
+        int k;\
+        j = i;\
+        j = (j & ~3) | ((j >> 1) & 1) | ((j << 1) & 2);\
+        k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
+        s->revtab##num[k] = j;\
+    } \
+} while(0);
+
+#define PROCESS_FFT_PERM_DEFAULT(num) do {\
+    for(i = 0; i < n; i++) {\
+        int k;\
+        j = i;\
+        k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
+        s->revtab##num[k] = j;\
+    } \
+} while(0);
+
+#define SPLIT_RADIX_PERMUTATION(num) do { \
+    if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) {\
+        PROCESS_FFT_PERM_SWAP_LSBS(num) \
+    } else {\
+        PROCESS_FFT_PERM_DEFAULT(num) \
+    }\
+} while(0);
+
+    if (s->revtab)
+        SPLIT_RADIX_PERMUTATION()
+    if (s->revtab32)
+        SPLIT_RADIX_PERMUTATION(32)
+
+#undef PROCESS_FFT_PERM_DEFAULT
+#undef PROCESS_FFT_PERM_SWAP_LSBS
+#undef SPLIT_RADIX_PERMUTATION
     }
 
     return 0;
  fail:
     av_freep(&s->revtab);
+    av_freep(&s->revtab32);
     av_freep(&s->tmp_buf);
     return -1;
 }
@@ -187,18 +310,184 @@ static void fft_permute_c(FFTContext *s, FFTComplex *z)
 {
     int j, np;
     const uint16_t *revtab = s->revtab;
+    const uint32_t *revtab32 = s->revtab32;
     np = 1 << s->nbits;
     /* TODO: handle split-radix permute in a more optimal way, probably in-place */
-    for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+    if (revtab) {
+        for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+    } else
+        for(j=0;j<np;j++) s->tmp_buf[revtab32[j]] = z[j];
+
     memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
 }
 
 av_cold void ff_fft_end(FFTContext *s)
 {
     av_freep(&s->revtab);
+    av_freep(&s->revtab32);
     av_freep(&s->tmp_buf);
 }
 
+#if FFT_FIXED_32
+
+static void fft_calc_c(FFTContext *s, FFTComplex *z) {
+
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    const int fft_size = (1 << s->nbits);
+    int64_t accu;
+
+    num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = ff_fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + (unsigned)tmpz[1].re;
+        tmp5 = tmpz[2].re + (unsigned)tmpz[3].re;
+        tmp2 = tmpz[0].im + (unsigned)tmpz[1].im;
+        tmp6 = tmpz[2].im + (unsigned)tmpz[3].im;
+        tmp3 = tmpz[0].re - (unsigned)tmpz[1].re;
+        tmp8 = tmpz[2].im - (unsigned)tmpz[3].im;
+        tmp4 = tmpz[0].im - (unsigned)tmpz[1].im;
+        tmp7 = tmpz[2].re - (unsigned)tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+    }
+
+    if (fft_size < 8)
+        return;
+
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = ff_fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[4].re + (unsigned)tmpz[5].re;
+        tmp3 = tmpz[6].re + (unsigned)tmpz[7].re;
+        tmp2 = tmpz[4].im + (unsigned)tmpz[5].im;
+        tmp4 = tmpz[6].im + (unsigned)tmpz[7].im;
+        tmp5 = tmp1 + tmp3;
+        tmp7 = tmp1 - tmp3;
+        tmp6 = tmp2 + tmp4;
+        tmp8 = tmp2 - tmp4;
+
+        tmp1 = tmpz[4].re - (unsigned)tmpz[5].re;
+        tmp2 = tmpz[4].im - (unsigned)tmpz[5].im;
+        tmp3 = tmpz[6].re - (unsigned)tmpz[7].re;
+        tmp4 = tmpz[6].im - (unsigned)tmpz[7].im;
+
+        tmpz[4].re = tmpz[0].re - tmp5;
+        tmpz[0].re = tmpz[0].re + tmp5;
+        tmpz[4].im = tmpz[0].im - tmp6;
+        tmpz[0].im = tmpz[0].im + tmp6;
+        tmpz[6].re = tmpz[2].re - tmp8;
+        tmpz[2].re = tmpz[2].re + tmp8;
+        tmpz[6].im = tmpz[2].im + tmp7;
+        tmpz[2].im = tmpz[2].im - tmp7;
+
+        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp1 + tmp2);
+        tmp5 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4);
+        tmp7 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1);
+        tmp6 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4);
+        tmp8 = (int32_t)((accu + 0x40000000) >> 31);
+        tmp1 = tmp5 + tmp7;
+        tmp3 = tmp5 - tmp7;
+        tmp2 = tmp6 + tmp8;
+        tmp4 = tmp6 - tmp8;
+
+        tmpz[5].re = tmpz[1].re - tmp1;
+        tmpz[1].re = tmpz[1].re + tmp1;
+        tmpz[5].im = tmpz[1].im - tmp2;
+        tmpz[1].im = tmpz[1].im + tmp2;
+        tmpz[7].re = tmpz[3].re - tmp4;
+        tmpz[3].re = tmpz[3].re + tmp4;
+        tmpz[7].im = tmpz[3].im + tmp3;
+        tmpz[3].im = tmpz[3].im - tmp3;
+    }
+
+    step = 1 << ((MAX_LOG2_NFFT-4) - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++){
+        n2  = 2*n4;
+        n34 = 3*n4;
+        num_transforms = (num_transforms >> 1) | 1;
+
+        for (n=0; n<num_transforms; n++){
+            const FFTSample *w_re_ptr = ff_w_tab_sr + step;
+            const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
+            offset = ff_fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
+            tmp5 = tmpz[ n2].re + (unsigned)tmpz[n34].re;
+            tmp1 = tmpz[ n2].re - (unsigned)tmpz[n34].re;
+            tmp6 = tmpz[ n2].im + (unsigned)tmpz[n34].im;
+            tmp2 = tmpz[ n2].im - (unsigned)tmpz[n34].im;
+
+            tmpz[ n2].re = tmpz[ 0].re - tmp5;
+            tmpz[  0].re = tmpz[ 0].re + tmp5;
+            tmpz[ n2].im = tmpz[ 0].im - tmp6;
+            tmpz[  0].im = tmpz[ 0].im + tmp6;
+            tmpz[n34].re = tmpz[n4].re - tmp2;
+            tmpz[ n4].re = tmpz[n4].re + tmp2;
+            tmpz[n34].im = tmpz[n4].im + tmp1;
+            tmpz[ n4].im = tmpz[n4].im - tmp1;
+
+            for (i=1; i<n4; i++){
+                FFTSample w_re = w_re_ptr[0];
+                FFTSample w_im = w_im_ptr[0];
+                accu  = (int64_t)w_re*tmpz[ n2+i].re;
+                accu += (int64_t)w_im*tmpz[ n2+i].im;
+                tmp1 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[ n2+i].im;
+                accu -= (int64_t)w_im*tmpz[ n2+i].re;
+                tmp2 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].re;
+                accu -= (int64_t)w_im*tmpz[n34+i].im;
+                tmp3 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].im;
+                accu += (int64_t)w_im*tmpz[n34+i].re;
+                tmp4 = (int32_t)((accu + 0x40000000) >> 31);
+
+                tmp5 = tmp1 + tmp3;
+                tmp1 = tmp1 - tmp3;
+                tmp6 = tmp2 + tmp4;
+                tmp2 = tmp2 - tmp4;
+
+                tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                tmpz[    i].re = tmpz[   i].re + tmp5;
+                tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                tmpz[    i].im = tmpz[   i].im + tmp6;
+                tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+#else /* FFT_FIXED_32 */
+
 #define BUTTERFLIES(a0,a1,a2,a3) {\
     BF(t3, t5, t5, t1);\
     BF(a2.re, a0.re, a0.re, t5);\
@@ -258,9 +547,11 @@ static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
 }
 
 PASS(pass)
+#if !CONFIG_SMALL
 #undef BUTTERFLIES
 #define BUTTERFLIES BUTTERFLIES_BIG
 PASS(pass_big)
+#endif
 
 #define DECL_FFT(n,n2,n4)\
 static void fft##n(FFTComplex *z)\
@@ -334,13 +625,15 @@ DECL_FFT(8192,4096,2048)
 DECL_FFT(16384,8192,4096)
 DECL_FFT(32768,16384,8192)
 DECL_FFT(65536,32768,16384)
+DECL_FFT(131072,65536,32768)
 
 static void (* const fft_dispatch[])(FFTComplex*) = {
     fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
-    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
+    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072
 };
 
 static void fft_calc_c(FFTContext *s, FFTComplex *z)
 {
     fft_dispatch[s->nbits-2](z);
 }
+#endif /* FFT_FIXED_32 */
diff --git a/libavcodec/ffv1.c b/libavcodec/ffv1.c
index 89d27bb..a14dd2a 100644
--- a/libavcodec/ffv1.c
+++ b/libavcodec/ffv1.c
@@ -1,22 +1,22 @@
 /*
  * FFV1 codec for libavcodec
  *
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,118 +26,33 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 
 #include "avcodec.h"
-#include "put_bits.h"
+#include "internal.h"
 #include "rangecoder.h"
 #include "mathops.h"
 #include "ffv1.h"
 
-const int8_t ffv1_quant5_10bit[256] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
-     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-     1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0,
-};
-
-const int8_t ffv1_quant5[256] = {
-     0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1,
-};
-
-const int8_t ffv1_quant9_10bit[256] = {
-     0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
-     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
-     3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3,
-    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
-    -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -0, -0, -0, -0,
-};
-
-const int8_t ffv1_quant11[256] = {
-     0,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -1,
-};
-
-const uint8_t ffv1_ver2_state[256] = {
-      0,  10,  10,  10,  10,  16,  16,  16,  28,  16,  16,  29,  42,  49,  20,  49,
-     59,  25,  26,  26,  27,  31,  33,  33,  33,  34,  34,  37,  67,  38,  39,  39,
-     40,  40,  41,  79,  43,  44,  45,  45,  48,  48,  64,  50,  51,  52,  88,  52,
-     53,  74,  55,  57,  58,  58,  74,  60,  101, 61,  62,  84,  66,  66,  68,  69,
-     87,  82,  71,  97,  73,  73,  82,  75,  111, 77,  94,  78,  87,  81,  83,  97,
-     85,  83,  94,  86,  99,  89,  90,  99,  111, 92,  93,  134, 95,  98,  105, 98,
-    105, 110, 102, 108, 102, 118, 103, 106, 106, 113, 109, 112, 114, 112, 116, 125,
-    115, 116, 117, 117, 126, 119, 125, 121, 121, 123, 145, 124, 126, 131, 127, 129,
-    165, 130, 132, 138, 133, 135, 145, 136, 137, 139, 146, 141, 143, 142, 144, 148,
-    147, 155, 151, 149, 151, 150, 152, 157, 153, 154, 156, 168, 158, 162, 161, 160,
-    172, 163, 169, 164, 166, 184, 167, 170, 177, 174, 171, 173, 182, 176, 180, 178,
-    175, 189, 179, 181, 186, 183, 192, 185, 200, 187, 191, 188, 190, 197, 193, 196,
-    197, 194, 195, 196, 198, 202, 199, 201, 210, 203, 207, 204, 205, 206, 208, 214,
-    209, 211, 221, 212, 213, 215, 224, 216, 217, 218, 219, 220, 222, 228, 223, 225,
-    226, 224, 227, 229, 240, 230, 231, 232, 233, 234, 235, 236, 238, 239, 237, 242,
-    241, 243, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255,
-};
-
-
-av_cold int ffv1_common_init(AVCodecContext *avctx)
+av_cold int ff_ffv1_common_init(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
 
+    if (!avctx->width || !avctx->height)
+        return AVERROR_INVALIDDATA;
+
     s->avctx = avctx;
     s->flags = avctx->flags;
 
-    if (!avctx->width || !avctx->height)
-        return AVERROR_INVALIDDATA;
+    s->picture.f = av_frame_alloc();
+    s->last_picture.f = av_frame_alloc();
+    if (!s->picture.f || !s->last_picture.f)
+        return AVERROR(ENOMEM);
 
     s->width  = avctx->width;
     s->height = avctx->height;
@@ -149,9 +64,9 @@ av_cold int ffv1_common_init(AVCodecContext *avctx)
     return 0;
 }
 
-int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
+av_cold int ff_ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
 {
-    int j;
+    int j, i;
 
     fs->plane_count  = f->plane_count;
     fs->transparency = f->transparency;
@@ -160,22 +75,27 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
 
         if (fs->ac != AC_GOLOMB_RICE) {
             if (!p->state)
-                p->state = av_malloc(CONTEXT_SIZE * p->context_count *
+                p->state = av_malloc_array(p->context_count, CONTEXT_SIZE *
                                      sizeof(uint8_t));
             if (!p->state)
                 return AVERROR(ENOMEM);
         } else {
-            if (!p->vlc_state)
-                p->vlc_state = av_malloc(p->context_count * sizeof(VlcState));
-            if (!p->vlc_state)
-                return AVERROR(ENOMEM);
+            if (!p->vlc_state) {
+                p->vlc_state = av_mallocz_array(p->context_count, sizeof(VlcState));
+                if (!p->vlc_state)
+                    return AVERROR(ENOMEM);
+                for (i = 0; i < p->context_count; i++) {
+                    p->vlc_state[i].error_sum = 4;
+                    p->vlc_state[i].count     = 1;
+                }
+            }
         }
     }
 
     if (fs->ac == AC_RANGE_CUSTOM_TAB) {
         //FIXME only redo if state_transition changed
         for (j = 1; j < 256; j++) {
-            fs->c.one_state[j]        = f->state_transition[j];
+            fs->c. one_state[      j] = f->state_transition[j];
             fs->c.zero_state[256 - j] = 256 - fs->c.one_state[j];
         }
     }
@@ -183,17 +103,25 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
     return 0;
 }
 
-av_cold int ffv1_init_slice_contexts(FFV1Context *f)
+av_cold int ff_ffv1_init_slices_state(FFV1Context *f)
 {
-    int i, j;
-
-    f->slice_count = f->num_h_slices * f->num_v_slices;
-    if (f->slice_count <= 0) {
-        av_log(f->avctx, AV_LOG_ERROR, "Invalid number of slices\n");
-        return AVERROR(EINVAL);
+    int i, ret;
+    for (i = 0; i < f->max_slice_count; i++) {
+        FFV1Context *fs = f->slice_context[i];
+        if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
+            return AVERROR(ENOMEM);
     }
+    return 0;
+}
+
+av_cold int ff_ffv1_init_slice_contexts(FFV1Context *f)
+{
+    int i;
 
-    for (i = 0; i < f->slice_count; i++) {
+    f->max_slice_count = f->num_h_slices * f->num_v_slices;
+    av_assert0(f->max_slice_count > 0);
+
+    for (i = 0; i < f->max_slice_count; i++) {
         int sx          = i % f->num_h_slices;
         int sy          = i / f->num_h_slices;
         int sxs         = f->avctx->width  *  sx      / f->num_h_slices;
@@ -201,6 +129,7 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         int sys         = f->avctx->height *  sy      / f->num_v_slices;
         int sye         = f->avctx->height * (sy + 1) / f->num_v_slices;
         FFV1Context *fs = av_mallocz(sizeof(*fs));
+
         if (!fs)
             goto memfail;
 
@@ -213,29 +142,34 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         fs->slice_x      = sxs;
         fs->slice_y      = sys;
 
-        fs->sample_buffer = av_malloc(3 * MAX_PLANES * (fs->width + 6) *
+        fs->sample_buffer = av_malloc_array((fs->width + 6), 3 * MAX_PLANES *
                                       sizeof(*fs->sample_buffer));
-        if (!fs->sample_buffer) {
-            av_free(fs);
+        fs->sample_buffer32 = av_malloc_array((fs->width + 6), 3 * MAX_PLANES *
+                                        sizeof(*fs->sample_buffer32));
+        if (!fs->sample_buffer || !fs->sample_buffer32) {
+            av_freep(&fs->sample_buffer);
+            av_freep(&fs->sample_buffer32);
+            av_freep(&f->slice_context[i]);
             goto memfail;
         }
     }
     return 0;
 
 memfail:
-    for (j = 0; j < i; j++) {
-        av_free(f->slice_context[j]->sample_buffer);
-        av_free(f->slice_context[j]);
+    while(--i >= 0) {
+        av_freep(&f->slice_context[i]->sample_buffer);
+        av_freep(&f->slice_context[i]->sample_buffer32);
+        av_freep(&f->slice_context[i]);
     }
     return AVERROR(ENOMEM);
 }
 
-int ffv1_allocate_initial_states(FFV1Context *f)
+int ff_ffv1_allocate_initial_states(FFV1Context *f)
 {
     int i;
 
     for (i = 0; i < f->quant_table_count; i++) {
-        f->initial_states[i] = av_malloc(f->context_count[i] *
+        f->initial_states[i] = av_malloc_array(f->context_count[i],
                                          sizeof(*f->initial_states[i]));
         if (!f->initial_states[i])
             return AVERROR(ENOMEM);
@@ -245,7 +179,7 @@ int ffv1_allocate_initial_states(FFV1Context *f)
     return 0;
 }
 
-void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
+void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
 {
     int i, j;
 
@@ -272,12 +206,21 @@ void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
     }
 }
 
-av_cold int ffv1_close(AVCodecContext *avctx)
+
+av_cold int ff_ffv1_close(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
     int i, j;
 
-    for (j = 0; j < s->slice_count; j++) {
+    if (s->picture.f)
+        ff_thread_release_buffer(avctx, &s->picture);
+    av_frame_free(&s->picture.f);
+
+    if (s->last_picture.f)
+        ff_thread_release_buffer(avctx, &s->last_picture);
+    av_frame_free(&s->last_picture.f);
+
+    for (j = 0; j < s->max_slice_count; j++) {
         FFV1Context *fs = s->slice_context[j];
         for (i = 0; i < s->plane_count; i++) {
             PlaneContext *p = &fs->plane[i];
@@ -286,19 +229,20 @@ av_cold int ffv1_close(AVCodecContext *avctx)
             av_freep(&p->vlc_state);
         }
         av_freep(&fs->sample_buffer);
+        av_freep(&fs->sample_buffer32);
     }
 
     av_freep(&avctx->stats_out);
     for (j = 0; j < s->quant_table_count; j++) {
         av_freep(&s->initial_states[j]);
-        for (i = 0; i < s->slice_count; i++) {
+        for (i = 0; i < s->max_slice_count; i++) {
             FFV1Context *sf = s->slice_context[i];
             av_freep(&sf->rc_stat2[j]);
         }
         av_freep(&s->rc_stat2[j]);
     }
 
-    for (i = 0; i < s->slice_count; i++)
+    for (i = 0; i < s->max_slice_count; i++)
         av_freep(&s->slice_context[i]);
 
     return 0;
diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index 7e0465a..f0bb193 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -3,32 +3,49 @@
  *
  * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_FFV1_H
 #define AVCODEC_FFV1_H
 
-#include <stdint.h>
+/**
+ * @file
+ * FF Video Codec 1 (a lossless codec)
+ */
 
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "mathops.h"
 #include "put_bits.h"
 #include "rangecoder.h"
+#include "thread.h"
+
+#ifdef __INTEL_COMPILER
+#undef av_flatten
+#define av_flatten
+#endif
 
 #define MAX_PLANES 4
 #define CONTEXT_SIZE 32
@@ -39,14 +56,7 @@
 #define AC_GOLOMB_RICE          0
 #define AC_RANGE_DEFAULT_TAB    1
 #define AC_RANGE_CUSTOM_TAB     2
-
-extern const uint8_t ff_log2_run[41];
-
-extern const int8_t ffv1_quant5_10bit[256];
-extern const int8_t ffv1_quant5[256];
-extern const int8_t ffv1_quant9_10bit[256];
-extern const int8_t ffv1_quant11[256];
-extern const uint8_t ffv1_ver2_state[256];
+#define AC_RANGE_DEFAULT_TAB_FORCE -2
 
 typedef struct VlcState {
     int16_t drift;
@@ -64,18 +74,18 @@ typedef struct PlaneContext {
     uint8_t interlace_bit_state[2];
 } PlaneContext;
 
-#define MAX_SLICES 256
+#define MAX_SLICES 1024
 
 typedef struct FFV1Context {
     AVClass *class;
     AVCodecContext *avctx;
     RangeCoder c;
-    BitstreamContext bc;
+    GetBitContext gb;
     PutBitContext pb;
     uint64_t rc_stat[256][2];
     uint64_t (*rc_stat2[MAX_QUANT_TABLES])[32][2];
     int version;
-    int minor_version;
+    int micro_version;
     int width, height;
     int chroma_planes;
     int chroma_h_shift, chroma_v_shift;
@@ -83,13 +93,13 @@ typedef struct FFV1Context {
     int flags;
     int picture_number;
     int key_frame;
-    const AVFrame *frame;
-    AVFrame *last_picture;
+    ThreadFrame picture, last_picture;
+    struct FFV1Context *fsrc;
 
     AVFrame *cur;
     int plane_count;
-    int ac;     // 1 = range coder <-> 0 = golomb rice
-    int ac_byte_count;      // number of bytes used for AC coding
+    int ac;                              ///< 1=range coder <-> 0=golomb rice
+    int ac_byte_count;                   ///< number of bytes used for AC coding
     PlaneContext plane[MAX_PLANES];
     int16_t quant_table[MAX_CONTEXT_INPUTS][256];
     int16_t quant_tables[MAX_QUANT_TABLES][MAX_CONTEXT_INPUTS][256];
@@ -99,8 +109,12 @@ typedef struct FFV1Context {
     int run_index;
     int colorspace;
     int16_t *sample_buffer;
+    int32_t *sample_buffer32;
+
+    int use32bit;
 
     int ec;
+    int intra;
     int slice_damaged;
     int key_frame_ok;
     int context_model;
@@ -113,58 +127,38 @@ typedef struct FFV1Context {
 
     struct FFV1Context *slice_context[MAX_SLICES];
     int slice_count;
+    int max_slice_count;
     int num_v_slices;
     int num_h_slices;
     int slice_width;
     int slice_height;
     int slice_x;
     int slice_y;
+    int slice_reset_contexts;
+    int slice_coding_mode;
+    int slice_rct_by_coef;
+    int slice_rct_ry_coef;
 } FFV1Context;
 
+int ff_ffv1_common_init(AVCodecContext *avctx);
+int ff_ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs);
+int ff_ffv1_init_slices_state(FFV1Context *f);
+int ff_ffv1_init_slice_contexts(FFV1Context *f);
+int ff_ffv1_allocate_initial_states(FFV1Context *f);
+void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
+int ff_ffv1_close(AVCodecContext *avctx);
+
 static av_always_inline int fold(int diff, int bits)
 {
     if (bits == 8)
         diff = (int8_t)diff;
     else {
-        diff +=  1 << (bits  - 1);
-        diff &= (1 <<  bits) - 1;
-        diff -=  1 << (bits  - 1);
+        diff = sign_extend(diff, bits);
     }
 
     return diff;
 }
 
-static inline int predict(int16_t *src, int16_t *last)
-{
-    const int LT = last[-1];
-    const int T  = last[0];
-    const int L  = src[-1];
-
-    return mid_pred(L, L + T - LT, T);
-}
-
-static inline int get_context(PlaneContext *p, int16_t *src,
-                              int16_t *last, int16_t *last2)
-{
-    const int LT = last[-1];
-    const int T  = last[0];
-    const int RT = last[1];
-    const int L  = src[-1];
-
-    if (p->quant_table[3][127]) {
-        const int TT = last2[0];
-        const int LL = src[-2];
-        return p->quant_table[0][(L - LT) & 0xFF] +
-               p->quant_table[1][(LT - T) & 0xFF] +
-               p->quant_table[2][(T - RT) & 0xFF] +
-               p->quant_table[3][(LL - L) & 0xFF] +
-               p->quant_table[4][(TT - T) & 0xFF];
-    } else
-        return p->quant_table[0][(L - LT) & 0xFF] +
-               p->quant_table[1][(LT - T) & 0xFF] +
-               p->quant_table[2][(T - RT) & 0xFF];
-}
-
 static inline void update_vlc_state(VlcState *const state, const int v)
 {
     int drift = state->drift;
@@ -180,30 +174,29 @@ static inline void update_vlc_state(VlcState *const state, const int v)
     count++;
 
     if (drift <= -count) {
-        if (state->bias > -128)
-            state->bias--;
+        state->bias = FFMAX(state->bias - 1, -128);
 
-        drift += count;
-        if (drift <= -count)
-            drift = -count + 1;
+        drift = FFMAX(drift + count, -count + 1);
     } else if (drift > 0) {
-        if (state->bias < 127)
-            state->bias++;
+        state->bias = FFMIN(state->bias + 1, 127);
 
-        drift -= count;
-        if (drift > 0)
-            drift = 0;
+        drift = FFMIN(drift - count, 0);
     }
 
     state->drift = drift;
     state->count = count;
 }
 
-int ffv1_common_init(AVCodecContext *avctx);
-int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs);
-int ffv1_init_slice_contexts(FFV1Context *f);
-int ffv1_allocate_initial_states(FFV1Context *f);
-void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
-int ffv1_close(AVCodecContext *avctx);
+#define TYPE int16_t
+#define RENAME(name) name
+#include "ffv1_template.c"
+#undef TYPE
+#undef RENAME
+
+#define TYPE int32_t
+#define RENAME(name) name ## 32
+#include "ffv1_template.c"
+#undef TYPE
+#undef RENAME
 
 #endif /* AVCODEC_FFV1_H */
diff --git a/libavcodec/ffv1_template.c b/libavcodec/ffv1_template.c
new file mode 100644
index 0000000..f2ab933
--- /dev/null
+++ b/libavcodec/ffv1_template.c
@@ -0,0 +1,53 @@
+/*
+ * FFV1 codec
+ *
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static inline int RENAME(predict)(TYPE *src, TYPE *last)
+{
+    const int LT = last[-1];
+    const int T  = last[0];
+    const int L  = src[-1];
+
+    return mid_pred(L, L + T - LT, T);
+}
+
+static inline int RENAME(get_context)(PlaneContext *p, TYPE *src,
+                                      TYPE *last, TYPE *last2)
+{
+    const int LT = last[-1];
+    const int T  = last[0];
+    const int RT = last[1];
+    const int L  = src[-1];
+
+    if (p->quant_table[3][127]) {
+        const int TT = last2[0];
+        const int LL = src[-2];
+        return p->quant_table[0][(L - LT) & 0xFF] +
+               p->quant_table[1][(LT - T) & 0xFF] +
+               p->quant_table[2][(T - RT) & 0xFF] +
+               p->quant_table[3][(LL - L) & 0xFF] +
+               p->quant_table[4][(TT - T) & 0xFF];
+    } else
+        return p->quant_table[0][(L - LT) & 0xFF] +
+               p->quant_table[1][(LT - T) & 0xFF] +
+               p->quant_table[2][(T - RT) & 0xFF];
+}
+
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 07e66b9..261e0cf 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -1,22 +1,22 @@
 /*
  * FFV1 decoder
  *
- * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,18 +26,16 @@
  */
 
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/crc.h"
 #include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 #include "libavutil/timer.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
-#include "golomb.h"
 #include "internal.h"
-#include "put_bits.h"
+#include "get_bits.h"
 #include "rangecoder.h"
+#include "golomb.h"
 #include "mathops.h"
 #include "ffv1.h"
 
@@ -47,10 +45,14 @@ static inline av_flatten int get_symbol_inline(RangeCoder *c, uint8_t *state,
     if (get_rac(c, state + 0))
         return 0;
     else {
-        int i, e, a;
+        int i, e;
+        unsigned a;
         e = 0;
-        while (get_rac(c, state + 1 + FFMIN(e, 9))) // 1..10
+        while (get_rac(c, state + 1 + FFMIN(e, 9))) { // 1..10
             e++;
+            if (e > 31)
+                return AVERROR_INVALIDDATA;
+        }
 
         a = 1;
         for (i = e - 1; i >= 0; i--)
@@ -66,7 +68,7 @@ static av_noinline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed)
     return get_symbol_inline(c, state, is_signed);
 }
 
-static inline int get_vlc_symbol(BitstreamContext *bc, VlcState *const state,
+static inline int get_vlc_symbol(GetBitContext *gb, VlcState *const state,
                                  int bits)
 {
     int k, i, v, ret;
@@ -78,9 +80,7 @@ static inline int get_vlc_symbol(BitstreamContext *bc, VlcState *const state,
         i += i;
     }
 
-    assert(k <= 8);
-
-    v = get_sr_golomb(bc, k, 12, bits);
+    v = get_sr_golomb(gb, k, 12, bits);
     ff_dlog(NULL, "v:%d bias:%d error:%d drift:%d count:%d k:%d",
             v, state->bias, state->error_sum, state->drift, state->count, k);
 
@@ -93,79 +93,32 @@ static inline int get_vlc_symbol(BitstreamContext *bc, VlcState *const state,
     return ret;
 }
 
-static av_always_inline void decode_line(FFV1Context *s, int w,
-                                         int16_t *sample[2],
-                                         int plane_index, int bits)
+static int is_input_end(FFV1Context *s)
 {
-    PlaneContext *const p = &s->plane[plane_index];
-    RangeCoder *const c   = &s->c;
-    int x;
-    int run_count = 0;
-    int run_mode  = 0;
-    int run_index = s->run_index;
-
-    for (x = 0; x < w; x++) {
-        int diff, context, sign;
-
-        context = get_context(p, sample[1] + x, sample[0] + x, sample[1] + x);
-        if (context < 0) {
-            context = -context;
-            sign    = 1;
-        } else
-            sign = 0;
-
-        av_assert2(context < p->context_count);
-
-        if (s->ac != AC_GOLOMB_RICE) {
-            diff = get_symbol_inline(c, p->state[context], 1);
-        } else {
-            if (context == 0 && run_mode == 0)
-                run_mode = 1;
-
-            if (run_mode) {
-                if (run_count == 0 && run_mode == 1) {
-                    if (bitstream_read_bit(&s->bc)) {
-                        run_count = 1 << ff_log2_run[run_index];
-                        if (x + run_count <= w)
-                            run_index++;
-                    } else {
-                        if (ff_log2_run[run_index])
-                            run_count = bitstream_read(&s->bc, ff_log2_run[run_index]);
-                        else
-                            run_count = 0;
-                        if (run_index)
-                            run_index--;
-                        run_mode = 2;
-                    }
-                }
-                run_count--;
-                if (run_count < 0) {
-                    run_mode  = 0;
-                    run_count = 0;
-                    diff      = get_vlc_symbol(&s->bc, &p->vlc_state[context],
-                                               bits);
-                    if (diff >= 0)
-                        diff++;
-                } else
-                    diff = 0;
-            } else
-                diff = get_vlc_symbol(&s->bc, &p->vlc_state[context], bits);
-
-            ff_dlog(s->avctx, "count:%d index:%d, mode:%d, x:%d pos:%d\n",
-                    run_count, run_index, run_mode, x, bitstream_tell(&s->bc));
-        }
-
-        if (sign)
-            diff = -diff;
-
-        sample[1][x] = (predict(sample[1] + x, sample[0] + x) + diff) &
-                       ((1 << bits) - 1);
+    if (s->ac != AC_GOLOMB_RICE) {
+        RangeCoder *const c = &s->c;
+        if (c->overread > MAX_OVERREAD)
+            return AVERROR_INVALIDDATA;
+    } else {
+        if (get_bits_left(&s->gb) < 1)
+            return AVERROR_INVALIDDATA;
     }
-    s->run_index = run_index;
+    return 0;
 }
 
-static void decode_plane(FFV1Context *s, uint8_t *src,
-                         int w, int h, int stride, int plane_index)
+#define TYPE int16_t
+#define RENAME(name) name
+#include "ffv1dec_template.c"
+#undef TYPE
+#undef RENAME
+
+#define TYPE int32_t
+#define RENAME(name) name ## 32
+#include "ffv1dec_template.c"
+
+static int decode_plane(FFV1Context *s, uint8_t *src,
+                         int w, int h, int stride, int plane_index,
+                         int pixel_stride)
 {
     int x, y;
     int16_t *sample[2];
@@ -187,80 +140,28 @@ static void decode_plane(FFV1Context *s, uint8_t *src,
 
 // { START_TIMER
         if (s->avctx->bits_per_raw_sample <= 8) {
-            decode_line(s, w, sample, plane_index, 8);
+            int ret = decode_line(s, w, sample, plane_index, 8);
+            if (ret < 0)
+                return ret;
             for (x = 0; x < w; x++)
-                src[x + stride * y] = sample[1][x];
+                src[x*pixel_stride + stride * y] = sample[1][x];
         } else {
-            decode_line(s, w, sample, plane_index,
-                        s->avctx->bits_per_raw_sample);
+            int ret = decode_line(s, w, sample, plane_index, s->avctx->bits_per_raw_sample);
+            if (ret < 0)
+                return ret;
             if (s->packed_at_lsb) {
-                for (x = 0; x < w; x++)
-                    ((uint16_t *)(src + stride * y))[x] = sample[1][x];
+                for (x = 0; x < w; x++) {
+                    ((uint16_t*)(src + stride*y))[x*pixel_stride] = sample[1][x];
+                }
             } else {
-                for (x = 0; x < w; x++)
-                    ((uint16_t *)(src + stride * y))[x] = sample[1][x] << (16 - s->avctx->bits_per_raw_sample);
+                for (x = 0; x < w; x++) {
+                    ((uint16_t*)(src + stride*y))[x*pixel_stride] = sample[1][x] << (16 - s->avctx->bits_per_raw_sample) | ((uint16_t **)sample)[1][x] >> (2 * s->avctx->bits_per_raw_sample - 16);
+                }
             }
         }
 // STOP_TIMER("decode-line") }
     }
-}
-
-static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
-                             int stride[3])
-{
-    int x, y, p;
-    int16_t *sample[4][2];
-    int lbd  = s->avctx->bits_per_raw_sample <= 8;
-    int bits = s->avctx->bits_per_raw_sample > 0
-               ? s->avctx->bits_per_raw_sample
-               : 8;
-    int offset = 1 << bits;
-
-    for (x = 0; x < 4; x++) {
-        sample[x][0] = s->sample_buffer +  x * 2      * (w + 6) + 3;
-        sample[x][1] = s->sample_buffer + (x * 2 + 1) * (w + 6) + 3;
-    }
-
-    s->run_index = 0;
-
-    memset(s->sample_buffer, 0, 8 * (w + 6) * sizeof(*s->sample_buffer));
-
-    for (y = 0; y < h; y++) {
-        for (p = 0; p < 3 + s->transparency; p++) {
-            int16_t *temp = sample[p][0]; //FIXME try a normal buffer
-
-            sample[p][0] = sample[p][1];
-            sample[p][1] = temp;
-
-            sample[p][1][-1] = sample[p][0][0];
-            sample[p][0][w]  = sample[p][0][w - 1];
-            if (lbd)
-                decode_line(s, w, sample[p], (p + 1) / 2, 9);
-            else
-                decode_line(s, w, sample[p], (p + 1) / 2, bits + 1);
-        }
-        for (x = 0; x < w; x++) {
-            int g = sample[0][1][x];
-            int b = sample[1][1][x];
-            int r = sample[2][1][x];
-            int a = sample[3][1][x];
-
-            b -= offset;
-            r -= offset;
-            g -= (b + r) >> 2;
-            b += g;
-            r += g;
-
-            if (lbd)
-                *((uint32_t *)(src[0] + x * 4 + stride[0] * y)) = b +
-                    (g << 8) + (r << 16) + (a << 24);
-            else {
-                *((uint16_t *)(src[0] + x * 2 + stride[0] * y)) = b;
-                *((uint16_t *)(src[1] + x * 2 + stride[1] * y)) = g;
-                *((uint16_t *)(src[2] + x * 2 + stride[2] * y)) = r;
-            }
-        }
-    }
+    return 0;
 }
 
 static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
@@ -270,35 +171,29 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
     unsigned ps, i, context_count;
     memset(state, 128, sizeof(state));
 
-    if (fs->ac == AC_RANGE_CUSTOM_TAB) {
-        for (i = 1; i < 256; i++) {
-            fs->c.one_state[i]        = f->state_transition[i];
-            fs->c.zero_state[256 - i] = 256 - fs->c.one_state[i];
-        }
-    }
+    av_assert0(f->version > 2);
 
-    fs->slice_x      = get_symbol(c, state, 0) * f->width;
-    fs->slice_y      = get_symbol(c, state, 0) * f->height;
-    fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
+    fs->slice_x      =  get_symbol(c, state, 0)      * f->width ;
+    fs->slice_y      =  get_symbol(c, state, 0)      * f->height;
+    fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width  + fs->slice_x;
     fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
 
-    fs->slice_x     /= f->num_h_slices;
-    fs->slice_y     /= f->num_v_slices;
-    fs->slice_width  = fs->slice_width / f->num_h_slices - fs->slice_x;
-    fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
-    if ((unsigned)fs->slice_width  > f->width ||
-        (unsigned)fs->slice_height > f->height)
-        return AVERROR_INVALIDDATA;
-    if ((unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width ||
-        (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
-        return AVERROR_INVALIDDATA;
+    fs->slice_x /= f->num_h_slices;
+    fs->slice_y /= f->num_v_slices;
+    fs->slice_width  = fs->slice_width /f->num_h_slices - fs->slice_x;
+    fs->slice_height = fs->slice_height/f->num_v_slices - fs->slice_y;
+    if ((unsigned)fs->slice_width > f->width || (unsigned)fs->slice_height > f->height)
+        return -1;
+    if (    (unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width
+         || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
+        return -1;
 
     for (i = 0; i < f->plane_count; i++) {
-        PlaneContext *const p = &fs->plane[i];
-        int idx               = get_symbol(c, state, 0);
-        if (idx > (unsigned)f->quant_table_count) {
+        PlaneContext * const p = &fs->plane[i];
+        int idx = get_symbol(c, state, 0);
+        if (idx >= (unsigned)f->quant_table_count) {
             av_log(f->avctx, AV_LOG_ERROR, "quant_table_index out of range\n");
-            return AVERROR_INVALIDDATA;
+            return -1;
         }
         p->quant_table_index = idx;
         memcpy(p->quant_table, f->quant_tables[idx], sizeof(p->quant_table));
@@ -332,84 +227,141 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
         f->cur->sample_aspect_ratio = (AVRational){ 0, 1 };
     }
 
+    if (fs->version > 3) {
+        fs->slice_reset_contexts = get_rac(c, state);
+        fs->slice_coding_mode = get_symbol(c, state, 0);
+        if (fs->slice_coding_mode != 1) {
+            fs->slice_rct_by_coef = get_symbol(c, state, 0);
+            fs->slice_rct_ry_coef = get_symbol(c, state, 0);
+            if ((uint64_t)fs->slice_rct_by_coef + (uint64_t)fs->slice_rct_ry_coef > 4) {
+                av_log(f->avctx, AV_LOG_ERROR, "slice_rct_y_coef out of range\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
     return 0;
 }
 
 static int decode_slice(AVCodecContext *c, void *arg)
 {
-    FFV1Context *fs = *(void **)arg;
-    FFV1Context *f  = fs->avctx->priv_data;
+    FFV1Context *fs   = *(void **)arg;
+    FFV1Context *f    = fs->avctx->priv_data;
     int width, height, x, y, ret;
-    const int ps = (av_pix_fmt_desc_get(c->pix_fmt)->flags & AV_PIX_FMT_FLAG_PLANAR)
-                   ? (c->bits_per_raw_sample > 8) + 1
-                   : 4;
-    AVFrame *const p = f->cur;
+    const int ps      = av_pix_fmt_desc_get(c->pix_fmt)->comp[0].step;
+    AVFrame * const p = f->cur;
+    int i, si;
+
+    for( si=0; fs != f->slice_context[si]; si ++)
+        ;
+
+    if(f->fsrc && !p->key_frame)
+        ff_thread_await_progress(&f->last_picture, si, 0);
+
+    if(f->fsrc && !p->key_frame) {
+        FFV1Context *fssrc = f->fsrc->slice_context[si];
+        FFV1Context *fsdst = f->slice_context[si];
+        av_assert1(fsdst->plane_count == fssrc->plane_count);
+        av_assert1(fsdst == fs);
+
+        if (!p->key_frame)
+            fsdst->slice_damaged |= fssrc->slice_damaged;
+
+        for (i = 0; i < f->plane_count; i++) {
+            PlaneContext *psrc = &fssrc->plane[i];
+            PlaneContext *pdst = &fsdst->plane[i];
+
+            av_free(pdst->state);
+            av_free(pdst->vlc_state);
+            memcpy(pdst, psrc, sizeof(*pdst));
+            pdst->state = NULL;
+            pdst->vlc_state = NULL;
+
+            if (fssrc->ac) {
+                pdst->state = av_malloc_array(CONTEXT_SIZE,  psrc->context_count);
+                memcpy(pdst->state, psrc->state, CONTEXT_SIZE * psrc->context_count);
+            } else {
+                pdst->vlc_state = av_malloc_array(sizeof(*pdst->vlc_state), psrc->context_count);
+                memcpy(pdst->vlc_state, psrc->vlc_state, sizeof(*pdst->vlc_state) * psrc->context_count);
+            }
+        }
+    }
+
+    fs->slice_rct_by_coef = 1;
+    fs->slice_rct_ry_coef = 1;
 
     if (f->version > 2) {
+        if (ff_ffv1_init_slice_state(f, fs) < 0)
+            return AVERROR(ENOMEM);
         if (decode_slice_header(f, fs) < 0) {
+            fs->slice_x = fs->slice_y = fs->slice_height = fs->slice_width = 0;
             fs->slice_damaged = 1;
             return AVERROR_INVALIDDATA;
         }
     }
-    if ((ret = ffv1_init_slice_state(f, fs)) < 0)
+    if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
         return ret;
-    if (f->cur->key_frame)
-        ffv1_clear_slice_state(f, fs);
+    if (f->cur->key_frame || fs->slice_reset_contexts)
+        ff_ffv1_clear_slice_state(f, fs);
+
     width  = fs->slice_width;
     height = fs->slice_height;
     x      = fs->slice_x;
     y      = fs->slice_y;
 
     if (fs->ac == AC_GOLOMB_RICE) {
-        if (f->version == 3 && f->minor_version > 1 || f->version > 3)
+        if (f->version == 3 && f->micro_version > 1 || f->version > 3)
             get_rac(&fs->c, (uint8_t[]) { 129 });
         fs->ac_byte_count = f->version > 2 || (!x && !y) ? fs->c.bytestream - fs->c.bytestream_start - 1 : 0;
-        bitstream_init8(&fs->bc, fs->c.bytestream_start + fs->ac_byte_count,
-                        (fs->c.bytestream_end - fs->c.bytestream_start -
-                         fs->ac_byte_count));
+        init_get_bits(&fs->gb,
+                      fs->c.bytestream_start + fs->ac_byte_count,
+                      (fs->c.bytestream_end - fs->c.bytestream_start - fs->ac_byte_count) * 8);
     }
 
     av_assert1(width && height);
-    if (f->colorspace == 0) {
+    if (f->colorspace == 0 && (f->chroma_planes || !fs->transparency)) {
         const int chroma_width  = AV_CEIL_RSHIFT(width,  f->chroma_h_shift);
         const int chroma_height = AV_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
-        decode_plane(fs, p->data[0] + ps * x + y * p->linesize[0], width,
-                     height, p->linesize[0],
-                     0);
+        decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
 
         if (f->chroma_planes) {
-            decode_plane(fs, p->data[1] + ps * cx + cy * p->linesize[1],
-                         chroma_width, chroma_height, p->linesize[1],
-                         1);
-            decode_plane(fs, p->data[2] + ps * cx + cy * p->linesize[2],
-                         chroma_width, chroma_height, p->linesize[2],
-                         1);
+            decode_plane(fs, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
+            decode_plane(fs, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
         }
         if (fs->transparency)
-            decode_plane(fs, p->data[3] + ps * x + y * p->linesize[3], width,
-                         height, p->linesize[3],
-                         2);
+            decode_plane(fs, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], (f->version >= 4 && !f->chroma_planes) ? 1 : 2, 1);
+    } else if (f->colorspace == 0) {
+         decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0]    , width, height, p->linesize[0], 0, 2);
+         decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0] + 1, width, height, p->linesize[0], 1, 2);
+    } else if (f->use32bit) {
+        uint8_t *planes[4] = { p->data[0] + ps * x + y * p->linesize[0],
+                               p->data[1] + ps * x + y * p->linesize[1],
+                               p->data[2] + ps * x + y * p->linesize[2],
+                               p->data[3] + ps * x + y * p->linesize[3] };
+        decode_rgb_frame32(fs, planes, width, height, p->linesize);
     } else {
-        uint8_t *planes[3] = { p->data[0] + ps * x + y * p->linesize[0],
+        uint8_t *planes[4] = { p->data[0] + ps * x + y * p->linesize[0],
                                p->data[1] + ps * x + y * p->linesize[1],
-                               p->data[2] + ps * x + y * p->linesize[2] };
+                               p->data[2] + ps * x + y * p->linesize[2],
+                               p->data[3] + ps * x + y * p->linesize[3] };
         decode_rgb_frame(fs, planes, width, height, p->linesize);
     }
     if (fs->ac != AC_GOLOMB_RICE && f->version > 2) {
         int v;
         get_rac(&fs->c, (uint8_t[]) { 129 });
-        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5 * f->ec;
+        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5*f->ec;
         if (v) {
-            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n",
-                   v);
+            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n", v);
             fs->slice_damaged = 1;
         }
     }
 
     emms_c();
 
+    ff_thread_report_progress(&f->picture, si, 0);
+
     return 0;
 }
 
@@ -422,10 +374,10 @@ static int read_quant_table(RangeCoder *c, int16_t *quant_table, int scale)
     memset(state, 128, sizeof(state));
 
     for (v = 0; i < 128; v++) {
-        unsigned len = get_symbol(c, state, 0) + 1;
+        unsigned len = get_symbol(c, state, 0) + 1U;
 
-        if (len > 128 - i)
-            return -1;
+        if (len > 128 - i || !len)
+            return AVERROR_INVALIDDATA;
 
         while (len--) {
             quant_table[i] = scale * v;
@@ -447,9 +399,12 @@ static int read_quant_tables(RangeCoder *c,
     int context_count = 1;
 
     for (i = 0; i < 5; i++) {
-        context_count *= read_quant_table(c, quant_table[i], context_count);
+        int ret = read_quant_table(c, quant_table[i], context_count);
+        if (ret < 0)
+            return ret;
+        context_count *= ret;
         if (context_count > 32768U) {
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     }
     return (context_count + 1) / 2;
@@ -461,6 +416,7 @@ static int read_extra_header(FFV1Context *f)
     uint8_t state[CONTEXT_SIZE];
     int i, j, k, ret;
     uint8_t state2[32][CONTEXT_SIZE];
+    unsigned crc = 0;
 
     memset(state2, 128, sizeof(state2));
     memset(state, 128, sizeof(state));
@@ -469,9 +425,15 @@ static int read_extra_header(FFV1Context *f)
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
     f->version = get_symbol(c, state, 0);
+    if (f->version < 2) {
+        av_log(f->avctx, AV_LOG_ERROR, "Invalid version in global header\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (f->version > 2) {
         c->bytestream_end -= 4;
-        f->minor_version   = get_symbol(c, state, 0);
+        f->micro_version = get_symbol(c, state, 0);
+        if (f->micro_version < 0)
+            return AVERROR_INVALIDDATA;
     }
     f->ac = get_symbol(c, state, 0);
 
@@ -486,19 +448,30 @@ static int read_extra_header(FFV1Context *f)
     f->chroma_h_shift             = get_symbol(c, state, 0);
     f->chroma_v_shift             = get_symbol(c, state, 0);
     f->transparency               = get_rac(c, state);
-    f->plane_count                = 2 + f->transparency;
+    f->plane_count                = 1 + (f->chroma_planes || f->version<4) + f->transparency;
     f->num_h_slices               = 1 + get_symbol(c, state, 0);
     f->num_v_slices               = 1 + get_symbol(c, state, 0);
 
-    if (f->num_h_slices > (unsigned)f->width ||
-        f->num_v_slices > (unsigned)f->height) {
-        av_log(f->avctx, AV_LOG_ERROR, "too many slices\n");
+    if (f->chroma_h_shift > 4U || f->chroma_v_shift > 4U) {
+        av_log(f->avctx, AV_LOG_ERROR, "chroma shift parameters %d %d are invalid\n",
+               f->chroma_h_shift, f->chroma_v_shift);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (f->num_h_slices > (unsigned)f->width  || !f->num_h_slices ||
+        f->num_v_slices > (unsigned)f->height || !f->num_v_slices
+       ) {
+        av_log(f->avctx, AV_LOG_ERROR, "slice count invalid\n");
         return AVERROR_INVALIDDATA;
     }
 
     f->quant_table_count = get_symbol(c, state, 0);
-    if (f->quant_table_count > (unsigned)MAX_QUANT_TABLES)
+    if (f->quant_table_count > (unsigned)MAX_QUANT_TABLES || !f->quant_table_count) {
+        av_log(f->avctx, AV_LOG_ERROR, "quant table count %d is invalid\n", f->quant_table_count);
+        f->quant_table_count = 0;
         return AVERROR_INVALIDDATA;
+    }
+
     for (i = 0; i < f->quant_table_count; i++) {
         f->context_count[i] = read_quant_tables(c, f->quant_tables[i]);
         if (f->context_count[i] < 0) {
@@ -506,7 +479,7 @@ static int read_extra_header(FFV1Context *f)
             return AVERROR_INVALIDDATA;
         }
     }
-    if ((ret = ffv1_allocate_initial_states(f)) < 0)
+    if ((ret = ff_ffv1_allocate_initial_states(f)) < 0)
         return ret;
 
     for (i = 0; i < f->quant_table_count; i++)
@@ -521,53 +494,66 @@ static int read_extra_header(FFV1Context *f)
 
     if (f->version > 2) {
         f->ec = get_symbol(c, state, 0);
+        if (f->micro_version > 2)
+            f->intra = get_symbol(c, state, 0);
     }
 
     if (f->version > 2) {
         unsigned v;
         v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0,
                    f->avctx->extradata, f->avctx->extradata_size);
-        if (v) {
+        if (v || f->avctx->extradata_size < 4) {
             av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", v);
             return AVERROR_INVALIDDATA;
         }
+        crc = AV_RB32(f->avctx->extradata + f->avctx->extradata_size - 4);
     }
 
-    av_log(f->avctx, AV_LOG_VERBOSE,
-           "FFV1 version %d.%d colorspace %d - %d bits - %d/%d planes, %s transparent - tile geometry %dx%d - %s\n",
-           f->version, f->minor_version, f->colorspace, f->avctx->bits_per_raw_sample,
-           f->plane_count, f->chroma_planes, f->transparency ? "" : "not",
-           f->num_h_slices, f->num_v_slices,
-           f->ec ? "per-slice crc" : "no crc");
-
+    if (f->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(f->avctx, AV_LOG_DEBUG,
+               "global: ver:%d.%d, coder:%d, colorspace: %d bpr:%d chroma:%d(%d:%d), alpha:%d slices:%dx%d qtabs:%d ec:%d intra:%d CRC:0x%08X\n",
+               f->version, f->micro_version,
+               f->ac,
+               f->colorspace,
+               f->avctx->bits_per_raw_sample,
+               f->chroma_planes, f->chroma_h_shift, f->chroma_v_shift,
+               f->transparency,
+               f->num_h_slices, f->num_v_slices,
+               f->quant_table_count,
+               f->ec,
+               f->intra,
+               crc
+              );
     return 0;
 }
 
-
 static int read_header(FFV1Context *f)
 {
     uint8_t state[CONTEXT_SIZE];
-    int i, j, context_count = -1;
+    int i, j, context_count = -1; //-1 to avoid warning
     RangeCoder *const c = &f->slice_context[0]->c;
 
     memset(state, 128, sizeof(state));
 
     if (f->version < 2) {
         int chroma_planes, chroma_h_shift, chroma_v_shift, transparency, colorspace, bits_per_raw_sample;
-        unsigned v = get_symbol(c, state, 0);
-        if (v > 1) {
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "invalid version %d in version 1 header\n", v);
+        unsigned v= get_symbol(c, state, 0);
+        if (v >= 2) {
+            av_log(f->avctx, AV_LOG_ERROR, "invalid version %d in ver01 header\n", v);
             return AVERROR_INVALIDDATA;
         }
         f->version = v;
-
         f->ac = get_symbol(c, state, 0);
 
         if (f->ac == AC_RANGE_CUSTOM_TAB) {
-            for (i = 1; i < 256; i++)
-                f->state_transition[i] =
-                    get_symbol(c, state, 1) + c->one_state[i];
+            for (i = 1; i < 256; i++) {
+                int st = get_symbol(c, state, 1) + c->one_state[i];
+                if (st < 1 || st > 255) {
+                    av_log(f->avctx, AV_LOG_ERROR, "invalid state transition %d\n", st);
+                    return AVERROR_INVALIDDATA;
+                }
+                f->state_transition[i] = st;
+            }
         }
 
         colorspace          = get_symbol(c, state, 0); //YUV cs type
@@ -576,6 +562,8 @@ static int read_header(FFV1Context *f)
         chroma_h_shift      = get_symbol(c, state, 0);
         chroma_v_shift      = get_symbol(c, state, 0);
         transparency        = get_rac(c, state);
+        if (colorspace == 0 && f->avctx->skip_alpha)
+            transparency = 0;
 
         if (f->plane_count) {
             if (colorspace          != f->colorspace                 ||
@@ -589,6 +577,12 @@ static int read_header(FFV1Context *f)
             }
         }
 
+        if (chroma_h_shift > 4U || chroma_v_shift > 4U) {
+            av_log(f->avctx, AV_LOG_ERROR, "chroma shift parameters %d %d are invalid\n",
+                   chroma_h_shift, chroma_v_shift);
+            return AVERROR_INVALIDDATA;
+        }
+
         f->colorspace                 = colorspace;
         f->avctx->bits_per_raw_sample = bits_per_raw_sample;
         f->chroma_planes              = chroma_planes;
@@ -600,102 +594,102 @@ static int read_header(FFV1Context *f)
     }
 
     if (f->colorspace == 0) {
-        if (f->transparency && f->avctx->bits_per_raw_sample > 8) {
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "Transparency not supported for bit depth %d\n",
-                   f->avctx->bits_per_raw_sample);
-            return AVERROR(ENOSYS);
-        }
         if (!f->transparency && !f->chroma_planes) {
             if (f->avctx->bits_per_raw_sample <= 8)
                 f->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-            else
+            else if (f->avctx->bits_per_raw_sample == 9) {
+                f->packed_at_lsb = 1;
+                f->avctx->pix_fmt = AV_PIX_FMT_GRAY9;
+            } else if (f->avctx->bits_per_raw_sample == 10) {
+                f->packed_at_lsb = 1;
+                f->avctx->pix_fmt = AV_PIX_FMT_GRAY10;
+            } else if (f->avctx->bits_per_raw_sample == 12) {
+                f->packed_at_lsb = 1;
+                f->avctx->pix_fmt = AV_PIX_FMT_GRAY12;
+            } else if (f->avctx->bits_per_raw_sample == 16) {
+                f->packed_at_lsb = 1;
                 f->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
-        } else if (f->avctx->bits_per_raw_sample <= 8 && !f->transparency) {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P;
-                break;
-            case 0x01:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV440P;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-                break;
-            case 0x20:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV411P;
-                break;
-            case 0x22:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV410P;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
+            } else if (f->avctx->bits_per_raw_sample < 16) {
+                f->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            } else
+                return AVERROR(ENOSYS);
+        } else if (f->transparency && !f->chroma_planes) {
+            if (f->avctx->bits_per_raw_sample <= 8)
+                f->avctx->pix_fmt = AV_PIX_FMT_YA8;
+            else
                 return AVERROR(ENOSYS);
+        } else if (f->avctx->bits_per_raw_sample<=8 && !f->transparency) {
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P; break;
+            case 0x01: f->avctx->pix_fmt = AV_PIX_FMT_YUV440P; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P; break;
+            case 0x20: f->avctx->pix_fmt = AV_PIX_FMT_YUV411P; break;
+            case 0x22: f->avctx->pix_fmt = AV_PIX_FMT_YUV410P; break;
             }
         } else if (f->avctx->bits_per_raw_sample <= 8 && f->transparency) {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16*f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P; break;
             }
-        } else if (f->avctx->bits_per_raw_sample == 9) {
+        } else if (f->avctx->bits_per_raw_sample == 9 && !f->transparency) {
             f->packed_at_lsb = 1;
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P9;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P9;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P9; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P9; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P9; break;
             }
-        } else if (f->avctx->bits_per_raw_sample == 10) {
+        } else if (f->avctx->bits_per_raw_sample == 9 && f->transparency) {
             f->packed_at_lsb = 1;
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P9; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P9; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P9; break;
             }
-        } else {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+        } else if (f->avctx->bits_per_raw_sample == 10 && !f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P10; break;
+            case 0x01: f->avctx->pix_fmt = AV_PIX_FMT_YUV440P10; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P10; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P10; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 10 && f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P10; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P10; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P10; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 12 && !f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P12; break;
+            case 0x01: f->avctx->pix_fmt = AV_PIX_FMT_YUV440P12; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P12; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P12; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 14 && !f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P14; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P14; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P14; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 16 && !f->transparency){
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P16; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P16; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P16; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 16 && f->transparency){
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P16; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P16; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P16; break;
             }
         }
     } else if (f->colorspace == 1) {
@@ -704,32 +698,38 @@ static int read_header(FFV1Context *f)
                    "chroma subsampling not supported in this colorspace\n");
             return AVERROR(ENOSYS);
         }
-        if (f->transparency) {
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "Transparency not supported in this colorspace\n");
-                   return AVERROR(ENOSYS);
-        }
-        switch (f->avctx->bits_per_raw_sample) {
-        case 0:
-        case 8:
+        if (     f->avctx->bits_per_raw_sample <=  8 && !f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+        else if (f->avctx->bits_per_raw_sample <=  8 && f->transparency)
             f->avctx->pix_fmt = AV_PIX_FMT_RGB32;
-            break;
-        case 9:
+        else if (f->avctx->bits_per_raw_sample ==  9 && !f->transparency)
             f->avctx->pix_fmt = AV_PIX_FMT_GBRP9;
-            break;
-        case 10:
+        else if (f->avctx->bits_per_raw_sample == 10 && !f->transparency)
             f->avctx->pix_fmt = AV_PIX_FMT_GBRP10;
-            break;
-        default:
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "bit depth %d not supported\n",
-                   f->avctx->bits_per_raw_sample);
-            return AVERROR(ENOSYS);
+        else if (f->avctx->bits_per_raw_sample == 10 && f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        else if (f->avctx->bits_per_raw_sample == 12 && !f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        else if (f->avctx->bits_per_raw_sample == 12 && f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRAP12;
+        else if (f->avctx->bits_per_raw_sample == 14 && !f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP14;
+        else if (f->avctx->bits_per_raw_sample == 16 && !f->transparency) {
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP16;
+            f->use32bit = 1;
+        }
+        else if (f->avctx->bits_per_raw_sample == 16 && f->transparency) {
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRAP16;
+            f->use32bit = 1;
         }
     } else {
         av_log(f->avctx, AV_LOG_ERROR, "colorspace not supported\n");
         return AVERROR(ENOSYS);
     }
+    if (f->avctx->pix_fmt == AV_PIX_FMT_NONE) {
+        av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
+        return AVERROR(ENOSYS);
+    }
 
     ff_dlog(f->avctx, "%d %d %d\n",
             f->chroma_h_shift, f->chroma_v_shift, f->avctx->pix_fmt);
@@ -739,23 +739,23 @@ static int read_header(FFV1Context *f)
             av_log(f->avctx, AV_LOG_ERROR, "read_quant_table error\n");
             return AVERROR_INVALIDDATA;
         }
+        f->slice_count = f->max_slice_count;
     } else if (f->version < 3) {
         f->slice_count = get_symbol(c, state, 0);
     } else {
         const uint8_t *p = c->bytestream_end;
         for (f->slice_count = 0;
-             f->slice_count < MAX_SLICES && 3 < p - c->bytestream_start;
+             f->slice_count < MAX_SLICES && 3 + 5*!!f->ec < p - c->bytestream_start;
              f->slice_count++) {
-            int trailer = 3 + 5 * !!f->ec;
-            int size    = AV_RB24(p - trailer);
+            int trailer = 3 + 5*!!f->ec;
+            int size = AV_RB24(p-trailer);
             if (size + trailer > p - c->bytestream_start)
                 break;
             p -= size + trailer;
         }
     }
-    if (f->slice_count > (unsigned)MAX_SLICES || f->slice_count <= 0) {
-        av_log(f->avctx, AV_LOG_ERROR, "slice count %d is invalid\n",
-               f->slice_count);
+    if (f->slice_count > (unsigned)MAX_SLICES || f->slice_count <= 0 || f->slice_count > f->max_slice_count) {
+        av_log(f->avctx, AV_LOG_ERROR, "slice count %d is invalid (max=%d)\n", f->slice_count, f->max_slice_count);
         return AVERROR_INVALIDDATA;
     }
 
@@ -767,23 +767,20 @@ static int read_header(FFV1Context *f)
         fs->slice_damaged = 0;
 
         if (f->version == 2) {
-            fs->slice_x     = get_symbol(c, state, 0) * f->width;
-            fs->slice_y     = get_symbol(c, state, 0) * f->height;
-            fs->slice_width =
-                (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
-            fs->slice_height =
-                (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
-
-            fs->slice_x      /= f->num_h_slices;
-            fs->slice_y      /= f->num_v_slices;
+            fs->slice_x      =  get_symbol(c, state, 0)      * f->width ;
+            fs->slice_y      =  get_symbol(c, state, 0)      * f->height;
+            fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width  + fs->slice_x;
+            fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
+
+            fs->slice_x     /= f->num_h_slices;
+            fs->slice_y     /= f->num_v_slices;
             fs->slice_width  = fs->slice_width  / f->num_h_slices - fs->slice_x;
             fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
-            if ((unsigned)fs->slice_width > f->width ||
+            if ((unsigned)fs->slice_width  > f->width ||
                 (unsigned)fs->slice_height > f->height)
                 return AVERROR_INVALIDDATA;
-            if ((unsigned)fs->slice_x + (uint64_t)fs->slice_width > f->width
-                || (unsigned)fs->slice_y + (uint64_t)fs->slice_height >
-                f->height)
+            if (   (unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width
+                || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
                 return AVERROR_INVALIDDATA;
         }
 
@@ -818,28 +815,26 @@ static int read_header(FFV1Context *f)
     return 0;
 }
 
-static av_cold int ffv1_decode_init(AVCodecContext *avctx)
+static av_cold int decode_init(AVCodecContext *avctx)
 {
     FFV1Context *f = avctx->priv_data;
     int ret;
 
-    ffv1_common_init(avctx);
-
-    f->last_picture = av_frame_alloc();
-    if (!f->last_picture)
-        return AVERROR(ENOMEM);
+    if ((ret = ff_ffv1_common_init(avctx)) < 0)
+        return ret;
 
-    if (avctx->extradata && (ret = read_extra_header(f)) < 0)
+    if (avctx->extradata_size > 0 && (ret = read_extra_header(f)) < 0)
         return ret;
 
-    if ((ret = ffv1_init_slice_contexts(f)) < 0)
+    if ((ret = ff_ffv1_init_slice_contexts(f)) < 0)
         return ret;
 
+    avctx->internal->allocate_progress = 1;
+
     return 0;
 }
 
-static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
-                             int *got_frame, AVPacket *avpkt)
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     uint8_t *buf        = avpkt->data;
     int buf_size        = avpkt->size;
@@ -848,10 +843,22 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
     int i, ret;
     uint8_t keystate = 128;
     uint8_t *buf_p;
-    AVFrame *const p    = data;
+    AVFrame *p;
+
+    if (f->last_picture.f)
+        ff_thread_release_buffer(avctx, &f->last_picture);
+    FFSWAP(ThreadFrame, f->picture, f->last_picture);
 
-    f->cur = p;
+    f->cur = p = f->picture.f;
 
+    if (f->version < 3 && avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        /* we have interlaced material flagged in container */
+        p->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            p->top_field_first = 1;
+    }
+
+    f->avctx = avctx;
     ff_init_range_decoder(c, buf, buf_size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
@@ -871,29 +878,26 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         p->key_frame = 0;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &f->picture, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(avctx, AV_LOG_DEBUG,
-               "ver:%d keyframe:%d coder:%d ec:%d slices:%d bps:%d\n",
-               f->version, p->key_frame, f->ac, f->ec, f->slice_count,
-               f->avctx->bits_per_raw_sample);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%d keyframe:%d coder:%d ec:%d slices:%d bps:%d\n",
+               f->version, p->key_frame, f->ac, f->ec, f->slice_count, f->avctx->bits_per_raw_sample);
+
+    ff_thread_finish_setup(avctx);
 
     buf_p = buf + buf_size;
     for (i = f->slice_count - 1; i >= 0; i--) {
         FFV1Context *fs = f->slice_context[i];
-        int trailer     = 3 + 5 * !!f->ec;
+        int trailer = 3 + 5*!!f->ec;
         int v;
 
-        if (i || f->version > 2)
-            v = AV_RB24(buf_p - trailer) + trailer;
-        else
-            v = buf_p - c->bytestream_start;
+        if (i || f->version > 2) v = AV_RB24(buf_p-trailer) + trailer;
+        else                     v = buf_p - c->bytestream_start;
         if (buf_p - c->bytestream_start < v) {
             av_log(avctx, AV_LOG_ERROR, "Slice pointer chain broken\n");
+            ff_thread_report_progress(&f->picture, INT_MAX, 0);
             return AVERROR_INVALIDDATA;
         }
         buf_p -= v;
@@ -901,9 +905,20 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         if (f->ec) {
             unsigned crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, buf_p, v);
             if (crc) {
-                av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", crc);
+                int64_t ts = avpkt->pts != AV_NOPTS_VALUE ? avpkt->pts : avpkt->dts;
+                av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!", crc);
+                if (ts != AV_NOPTS_VALUE && avctx->pkt_timebase.num) {
+                    av_log(f->avctx, AV_LOG_ERROR, "at %f seconds\n", ts*av_q2d(avctx->pkt_timebase));
+                } else if (ts != AV_NOPTS_VALUE) {
+                    av_log(f->avctx, AV_LOG_ERROR, "at %"PRId64"\n", ts);
+                } else {
+                    av_log(f->avctx, AV_LOG_ERROR, "\n");
+                }
                 fs->slice_damaged = 1;
             }
+            if (avctx->debug & FF_DEBUG_PICT_INFO) {
+                av_log(avctx, AV_LOG_DEBUG, "slice %d, CRC: 0x%08"PRIX32"\n", i, AV_RB32(buf_p + v - 4));
+            }
         }
 
         if (i) {
@@ -911,57 +926,163 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         } else
             fs->c.bytestream_end = buf_p + v;
 
+        fs->avctx = avctx;
         fs->cur = p;
     }
 
-    avctx->execute(avctx, decode_slice, &f->slice_context[0], NULL,
+    avctx->execute(avctx,
+                   decode_slice,
+                   &f->slice_context[0],
+                   NULL,
                    f->slice_count,
-                   sizeof(void *));
+                   sizeof(void*));
 
     for (i = f->slice_count - 1; i >= 0; i--) {
         FFV1Context *fs = f->slice_context[i];
         int j;
-        if (fs->slice_damaged && f->last_picture->data[0]) {
+        if (fs->slice_damaged && f->last_picture.f->data[0]) {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
             const uint8_t *src[4];
             uint8_t *dst[4];
-            for (j = 0; j < 4; j++) {
+            ff_thread_await_progress(&f->last_picture, INT_MAX, 0);
+            for (j = 0; j < desc->nb_components; j++) {
+                int pixshift = desc->comp[j].depth > 8;
                 int sh = (j == 1 || j == 2) ? f->chroma_h_shift : 0;
                 int sv = (j == 1 || j == 2) ? f->chroma_v_shift : 0;
                 dst[j] = p->data[j] + p->linesize[j] *
-                         (fs->slice_y >> sv) + (fs->slice_x >> sh);
-                src[j] = f->last_picture->data[j] +
-                         f->last_picture->linesize[j] *
-                         (fs->slice_y >> sv) + (fs->slice_x >> sh);
+                         (fs->slice_y >> sv) + ((fs->slice_x >> sh) << pixshift);
+                src[j] = f->last_picture.f->data[j] + f->last_picture.f->linesize[j] *
+                         (fs->slice_y >> sv) + ((fs->slice_x >> sh) << pixshift);
+
+            }
+            if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
+                desc->flags & FF_PSEUDOPAL) {
+                dst[1] = p->data[1];
+                src[1] = f->last_picture.f->data[1];
             }
             av_image_copy(dst, p->linesize, src,
-                          f->last_picture->linesize,
-                          avctx->pix_fmt, fs->slice_width,
+                          f->last_picture.f->linesize,
+                          avctx->pix_fmt,
+                          fs->slice_width,
                           fs->slice_height);
         }
     }
+    ff_thread_report_progress(&f->picture, INT_MAX, 0);
 
     f->picture_number++;
 
-    av_frame_unref(f->last_picture);
-    if ((ret = av_frame_ref(f->last_picture, p)) < 0)
-        return ret;
+    if (f->last_picture.f)
+        ff_thread_release_buffer(avctx, &f->last_picture);
     f->cur = NULL;
+    if ((ret = av_frame_ref(data, f->picture.f)) < 0)
+        return ret;
 
     *got_frame = 1;
 
     return buf_size;
 }
 
-static av_cold int ffv1_decode_close(AVCodecContext *avctx)
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
 {
-    FFV1Context *s = avctx->priv_data;;
+    FFV1Context *f = avctx->priv_data;
+    int i, ret;
 
-    av_frame_free(&s->last_picture);
+    f->picture.f      = NULL;
+    f->last_picture.f = NULL;
+    f->sample_buffer  = NULL;
+    f->max_slice_count = 0;
+    f->slice_count = 0;
+
+    for (i = 0; i < f->quant_table_count; i++) {
+        av_assert0(f->version > 1);
+        f->initial_states[i] = av_memdup(f->initial_states[i],
+                                         f->context_count[i] * sizeof(*f->initial_states[i]));
+    }
+
+    f->picture.f      = av_frame_alloc();
+    f->last_picture.f = av_frame_alloc();
+
+    if ((ret = ff_ffv1_init_slice_contexts(f)) < 0)
+        return ret;
+
+    return 0;
+}
+#endif
+
+static void copy_fields(FFV1Context *fsdst, FFV1Context *fssrc, FFV1Context *fsrc)
+{
+    fsdst->version             = fsrc->version;
+    fsdst->micro_version       = fsrc->micro_version;
+    fsdst->chroma_planes       = fsrc->chroma_planes;
+    fsdst->chroma_h_shift      = fsrc->chroma_h_shift;
+    fsdst->chroma_v_shift      = fsrc->chroma_v_shift;
+    fsdst->transparency        = fsrc->transparency;
+    fsdst->plane_count         = fsrc->plane_count;
+    fsdst->ac                  = fsrc->ac;
+    fsdst->colorspace          = fsrc->colorspace;
+
+    fsdst->ec                  = fsrc->ec;
+    fsdst->intra               = fsrc->intra;
+    fsdst->slice_damaged       = fssrc->slice_damaged;
+    fsdst->key_frame_ok        = fsrc->key_frame_ok;
+
+    fsdst->bits_per_raw_sample = fsrc->bits_per_raw_sample;
+    fsdst->packed_at_lsb       = fsrc->packed_at_lsb;
+    fsdst->slice_count         = fsrc->slice_count;
+    if (fsrc->version<3){
+        fsdst->slice_x             = fssrc->slice_x;
+        fsdst->slice_y             = fssrc->slice_y;
+        fsdst->slice_width         = fssrc->slice_width;
+        fsdst->slice_height        = fssrc->slice_height;
+    }
+}
+
+#if HAVE_THREADS
+static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    FFV1Context *fsrc = src->priv_data;
+    FFV1Context *fdst = dst->priv_data;
+    int i, ret;
+
+    if (dst == src)
+        return 0;
+
+    {
+        ThreadFrame picture = fdst->picture, last_picture = fdst->last_picture;
+        uint8_t (*initial_states[MAX_QUANT_TABLES])[32];
+        struct FFV1Context *slice_context[MAX_SLICES];
+        memcpy(initial_states, fdst->initial_states, sizeof(fdst->initial_states));
+        memcpy(slice_context,  fdst->slice_context , sizeof(fdst->slice_context));
+
+        memcpy(fdst, fsrc, sizeof(*fdst));
+        memcpy(fdst->initial_states, initial_states, sizeof(fdst->initial_states));
+        memcpy(fdst->slice_context,  slice_context , sizeof(fdst->slice_context));
+        fdst->picture      = picture;
+        fdst->last_picture = last_picture;
+        for (i = 0; i<fdst->num_h_slices * fdst->num_v_slices; i++) {
+            FFV1Context *fssrc = fsrc->slice_context[i];
+            FFV1Context *fsdst = fdst->slice_context[i];
+            copy_fields(fsdst, fssrc, fsrc);
+        }
+        av_assert0(!fdst->plane[0].state);
+        av_assert0(!fdst->sample_buffer);
+    }
+
+    av_assert1(fdst->max_slice_count == fsrc->max_slice_count);
+
+
+    ff_thread_release_buffer(dst, &fdst->picture);
+    if (fsrc->picture.f->data[0]) {
+        if ((ret = ff_thread_ref_frame(&fdst->picture, &fsrc->picture)) < 0)
+            return ret;
+    }
 
-    ffv1_close(avctx);
+    fdst->fsrc = fsrc;
 
     return 0;
 }
+#endif
 
 AVCodec ff_ffv1_decoder = {
     .name           = "ffv1",
@@ -969,9 +1090,12 @@ AVCodec ff_ffv1_decoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
-    .init           = ffv1_decode_init,
-    .close          = ffv1_decode_close,
-    .decode         = ffv1_decode_frame,
+    .init           = decode_init,
+    .close          = ff_ffv1_close,
+    .decode         = decode_frame,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
     .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/ |
-                      AV_CODEC_CAP_SLICE_THREADS,
+                      AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP
 };
diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
new file mode 100644
index 0000000..25032fe
--- /dev/null
+++ b/libavcodec/ffv1dec_template.c
@@ -0,0 +1,180 @@
+/*
+ * FFV1 decoder template
+ *
+ * Copyright (c) 2003-2016 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static av_always_inline int RENAME(decode_line)(FFV1Context *s, int w,
+                                                 TYPE *sample[2],
+                                                 int plane_index, int bits)
+{
+    PlaneContext *const p = &s->plane[plane_index];
+    RangeCoder *const c   = &s->c;
+    int x;
+    int run_count = 0;
+    int run_mode  = 0;
+    int run_index = s->run_index;
+
+    if (is_input_end(s))
+        return AVERROR_INVALIDDATA;
+
+    if (s->slice_coding_mode == 1) {
+        int i;
+        for (x = 0; x < w; x++) {
+            int v = 0;
+            for (i=0; i<bits; i++) {
+                uint8_t state = 128;
+                v += v + get_rac(c, &state);
+            }
+            sample[1][x] = v;
+        }
+        return 0;
+    }
+
+    for (x = 0; x < w; x++) {
+        int diff, context, sign;
+
+        if (!(x & 1023)) {
+            if (is_input_end(s))
+                return AVERROR_INVALIDDATA;
+        }
+
+        context = RENAME(get_context)(p, sample[1] + x, sample[0] + x, sample[1] + x);
+        if (context < 0) {
+            context = -context;
+            sign    = 1;
+        } else
+            sign = 0;
+
+        av_assert2(context < p->context_count);
+
+        if (s->ac != AC_GOLOMB_RICE) {
+            diff = get_symbol_inline(c, p->state[context], 1);
+        } else {
+            if (context == 0 && run_mode == 0)
+                run_mode = 1;
+
+            if (run_mode) {
+                if (run_count == 0 && run_mode == 1) {
+                    if (get_bits1(&s->gb)) {
+                        run_count = 1 << ff_log2_run[run_index];
+                        if (x + run_count <= w)
+                            run_index++;
+                    } else {
+                        if (ff_log2_run[run_index])
+                            run_count = get_bits(&s->gb, ff_log2_run[run_index]);
+                        else
+                            run_count = 0;
+                        if (run_index)
+                            run_index--;
+                        run_mode = 2;
+                    }
+                }
+                run_count--;
+                if (run_count < 0) {
+                    run_mode  = 0;
+                    run_count = 0;
+                    diff      = get_vlc_symbol(&s->gb, &p->vlc_state[context],
+                                               bits);
+                    if (diff >= 0)
+                        diff++;
+                } else
+                    diff = 0;
+            } else
+                diff = get_vlc_symbol(&s->gb, &p->vlc_state[context], bits);
+
+            ff_dlog(s->avctx, "count:%d index:%d, mode:%d, x:%d pos:%d\n",
+                    run_count, run_index, run_mode, x, get_bits_count(&s->gb));
+        }
+
+        if (sign)
+            diff = -(unsigned)diff;
+
+        sample[1][x] = av_mod_uintp2(RENAME(predict)(sample[1] + x, sample[0] + x) + (SUINT)diff, bits);
+    }
+    s->run_index = run_index;
+    return 0;
+}
+
+static int RENAME(decode_rgb_frame)(FFV1Context *s, uint8_t *src[4], int w, int h, int stride[4])
+{
+    int x, y, p;
+    TYPE *sample[4][2];
+    int lbd    = s->avctx->bits_per_raw_sample <= 8;
+    int bits   = s->avctx->bits_per_raw_sample > 0 ? s->avctx->bits_per_raw_sample : 8;
+    int offset = 1 << bits;
+    int transparency = s->transparency;
+
+    for (x = 0; x < 4; x++) {
+        sample[x][0] = RENAME(s->sample_buffer) +  x * 2      * (w + 6) + 3;
+        sample[x][1] = RENAME(s->sample_buffer) + (x * 2 + 1) * (w + 6) + 3;
+    }
+
+    s->run_index = 0;
+
+    memset(RENAME(s->sample_buffer), 0, 8 * (w + 6) * sizeof(*RENAME(s->sample_buffer)));
+
+    for (y = 0; y < h; y++) {
+        for (p = 0; p < 3 + transparency; p++) {
+            int ret;
+            TYPE *temp = sample[p][0]; // FIXME: try a normal buffer
+
+            sample[p][0] = sample[p][1];
+            sample[p][1] = temp;
+
+            sample[p][1][-1]= sample[p][0][0  ];
+            sample[p][0][ w]= sample[p][0][w-1];
+            if (lbd && s->slice_coding_mode == 0)
+                ret = RENAME(decode_line)(s, w, sample[p], (p + 1)/2, 9);
+            else
+                ret = RENAME(decode_line)(s, w, sample[p], (p + 1)/2, bits + (s->slice_coding_mode != 1));
+            if (ret < 0)
+                return ret;
+        }
+        for (x = 0; x < w; x++) {
+            int g = sample[0][1][x];
+            int b = sample[1][1][x];
+            int r = sample[2][1][x];
+            int a = sample[3][1][x];
+
+            if (s->slice_coding_mode != 1) {
+                b -= offset;
+                r -= offset;
+                g -= (b * s->slice_rct_by_coef + r * s->slice_rct_ry_coef) >> 2;
+                b += g;
+                r += g;
+            }
+
+            if (lbd)
+                *((uint32_t*)(src[0] + x*4 + stride[0]*y)) = b + ((unsigned)g<<8) + ((unsigned)r<<16) + ((unsigned)a<<24);
+            else if (sizeof(TYPE) == 4 || transparency) {
+                *((uint16_t*)(src[0] + x*2 + stride[0]*y)) = g;
+                *((uint16_t*)(src[1] + x*2 + stride[1]*y)) = b;
+                *((uint16_t*)(src[2] + x*2 + stride[2]*y)) = r;
+                if (transparency)
+                    *((uint16_t*)(src[3] + x*2 + stride[3]*y)) = a;
+            } else {
+                *((uint16_t*)(src[0] + x*2 + stride[0]*y)) = b;
+                *((uint16_t*)(src[1] + x*2 + stride[1]*y)) = g;
+                *((uint16_t*)(src[2] + x*2 + stride[2]*y)) = r;
+            }
+        }
+    }
+    return 0;
+}
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index c5088bb..796d81f 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -1,22 +1,22 @@
 /*
- * FFV1 encoder for libavcodec
+ * FFV1 encoder
  *
- * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,19 +27,115 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/crc.h"
 #include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 
 #include "avcodec.h"
-#include "golomb.h"
 #include "internal.h"
 #include "put_bits.h"
 #include "rangecoder.h"
+#include "golomb.h"
 #include "mathops.h"
 #include "ffv1.h"
 
+static const int8_t quant5_10bit[256] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0,
+};
+
+static const int8_t quant5[256] = {
+     0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1,
+};
+
+static const int8_t quant9_10bit[256] = {
+     0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
+     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+     3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3,
+    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+    -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -0, -0, -0, -0,
+};
+
+static const int8_t quant11[256] = {
+     0,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -1,
+};
+
+static const uint8_t ver2_state[256] = {
+      0,  10,  10,  10,  10,  16,  16,  16, 28,   16,  16,  29,  42,  49,  20,  49,
+     59,  25,  26,  26,  27,  31,  33,  33, 33,   34,  34,  37,  67,  38,  39,  39,
+     40,  40,  41,  79,  43,  44,  45,  45, 48,   48,  64,  50,  51,  52,  88,  52,
+     53,  74,  55,  57,  58,  58,  74,  60, 101,  61,  62,  84,  66,  66,  68,  69,
+     87,  82,  71,  97,  73,  73,  82,  75, 111,  77,  94,  78,  87,  81,  83,  97,
+     85,  83,  94,  86,  99,  89,  90,  99, 111,  92,  93,  134, 95,  98, 105,  98,
+    105, 110, 102, 108, 102, 118, 103, 106, 106, 113, 109, 112, 114, 112, 116, 125,
+    115, 116, 117, 117, 126, 119, 125, 121, 121, 123, 145, 124, 126, 131, 127, 129,
+    165, 130, 132, 138, 133, 135, 145, 136, 137, 139, 146, 141, 143, 142, 144, 148,
+    147, 155, 151, 149, 151, 150, 152, 157, 153, 154, 156, 168, 158, 162, 161, 160,
+    172, 163, 169, 164, 166, 184, 167, 170, 177, 174, 171, 173, 182, 176, 180, 178,
+    175, 189, 179, 181, 186, 183, 192, 185, 200, 187, 191, 188, 190, 197, 193, 196,
+    197, 194, 195, 196, 198, 202, 199, 201, 210, 203, 207, 204, 205, 206, 208, 214,
+    209, 211, 221, 212, 213, 215, 224, 216, 217, 218, 219, 220, 222, 228, 223, 225,
+    226, 224, 227, 229, 240, 230, 231, 232, 233, 234, 235, 236, 238, 239, 237, 242,
+    241, 243, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255,
+};
+
 static void find_best_state(uint8_t best_state[256][256],
                             const uint8_t one_state[256])
 {
@@ -60,12 +156,16 @@ static void find_best_state(uint8_t best_state[256][256],
             double occ[256] = { 0 };
             double len      = 0;
             occ[j] = 1.0;
+
+            if (!one_state[j])
+                continue;
+
             for (k = 0; k < 256; k++) {
                 double newocc[256] = { 0 };
                 for (m = 1; m < 256; m++)
                     if (occ[m]) {
-                        len -= occ[m] *     (p  * l2tab[m] +
-                                        (1 - p) * l2tab[256 - m]);
+                        len -=occ[m]*(     p *l2tab[    m]
+                                      + (1-p)*l2tab[256-m]);
                     }
                 if (len < best_len[k]) {
                     best_len[k]      = len;
@@ -73,7 +173,7 @@ static void find_best_state(uint8_t best_state[256][256],
                 }
                 for (m = 1; m < 256; m++)
                     if (occ[m]) {
-                        newocc[one_state[m]]             += occ[m] * p;
+                        newocc[      one_state[      m]] += occ[m] * p;
                         newocc[256 - one_state[256 - m]] += occ[m] * (1 - p);
                     }
                 memcpy(occ, newocc, sizeof(occ));
@@ -136,6 +236,7 @@ static av_noinline void put_symbol(RangeCoder *c, uint8_t *state,
     put_symbol_inline(c, state, v, is_signed, NULL, NULL);
 }
 
+
 static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
                                   int v, int bits)
 {
@@ -149,7 +250,7 @@ static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
         i += i;
     }
 
-    assert(k <= 13);
+    av_assert2(k <= 13);
 
     code = v ^ ((2 * state->drift + state->count) >> 31);
 
@@ -160,100 +261,20 @@ static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
     update_vlc_state(state, v);
 }
 
-static av_always_inline int encode_line(FFV1Context *s, int w,
-                                        int16_t *sample[3],
-                                        int plane_index, int bits)
-{
-    PlaneContext *const p = &s->plane[plane_index];
-    RangeCoder *const c   = &s->c;
-    int x;
-    int run_index = s->run_index;
-    int run_count = 0;
-    int run_mode  = 0;
-
-    if (s->ac != AC_GOLOMB_RICE) {
-        if (c->bytestream_end - c->bytestream < w * 20) {
-            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
-            return AVERROR_INVALIDDATA;
-        }
-    } else {
-        if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) < w * 4) {
-            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
-            return AVERROR_INVALIDDATA;
-        }
-    }
-
-    for (x = 0; x < w; x++) {
-        int diff, context;
-
-        context = get_context(p, sample[0] + x, sample[1] + x, sample[2] + x);
-        diff    = sample[0][x] - predict(sample[0] + x, sample[1] + x);
-
-        if (context < 0) {
-            context = -context;
-            diff    = -diff;
-        }
+#define TYPE int16_t
+#define RENAME(name) name
+#include "ffv1enc_template.c"
+#undef TYPE
+#undef RENAME
 
-        diff = fold(diff, bits);
+#define TYPE int32_t
+#define RENAME(name) name ## 32
+#include "ffv1enc_template.c"
 
-        if (s->ac != AC_GOLOMB_RICE) {
-            if (s->flags & AV_CODEC_FLAG_PASS1) {
-                put_symbol_inline(c, p->state[context], diff, 1, s->rc_stat,
-                                  s->rc_stat2[p->quant_table_index][context]);
-            } else {
-                put_symbol_inline(c, p->state[context], diff, 1, NULL, NULL);
-            }
-        } else {
-            if (context == 0)
-                run_mode = 1;
-
-            if (run_mode) {
-                if (diff) {
-                    while (run_count >= 1 << ff_log2_run[run_index]) {
-                        run_count -= 1 << ff_log2_run[run_index];
-                        run_index++;
-                        put_bits(&s->pb, 1, 1);
-                    }
-
-                    put_bits(&s->pb, 1 + ff_log2_run[run_index], run_count);
-                    if (run_index)
-                        run_index--;
-                    run_count = 0;
-                    run_mode  = 0;
-                    if (diff > 0)
-                        diff--;
-                } else {
-                    run_count++;
-                }
-            }
-
-            ff_dlog(s->avctx, "count:%d index:%d, mode:%d, x:%d pos:%d\n",
-                    run_count, run_index, run_mode, x,
-                    (int)put_bits_count(&s->pb));
-
-            if (run_mode == 0)
-                put_vlc_symbol(&s->pb, &p->vlc_state[context], diff, bits);
-        }
-    }
-    if (run_mode) {
-        while (run_count >= 1 << ff_log2_run[run_index]) {
-            run_count -= 1 << ff_log2_run[run_index];
-            run_index++;
-            put_bits(&s->pb, 1, 1);
-        }
-
-        if (run_count)
-            put_bits(&s->pb, 1, 1);
-    }
-    s->run_index = run_index;
-
-    return 0;
-}
-
-static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
-                         int stride, int plane_index)
+static int encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
+                         int stride, int plane_index, int pixel_stride)
 {
-    int x, y, i;
+    int x, y, i, ret;
     const int ring_size = s->context_model ? 3 : 2;
     int16_t *sample[3];
     s->run_index = 0;
@@ -264,89 +285,32 @@ static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
         for (i = 0; i < ring_size; i++)
             sample[i] = s->sample_buffer + (w + 6) * ((h + i - y) % ring_size) + 3;
 
-        sample[0][-1] = sample[1][0];
-        sample[1][w]  = sample[1][w - 1];
+        sample[0][-1]= sample[1][0  ];
+        sample[1][ w]= sample[1][w-1];
 // { START_TIMER
         if (s->bits_per_raw_sample <= 8) {
             for (x = 0; x < w; x++)
-                sample[0][x] = src[x + stride * y];
-            encode_line(s, w, sample, plane_index, 8);
+                sample[0][x] = src[x * pixel_stride + stride * y];
+            if((ret = encode_line(s, w, sample, plane_index, 8)) < 0)
+                return ret;
         } else {
             if (s->packed_at_lsb) {
-                for (x = 0; x < w; x++)
-                    sample[0][x] = ((uint16_t *)(src + stride * y))[x];
+                for (x = 0; x < w; x++) {
+                    sample[0][x] = ((uint16_t*)(src + stride*y))[x];
+                }
             } else {
-                for (x = 0; x < w; x++)
-                    sample[0][x] =
-                        ((uint16_t *)(src + stride * y))[x] >> (16 - s->bits_per_raw_sample);
+                for (x = 0; x < w; x++) {
+                    sample[0][x] = ((uint16_t*)(src + stride*y))[x] >> (16 - s->bits_per_raw_sample);
+                }
             }
-            encode_line(s, w, sample, plane_index, s->bits_per_raw_sample);
+            if((ret = encode_line(s, w, sample, plane_index, s->bits_per_raw_sample)) < 0)
+                return ret;
         }
 // STOP_TIMER("encode line") }
     }
+    return 0;
 }
 
-static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
-                             int w, int h, const int stride[3])
-{
-    int x, y, p, i;
-    const int ring_size = s->context_model ? 3 : 2;
-    int16_t *sample[MAX_PLANES][3];
-    int lbd  = s->avctx->bits_per_raw_sample <= 8;
-    int bits = s->avctx->bits_per_raw_sample > 0
-               ? s->avctx->bits_per_raw_sample
-               : 8;
-    int offset = 1 << bits;
-
-    s->run_index = 0;
-
-    memset(s->sample_buffer, 0, ring_size * MAX_PLANES *
-                                (w + 6) * sizeof(*s->sample_buffer));
-
-    for (y = 0; y < h; y++) {
-        for (i = 0; i < ring_size; i++)
-            for (p = 0; p < MAX_PLANES; p++)
-                sample[p][i] = s->sample_buffer + p * ring_size *
-                               (w + 6) +
-                               ((h + i - y) % ring_size) * (w + 6) + 3;
-
-        for (x = 0; x < w; x++) {
-            int b, g, r, av_uninit(a);
-            if (lbd) {
-                unsigned v = *((const uint32_t *)(src[0] + x * 4 + stride[0] * y));
-                b = v & 0xFF;
-                g = (v >> 8) & 0xFF;
-                r = (v >> 16) & 0xFF;
-                a = v >> 24;
-            } else {
-                b = *((const uint16_t *)(src[0] + x * 2 + stride[0] * y));
-                g = *((const uint16_t *)(src[1] + x * 2 + stride[1] * y));
-                r = *((const uint16_t *)(src[2] + x * 2 + stride[2] * y));
-            }
-
-            b -= g;
-            r -= g;
-            g += (b + r) >> 2;
-            b += offset;
-            r += offset;
-
-            sample[0][0][x] = g;
-            sample[1][0][x] = b;
-            sample[2][0][x] = r;
-            sample[3][0][x] = a;
-        }
-        for (p = 0; p < 3 + s->transparency; p++) {
-            sample[p][0][-1] = sample[p][1][0];
-            sample[p][1][w]  = sample[p][1][w - 1];
-            if (lbd)
-                encode_line(s, w, sample[p], (p + 1) / 2, 9);
-            else
-                encode_line(s, w, sample[p], (p + 1) / 2, bits + 1);
-        }
-    }
-}
-
-
 static void write_quant_table(RangeCoder *c, int16_t *quant_table)
 {
     int last = 0;
@@ -373,7 +337,7 @@ static void write_quant_tables(RangeCoder *c,
 static void write_header(FFV1Context *f)
 {
     uint8_t state[CONTEXT_SIZE];
-    int i;
+    int i, j;
     RangeCoder *const c = &f->slice_context[0]->c;
 
     memset(state, 128, sizeof(state));
@@ -386,7 +350,7 @@ static void write_header(FFV1Context *f)
                 put_symbol(c, state,
                            f->state_transition[i] - c->one_state[i], 1);
         }
-        put_symbol(c, state, f->colorspace, 0); // YUV cs type
+        put_symbol(c, state, f->colorspace, 0); //YUV cs type
         if (f->version > 0)
             put_symbol(c, state, f->bits_per_raw_sample, 0);
         put_rac(c, state, f->chroma_planes);
@@ -395,6 +359,25 @@ static void write_header(FFV1Context *f)
         put_rac(c, state, f->transparency);
 
         write_quant_tables(c, f->quant_table);
+    } else if (f->version < 3) {
+        put_symbol(c, state, f->slice_count, 0);
+        for (i = 0; i < f->slice_count; i++) {
+            FFV1Context *fs = f->slice_context[i];
+            put_symbol(c, state,
+                       (fs->slice_x      + 1) * f->num_h_slices / f->width, 0);
+            put_symbol(c, state,
+                       (fs->slice_y      + 1) * f->num_v_slices / f->height, 0);
+            put_symbol(c, state,
+                       (fs->slice_width  + 1) * f->num_h_slices / f->width - 1,
+                       0);
+            put_symbol(c, state,
+                       (fs->slice_height + 1) * f->num_v_slices / f->height - 1,
+                       0);
+            for (j = 0; j < f->plane_count; j++) {
+                put_symbol(c, state, f->plane[j].quant_table_index, 0);
+                av_assert0(f->plane[j].quant_table_index == f->context_model);
+            }
+        }
     }
 }
 
@@ -411,15 +394,19 @@ static int write_extradata(FFV1Context *f)
 
     f->avctx->extradata_size = 10000 + 4 +
                                     (11 * 11 * 5 * 5 * 5 + 11 * 11 * 11) * 32;
-    f->avctx->extradata = av_malloc(f->avctx->extradata_size);
+    f->avctx->extradata = av_malloc(f->avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!f->avctx->extradata)
+        return AVERROR(ENOMEM);
     ff_init_range_encoder(c, f->avctx->extradata, f->avctx->extradata_size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
     put_symbol(c, state, f->version, 0);
-    if (f->version > 1) {
-        if (f->version == 3)
-            f->minor_version = 2;
-        put_symbol(c, state, f->minor_version, 0);
+    if (f->version > 2) {
+        if (f->version == 3) {
+            f->micro_version = 4;
+        } else if (f->version == 4)
+            f->micro_version = 2;
+        put_symbol(c, state, f->micro_version, 0);
     }
 
     put_symbol(c, state, f->ac, 0);
@@ -459,12 +446,11 @@ static int write_extradata(FFV1Context *f)
 
     if (f->version > 2) {
         put_symbol(c, state, f->ec, 0);
+        put_symbol(c, state, f->intra = (f->avctx->gop_size < 2), 0);
     }
 
-    f->avctx->extradata_size = ff_rac_terminate(c);
-
-    v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0,
-               f->avctx->extradata, f->avctx->extradata_size);
+    f->avctx->extradata_size = ff_rac_terminate(c, 0);
+    v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, f->avctx->extradata, f->avctx->extradata_size);
     AV_WL32(f->avctx->extradata + f->avctx->extradata_size, v);
     f->avctx->extradata_size += 4;
 
@@ -489,7 +475,7 @@ static int sort_stt(FFV1Context *s, uint8_t stt[256])
 
                 double size0 = COST2(i,  i) + COST2(i2, i2);
                 double sizeX = COST2(i, i2) + COST2(i2, i);
-                if (sizeX < size0 && i != 128 && i2 != 128) {
+                if (size0 - sizeX > size0*(1e-14) && i != 128 && i2 != 128) {
                     int j;
                     FFSWAP(int, stt[i], stt[i2]);
                     FFSWAP(int, s->rc_stat[i][0], s->rc_stat[i2][0]);
@@ -519,89 +505,102 @@ static int sort_stt(FFV1Context *s, uint8_t stt[256])
     return print;
 }
 
-static av_cold int init_slices_state(FFV1Context *f)
-{
-    int i, ret;
-    for (i = 0; i < f->slice_count; i++) {
-        FFV1Context *fs = f->slice_context[i];
-        if ((ret = ffv1_init_slice_state(f, fs)) < 0)
-            return AVERROR(ENOMEM);
-    }
-    return 0;
-}
-
-static av_cold int ffv1_encode_init(AVCodecContext *avctx)
+static av_cold int encode_init(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
     int i, j, k, m, ret;
 
-    ffv1_common_init(avctx);
+    if ((ret = ff_ffv1_common_init(avctx)) < 0)
+        return ret;
 
     s->version = 0;
 
-    switch (avctx->level) {
-    case 3:
-        break;
-    case 2:
-        av_log(avctx, AV_LOG_ERROR,
-               "Version 2 had been deemed non-standard and deprecated "
-               "the support for it had been removed\n");
-        return AVERROR(ENOSYS);
-    case 1:
-    case 0:
-        if (avctx->flags & (AV_CODEC_FLAG_PASS1 | AV_CODEC_FLAG_PASS2)) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Multiple pass encoding requires version 3.\n");
-            return AVERROR(ENOSYS);
-        }
-        if (avctx->slices > 1) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Multiple slices support requires version 3.\n");
-            return AVERROR(ENOSYS);
+    if ((avctx->flags & (AV_CODEC_FLAG_PASS1 | AV_CODEC_FLAG_PASS2)) ||
+        avctx->slices > 1)
+        s->version = FFMAX(s->version, 2);
+
+    // Unspecified level & slices, we choose version 1.2+ to ensure multithreaded decodability
+    if (avctx->slices == 0 && avctx->level < 0 && avctx->width * avctx->height > 720*576)
+        s->version = FFMAX(s->version, 2);
+
+    if (avctx->level <= 0 && s->version == 2) {
+        s->version = 3;
+    }
+    if (avctx->level >= 0 && avctx->level <= 4) {
+        if (avctx->level < s->version) {
+            av_log(avctx, AV_LOG_ERROR, "Version %d needed for requested features but %d requested\n", s->version, avctx->level);
+            return AVERROR(EINVAL);
         }
-        break;
-    case FF_LEVEL_UNKNOWN:
-        if ((avctx->flags & (AV_CODEC_FLAG_PASS1 | AV_CODEC_FLAG_PASS2)) ||
-            avctx->slices > 1)
-            s->version = 3;
-        else
-            s->version = 0;
-        break;
-    default:
-        av_log(avctx, AV_LOG_ERROR, "Version %d not supported\n",
-               avctx->level);
-        return AVERROR(ENOSYS);
+        s->version = avctx->level;
     }
 
     if (s->ec < 0) {
         s->ec = (s->version >= 3);
     }
 
+    // CRC requires version 3+
+    if (s->ec)
+        s->version = FFMAX(s->version, 3);
+
+    if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_ERROR, "Version 2 needed for requested features but version 2 is experimental and not enabled\n");
+        return AVERROR_INVALIDDATA;
+    }
+
 #if FF_API_CODER_TYPE
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->coder_type != -1)
         s->ac = avctx->coder_type > 0 ? AC_RANGE_CUSTOM_TAB : AC_GOLOMB_RICE;
+    else
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+    if (s->ac == 1) // Compatbility with common command line usage
+        s->ac = AC_RANGE_CUSTOM_TAB;
+    else if (s->ac == AC_RANGE_DEFAULT_TAB_FORCE)
+        s->ac = AC_RANGE_DEFAULT_TAB;
 
     s->plane_count = 3;
-    switch (avctx->pix_fmt) {
+    switch(avctx->pix_fmt) {
+    case AV_PIX_FMT_GRAY9:
     case AV_PIX_FMT_YUV444P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUVA444P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA420P9:
         if (!avctx->bits_per_raw_sample)
             s->bits_per_raw_sample = 9;
+    case AV_PIX_FMT_GRAY10:
     case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV440P10:
     case AV_PIX_FMT_YUV420P10:
     case AV_PIX_FMT_YUV422P10:
-        s->packed_at_lsb = 1;
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA420P10:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 10;
+    case AV_PIX_FMT_GRAY12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV440P12:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 12;
+    case AV_PIX_FMT_YUV444P14:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 14;
+        s->packed_at_lsb = 1;
     case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YUV444P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV420P16:
+    case AV_PIX_FMT_YUVA444P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA420P16:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample) {
             s->bits_per_raw_sample = 16;
         } else if (!s->bits_per_raw_sample) {
@@ -611,94 +610,134 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR, "bits_per_raw_sample invalid\n");
             return AVERROR_INVALIDDATA;
         }
-        if (s->ac == AC_GOLOMB_RICE) {
-            av_log(avctx, AV_LOG_INFO,
-                   "bits_per_raw_sample > 8, forcing range coder\n");
-            s->ac = AC_RANGE_CUSTOM_TAB;
-        }
         s->version = FFMAX(s->version, 1);
     case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_YA8:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV422P:
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV411P:
     case AV_PIX_FMT_YUV410P:
-        s->chroma_planes = desc->nb_components < 3 ? 0 : 1;
-        s->colorspace    = 0;
-        break;
     case AV_PIX_FMT_YUVA444P:
     case AV_PIX_FMT_YUVA422P:
     case AV_PIX_FMT_YUVA420P:
-        s->chroma_planes = 1;
-        s->colorspace    = 0;
-        s->transparency  = 1;
+        s->chroma_planes = desc->nb_components < 3 ? 0 : 1;
+        s->colorspace = 0;
+        s->transparency = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
+        else if (!s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
         break;
     case AV_PIX_FMT_RGB32:
-        s->colorspace   = 1;
+        s->colorspace = 1;
+        s->transparency = 1;
+        s->chroma_planes = 1;
+        s->bits_per_raw_sample = 8;
+        break;
+    case AV_PIX_FMT_RGBA64:
+        s->colorspace = 1;
         s->transparency = 1;
+        s->chroma_planes = 1;
+        s->bits_per_raw_sample = 16;
+        s->use32bit = 1;
+        s->version = FFMAX(s->version, 1);
+        break;
+    case AV_PIX_FMT_RGB48:
+        s->colorspace = 1;
+        s->chroma_planes = 1;
+        s->bits_per_raw_sample = 16;
+        s->use32bit = 1;
+        s->version = FFMAX(s->version, 1);
+        break;
+    case AV_PIX_FMT_0RGB32:
+        s->colorspace = 1;
+        s->chroma_planes = 1;
+        s->bits_per_raw_sample = 8;
         break;
     case AV_PIX_FMT_GBRP9:
         if (!avctx->bits_per_raw_sample)
             s->bits_per_raw_sample = 9;
     case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRAP10:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 10;
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRAP12:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 12;
+    case AV_PIX_FMT_GBRP14:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 14;
     case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_GBRAP16:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 16;
         else if (!s->bits_per_raw_sample)
             s->bits_per_raw_sample = avctx->bits_per_raw_sample;
-        s->colorspace    = 1;
+        s->transparency = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
+        s->colorspace = 1;
         s->chroma_planes = 1;
-        s->version       = FFMAX(s->version, 1);
+        if (s->bits_per_raw_sample >= 16) {
+            s->use32bit = 1;
+        }
+        s->version = FFMAX(s->version, 1);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
-        return AVERROR_INVALIDDATA;
+        return AVERROR(ENOSYS);
     }
-    if (s->transparency) {
-        av_log(
-            avctx, AV_LOG_WARNING,
-            "Storing alpha plane, this will require a recent FFV1 decoder to playback!\n");
+    av_assert0(s->bits_per_raw_sample >= 8);
+
+    if (s->bits_per_raw_sample > 8) {
+        if (s->ac == AC_GOLOMB_RICE) {
+            av_log(avctx, AV_LOG_INFO,
+                    "bits_per_raw_sample > 8, forcing range coder\n");
+            s->ac = AC_RANGE_CUSTOM_TAB;
+        }
     }
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->context_model)
         s->context_model = avctx->context_model;
     if (avctx->context_model > 1U) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid context model %d, valid values are 0 and 1\n",
-               avctx->context_model);
+        av_log(avctx, AV_LOG_ERROR, "Invalid context model %d, valid values are 0 and 1\n", avctx->context_model);
         return AVERROR(EINVAL);
     }
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    if (s->ac == AC_RANGE_CUSTOM_TAB)
+    if (s->ac == AC_RANGE_CUSTOM_TAB) {
         for (i = 1; i < 256; i++)
-            s->state_transition[i] = ffv1_ver2_state[i];
+            s->state_transition[i] = ver2_state[i];
+    } else {
+        RangeCoder c;
+        ff_build_rac_states(&c, 0.05 * (1LL << 32), 256 - 8);
+        for (i = 1; i < 256; i++)
+            s->state_transition[i] = c.one_state[i];
+    }
 
     for (i = 0; i < 256; i++) {
         s->quant_table_count = 2;
         if (s->bits_per_raw_sample <= 8) {
-            s->quant_tables[0][0][i] = ffv1_quant11[i];
-            s->quant_tables[0][1][i] = ffv1_quant11[i] * 11;
-            s->quant_tables[0][2][i] = ffv1_quant11[i] * 11 * 11;
-            s->quant_tables[1][0][i] = ffv1_quant11[i];
-            s->quant_tables[1][1][i] = ffv1_quant11[i] * 11;
-            s->quant_tables[1][2][i] = ffv1_quant5[i]  * 11 * 11;
-            s->quant_tables[1][3][i] = ffv1_quant5[i]  *  5 * 11 * 11;
-            s->quant_tables[1][4][i] = ffv1_quant5[i]  *  5 *  5 * 11 * 11;
+            s->quant_tables[0][0][i]=           quant11[i];
+            s->quant_tables[0][1][i]=        11*quant11[i];
+            s->quant_tables[0][2][i]=     11*11*quant11[i];
+            s->quant_tables[1][0][i]=           quant11[i];
+            s->quant_tables[1][1][i]=        11*quant11[i];
+            s->quant_tables[1][2][i]=     11*11*quant5 [i];
+            s->quant_tables[1][3][i]=   5*11*11*quant5 [i];
+            s->quant_tables[1][4][i]= 5*5*11*11*quant5 [i];
         } else {
-            s->quant_tables[0][0][i] = ffv1_quant9_10bit[i];
-            s->quant_tables[0][1][i] = ffv1_quant9_10bit[i] * 11;
-            s->quant_tables[0][2][i] = ffv1_quant9_10bit[i] * 11 * 11;
-            s->quant_tables[1][0][i] = ffv1_quant9_10bit[i];
-            s->quant_tables[1][1][i] = ffv1_quant9_10bit[i] * 11;
-            s->quant_tables[1][2][i] = ffv1_quant5_10bit[i] * 11 * 11;
-            s->quant_tables[1][3][i] = ffv1_quant5_10bit[i] *  5 * 11 * 11;
-            s->quant_tables[1][4][i] = ffv1_quant5_10bit[i] *  5 *  5 * 11 * 11;
+            s->quant_tables[0][0][i]=           quant9_10bit[i];
+            s->quant_tables[0][1][i]=        11*quant9_10bit[i];
+            s->quant_tables[0][2][i]=     11*11*quant9_10bit[i];
+            s->quant_tables[1][0][i]=           quant9_10bit[i];
+            s->quant_tables[1][1][i]=        11*quant9_10bit[i];
+            s->quant_tables[1][2][i]=     11*11*quant5_10bit[i];
+            s->quant_tables[1][3][i]=   5*11*11*quant5_10bit[i];
+            s->quant_tables[1][4][i]= 5*5*11*11*quant5_10bit[i];
         }
     }
     s->context_count[0] = (11 * 11 * 11        + 1) / 2;
@@ -714,7 +753,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         p->context_count     = s->context_count[p->quant_table_index];
     }
 
-    if ((ret = ffv1_allocate_initial_states(s)) < 0)
+    if ((ret = ff_ffv1_allocate_initial_states(s)) < 0)
         return ret;
 
 #if FF_API_CODED_FRAME
@@ -725,9 +764,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (!s->transparency)
         s->plane_count = 2;
+    if (!s->chroma_planes && s->version > 3)
+        s->plane_count--;
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift,
-                                     &s->chroma_v_shift);
+    ret = av_pix_fmt_get_chroma_sub_sample (avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
+    if (ret)
+        return ret;
 
     s->picture_number = 0;
 
@@ -741,19 +783,22 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
     if (avctx->stats_in) {
         char *p = avctx->stats_in;
-        uint8_t best_state[256][256];
+        uint8_t (*best_state)[256] = av_malloc_array(256, 256);
         int gob_count = 0;
         char *next;
+        if (!best_state)
+            return AVERROR(ENOMEM);
 
-        av_assert0(s->version > 2);
+        av_assert0(s->version >= 2);
 
-        for (;; ) {
+        for (;;) {
             for (j = 0; j < 256; j++)
                 for (i = 0; i < 2; i++) {
                     s->rc_stat[j][i] = strtol(p, &next, 0);
                     if (next == p) {
                         av_log(avctx, AV_LOG_ERROR,
                                "2Pass file invalid at %d %d [%s]\n", j, i, p);
+                        av_freep(&best_state);
                         return AVERROR_INVALIDDATA;
                     }
                     p = next;
@@ -767,6 +812,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                 av_log(avctx, AV_LOG_ERROR,
                                        "2Pass file invalid at %d %d %d %d [%s]\n",
                                        i, j, k, m, p);
+                                av_freep(&best_state);
                                 return AVERROR_INVALIDDATA;
                             }
                             p = next;
@@ -775,6 +821,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             gob_count = strtol(p, &next, 0);
             if (next == p || gob_count <= 0) {
                 av_log(avctx, AV_LOG_ERROR, "2Pass file invalid\n");
+                av_freep(&best_state);
                 return AVERROR_INVALIDDATA;
             }
             p = next;
@@ -783,52 +830,82 @@ FF_ENABLE_DEPRECATION_WARNINGS
             if (p[0] == 0)
                 break;
         }
-        sort_stt(s, s->state_transition);
+        if (s->ac == AC_RANGE_CUSTOM_TAB)
+            sort_stt(s, s->state_transition);
 
         find_best_state(best_state, s->state_transition);
 
         for (i = 0; i < s->quant_table_count; i++) {
-            for (j = 0; j < s->context_count[i]; j++)
-                for (k = 0; k < 32; k++) {
+            for (k = 0; k < 32; k++) {
+                double a=0, b=0;
+                int jp = 0;
+                for (j = 0; j < s->context_count[i]; j++) {
                     double p = 128;
-                    if (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1]) {
-                        p = 256.0 * s->rc_stat2[i][j][k][1] /
-                            (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1]);
+                    if (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1] > 200 && j || a+b > 200) {
+                        if (a+b)
+                            p = 256.0 * b / (a + b);
+                        s->initial_states[i][jp][k] =
+                            best_state[av_clip(round(p), 1, 255)][av_clip_uint8((a + b) / gob_count)];
+                        for(jp++; jp<j; jp++)
+                            s->initial_states[i][jp][k] = s->initial_states[i][jp-1][k];
+                        a=b=0;
+                    }
+                    a += s->rc_stat2[i][j][k][0];
+                    b += s->rc_stat2[i][j][k][1];
+                    if (a+b) {
+                        p = 256.0 * b / (a + b);
                     }
                     s->initial_states[i][j][k] =
-                        best_state[av_clip(round(p), 1, 255)][av_clip((s->rc_stat2[i][j][k][0] +
-                                                                       s->rc_stat2[i][j][k][1]) /
-                                                                      gob_count, 0, 255)];
+                        best_state[av_clip(round(p), 1, 255)][av_clip_uint8((a + b) / gob_count)];
                 }
+            }
         }
+        av_freep(&best_state);
     }
 
     if (s->version > 1) {
-        for (s->num_v_slices = 2; s->num_v_slices < 9; s->num_v_slices++)
-            for (s->num_h_slices = s->num_v_slices;
-                 s->num_h_slices < 2 * s->num_v_slices; s->num_h_slices++)
-                if (avctx->slices == s->num_h_slices * s->num_v_slices &&
-                    avctx->slices <= 64 || !avctx->slices)
+        int plane_count = 1 + 2*s->chroma_planes + s->transparency;
+        int max_h_slices = AV_CEIL_RSHIFT(avctx->width , s->chroma_h_shift);
+        int max_v_slices = AV_CEIL_RSHIFT(avctx->height, s->chroma_v_shift);
+        s->num_v_slices = (avctx->width > 352 || avctx->height > 288 || !avctx->slices) ? 2 : 1;
+
+        s->num_v_slices = FFMIN(s->num_v_slices, max_v_slices);
+
+        for (; s->num_v_slices < 32; s->num_v_slices++) {
+            for (s->num_h_slices = s->num_v_slices; s->num_h_slices < 2*s->num_v_slices; s->num_h_slices++) {
+                int maxw = (avctx->width  + s->num_h_slices - 1) / s->num_h_slices;
+                int maxh = (avctx->height + s->num_v_slices - 1) / s->num_v_slices;
+                if (s->num_h_slices > max_h_slices || s->num_v_slices > max_v_slices)
+                    continue;
+                if (maxw * maxh * (int64_t)(s->bits_per_raw_sample+1) * plane_count > 8<<24)
+                    continue;
+                if (avctx->slices == s->num_h_slices * s->num_v_slices && avctx->slices <= MAX_SLICES || !avctx->slices)
                     goto slices_ok;
+            }
+        }
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported number %d of slices requested, please specify a "
                "supported number with -slices (ex:4,6,9,12,16, ...)\n",
                avctx->slices);
         return AVERROR(ENOSYS);
 slices_ok:
-        write_extradata(s);
+        if ((ret = write_extradata(s)) < 0)
+            return ret;
     }
 
-    if ((ret = ffv1_init_slice_contexts(s)) < 0)
+    if ((ret = ff_ffv1_init_slice_contexts(s)) < 0)
         return ret;
-    if ((ret = init_slices_state(s)) < 0)
+    s->slice_count = s->max_slice_count;
+    if ((ret = ff_ffv1_init_slices_state(s)) < 0)
         return ret;
 
 #define STATS_OUT_SIZE 1024 * 1024 * 6
     if (avctx->flags & AV_CODEC_FLAG_PASS1) {
         avctx->stats_out = av_mallocz(STATS_OUT_SIZE);
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
         for (i = 0; i < s->quant_table_count; i++)
-            for (j = 0; j < s->slice_count; j++) {
+            for (j = 0; j < s->max_slice_count; j++) {
                 FFV1Context *sf = s->slice_context[j];
                 av_assert0(!sf->rc_stat2[i]);
                 sf->rc_stat2[i] = av_mallocz(s->context_count[i] *
@@ -848,23 +925,112 @@ static void encode_slice_header(FFV1Context *f, FFV1Context *fs)
     int j;
     memset(state, 128, sizeof(state));
 
-    put_symbol(c, state, (fs->slice_x + 1) * f->num_h_slices / f->width, 0);
-    put_symbol(c, state, (fs->slice_y + 1) * f->num_v_slices / f->height, 0);
-    put_symbol(c, state, (fs->slice_width + 1) * f->num_h_slices / f->width - 1,
-               0);
-    put_symbol(c, state,
-               (fs->slice_height + 1) * f->num_v_slices / f->height - 1,
-               0);
-    for (j = 0; j < f->plane_count; j++) {
+    put_symbol(c, state, (fs->slice_x     +1)*f->num_h_slices / f->width   , 0);
+    put_symbol(c, state, (fs->slice_y     +1)*f->num_v_slices / f->height  , 0);
+    put_symbol(c, state, (fs->slice_width +1)*f->num_h_slices / f->width -1, 0);
+    put_symbol(c, state, (fs->slice_height+1)*f->num_v_slices / f->height-1, 0);
+    for (j=0; j<f->plane_count; j++) {
         put_symbol(c, state, f->plane[j].quant_table_index, 0);
         av_assert0(f->plane[j].quant_table_index == f->context_model);
     }
-    if (!f->frame->interlaced_frame)
+    if (!f->picture.f->interlaced_frame)
         put_symbol(c, state, 3, 0);
     else
-        put_symbol(c, state, 1 + !f->frame->top_field_first, 0);
-    put_symbol(c, state, f->frame->sample_aspect_ratio.num, 0);
-    put_symbol(c, state, f->frame->sample_aspect_ratio.den, 0);
+        put_symbol(c, state, 1 + !f->picture.f->top_field_first, 0);
+    put_symbol(c, state, f->picture.f->sample_aspect_ratio.num, 0);
+    put_symbol(c, state, f->picture.f->sample_aspect_ratio.den, 0);
+    if (f->version > 3) {
+        put_rac(c, state, fs->slice_coding_mode == 1);
+        if (fs->slice_coding_mode == 1)
+            ff_ffv1_clear_slice_state(f, fs);
+        put_symbol(c, state, fs->slice_coding_mode, 0);
+        if (fs->slice_coding_mode != 1) {
+            put_symbol(c, state, fs->slice_rct_by_coef, 0);
+            put_symbol(c, state, fs->slice_rct_ry_coef, 0);
+        }
+    }
+}
+
+static void choose_rct_params(FFV1Context *fs, const uint8_t *src[3], const int stride[3], int w, int h)
+{
+#define NB_Y_COEFF 15
+    static const int rct_y_coeff[15][2] = {
+        {0, 0}, //      4G
+        {1, 1}, //  R + 2G + B
+        {2, 2}, // 2R      + 2B
+        {0, 2}, //      2G + 2B
+        {2, 0}, // 2R + 2G
+        {4, 0}, // 4R
+        {0, 4}, //           4B
+
+        {0, 3}, //      1G + 3B
+        {3, 0}, // 3R + 1G
+        {3, 1}, // 3R      +  B
+        {1, 3}, //  R      + 3B
+        {1, 2}, //  R +  G + 2B
+        {2, 1}, // 2R +  G +  B
+        {0, 1}, //      3G +  B
+        {1, 0}, //  R + 3G
+    };
+
+    int stat[NB_Y_COEFF] = {0};
+    int x, y, i, p, best;
+    int16_t *sample[3];
+    int lbd = fs->bits_per_raw_sample <= 8;
+
+    for (y = 0; y < h; y++) {
+        int lastr=0, lastg=0, lastb=0;
+        for (p = 0; p < 3; p++)
+            sample[p] = fs->sample_buffer + p*w;
+
+        for (x = 0; x < w; x++) {
+            int b, g, r;
+            int ab, ag, ar;
+            if (lbd) {
+                unsigned v = *((const uint32_t*)(src[0] + x*4 + stride[0]*y));
+                b =  v        & 0xFF;
+                g = (v >>  8) & 0xFF;
+                r = (v >> 16) & 0xFF;
+            } else {
+                b = *((const uint16_t*)(src[0] + x*2 + stride[0]*y));
+                g = *((const uint16_t*)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t*)(src[2] + x*2 + stride[2]*y));
+            }
+
+            ar = r - lastr;
+            ag = g - lastg;
+            ab = b - lastb;
+            if (x && y) {
+                int bg = ag - sample[0][x];
+                int bb = ab - sample[1][x];
+                int br = ar - sample[2][x];
+
+                br -= bg;
+                bb -= bg;
+
+                for (i = 0; i<NB_Y_COEFF; i++) {
+                    stat[i] += FFABS(bg + ((br*rct_y_coeff[i][0] + bb*rct_y_coeff[i][1])>>2));
+                }
+
+            }
+            sample[0][x] = ag;
+            sample[1][x] = ab;
+            sample[2][x] = ar;
+
+            lastr = r;
+            lastg = g;
+            lastb = b;
+        }
+    }
+
+    best = 0;
+    for (i=1; i<NB_Y_COEFF; i++) {
+        if (stat[i] < stat[best])
+            best = i;
+    }
+
+    fs->slice_rct_by_coef = rct_y_coeff[best][1];
+    fs->slice_rct_ry_coef = rct_y_coeff[best][0];
 }
 
 static int encode_slice(AVCodecContext *c, void *arg)
@@ -875,75 +1041,156 @@ static int encode_slice(AVCodecContext *c, void *arg)
     int height       = fs->slice_height;
     int x            = fs->slice_x;
     int y            = fs->slice_y;
-    const AVFrame *const p = f->frame;
-    const int ps     = (av_pix_fmt_desc_get(c->pix_fmt)->flags & AV_PIX_FMT_FLAG_PLANAR)
-                       ? (f->bits_per_raw_sample > 8) + 1
-                       : 4;
+    const AVFrame *const p = f->picture.f;
+    const int ps     = av_pix_fmt_desc_get(c->pix_fmt)->comp[0].step;
+    int ret;
+    RangeCoder c_bak = fs->c;
+    const uint8_t *planes[4] = {p->data[0] + ps*x + y*p->linesize[0],
+                                p->data[1] ? p->data[1] + ps*x + y*p->linesize[1] : NULL,
+                                p->data[2] ? p->data[2] + ps*x + y*p->linesize[2] : NULL,
+                                p->data[3] ? p->data[3] + ps*x + y*p->linesize[3] : NULL};
+
+    fs->slice_coding_mode = 0;
+    if (f->version > 3) {
+        choose_rct_params(fs, planes, p->linesize, width, height);
+    } else {
+        fs->slice_rct_by_coef = 1;
+        fs->slice_rct_ry_coef = 1;
+    }
 
+retry:
     if (f->key_frame)
-        ffv1_clear_slice_state(f, fs);
+        ff_ffv1_clear_slice_state(f, fs);
     if (f->version > 2) {
         encode_slice_header(f, fs);
     }
     if (fs->ac == AC_GOLOMB_RICE) {
-        if (f->version > 2)
-            put_rac(&fs->c, (uint8_t[]) { 129 }, 0);
-        fs->ac_byte_count = f->version > 2 || (!x && !y) ? ff_rac_terminate( &fs->c) : 0;
-        init_put_bits(&fs->pb, fs->c.bytestream_start + fs->ac_byte_count,
+        fs->ac_byte_count = f->version > 2 || (!x && !y) ? ff_rac_terminate(&fs->c, f->version > 2) : 0;
+        init_put_bits(&fs->pb,
+                      fs->c.bytestream_start + fs->ac_byte_count,
                       fs->c.bytestream_end - fs->c.bytestream_start - fs->ac_byte_count);
     }
 
-    if (f->colorspace == 0) {
+    if (f->colorspace == 0 && c->pix_fmt != AV_PIX_FMT_YA8) {
         const int chroma_width  = AV_CEIL_RSHIFT(width,  f->chroma_h_shift);
         const int chroma_height = AV_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
 
-        encode_plane(fs, p->data[0] + ps * x + y * p->linesize[0],
-                     width, height, p->linesize[0], 0);
+        ret = encode_plane(fs, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
 
         if (f->chroma_planes) {
-            encode_plane(fs, p->data[1] + ps * cx + cy * p->linesize[1],
-                         chroma_width, chroma_height, p->linesize[1], 1);
-            encode_plane(fs, p->data[2] + ps * cx + cy * p->linesize[2],
-                         chroma_width, chroma_height, p->linesize[2], 1);
+            ret |= encode_plane(fs, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
+            ret |= encode_plane(fs, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
         }
         if (fs->transparency)
-            encode_plane(fs, p->data[3] + ps * x + y * p->linesize[3], width,
-                         height, p->linesize[3], 2);
+            ret |= encode_plane(fs, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], 2, 1);
+    } else if (c->pix_fmt == AV_PIX_FMT_YA8) {
+        ret  = encode_plane(fs, p->data[0] +     ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 2);
+        ret |= encode_plane(fs, p->data[0] + 1 + ps*x + y*p->linesize[0], width, height, p->linesize[0], 1, 2);
+    } else if (f->use32bit) {
+        ret = encode_rgb_frame32(fs, planes, width, height, p->linesize);
     } else {
-        const uint8_t *planes[3] = { p->data[0] + ps * x + y * p->linesize[0],
-                                     p->data[1] + ps * x + y * p->linesize[1],
-                                     p->data[2] + ps * x + y * p->linesize[2] };
-        encode_rgb_frame(fs, planes, width, height, p->linesize);
+        ret = encode_rgb_frame(fs, planes, width, height, p->linesize);
     }
     emms_c();
 
+    if (ret < 0) {
+        av_assert0(fs->slice_coding_mode == 0);
+        if (fs->version < 4 || !fs->ac) {
+            av_log(c, AV_LOG_ERROR, "Buffer too small\n");
+            return ret;
+        }
+        av_log(c, AV_LOG_DEBUG, "Coding slice as PCM\n");
+        fs->slice_coding_mode = 1;
+        fs->c = c_bak;
+        goto retry;
+    }
+
     return 0;
 }
 
-static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pict, int *got_packet)
 {
     FFV1Context *f      = avctx->priv_data;
     RangeCoder *const c = &f->slice_context[0]->c;
-    int used_count      = 0;
+    AVFrame *const p    = f->picture.f;
     uint8_t keystate    = 128;
     uint8_t *buf_p;
     int i, ret;
+    int64_t maxsize =   AV_INPUT_BUFFER_MIN_SIZE
+                      + avctx->width*avctx->height*37LL*4;
+
+    if(!pict) {
+        if (avctx->flags & AV_CODEC_FLAG_PASS1) {
+            int j, k, m;
+            char *p   = avctx->stats_out;
+            char *end = p + STATS_OUT_SIZE;
+
+            memset(f->rc_stat, 0, sizeof(f->rc_stat));
+            for (i = 0; i < f->quant_table_count; i++)
+                memset(f->rc_stat2[i], 0, f->context_count[i] * sizeof(*f->rc_stat2[i]));
+
+            av_assert0(f->slice_count == f->max_slice_count);
+            for (j = 0; j < f->slice_count; j++) {
+                FFV1Context *fs = f->slice_context[j];
+                for (i = 0; i < 256; i++) {
+                    f->rc_stat[i][0] += fs->rc_stat[i][0];
+                    f->rc_stat[i][1] += fs->rc_stat[i][1];
+                }
+                for (i = 0; i < f->quant_table_count; i++) {
+                    for (k = 0; k < f->context_count[i]; k++)
+                        for (m = 0; m < 32; m++) {
+                            f->rc_stat2[i][k][m][0] += fs->rc_stat2[i][k][m][0];
+                            f->rc_stat2[i][k][m][1] += fs->rc_stat2[i][k][m][1];
+                        }
+                }
+            }
 
-    f->frame = pict;
+            for (j = 0; j < 256; j++) {
+                snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
+                        f->rc_stat[j][0], f->rc_stat[j][1]);
+                p += strlen(p);
+            }
+            snprintf(p, end - p, "\n");
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height *
-                             ((8 * 2 + 1 + 1) * 4) / 8 +
-                             AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+            for (i = 0; i < f->quant_table_count; i++) {
+                for (j = 0; j < f->context_count[i]; j++)
+                    for (m = 0; m < 32; m++) {
+                        snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
+                                f->rc_stat2[i][j][m][0], f->rc_stat2[i][j][m][1]);
+                        p += strlen(p);
+                    }
+            }
+            snprintf(p, end - p, "%d\n", f->gob_count);
+        }
+        return 0;
     }
 
+    if (f->version > 3)
+        maxsize = AV_INPUT_BUFFER_MIN_SIZE + avctx->width*avctx->height*3LL*4;
+
+    if (maxsize > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - 32) {
+        av_log(avctx, AV_LOG_WARNING, "Cannot allocate worst case packet size, the encoding could fail\n");
+        maxsize = INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - 32;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, maxsize, 0)) < 0)
+        return ret;
+
     ff_init_range_encoder(c, pkt->data, pkt->size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
+    av_frame_unref(p);
+    if ((ret = av_frame_ref(p, pict)) < 0)
+        return ret;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
     if (avctx->gop_size == 0 || f->picture_number % avctx->gop_size == 0) {
         put_rac(c, &keystate, 1);
         f->key_frame = 1;
@@ -962,12 +1209,17 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
-    for (i = 1; i < f->slice_count; i++) {
+    for (i = 0; i < f->slice_count; i++) {
         FFV1Context *fs = f->slice_context[i];
-        uint8_t *start  = pkt->data +
-                          (pkt->size - used_count) * (int64_t)i / f->slice_count;
-        int len = pkt->size / f->slice_count;
-        ff_init_range_encoder(&fs->c, start, len);
+        uint8_t *start  = pkt->data + pkt->size * (int64_t)i / f->slice_count;
+        int len         = pkt->size / f->slice_count;
+        if (i) {
+            ff_init_range_encoder(&fs->c, start, len);
+        } else {
+            av_assert0(fs->c.bytestream_end >= fs->c.bytestream_start + len);
+            av_assert0(fs->c.bytestream < fs->c.bytestream_start + len);
+            fs->c.bytestream_end = fs->c.bytestream_start + len;
+        }
     }
     avctx->execute(avctx, encode_slice, &f->slice_context[0], NULL,
                    f->slice_count, sizeof(void *));
@@ -978,9 +1230,7 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         int bytes;
 
         if (fs->ac != AC_GOLOMB_RICE) {
-            uint8_t state = 129;
-            put_rac(&fs->c, &state, 0);
-            bytes = ff_rac_terminate(&fs->c);
+            bytes = ff_rac_terminate(&fs->c, 1);
         } else {
             flush_put_bits(&fs->pb); // FIXME: nicer padding
             bytes = fs->ac_byte_count + (put_bits_count(&fs->pb) + 7) / 8;
@@ -1002,47 +1252,7 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         buf_p += bytes;
     }
 
-    if ((avctx->flags & AV_CODEC_FLAG_PASS1) && (f->picture_number & 31) == 0) {
-        int j, k, m;
-        char *p   = avctx->stats_out;
-        char *end = p + STATS_OUT_SIZE;
-
-        memset(f->rc_stat, 0, sizeof(f->rc_stat));
-        for (i = 0; i < f->quant_table_count; i++)
-            memset(f->rc_stat2[i], 0, f->context_count[i] * sizeof(*f->rc_stat2[i]));
-
-        for (j = 0; j < f->slice_count; j++) {
-            FFV1Context *fs = f->slice_context[j];
-            for (i = 0; i < 256; i++) {
-                f->rc_stat[i][0] += fs->rc_stat[i][0];
-                f->rc_stat[i][1] += fs->rc_stat[i][1];
-            }
-            for (i = 0; i < f->quant_table_count; i++) {
-                for (k = 0; k < f->context_count[i]; k++)
-                    for (m = 0; m < 32; m++) {
-                        f->rc_stat2[i][k][m][0] += fs->rc_stat2[i][k][m][0];
-                        f->rc_stat2[i][k][m][1] += fs->rc_stat2[i][k][m][1];
-                    }
-            }
-        }
-
-        for (j = 0; j < 256; j++) {
-            snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
-                     f->rc_stat[j][0], f->rc_stat[j][1]);
-            p += strlen(p);
-        }
-        snprintf(p, end - p, "\n");
-
-        for (i = 0; i < f->quant_table_count; i++) {
-            for (j = 0; j < f->context_count[i]; j++)
-                for (m = 0; m < 32; m++) {
-                    snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
-                             f->rc_stat2[i][j][m][0], f->rc_stat2[i][j][m][1]);
-                    p += strlen(p);
-                }
-        }
-        snprintf(p, end - p, "%d\n", f->gob_count);
-    } else if (avctx->flags & AV_CODEC_FLAG_PASS1)
+    if (avctx->flags & AV_CODEC_FLAG_PASS1)
         avctx->stats_out[0] = '\0';
 
 #if FF_API_CODED_FRAME
@@ -1053,38 +1263,41 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     f->picture_number++;
     pkt->size   = buf_p - pkt->data;
+    pkt->pts    =
+    pkt->dts    = pict->pts;
     pkt->flags |= AV_PKT_FLAG_KEY * f->key_frame;
     *got_packet = 1;
 
     return 0;
 }
 
-static av_cold int ffv1_encode_close(AVCodecContext *avctx)
+static av_cold int encode_close(AVCodecContext *avctx)
 {
-    ffv1_close(avctx);
+    ff_ffv1_close(avctx);
     return 0;
 }
 
 #define OFFSET(x) offsetof(FFV1Context, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "slicecrc", "Protect slices with CRCs", OFFSET(ec), AV_OPT_TYPE_INT,
-             { .i64 = -1 }, -1, 1, VE },
+    { "slicecrc", "Protect slices with CRCs", OFFSET(ec), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
     { "coder", "Coder type", OFFSET(ac), AV_OPT_TYPE_INT,
-            { .i64 = AC_GOLOMB_RICE }, 0, 2, VE, "coder" },
+            { .i64 = 0 }, -2, 2, VE, "coder" },
         { "rice", "Golomb rice", 0, AV_OPT_TYPE_CONST,
             { .i64 = AC_GOLOMB_RICE }, INT_MIN, INT_MAX, VE, "coder" },
         { "range_def", "Range with default table", 0, AV_OPT_TYPE_CONST,
-            { .i64 = AC_RANGE_DEFAULT_TAB }, INT_MIN, INT_MAX, VE, "coder" },
+            { .i64 = AC_RANGE_DEFAULT_TAB_FORCE }, INT_MIN, INT_MAX, VE, "coder" },
         { "range_tab", "Range with custom table", 0, AV_OPT_TYPE_CONST,
             { .i64 = AC_RANGE_CUSTOM_TAB }, INT_MIN, INT_MAX, VE, "coder" },
+        { "ac", "Range with custom table (the ac option exists for compatibility and is deprecated)", 0, AV_OPT_TYPE_CONST,
+            { .i64 = 1 }, INT_MIN, INT_MAX, VE, "coder" },
     { "context", "Context model", OFFSET(context_model), AV_OPT_TYPE_INT,
             { .i64 = 0 }, 0, 1, VE },
 
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass ffv1_class = {
     .class_name = "ffv1 encoder",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -1104,25 +1317,35 @@ AVCodec ff_ffv1_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
-    .init           = ffv1_encode_init,
-    .encode2        = ffv1_encode_frame,
-    .close          = ffv1_encode_close,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_close,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV411P,   AV_PIX_FMT_YUV410P,
-        AV_PIX_FMT_YUV444P9,  AV_PIX_FMT_YUV422P9,  AV_PIX_FMT_YUV420P9,
-        AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
-        AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
-        AV_PIX_FMT_RGB32,
-        AV_PIX_FMT_GBRP9,     AV_PIX_FMT_GBRP10,
-        AV_PIX_FMT_YUVA420P,  AV_PIX_FMT_YUVA422P,  AV_PIX_FMT_YUVA444P,
-        AV_PIX_FMT_GRAY16,    AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUVA420P,  AV_PIX_FMT_YUVA422P,  AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUVA444P,  AV_PIX_FMT_YUV440P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV410P,   AV_PIX_FMT_0RGB32,    AV_PIX_FMT_RGB32,     AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, AV_PIX_FMT_YUV444P9,  AV_PIX_FMT_YUV422P9,
+        AV_PIX_FMT_YUV420P9,  AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
+        AV_PIX_FMT_YUVA444P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA420P16,
+        AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA420P10,
+        AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA420P9,
+        AV_PIX_FMT_GRAY16,    AV_PIX_FMT_GRAY8,     AV_PIX_FMT_GBRP9,     AV_PIX_FMT_GBRP10,
+        AV_PIX_FMT_GBRP12,    AV_PIX_FMT_GBRP14,
+        AV_PIX_FMT_GBRAP10, AV_PIX_FMT_GBRAP12,
+        AV_PIX_FMT_YA8,
+        AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12,
+        AV_PIX_FMT_GBRP16, AV_PIX_FMT_RGB48,
+        AV_PIX_FMT_GBRAP16, AV_PIX_FMT_RGBA64,
+        AV_PIX_FMT_GRAY9,
+        AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
+        AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV440P12,
         AV_PIX_FMT_NONE
 
     },
 #if FF_API_CODER_TYPE
     .defaults       = ffv1_defaults,
 #endif
-    .priv_class     = &class,
+    .priv_class     = &ffv1_class,
 };
diff --git a/libavcodec/ffv1enc_template.c b/libavcodec/ffv1enc_template.c
new file mode 100644
index 0000000..bc0add5
--- /dev/null
+++ b/libavcodec/ffv1enc_template.c
@@ -0,0 +1,202 @@
+/*
+ * FFV1 encoder template
+ *
+ * Copyright (c) 2003-2016 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static av_always_inline int RENAME(encode_line)(FFV1Context *s, int w,
+                                                TYPE *sample[3],
+                                                int plane_index, int bits)
+{
+    PlaneContext *const p = &s->plane[plane_index];
+    RangeCoder *const c   = &s->c;
+    int x;
+    int run_index = s->run_index;
+    int run_count = 0;
+    int run_mode  = 0;
+
+    if (s->ac != AC_GOLOMB_RICE) {
+        if (c->bytestream_end - c->bytestream < w * 35) {
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) < w * 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (s->slice_coding_mode == 1) {
+        for (x = 0; x < w; x++) {
+            int i;
+            int v = sample[0][x];
+            for (i = bits-1; i>=0; i--) {
+                uint8_t state = 128;
+                put_rac(c, &state, (v>>i) & 1);
+            }
+        }
+        return 0;
+    }
+
+    for (x = 0; x < w; x++) {
+        int diff, context;
+
+        context = RENAME(get_context)(p, sample[0] + x, sample[1] + x, sample[2] + x);
+        diff    = sample[0][x] - RENAME(predict)(sample[0] + x, sample[1] + x);
+
+        if (context < 0) {
+            context = -context;
+            diff    = -diff;
+        }
+
+        diff = fold(diff, bits);
+
+        if (s->ac != AC_GOLOMB_RICE) {
+            if (s->flags & AV_CODEC_FLAG_PASS1) {
+                put_symbol_inline(c, p->state[context], diff, 1, s->rc_stat,
+                                  s->rc_stat2[p->quant_table_index][context]);
+            } else {
+                put_symbol_inline(c, p->state[context], diff, 1, NULL, NULL);
+            }
+        } else {
+            if (context == 0)
+                run_mode = 1;
+
+            if (run_mode) {
+                if (diff) {
+                    while (run_count >= 1 << ff_log2_run[run_index]) {
+                        run_count -= 1 << ff_log2_run[run_index];
+                        run_index++;
+                        put_bits(&s->pb, 1, 1);
+                    }
+
+                    put_bits(&s->pb, 1 + ff_log2_run[run_index], run_count);
+                    if (run_index)
+                        run_index--;
+                    run_count = 0;
+                    run_mode  = 0;
+                    if (diff > 0)
+                        diff--;
+                } else {
+                    run_count++;
+                }
+            }
+
+            ff_dlog(s->avctx, "count:%d index:%d, mode:%d, x:%d pos:%d\n",
+                    run_count, run_index, run_mode, x,
+                    (int)put_bits_count(&s->pb));
+
+            if (run_mode == 0)
+                put_vlc_symbol(&s->pb, &p->vlc_state[context], diff, bits);
+        }
+    }
+    if (run_mode) {
+        while (run_count >= 1 << ff_log2_run[run_index]) {
+            run_count -= 1 << ff_log2_run[run_index];
+            run_index++;
+            put_bits(&s->pb, 1, 1);
+        }
+
+        if (run_count)
+            put_bits(&s->pb, 1, 1);
+    }
+    s->run_index = run_index;
+
+    return 0;
+}
+
+static int RENAME(encode_rgb_frame)(FFV1Context *s, const uint8_t *src[4],
+                                    int w, int h, const int stride[4])
+{
+    int x, y, p, i;
+    const int ring_size = s->context_model ? 3 : 2;
+    TYPE *sample[4][3];
+    int lbd    = s->bits_per_raw_sample <= 8;
+    int packed = !src[1];
+    int bits   = s->bits_per_raw_sample > 0 ? s->bits_per_raw_sample : 8;
+    int offset = 1 << bits;
+    int transparency = s->transparency;
+    int packed_size = (3 + transparency)*2;
+
+    s->run_index = 0;
+
+    memset(RENAME(s->sample_buffer), 0, ring_size * MAX_PLANES *
+           (w + 6) * sizeof(*RENAME(s->sample_buffer)));
+
+    for (y = 0; y < h; y++) {
+        for (i = 0; i < ring_size; i++)
+            for (p = 0; p < MAX_PLANES; p++)
+                sample[p][i]= RENAME(s->sample_buffer) + p*ring_size*(w+6) + ((h+i-y)%ring_size)*(w+6) + 3;
+
+        for (x = 0; x < w; x++) {
+            int b, g, r, av_uninit(a);
+            if (lbd) {
+                unsigned v = *((const uint32_t*)(src[0] + x*4 + stride[0]*y));
+                b =  v        & 0xFF;
+                g = (v >>  8) & 0xFF;
+                r = (v >> 16) & 0xFF;
+                a =  v >> 24;
+            } else if (packed) {
+                const uint16_t *p = ((const uint16_t*)(src[0] + x*packed_size + stride[0]*y));
+                r = p[0];
+                g = p[1];
+                b = p[2];
+                if (transparency)
+                  a = p[3];
+            } else if (sizeof(TYPE) == 4 || transparency) {
+                g = *((const uint16_t *)(src[0] + x*2 + stride[0]*y));
+                b = *((const uint16_t *)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t *)(src[2] + x*2 + stride[2]*y));
+                if (transparency)
+                    a = *((const uint16_t *)(src[3] + x*2 + stride[3]*y));
+            } else {
+                b = *((const uint16_t *)(src[0] + x*2 + stride[0]*y));
+                g = *((const uint16_t *)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t *)(src[2] + x*2 + stride[2]*y));
+            }
+
+            if (s->slice_coding_mode != 1) {
+                b -= g;
+                r -= g;
+                g += (b * s->slice_rct_by_coef + r * s->slice_rct_ry_coef) >> 2;
+                b += offset;
+                r += offset;
+            }
+
+            sample[0][0][x] = g;
+            sample[1][0][x] = b;
+            sample[2][0][x] = r;
+            sample[3][0][x] = a;
+        }
+        for (p = 0; p < 3 + transparency; p++) {
+            int ret;
+            sample[p][0][-1] = sample[p][1][0  ];
+            sample[p][1][ w] = sample[p][1][w-1];
+            if (lbd && s->slice_coding_mode == 0)
+                ret = RENAME(encode_line)(s, w, sample[p], (p + 1) / 2, 9);
+            else
+                ret = RENAME(encode_line)(s, w, sample[p], (p + 1) / 2, bits + (s->slice_coding_mode != 1));
+            if (ret < 0)
+                return ret;
+        }
+    }
+    return 0;
+}
+
diff --git a/libavcodec/ffwavesynth.c b/libavcodec/ffwavesynth.c
new file mode 100644
index 0000000..9d055e4
--- /dev/null
+++ b/libavcodec/ffwavesynth.c
@@ -0,0 +1,481 @@
+/*
+ * Wavesynth pseudo-codec
+ * Copyright (c) 2011 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+#include "avcodec.h"
+#include "internal.h"
+
+
+#define SIN_BITS 14
+#define WS_MAX_CHANNELS 32
+#define INF_TS 0x7FFFFFFFFFFFFFFF
+
+#define PINK_UNIT 128
+
+/*
+   Format of the extradata and packets
+
+   THIS INFORMATION IS NOT PART OF THE PUBLIC API OR ABI.
+   IT CAN CHANGE WITHOUT NOTIFICATION.
+
+   All numbers are in little endian.
+
+   The codec extradata define a set of intervals with uniform content.
+   Overlapping intervals are added together.
+
+   extradata:
+       uint32      number of intervals
+       ...         intervals
+
+   interval:
+       int64       start timestamp; time_base must be 1/sample_rate;
+                   start timestamps must be in ascending order
+       int64       end timestamp
+       uint32      type
+       uint32      channels mask
+       ...         additional information, depends on type
+
+   sine interval (type fourcc "SINE"):
+       int32       start frequency, in 1/(1<<16) Hz
+       int32       end frequency
+       int32       start amplitude, 1<<16 is the full amplitude
+       int32       end amplitude
+       uint32      start phase, 0 is sin(0), 0x20000000 is sin(pi/2), etc.;
+                   n | (1<<31) means to match the phase of previous channel #n
+
+   pink noise interval (type fourcc "NOIS"):
+       int32       start amplitude
+       int32       end amplitude
+
+   The input packets encode the time and duration of the requested segment.
+
+   packet:
+       int64       start timestamp
+       int32       duration
+
+*/
+
+enum ws_interval_type {
+    WS_SINE  = MKTAG('S','I','N','E'),
+    WS_NOISE = MKTAG('N','O','I','S'),
+};
+
+struct ws_interval {
+    int64_t ts_start, ts_end;
+    uint64_t phi0, dphi0, ddphi;
+    uint64_t amp0, damp;
+    uint64_t phi, dphi, amp;
+    uint32_t channels;
+    enum ws_interval_type type;
+    int next;
+};
+
+struct wavesynth_context {
+    int64_t cur_ts;
+    int64_t next_ts;
+    int32_t *sin;
+    struct ws_interval *inter;
+    uint32_t dither_state;
+    uint32_t pink_state;
+    int32_t pink_pool[PINK_UNIT];
+    unsigned pink_need, pink_pos;
+    int nb_inter;
+    int cur_inter;
+    int next_inter;
+};
+
+#define LCG_A 1284865837
+#define LCG_C 4150755663
+#define LCG_AI 849225893 /* A*AI = 1 [mod 1<<32] */
+
+static uint32_t lcg_next(uint32_t *s)
+{
+    *s = *s * LCG_A + LCG_C;
+    return *s;
+}
+
+static void lcg_seek(uint32_t *s, int64_t dt)
+{
+    uint32_t a, c, t = *s;
+
+    if (dt >= 0) {
+        a = LCG_A;
+        c = LCG_C;
+    } else { /* coefficients for a step backward */
+        a = LCG_AI;
+        c = (uint32_t)(LCG_AI * LCG_C);
+        dt = -dt;
+    }
+    while (dt) {
+        if (dt & 1)
+            t = a * t + c;
+        c *= a + 1; /* coefficients for a double step */
+        a *= a;
+        dt >>= 1;
+    }
+    *s = t;
+}
+
+/* Emulate pink noise by summing white noise at the sampling frequency,
+ * white noise at half the sampling frequency (each value taken twice),
+ * etc., with a total of 8 octaves.
+ * This is known as the Voss-McCartney algorithm. */
+
+static void pink_fill(struct wavesynth_context *ws)
+{
+    int32_t vt[7] = { 0 }, v = 0;
+    int i, j;
+
+    ws->pink_pos = 0;
+    if (!ws->pink_need)
+        return;
+    for (i = 0; i < PINK_UNIT; i++) {
+        for (j = 0; j < 7; j++) {
+            if ((i >> j) & 1)
+                break;
+            v -= vt[j];
+            vt[j] = (int32_t)lcg_next(&ws->pink_state) >> 3;
+            v += vt[j];
+        }
+        ws->pink_pool[i] = v + ((int32_t)lcg_next(&ws->pink_state) >> 3);
+    }
+    lcg_next(&ws->pink_state); /* so we use exactly 256 steps */
+}
+
+/**
+ * @return  (1<<64) * a / b, without overflow, if a < b
+ */
+static uint64_t frac64(uint64_t a, uint64_t b)
+{
+    uint64_t r = 0;
+    int i;
+
+    if (b < (uint64_t)1 << 32) { /* b small, use two 32-bits steps */
+        a <<= 32;
+        return ((a / b) << 32) | ((a % b) << 32) / b;
+    }
+    if (b < (uint64_t)1 << 48) { /* b medium, use four 16-bits steps */
+        for (i = 0; i < 4; i++) {
+            a <<= 16;
+            r = (r << 16) | (a / b);
+            a %= b;
+        }
+        return r;
+    }
+    for (i = 63; i >= 0; i--) {
+        if (a >= (uint64_t)1 << 63 || a << 1 >= b) {
+            r |= (uint64_t)1 << i;
+            a = (a << 1) - b;
+        } else {
+            a <<= 1;
+        }
+    }
+    return r;
+}
+
+static uint64_t phi_at(struct ws_interval *in, int64_t ts)
+{
+    uint64_t dt = ts - in->ts_start;
+    uint64_t dt2 = dt & 1 ? /* dt * (dt - 1) / 2 without overflow */
+                   dt * ((dt - 1) >> 1) : (dt >> 1) * (dt - 1);
+    return in->phi0 + dt * in->dphi0 + dt2 * in->ddphi;
+}
+
+static void wavesynth_seek(struct wavesynth_context *ws, int64_t ts)
+{
+    int *last, i;
+    struct ws_interval *in;
+
+    last = &ws->cur_inter;
+    for (i = 0; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (ts < in->ts_start)
+            break;
+        if (ts >= in->ts_end)
+            continue;
+        *last = i;
+        last = &in->next;
+        in->phi  = phi_at(in, ts);
+        in->dphi = in->dphi0 + (ts - in->ts_start) * in->ddphi;
+        in->amp  = in->amp0  + (ts - in->ts_start) * in->damp;
+    }
+    ws->next_inter = i;
+    ws->next_ts = i < ws->nb_inter ? ws->inter[i].ts_start : INF_TS;
+    *last = -1;
+    lcg_seek(&ws->dither_state, ts - ws->cur_ts);
+    if (ws->pink_need) {
+        int64_t pink_ts_cur  = (ws->cur_ts + PINK_UNIT - 1) & ~(PINK_UNIT - 1);
+        int64_t pink_ts_next = ts & ~(PINK_UNIT - 1);
+        int pos = ts & (PINK_UNIT - 1);
+        lcg_seek(&ws->pink_state, (pink_ts_next - pink_ts_cur) << 1);
+        if (pos) {
+            pink_fill(ws);
+            ws->pink_pos = pos;
+        } else {
+            ws->pink_pos = PINK_UNIT;
+        }
+    }
+    ws->cur_ts = ts;
+}
+
+static int wavesynth_parse_extradata(AVCodecContext *avc)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    struct ws_interval *in;
+    uint8_t *edata, *edata_end;
+    int32_t f1, f2, a1, a2;
+    uint32_t phi;
+    int64_t dphi1, dphi2, dt, cur_ts = -0x8000000000000000;
+    int i;
+
+    if (avc->extradata_size < 4)
+        return AVERROR(EINVAL);
+    edata = avc->extradata;
+    edata_end = edata + avc->extradata_size;
+    ws->nb_inter = AV_RL32(edata);
+    edata += 4;
+    if (ws->nb_inter < 0)
+        return AVERROR(EINVAL);
+    ws->inter = av_calloc(ws->nb_inter, sizeof(*ws->inter));
+    if (!ws->inter)
+        return AVERROR(ENOMEM);
+    for (i = 0; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (edata_end - edata < 24)
+            return AVERROR(EINVAL);
+        in->ts_start = AV_RL64(edata +  0);
+        in->ts_end   = AV_RL64(edata +  8);
+        in->type     = AV_RL32(edata + 16);
+        in->channels = AV_RL32(edata + 20);
+        edata += 24;
+        if (in->ts_start < cur_ts || in->ts_end <= in->ts_start)
+            return AVERROR(EINVAL);
+        cur_ts = in->ts_start;
+        dt = in->ts_end - in->ts_start;
+        switch (in->type) {
+            case WS_SINE:
+                if (edata_end - edata < 20)
+                    return AVERROR(EINVAL);
+                f1  = AV_RL32(edata +  0);
+                f2  = AV_RL32(edata +  4);
+                a1  = AV_RL32(edata +  8);
+                a2  = AV_RL32(edata + 12);
+                phi = AV_RL32(edata + 16);
+                edata += 20;
+                dphi1 = frac64(f1, (int64_t)avc->sample_rate << 16);
+                dphi2 = frac64(f2, (int64_t)avc->sample_rate << 16);
+                in->dphi0 = dphi1;
+                in->ddphi = (dphi2 - dphi1) / dt;
+                if (phi & 0x80000000) {
+                    phi &= ~0x80000000;
+                    if (phi >= i)
+                        return AVERROR(EINVAL);
+                    in->phi0 = phi_at(&ws->inter[phi], in->ts_start);
+                } else {
+                    in->phi0 = (uint64_t)phi << 33;
+                }
+                break;
+            case WS_NOISE:
+                if (edata_end - edata < 8)
+                    return AVERROR(EINVAL);
+                a1  = AV_RL32(edata +  0);
+                a2  = AV_RL32(edata +  4);
+                edata += 8;
+                break;
+            default:
+                return AVERROR(EINVAL);
+        }
+        in->amp0 = (int64_t)a1 << 32;
+        in->damp = (((int64_t)a2 << 32) - ((int64_t)a1 << 32)) / dt;
+    }
+    if (edata != edata_end)
+        return AVERROR(EINVAL);
+    return 0;
+}
+
+static av_cold int wavesynth_init(AVCodecContext *avc)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    int i, r;
+
+    if (avc->channels > WS_MAX_CHANNELS) {
+        av_log(avc, AV_LOG_ERROR,
+               "This implementation is limited to %d channels.\n",
+               WS_MAX_CHANNELS);
+        return AVERROR(EINVAL);
+    }
+    r = wavesynth_parse_extradata(avc);
+    if (r < 0) {
+        av_log(avc, AV_LOG_ERROR, "Invalid intervals definitions.\n");
+        goto fail;
+    }
+    ws->sin = av_malloc(sizeof(*ws->sin) << SIN_BITS);
+    if (!ws->sin) {
+        r = AVERROR(ENOMEM);
+        goto fail;
+    }
+    for (i = 0; i < 1 << SIN_BITS; i++)
+        ws->sin[i] = floor(32767 * sin(2 * M_PI * i / (1 << SIN_BITS)));
+    ws->dither_state = MKTAG('D','I','T','H');
+    for (i = 0; i < ws->nb_inter; i++)
+        ws->pink_need += ws->inter[i].type == WS_NOISE;
+    ws->pink_state = MKTAG('P','I','N','K');
+    ws->pink_pos = PINK_UNIT;
+    wavesynth_seek(ws, 0);
+    avc->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+
+fail:
+    av_freep(&ws->inter);
+    av_freep(&ws->sin);
+    return r;
+}
+
+static void wavesynth_synth_sample(struct wavesynth_context *ws, int64_t ts,
+                                   int32_t *channels)
+{
+    int32_t amp, val, *cv;
+    struct ws_interval *in;
+    int i, *last, pink;
+    uint32_t c, all_ch = 0;
+
+    i = ws->cur_inter;
+    last = &ws->cur_inter;
+    if (ws->pink_pos == PINK_UNIT)
+        pink_fill(ws);
+    pink = ws->pink_pool[ws->pink_pos++] >> 16;
+    while (i >= 0) {
+        in = &ws->inter[i];
+        i = in->next;
+        if (ts >= in->ts_end) {
+            *last = i;
+            continue;
+        }
+        last = &in->next;
+        amp = in->amp >> 32;
+        in->amp  += in->damp;
+        switch (in->type) {
+            case WS_SINE:
+                val = amp * ws->sin[in->phi >> (64 - SIN_BITS)];
+                in->phi  += in->dphi;
+                in->dphi += in->ddphi;
+                break;
+            case WS_NOISE:
+                val = amp * pink;
+                break;
+            default:
+                val = 0;
+        }
+        all_ch |= in->channels;
+        for (c = in->channels, cv = channels; c; c >>= 1, cv++)
+            if (c & 1)
+                *cv += val;
+    }
+    val = (int32_t)lcg_next(&ws->dither_state) >> 16;
+    for (c = all_ch, cv = channels; c; c >>= 1, cv++)
+        if (c & 1)
+            *cv += val;
+}
+
+static void wavesynth_enter_intervals(struct wavesynth_context *ws, int64_t ts)
+{
+    int *last, i;
+    struct ws_interval *in;
+
+    last = &ws->cur_inter;
+    for (i = ws->cur_inter; i >= 0; i = ws->inter[i].next)
+        last = &ws->inter[i].next;
+    for (i = ws->next_inter; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (ts < in->ts_start)
+            break;
+        if (ts >= in->ts_end)
+            continue;
+        *last = i;
+        last = &in->next;
+        in->phi = in->phi0;
+        in->dphi = in->dphi0;
+        in->amp = in->amp0;
+    }
+    ws->next_inter = i;
+    ws->next_ts = i < ws->nb_inter ? ws->inter[i].ts_start : INF_TS;
+    *last = -1;
+}
+
+static int wavesynth_decode(AVCodecContext *avc, void *rframe, int *rgot_frame,
+                            AVPacket *packet)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    AVFrame *frame = rframe;
+    int64_t ts;
+    int duration;
+    int s, c, r;
+    int16_t *pcm;
+    int32_t channels[WS_MAX_CHANNELS];
+
+    *rgot_frame = 0;
+    if (packet->size != 12)
+        return AVERROR_INVALIDDATA;
+    ts = AV_RL64(packet->data);
+    if (ts != ws->cur_ts)
+        wavesynth_seek(ws, ts);
+    duration = AV_RL32(packet->data + 8);
+    if (duration <= 0)
+        return AVERROR(EINVAL);
+    frame->nb_samples = duration;
+    r = ff_get_buffer(avc, frame, 0);
+    if (r < 0)
+        return r;
+    pcm = (int16_t *)frame->data[0];
+    for (s = 0; s < duration; s++, ts++) {
+        memset(channels, 0, avc->channels * sizeof(*channels));
+        if (ts >= ws->next_ts)
+            wavesynth_enter_intervals(ws, ts);
+        wavesynth_synth_sample(ws, ts, channels);
+        for (c = 0; c < avc->channels; c++)
+            *(pcm++) = channels[c] >> 16;
+    }
+    ws->cur_ts += duration;
+    *rgot_frame = 1;
+    return packet->size;
+}
+
+static av_cold int wavesynth_close(AVCodecContext *avc)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+
+    av_freep(&ws->sin);
+    av_freep(&ws->inter);
+    return 0;
+}
+
+AVCodec ff_ffwavesynth_decoder = {
+    .name           = "wavesynth",
+    .long_name      = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_FFWAVESYNTH,
+    .priv_data_size = sizeof(struct wavesynth_context),
+    .init           = wavesynth_init,
+    .close          = wavesynth_close,
+    .decode         = wavesynth_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/fic.c b/libavcodec/fic.c
index a038af6..65d102b 100644
--- a/libavcodec/fic.c
+++ b/libavcodec/fic.c
@@ -4,29 +4,29 @@
  * Copyright (c) 2014 Konstantin Shishkov
  * Copyright (c) 2014 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
-
+#include "libavutil/opt.h"
 #include "avcodec.h"
-#include "bitstream.h"
-#include "golomb.h"
 #include "internal.h"
+#include "get_bits.h"
+#include "golomb.h"
 
 typedef struct FICThreadContext {
     DECLARE_ALIGNED(16, int16_t, block)[64];
@@ -34,9 +34,11 @@ typedef struct FICThreadContext {
     int slice_h;
     int src_size;
     int y_off;
+    int p_frame;
 } FICThreadContext;
 
 typedef struct FICContext {
+    AVClass *class;
     AVCodecContext *avctx;
     AVFrame *frame;
     AVFrame *final_frame;
@@ -52,6 +54,7 @@ typedef struct FICContext {
     int num_slices, slice_h;
 
     uint8_t cursor_buf[4096];
+    int skip_cursor;
 } FICContext;
 
 static const uint8_t fic_qmat_hq[64] = {
@@ -79,29 +82,30 @@ static const uint8_t fic_qmat_lq[64] = {
 static const uint8_t fic_header[7] = { 0, 0, 1, 'F', 'I', 'C', 'V' };
 
 #define FIC_HEADER_SIZE 27
+#define CURSOR_OFFSET 59
 
-static av_always_inline void fic_idct(int16_t *blk, int step, int shift)
+static av_always_inline void fic_idct(int16_t *blk, int step, int shift, int rnd)
 {
-    const int t0 =  27246 * blk[3 * step] + 18405 * blk[5 * step];
-    const int t1 =  27246 * blk[5 * step] - 18405 * blk[3 * step];
-    const int t2 =   6393 * blk[7 * step] + 32139 * blk[1 * step];
-    const int t3 =   6393 * blk[1 * step] - 32139 * blk[7 * step];
-    const int t4 = 5793 * (t2 + t0 + 0x800 >> 12);
-    const int t5 = 5793 * (t3 + t1 + 0x800 >> 12);
-    const int t6 = t2 - t0;
-    const int t7 = t3 - t1;
-    const int t8 =  17734 * blk[2 * step] - 42813 * blk[6 * step];
-    const int t9 =  17734 * blk[6 * step] + 42814 * blk[2 * step];
-    const int tA = (blk[0 * step] - blk[4 * step] << 15) + (1 << shift - 1);
-    const int tB = (blk[0 * step] + blk[4 * step] << 15) + (1 << shift - 1);
-    blk[0 * step] = (  t4       + t9 + tB) >> shift;
-    blk[1 * step] = (  t6 + t7  + t8 + tA) >> shift;
-    blk[2 * step] = (  t6 - t7  - t8 + tA) >> shift;
-    blk[3 * step] = (  t5       - t9 + tB) >> shift;
-    blk[4 * step] = ( -t5       - t9 + tB) >> shift;
-    blk[5 * step] = (-(t6 - t7) - t8 + tA) >> shift;
-    blk[6 * step] = (-(t6 + t7) + t8 + tA) >> shift;
-    blk[7 * step] = ( -t4       + t9 + tB) >> shift;
+    const unsigned t0 =  27246 * blk[3 * step] + 18405 * blk[5 * step];
+    const unsigned t1 =  27246 * blk[5 * step] - 18405 * blk[3 * step];
+    const unsigned t2 =   6393 * blk[7 * step] + 32139 * blk[1 * step];
+    const unsigned t3 =   6393 * blk[1 * step] - 32139 * blk[7 * step];
+    const unsigned t4 = 5793U * ((int)(t2 + t0 + 0x800) >> 12);
+    const unsigned t5 = 5793U * ((int)(t3 + t1 + 0x800) >> 12);
+    const unsigned t6 = t2 - t0;
+    const unsigned t7 = t3 - t1;
+    const unsigned t8 =  17734 * blk[2 * step] - 42813 * blk[6 * step];
+    const unsigned t9 =  17734 * blk[6 * step] + 42814 * blk[2 * step];
+    const unsigned tA = (blk[0 * step] - blk[4 * step]) * 32768 + rnd;
+    const unsigned tB = (blk[0 * step] + blk[4 * step]) * 32768 + rnd;
+    blk[0 * step] = (int)(  t4       + t9 + tB) >> shift;
+    blk[1 * step] = (int)(  t6 + t7  + t8 + tA) >> shift;
+    blk[2 * step] = (int)(  t6 - t7  - t8 + tA) >> shift;
+    blk[3 * step] = (int)(  t5       - t9 + tB) >> shift;
+    blk[4 * step] = (int)( -t5       - t9 + tB) >> shift;
+    blk[5 * step] = (int)(-(t6 - t7) - t8 + tA) >> shift;
+    blk[6 * step] = (int)(-(t6 + t7) + t8 + tA) >> shift;
+    blk[7 * step] = (int)( -t4       + t9 + tB) >> shift;
 }
 
 static void fic_idct_put(uint8_t *dst, int stride, int16_t *block)
@@ -110,14 +114,15 @@ static void fic_idct_put(uint8_t *dst, int stride, int16_t *block)
     int16_t *ptr;
 
     ptr = block;
-    for (i = 0; i < 8; i++) {
-        fic_idct(ptr, 8, 13);
+    fic_idct(ptr++, 8, 13, (1 << 12) + (1 << 17));
+    for (i = 1; i < 8; i++) {
+        fic_idct(ptr, 8, 13, 1 << 12);
         ptr++;
     }
 
     ptr = block;
     for (i = 0; i < 8; i++) {
-        fic_idct(ptr, 1, 20);
+        fic_idct(ptr, 1, 20, 0);
         ptr += 8;
     }
 
@@ -129,29 +134,33 @@ static void fic_idct_put(uint8_t *dst, int stride, int16_t *block)
         ptr += 8;
     }
 }
-static int fic_decode_block(FICContext *ctx, BitstreamContext *bc,
-                            uint8_t *dst, int stride, int16_t *block)
+static int fic_decode_block(FICContext *ctx, GetBitContext *gb,
+                            uint8_t *dst, int stride, int16_t *block, int *is_p)
 {
     int i, num_coeff;
 
-    /* Is it a skip block? */
-    if (bitstream_read_bit(bc)) {
-        /* This is a P-frame. */
-        ctx->frame->key_frame = 0;
-        ctx->frame->pict_type = AV_PICTURE_TYPE_P;
+    if (get_bits_left(gb) < 8)
+        return AVERROR_INVALIDDATA;
 
+    /* Is it a skip block? */
+    if (get_bits1(gb)) {
+        *is_p = 1;
         return 0;
     }
 
     memset(block, 0, sizeof(*block) * 64);
 
-    num_coeff = bitstream_read(bc, 7);
+    num_coeff = get_bits(gb, 7);
     if (num_coeff > 64)
         return AVERROR_INVALIDDATA;
 
-    for (i = 0; i < num_coeff; i++)
-        block[ff_zigzag_direct[i]] = get_se_golomb(bc) *
+    for (i = 0; i < num_coeff; i++) {
+        int v = get_se_golomb(gb);
+        if (v < -2048 || v > 2048)
+             return AVERROR_INVALIDDATA;
+        block[ff_zigzag_direct[i]] = v *
                                      ctx->qmat[ff_zigzag_direct[i]];
+    }
 
     fic_idct_put(dst, stride, block);
 
@@ -162,14 +171,16 @@ static int fic_decode_slice(AVCodecContext *avctx, void *tdata)
 {
     FICContext *ctx        = avctx->priv_data;
     FICThreadContext *tctx = tdata;
-    BitstreamContext bc;
+    GetBitContext gb;
     uint8_t *src = tctx->src;
     int slice_h  = tctx->slice_h;
     int src_size = tctx->src_size;
     int y_off    = tctx->y_off;
-    int x, y, p;
+    int x, y, p, ret;
 
-    bitstream_init8(&bc, src, src_size);
+    ret = init_get_bits8(&gb, src, src_size);
+    if (ret < 0)
+        return ret;
 
     for (p = 0; p < 3; p++) {
         int stride   = ctx->frame->linesize[p];
@@ -179,7 +190,8 @@ static int fic_decode_slice(AVCodecContext *avctx, void *tdata)
             for (x = 0; x < (ctx->aligned_width >> !!p); x += 8) {
                 int ret;
 
-                if ((ret = fic_decode_block(ctx, &bc, dst + x, stride, tctx->block)) != 0)
+                if ((ret = fic_decode_block(ctx, &gb, dst + x, stride,
+                                            tctx->block, &tctx->p_frame)) != 0)
                     return ret;
             }
 
@@ -263,13 +275,11 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
     int msize;
     int tsize;
     int cur_x, cur_y;
-    int skip_cursor = 0;
+    int skip_cursor = ctx->skip_cursor;
     uint8_t *sdata;
 
-    if ((ret = ff_reget_buffer(avctx, ctx->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, ctx->frame)) < 0)
         return ret;
-    }
 
     /* Header + at least one slice (4) */
     if (avpkt->size < FIC_HEADER_SIZE + 4) {
@@ -282,8 +292,13 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_WARNING, "Invalid FIC Header.\n");
 
     /* Is it a skip frame? */
-    if (src[17])
+    if (src[17]) {
+        if (!ctx->final_frame) {
+            av_log(avctx, AV_LOG_WARNING, "Initial frame is skipped\n");
+            return AVERROR_INVALIDDATA;
+        }
         goto skip;
+    }
 
     nslices = src[13];
     if (!nslices) {
@@ -303,7 +318,10 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (tsize < 32) {
+    if (!tsize || !AV_RL16(src + 37) || !AV_RL16(src + 39))
+        skip_cursor = 1;
+
+    if (!skip_cursor && tsize < 32) {
         av_log(avctx, AV_LOG_WARNING,
                "Cursor data too small. Skipping cursor.\n");
         skip_cursor = 1;
@@ -312,19 +330,23 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
     /* Cursor position. */
     cur_x = AV_RL16(src + 33);
     cur_y = AV_RL16(src + 35);
-    if (cur_x > avctx->width || cur_y > avctx->height) {
-        av_log(avctx, AV_LOG_WARNING,
-               "Invalid cursor position: (%d,%d). Skipping cusor.\n",
+    if (!skip_cursor && (cur_x > avctx->width || cur_y > avctx->height)) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "Invalid cursor position: (%d,%d). Skipping cursor.\n",
                cur_x, cur_y);
         skip_cursor = 1;
     }
 
-    if (AV_RL16(src + 37) != 32 || AV_RL16(src + 39) != 32) {
+    if (!skip_cursor && (AV_RL16(src + 37) != 32 || AV_RL16(src + 39) != 32)) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid cursor size. Skipping cursor.\n");
         skip_cursor = 1;
     }
 
+    if (!skip_cursor && avpkt->size < CURSOR_OFFSET + sizeof(ctx->cursor_buf)) {
+        skip_cursor = 1;
+    }
+
     /* Slice height for all but the last slice. */
     ctx->slice_h = 16 * (ctx->aligned_height >> 4) / nslices;
     if (ctx->slice_h % 16)
@@ -339,15 +361,6 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    /*
-     * Set the frametype to I initially. It will be set to P if the frame
-     * has any dependencies (skip blocks). There will be a race condition
-     * inside the slice decode function to set these, but we do not care.
-     * since they will only ever be set to 0/P.
-     */
-    ctx->frame->key_frame = 1;
-    ctx->frame->pict_type = AV_PICTURE_TYPE_I;
-
     /* Allocate slice data. */
     av_fast_malloc(&ctx->slice_data, &ctx->slice_data_size,
                    nslices * sizeof(ctx->slice_data[0]));
@@ -372,6 +385,8 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
             slice_h      = FFALIGN(avctx->height - ctx->slice_h * (nslices - 1), 16);
         } else {
             slice_size = AV_RB32(src + tsize + FIC_HEADER_SIZE + slice * 4 + 4);
+            if (slice_size < slice_off)
+                return AVERROR_INVALIDDATA;
         }
 
         if (slice_size < slice_off || slice_size > msize)
@@ -389,6 +404,15 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
                               NULL, nslices, sizeof(ctx->slice_data[0]))) < 0)
         return ret;
 
+    ctx->frame->key_frame = 1;
+    ctx->frame->pict_type = AV_PICTURE_TYPE_I;
+    for (slice = 0; slice < nslices; slice++) {
+        if (ctx->slice_data[slice].p_frame) {
+            ctx->frame->key_frame = 0;
+            ctx->frame->pict_type = AV_PICTURE_TYPE_P;
+            break;
+        }
+    }
     av_frame_free(&ctx->final_frame);
     ctx->final_frame = av_frame_clone(ctx->frame);
     if (!ctx->final_frame) {
@@ -404,7 +428,7 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
 
     /* Draw cursor. */
     if (!skip_cursor) {
-        memcpy(ctx->cursor_buf, src + 59, 32 * 32 * 4);
+        memcpy(ctx->cursor_buf, src + CURSOR_OFFSET, sizeof(ctx->cursor_buf));
         fic_draw_cursor(avctx, cur_x, cur_y);
     }
 
@@ -446,6 +470,18 @@ static av_cold int fic_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = {
+{ "skip_cursor", "skip the cursor", offsetof(FICContext, skip_cursor), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM },
+{ NULL },
+};
+
+static const AVClass fic_decoder_class = {
+    .class_name = "FIC decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_fic_decoder = {
     .name           = "fic",
     .long_name      = NULL_IF_CONFIG_SMALL("Mirillis FIC"),
@@ -456,4 +492,5 @@ AVCodec ff_fic_decoder = {
     .decode         = fic_decode_frame,
     .close          = fic_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .priv_class     = &fic_decoder_class,
 };
diff --git a/libavcodec/filter_units_bsf.c b/libavcodec/filter_units_bsf.c
new file mode 100644
index 0000000..bc2ca28
--- /dev/null
+++ b/libavcodec/filter_units_bsf.c
@@ -0,0 +1,257 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+
+#include "bsf.h"
+#include "cbs.h"
+
+
+typedef struct FilterUnitsContext {
+    const AVClass *class;
+
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment fragment;
+
+    const char *pass_types;
+    const char *remove_types;
+
+    enum {
+        NOOP,
+        PASS,
+        REMOVE,
+    } mode;
+    CodedBitstreamUnitType *type_list;
+    int nb_types;
+} FilterUnitsContext;
+
+
+static int filter_units_make_type_list(const char *list_string,
+                                       CodedBitstreamUnitType **type_list,
+                                       int *nb_types)
+{
+    CodedBitstreamUnitType *list = NULL;
+    int pass, count;
+
+    for (pass = 1; pass <= 2; pass++) {
+        long value, range_start, range_end;
+        const char *str;
+        char *value_end;
+
+        count = 0;
+        for (str = list_string; *str;) {
+            value = strtol(str, &value_end, 0);
+            if (str == value_end)
+                goto invalid;
+            str = (const char *)value_end;
+            if (*str == '-') {
+                ++str;
+                range_start = value;
+                range_end   = strtol(str, &value_end, 0);
+                if (str == value_end)
+                    goto invalid;
+
+                for (value = range_start; value < range_end; value++) {
+                    if (pass == 2)
+                        list[count] = value;
+                    ++count;
+                }
+            } else {
+                if (pass == 2)
+                    list[count] = value;
+                ++count;
+            }
+            if (*str == '|')
+                ++str;
+        }
+        if (pass == 1) {
+            list = av_malloc_array(count, sizeof(*list));
+            if (!list)
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    *type_list = list;
+    *nb_types  = count;
+    return 0;
+
+invalid:
+    av_freep(&list);
+    return AVERROR(EINVAL);
+}
+
+static int filter_units_filter(AVBSFContext *bsf, AVPacket *out)
+{
+    FilterUnitsContext      *ctx = bsf->priv_data;
+    CodedBitstreamFragment *frag = &ctx->fragment;
+    AVPacket *in = NULL;
+    int err, i, j;
+
+    while (1) {
+        err = ff_bsf_get_packet(bsf, &in);
+        if (err < 0)
+            return err;
+
+        if (ctx->mode == NOOP) {
+            av_packet_move_ref(out, in);
+            av_packet_free(&in);
+            return 0;
+        }
+
+        err = ff_cbs_read_packet(ctx->cbc, frag, in);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to read packet.\n");
+            goto fail;
+        }
+
+        for (i = 0; i < frag->nb_units; i++) {
+            for (j = 0; j < ctx->nb_types; j++) {
+                if (frag->units[i].type == ctx->type_list[j])
+                    break;
+            }
+            if (ctx->mode == REMOVE ? j <  ctx->nb_types
+                                    : j >= ctx->nb_types) {
+                ff_cbs_delete_unit(ctx->cbc, frag, i);
+                --i;
+            }
+        }
+
+        if (frag->nb_units > 0)
+            break;
+
+        // Don't return packets with nothing in them.
+        av_packet_free(&in);
+        ff_cbs_fragment_reset(ctx->cbc, frag);
+    }
+
+    err = ff_cbs_write_packet(ctx->cbc, out, frag);
+    if (err < 0) {
+        av_log(bsf, AV_LOG_ERROR, "Failed to write packet.\n");
+        goto fail;
+    }
+
+    err = av_packet_copy_props(out, in);
+    if (err < 0)
+        goto fail;
+
+fail:
+    ff_cbs_fragment_reset(ctx->cbc, frag);
+    av_packet_free(&in);
+
+    return err;
+}
+
+static int filter_units_init(AVBSFContext *bsf)
+{
+    FilterUnitsContext *ctx = bsf->priv_data;
+    int err;
+
+    if (ctx->pass_types && ctx->remove_types) {
+        av_log(bsf, AV_LOG_ERROR, "Exactly one of pass_types or "
+               "remove_types is required.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (ctx->pass_types) {
+        ctx->mode = PASS;
+        err = filter_units_make_type_list(ctx->pass_types,
+                                          &ctx->type_list, &ctx->nb_types);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to parse pass_types.\n");
+            return err;
+        }
+    } else if (ctx->remove_types) {
+        ctx->mode = REMOVE;
+        err = filter_units_make_type_list(ctx->remove_types,
+                                          &ctx->type_list, &ctx->nb_types);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to parse remove_types.\n");
+            return err;
+        }
+    } else {
+        return 0;
+    }
+
+    err = ff_cbs_init(&ctx->cbc, bsf->par_in->codec_id, bsf);
+    if (err < 0)
+        return err;
+
+    // Don't actually decompose anything, we only want the unit data.
+    ctx->cbc->decompose_unit_types    = ctx->type_list;
+    ctx->cbc->nb_decompose_unit_types = 0;
+
+    if (bsf->par_in->extradata) {
+        CodedBitstreamFragment *frag = &ctx->fragment;
+
+        err = ff_cbs_read_extradata(ctx->cbc, frag, bsf->par_in);
+        if (err < 0) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to read extradata.\n");
+        } else {
+            err = ff_cbs_write_extradata(ctx->cbc, bsf->par_out, frag);
+            if (err < 0)
+                av_log(bsf, AV_LOG_ERROR, "Failed to write extradata.\n");
+        }
+
+        ff_cbs_fragment_reset(ctx->cbc, frag);
+    }
+
+    return err;
+}
+
+static void filter_units_close(AVBSFContext *bsf)
+{
+    FilterUnitsContext *ctx = bsf->priv_data;
+
+    av_freep(&ctx->type_list);
+
+    ff_cbs_fragment_free(ctx->cbc, &ctx->fragment);
+    ff_cbs_close(&ctx->cbc);
+}
+
+#define OFFSET(x) offsetof(FilterUnitsContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
+static const AVOption filter_units_options[] = {
+    { "pass_types",   "List of unit types to pass through the filter.",
+        OFFSET(pass_types),   AV_OPT_TYPE_STRING,
+        { .str = NULL }, .flags = FLAGS },
+    { "remove_types", "List of unit types to remove in the filter.",
+        OFFSET(remove_types), AV_OPT_TYPE_STRING,
+        { .str = NULL }, .flags = FLAGS },
+
+    { NULL }
+};
+
+static const AVClass filter_units_class = {
+    .class_name = "filter_units",
+    .item_name  = av_default_item_name,
+    .option     = filter_units_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+const AVBitStreamFilter ff_filter_units_bsf = {
+    .name           = "filter_units",
+    .priv_data_size = sizeof(FilterUnitsContext),
+    .priv_class     = &filter_units_class,
+    .init           = &filter_units_init,
+    .close          = &filter_units_close,
+    .filter         = &filter_units_filter,
+    .codec_ids      = ff_cbs_all_codec_ids,
+};
diff --git a/libavcodec/fits.c b/libavcodec/fits.c
new file mode 100644
index 0000000..365347f
--- /dev/null
+++ b/libavcodec/fits.c
@@ -0,0 +1,203 @@
+/*
+ * FITS implementation of common functions
+ * Copyright (c) 2017 Paras Chadha
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "libavutil/dict.h"
+#include "fits.h"
+
+int avpriv_fits_header_init(FITSHeader *header, FITSHeaderState state)
+{
+    header->state = state;
+    header->naxis_index = 0;
+    header->blank_found = 0;
+    header->pcount = 0;
+    header->gcount = 1;
+    header->groups = 0;
+    header->rgb = 0;
+    header->image_extension = 0;
+    header->bscale = 1.0;
+    header->bzero = 0;
+    header->data_min_found = 0;
+    header->data_max_found = 0;
+    return 0;
+}
+
+static int dict_set_if_not_null(AVDictionary ***metadata, char *keyword, char *value)
+{
+    if (metadata)
+        av_dict_set(*metadata, keyword, value, 0);
+    return 0;
+}
+
+/**
+ * Extract keyword and value from a header line (80 bytes) and store them in keyword and value strings respectively
+ * @param ptr8 pointer to the data
+ * @param keyword pointer to the char array in which keyword is to be stored
+ * @param value pointer to the char array in which value is to be stored
+ * @return 0 if calculated successfully otherwise AVERROR_INVALIDDATA
+ */
+static int read_keyword_value(const uint8_t *ptr8, char *keyword, char *value)
+{
+    int i;
+
+    for (i = 0; i < 8 && ptr8[i] != ' '; i++) {
+        keyword[i] = ptr8[i];
+    }
+    keyword[i] = '\0';
+
+    if (ptr8[8] == '=') {
+        i = 10;
+        while (i < 80 && ptr8[i] == ' ') {
+            i++;
+        }
+
+        if (i < 80) {
+            *value++ = ptr8[i];
+            i++;
+            if (ptr8[i-1] == '\'') {
+                for (; i < 80 && ptr8[i] != '\''; i++) {
+                    *value++ = ptr8[i];
+                }
+                *value++ = '\'';
+            } else if (ptr8[i-1] == '(') {
+                for (; i < 80 && ptr8[i] != ')'; i++) {
+                    *value++ = ptr8[i];
+                }
+                *value++ = ')';
+            } else {
+                for (; i < 80 && ptr8[i] != ' ' && ptr8[i] != '/'; i++) {
+                    *value++ = ptr8[i];
+                }
+            }
+        }
+    }
+    *value = '\0';
+    return 0;
+}
+
+#define CHECK_KEYWORD(key) \
+    if (strcmp(keyword, key)) { \
+        av_log(avcl, AV_LOG_ERROR, "expected %s keyword, found %s = %s\n", key, keyword, value); \
+        return AVERROR_INVALIDDATA; \
+    }
+
+#define CHECK_VALUE(key, val) \
+    if (sscanf(value, "%d", &header->val) != 1) { \
+        av_log(avcl, AV_LOG_ERROR, "invalid value of %s keyword, %s = %s\n", key, keyword, value); \
+        return AVERROR_INVALIDDATA; \
+    }
+
+int avpriv_fits_header_parse_line(void *avcl, FITSHeader *header, const uint8_t line[80], AVDictionary ***metadata)
+{
+    int dim_no, ret;
+    int64_t t;
+    double d;
+    char keyword[10], value[72], c;
+
+    read_keyword_value(line, keyword, value);
+    switch (header->state) {
+    case STATE_SIMPLE:
+        CHECK_KEYWORD("SIMPLE");
+
+        if (value[0] == 'F') {
+            av_log(avcl, AV_LOG_WARNING, "not a standard FITS file\n");
+        } else if (value[0] != 'T') {
+            av_log(avcl, AV_LOG_ERROR, "invalid value of SIMPLE keyword, SIMPLE = %c\n", value[0]);
+            return AVERROR_INVALIDDATA;
+        }
+
+        header->state = STATE_BITPIX;
+        break;
+    case STATE_XTENSION:
+        CHECK_KEYWORD("XTENSION");
+
+        if (!strcmp(value, "'IMAGE   '")) {
+            header->image_extension = 1;
+        }
+
+        header->state = STATE_BITPIX;
+        break;
+    case STATE_BITPIX:
+        CHECK_KEYWORD("BITPIX");
+        CHECK_VALUE("BITPIX", bitpix);
+        dict_set_if_not_null(metadata, keyword, value);
+
+        header->state = STATE_NAXIS;
+        break;
+    case STATE_NAXIS:
+        CHECK_KEYWORD("NAXIS");
+        CHECK_VALUE("NAXIS", naxis);
+        dict_set_if_not_null(metadata, keyword, value);
+
+        if (header->naxis) {
+            header->state = STATE_NAXIS_N;
+        } else {
+            header->state = STATE_REST;
+        }
+        break;
+    case STATE_NAXIS_N:
+        ret = sscanf(keyword, "NAXIS%d", &dim_no);
+        if (ret != 1 || dim_no != header->naxis_index + 1) {
+            av_log(avcl, AV_LOG_ERROR, "expected NAXIS%d keyword, found %s = %s\n", header->naxis_index + 1, keyword, value);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (sscanf(value, "%d", &header->naxisn[header->naxis_index]) != 1) {
+            av_log(avcl, AV_LOG_ERROR, "invalid value of NAXIS%d keyword, %s = %s\n", header->naxis_index + 1, keyword, value);
+            return AVERROR_INVALIDDATA;
+        }
+
+        dict_set_if_not_null(metadata, keyword, value);
+        header->naxis_index++;
+        if (header->naxis_index == header->naxis) {
+            header->state = STATE_REST;
+        }
+        break;
+    case STATE_REST:
+        if (!strcmp(keyword, "BLANK") && sscanf(value, "%"SCNd64"", &t) == 1) {
+            header->blank = t;
+            header->blank_found = 1;
+        } else if (!strcmp(keyword, "BSCALE") && sscanf(value, "%lf", &d) == 1) {
+            header->bscale = d;
+        } else if (!strcmp(keyword, "BZERO") && sscanf(value, "%lf", &d) == 1) {
+            header->bzero = d;
+        } else if (!strcmp(keyword, "CTYPE3") && !strncmp(value, "'RGB", 4)) {
+            header->rgb = 1;
+        } else if (!strcmp(keyword, "DATAMAX") && sscanf(value, "%lf", &d) == 1) {
+            header->data_max_found = 1;
+            header->data_max = d;
+        } else if (!strcmp(keyword, "DATAMIN") && sscanf(value, "%lf", &d) == 1) {
+            header->data_min_found = 1;
+            header->data_min = d;
+        } else if (!strcmp(keyword, "END")) {
+            return 1;
+        } else if (!strcmp(keyword, "GROUPS") && sscanf(value, "%c", &c) == 1) {
+            header->groups = (c == 'T');
+        } else if (!strcmp(keyword, "GCOUNT") && sscanf(value, "%"SCNd64"", &t) == 1) {
+            header->gcount = t;
+        } else if (!strcmp(keyword, "PCOUNT") && sscanf(value, "%"SCNd64"", &t) == 1) {
+            header->pcount = t;
+        }
+        dict_set_if_not_null(metadata, keyword, value);
+        break;
+    }
+    return 0;
+}
diff --git a/libavcodec/fits.h b/libavcodec/fits.h
new file mode 100644
index 0000000..ebae85c
--- /dev/null
+++ b/libavcodec/fits.h
@@ -0,0 +1,83 @@
+/*
+ * FITS image format common prototypes and structures
+ * Copyright (c) 2017 Paras Chadha
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FITS_H
+#define AVCODEC_FITS_H
+
+#include <inttypes.h>
+
+#include "libavutil/dict.h"
+
+typedef enum FITSHeaderState {
+    STATE_SIMPLE,
+    STATE_XTENSION,
+    STATE_BITPIX,
+    STATE_NAXIS,
+    STATE_NAXIS_N,
+    STATE_PCOUNT,
+    STATE_GCOUNT,
+    STATE_REST,
+} FITSHeaderState;
+
+/**
+ * Structure to store the header keywords in FITS file
+ */
+typedef struct FITSHeader {
+    FITSHeaderState state;
+    unsigned naxis_index;
+    int bitpix;
+    int64_t blank;
+    int blank_found;
+    int naxis;
+    int naxisn[999];
+    int pcount;
+    int gcount;
+    int groups;
+    int rgb; /**< 1 if file contains RGB image, 0 otherwise */
+    int image_extension;
+    double bscale;
+    double bzero;
+    int data_min_found;
+    double data_min;
+    int data_max_found;
+    double data_max;
+} FITSHeader;
+
+
+/**
+ * Initialize a single header line
+ * @param header pointer to the header
+ * @param state current state of parsing the header
+ * @return 0 if successful otherwise AVERROR_INVALIDDATA
+ */
+int avpriv_fits_header_init(FITSHeader *header, FITSHeaderState state);
+
+/**
+ * Parse a single header line
+ * @param avcl used in av_log
+ * @param header pointer to the header
+ * @param line one header line
+ * @param metadata used to store metadata while decoding
+ * @return 0 if successful otherwise AVERROR_INVALIDDATA
+ */
+int avpriv_fits_header_parse_line(void *avcl, FITSHeader *header, const uint8_t line[80], AVDictionary ***metadata);
+
+#endif /* AVCODEC_FITS_H */
diff --git a/libavcodec/fitsdec.c b/libavcodec/fitsdec.c
new file mode 100644
index 0000000..b075381
--- /dev/null
+++ b/libavcodec/fitsdec.c
@@ -0,0 +1,317 @@
+/*
+ * FITS image decoder
+ * Copyright (c) 2017 Paras Chadha
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * FITS image decoder
+ *
+ * Specification: https://fits.gsfc.nasa.gov/fits_standard.html Version 3.0
+ *
+ * Support all 2d images alongwith, bzero, bscale and blank keywords.
+ * RGBA images are supported as NAXIS3 = 3 or 4 i.e. Planes in RGBA order. Also CTYPE = 'RGB ' should be present.
+ * Also to interpret data, values are linearly scaled using min-max scaling but not RGB images.
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include <float.h>
+#include "libavutil/intreadwrite.h"
+#include "libavutil/intfloat.h"
+#include "libavutil/dict.h"
+#include "libavutil/opt.h"
+#include "fits.h"
+
+typedef struct FITSContext {
+    const AVClass *class;
+    int blank_val;
+} FITSContext;
+
+/**
+ * Calculate the data_min and data_max values from the data.
+ * This is called if the values are not present in the header.
+ * @param ptr8 pointer to the data
+ * @param header pointer to the header
+ * @param end pointer to end of packet
+ * @return 0 if calculated successfully otherwise AVERROR_INVALIDDATA
+ */
+static int fill_data_min_max(const uint8_t *ptr8, FITSHeader *header, const uint8_t *end)
+{
+    uint8_t t8;
+    int16_t t16;
+    int32_t t32;
+    int64_t t64;
+    float tflt;
+    double tdbl;
+    int i, j;
+
+    header->data_min = DBL_MAX;
+    header->data_max = DBL_MIN;
+    switch (header->bitpix) {
+#define CASE_N(a, t, rd) \
+    case a: \
+        for (i = 0; i < header->naxisn[1]; i++) { \
+            for (j = 0; j < header->naxisn[0]; j++) { \
+                t = rd; \
+                if (!header->blank_found || t != header->blank) { \
+                    if (t > header->data_max) \
+                        header->data_max = t; \
+                    if (t < header->data_min) \
+                        header->data_min = t; \
+                } \
+                ptr8 += abs(a) >> 3; \
+            } \
+        } \
+        break
+
+        CASE_N(-64, tdbl, av_int2double(AV_RB64(ptr8)));
+        CASE_N(-32, tflt, av_int2float(AV_RB32(ptr8)));
+        CASE_N(8, t8, ptr8[0]);
+        CASE_N(16, t16, AV_RB16(ptr8));
+        CASE_N(32, t32, AV_RB32(ptr8));
+        CASE_N(64, t64, AV_RB64(ptr8));
+        default:
+            return AVERROR_INVALIDDATA;
+    }
+    return 0;
+}
+
+/**
+ * Read the fits header and store the values in FITSHeader pointed by header
+ * @param avctx AVCodec context
+ * @param ptr pointer to pointer to the data
+ * @param header pointer to the FITSHeader
+ * @param end pointer to end of packet
+ * @param metadata pointer to pointer to AVDictionary to store metadata
+ * @return 0 if calculated successfully otherwise AVERROR_INVALIDDATA
+ */
+static int fits_read_header(AVCodecContext *avctx, const uint8_t **ptr, FITSHeader *header,
+                            const uint8_t *end, AVDictionary **metadata)
+{
+    const uint8_t *ptr8 = *ptr;
+    int lines_read, bytes_left, i, ret;
+    size_t size;
+
+    lines_read = 1; // to account for first header line, SIMPLE or XTENSION which is not included in packet...
+    avpriv_fits_header_init(header, STATE_BITPIX);
+    do {
+        if (end - ptr8 < 80)
+            return AVERROR_INVALIDDATA;
+        ret = avpriv_fits_header_parse_line(avctx, header, ptr8, &metadata);
+        ptr8 += 80;
+        lines_read++;
+    } while (!ret);
+    if (ret < 0)
+        return ret;
+
+    bytes_left = (((lines_read + 35) / 36) * 36 - lines_read) * 80;
+    if (end - ptr8 < bytes_left)
+        return AVERROR_INVALIDDATA;
+    ptr8 += bytes_left;
+
+    if (header->rgb && (header->naxis != 3 || (header->naxisn[2] != 3 && header->naxisn[2] != 4))) {
+        av_log(avctx, AV_LOG_ERROR, "File contains RGB image but NAXIS = %d and NAXIS3 = %d\n", header->naxis, header->naxisn[2]);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!header->rgb && header->naxis != 2) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of dimensions, NAXIS = %d\n", header->naxis);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (header->blank_found && (header->bitpix == -32 || header->bitpix == -64)) {
+        av_log(avctx, AV_LOG_WARNING, "BLANK keyword found but BITPIX = %d\n. Ignoring BLANK", header->bitpix);
+        header->blank_found = 0;
+    }
+
+    size = abs(header->bitpix) >> 3;
+    for (i = 0; i < header->naxis; i++) {
+        if (header->naxisn[i] > SIZE_MAX / size) {
+            av_log(avctx, AV_LOG_ERROR, "unsupported size of FITS image");
+            return AVERROR_INVALIDDATA;
+        }
+        size *= header->naxisn[i];
+    }
+
+    if (end - ptr8 < size)
+        return AVERROR_INVALIDDATA;
+    *ptr = ptr8;
+
+    if (!header->rgb && (!header->data_min_found || !header->data_max_found)) {
+        ret = fill_data_min_max(ptr8, header, end);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "invalid BITPIX, %d\n", header->bitpix);
+            return ret;
+        }
+    } else {
+        /*
+         * instead of applying bscale and bzero to every element,
+         * we can do inverse transformation on data_min and data_max
+         */
+        header->data_min = (header->data_min - header->bzero) / header->bscale;
+        header->data_max = (header->data_max - header->bzero) / header->bscale;
+    }
+
+    return 0;
+}
+
+static int fits_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *p=data;
+    const uint8_t *ptr8 = avpkt->data, *end;
+    uint8_t t8;
+    int16_t t16;
+    int32_t t32;
+    int64_t t64;
+    float   tflt;
+    double  tdbl;
+    int ret, i, j, k;
+    const int map[] = {2, 0, 1, 3}; // mapping from GBRA -> RGBA as RGBA is to be stored in FITS file..
+    uint8_t *dst8;
+    uint16_t *dst16;
+    uint64_t t;
+    FITSHeader header;
+    FITSContext * fitsctx = avctx->priv_data;
+
+    end = ptr8 + avpkt->size;
+    p->metadata = NULL;
+    ret = fits_read_header(avctx, &ptr8, &header, end, &p->metadata);
+    if (ret < 0)
+        return ret;
+
+    if (header.rgb) {
+        if (header.bitpix == 8) {
+            if (header.naxisn[2] == 3) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            }
+        } else if (header.bitpix == 16) {
+            if (header.naxisn[2] == 3) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP16;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP16;
+            }
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "unsupported BITPIX = %d\n", header.bitpix);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        if (header.bitpix == 8) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        } else {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+        }
+    }
+
+    if ((ret = ff_set_dimensions(avctx, header.naxisn[0], header.naxisn[1])) < 0)
+        return ret;
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
+
+    /*
+     * FITS stores images with bottom row first. Therefore we have
+     * to fill the image from bottom to top.
+     */
+    if (header.rgb) {
+        switch(header.bitpix) {
+#define CASE_RGB(cas, dst, type, dref) \
+    case cas: \
+        for (k = 0; k < header.naxisn[2]; k++) { \
+            for (i = 0; i < avctx->height; i++) { \
+                dst = (type *) (p->data[map[k]] + (avctx->height - i - 1) * p->linesize[map[k]]); \
+                for (j = 0; j < avctx->width; j++) { \
+                    t32 = dref(ptr8); \
+                    if (!header.blank_found || t32 != header.blank) { \
+                        t = t32 * header.bscale + header.bzero; \
+                    } else { \
+                        t = fitsctx->blank_val; \
+                    } \
+                    *dst++ = (type) t; \
+                    ptr8 += cas >> 3; \
+                } \
+            } \
+        } \
+        break
+
+            CASE_RGB(8, dst8, uint8_t, *);
+            CASE_RGB(16, dst16, uint16_t, AV_RB16);
+        }
+    } else {
+        switch (header.bitpix) {
+#define CASE_GRAY(cas, dst, type, t, rd) \
+    case cas: \
+        for (i = 0; i < avctx->height; i++) { \
+            dst = (type *) (p->data[0] + (avctx->height-i-1)* p->linesize[0]); \
+            for (j = 0; j < avctx->width; j++) { \
+                t = rd; \
+                if (!header.blank_found || t != header.blank) { \
+                    *dst++ = ((t - header.data_min) * ((1 << (sizeof(type) * 8)) - 1)) / (header.data_max - header.data_min); \
+                } else { \
+                    *dst++ = fitsctx->blank_val; \
+                } \
+                ptr8 += abs(cas) >> 3; \
+            } \
+        } \
+        break
+
+            CASE_GRAY(-64, dst16, uint16_t, tdbl, av_int2double(AV_RB64(ptr8)));
+            CASE_GRAY(-32, dst16, uint16_t, tflt, av_int2float(AV_RB32(ptr8)));
+            CASE_GRAY(8, dst8, uint8_t, t8, ptr8[0]);
+            CASE_GRAY(16, dst16, uint16_t, t16, AV_RB16(ptr8));
+            CASE_GRAY(32, dst16, uint16_t, t32, AV_RB32(ptr8));
+            CASE_GRAY(64, dst16, uint16_t, t64, AV_RB64(ptr8));
+            default:
+                av_log(avctx, AV_LOG_ERROR, "invalid BITPIX, %d\n", header.bitpix);
+                return AVERROR_INVALIDDATA;
+        }
+    }
+
+    p->key_frame = 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static const AVOption fits_options[] = {
+    { "blank_value", "value that is used to replace BLANK pixels in data array", offsetof(FITSContext, blank_val), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 65535, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM},
+    { NULL },
+};
+
+static const AVClass fits_decoder_class = {
+    .class_name = "FITS decoder",
+    .item_name  = av_default_item_name,
+    .option     = fits_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_fits_decoder = {
+    .name           = "fits",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_FITS,
+    .priv_data_size = sizeof(FITSContext),
+    .decode         = fits_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("Flexible Image Transport System"),
+    .priv_class     = &fits_decoder_class
+};
diff --git a/libavcodec/fitsenc.c b/libavcodec/fitsenc.c
new file mode 100644
index 0000000..b44507e
--- /dev/null
+++ b/libavcodec/fitsenc.c
@@ -0,0 +1,129 @@
+/*
+ * FITS image encoder
+ * Copyright (c) 2017 Paras Chadha
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * FITS image encoder
+ *
+ * Specification: https://fits.gsfc.nasa.gov/fits_standard.html Version 3.0
+ *
+ * RGBA images are encoded as planes in RGBA order. So, NAXIS3 is 3 or 4 for them.
+ * Also CTYPE3 = 'RGB ' is added to the header to distinguish them from 3d images.
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+static int fits_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                            const AVFrame *pict, int *got_packet)
+{
+    AVFrame * const p = (AVFrame *)pict;
+    uint8_t *bytestream, *bytestream_start, *ptr;
+    const uint16_t flip = (1 << 15);
+    uint64_t data_size = 0, padded_data_size = 0;
+    int ret, bitpix, naxis3 = 1, i, j, k, bytes_left;
+    int map[] = {2, 0, 1, 3}; // mapping from GBRA -> RGBA as RGBA is to be stored in FITS file..
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY16BE:
+        map[0] = 0; // grayscale images should be directly mapped
+        if (avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+            bitpix = 8;
+        } else {
+            bitpix = 16;
+        }
+        break;
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRAP:
+        bitpix = 8;
+        if (avctx->pix_fmt == AV_PIX_FMT_GBRP) {
+            naxis3 = 3;
+        } else {
+            naxis3 = 4;
+        }
+        break;
+    case AV_PIX_FMT_GBRP16BE:
+    case AV_PIX_FMT_GBRAP16BE:
+        bitpix = 16;
+        if (avctx->pix_fmt == AV_PIX_FMT_GBRP16BE) {
+            naxis3 = 3;
+        } else {
+            naxis3 = 4;
+        }
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
+        return AVERROR(EINVAL);
+    }
+
+    data_size = (bitpix >> 3) * avctx->height * avctx->width * naxis3;
+    padded_data_size = ((data_size + 2879) / 2880 ) * 2880;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, padded_data_size, 0)) < 0)
+        return ret;
+
+    bytestream_start =
+    bytestream       = pkt->data;
+
+    for (k = 0; k < naxis3; k++) {
+        for (i = 0; i < avctx->height; i++) {
+            ptr = p->data[map[k]] + (avctx->height - i - 1) * p->linesize[map[k]];
+            if (bitpix == 16) {
+                for (j = 0; j < avctx->width; j++) {
+                    // subtracting bzero is equivalent to first bit flip
+                    bytestream_put_be16(&bytestream, AV_RB16(ptr) ^ flip);
+                    ptr += 2;
+                }
+            } else {
+                memcpy(bytestream, ptr, avctx->width);
+                bytestream += avctx->width;
+            }
+        }
+    }
+
+    bytes_left = padded_data_size - data_size;
+    memset(bytestream, 0, bytes_left);
+    bytestream += bytes_left;
+
+    pkt->size   = bytestream - bytestream_start;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+AVCodec ff_fits_encoder = {
+    .name           = "fits",
+    .long_name      = NULL_IF_CONFIG_SMALL("Flexible Image Transport System"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_FITS,
+    .encode2        = fits_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_GBRAP16BE,
+                                                     AV_PIX_FMT_GBRP16BE,
+                                                     AV_PIX_FMT_GBRP,
+                                                     AV_PIX_FMT_GBRAP,
+                                                     AV_PIX_FMT_GRAY16BE,
+                                                     AV_PIX_FMT_GRAY8,
+                                                     AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/flac.c b/libavcodec/flac.c
index 5062c1e..5ffbf93 100644
--- a/libavcodec/flac.c
+++ b/libavcodec/flac.c
@@ -2,29 +2,28 @@
  * FLAC common code
  * Copyright (c) 2009 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/crc.h"
 #include "libavutil/log.h"
-
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "flac.h"
 #include "flacdata.h"
 
@@ -41,33 +40,33 @@ static const uint64_t flac_channel_layouts[8] = {
     AV_CH_LAYOUT_7POINT1
 };
 
-static int64_t get_utf8(BitstreamContext *bc)
+static int64_t get_utf8(GetBitContext *gb)
 {
     int64_t val;
-    GET_UTF8(val, bitstream_read(bc, 8), return -1;)
+    GET_UTF8(val, get_bits(gb, 8), return -1;)
     return val;
 }
 
-int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
+int ff_flac_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
                                 FLACFrameInfo *fi, int log_level_offset)
 {
     int bs_code, sr_code, bps_code;
 
     /* frame sync code */
-    if ((bitstream_read(bc, 15) & 0x7FFF) != 0x7FFC) {
+    if ((get_bits(gb, 15) & 0x7FFF) != 0x7FFC) {
         av_log(avctx, AV_LOG_ERROR + log_level_offset, "invalid sync code\n");
         return AVERROR_INVALIDDATA;
     }
 
     /* variable block size stream code */
-    fi->is_var_size = bitstream_read_bit(bc);
+    fi->is_var_size = get_bits1(gb);
 
     /* block size and sample rate codes */
-    bs_code = bitstream_read(bc, 4);
-    sr_code = bitstream_read(bc, 4);
+    bs_code = get_bits(gb, 4);
+    sr_code = get_bits(gb, 4);
 
     /* channels and decorrelation */
-    fi->ch_mode = bitstream_read(bc, 4);
+    fi->ch_mode = get_bits(gb, 4);
     if (fi->ch_mode < FLAC_MAX_CHANNELS) {
         fi->channels = fi->ch_mode + 1;
         fi->ch_mode = FLAC_CHMODE_INDEPENDENT;
@@ -81,7 +80,7 @@ int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
     }
 
     /* bits per sample */
-    bps_code = bitstream_read(bc, 3);
+    bps_code = get_bits(gb, 3);
     if (bps_code == 3 || bps_code == 7) {
         av_log(avctx, AV_LOG_ERROR + log_level_offset,
                "invalid sample size code (%d)\n",
@@ -91,14 +90,14 @@ int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
     fi->bps = sample_size_table[bps_code];
 
     /* reserved bit */
-    if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
         av_log(avctx, AV_LOG_ERROR + log_level_offset,
                "broken stream, invalid padding\n");
         return AVERROR_INVALIDDATA;
     }
 
     /* sample or frame count */
-    fi->frame_or_sample_num = get_utf8(bc);
+    fi->frame_or_sample_num = get_utf8(gb);
     if (fi->frame_or_sample_num < 0) {
         av_log(avctx, AV_LOG_ERROR + log_level_offset,
                "sample/frame number invalid; utf8 fscked\n");
@@ -111,9 +110,9 @@ int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
                "reserved blocksize code: 0\n");
         return AVERROR_INVALIDDATA;
     } else if (bs_code == 6) {
-        fi->blocksize = bitstream_read(bc, 8) + 1;
+        fi->blocksize = get_bits(gb, 8) + 1;
     } else if (bs_code == 7) {
-        fi->blocksize = bitstream_read(bc, 16) + 1;
+        fi->blocksize = get_bits(gb, 16) + 1;
     } else {
         fi->blocksize = ff_flac_blocksize_table[bs_code];
     }
@@ -122,11 +121,11 @@ int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
     if (sr_code < 12) {
         fi->samplerate = ff_flac_sample_rate_table[sr_code];
     } else if (sr_code == 12) {
-        fi->samplerate = bitstream_read(bc, 8) * 1000;
+        fi->samplerate = get_bits(gb, 8) * 1000;
     } else if (sr_code == 13) {
-        fi->samplerate = bitstream_read(bc, 16);
+        fi->samplerate = get_bits(gb, 16);
     } else if (sr_code == 14) {
-        fi->samplerate = bitstream_read(bc, 16) * 10;
+        fi->samplerate = get_bits(gb, 16) * 10;
     } else {
         av_log(avctx, AV_LOG_ERROR + log_level_offset,
                "illegal sample rate code %d\n",
@@ -135,9 +134,9 @@ int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
     }
 
     /* header CRC-8 check */
-    bitstream_skip(bc, 8);
-    if (av_crc(av_crc_get_table(AV_CRC_8_ATM), 0, bc->buffer,
-               bitstream_tell(bc) / 8)) {
+    skip_bits(gb, 8);
+    if (av_crc(av_crc_get_table(AV_CRC_8_ATM), 0, gb->buffer,
+               get_bits_count(gb)/8)) {
         av_log(avctx, AV_LOG_ERROR + log_level_offset,
                "header crc mismatch\n");
         return AVERROR_INVALIDDATA;
@@ -202,25 +201,33 @@ void ff_flac_set_channel_layout(AVCodecContext *avctx)
         avctx->channel_layout = 0;
 }
 
-void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
+int ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
                               const uint8_t *buffer)
 {
-    BitstreamContext bc;
-    bitstream_init8(&bc, buffer, FLAC_STREAMINFO_SIZE);
+    GetBitContext gb;
+    init_get_bits(&gb, buffer, FLAC_STREAMINFO_SIZE*8);
 
-    bitstream_skip(&bc, 16); /* skip min blocksize */
-    s->max_blocksize = bitstream_read(&bc, 16);
+    skip_bits(&gb, 16); /* skip min blocksize */
+    s->max_blocksize = get_bits(&gb, 16);
     if (s->max_blocksize < FLAC_MIN_BLOCKSIZE) {
         av_log(avctx, AV_LOG_WARNING, "invalid max blocksize: %d\n",
                s->max_blocksize);
         s->max_blocksize = 16;
+        return AVERROR_INVALIDDATA;
     }
 
-    bitstream_skip(&bc, 24); /* skip min frame size */
-    s->max_framesize = bitstream_read(&bc, 24);
-    s->samplerate    = bitstream_read(&bc, 20);
-    s->channels      = bitstream_read(&bc, 3) + 1;
-    s->bps           = bitstream_read(&bc, 5) + 1;
+    skip_bits(&gb, 24); /* skip min frame size */
+    s->max_framesize = get_bits_long(&gb, 24);
+
+    s->samplerate = get_bits_long(&gb, 20);
+    s->channels = get_bits(&gb, 3) + 1;
+    s->bps = get_bits(&gb, 5) + 1;
+
+    if (s->bps < 4) {
+        av_log(avctx, AV_LOG_ERROR, "invalid bps: %d\n", s->bps);
+        s->bps = 16;
+        return AVERROR_INVALIDDATA;
+    }
 
     avctx->channels = s->channels;
     avctx->sample_rate = s->samplerate;
@@ -230,24 +237,10 @@ void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
         av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels)
         ff_flac_set_channel_layout(avctx);
 
-    s->samples  = bitstream_read(&bc, 32) << 4;
-    s->samples |= bitstream_read(&bc, 4);
+    s->samples = get_bits64(&gb, 36);
 
-    bitstream_skip(&bc, 64); /* md5 sum */
-    bitstream_skip(&bc, 64); /* md5 sum */
-}
+    skip_bits_long(&gb, 64); /* md5 sum */
+    skip_bits_long(&gb, 64); /* md5 sum */
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-void avpriv_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
-                              const uint8_t *buffer)
-{
-    ff_flac_parse_streaminfo(avctx, s, buffer);
-}
-
-int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
-                               enum FLACExtradataFormat *format,
-                               uint8_t **streaminfo_start)
-{
-    return ff_flac_is_extradata_valid(avctx, format, streaminfo_start);
+    return 0;
 }
-#endif
diff --git a/libavcodec/flac.h b/libavcodec/flac.h
index d050717..991ab43 100644
--- a/libavcodec/flac.h
+++ b/libavcodec/flac.h
@@ -2,20 +2,20 @@
  * FLAC (Free Lossless Audio Codec) decoder/demuxer common functions
  * Copyright (c) 2008 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,8 @@
 #define AVCODEC_FLAC_H
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 
 #define FLAC_STREAMINFO_SIZE   34
 #define FLAC_MAX_CHANNELS       8
@@ -95,18 +95,12 @@ typedef struct FLACFrameInfo {
  * @param[out] avctx   codec context to set basic stream parameters
  * @param[out] s       where parsed information is stored
  * @param[in]  buffer  pointer to start of 34-byte streaminfo data
+ *
+ * @return negative error code on faiure or >= 0 on success
  */
-void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
+int ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
                               const uint8_t *buffer);
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-void avpriv_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
-                                  const uint8_t *buffer);
-int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
-                                   enum FLACExtradataFormat *format,
-                                   uint8_t **streaminfo_start);
-#endif
-
 /**
  * Validate the FLAC extradata.
  * @param[in]  avctx codec context containing the extradata.
@@ -129,12 +123,12 @@ int ff_flac_get_max_frame_size(int blocksize, int ch, int bps);
 /**
  * Validate and decode a frame header.
  * @param      avctx AVCodecContext to use as av_log() context
- * @param      bc    BitstreamContext from which to read frame header
+ * @param      gb    GetBitContext from which to read frame header
  * @param[out] fi    frame information
  * @param      log_level_offset  log level offset. can be used to silence error messages.
  * @return non-zero on error, 0 if ok
  */
-int ff_flac_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
+int ff_flac_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
                                 FLACFrameInfo *fi, int log_level_offset);
 
 void ff_flac_set_channel_layout(AVCodecContext *avctx);
diff --git a/libavcodec/flac_parser.c b/libavcodec/flac_parser.c
index 8150ec4..2721286 100644
--- a/libavcodec/flac_parser.c
+++ b/libavcodec/flac_parser.c
@@ -2,20 +2,20 @@
  * FLAC parser
  * Copyright (c) 2010 Michael Chinen
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,7 @@
 #include "flac.h"
 
 /** maximum number of adjacent headers that compare CRCs against each other   */
-#define FLAC_MAX_SEQUENTIAL_HEADERS 3
+#define FLAC_MAX_SEQUENTIAL_HEADERS 4
 /** minimum number of headers buffered and checked before returning frames    */
 #define FLAC_MIN_HEADERS 10
 /** estimate for average size of a FLAC frame                                 */
@@ -87,14 +87,16 @@ typedef struct FLACParseContext {
     int end_padded;                /**< specifies if fifo_buf's end is padded */
     uint8_t *wrap_buf;             /**< general fifo read buffer when wrapped */
     int wrap_buf_allocated_size;   /**< actual allocated size of the buffer   */
+    FLACFrameInfo last_fi;         /**< last decoded frame header info        */
+    int last_fi_valid;             /**< set if last_fi is valid               */
 } FLACParseContext;
 
 static int frame_header_is_valid(AVCodecContext *avctx, const uint8_t *buf,
                                  FLACFrameInfo *fi)
 {
-    BitstreamContext bc;
-    bitstream_init8(&bc, buf, MAX_FRAME_HEADER_SIZE);
-    return !ff_flac_decode_frame_header(avctx, &bc, fi, 127);
+    GetBitContext gb;
+    init_get_bits(&gb, buf, MAX_FRAME_HEADER_SIZE * 8);
+    return !ff_flac_decode_frame_header(avctx, &gb, fi, 127);
 }
 
 /**
@@ -180,7 +182,7 @@ static int find_headers_search_validate(FLACParseContext *fpc, int offset)
             size++;
         }
 
-        *end_handle = av_mallocz(sizeof(FLACHeaderMarker));
+        *end_handle = av_mallocz(sizeof(**end_handle));
         if (!*end_handle) {
             av_log(fpc->avctx, AV_LOG_ERROR,
                    "couldn't allocate FLACHeaderMarker\n");
@@ -190,6 +192,13 @@ static int find_headers_search_validate(FLACParseContext *fpc, int offset)
         (*end_handle)->offset       = offset;
         (*end_handle)->link_penalty = av_malloc(sizeof(int) *
                                             FLAC_MAX_SEQUENTIAL_HEADERS);
+        if (!(*end_handle)->link_penalty) {
+            av_freep(end_handle);
+            av_log(fpc->avctx, AV_LOG_ERROR,
+                   "couldn't allocate link_penalty\n");
+            return AVERROR(ENOMEM);
+        }
+
         for (i = 0; i < FLAC_MAX_SEQUENTIAL_HEADERS; i++)
             (*end_handle)->link_penalty[i] = FLAC_HEADER_NOT_PENALIZED_YET;
 
@@ -267,13 +276,12 @@ static int find_new_headers(FLACParseContext *fpc, int search_start)
     return size;
 }
 
-static int check_header_mismatch(FLACParseContext  *fpc,
-                                 FLACHeaderMarker  *header,
-                                 FLACHeaderMarker  *child,
-                                 int                log_level_offset)
+static int check_header_fi_mismatch(FLACParseContext  *fpc,
+                                    FLACFrameInfo     *header_fi,
+                                    FLACFrameInfo     *child_fi,
+                                    int                log_level_offset)
 {
-    FLACFrameInfo  *header_fi = &header->fi, *child_fi = &child->fi;
-    int deduction = 0, deduction_expected = 0, i;
+    int deduction = 0;
     if (child_fi->samplerate != header_fi->samplerate) {
         deduction += FLAC_HEADER_CHANGED_PENALTY;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
@@ -288,13 +296,25 @@ static int check_header_mismatch(FLACParseContext  *fpc,
         /* Changing blocking strategy not allowed per the spec */
         deduction += FLAC_HEADER_BASE_SCORE;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
-                   "blocking strategy change detected in adjacent frames\n");
+               "blocking strategy change detected in adjacent frames\n");
     }
     if (child_fi->channels != header_fi->channels) {
         deduction += FLAC_HEADER_CHANGED_PENALTY;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
-                   "number of channels change detected in adjacent frames\n");
+               "number of channels change detected in adjacent frames\n");
     }
+    return deduction;
+}
+
+static int check_header_mismatch(FLACParseContext  *fpc,
+                                 FLACHeaderMarker  *header,
+                                 FLACHeaderMarker  *child,
+                                 int                log_level_offset)
+{
+    FLACFrameInfo  *header_fi = &header->fi, *child_fi = &child->fi;
+    int deduction, deduction_expected = 0, i;
+    deduction = check_header_fi_mismatch(fpc, header_fi, child_fi,
+                                         log_level_offset);
     /* Check sample and frame numbers. */
     if ((child_fi->frame_or_sample_num - header_fi->frame_or_sample_num
          != header_fi->blocksize) &&
@@ -399,11 +419,18 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
     FLACHeaderMarker *child;
     int dist = 0;
     int child_score;
-
+    int base_score = FLAC_HEADER_BASE_SCORE;
     if (header->max_score != FLAC_HEADER_NOT_SCORED_YET)
         return header->max_score;
 
-    header->max_score = FLAC_HEADER_BASE_SCORE;
+    /* Modify the base score with changes from the last output header */
+    if (fpc->last_fi_valid) {
+        /* Silence the log since this will be repeated if selected */
+        base_score -= check_header_fi_mismatch(fpc, &fpc->last_fi, &header->fi,
+                                               AV_LOG_DEBUG);
+    }
+
+    header->max_score = base_score;
 
     /* Check and compute the children's scores. */
     child = header->next;
@@ -419,7 +446,7 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
         if (FLAC_HEADER_BASE_SCORE + child_score > header->max_score) {
             /* Keep the child because the frame scoring is dynamic. */
             header->best_child = child;
-            header->max_score  = FLAC_HEADER_BASE_SCORE + child_score;
+            header->max_score  = base_score + child_score;
         }
         child = child->next;
     }
@@ -430,7 +457,7 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
 static void score_sequences(FLACParseContext *fpc)
 {
     FLACHeaderMarker *curr;
-    int best_score = FLAC_HEADER_NOT_SCORED_YET;
+    int best_score = 0;//FLAC_HEADER_NOT_SCORED_YET;
     /* First pass to clear all old scores. */
     for (curr = fpc->headers; curr; curr = curr->next)
         curr->max_score = FLAC_HEADER_NOT_SCORED_YET;
@@ -469,7 +496,18 @@ static int get_best_header(FLACParseContext* fpc, const uint8_t **poutbuf,
                                         &fpc->wrap_buf,
                                         &fpc->wrap_buf_allocated_size);
 
+
+    if (fpc->pc->flags & PARSER_FLAG_USE_CODEC_TS){
+        if (header->fi.is_var_size)
+          fpc->pc->pts = header->fi.frame_or_sample_num;
+        else if (header->best_child)
+          fpc->pc->pts = header->fi.frame_or_sample_num * header->fi.blocksize;
+    }
+
     fpc->best_header_valid = 0;
+    fpc->last_fi_valid = 1;
+    fpc->last_fi = header->fi;
+
     /* Return the negative overread index so the client can compute pos.
        This should be the amount overread to the beginning of the child */
     if (child)
@@ -489,8 +527,16 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         FLACFrameInfo fi;
-        if (frame_header_is_valid(avctx, buf, &fi))
+        if (frame_header_is_valid(avctx, buf, &fi)) {
             s->duration = fi.blocksize;
+            if (!avctx->sample_rate)
+                avctx->sample_rate = fi.samplerate;
+            if (fpc->pc->flags & PARSER_FLAG_USE_CODEC_TS){
+                fpc->pc->pts = fi.frame_or_sample_num;
+                if (!fi.is_var_size)
+                  fpc->pc->pts *= fi.blocksize;
+            }
+        }
         *poutbuf      = buf;
         *poutbuf_size = buf_size;
         return buf_size;
@@ -540,20 +586,26 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
             temp = curr->next;
             av_freep(&curr->link_penalty);
             av_free(curr);
+            fpc->nb_headers_buffered--;
         }
         fpc->headers = fpc->best_header->next;
         av_freep(&fpc->best_header->link_penalty);
         av_freep(&fpc->best_header);
+        fpc->nb_headers_buffered--;
     }
 
-    /* Find and score new headers. */
-    while ((buf && read_end < buf + buf_size &&
+    /* Find and score new headers.                                     */
+    /* buf_size is to zero when padding, so check for this since we do */
+    /* not want to try to read more input once we have found the end.  */
+    /* Note that as (non-modified) parameters, buf can be non-NULL,    */
+    /* while buf_size is 0.                                            */
+    while ((buf && buf_size && read_end < buf + buf_size &&
             fpc->nb_headers_buffered < FLAC_MIN_HEADERS)
-           || (!buf && !fpc->end_padded)) {
+           || ((!buf || !buf_size) && !fpc->end_padded)) {
         int start_offset;
 
         /* Pad the end once if EOF, to check the final region for headers. */
-        if (!buf) {
+        if (!buf || !buf_size) {
             fpc->end_padded      = 1;
             buf_size = MAX_FRAME_HEADER_SIZE;
             read_end = read_start + MAX_FRAME_HEADER_SIZE;
@@ -567,23 +619,23 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
         if (!av_fifo_space(fpc->fifo_buf) &&
             av_fifo_size(fpc->fifo_buf) / FLAC_AVG_FRAME_SIZE >
-            fpc->nb_headers_buffered * 10) {
-            /* There is less than one valid flac header buffered for 10 headers
+            fpc->nb_headers_buffered * 20) {
+            /* There is less than one valid flac header buffered for 20 headers
              * buffered. Therefore the fifo is most likely filled with invalid
              * data and the input is not a flac file. */
             goto handle_error;
         }
 
         /* Fill the buffer. */
-        if (av_fifo_realloc2(fpc->fifo_buf,
-                             (read_end - read_start) + av_fifo_size(fpc->fifo_buf)) < 0) {
+        if (   av_fifo_space(fpc->fifo_buf) < read_end - read_start
+            && av_fifo_realloc2(fpc->fifo_buf, (read_end - read_start) + 2*av_fifo_size(fpc->fifo_buf)) < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "couldn't reallocate buffer of size %td\n",
+                   "couldn't reallocate buffer of size %"PTRDIFF_SPECIFIER"\n",
                    (read_end - read_start) + av_fifo_size(fpc->fifo_buf));
             goto handle_error;
         }
 
-        if (buf) {
+        if (buf && buf_size) {
             av_fifo_generic_write(fpc->fifo_buf, (void*) read_start,
                                   read_end - read_start, NULL);
         } else {
@@ -620,10 +672,11 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
         /* restore the state pre-padding */
         if (fpc->end_padded) {
+            int warp = fpc->fifo_buf->wptr - fpc->fifo_buf->buffer < MAX_FRAME_HEADER_SIZE;
             /* HACK: drain the tail of the fifo */
             fpc->fifo_buf->wptr -= MAX_FRAME_HEADER_SIZE;
             fpc->fifo_buf->wndx -= MAX_FRAME_HEADER_SIZE;
-            if (fpc->fifo_buf->wptr < 0) {
+            if (warp) {
                 fpc->fifo_buf->wptr += fpc->fifo_buf->end -
                     fpc->fifo_buf->buffer;
             }
@@ -632,10 +685,17 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         }
     }
 
-    curr = fpc->headers;
-    for (curr = fpc->headers; curr; curr = curr->next)
-        if (!fpc->best_header || curr->max_score > fpc->best_header->max_score)
+    for (curr = fpc->headers; curr; curr = curr->next) {
+        if (!fpc->best_header || curr->max_score > fpc->best_header->max_score) {
             fpc->best_header = curr;
+        }
+    }
+
+    if (fpc->best_header && fpc->best_header->max_score <= 0) {
+        // Only accept a bad header if there is no other option to continue
+        if (!buf_size || !buf || read_end != buf || fpc->nb_headers_buffered < FLAC_MIN_HEADERS)
+            fpc->best_header = NULL;
+    }
 
     if (fpc->best_header) {
         fpc->best_header_valid = 1;
@@ -660,7 +720,7 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 handle_error:
     *poutbuf      = NULL;
     *poutbuf_size = 0;
-    return read_end - buf;
+    return buf_size ? read_end - buf : 0;
 }
 
 static av_cold int flac_parse_init(AVCodecParserContext *c)
@@ -669,7 +729,12 @@ static av_cold int flac_parse_init(AVCodecParserContext *c)
     fpc->pc = c;
     /* There will generally be FLAC_MIN_HEADERS buffered in the fifo before
        it drains.  This is allocated early to avoid slow reallocation. */
-    fpc->fifo_buf = av_fifo_alloc(FLAC_AVG_FRAME_SIZE * (FLAC_MIN_HEADERS + 3));
+    fpc->fifo_buf = av_fifo_alloc_array(FLAC_MIN_HEADERS + 3, FLAC_AVG_FRAME_SIZE);
+    if (!fpc->fifo_buf) {
+        av_log(fpc->avctx, AV_LOG_ERROR,
+                "couldn't allocate fifo_buf\n");
+        return AVERROR(ENOMEM);
+    }
     return 0;
 }
 
@@ -684,8 +749,8 @@ static void flac_parse_close(AVCodecParserContext *c)
         av_free(curr);
         curr = temp;
     }
-    av_fifo_free(fpc->fifo_buf);
-    av_free(fpc->wrap_buf);
+    av_fifo_freep(&fpc->fifo_buf);
+    av_freep(&fpc->wrap_buf);
 }
 
 AVCodecParser ff_flac_parser = {
diff --git a/libavcodec/flacdata.c b/libavcodec/flacdata.c
index 820c3aa..1954f32 100644
--- a/libavcodec/flacdata.c
+++ b/libavcodec/flacdata.c
@@ -2,20 +2,20 @@
  * FLAC data
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@ const int ff_flac_sample_rate_table[16] =
   8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000,
   0, 0, 0, 0 };
 
-const int16_t ff_flac_blocksize_table[16] = {
+const int32_t ff_flac_blocksize_table[16] = {
      0,    192, 576<<0, 576<<1, 576<<2, 576<<3,      0,      0,
 256<<0, 256<<1, 256<<2, 256<<3, 256<<4, 256<<5, 256<<6, 256<<7
 };
diff --git a/libavcodec/flacdata.h b/libavcodec/flacdata.h
index f566377..e2c1e5d 100644
--- a/libavcodec/flacdata.h
+++ b/libavcodec/flacdata.h
@@ -2,20 +2,20 @@
  * FLAC data header
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,6 @@
 
 extern const int ff_flac_sample_rate_table[16];
 
-extern const int16_t ff_flac_blocksize_table[16];
+extern const int32_t ff_flac_blocksize_table[16];
 
 #endif /* AVCODEC_FLACDATA_H */
diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 87ab2e5..c8eb456 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -2,20 +2,20 @@
  * FLAC (Free Lossless Audio Codec) decoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,20 +33,27 @@
 
 #include <limits.h>
 
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "bytestream.h"
 #include "golomb.h"
 #include "flac.h"
 #include "flacdata.h"
 #include "flacdsp.h"
+#include "thread.h"
+#include "unary.h"
+
 
 typedef struct FLACContext {
-    FLACSTREAMINFO
+    AVClass *class;
+    struct FLACStreaminfo flac_stream_info;
 
     AVCodecContext *avctx;                  ///< parent AVCodecContext
-    BitstreamContext bc;                    ///< BitstreamContext initialized to start at the current frame
+    GetBitContext gb;                       ///< GetBitContext initialized to start at the current frame
 
     int blocksize;                          ///< number of samples in the current frame
     int sample_shift;                       ///< shift required to make output samples 16-bit or 32-bit
@@ -56,6 +63,7 @@ typedef struct FLACContext {
     int32_t *decoded[FLAC_MAX_CHANNELS];    ///< decoded samples
     uint8_t *decoded_buffer;
     unsigned int decoded_buffer_size;
+    int buggy_lpc;                          ///< use workaround for old lavc encoded files
 
     FLACDSPContext dsp;
 } FLACContext;
@@ -65,7 +73,7 @@ static int allocate_buffers(FLACContext *s);
 static void flac_set_bps(FLACContext *s)
 {
     enum AVSampleFormat req = s->avctx->request_sample_fmt;
-    int need32 = s->bps > 16;
+    int need32 = s->flac_stream_info.bps > 16;
     int want32 = av_get_bytes_per_sample(req) > 2;
     int planar = av_sample_fmt_is_planar(req);
 
@@ -74,13 +82,13 @@ static void flac_set_bps(FLACContext *s)
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
         else
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
-        s->sample_shift = 32 - s->bps;
+        s->sample_shift = 32 - s->flac_stream_info.bps;
     } else {
         if (planar)
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         else
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
-        s->sample_shift = 16 - s->bps;
+        s->sample_shift = 16 - s->flac_stream_info.bps;
     }
 }
 
@@ -101,12 +109,15 @@ static av_cold int flac_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
 
     /* initialize based on the demuxer-supplied streamdata header */
-    ff_flac_parse_streaminfo(avctx, (FLACStreaminfo *)s, streaminfo);
+    ret = ff_flac_parse_streaminfo(avctx, &s->flac_stream_info, streaminfo);
+    if (ret < 0)
+        return ret;
     ret = allocate_buffers(s);
     if (ret < 0)
         return ret;
     flac_set_bps(s);
-    ff_flacdsp_init(&s->dsp, avctx->sample_fmt, s->bps);
+    ff_flacdsp_init(&s->dsp, avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
     s->got_streaminfo = 1;
 
     return 0;
@@ -124,8 +135,12 @@ static void dump_headers(AVCodecContext *avctx, FLACStreaminfo *s)
 static int allocate_buffers(FLACContext *s)
 {
     int buf_size;
+    int ret;
+
+    av_assert0(s->flac_stream_info.max_blocksize);
 
-    buf_size = av_samples_get_buffer_size(NULL, s->channels, s->max_blocksize,
+    buf_size = av_samples_get_buffer_size(NULL, s->flac_stream_info.channels,
+                                          s->flac_stream_info.max_blocksize,
                                           AV_SAMPLE_FMT_S32P, 0);
     if (buf_size < 0)
         return buf_size;
@@ -134,9 +149,12 @@ static int allocate_buffers(FLACContext *s)
     if (!s->decoded_buffer)
         return AVERROR(ENOMEM);
 
-    return av_samples_fill_arrays((uint8_t **)s->decoded, NULL,
-                                  s->decoded_buffer, s->channels,
-                                  s->max_blocksize, AV_SAMPLE_FMT_S32P, 0);
+    ret = av_samples_fill_arrays((uint8_t **)s->decoded, NULL,
+                                 s->decoded_buffer,
+                                 s->flac_stream_info.channels,
+                                 s->flac_stream_info.max_blocksize,
+                                 AV_SAMPLE_FMT_S32P, 0);
+    return ret < 0 ? ret : 0;
 }
 
 /**
@@ -159,12 +177,15 @@ static int parse_streaminfo(FLACContext *s, const uint8_t *buf, int buf_size)
         metadata_size != FLAC_STREAMINFO_SIZE) {
         return AVERROR_INVALIDDATA;
     }
-    ff_flac_parse_streaminfo(s->avctx, (FLACStreaminfo *)s, &buf[8]);
+    ret = ff_flac_parse_streaminfo(s->avctx, &s->flac_stream_info, &buf[8]);
+    if (ret < 0)
+        return ret;
     ret = allocate_buffers(s);
     if (ret < 0)
         return ret;
     flac_set_bps(s);
-    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, s->bps);
+    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
     s->got_streaminfo = 1;
 
     return 0;
@@ -184,12 +205,12 @@ static int get_metadata_size(const uint8_t *buf, int buf_size)
     buf += 4;
     do {
         if (buf_end - buf < 4)
-            return 0;
+            return AVERROR_INVALIDDATA;
         flac_parse_block_header(buf, &metadata_last, NULL, &metadata_size);
         buf += 4;
         if (buf_end - buf < metadata_size) {
             /* need more data in order to read the complete header */
-            return 0;
+            return AVERROR_INVALIDDATA;
         }
         buf += metadata_size;
     } while (!metadata_last);
@@ -199,13 +220,13 @@ static int get_metadata_size(const uint8_t *buf, int buf_size)
 
 static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
 {
-    BitstreamContext bc = s->bc;
+    GetBitContext gb = s->gb;
     int i, tmp, partition, method_type, rice_order;
     int rice_bits, rice_esc;
     int samples;
 
-    method_type = bitstream_read(&bc, 2);
-    rice_order  = bitstream_read(&bc, 4);
+    method_type = get_bits(&gb, 2);
+    rice_order  = get_bits(&gb, 4);
 
     samples   = s->blocksize >> rice_order;
     rice_bits = 4 + method_type;
@@ -220,6 +241,12 @@ static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
         return AVERROR_INVALIDDATA;
     }
 
+    if (samples << rice_order != s->blocksize) {
+        av_log(s->avctx, AV_LOG_ERROR, "invalid rice order: %i blocksize %i\n",
+               rice_order, s->blocksize);
+        return AVERROR_INVALIDDATA;
+    }
+
     if (pred_order > samples) {
         av_log(s->avctx, AV_LOG_ERROR, "invalid predictor order: %i > %i\n",
                pred_order, samples);
@@ -227,20 +254,27 @@ static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
     }
 
     for (partition = 0; partition < (1 << rice_order); partition++) {
-        tmp = bitstream_read(&bc, rice_bits);
+        tmp = get_bits(&gb, rice_bits);
         if (tmp == rice_esc) {
-            tmp = bitstream_read(&bc, 5);
+            tmp = get_bits(&gb, 5);
             for (; i < samples; i++)
-                *decoded++ = bitstream_read_signed(&bc, tmp);
+                *decoded++ = get_sbits_long(&gb, tmp);
         } else {
+            int real_limit = tmp ? (INT_MAX >> tmp) + 2 : INT_MAX;
             for (; i < samples; i++) {
-                *decoded++ = get_sr_golomb_flac(&bc, tmp, INT_MAX, 0);
+                int v = get_sr_golomb_flac(&gb, tmp, real_limit, 0);
+                if (v == 0x80000000){
+                    av_log(s->avctx, AV_LOG_ERROR, "invalid residual\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                *decoded++ = v;
             }
         }
         i= 0;
     }
 
-    s->bc = bc;
+    s->gb = gb;
 
     return 0;
 }
@@ -249,11 +283,13 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
                                  int pred_order, int bps)
 {
     const int blocksize = s->blocksize;
-    int a, b, c, d, i, ret;
+    unsigned av_uninit(a), av_uninit(b), av_uninit(c), av_uninit(d);
+    int i;
+    int ret;
 
     /* warm up samples */
     for (i = 0; i < pred_order; i++) {
-        decoded[i] = bitstream_read_signed(&s->bc, bps);
+        decoded[i] = get_sbits_long(&s->gb, bps);
     }
 
     if ((ret = decode_residuals(s, decoded, pred_order)) < 0)
@@ -266,7 +302,7 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
     if (pred_order > 2)
         c = b - decoded[pred_order-2] + decoded[pred_order-3];
     if (pred_order > 3)
-        d = c - decoded[pred_order-2] + 2*decoded[pred_order-3] - decoded[pred_order-4];
+        d = c - decoded[pred_order-2] + 2U*decoded[pred_order-3] - decoded[pred_order-4];
 
     switch (pred_order) {
     case 0:
@@ -295,6 +331,33 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
     return 0;
 }
 
+static void lpc_analyze_remodulate(SUINT32 *decoded, const int coeffs[32],
+                                   int order, int qlevel, int len, int bps)
+{
+    int i, j;
+    int ebps = 1 << (bps-1);
+    unsigned sigma = 0;
+
+    for (i = order; i < len; i++)
+        sigma |= decoded[i] + ebps;
+
+    if (sigma < 2*ebps)
+        return;
+
+    for (i = len - 1; i >= order; i--) {
+        int64_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coeffs[j] * (int64_t)(int32_t)decoded[i-order+j];
+        decoded[i] -= p >> qlevel;
+    }
+    for (i = order; i < len; i++, decoded++) {
+        int32_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coeffs[j] * (uint32_t)decoded[j];
+        decoded[j] += p >> qlevel;
+    }
+}
+
 static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
                                int bps)
 {
@@ -304,15 +367,15 @@ static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
 
     /* warm up samples */
     for (i = 0; i < pred_order; i++) {
-        decoded[i] = bitstream_read_signed(&s->bc, bps);
+        decoded[i] = get_sbits_long(&s->gb, bps);
     }
 
-    coeff_prec = bitstream_read(&s->bc, 4) + 1;
+    coeff_prec = get_bits(&s->gb, 4) + 1;
     if (coeff_prec == 16) {
         av_log(s->avctx, AV_LOG_ERROR, "invalid coeff precision\n");
         return AVERROR_INVALIDDATA;
     }
-    qlevel = bitstream_read_signed(&s->bc, 5);
+    qlevel = get_sbits(&s->gb, 5);
     if (qlevel < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "qlevel %d not supported, maybe buggy stream\n",
                qlevel);
@@ -320,13 +383,21 @@ static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
     }
 
     for (i = 0; i < pred_order; i++) {
-        coeffs[pred_order - i - 1] = bitstream_read_signed(&s->bc, coeff_prec);
+        coeffs[pred_order - i - 1] = get_sbits(&s->gb, coeff_prec);
     }
 
     if ((ret = decode_residuals(s, decoded, pred_order)) < 0)
         return ret;
 
-    s->dsp.lpc(decoded, coeffs, pred_order, qlevel, s->blocksize);
+    if (   (    s->buggy_lpc && s->flac_stream_info.bps <= 16)
+        || (   !s->buggy_lpc && bps <= 16
+            && bps + coeff_prec + av_log2(pred_order) <= 32)) {
+        s->dsp.lpc16(decoded, coeffs, pred_order, qlevel, s->blocksize);
+    } else {
+        s->dsp.lpc32(decoded, coeffs, pred_order, qlevel, s->blocksize);
+        if (s->flac_stream_info.bps <= 16)
+            lpc_analyze_remodulate(decoded, coeffs, pred_order, qlevel, s->blocksize, bps);
+    }
 
     return 0;
 }
@@ -335,7 +406,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
 {
     int32_t *decoded = s->decoded[channel];
     int type, wasted = 0;
-    int bps = s->bps;
+    int bps = s->flac_stream_info.bps;
     int i, tmp, ret;
 
     if (channel == 0) {
@@ -346,25 +417,23 @@ static inline int decode_subframe(FLACContext *s, int channel)
             bps++;
     }
 
-    if (bitstream_read_bit(&s->bc)) {
+    if (get_bits1(&s->gb)) {
         av_log(s->avctx, AV_LOG_ERROR, "invalid subframe padding\n");
         return AVERROR_INVALIDDATA;
     }
-    type = bitstream_read(&s->bc, 6);
+    type = get_bits(&s->gb, 6);
 
-    if (bitstream_read_bit(&s->bc)) {
-        int left = bitstream_bits_left(&s->bc);
-        wasted = 1;
-        if ( left < 0 ||
-            (left < bps && !bitstream_peek(&s->bc, left)) ||
-                           !bitstream_peek(&s->bc, bps)) {
+    if (get_bits1(&s->gb)) {
+        int left = get_bits_left(&s->gb);
+        if ( left <= 0 ||
+            (left < bps && !show_bits_long(&s->gb, left)) ||
+                           !show_bits_long(&s->gb, bps)) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Invalid number of wasted bits > available bits (%d) - left=%d\n",
                    bps, left);
             return AVERROR_INVALIDDATA;
         }
-        while (!bitstream_read_bit(&s->bc))
-            wasted++;
+        wasted = 1 + get_unary(&s->gb, 1, get_bits_left(&s->gb));
         bps -= wasted;
     }
     if (bps > 32) {
@@ -374,12 +443,12 @@ static inline int decode_subframe(FLACContext *s, int channel)
 
 //FIXME use av_log2 for types
     if (type == 0) {
-        tmp = bitstream_read_signed(&s->bc, bps);
+        tmp = get_sbits_long(&s->gb, bps);
         for (i = 0; i < s->blocksize; i++)
             decoded[i] = tmp;
     } else if (type == 1) {
         for (i = 0; i < s->blocksize; i++)
-            decoded[i] = bitstream_read_signed(&s->bc, bps);
+            decoded[i] = get_sbits_long(&s->gb, bps);
     } else if ((type >= 8) && (type <= 12)) {
         if ((ret = decode_subframe_fixed(s, decoded, type & ~0x8, bps)) < 0)
             return ret;
@@ -391,10 +460,10 @@ static inline int decode_subframe(FLACContext *s, int channel)
         return AVERROR_INVALIDDATA;
     }
 
-    if (wasted) {
+    if (wasted && wasted < 32) {
         int i;
         for (i = 0; i < s->blocksize; i++)
-            decoded[i] <<= wasted;
+            decoded[i] = (unsigned)decoded[i] << wasted;
     }
 
     return 0;
@@ -403,82 +472,85 @@ static inline int decode_subframe(FLACContext *s, int channel)
 static int decode_frame(FLACContext *s)
 {
     int i, ret;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *gb = &s->gb;
     FLACFrameInfo fi;
 
-    if ((ret = ff_flac_decode_frame_header(s->avctx, bc, &fi, 0)) < 0) {
+    if ((ret = ff_flac_decode_frame_header(s->avctx, gb, &fi, 0)) < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "invalid frame header\n");
         return ret;
     }
 
-    if (s->channels && fi.channels != s->channels && s->got_streaminfo) {
-        s->channels = s->avctx->channels = fi.channels;
+    if (   s->flac_stream_info.channels
+        && fi.channels != s->flac_stream_info.channels
+        && s->got_streaminfo) {
+        s->flac_stream_info.channels = s->avctx->channels = fi.channels;
         ff_flac_set_channel_layout(s->avctx);
         ret = allocate_buffers(s);
         if (ret < 0)
             return ret;
     }
-    s->channels = s->avctx->channels = fi.channels;
+    s->flac_stream_info.channels = s->avctx->channels = fi.channels;
     if (!s->avctx->channel_layout)
         ff_flac_set_channel_layout(s->avctx);
     s->ch_mode = fi.ch_mode;
 
-    if (!s->bps && !fi.bps) {
+    if (!s->flac_stream_info.bps && !fi.bps) {
         av_log(s->avctx, AV_LOG_ERROR, "bps not found in STREAMINFO or frame header\n");
         return AVERROR_INVALIDDATA;
     }
     if (!fi.bps) {
-        fi.bps = s->bps;
-    } else if (s->bps && fi.bps != s->bps) {
+        fi.bps = s->flac_stream_info.bps;
+    } else if (s->flac_stream_info.bps && fi.bps != s->flac_stream_info.bps) {
         av_log(s->avctx, AV_LOG_ERROR, "switching bps mid-stream is not "
                                        "supported\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (!s->bps) {
-        s->bps = s->avctx->bits_per_raw_sample = fi.bps;
+    if (!s->flac_stream_info.bps) {
+        s->flac_stream_info.bps = s->avctx->bits_per_raw_sample = fi.bps;
         flac_set_bps(s);
     }
 
-    if (!s->max_blocksize)
-        s->max_blocksize = FLAC_MAX_BLOCKSIZE;
-    if (fi.blocksize > s->max_blocksize) {
+    if (!s->flac_stream_info.max_blocksize)
+        s->flac_stream_info.max_blocksize = FLAC_MAX_BLOCKSIZE;
+    if (fi.blocksize > s->flac_stream_info.max_blocksize) {
         av_log(s->avctx, AV_LOG_ERROR, "blocksize %d > %d\n", fi.blocksize,
-               s->max_blocksize);
+               s->flac_stream_info.max_blocksize);
         return AVERROR_INVALIDDATA;
     }
     s->blocksize = fi.blocksize;
 
-    if (!s->samplerate && !fi.samplerate) {
+    if (!s->flac_stream_info.samplerate && !fi.samplerate) {
         av_log(s->avctx, AV_LOG_ERROR, "sample rate not found in STREAMINFO"
                                         " or frame header\n");
         return AVERROR_INVALIDDATA;
     }
     if (fi.samplerate == 0)
-        fi.samplerate = s->samplerate;
-    s->samplerate = s->avctx->sample_rate = fi.samplerate;
+        fi.samplerate = s->flac_stream_info.samplerate;
+    s->flac_stream_info.samplerate = s->avctx->sample_rate = fi.samplerate;
 
     if (!s->got_streaminfo) {
         ret = allocate_buffers(s);
         if (ret < 0)
             return ret;
-        ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, s->bps);
         s->got_streaminfo = 1;
-        dump_headers(s->avctx, (FLACStreaminfo *)s);
+        dump_headers(s->avctx, &s->flac_stream_info);
     }
+    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
 
-//    dump_headers(s->avctx, (FLACStreaminfo *)s);
+//    dump_headers(s->avctx, &s->flac_stream_info);
 
     /* subframes */
-    for (i = 0; i < s->channels; i++) {
+    for (i = 0; i < s->flac_stream_info.channels; i++) {
         if ((ret = decode_subframe(s, i)) < 0)
             return ret;
     }
 
-    bitstream_align(bc);
+    align_get_bits(gb);
 
     /* frame footer */
-    bitstream_skip(bc, 16); /* data crc */
+    skip_bits(gb, 16); /* data crc */
 
     return 0;
 }
@@ -487,6 +559,7 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame_ptr, AVPacket *avpkt)
 {
     AVFrame *frame     = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     FLACContext *s = avctx->priv_data;
@@ -495,12 +568,22 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 0;
 
-    if (s->max_framesize == 0) {
-        s->max_framesize =
-            ff_flac_get_max_frame_size(s->max_blocksize ? s->max_blocksize : FLAC_MAX_BLOCKSIZE,
+    if (s->flac_stream_info.max_framesize == 0) {
+        s->flac_stream_info.max_framesize =
+            ff_flac_get_max_frame_size(s->flac_stream_info.max_blocksize ? s->flac_stream_info.max_blocksize : FLAC_MAX_BLOCKSIZE,
                                        FLAC_MAX_CHANNELS, 32);
     }
 
+    if (buf_size > 5 && !memcmp(buf, "\177FLAC", 5)) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skipping flac header packet 1\n");
+        return buf_size;
+    }
+
+    if (buf_size > 0 && (*buf & 0x7F) == FLAC_METADATA_TYPE_VORBIS_COMMENT) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skipping vorbis comment\n");
+        return buf_size;
+    }
+
     /* check that there is at least the smallest decodable amount of data.
        this amount corresponds to the smallest valid FLAC frame possible.
        FF F8 69 02 00 00 9A 00 00 34 46 */
@@ -517,21 +600,29 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     /* decode frame */
-    bitstream_init8(&s->bc, buf, buf_size);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
     if ((ret = decode_frame(s)) < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "decode_frame() failed\n");
         return ret;
     }
-    bytes_read = (bitstream_tell(&s->bc) + 7) / 8;
+    bytes_read = get_bits_count(&s->gb)/8;
+
+    if ((s->avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) &&
+        av_crc(av_crc_get_table(AV_CRC_16_ANSI),
+               0, buf, bytes_read)) {
+        av_log(s->avctx, AV_LOG_ERROR, "CRC error at PTS %"PRId64"\n", avpkt->pts);
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
 
     /* get output buffer */
     frame->nb_samples = s->blocksize;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
-    s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded, s->channels,
+    s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded,
+                                   s->flac_stream_info.channels,
                                    s->blocksize, s->sample_shift);
 
     if (bytes_read > buf_size) {
@@ -548,6 +639,19 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
     return bytes_read;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    FLACContext *s = avctx->priv_data;
+    s->decoded_buffer = NULL;
+    s->decoded_buffer_size = 0;
+    s->avctx = avctx;
+    if (s->flac_stream_info.max_blocksize)
+        return allocate_buffers(s);
+    return 0;
+}
+#endif
+
 static av_cold int flac_decode_close(AVCodecContext *avctx)
 {
     FLACContext *s = avctx->priv_data;
@@ -557,6 +661,18 @@ static av_cold int flac_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = {
+{ "use_buggy_lpc", "emulate old buggy lavc behavior", offsetof(FLACContext, buggy_lpc), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
+{ NULL },
+};
+
+static const AVClass flac_decoder_class = {
+    "FLAC decoder",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_flac_decoder = {
     .name           = "flac",
     .long_name      = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"),
@@ -566,10 +682,12 @@ AVCodec ff_flac_decoder = {
     .init           = flac_decode_init,
     .close          = flac_decode_close,
     .decode         = flac_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16,
                                                       AV_SAMPLE_FMT_S16P,
                                                       AV_SAMPLE_FMT_S32,
                                                       AV_SAMPLE_FMT_S32P,
-                                                      -1 },
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &flac_decoder_class,
 };
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index b916869..bc9a5db 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,8 +49,8 @@ static void flac_lpc_16_c(int32_t *decoded, const int coeffs[32],
     int i, j;
 
     for (i = pred_order; i < len - 1; i += 2, decoded += 2) {
-        int c = coeffs[0];
-        int d = decoded[0];
+        SUINT c = coeffs[0];
+        SUINT d = decoded[0];
         int s0 = 0, s1 = 0;
         for (j = 1; j < pred_order; j++) {
             s0 += c*d;
@@ -59,15 +59,15 @@ static void flac_lpc_16_c(int32_t *decoded, const int coeffs[32],
             c = coeffs[j];
         }
         s0 += c*d;
-        d = decoded[j] += s0 >> qlevel;
+        d = decoded[j] += (SUINT)(s0 >> qlevel);
         s1 += c*d;
-        decoded[j + 1] += s1 >> qlevel;
+        decoded[j + 1] += (SUINT)(s1 >> qlevel);
     }
     if (i < len) {
         int sum = 0;
         for (j = 0; j < pred_order; j++)
-            sum += coeffs[j] * decoded[j];
-        decoded[j] += sum >> qlevel;
+            sum += coeffs[j] * (SUINT)decoded[j];
+        decoded[j] = decoded[j] + (unsigned)(sum >> qlevel);
     }
 }
 
@@ -85,16 +85,13 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
 
 }
 
-av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                              int bps)
 {
-    if (bps > 16) {
-        c->lpc            = flac_lpc_32_c;
-        c->lpc_encode     = flac_lpc_encode_c_32;
-    } else {
-        c->lpc            = flac_lpc_16_c;
-        c->lpc_encode     = flac_lpc_encode_c_16;
-    }
+    c->lpc16        = flac_lpc_16_c;
+    c->lpc32        = flac_lpc_32_c;
+    c->lpc16_encode = flac_lpc_encode_c_16;
+    c->lpc32_encode = flac_lpc_encode_c_32;
 
     switch (fmt) {
     case AV_SAMPLE_FMT_S32:
@@ -127,5 +124,7 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
     }
 
     if (ARCH_ARM)
-        ff_flacdsp_init_arm(c, fmt, bps);
+        ff_flacdsp_init_arm(c, fmt, channels, bps);
+    if (ARCH_X86)
+        ff_flacdsp_init_x86(c, fmt, channels, bps);
 }
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 33184b5..7bb0dd0 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,18 +20,24 @@
 #define AVCODEC_FLACDSP_H
 
 #include <stdint.h>
+#include "libavutil/internal.h"
 #include "libavutil/samplefmt.h"
 
 typedef struct FLACDSPContext {
     void (*decorrelate[4])(uint8_t **out, int32_t **in, int channels,
                            int len, int shift);
-    void (*lpc)(int32_t *samples, const int coeffs[32], int order,
-                int qlevel, int len);
-    void (*lpc_encode)(int32_t *res, const int32_t *smp, int len, int order,
-                       const int32_t *coefs, int shift);
+    void (*lpc16)(int32_t *samples, const int coeffs[32], int order,
+                  int qlevel, int len);
+    void (*lpc32)(int32_t *samples, const int coeffs[32], int order,
+                  int qlevel, int len);
+    void (*lpc16_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                         const int32_t coefs[32], int shift);
+    void (*lpc32_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                         const int32_t coefs[32], int shift);
 } FLACDSPContext;
 
-void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
-void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
+void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
+void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
+void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
 
 #endif /* AVCODEC_FLACDSP_H */
diff --git a/libavcodec/flacdsp_lpc_template.c b/libavcodec/flacdsp_lpc_template.c
index 269e64b..5d532e0 100644
--- a/libavcodec/flacdsp_lpc_template.c
+++ b/libavcodec/flacdsp_lpc_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,3 +139,21 @@ static void FUNC(flac_lpc_encode_c)(int32_t *res, const int32_t *smp, int len,
     }
 #endif
 }
+
+/* Comment for clarity/de-obfuscation.
+ *
+ * for (int i = order; i < len; i++) {
+ *     int32_t p = 0;
+ *     for (int j = 0; j < order; j++) {
+ *         int c = coefs[j];
+ *         int s = smp[(i-1)-j];
+ *         p    += c*s;
+ *     }
+ *     res[i] = smp[i] - (p >> shift);
+ * }
+ *
+ * The CONFIG_SMALL code above simplifies to this, in the case of SAMPLE_SIZE
+ * not being equal to 32 (at the present time that means for 16-bit audio). The
+ * code above does 2 samples per iteration.  Commit bfdd5bc (made all the way
+ * back in 2007) says that way is faster.
+ */
diff --git a/libavcodec/flacdsp_template.c b/libavcodec/flacdsp_template.c
index 0affe22..776c78d 100644
--- a/libavcodec/flacdsp_template.c
+++ b/libavcodec/flacdsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,7 +56,7 @@ static void FUNC(flac_decorrelate_indep_c)(uint8_t **out, int32_t **in,
 
     for (j = 0; j < len; j++)
         for (i = 0; i < channels; i++)
-            S(samples, i, j) = in[i][j] << shift;
+            S(samples, i, j) = (int)((unsigned)in[i][j] << shift);
 }
 
 static void FUNC(flac_decorrelate_ls_c)(uint8_t **out, int32_t **in,
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 631f02f..170c3ca 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -2,23 +2,24 @@
  * FLAC audio encoder
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/crc.h"
 #include "libavutil/intmath.h"
 #include "libavutil/md5.h"
@@ -26,6 +27,7 @@
 
 #include "avcodec.h"
 #include "bswapdsp.h"
+#include "put_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "lpc.h"
@@ -42,6 +44,7 @@
 #define MAX_PARTITION_ORDER 8
 #define MAX_PARTITIONS     (1 << MAX_PARTITION_ORDER)
 #define MAX_LPC_PRECISION  15
+#define MIN_LPC_SHIFT       0
 #define MAX_LPC_SHIFT      15
 
 enum CodingMode {
@@ -61,13 +64,14 @@ typedef struct CompressionOptions {
     int min_partition_order;
     int max_partition_order;
     int ch_mode;
+    int exact_rice_parameters;
+    int multi_dim_quant;
 } CompressionOptions;
 
 typedef struct RiceContext {
     enum CodingMode coding_mode;
     int porder;
     int params[MAX_PARTITIONS];
-    uint32_t udata[FLAC_MAX_BLOCKSIZE];
 } RiceContext;
 
 typedef struct FlacSubframe {
@@ -78,9 +82,13 @@ typedef struct FlacSubframe {
     int order;
     int32_t coefs[MAX_LPC_ORDER];
     int shift;
+
     RiceContext rc;
+    uint32_t rc_udata[FLAC_MAX_BLOCKSIZE];
+    uint64_t rc_sums[32][MAX_PARTITIONS];
+
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+1];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
@@ -157,7 +165,7 @@ static int select_blocksize(int samplerate, int block_time_ms)
     int target;
     int blocksize;
 
-    assert(samplerate > 0);
+    av_assert0(samplerate > 0);
     blocksize = ff_flac_blocksize_table[1];
     target    = (samplerate * block_time_ms) / 1000;
     for (i = 0; i < 16; i++) {
@@ -251,13 +259,16 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
         break;
     }
 
-    if (channels < 1 || channels > FLAC_MAX_CHANNELS)
-        return -1;
+    if (channels < 1 || channels > FLAC_MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "%d channels not supported (max %d)\n",
+               channels, FLAC_MAX_CHANNELS);
+        return AVERROR(EINVAL);
+    }
     s->channels = channels;
 
     /* find samplerate in table */
     if (freq < 1)
-        return -1;
+        return AVERROR(EINVAL);
     for (i = 4; i < 12; i++) {
         if (freq == ff_flac_sample_rate_table[i]) {
             s->samplerate = ff_flac_sample_rate_table[i];
@@ -278,7 +289,8 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
             s->sr_code[0] = 13;
             s->sr_code[1] = freq;
         } else {
-            return -1;
+            av_log(avctx, AV_LOG_ERROR, "%d Hz not supported\n", freq);
+            return AVERROR(EINVAL);
         }
         s->samplerate = freq;
     }
@@ -293,7 +305,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
     if (level > 12) {
         av_log(avctx, AV_LOG_ERROR, "invalid compression level: %d\n",
                s->options.compression_level);
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     s->options.block_time_ms = ((int[]){ 27, 27, 27,105,105,105,105,105,105,105,105,105,105})[level];
@@ -341,7 +353,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
                    avctx->min_prediction_order > MAX_LPC_ORDER) {
             av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n",
                    avctx->min_prediction_order);
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->options.min_prediction_order = avctx->min_prediction_order;
     }
@@ -357,7 +369,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
                    avctx->max_prediction_order > MAX_LPC_ORDER) {
             av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n",
                    avctx->max_prediction_order);
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->options.max_prediction_order = avctx->max_prediction_order;
     }
@@ -384,7 +396,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (s->options.max_prediction_order < s->options.min_prediction_order) {
         av_log(avctx, AV_LOG_ERROR, "invalid prediction orders: min=%d max=%d\n",
                s->options.min_prediction_order, s->options.max_prediction_order);
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     if (avctx->frame_size > 0) {
@@ -392,7 +404,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 avctx->frame_size > FLAC_MAX_BLOCKSIZE) {
             av_log(avctx, AV_LOG_ERROR, "invalid block size: %d\n",
                    avctx->frame_size);
-            return -1;
+            return AVERROR(EINVAL);
         }
     } else {
         s->avctx->frame_size = select_blocksize(s->samplerate, s->options.block_time_ms);
@@ -420,11 +432,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->frame_count   = 0;
     s->min_framesize = s->max_framesize;
 
+    if (channels == 3 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER) ||
+        channels == 4 &&
+            avctx->channel_layout != AV_CH_LAYOUT_2_2 &&
+            avctx->channel_layout != AV_CH_LAYOUT_QUAD ||
+        channels == 5 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0_BACK ||
+        channels == 6 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1_BACK) {
+        if (avctx->channel_layout) {
+            av_log(avctx, AV_LOG_ERROR, "Channel layout not supported by Flac, "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n");
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Flac channel layout for "
+                                               "%d channels.\n", channels);
+        }
+    }
+
     ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size,
                       s->options.max_prediction_order, FF_LPC_TYPE_LEVINSON);
 
     ff_bswapdsp_init(&s->bdsp);
-    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt,
+    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt, channels,
                     avctx->bits_per_raw_sample);
 
     dprint_compression_options(s);
@@ -500,7 +534,7 @@ static void copy_samples(FlacEncodeContext *s, const void *samples)
 }
 
 
-static uint64_t rice_count_exact(int32_t *res, int n, int k)
+static uint64_t rice_count_exact(const int32_t *res, int n, int k)
 {
     int i;
     uint64_t count = 0;
@@ -524,6 +558,9 @@ static uint64_t subframe_count_exact(FlacEncodeContext *s, FlacSubframe *sub,
     /* subframe header */
     count += 8;
 
+    if (sub->wasted)
+        count += sub->wasted;
+
     /* subframe */
     if (sub->type == FLAC_SUBFRAME_CONSTANT) {
         count += sub->obits;
@@ -578,24 +615,44 @@ static int find_optimal_param(uint64_t sum, int n, int max_param)
     return FFMIN(k, max_param);
 }
 
+static int find_optimal_param_exact(uint64_t sums[32][MAX_PARTITIONS], int i, int max_param)
+{
+    int bestk = 0;
+    int64_t bestbits = INT64_MAX;
+    int k;
+
+    for (k = 0; k <= max_param; k++) {
+        int64_t bits = sums[k][i];
+        if (bits < bestbits) {
+            bestbits = bits;
+            bestk = k;
+        }
+    }
+
+    return bestk;
+}
 
 static uint64_t calc_optimal_rice_params(RiceContext *rc, int porder,
-                                         uint64_t *sums, int n, int pred_order)
+                                         uint64_t sums[32][MAX_PARTITIONS],
+                                         int n, int pred_order, int max_param, int exact)
 {
     int i;
-    int k, cnt, part, max_param;
+    int k, cnt, part;
     uint64_t all_bits;
 
-    max_param = (1 << rc->coding_mode) - 2;
-
     part     = (1 << porder);
     all_bits = 4 * part;
 
     cnt = (n >> porder) - pred_order;
     for (i = 0; i < part; i++) {
-        k = find_optimal_param(sums[i], cnt, max_param);
+        if (exact) {
+            k = find_optimal_param_exact(sums, i, max_param);
+            all_bits += sums[k][i];
+        } else {
+            k = find_optimal_param(sums[0][i], cnt, max_param);
+            all_bits += rice_encode_count(sums[0][i], cnt, k);
+        }
         rc->params[i] = k;
-        all_bits += rice_encode_count(sums[i], cnt, k);
         cnt = n >> porder;
     }
 
@@ -605,61 +662,80 @@ static uint64_t calc_optimal_rice_params(RiceContext *rc, int porder,
 }
 
 
-static void calc_sums(int pmin, int pmax, uint32_t *data, int n, int pred_order,
-                      uint64_t sums[][MAX_PARTITIONS])
+static void calc_sum_top(int pmax, int kmax, const uint32_t *data, int n, int pred_order,
+                         uint64_t sums[32][MAX_PARTITIONS])
 {
-    int i, j;
+    int i, k;
     int parts;
-    uint32_t *res, *res_end;
+    const uint32_t *res, *res_end;
 
     /* sums for highest level */
     parts   = (1 << pmax);
-    res     = &data[pred_order];
-    res_end = &data[n >> pmax];
-    for (i = 0; i < parts; i++) {
-        uint64_t sum = 0;
-        while (res < res_end)
-            sum += *(res++);
-        sums[pmax][i] = sum;
-        res_end += n >> pmax;
-    }
-    /* sums for lower levels */
-    for (i = pmax - 1; i >= pmin; i--) {
-        parts = (1 << i);
-        for (j = 0; j < parts; j++)
-            sums[i][j] = sums[i+1][2*j] + sums[i+1][2*j+1];
+
+    for (k = 0; k <= kmax; k++) {
+        res     = &data[pred_order];
+        res_end = &data[n >> pmax];
+        for (i = 0; i < parts; i++) {
+            if (kmax) {
+                uint64_t sum = (1LL + k) * (res_end - res);
+                while (res < res_end)
+                    sum += *(res++) >> k;
+                sums[k][i] = sum;
+            } else {
+                uint64_t sum = 0;
+                while (res < res_end)
+                    sum += *(res++);
+                sums[k][i] = sum;
+            }
+            res_end += n >> pmax;
+        }
     }
 }
 
+static void calc_sum_next(int level, uint64_t sums[32][MAX_PARTITIONS], int kmax)
+{
+    int i, k;
+    int parts = (1 << level);
+    for (i = 0; i < parts; i++) {
+        for (k=0; k<=kmax; k++)
+            sums[k][i] = sums[k][2*i] + sums[k][2*i+1];
+    }
+}
 
-static uint64_t calc_rice_params(RiceContext *rc, int pmin, int pmax,
-                                 int32_t *data, int n, int pred_order)
+static uint64_t calc_rice_params(RiceContext *rc,
+                                 uint32_t udata[FLAC_MAX_BLOCKSIZE],
+                                 uint64_t sums[32][MAX_PARTITIONS],
+                                 int pmin, int pmax,
+                                 const int32_t *data, int n, int pred_order, int exact)
 {
     int i;
     uint64_t bits[MAX_PARTITION_ORDER+1];
     int opt_porder;
     RiceContext tmp_rc;
-    uint64_t sums[MAX_PARTITION_ORDER + 1][MAX_PARTITIONS] = { { 0 } };
+    int kmax = (1 << rc->coding_mode) - 2;
 
-    assert(pmin >= 0 && pmin <= MAX_PARTITION_ORDER);
-    assert(pmax >= 0 && pmax <= MAX_PARTITION_ORDER);
-    assert(pmin <= pmax);
+    av_assert1(pmin >= 0 && pmin <= MAX_PARTITION_ORDER);
+    av_assert1(pmax >= 0 && pmax <= MAX_PARTITION_ORDER);
+    av_assert1(pmin <= pmax);
 
     tmp_rc.coding_mode = rc->coding_mode;
 
     for (i = 0; i < n; i++)
-        rc->udata[i] = (2 * data[i]) ^ (data[i] >> 31);
+        udata[i] = (2 * data[i]) ^ (data[i] >> 31);
 
-    calc_sums(pmin, pmax, rc->udata, n, pred_order, sums);
+    calc_sum_top(pmax, exact ? kmax : 0, udata, n, pred_order, sums);
 
     opt_porder = pmin;
     bits[pmin] = UINT32_MAX;
-    for (i = pmin; i <= pmax; i++) {
-        bits[i] = calc_optimal_rice_params(&tmp_rc, i, sums[i], n, pred_order);
-        if (bits[i] <= bits[opt_porder]) {
+    for (i = pmax; ; ) {
+        bits[i] = calc_optimal_rice_params(&tmp_rc, i, sums, n, pred_order, kmax, exact);
+        if (bits[i] < bits[opt_porder] || pmax == pmin) {
             opt_porder = i;
             *rc = tmp_rc;
         }
+        if (i == pmin)
+            break;
+        calc_sum_next(--i, sums, exact ? kmax : 0);
     }
 
     return bits[opt_porder];
@@ -686,8 +762,8 @@ static uint64_t find_subframe_rice_params(FlacEncodeContext *s,
     uint64_t bits = 8 + pred_order * sub->obits + 2 + sub->rc.coding_mode;
     if (sub->type == FLAC_SUBFRAME_LPC)
         bits += 4 + 5 + pred_order * s->options.lpc_coeff_precision;
-    bits += calc_rice_params(&sub->rc, pmin, pmax, sub->residual,
-                             s->frame.blocksize, pred_order);
+    bits += calc_rice_params(&sub->rc, sub->rc_udata, sub->rc_sums, pmin, pmax, sub->residual,
+                             s->frame.blocksize, pred_order, s->options.exact_rice_parameters);
     return bits;
 }
 
@@ -809,7 +885,7 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
     opt_order = ff_lpc_calc_coefs(&s->lpc_ctx, smp, n, min_order, max_order,
                                   s->options.lpc_coeff_precision, coefs, shift, s->options.lpc_type,
                                   s->options.lpc_passes, omethod,
-                                  MAX_LPC_SHIFT, 0);
+                                  MIN_LPC_SHIFT, MAX_LPC_SHIFT, 0);
 
     if (omethod == ORDER_METHOD_2LEVEL ||
         omethod == ORDER_METHOD_4LEVEL ||
@@ -826,8 +902,13 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             order = av_clip(order, min_order - 1, max_order - 1);
             if (order == last_order)
                 continue;
-            s->flac_dsp.lpc_encode(res, smp, n, order+1, coefs[order],
-                                   shift[order]);
+            if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(order) <= 32) {
+                s->flac_dsp.lpc16_encode(res, smp, n, order+1, coefs[order],
+                                         shift[order]);
+            } else {
+                s->flac_dsp.lpc32_encode(res, smp, n, order+1, coefs[order],
+                                         shift[order]);
+            }
             bits[i] = find_subframe_rice_params(s, sub, order+1);
             if (bits[i] < bits[opt_index]) {
                 opt_index = i;
@@ -841,7 +922,11 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order = 0;
         bits[0]   = UINT32_MAX;
         for (i = min_order-1; i < max_order; i++) {
-            s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(i) <= 32) {
+                s->flac_dsp.lpc16_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            } else {
+                s->flac_dsp.lpc32_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            }
             bits[i] = find_subframe_rice_params(s, sub, i+1);
             if (bits[i] < bits[opt_order])
                 opt_order = i;
@@ -859,7 +944,11 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             for (i = last-step; i <= last+step; i += step) {
                 if (i < min_order-1 || i >= max_order || bits[i] < UINT32_MAX)
                     continue;
-                s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(i) <= 32) {
+                    s->flac_dsp.lpc32_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                } else {
+                    s->flac_dsp.lpc16_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                }
                 bits[i] = find_subframe_rice_params(s, sub, i+1);
                 if (bits[i] < bits[opt_order])
                     opt_order = i;
@@ -868,13 +957,60 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order++;
     }
 
+    if (s->options.multi_dim_quant) {
+        int allsteps = 1;
+        int i, step, improved;
+        int64_t best_score = INT64_MAX;
+        int32_t qmax;
+
+        qmax = (1 << (s->options.lpc_coeff_precision - 1)) - 1;
+
+        for (i=0; i<opt_order; i++)
+            allsteps *= 3;
+
+        do {
+            improved = 0;
+            for (step = 0; step < allsteps; step++) {
+                int tmp = step;
+                int32_t lpc_try[MAX_LPC_ORDER];
+                int64_t score = 0;
+                int diffsum = 0;
+
+                for (i=0; i<opt_order; i++) {
+                    int diff = ((tmp + 1) % 3) - 1;
+                    lpc_try[i] = av_clip(coefs[opt_order - 1][i] + diff, -qmax, qmax);
+                    tmp /= 3;
+                    diffsum += !!diff;
+                }
+                if (diffsum >8)
+                    continue;
+
+                if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(opt_order - 1) <= 32) {
+                    s->flac_dsp.lpc16_encode(res, smp, n, opt_order, lpc_try, shift[opt_order-1]);
+                } else {
+                    s->flac_dsp.lpc32_encode(res, smp, n, opt_order, lpc_try, shift[opt_order-1]);
+                }
+                score = find_subframe_rice_params(s, sub, opt_order);
+                if (score < best_score) {
+                    best_score = score;
+                    memcpy(coefs[opt_order-1], lpc_try, sizeof(*coefs));
+                    improved=1;
+                }
+            }
+        } while(improved);
+    }
+
     sub->order     = opt_order;
     sub->type_code = sub->type | (sub->order-1);
     sub->shift     = shift[sub->order-1];
     for (i = 0; i < sub->order; i++)
         sub->coefs[i] = coefs[sub->order-1][i];
 
-    s->flac_dsp.lpc_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(opt_order) <= 32) {
+        s->flac_dsp.lpc16_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    } else {
+        s->flac_dsp.lpc32_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    }
 
     find_subframe_rice_params(s, sub, sub->order);
 
@@ -909,7 +1045,7 @@ static int count_frame_header(FlacEncodeContext *s)
         count += 16;
 
     /* explicit sample rate */
-    count += ((s->sr_code[0] == 12) + (s->sr_code[0] > 12)) * 8;
+    count += ((s->sr_code[0] == 12) + (s->sr_code[0] > 12) * 2) * 8;
 
     /* frame header CRC-8 */
     count += 8;
@@ -953,7 +1089,7 @@ static void remove_wasted_bits(FlacEncodeContext *s)
         }
 
         if (v && !(v & 1)) {
-            v = av_ctz(v);
+            v = ff_ctz(v);
 
             for (i = 0; i < s->frame.blocksize; i++)
                 sub->samples[i] >>= v;
@@ -970,7 +1106,7 @@ static void remove_wasted_bits(FlacEncodeContext *s)
 }
 
 
-static int estimate_stereo_mode(int32_t *left_ch, int32_t *right_ch, int n,
+static int estimate_stereo_mode(const int32_t *left_ch, const int32_t *right_ch, int n,
                                 int max_rice_param)
 {
     int i, best;
@@ -1210,9 +1346,7 @@ static int update_md5_sum(FlacEncodeContext *s, const void *samples)
 
         for (i = 0; i < s->frame.blocksize * s->channels; i++) {
             int32_t v = samples0[i] >> 8;
-            *tmp++    = (v      ) & 0xFF;
-            *tmp++    = (v >>  8) & 0xFF;
-            *tmp++    = (v >> 16) & 0xFF;
+            AV_WL24(tmp + 3*i, v);
         }
         buf = s->md5_buffer;
     }
@@ -1286,10 +1420,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 
-    if ((ret = ff_alloc_packet(avpkt, frame_bytes))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, frame_bytes, 0)) < 0)
         return ret;
-    }
 
     out_bytes = write_frame(s, avpkt);
 
@@ -1336,7 +1468,7 @@ static const AVOption options[] = {
 { "fixed",    NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_FIXED },    INT_MIN, INT_MAX, FLAGS, "lpc_type" },
 { "levinson", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
 { "cholesky", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, FLAGS },
+{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  AV_OPT_TYPE_INT, {.i64 = 2 }, 1, INT_MAX, FLAGS },
 { "min_partition_order",  NULL, offsetof(FlacEncodeContext, options.min_partition_order),  AV_OPT_TYPE_INT, {.i64 = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
 { "max_partition_order",  NULL, offsetof(FlacEncodeContext, options.max_partition_order),  AV_OPT_TYPE_INT, {.i64 = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
 { "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), AV_OPT_TYPE_INT, {.i64 = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" },
@@ -1352,6 +1484,8 @@ static const AVOption options[] = {
 { "left_side",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_LEFT_SIDE   }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
 { "right_side", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_RIGHT_SIDE  }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
 { "mid_side",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_MID_SIDE    }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
+{ "exact_rice_parameters", "Calculate rice parameters exactly", offsetof(FlacEncodeContext, options.exact_rice_parameters), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+{ "multi_dim_quant",       "Multi-dimensional quantization",    offsetof(FlacEncodeContext, options.multi_dim_quant),       AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
 { "min_prediction_order", NULL, offsetof(FlacEncodeContext, options.min_prediction_order), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, MAX_LPC_ORDER, FLAGS },
 { "max_prediction_order", NULL, offsetof(FlacEncodeContext, options.max_prediction_order), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, MAX_LPC_ORDER, FLAGS },
 
@@ -1374,7 +1508,7 @@ AVCodec ff_flac_encoder = {
     .init           = flac_encode_init,
     .encode2        = flac_encode_frame,
     .close          = flac_encode_close,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_LOSSLESS,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_S32,
                                                      AV_SAMPLE_FMT_NONE },
diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c
index 252caab..1dc3c71 100644
--- a/libavcodec/flashsv.c
+++ b/libavcodec/flashsv.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Alex Beregszaszi
  * Copyright (C) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,10 +38,9 @@
 #include <zlib.h>
 
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 typedef struct BlockInfo {
@@ -70,7 +69,7 @@ typedef struct FlashSVContext {
     int             diff_start, diff_height;
 } FlashSVContext;
 
-static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
+static int decode_hybrid(const uint8_t *sptr, const uint8_t *sptr_end, uint8_t *dptr, int dx, int dy,
                          int h, int w, int stride, const uint32_t *pal)
 {
     int x, y;
@@ -79,6 +78,8 @@ static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
     for (y = dx + h; y > dx; y--) {
         uint8_t *dst = dptr + (y * stride) + dy * 3;
         for (x = 0; x < w; x++) {
+            if (sptr >= sptr_end)
+                return AVERROR_INVALIDDATA;
             if (*sptr & 0x80) {
                 /* 15-bit color */
                 unsigned c = AV_RB16(sptr) & ~0x8000;
@@ -108,7 +109,7 @@ static av_cold int flashsv_decode_end(AVCodecContext *avctx)
     av_frame_free(&s->frame);
 
     /* free the tmpblock */
-    av_free(s->tmpblock);
+    av_freep(&s->tmpblock);
 
     return 0;
 }
@@ -143,6 +144,9 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
     z_stream zs;
     int zret; // Zlib return code
 
+    if (!src)
+        return AVERROR_INVALIDDATA;
+
     zs.zalloc = NULL;
     zs.zfree  = NULL;
     zs.opaque = NULL;
@@ -153,7 +157,8 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
     s->zstream.avail_out = s->block_size * 3;
     inflate(&s->zstream, Z_SYNC_FLUSH);
 
-    deflateInit(&zs, 0);
+    if (deflateInit(&zs, 0) != Z_OK)
+        return -1;
     zs.next_in   = s->tmpblock;
     zs.avail_in  = s->block_size * 3 - s->zstream.avail_out;
     zs.next_out  = s->deflate_block;
@@ -176,7 +181,7 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
 }
 
 static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
-                                BitstreamContext *bc, int block_size,
+                                GetBitContext *gb, int block_size,
                                 int width, int height, int x_pos, int y_pos,
                                 int blk_idx)
 {
@@ -195,7 +200,7 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
         if (ret < 0)
             return ret;
     }
-    s->zstream.next_in   = avpkt->data + bitstream_tell(bc) / 8;
+    s->zstream.next_in   = avpkt->data + get_bits_count(gb) / 8;
     s->zstream.avail_in  = block_size;
     s->zstream.next_out  = s->tmpblock;
     s->zstream.avail_out = s->block_size * 3;
@@ -211,7 +216,7 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
     }
 
     if (s->is_keyframe) {
-        s->blocks[blk_idx].pos  = s->keyframedata + (bitstream_tell(bc) / 8);
+        s->blocks[blk_idx].pos  = s->keyframedata + (get_bits_count(gb) / 8);
         s->blocks[blk_idx].size = block_size;
     }
 
@@ -229,12 +234,17 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
         }
     } else {
         /* hybrid 15-bit/palette mode */
-        decode_hybrid(s->tmpblock, s->frame->data[0],
+        ret = decode_hybrid(s->tmpblock, s->zstream.next_out,
+                      s->frame->data[0],
                       s->image_height - (y_pos + 1 + s->diff_height),
                       x_pos, s->diff_height, width,
                       s->frame->linesize[0], s->pal);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "decode_hybrid failed\n");
+            return ret;
+        }
     }
-    bitstream_skip(bc, 8 * block_size); /* skip the consumed bits */
+    skip_bits_long(gb, 8 * block_size); /* skip the consumed bits */
     return 0;
 }
 
@@ -260,7 +270,9 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size = avpkt->size;
     FlashSVContext *s = avctx->priv_data;
     int h_blocks, v_blocks, h_part, v_part, i, j, ret;
-    BitstreamContext bc;
+    GetBitContext gb;
+    int last_blockwidth = s->block_width;
+    int last_blockheight= s->block_height;
 
     /* no supplementary picture */
     if (buf_size == 0)
@@ -268,21 +280,26 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     if (buf_size < 4)
         return -1;
 
-    bitstream_init8(&bc, avpkt->data, buf_size);
+    if ((ret = init_get_bits8(&gb, avpkt->data, buf_size)) < 0)
+        return ret;
 
     /* start to parse the bitstream */
-    s->block_width  = 16 * (bitstream_read(&bc, 4) + 1);
-    s->image_width  = bitstream_read(&bc, 12);
-    s->block_height = 16 * (bitstream_read(&bc, 4) + 1);
-    s->image_height = bitstream_read(&bc, 12);
+    s->block_width  = 16 * (get_bits(&gb, 4) + 1);
+    s->image_width  = get_bits(&gb, 12);
+    s->block_height = 16 * (get_bits(&gb, 4) + 1);
+    s->image_height = get_bits(&gb, 12);
+
+    if (   last_blockwidth != s->block_width
+        || last_blockheight!= s->block_height)
+        av_freep(&s->blocks);
 
     if (s->ver == 2) {
-        bitstream_skip(&bc, 6);
-        if (bitstream_read_bit(&bc)) {
+        skip_bits(&gb, 6);
+        if (get_bits1(&gb)) {
             avpriv_request_sample(avctx, "iframe");
             return AVERROR_PATCHWELCOME;
         }
-        if (bitstream_read_bit(&bc)) {
+        if (get_bits1(&gb)) {
             avpriv_request_sample(avctx, "Custom palette");
             return AVERROR_PATCHWELCOME;
         }
@@ -323,8 +340,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
 
     /* initialize the image size once */
     if (avctx->width == 0 && avctx->height == 0) {
-        avctx->width  = s->image_width;
-        avctx->height = s->image_height;
+        if ((ret = ff_set_dimensions(avctx, s->image_width, s->image_height)) < 0)
+            return ret;
     }
 
     /* check for changes of image width and image height */
@@ -340,24 +357,20 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     s->is_keyframe = (avpkt->flags & AV_PKT_FLAG_KEY) && (s->ver == 2);
     if (s->is_keyframe) {
         int err;
-        int nb_blocks = (v_blocks + !!v_part) *
-                        (h_blocks + !!h_part) * sizeof(s->blocks[0]);
         if ((err = av_reallocp(&s->keyframedata, avpkt->size)) < 0)
             return err;
         memcpy(s->keyframedata, avpkt->data, avpkt->size);
-        if ((err = av_reallocp(&s->blocks, nb_blocks)) < 0)
-            return err;
-        memset(s->blocks, 0, nb_blocks);
     }
+    if(s->ver == 2 && !s->blocks)
+        s->blocks = av_mallocz((v_blocks + !!v_part) * (h_blocks + !!h_part) *
+                               sizeof(s->blocks[0]));
 
     ff_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n",
             s->image_width, s->image_height, s->block_width, s->block_height,
             h_blocks, v_blocks, h_part, v_part);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     /* loop over all block columns */
     for (j = 0; j < v_blocks + (v_part ? 1 : 0); j++) {
@@ -372,7 +385,7 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
             int has_diff = 0;
 
             /* get the size of the compressed zlib chunk */
-            int size = bitstream_read(&bc, 16);
+            int size = get_bits(&gb, 16);
 
             s->color_depth    = 0;
             s->zlibprime_curr = 0;
@@ -380,17 +393,17 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
             s->diff_start     = 0;
             s->diff_height    = cur_blk_height;
 
-            if (8 * size > bitstream_bits_left(&bc)) {
+            if (8 * size > get_bits_left(&gb)) {
                 av_frame_unref(s->frame);
                 return AVERROR_INVALIDDATA;
             }
 
             if (s->ver == 2 && size) {
-                bitstream_skip(&bc, 3);
-                s->color_depth    = bitstream_read(&bc, 2);
-                has_diff          = bitstream_read_bit(&bc);
-                s->zlibprime_curr = bitstream_read_bit(&bc);
-                s->zlibprime_prev = bitstream_read_bit(&bc);
+                skip_bits(&gb, 3);
+                s->color_depth    = get_bits(&gb, 2);
+                has_diff          = get_bits1(&gb);
+                s->zlibprime_curr = get_bits1(&gb);
+                s->zlibprime_prev = get_bits1(&gb);
 
                 if (s->color_depth != 0 && s->color_depth != 2) {
                     av_log(avctx, AV_LOG_ERROR,
@@ -400,13 +413,17 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
                 }
 
                 if (has_diff) {
+                    if (size < 3) {
+                        av_log(avctx, AV_LOG_ERROR, "size too small for diff\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     if (!s->keyframe) {
                         av_log(avctx, AV_LOG_ERROR,
                                "Inter frame without keyframe\n");
                         return AVERROR_INVALIDDATA;
                     }
-                    s->diff_start  = bitstream_read(&bc, 8);
-                    s->diff_height = bitstream_read(&bc, 8);
+                    s->diff_start  = get_bits(&gb, 8);
+                    s->diff_height = get_bits(&gb, 8);
                     if (s->diff_start + s->diff_height > cur_blk_height) {
                         av_log(avctx, AV_LOG_ERROR,
                                "Block parameters invalid: %d + %d > %d\n",
@@ -423,10 +440,14 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
                     av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_prev\n", i, j);
 
                 if (s->zlibprime_curr) {
-                    int col = bitstream_read(&bc, 8);
-                    int row = bitstream_read(&bc, 8);
+                    int col = get_bits(&gb, 8);
+                    int row = get_bits(&gb, 8);
                     av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_curr %dx%d\n",
                            i, j, col, row);
+                    if (size < 3) {
+                        av_log(avctx, AV_LOG_ERROR, "size too small for zlibprime_curr\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     size -= 2;
                     avpriv_request_sample(avctx, "zlibprime_curr");
                     return AVERROR_PATCHWELCOME;
@@ -452,7 +473,7 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
 
             /* skip unchanged blocks, which have size 0 */
             if (size) {
-                if (flashsv_decode_block(avctx, avpkt, &bc, size,
+                if (flashsv_decode_block(avctx, avpkt, &gb, size,
                                          cur_blk_width, cur_blk_height,
                                          x_pos, y_pos,
                                          i + j * (h_blocks + !!h_part)))
@@ -478,9 +499,9 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame = 1;
 
-    if ((bitstream_tell(&bc) / 8) != buf_size)
+    if ((get_bits_count(&gb) / 8) != buf_size)
         av_log(avctx, AV_LOG_ERROR, "buffer not fully consumed (%d != %d)\n",
-               buf_size, (bitstream_tell(&bc) / 8));
+               buf_size, (get_bits_count(&gb) / 8));
 
     /* report that the buffer was completely consumed */
     return buf_size;
@@ -530,7 +551,11 @@ static const uint32_t ff_flashsv2_default_palette[128] = {
 static av_cold int flashsv2_decode_init(AVCodecContext *avctx)
 {
     FlashSVContext *s = avctx->priv_data;
-    flashsv_decode_init(avctx);
+    int ret;
+
+    ret = flashsv_decode_init(avctx);
+    if (ret < 0)
+        return ret;
     s->pal = ff_flashsv2_default_palette;
     s->ver = 2;
 
diff --git a/libavcodec/flashsv2enc.c b/libavcodec/flashsv2enc.c
new file mode 100644
index 0000000..65db112
--- /dev/null
+++ b/libavcodec/flashsv2enc.c
@@ -0,0 +1,922 @@
+/*
+ * Flash Screen Video Version 2 encoder
+ * Copyright (C) 2009 Joshua Warner
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Flash Screen Video Version 2 encoder
+ * @author Joshua Warner
+ */
+
+/* Differences from version 1 stream:
+ * NOTE: Currently, the only player that supports version 2 streams is Adobe Flash Player itself.
+ * * Supports sending only a range of scanlines in a block,
+ *   indicating a difference from the corresponding block in the last keyframe.
+ * * Supports initializing the zlib dictionary with data from the corresponding
+ *   block in the last keyframe, to improve compression.
+ * * Supports a hybrid 15-bit rgb / 7-bit palette color space.
+ */
+
+/* TODO:
+ * Don't keep Block structures for both current frame and keyframe.
+ * Make better heuristics for deciding stream parameters (optimum_* functions).  Currently these return constants.
+ * Figure out how to encode palette information in the stream, choose an optimum palette at each keyframe.
+ * Figure out how the zlibPrimeCompressCurrent flag works, implement support.
+ * Find other sample files (that weren't generated here), develop a decoder.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+
+#include "libavutil/imgutils.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+
+#define HAS_IFRAME_IMAGE 0x02
+#define HAS_PALLET_INFO 0x01
+
+#define COLORSPACE_BGR 0x00
+#define COLORSPACE_15_7 0x10
+#define HAS_DIFF_BLOCKS 0x04
+#define ZLIB_PRIME_COMPRESS_CURRENT 0x02
+#define ZLIB_PRIME_COMPRESS_PREVIOUS 0x01
+
+// Disables experimental "smart" parameter-choosing code, as well as the statistics that it depends on.
+// At the moment, the "smart" code is a great example of how the parameters *shouldn't* be chosen.
+#define FLASHSV2_DUMB
+
+typedef struct Block {
+    uint8_t *enc;
+    uint8_t *sl_begin, *sl_end;
+    int enc_size;
+    uint8_t *data;
+    unsigned long data_size;
+
+    uint8_t start, len;
+    uint8_t dirty;
+    uint8_t col, row, width, height;
+    uint8_t flags;
+} Block;
+
+typedef struct Palette {
+    unsigned colors[128];
+    uint8_t index[1 << 15];
+} Palette;
+
+typedef struct FlashSV2Context {
+    AVCodecContext *avctx;
+    uint8_t *current_frame;
+    uint8_t *key_frame;
+    uint8_t *encbuffer;
+    uint8_t *keybuffer;
+    uint8_t *databuffer;
+
+    uint8_t *blockbuffer;
+    int blockbuffer_size;
+
+    Block *frame_blocks;
+    Block *key_blocks;
+    int frame_size;
+    int blocks_size;
+
+    int use15_7, dist, comp;
+
+    int rows, cols;
+
+    int last_key_frame;
+
+    int image_width, image_height;
+    int block_width, block_height;
+    uint8_t flags;
+    uint8_t use_custom_palette;
+    uint8_t palette_type;       ///< 0=>default, 1=>custom - changed when palette regenerated.
+    Palette palette;
+#ifndef FLASHSV2_DUMB
+    double tot_blocks;          ///< blocks encoded since last keyframe
+    double diff_blocks;         ///< blocks that were different since last keyframe
+    double tot_lines;           ///< total scanlines in image since last keyframe
+    double diff_lines;          ///< scanlines that were different since last keyframe
+    double raw_size;            ///< size of raw frames since last keyframe
+    double comp_size;           ///< size of compressed data since last keyframe
+    double uncomp_size;         ///< size of uncompressed data since last keyframe
+
+    double total_bits;          ///< total bits written to stream so far
+#endif
+} FlashSV2Context;
+
+static av_cold void cleanup(FlashSV2Context * s)
+{
+    av_freep(&s->encbuffer);
+    av_freep(&s->keybuffer);
+    av_freep(&s->databuffer);
+    av_freep(&s->blockbuffer);
+    av_freep(&s->current_frame);
+    av_freep(&s->key_frame);
+
+    av_freep(&s->frame_blocks);
+    av_freep(&s->key_blocks);
+}
+
+static void init_blocks(FlashSV2Context * s, Block * blocks,
+                        uint8_t * encbuf, uint8_t * databuf)
+{
+    int row, col;
+    Block *b;
+    for (col = 0; col < s->cols; col++) {
+        for (row = 0; row < s->rows; row++) {
+            b = blocks + (col + row * s->cols);
+            b->width = (col < s->cols - 1) ?
+                s->block_width :
+                s->image_width - col * s->block_width;
+
+            b->height = (row < s->rows - 1) ?
+                s->block_height :
+                s->image_height - row * s->block_height;
+
+            b->row   = row;
+            b->col   = col;
+            b->enc   = encbuf;
+            b->data  = databuf;
+            encbuf  += b->width * b->height * 3;
+            databuf += !databuf ? 0 : b->width * b->height * 6;
+        }
+    }
+}
+
+static void reset_stats(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    s->diff_blocks = 0.1;
+    s->tot_blocks = 1;
+    s->diff_lines = 0.1;
+    s->tot_lines = 1;
+    s->raw_size = s->comp_size = s->uncomp_size = 10;
+#endif
+}
+
+static av_cold int flashsv2_encode_init(AVCodecContext * avctx)
+{
+    FlashSV2Context *s = avctx->priv_data;
+
+    s->avctx = avctx;
+
+    s->comp = avctx->compression_level;
+    if (s->comp == -1)
+        s->comp = 9;
+    if (s->comp < 0 || s->comp > 9) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Compression level should be 0-9, not %d\n", s->comp);
+        return -1;
+    }
+
+
+    if ((avctx->width > 4095) || (avctx->height > 4095)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too large, input must be max 4095x4095 !\n");
+        return -1;
+    }
+    if ((avctx->width < 16) || (avctx->height < 16)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too small, input must be at least 16x16 !\n");
+        return -1;
+    }
+
+    if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0)
+        return -1;
+
+
+    s->last_key_frame = 0;
+
+    s->image_width  = avctx->width;
+    s->image_height = avctx->height;
+
+    s->block_width  = (s->image_width /  12) & ~15;
+    s->block_height = (s->image_height / 12) & ~15;
+
+    if(!s->block_width)
+        s->block_width = 1;
+    if(!s->block_height)
+        s->block_height = 1;
+
+    s->rows = (s->image_height + s->block_height - 1) / s->block_height;
+    s->cols = (s->image_width +  s->block_width -  1) / s->block_width;
+
+    s->frame_size  = s->image_width * s->image_height * 3;
+    s->blocks_size = s->rows * s->cols * sizeof(Block);
+
+    s->encbuffer     = av_mallocz(s->frame_size);
+    s->keybuffer     = av_mallocz(s->frame_size);
+    s->databuffer    = av_mallocz(s->frame_size * 6);
+    s->current_frame = av_mallocz(s->frame_size);
+    s->key_frame     = av_mallocz(s->frame_size);
+    s->frame_blocks  = av_mallocz(s->blocks_size);
+    s->key_blocks    = av_mallocz(s->blocks_size);
+
+    s->blockbuffer      = NULL;
+    s->blockbuffer_size = 0;
+
+    init_blocks(s, s->frame_blocks, s->encbuffer, s->databuffer);
+    init_blocks(s, s->key_blocks,   s->keybuffer, 0);
+    reset_stats(s);
+#ifndef FLASHSV2_DUMB
+    s->total_bits = 1;
+#endif
+
+    s->use_custom_palette =  0;
+    s->palette_type       = -1;        // so that the palette will be generated in reconfigure_at_keyframe
+
+    if (!s->encbuffer || !s->keybuffer || !s->databuffer
+        || !s->current_frame || !s->key_frame || !s->key_blocks
+        || !s->frame_blocks) {
+        av_log(avctx, AV_LOG_ERROR, "Memory allocation failed.\n");
+        cleanup(s);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int new_key_frame(FlashSV2Context * s)
+{
+    int i;
+    memcpy(s->key_blocks, s->frame_blocks, s->blocks_size);
+    memcpy(s->key_frame, s->current_frame, s->frame_size);
+
+    for (i = 0; i < s->rows * s->cols; i++) {
+        s->key_blocks[i].enc += (s->keybuffer - s->encbuffer);
+        s->key_blocks[i].sl_begin = 0;
+        s->key_blocks[i].sl_end   = 0;
+        s->key_blocks[i].data     = 0;
+    }
+    memcpy(s->keybuffer, s->encbuffer, s->frame_size);
+
+    return 0;
+}
+
+static int write_palette(FlashSV2Context * s, uint8_t * buf, int buf_size)
+{
+    //this isn't implemented yet!  Default palette only!
+    return -1;
+}
+
+static int write_header(FlashSV2Context * s, uint8_t * buf, int buf_size)
+{
+    PutBitContext pb;
+    int buf_pos, len;
+
+    if (buf_size < 5)
+        return -1;
+
+    init_put_bits(&pb, buf, buf_size);
+
+    put_bits(&pb, 4, (s->block_width  >> 4) - 1);
+    put_bits(&pb, 12, s->image_width);
+    put_bits(&pb, 4, (s->block_height >> 4) - 1);
+    put_bits(&pb, 12, s->image_height);
+
+    flush_put_bits(&pb);
+    buf_pos = 4;
+
+    buf[buf_pos++] = s->flags;
+
+    if (s->flags & HAS_PALLET_INFO) {
+        len = write_palette(s, buf + buf_pos, buf_size - buf_pos);
+        if (len < 0)
+            return -1;
+        buf_pos += len;
+    }
+
+    return buf_pos;
+}
+
+static int write_block(Block * b, uint8_t * buf, int buf_size)
+{
+    int buf_pos = 0;
+    unsigned block_size = b->data_size;
+
+    if (b->flags & HAS_DIFF_BLOCKS)
+        block_size += 2;
+    if (b->flags & ZLIB_PRIME_COMPRESS_CURRENT)
+        block_size += 2;
+    if (block_size > 0)
+        block_size += 1;
+    if (buf_size < block_size + 2)
+        return -1;
+
+    buf[buf_pos++] = block_size >> 8;
+    buf[buf_pos++] = block_size;
+
+    if (block_size == 0)
+        return buf_pos;
+
+    buf[buf_pos++] = b->flags;
+
+    if (b->flags & HAS_DIFF_BLOCKS) {
+        buf[buf_pos++] = (b->start);
+        buf[buf_pos++] = (b->len);
+    }
+
+    if (b->flags & ZLIB_PRIME_COMPRESS_CURRENT) {
+        //This feature of the format is poorly understood, and as of now, unused.
+        buf[buf_pos++] = (b->col);
+        buf[buf_pos++] = (b->row);
+    }
+
+    memcpy(buf + buf_pos, b->data, b->data_size);
+
+    buf_pos += b->data_size;
+
+    return buf_pos;
+}
+
+static int encode_zlib(Block * b, uint8_t * buf, unsigned long *buf_size, int comp)
+{
+    int res = compress2(buf, buf_size, b->sl_begin, b->sl_end - b->sl_begin, comp);
+    return res == Z_OK ? 0 : -1;
+}
+
+static int encode_zlibprime(Block * b, Block * prime, uint8_t * buf,
+                            int *buf_size, int comp)
+{
+    z_stream s;
+    int res;
+    s.zalloc = NULL;
+    s.zfree  = NULL;
+    s.opaque = NULL;
+    res = deflateInit(&s, comp);
+    if (res < 0)
+        return -1;
+
+    s.next_in  = prime->enc;
+    s.avail_in = prime->enc_size;
+    while (s.avail_in > 0) {
+        s.next_out  = buf;
+        s.avail_out = *buf_size;
+        res = deflate(&s, Z_SYNC_FLUSH);
+        if (res < 0)
+            return -1;
+    }
+
+    s.next_in   = b->sl_begin;
+    s.avail_in  = b->sl_end - b->sl_begin;
+    s.next_out  = buf;
+    s.avail_out = *buf_size;
+    res = deflate(&s, Z_FINISH);
+    deflateEnd(&s);
+    *buf_size -= s.avail_out;
+    if (res != Z_STREAM_END)
+        return -1;
+    return 0;
+}
+
+static int encode_bgr(Block * b, const uint8_t * src, int stride)
+{
+    int i;
+    uint8_t *ptr = b->enc;
+    for (i = 0; i < b->start; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->sl_begin = ptr + i * b->width * 3;
+    for (; i < b->start + b->len; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->sl_end = ptr + i * b->width * 3;
+    for (; i < b->height; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->enc_size = ptr + i * b->width * 3 - b->enc;
+    return b->enc_size;
+}
+
+static inline unsigned pixel_color15(const uint8_t * src)
+{
+    return (src[0] >> 3) | ((src[1] & 0xf8) << 2) | ((src[2] & 0xf8) << 7);
+}
+
+static inline unsigned int chroma_diff(unsigned int c1, unsigned int c2)
+{
+#define ABSDIFF(a,b) (abs((int)(a)-(int)(b)))
+
+    unsigned int t1 = (c1 & 0x000000ff) + ((c1 & 0x0000ff00) >> 8) + ((c1 & 0x00ff0000) >> 16);
+    unsigned int t2 = (c2 & 0x000000ff) + ((c2 & 0x0000ff00) >> 8) + ((c2 & 0x00ff0000) >> 16);
+
+    return ABSDIFF(t1, t2) + ABSDIFF(c1 & 0x000000ff, c2 & 0x000000ff) +
+        ABSDIFF((c1 & 0x0000ff00) >> 8 , (c2 & 0x0000ff00) >> 8) +
+        ABSDIFF((c1 & 0x00ff0000) >> 16, (c2 & 0x00ff0000) >> 16);
+}
+
+static inline int pixel_color7_fast(Palette * palette, unsigned c15)
+{
+    return palette->index[c15];
+}
+
+static int pixel_color7_slow(Palette * palette, unsigned color)
+{
+    int i, min = 0x7fffffff;
+    int minc = -1;
+    for (i = 0; i < 128; i++) {
+        int c1 = palette->colors[i];
+        int diff = chroma_diff(c1, color);
+        if (diff < min) {
+            min = diff;
+            minc = i;
+        }
+    }
+    return minc;
+}
+
+static inline unsigned pixel_bgr(const uint8_t * src)
+{
+    return (src[0]) | (src[1] << 8) | (src[2] << 16);
+}
+
+static int write_pixel_15_7(Palette * palette, uint8_t * dest, const uint8_t * src,
+                            int dist)
+{
+    unsigned c15 = pixel_color15(src);
+    unsigned color = pixel_bgr(src);
+    int d15 = chroma_diff(color, color & 0x00f8f8f8);
+    int c7 = pixel_color7_fast(palette, c15);
+    int d7 = chroma_diff(color, palette->colors[c7]);
+    if (dist + d15 >= d7) {
+        dest[0] = c7;
+        return 1;
+    } else {
+        dest[0] = 0x80 | (c15 >> 8);
+        dest[1] = c15 & 0xff;
+        return 2;
+    }
+}
+
+static int update_palette_index(Palette * palette)
+{
+    int r, g, b;
+    unsigned int bgr, c15, index;
+    for (r = 4; r < 256; r += 8) {
+        for (g = 4; g < 256; g += 8) {
+            for (b = 4; b < 256; b += 8) {
+                bgr = b | (g << 8) | (r << 16);
+                c15 = (b >> 3) | ((g & 0xf8) << 2) | ((r & 0xf8) << 7);
+                index = pixel_color7_slow(palette, bgr);
+
+                palette->index[c15] = index;
+            }
+        }
+    }
+    return 0;
+}
+
+static const unsigned int default_screen_video_v2_palette[128] = {
+    0x00000000, 0x00333333, 0x00666666, 0x00999999, 0x00CCCCCC, 0x00FFFFFF,
+    0x00330000, 0x00660000, 0x00990000, 0x00CC0000, 0x00FF0000, 0x00003300,
+    0x00006600, 0x00009900, 0x0000CC00, 0x0000FF00, 0x00000033, 0x00000066,
+    0x00000099, 0x000000CC, 0x000000FF, 0x00333300, 0x00666600, 0x00999900,
+    0x00CCCC00, 0x00FFFF00, 0x00003333, 0x00006666, 0x00009999, 0x0000CCCC,
+    0x0000FFFF, 0x00330033, 0x00660066, 0x00990099, 0x00CC00CC, 0x00FF00FF,
+    0x00FFFF33, 0x00FFFF66, 0x00FFFF99, 0x00FFFFCC, 0x00FF33FF, 0x00FF66FF,
+    0x00FF99FF, 0x00FFCCFF, 0x0033FFFF, 0x0066FFFF, 0x0099FFFF, 0x00CCFFFF,
+    0x00CCCC33, 0x00CCCC66, 0x00CCCC99, 0x00CCCCFF, 0x00CC33CC, 0x00CC66CC,
+    0x00CC99CC, 0x00CCFFCC, 0x0033CCCC, 0x0066CCCC, 0x0099CCCC, 0x00FFCCCC,
+    0x00999933, 0x00999966, 0x009999CC, 0x009999FF, 0x00993399, 0x00996699,
+    0x0099CC99, 0x0099FF99, 0x00339999, 0x00669999, 0x00CC9999, 0x00FF9999,
+    0x00666633, 0x00666699, 0x006666CC, 0x006666FF, 0x00663366, 0x00669966,
+    0x0066CC66, 0x0066FF66, 0x00336666, 0x00996666, 0x00CC6666, 0x00FF6666,
+    0x00333366, 0x00333399, 0x003333CC, 0x003333FF, 0x00336633, 0x00339933,
+    0x0033CC33, 0x0033FF33, 0x00663333, 0x00993333, 0x00CC3333, 0x00FF3333,
+    0x00003366, 0x00336600, 0x00660033, 0x00006633, 0x00330066, 0x00663300,
+    0x00336699, 0x00669933, 0x00993366, 0x00339966, 0x00663399, 0x00996633,
+    0x006699CC, 0x0099CC66, 0x00CC6699, 0x0066CC99, 0x009966CC, 0x00CC9966,
+    0x0099CCFF, 0x00CCFF99, 0x00FF99CC, 0x0099FFCC, 0x00CC99FF, 0x00FFCC99,
+    0x00111111, 0x00222222, 0x00444444, 0x00555555, 0x00AAAAAA, 0x00BBBBBB,
+    0x00DDDDDD, 0x00EEEEEE
+};
+
+static int generate_default_palette(Palette * palette)
+{
+    memcpy(palette->colors, default_screen_video_v2_palette,
+           sizeof(default_screen_video_v2_palette));
+
+    return update_palette_index(palette);
+}
+
+static int generate_optimum_palette(Palette * palette, const uint8_t * image,
+                                   int width, int height, int stride)
+{
+    //this isn't implemented yet!  Default palette only!
+    return -1;
+}
+
+static inline int encode_15_7_sl(Palette * palette, uint8_t * dest,
+                                 const uint8_t * src, int width, int dist)
+{
+    int len = 0, x;
+    for (x = 0; x < width; x++) {
+        len += write_pixel_15_7(palette, dest + len, src + 3 * x, dist);
+    }
+    return len;
+}
+
+static int encode_15_7(Palette * palette, Block * b, const uint8_t * src,
+                       int stride, int dist)
+{
+    int i;
+    uint8_t *ptr = b->enc;
+    for (i = 0; i < b->start; i++)
+        ptr += encode_15_7_sl(palette, ptr, src + i * stride, b->width, dist);
+    b->sl_begin = ptr;
+    for (; i < b->start + b->len; i++)
+        ptr += encode_15_7_sl(palette, ptr, src + i * stride, b->width, dist);
+    b->sl_end = ptr;
+    for (; i < b->height; i++)
+        ptr += encode_15_7_sl(palette, ptr, src + i * stride, b->width, dist);
+    b->enc_size = ptr - b->enc;
+    return b->enc_size;
+}
+
+static int encode_block(FlashSV2Context *s, Palette * palette, Block * b,
+                        Block * prev, const uint8_t * src, int stride, int comp,
+                        int dist, int keyframe)
+{
+    unsigned buf_size = b->width * b->height * 6;
+    uint8_t *buf = s->blockbuffer;
+    int res;
+
+    if (b->flags & COLORSPACE_15_7) {
+        encode_15_7(palette, b, src, stride, dist);
+    } else {
+        encode_bgr(b, src, stride);
+    }
+
+    if (b->len > 0) {
+        b->data_size = buf_size;
+        res = encode_zlib(b, b->data, &b->data_size, comp);
+        if (res)
+            return res;
+
+        if (!keyframe) {
+            res = encode_zlibprime(b, prev, buf, &buf_size, comp);
+            if (res)
+                return res;
+
+            if (buf_size < b->data_size) {
+                b->data_size = buf_size;
+                memcpy(b->data, buf, buf_size);
+                b->flags |= ZLIB_PRIME_COMPRESS_PREVIOUS;
+            }
+        }
+    } else {
+        b->data_size = 0;
+    }
+    return 0;
+}
+
+static int compare_sl(FlashSV2Context * s, Block * b, const uint8_t * src,
+                      uint8_t * frame, uint8_t * key, int y, int keyframe)
+{
+    if (memcmp(src, frame, b->width * 3) != 0) {
+        b->dirty = 1;
+        memcpy(frame, src, b->width * 3);
+#ifndef FLASHSV2_DUMB
+        s->diff_lines++;
+#endif
+    }
+    if (memcmp(src, key, b->width * 3) != 0) {
+        if (b->len == 0)
+            b->start = y;
+        b->len = y + 1 - b->start;
+    }
+    return 0;
+}
+
+static int mark_all_blocks(FlashSV2Context * s, const uint8_t * src, int stride,
+                           int keyframe)
+{
+    int sl, rsl, col, pos, possl;
+    Block *b;
+    for (sl = s->image_height - 1; sl >= 0; sl--) {
+        for (col = 0; col < s->cols; col++) {
+            rsl = s->image_height - sl - 1;
+            b = s->frame_blocks + col + rsl / s->block_height * s->cols;
+            possl = stride * sl + col * s->block_width * 3;
+            pos = s->image_width * rsl * 3 + col * s->block_width * 3;
+            compare_sl(s, b, src + possl, s->current_frame + pos,
+                       s->key_frame + pos, rsl % s->block_height, keyframe);
+        }
+    }
+#ifndef FLASHSV2_DUMB
+    s->tot_lines += s->image_height * s->cols;
+#endif
+    return 0;
+}
+
+static int encode_all_blocks(FlashSV2Context * s, int keyframe)
+{
+    int row, col, res;
+    uint8_t *data;
+    Block *b, *prev;
+    for (row = 0; row < s->rows; row++) {
+        for (col = 0; col < s->cols; col++) {
+            b = s->frame_blocks + (row * s->cols + col);
+            prev = s->key_blocks + (row * s->cols + col);
+            b->flags = s->use15_7 ? COLORSPACE_15_7 : 0;
+            if (keyframe) {
+                b->start = 0;
+                b->len = b->height;
+            } else if (!b->dirty) {
+                b->start = 0;
+                b->len = 0;
+                b->data_size = 0;
+                continue;
+            } else if (b->start != 0 || b->len != b->height) {
+                b->flags |= HAS_DIFF_BLOCKS;
+            }
+            data = s->current_frame + s->image_width * 3 * s->block_height * row + s->block_width * col * 3;
+            res = encode_block(s, &s->palette, b, prev, data, s->image_width * 3, s->comp, s->dist, keyframe);
+#ifndef FLASHSV2_DUMB
+            if (b->dirty)
+                s->diff_blocks++;
+            s->comp_size += b->data_size;
+            s->uncomp_size += b->enc_size;
+#endif
+            if (res)
+                return res;
+        }
+    }
+#ifndef FLASHSV2_DUMB
+    s->raw_size += s->image_width * s->image_height * 3;
+    s->tot_blocks += s->rows * s->cols;
+#endif
+    return 0;
+}
+
+static int write_all_blocks(FlashSV2Context * s, uint8_t * buf,
+                            int buf_size)
+{
+    int row, col, buf_pos = 0, len;
+    Block *b;
+    for (row = 0; row < s->rows; row++) {
+        for (col = 0; col < s->cols; col++) {
+            b = s->frame_blocks + row * s->cols + col;
+            len = write_block(b, buf + buf_pos, buf_size - buf_pos);
+            b->start = b->len = b->dirty = 0;
+            if (len < 0)
+                return len;
+            buf_pos += len;
+        }
+    }
+    return buf_pos;
+}
+
+static int write_bitstream(FlashSV2Context * s, const uint8_t * src, int stride,
+                           uint8_t * buf, int buf_size, int keyframe)
+{
+    int buf_pos, res;
+
+    res = mark_all_blocks(s, src, stride, keyframe);
+    if (res)
+        return res;
+    res = encode_all_blocks(s, keyframe);
+    if (res)
+        return res;
+
+    res = write_header(s, buf, buf_size);
+    if (res < 0) {
+        return res;
+    } else {
+        buf_pos = res;
+    }
+    res = write_all_blocks(s, buf + buf_pos, buf_size - buf_pos);
+    if (res < 0)
+        return res;
+    buf_pos += res;
+#ifndef FLASHSV2_DUMB
+    s->total_bits += ((double) buf_pos) * 8.0;
+#endif
+
+    return buf_pos;
+}
+
+static void recommend_keyframe(FlashSV2Context * s, int *keyframe)
+{
+#ifndef FLASHSV2_DUMB
+    double block_ratio, line_ratio, enc_ratio, comp_ratio, data_ratio;
+    if (s->avctx->gop_size > 0) {
+        block_ratio = s->diff_blocks / s->tot_blocks;
+        line_ratio = s->diff_lines / s->tot_lines;
+        enc_ratio = s->uncomp_size / s->raw_size;
+        comp_ratio = s->comp_size / s->uncomp_size;
+        data_ratio = s->comp_size / s->raw_size;
+
+        if ((block_ratio >= 0.5 && line_ratio / block_ratio <= 0.5) || line_ratio >= 0.95) {
+            *keyframe = 1;
+            return;
+        }
+    }
+#else
+    return;
+#endif
+}
+
+#ifndef FLASHSV2_DUMB
+static const double block_size_fraction = 1.0 / 300;
+static const double use15_7_threshold = 8192;
+static const double color15_7_factor = 100;
+#endif
+static int optimum_block_width(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double save = (1-pow(s->diff_lines/s->diff_blocks/s->block_height, 0.5)) * s->comp_size/s->tot_blocks;
+    double width = block_size_fraction * sqrt(0.5 * save * s->rows * s->cols) * s->image_width;
+    int pwidth = ((int) width);
+    return FFCLIP(pwidth & ~15, 256, 16);
+#else
+    return 64;
+#endif
+}
+
+static int optimum_block_height(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double save = (1-pow(s->diff_lines/s->diff_blocks/s->block_height, 0.5)) * s->comp_size/s->tot_blocks;
+    double height = block_size_fraction * sqrt(0.5 * save * s->rows * s->cols) * s->image_height;
+    int pheight = ((int) height);
+    return FFCLIP(pheight & ~15, 256, 16);
+#else
+    return 64;
+#endif
+}
+
+static int optimum_use15_7(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double ideal = ((double)(s->avctx->bit_rate * s->avctx->time_base.den * s->avctx->ticks_per_frame)) /
+        ((double) s->avctx->time_base.num) * s->avctx->frame_number;
+    if (ideal + use15_7_threshold < s->total_bits) {
+        return 1;
+    } else {
+        return 0;
+    }
+#else
+    return s->avctx->global_quality == 0;
+#endif
+}
+
+static int optimum_dist(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double ideal =
+        s->avctx->bit_rate * s->avctx->time_base.den *
+        s->avctx->ticks_per_frame;
+    int dist = pow((s->total_bits / ideal) * color15_7_factor, 3);
+    av_log(s->avctx, AV_LOG_DEBUG, "dist: %d\n", dist);
+    return dist;
+#else
+    return 15;
+#endif
+}
+
+
+static int reconfigure_at_keyframe(FlashSV2Context * s, const uint8_t * image,
+                                   int stride)
+{
+    int update_palette = 0;
+    int res;
+    int block_width  = optimum_block_width (s);
+    int block_height = optimum_block_height(s);
+
+    s->rows = (s->image_height + block_height - 1) / block_height;
+    s->cols = (s->image_width  + block_width  - 1) / block_width;
+
+    if (block_width != s->block_width || block_height != s->block_height) {
+        s->block_width  = block_width;
+        s->block_height = block_height;
+        if (s->rows * s->cols > s->blocks_size / sizeof(Block)) {
+            s->frame_blocks = av_realloc_array(s->frame_blocks, s->rows, s->cols * sizeof(Block));
+            s->key_blocks = av_realloc_array(s->key_blocks, s->cols, s->rows * sizeof(Block));
+            if (!s->frame_blocks || !s->key_blocks) {
+                av_log(s->avctx, AV_LOG_ERROR, "Memory allocation failed.\n");
+                return -1;
+            }
+            s->blocks_size = s->rows * s->cols * sizeof(Block);
+        }
+        init_blocks(s, s->frame_blocks, s->encbuffer, s->databuffer);
+        init_blocks(s, s->key_blocks, s->keybuffer, 0);
+
+        av_fast_malloc(&s->blockbuffer, &s->blockbuffer_size, block_width * block_height * 6);
+        if (!s->blockbuffer) {
+            av_log(s->avctx, AV_LOG_ERROR, "Could not allocate block buffer.\n");
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    s->use15_7 = optimum_use15_7(s);
+    if (s->use15_7) {
+        if ((s->use_custom_palette && s->palette_type != 1) || update_palette) {
+            res = generate_optimum_palette(&s->palette, image, s->image_width, s->image_height, stride);
+            if (res)
+                return res;
+            s->palette_type = 1;
+            av_log(s->avctx, AV_LOG_DEBUG, "Generated optimum palette\n");
+        } else if (!s->use_custom_palette && s->palette_type != 0) {
+            res = generate_default_palette(&s->palette);
+            if (res)
+                return res;
+            s->palette_type = 0;
+            av_log(s->avctx, AV_LOG_DEBUG, "Generated default palette\n");
+        }
+    }
+
+
+    reset_stats(s);
+
+    return 0;
+}
+
+static int flashsv2_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                 const AVFrame *p, int *got_packet)
+{
+    FlashSV2Context *const s = avctx->priv_data;
+    int res;
+    int keyframe = 0;
+
+    if ((res = ff_alloc_packet2(avctx, pkt, s->frame_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return res;
+
+    /* First frame needs to be a keyframe */
+    if (avctx->frame_number == 0)
+        keyframe = 1;
+
+    /* Check the placement of keyframes */
+    if (avctx->gop_size > 0) {
+        if (avctx->frame_number >= s->last_key_frame + avctx->gop_size)
+            keyframe = 1;
+    }
+
+    if (!keyframe
+        && avctx->frame_number > s->last_key_frame + avctx->keyint_min) {
+        recommend_keyframe(s, &keyframe);
+        if (keyframe)
+            av_log(avctx, AV_LOG_DEBUG, "Recommending key frame at frame %d\n", avctx->frame_number);
+    }
+
+    if (keyframe) {
+        res = reconfigure_at_keyframe(s, p->data[0], p->linesize[0]);
+        if (res)
+            return res;
+    }
+
+    if (s->use15_7)
+        s->dist = optimum_dist(s);
+
+    res = write_bitstream(s, p->data[0], p->linesize[0], pkt->data, pkt->size, keyframe);
+
+    if (keyframe) {
+        new_key_frame(s);
+        s->last_key_frame = avctx->frame_number;
+        pkt->flags |= AV_PKT_FLAG_KEY;
+        av_log(avctx, AV_LOG_DEBUG, "Inserting key frame at frame %d\n", avctx->frame_number);
+    }
+
+    pkt->size = res;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int flashsv2_encode_end(AVCodecContext * avctx)
+{
+    FlashSV2Context *s = avctx->priv_data;
+
+    cleanup(s);
+
+    return 0;
+}
+
+AVCodec ff_flashsv2_encoder = {
+    .name           = "flashsv2",
+    .long_name      = NULL_IF_CONFIG_SMALL("Flash Screen Video Version 2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_FLASHSV2,
+    .priv_data_size = sizeof(FlashSV2Context),
+    .init           = flashsv2_encode_init,
+    .encode2        = flashsv2_encode_frame,
+    .close          = flashsv2_encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_BGR24, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/flashsvenc.c b/libavcodec/flashsvenc.c
index 7e14e47..f7f98ef 100644
--- a/libavcodec/flashsvenc.c
+++ b/libavcodec/flashsvenc.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Alex Beregszaszi
  * Copyright (C) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -94,9 +94,9 @@ static av_cold int flashsv_encode_end(AVCodecContext *avctx)
 
     deflateEnd(&s->zstream);
 
-    av_free(s->encbuffer);
-    av_free(s->previous_frame);
-    av_free(s->tmpblock);
+    av_freep(&s->encbuffer);
+    av_freep(&s->previous_frame);
+    av_freep(&s->tmpblock);
 
     return 0;
 }
@@ -109,7 +109,7 @@ static av_cold int flashsv_encode_init(AVCodecContext *avctx)
 
     if (avctx->width > 4095 || avctx->height > 4095) {
         av_log(avctx, AV_LOG_ERROR,
-               "Input dimensions too large, input must be max 4096x4096 !\n");
+               "Input dimensions too large, input must be max 4095x4095 !\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -143,7 +143,7 @@ static int encode_bitstream(FlashSVContext *s, const AVFrame *p, uint8_t *buf,
     int buf_pos, res;
     int pred_blocks = 0;
 
-    init_put_bits(&pb, buf, buf_size * 8);
+    init_put_bits(&pb, buf, buf_size);
 
     put_bits(&pb,  4, block_width / 16 - 1);
     put_bits(&pb, 12, s->image_width);
@@ -238,12 +238,8 @@ static int flashsv_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         I_frame = 1;
     }
 
-    if ((res = ff_alloc_packet(pkt, s->image_width * s->image_height * 3)) < 0) {
-        //Conservative upper bound check for compressed data
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n",
-               s->image_width * s->image_height * 3);
+    if ((res = ff_alloc_packet2(avctx, pkt, s->image_width * s->image_height * 3, 0)) < 0)
         return res;
-    }
 
     pkt->size = encode_bitstream(s, p, pkt->data, pkt->size, opt_w * 16, opt_h * 16,
                                  pfptr, &I_frame);
diff --git a/libavcodec/flicvideo.c b/libavcodec/flicvideo.c
index b4556dc..ba5bda4 100644
--- a/libavcodec/flicvideo.c
+++ b/libavcodec/flicvideo.c
@@ -2,20 +2,20 @@
  * FLI/FLC Animation Video Decoder
  * Copyright (C) 2003, 2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,7 @@
  * variations, visit:
  *   http://www.compuphase.com/flic.htm
  *
- * This decoder outputs PAL8/RGB555/RGB565 and maybe one day RGB24
- * colorspace data, depending on the FLC. To use this decoder, be
+ * This decoder outputs PAL8/RGB555/RGB565/BGR24. To use this decoder, be
  * sure that your demuxer sends the FLI file header to the decoder via
  * the extradata chunk in AVCodecContext. The chunk should be 128 bytes
  * large. The only exception is for FLI files from the game "Magic Carpet",
@@ -64,7 +63,7 @@
 
 #define CHECK_PIXEL_PTR(n) \
     if (pixel_ptr + n > pixel_limit) { \
-        av_log (s->avctx, AV_LOG_INFO, "Problem: pixel_ptr >= pixel_limit (%d >= %d)\n", \
+        av_log (s->avctx, AV_LOG_ERROR, "Invalid pixel_ptr = %d > pixel_limit = %d\n", \
         pixel_ptr + n, pixel_limit); \
         return AVERROR_INVALIDDATA; \
     } \
@@ -84,22 +83,40 @@ static av_cold int flic_decode_init(AVCodecContext *avctx)
     unsigned char *fli_header = (unsigned char *)avctx->extradata;
     int depth;
 
-    if (avctx->extradata_size != 12 &&
-        avctx->extradata_size != 128) {
-        av_log(avctx, AV_LOG_ERROR, "Expected extradata of 12 or 128 bytes\n");
+    if (avctx->extradata_size != 0 &&
+        avctx->extradata_size != 12 &&
+        avctx->extradata_size != 128 &&
+        avctx->extradata_size != 256 &&
+        avctx->extradata_size != 904 &&
+        avctx->extradata_size != 1024) {
+        av_log(avctx, AV_LOG_ERROR, "Unexpected extradata size %d\n", avctx->extradata_size);
         return AVERROR_INVALIDDATA;
     }
 
     s->avctx = avctx;
 
-    s->fli_type = AV_RL16(&fli_header[4]); /* Might be overridden if a Magic Carpet FLC */
-
-    depth = 0;
     if (s->avctx->extradata_size == 12) {
         /* special case for magic carpet FLIs */
         s->fli_type = FLC_MAGIC_CARPET_SYNTHETIC_TYPE_CODE;
         depth = 8;
+    } else if (avctx->extradata_size == 1024) {
+        uint8_t *ptr = avctx->extradata;
+        int i;
+
+        for (i = 0; i < 256; i++) {
+            s->palette[i] = AV_RL32(ptr);
+            ptr += 4;
+        }
+        depth = 8;
+        /* FLI in MOV, see e.g. FFmpeg trac issue #626 */
+    } else if (avctx->extradata_size == 0 ||
+               avctx->extradata_size == 256 ||
+        /* see FFmpeg ticket #1234 */
+               avctx->extradata_size == 904) {
+        s->fli_type = FLI_TYPE_CODE;
+        depth = 8;
     } else {
+        s->fli_type = AV_RL16(&fli_header[4]);
         depth = AV_RL16(&fli_header[12]);
     }
 
@@ -115,9 +132,7 @@ static av_cold int flic_decode_init(AVCodecContext *avctx)
         case 8  : avctx->pix_fmt = AV_PIX_FMT_PAL8; break;
         case 15 : avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
         case 16 : avctx->pix_fmt = AV_PIX_FMT_RGB565; break;
-        case 24 : avctx->pix_fmt = AV_PIX_FMT_BGR24; /* Supposedly BGR, but no files to test with */
-                  avpriv_request_sample(avctx, "24bpp FLC/FLX");
-                  return AVERROR_PATCHWELCOME;
+        case 24 : avctx->pix_fmt = AV_PIX_FMT_BGR24; break;
         default :
                   av_log(avctx, AV_LOG_ERROR, "Unknown FLC/FLX depth of %d Bpp is unsupported.\n",depth);
                   return AVERROR_INVALIDDATA;
@@ -139,7 +154,6 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
     FlicDecodeContext *s = avctx->priv_data;
 
     GetByteContext g2;
-    int stream_ptr_after_color_chunk;
     int pixel_ptr;
     int palette_ptr;
     unsigned char palette_idx1;
@@ -171,30 +185,42 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
     bytestream2_init(&g2, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     pixels = s->frame->data[0];
     pixel_limit = s->avctx->height * s->frame->linesize[0];
+    if (buf_size < 16 || buf_size > INT_MAX - (3 * 256 + AV_INPUT_BUFFER_PADDING_SIZE))
+        return AVERROR_INVALIDDATA;
     frame_size = bytestream2_get_le32(&g2);
+    if (frame_size > buf_size)
+        frame_size = buf_size;
     bytestream2_skip(&g2, 2); /* skip the magic number */
     num_chunks = bytestream2_get_le16(&g2);
     bytestream2_skip(&g2, 8);  /* skip padding */
 
+    if (frame_size < 16)
+        return AVERROR_INVALIDDATA;
+
     frame_size -= 16;
 
     /* iterate through the chunks */
-    while ((frame_size > 0) && (num_chunks > 0)) {
+    while ((frame_size >= 6) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
         chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
         chunk_type = bytestream2_get_le16(&g2);
 
         switch (chunk_type) {
         case FLI_256_COLOR:
         case FLI_COLOR:
-            stream_ptr_after_color_chunk = bytestream2_tell(&g2) + chunk_size - 6;
-
             /* check special case: If this file is from the Magic Carpet
              * game and uses 6-bit colors even though it reports 256-color
              * chunks in a 0xAF12-type file (fli_type is set to 0xAF13 during
@@ -217,6 +243,9 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                 if (color_changes == 0)
                     color_changes = 256;
 
+                if (bytestream2_tell(&g2) + color_changes * 3 > stream_ptr_after_chunk)
+                    break;
+
                 for (j = 0; j < color_changes; j++) {
                     unsigned int entry;
 
@@ -227,30 +256,30 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     r = bytestream2_get_byte(&g2) << color_shift;
                     g = bytestream2_get_byte(&g2) << color_shift;
                     b = bytestream2_get_byte(&g2) << color_shift;
-                    entry = (r << 16) | (g << 8) | b;
+                    entry = 0xFFU << 24 | r << 16 | g << 8 | b;
+                    if (color_shift == 2)
+                        entry |= entry >> 6 & 0x30303;
                     if (s->palette[palette_ptr] != entry)
                         s->new_palette = 1;
                     s->palette[palette_ptr++] = entry;
                 }
             }
-
-            /* color chunks sometimes have weird 16-bit alignment issues;
-             * therefore, take the hardline approach and skip
-             * to the value calculated w.r.t. the size specified by the color
-             * chunk header */
-            if (stream_ptr_after_color_chunk - bytestream2_tell(&g2) > 0)
-                bytestream2_skip(&g2, stream_ptr_after_color_chunk - bytestream2_tell(&g2));
-
             break;
 
         case FLI_DELTA:
             y_ptr = 0;
             compressed_lines = bytestream2_get_le16(&g2);
             while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
+                if (y_ptr > pixel_limit)
+                    return AVERROR_INVALIDDATA;
                 line_packets = bytestream2_get_le16(&g2);
                 if ((line_packets & 0xC000) == 0xC000) {
                     // line skip opcode
                     line_packets = -line_packets;
+                    if (line_packets > s->avctx->height)
+                        return AVERROR_INVALIDDATA;
                     y_ptr += line_packets * s->frame->linesize[0];
                 } else if ((line_packets & 0xC000) == 0x4000) {
                     av_log(avctx, AV_LOG_ERROR, "Undefined opcode (%x) in DELTA_FLI\n", line_packets);
@@ -265,6 +294,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     CHECK_PIXEL_PTR(0);
                     pixel_countdown = s->avctx->width;
                     for (i = 0; i < line_packets; i++) {
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
                         /* account for the skip bytes */
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += pixel_skip;
@@ -281,6 +312,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                             }
                         } else {
                             CHECK_PIXEL_PTR(byte_run * 2);
+                            if (bytestream2_tell(&g2) + byte_run * 2 > stream_ptr_after_chunk)
+                                break;
                             for (j = 0; j < byte_run * 2; j++, pixel_countdown--) {
                                 pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             }
@@ -295,6 +328,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
         case FLI_LC:
             /* line compressed */
             starting_line = bytestream2_get_le16(&g2);
+            if (starting_line >= s->avctx->height)
+                return AVERROR_INVALIDDATA;
             y_ptr = 0;
             y_ptr += starting_line * s->frame->linesize[0];
 
@@ -303,16 +338,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                 pixel_ptr = y_ptr;
                 CHECK_PIXEL_PTR(0);
                 pixel_countdown = s->avctx->width;
+                if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_byte(&g2);
                 if (line_packets > 0) {
                     for (i = 0; i < line_packets; i++) {
                         /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                            break;
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += pixel_skip;
                         pixel_countdown -= pixel_skip;
                         byte_run = sign_extend(bytestream2_get_byte(&g2),8);
                         if (byte_run > 0) {
                             CHECK_PIXEL_PTR(byte_run);
+                            if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                                break;
                             for (j = 0; j < byte_run; j++, pixel_countdown--) {
                                 pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             }
@@ -349,6 +390,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                  bytestream2_skip(&g2, 1);
                 pixel_countdown = s->avctx->width;
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (!byte_run) {
                         av_log(avctx, AV_LOG_ERROR, "Invalid byte run value.\n");
@@ -368,6 +411,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     } else {  /* copy bytes if byte_run < 0 */
                         byte_run = -byte_run;
                         CHECK_PIXEL_PTR(byte_run);
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
                         for (j = 0; j < byte_run; j++) {
                             pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             pixel_countdown--;
@@ -384,22 +429,23 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
         case FLI_COPY:
             /* copy the chunk (uncompressed frame) */
-            if (chunk_size - 6 > s->avctx->width * s->avctx->height) {
+            if (chunk_size - 6 != FFALIGN(s->avctx->width, 4) * s->avctx->height) {
                 av_log(avctx, AV_LOG_ERROR, "In chunk FLI_COPY : source data (%d bytes) " \
-                       "bigger than image, skipping chunk\n", chunk_size - 6);
+                       "has incorrect size, skipping chunk\n", chunk_size - 6);
                 bytestream2_skip(&g2, chunk_size - 6);
             } else {
                 for (y_ptr = 0; y_ptr < s->frame->linesize[0] * s->avctx->height;
                      y_ptr += s->frame->linesize[0]) {
                     bytestream2_get_buffer(&g2, &pixels[y_ptr],
                                            s->avctx->width);
+                    if (s->avctx->width & 3)
+                        bytestream2_skip(&g2, 4 - (s->avctx->width & 3));
                 }
             }
             break;
 
         case FLI_MINI:
             /* some sort of a thumbnail? disregard this chunk... */
-            bytestream2_skip(&g2, chunk_size - 6);
             break;
 
         default:
@@ -407,14 +453,20 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
             break;
         }
 
+        if (stream_ptr_after_chunk - bytestream2_tell(&g2) >= 0) {
+            bytestream2_skip(&g2, stream_ptr_after_chunk - bytestream2_tell(&g2));
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Chunk overread\n");
+            break;
+        }
+
         frame_size -= chunk_size;
         num_chunks--;
     }
 
     /* by the end of the chunk, the stream ptr should equal the frame
-     * size (minus 1, possibly); if it doesn't, issue a warning */
-    if ((bytestream2_get_bytes_left(&g2) != 0) &&
-        (bytestream2_get_bytes_left(&g2) != 1))
+     * size (minus 1 or 2, possibly); if it doesn't, issue a warning */
+    if (bytestream2_get_bytes_left(&g2) > 2)
         av_log(avctx, AV_LOG_ERROR, "Processed FLI chunk where chunk size = %d " \
                "and final chunk ptr = %d\n", buf_size,
                buf_size - bytestream2_get_bytes_left(&g2));
@@ -467,10 +519,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
 
     bytestream2_init(&g2, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     pixels = s->frame->data[0];
     pixel_limit = s->avctx->height * s->frame->linesize[0];
@@ -479,14 +529,28 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
     bytestream2_skip(&g2, 2);  /* skip the magic number */
     num_chunks = bytestream2_get_le16(&g2);
     bytestream2_skip(&g2, 8);  /* skip padding */
+    if (frame_size > buf_size)
+        frame_size = buf_size;
 
+    if (frame_size < 16)
+        return AVERROR_INVALIDDATA;
     frame_size -= 16;
 
     /* iterate through the chunks */
-    while ((frame_size > 0) && (num_chunks > 0)) {
+    while ((frame_size > 0) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
         chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
         chunk_type = bytestream2_get_le16(&g2);
 
+
         switch (chunk_type) {
         case FLI_256_COLOR:
         case FLI_COLOR:
@@ -504,9 +568,15 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
             y_ptr = 0;
             compressed_lines = bytestream2_get_le16(&g2);
             while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
+                if (y_ptr > pixel_limit)
+                    return AVERROR_INVALIDDATA;
                 line_packets = bytestream2_get_le16(&g2);
                 if (line_packets < 0) {
                     line_packets = -line_packets;
+                    if (line_packets > s->avctx->height)
+                        return AVERROR_INVALIDDATA;
                     y_ptr += line_packets * s->frame->linesize[0];
                 } else {
                     compressed_lines--;
@@ -515,6 +585,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                     pixel_countdown = s->avctx->width;
                     for (i = 0; i < line_packets; i++) {
                         /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += (pixel_skip*2); /* Pixel is 2 bytes wide */
                         pixel_countdown -= pixel_skip;
@@ -528,6 +600,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                                 pixel_ptr += 2;
                             }
                         } else {
+                            if (bytestream2_tell(&g2) + 2*byte_run > stream_ptr_after_chunk)
+                                break;
                             CHECK_PIXEL_PTR(2 * byte_run);
                             for (j = 0; j < byte_run; j++, pixel_countdown--) {
                                 *((signed short*)(&pixels[pixel_ptr])) = bytestream2_get_le16(&g2);
@@ -562,6 +636,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                 pixel_countdown = (s->avctx->width * 2);
 
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (byte_run > 0) {
                         palette_idx1 = bytestream2_get_byte(&g2);
@@ -575,6 +651,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                         }
                     } else {  /* copy bytes if byte_run < 0 */
                         byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
                         CHECK_PIXEL_PTR(byte_run);
                         for (j = 0; j < byte_run; j++) {
                             palette_idx1 = bytestream2_get_byte(&g2);
@@ -614,6 +692,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                 pixel_countdown = s->avctx->width; /* Width is in pixels, not bytes */
 
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (byte_run > 0) {
                         pixel    = bytestream2_get_le16(&g2);
@@ -628,6 +708,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                         }
                     } else {  /* copy pixels if byte_run < 0 */
                         byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + 2 * byte_run > stream_ptr_after_chunk)
+                            break;
                         CHECK_PIXEL_PTR(2 * byte_run);
                         for (j = 0; j < byte_run; j++) {
                             *((signed short*)(&pixels[pixel_ptr])) = bytestream2_get_le16(&g2);
@@ -647,7 +729,7 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
         case FLI_COPY:
         case FLI_DTA_COPY:
             /* copy the chunk (uncompressed frame) */
-            if (chunk_size - 6 > (unsigned int)(s->avctx->width * s->avctx->height)*2) {
+            if (chunk_size - 6 > (unsigned int)(FFALIGN(s->avctx->width, 2) * s->avctx->height)*2) {
                 av_log(avctx, AV_LOG_ERROR, "In chunk FLI_COPY : source data (%d bytes) " \
                        "bigger than image, skipping chunk\n", chunk_size - 6);
                 bytestream2_skip(&g2, chunk_size - 6);
@@ -663,6 +745,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                       pixel_ptr += 2;
                       pixel_countdown--;
                     }
+                    if (s->avctx->width & 1)
+                        bytestream2_skip(&g2, 2);
                 }
             }
             break;
@@ -677,6 +761,300 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
             break;
         }
 
+        if (stream_ptr_after_chunk - bytestream2_tell(&g2) >= 0) {
+            bytestream2_skip(&g2, stream_ptr_after_chunk - bytestream2_tell(&g2));
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Chunk overread\n");
+            break;
+        }
+
+        frame_size -= chunk_size;
+        num_chunks--;
+    }
+
+    /* by the end of the chunk, the stream ptr should equal the frame
+     * size (minus 1, possibly); if it doesn't, issue a warning */
+    if ((bytestream2_get_bytes_left(&g2) != 0) && (bytestream2_get_bytes_left(&g2) != 1))
+        av_log(avctx, AV_LOG_ERROR, "Processed FLI chunk where chunk size = %d " \
+               "and final chunk ptr = %d\n", buf_size, bytestream2_tell(&g2));
+
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
+
+    *got_frame = 1;
+
+    return buf_size;
+}
+
+static int flic_decode_frame_24BPP(AVCodecContext *avctx,
+                                   void *data, int *got_frame,
+                                   const uint8_t *buf, int buf_size)
+{
+    FlicDecodeContext *s = avctx->priv_data;
+
+    GetByteContext g2;
+    int pixel_ptr;
+    unsigned char palette_idx1;
+
+    unsigned int frame_size;
+    int num_chunks;
+
+    unsigned int chunk_size;
+    int chunk_type;
+
+    int i, j, ret;
+
+    int lines;
+    int compressed_lines;
+    signed short line_packets;
+    int y_ptr;
+    int byte_run;
+    int pixel_skip;
+    int pixel_countdown;
+    unsigned char *pixels;
+    int pixel;
+    unsigned int pixel_limit;
+
+    bytestream2_init(&g2, buf, buf_size);
+
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+        return ret;
+
+    pixels = s->frame->data[0];
+    pixel_limit = s->avctx->height * s->frame->linesize[0];
+
+    frame_size = bytestream2_get_le32(&g2);
+    bytestream2_skip(&g2, 2);  /* skip the magic number */
+    num_chunks = bytestream2_get_le16(&g2);
+    bytestream2_skip(&g2, 8);  /* skip padding */
+    if (frame_size > buf_size)
+        frame_size = buf_size;
+
+    if (frame_size < 16)
+        return AVERROR_INVALIDDATA;
+    frame_size -= 16;
+
+    /* iterate through the chunks */
+    while ((frame_size > 0) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
+        chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
+        chunk_type = bytestream2_get_le16(&g2);
+
+
+        switch (chunk_type) {
+        case FLI_256_COLOR:
+        case FLI_COLOR:
+            /* For some reason, it seems that non-palettized flics do
+             * include one of these chunks in their first frame.
+             * Why I do not know, it seems rather extraneous. */
+            ff_dlog(avctx,
+                    "Unexpected Palette chunk %d in non-palettized FLC\n",
+                    chunk_type);
+            bytestream2_skip(&g2, chunk_size - 6);
+            break;
+
+        case FLI_DELTA:
+        case FLI_DTA_LC:
+            y_ptr = 0;
+            compressed_lines = bytestream2_get_le16(&g2);
+            while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
+                if (y_ptr > pixel_limit)
+                    return AVERROR_INVALIDDATA;
+                line_packets = bytestream2_get_le16(&g2);
+                if (line_packets < 0) {
+                    line_packets = -line_packets;
+                    if (line_packets > s->avctx->height)
+                        return AVERROR_INVALIDDATA;
+                    y_ptr += line_packets * s->frame->linesize[0];
+                } else {
+                    compressed_lines--;
+                    pixel_ptr = y_ptr;
+                    CHECK_PIXEL_PTR(0);
+                    pixel_countdown = s->avctx->width;
+                    for (i = 0; i < line_packets; i++) {
+                        /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
+                        pixel_skip = bytestream2_get_byte(&g2);
+                        pixel_ptr += (pixel_skip*3); /* Pixel is 3 bytes wide */
+                        pixel_countdown -= pixel_skip;
+                        byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
+                        if (byte_run < 0) {
+                            byte_run = -byte_run;
+                            pixel    = bytestream2_get_le24(&g2);
+                            CHECK_PIXEL_PTR(3 * byte_run);
+                            for (j = 0; j < byte_run; j++, pixel_countdown -= 1) {
+                                AV_WL24(&pixels[pixel_ptr], pixel);
+                                pixel_ptr += 3;
+                            }
+                        } else {
+                            if (bytestream2_tell(&g2) + 2*byte_run > stream_ptr_after_chunk)
+                                break;
+                            CHECK_PIXEL_PTR(2 * byte_run);
+                            for (j = 0; j < byte_run; j++, pixel_countdown--) {
+                                pixel = bytestream2_get_le24(&g2);
+                                AV_WL24(&pixels[pixel_ptr], pixel);
+                                pixel_ptr += 3;
+                            }
+                        }
+                    }
+
+                    y_ptr += s->frame->linesize[0];
+                }
+            }
+            break;
+
+        case FLI_LC:
+            av_log(avctx, AV_LOG_ERROR, "Unexpected FLI_LC chunk in non-palettized FLC\n");
+            bytestream2_skip(&g2, chunk_size - 6);
+            break;
+
+        case FLI_BLACK:
+            /* set the whole frame to 0x00 which is black for 24 bit mode. */
+            memset(pixels, 0x00,
+                   s->frame->linesize[0] * s->avctx->height);
+            break;
+
+        case FLI_BRUN:
+            y_ptr = 0;
+            for (lines = 0; lines < s->avctx->height; lines++) {
+                pixel_ptr = y_ptr;
+                /* disregard the line packets; instead, iterate through all
+                 * pixels on a row */
+                bytestream2_skip(&g2, 1);
+                pixel_countdown = (s->avctx->width * 3);
+
+                while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
+                    byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
+                    if (byte_run > 0) {
+                        palette_idx1 = bytestream2_get_byte(&g2);
+                        CHECK_PIXEL_PTR(byte_run);
+                        for (j = 0; j < byte_run; j++) {
+                            pixels[pixel_ptr++] = palette_idx1;
+                            pixel_countdown--;
+                            if (pixel_countdown < 0)
+                                av_log(avctx, AV_LOG_ERROR, "pixel_countdown < 0 (%d) (linea%d)\n",
+                                       pixel_countdown, lines);
+                        }
+                    } else {  /* copy bytes if byte_run < 0 */
+                        byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
+                        CHECK_PIXEL_PTR(byte_run);
+                        for (j = 0; j < byte_run; j++) {
+                            palette_idx1 = bytestream2_get_byte(&g2);
+                            pixels[pixel_ptr++] = palette_idx1;
+                            pixel_countdown--;
+                            if (pixel_countdown < 0)
+                                av_log(avctx, AV_LOG_ERROR, "pixel_countdown < 0 (%d) at line %d\n",
+                                       pixel_countdown, lines);
+                        }
+                    }
+                }
+
+                y_ptr += s->frame->linesize[0];
+            }
+            break;
+
+        case FLI_DTA_BRUN:
+            y_ptr = 0;
+            for (lines = 0; lines < s->avctx->height; lines++) {
+                pixel_ptr = y_ptr;
+                /* disregard the line packets; instead, iterate through all
+                 * pixels on a row */
+                bytestream2_skip(&g2, 1);
+                pixel_countdown = s->avctx->width; /* Width is in pixels, not bytes */
+
+                while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
+                    byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
+                    if (byte_run > 0) {
+                        pixel = bytestream2_get_le24(&g2);
+                        CHECK_PIXEL_PTR(3 * byte_run);
+                        for (j = 0; j < byte_run; j++) {
+                            AV_WL24(pixels + pixel_ptr, pixel);
+                            pixel_ptr += 3;
+                            pixel_countdown--;
+                            if (pixel_countdown < 0)
+                                av_log(avctx, AV_LOG_ERROR, "pixel_countdown < 0 (%d)\n",
+                                       pixel_countdown);
+                        }
+                    } else {  /* copy pixels if byte_run < 0 */
+                        byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + 3 * byte_run > stream_ptr_after_chunk)
+                            break;
+                        CHECK_PIXEL_PTR(3 * byte_run);
+                        for (j = 0; j < byte_run; j++) {
+                            pixel = bytestream2_get_le24(&g2);
+                            AV_WL24(pixels + pixel_ptr, pixel);
+                            pixel_ptr  += 3;
+                            pixel_countdown--;
+                            if (pixel_countdown < 0)
+                                av_log(avctx, AV_LOG_ERROR, "pixel_countdown < 0 (%d)\n",
+                                       pixel_countdown);
+                        }
+                    }
+                }
+
+                y_ptr += s->frame->linesize[0];
+            }
+            break;
+
+        case FLI_COPY:
+        case FLI_DTA_COPY:
+            /* copy the chunk (uncompressed frame) */
+            if (chunk_size - 6 > (unsigned int)(FFALIGN(s->avctx->width, 2) * s->avctx->height)*3) {
+                av_log(avctx, AV_LOG_ERROR, "In chunk FLI_COPY : source data (%d bytes) " \
+                       "bigger than image, skipping chunk\n", chunk_size - 6);
+                bytestream2_skip(&g2, chunk_size - 6);
+            } else {
+                for (y_ptr = 0; y_ptr < s->frame->linesize[0] * s->avctx->height;
+                     y_ptr += s->frame->linesize[0]) {
+
+                    pixel_countdown = s->avctx->width;
+                    pixel_ptr = 0;
+                    while (pixel_countdown > 0) {
+                        pixel = bytestream2_get_le24(&g2);
+                        AV_WL24(&pixels[y_ptr + pixel_ptr], pixel);
+                        pixel_ptr += 3;
+                        pixel_countdown--;
+                    }
+                    if (s->avctx->width & 1)
+                        bytestream2_skip(&g2, 3);
+                }
+            }
+            break;
+
+        case FLI_MINI:
+            /* some sort of a thumbnail? disregard this chunk... */
+            bytestream2_skip(&g2, chunk_size - 6);
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Unrecognized chunk type: %d\n", chunk_type);
+            break;
+        }
+
+        if (stream_ptr_after_chunk - bytestream2_tell(&g2) >= 0) {
+            bytestream2_skip(&g2, stream_ptr_after_chunk - bytestream2_tell(&g2));
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Chunk overread\n");
+            break;
+        }
+
         frame_size -= chunk_size;
         num_chunks--;
     }
@@ -702,17 +1080,15 @@ static int flic_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
-      return flic_decode_frame_8BPP(avctx, data, got_frame,
-                                    buf, buf_size);
-    }
-    else if ((avctx->pix_fmt == AV_PIX_FMT_RGB555) ||
-             (avctx->pix_fmt == AV_PIX_FMT_RGB565)) {
-      return flic_decode_frame_15_16BPP(avctx, data, got_frame,
-                                        buf, buf_size);
-    }
-    else if (avctx->pix_fmt == AV_PIX_FMT_BGR24) {
-        avpriv_request_sample(avctx, "24bpp FLC");
-        return AVERROR_PATCHWELCOME;
+        return flic_decode_frame_8BPP(avctx, data, got_frame,
+                                      buf, buf_size);
+    } else if ((avctx->pix_fmt == AV_PIX_FMT_RGB555) ||
+               (avctx->pix_fmt == AV_PIX_FMT_RGB565)) {
+        return flic_decode_frame_15_16BPP(avctx, data, got_frame,
+                                          buf, buf_size);
+    } else if (avctx->pix_fmt == AV_PIX_FMT_BGR24) {
+        return flic_decode_frame_24BPP(avctx, data, got_frame,
+                                       buf, buf_size);
     }
 
     /* Should not get  here, ever as the pix_fmt is processed */
diff --git a/libavcodec/flv.h b/libavcodec/flv.h
index 801e357..561cfe0 100644
--- a/libavcodec/flv.h
+++ b/libavcodec/flv.h
@@ -1,20 +1,20 @@
 /*
  * FLV specific private header.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,5 @@ void ff_flv2_encode_ac_esc(PutBitContext *pb, int slevel, int level, int run,
                            int last);
 
 int ff_flv_decode_picture_header(MpegEncContext *s);
-void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last);
 
 #endif /* AVCODEC_FLV_H */
diff --git a/libavcodec/flvdec.c b/libavcodec/flvdec.c
index f2d4929..f9beb40 100644
--- a/libavcodec/flvdec.c
+++ b/libavcodec/flvdec.c
@@ -1,20 +1,20 @@
 /*
  * FLV decoding.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,17 +25,6 @@
 #include "mpegvideo.h"
 #include "mpegvideodata.h"
 
-void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last)
-{
-    int is11 = get_bits1(gb);
-    *last = get_bits1(gb);
-    *run  = get_bits(gb, 6);
-    if (is11)
-        *level = get_sbits(gb, 11);
-    else
-        *level = get_sbits(gb, 7);
-}
-
 int ff_flv_decode_picture_header(MpegEncContext *s)
 {
     int format, width, height;
@@ -43,12 +32,12 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
     /* picture header */
     if (get_bits_long(&s->gb, 17) != 1) {
         av_log(s->avctx, AV_LOG_ERROR, "Bad picture start code\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     format = get_bits(&s->gb, 5);
     if (format != 0 && format != 1) {
         av_log(s->avctx, AV_LOG_ERROR, "Bad picture format\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     s->h263_flv       = format + 1;
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
@@ -87,7 +76,7 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
         break;
     }
     if (av_image_check_size(width, height, 0, s->avctx))
-        return -1;
+        return AVERROR(EINVAL);
     s->width  = width;
     s->height = height;
 
@@ -105,10 +94,14 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
     s->h263_long_vectors = 0;
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
     s->f_code = 1;
 
+    if (s->ehc_mode)
+        s->avctx->sample_aspect_ratio= (AVRational){1,2};
+
     if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(s->avctx, AV_LOG_DEBUG, "%c esc_type:%d, qp:%d num:%d\n",
                s->droppable ? 'D' : av_get_picture_type_char(s->pict_type),
@@ -130,6 +123,8 @@ AVCodec ff_flv_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
                                                      AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/flvenc.c b/libavcodec/flvenc.c
index f7c72c5..15f794e 100644
--- a/libavcodec/flvenc.c
+++ b/libavcodec/flvenc.c
@@ -1,20 +1,20 @@
 /*
  * FLV Encoding specific code.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index f94d438..3b33af6 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,4 +63,6 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
         ff_fmt_convert_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_fmt_convert_init_x86(c, avctx);
+    if (HAVE_MIPSFPU)
+        ff_fmt_convert_init_mips(c);
 }
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index b2c2356..a1b17e4 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,5 +72,6 @@ void ff_fmt_convert_init_aarch64(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_mips(FmtConvertContext *c);
 
 #endif /* AVCODEC_FMTCONVERT_H */
diff --git a/libavcodec/fmvc.c b/libavcodec/fmvc.c
index 64136e3..5778d7b 100644
--- a/libavcodec/fmvc.c
+++ b/libavcodec/fmvc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2017 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,7 +53,7 @@ typedef struct FMVCContext {
 
 static int decode_type2(GetByteContext *gb, PutByteContext *pb)
 {
-    unsigned repeat = 0, first = 1, opcode;
+    unsigned repeat = 0, first = 1, opcode = 0;
     int i, len, pos;
 
     while (bytestream2_get_bytes_left(gb) > 0) {
@@ -288,7 +288,7 @@ static int decode_type2(GetByteContext *gb, PutByteContext *pb)
 
 static int decode_type1(GetByteContext *gb, PutByteContext *pb)
 {
-    unsigned opcode, len;
+    unsigned opcode = 0, len;
     int high = 0;
     int i, pos;
 
@@ -436,7 +436,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         for (y = 0; y < avctx->height; y++) {
             memcpy(dst, src, avctx->width * s->bpp);
             dst -= frame->linesize[0];
-            src += avctx->width * s->bpp;
+            src += s->stride * 4;
         }
     } else {
         unsigned block, nb_blocks;
@@ -460,7 +460,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
             int start = 0;
 
             offset = bytestream2_get_le16(gb);
-            if (offset > s->nb_blocks)
+            if (offset >= s->nb_blocks)
                 return AVERROR_INVALIDDATA;
 
             size = bytestream2_get_le16(gb);
@@ -514,7 +514,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         for (y = 0; y < avctx->height; y++) {
             memcpy(ddst, ssrc, avctx->width * s->bpp);
             ddst -= frame->linesize[0];
-            ssrc += avctx->width * s->bpp;
+            ssrc += s->stride * 4;
         }
     }
 
@@ -530,7 +530,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     switch (avctx->bits_per_coded_sample) {
     case 16:
-        avctx->pix_fmt = AV_PIX_FMT_RGB555;
+        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
         break;
     case 24:
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
@@ -570,7 +570,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     s->nb_blocks = s->xb * s->yb;
     if (!s->nb_blocks)
         return AVERROR_INVALIDDATA;
-    s->blocks    = av_mallocz(s->nb_blocks * sizeof(*s->blocks));
+    s->blocks    = av_calloc(s->nb_blocks, sizeof(*s->blocks));
     if (!s->blocks)
         return AVERROR(ENOMEM);
 
@@ -602,8 +602,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
     s->bpp          = avctx->bits_per_coded_sample >> 3;
     s->buffer_size  = avctx->width * avctx->height * 4;
     s->pbuffer_size = avctx->width * avctx->height * 4;
-    s->buffer       = av_malloc(s->buffer_size);
-    s->pbuffer      = av_malloc(s->pbuffer_size);
+    s->buffer       = av_mallocz(s->buffer_size);
+    s->pbuffer      = av_mallocz(s->pbuffer_size);
     if (!s->buffer || !s->pbuffer)
         return AVERROR(ENOMEM);
 
diff --git a/libavcodec/frame_thread_encoder.c b/libavcodec/frame_thread_encoder.c
new file mode 100644
index 0000000..55756c4
--- /dev/null
+++ b/libavcodec/frame_thread_encoder.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdatomic.h>
+
+#include "frame_thread_encoder.h"
+
+#include "libavutil/fifo.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+#include "libavutil/thread.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "thread.h"
+
+#define MAX_THREADS 64
+#define BUFFER_SIZE (2*MAX_THREADS)
+
+typedef struct{
+    void *indata;
+    void *outdata;
+    int64_t return_code;
+    unsigned index;
+} Task;
+
+typedef struct{
+    AVCodecContext *parent_avctx;
+    pthread_mutex_t buffer_mutex;
+
+    AVFifoBuffer *task_fifo;
+    pthread_mutex_t task_fifo_mutex;
+    pthread_cond_t task_fifo_cond;
+
+    Task finished_tasks[BUFFER_SIZE];
+    pthread_mutex_t finished_task_mutex;
+    pthread_cond_t finished_task_cond;
+
+    unsigned task_index;
+    unsigned finished_task_index;
+
+    pthread_t worker[MAX_THREADS];
+    atomic_int exit;
+} ThreadContext;
+
+static void * attribute_align_arg worker(void *v){
+    AVCodecContext *avctx = v;
+    ThreadContext *c = avctx->internal->frame_thread_encoder;
+    AVPacket *pkt = NULL;
+
+    while (!atomic_load(&c->exit)) {
+        int got_packet, ret;
+        AVFrame *frame;
+        Task task;
+
+        if(!pkt) pkt = av_packet_alloc();
+        if(!pkt) continue;
+        av_init_packet(pkt);
+
+        pthread_mutex_lock(&c->task_fifo_mutex);
+        while (av_fifo_size(c->task_fifo) <= 0 || atomic_load(&c->exit)) {
+            if (atomic_load(&c->exit)) {
+                pthread_mutex_unlock(&c->task_fifo_mutex);
+                goto end;
+            }
+            pthread_cond_wait(&c->task_fifo_cond, &c->task_fifo_mutex);
+        }
+        av_fifo_generic_read(c->task_fifo, &task, sizeof(task), NULL);
+        pthread_mutex_unlock(&c->task_fifo_mutex);
+        frame = task.indata;
+
+        ret = avcodec_encode_video2(avctx, pkt, frame, &got_packet);
+        pthread_mutex_lock(&c->buffer_mutex);
+        av_frame_unref(frame);
+        pthread_mutex_unlock(&c->buffer_mutex);
+        av_frame_free(&frame);
+        if(got_packet) {
+            int ret2 = av_packet_make_refcounted(pkt);
+            if (ret >= 0 && ret2 < 0)
+                ret = ret2;
+        } else {
+            pkt->data = NULL;
+            pkt->size = 0;
+        }
+        pthread_mutex_lock(&c->finished_task_mutex);
+        c->finished_tasks[task.index].outdata = pkt; pkt = NULL;
+        c->finished_tasks[task.index].return_code = ret;
+        pthread_cond_signal(&c->finished_task_cond);
+        pthread_mutex_unlock(&c->finished_task_mutex);
+    }
+end:
+    av_free(pkt);
+    pthread_mutex_lock(&c->buffer_mutex);
+    avcodec_close(avctx);
+    pthread_mutex_unlock(&c->buffer_mutex);
+    av_freep(&avctx);
+    return NULL;
+}
+
+int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options){
+    int i=0;
+    ThreadContext *c;
+
+
+    if(   !(avctx->thread_type & FF_THREAD_FRAME)
+       || !(avctx->codec->capabilities & AV_CODEC_CAP_INTRA_ONLY))
+        return 0;
+
+    if(   !avctx->thread_count
+       && avctx->codec_id == AV_CODEC_ID_MJPEG
+       && !(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "Forcing thread count to 1 for MJPEG encoding, use -thread_type slice "
+               "or a constant quantizer if you want to use multiple cpu cores\n");
+        avctx->thread_count = 1;
+    }
+    if(   avctx->thread_count > 1
+       && avctx->codec_id == AV_CODEC_ID_MJPEG
+       && !(avctx->flags & AV_CODEC_FLAG_QSCALE))
+        av_log(avctx, AV_LOG_WARNING,
+               "MJPEG CBR encoding works badly with frame multi-threading, consider "
+               "using -threads 1, -thread_type slice or a constant quantizer.\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_HUFFYUV ||
+        avctx->codec_id == AV_CODEC_ID_FFVHUFF) {
+        int warn = 0;
+        int context_model = 0;
+        AVDictionaryEntry *con = av_dict_get(options, "context", NULL, AV_DICT_MATCH_CASE);
+
+        if (con && con->value)
+            context_model = atoi(con->value);
+
+        if (avctx->flags & AV_CODEC_FLAG_PASS1)
+            warn = 1;
+        else if(context_model > 0) {
+            AVDictionaryEntry *t = av_dict_get(options, "non_deterministic",
+                                               NULL, AV_DICT_MATCH_CASE);
+            warn = !t || !t->value || !atoi(t->value) ? 1 : 0;
+        }
+        // huffyuv does not support these with multiple frame threads currently
+        if (warn) {
+            av_log(avctx, AV_LOG_WARNING,
+               "Forcing thread count to 1 for huffyuv encoding with first pass or context 1\n");
+            avctx->thread_count = 1;
+        }
+    }
+
+    if(!avctx->thread_count) {
+        avctx->thread_count = av_cpu_count();
+        avctx->thread_count = FFMIN(avctx->thread_count, MAX_THREADS);
+    }
+
+    if(avctx->thread_count <= 1)
+        return 0;
+
+    if(avctx->thread_count > MAX_THREADS)
+        return AVERROR(EINVAL);
+
+    av_assert0(!avctx->internal->frame_thread_encoder);
+    c = avctx->internal->frame_thread_encoder = av_mallocz(sizeof(ThreadContext));
+    if(!c)
+        return AVERROR(ENOMEM);
+
+    c->parent_avctx = avctx;
+
+    c->task_fifo = av_fifo_alloc_array(BUFFER_SIZE, sizeof(Task));
+    if(!c->task_fifo)
+        goto fail;
+
+    pthread_mutex_init(&c->task_fifo_mutex, NULL);
+    pthread_mutex_init(&c->finished_task_mutex, NULL);
+    pthread_mutex_init(&c->buffer_mutex, NULL);
+    pthread_cond_init(&c->task_fifo_cond, NULL);
+    pthread_cond_init(&c->finished_task_cond, NULL);
+    atomic_init(&c->exit, 0);
+
+    for(i=0; i<avctx->thread_count ; i++){
+        AVDictionary *tmp = NULL;
+        int ret;
+        void *tmpv;
+        AVCodecContext *thread_avctx = avcodec_alloc_context3(avctx->codec);
+        if(!thread_avctx)
+            goto fail;
+        tmpv = thread_avctx->priv_data;
+        *thread_avctx = *avctx;
+        ret = av_opt_copy(thread_avctx, avctx);
+        if (ret < 0)
+            goto fail;
+        thread_avctx->priv_data = tmpv;
+        thread_avctx->internal = NULL;
+        if (avctx->codec->priv_class) {
+            int ret = av_opt_copy(thread_avctx->priv_data, avctx->priv_data);
+            if (ret < 0)
+                goto fail;
+        } else
+            memcpy(thread_avctx->priv_data, avctx->priv_data, avctx->codec->priv_data_size);
+        thread_avctx->thread_count = 1;
+        thread_avctx->active_thread_type &= ~FF_THREAD_FRAME;
+
+        av_dict_copy(&tmp, options, 0);
+        av_dict_set(&tmp, "threads", "1", 0);
+        if(avcodec_open2(thread_avctx, avctx->codec, &tmp) < 0) {
+            av_dict_free(&tmp);
+            goto fail;
+        }
+        av_dict_free(&tmp);
+        av_assert0(!thread_avctx->internal->frame_thread_encoder);
+        thread_avctx->internal->frame_thread_encoder = c;
+        if(pthread_create(&c->worker[i], NULL, worker, thread_avctx)) {
+            goto fail;
+        }
+    }
+
+    avctx->active_thread_type = FF_THREAD_FRAME;
+
+    return 0;
+fail:
+    avctx->thread_count = i;
+    av_log(avctx, AV_LOG_ERROR, "ff_frame_thread_encoder_init failed\n");
+    ff_frame_thread_encoder_free(avctx);
+    return -1;
+}
+
+void ff_frame_thread_encoder_free(AVCodecContext *avctx){
+    int i;
+    ThreadContext *c= avctx->internal->frame_thread_encoder;
+
+    pthread_mutex_lock(&c->task_fifo_mutex);
+    atomic_store(&c->exit, 1);
+    pthread_cond_broadcast(&c->task_fifo_cond);
+    pthread_mutex_unlock(&c->task_fifo_mutex);
+
+    for (i=0; i<avctx->thread_count; i++) {
+         pthread_join(c->worker[i], NULL);
+    }
+
+    while (av_fifo_size(c->task_fifo) > 0) {
+        Task task;
+        AVFrame *frame;
+        av_fifo_generic_read(c->task_fifo, &task, sizeof(task), NULL);
+        frame = task.indata;
+        av_frame_free(&frame);
+        task.indata = NULL;
+    }
+
+    for (i=0; i<BUFFER_SIZE; i++) {
+        if (c->finished_tasks[i].outdata != NULL) {
+            AVPacket *pkt = c->finished_tasks[i].outdata;
+            av_packet_free(&pkt);
+            c->finished_tasks[i].outdata = NULL;
+        }
+    }
+
+    pthread_mutex_destroy(&c->task_fifo_mutex);
+    pthread_mutex_destroy(&c->finished_task_mutex);
+    pthread_mutex_destroy(&c->buffer_mutex);
+    pthread_cond_destroy(&c->task_fifo_cond);
+    pthread_cond_destroy(&c->finished_task_cond);
+    av_fifo_freep(&c->task_fifo);
+    av_freep(&avctx->internal->frame_thread_encoder);
+}
+
+int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr){
+    ThreadContext *c = avctx->internal->frame_thread_encoder;
+    Task task;
+    int ret;
+
+    av_assert1(!*got_packet_ptr);
+
+    if(frame){
+        AVFrame *new = av_frame_alloc();
+        if(!new)
+            return AVERROR(ENOMEM);
+        ret = av_frame_ref(new, frame);
+        if(ret < 0) {
+            av_frame_free(&new);
+            return ret;
+        }
+
+        task.index = c->task_index;
+        task.indata = (void*)new;
+        pthread_mutex_lock(&c->task_fifo_mutex);
+        av_fifo_generic_write(c->task_fifo, &task, sizeof(task), NULL);
+        pthread_cond_signal(&c->task_fifo_cond);
+        pthread_mutex_unlock(&c->task_fifo_mutex);
+
+        c->task_index = (c->task_index+1) % BUFFER_SIZE;
+    }
+
+    pthread_mutex_lock(&c->finished_task_mutex);
+    if (c->task_index == c->finished_task_index ||
+        (frame && !c->finished_tasks[c->finished_task_index].outdata &&
+         (c->task_index - c->finished_task_index) % BUFFER_SIZE <= avctx->thread_count)) {
+            pthread_mutex_unlock(&c->finished_task_mutex);
+            return 0;
+        }
+
+    while (!c->finished_tasks[c->finished_task_index].outdata) {
+        pthread_cond_wait(&c->finished_task_cond, &c->finished_task_mutex);
+    }
+    task = c->finished_tasks[c->finished_task_index];
+    *pkt = *(AVPacket*)(task.outdata);
+    if(pkt->data)
+        *got_packet_ptr = 1;
+    av_freep(&c->finished_tasks[c->finished_task_index].outdata);
+    c->finished_task_index = (c->finished_task_index+1) % BUFFER_SIZE;
+    pthread_mutex_unlock(&c->finished_task_mutex);
+
+    return task.return_code;
+}
diff --git a/libavcodec/frame_thread_encoder.h b/libavcodec/frame_thread_encoder.h
new file mode 100644
index 0000000..1f79553
--- /dev/null
+++ b/libavcodec/frame_thread_encoder.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FRAME_THREAD_ENCODER_H
+#define AVCODEC_FRAME_THREAD_ENCODER_H
+
+#include "avcodec.h"
+
+int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options);
+void ff_frame_thread_encoder_free(AVCodecContext *avctx);
+int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr);
+
+#endif /* AVCODEC_FRAME_THREAD_ENCODER_H */
diff --git a/libavcodec/fraps.c b/libavcodec/fraps.c
index 4620ec1..7a7673f 100644
--- a/libavcodec/fraps.c
+++ b/libavcodec/fraps.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Roine Gustafsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,11 +32,12 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "huffman.h"
 #include "bytestream.h"
 #include "bswapdsp.h"
 #include "internal.h"
+#include "thread.h"
 
 #define FPS_TAG MKTAG('F', 'P', 'S', 'x')
 #define VLC_BITS 11
@@ -47,7 +48,6 @@
 typedef struct FrapsContext {
     AVCodecContext *avctx;
     BswapDSPContext bdsp;
-    AVFrame *frame;
     uint8_t *tmpbuf;
     int tmpbuf_size;
 } FrapsContext;
@@ -62,15 +62,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 {
     FrapsContext * const s = avctx->priv_data;
 
-    avctx->pix_fmt     = AV_PIX_FMT_NONE; /* set in decode_frame */
-
     s->avctx  = avctx;
     s->tmpbuf = NULL;
 
-    s->frame = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
-
     ff_bswapdsp_init(&s->bdsp);
 
     return 0;
@@ -94,7 +88,7 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
                                const int step)
 {
     int i, j, ret;
-    BitstreamContext bc;
+    GetBitContext gb;
     VLC vlc;
     Node nodes[512];
 
@@ -111,10 +105,12 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
     s->bdsp.bswap_buf((uint32_t *) s->tmpbuf,
                       (const uint32_t *) src, size >> 2);
 
-    bitstream_init8(&bc, s->tmpbuf, size);
+    if ((ret = init_get_bits8(&gb, s->tmpbuf, size)) < 0)
+        return ret;
+
     for (j = 0; j < h; j++) {
         for (i = 0; i < w*step; i += step) {
-            dst[i] = bitstream_read_vlc(&bc, vlc.table, VLC_BITS, 3);
+            dst[i] = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
             /* lines are stored as deltas between previous lines
              * and we need to add 0x80 to the first lines of chroma planes
              */
@@ -122,7 +118,7 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
                 dst[i] += dst[i - stride];
             else if (Uoff)
                 dst[i] += 0x80;
-            if (bitstream_bits_left(&bc) < 0) {
+            if (get_bits_left(&gb) < 0) {
                 ff_free_vlc(&vlc);
                 return AVERROR_INVALIDDATA;
             }
@@ -140,17 +136,18 @@ static int decode_frame(AVCodecContext *avctx,
     FrapsContext * const s = avctx->priv_data;
     const uint8_t *buf     = avpkt->data;
     int buf_size           = avpkt->size;
-    AVFrame *frame         = data;
-    AVFrame * const f      = s->frame;
+    ThreadFrame frame = { .f = data };
+    AVFrame * const f = data;
     uint32_t header;
     unsigned int version,header_size;
     unsigned int x, y;
     const uint32_t *buf32;
     uint32_t *luma1,*luma2,*cb,*cr;
     uint32_t offs[4];
-    int i, j, ret, is_chroma, planes;
-    enum AVPixelFormat pix_fmt;
-    int prev_pic_bit, expected_size;
+    int i, j, ret, is_chroma;
+    const int planes = 3;
+    int is_pal;
+    uint8_t *out;
 
     if (buf_size < 4) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too short\n");
@@ -159,92 +156,118 @@ static int decode_frame(AVCodecContext *avctx,
 
     header      = AV_RL32(buf);
     version     = header & 0xff;
+    is_pal      = buf[1] == 2 && version == 1;
     header_size = (header & (1<<30))? 8 : 4; /* bit 30 means pad to 8 bytes */
-    prev_pic_bit = header & (1U << 31); /* bit 31 means same as previous pic */
 
     if (version > 5) {
         avpriv_report_missing_feature(avctx, "Fraps version %u", version);
         return AVERROR_PATCHWELCOME;
     }
 
-    buf += 4;
-    if (header_size == 8)
-        buf += 4;
+    buf += header_size;
 
-    pix_fmt = version & 1 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
-    if (avctx->pix_fmt != pix_fmt && f->data[0]) {
-        av_frame_unref(f);
+    if (is_pal) {
+        unsigned needed_size = avctx->width * avctx->height + 1024;
+        needed_size += header_size;
+        if (buf_size != needed_size) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid frame length %d (should be %d)\n",
+                   buf_size, needed_size);
+            return AVERROR_INVALIDDATA;
+        }
+    } else if (version < 2) {
+        unsigned needed_size = avctx->width * avctx->height * 3;
+        if (version == 0) needed_size /= 2;
+        needed_size += header_size;
+        /* bit 31 means same as previous pic */
+        if (header & (1U<<31)) {
+            *got_frame = 0;
+            return buf_size;
+        }
+        if (buf_size != needed_size) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid frame length %d (should be %d)\n",
+                   buf_size, needed_size);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        /* skip frame */
+        if (buf_size == 8) {
+            *got_frame = 0;
+            return buf_size;
+        }
+        if (AV_RL32(buf) != FPS_TAG || buf_size < planes*1024 + 24) {
+            av_log(avctx, AV_LOG_ERROR, "error in data stream\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i < planes; i++) {
+            offs[i] = AV_RL32(buf + 4 + i * 4);
+            if (offs[i] >= buf_size - header_size || (i && offs[i] <= offs[i - 1] + 1024)) {
+                av_log(avctx, AV_LOG_ERROR, "plane %i offset is out of bounds\n", i);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        offs[planes] = buf_size - header_size;
+        for (i = 0; i < planes; i++) {
+            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size, offs[i + 1] - offs[i] - 1024);
+            if (!s->tmpbuf)
+                return AVERROR(ENOMEM);
+        }
     }
-    avctx->pix_fmt = pix_fmt;
+
+    f->pict_type = AV_PICTURE_TYPE_I;
+    f->key_frame = 1;
+
+    avctx->pix_fmt = version & 1 ? is_pal ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
     avctx->color_range = version & 1 ? AVCOL_RANGE_UNSPECIFIED
                                      : AVCOL_RANGE_JPEG;
+    avctx->colorspace = version & 1 ? AVCOL_SPC_UNSPECIFIED : AVCOL_SPC_BT709;
 
-    expected_size = header_size;
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
 
     switch (version) {
     case 0:
     default:
         /* Fraps v0 is a reordered YUV420 */
-        if (!prev_pic_bit)
-            expected_size += avctx->width * avctx->height * 3 / 2;
-        if (buf_size != expected_size) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid frame length %d (should be %d)\n",
-                   buf_size, expected_size);
-            return AVERROR_INVALIDDATA;
-        }
-
         if (((avctx->width % 8) != 0) || ((avctx->height % 2) != 0)) {
             av_log(avctx, AV_LOG_ERROR, "Invalid frame size %dx%d\n",
                    avctx->width, avctx->height);
             return AVERROR_INVALIDDATA;
         }
 
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        f->pict_type = prev_pic_bit ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
-        f->key_frame = f->pict_type == AV_PICTURE_TYPE_I;
-
-        if (f->pict_type == AV_PICTURE_TYPE_I) {
-            buf32 = (const uint32_t*)buf;
-            for (y = 0; y < avctx->height / 2; y++) {
-                luma1 = (uint32_t*)&f->data[0][ y * 2      * f->linesize[0]];
-                luma2 = (uint32_t*)&f->data[0][(y * 2 + 1) * f->linesize[0]];
-                cr    = (uint32_t*)&f->data[1][ y          * f->linesize[1]];
-                cb    = (uint32_t*)&f->data[2][ y          * f->linesize[2]];
-                for (x = 0; x < avctx->width; x += 8) {
-                    *(luma1++) = *(buf32++);
-                    *(luma1++) = *(buf32++);
-                    *(luma2++) = *(buf32++);
-                    *(luma2++) = *(buf32++);
-                    *(cr++) = *(buf32++);
-                    *(cb++) = *(buf32++);
-                }
+        buf32 = (const uint32_t*)buf;
+        for (y = 0; y < avctx->height / 2; y++) {
+            luma1 = (uint32_t*)&f->data[0][  y * 2      * f->linesize[0] ];
+            luma2 = (uint32_t*)&f->data[0][ (y * 2 + 1) * f->linesize[0] ];
+            cr    = (uint32_t*)&f->data[1][  y          * f->linesize[1] ];
+            cb    = (uint32_t*)&f->data[2][  y          * f->linesize[2] ];
+            for (x = 0; x < avctx->width; x += 8) {
+                *luma1++ = *buf32++;
+                *luma1++ = *buf32++;
+                *luma2++ = *buf32++;
+                *luma2++ = *buf32++;
+                *cr++    = *buf32++;
+                *cb++    = *buf32++;
             }
         }
         break;
 
     case 1:
-        /* Fraps v1 is an upside-down BGR24 */
-        if (!prev_pic_bit)
-            expected_size += avctx->width * avctx->height * 3;
-        if (buf_size != expected_size) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid frame length %d (should be %d)\n",
-                   buf_size, expected_size);
-            return AVERROR_INVALIDDATA;
-        }
+        if (is_pal) {
+            uint32_t *pal = (uint32_t *)f->data[1];
 
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        f->pict_type = prev_pic_bit ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
-        f->key_frame = f->pict_type == AV_PICTURE_TYPE_I;
+            for (y = 0; y < 256; y++) {
+                pal[y] = AV_RL32(buf) | 0xFF000000;
+                buf += 4;
+            }
 
-        if (f->pict_type == AV_PICTURE_TYPE_I) {
+            for (y = 0; y <avctx->height; y++)
+                memcpy(&f->data[0][y * f->linesize[0]],
+                       &buf[y * avctx->width],
+                       avctx->width);
+        } else {
+        /* Fraps v1 is an upside-down BGR24 */
             for (y = 0; y<avctx->height; y++)
                 memcpy(&f->data[0][(avctx->height - y - 1) * f->linesize[0]],
                        &buf[y * avctx->width * 3],
@@ -258,37 +281,8 @@ static int decode_frame(AVCodecContext *avctx,
          * Fraps v2 is Huffman-coded YUV420 planes
          * Fraps v4 is virtually the same
          */
-        planes = 3;
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        /* skip frame */
-        if (buf_size == 8) {
-            f->pict_type = AV_PICTURE_TYPE_P;
-            f->key_frame = 0;
-            break;
-        }
-        f->pict_type = AV_PICTURE_TYPE_I;
-        f->key_frame = 1;
-        if ((AV_RL32(buf) != FPS_TAG) || (buf_size < (planes * 1024 + 24))) {
-            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
-            return AVERROR_INVALIDDATA;
-        }
-        for (i = 0; i < planes; i++) {
-            offs[i] = AV_RL32(buf + 4 + i * 4);
-            if (offs[i] >= buf_size || (i && offs[i] <= offs[i - 1] + 1024)) {
-                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        offs[planes] = buf_size;
         for (i = 0; i < planes; i++) {
             is_chroma = !!i;
-            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size,
-                                  offs[i + 1] - offs[i] - 1024);
-            if (!s->tmpbuf)
-                return AVERROR(ENOMEM);
             if ((ret = fraps2_decode_plane(s, f->data[i], f->linesize[i],
                                            avctx->width  >> is_chroma,
                                            avctx->height >> is_chroma,
@@ -302,36 +296,7 @@ static int decode_frame(AVCodecContext *avctx,
     case 3:
     case 5:
         /* Virtually the same as version 4, but is for RGB24 */
-        planes = 3;
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        /* skip frame */
-        if (buf_size == 8) {
-            f->pict_type = AV_PICTURE_TYPE_P;
-            f->key_frame = 0;
-            break;
-        }
-        f->pict_type = AV_PICTURE_TYPE_I;
-        f->key_frame = 1;
-        if ((AV_RL32(buf) != FPS_TAG)||(buf_size < (planes*1024 + 24))) {
-            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
-            return AVERROR_INVALIDDATA;
-        }
         for (i = 0; i < planes; i++) {
-            offs[i] = AV_RL32(buf + 4 + i * 4);
-            if (offs[i] >= buf_size || (i && offs[i] <= offs[i - 1] + 1024)) {
-                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        offs[planes] = buf_size;
-        for (i = 0; i < planes; i++) {
-            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size,
-                                  offs[i + 1] - offs[i] - 1024);
-            if (!s->tmpbuf)
-                return AVERROR(ENOMEM);
             if ((ret = fraps2_decode_plane(s, f->data[0] + i + (f->linesize[0] * (avctx->height - 1)),
                                            -f->linesize[0], avctx->width, avctx->height,
                                            buf + offs[i], offs[i + 1] - offs[i], 0, 3)) < 0) {
@@ -339,18 +304,20 @@ static int decode_frame(AVCodecContext *avctx,
                 return ret;
             }
         }
+        out = f->data[0];
         // convert pseudo-YUV into real RGB
         for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++) {
-                f->data[0][0 + i*3 + j*f->linesize[0]] += f->data[0][1 + i*3 + j*f->linesize[0]];
-                f->data[0][2 + i*3 + j*f->linesize[0]] += f->data[0][1 + i*3 + j*f->linesize[0]];
+            uint8_t *line_end = out + 3*avctx->width;
+            while (out < line_end) {
+                out[0]  += out[1];
+                out[2]  += out[1];
+                out += 3;
             }
+            out += f->linesize[0] - 3*avctx->width;
         }
         break;
     }
 
-    if ((ret = av_frame_ref(frame, f)) < 0)
-        return ret;
     *got_frame = 1;
 
     return buf_size;
@@ -366,8 +333,6 @@ static av_cold int decode_end(AVCodecContext *avctx)
 {
     FrapsContext *s = (FrapsContext*)avctx->priv_data;
 
-    av_frame_free(&s->frame);
-
     av_freep(&s->tmpbuf);
     return 0;
 }
@@ -382,6 +347,6 @@ AVCodec ff_fraps_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/frwu.c b/libavcodec/frwu.c
index 61cd315..e68fda9 100644
--- a/libavcodec/frwu.c
+++ b/libavcodec/frwu.c
@@ -3,26 +3,32 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *av_class;
+    int change_field_order;
+} FRWUContext;
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
@@ -38,6 +44,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
+    FRWUContext *s = avctx->priv_data;
     int field, ret;
     AVFrame *pic = data;
     const uint8_t *buf = avpkt->data;
@@ -52,15 +59,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
         return ret;
-    }
 
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
-    pic->interlaced_frame = 1;
-    pic->top_field_first = 1;
 
     for (field = 0; field < 2; field++) {
         int i;
@@ -79,9 +82,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             av_log(avctx, AV_LOG_ERROR, "Packet is too small, need %i, have %i\n", field_size, (int)(buf_end - buf));
             return AVERROR_INVALIDDATA;
         }
-        if (field)
+        if (field ^ s->change_field_order) {
             dst += pic->linesize[0];
+        } else if (s->change_field_order) {
+            dst += 2 * pic->linesize[0];
+        }
         for (i = 0; i < field_h; i++) {
+            if (s->change_field_order && field && i == field_h - 1)
+                dst = pic->data[0];
             memcpy(dst, buf, avctx->width * 2);
             buf += avctx->width * 2;
             dst += pic->linesize[0] << 1;
@@ -94,12 +102,27 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     return avpkt->size;
 }
 
+static const AVOption frwu_options[] = {
+    {"change_field_order", "Change field order", offsetof(FRWUContext, change_field_order), AV_OPT_TYPE_BOOL,
+     {.i64 = 0}, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM},
+    {NULL}
+};
+
+static const AVClass frwu_class = {
+    .class_name = "frwu Decoder",
+    .item_name  = av_default_item_name,
+    .option     = frwu_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_frwu_decoder = {
     .name           = "frwu",
     .long_name      = NULL_IF_CONFIG_SMALL("Forward Uncompressed"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FRWU,
+    .priv_data_size = sizeof(FRWUContext),
     .init           = decode_init,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &frwu_class,
 };
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index 93befb4..a1dec8d 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Konstantin Shishkov
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,18 +28,18 @@
 #include <inttypes.h>
 #include <zlib.h>
 
+#include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
 #include "bytestream.h"
 #include "elsdec.h"
+#include "get_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "jpegtables.h"
 #include "mjpeg.h"
-#include "vlc.h"
 
 #define EPIC_PIX_STACK_SIZE 1024
 #define EPIC_PIX_STACK_MAX  (EPIC_PIX_STACK_SIZE - 1)
@@ -123,7 +123,7 @@ typedef struct JPGContext {
 
     VLC        dc_vlc[2], ac_vlc[2];
     int        prev_dc[3];
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(32, int16_t, block)[6][64];
 
     uint8_t    *buf;
 } JPGContext;
@@ -200,7 +200,7 @@ static av_cold int jpg_init(AVCodecContext *avctx, JPGContext *c)
     if (ret)
         return ret;
 
-    ff_blockdsp_init(&c->bdsp);
+    ff_blockdsp_init(&c->bdsp, avctx);
     ff_idctdsp_init(&c->idsp, avctx);
     ff_init_scantable(c->idsp.idct_permutation, &c->scantable,
                       ff_zigzag_direct);
@@ -237,7 +237,7 @@ static void jpg_unescape(const uint8_t *src, int src_size,
     *dst_size = dst - dst_start;
 }
 
-static int jpg_decode_block(JPGContext *c, BitstreamContext *bc,
+static int jpg_decode_block(JPGContext *c, GetBitContext *gb,
                             int plane, int16_t *block)
 {
     int dc, val, pos;
@@ -245,18 +245,18 @@ static int jpg_decode_block(JPGContext *c, BitstreamContext *bc,
     const uint8_t *qmat = is_chroma ? chroma_quant : luma_quant;
 
     c->bdsp.clear_block(block);
-    dc = bitstream_read_vlc(bc, c->dc_vlc[is_chroma].table, 9, 3);
+    dc = get_vlc2(gb, c->dc_vlc[is_chroma].table, 9, 3);
     if (dc < 0)
         return AVERROR_INVALIDDATA;
     if (dc)
-        dc = bitstream_read_xbits(bc, dc);
+        dc = get_xbits(gb, dc);
     dc                = dc * qmat[0] + c->prev_dc[plane];
     block[0]          = dc;
     c->prev_dc[plane] = dc;
 
     pos = 0;
     while (pos < 63) {
-        val = bitstream_read_vlc(bc, c->ac_vlc[is_chroma].table, 9, 3);
+        val = get_vlc2(gb, c->ac_vlc[is_chroma].table, 9, 3);
         if (val < 0)
             return AVERROR_INVALIDDATA;
         pos += val >> 4;
@@ -266,7 +266,7 @@ static int jpg_decode_block(JPGContext *c, BitstreamContext *bc,
         if (val) {
             int nbits = val;
 
-            val                                 = bitstream_read_xbits(bc, nbits);
+            val                                 = get_xbits(gb, nbits);
             val                                *= qmat[ff_zigzag_direct[pos]];
             block[c->scantable.permutated[pos]] = val;
         }
@@ -287,7 +287,7 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
                            const uint8_t *mask, int mask_stride, int num_mbs,
                            int swapuv)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int mb_w, mb_h, mb_x, mb_y, i, j;
     int bx, by;
     int unesc_size;
@@ -299,7 +299,8 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
         return ret;
     jpg_unescape(src, src_size, c->buf, &unesc_size);
     memset(c->buf + unesc_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-    bitstream_init8(&bc, c->buf, unesc_size);
+    if((ret = init_get_bits8(&gb, c->buf, unesc_size)) < 0)
+        return ret;
 
     width = FFALIGN(width, 16);
     mb_w  =  width        >> 4;
@@ -326,14 +327,14 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
                     if (mask && !mask[mb_x * 2 + i + j * mask_stride])
                         continue;
                     num_mbs--;
-                    if ((ret = jpg_decode_block(c, &bc, 0,
+                    if ((ret = jpg_decode_block(c, &gb, 0,
                                                 c->block[i + j * 2])) != 0)
                         return ret;
                     c->idsp.idct(c->block[i + j * 2]);
                 }
             }
             for (i = 1; i < 3; i++) {
-                if ((ret = jpg_decode_block(c, &bc, i, c->block[i + 3])) != 0)
+                if ((ret = jpg_decode_block(c, &gb, i, c->block[i + 3])) != 0)
                     return ret;
                 c->idsp.idct(c->block[i + 3]);
             }
@@ -555,6 +556,11 @@ static uint32_t epic_decode_pixel_pred(ePICContext *dc, int x, int y,
         B     = ((pred >> B_shift) & 0xFF) - TOSIGNED(delta);
     }
 
+    if (R<0 || G<0 || B<0 || R > 255 || G > 255 || B > 255) {
+        avpriv_request_sample(NULL, "RGB %d %d %d is out of range\n", R, G, B);
+        return 0;
+    }
+
     return (R << R_shift) | (G << G_shift) | (B << B_shift);
 }
 
@@ -626,6 +632,8 @@ static int epic_decode_run_length(ePICContext *dc, int x, int y, int tile_width,
               (NN  != N)  << 1 |
               (NNW != NW);
         WWneW = ff_els_decode_bit(&dc->els_ctx, &dc->W_ctx_rung[idx]);
+        if (WWneW < 0)
+            return WWneW;
     }
 
     if (WWneW)
@@ -832,10 +840,13 @@ static int epic_decode_tile(ePICContext *dc, uint8_t *out, int tile_height,
                 if (y < 2 || x < 2 || x == tile_width - 1) {
                     run       = 1;
                     got_pixel = epic_handle_edges(dc, x, y, curr_row, above_row, &pix);
-                } else
+                } else {
                     got_pixel = epic_decode_run_length(dc, x, y, tile_width,
                                                        curr_row, above_row,
                                                        above2_row, &pix, &run);
+                    if (got_pixel < 0)
+                        return got_pixel;
+                }
 
                 if (!got_pixel && !epic_predict_from_NW_NE(dc, x, y, run,
                                                            tile_width, curr_row,
@@ -890,7 +901,7 @@ static int epic_jb_decode_tile(G2MContext *c, int tile_x, int tile_y,
     }
 
     if (src_size < els_dsize) {
-        av_log(avctx, AV_LOG_ERROR, "ePIC: data too short, needed %zu, got %zu\n",
+        av_log(avctx, AV_LOG_ERROR, "ePIC: data too short, needed %"SIZE_SPECIFIER", got %"SIZE_SPECIFIER"\n",
                els_dsize, src_size);
         return AVERROR_INVALIDDATA;
     }
@@ -916,6 +927,7 @@ static int epic_jb_decode_tile(G2MContext *c, int tile_x, int tile_y,
         if (c->ec.els_ctx.err != 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "ePIC: couldn't decode transparency pixel!\n");
+            ff_els_decoder_uninit(&c->ec.unsigned_rung);
             return AVERROR_INVALIDDATA;
         }
 
@@ -1006,17 +1018,19 @@ static int epic_jb_decode_tile(G2MContext *c, int tile_x, int tile_y,
     return 0;
 }
 
-static void kempf_restore_buf(const uint8_t *src, int len,
+static int kempf_restore_buf(const uint8_t *src, int len,
                               uint8_t *dst, int stride,
                               const uint8_t *jpeg_tile, int tile_stride,
                               int width, int height,
                               const uint8_t *pal, int npal, int tidx)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int i, j, nb, col;
+    int ret;
     int align_width = FFALIGN(width, 16);
 
-    bitstream_init8(&bc, src, len);
+    if ((ret = init_get_bits8(&gb, src, len)) < 0)
+        return ret;
 
     if (npal <= 2)       nb = 1;
     else if (npal <= 4)  nb = 2;
@@ -1024,17 +1038,19 @@ static void kempf_restore_buf(const uint8_t *src, int len,
     else                 nb = 8;
 
     for (j = 0; j < height; j++, dst += stride, jpeg_tile += tile_stride) {
-        if (bitstream_read(&bc, 8))
+        if (get_bits(&gb, 8))
             continue;
         for (i = 0; i < width; i++) {
-            col = bitstream_read(&bc, nb);
+            col = get_bits(&gb, nb);
             if (col != tidx)
                 memcpy(dst + i * 3, pal + col * 3, 3);
             else
                 memcpy(dst + i * 3, jpeg_tile + i * 3, 3);
         }
-        bitstream_skip(&bc, nb * (align_width - width));
+        skip_bits_long(&gb, nb * (align_width - width));
     }
+
+    return 0;
 }
 
 static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
@@ -1078,6 +1094,8 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
         src += 3;
     }
     npal = *src++ + 1;
+    if (src_end - src < npal * 3)
+        return AVERROR_INVALIDDATA;
     memcpy(pal, src, npal * 3);
     src += npal * 3;
     if (sub_type != 2) {
@@ -1094,7 +1112,7 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
     zsize = (src[0] << 8) | src[1];
     src  += 2;
 
-    if (src_end - src < zsize)
+    if (src_end - src < zsize + (sub_type != 2))
         return AVERROR_INVALIDDATA;
 
     ret = uncompress(c->kempf_buf, &dlen, src, zsize);
@@ -1116,6 +1134,8 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
     for (i = 0; i < (FFALIGN(height, 16) >> 4); i++) {
         for (j = 0; j < (FFALIGN(width, 16) >> 4); j++) {
             if (!bits) {
+                if (src >= src_end)
+                    return AVERROR_INVALIDDATA;
                 bitbuf = *src++;
                 bits   = 8;
             }
@@ -1149,10 +1169,10 @@ static int g2m_init_buffers(G2MContext *c)
     int aligned_height;
 
     if (!c->framebuf || c->old_width < c->width || c->old_height < c->height) {
-        c->framebuf_stride = FFALIGN(c->width * 3, 16);
-        aligned_height     = FFALIGN(c->height,    16);
+        c->framebuf_stride = FFALIGN(c->width + 15, 16) * 3;
+        aligned_height     = c->height + 15;
         av_free(c->framebuf);
-        c->framebuf = av_mallocz(c->framebuf_stride * aligned_height);
+        c->framebuf = av_mallocz_array(c->framebuf_stride, aligned_height);
         if (!c->framebuf)
             return AVERROR(ENOMEM);
     }
@@ -1160,14 +1180,15 @@ static int g2m_init_buffers(G2MContext *c)
         (c->compression == 2 && !c->epic_buf_base) ||
         c->old_tile_w < c->tile_width ||
         c->old_tile_h < c->tile_height) {
-        c->tile_stride     = FFALIGN(c->tile_width * 3, 16);
+        c->tile_stride     = FFALIGN(c->tile_width, 16) * 3;
         c->epic_buf_stride = FFALIGN(c->tile_width * 4, 16);
         aligned_height     = FFALIGN(c->tile_height,    16);
-        av_free(c->synth_tile);
-        av_free(c->jpeg_tile);
-        av_free(c->kempf_buf);
-        av_free(c->kempf_flags);
-        av_free(c->epic_buf_base);
+        av_freep(&c->synth_tile);
+        av_freep(&c->jpeg_tile);
+        av_freep(&c->kempf_buf);
+        av_freep(&c->kempf_flags);
+        av_freep(&c->epic_buf_base);
+        c->epic_buf    = NULL;
         c->synth_tile  = av_mallocz(c->tile_stride      * aligned_height);
         c->jpeg_tile   = av_mallocz(c->tile_stride      * aligned_height);
         c->kempf_buf   = av_mallocz((c->tile_width + 1) * aligned_height +
@@ -1204,7 +1225,7 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
     cursor_hot_y = bytestream2_get_byte(gb);
     cursor_fmt   = bytestream2_get_byte(gb);
 
-    cursor_stride = FFALIGN(cursor_w, 32) * 4;
+    cursor_stride = FFALIGN(cursor_w, cursor_fmt==1 ? 32 : 1) * 4;
 
     if (cursor_w < 1 || cursor_w > 256 ||
         cursor_h < 1 || cursor_h > 256) {
@@ -1254,7 +1275,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                     bits <<= 1;
                 }
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
 
         dst = c->cursor;
@@ -1286,7 +1306,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                     bits <<= 1;
                 }
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
         break;
     case 32: // full colour
@@ -1300,7 +1319,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                 *dst++ = val >> 16;
                 *dst++ = val >> 24;
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
         break;
     default:
@@ -1338,14 +1356,16 @@ static void g2m_paint_cursor(G2MContext *c, uint8_t *dst, int stride)
     } else {
         dst    +=  x * 3;
     }
-    if (y < 0) {
+
+    if (y < 0)
         h      +=  y;
+    if (w < 0 || h < 0)
+        return;
+    if (y < 0) {
         cursor += -y * c->cursor_stride;
     } else {
         dst    +=  y * stride;
     }
-    if (w < 0 || h < 0)
-        return;
 
     for (j = 0; j < h; j++) {
         for (i = 0; i < w; i++) {
@@ -1403,6 +1423,7 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
         }
         switch (chunk_type) {
         case DISPLAY_INFO:
+            got_header =
             c->got_header = 0;
             if (chunk_size < 21) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid display info size %"PRIu32"\n",
@@ -1421,18 +1442,22 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
             if (c->width != avctx->width || c->height != avctx->height) {
                 ret = ff_set_dimensions(avctx, c->width, c->height);
                 if (ret < 0)
-                    return ret;
+                    goto header_fail;
             }
             c->compression = bytestream2_get_be32(&bc);
             if (c->compression != 2 && c->compression != 3) {
                 avpriv_report_missing_feature(avctx, "Compression method %d",
                                               c->compression);
-                return AVERROR_PATCHWELCOME;
+                ret = AVERROR_PATCHWELCOME;
+                goto header_fail;
             }
             c->tile_width  = bytestream2_get_be32(&bc);
             c->tile_height = bytestream2_get_be32(&bc);
-            if (!c->tile_width || !c->tile_height ||
-                ((c->tile_width | c->tile_height) & 0xF)) {
+            if (c->tile_width <= 0 || c->tile_height <= 0 ||
+                ((c->tile_width | c->tile_height) & 0xF) ||
+                c->tile_width * (uint64_t)c->tile_height >= INT_MAX / 4 ||
+                av_image_check_size2(c->tile_width, c->tile_height, avctx->max_pixels, avctx->pix_fmt, 0, avctx) < 0
+            ) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Invalid tile dimensions %dx%d\n",
                        c->tile_width, c->tile_height);
@@ -1447,7 +1472,8 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
                     (chunk_size - 21) < 16) {
                     av_log(avctx, AV_LOG_ERROR,
                            "Display info: missing bitmasks!\n");
-                    return AVERROR_INVALIDDATA;
+                    ret = AVERROR_INVALIDDATA;
+                    goto header_fail;
                 }
                 r_mask = bytestream2_get_be32(&bc);
                 g_mask = bytestream2_get_be32(&bc);
@@ -1456,11 +1482,13 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
                     avpriv_report_missing_feature(avctx,
                                                   "Bitmasks: R=%"PRIX32", G=%"PRIX32", B=%"PRIX32,
                                                   r_mask, g_mask, b_mask);
-                    return AVERROR_PATCHWELCOME;
+                    ret = AVERROR_PATCHWELCOME;
+                    goto header_fail;
                 }
             } else {
                 avpriv_request_sample(avctx, "bpp=%d", c->bpp);
-                return AVERROR_PATCHWELCOME;
+                ret = AVERROR_PATCHWELCOME;
+                goto header_fail;
             }
             if (g2m_init_buffers(c)) {
                 ret = AVERROR(ENOMEM);
@@ -1537,11 +1565,9 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
     if (got_header)
         c->got_header = 1;
 
-    if (c->width && c->height) {
-        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (c->width && c->height && c->framebuf) {
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         pic->key_frame = got_header;
         pic->pict_type = got_header ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
@@ -1562,6 +1588,8 @@ header_fail:
     c->height  = 0;
     c->tiles_x =
     c->tiles_y = 0;
+    c->tile_width =
+    c->tile_height = 0;
     return ret;
 }
 
@@ -1592,6 +1620,7 @@ static av_cold int g2m_decode_end(AVCodecContext *avctx)
     jpg_free_context(&c->jc);
 
     av_freep(&c->epic_buf_base);
+    c->epic_buf = NULL;
     av_freep(&c->kempf_buf);
     av_freep(&c->kempf_flags);
     av_freep(&c->synth_tile);
diff --git a/libavcodec/g722.c b/libavcodec/g722.c
index 830877e..ef7ca6d 100644
--- a/libavcodec/g722.c
+++ b/libavcodec/g722.c
@@ -7,20 +7,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -88,14 +88,14 @@ static inline void s_zero(int cur_diff, struct G722Band *band)
         ACCUM(3, band->diff_mem[2], 1);
         ACCUM(2, band->diff_mem[1], 1);
         ACCUM(1, band->diff_mem[0], 1);
-        ACCUM(0, cur_diff << 1, 1);
+        ACCUM(0, cur_diff * 2, 1);
     } else {
         ACCUM(5, band->diff_mem[4], 0);
         ACCUM(4, band->diff_mem[3], 0);
         ACCUM(3, band->diff_mem[2], 0);
         ACCUM(2, band->diff_mem[1], 0);
         ACCUM(1, band->diff_mem[0], 0);
-        ACCUM(0, cur_diff << 1, 0);
+        ACCUM(0, cur_diff * 2, 0);
     }
     #undef ACCUM
     band->s_zero = s_zero;
@@ -119,14 +119,14 @@ static void do_adaptive_prediction(struct G722Band *band, const int cur_diff)
     band->part_reconst_mem[0] = cur_part_reconst;
 
     band->pole_mem[1] = av_clip((sg[0] * av_clip(band->pole_mem[0], -8191, 8191) >> 5) +
-                                (sg[1] << 7) + (band->pole_mem[1] * 127 >> 7), -12288, 12288);
+                                (sg[1] * 128) + (band->pole_mem[1] * 127 >> 7), -12288, 12288);
 
     limit = 15360 - band->pole_mem[1];
     band->pole_mem[0] = av_clip(-192 * sg[0] + (band->pole_mem[0] * 255 >> 8), -limit, limit);
 
     s_zero(cur_diff, band);
 
-    cur_qtzd_reconst = av_clip_int16((band->s_predictor + cur_diff) << 1);
+    cur_qtzd_reconst = av_clip_int16((band->s_predictor + cur_diff) * 2);
     band->s_predictor = av_clip_int16(band->s_zero +
                                       (band->pole_mem[0] * cur_qtzd_reconst >> 15) +
                                       (band->pole_mem[1] * band->prev_qtzd_reconst >> 15));
diff --git a/libavcodec/g722.h b/libavcodec/g722.h
index 4830170..25676a3 100644
--- a/libavcodec/g722.h
+++ b/libavcodec/g722.h
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/g722dec.c b/libavcodec/g722dec.c
index 07af0a0..7c270bc 100644
--- a/libavcodec/g722dec.c
+++ b/libavcodec/g722dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,9 +36,8 @@
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "g722.h"
 #include "internal.h"
 
@@ -93,25 +92,25 @@ static int g722_decode_frame(AVCodecContext *avctx, void *data,
     int j, ret;
     const int skip = 8 - c->bits_per_codeword;
     const int16_t *quantizer_table = low_inv_quants[skip];
-    BitstreamContext bc;
+    GetBitContext gb;
 
     /* get output buffer */
     frame->nb_samples = avpkt->size * 2;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out_buf = (int16_t *)frame->data[0];
 
-    bitstream_init8(&bc, avpkt->data, avpkt->size);
+    ret = init_get_bits8(&gb, avpkt->data, avpkt->size);
+    if (ret < 0)
+        return ret;
 
     for (j = 0; j < avpkt->size; j++) {
         int ilow, ihigh, rlow, rhigh, dhigh;
         int xout[2];
 
-        ihigh = bitstream_read(&bc, 2);
-        ilow  = bitstream_read(&bc, 6 - skip);
-        bitstream_skip(&bc, skip);
+        ihigh = get_bits(&gb, 2);
+        ilow = get_bits(&gb, 6 - skip);
+        skip_bits(&gb, skip);
 
         rlow = av_clip_intp2((c->band[0].scale_factor * quantizer_table[ilow] >> 10)
                       + c->band[0].s_predictor, 14);
diff --git a/libavcodec/g722dsp.c b/libavcodec/g722dsp.c
index c7e41ff..f148053 100644
--- a/libavcodec/g722dsp.c
+++ b/libavcodec/g722dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,4 +71,6 @@ av_cold void ff_g722dsp_init(G722DSPContext *c)
 
     if (ARCH_ARM)
         ff_g722dsp_init_arm(c);
+    if (ARCH_X86)
+        ff_g722dsp_init_x86(c);
 }
diff --git a/libavcodec/g722dsp.h b/libavcodec/g722dsp.h
index ecd6a47..c956a1e 100644
--- a/libavcodec/g722dsp.h
+++ b/libavcodec/g722dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,5 +29,6 @@ typedef struct G722DSPContext {
 
 void ff_g722dsp_init(G722DSPContext *c);
 void ff_g722dsp_init_arm(G722DSPContext *c);
+void ff_g722dsp_init_x86(G722DSPContext *c);
 
 #endif /* AVCODEC_G722DSP_H */
diff --git a/libavcodec/g722enc.c b/libavcodec/g722enc.c
index 545825b..25b61df 100644
--- a/libavcodec/g722enc.c
+++ b/libavcodec/g722enc.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
  * G.722 ADPCM audio encoder
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "g722.h"
@@ -60,11 +61,6 @@ static av_cold int g722_encode_init(AVCodecContext * avctx)
     G722Context *c = avctx->priv_data;
     int ret;
 
-    if (avctx->channels != 1) {
-        av_log(avctx, AV_LOG_ERROR, "Only mono tracks are allowed.\n");
-        return AVERROR_INVALIDDATA;
-    }
-
     c->band[0].scale_factor = 8;
     c->band[1].scale_factor = 2;
     c->prev_samples_pos = 22;
@@ -74,9 +70,9 @@ static av_cold int g722_encode_init(AVCodecContext * avctx)
         int max_paths = frontier * FREEZE_INTERVAL;
         int i;
         for (i = 0; i < 2; i++) {
-            c->paths[i] = av_mallocz(max_paths * sizeof(**c->paths));
-            c->node_buf[i] = av_mallocz(2 * frontier * sizeof(**c->node_buf));
-            c->nodep_buf[i] = av_mallocz(2 * frontier * sizeof(**c->nodep_buf));
+            c->paths[i] = av_mallocz_array(max_paths, sizeof(**c->paths));
+            c->node_buf[i] = av_mallocz_array(frontier, 2 * sizeof(**c->node_buf));
+            c->nodep_buf[i] = av_mallocz_array(frontier, 2 * sizeof(**c->nodep_buf));
             if (!c->paths[i] || !c->node_buf[i] || !c->nodep_buf[i]) {
                 ret = AVERROR(ENOMEM);
                 goto error;
@@ -238,7 +234,7 @@ static void g722_encode_trellis(G722Context *c, int trellis,
                     continue;\
                 if (heap_pos[index] < frontier) {\
                     pos = heap_pos[index]++;\
-                    assert(pathn[index] < FREEZE_INTERVAL * frontier);\
+                    av_assert2(pathn[index] < FREEZE_INTERVAL * frontier);\
                     node = nodes_next[index][pos] = next[index]++;\
                     node->path = pathn[index]++;\
                 } else {\
@@ -357,10 +353,8 @@ static int g722_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int nb_samples, out_size, ret;
 
     out_size = (frame->nb_samples + 1) / 2;
-    if ((ret = ff_alloc_packet(avpkt, out_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
 
     nb_samples = frame->nb_samples - (frame->nb_samples & 1);
 
@@ -382,15 +376,15 @@ static int g722_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 }
 
 AVCodec ff_adpcm_g722_encoder = {
-    .name           = "g722",
-    .long_name      = NULL_IF_CONFIG_SMALL("G.722 ADPCM"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_ADPCM_G722,
-    .priv_data_size = sizeof(G722Context),
-    .init           = g722_encode_init,
-    .close          = g722_encode_close,
-    .encode2        = g722_encode_frame,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
-    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                     AV_SAMPLE_FMT_NONE },
+    .name            = "g722",
+    .long_name       = NULL_IF_CONFIG_SMALL("G.722 ADPCM"),
+    .type            = AVMEDIA_TYPE_AUDIO,
+    .id              = AV_CODEC_ID_ADPCM_G722,
+    .priv_data_size  = sizeof(G722Context),
+    .init            = g722_encode_init,
+    .close           = g722_encode_close,
+    .encode2         = g722_encode_frame,
+    .capabilities    = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .sample_fmts     = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .channel_layouts = (const uint64_t[]){ AV_CH_LAYOUT_MONO, 0 },
 };
diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 3d45f9d..1deff49 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,11 +37,11 @@ int ff_g723_1_scale_vector(int16_t *dst, const int16_t *vector, int length)
     for (i = 0; i < length; i++)
         max |= FFABS(vector[i]);
 
-    max  = FFMIN(max, 0x7FFF);
-    bits = ff_g723_1_normalize_bits(max, 15);
+    bits= 14 - av_log2_16bit(max);
+    bits= FFMAX(bits, 0);
 
     for (i = 0; i < length; i++)
-        dst[i] = vector[i] << bits >> 3;
+        dst[i] = (vector[i] * (1 << bits)) >> 3;
 
     return bits - 3;
 }
@@ -97,16 +97,16 @@ void ff_g723_1_gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
     ff_g723_1_get_residual(residual, prev_excitation, lag);
 
     /* Select quantization table */
-    if (cur_rate == RATE_6300 && pitch_lag < SUBFRAME_LEN - 2)
+    if (cur_rate == RATE_6300 && pitch_lag < SUBFRAME_LEN - 2) {
         cb_ptr = adaptive_cb_gain85;
-    else
+    } else
         cb_ptr = adaptive_cb_gain170;
 
     /* Calculate adaptive vector */
     cb_ptr += subfrm->ad_cb_gain * 20;
     for (i = 0; i < SUBFRAME_LEN; i++) {
-        sum       = ff_g723_1_dot_product(residual + i, cb_ptr, PITCH_ORDER);
-        vector[i] = av_sat_dadd32(1 << 15, sum) >> 16;
+        sum = ff_dot_product(residual + i, cb_ptr, PITCH_ORDER);
+        vector[i] = av_sat_dadd32(1 << 15, av_sat_add32(sum, sum)) >> 16;
     }
 }
 
@@ -123,11 +123,11 @@ static void lsp2lpc(int16_t *lpc)
 
     /* Calculate negative cosine */
     for (j = 0; j < LPC_ORDER; j++) {
-        int index  = (lpc[j] >> 7) & 0x1FF;
-        int offset = lpc[j] & 0x7f;
-        int temp1  = cos_tab[index] << 16;
-        int temp2  = (cos_tab[index + 1] - cos_tab[index]) *
-                     ((offset << 8) + 0x80) << 1;
+        int index     = (lpc[j] >> 7) & 0x1FF;
+        int offset    = lpc[j] & 0x7f;
+        int temp1     = cos_tab[index] * (1 << 16);
+        int temp2     = (cos_tab[index + 1] - cos_tab[index]) *
+                          (((offset << 8) + 0x80) << 1);
 
         lpc[j] = -(av_sat_dadd32(1 << 15, temp1 + temp2) >> 16);
     }
@@ -138,11 +138,11 @@ static void lsp2lpc(int16_t *lpc)
      */
     /* Initialize with values in Q28 */
     f1[0] = 1 << 28;
-    f1[1] = (lpc[0] << 14) + (lpc[2] << 14);
+    f1[1] = (lpc[0] + lpc[2]) * (1 << 14);
     f1[2] = lpc[0] * lpc[2] + (2 << 28);
 
     f2[0] = 1 << 28;
-    f2[1] = (lpc[1] << 14) + (lpc[3] << 14);
+    f2[1] = (lpc[1] + lpc[3]) * (1 << 14);
     f2[2] = lpc[1] * lpc[3] + (2 << 28);
 
     /*
@@ -150,8 +150,8 @@ static void lsp2lpc(int16_t *lpc)
      * each iteration for a final scaling factor of Q25
      */
     for (i = 2; i < LPC_ORDER / 2; i++) {
-        f1[i + 1] = f1[i - 1] + MULL2(f1[i], lpc[2 * i]);
-        f2[i + 1] = f2[i - 1] + MULL2(f2[i], lpc[2 * i + 1]);
+        f1[i + 1] = av_clipl_int32(f1[i - 1] + (int64_t)MULL2(f1[i], lpc[2 * i]));
+        f2[i + 1] = av_clipl_int32(f2[i - 1] + (int64_t)MULL2(f2[i], lpc[2 * i + 1]));
 
         for (j = i; j >= 2; j--) {
             f1[j] = MULL2(f1[j - 1], lpc[2 * i]) +
@@ -162,8 +162,8 @@ static void lsp2lpc(int16_t *lpc)
 
         f1[0] >>= 1;
         f2[0] >>= 1;
-        f1[1]   = ((lpc[2 * i]     << 16 >> i) + f1[1]) >> 1;
-        f2[1]   = ((lpc[2 * i + 1] << 16 >> i) + f2[1]) >> 1;
+        f1[1] = ((lpc[2 * i]     * 65536 >> i) + f1[1]) >> 1;
+        f2[1] = ((lpc[2 * i + 1] * 65536 >> i) + f2[1]) >> 1;
     }
 
     /* Convert polynomial coefficients to LPC coefficients */
@@ -171,9 +171,8 @@ static void lsp2lpc(int16_t *lpc)
         int64_t ff1 = f1[i + 1] + f1[i];
         int64_t ff2 = f2[i + 1] - f2[i];
 
-        lpc[i]                 = av_clipl_int32(((ff1 + ff2) << 3) +
-                                                (1 << 15)) >> 16;
-        lpc[LPC_ORDER - i - 1] = av_clipl_int32(((ff1 - ff2) << 3) +
+        lpc[i] = av_clipl_int32(((ff1 + ff2) * 8) + (1 << 15)) >> 16;
+        lpc[LPC_ORDER - i - 1] = av_clipl_int32(((ff1 - ff2) * 8) +
                                                 (1 << 15)) >> 16;
     }
 }
@@ -234,7 +233,7 @@ void ff_g723_1_inverse_quant(int16_t *cur_lsp, int16_t *prev_lsp,
     }
 
     for (i = 0; i < LPC_ORDER; i++) {
-        cur_lsp[0]             = FFMAX(cur_lsp[0], 0x180);
+        cur_lsp[0]             = FFMAX(cur_lsp[0],  0x180);
         cur_lsp[LPC_ORDER - 1] = FFMIN(cur_lsp[LPC_ORDER - 1], 0x7e00);
 
         /* Stability check */
diff --git a/libavcodec/g723_1.h b/libavcodec/g723_1.h
index 166d897..d60d481 100644
--- a/libavcodec/g723_1.h
+++ b/libavcodec/g723_1.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,7 +55,7 @@
  * @param b 16 bit multiplier
  */
 #define MULL2(a, b) \
-        ((((a) >> 16) * (b) << 1) + (((a) & 0xffff) * (b) >> 15))
+        ((((a) >> 16) * (b) * 2) + (((a) & 0xffff) * (b) >> 15))
 
 /**
  * G723.1 frame types
@@ -116,9 +116,7 @@ typedef struct FCBParam {
     int pulse_sign[PULSE_MAX];
 } FCBParam;
 
-typedef struct g723_1_context {
-    AVClass *class;
-
+typedef struct G723_1_ChannelContext {
     G723_1_Subframe subframe[4];
     enum FrameType cur_frame_type;
     enum FrameType past_frame_type;
@@ -142,9 +140,8 @@ typedef struct g723_1_context {
     int sid_gain;
     int cur_gain;
     int reflection_coef;
-    int pf_gain;
-    int postfilter;
-
+    int pf_gain;                 ///< formant postfilter
+                                 ///< gain scaling unit memory
     int16_t audio[FRAME_LEN + LPC_ORDER + PITCH_MAX + 4];
 
     /* encoder */
@@ -157,6 +154,13 @@ typedef struct g723_1_context {
     int16_t perf_iir_mem[LPC_ORDER];       ///< and iir memories
 
     int16_t harmonic_mem[PITCH_MAX];
+} G723_1_ChannelContext;
+
+typedef struct G723_1_Context {
+    AVClass *class;
+    int postfilter;
+
+    G723_1_ChannelContext ch[2];
 } G723_1_Context;
 
 
@@ -215,16 +219,27 @@ void ff_g723_1_lsp_interpolate(int16_t *lpc, int16_t *cur_lsp,
 void ff_g723_1_inverse_quant(int16_t *cur_lsp, int16_t *prev_lsp,
                              uint8_t *lsp_index, int bad_frame);
 
-
 static const uint8_t frame_size[4] = { 24, 20, 4, 1 };
 
-/* Postfilter gain weighting factors scaled by 2^15 */
-static const int16_t ppf_gain_weight[2] = { 0x1800, 0x2000 };
+/**
+ * Postfilter gain weighting factors scaled by 2^15
+ */
+static const int16_t ppf_gain_weight[2] = {0x1800, 0x2000};
 
-/* LSP DC component */
+/**
+ * LSP DC component
+ */
 static const int16_t dc_lsp[LPC_ORDER] = {
-    0x0c3b, 0x1271, 0x1e0a, 0x2a36, 0x3630,
-    0x406f, 0x4d28, 0x56f4, 0x638c, 0x6c46
+    0x0c3b,
+    0x1271,
+    0x1e0a,
+    0x2a36,
+    0x3630,
+    0x406f,
+    0x4d28,
+    0x56f4,
+    0x638c,
+    0x6c46
 };
 
 /* Cosine table scaled by 2^14 */
@@ -296,7 +311,9 @@ static const int16_t cos_tab[COS_TBL_SIZE + 1] = {
     16384
 };
 
-/* LSP VQ tables */
+/**
+ *  LSP VQ tables
+ */
 static const int16_t lsp_band0[LSP_CB_SIZE][3] = {
     {    0,      0,      0}, { -270,  -1372,  -1032}, { -541,  -1650,  -1382},
     { -723,  -2011,  -2213}, { -941,  -1122,  -1942}, { -780,  -1145,  -2454},
@@ -606,12 +623,12 @@ static const int16_t lsp_band2[LSP_CB_SIZE][4] = {
     { 3633,   2336,   2408,   1453}, { 2923,   3517,   2567,   1318},
 };
 
-/*
+/**
  * Used for the coding/decoding of the pulses positions
  * for the MP-MLQ codebook
  */
 static const int32_t combinatorial_table[PULSE_MAX][SUBFRAME_LEN/GRID_SIZE] = {
-    {118755, 98280, 80730, 65780L, 53130,
+    {118755, 98280, 80730,  65780, 53130,
       42504, 33649, 26334,  20349, 15504,
       11628,  8568,  6188,   4368,  3003,
        2002,  1287,   792,    462,   252,
@@ -700,10 +717,14 @@ static const int16_t pitch_contrib[340] = {
     -2, 25144,  0, 17998
 };
 
-/* Number of non-zero pulses in the MP-MLQ excitation */
+/**
+ * Number of non-zero pulses in the MP-MLQ excitation
+ */
 static const int8_t pulses[4] = {6, 5, 6, 5};
 
-/* Size of the MP-MLQ fixed excitation codebooks */
+/**
+ * Size of the MP-MLQ fixed excitation codebooks
+ */
 static const int32_t max_pos[4] = {593775, 142506, 593775, 142506};
 
 static const int16_t fixed_cb_gain[GAIN_LEVELS] = {
@@ -1356,15 +1377,16 @@ static const int16_t adaptive_cb_gain170[170 * 20] = {
     -4534,  -2487,  -3932,  -4166,  -2113,  -3341,  -3540,  -3070
 };
 
-/* 0.65^i (Zero part) and 0.75^i (Pole part) scaled by 2^15 */
+/**
+ * 0.65^i (Zero part) and 0.75^i (Pole part) scaled by 2^15
+ */
 static const int16_t postfilter_tbl[2][LPC_ORDER] = {
     /* Zero */
-    { 21299, 13844,  8999,  5849, 3802, 2471, 1606, 1044,  679,  441 },
+    {21299, 13844,  8999,  5849, 3802, 2471, 1606, 1044,  679,  441},
     /* Pole */
-    { 24576, 18432, 13824, 10368, 7776, 5832, 4374, 3281, 2460, 1845 }
+    {24576, 18432, 13824, 10368, 7776, 5832, 4374, 3281, 2460, 1845}
 };
 
-
 /**
  * Hamming window coefficients scaled by 2^15
  */
diff --git a/libavcodec/g723_1_parser.c b/libavcodec/g723_1_parser.c
new file mode 100644
index 0000000..0305ca3
--- /dev/null
+++ b/libavcodec/g723_1_parser.c
@@ -0,0 +1,60 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * G723_1 audio parser
+ */
+
+#include "parser.h"
+#include "g723_1.h"
+
+typedef struct G723_1ParseContext {
+    ParseContext pc;
+} G723_1ParseContext;
+
+static int g723_1_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                        const uint8_t **poutbuf, int *poutbuf_size,
+                        const uint8_t *buf, int buf_size)
+{
+    G723_1ParseContext *s = s1->priv_data;
+    ParseContext *pc = &s->pc;
+    int next = END_NOT_FOUND;
+
+    if (buf_size > 0)
+        next = frame_size[buf[0] & 3] * FFMAX(1, avctx->channels);
+
+    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0 || !buf_size) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+        return buf_size;
+    }
+
+    s1->duration = 240;
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_g723_1_parser = {
+    .codec_ids      = { AV_CODEC_ID_G723_1 },
+    .priv_data_size = sizeof(G723_1ParseContext),
+    .parser_parse   = g723_1_parse,
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/g723_1dec.c b/libavcodec/g723_1dec.c
index 0cb5ba7..d8bc3f9 100644
--- a/libavcodec/g723_1dec.c
+++ b/libavcodec/g723_1dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,9 @@
 #define BITSTREAM_READER_LE
 #include "acelp_vectors.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "celp_filters.h"
+#include "celp_math.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "g723_1.h"
 
@@ -41,19 +42,25 @@
 
 static av_cold int g723_1_decode_init(AVCodecContext *avctx)
 {
-    G723_1_Context *p = avctx->priv_data;
+    G723_1_Context *s = avctx->priv_data;
 
-    avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
-    avctx->channels       = 1;
-    avctx->sample_rate    = 8000;
-    p->pf_gain            = 1 << 12;
+    avctx->sample_fmt     = AV_SAMPLE_FMT_S16P;
+    if (avctx->channels < 1 || avctx->channels > 2) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo are supported (requested channels: %d).\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+    avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO;
+    for (int ch = 0; ch < avctx->channels; ch++) {
+        G723_1_ChannelContext *p = &s->ch[ch];
+
+        p->pf_gain = 1 << 12;
 
-    memcpy(p->prev_lsp, dc_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
-    memcpy(p->sid_lsp,  dc_lsp, LPC_ORDER * sizeof(*p->sid_lsp));
+        memcpy(p->prev_lsp, dc_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
+        memcpy(p->sid_lsp,  dc_lsp, LPC_ORDER * sizeof(*p->sid_lsp));
 
-    p->cng_random_seed = CNG_RANDOM_SEED;
-    p->past_frame_type = SID_FRAME;
+        p->cng_random_seed = CNG_RANDOM_SEED;
+        p->past_frame_type = SID_FRAME;
+    }
 
     return 0;
 }
@@ -65,17 +72,20 @@ static av_cold int g723_1_decode_init(AVCodecContext *avctx)
  * @param buf         pointer to the input buffer
  * @param buf_size    size of the input buffer
  */
-static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
+static int unpack_bitstream(G723_1_ChannelContext *p, const uint8_t *buf,
                             int buf_size)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int ad_cb_len;
     int temp, info_bits, i;
+    int ret;
 
-    bitstream_init8(&bc, buf, buf_size);
+    ret = init_get_bits8(&gb, buf, buf_size);
+    if (ret < 0)
+        return ret;
 
     /* Extract frame type and rate info */
-    info_bits = bitstream_read(&bc, 2);
+    info_bits = get_bits(&gb, 2);
 
     if (info_bits == 3) {
         p->cur_frame_type = UNTRANSMITTED_FRAME;
@@ -83,13 +93,13 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
     }
 
     /* Extract 24 bit lsp indices, 8 bit for each band */
-    p->lsp_index[2] = bitstream_read(&bc, 8);
-    p->lsp_index[1] = bitstream_read(&bc, 8);
-    p->lsp_index[0] = bitstream_read(&bc, 8);
+    p->lsp_index[2] = get_bits(&gb, 8);
+    p->lsp_index[1] = get_bits(&gb, 8);
+    p->lsp_index[0] = get_bits(&gb, 8);
 
     if (info_bits == 2) {
         p->cur_frame_type = SID_FRAME;
-        p->subframe[0].amp_index = bitstream_read(&bc, 6);
+        p->subframe[0].amp_index = get_bits(&gb, 6);
         return 0;
     }
 
@@ -97,23 +107,23 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
     p->cur_rate       = info_bits ? RATE_5300 : RATE_6300;
     p->cur_frame_type = ACTIVE_FRAME;
 
-    p->pitch_lag[0] = bitstream_read(&bc, 7);
+    p->pitch_lag[0] = get_bits(&gb, 7);
     if (p->pitch_lag[0] > 123)       /* test if forbidden code */
         return -1;
     p->pitch_lag[0] += PITCH_MIN;
-    p->subframe[1].ad_cb_lag = bitstream_read(&bc, 2);
+    p->subframe[1].ad_cb_lag = get_bits(&gb, 2);
 
-    p->pitch_lag[1] = bitstream_read(&bc, 7);
+    p->pitch_lag[1] = get_bits(&gb, 7);
     if (p->pitch_lag[1] > 123)
         return -1;
     p->pitch_lag[1] += PITCH_MIN;
-    p->subframe[3].ad_cb_lag = bitstream_read(&bc, 2);
+    p->subframe[3].ad_cb_lag = get_bits(&gb, 2);
     p->subframe[0].ad_cb_lag = 1;
     p->subframe[2].ad_cb_lag = 1;
 
     for (i = 0; i < SUBFRAMES; i++) {
         /* Extract combined gain */
-        temp = bitstream_read(&bc, 12);
+        temp = get_bits(&gb, 12);
         ad_cb_len = 170;
         p->subframe[i].dirac_train = 0;
         if (p->cur_rate == RATE_6300 && p->pitch_lag[i >> 1] < SUBFRAME_LEN - 2) {
@@ -130,16 +140,16 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
         }
     }
 
-    p->subframe[0].grid_index = bitstream_read(&bc, 1);
-    p->subframe[1].grid_index = bitstream_read(&bc, 1);
-    p->subframe[2].grid_index = bitstream_read(&bc, 1);
-    p->subframe[3].grid_index = bitstream_read(&bc, 1);
+    p->subframe[0].grid_index = get_bits1(&gb);
+    p->subframe[1].grid_index = get_bits1(&gb);
+    p->subframe[2].grid_index = get_bits1(&gb);
+    p->subframe[3].grid_index = get_bits1(&gb);
 
     if (p->cur_rate == RATE_6300) {
-        bitstream_skip(&bc, 1); /* skip reserved bit */
+        skip_bits1(&gb);  /* skip reserved bit */
 
         /* Compute pulse_pos index using the 13-bit combined position index */
-        temp = bitstream_read(&bc, 13);
+        temp = get_bits(&gb, 13);
         p->subframe[0].pulse_pos = temp / 810;
 
         temp -= p->subframe[0].pulse_pos * 810;
@@ -150,28 +160,28 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
         p->subframe[3].pulse_pos = temp - p->subframe[2].pulse_pos * 9;
 
         p->subframe[0].pulse_pos = (p->subframe[0].pulse_pos << 16) +
-                                   bitstream_read(&bc, 16);
+                                   get_bits(&gb, 16);
         p->subframe[1].pulse_pos = (p->subframe[1].pulse_pos << 14) +
-                                   bitstream_read(&bc, 14);
+                                   get_bits(&gb, 14);
         p->subframe[2].pulse_pos = (p->subframe[2].pulse_pos << 16) +
-                                   bitstream_read(&bc, 16);
+                                   get_bits(&gb, 16);
         p->subframe[3].pulse_pos = (p->subframe[3].pulse_pos << 14) +
-                                   bitstream_read(&bc, 14);
+                                   get_bits(&gb, 14);
 
-        p->subframe[0].pulse_sign = bitstream_read(&bc, 6);
-        p->subframe[1].pulse_sign = bitstream_read(&bc, 5);
-        p->subframe[2].pulse_sign = bitstream_read(&bc, 6);
-        p->subframe[3].pulse_sign = bitstream_read(&bc, 5);
+        p->subframe[0].pulse_sign = get_bits(&gb, 6);
+        p->subframe[1].pulse_sign = get_bits(&gb, 5);
+        p->subframe[2].pulse_sign = get_bits(&gb, 6);
+        p->subframe[3].pulse_sign = get_bits(&gb, 5);
     } else { /* 5300 bps */
-        p->subframe[0].pulse_pos  = bitstream_read(&bc, 12);
-        p->subframe[1].pulse_pos  = bitstream_read(&bc, 12);
-        p->subframe[2].pulse_pos  = bitstream_read(&bc, 12);
-        p->subframe[3].pulse_pos  = bitstream_read(&bc, 12);
-
-        p->subframe[0].pulse_sign = bitstream_read(&bc, 4);
-        p->subframe[1].pulse_sign = bitstream_read(&bc, 4);
-        p->subframe[2].pulse_sign = bitstream_read(&bc, 4);
-        p->subframe[3].pulse_sign = bitstream_read(&bc, 4);
+        p->subframe[0].pulse_pos  = get_bits(&gb, 12);
+        p->subframe[1].pulse_pos  = get_bits(&gb, 12);
+        p->subframe[2].pulse_pos  = get_bits(&gb, 12);
+        p->subframe[3].pulse_pos  = get_bits(&gb, 12);
+
+        p->subframe[0].pulse_sign = get_bits(&gb, 4);
+        p->subframe[1].pulse_sign = get_bits(&gb, 4);
+        p->subframe[2].pulse_sign = get_bits(&gb, 4);
+        p->subframe[3].pulse_sign = get_bits(&gb, 4);
     }
 
     return 0;
@@ -180,31 +190,14 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
 /**
  * Bitexact implementation of sqrt(val/2).
  */
-static int16_t square_root(int val)
+static int16_t square_root(unsigned val)
 {
-    int16_t res = 0;
-    int16_t exp = 0x4000;
-    int i;
+    av_assert2(!(val & 0x80000000));
 
-    for (i = 0; i < 14; i ++) {
-        int res_exp = res + exp;
-        if (val >= res_exp * res_exp << 1)
-            res += exp;
-        exp >>= 1;
-    }
-    return res;
+    return (ff_sqrt(val << 1) >> 1) & (~1);
 }
 
 /**
- * Bitexact implementation of 2ab scaled by 1/2^16.
- *
- * @param a 32 bit multiplicand
- * @param b 16 bit multiplier
- */
-#define MULL2(a, b) \
-        ((((a) >> 16) * (b) << 1) + (((a) & 0xffff) * (b) >> 15))
-
-/**
  * Generate fixed codebook excitation vector.
  *
  * @param vector    decoded excitation vector
@@ -361,7 +354,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate,
  * @param ppf       pitch postfilter parameters
  * @param cur_rate  current bitrate
  */
-static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
+static void comp_ppf_coeff(G723_1_ChannelContext *p, int offset, int pitch_lag,
                            PPFParam *ppf, enum Rate cur_rate)
 {
 
@@ -447,7 +440,7 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
  *
  * @return residual interpolation index if voiced, 0 otherwise
  */
-static int comp_interp_index(G723_1_Context *p, int pitch_lag,
+static int comp_interp_index(G723_1_ChannelContext *p, int pitch_lag,
                              int *exc_eng, int *scale)
 {
     int offset = PITCH_MAX + 2 * SUBFRAME_LEN;
@@ -477,9 +470,9 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
 
     temp = best_eng * *exc_eng >> 3;
 
-    if (temp < ccr * ccr)
+    if (temp < ccr * ccr) {
         return index;
-    else
+    } else
         return 0;
 }
 
@@ -505,7 +498,7 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag,
                           (FRAME_LEN - lag) * sizeof(*out));
     } else {  /* Unvoiced */
         for (i = 0; i < FRAME_LEN; i++) {
-            *rseed = *rseed * 521 + 259;
+            *rseed = (int16_t)(*rseed * 521 + 259);
             out[i] = gain * *rseed >> 15;
         }
         memset(buf, 0, (FRAME_LEN + PITCH_MAX) * sizeof(*buf));
@@ -519,21 +512,24 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag,
  * @param iir_coef IIR coefficients
  * @param src      source vector
  * @param dest     destination vector
+ * @param width    width of the output, 16 bits(0) / 32 bits(1)
  */
-static void iir_filter(int16_t *fir_coef, int16_t *iir_coef,
-                       int16_t *src, int *dest)
-{
-    int m, n;
-
-    for (m = 0; m < SUBFRAME_LEN; m++) {
-        int64_t filter = 0;
-        for (n = 1; n <= LPC_ORDER; n++) {
-            filter -= fir_coef[n - 1] * src[m - n] -
-                      iir_coef[n - 1] * (dest[m - n] >> 16);
-        }
-
-        dest[m] = av_clipl_int32((src[m] << 16) + (filter << 3) + (1 << 15));
-    }
+#define iir_filter(fir_coef, iir_coef, src, dest, width)\
+{\
+    int m, n;\
+    int res_shift = 16 & ~-(width);\
+    int in_shift  = 16 - res_shift;\
+\
+    for (m = 0; m < SUBFRAME_LEN; m++) {\
+        int64_t filter = 0;\
+        for (n = 1; n <= LPC_ORDER; n++) {\
+            filter -= (fir_coef)[n - 1] * (src)[m - n] -\
+                      (iir_coef)[n - 1] * ((dest)[m - n] >> in_shift);\
+        }\
+\
+        (dest)[m] = av_clipl_int32(((src)[m] * 65536) + (filter * 8) +\
+                                   (1 << 15)) >> res_shift;\
+    }\
 }
 
 /**
@@ -543,7 +539,7 @@ static void iir_filter(int16_t *fir_coef, int16_t *iir_coef,
  * @param buf    postfiltered output vector
  * @param energy input energy coefficient
  */
-static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
+static void gain_scale(G723_1_ChannelContext *p, int16_t * buf, int energy)
 {
     int num, denom, gain, bits1, bits2;
     int i;
@@ -563,7 +559,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
         denom <<= bits2;
 
         bits2 = 5 + bits1 - bits2;
-        bits2 = FFMAX(0, bits2);
+        bits2 = av_clip_uintp2(bits2, 5);
 
         gain = (num >> 1) / (denom >> 16);
         gain = square_root(gain << 16 >> bits2);
@@ -586,7 +582,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
  * @param buf input buffer
  * @param dst output buffer
  */
-static void formant_postfilter(G723_1_Context *p, int16_t *lpc,
+static void formant_postfilter(G723_1_ChannelContext *p, int16_t *lpc,
                                int16_t *buf, int16_t *dst)
 {
     int16_t filter_coef[2][LPC_ORDER];
@@ -603,13 +599,12 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc,
             filter_coef[1][k] = (-lpc[k] * postfilter_tbl[1][k] +
                                  (1 << 14)) >> 15;
         }
-        iir_filter(filter_coef[0], filter_coef[1], buf + i, filter_signal + i);
+        iir_filter(filter_coef[0], filter_coef[1], buf + i, filter_signal + i, 1);
         lpc += LPC_ORDER;
     }
 
-    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(*p->fir_mem));
-    memcpy(p->iir_mem, filter_signal + FRAME_LEN,
-           LPC_ORDER * sizeof(*p->iir_mem));
+    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(int16_t));
+    memcpy(p->iir_mem, filter_signal + FRAME_LEN, LPC_ORDER * sizeof(int));
 
     buf += LPC_ORDER;
     signal_ptr = filter_signal + LPC_ORDER;
@@ -670,16 +665,22 @@ static inline int cng_rand(int *state, int base)
     return (*state & 0x7FFF) * base >> 15;
 }
 
-static int estimate_sid_gain(G723_1_Context *p)
+static int estimate_sid_gain(G723_1_ChannelContext *p)
 {
     int i, shift, seg, seg2, t, val, val_add, x, y;
 
     shift = 16 - p->cur_gain * 2;
-    if (shift > 0)
-        t = p->sid_gain << shift;
-    else
+    if (shift > 0) {
+        if (p->sid_gain == 0) {
+            t = 0;
+        } else if (shift >= 31 || (int32_t)((uint32_t)p->sid_gain << shift) >> shift != p->sid_gain) {
+            if (p->sid_gain < 0) t = INT32_MIN;
+            else                 t = INT32_MAX;
+        } else
+            t = p->sid_gain << shift;
+    }else
         t = p->sid_gain >> -shift;
-    x = t * cng_filt[0] >> 16;
+    x = av_clipl_int32(t * (int64_t)cng_filt[0] >> 16);
 
     if (x >= cng_bseg[2])
         return 0x3F;
@@ -710,13 +711,13 @@ static int estimate_sid_gain(G723_1_Context *p)
     if (y <= 0) {
         t = seg * 32 + (val + 1 << seg2);
         t = t * t - x;
-        val = (seg2 - 1 << 4) + val;
+        val = (seg2 - 1) * 16 + val;
         if (t >= y)
             val++;
     } else {
         t = seg * 32 + (val - 1 << seg2);
         t = t * t - x;
-        val = (seg2 - 1 << 4) + val;
+        val = (seg2 - 1) * 16 + val;
         if (t >= y)
             val--;
     }
@@ -724,7 +725,7 @@ static int estimate_sid_gain(G723_1_Context *p)
     return val;
 }
 
-static void generate_noise(G723_1_Context *p)
+static void generate_noise(G723_1_ChannelContext *p)
 {
     int i, j, idx, t;
     int off[SUBFRAMES];
@@ -748,7 +749,7 @@ static void generate_noise(G723_1_Context *p)
         off[i * 2 + 1] = ((t >> 1) & 1) + SUBFRAME_LEN;
         t >>= 2;
         for (j = 0; j < 11; j++) {
-            signs[i * 11 + j] = (t & 1) * 2 - 1 << 14;
+            signs[i * 11 + j] = ((t & 1) * 2 - 1)  * (1 << 14);
             t >>= 1;
         }
     }
@@ -792,7 +793,7 @@ static void generate_noise(G723_1_Context *p)
         sum = 0;
         if (shift < 0) {
            for (j = 0; j < SUBFRAME_LEN * 2; j++) {
-               t      = vector_ptr[j] << -shift;
+               t      = vector_ptr[j] * (1 << -shift);
                sum   += t * t;
                tmp[j] = t;
            }
@@ -830,7 +831,7 @@ static void generate_noise(G723_1_Context *p)
         if (shift < 0)
            x >>= -shift;
         else
-           x <<= shift;
+           x *= 1 << shift;
         x = av_clip(x, -10000, 10000);
 
         for (j = 0; j < 11; j++) {
@@ -852,7 +853,7 @@ static void generate_noise(G723_1_Context *p)
 static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                int *got_frame_ptr, AVPacket *avpkt)
 {
-    G723_1_Context *p  = avctx->priv_data;
+    G723_1_Context *s  = avctx->priv_data;
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
@@ -864,9 +865,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     int16_t acb_vector[SUBFRAME_LEN];
     int16_t *out;
     int bad_frame = 0, i, j, ret;
-    int16_t *audio = p->audio;
 
-    if (buf_size < frame_size[dec_mode]) {
+    if (buf_size < frame_size[dec_mode] * avctx->channels) {
         if (buf_size)
             av_log(avctx, AV_LOG_WARNING,
                    "Expected %d bytes, got %d - skipping packet\n",
@@ -875,151 +875,155 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         return buf_size;
     }
 
-    if (unpack_bitstream(p, buf, buf_size) < 0) {
-        bad_frame = 1;
-        if (p->past_frame_type == ACTIVE_FRAME)
-            p->cur_frame_type = ACTIVE_FRAME;
-        else
-            p->cur_frame_type = UNTRANSMITTED_FRAME;
-    }
-
     frame->nb_samples = FRAME_LEN;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-         return ret;
-    }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (int ch = 0; ch < avctx->channels; ch++) {
+        G723_1_ChannelContext *p = &s->ch[ch];
+        int16_t *audio = p->audio;
+
+        if (unpack_bitstream(p, buf + ch * (buf_size / avctx->channels),
+                             buf_size / avctx->channels) < 0) {
+            bad_frame = 1;
+            if (p->past_frame_type == ACTIVE_FRAME)
+                p->cur_frame_type = ACTIVE_FRAME;
+            else
+                p->cur_frame_type = UNTRANSMITTED_FRAME;
+        }
 
-    out = (int16_t *)frame->data[0];
-
-    if (p->cur_frame_type == ACTIVE_FRAME) {
-        if (!bad_frame)
-            p->erased_frames = 0;
-        else if (p->erased_frames != 3)
-            p->erased_frames++;
-
-        ff_g723_1_inverse_quant(cur_lsp, p->prev_lsp, p->lsp_index, bad_frame);
-        ff_g723_1_lsp_interpolate(lpc, cur_lsp, p->prev_lsp);
-
-        /* Save the lsp_vector for the next frame */
-        memcpy(p->prev_lsp, cur_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
-
-        /* Generate the excitation for the frame */
-        memcpy(p->excitation, p->prev_excitation,
-               PITCH_MAX * sizeof(*p->excitation));
-        if (!p->erased_frames) {
-            int16_t *vector_ptr = p->excitation + PITCH_MAX;
-
-            /* Update interpolation gain memory */
-            p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index +
-                                            p->subframe[3].amp_index) >> 1];
-            for (i = 0; i < SUBFRAMES; i++) {
-                gen_fcb_excitation(vector_ptr, &p->subframe[i], p->cur_rate,
-                                   p->pitch_lag[i >> 1], i);
-                ff_g723_1_gen_acb_excitation(acb_vector,
-                                             &p->excitation[SUBFRAME_LEN * i],
-                                             p->pitch_lag[i >> 1],
-                                             &p->subframe[i], p->cur_rate);
-                /* Get the total excitation */
-                for (j = 0; j < SUBFRAME_LEN; j++) {
-                    int v = av_clip_int16(vector_ptr[j] << 1);
-                    vector_ptr[j] = av_clip_int16(v + acb_vector[j]);
-                }
-                vector_ptr += SUBFRAME_LEN;
-            }
+        out = (int16_t *)frame->extended_data[ch];
 
-            vector_ptr = p->excitation + PITCH_MAX;
-
-            p->interp_index = comp_interp_index(p, p->pitch_lag[1],
-                                                &p->sid_gain, &p->cur_gain);
-
-            /* Perform pitch postfiltering */
-            if (p->postfilter) {
-                i = PITCH_MAX;
-                for (j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
-                    comp_ppf_coeff(p, i, p->pitch_lag[j >> 1],
-                                   ppf + j, p->cur_rate);
-
-                for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
-                    ff_acelp_weighted_vector_sum(p->audio + LPC_ORDER + i,
-                                                 vector_ptr + i,
-                                                 vector_ptr + i + ppf[j].index,
-                                                 ppf[j].sc_gain,
-                                                 ppf[j].opt_gain,
-                                                 1 << 14, 15, SUBFRAME_LEN);
-            } else {
-                audio = vector_ptr - LPC_ORDER;
-            }
+        if (p->cur_frame_type == ACTIVE_FRAME) {
+            if (!bad_frame)
+                p->erased_frames = 0;
+            else if (p->erased_frames != 3)
+                p->erased_frames++;
+
+            ff_g723_1_inverse_quant(cur_lsp, p->prev_lsp, p->lsp_index, bad_frame);
+            ff_g723_1_lsp_interpolate(lpc, cur_lsp, p->prev_lsp);
+
+            /* Save the lsp_vector for the next frame */
+            memcpy(p->prev_lsp, cur_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
 
-            /* Save the excitation for the next frame */
-            memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
+            /* Generate the excitation for the frame */
+            memcpy(p->excitation, p->prev_excitation,
                    PITCH_MAX * sizeof(*p->excitation));
-        } else {
-            p->interp_gain = (p->interp_gain * 3 + 2) >> 2;
-            if (p->erased_frames == 3) {
-                /* Mute output */
-                memset(p->excitation, 0,
-                       (FRAME_LEN + PITCH_MAX) * sizeof(*p->excitation));
-                memset(p->prev_excitation, 0,
-                       PITCH_MAX * sizeof(*p->excitation));
-                memset(frame->data[0], 0,
-                       (FRAME_LEN + LPC_ORDER) * sizeof(int16_t));
-            } else {
-                int16_t *buf = p->audio + LPC_ORDER;
+            if (!p->erased_frames) {
+                int16_t *vector_ptr = p->excitation + PITCH_MAX;
+
+                /* Update interpolation gain memory */
+                p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index +
+                                                p->subframe[3].amp_index) >> 1];
+                for (i = 0; i < SUBFRAMES; i++) {
+                    gen_fcb_excitation(vector_ptr, &p->subframe[i], p->cur_rate,
+                                       p->pitch_lag[i >> 1], i);
+                    ff_g723_1_gen_acb_excitation(acb_vector,
+                                                 &p->excitation[SUBFRAME_LEN * i],
+                                                 p->pitch_lag[i >> 1],
+                                                 &p->subframe[i], p->cur_rate);
+                    /* Get the total excitation */
+                    for (j = 0; j < SUBFRAME_LEN; j++) {
+                        int v = av_clip_int16(vector_ptr[j] * 2);
+                        vector_ptr[j] = av_clip_int16(v + acb_vector[j]);
+                    }
+                    vector_ptr += SUBFRAME_LEN;
+                }
 
-                /* Regenerate frame */
-                residual_interp(p->excitation, buf, p->interp_index,
-                                p->interp_gain, &p->random_seed);
+                vector_ptr = p->excitation + PITCH_MAX;
+
+                p->interp_index = comp_interp_index(p, p->pitch_lag[1],
+                                                    &p->sid_gain, &p->cur_gain);
+
+                /* Perform pitch postfiltering */
+                if (s->postfilter) {
+                    i = PITCH_MAX;
+                    for (j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
+                        comp_ppf_coeff(p, i, p->pitch_lag[j >> 1],
+                                       ppf + j, p->cur_rate);
+
+                    for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
+                        ff_acelp_weighted_vector_sum(p->audio + LPC_ORDER + i,
+                                                     vector_ptr + i,
+                                                     vector_ptr + i + ppf[j].index,
+                                                     ppf[j].sc_gain,
+                                                     ppf[j].opt_gain,
+                                                     1 << 14, 15, SUBFRAME_LEN);
+                } else {
+                    audio = vector_ptr - LPC_ORDER;
+                }
 
                 /* Save the excitation for the next frame */
-                memcpy(p->prev_excitation, buf + (FRAME_LEN - PITCH_MAX),
+                memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
                        PITCH_MAX * sizeof(*p->excitation));
+            } else {
+                p->interp_gain = (p->interp_gain * 3 + 2) >> 2;
+                if (p->erased_frames == 3) {
+                    /* Mute output */
+                    memset(p->excitation, 0,
+                           (FRAME_LEN + PITCH_MAX) * sizeof(*p->excitation));
+                    memset(p->prev_excitation, 0,
+                           PITCH_MAX * sizeof(*p->excitation));
+                    memset(frame->data[0], 0,
+                           (FRAME_LEN + LPC_ORDER) * sizeof(int16_t));
+                } else {
+                    int16_t *buf = p->audio + LPC_ORDER;
+
+                    /* Regenerate frame */
+                    residual_interp(p->excitation, buf, p->interp_index,
+                                    p->interp_gain, &p->random_seed);
+
+                    /* Save the excitation for the next frame */
+                    memcpy(p->prev_excitation, buf + (FRAME_LEN - PITCH_MAX),
+                           PITCH_MAX * sizeof(*p->excitation));
+                }
+            }
+            p->cng_random_seed = CNG_RANDOM_SEED;
+        } else {
+            if (p->cur_frame_type == SID_FRAME) {
+                p->sid_gain = sid_gain_to_lsp_index(p->subframe[0].amp_index);
+                ff_g723_1_inverse_quant(p->sid_lsp, p->prev_lsp, p->lsp_index, 0);
+            } else if (p->past_frame_type == ACTIVE_FRAME) {
+                p->sid_gain = estimate_sid_gain(p);
             }
-        }
-        p->cng_random_seed = CNG_RANDOM_SEED;
-    } else {
-        if (p->cur_frame_type == SID_FRAME) {
-            p->sid_gain = sid_gain_to_lsp_index(p->subframe[0].amp_index);
-            ff_g723_1_inverse_quant(p->sid_lsp, p->prev_lsp, p->lsp_index, 0);
-        } else if (p->past_frame_type == ACTIVE_FRAME) {
-            p->sid_gain = estimate_sid_gain(p);
-        }
 
-        if (p->past_frame_type == ACTIVE_FRAME)
-            p->cur_gain = p->sid_gain;
-        else
-            p->cur_gain = (p->cur_gain * 7 + p->sid_gain) >> 3;
-        generate_noise(p);
-        ff_g723_1_lsp_interpolate(lpc, p->sid_lsp, p->prev_lsp);
-        /* Save the lsp_vector for the next frame */
-        memcpy(p->prev_lsp, p->sid_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
-    }
+            if (p->past_frame_type == ACTIVE_FRAME)
+                p->cur_gain = p->sid_gain;
+            else
+                p->cur_gain = (p->cur_gain * 7 + p->sid_gain) >> 3;
+            generate_noise(p);
+            ff_g723_1_lsp_interpolate(lpc, p->sid_lsp, p->prev_lsp);
+            /* Save the lsp_vector for the next frame */
+            memcpy(p->prev_lsp, p->sid_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
+        }
 
-    p->past_frame_type = p->cur_frame_type;
+        p->past_frame_type = p->cur_frame_type;
 
-    memcpy(p->audio, p->synth_mem, LPC_ORDER * sizeof(*p->audio));
-    for (i = LPC_ORDER, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
-        ff_celp_lp_synthesis_filter(p->audio + i, &lpc[j * LPC_ORDER],
-                                    audio + i, SUBFRAME_LEN, LPC_ORDER,
-                                    0, 1, 1 << 12);
-    memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio));
+        memcpy(p->audio, p->synth_mem, LPC_ORDER * sizeof(*p->audio));
+        for (i = LPC_ORDER, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
+            ff_celp_lp_synthesis_filter(p->audio + i, &lpc[j * LPC_ORDER],
+                                        audio + i, SUBFRAME_LEN, LPC_ORDER,
+                                        0, 1, 1 << 12);
+        memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio));
 
-    if (p->postfilter) {
-        formant_postfilter(p, lpc, p->audio, out);
-    } else { // if output is not postfiltered it should be scaled by 2
-        for (i = 0; i < FRAME_LEN; i++)
-            out[i] = av_clip_int16(p->audio[LPC_ORDER + i] << 1);
+        if (s->postfilter) {
+            formant_postfilter(p, lpc, p->audio, out);
+        } else { // if output is not postfiltered it should be scaled by 2
+            for (i = 0; i < FRAME_LEN; i++)
+                out[i] = av_clip_int16(p->audio[LPC_ORDER + i] << 1);
+        }
     }
 
     *got_frame_ptr = 1;
 
-    return frame_size[dec_mode];
+    return frame_size[dec_mode] * avctx->channels;
 }
 
 #define OFFSET(x) offsetof(G723_1_Context, x)
 #define AD     AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
 static const AVOption options[] = {
-    { "postfilter", "postfilter on/off", OFFSET(postfilter), AV_OPT_TYPE_INT,
+    { "postfilter", "enable postfilter", OFFSET(postfilter), AV_OPT_TYPE_BOOL,
       { .i64 = 1 }, 0, 1, AD },
     { NULL }
 };
diff --git a/libavcodec/g723_1enc.c b/libavcodec/g723_1enc.c
index 82f5cec..5928405 100644
--- a/libavcodec/g723_1enc.c
+++ b/libavcodec/g723_1enc.c
@@ -2,20 +2,20 @@
  * G.723.1 compatible encoder
  * Copyright (c) Mohamed Naufal <naufal22@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,7 +42,8 @@
 
 static av_cold int g723_1_encode_init(AVCodecContext *avctx)
 {
-    G723_1_Context *p = avctx->priv_data;
+    G723_1_Context *s = avctx->priv_data;
+    G723_1_ChannelContext *p = &s->ch[0];
 
     if (avctx->sample_rate != 8000) {
         av_log(avctx, AV_LOG_ERROR, "Only 8000Hz sample rate supported\n");
@@ -83,7 +84,7 @@ static void highpass_filter(int16_t *buf, int16_t *fir, int *iir)
     for (i = 0; i < FRAME_LEN; i++) {
         *iir   = (buf[i] << 15) + ((-*fir) << 15) + MULL2(*iir, 0x7f00);
         *fir   = buf[i];
-        buf[i] = av_clipl_int32((int64_t) *iir + (1 << 15)) >> 16;
+        buf[i] = av_clipl_int32((int64_t)*iir + (1 << 15)) >> 16;
     }
 }
 
@@ -386,7 +387,7 @@ static void iir_filter(int16_t *fir_coef, int16_t *iir_coef,
  * @param flt_coef filter coefficients
  * @param unq_lpc  unquantized lpc vector
  */
-static void perceptual_filter(G723_1_Context *p, int16_t *flt_coef,
+static void perceptual_filter(G723_1_ChannelContext *p, int16_t *flt_coef,
                               int16_t *unq_lpc, int16_t *buf)
 {
     int16_t vector[FRAME_LEN + LPC_ORDER];
@@ -635,7 +636,7 @@ static void synth_percept_filter(int16_t *qnt_lpc, int16_t *perf_lpc,
  * @param buf   input signal
  * @param index the current subframe index
  */
-static void acb_search(G723_1_Context *p, int16_t *residual,
+static void acb_search(G723_1_ChannelContext *p, int16_t *residual,
                        int16_t *impulse_resp, const int16_t *buf,
                        int index)
 {
@@ -963,7 +964,7 @@ static void pack_fcb_param(G723_1_Subframe *subfrm, FCBParam *optim,
  * @param buf          target vector
  * @param impulse_resp impulse response of the combined filter
  */
-static void fcb_search(G723_1_Context *p, int16_t *impulse_resp,
+static void fcb_search(G723_1_ChannelContext *p, int16_t *impulse_resp,
                        int16_t *buf, int index)
 {
     FCBParam optim;
@@ -995,7 +996,7 @@ static void fcb_search(G723_1_Context *p, int16_t *impulse_resp,
  * @param frame output buffer
  * @param size  size of the buffer
  */
-static int pack_bitstream(G723_1_Context *p, AVPacket *avpkt)
+static int pack_bitstream(G723_1_ChannelContext *p, AVPacket *avpkt)
 {
     PutBitContext pb;
     int info_bits = 0;
@@ -1056,7 +1057,8 @@ static int pack_bitstream(G723_1_Context *p, AVPacket *avpkt)
 static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                const AVFrame *frame, int *got_packet_ptr)
 {
-    G723_1_Context *p = avctx->priv_data;
+    G723_1_Context *s = avctx->priv_data;
+    G723_1_ChannelContext *p = &s->ch[0];
     int16_t unq_lpc[LPC_ORDER * SUBFRAMES];
     int16_t qnt_lpc[LPC_ORDER * SUBFRAMES];
     int16_t cur_lsp[LPC_ORDER];
@@ -1149,7 +1151,7 @@ static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         acb_search(p, residual, impulse_resp, in, i);
         ff_g723_1_gen_acb_excitation(residual, p->prev_excitation,
                                      p->pitch_lag[i >> 1], &p->subframe[i],
-                                     RATE_6300);
+                                     p->cur_rate);
         sub_acb_contrib(residual, impulse_resp, in);
 
         fcb_search(p, impulse_resp, in, i);
@@ -1181,14 +1183,19 @@ static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     av_free(start);
 
-    ret = ff_alloc_packet(avpkt, 24);
-    if (ret < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 24, 0)) < 0)
         return ret;
 
     *got_packet_ptr = 1;
-    return pack_bitstream(p, avpkt);
+    avpkt->size = pack_bitstream(p, avpkt);
+    return 0;
 }
 
+static const AVCodecDefault defaults[] = {
+    { "b", "6300" },
+    { NULL },
+};
+
 AVCodec ff_g723_1_encoder = {
     .name           = "g723_1",
     .long_name      = NULL_IF_CONFIG_SMALL("G.723.1"),
@@ -1197,6 +1204,7 @@ AVCodec ff_g723_1_encoder = {
     .priv_data_size = sizeof(G723_1_Context),
     .init           = g723_1_encode_init,
     .encode2        = g723_1_encode_frame,
+    .defaults       = defaults,
     .sample_fmts    = (const enum AVSampleFormat[]) {
         AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE
     },
diff --git a/libavcodec/g726.c b/libavcodec/g726.c
index dab038f..80cb064 100644
--- a/libavcodec/g726.c
+++ b/libavcodec/g726.c
@@ -5,30 +5,29 @@
  * This is a very straightforward rendition of the G.726
  * Section 4 "Computational Details".
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <limits.h>
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "put_bits.h"
 
 /**
@@ -96,6 +95,7 @@ typedef struct G726Context {
     int sez;            /**< estimated second order prediction */
     int y;              /**< quantizer scaling factor for the next iteration */
     int code_size;
+    int little_endian;  /**< little-endian bitstream as used in aiff and Sun AU */
 } G726Context;
 
 static const int quant_tbl16[] =                  /**< 16kbit/s 2 bits per sample */
@@ -206,7 +206,7 @@ static int16_t g726_decode(G726Context* c, int I)
 
     if (I_sig)  /* get the sign */
         dq = -dq;
-    re_signal = c->se + dq;
+    re_signal = (int16_t)(c->se + dq);
 
     /* Update second order predictor coefficient A2 and A1 */
     pk0 = (c->sez + dq) ? sgn(c->sez + dq) : 0;
@@ -269,7 +269,7 @@ static int16_t g726_decode(G726Context* c, int I)
         c->se += mult(i2f(c->a[i] >> 2, &f), &c->sr[i]);
     c->se >>= 1;
 
-    return av_clip(re_signal << 2, -0xffff, 0xffff);
+    return av_clip(re_signal * 4, -0xffff, 0xffff);
 }
 
 static av_cold int g726_reset(G726Context *c)
@@ -292,12 +292,12 @@ static av_cold int g726_reset(G726Context *c)
     return 0;
 }
 
-#if CONFIG_ADPCM_G726_ENCODER
+#if CONFIG_ADPCM_G726_ENCODER || CONFIG_ADPCM_G726LE_ENCODER
 static int16_t g726_encode(G726Context* c, int16_t sig)
 {
     uint8_t i;
 
-    i = quant(c, sig/4 - c->se) & ((1<<c->code_size) - 1);
+    i = av_mod_uintp2(quant(c, sig/4 - c->se), c->code_size);
     g726_decode(c, i);
     return i;
 }
@@ -308,6 +308,8 @@ static av_cold int g726_encode_init(AVCodecContext *avctx)
 {
     G726Context* c = avctx->priv_data;
 
+    c->little_endian = !strcmp(avctx->codec->name, "g726le");
+
     if (avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL &&
         avctx->sample_rate != 8000) {
         av_log(avctx, AV_LOG_ERROR, "Sample rates other than 8kHz are not "
@@ -351,16 +353,22 @@ static int g726_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int i, ret, out_size;
 
     out_size = (frame->nb_samples * c->code_size + 7) / 8;
-    if ((ret = ff_alloc_packet(avpkt, out_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
     init_put_bits(&pb, avpkt->data, avpkt->size);
 
     for (i = 0; i < frame->nb_samples; i++)
-        put_bits(&pb, c->code_size, g726_encode(c, *samples++));
-
-    flush_put_bits(&pb);
+        if (c->little_endian) {
+            put_bits_le(&pb, c->code_size, g726_encode(c, *samples++));
+        } else {
+            put_bits(&pb, c->code_size, g726_encode(c, *samples++));
+        }
+
+    if (c->little_endian) {
+        flush_put_bits_le(&pb);
+    } else {
+        flush_put_bits(&pb);
+    }
 
     avpkt->size = out_size;
     *got_packet_ptr = 1;
@@ -374,18 +382,20 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVCodecDefault defaults[] = {
+    { "b", "0" },
+    { NULL },
+};
+#endif
+
+#if CONFIG_ADPCM_G726_ENCODER
+static const AVClass g726_class = {
     .class_name = "g726",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static const AVCodecDefault defaults[] = {
-    { "b", "0" },
-    { NULL },
-};
-
 AVCodec ff_adpcm_g726_encoder = {
     .name           = "g726",
     .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM"),
@@ -397,19 +407,49 @@ AVCodec ff_adpcm_g726_encoder = {
     .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &g726_class,
     .defaults       = defaults,
 };
 #endif
 
-#if CONFIG_ADPCM_G726_DECODER
+#if CONFIG_ADPCM_G726LE_ENCODER
+static const AVClass g726le_class = {
+    .class_name = "g726le",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_adpcm_g726le_encoder = {
+    .name           = "g726le",
+    .long_name      = NULL_IF_CONFIG_SMALL("G.726 little endian ADPCM (\"right-justified\")"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ADPCM_G726LE,
+    .priv_data_size = sizeof(G726Context),
+    .init           = g726_encode_init,
+    .encode2        = g726_encode_frame,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                     AV_SAMPLE_FMT_NONE },
+    .priv_class     = &g726le_class,
+    .defaults       = defaults,
+};
+#endif
+
+#if CONFIG_ADPCM_G726_DECODER || CONFIG_ADPCM_G726LE_DECODER
 static av_cold int g726_decode_init(AVCodecContext *avctx)
 {
     G726Context* c = avctx->priv_data;
 
+    if(avctx->channels > 1){
+        avpriv_request_sample(avctx, "Decoding more than one channel");
+        return AVERROR_PATCHWELCOME;
+    }
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
 
+    c->little_endian = !strcmp(avctx->codec->name, "g726le");
+
     c->code_size = avctx->bits_per_coded_sample;
     if (c->code_size < 2 || c->code_size > 5) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of bits %d\n", c->code_size);
@@ -430,25 +470,25 @@ static int g726_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size = avpkt->size;
     G726Context *c = avctx->priv_data;
     int16_t *samples;
-    BitstreamContext bc;
+    GetBitContext gb;
     int out_samples, ret;
 
     out_samples = buf_size * 8 / c->code_size;
 
     /* get output buffer */
     frame->nb_samples = out_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
-    bitstream_init8(&bc, buf, buf_size);
+    init_get_bits(&gb, buf, buf_size * 8);
 
     while (out_samples--)
-        *samples++ = g726_decode(c, bitstream_read(&bc, c->code_size));
+        *samples++ = g726_decode(c, c->little_endian ?
+                                    get_bits_le(&gb, c->code_size) :
+                                    get_bits(&gb, c->code_size));
 
-    if (bitstream_bits_left(&bc) > 0)
+    if (get_bits_left(&gb) > 0)
         av_log(avctx, AV_LOG_ERROR, "Frame invalidly split, missing parser?\n");
 
     *got_frame_ptr = 1;
@@ -461,7 +501,9 @@ static void g726_decode_flush(AVCodecContext *avctx)
     G726Context *c = avctx->priv_data;
     g726_reset(c);
 }
+#endif
 
+#if CONFIG_ADPCM_G726_DECODER
 AVCodec ff_adpcm_g726_decoder = {
     .name           = "g726",
     .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM"),
@@ -474,3 +516,17 @@ AVCodec ff_adpcm_g726_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
 };
 #endif
+
+#if CONFIG_ADPCM_G726LE_DECODER
+AVCodec ff_adpcm_g726le_decoder = {
+    .name           = "g726le",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ADPCM_G726LE,
+    .priv_data_size = sizeof(G726Context),
+    .init           = g726_decode_init,
+    .decode         = g726_decode_frame,
+    .flush          = g726_decode_flush,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"),
+};
+#endif
diff --git a/libavcodec/g729.h b/libavcodec/g729.h
new file mode 100644
index 0000000..7c5f693
--- /dev/null
+++ b/libavcodec/g729.h
@@ -0,0 +1,33 @@
+/*
+ * G.729, G729 Annex D decoders
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_G729_H
+#define AVCODEC_G729_H
+
+/**
+ * subframe size
+ */
+#define SUBFRAME_SIZE 40
+
+/* bytes per block */
+#define G729_8K_BLOCK_SIZE     10
+#define G729D_6K4_BLOCK_SIZE   8
+
+#endif // AVCODEC_G729_H
diff --git a/libavcodec/g729_parser.c b/libavcodec/g729_parser.c
new file mode 100644
index 0000000..9982dbf
--- /dev/null
+++ b/libavcodec/g729_parser.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015  Ganesh Ajjanagadde
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * G.729 audio parser
+ *
+ * Splits packets into individual blocks.
+ */
+
+#include "libavutil/avassert.h"
+#include "parser.h"
+#include "g729.h"
+
+typedef struct G729ParseContext {
+    ParseContext pc;
+    int block_size;
+    int duration;
+    int remaining;
+} G729ParseContext;
+
+static int g729_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                     const uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size)
+{
+    G729ParseContext *s = s1->priv_data;
+    ParseContext *pc = &s->pc;
+    int next;
+
+    if (!s->block_size) {
+        av_assert1(avctx->codec_id == AV_CODEC_ID_G729);
+        /* FIXME: replace this heuristic block_size with more precise estimate */
+        s->block_size = (avctx->bit_rate < 8000) ? G729D_6K4_BLOCK_SIZE : G729_8K_BLOCK_SIZE;
+        s->block_size *= avctx->channels;
+        s->duration   = avctx->frame_size;
+    }
+
+    if (!s->remaining)
+        s->remaining = s->block_size;
+    if (s->remaining <= buf_size) {
+        next = s->remaining;
+        s->remaining = 0;
+    } else {
+        next = END_NOT_FOUND;
+        s->remaining -= buf_size;
+    }
+
+    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0 || !buf_size) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+        return buf_size;
+    }
+
+    s1->duration = s->duration;
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_g729_parser = {
+    .codec_ids      = { AV_CODEC_ID_G729 },
+    .priv_data_size = sizeof(G729ParseContext),
+    .parser_parse   = g729_parse,
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/g729data.h b/libavcodec/g729data.h
new file mode 100644
index 0000000..365ca47
--- /dev/null
+++ b/libavcodec/g729data.h
@@ -0,0 +1,382 @@
+/*
+ * data for G.729, G729 Annex D decoders
+ * Copyright (c) 2007 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_G729DATA_H
+#define AVCODEC_G729DATA_H
+
+#include <stdint.h>
+
+#define MA_NP                4  ///< Moving Average (MA) prediction order
+
+#define VQ_1ST_BITS          7  ///< first stage vector of quantizer (size in bits)
+#define VQ_2ND_BITS          5  ///< second stage vector of quantizer (size in bits)
+
+#define GC_1ST_IDX_BITS_8K   3  ///< gain codebook (first stage) index, 8k mode (size in bits)
+#define GC_2ND_IDX_BITS_8K   4  ///< gain codebook (second stage) index, 8k mode (size in bits)
+
+#define GC_1ST_IDX_BITS_6K4  3  ///< gain codebook (first stage) index, 6.4k mode (size in bits)
+#define GC_2ND_IDX_BITS_6K4  3  ///< gain codebook (second stage) index, 6.4k mode (size in bits)
+
+/**
+ * first stage LSP codebook
+ * (10-dimensional, with 128 entries (3.24 of G.729)
+ */
+static const int16_t cb_lsp_1st[1<<VQ_1ST_BITS][10] = { /* (2.13) */
+  { 1486,  2168,  3751,  9074, 12134, 13944, 17983, 19173, 21190, 21820},
+  { 1730,  2640,  3450,  4870,  6126,  7876, 15644, 17817, 20294, 21902},
+  { 1568,  2256,  3088,  4874, 11063, 13393, 18307, 19293, 21109, 21741},
+  { 1733,  2512,  3357,  4708,  6977, 10296, 17024, 17956, 19145, 20350},
+  { 1744,  2436,  3308,  8731, 10432, 12007, 15614, 16639, 21359, 21913},
+  { 1786,  2369,  3372,  4521,  6795, 12963, 17674, 18988, 20855, 21640},
+  { 1631,  2433,  3361,  6328, 10709, 12013, 13277, 13904, 19441, 21088},
+  { 1489,  2364,  3291,  6250,  9227, 10403, 13843, 15278, 17721, 21451},
+  { 1869,  2533,  3475,  4365,  9152, 14513, 15908, 17022, 20611, 21411},
+  { 2070,  3025,  4333,  5854,  7805,  9231, 10597, 16047, 20109, 21834},
+  { 1910,  2673,  3419,  4261, 11168, 15111, 16577, 17591, 19310, 20265},
+  { 1141,  1815,  2624,  4623,  6495,  9588, 13968, 16428, 19351, 21286},
+  { 2192,  3171,  4707,  5808, 10904, 12500, 14162, 15664, 21124, 21789},
+  { 1286,  1907,  2548,  3453,  9574, 11964, 15978, 17344, 19691, 22495},
+  { 1921,  2720,  4604,  6684, 11503, 12992, 14350, 15262, 16997, 20791},
+  { 2052,  2759,  3897,  5246,  6638, 10267, 15834, 16814, 18149, 21675},
+  { 1798,  2497,  5617, 11449, 13189, 14711, 17050, 18195, 20307, 21182},
+  { 1009,  1647,  2889,  5709,  9541, 12354, 15231, 18494, 20966, 22033},
+  { 3016,  3794,  5406,  7469, 12488, 13984, 15328, 16334, 19952, 20791},
+  { 2203,  3040,  3796,  5442, 11987, 13512, 14931, 16370, 17856, 18803},
+  { 2912,  4292,  7988,  9572, 11562, 13244, 14556, 16529, 20004, 21073},
+  { 2861,  3607,  5923,  7034,  9234, 12054, 13729, 18056, 20262, 20974},
+  { 3069,  4311,  5967,  7367, 11482, 12699, 14309, 16233, 18333, 19172},
+  { 2434,  3661,  4866,  5798, 10383, 11722, 13049, 15668, 18862, 19831},
+  { 2020,  2605,  3860,  9241, 13275, 14644, 16010, 17099, 19268, 20251},
+  { 1877,  2809,  3590,  4707, 11056, 12441, 15622, 17168, 18761, 19907},
+  { 2107,  2873,  3673,  5799, 13579, 14687, 15938, 17077, 18890, 19831},
+  { 1612,  2284,  2944,  3572,  8219, 13959, 15924, 17239, 18592, 20117},
+  { 2420,  3156,  6542, 10215, 12061, 13534, 15305, 16452, 18717, 19880},
+  { 1667,  2612,  3534,  5237, 10513, 11696, 12940, 16798, 18058, 19378},
+  { 2388,  3017,  4839,  9333, 11413, 12730, 15024, 16248, 17449, 18677},
+  { 1875,  2786,  4231,  6320,  8694, 10149, 11785, 17013, 18608, 19960},
+  {  679,  1411,  4654,  8006, 11446, 13249, 15763, 18127, 20361, 21567},
+  { 1838,  2596,  3578,  4608,  5650, 11274, 14355, 15886, 20579, 21754},
+  { 1303,  1955,  2395,  3322, 12023, 13764, 15883, 18077, 20180, 21232},
+  { 1438,  2102,  2663,  3462,  8328, 10362, 13763, 17248, 19732, 22344},
+  {  860,  1904,  6098,  7775,  9815, 12007, 14821, 16709, 19787, 21132},
+  { 1673,  2723,  3704,  6125,  7668,  9447, 13683, 14443, 20538, 21731},
+  { 1246,  1849,  2902,  4508,  7221, 12710, 14835, 16314, 19335, 22720},
+  { 1525,  2260,  3862,  5659,  7342, 11748, 13370, 14442, 18044, 21334},
+  { 1196,  1846,  3104,  7063, 10972, 12905, 14814, 17037, 19922, 22636},
+  { 2147,  3106,  4475,  6511,  8227,  9765, 10984, 12161, 18971, 21300},
+  { 1585,  2405,  2994,  4036, 11481, 13177, 14519, 15431, 19967, 21275},
+  { 1778,  2688,  3614,  4680,  9465, 11064, 12473, 16320, 19742, 20800},
+  { 1862,  2586,  3492,  6719, 11708, 13012, 14364, 16128, 19610, 20425},
+  { 1395,  2156,  2669,  3386, 10607, 12125, 13614, 16705, 18976, 21367},
+  { 1444,  2117,  3286,  6233,  9423, 12981, 14998, 15853, 17188, 21857},
+  { 2004,  2895,  3783,  4897,  6168,  7297, 12609, 16445, 19297, 21465},
+  { 1495,  2863,  6360,  8100, 11399, 14271, 15902, 17711, 20479, 22061},
+  { 2484,  3114,  5718,  7097,  8400, 12616, 14073, 14847, 20535, 21396},
+  { 2424,  3277,  5296,  6284, 11290, 12903, 16022, 17508, 19333, 20283},
+  { 2565,  3778,  5360,  6989,  8782, 10428, 14390, 15742, 17770, 21734},
+  { 2727,  3384,  6613,  9254, 10542, 12236, 14651, 15687, 20074, 21102},
+  { 1916,  2953,  6274,  8088,  9710, 10925, 12392, 16434, 20010, 21183},
+  { 3384,  4366,  5349,  7667, 11180, 12605, 13921, 15324, 19901, 20754},
+  { 3075,  4283,  5951,  7619,  9604, 11010, 12384, 14006, 20658, 21497},
+  { 1751,  2455,  5147,  9966, 11621, 13176, 14739, 16470, 20788, 21756},
+  { 1442,  2188,  3330,  6813,  8929, 12135, 14476, 15306, 19635, 20544},
+  { 2294,  2895,  4070,  8035, 12233, 13416, 14762, 17367, 18952, 19688},
+  { 1937,  2659,  4602,  6697,  9071, 12863, 14197, 15230, 16047, 18877},
+  { 2071,  2663,  4216,  9445, 10887, 12292, 13949, 14909, 19236, 20341},
+  { 1740,  2491,  3488,  8138,  9656, 11153, 13206, 14688, 20896, 21907},
+  { 2199,  2881,  4675,  8527, 10051, 11408, 14435, 15463, 17190, 20597},
+  { 1943,  2988,  4177,  6039,  7478,  8536, 14181, 15551, 17622, 21579},
+  { 1825,  3175,  7062,  9818, 12824, 15450, 18330, 19856, 21830, 22412},
+  { 2464,  3046,  4822,  5977,  7696, 15398, 16730, 17646, 20588, 21320},
+  { 2550,  3393,  5305,  6920, 10235, 14083, 18143, 19195, 20681, 21336},
+  { 3003,  3799,  5321,  6437,  7919, 11643, 15810, 16846, 18119, 18980},
+  { 3455,  4157,  6838,  8199,  9877, 12314, 15905, 16826, 19949, 20892},
+  { 3052,  3769,  4891,  5810,  6977, 10126, 14788, 15990, 19773, 20904},
+  { 3671,  4356,  5827,  6997,  8460, 12084, 14154, 14939, 19247, 20423},
+  { 2716,  3684,  5246,  6686,  8463, 10001, 12394, 14131, 16150, 19776},
+  { 1945,  2638,  4130,  7995, 14338, 15576, 17057, 18206, 20225, 20997},
+  { 2304,  2928,  4122,  4824,  5640, 13139, 15825, 16938, 20108, 21054},
+  { 1800,  2516,  3350,  5219, 13406, 15948, 17618, 18540, 20531, 21252},
+  { 1436,  2224,  2753,  4546,  9657, 11245, 15177, 16317, 17489, 19135},
+  { 2319,  2899,  4980,  6936,  8404, 13489, 15554, 16281, 20270, 20911},
+  { 2187,  2919,  4610,  5875,  7390, 12556, 14033, 16794, 20998, 21769},
+  { 2235,  2923,  5121,  6259,  8099, 13589, 15340, 16340, 17927, 20159},
+  { 1765,  2638,  3751,  5730,  7883, 10108, 13633, 15419, 16808, 18574},
+  { 3460,  5741,  9596, 11742, 14413, 16080, 18173, 19090, 20845, 21601},
+  { 3735,  4426,  6199,  7363,  9250, 14489, 16035, 17026, 19873, 20876},
+  { 3521,  4778,  6887,  8680, 12717, 14322, 15950, 18050, 20166, 21145},
+  { 2141,  2968,  6865,  8051, 10010, 13159, 14813, 15861, 17528, 18655},
+  { 4148,  6128,  9028, 10871, 12686, 14005, 15976, 17208, 19587, 20595},
+  { 4403,  5367,  6634,  8371, 10163, 11599, 14963, 16331, 17982, 18768},
+  { 4091,  5386,  6852,  8770, 11563, 13290, 15728, 16930, 19056, 20102},
+  { 2746,  3625,  5299,  7504, 10262, 11432, 13172, 15490, 16875, 17514},
+  { 2248,  3556,  8539, 10590, 12665, 14696, 16515, 17824, 20268, 21247},
+  { 1279,  1960,  3920,  7793, 10153, 14753, 16646, 18139, 20679, 21466},
+  { 2440,  3475,  6737,  8654, 12190, 14588, 17119, 17925, 19110, 19979},
+  { 1879,  2514,  4497,  7572, 10017, 14948, 16141, 16897, 18397, 19376},
+  { 2804,  3688,  7490, 10086, 11218, 12711, 16307, 17470, 20077, 21126},
+  { 2023,  2682,  3873,  8268, 10255, 11645, 15187, 17102, 18965, 19788},
+  { 2823,  3605,  5815,  8595, 10085, 11469, 16568, 17462, 18754, 19876},
+  { 2851,  3681,  5280,  7648,  9173, 10338, 14961, 16148, 17559, 18474},
+  { 1348,  2645,  5826,  8785, 10620, 12831, 16255, 18319, 21133, 22586},
+  { 2141,  3036,  4293,  6082,  7593, 10629, 17158, 18033, 21466, 22084},
+  { 1608,  2375,  3384,  6878,  9970, 11227, 16928, 17650, 20185, 21120},
+  { 2774,  3616,  5014,  6557,  7788,  8959, 17068, 18302, 19537, 20542},
+  { 1934,  4813,  6204,  7212,  8979, 11665, 15989, 17811, 20426, 21703},
+  { 2288,  3507,  5037,  6841,  8278,  9638, 15066, 16481, 21653, 22214},
+  { 2951,  3771,  4878,  7578,  9016, 10298, 14490, 15242, 20223, 20990},
+  { 3256,  4791,  6601,  7521,  8644,  9707, 13398, 16078, 19102, 20249},
+  { 1827,  2614,  3486,  6039, 12149, 13823, 16191, 17282, 21423, 22041},
+  { 1000,  1704,  3002,  6335,  8471, 10500, 14878, 16979, 20026, 22427},
+  { 1646,  2286,  3109,  7245, 11493, 12791, 16824, 17667, 18981, 20222},
+  { 1708,  2501,  3315,  6737,  8729,  9924, 16089, 17097, 18374, 19917},
+  { 2623,  3510,  4478,  5645,  9862, 11115, 15219, 18067, 19583, 20382},
+  { 2518,  3434,  4728,  6388,  8082,  9285, 13162, 18383, 19819, 20552},
+  { 1726,  2383,  4090,  6303,  7805, 12845, 14612, 17608, 19269, 20181},
+  { 2860,  3735,  4838,  6044,  7254,  8402, 14031, 16381, 18037, 19410},
+  { 4247,  5993,  7952,  9792, 12342, 14653, 17527, 18774, 20831, 21699},
+  { 3502,  4051,  5680,  6805,  8146, 11945, 16649, 17444, 20390, 21564},
+  { 3151,  4893,  5899,  7198, 11418, 13073, 15124, 17673, 20520, 21861},
+  { 3960,  4848,  5926,  7259,  8811, 10529, 15661, 16560, 18196, 20183},
+  { 4499,  6604,  8036,  9251, 10804, 12627, 15880, 17512, 20020, 21046},
+  { 4251,  5541,  6654,  8318,  9900, 11686, 15100, 17093, 20572, 21687},
+  { 3769,  5327,  7865,  9360, 10684, 11818, 13660, 15366, 18733, 19882},
+  { 3083,  3969,  6248,  8121,  9798, 10994, 12393, 13686, 17888, 19105},
+  { 2731,  4670,  7063,  9201, 11346, 13735, 16875, 18797, 20787, 22360},
+  { 1187,  2227,  4737,  7214,  9622, 12633, 15404, 17968, 20262, 23533},
+  { 1911,  2477,  3915, 10098, 11616, 12955, 16223, 17138, 19270, 20729},
+  { 1764,  2519,  3887,  6944,  9150, 12590, 16258, 16984, 17924, 18435},
+  { 1400,  3674,  7131,  8718, 10688, 12508, 15708, 17711, 19720, 21068},
+  { 2322,  3073,  4287,  8108,  9407, 10628, 15862, 16693, 19714, 21474},
+  { 2630,  3339,  4758,  8360, 10274, 11333, 12880, 17374, 19221, 19936},
+  { 1721,  2577,  5553,  7195,  8651, 10686, 15069, 16953, 18703, 19929}
+};
+
+/**
+ * second stage LSP codebook, high and low parts
+   (both 5-dimensional, with 32 entries (3.2.4 of G.729)
+ */
+static const int16_t cb_lsp_2nd[1<<VQ_2ND_BITS][10] = { /* (2.13) */
+  { -435,  -815,  -742,  1033,  -518,   582, -1201,   829,    86,   385},
+  { -833,  -891,   463,    -8, -1251,  1450,    72,  -231,   864,   661},
+  {-1021,   231,  -306,   321,  -220,  -163,  -526,  -754, -1633,   267},
+  {   57,  -198,  -339,   -33, -1468,   573,   796,  -169,  -631,   816},
+  {  171,  -350,   294,  1660,   453,   519,   291,   159,  -640, -1296},
+  { -701,  -842,   -58,   950,   892,  1549,   715,   527,  -714,  -193},
+  {  584,    31,  -289,   356,  -333,  -457,   612,  -283, -1381,  -741},
+  { -109,  -808,   231,    77,   -87,  -344,  1341,  1087,  -654,  -569},
+  { -859,  1236,   550,   854,   714,  -543, -1752,  -195,   -98,  -276},
+  { -877,  -954, -1248,  -299,   212,  -235,  -728,   949,  1517,   895},
+  {  -77,   344,  -620,   763,   413,   502,  -362,  -960,  -483,  1386},
+  { -314,  -307,  -256, -1260,  -429,   450,  -466,  -108,  1010,  2223},
+  {  711,   693,   521,   650,  1305,   -28,  -378,   744, -1005,   240},
+  { -112,  -271,  -500,   946,  1733,   271,   -15,   909,  -259,  1688},
+  {  575,   -10,  -468,  -199,  1101, -1011,   581,   -53,  -747,   878},
+  {  145,  -285, -1280,  -398,    36,  -498, -1377,    18,  -444,  1483},
+  {-1133,  -835,  1350,  1284,   -95,  1015,  -222,   443,   372,  -354},
+  {-1459, -1237,   416,  -213,   466,   669,   659,  1640,   932,   534},
+  {  -15,    66,   468,  1019,  -748,  1385,  -182,  -907,  -721,  -262},
+  { -338,   148,  1445,    75,  -760,   569,  1247,   337,   416,  -121},
+  {  389,   239,  1568,   981,   113,   369, -1003,  -507,  -587,  -904},
+  { -312,   -98,   949,    31,  1104,    72,  -141,  1465,    63,  -785},
+  { 1127,   584,   835,   277, -1159,   208,   301,  -882,   117,  -404},
+  {  539,  -114,   856,  -493,   223,  -912,   623,   -76,   276,  -440},
+  { 2197,  2337,  1268,   670,   304,  -267,  -525,   140,   882,  -139},
+  {-1596,   550,   801,  -456,   -56,  -697,   865,  1060,   413,   446},
+  { 1154,   593,   -77,  1237,   -31,   581, -1037,  -895,   669,   297},
+  {  397,   558,   203,  -797,  -919,     3,   692,  -292,  1050,   782},
+  {  334,  1475,   632,   -80,    48, -1061,  -484,   362,  -597,  -852},
+  { -545,  -330,  -429,  -680,  1133, -1182,  -744,  1340,   262,    63},
+  { 1320,   827,  -398,  -576,   341,  -774,  -483, -1247,   -70,    98},
+  { -163,   674,   -11,  -886,   531, -1125,  -265,  -242,   724,   934}
+};
+
+/**
+ * gain codebook (first stage), 8k mode (3.9.2 of G.729)
+ */
+static const int16_t cb_gain_1st_8k[1<<GC_1ST_IDX_BITS_8K][2] = { /*(0.14) (2.13) */
+  { 3242 ,  9949 },
+  { 1551 ,  2425 },
+  { 2678 , 27162 },
+  { 1921 ,  9291 },
+  { 1831 ,  5022 },
+  {    1 ,  1516 },
+  {  356 , 14756 },
+  {   57 ,  5404 },
+};
+
+/**
+ * gain codebook (second stage), 8k mode (3.9.2 of G.729)
+ */
+static const int16_t cb_gain_2nd_8k[1<<GC_2ND_IDX_BITS_8K][2] = { /*(1.14) (1.13) */
+  {  5142 ,   592 },
+  { 17299 ,  1861 },
+  {  6160 ,  2395 },
+  { 16112 ,  3392 },
+  {   826 ,  2005 },
+  { 18973 ,  5935 },
+  {  1994 ,     0 },
+  { 15434 ,   237 },
+  { 10573 ,  2966 },
+  { 15132 ,  4914 },
+  { 11569 ,  1196 },
+  { 14194 ,  1630 },
+  {  8091 ,  4861 },
+  { 15161 , 14276 },
+  {  9120 ,   525 },
+  { 13260 ,  3256 },
+};
+
+/**
+ * gain codebook (first stage), 6.4k mode (D.3.9.2 of G.729)
+ */
+static const int16_t cb_gain_1st_6k4[1<<GC_1ST_IDX_BITS_6K4][2] =
+{ /*(0.14) (1.14)*/
+ { 5849,     0 },
+ { 3171,  9280 },
+ { 3617,  6747 },
+ { 4987, 22294 },
+ { 2929,  1078 },
+ { 6068,  6093 },
+ { 9425,  2731 },
+ { 3915, 12872 },
+};
+
+/**
+ * gain codebook (second stage), 6.4k mode (D.3.9.2 of G.729)
+ */
+static const int16_t cb_gain_2nd_6k4[1<<GC_2ND_IDX_BITS_6K4][2] =
+{ /*(1.14) (1.14)*/
+ {    0,  4175 },
+ {10828, 27602 },
+ {16423, 15724 },
+ { 4478,  7324 },
+ { 3988,     0 },
+ {10291, 11385 },
+ {11956, 10735 },
+ { 7876,  7821 },
+};
+
+/**
+ * 4th order Moving Average (MA) Predictor codebook (3.2.4 of G.729)
+ *
+ * float cb_ma_predictor_float[2][MA_NP][10] = {
+ *   {
+ *     {0.2570, 0.2780, 0.2800, 0.2736, 0.2757, 0.2764, 0.2675, 0.2678, 0.2779, 0.2647},
+ *     {0.2142, 0.2194, 0.2331, 0.2230, 0.2272, 0.2252, 0.2148, 0.2123, 0.2115, 0.2096},
+ *     {0.1670, 0.1523, 0.1567, 0.1580, 0.1601, 0.1569, 0.1589, 0.1555, 0.1474, 0.1571},
+ *     {0.1238, 0.0925, 0.0798, 0.0923, 0.0890, 0.0828, 0.1010, 0.0988, 0.0872, 0.1060},
+ *   },
+ *   {
+ *     {0.2360, 0.2405, 0.2499, 0.2495, 0.2517, 0.2591, 0.2636, 0.2625, 0.2551, 0.2310},
+ *     {0.1285, 0.0925, 0.0779, 0.1060, 0.1183, 0.1176, 0.1277, 0.1268, 0.1193, 0.1211},
+ *     {0.0981, 0.0589, 0.0401, 0.0654, 0.0761, 0.0728, 0.0841, 0.0826, 0.0776, 0.0891},
+ *     {0.0923, 0.0486, 0.0287, 0.0498, 0.0526, 0.0482, 0.0621, 0.0636, 0.0584, 0.0794},
+ *   },
+ * };
+ *                                    15
+ * cb_ma_predictor[j][k][i] = floor( 2 * cb_ma_predictor_float[j][k][i] )
+ *
+ * j=0..1, i=0..9, k=0..MA_NP-1
+ */
+static const int16_t cb_ma_predictor[2][MA_NP][10] = { /* (0.15) */
+  {
+    { 8421,  9109,  9175,  8965,  9034,  9057,  8765,  8775,  9106,  8673},
+    { 7018,  7189,  7638,  7307,  7444,  7379,  7038,  6956,  6930,  6868},
+    { 5472,  4990,  5134,  5177,  5246,  5141,  5206,  5095,  4830,  5147},
+    { 4056,  3031,  2614,  3024,  2916,  2713,  3309,  3237,  2857,  3473}
+  },
+  {
+    { 7733,  7880,  8188,  8175,  8247,  8490,  8637,  8601,  8359,  7569},
+    { 4210,  3031,  2552,  3473,  3876,  3853,  4184,  4154,  3909,  3968},
+    { 3214,  1930,  1313,  2143,  2493,  2385,  2755,  2706,  2542,  2919},
+    { 3024,  1592,   940,  1631,  1723,  1579,  2034,  2084,  1913,  2601}
+  }
+};
+
+/**
+ *                                     15         3
+ * cb_ma_predictor_sum[j][i] = floor( 2 * (1.0 - sum ( cb_ma_predictor_float[j][k][i] ) ) )
+ *                                               k=0
+ * j=0..1, i=0..9
+ */
+static const int16_t cb_ma_predictor_sum[2][10] = { /* (0.15) */
+  { 7798,  8447,  8205,  8293,  8126,  8477,  8447,  8703,  9043,  8604},
+  {14585, 18333, 19772, 17344, 16426, 16459, 15155, 15220, 16043, 15708}
+};
+
+/**
+ *                                                           12
+ *                                                          2
+ * cb_ma_predictor_sum_inv[j][i] = floor(---------------------------------------------)
+ *                                               3
+ *                                        1.0 - sum ( cb_ma_predictor_float[j][k][i] )
+ *                                              k=0
+ * j=0..1, i=0..9
+ */
+static const int16_t cb_ma_predictor_sum_inv[2][10] = { /* (3.12) */
+  {17210, 15888, 16357, 16183, 16516, 15833, 15888, 15421, 14840, 15597},
+  { 9202,  7320,  6788,  7738,  8170,  8154,  8856,  8818,  8366,  8544}
+};
+
+/**
+ * MA prediction coefficients (3.9.1 of G.729, near Equation 69)
+ */
+static const uint16_t ma_prediction_coeff[4] = { /* (0.13) */
+  5571, 4751, 2785, 1556
+};
+
+/**
+ * initial LSP coefficients belongs to virtual frame preceding  the
+ * first frame of the stream
+ */
+static const int16_t lsp_init[10]= { /* (0.15) */
+   30000, 26000, 21000, 15000, 8000, 0, -8000,-15000,-21000,-26000
+};
+
+/**
+ * additional "phase" post-processing filter impulse response (D.6.2 of G.729)
+ *
+ * Table contains three impulse responses, correspond to
+ * different amounts of spreading.
+ */
+static const int16_t phase_filter[3][40] =
+{
+  { // maximum spreading (for noise-like segments)
+    14690, 11518,  1268, -2762, -5672,  7514,  -36, -2808, -3041,  4823,
+     2952, -8425,  3785,  1455,  2179, -8638, 8051, -2104, -1455,   777,
+     1108, -2386,  2254,  -364,  -675, -2104, 6046, -5682,  1072,  3123,
+    -5059,  5312, -2330, -3729,  6924, -3890,  675, -1776,    29, 10145,
+  },
+  { // medium spreading
+    30274,  3831, -4037,  2972, -1049, -1003,  2477, -3044,  2815, -2232,
+     1753, -1612,  1714, -1776,  1543, -1009,   429,  -170,   472, -1265,
+     2176, -2707,  2523, -1622,   344,   826, -1530,  1724, -1658,  1701,
+    -2064,  2644, -3061,  2897, -1979,   557,   780, -1370,   842,   655,
+  },
+  { // no spreading (for voiced speech)
+    32767, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  }
+};
+#endif /* AVCODEC_G729DATA_H */
diff --git a/libavcodec/g729dec.c b/libavcodec/g729dec.c
new file mode 100644
index 0000000..2e4756b
--- /dev/null
+++ b/libavcodec/g729dec.c
@@ -0,0 +1,751 @@
+/*
+ * G.729, G729 Annex D decoders
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "libavutil/avutil.h"
+#include "get_bits.h"
+#include "audiodsp.h"
+#include "internal.h"
+
+
+#include "g729.h"
+#include "lsp.h"
+#include "celp_math.h"
+#include "celp_filters.h"
+#include "acelp_filters.h"
+#include "acelp_pitch_delay.h"
+#include "acelp_vectors.h"
+#include "g729data.h"
+#include "g729postfilter.h"
+
+/**
+ * minimum quantized LSF value (3.2.4)
+ * 0.005 in Q13
+ */
+#define LSFQ_MIN                   40
+
+/**
+ * maximum quantized LSF value (3.2.4)
+ * 3.135 in Q13
+ */
+#define LSFQ_MAX                   25681
+
+/**
+ * minimum LSF distance (3.2.4)
+ * 0.0391 in Q13
+ */
+#define LSFQ_DIFF_MIN              321
+
+/// interpolation filter length
+#define INTERPOL_LEN              11
+
+/**
+ * minimum gain pitch value (3.8, Equation 47)
+ * 0.2 in (1.14)
+ */
+#define SHARP_MIN                  3277
+
+/**
+ * maximum gain pitch value (3.8, Equation 47)
+ * (EE) This does not comply with the specification.
+ * Specification says about 0.8, which should be
+ * 13107 in (1.14), but reference C code uses
+ * 13017 (equals to 0.7945) instead of it.
+ */
+#define SHARP_MAX                  13017
+
+/**
+ * MR_ENERGY (mean removed energy) = mean_energy + 10 * log10(2^26  * subframe_size) in (7.13)
+ */
+#define MR_ENERGY 1018156
+
+#define DECISION_NOISE        0
+#define DECISION_INTERMEDIATE 1
+#define DECISION_VOICE        2
+
+typedef enum {
+    FORMAT_G729_8K = 0,
+    FORMAT_G729D_6K4,
+    FORMAT_COUNT,
+} G729Formats;
+
+typedef struct {
+    uint8_t ac_index_bits[2];   ///< adaptive codebook index for second subframe (size in bits)
+    uint8_t parity_bit;         ///< parity bit for pitch delay
+    uint8_t gc_1st_index_bits;  ///< gain codebook (first stage) index (size in bits)
+    uint8_t gc_2nd_index_bits;  ///< gain codebook (second stage) index (size in bits)
+    uint8_t fc_signs_bits;      ///< number of pulses in fixed-codebook vector
+    uint8_t fc_indexes_bits;    ///< size (in bits) of fixed-codebook index entry
+} G729FormatDescription;
+
+typedef struct {
+    /// past excitation signal buffer
+    int16_t exc_base[2*SUBFRAME_SIZE+PITCH_DELAY_MAX+INTERPOL_LEN];
+
+    int16_t* exc;               ///< start of past excitation data in buffer
+    int pitch_delay_int_prev;   ///< integer part of previous subframe's pitch delay (4.1.3)
+
+    /// (2.13) LSP quantizer outputs
+    int16_t  past_quantizer_output_buf[MA_NP + 1][10];
+    int16_t* past_quantizer_outputs[MA_NP + 1];
+
+    int16_t lsfq[10];           ///< (2.13) quantized LSF coefficients from previous frame
+    int16_t lsp_buf[2][10];     ///< (0.15) LSP coefficients (previous and current frames) (3.2.5)
+    int16_t *lsp[2];            ///< pointers to lsp_buf
+
+    int16_t quant_energy[4];    ///< (5.10) past quantized energy
+
+    /// previous speech data for LP synthesis filter
+    int16_t syn_filter_data[10];
+
+
+    /// residual signal buffer (used in long-term postfilter)
+    int16_t residual[SUBFRAME_SIZE + RES_PREV_DATA_SIZE];
+
+    /// previous speech data for residual calculation filter
+    int16_t res_filter_data[SUBFRAME_SIZE+10];
+
+    /// previous speech data for short-term postfilter
+    int16_t pos_filter_data[SUBFRAME_SIZE+10];
+
+    /// (1.14) pitch gain of current and five previous subframes
+    int16_t past_gain_pitch[6];
+
+    /// (14.1) gain code from current and previous subframe
+    int16_t past_gain_code[2];
+
+    /// voice decision on previous subframe (0-noise, 1-intermediate, 2-voice), G.729D
+    int16_t voice_decision;
+
+    int16_t onset;              ///< detected onset level (0-2)
+    int16_t was_periodic;       ///< whether previous frame was declared as periodic or not (4.4)
+    int16_t ht_prev_data;       ///< previous data for 4.2.3, equation 86
+    int gain_coeff;             ///< (1.14) gain coefficient (4.2.4)
+    uint16_t rand_value;        ///< random number generator value (4.4.4)
+    int ma_predictor_prev;      ///< switched MA predictor of LSP quantizer from last good frame
+
+    /// (14.14) high-pass filter data (past input)
+    int hpf_f[2];
+
+    /// high-pass filter data (past output)
+    int16_t hpf_z[2];
+}  G729ChannelContext;
+
+typedef struct {
+    AudioDSPContext adsp;
+
+    G729ChannelContext *channel_context;
+} G729Context;
+
+static const G729FormatDescription format_g729_8k = {
+    .ac_index_bits     = {8,5},
+    .parity_bit        = 1,
+    .gc_1st_index_bits = GC_1ST_IDX_BITS_8K,
+    .gc_2nd_index_bits = GC_2ND_IDX_BITS_8K,
+    .fc_signs_bits     = 4,
+    .fc_indexes_bits   = 13,
+};
+
+static const G729FormatDescription format_g729d_6k4 = {
+    .ac_index_bits     = {8,4},
+    .parity_bit        = 0,
+    .gc_1st_index_bits = GC_1ST_IDX_BITS_6K4,
+    .gc_2nd_index_bits = GC_2ND_IDX_BITS_6K4,
+    .fc_signs_bits     = 2,
+    .fc_indexes_bits   = 9,
+};
+
+/**
+ * @brief pseudo random number generator
+ */
+static inline uint16_t g729_prng(uint16_t value)
+{
+    return 31821 * value + 13849;
+}
+
+/**
+ * Decodes LSF (Line Spectral Frequencies) from L0-L3 (3.2.4).
+ * @param[out] lsfq (2.13) quantized LSF coefficients
+ * @param[in,out] past_quantizer_outputs (2.13) quantizer outputs from previous frames
+ * @param ma_predictor switched MA predictor of LSP quantizer
+ * @param vq_1st first stage vector of quantizer
+ * @param vq_2nd_low second stage lower vector of LSP quantizer
+ * @param vq_2nd_high second stage higher vector of LSP quantizer
+ */
+static void lsf_decode(int16_t* lsfq, int16_t* past_quantizer_outputs[MA_NP + 1],
+                       int16_t ma_predictor,
+                       int16_t vq_1st, int16_t vq_2nd_low, int16_t vq_2nd_high)
+{
+    int i,j;
+    static const uint8_t min_distance[2]={10, 5}; //(2.13)
+    int16_t* quantizer_output = past_quantizer_outputs[MA_NP];
+
+    for (i = 0; i < 5; i++) {
+        quantizer_output[i]     = cb_lsp_1st[vq_1st][i    ] + cb_lsp_2nd[vq_2nd_low ][i    ];
+        quantizer_output[i + 5] = cb_lsp_1st[vq_1st][i + 5] + cb_lsp_2nd[vq_2nd_high][i + 5];
+    }
+
+    for (j = 0; j < 2; j++) {
+        for (i = 1; i < 10; i++) {
+            int diff = (quantizer_output[i - 1] - quantizer_output[i] + min_distance[j]) >> 1;
+            if (diff > 0) {
+                quantizer_output[i - 1] -= diff;
+                quantizer_output[i    ] += diff;
+            }
+        }
+    }
+
+    for (i = 0; i < 10; i++) {
+        int sum = quantizer_output[i] * cb_ma_predictor_sum[ma_predictor][i];
+        for (j = 0; j < MA_NP; j++)
+            sum += past_quantizer_outputs[j][i] * cb_ma_predictor[ma_predictor][j][i];
+
+        lsfq[i] = sum >> 15;
+    }
+
+    ff_acelp_reorder_lsf(lsfq, LSFQ_DIFF_MIN, LSFQ_MIN, LSFQ_MAX, 10);
+}
+
+/**
+ * Restores past LSP quantizer output using LSF from previous frame
+ * @param[in,out] lsfq (2.13) quantized LSF coefficients
+ * @param[in,out] past_quantizer_outputs (2.13) quantizer outputs from previous frames
+ * @param ma_predictor_prev MA predictor from previous frame
+ * @param lsfq_prev (2.13) quantized LSF coefficients from previous frame
+ */
+static void lsf_restore_from_previous(int16_t* lsfq,
+                                      int16_t* past_quantizer_outputs[MA_NP + 1],
+                                      int ma_predictor_prev)
+{
+    int16_t* quantizer_output = past_quantizer_outputs[MA_NP];
+    int i,k;
+
+    for (i = 0; i < 10; i++) {
+        int tmp = lsfq[i] << 15;
+
+        for (k = 0; k < MA_NP; k++)
+            tmp -= past_quantizer_outputs[k][i] * cb_ma_predictor[ma_predictor_prev][k][i];
+
+        quantizer_output[i] = ((tmp >> 15) * cb_ma_predictor_sum_inv[ma_predictor_prev][i]) >> 12;
+    }
+}
+
+/**
+ * Constructs new excitation signal and applies phase filter to it
+ * @param[out] out constructed speech signal
+ * @param in original excitation signal
+ * @param fc_cur (2.13) original fixed-codebook vector
+ * @param gain_code (14.1) gain code
+ * @param subframe_size length of the subframe
+ */
+static void g729d_get_new_exc(
+        int16_t* out,
+        const int16_t* in,
+        const int16_t* fc_cur,
+        int dstate,
+        int gain_code,
+        int subframe_size)
+{
+    int i;
+    int16_t fc_new[SUBFRAME_SIZE];
+
+    ff_celp_convolve_circ(fc_new, fc_cur, phase_filter[dstate], subframe_size);
+
+    for (i = 0; i < subframe_size; i++) {
+        out[i]  = in[i];
+        out[i] -= (gain_code * fc_cur[i] + 0x2000) >> 14;
+        out[i] += (gain_code * fc_new[i] + 0x2000) >> 14;
+    }
+}
+
+/**
+ * Makes decision about onset in current subframe
+ * @param past_onset decision result of previous subframe
+ * @param past_gain_code gain code of current and previous subframe
+ *
+ * @return onset decision result for current subframe
+ */
+static int g729d_onset_decision(int past_onset, const int16_t* past_gain_code)
+{
+    if ((past_gain_code[0] >> 1) > past_gain_code[1])
+        return 2;
+
+    return FFMAX(past_onset-1, 0);
+}
+
+/**
+ * Makes decision about voice presence in current subframe
+ * @param onset onset level
+ * @param prev_voice_decision voice decision result from previous subframe
+ * @param past_gain_pitch pitch gain of current and previous subframes
+ *
+ * @return voice decision result for current subframe
+ */
+static int16_t g729d_voice_decision(int onset, int prev_voice_decision, const int16_t* past_gain_pitch)
+{
+    int i, low_gain_pitch_cnt, voice_decision;
+
+    if (past_gain_pitch[0] >= 14745) {       // 0.9
+        voice_decision = DECISION_VOICE;
+    } else if (past_gain_pitch[0] <= 9830) { // 0.6
+        voice_decision = DECISION_NOISE;
+    } else {
+        voice_decision = DECISION_INTERMEDIATE;
+    }
+
+    for (i = 0, low_gain_pitch_cnt = 0; i < 6; i++)
+        if (past_gain_pitch[i] < 9830)
+            low_gain_pitch_cnt++;
+
+    if (low_gain_pitch_cnt > 2 && !onset)
+        voice_decision = DECISION_NOISE;
+
+    if (!onset && voice_decision > prev_voice_decision + 1)
+        voice_decision--;
+
+    if (onset && voice_decision < DECISION_VOICE)
+        voice_decision++;
+
+    return voice_decision;
+}
+
+static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
+{
+    int res = 0;
+
+    while (order--)
+        res += *v1++ * *v2++;
+
+    return res;
+}
+
+static av_cold int decoder_init(AVCodecContext * avctx)
+{
+    G729Context *s = avctx->priv_data;
+    G729ChannelContext *ctx;
+    int c,i,k;
+
+    if (avctx->channels < 1 || avctx->channels > 2) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo are supported (requested channels: %d).\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+
+    /* Both 8kbit/s and 6.4kbit/s modes uses two subframes per frame. */
+    avctx->frame_size = SUBFRAME_SIZE << 1;
+
+    ctx =
+    s->channel_context = av_mallocz(sizeof(G729ChannelContext) * avctx->channels);
+    if (!ctx)
+        return AVERROR(ENOMEM);
+
+    for (c = 0; c < avctx->channels; c++) {
+        ctx->gain_coeff = 16384; // 1.0 in (1.14)
+
+        for (k = 0; k < MA_NP + 1; k++) {
+            ctx->past_quantizer_outputs[k] = ctx->past_quantizer_output_buf[k];
+            for (i = 1; i < 11; i++)
+                ctx->past_quantizer_outputs[k][i - 1] = (18717 * i) >> 3;
+        }
+
+        ctx->lsp[0] = ctx->lsp_buf[0];
+        ctx->lsp[1] = ctx->lsp_buf[1];
+        memcpy(ctx->lsp[0], lsp_init, 10 * sizeof(int16_t));
+
+        ctx->exc = &ctx->exc_base[PITCH_DELAY_MAX+INTERPOL_LEN];
+
+        ctx->pitch_delay_int_prev = PITCH_DELAY_MIN;
+
+        /* random seed initialization */
+        ctx->rand_value = 21845;
+
+        /* quantized prediction error */
+        for (i = 0; i < 4; i++)
+            ctx->quant_energy[i] = -14336; // -14 in (5.10)
+
+        ctx++;
+    }
+
+    ff_audiodsp_init(&s->adsp);
+    s->adsp.scalarproduct_int16 = scalarproduct_int16_c;
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
+                        AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+    int16_t *out_frame;
+    GetBitContext gb;
+    const G729FormatDescription *format;
+    int c, i;
+    int16_t *tmp;
+    G729Formats packet_type;
+    G729Context *s = avctx->priv_data;
+    G729ChannelContext *ctx = s->channel_context;
+    int16_t lp[2][11];           // (3.12)
+    uint8_t ma_predictor;     ///< switched MA predictor of LSP quantizer
+    uint8_t quantizer_1st;    ///< first stage vector of quantizer
+    uint8_t quantizer_2nd_lo; ///< second stage lower vector of quantizer (size in bits)
+    uint8_t quantizer_2nd_hi; ///< second stage higher vector of quantizer (size in bits)
+
+    int pitch_delay_int[2];      // pitch delay, integer part
+    int pitch_delay_3x;          // pitch delay, multiplied by 3
+    int16_t fc[SUBFRAME_SIZE];   // fixed-codebook vector
+    int16_t synth[SUBFRAME_SIZE+10]; // fixed-codebook vector
+    int j, ret;
+    int gain_before, gain_after;
+    AVFrame *frame = data;
+
+    frame->nb_samples = SUBFRAME_SIZE<<1;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    if (buf_size % (G729_8K_BLOCK_SIZE * avctx->channels) == 0) {
+        packet_type = FORMAT_G729_8K;
+        format = &format_g729_8k;
+        //Reset voice decision
+        ctx->onset = 0;
+        ctx->voice_decision = DECISION_VOICE;
+        av_log(avctx, AV_LOG_DEBUG, "Packet type: %s\n", "G.729 @ 8kbit/s");
+    } else if (buf_size == G729D_6K4_BLOCK_SIZE * avctx->channels) {
+        packet_type = FORMAT_G729D_6K4;
+        format = &format_g729d_6k4;
+        av_log(avctx, AV_LOG_DEBUG, "Packet type: %s\n", "G.729D @ 6.4kbit/s");
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Packet size %d is unknown.\n", buf_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (c = 0; c < avctx->channels; c++) {
+        int frame_erasure = 0; ///< frame erasure detected during decoding
+        int bad_pitch = 0;     ///< parity check failed
+        int is_periodic = 0;   ///< whether one of the subframes is declared as periodic or not
+        out_frame = (int16_t*)frame->data[c];
+
+        for (i = 0; i < buf_size; i++)
+            frame_erasure |= buf[i];
+        frame_erasure = !frame_erasure;
+
+        init_get_bits(&gb, buf, 8*buf_size);
+
+        ma_predictor     = get_bits(&gb, 1);
+        quantizer_1st    = get_bits(&gb, VQ_1ST_BITS);
+        quantizer_2nd_lo = get_bits(&gb, VQ_2ND_BITS);
+        quantizer_2nd_hi = get_bits(&gb, VQ_2ND_BITS);
+
+        if (frame_erasure) {
+            lsf_restore_from_previous(ctx->lsfq, ctx->past_quantizer_outputs,
+                                      ctx->ma_predictor_prev);
+        } else {
+            lsf_decode(ctx->lsfq, ctx->past_quantizer_outputs,
+                       ma_predictor,
+                       quantizer_1st, quantizer_2nd_lo, quantizer_2nd_hi);
+            ctx->ma_predictor_prev = ma_predictor;
+        }
+
+        tmp = ctx->past_quantizer_outputs[MA_NP];
+        memmove(ctx->past_quantizer_outputs + 1, ctx->past_quantizer_outputs,
+                MA_NP * sizeof(int16_t*));
+        ctx->past_quantizer_outputs[0] = tmp;
+
+        ff_acelp_lsf2lsp(ctx->lsp[1], ctx->lsfq, 10);
+
+        ff_acelp_lp_decode(&lp[0][0], &lp[1][0], ctx->lsp[1], ctx->lsp[0], 10);
+
+        FFSWAP(int16_t*, ctx->lsp[1], ctx->lsp[0]);
+
+        for (i = 0; i < 2; i++) {
+            int gain_corr_factor;
+
+            uint8_t ac_index;      ///< adaptive codebook index
+            uint8_t pulses_signs;  ///< fixed-codebook vector pulse signs
+            int fc_indexes;        ///< fixed-codebook indexes
+            uint8_t gc_1st_index;  ///< gain codebook (first stage) index
+            uint8_t gc_2nd_index;  ///< gain codebook (second stage) index
+
+            ac_index      = get_bits(&gb, format->ac_index_bits[i]);
+            if (!i && format->parity_bit)
+                bad_pitch = av_parity(ac_index >> 2) == get_bits1(&gb);
+            fc_indexes    = get_bits(&gb, format->fc_indexes_bits);
+            pulses_signs  = get_bits(&gb, format->fc_signs_bits);
+            gc_1st_index  = get_bits(&gb, format->gc_1st_index_bits);
+            gc_2nd_index  = get_bits(&gb, format->gc_2nd_index_bits);
+
+            if (frame_erasure) {
+                pitch_delay_3x = 3 * ctx->pitch_delay_int_prev;
+            } else if (!i) {
+                if (bad_pitch) {
+                    pitch_delay_3x = 3 * ctx->pitch_delay_int_prev;
+                } else {
+                    pitch_delay_3x = ff_acelp_decode_8bit_to_1st_delay3(ac_index);
+                }
+            } else {
+                int pitch_delay_min = av_clip(ctx->pitch_delay_int_prev - 5,
+                                              PITCH_DELAY_MIN, PITCH_DELAY_MAX - 9);
+
+                if (packet_type == FORMAT_G729D_6K4) {
+                    pitch_delay_3x = ff_acelp_decode_4bit_to_2nd_delay3(ac_index, pitch_delay_min);
+                } else {
+                    pitch_delay_3x = ff_acelp_decode_5_6_bit_to_2nd_delay3(ac_index, pitch_delay_min);
+                }
+            }
+
+            /* Round pitch delay to nearest (used everywhere except ff_acelp_interpolate). */
+            pitch_delay_int[i]  = (pitch_delay_3x + 1) / 3;
+            if (pitch_delay_int[i] > PITCH_DELAY_MAX) {
+                av_log(avctx, AV_LOG_WARNING, "pitch_delay_int %d is too large\n", pitch_delay_int[i]);
+                pitch_delay_int[i] = PITCH_DELAY_MAX;
+            }
+
+            if (frame_erasure) {
+                ctx->rand_value = g729_prng(ctx->rand_value);
+                fc_indexes   = av_mod_uintp2(ctx->rand_value, format->fc_indexes_bits);
+
+                ctx->rand_value = g729_prng(ctx->rand_value);
+                pulses_signs = ctx->rand_value;
+            }
+
+
+            memset(fc, 0, sizeof(int16_t) * SUBFRAME_SIZE);
+            switch (packet_type) {
+                case FORMAT_G729_8K:
+                    ff_acelp_fc_pulse_per_track(fc, ff_fc_4pulses_8bits_tracks_13,
+                                                ff_fc_4pulses_8bits_track_4,
+                                                fc_indexes, pulses_signs, 3, 3);
+                    break;
+                case FORMAT_G729D_6K4:
+                    ff_acelp_fc_pulse_per_track(fc, ff_fc_2pulses_9bits_track1_gray,
+                                                ff_fc_2pulses_9bits_track2_gray,
+                                                fc_indexes, pulses_signs, 1, 4);
+                    break;
+            }
+
+            /*
+              This filter enhances harmonic components of the fixed-codebook vector to
+              improve the quality of the reconstructed speech.
+
+                         / fc_v[i],                                    i < pitch_delay
+              fc_v[i] = <
+                         \ fc_v[i] + gain_pitch * fc_v[i-pitch_delay], i >= pitch_delay
+            */
+            ff_acelp_weighted_vector_sum(fc + pitch_delay_int[i],
+                                         fc + pitch_delay_int[i],
+                                         fc, 1 << 14,
+                                         av_clip(ctx->past_gain_pitch[0], SHARP_MIN, SHARP_MAX),
+                                         0, 14,
+                                         SUBFRAME_SIZE - pitch_delay_int[i]);
+
+            memmove(ctx->past_gain_pitch+1, ctx->past_gain_pitch, 5 * sizeof(int16_t));
+            ctx->past_gain_code[1] = ctx->past_gain_code[0];
+
+            if (frame_erasure) {
+                ctx->past_gain_pitch[0] = (29491 * ctx->past_gain_pitch[0]) >> 15; // 0.90 (0.15)
+                ctx->past_gain_code[0]  = ( 2007 * ctx->past_gain_code[0] ) >> 11; // 0.98 (0.11)
+
+                gain_corr_factor = 0;
+            } else {
+                if (packet_type == FORMAT_G729D_6K4) {
+                    ctx->past_gain_pitch[0]  = cb_gain_1st_6k4[gc_1st_index][0] +
+                                               cb_gain_2nd_6k4[gc_2nd_index][0];
+                    gain_corr_factor = cb_gain_1st_6k4[gc_1st_index][1] +
+                                       cb_gain_2nd_6k4[gc_2nd_index][1];
+
+                    /* Without check below overflow can occur in ff_acelp_update_past_gain.
+                       It is not issue for G.729, because gain_corr_factor in it's case is always
+                       greater than 1024, while in G.729D it can be even zero. */
+                    gain_corr_factor = FFMAX(gain_corr_factor, 1024);
+    #ifndef G729_BITEXACT
+                    gain_corr_factor >>= 1;
+    #endif
+                } else {
+                    ctx->past_gain_pitch[0]  = cb_gain_1st_8k[gc_1st_index][0] +
+                                               cb_gain_2nd_8k[gc_2nd_index][0];
+                    gain_corr_factor = cb_gain_1st_8k[gc_1st_index][1] +
+                                       cb_gain_2nd_8k[gc_2nd_index][1];
+                }
+
+                /* Decode the fixed-codebook gain. */
+                ctx->past_gain_code[0] = ff_acelp_decode_gain_code(&s->adsp, gain_corr_factor,
+                                                                   fc, MR_ENERGY,
+                                                                   ctx->quant_energy,
+                                                                   ma_prediction_coeff,
+                                                                   SUBFRAME_SIZE, 4);
+    #ifdef G729_BITEXACT
+                /*
+                  This correction required to get bit-exact result with
+                  reference code, because gain_corr_factor in G.729D is
+                  two times larger than in original G.729.
+
+                  If bit-exact result is not issue then gain_corr_factor
+                  can be simpler divided by 2 before call to g729_get_gain_code
+                  instead of using correction below.
+                */
+                if (packet_type == FORMAT_G729D_6K4) {
+                    gain_corr_factor >>= 1;
+                    ctx->past_gain_code[0] >>= 1;
+                }
+    #endif
+            }
+            ff_acelp_update_past_gain(ctx->quant_energy, gain_corr_factor, 2, frame_erasure);
+
+            /* Routine requires rounding to lowest. */
+            ff_acelp_interpolate(ctx->exc + i * SUBFRAME_SIZE,
+                                 ctx->exc + i * SUBFRAME_SIZE - pitch_delay_3x / 3,
+                                 ff_acelp_interp_filter, 6,
+                                 (pitch_delay_3x % 3) << 1,
+                                 10, SUBFRAME_SIZE);
+
+            ff_acelp_weighted_vector_sum(ctx->exc + i * SUBFRAME_SIZE,
+                                         ctx->exc + i * SUBFRAME_SIZE, fc,
+                                         (!ctx->was_periodic && frame_erasure) ? 0 : ctx->past_gain_pitch[0],
+                                         ( ctx->was_periodic && frame_erasure) ? 0 : ctx->past_gain_code[0],
+                                         1 << 13, 14, SUBFRAME_SIZE);
+
+            memcpy(synth, ctx->syn_filter_data, 10 * sizeof(int16_t));
+
+            if (ff_celp_lp_synthesis_filter(
+                synth+10,
+                &lp[i][1],
+                ctx->exc  + i * SUBFRAME_SIZE,
+                SUBFRAME_SIZE,
+                10,
+                1,
+                0,
+                0x800))
+                /* Overflow occurred, downscale excitation signal... */
+                for (j = 0; j < 2 * SUBFRAME_SIZE + PITCH_DELAY_MAX + INTERPOL_LEN; j++)
+                    ctx->exc_base[j] >>= 2;
+
+            /* ... and make synthesis again. */
+            if (packet_type == FORMAT_G729D_6K4) {
+                int16_t exc_new[SUBFRAME_SIZE];
+
+                ctx->onset = g729d_onset_decision(ctx->onset, ctx->past_gain_code);
+                ctx->voice_decision = g729d_voice_decision(ctx->onset, ctx->voice_decision, ctx->past_gain_pitch);
+
+                g729d_get_new_exc(exc_new, ctx->exc  + i * SUBFRAME_SIZE, fc, ctx->voice_decision, ctx->past_gain_code[0], SUBFRAME_SIZE);
+
+                ff_celp_lp_synthesis_filter(
+                        synth+10,
+                        &lp[i][1],
+                        exc_new,
+                        SUBFRAME_SIZE,
+                        10,
+                        0,
+                        0,
+                        0x800);
+            } else {
+                ff_celp_lp_synthesis_filter(
+                        synth+10,
+                        &lp[i][1],
+                        ctx->exc  + i * SUBFRAME_SIZE,
+                        SUBFRAME_SIZE,
+                        10,
+                        0,
+                        0,
+                        0x800);
+            }
+            /* Save data (without postfilter) for use in next subframe. */
+            memcpy(ctx->syn_filter_data, synth+SUBFRAME_SIZE, 10 * sizeof(int16_t));
+
+            /* Calculate gain of unfiltered signal for use in AGC. */
+            gain_before = 0;
+            for (j = 0; j < SUBFRAME_SIZE; j++)
+                gain_before += FFABS(synth[j+10]);
+
+            /* Call postfilter and also update voicing decision for use in next frame. */
+            ff_g729_postfilter(
+                    &s->adsp,
+                    &ctx->ht_prev_data,
+                    &is_periodic,
+                    &lp[i][0],
+                    pitch_delay_int[0],
+                    ctx->residual,
+                    ctx->res_filter_data,
+                    ctx->pos_filter_data,
+                    synth+10,
+                    SUBFRAME_SIZE);
+
+            /* Calculate gain of filtered signal for use in AGC. */
+            gain_after = 0;
+            for (j = 0; j < SUBFRAME_SIZE; j++)
+                gain_after += FFABS(synth[j+10]);
+
+            ctx->gain_coeff = ff_g729_adaptive_gain_control(
+                    gain_before,
+                    gain_after,
+                    synth+10,
+                    SUBFRAME_SIZE,
+                    ctx->gain_coeff);
+
+            if (frame_erasure) {
+                ctx->pitch_delay_int_prev = FFMIN(ctx->pitch_delay_int_prev + 1, PITCH_DELAY_MAX);
+            } else {
+                ctx->pitch_delay_int_prev = pitch_delay_int[i];
+            }
+
+            memcpy(synth+8, ctx->hpf_z, 2*sizeof(int16_t));
+            ff_acelp_high_pass_filter(
+                    out_frame + i*SUBFRAME_SIZE,
+                    ctx->hpf_f,
+                    synth+10,
+                    SUBFRAME_SIZE);
+            memcpy(ctx->hpf_z, synth+8+SUBFRAME_SIZE, 2*sizeof(int16_t));
+        }
+
+        ctx->was_periodic = is_periodic;
+
+        /* Save signal for use in next frame. */
+        memmove(ctx->exc_base, ctx->exc_base + 2 * SUBFRAME_SIZE, (PITCH_DELAY_MAX+INTERPOL_LEN)*sizeof(int16_t));
+
+        buf += packet_type == FORMAT_G729_8K ? G729_8K_BLOCK_SIZE : G729D_6K4_BLOCK_SIZE;
+        ctx++;
+    }
+
+    *got_frame_ptr = 1;
+    return packet_type == FORMAT_G729_8K ? G729_8K_BLOCK_SIZE * avctx->channels : G729D_6K4_BLOCK_SIZE * avctx->channels;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    G729Context *s = avctx->priv_data;
+    av_freep(&s->channel_context);
+
+    return 0;
+}
+
+AVCodec ff_g729_decoder = {
+    .name           = "g729",
+    .long_name      = NULL_IF_CONFIG_SMALL("G.729"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_G729,
+    .priv_data_size = sizeof(G729Context),
+    .init           = decoder_init,
+    .decode         = decode_frame,
+    .close          = decode_close,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/g729postfilter.c b/libavcodec/g729postfilter.c
new file mode 100644
index 0000000..d9076ec
--- /dev/null
+++ b/libavcodec/g729postfilter.c
@@ -0,0 +1,614 @@
+/*
+ * G.729, G729 Annex D postfilter
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <inttypes.h>
+#include <limits.h>
+
+#include "avcodec.h"
+#include "g729.h"
+#include "acelp_pitch_delay.h"
+#include "g729postfilter.h"
+#include "celp_math.h"
+#include "acelp_filters.h"
+#include "acelp_vectors.h"
+#include "celp_filters.h"
+
+#define FRAC_BITS 15
+#include "mathops.h"
+
+/**
+ * short interpolation filter (of length 33, according to spec)
+ * for computing signal with non-integer delay
+ */
+static const int16_t ff_g729_interp_filt_short[(ANALYZED_FRAC_DELAYS+1)*SHORT_INT_FILT_LEN] = {
+      0, 31650, 28469, 23705, 18050, 12266,  7041,  2873,
+      0, -1597, -2147, -1992, -1492,  -933,  -484,  -188,
+};
+
+/**
+ * long interpolation filter (of length 129, according to spec)
+ * for computing signal with non-integer delay
+ */
+static const int16_t ff_g729_interp_filt_long[(ANALYZED_FRAC_DELAYS+1)*LONG_INT_FILT_LEN] = {
+   0, 31915, 29436, 25569, 20676, 15206,  9639,  4439,
+   0, -3390, -5579, -6549, -6414, -5392, -3773, -1874,
+   0,  1595,  2727,  3303,  3319,  2850,  2030,  1023,
+   0,  -887, -1527, -1860, -1876, -1614, -1150,  -579,
+   0,   501,   859,  1041,  1044,   892,   631,   315,
+   0,  -266,  -453,  -543,  -538,  -455,  -317,  -156,
+   0,   130,   218,   258,   253,   212,   147,    72,
+   0,   -59,  -101,  -122,  -123,  -106,   -77,   -40,
+};
+
+/**
+ * formant_pp_factor_num_pow[i] = FORMANT_PP_FACTOR_NUM^(i+1)
+ */
+static const int16_t formant_pp_factor_num_pow[10]= {
+  /* (0.15) */
+  18022, 9912, 5451, 2998, 1649, 907, 499, 274, 151, 83
+};
+
+/**
+ * formant_pp_factor_den_pow[i] = FORMANT_PP_FACTOR_DEN^(i+1)
+ */
+static const int16_t formant_pp_factor_den_pow[10] = {
+  /* (0.15) */
+  22938, 16057, 11240, 7868, 5508, 3856, 2699, 1889, 1322, 925
+};
+
+/**
+ * \brief Residual signal calculation (4.2.1 if G.729)
+ * \param out [out] output data filtered through A(z/FORMANT_PP_FACTOR_NUM)
+ * \param filter_coeffs (3.12) A(z/FORMANT_PP_FACTOR_NUM) filter coefficients
+ * \param in input speech data to process
+ * \param subframe_size size of one subframe
+ *
+ * \note in buffer must contain 10 items of previous speech data before top of the buffer
+ * \remark It is safe to pass the same buffer for input and output.
+ */
+static void residual_filter(int16_t* out, const int16_t* filter_coeffs, const int16_t* in,
+                            int subframe_size)
+{
+    int i, n;
+
+    for (n = subframe_size - 1; n >= 0; n--) {
+        int sum = 0x800;
+        for (i = 0; i < 10; i++)
+            sum += filter_coeffs[i] * in[n - i - 1];
+
+        out[n] = in[n] + (sum >> 12);
+    }
+}
+
+/**
+ * \brief long-term postfilter (4.2.1)
+ * \param dsp initialized DSP context
+ * \param pitch_delay_int integer part of the pitch delay in the first subframe
+ * \param residual filtering input data
+ * \param residual_filt [out] speech signal with applied A(z/FORMANT_PP_FACTOR_NUM) filter
+ * \param subframe_size size of subframe
+ *
+ * \return 0 if long-term prediction gain is less than 3dB, 1 -  otherwise
+ */
+static int16_t long_term_filter(AudioDSPContext *adsp, int pitch_delay_int,
+                                const int16_t* residual, int16_t *residual_filt,
+                                int subframe_size)
+{
+    int i, k, tmp, tmp2;
+    int sum;
+    int L_temp0;
+    int L_temp1;
+    int64_t L64_temp0;
+    int64_t L64_temp1;
+    int16_t shift;
+    int corr_int_num, corr_int_den;
+
+    int ener;
+    int16_t sh_ener;
+
+    int16_t gain_num,gain_den; //selected signal's gain numerator and denominator
+    int16_t sh_gain_num, sh_gain_den;
+    int gain_num_square;
+
+    int16_t gain_long_num,gain_long_den; //filtered through long interpolation filter signal's gain numerator and denominator
+    int16_t sh_gain_long_num, sh_gain_long_den;
+
+    int16_t best_delay_int, best_delay_frac;
+
+    int16_t delayed_signal_offset;
+    int lt_filt_factor_a, lt_filt_factor_b;
+
+    int16_t * selected_signal;
+    const int16_t * selected_signal_const; //Necessary to avoid compiler warning
+
+    int16_t sig_scaled[SUBFRAME_SIZE + RES_PREV_DATA_SIZE];
+    int16_t delayed_signal[ANALYZED_FRAC_DELAYS][SUBFRAME_SIZE+1];
+    int corr_den[ANALYZED_FRAC_DELAYS][2];
+
+    tmp = 0;
+    for(i=0; i<subframe_size + RES_PREV_DATA_SIZE; i++)
+        tmp |= FFABS(residual[i]);
+
+    if(!tmp)
+        shift = 3;
+    else
+        shift = av_log2(tmp) - 11;
+
+    if (shift > 0)
+        for (i = 0; i < subframe_size + RES_PREV_DATA_SIZE; i++)
+            sig_scaled[i] = residual[i] >> shift;
+    else
+        for (i = 0; i < subframe_size + RES_PREV_DATA_SIZE; i++)
+            sig_scaled[i] = residual[i] << -shift;
+
+    /* Start of best delay searching code */
+    gain_num = 0;
+
+    ener = adsp->scalarproduct_int16(sig_scaled + RES_PREV_DATA_SIZE,
+                                    sig_scaled + RES_PREV_DATA_SIZE,
+                                    subframe_size);
+    if (ener) {
+        sh_ener = av_log2(ener) - 14;
+        sh_ener = FFMAX(sh_ener, 0);
+        ener >>= sh_ener;
+        /* Search for best pitch delay.
+
+                       sum{ r(n) * r(k,n) ] }^2
+           R'(k)^2 := -------------------------
+                       sum{ r(k,n) * r(k,n) }
+
+
+           R(T)    :=  sum{ r(n) * r(n-T) ] }
+
+
+           where
+           r(n-T) is integer delayed signal with delay T
+           r(k,n) is non-integer delayed signal with integer delay best_delay
+           and fractional delay k */
+
+        /* Find integer delay best_delay which maximizes correlation R(T).
+
+           This is also equals to numerator of R'(0),
+           since the fine search (second step) is done with 1/8
+           precision around best_delay. */
+        corr_int_num = 0;
+        best_delay_int = pitch_delay_int - 1;
+        for (i = pitch_delay_int - 1; i <= pitch_delay_int + 1; i++) {
+            sum = adsp->scalarproduct_int16(sig_scaled + RES_PREV_DATA_SIZE,
+                                           sig_scaled + RES_PREV_DATA_SIZE - i,
+                                           subframe_size);
+            if (sum > corr_int_num) {
+                corr_int_num = sum;
+                best_delay_int = i;
+            }
+        }
+        if (corr_int_num) {
+            /* Compute denominator of pseudo-normalized correlation R'(0). */
+            corr_int_den = adsp->scalarproduct_int16(sig_scaled - best_delay_int + RES_PREV_DATA_SIZE,
+                                                    sig_scaled - best_delay_int + RES_PREV_DATA_SIZE,
+                                                    subframe_size);
+
+            /* Compute signals with non-integer delay k (with 1/8 precision),
+               where k is in [0;6] range.
+               Entire delay is qual to best_delay+(k+1)/8
+               This is archieved by applying an interpolation filter of
+               legth 33 to source signal. */
+            for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                ff_acelp_interpolate(&delayed_signal[k][0],
+                                     &sig_scaled[RES_PREV_DATA_SIZE - best_delay_int],
+                                     ff_g729_interp_filt_short,
+                                     ANALYZED_FRAC_DELAYS+1,
+                                     8 - k - 1,
+                                     SHORT_INT_FILT_LEN,
+                                     subframe_size + 1);
+            }
+
+            /* Compute denominator of pseudo-normalized correlation R'(k).
+
+                 corr_den[k][0] is square root of R'(k) denominator, for int(T) == int(T0)
+                 corr_den[k][1] is square root of R'(k) denominator, for int(T) == int(T0)+1
+
+              Also compute maximum value of above denominators over all k. */
+            tmp = corr_int_den;
+            for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                sum = adsp->scalarproduct_int16(&delayed_signal[k][1],
+                                               &delayed_signal[k][1],
+                                               subframe_size - 1);
+                corr_den[k][0] = sum + delayed_signal[k][0            ] * delayed_signal[k][0            ];
+                corr_den[k][1] = sum + delayed_signal[k][subframe_size] * delayed_signal[k][subframe_size];
+
+                tmp = FFMAX3(tmp, corr_den[k][0], corr_den[k][1]);
+            }
+
+            sh_gain_den = av_log2(tmp) - 14;
+            if (sh_gain_den >= 0) {
+
+                sh_gain_num =  FFMAX(sh_gain_den, sh_ener);
+                /* Loop through all k and find delay that maximizes
+                   R'(k) correlation.
+                   Search is done in [int(T0)-1; intT(0)+1] range
+                   with 1/8 precision. */
+                delayed_signal_offset = 1;
+                best_delay_frac = 0;
+                gain_den = corr_int_den >> sh_gain_den;
+                gain_num = corr_int_num >> sh_gain_num;
+                gain_num_square = gain_num * gain_num;
+                for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                    for (i = 0; i < 2; i++) {
+                        int16_t gain_num_short, gain_den_short;
+                        int gain_num_short_square;
+                        /* Compute numerator of pseudo-normalized
+                           correlation R'(k). */
+                        sum = adsp->scalarproduct_int16(&delayed_signal[k][i],
+                                                       sig_scaled + RES_PREV_DATA_SIZE,
+                                                       subframe_size);
+                        gain_num_short = FFMAX(sum >> sh_gain_num, 0);
+
+                        /*
+                                      gain_num_short_square                gain_num_square
+                           R'(T)^2 = -----------------------, max R'(T)^2= --------------
+                                           den                                 gain_den
+                        */
+                        gain_num_short_square = gain_num_short * gain_num_short;
+                        gain_den_short = corr_den[k][i] >> sh_gain_den;
+
+                        tmp = MULL(gain_num_short_square, gain_den, FRAC_BITS);
+                        tmp2 = MULL(gain_num_square, gain_den_short, FRAC_BITS);
+
+                        // R'(T)^2 > max R'(T)^2
+                        if (tmp > tmp2) {
+                            gain_num = gain_num_short;
+                            gain_den = gain_den_short;
+                            gain_num_square = gain_num_short_square;
+                            delayed_signal_offset = i;
+                            best_delay_frac = k + 1;
+                        }
+                    }
+                }
+
+                /*
+                       R'(T)^2
+                  2 * --------- < 1
+                        R(0)
+                */
+                L64_temp0 =  (int64_t)gain_num_square  << ((sh_gain_num << 1) + 1);
+                L64_temp1 = ((int64_t)gain_den * ener) << (sh_gain_den + sh_ener);
+                if (L64_temp0 < L64_temp1)
+                    gain_num = 0;
+            } // if(sh_gain_den >= 0)
+        } // if(corr_int_num)
+    } // if(ener)
+    /* End of best delay searching code  */
+
+    if (!gain_num) {
+        memcpy(residual_filt, residual + RES_PREV_DATA_SIZE, subframe_size * sizeof(int16_t));
+
+        /* Long-term prediction gain is less than 3dB. Long-term postfilter is disabled. */
+        return 0;
+    }
+    if (best_delay_frac) {
+        /* Recompute delayed signal with an interpolation filter of length 129. */
+        ff_acelp_interpolate(residual_filt,
+                             &sig_scaled[RES_PREV_DATA_SIZE - best_delay_int + delayed_signal_offset],
+                             ff_g729_interp_filt_long,
+                             ANALYZED_FRAC_DELAYS + 1,
+                             8 - best_delay_frac,
+                             LONG_INT_FILT_LEN,
+                             subframe_size + 1);
+        /* Compute R'(k) correlation's numerator. */
+        sum = adsp->scalarproduct_int16(residual_filt,
+                                       sig_scaled + RES_PREV_DATA_SIZE,
+                                       subframe_size);
+
+        if (sum < 0) {
+            gain_long_num = 0;
+            sh_gain_long_num = 0;
+        } else {
+            tmp = av_log2(sum) - 14;
+            tmp = FFMAX(tmp, 0);
+            sum >>= tmp;
+            gain_long_num = sum;
+            sh_gain_long_num = tmp;
+        }
+
+        /* Compute R'(k) correlation's denominator. */
+        sum = adsp->scalarproduct_int16(residual_filt, residual_filt, subframe_size);
+
+        tmp = av_log2(sum) - 14;
+        tmp = FFMAX(tmp, 0);
+        sum >>= tmp;
+        gain_long_den = sum;
+        sh_gain_long_den = tmp;
+
+        /* Select between original and delayed signal.
+           Delayed signal will be selected if it increases R'(k)
+           correlation. */
+        L_temp0 = gain_num * gain_num;
+        L_temp0 = MULL(L_temp0, gain_long_den, FRAC_BITS);
+
+        L_temp1 = gain_long_num * gain_long_num;
+        L_temp1 = MULL(L_temp1, gain_den, FRAC_BITS);
+
+        tmp = ((sh_gain_long_num - sh_gain_num) << 1) - (sh_gain_long_den - sh_gain_den);
+        if (tmp > 0)
+            L_temp0 >>= tmp;
+        else
+            L_temp1 >>= -tmp;
+
+        /* Check if longer filter increases the values of R'(k). */
+        if (L_temp1 > L_temp0) {
+            /* Select long filter. */
+            selected_signal = residual_filt;
+            gain_num = gain_long_num;
+            gain_den = gain_long_den;
+            sh_gain_num = sh_gain_long_num;
+            sh_gain_den = sh_gain_long_den;
+        } else
+            /* Select short filter. */
+            selected_signal = &delayed_signal[best_delay_frac-1][delayed_signal_offset];
+
+        /* Rescale selected signal to original value. */
+        if (shift > 0)
+            for (i = 0; i < subframe_size; i++)
+                selected_signal[i] <<= shift;
+        else
+            for (i = 0; i < subframe_size; i++)
+                selected_signal[i] >>= -shift;
+
+        /* necessary to avoid compiler warning */
+        selected_signal_const = selected_signal;
+    } // if(best_delay_frac)
+    else
+        selected_signal_const = residual + RES_PREV_DATA_SIZE - (best_delay_int + 1 - delayed_signal_offset);
+#ifdef G729_BITEXACT
+    tmp = sh_gain_num - sh_gain_den;
+    if (tmp > 0)
+        gain_den >>= tmp;
+    else
+        gain_num >>= -tmp;
+
+    if (gain_num > gain_den)
+        lt_filt_factor_a = MIN_LT_FILT_FACTOR_A;
+    else {
+        gain_num >>= 2;
+        gain_den >>= 1;
+        lt_filt_factor_a = (gain_den << 15) / (gain_den + gain_num);
+    }
+#else
+    L64_temp0 = (((int64_t)gain_num) << sh_gain_num) >> 1;
+    L64_temp1 = ((int64_t)gain_den) << sh_gain_den;
+    lt_filt_factor_a = FFMAX((L64_temp1 << 15) / (L64_temp1 + L64_temp0), MIN_LT_FILT_FACTOR_A);
+#endif
+
+    /* Filter through selected filter. */
+    lt_filt_factor_b = 32767 - lt_filt_factor_a + 1;
+
+    ff_acelp_weighted_vector_sum(residual_filt, residual + RES_PREV_DATA_SIZE,
+                                 selected_signal_const,
+                                 lt_filt_factor_a, lt_filt_factor_b,
+                                 1<<14, 15, subframe_size);
+
+    // Long-term prediction gain is larger than 3dB.
+    return 1;
+}
+
+/**
+ * \brief Calculate reflection coefficient for tilt compensation filter (4.2.3).
+ * \param dsp initialized DSP context
+ * \param lp_gn (3.12) coefficients of A(z/FORMANT_PP_FACTOR_NUM) filter
+ * \param lp_gd (3.12) coefficients of A(z/FORMANT_PP_FACTOR_DEN) filter
+ * \param speech speech to update
+ * \param subframe_size size of subframe
+ *
+ * \return (3.12) reflection coefficient
+ *
+ * \remark The routine also calculates the gain term for the short-term
+ *         filter (gf) and multiplies the speech data by 1/gf.
+ *
+ * \note All members of lp_gn, except 10-19 must be equal to zero.
+ */
+static int16_t get_tilt_comp(AudioDSPContext *adsp, int16_t *lp_gn,
+                             const int16_t *lp_gd, int16_t* speech,
+                             int subframe_size)
+{
+    int rh1,rh0; // (3.12)
+    int temp;
+    int i;
+    int gain_term;
+
+    lp_gn[10] = 4096; //1.0 in (3.12)
+
+    /* Apply 1/A(z/FORMANT_PP_FACTOR_DEN) filter to hf. */
+    ff_celp_lp_synthesis_filter(lp_gn + 11, lp_gd + 1, lp_gn + 11, 22, 10, 0, 0, 0x800);
+    /* Now lp_gn (starting with 10) contains impulse response
+       of A(z/FORMANT_PP_FACTOR_NUM)/A(z/FORMANT_PP_FACTOR_DEN) filter. */
+
+    rh0 = adsp->scalarproduct_int16(lp_gn + 10, lp_gn + 10, 20);
+    rh1 = adsp->scalarproduct_int16(lp_gn + 10, lp_gn + 11, 20);
+
+    /* downscale to avoid overflow */
+    temp = av_log2(rh0) - 14;
+    if (temp > 0) {
+        rh0 >>= temp;
+        rh1 >>= temp;
+    }
+
+    if (FFABS(rh1) > rh0 || !rh0)
+        return 0;
+
+    gain_term = 0;
+    for (i = 0; i < 20; i++)
+        gain_term += FFABS(lp_gn[i + 10]);
+    gain_term >>= 2; // (3.12) -> (5.10)
+
+    if (gain_term > 0x400) { // 1.0 in (5.10)
+        temp = 0x2000000 / gain_term; // 1.0/gain_term in (0.15)
+        for (i = 0; i < subframe_size; i++)
+            speech[i] = (speech[i] * temp + 0x4000) >> 15;
+    }
+
+    return -(rh1 << 15) / rh0;
+}
+
+/**
+ * \brief Apply tilt compensation filter (4.2.3).
+ * \param res_pst [in/out] residual signal (partially filtered)
+ * \param k1 (3.12) reflection coefficient
+ * \param subframe_size size of subframe
+ * \param ht_prev_data previous data for 4.2.3, equation 86
+ *
+ * \return new value for ht_prev_data
+*/
+static int16_t apply_tilt_comp(int16_t* out, int16_t* res_pst, int refl_coeff,
+                               int subframe_size, int16_t ht_prev_data)
+{
+    int tmp, tmp2;
+    int i;
+    int gt, ga;
+    int fact, sh_fact;
+
+    if (refl_coeff > 0) {
+        gt = (refl_coeff * G729_TILT_FACTOR_PLUS + 0x4000) >> 15;
+        fact = 0x4000; // 0.5 in (0.15)
+        sh_fact = 15;
+    } else {
+        gt = (refl_coeff * G729_TILT_FACTOR_MINUS + 0x4000) >> 15;
+        fact = 0x800; // 0.5 in (3.12)
+        sh_fact = 12;
+    }
+    ga = (fact << 15) / av_clip_int16(32768 - FFABS(gt));
+    gt >>= 1;
+
+    /* Apply tilt compensation filter to signal. */
+    tmp = res_pst[subframe_size - 1];
+
+    for (i = subframe_size - 1; i >= 1; i--) {
+        tmp2 = (res_pst[i] << 15) + ((gt * res_pst[i-1]) << 1);
+        tmp2 = (tmp2 + 0x4000) >> 15;
+
+        tmp2 = (tmp2 * ga * 2 + fact) >> sh_fact;
+        out[i] = tmp2;
+    }
+    tmp2 = (res_pst[0] << 15) + ((gt * ht_prev_data) << 1);
+    tmp2 = (tmp2 + 0x4000) >> 15;
+    tmp2 = (tmp2 * ga * 2 + fact) >> sh_fact;
+    out[0] = tmp2;
+
+    return tmp;
+}
+
+void ff_g729_postfilter(AudioDSPContext *adsp, int16_t* ht_prev_data, int* voicing,
+                     const int16_t *lp_filter_coeffs, int pitch_delay_int,
+                     int16_t* residual, int16_t* res_filter_data,
+                     int16_t* pos_filter_data, int16_t *speech, int subframe_size)
+{
+    int16_t residual_filt_buf[SUBFRAME_SIZE+11];
+    int16_t lp_gn[33]; // (3.12)
+    int16_t lp_gd[11]; // (3.12)
+    int tilt_comp_coeff;
+    int i;
+
+    /* Zero-filling is necessary for tilt-compensation filter. */
+    memset(lp_gn, 0, 33 * sizeof(int16_t));
+
+    /* Calculate A(z/FORMANT_PP_FACTOR_NUM) filter coefficients. */
+    for (i = 0; i < 10; i++)
+        lp_gn[i + 11] = (lp_filter_coeffs[i + 1] * formant_pp_factor_num_pow[i] + 0x4000) >> 15;
+
+    /* Calculate A(z/FORMANT_PP_FACTOR_DEN) filter coefficients. */
+    for (i = 0; i < 10; i++)
+        lp_gd[i + 1] = (lp_filter_coeffs[i + 1] * formant_pp_factor_den_pow[i] + 0x4000) >> 15;
+
+    /* residual signal calculation (one-half of short-term postfilter) */
+    memcpy(speech - 10, res_filter_data, 10 * sizeof(int16_t));
+    residual_filter(residual + RES_PREV_DATA_SIZE, lp_gn + 11, speech, subframe_size);
+    /* Save data to use it in the next subframe. */
+    memcpy(res_filter_data, speech + subframe_size - 10, 10 * sizeof(int16_t));
+
+    /* long-term filter. If long-term prediction gain is larger than 3dB (returned value is
+       nonzero) then declare current subframe as periodic. */
+    i = long_term_filter(adsp, pitch_delay_int,
+                                                residual, residual_filt_buf + 10,
+                                                subframe_size);
+    *voicing = FFMAX(*voicing, i);
+
+    /* shift residual for using in next subframe */
+    memmove(residual, residual + subframe_size, RES_PREV_DATA_SIZE * sizeof(int16_t));
+
+    /* short-term filter tilt compensation */
+    tilt_comp_coeff = get_tilt_comp(adsp, lp_gn, lp_gd, residual_filt_buf + 10, subframe_size);
+
+    /* Apply second half of short-term postfilter: 1/A(z/FORMANT_PP_FACTOR_DEN) */
+    ff_celp_lp_synthesis_filter(pos_filter_data + 10, lp_gd + 1,
+                                residual_filt_buf + 10,
+                                subframe_size, 10, 0, 0, 0x800);
+    memcpy(pos_filter_data, pos_filter_data + subframe_size, 10 * sizeof(int16_t));
+
+    *ht_prev_data = apply_tilt_comp(speech, pos_filter_data + 10, tilt_comp_coeff,
+                                    subframe_size, *ht_prev_data);
+}
+
+/**
+ * \brief Adaptive gain control (4.2.4)
+ * \param gain_before gain of speech before applying postfilters
+ * \param gain_after  gain of speech after applying postfilters
+ * \param speech [in/out] signal buffer
+ * \param subframe_size length of subframe
+ * \param gain_prev (3.12) previous value of gain coefficient
+ *
+ * \return (3.12) last value of gain coefficient
+ */
+int16_t ff_g729_adaptive_gain_control(int gain_before, int gain_after, int16_t *speech,
+                                   int subframe_size, int16_t gain_prev)
+{
+    int gain; // (3.12)
+    int n;
+    int exp_before, exp_after;
+
+    if(!gain_after && gain_before)
+        return 0;
+
+    if (gain_before) {
+
+        exp_before  = 14 - av_log2(gain_before);
+        gain_before = bidir_sal(gain_before, exp_before);
+
+        exp_after  = 14 - av_log2(gain_after);
+        gain_after = bidir_sal(gain_after, exp_after);
+
+        if (gain_before < gain_after) {
+            gain = (gain_before << 15) / gain_after;
+            gain = bidir_sal(gain, exp_after - exp_before - 1);
+        } else {
+            gain = ((gain_before - gain_after) << 14) / gain_after + 0x4000;
+            gain = bidir_sal(gain, exp_after - exp_before);
+        }
+        gain = (gain * G729_AGC_FAC1 + 0x4000) >> 15; // gain * (1-0.9875)
+    } else
+        gain = 0;
+
+    for (n = 0; n < subframe_size; n++) {
+        // gain_prev = gain + 0.9875 * gain_prev
+        gain_prev = (G729_AGC_FACTOR * gain_prev + 0x4000) >> 15;
+        gain_prev = av_clip_int16(gain + gain_prev);
+        speech[n] = av_clip_int16((speech[n] * gain_prev + 0x2000) >> 14);
+    }
+    return gain_prev;
+}
diff --git a/libavcodec/g729postfilter.h b/libavcodec/g729postfilter.h
new file mode 100644
index 0000000..5c2aaf2
--- /dev/null
+++ b/libavcodec/g729postfilter.h
@@ -0,0 +1,116 @@
+/*
+ * G.729, G729 Annex D postfilter
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_G729POSTFILTER_H
+#define AVCODEC_G729POSTFILTER_H
+
+#include <stdint.h>
+#include "audiodsp.h"
+
+/**
+ * tilt compensation factor (G.729, k1>0)
+ * 0.2 in Q15
+ */
+#define G729_TILT_FACTOR_PLUS       6554
+
+/**
+ * tilt compensation factor (G.729, k1<0)
+ * 0.9 in Q15
+ */
+#define G729_TILT_FACTOR_MINUS     29491
+
+/* 4.2.2 */
+#define FORMANT_PP_FACTOR_NUM  18022             //0.55 in Q15
+#define FORMANT_PP_FACTOR_DEN  22938             //0.70 in Q15
+
+/**
+ * gain adjustment factor (G.729, 4.2.4)
+ * 0.9875 in Q15
+ */
+#define G729_AGC_FACTOR            32358
+#define G729_AGC_FAC1 (32768-G729_AGC_FACTOR)
+
+/**
+ * 1.0 / (1.0 + 0.5) in Q15
+ * where 0.5 is the minimum value of
+ * weight factor, controlling amount of long-term postfiltering
+ */
+#define MIN_LT_FILT_FACTOR_A       21845
+
+/**
+ * Short interpolation filter length
+ */
+#define SHORT_INT_FILT_LEN         2
+
+/**
+ * Long interpolation filter length
+ */
+#define LONG_INT_FILT_LEN          8
+
+/**
+ * Number of analyzed fractional pitch delays in second stage of long-term
+ * postfilter
+ */
+#define ANALYZED_FRAC_DELAYS       7
+
+/**
+ * Amount of past residual signal data stored in buffer
+ */
+#define RES_PREV_DATA_SIZE (PITCH_DELAY_MAX + LONG_INT_FILT_LEN + 1)
+
+/**
+ * \brief Signal postfiltering (4.2)
+ * \param dsp initialized DSP context
+ * \param ht_prev_data [in/out] (Q12) pointer to variable receiving tilt
+ *                     compensation filter data from previous subframe
+ * \param voicing [in/out] (Q0) pointer to variable receiving voicing decision
+ * \param lp_filter_coeffs (Q12) LP filter coefficients
+ * \param pitch_delay_int integer part of the pitch delay
+ * \param residual [in/out] (Q0) residual signal buffer (used in long-term postfilter)
+ * \param res_filter_data [in/out] (Q0) speech data of previous subframe
+ * \param pos_filter_data [in/out] (Q0) previous speech data for short-term postfilter
+ * \param speech [in/out] (Q0) signal buffer
+ * \param subframe_size size of subframe
+ *
+ * Filtering has the following  stages:
+ *   Long-term postfilter (4.2.1)
+ *   Short-term postfilter (4.2.2).
+ *   Tilt-compensation (4.2.3)
+ */
+void ff_g729_postfilter(AudioDSPContext *adsp, int16_t* ht_prev_data, int* voicing,
+                     const int16_t *lp_filter_coeffs, int pitch_delay_int,
+                     int16_t* residual, int16_t* res_filter_data,
+                     int16_t* pos_filter_data, int16_t *speech,
+                     int subframe_size);
+
+/**
+ * \brief Adaptive gain control (4.2.4)
+ * \param gain_before (Q0) gain of speech before applying postfilters
+ * \param gain_after  (Q0) gain of speech after applying postfilters
+ * \param speech [in/out] (Q0) signal buffer
+ * \param subframe_size length of subframe
+ * \param gain_prev (Q12) previous value of gain coefficient
+ *
+ * \return (Q12) last value of gain coefficient
+ */
+int16_t ff_g729_adaptive_gain_control(int gain_before, int gain_after, int16_t *speech,
+                                   int subframe_size, int16_t gain_prev);
+
+#endif // AVCODEC_G729POSTFILTER_H
diff --git a/libavcodec/gdv.c b/libavcodec/gdv.c
new file mode 100644
index 0000000..183286b
--- /dev/null
+++ b/libavcodec/gdv.c
@@ -0,0 +1,569 @@
+/*
+ * Gremlin Digital Video (GDV) decoder
+ * Copyright (c) 2017 Konstantin Shishkov
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct GDVContext {
+    AVCodecContext *avctx;
+
+    GetByteContext gb;
+    GetByteContext g2;
+    PutByteContext pb;
+
+    uint32_t pal[256];
+    uint8_t *frame;
+    unsigned frame_size;
+    unsigned scale_h, scale_v;
+} GDVContext;
+
+typedef struct Bits8 {
+    uint8_t queue;
+    uint8_t fill;
+} Bits8;
+
+typedef struct Bits32 {
+    uint32_t queue;
+    uint8_t  fill;
+} Bits32;
+
+#define PREAMBLE_SIZE 4096
+
+static av_cold int gdv_decode_init(AVCodecContext *avctx)
+{
+    GDVContext *gdv = avctx->priv_data;
+    int i, j, k;
+
+    avctx->pix_fmt  = AV_PIX_FMT_PAL8;
+    gdv->frame_size = avctx->width * avctx->height + PREAMBLE_SIZE;
+    gdv->frame = av_calloc(gdv->frame_size, 1);
+    if (!gdv->frame)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 256; j++) {
+            for (k = 0; k < 8; k++) {
+                gdv->frame[i * 2048 + j * 8 + k] = j;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void scaleup(uint8_t *dst, const uint8_t *src, int w)
+{
+    int x;
+    for (x = 0; x < w - 7; x+=8) {
+        dst[x + 0] =
+        dst[x + 1] = src[(x>>1) + 0];
+        dst[x + 2] =
+        dst[x + 3] = src[(x>>1) + 1];
+        dst[x + 4] =
+        dst[x + 5] = src[(x>>1) + 2];
+        dst[x + 6] =
+        dst[x + 7] = src[(x>>1) + 3];
+    }
+    for (; x < w; x++) {
+        dst[x] = src[(x>>1)];
+    }
+}
+
+static void scaleup_rev(uint8_t *dst, const uint8_t *src, int w)
+{
+    int x;
+
+    for (x = w - 1; (x+1) & 7; x--) {
+        dst[x] = src[(x>>1)];
+    }
+    for (x -= 7; x >= 0; x -= 8) {
+        dst[x + 6] =
+        dst[x + 7] = src[(x>>1) + 3];
+        dst[x + 4] =
+        dst[x + 5] = src[(x>>1) + 2];
+        dst[x + 2] =
+        dst[x + 3] = src[(x>>1) + 1];
+        dst[x + 0] =
+        dst[x + 1] = src[(x>>1) + 0];
+    }
+}
+
+static void scaledown(uint8_t *dst, const uint8_t *src, int w)
+{
+    int x;
+    for (x = 0; x < w - 7; x+=8) {
+        dst[x + 0] = src[2*x + 0];
+        dst[x + 1] = src[2*x + 2];
+        dst[x + 2] = src[2*x + 4];
+        dst[x + 3] = src[2*x + 6];
+        dst[x + 4] = src[2*x + 8];
+        dst[x + 5] = src[2*x +10];
+        dst[x + 6] = src[2*x +12];
+        dst[x + 7] = src[2*x +14];
+    }
+    for (; x < w; x++) {
+        dst[x] = src[2*x];
+    }
+}
+
+static void rescale(GDVContext *gdv, uint8_t *dst, int w, int h, int scale_v, int scale_h)
+{
+    int j, y;
+
+    if ((gdv->scale_v == scale_v) && (gdv->scale_h == scale_h)) {
+        return;
+    }
+
+    if (gdv->scale_v) {
+        for (j = 0; j < h; j++) {
+            int y = h - j - 1;
+            uint8_t *dst1 = dst + PREAMBLE_SIZE + y * w;
+            uint8_t *src1 = dst + PREAMBLE_SIZE + (y>>!!gdv->scale_h) * (w>>1);
+
+            scaleup_rev(dst1, src1, w);
+        }
+    } else if (gdv->scale_h) {
+        for (j = 0; j < h; j++) {
+            int y = h - j - 1;
+            uint8_t *dst1 = dst + PREAMBLE_SIZE + y * w;
+            uint8_t *src1 = dst + PREAMBLE_SIZE + (y>>1) * w;
+            memcpy(dst1, src1, w);
+        }
+    }
+
+    if (scale_h && scale_v) {
+        for (y = 0; y < (h>>1); y++) {
+            uint8_t *dst1 = dst + PREAMBLE_SIZE + y * (w>>1);
+            uint8_t *src1 = dst + PREAMBLE_SIZE + y*2 * w;
+            scaledown(dst1, src1, w>>1);
+        }
+    } else if (scale_h) {
+        for (y = 0; y < (h>>1); y++) {
+            uint8_t *dst1 = dst + PREAMBLE_SIZE + y * w;
+            uint8_t *src1 = dst + PREAMBLE_SIZE + y*2 * w;
+            memcpy(dst1, src1, w);
+        }
+    } else if (scale_v) {
+        for (y = 0; y < h; y++) {
+            uint8_t *dst1 = dst + PREAMBLE_SIZE + y * w;
+            scaledown(dst1, dst1, w>>1);
+        }
+    }
+
+    gdv->scale_v = scale_v;
+    gdv->scale_h = scale_h;
+}
+
+static int read_bits2(Bits8 *bits, GetByteContext *gb)
+{
+    int res;
+
+    if (bits->fill == 0) {
+        bits->queue |= bytestream2_get_byte(gb);
+        bits->fill   = 8;
+    }
+    res = bits->queue >> 6;
+    bits->queue <<= 2;
+    bits->fill   -= 2;
+
+    return res;
+}
+
+static void fill_bits32(Bits32 *bits, GetByteContext *gb)
+{
+    bits->queue = bytestream2_get_le32(gb);
+    bits->fill  = 32;
+}
+
+static int read_bits32(Bits32 *bits, GetByteContext *gb, int nbits)
+{
+    int res = bits->queue & ((1 << nbits) - 1);
+
+    bits->queue >>= nbits;
+    bits->fill   -= nbits;
+    if (bits->fill <= 16) {
+        bits->queue |= bytestream2_get_le16(gb) << bits->fill;
+        bits->fill  += 16;
+    }
+
+    return res;
+}
+
+static void lz_copy(PutByteContext *pb, GetByteContext *g2, int offset, unsigned len)
+{
+    int i;
+
+    if (offset == -1) {
+        int c;
+
+        bytestream2_seek(g2, bytestream2_tell_p(pb) - 1, SEEK_SET);
+        c = bytestream2_get_byte(g2);
+        for (i = 0; i < len; i++) {
+            bytestream2_put_byte(pb, c);
+        }
+    } else if (offset < 0) {
+        int start = bytestream2_tell_p(pb) - (-offset);
+
+        bytestream2_seek(g2, start, SEEK_SET);
+        for (i = 0; i < len; i++) {
+            bytestream2_put_byte(pb, bytestream2_get_byte(g2));
+        }
+    } else {
+        int start = bytestream2_tell_p(pb) + offset;
+
+        bytestream2_seek(g2, start, SEEK_SET);
+        for (i = 0; i < len; i++) {
+            bytestream2_put_byte(pb, bytestream2_get_byte(g2));
+        }
+    }
+}
+
+static int decompress_2(AVCodecContext *avctx)
+{
+    GDVContext *gdv = avctx->priv_data;
+    GetByteContext *gb = &gdv->gb;
+    GetByteContext *g2 = &gdv->g2;
+    PutByteContext *pb = &gdv->pb;
+    Bits8 bits = { 0 };
+    int c, i;
+
+    bytestream2_init(g2, gdv->frame, gdv->frame_size);
+    bytestream2_skip_p(pb, PREAMBLE_SIZE);
+
+    for (c = 0; c < 256; c++) {
+        for (i = 0; i < 16; i++) {
+            gdv->frame[c * 16 + i] = c;
+        }
+    }
+
+    while (bytestream2_get_bytes_left_p(pb) > 0 && bytestream2_get_bytes_left(gb) > 0) {
+        int tag = read_bits2(&bits, gb);
+        if (tag == 0) {
+            bytestream2_put_byte(pb, bytestream2_get_byte(gb));
+        } else if (tag == 1) {
+            int b = bytestream2_get_byte(gb);
+            int len = (b & 0xF) + 3;
+            int top = (b >> 4) & 0xF;
+            int off = (bytestream2_get_byte(gb) << 4) + top - 4096;
+            lz_copy(pb, g2, off, len);
+        } else if (tag == 2) {
+            int len = (bytestream2_get_byte(gb)) + 2;
+            bytestream2_skip_p(pb, len);
+        } else {
+            break;
+        }
+    }
+
+    if (bytestream2_get_bytes_left_p(pb) > 0)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+static int decompress_5(AVCodecContext *avctx, unsigned skip)
+{
+    GDVContext *gdv = avctx->priv_data;
+    GetByteContext *gb = &gdv->gb;
+    GetByteContext *g2 = &gdv->g2;
+    PutByteContext *pb = &gdv->pb;
+    Bits8 bits = { 0 };
+
+    bytestream2_init(g2, gdv->frame, gdv->frame_size);
+    bytestream2_skip_p(pb, skip + PREAMBLE_SIZE);
+
+    while (bytestream2_get_bytes_left_p(pb) > 0 && bytestream2_get_bytes_left(gb) > 0) {
+        int tag = read_bits2(&bits, gb);
+        if (bytestream2_get_bytes_left(gb) < 1)
+            return AVERROR_INVALIDDATA;
+        if (tag == 0) {
+            bytestream2_put_byte(pb, bytestream2_get_byte(gb));
+        } else if (tag == 1) {
+            int b = bytestream2_get_byte(gb);
+            int len = (b & 0xF) + 3;
+            int top = b >> 4;
+            int off = (bytestream2_get_byte(gb) << 4) + top - 4096;
+            lz_copy(pb, g2, off, len);
+        } else if (tag == 2) {
+            int len;
+            int b = bytestream2_get_byte(gb);
+            if (b == 0) {
+                break;
+            }
+            if (b != 0xFF) {
+                len = b;
+            } else {
+                len = bytestream2_get_le16(gb);
+            }
+            bytestream2_skip_p(pb, len + 1);
+        } else {
+            int b = bytestream2_get_byte(gb);
+            int len = (b & 0x3) + 2;
+            int off = -(b >> 2) - 1;
+            lz_copy(pb, g2, off, len);
+        }
+    }
+    return 0;
+}
+
+static int decompress_68(AVCodecContext *avctx, unsigned skip, unsigned use8)
+{
+    GDVContext *gdv = avctx->priv_data;
+    GetByteContext *gb = &gdv->gb;
+    GetByteContext *g2 = &gdv->g2;
+    PutByteContext *pb = &gdv->pb;
+    Bits32 bits;
+
+    bytestream2_init(g2, gdv->frame, gdv->frame_size);
+    bytestream2_skip_p(pb, skip + PREAMBLE_SIZE);
+    fill_bits32(&bits, gb);
+
+    while (bytestream2_get_bytes_left_p(pb) > 0 && bytestream2_get_bytes_left(gb) > 0) {
+        int tag = read_bits32(&bits, gb, 2);
+        if (tag == 0) {
+            int b = read_bits32(&bits, gb, 1);
+            if (b == 0) {
+                bytestream2_put_byte(pb, bytestream2_get_byte(gb));
+            } else {
+                int i, len = 2;
+                int lbits = 0;
+                while (1) {
+                    int val;
+
+                    lbits += 1;
+                    val = read_bits32(&bits, gb, lbits);
+                    len += val;
+                    if (val != ((1 << lbits) - 1)) {
+                        break;
+                    }
+                    assert(lbits < 16);
+                }
+                for (i = 0; i < len; i++) {
+                    bytestream2_put_byte(pb, bytestream2_get_byte(gb));
+                }
+            }
+        } else if (tag == 1) {
+            int b = read_bits32(&bits, gb, 1);
+            int len;
+
+            if (b == 0) {
+                len = (read_bits32(&bits, gb, 4)) + 2;
+            } else {
+                int bb = bytestream2_get_byte(gb);
+                if ((bb & 0x80) == 0) {
+                    len = bb + 18;
+                } else {
+                    int top = (bb & 0x7F) << 8;
+                    len = top + bytestream2_get_byte(gb) + 146;
+                }
+            }
+            bytestream2_skip_p(pb, len);
+        } else if (tag == 2) {
+            int i, subtag = read_bits32(&bits, gb, 2);
+
+            if (subtag != 3) {
+                int top = (read_bits32(&bits, gb, 4)) << 8;
+                int offs = top + bytestream2_get_byte(gb);
+                if ((subtag != 0) || (offs <= 0xF80)) {
+                    int len = (subtag) + 3;
+                    lz_copy(pb, g2, (offs) - 4096, len);
+                } else {
+                    int real_off, len, c1, c2;
+
+                    if (offs == 0xFFF) {
+                        return 0;
+                    }
+
+                    real_off = ((offs >> 4) & 0x7) + 1;
+                    len = ((offs & 0xF) + 2) * 2;
+                    c1 = gdv->frame[bytestream2_tell_p(pb) - real_off];
+                    c2 = gdv->frame[bytestream2_tell_p(pb) - real_off + 1];
+                    for (i = 0; i < len/2; i++) {
+                        bytestream2_put_byte(pb, c1);
+                        bytestream2_put_byte(pb, c2);
+                    }
+                }
+            } else {
+                int b = bytestream2_get_byte(gb);
+                int off = ((b & 0x7F)) + 1;
+                int len = ((b & 0x80) == 0) ? 2 : 3;
+
+                lz_copy(pb, g2, -off, len);
+            }
+        } else {
+            int len;
+            int off;
+            if (use8) {
+                int q, b = bytestream2_get_byte(gb);
+                if ((b & 0xC0) == 0xC0) {
+                    len = ((b & 0x3F)) + 8;
+                    q = read_bits32(&bits, gb, 4);
+                    off = (q << 8) + (bytestream2_get_byte(gb)) + 1;
+                } else {
+                    int ofs1;
+                    if ((b & 0x80) == 0) {
+                        len = ((b >> 4)) + 6;
+                        ofs1 = (b & 0xF);
+                    } else {
+                        len = ((b & 0x3F)) + 14;
+                        ofs1 = read_bits32(&bits, gb, 4);
+                    }
+                    off = (ofs1 << 8) + (bytestream2_get_byte(gb)) - 4096;
+                }
+            } else {
+                int ofs1, b = bytestream2_get_byte(gb);
+
+                if ((b >> 4) == 0xF) {
+                    len = bytestream2_get_byte(gb) + 21;
+                } else {
+                    len = (b >> 4) + 6;
+                }
+                ofs1 = (b & 0xF);
+                off = (ofs1 << 8) + bytestream2_get_byte(gb) - 4096;
+            }
+            lz_copy(pb, g2, off, len);
+        }
+    }
+
+    return 0;
+}
+
+static int gdv_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame, AVPacket *avpkt)
+{
+    GDVContext *gdv = avctx->priv_data;
+    GetByteContext *gb = &gdv->gb;
+    PutByteContext *pb = &gdv->pb;
+    AVFrame *frame = data;
+    int ret, i, pal_size;
+    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &pal_size);
+    int compression;
+    unsigned flags;
+    uint8_t *dst;
+
+    bytestream2_init(gb, avpkt->data, avpkt->size);
+    bytestream2_init_writer(pb, gdv->frame, gdv->frame_size);
+
+    flags = bytestream2_get_le32(gb);
+    compression = flags & 0xF;
+
+    if (compression == 4 || compression == 7 || compression > 8)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    if (pal && pal_size == AVPALETTE_SIZE)
+        memcpy(gdv->pal, pal, AVPALETTE_SIZE);
+
+    rescale(gdv, gdv->frame, avctx->width, avctx->height,
+            !!(flags & 0x10), !!(flags & 0x20));
+
+    switch (compression) {
+    case 1:
+        memset(gdv->frame + PREAMBLE_SIZE, 0, gdv->frame_size - PREAMBLE_SIZE);
+    case 0:
+        if (bytestream2_get_bytes_left(gb) < 256*3)
+            return AVERROR_INVALIDDATA;
+        for (i = 0; i < 256; i++) {
+            unsigned r = bytestream2_get_byte(gb);
+            unsigned g = bytestream2_get_byte(gb);
+            unsigned b = bytestream2_get_byte(gb);
+            gdv->pal[i] = 0xFFU << 24 | r << 18 | g << 10 | b << 2;
+        }
+        break;
+    case 2:
+        ret = decompress_2(avctx);
+        break;
+    case 3:
+        break;
+    case 5:
+        ret = decompress_5(avctx, flags >> 8);
+        break;
+    case 6:
+        ret = decompress_68(avctx, flags >> 8, 0);
+        break;
+    case 8:
+        ret = decompress_68(avctx, flags >> 8, 1);
+        break;
+    default:
+        av_assert0(0);
+    }
+    if (ret < 0)
+        return ret;
+
+    memcpy(frame->data[1], gdv->pal, AVPALETTE_SIZE);
+    dst = frame->data[0];
+
+    if (!gdv->scale_v && !gdv->scale_h) {
+        int sidx = PREAMBLE_SIZE, didx = 0;
+        int y;
+
+        for (y = 0; y < avctx->height; y++) {
+            memcpy(dst + didx, gdv->frame + sidx, avctx->width);
+            sidx += avctx->width;
+            didx += frame->linesize[0];
+        }
+    } else {
+        int sidx = PREAMBLE_SIZE, didx = 0;
+        int y;
+
+        for (y = 0; y < avctx->height; y++) {
+            if (!gdv->scale_v) {
+                memcpy(dst + didx, gdv->frame + sidx, avctx->width);
+            } else {
+                uint8_t *dst2 = dst + didx;
+                uint8_t *src2 = gdv->frame + sidx;
+
+                scaleup(dst2, src2, avctx->width);
+            }
+            if (!gdv->scale_h || ((y & 1) == 1)) {
+                sidx += !gdv->scale_v ? avctx->width : avctx->width/2;
+            }
+            didx += frame->linesize[0];
+        }
+    }
+
+    *got_frame = 1;
+
+    return ret < 0 ? ret : avpkt->size;
+}
+
+static av_cold int gdv_decode_close(AVCodecContext *avctx)
+{
+    GDVContext *gdv = avctx->priv_data;
+    av_freep(&gdv->frame);
+    return 0;
+}
+
+AVCodec ff_gdv_decoder = {
+    .name           = "gdv",
+    .long_name      = NULL_IF_CONFIG_SMALL("Gremlin Digital Video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_GDV,
+    .priv_data_size = sizeof(GDVContext),
+    .init           = gdv_decode_init,
+    .close          = gdv_decode_close,
+    .decode         = gdv_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
index b225c13..c2f2671 100644
--- a/libavcodec/get_bits.h
+++ b/libavcodec/get_bits.h
@@ -1,20 +1,21 @@
 /*
- * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2016 Alexandra Hájková
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +32,8 @@
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
+#include "libavutil/avassert.h"
+#include "avcodec.h"
 #include "mathops.h"
 #include "vlc.h"
 
@@ -51,15 +54,25 @@
 #define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
 #endif
 
+#ifndef CACHED_BITSTREAM_READER
+#define CACHED_BITSTREAM_READER 0
+#endif
+
 typedef struct GetBitContext {
     const uint8_t *buffer, *buffer_end;
+#if CACHED_BITSTREAM_READER
+    uint64_t cache;
+    unsigned bits_left;
+#endif
     int index;
     int size_in_bits;
-#if !UNCHECKED_BITSTREAM_READER
     int size_in_bits_plus8;
-#endif
 } GetBitContext;
 
+static inline unsigned int get_bits(GetBitContext *s, int n);
+static inline void skip_bits(GetBitContext *s, int n);
+static inline unsigned int show_bits(GetBitContext *s, int n);
+
 /* Bitstream reader API docs:
  * name
  *   arbitrary name which is used as prefix for the internal variables
@@ -101,18 +114,25 @@ typedef struct GetBitContext {
  * LAST_SKIP_BITS(name, gb, num)
  *   Like SKIP_BITS, to be used if next call is UPDATE_CACHE or CLOSE_READER.
  *
+ * BITS_LEFT(name, gb)
+ *   Return the number of bits left
+ *
  * For examples see get_bits, show_bits, skip_bits, get_vlc.
  */
 
-#ifdef LONG_BITSTREAM_READER
+#if CACHED_BITSTREAM_READER
+#   define MIN_CACHE_BITS 64
+#elif defined LONG_BITSTREAM_READER
 #   define MIN_CACHE_BITS 32
 #else
 #   define MIN_CACHE_BITS 25
 #endif
 
+#if !CACHED_BITSTREAM_READER
+
 #define OPEN_READER_NOSIZE(name, gb)            \
     unsigned int name ## _index = (gb)->index;  \
-    unsigned int av_unused name ## _cache = 0
+    unsigned int av_unused name ## _cache
 
 #if UNCHECKED_BITSTREAM_READER
 #define OPEN_READER(name, gb) OPEN_READER_NOSIZE(name, gb)
@@ -128,27 +148,34 @@ typedef struct GetBitContext {
 
 #define CLOSE_READER(name, gb) (gb)->index = name ## _index
 
+# ifdef LONG_BITSTREAM_READER
+
+# define UPDATE_CACHE_LE(name, gb) name ## _cache = \
+      AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
+
+# define UPDATE_CACHE_BE(name, gb) name ## _cache = \
+      AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7))
+
+#else
+
+# define UPDATE_CACHE_LE(name, gb) name ## _cache = \
+      AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
+
+# define UPDATE_CACHE_BE(name, gb) name ## _cache = \
+      AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7)
+
+#endif
+
+
 #ifdef BITSTREAM_READER_LE
 
-# ifdef LONG_BITSTREAM_READER
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
-# else
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
-# endif
+# define UPDATE_CACHE(name, gb) UPDATE_CACHE_LE(name, gb)
 
 # define SKIP_CACHE(name, gb, num) name ## _cache >>= (num)
 
 #else
 
-# ifdef LONG_BITSTREAM_READER
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7))
-# else
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7)
-# endif
+# define UPDATE_CACHE(name, gb) UPDATE_CACHE_BE(name, gb)
 
 # define SKIP_CACHE(name, gb, num) name ## _cache <<= (num)
 
@@ -161,6 +188,8 @@ typedef struct GetBitContext {
     name ## _index = FFMIN(name ## _size_plus8, name ## _index + (num))
 #endif
 
+#define BITS_LEFT(name, gb) ((int)((gb)->size_in_bits - name ## _index))
+
 #define SKIP_BITS(name, gb, num)                \
     do {                                        \
         SKIP_CACHE(name, gb, num);              \
@@ -169,56 +198,180 @@ typedef struct GetBitContext {
 
 #define LAST_SKIP_BITS(name, gb, num) SKIP_COUNTER(name, gb, num)
 
+#define SHOW_UBITS_LE(name, gb, num) zero_extend(name ## _cache, num)
+#define SHOW_SBITS_LE(name, gb, num) sign_extend(name ## _cache, num)
+
+#define SHOW_UBITS_BE(name, gb, num) NEG_USR32(name ## _cache, num)
+#define SHOW_SBITS_BE(name, gb, num) NEG_SSR32(name ## _cache, num)
+
 #ifdef BITSTREAM_READER_LE
-#   define SHOW_UBITS(name, gb, num) zero_extend(name ## _cache, num)
-#   define SHOW_SBITS(name, gb, num) sign_extend(name ## _cache, num)
+#   define SHOW_UBITS(name, gb, num) SHOW_UBITS_LE(name, gb, num)
+#   define SHOW_SBITS(name, gb, num) SHOW_SBITS_LE(name, gb, num)
 #else
-#   define SHOW_UBITS(name, gb, num) NEG_USR32(name ## _cache, num)
-#   define SHOW_SBITS(name, gb, num) NEG_SSR32(name ## _cache, num)
+#   define SHOW_UBITS(name, gb, num) SHOW_UBITS_BE(name, gb, num)
+#   define SHOW_SBITS(name, gb, num) SHOW_SBITS_BE(name, gb, num)
 #endif
 
 #define GET_CACHE(name, gb) ((uint32_t) name ## _cache)
 
+#endif
+
 static inline int get_bits_count(const GetBitContext *s)
 {
+#if CACHED_BITSTREAM_READER
+    return s->index - s->bits_left;
+#else
     return s->index;
+#endif
 }
 
+#if CACHED_BITSTREAM_READER
+static inline void refill_32(GetBitContext *s)
+{
+#if !UNCHECKED_BITSTREAM_READER
+    if (s->index >> 3 >= s->buffer_end - s->buffer)
+        return;
+#endif
+
+#ifdef BITSTREAM_READER_LE
+    s->cache       = (uint64_t)AV_RL32(s->buffer + (s->index >> 3)) << s->bits_left | s->cache;
+#else
+    s->cache       = s->cache | (uint64_t)AV_RB32(s->buffer + (s->index >> 3)) << (32 - s->bits_left);
+#endif
+    s->index     += 32;
+    s->bits_left += 32;
+}
+
+static inline void refill_64(GetBitContext *s)
+{
+#if !UNCHECKED_BITSTREAM_READER
+    if (s->index >> 3 >= s->buffer_end - s->buffer)
+        return;
+#endif
+
+#ifdef BITSTREAM_READER_LE
+    s->cache = AV_RL64(s->buffer + (s->index >> 3));
+#else
+    s->cache = AV_RB64(s->buffer + (s->index >> 3));
+#endif
+    s->index += 64;
+    s->bits_left = 64;
+}
+
+static inline uint64_t get_val(GetBitContext *s, unsigned n, int is_le)
+{
+    uint64_t ret;
+    av_assert2(n>0 && n<=63);
+    if (is_le) {
+        ret = s->cache & ((UINT64_C(1) << n) - 1);
+        s->cache >>= n;
+    } else {
+        ret = s->cache >> (64 - n);
+        s->cache <<= n;
+    }
+    s->bits_left -= n;
+    return ret;
+}
+
+static inline unsigned show_val(const GetBitContext *s, unsigned n)
+{
+#ifdef BITSTREAM_READER_LE
+    return s->cache & ((UINT64_C(1) << n) - 1);
+#else
+    return s->cache >> (64 - n);
+#endif
+}
+#endif
+
+/**
+ * Skips the specified number of bits.
+ * @param n the number of bits to skip,
+ *          For the UNCHECKED_BITSTREAM_READER this must not cause the distance
+ *          from the start to overflow int32_t. Staying within the bitstream + padding
+ *          is sufficient, too.
+ */
 static inline void skip_bits_long(GetBitContext *s, int n)
 {
+#if CACHED_BITSTREAM_READER
+    skip_bits(s, n);
+#else
 #if UNCHECKED_BITSTREAM_READER
     s->index += n;
 #else
     s->index += av_clip(n, -s->index, s->size_in_bits_plus8 - s->index);
 #endif
+#endif
 }
 
+#if CACHED_BITSTREAM_READER
+static inline void skip_remaining(GetBitContext *s, unsigned n)
+{
+#ifdef BITSTREAM_READER_LE
+    s->cache >>= n;
+#else
+    s->cache <<= n;
+#endif
+    s->bits_left -= n;
+}
+#endif
+
 /**
- * Read MPEG-1 dc-style VLC (sign bit + mantisse with no MSB).
+ * Read MPEG-1 dc-style VLC (sign bit + mantissa with no MSB).
  * if MSB not set it is negative
  * @param n length in bits
  */
 static inline int get_xbits(GetBitContext *s, int n)
 {
+#if CACHED_BITSTREAM_READER
+    int32_t cache = show_bits(s, 32);
+    int sign = ~cache >> 31;
+    skip_remaining(s, n);
+
+    return ((((uint32_t)(sign ^ cache)) >> (32 - n)) ^ sign) - sign;
+#else
     register int sign;
     register int32_t cache;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     cache = GET_CACHE(re, s);
     sign  = ~cache >> 31;
     LAST_SKIP_BITS(re, s, n);
     CLOSE_READER(re, s);
     return (NEG_USR32(sign ^ cache, n) ^ sign) - sign;
+#endif
+}
+
+#if !CACHED_BITSTREAM_READER
+static inline int get_xbits_le(GetBitContext *s, int n)
+{
+    register int sign;
+    register int32_t cache;
+    OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
+    UPDATE_CACHE_LE(re, s);
+    cache = GET_CACHE(re, s);
+    sign  = sign_extend(~cache, n) >> 31;
+    LAST_SKIP_BITS(re, s, n);
+    CLOSE_READER(re, s);
+    return (zero_extend(sign ^ cache, n) ^ sign) - sign;
 }
+#endif
 
 static inline int get_sbits(GetBitContext *s, int n)
 {
     register int tmp;
+#if CACHED_BITSTREAM_READER
+    av_assert2(n>0 && n<=25);
+    tmp = sign_extend(get_bits(s, n), n);
+#else
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_SBITS(re, s, n);
     LAST_SKIP_BITS(re, s, n);
     CLOSE_READER(re, s);
+#endif
     return tmp;
 }
 
@@ -227,12 +380,30 @@ static inline int get_sbits(GetBitContext *s, int n)
  */
 static inline unsigned int get_bits(GetBitContext *s, int n)
 {
-    register int tmp;
+    register unsigned int tmp;
+#if CACHED_BITSTREAM_READER
+
+    av_assert2(n>0 && n<=32);
+    if (n > s->bits_left) {
+        refill_32(s);
+        if (s->bits_left < 32)
+            s->bits_left = n;
+    }
+
+#ifdef BITSTREAM_READER_LE
+    tmp = get_val(s, n, 1);
+#else
+    tmp = get_val(s, n, 0);
+#endif
+#else
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_UBITS(re, s, n);
     LAST_SKIP_BITS(re, s, n);
     CLOSE_READER(re, s);
+#endif
+    av_assert2(tmp < UINT64_C(1) << n);
     return tmp;
 }
 
@@ -244,28 +415,88 @@ static av_always_inline int get_bitsz(GetBitContext *s, int n)
     return n ? get_bits(s, n) : 0;
 }
 
+static inline unsigned int get_bits_le(GetBitContext *s, int n)
+{
+#if CACHED_BITSTREAM_READER
+    av_assert2(n>0 && n<=32);
+    if (n > s->bits_left) {
+        refill_32(s);
+        if (s->bits_left < 32)
+            s->bits_left = n;
+    }
+
+    return get_val(s, n, 1);
+#else
+    register int tmp;
+    OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
+    UPDATE_CACHE_LE(re, s);
+    tmp = SHOW_UBITS_LE(re, s, n);
+    LAST_SKIP_BITS(re, s, n);
+    CLOSE_READER(re, s);
+    return tmp;
+#endif
+}
+
 /**
  * Show 1-25 bits.
  */
 static inline unsigned int show_bits(GetBitContext *s, int n)
 {
-    register int tmp;
+    register unsigned int tmp;
+#if CACHED_BITSTREAM_READER
+    if (n > s->bits_left)
+        refill_32(s);
+
+    tmp = show_val(s, n);
+#else
     OPEN_READER_NOSIZE(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_UBITS(re, s, n);
+#endif
     return tmp;
 }
 
 static inline void skip_bits(GetBitContext *s, int n)
 {
+#if CACHED_BITSTREAM_READER
+    if (n < s->bits_left)
+        skip_remaining(s, n);
+    else {
+        n -= s->bits_left;
+        s->cache = 0;
+        s->bits_left = 0;
+
+        if (n >= 64) {
+            unsigned skip = (n / 8) * 8;
+
+            n -= skip;
+            s->index += skip;
+        }
+        refill_64(s);
+        if (n)
+            skip_remaining(s, n);
+    }
+#else
     OPEN_READER(re, s);
-    UPDATE_CACHE(re, s);
     LAST_SKIP_BITS(re, s, n);
     CLOSE_READER(re, s);
+#endif
 }
 
 static inline unsigned int get_bits1(GetBitContext *s)
 {
+#if CACHED_BITSTREAM_READER
+    if (!s->bits_left)
+        refill_64(s);
+
+#ifdef BITSTREAM_READER_LE
+    return get_val(s, 1, 1);
+#else
+    return get_val(s, 1, 0);
+#endif
+#else
     unsigned int index = s->index;
     uint8_t result     = s->buffer[index >> 3];
 #ifdef BITSTREAM_READER_LE
@@ -282,6 +513,7 @@ static inline unsigned int get_bits1(GetBitContext *s)
     s->index = index;
 
     return result;
+#endif
 }
 
 static inline unsigned int show_bits1(GetBitContext *s)
@@ -299,20 +531,28 @@ static inline void skip_bits1(GetBitContext *s)
  */
 static inline unsigned int get_bits_long(GetBitContext *s, int n)
 {
-    if (n <= MIN_CACHE_BITS) {
+    av_assert2(n>=0 && n<=32);
+    if (!n) {
+        return 0;
+#if CACHED_BITSTREAM_READER
+    }
+    return get_bits(s, n);
+#else
+    } else if (n <= MIN_CACHE_BITS) {
         return get_bits(s, n);
     } else {
 #ifdef BITSTREAM_READER_LE
-        int ret = get_bits(s, 16);
+        unsigned ret = get_bits(s, 16);
         return ret | (get_bits(s, n - 16) << 16);
 #else
-        int ret = get_bits(s, 16) << (n - 16);
+        unsigned ret = get_bits(s, 16) << (n - 16);
         return ret | get_bits(s, n - 16);
 #endif
     }
+#endif
 }
 
-/*
+/**
  * Read 0-64 bits.
  */
 static inline uint64_t get_bits64(GetBitContext *s, int n)
@@ -335,6 +575,10 @@ static inline uint64_t get_bits64(GetBitContext *s, int n)
  */
 static inline int get_sbits_long(GetBitContext *s, int n)
 {
+    // sign_extend(x, 0) is undefined
+    if (!n)
+        return 0;
+
     return sign_extend(get_bits_long(s, n), n);
 }
 
@@ -351,6 +595,16 @@ static inline unsigned int show_bits_long(GetBitContext *s, int n)
     }
 }
 
+static inline int check_marker(void *logctx, GetBitContext *s, const char *msg)
+{
+    int bit = get_bits1(s);
+    if (!bit)
+        av_log(logctx, AV_LOG_INFO, "Marker bit missing at %d of %d %s\n",
+               get_bits_count(s) - 1, s->size_in_bits, msg);
+
+    return bit;
+}
+
 /**
  * Initialize GetBitContext.
  * @param buffer bitstream buffer, must be AV_INPUT_BUFFER_PADDING_SIZE bytes
@@ -365,7 +619,7 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
     int buffer_size;
     int ret = 0;
 
-    if (bit_size > INT_MAX - 7 || bit_size < 0 || !buffer) {
+    if (bit_size >= INT_MAX - FFMAX(7, AV_INPUT_BUFFER_PADDING_SIZE*8) || bit_size < 0 || !buffer) {
         bit_size    = 0;
         buffer      = NULL;
         ret         = AVERROR_INVALIDDATA;
@@ -375,12 +629,14 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
 
     s->buffer             = buffer;
     s->size_in_bits       = bit_size;
-#if !UNCHECKED_BITSTREAM_READER
     s->size_in_bits_plus8 = bit_size + 8;
-#endif
     s->buffer_end         = buffer + buffer_size;
     s->index              = 0;
 
+#if CACHED_BITSTREAM_READER
+    refill_64(s);
+#endif
+
     return ret;
 }
 
@@ -395,8 +651,8 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
 static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer,
                                  int byte_size)
 {
-    if (byte_size > INT_MAX / 8)
-        return AVERROR_INVALIDDATA;
+    if (byte_size > INT_MAX / 8 || byte_size < 0)
+        byte_size = -1;
     return init_get_bits(s, buffer, byte_size * 8);
 }
 
@@ -445,7 +701,7 @@ static inline const uint8_t *align_get_bits(GetBitContext *s)
         SKIP_BITS(name, gb, n);                                 \
     } while (0)
 
-#define GET_RL_VLC(level, run, name, gb, table, bits,           \
+#define GET_RL_VLC(level, run, name, gb, table, bits,  \
                    max_depth, need_update)                      \
     do {                                                        \
         int n, nb_bits;                                         \
@@ -482,6 +738,19 @@ static inline const uint8_t *align_get_bits(GetBitContext *s)
         SKIP_BITS(name, gb, n);                                 \
     } while (0)
 
+/* Return the LUT element for the given bitstream configuration. */
+static inline int set_idx(GetBitContext *s, int code, int *n, int *nb_bits,
+                          VLC_TYPE (*table)[2])
+{
+    unsigned idx;
+
+    *nb_bits = -*n;
+    idx = show_bits(s, *nb_bits) + code;
+    *n = table[idx][1];
+
+    return table[idx][0];
+}
+
 /**
  * Parse a vlc code.
  * @param bits is the number of bits which will be read at once, must be
@@ -489,10 +758,29 @@ static inline const uint8_t *align_get_bits(GetBitContext *s)
  * @param max_depth is the number of times bits bits must be read to completely
  *                  read the longest vlc code
  *                  = (max_vlc_length + bits - 1) / bits
+ * @returns the code parsed or -1 if no vlc matches
  */
 static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE (*table)[2],
                                      int bits, int max_depth)
 {
+#if CACHED_BITSTREAM_READER
+    int nb_bits;
+    unsigned idx = show_bits(s, bits);
+    int code = table[idx][0];
+    int n    = table[idx][1];
+
+    if (max_depth > 1 && n < 0) {
+        skip_remaining(s, bits);
+        code = set_idx(s, code, &n, &nb_bits, table);
+        if (max_depth > 2 && n < 0) {
+            skip_remaining(s, nb_bits);
+            code = set_idx(s, code, &n, &nb_bits, table);
+        }
+    }
+    skip_remaining(s, n);
+
+    return code;
+#else
     int code;
 
     OPEN_READER(re, s);
@@ -503,6 +791,7 @@ static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE (*table)[2],
     CLOSE_READER(re, s);
 
     return code;
+#endif
 }
 
 static inline int decode012(GetBitContext *gb)
@@ -528,4 +817,18 @@ static inline int get_bits_left(GetBitContext *gb)
     return gb->size_in_bits - get_bits_count(gb);
 }
 
+static inline int skip_1stop_8data_bits(GetBitContext *gb)
+{
+    if (get_bits_left(gb) <= 0)
+        return AVERROR_INVALIDDATA;
+
+    while (get_bits1(gb)) {
+        skip_bits(gb, 8);
+        if (get_bits_left(gb) <= 0)
+            return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
 #endif /* AVCODEC_GET_BITS_H */
diff --git a/libavcodec/gif.c b/libavcodec/gif.c
index 451e335..9f2f30d 100644
--- a/libavcodec/gif.c
+++ b/libavcodec/gif.c
@@ -1,113 +1,370 @@
 /*
- * GIF encoder.
  * Copyright (c) 2000 Fabrice Bellard
  * Copyright (c) 2002 Francois Revol
  * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2018 Bjorn Roche
+ * Copyright (c) 2018 Paul B Mahol
  *
  * first version by Francois Revol <revol@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/*
- * Features and limitations:
- * - currently no compression is performed,
- *   in fact the size of the data is 9/8 the size of the image in 8bpp
- * - uses only a global standard palette
- * - tested with IE 5.0, Opera for BeOS, NetPositive (BeOS), and Mozilla (BeOS).
- *
- * Reference documents:
- * http://www.goice.co.jp/member/mo/formats/gif.html
- * http://astronomy.swin.edu.au/pbourke/dataformats/gif/
- * http://www.dcs.ed.ac.uk/home/mxr/gfx/2d/GIF89a.txt
- *
- * this url claims to have an LZW algorithm not covered by Unisys patent:
- * http://www.msg.net/utility/whirlgif/gifencod.html
- * could help reduce the size of the files _a lot_...
- * some sites mentions an RLE type compression also.
+/**
+ * @file
+ * GIF encoder
+ * @see http://www.w3.org/Graphics/GIF/spec-gif89a.txt
  */
 
+#define BITSTREAM_WRITER_LE
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lzw.h"
-
-/* The GIF format uses reversed order for bitstreams... */
-/* at least they don't use PDP_ENDIAN :) */
-#define BITSTREAM_WRITER_LE
+#include "gif.h"
 
 #include "put_bits.h"
 
+#define DEFAULT_TRANSPARENCY_INDEX 0x1f
+
 typedef struct GIFContext {
+    const AVClass *class;
     LZWState *lzw;
     uint8_t *buf;
+    int buf_size;
+    AVFrame *last_frame;
+    int flags;
+    int image;
+    uint32_t palette[AVPALETTE_COUNT];  ///< local reference palette for !pal8
+    int palette_loaded;
+    int transparent_index;
+    uint8_t *tmpl;                      ///< temporary line buffer
 } GIFContext;
 
-/* GIF header */
-static int gif_image_write_header(AVCodecContext *avctx,
-                                  uint8_t **bytestream, uint32_t *palette)
+enum {
+    GF_OFFSETTING = 1<<0,
+    GF_TRANSDIFF  = 1<<1,
+};
+
+static int is_image_translucent(AVCodecContext *avctx,
+                                const uint8_t *buf, const int linesize)
 {
-    int i;
-    unsigned int v;
-
-    bytestream_put_buffer(bytestream, "GIF", 3);
-    bytestream_put_buffer(bytestream, "89a", 3);
-    bytestream_put_le16(bytestream, avctx->width);
-    bytestream_put_le16(bytestream, avctx->height);
-
-    bytestream_put_byte(bytestream, 0xf7); /* flags: global clut, 256 entries */
-    bytestream_put_byte(bytestream, 0x1f); /* background color index */
-    bytestream_put_byte(bytestream, 0); /* aspect ratio */
-
-    /* the global palette */
-    for(i=0;i<256;i++) {
-        v = palette[i];
-        bytestream_put_be24(bytestream, v);
+    GIFContext *s = avctx->priv_data;
+    int trans = s->transparent_index;
+
+    if (trans < 0)
+        return 0;
+
+    for (int y = 0; y < avctx->height; y++) {
+        for (int x = 0; x < avctx->width; x++) {
+            if (buf[x] == trans) {
+                return 1;
+            }
+        }
+        buf += linesize;
     }
 
     return 0;
 }
 
+static int get_palette_transparency_index(const uint32_t *palette)
+{
+    int transparent_color_index = -1;
+    unsigned i, smallest_alpha = 0xff;
+
+    if (!palette)
+        return -1;
+
+    for (i = 0; i < AVPALETTE_COUNT; i++) {
+        const uint32_t v = palette[i];
+        if (v >> 24 < smallest_alpha) {
+            smallest_alpha = v >> 24;
+            transparent_color_index = i;
+        }
+    }
+    return smallest_alpha < 128 ? transparent_color_index : -1;
+}
+
+static int pick_palette_entry(const uint8_t *buf, int linesize, int w, int h)
+{
+    int histogram[AVPALETTE_COUNT] = {0};
+    int x, y, i;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < w; x++)
+            histogram[buf[x]]++;
+        buf += linesize;
+    }
+    for (i = 0; i < FF_ARRAY_ELEMS(histogram); i++)
+        if (!histogram[i])
+            return i;
+    return -1;
+}
+
+static void gif_crop_translucent(AVCodecContext *avctx,
+                                 const uint8_t *buf, const int linesize,
+                                 int *width, int *height,
+                                 int *x_start, int *y_start)
+{
+    GIFContext *s = avctx->priv_data;
+    int trans = s->transparent_index;
+
+    /* Crop image */
+    if ((s->flags & GF_OFFSETTING) && trans >= 0) {
+        const int w = avctx->width;
+        const int h = avctx->height;
+        int x_end = w - 1,
+            y_end = h - 1;
+
+        // crop top
+        while (*y_start < y_end) {
+            int is_trans = 1;
+            for (int i = 0; i < w; i++) {
+                if (buf[w * *y_start + i] != trans) {
+                    is_trans = 0;
+                    break;
+                }
+            }
+
+            if (!is_trans)
+                break;
+            (*y_start)++;
+        }
+
+        // crop bottom
+        while (y_end < h) {
+            int is_trans = 1;
+            for (int i = 0; i < w; i++) {
+                if (buf[w * y_end + i] != trans) {
+                    is_trans = 0;
+                    break;
+                }
+            }
+            if (!is_trans)
+                break;
+            y_end--;
+        }
+
+        // crop left
+        while (*x_start < x_end) {
+            int is_trans = 1;
+            for (int i = *y_start; i < y_end; i++) {
+                if (buf[w * i + *x_start] != trans) {
+                    is_trans = 0;
+                    break;
+                }
+            }
+            if (!is_trans)
+                break;
+            (*x_start)++;
+        }
+
+        // crop right
+        while (x_end < w) {
+            int is_trans = 1;
+            for (int i = *y_start; i < y_end; i++) {
+                if (buf[w * i + x_end] != trans) {
+                    is_trans = 0;
+                    break;
+                }
+            }
+            if (!is_trans)
+                break;
+            x_end--;
+        }
+
+        *height = y_end + 1 - *y_start;
+        *width  = x_end + 1 - *x_start;
+        av_log(avctx, AV_LOG_DEBUG,"%dx%d image at pos (%d;%d) [area:%dx%d]\n",
+               *width, *height, *x_start, *y_start, avctx->width, avctx->height);
+    }
+}
+
+static void gif_crop_opaque(AVCodecContext *avctx,
+                            const uint32_t *palette,
+                            const uint8_t *buf, const int linesize,
+                            int *width, int *height, int *x_start, int *y_start)
+{
+    GIFContext *s = avctx->priv_data;
+
+    /* Crop image */
+    if ((s->flags & GF_OFFSETTING) && s->last_frame && !palette) {
+        const uint8_t *ref = s->last_frame->data[0];
+        const int ref_linesize = s->last_frame->linesize[0];
+        int x_end = avctx->width  - 1,
+            y_end = avctx->height - 1;
+
+        /* skip common lines */
+        while (*y_start < y_end) {
+            if (memcmp(ref + *y_start*ref_linesize, buf + *y_start*linesize, *width))
+                break;
+            (*y_start)++;
+        }
+        while (y_end > *y_start) {
+            if (memcmp(ref + y_end*ref_linesize, buf + y_end*linesize, *width))
+                break;
+            y_end--;
+        }
+        *height = y_end + 1 - *y_start;
+
+        /* skip common columns */
+        while (*x_start < x_end) {
+            int same_column = 1;
+            for (int y = *y_start; y <= y_end; y++) {
+                if (ref[y*ref_linesize + *x_start] != buf[y*linesize + *x_start]) {
+                    same_column = 0;
+                    break;
+                }
+            }
+            if (!same_column)
+                break;
+            (*x_start)++;
+        }
+        while (x_end > *x_start) {
+            int same_column = 1;
+            for (int y = *y_start; y <= y_end; y++) {
+                if (ref[y*ref_linesize + x_end] != buf[y*linesize + x_end]) {
+                    same_column = 0;
+                    break;
+                }
+            }
+            if (!same_column)
+                break;
+            x_end--;
+        }
+        *width = x_end + 1 - *x_start;
+
+        av_log(avctx, AV_LOG_DEBUG,"%dx%d image at pos (%d;%d) [area:%dx%d]\n",
+               *width, *height, *x_start, *y_start, avctx->width, avctx->height);
+    }
+}
+
 static int gif_image_write_image(AVCodecContext *avctx,
                                  uint8_t **bytestream, uint8_t *end,
-                                 const uint8_t *buf, int linesize)
+                                 const uint32_t *palette,
+                                 const uint8_t *buf, const int linesize,
+                                 AVPacket *pkt)
 {
     GIFContext *s = avctx->priv_data;
-    int len = 0, height;
+    int disposal, len = 0, height = avctx->height, width = avctx->width, x, y;
+    int x_start = 0, y_start = 0, trans = s->transparent_index;
+    int bcid = -1, honor_transparency = (s->flags & GF_TRANSDIFF) && s->last_frame && !palette;
     const uint8_t *ptr;
+
+    if (!s->image && avctx->frame_number && is_image_translucent(avctx, buf, linesize)) {
+        gif_crop_translucent(avctx, buf, linesize, &width, &height, &x_start, &y_start);
+        honor_transparency = 0;
+        disposal = GCE_DISPOSAL_BACKGROUND;
+    } else {
+        gif_crop_opaque(avctx, palette, buf, linesize, &width, &height, &x_start, &y_start);
+        disposal = GCE_DISPOSAL_INPLACE;
+    }
+
+    if (s->image || !avctx->frame_number) { /* GIF header */
+        const uint32_t *global_palette = palette ? palette : s->palette;
+        const AVRational sar = avctx->sample_aspect_ratio;
+        int64_t aspect = 0;
+
+        if (sar.num > 0 && sar.den > 0) {
+            aspect = sar.num * 64LL / sar.den - 15;
+            if (aspect < 0 || aspect > 255)
+                aspect = 0;
+        }
+
+        bytestream_put_buffer(bytestream, gif89a_sig, sizeof(gif89a_sig));
+        bytestream_put_le16(bytestream, avctx->width);
+        bytestream_put_le16(bytestream, avctx->height);
+
+        bcid = get_palette_transparency_index(global_palette);
+
+        bytestream_put_byte(bytestream, 0xf7); /* flags: global clut, 256 entries */
+        bytestream_put_byte(bytestream, bcid < 0 ? DEFAULT_TRANSPARENCY_INDEX : bcid); /* background color index */
+        bytestream_put_byte(bytestream, aspect);
+        for (int i = 0; i < 256; i++) {
+            const uint32_t v = global_palette[i] & 0xffffff;
+            bytestream_put_be24(bytestream, v);
+        }
+    }
+
+    if (honor_transparency && trans < 0) {
+        trans = pick_palette_entry(buf + y_start*linesize + x_start,
+                                   linesize, width, height);
+        if (trans < 0) // TODO, patch welcome
+            av_log(avctx, AV_LOG_DEBUG, "No available color, can not use transparency\n");
+    }
+
+    if (trans < 0)
+        honor_transparency = 0;
+
+    bcid = honor_transparency || disposal == GCE_DISPOSAL_BACKGROUND ? trans : get_palette_transparency_index(palette);
+
+    /* graphic control extension */
+    bytestream_put_byte(bytestream, GIF_EXTENSION_INTRODUCER);
+    bytestream_put_byte(bytestream, GIF_GCE_EXT_LABEL);
+    bytestream_put_byte(bytestream, 0x04); /* block size */
+    bytestream_put_byte(bytestream, disposal<<2 | (bcid >= 0));
+    bytestream_put_le16(bytestream, 5); // default delay
+    bytestream_put_byte(bytestream, bcid < 0 ? DEFAULT_TRANSPARENCY_INDEX : bcid);
+    bytestream_put_byte(bytestream, 0x00);
+
     /* image block */
+    bytestream_put_byte(bytestream, GIF_IMAGE_SEPARATOR);
+    bytestream_put_le16(bytestream, x_start);
+    bytestream_put_le16(bytestream, y_start);
+    bytestream_put_le16(bytestream, width);
+    bytestream_put_le16(bytestream, height);
 
-    bytestream_put_byte(bytestream, 0x2c);
-    bytestream_put_le16(bytestream, 0);
-    bytestream_put_le16(bytestream, 0);
-    bytestream_put_le16(bytestream, avctx->width);
-    bytestream_put_le16(bytestream, avctx->height);
-    bytestream_put_byte(bytestream, 0x00); /* flags */
-    /* no local clut */
+    if (!palette) {
+        bytestream_put_byte(bytestream, 0x00); /* flags */
+    } else {
+        unsigned i;
+        bytestream_put_byte(bytestream, 1<<7 | 0x7); /* flags */
+        for (i = 0; i < AVPALETTE_COUNT; i++) {
+            const uint32_t v = palette[i];
+            bytestream_put_be24(bytestream, v);
+        }
+    }
 
     bytestream_put_byte(bytestream, 0x08);
 
-    ff_lzw_encode_init(s->lzw, s->buf, avctx->width*avctx->height,
+    ff_lzw_encode_init(s->lzw, s->buf, s->buf_size,
                        12, FF_LZW_GIF, put_bits);
 
-    ptr = buf;
-    for (height = avctx->height; height--;) {
-        len += ff_lzw_encode(s->lzw, ptr, avctx->width);
-        ptr += linesize;
+    ptr = buf + y_start*linesize + x_start;
+    if (honor_transparency) {
+        const int ref_linesize = s->last_frame->linesize[0];
+        const uint8_t *ref = s->last_frame->data[0] + y_start*ref_linesize + x_start;
+
+        for (y = 0; y < height; y++) {
+            memcpy(s->tmpl, ptr, width);
+            for (x = 0; x < width; x++)
+                if (ref[x] == ptr[x])
+                    s->tmpl[x] = trans;
+            len += ff_lzw_encode(s->lzw, s->tmpl, width);
+            ptr += linesize;
+            ref += ref_linesize;
+        }
+    } else {
+        for (y = 0; y < height; y++) {
+            len += ff_lzw_encode(s->lzw, ptr, width);
+            ptr += linesize;
+        }
     }
     len += ff_lzw_encode_flush(s->lzw, flush_put_bits);
 
@@ -122,7 +379,6 @@ static int gif_image_write_image(AVCodecContext *avctx,
         len -= size;
     }
     bytestream_put_byte(bytestream, 0x00); /* end of image block */
-    bytestream_put_byte(bytestream, 0x3b);
     return 0;
 }
 
@@ -130,41 +386,69 @@ static av_cold int gif_encode_init(AVCodecContext *avctx)
 {
     GIFContext *s = avctx->priv_data;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    if (avctx->width > 65535 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR, "GIF does not support resolutions above 65535x65535\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->transparent_index = -1;
 
     s->lzw = av_mallocz(ff_lzw_encode_state_size);
-    if (!s->lzw)
+    s->buf_size = avctx->width*avctx->height*2 + 1000;
+    s->buf = av_malloc(s->buf_size);
+    s->tmpl = av_malloc(avctx->width);
+    if (!s->tmpl || !s->buf || !s->lzw)
         return AVERROR(ENOMEM);
-    s->buf = av_malloc(avctx->width*avctx->height*2);
-    if (!s->buf)
-         return AVERROR(ENOMEM);
+
+    if (avpriv_set_systematic_pal2(s->palette, avctx->pix_fmt) < 0)
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_PAL8);
+
     return 0;
 }
 
-/* better than nothing gif encoder */
 static int gif_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             const AVFrame *pict, int *got_packet)
 {
+    GIFContext *s = avctx->priv_data;
     uint8_t *outbuf_ptr, *end;
+    const uint32_t *palette = NULL;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width*avctx->height*7/5 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*7/5 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
     outbuf_ptr = pkt->data;
     end        = pkt->data + pkt->size;
 
-    gif_image_write_header(avctx, &outbuf_ptr, (uint32_t *)pict->data[1]);
-    gif_image_write_image(avctx, &outbuf_ptr, end, pict->data[0], pict->linesize[0]);
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        palette = (uint32_t*)pict->data[1];
+
+        if (!s->palette_loaded) {
+            memcpy(s->palette, palette, AVPALETTE_SIZE);
+            s->transparent_index = get_palette_transparency_index(palette);
+            s->palette_loaded = 1;
+        } else if (!memcmp(s->palette, palette, AVPALETTE_SIZE)) {
+            palette = NULL;
+        }
+    }
+
+    gif_image_write_image(avctx, &outbuf_ptr, end, palette,
+                          pict->data[0], pict->linesize[0], pkt);
+    if (!s->last_frame && !s->image) {
+        s->last_frame = av_frame_alloc();
+        if (!s->last_frame)
+            return AVERROR(ENOMEM);
+    }
+
+    if (!s->image) {
+        av_frame_unref(s->last_frame);
+        ret = av_frame_ref(s->last_frame, (AVFrame*)pict);
+        if (ret < 0)
+            return ret;
+    }
 
     pkt->size   = outbuf_ptr - pkt->data;
-    pkt->flags |= AV_PKT_FLAG_KEY;
+    if (s->image || !avctx->frame_number)
+        pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
 
     return 0;
@@ -176,9 +460,29 @@ static int gif_encode_close(AVCodecContext *avctx)
 
     av_freep(&s->lzw);
     av_freep(&s->buf);
+    s->buf_size = 0;
+    av_frame_free(&s->last_frame);
+    av_freep(&s->tmpl);
     return 0;
 }
 
+#define OFFSET(x) offsetof(GIFContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption gif_options[] = {
+    { "gifflags", "set GIF flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = GF_OFFSETTING|GF_TRANSDIFF}, 0, INT_MAX, FLAGS, "flags" },
+        { "offsetting", "enable picture offsetting", 0, AV_OPT_TYPE_CONST, {.i64=GF_OFFSETTING}, INT_MIN, INT_MAX, FLAGS, "flags" },
+        { "transdiff", "enable transparency detection between frames", 0, AV_OPT_TYPE_CONST, {.i64=GF_TRANSDIFF}, INT_MIN, INT_MAX, FLAGS, "flags" },
+    { "gifimage", "enable encoding only images per frame", OFFSET(image), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS, "flags" },
+    { NULL }
+};
+
+static const AVClass gif_class = {
+    .class_name = "GIF encoder",
+    .item_name  = av_default_item_name,
+    .option     = gif_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_gif_encoder = {
     .name           = "gif",
     .long_name      = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
@@ -192,4 +496,5 @@ AVCodec ff_gif_encoder = {
         AV_PIX_FMT_RGB8, AV_PIX_FMT_BGR8, AV_PIX_FMT_RGB4_BYTE, AV_PIX_FMT_BGR4_BYTE,
         AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8, AV_PIX_FMT_NONE
     },
+    .priv_class     = &gif_class,
 };
diff --git a/libavcodec/gif.h b/libavcodec/gif.h
new file mode 100644
index 0000000..7fb6149
--- /dev/null
+++ b/libavcodec/gif.h
@@ -0,0 +1,50 @@
+/*
+ * GIF format definitions
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2012 Vitaliy E Sugrobov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * GIF format definitions.
+ */
+
+#ifndef AVCODEC_GIF_H
+#define AVCODEC_GIF_H
+
+#include <stdint.h>
+
+static const uint8_t gif87a_sig[6] = "GIF87a";
+static const uint8_t gif89a_sig[6] = "GIF89a";
+
+#define GCE_DISPOSAL_NONE       0
+#define GCE_DISPOSAL_INPLACE    1
+#define GCE_DISPOSAL_BACKGROUND 2
+#define GCE_DISPOSAL_RESTORE    3
+
+#define GIF_TRAILER                 0x3b
+#define GIF_EXTENSION_INTRODUCER    0x21
+#define GIF_IMAGE_SEPARATOR         0x2c
+#define GIF_GCE_EXT_LABEL           0xf9
+#define GIF_COM_EXT_LABEL           0xfe
+#define GIF_APP_EXT_LABEL           0xff
+#define NETSCAPE_EXT_STR            "NETSCAPE2.0"
+
+#endif /* AVCODEC_GIF_H */
diff --git a/libavcodec/gif_parser.c b/libavcodec/gif_parser.c
new file mode 100644
index 0000000..e88338f
--- /dev/null
+++ b/libavcodec/gif_parser.c
@@ -0,0 +1,188 @@
+/*
+ * GIF parser
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * GIF parser
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/bswap.h"
+#include "libavutil/common.h"
+
+#include "gif.h"
+#include "parser.h"
+
+typedef enum GIFParseStates {
+    GIF_HEADER = 1,
+    GIF_EXTENSION,
+    GIF_EXTENSION_BLOCK,
+    GIF_IMAGE,
+    GIF_IMAGE_BLOCK,
+} gif_states;
+
+typedef struct GIFParseContext {
+    ParseContext pc;
+    unsigned found_sig;
+    int found_start;
+    int found_end;
+    int index;
+    int state;
+    int gct_flag;
+    int gct_size;
+    int block_size;
+    int etype;
+    int delay;
+} GIFParseContext;
+
+static int gif_find_frame_end(GIFParseContext *g, const uint8_t *buf,
+                              int buf_size, void *logctx)
+{
+    int index, next = END_NOT_FOUND;
+
+    for (index = 0; index < buf_size; index++) {
+        if (!g->state) {
+            if (!memcmp(buf + index, gif87a_sig, 6) ||
+                !memcmp(buf + index, gif89a_sig, 6)) {
+                g->state = GIF_HEADER;
+                g->found_sig++;
+            } else if (buf[index] == GIF_EXTENSION_INTRODUCER) {
+                g->state = GIF_EXTENSION;
+                g->found_start = 1;
+            } else if (buf[index] == GIF_IMAGE_SEPARATOR) {
+                g->state = GIF_IMAGE;
+            } else if (buf[index] == GIF_TRAILER) {
+                g->state = 0;
+                g->found_end = 1;
+                g->found_sig = 0;
+            } else {
+                g->found_sig = 0;
+            }
+        }
+
+        if (g->state == GIF_HEADER) {
+            if (g->index == 10) {
+                g->gct_flag = !!(buf[index] & 0x80);
+                g->gct_size = 3 * (1 << ((buf[index] & 0x07) + 1));
+            }
+            if (g->index >= 12 + g->gct_flag * g->gct_size) {
+                g->state = 0;
+                g->index = 0;
+                g->gct_flag = 0;
+                g->gct_size = 0;
+                continue;
+            }
+            g->index++;
+        } else if (g->state == GIF_EXTENSION) {
+            if (g->found_start && g->found_end && g->found_sig) {
+                next = index;
+                g->found_start = 0;
+                g->found_end = 0;
+                g->index = 0;
+                g->gct_flag = 0;
+                g->gct_size = 0;
+                g->state = 0;
+                break;
+            }
+            if (g->index == 1) {
+                g->etype = buf[index];
+            }
+            if (g->index >= 2) {
+                g->block_size = buf[index];
+                g->index = 0;
+                g->state = GIF_EXTENSION_BLOCK;
+                continue;
+            }
+            g->index++;
+        } else if (g->state == GIF_IMAGE_BLOCK) {
+            if (!g->index)
+                g->block_size = buf[index];
+            if (g->index >= g->block_size) {
+                g->index = 0;
+                if (!g->block_size) {
+                    g->state = 0;
+                    g->found_end = 1;
+                }
+                continue;
+            }
+            g->index++;
+        } else if (g->state == GIF_EXTENSION_BLOCK) {
+            if (g->etype == GIF_GCE_EXT_LABEL) {
+                if (g->index == 0)
+                    g->delay = 0;
+                if (g->index >= 1 && g->index <= 2) {
+                    g->delay |= buf[index] << (8 * (g->index - 1));
+                }
+            }
+            if (g->index >= g->block_size) {
+                g->block_size = buf[index];
+                g->index = 0;
+                if (!g->block_size)
+                    g->state = 0;
+                continue;
+            }
+            g->index++;
+        } else if (g->state == GIF_IMAGE) {
+            if (g->index == 8) {
+                g->gct_flag = !!(buf[index] & 0x80);
+                g->gct_size = 3 * (1 << ((buf[index] & 0x07) + 1));
+            }
+            if (g->index >= 10 + g->gct_flag * g->gct_size) {
+                g->state = GIF_IMAGE_BLOCK;
+                g->index = 0;
+                g->gct_flag = 0;
+                g->gct_size = 0;
+                continue;
+            }
+            g->index++;
+        }
+    }
+
+    return next;
+}
+
+static int gif_parse(AVCodecParserContext *s, AVCodecContext *avctx,
+                     const uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size)
+{
+    GIFParseContext *g = s->priv_data;
+    int next;
+
+    next = gif_find_frame_end(g, buf, buf_size, avctx);
+    if (ff_combine_frame(&g->pc, next, &buf, &buf_size) < 0) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+        return buf_size;
+    }
+
+    s->duration   = g->delay;
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_gif_parser = {
+    .codec_ids      = { AV_CODEC_ID_GIF },
+    .priv_data_size = sizeof(GIFParseContext),
+    .parser_parse   = gif_parse,
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/gifdec.c b/libavcodec/gifdec.c
index f08d501..2115da1 100644
--- a/libavcodec/gifdec.c
+++ b/libavcodec/gifdec.c
@@ -2,122 +2,276 @@
  * GIF decoder
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2012 Vitaliy E Sugrobov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lzw.h"
+#include "gif.h"
 
-#define GCE_DISPOSAL_NONE       0
-#define GCE_DISPOSAL_INPLACE    1
-#define GCE_DISPOSAL_BACKGROUND 2
-#define GCE_DISPOSAL_RESTORE    3
+/* This value is intentionally set to "transparent white" color.
+ * It is much better to have white background instead of black
+ * when gif image converted to format which not support transparency.
+ */
+#define GIF_TRANSPARENT_COLOR    0x00ffffff
 
 typedef struct GifState {
+    const AVClass *class;
+    AVFrame *frame;
     int screen_width;
     int screen_height;
+    int has_global_palette;
     int bits_per_pixel;
+    uint32_t bg_color;
     int background_color_index;
     int transparent_color_index;
     int color_resolution;
-    uint32_t *image_palette;
+    /* intermediate buffer for storing color indices
+     * obtained from lzw-encoded data stream */
+    uint8_t *idx_line;
+    int idx_line_size;
 
     /* after the frame is displayed, the disposal method is used */
+    int gce_prev_disposal;
     int gce_disposal;
-    /* delay during which the frame is shown */
-    int gce_delay;
+    /* rectangle describing area that must be disposed */
+    int gce_l, gce_t, gce_w, gce_h;
+    /* depending on disposal method we store either part of the image
+     * drawn on the canvas or background color that
+     * should be used upon disposal */
+    uint32_t * stored_img;
+    int stored_img_size;
+    int stored_bg_color;
 
-    /* LZW compatible decoder */
     GetByteContext gb;
     LZWState *lzw;
 
     /* aux buffers */
-    uint8_t global_palette[256 * 3];
-    uint8_t local_palette[256 * 3];
+    uint32_t global_palette[256];
+    uint32_t local_palette[256];
 
-  AVCodecContext* avctx;
+    AVCodecContext *avctx;
+    int keyframe;
+    int keyframe_ok;
+    int trans_color;    /**< color value that is used instead of transparent color */
 } GifState;
 
-static const uint8_t gif87a_sig[6] = "GIF87a";
-static const uint8_t gif89a_sig[6] = "GIF89a";
+static void gif_read_palette(GifState *s, uint32_t *pal, int nb)
+{
+    int i;
+
+    for (i = 0; i < nb; i++, pal++)
+        *pal = (0xffu << 24) | bytestream2_get_be24u(&s->gb);
+}
+
+static void gif_fill(AVFrame *picture, uint32_t color)
+{
+    uint32_t *p = (uint32_t *)picture->data[0];
+    uint32_t *p_end = p + (picture->linesize[0] / sizeof(uint32_t)) * picture->height;
+
+    for (; p < p_end; p++)
+        *p = color;
+}
+
+static void gif_fill_rect(AVFrame *picture, uint32_t color, int l, int t, int w, int h)
+{
+    const int linesize = picture->linesize[0] / sizeof(uint32_t);
+    const uint32_t *py = (uint32_t *)picture->data[0] + t * linesize;
+    const uint32_t *pr, *pb = py + h * linesize;
+    uint32_t *px;
+
+    for (; py < pb; py += linesize) {
+        px = (uint32_t *)py + l;
+        pr = px + w;
+
+        for (; px < pr; px++)
+            *px = color;
+    }
+}
+
+static void gif_copy_img_rect(const uint32_t *src, uint32_t *dst,
+                              int linesize, int l, int t, int w, int h)
+{
+    const int y_start = t * linesize;
+    const uint32_t *src_px,
+                   *src_py = src + y_start,
+                   *dst_py = dst + y_start;
+    const uint32_t *src_pb = src_py + h * linesize;
+    uint32_t *dst_px;
+
+    for (; src_py < src_pb; src_py += linesize, dst_py += linesize) {
+        src_px = src_py + l;
+        dst_px = (uint32_t *)dst_py + l;
+
+        memcpy(dst_px, src_px, w * sizeof(uint32_t));
+    }
+}
 
 static int gif_read_image(GifState *s, AVFrame *frame)
 {
-    int left, top, width, height, bits_per_pixel, code_size, flags;
-    int is_interleaved, has_local_palette, y, pass, y1, linesize, n, i;
-    uint8_t *ptr, *spal, *palette, *ptr1;
-
-    left   = bytestream2_get_le16(&s->gb);
-    top    = bytestream2_get_le16(&s->gb);
-    width  = bytestream2_get_le16(&s->gb);
-    height = bytestream2_get_le16(&s->gb);
-    flags  = bytestream2_get_byte(&s->gb);
+    int left, top, width, height, bits_per_pixel, code_size, flags, pw;
+    int is_interleaved, has_local_palette, y, pass, y1, linesize, pal_size, lzwed_len;
+    uint32_t *ptr, *pal, *px, *pr, *ptr1;
+    int ret;
+    uint8_t *idx;
+
+    /* At least 9 bytes of Image Descriptor. */
+    if (bytestream2_get_bytes_left(&s->gb) < 9)
+        return AVERROR_INVALIDDATA;
+
+    left   = bytestream2_get_le16u(&s->gb);
+    top    = bytestream2_get_le16u(&s->gb);
+    width  = bytestream2_get_le16u(&s->gb);
+    height = bytestream2_get_le16u(&s->gb);
+    flags  = bytestream2_get_byteu(&s->gb);
     is_interleaved = flags & 0x40;
     has_local_palette = flags & 0x80;
     bits_per_pixel = (flags & 0x07) + 1;
 
-    ff_dlog(s->avctx, "gif: image x=%d y=%d w=%d h=%d\n", left, top, width, height);
+    ff_dlog(s->avctx, "image x=%d y=%d w=%d h=%d\n", left, top, width, height);
 
     if (has_local_palette) {
-        bytestream2_get_buffer(&s->gb, s->local_palette, 3 * (1 << bits_per_pixel));
-        palette = s->local_palette;
+        pal_size = 1 << bits_per_pixel;
+
+        if (bytestream2_get_bytes_left(&s->gb) < pal_size * 3)
+            return AVERROR_INVALIDDATA;
+
+        gif_read_palette(s, s->local_palette, pal_size);
+        pal = s->local_palette;
     } else {
-        palette = s->global_palette;
-        bits_per_pixel = s->bits_per_pixel;
+        if (!s->has_global_palette) {
+            av_log(s->avctx, AV_LOG_ERROR, "picture doesn't have either global or local palette.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        pal = s->global_palette;
+    }
+
+    if (s->keyframe) {
+        if (s->transparent_color_index == -1 && s->has_global_palette) {
+            /* transparency wasn't set before the first frame, fill with background color */
+            gif_fill(frame, s->bg_color);
+        } else {
+            /* otherwise fill with transparent color.
+             * this is necessary since by default picture filled with 0x80808080. */
+            gif_fill(frame, s->trans_color);
+        }
     }
 
     /* verify that all the image is inside the screen dimensions */
-    if (left + width > s->screen_width ||
-        top + height > s->screen_height ||
-        !width || !height) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid image dimensions.\n");
+    if (!width || width > s->screen_width) {
+        av_log(s->avctx, AV_LOG_WARNING, "Invalid image width: %d, truncating.\n", width);
+        width = s->screen_width;
+    }
+    if (left >= s->screen_width) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid left position: %d.\n", left);
+        return AVERROR_INVALIDDATA;
+    }
+    if (!height || height > s->screen_height) {
+        av_log(s->avctx, AV_LOG_WARNING, "Invalid image height: %d, truncating.\n", height);
+        height = s->screen_height;
+    }
+    if (top >= s->screen_height) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid top position: %d.\n", top);
         return AVERROR_INVALIDDATA;
     }
+    if (left + width > s->screen_width) {
+        /* width must be kept around to avoid lzw vs line desync */
+        pw = s->screen_width - left;
+        av_log(s->avctx, AV_LOG_WARNING, "Image too wide by %d, truncating.\n",
+               left + width - s->screen_width);
+    } else {
+        pw = width;
+    }
+    if (top + height > s->screen_height) {
+        /* we don't care about the extra invisible lines */
+        av_log(s->avctx, AV_LOG_WARNING, "Image too high by %d, truncating.\n",
+               top + height - s->screen_height);
+        height = s->screen_height - top;
+    }
+
+    /* process disposal method */
+    if (s->gce_prev_disposal == GCE_DISPOSAL_BACKGROUND) {
+        gif_fill_rect(frame, s->stored_bg_color, s->gce_l, s->gce_t, s->gce_w, s->gce_h);
+    } else if (s->gce_prev_disposal == GCE_DISPOSAL_RESTORE) {
+        gif_copy_img_rect(s->stored_img, (uint32_t *)frame->data[0],
+            frame->linesize[0] / sizeof(uint32_t), s->gce_l, s->gce_t, s->gce_w, s->gce_h);
+    }
+
+    s->gce_prev_disposal = s->gce_disposal;
 
-    /* build the palette */
-    n = (1 << bits_per_pixel);
-    spal = palette;
-    for(i = 0; i < n; i++) {
-        s->image_palette[i] = (0xffu << 24) | AV_RB24(spal);
-        spal += 3;
+    if (s->gce_disposal != GCE_DISPOSAL_NONE) {
+        s->gce_l = left;  s->gce_t = top;
+        s->gce_w = pw;    s->gce_h = height;
+
+        if (s->gce_disposal == GCE_DISPOSAL_BACKGROUND) {
+            if (s->transparent_color_index >= 0)
+                s->stored_bg_color = s->trans_color;
+            else
+                s->stored_bg_color = s->bg_color;
+        } else if (s->gce_disposal == GCE_DISPOSAL_RESTORE) {
+            av_fast_malloc(&s->stored_img, &s->stored_img_size, frame->linesize[0] * frame->height);
+            if (!s->stored_img)
+                return AVERROR(ENOMEM);
+
+            gif_copy_img_rect((uint32_t *)frame->data[0], s->stored_img,
+                frame->linesize[0] / sizeof(uint32_t), left, top, pw, height);
+        }
     }
-    for(; i < 256; i++)
-        s->image_palette[i] = (0xffu << 24);
-    /* handle transparency */
-    if (s->transparent_color_index >= 0)
-        s->image_palette[s->transparent_color_index] = 0;
+
+    /* Expect at least 2 bytes: 1 for lzw code size and 1 for block size. */
+    if (bytestream2_get_bytes_left(&s->gb) < 2)
+        return AVERROR_INVALIDDATA;
 
     /* now get the image data */
-    code_size = bytestream2_get_byte(&s->gb);
-    ff_lzw_decode_init(s->lzw, code_size, s->gb.buffer,
-                       bytestream2_get_bytes_left(&s->gb), FF_LZW_GIF);
+    code_size = bytestream2_get_byteu(&s->gb);
+    if ((ret = ff_lzw_decode_init(s->lzw, code_size, s->gb.buffer,
+                                  bytestream2_get_bytes_left(&s->gb), FF_LZW_GIF)) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "LZW init failed\n");
+        return ret;
+    }
 
     /* read all the image */
-    linesize = frame->linesize[0];
-    ptr1 = frame->data[0] + top * linesize + left;
+    linesize = frame->linesize[0] / sizeof(uint32_t);
+    ptr1 = (uint32_t *)frame->data[0] + top * linesize + left;
     ptr = ptr1;
     pass = 0;
     y1 = 0;
     for (y = 0; y < height; y++) {
-        ff_lzw_decode(s->lzw, ptr, width);
+        int count = ff_lzw_decode(s->lzw, s->idx_line, width);
+        if (count != width) {
+            if (count)
+                av_log(s->avctx, AV_LOG_ERROR, "LZW decode failed\n");
+            goto decode_tail;
+        }
+
+        pr = ptr + pw;
+
+        for (px = ptr, idx = s->idx_line; px < pr; px++, idx++) {
+            if (*idx != s->transparent_color_index)
+                *px = pal[*idx];
+        }
+
         if (is_interleaved) {
             switch(pass) {
             default:
@@ -144,53 +298,77 @@ static int gif_read_image(GifState *s, AVFrame *frame)
             ptr += linesize;
         }
     }
+
+ decode_tail:
     /* read the garbage data until end marker is found */
-    ff_lzw_decode_tail(s->lzw);
+    lzwed_len = ff_lzw_decode_tail(s->lzw);
+    bytestream2_skipu(&s->gb, lzwed_len);
+
+    /* Graphic Control Extension's scope is single frame.
+     * Remove its influence. */
+    s->transparent_color_index = -1;
+    s->gce_disposal = GCE_DISPOSAL_NONE;
 
-    bytestream2_skip(&s->gb, ff_lzw_size_read(s->lzw));
     return 0;
 }
 
 static int gif_read_extension(GifState *s)
 {
-    int ext_code, ext_len, i, gce_flags, gce_transparent_index;
+    int ext_code, ext_len, gce_flags, gce_transparent_index;
 
-    /* extension */
-    ext_code = bytestream2_get_byte(&s->gb);
-    ext_len  = bytestream2_get_byte(&s->gb);
+    /* There must be at least 2 bytes:
+     * 1 for extension label and 1 for extension length. */
+    if (bytestream2_get_bytes_left(&s->gb) < 2)
+        return AVERROR_INVALIDDATA;
+
+    ext_code = bytestream2_get_byteu(&s->gb);
+    ext_len  = bytestream2_get_byteu(&s->gb);
 
-    ff_dlog(s->avctx, "gif: ext_code=0x%x len=%d\n", ext_code, ext_len);
+    ff_dlog(s->avctx, "ext_code=0x%x len=%d\n", ext_code, ext_len);
 
     switch(ext_code) {
-    case 0xf9:
+    case GIF_GCE_EXT_LABEL:
         if (ext_len != 4)
             goto discard_ext;
-        s->transparent_color_index = -1;
-        gce_flags    = bytestream2_get_byte(&s->gb);
-        s->gce_delay = bytestream2_get_le16(&s->gb);
-        gce_transparent_index = bytestream2_get_byte(&s->gb);
+
+        /* We need at least 5 bytes more: 4 is for extension body
+         * and 1 for next block size. */
+        if (bytestream2_get_bytes_left(&s->gb) < 5)
+            return AVERROR_INVALIDDATA;
+
+        gce_flags    = bytestream2_get_byteu(&s->gb);
+        bytestream2_skipu(&s->gb, 2);    // delay during which the frame is shown
+        gce_transparent_index = bytestream2_get_byteu(&s->gb);
         if (gce_flags & 0x01)
             s->transparent_color_index = gce_transparent_index;
         else
             s->transparent_color_index = -1;
         s->gce_disposal = (gce_flags >> 2) & 0x7;
 
-        ff_dlog(s->avctx, "gif: gce_flags=%x delay=%d tcolor=%d disposal=%d\n",
-               gce_flags, s->gce_delay,
+        ff_dlog(s->avctx, "gce_flags=%x tcolor=%d disposal=%d\n",
+               gce_flags,
                s->transparent_color_index, s->gce_disposal);
 
-        ext_len = bytestream2_get_byte(&s->gb);
+        if (s->gce_disposal > 3) {
+            s->gce_disposal = GCE_DISPOSAL_NONE;
+            ff_dlog(s->avctx, "invalid value in gce_disposal (%d). Using default value of 0.\n", ext_len);
+        }
+
+        ext_len = bytestream2_get_byteu(&s->gb);
         break;
     }
 
     /* NOTE: many extension blocks can come after */
  discard_ext:
-    while (ext_len != 0) {
-        for (i = 0; i < ext_len; i++)
-            bytestream2_get_byte(&s->gb);
-        ext_len = bytestream2_get_byte(&s->gb);
+    while (ext_len) {
+        /* There must be at least ext_len bytes and 1 for next block size byte. */
+        if (bytestream2_get_bytes_left(&s->gb) < ext_len + 1)
+            return AVERROR_INVALIDDATA;
+
+        bytestream2_skipu(&s->gb, ext_len);
+        ext_len = bytestream2_get_byteu(&s->gb);
 
-        ff_dlog(s->avctx, "gif: ext_len1=%d\n", ext_len);
+        ff_dlog(s->avctx, "ext_len1=%d\n", ext_len);
     }
     return 0;
 }
@@ -199,44 +377,48 @@ static int gif_read_header1(GifState *s)
 {
     uint8_t sig[6];
     int v, n;
-    int has_global_palette;
+    int background_color_index;
 
     if (bytestream2_get_bytes_left(&s->gb) < 13)
         return AVERROR_INVALIDDATA;
 
     /* read gif signature */
-    bytestream2_get_buffer(&s->gb, sig, 6);
-    if (memcmp(sig, gif87a_sig, 6) != 0 &&
-        memcmp(sig, gif89a_sig, 6) != 0)
+    bytestream2_get_bufferu(&s->gb, sig, 6);
+    if (memcmp(sig, gif87a_sig, 6) &&
+        memcmp(sig, gif89a_sig, 6))
         return AVERROR_INVALIDDATA;
 
     /* read screen header */
     s->transparent_color_index = -1;
-    s->screen_width  = bytestream2_get_le16(&s->gb);
-    s->screen_height = bytestream2_get_le16(&s->gb);
-    if(   (unsigned)s->screen_width  > 32767
-       || (unsigned)s->screen_height > 32767){
-        av_log(NULL, AV_LOG_ERROR, "picture size too large\n");
-        return AVERROR_INVALIDDATA;
-    }
+    s->screen_width  = bytestream2_get_le16u(&s->gb);
+    s->screen_height = bytestream2_get_le16u(&s->gb);
 
-    v = bytestream2_get_byte(&s->gb);
+    v = bytestream2_get_byteu(&s->gb);
     s->color_resolution = ((v & 0x70) >> 4) + 1;
-    has_global_palette = (v & 0x80);
+    s->has_global_palette = (v & 0x80);
     s->bits_per_pixel = (v & 0x07) + 1;
-    s->background_color_index = bytestream2_get_byte(&s->gb);
-    bytestream2_get_byte(&s->gb);                /* ignored */
+    background_color_index = bytestream2_get_byteu(&s->gb);
+    n = bytestream2_get_byteu(&s->gb);
+    if (n) {
+        s->avctx->sample_aspect_ratio.num = n + 15;
+        s->avctx->sample_aspect_ratio.den = 64;
+    }
 
-    ff_dlog(s->avctx, "gif: screen_w=%d screen_h=%d bpp=%d global_palette=%d\n",
+    ff_dlog(s->avctx, "screen_w=%d screen_h=%d bpp=%d global_palette=%d\n",
            s->screen_width, s->screen_height, s->bits_per_pixel,
-           has_global_palette);
+           s->has_global_palette);
 
-    if (has_global_palette) {
+    if (s->has_global_palette) {
+        s->background_color_index = background_color_index;
         n = 1 << s->bits_per_pixel;
         if (bytestream2_get_bytes_left(&s->gb) < n * 3)
             return AVERROR_INVALIDDATA;
-        bytestream2_get_buffer(&s->gb, s->global_palette, n * 3);
-    }
+
+        gif_read_palette(s, s->global_palette, n);
+        s->bg_color = s->global_palette[s->background_color_index];
+    } else
+        s->background_color_index = -1;
+
     return 0;
 }
 
@@ -246,23 +428,24 @@ static int gif_parse_next_image(GifState *s, AVFrame *frame)
         int code = bytestream2_get_byte(&s->gb);
         int ret;
 
-        ff_dlog(s->avctx, "gif: code=%02x '%c'\n", code, code);
+        av_log(s->avctx, AV_LOG_DEBUG, "code=%02x '%c'\n", code, code);
 
         switch (code) {
-        case ',':
+        case GIF_IMAGE_SEPARATOR:
             return gif_read_image(s, frame);
-        case '!':
+        case GIF_EXTENSION_INTRODUCER:
             if ((ret = gif_read_extension(s)) < 0)
                 return ret;
             break;
-        case ';':
+        case GIF_TRAILER:
             /* end of image */
+            return AVERROR_EOF;
         default:
-            /* error or erroneous EOF */
+            /* erroneous block label */
             return AVERROR_INVALIDDATA;
         }
     }
-    return AVERROR_INVALIDDATA;
+    return AVERROR_EOF;
 }
 
 static av_cold int gif_decode_init(AVCodecContext *avctx)
@@ -271,38 +454,80 @@ static av_cold int gif_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
+    avctx->pix_fmt = AV_PIX_FMT_RGB32;
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
     ff_lzw_decode_open(&s->lzw);
+    if (!s->lzw)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
-static int gif_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                            AVPacket *avpkt)
+static int gif_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     GifState *s = avctx->priv_data;
-    AVFrame *picture = data;
     int ret;
 
-    bytestream2_init(&s->gb, buf, buf_size);
-    if ((ret = gif_read_header1(s)) < 0)
-        return ret;
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    s->frame->pts     = avpkt->pts;
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+    s->frame->pkt_pts = avpkt->pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    s->frame->pkt_dts = avpkt->dts;
+    s->frame->pkt_duration = avpkt->duration;
 
-    if ((ret = ff_set_dimensions(avctx, s->screen_width, s->screen_height)) < 0)
-        return ret;
+    if (avpkt->size >= 6) {
+        s->keyframe = memcmp(avpkt->data, gif87a_sig, 6) == 0 ||
+                      memcmp(avpkt->data, gif89a_sig, 6) == 0;
+    } else {
+        s->keyframe = 0;
+    }
 
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (s->keyframe) {
+        s->keyframe_ok = 0;
+        s->gce_prev_disposal = GCE_DISPOSAL_NONE;
+        if ((ret = gif_read_header1(s)) < 0)
+            return ret;
+
+        if ((ret = ff_set_dimensions(avctx, s->screen_width, s->screen_height)) < 0)
+            return ret;
+
+        av_frame_unref(s->frame);
+        if ((ret = ff_get_buffer(avctx, s->frame, 0)) < 0)
+            return ret;
+
+        av_fast_malloc(&s->idx_line, &s->idx_line_size, s->screen_width);
+        if (!s->idx_line)
+            return AVERROR(ENOMEM);
+
+        s->frame->pict_type = AV_PICTURE_TYPE_I;
+        s->frame->key_frame = 1;
+        s->keyframe_ok = 1;
+    } else {
+        if (!s->keyframe_ok) {
+            av_log(avctx, AV_LOG_ERROR, "cannot decode frame without keyframe\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+            return ret;
+
+        s->frame->pict_type = AV_PICTURE_TYPE_P;
+        s->frame->key_frame = 0;
     }
-    s->image_palette = (uint32_t *)picture->data[1];
-    ret = gif_parse_next_image(s, picture);
+
+    ret = gif_parse_next_image(s, s->frame);
     if (ret < 0)
         return ret;
 
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
     *got_frame = 1;
+
     return bytestream2_tell(&s->gb);
 }
 
@@ -311,9 +536,29 @@ static av_cold int gif_decode_close(AVCodecContext *avctx)
     GifState *s = avctx->priv_data;
 
     ff_lzw_decode_close(&s->lzw);
+    av_frame_free(&s->frame);
+    av_freep(&s->idx_line);
+    av_freep(&s->stored_img);
+
     return 0;
 }
 
+static const AVOption options[] = {
+    { "trans_color", "color value (ARGB) that is used instead of transparent color",
+      offsetof(GifState, trans_color), AV_OPT_TYPE_INT,
+      {.i64 = GIF_TRANSPARENT_COLOR}, 0, 0xffffffff,
+      AV_OPT_FLAG_DECODING_PARAM|AV_OPT_FLAG_VIDEO_PARAM },
+    { NULL },
+};
+
+static const AVClass decoder_class = {
+    .class_name = "gif decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DECODER,
+};
+
 AVCodec ff_gif_decoder = {
     .name           = "gif",
     .long_name      = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
@@ -324,4 +569,7 @@ AVCodec ff_gif_decoder = {
     .close          = gif_decode_close,
     .decode         = gif_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+    .priv_class     = &decoder_class,
 };
diff --git a/libavcodec/golomb.c b/libavcodec/golomb.c
index 550c41e..937ac22 100644
--- a/libavcodec/golomb.c
+++ b/libavcodec/golomb.c
@@ -2,20 +2,20 @@
  * exp golomb vlc stuff
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/golomb.h b/libavcodec/golomb.h
index cff7e42..fcc78f4 100644
--- a/libavcodec/golomb.h
+++ b/libavcodec/golomb.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 
 #include <stdint.h>
 
-#include "bitstream.h"
+#include "get_bits.h"
 #include "put_bits.h"
 
 #define INVALID_VLC           0x80000000
@@ -48,68 +48,135 @@ extern const  int8_t ff_interleaved_se_golomb_vlc_code[256];
 extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256];
 
 /**
- * read unsigned exp golomb code.
+ * Read an unsigned Exp-Golomb code in the range 0 to 8190.
  */
-static inline int get_ue_golomb(BitstreamContext *bc)
+static inline int get_ue_golomb(GetBitContext *gb)
 {
     unsigned int buf;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
 
     if (buf >= (1 << 27)) {
         buf >>= 32 - 9;
-        bitstream_skip(bc, ff_golomb_vlc_len[buf]);
+        skip_bits_long(gb, ff_golomb_vlc_len[buf]);
 
         return ff_ue_golomb_vlc_code[buf];
     } else {
         int log = 2 * av_log2(buf) - 31;
         buf >>= log;
         buf--;
-        bitstream_skip(bc, 32 - log);
+        skip_bits_long(gb, 32 - log);
 
         return buf;
     }
+#else
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
+
+    if (buf >= (1 << 27)) {
+        buf >>= 32 - 9;
+        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
+
+        return ff_ue_golomb_vlc_code[buf];
+    } else {
+        int log = 2 * av_log2(buf) - 31;
+        LAST_SKIP_BITS(re, gb, 32 - log);
+        CLOSE_READER(re, gb);
+        if (log < 7) {
+            av_log(NULL, AV_LOG_ERROR, "Invalid UE golomb code\n");
+            return AVERROR_INVALIDDATA;
+        }
+        buf >>= log;
+        buf--;
+
+        return buf;
+    }
+#endif
 }
 
 /**
  * Read an unsigned Exp-Golomb code in the range 0 to UINT32_MAX-1.
  */
-static inline unsigned get_ue_golomb_long(BitstreamContext *bc)
+static inline unsigned get_ue_golomb_long(GetBitContext *gb)
 {
     unsigned buf, log;
 
-    buf = bitstream_peek(bc, 32);
+    buf = show_bits_long(gb, 32);
     log = 31 - av_log2(buf);
-    bitstream_skip(bc, log);
+    skip_bits_long(gb, log);
 
-    return bitstream_read(bc, log + 1) - 1;
+    return get_bits_long(gb, log + 1) - 1;
 }
 
 /**
  * read unsigned exp golomb code, constraint to a max of 31.
  * the return value is undefined if the stored value exceeds 31.
  */
-static inline int get_ue_golomb_31(BitstreamContext *bc)
+static inline int get_ue_golomb_31(GetBitContext *gb)
 {
     unsigned int buf;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
+
+    buf >>= 32 - 9;
+    skip_bits_long(gb, ff_golomb_vlc_len[buf]);
+#else
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
 
     buf >>= 32 - 9;
-    bitstream_skip(bc, ff_golomb_vlc_len[buf]);
+    LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+    CLOSE_READER(re, gb);
+#endif
 
     return ff_ue_golomb_vlc_code[buf];
 }
 
-static inline unsigned get_interleaved_ue_golomb(BitstreamContext *bc)
+static inline unsigned get_interleaved_ue_golomb(GetBitContext *gb)
 {
     uint32_t buf;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
+
+    if (buf & 0xAA800000) {
+        buf >>= 32 - 8;
+        skip_bits_long(gb, ff_interleaved_golomb_vlc_len[buf]);
+
+        return ff_interleaved_ue_golomb_vlc_code[buf];
+    } else {
+        unsigned ret = 1;
+
+        do {
+            buf >>= 32 - 8;
+            skip_bits_long(gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8));
+
+            if (ff_interleaved_golomb_vlc_len[buf] != 9) {
+                ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1;
+                ret  |= ff_interleaved_dirac_golomb_vlc_code[buf];
+                break;
+            }
+            ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
+            buf = show_bits_long(gb, 32);
+        } while (get_bits_left(gb) > 0);
+
+        return ret - 1;
+    }
+#else
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
 
     if (buf & 0xAA800000) {
         buf >>= 32 - 8;
-        bitstream_skip(bc, ff_interleaved_golomb_vlc_len[buf]);
+        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
 
         return ff_interleaved_ue_golomb_vlc_code[buf];
     } else {
@@ -117,7 +184,8 @@ static inline unsigned get_interleaved_ue_golomb(BitstreamContext *bc)
 
         do {
             buf >>= 32 - 8;
-            bitstream_skip(bc, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8));
+            LAST_SKIP_BITS(re, gb,
+                           FFMIN(ff_interleaved_golomb_vlc_len[buf], 8));
 
             if (ff_interleaved_golomb_vlc_len[buf] != 9) {
                 ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1;
@@ -125,60 +193,64 @@ static inline unsigned get_interleaved_ue_golomb(BitstreamContext *bc)
                 break;
             }
             ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
-            buf = bitstream_peek(bc, 32);
-        } while (bitstream_bits_left(bc) > 0);
+            UPDATE_CACHE(re, gb);
+            buf = GET_CACHE(re, gb);
+        } while (ret<0x8000000U && BITS_AVAILABLE(re, gb));
 
+        CLOSE_READER(re, gb);
         return ret - 1;
     }
+#endif
 }
 
 /**
  * read unsigned truncated exp golomb code.
  */
-static inline int get_te0_golomb(BitstreamContext *bc, int range)
+static inline int get_te0_golomb(GetBitContext *gb, int range)
 {
-    assert(range >= 1);
+    av_assert2(range >= 1);
 
     if (range == 1)
         return 0;
     else if (range == 2)
-        return bitstream_read_bit(bc) ^ 1;
+        return get_bits1(gb) ^ 1;
     else
-        return get_ue_golomb(bc);
+        return get_ue_golomb(gb);
 }
 
 /**
  * read unsigned truncated exp golomb code.
  */
-static inline int get_te_golomb(BitstreamContext *bc, int range)
+static inline int get_te_golomb(GetBitContext *gb, int range)
 {
-    assert(range >= 1);
+    av_assert2(range >= 1);
 
     if (range == 2)
-        return bitstream_read_bit(bc) ^ 1;
+        return get_bits1(gb) ^ 1;
     else
-        return get_ue_golomb(bc);
+        return get_ue_golomb(gb);
 }
 
 /**
  * read signed exp golomb code.
  */
-static inline int get_se_golomb(BitstreamContext *bc)
+static inline int get_se_golomb(GetBitContext *gb)
 {
     unsigned int buf;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
 
     if (buf >= (1 << 27)) {
         buf >>= 32 - 9;
-        bitstream_skip(bc, ff_golomb_vlc_len[buf]);
+        skip_bits_long(gb, ff_golomb_vlc_len[buf]);
 
         return ff_se_golomb_vlc_code[buf];
     } else {
         int log = 2 * av_log2(buf) - 31;
         buf >>= log;
 
-        bitstream_skip(bc, 32 - log);
+        skip_bits_long(gb, 32 - log);
 
         if (buf & 1)
             buf = -(buf >> 1);
@@ -187,35 +259,59 @@ static inline int get_se_golomb(BitstreamContext *bc)
 
         return buf;
     }
-}
+#else
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
 
-static inline int get_se_golomb_long(BitstreamContext *bc)
-{
-    unsigned int buf = get_ue_golomb_long(bc);
+    if (buf >= (1 << 27)) {
+        buf >>= 32 - 9;
+        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
 
-    if (buf & 1)
-        buf = (buf + 1) >> 1;
-    else
-        buf = -(buf >> 1);
+        return ff_se_golomb_vlc_code[buf];
+    } else {
+        int log = av_log2(buf), sign;
+        LAST_SKIP_BITS(re, gb, 31 - log);
+        UPDATE_CACHE(re, gb);
+        buf = GET_CACHE(re, gb);
+
+        buf >>= log;
 
-    return buf;
+        LAST_SKIP_BITS(re, gb, 32 - log);
+        CLOSE_READER(re, gb);
+
+        sign = -(buf & 1);
+        buf  = ((buf >> 1) ^ sign) - sign;
+
+        return buf;
+    }
+#endif
 }
 
-static inline int get_interleaved_se_golomb(BitstreamContext *bc)
+static inline int get_se_golomb_long(GetBitContext *gb)
+{
+    unsigned int buf = get_ue_golomb_long(gb);
+    int sign = (buf & 1) - 1;
+    return ((buf >> 1) ^ sign) + 1;
+}
+
+static inline int get_interleaved_se_golomb(GetBitContext *gb)
 {
     unsigned int buf;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
 
     if (buf & 0xAA800000) {
         buf >>= 32 - 8;
-        bitstream_skip(bc, ff_interleaved_golomb_vlc_len[buf]);
+        skip_bits_long(gb, ff_interleaved_golomb_vlc_len[buf]);
 
         return ff_interleaved_se_golomb_vlc_code[buf];
     } else {
         int log;
-        bitstream_skip(bc, 8);
-        buf |= 1 | bitstream_peek(bc, 24);
+        skip_bits(gb, 8);
+        buf |= 1 | show_bits_long(gb, 24);
 
         if ((buf & 0xAAAAAAAA) == 0)
             return INVALID_VLC;
@@ -223,20 +319,48 @@ static inline int get_interleaved_se_golomb(BitstreamContext *bc)
         for (log = 31; (buf & 0x80000000) == 0; log--)
             buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
 
-        bitstream_skip(bc, 63 - 2 * log - 8);
+        skip_bits_long(gb, 63 - 2 * log - 8);
 
         return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1;
     }
+#else
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
+
+    if (buf & 0xAA800000) {
+        buf >>= 32 - 8;
+        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
+        CLOSE_READER(re, gb);
+
+        return ff_interleaved_se_golomb_vlc_code[buf];
+    } else {
+        int log;
+        LAST_SKIP_BITS(re, gb, 8);
+        UPDATE_CACHE(re, gb);
+        buf |= 1 | (GET_CACHE(re, gb) >> 8);
+
+        if ((buf & 0xAAAAAAAA) == 0)
+            return INVALID_VLC;
+
+        for (log = 31; (buf & 0x80000000) == 0; log--)
+            buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
+
+        LAST_SKIP_BITS(re, gb, 63 - 2 * log - 8);
+        CLOSE_READER(re, gb);
+
+        return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1;
+    }
+#endif
 }
 
-static inline int dirac_get_se_golomb(BitstreamContext *bc)
+static inline int dirac_get_se_golomb(GetBitContext *gb)
 {
-    uint32_t ret = get_interleaved_ue_golomb(bc);
+    uint32_t ret = get_interleaved_ue_golomb(gb);
 
     if (ret) {
-        uint32_t buf;
-        buf = bitstream_read_signed(bc, 1);
-        ret = (ret ^ buf) - buf;
+        int sign = -get_bits1(gb);
+        ret = (ret ^ sign) - sign;
     }
 
     return ret;
@@ -245,124 +369,204 @@ static inline int dirac_get_se_golomb(BitstreamContext *bc)
 /**
  * read unsigned golomb rice code (ffv1).
  */
-static inline int get_ur_golomb(BitstreamContext *bc, int k, int limit,
+static inline int get_ur_golomb(GetBitContext *gb, int k, int limit,
                                 int esc_len)
 {
     unsigned int buf;
     int log;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
 
     log = av_log2(buf);
 
     if (log > 31 - limit) {
         buf >>= log - k;
         buf  += (30 - log) << k;
-        bitstream_skip(bc, 32 + k - log);
+        skip_bits_long(gb, 32 + k - log);
+
+        return buf;
+    } else {
+        skip_bits_long(gb, limit);
+        buf = get_bits_long(gb, esc_len);
+
+        return buf + limit - 1;
+    }
+#else
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
+
+    log = av_log2(buf);
+
+    if (log > 31 - limit) {
+        buf >>= log - k;
+        buf  += (30U - log) << k;
+        LAST_SKIP_BITS(re, gb, 32 + k - log);
+        CLOSE_READER(re, gb);
 
         return buf;
     } else {
-        bitstream_skip(bc, limit);
-        buf = bitstream_read(bc, esc_len);
+        LAST_SKIP_BITS(re, gb, limit);
+        UPDATE_CACHE(re, gb);
+
+        buf = SHOW_UBITS(re, gb, esc_len);
+
+        LAST_SKIP_BITS(re, gb, esc_len);
+        CLOSE_READER(re, gb);
 
         return buf + limit - 1;
     }
+#endif
 }
 
 /**
  * read unsigned golomb rice code (jpegls).
  */
-static inline int get_ur_golomb_jpegls(BitstreamContext *bc, int k, int limit,
+static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
                                        int esc_len)
 {
     unsigned int buf;
     int log;
 
-    buf = bitstream_peek(bc, 32);
+#if CACHED_BITSTREAM_READER
+    buf = show_bits_long(gb, 32);
 
     log = av_log2(buf);
 
     if (log - k >= 1 && 32 - log < limit) {
         buf >>= log - k;
         buf  += (30 - log) << k;
-        bitstream_skip(bc, 32 + k - log);
+        skip_bits_long(gb, 32 + k - log);
 
         return buf;
     } else {
         int i;
         for (i = 0;
-             i < limit && bitstream_read_bit(bc) == 0 && bitstream_bits_left(bc) > 0;
+             i < limit && get_bits1(gb) == 0 && get_bits_left(gb) > 0;
              i++);
 
         if (i < limit - 1) {
-            buf = bitstream_read(bc, k);
+            buf = get_bits_long(gb, k);
 
             return buf + (i << k);
         } else if (i == limit - 1) {
-            buf = bitstream_read(bc, esc_len);
+            buf = get_bits_long(gb, esc_len);
 
             return buf + 1;
         } else
             return -1;
     }
+#else
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
+
+    log = av_log2(buf);
+
+    av_assert2(k <= 31);
+
+    if (log - k >= 32 - MIN_CACHE_BITS + (MIN_CACHE_BITS == 32) &&
+        32 - log < limit) {
+        buf >>= log - k;
+        buf  += (30U - log) << k;
+        LAST_SKIP_BITS(re, gb, 32 + k - log);
+        CLOSE_READER(re, gb);
+
+        return buf;
+    } else {
+        int i;
+        for (i = 0; i + MIN_CACHE_BITS <= limit && SHOW_UBITS(re, gb, MIN_CACHE_BITS) == 0; i += MIN_CACHE_BITS) {
+            if (gb->size_in_bits <= re_index) {
+                CLOSE_READER(re, gb);
+                return -1;
+            }
+            LAST_SKIP_BITS(re, gb, MIN_CACHE_BITS);
+            UPDATE_CACHE(re, gb);
+        }
+        for (; i < limit && SHOW_UBITS(re, gb, 1) == 0; i++) {
+            SKIP_BITS(re, gb, 1);
+        }
+        LAST_SKIP_BITS(re, gb, 1);
+        UPDATE_CACHE(re, gb);
+
+        if (i < limit - 1) {
+            if (k) {
+                if (k > MIN_CACHE_BITS - 1) {
+                    buf = SHOW_UBITS(re, gb, 16) << (k-16);
+                    LAST_SKIP_BITS(re, gb, 16);
+                    UPDATE_CACHE(re, gb);
+                    buf |= SHOW_UBITS(re, gb, k-16);
+                    LAST_SKIP_BITS(re, gb, k-16);
+                } else {
+                    buf = SHOW_UBITS(re, gb, k);
+                    LAST_SKIP_BITS(re, gb, k);
+                }
+            } else {
+                buf = 0;
+            }
+
+            buf += ((SUINT)i << k);
+        } else if (i == limit - 1) {
+            buf = SHOW_UBITS(re, gb, esc_len);
+            LAST_SKIP_BITS(re, gb, esc_len);
+
+            buf ++;
+        } else {
+            buf = -1;
+        }
+        CLOSE_READER(re, gb);
+        return buf;
+    }
+#endif
 }
 
 /**
  * read signed golomb rice code (ffv1).
  */
-static inline int get_sr_golomb(BitstreamContext *bc, int k, int limit,
+static inline int get_sr_golomb(GetBitContext *gb, int k, int limit,
                                 int esc_len)
 {
-    int v = get_ur_golomb(bc, k, limit, esc_len);
-
-    v++;
-    if (v & 1)
-        return v >> 1;
-    else
-        return -(v >> 1);
-
-//    return (v>>1) ^ -(v&1);
+    unsigned v = get_ur_golomb(gb, k, limit, esc_len);
+    return (v >> 1) ^ -(v & 1);
 }
 
 /**
  * read signed golomb rice code (flac).
  */
-static inline int get_sr_golomb_flac(BitstreamContext *bc, int k, int limit,
+static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit,
                                      int esc_len)
 {
-    int v = get_ur_golomb_jpegls(bc, k, limit, esc_len);
+    unsigned v = get_ur_golomb_jpegls(gb, k, limit, esc_len);
     return (v >> 1) ^ -(v & 1);
 }
 
 /**
  * read unsigned golomb rice code (shorten).
  */
-static inline unsigned int get_ur_golomb_shorten(BitstreamContext *bc, int k)
+static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k)
 {
-    return get_ur_golomb_jpegls(bc, k, INT_MAX, 0);
+    return get_ur_golomb_jpegls(gb, k, INT_MAX, 0);
 }
 
 /**
  * read signed golomb rice code (shorten).
  */
-static inline int get_sr_golomb_shorten(BitstreamContext *bc, int k)
+static inline int get_sr_golomb_shorten(GetBitContext *gb, int k)
 {
-    int uvar = get_ur_golomb_jpegls(bc, k + 1, INT_MAX, 0);
-    if (uvar & 1)
-        return ~(uvar >> 1);
-    else
-        return uvar >> 1;
+    int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
+    return (uvar >> 1) ^ -(uvar & 1);
 }
 
 #ifdef TRACE
 
-static inline int get_ue(BitstreamContext *s, const char *file, const char *func,
+static inline int get_ue(GetBitContext *s, const char *file, const char *func,
                          int line)
 {
-    int show = bitstream_peek(s, 24);
-    int pos  = bitstream_tell(s);
+    int show = show_bits(s, 24);
+    int pos  = get_bits_count(s);
     int i    = get_ue_golomb(s);
-    int len  = bitstream_tell(s) - pos;
+    int len  = get_bits_count(s) - pos;
     int bits = show >> (24 - len);
 
     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue  @%5d in %s %s:%d\n",
@@ -371,13 +575,13 @@ static inline int get_ue(BitstreamContext *s, const char *file, const char *func
     return i;
 }
 
-static inline int get_se(BitstreamContext *s, const char *file, const char *func,
+static inline int get_se(GetBitContext *s, const char *file, const char *func,
                          int line)
 {
-    int show = bitstream_peek(s, 24);
-    int pos  = bitstream_tell(s);
+    int show = show_bits(s, 24);
+    int pos  = get_bits_count(s);
     int i    = get_se_golomb(s);
-    int len  = bitstream_tell(s) - pos;
+    int len  = get_bits_count(s) - pos;
     int bits = show >> (24 - len);
 
     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se  @%5d in %s %s:%d\n",
@@ -386,13 +590,13 @@ static inline int get_se(BitstreamContext *s, const char *file, const char *func
     return i;
 }
 
-static inline int get_te(BitstreamContext *s, int r, char *file, const char *func,
+static inline int get_te(GetBitContext *s, int r, char *file, const char *func,
                          int line)
 {
-    int show = bitstream_peek(s, 24);
-    int pos  = bitstream_tell(s);
+    int show = show_bits(s, 24);
+    int pos  = get_bits_count(s);
     int i    = get_te0_golomb(s, r);
-    int len  = bitstream_tell(s) - pos;
+    int len  = get_bits_count(s) - pos;
     int bits = show >> (24 - len);
 
     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te  @%5d in %s %s:%d\n",
@@ -409,11 +613,12 @@ static inline int get_te(BitstreamContext *s, int r, char *file, const char *fun
 #endif /* TRACE */
 
 /**
- * write unsigned exp golomb code.
+ * write unsigned exp golomb code. 2^16 - 2 at most
  */
 static inline void set_ue_golomb(PutBitContext *pb, int i)
 {
-    assert(i >= 0);
+    av_assert2(i >= 0);
+    av_assert2(i <= 0xFFFE);
 
     if (i < 256)
         put_bits(pb, ff_ue_golomb_len[i], i + 1);
@@ -424,12 +629,27 @@ static inline void set_ue_golomb(PutBitContext *pb, int i)
 }
 
 /**
+ * write unsigned exp golomb code. 2^32-2 at most.
+ */
+static inline void set_ue_golomb_long(PutBitContext *pb, uint32_t i)
+{
+    av_assert2(i <= (UINT32_MAX - 1));
+
+    if (i < 256)
+        put_bits(pb, ff_ue_golomb_len[i], i + 1);
+    else {
+        int e = av_log2(i + 1);
+        put_bits64(pb, 2 * e + 1, i + 1);
+    }
+}
+
+/**
  * write truncated unsigned exp golomb code.
  */
 static inline void set_te_golomb(PutBitContext *pb, int i, int range)
 {
-    assert(range >= 1);
-    assert(i <= range);
+    av_assert2(range >= 1);
+    av_assert2(i <= range);
 
     if (range == 2)
         put_bits(pb, 1, i ^ 1);
@@ -456,11 +676,11 @@ static inline void set_ur_golomb(PutBitContext *pb, int i, int k, int limit,
 {
     int e;
 
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
     e = i >> k;
     if (e < limit)
-        put_bits(pb, e + k + 1, (1 << k) + (i & ((1 << k) - 1)));
+        put_bits(pb, e + k + 1, (1 << k) + av_mod_uintp2(i, k));
     else
         put_bits(pb, limit + esc_len, i - limit + 1);
 }
@@ -473,7 +693,7 @@ static inline void set_ur_golomb_jpegls(PutBitContext *pb, int i, int k,
 {
     int e;
 
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
     e = (i >> k) + 1;
     if (e < limit) {
diff --git a/libavcodec/golomb_legacy.h b/libavcodec/golomb_legacy.h
deleted file mode 100644
index 9fafbcd..0000000
--- a/libavcodec/golomb_legacy.h
+++ /dev/null
@@ -1,573 +0,0 @@
-/*
- * exp golomb vlc stuff
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2004 Alex Beregszaszi
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * @brief
- *     exp golomb vlc stuff
- * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi
- */
-
-#ifndef AVCODEC_GOLOMB_H
-#define AVCODEC_GOLOMB_H
-
-#include <stdint.h>
-
-#include "get_bits.h"
-#include "put_bits.h"
-
-#define INVALID_VLC           0x80000000
-
-extern const uint8_t ff_golomb_vlc_len[512];
-extern const uint8_t ff_ue_golomb_vlc_code[512];
-extern const  int8_t ff_se_golomb_vlc_code[512];
-extern const uint8_t ff_ue_golomb_len[256];
-
-extern const uint8_t ff_interleaved_golomb_vlc_len[256];
-extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256];
-extern const  int8_t ff_interleaved_se_golomb_vlc_code[256];
-extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256];
-
-/**
- * read unsigned exp golomb code.
- */
-static inline int get_ue_golomb(GetBitContext *gb)
-{
-    unsigned int buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    if (buf >= (1 << 27)) {
-        buf >>= 32 - 9;
-        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_ue_golomb_vlc_code[buf];
-    } else {
-        int log = 2 * av_log2(buf) - 31;
-        buf >>= log;
-        buf--;
-        LAST_SKIP_BITS(re, gb, 32 - log);
-        CLOSE_READER(re, gb);
-
-        return buf;
-    }
-}
-
-/**
- * Read an unsigned Exp-Golomb code in the range 0 to UINT32_MAX-1.
- */
-static inline unsigned get_ue_golomb_long(GetBitContext *gb)
-{
-    unsigned buf, log;
-
-    buf = show_bits_long(gb, 32);
-    log = 31 - av_log2(buf);
-    skip_bits_long(gb, log);
-
-    return get_bits_long(gb, log + 1) - 1;
-}
-
-/**
- * read unsigned exp golomb code, constraint to a max of 31.
- * the return value is undefined if the stored value exceeds 31.
- */
-static inline int get_ue_golomb_31(GetBitContext *gb)
-{
-    unsigned int buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    buf >>= 32 - 9;
-    LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
-    CLOSE_READER(re, gb);
-
-    return ff_ue_golomb_vlc_code[buf];
-}
-
-static inline unsigned get_interleaved_ue_golomb(GetBitContext *gb)
-{
-    uint32_t buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    if (buf & 0xAA800000) {
-        buf >>= 32 - 8;
-        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_interleaved_ue_golomb_vlc_code[buf];
-    } else {
-        unsigned ret = 1;
-
-        do {
-            buf >>= 32 - 8;
-            LAST_SKIP_BITS(re, gb,
-                           FFMIN(ff_interleaved_golomb_vlc_len[buf], 8));
-
-            if (ff_interleaved_golomb_vlc_len[buf] != 9) {
-                ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1;
-                ret  |= ff_interleaved_dirac_golomb_vlc_code[buf];
-                break;
-            }
-            ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
-            UPDATE_CACHE(re, gb);
-            buf = GET_CACHE(re, gb);
-        } while (BITS_AVAILABLE(re, gb));
-
-        CLOSE_READER(re, gb);
-        return ret - 1;
-    }
-}
-
-/**
- * read unsigned truncated exp golomb code.
- */
-static inline int get_te0_golomb(GetBitContext *gb, int range)
-{
-    assert(range >= 1);
-
-    if (range == 1)
-        return 0;
-    else if (range == 2)
-        return get_bits1(gb) ^ 1;
-    else
-        return get_ue_golomb(gb);
-}
-
-/**
- * read unsigned truncated exp golomb code.
- */
-static inline int get_te_golomb(GetBitContext *gb, int range)
-{
-    assert(range >= 1);
-
-    if (range == 2)
-        return get_bits1(gb) ^ 1;
-    else
-        return get_ue_golomb(gb);
-}
-
-/**
- * read signed exp golomb code.
- */
-static inline int get_se_golomb(GetBitContext *gb)
-{
-    unsigned int buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    if (buf >= (1 << 27)) {
-        buf >>= 32 - 9;
-        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_se_golomb_vlc_code[buf];
-    } else {
-        int log = 2 * av_log2(buf) - 31;
-        buf >>= log;
-
-        LAST_SKIP_BITS(re, gb, 32 - log);
-        CLOSE_READER(re, gb);
-
-        if (buf & 1)
-            buf = -(buf >> 1);
-        else
-            buf = (buf >> 1);
-
-        return buf;
-    }
-}
-
-static inline int get_se_golomb_long(GetBitContext *gb)
-{
-    unsigned int buf = get_ue_golomb_long(gb);
-
-    if (buf & 1)
-        buf = (buf + 1) >> 1;
-    else
-        buf = -(buf >> 1);
-
-    return buf;
-}
-
-static inline int get_interleaved_se_golomb(GetBitContext *gb)
-{
-    unsigned int buf;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    if (buf & 0xAA800000) {
-        buf >>= 32 - 8;
-        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
-        CLOSE_READER(re, gb);
-
-        return ff_interleaved_se_golomb_vlc_code[buf];
-    } else {
-        int log;
-        LAST_SKIP_BITS(re, gb, 8);
-        UPDATE_CACHE(re, gb);
-        buf |= 1 | (GET_CACHE(re, gb) >> 8);
-
-        if ((buf & 0xAAAAAAAA) == 0)
-            return INVALID_VLC;
-
-        for (log = 31; (buf & 0x80000000) == 0; log--)
-            buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
-
-        LAST_SKIP_BITS(re, gb, 63 - 2 * log - 8);
-        CLOSE_READER(re, gb);
-
-        return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1;
-    }
-}
-
-static inline int dirac_get_se_golomb(GetBitContext *gb)
-{
-    uint32_t ret = get_interleaved_ue_golomb(gb);
-
-    if (ret) {
-        uint32_t buf;
-        OPEN_READER(re, gb);
-        UPDATE_CACHE(re, gb);
-        buf = SHOW_SBITS(re, gb, 1);
-        LAST_SKIP_BITS(re, gb, 1);
-        ret = (ret ^ buf) - buf;
-        CLOSE_READER(re, gb);
-    }
-
-    return ret;
-}
-
-/**
- * read unsigned golomb rice code (ffv1).
- */
-static inline int get_ur_golomb(GetBitContext *gb, int k, int limit,
-                                int esc_len)
-{
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    log = av_log2(buf);
-
-    if (log > 31 - limit) {
-        buf >>= log - k;
-        buf  += (30 - log) << k;
-        LAST_SKIP_BITS(re, gb, 32 + k - log);
-        CLOSE_READER(re, gb);
-
-        return buf;
-    } else {
-        LAST_SKIP_BITS(re, gb, limit);
-        UPDATE_CACHE(re, gb);
-
-        buf = SHOW_UBITS(re, gb, esc_len);
-
-        LAST_SKIP_BITS(re, gb, esc_len);
-        CLOSE_READER(re, gb);
-
-        return buf + limit - 1;
-    }
-}
-
-/**
- * read unsigned golomb rice code (jpegls).
- */
-static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
-                                       int esc_len)
-{
-    unsigned int buf;
-    int log;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    log = av_log2(buf);
-
-    if (log - k >= 32 - MIN_CACHE_BITS + (MIN_CACHE_BITS == 32) &&
-        32 - log < limit) {
-        buf >>= log - k;
-        buf  += (30 - log) << k;
-        LAST_SKIP_BITS(re, gb, 32 + k - log);
-        CLOSE_READER(re, gb);
-
-        return buf;
-    } else {
-        int i;
-        for (i = 0; i < limit && SHOW_UBITS(re, gb, 1) == 0 && BITS_AVAILABLE(re, gb); i++) {
-            LAST_SKIP_BITS(re, gb, 1);
-            UPDATE_CACHE(re, gb);
-        }
-        SKIP_BITS(re, gb, 1);
-
-        if (i < limit - 1) {
-            if (k) {
-                buf = SHOW_UBITS(re, gb, k);
-                LAST_SKIP_BITS(re, gb, k);
-            } else {
-                buf = 0;
-            }
-
-            CLOSE_READER(re, gb);
-            return buf + (i << k);
-        } else if (i == limit - 1) {
-            buf = SHOW_UBITS(re, gb, esc_len);
-            LAST_SKIP_BITS(re, gb, esc_len);
-            CLOSE_READER(re, gb);
-
-            return buf + 1;
-        } else
-            return -1;
-    }
-}
-
-/**
- * read signed golomb rice code (ffv1).
- */
-static inline int get_sr_golomb(GetBitContext *gb, int k, int limit,
-                                int esc_len)
-{
-    int v = get_ur_golomb(gb, k, limit, esc_len);
-
-    v++;
-    if (v & 1)
-        return v >> 1;
-    else
-        return -(v >> 1);
-
-//    return (v>>1) ^ -(v&1);
-}
-
-/**
- * read signed golomb rice code (flac).
- */
-static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit,
-                                     int esc_len)
-{
-    int v = get_ur_golomb_jpegls(gb, k, limit, esc_len);
-    return (v >> 1) ^ -(v & 1);
-}
-
-/**
- * read unsigned golomb rice code (shorten).
- */
-static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k)
-{
-    return get_ur_golomb_jpegls(gb, k, INT_MAX, 0);
-}
-
-/**
- * read signed golomb rice code (shorten).
- */
-static inline int get_sr_golomb_shorten(GetBitContext *gb, int k)
-{
-    int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
-    if (uvar & 1)
-        return ~(uvar >> 1);
-    else
-        return uvar >> 1;
-}
-
-#ifdef TRACE
-
-static inline int get_ue(GetBitContext *s, const char *file, const char *func,
-                         int line)
-{
-    int show = show_bits(s, 24);
-    int pos  = get_bits_count(s);
-    int i    = get_ue_golomb(s);
-    int len  = get_bits_count(s) - pos;
-    int bits = show >> (24 - len);
-
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue  @%5d in %s %s:%d\n",
-           bits, len, i, pos, file, func, line);
-
-    return i;
-}
-
-static inline int get_se(GetBitContext *s, const char *file, const char *func,
-                         int line)
-{
-    int show = show_bits(s, 24);
-    int pos  = get_bits_count(s);
-    int i    = get_se_golomb(s);
-    int len  = get_bits_count(s) - pos;
-    int bits = show >> (24 - len);
-
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se  @%5d in %s %s:%d\n",
-           bits, len, i, pos, file, func, line);
-
-    return i;
-}
-
-static inline int get_te(GetBitContext *s, int r, char *file, const char *func,
-                         int line)
-{
-    int show = show_bits(s, 24);
-    int pos  = get_bits_count(s);
-    int i    = get_te0_golomb(s, r);
-    int len  = get_bits_count(s) - pos;
-    int bits = show >> (24 - len);
-
-    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te  @%5d in %s %s:%d\n",
-           bits, len, i, pos, file, func, line);
-
-    return i;
-}
-
-#define get_ue_golomb(a) get_ue(a, __FILE__, __func__, __LINE__)
-#define get_se_golomb(a) get_se(a, __FILE__, __func__, __LINE__)
-#define get_te_golomb(a, r)  get_te(a, r, __FILE__, __func__, __LINE__)
-#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __func__, __LINE__)
-
-#endif /* TRACE */
-
-/**
- * write unsigned exp golomb code.
- */
-static inline void set_ue_golomb(PutBitContext *pb, int i)
-{
-    assert(i >= 0);
-
-    if (i < 256)
-        put_bits(pb, ff_ue_golomb_len[i], i + 1);
-    else {
-        int e = av_log2(i + 1);
-        put_bits(pb, 2 * e + 1, i + 1);
-    }
-}
-
-/**
- * write truncated unsigned exp golomb code.
- */
-static inline void set_te_golomb(PutBitContext *pb, int i, int range)
-{
-    assert(range >= 1);
-    assert(i <= range);
-
-    if (range == 2)
-        put_bits(pb, 1, i ^ 1);
-    else
-        set_ue_golomb(pb, i);
-}
-
-/**
- * write signed exp golomb code. 16 bits at most.
- */
-static inline void set_se_golomb(PutBitContext *pb, int i)
-{
-    i = 2 * i - 1;
-    if (i < 0)
-        i ^= -1;    //FIXME check if gcc does the right thing
-    set_ue_golomb(pb, i);
-}
-
-/**
- * write unsigned golomb rice code (ffv1).
- */
-static inline void set_ur_golomb(PutBitContext *pb, int i, int k, int limit,
-                                 int esc_len)
-{
-    int e;
-
-    assert(i >= 0);
-
-    e = i >> k;
-    if (e < limit)
-        put_bits(pb, e + k + 1, (1 << k) + (i & ((1 << k) - 1)));
-    else
-        put_bits(pb, limit + esc_len, i - limit + 1);
-}
-
-/**
- * write unsigned golomb rice code (jpegls).
- */
-static inline void set_ur_golomb_jpegls(PutBitContext *pb, int i, int k,
-                                        int limit, int esc_len)
-{
-    int e;
-
-    assert(i >= 0);
-
-    e = (i >> k) + 1;
-    if (e < limit) {
-        while (e > 31) {
-            put_bits(pb, 31, 0);
-            e -= 31;
-        }
-        put_bits(pb, e, 1);
-        if (k)
-            put_sbits(pb, k, i);
-    } else {
-        while (limit > 31) {
-            put_bits(pb, 31, 0);
-            limit -= 31;
-        }
-        put_bits(pb, limit, 1);
-        put_bits(pb, esc_len, i - 1);
-    }
-}
-
-/**
- * write signed golomb rice code (ffv1).
- */
-static inline void set_sr_golomb(PutBitContext *pb, int i, int k, int limit,
-                                 int esc_len)
-{
-    int v;
-
-    v  = -2 * i - 1;
-    v ^= (v >> 31);
-
-    set_ur_golomb(pb, v, k, limit, esc_len);
-}
-
-/**
- * write signed golomb rice code (flac).
- */
-static inline void set_sr_golomb_flac(PutBitContext *pb, int i, int k,
-                                      int limit, int esc_len)
-{
-    int v;
-
-    v  = -2 * i - 1;
-    v ^= (v >> 31);
-
-    set_ur_golomb_jpegls(pb, v, k, limit, esc_len);
-}
-
-#endif /* AVCODEC_GOLOMB_H */
diff --git a/libavcodec/gsm.h b/libavcodec/gsm.h
index 238cb73..53d65c4 100644
--- a/libavcodec/gsm.h
+++ b/libavcodec/gsm.h
@@ -1,20 +1,20 @@
 /*
  * GSM common header
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsm_parser.c b/libavcodec/gsm_parser.c
index c0befc7..1054a30 100644
--- a/libavcodec/gsm_parser.c
+++ b/libavcodec/gsm_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * Splits packets into individual blocks.
  */
 
+#include "libavutil/avassert.h"
 #include "parser.h"
 #include "gsm.h"
 
@@ -55,7 +56,7 @@ static int gsm_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
             s->duration   = GSM_FRAME_SIZE * 2;
             break;
         default:
-            return AVERROR(EINVAL);
+            av_assert0(0);
         }
     }
 
diff --git a/libavcodec/gsmdec.c b/libavcodec/gsmdec.c
index 0205faf..cd56995 100644
--- a/libavcodec/gsmdec.c
+++ b/libavcodec/gsmdec.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,8 @@
  */
 
 #include "libavutil/channel_layout.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "msgsmdec.h"
 
@@ -68,7 +67,7 @@ static int gsm_decode_frame(AVCodecContext *avctx, void *data,
 {
     AVFrame *frame = data;
     int res;
-    BitstreamContext bc;
+    GetBitContext gb;
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     int16_t *samples;
@@ -80,18 +79,16 @@ static int gsm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avctx->frame_size;
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
     samples = (int16_t *)frame->data[0];
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_GSM:
-        bitstream_init8(&bc, buf, buf_size);
-        if (bitstream_read(&bc, 4) != 0xd)
+        init_get_bits(&gb, buf, buf_size * 8);
+        if (get_bits(&gb, 4) != 0xd)
             av_log(avctx, AV_LOG_WARNING, "Missing GSM magic!\n");
-        res = gsm_decode_block(avctx, samples, &bc, GSM_13000);
+        res = gsm_decode_block(avctx, samples, &gb, GSM_13000);
         if (res < 0)
             return res;
         break;
@@ -113,6 +110,7 @@ static void gsm_flush(AVCodecContext *avctx)
     memset(s, 0, sizeof(*s));
 }
 
+#if CONFIG_GSM_DECODER
 AVCodec ff_gsm_decoder = {
     .name           = "gsm",
     .long_name      = NULL_IF_CONFIG_SMALL("GSM"),
@@ -124,7 +122,8 @@ AVCodec ff_gsm_decoder = {
     .flush          = gsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
+#if CONFIG_GSM_MS_DECODER
 AVCodec ff_gsm_ms_decoder = {
     .name           = "gsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("GSM Microsoft variant"),
@@ -136,3 +135,4 @@ AVCodec ff_gsm_ms_decoder = {
     .flush          = gsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/gsmdec_data.c b/libavcodec/gsmdec_data.c
index 71f788e..6015f78 100644
--- a/libavcodec/gsmdec_data.c
+++ b/libavcodec/gsmdec_data.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder data
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsmdec_data.h b/libavcodec/gsmdec_data.h
index f301f56..f3499e8 100644
--- a/libavcodec/gsmdec_data.h
+++ b/libavcodec/gsmdec_data.h
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder data
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsmdec_template.c b/libavcodec/gsmdec_template.c
index 7437908..4e40a20 100644
--- a/libavcodec/gsmdec_template.c
+++ b/libavcodec/gsmdec_template.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,24 +24,24 @@
  * GSM decoder
  */
 
-#include "bitstream.h"
+#include "get_bits.h"
 #include "gsm.h"
 #include "gsmdec_data.h"
 
-static void apcm_dequant_add(BitstreamContext *bc, int16_t *dst, const int *frame_bits)
+static void apcm_dequant_add(GetBitContext *gb, int16_t *dst, const int *frame_bits)
 {
     int i, val;
-    int maxidx = bitstream_read(bc, 6);
+    int maxidx = get_bits(gb, 6);
     const int16_t *tab = ff_gsm_dequant_tab[maxidx];
     for (i = 0; i < 13; i++) {
-        val = bitstream_read(bc, frame_bits[i]);
+        val = get_bits(gb, frame_bits[i]);
         dst[3 * i] += tab[ff_gsm_requant_tab[frame_bits[i]][val]];
     }
 }
 
 static inline int gsm_mult(int a, int b)
 {
-    return (a * b + (1 << 14)) >> 15;
+    return (int)(a * (SUINT)b + (1 << 14)) >> 15;
 }
 
 static void long_term_synth(int16_t *dst, int lag, int gain_idx)
@@ -57,7 +57,7 @@ static inline int decode_log_area(int coded, int factor, int offset)
 {
     coded <<= 10;
     coded -= offset;
-    return gsm_mult(coded, factor) << 1;
+    return gsm_mult(coded, factor) * 2;
 }
 
 static av_noinline int get_rrp(int filtered)
@@ -114,34 +114,34 @@ static int postprocess(int16_t *data, int msr)
     int i;
     for (i = 0; i < 160; i++) {
         msr = av_clip_int16(data[i] + gsm_mult(msr, 28180));
-        data[i] = av_clip_int16(msr << 1) & ~7;
+        data[i] = av_clip_int16(msr * 2) & ~7;
     }
     return msr;
 }
 
 static int gsm_decode_block(AVCodecContext *avctx, int16_t *samples,
-                            BitstreamContext *bc, int mode)
+                            GetBitContext *gb, int mode)
 {
     GSMContext *ctx = avctx->priv_data;
     int i;
     int16_t *ref_dst = ctx->ref_buf + 120;
     int *lar = ctx->lar[ctx->lar_idx];
-    lar[0] = decode_log_area(bitstream_read(bc, 6), 13107,  1 << 15);
-    lar[1] = decode_log_area(bitstream_read(bc, 6), 13107,  1 << 15);
-    lar[2] = decode_log_area(bitstream_read(bc, 5), 13107, (1 << 14) + 2048 * 2);
-    lar[3] = decode_log_area(bitstream_read(bc, 5), 13107, (1 << 14) - 2560 * 2);
-    lar[4] = decode_log_area(bitstream_read(bc, 4), 19223, (1 << 13) +   94 * 2);
-    lar[5] = decode_log_area(bitstream_read(bc, 4), 17476, (1 << 13) - 1792 * 2);
-    lar[6] = decode_log_area(bitstream_read(bc, 3), 31454, (1 << 12) -  341 * 2);
-    lar[7] = decode_log_area(bitstream_read(bc, 3), 29708, (1 << 12) - 1144 * 2);
+    lar[0] = decode_log_area(get_bits(gb, 6), 13107,  1 << 15);
+    lar[1] = decode_log_area(get_bits(gb, 6), 13107,  1 << 15);
+    lar[2] = decode_log_area(get_bits(gb, 5), 13107, (1 << 14) + 2048*2);
+    lar[3] = decode_log_area(get_bits(gb, 5), 13107, (1 << 14) - 2560*2);
+    lar[4] = decode_log_area(get_bits(gb, 4), 19223, (1 << 13) +   94*2);
+    lar[5] = decode_log_area(get_bits(gb, 4), 17476, (1 << 13) - 1792*2);
+    lar[6] = decode_log_area(get_bits(gb, 3), 31454, (1 << 12) -  341*2);
+    lar[7] = decode_log_area(get_bits(gb, 3), 29708, (1 << 12) - 1144*2);
 
     for (i = 0; i < 4; i++) {
-        int lag      = bitstream_read(bc, 7);
-        int gain_idx = bitstream_read(bc, 2);
-        int offset   = bitstream_read(bc, 2);
+        int lag      = get_bits(gb, 7);
+        int gain_idx = get_bits(gb, 2);
+        int offset   = get_bits(gb, 2);
         lag = av_clip(lag, 40, 120);
         long_term_synth(ref_dst, lag, gain_idx);
-        apcm_dequant_add(bc, ref_dst + offset, ff_gsm_apcm_bits[mode][i]);
+        apcm_dequant_add(gb, ref_dst + offset, ff_gsm_apcm_bits[mode][i]);
         ref_dst += 40;
     }
     memcpy(ctx->ref_buf, ctx->ref_buf + 160, 120 * sizeof(*ctx->ref_buf));
diff --git a/libavcodec/h261.c b/libavcodec/h261.c
index 320d621..47bad4e 100644
--- a/libavcodec/h261.c
+++ b/libavcodec/h261.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h261.h b/libavcodec/h261.h
index fdfe560..399a404 100644
--- a/libavcodec/h261.h
+++ b/libavcodec/h261.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,6 @@ typedef struct H261Context {
     MpegEncContext s;
 
     int current_mba;
-    int previous_mba;
     int mba_diff;
     int mtype;
     int current_mv_x;
diff --git a/libavcodec/h261_parser.c b/libavcodec/h261_parser.c
index 59eed02..2299c1c 100644
--- a/libavcodec/h261_parser.c
+++ b/libavcodec/h261_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,11 +71,15 @@ static int h261_parse(AVCodecParserContext *s,
     ParseContext *pc = s->priv_data;
     int next;
 
-    next = h261_find_frame_end(pc, avctx, buf, buf_size);
-    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-        *poutbuf      = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        next = h261_find_frame_end(pc, avctx, buf, buf_size);
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf      = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
     }
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
diff --git a/libavcodec/h261data.c b/libavcodec/h261data.c
index a81ccdf..a9891ed 100644
--- a/libavcodec/h261data.c
+++ b/libavcodec/h261data.c
@@ -2,20 +2,20 @@
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h261dec.c b/libavcodec/h261dec.c
index b08598e..14a874c 100644
--- a/libavcodec/h261dec.c
+++ b/libavcodec/h261dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,15 +25,14 @@
  * H.261 decoder.
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "mpeg_er.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "h263.h"
 #include "h261.h"
 #include "internal.h"
-#include "vlc.h"
 
 #define H261_MBA_VLC_BITS 9
 #define H261_MTYPE_VLC_BITS 6
@@ -77,14 +76,11 @@ static av_cold int h261_decode_init(AVCodecContext *avctx)
 
     // set defaults
     ff_mpv_decode_defaults(s);
-    s->avctx       = avctx;
-    s->width       = s->avctx->coded_width;
-    s->height      = s->avctx->coded_height;
-    s->codec_id    = s->avctx->codec->id;
+    ff_mpv_decode_init(s, avctx);
+
     s->out_format  = FMT_H261;
     s->low_delay   = 1;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-    s->codec_id    = avctx->codec->id;
 
     ff_h261_common_init();
     h261_decode_init_vlc(h);
@@ -105,18 +101,18 @@ static int h261_decode_gob_header(H261Context *h)
 
     if (!h->gob_start_code_skipped) {
         /* Check for GOB Start Code */
-        val = bitstream_peek(&s->bc, 15);
+        val = show_bits(&s->gb, 15);
         if (val)
             return -1;
 
         /* We have a GBSC */
-        bitstream_skip(&s->bc, 16);
+        skip_bits(&s->gb, 16);
     }
 
     h->gob_start_code_skipped = 0;
 
-    h->gob_number = bitstream_read(&s->bc, 4); /* GN */
-    s->qscale     = bitstream_read(&s->bc, 5); /* GQUANT */
+    h->gob_number = get_bits(&s->gb, 4); /* GN */
+    s->qscale     = get_bits(&s->gb, 5); /* GQUANT */
 
     /* Check if gob_number is valid */
     if (s->mb_height == 18) { // CIF
@@ -129,12 +125,12 @@ static int h261_decode_gob_header(H261Context *h)
     }
 
     /* GEI */
-    while (bitstream_read_bit(&s->bc) != 0)
-        bitstream_skip(&s->bc, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     if (s->qscale == 0) {
         av_log(s->avctx, AV_LOG_ERROR, "qscale has forbidden 0 value\n");
-        if (s->avctx->err_recognition & AV_EF_BITSTREAM)
+        if (s->avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return -1;
     }
 
@@ -162,27 +158,27 @@ static int h261_resync(H261Context *h)
         if (ret >= 0)
             return 0;
     } else {
-        if (bitstream_peek(&s->bc, 15) == 0) {
+        if (show_bits(&s->gb, 15) == 0) {
             ret = h261_decode_gob_header(h);
             if (ret >= 0)
                 return 0;
         }
         // OK, it is not where it is supposed to be ...
-        s->bc = s->last_resync_bc;
-        bitstream_align(&s->bc);
-        left = bitstream_bits_left(&s->bc);
+        s->gb = s->last_resync_gb;
+        align_get_bits(&s->gb);
+        left = get_bits_left(&s->gb);
 
         for (; left > 15 + 1 + 4 + 5; left -= 8) {
-            if (bitstream_peek(&s->bc, 15) == 0) {
-                BitstreamContext bak = s->bc;
+            if (show_bits(&s->gb, 15) == 0) {
+                GetBitContext bak = s->gb;
 
                 ret = h261_decode_gob_header(h);
                 if (ret >= 0)
                     return 0;
 
-                s->bc = bak;
+                s->gb = bak;
             }
-            bitstream_skip(&s->bc, 8);
+            skip_bits(&s->gb, 8);
         }
     }
 
@@ -220,7 +216,14 @@ static int h261_decode_mb_skipped(H261Context *h, int mba1, int mba2)
         s->mb_skipped                  = 1;
         h->mtype                      &= ~MB_TYPE_H261_FIL;
 
-        ff_mpv_decode_mb(s, s->block);
+        if (s->current_picture.motion_val[0]) {
+            int b_stride = 2*s->mb_width + 1;
+            int b_xy     = 2 * s->mb_x + (2 * s->mb_y) * b_stride;
+            s->current_picture.motion_val[0][b_xy][0] = s->mv[0][0][0];
+            s->current_picture.motion_val[0][b_xy][1] = s->mv[0][0][1];
+        }
+
+        ff_mpv_reconstruct_mb(s, s->block);
     }
 
     return 0;
@@ -230,9 +233,9 @@ static const int mvmap[17] = {
     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16
 };
 
-static int decode_mv_component(BitstreamContext *bc, int v)
+static int decode_mv_component(GetBitContext *gb, int v)
 {
-    int mv_diff = bitstream_read_vlc(bc, h261_mv_vlc.table, H261_MV_VLC_BITS, 2);
+    int mv_diff = get_vlc2(gb, h261_mv_vlc.table, H261_MV_VLC_BITS, 2);
 
     /* check if mv_diff is valid */
     if (mv_diff < 0)
@@ -240,7 +243,7 @@ static int decode_mv_component(BitstreamContext *bc, int v)
 
     mv_diff = mvmap[mv_diff];
 
-    if (mv_diff && !bitstream_read_bit(bc))
+    if (mv_diff && !get_bits1(gb))
         mv_diff = -mv_diff;
 
     v += mv_diff;
@@ -259,7 +262,7 @@ static int decode_mv_component(BitstreamContext *bc, int v)
 static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
 {
     MpegEncContext *const s = &h->s;
-    int code, level, i, j, run;
+    int level, i, j, run;
     RLTable *rl = &ff_h261_rl_tcoeff;
     const uint8_t *scan_table;
 
@@ -272,7 +275,7 @@ static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
     scan_table = s->intra_scantable.permutated;
     if (s->mb_intra) {
         /* DC coef */
-        level = bitstream_read(&s->bc, 8);
+        level = get_bits(&s->gb, 8);
         // 0 (00000000b) and -128 (10000000b) are FORBIDDEN
         if ((level & 0x7F) == 0) {
             av_log(s->avctx, AV_LOG_ERROR, "illegal dc %d at %d %d\n",
@@ -290,10 +293,10 @@ static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
         // EOB          Not possible for first level when cbp is available (that's why the table is different)
         // 0    1       1s
         // *    *       0*
-        int check = bitstream_peek(&s->bc, 2);
+        int check = show_bits(&s->gb, 2);
         i = 0;
         if (check & 0x2) {
-            bitstream_skip(&s->bc, 2);
+            skip_bits(&s->gb, 2);
             block[0] = (check & 0x1) ? -1 : 1;
             i        = 1;
         }
@@ -304,39 +307,47 @@ static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
         s->block_last_index[n] = i - 1;
         return 0;
     }
+    {
+    OPEN_READER(re, &s->gb);
+    i--; // offset by -1 to allow direct indexing of scan_table
     for (;;) {
-        code = bitstream_read_vlc(&s->bc, rl->vlc.table, TCOEFF_VLC_BITS, 2);
-        if (code < 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n",
-                   s->mb_x, s->mb_y);
-            return -1;
-        }
-        if (code == rl->n) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TCOEFF_VLC_BITS, 2, 0);
+        if (run == 66) {
+            if (level) {
+                CLOSE_READER(re, &s->gb);
+                av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n",
+                       s->mb_x, s->mb_y);
+                return -1;
+            }
             /* escape */
             /* The remaining combinations of (run, level) are encoded with a
              * 20-bit word consisting of 6 bits escape, 6 bits run and 8 bits
              * level. */
-            run   = bitstream_read(&s->bc, 6);
-            level = bitstream_read_signed(&s->bc, 8);
-        } else if (code == 0) {
+            run   = SHOW_UBITS(re, &s->gb, 6) + 1;
+            SKIP_CACHE(re, &s->gb, 6);
+            level = SHOW_SBITS(re, &s->gb, 8);
+            SKIP_COUNTER(re, &s->gb, 6 + 8);
+        } else if (level == 0) {
             break;
         } else {
-            run   = rl->table_run[code];
-            level = rl->table_level[code];
-            if (bitstream_read_bit(&s->bc))
+            if (SHOW_UBITS(re, &s->gb, 1))
                 level = -level;
+            SKIP_COUNTER(re, &s->gb, 1);
         }
         i += run;
         if (i >= 64) {
+            CLOSE_READER(re, &s->gb);
             av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d\n",
                    s->mb_x, s->mb_y);
             return -1;
         }
         j        = scan_table[i];
         block[j] = level;
-        i++;
     }
-    s->block_last_index[n] = i - 1;
+    CLOSE_READER(re, &s->gb);
+    }
+    s->block_last_index[n] = i;
     return 0;
 }
 
@@ -348,8 +359,8 @@ static int h261_decode_mb(H261Context *h)
     cbp = 63;
     // Read mba
     do {
-        h->mba_diff = bitstream_read_vlc(&s->bc, h261_mba_vlc.table,
-                                         H261_MBA_VLC_BITS, 2);
+        h->mba_diff = get_vlc2(&s->gb, h261_mba_vlc.table,
+                               H261_MBA_VLC_BITS, 2);
 
         /* Check for slice end */
         /* NOTE: GOB can be empty (no MB data) or exist only of MBA_stuffing */
@@ -360,7 +371,7 @@ static int h261_decode_mb(H261Context *h)
     } while (h->mba_diff == MBA_STUFFING); // stuffing
 
     if (h->mba_diff < 0) {
-        if (bitstream_bits_left(&s->bc) <= 7)
+        if (get_bits_left(&s->gb) <= 7)
             return SLICE_END;
 
         av_log(s->avctx, AV_LOG_ERROR, "illegal mba at %d %d\n", s->mb_x, s->mb_y);
@@ -380,17 +391,18 @@ static int h261_decode_mb(H261Context *h)
     ff_update_block_index(s);
 
     // Read mtype
-    h->mtype = bitstream_read_vlc(&s->bc, h261_mtype_vlc.table, H261_MTYPE_VLC_BITS, 2);
-    if (h->mtype < 0 || h->mtype >= FF_ARRAY_ELEMS(ff_h261_mtype_map)) {
+    h->mtype = get_vlc2(&s->gb, h261_mtype_vlc.table, H261_MTYPE_VLC_BITS, 2);
+    if (h->mtype < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid mtype index %d\n",
                h->mtype);
         return SLICE_ERROR;
     }
+    av_assert0(h->mtype < FF_ARRAY_ELEMS(ff_h261_mtype_map));
     h->mtype = ff_h261_mtype_map[h->mtype];
 
     // Read mquant
     if (IS_QUANT(h->mtype))
-        ff_set_qscale(s, bitstream_read(&s->bc, 5));
+        ff_set_qscale(s, get_bits(&s->gb, 5));
 
     s->mb_intra = IS_INTRA4x4(h->mtype);
 
@@ -410,8 +422,8 @@ static int h261_decode_mb(H261Context *h)
             h->current_mv_y = 0;
         }
 
-        h->current_mv_x = decode_mv_component(&s->bc, h->current_mv_x);
-        h->current_mv_y = decode_mv_component(&s->bc, h->current_mv_y);
+        h->current_mv_x = decode_mv_component(&s->gb, h->current_mv_x);
+        h->current_mv_y = decode_mv_component(&s->gb, h->current_mv_y);
     } else {
         h->current_mv_x = 0;
         h->current_mv_y = 0;
@@ -419,7 +431,7 @@ static int h261_decode_mb(H261Context *h)
 
     // Read cbp
     if (HAS_CBP(h->mtype))
-        cbp = bitstream_read_vlc(&s->bc, h261_cbp_vlc.table, H261_CBP_VLC_BITS, 2) + 1;
+        cbp = get_vlc2(&s->gb, h261_cbp_vlc.table, H261_CBP_VLC_BITS, 2) + 1;
 
     if (s->mb_intra) {
         s->current_picture.mb_type[xy] = MB_TYPE_INTRA;
@@ -433,6 +445,13 @@ static int h261_decode_mb(H261Context *h)
     s->mv[0][0][0]                 = h->current_mv_x * 2; // gets divided by 2 in motion compensation
     s->mv[0][0][1]                 = h->current_mv_y * 2;
 
+    if (s->current_picture.motion_val[0]) {
+        int b_stride = 2*s->mb_width + 1;
+        int b_xy     = 2 * s->mb_x + (2 * s->mb_y) * b_stride;
+        s->current_picture.motion_val[0][b_xy][0] = s->mv[0][0][0];
+        s->current_picture.motion_val[0][b_xy][1] = s->mv[0][0][1];
+    }
+
 intra:
     /* decode each block */
     if (s->mb_intra || HAS_CBP(h->mtype)) {
@@ -447,7 +466,7 @@ intra:
             s->block_last_index[i] = -1;
     }
 
-    ff_mpv_decode_mb(s, s->block);
+    ff_mpv_reconstruct_mb(s, s->block);
 
     return SLICE_OK;
 }
@@ -462,8 +481,8 @@ static int h261_decode_picture_header(H261Context *h)
     int format, i;
     uint32_t startcode = 0;
 
-    for (i = bitstream_bits_left(&s->bc); i > 24; i -= 1) {
-        startcode = ((startcode << 1) | bitstream_read(&s->bc, 1)) & 0x000FFFFF;
+    for (i = get_bits_left(&s->gb); i > 24; i -= 1) {
+        startcode = ((startcode << 1) | get_bits(&s->gb, 1)) & 0x000FFFFF;
 
         if (startcode == 0x10)
             break;
@@ -475,7 +494,7 @@ static int h261_decode_picture_header(H261Context *h)
     }
 
     /* temporal reference */
-    i = bitstream_read(&s->bc, 5); /* picture timestamp */
+    i = get_bits(&s->gb, 5); /* picture timestamp */
     if (i < (s->picture_number & 31))
         i += 32;
     s->picture_number = (s->picture_number & ~31) + i;
@@ -483,11 +502,11 @@ static int h261_decode_picture_header(H261Context *h)
     s->avctx->framerate = (AVRational) { 30000, 1001 };
 
     /* PTYPE starts here */
-    bitstream_skip(&s->bc, 1); /* split screen off */
-    bitstream_skip(&s->bc, 1); /* camera  off */
-    bitstream_skip(&s->bc, 1); /* freeze picture release off */
+    skip_bits1(&s->gb); /* split screen off */
+    skip_bits1(&s->gb); /* camera  off */
+    skip_bits1(&s->gb); /* freeze picture release off */
 
-    format = bitstream_read_bit(&s->bc);
+    format = get_bits1(&s->gb);
 
     // only 2 formats possible
     if (format == 0) { // QCIF
@@ -504,12 +523,12 @@ static int h261_decode_picture_header(H261Context *h)
 
     s->mb_num = s->mb_width * s->mb_height;
 
-    bitstream_skip(&s->bc, 1); /* still image mode off */
-    bitstream_skip(&s->bc, 1); /* Reserved */
+    skip_bits1(&s->gb); /* still image mode off */
+    skip_bits1(&s->gb); /* Reserved */
 
     /* PEI */
-    while (bitstream_read_bit(&s->bc) != 0)
-        bitstream_skip(&s->bc, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* H.261 has no I-frames, but if we pass AV_PICTURE_TYPE_I for the first
      * frame, the codec crashes if it does not contain all I-blocks
@@ -554,7 +573,7 @@ static int h261_decode_gob(H261Context *h)
  */
 static int get_consumed_bytes(MpegEncContext *s, int buf_size)
 {
-    int pos = bitstream_tell(&s->bc) >> 3;
+    int pos = get_bits_count(&s->gb) >> 3;
     if (pos == 0)
         pos = 1;      // avoid infinite loops (i doubt that is needed but ...)
     if (pos + 10 > buf_size)
@@ -579,7 +598,7 @@ static int h261_decode_frame(AVCodecContext *avctx, void *data,
     h->gob_start_code_skipped = 0;
 
 retry:
-    bitstream_init8(&s->bc, buf, buf_size);
+    init_get_bits(&s->gb, buf, buf_size * 8);
 
     if (!s->context_initialized)
         // we need the IDCT permutation for reading a custom matrix
@@ -636,12 +655,12 @@ retry:
     }
     ff_mpv_frame_end(s);
 
-    assert(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
-    assert(s->current_picture.f->pict_type == s->pict_type);
+    av_assert0(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
+    av_assert0(s->current_picture.f->pict_type == s->pict_type);
 
     if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
         return ret;
-    ff_print_debug_info(s, s->current_picture_ptr);
+    ff_print_debug_info(s, s->current_picture_ptr, pict);
 
     *got_frame = 1;
 
@@ -667,4 +686,5 @@ AVCodec ff_h261_decoder = {
     .close          = h261_decode_end,
     .decode         = h261_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
 };
diff --git a/libavcodec/h261enc.c b/libavcodec/h261enc.c
index 3cac882..315762c 100644
--- a/libavcodec/h261enc.c
+++ b/libavcodec/h261enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
@@ -33,6 +34,9 @@
 #include "h261.h"
 #include "mpegvideodata.h"
 
+static uint8_t uni_h261_rl_len [64*64*2*2];
+#define UNI_ENC_INDEX(last,run,level) ((last)*128*64 + (run)*128 + (level))
+
 int ff_h261_get_picture_format(int width, int height)
 {
     // QCIF
@@ -43,7 +47,7 @@ int ff_h261_get_picture_format(int width, int height)
         return 1;
     // ERROR
     else
-        return -1;
+        return AVERROR(EINVAL);
 }
 
 void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
@@ -58,8 +62,8 @@ void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
 
     put_bits(&s->pb, 20, 0x10); /* PSC */
 
-    temp_ref = s->picture_number * (int64_t)30000 * s->avctx->time_base.num /
-               (1001 * (int64_t)s->avctx->time_base.den);   // FIXME maybe this should use a timestamp
+    temp_ref = s->picture_number * 30000LL * s->avctx->time_base.num /
+               (1001LL * s->avctx->time_base.den);   // FIXME maybe this should use a timestamp
     put_sbits(&s->pb, 5, temp_ref); /* TemporalReference */
 
     put_bits(&s->pb, 1, 0); /* split screen off */
@@ -78,7 +82,7 @@ void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
         h->gob_number = -1;
     else
         h->gob_number = 0;
-    h->current_mba = 0;
+    s->mb_skip_run = 0;
 }
 
 /**
@@ -96,18 +100,21 @@ static void h261_encode_gob_header(MpegEncContext *s, int mb_line)
     put_bits(&s->pb, 4, h->gob_number); /* GN */
     put_bits(&s->pb, 5, s->qscale);     /* GQUANT */
     put_bits(&s->pb, 1, 0);             /* no GEI */
-    h->current_mba  = 0;
-    h->previous_mba = 0;
-    h->current_mv_x = 0;
-    h->current_mv_y = 0;
+    s->mb_skip_run = 0;
+    s->last_mv[0][0][0] = 0;
+    s->last_mv[0][0][1] = 0;
 }
 
 void ff_h261_reorder_mb_index(MpegEncContext *s)
 {
     int index = s->mb_x + s->mb_y * s->mb_width;
 
-    if (index % 33 == 0)
-        h261_encode_gob_header(s, 0);
+    if (index % 11 == 0) {
+        if (index % 33 == 0)
+            h261_encode_gob_header(s, 0);
+        s->last_mv[0][0][0] = 0;
+        s->last_mv[0][0][1] = 0;
+    }
 
     /* for CIF the GOB's are fragmented in the middle of a scanline
      * that's why we need to adjust the x and y index of the macroblocks */
@@ -214,8 +221,8 @@ static void h261_encode_block(H261Context *h, int16_t *block, int n)
             put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
             if (code == rl->n) {
                 put_bits(&s->pb, 6, run);
-                assert(slevel != 0);
-                assert(level <= 127);
+                av_assert1(slevel != 0);
+                av_assert1(level <= 127);
                 put_sbits(&s->pb, 8, slevel);
             } else {
                 put_bits(&s->pb, 1, sign);
@@ -235,7 +242,6 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
     cbp = 63; // avoid warning
     mvd = 0;
 
-    h->current_mba++;
     h->mtype = 0;
 
     if (!s->mb_intra) {
@@ -245,19 +251,22 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
         /* mvd indicates if this block is motion compensated */
         mvd = motion_x | motion_y;
 
-        if ((cbp | mvd | s->dquant) == 0) {
+        if ((cbp | mvd) == 0) {
             /* skip macroblock */
             s->skip_count++;
-            h->current_mv_x = 0;
-            h->current_mv_y = 0;
+            s->mb_skip_run++;
+            s->last_mv[0][0][0] = 0;
+            s->last_mv[0][0][1] = 0;
+            s->qscale -= s->dquant;
             return;
         }
     }
 
     /* MB is not skipped, encode MBA */
     put_bits(&s->pb,
-             ff_h261_mba_bits[(h->current_mba - h->previous_mba) - 1],
-             ff_h261_mba_code[(h->current_mba - h->previous_mba) - 1]);
+             ff_h261_mba_bits[s->mb_skip_run],
+             ff_h261_mba_code[s->mb_skip_run]);
+    s->mb_skip_run = 0;
 
     /* calculate MTYPE */
     if (!s->mb_intra) {
@@ -267,13 +276,15 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
             h->mtype += 3;
         if (s->loop_filter)
             h->mtype += 3;
-        if (cbp || s->dquant)
+        if (cbp)
             h->mtype++;
-        assert(h->mtype > 1);
+        av_assert1(h->mtype > 1);
     }
 
-    if (s->dquant)
+    if (s->dquant && cbp) {
         h->mtype++;
+    } else
+        s->qscale -= s->dquant;
 
     put_bits(&s->pb,
              ff_h261_mtype_bits[h->mtype],
@@ -287,18 +298,16 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
     }
 
     if (IS_16X16(h->mtype)) {
-        mv_diff_x       = (motion_x >> 1) - h->current_mv_x;
-        mv_diff_y       = (motion_y >> 1) - h->current_mv_y;
-        h->current_mv_x = (motion_x >> 1);
-        h->current_mv_y = (motion_y >> 1);
+        mv_diff_x       = (motion_x >> 1) - s->last_mv[0][0][0];
+        mv_diff_y       = (motion_y >> 1) - s->last_mv[0][0][1];
+        s->last_mv[0][0][0] = (motion_x >> 1);
+        s->last_mv[0][0][1] = (motion_y >> 1);
         h261_encode_motion(h, mv_diff_x);
         h261_encode_motion(h, mv_diff_y);
     }
 
-    h->previous_mba = h->current_mba;
-
     if (HAS_CBP(h->mtype)) {
-        assert(cbp > 0);
+        av_assert1(cbp > 0);
         put_bits(&s->pb,
                  ff_h261_cbp_tab[cbp - 1][1],
                  ff_h261_cbp_tab[cbp - 1][0]);
@@ -307,10 +316,49 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
         /* encode each block */
         h261_encode_block(h, block[i], i);
 
-    if ((h->current_mba == 11) || (h->current_mba == 22) ||
-        (h->current_mba == 33) || (!IS_16X16(h->mtype))) {
-        h->current_mv_x = 0;
-        h->current_mv_y = 0;
+    if (!IS_16X16(h->mtype)) {
+        s->last_mv[0][0][0] = 0;
+        s->last_mv[0][0][1] = 0;
+    }
+}
+
+static av_cold void init_uni_h261_rl_tab(RLTable *rl, uint32_t *bits_tab,
+                                         uint8_t *len_tab)
+{
+    int slevel, run, last;
+
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN   >= 63);
+
+    for(slevel=-64; slevel<64; slevel++){
+        if(slevel==0) continue;
+        for(run=0; run<64; run++){
+            for(last=0; last<=1; last++){
+                const int index= UNI_ENC_INDEX(last, run, slevel+64);
+                int level= slevel < 0 ? -slevel : slevel;
+                int len, code;
+
+                len_tab[index]= 100;
+
+                /* ESC0 */
+                code= get_rl_index(rl, 0, run, level);
+                len=  rl->table_vlc[code][1] + 1;
+                if(last)
+                    len += 2;
+
+                if(code!=rl->n && len < len_tab[index]){
+                    len_tab [index]= len;
+                }
+                /* ESC */
+                len = rl->table_vlc[rl->n][1];
+                if(last)
+                    len += 2;
+
+                if(len < len_tab[index]){
+                    len_tab [index]= len;
+                }
+            }
+        }
     }
 }
 
@@ -322,6 +370,12 @@ av_cold void ff_h261_encode_init(MpegEncContext *s)
     s->max_qcoeff       = 127;
     s->y_dc_scale_table =
     s->c_dc_scale_table = ff_mpeg1_dc_scale_table;
+    s->ac_esc_length    = 6+6+8;
+
+    init_uni_h261_rl_tab(&ff_h261_rl_tcoeff, NULL, uni_h261_rl_len);
+
+    s->intra_ac_vlc_length      = s->inter_ac_vlc_length      = uni_h261_rl_len;
+    s->intra_ac_vlc_last_length = s->inter_ac_vlc_last_length = uni_h261_rl_len + 128*64;
 }
 
 static const AVClass h261_class = {
diff --git a/libavcodec/h263.c b/libavcodec/h263.c
index f3a7dab..bc5c0d5 100644
--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -141,8 +141,6 @@ void ff_h263_loop_filter(MpegEncContext * s){
     uint8_t *dest_cb= s->dest[1];
     uint8_t *dest_cr= s->dest[2];
 
-//    if(s->pict_type==AV_PICTURE_TYPE_B && !s->readable) return;
-
     /*
        Diag Top
        Left Center
diff --git a/libavcodec/h263.h b/libavcodec/h263.h
index ce697da..f891f72 100644
--- a/libavcodec/h263.h
+++ b/libavcodec/h263.h
@@ -1,31 +1,27 @@
 /*
  * H.263 internal header
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_H263_H
 #define AVCODEC_H263_H
 
 #include <stdint.h>
-
-#include "config.h"
-
 #include "libavutil/rational.h"
-
 #include "get_bits.h"
 #include "mpegvideo.h"
 #include "h263data.h"
@@ -97,11 +93,10 @@ int av_const h263_get_picture_format(int width, int height);
 
 void ff_clean_h263_qscales(MpegEncContext *s);
 int ff_h263_resync(MpegEncContext *s);
-const uint8_t *ff_h263_find_resync_marker(const uint8_t *restrict p, const uint8_t *restrict end);
-void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code);
+void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code);
 
 
-static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code){
+static inline int h263_get_motion_length(int val, int f_code){
     int l, bit_size, code;
 
     if (val == 0) {
@@ -121,11 +116,11 @@ static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code
 static inline void ff_h263_encode_motion_vector(MpegEncContext * s, int x, int y, int f_code){
     if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT) {
         skip_put_bits(&s->pb,
-            h263_get_motion_length(s, x, f_code)
-           +h263_get_motion_length(s, y, f_code));
+            h263_get_motion_length(x, f_code)
+           +h263_get_motion_length(y, f_code));
     }else{
-        ff_h263_encode_motion(s, x, f_code);
-        ff_h263_encode_motion(s, y, f_code);
+        ff_h263_encode_motion(&s->pb, x, f_code);
+        ff_h263_encode_motion(&s->pb, y, f_code);
     }
 }
 
diff --git a/libavcodec/h263_parser.c b/libavcodec/h263_parser.c
index 71e047a..2e7d493 100644
--- a/libavcodec/h263_parser.c
+++ b/libavcodec/h263_parser.c
@@ -2,20 +2,20 @@
  * H.263 parser
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,12 +70,16 @@ static int h263_parse(AVCodecParserContext *s,
     ParseContext *pc = s->priv_data;
     int next;
 
-    next= ff_h263_find_frame_end(pc, buf, buf_size);
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        next= ff_h263_find_frame_end(pc, buf, buf_size);
 
-    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-        *poutbuf = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
     }
 
     *poutbuf = buf;
diff --git a/libavcodec/h263_parser.h b/libavcodec/h263_parser.h
index 5bd715f..565a222 100644
--- a/libavcodec/h263_parser.h
+++ b/libavcodec/h263_parser.h
@@ -2,20 +2,20 @@
  * H.263 parser
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263data.c b/libavcodec/h263data.c
index 0e368a3..f649d58 100644
--- a/libavcodec/h263data.c
+++ b/libavcodec/h263data.c
@@ -1,20 +1,20 @@
 /*
  * H.263+ tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263data.h b/libavcodec/h263data.h
index 2c494b3..3da0e37 100644
--- a/libavcodec/h263data.h
+++ b/libavcodec/h263data.h
@@ -4,20 +4,20 @@
  * copyright (c) 2001 Juan J. Sierralta P
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c
index b000088..8385ddf 100644
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * H.263 decoder.
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/cpu.h"
 #include "avcodec.h"
 #include "error_resilience.h"
@@ -45,9 +47,21 @@
 
 static enum AVPixelFormat h263_get_format(AVCodecContext *avctx)
 {
+    /* MPEG-4 Studio Profile only, not supported by hardware */
+    if (avctx->bits_per_raw_sample > 8) {
+        av_assert1(((MpegEncContext *)avctx->priv_data)->studio_profile);
+        return avctx->pix_fmt;
+    }
+
     if (avctx->codec->id == AV_CODEC_ID_MSS2)
         return AV_PIX_FMT_YUV420P;
 
+    if (CONFIG_GRAY && (avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        if (avctx->color_range == AVCOL_RANGE_UNSPECIFIED)
+            avctx->color_range = AVCOL_RANGE_MPEG;
+        return AV_PIX_FMT_GRAY8;
+    }
+
     return avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
 }
 
@@ -56,14 +70,12 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     int ret;
 
-    s->avctx           = avctx;
     s->out_format      = FMT_H263;
-    s->width           = avctx->coded_width;
-    s->height          = avctx->coded_height;
-    s->workaround_bugs = avctx->workaround_bugs;
 
     // set defaults
     ff_mpv_decode_defaults(s);
+    ff_mpv_decode_init(s, avctx);
+
     s->quant_precision = 5;
     s->decode_mb       = ff_h263_decode_mb;
     s->low_delay       = 1;
@@ -72,6 +84,7 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     /* select sub codec */
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H263:
+    case AV_CODEC_ID_H263P:
         s->unrestricted_mv = 0;
         avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
         break;
@@ -118,8 +131,13 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     }
     s->codec_id    = avctx->codec->id;
 
+    if (avctx->codec_tag == AV_RL32("L263") || avctx->codec_tag == AV_RL32("S263"))
+        if (avctx->extradata_size == 56 && avctx->extradata[0] == 1)
+            s->ehc_mode = 1;
+
     /* for H.263, we allocate the images after having read the header */
     if (avctx->codec->id != AV_CODEC_ID_H263 &&
+        avctx->codec->id != AV_CODEC_ID_H263P &&
         avctx->codec->id != AV_CODEC_ID_MPEG4) {
         avctx->pix_fmt = h263_get_format(avctx);
         ff_mpv_idct_init(s);
@@ -175,7 +193,7 @@ static int decode_slice(MpegEncContext *s)
 {
     const int part_mask = s->partitioned_frame
                           ? (ER_AC_END | ER_AC_ERROR) : 0x7F;
-    const int mb_size = 16;
+    const int mb_size   = 16 >> s->avctx->lowres;
     int ret;
 
     s->last_resync_gb   = s->gb;
@@ -185,12 +203,17 @@ static int decode_slice(MpegEncContext *s)
 
     ff_set_qscale(s, s->qscale);
 
+    if (s->studio_profile) {
+        if ((ret = ff_mpeg4_decode_studio_slice_header(s->avctx->priv_data)) < 0)
+            return ret;
+    }
+
     if (s->avctx->hwaccel) {
         const uint8_t *start = s->gb.buffer + get_bits_count(&s->gb) / 8;
-        const uint8_t *end   = ff_h263_find_resync_marker(start + 1,
-                                                          s->gb.buffer_end);
-        skip_bits_long(&s->gb, 8 * (end - start));
-        return s->avctx->hwaccel->decode_slice(s->avctx, start, end - start);
+        ret = s->avctx->hwaccel->decode_slice(s->avctx, start, s->gb.buffer_end - start);
+        // ensure we exit decode loop
+        s->mb_y = s->mb_height;
+        return ret;
     }
 
     if (s->partitioned_frame) {
@@ -239,6 +262,8 @@ static int decode_slice(MpegEncContext *s)
             s->mv_type = MV_TYPE_16X16;
             ff_dlog(s, "%d %06X\n",
                     get_bits_count(&s->gb), show_bits(&s->gb, 24));
+
+            ff_tlog(NULL, "Decoding MB at %dx%d\n", s->mb_x, s->mb_y);
             ret = s->decode_mb(s, s->block);
 
             if (s->pict_type != AV_PICTURE_TYPE_B)
@@ -247,7 +272,7 @@ static int decode_slice(MpegEncContext *s)
             if (ret < 0) {
                 const int xy = s->mb_x + s->mb_y * s->mb_stride;
                 if (ret == SLICE_END) {
-                    ff_mpv_decode_mb(s, s->block);
+                    ff_mpv_reconstruct_mb(s, s->block);
                     if (s->loop_filter)
                         ff_h263_loop_filter(s);
 
@@ -275,10 +300,12 @@ static int decode_slice(MpegEncContext *s)
                 ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                                 s->mb_x, s->mb_y, ER_MB_ERROR & part_mask);
 
+                if (s->avctx->err_recognition & AV_EF_IGNORE_ERR)
+                    continue;
                 return AVERROR_INVALIDDATA;
             }
 
-            ff_mpv_decode_mb(s, s->block);
+            ff_mpv_reconstruct_mb(s, s->block);
             if (s->loop_filter)
                 ff_h263_loop_filter(s);
         }
@@ -289,8 +316,9 @@ static int decode_slice(MpegEncContext *s)
         s->mb_x = 0;
     }
 
-    assert(s->mb_x == 0 && s->mb_y == s->mb_height);
+    av_assert1(s->mb_x == 0 && s->mb_y == s->mb_height);
 
+    // Detect incorrect padding with wrong stuffing codes used by NEC N-02B
     if (s->codec_id == AV_CODEC_ID_MPEG4         &&
         (s->workaround_bugs & FF_BUG_AUTODETECT) &&
         get_bits_left(&s->gb) >= 48              &&
@@ -302,7 +330,7 @@ static int decode_slice(MpegEncContext *s)
     if (s->codec_id == AV_CODEC_ID_MPEG4         &&
         (s->workaround_bugs & FF_BUG_AUTODETECT) &&
         get_bits_left(&s->gb) >= 0               &&
-        get_bits_left(&s->gb) < 48               &&
+        get_bits_left(&s->gb) < 137              &&
         !s->data_partitioning) {
         const int bits_count = get_bits_count(&s->gb);
         const int bits_left  = s->gb.size_in_bits - bits_count;
@@ -323,8 +351,27 @@ static int decode_slice(MpegEncContext *s)
         }
     }
 
+    if (s->codec_id == AV_CODEC_ID_H263          &&
+        (s->workaround_bugs & FF_BUG_AUTODETECT) &&
+        get_bits_left(&s->gb) >= 8               &&
+        get_bits_left(&s->gb) < 300              &&
+        s->pict_type == AV_PICTURE_TYPE_I        &&
+        show_bits(&s->gb, 8) == 0                &&
+        !s->data_partitioning) {
+
+        s->padding_bug_score += 32;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_H263          &&
+        (s->workaround_bugs & FF_BUG_AUTODETECT) &&
+        get_bits_left(&s->gb) >= 64              &&
+        AV_RB64(s->gb.buffer_end - 8) == 0xCDCDCDCDFC7F0000) {
+
+        s->padding_bug_score += 32;
+    }
+
     if (s->workaround_bugs & FF_BUG_AUTODETECT) {
-        if (s->codec_id == AV_CODEC_ID_H263 ||
+        if (
             (s->padding_bug_score > -2 && !s->data_partitioning))
             s->workaround_bugs |= FF_BUG_NO_PADDING;
         else
@@ -343,7 +390,7 @@ static int decode_slice(MpegEncContext *s)
         /* buggy padding but the frame should still end approximately at
          * the bitstream end */
         if ((s->workaround_bugs & FF_BUG_NO_PADDING) &&
-            (s->avctx->err_recognition & AV_EF_BUFFER))
+            (s->avctx->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE)))
             max_extra += 48;
         else if ((s->workaround_bugs & FF_BUG_NO_PADDING))
             max_extra += 256 * 256 * 256 * 64;
@@ -378,6 +425,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int buf_size       = avpkt->size;
     MpegEncContext *s  = avctx->priv_data;
     int ret;
+    int slice_ret = 0;
     AVFrame *pict = data;
 
     /* no supplementary picture */
@@ -401,6 +449,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             next = ff_mpeg4_find_frame_end(&s->parse_context, buf, buf_size);
         } else if (CONFIG_H263_DECODER && s->codec_id == AV_CODEC_ID_H263) {
             next = ff_h263_find_frame_end(&s->parse_context, buf, buf_size);
+        } else if (CONFIG_H263P_DECODER && s->codec_id == AV_CODEC_ID_H263P) {
+            next = ff_h263_find_frame_end(&s->parse_context, buf, buf_size);
         } else {
             av_log(s->avctx, AV_LOG_ERROR,
                    "this codec does not support truncated bitstreams\n");
@@ -412,13 +462,27 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return buf_size;
     }
 
-    if (s->bitstream_buffer_size && (s->divx_packed || buf_size < 20)) // divx 5.01+/xvid frame reorder
+retry:
+    if (s->divx_packed && s->bitstream_buffer_size) {
+        int i;
+        for(i=0; i < buf_size-3; i++) {
+            if (buf[i]==0 && buf[i+1]==0 && buf[i+2]==1) {
+                if (buf[i+3]==0xB0) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Discarding excessive bitstream in packed xvid\n");
+                    s->bitstream_buffer_size = 0;
+                }
+                break;
+            }
+        }
+    }
+
+    if (s->bitstream_buffer_size && (s->divx_packed || buf_size <= MAX_NVOP_SIZE)) // divx 5.01+/xvid frame reorder
         ret = init_get_bits8(&s->gb, s->bitstream_buffer,
                              s->bitstream_buffer_size);
     else
         ret = init_get_bits8(&s->gb, buf, buf_size);
-    s->bitstream_buffer_size = 0;
 
+    s->bitstream_buffer_size = 0;
     if (ret < 0)
         return ret;
 
@@ -435,13 +499,10 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (s->avctx->extradata_size && s->picture_number == 0) {
             GetBitContext gb;
 
-            ret = init_get_bits8(&gb, s->avctx->extradata,
-                                 s->avctx->extradata_size);
-            if (ret < 0)
-                return ret;
-            ff_mpeg4_decode_picture_header(avctx->priv_data, &gb);
+            if (init_get_bits8(&gb, s->avctx->extradata, s->avctx->extradata_size) >= 0 )
+                ff_mpeg4_decode_picture_header(avctx->priv_data, &gb, 1);
         }
-        ret = ff_mpeg4_decode_picture_header(avctx->priv_data, &s->gb);
+        ret = ff_mpeg4_decode_picture_header(avctx->priv_data, &s->gb, 0);
     } else if (CONFIG_H263I_DECODER && s->codec_id == AV_CODEC_ID_H263I) {
         ret = ff_intel_h263_decode_picture_header(s);
     } else if (CONFIG_FLV_DECODER && s->h263_flv) {
@@ -450,6 +511,14 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ret = ff_h263_decode_picture_header(s);
     }
 
+    if (ret < 0 || ret == FRAME_SKIPPED) {
+        if (   s->width  != avctx->coded_width
+            || s->height != avctx->coded_height) {
+                av_log(s->avctx, AV_LOG_WARNING, "Reverting picture dimensions change due to header decoding failure\n");
+                s->width = avctx->coded_width;
+                s->height= avctx->coded_height;
+        }
+    }
     if (ret == FRAME_SKIPPED)
         return get_consumed_bytes(s, buf_size);
 
@@ -474,25 +543,11 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     avctx->has_b_frames = !s->low_delay;
 
-#define SET_QPEL_FUNC(postfix1, postfix2)                           \
-    s->qdsp.put_        ## postfix1 = ff_put_        ## postfix2;   \
-    s->qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;   \
-    s->qdsp.avg_        ## postfix1 = ff_avg_        ## postfix2;
-
-    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-        SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c)
-
-        SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c)
+    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4) {
+        if (ff_mpeg4_workaround_bugs(avctx) == 1)
+            goto retry;
+        if (s->studio_profile != (s->idsp.idct == NULL))
+            ff_mpv_idct_init(s);
     }
 
     /* After H.263 & MPEG-4 header decode we have the height, width,
@@ -579,17 +634,17 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (ret < 0)
             return ret;
         if (ret == 1)
-            goto intrax8_decoded;
+            goto frame_end;
     }
 
     /* decode each macroblock */
     s->mb_x = 0;
     s->mb_y = 0;
 
-    ret = decode_slice(s);
+    slice_ret = decode_slice(s);
     while (s->mb_y < s->mb_height) {
         if (s->msmpeg4_version) {
-            if (s->slice_height == 0 || s->mb_x != 0 ||
+            if (s->slice_height == 0 || s->mb_x != 0 || slice_ret < 0 ||
                 (s->mb_y % s->slice_height) != 0 || get_bits_left(&s->gb) < 0)
                 break;
         } else {
@@ -604,7 +659,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_mpeg4_clean_buffers(s);
 
         if (decode_slice(s) < 0)
-            ret = AVERROR_INVALIDDATA;
+            slice_ret = AVERROR_INVALIDDATA;
     }
 
     if (s->msmpeg4_version && s->msmpeg4_version < 4 &&
@@ -613,13 +668,10 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_msmpeg4_decode_ext_header(s, buf_size) < 0)
             s->er.error_status_table[s->mb_num - 1] = ER_MB_ERROR;
 
-    assert(s->bitstream_buffer_size == 0);
-
-    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4)
-        ff_mpeg4_frame_end(avctx, buf, buf_size);
-
-intrax8_decoded:
-    ff_er_frame_end(&s->er);
+    av_assert1(s->bitstream_buffer_size == 0);
+frame_end:
+    if (!s->studio_profile)
+        ff_er_frame_end(&s->er);
 
     if (avctx->hwaccel) {
         ret = avctx->hwaccel->end_frame(avctx);
@@ -629,27 +681,47 @@ intrax8_decoded:
 
     ff_mpv_frame_end(s);
 
+    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4)
+        ff_mpeg4_frame_end(avctx, buf, buf_size);
+
     if (!s->divx_packed && avctx->hwaccel)
         ff_thread_finish_setup(avctx);
 
-    assert(s->current_picture.f->pict_type ==
-           s->current_picture_ptr->f->pict_type);
-    assert(s->current_picture.f->pict_type == s->pict_type);
+    av_assert1(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
+    av_assert1(s->current_picture.f->pict_type == s->pict_type);
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
         if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->current_picture_ptr);
+        ff_print_debug_info(s, s->current_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
     } else if (s->last_picture_ptr) {
         if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->last_picture_ptr);
+        ff_print_debug_info(s, s->last_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
     }
 
-    if (s->last_picture_ptr || s->low_delay)
+    if (s->last_picture_ptr || s->low_delay) {
+        if (   pict->format == AV_PIX_FMT_YUV420P
+            && (s->codec_tag == AV_RL32("GEOV") || s->codec_tag == AV_RL32("GEOX"))) {
+            int x, y, p;
+            av_frame_make_writable(pict);
+            for (p=0; p<3; p++) {
+                int w = AV_CEIL_RSHIFT(pict-> width, !!p);
+                int h = AV_CEIL_RSHIFT(pict->height, !!p);
+                int linesize = pict->linesize[p];
+                for (y=0; y<(h>>1); y++)
+                    for (x=0; x<w; x++)
+                        FFSWAP(int,
+                               pict->data[p][x + y*linesize],
+                               pict->data[p][x + (h-1-y)*linesize]);
+            }
+        }
         *got_frame = 1;
+    }
 
-    if (ret && (avctx->err_recognition & AV_EF_EXPLODE))
-        return ret;
+    if (slice_ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+        return slice_ret;
     else
         return get_consumed_bytes(s, buf_size);
 }
@@ -658,9 +730,15 @@ const enum AVPixelFormat ff_h263_hwaccel_pixfmt_list_420[] = {
 #if CONFIG_H263_VAAPI_HWACCEL || CONFIG_MPEG4_VAAPI_HWACCEL
     AV_PIX_FMT_VAAPI,
 #endif
+#if CONFIG_MPEG4_NVDEC_HWACCEL
+    AV_PIX_FMT_CUDA,
+#endif
 #if CONFIG_MPEG4_VDPAU_HWACCEL
     AV_PIX_FMT_VDPAU,
 #endif
+#if CONFIG_H263_VIDEOTOOLBOX_HWACCEL || CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL
+    AV_PIX_FMT_VIDEOTOOLBOX,
+#endif
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
 };
@@ -676,7 +754,26 @@ AVCodec ff_h263_decoder = {
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
                       AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
     .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
+    .pix_fmts       = ff_h263_hwaccel_pixfmt_list_420,
+};
+
+AVCodec ff_h263p_decoder = {
+    .name           = "h263p",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.263 / H.263-1996, H.263+ / H.263-1998 / H.263 version 2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H263P,
+    .priv_data_size = sizeof(MpegEncContext),
+    .init           = ff_h263_decode_init,
+    .close          = ff_h263_decode_end,
+    .decode         = ff_h263_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
+                      AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
     .pix_fmts       = ff_h263_hwaccel_pixfmt_list_420,
     .hw_configs     = (const AVCodecHWConfigInternal*[]) {
 #if CONFIG_H263_VAAPI_HWACCEL
@@ -685,6 +782,9 @@ AVCodec ff_h263_decoder = {
 #if CONFIG_MPEG4_VDPAU_HWACCEL
                         HWACCEL_VDPAU(mpeg4),
 #endif
+#if CONFIG_H263_VIDEOTOOLBOX_HWACCEL
+                        HWACCEL_VIDEOTOOLBOX(h263),
+#endif
                         NULL
                     },
 };
diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
index 70ecdb9..b3c0bcd 100644
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -121,4 +121,6 @@ av_cold void ff_h263dsp_init(H263DSPContext *ctx)
 
     if (ARCH_X86)
         ff_h263dsp_init_x86(ctx);
+    if (ARCH_MIPS)
+        ff_h263dsp_init_mips(ctx);
 }
diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
index 40f041c..1abea3c 100644
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,5 +30,6 @@ typedef struct H263DSPContext {
 
 void ff_h263dsp_init(H263DSPContext *ctx);
 void ff_h263dsp_init_x86(H263DSPContext *ctx);
+void ff_h263dsp_init_mips(H263DSPContext *ctx);
 
 #endif /* AVCODEC_H263DSP_H */
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index aa137b5..7a1fb6d 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,10 +24,14 @@
 #ifndef AVCODEC_H264_H
 #define AVCODEC_H264_H
 
-#define QP_MAX_NUM (51 + 2 * 6)           // The maximum supported qp
+#define QP_MAX_NUM (51 + 6*6)           // The maximum supported qp
 
-/* NAL unit types */
+/*
+ * Table 7-1 – NAL unit type codes, syntax element categories, and NAL unit type classes in
+ * T-REC-H.264-201704
+ */
 enum {
+    H264_NAL_UNSPECIFIED     = 0,
     H264_NAL_SLICE           = 1,
     H264_NAL_DPA             = 2,
     H264_NAL_DPB             = 3,
@@ -41,7 +45,24 @@ enum {
     H264_NAL_END_STREAM      = 11,
     H264_NAL_FILLER_DATA     = 12,
     H264_NAL_SPS_EXT         = 13,
+    H264_NAL_PREFIX          = 14,
+    H264_NAL_SUB_SPS         = 15,
+    H264_NAL_DPS             = 16,
+    H264_NAL_RESERVED17      = 17,
+    H264_NAL_RESERVED18      = 18,
     H264_NAL_AUXILIARY_SLICE = 19,
+    H264_NAL_EXTEN_SLICE     = 20,
+    H264_NAL_DEPTH_EXTEN_SLICE = 21,
+    H264_NAL_RESERVED22      = 22,
+    H264_NAL_RESERVED23      = 23,
+    H264_NAL_UNSPECIFIED24   = 24,
+    H264_NAL_UNSPECIFIED25   = 25,
+    H264_NAL_UNSPECIFIED26   = 26,
+    H264_NAL_UNSPECIFIED27   = 27,
+    H264_NAL_UNSPECIFIED28   = 28,
+    H264_NAL_UNSPECIFIED29   = 29,
+    H264_NAL_UNSPECIFIED30   = 30,
+    H264_NAL_UNSPECIFIED31   = 31,
 };
 
 
diff --git a/libavcodec/h2645_parse.c b/libavcodec/h2645_parse.c
index b507b19..942f2c5 100644
--- a/libavcodec/h2645_parse.c
+++ b/libavcodec/h2645_parse.c
@@ -1,20 +1,20 @@
 /*
  * H.264/HEVC common parsing code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,17 +27,20 @@
 #include "libavutil/mem.h"
 
 #include "bytestream.h"
+#include "hevc.h"
+#include "h264.h"
 #include "h2645_parse.h"
 
 int ff_h2645_extract_rbsp(const uint8_t *src, int length,
-                          H2645NAL *nal)
+                          H2645RBSP *rbsp, H2645NAL *nal, int small_padding)
 {
     int i, si, di;
     uint8_t *dst;
 
+    nal->skipped_bytes = 0;
 #define STARTCODE_TEST                                                  \
         if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {     \
-            if (src[i + 2] != 3) {                                      \
+            if (src[i + 2] != 3 && src[i + 2] != 0) {                   \
                 /* startcode, so we must be past the end */             \
                 length = i;                                             \
             }                                                           \
@@ -51,8 +54,8 @@ int ff_h2645_extract_rbsp(const uint8_t *src, int length,
             i++
 #if HAVE_FAST_64BIT
     for (i = 0; i + 1 < length; i += 9) {
-        if (!((~AV_RN64A(src + i) &
-               (AV_RN64A(src + i) - 0x0100010001000101ULL)) &
+        if (!((~AV_RN64(src + i) &
+               (AV_RN64(src + i) - 0x0100010001000101ULL)) &
               0x8000800080008080ULL))
             continue;
         FIND_FIRST_ZERO;
@@ -61,8 +64,8 @@ int ff_h2645_extract_rbsp(const uint8_t *src, int length,
     }
 #else
     for (i = 0; i + 1 < length; i += 5) {
-        if (!((~AV_RN32A(src + i) &
-               (AV_RN32A(src + i) - 0x01000101U)) &
+        if (!((~AV_RN32(src + i) &
+               (AV_RN32(src + i) - 0x01000101U)) &
               0x80008080U))
             continue;
         FIND_FIRST_ZERO;
@@ -80,19 +83,16 @@ int ff_h2645_extract_rbsp(const uint8_t *src, int length,
     }
 #endif /* HAVE_FAST_UNALIGNED */
 
-    if (i >= length - 1) { // no escaped 0
+    if (i >= length - 1 && small_padding) { // no escaped 0
         nal->data     =
         nal->raw_data = src;
         nal->size     =
         nal->raw_size = length;
         return length;
-    }
-
-    av_fast_malloc(&nal->rbsp_buffer, &nal->rbsp_buffer_size,
-                   length + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!nal->rbsp_buffer)
-        return AVERROR(ENOMEM);
+    } else if (i > length)
+        i = length;
 
+    nal->rbsp_buffer = &rbsp->rbsp_buffer[rbsp->rbsp_buffer_size];
     dst = nal->rbsp_buffer;
 
     memcpy(dst, src, i);
@@ -102,12 +102,28 @@ int ff_h2645_extract_rbsp(const uint8_t *src, int length,
         if (src[si + 2] > 3) {
             dst[di++] = src[si++];
             dst[di++] = src[si++];
-        } else if (src[si] == 0 && src[si + 1] == 0) {
+        } else if (src[si] == 0 && src[si + 1] == 0 && src[si + 2] != 0) {
             if (src[si + 2] == 3) { // escape
                 dst[di++] = 0;
                 dst[di++] = 0;
                 si       += 3;
 
+                if (nal->skipped_bytes_pos) {
+                    nal->skipped_bytes++;
+                    if (nal->skipped_bytes_pos_size < nal->skipped_bytes) {
+                        nal->skipped_bytes_pos_size *= 2;
+                        av_assert0(nal->skipped_bytes_pos_size >= nal->skipped_bytes);
+                        av_reallocp_array(&nal->skipped_bytes_pos,
+                                nal->skipped_bytes_pos_size,
+                                sizeof(*nal->skipped_bytes_pos));
+                        if (!nal->skipped_bytes_pos) {
+                            nal->skipped_bytes_pos_size = 0;
+                            return AVERROR(ENOMEM);
+                        }
+                    }
+                    if (nal->skipped_bytes_pos)
+                        nal->skipped_bytes_pos[nal->skipped_bytes-1] = di - 1;
+                }
                 continue;
             } else // next start code
                 goto nsc;
@@ -125,9 +141,125 @@ nsc:
     nal->size = di;
     nal->raw_data = src;
     nal->raw_size = si;
+    rbsp->rbsp_buffer_size += si;
+
     return si;
 }
 
+static const char *hevc_nal_type_name[64] = {
+    "TRAIL_N", // HEVC_NAL_TRAIL_N
+    "TRAIL_R", // HEVC_NAL_TRAIL_R
+    "TSA_N", // HEVC_NAL_TSA_N
+    "TSA_R", // HEVC_NAL_TSA_R
+    "STSA_N", // HEVC_NAL_STSA_N
+    "STSA_R", // HEVC_NAL_STSA_R
+    "RADL_N", // HEVC_NAL_RADL_N
+    "RADL_R", // HEVC_NAL_RADL_R
+    "RASL_N", // HEVC_NAL_RASL_N
+    "RASL_R", // HEVC_NAL_RASL_R
+    "RSV_VCL_N10", // HEVC_NAL_VCL_N10
+    "RSV_VCL_R11", // HEVC_NAL_VCL_R11
+    "RSV_VCL_N12", // HEVC_NAL_VCL_N12
+    "RSV_VLC_R13", // HEVC_NAL_VCL_R13
+    "RSV_VCL_N14", // HEVC_NAL_VCL_N14
+    "RSV_VCL_R15", // HEVC_NAL_VCL_R15
+    "BLA_W_LP", // HEVC_NAL_BLA_W_LP
+    "BLA_W_RADL", // HEVC_NAL_BLA_W_RADL
+    "BLA_N_LP", // HEVC_NAL_BLA_N_LP
+    "IDR_W_RADL", // HEVC_NAL_IDR_W_RADL
+    "IDR_N_LP", // HEVC_NAL_IDR_N_LP
+    "CRA_NUT", // HEVC_NAL_CRA_NUT
+    "IRAP_IRAP_VCL22", // HEVC_NAL_IRAP_VCL22
+    "IRAP_IRAP_VCL23", // HEVC_NAL_IRAP_VCL23
+    "RSV_VCL24", // HEVC_NAL_RSV_VCL24
+    "RSV_VCL25", // HEVC_NAL_RSV_VCL25
+    "RSV_VCL26", // HEVC_NAL_RSV_VCL26
+    "RSV_VCL27", // HEVC_NAL_RSV_VCL27
+    "RSV_VCL28", // HEVC_NAL_RSV_VCL28
+    "RSV_VCL29", // HEVC_NAL_RSV_VCL29
+    "RSV_VCL30", // HEVC_NAL_RSV_VCL30
+    "RSV_VCL31", // HEVC_NAL_RSV_VCL31
+    "VPS", // HEVC_NAL_VPS
+    "SPS", // HEVC_NAL_SPS
+    "PPS", // HEVC_NAL_PPS
+    "AUD", // HEVC_NAL_AUD
+    "EOS_NUT", // HEVC_NAL_EOS_NUT
+    "EOB_NUT", // HEVC_NAL_EOB_NUT
+    "FD_NUT", // HEVC_NAL_FD_NUT
+    "SEI_PREFIX", // HEVC_NAL_SEI_PREFIX
+    "SEI_SUFFIX", // HEVC_NAL_SEI_SUFFIX
+    "RSV_NVCL41", // HEVC_NAL_RSV_NVCL41
+    "RSV_NVCL42", // HEVC_NAL_RSV_NVCL42
+    "RSV_NVCL43", // HEVC_NAL_RSV_NVCL43
+    "RSV_NVCL44", // HEVC_NAL_RSV_NVCL44
+    "RSV_NVCL45", // HEVC_NAL_RSV_NVCL45
+    "RSV_NVCL46", // HEVC_NAL_RSV_NVCL46
+    "RSV_NVCL47", // HEVC_NAL_RSV_NVCL47
+    "UNSPEC48", // HEVC_NAL_UNSPEC48
+    "UNSPEC49", // HEVC_NAL_UNSPEC49
+    "UNSPEC50", // HEVC_NAL_UNSPEC50
+    "UNSPEC51", // HEVC_NAL_UNSPEC51
+    "UNSPEC52", // HEVC_NAL_UNSPEC52
+    "UNSPEC53", // HEVC_NAL_UNSPEC53
+    "UNSPEC54", // HEVC_NAL_UNSPEC54
+    "UNSPEC55", // HEVC_NAL_UNSPEC55
+    "UNSPEC56", // HEVC_NAL_UNSPEC56
+    "UNSPEC57", // HEVC_NAL_UNSPEC57
+    "UNSPEC58", // HEVC_NAL_UNSPEC58
+    "UNSPEC59", // HEVC_NAL_UNSPEC59
+    "UNSPEC60", // HEVC_NAL_UNSPEC60
+    "UNSPEC61", // HEVC_NAL_UNSPEC61
+    "UNSPEC62", // HEVC_NAL_UNSPEC62
+    "UNSPEC63", // HEVC_NAL_UNSPEC63
+};
+
+static const char *hevc_nal_unit_name(int nal_type)
+{
+    av_assert0(nal_type >= 0 && nal_type < 64);
+    return hevc_nal_type_name[nal_type];
+}
+
+static const char *h264_nal_type_name[32] = {
+    "Unspecified 0", //H264_NAL_UNSPECIFIED
+    "Coded slice of a non-IDR picture", // H264_NAL_SLICE
+    "Coded slice data partition A", // H264_NAL_DPA
+    "Coded slice data partition B", // H264_NAL_DPB
+    "Coded slice data partition C", // H264_NAL_DPC
+    "IDR", // H264_NAL_IDR_SLICE
+    "SEI", // H264_NAL_SEI
+    "SPS", // H264_NAL_SPS
+    "PPS", // H264_NAL_PPS
+    "AUD", // H264_NAL_AUD
+    "End of sequence", // H264_NAL_END_SEQUENCE
+    "End of stream", // H264_NAL_END_STREAM
+    "Filler data", // H264_NAL_FILLER_DATA
+    "SPS extension", // H264_NAL_SPS_EXT
+    "Prefix", // H264_NAL_PREFIX
+    "Subset SPS", // H264_NAL_SUB_SPS
+    "Depth parameter set", // H264_NAL_DPS
+    "Reserved 17", // H264_NAL_RESERVED17
+    "Reserved 18", // H264_NAL_RESERVED18
+    "Auxiliary coded picture without partitioning", // H264_NAL_AUXILIARY_SLICE
+    "Slice extension", // H264_NAL_EXTEN_SLICE
+    "Slice extension for a depth view or a 3D-AVC texture view", // H264_NAL_DEPTH_EXTEN_SLICE
+    "Reserved 22", // H264_NAL_RESERVED22
+    "Reserved 23", // H264_NAL_RESERVED23
+    "Unspecified 24", // H264_NAL_UNSPECIFIED24
+    "Unspecified 25", // H264_NAL_UNSPECIFIED25
+    "Unspecified 26", // H264_NAL_UNSPECIFIED26
+    "Unspecified 27", // H264_NAL_UNSPECIFIED27
+    "Unspecified 28", // H264_NAL_UNSPECIFIED28
+    "Unspecified 29", // H264_NAL_UNSPECIFIED29
+    "Unspecified 30", // H264_NAL_UNSPECIFIED30
+    "Unspecified 31", // H264_NAL_UNSPECIFIED31
+};
+
+static const char *h264_nal_unit_name(int nal_type)
+{
+    av_assert0(nal_type >= 0 && nal_type < 32);
+    return h264_nal_type_name[nal_type];
+}
+
 static int get_bit_length(H2645NAL *nal, int skip_trailing_zeros)
 {
     int size = nal->size;
@@ -148,7 +280,7 @@ static int get_bit_length(H2645NAL *nal, int skip_trailing_zeros)
     /* remove the stop bit and following trailing zeros,
      * or nothing for damaged bitstreams */
     if (v)
-        size -= av_ctz(v) + 1;
+        size -= ff_ctz(v) + 1;
 
     return size;
 }
@@ -173,8 +305,8 @@ static int hevc_parse_nal_header(H2645NAL *nal, void *logctx)
         return AVERROR_INVALIDDATA;
 
     av_log(logctx, AV_LOG_DEBUG,
-           "nal_unit_type: %d, nuh_layer_id: %dtemporal_id: %d\n",
-           nal->type, nuh_layer_id, nal->temporal_id);
+           "nal_unit_type: %d(%s), nuh_layer_id: %d, temporal_id: %d\n",
+           nal->type, hevc_nal_unit_name(nal->type), nuh_layer_id, nal->temporal_id);
 
     return nuh_layer_id == 0;
 }
@@ -190,8 +322,8 @@ static int h264_parse_nal_header(H2645NAL *nal, void *logctx)
     nal->type    = get_bits(gb, 5);
 
     av_log(logctx, AV_LOG_DEBUG,
-           "nal_unit_type: %d, nal_ref_idc: %d\n",
-           nal->type, nal->ref_idc);
+           "nal_unit_type: %d(%s), nal_ref_idc: %d\n",
+           nal->type, h264_nal_unit_name(nal->type), nal->ref_idc);
 
     return 1;
 }
@@ -211,86 +343,143 @@ static int find_next_start_code(const uint8_t *buf, const uint8_t *next_avc)
     return i + 3;
 }
 
+static void alloc_rbsp_buffer(H2645RBSP *rbsp, unsigned int size, int use_ref)
+{
+    if (size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
+        goto fail;
+    size += AV_INPUT_BUFFER_PADDING_SIZE;
+
+    if (rbsp->rbsp_buffer_alloc_size >= size &&
+        (!rbsp->rbsp_buffer_ref || av_buffer_is_writable(rbsp->rbsp_buffer_ref)))
+        return;
+
+    size = FFMIN(size + size / 16 + 32, INT_MAX);
+
+    if (rbsp->rbsp_buffer_ref)
+        av_buffer_unref(&rbsp->rbsp_buffer_ref);
+    else
+        av_free(rbsp->rbsp_buffer);
+
+    rbsp->rbsp_buffer = av_malloc(size);
+    if (!rbsp->rbsp_buffer)
+        goto fail;
+    rbsp->rbsp_buffer_alloc_size = size;
+
+    if (use_ref) {
+        rbsp->rbsp_buffer_ref = av_buffer_create(rbsp->rbsp_buffer, size,
+                                                 NULL, NULL, 0);
+        if (!rbsp->rbsp_buffer_ref)
+            goto fail;
+    }
+
+    return;
+
+fail:
+    rbsp->rbsp_buffer_alloc_size = 0;
+    if (rbsp->rbsp_buffer_ref) {
+        av_buffer_unref(&rbsp->rbsp_buffer_ref);
+        rbsp->rbsp_buffer = NULL;
+    } else
+        av_freep(&rbsp->rbsp_buffer);
+
+    return;
+}
+
 int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
                           void *logctx, int is_nalff, int nal_length_size,
-                          enum AVCodecID codec_id)
+                          enum AVCodecID codec_id, int small_padding, int use_ref)
 {
     GetByteContext bc;
     int consumed, ret = 0;
-    size_t next_avc = is_nalff ? 0 : length;
+    int next_avc = is_nalff ? 0 : length;
+    int64_t padding = small_padding ? 0 : MAX_MBPAIR_SIZE;
 
     bytestream2_init(&bc, buf, length);
+    alloc_rbsp_buffer(&pkt->rbsp, length + padding, use_ref);
 
+    if (!pkt->rbsp.rbsp_buffer)
+        return AVERROR(ENOMEM);
+
+    pkt->rbsp.rbsp_buffer_size = 0;
     pkt->nb_nals = 0;
     while (bytestream2_get_bytes_left(&bc) >= 4) {
         H2645NAL *nal;
         int extract_length = 0;
         int skip_trailing_zeros = 1;
 
-        /*
-         * Only parse an AVC1 length field if one is expected at the current
-         * buffer position. There are unfortunately streams with multiple
-         * NAL units covered by the length field. Those NAL units are delimited
-         * by Annex B start code prefixes. ff_h2645_extract_rbsp() detects it
-         * correctly and consumes only the first NAL unit. The additional NAL
-         * units are handled here in the Annex B parsing code.
-         */
         if (bytestream2_tell(&bc) == next_avc) {
-            int i;
-            for (i = 0; i < nal_length_size; i++)
-                extract_length = (extract_length << 8) | bytestream2_get_byte(&bc);
-
-            if (extract_length > bytestream2_get_bytes_left(&bc)) {
-                av_log(logctx, AV_LOG_ERROR,
-                       "Invalid NAL unit size (%d > %d).\n",
-                       extract_length, bytestream2_get_bytes_left(&bc));
-                return AVERROR_INVALIDDATA;
-            }
-            // keep track of the next AVC1 length field
+            int i = 0;
+            extract_length = get_nalsize(nal_length_size,
+                                         bc.buffer, bytestream2_get_bytes_left(&bc), &i, logctx);
+            if (extract_length < 0)
+                return extract_length;
+
+            bytestream2_skip(&bc, nal_length_size);
+
             next_avc = bytestream2_tell(&bc) + extract_length;
         } else {
-            /*
-             * expected to return immediately except for streams with mixed
-             * NAL unit coding
-             */
-            int buf_index = find_next_start_code(bc.buffer, buf + next_avc);
+            int buf_index;
+
+            if (bytestream2_tell(&bc) > next_avc)
+                av_log(logctx, AV_LOG_WARNING, "Exceeded next NALFF position, re-syncing.\n");
+
+            /* search start code */
+            buf_index = find_next_start_code(bc.buffer, buf + next_avc);
 
             bytestream2_skip(&bc, buf_index);
 
-            /*
-             * break if an AVC1 length field is expected at the current buffer
-             * position
-             */
-            if (bytestream2_tell(&bc) == next_avc)
-                continue;
+            if (!bytestream2_get_bytes_left(&bc)) {
+                if (pkt->nb_nals > 0) {
+                    // No more start codes: we discarded some irrelevant
+                    // bytes at the end of the packet.
+                    return 0;
+                } else {
+                    av_log(logctx, AV_LOG_ERROR, "No start code is found.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+
+            extract_length = FFMIN(bytestream2_get_bytes_left(&bc), next_avc - bytestream2_tell(&bc));
 
-            if (bytestream2_get_bytes_left(&bc) > 0) {
-                extract_length = bytestream2_get_bytes_left(&bc);
-            } else if (pkt->nb_nals == 0) {
-                av_log(logctx, AV_LOG_ERROR, "No NAL unit found\n");
-                return AVERROR_INVALIDDATA;
-            } else {
-                break;
+            if (bytestream2_tell(&bc) >= next_avc) {
+                /* skip to the start of the next NAL */
+                bytestream2_skip(&bc, next_avc - bytestream2_tell(&bc));
+                continue;
             }
         }
 
         if (pkt->nals_allocated < pkt->nb_nals + 1) {
             int new_size = pkt->nals_allocated + 1;
-            H2645NAL *tmp = av_realloc_array(pkt->nals, new_size, sizeof(*tmp));
+            void *tmp = av_realloc_array(pkt->nals, new_size, sizeof(*pkt->nals));
+
             if (!tmp)
                 return AVERROR(ENOMEM);
 
             pkt->nals = tmp;
             memset(pkt->nals + pkt->nals_allocated, 0,
-                   (new_size - pkt->nals_allocated) * sizeof(*tmp));
+                   (new_size - pkt->nals_allocated) * sizeof(*pkt->nals));
+
+            nal = &pkt->nals[pkt->nb_nals];
+            nal->skipped_bytes_pos_size = 1024; // initial buffer size
+            nal->skipped_bytes_pos = av_malloc_array(nal->skipped_bytes_pos_size, sizeof(*nal->skipped_bytes_pos));
+            if (!nal->skipped_bytes_pos)
+                return AVERROR(ENOMEM);
+
             pkt->nals_allocated = new_size;
         }
-        nal = &pkt->nals[pkt->nb_nals++];
+        nal = &pkt->nals[pkt->nb_nals];
 
-        consumed = ff_h2645_extract_rbsp(bc.buffer, extract_length, nal);
+        consumed = ff_h2645_extract_rbsp(bc.buffer, extract_length, &pkt->rbsp, nal, small_padding);
         if (consumed < 0)
             return consumed;
 
+        if (is_nalff && (extract_length != consumed) && extract_length)
+            av_log(logctx, AV_LOG_DEBUG,
+                   "NALFF: Consumed only %d bytes instead of %d\n",
+                   consumed, extract_length);
+
+        pkt->nb_nals++;
+
         bytestream2_skip(&bc, consumed);
 
         /* see commit 3566042a0 */
@@ -308,7 +497,7 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
             ret = hevc_parse_nal_header(nal, logctx);
         else
             ret = h264_parse_nal_header(nal, logctx);
-        if (ret <= 0) {
+        if (ret <= 0 || nal->size <= 0 || nal->size_bits <= 0) {
             if (ret < 0) {
                 av_log(logctx, AV_LOG_ERROR, "Invalid NAL unit %d, skipping.\n",
                        nal->type);
@@ -323,8 +512,15 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
 void ff_h2645_packet_uninit(H2645Packet *pkt)
 {
     int i;
-    for (i = 0; i < pkt->nals_allocated; i++)
-        av_freep(&pkt->nals[i].rbsp_buffer);
+    for (i = 0; i < pkt->nals_allocated; i++) {
+        av_freep(&pkt->nals[i].skipped_bytes_pos);
+    }
     av_freep(&pkt->nals);
     pkt->nals_allocated = 0;
+    if (pkt->rbsp.rbsp_buffer_ref) {
+        av_buffer_unref(&pkt->rbsp.rbsp_buffer_ref);
+        pkt->rbsp.rbsp_buffer = NULL;
+    } else
+        av_freep(&pkt->rbsp.rbsp_buffer);
+    pkt->rbsp.rbsp_buffer_alloc_size = pkt->rbsp.rbsp_buffer_size = 0;
 }
diff --git a/libavcodec/h2645_parse.h b/libavcodec/h2645_parse.h
index 9cc4441..2c29ca5 100644
--- a/libavcodec/h2645_parse.h
+++ b/libavcodec/h2645_parse.h
@@ -1,20 +1,20 @@
 /*
  * H.264/HEVC common parsing code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,12 +23,14 @@
 
 #include <stdint.h>
 
+#include "libavutil/buffer.h"
 #include "avcodec.h"
 #include "get_bits.h"
 
+#define MAX_MBPAIR_SIZE (256*1024) // a tighter bound could be calculated if someone cares about a few bytes
+
 typedef struct H2645NAL {
     uint8_t *rbsp_buffer;
-    int rbsp_buffer_size;
 
     int size;
     const uint8_t *data;
@@ -54,15 +56,26 @@ typedef struct H2645NAL {
      */
     int temporal_id;
 
+    int skipped_bytes;
+    int skipped_bytes_pos_size;
+    int *skipped_bytes_pos;
     /**
      * H.264 only, nal_ref_idc
      */
     int ref_idc;
 } H2645NAL;
 
+typedef struct H2645RBSP {
+    uint8_t *rbsp_buffer;
+    AVBufferRef *rbsp_buffer_ref;
+    int rbsp_buffer_alloc_size;
+    int rbsp_buffer_size;
+} H2645RBSP;
+
 /* an input packet split into unescaped NAL units */
 typedef struct H2645Packet {
     H2645NAL *nals;
+    H2645RBSP rbsp;
     int nb_nals;
     int nals_allocated;
 } H2645Packet;
@@ -70,19 +83,50 @@ typedef struct H2645Packet {
 /**
  * Extract the raw (unescaped) bitstream.
  */
-int ff_h2645_extract_rbsp(const uint8_t *src, int length,
-                          H2645NAL *nal);
+int ff_h2645_extract_rbsp(const uint8_t *src, int length, H2645RBSP *rbsp,
+                          H2645NAL *nal, int small_padding);
 
 /**
  * Split an input packet into NAL units.
+ *
+ * If data == raw_data holds true for a NAL unit of the returned pkt, then
+ * said NAL unit does not contain any emulation_prevention_three_byte and
+ * the data is contained in the input buffer pointed to by buf.
+ * Otherwise, the unescaped data is part of the rbsp_buffer described by the
+ * packet's H2645RBSP.
+ *
+ * If the packet's rbsp_buffer_ref is not NULL, the underlying AVBuffer must
+ * own rbsp_buffer. If not and rbsp_buffer is not NULL, use_ref must be 0.
+ * If use_ref is set, rbsp_buffer will be reference-counted and owned by
+ * the underlying AVBuffer of rbsp_buffer_ref.
  */
 int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
                           void *logctx, int is_nalff, int nal_length_size,
-                          enum AVCodecID codec_id);
+                          enum AVCodecID codec_id, int small_padding, int use_ref);
 
 /**
  * Free all the allocated memory in the packet.
  */
 void ff_h2645_packet_uninit(H2645Packet *pkt);
 
+static inline int get_nalsize(int nal_length_size, const uint8_t *buf,
+                              int buf_size, int *buf_index, void *logctx)
+{
+    int i, nalsize = 0;
+
+    if (*buf_index >= buf_size - nal_length_size) {
+        // the end of the buffer is reached, refill it
+        return AVERROR(EAGAIN);
+    }
+
+    for (i = 0; i < nal_length_size; i++)
+        nalsize = ((unsigned)nalsize << 8) | buf[(*buf_index)++];
+    if (nalsize <= 0 || nalsize > buf_size - *buf_index) {
+        av_log(logctx, AV_LOG_ERROR,
+               "Invalid NAL unit size (%d > %d).\n", nalsize, buf_size - *buf_index);
+        return AVERROR_INVALIDDATA;
+    }
+    return nalsize;
+}
+
 #endif /* AVCODEC_H2645_PARSE_H */
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index c0b9e30..815149a 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,11 @@
  */
 
 #define CABAC(h) 1
+#define UNCHECKED_BITSTREAM_READER 1
 #define INT_BIT (CHAR_BIT * sizeof(int))
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/timer.h"
 #include "config.h"
 #include "cabac.h"
@@ -44,8 +46,6 @@
 #include "x86/h264_cabac.c"
 #endif
 
-#include <assert.h>
-
 /* Cabac pre state table */
 
 static const int8_t cabac_context_init_I[1024][2] =
@@ -1283,7 +1283,7 @@ void ff_h264_init_cabac_states(const H264Context *h, H264SliceContext *sl)
 
 static int decode_cabac_field_decoding_flag(const H264Context *h, H264SliceContext *sl)
 {
-    const long mbb_xy = sl->mb_xy - 2L*h->mb_stride;
+    const int mbb_xy = sl->mb_xy - 2*h->mb_stride;
 
     unsigned long ctx = 0;
 
@@ -1500,7 +1500,7 @@ static int decode_cabac_mb_mvd(H264SliceContext *sl, int ctxbase, int amvd, int
     int mvd;
 
     if(!get_cabac(&sl->cabac, &sl->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
-//    if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
+//    if(!get_cabac(&sl->cabac, &sl->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
         *mvda= 0;
         return 0;
     }
@@ -1539,8 +1539,12 @@ static int decode_cabac_mb_mvd(H264SliceContext *sl, int ctxbase, int amvd, int
     int amvd1 = sl->mvd_cache[list][scan8[n] - 1][1] +\
                 sl->mvd_cache[list][scan8[n] - 8][1];\
 \
-    mx += decode_cabac_mb_mvd(sl, 40, amvd0, &mpx);\
-    my += decode_cabac_mb_mvd(sl, 47, amvd1, &mpy);\
+    int mxd = decode_cabac_mb_mvd(sl, 40, amvd0, &mpx);\
+    int myd = decode_cabac_mb_mvd(sl, 47, amvd1, &mpy);\
+    if (mxd == INT_MIN || myd == INT_MIN) \
+        return AVERROR_INVALIDDATA; \
+    mx += mxd;\
+    my += myd;\
 }
 
 static av_always_inline int get_cabac_cbf_ctx(H264SliceContext *sl,
@@ -1639,7 +1643,9 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
     cc.range     = sl->cabac.range;
     cc.low       = sl->cabac.low;
     cc.bytestream= sl->cabac.bytestream;
+#if !UNCHECKED_BITSTREAM_READER || ARCH_AARCH64
     cc.bytestream_end = sl->cabac.bytestream_end;
+#endif
 #else
 #define CC &sl->cabac
 #endif
@@ -1688,7 +1694,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
         }
 #endif
     }
-    assert(coeff_count > 0);
+    av_assert2(coeff_count > 0);
 
     if( is_dc ) {
         if( cat == 3 )
@@ -1700,7 +1706,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
         if( max_coeff == 64 )
             fill_rectangle(&sl->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
         else {
-            assert( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
+            av_assert2( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
             sl->non_zero_count_cache[scan8[n]] = coeff_count;
         }
     }
@@ -1719,7 +1725,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
                 ((type*)block)[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6; \
             } \
         } else { \
-            int coeff_abs = 2; \
+            unsigned coeff_abs = 2; \
             ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \
             node_ctx = coeff_abs_level_transition[1][node_ctx]; \
 \
@@ -1729,7 +1735,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
 \
             if( coeff_abs >= 15 ) { \
                 int j = 0; \
-                while (get_cabac_bypass(CC) && j < 30) { \
+                while (get_cabac_bypass(CC) && j < 16+7) { \
                     j++; \
                 } \
 \
@@ -1737,7 +1743,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
                 while( j-- ) { \
                     coeff_abs += coeff_abs + get_cabac_bypass( CC ); \
                 } \
-                coeff_abs+= 14; \
+                coeff_abs+= 14U; \
             } \
 \
             if( is_dc ) { \
@@ -1910,12 +1916,13 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
     const SPS *sps = h->ps.sps;
     int mb_xy;
     int mb_type, partition_count, cbp = 0;
-    int dct8x8_allowed= h->ps.pps->transform_8x8_mode;
-    int decode_chroma = sps->chroma_format_idc == 1 || sps->chroma_format_idc == 2;
+    int dct8x8_allowed = h->ps.pps->transform_8x8_mode;
+    const int decode_chroma = sps->chroma_format_idc == 1 || sps->chroma_format_idc == 2;
     const int pixel_shift = h->pixel_shift;
 
     mb_xy = sl->mb_xy = sl->mb_x + sl->mb_y*h->mb_stride;
 
+    ff_tlog(h->avctx, "pic:%d mb:%d/%d\n", h->poc.frame_num, sl->mb_x, sl->mb_y);
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
         int skip;
         /* a skipped mb needs the aff flag from the following mb */
@@ -1954,7 +1961,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         int ctx = 0;
-        assert(sl->slice_type_nos == AV_PICTURE_TYPE_B);
+        av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_B);
 
         if (!IS_DIRECT(sl->left_type[LTOP] - 1))
             ctx++;
@@ -2007,7 +2014,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
         mb_type = decode_cabac_intra_mb_type(sl, 3, 1);
         if (sl->slice_type == AV_PICTURE_TYPE_SI && mb_type)
             mb_type--;
-        assert(sl->slice_type_nos == AV_PICTURE_TYPE_I);
+        av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_I);
 decode_intra_mb:
         partition_count = 0;
         cbp                      = ff_h264_i_mb_type_info[mb_type].cbp;
@@ -2023,6 +2030,7 @@ decode_intra_mb:
         const int mb_size = ff_h264_mb_sizes[sps->chroma_format_idc] *
                             sps->bit_depth_luma >> 3;
         const uint8_t *ptr;
+        int ret;
 
         // We assume these blocks are very rare so we do not optimize it.
         // FIXME The two following lines get the bitstream position in the cabac
@@ -2039,7 +2047,9 @@ decode_intra_mb:
         sl->intra_pcm_ptr = ptr;
         ptr += mb_size;
 
-        ff_init_cabac_decoder(&sl->cabac, ptr, sl->cabac.bytestream_end - ptr);
+        ret = ff_init_cabac_decoder(&sl->cabac, ptr, sl->cabac.bytestream_end - ptr);
+        if (ret < 0)
+            return ret;
 
         // All blocks are present
         h->cbp_table[mb_xy] = 0xf7ef;
@@ -2070,7 +2080,7 @@ decode_intra_mb:
                     int pred = pred_intra_mode(h, sl, i);
                     sl->intra4x4_pred_mode_cache[scan8[i]] = decode_cabac_mb_intra4x4_pred_mode(sl, pred);
 
-                    ff_dlog(h->avctx, "i4x4 pred=%d mode=%d\n", pred,
+                    ff_tlog(h->avctx, "i4x4 pred=%d mode=%d\n", pred,
                             sl->intra4x4_pred_mode_cache[scan8[i]]);
                 }
             }
@@ -2125,10 +2135,10 @@ decode_intra_mb:
                 for( i = 0; i < 4; i++ ) {
                     if(IS_DIRECT(sl->sub_mb_type[i])) continue;
                     if(IS_DIR(sl->sub_mb_type[i], 0, list)){
-                        int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                        unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                         if (rc > 1) {
                             ref[list][i] = decode_cabac_mb_ref(sl, list, 4 * i);
-                            if (ref[list][i] >= (unsigned) rc) {
+                            if (ref[list][i] >= rc) {
                                 av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], rc);
                                 return -1;
                             }
@@ -2211,10 +2221,11 @@ decode_intra_mb:
         if(IS_16X16(mb_type)){
             for (list = 0; list < sl->list_count; list++) {
                 if(IS_DIR(mb_type, 0, list)){
-                    int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                    int ref;
+                    unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                     if (rc > 1) {
                         ref= decode_cabac_mb_ref(sl, list, 0);
-                        if (ref >= (unsigned) rc) {
+                        if (ref >= rc) {
                             av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                             return -1;
                         }
@@ -2239,10 +2250,11 @@ decode_intra_mb:
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         if(IS_DIR(mb_type, i, list)){
-                            int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            int ref;
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc > 1) {
                                 ref= decode_cabac_mb_ref(sl, list, 8 * i);
-                                if (ref >= (unsigned) rc) {
+                                if (ref >= rc) {
                                     av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                                     return -1;
                                 }
@@ -2270,14 +2282,15 @@ decode_intra_mb:
                 }
             }
         }else{
-            assert(IS_8X16(mb_type));
+            av_assert2(IS_8X16(mb_type));
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            int ref;
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc > 1) {
                                 ref = decode_cabac_mb_ref(sl, list, 4 * i);
-                                if (ref >= (unsigned) rc) {
+                                if (ref >= rc) {
                                     av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                                     return -1;
                                 }
@@ -2316,6 +2329,11 @@ decode_intra_mb:
         cbp  = decode_cabac_mb_cbp_luma(sl);
         if(decode_chroma)
             cbp |= decode_cabac_mb_cbp_chroma(sl) << 4;
+    } else {
+        if (!decode_chroma && cbp>15) {
+            av_log(h->avctx, AV_LOG_ERROR, "gray chroma\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     h->cbp_table[mb_xy] = sl->cbp = cbp;
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index d57062b..d82144e 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... cavlc bitstream decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,16 +26,17 @@
  */
 
 #define CABAC(h) 0
+#define UNCHECKED_BITSTREAM_READER 1
 
 #include "internal.h"
 #include "avcodec.h"
 #include "h264dec.h"
 #include "h264_mvpred.h"
 #include "h264data.h"
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "mpegutils.h"
+#include "libavutil/avassert.h"
 
-#include <assert.h>
 
 static const uint8_t golomb_to_inter_cbp_gray[16]={
  0, 1, 2, 4, 8, 3, 5,10,12,15, 7,11,13,14, 6, 9,
@@ -247,19 +248,19 @@ static VLC chroma422_dc_coeff_token_vlc;
 static VLC_TYPE chroma422_dc_coeff_token_vlc_table[8192][2];
 static const int chroma422_dc_coeff_token_vlc_table_size = 8192;
 
-static VLC total_zeros_vlc[15];
+static VLC total_zeros_vlc[15+1];
 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
 static const int total_zeros_vlc_tables_size = 512;
 
-static VLC chroma_dc_total_zeros_vlc[3];
+static VLC chroma_dc_total_zeros_vlc[3+1];
 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
 
-static VLC chroma422_dc_total_zeros_vlc[7];
+static VLC chroma422_dc_total_zeros_vlc[7+1];
 static VLC_TYPE chroma422_dc_total_zeros_vlc_tables[7][32][2];
 static const int chroma422_dc_total_zeros_vlc_tables_size = 32;
 
-static VLC run_vlc[6];
+static VLC run_vlc[6+1];
 static VLC_TYPE run_vlc_tables[6][8][2];
 static const int run_vlc_tables_size = 8;
 
@@ -360,12 +361,12 @@ av_cold void ff_h264_decode_init_vlc(void){
          * the packed static coeff_token_vlc table sizes
          * were initialized correctly.
          */
-        assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
+        av_assert0(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
 
         for(i=0; i<3; i++){
-            chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
-            chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
-            init_vlc(&chroma_dc_total_zeros_vlc[i],
+            chroma_dc_total_zeros_vlc[i+1].table = chroma_dc_total_zeros_vlc_tables[i];
+            chroma_dc_total_zeros_vlc[i+1].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
+            init_vlc(&chroma_dc_total_zeros_vlc[i+1],
                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
                      &chroma_dc_total_zeros_len [i][0], 1, 1,
                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
@@ -373,9 +374,9 @@ av_cold void ff_h264_decode_init_vlc(void){
         }
 
         for(i=0; i<7; i++){
-            chroma422_dc_total_zeros_vlc[i].table = chroma422_dc_total_zeros_vlc_tables[i];
-            chroma422_dc_total_zeros_vlc[i].table_allocated = chroma422_dc_total_zeros_vlc_tables_size;
-            init_vlc(&chroma422_dc_total_zeros_vlc[i],
+            chroma422_dc_total_zeros_vlc[i+1].table = chroma422_dc_total_zeros_vlc_tables[i];
+            chroma422_dc_total_zeros_vlc[i+1].table_allocated = chroma422_dc_total_zeros_vlc_tables_size;
+            init_vlc(&chroma422_dc_total_zeros_vlc[i+1],
                      CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 8,
                      &chroma422_dc_total_zeros_len [i][0], 1, 1,
                      &chroma422_dc_total_zeros_bits[i][0], 1, 1,
@@ -383,9 +384,9 @@ av_cold void ff_h264_decode_init_vlc(void){
         }
 
         for(i=0; i<15; i++){
-            total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
-            total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
-            init_vlc(&total_zeros_vlc[i],
+            total_zeros_vlc[i+1].table = total_zeros_vlc_tables[i];
+            total_zeros_vlc[i+1].table_allocated = total_zeros_vlc_tables_size;
+            init_vlc(&total_zeros_vlc[i+1],
                      TOTAL_ZEROS_VLC_BITS, 16,
                      &total_zeros_len [i][0], 1, 1,
                      &total_zeros_bits[i][0], 1, 1,
@@ -393,9 +394,9 @@ av_cold void ff_h264_decode_init_vlc(void){
         }
 
         for(i=0; i<6; i++){
-            run_vlc[i].table = run_vlc_tables[i];
-            run_vlc[i].table_allocated = run_vlc_tables_size;
-            init_vlc(&run_vlc[i],
+            run_vlc[i+1].table = run_vlc_tables[i];
+            run_vlc[i+1].table_allocated = run_vlc_tables_size;
+            init_vlc(&run_vlc[i+1],
                      RUN_VLC_BITS, 7,
                      &run_len [i][0], 1, 1,
                      &run_bits[i][0], 1, 1,
@@ -476,7 +477,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
 
     trailing_ones= coeff_token&3;
     ff_tlog(h->avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
-    assert(total_coeff<=16);
+    av_assert2(total_coeff<=16);
 
     i = show_bits(gb, 3);
     skip_bits(gb, trailing_ones);
@@ -508,7 +509,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                 else
                     level_code= prefix + get_bits(gb, 4); //part
             }else{
-                level_code= 30 + get_bits(gb, prefix-3); //part
+                level_code= 30;
                 if(prefix>=16){
                     if(prefix > 25+3){
                         av_log(h->avctx, AV_LOG_ERROR, "Invalid level prefix\n");
@@ -516,6 +517,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                     }
                     level_code += (1<<(prefix-3))-4096;
                 }
+                level_code += get_bits(gb, prefix-3); //part
             }
 
             if(trailing_ones < 3) level_code += 2;
@@ -545,9 +547,15 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                 if(prefix<15){
                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
                 }else{
-                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
-                    if(prefix>=16)
+                    level_code = 15<<suffix_length;
+                    if (prefix>=16) {
+                        if(prefix > 25+3){
+                            av_log(h->avctx, AV_LOG_ERROR, "Invalid level prefix\n");
+                            return AVERROR_INVALIDDATA;
+                        }
                         level_code += (1<<(prefix-3))-4096;
+                    }
+                    level_code += get_bits(gb, prefix-3);
                 }
                 mask= -(level_code&1);
                 level_code= (((2+level_code)>>1) ^ mask) - mask;
@@ -562,13 +570,13 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
     else{
         if (max_coeff <= 8) {
             if (max_coeff == 4)
-                zeros_left = get_vlc2(gb, chroma_dc_total_zeros_vlc[total_coeff - 1].table,
+                zeros_left = get_vlc2(gb, chroma_dc_total_zeros_vlc[total_coeff].table,
                                       CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
             else
-                zeros_left = get_vlc2(gb, chroma422_dc_total_zeros_vlc[total_coeff - 1].table,
+                zeros_left = get_vlc2(gb, chroma422_dc_total_zeros_vlc[total_coeff].table,
                                       CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 1);
         } else {
-            zeros_left= get_vlc2(gb, total_zeros_vlc[total_coeff - 1].table, TOTAL_ZEROS_VLC_BITS, 1);
+            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
         }
     }
 
@@ -578,11 +586,9 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         ((type*)block)[*scantable] = level[0]; \
         for(i=1;i<total_coeff && zeros_left > 0;i++) { \
             if(zeros_left < 7) \
-                run_before= get_vlc2(gb, run_vlc[zeros_left - 1].table, RUN_VLC_BITS, 1); \
-            else {\
+                run_before= get_vlc2(gb, run_vlc[zeros_left].table, RUN_VLC_BITS, 1); \
+            else \
                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
-                run_before = FFMIN(zeros_left, run_before);\
-            }\
             zeros_left -= run_before; \
             scantable -= 1 + run_before; \
             ((type*)block)[*scantable]= level[i]; \
@@ -595,11 +601,9 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         ((type*)block)[*scantable] = ((int)(level[0] * qmul[*scantable] + 32))>>6; \
         for(i=1;i<total_coeff && zeros_left > 0;i++) { \
             if(zeros_left < 7) \
-                run_before= get_vlc2(gb, run_vlc[zeros_left - 1].table, RUN_VLC_BITS, 1); \
-            else {\
+                run_before= get_vlc2(gb, run_vlc[zeros_left].table, RUN_VLC_BITS, 1); \
+            else \
                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
-                run_before = FFMIN(zeros_left, run_before);\
-            }\
             zeros_left -= run_before; \
             scantable -= 1 + run_before; \
             ((type*)block)[*scantable]= ((int)(level[i] * qmul[*scantable] + 32))>>6; \
@@ -610,18 +614,17 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         } \
     }
 
-    if (zeros_left < 0) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "negative number of zero coeffs at %d %d\n", sl->mb_x, sl->mb_y);
-        return AVERROR_INVALIDDATA;
-    }
-
     if (h->pixel_shift) {
         STORE_BLOCK(int32_t)
     } else {
         STORE_BLOCK(int16_t)
     }
 
+    if(zeros_left<0){
+        av_log(h->avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", sl->mb_x, sl->mb_y);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -642,7 +645,7 @@ int decode_luma_residual(const H264Context *h, H264SliceContext *sl,
             return -1; //FIXME continue if partitioned and other return -1 too
         }
 
-        assert((cbp&15) == 0 || (cbp&15) == 15);
+        av_assert2((cbp&15) == 0 || (cbp&15) == 15);
 
         if(cbp&15){
             for(i8x8=0; i8x8<4; i8x8++){
@@ -701,17 +704,24 @@ int ff_h264_decode_mb_cavlc(const H264Context *h, H264SliceContext *sl)
     int mb_xy;
     int partition_count;
     unsigned int mb_type, cbp;
-    int dct8x8_allowed= h->ps.pps->transform_8x8_mode;
-    int decode_chroma = h->ps.sps->chroma_format_idc == 1 || h->ps.sps->chroma_format_idc == 2;
+    int dct8x8_allowed = h->ps.pps->transform_8x8_mode;
+    const int decode_chroma = h->ps.sps->chroma_format_idc == 1 || h->ps.sps->chroma_format_idc == 2;
     const int pixel_shift = h->pixel_shift;
 
     mb_xy = sl->mb_xy = sl->mb_x + sl->mb_y*h->mb_stride;
 
+    ff_tlog(h->avctx, "pic:%d mb:%d/%d\n", h->poc.frame_num, sl->mb_x, sl->mb_y);
     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                 down the code */
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
-        if (sl->mb_skip_run == -1)
-            sl->mb_skip_run = get_ue_golomb(&sl->gb);
+        if (sl->mb_skip_run == -1) {
+            unsigned mb_skip_run = get_ue_golomb_long(&sl->gb);
+            if (mb_skip_run > h->mb_num) {
+                av_log(h->avctx, AV_LOG_ERROR, "mb_skip_run %d is invalid\n", mb_skip_run);
+                return AVERROR_INVALIDDATA;
+            }
+            sl->mb_skip_run = mb_skip_run;
+        }
 
         if (sl->mb_skip_run--) {
             if (FRAME_MBAFF(h) && (sl->mb_y & 1) == 0) {
@@ -747,7 +757,7 @@ int ff_h264_decode_mb_cavlc(const H264Context *h, H264SliceContext *sl)
             goto decode_intra_mb;
         }
     }else{
-       assert(sl->slice_type_nos == AV_PICTURE_TYPE_I);
+       av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_I);
         if (sl->slice_type == AV_PICTURE_TYPE_SI && mb_type)
             mb_type--;
 decode_intra_mb:
@@ -856,7 +866,7 @@ decode_intra_mb:
                 sl->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
             }
         }else{
-            assert(sl->slice_type_nos == AV_PICTURE_TYPE_P); //FIXME SP correct ?
+            av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_P); //FIXME SP correct ?
             for(i=0; i<4; i++){
                 sl->sub_mb_type[i]= get_ue_golomb_31(&sl->gb);
                 if(sl->sub_mb_type[i] >=4){
@@ -949,7 +959,7 @@ decode_intra_mb:
             for (list = 0; list < sl->list_count; list++) {
                     unsigned int val;
                     if(IS_DIR(mb_type, 0, list)){
-                        int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                        unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                         if (rc == 1) {
                             val= 0;
                         } else if (rc == 2) {
@@ -980,7 +990,7 @@ decode_intra_mb:
                     for(i=0; i<2; i++){
                         unsigned int val;
                         if(IS_DIR(mb_type, i, list)){
-                            int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc == 1) {
                                 val= 0;
                             } else if (rc == 2) {
@@ -1013,12 +1023,12 @@ decode_intra_mb:
                 }
             }
         }else{
-            assert(IS_8X16(mb_type));
+            av_assert2(IS_8X16(mb_type));
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         unsigned int val;
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc == 1) {
                                 val= 0;
                             } else if (rc == 2) {
@@ -1076,6 +1086,11 @@ decode_intra_mb:
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
             else                     cbp= golomb_to_inter_cbp_gray[cbp];
         }
+    } else {
+        if (!decode_chroma && cbp>15) {
+            av_log(h->avctx, AV_LOG_ERROR, "gray chroma\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
@@ -1095,13 +1110,14 @@ decode_intra_mb:
 
         dquant= get_se_golomb(&sl->gb);
 
-        sl->qscale += dquant;
+        sl->qscale += (unsigned)dquant;
 
         if (((unsigned)sl->qscale) > max_qp){
             if (sl->qscale < 0) sl->qscale += max_qp + 1;
             else                sl->qscale -= max_qp+1;
             if (((unsigned)sl->qscale) > max_qp){
                 av_log(h->avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, sl->mb_x, sl->mb_y);
+                sl->qscale = max_qp;
                 return -1;
             }
         }
@@ -1128,12 +1144,15 @@ decode_intra_mb:
             if (decode_luma_residual(h, sl, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ) {
                 return -1;
             }
-        } else if (CHROMA422(h)) {
+        } else {
+            const int num_c8x8 = h->ps.sps->chroma_format_idc;
+
             if(cbp&0x30){
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                     if (decode_residual(h, sl, gb, sl->mb + ((256 + 16*16*chroma_idx) << pixel_shift),
-                                        CHROMA_DC_BLOCK_INDEX + chroma_idx, ff_h264_chroma422_dc_scan,
-                                        NULL, 8) < 0) {
+                                        CHROMA_DC_BLOCK_INDEX + chroma_idx,
+                                        CHROMA422(h) ? ff_h264_chroma422_dc_scan : ff_h264_chroma_dc_scan,
+                                        NULL, 4 * num_c8x8) < 0) {
                         return -1;
                     }
             }
@@ -1142,7 +1161,7 @@ decode_intra_mb:
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                     const uint32_t *qmul = h->ps.pps->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][sl->chroma_qp[chroma_idx]];
                     int16_t *mb = sl->mb + (16*(16 + 16*chroma_idx) << pixel_shift);
-                    for (i8x8 = 0; i8x8 < 2; i8x8++) {
+                    for (i8x8 = 0; i8x8<num_c8x8; i8x8++) {
                         for (i4x4 = 0; i4x4 < 4; i4x4++) {
                             const int index = 16 + 16*chroma_idx + 8*i8x8 + i4x4;
                             if (decode_residual(h, sl, gb, mb, index, scan + 1, qmul, 15) < 0)
@@ -1155,29 +1174,6 @@ decode_intra_mb:
                 fill_rectangle(&sl->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
                 fill_rectangle(&sl->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
             }
-        } else /* yuv420 */ {
-            if(cbp&0x30){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                    if (decode_residual(h, sl, gb, sl->mb + ((256 + 16 * 16 * chroma_idx) << pixel_shift),
-                                        CHROMA_DC_BLOCK_INDEX + chroma_idx, ff_h264_chroma_dc_scan, NULL, 4) < 0) {
-                        return -1;
-                    }
-            }
-
-            if(cbp&0x20){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
-                    const uint32_t *qmul = h->ps.pps->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][sl->chroma_qp[chroma_idx]];
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= 16 + 16*chroma_idx + i4x4;
-                        if( decode_residual(h, sl, gb, sl->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
-                            return -1;
-                        }
-                    }
-                }
-            }else{
-                fill_rectangle(&sl->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-                fill_rectangle(&sl->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
-            }
         }
     }else{
         fill_rectangle(&sl->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
diff --git a/libavcodec/h264_direct.c b/libavcodec/h264_direct.c
index abac259..a01d823 100644
--- a/libavcodec/h264_direct.c
+++ b/libavcodec/h264_direct.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,12 +39,22 @@ static int get_scale_factor(H264SliceContext *sl,
                             int poc, int poc1, int i)
 {
     int poc0 = sl->ref_list[0][i].poc;
-    int td = av_clip_int8(poc1 - poc0);
+    int64_t pocdiff = poc1 - (int64_t)poc0;
+    int td = av_clip_int8(pocdiff);
+
+    if (pocdiff != (int)pocdiff)
+        avpriv_request_sample(sl->h264->avctx, "pocdiff overflow\n");
+
     if (td == 0 || sl->ref_list[0][i].parent->long_ref) {
         return 256;
     } else {
-        int tb = av_clip_int8(poc - poc0);
+        int64_t pocdiff0 = poc - (int64_t)poc0;
+        int tb = av_clip_int8(pocdiff0);
         int tx = (16384 + (FFABS(td) >> 1)) / td;
+
+        if (pocdiff0 != (int)pocdiff0)
+            av_log(sl->h264->avctx, AV_LOG_DEBUG, "pocdiff0 overflow\n");
+
         return av_clip_intp2((tb * tx + 32) >> 6, 10);
     }
 }
@@ -128,7 +138,11 @@ void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *
         memcpy(cur->ref_poc[1],   cur->ref_poc[0],   sizeof(cur->ref_poc[0]));
     }
 
-    cur->mbaff = FRAME_MBAFF(h);
+    if (h->current_slice == 0) {
+        cur->mbaff = FRAME_MBAFF(h);
+    } else {
+        av_assert0(cur->mbaff == FRAME_MBAFF(h));
+    }
 
     sl->col_fieldoff = 0;
 
@@ -138,8 +152,12 @@ void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *
     if (h->picture_structure == PICT_FRAME) {
         int cur_poc  = h->cur_pic_ptr->poc;
         int *col_poc = sl->ref_list[1][0].parent->field_poc;
-        sl->col_parity = (FFABS(col_poc[0] - cur_poc) >=
-                          FFABS(col_poc[1] - cur_poc));
+        if (col_poc[0] == INT_MAX && col_poc[1] == INT_MAX) {
+            av_log(h->avctx, AV_LOG_ERROR, "co located POCs unavailable\n");
+            sl->col_parity = 1;
+        } else
+            sl->col_parity = (FFABS(col_poc[0] - (int64_t)cur_poc) >=
+                              FFABS(col_poc[1] - (int64_t)cur_poc));
         ref1sidx =
         sidx     = sl->col_parity;
     // FL -> FL & differ parity
@@ -238,6 +256,7 @@ static void pred_spatial_direct_motion(const H264Context *const h, H264SliceCont
                 else
                     mv[list] = AV_RN32A(C);
             }
+            av_assert2(ref[list] < (sl->ref_count[list] << !!FRAME_MBAFF(h)));
         } else {
             int mask = ~(MB_TYPE_L0 << (2 * list));
             mv[list]  = 0;
@@ -323,8 +342,8 @@ single_col:
 
     await_reference_mb_row(h, &sl->ref_list[1][0], mb_y);
 
-    l1mv0  = &sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
-    l1mv1  = &sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
+    l1mv0  = (void*)&sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
+    l1mv1  = (void*)&sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
     l1ref0 = &sl->ref_list[1][0].parent->ref_index[0][4 * mb_xy];
     l1ref1 = &sl->ref_list[1][0].parent->ref_index[1][4 * mb_xy];
     if (!b8_stride) {
@@ -547,8 +566,8 @@ single_col:
 
     await_reference_mb_row(h, &sl->ref_list[1][0], mb_y);
 
-    l1mv0  = &sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
-    l1mv1  = &sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
+    l1mv0  = (void*)&sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
+    l1mv1  = (void*)&sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
     l1ref0 = &sl->ref_list[1][0].parent->ref_index[0][4 * mb_xy];
     l1ref1 = &sl->ref_list[1][0].parent->ref_index[1][4 * mb_xy];
     if (!b8_stride) {
@@ -609,7 +628,7 @@ single_col:
 
                 {
                     const int16_t *mv_col = l1mv[x8 * 3 + y8 * b4_stride];
-                    int my_col            = (mv_col[1] << y_shift) / 2;
+                    int my_col            = (mv_col[1] * (1 << y_shift)) / 2;
                     int mx                = (scale * mv_col[0] + 128) >> 8;
                     int my                = (scale * my_col    + 128) >> 8;
                     fill_rectangle(&sl->mv_cache[0][scan8[i8 * 4]], 2, 2, 8,
diff --git a/libavcodec/h264_levels.c b/libavcodec/h264_levels.c
new file mode 100644
index 0000000..7a55116
--- /dev/null
+++ b/libavcodec/h264_levels.c
@@ -0,0 +1,130 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "h264_levels.h"
+
+// H.264 table A-1.
+static const H264LevelDescriptor h264_levels[] = {
+    // Name          MaxMBPS                   MaxBR              MinCR
+    //  | level_idc     |       MaxFS            |    MaxCPB        | MaxMvsPer2Mb
+    //  |     | cs3f    |         |  MaxDpbMbs   |       |  MaxVmvR |   |
+    { "1",   10, 0,     1485,     99,    396,     64,    175,   64, 2,  0 },
+    { "1b",  11, 1,     1485,     99,    396,    128,    350,   64, 2,  0 },
+    { "1b",   9, 0,     1485,     99,    396,    128,    350,   64, 2,  0 },
+    { "1.1", 11, 0,     3000,    396,    900,    192,    500,  128, 2,  0 },
+    { "1.2", 12, 0,     6000,    396,   2376,    384,   1000,  128, 2,  0 },
+    { "1.3", 13, 0,    11880,    396,   2376,    768,   2000,  128, 2,  0 },
+    { "2",   20, 0,    11880,    396,   2376,   2000,   2000,  128, 2,  0 },
+    { "2.1", 21, 0,    19800,    792,   4752,   4000,   4000,  256, 2,  0 },
+    { "2.2", 22, 0,    20250,   1620,   8100,   4000,   4000,  256, 2,  0 },
+    { "3",   30, 0,    40500,   1620,   8100,  10000,  10000,  256, 2, 32 },
+    { "3.1", 31, 0,   108000,   3600,  18000,  14000,  14000,  512, 4, 16 },
+    { "3.2", 32, 0,   216000,   5120,  20480,  20000,  20000,  512, 4, 16 },
+    { "4",   40, 0,   245760,   8192,  32768,  20000,  25000,  512, 4, 16 },
+    { "4.1", 41, 0,   245760,   8192,  32768,  50000,  62500,  512, 2, 16 },
+    { "4.2", 42, 0,   522240,   8704,  34816,  50000,  62500,  512, 2, 16 },
+    { "5",   50, 0,   589824,  22080, 110400, 135000, 135000,  512, 2, 16 },
+    { "5.1", 51, 0,   983040,  36864, 184320, 240000, 240000,  512, 2, 16 },
+    { "5.2", 52, 0,  2073600,  36864, 184320, 240000, 240000,  512, 2, 16 },
+    { "6",   60, 0,  4177920, 139264, 696320, 240000, 240000, 8192, 2, 16 },
+    { "6.1", 61, 0,  8355840, 139264, 696320, 480000, 480000, 8192, 2, 16 },
+    { "6.2", 62, 0, 16711680, 139264, 696320, 800000, 800000, 8192, 2, 16 },
+};
+
+// H.264 table A-2 plus values from A-1.
+static const struct {
+    int profile_idc;
+    int cpb_br_vcl_factor;
+    int cpb_br_nal_factor;
+} h264_br_factors[] = {
+    {  66, 1000, 1200 },
+    {  77, 1000, 1200 },
+    {  88, 1000, 1200 },
+    { 100, 1250, 1500 },
+    { 110, 3000, 3600 },
+    { 122, 4000, 4800 },
+    { 244, 4000, 4800 },
+    {  44, 4000, 4800 },
+};
+
+// We are only ever interested in the NAL bitrate factor.
+static int h264_get_br_factor(int profile_idc)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(h264_br_factors); i++) {
+        if (h264_br_factors[i].profile_idc == profile_idc)
+            return h264_br_factors[i].cpb_br_nal_factor;
+    }
+    // Default to the non-high profile value if not specified.
+    return 1200;
+}
+
+const H264LevelDescriptor *ff_h264_get_level(int level_idc,
+                                             int constraint_set3_flag)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(h264_levels); i++) {
+        if (h264_levels[i].level_idc            == level_idc &&
+            h264_levels[i].constraint_set3_flag == constraint_set3_flag)
+            return &h264_levels[i];
+    }
+    return NULL;
+}
+
+const H264LevelDescriptor *ff_h264_guess_level(int profile_idc,
+                                               int64_t bitrate,
+                                               int width, int height,
+                                               int max_dec_frame_buffering)
+{
+    int width_mbs  = (width  + 15) / 16;
+    int height_mbs = (height + 15) / 16;
+    int no_cs3f = !(profile_idc == 66 ||
+                    profile_idc == 77 ||
+                    profile_idc == 88);
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h264_levels); i++) {
+        const H264LevelDescriptor *level = &h264_levels[i];
+
+        if (level->constraint_set3_flag && no_cs3f)
+            continue;
+
+        if (bitrate > (int64_t)level->max_br * h264_get_br_factor(profile_idc))
+            continue;
+
+        if (width_mbs  * height_mbs > level->max_fs)
+            continue;
+        if (width_mbs  * width_mbs  > 8 * level->max_fs)
+            continue;
+        if (height_mbs * height_mbs > 8 * level->max_fs)
+            continue;
+
+        if (width_mbs && height_mbs) {
+            int max_dpb_frames =
+                FFMIN(level->max_dpb_mbs / (width_mbs * height_mbs), 16);
+            if (max_dec_frame_buffering > max_dpb_frames)
+                continue;
+        }
+
+        return level;
+    }
+
+    // No usable levels found - frame is too big or bitrate is too high.
+    return NULL;
+}
diff --git a/libavcodec/h264_levels.h b/libavcodec/h264_levels.h
new file mode 100644
index 0000000..4189fc6
--- /dev/null
+++ b/libavcodec/h264_levels.h
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_H264_LEVELS_H
+#define AVCODEC_H264_LEVELS_H
+
+
+#include <stdint.h>
+
+typedef struct H264LevelDescriptor {
+    const char *name;
+    uint8_t     level_idc;
+    uint8_t     constraint_set3_flag;
+    uint32_t    max_mbps;
+    uint32_t    max_fs;
+    uint32_t    max_dpb_mbs;
+    uint32_t    max_br;
+    uint32_t    max_cpb;
+    uint16_t    max_v_mv_r;
+    uint8_t     min_cr;
+    uint8_t     max_mvs_per_2mb;
+} H264LevelDescriptor;
+
+const H264LevelDescriptor *ff_h264_get_level(int level_idc,
+                                             int constraint_set3_flag);
+
+/**
+ * Guess the level of a stream from some parameters.
+ *
+ * Unknown parameters may be zero, in which case they are ignored.
+ */
+const H264LevelDescriptor *ff_h264_guess_level(int profile_idc,
+                                               int64_t bitrate,
+                                               int width, int height,
+                                               int max_dec_frame_buffering);
+
+
+#endif /* AVCODEC_H264_LEVELS_H */
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index f39b951..0924f32 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... loop filter
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,6 @@
 #include "mpegutils.h"
 #include "rectangle.h"
 
-#include <assert.h>
-
 /* Deblocking filter (p153) */
 static const uint8_t alpha_table[52*3] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
@@ -245,7 +243,7 @@ static av_always_inline void h264_filter_mb_fast_internal(const H264Context *h,
                                                           unsigned int uvlinesize,
                                                           int pixel_shift)
 {
-    int chroma = !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
+    int chroma = CHROMA(h) && !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
     int chroma444 = CHROMA444(h);
     int chroma422 = CHROMA422(h);
 
@@ -359,7 +357,7 @@ static av_always_inline void h264_filter_mb_fast_internal(const H264Context *h,
         }
         return;
     } else {
-        LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]);
+        LOCAL_ALIGNED(8, int16_t, bS, [2], [4][4]);
         int edges;
         if( IS_8x8DCT(mb_type) && (sl->cbp&7) == 7 && !chroma444 ) {
             edges = 4;
@@ -422,7 +420,7 @@ void ff_h264_filter_mb_fast(const H264Context *h, H264SliceContext *sl,
                             uint8_t *img_cb, uint8_t *img_cr,
                             unsigned int linesize, unsigned int uvlinesize)
 {
-    assert(!FRAME_MBAFF(h));
+    av_assert2(!FRAME_MBAFF(h));
     if(!h->h264dsp.h264_loop_filter_strength || h->ps.pps->chroma_qp_diff) {
         ff_h264_filter_mb(h, sl, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
         return;
@@ -508,7 +506,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
             int j;
 
             for(j=0; j<2; j++, mbn_xy += h->mb_stride){
-                DECLARE_ALIGNED(8, int16_t, bS)[4];
+                LOCAL_ALIGNED(8, int16_t, bS, [4]);
                 int qp;
                 if (IS_INTRA(mb_type | h->cur_pic.mb_type[mbn_xy])) {
                     AV_WN64A(bS, 0x0003000300030003ULL);
@@ -545,7 +543,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
                 }
             }
         }else{
-            DECLARE_ALIGNED(8, int16_t, bS)[4];
+            LOCAL_ALIGNED(8, int16_t, bS, [4]);
             int qp;
 
             if( IS_INTRA(mb_type|mbm_type)) {
@@ -594,7 +592,9 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
             // value in IPCM macroblocks.
             if(bS[0]+bS[1]+bS[2]+bS[3]){
                 qp = (h->cur_pic.qscale_table[mb_xy] + h->cur_pic.qscale_table[mbm_xy] + 1) >> 1;
+                //ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], h->cur_pic.qscale_table[mbn_xy]);
                 ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+                //{ int i; for (i = 0; i < 4; i++) ff_tlog(h->avctx, " bS[%d]:%d", i, bS[i]); ff_tlog(h->avctx, "\n"); }
                 chroma_qp_avg[0] = (sl->chroma_qp[0] + get_chroma_qp(h->ps.pps, 0, h->cur_pic.qscale_table[mbm_xy]) + 1) >> 1;
                 chroma_qp_avg[1] = (sl->chroma_qp[1] + get_chroma_qp(h->ps.pps, 1, h->cur_pic.qscale_table[mbm_xy]) + 1) >> 1;
                 if( dir == 0 ) {
@@ -626,7 +626,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
 
     /* Calculate bS */
     for( edge = 1; edge < edges; edge++ ) {
-        DECLARE_ALIGNED(8, int16_t, bS)[4];
+        LOCAL_ALIGNED(8, int16_t, bS, [4]);
         int qp;
         const int deblock_edge = !IS_8x8DCT(mb_type & (edge<<24)); // (edge&1) && IS_8x8DCT(mb_type)
 
@@ -677,7 +677,9 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
         // Do not use s->qscale as luma quantizer because it has not the same
         // value in IPCM macroblocks.
         qp = h->cur_pic.qscale_table[mb_xy];
+        //ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], h->cur_pic.qscale_table[mbn_xy]);
         ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+        //{ int i; for (i = 0; i < 4; i++) ff_tlog(h->avctx, " bS[%d]:%d", i, bS[i]); ff_tlog(h->avctx, "\n"); }
         if( dir == 0 ) {
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, a, b, h, 0 );
             if (chroma) {
@@ -722,7 +724,7 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl,
     const int mb_type = h->cur_pic.mb_type[mb_xy];
     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
     int first_vertical_edge_done = 0;
-    int chroma = !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
+    int chroma = CHROMA(h) && !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
     int qp_bd_offset = 6 * (h->ps.sps->bit_depth_luma - 8);
     int a = 52 + sl->slice_alpha_c0_offset - qp_bd_offset;
     int b = 52 + sl->slice_beta_offset - qp_bd_offset;
@@ -735,7 +737,7 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl,
         /* First vertical edge is different in MBAFF frames
          * There are 8 different bS to compute and 2 different Qp
          */
-        DECLARE_ALIGNED(8, int16_t, bS)[8];
+        LOCAL_ALIGNED(8, int16_t, bS, [8]);
         int qp[2];
         int bqp[2];
         int rqp[2];
diff --git a/libavcodec/h264_mb.c b/libavcodec/h264_mb.c
index 51d73ce..3cd17b7 100644
--- a/libavcodec/h264_mb.c
+++ b/libavcodec/h264_mb.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,17 +40,17 @@ static inline int get_lowest_part_list_y(H264SliceContext *sl,
                                          int n, int height, int y_offset, int list)
 {
     int raw_my             = sl->mv_cache[list][scan8[n]][1];
-    int filter_height_up   = (raw_my & 3) ? 2 : 0;
     int filter_height_down = (raw_my & 3) ? 3 : 0;
     int full_my            = (raw_my >> 2) + y_offset;
-    int top                = full_my - filter_height_up;
     int bottom             = full_my + filter_height_down + height;
 
-    return FFMAX(abs(top), bottom);
+    av_assert2(height >= 0);
+
+    return FFMAX(0, bottom);
 }
 
 static inline void get_lowest_part_y(const H264Context *h, H264SliceContext *sl,
-                                     int refs[2][48], int n,
+                                     int16_t refs[2][48], int n,
                                      int height, int y_offset, int list0,
                                      int list1, int *nrefs)
 {
@@ -97,7 +97,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
 {
     const int mb_xy   = sl->mb_xy;
     const int mb_type = h->cur_pic.mb_type[mb_xy];
-    int refs[2][48];
+    int16_t refs[2][48];
     int nrefs[2] = { 0 };
     int ref, list;
 
@@ -119,7 +119,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
     } else {
         int i;
 
-        assert(IS_8X8(mb_type));
+        av_assert2(IS_8X8(mb_type));
 
         for (i = 0; i < 4; i++) {
             const int sub_mb_type = sl->sub_mb_type[i];
@@ -151,7 +151,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
                                   nrefs);
             } else {
                 int j;
-                assert(IS_SUB_4X4(sub_mb_type));
+                av_assert2(IS_SUB_4X4(sub_mb_type));
                 for (j = 0; j < 4; j++) {
                     int sub_y_offset = y_offset + 2 * (j & 2);
                     get_lowest_part_y(h, sl, refs, n + j, 4, sub_y_offset,
@@ -176,6 +176,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
                 nrefs[list]--;
 
                 if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields
+                    av_assert2((ref_pic->parent->reference & 3) == 3);
                     ff_thread_await_progress(&ref_pic->parent->tf,
                                              FFMIN((row >> 1) - !(row & 1),
                                                    pic_height - 1),
@@ -215,7 +216,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
     const int mx      = sl->mv_cache[list][scan8[n]][0] + src_x_offset * 8;
     int my            = sl->mv_cache[list][scan8[n]][1] + src_y_offset * 8;
     const int luma_xy = (mx & 3) + ((my & 3) << 2);
-    ptrdiff_t offset  = ((mx >> 2) << pixel_shift) + (my >> 2) * sl->mb_linesize;
+    ptrdiff_t offset  = (mx >> 2) * (1 << pixel_shift) + (my >> 2) * sl->mb_linesize;
     uint8_t *src_y    = pic->data[0] + offset;
     uint8_t *src_cb, *src_cr;
     int extra_width  = 0;
@@ -290,9 +291,9 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
         emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1);
     }
 
-    src_cb = pic->data[1] + ((mx >> 3) << pixel_shift) +
+    src_cb = pic->data[1] + ((mx >> 3) * (1 << pixel_shift)) +
              (my >> ysh) * sl->mb_uvlinesize;
-    src_cr = pic->data[2] + ((mx >> 3) << pixel_shift) +
+    src_cr = pic->data[2] + ((mx >> 3) * (1 << pixel_shift)) +
              (my >> ysh) * sl->mb_uvlinesize;
 
     if (emu) {
@@ -304,7 +305,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
     }
     chroma_op(dest_cb, src_cb, sl->mb_uvlinesize,
               height >> (chroma_idc == 1 /* yuv420 */),
-              mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
+              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
 
     if (emu) {
         h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cr,
@@ -314,7 +315,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
         src_cr = sl->edge_emu_buffer;
     }
     chroma_op(dest_cr, src_cr, sl->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
-              mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
+              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
 }
 
 static av_always_inline void mc_part_std(const H264Context *h, H264SliceContext *sl,
@@ -424,10 +425,12 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
             int weight1 = 64 - weight0;
             luma_weight_avg(dest_y, tmp_y, sl->mb_linesize,
                             height, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
-                              chroma_height, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
-                              chroma_height, 5, weight0, weight1, 0);
+            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
+                                  chroma_height, 5, weight0, weight1, 0);
+                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
+                                  chroma_height, 5, weight0, weight1, 0);
+            }
         } else {
             luma_weight_avg(dest_y, tmp_y, sl->mb_linesize, height,
                             sl->pwt.luma_log2_weight_denom,
@@ -435,18 +438,20 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
                             sl->pwt.luma_weight[refn1][1][0],
                             sl->pwt.luma_weight[refn0][0][1] +
                             sl->pwt.luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
-                              sl->pwt.chroma_log2_weight_denom,
-                              sl->pwt.chroma_weight[refn0][0][0][0],
-                              sl->pwt.chroma_weight[refn1][1][0][0],
-                              sl->pwt.chroma_weight[refn0][0][0][1] +
-                              sl->pwt.chroma_weight[refn1][1][0][1]);
-            chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
-                              sl->pwt.chroma_log2_weight_denom,
-                              sl->pwt.chroma_weight[refn0][0][1][0],
-                              sl->pwt.chroma_weight[refn1][1][1][0],
-                              sl->pwt.chroma_weight[refn0][0][1][1] +
-                              sl->pwt.chroma_weight[refn1][1][1][1]);
+            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
+                                  sl->pwt.chroma_log2_weight_denom,
+                                  sl->pwt.chroma_weight[refn0][0][0][0],
+                                  sl->pwt.chroma_weight[refn1][1][0][0],
+                                  sl->pwt.chroma_weight[refn0][0][0][1] +
+                                  sl->pwt.chroma_weight[refn1][1][0][1]);
+                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
+                                  sl->pwt.chroma_log2_weight_denom,
+                                  sl->pwt.chroma_weight[refn0][0][1][0],
+                                  sl->pwt.chroma_weight[refn1][1][1][0],
+                                  sl->pwt.chroma_weight[refn0][0][1][1] +
+                                  sl->pwt.chroma_weight[refn1][1][1][1]);
+            }
         }
     } else {
         int list     = list1 ? 1 : 0;
@@ -460,15 +465,17 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
                        sl->pwt.luma_log2_weight_denom,
                        sl->pwt.luma_weight[refn][list][0],
                        sl->pwt.luma_weight[refn][list][1]);
-        if (sl->pwt.use_weight_chroma) {
-            chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
-                             sl->pwt.chroma_log2_weight_denom,
-                             sl->pwt.chroma_weight[refn][list][0][0],
-                             sl->pwt.chroma_weight[refn][list][0][1]);
-            chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
-                             sl->pwt.chroma_log2_weight_denom,
-                             sl->pwt.chroma_weight[refn][list][1][0],
-                             sl->pwt.chroma_weight[refn][list][1][1]);
+        if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+            if (sl->pwt.use_weight_chroma) {
+                chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
+                                 sl->pwt.chroma_log2_weight_denom,
+                                 sl->pwt.chroma_weight[refn][list][0][0],
+                                 sl->pwt.chroma_weight[refn][list][0][1]);
+                chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
+                                 sl->pwt.chroma_log2_weight_denom,
+                                 sl->pwt.chroma_weight[refn][list][1][0],
+                                 sl->pwt.chroma_weight[refn][list][1][1]);
+            }
         }
     }
 }
@@ -484,7 +491,7 @@ static av_always_inline void prefetch_motion(const H264Context *h, H264SliceCont
         const int mx  = (sl->mv_cache[list][scan8[0]][0] >> 2) + 16 * sl->mb_x + 8;
         const int my  = (sl->mv_cache[list][scan8[0]][1] >> 2) + 16 * sl->mb_y;
         uint8_t **src = sl->ref_list[list][refn].data;
-        int off       = (mx << pixel_shift) +
+        int off       =  mx * (1<< pixel_shift) +
                         (my + (sl->mb_x & 3) * 4) * sl->mb_linesize +
                         (64 << pixel_shift);
         h->vdsp.prefetch(src[0] + off, sl->linesize, 4);
@@ -492,9 +499,7 @@ static av_always_inline void prefetch_motion(const H264Context *h, H264SliceCont
             h->vdsp.prefetch(src[1] + off, sl->linesize, 4);
             h->vdsp.prefetch(src[2] + off, sl->linesize, 4);
         } else {
-            off = ((mx >> 1) << pixel_shift) +
-                  ((my >> 1) + (sl->mb_x & 7)) * sl->uvlinesize +
-                  (64 << pixel_shift);
+            off= ((mx>>1)+64) * (1<<pixel_shift) + ((my>>1) + (sl->mb_x&7))*sl->uvlinesize;
             h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
         }
     }
@@ -561,10 +566,8 @@ static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceConte
             XCHG(sl->top_borders[top_idx][sl->mb_x + 1],
                  src_y + (17 << pixel_shift), 1);
         }
-    }
-    if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
-        if (chroma444) {
-            if (deblock_top) {
+        if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+            if (chroma444) {
                 if (deblock_topleft) {
                     XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                     XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
@@ -577,9 +580,7 @@ static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceConte
                     XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
                     XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
                 }
-            }
-        } else {
-            if (deblock_top) {
+            } else {
                 if (deblock_topleft) {
                     XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                     XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
@@ -674,7 +675,7 @@ static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
                     uint64_t tr_high;
                     if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
                         const int topright_avail = (sl->topright_samples_available << i) & 0x8000;
-                        assert(sl->mb_y || linesize <= block_offset[i]);
+                        av_assert2(sl->mb_y || linesize <= block_offset[i]);
                         if (!topright_avail) {
                             if (pixel_shift) {
                                 tr_high  = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL;
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
index 1f583df..d5ea26a 100644
--- a/libavcodec/h264_mb_template.c
+++ b/libavcodec/h264_mb_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -96,8 +96,8 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
     }
 
     if (!SIMPLE && IS_INTRA_PCM(mb_type)) {
+        const int bit_depth = h->ps.sps->bit_depth_luma;
         if (PIXEL_SHIFT) {
-            const int bit_depth = h->ps.sps->bit_depth_luma;
             int j;
             GetBitContext gb;
             init_get_bits(&gb, sl->intra_pcm_ptr,
@@ -112,13 +112,10 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
                 if (!h->ps.sps->chroma_format_idc) {
                     for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = 1 << (bit_depth - 1);
-                    }
-                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = 1 << (bit_depth - 1);
+                        for (j = 0; j < 8; j++) {
+                            tmp_cb[j] = tmp_cr[j] = 1 << (bit_depth - 1);
+                        }
                     }
                 } else {
                     for (i = 0; i < block_h; i++) {
@@ -138,9 +135,9 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
                 memcpy(dest_y + i * linesize, sl->intra_pcm_ptr + i * 16, 16);
             if (SIMPLE || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
                 if (!h->ps.sps->chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        memset(dest_cb + i * uvlinesize, 128, 8);
-                        memset(dest_cr + i * uvlinesize, 128, 8);
+                    for (i = 0; i < 8; i++) {
+                        memset(dest_cb + i * uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cr + i * uvlinesize, 1 << (bit_depth - 1), 8);
                     }
                 } else {
                     const uint8_t *src_cb = sl->intra_pcm_ptr + 256;
diff --git a/libavcodec/h264_mc_template.c b/libavcodec/h264_mc_template.c
index 7e6f62f..d02e2bf 100644
--- a/libavcodec/h264_mc_template.c
+++ b/libavcodec/h264_mc_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,11 +74,12 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
     const int mb_xy   = sl->mb_xy;
     const int mb_type = h->cur_pic.mb_type[mb_xy];
 
-    assert(IS_INTER(mb_type));
+    av_assert2(IS_INTER(mb_type));
 
     if (HAVE_THREADS && (h->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h, sl);
-    prefetch_motion(h, sl, 0, PIXEL_SHIFT, CHROMA_IDC);
+    if (USES_LIST(mb_type, 0))
+        prefetch_motion(h, sl, 0, PIXEL_SHIFT, CHROMA_IDC);
 
     if (IS_16X16(mb_type)) {
         mc_part(h, sl, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
@@ -106,7 +107,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
     } else {
         int i;
 
-        assert(IS_8X8(mb_type));
+        av_assert2(IS_8X8(mb_type));
 
         for (i = 0; i < 4; i++) {
             const int sub_mb_type = sl->sub_mb_type[i];
@@ -144,7 +145,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             } else {
                 int j;
-                assert(IS_SUB_4X4(sub_mb_type));
+                av_assert2(IS_SUB_4X4(sub_mb_type));
                 for (j = 0; j < 4; j++) {
                     int sub_x_offset = x_offset + 2 * (j & 1);
                     int sub_y_offset = y_offset + (j & 2);
@@ -158,6 +159,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
         }
     }
 
-    prefetch_motion(h, sl, 1, PIXEL_SHIFT, CHROMA_IDC);
+    if (USES_LIST(mb_type, 1))
+        prefetch_motion(h, sl, 1, PIXEL_SHIFT, CHROMA_IDC);
 }
 
diff --git a/libavcodec/h264_metadata_bsf.c b/libavcodec/h264_metadata_bsf.c
index 7b51e8e..a17987a 100644
--- a/libavcodec/h264_metadata_bsf.c
+++ b/libavcodec/h264_metadata_bsf.c
@@ -1,22 +1,23 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/avstring.h"
+#include "libavutil/display.h"
 #include "libavutil/common.h"
 #include "libavutil/opt.h"
 
@@ -24,12 +25,24 @@
 #include "cbs.h"
 #include "cbs_h264.h"
 #include "h264.h"
+#include "h264_levels.h"
 #include "h264_sei.h"
 
 enum {
     PASS,
     INSERT,
     REMOVE,
+    EXTRACT,
+};
+
+enum {
+    FLIP_HORIZONTAL = 1,
+    FLIP_VERTICAL   = 2,
+};
+
+enum {
+    LEVEL_UNSET = -2,
+    LEVEL_AUTO  = -1,
 };
 
 typedef struct H264MetadataContext {
@@ -38,8 +51,7 @@ typedef struct H264MetadataContext {
     CodedBitstreamContext *cbc;
     CodedBitstreamFragment access_unit;
 
-    H264RawAUD aud_nal;
-    H264RawSEI sei_nal;
+    int done_first_au;
 
     int aud;
 
@@ -62,9 +74,14 @@ typedef struct H264MetadataContext {
     int crop_bottom;
 
     const char *sei_user_data;
-    int sei_first_au;
 
     int delete_filler;
+
+    int display_orientation;
+    double rotate;
+    int flip;
+
+    int level;
 } H264MetadataContext;
 
 
@@ -199,6 +216,61 @@ static int h264_metadata_update_sps(AVBSFContext *bsf,
     CROP(bottom, crop_unit_y);
 #undef CROP
 
+    if (ctx->level != LEVEL_UNSET) {
+        int level_idc;
+
+        if (ctx->level == LEVEL_AUTO) {
+            const H264LevelDescriptor *desc;
+            int64_t bit_rate;
+            int width, height, dpb_frames;
+
+            if (sps->vui.nal_hrd_parameters_present_flag) {
+                bit_rate = (sps->vui.nal_hrd_parameters.bit_rate_value_minus1[0] + 1) *
+                    (INT64_C(1) << (sps->vui.nal_hrd_parameters.bit_rate_scale + 6));
+            } else if (sps->vui.vcl_hrd_parameters_present_flag) {
+                bit_rate = (sps->vui.vcl_hrd_parameters.bit_rate_value_minus1[0] + 1) *
+                    (INT64_C(1) << (sps->vui.vcl_hrd_parameters.bit_rate_scale + 6));
+                // Adjust for VCL vs. NAL limits.
+                bit_rate = bit_rate * 6 / 5;
+            } else {
+                bit_rate = 0;
+            }
+
+            // Don't use max_dec_frame_buffering if it is only inferred.
+            dpb_frames = sps->vui.bitstream_restriction_flag ?
+                sps->vui.max_dec_frame_buffering : H264_MAX_DPB_FRAMES;
+
+            width  = 16 * (sps->pic_width_in_mbs_minus1 + 1);
+            height = 16 * (sps->pic_height_in_map_units_minus1 + 1) *
+                (2 - sps->frame_mbs_only_flag);
+
+            desc = ff_h264_guess_level(sps->profile_idc, bit_rate,
+                                       width, height, dpb_frames);
+            if (desc) {
+                level_idc = desc->level_idc;
+            } else {
+                av_log(bsf, AV_LOG_WARNING, "Stream does not appear to "
+                       "conform to any level: using level 6.2.\n");
+                level_idc = 62;
+            }
+        } else {
+            level_idc = ctx->level;
+        }
+
+        if (level_idc == 9) {
+            if (sps->profile_idc == 66 ||
+                sps->profile_idc == 77 ||
+                sps->profile_idc == 88) {
+                sps->level_idc = 11;
+                sps->constraint_set3_flag = 1;
+            } else {
+                sps->level_idc = 9;
+            }
+        } else {
+            sps->level_idc = level_idc;
+        }
+    }
+
     if (need_vui)
         sps->vui_parameters_present_flag = 1;
 
@@ -211,10 +283,13 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
     AVPacket *in = NULL;
     CodedBitstreamFragment *au = &ctx->access_unit;
     int err, i, j, has_sps;
+    H264RawAUD aud;
+    uint8_t *displaymatrix_side_data = NULL;
+    size_t displaymatrix_side_data_size = 0;
 
     err = ff_bsf_get_packet(bsf, &in);
     if (err < 0)
-        goto fail;
+        return err;
 
     err = ff_cbs_read_packet(ctx->cbc, au, in);
     if (err < 0) {
@@ -245,7 +320,6 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
                 0x3ff, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
             };
             int primary_pic_type_mask = 0xff;
-            H264RawAUD *aud = &ctx->aud_nal;
 
             for (i = 0; i < au->nb_units; i++) {
                 if (au->units[i].type == H264_NAL_SLICE ||
@@ -268,11 +342,13 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
                 goto fail;
             }
 
-            aud->nal_unit_header.nal_unit_type = H264_NAL_AUD;
-            aud->primary_pic_type = j;
+            aud = (H264RawAUD) {
+                .nal_unit_header.nal_unit_type = H264_NAL_AUD,
+                .primary_pic_type = j,
+            };
 
             err = ff_cbs_insert_unit_content(ctx->cbc, au,
-                                             0, H264_NAL_AUD, aud, NULL);
+                                             0, H264_NAL_AUD, &aud, NULL);
             if (err < 0) {
                 av_log(bsf, AV_LOG_ERROR, "Failed to insert AUD.\n");
                 goto fail;
@@ -292,15 +368,13 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
 
     // Only insert the SEI in access units containing SPSs, and also
     // unconditionally in the first access unit we ever see.
-    if (ctx->sei_user_data && (has_sps || !ctx->sei_first_au)) {
+    if (ctx->sei_user_data && (has_sps || !ctx->done_first_au)) {
         H264RawSEIPayload payload = {
             .payload_type = H264_SEI_TYPE_USER_DATA_UNREGISTERED,
         };
         H264RawSEIUserDataUnregistered *udu =
             &payload.payload.user_data_unregistered;
 
-        ctx->sei_first_au = 1;
-
         for (i = j = 0; j < 32 && ctx->sei_user_data[i]; i++) {
             int c, v;
             c = ctx->sei_user_data[i];
@@ -331,8 +405,6 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
             udu->data_length = len + 1;
             memcpy(udu->data, ctx->sei_user_data + i + 1, len + 1);
 
-            payload.payload_size = 16 + udu->data_length;
-
             err = ff_cbs_h264_add_sei_message(ctx->cbc, au, &payload);
             if (err < 0) {
                 av_log(bsf, AV_LOG_ERROR, "Failed to add user data SEI "
@@ -345,6 +417,7 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
             av_log(bsf, AV_LOG_ERROR, "Invalid user data: "
                    "must be \"UUID+string\".\n");
             err = AVERROR(EINVAL);
+            goto fail;
         }
     }
 
@@ -386,6 +459,125 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
         }
     }
 
+    if (ctx->display_orientation != PASS) {
+        for (i = 0; i < au->nb_units; i++) {
+            H264RawSEI *sei;
+            if (au->units[i].type != H264_NAL_SEI)
+                continue;
+            sei = au->units[i].content;
+
+            for (j = 0; j < sei->payload_count; j++) {
+                H264RawSEIDisplayOrientation *disp;
+                int32_t *matrix;
+
+                if (sei->payload[j].payload_type !=
+                    H264_SEI_TYPE_DISPLAY_ORIENTATION)
+                    continue;
+                disp = &sei->payload[j].payload.display_orientation;
+
+                if (ctx->display_orientation == REMOVE ||
+                    ctx->display_orientation == INSERT) {
+                    err = ff_cbs_h264_delete_sei_message(ctx->cbc, au,
+                                                         &au->units[i], j);
+                    if (err < 0) {
+                        av_log(bsf, AV_LOG_ERROR, "Failed to delete "
+                               "display orientation SEI message.\n");
+                        goto fail;
+                    }
+                    --i;
+                    break;
+                }
+
+                matrix = av_mallocz(9 * sizeof(int32_t));
+                if (!matrix) {
+                    err = AVERROR(ENOMEM);
+                    goto fail;
+                }
+
+                av_display_rotation_set(matrix,
+                                        disp->anticlockwise_rotation *
+                                        180.0 / 65536.0);
+                av_display_matrix_flip(matrix, disp->hor_flip, disp->ver_flip);
+
+                // If there are multiple display orientation messages in an
+                // access unit then ignore all but the last one.
+                av_freep(&displaymatrix_side_data);
+
+                displaymatrix_side_data      = (uint8_t*)matrix;
+                displaymatrix_side_data_size = 9 * sizeof(int32_t);
+            }
+        }
+    }
+    if (ctx->display_orientation == INSERT) {
+        H264RawSEIPayload payload = {
+            .payload_type = H264_SEI_TYPE_DISPLAY_ORIENTATION,
+        };
+        H264RawSEIDisplayOrientation *disp =
+            &payload.payload.display_orientation;
+        uint8_t *data;
+        int size;
+        int write = 0;
+
+        data = av_packet_get_side_data(in, AV_PKT_DATA_DISPLAYMATRIX, &size);
+        if (data && size >= 9 * sizeof(int32_t)) {
+            int32_t matrix[9];
+            int hflip, vflip;
+            double angle;
+
+            memcpy(matrix, data, sizeof(matrix));
+
+            hflip = vflip = 0;
+            if (matrix[0] < 0 && matrix[4] > 0)
+                hflip = 1;
+            else if (matrix[0] > 0 && matrix[4] < 0)
+                vflip = 1;
+            av_display_matrix_flip(matrix, hflip, vflip);
+
+            angle = av_display_rotation_get(matrix);
+
+            if (!(angle >= -180.0 && angle <= 180.0 /* also excludes NaN */) ||
+                matrix[2] != 0 || matrix[5] != 0 ||
+                matrix[6] != 0 || matrix[7] != 0) {
+                av_log(bsf, AV_LOG_WARNING, "Input display matrix is not "
+                       "representable in H.264 parameters.\n");
+            } else {
+                disp->hor_flip = hflip;
+                disp->ver_flip = vflip;
+                disp->anticlockwise_rotation =
+                    (uint16_t)rint((angle >= 0.0 ? angle
+                                                 : angle + 360.0) *
+                                   65536.0 / 360.0);
+                write = 1;
+            }
+        }
+
+        if (has_sps || !ctx->done_first_au) {
+            if (!isnan(ctx->rotate)) {
+                disp->anticlockwise_rotation =
+                    (uint16_t)rint((ctx->rotate >= 0.0 ? ctx->rotate
+                                                       : ctx->rotate + 360.0) *
+                                   65536.0 / 360.0);
+                write = 1;
+            }
+            if (ctx->flip) {
+                disp->hor_flip = !!(ctx->flip & FLIP_HORIZONTAL);
+                disp->ver_flip = !!(ctx->flip & FLIP_VERTICAL);
+                write = 1;
+            }
+        }
+
+        if (write) {
+            disp->display_orientation_repetition_period = 1;
+
+            err = ff_cbs_h264_add_sei_message(ctx->cbc, au, &payload);
+            if (err < 0) {
+                av_log(bsf, AV_LOG_ERROR, "Failed to add display orientation "
+                       "SEI message to access unit.\n");
+                goto fail;
+            }
+        }
+    }
+
     err = ff_cbs_write_packet(ctx->cbc, out, au);
     if (err < 0) {
         av_log(bsf, AV_LOG_ERROR, "Failed to write packet.\n");
@@ -396,10 +588,27 @@ static int h264_metadata_filter(AVBSFContext *bsf, AVPacket *out)
     if (err < 0)
         goto fail;
 
+    if (displaymatrix_side_data) {
+        err = av_packet_add_side_data(out, AV_PKT_DATA_DISPLAYMATRIX,
+                                      displaymatrix_side_data,
+                                      displaymatrix_side_data_size);
+        if (err) {
+            av_log(bsf, AV_LOG_ERROR, "Failed to attach extracted "
+                   "displaymatrix side data to packet.\n");
+            goto fail;
+        }
+        displaymatrix_side_data = NULL;
+    }
+
+    ctx->done_first_au = 1;
+
     err = 0;
 fail:
-    ff_cbs_fragment_uninit(ctx->cbc, au);
+    ff_cbs_fragment_reset(ctx->cbc, au);
+    av_freep(&displaymatrix_side_data);
 
+    if (err < 0)
+        av_packet_unref(out);
     av_packet_free(&in);
 
     return err;
@@ -439,74 +648,135 @@ static int h264_metadata_init(AVBSFContext *bsf)
 
     err = 0;
 fail:
-    ff_cbs_fragment_uninit(ctx->cbc, au);
+    ff_cbs_fragment_reset(ctx->cbc, au);
     return err;
 }
 
 static void h264_metadata_close(AVBSFContext *bsf)
 {
     H264MetadataContext *ctx = bsf->priv_data;
+
+    ff_cbs_fragment_free(ctx->cbc, &ctx->access_unit);
     ff_cbs_close(&ctx->cbc);
 }
 
 #define OFFSET(x) offsetof(H264MetadataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption h264_metadata_options[] = {
     { "aud", "Access Unit Delimiter NAL units",
         OFFSET(aud), AV_OPT_TYPE_INT,
-        { .i64 = PASS }, PASS, REMOVE, 0, "aud" },
-    { "pass",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PASS   }, .unit = "aud" },
-    { "insert", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = INSERT }, .unit = "aud" },
-    { "remove", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE }, .unit = "aud" },
+        { .i64 = PASS }, PASS, REMOVE, FLAGS, "aud" },
+    { "pass",   NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = PASS   }, .flags = FLAGS, .unit = "aud" },
+    { "insert", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = INSERT }, .flags = FLAGS, .unit = "aud" },
+    { "remove", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = REMOVE }, .flags = FLAGS, .unit = "aud" },
 
     { "sample_aspect_ratio", "Set sample aspect ratio (table E-1)",
         OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL,
-        { .i64 = 0 }, 0, 65535 },
+        { .dbl = 0.0 }, 0, 65535, FLAGS },
 
     { "video_format", "Set video format (table E-2)",
         OFFSET(video_format), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 7 },
+        { .i64 = -1 }, -1, 7, FLAGS},
     { "video_full_range_flag", "Set video full range flag",
         OFFSET(video_full_range_flag), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 1 },
+        { .i64 = -1 }, -1, 1, FLAGS },
     { "colour_primaries", "Set colour primaries (table E-3)",
         OFFSET(colour_primaries), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
     { "transfer_characteristics", "Set transfer characteristics (table E-4)",
         OFFSET(transfer_characteristics), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
     { "matrix_coefficients", "Set matrix coefficients (table E-5)",
         OFFSET(matrix_coefficients), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
 
     { "chroma_sample_loc_type", "Set chroma sample location type (figure E-1)",
         OFFSET(chroma_sample_loc_type), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 6 },
+        { .i64 = -1 }, -1, 6, FLAGS },
 
     { "tick_rate", "Set VUI tick rate (num_units_in_tick / time_scale)",
         OFFSET(tick_rate), AV_OPT_TYPE_RATIONAL,
-        { .i64 = 0 }, 0, UINT_MAX },
+        { .dbl = 0.0 }, 0, UINT_MAX, FLAGS },
     { "fixed_frame_rate_flag", "Set VUI fixed frame rate flag",
         OFFSET(fixed_frame_rate_flag), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 1 },
+        { .i64 = -1 }, -1, 1, FLAGS },
 
     { "crop_left", "Set left border crop offset",
         OFFSET(crop_left), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, H264_MAX_WIDTH },
+        { .i64 = -1 }, -1, H264_MAX_WIDTH, FLAGS },
     { "crop_right", "Set right border crop offset",
         OFFSET(crop_right), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, H264_MAX_WIDTH },
+        { .i64 = -1 }, -1, H264_MAX_WIDTH, FLAGS },
     { "crop_top", "Set top border crop offset",
         OFFSET(crop_top), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, H264_MAX_HEIGHT },
+        { .i64 = -1 }, -1, H264_MAX_HEIGHT, FLAGS },
     { "crop_bottom", "Set bottom border crop offset",
         OFFSET(crop_bottom), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, H264_MAX_HEIGHT },
+        { .i64 = -1 }, -1, H264_MAX_HEIGHT, FLAGS },
 
     { "sei_user_data", "Insert SEI user data (UUID+string)",
-        OFFSET(sei_user_data), AV_OPT_TYPE_STRING, { .str = NULL } },
+        OFFSET(sei_user_data), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS },
 
     { "delete_filler", "Delete all filler (both NAL and SEI)",
-        OFFSET(delete_filler), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1 },
+        OFFSET(delete_filler), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS},
+
+    { "display_orientation", "Display orientation SEI",
+        OFFSET(display_orientation), AV_OPT_TYPE_INT,
+        { .i64 = PASS }, PASS, EXTRACT, FLAGS, "disp_or" },
+    { "pass",    NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = PASS    }, .flags = FLAGS, .unit = "disp_or" },
+    { "insert",  NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = INSERT  }, .flags = FLAGS, .unit = "disp_or" },
+    { "remove",  NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = REMOVE  }, .flags = FLAGS, .unit = "disp_or" },
+    { "extract", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = EXTRACT }, .flags = FLAGS, .unit = "disp_or" },
+
+    { "rotate", "Set rotation in display orientation SEI (anticlockwise angle in degrees)",
+        OFFSET(rotate), AV_OPT_TYPE_DOUBLE,
+        { .dbl = NAN }, -360.0, +360.0, FLAGS },
+    { "flip", "Set flip in display orientation SEI",
+        OFFSET(flip), AV_OPT_TYPE_FLAGS,
+        { .i64 = 0 }, 0, FLIP_HORIZONTAL | FLIP_VERTICAL, FLAGS, "flip" },
+    { "horizontal", "Set hor_flip",
+        0, AV_OPT_TYPE_CONST,
+        { .i64 = FLIP_HORIZONTAL }, .flags = FLAGS, .unit = "flip" },
+    { "vertical",   "Set ver_flip",
+        0, AV_OPT_TYPE_CONST,
+        { .i64 = FLIP_VERTICAL },   .flags = FLAGS, .unit = "flip" },
+
+    { "level", "Set level (table A-1)",
+        OFFSET(level), AV_OPT_TYPE_INT,
+        { .i64 = LEVEL_UNSET }, LEVEL_UNSET, 0xff, FLAGS, "level" },
+    { "auto", "Attempt to guess level from stream properties",
+        0, AV_OPT_TYPE_CONST,
+        { .i64 = LEVEL_AUTO }, .flags = FLAGS, .unit = "level" },
+#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
+        { .i64 = value },      .flags = FLAGS, .unit = "level"
+    { LEVEL("1",   10) },
+    { LEVEL("1b",   9) },
+    { LEVEL("1.1", 11) },
+    { LEVEL("1.2", 12) },
+    { LEVEL("1.3", 13) },
+    { LEVEL("2",   20) },
+    { LEVEL("2.1", 21) },
+    { LEVEL("2.2", 22) },
+    { LEVEL("3",   30) },
+    { LEVEL("3.1", 31) },
+    { LEVEL("3.2", 32) },
+    { LEVEL("4",   40) },
+    { LEVEL("4.1", 41) },
+    { LEVEL("4.2", 42) },
+    { LEVEL("5",   50) },
+    { LEVEL("5.1", 51) },
+    { LEVEL("5.2", 52) },
+    { LEVEL("6",   60) },
+    { LEVEL("6.1", 61) },
+    { LEVEL("6.2", 62) },
+#undef LEVEL
 
     { NULL }
 };
@@ -515,7 +785,7 @@ static const AVClass h264_metadata_class = {
     .class_name = "h264_metadata_bsf",
     .item_name  = av_default_item_name,
     .option     = h264_metadata_options,
-    .version    = LIBAVCODEC_VERSION_MAJOR,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static const enum AVCodecID h264_metadata_codec_ids[] = {
diff --git a/libavcodec/h264_mp4toannexb_bsf.c b/libavcodec/h264_mp4toannexb_bsf.c
index c45ecd8..fb3f24e 100644
--- a/libavcodec/h264_mp4toannexb_bsf.c
+++ b/libavcodec/h264_mp4toannexb_bsf.c
@@ -2,20 +2,20 @@
  * H.264 MP4 to Annex B byte stream format filter
  * Copyright (c) 2007 Benoit Fouet <benoit.fouet@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,30 +26,35 @@
 
 #include "avcodec.h"
 #include "bsf.h"
+#include "h264.h"
 
 typedef struct H264BSFContext {
+    int32_t  sps_offset;
+    int32_t  pps_offset;
     uint8_t  length_size;
-    uint8_t  first_idr;
+    uint8_t  new_idr;
+    uint8_t  idr_sps_seen;
+    uint8_t  idr_pps_seen;
     int      extradata_parsed;
 } H264BSFContext;
 
 static int alloc_and_copy(AVPacket *out,
                           const uint8_t *sps_pps, uint32_t sps_pps_size,
-                          const uint8_t *in, uint32_t in_size)
+                          const uint8_t *in, uint32_t in_size, int ps)
 {
     uint32_t offset         = out->size;
-    uint8_t nal_header_size = offset ? 3 : 4;
+    uint8_t start_code_size = offset == 0 || ps ? 4 : 3;
     int err;
 
-    err = av_grow_packet(out, sps_pps_size + in_size + nal_header_size);
+    err = av_grow_packet(out, sps_pps_size + in_size + start_code_size);
     if (err < 0)
         return err;
 
     if (sps_pps)
         memcpy(out->data + offset, sps_pps, sps_pps_size);
-    memcpy(out->data + sps_pps_size + nal_header_size + offset, in, in_size);
-    if (!offset) {
-        AV_WB32(out->data + sps_pps_size, 1);
+    memcpy(out->data + sps_pps_size + start_code_size + offset, in, in_size);
+    if (start_code_size == 4) {
+        AV_WB32(out->data + offset + sps_pps_size, 1);
     } else {
         (out->data + offset + sps_pps_size)[0] =
         (out->data + offset + sps_pps_size)[1] = 0;
@@ -61,6 +66,7 @@ static int alloc_and_copy(AVPacket *out,
 
 static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
 {
+    H264BSFContext *s = ctx->priv_data;
     uint16_t unit_size;
     uint64_t total_size                 = 0;
     uint8_t *out                        = NULL, unit_nb, sps_done = 0,
@@ -69,18 +75,14 @@ static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
     static const uint8_t nalu_header[4] = { 0, 0, 0, 1 };
     int length_size = (*extradata++ & 0x3) + 1; // retrieve length coded size
 
-    if (length_size == 3)
-        return AVERROR(EINVAL);
+    s->sps_offset = s->pps_offset = -1;
 
     /* retrieve sps and pps unit(s) */
     unit_nb = *extradata++ & 0x1f; /* number of sps unit(s) */
     if (!unit_nb) {
-        unit_nb = *extradata++; /* number of pps unit(s) */
-        sps_done++;
-
-        if (unit_nb)
-            pps_seen = 1;
+        goto pps;
     } else {
+        s->sps_offset = 0;
         sps_seen = 1;
     }
 
@@ -89,9 +91,15 @@ static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
 
         unit_size   = AV_RB16(extradata);
         total_size += unit_size + 4;
-        if (total_size > INT_MAX - padding ||
-            extradata + 2 + unit_size > ctx->par_in->extradata +
-            ctx->par_in->extradata_size) {
+        if (total_size > INT_MAX - padding) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Too big extradata size, corrupted stream or invalid MP4/AVCC bitstream\n");
+            av_free(out);
+            return AVERROR(EINVAL);
+        }
+        if (extradata + 2 + unit_size > ctx->par_in->extradata + ctx->par_in->extradata_size) {
+            av_log(ctx, AV_LOG_ERROR, "Packet header is not contained in global extradata, "
+                   "corrupted stream or invalid MP4/AVCC bitstream\n");
             av_free(out);
             return AVERROR(EINVAL);
         }
@@ -100,16 +108,18 @@ static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
         memcpy(out + total_size - unit_size - 4, nalu_header, 4);
         memcpy(out + total_size - unit_size, extradata + 2, unit_size);
         extradata += 2 + unit_size;
-
+pps:
         if (!unit_nb && !sps_done++) {
             unit_nb = *extradata++; /* number of pps unit(s) */
-            if (unit_nb)
+            if (unit_nb) {
+                s->pps_offset = total_size;
                 pps_seen = 1;
+            }
         }
     }
 
     if (out)
-        memset(out + total_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        memset(out + total_size, 0, padding);
 
     if (!sps_seen)
         av_log(ctx, AV_LOG_WARNING,
@@ -146,7 +156,9 @@ static int h264_mp4toannexb_init(AVBSFContext *ctx)
             return ret;
 
         s->length_size      = ret;
-        s->first_idr        = 1;
+        s->new_idr          = 1;
+        s->idr_sps_seen     = 0;
+        s->idr_pps_seen     = 0;
         s->extradata_parsed = 1;
     } else {
         av_log(ctx, AV_LOG_ERROR, "Invalid extradata size: %d\n", extra_size);
@@ -167,7 +179,7 @@ static int h264_mp4toannexb_filter(AVBSFContext *ctx, AVPacket *out)
     const uint8_t *buf;
     const uint8_t *buf_end;
     int            buf_size;
-    int ret = 0;
+    int ret = 0, i;
 
     ret = ff_bsf_get_packet(ctx, &in);
     if (ret < 0)
@@ -185,37 +197,73 @@ static int h264_mp4toannexb_filter(AVBSFContext *ctx, AVPacket *out)
     buf_end  = in->data + in->size;
 
     do {
+        ret= AVERROR(EINVAL);
         if (buf + s->length_size > buf_end)
             goto fail;
 
-        if (s->length_size == 1) {
-            nal_size = buf[0];
-        } else if (s->length_size == 2) {
-            nal_size = AV_RB16(buf);
-        } else
-            nal_size = AV_RB32(buf);
+        for (nal_size = 0, i = 0; i<s->length_size; i++)
+            nal_size = (nal_size << 8) | buf[i];
 
         buf += s->length_size;
         unit_type = *buf & 0x1f;
 
-        if (buf + nal_size > buf_end || nal_size < 0)
+        if (nal_size > buf_end - buf || nal_size < 0)
             goto fail;
 
-        /* prepend only to the first type 5 NAL unit of an IDR picture */
-        if (s->first_idr && unit_type == 5) {
-            if (alloc_and_copy(out,
+        if (unit_type == H264_NAL_SPS)
+            s->idr_sps_seen = s->new_idr = 1;
+        else if (unit_type == H264_NAL_PPS) {
+            s->idr_pps_seen = s->new_idr = 1;
+            /* if SPS has not been seen yet, prepend the AVCC one to PPS */
+            if (!s->idr_sps_seen) {
+                if (s->sps_offset == -1)
+                    av_log(ctx, AV_LOG_WARNING, "SPS not present in the stream, nor in AVCC, stream may be unreadable\n");
+                else {
+                    if ((ret = alloc_and_copy(out,
+                                         ctx->par_out->extradata + s->sps_offset,
+                                         s->pps_offset != -1 ? s->pps_offset : ctx->par_out->extradata_size - s->sps_offset,
+                                         buf, nal_size, 1)) < 0)
+                        goto fail;
+                    s->idr_sps_seen = 1;
+                    goto next_nal;
+                }
+            }
+        }
+
+        /* if this is a new IDR picture following an IDR picture, reset the idr flag.
+         * Just check first_mb_in_slice to be 0 as this is the simplest solution.
+         * This could be checking idr_pic_id instead, but would complexify the parsing. */
+        if (!s->new_idr && unit_type == H264_NAL_IDR_SLICE && (buf[1] & 0x80))
+            s->new_idr = 1;
+
+        /* prepend only to the first type 5 NAL unit of an IDR picture, if no sps/pps are already present */
+        if (s->new_idr && unit_type == H264_NAL_IDR_SLICE && !s->idr_sps_seen && !s->idr_pps_seen) {
+            if ((ret=alloc_and_copy(out,
                                ctx->par_out->extradata, ctx->par_out->extradata_size,
-                               buf, nal_size) < 0)
+                               buf, nal_size, 1)) < 0)
+                goto fail;
+            s->new_idr = 0;
+        /* if only SPS has been seen, also insert PPS */
+        } else if (s->new_idr && unit_type == H264_NAL_IDR_SLICE && s->idr_sps_seen && !s->idr_pps_seen) {
+            if (s->pps_offset == -1) {
+                av_log(ctx, AV_LOG_WARNING, "PPS not present in the stream, nor in AVCC, stream may be unreadable\n");
+                if ((ret = alloc_and_copy(out, NULL, 0, buf, nal_size, 0)) < 0)
+                    goto fail;
+            } else if ((ret = alloc_and_copy(out,
+                                        ctx->par_out->extradata + s->pps_offset, ctx->par_out->extradata_size - s->pps_offset,
+                                        buf, nal_size, 1)) < 0)
                 goto fail;
-            s->first_idr = 0;
         } else {
-            if (alloc_and_copy(out,
-                               NULL, 0, buf, nal_size) < 0)
+            if ((ret=alloc_and_copy(out, NULL, 0, buf, nal_size, unit_type == H264_NAL_SPS || unit_type == H264_NAL_PPS)) < 0)
                 goto fail;
-            if (!s->first_idr && unit_type == 1)
-                s->first_idr = 1;
+            if (!s->new_idr && unit_type == H264_NAL_SLICE) {
+                s->new_idr = 1;
+                s->idr_sps_seen = 0;
+                s->idr_pps_seen = 0;
+            }
         }
 
+next_nal:
         buf        += nal_size;
         cumul_size += nal_size + s->length_size;
     } while (cumul_size < buf_size);
@@ -236,7 +284,9 @@ static void h264_mp4toannexb_flush(AVBSFContext *ctx)
 {
     H264BSFContext *s = ctx->priv_data;
 
-    s->first_idr = s->extradata_parsed;
+    s->idr_sps_seen = 0;
+    s->idr_pps_seen = 0;
+    s->new_idr      = s->extradata_parsed;
 }
 
 static const enum AVCodecID codec_ids[] = {
diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h
index 83b1ea6..bf395e3 100644
--- a/libavcodec/h264_mvpred.h
+++ b/libavcodec/h264_mvpred.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... motion vector prediction
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,8 @@
 #include "avcodec.h"
 #include "h264dec.h"
 #include "mpegutils.h"
+#include "libavutil/avassert.h"
 
-#include <assert.h>
 
 static av_always_inline int fetch_diagonal_mv(const H264Context *h, H264SliceContext *sl,
                                               const int16_t **C,
@@ -68,7 +68,7 @@ static av_always_inline int fetch_diagonal_mv(const H264Context *h, H264SliceCon
             }
             if (MB_FIELD(sl) && !IS_INTERLACED(sl->left_type[0])) {
                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
-                SET_DIAG_MV(/ 2, << 1, sl->left_mb_xy[i >= 36], ((i >> 2)) & 3);
+                SET_DIAG_MV(/ 2, *2, sl->left_mb_xy[i >= 36], ((i >> 2)) & 3);
             }
         }
 #undef SET_DIAG_MV
@@ -106,7 +106,7 @@ static av_always_inline void pred_motion(const H264Context *const h,
     const int16_t *C;
     int diagonal_ref, match_count;
 
-    assert(part_width == 1 || part_width == 2 || part_width == 4);
+    av_assert2(part_width == 1 || part_width == 2 || part_width == 4);
 
 /* mv_cache
  * B . . A T T T T
@@ -248,7 +248,7 @@ static av_always_inline void pred_8x16_motion(const H264Context *const h,
             if (IS_INTERLACED(type)) {          \
                 refn >>= 1;                     \
                 AV_COPY32(mvbuf[idx], mvn);     \
-                mvbuf[idx][1] <<= 1;            \
+                mvbuf[idx][1] *= 2;             \
                 mvn = mvbuf[idx];               \
             }                                   \
         }                                       \
@@ -488,7 +488,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
                 } else {
                     int left_typei = h->cur_pic.mb_type[left_xy[LTOP] + h->mb_stride];
 
-                    assert(left_xy[LTOP] == left_xy[LBOT]);
+                    av_assert2(left_xy[LTOP] == left_xy[LBOT]);
                     if (!((left_typei & type_mask) && (left_type[LTOP] & type_mask))) {
                         sl->topleft_samples_available &= 0xDF5F;
                         sl->left_samples_available    &= 0x5F5F;
@@ -613,7 +613,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
             int16_t(*mv)[2]       = h->cur_pic.motion_val[list];
             if (!USES_LIST(mb_type, list))
                 continue;
-            assert(!(IS_DIRECT(mb_type) && !sl->direct_spatial_mv_pred));
+            av_assert2(!(IS_DIRECT(mb_type) && !sl->direct_spatial_mv_pred));
 
             if (USES_LIST(top_type, list)) {
                 const int b_xy = h->mb2b_xy[top_xy] + 3 * b_stride;
@@ -670,7 +670,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
                 ref_cache[4 - 1 * 8] = topright_type ? LIST_NOT_USED
                                                      : PART_NOT_AVAILABLE;
             }
-            if (ref_cache[4 - 1 * 8] < 0) {
+            if(ref_cache[2 - 1*8] < 0 || ref_cache[4 - 1 * 8] < 0) {
                 if (USES_LIST(topleft_type, list)) {
                     const int b_xy  = h->mb2b_xy[topleft_xy] + 3 + b_stride +
                                       (sl->topleft_partition & 2 * b_stride);
@@ -771,7 +771,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
 
 #define MAP_F2F(idx, mb_type)                                           \
     if (!IS_INTERLACED(mb_type) && sl->ref_cache[list][idx] >= 0) {     \
-        sl->ref_cache[list][idx]    <<= 1;                              \
+        sl->ref_cache[list][idx]     *= 2;                              \
         sl->mv_cache[list][idx][1]   /= 2;                              \
         sl->mvd_cache[list][idx][1] >>= 1;                              \
     }
@@ -783,7 +783,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
 #define MAP_F2F(idx, mb_type)                                           \
     if (IS_INTERLACED(mb_type) && sl->ref_cache[list][idx] >= 0) {      \
         sl->ref_cache[list][idx]    >>= 1;                              \
-        sl->mv_cache[list][idx][1]  <<= 1;                              \
+        sl->mv_cache[list][idx][1]   *= 2;                              \
         sl->mvd_cache[list][idx][1] <<= 1;                              \
     }
 
diff --git a/libavcodec/h264_parse.c b/libavcodec/h264_parse.c
index cde46fa..a075443 100644
--- a/libavcodec/h264_parse.c
+++ b/libavcodec/h264_parse.c
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "bytestream.h"
 #include "get_bits.h"
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "h264.h"
 #include "h264dec.h"
 #include "h264_parse.h"
@@ -26,18 +26,30 @@
 
 int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
                               const int *ref_count, int slice_type_nos,
-                              H264PredWeightTable *pwt)
+                              H264PredWeightTable *pwt,
+                              int picture_structure, void *logctx)
 {
     int list, i, j;
     int luma_def, chroma_def;
 
     pwt->use_weight             = 0;
     pwt->use_weight_chroma      = 0;
+
     pwt->luma_log2_weight_denom = get_ue_golomb(gb);
-    if (sps->chroma_format_idc)
+    if (pwt->luma_log2_weight_denom > 7U) {
+        av_log(logctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is out of range\n", pwt->luma_log2_weight_denom);
+        pwt->luma_log2_weight_denom = 0;
+    }
+    luma_def = 1 << pwt->luma_log2_weight_denom;
+
+    if (sps->chroma_format_idc) {
         pwt->chroma_log2_weight_denom = get_ue_golomb(gb);
-    luma_def   = 1 << pwt->luma_log2_weight_denom;
-    chroma_def = 1 << pwt->chroma_log2_weight_denom;
+        if (pwt->chroma_log2_weight_denom > 7U) {
+            av_log(logctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is out of range\n", pwt->chroma_log2_weight_denom);
+            pwt->chroma_log2_weight_denom = 0;
+        }
+        chroma_def = 1 << pwt->chroma_log2_weight_denom;
+    }
 
     for (list = 0; list < 2; list++) {
         pwt->luma_weight_flag[list]   = 0;
@@ -49,6 +61,9 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
             if (luma_weight_flag) {
                 pwt->luma_weight[i][list][0] = get_se_golomb(gb);
                 pwt->luma_weight[i][list][1] = get_se_golomb(gb);
+                if ((int8_t)pwt->luma_weight[i][list][0] != pwt->luma_weight[i][list][0] ||
+                    (int8_t)pwt->luma_weight[i][list][1] != pwt->luma_weight[i][list][1])
+                    goto out_range_weight;
                 if (pwt->luma_weight[i][list][0] != luma_def ||
                     pwt->luma_weight[i][list][1] != 0) {
                     pwt->use_weight             = 1;
@@ -66,6 +81,12 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
                     for (j = 0; j < 2; j++) {
                         pwt->chroma_weight[i][list][j][0] = get_se_golomb(gb);
                         pwt->chroma_weight[i][list][j][1] = get_se_golomb(gb);
+                        if ((int8_t)pwt->chroma_weight[i][list][j][0] != pwt->chroma_weight[i][list][j][0] ||
+                            (int8_t)pwt->chroma_weight[i][list][j][1] != pwt->chroma_weight[i][list][j][1]) {
+                            pwt->chroma_weight[i][list][j][0] = chroma_def;
+                            pwt->chroma_weight[i][list][j][1] = 0;
+                            goto out_range_weight;
+                        }
                         if (pwt->chroma_weight[i][list][j][0] != chroma_def ||
                             pwt->chroma_weight[i][list][j][1] != 0) {
                             pwt->use_weight_chroma        = 1;
@@ -82,11 +103,15 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
             }
 
             // for MBAFF
-            pwt->luma_weight[16 + 2 * i][list][0] = pwt->luma_weight[16 + 2 * i + 1][list][0] = pwt->luma_weight[i][list][0];
-            pwt->luma_weight[16 + 2 * i][list][1] = pwt->luma_weight[16 + 2 * i + 1][list][1] = pwt->luma_weight[i][list][1];
-            for (j = 0; j < 2; j++) {
-                pwt->chroma_weight[16 + 2 * i][list][j][0] = pwt->chroma_weight[16 + 2 * i + 1][list][j][0] = pwt->chroma_weight[i][list][j][0];
-                pwt->chroma_weight[16 + 2 * i][list][j][1] = pwt->chroma_weight[16 + 2 * i + 1][list][j][1] = pwt->chroma_weight[i][list][j][1];
+            if (picture_structure == PICT_FRAME) {
+                pwt->luma_weight[16 + 2 * i][list][0] = pwt->luma_weight[16 + 2 * i + 1][list][0] = pwt->luma_weight[i][list][0];
+                pwt->luma_weight[16 + 2 * i][list][1] = pwt->luma_weight[16 + 2 * i + 1][list][1] = pwt->luma_weight[i][list][1];
+                if (sps->chroma_format_idc) {
+                    for (j = 0; j < 2; j++) {
+                        pwt->chroma_weight[16 + 2 * i][list][j][0] = pwt->chroma_weight[16 + 2 * i + 1][list][j][0] = pwt->chroma_weight[i][list][j][0];
+                        pwt->chroma_weight[16 + 2 * i][list][j][1] = pwt->chroma_weight[16 + 2 * i + 1][list][j][1] = pwt->chroma_weight[i][list][j][1];
+                    }
+                }
             }
         }
         if (slice_type_nos != AV_PICTURE_TYPE_B)
@@ -94,6 +119,9 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
     }
     pwt->use_weight = pwt->use_weight || pwt->use_weight_chroma;
     return 0;
+out_range_weight:
+    avpriv_request_sample(logctx, "Out of range weight");
+    return AVERROR_INVALIDDATA;
 }
 
 /**
@@ -116,7 +144,7 @@ int ff_h264_check_intra4x4_pred_mode(int8_t *pred_mode_cache, void *logctx,
             int status = top[pred_mode_cache[scan8[0] + i]];
             if (status < 0) {
                 av_log(logctx, AV_LOG_ERROR,
-                       "top block unavailable for requested intra4x4 mode %d\n",
+                       "top block unavailable for requested intra mode %d\n",
                        status);
                 return AVERROR_INVALIDDATA;
             } else if (status) {
@@ -172,17 +200,17 @@ int ff_h264_check_intra_pred_mode(void *logctx, int top_samples_available,
 
     if ((left_samples_available & 0x8080) != 0x8080) {
         mode = left[mode];
+        if (mode < 0) {
+            av_log(logctx, AV_LOG_ERROR,
+                   "left block unavailable for requested intra mode\n");
+            return AVERROR_INVALIDDATA;
+        }
         if (is_chroma && (left_samples_available & 0x8080)) {
             // mad cow disease mode, aka MBAFF + constrained_intra_pred
             mode = ALZHEIMER_DC_L0T_PRED8x8 +
                    (!(left_samples_available & 0x8000)) +
                    2 * (mode == DC_128_PRED8x8);
         }
-        if (mode < 0) {
-            av_log(logctx, AV_LOG_ERROR,
-                   "left block unavailable for requested intra mode\n");
-            return AVERROR_INVALIDDATA;
-        }
     }
 
     return mode;
@@ -190,43 +218,52 @@ int ff_h264_check_intra_pred_mode(void *logctx, int top_samples_available,
 
 int ff_h264_parse_ref_count(int *plist_count, int ref_count[2],
                             GetBitContext *gb, const PPS *pps,
-                            int slice_type_nos, int picture_structure)
+                            int slice_type_nos, int picture_structure, void *logctx)
 {
     int list_count;
-    int num_ref_idx_active_override_flag, max_refs;
+    int num_ref_idx_active_override_flag;
 
     // set defaults, might be overridden a few lines later
     ref_count[0] = pps->ref_count[0];
     ref_count[1] = pps->ref_count[1];
 
     if (slice_type_nos != AV_PICTURE_TYPE_I) {
+        unsigned max[2];
+        max[0] = max[1] = picture_structure == PICT_FRAME ? 15 : 31;
+
         num_ref_idx_active_override_flag = get_bits1(gb);
 
         if (num_ref_idx_active_override_flag) {
             ref_count[0] = get_ue_golomb(gb) + 1;
-            if (ref_count[0] < 1)
-                goto fail;
             if (slice_type_nos == AV_PICTURE_TYPE_B) {
                 ref_count[1] = get_ue_golomb(gb) + 1;
-                if (ref_count[1] < 1)
-                    goto fail;
-            }
+            } else
+                // full range is spec-ok in this case, even for frames
+                ref_count[1] = 1;
         }
 
         if (slice_type_nos == AV_PICTURE_TYPE_B)
             list_count = 2;
         else
             list_count = 1;
+
+        if (ref_count[0] - 1 > max[0] || (list_count == 2 && (ref_count[1] - 1 > max[1]))) {
+            av_log(logctx, AV_LOG_ERROR, "reference overflow %u > %u or %u > %u\n",
+                   ref_count[0] - 1, max[0], ref_count[1] - 1, max[1]);
+            ref_count[0] = ref_count[1] = 0;
+            *plist_count = 0;
+            goto fail;
+        } else if (ref_count[1] - 1 > max[1]) {
+            av_log(logctx, AV_LOG_DEBUG, "reference overflow %u > %u \n",
+                   ref_count[1] - 1, max[1]);
+            ref_count[1] = 0;
+        }
+
     } else {
         list_count   = 0;
         ref_count[0] = ref_count[1] = 0;
     }
 
-    max_refs = picture_structure == PICT_FRAME ? 16 : 32;
-
-    if (ref_count[0] > max_refs || ref_count[1] > max_refs)
-        goto fail;
-
     *plist_count = list_count;
 
     return 0;
@@ -242,7 +279,7 @@ int ff_h264_init_poc(int pic_field_poc[2], int *pic_poc,
                      int picture_structure, int nal_ref_idc)
 {
     const int max_frame_num = 1 << sps->log2_max_frame_num;
-    int field_poc[2];
+    int64_t field_poc[2];
 
     pc->frame_num_offset = pc->prev_frame_num_offset;
     if (pc->frame_num < pc->prev_frame_num)
@@ -308,6 +345,10 @@ int ff_h264_init_poc(int pic_field_poc[2], int *pic_poc,
         field_poc[1] = poc;
     }
 
+    if (   field_poc[0] != (int)field_poc[0]
+        || field_poc[1] != (int)field_poc[1])
+        return AVERROR_INVALIDDATA;
+
     if (picture_structure != PICT_BOTTOM_FIELD)
         pic_field_poc[0] = field_poc[0];
     if (picture_structure != PICT_TOP_FIELD)
@@ -323,15 +364,17 @@ static int decode_extradata_ps(const uint8_t *data, int size, H264ParamSets *ps,
     H2645Packet pkt = { 0 };
     int i, ret = 0;
 
-    ret = ff_h2645_packet_split(&pkt, data, size, logctx, is_avc, 2, AV_CODEC_ID_H264);
-    if (ret < 0)
+    ret = ff_h2645_packet_split(&pkt, data, size, logctx, is_avc, 2, AV_CODEC_ID_H264, 1, 0);
+    if (ret < 0) {
+        ret = 0;
         goto fail;
+    }
 
     for (i = 0; i < pkt.nb_nals; i++) {
         H2645NAL *nal = &pkt.nals[i];
         switch (nal->type) {
         case H264_NAL_SPS:
-            ret = ff_h264_decode_seq_parameter_set(&nal->gb, logctx, ps);
+            ret = ff_h264_decode_seq_parameter_set(&nal->gb, logctx, ps, 0);
             if (ret < 0)
                 goto fail;
             break;
@@ -394,10 +437,9 @@ static int decode_extradata_ps_mp4(const uint8_t *buf, int buf_size, H264ParamSe
         escaped_buf_size = bytestream2_tell_p(&pbc);
         AV_WB16(escaped_buf, escaped_buf_size - 2);
 
-        ret = decode_extradata_ps(escaped_buf, escaped_buf_size, ps, 1, logctx);
+        (void)decode_extradata_ps(escaped_buf, escaped_buf_size, ps, 1, logctx);
+        // lorex.mp4 decodes ok even with extradata decoding failing
         av_freep(&escaped_buf);
-        if (ret < 0)
-            return ret;
     }
 
     return 0;
@@ -409,6 +451,9 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
 {
     int ret;
 
+    if (!data || size <= 0)
+        return -1;
+
     if (data[0] == 1) {
         int i, cnt, nalsize;
         const uint8_t *p = data;
@@ -425,7 +470,7 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
         p  += 6;
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if (p - data + nalsize > size)
+            if (nalsize > size - (p - data))
                 return AVERROR_INVALIDDATA;
             ret = decode_extradata_ps_mp4(p, nalsize, ps, err_recognition, logctx);
             if (ret < 0) {
@@ -439,7 +484,7 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
         cnt = *(p++); // Number of pps
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if (p - data + nalsize > size)
+            if (nalsize > size - (p - data))
                 return AVERROR_INVALIDDATA;
             ret = decode_extradata_ps_mp4(p, nalsize, ps, err_recognition, logctx);
             if (ret < 0) {
@@ -457,7 +502,7 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
         if (ret < 0)
             return ret;
     }
-    return 0;
+    return size;
 }
 
 /**
diff --git a/libavcodec/h264_parse.h b/libavcodec/h264_parse.h
index 5c6024d..4d01620 100644
--- a/libavcodec/h264_parse.h
+++ b/libavcodec/h264_parse.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,7 +55,8 @@ typedef struct H264POCContext {
 
 int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
                               const int *ref_count, int slice_type_nos,
-                              H264PredWeightTable *pwt);
+                              H264PredWeightTable *pwt,
+                              int picture_structure, void *logctx);
 
 /**
  * Check if the top & left blocks are available if needed & change the
@@ -74,7 +75,7 @@ int ff_h264_check_intra_pred_mode(void *logctx, int top_samples_available,
 
 int ff_h264_parse_ref_count(int *plist_count, int ref_count[2],
                             GetBitContext *gb, const PPS *pps,
-                            int slice_type_nos, int picture_structure);
+                            int slice_type_nos, int picture_structure, void *logctx);
 
 int ff_h264_init_poc(int pic_field_poc[2], int *pic_poc,
                      const SPS *sps, H264POCContext *poc,
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index 0bb78e0..5f9a9c4 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... parser
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include <assert.h>
 #include <stdint.h>
 
@@ -36,7 +38,7 @@
 
 #include "avcodec.h"
 #include "get_bits.h"
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "h264.h"
 #include "h264_sei.h"
 #include "h264_ps.h"
@@ -55,24 +57,47 @@ typedef struct H264ParseContext {
     int nal_length_size;
     int got_first;
     int picture_structure;
+    uint8_t parse_history[6];
+    int parse_history_count;
+    int parse_last_mb;
+    int64_t reference_dts;
+    int last_frame_num, last_picture_structure;
 } H264ParseContext;
 
 
 static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
-                               int buf_size)
+                               int buf_size, void *logctx)
 {
-    int i;
+    int i, j;
     uint32_t state;
     ParseContext *pc = &p->pc;
+
+    int next_avc = p->is_avc ? 0 : buf_size;
 //    mb_addr= pc->mb_addr - 1;
     state = pc->state;
     if (state > 13)
         state = 7;
 
+    if (p->is_avc && !p->nal_length_size)
+        av_log(logctx, AV_LOG_ERROR, "AVC-parser: nal length size invalid\n");
+
     for (i = 0; i < buf_size; i++) {
+        if (i >= next_avc) {
+            int nalsize = 0;
+            i = next_avc;
+            for (j = 0; j < p->nal_length_size; j++)
+                nalsize = (nalsize << 8) | buf[i++];
+            if (nalsize <= 0 || nalsize > buf_size - i) {
+                av_log(logctx, AV_LOG_ERROR, "AVC-parser: nal size %d remaining %d\n", nalsize, buf_size - i);
+                return buf_size;
+            }
+            next_avc = i + nalsize;
+            state    = 5;
+        }
+
         if (state == 7) {
-            i += p->h264dsp.startcode_find_candidate(buf + i, buf_size - i);
-            if (i < buf_size)
+            i += p->h264dsp.startcode_find_candidate(buf + i, next_avc - i);
+            if (i < next_avc)
                 state = 2;
         } else if (state <= 2) {
             if (buf[i] == 1)
@@ -91,31 +116,47 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
                 }
             } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
                        nalu_type == H264_NAL_IDR_SLICE) {
+                state += 8;
+                continue;
+            }
+            state = 7;
+        } else {
+            unsigned int mb, last_mb = p->parse_last_mb;
+            GetBitContext gb;
+            p->parse_history[p->parse_history_count++] = buf[i];
+
+            init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+            mb= get_ue_golomb_long(&gb);
+            if (get_bits_left(&gb) > 0 || p->parse_history_count > 5) {
+                p->parse_last_mb = mb;
                 if (pc->frame_start_found) {
-                    state += 8;
-                    continue;
+                    if (mb <= last_mb) {
+                        i -= p->parse_history_count - 1;
+                        p->parse_history_count = 0;
+                        goto found;
+                    }
                 } else
                     pc->frame_start_found = 1;
+                p->parse_history_count = 0;
+                state = 7;
             }
-            state = 7;
-        } else {
-            // first_mb_in_slice is 0, probably the first nal of a new slice
-            if (buf[i] & 0x80)
-                goto found;
-            state = 7;
         }
     }
     pc->state = state;
+    if (p->is_avc)
+        return next_avc;
     return END_NOT_FOUND;
 
 found:
     pc->state             = 7;
     pc->frame_start_found = 0;
+    if (p->is_avc)
+        return next_avc;
     return i - (state & 5);
 }
 
 static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
-                           AVCodecContext *avctx)
+                           void *logctx)
 {
     H264PredWeightTable pwt;
     int slice_type_nos = s->pict_type & 3;
@@ -130,7 +171,7 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
         get_bits1(gb); // direct_spatial_mv_pred
 
     if (ff_h264_parse_ref_count(&list_count, ref_count, gb, p->ps.pps,
-                                slice_type_nos, p->picture_structure) < 0)
+                                slice_type_nos, p->picture_structure, logctx) < 0)
         return AVERROR_INVALIDDATA;
 
     if (slice_type_nos != AV_PICTURE_TYPE_I) {
@@ -142,9 +183,9 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
                     unsigned int reordering_of_pic_nums_idc = get_ue_golomb_31(gb);
 
                     if (reordering_of_pic_nums_idc < 3)
-                        get_ue_golomb(gb);
+                        get_ue_golomb_long(gb);
                     else if (reordering_of_pic_nums_idc > 3) {
-                        av_log(avctx, AV_LOG_ERROR,
+                        av_log(logctx, AV_LOG_ERROR,
                                "illegal reordering_of_pic_nums_idc %d\n",
                                reordering_of_pic_nums_idc);
                         return AVERROR_INVALIDDATA;
@@ -152,7 +193,7 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
                         break;
 
                     if (index >= ref_count[list]) {
-                        av_log(avctx, AV_LOG_ERROR,
+                        av_log(logctx, AV_LOG_ERROR,
                                "reference count %d overflow\n", index);
                         return AVERROR_INVALIDDATA;
                     }
@@ -164,14 +205,14 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
     if ((p->ps.pps->weighted_pred && slice_type_nos == AV_PICTURE_TYPE_P) ||
         (p->ps.pps->weighted_bipred_idc == 1 && slice_type_nos == AV_PICTURE_TYPE_B))
         ff_h264_pred_weight_table(gb, p->ps.sps, ref_count, slice_type_nos,
-                                  &pwt);
+                                  &pwt, p->picture_structure, logctx);
 
     if (get_bits1(gb)) { // adaptive_ref_pic_marking_mode_flag
         int i;
         for (i = 0; i < MAX_MMCO_COUNT; i++) {
             MMCOOpcode opcode = get_ue_golomb_31(gb);
             if (opcode > (unsigned) MMCO_LONG) {
-                av_log(avctx, AV_LOG_ERROR,
+                av_log(logctx, AV_LOG_ERROR,
                        "illegal memory management control operation %d\n",
                        opcode);
                 return AVERROR_INVALIDDATA;
@@ -182,7 +223,7 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
                 return 1;
 
             if (opcode == MMCO_SHORT2UNUSED || opcode == MMCO_SHORT2LONG)
-                get_ue_golomb(gb);
+                get_ue_golomb_long(gb); // difference_of_pic_nums_minus1
             if (opcode == MMCO_SHORT2LONG || opcode == MMCO_LONG2UNUSED ||
                 opcode == MMCO_LONG || opcode == MMCO_SET_MAX_LONG)
                 get_ue_golomb_31(gb);
@@ -202,16 +243,16 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
  */
 static inline int parse_nal_units(AVCodecParserContext *s,
                                   AVCodecContext *avctx,
-                                  const uint8_t *buf, int buf_size)
+                                  const uint8_t * const buf, int buf_size)
 {
     H264ParseContext *p = s->priv_data;
-    const uint8_t *buf_end = buf + buf_size;
-
+    H2645RBSP rbsp = { NULL };
     H2645NAL nal = { NULL };
-
+    int buf_index, next_avc;
     unsigned int pps_id;
     unsigned int slice_type;
     int state = -1, got_reset = 0;
+    int q264 = buf_size >=4 && !memcmp("Q264", buf, 4);
     int field_poc[2];
     int ret;
 
@@ -221,18 +262,36 @@ static inline int parse_nal_units(AVCodecParserContext *s,
     s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
 
     ff_h264_sei_uninit(&p->sei);
+    p->sei.frame_packing.arrangement_cancel_flag = -1;
 
     if (!buf_size)
         return 0;
 
+    av_fast_padded_malloc(&rbsp.rbsp_buffer, &rbsp.rbsp_buffer_alloc_size, buf_size);
+    if (!rbsp.rbsp_buffer)
+        return AVERROR(ENOMEM);
+
+    buf_index     = 0;
+    next_avc      = p->is_avc ? 0 : buf_size;
     for (;;) {
         const SPS *sps;
-        int src_length, consumed;
-        buf = avpriv_find_start_code(buf, buf_end, &state);
-        if (buf >= buf_end)
-            break;
-        --buf;
-        src_length = buf_end - buf;
+        int src_length, consumed, nalsize = 0;
+
+        if (buf_index >= next_avc) {
+            nalsize = get_nalsize(p->nal_length_size, buf, buf_size, &buf_index, avctx);
+            if (nalsize < 0)
+                break;
+            next_avc = buf_index + nalsize;
+        } else {
+            buf_index = find_start_code(buf, buf_size, buf_index, next_avc);
+            if (buf_index >= buf_size)
+                break;
+            if (buf_index >= next_avc)
+                continue;
+        }
+        src_length = next_avc - buf_index;
+
+        state = buf[buf_index];
         switch (state & 0x1f) {
         case H264_NAL_SLICE:
         case H264_NAL_IDR_SLICE:
@@ -249,12 +308,13 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             }
             break;
         }
-
-        consumed = ff_h2645_extract_rbsp(buf, src_length, &nal);
+        consumed = ff_h2645_extract_rbsp(buf + buf_index, src_length, &rbsp, &nal, 1);
         if (consumed < 0)
             break;
 
-        ret = init_get_bits(&nal.gb, nal.data, nal.size * 8);
+        buf_index += consumed;
+
+        ret = init_get_bits8(&nal.gb, nal.data, nal.size);
         if (ret < 0)
             goto fail;
         get_bits1(&nal.gb);
@@ -263,7 +323,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
 
         switch (nal.type) {
         case H264_NAL_SPS:
-            ff_h264_decode_seq_parameter_set(&nal.gb, avctx, &p->ps);
+            ff_h264_decode_seq_parameter_set(&nal.gb, avctx, &p->ps, 0);
             break;
         case H264_NAL_PPS:
             ff_h264_decode_picture_parameter_set(&nal.gb, avctx, &p->ps,
@@ -281,7 +341,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             p->poc.prev_poc_lsb          = 0;
         /* fall through */
         case H264_NAL_SLICE:
-            get_ue_golomb(&nal.gb);  // skip first_mb_in_slice
+            get_ue_golomb_long(&nal.gb);  // skip first_mb_in_slice
             slice_type   = get_ue_golomb_31(&nal.gb);
             s->pict_type = ff_h264_golomb_to_pict_type[slice_type % 5];
             if (p->sei.recovery_point.recovery_frame_cnt >= 0) {
@@ -299,16 +359,33 @@ static inline int parse_nal_units(AVCodecParserContext *s,
                        "non-existing PPS %u referenced\n", pps_id);
                 goto fail;
             }
-            p->ps.pps = (const PPS*)p->ps.pps_list[pps_id]->data;
+
+            av_buffer_unref(&p->ps.pps_ref);
+            av_buffer_unref(&p->ps.sps_ref);
+            p->ps.pps = NULL;
+            p->ps.sps = NULL;
+            p->ps.pps_ref = av_buffer_ref(p->ps.pps_list[pps_id]);
+            if (!p->ps.pps_ref)
+                goto fail;
+            p->ps.pps = (const PPS*)p->ps.pps_ref->data;
+
             if (!p->ps.sps_list[p->ps.pps->sps_id]) {
                 av_log(avctx, AV_LOG_ERROR,
                        "non-existing SPS %u referenced\n", p->ps.pps->sps_id);
                 goto fail;
             }
-            p->ps.sps = (SPS*)p->ps.sps_list[p->ps.pps->sps_id]->data;
+
+            p->ps.sps_ref = av_buffer_ref(p->ps.sps_list[p->ps.pps->sps_id]);
+            if (!p->ps.sps_ref)
+                goto fail;
+            p->ps.sps = (const SPS*)p->ps.sps_ref->data;
 
             sps = p->ps.sps;
 
+            // heuristic to detect non marked keyframes
+            if (p->ps.sps->ref_frame_count <= 1 && p->ps.pps->ref_count[0] <= 1 && s->pict_type == AV_PICTURE_TYPE_I)
+                s->key_frame = 1;
+
             p->poc.frame_num = get_bits(&nal.gb, sps->log2_max_frame_num);
 
             s->coded_width  = 16 * sps->mb_width;
@@ -354,7 +431,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             }
 
             if (nal.type == H264_NAL_IDR_SLICE)
-                get_ue_golomb(&nal.gb); /* idr_pic_id */
+                get_ue_golomb_long(&nal.gb); /* idr_pic_id */
             if (sps->poc_type == 0) {
                 p->poc.poc_lsb = get_bits(&nal.gb, sps->log2_max_poc_lsb);
 
@@ -375,8 +452,10 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             /* Decode POC of this picture.
              * The prev_ values needed for decoding POC of the next picture are not set here. */
             field_poc[0] = field_poc[1] = INT_MAX;
-            ff_h264_init_poc(field_poc, &s->output_picture_number, sps,
+            ret = ff_h264_init_poc(field_poc, &s->output_picture_number, sps,
                              &p->poc, p->picture_structure, nal.ref_idc);
+            if (ret < 0)
+                goto fail;
 
             /* Continue parsing to check if MMCO_RESET is present.
              * FIXME: MMCO_RESET could appear in non-first slice.
@@ -460,18 +539,33 @@ static inline int parse_nal_units(AVCodecParserContext *s,
                     s->picture_structure = AV_PICTURE_STRUCTURE_TOP_FIELD;
                 else
                     s->picture_structure = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
-                s->field_order = AV_FIELD_UNKNOWN;
+                if (p->poc.frame_num == p->last_frame_num &&
+                    p->last_picture_structure != AV_PICTURE_STRUCTURE_UNKNOWN &&
+                    p->last_picture_structure != AV_PICTURE_STRUCTURE_FRAME &&
+                    p->last_picture_structure != s->picture_structure) {
+                    if (p->last_picture_structure == AV_PICTURE_STRUCTURE_TOP_FIELD)
+                        s->field_order = AV_FIELD_TT;
+                    else
+                        s->field_order = AV_FIELD_BB;
+                } else {
+                    s->field_order = AV_FIELD_UNKNOWN;
+                }
+                p->last_picture_structure = s->picture_structure;
+                p->last_frame_num = p->poc.frame_num;
             }
 
-            av_freep(&nal.rbsp_buffer);
+            av_freep(&rbsp.rbsp_buffer);
             return 0; /* no need to evaluate the rest */
         }
-        buf += consumed;
+    }
+    if (q264) {
+        av_freep(&rbsp.rbsp_buffer);
+        return 0;
     }
     /* didn't find a picture! */
-    av_log(avctx, AV_LOG_ERROR, "missing picture in access unit\n");
+    av_log(avctx, AV_LOG_ERROR, "missing picture in access unit with size %d\n", buf_size);
 fail:
-    av_freep(&nal.rbsp_buffer);
+    av_freep(&rbsp.rbsp_buffer);
     return -1;
 }
 
@@ -496,7 +590,7 @@ static int h264_parse(AVCodecParserContext *s,
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         next = buf_size;
     } else {
-        next = h264_find_frame_end(p, buf, buf_size);
+        next = h264_find_frame_end(p, buf, buf_size, avctx);
 
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf      = NULL;
@@ -505,13 +599,15 @@ static int h264_parse(AVCodecParserContext *s,
         }
 
         if (next < 0 && next != END_NOT_FOUND) {
-            assert(pc->last_index + next >= 0);
-            h264_find_frame_end(p, &pc->buffer[pc->last_index + next], -next); // update state
+            av_assert1(pc->last_index + next >= 0);
+            h264_find_frame_end(p, &pc->buffer[pc->last_index + next], -next, avctx); // update state
         }
     }
 
     parse_nal_units(s, avctx, buf, buf_size);
 
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
     if (p->sei.picture_timing.cpb_removal_delay >= 0) {
         s->dts_sync_point    = p->sei.buffering_period.present;
         s->dts_ref_dts_delta = p->sei.picture_timing.cpb_removal_delay;
@@ -526,6 +622,26 @@ static int h264_parse(AVCodecParserContext *s,
         s->flags &= PARSER_FLAG_COMPLETE_FRAMES;
     }
 
+    if (s->dts_sync_point >= 0) {
+        int64_t den = avctx->time_base.den * (int64_t)avctx->pkt_timebase.num;
+        if (den > 0) {
+            int64_t num = avctx->time_base.num * (int64_t)avctx->pkt_timebase.den;
+            if (s->dts != AV_NOPTS_VALUE) {
+                // got DTS from the stream, update reference timestamp
+                p->reference_dts = s->dts - av_rescale(s->dts_ref_dts_delta, num, den);
+            } else if (p->reference_dts != AV_NOPTS_VALUE) {
+                // compute DTS based on reference timestamp
+                s->dts = p->reference_dts + av_rescale(s->dts_ref_dts_delta, num, den);
+            }
+
+            if (p->reference_dts != AV_NOPTS_VALUE && s->pts == AV_NOPTS_VALUE)
+                s->pts = s->dts + av_rescale(s->pts_dts_delta, num, den);
+
+            if (s->dts_sync_point > 0)
+                p->reference_dts = s->dts; // new reference
+        }
+    }
+
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
     return next;
@@ -534,31 +650,37 @@ static int h264_parse(AVCodecParserContext *s,
 static int h264_split(AVCodecContext *avctx,
                       const uint8_t *buf, int buf_size)
 {
-    int i;
     uint32_t state = -1;
     int has_sps    = 0;
+    int has_pps    = 0;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
+    int nalu_type;
 
-    for (i = 0; i <= buf_size; i++) {
-        if ((state & 0xFFFFFF1F) == 0x107)
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if ((state & 0xFFFFFF00) != 0x100)
+            break;
+        nalu_type = state & 0x1F;
+        if (nalu_type == H264_NAL_SPS) {
             has_sps = 1;
-        /*  if((state&0xFFFFFF1F) == 0x101 ||
-         *     (state&0xFFFFFF1F) == 0x102 ||
-         *     (state&0xFFFFFF1F) == 0x105) {
+        } else if (nalu_type == H264_NAL_PPS)
+            has_pps = 1;
+        /* else if (nalu_type == 0x01 ||
+         *     nalu_type == 0x02 ||
+         *     nalu_type == 0x05) {
          *  }
          */
-        if ((state & 0xFFFFFF00) == 0x100 && (state & 0xFFFFFF1F) != 0x106 &&
-            (state & 0xFFFFFF1F) != 0x107 && (state & 0xFFFFFF1F) != 0x108 &&
-            (state & 0xFFFFFF1F) != 0x109 && (state & 0xFFFFFF1F) != 0x10d &&
-            (state & 0xFFFFFF1F) != 0x10f) {
+        else if ((nalu_type != H264_NAL_SEI || has_pps) &&
+                  nalu_type != H264_NAL_AUD && nalu_type != H264_NAL_SPS_EXT &&
+                  nalu_type != 0x0f) {
             if (has_sps) {
-                while (i > 4 && buf[i - 5] == 0)
-                    i--;
-                return i - 4;
+                while (ptr - 4 > buf && ptr[-5] == 0)
+                    ptr--;
+                return ptr - 4 - buf;
             }
         }
-        if (i < buf_size)
-            state = (state << 8) | buf[i];
     }
+
     return 0;
 }
 
@@ -566,23 +688,19 @@ static void h264_close(AVCodecParserContext *s)
 {
     H264ParseContext *p = s->priv_data;
     ParseContext *pc = &p->pc;
-    int i;
 
-    av_free(pc->buffer);
+    av_freep(&pc->buffer);
 
     ff_h264_sei_uninit(&p->sei);
-
-    for (i = 0; i < FF_ARRAY_ELEMS(p->ps.sps_list); i++)
-        av_buffer_unref(&p->ps.sps_list[i]);
-
-    for (i = 0; i < FF_ARRAY_ELEMS(p->ps.pps_list); i++)
-        av_buffer_unref(&p->ps.pps_list[i]);
+    ff_h264_ps_uninit(&p->ps);
 }
 
 static av_cold int init(AVCodecParserContext *s)
 {
     H264ParseContext *p = s->priv_data;
 
+    p->reference_dts = AV_NOPTS_VALUE;
+    p->last_frame_num = INT_MAX;
     ff_h264dsp_init(&p->h264dsp, 8, 1);
     return 0;
 }
diff --git a/libavcodec/h264_picture.c b/libavcodec/h264_picture.c
index 24ba79d..e833835 100644
--- a/libavcodec/h264_picture.c
+++ b/libavcodec/h264_picture.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,8 +69,8 @@ int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src)
 
     av_assert0(!dst->f->buf[0]);
     av_assert0(src->f->buf[0]);
+    av_assert0(src->tf.f == src->f);
 
-    src->tf.f = src->f;
     dst->tf.f = dst->f;
     ret = ff_thread_ref_frame(&dst->tf, &src->tf);
     if (ret < 0)
@@ -78,24 +78,30 @@ int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src)
 
     dst->qscale_table_buf = av_buffer_ref(src->qscale_table_buf);
     dst->mb_type_buf      = av_buffer_ref(src->mb_type_buf);
-    if (!dst->qscale_table_buf || !dst->mb_type_buf)
+    if (!dst->qscale_table_buf || !dst->mb_type_buf) {
+        ret = AVERROR(ENOMEM);
         goto fail;
+    }
     dst->qscale_table = src->qscale_table;
     dst->mb_type      = src->mb_type;
 
     for (i = 0; i < 2; i++) {
         dst->motion_val_buf[i] = av_buffer_ref(src->motion_val_buf[i]);
         dst->ref_index_buf[i]  = av_buffer_ref(src->ref_index_buf[i]);
-        if (!dst->motion_val_buf[i] || !dst->ref_index_buf[i])
+        if (!dst->motion_val_buf[i] || !dst->ref_index_buf[i]) {
+            ret = AVERROR(ENOMEM);
             goto fail;
+        }
         dst->motion_val[i] = src->motion_val[i];
         dst->ref_index[i]  = src->ref_index[i];
     }
 
     if (src->hwaccel_picture_private) {
         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
-        if (!dst->hwaccel_priv_buf)
+        if (!dst->hwaccel_priv_buf) {
+            ret = AVERROR(ENOMEM);
             goto fail;
+        }
         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
     }
 
@@ -108,12 +114,13 @@ int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src)
     dst->poc           = src->poc;
     dst->frame_num     = src->frame_num;
     dst->mmco_reset    = src->mmco_reset;
-    dst->pic_id        = src->pic_id;
     dst->long_ref      = src->long_ref;
     dst->mbaff         = src->mbaff;
     dst->field_picture = src->field_picture;
     dst->reference     = src->reference;
     dst->recovered     = src->recovered;
+    dst->invalid_gap   = src->invalid_gap;
+    dst->sei_recovery_frame_cnt = src->sei_recovery_frame_cnt;
 
     return 0;
 fail:
@@ -121,11 +128,13 @@ fail:
     return ret;
 }
 
-#if CONFIG_ERROR_RESILIENCE
-static void h264_set_erpic(ERPicture *dst, H264Picture *src)
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src)
 {
+#if CONFIG_ERROR_RESILIENCE
     int i;
 
+    memset(dst, 0, sizeof(*dst));
+
     if (!src)
         return;
 
@@ -139,8 +148,8 @@ static void h264_set_erpic(ERPicture *dst, H264Picture *src)
 
     dst->mb_type = src->mb_type;
     dst->field_picture = src->field_picture;
-}
 #endif /* CONFIG_ERROR_RESILIENCE */
+}
 
 int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
 {
@@ -148,10 +157,6 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
     int err = 0;
     h->mb_y = 0;
 
-    if (!in_setup && !h->droppable)
-        ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
-                                  h->picture_structure == PICT_BOTTOM_FIELD);
-
     if (in_setup || !(avctx->active_thread_type & FF_THREAD_FRAME)) {
         if (!h->droppable) {
             err = ff_h264_execute_ref_pic_marking(h);
@@ -163,38 +168,18 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
     }
 
     if (avctx->hwaccel) {
-        if (avctx->hwaccel->end_frame(avctx) < 0)
+        err = avctx->hwaccel->end_frame(avctx);
+        if (err < 0)
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
     }
 
-#if CONFIG_ERROR_RESILIENCE
-    /*
-     * FIXME: Error handling code does not seem to support interlaced
-     * when slices span multiple rows
-     * The ff_er_add_slice calls don't work right for bottom
-     * fields; they cause massive erroneous error concealing
-     * Error marking covers both fields (top and bottom).
-     * This causes a mismatched s->error_count
-     * and a bad error table. Further, the error count goes to
-     * INT_MAX when called for bottom field, because mb_y is
-     * past end by one (callers fault) and resync_mb_y != 0
-     * causes problems for the first MB line, too.
-     */
-    if (!FIELD_PICTURE(h) && h->enable_er) {
-        h264_set_erpic(&sl->er.cur_pic, h->cur_pic_ptr);
-        h264_set_erpic(&sl->er.last_pic,
-                       sl->ref_count[0] ? sl->ref_list[0][0].parent : NULL);
-        h264_set_erpic(&sl->er.next_pic,
-                       sl->ref_count[1] ? sl->ref_list[1][0].parent : NULL);
-        ff_er_frame_end(&sl->er);
-    }
-#endif /* CONFIG_ERROR_RESILIENCE */
-
+    if (!in_setup && !h->droppable)
+        ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                  h->picture_structure == PICT_BOTTOM_FIELD);
     emms_c();
 
     h->current_slice = 0;
-    h->field_started = 0;
 
     return err;
 }
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index b3a0e8a..17bfa78 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,39 +28,17 @@
 #include <inttypes.h>
 
 #include "libavutil/imgutils.h"
-
-#include "golomb_legacy.h"
 #include "internal.h"
 #include "mathops.h"
 #include "avcodec.h"
 #include "h264data.h"
 #include "h264_ps.h"
+#include "golomb.h"
 
-#define MAX_LOG2_MAX_FRAME_NUM    (12 + 4)
 #define MIN_LOG2_MAX_FRAME_NUM    4
 
 #define EXTENDED_SAR       255
 
-static const AVRational pixel_aspect[17] = {
-    {   0,  1 },
-    {   1,  1 },
-    {  12, 11 },
-    {  10, 11 },
-    {  16, 11 },
-    {  40, 33 },
-    {  24, 11 },
-    {  20, 11 },
-    {  32, 11 },
-    {  80, 33 },
-    {  18, 11 },
-    {  15, 11 },
-    {  64, 33 },
-    { 160, 99 },
-    {   4,  3 },
-    {   3,  2 },
-    {   2,  1 },
-};
-
 static const uint8_t default_scaling4[2][16] = {
     {  6, 13, 20, 28, 13, 20, 28, 32,
       20, 28, 32, 37, 28, 32, 37, 42 },
@@ -109,23 +87,20 @@ static const int level_max_dpb_mbs[][2] = {
 
 static void remove_pps(H264ParamSets *s, int id)
 {
-    if (s->pps_list[id] && s->pps == (const PPS*)s->pps_list[id]->data)
-        s->pps = NULL;
     av_buffer_unref(&s->pps_list[id]);
 }
 
 static void remove_sps(H264ParamSets *s, int id)
 {
+#if 0
     int i;
     if (s->sps_list[id]) {
-        if (s->sps == (SPS*)s->sps_list[id]->data)
-            s->sps = NULL;
-
         /* drop all PPS that depend on this SPS */
         for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
             if (s->pps_list[i] && ((PPS*)s->pps_list[i]->data)->sps_id == id)
                 remove_pps(s, i);
     }
+#endif
     av_buffer_unref(&s->sps_list[id]);
 }
 
@@ -168,8 +143,8 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
         if (aspect_ratio_idc == EXTENDED_SAR) {
             sps->sar.num = get_bits(gb, 16);
             sps->sar.den = get_bits(gb, 16);
-        } else if (aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)) {
-            sps->sar = pixel_aspect[aspect_ratio_idc];
+        } else if (aspect_ratio_idc < FF_ARRAY_ELEMS(ff_h264_pixel_aspect)) {
+            sps->sar = ff_h264_pixel_aspect[aspect_ratio_idc];
         } else {
             av_log(avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
             return AVERROR_INVALIDDATA;
@@ -210,15 +185,23 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
         get_ue_golomb(gb);  /* chroma_sample_location_type_bottom_field */
     }
 
+    if (show_bits1(gb) && get_bits_left(gb) < 10) {
+        av_log(avctx, AV_LOG_WARNING, "Truncated VUI\n");
+        return 0;
+    }
+
     sps->timing_info_present_flag = get_bits1(gb);
     if (sps->timing_info_present_flag) {
-        sps->num_units_in_tick = get_bits_long(gb, 32);
-        sps->time_scale        = get_bits_long(gb, 32);
-        if (!sps->num_units_in_tick || !sps->time_scale) {
+        unsigned num_units_in_tick = get_bits_long(gb, 32);
+        unsigned time_scale        = get_bits_long(gb, 32);
+        if (!num_units_in_tick || !time_scale) {
             av_log(avctx, AV_LOG_ERROR,
-                   "time_scale/num_units_in_tick invalid or unsupported (%"PRIu32"/%"PRIu32")\n",
-                   sps->time_scale, sps->num_units_in_tick);
-            return AVERROR_INVALIDDATA;
+                   "time_scale/num_units_in_tick invalid or unsupported (%u/%u)\n",
+                   time_scale, num_units_in_tick);
+            sps->timing_info_present_flag = 0;
+        } else {
+            sps->num_units_in_tick = num_units_in_tick;
+            sps->time_scale = time_scale;
         }
         sps->fixed_frame_rate_flag = get_bits1(gb);
     }
@@ -235,7 +218,8 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
         sps->vcl_hrd_parameters_present_flag)
         get_bits1(gb);     /* low_delay_hrd_flag */
     sps->pic_struct_present_flag = get_bits1(gb);
-
+    if (!get_bits_left(gb))
+        return 0;
     sps->bitstream_restriction_flag = get_bits1(gb);
     if (sps->bitstream_restriction_flag) {
         get_bits1(gb);     /* motion_vectors_over_pic_boundaries_flag */
@@ -260,16 +244,11 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
             return AVERROR_INVALIDDATA;
         }
     }
-    if (get_bits_left(gb) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Overread VUI by %d bits\n", -get_bits_left(gb));
-        return AVERROR_INVALIDDATA;
-    }
 
     return 0;
 }
 
-static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size,
+static int decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size,
                                 const uint8_t *jvt_list,
                                 const uint8_t *fallback_list)
 {
@@ -279,18 +258,26 @@ static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size,
         memcpy(factors, fallback_list, size * sizeof(uint8_t));
     else
         for (i = 0; i < size; i++) {
-            if (next)
-                next = (last + get_se_golomb(gb)) & 0xff;
+            if (next) {
+                int v = get_se_golomb(gb);
+                if (v < -128 || v > 127) {
+                    av_log(NULL, AV_LOG_ERROR, "delta scale %d is invalid\n", v);
+                    return AVERROR_INVALIDDATA;
+                }
+                next = (last + v) & 0xff;
+            }
             if (!i && !next) { /* matrix not written, we use the preset one */
                 memcpy(factors, jvt_list, size * sizeof(uint8_t));
                 break;
             }
             last = factors[scan[i]] = next ? next : last;
         }
+    return 0;
 }
 
-static void decode_scaling_matrices(GetBitContext *gb, SPS *sps,
-                                    PPS *pps, int is_sps,
+/* returns non zero if the provided SPS scaling matrix has been filled */
+static int decode_scaling_matrices(GetBitContext *gb, const SPS *sps,
+                                    const PPS *pps, int is_sps,
                                     uint8_t(*scaling_matrix4)[16],
                                     uint8_t(*scaling_matrix8)[64])
 {
@@ -301,37 +288,69 @@ static void decode_scaling_matrices(GetBitContext *gb, SPS *sps,
         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
         fallback_sps ? sps->scaling_matrix8[3] : default_scaling8[1]
     };
+    int ret = 0;
     if (get_bits1(gb)) {
-        sps->scaling_matrix_present |= is_sps;
-        decode_scaling_list(gb, scaling_matrix4[0], 16, default_scaling4[0], fallback[0]);        // Intra, Y
-        decode_scaling_list(gb, scaling_matrix4[1], 16, default_scaling4[0], scaling_matrix4[0]); // Intra, Cr
-        decode_scaling_list(gb, scaling_matrix4[2], 16, default_scaling4[0], scaling_matrix4[1]); // Intra, Cb
-        decode_scaling_list(gb, scaling_matrix4[3], 16, default_scaling4[1], fallback[1]);        // Inter, Y
-        decode_scaling_list(gb, scaling_matrix4[4], 16, default_scaling4[1], scaling_matrix4[3]); // Inter, Cr
-        decode_scaling_list(gb, scaling_matrix4[5], 16, default_scaling4[1], scaling_matrix4[4]); // Inter, Cb
+        ret |= decode_scaling_list(gb, scaling_matrix4[0], 16, default_scaling4[0], fallback[0]);        // Intra, Y
+        ret |= decode_scaling_list(gb, scaling_matrix4[1], 16, default_scaling4[0], scaling_matrix4[0]); // Intra, Cr
+        ret |= decode_scaling_list(gb, scaling_matrix4[2], 16, default_scaling4[0], scaling_matrix4[1]); // Intra, Cb
+        ret |= decode_scaling_list(gb, scaling_matrix4[3], 16, default_scaling4[1], fallback[1]);        // Inter, Y
+        ret |= decode_scaling_list(gb, scaling_matrix4[4], 16, default_scaling4[1], scaling_matrix4[3]); // Inter, Cr
+        ret |= decode_scaling_list(gb, scaling_matrix4[5], 16, default_scaling4[1], scaling_matrix4[4]); // Inter, Cb
         if (is_sps || pps->transform_8x8_mode) {
-            decode_scaling_list(gb, scaling_matrix8[0], 64, default_scaling8[0], fallback[2]); // Intra, Y
+            ret |= decode_scaling_list(gb, scaling_matrix8[0], 64, default_scaling8[0], fallback[2]); // Intra, Y
+            ret |= decode_scaling_list(gb, scaling_matrix8[3], 64, default_scaling8[1], fallback[3]); // Inter, Y
             if (sps->chroma_format_idc == 3) {
-                decode_scaling_list(gb, scaling_matrix8[1], 64, default_scaling8[0], scaling_matrix8[0]); // Intra, Cr
-                decode_scaling_list(gb, scaling_matrix8[2], 64, default_scaling8[0], scaling_matrix8[1]); // Intra, Cb
-            }
-            decode_scaling_list(gb, scaling_matrix8[3], 64, default_scaling8[1], fallback[3]); // Inter, Y
-            if (sps->chroma_format_idc == 3) {
-                decode_scaling_list(gb, scaling_matrix8[4], 64, default_scaling8[1], scaling_matrix8[3]); // Inter, Cr
-                decode_scaling_list(gb, scaling_matrix8[5], 64, default_scaling8[1], scaling_matrix8[4]); // Inter, Cb
+                ret |= decode_scaling_list(gb, scaling_matrix8[1], 64, default_scaling8[0], scaling_matrix8[0]); // Intra, Cr
+                ret |= decode_scaling_list(gb, scaling_matrix8[4], 64, default_scaling8[1], scaling_matrix8[3]); // Inter, Cr
+                ret |= decode_scaling_list(gb, scaling_matrix8[2], 64, default_scaling8[0], scaling_matrix8[1]); // Intra, Cb
+                ret |= decode_scaling_list(gb, scaling_matrix8[5], 64, default_scaling8[1], scaling_matrix8[4]); // Inter, Cb
             }
         }
+        if (!ret)
+            ret = is_sps;
     }
+
+    return ret;
+}
+
+void ff_h264_ps_uninit(H264ParamSets *ps)
+{
+    int i;
+
+    for (i = 0; i < MAX_SPS_COUNT; i++)
+        av_buffer_unref(&ps->sps_list[i]);
+
+    for (i = 0; i < MAX_PPS_COUNT; i++)
+        av_buffer_unref(&ps->pps_list[i]);
+
+    av_buffer_unref(&ps->sps_ref);
+    av_buffer_unref(&ps->pps_ref);
+
+    ps->pps = NULL;
+    ps->sps = NULL;
 }
 
 int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
-                                     H264ParamSets *ps)
+                                     H264ParamSets *ps, int ignore_truncation)
 {
     AVBufferRef *sps_buf;
     int profile_idc, level_idc, constraint_set_flags = 0;
     unsigned int sps_id;
     int i, log2_max_frame_num_minus4;
     SPS *sps;
+    int ret;
+
+    sps_buf = av_buffer_allocz(sizeof(*sps));
+    if (!sps_buf)
+        return AVERROR(ENOMEM);
+    sps = (SPS*)sps_buf->data;
+
+    sps->data_size = gb->buffer_end - gb->buffer;
+    if (sps->data_size > sizeof(sps->data)) {
+        av_log(avctx, AV_LOG_DEBUG, "Truncating likely oversized SPS\n");
+        sps->data_size = sizeof(sps->data);
+    }
+    memcpy(sps->data, gb->buffer, sps->data_size);
 
     profile_idc           = get_bits(gb, 8);
     constraint_set_flags |= get_bits1(gb) << 0;   // constraint_set0_flag
@@ -346,23 +365,20 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
 
     if (sps_id >= MAX_SPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "sps_id %u out of range\n", sps_id);
-        return AVERROR_INVALIDDATA;
+        goto fail;
     }
 
-    sps_buf = av_buffer_allocz(sizeof(*sps));
-    if (!sps_buf)
-        return AVERROR(ENOMEM);
-    sps = (SPS*)sps_buf->data;
-
     sps->sps_id               = sps_id;
     sps->time_offset_length   = 24;
     sps->profile_idc          = profile_idc;
     sps->constraint_set_flags = constraint_set_flags;
     sps->level_idc            = level_idc;
+    sps->full_range           = -1;
 
     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
     sps->scaling_matrix_present = 0;
+    sps->colorspace = 2; //AVCOL_SPC_UNSPECIFIED
 
     if (sps->profile_idc == 100 ||  // High profile
         sps->profile_idc == 110 ||  // High10 profile
@@ -376,12 +392,16 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
         sps->profile_idc == 138 ||  // Multiview Depth High profile (MVCD)
         sps->profile_idc == 144) {  // old High444 profile
         sps->chroma_format_idc = get_ue_golomb_31(gb);
-        if (sps->chroma_format_idc > 3) {
+        if (sps->chroma_format_idc > 3U) {
             avpriv_request_sample(avctx, "chroma_format_idc %u",
                                   sps->chroma_format_idc);
             goto fail;
         } else if (sps->chroma_format_idc == 3) {
             sps->residual_color_transform_flag = get_bits1(gb);
+            if (sps->residual_color_transform_flag) {
+                av_log(avctx, AV_LOG_ERROR, "separate color planes are not supported\n");
+                goto fail;
+            }
         }
         sps->bit_depth_luma   = get_ue_golomb(gb) + 8;
         sps->bit_depth_chroma = get_ue_golomb(gb) + 8;
@@ -390,9 +410,18 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                                   "Different chroma and luma bit depth");
             goto fail;
         }
+        if (sps->bit_depth_luma   < 8 || sps->bit_depth_luma   > 14 ||
+            sps->bit_depth_chroma < 8 || sps->bit_depth_chroma > 14) {
+            av_log(avctx, AV_LOG_ERROR, "illegal bit depth value (%d, %d)\n",
+                   sps->bit_depth_luma, sps->bit_depth_chroma);
+            goto fail;
+        }
         sps->transform_bypass = get_bits1(gb);
-        decode_scaling_matrices(gb, sps, NULL, 1,
-                                sps->scaling_matrix4, sps->scaling_matrix8);
+        ret = decode_scaling_matrices(gb, sps, NULL, 1,
+                                      sps->scaling_matrix4, sps->scaling_matrix8);
+        if (ret < 0)
+            goto fail;
+        sps->scaling_matrix_present |= ret;
     } else {
         sps->chroma_format_idc = 1;
         sps->bit_depth_luma    = 8;
@@ -412,7 +441,12 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     sps->poc_type = get_ue_golomb_31(gb);
 
     if (sps->poc_type == 0) { // FIXME #define
-        sps->log2_max_poc_lsb = get_ue_golomb(gb) + 4;
+        unsigned t = get_ue_golomb(gb);
+        if (t>12) {
+            av_log(avctx, AV_LOG_ERROR, "log2_max_poc_lsb (%d) is out of range\n", t);
+            goto fail;
+        }
+        sps->log2_max_poc_lsb = t + 4;
     } else if (sps->poc_type == 1) { // FIXME #define
         sps->delta_pic_order_always_zero_flag = get_bits1(gb);
         sps->offset_for_non_ref_pic           = get_se_golomb(gb);
@@ -434,6 +468,8 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     }
 
     sps->ref_frame_count = get_ue_golomb_31(gb);
+    if (avctx->codec_tag == MKTAG('S', 'M', 'V', '2'))
+        sps->ref_frame_count = FFMAX(2, sps->ref_frame_count);
     if (sps->ref_frame_count > MAX_DELAYED_PIC_COUNT) {
         av_log(avctx, AV_LOG_ERROR,
                "too many reference frames %d\n", sps->ref_frame_count);
@@ -445,12 +481,17 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
 
     sps->frame_mbs_only_flag = get_bits1(gb);
 
-    if (sps->mb_height >= INT_MAX / 2) {
+    if (sps->mb_height >= INT_MAX / 2U) {
         av_log(avctx, AV_LOG_ERROR, "height overflow\n");
         goto fail;
     }
     sps->mb_height *= 2 - sps->frame_mbs_only_flag;
 
+    if (!sps->frame_mbs_only_flag)
+        sps->mb_aff = get_bits1(gb);
+    else
+        sps->mb_aff = 0;
+
     if ((unsigned)sps->mb_width  >= INT_MAX / 16 ||
         (unsigned)sps->mb_height >= INT_MAX / 16 ||
         av_image_check_size(16 * sps->mb_width,
@@ -459,17 +500,7 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
         goto fail;
     }
 
-    if (!sps->frame_mbs_only_flag)
-        sps->mb_aff = get_bits1(gb);
-    else
-        sps->mb_aff = 0;
-
     sps->direct_8x8_inference_flag = get_bits1(gb);
-    if (!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag) {
-        av_log(avctx, AV_LOG_ERROR,
-               "This stream was generated by a broken encoder, invalid 8x8 inference\n");
-        goto fail;
-    }
 
 #ifndef ALLOW_INTERLACE
     if (sps->mb_aff)
@@ -482,6 +513,8 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
         unsigned int crop_right  = get_ue_golomb(gb);
         unsigned int crop_top    = get_ue_golomb(gb);
         unsigned int crop_bottom = get_ue_golomb(gb);
+        int width  = 16 * sps->mb_width;
+        int height = 16 * sps->mb_height;
 
         if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
             av_log(avctx, AV_LOG_DEBUG, "discarding sps cropping, original "
@@ -499,16 +532,15 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
             int step_x = 1 << hsub;
             int step_y = (2 - sps->frame_mbs_only_flag) << vsub;
 
-            if (INT_MAX / step_x             <= crop_left               ||
-                INT_MAX / step_x - crop_left <= crop_right              ||
-                16 * sps->mb_width <= step_x * (crop_left + crop_right) ||
-                INT_MAX / step_y             <= crop_top                ||
-                INT_MAX / step_y - crop_top  <= crop_bottom             ||
-                16 * sps->mb_height <= step_y * (crop_top + crop_bottom)) {
-                av_log(avctx, AV_LOG_WARNING, "Invalid crop parameters\n");
-                if (avctx->err_recognition & AV_EF_EXPLODE)
-                    goto fail;
-                crop_left = crop_right = crop_top = crop_bottom = 0;
+            if (crop_left  > (unsigned)INT_MAX / 4 / step_x ||
+                crop_right > (unsigned)INT_MAX / 4 / step_x ||
+                crop_top   > (unsigned)INT_MAX / 4 / step_y ||
+                crop_bottom> (unsigned)INT_MAX / 4 / step_y ||
+                (crop_left + crop_right ) * step_x >= width ||
+                (crop_top  + crop_bottom) * step_y >= height
+            ) {
+                av_log(avctx, AV_LOG_ERROR, "crop values invalid %d %d %d %d / %d %d\n", crop_left, crop_right, crop_top, crop_bottom, width, height);
+                goto fail;
             }
 
             sps->crop_left   = crop_left   * step_x;
@@ -527,7 +559,14 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     sps->vui_parameters_present_flag = get_bits1(gb);
     if (sps->vui_parameters_present_flag) {
         int ret = decode_vui_parameters(gb, avctx, sps);
-        if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE)
+        if (ret < 0)
+            goto fail;
+    }
+
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, ignore_truncation ? AV_LOG_WARNING : AV_LOG_ERROR,
+               "Overread %s by %d bits\n", sps->vui_parameters_present_flag ? "VUI" : "SPS", -get_bits_left(gb));
+        if (!ignore_truncation)
             goto fail;
     }
 
@@ -551,7 +590,7 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     if (avctx->debug & FF_DEBUG_PICT_INFO) {
         static const char csp[4][5] = { "Gray", "420", "422", "444" };
         av_log(avctx, AV_LOG_DEBUG,
-               "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%u/%u/%u/%u %s %s %"PRId32"/%"PRId32"\n",
+               "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%u/%u/%u/%u %s %s %"PRId32"/%"PRId32" b%d reo:%d\n",
                sps_id, sps->profile_idc, sps->level_idc,
                sps->poc_type,
                sps->ref_frame_count,
@@ -563,7 +602,10 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                sps->vui_parameters_present_flag ? "VUI" : "",
                csp[sps->chroma_format_idc],
                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
-               sps->timing_info_present_flag ? sps->time_scale : 0);
+               sps->timing_info_present_flag ? sps->time_scale : 0,
+               sps->bit_depth_luma,
+               sps->bitstream_restriction_flag ? sps->num_reorder_frames : -1
+               );
     }
 
     /* check if this is a repeat of an already parsed SPS, then keep the
@@ -641,6 +683,8 @@ static void init_dequant_tables(PPS *pps, const SPS *sps)
 {
     int i, x;
     init_dequant4_coeff_table(pps, sps);
+    memset(pps->dequant8_coeff, 0, sizeof(pps->dequant8_coeff));
+
     if (pps->transform_8x8_mode)
         init_dequant8_coeff_table(pps, sps);
     if (sps->transform_bypass) {
@@ -663,11 +707,25 @@ static void build_qp_table(PPS *pps, int t, int index, const int depth)
             ff_h264_chroma_qp[depth - 8][av_clip(i + index, 0, max_qp)];
 }
 
+static int more_rbsp_data_in_pps(const SPS *sps, void *logctx)
+{
+    int profile_idc = sps->profile_idc;
+
+    if ((profile_idc == 66 || profile_idc == 77 ||
+         profile_idc == 88) && (sps->constraint_set_flags & 7)) {
+        av_log(logctx, AV_LOG_VERBOSE,
+               "Current profile doesn't provide more RBSP data in PPS, skipping\n");
+        return 0;
+    }
+
+    return 1;
+}
+
 int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                                          H264ParamSets *ps, int bit_length)
 {
     AVBufferRef *pps_buf;
-    SPS *sps;
+    const SPS *sps;
     unsigned int pps_id = get_ue_golomb(gb);
     PPS *pps;
     int qp_bd_offset;
@@ -684,6 +742,15 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
         return AVERROR(ENOMEM);
     pps = (PPS*)pps_buf->data;
 
+    pps->data_size = gb->buffer_end - gb->buffer;
+    if (pps->data_size > sizeof(pps->data)) {
+        av_log(avctx, AV_LOG_DEBUG, "Truncating likely oversized PPS "
+               "(%"SIZE_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+               pps->data_size, sizeof(pps->data));
+        pps->data_size = sizeof(pps->data);
+    }
+    memcpy(pps->data, gb->buffer, pps->data_size);
+
     pps->sps_id = get_ue_golomb_31(gb);
     if ((unsigned)pps->sps_id >= MAX_SPS_COUNT ||
         !ps->sps_list[pps->sps_id]) {
@@ -691,11 +758,17 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
-    sps = (SPS*)ps->sps_list[pps->sps_id]->data;
-
-    if (sps->bit_depth_luma > 10) {
-        avpriv_report_missing_feature(avctx, "Luma bit depth=%d (max=10)",
-                                      sps->bit_depth_luma);
+    sps = (const SPS*)ps->sps_list[pps->sps_id]->data;
+    if (sps->bit_depth_luma > 14) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid luma bit depth=%d\n",
+               sps->bit_depth_luma);
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    } else if (sps->bit_depth_luma == 11 || sps->bit_depth_luma == 13) {
+        avpriv_report_missing_feature(avctx,
+               "Unimplemented luma bit depth=%d",
+               sps->bit_depth_luma);
         ret = AVERROR_PATCHWELCOME;
         goto fail;
     }
@@ -719,9 +792,14 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
 
     pps->weighted_pred                        = get_bits1(gb);
     pps->weighted_bipred_idc                  = get_bits(gb, 2);
-    pps->init_qp                              = get_se_golomb(gb) + 26 + qp_bd_offset;
-    pps->init_qs                              = get_se_golomb(gb) + 26 + qp_bd_offset;
+    pps->init_qp                              = get_se_golomb(gb) + 26U + qp_bd_offset;
+    pps->init_qs                              = get_se_golomb(gb) + 26U + qp_bd_offset;
     pps->chroma_qp_index_offset[0]            = get_se_golomb(gb);
+    if (pps->chroma_qp_index_offset[0] < -12 || pps->chroma_qp_index_offset[0] > 12) {
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
     pps->deblocking_filter_parameters_present = get_bits1(gb);
     pps->constrained_intra_pred               = get_bits1(gb);
     pps->redundant_pic_cnt_present            = get_bits1(gb);
@@ -733,13 +811,18 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
            sizeof(pps->scaling_matrix8));
 
     bits_left = bit_length - get_bits_count(gb);
-    if (bits_left && (bits_left > 8 ||
-                      show_bits(gb, bits_left) != 1 << (bits_left - 1))) {
+    if (bits_left > 0 && more_rbsp_data_in_pps(sps, avctx)) {
         pps->transform_8x8_mode = get_bits1(gb);
-        decode_scaling_matrices(gb, sps, pps, 0,
+        ret = decode_scaling_matrices(gb, sps, pps, 0,
                                 pps->scaling_matrix4, pps->scaling_matrix8);
+        if (ret < 0)
+            goto fail;
         // second_chroma_qp_index_offset
         pps->chroma_qp_index_offset[1] = get_se_golomb(gb);
+        if (pps->chroma_qp_index_offset[1] < -12 || pps->chroma_qp_index_offset[1] > 12) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
     } else {
         pps->chroma_qp_index_offset[1] = pps->chroma_qp_index_offset[0];
     }
diff --git a/libavcodec/h264_ps.h b/libavcodec/h264_ps.h
index 9a32d93..e967b9c 100644
--- a/libavcodec/h264_ps.h
+++ b/libavcodec/h264_ps.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,7 @@
 
 #define MAX_SPS_COUNT          32
 #define MAX_PPS_COUNT         256
+#define MAX_LOG2_MAX_FRAME_NUM    (12 + 4)
 
 /**
  * Sequence parameter set
@@ -98,6 +99,8 @@ typedef struct SPS {
     int bit_depth_chroma;                 ///< bit_depth_chroma_minus8 + 8
     int residual_color_transform_flag;    ///< residual_colour_transform_flag
     int constraint_set_flags;             ///< constraint_set[0-3]_flag
+    uint8_t data[4096];
+    size_t data_size;
 } SPS;
 
 /**
@@ -121,8 +124,10 @@ typedef struct PPS {
     int transform_8x8_mode;         ///< transform_8x8_mode_flag
     uint8_t scaling_matrix4[6][16];
     uint8_t scaling_matrix8[6][64];
-    uint8_t chroma_qp_table[2][64]; ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
+    uint8_t chroma_qp_table[2][QP_MAX_NUM+1];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
     int chroma_qp_diff;
+    uint8_t data[4096];
+    size_t data_size;
 
     uint32_t dequant4_buffer[6][QP_MAX_NUM + 1][16];
     uint32_t dequant8_buffer[6][QP_MAX_NUM + 1][64];
@@ -134,17 +139,18 @@ typedef struct H264ParamSets {
     AVBufferRef *sps_list[MAX_SPS_COUNT];
     AVBufferRef *pps_list[MAX_PPS_COUNT];
 
+    AVBufferRef *pps_ref;
+    AVBufferRef *sps_ref;
     /* currently active parameters sets */
     const PPS *pps;
-    // FIXME this should properly be const
-    SPS *sps;
+    const SPS *sps;
 } H264ParamSets;
 
 /**
  * Decode SPS
  */
 int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
-                                     H264ParamSets *ps);
+                                     H264ParamSets *ps, int ignore_truncation);
 
 /**
  * Decode PPS
@@ -152,4 +158,9 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
 int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                                          H264ParamSets *ps, int bit_length);
 
+/**
+ * Uninit H264 param sets structure.
+ */
+void ff_h264_ps_uninit(H264ParamSets *ps);
+
 #endif /* AVCODEC_H264_PS_H */
diff --git a/libavcodec/h264_redundant_pps_bsf.c b/libavcodec/h264_redundant_pps_bsf.c
index d806427..db8717d 100644
--- a/libavcodec/h264_redundant_pps_bsf.c
+++ b/libavcodec/h264_redundant_pps_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -80,7 +80,7 @@ static int h264_redundant_pps_filter(AVBSFContext *bsf, AVPacket *out)
 
     err = ff_cbs_read_packet(ctx->input, au, in);
     if (err < 0)
-        return err;
+        goto fail;
 
     au_has_sps = 0;
     for (i = 0; i < au->nb_units; i++) {
@@ -89,11 +89,15 @@ static int h264_redundant_pps_filter(AVBSFContext *bsf, AVPacket *out)
         if (nal->type == H264_NAL_SPS)
             au_has_sps = 1;
         if (nal->type == H264_NAL_PPS) {
-            h264_redundant_pps_fixup_pps(ctx, nal->content);
+            err = h264_redundant_pps_fixup_pps(ctx, nal->content);
+            if (err < 0)
+                goto fail;
             if (!au_has_sps) {
-                av_log(ctx, AV_LOG_VERBOSE, "Deleting redundant PPS "
+                av_log(bsf, AV_LOG_VERBOSE, "Deleting redundant PPS "
                        "at %"PRId64".\n", in->pts);
-                ff_cbs_delete_unit(ctx->input, au, i);
+                err = ff_cbs_delete_unit(ctx->input, au, i);
+                if (err < 0)
+                    goto fail;
             }
         }
         if (nal->type == H264_NAL_SLICE ||
@@ -105,17 +109,21 @@ static int h264_redundant_pps_filter(AVBSFContext *bsf, AVPacket *out)
 
     err = ff_cbs_write_packet(ctx->output, out, au);
     if (err < 0)
-        return err;
+        goto fail;
 
-    ff_cbs_fragment_uninit(ctx->output, au);
 
     err = av_packet_copy_props(out, in);
     if (err < 0)
-        return err;
+        goto fail;
 
+    err = 0;
+fail:
+    ff_cbs_fragment_reset(ctx->output, au);
     av_packet_free(&in);
+    if (err < 0)
+        av_packet_unref(out);
 
-    return 0;
+    return err;
 }
 
 static int h264_redundant_pps_init(AVBSFContext *bsf)
@@ -138,25 +146,29 @@ static int h264_redundant_pps_init(AVBSFContext *bsf)
         err = ff_cbs_read_extradata(ctx->input, au, bsf->par_in);
         if (err < 0) {
             av_log(bsf, AV_LOG_ERROR, "Failed to read extradata.\n");
-            return err;
+            goto fail;
         }
 
         for (i = 0; i < au->nb_units; i++) {
-            if (au->units[i].type == H264_NAL_PPS)
-                h264_redundant_pps_fixup_pps(ctx, au->units[i].content);
+            if (au->units[i].type == H264_NAL_PPS) {
+                err = h264_redundant_pps_fixup_pps(ctx, au->units[i].content);
+                if (err < 0)
+                    goto fail;
+            }
         }
 
         ctx->extradata_pic_init_qp = ctx->current_pic_init_qp;
         err = ff_cbs_write_extradata(ctx->output, bsf->par_out, au);
         if (err < 0) {
             av_log(bsf, AV_LOG_ERROR, "Failed to write extradata.\n");
-            return err;
+            goto fail;
         }
-
-        ff_cbs_fragment_uninit(ctx->output, au);
     }
 
-    return 0;
+    err = 0;
+fail:
+    ff_cbs_fragment_reset(ctx->output, au);
+    return err;
 }
 
 static void h264_redundant_pps_flush(AVBSFContext *bsf)
@@ -168,6 +180,8 @@ static void h264_redundant_pps_flush(AVBSFContext *bsf)
 static void h264_redundant_pps_close(AVBSFContext *bsf)
 {
     H264RedundantPPSContext *ctx = bsf->priv_data;
+
+    ff_cbs_fragment_free(ctx->input, &ctx->access_unit);
     ff_cbs_close(&ctx->input);
     ff_cbs_close(&ctx->output);
 }
diff --git a/libavcodec/h264_refs.c b/libavcodec/h264_refs.c
index 9536c4b..eaf965e 100644
--- a/libavcodec/h264_refs.c
+++ b/libavcodec/h264_refs.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,11 +27,12 @@
 
 #include <inttypes.h>
 
-#include "golomb_legacy.h"
+#include "libavutil/avassert.h"
 #include "internal.h"
 #include "avcodec.h"
 #include "h264.h"
 #include "h264dec.h"
+#include "golomb.h"
 #include "mpegutils.h"
 
 #include <assert.h>
@@ -80,16 +81,18 @@ static int build_def_list(H264Ref *def, int def_len,
     int  i[2] = { 0 };
     int index = 0;
 
-    while ((i[0] < len || i[1] < len) && index < def_len) {
+    while (i[0] < len || i[1] < len) {
         while (i[0] < len && !(in[i[0]] && (in[i[0]]->reference & sel)))
             i[0]++;
         while (i[1] < len && !(in[i[1]] && (in[i[1]]->reference & (sel ^ 3))))
             i[1]++;
-        if (i[0] < len && index < def_len) {
+        if (i[0] < len) {
+            av_assert0(index < def_len);
             in[i[0]]->pic_id = is_long ? i[0] : in[i[0]]->frame_num;
             split_field_copy(&def[index++], in[i[0]++], sel, 1);
         }
-        if (i[1] < len && index < def_len) {
+        if (i[1] < len) {
+            av_assert0(index < def_len);
             in[i[1]]->pic_id = is_long ? i[1] : in[i[1]]->frame_num;
             split_field_copy(&def[index++], in[i[1]++], sel ^ 3, 0);
         }
@@ -121,9 +124,18 @@ static int add_sorted(H264Picture **sorted, H264Picture * const *src,
     return out_i;
 }
 
-static void h264_initialise_ref_list(const H264Context *h, H264SliceContext *sl)
+static int mismatches_ref(const H264Context *h, const H264Picture *pic)
+{
+    const AVFrame *f = pic->f;
+    return (h->cur_pic_ptr->f->width  != f->width ||
+            h->cur_pic_ptr->f->height != f->height ||
+            h->cur_pic_ptr->f->format != f->format);
+}
+
+static void h264_initialise_ref_list(H264Context *h, H264SliceContext *sl)
 {
     int i, len;
+    int j;
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         H264Picture *sorted[32];
@@ -138,13 +150,14 @@ static void h264_initialise_ref_list(const H264Context *h, H264SliceContext *sl)
         for (list = 0; list < 2; list++) {
             len  = add_sorted(sorted,       h->short_ref, h->short_ref_count, cur_poc, 1 ^ list);
             len += add_sorted(sorted + len, h->short_ref, h->short_ref_count, cur_poc, 0 ^ list);
-            assert(len <= 32);
+            av_assert0(len <= 32);
 
             len  = build_def_list(sl->ref_list[list], FF_ARRAY_ELEMS(sl->ref_list[0]),
                                   sorted, len, 0, h->picture_structure);
             len += build_def_list(sl->ref_list[list] + len,
                                   FF_ARRAY_ELEMS(sl->ref_list[0]) - len,
                                   h->long_ref, 16, 1, h->picture_structure);
+            av_assert0(len <= 32);
 
             if (len < sl->ref_count[list])
                 memset(&sl->ref_list[list][len], 0, sizeof(H264Ref) * (sl->ref_count[list] - len));
@@ -165,10 +178,40 @@ static void h264_initialise_ref_list(const H264Context *h, H264SliceContext *sl)
         len += build_def_list(sl->ref_list[0] + len,
                               FF_ARRAY_ELEMS(sl->ref_list[0]) - len,
                               h-> long_ref, 16, 1, h->picture_structure);
+        av_assert0(len <= 32);
 
         if (len < sl->ref_count[0])
             memset(&sl->ref_list[0][len], 0, sizeof(H264Ref) * (sl->ref_count[0] - len));
     }
+#ifdef TRACE
+    for (i = 0; i < sl->ref_count[0]; i++) {
+        ff_tlog(h->avctx, "List0: %s fn:%d 0x%p\n",
+                (sl->ref_list[0][i].parent ? (sl->ref_list[0][i].parent->long_ref ? "LT" : "ST") : "??"),
+                sl->ref_list[0][i].pic_id,
+                sl->ref_list[0][i].data[0]);
+    }
+    if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
+        for (i = 0; i < sl->ref_count[1]; i++) {
+            ff_tlog(h->avctx, "List1: %s fn:%d 0x%p\n",
+                    (sl->ref_list[1][i].parent ? (sl->ref_list[1][i].parent->long_ref ? "LT" : "ST") : "??"),
+                    sl->ref_list[1][i].pic_id,
+                    sl->ref_list[1][i].data[0]);
+        }
+    }
+#endif
+
+    for (j = 0; j<1+(sl->slice_type_nos == AV_PICTURE_TYPE_B); j++) {
+        for (i = 0; i < sl->ref_count[j]; i++) {
+            if (sl->ref_list[j][i].parent) {
+                if (mismatches_ref(h, sl->ref_list[j][i].parent)) {
+                    av_log(h->avctx, AV_LOG_ERROR, "Discarding mismatching reference\n");
+                    memset(&sl->ref_list[j][i], 0, sizeof(sl->ref_list[j][i]));
+                }
+            }
+        }
+    }
+    for (i = 0; i < sl->list_count; i++)
+        h->default_ref[i] = sl->ref_list[i][0];
 }
 
 /**
@@ -253,7 +296,7 @@ static void h264_fill_mbaff_ref_list(H264SliceContext *sl)
     }
 }
 
-int ff_h264_build_ref_list(const H264Context *h, H264SliceContext *sl)
+int ff_h264_build_ref_list(H264Context *h, H264SliceContext *sl)
 {
     int list, index, pic_structure;
 
@@ -310,7 +353,7 @@ int ff_h264_build_ref_list(const H264Context *h, H264SliceContext *sl)
 
                 long_idx = pic_num_extract(h, pic_id, &pic_structure);
 
-                if (long_idx > 31) {
+                if (long_idx > 31U) {
                     av_log(h->avctx, AV_LOG_ERROR,
                            "long_term_pic_idx overflow\n");
                     return AVERROR_INVALIDDATA;
@@ -326,6 +369,8 @@ int ff_h264_build_ref_list(const H264Context *h, H264SliceContext *sl)
                 }
                 break;
             }
+            default:
+                av_assert0(0);
             }
 
             if (i < 0) {
@@ -351,13 +396,19 @@ int ff_h264_build_ref_list(const H264Context *h, H264SliceContext *sl)
     }
     for (list = 0; list < sl->list_count; list++) {
         for (index = 0; index < sl->ref_count[list]; index++) {
-            if (!sl->ref_list[list][index].parent) {
-                av_log(h->avctx, AV_LOG_ERROR, "Missing reference picture\n");
-                if (index == 0 || h->avctx->err_recognition & AV_EF_EXPLODE)
-                    return AVERROR_INVALIDDATA;
+            if (   !sl->ref_list[list][index].parent
+                || (!FIELD_PICTURE(h) && (sl->ref_list[list][index].reference&3) != 3)) {
+                int i;
+                av_log(h->avctx, AV_LOG_ERROR, "Missing reference picture, default is %d\n", h->default_ref[list].poc);
+                for (i = 0; i < FF_ARRAY_ELEMS(h->last_pocs); i++)
+                    h->last_pocs[i] = INT_MIN;
+                if (h->default_ref[list].parent
+                    && !(!FIELD_PICTURE(h) && (h->default_ref[list].reference&3) != 3))
+                    sl->ref_list[list][index] = h->default_ref[list];
                 else
-                    sl->ref_list[list][index] = sl->ref_list[list][index - 1];
+                    return -1;
             }
+            av_assert0(av_buffer_get_ref_count(sl->ref_list[list][index].parent->f->buf[0]) > 0);
         }
     }
 
@@ -393,7 +444,7 @@ int ff_h264_decode_ref_pic_list_reordering(H264SliceContext *sl, void *logctx)
                        op);
                 return AVERROR_INVALIDDATA;
             }
-            sl->ref_modifications[list][index].val = get_ue_golomb(&sl->gb);
+            sl->ref_modifications[list][index].val = get_ue_golomb_long(&sl->gb);
             sl->ref_modifications[list][index].op  = op;
             sl->nb_ref_modifications[list]++;
         }
@@ -518,11 +569,18 @@ void ff_h264_remove_all_refs(H264Context *h)
     }
     assert(h->long_ref_count == 0);
 
+    if (h->short_ref_count && !h->last_pic_for_ec.f->data[0]) {
+        ff_h264_unref_picture(h, &h->last_pic_for_ec);
+        ff_h264_ref_picture(h, &h->last_pic_for_ec, h->short_ref[0]);
+    }
+
     for (i = 0; i < h->short_ref_count; i++) {
         unreference_pic(h, h->short_ref[i], 0);
         h->short_ref[i] = NULL;
     }
     h->short_ref_count = 0;
+
+    memset(h->default_ref, 0, sizeof(h->default_ref));
 }
 
 static void generate_sliding_window_mmcos(H264Context *h)
@@ -530,10 +588,8 @@ static void generate_sliding_window_mmcos(H264Context *h)
     MMCO *mmco = h->mmco;
     int nb_mmco = 0;
 
-    assert(h->long_ref_count + h->short_ref_count <= h->ps.sps->ref_frame_count);
-
     if (h->short_ref_count &&
-        h->long_ref_count + h->short_ref_count == h->ps.sps->ref_frame_count &&
+        h->long_ref_count + h->short_ref_count >= h->ps.sps->ref_frame_count &&
         !(FIELD_PICTURE(h) && !h->first_field && h->cur_pic_ptr->reference)) {
         mmco[0].opcode        = MMCO_SHORT2UNUSED;
         mmco[0].short_pic_num = h->short_ref[h->short_ref_count - 1]->frame_num;
@@ -554,6 +610,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
     MMCO *mmco = h->mmco;
     int mmco_count;
     int i, av_uninit(j);
+    int pps_ref_count[2] = {0};
     int current_ref_assigned = 0, err = 0;
     H264Picture *av_uninit(pic);
 
@@ -584,7 +641,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
                 if (mmco[i].opcode != MMCO_SHORT2LONG ||
                     !h->long_ref[mmco[i].long_arg]    ||
                     h->long_ref[mmco[i].long_arg]->frame_num != frame_num) {
-                    av_log(h->avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
+                    av_log(h->avctx, h->short_ref_count ? AV_LOG_ERROR : AV_LOG_DEBUG, "mmco: unref short failure\n");
                     err = AVERROR_INVALIDDATA;
                 }
                 continue;
@@ -625,19 +682,24 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
                      * Report the problem and keep the pair where it is,
                      * and mark this field valid.
                      */
-            if (h->short_ref[0] == h->cur_pic_ptr)
+            if (h->short_ref[0] == h->cur_pic_ptr) {
+                av_log(h->avctx, AV_LOG_ERROR, "mmco: cannot assign current picture to short and long at the same time\n");
                 remove_short_at_index(h, 0);
+            }
 
             /* make sure the current picture is not already assigned as a long ref */
             if (h->cur_pic_ptr->long_ref) {
                 for (j = 0; j < FF_ARRAY_ELEMS(h->long_ref); j++) {
-                    if (h->long_ref[j] == h->cur_pic_ptr)
+                    if (h->long_ref[j] == h->cur_pic_ptr) {
+                        if (j != mmco[i].long_arg)
+                            av_log(h->avctx, AV_LOG_ERROR, "mmco: cannot assign current picture to 2 long term references\n");
                         remove_long(h, j, 0);
+                    }
                 }
             }
 
-
             if (h->long_ref[mmco[i].long_arg] != h->cur_pic_ptr) {
+                av_assert0(!h->cur_pic_ptr->long_ref);
                 remove_long(h, mmco[i].long_arg, 0);
 
                 h->long_ref[mmco[i].long_arg]           = h->cur_pic_ptr;
@@ -665,6 +727,8 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
             h->poc.frame_num = h->cur_pic_ptr->frame_num = 0;
             h->mmco_reset = 1;
             h->cur_pic_ptr->mmco_reset = 1;
+            for (j = 0; j < MAX_DELAYED_PIC_COUNT; j++)
+                h->last_pocs[j] = INT_MIN;
             break;
         default: assert(0);
         }
@@ -679,7 +743,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
          */
         if (h->short_ref_count && h->short_ref[0] == h->cur_pic_ptr) {
             /* Just mark the second field valid */
-            h->cur_pic_ptr->reference = PICT_FRAME;
+            h->cur_pic_ptr->reference |= h->picture_structure;
         } else if (h->cur_pic_ptr->long_ref) {
             av_log(h->avctx, AV_LOG_ERROR, "illegal short term reference "
                                            "assignment for second field "
@@ -703,8 +767,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
         }
     }
 
-    if (h->long_ref_count + h->short_ref_count -
-        (h->short_ref[0] == h->cur_pic_ptr) > h->ps.sps->ref_frame_count) {
+    if (h->long_ref_count + h->short_ref_count > FFMAX(h->ps.sps->ref_frame_count, 1)) {
 
         /* We have too many reference frames, probably due to corrupted
          * stream. Need to discard one frame. Prevents overrun of the
@@ -729,8 +792,39 @@ int ff_h264_execute_ref_pic_marking(H264Context *h)
         }
     }
 
+    for (i = 0; i<h->short_ref_count; i++) {
+        pic = h->short_ref[i];
+        if (pic->invalid_gap) {
+            int d = av_mod_uintp2(h->cur_pic_ptr->frame_num - pic->frame_num, h->ps.sps->log2_max_frame_num);
+            if (d > h->ps.sps->ref_frame_count)
+                remove_short(h, pic->frame_num, 0);
+        }
+    }
+
     print_short_term(h);
     print_long_term(h);
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.pps_list); i++) {
+        if (h->ps.pps_list[i]) {
+            const PPS *pps = (const PPS *)h->ps.pps_list[i]->data;
+            pps_ref_count[0] = FFMAX(pps_ref_count[0], pps->ref_count[0]);
+            pps_ref_count[1] = FFMAX(pps_ref_count[1], pps->ref_count[1]);
+        }
+    }
+
+    // Detect unmarked random access points
+    if (   err >= 0
+        && h->long_ref_count==0
+        && (   h->short_ref_count<=2
+            || pps_ref_count[0] <= 2 && pps_ref_count[1] <= 1 && h->avctx->has_b_frames
+            || pps_ref_count[0] <= 1 + (h->picture_structure != PICT_FRAME) && pps_ref_count[1] <= 1)
+        && pps_ref_count[0]<=2 + (h->picture_structure != PICT_FRAME) + (2*!h->has_recovery_point)
+        && h->cur_pic_ptr->f->pict_type == AV_PICTURE_TYPE_I){
+        h->cur_pic_ptr->recovered |= 1;
+        if(!h->avctx->has_b_frames)
+            h->frame_recovered |= FRAME_RECOVERED_SEI;
+    }
+
 out:
     return (h->avctx->err_recognition & AV_EF_EXPLODE) ? err : 0;
 }
@@ -759,7 +853,7 @@ int ff_h264_decode_ref_pic_marking(H264SliceContext *sl, GetBitContext *gb,
                 mmco[i].opcode = opcode;
                 if (opcode == MMCO_SHORT2UNUSED || opcode == MMCO_SHORT2LONG) {
                     mmco[i].short_pic_num =
-                        (sl->curr_pic_num - get_ue_golomb(gb) - 1) &
+                        (sl->curr_pic_num - get_ue_golomb_long(gb) - 1) &
                             (sl->max_pic_num - 1);
                 }
                 if (opcode == MMCO_SHORT2LONG || opcode == MMCO_LONG2UNUSED ||
diff --git a/libavcodec/h264_sei.c b/libavcodec/h264_sei.c
index da5d33c..d4eb9c0 100644
--- a/libavcodec/h264_sei.c
+++ b/libavcodec/h264_sei.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... SEI decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,18 +27,19 @@
 
 #include "avcodec.h"
 #include "get_bits.h"
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "h264_ps.h"
 #include "h264_sei.h"
 #include "internal.h"
 
+#define AVERROR_PS_NOT_FOUND      FFERRTAG(0xF8,'?','P','S')
+
 static const uint8_t sei_num_clock_ts_table[9] = {
     1, 1, 1, 2, 2, 3, 3, 2, 3
 };
 
 void ff_h264_sei_uninit(H264SEIContext *h)
 {
-    h->unregistered.x264_build           = -1;
     h->recovery_point.recovery_frame_cnt = -1;
 
     h->picture_timing.dpb_output_delay  = 0;
@@ -50,20 +51,28 @@ void ff_h264_sei_uninit(H264SEIContext *h)
     h->display_orientation.present = 0;
     h->afd.present                 =  0;
 
-    h->a53_caption.a53_caption_size = 0;
-    av_freep(&h->a53_caption.a53_caption);
+    av_buffer_unref(&h->a53_caption.buf_ref);
 }
 
 static int decode_picture_timing(H264SEIPictureTiming *h, GetBitContext *gb,
-                                 const SPS *sps, void *logctx)
+                                 const H264ParamSets *ps, void *logctx)
 {
-    if (!sps)
-        return AVERROR_INVALIDDATA;
+    int i;
+    const SPS *sps = ps->sps;
+
+    for (i = 0; i<MAX_SPS_COUNT; i++)
+        if ((!sps || !sps->log2_max_frame_num) && ps->sps_list[i])
+            sps = (const SPS *)ps->sps_list[i]->data;
+
+    if (!sps) {
+        av_log(logctx, AV_LOG_ERROR, "SPS unavailable in decode_picture_timing\n");
+        return AVERROR_PS_NOT_FOUND;
+    }
 
     if (sps->nal_hrd_parameters_present_flag ||
         sps->vcl_hrd_parameters_present_flag) {
-        h->cpb_removal_delay = get_bits(gb, sps->cpb_removal_delay_length);
-        h->dpb_output_delay  = get_bits(gb, sps->dpb_output_delay_length);
+        h->cpb_removal_delay = get_bits_long(gb, sps->cpb_removal_delay_length);
+        h->dpb_output_delay  = get_bits_long(gb, sps->dpb_output_delay_length);
     }
     if (sps->pic_struct_present_flag) {
         unsigned int i, num_clock_ts;
@@ -75,32 +84,38 @@ static int decode_picture_timing(H264SEIPictureTiming *h, GetBitContext *gb,
             return AVERROR_INVALIDDATA;
 
         num_clock_ts = sei_num_clock_ts_table[h->pic_struct];
-
+        h->timecode_cnt = 0;
         for (i = 0; i < num_clock_ts; i++) {
-            if (get_bits(gb, 1)) {                /* clock_timestamp_flag */
+            if (get_bits(gb, 1)) {                      /* clock_timestamp_flag */
+                H264SEITimeCode *tc = &h->timecode[h->timecode_cnt++];
                 unsigned int full_timestamp_flag;
-
+                unsigned int counting_type, cnt_dropped_flag;
                 h->ct_type |= 1 << get_bits(gb, 2);
-                skip_bits(gb, 1);                 /* nuit_field_based_flag */
-                skip_bits(gb, 5);                 /* counting_type */
+                skip_bits(gb, 1);                       /* nuit_field_based_flag */
+                counting_type = get_bits(gb, 5);        /* counting_type */
                 full_timestamp_flag = get_bits(gb, 1);
-                skip_bits(gb, 1);                 /* discontinuity_flag */
-                skip_bits(gb, 1);                 /* cnt_dropped_flag */
-                skip_bits(gb, 8);                 /* n_frames */
+                skip_bits(gb, 1);                       /* discontinuity_flag */
+                cnt_dropped_flag = get_bits(gb, 1);      /* cnt_dropped_flag */
+                if (cnt_dropped_flag && counting_type > 1 && counting_type < 7)
+                    tc->dropframe = 1;
+                tc->frame = get_bits(gb, 8);         /* n_frames */
                 if (full_timestamp_flag) {
-                    skip_bits(gb, 6);             /* seconds_value 0..59 */
-                    skip_bits(gb, 6);             /* minutes_value 0..59 */
-                    skip_bits(gb, 5);             /* hours_value 0..23 */
+                    tc->full = 1;
+                    tc->seconds = get_bits(gb, 6); /* seconds_value 0..59 */
+                    tc->minutes = get_bits(gb, 6); /* minutes_value 0..59 */
+                    tc->hours = get_bits(gb, 5);   /* hours_value 0..23 */
                 } else {
-                    if (get_bits(gb, 1)) {        /* seconds_flag */
-                        skip_bits(gb, 6);         /* seconds_value range 0..59 */
-                        if (get_bits(gb, 1)) {    /* minutes_flag */
-                            skip_bits(gb, 6);     /* minutes_value 0..59 */
-                            if (get_bits(gb, 1))  /* hours_flag */
-                                skip_bits(gb, 5); /* hours_value 0..23 */
+                    tc->seconds = tc->minutes = tc->hours = tc->full = 0;
+                    if (get_bits(gb, 1)) {             /* seconds_flag */
+                        tc->seconds = get_bits(gb, 6);
+                        if (get_bits(gb, 1)) {         /* minutes_flag */
+                            tc->minutes = get_bits(gb, 6);
+                            if (get_bits(gb, 1))       /* hours_flag */
+                                tc->hours = get_bits(gb, 5);
                         }
                     }
                 }
+
                 if (sps->time_offset_length > 0)
                     skip_bits(gb,
                               sps->time_offset_length); /* time_offset */
@@ -159,7 +174,8 @@ static int decode_registered_user_data_closed_caption(H264SEIA53Caption *h,
             size -= 2;
 
             if (cc_count && size >= cc_count * 3) {
-                const uint64_t new_size = (h->a53_caption_size + cc_count
+                int old_size = h->buf_ref ? h->buf_ref->size : 0;
+                const uint64_t new_size = (old_size + cc_count
                                            * UINT64_C(3));
                 int i, ret;
 
@@ -167,14 +183,15 @@ static int decode_registered_user_data_closed_caption(H264SEIA53Caption *h,
                     return AVERROR(EINVAL);
 
                 /* Allow merging of the cc data from two fields. */
-                ret = av_reallocp(&h->a53_caption, new_size);
+                ret = av_buffer_realloc(&h->buf_ref, new_size);
                 if (ret < 0)
                     return ret;
 
+                /* Use of av_buffer_realloc assumes buffer is writeable */
                 for (i = 0; i < cc_count; i++) {
-                    h->a53_caption[h->a53_caption_size++] = get_bits(gb, 8);
-                    h->a53_caption[h->a53_caption_size++] = get_bits(gb, 8);
-                    h->a53_caption[h->a53_caption_size++] = get_bits(gb, 8);
+                    h->buf_ref->data[old_size++] = get_bits(gb, 8);
+                    h->buf_ref->data[old_size++] = get_bits(gb, 8);
+                    h->buf_ref->data[old_size++] = get_bits(gb, 8);
                 }
 
                 skip_bits(gb, 8);   // marker_bits
@@ -182,8 +199,6 @@ static int decode_registered_user_data_closed_caption(H264SEIA53Caption *h,
         }
     } else {
         int i;
-        avpriv_request_sample(logctx, "Subtitles with data type 0x%02x",
-                              user_data_type_code);
         for (i = 0; i < size - 1; i++)
             skip_bits(gb, 8);
     }
@@ -246,18 +261,23 @@ static int decode_unregistered_user_data(H264SEIUnregistered *h, GetBitContext *
     e = sscanf(user_data + 16, "x264 - core %d", &build);
     if (e == 1 && build > 0)
         h->x264_build = build;
-
-    if (strlen(user_data + 16) > 0)
-        av_log(logctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data + 16);
+    if (e == 1 && build == 1 && !strncmp(user_data+16, "x264 - core 0000", 16))
+        h->x264_build = 67;
 
     av_free(user_data);
     return 0;
 }
 
-static int decode_recovery_point(H264SEIRecoveryPoint *h, GetBitContext *gb)
+static int decode_recovery_point(H264SEIRecoveryPoint *h, GetBitContext *gb, void *logctx)
 {
-    h->recovery_frame_cnt = get_ue_golomb(gb);
+    unsigned recovery_frame_cnt = get_ue_golomb_long(gb);
 
+    if (recovery_frame_cnt >= (1<<MAX_LOG2_MAX_FRAME_NUM)) {
+        av_log(logctx, AV_LOG_ERROR, "recovery_frame_cnt %u is out of range\n", recovery_frame_cnt);
+        return AVERROR_INVALIDDATA;
+    }
+
+    h->recovery_frame_cnt = recovery_frame_cnt;
     /* 1b exact_match_flag,
      * 1b broken_link_flag,
      * 2b changing_slice_group_idc */
@@ -271,21 +291,21 @@ static int decode_buffering_period(H264SEIBufferingPeriod *h, GetBitContext *gb,
 {
     unsigned int sps_id;
     int sched_sel_idx;
-    SPS *sps;
+    const SPS *sps;
 
     sps_id = get_ue_golomb_31(gb);
     if (sps_id > 31 || !ps->sps_list[sps_id]) {
         av_log(logctx, AV_LOG_ERROR,
                "non-existing SPS %d referenced in buffering period\n", sps_id);
-        return AVERROR_INVALIDDATA;
+        return sps_id > 31 ? AVERROR_INVALIDDATA : AVERROR_PS_NOT_FOUND;
     }
-    sps = (SPS*)ps->sps_list[sps_id]->data;
+    sps = (const SPS*)ps->sps_list[sps_id]->data;
 
     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
     if (sps->nal_hrd_parameters_present_flag) {
         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
             h->initial_cpb_removal_delay[sched_sel_idx] =
-                get_bits(gb, sps->initial_cpb_removal_delay_length);
+                get_bits_long(gb, sps->initial_cpb_removal_delay_length);
             // initial_cpb_removal_delay_offset
             skip_bits(gb, sps->initial_cpb_removal_delay_length);
         }
@@ -293,7 +313,7 @@ static int decode_buffering_period(H264SEIBufferingPeriod *h, GetBitContext *gb,
     if (sps->vcl_hrd_parameters_present_flag) {
         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
             h->initial_cpb_removal_delay[sched_sel_idx] =
-                get_bits(gb, sps->initial_cpb_removal_delay_length);
+                get_bits_long(gb, sps->initial_cpb_removal_delay_length);
             // initial_cpb_removal_delay_offset
             skip_bits(gb, sps->initial_cpb_removal_delay_length);
         }
@@ -306,12 +326,13 @@ static int decode_buffering_period(H264SEIBufferingPeriod *h, GetBitContext *gb,
 static int decode_frame_packing_arrangement(H264SEIFramePacking *h,
                                             GetBitContext *gb)
 {
-    get_ue_golomb(gb);              // frame_packing_arrangement_id
-    h->present = !get_bits1(gb);
+    h->arrangement_id          = get_ue_golomb_long(gb);
+    h->arrangement_cancel_flag = get_bits1(gb);
+    h->present = !h->arrangement_cancel_flag;
 
     if (h->present) {
         h->arrangement_type = get_bits(gb, 7);
-        h->quincunx_subsampling           = get_bits1(gb);
+        h->quincunx_sampling_flag         = get_bits1(gb);
         h->content_interpretation_type    = get_bits(gb, 6);
 
         // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
@@ -320,10 +341,10 @@ static int decode_frame_packing_arrangement(H264SEIFramePacking *h,
         // frame0_self_contained_flag, frame1_self_contained_flag
         skip_bits(gb, 2);
 
-        if (!h->quincunx_subsampling && h->arrangement_type != 5)
+        if (!h->quincunx_sampling_flag && h->arrangement_type != 5)
             skip_bits(gb, 16);      // frame[01]_grid_position_[xy]
         skip_bits(gb, 8);           // frame_packing_arrangement_reserved_byte
-        get_ue_golomb(gb);          // frame_packing_arrangement_repetition_period
+        h->arrangement_repetition_period = get_ue_golomb_long(gb);
     }
     skip_bits1(gb);                 // frame_packing_arrangement_extension_flag
 
@@ -340,8 +361,33 @@ static int decode_display_orientation(H264SEIDisplayOrientation *h,
         h->vflip = get_bits1(gb);     // ver_flip
 
         h->anticlockwise_rotation = get_bits(gb, 16);
-        get_ue_golomb(gb);  // display_orientation_repetition_period
-        skip_bits1(gb);     // display_orientation_extension_flag
+        get_ue_golomb_long(gb);       // display_orientation_repetition_period
+        skip_bits1(gb);               // display_orientation_extension_flag
+    }
+
+    return 0;
+}
+
+static int decode_green_metadata(H264SEIGreenMetaData *h, GetBitContext *gb)
+{
+    h->green_metadata_type = get_bits(gb, 8);
+
+    if (h->green_metadata_type == 0) {
+        h->period_type = get_bits(gb, 8);
+
+        if (h->period_type == 2)
+            h->num_seconds = get_bits(gb, 16);
+        else if (h->period_type == 3)
+            h->num_pictures = get_bits(gb, 16);
+
+        h->percent_non_zero_macroblocks            = get_bits(gb, 8);
+        h->percent_intra_coded_macroblocks         = get_bits(gb, 8);
+        h->percent_six_tap_filtering               = get_bits(gb, 8);
+        h->percent_alpha_point_deblocking_instance = get_bits(gb, 8);
+
+    } else if (h->green_metadata_type == 1) {
+        h->xsd_metric_type  = get_bits(gb, 8);
+        h->xsd_metric_value = get_bits(gb, 16);
     }
 
     return 0;
@@ -358,34 +404,36 @@ static int decode_alternative_transfer(H264SEIAlternativeTransfer *h,
 int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
                        const H264ParamSets *ps, void *logctx)
 {
-    while (get_bits_left(gb) > 16) {
-        int size = 0;
+    int master_ret = 0;
+
+    while (get_bits_left(gb) > 16 && show_bits(gb, 16)) {
         int type = 0;
+        unsigned size = 0;
+        unsigned next;
         int ret  = 0;
-        int last = 0;
 
-        while (get_bits_left(gb) >= 8 &&
-               (last = get_bits(gb, 8)) == 255) {
-            type += 255;
-        }
-        type += last;
+        do {
+            if (get_bits_left(gb) < 8)
+                return AVERROR_INVALIDDATA;
+            type += show_bits(gb, 8);
+        } while (get_bits(gb, 8) == 255);
 
-        last = 0;
-        while (get_bits_left(gb) >= 8 &&
-               (last = get_bits(gb, 8)) == 255) {
-            size += 255;
-        }
-        size += last;
+        do {
+            if (get_bits_left(gb) < 8)
+                return AVERROR_INVALIDDATA;
+            size += show_bits(gb, 8);
+        } while (get_bits(gb, 8) == 255);
 
         if (size > get_bits_left(gb) / 8) {
-            av_log(logctx, AV_LOG_ERROR, "SEI type %d truncated at %d\n",
-                   type, get_bits_left(gb));
+            av_log(logctx, AV_LOG_ERROR, "SEI type %d size %d truncated at %d\n",
+                   type, 8*size, get_bits_left(gb));
             return AVERROR_INVALIDDATA;
         }
+        next = get_bits_count(gb) + 8 * size;
 
         switch (type) {
         case H264_SEI_TYPE_PIC_TIMING: // Picture timing SEI
-            ret = decode_picture_timing(&h->picture_timing, gb, ps->sps, logctx);
+            ret = decode_picture_timing(&h->picture_timing, gb, ps, logctx);
             break;
         case H264_SEI_TYPE_USER_DATA_REGISTERED:
             ret = decode_registered_user_data(h, gb, logctx, size);
@@ -394,7 +442,7 @@ int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
             ret = decode_unregistered_user_data(&h->unregistered, gb, logctx, size);
             break;
         case H264_SEI_TYPE_RECOVERY_POINT:
-            ret = decode_recovery_point(&h->recovery_point, gb);
+            ret = decode_recovery_point(&h->recovery_point, gb, logctx);
             break;
         case H264_SEI_TYPE_BUFFERING_PERIOD:
             ret = decode_buffering_period(&h->buffering_period, gb, ps, logctx);
@@ -405,19 +453,70 @@ int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
         case H264_SEI_TYPE_DISPLAY_ORIENTATION:
             ret = decode_display_orientation(&h->display_orientation, gb);
             break;
+        case H264_SEI_TYPE_GREEN_METADATA:
+            ret = decode_green_metadata(&h->green_metadata, gb);
+            break;
         case H264_SEI_TYPE_ALTERNATIVE_TRANSFER:
             ret = decode_alternative_transfer(&h->alternative_transfer, gb);
             break;
         default:
             av_log(logctx, AV_LOG_DEBUG, "unknown SEI type %d\n", type);
-            skip_bits(gb, 8 * size);
         }
-        if (ret < 0)
+        if (ret < 0 && ret != AVERROR_PS_NOT_FOUND)
             return ret;
+        if (ret < 0)
+            master_ret = ret;
+
+        skip_bits_long(gb, next - get_bits_count(gb));
 
         // FIXME check bits here
         align_get_bits(gb);
     }
 
-    return 0;
+    return master_ret;
+}
+
+const char *ff_h264_sei_stereo_mode(const H264SEIFramePacking *h)
+{
+    if (h->arrangement_cancel_flag == 0) {
+        switch (h->arrangement_type) {
+            case H264_SEI_FPA_TYPE_CHECKERBOARD:
+                if (h->content_interpretation_type == 2)
+                    return "checkerboard_rl";
+                else
+                    return "checkerboard_lr";
+            case H264_SEI_FPA_TYPE_INTERLEAVE_COLUMN:
+                if (h->content_interpretation_type == 2)
+                    return "col_interleaved_rl";
+                else
+                    return "col_interleaved_lr";
+            case H264_SEI_FPA_TYPE_INTERLEAVE_ROW:
+                if (h->content_interpretation_type == 2)
+                    return "row_interleaved_rl";
+                else
+                    return "row_interleaved_lr";
+            case H264_SEI_FPA_TYPE_SIDE_BY_SIDE:
+                if (h->content_interpretation_type == 2)
+                    return "right_left";
+                else
+                    return "left_right";
+            case H264_SEI_FPA_TYPE_TOP_BOTTOM:
+                if (h->content_interpretation_type == 2)
+                    return "bottom_top";
+                else
+                    return "top_bottom";
+            case H264_SEI_FPA_TYPE_INTERLEAVE_TEMPORAL:
+                if (h->content_interpretation_type == 2)
+                    return "block_rl";
+                else
+                    return "block_lr";
+            case H264_SEI_FPA_TYPE_2D:
+            default:
+                return "mono";
+        }
+    } else if (h->arrangement_cancel_flag == 1) {
+        return "mono";
+    } else {
+        return NULL;
+    }
 }
diff --git a/libavcodec/h264_sei.h b/libavcodec/h264_sei.h
index c3a19dd..a75c3aa 100644
--- a/libavcodec/h264_sei.h
+++ b/libavcodec/h264_sei.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,15 @@
 typedef enum {
     H264_SEI_TYPE_BUFFERING_PERIOD       = 0,   ///< buffering period (H.264, D.1.1)
     H264_SEI_TYPE_PIC_TIMING             = 1,   ///< picture timing
+    H264_SEI_TYPE_PAN_SCAN_RECT          = 2,   ///< pan-scan rectangle
     H264_SEI_TYPE_FILLER_PAYLOAD         = 3,   ///< filler data
     H264_SEI_TYPE_USER_DATA_REGISTERED   = 4,   ///< registered user data as specified by Rec. ITU-T T.35
     H264_SEI_TYPE_USER_DATA_UNREGISTERED = 5,   ///< unregistered user data
     H264_SEI_TYPE_RECOVERY_POINT         = 6,   ///< recovery point (frame # to decoder sync)
     H264_SEI_TYPE_FRAME_PACKING          = 45,  ///< frame packing arrangement
     H264_SEI_TYPE_DISPLAY_ORIENTATION    = 47,  ///< display orientation
+    H264_SEI_TYPE_GREEN_METADATA         = 56,  ///< GreenMPEG information
+    H264_SEI_TYPE_MASTERING_DISPLAY_COLOUR_VOLUME = 137,  ///< mastering display properties
     H264_SEI_TYPE_ALTERNATIVE_TRANSFER   = 147, ///< alternative transfer
 } H264_SEI_Type;
 
@@ -51,6 +54,30 @@ typedef enum {
     H264_SEI_PIC_STRUCT_FRAME_TRIPLING    = 8  ///<  8: %frame tripling
 } H264_SEI_PicStructType;
 
+/**
+ * frame_packing_arrangement types
+ */
+typedef enum {
+    H264_SEI_FPA_TYPE_CHECKERBOARD        = 0,
+    H264_SEI_FPA_TYPE_INTERLEAVE_COLUMN   = 1,
+    H264_SEI_FPA_TYPE_INTERLEAVE_ROW      = 2,
+    H264_SEI_FPA_TYPE_SIDE_BY_SIDE        = 3,
+    H264_SEI_FPA_TYPE_TOP_BOTTOM          = 4,
+    H264_SEI_FPA_TYPE_INTERLEAVE_TEMPORAL = 5,
+    H264_SEI_FPA_TYPE_2D                  = 6,
+} H264_SEI_FpaType;
+
+typedef struct H264SEITimeCode {
+    /* When not continuously receiving full timecodes, we have to reference
+       the previous timecode received */
+    int full;
+    int frame;
+    int seconds;
+    int minutes;
+    int hours;
+    int dropframe;
+} H264SEITimeCode;
+
 typedef struct H264SEIPictureTiming {
     int present;
     H264_SEI_PicStructType pic_struct;
@@ -71,6 +98,16 @@ typedef struct H264SEIPictureTiming {
      * cpb_removal_delay in picture timing SEI message, see H.264 C.1.2
      */
     int cpb_removal_delay;
+
+    /**
+     * Maximum three timecodes in a pic_timing SEI.
+     */
+    H264SEITimeCode timecode[3];
+
+    /**
+     * Number of timecode in use
+     */
+    int timecode_cnt;
 } H264SEIPictureTiming;
 
 typedef struct H264SEIAFD {
@@ -79,8 +116,7 @@ typedef struct H264SEIAFD {
 } H264SEIAFD;
 
 typedef struct H264SEIA53Caption {
-    int a53_caption_size;
-    uint8_t *a53_caption;
+    AVBufferRef *buf_ref;
 } H264SEIA53Caption;
 
 typedef struct H264SEIUnregistered {
@@ -105,9 +141,12 @@ typedef struct H264SEIBufferingPeriod {
 
 typedef struct H264SEIFramePacking {
     int present;
-    int arrangement_type;
+    int arrangement_id;
+    int arrangement_cancel_flag;  ///< is previous arrangement canceled, -1 if never received
+    H264_SEI_FpaType arrangement_type;
+    int arrangement_repetition_period;
     int content_interpretation_type;
-    int quincunx_subsampling;
+    int quincunx_sampling_flag;
     int current_frame_is_frame0_flag;
 } H264SEIFramePacking;
 
@@ -117,6 +156,19 @@ typedef struct H264SEIDisplayOrientation {
     int hflip, vflip;
 } H264SEIDisplayOrientation;
 
+typedef struct H264SEIGreenMetaData {
+    uint8_t green_metadata_type;
+    uint8_t period_type;
+    uint16_t num_seconds;
+    uint16_t num_pictures;
+    uint8_t percent_non_zero_macroblocks;
+    uint8_t percent_intra_coded_macroblocks;
+    uint8_t percent_six_tap_filtering;
+    uint8_t percent_alpha_point_deblocking_instance;
+    uint8_t xsd_metric_type;
+    uint16_t xsd_metric_value;
+} H264SEIGreenMetaData;
+
 typedef struct H264SEIAlternativeTransfer {
     int present;
     int preferred_transfer_characteristics;
@@ -131,6 +183,7 @@ typedef struct H264SEIContext {
     H264SEIBufferingPeriod buffering_period;
     H264SEIFramePacking frame_packing;
     H264SEIDisplayOrientation display_orientation;
+    H264SEIGreenMetaData green_metadata;
     H264SEIAlternativeTransfer alternative_transfer;
 } H264SEIContext;
 
@@ -144,4 +197,9 @@ int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
  */
 void ff_h264_sei_uninit(H264SEIContext *h);
 
+/**
+ * Get stereo_mode string from the h264 frame_packing_arrangement
+ */
+const char *ff_h264_sei_stereo_mode(const H264SEIFramePacking *h);
+
 #endif /* AVCODEC_H264_SEI_H */
diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
index 1b968eb..1c9a270 100644
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,6 @@
 #include "cabac.h"
 #include "cabac_functions.h"
 #include "error_resilience.h"
-#include "golomb_legacy.h"
 #include "avcodec.h"
 #include "h264.h"
 #include "h264dec.h"
@@ -42,19 +41,21 @@
 #include "h264chroma.h"
 #include "h264_mvpred.h"
 #include "h264_ps.h"
+#include "golomb.h"
 #include "mathops.h"
 #include "mpegutils.h"
+#include "mpegvideo.h"
 #include "rectangle.h"
 #include "thread.h"
 
-static const uint8_t field_scan[16] = {
+static const uint8_t field_scan[16+1] = {
     0 + 0 * 4, 0 + 1 * 4, 1 + 0 * 4, 0 + 2 * 4,
     0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4,
     2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4,
     3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4,
 };
 
-static const uint8_t field_scan8x8[64] = {
+static const uint8_t field_scan8x8[64+1] = {
     0 + 0 * 8, 0 + 1 * 8, 0 + 2 * 8, 1 + 0 * 8,
     1 + 1 * 8, 0 + 3 * 8, 0 + 4 * 8, 1 + 2 * 8,
     2 + 0 * 8, 1 + 3 * 8, 0 + 5 * 8, 0 + 6 * 8,
@@ -73,7 +74,7 @@ static const uint8_t field_scan8x8[64] = {
     7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8,
 };
 
-static const uint8_t field_scan8x8_cavlc[64] = {
+static const uint8_t field_scan8x8_cavlc[64+1] = {
     0 + 0 * 8, 1 + 1 * 8, 2 + 0 * 8, 0 + 7 * 8,
     2 + 2 * 8, 2 + 3 * 8, 2 + 4 * 8, 3 + 3 * 8,
     3 + 4 * 8, 4 + 3 * 8, 4 + 4 * 8, 5 + 3 * 8,
@@ -93,7 +94,7 @@ static const uint8_t field_scan8x8_cavlc[64] = {
 };
 
 // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)]
-static const uint8_t zigzag_scan8x8_cavlc[64] = {
+static const uint8_t zigzag_scan8x8_cavlc[64+1] = {
     0 + 0 * 8, 1 + 1 * 8, 1 + 2 * 8, 2 + 2 * 8,
     4 + 1 * 8, 0 + 5 * 8, 3 + 3 * 8, 7 + 0 * 8,
     3 + 4 * 8, 1 + 7 * 8, 5 + 3 * 8, 6 + 3 * 8,
@@ -135,9 +136,9 @@ static int alloc_scratch_buffers(H264SliceContext *sl, int linesize)
     // (= 21x21 for  H.264)
     av_fast_malloc(&sl->edge_emu_buffer, &sl->edge_emu_buffer_allocated, alloc_size * 2 * 21);
 
-    av_fast_malloc(&sl->top_borders[0], &sl->top_borders_allocated[0],
+    av_fast_mallocz(&sl->top_borders[0], &sl->top_borders_allocated[0],
                    h->mb_width * 16 * 3 * sizeof(uint8_t) * 2);
-    av_fast_malloc(&sl->top_borders[1], &sl->top_borders_allocated[1],
+    av_fast_mallocz(&sl->top_borders[1], &sl->top_borders_allocated[1],
                    h->mb_width * 16 * 3 * sizeof(uint8_t) * 2);
 
     if (!sl->bipred_scratchpad || !sl->edge_emu_buffer ||
@@ -206,6 +207,18 @@ static int alloc_picture(H264Context *h, H264Picture *pic)
             pic->hwaccel_picture_private = pic->hwaccel_priv_buf->data;
         }
     }
+    if (CONFIG_GRAY && !h->avctx->hwaccel && h->flags & AV_CODEC_FLAG_GRAY && pic->f->data[2]) {
+        int h_chroma_shift, v_chroma_shift;
+        av_pix_fmt_get_chroma_sub_sample(pic->f->format,
+                                         &h_chroma_shift, &v_chroma_shift);
+
+        for(i=0; i<AV_CEIL_RSHIFT(pic->f->height, v_chroma_shift); i++) {
+            memset(pic->f->data[1] + pic->f->linesize[1]*i,
+                   0x80, AV_CEIL_RSHIFT(pic->f->width, h_chroma_shift));
+            memset(pic->f->data[2] + pic->f->linesize[2]*i,
+                   0x80, AV_CEIL_RSHIFT(pic->f->width, h_chroma_shift));
+        }
+    }
 
     if (!h->qscale_table_pool) {
         ret = init_table_pools(h);
@@ -237,61 +250,24 @@ fail:
     return (ret < 0) ? ret : AVERROR(ENOMEM);
 }
 
-static inline int pic_is_unused(H264Context *h, H264Picture *pic)
-{
-    if (!pic->f->buf[0])
-        return 1;
-    return 0;
-}
-
 static int find_unused_picture(H264Context *h)
 {
     int i;
 
     for (i = 0; i < H264_MAX_PICTURE_COUNT; i++) {
-        if (pic_is_unused(h, &h->DPB[i]))
-            break;
+        if (!h->DPB[i].f->buf[0])
+            return i;
     }
-    if (i == H264_MAX_PICTURE_COUNT)
-        return AVERROR_INVALIDDATA;
-
-    return i;
+    return AVERROR_INVALIDDATA;
 }
 
-static int initialize_cur_frame(H264Context *h)
-{
-    H264Picture *cur;
-    int ret;
-
-    release_unused_pictures(h, 1);
-    ff_h264_unref_picture(h, &h->cur_pic);
-    h->cur_pic_ptr = NULL;
-
-    ret = find_unused_picture(h);
-    if (ret < 0) {
-        av_log(h->avctx, AV_LOG_ERROR, "no frame buffer available\n");
-        return ret;
-    }
-    cur = &h->DPB[ret];
 
-    ret = alloc_picture(h, cur);
-    if (ret < 0)
-        return ret;
-
-    ret = ff_h264_ref_picture(h, &h->cur_pic, cur);
-    if (ret < 0)
-        return ret;
-    h->cur_pic_ptr = cur;
-
-    return 0;
-}
-
-#define IN_RANGE(a, b, size) (((a) >= (b)) && ((a) < ((b) + (size))))
+#define IN_RANGE(a, b, size) (((void*)(a) >= (void*)(b)) && ((void*)(a) < (void*)((b) + (size))))
 
 #define REBASE_PICTURE(pic, new_ctx, old_ctx)             \
-    ((pic && pic >= old_ctx->DPB &&                       \
-      pic < old_ctx->DPB + H264_MAX_PICTURE_COUNT) ?          \
-     &new_ctx->DPB[pic - old_ctx->DPB] : NULL)
+    (((pic) && (pic) >= (old_ctx)->DPB &&                       \
+      (pic) < (old_ctx)->DPB + H264_MAX_PICTURE_COUNT) ?          \
+     &(new_ctx)->DPB[(pic) - (old_ctx)->DPB] : NULL)
 
 static void copy_picture_range(H264Picture **to, H264Picture **from, int count,
                                H264Context *new_base,
@@ -300,10 +276,9 @@ static void copy_picture_range(H264Picture **to, H264Picture **from, int count,
     int i;
 
     for (i = 0; i < count; i++) {
-        assert((IN_RANGE(from[i], old_base, sizeof(*old_base)) ||
-                IN_RANGE(from[i], old_base->DPB,
-                         sizeof(H264Picture) * H264_MAX_PICTURE_COUNT) ||
-                !from[i]));
+        av_assert1(!from[i] ||
+                   IN_RANGE(from[i], old_base, 1) ||
+                   IN_RANGE(from[i], old_base->DPB, H264_MAX_PICTURE_COUNT));
         to[i] = REBASE_PICTURE(from[i], new_base, old_base);
     }
 }
@@ -318,11 +293,12 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     int need_reinit = 0;
     int i, ret;
 
-    if (dst == src || !h1->context_initialized)
+    if (dst == src)
         return 0;
 
-    if (!h1->ps.sps)
-        return AVERROR_INVALIDDATA;
+    // We can't fail if SPS isn't set at it breaks current skip_frame code
+    //if (!h1->ps.sps)
+    //    return AVERROR_INVALIDDATA;
 
     if (inited &&
         (h->width                 != h1->width                 ||
@@ -336,6 +312,9 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         need_reinit = 1;
     }
 
+    /* copy block_offset since frame_start may not be called */
+    memcpy(h->block_offset, h1->block_offset, sizeof(h->block_offset));
+
     // SPS/PPS
     for (i = 0; i < FF_ARRAY_ELEMS(h->ps.sps_list); i++) {
         av_buffer_unref(&h->ps.sps_list[i]);
@@ -354,7 +333,22 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         }
     }
 
-    h->ps.sps = h1->ps.sps;
+    av_buffer_unref(&h->ps.pps_ref);
+    av_buffer_unref(&h->ps.sps_ref);
+    h->ps.pps = NULL;
+    h->ps.sps = NULL;
+    if (h1->ps.pps_ref) {
+        h->ps.pps_ref = av_buffer_ref(h1->ps.pps_ref);
+        if (!h->ps.pps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.pps = (const PPS*)h->ps.pps_ref->data;
+    }
+    if (h1->ps.sps_ref) {
+        h->ps.sps_ref = av_buffer_ref(h1->ps.sps_ref);
+        if (!h->ps.sps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.sps = (const SPS*)h->ps.sps_ref->data;
+    }
 
     if (need_reinit || !inited) {
         h->width     = h1->width;
@@ -364,10 +358,13 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         h->mb_num    = h1->mb_num;
         h->mb_stride = h1->mb_stride;
         h->b_stride  = h1->b_stride;
+        h->x264_build = h1->x264_build;
 
-        if ((err = h264_slice_header_init(h)) < 0) {
-            av_log(h->avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
-            return err;
+        if (h->context_initialized || h1->context_initialized) {
+            if ((err = h264_slice_header_init(h)) < 0) {
+                av_log(h->avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
+                return err;
+            }
         }
 
         /* copy block_offset since frame_start may not be called */
@@ -403,7 +400,6 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
 
     h->enable_er       = h1->enable_er;
     h->workaround_bugs = h1->workaround_bugs;
-    h->x264_build      = h1->x264_build;
     h->droppable       = h1->droppable;
 
     // extradata/NAL handling
@@ -417,6 +413,7 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     memcpy(h->delayed_pic, h1->delayed_pic, sizeof(h->delayed_pic));
     memcpy(h->last_pocs,   h1->last_pocs,   sizeof(h->last_pocs));
 
+    h->next_output_pic   = h1->next_output_pic;
     h->next_outputed_poc = h1->next_outputed_poc;
 
     memcpy(h->mmco, h1->mmco, sizeof(h->mmco));
@@ -431,6 +428,15 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     copy_picture_range(h->delayed_pic, h1->delayed_pic,
                        MAX_DELAYED_PIC_COUNT + 2, h, h1);
 
+    h->frame_recovered       = h1->frame_recovered;
+
+    av_buffer_unref(&h->sei.a53_caption.buf_ref);
+    if (h1->sei.a53_caption.buf_ref) {
+        h->sei.a53_caption.buf_ref = av_buffer_ref(h1->sei.a53_caption.buf_ref);
+        if (!h->sei.a53_caption.buf_ref)
+            return AVERROR(ENOMEM);
+    }
+
     if (!h->cur_pic_ptr)
         return 0;
 
@@ -443,7 +449,6 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     h->poc.prev_frame_num        = h->poc.frame_num;
 
     h->recovery_frame        = h1->recovery_frame;
-    h->frame_recovered       = h1->frame_recovered;
 
     return err;
 }
@@ -453,12 +458,28 @@ static int h264_frame_start(H264Context *h)
     H264Picture *pic;
     int i, ret;
     const int pixel_shift = h->pixel_shift;
+    int c[4] = {
+        1<<(h->ps.sps->bit_depth_luma-1),
+        1<<(h->ps.sps->bit_depth_chroma-1),
+        1<<(h->ps.sps->bit_depth_chroma-1),
+        -1
+    };
 
-    ret = initialize_cur_frame(h);
-    if (ret < 0)
-        return ret;
+    if (!ff_thread_can_start_frame(h->avctx)) {
+        av_log(h->avctx, AV_LOG_ERROR, "Attempt to start a frame outside SETUP state\n");
+        return -1;
+    }
+
+    release_unused_pictures(h, 1);
+    h->cur_pic_ptr = NULL;
+
+    i = find_unused_picture(h);
+    if (i < 0) {
+        av_log(h->avctx, AV_LOG_ERROR, "no frame buffer available\n");
+        return i;
+    }
+    pic = &h->DPB[i];
 
-    pic = h->cur_pic_ptr;
     pic->reference              = h->droppable ? 0 : h->picture_structure;
     pic->f->coded_picture_number = h->coded_picture_number++;
     pic->field_picture          = h->picture_structure != PICT_FRAME;
@@ -471,6 +492,8 @@ static int h264_frame_start(H264Context *h)
     pic->f->key_frame = 0;
     pic->mmco_reset  = 0;
     pic->recovered   = 0;
+    pic->invalid_gap = 0;
+    pic->sei_recovery_frame_cnt = h->sei.recovery_point.recovery_frame_cnt;
 
     pic->f->pict_type = h->slice_ctx[0].slice_type;
 
@@ -479,8 +502,30 @@ static int h264_frame_start(H264Context *h)
     pic->f->crop_top    = h->crop_top;
     pic->f->crop_bottom = h->crop_bottom;
 
-    if (CONFIG_ERROR_RESILIENCE && h->enable_er)
+    if ((ret = alloc_picture(h, pic)) < 0)
+        return ret;
+    if(!h->frame_recovered && !h->avctx->hwaccel)
+        ff_color_frame(pic->f, c);
+
+    h->cur_pic_ptr = pic;
+    ff_h264_unref_picture(h, &h->cur_pic);
+    if (CONFIG_ERROR_RESILIENCE) {
+        ff_h264_set_erpic(&h->slice_ctx[0].er.cur_pic, NULL);
+    }
+
+    if ((ret = ff_h264_ref_picture(h, &h->cur_pic, h->cur_pic_ptr)) < 0)
+        return ret;
+
+    for (i = 0; i < h->nb_slice_ctx; i++) {
+        h->slice_ctx[i].linesize   = h->cur_pic_ptr->f->linesize[0];
+        h->slice_ctx[i].uvlinesize = h->cur_pic_ptr->f->linesize[1];
+    }
+
+    if (CONFIG_ERROR_RESILIENCE && h->enable_er) {
         ff_er_frame_start(&h->slice_ctx[0].er);
+        ff_h264_set_erpic(&h->slice_ctx[0].er.last_pic, NULL);
+        ff_h264_set_erpic(&h->slice_ctx[0].er.next_pic, NULL);
+    }
 
     for (i = 0; i < 16; i++) {
         h->block_offset[i]           = (4 * ((scan8[i] - scan8[0]) & 7) << pixel_shift) + 4 * pic->f->linesize[0] * ((scan8[i] - scan8[0]) >> 3);
@@ -493,11 +538,6 @@ static int h264_frame_start(H264Context *h)
         h->block_offset[48 + 32 + i] = (4 * ((scan8[i] - scan8[0]) & 7) << pixel_shift) + 8 * pic->f->linesize[1] * ((scan8[i] - scan8[0]) >> 3);
     }
 
-    /* Some macroblocks can be accessed before they're available in case
-     * of lost slices, MBAFF or threading. */
-    memset(h->slice_table, -1,
-           (h->mb_height * h->mb_stride - 1) * sizeof(*h->slice_table));
-
     /* We mark the current picture as non-reference after allocating it, so
      * that if we break out due to an error it can be released automatically
      * in the next ff_mpv_frame_start().
@@ -506,6 +546,8 @@ static int h264_frame_start(H264Context *h)
 
     h->cur_pic_ptr->field_poc[0] = h->cur_pic_ptr->field_poc[1] = INT_MAX;
 
+    h->next_output_pic = NULL;
+
     h->postpone_filter = 0;
 
     h->mb_aff_frame = h->ps.sps->mb_aff && (h->picture_structure == PICT_FRAME);
@@ -636,7 +678,7 @@ static void implicit_weight_table(const H264Context *h, H264SliceContext *sl, in
             cur_poc = h->cur_pic_ptr->field_poc[h->picture_structure - 1];
         }
         if (sl->ref_count[0] == 1 && sl->ref_count[1] == 1 && !FRAME_MBAFF(h) &&
-            sl->ref_list[0][0].poc + sl->ref_list[1][0].poc == 2 * cur_poc) {
+            sl->ref_list[0][0].poc + (int64_t)sl->ref_list[1][0].poc == 2LL * cur_poc) {
             sl->pwt.use_weight        = 0;
             sl->pwt.use_weight_chroma = 0;
             return;
@@ -657,7 +699,7 @@ static void implicit_weight_table(const H264Context *h, H264SliceContext *sl, in
     sl->pwt.chroma_log2_weight_denom = 5;
 
     for (ref0 = ref_start; ref0 < ref_count0; ref0++) {
-        int poc0 = sl->ref_list[0][ref0].poc;
+        int64_t poc0 = sl->ref_list[0][ref0].poc;
         for (ref1 = ref_start; ref1 < ref_count1; ref1++) {
             int w = 32;
             if (!sl->ref_list[0][ref0].parent->long_ref && !sl->ref_list[1][ref1].parent->long_ref) {
@@ -688,13 +730,13 @@ static void init_scan_tables(H264Context *h)
 {
     int i;
     for (i = 0; i < 16; i++) {
-#define TRANSPOSE(x) (x >> 2) | ((x << 2) & 0xF)
+#define TRANSPOSE(x) ((x) >> 2) | (((x) << 2) & 0xF)
         h->zigzag_scan[i] = TRANSPOSE(ff_zigzag_scan[i]);
         h->field_scan[i]  = TRANSPOSE(field_scan[i]);
 #undef TRANSPOSE
     }
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) ((x) >> 3) | (((x) & 7) << 3)
         h->zigzag_scan8x8[i]       = TRANSPOSE(ff_zigzag_direct[i]);
         h->zigzag_scan8x8_cavlc[i] = TRANSPOSE(zigzag_scan8x8_cavlc[i]);
         h->field_scan8x8[i]        = TRANSPOSE(field_scan8x8[i]);
@@ -702,32 +744,33 @@ static void init_scan_tables(H264Context *h)
 #undef TRANSPOSE
     }
     if (h->ps.sps->transform_bypass) { // FIXME same ugly
-        h->zigzag_scan_q0          = ff_zigzag_scan;
-        h->zigzag_scan8x8_q0       = ff_zigzag_direct;
-        h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
-        h->field_scan_q0           = field_scan;
-        h->field_scan8x8_q0        = field_scan8x8;
-        h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
+        memcpy(h->zigzag_scan_q0          , ff_zigzag_scan          , sizeof(h->zigzag_scan_q0         ));
+        memcpy(h->zigzag_scan8x8_q0       , ff_zigzag_direct        , sizeof(h->zigzag_scan8x8_q0      ));
+        memcpy(h->zigzag_scan8x8_cavlc_q0 , zigzag_scan8x8_cavlc    , sizeof(h->zigzag_scan8x8_cavlc_q0));
+        memcpy(h->field_scan_q0           , field_scan              , sizeof(h->field_scan_q0          ));
+        memcpy(h->field_scan8x8_q0        , field_scan8x8           , sizeof(h->field_scan8x8_q0       ));
+        memcpy(h->field_scan8x8_cavlc_q0  , field_scan8x8_cavlc     , sizeof(h->field_scan8x8_cavlc_q0 ));
     } else {
-        h->zigzag_scan_q0          = h->zigzag_scan;
-        h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
-        h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
-        h->field_scan_q0           = h->field_scan;
-        h->field_scan8x8_q0        = h->field_scan8x8;
-        h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
+        memcpy(h->zigzag_scan_q0          , h->zigzag_scan          , sizeof(h->zigzag_scan_q0         ));
+        memcpy(h->zigzag_scan8x8_q0       , h->zigzag_scan8x8       , sizeof(h->zigzag_scan8x8_q0      ));
+        memcpy(h->zigzag_scan8x8_cavlc_q0 , h->zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0));
+        memcpy(h->field_scan_q0           , h->field_scan           , sizeof(h->field_scan_q0          ));
+        memcpy(h->field_scan8x8_q0        , h->field_scan8x8        , sizeof(h->field_scan8x8_q0       ));
+        memcpy(h->field_scan8x8_cavlc_q0  , h->field_scan8x8_cavlc  , sizeof(h->field_scan8x8_cavlc_q0 ));
     }
 }
 
-static enum AVPixelFormat get_pixel_format(H264Context *h)
+static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
 {
 #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \
                      (CONFIG_H264_D3D11VA_HWACCEL * 2) + \
+                     CONFIG_H264_NVDEC_HWACCEL + \
                      CONFIG_H264_VAAPI_HWACCEL + \
-                     (CONFIG_H264_VDA_HWACCEL * 2) + \
-                     CONFIG_H264_VDPAU_HWACCEL     + \
-                     CONFIG_H264_CUVID_HWACCEL)
+                     CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \
+                     CONFIG_H264_VDPAU_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
     const enum AVPixelFormat *choices = pix_fmts;
+    int i;
 
     switch (h->ps.sps->bit_depth_luma) {
     case 9:
@@ -752,11 +795,33 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
         else
             *fmt++ = AV_PIX_FMT_YUV420P10;
         break;
+    case 12:
+        if (CHROMA444(h)) {
+            if (h->avctx->colorspace == AVCOL_SPC_RGB) {
+                *fmt++ = AV_PIX_FMT_GBRP12;
+            } else
+                *fmt++ = AV_PIX_FMT_YUV444P12;
+        } else if (CHROMA422(h))
+            *fmt++ = AV_PIX_FMT_YUV422P12;
+        else
+            *fmt++ = AV_PIX_FMT_YUV420P12;
+        break;
+    case 14:
+        if (CHROMA444(h)) {
+            if (h->avctx->colorspace == AVCOL_SPC_RGB) {
+                *fmt++ = AV_PIX_FMT_GBRP14;
+            } else
+                *fmt++ = AV_PIX_FMT_YUV444P14;
+        } else if (CHROMA422(h))
+            *fmt++ = AV_PIX_FMT_YUV422P14;
+        else
+            *fmt++ = AV_PIX_FMT_YUV420P14;
+        break;
     case 8:
 #if CONFIG_H264_VDPAU_HWACCEL
         *fmt++ = AV_PIX_FMT_VDPAU;
 #endif
-#if CONFIG_H264_CUVID_HWACCEL
+#if CONFIG_H264_NVDEC_HWACCEL
         *fmt++ = AV_PIX_FMT_CUDA;
 #endif
         if (CHROMA444(h)) {
@@ -782,9 +847,8 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
 #if CONFIG_H264_VAAPI_HWACCEL
             *fmt++ = AV_PIX_FMT_VAAPI;
 #endif
-#if CONFIG_H264_VDA_HWACCEL
-            *fmt++ = AV_PIX_FMT_VDA_VLD;
-            *fmt++ = AV_PIX_FMT_VDA;
+#if CONFIG_H264_VIDEOTOOLBOX_HWACCEL
+            *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
 #endif
             if (h->avctx->codec->pix_fmts)
                 choices = h->avctx->codec->pix_fmts;
@@ -802,25 +866,32 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
 
     *fmt = AV_PIX_FMT_NONE;
 
-    return ff_get_format(h->avctx, choices);
+    for (i=0; choices[i] != AV_PIX_FMT_NONE; i++)
+        if (choices[i] == h->avctx->pix_fmt && !force_callback)
+            return choices[i];
+    return ff_thread_get_format(h->avctx, choices);
 }
 
 /* export coded and cropped frame dimensions to AVCodecContext */
 static int init_dimensions(H264Context *h)
 {
-    SPS *sps = h->ps.sps;
+    const SPS *sps = (const SPS*)h->ps.sps;
     int cr = sps->crop_right;
     int cl = sps->crop_left;
     int ct = sps->crop_top;
     int cb = sps->crop_bottom;
     int width  = h->width  - (cr + cl);
     int height = h->height - (ct + cb);
+    av_assert0(sps->crop_right + sps->crop_left < (unsigned)h->width);
+    av_assert0(sps->crop_top + sps->crop_bottom < (unsigned)h->height);
 
     /* handle container cropping */
     if (h->width_from_caller > 0 && h->height_from_caller > 0     &&
         !sps->crop_top && !sps->crop_left                         &&
         FFALIGN(h->width_from_caller,  16) == FFALIGN(width,  16) &&
-        FFALIGN(h->height_from_caller, 16) == FFALIGN(height, 16)) {
+        FFALIGN(h->height_from_caller, 16) == FFALIGN(height, 16) &&
+        h->width_from_caller  <= width &&
+        h->height_from_caller <= height) {
         width  = h->width_from_caller;
         height = h->height_from_caller;
         cl = 0;
@@ -858,7 +929,7 @@ static int h264_slice_header_init(H264Context *h)
         if (h->x264_build < 44U)
             den *= 2;
         av_reduce(&h->avctx->framerate.den, &h->avctx->framerate.num,
-                  sps->num_units_in_tick, den, 1 << 30);
+                  sps->num_units_in_tick * h->avctx->ticks_per_frame, den, 1 << 30);
     }
 
     ff_h264_free_tables(h);
@@ -870,16 +941,21 @@ static int h264_slice_header_init(H264Context *h)
     ret = ff_h264_alloc_tables(h);
     if (ret < 0) {
         av_log(h->avctx, AV_LOG_ERROR, "Could not allocate memory\n");
-        return ret;
+        goto fail;
     }
 
-    if (sps->bit_depth_luma < 8 || sps->bit_depth_luma > 10) {
+    if (sps->bit_depth_luma < 8 || sps->bit_depth_luma > 14 ||
+        sps->bit_depth_luma == 11 || sps->bit_depth_luma == 13
+    ) {
         av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n",
                sps->bit_depth_luma);
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
     }
 
+    h->cur_bit_depth_luma         =
     h->avctx->bits_per_raw_sample = sps->bit_depth_luma;
+    h->cur_chroma_format_idc      = sps->chroma_format_idc;
     h->pixel_shift                = sps->bit_depth_luma > 8;
     h->chroma_format_idc          = sps->chroma_format_idc;
     h->bit_depth_luma             = sps->bit_depth_luma;
@@ -896,7 +972,7 @@ static int h264_slice_header_init(H264Context *h)
         ret = ff_h264_slice_context_init(h, &h->slice_ctx[0]);
         if (ret < 0) {
             av_log(h->avctx, AV_LOG_ERROR, "context_init() failed.\n");
-            return ret;
+            goto fail;
         }
     } else {
         for (i = 0; i < h->nb_slice_ctx; i++) {
@@ -909,7 +985,7 @@ static int h264_slice_header_init(H264Context *h)
 
             if ((ret = ff_h264_slice_context_init(h, sl)) < 0) {
                 av_log(h->avctx, AV_LOG_ERROR, "context_init() failed.\n");
-                return ret;
+                goto fail;
             }
         }
     }
@@ -917,16 +993,51 @@ static int h264_slice_header_init(H264Context *h)
     h->context_initialized = 1;
 
     return 0;
+fail:
+    ff_h264_free_tables(h);
+    h->context_initialized = 0;
+    return ret;
+}
+
+static enum AVPixelFormat non_j_pixfmt(enum AVPixelFormat a)
+{
+    switch (a) {
+    case AV_PIX_FMT_YUVJ420P: return AV_PIX_FMT_YUV420P;
+    case AV_PIX_FMT_YUVJ422P: return AV_PIX_FMT_YUV422P;
+    case AV_PIX_FMT_YUVJ444P: return AV_PIX_FMT_YUV444P;
+    default:
+        return a;
+    }
 }
 
-static int h264_init_ps(H264Context *h, const H264SliceContext *sl)
+static int h264_init_ps(H264Context *h, const H264SliceContext *sl, int first_slice)
 {
     const SPS *sps;
-    int needs_reinit = 0, ret;
+    int needs_reinit = 0, must_reinit, ret;
+
+    if (first_slice) {
+        av_buffer_unref(&h->ps.pps_ref);
+        h->ps.pps = NULL;
+        h->ps.pps_ref = av_buffer_ref(h->ps.pps_list[sl->pps_id]);
+        if (!h->ps.pps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.pps = (const PPS*)h->ps.pps_ref->data;
+    }
 
-    h->ps.pps = (const PPS*)h->ps.pps_list[sl->pps_id]->data;
     if (h->ps.sps != (const SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data) {
-        h->ps.sps = (SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data;
+        av_buffer_unref(&h->ps.sps_ref);
+        h->ps.sps = NULL;
+        h->ps.sps_ref = av_buffer_ref(h->ps.sps_list[h->ps.pps->sps_id]);
+        if (!h->ps.sps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.sps = (const SPS*)h->ps.sps_ref->data;
+
+        if (h->mb_width  != h->ps.sps->mb_width ||
+            h->mb_height != h->ps.sps->mb_height ||
+            h->cur_bit_depth_luma    != h->ps.sps->bit_depth_luma ||
+            h->cur_chroma_format_idc != h->ps.sps->chroma_format_idc
+        )
+            needs_reinit = 1;
 
         if (h->bit_depth_luma    != h->ps.sps->bit_depth_luma ||
             h->chroma_format_idc != h->ps.sps->chroma_format_idc)
@@ -934,43 +1045,57 @@ static int h264_init_ps(H264Context *h, const H264SliceContext *sl)
     }
     sps = h->ps.sps;
 
-    h->avctx->profile = ff_h264_get_profile(sps);
-    h->avctx->level   = sps->level_idc;
-    h->avctx->refs    = sps->ref_frame_count;
+    must_reinit = (h->context_initialized &&
+                    (   16*sps->mb_width != h->avctx->coded_width
+                     || 16*sps->mb_height != h->avctx->coded_height
+                     || h->cur_bit_depth_luma    != sps->bit_depth_luma
+                     || h->cur_chroma_format_idc != sps->chroma_format_idc
+                     || h->mb_width  != sps->mb_width
+                     || h->mb_height != sps->mb_height
+                    ));
+    if (h->avctx->pix_fmt == AV_PIX_FMT_NONE
+        || (non_j_pixfmt(h->avctx->pix_fmt) != non_j_pixfmt(get_pixel_format(h, 0))))
+        must_reinit = 1;
+
+    if (first_slice && av_cmp_q(sps->sar, h->avctx->sample_aspect_ratio))
+        must_reinit = 1;
 
-    if (h->mb_width  != sps->mb_width ||
-        h->mb_height != sps->mb_height)
-        needs_reinit = 1;
+    if (!h->setup_finished) {
+        h->avctx->profile = ff_h264_get_profile(sps);
+        h->avctx->level   = sps->level_idc;
+        h->avctx->refs    = sps->ref_frame_count;
 
-    h->mb_width  = sps->mb_width;
-    h->mb_height = sps->mb_height;
-    h->mb_num    = h->mb_width * h->mb_height;
-    h->mb_stride = h->mb_width + 1;
+        h->mb_width  = sps->mb_width;
+        h->mb_height = sps->mb_height;
+        h->mb_num    = h->mb_width * h->mb_height;
+        h->mb_stride = h->mb_width + 1;
 
-    h->b_stride = h->mb_width * 4;
+        h->b_stride = h->mb_width * 4;
 
-    h->chroma_y_shift = sps->chroma_format_idc <= 1; // 400 uses yuv420p
+        h->chroma_y_shift = sps->chroma_format_idc <= 1; // 400 uses yuv420p
 
-    h->width  = 16 * h->mb_width;
-    h->height = 16 * h->mb_height;
+        h->width  = 16 * h->mb_width;
+        h->height = 16 * h->mb_height;
 
-    ret = init_dimensions(h);
-    if (ret < 0)
-        return ret;
+        ret = init_dimensions(h);
+        if (ret < 0)
+            return ret;
 
-    if (sps->video_signal_type_present_flag) {
-        h->avctx->color_range = sps->full_range ? AVCOL_RANGE_JPEG
-            : AVCOL_RANGE_MPEG;
-        if (sps->colour_description_present_flag) {
-            if (h->avctx->colorspace != sps->colorspace)
-                needs_reinit = 1;
-            h->avctx->color_primaries = sps->color_primaries;
-            h->avctx->color_trc       = sps->color_trc;
-            h->avctx->colorspace      = sps->colorspace;
+        if (sps->video_signal_type_present_flag) {
+            h->avctx->color_range = sps->full_range > 0 ? AVCOL_RANGE_JPEG
+                                                        : AVCOL_RANGE_MPEG;
+            if (sps->colour_description_present_flag) {
+                if (h->avctx->colorspace != sps->colorspace)
+                    needs_reinit = 1;
+                h->avctx->color_primaries = sps->color_primaries;
+                h->avctx->color_trc       = sps->color_trc;
+                h->avctx->colorspace      = sps->colorspace;
+            }
         }
     }
 
-    if (!h->context_initialized || needs_reinit) {
+    if (!h->context_initialized || must_reinit || needs_reinit) {
+        int flush_changes = h->context_initialized;
         h->context_initialized = 0;
         if (sl != h->slice_ctx) {
             av_log(h->avctx, AV_LOG_ERROR,
@@ -982,14 +1107,17 @@ static int h264_init_ps(H264Context *h, const H264SliceContext *sl)
             return AVERROR_INVALIDDATA;
         }
 
-        ff_h264_flush_change(h);
+        av_assert1(first_slice);
 
-        if ((ret = get_pixel_format(h)) < 0)
+        if (flush_changes)
+            ff_h264_flush_change(h);
+
+        if ((ret = get_pixel_format(h, 1)) < 0)
             return ret;
         h->avctx->pix_fmt = ret;
 
         av_log(h->avctx, AV_LOG_VERBOSE, "Reinit context to %dx%d, "
-               "pix_fmt: %d\n", h->width, h->height, h->avctx->pix_fmt);
+               "pix_fmt: %s\n", h->width, h->height, av_get_pix_fmt_name(h->avctx->pix_fmt));
 
         if ((ret = h264_slice_header_init(h)) < 0) {
             av_log(h->avctx, AV_LOG_ERROR,
@@ -1058,8 +1186,7 @@ static int h264_export_frame_props(H264Context *h)
         /* Derive top_field_first from field pocs. */
         cur->f->top_field_first = cur->field_poc[0] < cur->field_poc[1];
     } else {
-        if (cur->f->interlaced_frame ||
-            (sps->pic_struct_present_flag && h->sei.picture_timing.present)) {
+        if (sps->pic_struct_present_flag && h->sei.picture_timing.present) {
             /* Use picture timing SEI information. Even if it is a
              * information of a past frame, better than nothing. */
             if (h->sei.picture_timing.pic_struct == H264_SEI_PIC_STRUCT_TOP_BOTTOM ||
@@ -1067,6 +1194,10 @@ static int h264_export_frame_props(H264Context *h)
                 cur->f->top_field_first = 1;
             else
                 cur->f->top_field_first = 0;
+        } else if (cur->f->interlaced_frame) {
+            /* Default to top field first when pic_struct_present_flag
+             * is not set but interlaced frame detected */
+            cur->f->top_field_first = 1;
         } else {
             /* Most likely progressive */
             cur->f->top_field_first = 0;
@@ -1074,38 +1205,35 @@ static int h264_export_frame_props(H264Context *h)
     }
 
     if (h->sei.frame_packing.present &&
-        h->sei.frame_packing.arrangement_type >= 0 &&
         h->sei.frame_packing.arrangement_type <= 6 &&
         h->sei.frame_packing.content_interpretation_type > 0 &&
         h->sei.frame_packing.content_interpretation_type < 3) {
         H264SEIFramePacking *fp = &h->sei.frame_packing;
         AVStereo3D *stereo = av_stereo3d_create_side_data(cur->f);
-        if (!stereo)
-            return AVERROR(ENOMEM);
-
+        if (stereo) {
         switch (fp->arrangement_type) {
-        case 0:
+        case H264_SEI_FPA_TYPE_CHECKERBOARD:
             stereo->type = AV_STEREO3D_CHECKERBOARD;
             break;
-        case 1:
+        case H264_SEI_FPA_TYPE_INTERLEAVE_COLUMN:
             stereo->type = AV_STEREO3D_COLUMNS;
             break;
-        case 2:
+        case H264_SEI_FPA_TYPE_INTERLEAVE_ROW:
             stereo->type = AV_STEREO3D_LINES;
             break;
-        case 3:
-            if (fp->quincunx_subsampling)
+        case H264_SEI_FPA_TYPE_SIDE_BY_SIDE:
+            if (fp->quincunx_sampling_flag)
                 stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
             else
                 stereo->type = AV_STEREO3D_SIDEBYSIDE;
             break;
-        case 4:
+        case H264_SEI_FPA_TYPE_TOP_BOTTOM:
             stereo->type = AV_STEREO3D_TOPBOTTOM;
             break;
-        case 5:
+        case H264_SEI_FPA_TYPE_INTERLEAVE_TEMPORAL:
             stereo->type = AV_STEREO3D_FRAMESEQUENCE;
             break;
-        case 6:
+        case H264_SEI_FPA_TYPE_2D:
             stereo->type = AV_STEREO3D_2D;
             break;
         }
@@ -1113,12 +1241,13 @@ static int h264_export_frame_props(H264Context *h)
         if (fp->content_interpretation_type == 2)
             stereo->flags = AV_STEREO3D_FLAG_INVERT;
 
-        if (fp->arrangement_type == 5) {
+        if (fp->arrangement_type == H264_SEI_FPA_TYPE_INTERLEAVE_TEMPORAL) {
             if (fp->current_frame_is_frame0_flag)
                 stereo->view = AV_STEREO3D_VIEW_LEFT;
             else
                 stereo->view = AV_STEREO3D_VIEW_RIGHT;
         }
+        }
     }
 
     if (h->sei.display_orientation.present &&
@@ -1130,35 +1259,77 @@ static int h264_export_frame_props(H264Context *h)
         AVFrameSideData *rotation = av_frame_new_side_data(cur->f,
                                                            AV_FRAME_DATA_DISPLAYMATRIX,
                                                            sizeof(int32_t) * 9);
-        if (!rotation)
-            return AVERROR(ENOMEM);
-
-        av_display_rotation_set((int32_t *)rotation->data, angle);
-        av_display_matrix_flip((int32_t *)rotation->data,
-                               o->hflip, o->vflip);
+        if (rotation) {
+            av_display_rotation_set((int32_t *)rotation->data, angle);
+            av_display_matrix_flip((int32_t *)rotation->data,
+                                   o->hflip, o->vflip);
+        }
     }
 
     if (h->sei.afd.present) {
         AVFrameSideData *sd = av_frame_new_side_data(cur->f, AV_FRAME_DATA_AFD,
                                                      sizeof(uint8_t));
-        if (!sd)
-            return AVERROR(ENOMEM);
 
-        *sd->data = h->sei.afd.active_format_description;
-        h->sei.afd.present = 0;
+        if (sd) {
+            *sd->data = h->sei.afd.active_format_description;
+            h->sei.afd.present = 0;
+        }
     }
 
-    if (h->sei.a53_caption.a53_caption) {
+    if (h->sei.a53_caption.buf_ref) {
         H264SEIA53Caption *a53 = &h->sei.a53_caption;
-        AVFrameSideData *sd = av_frame_new_side_data(cur->f,
-                                                     AV_FRAME_DATA_A53_CC,
-                                                     a53->a53_caption_size);
+
+        AVFrameSideData *sd = av_frame_new_side_data_from_buf(cur->f, AV_FRAME_DATA_A53_CC, a53->buf_ref);
         if (!sd)
+            av_buffer_unref(&a53->buf_ref);
+        a53->buf_ref = NULL;
+
+        h->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
+    }
+
+    if (h->sei.picture_timing.timecode_cnt > 0) {
+        uint32_t tc = 0;
+        uint32_t *tc_sd;
+
+        AVFrameSideData *tcside = av_frame_new_side_data(cur->f,
+                                                         AV_FRAME_DATA_S12M_TIMECODE,
+                                                         sizeof(uint32_t)*4);
+        if (!tcside)
             return AVERROR(ENOMEM);
 
-        memcpy(sd->data, a53->a53_caption, a53->a53_caption_size);
-        av_freep(&a53->a53_caption);
-        a53->a53_caption_size = 0;
+        tc_sd = (uint32_t*)tcside->data;
+        tc_sd[0] = h->sei.picture_timing.timecode_cnt;
+
+        for (int i = 0; i < tc_sd[0]; i++) {
+            uint32_t frames;
+
+            /* For SMPTE 12-M timecodes, frame count is a special case if > 30 FPS.
+               See SMPTE ST 12-1:2014 Sec 12.1 for more info. */
+            if (av_cmp_q(h->avctx->framerate, (AVRational) {30, 1}) == 1) {
+                frames = h->sei.picture_timing.timecode[i].frame / 2;
+                if (h->sei.picture_timing.timecode[i].frame % 2 == 1) {
+                    if (av_cmp_q(h->avctx->framerate, (AVRational) {50, 1}) == 0)
+                        tc |= (1 << 7);
+                    else
+                        tc |= (1 << 23);
+                }
+            } else {
+                frames = h->sei.picture_timing.timecode[i].frame;
+            }
+
+            tc |= h->sei.picture_timing.timecode[i].dropframe << 30;
+            tc |= (frames / 10) << 28;
+            tc |= (frames % 10) << 24;
+            tc |= (h->sei.picture_timing.timecode[i].seconds / 10) << 20;
+            tc |= (h->sei.picture_timing.timecode[i].seconds % 10) << 16;
+            tc |= (h->sei.picture_timing.timecode[i].minutes / 10) << 12;
+            tc |= (h->sei.picture_timing.timecode[i].minutes % 10) << 8;
+            tc |= (h->sei.picture_timing.timecode[i].hours / 10) << 4;
+            tc |= (h->sei.picture_timing.timecode[i].hours % 10);
+
+            tc_sd[i + 1] = tc;
+        }
+        h->sei.picture_timing.timecode_cnt = 0;
     }
 
     if (h->sei.alternative_transfer.present &&
@@ -1176,97 +1347,76 @@ static int h264_select_output_frame(H264Context *h)
     H264Picture *out = h->cur_pic_ptr;
     H264Picture *cur = h->cur_pic_ptr;
     int i, pics, out_of_order, out_idx;
-    int invalid = 0, cnt = 0;
-    int ret;
+
+    cur->mmco_reset = h->mmco_reset;
+    h->mmco_reset = 0;
 
     if (sps->bitstream_restriction_flag ||
-        h->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL) {
+        h->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT) {
         h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, sps->num_reorder_frames);
     }
 
+    for (i = 0; 1; i++) {
+        if(i == MAX_DELAYED_PIC_COUNT || cur->poc < h->last_pocs[i]){
+            if(i)
+                h->last_pocs[i-1] = cur->poc;
+            break;
+        } else if(i) {
+            h->last_pocs[i-1]= h->last_pocs[i];
+        }
+    }
+    out_of_order = MAX_DELAYED_PIC_COUNT - i;
+    if(   cur->f->pict_type == AV_PICTURE_TYPE_B
+       || (h->last_pocs[MAX_DELAYED_PIC_COUNT-2] > INT_MIN && h->last_pocs[MAX_DELAYED_PIC_COUNT-1] - (int64_t)h->last_pocs[MAX_DELAYED_PIC_COUNT-2] > 2))
+        out_of_order = FFMAX(out_of_order, 1);
+    if (out_of_order == MAX_DELAYED_PIC_COUNT) {
+        av_log(h->avctx, AV_LOG_VERBOSE, "Invalid POC %d<%d\n", cur->poc, h->last_pocs[0]);
+        for (i = 1; i < MAX_DELAYED_PIC_COUNT; i++)
+            h->last_pocs[i] = INT_MIN;
+        h->last_pocs[0] = cur->poc;
+        cur->mmco_reset = 1;
+    } else if(h->avctx->has_b_frames < out_of_order && !sps->bitstream_restriction_flag){
+        int loglevel = h->avctx->frame_number > 1 ? AV_LOG_WARNING : AV_LOG_VERBOSE;
+        av_log(h->avctx, loglevel, "Increasing reorder buffer to %d\n", out_of_order);
+        h->avctx->has_b_frames = out_of_order;
+    }
+
     pics = 0;
     while (h->delayed_pic[pics])
         pics++;
 
-    assert(pics <= MAX_DELAYED_PIC_COUNT);
+    av_assert0(pics <= MAX_DELAYED_PIC_COUNT);
 
     h->delayed_pic[pics++] = cur;
     if (cur->reference == 0)
         cur->reference = DELAYED_PIC_REF;
 
-    /* Frame reordering. This code takes pictures from coding order and sorts
-     * them by their incremental POC value into display order. It supports POC
-     * gaps, MMCO reset codes and random resets.
-     * A "display group" can start either with a IDR frame (f.key_frame = 1),
-     * and/or can be closed down with a MMCO reset code. In sequences where
-     * there is no delay, we can't detect that (since the frame was already
-     * output to the user), so we also set h->mmco_reset to detect the MMCO
-     * reset code.
-     * FIXME: if we detect insufficient delays (as per h->avctx->has_b_frames),
-     * we increase the delay between input and output. All frames affected by
-     * the lag (e.g. those that should have been output before another frame
-     * that we already returned to the user) will be dropped. This is a bug
-     * that we will fix later. */
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++) {
-        cnt     += out->poc < h->last_pocs[i];
-        invalid += out->poc == INT_MIN;
-    }
-    if (!h->mmco_reset && !cur->f->key_frame &&
-        cnt + invalid == MAX_DELAYED_PIC_COUNT && cnt > 0) {
-        h->mmco_reset = 2;
-        if (pics > 1)
-            h->delayed_pic[pics - 2]->mmco_reset = 2;
-    }
-    if (h->mmco_reset || cur->f->key_frame) {
-        for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
-            h->last_pocs[i] = INT_MIN;
-        cnt     = 0;
-        invalid = MAX_DELAYED_PIC_COUNT;
-    }
     out     = h->delayed_pic[0];
     out_idx = 0;
-    for (i = 1; i < MAX_DELAYED_PIC_COUNT &&
-                h->delayed_pic[i] &&
-                !h->delayed_pic[i - 1]->mmco_reset &&
-                !h->delayed_pic[i]->f->key_frame;
+    for (i = 1; h->delayed_pic[i] &&
+                !h->delayed_pic[i]->f->key_frame &&
+                !h->delayed_pic[i]->mmco_reset;
          i++)
         if (h->delayed_pic[i]->poc < out->poc) {
             out     = h->delayed_pic[i];
             out_idx = i;
         }
     if (h->avctx->has_b_frames == 0 &&
-        (h->delayed_pic[0]->f->key_frame || h->mmco_reset))
+        (h->delayed_pic[0]->f->key_frame || h->delayed_pic[0]->mmco_reset))
         h->next_outputed_poc = INT_MIN;
-    out_of_order = !out->f->key_frame && !h->mmco_reset &&
-                   (out->poc < h->next_outputed_poc);
-
-    if (sps->bitstream_restriction_flag &&
-        h->avctx->has_b_frames >= sps->num_reorder_frames) {
-    } else if (out_of_order && pics - 1 == h->avctx->has_b_frames &&
-               h->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT) {
-        if (invalid + cnt < MAX_DELAYED_PIC_COUNT) {
-            h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, cnt);
-        }
-    } else if (!h->avctx->has_b_frames &&
-               ((h->next_outputed_poc != INT_MIN &&
-                 out->poc > h->next_outputed_poc + 2) ||
-                cur->f->pict_type == AV_PICTURE_TYPE_B)) {
-        h->avctx->has_b_frames++;
-    }
+    out_of_order = out->poc < h->next_outputed_poc;
 
-    if (pics > h->avctx->has_b_frames) {
+    if (out_of_order || pics > h->avctx->has_b_frames) {
         out->reference &= ~DELAYED_PIC_REF;
         for (i = out_idx; h->delayed_pic[i]; i++)
             h->delayed_pic[i] = h->delayed_pic[i + 1];
     }
-    memmove(h->last_pocs, &h->last_pocs[1],
-            sizeof(*h->last_pocs) * (MAX_DELAYED_PIC_COUNT - 1));
-    h->last_pocs[MAX_DELAYED_PIC_COUNT - 1] = cur->poc;
     if (!out_of_order && pics > h->avctx->has_b_frames) {
-        av_frame_unref(h->output_frame);
-        ret = av_frame_ref(h->output_frame, out->f);
-        if (ret < 0)
-            return ret;
+        h->next_output_pic = out;
+        if (out_idx == 0 && h->delayed_pic[0] && (h->delayed_pic[0]->f->key_frame || h->delayed_pic[0]->mmco_reset)) {
+            h->next_outputed_poc = INT_MIN;
+        } else
+            h->next_outputed_poc = out->poc;
 
         if (out->recovered) {
             // We have reached an recovery point and all frames after it in
@@ -1276,29 +1426,15 @@ static int h264_select_output_frame(H264Context *h)
         out->recovered |= !!(h->frame_recovered & FRAME_RECOVERED_SEI);
 
         if (!out->recovered) {
-            if (!(h->avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT))
-                av_frame_unref(h->output_frame);
-            else
-                h->output_frame->flags |= AV_FRAME_FLAG_CORRUPT;
-        }
-
-        if (out->mmco_reset) {
-            if (out_idx > 0) {
-                h->next_outputed_poc                    = out->poc;
-                h->delayed_pic[out_idx - 1]->mmco_reset = out->mmco_reset;
+            if (!(h->avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) &&
+                !(h->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL)) {
+                h->next_output_pic = NULL;
             } else {
-                h->next_outputed_poc = INT_MIN;
-            }
-        } else {
-            if (out_idx == 0 && pics > 1 && h->delayed_pic[0]->f->key_frame) {
-                h->next_outputed_poc = INT_MIN;
-            } else {
-                h->next_outputed_poc = out->poc;
+                out->f->flags |= AV_FRAME_FLAG_CORRUPT;
             }
         }
-        h->mmco_reset = 0;
     } else {
-        av_log(h->avctx, AV_LOG_DEBUG, "no picture\n");
+        av_log(h->avctx, AV_LOG_DEBUG, "no picture %s\n", out_of_order ? "ooo" : "");
     }
 
     return 0;
@@ -1309,18 +1445,24 @@ static int h264_select_output_frame(H264Context *h)
  * or a second field in a pair and does the necessary setup.
  */
 static int h264_field_start(H264Context *h, const H264SliceContext *sl,
-                            const H2645NAL *nal)
+                            const H2645NAL *nal, int first_slice)
 {
+    int i;
     const SPS *sps;
 
     int last_pic_structure, last_pic_droppable, ret;
 
-    ret = h264_init_ps(h, sl);
+    ret = h264_init_ps(h, sl, first_slice);
     if (ret < 0)
         return ret;
 
     sps = h->ps.sps;
 
+    if (sps && sps->bitstream_restriction_flag &&
+        h->avctx->has_b_frames < sps->num_reorder_frames) {
+        h->avctx->has_b_frames = sps->num_reorder_frames;
+    }
+
     last_pic_droppable   = h->droppable;
     last_pic_structure   = h->picture_structure;
     h->droppable         = (nal->ref_idc == 0);
@@ -1354,17 +1496,23 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
      * Here, we're using that to see if we should mark previously
      * decode frames as "finished".
      * We have to do that before the "dummy" in-between frame allocation,
-     * since that can modify s->current_picture_ptr. */
+     * since that can modify h->cur_pic_ptr. */
     if (h->first_field) {
-        assert(h->cur_pic_ptr);
-        assert(h->cur_pic_ptr->f->buf[0]);
+        int last_field = last_pic_structure == PICT_BOTTOM_FIELD;
+        av_assert0(h->cur_pic_ptr);
+        av_assert0(h->cur_pic_ptr->f->buf[0]);
         assert(h->cur_pic_ptr->reference != DELAYED_PIC_REF);
 
+        /* Mark old field/frame as completed */
+        if (h->cur_pic_ptr->tf.owner[last_field] == h->avctx) {
+            ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, last_field);
+        }
+
         /* figure out if we have a complementary field pair */
         if (!FIELD_PICTURE(h) || h->picture_structure == last_pic_structure) {
             /* Previous field is unmatched. Don't display it, but let it
              * remain for reference if marked as such. */
-            if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
+            if (last_pic_structure != PICT_FRAME) {
                 ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                           last_pic_structure == PICT_TOP_FIELD);
             }
@@ -1374,7 +1522,7 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
                  * different frame_nums. Consider this field first in
                  * pair. Throw away previous field except for reference
                  * purposes. */
-                if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
+                if (last_pic_structure != PICT_FRAME) {
                     ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                               last_pic_structure == PICT_TOP_FIELD);
                 }
@@ -1401,12 +1549,15 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
         }
     }
 
-    while (h->poc.frame_num != h->poc.prev_frame_num &&
+    while (h->poc.frame_num != h->poc.prev_frame_num && !h->first_field &&
            h->poc.frame_num != (h->poc.prev_frame_num + 1) % (1 << sps->log2_max_frame_num)) {
         H264Picture *prev = h->short_ref_count ? h->short_ref[0] : NULL;
         av_log(h->avctx, AV_LOG_DEBUG, "Frame num gap %d %d\n",
                h->poc.frame_num, h->poc.prev_frame_num);
-        ret = initialize_cur_frame(h);
+        if (!sps->gaps_in_frame_num_allowed_flag)
+            for(i=0; i<FF_ARRAY_ELEMS(h->last_pocs); i++)
+                h->last_pocs[i] = INT_MIN;
+        ret = h264_frame_start(h);
         if (ret < 0) {
             h->first_field = 0;
             return ret;
@@ -1415,6 +1566,7 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
         h->poc.prev_frame_num++;
         h->poc.prev_frame_num        %= 1 << sps->log2_max_frame_num;
         h->cur_pic_ptr->frame_num = h->poc.prev_frame_num;
+        h->cur_pic_ptr->invalid_gap = !sps->gaps_in_frame_num_allowed_flag;
         ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 0);
         ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 1);
 
@@ -1442,8 +1594,8 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
                               (const uint8_t **)prev->f->data,
                               prev->f->linesize,
                               prev->f->format,
-                              h->mb_width  * 16,
-                              h->mb_height * 16);
+                              prev->f->width,
+                              prev->f->height);
                 h->short_ref[0]->poc = prev->poc + 2;
             }
             h->short_ref[0]->frame_num = h->poc.prev_frame_num;
@@ -1454,23 +1606,33 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
      * We're using that to see whether to continue decoding in that
      * frame, or to allocate a new one. */
     if (h->first_field) {
-        assert(h->cur_pic_ptr);
-        assert(h->cur_pic_ptr->f->buf[0]);
+        av_assert0(h->cur_pic_ptr);
+        av_assert0(h->cur_pic_ptr->f->buf[0]);
         assert(h->cur_pic_ptr->reference != DELAYED_PIC_REF);
 
         /* figure out if we have a complementary field pair */
         if (!FIELD_PICTURE(h) || h->picture_structure == last_pic_structure) {
             /* Previous field is unmatched. Don't display it, but let it
              * remain for reference if marked as such. */
+            h->missing_fields ++;
             h->cur_pic_ptr = NULL;
             h->first_field = FIELD_PICTURE(h);
         } else {
+            h->missing_fields = 0;
             if (h->cur_pic_ptr->frame_num != h->poc.frame_num) {
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                          h->picture_structure==PICT_BOTTOM_FIELD);
                 /* This and the previous field had different frame_nums.
                  * Consider this field first in pair. Throw away previous
                  * one except for reference purposes. */
                 h->first_field = 1;
                 h->cur_pic_ptr = NULL;
+            } else if (h->cur_pic_ptr->reference & DELAYED_PIC_REF) {
+                /* This frame was already output, we cannot draw into it
+                 * anymore.
+                 */
+                h->first_field = 1;
+                h->cur_pic_ptr = NULL;
             } else {
                 /* Second field in complementary pair */
                 h->first_field = 0;
@@ -1487,11 +1649,24 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
             return AVERROR_INVALIDDATA;
         }
     } else {
+        int field = h->picture_structure == PICT_BOTTOM_FIELD;
         release_unused_pictures(h, 0);
+        h->cur_pic_ptr->tf.owner[field] = h->avctx;
+    }
+    /* Some macroblocks can be accessed before they're available in case
+    * of lost slices, MBAFF or threading. */
+    if (FIELD_PICTURE(h)) {
+        for(i = (h->picture_structure == PICT_BOTTOM_FIELD); i<h->mb_height; i++)
+            memset(h->slice_table + i*h->mb_stride, -1, (h->mb_stride - (i+1==h->mb_height)) * sizeof(*h->slice_table));
+    } else {
+        memset(h->slice_table, -1,
+            (h->mb_height * h->mb_stride - 1) * sizeof(*h->slice_table));
     }
 
-    ff_h264_init_poc(h->cur_pic_ptr->field_poc, &h->cur_pic_ptr->poc,
+    ret = ff_h264_init_poc(h->cur_pic_ptr->field_poc, &h->cur_pic_ptr->poc,
                      h->ps.sps, &h->poc, h->picture_structure, nal->ref_idc);
+    if (ret < 0)
+        return ret;
 
     memcpy(h->mmco, sl->mmco, sl->nb_mmco * sizeof(*h->mmco));
     h->nb_mmco = sl->nb_mmco;
@@ -1499,15 +1674,25 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
 
     h->picture_idr = nal->type == H264_NAL_IDR_SLICE;
 
-    if (h->sei.recovery_point.recovery_frame_cnt >= 0 && h->recovery_frame < 0) {
-        h->recovery_frame = (h->poc.frame_num + h->sei.recovery_point.recovery_frame_cnt) &
-                            ((1 << h->ps.sps->log2_max_frame_num) - 1);
+    if (h->sei.recovery_point.recovery_frame_cnt >= 0) {
+        const int sei_recovery_frame_cnt = h->sei.recovery_point.recovery_frame_cnt;
+
+        if (h->poc.frame_num != sei_recovery_frame_cnt || sl->slice_type_nos != AV_PICTURE_TYPE_I)
+            h->valid_recovery_point = 1;
+
+        if (   h->recovery_frame < 0
+            || av_mod_uintp2(h->recovery_frame - h->poc.frame_num, h->ps.sps->log2_max_frame_num) > sei_recovery_frame_cnt) {
+            h->recovery_frame = av_mod_uintp2(h->poc.frame_num + sei_recovery_frame_cnt, h->ps.sps->log2_max_frame_num);
+
+            if (!h->valid_recovery_point)
+                h->recovery_frame = h->poc.frame_num;
+        }
     }
 
-    h->cur_pic_ptr->f->key_frame |= (nal->type == H264_NAL_IDR_SLICE) ||
-                                    (h->sei.recovery_point.recovery_frame_cnt >= 0);
+    h->cur_pic_ptr->f->key_frame |= (nal->type == H264_NAL_IDR_SLICE);
 
-    if (nal->type == H264_NAL_IDR_SLICE || h->recovery_frame == h->poc.frame_num) {
+    if (nal->type == H264_NAL_IDR_SLICE ||
+        (h->recovery_frame == h->poc.frame_num && nal->ref_idc)) {
         h->recovery_frame         = -1;
         h->cur_pic_ptr->recovered = 1;
     }
@@ -1515,12 +1700,16 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
     // "recovered".
     if (nal->type == H264_NAL_IDR_SLICE)
         h->frame_recovered |= FRAME_RECOVERED_IDR;
+#if 1
+    h->cur_pic_ptr->recovered |= h->frame_recovered;
+#else
     h->cur_pic_ptr->recovered |= !!(h->frame_recovered & FRAME_RECOVERED_IDR);
+#endif
 
     /* Set the frame properties/side data. Only done for the second field in
      * field coded frames, since some SEI information is present for each field
      * and is merged by the SEI parsing code. */
-    if (!FIELD_PICTURE(h) || !h->first_field) {
+    if (!FIELD_PICTURE(h) || !h->first_field || h->missing_fields > 1) {
         ret = h264_export_frame_props(h);
         if (ret < 0)
             return ret;
@@ -1530,29 +1719,28 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
             return ret;
     }
 
-    if (h->avctx->hwaccel) {
-        ret = h->avctx->hwaccel->start_frame(h->avctx, NULL, 0);
-        if (ret < 0)
-            return ret;
-    }
-
     return 0;
 }
 
-static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
-                                   const H264ParamSets *ps, AVCodecContext *avctx)
+static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl,
+                                   const H2645NAL *nal)
 {
     const SPS *sps;
     const PPS *pps;
     int ret;
     unsigned int slice_type, tmp, i;
-    int field_pic_flag, bottom_field_flag, picture_structure;
+    int field_pic_flag, bottom_field_flag;
+    int first_slice = sl == h->slice_ctx && !h->current_slice;
+    int picture_structure;
 
-    sl->first_mb_addr = get_ue_golomb(&sl->gb);
+    if (first_slice)
+        av_assert0(!h->setup_finished);
+
+    sl->first_mb_addr = get_ue_golomb_long(&sl->gb);
 
     slice_type = get_ue_golomb_31(&sl->gb);
     if (slice_type > 9) {
-        av_log(avctx, AV_LOG_ERROR,
+        av_log(h->avctx, AV_LOG_ERROR,
                "slice type %d too large at %d\n",
                slice_type, sl->first_mb_addr);
         return AVERROR_INVALIDDATA;
@@ -1569,37 +1757,48 @@ static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
 
     if (nal->type  == H264_NAL_IDR_SLICE &&
         sl->slice_type_nos != AV_PICTURE_TYPE_I) {
-        av_log(avctx, AV_LOG_ERROR, "A non-intra slice in an IDR NAL unit.\n");
+        av_log(h->avctx, AV_LOG_ERROR, "A non-intra slice in an IDR NAL unit.\n");
         return AVERROR_INVALIDDATA;
     }
 
     sl->pps_id = get_ue_golomb(&sl->gb);
     if (sl->pps_id >= MAX_PPS_COUNT) {
-        av_log(avctx, AV_LOG_ERROR, "pps_id %u out of range\n", sl->pps_id);
+        av_log(h->avctx, AV_LOG_ERROR, "pps_id %u out of range\n", sl->pps_id);
         return AVERROR_INVALIDDATA;
     }
-    if (!ps->pps_list[sl->pps_id]) {
-        av_log(avctx, AV_LOG_ERROR,
+    if (!h->ps.pps_list[sl->pps_id]) {
+        av_log(h->avctx, AV_LOG_ERROR,
                "non-existing PPS %u referenced\n",
                sl->pps_id);
         return AVERROR_INVALIDDATA;
     }
-    pps = (const PPS*)ps->pps_list[sl->pps_id]->data;
+    pps = (const PPS*)h->ps.pps_list[sl->pps_id]->data;
 
-    if (!ps->sps_list[pps->sps_id]) {
-        av_log(avctx, AV_LOG_ERROR,
+    if (!h->ps.sps_list[pps->sps_id]) {
+        av_log(h->avctx, AV_LOG_ERROR,
                "non-existing SPS %u referenced\n", pps->sps_id);
         return AVERROR_INVALIDDATA;
     }
-    sps = (const SPS*)ps->sps_list[pps->sps_id]->data;
+    sps = (const SPS*)h->ps.sps_list[pps->sps_id]->data;
 
     sl->frame_num = get_bits(&sl->gb, sps->log2_max_frame_num);
+    if (!first_slice) {
+        if (h->poc.frame_num != sl->frame_num) {
+            av_log(h->avctx, AV_LOG_ERROR, "Frame num change from %d to %d\n",
+                   h->poc.frame_num, sl->frame_num);
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
     sl->mb_mbaff       = 0;
 
     if (sps->frame_mbs_only_flag) {
         picture_structure = PICT_FRAME;
     } else {
+        if (!sps->direct_8x8_inference_flag && slice_type == AV_PICTURE_TYPE_B) {
+            av_log(h->avctx, AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n");
+            return -1;
+        }
         field_pic_flag = get_bits1(&sl->gb);
         if (field_pic_flag) {
             bottom_field_flag = get_bits1(&sl->gb);
@@ -1620,7 +1819,7 @@ static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
     }
 
     if (nal->type == H264_NAL_IDR_SLICE)
-        get_ue_golomb(&sl->gb); /* idr_pic_id */
+        get_ue_golomb_long(&sl->gb); /* idr_pic_id */
 
     if (sps->poc_type == 0) {
         sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb);
@@ -1645,12 +1844,12 @@ static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
 
     ret = ff_h264_parse_ref_count(&sl->list_count, sl->ref_count,
                                   &sl->gb, pps, sl->slice_type_nos,
-                                  picture_structure);
+                                  picture_structure, h->avctx);
     if (ret < 0)
         return ret;
 
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
-       ret = ff_h264_decode_ref_pic_list_reordering(sl, avctx);
+       ret = ff_h264_decode_ref_pic_list_reordering(sl, h->avctx);
        if (ret < 0) {
            sl->ref_count[1] = sl->ref_count[0] = 0;
            return ret;
@@ -1664,30 +1863,34 @@ static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
     }
     if ((pps->weighted_pred && sl->slice_type_nos == AV_PICTURE_TYPE_P) ||
         (pps->weighted_bipred_idc == 1 &&
-         sl->slice_type_nos == AV_PICTURE_TYPE_B))
-        ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count,
-                                  sl->slice_type_nos, &sl->pwt);
+         sl->slice_type_nos == AV_PICTURE_TYPE_B)) {
+        ret = ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count,
+                                  sl->slice_type_nos, &sl->pwt,
+                                  picture_structure, h->avctx);
+        if (ret < 0)
+            return ret;
+    }
 
     sl->explicit_ref_marking = 0;
     if (nal->ref_idc) {
-        ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, avctx);
-        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+        ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, h->avctx);
+        if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
             return AVERROR_INVALIDDATA;
     }
 
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) {
         tmp = get_ue_golomb_31(&sl->gb);
         if (tmp > 2) {
-            av_log(avctx, AV_LOG_ERROR, "cabac_init_idc %u overflow\n", tmp);
+            av_log(h->avctx, AV_LOG_ERROR, "cabac_init_idc %u overflow\n", tmp);
             return AVERROR_INVALIDDATA;
         }
         sl->cabac_init_idc = tmp;
     }
 
     sl->last_qscale_diff = 0;
-    tmp = pps->init_qp + get_se_golomb(&sl->gb);
+    tmp = pps->init_qp + (unsigned)get_se_golomb(&sl->gb);
     if (tmp > 51 + 6 * (sps->bit_depth_luma - 8)) {
-        av_log(avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
+        av_log(h->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
         return AVERROR_INVALIDDATA;
     }
     sl->qscale       = tmp;
@@ -1706,7 +1909,7 @@ static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
     if (pps->deblocking_filter_parameters_present) {
         tmp = get_ue_golomb_31(&sl->gb);
         if (tmp > 2) {
-            av_log(avctx, AV_LOG_ERROR,
+            av_log(h->avctx, AV_LOG_ERROR,
                    "deblocking_filter_idc %u out of range\n", tmp);
             return AVERROR_INVALIDDATA;
         }
@@ -1715,17 +1918,19 @@ static int h264_slice_header_parse(H264SliceContext *sl, const H2645NAL *nal,
             sl->deblocking_filter ^= 1;  // 1<->0
 
         if (sl->deblocking_filter) {
-            sl->slice_alpha_c0_offset = get_se_golomb(&sl->gb) * 2;
-            sl->slice_beta_offset     = get_se_golomb(&sl->gb) * 2;
-            if (sl->slice_alpha_c0_offset >  12 ||
-                sl->slice_alpha_c0_offset < -12 ||
-                sl->slice_beta_offset >  12     ||
-                sl->slice_beta_offset < -12) {
-                av_log(avctx, AV_LOG_ERROR,
+            int slice_alpha_c0_offset_div2 = get_se_golomb(&sl->gb);
+            int slice_beta_offset_div2     = get_se_golomb(&sl->gb);
+            if (slice_alpha_c0_offset_div2 >  6 ||
+                slice_alpha_c0_offset_div2 < -6 ||
+                slice_beta_offset_div2 >  6     ||
+                slice_beta_offset_div2 < -6) {
+                av_log(h->avctx, AV_LOG_ERROR,
                        "deblocking filter parameters %d %d out of range\n",
-                       sl->slice_alpha_c0_offset, sl->slice_beta_offset);
+                       slice_alpha_c0_offset_div2, slice_beta_offset_div2);
                 return AVERROR_INVALIDDATA;
             }
+            sl->slice_alpha_c0_offset = slice_alpha_c0_offset_div2 * 2;
+            sl->slice_beta_offset     = slice_beta_offset_div2 * 2;
         }
     }
 
@@ -1739,32 +1944,12 @@ static int h264_slice_init(H264Context *h, H264SliceContext *sl,
 {
     int i, j, ret = 0;
 
-    if (h->current_slice > 0) {
-        if (h->ps.pps != (const PPS*)h->ps.pps_list[sl->pps_id]->data) {
-            av_log(h->avctx, AV_LOG_ERROR, "PPS changed between slices\n");
-            return AVERROR_INVALIDDATA;
-        }
-
-        if (h->picture_structure != sl->picture_structure ||
-            h->droppable         != (nal->ref_idc == 0)) {
-            av_log(h->avctx, AV_LOG_ERROR,
-                   "Changing field mode (%d -> %d) between slices is not allowed\n",
-                   h->picture_structure, sl->picture_structure);
-            return AVERROR_INVALIDDATA;
-        } else if (!h->cur_pic_ptr) {
-            av_log(h->avctx, AV_LOG_ERROR,
-                   "unset cur_pic_ptr on slice %d\n",
-                   h->current_slice + 1);
-            return AVERROR_INVALIDDATA;
-        }
-    }
-
     if (h->picture_idr && nal->type != H264_NAL_IDR_SLICE) {
         av_log(h->avctx, AV_LOG_ERROR, "Invalid mix of IDR and non-IDR slices\n");
         return AVERROR_INVALIDDATA;
     }
 
-    assert(h->mb_num == h->mb_width * h->mb_height);
+    av_assert1(h->mb_num == h->mb_width * h->mb_height);
     if (sl->first_mb_addr << FIELD_OR_MBAFF_PICTURE(h) >= h->mb_num ||
         sl->first_mb_addr >= h->mb_num) {
         av_log(h->avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
@@ -1775,7 +1960,7 @@ static int h264_slice_init(H264Context *h, H264SliceContext *sl,
                                  FIELD_OR_MBAFF_PICTURE(h);
     if (h->picture_structure == PICT_BOTTOM_FIELD)
         sl->resync_mb_y = sl->mb_y = sl->mb_y + 1;
-    assert(sl->mb_y < h->mb_height);
+    av_assert1(sl->mb_y < h->mb_height);
 
     ret = ff_h264_build_ref_list(h, sl);
     if (ret < 0)
@@ -1792,10 +1977,13 @@ static int h264_slice_init(H264Context *h, H264SliceContext *sl,
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B && !sl->direct_spatial_mv_pred)
         ff_h264_direct_dist_scale_factor(h, sl);
-    ff_h264_direct_ref_list_init(h, sl);
+    if (!h->setup_finished)
+        ff_h264_direct_ref_list_init(h, sl);
 
     if (h->avctx->skip_loop_filter >= AVDISCARD_ALL ||
         (h->avctx->skip_loop_filter >= AVDISCARD_NONKEY &&
+         h->nal_unit_type != H264_NAL_IDR_SLICE) ||
+        (h->avctx->skip_loop_filter >= AVDISCARD_NONINTRA &&
          sl->slice_type_nos != AV_PICTURE_TYPE_I) ||
         (h->avctx->skip_loop_filter >= AVDISCARD_BIDIR  &&
          sl->slice_type_nos == AV_PICTURE_TYPE_B) ||
@@ -1820,9 +2008,14 @@ static int h264_slice_init(H264Context *h, H264SliceContext *sl,
                    6 * (h->ps.sps->bit_depth_luma - 8);
 
     sl->slice_num       = ++h->current_slice;
-    if (sl->slice_num >= MAX_SLICES) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "Too many slices, increase MAX_SLICES and recompile\n");
+
+    if (sl->slice_num)
+        h->slice_row[(sl->slice_num-1)&(MAX_SLICES-1)]= sl->resync_mb_y;
+    if (   h->slice_row[sl->slice_num&(MAX_SLICES-1)] + 3 >= sl->resync_mb_y
+        && h->slice_row[sl->slice_num&(MAX_SLICES-1)] <= sl->resync_mb_y
+        && sl->slice_num >= MAX_SLICES) {
+        //in case of ASO this check needs to be updated depending on how we decide to assign slice numbers in this case
+        av_log(h->avctx, AV_LOG_WARNING, "Possibly too many slices (%d >= %d), increase MAX_SLICES and recompile if there are artifacts\n", sl->slice_num, MAX_SLICES);
     }
 
     for (j = 0; j < 2; j++) {
@@ -1885,20 +2078,30 @@ static int h264_slice_init(H264Context *h, H264SliceContext *sl,
 int ff_h264_queue_decode_slice(H264Context *h, const H2645NAL *nal)
 {
     H264SliceContext *sl = h->slice_ctx + h->nb_slice_ctx_queued;
+    int first_slice = sl == h->slice_ctx && !h->current_slice;
     int ret;
 
     sl->gb = nal->gb;
 
-    ret = h264_slice_header_parse(sl, nal, &h->ps, h->avctx);
+    ret = h264_slice_header_parse(h, sl, nal);
     if (ret < 0)
         return ret;
 
     // discard redundant pictures
-    if (sl->redundant_pic_count > 0)
+    if (sl->redundant_pic_count > 0) {
+        sl->ref_count[0] = sl->ref_count[1] = 0;
         return 0;
+    }
 
-    if (!h->setup_finished) {
-        if (sl->first_mb_addr == 0) { // FIXME better field boundary detection
+    if (sl->first_mb_addr == 0 || !h->current_slice) {
+        if (h->setup_finished) {
+            av_log(h->avctx, AV_LOG_ERROR, "Too many fields\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (sl->first_mb_addr == 0) { // FIXME better field boundary detection
+        if (h->current_slice) {
             // this slice starts a new field
             // first decode any pending queued slices
             if (h->nb_slice_ctx_queued) {
@@ -1914,24 +2117,77 @@ int ff_h264_queue_decode_slice(H264Context *h, const H2645NAL *nal)
                 sl = h->slice_ctx;
             }
 
-            if (h->field_started)
-                ff_h264_field_end(h, sl, 1);
-
-            h->current_slice = 0;
-            if (!h->first_field) {
-                if (h->cur_pic_ptr && !h->droppable) {
-                    ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
-                                              h->picture_structure == PICT_BOTTOM_FIELD);
-                }
+            if (h->cur_pic_ptr && FIELD_PICTURE(h) && h->first_field) {
+                ret = ff_h264_field_end(h, h->slice_ctx, 1);
+                if (ret < 0)
+                    return ret;
+            } else if (h->cur_pic_ptr && !FIELD_PICTURE(h) && !h->first_field && h->nal_unit_type  == H264_NAL_IDR_SLICE) {
+                av_log(h, AV_LOG_WARNING, "Broken frame packetizing\n");
+                ret = ff_h264_field_end(h, h->slice_ctx, 1);
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 0);
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 1);
                 h->cur_pic_ptr = NULL;
+                if (ret < 0)
+                    return ret;
+            } else
+                return AVERROR_INVALIDDATA;
+        }
+
+        if (!h->first_field) {
+            if (h->cur_pic_ptr && !h->droppable) {
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                          h->picture_structure == PICT_BOTTOM_FIELD);
             }
+            h->cur_pic_ptr = NULL;
         }
+    }
 
-        if (h->current_slice == 0) {
-            ret = h264_field_start(h, sl, nal);
-            if (ret < 0)
-                return ret;
-            h->field_started = 1;
+    if (!h->current_slice)
+        av_assert0(sl == h->slice_ctx);
+
+    if (h->current_slice == 0 && !h->first_field) {
+        if (
+            (h->avctx->skip_frame >= AVDISCARD_NONREF && !h->nal_ref_idc) ||
+            (h->avctx->skip_frame >= AVDISCARD_BIDIR  && sl->slice_type_nos == AV_PICTURE_TYPE_B) ||
+            (h->avctx->skip_frame >= AVDISCARD_NONINTRA && sl->slice_type_nos != AV_PICTURE_TYPE_I) ||
+            (h->avctx->skip_frame >= AVDISCARD_NONKEY && h->nal_unit_type != H264_NAL_IDR_SLICE && h->sei.recovery_point.recovery_frame_cnt < 0) ||
+            h->avctx->skip_frame >= AVDISCARD_ALL) {
+            return 0;
+        }
+    }
+
+    if (!first_slice) {
+        const PPS *pps = (const PPS*)h->ps.pps_list[sl->pps_id]->data;
+
+        if (h->ps.pps->sps_id != pps->sps_id ||
+            h->ps.pps->transform_8x8_mode != pps->transform_8x8_mode /*||
+            (h->setup_finished && h->ps.pps != pps)*/) {
+            av_log(h->avctx, AV_LOG_ERROR, "PPS changed between slices\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (h->ps.sps != (const SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data) {
+            av_log(h->avctx, AV_LOG_ERROR,
+               "SPS changed in the middle of the frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (h->current_slice == 0) {
+        ret = h264_field_start(h, sl, nal, first_slice);
+        if (ret < 0)
+            return ret;
+    } else {
+        if (h->picture_structure != sl->picture_structure ||
+            h->droppable         != (nal->ref_idc == 0)) {
+            av_log(h->avctx, AV_LOG_ERROR,
+                   "Changing field mode (%d -> %d) between slices is not allowed\n",
+                   h->picture_structure, sl->picture_structure);
+            return AVERROR_INVALIDDATA;
+        } else if (!h->cur_pic_ptr) {
+            av_log(h->avctx, AV_LOG_ERROR,
+                   "unset cur_pic_ptr on slice %d\n",
+                   h->current_slice + 1);
+            return AVERROR_INVALIDDATA;
         }
     }
 
@@ -1939,14 +2195,7 @@ int ff_h264_queue_decode_slice(H264Context *h, const H2645NAL *nal)
     if (ret < 0)
         return ret;
 
-    if ((h->avctx->skip_frame < AVDISCARD_NONREF || nal->ref_idc) &&
-        (h->avctx->skip_frame < AVDISCARD_BIDIR  ||
-         sl->slice_type_nos != AV_PICTURE_TYPE_B) &&
-        (h->avctx->skip_frame < AVDISCARD_NONKEY ||
-         h->cur_pic_ptr->f->key_frame) &&
-        h->avctx->skip_frame < AVDISCARD_ALL) {
-        h->nb_slice_ctx_queued++;
-    }
+    h->nb_slice_ctx_queued++;
 
     return 0;
 }
@@ -2302,7 +2551,7 @@ static void decode_finish_row(const H264Context *h, H264SliceContext *sl)
 
     ff_h264_draw_horiz_band(h, sl, top, height);
 
-    if (h->droppable)
+    if (h->droppable || sl->h264->slice_ctx[0].er.error_occurred)
         return;
 
     ff_thread_report_progress(&h->cur_pic_ptr->tf, top + height - 1,
@@ -2313,15 +2562,14 @@ static void er_add_slice(H264SliceContext *sl,
                          int startx, int starty,
                          int endx, int endy, int status)
 {
-#if CONFIG_ERROR_RESILIENCE
-    ERContext *er = &sl->er;
-
     if (!sl->h264->enable_er)
         return;
 
-    er->ref_count = sl->ref_count[0];
-    ff_er_add_slice(er, startx, starty, endx, endy, status);
-#endif
+    if (CONFIG_ERROR_RESILIENCE) {
+        ERContext *er = &sl->h264->slice_ctx[0].er;
+
+        ff_er_add_slice(er, startx, starty, endx, endy, status);
+    }
 }
 
 static int decode_slice(struct AVCodecContext *avctx, void *arg)
@@ -2341,30 +2589,45 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
 
     sl->mb_skip_run = -1;
 
+    av_assert0(h->block_offset[15] == (4 * ((scan8[15] - scan8[0]) & 7) << h->pixel_shift) + 4 * sl->linesize * ((scan8[15] - scan8[0]) >> 3));
+
     if (h->postpone_filter)
         sl->deblocking_filter = 0;
 
     sl->is_complex = FRAME_MBAFF(h) || h->picture_structure != PICT_FRAME ||
                      (CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
 
+    if (!(h->avctx->active_thread_type & FF_THREAD_SLICE) && h->picture_structure == PICT_FRAME && h->slice_ctx[0].er.error_status_table) {
+        const int start_i  = av_clip(sl->resync_mb_x + sl->resync_mb_y * h->mb_width, 0, h->mb_num - 1);
+        if (start_i) {
+            int prev_status = h->slice_ctx[0].er.error_status_table[h->slice_ctx[0].er.mb_index2xy[start_i - 1]];
+            prev_status &= ~ VP_START;
+            if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END))
+                h->slice_ctx[0].er.error_occurred = 1;
+        }
+    }
+
     if (h->ps.pps->cabac) {
         /* realign */
         align_get_bits(&sl->gb);
 
         /* init cabac */
-        ff_init_cabac_decoder(&sl->cabac,
+        ret = ff_init_cabac_decoder(&sl->cabac,
                               sl->gb.buffer + get_bits_count(&sl->gb) / 8,
                               (get_bits_left(&sl->gb) + 7) / 8);
+        if (ret < 0)
+            return ret;
 
         ff_h264_init_cabac_states(h, sl);
 
         for (;;) {
             // START_TIMER
             int ret, eos;
-
             if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
                 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n",
                        sl->next_slice_idx);
+                er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
+                             sl->mb_y, ER_MB_ERROR);
                 return AVERROR_INVALIDDATA;
             }
 
@@ -2394,9 +2657,11 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                     loop_filter(h, sl, lf_x_start, sl->mb_x + 1);
                 goto finish;
             }
-            if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 2) {
+            if (sl->cabac.bytestream > sl->cabac.bytestream_end + 2 )
+                av_log(h->avctx, AV_LOG_DEBUG, "bytestream overread %"PTRDIFF_SPECIFIER"\n", sl->cabac.bytestream_end - sl->cabac.bytestream);
+            if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 4) {
                 av_log(h->avctx, AV_LOG_ERROR,
-                       "error while decoding MB %d %d, bytestream %td\n",
+                       "error while decoding MB %d %d, bytestream %"PTRDIFF_SPECIFIER"\n",
                        sl->mb_x, sl->mb_y,
                        sl->cabac.bytestream_end - sl->cabac.bytestream);
                 er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
@@ -2433,6 +2698,8 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
             if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
                 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n",
                        sl->next_slice_idx);
+                er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
+                             sl->mb_y, ER_MB_ERROR);
                 return AVERROR_INVALIDDATA;
             }
 
@@ -2473,14 +2740,15 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                     ff_tlog(h->avctx, "slice end %d %d\n",
                             get_bits_count(&sl->gb), sl->gb.size_in_bits);
 
-                    if (get_bits_left(&sl->gb) == 0) {
+                    if (   get_bits_left(&sl->gb) == 0
+                        || get_bits_left(&sl->gb) > 0 && !(h->avctx->err_recognition & AV_EF_AGGRESSIVE)) {
                         er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
                                      sl->mb_x - 1, sl->mb_y, ER_MB_END);
 
                         goto finish;
                     } else {
                         er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
-                                     sl->mb_x - 1, sl->mb_y, ER_MB_END);
+                                     sl->mb_x, sl->mb_y, ER_MB_END);
 
                         return AVERROR_INVALIDDATA;
                     }
@@ -2526,8 +2794,13 @@ int ff_h264_execute_decode_slices(H264Context *h)
     int ret = 0;
     int i, j;
 
+    h->slice_ctx[0].next_slice_idx = INT_MAX;
+
     if (h->avctx->hwaccel || context_count < 1)
         return 0;
+
+    av_assert0(context_count && h->slice_ctx[context_count - 1].mb_y < h->mb_height);
+
     if (context_count == 1) {
 
         h->slice_ctx[0].next_slice_idx = h->mb_width * h->mb_height;
@@ -2538,12 +2811,15 @@ int ff_h264_execute_decode_slices(H264Context *h)
         if (ret < 0)
             goto finish;
     } else {
+        av_assert0(context_count > 0);
         for (i = 0; i < context_count; i++) {
             int next_slice_idx = h->mb_width * h->mb_height;
             int slice_idx;
 
             sl                 = &h->slice_ctx[i];
-            sl->er.error_count = 0;
+            if (CONFIG_ERROR_RESILIENCE) {
+                sl->er.error_count = 0;
+            }
 
             /* make sure none of those slices overlap */
             slice_idx = sl->mb_y * h->mb_width + sl->mb_x;
@@ -2564,8 +2840,10 @@ int ff_h264_execute_decode_slices(H264Context *h)
         /* pull back stuff from slices to master context */
         sl                   = &h->slice_ctx[context_count - 1];
         h->mb_y              = sl->mb_y;
-        for (i = 1; i < context_count; i++)
-            h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
+        if (CONFIG_ERROR_RESILIENCE) {
+            for (i = 1; i < context_count; i++)
+                h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
+        }
 
         if (h->postpone_filter) {
             h->postpone_filter = 0;
diff --git a/libavcodec/h264addpx_template.c b/libavcodec/h264addpx_template.c
index e3adfe2..9a1e6a2 100644
--- a/libavcodec/h264addpx_template.c
+++ b/libavcodec/h264addpx_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,10 +35,10 @@ static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride)
     stride /= sizeof(pixel);
 
     for (i = 0; i < 4; i++) {
-        dst[0] += src[0];
-        dst[1] += src[1];
-        dst[2] += src[2];
-        dst[3] += src[3];
+        dst[0] += (unsigned)src[0];
+        dst[1] += (unsigned)src[1];
+        dst[2] += (unsigned)src[2];
+        dst[3] += (unsigned)src[3];
 
         dst += stride;
         src += 4;
@@ -55,14 +55,14 @@ static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride)
     stride /= sizeof(pixel);
 
     for (i = 0; i < 8; i++) {
-        dst[0] += src[0];
-        dst[1] += src[1];
-        dst[2] += src[2];
-        dst[3] += src[3];
-        dst[4] += src[4];
-        dst[5] += src[5];
-        dst[6] += src[6];
-        dst[7] += src[7];
+        dst[0] += (unsigned)src[0];
+        dst[1] += (unsigned)src[1];
+        dst[2] += (unsigned)src[2];
+        dst[3] += (unsigned)src[3];
+        dst[4] += (unsigned)src[4];
+        dst[5] += (unsigned)src[5];
+        dst[6] += (unsigned)src[6];
+        dst[7] += (unsigned)src[7];
 
         dst += stride;
         src += 8;
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index d5146de..c2f1f30 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,9 +32,11 @@
     c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_ ## depth ## _c; \
+    c->put_h264_chroma_pixels_tab[3] = put_h264_chroma_mc1_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_ ## depth ## _c; \
+    c->avg_h264_chroma_pixels_tab[3] = avg_h264_chroma_mc1_ ## depth ## _c; \
 
 av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
 {
@@ -52,4 +54,6 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
         ff_h264chroma_init_ppc(c, bit_depth);
     if (ARCH_X86)
         ff_h264chroma_init_x86(c, bit_depth);
+    if (ARCH_MIPS)
+        ff_h264chroma_init_mips(c, bit_depth);
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index 9fc2a0f..5c89fd1 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,8 @@
 typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ptrdiff_t srcStride, int h, int x, int y);
 
 typedef struct H264ChromaContext {
-    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+    h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
+    h264_chroma_mc_func avg_h264_chroma_pixels_tab[4];
 } H264ChromaContext;
 
 void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
@@ -35,5 +35,6 @@ void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/h264chroma_template.c b/libavcodec/h264chroma_template.c
index ed364dd..a3ca07b 100644
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@ -2,29 +2,64 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
+
 #include <stddef.h>
 
+#include "libavutil/avassert.h"
 #include "bit_depth_template.c"
 
 #define H264_CHROMA_MC(OPNAME, OP)\
+static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y){\
+    pixel *dst = (pixel*)_dst;\
+    pixel *src = (pixel*)_src;\
+    const int A=(8-x)*(8-y);\
+    const int B=(  x)*(8-y);\
+    const int C=(8-x)*(  y);\
+    const int D=(  x)*(  y);\
+    int i;\
+    stride >>= sizeof(pixel)-1;\
+    \
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
+\
+    if(D){\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    } else if (B + C) {\
+        const int E= B+C;\
+        const int step= C ? stride : 1;\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    } else {\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }\
+}\
 static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
 {\
     pixel *dst = (pixel*)_dst;\
@@ -34,9 +69,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, uint8_t
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
@@ -73,9 +108,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst /*align 8*/, uint8_t
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
@@ -118,9 +153,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst /*align 8*/, uint8_t
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
diff --git a/libavcodec/h264data.c b/libavcodec/h264data.c
index a2a4a47..a4c6d93 100644
--- a/libavcodec/h264data.c
+++ b/libavcodec/h264data.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -174,13 +174,17 @@ const uint8_t ff_h264_dequant8_coeff_init[6][6] = {
 const uint8_t ff_h264_quant_rem6[QP_MAX_NUM + 1] = {
     0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
     3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+    3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+    0, 1, 2, 3,
 };
 
 const uint8_t ff_h264_quant_div6[QP_MAX_NUM + 1] = {
     0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
     3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
-    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10,
+   10,10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13, 13, 13, 13,
+   14,14,14,14,
 };
 
 #define QP(qP, depth) ((qP) + 6 * ((depth) - 8))
@@ -196,11 +200,33 @@ const uint8_t ff_h264_quant_div6[QP_MAX_NUM + 1] = {
     QP(37, d), QP(37, d), QP(37, d), QP(38, d), QP(38, d), QP(38, d),   \
     QP(39, d), QP(39, d), QP(39, d), QP(39, d)
 
-const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM + 1] = {
+const uint8_t ff_h264_chroma_qp[7][QP_MAX_NUM + 1] = {
     { CHROMA_QP_TABLE_END(8) },
     { 0, 1, 2, 3, 4, 5,
       CHROMA_QP_TABLE_END(9) },
-    { 0, 1, 2, 3, 4, 5,
+    { 0, 1, 2, 3,  4,  5,
       6, 7, 8, 9, 10, 11,
       CHROMA_QP_TABLE_END(10) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      CHROMA_QP_TABLE_END(11) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      CHROMA_QP_TABLE_END(12) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      24,25,26,27, 28, 29,
+      CHROMA_QP_TABLE_END(13) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      24,25,26,27, 28, 29,
+      30,31,32,33, 34, 35,
+      CHROMA_QP_TABLE_END(14) },
 };
diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h
index f1284e6..2968b08 100644
--- a/libavcodec/h264data.h
+++ b/libavcodec/h264data.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,12 +48,32 @@ extern const PMbInfo ff_h264_p_sub_mb_type_info[4];
 extern const PMbInfo ff_h264_b_mb_type_info[23];
 extern const PMbInfo ff_h264_b_sub_mb_type_info[13];
 
+static const AVRational ff_h264_pixel_aspect[17] = {
+    {   0,  1 },
+    {   1,  1 },
+    {  12, 11 },
+    {  10, 11 },
+    {  16, 11 },
+    {  40, 33 },
+    {  24, 11 },
+    {  20, 11 },
+    {  32, 11 },
+    {  80, 33 },
+    {  18, 11 },
+    {  15, 11 },
+    {  64, 33 },
+    { 160, 99 },
+    {   4,  3 },
+    {   3,  2 },
+    {   2,  1 },
+};
+
 extern const uint8_t ff_h264_dequant4_coeff_init[6][3];
 extern const uint8_t ff_h264_dequant8_coeff_init_scan[16];
 extern const uint8_t ff_h264_dequant8_coeff_init[6][6];
 extern const uint8_t ff_h264_quant_rem6[QP_MAX_NUM + 1];
 extern const uint8_t ff_h264_quant_div6[QP_MAX_NUM + 1];
 
-extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM + 1];
+extern const uint8_t ff_h264_chroma_qp[7][QP_MAX_NUM + 1];
 
 #endif /* AVCODEC_H264DATA_H */
diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c
index 4bfd789..837c3b7 100644
--- a/libavcodec/h264dec.c
+++ b/libavcodec/h264dec.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,9 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
+#include "libavutil/avassert.h"
 #include "libavutil/display.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/opt.h"
@@ -36,7 +39,6 @@
 #include "cabac_functions.h"
 #include "error_resilience.h"
 #include "avcodec.h"
-#include "golomb_legacy.h"
 #include "h264.h"
 #include "h264dec.h"
 #include "h2645_parse.h"
@@ -44,6 +46,7 @@
 #include "h264chroma.h"
 #include "h264_mvpred.h"
 #include "h264_ps.h"
+#include "golomb.h"
 #include "hwaccel.h"
 #include "mathops.h"
 #include "me_cmp.h"
@@ -52,10 +55,14 @@
 #include "rectangle.h"
 #include "thread.h"
 
-#include <assert.h>
-
 const uint16_t ff_h264_mb_sizes[4] = { 256, 384, 512, 768 };
 
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx)
+{
+    H264Context *h = avctx->priv_data;
+    return h && h->ps.sps ? h->ps.sps->num_reorder_frames : 0;
+}
+
 static void h264_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
                               int (*mv)[2][4][2],
                               int mb_x, int mb_y, int mb_intra, int mb_skipped)
@@ -67,19 +74,28 @@ static void h264_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
     sl->mb_y = mb_y;
     sl->mb_xy = mb_x + mb_y * h->mb_stride;
     memset(sl->non_zero_count_cache, 0, sizeof(sl->non_zero_count_cache));
-    assert(ref >= 0);
+    av_assert1(ref >= 0);
     /* FIXME: It is possible albeit uncommon that slice references
      * differ between slices. We take the easy approach and ignore
      * it for now. If this turns out to have any relevance in
      * practice then correct remapping should be added. */
     if (ref >= sl->ref_count[0])
         ref = 0;
+    if (!sl->ref_list[0][ref].data[0]) {
+        av_log(h->avctx, AV_LOG_DEBUG, "Reference not available for error concealing\n");
+        ref = 0;
+    }
+    if ((sl->ref_list[0][ref].reference&3) != 3) {
+        av_log(h->avctx, AV_LOG_DEBUG, "Reference invalid\n");
+        return;
+    }
     fill_rectangle(&h->cur_pic.ref_index[0][4 * sl->mb_xy],
                    2, 2, 2, ref, 1);
     fill_rectangle(&sl->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
     fill_rectangle(sl->mv_cache[0][scan8[0]], 4, 4, 8,
                    pack16to32((*mv)[0][0][0], (*mv)[0][0][1]), 4);
-    assert(!FRAME_MBAFF(h));
+    sl->mb_mbaff =
+    sl->mb_field_decoding_flag = 0;
     ff_h264_hl_decode_mb(h, &h->slice_ctx[0]);
 }
 
@@ -164,11 +180,11 @@ void ff_h264_free_tables(H264Context *h)
 int ff_h264_alloc_tables(H264Context *h)
 {
     const int big_mb_num = h->mb_stride * (h->mb_height + 1);
-    const int row_mb_num = h->mb_stride * 2 * h->nb_slice_ctx;
+    const int row_mb_num = 2*h->mb_stride*FFMAX(h->nb_slice_ctx, 1);
     int x, y;
 
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->intra4x4_pred_mode,
-                      row_mb_num * 8 * sizeof(uint8_t), fail)
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->intra4x4_pred_mode,
+                      row_mb_num, 8 * sizeof(uint8_t), fail)
     h->slice_ctx[0].intra4x4_pred_mode = h->intra4x4_pred_mode;
 
     FF_ALLOCZ_OR_GOTO(h->avctx, h->non_zero_count,
@@ -179,10 +195,10 @@ int ff_h264_alloc_tables(H264Context *h)
                       big_mb_num * sizeof(uint16_t), fail)
     FF_ALLOCZ_OR_GOTO(h->avctx, h->chroma_pred_mode_table,
                       big_mb_num * sizeof(uint8_t), fail)
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->mvd_table[0],
-                      16 * row_mb_num * sizeof(uint8_t), fail);
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->mvd_table[1],
-                      16 * row_mb_num * sizeof(uint8_t), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->mvd_table[0],
+                      row_mb_num, 16 * sizeof(uint8_t), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->mvd_table[1],
+                      row_mb_num, 16 * sizeof(uint8_t), fail);
     h->slice_ctx[0].mvd_table[0] = h->mvd_table[0];
     h->slice_ctx[0].mvd_table[1] = h->mvd_table[1];
 
@@ -235,7 +251,11 @@ int ff_h264_slice_context_init(H264Context *h, H264SliceContext *sl)
     sl->ref_cache[1][scan8[7]  + 1] =
     sl->ref_cache[1][scan8[13] + 1] = PART_NOT_AVAILABLE;
 
+    if (sl != h->slice_ctx) {
+        memset(er, 0, sizeof(*er));
+    } else
     if (CONFIG_ERROR_RESILIENCE) {
+
         /* init ER */
         er->avctx          = h->avctx;
         er->decode_mb      = h264_er_decode_mb;
@@ -263,7 +283,7 @@ int ff_h264_slice_context_init(H264Context *h, H264SliceContext *sl)
                           mb_array_size * sizeof(uint8_t), fail);
 
         FF_ALLOC_OR_GOTO(h->avctx, er->er_temp_buffer,
-                         h->mb_height * h->mb_stride, fail);
+                         h->mb_height * h->mb_stride * (4*sizeof(int) + 1), fail);
 
         FF_ALLOCZ_OR_GOTO(h->avctx, sl->dc_val_base,
                           yc_size * sizeof(int16_t), fail);
@@ -285,6 +305,7 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     int i;
 
     h->avctx                 = avctx;
+    h->cur_chroma_format_idc = -1;
 
     h->width_from_caller     = avctx->width;
     h->height_from_caller    = avctx->height;
@@ -294,8 +315,10 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     h->flags                 = avctx->flags;
     h->poc.prev_poc_msb      = 1 << 16;
     h->recovery_frame        = -1;
-    h->x264_build            = -1;
     h->frame_recovered       = 0;
+    h->poc.prev_frame_num    = -1;
+    h->sei.frame_packing.arrangement_cancel_flag = -1;
+    h->sei.unregistered.x264_build = -1;
 
     h->next_outputed_poc = INT_MIN;
     for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
@@ -322,8 +345,8 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     if (!h->cur_pic.f)
         return AVERROR(ENOMEM);
 
-    h->output_frame = av_frame_alloc();
-    if (!h->output_frame)
+    h->last_pic_for_ec.f = av_frame_alloc();
+    if (!h->last_pic_for_ec.f)
         return AVERROR(ENOMEM);
 
     for (i = 0; i < h->nb_slice_ctx; i++)
@@ -337,29 +360,29 @@ static av_cold int h264_decode_end(AVCodecContext *avctx)
     H264Context *h = avctx->priv_data;
     int i;
 
+    ff_h264_remove_all_refs(h);
     ff_h264_free_tables(h);
 
     for (i = 0; i < H264_MAX_PICTURE_COUNT; i++) {
         ff_h264_unref_picture(h, &h->DPB[i]);
         av_frame_free(&h->DPB[i].f);
     }
+    memset(h->delayed_pic, 0, sizeof(h->delayed_pic));
 
     h->cur_pic_ptr = NULL;
 
     av_freep(&h->slice_ctx);
     h->nb_slice_ctx = 0;
 
-    for (i = 0; i < MAX_SPS_COUNT; i++)
-        av_buffer_unref(&h->ps.sps_list[i]);
-
-    for (i = 0; i < MAX_PPS_COUNT; i++)
-        av_buffer_unref(&h->ps.pps_list[i]);
+    ff_h264_sei_uninit(&h->sei);
+    ff_h264_ps_uninit(&h->ps);
 
     ff_h2645_packet_uninit(&h->pkt);
 
     ff_h264_unref_picture(h, &h->cur_pic);
     av_frame_free(&h->cur_pic.f);
-    av_frame_free(&h->output_frame);
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+    av_frame_free(&h->last_pic_for_ec.f);
 
     return 0;
 }
@@ -381,18 +404,22 @@ static av_cold int h264_decode_init(AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
-    if (avctx->ticks_per_frame == 1)
-        h->avctx->framerate.num *= 2;
+    if (avctx->ticks_per_frame == 1) {
+        if(h->avctx->time_base.den < INT_MAX/2) {
+            h->avctx->time_base.den *= 2;
+        } else
+            h->avctx->time_base.num /= 2;
+    }
     avctx->ticks_per_frame = 2;
 
     if (avctx->extradata_size > 0 && avctx->extradata) {
-       ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
-                                      &h->ps, &h->is_avc, &h->nal_length_size,
-                                      avctx->err_recognition, avctx);
-       if (ret < 0) {
-           h264_decode_end(avctx);
-           return ret;
-       }
+        ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
+                                       &h->ps, &h->is_avc, &h->nal_length_size,
+                                       avctx->err_recognition, avctx);
+        if (ret < 0) {
+            h264_decode_end(avctx);
+            return ret;
+        }
     }
 
     if (h->ps.sps && h->ps.sps->bitstream_restriction_flag &&
@@ -402,15 +429,21 @@ static av_cold int h264_decode_init(AVCodecContext *avctx)
 
     avctx->internal->allocate_progress = 1;
 
-    if (h->enable_er) {
+    ff_h264_flush_change(h);
+
+    if (h->enable_er < 0 && (avctx->active_thread_type & FF_THREAD_SLICE))
+        h->enable_er = 0;
+
+    if (h->enable_er && (avctx->active_thread_type & FF_THREAD_SLICE)) {
         av_log(avctx, AV_LOG_WARNING,
-               "Error resilience is enabled. It is unsafe and unsupported and may crash. "
+               "Error resilience with slice threads is enabled. It is unsafe and unsupported and may crash. "
                "Use it at your own risk\n");
     }
 
     return 0;
 }
 
+#if HAVE_THREADS
 static int decode_init_thread_copy(AVCodecContext *avctx)
 {
     H264Context *h = avctx->priv_data;
@@ -429,33 +462,47 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 /**
  * instantaneous decoder refresh.
  */
 static void idr(H264Context *h)
 {
+    int i;
     ff_h264_remove_all_refs(h);
     h->poc.prev_frame_num        =
-    h->poc.prev_frame_num_offset =
-    h->poc.prev_poc_msb          =
+    h->poc.prev_frame_num_offset = 0;
+    h->poc.prev_poc_msb          = 1<<16;
     h->poc.prev_poc_lsb          = 0;
+    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
+        h->last_pocs[i] = INT_MIN;
 }
 
 /* forget old pics after a seek */
 void ff_h264_flush_change(H264Context *h)
 {
-    int i;
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
-        h->last_pocs[i] = INT_MIN;
+    int i, j;
+
     h->next_outputed_poc = INT_MIN;
     h->prev_interlaced_frame = 1;
     idr(h);
-    if (h->cur_pic_ptr)
+
+    h->poc.prev_frame_num = -1;
+    if (h->cur_pic_ptr) {
         h->cur_pic_ptr->reference = 0;
+        for (j=i=0; h->delayed_pic[i]; i++)
+            if (h->delayed_pic[i] != h->cur_pic_ptr)
+                h->delayed_pic[j++] = h->delayed_pic[i];
+        h->delayed_pic[j] = NULL;
+    }
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+
     h->first_field = 0;
     h->recovery_frame = -1;
     h->frame_recovered = 0;
+    h->current_slice = 0;
+    h->mmco_reset = 1;
 }
 
 /* forget old pics after a seek */
@@ -483,6 +530,7 @@ static void flush_dpb(AVCodecContext *avctx)
 static int get_last_needed_nal(H264Context *h)
 {
     int nals_needed = 0;
+    int first_slice = 0;
     int i, ret;
 
     for (i = 0; i < h->pkt.nb_nals; i++) {
@@ -509,55 +557,83 @@ static int get_last_needed_nal(H264Context *h)
 
                 break;
             }
-            if (!get_ue_golomb(&gb))
+            if (!get_ue_golomb_long(&gb) ||  // first_mb_in_slice
+                !first_slice ||
+                first_slice != nal->type)
                 nals_needed = i;
+            if (!first_slice)
+                first_slice = nal->type;
         }
     }
 
     return nals_needed;
 }
 
+static void debug_green_metadata(const H264SEIGreenMetaData *gm, void *logctx)
+{
+    av_log(logctx, AV_LOG_DEBUG, "Green Metadata Info SEI message\n");
+    av_log(logctx, AV_LOG_DEBUG, "  green_metadata_type: %d\n", gm->green_metadata_type);
+
+    if (gm->green_metadata_type == 0) {
+        av_log(logctx, AV_LOG_DEBUG, "  green_metadata_period_type: %d\n", gm->period_type);
+
+        if (gm->period_type == 2)
+            av_log(logctx, AV_LOG_DEBUG, "  green_metadata_num_seconds: %d\n", gm->num_seconds);
+        else if (gm->period_type == 3)
+            av_log(logctx, AV_LOG_DEBUG, "  green_metadata_num_pictures: %d\n", gm->num_pictures);
+
+        av_log(logctx, AV_LOG_DEBUG, "  SEI GREEN Complexity Metrics: %f %f %f %f\n",
+               (float)gm->percent_non_zero_macroblocks/255,
+               (float)gm->percent_intra_coded_macroblocks/255,
+               (float)gm->percent_six_tap_filtering/255,
+               (float)gm->percent_alpha_point_deblocking_instance/255);
+
+    } else if (gm->green_metadata_type == 1) {
+        av_log(logctx, AV_LOG_DEBUG, "  xsd_metric_type: %d\n", gm->xsd_metric_type);
+
+        if (gm->xsd_metric_type == 0)
+            av_log(logctx, AV_LOG_DEBUG, "  xsd_metric_value: %f\n",
+                   (float)gm->xsd_metric_value/100);
+    }
+}
+
 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
 {
     AVCodecContext *const avctx = h->avctx;
     int nals_needed = 0; ///< number of NALs that need decoding before the next frame thread starts
+    int idr_cleared=0;
     int i, ret = 0;
 
+    h->has_slice = 0;
+    h->nal_unit_type= 0;
+
     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS)) {
         h->current_slice = 0;
-        h->field_started = 0;
-        if (!h->first_field)
+        if (!h->first_field) {
             h->cur_pic_ptr = NULL;
-        ff_h264_sei_uninit(&h->sei);
+            ff_h264_sei_uninit(&h->sei);
+        }
     }
 
-    ret = ff_h2645_packet_split(&h->pkt, buf, buf_size, avctx, h->is_avc,
-                                h->nal_length_size, avctx->codec_id);
+    if (h->nal_length_size == 4) {
+        if (buf_size > 8 && AV_RB32(buf) == 1 && AV_RB32(buf+5) > (unsigned)buf_size) {
+            h->is_avc = 0;
+        }else if(buf_size > 3 && AV_RB32(buf) > 1 && AV_RB32(buf) <= (unsigned)buf_size)
+            h->is_avc = 1;
+    }
+
+    ret = ff_h2645_packet_split(&h->pkt, buf, buf_size, avctx, h->is_avc, h->nal_length_size,
+                                avctx->codec_id, avctx->flags2 & AV_CODEC_FLAG2_FAST, 0);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR,
                "Error splitting the input into NAL units.\n");
-
-        /* There are samples in the wild with mp4-style extradata, but Annex B
-         * data in the packets. If we fail parsing the packet as mp4, try it again
-         * as Annex B. */
-        if (h->is_avc && !(avctx->err_recognition & AV_EF_EXPLODE)) {
-            int err = ff_h2645_packet_split(&h->pkt, buf, buf_size, avctx, 0, 0,
-                                            avctx->codec_id);
-            if (err >= 0) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "The stream seems to contain AVCC extradata with Annex B "
-                       "formatted data, which is invalid.");
-                h->is_avc = 0;
-                ret       = 0;
-            }
-        }
-
-        if (ret < 0)
-            return ret;
+        return ret;
     }
 
     if (avctx->active_thread_type & FF_THREAD_FRAME)
         nals_needed = get_last_needed_nal(h);
+    if (nals_needed < 0)
+        return nals_needed;
 
     for (i = 0; i < h->pkt.nb_nals; i++) {
         H2645NAL *nal = &h->pkt.nals[i];
@@ -574,20 +650,41 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
         err = 0;
         switch (nal->type) {
         case H264_NAL_IDR_SLICE:
-            idr(h); // FIXME ensure we don't lose some frames if there is reordering
+            if ((nal->data[1] & 0xFC) == 0x98) {
+                av_log(h->avctx, AV_LOG_ERROR, "Invalid inter IDR frame\n");
+                h->next_outputed_poc = INT_MIN;
+                ret = -1;
+                goto end;
+            }
+            if(!idr_cleared) {
+                idr(h); // FIXME ensure we don't lose some frames if there is reordering
+            }
+            idr_cleared = 1;
+            h->has_recovery_point = 1;
         case H264_NAL_SLICE:
-            if ((err = ff_h264_queue_decode_slice(h, nal)))
+            h->has_slice = 1;
+
+            if ((err = ff_h264_queue_decode_slice(h, nal))) {
+                H264SliceContext *sl = h->slice_ctx + h->nb_slice_ctx_queued;
+                sl->ref_count[0] = sl->ref_count[1] = 0;
                 break;
+            }
 
-            if (avctx->active_thread_type & FF_THREAD_FRAME &&
-                i >= nals_needed && !h->setup_finished && h->cur_pic_ptr) {
-                ff_thread_finish_setup(avctx);
-                h->setup_finished = 1;
+            if (h->current_slice == 1) {
+                if (avctx->active_thread_type & FF_THREAD_FRAME &&
+                    i >= nals_needed && !h->setup_finished && h->cur_pic_ptr) {
+                    ff_thread_finish_setup(avctx);
+                    h->setup_finished = 1;
+                }
+
+                if (h->avctx->hwaccel &&
+                    (ret = h->avctx->hwaccel->start_frame(h->avctx, buf, buf_size)) < 0)
+                    goto end;
             }
 
             max_slice_ctx = avctx->hwaccel ? 1 : h->nb_slice_ctx;
             if (h->nb_slice_ctx_queued == max_slice_ctx) {
-                if (avctx->hwaccel) {
+                if (h->avctx->hwaccel) {
                     ret = avctx->hwaccel->decode_slice(avctx, nal->raw_data, nal->raw_size);
                     h->nb_slice_ctx_queued = 0;
                 } else
@@ -600,20 +697,44 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
         case H264_NAL_DPB:
         case H264_NAL_DPC:
             avpriv_request_sample(avctx, "data partitioning");
-            ret = AVERROR(ENOSYS);
-            goto end;
             break;
         case H264_NAL_SEI:
             ret = ff_h264_sei_decode(&h->sei, &nal->gb, &h->ps, avctx);
+            h->has_recovery_point = h->has_recovery_point || h->sei.recovery_point.recovery_frame_cnt != -1;
+            if (avctx->debug & FF_DEBUG_GREEN_MD)
+                debug_green_metadata(&h->sei.green_metadata, h->avctx);
             if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
                 goto end;
             break;
-        case H264_NAL_SPS:
-            ret = ff_h264_decode_seq_parameter_set(&nal->gb, avctx, &h->ps);
-            if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
-                goto end;
+        case H264_NAL_SPS: {
+            GetBitContext tmp_gb = nal->gb;
+            if (avctx->hwaccel && avctx->hwaccel->decode_params) {
+                ret = avctx->hwaccel->decode_params(avctx,
+                                                    nal->type,
+                                                    nal->raw_data,
+                                                    nal->raw_size);
+                if (ret < 0)
+                    goto end;
+            }
+            if (ff_h264_decode_seq_parameter_set(&tmp_gb, avctx, &h->ps, 0) >= 0)
+                break;
+            av_log(h->avctx, AV_LOG_DEBUG,
+                   "SPS decoding failure, trying again with the complete NAL\n");
+            init_get_bits8(&tmp_gb, nal->raw_data + 1, nal->raw_size - 1);
+            if (ff_h264_decode_seq_parameter_set(&tmp_gb, avctx, &h->ps, 0) >= 0)
+                break;
+            ff_h264_decode_seq_parameter_set(&nal->gb, avctx, &h->ps, 1);
             break;
+        }
         case H264_NAL_PPS:
+            if (avctx->hwaccel && avctx->hwaccel->decode_params) {
+                ret = avctx->hwaccel->decode_params(avctx,
+                                                    nal->type,
+                                                    nal->raw_data,
+                                                    nal->raw_size);
+                if (ret < 0)
+                    goto end;
+            }
             ret = ff_h264_decode_picture_parameter_set(&nal->gb, avctx, &h->ps,
                                                        nal->size_bits);
             if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
@@ -642,8 +763,52 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
 
     ret = 0;
 end:
+
+#if CONFIG_ERROR_RESILIENCE
+    /*
+     * FIXME: Error handling code does not seem to support interlaced
+     * when slices span multiple rows
+     * The ff_er_add_slice calls don't work right for bottom
+     * fields; they cause massive erroneous error concealing
+     * Error marking covers both fields (top and bottom).
+     * This causes a mismatched s->error_count
+     * and a bad error table. Further, the error count goes to
+     * INT_MAX when called for bottom field, because mb_y is
+     * past end by one (callers fault) and resync_mb_y != 0
+     * causes problems for the first MB line, too.
+     */
+    if (!FIELD_PICTURE(h) && h->current_slice &&
+        h->ps.sps == (const SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data &&
+        h->enable_er) {
+
+        H264SliceContext *sl = h->slice_ctx;
+        int use_last_pic = h->last_pic_for_ec.f->buf[0] && !sl->ref_count[0];
+
+        ff_h264_set_erpic(&sl->er.cur_pic, h->cur_pic_ptr);
+
+        if (use_last_pic) {
+            ff_h264_set_erpic(&sl->er.last_pic, &h->last_pic_for_ec);
+            sl->ref_list[0][0].parent = &h->last_pic_for_ec;
+            memcpy(sl->ref_list[0][0].data, h->last_pic_for_ec.f->data, sizeof(sl->ref_list[0][0].data));
+            memcpy(sl->ref_list[0][0].linesize, h->last_pic_for_ec.f->linesize, sizeof(sl->ref_list[0][0].linesize));
+            sl->ref_list[0][0].reference = h->last_pic_for_ec.reference;
+        } else if (sl->ref_count[0]) {
+            ff_h264_set_erpic(&sl->er.last_pic, sl->ref_list[0][0].parent);
+        } else
+            ff_h264_set_erpic(&sl->er.last_pic, NULL);
+
+        if (sl->ref_count[1])
+            ff_h264_set_erpic(&sl->er.next_pic, sl->ref_list[1][0].parent);
+
+        sl->er.ref_count = sl->ref_count[0];
+
+        ff_er_frame_end(&sl->er);
+        if (use_last_pic)
+            memset(&sl->ref_list[0][0], 0, sizeof(sl->ref_list[0][0]));
+    }
+#endif /* CONFIG_ERROR_RESILIENCE */
     /* clean up */
-    if (h->cur_pic_ptr && !h->droppable) {
+    if (h->cur_pic_ptr && !h->droppable && h->has_slice) {
         ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                   h->picture_structure == PICT_BOTTOM_FIELD);
     }
@@ -664,6 +829,130 @@ static int get_consumed_bytes(int pos, int buf_size)
     return pos;
 }
 
+static int output_frame(H264Context *h, AVFrame *dst, H264Picture *srcp)
+{
+    AVFrame *src = srcp->f;
+    int ret;
+
+    ret = av_frame_ref(dst, src);
+    if (ret < 0)
+        return ret;
+
+    av_dict_set(&dst->metadata, "stereo_mode", ff_h264_sei_stereo_mode(&h->sei.frame_packing), 0);
+
+    if (srcp->sei_recovery_frame_cnt == 0)
+        dst->key_frame = 1;
+
+    return 0;
+}
+
+static int is_extra(const uint8_t *buf, int buf_size)
+{
+    int cnt= buf[5]&0x1f;
+    const uint8_t *p= buf+6;
+    if (!cnt)
+        return 0;
+    while(cnt--){
+        int nalsize= AV_RB16(p) + 2;
+        if(nalsize > buf_size - (p-buf) || (p[2] & 0x9F) != 7)
+            return 0;
+        p += nalsize;
+    }
+    cnt = *(p++);
+    if(!cnt)
+        return 0;
+    while(cnt--){
+        int nalsize= AV_RB16(p) + 2;
+        if(nalsize > buf_size - (p-buf) || (p[2] & 0x9F) != 8)
+            return 0;
+        p += nalsize;
+    }
+    return 1;
+}
+
+static int finalize_frame(H264Context *h, AVFrame *dst, H264Picture *out, int *got_frame)
+{
+    int ret;
+
+    if (((h->avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) ||
+         (h->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL) ||
+         out->recovered)) {
+
+        if (!h->avctx->hwaccel &&
+            (out->field_poc[0] == INT_MAX ||
+             out->field_poc[1] == INT_MAX)
+           ) {
+            int p;
+            AVFrame *f = out->f;
+            int field = out->field_poc[0] == INT_MAX;
+            uint8_t *dst_data[4];
+            int linesizes[4];
+            const uint8_t *src_data[4];
+
+            av_log(h->avctx, AV_LOG_DEBUG, "Duplicating field %d to fill missing\n", field);
+
+            for (p = 0; p<4; p++) {
+                dst_data[p] = f->data[p] + (field^1)*f->linesize[p];
+                src_data[p] = f->data[p] +  field   *f->linesize[p];
+                linesizes[p] = 2*f->linesize[p];
+            }
+
+            av_image_copy(dst_data, linesizes, src_data, linesizes,
+                          f->format, f->width, f->height>>1);
+        }
+
+        ret = output_frame(h, dst, out);
+        if (ret < 0)
+            return ret;
+
+        *got_frame = 1;
+
+        if (CONFIG_MPEGVIDEO) {
+            ff_print_debug_info2(h->avctx, dst, NULL,
+                                 out->mb_type,
+                                 out->qscale_table,
+                                 out->motion_val,
+                                 NULL,
+                                 h->mb_width, h->mb_height, h->mb_stride, 1);
+        }
+    }
+
+    return 0;
+}
+
+static int send_next_delayed_frame(H264Context *h, AVFrame *dst_frame,
+                                   int *got_frame, int buf_index)
+{
+    int ret, i, out_idx;
+    H264Picture *out = h->delayed_pic[0];
+
+    h->cur_pic_ptr = NULL;
+    h->first_field = 0;
+
+    out_idx = 0;
+    for (i = 1;
+         h->delayed_pic[i] &&
+         !h->delayed_pic[i]->f->key_frame &&
+         !h->delayed_pic[i]->mmco_reset;
+         i++)
+        if (h->delayed_pic[i]->poc < out->poc) {
+            out     = h->delayed_pic[i];
+            out_idx = i;
+        }
+
+    for (i = out_idx; h->delayed_pic[i]; i++)
+        h->delayed_pic[i] = h->delayed_pic[i + 1];
+
+    if (out) {
+        out->reference &= ~DELAYED_PIC_REF;
+        ret = finalize_frame(h, dst_frame, out, got_frame);
+        if (ret < 0)
+            return ret;
+    }
+
+    return buf_index;
+}
+
 static int h264_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
@@ -671,58 +960,32 @@ static int h264_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size       = avpkt->size;
     H264Context *h     = avctx->priv_data;
     AVFrame *pict      = data;
-    int buf_index      = 0;
+    int buf_index;
     int ret;
-    const uint8_t *new_extradata;
-    int new_extradata_size;
 
     h->flags = avctx->flags;
     h->setup_finished = 0;
     h->nb_slice_ctx_queued = 0;
 
-    /* end of stream, output what is still in the buffers */
-out:
-    if (buf_size == 0) {
-        H264Picture *out;
-        int i, out_idx;
-
-        h->cur_pic_ptr = NULL;
-
-        // FIXME factorize this with the output code below
-        out     = h->delayed_pic[0];
-        out_idx = 0;
-        for (i = 1;
-             h->delayed_pic[i] &&
-             !h->delayed_pic[i]->f->key_frame &&
-             !h->delayed_pic[i]->mmco_reset;
-             i++)
-            if (h->delayed_pic[i]->poc < out->poc) {
-                out     = h->delayed_pic[i];
-                out_idx = i;
-            }
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
 
-        for (i = out_idx; h->delayed_pic[i]; i++)
-            h->delayed_pic[i] = h->delayed_pic[i + 1];
-
-        if (out) {
-            ret = av_frame_ref(pict, out->f);
-            if (ret < 0)
-                return ret;
-            *got_frame = 1;
-        }
-
-        return buf_index;
+    /* end of stream, output what is still in the buffers */
+    if (buf_size == 0)
+        return send_next_delayed_frame(h, pict, got_frame, 0);
+
+    if (h->is_avc && av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, NULL)) {
+        int side_size;
+        uint8_t *side = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+        if (is_extra(side, side_size))
+            ff_h264_decode_extradata(side, side_size,
+                                     &h->ps, &h->is_avc, &h->nal_length_size,
+                                     avctx->err_recognition, avctx);
     }
-
-    new_extradata_size = 0;
-    new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
-                                            &new_extradata_size);
-    if (new_extradata_size > 0 && new_extradata) {
-        ret = ff_h264_decode_extradata(new_extradata, new_extradata_size,
-                                       &h->ps, &h->is_avc, &h->nal_length_size,
-                                       avctx->err_recognition, avctx);
-        if (ret < 0)
-            return ret;
+    if (h->is_avc && buf_size >= 9 && buf[0]==1 && buf[2]==0 && (buf[4]&0xFC)==0xFC) {
+        if (is_extra(buf, buf_size))
+            return ff_h264_decode_extradata(buf, buf_size,
+                                            &h->ps, &h->is_avc, &h->nal_length_size,
+                                            avctx->err_recognition, avctx);
     }
 
     buf_index = decode_nal_units(h, buf, buf_size);
@@ -730,33 +993,34 @@ out:
         return AVERROR_INVALIDDATA;
 
     if (!h->cur_pic_ptr && h->nal_unit_type == H264_NAL_END_SEQUENCE) {
-        buf_size = 0;
-        goto out;
+        av_assert0(buf_index <= buf_size);
+        return send_next_delayed_frame(h, pict, got_frame, buf_index);
     }
 
-    if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS) && !h->cur_pic_ptr) {
-        if (avctx->skip_frame >= AVDISCARD_NONREF)
-            return 0;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS) && (!h->cur_pic_ptr || !h->has_slice)) {
+        if (avctx->skip_frame >= AVDISCARD_NONREF ||
+            buf_size >= 4 && !memcmp("Q264", buf, 4))
+            return buf_size;
         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
         return AVERROR_INVALIDDATA;
     }
 
     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS) ||
         (h->mb_y >= h->mb_height && h->mb_height)) {
-        if (h->field_started)
-            ff_h264_field_end(h, &h->slice_ctx[0], 0);
+        if ((ret = ff_h264_field_end(h, &h->slice_ctx[0], 0)) < 0)
+            return ret;
 
-        *got_frame = 0;
-        if (h->output_frame->buf[0]) {
-            ret = av_frame_ref(pict, h->output_frame);
-            av_frame_unref(h->output_frame);
+        /* Wait for second field. */
+        if (h->next_output_pic) {
+            ret = finalize_frame(h, pict, h->next_output_pic, got_frame);
             if (ret < 0)
                 return ret;
-            *got_frame = 1;
         }
     }
 
-    assert(pict->buf[0] || !*got_frame);
+    av_assert0(pict->buf[0] || !*got_frame);
+
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
 
     return get_consumed_bytes(buf_index, buf_size);
 }
@@ -764,12 +1028,15 @@ out:
 #define OFFSET(x) offsetof(H264Context, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption h264_options[] = {
-    { "enable_er", "Enable error resilience on damaged frames (unsafe)", OFFSET(enable_er), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VD },
+    { "is_avc", "is avc", OFFSET(is_avc), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0 },
+    { "nal_length_size", "nal_length_size", OFFSET(nal_length_size), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 4, 0 },
+    { "enable_er", "Enable error resilience on damaged frames (unsafe)", OFFSET(enable_er), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VD },
+    { "x264_build", "Assume this x264 version if no x264 version found in any SEI", OFFSET(x264_build), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, VD },
     { NULL },
 };
 
 static const AVClass h264_class = {
-    .class_name = "h264",
+    .class_name = "H264 Decoder",
     .item_name  = av_default_item_name,
     .option     = h264_options,
     .version    = LIBAVUTIL_VERSION_INT,
@@ -788,9 +1055,6 @@ AVCodec ff_h264_decoder = {
                              AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS |
                              AV_CODEC_CAP_FRAME_THREADS,
     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
-#if CONFIG_H264_CUVID_HWACCEL
-                               HWACCEL_CUVID(h264),
-#endif
 #if CONFIG_H264_DXVA2_HWACCEL
                                HWACCEL_DXVA2(h264),
 #endif
@@ -800,17 +1064,17 @@ AVCodec ff_h264_decoder = {
 #if CONFIG_H264_D3D11VA2_HWACCEL
                                HWACCEL_D3D11VA2(h264),
 #endif
+#if CONFIG_H264_NVDEC_HWACCEL
+                               HWACCEL_NVDEC(h264),
+#endif
 #if CONFIG_H264_VAAPI_HWACCEL
                                HWACCEL_VAAPI(h264),
 #endif
 #if CONFIG_H264_VDPAU_HWACCEL
                                HWACCEL_VDPAU(h264),
 #endif
-#if CONFIG_H264_VDA_HWACCEL
-                               HW_CONFIG_HWACCEL(0, 0, 1, VDA, NONE, ff_h264_vda_hwaccel),
-#endif
-#if CONFIG_H264_VDA_OLD_HWACCEL
-                               HW_CONFIG_HWACCEL(0, 0, 1, VDA_VLD, NONE, ff_h264_vda_old_hwaccel),
+#if CONFIG_H264_VIDEOTOOLBOX_HWACCEL
+                               HWACCEL_VIDEOTOOLBOX(h264),
 #endif
                                NULL
                            },
diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h
index cce5e19..1d97232 100644
--- a/libavcodec/h264dec.h
+++ b/libavcodec/h264dec.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,7 @@
 #include "rectangle.h"
 #include "videodsp.h"
 
-#define H264_MAX_PICTURE_COUNT 32
+#define H264_MAX_PICTURE_COUNT 36
 
 #define MAX_MMCO_COUNT         66
 
@@ -68,17 +68,17 @@
 #define MAX_SLICES 32
 
 #ifdef ALLOW_INTERLACE
-#define MB_MBAFF(h)    h->mb_mbaff
-#define MB_FIELD(h)    h->mb_field_decoding_flag
-#define FRAME_MBAFF(h) h->mb_aff_frame
-#define FIELD_PICTURE(h) (h->picture_structure != PICT_FRAME)
+#define MB_MBAFF(h)    (h)->mb_mbaff
+#define MB_FIELD(sl)  (sl)->mb_field_decoding_flag
+#define FRAME_MBAFF(h) (h)->mb_aff_frame
+#define FIELD_PICTURE(h) ((h)->picture_structure != PICT_FRAME)
 #define LEFT_MBS 2
 #define LTOP     0
 #define LBOT     1
 #define LEFT(i)  (i)
 #else
 #define MB_MBAFF(h)      0
-#define MB_FIELD(h)      0
+#define MB_FIELD(sl)     0
 #define FRAME_MBAFF(h)   0
 #define FIELD_PICTURE(h) 0
 #undef  IS_INTERLACED
@@ -91,11 +91,12 @@
 #define FIELD_OR_MBAFF_PICTURE(h) (FRAME_MBAFF(h) || FIELD_PICTURE(h))
 
 #ifndef CABAC
-#define CABAC(h) h->ps.pps->cabac
+#define CABAC(h) (h)->ps.pps->cabac
 #endif
 
-#define CHROMA422(h) (h->ps.sps->chroma_format_idc == 2)
-#define CHROMA444(h) (h->ps.sps->chroma_format_idc == 3)
+#define CHROMA(h)    ((h)->ps.sps->chroma_format_idc)
+#define CHROMA422(h) ((h)->ps.sps->chroma_format_idc == 2)
+#define CHROMA444(h) ((h)->ps.sps->chroma_format_idc == 3)
 
 #define MB_TYPE_REF0       MB_TYPE_ACPRED // dirty but it fits in 16 bit
 #define MB_TYPE_8x8DCT     0x01000000
@@ -151,13 +152,15 @@ typedef struct H264Picture {
     int pic_id;             /**< pic_num (short -> no wrap version of pic_num,
                                  pic_num & max_pic_num; long -> long_pic_num) */
     int long_ref;           ///< 1->long term reference 0->short term reference
-    int ref_poc[2][2][32];  ///< POCs of the frames used as reference (FIXME need per slice)
+    int ref_poc[2][2][32];  ///< POCs of the frames/fields used as reference (FIXME need per slice)
     int ref_count[2][2];    ///< number of entries in ref_poc         (FIXME need per slice)
     int mbaff;              ///< 1 -> MBAFF frame 0-> not MBAFF
     int field_picture;      ///< whether or not picture was encoded in separate fields
 
     int reference;
     int recovered;          ///< picture at IDR or recovery point + recovery count
+    int invalid_gap;
+    int sei_recovery_frame_cnt;
 } H264Picture;
 
 typedef struct H264Ref {
@@ -342,6 +345,7 @@ typedef struct H264Context {
     H264Picture DPB[H264_MAX_PICTURE_COUNT];
     H264Picture *cur_pic_ptr;
     H264Picture cur_pic;
+    H264Picture last_pic_for_ec;
 
     H264SliceContext *slice_ctx;
     int            nb_slice_ctx;
@@ -413,18 +417,19 @@ typedef struct H264Context {
     uint8_t (*mvd_table[2])[2];
     uint8_t *direct_table;
 
+    uint8_t scan_padding[16];
     uint8_t zigzag_scan[16];
     uint8_t zigzag_scan8x8[64];
     uint8_t zigzag_scan8x8_cavlc[64];
     uint8_t field_scan[16];
     uint8_t field_scan8x8[64];
     uint8_t field_scan8x8_cavlc[64];
-    const uint8_t *zigzag_scan_q0;
-    const uint8_t *zigzag_scan8x8_q0;
-    const uint8_t *zigzag_scan8x8_cavlc_q0;
-    const uint8_t *field_scan_q0;
-    const uint8_t *field_scan8x8_q0;
-    const uint8_t *field_scan8x8_cavlc_q0;
+    uint8_t zigzag_scan_q0[16];
+    uint8_t zigzag_scan8x8_q0[64];
+    uint8_t zigzag_scan8x8_cavlc_q0[64];
+    uint8_t field_scan_q0[16];
+    uint8_t field_scan8x8_q0[64];
+    uint8_t field_scan8x8_cavlc_q0[64];
 
     int mb_y;
     int mb_height, mb_width;
@@ -437,6 +442,8 @@ typedef struct H264Context {
     int nal_ref_idc;
     int nal_unit_type;
 
+    int has_slice;          ///< slice NAL is found in the packet, set by decode_nal_units, its state does not need to be preserved outside h264_decode_frame()
+
     /**
      * Used to parse AVC variant of H.264
      */
@@ -452,10 +459,12 @@ typedef struct H264Context {
 
     H264POCContext poc;
 
+    H264Ref default_ref[2];
     H264Picture *short_ref[32];
     H264Picture *long_ref[32];
     H264Picture *delayed_pic[MAX_DELAYED_PIC_COUNT + 2]; // FIXME size?
     int last_pocs[MAX_DELAYED_PIC_COUNT];
+    H264Picture *next_output_pic;
     int next_outputed_poc;
 
     /**
@@ -489,6 +498,11 @@ typedef struct H264Context {
     int prev_interlaced_frame;
 
     /**
+     * Are the SEI recovery points looking valid.
+     */
+    int valid_recovery_point;
+
+    /**
      * recovery_frame is the frame_num at which the next frame should
      * be fully constructed.
      *
@@ -509,24 +523,25 @@ typedef struct H264Context {
 
     int frame_recovered;    ///< Initial frame has been completely recovered
 
+    int has_recovery_point;
+
+    int missing_fields;
+
     /* for frame threading, this is set to 1
      * after finish_setup() has been called, so we cannot modify
      * some context properties (which are supposed to stay constant between
      * slices) anymore */
     int setup_finished;
 
-    /* This is set to 1 if h264_field_start() has been called successfully,
-     * so all per-field state is properly initialized and we can decode
-     * the slice data */
-    int field_started;
+    int cur_chroma_format_idc;
+    int cur_bit_depth_luma;
+    int16_t slice_row[MAX_SLICES]; ///< to detect when MAX_SLICES is too low
 
     /* original AVCodecContext dimensions, used to handle container
      * cropping */
     int width_from_caller;
     int height_from_caller;
 
-    AVFrame *output_frame;
-
     int enable_er;
 
     H264SEIContext sei;
@@ -552,7 +567,7 @@ int ff_h264_get_slice_type(const H264SliceContext *sl);
 int ff_h264_alloc_tables(H264Context *h);
 
 int ff_h264_decode_ref_pic_list_reordering(H264SliceContext *sl, void *logctx);
-int ff_h264_build_ref_list(const H264Context *h, H264SliceContext *sl);
+int ff_h264_build_ref_list(H264Context *h, H264SliceContext *sl);
 void ff_h264_remove_all_refs(H264Context *h);
 
 /**
@@ -580,8 +595,6 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl);
 
 void ff_h264_init_cabac_states(const H264Context *h, H264SliceContext *sl);
 
-void ff_h264_init_dequant_tables(H264Context *h);
-
 void ff_h264_direct_dist_scale_factor(const H264Context *const h, H264SliceContext *sl);
 void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *sl);
 void ff_h264_pred_direct_motion(const H264Context *const h, H264SliceContext *sl,
@@ -644,7 +657,7 @@ static const uint8_t scan8[16 * 3 + 3] = {
     0 +  0 * 8, 0 +  5 * 8, 0 + 10 * 8
 };
 
-static av_always_inline uint32_t pack16to32(int a, int b)
+static av_always_inline uint32_t pack16to32(unsigned a, unsigned b)
 {
 #if HAVE_BIGENDIAN
     return (b & 0xFFFF) + (a << 16);
@@ -653,7 +666,7 @@ static av_always_inline uint32_t pack16to32(int a, int b)
 #endif
 }
 
-static av_always_inline uint16_t pack8to16(int a, int b)
+static av_always_inline uint16_t pack8to16(unsigned a, unsigned b)
 {
 #if HAVE_BIGENDIAN
     return (b & 0xFF) + (a << 8);
@@ -800,6 +813,16 @@ static av_always_inline int get_dct8x8_allowed(const H264Context *h, H264SliceCo
                   0x0001000100010001ULL));
 }
 
+static inline int find_start_code(const uint8_t *buf, int buf_size,
+                           int buf_index, int next_avc)
+{
+    uint32_t state = -1;
+
+    buf_index = avpriv_find_start_code(buf + buf_index, buf + next_avc + 1, &state) - buf - 1;
+
+    return FFMIN(buf_index, buf_size);
+}
+
 int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup);
 
 int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src);
@@ -809,6 +832,8 @@ int ff_h264_slice_context_init(H264Context *h, H264SliceContext *sl);
 
 void ff_h264_draw_horiz_band(const H264Context *h, H264SliceContext *sl, int y, int height);
 
+int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl,
+                                const H2645NAL *nal);
 /**
  * Submit a slice for decoding.
  *
@@ -824,4 +849,6 @@ void ff_h264_flush_change(H264Context *h);
 
 void ff_h264_free_tables(H264Context *h);
 
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src);
+
 #endif /* AVCODEC_H264DEC_H */
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index 684566b..d26f552 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+
 #include "avcodec.h"
 #include "h264dsp.h"
 #include "h264idct.h"
@@ -46,6 +48,14 @@
 #include "h264dsp_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "h264dsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264dsp_template.c"
+#undef BIT_DEPTH
+
 #define BIT_DEPTH 8
 #include "h264addpx_template.c"
 #undef BIT_DEPTH
@@ -130,7 +140,14 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
     case 10:
         H264_DSP(10);
         break;
+    case 12:
+        H264_DSP(12);
+        break;
+    case 14:
+        H264_DSP(14);
+        break;
     default:
+        av_assert0(bit_depth<=8);
         H264_DSP(8);
         break;
     }
@@ -140,4 +157,5 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
     if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
     if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
     if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
+    if (ARCH_MIPS) ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 3a5b25b..cbea317 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,11 +28,12 @@
 #define AVCODEC_H264DSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
+typedef void (*h264_weight_func)(uint8_t *block, ptrdiff_t stride, int height,
                                  int log2_denom, int weight, int offset);
 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src,
-                                   int stride, int height, int log2_denom,
+                                   ptrdiff_t stride, int height, int log2_denom,
                                    int weightd, int weights, int offset);
 
 /**
@@ -44,32 +45,32 @@ typedef struct H264DSPContext {
     h264_biweight_func biweight_h264_pixels_tab[4];
 
     /* loop filter */
-    void (*h264_v_loop_filter_luma)(uint8_t *pix /*align 16*/, int stride,
+    void (*h264_v_loop_filter_luma)(uint8_t *pix /*align 16*/, ptrdiff_t stride,
                                     int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_luma)(uint8_t *pix /*align 4 */, int stride,
+    void (*h264_h_loop_filter_luma)(uint8_t *pix /*align 4 */, ptrdiff_t stride,
                                     int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_luma_mbaff)(uint8_t *pix /*align 16*/, int stride,
+    void (*h264_h_loop_filter_luma_mbaff)(uint8_t *pix /*align 16*/, ptrdiff_t stride,
                                           int alpha, int beta, int8_t *tc0);
     /* v/h_loop_filter_luma_intra: align 16 */
-    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride,
+    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, ptrdiff_t stride,
                                           int alpha, int beta);
-    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride,
+    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, ptrdiff_t stride,
                                           int alpha, int beta);
     void (*h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix /*align 16*/,
-                                                int stride, int alpha, int beta);
-    void (*h264_v_loop_filter_chroma)(uint8_t *pix /*align 8*/, int stride,
+                                                ptrdiff_t stride, int alpha, int beta);
+    void (*h264_v_loop_filter_chroma)(uint8_t *pix /*align 8*/, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
-    void (*h264_h_loop_filter_chroma)(uint8_t *pix /*align 4*/, int stride,
+    void (*h264_h_loop_filter_chroma)(uint8_t *pix /*align 4*/, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
     void (*h264_h_loop_filter_chroma_mbaff)(uint8_t *pix /*align 8*/,
-                                            int stride, int alpha, int beta,
+                                            ptrdiff_t stride, int alpha, int beta,
                                             int8_t *tc0);
     void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix /*align 8*/,
-                                            int stride, int alpha, int beta);
+                                            ptrdiff_t stride, int alpha, int beta);
     void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix /*align 8*/,
-                                            int stride, int alpha, int beta);
+                                            ptrdiff_t stride, int alpha, int beta);
     void (*h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix /*align 8*/,
-                                                  int stride, int alpha, int beta);
+                                                  ptrdiff_t stride, int alpha, int beta);
     // h264_loop_filter_strength: simd only. the C version is inlined in h264_loopfilter.c
     void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40],
                                       int8_t ref[2][40], int16_t mv[2][40][2],
@@ -126,5 +127,7 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                          const int chroma_format_idc);
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                          const int chroma_format_idc);
+void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
+                          const int chroma_format_idc);
 
 #endif /* AVCODEC_H264DSP_H */
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index c2d1394..fe23a2c 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -1,21 +1,21 @@
 /*
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,13 +30,13 @@
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
 #define H264_WEIGHT(W) \
-static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, ptrdiff_t stride, int height, \
                                            int log2_denom, int weight, int offset) \
 { \
     int y; \
     pixel *block = (pixel*)_block; \
-    stride /= sizeof(pixel); \
-    offset <<= (log2_denom + (BIT_DEPTH-8)); \
+    stride >>= sizeof(pixel)-1; \
+    offset = (unsigned)offset << (log2_denom + (BIT_DEPTH-8)); \
     if(log2_denom) offset += 1<<(log2_denom-1); \
     for (y = 0; y < height; y++, block += stride) { \
         op_scale1(0); \
@@ -60,15 +60,15 @@ static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int heig
         op_scale1(15); \
     } \
 } \
-static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride, int height, \
                                              int log2_denom, int weightd, int weights, int offset) \
 { \
     int y; \
     pixel *dst = (pixel*)_dst; \
     pixel *src = (pixel*)_src; \
-    stride /= sizeof(pixel); \
-    offset <<= (BIT_DEPTH-8); \
-    offset = ((offset + 1) | 1) << log2_denom; \
+    stride >>= sizeof(pixel)-1; \
+    offset = (unsigned)offset << (BIT_DEPTH-8); \
+    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
     for (y = 0; y < height; y++, dst += stride, src += stride) { \
         op_scale2(0); \
         op_scale2(1); \
@@ -101,16 +101,16 @@ H264_WEIGHT(2)
 #undef op_scale2
 #undef H264_WEIGHT
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *p_pix, ptrdiff_t xstride, ptrdiff_t ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int i, d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( i = 0; i < 4; i++ ) {
-        const int tc_orig = tc0[i] << (BIT_DEPTH - 8);
+        const int tc_orig = tc0[i] * (1 << (BIT_DEPTH - 8));
         if( tc_orig < 0 ) {
             pix += inner_iters*ystride;
             continue;
@@ -141,7 +141,7 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_p
                     tc++;
                 }
 
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                 pix[-xstride] = av_clip_pixel( p0 + i_delta );    /* p0' */
                 pix[0]        = av_clip_pixel( q0 - i_delta );    /* q0' */
             }
@@ -149,25 +149,25 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_p
         }
     }
 }
-static void FUNCC(h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_luma)(pix, stride, sizeof(pixel), 4, alpha, beta, tc0);
 }
-static void FUNCC(h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0);
 }
-static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *p_pix, ptrdiff_t xstride, ptrdiff_t ystride, int inner_iters, int alpha, int beta)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( d = 0; d < 4 * inner_iters; d++ ) {
@@ -215,29 +215,29 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8
         pix += ystride;
     }
 }
-static void FUNCC(h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_v_loop_filter_luma_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_luma_intra)(pix, stride, sizeof(pixel), 4, alpha, beta);
 }
-static void FUNCC(h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_h_loop_filter_luma_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta);
 }
-static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *p_pix, ptrdiff_t xstride, ptrdiff_t ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int i, d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     for( i = 0; i < 4; i++ ) {
-        const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1;
+        const int tc = ((tc0[i] - 1U) << (BIT_DEPTH - 8)) + 1;
         if( tc <= 0 ) {
             pix += inner_iters*ystride;
             continue;
@@ -252,7 +252,7 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *
                 FFABS( p1 - p0 ) < beta &&
                 FFABS( q1 - q0 ) < beta ) {
 
-                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                int delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
 
                 pix[-xstride] = av_clip_pixel( p0 + delta );    /* p0' */
                 pix[0]        = av_clip_pixel( q0 - delta );    /* q0' */
@@ -261,33 +261,33 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *
         }
     }
 }
-static void FUNCC(h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_chroma)(pix, stride, sizeof(pixel), 2, alpha, beta, tc0);
 }
-static void FUNCC(h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
-static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0);
 }
-static void FUNCC(h264_h_loop_filter_chroma422)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_h_loop_filter_chroma422)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0);
 }
-static void FUNCC(h264_h_loop_filter_chroma422_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void FUNCC(h264_h_loop_filter_chroma422_mbaff)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
 {
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *p_pix, ptrdiff_t xstride, ptrdiff_t ystride, int inner_iters, int alpha, int beta)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( d = 0; d < 4 * inner_iters; d++ ) {
@@ -306,23 +306,23 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uin
         pix += ystride;
     }
 }
-static void FUNCC(h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_v_loop_filter_chroma_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_chroma_intra)(pix, stride, sizeof(pixel), 2, alpha, beta);
 }
-static void FUNCC(h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_h_loop_filter_chroma_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
 }
-static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta);
 }
-static void FUNCC(h264_h_loop_filter_chroma422_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_h_loop_filter_chroma422_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta);
 }
-static void FUNCC(h264_h_loop_filter_chroma422_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
+static void FUNCC(h264_h_loop_filter_chroma422_mbaff_intra)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
 {
     FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
 }
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index ea08d03..6a771af 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -2,20 +2,20 @@
  * H.264 IDCT
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,3 +38,11 @@
 #define BIT_DEPTH 10
 #include "h264idct_template.c"
 #undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "h264idct_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264idct_template.c"
+#undef BIT_DEPTH
diff --git a/libavcodec/h264idct.h b/libavcodec/h264idct.h
index 816a825..17e0051 100644
--- a/libavcodec/h264idct.h
+++ b/libavcodec/h264idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,5 +38,7 @@ void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(int16_t *block, int qmul);
 H264_IDCT( 8)
 H264_IDCT( 9)
 H264_IDCT(10)
+H264_IDCT(12)
+H264_IDCT(14)
 
 #endif /* AVCODEC_H264IDCT_H */
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 08a71cd..5993ae2 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -2,20 +2,20 @@
  * H.264 IDCT
  * Copyright (c) 2004-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,15 +35,15 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
     int i;
     pixel *dst = (pixel*)_dst;
     dctcoef *block = (dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     block[0] += 1 << 5;
 
     for(i=0; i<4; i++){
-        const int z0=  block[i + 4*0]     +  block[i + 4*2];
-        const int z1=  block[i + 4*0]     -  block[i + 4*2];
-        const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
-        const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
+        const SUINT z0=  block[i + 4*0]     +  (unsigned)block[i + 4*2];
+        const SUINT z1=  block[i + 4*0]     -  (unsigned)block[i + 4*2];
+        const SUINT z2= (block[i + 4*1]>>1) -  (unsigned)block[i + 4*3];
+        const SUINT z3=  block[i + 4*1]     + (unsigned)(block[i + 4*3]>>1);
 
         block[i + 4*0]= z0 + z3;
         block[i + 4*1]= z1 + z2;
@@ -52,15 +52,15 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
     }
 
     for(i=0; i<4; i++){
-        const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
-        const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
-        const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
-        const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
-
-        dst[i + 0*stride]= av_clip_pixel(dst[i + 0*stride] + ((z0 + z3) >> 6));
-        dst[i + 1*stride]= av_clip_pixel(dst[i + 1*stride] + ((z1 + z2) >> 6));
-        dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6));
-        dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6));
+        const SUINT z0=  block[0 + 4*i]     +  (SUINT)block[2 + 4*i];
+        const SUINT z1=  block[0 + 4*i]     -  (SUINT)block[2 + 4*i];
+        const SUINT z2= (block[1 + 4*i]>>1) -  (SUINT)block[3 + 4*i];
+        const SUINT z3=  block[1 + 4*i]     + (SUINT)(block[3 + 4*i]>>1);
+
+        dst[i + 0*stride]= av_clip_pixel(dst[i + 0*stride] + ((int)(z0 + z3) >> 6));
+        dst[i + 1*stride]= av_clip_pixel(dst[i + 1*stride] + ((int)(z1 + z2) >> 6));
+        dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((int)(z1 - z2) >> 6));
+        dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((int)(z0 - z3) >> 6));
     }
 
     memset(block, 0, 16 * sizeof(dctcoef));
@@ -70,31 +70,31 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
     int i;
     pixel *dst = (pixel*)_dst;
     dctcoef *block = (dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     block[0] += 32;
 
     for( i = 0; i < 8; i++ )
     {
-        const int a0 =  block[i+0*8] + block[i+4*8];
-        const int a2 =  block[i+0*8] - block[i+4*8];
-        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
-        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
-        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
-        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
-        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 =  a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 =  a7 - (a1>>2);
+        const unsigned int a0 =  block[i+0*8] + (unsigned)block[i+4*8];
+        const unsigned int a2 =  block[i+0*8] - (unsigned)block[i+4*8];
+        const unsigned int a4 = (block[i+2*8]>>1) - (unsigned)block[i+6*8];
+        const unsigned int a6 = (block[i+6*8]>>1) + (unsigned)block[i+2*8];
+
+        const unsigned int b0 = a0 + a6;
+        const unsigned int b2 = a2 + a4;
+        const unsigned int b4 = a2 - a4;
+        const unsigned int b6 = a0 - a6;
+
+        const int a1 = -block[i+3*8] + (unsigned)block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
+        const int a3 =  block[i+1*8] + (unsigned)block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
+        const int a5 = -block[i+1*8] + (unsigned)block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
+        const int a7 =  block[i+3*8] + (unsigned)block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
+
+        const int b1 = (a7>>2) + (unsigned)a1;
+        const int b3 =  (unsigned)a3 + (a5>>2);
+        const int b5 = (a3>>2) - (unsigned)a5;
+        const int b7 =  (unsigned)a7 - (a1>>2);
 
         block[i+0*8] = b0 + b7;
         block[i+7*8] = b0 - b7;
@@ -107,34 +107,34 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
     }
     for( i = 0; i < 8; i++ )
     {
-        const int a0 =  block[0+i*8] + block[4+i*8];
-        const int a2 =  block[0+i*8] - block[4+i*8];
-        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
-        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
-        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
-        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
-        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 =  a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 =  a7 - (a1>>2);
-
-        dst[i + 0*stride] = av_clip_pixel( dst[i + 0*stride] + ((b0 + b7) >> 6) );
-        dst[i + 1*stride] = av_clip_pixel( dst[i + 1*stride] + ((b2 + b5) >> 6) );
-        dst[i + 2*stride] = av_clip_pixel( dst[i + 2*stride] + ((b4 + b3) >> 6) );
-        dst[i + 3*stride] = av_clip_pixel( dst[i + 3*stride] + ((b6 + b1) >> 6) );
-        dst[i + 4*stride] = av_clip_pixel( dst[i + 4*stride] + ((b6 - b1) >> 6) );
-        dst[i + 5*stride] = av_clip_pixel( dst[i + 5*stride] + ((b4 - b3) >> 6) );
-        dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) );
-        dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) );
+        const unsigned a0 =  block[0+i*8] + (unsigned)block[4+i*8];
+        const unsigned a2 =  block[0+i*8] - (unsigned)block[4+i*8];
+        const unsigned a4 = (block[2+i*8]>>1) - (unsigned)block[6+i*8];
+        const unsigned a6 = (block[6+i*8]>>1) + (unsigned)block[2+i*8];
+
+        const unsigned b0 = a0 + a6;
+        const unsigned b2 = a2 + a4;
+        const unsigned b4 = a2 - a4;
+        const unsigned b6 = a0 - a6;
+
+        const int a1 = -(unsigned)block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
+        const int a3 =  (unsigned)block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
+        const int a5 = -(unsigned)block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
+        const int a7 =  (unsigned)block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
+
+        const unsigned b1 = (a7>>2) + (unsigned)a1;
+        const unsigned b3 =  (unsigned)a3 + (a5>>2);
+        const unsigned b5 = (a3>>2) - (unsigned)a5;
+        const unsigned b7 =  (unsigned)a7 - (a1>>2);
+
+        dst[i + 0*stride] = av_clip_pixel( dst[i + 0*stride] + ((int)(b0 + b7) >> 6) );
+        dst[i + 1*stride] = av_clip_pixel( dst[i + 1*stride] + ((int)(b2 + b5) >> 6) );
+        dst[i + 2*stride] = av_clip_pixel( dst[i + 2*stride] + ((int)(b4 + b3) >> 6) );
+        dst[i + 3*stride] = av_clip_pixel( dst[i + 3*stride] + ((int)(b6 + b1) >> 6) );
+        dst[i + 4*stride] = av_clip_pixel( dst[i + 4*stride] + ((int)(b6 - b1) >> 6) );
+        dst[i + 5*stride] = av_clip_pixel( dst[i + 5*stride] + ((int)(b4 - b3) >> 6) );
+        dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((int)(b2 - b5) >> 6) );
+        dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((int)(b0 - b7) >> 6) );
     }
 
     memset(block, 0, 64 * sizeof(dctcoef));
@@ -261,15 +261,15 @@ void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int
 
     for(i=0; i<4; i++){
         const int offset= x_offset[i];
-        const int z0= temp[4*0+i] + temp[4*2+i];
-        const int z1= temp[4*0+i] - temp[4*2+i];
-        const int z2= temp[4*1+i] - temp[4*3+i];
-        const int z3= temp[4*1+i] + temp[4*3+i];
-
-        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
-        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
-        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
-        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
+        const SUINT z0= temp[4*0+i] + temp[4*2+i];
+        const SUINT z1= temp[4*0+i] - temp[4*2+i];
+        const SUINT z2= temp[4*1+i] - temp[4*3+i];
+        const SUINT z3= temp[4*1+i] + temp[4*3+i];
+
+        output[stride* 0+offset]= (int)((z0 + z3)*qmul + 128 ) >> 8;
+        output[stride* 1+offset]= (int)((z1 + z2)*qmul + 128 ) >> 8;
+        output[stride* 4+offset]= (int)((z1 - z2)*qmul + 128 ) >> 8;
+        output[stride* 5+offset]= (int)((z0 - z3)*qmul + 128 ) >> 8;
     }
 #undef stride
 }
@@ -289,22 +289,22 @@ void FUNCC(ff_h264_chroma422_dc_dequant_idct)(int16_t *_block, int qmul){
 
     for(i=0; i<2; i++){
         const int offset= x_offset[i];
-        const int z0= temp[2*0+i] + temp[2*2+i];
-        const int z1= temp[2*0+i] - temp[2*2+i];
-        const int z2= temp[2*1+i] - temp[2*3+i];
-        const int z3= temp[2*1+i] + temp[2*3+i];
-
-        block[stride*0+offset]= ((z0 + z3)*qmul + 128) >> 8;
-        block[stride*1+offset]= ((z1 + z2)*qmul + 128) >> 8;
-        block[stride*2+offset]= ((z1 - z2)*qmul + 128) >> 8;
-        block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8;
+        const SUINT z0= temp[2*0+i] + temp[2*2+i];
+        const SUINT z1= temp[2*0+i] - temp[2*2+i];
+        const SUINT z2= temp[2*1+i] - temp[2*3+i];
+        const SUINT z3= temp[2*1+i] + temp[2*3+i];
+
+        block[stride*0+offset]= (int)((z0 + z3)*qmul + 128) >> 8;
+        block[stride*1+offset]= (int)((z1 + z2)*qmul + 128) >> 8;
+        block[stride*2+offset]= (int)((z1 - z2)*qmul + 128) >> 8;
+        block[stride*3+offset]= (int)((z0 - z3)*qmul + 128) >> 8;
     }
 }
 
 void FUNCC(ff_h264_chroma_dc_dequant_idct)(int16_t *_block, int qmul){
     const int stride= 16*2;
     const int xStride= 16;
-    int a,b,c,d,e;
+    SUINT a,b,c,d,e;
     dctcoef *block = (dctcoef*)_block;
 
     a= block[stride*0 + xStride*0];
@@ -317,8 +317,8 @@ void FUNCC(ff_h264_chroma_dc_dequant_idct)(int16_t *_block, int qmul){
     b= c-d;
     c= c+d;
 
-    block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
-    block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
-    block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
-    block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
+    block[stride*0 + xStride*0]= (int)((a+c)*qmul) >> 7;
+    block[stride*0 + xStride*1]= (int)((e+b)*qmul) >> 7;
+    block[stride*1 + xStride*0]= (int)((a-c)*qmul) >> 7;
+    block[stride*1 + xStride*1]= (int)((e-b)*qmul) >> 7;
 }
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 135babc..5632a58 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "h264pred.h"
@@ -42,6 +43,14 @@
 #include "h264pred_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "h264pred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264pred_template.c"
+#undef BIT_DEPTH
+
 static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright,
                                    ptrdiff_t stride)
 {
@@ -401,7 +410,7 @@ static void pred8x8_tm_vp8_c(uint8_t *src, ptrdiff_t stride)
  */
 av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
                                const int bit_depth,
-                               const int chroma_format_idc)
+                               int chroma_format_idc)
 {
 #undef FUNC
 #undef FUNCC
@@ -571,7 +580,14 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
         case 10:
             H264_PRED(10)
             break;
+        case 12:
+            H264_PRED(12)
+            break;
+        case 14:
+            H264_PRED(14)
+            break;
         default:
+            av_assert0(bit_depth<=8);
             H264_PRED(8)
             break;
     }
@@ -582,4 +598,6 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
         ff_h264_pred_init_arm(h, codec_id, bit_depth, chroma_format_idc);
     if (ARCH_X86)
         ff_h264_pred_init_x86(h, codec_id, bit_depth, chroma_format_idc);
+    if (ARCH_MIPS)
+        ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index 795d8f3..2863dc9 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -102,8 +102,7 @@ typedef struct H264PredContext {
     void(*pred8x8l_add[2])(uint8_t *pix /*align  8*/,
                            int16_t *block /*align 16*/, ptrdiff_t stride);
     void(*pred8x8l_filter_add[2])(uint8_t *pix /*align  8*/,
-                                  int16_t *block /*align 16*/,
-                                  int topleft, int topright, ptrdiff_t stride);
+                           int16_t *block /*align 16*/, int topleft, int topright, ptrdiff_t stride);
     void(*pred8x8_add[3])(uint8_t *pix /*align  8*/,
                           const int *block_offset,
                           int16_t *block /*align 16*/, ptrdiff_t stride);
@@ -121,5 +120,7 @@ void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
                            const int bit_depth, const int chroma_format_idc);
 void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                            const int bit_depth, const int chroma_format_idc);
+void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                            const int bit_depth, const int chroma_format_idc);
 
 #endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 02494aa..2b30fff 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
                                     ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a= AV_RN4PA(src-stride);
 
     AV_WN4PA(src+0*stride, a);
@@ -48,7 +48,7 @@ static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
                                       ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
     AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
     AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
@@ -59,7 +59,7 @@ static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
                               ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
@@ -74,7 +74,7 @@ static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
                                    ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
 
@@ -88,7 +88,7 @@ static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
 
@@ -102,7 +102,7 @@ static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
 
     AV_WN4PA(src+0*stride, a);
@@ -115,7 +115,7 @@ static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
 
     AV_WN4PA(src+0*stride, a);
@@ -128,7 +128,7 @@ static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
 
     AV_WN4PA(src+0*stride, a);
@@ -166,7 +166,7 @@ static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                       ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -194,7 +194,7 @@ static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
 {
     pixel *src = (pixel*)_src;
     const pixel *topright = (const pixel*)_topright;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 //    LOAD_LEFT_EDGE
@@ -222,7 +222,7 @@ static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                           ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -251,7 +251,7 @@ static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
 {
     pixel *src = (pixel*)_src;
     const pixel *topright = (const pixel*)_topright;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 
@@ -277,7 +277,7 @@ static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                          ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
 
     src[0+0*stride]=(l0 + l1 + 1)>>1;
@@ -303,7 +303,7 @@ static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                            ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -330,7 +330,7 @@ static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
     const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
     const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
@@ -348,7 +348,7 @@ static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<16; i++){
         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
@@ -374,7 +374,7 @@ static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -393,7 +393,7 @@ static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -408,7 +408,7 @@ static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[i-stride];
@@ -423,7 +423,7 @@ static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
 {\
     int i;\
     pixel *src = (pixel*)_src;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
 }
 
@@ -440,7 +440,7 @@ static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
   int a;
   INIT_CLIP
   pixel *src = (pixel*)_src;
-  int stride = _stride/sizeof(pixel);
+  int stride = _stride>>(sizeof(pixel)-1);
   const pixel * const src0 = src +7-stride;
   const pixel *       src1 = src +8*stride-1;
   const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
@@ -489,7 +489,7 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
     const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
 
@@ -517,7 +517,7 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<8; i++){
         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
@@ -544,7 +544,7 @@ static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
     int i;\
     const pixel4 a = PIXEL_SPLAT_X4(v);\
     pixel *src = (pixel*)_src;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     for(i=0; i<8; i++){\
         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
@@ -567,7 +567,7 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc2;
     pixel4 dc0splat, dc2splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc2=0;
     for(i=0;i<4; i++){
@@ -599,7 +599,7 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc1;
     pixel4 dc0splat, dc1splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=0;
     for(i=0;i<4; i++){
@@ -647,7 +647,7 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc1, dc2;
     pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=dc2=0;
     for(i=0;i<4; i++){
@@ -713,6 +713,7 @@ static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
     }
 }
 
+//the following 4 function should not be optimized!
 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
 {
     FUNCC(pred8x8_top_dc)(src, stride);
@@ -771,7 +772,7 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
   int a;
   INIT_CLIP
   pixel *src = (pixel*)_src;
-  int stride = _stride/sizeof(pixel);
+  int stride = _stride>>(sizeof(pixel)-1);
   const pixel * const src0 = src +3-stride;
   const pixel *       src1 = src +4*stride-1;
   const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
@@ -885,7 +886,7 @@ static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
 }
@@ -893,7 +894,7 @@ static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_LEFT;
     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
@@ -903,7 +904,7 @@ static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_TOP;
     const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
@@ -913,7 +914,7 @@ static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
                                int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOP;
@@ -925,7 +926,7 @@ static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
                                        int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     pixel4 a;
 
     PREDICT_8x8_LOAD_LEFT;
@@ -940,7 +941,7 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
 {
     int y;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     pixel4 a, b;
 
     PREDICT_8x8_LOAD_TOP;
@@ -963,7 +964,7 @@ static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
@@ -986,7 +987,7 @@ static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                        int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1010,7 +1011,7 @@ static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1041,7 +1042,7 @@ static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                             int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1072,7 +1073,7 @@ static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + t1 + 1) >> 1;
@@ -1102,7 +1103,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_LEFT;
     SRC(0,0)= (l0 + l1 + 1) >> 1;
     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
@@ -1125,13 +1126,13 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
 }
 
 static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
-                                                int has_topright, ptrdiff_t _stride)
+                                     int has_topright, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
     const dctcoef *block = (const dctcoef*)_block;
     pixel pix[8];
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
 
     pix[0] = t0;
@@ -1143,16 +1144,16 @@ static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block,
     pix[6] = t6;
     pix[7] = t7;
 
-    for (i = 0; i < 8; i++) {
+    for(i=0; i<8; i++){
         pixel v = pix[i];
-        src[0 * stride] = v += block[0];
-        src[1 * stride] = v += block[8];
-        src[2 * stride] = v += block[16];
-        src[3 * stride] = v += block[24];
-        src[4 * stride] = v += block[32];
-        src[5 * stride] = v += block[40];
-        src[6 * stride] = v += block[48];
-        src[7 * stride] = v +  block[56];
+        src[0*stride]= v += block[0];
+        src[1*stride]= v += block[8];
+        src[2*stride]= v += block[16];
+        src[3*stride]= v += block[24];
+        src[4*stride]= v += block[32];
+        src[5*stride]= v += block[40];
+        src[6*stride]= v += block[48];
+        src[7*stride]= v +  block[56];
         src++;
         block++;
     }
@@ -1161,13 +1162,13 @@ static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block,
 }
 
 static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
-                                                  int has_topright, ptrdiff_t _stride)
+                               int has_topright, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
     const dctcoef *block = (const dctcoef*)_block;
     pixel pix[8];
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_LEFT;
 
     pix[0] = l0;
@@ -1179,18 +1180,18 @@ static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block
     pix[6] = l6;
     pix[7] = l7;
 
-    for (i = 0; i < 8; i++) {
+    for(i=0; i<8; i++){
         pixel v = pix[i];
-        src[0] = v += block[0];
-        src[1] = v += block[1];
-        src[2] = v += block[2];
-        src[3] = v += block[3];
-        src[4] = v += block[4];
-        src[5] = v += block[5];
-        src[6] = v += block[6];
-        src[7] = v +  block[7];
-        src   += stride;
-        block += 8;
+        src[0]= v += block[0];
+        src[1]= v += block[1];
+        src[2]= v += block[2];
+        src[3]= v += block[3];
+        src[4]= v += block[4];
+        src[5]= v += block[5];
+        src[6]= v += block[6];
+        src[7]= v +  block[7];
+        src+= stride;
+        block+= 8;
     }
 
     memset(_block, 0, sizeof(dctcoef) * 64);
@@ -1212,7 +1213,7 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<4; i++){
         pixel v = pix[0];
@@ -1233,7 +1234,7 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<4; i++){
         pixel v = pix[-1];
         pix[0]= v += block[0];
@@ -1253,7 +1254,7 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<8; i++){
         pixel v = pix[0];
@@ -1278,7 +1279,7 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<8; i++){
         pixel v = pix[-1];
         pix[0]= v += block[0];
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index ec46da2..50e82e2 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -2,26 +2,27 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "h264qpel.h"
 
+#define pixeltmp int16_t
 #define BIT_DEPTH 8
 #include "h264qpel_template.c"
 #undef BIT_DEPTH
@@ -33,6 +34,17 @@
 #define BIT_DEPTH 10
 #include "h264qpel_template.c"
 #undef BIT_DEPTH
+#undef pixeltmp
+
+#define pixeltmp int32_t
+#define BIT_DEPTH 12
+#include "h264qpel_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264qpel_template.c"
+#undef BIT_DEPTH
+
 
 av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
 {
@@ -76,6 +88,12 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
     case 10:
         SET_QPEL(10);
         break;
+    case 12:
+        SET_QPEL(12);
+        break;
+    case 14:
+        SET_QPEL(14);
+        break;
     }
 
     if (ARCH_AARCH64)
@@ -86,4 +104,6 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
         ff_h264qpel_init_ppc(c, bit_depth);
     if (ARCH_X86)
         ff_h264qpel_init_x86(c, bit_depth);
+    if (ARCH_MIPS)
+        ff_h264qpel_init_mips(c, bit_depth);
 }
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 97ce195..7c57ad0 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,5 +35,6 @@ void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264QPEL_H */
diff --git a/libavcodec/h264qpel_template.c b/libavcodec/h264qpel_template.c
index e846ac9..27c5b8f 100644
--- a/libavcodec/h264qpel_template.c
+++ b/libavcodec/h264qpel_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,14 +75,14 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstS
 }
 
 #define H264_LOWPASS(OPNAME, OP, OP2) \
-static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *_dst, const uint8_t *_src, int dstStride, int srcStride){\
+static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *p_dst, const uint8_t *p_src, int dstStride, int srcStride){\
     const int h=2;\
     INIT_CLIP\
     int i;\
-    pixel *dst = (pixel*)_dst;\
-    const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    pixel *dst = (pixel*)p_dst;\
+    const pixel *src = (const pixel*)p_src;\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
@@ -98,8 +98,8 @@ static av_unused void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -116,16 +116,16 @@ static av_unused void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const
     }\
 }\
 \
-static av_unused void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static av_unused void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=2;\
     const int w=2;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -156,8 +156,8 @@ static void FUNC(OPNAME ## h264_qpel4_h_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
@@ -175,8 +175,8 @@ static void FUNC(OPNAME ## h264_qpel4_v_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -197,16 +197,16 @@ static void FUNC(OPNAME ## h264_qpel4_v_lowpass)(uint8_t *_dst, const uint8_t *_
     }\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel4_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel4_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=4;\
     const int w=4;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -244,8 +244,8 @@ static void FUNC(OPNAME ## h264_qpel8_h_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
@@ -267,8 +267,8 @@ static void FUNC(OPNAME ## h264_qpel8_v_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -297,16 +297,16 @@ static void FUNC(OPNAME ## h264_qpel8_v_lowpass)(uint8_t *_dst, const uint8_t *_
     }\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel8_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel8_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=8;\
     const int w=8;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -368,7 +368,7 @@ static void FUNC(OPNAME ## h264_qpel16_h_lowpass)(uint8_t *dst, const uint8_t *s
     FUNC(OPNAME ## h264_qpel8_h_lowpass)(dst+8*sizeof(pixel), src+8*sizeof(pixel), dstStride, srcStride);\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, pixeltmp *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
     FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst                , tmp  , src                , dstStride, tmpStride, srcStride);\
     FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst+8*sizeof(pixel), tmp+8, src+8*sizeof(pixel), dstStride, tmpStride, srcStride);\
     src += 8*srcStride;\
@@ -480,13 +480,13 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc33)(uint8_t *dst, const uint
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc22)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     FUNC(OPNAME ## h264_qpel ## SIZE ## _hv_lowpass)(dst, tmp, src, stride, SIZE*sizeof(pixel), stride);\
 }\
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
@@ -496,7 +496,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc23)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
@@ -508,7 +508,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc12)(uint8_t *dst, const uint
 {\
     uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),  stride, SIZE + 5);\
@@ -521,7 +521,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint
 {\
     uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel), SIZE*sizeof(pixel),  stride, SIZE + 5);\
diff --git a/libavcodec/h265_metadata_bsf.c b/libavcodec/h265_metadata_bsf.c
index cb73210..0683cc2 100644
--- a/libavcodec/h265_metadata_bsf.c
+++ b/libavcodec/h265_metadata_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -239,7 +239,7 @@ static int h265_metadata_filter(AVBSFContext *bsf, AVPacket *out)
 
     err = ff_bsf_get_packet(bsf, &in);
     if (err < 0)
-        goto fail;
+        return err;
 
     err = ff_cbs_read_packet(ctx->cbc, au, in);
     if (err < 0) {
@@ -322,8 +322,10 @@ static int h265_metadata_filter(AVBSFContext *bsf, AVPacket *out)
 
     err = 0;
 fail:
-    ff_cbs_fragment_uninit(ctx->cbc, au);
+    ff_cbs_fragment_reset(ctx->cbc, au);
 
+    if (err < 0)
+        av_packet_unref(out);
     av_packet_free(&in);
 
     return err;
@@ -368,70 +370,76 @@ static int h265_metadata_init(AVBSFContext *bsf)
 
     err = 0;
 fail:
-    ff_cbs_fragment_uninit(ctx->cbc, au);
+    ff_cbs_fragment_reset(ctx->cbc, au);
     return err;
 }
 
 static void h265_metadata_close(AVBSFContext *bsf)
 {
     H265MetadataContext *ctx = bsf->priv_data;
+
+    ff_cbs_fragment_free(ctx->cbc, &ctx->access_unit);
     ff_cbs_close(&ctx->cbc);
 }
 
 #define OFFSET(x) offsetof(H265MetadataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption h265_metadata_options[] = {
     { "aud", "Access Unit Delimiter NAL units",
         OFFSET(aud), AV_OPT_TYPE_INT,
-        { .i64 = PASS }, PASS, REMOVE, 0, "aud" },
-    { "pass",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PASS   }, .unit = "aud" },
-    { "insert", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = INSERT }, .unit = "aud" },
-    { "remove", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE }, .unit = "aud" },
+        { .i64 = PASS }, PASS, REMOVE, FLAGS, "aud" },
+    { "pass",   NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = PASS   }, .flags = FLAGS, .unit = "aud" },
+    { "insert", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = INSERT }, .flags = FLAGS, .unit = "aud" },
+    { "remove", NULL, 0, AV_OPT_TYPE_CONST,
+        { .i64 = REMOVE }, .flags = FLAGS, .unit = "aud" },
 
     { "sample_aspect_ratio", "Set sample aspect ratio (table E-1)",
         OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL,
-        { .i64 = 0 }, 0, 65535 },
+        { .dbl = 0.0 }, 0, 65535, FLAGS },
 
     { "video_format", "Set video format (table E-2)",
         OFFSET(video_format), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 7 },
+        { .i64 = -1 }, -1, 7, FLAGS },
     { "video_full_range_flag", "Set video full range flag",
         OFFSET(video_full_range_flag), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 1 },
+        { .i64 = -1 }, -1, 1, FLAGS },
     { "colour_primaries", "Set colour primaries (table E-3)",
         OFFSET(colour_primaries), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
     { "transfer_characteristics", "Set transfer characteristics (table E-4)",
         OFFSET(transfer_characteristics), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
     { "matrix_coefficients", "Set matrix coefficients (table E-5)",
         OFFSET(matrix_coefficients), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
 
     { "chroma_sample_loc_type", "Set chroma sample location type (figure E-1)",
         OFFSET(chroma_sample_loc_type), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 6 },
+        { .i64 = -1 }, -1, 6, FLAGS },
 
     { "tick_rate",
         "Set VPS and VUI tick rate (num_units_in_tick / time_scale)",
         OFFSET(tick_rate), AV_OPT_TYPE_RATIONAL,
-        { .i64 = 0 }, 0, UINT_MAX },
+        { .dbl = 0.0 }, 0, UINT_MAX, FLAGS },
     { "num_ticks_poc_diff_one",
         "Set VPS and VUI number of ticks per POC increment",
         OFFSET(num_ticks_poc_diff_one), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, INT_MAX },
+        { .i64 = -1 }, -1, INT_MAX, FLAGS },
 
     { "crop_left", "Set left border crop offset",
         OFFSET(crop_left), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, HEVC_MAX_WIDTH },
+        { .i64 = -1 }, -1, HEVC_MAX_WIDTH, FLAGS },
     { "crop_right", "Set right border crop offset",
         OFFSET(crop_right), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, HEVC_MAX_WIDTH },
+        { .i64 = -1 }, -1, HEVC_MAX_WIDTH, FLAGS },
     { "crop_top", "Set top border crop offset",
         OFFSET(crop_top), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, HEVC_MAX_HEIGHT },
+        { .i64 = -1 }, -1, HEVC_MAX_HEIGHT, FLAGS },
     { "crop_bottom", "Set bottom border crop offset",
         OFFSET(crop_bottom), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, HEVC_MAX_HEIGHT },
+        { .i64 = -1 }, -1, HEVC_MAX_HEIGHT, FLAGS },
 
     { NULL }
 };
@@ -440,7 +448,7 @@ static const AVClass h265_metadata_class = {
     .class_name = "h265_metadata_bsf",
     .item_name  = av_default_item_name,
     .option     = h265_metadata_options,
-    .version    = LIBAVCODEC_VERSION_MAJOR,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static const enum AVCodecID h265_metadata_codec_ids[] = {
diff --git a/libavcodec/h265_profile_level.c b/libavcodec/h265_profile_level.c
new file mode 100644
index 0000000..6604ca2
--- /dev/null
+++ b/libavcodec/h265_profile_level.c
@@ -0,0 +1,249 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h265_profile_level.h"
+
+
+static const H265LevelDescriptor h265_levels[] = {
+    // Name             CpbFactor-Main    MaxSliceSegmentsPerPicture
+    // |  level_idc            | CpbFactor-High           MaxLumaSr      BrFactor-High
+    // |      |   MaxLumaPs    |       |      | MaxTileRows   |   BrFactor-Main | MinCr-Main
+    // |      |      |         |       |      |   | MaxTileCols         |       |    |  MinCr-High
+    { "1",    30,    36864,    350,      0,  16,  1,  1,     552960,    128,      0, 2, 2 },
+    { "2",    60,   122880,   1500,      0,  16,  1,  1,    3686400,   1500,      0, 2, 2 },
+    { "2.1",  63,   245760,   3000,      0,  20,  1,  1,    7372800,   3000,      0, 2, 2 },
+    { "3",    90,   552960,   6000,      0,  30,  2,  2,   16588800,   6000,      0, 2, 2 },
+    { "3.1",  93,   983040,  10000,      0,  40,  3,  3,   33177600,  10000,      0, 2, 2 },
+    { "4",   120,  2228224,  12000,  30000,  75,  5,  5,   66846720,  12000,  30000, 4, 4 },
+    { "4.1", 123,  2228224,  20000,  50000,  75,  5,  5,  133693440,  20000,  50000, 4, 4 },
+    { "5",   150,  8912896,  25000, 100000, 200, 11, 10,  267386880,  25000, 100000, 6, 4 },
+    { "5.1", 153,  8912896,  40000, 160000, 200, 11, 10,  534773760,  40000, 160000, 8, 4 },
+    { "5.2", 156,  8912896,  60000, 240000, 200, 11, 10, 1069547520,  60000, 240000, 8, 4 },
+    { "6",   180, 35651584,  60000, 240000, 600, 22, 20, 1069547520,  60000, 240000, 8, 4 },
+    { "6.1", 183, 35651584, 120000, 480000, 600, 22, 20, 2139095040, 120000, 480000, 8, 4 },
+    { "6.2", 186, 35651584, 240000, 800000, 600, 22, 20, 4278190080, 240000, 800000, 6, 4 },
+};
+
+static const H265ProfileDescriptor h265_profiles[] = {
+    // profile_idc   8bit       one-picture
+    //   HT-profile  | 422chroma    | lower-bit-rate
+    //   |  14bit    |  | 420chroma |  | CpbVclFactor     MinCrScaleFactor
+    //   |  |  12bit |  |  | monochrome|    | CpbNalFactor    |
+    //   |  |  |  10bit |  |  | intra  |    |     | FormatCapabilityFactor
+    { "Monochrome", //  |  |  |  |  |  |    |     |     |     |
+      4, 0, 2, 1, 1, 1, 1, 1, 1, 0, 0, 1,  667,  733, 1.000, 1.0 },
+    { "Monochrome 12",
+      4, 0, 2, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1000, 1100, 1.500, 1.0 },
+    { "Monochrome 16",
+      4, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1333, 1467, 2.000, 1.0 },
+    { "Main",
+      1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1000, 1100, 1.500, 1.0 },
+    { "Screen-Extended Main",
+      9, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1000, 1100, 1.500, 1.0 },
+    { "Main 10",
+      2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1000, 1100, 1.875, 1.0 },
+    { "Screen-Extended Main 10",
+      9, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1000, 1100, 1.875, 1.0 },
+    { "Main 12",
+      4, 0, 2, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1500, 1650, 2.250, 1.0 },
+    { "Main Still Picture",
+      3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1000, 1100, 1.500, 1.0 },
+    { "Main 4:2:2 10",
+      4, 0, 2, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1667, 1833, 2.500, 0.5 },
+    { "Main 4:2:2 12",
+      4, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 1, 2000, 2200, 3.000, 0.5 },
+    { "Main 4:4:4",
+      4, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2000, 2200, 3.000, 0.5 },
+    { "High Throughput 4:4:4",
+      5, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2000, 2200, 3.000, 0.5 },
+    { "Screen-Extended Main 4:4:4",
+      9, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2000, 2200, 3.000, 0.5 },
+    { "Screen-Extended High Throughput 4:4:4",
+      9, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2000, 2200, 3.000, 0.5 },
+    { "Main 4:4:4 10",
+      4, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2500, 2750, 3.750, 0.5 },
+    { "High Throughput 4:4:4 10",
+      5, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2500, 2750, 3.750, 0.5 },
+    { "Screen-Extended Main 4:4:4 10",
+      9, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2500, 2750, 3.750, 0.5 },
+    { "Screen-Extended High Throughput 4:4:4 10",
+      9, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2500, 2750, 3.750, 0.5 },
+    { "Main 4:4:4 12",
+      4, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 3000, 3300, 4.500, 0.5 },
+    { "High Throughput 4:4:4 14",
+      5, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3500, 3850, 5.250, 0.5 },
+    { "Screen-Extended High Throughput 4:4:4 14",
+      9, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3500, 3850, 5.250, 0.5 },
+    { "Main Intra",
+      4, 0, 2, 1, 1, 1, 1, 1, 0, 1, 0, 2, 1000, 1100, 1.500, 1.0 },
+    { "Main 10 Intra",
+      4, 0, 2, 1, 1, 0, 1, 1, 0, 1, 0, 2, 1000, 1100, 1.875, 1.0 },
+    { "Main 12 Intra",
+      4, 0, 2, 1, 0, 0, 1, 1, 0, 1, 0, 2, 1500, 1650, 2.250, 1.0 },
+    { "Main 4:2:2 10 Intra",
+      4, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 2, 1667, 1833, 2.500, 0.5 },
+    { "Main 4:2:2 12 Intra",
+      4, 0, 2, 1, 0, 0, 1, 0, 0, 1, 0, 2, 2000, 2200, 3.000, 0.5 },
+    { "Main 4:4:4 Intra",
+      4, 0, 2, 1, 1, 1, 0, 0, 0, 1, 0, 2, 2000, 2200, 3.000, 0.5 },
+    { "Main 4:4:4 10 Intra",
+      4, 0, 2, 1, 1, 0, 0, 0, 0, 1, 0, 2, 2500, 2750, 3.750, 0.5 },
+    { "Main 4:4:4 12 Intra",
+      4, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 2, 3000, 3300, 4.500, 0.5 },
+    { "Main 4:4:4 16 Intra",
+      4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 2, 4000, 4400, 6.000, 0.5 },
+    { "Main 4:4:4 Still Picture",
+      4, 0, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2000, 2200, 3.000, 0.5 },
+    { "Main 4:4:4 16 Still Picture",
+      4, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 2, 4000, 4400, 6.000, 0.5 },
+    { "High Throughput 4:4:4 16 Intra",
+      5, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 4000, 4400, 6.000, 0.5 },
+};
+
+
+const H265LevelDescriptor *ff_h265_get_level(int level_idc)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h265_levels); i++) {
+        if (h265_levels[i].level_idc == level_idc)
+            return &h265_levels[i];
+    }
+
+    return NULL;
+}
+
+const H265ProfileDescriptor *ff_h265_get_profile(const H265RawProfileTierLevel *ptl)
+{
+    int i;
+
+    if (ptl->general_profile_space)
+        return NULL;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h265_profiles); i++) {
+        const H265ProfileDescriptor *profile = &h265_profiles[i];
+
+        if (ptl->general_profile_idc &&
+            ptl->general_profile_idc != profile->profile_idc)
+            continue;
+        if (!ptl->general_profile_compatibility_flag[profile->profile_idc])
+            continue;
+
+#define check_flag(name) \
+        if (profile->name < 2) { \
+            if (profile->name != ptl->general_ ## name ## _constraint_flag) \
+                continue; \
+        }
+        check_flag(max_14bit);
+        check_flag(max_12bit);
+        check_flag(max_10bit);
+        check_flag(max_8bit);
+        check_flag(max_422chroma);
+        check_flag(max_420chroma);
+        check_flag(max_monochrome);
+        check_flag(intra);
+        check_flag(one_picture_only);
+        check_flag(lower_bit_rate);
+#undef check_flag
+
+        return profile;
+    }
+
+    return NULL;
+}
+
+const H265LevelDescriptor *ff_h265_guess_level(const H265RawProfileTierLevel *ptl,
+                                               int64_t bitrate,
+                                               int width, int height,
+                                               int slice_segments,
+                                               int tile_rows, int tile_cols,
+                                               int max_dec_pic_buffering)
+{
+    const H265ProfileDescriptor *profile;
+    int pic_size, lbr_flag, hbr_factor;
+    int i;
+
+    if (ptl)
+        profile = ff_h265_get_profile(ptl);
+    else
+        profile = NULL;
+    if (!profile) {
+        // Default to using multiplication factors for Main profile.
+        profile = &h265_profiles[3];
+    }
+
+    pic_size = width * height;
+
+    if (ptl)
+        lbr_flag = ptl->general_lower_bit_rate_constraint_flag;
+    else
+        lbr_flag = profile->lower_bit_rate > 0;
+    if (profile->profile_idc == 1 || profile->profile_idc == 2) {
+        hbr_factor = 1;
+    } else if (profile->high_throughput) {
+        if (profile->intra)
+            hbr_factor = 24 - 12 * lbr_flag;
+        else
+            hbr_factor = 6;
+    } else {
+        hbr_factor = 2 - lbr_flag;
+    }
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h265_levels); i++) {
+        const H265LevelDescriptor *level = &h265_levels[i];
+        int max_br, max_dpb_size;
+
+        if (pic_size > level->max_luma_ps)
+            continue;
+        if (width  * width  > 8 * level->max_luma_ps)
+            continue;
+        if (height * height > 8 * level->max_luma_ps)
+            continue;
+
+        if (slice_segments > level->max_slice_segments_per_picture)
+            continue;
+        if (tile_rows > level->max_tile_rows)
+            continue;
+        if (tile_cols > level->max_tile_cols)
+            continue;
+
+        if (ptl && ptl->general_tier_flag)
+            max_br = level->max_br_high;
+        else
+            max_br = level->max_br_main;
+        if (!max_br)
+            continue;
+        if (bitrate > (int64_t)profile->cpb_nal_factor * hbr_factor * max_br)
+            continue;
+
+        if (pic_size < (level->max_luma_ps >> 2))
+            max_dpb_size = 16;
+        else if (pic_size < (level->max_luma_ps >> 1))
+            max_dpb_size = 14;
+        else if (pic_size < (3 * level->max_luma_ps >> 2))
+            max_dpb_size = 9;
+        else
+            max_dpb_size = 7;
+        if (max_dec_pic_buffering > max_dpb_size)
+            continue;
+
+        return level;
+    }
+
+    return NULL;
+}
diff --git a/libavcodec/h265_profile_level.h b/libavcodec/h265_profile_level.h
new file mode 100644
index 0000000..12c00f0
--- /dev/null
+++ b/libavcodec/h265_profile_level.h
@@ -0,0 +1,89 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_H265_PROFILE_LEVEL_H
+#define AVCODEC_H265_PROFILE_LEVEL_H
+
+#include <stdint.h>
+
+#include "cbs_h265.h"
+
+
+typedef struct H265LevelDescriptor {
+    const char *name;
+    uint8_t     level_idc;
+
+    // Table A.6.
+    uint32_t    max_luma_ps;
+    uint32_t    max_cpb_main;
+    uint32_t    max_cpb_high;
+    uint16_t    max_slice_segments_per_picture;
+    uint8_t     max_tile_rows;
+    uint8_t     max_tile_cols;
+
+    // Table A.7.
+    uint32_t    max_luma_sr;
+    uint32_t    max_br_main;
+    uint32_t    max_br_high;
+    uint8_t     min_cr_base_main;
+    uint8_t     min_cr_base_high;
+} H265LevelDescriptor;
+
+typedef struct H265ProfileDescriptor {
+    const char *name;
+    uint8_t profile_idc;
+    uint8_t high_throughput;
+
+    // Tables A.2, A.3 and A.5.
+    uint8_t max_14bit;
+    uint8_t max_12bit;
+    uint8_t max_10bit;
+    uint8_t max_8bit;
+    uint8_t max_422chroma;
+    uint8_t max_420chroma;
+    uint8_t max_monochrome;
+    uint8_t intra;
+    uint8_t one_picture_only;
+    uint8_t lower_bit_rate;
+
+    // Table A.8.
+    uint16_t cpb_vcl_factor;
+    uint16_t cpb_nal_factor;
+    float format_capability_factor;
+    float min_cr_scale_factor;
+} H265ProfileDescriptor;
+
+
+const H265LevelDescriptor *ff_h265_get_level(int level_idc);
+
+const H265ProfileDescriptor *ff_h265_get_profile(const H265RawProfileTierLevel *ptl);
+
+
+/**
+ * Guess the level of a stream from some parameters.
+ *
+ * Unknown parameters may be zero, in which case they are ignored.
+ */
+const H265LevelDescriptor *ff_h265_guess_level(const H265RawProfileTierLevel *ptl,
+                                               int64_t bitrate,
+                                               int width, int height,
+                                               int slice_segments,
+                                               int tile_rows, int tile_cols,
+                                               int max_dec_pic_buffering);
+
+#endif /* AVCODEC_H265_PROFILE_LEVEL_H */
diff --git a/libavcodec/hap.c b/libavcodec/hap.c
index 770142c..1a330c9 100644
--- a/libavcodec/hap.c
+++ b/libavcodec/hap.c
@@ -2,20 +2,20 @@
  * Vidvox Hap utility functions
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,3 +53,25 @@ av_cold void ff_hap_free_context(HapContext *ctx)
     av_freep(&ctx->chunks);
     av_freep(&ctx->chunk_results);
 }
+
+int ff_hap_parse_section_header(GetByteContext *gbc, int *section_size,
+                                enum HapSectionType *section_type)
+{
+    if (bytestream2_get_bytes_left(gbc) < 4)
+        return AVERROR_INVALIDDATA;
+
+    *section_size = bytestream2_get_le24(gbc);
+    *section_type = bytestream2_get_byte(gbc);
+
+    if (*section_size == 0) {
+        if (bytestream2_get_bytes_left(gbc) < 4)
+            return AVERROR_INVALIDDATA;
+
+        *section_size = bytestream2_get_le32(gbc);
+    }
+
+    if (*section_size > bytestream2_get_bytes_left(gbc) || *section_size < 0)
+        return AVERROR_INVALIDDATA;
+    else
+        return 0;
+}
diff --git a/libavcodec/hap.h b/libavcodec/hap.h
index 9d847f7..bbeed11 100644
--- a/libavcodec/hap.h
+++ b/libavcodec/hap.h
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ enum HapTextureFormat {
     HAP_FMT_RGBDXT1   = 0x0B,
     HAP_FMT_RGBADXT5  = 0x0E,
     HAP_FMT_YCOCGDXT5 = 0x0F,
+    HAP_FMT_RGTC1     = 0x01,
 };
 
 enum HapCompressor {
@@ -65,12 +66,14 @@ typedef struct HapContext {
 
     enum HapTextureFormat opt_tex_fmt; /* Texture type (encoder only) */
     int opt_chunk_count; /* User-requested chunk count (encoder only) */
+    int opt_compressor; /* User-requested compressor (encoder only) */
 
     int chunk_count;
     HapChunk *chunks;
     int *chunk_results;      /* Results from threaded operations */
 
     int tex_rat;             /* Compression ratio */
+    int tex_rat2;             /* Compression ratio of the second texture */
     const uint8_t *tex_data; /* Compressed texture */
     uint8_t *tex_buf;        /* Buffer for compressed texture */
     size_t tex_size;         /* Size of the compressed texture */
@@ -79,8 +82,13 @@ typedef struct HapContext {
 
     int slice_count;         /* Number of slices for threaded operations */
 
+    int texture_count;      /* 2 for HAQA, 1 for other version */
+    int texture_section_size; /* size of the part of the texture section (for HAPQA) */
+    int uncompress_pix_size; /* nb of byte / pixel for the target picture */
+
     /* Pointer to the selected compress or decompress function */
     int (*tex_fun)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*tex_fun2)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
 } HapContext;
 
 /*
@@ -95,4 +103,10 @@ int ff_hap_set_chunk_count(HapContext *ctx, int count, int first_in_frame);
  */
 av_cold void ff_hap_free_context(HapContext *ctx);
 
+/* The first three bytes are the size of the section past the header, or zero
+ * if the length is stored in the next long word. The fourth byte in the first
+ * long word indicates the type of the current section. */
+int ff_hap_parse_section_header(GetByteContext *gbc, int *section_size,
+                                enum HapSectionType *section_type);
+
 #endif /* AVCODEC_HAP_H */
diff --git a/libavcodec/hapdec.c b/libavcodec/hapdec.c
index 1770718..8c84577 100644
--- a/libavcodec/hapdec.c
+++ b/libavcodec/hapdec.c
@@ -3,20 +3,22 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * HapQA and HAPAlphaOnly added by Jokyo Images
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +26,7 @@
  * @file
  * Hap decoder
  *
- * Fourcc: Hap1, Hap5, HapY
+ * Fourcc: Hap1, Hap5, HapY, HapA, HapM
  *
  * https://github.com/Vidvox/hap/blob/master/documentation/HapVideoDRAFT.md
  */
@@ -41,31 +43,6 @@
 #include "texturedsp.h"
 #include "thread.h"
 
-/* The first three bytes are the size of the section past the header, or zero
- * if the length is stored in the next long word. The fourth byte in the first
- * long word indicates the type of the current section. */
-static int parse_section_header(GetByteContext *gbc, int *section_size,
-                                enum HapSectionType *section_type)
-{
-    if (bytestream2_get_bytes_left(gbc) < 4)
-        return AVERROR_INVALIDDATA;
-
-    *section_size = bytestream2_get_le24(gbc);
-    *section_type = bytestream2_get_byte(gbc);
-
-    if (*section_size == 0) {
-        if (bytestream2_get_bytes_left(gbc) < 4)
-            return AVERROR_INVALIDDATA;
-
-        *section_size = bytestream2_get_le32(gbc);
-    }
-
-    if (*section_size > bytestream2_get_bytes_left(gbc))
-        return AVERROR_INVALIDDATA;
-    else
-        return 0;
-}
-
 static int hap_parse_decode_instructions(HapContext *ctx, int size)
 {
     GetByteContext *gbc = &ctx->gbc;
@@ -76,7 +53,7 @@ static int hap_parse_decode_instructions(HapContext *ctx, int size)
 
     while (size > 0) {
         int stream_remaining = bytestream2_get_bytes_left(gbc);
-        ret = parse_section_header(gbc, &section_size, &section_type);
+        ret = ff_hap_parse_section_header(gbc, &section_size, &section_type);
         if (ret != 0)
             return ret;
 
@@ -157,13 +134,16 @@ static int hap_parse_frame_header(AVCodecContext *avctx)
     const char *compressorstr;
     int i, ret;
 
-    ret = parse_section_header(gbc, &section_size, &section_type);
+    ret = ff_hap_parse_section_header(gbc, &ctx->texture_section_size, &section_type);
     if (ret != 0)
         return ret;
 
     if ((avctx->codec_tag == MKTAG('H','a','p','1') && (section_type & 0x0F) != HAP_FMT_RGBDXT1) ||
         (avctx->codec_tag == MKTAG('H','a','p','5') && (section_type & 0x0F) != HAP_FMT_RGBADXT5) ||
-        (avctx->codec_tag == MKTAG('H','a','p','Y') && (section_type & 0x0F) != HAP_FMT_YCOCGDXT5)) {
+        (avctx->codec_tag == MKTAG('H','a','p','Y') && (section_type & 0x0F) != HAP_FMT_YCOCGDXT5) ||
+        (avctx->codec_tag == MKTAG('H','a','p','A') && (section_type & 0x0F) != HAP_FMT_RGTC1) ||
+        ((avctx->codec_tag == MKTAG('H','a','p','M') && (section_type & 0x0F) != HAP_FMT_RGTC1) &&
+                                                        (section_type & 0x0F) != HAP_FMT_YCOCGDXT5)) {
         av_log(avctx, AV_LOG_ERROR,
                "Invalid texture format %#04x.\n", section_type & 0x0F);
         return AVERROR_INVALIDDATA;
@@ -176,7 +156,7 @@ static int hap_parse_frame_header(AVCodecContext *avctx)
             if (ret == 0) {
                 ctx->chunks[0].compressor = section_type & 0xF0;
                 ctx->chunks[0].compressed_offset = 0;
-                ctx->chunks[0].compressed_size = section_size;
+                ctx->chunks[0].compressed_size = ctx->texture_section_size;
             }
             if (ctx->chunks[0].compressor == HAP_COMP_NONE) {
                 compressorstr = "none";
@@ -185,7 +165,7 @@ static int hap_parse_frame_header(AVCodecContext *avctx)
             }
             break;
         case HAP_COMP_COMPLEX:
-            ret = parse_section_header(gbc, &section_size, &section_type);
+            ret = ff_hap_parse_section_header(gbc, &section_size, &section_type);
             if (ret == 0 && section_type != HAP_ST_DECODE_INSTRUCTIONS)
                 ret = AVERROR_INVALIDDATA;
             if (ret == 0)
@@ -265,8 +245,8 @@ static int decompress_chunks_thread(AVCodecContext *avctx, void *arg,
     return 0;
 }
 
-static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
-                                     int slice, int thread_nb)
+static int decompress_texture_thread_internal(AVCodecContext *avctx, void *arg,
+                                              int slice, int thread_nb, int texture_num)
 {
     HapContext *ctx = avctx->priv_data;
     AVFrame *frame = arg;
@@ -294,58 +274,117 @@ static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
         uint8_t *p = frame->data[0] + y * frame->linesize[0] * TEXTURE_BLOCK_H;
         int off  = y * w_block;
         for (x = 0; x < w_block; x++) {
-            ctx->tex_fun(p + x * 16, frame->linesize[0],
-                         d + (off + x) * ctx->tex_rat);
+            if (texture_num == 0) {
+                ctx->tex_fun(p + x * 4 * ctx->uncompress_pix_size, frame->linesize[0],
+                             d + (off + x) * ctx->tex_rat);
+            } else {
+                ctx->tex_fun2(p + x * 4 * ctx->uncompress_pix_size, frame->linesize[0],
+                              d + (off + x) * ctx->tex_rat2);
+            }
         }
     }
 
     return 0;
 }
 
+static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
+                                     int slice, int thread_nb)
+{
+    return decompress_texture_thread_internal(avctx, arg, slice, thread_nb, 0);
+}
+
+static int decompress_texture2_thread(AVCodecContext *avctx, void *arg,
+                                      int slice, int thread_nb)
+{
+    return decompress_texture_thread_internal(avctx, arg, slice, thread_nb, 1);
+}
+
 static int hap_decode(AVCodecContext *avctx, void *data,
                       int *got_frame, AVPacket *avpkt)
 {
     HapContext *ctx = avctx->priv_data;
     ThreadFrame tframe;
-    int ret, i;
+    int ret, i, t;
+    int tex_size;
+    int section_size;
+    enum HapSectionType section_type;
+    int start_texture_section = 0;
+    int tex_rat[2] = {0, 0};
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
 
-    /* Check for section header */
-    ret = hap_parse_frame_header(avctx);
-    if (ret < 0)
-        return ret;
+    tex_rat[0] = ctx->tex_rat;
+
+    /* check for multi texture header */
+    if (ctx->texture_count == 2) {
+        ret = ff_hap_parse_section_header(&ctx->gbc, &section_size, &section_type);
+        if (ret != 0)
+            return ret;
+        if ((section_type & 0x0F) != 0x0D) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid section type in 2 textures mode %#04x.\n", section_type);
+            return AVERROR_INVALIDDATA;
+        }
+        start_texture_section = 4;
+        tex_rat[1] = ctx->tex_rat2;
+    }
 
     /* Get the output frame ready to receive data */
     tframe.f = data;
     ret = ff_thread_get_buffer(avctx, &tframe, 0);
     if (ret < 0)
         return ret;
-    ff_thread_finish_setup(avctx);
-
-    /* Unpack the DXT texture */
-    if (hap_can_use_tex_in_place(ctx)) {
-        /* Only DXTC texture compression in a contiguous block */
-        ctx->tex_data = ctx->gbc.buffer;
-    } else {
-        /* Perform the second-stage decompression */
-        ret = av_reallocp(&ctx->tex_buf, ctx->tex_size);
+
+    for (t = 0; t < ctx->texture_count; t++) {
+        bytestream2_seek(&ctx->gbc, start_texture_section, SEEK_SET);
+
+        /* Check for section header */
+        ret = hap_parse_frame_header(avctx);
         if (ret < 0)
             return ret;
 
-        avctx->execute2(avctx, decompress_chunks_thread, NULL,
-                        ctx->chunk_results, ctx->chunk_count);
+        start_texture_section += ctx->texture_section_size + 4;
 
-        for (i = 0; i < ctx->chunk_count; i++) {
-            if (ctx->chunk_results[i] < 0)
-                return ctx->chunk_results[i];
+        if (avctx->codec->update_thread_context)
+            ff_thread_finish_setup(avctx);
+
+        /* Unpack the DXT texture */
+        if (hap_can_use_tex_in_place(ctx)) {
+            /* Only DXTC texture compression in a contiguous block */
+            ctx->tex_data = ctx->gbc.buffer;
+            tex_size = FFMIN(ctx->texture_section_size, bytestream2_get_bytes_left(&ctx->gbc));
+        } else {
+            /* Perform the second-stage decompression */
+            ret = av_reallocp(&ctx->tex_buf, ctx->tex_size);
+            if (ret < 0)
+                return ret;
+
+            avctx->execute2(avctx, decompress_chunks_thread, NULL,
+                            ctx->chunk_results, ctx->chunk_count);
+
+            for (i = 0; i < ctx->chunk_count; i++) {
+                if (ctx->chunk_results[i] < 0)
+                    return ctx->chunk_results[i];
+            }
+
+            ctx->tex_data = ctx->tex_buf;
+            tex_size = ctx->tex_size;
         }
 
-        ctx->tex_data = ctx->tex_buf;
-    }
+        if (tex_size < (avctx->coded_width  / TEXTURE_BLOCK_W)
+            *(avctx->coded_height / TEXTURE_BLOCK_H)
+            *tex_rat[t]) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficient data\n");
+            return AVERROR_INVALIDDATA;
+        }
 
-    /* Use the decompress function on the texture, one block per thread */
-    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, ctx->slice_count);
+        /* Use the decompress function on the texture, one block per thread */
+        if (t == 0){
+            avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, ctx->slice_count);
+        } else{
+            tframe.f = data;
+            avctx->execute2(avctx, decompress_texture2_thread, tframe.f, NULL, ctx->slice_count);
+        }
+    }
 
     /* Frame is ready to be output */
     tframe.f->pict_type = AV_PICTURE_TYPE_I;
@@ -371,26 +410,45 @@ static av_cold int hap_init(AVCodecContext *avctx)
     avctx->coded_width  = FFALIGN(avctx->width,  TEXTURE_BLOCK_W);
     avctx->coded_height = FFALIGN(avctx->height, TEXTURE_BLOCK_H);
 
-    /* Technically only one mode has alpha, but 32 bits are easier to handle */
-    avctx->pix_fmt = AV_PIX_FMT_RGBA;
-
     ff_texturedsp_init(&ctx->dxtc);
 
+    ctx->texture_count  = 1;
+    ctx->uncompress_pix_size = 4;
+
     switch (avctx->codec_tag) {
     case MKTAG('H','a','p','1'):
         texture_name = "DXT1";
         ctx->tex_rat = 8;
         ctx->tex_fun = ctx->dxtc.dxt1_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
         break;
     case MKTAG('H','a','p','5'):
         texture_name = "DXT5";
         ctx->tex_rat = 16;
         ctx->tex_fun = ctx->dxtc.dxt5_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGBA;
         break;
     case MKTAG('H','a','p','Y'):
         texture_name = "DXT5-YCoCg-scaled";
         ctx->tex_rat = 16;
         ctx->tex_fun = ctx->dxtc.dxt5ys_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
+        break;
+    case MKTAG('H','a','p','A'):
+        texture_name = "RGTC1";
+        ctx->tex_rat = 8;
+        ctx->tex_fun = ctx->dxtc.rgtc1u_gray_block;
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        ctx->uncompress_pix_size = 1;
+        break;
+    case MKTAG('H','a','p','M'):
+        texture_name  = "DXT5-YCoCg-scaled / RGTC1";
+        ctx->tex_rat  = 16;
+        ctx->tex_rat2 = 8;
+        ctx->tex_fun  = ctx->dxtc.dxt5ys_block;
+        ctx->tex_fun2 = ctx->dxtc.rgtc1u_alpha_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        ctx->texture_count = 2;
         break;
     default:
         return AVERROR_DECODER_NOT_FOUND;
@@ -415,7 +473,7 @@ static av_cold int hap_close(AVCodecContext *avctx)
 
 AVCodec ff_hap_decoder = {
     .name           = "hap",
-    .long_name      = NULL_IF_CONFIG_SMALL("Vidvox Hap decoder"),
+    .long_name      = NULL_IF_CONFIG_SMALL("Vidvox Hap"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HAP,
     .init           = hap_init,
diff --git a/libavcodec/hapenc.c b/libavcodec/hapenc.c
index bf2ccaa..3a1bc87 100644
--- a/libavcodec/hapenc.c
+++ b/libavcodec/hapenc.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,12 +52,14 @@ enum HapHeaderLength {
     HAP_HDR_LONG = 8,
 };
 
-static void compress_texture(AVCodecContext *avctx, const AVFrame *f)
+static int compress_texture(AVCodecContext *avctx, uint8_t *out, int out_length, const AVFrame *f)
 {
     HapContext *ctx = avctx->priv_data;
-    uint8_t *out = ctx->tex_buf;
     int i, j;
 
+    if (ctx->tex_size > out_length)
+        return AVERROR_BUFFER_TOO_SMALL;
+
     for (j = 0; j < avctx->height; j += 4) {
         for (i = 0; i < avctx->width; i += 4) {
             uint8_t *p = f->data[0] + i * 4 + j * f->linesize[0];
@@ -65,6 +67,8 @@ static void compress_texture(AVCodecContext *avctx, const AVFrame *f)
             out += step;
         }
     }
+
+    return 0;
 }
 
 /* section_length does not include the header */
@@ -118,7 +122,7 @@ static int hap_compress_frame(AVCodecContext *avctx, uint8_t *dst)
         /* If there is no gain from snappy, just use the raw texture. */
         if (chunk->compressed_size >= chunk->uncompressed_size) {
             av_log(avctx, AV_LOG_VERBOSE,
-                   "Snappy buffer bigger than uncompressed (%zu >= %zu bytes).\n",
+                   "Snappy buffer bigger than uncompressed (%"SIZE_SPECIFIER" >= %"SIZE_SPECIFIER" bytes).\n",
                    chunk->compressed_size, chunk->uncompressed_size);
             memcpy(chunk_dst, chunk_src, chunk->uncompressed_size);
             chunk->compressor = HAP_COMP_NONE;
@@ -196,17 +200,29 @@ static int hap_encode(AVCodecContext *avctx, AVPacket *pkt,
     int pktsize = FFMAX(ctx->tex_size, ctx->max_snappy * ctx->chunk_count) + header_length;
 
     /* Allocate maximum size packet, shrink later. */
-    ret = ff_alloc_packet(pkt, pktsize);
+    ret = ff_alloc_packet2(avctx, pkt, pktsize, header_length);
     if (ret < 0)
         return ret;
 
-    /* DXTC compression. */
-    compress_texture(avctx, frame);
+    if (ctx->opt_compressor == HAP_COMP_NONE) {
+        /* DXTC compression directly to the packet buffer. */
+        ret = compress_texture(avctx, pkt->data + header_length, pkt->size - header_length, frame);
+        if (ret < 0)
+            return ret;
 
-    /* Compress (using Snappy) the frame */
-    final_data_size = hap_compress_frame(avctx, pkt->data + header_length);
-    if (final_data_size < 0)
-        return final_data_size;
+        ctx->chunks[0].compressor = HAP_COMP_NONE;
+        final_data_size = ctx->tex_size;
+    } else {
+        /* DXTC compression. */
+        ret = compress_texture(avctx, ctx->tex_buf, ctx->tex_size, frame);
+        if (ret < 0)
+            return ret;
+
+        /* Compress (using Snappy) the frame */
+        final_data_size = hap_compress_frame(avctx, pkt->data + header_length);
+        if (final_data_size < 0)
+            return final_data_size;
+    }
 
     /* Write header at the start. */
     hap_write_frame_header(ctx, pkt->data, final_data_size + header_length);
@@ -267,10 +283,30 @@ static av_cold int hap_init(AVCodecContext *avctx)
     ctx->tex_size   = FFALIGN(avctx->width,  TEXTURE_BLOCK_W) *
                       FFALIGN(avctx->height, TEXTURE_BLOCK_H) * 4 / ratio;
 
-    /* Round the chunk count to divide evenly on DXT block edges */
-    corrected_chunk_count = av_clip(ctx->opt_chunk_count, 1, HAP_MAX_CHUNKS);
-    while ((ctx->tex_size / (64 / ratio)) % corrected_chunk_count != 0) {
-        corrected_chunk_count--;
+    switch (ctx->opt_compressor) {
+    case HAP_COMP_NONE:
+        /* No benefit chunking uncompressed data */
+        corrected_chunk_count = 1;
+
+        ctx->max_snappy = ctx->tex_size;
+        ctx->tex_buf = NULL;
+        break;
+    case HAP_COMP_SNAPPY:
+        /* Round the chunk count to divide evenly on DXT block edges */
+        corrected_chunk_count = av_clip(ctx->opt_chunk_count, 1, HAP_MAX_CHUNKS);
+        while ((ctx->tex_size / (64 / ratio)) % corrected_chunk_count != 0) {
+            corrected_chunk_count--;
+        }
+
+        ctx->max_snappy = snappy_max_compressed_length(ctx->tex_size / corrected_chunk_count);
+        ctx->tex_buf = av_malloc(ctx->tex_size);
+        if (!ctx->tex_buf) {
+            return AVERROR(ENOMEM);
+        }
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid compresor %02X\n", ctx->opt_compressor);
+        return AVERROR_INVALIDDATA;
     }
     if (corrected_chunk_count != ctx->opt_chunk_count) {
         av_log(avctx, AV_LOG_INFO, "%d chunks requested but %d used.\n",
@@ -280,12 +316,6 @@ static av_cold int hap_init(AVCodecContext *avctx)
     if (ret != 0)
         return ret;
 
-    ctx->max_snappy = snappy_max_compressed_length(ctx->tex_size / corrected_chunk_count);
-
-    ctx->tex_buf  = av_malloc(ctx->tex_size);
-    if (!ctx->tex_buf)
-        return AVERROR(ENOMEM);
-
     return 0;
 }
 
@@ -306,6 +336,9 @@ static const AVOption options[] = {
         { "hap_alpha", "Hap Alpha (DXT5 textures)", 0, AV_OPT_TYPE_CONST, { .i64 = HAP_FMT_RGBADXT5  }, 0, 0, FLAGS, "format" },
         { "hap_q",     "Hap Q (DXT5-YCoCg textures)", 0, AV_OPT_TYPE_CONST, { .i64 = HAP_FMT_YCOCGDXT5 }, 0, 0, FLAGS, "format" },
     { "chunks", "chunk count", OFFSET(opt_chunk_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 1, HAP_MAX_CHUNKS, FLAGS, },
+    { "compressor", "second-stage compressor", OFFSET(opt_compressor), AV_OPT_TYPE_INT, { .i64 = HAP_COMP_SNAPPY }, HAP_COMP_NONE, HAP_COMP_SNAPPY, FLAGS, "compressor" },
+        { "none",       "None", 0, AV_OPT_TYPE_CONST, { .i64 = HAP_COMP_NONE }, 0, 0, FLAGS, "compressor" },
+        { "snappy",     "Snappy", 0, AV_OPT_TYPE_CONST, { .i64 = HAP_COMP_SNAPPY }, 0, 0, FLAGS, "compressor" },
     { NULL },
 };
 
@@ -318,7 +351,7 @@ static const AVClass hapenc_class = {
 
 AVCodec ff_hap_encoder = {
     .name           = "hap",
-    .long_name      = NULL_IF_CONFIG_SMALL("Vidvox Hap encoder"),
+    .long_name      = NULL_IF_CONFIG_SMALL("Vidvox Hap"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HAP,
     .priv_data_size = sizeof(HapContext),
diff --git a/libavcodec/hapqa_extract_bsf.c b/libavcodec/hapqa_extract_bsf.c
new file mode 100644
index 0000000..5c221848
--- /dev/null
+++ b/libavcodec/hapqa_extract_bsf.c
@@ -0,0 +1,134 @@
+/*
+ * HAPQA extract bitstream filter
+ * Copyright (c) 2017 Jokyo Images
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * HAPQA extract bitstream filter
+ * extract one of the two textures of the HAQA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "bytestream.h"
+#include "hap.h"
+
+typedef struct HapqaExtractContext {
+    const AVClass *class;
+    int texture;/* index of the texture to keep (0 for rgb or 1 for alpha) */
+} HapqaExtractContext;
+
+static int check_texture(HapqaExtractContext *ctx, int section_type) {
+    if (((ctx->texture == 0)&&((section_type & 0x0F) == 0x0F)) || /* HapQ texture and rgb extract */
+        ((ctx->texture == 1)&&((section_type & 0x0F) == 0x01))) /* HapAlphaOnly texture and alpha extract */
+    {
+        return 1; /* the texture is the one to keep */
+    } else {
+        return 0;
+    }
+}
+
+static int hapqa_extract(AVBSFContext *bsf, AVPacket *pkt)
+{
+    HapqaExtractContext *ctx = bsf->priv_data;
+    GetByteContext gbc;
+    int section_size;
+    enum HapSectionType section_type;
+    int start_section_size;
+    int target_packet_size = 0;
+    int ret = 0;
+
+    ret = ff_bsf_get_packet_ref(bsf, pkt);
+    if (ret < 0)
+        return ret;
+
+    bytestream2_init(&gbc, pkt->data, pkt->size);
+    ret = ff_hap_parse_section_header(&gbc, &section_size, &section_type);
+    if (ret != 0)
+        goto fail;
+
+    if ((section_type & 0x0F) != 0x0D) {
+        av_log(bsf, AV_LOG_ERROR, "Invalid section type for HAPQA %#04x.\n", section_type & 0x0F);
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    start_section_size = 4;
+
+    bytestream2_seek(&gbc, start_section_size, SEEK_SET);/* go to start of the first texture */
+
+    ret = ff_hap_parse_section_header(&gbc, &section_size, &section_type);
+    if (ret != 0)
+        goto fail;
+
+    target_packet_size = section_size + 4;
+
+    if (check_texture(ctx, section_type) == 0) { /* the texture is not the one to keep */
+        start_section_size += 4 + section_size;
+        bytestream2_seek(&gbc, start_section_size, SEEK_SET);/* go to start of the second texture */
+        ret = ff_hap_parse_section_header(&gbc, &section_size, &section_type);
+        if (ret != 0)
+            goto fail;
+
+        target_packet_size = section_size + 4;
+
+        if (check_texture(ctx, section_type) == 0){ /* the second texture is not the one to keep */
+            av_log(bsf, AV_LOG_ERROR, "No valid texture found.\n");
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    pkt->data += start_section_size;
+    pkt->size = target_packet_size;
+
+fail:
+    if (ret < 0)
+        av_packet_unref(pkt);
+    return ret;
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_HAP, AV_CODEC_ID_NONE,
+};
+
+#define OFFSET(x) offsetof(HapqaExtractContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_BSF_PARAM)
+static const AVOption options[] = {
+    { "texture", "texture to keep", OFFSET(texture), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS, "texture" },
+        { "color", "keep HapQ texture",         0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "texture" },
+        { "alpha", "keep HapAlphaOnly texture", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "texture" },
+    { NULL },
+};
+
+static const AVClass hapqa_extract_class = {
+    .class_name = "hapqa_extract_bsf",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+const AVBitStreamFilter ff_hapqa_extract_bsf = {
+    .name       = "hapqa_extract",
+    .filter     = hapqa_extract,
+    .priv_data_size = sizeof(HapqaExtractContext),
+    .priv_class = &hapqa_extract_class,
+    .codec_ids  = codec_ids,
+};
diff --git a/libavcodec/hcom.c b/libavcodec/hcom.c
new file mode 100644
index 0000000..bce9e80
--- /dev/null
+++ b/libavcodec/hcom.c
@@ -0,0 +1,143 @@
+/*
+ * HCOM audio decoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+
+typedef struct HEntry {
+    int16_t l, r;
+} HEntry;
+
+typedef struct HCOMContext {
+    AVCodecContext *avctx;
+
+    uint8_t first_sample;
+    uint8_t sample;
+    int dict_entries;
+    int dict_entry;
+    int delta_compression;
+
+    HEntry *dict;
+} HCOMContext;
+
+static av_cold int hcom_init(AVCodecContext *avctx)
+{
+    HCOMContext *s = avctx->priv_data;
+
+    if (avctx->channels != 1) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (avctx->extradata_size <= 7)
+        return AVERROR_INVALIDDATA;
+    s->dict_entries = AV_RB16(avctx->extradata);
+    if (avctx->extradata_size < s->dict_entries * 4 + 7)
+        return AVERROR_INVALIDDATA;
+    s->delta_compression = AV_RB32(avctx->extradata + 2);
+    s->sample = s->first_sample = avctx->extradata[avctx->extradata_size - 1];
+
+    s->dict = av_calloc(s->dict_entries, sizeof(*s->dict));
+    if (!s->dict)
+        return AVERROR(ENOMEM);
+    for (int i = 0; i < s->dict_entries; i++) {
+        s->dict[i].l = AV_RB16(avctx->extradata + 6 + 4 * i);
+        s->dict[i].r = AV_RB16(avctx->extradata + 6 + 4 * i + 2);
+        if (s->dict[i].l >= 0 &&
+            (s->dict[i].l >= s->dict_entries ||
+             s->dict[i].r >= s->dict_entries))
+            return AVERROR_INVALIDDATA;
+    }
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_U8;
+    s->dict_entry = 0;
+
+    return 0;
+}
+
+static int hcom_decode(AVCodecContext *avctx, void *data,
+                       int *got_frame, AVPacket *pkt)
+{
+    HCOMContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    GetBitContext gb;
+    int ret, n = 0;
+
+    if (pkt->size > INT16_MAX)
+        return AVERROR_INVALIDDATA;
+
+    frame->nb_samples = pkt->size * 8;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    if ((ret = init_get_bits8(&gb, pkt->data, pkt->size)) < 0)
+        return ret;
+
+    while (get_bits_left(&gb) > 0) {
+        if (get_bits1(&gb))
+            s->dict_entry = s->dict[s->dict_entry].r;
+        else
+            s->dict_entry = s->dict[s->dict_entry].l;
+
+        if (s->dict[s->dict_entry].l < 0) {
+            int16_t datum;
+
+            datum = s->dict[s->dict_entry].r;
+
+            if (!s->delta_compression)
+                s->sample = 0;
+            s->sample = (s->sample + datum) & 0xFF;
+
+            frame->data[0][n++] = s->sample;
+
+            s->dict_entry = 0;
+        }
+    }
+
+    frame->nb_samples = n;
+
+    *got_frame = 1;
+
+    return pkt->size;
+}
+
+static av_cold int hcom_close(AVCodecContext *avctx)
+{
+    HCOMContext *s = avctx->priv_data;
+
+    av_freep(&s->dict);
+
+    return 0;
+}
+
+AVCodec ff_hcom_decoder = {
+    .name           = "hcom",
+    .long_name      = NULL_IF_CONFIG_SMALL("HCOM Audio"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_HCOM,
+    .priv_data_size = sizeof(HCOMContext),
+    .init           = hcom_init,
+    .close          = hcom_close,
+    .decode         = hcom_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 77bd6a6..56b5541 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -1,20 +1,20 @@
 /*
  * HEVC shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,7 +22,8 @@
 #define AVCODEC_HEVC_H
 
 /**
- * Table 7-3: NAL unit type codes
+ * Table 7-1 – NAL unit type codes and NAL unit type classes in
+ * T-REC-H.265-201802
  */
 enum HEVCNALUnitType {
     HEVC_NAL_TRAIL_N    = 0,
@@ -66,6 +67,29 @@ enum HEVCNALUnitType {
     HEVC_NAL_FD_NUT     = 38,
     HEVC_NAL_SEI_PREFIX = 39,
     HEVC_NAL_SEI_SUFFIX = 40,
+    HEVC_NAL_RSV_NVCL41 = 41,
+    HEVC_NAL_RSV_NVCL42 = 42,
+    HEVC_NAL_RSV_NVCL43 = 43,
+    HEVC_NAL_RSV_NVCL44 = 44,
+    HEVC_NAL_RSV_NVCL45 = 45,
+    HEVC_NAL_RSV_NVCL46 = 46,
+    HEVC_NAL_RSV_NVCL47 = 47,
+    HEVC_NAL_UNSPEC48   = 48,
+    HEVC_NAL_UNSPEC49   = 49,
+    HEVC_NAL_UNSPEC50   = 50,
+    HEVC_NAL_UNSPEC51   = 51,
+    HEVC_NAL_UNSPEC52   = 52,
+    HEVC_NAL_UNSPEC53   = 53,
+    HEVC_NAL_UNSPEC54   = 54,
+    HEVC_NAL_UNSPEC55   = 55,
+    HEVC_NAL_UNSPEC56   = 56,
+    HEVC_NAL_UNSPEC57   = 57,
+    HEVC_NAL_UNSPEC58   = 58,
+    HEVC_NAL_UNSPEC59   = 59,
+    HEVC_NAL_UNSPEC60   = 60,
+    HEVC_NAL_UNSPEC61   = 61,
+    HEVC_NAL_UNSPEC62   = 62,
+    HEVC_NAL_UNSPEC63   = 63,
 };
 
 enum HEVCSliceType {
@@ -119,6 +143,9 @@ enum {
     // A.4.1: table A.6 allows at most 20 tile columns for any level.
     HEVC_MAX_TILE_COLUMNS = 20,
 
+    // A.4.2: table A.6 allows at most 600 slice segments for any level.
+    HEVC_MAX_SLICE_SEGMENTS = 600,
+
     // 7.4.7.1: in the worst case (tiles_enabled_flag and
     // entropy_coding_sync_enabled_flag are both set), entry points can be
     // placed at the beginning of every Ctb row in every tile, giving an
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 23e05e2..faa36d5 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 #include "libavutil/common.h"
 
 #include "cabac_functions.h"
+#include "hevc_data.h"
 #include "hevc.h"
 #include "hevcdec.h"
 
@@ -67,65 +68,77 @@ static const int8_t num_bins_in_se[] = {
      2, // cbf_luma
      4, // cbf_cb, cbf_cr
      2, // transform_skip_flag[][]
+     2, // explicit_rdpcm_flag[][]
+     2, // explicit_rdpcm_dir_flag[][]
     18, // last_significant_coeff_x_prefix
     18, // last_significant_coeff_y_prefix
      0, // last_significant_coeff_x_suffix
      0, // last_significant_coeff_y_suffix
      4, // significant_coeff_group_flag
-    42, // significant_coeff_flag
+    44, // significant_coeff_flag
     24, // coeff_abs_level_greater1_flag
      6, // coeff_abs_level_greater2_flag
      0, // coeff_abs_level_remaining
      0, // coeff_sign_flag
+     8, // log2_res_scale_abs
+     2, // res_scale_sign_flag
+     1, // cu_chroma_qp_offset_flag
+     1, // cu_chroma_qp_offset_idx
 };
 
 /**
  * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
  */
 static const int elem_offset[sizeof(num_bins_in_se)] = {
-      0,
-      1,
-      2,
-      2,
-      2,
-      2,
-      2,
-      2,
-      5,
-      6,
-      9,
-     12,
-     13,
-     17,
-     17,
-     18,
-     18,
-     18,
-     20,
-     21,
-     22,
-     27,
-     29,
-     31,
-     33,
-     35,
-     35,
-     35,
-     36,
-     37,
-     40,
-     42,
-     46,
-     48,
-     66,
-     84,
-     84,
-     84,
-     88,
-    130,
-    154,
-    160,
-    160,
+    0, // sao_merge_flag
+    1, // sao_type_idx
+    2, // sao_eo_class
+    2, // sao_band_position
+    2, // sao_offset_abs
+    2, // sao_offset_sign
+    2, // end_of_slice_flag
+    2, // split_coding_unit_flag
+    5, // cu_transquant_bypass_flag
+    6, // skip_flag
+    9, // cu_qp_delta
+    12, // pred_mode
+    13, // part_mode
+    17, // pcm_flag
+    17, // prev_intra_luma_pred_mode
+    18, // mpm_idx
+    18, // rem_intra_luma_pred_mode
+    18, // intra_chroma_pred_mode
+    20, // merge_flag
+    21, // merge_idx
+    22, // inter_pred_idc
+    27, // ref_idx_l0
+    29, // ref_idx_l1
+    31, // abs_mvd_greater0_flag
+    33, // abs_mvd_greater1_flag
+    35, // abs_mvd_minus2
+    35, // mvd_sign_flag
+    35, // mvp_lx_flag
+    36, // no_residual_data_flag
+    37, // split_transform_flag
+    40, // cbf_luma
+    42, // cbf_cb, cbf_cr
+    46, // transform_skip_flag[][]
+    48, // explicit_rdpcm_flag[][]
+    50, // explicit_rdpcm_dir_flag[][]
+    52, // last_significant_coeff_x_prefix
+    70, // last_significant_coeff_y_prefix
+    88, // last_significant_coeff_x_suffix
+    88, // last_significant_coeff_y_suffix
+    88, // significant_coeff_group_flag
+    92, // significant_coeff_flag
+    136, // coeff_abs_level_greater1_flag
+    160, // coeff_abs_level_greater2_flag
+    166, // coeff_abs_level_remaining
+    166, // coeff_sign_flag
+    166, // log2_res_scale_abs
+    174, // res_scale_sign_flag
+    176, // cu_chroma_qp_offset_flag
+    177, // cu_chroma_qp_offset_idx
 };
 
 #define CNU 154
@@ -179,6 +192,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       94, 138, 182, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
        79, 108, 123,  63,
@@ -191,11 +208,21 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
       125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
       139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
+      141, 111,
       // coeff_abs_level_greater1_flag
       140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
       122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
       // coeff_abs_level_greater2_flag
-      138, 153, 136, 167, 152, 152, },
+      138, 153, 136, 167, 152, 152,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
     { // sao_merge_flag
       153,
       // sao_type_idx
@@ -242,6 +269,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       149, 107, 167, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
        94, 108, 123, 108,
@@ -254,11 +285,21 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
       154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
       153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
+      140, 140,
       // coeff_abs_level_greater1_flag
       154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
       136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
       // coeff_abs_level_greater2_flag
-      107, 167, 91, 122, 107, 167, },
+      107, 167, 91, 122, 107, 167,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
     { // sao_merge_flag
       153,
       // sao_type_idx
@@ -305,6 +346,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       149, 92, 167, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
        79, 108, 123,  93,
@@ -317,11 +362,89 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
       154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
       153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
+      140, 140,
       // coeff_abs_level_greater1_flag
       154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
       136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
       // coeff_abs_level_greater2_flag
-      107, 167, 91, 107, 107, 167, },
+      107, 167, 91, 107, 107, 167,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
+};
+
+static const uint8_t scan_1x1[1] = {
+    0,
+};
+
+static const uint8_t horiz_scan2x2_x[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t horiz_scan2x2_y[4] = {
+    0, 0, 1, 1
+};
+
+static const uint8_t horiz_scan4x4_x[16] = {
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+};
+
+static const uint8_t horiz_scan4x4_y[16] = {
+    0, 0, 0, 0,
+    1, 1, 1, 1,
+    2, 2, 2, 2,
+    3, 3, 3, 3,
+};
+
+static const uint8_t horiz_scan8x8_inv[8][8] = {
+    {  0,  1,  2,  3, 16, 17, 18, 19, },
+    {  4,  5,  6,  7, 20, 21, 22, 23, },
+    {  8,  9, 10, 11, 24, 25, 26, 27, },
+    { 12, 13, 14, 15, 28, 29, 30, 31, },
+    { 32, 33, 34, 35, 48, 49, 50, 51, },
+    { 36, 37, 38, 39, 52, 53, 54, 55, },
+    { 40, 41, 42, 43, 56, 57, 58, 59, },
+    { 44, 45, 46, 47, 60, 61, 62, 63, },
+};
+
+static const uint8_t diag_scan2x2_x[4] = {
+    0, 0, 1, 1,
+};
+
+static const uint8_t diag_scan2x2_y[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t diag_scan2x2_inv[2][2] = {
+    { 0, 2, },
+    { 1, 3, },
+};
+
+static const uint8_t diag_scan4x4_inv[4][4] = {
+    { 0,  2,  5,  9, },
+    { 1,  4,  8, 12, },
+    { 3,  7, 11, 14, },
+    { 6, 10, 13, 15, },
+};
+
+static const uint8_t diag_scan8x8_inv[8][8] = {
+    {  0,  2,  5,  9, 14, 20, 27, 35, },
+    {  1,  4,  8, 13, 19, 26, 34, 42, },
+    {  3,  7, 12, 18, 25, 33, 41, 48, },
+    {  6, 11, 17, 24, 32, 40, 47, 53, },
+    { 10, 16, 23, 31, 39, 46, 52, 57, },
+    { 15, 22, 30, 38, 45, 51, 56, 60, },
+    { 21, 29, 37, 44, 50, 55, 59, 62, },
+    { 28, 36, 43, 49, 54, 58, 61, 63, },
 };
 
 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
@@ -330,26 +453,26 @@ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
         (ctb_addr_ts % s->ps.sps->ctb_width == 2 ||
          (s->ps.sps->ctb_width == 2 &&
           ctb_addr_ts % s->ps.sps->ctb_width == 0))) {
-        memcpy(s->cabac_state, s->HEVClc.cabac_state, HEVC_CONTEXTS);
+        memcpy(s->cabac_state, s->HEVClc->cabac_state, HEVC_CONTEXTS);
     }
 }
 
 static void load_states(HEVCContext *s)
 {
-    memcpy(s->HEVClc.cabac_state, s->cabac_state, HEVC_CONTEXTS);
+    memcpy(s->HEVClc->cabac_state, s->cabac_state, HEVC_CONTEXTS);
 }
 
-static void cabac_reinit(HEVCLocalContext *lc)
+static int cabac_reinit(HEVCLocalContext *lc)
 {
-    skip_bytes(&lc->cc, 0);
+    return skip_bytes(&lc->cc, 0) == NULL ? AVERROR_INVALIDDATA : 0;
 }
 
-static void cabac_init_decoder(HEVCContext *s)
+static int cabac_init_decoder(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
     skip_bits(gb, 1);
     align_get_bits(gb);
-    ff_init_cabac_decoder(&s->HEVClc.cc,
+    return ff_init_cabac_decoder(&s->HEVClc->cc,
                           gb->buffer + get_bits_count(gb) / 8,
                           (get_bits_left(gb) + 7) / 8);
 }
@@ -371,14 +494,19 @@ static void cabac_init_state(HEVCContext *s)
         pre ^= pre >> 31;
         if (pre > 124)
             pre = 124 + (pre & 1);
-        s->HEVClc.cabac_state[i] = pre;
+        s->HEVClc->cabac_state[i] = pre;
     }
+
+    for (i = 0; i < 4; i++)
+        s->HEVClc->stat_coeff[i] = 0;
 }
 
-void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
+int ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
 {
     if (ctb_addr_ts == s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]) {
-        cabac_init_decoder(s);
+        int ret = cabac_init_decoder(s);
+        if (ret < 0)
+            return ret;
         if (s->sh.dependent_slice_segment_flag == 0 ||
             (s->ps.pps->tiles_enabled_flag &&
              s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]))
@@ -396,13 +524,27 @@ void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
     } else {
         if (s->ps.pps->tiles_enabled_flag &&
             s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
-            cabac_reinit(&s->HEVClc);
+            int ret;
+            if (s->threads_number == 1)
+                ret = cabac_reinit(s->HEVClc);
+            else {
+                ret = cabac_init_decoder(s);
+            }
+            if (ret < 0)
+                return ret;
             cabac_init_state(s);
         }
         if (s->ps.pps->entropy_coding_sync_enabled_flag) {
             if (ctb_addr_ts % s->ps.sps->ctb_width == 0) {
-                get_cabac_terminate(&s->HEVClc.cc);
-                cabac_reinit(&s->HEVClc);
+                int ret;
+                get_cabac_terminate(&s->HEVClc->cc);
+                if (s->threads_number == 1)
+                    ret = cabac_reinit(s->HEVClc);
+                else {
+                    ret = cabac_init_decoder(s);
+                }
+                if (ret < 0)
+                    return ret;
 
                 if (s->ps.sps->ctb_width == 1)
                     cabac_init_state(s);
@@ -411,9 +553,10 @@ void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
             }
         }
     }
+    return 0;
 }
 
-#define GET_CABAC(ctx) get_cabac(&s->HEVClc.cc, &s->HEVClc.cabac_state[ctx])
+#define GET_CABAC(ctx) get_cabac(&s->HEVClc->cc, &s->HEVClc->cabac_state[ctx])
 
 int ff_hevc_sao_merge_flag_decode(HEVCContext *s)
 {
@@ -425,7 +568,7 @@ int ff_hevc_sao_type_idx_decode(HEVCContext *s)
     if (!GET_CABAC(elem_offset[SAO_TYPE_IDX]))
         return 0;
 
-    if (!get_cabac_bypass(&s->HEVClc.cc))
+    if (!get_cabac_bypass(&s->HEVClc->cc))
         return SAO_BAND;
     return SAO_EDGE;
 }
@@ -433,10 +576,10 @@ int ff_hevc_sao_type_idx_decode(HEVCContext *s)
 int ff_hevc_sao_band_position_decode(HEVCContext *s)
 {
     int i;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc);
 
     for (i = 0; i < 4; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return value;
 }
 
@@ -445,26 +588,26 @@ int ff_hevc_sao_offset_abs_decode(HEVCContext *s)
     int i = 0;
     int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
 
-    while (i < length && get_cabac_bypass(&s->HEVClc.cc))
+    while (i < length && get_cabac_bypass(&s->HEVClc->cc))
         i++;
     return i;
 }
 
 int ff_hevc_sao_offset_sign_decode(HEVCContext *s)
 {
-    return get_cabac_bypass(&s->HEVClc.cc);
+    return get_cabac_bypass(&s->HEVClc->cc);
 }
 
 int ff_hevc_sao_eo_class_decode(HEVCContext *s)
 {
-    int ret = get_cabac_bypass(&s->HEVClc.cc) << 1;
-    ret    |= get_cabac_bypass(&s->HEVClc.cc);
+    int ret = get_cabac_bypass(&s->HEVClc->cc) << 1;
+    ret    |= get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
 
 int ff_hevc_end_of_slice_flag_decode(HEVCContext *s)
 {
-    return get_cabac_terminate(&s->HEVClc.cc);
+    return get_cabac_terminate(&s->HEVClc->cc);
 }
 
 int ff_hevc_cu_transquant_bypass_flag_decode(HEVCContext *s)
@@ -476,12 +619,12 @@ int ff_hevc_skip_flag_decode(HEVCContext *s, int x0, int y0, int x_cb, int y_cb)
 {
     int min_cb_width = s->ps.sps->min_cb_width;
     int inc = 0;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
-    if (s->HEVClc.ctb_left_flag || x0b)
+    if (s->HEVClc->ctb_left_flag || x0b)
         inc = !!SAMPLE_CTB(s->skip_flag, x_cb - 1, y_cb);
-    if (s->HEVClc.ctb_up_flag || y0b)
+    if (s->HEVClc->ctb_up_flag || y0b)
         inc += !!SAMPLE_CTB(s->skip_flag, x_cb, y_cb - 1);
 
     return GET_CABAC(elem_offset[SKIP_FLAG] + inc);
@@ -499,22 +642,40 @@ int ff_hevc_cu_qp_delta_abs(HEVCContext *s)
     }
     if (prefix_val >= 5) {
         int k = 0;
-        while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc)) {
+        while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) {
             suffix_val += 1 << k;
             k++;
         }
-        if (k == CABAC_MAX_BIN)
+        if (k == CABAC_MAX_BIN) {
             av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+            return AVERROR_INVALIDDATA;
+        }
 
         while (k--)
-            suffix_val += get_cabac_bypass(&s->HEVClc.cc) << k;
+            suffix_val += get_cabac_bypass(&s->HEVClc->cc) << k;
     }
     return prefix_val + suffix_val;
 }
 
 int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s)
 {
-    return get_cabac_bypass(&s->HEVClc.cc);
+    return get_cabac_bypass(&s->HEVClc->cc);
+}
+
+int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_FLAG]);
+}
+
+int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s)
+{
+    int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
+    int i = 0;
+
+    while (i < c_max && GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
+        i++;
+
+    return i;
 }
 
 int ff_hevc_pred_mode_decode(HEVCContext *s)
@@ -525,14 +686,14 @@ int ff_hevc_pred_mode_decode(HEVCContext *s)
 int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, int y0)
 {
     int inc = 0, depth_left = 0, depth_top = 0;
-    int x0b  = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b  = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b  = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b  = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
     int x_cb = x0 >> s->ps.sps->log2_min_cb_size;
     int y_cb = y0 >> s->ps.sps->log2_min_cb_size;
 
-    if (s->HEVClc.ctb_left_flag || x0b)
+    if (s->HEVClc->ctb_left_flag || x0b)
         depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1];
-    if (s->HEVClc.ctb_up_flag || y0b)
+    if (s->HEVClc->ctb_up_flag || y0b)
         depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb];
 
     inc += (depth_left > ct_depth);
@@ -546,7 +707,7 @@ int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
     if (GET_CABAC(elem_offset[PART_MODE])) // 1
         return PART_2Nx2N;
     if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
-        if (s->HEVClc.cu.pred_mode == MODE_INTRA) // 0
+        if (s->HEVClc->cu.pred_mode == MODE_INTRA) // 0
             return PART_NxN;
         if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01
             return PART_2NxN;
@@ -566,21 +727,21 @@ int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
     if (GET_CABAC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
         if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 011
             return PART_2NxN;
-        if (get_cabac_bypass(&s->HEVClc.cc)) // 0101
+        if (get_cabac_bypass(&s->HEVClc->cc)) // 0101
             return PART_2NxnD;
         return PART_2NxnU; // 0100
     }
 
     if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 001
         return PART_Nx2N;
-    if (get_cabac_bypass(&s->HEVClc.cc)) // 0001
+    if (get_cabac_bypass(&s->HEVClc->cc)) // 0001
         return PART_nRx2N;
     return PART_nLx2N;  // 0000
 }
 
 int ff_hevc_pcm_flag_decode(HEVCContext *s)
 {
-    return get_cabac_terminate(&s->HEVClc.cc);
+    return get_cabac_terminate(&s->HEVClc->cc);
 }
 
 int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
@@ -591,7 +752,7 @@ int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
 int ff_hevc_mpm_idx_decode(HEVCContext *s)
 {
     int i = 0;
-    while (i < 2 && get_cabac_bypass(&s->HEVClc.cc))
+    while (i < 2 && get_cabac_bypass(&s->HEVClc->cc))
         i++;
     return i;
 }
@@ -599,10 +760,10 @@ int ff_hevc_mpm_idx_decode(HEVCContext *s)
 int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCContext *s)
 {
     int i;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc);
 
     for (i = 0; i < 4; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return value;
 }
 
@@ -612,8 +773,8 @@ int ff_hevc_intra_chroma_pred_mode_decode(HEVCContext *s)
     if (!GET_CABAC(elem_offset[INTRA_CHROMA_PRED_MODE]))
         return 4;
 
-    ret  = get_cabac_bypass(&s->HEVClc.cc) << 1;
-    ret |= get_cabac_bypass(&s->HEVClc.cc);
+    ret  = get_cabac_bypass(&s->HEVClc->cc) << 1;
+    ret |= get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
 
@@ -622,7 +783,7 @@ int ff_hevc_merge_idx_decode(HEVCContext *s)
     int i = GET_CABAC(elem_offset[MERGE_IDX]);
 
     if (i != 0) {
-        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc.cc))
+        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc->cc))
             i++;
     }
     return i;
@@ -637,7 +798,7 @@ int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH)
 {
     if (nPbW + nPbH == 12)
         return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4);
-    if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc.ct.depth))
+    if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc->ct_depth))
         return PRED_BI;
 
     return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4);
@@ -652,7 +813,7 @@ int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx)
     while (i < max_ctx && GET_CABAC(elem_offset[REF_IDX_L0] + i))
         i++;
     if (i == 2) {
-        while (i < max && get_cabac_bypass(&s->HEVClc.cc))
+        while (i < max && get_cabac_bypass(&s->HEVClc->cc))
             i++;
     }
 
@@ -669,35 +830,37 @@ int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s)
     return GET_CABAC(elem_offset[NO_RESIDUAL_DATA_FLAG]);
 }
 
-int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s)
+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCContext *s)
 {
     return GET_CABAC(elem_offset[ABS_MVD_GREATER0_FLAG]);
 }
 
-int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s)
+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCContext *s)
 {
     return GET_CABAC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
 }
 
-int ff_hevc_mvd_decode(HEVCContext *s)
+static av_always_inline int mvd_decode(HEVCContext *s)
 {
     int ret = 2;
     int k = 1;
 
-    while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc)) {
-        ret += 1 << k;
+    while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) {
+        ret += 1U << k;
         k++;
     }
-    if (k == CABAC_MAX_BIN)
+    if (k == CABAC_MAX_BIN) {
         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+        return 0;
+    }
     while (k--)
-        ret += get_cabac_bypass(&s->HEVClc.cc) << k;
-    return get_cabac_bypass_sign(&s->HEVClc.cc, -ret);
+        ret += get_cabac_bypass(&s->HEVClc->cc) << k;
+    return get_cabac_bypass_sign(&s->HEVClc->cc, -ret);
 }
 
-int ff_hevc_mvd_sign_flag_decode(HEVCContext *s)
+static av_always_inline int mvd_sign_flag_decode(HEVCContext *s)
 {
-    return get_cabac_bypass_sign(&s->HEVClc.cc, -1);
+    return get_cabac_bypass_sign(&s->HEVClc->cc, -1);
 }
 
 int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size)
@@ -715,53 +878,73 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
 }
 
-int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
 {
     return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
 }
 
-#define LAST_SIG_COEFF(elem)                                                    \
-    int i = 0;                                                                  \
-    int max = (log2_size << 1) - 1;                                             \
-    int ctx_offset, ctx_shift;                                                  \
-                                                                                \
-    if (c_idx == 0) {                                                           \
-        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);             \
-        ctx_shift = (log2_size + 1) >> 2;                                       \
-    } else {                                                                    \
-        ctx_offset = 15;                                                        \
-        ctx_shift = log2_size - 2;                                              \
-    }                                                                           \
-    while (i < max &&                                                           \
-           GET_CABAC(elem_offset[elem] + (i >> ctx_shift) + ctx_offset))        \
-        i++;                                                                    \
-    return i;
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+{
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+}
 
-int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
 {
-    LAST_SIG_COEFF(LAST_SIGNIFICANT_COEFF_X_PREFIX)
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
 }
 
-int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size)
+int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
+    int i =0;
+
+    while (i < 4 && GET_CABAC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i))
+        i++;
+
+    return i;
+}
+
+int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+    return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
+}
+
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+                                                   int log2_size, int *last_scx_prefix, int *last_scy_prefix)
 {
-    LAST_SIG_COEFF(LAST_SIGNIFICANT_COEFF_Y_PREFIX)
+    int i = 0;
+    int max = (log2_size << 1) - 1;
+    int ctx_offset, ctx_shift;
+
+    if (!c_idx) {
+        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
+        ctx_shift = (log2_size + 1) >> 2;
+    } else {
+        ctx_offset = 15;
+        ctx_shift = log2_size - 2;
+    }
+    while (i < max &&
+           GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
+        i++;
+    *last_scx_prefix = i;
+
+    i = 0;
+    while (i < max &&
+           GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
+        i++;
+    *last_scy_prefix = i;
 }
 
-int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
+static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
                                                  int last_significant_coeff_prefix)
 {
     int i;
     int length = (last_significant_coeff_prefix >> 1) - 1;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc);
 
     for (i = 1; i < length; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return value;
 }
 
-int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
 {
     int inc;
 
@@ -769,58 +952,19 @@ int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int c
 
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
 }
-
-int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c, int y_c,
-                                          int log2_trafo_size, int scan_idx, int prev_sig)
+static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
+                                           int offset, const uint8_t *ctx_idx_map)
 {
-    static const uint8_t ctx_idx_map[] = {
-        0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8
-    };
-    int x_cg = x_c >> 2;
-    int y_cg = y_c >> 2;
-    int sig_ctx, inc;
-
-    if (x_c + y_c == 0) {
-        sig_ctx = 0;
-    } else if (log2_trafo_size == 2) {
-        sig_ctx = ctx_idx_map[(y_c << 2) + x_c];
-    } else {
-        switch (prev_sig) {
-        case 0: {
-                int x_off = x_c & 3;
-                int y_off = y_c & 3;
-                sig_ctx   = ((x_off + y_off) == 0) ? 2 : ((x_off + y_off) <= 2) ? 1 : 0;
-            }
-            break;
-        case 1:
-            sig_ctx = 2 - FFMIN(y_c & 3, 2);
-            break;
-        case 2:
-            sig_ctx = 2 - FFMIN(x_c & 3, 2);
-            break;
-        default:
-            sig_ctx = 2;
-        }
-
-        if (c_idx == 0 && (x_cg > 0 || y_cg > 0))
-            sig_ctx += 3;
-
-        if (log2_trafo_size == 3) {
-            sig_ctx += (scan_idx == SCAN_DIAG) ? 9 : 15;
-        } else {
-            sig_ctx += c_idx ? 12 : 21;
-        }
-    }
-
-    if (c_idx == 0)
-        inc = sig_ctx;
-    else
-        inc = sig_ctx + 27;
-
+    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+{
+    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
+}
+
+static av_always_inline int coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc)
 {
 
     if (c_idx > 0)
@@ -829,7 +973,7 @@ int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc)
+static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc)
 {
     if (c_idx > 0)
         inc += 4;
@@ -837,37 +981,577 @@ int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int base_level, int rc_rice_param)
+static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
 {
     int prefix = 0;
     int suffix = 0;
     int last_coeff_abs_level_remaining;
     int i;
 
-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc))
+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
         prefix++;
-    if (prefix == CABAC_MAX_BIN)
-        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
+
     if (prefix < 3) {
         for (i = 0; i < rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
     } else {
         int prefix_minus3 = prefix - 3;
+
+        if (prefix == CABAC_MAX_BIN || prefix_minus3 + rc_rice_param >= 31) {
+            av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
+            return 0;
+        }
+
         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
                                               << rc_rice_param) + suffix;
     }
     return last_coeff_abs_level_remaining;
 }
 
-int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb)
+static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
 {
     int i;
     int ret = 0;
 
     for (i = 0; i < nb; i++)
-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
+
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                int log2_trafo_size, enum ScanType scan_idx,
+                                int c_idx)
+{
+#define GET_COORD(offset, n)                                    \
+    do {                                                        \
+        x_c = (x_cg << 2) + scan_x_off[n];                      \
+        y_c = (y_cg << 2) + scan_y_off[n];                      \
+    } while (0)
+    HEVCLocalContext *lc = s->HEVClc;
+    int transform_skip_flag = 0;
+
+    int last_significant_coeff_x, last_significant_coeff_y;
+    int last_scan_pos;
+    int n_end;
+    int num_coeff = 0;
+    int greater1_ctx = 1;
+
+    int num_last_subset;
+    int x_cg_last_sig, y_cg_last_sig;
+
+    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+
+    ptrdiff_t stride = s->frame->linesize[c_idx];
+    int hshift = s->ps.sps->hshift[c_idx];
+    int vshift = s->ps.sps->vshift[c_idx];
+    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+    int explicit_rdpcm_flag = 0;
+    int explicit_rdpcm_dir_flag;
+
+    int trafo_size = 1 << log2_trafo_size;
+    int i;
+    int qp,shift,add,scale,scale_m;
+    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+    const uint8_t *scale_matrix = NULL;
+    uint8_t dc_scale;
+    int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+                                         lc->tu.intra_pred_mode_c;
+
+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+
+    // Derive QP for dequant
+    if (!lc->cu.cu_transquant_bypass_flag) {
+        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+        static const uint8_t rem6[51 + 4 * 6 + 1] = {
+            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+            4, 5, 0, 1, 2, 3, 4, 5, 0, 1
+        };
+
+        static const uint8_t div6[51 + 4 * 6 + 1] = {
+            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
+            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
+            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+            10, 10, 11, 11, 11, 11, 11, 11, 12, 12
+        };
+        int qp_y = lc->qp_y;
+
+        if (s->ps.pps->transform_skip_enabled_flag &&
+            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
+            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+        }
+
+        if (c_idx == 0) {
+            qp = qp_y + s->ps.sps->qp_bd_offset;
+        } else {
+            int qp_i, offset;
+
+            if (c_idx == 1)
+                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset +
+                         lc->tu.cu_qp_offset_cb;
+            else
+                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset +
+                         lc->tu.cu_qp_offset_cr;
+
+            qp_i = av_clip(qp_y + offset, - s->ps.sps->qp_bd_offset, 57);
+            if (s->ps.sps->chroma_format_idc == 1) {
+                if (qp_i < 30)
+                    qp = qp_i;
+                else if (qp_i > 43)
+                    qp = qp_i - 6;
+                else
+                    qp = qp_c[qp_i - 30];
+            } else {
+                if (qp_i > 51)
+                    qp = 51;
+                else
+                    qp = qp_i;
+            }
+
+            qp += s->ps.sps->qp_bd_offset;
+        }
+
+        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
+        add      = 1 << (shift-1);
+        scale    = level_scale[rem6[qp]] << (div6[qp]);
+        scale_m  = 16; // default when no custom scaling lists.
+        dc_scale = 16;
+
+        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
+            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
+
+            matrix_id = 3 * matrix_id + c_idx;
+
+            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+            if (log2_trafo_size >= 4)
+                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+        }
+    } else {
+        shift        = 0;
+        add          = 0;
+        scale        = 0;
+        dc_scale     = 0;
+    }
+
+    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+        if (explicit_rdpcm_flag) {
+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+        }
+    }
+
+    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+                                           &last_significant_coeff_x, &last_significant_coeff_y);
+
+    if (last_significant_coeff_x > 3) {
+        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
+        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
+        (2 + (last_significant_coeff_x & 1)) +
+        suffix;
+    }
+
+    if (last_significant_coeff_y > 3) {
+        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
+        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
+        (2 + (last_significant_coeff_y & 1)) +
+        suffix;
+    }
+
+    if (scan_idx == SCAN_VERT)
+        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
+
+    x_cg_last_sig = last_significant_coeff_x >> 2;
+    y_cg_last_sig = last_significant_coeff_y >> 2;
+
+    switch (scan_idx) {
+    case SCAN_DIAG: {
+        int last_x_c = last_significant_coeff_x & 3;
+        int last_y_c = last_significant_coeff_y & 3;
+
+        scan_x_off = ff_hevc_diag_scan4x4_x;
+        scan_y_off = ff_hevc_diag_scan4x4_y;
+        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+        if (trafo_size == 4) {
+            scan_x_cg = scan_1x1;
+            scan_y_cg = scan_1x1;
+        } else if (trafo_size == 8) {
+            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = diag_scan2x2_x;
+            scan_y_cg = diag_scan2x2_y;
+        } else if (trafo_size == 16) {
+            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan4x4_x;
+            scan_y_cg = ff_hevc_diag_scan4x4_y;
+        } else { // trafo_size == 32
+            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan8x8_x;
+            scan_y_cg = ff_hevc_diag_scan8x8_y;
+        }
+        break;
+    }
+    case SCAN_HORIZ:
+        scan_x_cg = horiz_scan2x2_x;
+        scan_y_cg = horiz_scan2x2_y;
+        scan_x_off = horiz_scan4x4_x;
+        scan_y_off = horiz_scan4x4_y;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+        break;
+    default: //SCAN_VERT
+        scan_x_cg = horiz_scan2x2_y;
+        scan_y_cg = horiz_scan2x2_x;
+        scan_x_off = horiz_scan4x4_y;
+        scan_y_off = horiz_scan4x4_x;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+        break;
+    }
+    num_coeff++;
+    num_last_subset = (num_coeff - 1) >> 4;
+
+    for (i = num_last_subset; i >= 0; i--) {
+        int n, m;
+        int x_cg, y_cg, x_c, y_c, pos;
+        int implicit_non_zero_coeff = 0;
+        int64_t trans_coeff_level;
+        int prev_sig = 0;
+        int offset = i << 4;
+        int rice_init = 0;
+
+        uint8_t significant_coeff_flag_idx[16];
+        uint8_t nb_significant_coeff_flag = 0;
+
+        x_cg = scan_x_cg[i];
+        y_cg = scan_y_cg[i];
+
+        if ((i < num_last_subset) && (i > 0)) {
+            int ctx_cg = 0;
+            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+
+            significant_coeff_group_flag[x_cg][y_cg] =
+                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+            implicit_non_zero_coeff = 1;
+        } else {
+            significant_coeff_group_flag[x_cg][y_cg] =
+            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+             (x_cg == 0 && y_cg == 0));
+        }
+
+        last_scan_pos = num_coeff - offset - 1;
+
+        if (i == num_last_subset) {
+            n_end = last_scan_pos - 1;
+            significant_coeff_flag_idx[0] = last_scan_pos;
+            nb_significant_coeff_flag = 1;
+        } else {
+            n_end = 15;
+        }
+
+        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
+        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
+
+        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
+            static const uint8_t ctx_idx_map[] = {
+                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
+                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
+                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
+                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
+                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+            };
+            const uint8_t *ctx_idx_map_p;
+            int scf_offset = 0;
+            if (s->ps.sps->transform_skip_context_enabled_flag &&
+                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
+                if (c_idx == 0) {
+                    scf_offset = 40;
+                } else {
+                    scf_offset = 14 + 27;
+                }
+            } else {
+                if (c_idx != 0)
+                    scf_offset = 27;
+                if (log2_trafo_size == 2) {
+                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+                } else {
+                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
+                    if (c_idx == 0) {
+                        if ((x_cg > 0 || y_cg > 0))
+                            scf_offset += 3;
+                        if (log2_trafo_size == 3) {
+                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+                        } else {
+                            scf_offset += 21;
+                        }
+                    } else {
+                        if (log2_trafo_size == 3)
+                            scf_offset += 9;
+                        else
+                            scf_offset += 12;
+                    }
+                }
+            }
+            for (n = n_end; n > 0; n--) {
+                x_c = scan_x_off[n];
+                y_c = scan_y_off[n];
+                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+                    nb_significant_coeff_flag++;
+                    implicit_non_zero_coeff = 0;
+                }
+            }
+            if (implicit_non_zero_coeff == 0) {
+                if (s->ps.sps->transform_skip_context_enabled_flag &&
+                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+                    if (c_idx == 0) {
+                        scf_offset = 42;
+                    } else {
+                        scf_offset = 16 + 27;
+                    }
+                } else {
+                    if (i == 0) {
+                        if (c_idx == 0)
+                            scf_offset = 0;
+                        else
+                            scf_offset = 27;
+                    } else {
+                        scf_offset = 2 + scf_offset;
+                    }
+                }
+                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                    nb_significant_coeff_flag++;
+                }
+            } else {
+                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                nb_significant_coeff_flag++;
+            }
+        }
+
+        n_end = nb_significant_coeff_flag;
+
+
+        if (n_end) {
+            int first_nz_pos_in_cg;
+            int last_nz_pos_in_cg;
+            int c_rice_param = 0;
+            int first_greater1_coeff_idx = -1;
+            uint8_t coeff_abs_level_greater1_flag[8];
+            uint16_t coeff_sign_flag;
+            int sum_abs = 0;
+            int sign_hidden;
+            int sb_type;
+
+
+            // initialize first elem of coeff_bas_level_greater1_flag
+            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+
+            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
+                else
+                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+                c_rice_param = lc->stat_coeff[sb_type] / 4;
+            }
+
+            if (!(i == num_last_subset) && greater1_ctx == 0)
+                ctx_set++;
+            greater1_ctx = 1;
+            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
+
+            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
+                int inc = (ctx_set << 2) + greater1_ctx;
+                coeff_abs_level_greater1_flag[m] =
+                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
+                if (coeff_abs_level_greater1_flag[m]) {
+                    greater1_ctx = 0;
+                    if (first_greater1_coeff_idx == -1)
+                        first_greater1_coeff_idx = m;
+                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
+                    greater1_ctx++;
+                }
+            }
+            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
+
+            if (lc->cu.cu_transquant_bypass_flag ||
+                (lc->cu.pred_mode ==  MODE_INTRA  &&
+                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
+                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
+                 explicit_rdpcm_flag)
+                sign_hidden = 0;
+            else
+                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+
+            if (first_greater1_coeff_idx != -1) {
+                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
+            }
+            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
+                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
+            } else {
+                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
+            }
+
+            for (m = 0; m < n_end; m++) {
+                n = significant_coeff_flag_idx[m];
+                GET_COORD(offset, n);
+                if (m < 8) {
+                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
+                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
+                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+                        trans_coeff_level += last_coeff_abs_level_remaining;
+                        if (trans_coeff_level > (3 << c_rice_param))
+                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+                                lc->stat_coeff[sb_type]++;
+                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+                                if (lc->stat_coeff[sb_type] > 0)
+                                    lc->stat_coeff[sb_type]--;
+                            rice_init = 1;
+                        }
+                    }
+                } else {
+                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
+                    if (trans_coeff_level > (3 << c_rice_param))
+                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+                            lc->stat_coeff[sb_type]++;
+                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+                            if (lc->stat_coeff[sb_type] > 0)
+                                lc->stat_coeff[sb_type]--;
+                        rice_init = 1;
+                    }
+                }
+                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
+                    sum_abs += trans_coeff_level;
+                    if (n == first_nz_pos_in_cg && (sum_abs&1))
+                        trans_coeff_level = -trans_coeff_level;
+                }
+                if (coeff_sign_flag >> 15)
+                    trans_coeff_level = -trans_coeff_level;
+                coeff_sign_flag <<= 1;
+                if(!lc->cu.cu_transquant_bypass_flag) {
+                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+                        if(y_c || x_c || log2_trafo_size < 4) {
+                            switch(log2_trafo_size) {
+                                case 3: pos = (y_c << 3) + x_c; break;
+                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
+                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
+                                default: pos = (y_c << 2) + x_c; break;
+                            }
+                            scale_m = scale_matrix[pos];
+                        } else {
+                            scale_m = dc_scale;
+                        }
+                    }
+                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
+                    if(trans_coeff_level < 0) {
+                        if((~trans_coeff_level) & 0xFffffffffff8000)
+                            trans_coeff_level = -32768;
+                    } else {
+                        if(trans_coeff_level & 0xffffffffffff8000)
+                            trans_coeff_level = 32767;
+                    }
+                }
+                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+            }
+        }
+    }
+
+    if (lc->cu.cu_transquant_bypass_flag) {
+        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
+
+            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+        }
+    } else {
+        if (transform_skip_flag) {
+            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+                      log2_trafo_size == 2 &&
+                      lc->cu.pred_mode == MODE_INTRA;
+            if (rot) {
+                for (i = 0; i < 8; i++)
+                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+            }
+
+            s->hevcdsp.dequant(coeffs, log2_trafo_size);
+
+            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+                                        lc->cu.pred_mode == MODE_INTRA &&
+                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
+
+                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+            }
+        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+            s->hevcdsp.transform_4x4_luma(coeffs);
+        } else {
+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+            if (max_xy == 0)
+                s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
+            else {
+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+                if (max_xy < 4)
+                    col_limit = FFMIN(4, col_limit);
+                else if (max_xy < 8)
+                    col_limit = FFMIN(8, col_limit);
+                else if (max_xy < 12)
+                    col_limit = FFMIN(24, col_limit);
+                s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
+            }
+        }
+    }
+    if (lc->tu.cross_pf) {
+        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+
+        for (i = 0; i < (trafo_size * trafo_size); i++) {
+            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+        }
+    }
+    s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride);
+}
+
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    int x = abs_mvd_greater0_flag_decode(s);
+    int y = abs_mvd_greater0_flag_decode(s);
+
+    if (x)
+        x += abs_mvd_greater1_flag_decode(s);
+    if (y)
+        y += abs_mvd_greater1_flag_decode(s);
+
+    switch (x) {
+    case 2: lc->pu.mvd.x = mvd_decode(s);           break;
+    case 1: lc->pu.mvd.x = mvd_sign_flag_decode(s); break;
+    case 0: lc->pu.mvd.x = 0;                       break;
+    }
+
+    switch (y) {
+    case 2: lc->pu.mvd.y = mvd_decode(s);           break;
+    case 1: lc->pu.mvd.y = mvd_sign_flag_decode(s); break;
+    case 0: lc->pu.mvd.y = 0;                       break;
+    }
+}
+
diff --git a/libavcodec/hevc_data.c b/libavcodec/hevc_data.c
index ff9548f..1633a41 100644
--- a/libavcodec/hevc_data.c
+++ b/libavcodec/hevc_data.c
@@ -1,20 +1,20 @@
 /*
  * HEVC shared tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hevc_data.h b/libavcodec/hevc_data.h
index d1d2c33..74558f0 100644
--- a/libavcodec/hevc_data.h
+++ b/libavcodec/hevc_data.h
@@ -1,20 +1,20 @@
 /*
  * HEVC shared data tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 4b71a89..6b98240 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -5,20 +5,20 @@
  * Copyright (C) 2013 Seppo Tomperi
  * Copyright (C) 2013 Wassim Hamidouche
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
 #include "cabac_functions.h"
 #include "hevcdec.h"
 
+#include "bit_depth_template.c"
+
 #define LUMA 0
 #define CB 1
 #define CR 2
@@ -58,28 +60,30 @@ static int chroma_tc(HEVCContext *s, int qp_y, int c_idx, int tc_offset)
         offset = s->ps.pps->cr_qp_offset;
 
     qp_i = av_clip(qp_y + offset, 0, 57);
-    if (qp_i < 30)
-        qp = qp_i;
-    else if (qp_i > 43)
-        qp = qp_i - 6;
-    else
-        qp = qp_c[qp_i - 30];
+    if (s->ps.sps->chroma_format_idc == 1) {
+        if (qp_i < 30)
+            qp = qp_i;
+        else if (qp_i > 43)
+            qp = qp_i - 6;
+        else
+            qp = qp_c[qp_i - 30];
+    } else {
+        qp = av_clip(qp_i, 0, 51);
+    }
 
     idxt = av_clip(qp + DEFAULT_INTRA_TC_OFFSET + tc_offset, 0, 53);
     return tctable[idxt];
 }
 
-static int get_qPy_pred(HEVCContext *s, int xC, int yC,
-                        int xBase, int yBase, int log2_cb_size)
+static int get_qPy_pred(HEVCContext *s, int xBase, int yBase, int log2_cb_size)
 {
-    HEVCLocalContext *lc     = &s->HEVClc;
+    HEVCLocalContext *lc     = s->HEVClc;
     int ctb_size_mask        = (1 << s->ps.sps->log2_ctb_size) - 1;
     int MinCuQpDeltaSizeMask = (1 << (s->ps.sps->log2_ctb_size -
                                       s->ps.pps->diff_cu_qp_delta_depth)) - 1;
     int xQgBase              = xBase - (xBase & MinCuQpDeltaSizeMask);
     int yQgBase              = yBase - (yBase & MinCuQpDeltaSizeMask);
     int min_cb_width         = s->ps.sps->min_cb_width;
-    int min_cb_height        = s->ps.sps->min_cb_height;
     int x_cb                 = xQgBase >> s->ps.sps->log2_min_cb_size;
     int y_cb                 = yQgBase >> s->ps.sps->log2_min_cb_size;
     int availableA           = (xBase   & ctb_size_mask) &&
@@ -93,46 +97,7 @@ static int get_qPy_pred(HEVCContext *s, int xC, int yC,
         lc->first_qp_group = !lc->tu.is_cu_qp_delta_coded;
         qPy_pred = s->sh.slice_qp;
     } else {
-        qPy_pred = lc->qp_y;
-        if (log2_cb_size < s->ps.sps->log2_ctb_size -
-                           s->ps.pps->diff_cu_qp_delta_depth) {
-            static const int offsetX[8][8] = {
-                { -1, 1, 3, 1, 7, 1, 3, 1 },
-                {  0, 0, 0, 0, 0, 0, 0, 0 },
-                {  1, 3, 1, 3, 1, 3, 1, 3 },
-                {  2, 2, 2, 2, 2, 2, 2, 2 },
-                {  3, 5, 7, 5, 3, 5, 7, 5 },
-                {  4, 4, 4, 4, 4, 4, 4, 4 },
-                {  5, 7, 5, 7, 5, 7, 5, 7 },
-                {  6, 6, 6, 6, 6, 6, 6, 6 }
-            };
-            static const int offsetY[8][8] = {
-                { 7, 0, 1, 2, 3, 4, 5, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 1, 0, 3, 2, 5, 4, 7, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 3, 0, 1, 2, 7, 4, 5, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 1, 0, 3, 2, 5, 4, 7, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 }
-            };
-            int xC0b = (xC - (xC & ctb_size_mask)) >> s->ps.sps->log2_min_cb_size;
-            int yC0b = (yC - (yC & ctb_size_mask)) >> s->ps.sps->log2_min_cb_size;
-            int idxX = (xQgBase  & ctb_size_mask)  >> s->ps.sps->log2_min_cb_size;
-            int idxY = (yQgBase  & ctb_size_mask)  >> s->ps.sps->log2_min_cb_size;
-            int idx_mask = ctb_size_mask >> s->ps.sps->log2_min_cb_size;
-            int x, y;
-
-            x = FFMIN(xC0b +  offsetX[idxX][idxY],             min_cb_width  - 1);
-            y = FFMIN(yC0b + (offsetY[idxX][idxY] & idx_mask), min_cb_height - 1);
-
-            if (xC0b == (lc->start_of_tiles_x >> s->ps.sps->log2_min_cb_size) &&
-                offsetX[idxX][idxY] == -1) {
-                x = (lc->end_of_tiles_x >> s->ps.sps->log2_min_cb_size) - 1;
-                y = yC0b - 1;
-            }
-            qPy_pred = s->qp_y_tab[y * min_cb_width + x];
-        }
+        qPy_pred = lc->qPy_pred;
     }
 
     // qPy_a
@@ -147,20 +112,22 @@ static int get_qPy_pred(HEVCContext *s, int xC, int yC,
     else
         qPy_b = s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width];
 
+    av_assert2(qPy_a >= -s->ps.sps->qp_bd_offset && qPy_a < 52);
+    av_assert2(qPy_b >= -s->ps.sps->qp_bd_offset && qPy_b < 52);
+
     return (qPy_a + qPy_b + 1) >> 1;
 }
 
-void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC,
-                     int xBase, int yBase, int log2_cb_size)
+void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase, int log2_cb_size)
 {
-    int qp_y = get_qPy_pred(s, xC, yC, xBase, yBase, log2_cb_size);
+    int qp_y = get_qPy_pred(s, xBase, yBase, log2_cb_size);
 
-    if (s->HEVClc.tu.cu_qp_delta != 0) {
+    if (s->HEVClc->tu.cu_qp_delta != 0) {
         int off = s->ps.sps->qp_bd_offset;
-        s->HEVClc.qp_y = FFUMOD(qp_y + s->HEVClc.tu.cu_qp_delta + 52 + 2 * off,
-                                52 + off) - off;
+        s->HEVClc->qp_y = FFUMOD(qp_y + s->HEVClc->tu.cu_qp_delta + 52 + 2 * off,
+                                 52 + off) - off;
     } else
-        s->HEVClc.qp_y = qp_y;
+        s->HEVClc->qp_y = qp_y;
 }
 
 static int get_qPy(HEVCContext *s, int xC, int yC)
@@ -171,15 +138,106 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
     return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
 }
 
-static void copy_CTB(uint8_t *dst, uint8_t *src,
-                     int width, int height, ptrdiff_t stride)
+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
+                     ptrdiff_t stride_dst, ptrdiff_t stride_src)
+{
+int i, j;
+
+    if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
+        for (i = 0; i < height; i++) {
+            for (j = 0; j < width; j+=8)
+                AV_COPY64U(dst+j, src+j);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else {
+        for (i = 0; i < height; i++) {
+            for (j = 0; j < width; j+=16)
+                AV_COPY128(dst+j, src+j);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{
+    if (pixel_shift)
+        *(uint16_t *)dst = *(uint16_t *)src;
+    else
+        *dst = *src;
+}
+
+static void copy_vert(uint8_t *dst, const uint8_t *src,
+                      int pixel_shift, int height,
+                      ptrdiff_t stride_dst, ptrdiff_t stride_src)
 {
     int i;
+    if (pixel_shift == 0) {
+        for (i = 0; i < height; i++) {
+            *dst = *src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else {
+        for (i = 0; i < height; i++) {
+            *(uint16_t *)dst = *(uint16_t *)src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
+                           ptrdiff_t stride_src, int x, int y, int width, int height,
+                           int c_idx, int x_ctb, int y_ctb)
+{
+    int sh = s->ps.sps->pixel_shift;
+    int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+    int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
 
-    for (i = 0; i < height; i++) {
-        memcpy(dst, src, width);
-        dst += stride;
-        src += stride;
+    /* copy horizontal edges */
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
+        src, width << sh);
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
+        src + stride_src * (height - 1), width << sh);
+
+    /* copy vertical edges */
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
+
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
+}
+
+static void restore_tqb_pixels(HEVCContext *s,
+                               uint8_t *src1, const uint8_t *dst1,
+                               ptrdiff_t stride_src, ptrdiff_t stride_dst,
+                               int x0, int y0, int width, int height, int c_idx)
+{
+    if ( s->ps.pps->transquant_bypass_enable_flag ||
+            (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
+        int x, y;
+        int min_pu_size  = 1 << s->ps.sps->log2_min_pu_size;
+        int hshift       = s->ps.sps->hshift[c_idx];
+        int vshift       = s->ps.sps->vshift[c_idx];
+        int x_min        = ((x0         ) >> s->ps.sps->log2_min_pu_size);
+        int y_min        = ((y0         ) >> s->ps.sps->log2_min_pu_size);
+        int x_max        = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
+        int y_max        = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
+        int len          = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
+        for (y = y_min; y < y_max; y++) {
+            for (x = x_min; x < x_max; x++) {
+                if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
+                    int n;
+                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+                    for (n = 0; n < (min_pu_size >> vshift); n++) {
+                        memcpy(src, dst, len);
+                        src += stride_src;
+                        dst += stride_dst;
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -187,128 +245,209 @@ static void copy_CTB(uint8_t *dst, uint8_t *src,
 
 static void sao_filter_CTB(HEVCContext *s, int x, int y)
 {
-    //  TODO: This should be easily parallelizable
-    //  TODO: skip CBs when (cu_transquant_bypass_flag || (pcm_loop_filter_disable_flag && pcm_flag))
-    int c_idx = 0;
-    int class = 1, class_index;
+    static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+    HEVCLocalContext *lc = s->HEVClc;
+    int c_idx;
     int edges[4];  // 0 left 1 top 2 right 3 bottom
-    SAOParams *sao[4];
-    int classes[4];
-    int x_shift = 0, y_shift = 0;
-    int x_ctb = x >> s->ps.sps->log2_ctb_size;
-    int y_ctb = y >> s->ps.sps->log2_ctb_size;
-    int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
-    int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
-
+    int x_ctb                = x >> s->ps.sps->log2_ctb_size;
+    int y_ctb                = y >> s->ps.sps->log2_ctb_size;
+    int ctb_addr_rs          = y_ctb * s->ps.sps->ctb_width + x_ctb;
+    int ctb_addr_ts          = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+    SAOParams *sao           = &CTB(s->sao, x_ctb, y_ctb);
     // flags indicating unfilterable edges
-    uint8_t vert_edge[]  = { 0, 0, 0, 0 };
-    uint8_t horiz_edge[] = { 0, 0, 0, 0 };
-    uint8_t diag_edge[]  = { 0, 0, 0, 0 };
-    uint8_t lfase[3]; // current, above, left
-    uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
-                             !s->ps.pps->loop_filter_across_tiles_enabled_flag;
-    uint8_t left_tile_edge = 0, up_tile_edge = 0;
-
-    sao[0]     = &CTB(s->sao, x_ctb, y_ctb);
+    uint8_t vert_edge[]      = { 0, 0 };
+    uint8_t horiz_edge[]     = { 0, 0 };
+    uint8_t diag_edge[]      = { 0, 0, 0, 0 };
+    uint8_t lfase            = CTB(s->filter_slice_edges, x_ctb, y_ctb);
+    uint8_t no_tile_filter   = s->ps.pps->tiles_enabled_flag &&
+                               !s->ps.pps->loop_filter_across_tiles_enabled_flag;
+    uint8_t restore          = no_tile_filter || !lfase;
+    uint8_t left_tile_edge   = 0;
+    uint8_t right_tile_edge  = 0;
+    uint8_t up_tile_edge     = 0;
+    uint8_t bottom_tile_edge = 0;
+
     edges[0]   = x_ctb == 0;
     edges[1]   = y_ctb == 0;
     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
-    lfase[0]   = CTB(s->filter_slice_edges, x_ctb, y_ctb);
-    classes[0] = 0;
-
-    if (!edges[0]) {
-        left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-        sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb);
-        vert_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
-        vert_edge[2] = vert_edge[0];
-        lfase[2]     = CTB(s->filter_slice_edges, x_ctb - 1, y_ctb);
-        classes[class] = 2;
-        class++;
-        x_shift = 8;
-    }
-
-    if (!edges[1]) {
-        up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
-        sao[class] = &CTB(s->sao, x_ctb, y_ctb - 1);
-        horiz_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
-        horiz_edge[1] = horiz_edge[0];
-        lfase[1] = CTB(s->filter_slice_edges, x_ctb, y_ctb - 1);
-        classes[class] = 1;
-        class++;
-        y_shift = 4;
 
+    if (restore) {
         if (!edges[0]) {
-            classes[class] = 3;
-            sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb - 1);
-            class++;
-
-            // Tile check here is done current CTB row/col, not above/left like you'd expect,
-            //but that is because the tile boundary always extends through the whole pic
-            vert_edge[1] = (!lfase[1] && CTB(s->tab_slice_address, x_ctb, y_ctb - 1) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge;
-            vert_edge[3] = vert_edge[1];
-            horiz_edge[2] = (!lfase[2] && CTB(s->tab_slice_address, x_ctb - 1, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || up_tile_edge;
-            horiz_edge[3] = horiz_edge[2];
-            diag_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
-            diag_edge[3] = diag_edge[0];
-
-            // Does left CTB comes after above CTB?
-            if (CTB(s->tab_slice_address, x_ctb - 1, y_ctb) >
-                CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
-                diag_edge[2] = !lfase[2] || left_tile_edge || up_tile_edge;
-                diag_edge[1] = diag_edge[2];
-            } else if (CTB(s->tab_slice_address, x_ctb - 1, y_ctb) <
-                       CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
-                diag_edge[1] = !lfase[1] || left_tile_edge || up_tile_edge;
-                diag_edge[2] = diag_edge[1];
-            } else {
-                // Same slice, only consider tiles
-                diag_edge[2] = left_tile_edge || up_tile_edge;
-                diag_edge[1] = diag_edge[2];
-            }
+            left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+            vert_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
+        }
+        if (!edges[2]) {
+            right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
+            vert_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
+        }
+        if (!edges[1]) {
+            up_tile_edge     = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
+            horiz_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
+        }
+        if (!edges[3]) {
+            bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
+            horiz_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
+        }
+        if (!edges[0] && !edges[1]) {
+            diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
+        }
+        if (!edges[1] && !edges[2]) {
+            diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
+        }
+        if (!edges[2] && !edges[3]) {
+            diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
+        }
+        if (!edges[0] && !edges[3]) {
+            diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
         }
     }
 
-    for (c_idx = 0; c_idx < 3; c_idx++) {
-        int chroma = c_idx ? 1 : 0;
-        int x0 = x >> chroma;
-        int y0 = y >> chroma;
-        ptrdiff_t stride = s->frame->linesize[c_idx];
-        int ctb_size = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->hshift[c_idx];
-        int width = FFMIN(ctb_size,
-                          (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0);
-        int height = FFMIN(ctb_size,
-                           (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
-
-        uint8_t *src = &s->frame->data[c_idx][y0 * stride + (x0 << s->ps.sps->pixel_shift)];
-        uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride + (x0 << s->ps.sps->pixel_shift)];
-        int offset = (y_shift >> chroma) * stride + ((x_shift >> chroma) << s->ps.sps->pixel_shift);
-
-        copy_CTB(dst - offset, src - offset,
-                 (edges[2] ? width  + (x_shift >> chroma) : width)  << s->ps.sps->pixel_shift,
-                 (edges[3] ? height + (y_shift >> chroma) : height), stride);
-
-        for (class_index = 0; class_index < class; class_index++) {
-
-            switch (sao[class_index]->type_idx[c_idx]) {
-            case SAO_BAND:
-                s->hevcdsp.sao_band_filter[classes[class_index]](dst, src,
-                                                                 stride,
-                                                                 sao[class_index],
-                                                                 edges, width,
-                                                                 height, c_idx);
-                break;
-            case SAO_EDGE:
-                s->hevcdsp.sao_edge_filter[classes[class_index]](dst, src,
-                                                                 stride,
-                                                                 sao[class_index],
-                                                                 edges, width,
-                                                                 height, c_idx,
-                                                                 vert_edge[classes[class_index]],
-                                                                 horiz_edge[classes[class_index]],
-                                                                 diag_edge[classes[class_index]]);
-                break;
+    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+        int x0       = x >> s->ps.sps->hshift[c_idx];
+        int y0       = y >> s->ps.sps->vshift[c_idx];
+        ptrdiff_t stride_src = s->frame->linesize[c_idx];
+        int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->hshift[c_idx];
+        int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->vshift[c_idx];
+        int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> s->ps.sps->hshift[c_idx]) - x0);
+        int height   = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
+        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
+        uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+        ptrdiff_t stride_dst;
+        uint8_t *dst;
+
+        switch (sao->type_idx[c_idx]) {
+        case SAO_BAND:
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+            if (s->ps.pps->transquant_bypass_enable_flag ||
+                (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
+            dst = lc->edge_emu_buffer;
+            stride_dst = 2*MAX_PB_SIZE;
+            copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
+            s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+                                            width, height);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
+            } else {
+            s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+                                            width, height);
+            }
+            sao->type_idx[c_idx] = SAO_APPLIED;
+            break;
+        case SAO_EDGE:
+        {
+            int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+            int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
+            int left_edge = edges[0];
+            int top_edge = edges[1];
+            int right_edge = edges[2];
+            int bottom_edge = edges[3];
+            int sh = s->ps.sps->pixel_shift;
+            int left_pixels, right_pixels;
+
+            stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
+            dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
+
+            if (!top_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst - stride_dst - (left << sh);
+                src1[0] = src - stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
             }
+            if (!bottom_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst + height * stride_dst - (left << sh);
+                src1[0] = src + height * stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
+            }
+            left_pixels = 0;
+            if (!left_edge) {
+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst - (1 << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    left_pixels = 1;
+                }
+            }
+            right_pixels = 0;
+            if (!right_edge) {
+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst + (width << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    right_pixels = 1;
+                }
+            }
+
+            copy_CTB(dst - (left_pixels << sh),
+                     src - (left_pixels << sh),
+                     (width + left_pixels + right_pixels) << sh,
+                     height, stride_dst, stride_src);
+
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+            s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+                                            sao->eo_class[c_idx], width, height);
+            s->hevcdsp.sao_edge_restore[restore](src, dst,
+                                                stride_src, stride_dst,
+                                                sao,
+                                                edges, width,
+                                                height, c_idx,
+                                                vert_edge,
+                                                horiz_edge,
+                                                diag_edge);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
+            sao->type_idx[c_idx] = SAO_APPLIED;
+            break;
+        }
         }
     }
 }
@@ -331,24 +470,27 @@ static int get_pcm(HEVCContext *s, int x, int y)
 
 #define TC_CALC(qp, bs)                                                 \
     tctable[av_clip((qp) + DEFAULT_INTRA_TC_OFFSET * ((bs) - 1) +       \
-                    (tc_offset >> 1 << 1),                              \
+                    (tc_offset & -2),                                   \
                     0, MAX_QP + DEFAULT_INTRA_TC_OFFSET)]
 
 static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
 {
     uint8_t *src;
-    int x, y, x_end, y_end, chroma;
-    int c_tc[2], tc[2], beta;
+    int x, y;
+    int chroma, beta;
+    int32_t c_tc[2], tc[2];
     uint8_t no_p[2] = { 0 };
     uint8_t no_q[2] = { 0 };
 
     int log2_ctb_size = s->ps.sps->log2_ctb_size;
+    int x_end, x_end2, y_end;
     int ctb_size        = 1 << log2_ctb_size;
     int ctb             = (x0 >> log2_ctb_size) +
                           (y0 >> log2_ctb_size) * s->ps.sps->ctb_width;
     int cur_tc_offset   = s->deblock[ctb].tc_offset;
     int cur_beta_offset = s->deblock[ctb].beta_offset;
-    int tc_offset, left_tc_offset, beta_offset, left_beta_offset;
+    int left_tc_offset, left_beta_offset;
+    int tc_offset, beta_offset;
     int pcmf = (s->ps.sps->pcm_enabled_flag &&
                 s->ps.sps->pcm.loop_filter_disable_flag) ||
                s->ps.pps->transquant_bypass_enable_flag;
@@ -356,6 +498,9 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
     if (x0) {
         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
         left_beta_offset = s->deblock[ctb - 1].beta_offset;
+    } else {
+        left_tc_offset   = 0;
+        left_beta_offset = 0;
     }
 
     x_end = x0 + ctb_size;
@@ -368,11 +513,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
     tc_offset   = cur_tc_offset;
     beta_offset = cur_beta_offset;
 
-    // vertical filtering luma
+    x_end2 = x_end;
+    if (x_end2 != s->ps.sps->width)
+        x_end2 -= 8;
     for (y = y0; y < y_end; y += 8) {
+        // vertical filtering luma
         for (x = x0 ? x0 : 8; x < x_end; x += 8) {
-            const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
-            const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];
+            const int bs0 = s->vertical_bs[(x +  y      * s->bs_width) >> 2];
+            const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2];
             if (bs0 || bs1) {
                 const int qp = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
 
@@ -395,45 +543,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                        beta, tc, no_p, no_q);
             }
         }
-    }
 
-    // vertical filtering chroma
-    for (chroma = 1; chroma <= 2; chroma++) {
-        for (y = y0; y < y_end; y += 16) {
-            for (x = x0 ? x0 : 16; x < x_end; x += 16) {
-                const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
-                const int bs1 = s->vertical_bs[(x >> 3) + ((y + 8) >> 2) * s->bs_width];
-                if ((bs0 == 2) || (bs1 == 2)) {
-                    const int qp0 = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
-                    const int qp1 = (get_qPy(s, x - 1, y + 8) + get_qPy(s, x, y + 8) + 1) >> 1;
-
-                    c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
-                    c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
-                    src     = &s->frame->data[chroma][y / 2 * s->frame->linesize[chroma] + ((x / 2) << s->ps.sps->pixel_shift)];
-                    if (pcmf) {
-                        no_p[0] = get_pcm(s, x - 1, y);
-                        no_p[1] = get_pcm(s, x - 1, y + 8);
-                        no_q[0] = get_pcm(s, x, y);
-                        no_q[1] = get_pcm(s, x, y + 8);
-                        s->hevcdsp.hevc_v_loop_filter_chroma_c(src,
-                                                               s->frame->linesize[chroma],
-                                                               c_tc, no_p, no_q);
-                    } else
-                        s->hevcdsp.hevc_v_loop_filter_chroma(src,
-                                                             s->frame->linesize[chroma],
-                                                             c_tc, no_p, no_q);
-                }
-            }
-        }
-    }
+        if(!y)
+             continue;
 
-    // horizontal filtering luma
-    if (x_end != s->ps.sps->width)
-        x_end -= 8;
-    for (y = y0 ? y0 : 8; y < y_end; y += 8) {
-        for (x = x0 ? x0 - 8 : 0; x < x_end; x += 8) {
-            const int bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
-            const int bs1 = s->horizontal_bs[(x + 4 + y * s->bs_width) >> 2];
+        // horizontal filtering luma
+        for (x = x0 ? x0 - 8 : 0; x < x_end2; x += 8) {
+            const int bs0 = s->horizontal_bs[( x      + y * s->bs_width) >> 2];
+            const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2];
             if (bs0 || bs1) {
                 const int qp = (get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1;
 
@@ -460,123 +577,135 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
         }
     }
 
-    // horizontal filtering chroma
-    for (chroma = 1; chroma <= 2; chroma++) {
-        for (y = y0 ? y0 : 16; y < y_end; y += 16) {
-            for (x = x0 - 8; x < x_end; x += 16) {
-                int bs0, bs1;
-                // to make sure no memory access over boundary when x = -8
-                // TODO: simplify with row based deblocking
-                if (x < 0) {
-                    bs0 = 0;
-                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
-                } else if (x >= x_end - 8) {
-                    bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
-                    bs1 = 0;
-                } else {
-                    bs0 = s->horizontal_bs[(x + y     * s->bs_width) >> 2];
-                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
+    if (s->ps.sps->chroma_format_idc) {
+        for (chroma = 1; chroma <= 2; chroma++) {
+            int h = 1 << s->ps.sps->hshift[chroma];
+            int v = 1 << s->ps.sps->vshift[chroma];
+
+            // vertical filtering chroma
+            for (y = y0; y < y_end; y += (8 * v)) {
+                for (x = x0 ? x0 : 8 * h; x < x_end; x += (8 * h)) {
+                    const int bs0 = s->vertical_bs[(x +  y            * s->bs_width) >> 2];
+                    const int bs1 = s->vertical_bs[(x + (y + (4 * v)) * s->bs_width) >> 2];
+
+                    if ((bs0 == 2) || (bs1 == 2)) {
+                        const int qp0 = (get_qPy(s, x - 1, y)           + get_qPy(s, x, y)           + 1) >> 1;
+                        const int qp1 = (get_qPy(s, x - 1, y + (4 * v)) + get_qPy(s, x, y + (4 * v)) + 1) >> 1;
+
+                        c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
+                        c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
+                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+                        if (pcmf) {
+                            no_p[0] = get_pcm(s, x - 1, y);
+                            no_p[1] = get_pcm(s, x - 1, y + (4 * v));
+                            no_q[0] = get_pcm(s, x, y);
+                            no_q[1] = get_pcm(s, x, y + (4 * v));
+                            s->hevcdsp.hevc_v_loop_filter_chroma_c(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+                        } else
+                            s->hevcdsp.hevc_v_loop_filter_chroma(src,
+                                                                 s->frame->linesize[chroma],
+                                                                 c_tc, no_p, no_q);
+                    }
                 }
 
-                if ((bs0 == 2) || (bs1 == 2)) {
-                    const int qp0 = bs0 == 2 ? (get_qPy(s, x,     y - 1) + get_qPy(s, x,     y) + 1) >> 1 : 0;
-                    const int qp1 = bs1 == 2 ? (get_qPy(s, x + 8, y - 1) + get_qPy(s, x + 8, y) + 1) >> 1 : 0;
-
-                    tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset;
-                    c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
-                    c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
-                    src       = &s->frame->data[chroma][y / 2 * s->frame->linesize[chroma] + ((x / 2) << s->ps.sps->pixel_shift)];
-                    if (pcmf) {
-                        no_p[0] = get_pcm(s, x, y - 1);
-                        no_p[1] = get_pcm(s, x + 8, y - 1);
-                        no_q[0] = get_pcm(s, x, y);
-                        no_q[1] = get_pcm(s, x + 8, y);
-                        s->hevcdsp.hevc_h_loop_filter_chroma_c(src,
-                                                               s->frame->linesize[chroma],
-                                                               c_tc, no_p, no_q);
-                    } else
-                        s->hevcdsp.hevc_h_loop_filter_chroma(src,
-                                                             s->frame->linesize[chroma],
-                                                             c_tc, no_p, no_q);
+                if(!y)
+                    continue;
+
+                // horizontal filtering chroma
+                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+                x_end2 = x_end;
+                if (x_end != s->ps.sps->width)
+                    x_end2 = x_end - 8 * h;
+                for (x = x0 ? x0 - 8 * h : 0; x < x_end2; x += (8 * h)) {
+                    const int bs0 = s->horizontal_bs[( x          + y * s->bs_width) >> 2];
+                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+                    if ((bs0 == 2) || (bs1 == 2)) {
+                        const int qp0 = bs0 == 2 ? (get_qPy(s, x,           y - 1) + get_qPy(s, x,           y) + 1) >> 1 : 0;
+                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + (4 * h), y - 1) + get_qPy(s, x + (4 * h), y) + 1) >> 1 : 0;
+
+                        c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
+                        c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
+                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+                        if (pcmf) {
+                            no_p[0] = get_pcm(s, x,           y - 1);
+                            no_p[1] = get_pcm(s, x + (4 * h), y - 1);
+                            no_q[0] = get_pcm(s, x,           y);
+                            no_q[1] = get_pcm(s, x + (4 * h), y);
+                            s->hevcdsp.hevc_h_loop_filter_chroma_c(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+                        } else
+                            s->hevcdsp.hevc_h_loop_filter_chroma(src,
+                                                                 s->frame->linesize[chroma],
+                                                                 c_tc, no_p, no_q);
+                    }
                 }
             }
         }
     }
 }
 
-static int boundary_strength(HEVCContext *s, MvField *curr,
-                             uint8_t curr_cbf_luma, MvField *neigh,
-                             uint8_t neigh_cbf_luma,
-                             RefPicList *neigh_refPicList,
-                             int tu_border)
+static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
+                             RefPicList *neigh_refPicList)
 {
-    int mvs = curr->pred_flag[0] + curr->pred_flag[1];
-
-    if (tu_border) {
-        if (curr->is_intra || neigh->is_intra)
-            return 2;
-        if (curr_cbf_luma || neigh_cbf_luma)
-            return 1;
-    }
-
-    if (mvs == neigh->pred_flag[0] + neigh->pred_flag[1]) {
-        if (mvs == 2) {
-            // same L0 and L1
-            if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
-                s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
-                neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
-                if ((abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                     abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-                    (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                     abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4))
-                    return 1;
-                else
-                    return 0;
-            } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                       neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-                if (abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                    abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                       neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-                if (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                    abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else {
+    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+        // same L0 and L1
+        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
+            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
                 return 1;
-            }
-        } else { // 1 MV
-            Mv A, B;
-            int ref_A, ref_B;
-
-            if (curr->pred_flag[0]) {
-                A     = curr->mv[0];
-                ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
-            } else {
-                A     = curr->mv[1];
-                ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
-            }
+            else
+                return 0;
+        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+                return 1;
+            else
+                return 0;
+        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+                return 1;
+            else
+                return 0;
+        } else {
+            return 1;
+        }
+    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+        Mv A, B;
+        int ref_A, ref_B;
+
+        if (curr->pred_flag & 1) {
+            A     = curr->mv[0];
+            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+        } else {
+            A     = curr->mv[1];
+            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+        }
 
-            if (neigh->pred_flag[0]) {
-                B     = neigh->mv[0];
-                ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
-            } else {
-                B     = neigh->mv[1];
-                ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
-            }
+        if (neigh->pred_flag & 1) {
+            B     = neigh->mv[0];
+            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+        } else {
+            B     = neigh->mv[1];
+            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+        }
 
-            if (ref_A == ref_B) {
-                if (abs(A.x - B.x) >= 4 || abs(A.y - B.y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else
+        if (ref_A == ref_B) {
+            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
                 return 1;
-        }
+            else
+                return 0;
+        } else
+            return 1;
     }
 
     return 1;
@@ -585,14 +714,14 @@ static int boundary_strength(HEVCContext *s, MvField *curr,
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                            int log2_trafo_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     MvField *tab_mvf     = s->ref->tab_mvf;
     int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
     int min_tu_width     = s->ps.sps->min_tb_width;
     int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
-                           (x0 >> log2_min_pu_size)].is_intra;
+                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
     int boundary_upper, boundary_left;
     int i, j, bs;
 
@@ -610,37 +739,11 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
                               s->ref->refPicList;
-
         int yp_pu = (y0 - 1) >> log2_min_pu_size;
         int yq_pu =  y0      >> log2_min_pu_size;
         int yp_tu = (y0 - 1) >> log2_min_tu_size;
         int yq_tu =  y0      >> log2_min_tu_size;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int x_pu = (x0 + i) >> log2_min_pu_size;
-            int x_tu = (x0 + i) >> log2_min_tu_size;
-            MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-            MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-            uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
-            bs = boundary_strength(s, curr, curr_cbf_luma,
-                                   top, top_cbf_luma, rpl_top, 1);
-            if (bs)
-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
-        }
-    }
-
-    // bs for TU internal horizontal PU boundaries
-    if (log2_trafo_size > s->ps.sps->log2_min_pu_size && !is_intra) {
-        RefPicList *rpl = s->ref->refPicList;
-
-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
-            int yp_tu = (y0 + j - 1) >> log2_min_tu_size;
-            int yq_tu = (y0 + j)     >> log2_min_tu_size;
-
             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
                 int x_pu = (x0 + i) >> log2_min_pu_size;
                 int x_tu = (x0 + i) >> log2_min_tu_size;
@@ -649,12 +752,14 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                 uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
                 uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
 
-                bs = boundary_strength(s, curr, curr_cbf_luma,
-                                       top, top_cbf_luma, rpl, 0);
-                if (bs)
-                    s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
+                    bs = 2;
+                else if (curr_cbf_luma || top_cbf_luma)
+                    bs = 1;
+                else
+                    bs = boundary_strength(s, curr, top, rpl_top);
+                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
             }
-        }
     }
 
     // bs for vertical TU boundaries
@@ -672,50 +777,59 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
                                s->ref->refPicList;
-
         int xp_pu = (x0 - 1) >> log2_min_pu_size;
         int xq_pu =  x0      >> log2_min_pu_size;
         int xp_tu = (x0 - 1) >> log2_min_tu_size;
         int xq_tu =  x0      >> log2_min_tu_size;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int y_pu      = (y0 + i) >> log2_min_pu_size;
-            int y_tu      = (y0 + i) >> log2_min_tu_size;
-            MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-            MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-
-            uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
+            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+                int y_pu      = (y0 + i) >> log2_min_pu_size;
+                int y_tu      = (y0 + i) >> log2_min_tu_size;
+                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
+                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
 
-            bs = boundary_strength(s, curr, curr_cbf_luma,
-                                   left, left_cbf_luma, rpl_left, 1);
-            if (bs)
-                s->vertical_bs[(x0 >> 3) + ((y0 + i) >> 2) * s->bs_width] = bs;
-        }
+                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
+                    bs = 2;
+                else if (curr_cbf_luma || left_cbf_luma)
+                    bs = 1;
+                else
+                    bs = boundary_strength(s, curr, left, rpl_left);
+                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+            }
     }
 
-    // bs for TU internal vertical PU boundaries
     if (log2_trafo_size > log2_min_pu_size && !is_intra) {
         RefPicList *rpl = s->ref->refPicList;
 
+        // bs for TU internal horizontal PU boundaries
+        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+
+            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+                int x_pu = (x0 + i) >> log2_min_pu_size;
+                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+
+                bs = boundary_strength(s, curr, top, rpl);
+                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+            }
+        }
+
+        // bs for TU internal vertical PU boundaries
         for (j = 0; j < (1 << log2_trafo_size); j += 4) {
             int y_pu = (y0 + j) >> log2_min_pu_size;
-            int y_tu = (y0 + j) >> log2_min_tu_size;
 
             for (i = 8; i < (1 << log2_trafo_size); i += 8) {
                 int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
                 int xq_pu = (x0 + i)     >> log2_min_pu_size;
-                int xp_tu = (x0 + i - 1) >> log2_min_tu_size;
-                int xq_tu = (x0 + i)     >> log2_min_tu_size;
                 MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
                 MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
 
-                bs = boundary_strength(s, curr, curr_cbf_luma,
-                                       left, left_cbf_luma, rpl, 0);
-                if (bs)
-                    s->vertical_bs[((x0 + i) >> 3) + ((y0 + j) >> 2) * s->bs_width] = bs;
+                bs = boundary_strength(s, curr, left, rpl);
+                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
             }
         }
     }
@@ -725,21 +839,50 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
 #undef CB
 #undef CR
 
-void ff_hevc_hls_filter(HEVCContext *s, int x, int y)
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
 {
-    deblocking_filter_CTB(s, x, y);
-    if (s->ps.sps->sao_enabled)
-        sao_filter_CTB(s, x, y);
+    int x_end = x >= s->ps.sps->width  - ctb_size;
+    int skip = 0;
+    if (s->avctx->skip_loop_filter >= AVDISCARD_ALL ||
+        (s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && !IS_IDR(s)) ||
+        (s->avctx->skip_loop_filter >= AVDISCARD_NONINTRA &&
+         s->sh.slice_type != HEVC_SLICE_I) ||
+        (s->avctx->skip_loop_filter >= AVDISCARD_BIDIR &&
+         s->sh.slice_type == HEVC_SLICE_B) ||
+        (s->avctx->skip_loop_filter >= AVDISCARD_NONREF &&
+        ff_hevc_nal_is_nonref(s->nal_unit_type)))
+        skip = 1;
+
+    if (!skip)
+        deblocking_filter_CTB(s, x, y);
+    if (s->ps.sps->sao_enabled && !skip) {
+        int y_end = y >= s->ps.sps->height - ctb_size;
+        if (y && x)
+            sao_filter_CTB(s, x - ctb_size, y - ctb_size);
+        if (x && y_end)
+            sao_filter_CTB(s, x - ctb_size, y);
+        if (y && x_end) {
+            sao_filter_CTB(s, x, y - ctb_size);
+            if (s->threads_type & FF_THREAD_FRAME )
+                ff_thread_report_progress(&s->ref->tf, y, 0);
+        }
+        if (x_end && y_end) {
+            sao_filter_CTB(s, x , y);
+            if (s->threads_type & FF_THREAD_FRAME )
+                ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+        }
+    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+        ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
 }
 
 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
 {
+    int x_end = x_ctb >= s->ps.sps->width  - ctb_size;
+    int y_end = y_ctb >= s->ps.sps->height - ctb_size;
     if (y_ctb && x_ctb)
-        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size);
-    if (y_ctb && x_ctb >= s->ps.sps->width - ctb_size) {
-        ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size);
-        ff_thread_report_progress(&s->ref->tf, y_ctb - ctb_size, 0);
-    }
-    if (x_ctb && y_ctb >= s->ps.sps->height - ctb_size)
-        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb);
+        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size, ctb_size);
+    if (y_ctb && x_end)
+        ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size, ctb_size);
+    if (x_ctb && y_end)
+        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb, ctb_size);
 }
diff --git a/libavcodec/hevc_mp4toannexb_bsf.c b/libavcodec/hevc_mp4toannexb_bsf.c
index d6b1f00..09bce5b 100644
--- a/libavcodec/hevc_mp4toannexb_bsf.c
+++ b/libavcodec/hevc_mp4toannexb_bsf.c
@@ -2,20 +2,20 @@
  * HEVC MP4 to Annex B byte stream format filter
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,7 +43,7 @@ static int hevc_extradata_to_annexb(AVBSFContext *ctx)
     int ret = 0;
 
     uint8_t *new_extradata = NULL;
-    size_t   new_extradata_size = 0;;
+    size_t   new_extradata_size = 0;
 
     bytestream2_init(&gb, ctx->par_in->extradata, ctx->par_in->extradata_size);
 
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
index 27df7c1..fd0dbd9 100644
--- a/libavcodec/hevc_mvs.c
+++ b/libavcodec/hevc_mvs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2013 Anand Meher Kotra
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,9 +42,9 @@ static const uint8_t l0_l1_cand_idx[12][2] = {
 void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
                                      int nPbW, int nPbH)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    HEVCLocalContext *lc = s->HEVClc;
+    int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
     lc->na.cand_up       = (lc->ctb_up_flag   || y0b);
     lc->na.cand_left     = (lc->ctb_left_flag || x0b);
@@ -53,8 +53,7 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
             ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size)) ?
                     lc->ctb_up_right_flag && !y0b : lc->na.cand_up;
     lc->na.cand_up_right =
-            ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size) ?
-                    lc->ctb_up_right_flag && !y0b : lc->na.cand_up )
+            lc->na.cand_up_right_sap
                      && (x0 + nPbW) < lc->end_of_tiles_x;
     lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_tiles_y) ? 0 : lc->na.cand_left;
 }
@@ -62,56 +61,29 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
 /*
  * 6.4.1 Derivation process for z-scan order block availability
  */
-static int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr,
+static av_always_inline int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr,
                               int xN, int yN)
 {
 #define MIN_TB_ADDR_ZS(x, y)                                            \
-    s->ps.pps->min_tb_addr_zs[(y) * s->ps.sps->min_tb_width + (x)]
-    int Curr = MIN_TB_ADDR_ZS(xCurr >> s->ps.sps->log2_min_tb_size,
-                              yCurr >> s->ps.sps->log2_min_tb_size);
-    int N;
-
-    if (xN < 0 || yN < 0 ||
-        xN >= s->ps.sps->width ||
-        yN >= s->ps.sps->height)
-        return 0;
-
-    N = MIN_TB_ADDR_ZS(xN >> s->ps.sps->log2_min_tb_size,
-                       yN >> s->ps.sps->log2_min_tb_size);
-
-    return N <= Curr;
-}
-
-static int same_prediction_block(HEVCLocalContext *lc, int log2_cb_size,
-                                 int x0, int y0, int nPbW, int nPbH,
-                                 int xA1, int yA1, int partIdx)
-{
-    return !(nPbW << 1 == 1 << log2_cb_size &&
-             nPbH << 1 == 1 << log2_cb_size && partIdx == 1 &&
-             lc->cu.x + nPbW > xA1 &&
-             lc->cu.y + nPbH <= yA1);
-}
+    s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
 
-/*
- * 6.4.2 Derivation process for prediction block availability
- */
-static int check_prediction_block_available(HEVCContext *s, int log2_cb_size,
-                                            int x0, int y0, int nPbW, int nPbH,
-                                            int xA1, int yA1, int partIdx)
-{
-    HEVCLocalContext *lc = &s->HEVClc;
-
-    if (lc->cu.x < xA1 && lc->cu.y < yA1 &&
-        (lc->cu.x + (1 << log2_cb_size)) > xA1 &&
-        (lc->cu.y + (1 << log2_cb_size)) > yA1)
-        return same_prediction_block(lc, log2_cb_size, x0, y0,
-                                     nPbW, nPbH, xA1, yA1, partIdx);
-    else
-        return z_scan_block_avail(s, x0, y0, xA1, yA1);
+    int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size;
+    int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size;
+    int xN_ctb    = xN    >> s->ps.sps->log2_ctb_size;
+    int yN_ctb    = yN    >> s->ps.sps->log2_ctb_size;
+    if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb )
+        return 1;
+    else {
+        int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask,
+                (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
+        int N    = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask,
+                (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
+        return N <= Curr;
+    }
 }
 
 //check if the two luma locations belong to the same motion estimation region
-static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
+static av_always_inline int is_diff_mer(HEVCContext *s, int xN, int yN, int xP, int yP)
 {
     uint8_t plevel = s->ps.pps->log2_parallel_merge_level;
 
@@ -123,18 +95,20 @@ static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
 #define MATCH(x) (A.x == B.x)
 
 // check if the mv's and refidx are the same between A and B
-static int compareMVrefidx(struct MvField A, struct MvField B)
+static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField B)
 {
-    if (A.pred_flag[0] && A.pred_flag[1] && B.pred_flag[0] && B.pred_flag[1])
-        return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
-               MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-
-    if (A.pred_flag[0] && !A.pred_flag[1] && B.pred_flag[0] && !B.pred_flag[1])
-        return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
-
-    if (!A.pred_flag[0] && A.pred_flag[1] && !B.pred_flag[0] && B.pred_flag[1])
-        return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-
+    int a_pf = A.pred_flag;
+    int b_pf = B.pred_flag;
+    if (a_pf == b_pf) {
+        if (a_pf == PF_BI) {
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
+                   MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
+        } else if (a_pf == PF_L0) {
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
+        } else if (a_pf == PF_L1) {
+            return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
+        }
+    }
     return 0;
 }
 
@@ -145,11 +119,11 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
     td = av_clip_int8(td);
     tb = av_clip_int8(tb);
     tx = (0x4000 + abs(td / 2)) / td;
-    scale_factor = av_clip((tb * tx + 32) >> 6, -4096, 4095);
+    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
     dst->x = av_clip_int16((scale_factor * src->x + 127 +
-                             (scale_factor * src->x < 0)) >> 8);
+                           (scale_factor * src->x < 0)) >> 8);
     dst->y = av_clip_int16((scale_factor * src->y + 127 +
-                             (scale_factor * src->y < 0)) >> 8);
+                           (scale_factor * src->y < 0)) >> 8);
 }
 
 static int check_mvset(Mv *mvLXCol, Mv *mvCol,
@@ -170,10 +144,7 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol,
     col_poc_diff = colPic - refPicList_col[listCol].list[refidxCol];
     cur_poc_diff = poc    - refPicList[X].list[refIdxLx];
 
-    if (!col_poc_diff)
-        col_poc_diff = 1;  // error resilience
-
-    if (cur_lt || col_poc_diff == cur_poc_diff) {
+    if (cur_lt || col_poc_diff == cur_poc_diff || !col_poc_diff) {
         mvLXCol->x = mvCol->x;
         mvLXCol->y = mvCol->y;
     } else {
@@ -195,32 +166,30 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
 {
     RefPicList *refPicList = s->ref->refPicList;
 
-    if (temp_col.is_intra) {
-        mvLXCol->x = 0;
-        mvLXCol->y = 0;
+    if (temp_col.pred_flag == PF_INTRA)
         return 0;
-    }
 
-    if (temp_col.pred_flag[0] == 0)
+    if (!(temp_col.pred_flag & PF_L0))
         return CHECK_MVSET(1);
-    else if (temp_col.pred_flag[0] == 1 && temp_col.pred_flag[1] == 0)
+    else if (temp_col.pred_flag == PF_L0)
         return CHECK_MVSET(0);
-    else if (temp_col.pred_flag[0] == 1 && temp_col.pred_flag[1] == 1) {
+    else if (temp_col.pred_flag == PF_BI) {
         int check_diffpicount = 0;
-        int i = 0;
-        for (i = 0; i < refPicList[0].nb_refs; i++) {
-            if (refPicList[0].list[i] > s->poc)
-                check_diffpicount++;
-        }
-        for (i = 0; i < refPicList[1].nb_refs; i++) {
-            if (refPicList[1].list[i] > s->poc)
-                check_diffpicount++;
+        int i, j;
+        for (j = 0; j < 2; j++) {
+            for (i = 0; i < refPicList[j].nb_refs; i++) {
+                if (refPicList[j].list[i] > s->poc) {
+                    check_diffpicount++;
+                    break;
+                }
+            }
         }
-        if (check_diffpicount == 0 && X == 0)
-            return CHECK_MVSET(0);
-        else if (check_diffpicount == 0 && X == 1)
-            return CHECK_MVSET(1);
-        else {
+        if (!check_diffpicount) {
+            if (X==0)
+                return CHECK_MVSET(0);
+            else
+                return CHECK_MVSET(1);
+        } else {
             if (s->sh.collocated_list == L1)
                 return CHECK_MVSET(0);
             else
@@ -235,7 +204,8 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
     tab_mvf[(y) * min_pu_width + x]
 
 #define TAB_MVF_PU(v)                                                   \
-    TAB_MVF(x ## v ## _pu, y ## v ## _pu)
+    TAB_MVF(((x ## v) >> s->ps.sps->log2_min_pu_size),                     \
+            ((y ## v) >> s->ps.sps->log2_min_pu_size))
 
 #define DERIVE_TEMPORAL_COLOCATED_MVS                                   \
     derive_temporal_colocated_mvs(s, temp_col,                          \
@@ -276,7 +246,8 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
         x < s->ps.sps->width) {
         x                 &= ~15;
         y                 &= ~15;
-        ff_thread_await_progress(&ref->tf, y, 0);
+        if (s->threads_type == FF_THREAD_FRAME)
+            ff_thread_await_progress(&ref->tf, y, 0);
         x_pu               = x >> s->ps.sps->log2_min_pu_size;
         y_pu               = y >> s->ps.sps->log2_min_pu_size;
         temp_col           = TAB_MVF(x_pu, y_pu);
@@ -289,7 +260,8 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
         y                  = y0 + (nPbH >> 1);
         x                 &= ~15;
         y                 &= ~15;
-        ff_thread_await_progress(&ref->tf, y, 0);
+        if (s->threads_type == FF_THREAD_FRAME)
+            ff_thread_await_progress(&ref->tf, y, 0);
         x_pu               = x >> s->ps.sps->log2_min_pu_size;
         y_pu               = y >> s->ps.sps->log2_min_pu_size;
         temp_col           = TAB_MVF(x_pu, y_pu);
@@ -299,15 +271,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
 }
 
 #define AVAILABLE(cand, v)                                      \
-    (cand && !TAB_MVF_PU(v).is_intra)
+    (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA))
 
 #define PRED_BLOCK_AVAILABLE(v)                                 \
-    check_prediction_block_available(s, log2_cb_size,           \
-                                     x0, y0, nPbW, nPbH,        \
-                                     x ## v, y ## v, part_idx)
+    z_scan_block_avail(s, x0, y0, x ## v, y ## v)
 
 #define COMPARE_MV_REFIDX(a, b)                                 \
-    compareMVrefidx(TAB_MVF_PU(a), TAB_MVF_PU(b))
+    compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b))
 
 /*
  * 8.5.3.1.2  Derivation process for spatial merging candidates
@@ -319,7 +289,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
                                             int merge_idx,
                                             struct MvField mergecandlist[])
 {
-    HEVCLocalContext *lc   = &s->HEVClc;
+    HEVCLocalContext *lc   = s->HEVClc;
     RefPicList *refPicList = s->ref->refPicList;
     MvField *tab_mvf       = s->ref->tab_mvf;
 
@@ -333,33 +303,21 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
 
     const int xA1    = x0 - 1;
     const int yA1    = y0 + nPbH - 1;
-    const int xA1_pu = xA1 >> s->ps.sps->log2_min_pu_size;
-    const int yA1_pu = yA1 >> s->ps.sps->log2_min_pu_size;
 
     const int xB1    = x0 + nPbW - 1;
     const int yB1    = y0 - 1;
-    const int xB1_pu = xB1 >> s->ps.sps->log2_min_pu_size;
-    const int yB1_pu = yB1 >> s->ps.sps->log2_min_pu_size;
 
     const int xB0    = x0 + nPbW;
     const int yB0    = y0 - 1;
-    const int xB0_pu = xB0 >> s->ps.sps->log2_min_pu_size;
-    const int yB0_pu = yB0 >> s->ps.sps->log2_min_pu_size;
 
     const int xA0    = x0 - 1;
     const int yA0    = y0 + nPbH;
-    const int xA0_pu = xA0 >> s->ps.sps->log2_min_pu_size;
-    const int yA0_pu = yA0 >> s->ps.sps->log2_min_pu_size;
 
     const int xB2    = x0 - 1;
     const int yB2    = y0 - 1;
-    const int xB2_pu = xB2 >> s->ps.sps->log2_min_pu_size;
-    const int yB2_pu = yB2 >> s->ps.sps->log2_min_pu_size;
 
     const int nb_refs = (s->sh.slice_type == HEVC_SLICE_P) ?
                         s->sh.nb_refs[0] : FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]);
-    int check_MER   = 1;
-    int check_MER_1 = 1;
 
     int zero_idx = 0;
 
@@ -371,57 +329,49 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     int is_available_b0;
     int is_available_b1;
     int is_available_b2;
-    int check_B0;
-    int check_A0;
 
-    //first left spatial merge candidate
-    is_available_a1 = AVAILABLE(cand_left, A1);
 
     if (!singleMCLFlag && part_idx == 1 &&
         (lc->cu.part_mode == PART_Nx2N ||
          lc->cu.part_mode == PART_nLx2N ||
          lc->cu.part_mode == PART_nRx2N) ||
-        isDiffMER(s, xA1, yA1, x0, y0)) {
+        is_diff_mer(s, xA1, yA1, x0, y0)) {
         is_available_a1 = 0;
+    } else {
+        is_available_a1 = AVAILABLE(cand_left, A1);
+        if (is_available_a1) {
+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1);
+            if (merge_idx == 0)
+                return;
+            nb_merge_cand++;
+        }
     }
 
-    if (is_available_a1) {
-        mergecandlist[0] = TAB_MVF_PU(A1);
-        if (merge_idx == 0)
-            return;
-        nb_merge_cand++;
-    }
-
-    // above spatial merge candidate
-    is_available_b1 = AVAILABLE(cand_up, B1);
-
     if (!singleMCLFlag && part_idx == 1 &&
         (lc->cu.part_mode == PART_2NxN ||
          lc->cu.part_mode == PART_2NxnU ||
          lc->cu.part_mode == PART_2NxnD) ||
-        isDiffMER(s, xB1, yB1, x0, y0)) {
+        is_diff_mer(s, xB1, yB1, x0, y0)) {
         is_available_b1 = 0;
+    } else {
+        is_available_b1 = AVAILABLE(cand_up, B1);
+        if (is_available_b1 &&
+            !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) {
+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1);
+            if (merge_idx == nb_merge_cand)
+                return;
+            nb_merge_cand++;
+        }
     }
 
-    if (is_available_a1 && is_available_b1)
-        check_MER = !COMPARE_MV_REFIDX(B1, A1);
-
-    if (is_available_b1 && check_MER)
-        mergecandlist[nb_merge_cand++] = TAB_MVF_PU(B1);
-
     // above right spatial merge candidate
-    check_MER = 1;
-    check_B0  = PRED_BLOCK_AVAILABLE(B0);
-
-    is_available_b0 = check_B0 && AVAILABLE(cand_up_right, B0);
+    is_available_b0 = AVAILABLE(cand_up_right, B0) &&
+                      xB0 < s->ps.sps->width &&
+                      PRED_BLOCK_AVAILABLE(B0) &&
+                      !is_diff_mer(s, xB0, yB0, x0, y0);
 
-    if (isDiffMER(s, xB0, yB0, x0, y0))
-        is_available_b0 = 0;
-
-    if (is_available_b1 && is_available_b0)
-        check_MER = !COMPARE_MV_REFIDX(B0, B1);
-
-    if (is_available_b0 && check_MER) {
+    if (is_available_b0 &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1))) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(B0);
         if (merge_idx == nb_merge_cand)
             return;
@@ -429,18 +379,13 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     }
 
     // left bottom spatial merge candidate
-    check_MER = 1;
-    check_A0  = PRED_BLOCK_AVAILABLE(A0);
-
-    is_available_a0 = check_A0 && AVAILABLE(cand_bottom_left, A0);
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
+                      yA0 < s->ps.sps->height &&
+                      PRED_BLOCK_AVAILABLE(A0) &&
+                      !is_diff_mer(s, xA0, yA0, x0, y0);
 
-    if (isDiffMER(s, xA0, yA0, x0, y0))
-        is_available_a0 = 0;
-
-    if (is_available_a1 && is_available_a0)
-        check_MER = !COMPARE_MV_REFIDX(A0, A1);
-
-    if (is_available_a0 && check_MER) {
+    if (is_available_a0 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1))) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(A0);
         if (merge_idx == nb_merge_cand)
             return;
@@ -448,20 +393,13 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     }
 
     // above left spatial merge candidate
-    check_MER = 1;
+    is_available_b2 = AVAILABLE(cand_up_left, B2) &&
+                      !is_diff_mer(s, xB2, yB2, x0, y0);
 
-    is_available_b2 = AVAILABLE(cand_up_left, B2);
-
-    if (isDiffMER(s, xB2, yB2, x0, y0))
-        is_available_b2 = 0;
-
-    if (is_available_a1 && is_available_b2)
-        check_MER = !COMPARE_MV_REFIDX(B2, A1);
-
-    if (is_available_b1 && is_available_b2)
-        check_MER_1 = !COMPARE_MV_REFIDX(B2, B1);
-
-    if (is_available_b2 && check_MER && check_MER_1 && nb_merge_cand != 4) {
+    if (is_available_b2 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) &&
+        nb_merge_cand != 4) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(B2);
         if (merge_idx == nb_merge_cand)
             return;
@@ -479,9 +417,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
                                                        0, &mv_l1_col, 1) : 0;
 
         if (available_l0 || available_l1) {
-            mergecandlist[nb_merge_cand].is_intra     = 0;
-            mergecandlist[nb_merge_cand].pred_flag[0] = available_l0;
-            mergecandlist[nb_merge_cand].pred_flag[1] = available_l1;
+            mergecandlist[nb_merge_cand].pred_flag = available_l0 + (available_l1 << 1);
             AV_ZERO16(mergecandlist[nb_merge_cand].ref_idx);
             mergecandlist[nb_merge_cand].mv[0]      = mv_l0_col;
             mergecandlist[nb_merge_cand].mv[1]      = mv_l1_col;
@@ -497,7 +433,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     // combined bi-predictive merge candidates  (applies for B slices)
     if (s->sh.slice_type == HEVC_SLICE_B && nb_orig_merge_cand > 1 &&
         nb_orig_merge_cand < s->sh.max_num_merge_cand) {
-        int comb_idx;
+        int comb_idx = 0;
 
         for (comb_idx = 0; nb_merge_cand < s->sh.max_num_merge_cand &&
                            comb_idx < nb_orig_merge_cand * (nb_orig_merge_cand - 1); comb_idx++) {
@@ -506,17 +442,15 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
             MvField l0_cand = mergecandlist[l0_cand_idx];
             MvField l1_cand = mergecandlist[l1_cand_idx];
 
-            if (l0_cand.pred_flag[0] && l1_cand.pred_flag[1] &&
+            if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) &&
                 (refPicList[0].list[l0_cand.ref_idx[0]] !=
                  refPicList[1].list[l1_cand.ref_idx[1]] ||
                  AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) {
                 mergecandlist[nb_merge_cand].ref_idx[0]   = l0_cand.ref_idx[0];
                 mergecandlist[nb_merge_cand].ref_idx[1]   = l1_cand.ref_idx[1];
-                mergecandlist[nb_merge_cand].pred_flag[0] = 1;
-                mergecandlist[nb_merge_cand].pred_flag[1] = 1;
+                mergecandlist[nb_merge_cand].pred_flag    = PF_BI;
                 AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]);
                 AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]);
-                mergecandlist[nb_merge_cand].is_intra     = 0;
                 if (merge_idx == nb_merge_cand)
                     return;
                 nb_merge_cand++;
@@ -526,11 +460,9 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
 
     // append Zero motion vector candidates
     while (nb_merge_cand < s->sh.max_num_merge_cand) {
-        mergecandlist[nb_merge_cand].pred_flag[0] = 1;
-        mergecandlist[nb_merge_cand].pred_flag[1] = s->sh.slice_type == HEVC_SLICE_B;
+        mergecandlist[nb_merge_cand].pred_flag    = PF_L0 + ((s->sh.slice_type == HEVC_SLICE_B) << 1);
         AV_ZERO32(mergecandlist[nb_merge_cand].mv + 0);
         AV_ZERO32(mergecandlist[nb_merge_cand].mv + 1);
-        mergecandlist[nb_merge_cand].is_intra     = 0;
         mergecandlist[nb_merge_cand].ref_idx[0]   = zero_idx < nb_refs ? zero_idx : 0;
         mergecandlist[nb_merge_cand].ref_idx[1]   = zero_idx < nb_refs ? zero_idx : 0;
 
@@ -553,7 +485,7 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
     MvField mergecand_list[MRG_MAX_NUM_CANDS];
     int nPbW2 = nPbW;
     int nPbH2 = nPbH;
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
 
     if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) {
         singleMCLFlag = 1;
@@ -569,11 +501,9 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
                                     singleMCLFlag, part_idx,
                                     merge_idx, mergecand_list);
 
-    if (mergecand_list[merge_idx].pred_flag[0] == 1 &&
-        mergecand_list[merge_idx].pred_flag[1] == 1 &&
+    if (mergecand_list[merge_idx].pred_flag == PF_BI &&
         (nPbW2 + nPbH2) == 12) {
-        mergecand_list[merge_idx].ref_idx[1]   = -1;
-        mergecand_list[merge_idx].pred_flag[1] = 0;
+        mergecand_list[merge_idx].pred_flag = PF_L0;
     }
 
     *mv = mergecand_list[merge_idx];
@@ -604,7 +534,7 @@ static int mv_mp_mode_mx(HEVCContext *s, int x, int y, int pred_flag_index,
 
     RefPicList *refPicList = s->ref->refPicList;
 
-    if (TAB_MVF(x, y).pred_flag[pred_flag_index] == 1 &&
+    if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) &&
         refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) {
         *mv = TAB_MVF(x, y).mv[pred_flag_index];
         return 1;
@@ -619,82 +549,73 @@ static int mv_mp_mode_mx_lt(HEVCContext *s, int x, int y, int pred_flag_index,
     int min_pu_width = s->ps.sps->min_pu_width;
 
     RefPicList *refPicList = s->ref->refPicList;
-    int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
 
-    int colIsLongTerm =
-        refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
+    if ((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) {
+        int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
 
-    if (TAB_MVF(x, y).pred_flag[pred_flag_index] &&
-        colIsLongTerm == currIsLongTerm) {
-        *mv = TAB_MVF(x, y).mv[pred_flag_index];
-        if (!currIsLongTerm)
-            dist_scale(s, mv, min_pu_width, x, y,
-                       pred_flag_index, ref_idx_curr, ref_idx);
-        return 1;
+        int colIsLongTerm =
+            refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
+
+        if (colIsLongTerm == currIsLongTerm) {
+            *mv = TAB_MVF(x, y).mv[pred_flag_index];
+            if (!currIsLongTerm)
+                dist_scale(s, mv, min_pu_width, x, y,
+                           pred_flag_index, ref_idx_curr, ref_idx);
+            return 1;
+        }
     }
     return 0;
 }
 
 #define MP_MX(v, pred, mx)                                      \
-    mv_mp_mode_mx(s, x ## v ## _pu, y ## v ## _pu, pred,        \
-                  &mx, ref_idx_curr, ref_idx)
+    mv_mp_mode_mx(s,                                            \
+                  (x ## v) >> s->ps.sps->log2_min_pu_size,         \
+                  (y ## v) >> s->ps.sps->log2_min_pu_size,         \
+                  pred, &mx, ref_idx_curr, ref_idx)
 
 #define MP_MX_LT(v, pred, mx)                                   \
-    mv_mp_mode_mx_lt(s, x ## v ## _pu, y ## v ## _pu, pred,     \
-                     &mx, ref_idx_curr, ref_idx)
+    mv_mp_mode_mx_lt(s,                                         \
+                     (x ## v) >> s->ps.sps->log2_min_pu_size,      \
+                     (y ## v) >> s->ps.sps->log2_min_pu_size,      \
+                     pred, &mx, ref_idx_curr, ref_idx)
 
 void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
                               int nPbH, int log2_cb_size, int part_idx,
                               int merge_idx, MvField *mv,
                               int mvp_lx_flag, int LX)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     MvField *tab_mvf = s->ref->tab_mvf;
     int isScaledFlag_L0 = 0;
-    int availableFlagLXA0 = 0;
-    int availableFlagLXB0 = 0;
+    int availableFlagLXA0 = 1;
+    int availableFlagLXB0 = 1;
     int numMVPCandLX = 0;
     int min_pu_width = s->ps.sps->min_pu_width;
 
     int xA0, yA0;
-    int xA0_pu, yA0_pu;
     int is_available_a0;
-
     int xA1, yA1;
-    int xA1_pu, yA1_pu;
     int is_available_a1;
-
     int xB0, yB0;
-    int xB0_pu, yB0_pu;
     int is_available_b0;
-
     int xB1, yB1;
-    int xB1_pu = 0, yB1_pu = 0;
-    int is_available_b1 = 0;
-
+    int is_available_b1;
     int xB2, yB2;
-    int xB2_pu = 0, yB2_pu = 0;
-    int is_available_b2 = 0;
+    int is_available_b2;
+
     Mv mvpcand_list[2] = { { 0 } };
-    Mv mxA = { 0 };
-    Mv mxB = { 0 };
-    int ref_idx_curr = 0;
+    Mv mxA;
+    Mv mxB;
+    int ref_idx_curr;
     int ref_idx = 0;
     int pred_flag_index_l0;
     int pred_flag_index_l1;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-
-    int cand_up = (lc->ctb_up_flag || y0b);
-    int cand_left = (lc->ctb_left_flag || x0b);
-    int cand_up_left =
-            (!x0b && !y0b) ? lc->ctb_up_left_flag : cand_left && cand_up;
-    int cand_up_right =
-            (x0b + nPbW == (1 << s->ps.sps->log2_ctb_size) ||
-             x0  + nPbW >= lc->end_of_tiles_x) ? lc->ctb_up_right_flag && !y0b
-                                               : cand_up;
-    int cand_bottom_left = (y0 + nPbH >= lc->end_of_tiles_y) ? 0 : cand_left;
 
+    const int cand_bottom_left = lc->na.cand_bottom_left;
+    const int cand_left        = lc->na.cand_left;
+    const int cand_up_left     = lc->na.cand_up_left;
+    const int cand_up          = lc->na.cand_up;
+    const int cand_up_right    = lc->na.cand_up_right_sap;
     ref_idx_curr       = LX;
     ref_idx            = mv->ref_idx[LX];
     pred_flag_index_l0 = LX;
@@ -703,97 +624,109 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
     // left bottom spatial candidate
     xA0 = x0 - 1;
     yA0 = y0 + nPbH;
-    xA0_pu = xA0 >> s->ps.sps->log2_min_pu_size;
-    yA0_pu = yA0 >> s->ps.sps->log2_min_pu_size;
 
-    is_available_a0 = PRED_BLOCK_AVAILABLE(A0) && AVAILABLE(cand_bottom_left, A0);
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
+                      yA0 < s->ps.sps->height &&
+                      PRED_BLOCK_AVAILABLE(A0);
 
     //left spatial merge candidate
     xA1    = x0 - 1;
     yA1    = y0 + nPbH - 1;
-    xA1_pu = xA1 >> s->ps.sps->log2_min_pu_size;
-    yA1_pu = yA1 >> s->ps.sps->log2_min_pu_size;
 
     is_available_a1 = AVAILABLE(cand_left, A1);
     if (is_available_a0 || is_available_a1)
         isScaledFlag_L0 = 1;
 
     if (is_available_a0) {
-        availableFlagLXA0 = MP_MX(A0, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX(A0, pred_flag_index_l1, mxA);
-    }
-
-    if (is_available_a1 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX(A1, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX(A1, pred_flag_index_l1, mxA);
+        if (MP_MX(A0, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX(A0, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (is_available_a0 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l1, mxA);
+    if (is_available_a1) {
+        if (MP_MX(A1, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX(A1, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (is_available_a1 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l1, mxA);
+    if (is_available_a0) {
+        if (MP_MX_LT(A0, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX_LT(A0, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (availableFlagLXA0 && !mvp_lx_flag) {
-        mv->mv[LX] = mxA;
-        return;
+    if (is_available_a1) {
+        if (MP_MX_LT(A1, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX_LT(A1, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
+    availableFlagLXA0 = 0;
 
+b_candidates:
     // B candidates
     // above right spatial merge candidate
     xB0    = x0 + nPbW;
     yB0    = y0 - 1;
-    xB0_pu = xB0 >> s->ps.sps->log2_min_pu_size;
-    yB0_pu = yB0 >> s->ps.sps->log2_min_pu_size;
 
-    is_available_b0 = PRED_BLOCK_AVAILABLE(B0) && AVAILABLE(cand_up_right, B0);
+    is_available_b0 =  AVAILABLE(cand_up_right, B0) &&
+                       xB0 < s->ps.sps->width &&
+                       PRED_BLOCK_AVAILABLE(B0);
 
-    if (is_available_b0) {
-        availableFlagLXB0 = MP_MX(B0, pred_flag_index_l0, mxB);
-        if (!availableFlagLXB0)
-            availableFlagLXB0 = MP_MX(B0, pred_flag_index_l1, mxB);
-    }
-
-    if (!availableFlagLXB0) {
-        // above spatial merge candidate
-        xB1    = x0 + nPbW - 1;
-        yB1    = y0 - 1;
-        xB1_pu = xB1 >> s->ps.sps->log2_min_pu_size;
-        yB1_pu = yB1 >> s->ps.sps->log2_min_pu_size;
+    // above spatial merge candidate
+    xB1    = x0 + nPbW - 1;
+    yB1    = y0 - 1;
+    is_available_b1 = AVAILABLE(cand_up, B1);
 
-        is_available_b1 = AVAILABLE(cand_up, B1);
+    // above left spatial merge candidate
+    xB2 = x0 - 1;
+    yB2 = y0 - 1;
+    is_available_b2 = AVAILABLE(cand_up_left, B2);
 
-        if (is_available_b1) {
-            availableFlagLXB0 = MP_MX(B1, pred_flag_index_l0, mxB);
-            if (!availableFlagLXB0)
-                availableFlagLXB0 = MP_MX(B1, pred_flag_index_l1, mxB);
+    // above right spatial merge candidate
+    if (is_available_b0) {
+        if (MP_MX(B0, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B0, pred_flag_index_l1, mxB)) {
+            goto scalef;
         }
     }
 
-    if (!availableFlagLXB0) {
-        // above left spatial merge candidate
-        xB2 = x0 - 1;
-        yB2 = y0 - 1;
-        xB2_pu = xB2 >> s->ps.sps->log2_min_pu_size;
-        yB2_pu = yB2 >> s->ps.sps->log2_min_pu_size;
-        is_available_b2 = AVAILABLE(cand_up_left, B2);
+    // above spatial merge candidate
+    if (is_available_b1) {
+        if (MP_MX(B1, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B1, pred_flag_index_l1, mxB)) {
+            goto scalef;
+        }
+    }
 
-        if (is_available_b2) {
-            availableFlagLXB0 = MP_MX(B2, pred_flag_index_l0, mxB);
-            if (!availableFlagLXB0)
-                availableFlagLXB0 = MP_MX(B2, pred_flag_index_l1, mxB);
+    // above left spatial merge candidate
+    if (is_available_b2) {
+        if (MP_MX(B2, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B2, pred_flag_index_l1, mxB)) {
+            goto scalef;
         }
     }
+    availableFlagLXB0 = 0;
 
-    if (isScaledFlag_L0 == 0) {
+scalef:
+    if (!isScaledFlag_L0) {
         if (availableFlagLXB0) {
             availableFlagLXA0 = 1;
             mxA = mxB;
@@ -837,10 +770,5 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
             mvpcand_list[numMVPCandLX++] = mv_col;
     }
 
-    // insert zero motion vectors when the number of available candidates are less than 2
-    while (numMVPCandLX < 2)
-        mvpcand_list[numMVPCandLX++] = (Mv){ 0, 0 };
-
-    mv->mv[LX].x = mvpcand_list[mvp_lx_flag].x;
-    mv->mv[LX].y = mvpcand_list[mvp_lx_flag].y;
+    mv->mv[LX] = mvpcand_list[mvp_lx_flag];
 }
diff --git a/libavcodec/hevc_parse.c b/libavcodec/hevc_parse.c
new file mode 100644
index 0000000..dddb293
--- /dev/null
+++ b/libavcodec/hevc_parse.c
@@ -0,0 +1,143 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "bytestream.h"
+#include "h2645_parse.h"
+#include "hevc.h"
+#include "hevc_parse.h"
+
+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCParamSets *ps,
+                                 HEVCSEI *sei, int is_nalff, int nal_length_size,
+                                 int err_recognition, int apply_defdispwin, void *logctx)
+{
+    int i;
+    int ret = 0;
+    H2645Packet pkt = { 0 };
+
+    ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
+                                nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
+    if (ret < 0) {
+        goto done;
+    }
+
+    for (i = 0; i < pkt.nb_nals; i++) {
+        H2645NAL *nal = &pkt.nals[i];
+
+        /* ignore everything except parameter sets and VCL NALUs */
+        switch (nal->type) {
+        case HEVC_NAL_VPS:
+            ret = ff_hevc_decode_nal_vps(&nal->gb, logctx, ps);
+            if (ret < 0)
+                goto done;
+            break;
+        case HEVC_NAL_SPS:
+            ret = ff_hevc_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
+            if (ret < 0)
+                goto done;
+            break;
+        case HEVC_NAL_PPS:
+            ret = ff_hevc_decode_nal_pps(&nal->gb, logctx, ps);
+            if (ret < 0)
+                goto done;
+            break;
+        case HEVC_NAL_SEI_PREFIX:
+        case HEVC_NAL_SEI_SUFFIX:
+            ret = ff_hevc_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
+            if (ret < 0)
+                goto done;
+            break;
+        default:
+            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
+            break;
+        }
+    }
+
+done:
+    ff_h2645_packet_uninit(&pkt);
+    if (err_recognition & AV_EF_EXPLODE)
+        return ret;
+
+    return 0;
+}
+
+int ff_hevc_decode_extradata(const uint8_t *data, int size, HEVCParamSets *ps,
+                             HEVCSEI *sei, int *is_nalff, int *nal_length_size,
+                             int err_recognition, int apply_defdispwin, void *logctx)
+{
+    int ret = 0;
+    GetByteContext gb;
+
+    bytestream2_init(&gb, data, size);
+
+    if (size > 3 && (data[0] || data[1] || data[2] > 1)) {
+        /* It seems the extradata is encoded as hvcC format.
+         * Temporarily, we support configurationVersion==0 until 14496-15 3rd
+         * is finalized. When finalized, configurationVersion will be 1 and we
+         * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
+        int i, j, num_arrays, nal_len_size;
+
+        *is_nalff = 1;
+
+        bytestream2_skip(&gb, 21);
+        nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
+        num_arrays   = bytestream2_get_byte(&gb);
+
+        /* nal units in the hvcC always have length coded with 2 bytes,
+         * so put a fake nal_length_size = 2 while parsing them */
+        *nal_length_size = 2;
+
+        /* Decode nal units from hvcC. */
+        for (i = 0; i < num_arrays; i++) {
+            int type = bytestream2_get_byte(&gb) & 0x3f;
+            int cnt  = bytestream2_get_be16(&gb);
+
+            for (j = 0; j < cnt; j++) {
+                // +2 for the nal size field
+                int nalsize = bytestream2_peek_be16(&gb) + 2;
+                if (bytestream2_get_bytes_left(&gb) < nalsize) {
+                    av_log(logctx, AV_LOG_ERROR,
+                           "Invalid NAL unit size in extradata.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
+                                            *nal_length_size, err_recognition, apply_defdispwin,
+                                            logctx);
+                if (ret < 0) {
+                    av_log(logctx, AV_LOG_ERROR,
+                           "Decoding nal unit %d %d from hvcC failed\n",
+                           type, i);
+                    return ret;
+                }
+                bytestream2_skip(&gb, nalsize);
+            }
+        }
+
+        /* Now store right nal length size, that will be used to parse
+         * all other nals */
+        *nal_length_size = nal_len_size;
+    } else {
+        *is_nalff = 0;
+        ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
+                                    err_recognition, apply_defdispwin, logctx);
+        if (ret < 0)
+            return ret;
+    }
+
+    return ret;
+}
diff --git a/libavcodec/hevc_parse.h b/libavcodec/hevc_parse.h
new file mode 100644
index 0000000..4ab96ab
--- /dev/null
+++ b/libavcodec/hevc_parse.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.265 parser code
+ */
+
+#ifndef AVCODEC_HEVC_PARSE_H
+#define AVCODEC_HEVC_PARSE_H
+
+#include <stdint.h>
+
+#include "hevc_ps.h"
+#include "hevc_sei.h"
+
+int ff_hevc_decode_extradata(const uint8_t *data, int size, HEVCParamSets *ps,
+                             HEVCSEI *sei, int *is_nalff, int *nal_length_size,
+                             int err_recognition, int apply_defdispwin, void *logctx);
+
+#endif /* AVCODEC_HEVC_PARSE_H */
diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
index 5129e3a..b444b99 100644
--- a/libavcodec/hevc_parser.c
+++ b/libavcodec/hevc_parser.c
@@ -3,103 +3,221 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
 
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "hevc.h"
-#include "hevcdec.h"
+#include "hevc_parse.h"
+#include "hevc_ps.h"
+#include "hevc_sei.h"
 #include "h2645_parse.h"
+#include "internal.h"
 #include "parser.h"
 
 #define START_CODE 0x000001 ///< start_code_prefix_one_3bytes
 
 #define IS_IRAP_NAL(nal) (nal->type >= 16 && nal->type <= 23)
+#define IS_IDR_NAL(nal) (nal->type == HEVC_NAL_IDR_W_RADL || nal->type == HEVC_NAL_IDR_N_LP)
 
 typedef struct HEVCParserContext {
     ParseContext pc;
 
     H2645Packet pkt;
     HEVCParamSets ps;
+    HEVCSEI sei;
+    SliceHeader sh;
 
+    int is_avc;
+    int nal_length_size;
     int parsed_extradata;
+
+    int poc;
+    int pocTid0;
 } HEVCParserContext;
 
 static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
                                    AVCodecContext *avctx)
 {
     HEVCParserContext *ctx = s->priv_data;
+    HEVCParamSets *ps = &ctx->ps;
+    HEVCSEI *sei = &ctx->sei;
+    SliceHeader *sh = &ctx->sh;
     GetBitContext *gb = &nal->gb;
+    const HEVCWindow *ow;
+    int i, num = 0, den = 0;
 
-    HEVCPPS *pps;
-    HEVCSPS *sps;
-    HEVCWindow *ow;
-    unsigned int pps_id;
+    sh->first_slice_in_pic_flag = get_bits1(gb);
+    s->picture_structure = sei->picture_timing.picture_struct;
+    s->field_order = sei->picture_timing.picture_struct;
 
-    get_bits1(gb);          // first slice in pic
-    if (IS_IRAP_NAL(nal))
-        get_bits1(gb);      // no output of prior pics
+    if (IS_IRAP_NAL(nal)) {
+        s->key_frame = 1;
+        sh->no_output_of_prior_pics_flag = get_bits1(gb);
+    }
 
-    pps_id = get_ue_golomb_long(gb);
-    if (pps_id >= HEVC_MAX_PPS_COUNT || !ctx->ps.pps_list[pps_id]) {
-        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id);
+    sh->pps_id = get_ue_golomb(gb);
+    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !ps->pps_list[sh->pps_id]) {
+        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
         return AVERROR_INVALIDDATA;
     }
-    pps = (HEVCPPS*)ctx->ps.pps_list[pps_id]->data;
-    sps = (HEVCSPS*)ctx->ps.sps_list[pps->sps_id]->data;
-    ow  = &sps->output_window;
+    ps->pps = (HEVCPPS*)ps->pps_list[sh->pps_id]->data;
 
-    /* export the stream parameters */
-    s->coded_width  = sps->width;
-    s->coded_height = sps->height;
-    s->width        = sps->width  - ow->left_offset - ow->right_offset;
-    s->height       = sps->height - ow->top_offset  - ow->bottom_offset;
-    s->format       = sps->pix_fmt;
-    avctx->profile  = sps->ptl.general_ptl.profile_idc;
-    avctx->level    = sps->ptl.general_ptl.level_idc;
+    if (ps->pps->sps_id >= HEVC_MAX_SPS_COUNT || !ps->sps_list[ps->pps->sps_id]) {
+        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", ps->pps->sps_id);
+        return AVERROR_INVALIDDATA;
+    }
+    if (ps->sps != (HEVCSPS*)ps->sps_list[ps->pps->sps_id]->data) {
+        ps->sps = (HEVCSPS*)ps->sps_list[ps->pps->sps_id]->data;
+        ps->vps = (HEVCVPS*)ps->vps_list[ps->sps->vps_id]->data;
+    }
+    ow  = &ps->sps->output_window;
+
+    s->coded_width  = ps->sps->width;
+    s->coded_height = ps->sps->height;
+    s->width        = ps->sps->width  - ow->left_offset - ow->right_offset;
+    s->height       = ps->sps->height - ow->top_offset  - ow->bottom_offset;
+    s->format       = ps->sps->pix_fmt;
+    avctx->profile  = ps->sps->ptl.general_ptl.profile_idc;
+    avctx->level    = ps->sps->ptl.general_ptl.level_idc;
+
+    if (ps->vps->vps_timing_info_present_flag) {
+        num = ps->vps->vps_num_units_in_tick;
+        den = ps->vps->vps_time_scale;
+    } else if (ps->sps->vui.vui_timing_info_present_flag) {
+        num = ps->sps->vui.vui_num_units_in_tick;
+        den = ps->sps->vui.vui_time_scale;
+    }
 
-    /* ignore the rest for now*/
+    if (num != 0 && den != 0)
+        av_reduce(&avctx->framerate.den, &avctx->framerate.num,
+                  num, den, 1 << 30);
+
+    if (!sh->first_slice_in_pic_flag) {
+        int slice_address_length;
+
+        if (ps->pps->dependent_slice_segments_enabled_flag)
+            sh->dependent_slice_segment_flag = get_bits1(gb);
+        else
+            sh->dependent_slice_segment_flag = 0;
+
+        slice_address_length = av_ceil_log2_c(ps->sps->ctb_width *
+                                              ps->sps->ctb_height);
+        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
+        if (sh->slice_segment_addr >= ps->sps->ctb_width * ps->sps->ctb_height) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid slice segment address: %u.\n",
+                   sh->slice_segment_addr);
+            return AVERROR_INVALIDDATA;
+        }
+    } else
+        sh->dependent_slice_segment_flag = 0;
 
-    return 0;
+    if (sh->dependent_slice_segment_flag)
+        return 0; /* break; */
+
+    for (i = 0; i < ps->pps->num_extra_slice_header_bits; i++)
+        skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
+
+    sh->slice_type = get_ue_golomb(gb);
+    if (!(sh->slice_type == HEVC_SLICE_I || sh->slice_type == HEVC_SLICE_P ||
+          sh->slice_type == HEVC_SLICE_B)) {
+        av_log(avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
+               sh->slice_type);
+        return AVERROR_INVALIDDATA;
+    }
+    s->pict_type = sh->slice_type == HEVC_SLICE_B ? AV_PICTURE_TYPE_B :
+                   sh->slice_type == HEVC_SLICE_P ? AV_PICTURE_TYPE_P :
+                                               AV_PICTURE_TYPE_I;
+
+    if (ps->pps->output_flag_present_flag)
+        sh->pic_output_flag = get_bits1(gb);
+
+    if (ps->sps->separate_colour_plane_flag)
+        sh->colour_plane_id = get_bits(gb, 2);
+
+    if (!IS_IDR_NAL(nal)) {
+        sh->pic_order_cnt_lsb = get_bits(gb, ps->sps->log2_max_poc_lsb);
+        s->output_picture_number = ctx->poc = ff_hevc_compute_poc(ps->sps, ctx->pocTid0, sh->pic_order_cnt_lsb, nal->type);
+    } else
+        s->output_picture_number = ctx->poc = 0;
+
+    if (nal->temporal_id == 0 &&
+        nal->type != HEVC_NAL_TRAIL_N &&
+        nal->type != HEVC_NAL_TSA_N &&
+        nal->type != HEVC_NAL_STSA_N &&
+        nal->type != HEVC_NAL_RADL_N &&
+        nal->type != HEVC_NAL_RASL_N &&
+        nal->type != HEVC_NAL_RADL_R &&
+        nal->type != HEVC_NAL_RASL_R)
+        ctx->pocTid0 = ctx->poc;
+
+    return 1; /* no need to evaluate the rest */
 }
 
+/**
+ * Parse NAL units of found picture and decode some basic information.
+ *
+ * @param s parser context.
+ * @param avctx codec context.
+ * @param buf buffer with field/frame data.
+ * @param buf_size size of the buffer.
+ */
 static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
                            int buf_size, AVCodecContext *avctx)
 {
     HEVCParserContext *ctx = s->priv_data;
+    HEVCParamSets *ps = &ctx->ps;
+    HEVCSEI *sei = &ctx->sei;
     int ret, i;
 
-    ret = ff_h2645_packet_split(&ctx->pkt, buf, buf_size, avctx, 0, 0,
-                                AV_CODEC_ID_HEVC);
+    /* set some sane default values */
+    s->pict_type         = AV_PICTURE_TYPE_I;
+    s->key_frame         = 0;
+    s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
+
+    ff_hevc_reset_sei(sei);
+
+    ret = ff_h2645_packet_split(&ctx->pkt, buf, buf_size, avctx, ctx->is_avc,
+                                ctx->nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
     if (ret < 0)
         return ret;
 
     for (i = 0; i < ctx->pkt.nb_nals; i++) {
         H2645NAL *nal = &ctx->pkt.nals[i];
+        GetBitContext *gb = &nal->gb;
 
-        /* ignore everything except parameter sets and VCL NALUs */
         switch (nal->type) {
-        case HEVC_NAL_VPS: ff_hevc_decode_nal_vps(&nal->gb, avctx, &ctx->ps);    break;
-        case HEVC_NAL_SPS: ff_hevc_decode_nal_sps(&nal->gb, avctx, &ctx->ps, 1); break;
-        case HEVC_NAL_PPS: ff_hevc_decode_nal_pps(&nal->gb, avctx, &ctx->ps);    break;
-        case HEVC_NAL_TRAIL_R:
+        case HEVC_NAL_VPS:
+            ff_hevc_decode_nal_vps(gb, avctx, ps);
+            break;
+        case HEVC_NAL_SPS:
+            ff_hevc_decode_nal_sps(gb, avctx, ps, 1);
+            break;
+        case HEVC_NAL_PPS:
+            ff_hevc_decode_nal_pps(gb, avctx, ps);
+            break;
+        case HEVC_NAL_SEI_PREFIX:
+        case HEVC_NAL_SEI_SUFFIX:
+            ff_hevc_decode_nal_sei(gb, avctx, sei, ps, nal->type);
+            break;
         case HEVC_NAL_TRAIL_N:
+        case HEVC_NAL_TRAIL_R:
         case HEVC_NAL_TSA_N:
         case HEVC_NAL_TSA_R:
         case HEVC_NAL_STSA_N:
@@ -113,11 +231,16 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
         case HEVC_NAL_RADL_N:
         case HEVC_NAL_RADL_R:
         case HEVC_NAL_RASL_N:
-        case HEVC_NAL_RASL_R: hevc_parse_slice_header(s, nal, avctx); break;
+        case HEVC_NAL_RASL_R:
+            ret = hevc_parse_slice_header(s, nal, avctx);
+            if (ret)
+                return ret;
+            break;
         }
     }
-
-    return 0;
+    /* didn't find a picture! */
+    av_log(avctx, AV_LOG_ERROR, "missing picture in access unit with size %d\n", buf_size);
+    return -1;
 }
 
 /**
@@ -141,7 +264,7 @@ static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
 
         nut = (pc->state64 >> 2 * 8 + 1) & 0x3F;
         // Beginning of access unit
-        if ((nut >= HEVC_NAL_VPS && nut <= HEVC_NAL_AUD) || nut == HEVC_NAL_SEI_PREFIX ||
+        if ((nut >= HEVC_NAL_VPS && nut <= HEVC_NAL_EOB_NUT) || nut == HEVC_NAL_SEI_PREFIX ||
             (nut >= 41 && nut <= 44) || (nut >= 48 && nut <= 55)) {
             if (pc->frame_start_found) {
                 pc->frame_start_found = 0;
@@ -153,7 +276,6 @@ static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
             if (first_slice_segment_in_pic_flag) {
                 if (!pc->frame_start_found) {
                     pc->frame_start_found = 1;
-                    s->key_frame = nut >= HEVC_NAL_BLA_W_LP && nut <= HEVC_NAL_CRA_NUT;
                 } else { // First slice of next frame found
                     pc->frame_start_found = 0;
                     return i - 5;
@@ -170,12 +292,15 @@ static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                       const uint8_t *buf, int buf_size)
 {
     int next;
-
     HEVCParserContext *ctx = s->priv_data;
     ParseContext *pc = &ctx->pc;
+    int is_dummy_buf = !buf_size;
+    const uint8_t *dummy_buf = buf;
 
     if (avctx->extradata && !ctx->parsed_extradata) {
-        parse_nal_units(s, avctx->extradata, avctx->extradata_size, avctx);
+        ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, &ctx->ps, &ctx->sei,
+                                 &ctx->is_avc, &ctx->nal_length_size, avctx->err_recognition,
+                                 1, avctx);
         ctx->parsed_extradata = 1;
     }
 
@@ -190,7 +315,10 @@ static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         }
     }
 
-    parse_nal_units(s, buf, buf_size, avctx);
+    is_dummy_buf &= (dummy_buf == buf);
+
+    if (!is_dummy_buf)
+        parse_nal_units(s, buf, buf_size, avctx);
 
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
@@ -200,20 +328,31 @@ static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 // Split after the parameter sets at the beginning of the stream if they exist.
 static int hevc_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 {
-    int i;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
     uint32_t state = -1;
-    int has_ps = 0;
-
-    for (i = 0; i < buf_size; i++) {
-        state = (state << 8) | buf[i];
-        if (((state >> 8) & 0xFFFFFF) == START_CODE) {
-            int nut = (state >> 1) & 0x3F;
-            if (nut >= HEVC_NAL_VPS && nut <= HEVC_NAL_PPS)
-                has_ps = 1;
-            else if (has_ps)
-                return i - 3;
-            else // no parameter set at the beginning of the stream
-                return 0;
+    int has_vps = 0;
+    int has_sps = 0;
+    int has_pps = 0;
+    int nut;
+
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if ((state >> 8) != START_CODE)
+            break;
+        nut = (state >> 1) & 0x3F;
+        if (nut == HEVC_NAL_VPS)
+            has_vps = 1;
+        else if (nut == HEVC_NAL_SPS)
+            has_sps = 1;
+        else if (nut == HEVC_NAL_PPS)
+            has_pps = 1;
+        else if ((nut != HEVC_NAL_SEI_PREFIX || has_pps) &&
+                  nut != HEVC_NAL_AUD) {
+            if (has_vps && has_sps) {
+                while (ptr - 4 > buf && ptr[-5] == 0)
+                    ptr--;
+                return ptr - 4 - buf;
+            }
         }
     }
     return 0;
@@ -222,16 +361,10 @@ static int hevc_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 static void hevc_parser_close(AVCodecParserContext *s)
 {
     HEVCParserContext *ctx = s->priv_data;
-    int i;
-
-    for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.vps_list); i++)
-        av_buffer_unref(&ctx->ps.vps_list[i]);
-    for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.sps_list); i++)
-        av_buffer_unref(&ctx->ps.sps_list[i]);
-    for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.pps_list); i++)
-        av_buffer_unref(&ctx->ps.pps_list[i]);
 
+    ff_hevc_ps_uninit(&ctx->ps);
     ff_h2645_packet_uninit(&ctx->pkt);
+    ff_hevc_reset_sei(&ctx->sei);
 
     av_freep(&ctx->pc.buffer);
 }
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 2603e6d..80df417 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -6,26 +6,25 @@
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2013 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
-
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "hevc_data.h"
 #include "hevc_ps.h"
 
@@ -71,6 +70,14 @@ static const AVRational vui_sar[] = {
     {  2,   1 },
 };
 
+static const uint8_t hevc_sub_width_c[] = {
+    1, 2, 2, 1
+};
+
+static const uint8_t hevc_sub_height_c[] = {
+    1, 2, 1, 1
+};
+
 static void remove_pps(HEVCParamSets *s, int id)
 {
     if (s->pps_list[id] && s->pps == (const HEVCPPS*)s->pps_list[id]->data)
@@ -89,6 +96,8 @@ static void remove_sps(HEVCParamSets *s, int id)
         for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
             if (s->pps_list[i] && ((HEVCPPS*)s->pps_list[i]->data)->sps_id == id)
                 remove_pps(s, i);
+
+        av_assert0(!(s->sps_list[id] && s->sps == (HEVCSPS*)s->sps_list[id]->data));
     }
     av_buffer_unref(&s->sps_list[id]);
 }
@@ -122,7 +131,8 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 
     if (rps_predict) {
         const ShortTermRPS *rps_ridx;
-        int delta_rps, abs_delta_rps;
+        int delta_rps;
+        unsigned abs_delta_rps;
         uint8_t use_delta_flag = 0;
         uint8_t delta_rps_sign;
 
@@ -141,6 +151,12 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 
         delta_rps_sign = get_bits1(gb);
         abs_delta_rps  = get_ue_golomb_long(gb) + 1;
+        if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid value of abs_delta_rps: %d\n",
+                   abs_delta_rps);
+            return AVERROR_INVALIDDATA;
+        }
         delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
         for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
             int used = rps->used[k] = get_bits1(gb);
@@ -162,6 +178,12 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
             }
         }
 
+        if (k >= FF_ARRAY_ELEMS(rps->used)) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid num_delta_pocs: %d\n", k);
+            return AVERROR_INVALIDDATA;
+        }
+
         rps->num_delta_pocs    = k;
         rps->num_negative_pics = k0;
         // sort in increasing order (smallest first)
@@ -211,6 +233,12 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
             prev = 0;
             for (i = 0; i < rps->num_negative_pics; i++) {
                 delta_poc = get_ue_golomb_long(gb) + 1;
+                if (delta_poc < 1 || delta_poc > 32768) {
+                    av_log(avctx, AV_LOG_ERROR,
+                        "Invalid value of delta_poc: %d\n",
+                        delta_poc);
+                    return AVERROR_INVALIDDATA;
+                }
                 prev -= delta_poc;
                 rps->delta_poc[i] = prev;
                 rps->used[i]      = get_bits1(gb);
@@ -218,6 +246,12 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
             prev = 0;
             for (i = 0; i < nb_positive_pics; i++) {
                 delta_poc = get_ue_golomb_long(gb) + 1;
+                if (delta_poc < 1 || delta_poc > 32768) {
+                    av_log(avctx, AV_LOG_ERROR,
+                        "Invalid value of delta_poc: %d\n",
+                        delta_poc);
+                    return AVERROR_INVALIDDATA;
+                }
                 prev += delta_poc;
                 rps->delta_poc[rps->num_negative_pics + i] = prev;
                 rps->used[rps->num_negative_pics + i]      = get_bits1(gb);
@@ -228,11 +262,14 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 }
 
 
-static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
+static int decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
                                       PTLCommon *ptl)
 {
     int i;
 
+    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
+        return -1;
+
     ptl->profile_space = get_bits(gb, 2);
     ptl->tier_flag     = get_bits1(gb);
     ptl->profile_idc   = get_bits(gb, 5);
@@ -242,6 +279,8 @@ static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
     else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
         av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
+    else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
+        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
     else
         av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
 
@@ -259,28 +298,48 @@ static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
     skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
     skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
     skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
+
+    return 0;
 }
 
-static void parse_ptl(GetBitContext *gb, AVCodecContext *avctx,
+static int parse_ptl(GetBitContext *gb, AVCodecContext *avctx,
                       PTL *ptl, int max_num_sub_layers)
 {
     int i;
-    decode_profile_tier_level(gb, avctx, &ptl->general_ptl);
+    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
+        get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) {
+        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
+        return -1;
+    }
+
     ptl->general_ptl.level_idc = get_bits(gb, 8);
 
     for (i = 0; i < max_num_sub_layers - 1; i++) {
         ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
         ptl->sub_layer_level_present_flag[i]   = get_bits1(gb);
     }
-    if (max_num_sub_layers - 1 > 0)
+
+    if (max_num_sub_layers - 1> 0)
         for (i = max_num_sub_layers - 1; i < 8; i++)
             skip_bits(gb, 2); // reserved_zero_2bits[i]
     for (i = 0; i < max_num_sub_layers - 1; i++) {
-        if (ptl->sub_layer_profile_present_flag[i])
-            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]);
-        if (ptl->sub_layer_level_present_flag[i])
-            ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+        if (ptl->sub_layer_profile_present_flag[i] &&
+            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "PTL information for sublayer %i too short\n", i);
+            return -1;
+        }
+        if (ptl->sub_layer_level_present_flag[i]) {
+            if (get_bits_left(gb) < 8) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Not enough data for sublayer %i level_idc\n", i);
+                return -1;
+            } else
+                ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+        }
     }
+
+    return 0;
 }
 
 static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
@@ -300,7 +359,7 @@ static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
     }
 }
 
-static void decode_hrd(GetBitContext *gb, int common_inf_present,
+static int decode_hrd(GetBitContext *gb, int common_inf_present,
                        int max_sublayers)
 {
     int nal_params_present = 0, vcl_params_present = 0;
@@ -346,14 +405,20 @@ static void decode_hrd(GetBitContext *gb, int common_inf_present,
         else
             low_delay = get_bits1(gb);
 
-        if (!low_delay)
+        if (!low_delay) {
             nb_cpb = get_ue_golomb_long(gb) + 1;
+            if (nb_cpb < 1 || nb_cpb > 32) {
+                av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb);
+                return AVERROR_INVALIDDATA;
+            }
+        }
 
         if (nal_params_present)
             decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
         if (vcl_params_present)
             decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
     }
+    return 0;
 }
 
 int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
@@ -361,6 +426,7 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
 {
     int i,j;
     int vps_id = 0;
+    ptrdiff_t nal_size;
     HEVCVPS *vps;
     AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
 
@@ -370,6 +436,17 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
 
     av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
 
+    nal_size = gb->buffer_end - gb->buffer;
+    if (nal_size > sizeof(vps->data)) {
+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+               nal_size, sizeof(vps->data));
+        vps->data_size = sizeof(vps->data);
+    } else {
+        vps->data_size = nal_size;
+    }
+    memcpy(vps->data, gb->buffer, vps->data_size);
+
     vps_id = get_bits(gb, 4);
     if (vps_id >= HEVC_MAX_VPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
@@ -396,7 +473,8 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         goto err;
     }
 
-    parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers);
+    if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
+        goto err;
 
     vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
 
@@ -406,7 +484,7 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         vps->vps_num_reorder_pics[i]      = get_ue_golomb_long(gb);
         vps->vps_max_latency_increase[i]  = get_ue_golomb_long(gb) - 1;
 
-        if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE) {
+        if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
             av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
                    vps->vps_max_dec_pic_buffering[i] - 1);
             goto err;
@@ -421,6 +499,12 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
 
     vps->vps_max_layer_id   = get_bits(gb, 6);
     vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
+    if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
+        (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
+        av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
+        goto err;
+    }
+
     for (i = 1; i < vps->vps_num_layer_sets; i++)
         for (j = 0; j <= vps->vps_max_layer_id; j++)
             skip_bits(gb, 1);  // layer_id_included_flag[i][j]
@@ -433,6 +517,11 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         if (vps->vps_poc_proportional_to_timing_flag)
             vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
         vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
+        if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
+            goto err;
+        }
         for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
             int common_inf_present = 1;
 
@@ -444,6 +533,13 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
     }
     get_bits1(gb); /* vps_extension_flag */
 
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread VPS by %d bits\n", -get_bits_left(gb));
+        if (ps->vps_list[vps_id])
+            goto err;
+    }
+
     if (ps->vps_list[vps_id] &&
         !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
         av_buffer_unref(&vps_buf);
@@ -462,8 +558,9 @@ err:
 static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
                        int apply_defdispwin, HEVCSPS *sps)
 {
-    VUI *vui          = &sps->vui;
-    int sar_present;
+    VUI backup_vui, *vui = &sps->vui;
+    GetBitContext backup;
+    int sar_present, alt = 0;
 
     av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
 
@@ -503,6 +600,19 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
                 vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
             if (!av_color_space_name(vui->matrix_coeffs))
                 vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
+            if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
+                switch (sps->pix_fmt) {
+                case AV_PIX_FMT_YUV444P:
+                    sps->pix_fmt = AV_PIX_FMT_GBRP;
+                    break;
+                case AV_PIX_FMT_YUV444P10:
+                    sps->pix_fmt = AV_PIX_FMT_GBRP10;
+                    break;
+                case AV_PIX_FMT_YUV444P12:
+                    sps->pix_fmt = AV_PIX_FMT_GBRP12;
+                    break;
+                }
+            }
         }
     }
 
@@ -516,13 +626,22 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
     vui->field_seq_flag                = get_bits1(gb);
     vui->frame_field_info_present_flag = get_bits1(gb);
 
-    vui->default_display_window_flag = get_bits1(gb);
+    // Backup context in case an alternate header is detected
+    memcpy(&backup, gb, sizeof(backup));
+    memcpy(&backup_vui, vui, sizeof(backup_vui));
+    if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
+        vui->default_display_window_flag = 0;
+        av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
+    } else
+        vui->default_display_window_flag = get_bits1(gb);
+
     if (vui->default_display_window_flag) {
-        //TODO: * 2 is only valid for 420
-        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * 2;
-        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * 2;
-        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) * 2;
-        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * 2;
+        int vert_mult  = hevc_sub_height_c[sps->chroma_format_idc];
+        int horiz_mult = hevc_sub_width_c[sps->chroma_format_idc];
+        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
+        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
+        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
+        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
 
         if (apply_defdispwin &&
             avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
@@ -541,10 +660,26 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
         }
     }
 
+timing_info:
     vui->vui_timing_info_present_flag = get_bits1(gb);
+
     if (vui->vui_timing_info_present_flag) {
+        if( get_bits_left(gb) < 66 && !alt) {
+            // The alternate syntax seem to have timing info located
+            // at where def_disp_win is normally located
+            av_log(avctx, AV_LOG_WARNING,
+                   "Strange VUI timing information, retrying...\n");
+            memcpy(vui, &backup_vui, sizeof(backup_vui));
+            memcpy(gb, &backup, sizeof(backup));
+            alt = 1;
+            goto timing_info;
+        }
         vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
         vui->vui_time_scale                      = get_bits_long(gb, 32);
+        if (alt) {
+            av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
+                   vui->vui_time_scale, vui->vui_num_units_in_tick);
+        }
         vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
         if (vui->vui_poc_proportional_to_timing_flag)
             vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
@@ -555,6 +690,15 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
 
     vui->bitstream_restriction_flag = get_bits1(gb);
     if (vui->bitstream_restriction_flag) {
+        if (get_bits_left(gb) < 8 && !alt) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Strange VUI bitstream restriction information, retrying"
+                   " from timing information...\n");
+            memcpy(vui, &backup_vui, sizeof(backup_vui));
+            memcpy(gb, &backup, sizeof(backup));
+            alt = 1;
+            goto timing_info;
+        }
         vui->tiles_fixed_structure_flag              = get_bits1(gb);
         vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
         vui->restricted_ref_pic_lists_flag           = get_bits1(gb);
@@ -564,6 +708,16 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
         vui->log2_max_mv_length_horizontal           = get_ue_golomb_long(gb);
         vui->log2_max_mv_length_vertical             = get_ue_golomb_long(gb);
     }
+
+    if (get_bits_left(gb) < 1 && !alt) {
+        // XXX: Alternate syntax when sps_range_extension_flag != 0?
+        av_log(avctx, AV_LOG_WARNING,
+               "Overread in VUI, retrying from timing information...\n");
+        memcpy(vui, &backup_vui, sizeof(backup_vui));
+        memcpy(gb, &backup, sizeof(backup));
+        alt = 1;
+        goto timing_info;
+    }
 }
 
 static void set_default_scaling_list_data(ScalingList *sl)
@@ -589,24 +743,30 @@ static void set_default_scaling_list_data(ScalingList *sl)
     memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
     memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
     memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
-    memcpy(sl->sl[3][1], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
 }
 
-static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl)
+static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl, HEVCSPS *sps)
 {
-    uint8_t scaling_list_pred_mode_flag[4][6];
+    uint8_t scaling_list_pred_mode_flag;
     int32_t scaling_list_dc_coef[2][6];
-    int size_id, matrix_id, i, pos;
+    int size_id, matrix_id, pos;
+    int i;
 
     for (size_id = 0; size_id < 4; size_id++)
-        for (matrix_id = 0; matrix_id < (size_id == 3 ? 2 : 6); matrix_id++) {
-            scaling_list_pred_mode_flag[size_id][matrix_id] = get_bits1(gb);
-            if (!scaling_list_pred_mode_flag[size_id][matrix_id]) {
+        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
+            scaling_list_pred_mode_flag = get_bits1(gb);
+            if (!scaling_list_pred_mode_flag) {
                 unsigned int delta = get_ue_golomb_long(gb);
                 /* Only need to handle non-zero delta. Zero means default,
                  * which should already be in the arrays. */
                 if (delta) {
                     // Copy from previous array.
+                    delta *= (size_id == 3) ? 3 : 1;
                     if (matrix_id < delta) {
                         av_log(avctx, AV_LOG_ERROR,
                                "Invalid delta in scaling list data: %d.\n", delta);
@@ -639,31 +799,63 @@ static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingLi
                                   ff_hevc_diag_scan8x8_x[i];
 
                     scaling_list_delta_coef = get_se_golomb(gb);
-                    next_coef = (next_coef + scaling_list_delta_coef + 256) % 256;
+                    next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
                     sl->sl[size_id][matrix_id][pos] = next_coef;
                 }
             }
         }
 
+    if (sps->chroma_format_idc == 3) {
+        for (i = 0; i < 64; i++) {
+            sl->sl[3][1][i] = sl->sl[2][1][i];
+            sl->sl[3][2][i] = sl->sl[2][2][i];
+            sl->sl[3][4][i] = sl->sl[2][4][i];
+            sl->sl[3][5][i] = sl->sl[2][5][i];
+        }
+        sl->sl_dc[1][1] = sl->sl_dc[0][1];
+        sl->sl_dc[1][2] = sl->sl_dc[0][2];
+        sl->sl_dc[1][4] = sl->sl_dc[0][4];
+        sl->sl_dc[1][5] = sl->sl_dc[0][5];
+    }
+
+
     return 0;
 }
 
 static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
 {
     const AVPixFmtDescriptor *desc;
-    if (sps->chroma_format_idc == 1) {
-        switch (sps->bit_depth) {
-        case 8:  sps->pix_fmt = AV_PIX_FMT_YUV420P;   break;
-        case 9:  sps->pix_fmt = AV_PIX_FMT_YUV420P9;  break;
-        case 10: sps->pix_fmt = AV_PIX_FMT_YUV420P10; break;
-        default:
-            avpriv_report_missing_feature(avctx, "Bit depth %d",
-                                          sps->bit_depth);
-            return AVERROR_PATCHWELCOME;
-        }
-    } else {
-        avpriv_report_missing_feature(avctx, "Non-4:2:0 support");
-        return AVERROR_PATCHWELCOME;
+    switch (sps->bit_depth) {
+    case 8:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
+       break;
+    case 9:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY9;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P9;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P9;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P9;
+        break;
+    case 10:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY10;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
+        break;
+    case 12:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY12;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P12;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P12;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P12;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR,
+               "The following bit-depths are currently specified: 8, 9, 10 and 12 bits, "
+               "chroma_format_idc is %d, depth is %d\n",
+               sps->chroma_format_idc, sps->bit_depth);
+        return AVERROR_INVALIDDATA;
     }
 
     desc = av_pix_fmt_desc_get(sps->pix_fmt);
@@ -693,59 +885,58 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->vps_id = get_bits(gb, 4);
     if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     if (vps_list && !vps_list[sps->vps_id]) {
         av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
                sps->vps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sps->max_sub_layers = get_bits(gb, 3) + 1;
     if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
         av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
                sps->max_sub_layers);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    skip_bits1(gb); // temporal_id_nesting_flag
+    sps->temporal_id_nesting_flag = get_bits(gb, 1);
 
-    parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers);
+    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
+        return ret;
 
     *sps_id = get_ue_golomb_long(gb);
     if (*sps_id >= HEVC_MAX_SPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sps->chroma_format_idc = get_ue_golomb_long(gb);
-    if (sps->chroma_format_idc != 1) {
-        avpriv_report_missing_feature(avctx, "chroma_format_idc %d",
-                                      sps->chroma_format_idc);
-        ret = AVERROR_PATCHWELCOME;
-        goto err;
+    if (sps->chroma_format_idc > 3U) {
+        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
+        return AVERROR_INVALIDDATA;
     }
 
     if (sps->chroma_format_idc == 3)
         sps->separate_colour_plane_flag = get_bits1(gb);
 
+    if (sps->separate_colour_plane_flag)
+        sps->chroma_format_idc = 0;
+
     sps->width  = get_ue_golomb_long(gb);
     sps->height = get_ue_golomb_long(gb);
     if ((ret = av_image_check_size(sps->width,
                                    sps->height, 0, avctx)) < 0)
-        goto err;
+        return ret;
 
     if (get_bits1(gb)) { // pic_conformance_flag
-        //TODO: * 2 is only valid for 420
-        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * 2;
-        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * 2;
-        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) * 2;
-        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * 2;
+        int vert_mult  = hevc_sub_height_c[sps->chroma_format_idc];
+        int horiz_mult = hevc_sub_width_c[sps->chroma_format_idc];
+        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
+        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
+        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
+        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
 
         if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
             av_log(avctx, AV_LOG_DEBUG,
@@ -766,26 +957,24 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
 
     sps->bit_depth   = get_ue_golomb_long(gb) + 8;
     bit_depth_chroma = get_ue_golomb_long(gb) + 8;
-    if (bit_depth_chroma != sps->bit_depth) {
+    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
         av_log(avctx, AV_LOG_ERROR,
                "Luma bit depth (%d) is different from chroma bit depth (%d), "
                "this is unsupported.\n",
                sps->bit_depth, bit_depth_chroma);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
-
+    sps->bit_depth_chroma = bit_depth_chroma;
 
     ret = map_pixel_format(avctx, sps);
     if (ret < 0)
-        goto err;
+        return ret;
 
     sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
     if (sps->log2_max_poc_lsb > 16) {
         av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
                sps->log2_max_poc_lsb - 4);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sublayer_ordering_info = get_bits1(gb);
@@ -794,19 +983,17 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
         sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb_long(gb);
         sps->temporal_layer[i].max_latency_increase  = get_ue_golomb_long(gb) - 1;
-        if (sps->temporal_layer[i].max_dec_pic_buffering > HEVC_MAX_DPB_SIZE) {
+        if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
             av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
-                   sps->temporal_layer[i].max_dec_pic_buffering - 1);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+                   sps->temporal_layer[i].max_dec_pic_buffering - 1U);
+            return AVERROR_INVALIDDATA;
         }
         if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
             av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
                    sps->temporal_layer[i].num_reorder_pics);
             if (avctx->err_recognition & AV_EF_EXPLODE ||
                 sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
-                ret = AVERROR_INVALIDDATA;
-                goto err;
+                return AVERROR_INVALIDDATA;
             }
             sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
         }
@@ -827,11 +1014,26 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
                                                sps->log2_min_tb_size;
 
-    if (sps->log2_min_tb_size >= sps->log2_min_cb_size) {
+    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sps->log2_diff_max_min_coding_block_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
         av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size);
+        return AVERROR_INVALIDDATA;
     }
+
     sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
     sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
 
@@ -840,9 +1042,9 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         set_default_scaling_list_data(&sps->scaling_list);
 
         if (get_bits1(gb)) {
-            ret = scaling_list_data(gb, avctx, &sps->scaling_list);
+            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
             if (ret < 0)
-                goto err;
+                return ret;
         }
     }
 
@@ -856,12 +1058,11 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
         sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
                                         get_ue_golomb_long(gb);
-        if (sps->pcm.bit_depth > sps->bit_depth) {
+        if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
             av_log(avctx, AV_LOG_ERROR,
-                   "PCM bit depth (%d) is greater than normal bit depth (%d)\n",
-                   sps->pcm.bit_depth, sps->bit_depth);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+                   "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
+                   sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
+            return AVERROR_INVALIDDATA;
         }
 
         sps->pcm.loop_filter_disable_flag = get_bits1(gb);
@@ -871,13 +1072,12 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
         av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
                sps->nb_st_rps);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     for (i = 0; i < sps->nb_st_rps; i++) {
         if ((ret = ff_hevc_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
                                                  sps, 0)) < 0)
-            goto err;
+            return ret;
     }
 
     sps->long_term_ref_pics_present_flag = get_bits1(gb);
@@ -886,8 +1086,7 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
             av_log(avctx, AV_LOG_ERROR, "Too many long term ref pics: %d.\n",
                    sps->num_long_term_ref_pics_sps);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
         for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
             sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
@@ -901,8 +1100,36 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     vui_present = get_bits1(gb);
     if (vui_present)
         decode_vui(gb, avctx, apply_defdispwin, sps);
-    skip_bits1(gb); // sps_extension_flag
 
+    if (get_bits1(gb)) { // sps_extension_flag
+        sps->sps_range_extension_flag = get_bits1(gb);
+        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
+        if (sps->sps_range_extension_flag) {
+            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
+            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
+            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
+
+            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
+
+            sps->extended_precision_processing_flag = get_bits1(gb);
+            if (sps->extended_precision_processing_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "extended_precision_processing_flag not yet implemented\n");
+
+            sps->intra_smoothing_disabled_flag       = get_bits1(gb);
+            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
+            if (sps->high_precision_offsets_enabled_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "high_precision_offsets_enabled_flag not yet implemented\n");
+
+            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
+
+            sps->cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
+            if (sps->cabac_bypass_alignment_enabled_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
+        }
+    }
     if (apply_defdispwin) {
         sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
         sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
@@ -918,12 +1145,12 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
                ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
         if (avctx->err_recognition & AV_EF_EXPLODE) {
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
         av_log(avctx, AV_LOG_WARNING,
                "Displaying the whole video surface.\n");
         memset(ow, 0, sizeof(*ow));
+        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
     }
 
     // Inferred parameters
@@ -931,6 +1158,19 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
                          sps->log2_diff_max_min_coding_block_size;
     sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
 
+    if (sps->log2_ctb_size > HEVC_MAX_LOG2_CTB_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
+        return AVERROR_INVALIDDATA;
+    }
+    if (sps->log2_ctb_size < 4) {
+        av_log(avctx,
+               AV_LOG_ERROR,
+               "log2_ctb_size %d differs from the bounds of any known profile\n",
+               sps->log2_ctb_size);
+        avpriv_request_sample(avctx, "log2_ctb_size %d", sps->log2_ctb_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
     sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
     sps->ctb_size   = sps->ctb_width * sps->ctb_height;
@@ -941,40 +1181,40 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
     sps->min_pu_width  = sps->width  >> sps->log2_min_pu_size;
     sps->min_pu_height = sps->height >> sps->log2_min_pu_size;
+    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
 
     sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
 
-    if (sps->width  & ((1 << sps->log2_min_cb_size) - 1) ||
-        sps->height & ((1 << sps->log2_min_cb_size) - 1)) {
+    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
+        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
         av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    if (sps->log2_ctb_size > HEVC_MAX_LOG2_CTB_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
-        goto err;
-    }
     if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
         av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
                sps->max_transform_hierarchy_depth_inter);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
         av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
                sps->max_transform_hierarchy_depth_intra);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
         av_log(avctx, AV_LOG_ERROR,
                "max transform block size out of range: %d\n",
                sps->log2_max_trafo_size);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    return 0;
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread SPS by %d bits\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
 
-err:
-    return ret < 0 ? ret : AVERROR_INVALIDDATA;
+    return 0;
 }
 
 int ff_hevc_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
@@ -984,6 +1224,7 @@ int ff_hevc_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
     AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
     unsigned int sps_id;
     int ret;
+    ptrdiff_t nal_size;
 
     if (!sps_buf)
         return AVERROR(ENOMEM);
@@ -991,6 +1232,17 @@ int ff_hevc_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
 
     av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
 
+    nal_size = gb->buffer_end - gb->buffer;
+    if (nal_size > sizeof(sps->data)) {
+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+               nal_size, sizeof(sps->data));
+        sps->data_size = sizeof(sps->data);
+    } else {
+        sps->data_size = nal_size;
+    }
+    memcpy(sps->data, gb->buffer, sps->data_size);
+
     ret = ff_hevc_parse_sps(sps, gb, &sps_id,
                             apply_defdispwin,
                             ps->vps_list, avctx);
@@ -1036,16 +1288,57 @@ static void hevc_pps_free(void *opaque, uint8_t *data)
     av_freep(&pps->ctb_addr_ts_to_rs);
     av_freep(&pps->tile_pos_rs);
     av_freep(&pps->tile_id);
-    av_freep(&pps->min_tb_addr_zs);
+    av_freep(&pps->min_tb_addr_zs_tab);
 
     av_freep(&pps);
 }
 
+static int pps_range_extensions(GetBitContext *gb, AVCodecContext *avctx,
+                                HEVCPPS *pps, HEVCSPS *sps) {
+    int i;
+
+    if (pps->transform_skip_enabled_flag) {
+        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
+    }
+    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
+    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
+    if (pps->chroma_qp_offset_list_enabled_flag) {
+        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
+        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
+        if (pps->chroma_qp_offset_list_len_minus1 > 5) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i <= pps->chroma_qp_offset_list_len_minus1; i++) {
+            pps->cb_qp_offset_list[i] = get_se_golomb_long(gb);
+            if (pps->cb_qp_offset_list[i]) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "cb_qp_offset_list not tested yet.\n");
+            }
+            pps->cr_qp_offset_list[i] = get_se_golomb_long(gb);
+            if (pps->cr_qp_offset_list[i]) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "cb_qp_offset_list not tested yet.\n");
+            }
+        }
+    }
+    pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
+    pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
+
+    if (   pps->log2_sao_offset_scale_luma   > FFMAX(sps->bit_depth        - 10, 0)
+        || pps->log2_sao_offset_scale_chroma > FFMAX(sps->bit_depth_chroma - 10, 0)
+    )
+        return AVERROR_INVALIDDATA;
+
+    return(0);
+}
+
 static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                             HEVCPPS *pps, HEVCSPS *sps)
 {
     int log2_diff;
-    int pic_area_in_ctbs, pic_area_in_min_tbs;
+    int pic_area_in_ctbs;
     int i, j, x, y, ctb_addr_rs, tile_id;
 
     // Inferred parameters
@@ -1092,14 +1385,13 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
      * 6.5
      */
     pic_area_in_ctbs     = sps->ctb_width    * sps->ctb_height;
-    pic_area_in_min_tbs  = sps->min_tb_width * sps->min_tb_height;
 
     pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
     pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
     pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
-    pps->min_tb_addr_zs    = av_malloc_array(pic_area_in_min_tbs, sizeof(*pps->min_tb_addr_zs));
+    pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab));
     if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-        !pps->tile_id || !pps->min_tb_addr_zs) {
+        !pps->tile_id || !pps->min_tb_addr_zs_tab) {
         return AVERROR(ENOMEM);
     }
 
@@ -1152,8 +1444,13 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                 pps->row_bd[j] * sps->ctb_width + pps->col_bd[i];
 
     log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size;
-    for (y = 0; y < sps->min_tb_height; y++) {
-        for (x = 0; x < sps->min_tb_width; x++) {
+    pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1];
+    for (y = 0; y < sps->tb_mask+2; y++) {
+        pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1;
+        pps->min_tb_addr_zs_tab[y]    = -1;
+    }
+    for (y = 0; y < sps->tb_mask+1; y++) {
+        for (x = 0; x < sps->tb_mask+1; x++) {
             int tb_x = x >> log2_diff;
             int tb_y = y >> log2_diff;
             int rs   = sps->ctb_width * tb_y + tb_x;
@@ -1162,7 +1459,7 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                 int m = 1 << i;
                 val += (m & x ? m * m : 0) + (m & y ? 2 * m * m : 0);
             }
-            pps->min_tb_addr_zs[y * sps->min_tb_width + x] = val;
+            pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val;
         }
     }
 
@@ -1175,6 +1472,8 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     HEVCSPS      *sps = NULL;
     int i, ret = 0;
     unsigned int pps_id = 0;
+    ptrdiff_t nal_size;
+    unsigned log2_parallel_merge_level_minus2;
 
     AVBufferRef *pps_buf;
     HEVCPPS *pps = av_mallocz(sizeof(*pps));
@@ -1191,6 +1490,17 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
 
     av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
 
+    nal_size = gb->buffer_end - gb->buffer;
+    if (nal_size > sizeof(pps->data)) {
+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
+               nal_size, sizeof(pps->data));
+        pps->data_size = sizeof(pps->data);
+    } else {
+        pps->data_size = nal_size;
+    }
+    memcpy(pps->data, gb->buffer, pps->data_size);
+
     // Default values
     pps->loop_filter_across_tiles_enabled_flag = 1;
     pps->num_tile_columns                      = 1;
@@ -1199,6 +1509,7 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     pps->disable_dbf                           = 0;
     pps->beta_offset                           = 0;
     pps->tc_offset                             = 0;
+    pps->log2_max_transform_skip_block_size    = 2;
 
     // Coded parameters
     pps_id = get_ue_golomb_long(gb);
@@ -1241,6 +1552,14 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     if (pps->cu_qp_delta_enabled_flag)
         pps->diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
 
+    if (pps->diff_cu_qp_delta_depth < 0 ||
+        pps->diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
+        av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n",
+               pps->diff_cu_qp_delta_depth);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
     pps->cb_qp_offset = get_se_golomb(gb);
     if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
         av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
@@ -1267,14 +1586,14 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     if (pps->tiles_enabled_flag) {
         pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
         pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
-        if (pps->num_tile_columns == 0 ||
+        if (pps->num_tile_columns <= 0 ||
             pps->num_tile_columns >= sps->width) {
             av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
                    pps->num_tile_columns - 1);
             ret = AVERROR_INVALIDDATA;
             goto err;
         }
-        if (pps->num_tile_rows == 0 ||
+        if (pps->num_tile_rows <= 0 ||
             pps->num_tile_rows >= sps->height) {
             av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
                    pps->num_tile_rows - 1);
@@ -1325,46 +1644,63 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
         pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
         pps->disable_dbf                             = get_bits1(gb);
         if (!pps->disable_dbf) {
-            pps->beta_offset = get_se_golomb(gb) * 2;
-            pps->tc_offset = get_se_golomb(gb) * 2;
-            if (pps->beta_offset/2 < -6 || pps->beta_offset/2 > 6) {
+            int beta_offset_div2 = get_se_golomb(gb);
+            int tc_offset_div2   = get_se_golomb(gb) ;
+            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
                 av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
-                       pps->beta_offset/2);
+                       beta_offset_div2);
                 ret = AVERROR_INVALIDDATA;
                 goto err;
             }
-            if (pps->tc_offset/2 < -6 || pps->tc_offset/2 > 6) {
+            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
                 av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
-                       pps->tc_offset/2);
+                       tc_offset_div2);
                 ret = AVERROR_INVALIDDATA;
                 goto err;
             }
+            pps->beta_offset = 2 * beta_offset_div2;
+            pps->tc_offset   = 2 *   tc_offset_div2;
         }
     }
 
     pps->scaling_list_data_present_flag = get_bits1(gb);
     if (pps->scaling_list_data_present_flag) {
         set_default_scaling_list_data(&pps->scaling_list);
-        ret = scaling_list_data(gb, avctx, &pps->scaling_list);
+        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
         if (ret < 0)
             goto err;
     }
     pps->lists_modification_present_flag = get_bits1(gb);
-    pps->log2_parallel_merge_level       = get_ue_golomb_long(gb) + 2;
-    if (pps->log2_parallel_merge_level > sps->log2_ctb_size) {
+    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
+    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
         av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n",
-               pps->log2_parallel_merge_level - 2);
+               log2_parallel_merge_level_minus2);
         ret = AVERROR_INVALIDDATA;
         goto err;
     }
+    pps->log2_parallel_merge_level       = log2_parallel_merge_level_minus2 + 2;
 
     pps->slice_header_extension_present_flag = get_bits1(gb);
-    skip_bits1(gb);     // pps_extension_flag
+
+    if (get_bits1(gb)) { // pps_extension_present_flag
+        pps->pps_range_extensions_flag = get_bits1(gb);
+        skip_bits(gb, 7); // pps_extension_7bits
+        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps->pps_range_extensions_flag) {
+            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
+                goto err;
+        }
+    }
 
     ret = setup_pps(avctx, gb, pps, sps);
     if (ret < 0)
         goto err;
 
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread PPS by %d bits\n", -get_bits_left(gb));
+        goto err;
+    }
+
     remove_pps(ps, pps_id);
     ps->pps_list[pps_id] = pps_buf;
 
@@ -1374,3 +1710,42 @@ err:
     av_buffer_unref(&pps_buf);
     return ret;
 }
+
+void ff_hevc_ps_uninit(HEVCParamSets *ps)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(ps->vps_list); i++)
+        av_buffer_unref(&ps->vps_list[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(ps->sps_list); i++)
+        av_buffer_unref(&ps->sps_list[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(ps->pps_list); i++)
+        av_buffer_unref(&ps->pps_list[i]);
+
+    ps->sps = NULL;
+    ps->pps = NULL;
+    ps->vps = NULL;
+}
+
+int ff_hevc_compute_poc(const HEVCSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
+{
+    int max_poc_lsb  = 1 << sps->log2_max_poc_lsb;
+    int prev_poc_lsb = pocTid0 % max_poc_lsb;
+    int prev_poc_msb = pocTid0 - prev_poc_lsb;
+    int poc_msb;
+
+    if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
+        poc_msb = prev_poc_msb + max_poc_lsb;
+    else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
+        poc_msb = prev_poc_msb - max_poc_lsb;
+    else
+        poc_msb = prev_poc_msb;
+
+    // For BLA picture types, POCmsb is set to 0.
+    if (nal_unit_type == HEVC_NAL_BLA_W_LP   ||
+        nal_unit_type == HEVC_NAL_BLA_W_RADL ||
+        nal_unit_type == HEVC_NAL_BLA_N_LP)
+        poc_msb = 0;
+
+    return poc_msb + poc_lsb;
+}
diff --git a/libavcodec/hevc_ps.h b/libavcodec/hevc_ps.h
index 6e2b527..bbaa920 100644
--- a/libavcodec/hevc_ps.h
+++ b/libavcodec/hevc_ps.h
@@ -1,20 +1,20 @@
 /*
  * HEVC parameter set parsing
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,89 @@ typedef struct ShortTermRPS {
     uint8_t used[32];
 } ShortTermRPS;
 
+typedef struct LongTermRPS {
+    int     poc[32];
+    uint8_t used[32];
+    uint8_t nb_refs;
+} LongTermRPS;
+
+typedef struct SliceHeader {
+    unsigned int pps_id;
+
+    ///< address (in raster order) of the first block in the current slice segment
+    unsigned int   slice_segment_addr;
+    ///< address (in raster order) of the first block in the current slice
+    unsigned int   slice_addr;
+
+    enum HEVCSliceType slice_type;
+
+    int pic_order_cnt_lsb;
+
+    uint8_t first_slice_in_pic_flag;
+    uint8_t dependent_slice_segment_flag;
+    uint8_t pic_output_flag;
+    uint8_t colour_plane_id;
+
+    ///< RPS coded in the slice header itself is stored here
+    int short_term_ref_pic_set_sps_flag;
+    int short_term_ref_pic_set_size;
+    ShortTermRPS slice_rps;
+    const ShortTermRPS *short_term_rps;
+    int long_term_ref_pic_set_size;
+    LongTermRPS long_term_rps;
+    unsigned int list_entry_lx[2][32];
+
+    uint8_t rpl_modification_flag[2];
+    uint8_t no_output_of_prior_pics_flag;
+    uint8_t slice_temporal_mvp_enabled_flag;
+
+    unsigned int nb_refs[2];
+
+    uint8_t slice_sample_adaptive_offset_flag[3];
+    uint8_t mvd_l1_zero_flag;
+
+    uint8_t cabac_init_flag;
+    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
+    uint8_t slice_loop_filter_across_slices_enabled_flag;
+    uint8_t collocated_list;
+
+    unsigned int collocated_ref_idx;
+
+    int slice_qp_delta;
+    int slice_cb_qp_offset;
+    int slice_cr_qp_offset;
+
+    uint8_t cu_chroma_qp_offset_enabled_flag;
+
+    int beta_offset;    ///< beta_offset_div2 * 2
+    int tc_offset;      ///< tc_offset_div2 * 2
+
+    unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
+
+    unsigned *entry_point_offset;
+    int * offset;
+    int * size;
+    int num_entry_point_offsets;
+
+    int8_t slice_qp;
+
+    uint8_t luma_log2_weight_denom;
+    int16_t chroma_log2_weight_denom;
+
+    int16_t luma_weight_l0[16];
+    int16_t chroma_weight_l0[16][2];
+    int16_t chroma_weight_l1[16][2];
+    int16_t luma_weight_l1[16];
+
+    int16_t luma_offset_l0[16];
+    int16_t chroma_offset_l0[16][2];
+
+    int16_t luma_offset_l1[16];
+    int16_t chroma_offset_l1[16][2];
+
+    int slice_ctb_addr_rs;
+} SliceHeader;
+
 typedef struct HEVCWindow {
     unsigned int left_offset;
     unsigned int right_offset;
@@ -127,6 +210,9 @@ typedef struct HEVCVPS {
     uint8_t vps_poc_proportional_to_timing_flag;
     int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
     int vps_num_hrd_parameters;
+
+    uint8_t data[4096];
+    int data_size;
 } HEVCVPS;
 
 typedef struct ScalingList {
@@ -137,7 +223,7 @@ typedef struct ScalingList {
 } ScalingList;
 
 typedef struct HEVCSPS {
-    int vps_id;
+    unsigned vps_id;
     int chroma_format_idc;
     uint8_t separate_colour_plane_flag;
 
@@ -146,6 +232,7 @@ typedef struct HEVCSPS {
     HEVCWindow pic_conf_win;
 
     int bit_depth;
+    int bit_depth_chroma;
     int pixel_shift;
     enum AVPixelFormat pix_fmt;
 
@@ -158,6 +245,7 @@ typedef struct HEVCSPS {
         int num_reorder_pics;
         int max_latency_increase;
     } temporal_layer[HEVC_MAX_SUB_LAYERS];
+    uint8_t temporal_id_nesting_flag;
 
     VUI vui;
     PTL ptl;
@@ -196,6 +284,17 @@ typedef struct HEVCSPS {
     int max_transform_hierarchy_depth_inter;
     int max_transform_hierarchy_depth_intra;
 
+    int sps_range_extension_flag;
+    int transform_skip_rotation_enabled_flag;
+    int transform_skip_context_enabled_flag;
+    int implicit_rdpcm_enabled_flag;
+    int explicit_rdpcm_enabled_flag;
+    int extended_precision_processing_flag;
+    int intra_smoothing_disabled_flag;
+    int high_precision_offsets_enabled_flag;
+    int persistent_rice_adaptation_enabled_flag;
+    int cabac_bypass_alignment_enabled_flag;
+
     ///< coded frame dimension in various units
     int width;
     int height;
@@ -208,11 +307,15 @@ typedef struct HEVCSPS {
     int min_tb_height;
     int min_pu_width;
     int min_pu_height;
+    int tb_mask;
 
     int hshift[3];
     int vshift[3];
 
     int qp_bd_offset;
+
+    uint8_t data[4096];
+    int data_size;
 } HEVCSPS;
 
 typedef struct HEVCPPS {
@@ -264,6 +367,16 @@ typedef struct HEVCPPS {
     int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
     int num_extra_slice_header_bits;
     uint8_t slice_header_extension_present_flag;
+    uint8_t log2_max_transform_skip_block_size;
+    uint8_t pps_range_extensions_flag;
+    uint8_t cross_component_prediction_enabled_flag;
+    uint8_t chroma_qp_offset_list_enabled_flag;
+    uint8_t diff_cu_chroma_qp_offset_depth;
+    uint8_t chroma_qp_offset_list_len_minus1;
+    int8_t  cb_qp_offset_list[6];
+    int8_t  cr_qp_offset_list[6];
+    uint8_t log2_sao_offset_scale_luma;
+    uint8_t log2_sao_offset_scale_chroma;
 
     // Inferred parameters
     unsigned int *column_width;  ///< ColumnWidth
@@ -277,6 +390,10 @@ typedef struct HEVCPPS {
     int *tile_id;           ///< TileId
     int *tile_pos_rs;       ///< TilePosRS
     int *min_tb_addr_zs;    ///< MinTbAddrZS
+    int *min_tb_addr_zs_tab;///< MinTbAddrZS
+
+    uint8_t data[4096];
+    int data_size;
 } HEVCPPS;
 
 typedef struct HEVCParamSets {
@@ -309,10 +426,17 @@ int ff_hevc_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
 int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
                            HEVCParamSets *ps);
 
+void ff_hevc_ps_uninit(HEVCParamSets *ps);
+
 int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
                                   ShortTermRPS *rps, const HEVCSPS *sps, int is_slice_header);
 
 int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id,
                            uint8_t *buf, int buf_size);
 
+/**
+ * Compute POC of the current frame and return it.
+ */
+int ff_hevc_compute_poc(const HEVCSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
+
 #endif /* AVCODEC_HEVC_PS_H */
diff --git a/libavcodec/hevc_ps_enc.c b/libavcodec/hevc_ps_enc.c
index 1fb93b3..4c71cf4 100644
--- a/libavcodec/hevc_ps_enc.c
+++ b/libavcodec/hevc_ps_enc.c
@@ -1,24 +1,24 @@
 /*
  * HEVC Parameter Set encoding
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "hevc_ps.h"
 #include "put_bits.h"
 
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index 0c19187..7cf3a55 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,8 +58,7 @@ RefPicList *ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *ref, int x0, int y0)
 {
     int x_cb         = x0 >> s->ps.sps->log2_ctb_size;
     int y_cb         = y0 >> s->ps.sps->log2_ctb_size;
-    int pic_width_cb = (s->ps.sps->width + (1 << s->ps.sps->log2_ctb_size) - 1) >>
-        s->ps.sps->log2_ctb_size;
+    int pic_width_cb = s->ps.sps->ctb_width;
     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb];
     return (RefPicList *)ref->rpl_tab[ctb_addr_ts];
 }
@@ -110,6 +109,9 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
         for (j = 0; j < frame->ctb_count; j++)
             frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
 
+        frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+        frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+
         if (s->avctx->hwaccel) {
             const AVHWAccel *hwaccel = s->avctx->hwaccel;
             av_assert0(!frame->hwaccel_picture_private);
@@ -122,7 +124,6 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
         }
 
         return frame;
-
 fail:
         ff_hevc_unref_frame(s, frame, ~0);
         return NULL;
@@ -177,12 +178,22 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
         int min_poc   = INT_MAX;
         int i, min_idx, ret;
 
+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
+            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+                HEVCFrame *frame = &s->DPB[i];
+                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
+                        frame->sequence == s->seq_output) {
+                    ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+                }
+            }
+        }
+
         for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
             HEVCFrame *frame = &s->DPB[i];
             if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
                 frame->sequence == s->seq_output) {
                 nb_output++;
-                if (frame->poc < min_poc) {
+                if (frame->poc < min_poc || nb_output == 1) {
                     min_poc = frame->poc;
                     min_idx = i;
                 }
@@ -198,7 +209,10 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
             HEVCFrame *frame = &s->DPB[min_idx];
 
             ret = av_frame_ref(out, frame->frame);
-            ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
+                ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
+            else
+                ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
             if (ret < 0)
                 return ret;
 
@@ -216,6 +230,46 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
     return 0;
 }
 
+void ff_hevc_bump_frame(HEVCContext *s)
+{
+    int dpb = 0;
+    int min_poc = INT_MAX;
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+        HEVCFrame *frame = &s->DPB[i];
+        if ((frame->flags) &&
+            frame->sequence == s->seq_output &&
+            frame->poc != s->poc) {
+            dpb++;
+        }
+    }
+
+    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+            HEVCFrame *frame = &s->DPB[i];
+            if ((frame->flags) &&
+                frame->sequence == s->seq_output &&
+                frame->poc != s->poc) {
+                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
+                    min_poc = frame->poc;
+                }
+            }
+        }
+
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+            HEVCFrame *frame = &s->DPB[i];
+            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
+                frame->sequence == s->seq_output &&
+                frame->poc <= min_poc) {
+                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
+            }
+        }
+
+        dpb--;
+    }
+}
+
 static int init_slice_rpl(HEVCContext *s)
 {
     HEVCFrame *frame = s->ref;
@@ -325,8 +379,9 @@ static HEVCFrame *find_ref_idx(HEVCContext *s, int poc)
         }
     }
 
-    av_log(s->avctx, AV_LOG_ERROR,
-           "Could not find ref with POC %d\n", poc);
+    if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Could not find ref with POC %d\n", poc);
     return NULL;
 }
 
@@ -364,7 +419,8 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
     frame->sequence = s->seq_decode;
     frame->flags    = 0;
 
-    ff_thread_report_progress(&frame->tf, INT_MAX, 0);
+    if (s->threads_type == FF_THREAD_FRAME)
+        ff_thread_report_progress(&frame->tf, INT_MAX, 0);
 
     return frame;
 }
@@ -375,7 +431,7 @@ static int add_candidate_ref(HEVCContext *s, RefPicList *list,
 {
     HEVCFrame *ref = find_ref_idx(s, poc);
 
-    if (ref == s->ref)
+    if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
         return AVERROR_INVALIDDATA;
 
     if (!ref) {
@@ -452,35 +508,12 @@ fail:
     return ret;
 }
 
-int ff_hevc_compute_poc(HEVCContext *s, int poc_lsb)
-{
-    int max_poc_lsb  = 1 << s->ps.sps->log2_max_poc_lsb;
-    int prev_poc_lsb = s->pocTid0 % max_poc_lsb;
-    int prev_poc_msb = s->pocTid0 - prev_poc_lsb;
-    int poc_msb;
-
-    if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
-        poc_msb = prev_poc_msb + max_poc_lsb;
-    else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
-        poc_msb = prev_poc_msb - max_poc_lsb;
-    else
-        poc_msb = prev_poc_msb;
-
-    // For BLA picture types, POCmsb is set to 0.
-    if (s->nal_unit_type == HEVC_NAL_BLA_W_LP   ||
-        s->nal_unit_type == HEVC_NAL_BLA_W_RADL ||
-        s->nal_unit_type == HEVC_NAL_BLA_N_LP)
-        poc_msb = 0;
-
-    return poc_msb + poc_lsb;
-}
-
-int ff_hevc_frame_nb_refs(HEVCContext *s)
+int ff_hevc_frame_nb_refs(const HEVCContext *s)
 {
     int ret = 0;
     int i;
     const ShortTermRPS *rps = s->sh.short_term_rps;
-    LongTermRPS *long_rps   = &s->sh.long_term_rps;
+    const LongTermRPS *long_rps = &s->sh.long_term_rps;
 
     if (rps) {
         for (i = 0; i < rps->num_negative_pics; i++)
diff --git a/libavcodec/hevc_sei.c b/libavcodec/hevc_sei.c
index 2bf1706..c59bd43 100644
--- a/libavcodec/hevc_sei.c
+++ b/libavcodec/hevc_sei.c
@@ -5,33 +5,36 @@
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2013 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "golomb_legacy.h"
-#include "hevc.h"
+#include "golomb.h"
+#include "hevc_ps.h"
 #include "hevc_sei.h"
 
 static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
 {
     int cIdx, i;
-    uint8_t hash_type = get_bits(gb, 8);
+    uint8_t hash_type;
+    //uint16_t picture_crc;
+    //uint32_t picture_checksum;
+    hash_type = get_bits(gb, 8);
 
-    for (cIdx = 0; cIdx < 3; cIdx++) {
+    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
         if (hash_type == 0) {
             s->is_md5 = 1;
             for (i = 0; i < 16; i++)
@@ -40,16 +43,51 @@ static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitCont
             // picture_crc = get_bits(gb, 16);
             skip_bits(gb, 16);
         } else if (hash_type == 2) {
-            // picture_checksum = get_bits(gb, 32);
+            // picture_checksum = get_bits_long(gb, 32);
             skip_bits(gb, 32);
         }
     }
     return 0;
 }
 
+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
+{
+    int i;
+    // Mastering primaries
+    for (i = 0; i < 3; i++) {
+        s->display_primaries[i][0] = get_bits(gb, 16);
+        s->display_primaries[i][1] = get_bits(gb, 16);
+    }
+    // White point (x, y)
+    s->white_point[0] = get_bits(gb, 16);
+    s->white_point[1] = get_bits(gb, 16);
+
+    // Max and min luminance of mastering display
+    s->max_luminance = get_bits_long(gb, 32);
+    s->min_luminance = get_bits_long(gb, 32);
+
+    // As this SEI message comes before the first frame that references it,
+    // initialize the flag to 2 and decrement on IRAP access unit so it
+    // persists for the coded video sequence (e.g., between two IRAPs)
+    s->present = 2;
+    return 0;
+}
+
+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
+{
+    // Max and average light levels
+    s->max_content_light_level     = get_bits_long(gb, 16);
+    s->max_pic_average_light_level = get_bits_long(gb, 16);
+    // As this SEI message comes before the first frame that references it,
+    // initialize the flag to 2 and decrement on IRAP access unit so it
+    // persists for the coded video sequence (e.g., between two IRAPs)
+    s->present = 2;
+    return  0;
+}
+
 static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
 {
-    get_ue_golomb(gb);                  // frame_packing_arrangement_id
+    get_ue_golomb_long(gb);             // frame_packing_arrangement_id
     s->present = !get_bits1(gb);
 
     if (s->present) {
@@ -87,6 +125,147 @@ static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetB
     return 0;
 }
 
+static int decode_nal_sei_pic_timing(HEVCSEI *s, GetBitContext *gb, const HEVCParamSets *ps,
+                                     void *logctx, int size)
+{
+    HEVCSEIPictureTiming *h = &s->picture_timing;
+    HEVCSPS *sps;
+
+    if (!ps->sps_list[s->active_seq_parameter_set_id])
+        return(AVERROR(ENOMEM));
+    sps = (HEVCSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
+
+    if (sps->vui.frame_field_info_present_flag) {
+        int pic_struct = get_bits(gb, 4);
+        h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
+        if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
+            av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
+            h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
+        } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
+            av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
+            h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
+        }
+        get_bits(gb, 2);                   // source_scan_type
+        get_bits(gb, 1);                   // duplicate_flag
+        skip_bits1(gb);
+        size--;
+    }
+    skip_bits_long(gb, 8 * size);
+
+    return 0;
+}
+
+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
+                                                      int size)
+{
+    int flag;
+    int user_data_type_code;
+    int cc_count;
+
+    if (size < 3)
+       return AVERROR(EINVAL);
+
+    user_data_type_code = get_bits(gb, 8);
+    if (user_data_type_code == 0x3) {
+        skip_bits(gb, 1); // reserved
+
+        flag = get_bits(gb, 1); // process_cc_data_flag
+        if (flag) {
+            skip_bits(gb, 1);
+            cc_count = get_bits(gb, 5);
+            skip_bits(gb, 8); // reserved
+            size -= 2;
+
+            if (cc_count && size >= cc_count * 3) {
+                const uint64_t new_size = (s->a53_caption_size + cc_count
+                                           * UINT64_C(3));
+                int i, ret;
+
+                if (new_size > INT_MAX)
+                    return AVERROR(EINVAL);
+
+                /* Allow merging of the cc data from two fields. */
+                ret = av_reallocp(&s->a53_caption, new_size);
+                if (ret < 0)
+                    return ret;
+
+                for (i = 0; i < cc_count; i++) {
+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+                }
+                skip_bits(gb, 8); // marker_bits
+            }
+        }
+    } else {
+        int i;
+        for (i = 0; i < size - 1; i++)
+            skip_bits(gb, 8);
+    }
+
+    return 0;
+}
+
+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEI *s, GetBitContext *gb,
+                                                         int size)
+{
+    uint32_t country_code;
+    uint32_t user_identifier;
+
+    if (size < 7)
+        return AVERROR(EINVAL);
+    size -= 7;
+
+    country_code = get_bits(gb, 8);
+    if (country_code == 0xFF) {
+        skip_bits(gb, 8);
+        size--;
+    }
+
+    skip_bits(gb, 8);
+    skip_bits(gb, 8);
+
+    user_identifier = get_bits_long(gb, 32);
+
+    switch (user_identifier) {
+        case MKBETAG('G', 'A', '9', '4'):
+            return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
+        default:
+            skip_bits_long(gb, size * 8);
+            break;
+    }
+    return 0;
+}
+
+static int decode_nal_sei_active_parameter_sets(HEVCSEI *s, GetBitContext *gb, void *logctx)
+{
+    int num_sps_ids_minus1;
+    int i;
+    unsigned active_seq_parameter_set_id;
+
+    get_bits(gb, 4); // active_video_parameter_set_id
+    get_bits(gb, 1); // self_contained_cvs_flag
+    get_bits(gb, 1); // num_sps_ids_minus1
+    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
+
+    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
+        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
+        return AVERROR_INVALIDDATA;
+    }
+
+    active_seq_parameter_set_id = get_ue_golomb_long(gb);
+    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
+        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
+        return AVERROR_INVALIDDATA;
+    }
+    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
+
+    for (i = 1; i <= num_sps_ids_minus1; i++)
+        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
+
+    return 0;
+}
+
 static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
 {
     s->present = 1;
@@ -95,7 +274,7 @@ static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, Ge
 }
 
 static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEI *s,
-                                 int type, int size)
+                                 const HEVCParamSets *ps, int type, int size)
 {
     switch (type) {
     case 256:  // Mismatched value from HM 8.1
@@ -104,6 +283,16 @@ static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEI *s,
         return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
     case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
         return decode_nal_sei_display_orientation(&s->display_orientation, gb);
+    case HEVC_SEI_TYPE_PICTURE_TIMING:
+        return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
+    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
+        return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
+    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
+        return decode_nal_sei_content_light_info(&s->content_light, gb);
+    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
+        return decode_nal_sei_active_parameter_sets(s, gb, logctx);
+    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
+        return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
     case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
         return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
     default:
@@ -127,7 +316,7 @@ static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEI *s,
 }
 
 static int decode_nal_sei_message(GetBitContext *gb, void *logctx, HEVCSEI *s,
-                                  int type)
+                                  const HEVCParamSets *ps, int nal_unit_type)
 {
     int payload_type = 0;
     int payload_size = 0;
@@ -135,16 +324,20 @@ static int decode_nal_sei_message(GetBitContext *gb, void *logctx, HEVCSEI *s,
     av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
 
     while (byte == 0xFF) {
+        if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
+            return AVERROR_INVALIDDATA;
         byte          = get_bits(gb, 8);
         payload_type += byte;
     }
     byte = 0xFF;
     while (byte == 0xFF) {
+        if (get_bits_left(gb) < 8 + 8LL*payload_size)
+            return AVERROR_INVALIDDATA;
         byte          = get_bits(gb, 8);
         payload_size += byte;
     }
-    if (type == HEVC_NAL_SEI_PREFIX) {
-        return decode_nal_sei_prefix(gb, logctx, s, payload_type, payload_size);
+    if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
+        return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
     } else { /* nal_unit_type == NAL_SEI_SUFFIX */
         return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
     }
@@ -156,10 +349,20 @@ static int more_rbsp_data(GetBitContext *gb)
 }
 
 int ff_hevc_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEI *s,
-                           int type)
+                           const HEVCParamSets *ps, int type)
 {
+    int ret;
+
     do {
-        decode_nal_sei_message(gb, logctx, s, type);
+        ret = decode_nal_sei_message(gb, logctx, s, ps, type);
+        if (ret < 0)
+            return ret;
     } while (more_rbsp_data(gb));
-    return 0;
+    return 1;
+}
+
+void ff_hevc_reset_sei(HEVCSEI *s)
+{
+    s->a53_caption.a53_caption_size = 0;
+    av_freep(&s->a53_caption.a53_caption);
 }
diff --git a/libavcodec/hevc_sei.h b/libavcodec/hevc_sei.h
index 8d4f5df..2fec00a 100644
--- a/libavcodec/hevc_sei.h
+++ b/libavcodec/hevc_sei.h
@@ -1,20 +1,20 @@
 /*
  * HEVC Supplementary Enhancement Information messages
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,7 @@ typedef enum {
     HEVC_SEI_TYPE_DECODED_PICTURE_HASH                 = 132,
     HEVC_SEI_TYPE_SCALABLE_NESTING                     = 133,
     HEVC_SEI_TYPE_REGION_REFRESH_INFO                  = 134,
+    HEVC_SEI_TYPE_TIME_CODE                            = 136,
     HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO               = 137,
     HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO             = 144,
     HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
@@ -76,6 +77,29 @@ typedef struct HEVCSEIDisplayOrientation {
     int hflip, vflip;
 } HEVCSEIDisplayOrientation;
 
+typedef struct HEVCSEIPictureTiming {
+    int picture_struct;
+} HEVCSEIPictureTiming;
+
+typedef struct HEVCSEIA53Caption {
+    int a53_caption_size;
+    uint8_t *a53_caption;
+} HEVCSEIA53Caption;
+
+typedef struct HEVCSEIMasteringDisplay {
+    int present;
+    uint16_t display_primaries[3][2];
+    uint16_t white_point[2];
+    uint32_t max_luminance;
+    uint32_t min_luminance;
+} HEVCSEIMasteringDisplay;
+
+typedef struct HEVCSEIContentLight {
+    int present;
+    uint16_t max_content_light_level;
+    uint16_t max_pic_average_light_level;
+} HEVCSEIContentLight;
+
 typedef struct HEVCSEIAlternativeTransfer {
     int present;
     int preferred_transfer_characteristics;
@@ -85,10 +109,26 @@ typedef struct HEVCSEI {
     HEVCSEIPictureHash picture_hash;
     HEVCSEIFramePacking frame_packing;
     HEVCSEIDisplayOrientation display_orientation;
+    HEVCSEIPictureTiming picture_timing;
+    HEVCSEIA53Caption a53_caption;
+    HEVCSEIMasteringDisplay mastering_display;
+    HEVCSEIContentLight content_light;
+    int active_seq_parameter_set_id;
     HEVCSEIAlternativeTransfer alternative_transfer;
 } HEVCSEI;
 
+struct HEVCParamSets;
+
 int ff_hevc_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEI *s,
-                           int type);
+                           const struct HEVCParamSets *ps, int type);
+
+/**
+ * Reset SEI values that are stored on the Context.
+ * e.g. Caption data that was extracted during NAL
+ * parsing.
+ *
+ * @param s HEVCContext.
+ */
+void ff_hevc_reset_sei(HEVCSEI *s);
 
 #endif /* AVCODEC_HEVC_SEI_H */
diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
index 130b99f..967f8f1 100644
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@@ -1,25 +1,25 @@
 /*
- * HEVC video decoder
+ * HEVC video Decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Mickael Raulet
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2012 - 2013 Wassim Hamidouche
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "libavutil/common.h"
 #include "libavutil/display.h"
 #include "libavutil/internal.h"
+#include "libavutil/mastering_display_metadata.h"
 #include "libavutil/md5.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
@@ -35,74 +36,15 @@
 #include "bswapdsp.h"
 #include "bytestream.h"
 #include "cabac_functions.h"
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "hevc.h"
 #include "hevc_data.h"
+#include "hevc_parse.h"
 #include "hevcdec.h"
 #include "hwaccel.h"
 #include "profiles.h"
 
-const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 3 };
-const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 4, 4, 4 };
-const uint8_t ff_hevc_qpel_extra[4]        = { 0, 7, 7, 7 };
-
-static const uint8_t scan_1x1[1] = { 0 };
-
-static const uint8_t horiz_scan2x2_x[4] = { 0, 1, 0, 1 };
-
-static const uint8_t horiz_scan2x2_y[4] = { 0, 0, 1, 1 };
-
-static const uint8_t horiz_scan4x4_x[16] = {
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-};
-
-static const uint8_t horiz_scan4x4_y[16] = {
-    0, 0, 0, 0,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    3, 3, 3, 3,
-};
-
-static const uint8_t horiz_scan8x8_inv[8][8] = {
-    {  0,  1,  2,  3, 16, 17, 18, 19, },
-    {  4,  5,  6,  7, 20, 21, 22, 23, },
-    {  8,  9, 10, 11, 24, 25, 26, 27, },
-    { 12, 13, 14, 15, 28, 29, 30, 31, },
-    { 32, 33, 34, 35, 48, 49, 50, 51, },
-    { 36, 37, 38, 39, 52, 53, 54, 55, },
-    { 40, 41, 42, 43, 56, 57, 58, 59, },
-    { 44, 45, 46, 47, 60, 61, 62, 63, },
-};
-
-static const uint8_t diag_scan2x2_x[4] = { 0, 0, 1, 1 };
-
-static const uint8_t diag_scan2x2_y[4] = { 0, 1, 0, 1 };
-
-static const uint8_t diag_scan2x2_inv[2][2] = {
-    { 0, 2, },
-    { 1, 3, },
-};
-
-static const uint8_t diag_scan4x4_inv[4][4] = {
-    { 0,  2,  5,  9, },
-    { 1,  4,  8, 12, },
-    { 3,  7, 11, 14, },
-    { 6, 10, 13, 15, },
-};
-
-static const uint8_t diag_scan8x8_inv[8][8] = {
-    {  0,  2,  5,  9, 14, 20, 27, 35, },
-    {  1,  4,  8, 13, 19, 26, 34, 42, },
-    {  3,  7, 12, 18, 25, 33, 41, 48, },
-    {  6, 11, 17, 24, 32, 40, 47, 53, },
-    { 10, 16, 23, 31, 39, 46, 52, 57, },
-    { 15, 22, 30, 38, 45, 51, 56, 60, },
-    { 21, 29, 37, 44, 50, 55, 59, 62, },
-    { 28, 36, 43, 49, 54, 58, 61, 63, },
-};
+const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
 
 /**
  * NOTE: Each function hls_foo correspond to the function foo in the
@@ -133,6 +75,10 @@ static void pic_arrays_free(HEVCContext *s)
     av_freep(&s->horizontal_bs);
     av_freep(&s->vertical_bs);
 
+    av_freep(&s->sh.entry_point_offset);
+    av_freep(&s->sh.size);
+    av_freep(&s->sh.offset);
+
     av_buffer_pool_uninit(&s->tab_mvf_pool);
     av_buffer_pool_uninit(&s->rpl_tab_pool);
 }
@@ -148,40 +94,40 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
     int ctb_count        = sps->ctb_width * sps->ctb_height;
     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
 
-    s->bs_width  = width  >> 3;
-    s->bs_height = height >> 3;
+    s->bs_width  = (width  >> 2) + 1;
+    s->bs_height = (height >> 2) + 1;
 
     s->sao           = av_mallocz_array(ctb_count, sizeof(*s->sao));
     s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
     if (!s->sao || !s->deblock)
         goto fail;
 
-    s->skip_flag    = av_malloc(pic_size_in_ctb);
-    s->tab_ct_depth = av_malloc(sps->min_cb_height * sps->min_cb_width);
+    s->skip_flag    = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
+    s->tab_ct_depth = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
     if (!s->skip_flag || !s->tab_ct_depth)
         goto fail;
 
-    s->cbf_luma = av_malloc(sps->min_tb_width * sps->min_tb_height);
+    s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
     s->tab_ipm  = av_mallocz(min_pu_size);
-    s->is_pcm   = av_malloc(min_pu_size);
+    s->is_pcm   = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
     if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
         goto fail;
 
-    s->filter_slice_edges = av_malloc(ctb_count);
-    s->tab_slice_address  = av_malloc(pic_size_in_ctb *
+    s->filter_slice_edges = av_mallocz(ctb_count);
+    s->tab_slice_address  = av_malloc_array(pic_size_in_ctb,
                                       sizeof(*s->tab_slice_address));
-    s->qp_y_tab           = av_malloc(pic_size_in_ctb *
+    s->qp_y_tab           = av_malloc_array(pic_size_in_ctb,
                                       sizeof(*s->qp_y_tab));
     if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
         goto fail;
 
-    s->horizontal_bs = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
-    s->vertical_bs   = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
+    s->horizontal_bs = av_mallocz_array(s->bs_width, s->bs_height);
+    s->vertical_bs   = av_mallocz_array(s->bs_width, s->bs_height);
     if (!s->horizontal_bs || !s->vertical_bs)
         goto fail;
 
     s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField),
-                                          av_buffer_alloc);
+                                          av_buffer_allocz);
     s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
                                           av_buffer_allocz);
     if (!s->tab_mvf_pool || !s->rpl_tab_pool)
@@ -194,7 +140,7 @@ fail:
     return AVERROR(ENOMEM);
 }
 
-static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
+static int pred_weight_table(HEVCContext *s, GetBitContext *gb)
 {
     int i = 0;
     int j = 0;
@@ -202,11 +148,21 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
     uint8_t chroma_weight_l0_flag[16];
     uint8_t luma_weight_l1_flag[16];
     uint8_t chroma_weight_l1_flag[16];
+    int luma_log2_weight_denom;
 
-    s->sh.luma_log2_weight_denom = av_clip(get_ue_golomb_long(gb), 0, 7);
+    luma_log2_weight_denom = get_ue_golomb_long(gb);
+    if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7) {
+        av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
+        return AVERROR_INVALIDDATA;
+    }
+    s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
     if (s->ps.sps->chroma_format_idc != 0) {
-        int delta = get_se_golomb(gb);
-        s->sh.chroma_log2_weight_denom = av_clip(s->sh.luma_log2_weight_denom + delta, 0, 7);
+        int64_t chroma_log2_weight_denom = luma_log2_weight_denom + (int64_t)get_se_golomb(gb);
+        if (chroma_log2_weight_denom < 0 || chroma_log2_weight_denom > 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %"PRId64" is invalid\n", chroma_log2_weight_denom);
+            return AVERROR_INVALIDDATA;
+        }
+        s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
     }
 
     for (i = 0; i < s->sh.nb_refs[L0]; i++) {
@@ -216,7 +172,7 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
             s->sh.luma_offset_l0[i] = 0;
         }
     }
-    if (s->ps.sps->chroma_format_idc != 0) { // FIXME: invert "if" and "for"
+    if (s->ps.sps->chroma_format_idc != 0) {
         for (i = 0; i < s->sh.nb_refs[L0]; i++)
             chroma_weight_l0_flag[i] = get_bits1(gb);
     } else {
@@ -233,6 +189,12 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
             for (j = 0; j < 2; j++) {
                 int delta_chroma_weight_l0 = get_se_golomb(gb);
                 int delta_chroma_offset_l0 = get_se_golomb(gb);
+
+                if (   (int8_t)delta_chroma_weight_l0 != delta_chroma_weight_l0
+                    || delta_chroma_offset_l0 < -(1<<17) || delta_chroma_offset_l0 > (1<<17)) {
+                    return AVERROR_INVALIDDATA;
+                }
+
                 s->sh.chroma_weight_l0[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l0;
                 s->sh.chroma_offset_l0[i][j] = av_clip((delta_chroma_offset_l0 - ((128 * s->sh.chroma_weight_l0[i][j])
                                                                                     >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
@@ -269,6 +231,12 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
                 for (j = 0; j < 2; j++) {
                     int delta_chroma_weight_l1 = get_se_golomb(gb);
                     int delta_chroma_offset_l1 = get_se_golomb(gb);
+
+                    if (   (int8_t)delta_chroma_weight_l1 != delta_chroma_weight_l1
+                        || delta_chroma_offset_l1 < -(1<<17) || delta_chroma_offset_l1 > (1<<17)) {
+                        return AVERROR_INVALIDDATA;
+                    }
+
                     s->sh.chroma_weight_l1[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l1;
                     s->sh.chroma_offset_l1[i][j] = av_clip((delta_chroma_offset_l1 - ((128 * s->sh.chroma_weight_l1[i][j])
                                                                                         >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
@@ -281,6 +249,7 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
             }
         }
     }
+    return 0;
 }
 
 static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
@@ -299,7 +268,9 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
         nb_sps = get_ue_golomb_long(gb);
     nb_sh = get_ue_golomb_long(gb);
 
-    if (nb_sh + nb_sps > FF_ARRAY_ELEMS(rps->poc))
+    if (nb_sps > sps->num_long_term_ref_pics_sps)
+        return AVERROR_INVALIDDATA;
+    if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
         return AVERROR_INVALIDDATA;
 
     rps->nb_refs = nb_sh + nb_sps;
@@ -322,12 +293,16 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
 
         delta_poc_msb_present = get_bits1(gb);
         if (delta_poc_msb_present) {
-            int delta = get_ue_golomb_long(gb);
+            int64_t delta = get_ue_golomb_long(gb);
+            int64_t poc;
 
             if (i && i != nb_sps)
                 delta += prev_delta_msb;
 
-            rps->poc[i] += s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
+            poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
+            if (poc != (int32_t)poc)
+                return AVERROR_INVALIDDATA;
+            rps->poc[i] = poc;
             prev_delta_msb = delta;
         }
     }
@@ -384,46 +359,75 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
 
 static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
 {
-    #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
-                         CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL + \
-                         CONFIG_HEVC_CUVID_HWACCEL)
+#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
+                     CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
+                     CONFIG_HEVC_NVDEC_HWACCEL + \
+                     CONFIG_HEVC_VAAPI_HWACCEL + \
+                     CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
+                     CONFIG_HEVC_VDPAU_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
 
-    if (sps->pix_fmt == AV_PIX_FMT_YUV420P || sps->pix_fmt == AV_PIX_FMT_YUVJ420P ||
-        sps->pix_fmt == AV_PIX_FMT_YUV420P10) {
+    switch (sps->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUVJ420P:
+#if CONFIG_HEVC_DXVA2_HWACCEL
+        *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+#endif
 #if CONFIG_HEVC_D3D11VA_HWACCEL
         *fmt++ = AV_PIX_FMT_D3D11VA_VLD;
         *fmt++ = AV_PIX_FMT_D3D11;
 #endif
+#if CONFIG_HEVC_VAAPI_HWACCEL
+        *fmt++ = AV_PIX_FMT_VAAPI;
+#endif
+#if CONFIG_HEVC_VDPAU_HWACCEL
+        *fmt++ = AV_PIX_FMT_VDPAU;
+#endif
+#if CONFIG_HEVC_NVDEC_HWACCEL
+        *fmt++ = AV_PIX_FMT_CUDA;
+#endif
+#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+        *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+#endif
+        break;
+    case AV_PIX_FMT_YUV420P10:
 #if CONFIG_HEVC_DXVA2_HWACCEL
         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
 #endif
+#if CONFIG_HEVC_D3D11VA_HWACCEL
+        *fmt++ = AV_PIX_FMT_D3D11VA_VLD;
+        *fmt++ = AV_PIX_FMT_D3D11;
+#endif
 #if CONFIG_HEVC_VAAPI_HWACCEL
         *fmt++ = AV_PIX_FMT_VAAPI;
 #endif
-#if CONFIG_HEVC_CUVID_HWACCEL && HAVE_CUVIDDECODECREATEINFO_BITDEPTHMINUS8
-        *fmt++ = AV_PIX_FMT_CUDA;
+#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+        *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
 #endif
-    }
-    if (sps->pix_fmt == AV_PIX_FMT_YUV420P || sps->pix_fmt == AV_PIX_FMT_YUVJ420P) {
-#if CONFIG_HEVC_CUVID_HWACCEL && !HAVE_CUVIDDECODECREATEINFO_BITDEPTHMINUS8
+#if CONFIG_HEVC_NVDEC_HWACCEL
         *fmt++ = AV_PIX_FMT_CUDA;
 #endif
-#if CONFIG_HEVC_VDPAU_HWACCEL
-        *fmt++ = AV_PIX_FMT_VDPAU;
+        break;
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV444P:
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+#if CONFIG_HEVC_NVDEC_HWACCEL
+        *fmt++ = AV_PIX_FMT_CUDA;
 #endif
+        break;
     }
 
     *fmt++ = sps->pix_fmt;
     *fmt = AV_PIX_FMT_NONE;
 
-    return ff_get_format(s->avctx, pix_fmts);
+    return ff_thread_get_format(s->avctx, pix_fmts);
 }
 
 static int set_sps(HEVCContext *s, const HEVCSPS *sps,
                    enum AVPixelFormat pix_fmt)
 {
-    int ret;
+    int ret, i;
 
     pic_arrays_free(s);
     s->ps.sps = NULL;
@@ -444,12 +448,25 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps,
     ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth);
     ff_videodsp_init (&s->vdsp,    sps->bit_depth);
 
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
+
     if (sps->sao_enabled && !s->avctx->hwaccel) {
-        av_frame_unref(s->tmp_frame);
-        ret = ff_get_buffer(s->avctx, s->tmp_frame, AV_GET_BUFFER_FLAG_REF);
-        if (ret < 0)
-            goto fail;
-        s->frame = s->tmp_frame;
+        int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+        int c_idx;
+
+        for(c_idx = 0; c_idx < c_count; c_idx++) {
+            int w = sps->width >> sps->hshift[c_idx];
+            int h = sps->height >> sps->vshift[c_idx];
+            s->sao_pixel_buffer_h[c_idx] =
+                av_malloc((w * 2 * sps->ctb_height) <<
+                          sps->pixel_shift);
+            s->sao_pixel_buffer_v[c_idx] =
+                av_malloc((h * 2 * sps->ctb_width) <<
+                          sps->pixel_shift);
+        }
     }
 
     s->ps.sps = sps;
@@ -465,7 +482,7 @@ fail:
 
 static int hls_slice_header(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
     SliceHeader *sh   = &s->sh;
     int i, ret;
 
@@ -477,6 +494,7 @@ static int hls_slice_header(HEVCContext *s)
         if (IS_IDR(s))
             ff_hevc_clear_refs(s);
     }
+    sh->no_output_of_prior_pics_flag = 0;
     if (IS_IRAP(s))
         sh->no_output_of_prior_pics_flag = get_bits1(gb);
 
@@ -491,11 +509,20 @@ static int hls_slice_header(HEVCContext *s)
         return AVERROR_INVALIDDATA;
     }
     s->ps.pps = (HEVCPPS*)s->ps.pps_list[sh->pps_id]->data;
+    if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
+        sh->no_output_of_prior_pics_flag = 1;
 
     if (s->ps.sps != (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
         const HEVCSPS *sps = (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
+        const HEVCSPS *last_sps = s->ps.sps;
         enum AVPixelFormat pix_fmt;
 
+        if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
+            if (sps->width != last_sps->width || sps->height != last_sps->height ||
+                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
+                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
+                sh->no_output_of_prior_pics_flag = 0;
+        }
         ff_hevc_clear_refs(s);
 
         ret = set_sps(s, sps, sps->pix_fmt);
@@ -520,7 +547,7 @@ static int hls_slice_header(HEVCContext *s)
 
         slice_address_length = av_ceil_log2(s->ps.sps->ctb_width *
                                             s->ps.sps->ctb_height);
-        sh->slice_segment_addr = slice_address_length ? get_bits(gb, slice_address_length) : 0;
+        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
         if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Invalid slice segment address: %u.\n",
@@ -569,7 +596,7 @@ static int hls_slice_header(HEVCContext *s)
             int poc, pos;
 
             sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
-            poc = ff_hevc_compute_poc(s, sh->pic_order_cnt_lsb);
+            poc = ff_hevc_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
             if (!sh->first_slice_in_pic_flag && poc != s->poc) {
                 av_log(s->avctx, AV_LOG_WARNING,
                        "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
@@ -620,7 +647,7 @@ static int hls_slice_header(HEVCContext *s)
         }
 
         /* 8.3.1 */
-        if (s->temporal_id == 0 &&
+        if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
             s->nal_unit_type != HEVC_NAL_TRAIL_N &&
             s->nal_unit_type != HEVC_NAL_TSA_N   &&
             s->nal_unit_type != HEVC_NAL_STSA_N  &&
@@ -632,8 +659,10 @@ static int hls_slice_header(HEVCContext *s)
 
         if (s->ps.sps->sao_enabled) {
             sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
-            sh->slice_sample_adaptive_offset_flag[1] =
-            sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+            if (s->ps.sps->chroma_format_idc) {
+                sh->slice_sample_adaptive_offset_flag[1] =
+                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+            }
         } else {
             sh->slice_sample_adaptive_offset_flag[0] = 0;
             sh->slice_sample_adaptive_offset_flag[1] = 0;
@@ -709,7 +738,9 @@ static int hls_slice_header(HEVCContext *s)
 
             if ((s->ps.pps->weighted_pred_flag   && sh->slice_type == HEVC_SLICE_P) ||
                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) {
-                pred_weight_table(s, gb);
+                int ret = pred_weight_table(s, gb);
+                if (ret < 0)
+                    return ret;
             }
 
             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
@@ -731,6 +762,11 @@ static int hls_slice_header(HEVCContext *s)
             sh->slice_cr_qp_offset = 0;
         }
 
+        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
+            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
+        else
+            sh->cu_chroma_qp_offset_enabled_flag = 0;
+
         if (s->ps.pps->deblocking_filter_control_present_flag) {
             int deblocking_filter_override_flag = 0;
 
@@ -740,8 +776,17 @@ static int hls_slice_header(HEVCContext *s)
             if (deblocking_filter_override_flag) {
                 sh->disable_deblocking_filter_flag = get_bits1(gb);
                 if (!sh->disable_deblocking_filter_flag) {
-                    sh->beta_offset = get_se_golomb(gb) * 2;
-                    sh->tc_offset   = get_se_golomb(gb) * 2;
+                    int beta_offset_div2 = get_se_golomb(gb);
+                    int tc_offset_div2   = get_se_golomb(gb) ;
+                    if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
+                        tc_offset_div2   < -6 || tc_offset_div2   > 6) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                            "Invalid deblock filter offsets: %d, %d\n",
+                            beta_offset_div2, tc_offset_div2);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    sh->beta_offset = beta_offset_div2 * 2;
+                    sh->tc_offset   =   tc_offset_div2 * 2;
                 }
             } else {
                 sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
@@ -769,23 +814,59 @@ static int hls_slice_header(HEVCContext *s)
 
     sh->num_entry_point_offsets = 0;
     if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
-        sh->num_entry_point_offsets = get_ue_golomb_long(gb);
+        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
+        // It would be possible to bound this tighter but this here is simpler
+        if (num_entry_point_offsets > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
+            return AVERROR_INVALIDDATA;
+        }
+
+        sh->num_entry_point_offsets = num_entry_point_offsets;
         if (sh->num_entry_point_offsets > 0) {
             int offset_len = get_ue_golomb_long(gb) + 1;
 
-            for (i = 0; i < sh->num_entry_point_offsets; i++)
-                skip_bits(gb, offset_len);
-        }
+            if (offset_len < 1 || offset_len > 32) {
+                sh->num_entry_point_offsets = 0;
+                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
+                return AVERROR_INVALIDDATA;
+            }
+
+            av_freep(&sh->entry_point_offset);
+            av_freep(&sh->offset);
+            av_freep(&sh->size);
+            sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(unsigned));
+            sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            if (!sh->entry_point_offset || !sh->offset || !sh->size) {
+                sh->num_entry_point_offsets = 0;
+                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
+                return AVERROR(ENOMEM);
+            }
+            for (i = 0; i < sh->num_entry_point_offsets; i++) {
+                unsigned val = get_bits_long(gb, offset_len);
+                sh->entry_point_offset[i] = val + 1; // +1; // +1 to get the size
+            }
+            if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) {
+                s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here
+                s->threads_number = 1;
+            } else
+                s->enable_parallel_tiles = 0;
+        } else
+            s->enable_parallel_tiles = 0;
     }
 
     if (s->ps.pps->slice_header_extension_present_flag) {
         unsigned int length = get_ue_golomb_long(gb);
+        if (length*8LL > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
+            return AVERROR_INVALIDDATA;
+        }
         for (i = 0; i < length; i++)
             skip_bits(gb, 8);  // slice_header_extension_data_byte
     }
 
     // Inferred parameters
-    sh->slice_qp = 26 + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
+    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
     if (sh->slice_qp > 51 ||
         sh->slice_qp < -s->ps.sps->qp_bd_offset) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -803,13 +884,20 @@ static int hls_slice_header(HEVCContext *s)
         return AVERROR_INVALIDDATA;
     }
 
-    s->HEVClc.first_qp_group = !s->sh.dependent_slice_segment_flag;
+    if (get_bits_left(gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Overread slice header by %d bits\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->HEVClc->first_qp_group = !s->sh.dependent_slice_segment_flag;
 
     if (!s->ps.pps->cu_qp_delta_enabled_flag)
-        s->HEVClc.qp_y = FFUMOD(s->sh.slice_qp + 52 + 2 * s->ps.sps->qp_bd_offset,
-                                52 + s->ps.sps->qp_bd_offset) - s->ps.sps->qp_bd_offset;
+        s->HEVClc->qp_y = s->sh.slice_qp;
 
     s->slice_initialized = 1;
+    s->HEVClc->tu.cu_qp_offset_cb = 0;
+    s->HEVClc->tu.cu_qp_offset_cr = 0;
 
     return 0;
 }
@@ -830,10 +918,9 @@ do {                                                    \
 
 static void hls_sao_param(HEVCContext *s, int rx, int ry)
 {
-    HEVCLocalContext *lc    = &s->HEVClc;
+    HEVCLocalContext *lc    = s->HEVClc;
     int sao_merge_left_flag = 0;
     int sao_merge_up_flag   = 0;
-    int shift               = s->ps.sps->bit_depth - FFMIN(s->ps.sps->bit_depth, 10);
     SAOParams *sao          = &CTB(s->sao, rx, ry);
     int c_idx, i;
 
@@ -849,7 +936,10 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
         }
     }
 
-    for (c_idx = 0; c_idx < 3; c_idx++) {
+    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+        int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
+                                                 s->ps.pps->log2_sao_offset_scale_chroma;
+
         if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
             sao->type_idx[c_idx] = SAO_NOT_APPLIED;
             continue;
@@ -885,13 +975,14 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
         // Inferred parameters
         sao->offset_val[c_idx][0] = 0;
         for (i = 0; i < 4; i++) {
-            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i] << shift;
+            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i];
             if (sao->type_idx[c_idx] == SAO_EDGE) {
                 if (i > 1)
                     sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
             } else if (sao->offset_sign[c_idx][i]) {
                 sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
             }
+            sao->offset_val[c_idx][i + 1] *= 1 << log2_sao_offset_scale;
         }
     }
 }
@@ -899,396 +990,45 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
 #undef SET_SAO
 #undef CTB
 
-static void hls_residual_coding(HEVCContext *s, int x0, int y0,
-                                int log2_trafo_size, enum ScanType scan_idx,
-                                int c_idx)
-{
-#define GET_COORD(offset, n)                                    \
-    do {                                                        \
-        x_c = (scan_x_cg[offset >> 4] << 2) + scan_x_off[n];    \
-        y_c = (scan_y_cg[offset >> 4] << 2) + scan_y_off[n];    \
-    } while (0)
-    HEVCLocalContext *lc    = &s->HEVClc;
-    int transform_skip_flag = 0;
-
-    int last_significant_coeff_x, last_significant_coeff_y;
-    int last_scan_pos;
-    int n_end;
-    int num_coeff    = 0;
-    int greater1_ctx = 1;
-
-    int num_last_subset;
-    int x_cg_last_sig, y_cg_last_sig;
-
-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
-
-    ptrdiff_t stride = s->frame->linesize[c_idx];
-    int hshift       = s->ps.sps->hshift[c_idx];
-    int vshift       = s->ps.sps->vshift[c_idx];
-    uint8_t *dst     = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-    LOCAL_ALIGNED_32(int16_t, coeffs, [MAX_TB_SIZE * MAX_TB_SIZE]);
-    LOCAL_ALIGNED_8(uint8_t, significant_coeff_group_flag, [8], [8]);
-
-    int trafo_size = 1 << log2_trafo_size;
-    int i, qp, shift, add, scale, scale_m;
-    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-    const uint8_t *scale_matrix;
-    uint8_t dc_scale;
-
-    memset(coeffs, 0, sizeof(int16_t) * MAX_TB_SIZE * MAX_TB_SIZE);
-    memset(significant_coeff_group_flag, 0, sizeof(uint8_t) * 8 * 8);
-    // Derive QP for dequant
-    if (!lc->cu.cu_transquant_bypass_flag) {
-        static const int qp_c[] = {
-            29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37
-        };
-
-        static const uint8_t rem6[51 + 2 * 6 + 1] = {
-            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
-            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-        };
-
-        static const uint8_t div6[51 + 2 * 6 + 1] = {
-            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,  3,  3,  3,
-            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6,  6,  6,  6,
-            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
-        };
-        int qp_y = lc->qp_y;
-
-        if (c_idx == 0) {
-            qp = qp_y + s->ps.sps->qp_bd_offset;
-        } else {
-            int qp_i, offset;
-
-            if (c_idx == 1)
-                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset;
-            else
-                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset;
-
-            qp_i = av_clip(qp_y + offset, -s->ps.sps->qp_bd_offset, 57);
-            if (qp_i < 30)
-                qp = qp_i;
-            else if (qp_i > 43)
-                qp = qp_i - 6;
-            else
-                qp = qp_c[qp_i - 30];
-
-            qp += s->ps.sps->qp_bd_offset;
-        }
-
-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-        add      = 1 << (shift - 1);
-        scale    = level_scale[rem6[qp]] << (div6[qp]);
-        scale_m  = 16; // default when no custom scaling lists.
-        dc_scale = 16;
-
-        if (s->ps.sps->scaling_list_enable_flag) {
-            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-                                    &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
-
-            if (log2_trafo_size != 5)
-                matrix_id = 3 * matrix_id + c_idx;
-
-            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-            if (log2_trafo_size >= 4)
-                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-        }
-    }
-
-    if (s->ps.pps->transform_skip_enabled_flag &&
-        !lc->cu.cu_transquant_bypass_flag   &&
-        log2_trafo_size == 2) {
-        transform_skip_flag = ff_hevc_transform_skip_flag_decode(s, c_idx);
-    }
-
-    last_significant_coeff_x =
-        ff_hevc_last_significant_coeff_x_prefix_decode(s, c_idx, log2_trafo_size);
-    last_significant_coeff_y =
-        ff_hevc_last_significant_coeff_y_prefix_decode(s, c_idx, log2_trafo_size);
-
-    if (last_significant_coeff_x > 3) {
-        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
-        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
-                                   (2 + (last_significant_coeff_x & 1)) +
-                                   suffix;
-    }
+static int hls_cross_component_pred(HEVCContext *s, int idx) {
+    HEVCLocalContext *lc    = s->HEVClc;
+    int log2_res_scale_abs_plus1 = ff_hevc_log2_res_scale_abs(s, idx);
 
-    if (last_significant_coeff_y > 3) {
-        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
-        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
-                                   (2 + (last_significant_coeff_y & 1)) +
-                                   suffix;
-    }
-
-    if (scan_idx == SCAN_VERT)
-        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
-
-    x_cg_last_sig = last_significant_coeff_x >> 2;
-    y_cg_last_sig = last_significant_coeff_y >> 2;
-
-    switch (scan_idx) {
-    case SCAN_DIAG: {
-        int last_x_c = last_significant_coeff_x & 3;
-        int last_y_c = last_significant_coeff_y & 3;
-
-        scan_x_off = ff_hevc_diag_scan4x4_x;
-        scan_y_off = ff_hevc_diag_scan4x4_y;
-        num_coeff  = diag_scan4x4_inv[last_y_c][last_x_c];
-        if (trafo_size == 4) {
-            scan_x_cg = scan_1x1;
-            scan_y_cg = scan_1x1;
-        } else if (trafo_size == 8) {
-            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = diag_scan2x2_x;
-            scan_y_cg  = diag_scan2x2_y;
-        } else if (trafo_size == 16) {
-            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = ff_hevc_diag_scan4x4_x;
-            scan_y_cg  = ff_hevc_diag_scan4x4_y;
-        } else { // trafo_size == 32
-            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = ff_hevc_diag_scan8x8_x;
-            scan_y_cg  = ff_hevc_diag_scan8x8_y;
-        }
-        break;
-    }
-    case SCAN_HORIZ:
-        scan_x_cg  = horiz_scan2x2_x;
-        scan_y_cg  = horiz_scan2x2_y;
-        scan_x_off = horiz_scan4x4_x;
-        scan_y_off = horiz_scan4x4_y;
-        num_coeff  = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-        break;
-    default: //SCAN_VERT
-        scan_x_cg  = horiz_scan2x2_y;
-        scan_y_cg  = horiz_scan2x2_x;
-        scan_x_off = horiz_scan4x4_y;
-        scan_y_off = horiz_scan4x4_x;
-        num_coeff  = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-        break;
+    if (log2_res_scale_abs_plus1 !=  0) {
+        int res_scale_sign_flag = ff_hevc_res_scale_sign_flag(s, idx);
+        lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
+                               (1 - 2 * res_scale_sign_flag);
+    } else {
+        lc->tu.res_scale_val = 0;
     }
-    num_coeff++;
-    num_last_subset = (num_coeff - 1) >> 4;
-
-    for (i = num_last_subset; i >= 0; i--) {
-        int n, m;
-        int x_cg, y_cg, x_c, y_c;
-        int implicit_non_zero_coeff = 0;
-        int64_t trans_coeff_level;
-        int prev_sig = 0;
-        int offset   = i << 4;
-
-        uint8_t significant_coeff_flag_idx[16];
-        uint8_t nb_significant_coeff_flag = 0;
-
-        x_cg = scan_x_cg[i];
-        y_cg = scan_y_cg[i];
-
-        if (i < num_last_subset && i > 0) {
-            int ctx_cg = 0;
-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ff_hevc_significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-            implicit_non_zero_coeff = 1;
-        } else {
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-                 (x_cg == 0 && y_cg == 0));
-        }
 
-        last_scan_pos = num_coeff - offset - 1;
 
-        if (i == num_last_subset) {
-            n_end                         = last_scan_pos - 1;
-            significant_coeff_flag_idx[0] = last_scan_pos;
-            nb_significant_coeff_flag     = 1;
-        } else {
-            n_end = 15;
-        }
-
-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig = significant_coeff_group_flag[x_cg + 1][y_cg];
-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig += significant_coeff_group_flag[x_cg][y_cg + 1] << 1;
-
-        for (n = n_end; n >= 0; n--) {
-            GET_COORD(offset, n);
-
-            if (significant_coeff_group_flag[x_cg][y_cg] &&
-                (n > 0 || implicit_non_zero_coeff == 0)) {
-                if (ff_hevc_significant_coeff_flag_decode(s, c_idx, x_c, y_c,
-                                                          log2_trafo_size,
-                                                          scan_idx,
-                                                          prev_sig) == 1) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
-                    implicit_non_zero_coeff = 0;
-                }
-            } else {
-                int last_cg = (x_c == (x_cg << 2) && y_c == (y_cg << 2));
-                if (last_cg && implicit_non_zero_coeff && significant_coeff_group_flag[x_cg][y_cg]) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
-                }
-            }
-        }
-
-        n_end = nb_significant_coeff_flag;
-
-        if (n_end) {
-            int first_nz_pos_in_cg = 16;
-            int last_nz_pos_in_cg = -1;
-            int c_rice_param = 0;
-            int first_greater1_coeff_idx = -1;
-            uint8_t coeff_abs_level_greater1_flag[16] = { 0 };
-            uint16_t coeff_sign_flag;
-            int sum_abs = 0;
-            int sign_hidden = 0;
-
-            // initialize first elem of coeff_bas_level_greater1_flag
-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
-
-            if (!(i == num_last_subset) && greater1_ctx == 0)
-                ctx_set++;
-            greater1_ctx      = 1;
-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-                int n_idx = significant_coeff_flag_idx[m];
-                int inc   = (ctx_set << 2) + greater1_ctx;
-                coeff_abs_level_greater1_flag[n_idx] =
-                    ff_hevc_coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-                if (coeff_abs_level_greater1_flag[n_idx]) {
-                    greater1_ctx = 0;
-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-                    greater1_ctx++;
-                }
-
-                if (coeff_abs_level_greater1_flag[n_idx] &&
-                    first_greater1_coeff_idx == -1)
-                    first_greater1_coeff_idx = n_idx;
-            }
-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-            sign_hidden        = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 &&
-                                 !lc->cu.cu_transquant_bypass_flag;
-
-            if (first_greater1_coeff_idx != -1) {
-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += ff_hevc_coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-            }
-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden) {
-                coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-            } else {
-                coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-            }
-
-            for (m = 0; m < n_end; m++) {
-                n = significant_coeff_flag_idx[m];
-                GET_COORD(offset, n);
-                trans_coeff_level = 1 + coeff_abs_level_greater1_flag[n];
-                if (trans_coeff_level == ((m < 8) ?
-                                          ((n == first_greater1_coeff_idx) ? 3 : 2) : 1)) {
-                    trans_coeff_level += ff_hevc_coeff_abs_level_remaining(s, trans_coeff_level, c_rice_param);
-                    if ((trans_coeff_level) > (3 * (1 << c_rice_param)))
-                        c_rice_param = FFMIN(c_rice_param + 1, 4);
-                }
-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-                    sum_abs += trans_coeff_level;
-                    if (n == first_nz_pos_in_cg && ((sum_abs & 1) == 1))
-                        trans_coeff_level = -trans_coeff_level;
-                }
-                if (coeff_sign_flag >> 15)
-                    trans_coeff_level = -trans_coeff_level;
-                coeff_sign_flag <<= 1;
-                if (!lc->cu.cu_transquant_bypass_flag) {
-                    if (s->ps.sps->scaling_list_enable_flag) {
-                        if (y_c || x_c || log2_trafo_size < 4) {
-                            int pos;
-                            switch (log2_trafo_size) {
-                            case 3:  pos = (y_c        << 3) +  x_c;       break;
-                            case 4:  pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-                            case 5:  pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-                            default: pos = (y_c        << 2) +  x_c;
-                            }
-                            scale_m = scale_matrix[pos];
-                        } else {
-                            scale_m = dc_scale;
-                        }
-                    }
-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-                    if(trans_coeff_level < 0) {
-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-                            trans_coeff_level = -32768;
-                    } else {
-                        if (trans_coeff_level & 0xffffffffffff8000)
-                            trans_coeff_level = 32767;
-                    }
-                }
-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
-            }
-        }
-    }
-
-    if (!lc->cu.cu_transquant_bypass_flag) {
-        if (transform_skip_flag)
-            s->hevcdsp.dequant(coeffs);
-        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 &&
-                 log2_trafo_size == 2)
-            s->hevcdsp.transform_4x4_luma(coeffs);
-        else {
-            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-            if (max_xy == 0)
-                s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
-            else {
-                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-                if (max_xy < 4)
-                    col_limit = FFMIN(4, col_limit);
-                else if (max_xy < 8)
-                    col_limit = FFMIN(8, col_limit);
-                else if (max_xy < 12)
-                    col_limit = FFMIN(24, col_limit);
-                s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
-            }
-        }
-    }
-    s->hevcdsp.add_residual[log2_trafo_size - 2](dst, coeffs, stride);
+    return 0;
 }
 
 static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                               int xBase, int yBase, int cb_xBase, int cb_yBase,
                               int log2_cb_size, int log2_trafo_size,
-                              int blk_idx, int cbf_luma, int cbf_cb, int cbf_cr)
+                              int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
+    const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1];
+    int i;
 
     if (lc->cu.pred_mode == MODE_INTRA) {
         int trafo_size = 1 << log2_trafo_size;
         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
 
         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
-        if (log2_trafo_size > 2) {
-            trafo_size = trafo_size << (s->ps.sps->hshift[1] - 1);
-            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-            s->hpc.intra_pred[log2_trafo_size - 3](s, x0, y0, 1);
-            s->hpc.intra_pred[log2_trafo_size - 3](s, x0, y0, 2);
-        } else if (blk_idx == 3) {
-            trafo_size = trafo_size << s->ps.sps->hshift[1];
-            ff_hevc_set_neighbour_available(s, xBase, yBase,
-                                            trafo_size, trafo_size);
-            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
-            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
-        }
     }
 
-    if (cbf_luma || cbf_cb || cbf_cr) {
+    if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+        (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) {
         int scan_idx   = SCAN_DIAG;
         int scan_idx_c = SCAN_DIAG;
+        int cbf_chroma = cbf_cb[0] || cbf_cr[0] ||
+                         (s->ps.sps->chroma_format_idc == 2 &&
+                         (cbf_cb[1] || cbf_cr[1]));
 
         if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) {
             lc->tu.cu_qp_delta = ff_hevc_cu_qp_delta_abs(s);
@@ -1308,41 +1048,167 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                 return AVERROR_INVALIDDATA;
             }
 
-            ff_hevc_set_qPy(s, x0, y0, cb_xBase, cb_yBase, log2_cb_size);
+            ff_hevc_set_qPy(s, cb_xBase, cb_yBase, log2_cb_size);
+        }
+
+        if (s->sh.cu_chroma_qp_offset_enabled_flag && cbf_chroma &&
+            !lc->cu.cu_transquant_bypass_flag  &&  !lc->tu.is_cu_chroma_qp_offset_coded) {
+            int cu_chroma_qp_offset_flag = ff_hevc_cu_chroma_qp_offset_flag(s);
+            if (cu_chroma_qp_offset_flag) {
+                int cu_chroma_qp_offset_idx  = 0;
+                if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
+                    cu_chroma_qp_offset_idx = ff_hevc_cu_chroma_qp_offset_idx(s);
+                    av_log(s->avctx, AV_LOG_ERROR,
+                        "cu_chroma_qp_offset_idx not yet tested.\n");
+                }
+                lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
+                lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
+            } else {
+                lc->tu.cu_qp_offset_cb = 0;
+                lc->tu.cu_qp_offset_cr = 0;
+            }
+            lc->tu.is_cu_chroma_qp_offset_coded = 1;
         }
 
         if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) {
-            if (lc->tu.cur_intra_pred_mode >= 6 &&
-                lc->tu.cur_intra_pred_mode <= 14) {
+            if (lc->tu.intra_pred_mode >= 6 &&
+                lc->tu.intra_pred_mode <= 14) {
                 scan_idx = SCAN_VERT;
-            } else if (lc->tu.cur_intra_pred_mode >= 22 &&
-                       lc->tu.cur_intra_pred_mode <= 30) {
+            } else if (lc->tu.intra_pred_mode >= 22 &&
+                       lc->tu.intra_pred_mode <= 30) {
                 scan_idx = SCAN_HORIZ;
             }
 
-            if (lc->pu.intra_pred_mode_c >=  6 &&
-                lc->pu.intra_pred_mode_c <= 14) {
+            if (lc->tu.intra_pred_mode_c >=  6 &&
+                lc->tu.intra_pred_mode_c <= 14) {
                 scan_idx_c = SCAN_VERT;
-            } else if (lc->pu.intra_pred_mode_c >= 22 &&
-                       lc->pu.intra_pred_mode_c <= 30) {
+            } else if (lc->tu.intra_pred_mode_c >= 22 &&
+                       lc->tu.intra_pred_mode_c <= 30) {
                 scan_idx_c = SCAN_HORIZ;
             }
         }
 
+        lc->tu.cross_pf = 0;
+
         if (cbf_luma)
-            hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
-        if (log2_trafo_size > 2) {
-            if (cbf_cb)
-                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 1);
-            if (cbf_cr)
-                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 2);
+            ff_hevc_hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
+        if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+            lc->tu.cross_pf  = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
+                                (lc->cu.pred_mode == MODE_INTER ||
+                                 (lc->tu.chroma_mode_c ==  4)));
+
+            if (lc->tu.cross_pf) {
+                hls_cross_component_pred(s, 0);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+                }
+                if (cbf_cb[i])
+                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+                                                log2_trafo_size_c, scan_idx_c, 1);
+                else
+                    if (lc->tu.cross_pf) {
+                        ptrdiff_t stride = s->frame->linesize[1];
+                        int hshift = s->ps.sps->hshift[1];
+                        int vshift = s->ps.sps->vshift[1];
+                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
+                        int size = 1 << log2_trafo_size_c;
+
+                        uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
+                                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+                        for (i = 0; i < (size * size); i++) {
+                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+                        }
+                        s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
+                    }
+            }
+
+            if (lc->tu.cross_pf) {
+                hls_cross_component_pred(s, 1);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+                }
+                if (cbf_cr[i])
+                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+                                                log2_trafo_size_c, scan_idx_c, 2);
+                else
+                    if (lc->tu.cross_pf) {
+                        ptrdiff_t stride = s->frame->linesize[2];
+                        int hshift = s->ps.sps->hshift[2];
+                        int vshift = s->ps.sps->vshift[2];
+                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
+                        int size = 1 << log2_trafo_size_c;
+
+                        uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
+                                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+                        for (i = 0; i < (size * size); i++) {
+                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+                        }
+                        s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
+                    }
+            }
+        } else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
+            int trafo_size_h = 1 << (log2_trafo_size + 1);
+            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                    trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+                }
+                if (cbf_cb[i])
+                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+                                                log2_trafo_size, scan_idx_c, 1);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+                }
+                if (cbf_cr[i])
+                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+                                                log2_trafo_size, scan_idx_c, 2);
+            }
+        }
+    } else if (s->ps.sps->chroma_format_idc && lc->cu.pred_mode == MODE_INTRA) {
+        if (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3) {
+            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+            if (s->ps.sps->chroma_format_idc == 2) {
+                ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+                                                trafo_size_h, trafo_size_v);
+                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+            }
         } else if (blk_idx == 3) {
-            if (cbf_cb)
-                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 1);
-            if (cbf_cr)
-                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 2);
+            int trafo_size_h = 1 << (log2_trafo_size + 1);
+            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+            ff_hevc_set_neighbour_available(s, xBase, yBase,
+                                            trafo_size_h, trafo_size_v);
+            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+            if (s->ps.sps->chroma_format_idc == 2) {
+                ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+                                                trafo_size_h, trafo_size_v);
+                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+            }
         }
     }
+
     return 0;
 }
 
@@ -1365,17 +1231,34 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0,
                               int xBase, int yBase, int cb_xBase, int cb_yBase,
                               int log2_cb_size, int log2_trafo_size,
                               int trafo_depth, int blk_idx,
-                              int cbf_cb, int cbf_cr)
+                              const int *base_cbf_cb, const int *base_cbf_cr)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     uint8_t split_transform_flag;
+    int cbf_cb[2];
+    int cbf_cr[2];
     int ret;
 
+    cbf_cb[0] = base_cbf_cb[0];
+    cbf_cb[1] = base_cbf_cb[1];
+    cbf_cr[0] = base_cbf_cr[0];
+    cbf_cr[1] = base_cbf_cr[1];
+
     if (lc->cu.intra_split_flag) {
-        if (trafo_depth == 1)
-            lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
+        if (trafo_depth == 1) {
+            lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[blk_idx];
+            if (s->ps.sps->chroma_format_idc == 3) {
+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[blk_idx];
+            } else {
+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
+            }
+        }
     } else {
-        lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[0];
+        lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[0];
+        lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+        lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
     }
 
     if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
@@ -1394,14 +1277,21 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0,
                                inter_split;
     }
 
-    if (log2_trafo_size > 2 && (trafo_depth == 0 || cbf_cb))
-        cbf_cb = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
-    else if (log2_trafo_size > 2 || trafo_depth == 0)
-        cbf_cb = 0;
-    if (log2_trafo_size > 2 && (trafo_depth == 0 || cbf_cr))
-        cbf_cr = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
-    else if (log2_trafo_size > 2 || trafo_depth == 0)
-        cbf_cr = 0;
+    if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+        if (trafo_depth == 0 || cbf_cb[0]) {
+            cbf_cb[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
+                cbf_cb[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            }
+        }
+
+        if (trafo_depth == 0 || cbf_cr[0]) {
+            cbf_cr[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
+                cbf_cr[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            }
+        }
+    }
 
     if (split_transform_flag) {
         const int trafo_size_split = 1 << (log2_trafo_size - 1);
@@ -1430,8 +1320,10 @@ do {
         int cbf_luma         = 1;
 
         if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 ||
-            cbf_cb || cbf_cr)
+            cbf_cb[0] || cbf_cr[0] ||
+            (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) {
             cbf_luma = ff_hevc_cbf_luma_decode(s, trafo_depth);
+        }
 
         ret = hls_transform_unit(s, x0, y0, xBase, yBase, cb_xBase, cb_yBase,
                                  log2_cb_size, log2_trafo_size,
@@ -1460,8 +1352,7 @@ do {
 
 static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
 {
-    //TODO: non-4:2:0 support
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     GetBitContext gb;
     int cb_size   = 1 << log2_cb_size;
     ptrdiff_t stride0 = s->frame->linesize[0];
@@ -1471,7 +1362,10 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
     uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
     uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
 
-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth + ((cb_size * cb_size) >> 1) * s->ps.sps->pcm.bit_depth_chroma;
+    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
+                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
+                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
+                          s->ps.sps->pcm.bit_depth_chroma;
     const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
     int ret;
 
@@ -1482,38 +1376,23 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
     if (ret < 0)
         return ret;
 
-    s->hevcdsp.put_pcm(dst0, stride0, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
-    s->hevcdsp.put_pcm(dst1, stride1, cb_size / 2, &gb, s->ps.sps->pcm.bit_depth_chroma);
-    s->hevcdsp.put_pcm(dst2, stride2, cb_size / 2, &gb, s->ps.sps->pcm.bit_depth_chroma);
-    return 0;
-}
-
-static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
-{
-    HEVCLocalContext *lc = &s->HEVClc;
-    int x = ff_hevc_abs_mvd_greater0_flag_decode(s);
-    int y = ff_hevc_abs_mvd_greater0_flag_decode(s);
-
-    if (x)
-        x += ff_hevc_abs_mvd_greater1_flag_decode(s);
-    if (y)
-        y += ff_hevc_abs_mvd_greater1_flag_decode(s);
-
-    switch (x) {
-    case 2: lc->pu.mvd.x = ff_hevc_mvd_decode(s);           break;
-    case 1: lc->pu.mvd.x = ff_hevc_mvd_sign_flag_decode(s); break;
-    case 0: lc->pu.mvd.x = 0;                               break;
+    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
+    if (s->ps.sps->chroma_format_idc) {
+        s->hevcdsp.put_pcm(dst1, stride1,
+                           cb_size >> s->ps.sps->hshift[1],
+                           cb_size >> s->ps.sps->vshift[1],
+                           &gb, s->ps.sps->pcm.bit_depth_chroma);
+        s->hevcdsp.put_pcm(dst2, stride2,
+                           cb_size >> s->ps.sps->hshift[2],
+                           cb_size >> s->ps.sps->vshift[2],
+                           &gb, s->ps.sps->pcm.bit_depth_chroma);
     }
 
-    switch (y) {
-    case 2: lc->pu.mvd.y = ff_hevc_mvd_decode(s);           break;
-    case 1: lc->pu.mvd.y = ff_hevc_mvd_sign_flag_decode(s); break;
-    case 0: lc->pu.mvd.y = 0;                               break;
-    }
+    return 0;
 }
 
 /**
- * 8.5.3.2.2.1 Luma sample interpolation process
+ * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
  *
  * @param s HEVC decoding context
  * @param dst target buffer for block data at block position
@@ -1524,49 +1403,147 @@ static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
  * @param y_off vertical position of block from origin (0, 0)
  * @param block_w width of block
  * @param block_h height of block
+ * @param luma_weight weighting factor applied to the luma prediction
+ * @param luma_offset additive offset applied to the luma prediction value
  */
-static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
-                    AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                    int block_w, int block_h, int pred_idx)
+
+static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+                        int block_w, int block_h, int luma_weight, int luma_offset)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     uint8_t *src         = ref->data[0];
     ptrdiff_t srcstride  = ref->linesize[0];
     int pic_width        = s->ps.sps->width;
     int pic_height       = s->ps.sps->height;
-
-    int mx         = mv->x & 3;
-    int my         = mv->y & 3;
-    int extra_left = ff_hevc_qpel_extra_before[mx];
-    int extra_top  = ff_hevc_qpel_extra_before[my];
+    int mx               = mv->x & 3;
+    int my               = mv->y & 3;
+    int weight_flag      = (s->sh.slice_type == HEVC_SLICE_P && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag);
+    int idx              = ff_hevc_pel_weight[block_w];
 
     x_off += mv->x >> 2;
     y_off += mv->y >> 2;
     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
 
-    if (x_off < extra_left || y_off < extra_top ||
-        x_off >= pic_width - block_w - ff_hevc_qpel_extra_after[mx] ||
-        y_off >= pic_height - block_h - ff_hevc_qpel_extra_after[my]) {
+    if (x_off < QPEL_EXTRA_BEFORE || y_off < QPEL_EXTRA_AFTER ||
+        x_off >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off >= pic_height - block_h - QPEL_EXTRA_AFTER) {
         const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
-        int offset = extra_top * srcstride + (extra_left << s->ps.sps->pixel_shift);
-        int buf_offset = extra_top *
-                         edge_emu_stride + (extra_left << s->ps.sps->pixel_shift);
+        int offset     = QPEL_EXTRA_BEFORE * srcstride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
 
         s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src - offset,
                                  edge_emu_stride, srcstride,
-                                 block_w + ff_hevc_qpel_extra[mx],
-                                 block_h + ff_hevc_qpel_extra[my],
-                                 x_off - extra_left, y_off - extra_top,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off - QPEL_EXTRA_BEFORE, y_off - QPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
         src = lc->edge_emu_buffer + buf_offset;
         srcstride = edge_emu_stride;
     }
-    s->hevcdsp.put_hevc_qpel[!!my][!!mx][pred_idx](dst, dststride, src, srcstride,
-                                                   block_h, mx, my, lc->mc_buffer);
+
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_qpel_uni[idx][!!my][!!mx](dst, dststride, src, srcstride,
+                                                      block_h, mx, my, block_w);
+    else
+        s->hevcdsp.put_hevc_qpel_uni_w[idx][!!my][!!mx](dst, dststride, src, srcstride,
+                                                        block_h, s->sh.luma_log2_weight_denom,
+                                                        luma_weight, luma_offset, mx, my, block_w);
 }
 
 /**
- * 8.5.3.2.2.2 Chroma sample interpolation process
+ * 8.5.3.2.2.1 Luma sample bidirectional interpolation process
+ *
+ * @param s HEVC decoding context
+ * @param dst target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref0 reference picture0 buffer at origin (0, 0)
+ * @param mv0 motion vector0 (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ * @param ref1 reference picture1 buffer at origin (0, 0)
+ * @param mv1 motion vector1 (relative to block position) to get pixel data from
+ * @param current_mv current motion vector structure
+ */
+ static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    ptrdiff_t src0stride  = ref0->linesize[0];
+    ptrdiff_t src1stride  = ref1->linesize[0];
+    int pic_width        = s->ps.sps->width;
+    int pic_height       = s->ps.sps->height;
+    int mx0              = mv0->x & 3;
+    int my0              = mv0->y & 3;
+    int mx1              = mv1->x & 3;
+    int my1              = mv1->y & 3;
+    int weight_flag      = (s->sh.slice_type == HEVC_SLICE_P && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag);
+    int x_off0           = x_off + (mv0->x >> 2);
+    int y_off0           = y_off + (mv0->y >> 2);
+    int x_off1           = x_off + (mv1->x >> 2);
+    int y_off1           = y_off + (mv1->y >> 2);
+    int idx              = ff_hevc_pel_weight[block_w];
+
+    uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+    uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+
+    if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+        x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+        const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset     = QPEL_EXTRA_BEFORE * src0stride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src0 - offset,
+                                 edge_emu_stride, src0stride,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off0 - QPEL_EXTRA_BEFORE, y_off0 - QPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src0 = lc->edge_emu_buffer + buf_offset;
+        src0stride = edge_emu_stride;
+    }
+
+    if (x_off1 < QPEL_EXTRA_BEFORE || y_off1 < QPEL_EXTRA_AFTER ||
+        x_off1 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off1 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+        const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset     = QPEL_EXTRA_BEFORE * src1stride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer2, src1 - offset,
+                                 edge_emu_stride, src1stride,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off1 - QPEL_EXTRA_BEFORE, y_off1 - QPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src1 = lc->edge_emu_buffer2 + buf_offset;
+        src1stride = edge_emu_stride;
+    }
+
+    s->hevcdsp.put_hevc_qpel[idx][!!my0][!!mx0](lc->tmp, src0, src0stride,
+                                                block_h, mx0, my0, block_w);
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_qpel_bi[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, lc->tmp,
+                                                       block_h, mx1, my1, block_w);
+    else
+        s->hevcdsp.put_hevc_qpel_bi_w[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, lc->tmp,
+                                                         block_h, s->sh.luma_log2_weight_denom,
+                                                         s->sh.luma_weight_l0[current_mv->ref_idx[0]],
+                                                         s->sh.luma_weight_l1[current_mv->ref_idx[1]],
+                                                         s->sh.luma_offset_l0[current_mv->ref_idx[0]],
+                                                         s->sh.luma_offset_l1[current_mv->ref_idx[1]],
+                                                         mx1, my1, block_w);
+
+}
+
+/**
+ * 8.5.3.2.2.2 Chroma sample uniprediction interpolation process
  *
  * @param s HEVC decoding context
  * @param dst1 target buffer for block data at block position (U plane)
@@ -1578,85 +1555,185 @@ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
  * @param y_off vertical position of block from origin (0, 0)
  * @param block_w width of block
  * @param block_h height of block
+ * @param chroma_weight weighting factor applied to the chroma prediction
+ * @param chroma_offset additive offset applied to the chroma prediction value
  */
-static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2,
-                      ptrdiff_t dststride, AVFrame *ref, const Mv *mv,
-                      int x_off, int y_off, int block_w, int block_h, int pred_idx)
+
+static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
-    uint8_t *src1        = ref->data[1];
-    uint8_t *src2        = ref->data[2];
-    ptrdiff_t src1stride = ref->linesize[1];
-    ptrdiff_t src2stride = ref->linesize[2];
-    int pic_width        = s->ps.sps->width >> 1;
-    int pic_height       = s->ps.sps->height >> 1;
-
-    int mx = mv->x & 7;
-    int my = mv->y & 7;
-
-    x_off += mv->x >> 3;
-    y_off += mv->y >> 3;
-    src1  += y_off * src1stride + (x_off * (1 << s->ps.sps->pixel_shift));
-    src2  += y_off * src2stride + (x_off * (1 << s->ps.sps->pixel_shift));
+    HEVCLocalContext *lc = s->HEVClc;
+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+    const Mv *mv         = &current_mv->mv[reflist];
+    int weight_flag      = (s->sh.slice_type == HEVC_SLICE_P && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag);
+    int idx              = ff_hevc_pel_weight[block_w];
+    int hshift           = s->ps.sps->hshift[1];
+    int vshift           = s->ps.sps->vshift[1];
+    intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+    intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+    intptr_t _mx         = mx << (1 - hshift);
+    intptr_t _my         = my << (1 - vshift);
+
+    x_off += mv->x >> (2 + hshift);
+    y_off += mv->y >> (2 + vshift);
+    src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
 
     if (x_off < EPEL_EXTRA_BEFORE || y_off < EPEL_EXTRA_AFTER ||
         x_off >= pic_width - block_w - EPEL_EXTRA_AFTER ||
         y_off >= pic_height - block_h - EPEL_EXTRA_AFTER) {
-        const ptrdiff_t edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset0 = EPEL_EXTRA_BEFORE * (srcstride + (1 << s->ps.sps->pixel_shift));
+        int buf_offset0 = EPEL_EXTRA_BEFORE *
+                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src0 - offset0,
+                                 edge_emu_stride, srcstride,
+                                 block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
+                                 x_off - EPEL_EXTRA_BEFORE,
+                                 y_off - EPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+
+        src0 = lc->edge_emu_buffer + buf_offset0;
+        srcstride = edge_emu_stride;
+    }
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_epel_uni[idx][!!my][!!mx](dst0, dststride, src0, srcstride,
+                                                  block_h, _mx, _my, block_w);
+    else
+        s->hevcdsp.put_hevc_epel_uni_w[idx][!!my][!!mx](dst0, dststride, src0, srcstride,
+                                                        block_h, s->sh.chroma_log2_weight_denom,
+                                                        chroma_weight, chroma_offset, _mx, _my, block_w);
+}
+
+/**
+ * 8.5.3.2.2.2 Chroma sample bidirectional interpolation process
+ *
+ * @param s HEVC decoding context
+ * @param dst target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref0 reference picture0 buffer at origin (0, 0)
+ * @param mv0 motion vector0 (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ * @param ref1 reference picture1 buffer at origin (0, 0)
+ * @param mv1 motion vector1 (relative to block position) to get pixel data from
+ * @param current_mv current motion vector structure
+ * @param cidx chroma component(cb, cr)
+ */
+static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    uint8_t *src1        = ref0->data[cidx+1];
+    uint8_t *src2        = ref1->data[cidx+1];
+    ptrdiff_t src1stride = ref0->linesize[cidx+1];
+    ptrdiff_t src2stride = ref1->linesize[cidx+1];
+    int weight_flag      = (s->sh.slice_type == HEVC_SLICE_P && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag);
+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+    Mv *mv0              = &current_mv->mv[0];
+    Mv *mv1              = &current_mv->mv[1];
+    int hshift = s->ps.sps->hshift[1];
+    int vshift = s->ps.sps->vshift[1];
+
+    intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+    intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+    intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+    intptr_t my1 = av_mod_uintp2(mv1->y, 2 + vshift);
+    intptr_t _mx0 = mx0 << (1 - hshift);
+    intptr_t _my0 = my0 << (1 - vshift);
+    intptr_t _mx1 = mx1 << (1 - hshift);
+    intptr_t _my1 = my1 << (1 - vshift);
+
+    int x_off0 = x_off + (mv0->x >> (2 + hshift));
+    int y_off0 = y_off + (mv0->y >> (2 + vshift));
+    int x_off1 = x_off + (mv1->x >> (2 + hshift));
+    int y_off1 = y_off + (mv1->y >> (2 + vshift));
+    int idx = ff_hevc_pel_weight[block_w];
+    src1  += y_off0 * src1stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+    src2  += y_off1 * src2stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+
+    if (x_off0 < EPEL_EXTRA_BEFORE || y_off0 < EPEL_EXTRA_AFTER ||
+        x_off0 >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off0 >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
         int offset1 = EPEL_EXTRA_BEFORE * (src1stride + (1 << s->ps.sps->pixel_shift));
         int buf_offset1 = EPEL_EXTRA_BEFORE *
                           (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
-        int offset2 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->ps.sps->pixel_shift));
-        int buf_offset2 = EPEL_EXTRA_BEFORE *
-                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
 
         s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src1 - offset1,
                                  edge_emu_stride, src1stride,
                                  block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
-                                 x_off - EPEL_EXTRA_BEFORE,
-                                 y_off - EPEL_EXTRA_BEFORE,
+                                 x_off0 - EPEL_EXTRA_BEFORE,
+                                 y_off0 - EPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
 
         src1 = lc->edge_emu_buffer + buf_offset1;
         src1stride = edge_emu_stride;
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst1, dststride, src1, src1stride,
-                                                       block_h, mx, my, lc->mc_buffer);
+    }
+
+    if (x_off1 < EPEL_EXTRA_BEFORE || y_off1 < EPEL_EXTRA_AFTER ||
+        x_off1 >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off1 >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset1 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->ps.sps->pixel_shift));
+        int buf_offset1 = EPEL_EXTRA_BEFORE *
+                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
 
-        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src2 - offset2,
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer2, src2 - offset1,
                                  edge_emu_stride, src2stride,
                                  block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
-                                 x_off - EPEL_EXTRA_BEFORE,
-                                 y_off - EPEL_EXTRA_BEFORE,
+                                 x_off1 - EPEL_EXTRA_BEFORE,
+                                 y_off1 - EPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
-        src2 = lc->edge_emu_buffer + buf_offset2;
-        src2stride = edge_emu_stride;
 
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst2, dststride, src2, src2stride,
-                                                       block_h, mx, my, lc->mc_buffer);
-    } else {
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst1, dststride, src1, src1stride,
-                                                       block_h, mx, my, lc->mc_buffer);
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst2, dststride, src2, src2stride,
-                                                       block_h, mx, my, lc->mc_buffer);
+        src2 = lc->edge_emu_buffer2 + buf_offset1;
+        src2stride = edge_emu_stride;
     }
+
+    s->hevcdsp.put_hevc_epel[idx][!!my0][!!mx0](lc->tmp, src1, src1stride,
+                                                block_h, _mx0, _my0, block_w);
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_epel_bi[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1],
+                                                       src2, src2stride, lc->tmp,
+                                                       block_h, _mx1, _my1, block_w);
+    else
+        s->hevcdsp.put_hevc_epel_bi_w[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1],
+                                                         src2, src2stride, lc->tmp,
+                                                         block_h,
+                                                         s->sh.chroma_log2_weight_denom,
+                                                         s->sh.chroma_weight_l0[current_mv->ref_idx[0]][cidx],
+                                                         s->sh.chroma_weight_l1[current_mv->ref_idx[1]][cidx],
+                                                         s->sh.chroma_offset_l0[current_mv->ref_idx[0]][cidx],
+                                                         s->sh.chroma_offset_l1[current_mv->ref_idx[1]][cidx],
+                                                         _mx1, _my1, block_w);
 }
 
 static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
                                 const Mv *mv, int y0, int height)
 {
-    int y = (mv->y >> 2) + y0 + height + 9;
-    ff_thread_await_progress(&ref->tf, y, 0);
+    if (s->threads_type == FF_THREAD_FRAME ) {
+        int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+
+        ff_thread_await_progress(&ref->tf, y, 0);
+    }
 }
 
-static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
+static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
                                   int nPbH, int log2_cb_size, int part_idx,
                                   int merge_idx, MvField *mv)
 {
-    HEVCLocalContext *lc             = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     enum InterPredIdc inter_pred_idc = PRED_L0;
     int mvp_flag;
 
     ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH);
+    mv->pred_flag = 0;
     if (s->sh.slice_type == HEVC_SLICE_B)
         inter_pred_idc = ff_hevc_inter_pred_idc_decode(s, nPbW, nPbH);
 
@@ -1664,8 +1741,8 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
         if (s->sh.nb_refs[L0])
             mv->ref_idx[0]= ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L0]);
 
-        mv->pred_flag[0] = 1;
-        hls_mvd_coding(s, x0, y0, 0);
+        mv->pred_flag = PF_L0;
+        ff_hevc_hls_mvd_coding(s, x0, y0, 0);
         mvp_flag = ff_hevc_mvp_lx_flag_decode(s);
         ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                  part_idx, merge_idx, mv, mvp_flag, 0);
@@ -1680,10 +1757,10 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
         if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
             AV_ZERO32(&lc->pu.mvd);
         } else {
-            hls_mvd_coding(s, x0, y0, 1);
+            ff_hevc_hls_mvd_coding(s, x0, y0, 1);
         }
 
-        mv->pred_flag[1] = 1;
+        mv->pred_flag += PF_L1;
         mvp_flag = ff_hevc_mvp_lx_flag_decode(s);
         ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                  part_idx, merge_idx, mv, mvp_flag, 1);
@@ -1694,30 +1771,20 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
 
 static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
                                 int nPbW, int nPbH,
-                                int log2_cb_size, int partIdx)
+                                int log2_cb_size, int partIdx, int idx)
 {
-    static const int pred_indices[] = {
-        [4] = 0, [8] = 1, [12] = 2, [16] = 3, [24] = 4, [32] = 5, [48] = 6, [64] = 7,
-    };
-    const int pred_idx = pred_indices[nPbW];
-
 #define POS(c_idx, x, y)                                                              \
     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int merge_idx = 0;
     struct MvField current_mv = {{{ 0 }}};
 
     int min_pu_width = s->ps.sps->min_pu_width;
-    int weighted_pred = (s->sh.slice_type == HEVC_SLICE_P && s->ps.pps->weighted_pred_flag) ||
-                        (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag);
 
     MvField *tab_mvf = s->ref->tab_mvf;
     RefPicList  *refPicList = s->ref->refPicList;
-    HEVCFrame *ref0, *ref1;
-
-    ptrdiff_t tmpstride = MAX_PB_SIZE * sizeof(int16_t);
-
+    HEVCFrame *ref0 = NULL, *ref1 = NULL;
     uint8_t *dst0 = POS(0, x0, y0);
     uint8_t *dst1 = POS(1, x0, y0);
     uint8_t *dst2 = POS(2, x0, y0);
@@ -1742,7 +1809,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
         ff_hevc_luma_mv_merge_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                    partIdx, merge_idx, &current_mv);
     } else {
-        hevc_luma_mv_mpv_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
+        hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                               partIdx, merge_idx, &current_mv);
     }
 
@@ -1753,133 +1820,74 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
         for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++)
             tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv;
 
-    if (current_mv.pred_flag[0]) {
+    if (current_mv.pred_flag & PF_L0) {
         ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
         if (!ref0)
             return;
         hevc_await_progress(s, ref0, &current_mv.mv[0], y0, nPbH);
     }
-    if (current_mv.pred_flag[1]) {
+    if (current_mv.pred_flag & PF_L1) {
         ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
         if (!ref1)
             return;
         hevc_await_progress(s, ref1, &current_mv.mv[1], y0, nPbH);
     }
 
-    if (current_mv.pred_flag[0] && !current_mv.pred_flag[1]) {
-        LOCAL_ALIGNED_16(int16_t,  tmp, [MAX_PB_SIZE * MAX_PB_SIZE]);
-        LOCAL_ALIGNED_16(int16_t, tmp2, [MAX_PB_SIZE * MAX_PB_SIZE]);
+    if (current_mv.pred_flag == PF_L0) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
 
-        luma_mc(s, tmp, tmpstride, ref0->frame,
-                &current_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx);
+        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                    s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                    s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
 
-        if (weighted_pred) {
-            s->hevcdsp.weighted_pred[pred_idx](s->sh.luma_log2_weight_denom,
-                                               s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                                               s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-                                               dst0, s->frame->linesize[0], tmp,
-                                               tmpstride, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred[pred_idx](dst0, s->frame->linesize[0], tmp, tmpstride, nPbH);
-        }
-        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
-                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-
-        if (weighted_pred) {
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
-                                                      s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
-                                                      dst1, s->frame->linesize[1], tmp, tmpstride,
-                                                      nPbH / 2);
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
-                                                      s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
-                                                      dst2, s->frame->linesize[2], tmp2, tmpstride,
-                                                      nPbH / 2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst1, s->frame->linesize[1], tmp,  tmpstride, nPbH / 2);
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH / 2);
-        }
-    } else if (!current_mv.pred_flag[0] && current_mv.pred_flag[1]) {
-        LOCAL_ALIGNED_16(int16_t, tmp,  [MAX_PB_SIZE * MAX_PB_SIZE]);
-        LOCAL_ALIGNED_16(int16_t, tmp2, [MAX_PB_SIZE * MAX_PB_SIZE]);
-
-        luma_mc(s, tmp, tmpstride, ref1->frame,
-                &current_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx);
-
-        if (weighted_pred) {
-            s->hevcdsp.weighted_pred[pred_idx](s->sh.luma_log2_weight_denom,
-                                               s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                                               s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-                                               dst0, s->frame->linesize[0], tmp, tmpstride,
-                                               nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred[pred_idx](dst0, s->frame->linesize[0], tmp, tmpstride, nPbH);
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+                          0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+                          0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
         }
+    } else if (current_mv.pred_flag == PF_L1) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
 
-        chroma_mc(s, tmp, tmp2, tmpstride, ref1->frame,
-                  &current_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
+        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+                    &current_mv.mv[1], x0, y0, nPbW, nPbH,
+                    s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                    s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
 
-        if (weighted_pred) {
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
-                                                      s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
-                                                      dst1, s->frame->linesize[1], tmp, tmpstride, nPbH/2);
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
-                                                      s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
-                                                      dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH/2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst1, s->frame->linesize[1], tmp,  tmpstride, nPbH / 2);
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH / 2);
-        }
-    } else if (current_mv.pred_flag[0] && current_mv.pred_flag[1]) {
-        LOCAL_ALIGNED_16(int16_t, tmp,  [MAX_PB_SIZE * MAX_PB_SIZE]);
-        LOCAL_ALIGNED_16(int16_t, tmp2, [MAX_PB_SIZE * MAX_PB_SIZE]);
-        LOCAL_ALIGNED_16(int16_t, tmp3, [MAX_PB_SIZE * MAX_PB_SIZE]);
-        LOCAL_ALIGNED_16(int16_t, tmp4, [MAX_PB_SIZE * MAX_PB_SIZE]);
-
-        luma_mc(s, tmp, tmpstride, ref0->frame,
-                &current_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx);
-        luma_mc(s, tmp2, tmpstride, ref1->frame,
-                &current_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx);
-
-        if (weighted_pred) {
-            s->hevcdsp.weighted_pred_avg[pred_idx](s->sh.luma_log2_weight_denom,
-                                                   s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                                                   s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                                                   s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-                                                   s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-                                                   dst0, s->frame->linesize[0],
-                                                   tmp, tmp2, tmpstride, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred_avg[pred_idx](dst0, s->frame->linesize[0],
-                                                         tmp, tmp2, tmpstride, nPbH);
-        }
-
-        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
-                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-        chroma_mc(s, tmp3, tmp4, tmpstride, ref1->frame,
-                  &current_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-
-        if (weighted_pred) {
-            s->hevcdsp.weighted_pred_avg_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
-                                                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
-                                                          s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
-                                                          s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
-                                                          dst1, s->frame->linesize[1], tmp, tmp3,
-                                                          tmpstride, nPbH / 2);
-            s->hevcdsp.weighted_pred_avg_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
-                                                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
-                                                          s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
-                                                          s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
-                                                          dst2, s->frame->linesize[2], tmp2, tmp4,
-                                                          tmpstride, nPbH / 2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_avg_chroma[pred_idx](dst1, s->frame->linesize[1], tmp, tmp3,  tmpstride, nPbH/2);
-            s->hevcdsp.put_unweighted_pred_avg_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbH/2);
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+                          1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+
+            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+                          1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+        }
+    } else if (current_mv.pred_flag == PF_BI) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+
+        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+                   &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                   ref1->frame, &current_mv.mv[1], &current_mv);
+
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+                         x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+
+            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+                         x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
         }
     }
 }
@@ -1890,13 +1898,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
 static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
                                 int prev_intra_luma_pred_flag)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int x_pu             = x0 >> s->ps.sps->log2_min_pu_size;
     int y_pu             = y0 >> s->ps.sps->log2_min_pu_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
     int size_in_pus      = pu_size >> s->ps.sps->log2_min_pu_size;
-    int x0b              = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b              = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b              = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b              = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
     int cand_up   = (lc->ctb_up_flag || y0b) ?
                     s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
@@ -1960,15 +1968,7 @@ static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
                intra_pred_mode, size_in_pus);
 
         for (j = 0; j < size_in_pus; j++) {
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].is_intra     = 1;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag[0] = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag[1] = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].ref_idx[0]   = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].ref_idx[1]   = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[0].x      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[0].y      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[1].x      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[1].y      = 0;
+            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag = PF_INTRA;
         }
     }
 
@@ -1988,10 +1988,14 @@ static av_always_inline void set_ct_depth(HEVCContext *s, int x0, int y0,
                ct_depth, length);
 }
 
+static const uint8_t tab_mode_idx[] = {
+     0,  1,  2,  2,  2,  2,  3,  5,  7,  8, 10, 12, 13, 15, 17, 18, 19, 20,
+    21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
+
 static void intra_prediction_unit(HEVCContext *s, int x0, int y0,
                                   int log2_cb_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
     uint8_t prev_intra_luma_pred_flag[4];
     int split   = lc->cu.part_mode == PART_NxN;
@@ -2017,14 +2021,42 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0,
         }
     }
 
-    chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
-    if (chroma_mode != 4) {
-        if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-            lc->pu.intra_pred_mode_c = 34;
-        else
-            lc->pu.intra_pred_mode_c = intra_chroma_table[chroma_mode];
-    } else {
-        lc->pu.intra_pred_mode_c = lc->pu.intra_pred_mode[0];
+    if (s->ps.sps->chroma_format_idc == 3) {
+        for (i = 0; i < side; i++) {
+            for (j = 0; j < side; j++) {
+                lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+                if (chroma_mode != 4) {
+                    if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode])
+                        lc->pu.intra_pred_mode_c[2 * i + j] = 34;
+                    else
+                        lc->pu.intra_pred_mode_c[2 * i + j] = intra_chroma_table[chroma_mode];
+                } else {
+                    lc->pu.intra_pred_mode_c[2 * i + j] = lc->pu.intra_pred_mode[2 * i + j];
+                }
+            }
+        }
+    } else if (s->ps.sps->chroma_format_idc == 2) {
+        int mode_idx;
+        lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+        if (chroma_mode != 4) {
+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+                mode_idx = 34;
+            else
+                mode_idx = intra_chroma_table[chroma_mode];
+        } else {
+            mode_idx = lc->pu.intra_pred_mode[0];
+        }
+        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
+    } else if (s->ps.sps->chroma_format_idc != 0) {
+        chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+        if (chroma_mode != 4) {
+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+                lc->pu.intra_pred_mode_c[0] = 34;
+            else
+                lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
+        } else {
+            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
+        }
     }
 }
 
@@ -2032,7 +2064,7 @@ static void intra_prediction_unit_default_value(HEVCContext *s,
                                                 int x0, int y0,
                                                 int log2_cb_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int pb_size          = 1 << log2_cb_size;
     int size_in_pus      = pb_size >> s->ps.sps->log2_min_pu_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
@@ -2043,22 +2075,25 @@ static void intra_prediction_unit_default_value(HEVCContext *s,
 
     if (size_in_pus == 0)
         size_in_pus = 1;
-    for (j = 0; j < size_in_pus; j++) {
+    for (j = 0; j < size_in_pus; j++)
         memset(&s->tab_ipm[(y_pu + j) * min_pu_width + x_pu], INTRA_DC, size_in_pus);
-        for (k = 0; k < size_in_pus; k++)
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].is_intra = lc->cu.pred_mode == MODE_INTRA;
-    }
+    if (lc->cu.pred_mode == MODE_INTRA)
+        for (j = 0; j < size_in_pus; j++)
+            for (k = 0; k < size_in_pus; k++)
+                tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA;
 }
 
 static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
 {
     int cb_size          = 1 << log2_cb_size;
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
     int length           = cb_size >> log2_min_cb_size;
     int min_cb_width     = s->ps.sps->min_cb_width;
     int x_cb             = x0 >> log2_min_cb_size;
     int y_cb             = y0 >> log2_min_cb_size;
+    int idx              = log2_cb_size - 2;
+    int qp_block_mask    = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
     int x, y, ret;
 
     lc->cu.x                = x0;
@@ -2086,10 +2121,16 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
             x += min_cb_width;
         }
         lc->cu.pred_mode = skip_flag ? MODE_SKIP : MODE_INTER;
+    } else {
+        x = y_cb * min_cb_width + x_cb;
+        for (y = 0; y < length; y++) {
+            memset(&s->skip_flag[x], 0, length);
+            x += min_cb_width;
+        }
     }
 
     if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) {
-        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
         intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
 
         if (!s->sh.disable_deblocking_filter_flag)
@@ -2127,37 +2168,37 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
             intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
             switch (lc->cu.part_mode) {
             case PART_2Nx2N:
-                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
                 break;
             case PART_2NxN:
-                hls_prediction_unit(s, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
                 break;
             case PART_Nx2N:
-                hls_prediction_unit(s, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
                 break;
             case PART_2NxnU:
-                hls_prediction_unit(s, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx);
                 break;
             case PART_2NxnD:
-                hls_prediction_unit(s, x0, y0,                   cb_size, cb_size * 3 / 4, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size     / 4, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,                   cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size     / 4, log2_cb_size, 1, idx);
                 break;
             case PART_nLx2N:
-                hls_prediction_unit(s, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0, idx - 2);
+                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
                 break;
             case PART_nRx2N:
-                hls_prediction_unit(s, x0,                   y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size     / 4, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,                   y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2);
+                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size     / 4, cb_size, log2_cb_size, 1, idx - 2);
                 break;
             case PART_NxN:
-                hls_prediction_unit(s, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1);
-                hls_prediction_unit(s, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3);
+                hls_prediction_unit(s, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
+                hls_prediction_unit(s, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
                 break;
             }
         }
@@ -2170,12 +2211,13 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
                 rqt_root_cbf = ff_hevc_no_residual_syntax_flag_decode(s);
             }
             if (rqt_root_cbf) {
+                const static int cbf[2] = { 0 };
                 lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
                                          s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
                                          s->ps.sps->max_transform_hierarchy_depth_inter;
                 ret = hls_transform_tree(s, x0, y0, x0, y0, x0, y0,
                                          log2_cb_size,
-                                         log2_cb_size, 0, 0, 0, 0);
+                                         log2_cb_size, 0, 0, cbf, cbf);
                 if (ret < 0)
                     return ret;
             } else {
@@ -2186,7 +2228,7 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
     }
 
     if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0)
-        ff_hevc_set_qPy(s, x0, y0, x0, y0, log2_cb_size);
+        ff_hevc_set_qPy(s, x0, y0, log2_cb_size);
 
     x = y_cb * min_cb_width + x_cb;
     for (y = 0; y < length; y++) {
@@ -2194,7 +2236,12 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
         x += min_cb_width;
     }
 
-    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct.depth);
+    if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
+       ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
+        lc->qPy_pred = lc->qp_y;
+    }
+
+    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth);
 
     return 0;
 }
@@ -2202,11 +2249,12 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
 static int hls_coding_quadtree(HEVCContext *s, int x0, int y0,
                                int log2_cb_size, int cb_depth)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     const int cb_size    = 1 << log2_cb_size;
+    int ret;
     int split_cu;
 
-    lc->ct.depth = cb_depth;
+    lc->ct_depth = cb_depth;
     if (x0 + cb_size <= s->ps.sps->width  &&
         y0 + cb_size <= s->ps.sps->height &&
         log2_cb_size > s->ps.sps->log2_min_cb_size) {
@@ -2220,31 +2268,64 @@ static int hls_coding_quadtree(HEVCContext *s, int x0, int y0,
         lc->tu.cu_qp_delta          = 0;
     }
 
+    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
+        log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth) {
+        lc->tu.is_cu_chroma_qp_offset_coded = 0;
+    }
+
     if (split_cu) {
+        int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
         const int cb_size_split = cb_size >> 1;
         const int x1 = x0 + cb_size_split;
         const int y1 = y0 + cb_size_split;
 
-        log2_cb_size--;
-        cb_depth++;
+        int more_data = 0;
 
-#define SUBDIVIDE(x, y)                                                \
-do {                                                                   \
-    if (x < s->ps.sps->width && y < s->ps.sps->height) {                     \
-        int ret = hls_coding_quadtree(s, x, y, log2_cb_size, cb_depth);\
-        if (ret < 0)                                                   \
-            return ret;                                                \
-    }                                                                  \
-} while (0)
+        more_data = hls_coding_quadtree(s, x0, y0, log2_cb_size - 1, cb_depth + 1);
+        if (more_data < 0)
+            return more_data;
 
-        SUBDIVIDE(x0, y0);
-        SUBDIVIDE(x1, y0);
-        SUBDIVIDE(x0, y1);
-        SUBDIVIDE(x1, y1);
+        if (more_data && x1 < s->ps.sps->width) {
+            more_data = hls_coding_quadtree(s, x1, y0, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && y1 < s->ps.sps->height) {
+            more_data = hls_coding_quadtree(s, x0, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && x1 < s->ps.sps->width &&
+            y1 < s->ps.sps->height) {
+            more_data = hls_coding_quadtree(s, x1, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+
+        if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
+            ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
+            lc->qPy_pred = lc->qp_y;
+
+        if (more_data)
+            return ((x1 + cb_size_split) < s->ps.sps->width ||
+                    (y1 + cb_size_split) < s->ps.sps->height);
+        else
+            return 0;
     } else {
-        int ret = hls_coding_unit(s, x0, y0, log2_cb_size);
+        ret = hls_coding_unit(s, x0, y0, log2_cb_size);
         if (ret < 0)
             return ret;
+        if ((!((x0 + cb_size) %
+               (1 << (s->ps.sps->log2_ctb_size))) ||
+             (x0 + cb_size >= s->ps.sps->width)) &&
+            (!((y0 + cb_size) %
+               (1 << (s->ps.sps->log2_ctb_size))) ||
+             (y0 + cb_size >= s->ps.sps->height))) {
+            int end_of_slice_flag = ff_hevc_end_of_slice_flag_decode(s);
+            return !end_of_slice_flag;
+        } else {
+            return 1;
+        }
     }
 
     return 0;
@@ -2253,7 +2334,7 @@ do {                                                                   \
 static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
                                  int ctb_addr_ts)
 {
-    HEVCLocalContext *lc  = &s->HEVClc;
+    HEVCLocalContext *lc  = s->HEVClc;
     int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
     int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
     int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr;
@@ -2267,7 +2348,6 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
     } else if (s->ps.pps->tiles_enabled_flag) {
         if (ctb_addr_ts && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
             int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
-            lc->start_of_tiles_x = x_ctb;
             lc->end_of_tiles_x   = x_ctb + (s->ps.pps->column_width[idxX] << s->ps.sps->log2_ctb_size);
             lc->first_qp_group   = 1;
         }
@@ -2288,7 +2368,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
         if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
             lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
     } else {
-        if (!ctb_addr_in_slice)
+        if (ctb_addr_in_slice <= 0)
             lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
         if (ctb_addr_in_slice < s->ps.sps->ctb_width)
             lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
@@ -2300,8 +2380,9 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
 }
 
-static int hls_slice_data(HEVCContext *s)
+static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
 {
+    HEVCContext *s  = avctxt->priv_data;
     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
     int more_data   = 1;
     int x_ctb       = 0;
@@ -2309,6 +2390,19 @@ static int hls_slice_data(HEVCContext *s)
     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
     int ret;
 
+    if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->sh.dependent_slice_segment_flag) {
+        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
+        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
+            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
 
@@ -2316,7 +2410,11 @@ static int hls_slice_data(HEVCContext *s)
         y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
         hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
 
-        ff_hevc_cabac_init(s, ctb_addr_ts);
+        ret = ff_hevc_cabac_init(s, ctb_addr_ts);
+        if (ret < 0) {
+            s->tab_slice_address[ctb_addr_rs] = -1;
+            return ret;
+        }
 
         hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
 
@@ -2324,10 +2422,12 @@ static int hls_slice_data(HEVCContext *s)
         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
 
-        ret = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-        if (ret < 0)
-            return ret;
-        more_data = !ff_hevc_end_of_slice_flag_decode(s);
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+        if (more_data < 0) {
+            s->tab_slice_address[ctb_addr_rs] = -1;
+            return more_data;
+        }
+
 
         ctb_addr_ts++;
         ff_hevc_save_states(s, ctb_addr_ts);
@@ -2336,36 +2436,199 @@ static int hls_slice_data(HEVCContext *s)
 
     if (x_ctb + ctb_size >= s->ps.sps->width &&
         y_ctb + ctb_size >= s->ps.sps->height)
-        ff_hevc_hls_filter(s, x_ctb, y_ctb);
+        ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
 
     return ctb_addr_ts;
 }
 
-static void restore_tqb_pixels(HEVCContext *s)
+static int hls_slice_data(HEVCContext *s)
+{
+    int arg[2];
+    int ret[2];
+
+    arg[0] = 0;
+    arg[1] = 1;
+
+    s->avctx->execute(s->avctx, hls_decode_entry, arg, ret , 1, sizeof(int));
+    return ret[0];
+}
+static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int job, int self_id)
 {
-    int min_pu_size = 1 << s->ps.sps->log2_min_pu_size;
-    int x, y, c_idx;
-
-    for (c_idx = 0; c_idx < 3; c_idx++) {
-        ptrdiff_t stride = s->frame->linesize[c_idx];
-        int hshift       = s->ps.sps->hshift[c_idx];
-        int vshift       = s->ps.sps->vshift[c_idx];
-        for (y = 0; y < s->ps.sps->min_pu_height; y++) {
-            for (x = 0; x < s->ps.sps->min_pu_width; x++) {
-                if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
-                    int n;
-                    int len      = min_pu_size >> hshift;
-                    uint8_t *src = &s->frame->data[c_idx][((y << s->ps.sps->log2_min_pu_size) >> vshift) * stride + (((x << s->ps.sps->log2_min_pu_size) >> hshift) << s->ps.sps->pixel_shift)];
-                    uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->ps.sps->log2_min_pu_size) >> vshift) * stride + (((x << s->ps.sps->log2_min_pu_size) >> hshift) << s->ps.sps->pixel_shift)];
-                    for (n = 0; n < (min_pu_size >> vshift); n++) {
-                        memcpy(dst, src, len);
-                        src += stride;
-                        dst += stride;
-                    }
-                }
+    HEVCContext *s1  = avctxt->priv_data, *s;
+    HEVCLocalContext *lc;
+    int ctb_size    = 1<< s1->ps.sps->log2_ctb_size;
+    int more_data   = 1;
+    int *ctb_row_p    = input_ctb_row;
+    int ctb_row = ctb_row_p[job];
+    int ctb_addr_rs = s1->sh.slice_ctb_addr_rs + ctb_row * ((s1->ps.sps->width + ctb_size - 1) >> s1->ps.sps->log2_ctb_size);
+    int ctb_addr_ts = s1->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+    int thread = ctb_row % s1->threads_number;
+    int ret;
+
+    s = s1->sList[self_id];
+    lc = s->HEVClc;
+
+    if(ctb_row) {
+        ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+        if (ret < 0)
+            goto error;
+        ff_init_cabac_decoder(&lc->cc, s->data + s->sh.offset[(ctb_row)-1], s->sh.size[ctb_row - 1]);
+    }
+
+    while(more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+        int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
+        int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
+
+        hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
+
+        ff_thread_await_progress2(s->avctx, ctb_row, thread, SHIFT_CTB_WPP);
+
+        if (atomic_load(&s1->wpp_err)) {
+            ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP);
+            return 0;
+        }
+
+        ret = ff_hevc_cabac_init(s, ctb_addr_ts);
+        if (ret < 0)
+            goto error;
+        hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+        if (more_data < 0) {
+            ret = more_data;
+            goto error;
+        }
+
+        ctb_addr_ts++;
+
+        ff_hevc_save_states(s, ctb_addr_ts);
+        ff_thread_report_progress2(s->avctx, ctb_row, thread, 1);
+        ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+
+        if (!more_data && (x_ctb+ctb_size) < s->ps.sps->width && ctb_row != s->sh.num_entry_point_offsets) {
+            atomic_store(&s1->wpp_err, 1);
+            ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+            return 0;
+        }
+
+        if ((x_ctb+ctb_size) >= s->ps.sps->width && (y_ctb+ctb_size) >= s->ps.sps->height ) {
+            ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+            ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP);
+            return ctb_addr_ts;
+        }
+        ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+        x_ctb+=ctb_size;
+
+        if(x_ctb >= s->ps.sps->width) {
+            break;
+        }
+    }
+    ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+
+    return 0;
+error:
+    s->tab_slice_address[ctb_addr_rs] = -1;
+    atomic_store(&s1->wpp_err, 1);
+    ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+    return ret;
+}
+
+static int hls_slice_data_wpp(HEVCContext *s, const H2645NAL *nal)
+{
+    const uint8_t *data = nal->data;
+    int length          = nal->size;
+    HEVCLocalContext *lc = s->HEVClc;
+    int *ret = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int));
+    int *arg = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int));
+    int64_t offset;
+    int64_t startheader, cmpt = 0;
+    int i, j, res = 0;
+
+    if (!ret || !arg) {
+        av_free(ret);
+        av_free(arg);
+        return AVERROR(ENOMEM);
+    }
+
+    if (s->sh.slice_ctb_addr_rs + s->sh.num_entry_point_offsets * s->ps.sps->ctb_width >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
+        av_log(s->avctx, AV_LOG_ERROR, "WPP ctb addresses are wrong (%d %d %d %d)\n",
+            s->sh.slice_ctb_addr_rs, s->sh.num_entry_point_offsets,
+            s->ps.sps->ctb_width, s->ps.sps->ctb_height
+        );
+        res = AVERROR_INVALIDDATA;
+        goto error;
+    }
+
+    ff_alloc_entries(s->avctx, s->sh.num_entry_point_offsets + 1);
+
+    if (!s->sList[1]) {
+        for (i = 1; i < s->threads_number; i++) {
+            s->sList[i] = av_malloc(sizeof(HEVCContext));
+            memcpy(s->sList[i], s, sizeof(HEVCContext));
+            s->HEVClcList[i] = av_mallocz(sizeof(HEVCLocalContext));
+            s->sList[i]->HEVClc = s->HEVClcList[i];
+        }
+    }
+
+    offset = (lc->gb.index >> 3);
+
+    for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) {
+        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+            startheader--;
+            cmpt++;
+        }
+    }
+
+    for (i = 1; i < s->sh.num_entry_point_offsets; i++) {
+        offset += (s->sh.entry_point_offset[i - 1] - cmpt);
+        for (j = 0, cmpt = 0, startheader = offset
+             + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) {
+            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+                startheader--;
+                cmpt++;
             }
         }
+        s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt;
+        s->sh.offset[i - 1] = offset;
+
+    }
+    if (s->sh.num_entry_point_offsets != 0) {
+        offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt;
+        if (length < offset) {
+            av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
+            res = AVERROR_INVALIDDATA;
+            goto error;
+        }
+        s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset;
+        s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset;
+
+    }
+    s->data = data;
+
+    for (i = 1; i < s->threads_number; i++) {
+        s->sList[i]->HEVClc->first_qp_group = 1;
+        s->sList[i]->HEVClc->qp_y = s->sList[0]->HEVClc->qp_y;
+        memcpy(s->sList[i], s, sizeof(HEVCContext));
+        s->sList[i]->HEVClc = s->HEVClcList[i];
+    }
+
+    atomic_store(&s->wpp_err, 0);
+    ff_reset_entries(s->avctx);
+
+    for (i = 0; i <= s->sh.num_entry_point_offsets; i++) {
+        arg[i] = i;
+        ret[i] = 0;
     }
+
+    if (s->ps.pps->entropy_coding_sync_enabled_flag)
+        s->avctx->execute2(s->avctx, hls_decode_entry_wpp, arg, ret, s->sh.num_entry_point_offsets + 1);
+
+    for (i = 0; i <= s->sh.num_entry_point_offsets; i++)
+        res += ret[i];
+error:
+    av_free(ret);
+    av_free(arg);
+    return res;
 }
 
 static int set_side_data(HEVCContext *s)
@@ -2423,6 +2686,86 @@ static int set_side_data(HEVCContext *s)
                                s->sei.display_orientation.vflip);
     }
 
+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
+    // so the side data persists for the entire coded video sequence.
+    if (s->sei.mastering_display.present > 0 &&
+        IS_IRAP(s) && s->no_rasl_output_flag) {
+        s->sei.mastering_display.present--;
+    }
+    if (s->sei.mastering_display.present) {
+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
+        const int mapping[3] = {2, 0, 1};
+        const int chroma_den = 50000;
+        const int luma_den = 10000;
+        int i;
+        AVMasteringDisplayMetadata *metadata =
+            av_mastering_display_metadata_create_side_data(out);
+        if (!metadata)
+            return AVERROR(ENOMEM);
+
+        for (i = 0; i < 3; i++) {
+            const int j = mapping[i];
+            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
+            metadata->display_primaries[i][0].den = chroma_den;
+            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
+            metadata->display_primaries[i][1].den = chroma_den;
+        }
+        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
+        metadata->white_point[0].den = chroma_den;
+        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
+        metadata->white_point[1].den = chroma_den;
+
+        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
+        metadata->max_luminance.den = luma_den;
+        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
+        metadata->min_luminance.den = luma_den;
+        metadata->has_luminance = 1;
+        metadata->has_primaries = 1;
+
+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
+               av_q2d(metadata->display_primaries[0][0]),
+               av_q2d(metadata->display_primaries[0][1]),
+               av_q2d(metadata->display_primaries[1][0]),
+               av_q2d(metadata->display_primaries[1][1]),
+               av_q2d(metadata->display_primaries[2][0]),
+               av_q2d(metadata->display_primaries[2][1]),
+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "min_luminance=%f, max_luminance=%f\n",
+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
+    }
+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
+    // so the side data persists for the entire coded video sequence.
+    if (s->sei.content_light.present > 0 &&
+        IS_IRAP(s) && s->no_rasl_output_flag) {
+        s->sei.content_light.present--;
+    }
+    if (s->sei.content_light.present) {
+        AVContentLightMetadata *metadata =
+            av_content_light_metadata_create_side_data(out);
+        if (!metadata)
+            return AVERROR(ENOMEM);
+        metadata->MaxCLL  = s->sei.content_light.max_content_light_level;
+        metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
+
+        av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
+        av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
+               metadata->MaxCLL, metadata->MaxFALL);
+    }
+
+    if (s->sei.a53_caption.a53_caption) {
+        AVFrameSideData* sd = av_frame_new_side_data(out,
+                                                     AV_FRAME_DATA_A53_CC,
+                                                     s->sei.a53_caption.a53_caption_size);
+        if (sd)
+            memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
+        av_freep(&s->sei.a53_caption.a53_caption);
+        s->sei.a53_caption.a53_caption_size = 0;
+        s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
+    }
+
     if (s->sei.alternative_transfer.present &&
         av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
         s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
@@ -2434,23 +2777,26 @@ static int set_side_data(HEVCContext *s)
 
 static int hevc_frame_start(HEVCContext *s)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
+    int pic_size_in_ctb  = ((s->ps.sps->width  >> s->ps.sps->log2_min_cb_size) + 1) *
+                           ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
     int ret;
 
-    memset(s->horizontal_bs, 0, 2 * s->bs_width * (s->bs_height + 1));
-    memset(s->vertical_bs,   0, 2 * s->bs_width * (s->bs_height + 1));
+    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
     memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
-    memset(s->is_pcm,        0, s->ps.sps->min_pu_width * s->ps.sps->min_pu_height);
+    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
 
-    lc->start_of_tiles_x = 0;
     s->is_decoded        = 0;
     s->first_nal_type    = s->nal_unit_type;
 
+    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
+
     if (s->ps.pps->tiles_enabled_flag)
         lc->end_of_tiles_x = s->ps.pps->column_width[0] << s->ps.sps->log2_ctb_size;
 
-    ret = ff_hevc_set_new_ref(s, s->ps.sps->sao_enabled ? &s->sao_frame : &s->frame,
-                              s->poc);
+    ret = ff_hevc_set_new_ref(s, &s->frame, s->poc);
     if (ret < 0)
         goto fail;
 
@@ -2466,12 +2812,18 @@ static int hevc_frame_start(HEVCContext *s)
     if (ret < 0)
         goto fail;
 
+    s->frame->pict_type = 3 - s->sh.slice_type;
+
+    if (!IS_IRAP(s))
+        ff_hevc_bump_frame(s);
+
     av_frame_unref(s->output_frame);
     ret = ff_hevc_output_frame(s, s->output_frame, 0);
     if (ret < 0)
         goto fail;
 
-    ff_thread_finish_setup(s->avctx);
+    if (!s->avctx->hwaccel)
+        ff_thread_finish_setup(s->avctx);
 
     return 0;
 
@@ -2484,7 +2836,7 @@ fail:
 
 static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     GetBitContext *gb    = &lc->gb;
     int ctb_addr_ts, ret;
 
@@ -2494,25 +2846,56 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
 
     switch (s->nal_unit_type) {
     case HEVC_NAL_VPS:
+        if (s->avctx->hwaccel && s->avctx->hwaccel->decode_params) {
+            ret = s->avctx->hwaccel->decode_params(s->avctx,
+                                                   nal->type,
+                                                   nal->raw_data,
+                                                   nal->raw_size);
+            if (ret < 0)
+                goto fail;
+        }
         ret = ff_hevc_decode_nal_vps(gb, s->avctx, &s->ps);
         if (ret < 0)
             goto fail;
         break;
     case HEVC_NAL_SPS:
+        if (s->avctx->hwaccel && s->avctx->hwaccel->decode_params) {
+            ret = s->avctx->hwaccel->decode_params(s->avctx,
+                                                   nal->type,
+                                                   nal->raw_data,
+                                                   nal->raw_size);
+            if (ret < 0)
+                goto fail;
+        }
         ret = ff_hevc_decode_nal_sps(gb, s->avctx, &s->ps,
                                      s->apply_defdispwin);
         if (ret < 0)
             goto fail;
         break;
     case HEVC_NAL_PPS:
+        if (s->avctx->hwaccel && s->avctx->hwaccel->decode_params) {
+            ret = s->avctx->hwaccel->decode_params(s->avctx,
+                                                   nal->type,
+                                                   nal->raw_data,
+                                                   nal->raw_size);
+            if (ret < 0)
+                goto fail;
+        }
         ret = ff_hevc_decode_nal_pps(gb, s->avctx, &s->ps);
         if (ret < 0)
             goto fail;
         break;
     case HEVC_NAL_SEI_PREFIX:
     case HEVC_NAL_SEI_SUFFIX:
-        ret = ff_hevc_decode_nal_sei(gb, s->avctx, &s->sei,
-                                     s->nal_unit_type);
+        if (s->avctx->hwaccel && s->avctx->hwaccel->decode_params) {
+            ret = s->avctx->hwaccel->decode_params(s->avctx,
+                                                   nal->type,
+                                                   nal->raw_data,
+                                                   nal->raw_size);
+            if (ret < 0)
+                goto fail;
+        }
+        ret = ff_hevc_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
         if (ret < 0)
             goto fail;
         break;
@@ -2536,25 +2919,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
         if (ret < 0)
             return ret;
 
-        if (s->max_ra == INT_MAX) {
-            if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
-                s->max_ra = s->poc;
-            } else {
-                if (IS_IDR(s))
-                    s->max_ra = INT_MIN;
-            }
-        }
-
-        if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
-            s->poc <= s->max_ra) {
-            s->is_decoded = 0;
+        if (
+            (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
+            (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
+            (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) {
             break;
-        } else {
-            if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
-                s->max_ra = INT_MIN;
         }
 
         if (s->sh.first_slice_in_pic_flag) {
+            if (s->max_ra == INT_MAX) {
+                if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
+                    s->max_ra = s->poc;
+                } else {
+                    if (IS_IDR(s))
+                        s->max_ra = INT_MIN;
+                }
+            }
+
+            if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
+                s->poc <= s->max_ra) {
+                s->is_decoded = 0;
+                break;
+            } else {
+                if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
+                    s->max_ra = INT_MIN;
+            }
+
+            s->overlap ++;
             ret = hevc_frame_start(s);
             if (ret < 0)
                 return ret;
@@ -2591,13 +2982,12 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
             if (ret < 0)
                 goto fail;
         } else {
-            ctb_addr_ts = hls_slice_data(s);
+            if (s->threads_number > 1 && s->sh.num_entry_point_offsets > 0)
+                ctb_addr_ts = hls_slice_data_wpp(s, nal);
+            else
+                ctb_addr_ts = hls_slice_data(s);
             if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) {
                 s->is_decoded = 1;
-                if ((s->ps.pps->transquant_bypass_enable_flag ||
-                     (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) &&
-                    s->ps.sps->sao_enabled)
-                    restore_tqb_pixels(s);
             }
 
             if (ctb_addr_ts < 0) {
@@ -2629,14 +3019,17 @@ fail:
 static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
 {
     int i, ret = 0;
+    int eos_at_start = 1;
 
     s->ref = NULL;
+    s->last_eos = s->eos;
     s->eos = 0;
+    s->overlap = 0;
 
     /* split the input packet into NAL units, so we know the upper bound on the
      * number of slices in the frame */
     ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
-                                s->nal_length_size, s->avctx->codec_id);
+                                s->nal_length_size, s->avctx->codec_id, 1, 0);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Error splitting the input into NAL units.\n");
@@ -2645,13 +3038,29 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
 
     for (i = 0; i < s->pkt.nb_nals; i++) {
         if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
-            s->pkt.nals[i].type == HEVC_NAL_EOS_NUT)
-            s->eos = 1;
+            s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
+            if (eos_at_start) {
+                s->last_eos = 1;
+            } else {
+                s->eos = 1;
+            }
+        } else {
+            eos_at_start = 0;
+        }
     }
 
     /* decode the NAL units */
     for (i = 0; i < s->pkt.nb_nals; i++) {
-        ret = decode_nal_unit(s, &s->pkt.nals[i]);
+        H2645NAL *nal = &s->pkt.nals[i];
+
+        if (s->avctx->skip_frame >= AVDISCARD_ALL ||
+            (s->avctx->skip_frame >= AVDISCARD_NONREF
+            && ff_hevc_nal_is_nonref(nal->type)))
+            continue;
+
+        ret = decode_nal_unit(s, nal);
+        if (ret >= 0 && s->overlap > 2)
+            ret = AVERROR_INVALIDDATA;
         if (ret < 0) {
             av_log(s->avctx, AV_LOG_WARNING,
                    "Error parsing NAL unit #%d.\n", i);
@@ -2660,7 +3069,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
     }
 
 fail:
-    if (s->ref)
+    if (s->ref && s->threads_type == FF_THREAD_FRAME)
         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
 
     return ret;
@@ -2739,69 +3148,19 @@ static int verify_md5(HEVCContext *s, AVFrame *frame)
     return 0;
 }
 
-static int hevc_decode_extradata(HEVCContext *s, uint8_t *buf, int length)
+static int hevc_decode_extradata(HEVCContext *s, uint8_t *buf, int length, int first)
 {
-    AVCodecContext *avctx = s->avctx;
-    GetByteContext gb;
     int ret, i;
 
-    bytestream2_init(&gb, buf, length);
-
-    if (length > 3 && (buf[0] || buf[1] || buf[2] > 1)) {
-        /* It seems the extradata is encoded as hvcC format.
-         * Temporarily, we support configurationVersion==0 until 14496-15 3rd
-         * is finalized. When finalized, configurationVersion will be 1 and we
-         * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
-        int i, j, num_arrays, nal_len_size;
-
-        s->is_nalff = 1;
-
-        bytestream2_skip(&gb, 21);
-        nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
-        num_arrays   = bytestream2_get_byte(&gb);
-
-        /* nal units in the hvcC always have length coded with 2 bytes,
-         * so put a fake nal_length_size = 2 while parsing them */
-        s->nal_length_size = 2;
-
-        /* Decode nal units from hvcC. */
-        for (i = 0; i < num_arrays; i++) {
-            int type = bytestream2_get_byte(&gb) & 0x3f;
-            int cnt  = bytestream2_get_be16(&gb);
-
-            for (j = 0; j < cnt; j++) {
-                // +2 for the nal size field
-                int nalsize = bytestream2_peek_be16(&gb) + 2;
-                if (bytestream2_get_bytes_left(&gb) < nalsize) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Invalid NAL unit size in extradata.\n");
-                    return AVERROR_INVALIDDATA;
-                }
-
-                ret = decode_nal_units(s, gb.buffer, nalsize);
-                if (ret < 0) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Decoding nal unit %d %d from hvcC failed\n",
-                           type, i);
-                    return ret;
-                }
-                bytestream2_skip(&gb, nalsize);
-            }
-        }
-
-        /* Now store right nal length size, that will be used to parse
-         * all other nals */
-        s->nal_length_size = nal_len_size;
-    } else {
-        s->is_nalff = 0;
-        ret = decode_nal_units(s, buf, length);
-        if (ret < 0)
-            return ret;
-    }
+    ret = ff_hevc_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
+                                   &s->nal_length_size, s->avctx->err_recognition,
+                                   s->apply_defdispwin, s->avctx);
+    if (ret < 0)
+        return ret;
 
     /* export stream parameters from the first SPS */
     for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-        if (s->ps.sps_list[i]) {
+        if (first && s->ps.sps_list[i]) {
             const HEVCSPS *sps = (const HEVCSPS*)s->ps.sps_list[i]->data;
             export_stream_params(s->avctx, &s->ps, sps);
             break;
@@ -2831,7 +3190,7 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
     new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
                                             &new_extradata_size);
     if (new_extradata && new_extradata_size > 0) {
-        ret = hevc_decode_extradata(s, new_extradata, new_extradata_size);
+        ret = hevc_decode_extradata(s, new_extradata, new_extradata_size, 0);
         if (ret < 0)
             return ret;
     }
@@ -2842,9 +3201,12 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
         return ret;
 
     if (avctx->hwaccel) {
-        if (s->ref && avctx->hwaccel->end_frame(avctx) < 0)
+        if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
+            ff_hevc_unref_frame(s, s->ref, ~0);
+            return ret;
+        }
     } else {
         /* verify the SEI checksum */
         if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
@@ -2873,7 +3235,9 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
 
 static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
 {
-    int ret = ff_thread_ref_frame(&dst->tf, &src->tf);
+    int ret;
+
+    ret = ff_thread_ref_frame(&dst->tf, &src->tf);
     if (ret < 0)
         return ret;
 
@@ -2918,7 +3282,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
 
     av_freep(&s->md5_ctx);
 
-    av_frame_free(&s->tmp_frame);
+    av_freep(&s->cabac_state);
+
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
     av_frame_free(&s->output_frame);
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -2926,12 +3295,22 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
         av_frame_free(&s->DPB[i].frame);
     }
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
-        av_buffer_unref(&s->ps.vps_list[i]);
-    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
-        av_buffer_unref(&s->ps.sps_list[i]);
-    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
-        av_buffer_unref(&s->ps.pps_list[i]);
+    ff_hevc_ps_uninit(&s->ps);
+
+    av_freep(&s->sh.entry_point_offset);
+    av_freep(&s->sh.offset);
+    av_freep(&s->sh.size);
+
+    for (i = 1; i < s->threads_number; i++) {
+        HEVCLocalContext *lc = s->HEVClcList[i];
+        if (lc) {
+            av_freep(&s->HEVClcList[i]);
+            av_freep(&s->sList[i]);
+        }
+    }
+    if (s->HEVClc == s->HEVClcList[0])
+        s->HEVClc = NULL;
+    av_freep(&s->HEVClcList[0]);
 
     ff_h2645_packet_uninit(&s->pkt);
 
@@ -2945,8 +3324,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
-    s->tmp_frame = av_frame_alloc();
-    if (!s->tmp_frame)
+    s->HEVClc = av_mallocz(sizeof(HEVCLocalContext));
+    if (!s->HEVClc)
+        goto fail;
+    s->HEVClcList[0] = s->HEVClc;
+    s->sList[0] = s;
+
+    s->cabac_state = av_malloc(HEVC_CONTEXTS);
+    if (!s->cabac_state)
         goto fail;
 
     s->output_frame = av_frame_alloc();
@@ -2969,6 +3354,9 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
     ff_bswapdsp_init(&s->bdsp);
 
     s->context_initialized = 1;
+    s->eos = 0;
+
+    ff_hevc_reset_sei(&s->sei);
 
     return 0;
 
@@ -2977,6 +3365,7 @@ fail:
     return AVERROR(ENOMEM);
 }
 
+#if HAVE_THREADS
 static int hevc_update_thread_context(AVCodecContext *dst,
                                       const AVCodecContext *src)
 {
@@ -2999,6 +3388,8 @@ static int hevc_update_thread_context(AVCodecContext *dst,
         }
     }
 
+    if (s->ps.sps != s0->ps.sps)
+        s->ps.sps = NULL;
     for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
         av_buffer_unref(&s->ps.vps_list[i]);
         if (s0->ps.vps_list[i]) {
@@ -3027,23 +3418,36 @@ static int hevc_update_thread_context(AVCodecContext *dst,
     }
 
     if (s->ps.sps != s0->ps.sps)
-        ret = set_sps(s, s0->ps.sps, src->pix_fmt);
+        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
+            return ret;
 
     s->seq_decode = s0->seq_decode;
     s->seq_output = s0->seq_output;
     s->pocTid0    = s0->pocTid0;
     s->max_ra     = s0->max_ra;
+    s->eos        = s0->eos;
+    s->no_rasl_output_flag = s0->no_rasl_output_flag;
 
     s->is_nalff        = s0->is_nalff;
     s->nal_length_size = s0->nal_length_size;
 
+    s->threads_number      = s0->threads_number;
+    s->threads_type        = s0->threads_type;
+
     if (s0->eos) {
         s->seq_decode = (s->seq_decode + 1) & 0xff;
         s->max_ra = INT_MAX;
     }
 
+    s->sei.frame_packing        = s0->sei.frame_packing;
+    s->sei.display_orientation  = s0->sei.display_orientation;
+    s->sei.mastering_display    = s0->sei.mastering_display;
+    s->sei.content_light        = s0->sei.content_light;
+    s->sei.alternative_transfer = s0->sei.alternative_transfer;
+
     return 0;
 }
+#endif
 
 static av_cold int hevc_decode_init(AVCodecContext *avctx)
 {
@@ -3056,17 +3460,34 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
     if (ret < 0)
         return ret;
 
+    s->enable_parallel_tiles = 0;
+    s->sei.picture_timing.picture_struct = 0;
+    s->eos = 1;
+
+    atomic_init(&s->wpp_err, 0);
+
+    if(avctx->active_thread_type & FF_THREAD_SLICE)
+        s->threads_number = avctx->thread_count;
+    else
+        s->threads_number = 1;
+
     if (avctx->extradata_size > 0 && avctx->extradata) {
-        ret = hevc_decode_extradata(s, avctx->extradata, avctx->extradata_size);
+        ret = hevc_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
         if (ret < 0) {
             hevc_decode_free(avctx);
             return ret;
         }
     }
 
+    if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+            s->threads_type = FF_THREAD_FRAME;
+        else
+            s->threads_type = FF_THREAD_SLICE;
+
     return 0;
 }
 
+#if HAVE_THREADS
 static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
 {
     HEVCContext *s = avctx->priv_data;
@@ -3080,12 +3501,14 @@ static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static void hevc_decode_flush(AVCodecContext *avctx)
 {
     HEVCContext *s = avctx->priv_data;
     ff_hevc_flush_dpb(s);
     s->max_ra = INT_MAX;
+    s->eos = 1;
 }
 
 #define OFFSET(x) offsetof(HEVCContext, x)
@@ -3093,7 +3516,9 @@ static void hevc_decode_flush(AVCodecContext *avctx)
 
 static const AVOption options[] = {
     { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
-        AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, PAR },
+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
+    { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
     { NULL },
 };
 
@@ -3115,16 +3540,13 @@ AVCodec ff_hevc_decoder = {
     .close                 = hevc_decode_free,
     .decode                = hevc_decode_frame,
     .flush                 = hevc_decode_flush,
-    .update_thread_context = hevc_update_thread_context,
-    .init_thread_copy      = hevc_init_thread_copy,
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
+    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(hevc_init_thread_copy),
     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
-                             AV_CODEC_CAP_FRAME_THREADS,
+                             AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING,
     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
-    .caps_internal         = FF_CODEC_CAP_EXPORTS_CROPPING | FF_CODEC_CAP_INIT_THREADSAFE,
     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
-#if CONFIG_HEVC_CUVID_HWACCEL
-                               HWACCEL_CUVID(hevc),
-#endif
 #if CONFIG_HEVC_DXVA2_HWACCEL
                                HWACCEL_DXVA2(hevc),
 #endif
@@ -3134,12 +3556,18 @@ AVCodec ff_hevc_decoder = {
 #if CONFIG_HEVC_D3D11VA2_HWACCEL
                                HWACCEL_D3D11VA2(hevc),
 #endif
+#if CONFIG_HEVC_NVDEC_HWACCEL
+                               HWACCEL_NVDEC(hevc),
+#endif
 #if CONFIG_HEVC_VAAPI_HWACCEL
                                HWACCEL_VAAPI(hevc),
 #endif
 #if CONFIG_HEVC_VDPAU_HWACCEL
                                HWACCEL_VDPAU(hevc),
 #endif
+#if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+                               HWACCEL_VIDEOTOOLBOX(hevc),
+#endif
                                NULL
                            },
 };
diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h
index 7adb826..b45969b 100644
--- a/libavcodec/hevcdec.h
+++ b/libavcodec/hevcdec.h
@@ -3,28 +3,27 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_HEVCDEC_H
 #define AVCODEC_HEVCDEC_H
 
-#include <stddef.h>
-#include <stdint.h>
+#include <stdatomic.h>
 
 #include "libavutil/buffer.h"
 #include "libavutil/md5.h"
@@ -33,6 +32,7 @@
 #include "bswapdsp.h"
 #include "cabac.h"
 #include "get_bits.h"
+#include "hevcpred.h"
 #include "h2645_parse.h"
 #include "hevc.h"
 #include "hevc_ps.h"
@@ -42,15 +42,17 @@
 #include "thread.h"
 #include "videodsp.h"
 
+#define MAX_NB_THREADS 16
+#define SHIFT_CTB_WPP 2
+
 //TODO: check if this is really the maximum
 #define MAX_TRANSFORM_DEPTH 5
 
 #define MAX_TB_SIZE 32
-#define MAX_PB_SIZE 64
 #define MAX_QP 51
 #define DEFAULT_INTRA_TC_OFFSET 2
 
-#define HEVC_CONTEXTS 183
+#define HEVC_CONTEXTS 199
 
 #define MRG_MAX_NUM_CANDS     5
 
@@ -60,6 +62,9 @@
 #define EPEL_EXTRA_BEFORE 1
 #define EPEL_EXTRA_AFTER  2
 #define EPEL_EXTRA        3
+#define QPEL_EXTRA_BEFORE 3
+#define QPEL_EXTRA_AFTER  4
+#define QPEL_EXTRA        7
 
 #define EDGE_EMU_BUFFER_STRIDE 80
 
@@ -69,13 +74,10 @@
 #define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
 #define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
 
-#define IS_IDR(s) (s->nal_unit_type == HEVC_NAL_IDR_W_RADL || s->nal_unit_type == HEVC_NAL_IDR_N_LP)
-#define IS_BLA(s) (s->nal_unit_type == HEVC_NAL_BLA_W_RADL || s->nal_unit_type == HEVC_NAL_BLA_W_LP || \
-                   s->nal_unit_type == HEVC_NAL_BLA_N_LP)
-#define IS_IRAP(s) (s->nal_unit_type >= 16 && s->nal_unit_type <= 23)
-
-#define FFUDIV(a,b) (((a) > 0 ? (a) : (a) - (b) + 1) / (b))
-#define FFUMOD(a,b) ((a) - (b) * FFUDIV(a,b))
+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
+                   (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
 
 enum RPSType {
     ST_CURR_BEF = 0,
@@ -120,6 +122,8 @@ enum SyntaxElement {
     CBF_LUMA,
     CBF_CB_CR,
     TRANSFORM_SKIP_FLAG,
+    EXPLICIT_RDPCM_FLAG,
+    EXPLICIT_RDPCM_DIR_FLAG,
     LAST_SIGNIFICANT_COEFF_X_PREFIX,
     LAST_SIGNIFICANT_COEFF_Y_PREFIX,
     LAST_SIGNIFICANT_COEFF_X_SUFFIX,
@@ -130,6 +134,10 @@ enum SyntaxElement {
     COEFF_ABS_LEVEL_GREATER2_FLAG,
     COEFF_ABS_LEVEL_REMAINING,
     COEFF_SIGN_FLAG,
+    LOG2_RES_SCALE_ABS,
+    RES_SCALE_SIGN_FLAG,
+    CU_CHROMA_QP_OFFSET_FLAG,
+    CU_CHROMA_QP_OFFSET_IDX,
 };
 
 enum PartMode {
@@ -155,6 +163,13 @@ enum InterPredIdc {
     PRED_BI,
 };
 
+enum PredFlag {
+    PF_INTRA = 0,
+    PF_L0,
+    PF_L1,
+    PF_BI,
+};
+
 enum IntraPredMode {
     INTRA_PLANAR = 0,
     INTRA_DC,
@@ -197,6 +212,7 @@ enum SAOType {
     SAO_NOT_APPLIED = 0,
     SAO_BAND,
     SAO_EDGE,
+    SAO_APPLIED
 };
 
 enum SAOEOClass {
@@ -212,12 +228,6 @@ enum ScanType {
     SCAN_VERT,
 };
 
-typedef struct LongTermRPS {
-    int     poc[32];
-    uint8_t used[32];
-    uint8_t nb_refs;
-} LongTermRPS;
-
 typedef struct RefPicList {
     struct HEVCFrame *ref[HEVC_MAX_REFS];
     int list[HEVC_MAX_REFS];
@@ -229,82 +239,6 @@ typedef struct RefPicListTab {
     RefPicList refPicList[2];
 } RefPicListTab;
 
-typedef struct SliceHeader {
-    unsigned int pps_id;
-
-    ///< address (in raster order) of the first block in the current slice segment
-    unsigned int   slice_segment_addr;
-    ///< address (in raster order) of the first block in the current slice
-    unsigned int   slice_addr;
-
-    enum HEVCSliceType slice_type;
-
-    int pic_order_cnt_lsb;
-
-    uint8_t first_slice_in_pic_flag;
-    uint8_t dependent_slice_segment_flag;
-    uint8_t pic_output_flag;
-    uint8_t colour_plane_id;
-
-    ///< RPS coded in the slice header itself is stored here
-    int short_term_ref_pic_set_sps_flag;
-    int short_term_ref_pic_set_size;
-    ShortTermRPS slice_rps;
-    const ShortTermRPS *short_term_rps;
-    int long_term_ref_pic_set_size;
-    LongTermRPS long_term_rps;
-    unsigned int list_entry_lx[2][32];
-
-    uint8_t rpl_modification_flag[2];
-    uint8_t no_output_of_prior_pics_flag;
-    uint8_t slice_temporal_mvp_enabled_flag;
-
-    unsigned int nb_refs[2];
-
-    uint8_t slice_sample_adaptive_offset_flag[3];
-    uint8_t mvd_l1_zero_flag;
-
-    uint8_t cabac_init_flag;
-    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
-    uint8_t slice_loop_filter_across_slices_enabled_flag;
-    uint8_t collocated_list;
-
-    unsigned int collocated_ref_idx;
-
-    int slice_qp_delta;
-    int slice_cb_qp_offset;
-    int slice_cr_qp_offset;
-
-    int beta_offset;    ///< beta_offset_div2 * 2
-    int tc_offset;      ///< tc_offset_div2 * 2
-
-    unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
-
-    int num_entry_point_offsets;
-
-    int8_t slice_qp;
-
-    uint8_t luma_log2_weight_denom;
-    int16_t chroma_log2_weight_denom;
-
-    int16_t luma_weight_l0[16];
-    int16_t chroma_weight_l0[16][2];
-    int16_t chroma_weight_l1[16][2];
-    int16_t luma_weight_l1[16];
-
-    int16_t luma_offset_l0[16];
-    int16_t chroma_offset_l0[16][2];
-
-    int16_t luma_offset_l1[16];
-    int16_t chroma_offset_l1[16][2];
-
-    int slice_ctb_addr_rs;
-} SliceHeader;
-
-typedef struct CodingTree {
-    int depth; ///< ctDepth
-} CodingTree;
-
 typedef struct CodingUnit {
     int x;
     int y;
@@ -326,8 +260,7 @@ typedef struct Mv {
 typedef struct MvField {
     DECLARE_ALIGNED(4, Mv, mv)[2];
     int8_t ref_idx[2];
-    int8_t pred_flag[2];
-    uint8_t is_intra;
+    int8_t pred_flag;
 } MvField;
 
 typedef struct NeighbourAvailable {
@@ -345,15 +278,24 @@ typedef struct PredictionUnit {
     uint8_t intra_pred_mode[4];
     Mv mvd;
     uint8_t merge_flag;
-    uint8_t intra_pred_mode_c;
+    uint8_t intra_pred_mode_c[4];
+    uint8_t chroma_mode_c[4];
 } PredictionUnit;
 
 typedef struct TransformUnit {
     int cu_qp_delta;
 
+    int res_scale_val;
+
     // Inferred parameters;
-    int cur_intra_pred_mode;
+    int intra_pred_mode;
+    int intra_pred_mode_c;
+    int chroma_mode_c;
     uint8_t is_cu_qp_delta_coded;
+    uint8_t is_cu_chroma_qp_offset_coded;
+    int8_t  cu_qp_offset_cb;
+    int8_t  cu_qp_offset_cr;
+    uint8_t cross_pf;
 } TransformUnit;
 
 typedef struct DBParams {
@@ -364,6 +306,7 @@ typedef struct DBParams {
 #define HEVC_FRAME_FLAG_OUTPUT    (1 << 0)
 #define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
 #define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
+#define HEVC_FRAME_FLAG_BUMPING   (1 << 3)
 
 typedef struct HEVCFrame {
     AVFrame *frame;
@@ -394,24 +337,11 @@ typedef struct HEVCFrame {
     uint8_t flags;
 } HEVCFrame;
 
-struct HEVCContext;
-
-typedef struct HEVCPredContext {
-    void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
-
-    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
-                           const uint8_t *left, ptrdiff_t stride);
-    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-                    ptrdiff_t stride, int log2_size, int c_idx);
-    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
-                            const uint8_t *left, ptrdiff_t stride,
-                            int c_idx, int mode);
-} HEVCPredContext;
-
 typedef struct HEVCLocalContext {
-    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 24) * MAX_PB_SIZE]);
     uint8_t cabac_state[HEVC_CONTEXTS];
 
+    uint8_t stat_coeff[4];
+
     uint8_t first_qp_group;
 
     GetBitContext gb;
@@ -420,18 +350,23 @@ typedef struct HEVCLocalContext {
     int8_t qp_y;
     int8_t curr_qp_y;
 
+    int qPy_pred;
+
     TransformUnit tu;
 
     uint8_t ctb_left_flag;
     uint8_t ctb_up_flag;
     uint8_t ctb_up_right_flag;
     uint8_t ctb_up_left_flag;
-    int     start_of_tiles_x;
     int     end_of_tiles_x;
     int     end_of_tiles_y;
     /* +7 is for subpixel interpolation, *2 for high bit depths */
     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-    CodingTree ct;
+    /* The extended size between the new edge emu buffer is abused by SAO */
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
+    DECLARE_ALIGNED(32, int16_t, tmp)[MAX_PB_SIZE * MAX_PB_SIZE];
+
+    int ct_depth;
     CodingUnit cu;
     PredictionUnit pu;
     NeighbourAvailable na;
@@ -449,17 +384,26 @@ typedef struct HEVCContext {
     const AVClass *c;  // needed by private avoptions
     AVCodecContext *avctx;
 
-    HEVCLocalContext HEVClc;
+    struct HEVCContext  *sList[MAX_NB_THREADS];
 
-    uint8_t cabac_state[HEVC_CONTEXTS];
+    HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+    HEVCLocalContext    *HEVClc;
+
+    uint8_t             threads_type;
+    uint8_t             threads_number;
+
+    int                 width;
+    int                 height;
+
+    uint8_t *cabac_state;
 
     /** 1 if the independent slice segment header was successfully parsed */
     uint8_t slice_initialized;
 
     AVFrame *frame;
-    AVFrame *sao_frame;
-    AVFrame *tmp_frame;
     AVFrame *output_frame;
+    uint8_t *sao_pixel_buffer_h[3];
+    uint8_t *sao_pixel_buffer_v[3];
 
     HEVCParamSets ps;
     HEVCSEI sei;
@@ -482,11 +426,14 @@ typedef struct HEVCContext {
     int pocTid0;
     int slice_idx; ///< number of the slice being currently decoded
     int eos;       ///< current packet contains an EOS/EOB NAL
+    int last_eos;  ///< last packet contains an EOS/EOB NAL
     int max_ra;
     int bs_width;
     int bs_height;
+    int overlap;
 
     int is_decoded;
+    int no_rasl_output_flag;
 
     HEVCPredContext hpc;
     HEVCDSPContext hevcdsp;
@@ -521,12 +468,17 @@ typedef struct HEVCContext {
     uint16_t seq_decode;
     uint16_t seq_output;
 
+    int enable_parallel_tiles;
+    atomic_int wpp_err;
+
+    const uint8_t *data;
+
     H2645Packet pkt;
     // type of the first VCL NAL of the current frame
     enum HEVCNALUnitType first_nal_type;
 
     uint8_t context_initialized;
-    uint8_t is_nalff;       ///< this flag is != 0 if bitstream is encapsulated
+    int is_nalff;           ///< this flag is != 0 if bitstream is encapsulated
                             ///< as a format defined in 14496-15
     int apply_defdispwin;
 
@@ -544,11 +496,6 @@ void ff_hevc_clear_refs(HEVCContext *s);
  */
 void ff_hevc_flush_dpb(HEVCContext *s);
 
-/**
- * Compute POC of the current frame and return it.
- */
-int ff_hevc_compute_poc(HEVCContext *s, int poc_lsb);
-
 RefPicList *ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *frame,
                                  int x0, int y0);
 
@@ -563,7 +510,7 @@ int ff_hevc_frame_rps(HEVCContext *s);
 int ff_hevc_slice_rpl(HEVCContext *s);
 
 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts);
-void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts);
+int ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts);
 int ff_hevc_sao_merge_flag_decode(HEVCContext *s);
 int ff_hevc_sao_type_idx_decode(HEVCContext *s);
 int ff_hevc_sao_band_position_decode(HEVCContext *s);
@@ -589,46 +536,45 @@ int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH);
 int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx);
 int ff_hevc_mvp_lx_flag_decode(HEVCContext *s);
 int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s);
-int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s);
-int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s);
-int ff_hevc_mvd_decode(HEVCContext *s);
-int ff_hevc_mvd_sign_flag_decode(HEVCContext *s);
 int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size);
 int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth);
 int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth);
-int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx);
-int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size);
-int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size);
-int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
-                                                 int last_significant_coeff_prefix);
-int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx,
-                                                int ctx_cg);
-int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c,
-                                          int y_c, int log2_trafo_size,
-                                          int scan_idx, int prev_sig);
-int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx,
-                                                 int ctx_set);
-int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx,
-                                                 int inc);
-int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int base_level,
-                                      int rc_rice_param);
-int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb);
+int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx);
+int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx);
 
 /**
  * Get the number of candidate references for the current frame.
  */
-int ff_hevc_frame_nb_refs(HEVCContext *s);
+int ff_hevc_frame_nb_refs(const HEVCContext *s);
 
 int ff_hevc_set_new_ref(HEVCContext *s, AVFrame **frame, int poc);
 
+static av_always_inline int ff_hevc_nal_is_nonref(enum HEVCNALUnitType type)
+{
+    switch (type) {
+    case HEVC_NAL_TRAIL_N:
+    case HEVC_NAL_TSA_N:
+    case HEVC_NAL_STSA_N:
+    case HEVC_NAL_RADL_N:
+    case HEVC_NAL_RASL_N:
+    case HEVC_NAL_VCL_N10:
+    case HEVC_NAL_VCL_N12:
+    case HEVC_NAL_VCL_N14:
+        return 1;
+        break;
+    default: break;
+    }
+    return 0;
+}
+
 /**
  * Find next frame in output order and put a reference to it in frame.
  * @return 1 if a frame was output, 0 otherwise
  */
 int ff_hevc_output_frame(HEVCContext *s, AVFrame *frame, int flush);
 
+void ff_hevc_bump_frame(HEVCContext *s);
+
 void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags);
 
 void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
@@ -640,16 +586,21 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0,
                               int nPbW, int nPbH, int log2_cb_size,
                               int part_idx, int merge_idx,
                               MvField *mv, int mvp_lx_flag, int LX);
-void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC, int xBase, int yBase,
+void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase,
                      int log2_cb_size);
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                            int log2_trafo_size);
 int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s);
 int ff_hevc_cu_qp_delta_abs(HEVCContext *s);
-void ff_hevc_hls_filter(HEVCContext *s, int x, int y);
+int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s);
+int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s);
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size);
 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size);
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                 int log2_trafo_size, enum ScanType scan_idx,
+                                 int c_idx);
 
-void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
 
 extern const uint8_t ff_hevc_qpel_extra_before[4];
 extern const uint8_t ff_hevc_qpel_extra_after[4];
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 558e938..957e40d 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -2,21 +2,23 @@
  * HEVC video decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -89,36 +91,20 @@ static const int8_t transform[32][32] = {
       90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
 };
 
-DECLARE_ALIGNED(16, const int16_t, ff_hevc_epel_coeffs)[7][16] = {
-    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
-    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
-    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
-    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
-    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
-    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
-    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
-};
-
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_coeffs8)[7][16] = {
-    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
-    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
-    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
-    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
-    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
-    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
-    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
-};
-
-DECLARE_ALIGNED(16, const int16_t, ff_hevc_qpel_coeffs)[3][8] = {
-    { -1, 4, -10, 58, 17, -5,  1,  0 },
-    { -1, 4, -11, 40, 40, -11, 4, -1 },
-    {  0, 1,  -5, 17, 58, -10, 4, -1 },
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters)[7][4] = {
+    { -2, 58, 10, -2},
+    { -4, 54, 16, -2},
+    { -6, 46, 28, -4},
+    { -4, 36, 36, -4},
+    { -4, 28, 46, -6},
+    { -2, 16, 54, -4},
+    { -2, 10, 58, -2},
 };
 
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8)[3][16] = {
-    { -1, 4, -10, 58, 17, -5,  1,  0, -1, 4, -10, 58, 17, -5,  1,  0 },
-    { -1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1 },
-    {  0, 1,  -5, 17, 58, -10, 4, -1,  0, 1,  -5, 17, 58, -10, 4, -1 },
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters)[3][16] = {
+    { -1,  4,-10, 58, 17, -5,  1,  0, -1,  4,-10, 58, 17, -5,  1,  0},
+    { -1,  4,-11, 40, 40,-11,  4, -1, -1,  4,-11, 40, 40,-11,  4, -1},
+    {  0,  1, -5, 17, 58,-10,  4, -1,  0,  1, -5, 17, 58,-10,  4, -1}
 };
 
 #define BIT_DEPTH 8
@@ -133,34 +119,79 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8)[3][16] = {
 #include "hevcdsp_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "hevcdsp_template.c"
+#undef BIT_DEPTH
+
 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
 {
 #undef FUNC
 #define FUNC(a, depth) a ## _ ## depth
 
-#define QPEL_FUNC(i, width, depth)                                                  \
-    hevcdsp->put_hevc_qpel[0][0][i] = FUNC(put_hevc_qpel_pixels_ ## width, depth);  \
-    hevcdsp->put_hevc_qpel[0][1][i] = FUNC(put_hevc_qpel_h_      ## width, depth);  \
-    hevcdsp->put_hevc_qpel[1][0][i] = FUNC(put_hevc_qpel_v_      ## width, depth);  \
-    hevcdsp->put_hevc_qpel[1][1][i] = FUNC(put_hevc_qpel_hv_     ## width, depth);  \
-
-#define EPEL_FUNC(i, width, depth)                                                  \
-    hevcdsp->put_hevc_epel[0][0][i] = FUNC(put_hevc_epel_pixels_ ## width, depth);  \
-    hevcdsp->put_hevc_epel[0][1][i] = FUNC(put_hevc_epel_h_      ## width, depth);  \
-    hevcdsp->put_hevc_epel[1][0][i] = FUNC(put_hevc_epel_v_      ## width, depth);  \
-    hevcdsp->put_hevc_epel[1][1][i] = FUNC(put_hevc_epel_hv_     ## width, depth);  \
-
-#define PRED_FUNC(i, width, depth)                                                        \
-    hevcdsp->put_unweighted_pred[i]     = FUNC(put_unweighted_pred_ ## width, depth);     \
-    hevcdsp->put_unweighted_pred_avg[i] = FUNC(put_unweighted_pred_avg_ ## width, depth); \
-    hevcdsp->weighted_pred[i]           = FUNC(put_weighted_pred_ ## width, depth);       \
-    hevcdsp->weighted_pred_avg[i]       = FUNC(put_weighted_pred_avg_ ## width, depth);   \
-
-#define PRED_FUNC_CHROMA(i, width, depth)                                                        \
-    hevcdsp->put_unweighted_pred_chroma[i]     = FUNC(put_unweighted_pred_ ## width, depth);     \
-    hevcdsp->put_unweighted_pred_avg_chroma[i] = FUNC(put_unweighted_pred_avg_ ## width, depth); \
-    hevcdsp->weighted_pred_chroma[i]           = FUNC(put_weighted_pred_ ## width, depth);       \
-    hevcdsp->weighted_pred_avg_chroma[i]       = FUNC(put_weighted_pred_avg_ ## width, depth);   \
+#undef PEL_FUNC
+#define PEL_FUNC(dst1, idx1, idx2, a, depth)                                   \
+    for(i = 0 ; i < 10 ; i++)                                                  \
+{                                                                              \
+    hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth;                            \
+}
+
+#undef EPEL_FUNCS
+#define EPEL_FUNCS(depth)                                                     \
+    PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth);                \
+    PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth);                    \
+    PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth);                    \
+    PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
+
+#undef EPEL_UNI_FUNCS
+#define EPEL_UNI_FUNCS(depth)                                                 \
+    PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
+    PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth);            \
+    PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth);            \
+    PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth);           \
+    PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth);        \
+    PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth);        \
+    PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
+
+#undef EPEL_BI_FUNCS
+#define EPEL_BI_FUNCS(depth)                                                \
+    PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);        \
+    PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth);            \
+    PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth);            \
+    PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth);           \
+    PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth);        \
+    PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth);        \
+    PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
+
+#undef QPEL_FUNCS
+#define QPEL_FUNCS(depth)                                                     \
+    PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth);                \
+    PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth);                    \
+    PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth);                    \
+    PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
+
+#undef QPEL_UNI_FUNCS
+#define QPEL_UNI_FUNCS(depth)                                                 \
+    PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth);            \
+    PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth);            \
+    PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth);           \
+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
+
+#undef QPEL_BI_FUNCS
+#define QPEL_BI_FUNCS(depth)                                                  \
+    PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth);              \
+    PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth);              \
+    PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth);             \
+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);      \
+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
 
 #define HEVC_DSP(depth)                                                     \
     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
@@ -169,6 +200,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     hevcdsp->add_residual[2]        = FUNC(add_residual16x16, depth);       \
     hevcdsp->add_residual[3]        = FUNC(add_residual32x32, depth);       \
     hevcdsp->dequant                = FUNC(dequant, depth);                 \
+    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
     hevcdsp->transform_4x4_luma     = FUNC(transform_4x4_luma, depth);      \
     hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
     hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
@@ -179,51 +211,27 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
     hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
     hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
-    hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
-    hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
-    hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \
-    hevcdsp->sao_band_filter[3] = FUNC(sao_band_filter_3, depth);           \
-                                                                            \
-    hevcdsp->sao_edge_filter[0] = FUNC(sao_edge_filter_0, depth);           \
-    hevcdsp->sao_edge_filter[1] = FUNC(sao_edge_filter_1, depth);           \
-    hevcdsp->sao_edge_filter[2] = FUNC(sao_edge_filter_2, depth);           \
-    hevcdsp->sao_edge_filter[3] = FUNC(sao_edge_filter_3, depth);           \
-                                                                            \
-    QPEL_FUNC(0, 4,  depth);                                                \
-    QPEL_FUNC(1, 8,  depth);                                                \
-    QPEL_FUNC(2, 12, depth);                                                \
-    QPEL_FUNC(3, 16, depth);                                                \
-    QPEL_FUNC(4, 24, depth);                                                \
-    QPEL_FUNC(5, 32, depth);                                                \
-    QPEL_FUNC(6, 48, depth);                                                \
-    QPEL_FUNC(7, 64, depth);                                                \
-                                                                            \
-    EPEL_FUNC(0, 2,  depth);                                                \
-    EPEL_FUNC(1, 4,  depth);                                                \
-    EPEL_FUNC(2, 6, depth);                                                 \
-    EPEL_FUNC(3, 8, depth);                                                 \
-    EPEL_FUNC(4, 12, depth);                                                \
-    EPEL_FUNC(5, 16, depth);                                                \
-    EPEL_FUNC(6, 24, depth);                                                \
-    EPEL_FUNC(7, 32, depth);                                                \
-                                                                            \
-    PRED_FUNC(0, 4,  depth);                                                \
-    PRED_FUNC(1, 8,  depth);                                                \
-    PRED_FUNC(2, 12, depth);                                                \
-    PRED_FUNC(3, 16, depth);                                                \
-    PRED_FUNC(4, 24, depth);                                                \
-    PRED_FUNC(5, 32, depth);                                                \
-    PRED_FUNC(6, 48, depth);                                                \
-    PRED_FUNC(7, 64, depth);                                                \
-    PRED_FUNC_CHROMA(0, 2,  depth);                                         \
-    PRED_FUNC_CHROMA(1, 4,  depth);                                         \
-    PRED_FUNC_CHROMA(2, 6, depth);                                          \
-    PRED_FUNC_CHROMA(3, 8, depth);                                          \
-    PRED_FUNC_CHROMA(4, 12, depth);                                         \
-    PRED_FUNC_CHROMA(5, 16, depth);                                         \
-    PRED_FUNC_CHROMA(6, 24, depth);                                         \
-    PRED_FUNC_CHROMA(7, 32, depth);                                         \
                                                                             \
+    hevcdsp->sao_band_filter[0] =                                              \
+    hevcdsp->sao_band_filter[1] =                                              \
+    hevcdsp->sao_band_filter[2] =                                              \
+    hevcdsp->sao_band_filter[3] =                                              \
+    hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth);                \
+    hevcdsp->sao_edge_filter[0] =                                              \
+    hevcdsp->sao_edge_filter[1] =                                              \
+    hevcdsp->sao_edge_filter[2] =                                              \
+    hevcdsp->sao_edge_filter[3] =                                              \
+    hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth);                \
+    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
+    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
+                                                                               \
+    QPEL_FUNCS(depth);                                                         \
+    QPEL_UNI_FUNCS(depth);                                                     \
+    QPEL_BI_FUNCS(depth);                                                      \
+    EPEL_FUNCS(depth);                                                         \
+    EPEL_UNI_FUNCS(depth);                                                     \
+    EPEL_BI_FUNCS(depth);                                                      \
+                                                                               \
     hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
     hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
     hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
@@ -231,7 +239,8 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth);   \
     hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth);   \
     hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
-    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth);
+    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
+int i = 0;
 
     switch (bit_depth) {
     case 9:
@@ -240,6 +249,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     case 10:
         HEVC_DSP(10);
         break;
+    case 12:
+        HEVC_DSP(12);
+        break;
     default:
         HEVC_DSP(8);
         break;
@@ -251,4 +263,6 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
         ff_hevc_dsp_init_ppc(hevcdsp, bit_depth);
     if (ARCH_X86)
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+    if (ARCH_MIPS)
+        ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
 }
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 7fc6f9c..0ae67cb 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -2,21 +2,23 @@
  * HEVC video decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,103 +27,109 @@
 
 #include "get_bits.h"
 
+#define MAX_PB_SIZE 64
+
 typedef struct SAOParams {
     int offset_abs[3][4];   ///< sao_offset_abs
     int offset_sign[3][4];  ///< sao_offset_sign
 
-    int band_position[3];   ///< sao_band_position
+    uint8_t band_position[3];   ///< sao_band_position
 
     int eo_class[3];        ///< sao_eo_class
 
-    int offset_val[3][5];   ///<SaoOffsetVal
+    int16_t offset_val[3][5];   ///<SaoOffsetVal
 
     uint8_t type_idx[3];    ///< sao_type_idx
 } SAOParams;
 
 typedef struct HEVCDSPContext {
-    void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
-                    GetBitContext *gb, int pcm_bit_depth);
+    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+                    struct GetBitContext *gb, int pcm_bit_depth);
 
     void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
-    void (*dequant)(int16_t *coeffs);
+    void (*dequant)(int16_t *coeffs, int16_t log2_size);
+
+    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
+
     void (*transform_4x4_luma)(int16_t *coeffs);
+
     void (*idct[4])(int16_t *coeffs, int col_limit);
+
     void (*idct_dc[4])(int16_t *coeffs);
 
-    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                               struct SAOParams *sao, int *borders,
-                               int width, int height, int c_idx);
-    void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                               struct SAOParams *sao, int *borders, int width,
-                               int height, int c_idx, uint8_t vert_edge,
-                               uint8_t horiz_edge, uint8_t diag_edge);
-
-    void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height,
-                                   int mx, int my, int16_t *mcbuffer);
-    void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height,
-                                   int mx, int my, int16_t *mcbuffer);
-
-    void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                   ptrdiff_t srcstride, int height);
-    void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                          ptrdiff_t srcstride, int height);
-    void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride,
-                                       int16_t *src1, int16_t *src2,
-                                       ptrdiff_t srcstride, int height);
-    void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride,
-                                              int16_t *src1, int16_t *src2,
-                                              ptrdiff_t srcstride, int height);
-    void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                             uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                             ptrdiff_t srcstride, int height);
-    void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                                    uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                    ptrdiff_t srcstride, int height);
-    void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
-                                 int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
-                                 ptrdiff_t dststride, int16_t *src1, int16_t *src2,
-                                 ptrdiff_t srcstride, int height);
-    void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
-                                        int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
-                                        ptrdiff_t dststride, int16_t *src1, int16_t *src2,
-                                        ptrdiff_t srcstride, int height);
+    void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+    void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+
+    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+
+    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                        int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+
+    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, int denom, int wx0, int wx1,
+                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, intptr_t mx, intptr_t my, int width);
+
+    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, int denom, int wx0, int ox0, int wx1,
+                                         int ox1, intptr_t mx, intptr_t my, int width);
 
     void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int beta, int *tc,
+                                    int beta, int32_t *tc,
                                     uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int beta, int *tc,
+                                    int beta, int32_t *tc,
                                     uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                      int *tc, uint8_t *no_p, uint8_t *no_q);
+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                      int *tc, uint8_t *no_p, uint8_t *no_q);
+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int beta, int *tc,
+                                      int beta, int32_t *tc,
                                       uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int beta, int *tc,
+                                      int beta, int32_t *tc,
                                       uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                        int *tc, uint8_t *no_p,
+                                        int32_t *tc, uint8_t *no_p,
                                         uint8_t *no_q);
     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                        int *tc, uint8_t *no_p,
+                                        int32_t *tc, uint8_t *no_p,
                                         uint8_t *no_q);
 } HEVCDSPContext;
 
 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 
+extern const int8_t ff_hevc_epel_filters[7][4];
+extern const int8_t ff_hevc_qpel_filters[3][16];
+
 void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
-
-extern const int16_t ff_hevc_epel_coeffs[7][16];
-extern const int8_t ff_hevc_epel_coeffs8[7][16];
-extern const int16_t ff_hevc_qpel_coeffs[3][8];
-extern const int8_t ff_hevc_qpel_coeffs8[3][16];
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
 
 #endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 97ce34a..56cd9e6 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,9 @@
 #include "hevcdec.h"
 
 #include "bit_depth_template.c"
+#include "hevcdsp.h"
 
-static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
                           GetBitContext *gb, int pcm_bit_depth)
 {
     int x, y;
@@ -33,8 +34,8 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
 
     stride /= sizeof(pixel);
 
-    for (y = 0; y < size; y++) {
-        for (x = 0; x < size; x++)
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
         dst += stride;
     }
@@ -81,19 +82,49 @@ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
     FUNC(add_residual)(_dst, res, stride, 32);
 }
 
-static void FUNC(dequant)(int16_t *coeffs)
+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
 {
-    int shift  = 13 - BIT_DEPTH;
-#if BIT_DEPTH <= 13
-    int offset = 1 << (shift - 1);
-#else
-    int offset = 0;
-#endif
+    int16_t *coeffs = (int16_t *) _coeffs;
     int x, y;
+    int size = 1 << log2_size;
+
+    if (mode) {
+        coeffs += size;
+        for (y = 0; y < size - 1; y++) {
+            for (x = 0; x < size; x++)
+                coeffs[x] += coeffs[x - size];
+            coeffs += size;
+        }
+    } else {
+        for (y = 0; y < size; y++) {
+            for (x = 1; x < size; x++)
+                coeffs[x] += coeffs[x - 1];
+            coeffs += size;
+        }
+    }
+}
 
-    for (y = 0; y < 4 * 4; y += 4) {
-        for (x = 0; x < 4; x++)
-            coeffs[y + x] = (coeffs[y + x] + offset) >> shift;
+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
+{
+    int shift  = 15 - BIT_DEPTH - log2_size;
+    int x, y;
+    int size = 1 << log2_size;
+
+    if (shift > 0) {
+        int offset = 1 << (shift - 1);
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = (*coeffs + offset) >> shift;
+                coeffs++;
+            }
+        }
+    } else {
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = *(uint16_t*)coeffs << -shift;
+                coeffs++;
+            }
+        }
     }
 }
 
@@ -137,21 +168,17 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 
 #undef TR_4x4_LUMA
 
-#define TR_4(dst, src, dstep, sstep, assign, end)                       \
-    do {                                                                \
-        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
-                       transform[8 * 2][0] * src[2 * sstep];            \
-        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
-                       transform[8 * 2][1] * src[2 * sstep];            \
-        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
-                       transform[8 * 3][0] * src[3 * sstep];            \
-        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
-                       transform[8 * 3][1] * src[3 * sstep];            \
-                                                                        \
-        assign(dst[0 * dstep], e0 + o0);                                \
-        assign(dst[1 * dstep], e1 + o1);                                \
-        assign(dst[2 * dstep], e1 - o1);                                \
-        assign(dst[3 * dstep], e0 - o0);                                \
+#define TR_4(dst, src, dstep, sstep, assign, end)                 \
+    do {                                                          \
+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
+                                                                  \
+        assign(dst[0 * dstep], e0 + o0);                          \
+        assign(dst[1 * dstep], e1 + o1);                          \
+        assign(dst[2 * dstep], e1 - o1);                          \
+        assign(dst[3 * dstep], e0 - o0);                          \
     } while (0)
 
 #define TR_8(dst, src, dstep, sstep, assign, end)                 \
@@ -254,10 +281,12 @@ IDCT( 4)
 IDCT( 8)
 IDCT(16)
 IDCT(32)
+
 IDCT_DC( 4)
 IDCT_DC( 8)
 IDCT_DC(16)
 IDCT_DC(32)
+
 #undef TR_4
 #undef TR_8
 #undef TR_16
@@ -265,150 +294,93 @@ IDCT_DC(32)
 
 #undef SET
 #undef SCALE
-#undef ADD_AND_SCALE
 
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
-                                  ptrdiff_t stride, SAOParams *sao,
-                                  int *borders, int width, int height,
-                                  int c_idx, int class)
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height)
 {
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
     int offset_table[32] = { 0 };
     int k, y, x;
-    int chroma = !!c_idx;
     int shift  = BIT_DEPTH - 5;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_left_class  = sao->band_position[c_idx];
-    int init_y = 0, init_x = 0;
-
-    stride /= sizeof(pixel);
 
-    switch (class) {
-    case 0:
-        if (!borders[2])
-            width -= (8 >> chroma) + 2;
-        if (!borders[3])
-            height -= (4 >> chroma) + 2;
-        break;
-    case 1:
-        init_y = -(4 >> chroma) - 2;
-        if (!borders[2])
-            width -= (8 >> chroma) + 2;
-        height = (4 >> chroma) + 2;
-        break;
-    case 2:
-        init_x = -(8 >> chroma) - 2;
-        width  =  (8 >> chroma) + 2;
-        if (!borders[3])
-            height -= (4 >> chroma) + 2;
-        break;
-    case 3:
-        init_y = -(4 >> chroma) - 2;
-        init_x = -(8 >> chroma) - 2;
-        width  =  (8 >> chroma) + 2;
-        height =  (4 >> chroma) + 2;
-        break;
-    }
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
     for (k = 0; k < 4; k++)
         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        dst += stride;
-        src += stride;
+        dst += stride_dst;
+        src += stride_src;
     }
 }
 
-static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 0);
-}
+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
 
-static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 1);
-}
+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+                                  int eo, int width, int height) {
 
-static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 2);
-}
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    static const int8_t pos[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    pixel *dst = (pixel *)_dst;
+    pixel *src = (pixel *)_src;
+    int a_stride, b_stride;
+    int x, y;
+    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+    stride_dst /= sizeof(pixel);
 
-static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(_dst, _src, stride, sao, borders,
-                          width, height, c_idx, 3);
+    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            int diff0 = CMP(src[x], src[x + a_stride]);
+            int diff1 = CMP(src[x], src[x + b_stride]);
+            int offset_val        = edge_idx[2 + diff0 + diff1];
+            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
+        }
+        src += stride_src;
+        dst += stride_dst;
+    }
 }
 
-static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
                                     int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+                                    int c_idx, uint8_t *vert_edge,
+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
+    int16_t *sao_offset_val = sao->offset_val[c_idx];
     int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
+    int init_x = 0, width = _width, height = _height;
 
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-
-    stride /= sizeof(pixel);
-
-    if (!borders[2])
-        width -= (8 >> chroma) + 2;
-    if (!borders[3])
-        height -= (4 >> chroma) + 2;
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_VERT) {
         if (borders[0]) {
             int offset_val = sao_offset_val[0];
-            ptrdiff_t y_stride   = 0;
             for (y = 0; y < height; y++) {
-                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
-                y_stride     += stride;
+                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
             }
             init_x = 1;
         }
         if (borders[2]) {
             int offset_val = sao_offset_val[0];
-            ptrdiff_t x_stride = width - 1;
+            int offset     = width - 1;
             for (x = 0; x < height; x++) {
-                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
-                x_stride     += stride;
+                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
             }
             width--;
         }
@@ -418,180 +390,51 @@ static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
             int offset_val = sao_offset_val[0];
             for (x = init_x; x < width; x++)
                 dst[x] = av_clip_pixel(src[x] + offset_val);
-            init_y = 1;
         }
         if (borders[3]) {
-            int offset_val = sao_offset_val[0];
-            ptrdiff_t y_stride = stride * (height - 1);
+            int offset_val   = sao_offset_val[0];
+            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+            ptrdiff_t y_stride_src = stride_src * (height - 1);
             for (x = init_x; x < width; x++)
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
             height--;
         }
     }
-    {
-        ptrdiff_t y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        ptrdiff_t y_stride_0_1 = (init_y + pos_0_1) * stride;
-        ptrdiff_t y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
-
-    {
-        // Restore pixels that can't be modified
-        int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
-        if (vert_edge && sao_eo_class != SAO_EO_VERT)
-            for (y = init_y+save_upper_left; y< height; y++)
-                dst[y*stride] = src[y*stride];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x+save_upper_left; x<width; x++)
-                dst[x] = src[x];
-        if(diag_edge && sao_eo_class == SAO_EO_135D)
-            dst[0] = src[0];
-    }
-
-#undef CMP
 }
 
-static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
                                     int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+                                    int c_idx, uint8_t *vert_edge,
+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
+    int16_t *sao_offset_val = sao->offset_val[c_idx];
     int sao_eo_class    = sao->eo_class[c_idx];
     int init_x = 0, init_y = 0, width = _width, height = _height;
 
-    static const int8_t pos[4][2][2] = {
-        { { -1, 0  }, { 1,  0 } }, // horizontal
-        { { 0,  -1 }, { 0,  1 } }, // vertical
-        { { -1, -1 }, { 1,  1 } }, // 45 degree
-        { { 1,  -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    stride /= sizeof(pixel);
-
-    init_y = -(4 >> chroma) - 2;
-    if (!borders[2])
-        width -= (8 >> chroma) + 2;
-    height = (4 >> chroma) + 2;
-
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_VERT) {
         if (borders[0]) {
             int offset_val = sao_offset_val[0];
-            ptrdiff_t y_stride = 0;
             for (y = 0; y < height; y++) {
-                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
-                y_stride     += stride;
+                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
             }
             init_x = 1;
         }
         if (borders[2]) {
             int offset_val = sao_offset_val[0];
-            ptrdiff_t x_stride = width - 1;
+            int offset     = width - 1;
             for (x = 0; x < height; x++) {
-                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
-                x_stride     += stride;
+                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
             }
             width--;
         }
     }
-    {
-        ptrdiff_t y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        ptrdiff_t y_stride_0_1 = (init_y + pos_0_1) * stride;
-        ptrdiff_t y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
-
-    {
-        // Restore pixels that can't be modified
-        int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y; y< height-save_lower_left; y++)
-                dst[y*stride] = src[y*stride];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x+save_lower_left; x<width; x++)
-                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
-        if(diag_edge && sao_eo_class == SAO_EO_45D)
-            dst[stride*(height-1)] = src[stride*(height-1)];
-    }
-
-#undef CMP
-}
-
-static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
-{
-    int x, y;
-    pixel *dst = (pixel *)_dst;
-    pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
-
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-
-    stride /= sizeof(pixel);
-
-    init_x = -(8 >> chroma) - 2;
-    width  =  (8 >> chroma) + 2;
-    if (!borders[3])
-        height -= (4 >> chroma) + 2;
-
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_HORIZ) {
         if (borders[1]) {
             int offset_val = sao_offset_val[0];
@@ -600,429 +443,674 @@ static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
             init_y = 1;
         }
         if (borders[3]) {
-            int offset_val = sao_offset_val[0];
-            ptrdiff_t y_stride = stride * (height - 1);
+            int offset_val   = sao_offset_val[0];
+            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+            ptrdiff_t y_stride_src = stride_src * (height - 1);
             for (x = init_x; x < width; x++)
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
             height--;
         }
     }
-    {
-        ptrdiff_t y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        ptrdiff_t y_stride_0_1 = (init_y + pos_0_1) * stride;
-        ptrdiff_t y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
 
     {
+        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
+        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
+        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
+        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
+
         // Restore pixels that can't be modified
-        int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y+save_upper_right; y< height; y++)
-                dst[y*stride+width-1] = src[y*stride+width-1];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x; x<width-save_upper_right; x++)
+        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
+            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
+                dst[y*stride_dst] = src[y*stride_src];
+        }
+        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
+            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
+                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
+        }
+
+        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
+            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
                 dst[x] = src[x];
-        if(diag_edge && sao_eo_class == SAO_EO_45D)
+        }
+        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
+            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
+                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
+        }
+        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
+            dst[0] = src[0];
+        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
             dst[width-1] = src[width-1];
+        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
+            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
+        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
+            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
+
     }
+}
+
 #undef CMP
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
+                                      uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = src[x] << (14 - BIT_DEPTH);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
 }
 
-static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, intptr_t mx, intptr_t my, int width)
+{
+    int y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    for (y = 0; y < height; y++) {
+        memcpy(dst, src, width * sizeof(pixel));
+        src += srcstride;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    pixel *dst = (pixel *)_dst;
-    pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    int shift = 14  + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
-    stride /= sizeof(pixel);
+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-    init_y = -(4 >> chroma) - 2;
-    init_x = -(8 >> chroma) - 2;
-    width  =  (8 >> chroma) + 2;
-    height =  (4 >> chroma) + 2;
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
+        src += srcstride;
+        dst += dststride;
+    }
+}
 
+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                           int16_t *src2,
+                                           int height, int denom, int wx0, int wx1,
+                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    dst    = dst + (init_y * stride + init_x);
-    src    = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
+    int shift = 14  + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
 
-    {
-        ptrdiff_t y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        ptrdiff_t y_stride_0_1 = (init_y + pos_0_1) * stride;
-        ptrdiff_t y_stride_1_1 = (init_y + pos_1_1) * stride;
-
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
         }
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
     }
+}
 
-    {
-        // Restore pixels that can't be modified
-        int save_lower_right = !diag_edge && sao_eo_class == SAO_EO_135D;
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y; y< height-save_lower_right; y++)
-                dst[y*stride+width-1] = src[y*stride+width-1];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x; x<width-save_lower_right; x++)
-                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
-        if(diag_edge && sao_eo_class == SAO_EO_135D)
-            dst[stride*(height-1)+width-1] = src[stride*(height-1)+width-1];
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define QPEL_FILTER(src, stride)                                               \
+    (filter[0] * src[x - 3 * stride] +                                         \
+     filter[1] * src[x - 2 * stride] +                                         \
+     filter[2] * src[x -     stride] +                                         \
+     filter[3] * src[x             ] +                                         \
+     filter[4] * src[x +     stride] +                                         \
+     filter[5] * src[x + 2 * stride] +                                         \
+     filter[6] * src[x + 3 * stride] +                                         \
+     filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
     }
-#undef CMP
 }
 
-#undef SET
-#undef SCALE
-#undef TR_4
-#undef TR_8
-#undef TR_16
-#undef TR_32
+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    for (y = 0; y < height; y++)  {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
+}
 
-static av_always_inline void
-FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                           uint8_t *_src, ptrdiff_t _srcstride,
-                           int width, int height, int mx, int my,
-                           int16_t* mcbuffer)
+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
+                                   uint8_t *_src,
+                                   ptrdiff_t _srcstride,
+                                   int height, intptr_t mx,
+                                   intptr_t my, int width)
 {
     int x, y;
-    pixel *src          = (pixel *)_src;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
 
-    dststride /= sizeof(*dst);
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = src[x] << (14 - BIT_DEPTH);
+            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+        tmp += MAX_PB_SIZE;
+        dst += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                      uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
         dst += dststride;
     }
 }
 
-#define QPEL_FILTER_1(src, stride)      \
-    (1 * -src[x - 3 * stride] +         \
-     4 *  src[x - 2 * stride] -         \
-    10 *  src[x -     stride] +         \
-    58 *  src[x]              +         \
-    17 *  src[x +     stride] -         \
-     5 *  src[x + 2 * stride] +         \
-     1 *  src[x + 3 * stride])
-
-#define QPEL_FILTER_2(src, stride)      \
-    (1  * -src[x - 3 * stride] +        \
-     4  *  src[x - 2 * stride] -        \
-    11  *  src[x -     stride] +        \
-    40  *  src[x]              +        \
-    40  *  src[x +     stride] -        \
-    11  *  src[x + 2 * stride] +        \
-     4  *  src[x + 3 * stride] -        \
-     1  *  src[x + 4 * stride])
-
-#define QPEL_FILTER_3(src, stride)      \
-    (1  * src[x - 2 * stride] -         \
-     5  * src[x -     stride] +         \
-    17  * src[x]              +         \
-    58  * src[x + stride]     -         \
-    10  * src[x + 2 * stride] +         \
-     4  * src[x + 3 * stride] -         \
-     1  * src[x + 4 * stride])
-
-
-#define PUT_HEVC_QPEL_H(H)                                                     \
-static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
-                                       uint8_t *_src, ptrdiff_t _srcstride,    \
-                                       int width, int height,                  \
-                                       int16_t* mcbuffer)                      \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    dststride /= sizeof(*dst);                                                 \
-    for (y = 0; y < height; y++) {                                             \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
-        src += srcstride;                                                      \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+
+    int shift = 14  + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
 }
 
-#define PUT_HEVC_QPEL_V(V)                                                     \
-static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
-                                       uint8_t *_src, ptrdiff_t _srcstride,    \
-                                       int width, int height,                  \
-                                       int16_t* mcbuffer)                      \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    dststride /= sizeof(*dst);                                                 \
-    for (y = 0; y < height; y++)  {                                            \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8);     \
-        src += srcstride;                                                      \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                     uint8_t *_src, ptrdiff_t _srcstride,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
 }
 
-#define PUT_HEVC_QPEL_HV(H, V)                                                 \
-static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
-                                                 ptrdiff_t dststride,          \
-                                                 uint8_t *_src,                \
-                                                 ptrdiff_t _srcstride,         \
-                                                 int width, int height,        \
-                                                 int16_t* mcbuffer)            \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
-    int16_t *tmp = tmp_array;                                                  \
-                                                                               \
-    dststride /= sizeof(*dst);                                                 \
-    src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
-                                                                               \
-    for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
-        for (x = 0; x < width; x++)                                            \
-            tmp[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
-        src += srcstride;                                                      \
-        tmp += MAX_PB_SIZE;                                                    \
-    }                                                                          \
-                                                                               \
-    tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;              \
-                                                                               \
-    for (y = 0; y < height; y++) {                                             \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## V(tmp, MAX_PB_SIZE) >> 6;                 \
-        tmp += MAX_PB_SIZE;                                                    \
-        dst += dststride;                                                      \
-    }                                                                          \
+
+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
 }
 
-PUT_HEVC_QPEL_H(1)
-PUT_HEVC_QPEL_H(2)
-PUT_HEVC_QPEL_H(3)
-PUT_HEVC_QPEL_V(1)
-PUT_HEVC_QPEL_V(2)
-PUT_HEVC_QPEL_V(3)
-PUT_HEVC_QPEL_HV(1, 1)
-PUT_HEVC_QPEL_HV(1, 2)
-PUT_HEVC_QPEL_HV(1, 3)
-PUT_HEVC_QPEL_HV(2, 1)
-PUT_HEVC_QPEL_HV(2, 2)
-PUT_HEVC_QPEL_HV(2, 3)
-PUT_HEVC_QPEL_HV(3, 1)
-PUT_HEVC_QPEL_HV(3, 2)
-PUT_HEVC_QPEL_HV(3, 3)
-
-#define QPEL(W)                                                                             \
-static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride,             \
-                                             uint8_t *src, ptrdiff_t srcstride,             \
-                                             int height, int mx, int my,                    \
-                                             int16_t *mcbuffer)                             \
-{                                                                                           \
-    FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height,                   \
-                               mx, my, mcbuffer);                                           \
-}                                                                                           \
-                                                                                            \
-static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
-                                        uint8_t *src, ptrdiff_t srcstride,                  \
-                                        int height, int mx, int my,                         \
-                                        int16_t *mcbuffer)                                  \
-{                                                                                           \
-    if (mx == 1)                                                                            \
-        FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else if (mx == 2)                                                                       \
-        FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else                                                                                    \
-        FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-}                                                                                           \
-                                                                                            \
-static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
-                                             uint8_t *src, ptrdiff_t srcstride,             \
-                                             int height, int mx, int my,                    \
-                                             int16_t *mcbuffer)                             \
-{                                                                                           \
-    if (my == 1)                                                                            \
-        FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else if (my == 2)                                                                       \
-        FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else                                                                                    \
-        FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-}                                                                                           \
-                                                                                            \
-static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,                 \
-                                             uint8_t *src, ptrdiff_t srcstride,             \
-                                             int height, int mx, int my,                    \
-                                             int16_t *mcbuffer)                             \
-{                                                                                           \
-    if (my == 1) {                                                                          \
-        if (mx == 1)                                                                        \
-            FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else if (mx == 2)                                                                   \
-            FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else                                                                                \
-            FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-    } else if (my == 2) {                                                                   \
-        if (mx == 1)                                                                        \
-            FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else if (mx == 2)                                                                   \
-            FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else                                                                                \
-            FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-    } else {                                                                                \
-        if (mx == 1)                                                                        \
-            FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else if (mx == 2)                                                                   \
-            FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else                                                                                \
-            FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-    }                                                                                       \
+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                       uint8_t *_src, ptrdiff_t _srcstride,
+                                       int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift =  14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
 }
 
-QPEL(64)
-QPEL(48)
-QPEL(32)
-QPEL(24)
-QPEL(16)
-QPEL(12)
-QPEL(8)
-QPEL(4)
-
-static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                                              uint8_t *_src, ptrdiff_t _srcstride,
-                                              int width, int height, int mx, int my,
-                                              int16_t* mcbuffer)
+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    pixel *src          = (pixel *)_src;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
 
-    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = src[x] << (14 - BIT_DEPTH);
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                        uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         src += srcstride;
         dst += dststride;
     }
 }
 
-#define EPEL_FILTER(src, stride)                \
-    (filter_0 * src[x - stride] +               \
-     filter_1 * src[x]          +               \
-     filter_2 * src[x + stride] +               \
-     filter_3 * src[x + 2 * stride])
+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+
+    int shift = 14  + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
-static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                        uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        src += srcstride;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
                                          uint8_t *_src, ptrdiff_t _srcstride,
-                                         int width, int height, int mx, int my,
-                                         int16_t* mcbuffer)
+                                         int height, int denom, int wx, int ox,
+                                         intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define EPEL_FILTER(src, stride)                                               \
+    (filter[0] * src[x - stride] +                                             \
+     filter[1] * src[x]          +                                             \
+     filter[2] * src[x + stride] +                                             \
+     filter[3] * src[x + 2 * stride])
+
+static void FUNC(put_hevc_epel_h)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-    const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
-    int8_t filter_0 = filter[0];
-    int8_t filter_1 = filter[1];
-    int8_t filter_2 = filter[2];
-    int8_t filter_3 = filter[3];
-    dststride /= sizeof(*dst);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
-        dst += dststride;
+        dst += MAX_PB_SIZE;
     }
 }
 
-static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *_src, ptrdiff_t _srcstride,
-                                         int width, int height, int mx, int my,
-                                         int16_t* mcbuffer)
+static void FUNC(put_hevc_epel_v)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
-    int8_t filter_0 = filter[0];
-    int8_t filter_1 = filter[1];
-    int8_t filter_2 = filter[2];
-    int8_t filter_3 = filter[3];
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
 
-    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
         src += srcstride;
-        dst += dststride;
+        dst += MAX_PB_SIZE;
     }
 }
 
-static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
-                                          uint8_t *_src, ptrdiff_t _srcstride,
-                                          int width, int height, int mx, int my,
-                                          int16_t* mcbuffer)
+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
+                                   uint8_t *_src, ptrdiff_t _srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
-    const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
-    int8_t filter_0 = filter_h[0];
-    int8_t filter_1 = filter_h[1];
-    int8_t filter_2 = filter_h[2];
-    int8_t filter_3 = filter_h[3];
-    int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
     int16_t *tmp = tmp_array;
 
-    dststride /= sizeof(*dst);
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
@@ -1033,95 +1121,101 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     }
 
     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter_0 = filter_v[0];
-    filter_1 = filter_v[1];
-    filter_2 = filter_v[2];
-    filter_3 = filter_v[3];
+    filter = ff_hevc_epel_filters[my - 1];
+
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
         tmp += MAX_PB_SIZE;
-        dst += dststride;
+        dst += MAX_PB_SIZE;
     }
 }
 
-#define EPEL(W)                                                                 \
-static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
-                                             uint8_t *src, ptrdiff_t srcstride, \
-                                             int height, int mx, int my,        \
-                                             int16_t *mcbuffer)                 \
-{                                                                               \
-    FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride,                  \
-                               W, height, mx, my, mcbuffer);                    \
-}                                                                               \
-static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
-                                        uint8_t *src, ptrdiff_t srcstride,      \
-                                        int height, int mx, int my,             \
-                                        int16_t *mcbuffer)                      \
-{                                                                               \
-    FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride,                       \
-                          W, height, mx, my, mcbuffer);                         \
-}                                                                               \
-static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
-                                        uint8_t *src, ptrdiff_t srcstride,      \
-                                        int height, int mx, int my,             \
-                                        int16_t *mcbuffer)                      \
-{                                                                               \
-    FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride,                       \
-                          W, height, mx, my, mcbuffer);                         \
-}                                                                               \
-static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,     \
-                                         uint8_t *src, ptrdiff_t srcstride,     \
-                                         int height, int mx, int my,            \
-                                         int16_t *mcbuffer)                     \
-{                                                                               \
-    FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride,                      \
-                           W, height, mx, my, mcbuffer);                        \
+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
 }
 
-EPEL(32)
-EPEL(24)
-EPEL(16)
-EPEL(12)
-EPEL(8)
-EPEL(6)
-EPEL(4)
-EPEL(2)
-
-static av_always_inline void
-FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
-                          int16_t *src, ptrdiff_t srcstride,
-                          int width, int height)
+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        }
+        dst  += dststride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
     int shift = 14 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
 #else
     int offset = 0;
 #endif
-    srcstride /= sizeof(*src);
+
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src[x] + offset) >> shift);
-        dst += dststride;
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
+        dst += dststride;
     }
 }
 
-static av_always_inline void
-FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
-                              int16_t *src1, int16_t *src2,
-                              ptrdiff_t srcstride,
-                              int width, int height)
+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-
     int shift = 14 + 1 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -1129,117 +1223,275 @@ FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
     int offset = 0;
 #endif
 
-    srcstride /= sizeof(*src1);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         dst  += dststride;
-        src1 += srcstride;
-        src2 += srcstride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
     }
 }
 
-static av_always_inline void
-FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                    uint8_t *_dst, ptrdiff_t _dststride,
-                    int16_t *src, ptrdiff_t srcstride,
-                    int width, int height)
+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int height, intptr_t mx, intptr_t my, int width)
 {
-    int shift, log2Wd, wx, ox, x, y, offset;
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-    shift  = 14 - BIT_DEPTH;
-    log2Wd = denom + shift;
-    offset = 1 << (log2Wd - 1);
-    wx     = wlxFlag;
-    ox     = olxFlag * (1 << (BIT_DEPTH - 8));
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
 
-    srcstride /= sizeof(*src);
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
-            if (log2Wd >= 1) {
-                dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
-            } else {
-                dst[x] = av_clip_pixel(src[x] * wx + ox);
-            }
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         }
         dst += dststride;
         src += srcstride;
     }
 }
 
-static av_always_inline void
-FUNC(weighted_pred_avg)(uint8_t denom,
-                        int16_t wl0Flag, int16_t wl1Flag,
-                        int16_t ol0Flag, int16_t ol1Flag,
-                        uint8_t *_dst, ptrdiff_t _dststride,
-                        int16_t *src1, int16_t *src2,
-                        ptrdiff_t srcstride,
-                        int width, int height)
+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 {
-    int shift, log2Wd, w0, w1, o0, o1, x, y;
-    pixel *dst = (pixel *)_dst;
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
 
-    shift  = 14 - BIT_DEPTH;
-    log2Wd = denom + shift;
-    w0     = wl0Flag;
-    w1     = wl1Flag;
-    o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
-    o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
-    srcstride /= sizeof(*src1);
+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        }
+        dst += dststride;
+        src += srcstride;
+    }
+}
+
+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
-                                    ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1));
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
         dst  += dststride;
-        src1 += srcstride;
-        src2 += srcstride;
+        src2 += MAX_PB_SIZE;
     }
 }
 
-#define PUT_PRED(w)                                                                            \
-static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride,                 \
-                                            int16_t *src, ptrdiff_t srcstride,                 \
-                                            int height)                                        \
-{                                                                                              \
-    FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height);                      \
-}                                                                                              \
-static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride,             \
-                                                int16_t *src1, int16_t *src2,                  \
-                                                ptrdiff_t srcstride, int height)               \
-{                                                                                              \
-    FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height);           \
-}                                                                                              \
-static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset,       \
-                                          uint8_t *dst, ptrdiff_t dststride,                   \
-                                          int16_t *src, ptrdiff_t srcstride, int height)       \
-{                                                                                              \
-    FUNC(weighted_pred)(denom, weight, offset,                                                 \
-                        dst, dststride, src, srcstride, w, height);                            \
-}                                                                                              \
-static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \
-                                              int16_t offset0, int16_t offset1,                \
-                                              uint8_t *dst, ptrdiff_t dststride,               \
-                                              int16_t *src1, int16_t *src2,                    \
-                                              ptrdiff_t srcstride, int height)                 \
-{                                                                                              \
-    FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1,                         \
-                            dst, dststride, src1, src2, srcstride, w, height);                 \
+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
 }
 
-PUT_PRED(64)
-PUT_PRED(48)
-PUT_PRED(32)
-PUT_PRED(24)
-PUT_PRED(16)
-PUT_PRED(12)
-PUT_PRED(8)
-PUT_PRED(6)
-PUT_PRED(4)
-PUT_PRED(2)
+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
 // line zero
 #define P3 pix[-4 * xstride]
@@ -1392,21 +1644,21 @@ static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
 }
 
 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                            int *tc, uint8_t *no_p,
+                                            int32_t *tc, uint8_t *no_p,
                                             uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
 }
 
 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                            int *tc, uint8_t *no_p,
+                                            int32_t *tc, uint8_t *no_p,
                                             uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
 }
 
 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int beta, int *tc, uint8_t *no_p,
+                                          int beta, int32_t *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
@@ -1414,7 +1666,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 }
 
 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int beta, int *tc, uint8_t *no_p,
+                                          int beta, int32_t *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
index 7342b7e..7a86ed3 100644
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@@ -1,27 +1,29 @@
 /*
- * HEVC video decoder
+ * HEVC video Decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "hevcdec.h"
 
+#include "hevcpred.h"
+
 #define BIT_DEPTH 8
 #include "hevcpred_template.c"
 #undef BIT_DEPTH
@@ -34,6 +36,10 @@
 #include "hevcpred_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
 {
 #undef FUNC
@@ -61,8 +67,14 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
     case 10:
         HEVC_PRED(10);
         break;
+    case 12:
+        HEVC_PRED(12);
+        break;
     default:
         HEVC_PRED(8);
         break;
     }
+
+    if (ARCH_MIPS)
+        ff_hevc_pred_init_mips(hpc, bit_depth);
 }
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
new file mode 100644
index 0000000..eb17663
--- /dev/null
+++ b/libavcodec/hevcpred.h
@@ -0,0 +1,46 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HEVCPRED_H
+#define AVCODEC_HEVCPRED_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct HEVCContext;
+
+typedef struct HEVCPredContext {
+    void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+
+    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
+                           const uint8_t *left, ptrdiff_t stride);
+    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+                    ptrdiff_t stride, int log2_size, int c_idx);
+    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+                            const uint8_t *left, ptrdiff_t stride,
+                            int c_idx, int mode);
+} HEVCPredContext;
+
+void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_HEVCPRED_H */
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 5fe3ea3..6fe3354 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -3,28 +3,27 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/pixdesc.h"
 
-#include "hevcdec.h"
-
 #include "bit_depth_template.c"
+#include "hevcpred.h"
 
 #define POS(x, y) src[(x) + stride * (y)]
 
@@ -36,12 +35,11 @@ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
 #define MVF(x, y) \
     (s->ref->tab_mvf[(x) + (y) * min_pu_width])
 #define MVF_PU(x, y) \
-    MVF(PU(x0 + ((x) << hshift)), PU(y0 + ((y) << vshift)))
+    MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift))))
 #define IS_INTRA(x, y) \
-    MVF_PU(x, y).is_intra
+    (MVF_PU(x, y).pred_flag == PF_INTRA)
 #define MIN_TB_ADDR_ZS(x, y) \
-    s->ps.pps->min_tb_addr_zs[(y) * s->ps.sps->min_tb_width + (x)]
-
+    s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
 #define EXTEND(ptr, val, len)         \
 do {                                  \
     pixel4 pix = PIXEL_SPLAT_X4(val); \
@@ -49,36 +47,43 @@ do {                                  \
         AV_WN4P(ptr + i, pix);        \
 } while (0)
 
+#define EXTEND_RIGHT_CIP(ptr, start, length)                                   \
+        for (i = start; i < (start) + (length); i += 4)                        \
+            if (!IS_INTRA(i, -1))                                              \
+                AV_WN4P(&ptr[i], a);                                           \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i+3])
 #define EXTEND_LEFT_CIP(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
+        for (i = start; i > (start) - (length); i--) \
             if (!IS_INTRA(i - 1, -1)) \
                 ptr[i - 1] = ptr[i]
-#define EXTEND_RIGHT_CIP(ptr, start, length) \
-        for (i = (start); i < (start) + (length); i++) \
-            if (!IS_INTRA(i, -1)) \
-                ptr[i] = ptr[i - 1]
-#define EXTEND_UP_CIP(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
-            if (!IS_INTRA(-1, i - 1)) \
-                ptr[i - 1] = ptr[i]
-#define EXTEND_UP_CIP_0(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
-            ptr[i - 1] = ptr[i]
-#define EXTEND_DOWN_CIP(ptr, start, length) \
-        for (i = (start); i < (start) + (length); i++) \
-            if (!IS_INTRA(-1, i)) \
-                ptr[i] = ptr[i - 1]
-    HEVCLocalContext *lc = &s->HEVClc;
+#define EXTEND_UP_CIP(ptr, start, length)                                      \
+        for (i = (start); i > (start) - (length); i -= 4)                      \
+            if (!IS_INTRA(-1, i - 3))                                          \
+                AV_WN4P(&ptr[i - 3], a);                                       \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i - 3])
+#define EXTEND_DOWN_CIP(ptr, start, length)                                    \
+        for (i = start; i < (start) + (length); i += 4)                        \
+            if (!IS_INTRA(-1, i))                                              \
+                AV_WN4P(&ptr[i], a);                                           \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i + 3])
+
+    HEVCLocalContext *lc = s->HEVClc;
     int i;
     int hshift = s->ps.sps->hshift[c_idx];
     int vshift = s->ps.sps->vshift[c_idx];
     int size = (1 << log2_size);
-    int size_in_luma = size << hshift;
-    int size_in_tbs = size_in_luma >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_h = size << hshift;
+    int size_in_tbs_h  = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = size << vshift;
+    int size_in_tbs_v  = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
     int x = x0 >> hshift;
     int y = y0 >> vshift;
-    int x_tb = x0 >> s->ps.sps->log2_min_tb_size;
-    int y_tb = y0 >> s->ps.sps->log2_min_tb_size;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
     int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
 
     ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
@@ -86,87 +91,77 @@ do {                                  \
 
     int min_pu_width = s->ps.sps->min_pu_width;
 
-    enum IntraPredMode mode = c_idx ? lc->pu.intra_pred_mode_c :
-                              lc->tu.cur_intra_pred_mode;
-
-    pixel left_array[2 * MAX_TB_SIZE + 1];
-    pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
-    pixel top_array[2 * MAX_TB_SIZE + 1];
-    pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
-
-    pixel *left          = left_array + 1;
-    pixel *top           = top_array  + 1;
-    pixel *filtered_left = filtered_left_array + 1;
-    pixel *filtered_top  = filtered_top_array  + 1;
-
-    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb - 1, y_tb + size_in_tbs);
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+                              lc->tu.intra_pred_mode;
+    pixel4 a;
+    pixel  left_array[2 * MAX_TB_SIZE + 1];
+    pixel  filtered_left_array[2 * MAX_TB_SIZE + 1];
+    pixel  top_array[2 * MAX_TB_SIZE + 1];
+    pixel  filtered_top_array[2 * MAX_TB_SIZE + 1];
+
+    pixel  *left          = left_array + 1;
+    pixel  *top           = top_array  + 1;
+    pixel  *filtered_left = filtered_left_array + 1;
+    pixel  *filtered_top  = filtered_top_array  + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
     int cand_left        = lc->na.cand_left;
     int cand_up_left     = lc->na.cand_up_left;
     int cand_up          = lc->na.cand_up;
-    int cand_up_right    = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb + size_in_tbs, y_tb - 1);
+    int cand_up_right    = lc->na.cand_up_right    && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1);
 
-    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma, s->ps.sps->height) -
-                            (y0 + size_in_luma)) >> vshift;
-    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma, s->ps.sps->width) -
-                            (x0 + size_in_luma)) >> hshift;
+    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
+                           (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                           (x0 + size_in_luma_h)) >> hshift;
 
     if (s->ps.pps->constrained_intra_pred_flag == 1) {
-        int size_in_luma_pu = PU(size_in_luma);
-        int on_pu_edge_x    = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
-        int on_pu_edge_y    = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
-        if (!size_in_luma_pu)
-            size_in_luma_pu++;
+        int size_in_luma_pu_v = PU(size_in_luma_v);
+        int size_in_luma_pu_h = PU(size_in_luma_h);
+        int on_pu_edge_x    = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_y    = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size);
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
         if (cand_bottom_left == 1 && on_pu_edge_x) {
             int x_left_pu   = PU(x0 - 1);
-            int y_bottom_pu = PU(y0 + size_in_luma);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_height - y_bottom_pu);
+            int y_bottom_pu = PU(y0 + size_in_luma_v);
+            int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu);
             cand_bottom_left = 0;
-            for (i = 0; i < max; i++)
-                cand_bottom_left |= MVF(x_left_pu, y_bottom_pu + i).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA);
         }
         if (cand_left == 1 && on_pu_edge_x) {
             int x_left_pu   = PU(x0 - 1);
             int y_left_pu   = PU(y0);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_height - y_left_pu);
+            int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu);
             cand_left = 0;
-            for (i = 0; i < max; i++)
-                cand_left |= MVF(x_left_pu, y_left_pu + i).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA);
         }
         if (cand_up_left == 1) {
             int x_left_pu   = PU(x0 - 1);
             int y_top_pu    = PU(y0 - 1);
-            cand_up_left = MVF(x_left_pu, y_top_pu).is_intra;
+            cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA;
         }
         if (cand_up == 1 && on_pu_edge_y) {
             int x_top_pu    = PU(x0);
             int y_top_pu    = PU(y0 - 1);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_width - x_top_pu);
+            int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu);
             cand_up = 0;
-            for (i = 0; i < max; i++)
-                cand_up |= MVF(x_top_pu + i, y_top_pu).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA);
         }
         if (cand_up_right == 1 && on_pu_edge_y) {
             int y_top_pu    = PU(y0 - 1);
-            int x_right_pu  = PU(x0 + size_in_luma);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_width - x_right_pu);
+            int x_right_pu  = PU(x0 + size_in_luma_h);
+            int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu);
             cand_up_right = 0;
-            for (i = 0; i < max; i++)
-                cand_up_right |= MVF(x_right_pu + i, y_top_pu).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA);
         }
-        for (i = 0; i < 2 * MAX_TB_SIZE; i++) {
-            left[i] = 128;
-            top[i]  = 128;
-        }
-    }
-    if (cand_bottom_left) {
-        for (i = size; i < size + bottom_left_size; i++)
-            left[i] = POS(-1, i);
-        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
-               size - bottom_left_size);
+        memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel));
+        memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel));
+        top[-1] = 128;
     }
-    if (cand_left)
-        for (i = size - 1; i >= 0; i--)
-            left[i] = POS(-1, i);
     if (cand_up_left) {
         left[-1] = POS(-1, -1);
         top[-1]  = left[-1];
@@ -178,6 +173,15 @@ do {                                  \
         EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
                size - top_right_size);
     }
+    if (cand_left)
+        for (i = 0; i < size; i++)
+            left[i] = POS(-1, i);
+    if (cand_bottom_left) {
+        for (i = size; i < size + bottom_left_size; i++)
+            left[i] = POS(-1, i);
+        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+               size - bottom_left_size);
+    }
 
     if (s->ps.pps->constrained_intra_pred_flag == 1) {
         if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
@@ -203,7 +207,6 @@ do {                                  \
                         j++;
                     EXTEND_LEFT_CIP(top, j, j + 1);
                     left[-1] = top[-1];
-                    j        = 0;
                 }
             } else {
                 j = 0;
@@ -217,24 +220,30 @@ do {                                  \
                         top[-1] = top[0];
                     }
                 left[-1] = top[-1];
-                j        = 0;
             }
+            left[-1] = top[-1];
             if (cand_bottom_left || cand_left) {
-                EXTEND_DOWN_CIP(left, j, size_max_y - j);
+                a = PIXEL_SPLAT_X4(left[-1]);
+                EXTEND_DOWN_CIP(left, 0, size_max_y);
             }
             if (!cand_left)
                 EXTEND(left, left[-1], size);
             if (!cand_bottom_left)
                 EXTEND(left + size, left[size - 1], size);
             if (x0 != 0 && y0 != 0) {
+                a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
                 EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
+                if (!IS_INTRA(-1, - 1))
+                    left[-1] = left[0];
             } else if (x0 == 0) {
-                EXTEND_UP_CIP_0(left, size_max_y - 1, size_max_y);
+                EXTEND(left, 0, size_max_y);
             } else {
-                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y - 1);
+                a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
+                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
             }
             top[-1] = left[-1];
             if (y0 != 0) {
+                a = PIXEL_SPLAT_X4(left[-1]);
                 EXTEND_RIGHT_CIP(top, 0, size_max_x);
             }
         }
@@ -278,40 +287,42 @@ do {                                  \
     top[-1] = left[-1];
 
     // Filtering process
-    if (c_idx == 0 && mode != INTRA_DC && size != 4) {
-        int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
-        int min_dist_vert_hor = FFMIN(FFABS((int)mode - 26),
-                                      FFABS((int)mode - 10));
-        if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
-            int threshold = 1 << (BIT_DEPTH - 5);
-            if (s->ps.sps->sps_strong_intra_smoothing_enable_flag &&
-                log2_size == 5 &&
-                FFABS(top[-1]  + top[63]  - 2 * top[31])  < threshold &&
-                FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
-                // We can't just overwrite values in top because it could be
-                // a pointer into src
-                filtered_top[-1] = top[-1];
-                filtered_top[63] = top[63];
-                for (i = 0; i < 63; i++)
-                    filtered_top[i] = ((64 - (i + 1)) * top[-1] +
-                                             (i + 1)  * top[63] + 32) >> 6;
-                for (i = 0; i < 63; i++)
-                    left[i] = ((64 - (i + 1)) * left[-1] +
-                                     (i + 1)  * left[63] + 32) >> 6;
-                top = filtered_top;
-            } else {
-                filtered_left[2 * size - 1] = left[2 * size - 1];
-                filtered_top[2 * size - 1]  = top[2 * size - 1];
-                for (i = 2 * size - 2; i >= 0; i--)
-                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
-                                        left[i - 1] + 2) >> 2;
-                filtered_top[-1]  =
-                filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
-                for (i = 2 * size - 2; i >= 0; i--)
-                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
-                                       top[i - 1] + 2) >> 2;
-                left = filtered_left;
-                top  = filtered_top;
+    if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0  || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && size != 4){
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)),
+                                          FFABS((int)(mode - 10U)));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
+                int threshold = 1 << (BIT_DEPTH - 5);
+                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 &&
+                    log2_size == 5 &&
+                    FFABS(top[-1]  + top[63]  - 2 * top[31])  < threshold &&
+                    FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
+                    // We can't just overwrite values in top because it could be
+                    // a pointer into src
+                    filtered_top[-1] = top[-1];
+                    filtered_top[63] = top[63];
+                    for (i = 0; i < 63; i++)
+                        filtered_top[i] = ((64 - (i + 1)) * top[-1] +
+                                           (i + 1)  * top[63] + 32) >> 6;
+                    for (i = 0; i < 63; i++)
+                        left[i] = ((64 - (i + 1)) * left[-1] +
+                                   (i + 1)  * left[63] + 32) >> 6;
+                    top = filtered_top;
+                } else {
+                    filtered_left[2 * size - 1] = left[2 * size - 1];
+                    filtered_top[2 * size - 1]  = top[2 * size - 1];
+                    for (i = 2 * size - 2; i >= 0; i--)
+                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                            left[i - 1] + 2) >> 2;
+                    filtered_top[-1]  =
+                    filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                    for (i = 2 * size - 2; i >= 0; i--)
+                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                           top[i - 1] + 2) >> 2;
+                    left = filtered_left;
+                    top  = filtered_top;
+                }
             }
         }
     }
@@ -394,8 +405,8 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
     a = PIXEL_SPLAT_X4(dc);
 
     for (i = 0; i < size; i++)
-        for (j = 0; j < size / 4; j++)
-            AV_WN4PA(&POS(j * 4, i), a);
+        for (j = 0; j < size; j+=4)
+            AV_WN4P(&POS(j, i), a);
 
     if (c_idx == 0 && size < 32) {
         POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
@@ -427,7 +438,7 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     };
 
     int angle = intra_pred_angle[mode - 2];
-    pixel ref_array[3 * MAX_TB_SIZE + 1];
+    pixel ref_array[3 * MAX_TB_SIZE + 4];
     pixel *ref_tmp = ref_array + size;
     const pixel *ref;
     int last = (size * angle) >> 5;
@@ -435,8 +446,8 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     if (mode >= 18) {
         ref = top - 1;
         if (angle < 0 && last < -1) {
-            for (x = 0; x <= size; x++)
-                ref_tmp[x] = top[x - 1];
+            for (x = 0; x <= size; x += 4)
+                AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1]));
             for (x = last; x <= -1; x++)
                 ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
             ref = ref_tmp;
@@ -446,13 +457,19 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
             int idx  = ((y + 1) * angle) >> 5;
             int fact = ((y + 1) * angle) & 31;
             if (fact) {
-                for (x = 0; x < size; x++) {
-                    POS(x, y) = ((32 - fact) * ref[x + idx + 1] +
-                                       fact  * ref[x + idx + 2] + 16) >> 5;
+                for (x = 0; x < size; x += 4) {
+                    POS(x    , y) = ((32 - fact) * ref[x + idx + 1] +
+                                           fact  * ref[x + idx + 2] + 16) >> 5;
+                    POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
+                                           fact  * ref[x + 1 + idx + 2] + 16) >> 5;
+                    POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
+                                           fact  * ref[x + 2 + idx + 2] + 16) >> 5;
+                    POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
+                                           fact  * ref[x + 3 + idx + 2] + 16) >> 5;
                 }
             } else {
-                for (x = 0; x < size; x++)
-                    POS(x, y) = ref[x + idx + 1];
+                for (x = 0; x < size; x += 4)
+                    AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
             }
         }
         if (mode == 26 && c_idx == 0 && size < 32) {
@@ -462,8 +479,8 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     } else {
         ref = left - 1;
         if (angle < 0 && last < -1) {
-            for (x = 0; x <= size; x++)
-                ref_tmp[x] = left[x - 1];
+            for (x = 0; x <= size; x += 4)
+                AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
             for (x = last; x <= -1; x++)
                 ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
             ref = ref_tmp;
@@ -483,8 +500,12 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
             }
         }
         if (mode == 10 && c_idx == 0 && size < 32) {
-            for (x = 0; x < size; x++)
-                POS(x, 0) = av_clip_pixel(left[0] + ((top[x] - top[-1]) >> 1));
+            for (x = 0; x < size; x += 4) {
+                POS(x,     0) = av_clip_pixel(left[0] + ((top[x    ] - top[-1]) >> 1));
+                POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
+                POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1));
+                POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1));
+            }
         }
     }
 }
diff --git a/libavcodec/hnm4video.c b/libavcodec/hnm4video.c
index 1dc6ed3..9e1ac49 100644
--- a/libavcodec/hnm4video.c
+++ b/libavcodec/hnm4video.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 David Kment
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ static void unpack_intraframe(AVCodecContext *avctx, uint8_t *src,
         if (getbit(&gb, &bitbuf, &bits)) {
             if (writeoffset >= hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
             hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
@@ -100,11 +100,11 @@ static void unpack_intraframe(AVCodecContext *avctx, uint8_t *src,
             count  += 2;
             offset += writeoffset;
             if (offset < 0 || offset + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
             } else if (writeoffset + count >= hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
             while (count--) {
@@ -147,7 +147,8 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
 {
     Hnm4VideoContext *hnm = avctx->priv_data;
     GetByteContext gb;
-    uint32_t writeoffset = 0, count, left, offset;
+    uint32_t writeoffset = 0;
+    int count, left, offset;
     uint8_t tag, previous, backline, backward, swap;
 
     bytestream2_init(&gb, src, size);
@@ -157,7 +158,12 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
         if (count == 0) {
             tag = bytestream2_get_byte(&gb) & 0xE0;
             tag = tag >> 5;
+
             if (tag == 0) {
+                if (writeoffset + 2 > hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
                 hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
             } else if (tag == 1) {
@@ -168,6 +174,10 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
                 writeoffset += count;
             } else if (tag == 3) {
                 count = bytestream2_get_byte(&gb) * 2;
+                if (writeoffset + count > hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 while (count > 0) {
                     hnm->current[writeoffset++] = bytestream2_peek_byte(&gb);
                     count--;
@@ -176,6 +186,10 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
             } else {
                 break;
             }
+            if (writeoffset > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                break;
+            }
         } else {
             previous = bytestream2_peek_byte(&gb) & 0x20;
             backline = bytestream2_peek_byte(&gb) & 0x40;
@@ -188,17 +202,28 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
 
             left = count;
 
-            if (!backward && offset + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+            if (!backward && offset + 2*count > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
-            } else if (backward && offset >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+            } else if (backward && offset + 1 >= hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
-            } else if (writeoffset + count >= hnm->width * hnm->height) {
+            } else if (writeoffset + 2*count > hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
+            if(backward) {
+                if (offset < (!!backline)*(2 * hnm->width - 1) + 2*(left-1)) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
+            } else {
+                if (offset < (!!backline)*(2 * hnm->width - 1)) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
+            }
 
             if (previous) {
                 while (left > 0) {
@@ -263,6 +288,10 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             if (tag == 0) {
                 writeoffset += bytestream2_get_byte(&gb);
             } else if (tag == 1) {
+                if (writeoffset + hnm->width >= hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 hnm->current[writeoffset]              = bytestream2_get_byte(&gb);
                 hnm->current[writeoffset + hnm->width] = bytestream2_get_byte(&gb);
                 writeoffset++;
@@ -271,6 +300,10 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             } else if (tag == 3) {
                 break;
             }
+            if (writeoffset > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                break;
+            }
         } else {
             delta    = bytestream2_peek_byte(&gb) & 0x80;
             previous = bytestream2_peek_byte(&gb) & 0x40;
@@ -279,14 +312,19 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             offset  = writeoffset;
             offset += bytestream2_get_le16(&gb);
 
-            if (delta)
+            if (delta) {
+                if (offset < 0x10000) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
                 offset -= 0x10000;
+            }
 
             if (offset + hnm->width + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
             } else if (writeoffset + hnm->width + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to write out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to write out of bounds\n");
                 break;
             }
 
@@ -337,6 +375,7 @@ static void hnm_update_palette(AVCodecContext *avctx, uint8_t *src,
             hnm->palette[writeoffset] = bytestream2_get_be24(&gb);
             if (!eight_bit_colors)
                 hnm->palette[writeoffset] <<= 2;
+            hnm->palette[writeoffset] |= (0xFFU << 24);
             count--;
             writeoffset++;
         }
@@ -360,17 +399,23 @@ static int hnm_decode_frame(AVCodecContext *avctx, void *data,
     int ret;
     uint16_t chunk_id;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (avpkt->size < 8) {
+        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+        return AVERROR_INVALIDDATA;
     }
 
     chunk_id = AV_RL16(avpkt->data + 4);
 
     if (chunk_id == HNM4_CHUNK_ID_PL) {
         hnm_update_palette(avctx, avpkt->data, avpkt->size);
-        frame->palette_has_changed = 1;
     } else if (chunk_id == HNM4_CHUNK_ID_IZ) {
+        if (avpkt->size < 12) {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+
         unpack_intraframe(avctx, avpkt->data + 12, avpkt->size - 12);
         memcpy(hnm->previous, hnm->current, hnm->width * hnm->height);
         if (hnm->version == 0x4a)
@@ -383,6 +428,9 @@ static int hnm_decode_frame(AVCodecContext *avctx, void *data,
         memcpy(frame->data[1], hnm->palette, 256 * 4);
         *got_frame = 1;
     } else if (chunk_id == HNM4_CHUNK_ID_IU) {
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+
         if (hnm->version == 0x4a) {
             decode_interframe_v4a(avctx, avpkt->data + 8, avpkt->size - 8);
             memcpy(hnm->processed, hnm->current, hnm->width * hnm->height);
@@ -427,7 +475,9 @@ static av_cold int hnm_decode_init(AVCodecContext *avctx)
     hnm->buffer2   = av_mallocz(avctx->width * avctx->height);
     hnm->processed = av_mallocz(avctx->width * avctx->height);
 
-    if (!hnm->buffer1 || !hnm->buffer2 || !hnm->processed) {
+    if (   !hnm->buffer1 || !hnm->buffer2 || !hnm->processed
+        || avctx->width * avctx->height == 0
+        || avctx->height % 2) {
         av_log(avctx, AV_LOG_ERROR, "av_mallocz() failed\n");
         av_freep(&hnm->buffer1);
         av_freep(&hnm->buffer2);
diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index 81d3892..fccfe76 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 25694c5..8e2fd8f 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -5,20 +5,20 @@
  *
  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -357,10 +357,14 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
 
     if (ARCH_AARCH64)
         ff_hpeldsp_init_aarch64(c, flags);
+    if (ARCH_ALPHA)
+        ff_hpeldsp_init_alpha(c, flags);
     if (ARCH_ARM)
         ff_hpeldsp_init_arm(c, flags);
     if (ARCH_PPC)
         ff_hpeldsp_init_ppc(c, flags);
     if (ARCH_X86)
         ff_hpeldsp_init_x86(c, flags);
+    if (ARCH_MIPS)
+        ff_hpeldsp_init_mips(c, flags);
 }
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index 62dee68..768139b 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -97,8 +97,10 @@ typedef struct HpelDSPContext {
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
 void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags);
 
 #endif /* AVCODEC_HPELDSP_H */
diff --git a/libavcodec/hq_hqa.c b/libavcodec/hq_hqa.c
index 2afe853..ec9da3e0 100644
--- a/libavcodec/hq_hqa.c
+++ b/libavcodec/hq_hqa.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,8 @@
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "canopus.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #include "hq_hqa.h"
@@ -59,7 +59,7 @@ static inline void put_blocks(HQContext *c, AVFrame *pic,
                          pic->linesize[plane] << ilace, block1);
 }
 
-static int hq_decode_block(HQContext *c, BitstreamContext *bc, int16_t block[64],
+static int hq_decode_block(HQContext *c, GetBitContext *gb, int16_t block[64],
                            int qsel, int is_chroma, int is_hqa)
 {
     const int32_t *q;
@@ -68,22 +68,22 @@ static int hq_decode_block(HQContext *c, BitstreamContext *bc, int16_t block[64]
     memset(block, 0, 64 * sizeof(*block));
 
     if (!is_hqa) {
-        block[0] = bitstream_read_signed(bc, 9) << 6;
-        q = ff_hq_quants[qsel][is_chroma][bitstream_read(bc, 2)];
+        block[0] = get_sbits(gb, 9) * 64;
+        q = ff_hq_quants[qsel][is_chroma][get_bits(gb, 2)];
     } else {
-        q = ff_hq_quants[qsel][is_chroma][bitstream_read(bc, 2)];
-        block[0] = bitstream_read_signed(bc, 9) << 6;
+        q = ff_hq_quants[qsel][is_chroma][get_bits(gb, 2)];
+        block[0] = get_sbits(gb, 9) * 64;
     }
 
     for (;;) {
-        val = bitstream_read_vlc(bc, c->hq_ac_vlc.table, 9, 2);
+        val = get_vlc2(gb, c->hq_ac_vlc.table, 9, 2);
         if (val < 0)
             return AVERROR_INVALIDDATA;
 
         pos += ff_hq_ac_skips[val];
         if (pos >= 64)
             break;
-        block[ff_zigzag_direct[pos]] = (ff_hq_ac_syms[val] * q[pos]) >> 12;
+        block[ff_zigzag_direct[pos]] = (int)(ff_hq_ac_syms[val] * (unsigned)q[pos]) >> 12;
         pos++;
     }
 
@@ -91,16 +91,16 @@ static int hq_decode_block(HQContext *c, BitstreamContext *bc, int16_t block[64]
 }
 
 static int hq_decode_mb(HQContext *c, AVFrame *pic,
-                        BitstreamContext *bc, int x, int y)
+                        GetBitContext *gb, int x, int y)
 {
     int qgroup, flag;
     int i, ret;
 
-    qgroup = bitstream_read(bc, 4);
-    flag   = bitstream_read_bit(bc);
+    qgroup = get_bits(gb, 4);
+    flag = get_bits1(gb);
 
     for (i = 0; i < 8; i++) {
-        ret = hq_decode_block(c, bc, c->block[i], qgroup, i >= 4, 0);
+        ret = hq_decode_block(c, gb, c->block[i], qgroup, i >= 4, 0);
         if (ret < 0)
             return ret;
     }
@@ -117,12 +117,12 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
                            int prof_num, size_t data_size)
 {
     const HQProfile *profile;
-    BitstreamContext bc;
+    GetBitContext gb;
     const uint8_t *perm, *src = ctx->gbc.buffer;
     uint32_t slice_off[21];
     int slice, start_off, next_off, i, ret;
 
-    if (prof_num >= NUM_HQ_PROFILES) {
+    if ((unsigned)prof_num >= NUM_HQ_PROFILES) {
         profile = &ff_hq_profile[0];
         avpriv_request_sample(ctx->avctx, "HQ Profile %d", prof_num);
     } else {
@@ -138,10 +138,8 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
     ctx->avctx->pix_fmt             = AV_PIX_FMT_YUV422P;
 
     ret = ff_get_buffer(ctx->avctx, pic, 0);
-    if (ret < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     /* Offsets are stored from CUV position, so adjust them accordingly. */
     for (i = 0; i < profile->num_slices + 1; i++)
@@ -157,14 +155,14 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
             slice_off[slice] >= slice_off[slice + 1] ||
             slice_off[slice + 1] > data_size) {
             av_log(ctx->avctx, AV_LOG_ERROR,
-                   "Invalid slice size %zu.\n", data_size);
+                   "Invalid slice size %"SIZE_SPECIFIER".\n", data_size);
             break;
         }
-        bitstream_init8(&bc, src + slice_off[slice],
-                        slice_off[slice + 1] - slice_off[slice]);
+        init_get_bits(&gb, src + slice_off[slice],
+                      (slice_off[slice + 1] - slice_off[slice]) * 8);
 
         for (i = 0; i < (next_off - start_off) * profile->tab_w; i++) {
-            ret = hq_decode_mb(ctx, pic, &bc, perm[0] * 16, perm[1] * 16);
+            ret = hq_decode_mb(ctx, pic, &gb, perm[0] * 16, perm[1] * 16);
             if (ret < 0) {
                 av_log(ctx->avctx, AV_LOG_ERROR,
                        "Error decoding macroblock %d at slice %d.\n", i, slice);
@@ -178,12 +176,15 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
 }
 
 static int hqa_decode_mb(HQContext *c, AVFrame *pic, int qgroup,
-                         BitstreamContext *bc, int x, int y)
+                         GetBitContext *gb, int x, int y)
 {
     int flag = 0;
     int i, ret, cbp;
 
-    cbp = bitstream_read_vlc(bc, c->hqa_cbp_vlc.table, 5, 1);
+    if (get_bits_left(gb) < 1)
+        return AVERROR_INVALIDDATA;
+
+    cbp = get_vlc2(gb, c->hqa_cbp_vlc.table, 5, 1);
 
     for (i = 0; i < 12; i++)
         memset(c->block[i], 0, sizeof(*c->block));
@@ -191,7 +192,7 @@ static int hqa_decode_mb(HQContext *c, AVFrame *pic, int qgroup,
         c->block[i][0] = -128 * (1 << 6);
 
     if (cbp) {
-        flag = bitstream_read_bit(bc);
+        flag = get_bits1(gb);
 
         cbp |= cbp << 4;
         if (cbp & 0x3)
@@ -201,7 +202,7 @@ static int hqa_decode_mb(HQContext *c, AVFrame *pic, int qgroup,
         for (i = 0; i < 12; i++) {
             if (!(cbp & (1 << i)))
                 continue;
-            ret = hq_decode_block(c, bc, c->block[i], qgroup, i >= 8, 1);
+            ret = hq_decode_block(c, gb, c->block[i], qgroup, i >= 8, 1);
             if (ret < 0)
                 return ret;
         }
@@ -217,7 +218,7 @@ static int hqa_decode_mb(HQContext *c, AVFrame *pic, int qgroup,
     return 0;
 }
 
-static int hqa_decode_slice(HQContext *ctx, AVFrame *pic, BitstreamContext *bc,
+static int hqa_decode_slice(HQContext *ctx, AVFrame *pic, GetBitContext *gb,
                             int quant, int slice_no, int w, int h)
 {
     int i, j, off;
@@ -226,7 +227,7 @@ static int hqa_decode_slice(HQContext *ctx, AVFrame *pic, BitstreamContext *bc,
     for (i = 0; i < h; i += 16) {
         off = (slice_no * 16 + i * 3) & 0x70;
         for (j = off; j < w; j += 128) {
-            ret = hqa_decode_mb(ctx, pic, quant, bc, j, i);
+            ret = hqa_decode_mb(ctx, pic, quant, gb, j, i);
             if (ret < 0) {
                 av_log(ctx->avctx, AV_LOG_ERROR,
                        "Error decoding macroblock at %dx%d.\n", i, j);
@@ -240,7 +241,7 @@ static int hqa_decode_slice(HQContext *ctx, AVFrame *pic, BitstreamContext *bc,
 
 static int hqa_decode_frame(HQContext *ctx, AVFrame *pic, size_t data_size)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     const int num_slices = 8;
     uint32_t slice_off[9];
     int i, slice, ret;
@@ -268,10 +269,8 @@ static int hqa_decode_frame(HQContext *ctx, AVFrame *pic, size_t data_size)
     }
 
     ret = ff_get_buffer(ctx->avctx, pic, 0);
-    if (ret < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     /* Offsets are stored from HQA1 position, so adjust them accordingly. */
     for (i = 0; i < num_slices + 1; i++)
@@ -282,13 +281,13 @@ static int hqa_decode_frame(HQContext *ctx, AVFrame *pic, size_t data_size)
             slice_off[slice] >= slice_off[slice + 1] ||
             slice_off[slice + 1] > data_size) {
             av_log(ctx->avctx, AV_LOG_ERROR,
-                   "Invalid slice size %zu.\n", data_size);
+                   "Invalid slice size %"SIZE_SPECIFIER".\n", data_size);
             break;
         }
-        bitstream_init8(&bc, src + slice_off[slice],
-                        slice_off[slice + 1] - slice_off[slice]);
+        init_get_bits(&gb, src + slice_off[slice],
+                      (slice_off[slice + 1] - slice_off[slice]) * 8);
 
-        ret = hqa_decode_slice(ctx, pic, &bc, quant, slice, width, height);
+        ret = hqa_decode_slice(ctx, pic, &gb, quant, slice, width, height);
         if (ret < 0)
             return ret;
     }
@@ -303,7 +302,8 @@ static int hq_hqa_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *pic = data;
     uint32_t info_tag;
     unsigned int data_size;
-    int tag, ret;
+    int ret;
+    unsigned tag;
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
     if (bytestream2_get_bytes_left(&ctx->gbc) < 4 + 4) {
diff --git a/libavcodec/hq_hqa.h b/libavcodec/hq_hqa.h
index 1fa9975..608e2ca 100644
--- a/libavcodec/hq_hqa.h
+++ b/libavcodec/hq_hqa.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadata.c b/libavcodec/hq_hqadata.c
index 23fefc1..ae9231a 100644
--- a/libavcodec/hq_hqadata.c
+++ b/libavcodec/hq_hqadata.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadsp.c b/libavcodec/hq_hqadsp.c
index 93fc067..1b9f138 100644
--- a/libavcodec/hq_hqadsp.c
+++ b/libavcodec/hq_hqadsp.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #define FIX_1_414 23170
 #define FIX_2_613 21407 // divided by two to fit the range
 
-#define IDCTMUL(a, b) ((a) * (b) >> 16)
+#define IDCTMUL(a, b) ((int)((a) * (unsigned)(b)) >> 16)
 
 static inline void idct_row(int16_t *blk)
 {
diff --git a/libavcodec/hq_hqadsp.h b/libavcodec/hq_hqadsp.h
index 22b1e61..420ed92 100644
--- a/libavcodec/hq_hqadsp.h
+++ b/libavcodec/hq_hqadsp.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqx.c b/libavcodec/hqx.c
index 2d1abf0..bc24ba9 100644
--- a/libavcodec/hqx.c
+++ b/libavcodec/hqx.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,10 @@
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "canopus.h"
+#include "get_bits.h"
 #include "internal.h"
+#include "thread.h"
 
 #include "hqx.h"
 #include "hqxdsp.h"
@@ -95,23 +96,23 @@ static inline void put_blocks(HQXContext *ctx, int plane,
                          lsize * fields, block1, quant);
 }
 
-static inline void hqx_get_ac(BitstreamContext *bc, const HQXAC *ac,
+static inline void hqx_get_ac(GetBitContext *gb, const HQXAC *ac,
                               int *run, int *lev)
 {
     int val;
 
-    val = bitstream_peek(bc, ac->lut_bits);
+    val = show_bits(gb, ac->lut_bits);
     if (ac->lut[val].bits == -1) {
-        BitstreamContext bc2 = *bc;
-        bitstream_skip(&bc2, ac->lut_bits);
-        val = ac->lut[val].lev + bitstream_peek(&bc2, ac->extra_bits);
+        GetBitContext gb2 = *gb;
+        skip_bits(&gb2, ac->lut_bits);
+        val = ac->lut[val].lev + show_bits(&gb2, ac->extra_bits);
     }
     *run = ac->lut[val].run;
     *lev = ac->lut[val].lev;
-    bitstream_skip(bc, ac->lut[val].bits);
+    skip_bits(gb, ac->lut[val].bits);
 }
 
-static int decode_block(BitstreamContext *bc, VLC *vlc,
+static int decode_block(GetBitContext *gb, VLC *vlc,
                         const int *quants, int dcb,
                         int16_t block[64], int *last_dc)
 {
@@ -120,14 +121,14 @@ static int decode_block(BitstreamContext *bc, VLC *vlc,
     int run, lev, pos = 1;
 
     memset(block, 0, 64 * sizeof(*block));
-    dc = bitstream_read_vlc(bc, vlc->table, HQX_DC_VLC_BITS, 2);
+    dc = get_vlc2(gb, vlc->table, HQX_DC_VLC_BITS, 2);
     if (dc < 0)
         return AVERROR_INVALIDDATA;
     *last_dc += dc;
 
     block[0] = sign_extend(*last_dc << (12 - dcb), 12);
 
-    q = quants[bitstream_read(bc, 2)];
+    q = quants[get_bits(gb, 2)];
     if (q >= 128)
         ac_idx = HQX_AC_Q128;
     else if (q >= 64)
@@ -142,7 +143,7 @@ static int decode_block(BitstreamContext *bc, VLC *vlc,
         ac_idx = HQX_AC_Q0;
 
     do {
-        hqx_get_ac(bc, &ff_hqx_ac[ac_idx], &run, &lev);
+        hqx_get_ac(gb, &ff_hqx_ac[ac_idx], &run, &lev);
         pos += run;
         if (pos >= 64)
             break;
@@ -155,24 +156,24 @@ static int decode_block(BitstreamContext *bc, VLC *vlc,
 static int hqx_decode_422(HQXContext *ctx, int slice_no, int x, int y)
 {
     HQXSlice *slice = &ctx->slice[slice_no];
-    BitstreamContext *bc = &slice->bc;
+    GetBitContext *gb = &slice->gb;
     const int *quants;
     int flag;
     int last_dc;
     int i, ret;
 
     if (ctx->interlaced)
-        flag = bitstream_read_bit(bc);
+        flag = get_bits1(gb);
     else
         flag = 0;
 
-    quants = hqx_quants[bitstream_read(bc, 4)];
+    quants = hqx_quants[get_bits(gb, 4)];
 
     for (i = 0; i < 8; i++) {
         int vlc_index = ctx->dcb - 9;
         if (i == 0 || i == 4 || i == 6)
             last_dc = 0;
-        ret = decode_block(bc, &ctx->dc_vlc[vlc_index], quants,
+        ret = decode_block(gb, &ctx->dc_vlc[vlc_index], quants,
                            ctx->dcb, slice->block[i], &last_dc);
         if (ret < 0)
             return ret;
@@ -189,14 +190,14 @@ static int hqx_decode_422(HQXContext *ctx, int slice_no, int x, int y)
 static int hqx_decode_422a(HQXContext *ctx, int slice_no, int x, int y)
 {
     HQXSlice *slice = &ctx->slice[slice_no];
-    BitstreamContext *bc = &slice->bc;
+    GetBitContext *gb = &slice->gb;
     const int *quants;
     int flag = 0;
     int last_dc;
     int i, ret;
     int cbp;
 
-    cbp = bitstream_read_vlc(bc, ctx->cbp_vlc.table, ctx->cbp_vlc.bits, 1);
+    cbp = get_vlc2(gb, ctx->cbp_vlc.table, ctx->cbp_vlc.bits, 1);
 
     for (i = 0; i < 12; i++)
         memset(slice->block[i], 0, sizeof(**slice->block) * 64);
@@ -204,9 +205,9 @@ static int hqx_decode_422a(HQXContext *ctx, int slice_no, int x, int y)
         slice->block[i][0] = -0x800;
     if (cbp) {
         if (ctx->interlaced)
-            flag = bitstream_read_bit(bc);
+            flag = get_bits1(gb);
 
-        quants = hqx_quants[bitstream_read(bc, 4)];
+        quants = hqx_quants[get_bits(gb, 4)];
 
         cbp |= cbp << 4; // alpha CBP
         if (cbp & 0x3)   // chroma CBP - top
@@ -218,7 +219,7 @@ static int hqx_decode_422a(HQXContext *ctx, int slice_no, int x, int y)
                 last_dc = 0;
             if (cbp & (1 << i)) {
                 int vlc_index = ctx->dcb - 9;
-                ret = decode_block(bc, &ctx->dc_vlc[vlc_index], quants,
+                ret = decode_block(gb, &ctx->dc_vlc[vlc_index], quants,
                                    ctx->dcb, slice->block[i], &last_dc);
                 if (ret < 0)
                     return ret;
@@ -239,24 +240,24 @@ static int hqx_decode_422a(HQXContext *ctx, int slice_no, int x, int y)
 static int hqx_decode_444(HQXContext *ctx, int slice_no, int x, int y)
 {
     HQXSlice *slice = &ctx->slice[slice_no];
-    BitstreamContext *bc = &slice->bc;
+    GetBitContext *gb = &slice->gb;
     const int *quants;
     int flag;
     int last_dc;
     int i, ret;
 
     if (ctx->interlaced)
-        flag = bitstream_read_bit(bc);
+        flag = get_bits1(gb);
     else
         flag = 0;
 
-    quants = hqx_quants[bitstream_read(bc, 4)];
+    quants = hqx_quants[get_bits(gb, 4)];
 
     for (i = 0; i < 12; i++) {
         int vlc_index = ctx->dcb - 9;
         if (i == 0 || i == 4 || i == 8)
             last_dc = 0;
-        ret = decode_block(bc, &ctx->dc_vlc[vlc_index], quants,
+        ret = decode_block(gb, &ctx->dc_vlc[vlc_index], quants,
                            ctx->dcb, slice->block[i], &last_dc);
         if (ret < 0)
             return ret;
@@ -275,14 +276,14 @@ static int hqx_decode_444(HQXContext *ctx, int slice_no, int x, int y)
 static int hqx_decode_444a(HQXContext *ctx, int slice_no, int x, int y)
 {
     HQXSlice *slice = &ctx->slice[slice_no];
-    BitstreamContext *bc = &slice->bc;
+    GetBitContext *gb = &slice->gb;
     const int *quants;
     int flag = 0;
     int last_dc;
     int i, ret;
     int cbp;
 
-    cbp = bitstream_read_vlc(bc, ctx->cbp_vlc.table, ctx->cbp_vlc.bits, 1);
+    cbp = get_vlc2(gb, ctx->cbp_vlc.table, ctx->cbp_vlc.bits, 1);
 
     for (i = 0; i < 16; i++)
         memset(slice->block[i], 0, sizeof(**slice->block) * 64);
@@ -290,9 +291,9 @@ static int hqx_decode_444a(HQXContext *ctx, int slice_no, int x, int y)
         slice->block[i][0] = -0x800;
     if (cbp) {
         if (ctx->interlaced)
-            flag = bitstream_read_bit(bc);
+            flag = get_bits1(gb);
 
-        quants = hqx_quants[bitstream_read(bc, 4)];
+        quants = hqx_quants[get_bits(gb, 4)];
 
         cbp |= cbp << 4; // alpha CBP
         cbp |= cbp << 8; // chroma CBP
@@ -301,7 +302,7 @@ static int hqx_decode_444a(HQXContext *ctx, int slice_no, int x, int y)
                 last_dc = 0;
             if (cbp & (1 << i)) {
                 int vlc_index = ctx->dcb - 9;
-                ret = decode_block(bc, &ctx->dc_vlc[vlc_index], quants,
+                ret = decode_block(gb, &ctx->dc_vlc[vlc_index], quants,
                                    ctx->dcb, slice->block[i], &last_dc);
                 if (ret < 0)
                     return ret;
@@ -392,9 +393,9 @@ static int decode_slice_thread(AVCodecContext *avctx, void *arg,
         return AVERROR_INVALIDDATA;
     }
 
-    ret = bitstream_init8(&ctx->slice[slice_no].bc,
-                          ctx->src + slice_off[slice_no],
-                          slice_off[slice_no + 1] - slice_off[slice_no]);
+    ret = init_get_bits8(&ctx->slice[slice_no].gb,
+                         ctx->src + slice_off[slice_no],
+                         slice_off[slice_no + 1] - slice_off[slice_no]);
     if (ret < 0)
         return ret;
 
@@ -405,6 +406,7 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_picture_ptr, AVPacket *avpkt)
 {
     HQXContext *ctx = avctx->priv_data;
+    ThreadFrame frame = { .f = data };
     uint8_t *src = avpkt->data;
     uint32_t info_tag;
     int data_start;
@@ -458,7 +460,7 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
     }
     ret = av_image_check_size(ctx->width, ctx->height, 0, avctx);
     if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid stored dimenstions %dx%d.\n",
+        av_log(avctx, AV_LOG_ERROR, "Invalid stored dimensions %dx%d.\n",
                ctx->width, ctx->height);
         return AVERROR_INVALIDDATA;
     }
@@ -491,11 +493,9 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    ret = ff_get_buffer(avctx, ctx->pic, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    ret = ff_thread_get_buffer(avctx, &frame, 0);
+    if (ret < 0)
         return ret;
-    }
 
     avctx->execute2(avctx, decode_slice_thread, NULL, NULL, 16);
 
@@ -512,6 +512,9 @@ static av_cold int hqx_decode_close(AVCodecContext *avctx)
     int i;
     HQXContext *ctx = avctx->priv_data;
 
+    if (avctx->internal->is_copy)
+        return 0;
+
     ff_free_vlc(&ctx->cbp_vlc);
     for (i = 0; i < 3; i++) {
         ff_free_vlc(&ctx->dc_vlc[i]);
@@ -538,7 +541,8 @@ AVCodec ff_hqx_decoder = {
     .init           = hqx_decode_init,
     .decode         = hqx_decode_frame,
     .close          = hqx_decode_close,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
+                      AV_CODEC_CAP_FRAME_THREADS,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
                       FF_CODEC_CAP_INIT_CLEANUP,
 };
diff --git a/libavcodec/hqx.h b/libavcodec/hqx.h
index f35721a..42d382d 100644
--- a/libavcodec/hqx.h
+++ b/libavcodec/hqx.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,10 +25,8 @@
 
 #include "libavutil/frame.h"
 #include "libavutil/mem.h"
-
-#include "bitstream.h"
+#include "get_bits.h"
 #include "hqxdsp.h"
-#include "vlc.h"
 
 enum HQXACMode {
     HQX_AC_Q0 = 0,
@@ -57,7 +55,7 @@ typedef int (*mb_decode_func)(struct HQXContext *ctx,
                               int slice_no, int x, int y);
 
 typedef struct HQXSlice {
-    BitstreamContext bc;
+    GetBitContext gb;
     DECLARE_ALIGNED(16, int16_t, block)[16][64];
 } HQXSlice;
 
diff --git a/libavcodec/hqxdsp.c b/libavcodec/hqxdsp.c
index 2a02299..7f8044e 100644
--- a/libavcodec/hqxdsp.c
+++ b/libavcodec/hqxdsp.c
@@ -1,20 +1,20 @@
 /*
  * HQX DSP routines
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,18 +39,18 @@ static inline void idct_col(int16_t *blk, const uint8_t *quant)
     s6 = (int) blk[6 * 8] * quant[6 * 8];
     s7 = (int) blk[7 * 8] * quant[7 * 8];
 
-    t0  =  (s3 * 19266 + s5 * 12873) >> 15;
-    t1  =  (s5 * 19266 - s3 * 12873) >> 15;
-    t2  = ((s7 * 4520  + s1 * 22725) >> 15) - t0;
-    t3  = ((s1 * 4520  - s7 * 22725) >> 15) - t1;
+    t0  =  (int)(s3 * 19266U + s5 * 12873U) >> 15;
+    t1  =  (int)(s5 * 19266U - s3 * 12873U) >> 15;
+    t2  = ((int)(s7 * 4520U  + s1 * 22725U) >> 15) - t0;
+    t3  = ((int)(s1 * 4520U  - s7 * 22725U) >> 15) - t1;
     t4  = t0 * 2 + t2;
     t5  = t1 * 2 + t3;
     t6  = t2 - t3;
     t7  = t3 * 2 + t6;
-    t8  = (t6 * 11585) >> 14;
-    t9  = (t7 * 11585) >> 14;
-    tA  = (s2 * 8867 - s6 * 21407) >> 14;
-    tB  = (s6 * 8867 + s2 * 21407) >> 14;
+    t8  = (int)(t6 * 11585U) >> 14;
+    t9  = (int)(t7 * 11585U) >> 14;
+    tA  = (int)(s2 * 8867U - s6 * 21407U) >> 14;
+    tB  = (int)(s6 * 8867U + s2 * 21407U) >> 14;
     tC  = (s0 >> 1) - (s4 >> 1);
     tD  = (s4 >> 1) * 2 + tC;
     tE  = tC - (tA >> 1);
@@ -118,7 +118,7 @@ static void hqx_idct_put(uint16_t *dst, ptrdiff_t stride,
 
     for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
-            int v = av_clip(block[j + i * 8] + 0x800, 0, 0xFFF);
+            int v = av_clip_uintp2(block[j + i * 8] + 0x800, 12);
             dst[j] = (v << 4) | (v >> 8);
         }
         dst += stride >> 1;
diff --git a/libavcodec/hqxdsp.h b/libavcodec/hqxdsp.h
index 2cd2a8e..39ab3e2 100644
--- a/libavcodec/hqxdsp.h
+++ b/libavcodec/hqxdsp.h
@@ -1,20 +1,20 @@
 /*
  * HQX DSP routines
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqxvlc.c b/libavcodec/hqxvlc.c
index d185e86..06a8073 100644
--- a/libavcodec/hqxvlc.c
+++ b/libavcodec/hqxvlc.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/htmlsubtitles.c b/libavcodec/htmlsubtitles.c
new file mode 100644
index 0000000..d9221ba
--- /dev/null
+++ b/libavcodec/htmlsubtitles.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (c) 2017  Clément Bœsch <u@pkh.me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/parseutils.h"
+#include "htmlsubtitles.h"
+#include <ctype.h>
+
+static int html_color_parse(void *log_ctx, const char *str)
+{
+    uint8_t rgba[4];
+    int nb_sharps = 0;
+    while (str[nb_sharps] == '#')
+        nb_sharps++;
+    str += FFMAX(0, nb_sharps - 1);
+    if (av_parse_color(rgba, str, strcspn(str, "\" >"), log_ctx) < 0)
+        return -1;
+    return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
+}
+
+static void rstrip_spaces_buf(AVBPrint *buf)
+{
+    if (av_bprint_is_complete(buf))
+        while (buf->len > 0 && buf->str[buf->len - 1] == ' ')
+            buf->str[--buf->len] = 0;
+}
+
+/*
+ * Fast code for scanning text enclosed in braces. Functionally
+ * equivalent to this sscanf call:
+ *
+ * sscanf(in, "{\\an%*1u}%n", &len) >= 0 && len > 0
+ */
+static int scanbraces(const char* in) {
+    if (strncmp(in, "{\\an", 4) != 0) {
+        return 0;
+    }
+    if (!isdigit(in[4])) {
+        return 0;
+    }
+    if (in[5] != '}') {
+        return 0;
+    }
+    return 1;
+}
+
+/* skip all {\xxx} substrings except for {\an%d}
+   and all microdvd like styles such as {Y:xxx} */
+static void handle_open_brace(AVBPrint *dst, const char **inp, int *an, int *closing_brace_missing)
+{
+    const char *in = *inp;
+
+    *an += scanbraces(in);
+
+    if (!*closing_brace_missing) {
+        if (   (*an != 1 && in[1] == '\\')
+            || (in[1] && strchr("CcFfoPSsYy", in[1]) && in[2] == ':')) {
+            char *bracep = strchr(in+2, '}');
+            if (bracep) {
+                *inp = bracep;
+                return;
+            } else
+                *closing_brace_missing = 1;
+        }
+    }
+
+    av_bprint_chars(dst, *in, 1);
+}
+
+struct font_tag {
+    char face[128];
+    int size;
+    uint32_t color;
+};
+
+/*
+ * Fast code for scanning the rest of a tag. Functionally equivalent to
+ * this sscanf call:
+ *
+ * sscanf(in, "%127[^<>]>%n", buffer, lenp) == 2
+ */
+static int scantag(const char* in, char* buffer, int* lenp) {
+    int len;
+
+    for (len = 0; len < 128; len++) {
+        const char c = *in++;
+        switch (c) {
+        case '\0':
+            return 0;
+        case '<':
+            return 0;
+        case '>':
+            buffer[len] = '\0';
+            *lenp = len+1;
+            return 1;
+        default:
+            break;
+        }
+        buffer[len] = c;
+    }
+    return 0;
+}
+
+/*
+ * The general politic of the convert is to mask unsupported tags or formatting
+ * errors (but still alert the user/subtitles writer with an error/warning)
+ * without dropping any actual text content for the final user.
+ */
+int ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
+{
+    char *param, buffer[128];
+    int len, tag_close, sptr = 0, line_start = 1, an = 0, end = 0;
+    int closing_brace_missing = 0;
+    int i, likely_a_tag;
+
+    /*
+     * state stack is only present for fonts since they are the only tags where
+     * the state is not binary. Here is a typical use case:
+     *
+     *   <font color="red" size=10>
+     *     red 10
+     *     <font size=50> RED AND BIG </font>
+     *     red 10 again
+     *   </font>
+     *
+     * On the other hand, using the state system for all the tags should be
+     * avoided because it breaks wrongly nested tags such as:
+     *
+     *   <b> foo <i> bar </b> bla </i>
+     *
+     * We don't want to break here; instead, we will treat all these tags as
+     * binary state markers. Basically, "<b>" will activate bold, and "</b>"
+     * will deactivate it, whatever the current state.
+     *
+     * This will also prevents cases where we have a random closing tag
+     * remaining after the opening one was dropped. Yes, this happens and we
+     * still don't want to print a "</b>" at the end of the dialog event.
+     */
+    struct font_tag stack[16];
+
+    memset(&stack[0], 0, sizeof(stack[0]));
+
+    for (; !end && *in; in++) {
+        switch (*in) {
+        case '\r':
+            break;
+        case '\n':
+            if (line_start) {
+                end = 1;
+                break;
+            }
+            rstrip_spaces_buf(dst);
+            av_bprintf(dst, "\\N");
+            line_start = 1;
+            break;
+        case ' ':
+            if (!line_start)
+                av_bprint_chars(dst, *in, 1);
+            break;
+        case '{':
+            handle_open_brace(dst, &in, &an, &closing_brace_missing);
+            break;
+        case '<':
+            /*
+             * "<<" are likely latin guillemets in ASCII or some kind of random
+             * style effect; see sub/badsyntax.srt in the FATE samples
+             * directory for real test cases.
+             */
+
+            likely_a_tag = 1;
+            for (i = 0; in[1] == '<'; i++) {
+                av_bprint_chars(dst, '<', 1);
+                likely_a_tag = 0;
+                in++;
+            }
+
+            tag_close = in[1] == '/';
+            if (tag_close)
+                likely_a_tag = 1;
+
+            av_assert0(in[0] == '<');
+
+            len = 0;
+
+            if (scantag(in+tag_close+1, buffer, &len) && len > 0) {
+                const int skip = len + tag_close;
+                const char *tagname = buffer;
+                while (*tagname == ' ') {
+                    likely_a_tag = 0;
+                    tagname++;
+                }
+                if ((param = strchr(tagname, ' ')))
+                    *param++ = 0;
+
+                /* Check if this is likely a tag */
+#define LIKELY_A_TAG_CHAR(x) (((x) >= '0' && (x) <= '9') || \
+                              ((x) >= 'a' && (x) <= 'z') || \
+                              ((x) >= 'A' && (x) <= 'Z') || \
+                               (x) == '_' || (x) == '/')
+                for (i = 0; tagname[i]; i++) {
+                    if (!LIKELY_A_TAG_CHAR(tagname[i])) {
+                        likely_a_tag = 0;
+                        break;
+                    }
+                }
+
+                if (!av_strcasecmp(tagname, "font")) {
+                    if (tag_close && sptr > 0) {
+                        struct font_tag *cur_tag  = &stack[sptr--];
+                        struct font_tag *last_tag = &stack[sptr];
+
+                        if (cur_tag->size) {
+                            if (!last_tag->size)
+                                av_bprintf(dst, "{\\fs}");
+                            else if (last_tag->size != cur_tag->size)
+                                av_bprintf(dst, "{\\fs%d}", last_tag->size);
+                        }
+
+                        if (cur_tag->color & 0xff000000) {
+                            if (!(last_tag->color & 0xff000000))
+                                av_bprintf(dst, "{\\c}");
+                            else if (last_tag->color != cur_tag->color)
+                                av_bprintf(dst, "{\\c&H%"PRIX32"&}", last_tag->color & 0xffffff);
+                        }
+
+                        if (cur_tag->face[0]) {
+                            if (!last_tag->face[0])
+                                av_bprintf(dst, "{\\fn}");
+                            else if (strcmp(last_tag->face, cur_tag->face))
+                                av_bprintf(dst, "{\\fn%s}", last_tag->face);
+                        }
+                    } else if (!tag_close && sptr < FF_ARRAY_ELEMS(stack) - 1) {
+                        struct font_tag *new_tag = &stack[sptr + 1];
+
+                        *new_tag = stack[sptr++];
+
+                        while (param) {
+                            if (!av_strncasecmp(param, "size=", 5)) {
+                                param += 5 + (param[5] == '"');
+                                if (sscanf(param, "%u", &new_tag->size) == 1)
+                                    av_bprintf(dst, "{\\fs%u}", new_tag->size);
+                            } else if (!av_strncasecmp(param, "color=", 6)) {
+                                int color;
+                                param += 6 + (param[6] == '"');
+                                color = html_color_parse(log_ctx, param);
+                                if (color >= 0) {
+                                    new_tag->color = 0xff000000 | color;
+                                    av_bprintf(dst, "{\\c&H%"PRIX32"&}", new_tag->color & 0xffffff);
+                                }
+                            } else if (!av_strncasecmp(param, "face=", 5)) {
+                                param += 5 + (param[5] == '"');
+                                len = strcspn(param,
+                                              param[-1] == '"' ? "\"" :" ");
+                                av_strlcpy(new_tag->face, param,
+                                           FFMIN(sizeof(new_tag->face), len+1));
+                                param += len;
+                                av_bprintf(dst, "{\\fn%s}", new_tag->face);
+                            }
+                            if ((param = strchr(param, ' ')))
+                                param++;
+                        }
+                    }
+                    in += skip;
+                } else if (tagname[0] && !tagname[1] && strchr("bisu", av_tolower(tagname[0]))) {
+                    av_bprintf(dst, "{\\%c%d}", (char)av_tolower(tagname[0]), !tag_close);
+                    in += skip;
+                } else if (!av_strncasecmp(tagname, "br", 2) &&
+                           (!tagname[2] || (tagname[2] == '/' && !tagname[3]))) {
+                    av_bprintf(dst, "\\N");
+                    in += skip;
+                } else if (likely_a_tag) {
+                    if (!tag_close) // warn only once
+                        av_log(log_ctx, AV_LOG_WARNING, "Unrecognized tag %s\n", tagname);
+                    in += skip;
+                } else {
+                    av_bprint_chars(dst, '<', 1);
+                }
+            } else {
+                av_bprint_chars(dst, *in, 1);
+            }
+            break;
+        default:
+            av_bprint_chars(dst, *in, 1);
+            break;
+        }
+        if (*in != ' ' && *in != '\r' && *in != '\n')
+            line_start = 0;
+    }
+
+    if (!av_bprint_is_complete(dst))
+        return AVERROR(ENOMEM);
+
+    while (dst->len >= 2 && !strncmp(&dst->str[dst->len - 2], "\\N", 2))
+        dst->len -= 2;
+    dst->str[dst->len] = 0;
+    rstrip_spaces_buf(dst);
+
+    return 0;
+}
diff --git a/libavcodec/htmlsubtitles.h b/libavcodec/htmlsubtitles.h
new file mode 100644
index 0000000..f3a8ef5
--- /dev/null
+++ b/libavcodec/htmlsubtitles.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HTMLSUBTITLES_H
+#define AVCODEC_HTMLSUBTITLES_H
+
+#include "libavutil/bprint.h"
+
+int ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in);
+
+#endif /* AVCODEC_HTMLSUBTITLES_H */
diff --git a/libavcodec/huffman.c b/libavcodec/huffman.c
index 3b15aa2..df1141b 100644
--- a/libavcodec/huffman.c
+++ b/libavcodec/huffman.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2006 Konstantin Shishkov
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,8 @@
 
 #include <stdint.h>
 
-#include"libavutil/common.h"
+#include "libavutil/qsort.h"
+#include "libavutil/common.h"
 
 #include "avcodec.h"
 #include "huffman.h"
@@ -54,18 +55,31 @@ static void heap_sift(HeapElem *h, int root, int size)
     }
 }
 
-void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats)
+int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int stats_size, int skip0)
 {
-    HeapElem h[256];
-    int up[2*256];
-    int len[2*256];
+    HeapElem *h  = av_malloc_array(sizeof(*h), stats_size);
+    int *up      = av_malloc_array(sizeof(*up) * 2, stats_size);
+    uint8_t *len = av_malloc_array(sizeof(*len) * 2, stats_size);
+    uint16_t *map= av_malloc_array(sizeof(*map), stats_size);
     int offset, i, next;
-    int size = 256;
+    int size = 0;
+    int ret = 0;
+
+    if (!h || !up || !len || !map) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    for (i = 0; i<stats_size; i++) {
+        dst[i] = 255;
+        if (stats[i] || !skip0)
+            map[size++] = i;
+    }
 
     for (offset = 1; ; offset <<= 1) {
         for (i=0; i < size; i++) {
             h[i].name = i;
-            h[i].val = (stats[i] << 8) + offset;
+            h[i].val = (stats[map[i]] << 14) + offset;
         }
         for (i = size / 2 - 1; i >= 0; i--)
             heap_sift(h, i, size);
@@ -86,11 +100,17 @@ void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats)
         for (i = 2 * size - 3; i >= size; i--)
             len[i] = len[up[i]] + 1;
         for (i = 0; i < size; i++) {
-            dst[i] = len[up[i]] + 1;
-            if (dst[i] >= 32) break;
+            dst[map[i]] = len[up[i]] + 1;
+            if (dst[map[i]] >= 32) break;
         }
         if (i==size) break;
     }
+end:
+    av_free(h);
+    av_free(up);
+    av_free(len);
+    av_free(map);
+    return ret;
 }
 
 static void get_tree_codes(uint32_t *bits, int16_t *lens, uint8_t *xlat,
@@ -153,22 +173,23 @@ int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bit
                "Tree construction is not possible\n");
         return -1;
     }
-    qsort(nodes, nb_codes, sizeof(Node), cmp);
+    AV_QSORT(nodes, nb_codes, Node, cmp);
     cur_node = nb_codes;
     nodes[nb_codes*2-1].count = 0;
     for (i = 0; i < nb_codes * 2 - 1; i += 2) {
-        nodes[cur_node].sym = HNODE;
-        nodes[cur_node].count = nodes[i].count + nodes[i + 1].count;
-        nodes[cur_node].n0 = i;
-        for (j = cur_node; j > 0; j--) {
-            if (nodes[j].count > nodes[j - 1].count ||
-                (nodes[j].count == nodes[j - 1].count &&
-                 (!(flags & FF_HUFFMAN_FLAG_HNODE_FIRST) ||
-                  nodes[j].n0 == j - 1 || nodes[j].n0 == j - 2 ||
-                  (nodes[j].sym!=HNODE && nodes[j-1].sym!=HNODE))))
+        uint32_t cur_count = nodes[i].count + nodes[i+1].count;
+        // find correct place to insert new node, and
+        // make space for the new node while at it
+        for(j = cur_node; j > i + 2; j--){
+            if(cur_count > nodes[j-1].count ||
+               (cur_count == nodes[j-1].count &&
+                !(flags & FF_HUFFMAN_FLAG_HNODE_FIRST)))
                 break;
-            FFSWAP(Node, nodes[j], nodes[j - 1]);
+            nodes[j] = nodes[j - 1];
         }
+        nodes[j].sym = HNODE;
+        nodes[j].count = cur_count;
+        nodes[j].n0 = i;
         cur_node++;
     }
     if (build_huff_tree(vlc, nodes, nb_codes * 2 - 2, flags, nb_bits) < 0) {
diff --git a/libavcodec/huffman.h b/libavcodec/huffman.h
index 87cbe4b..4f879e6 100644
--- a/libavcodec/huffman.h
+++ b/libavcodec/huffman.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,6 @@ typedef int (*HuffCmp)(const void *va, const void *vb);
 int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bits,
                        Node *nodes, HuffCmp cmp, int flags);
 
-void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats);
+int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int n, int skip0);
 
 #endif /* AVCODEC_HUFFMAN_H */
diff --git a/libavcodec/huffyuv.c b/libavcodec/huffyuv.c
index da5c52f..e582060 100644
--- a/libavcodec/huffyuv.c
+++ b/libavcodec/huffyuv.c
@@ -1,25 +1,25 @@
 /*
  * huffyuv codec for libavcodec
  *
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,13 +36,13 @@
 #include "bswapdsp.h"
 #include "huffyuv.h"
 
-int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table)
+int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table, int n)
 {
     int len, index;
     uint32_t bits = 0;
 
     for (len = 32; len > 0; len--) {
-        for (index = 0; index < 256; index++) {
+        for (index = 0; index < n; index++) {
             if (len_table[index] == len)
                 dst[index] = bits++;
         }
@@ -59,16 +59,11 @@ av_cold int ff_huffyuv_alloc_temp(HYuvContext *s)
 {
     int i;
 
-    if (s->bitstream_bpp<24) {
-        for (i=0; i<3; i++) {
-            s->temp[i]= av_malloc(s->width + 16);
-            if (!s->temp[i])
-                return AVERROR(ENOMEM);
-        }
-    } else {
-        s->temp[0]= av_mallocz(4*s->width + 16);
-        if (!s->temp[0])
+    for (i=0; i<3; i++) {
+        s->temp[i]= av_malloc(4*s->width + 16);
+        if (!s->temp[i])
             return AVERROR(ENOMEM);
+        s->temp16[i] = (uint16_t*)s->temp[i];
     }
     return 0;
 }
@@ -84,14 +79,16 @@ av_cold void ff_huffyuv_common_init(AVCodecContext *avctx)
 
     s->width = avctx->width;
     s->height = avctx->height;
-    assert(s->width>0 && s->height>0);
+
+    av_assert1(s->width > 0 && s->height > 0);
 }
 
-void ff_huffyuv_common_end(HYuvContext *s)
+av_cold void ff_huffyuv_common_end(HYuvContext *s)
 {
     int i;
 
     for(i = 0; i < 3; i++) {
         av_freep(&s->temp[i]);
+        s->temp16[i] = NULL;
     }
 }
diff --git a/libavcodec/huffyuv.h b/libavcodec/huffyuv.h
index a4a83b9..83309d4 100644
--- a/libavcodec/huffyuv.h
+++ b/libavcodec/huffyuv.h
@@ -1,23 +1,23 @@
 /*
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,20 +37,14 @@
 #include "huffyuvdsp.h"
 #include "huffyuvencdsp.h"
 #include "put_bits.h"
+#include "lossless_videodsp.h"
+#include "lossless_videoencdsp.h"
 
-#define VLC_BITS 11
+#define VLC_BITS 12
 
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
+#define MAX_BITS 16
+#define MAX_N (1<<MAX_BITS)
+#define MAX_VLC_N 16384
 
 typedef enum Predictor {
     LEFT = 0,
@@ -59,7 +53,7 @@ typedef enum Predictor {
 } Predictor;
 
 typedef struct HYuvContext {
-    const AVClass *class;
+    AVClass *class;
     AVCodecContext *avctx;
     Predictor predictor;
     GetBitContext gb;
@@ -70,27 +64,39 @@ typedef struct HYuvContext {
     int version;
     int yuy2;                               //use yuy2 instead of 422P
     int bgr32;                              //use bgr32 instead of bgr24
+    int bps;
+    int n;                                  // 1<<bps
+    int vlc_n;                              // number of vlc codes (FFMIN(1<<bps, MAX_VLC_N))
+    int alpha;
+    int chroma;
+    int yuv;
+    int chroma_h_shift;
+    int chroma_v_shift;
     int width, height;
     int flags;
     int context;
     int picture_number;
     int last_slice_end;
     uint8_t *temp[3];
-    uint64_t stats[3][256];
-    uint8_t len[3][256];
-    uint32_t bits[3][256];
+    uint16_t *temp16[3];                    ///< identical to temp but 16bit type
+    uint64_t stats[4][MAX_VLC_N];
+    uint8_t len[4][MAX_VLC_N];
+    uint32_t bits[4][MAX_VLC_N];
     uint32_t pix_bgr_map[1<<VLC_BITS];
-    VLC vlc[6];                             //Y,U,V,YY,YU,YV
+    VLC vlc[8];                             //Y,U,V,A,YY,YU,YV,AA
     uint8_t *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     BswapDSPContext bdsp;
     HuffYUVDSPContext hdsp;
     HuffYUVEncDSPContext hencdsp;
+    LLVidDSPContext llviddsp;
+    LLVidEncDSPContext llvidencdsp;
+    int non_determ; // non-deterministic, multi-threaded encoder allowed
 } HYuvContext;
 
 void ff_huffyuv_common_init(AVCodecContext *s);
 void ff_huffyuv_common_end(HYuvContext *s);
 int  ff_huffyuv_alloc_temp(HYuvContext *s);
-int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table);
+int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table, int n);
 
 #endif /* AVCODEC_HUFFYUV_H */
diff --git a/libavcodec/huffyuvdec.c b/libavcodec/huffyuvdec.c
index 12eca26..27f650d 100644
--- a/libavcodec/huffyuvdec.c
+++ b/libavcodec/huffyuvdec.c
@@ -1,26 +1,28 @@
 /*
  * huffyuv decoder
  *
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * yuva, gray, 4:4:4, 4:1:1, 4:1:0 and >8 bit per sample support sponsored by NOA
  */
 
 /**
@@ -28,17 +30,23 @@
  * huffyuv decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "avcodec.h"
 #include "get_bits.h"
 #include "huffyuv.h"
 #include "huffyuvdsp.h"
+#include "lossless_videodsp.h"
 #include "thread.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 
 #define classic_shift_luma_table_size 42
 static const unsigned char classic_shift_luma[classic_shift_luma_table_size + AV_INPUT_BUFFER_PADDING_SIZE] = {
     34, 36, 35, 69, 135, 232,   9, 16, 10, 24,  11,  23,  12,  16, 13, 10,
     14,  8, 15,  8,  16,   8,  17, 20, 16, 10, 207, 206, 205, 236, 11,  8,
-    10, 21,  9, 23,   8,   8, 199, 70, 69, 68,   0
+    10, 21,  9, 23,   8,   8, 199, 70, 69, 68,   0,
+  0,0,0,0,0,0,0,0,
 };
 
 #define classic_shift_chroma_table_size 59
@@ -46,7 +54,8 @@ static const unsigned char classic_shift_chroma[classic_shift_chroma_table_size
     66, 36,  37,  38, 39, 40,  41,  75,  76,  77, 110, 239, 144, 81, 82,  83,
     84, 85, 118, 183, 56, 57,  88,  89,  56,  89, 154,  57,  58, 57, 26, 141,
     57, 56,  58,  57, 58, 57, 184, 119, 214, 245, 116,  83,  82, 49, 80,  79,
-    78, 77,  44,  75, 41, 40,  39,  38,  37,  36,  34,  0
+    78, 77,  44,  75, 41, 40,  39,  38,  37,  36,  34,  0,
+  0,0,0,0,0,0,0,0,
 };
 
 static const unsigned char classic_add_luma[256] = {
@@ -87,16 +96,16 @@ static const unsigned char classic_add_chroma[256] = {
       6,  12,   8,  10,   7,   9,   6,   4,   6,   2,   2,   3,   3,   3,   3,   2,
 };
 
-static int read_len_table(uint8_t *dst, GetBitContext *gb)
+static int read_len_table(uint8_t *dst, GetBitContext *gb, int n)
 {
     int i, val, repeat;
 
-    for (i = 0; i < 256;) {
+    for (i = 0; i < n;) {
         repeat = get_bits(gb, 3);
         val    = get_bits(gb, 5);
         if (repeat == 0)
             repeat = get_bits(gb, 8);
-        if (i + repeat > 256 || get_bits_left(gb) < 0) {
+        if (i + repeat > n || get_bits_left(gb) < 0) {
             av_log(NULL, AV_LOG_ERROR, "Error reading huffman table\n");
             return AVERROR_INVALIDDATA;
         }
@@ -108,34 +117,43 @@ static int read_len_table(uint8_t *dst, GetBitContext *gb)
 
 static int generate_joint_tables(HYuvContext *s)
 {
-    uint16_t symbols[1 << VLC_BITS];
-    uint16_t bits[1 << VLC_BITS];
-    uint8_t len[1 << VLC_BITS];
     int ret;
+    uint16_t *symbols = av_mallocz(5 << VLC_BITS);
+    uint16_t *bits;
+    uint8_t *len;
+    if (!symbols)
+        return AVERROR(ENOMEM);
+    bits = symbols + (1 << VLC_BITS);
+    len = (uint8_t *)(bits + (1 << VLC_BITS));
 
-    if (s->bitstream_bpp < 24) {
+    if (s->bitstream_bpp < 24 || s->version > 2) {
         int p, i, y, u;
-        for (p = 0; p < 3; p++) {
-            for (i = y = 0; y < 256; y++) {
-                int len0  = s->len[0][y];
+        for (p = 0; p < 4; p++) {
+            int p0 = s->version > 2 ? p : 0;
+            for (i = y = 0; y < s->vlc_n; y++) {
+                int len0  = s->len[p0][y];
                 int limit = VLC_BITS - len0;
-                if (limit <= 0)
+                if (limit <= 0 || !len0)
                     continue;
-                for (u = 0; u < 256; u++) {
+                if ((sign_extend(y, 8) & (s->vlc_n-1)) != y)
+                    continue;
+                for (u = 0; u < s->vlc_n; u++) {
                     int len1 = s->len[p][u];
-                    if (len1 > limit)
+                    if (len1 > limit || !len1)
+                        continue;
+                    if ((sign_extend(u, 8) & (s->vlc_n-1)) != u)
                         continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i]     = len0 + len1;
-                    bits[i]    = (s->bits[0][y] << len1) + s->bits[p][u];
-                    symbols[i] = (y << 8) + u;
-                    if (symbols[i] != 0xffff) // reserved to mean "invalid"
+                    bits[i]    = (s->bits[p0][y] << len1) + s->bits[p][u];
+                    symbols[i] = (y << 8) + (u & 0xFF);
                         i++;
                 }
             }
-            ff_free_vlc(&s->vlc[3 + p]);
-            if ((ret = ff_init_vlc_sparse(&s->vlc[3 + p], VLC_BITS, i, len, 1, 1,
+            ff_free_vlc(&s->vlc[4 + p]);
+            if ((ret = ff_init_vlc_sparse(&s->vlc[4 + p], VLC_BITS, i, len, 1, 1,
                                           bits, 2, 2, symbols, 2, 2, 0)) < 0)
-                return ret;
+                goto out;
         }
     } else {
         uint8_t (*map)[4] = (uint8_t(*)[4]) s->pix_bgr_map;
@@ -148,18 +166,19 @@ static int generate_joint_tables(HYuvContext *s)
         for (i = 0, g = -16; g < 16; g++) {
             int len0   = s->len[p0][g & 255];
             int limit0 = VLC_BITS - len0;
-            if (limit0 < 2)
+            if (limit0 < 2 || !len0)
                 continue;
             for (b = -16; b < 16; b++) {
                 int len1   = s->len[p1][b & 255];
                 int limit1 = limit0 - len1;
-                if (limit1 < 1)
+                if (limit1 < 1 || !len1)
                     continue;
                 code = (s->bits[p0][g & 255] << len1) + s->bits[p1][b & 255];
                 for (r = -16; r < 16; r++) {
                     int len2 = s->len[2][r & 255];
-                    if (len2 > limit1)
+                    if (len2 > limit1 || !len2)
                         continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i]  = len0 + len1 + len2;
                     bits[i] = (code << len2) + s->bits[2][r & 255];
                     if (s->decorrelate) {
@@ -175,30 +194,37 @@ static int generate_joint_tables(HYuvContext *s)
                 }
             }
         }
-        ff_free_vlc(&s->vlc[3]);
-        if ((ret = init_vlc(&s->vlc[3], VLC_BITS, i, len, 1, 1,
+        ff_free_vlc(&s->vlc[4]);
+        if ((ret = init_vlc(&s->vlc[4], VLC_BITS, i, len, 1, 1,
                             bits, 2, 2, 0)) < 0)
-            return ret;
+            goto out;
     }
-    return 0;
+    ret = 0;
+out:
+    av_freep(&symbols);
+    return ret;
 }
 
 static int read_huffman_tables(HYuvContext *s, const uint8_t *src, int length)
 {
     GetBitContext gb;
     int i, ret;
+    int count = 3;
 
     if ((ret = init_get_bits(&gb, src, length * 8)) < 0)
         return ret;
 
-    for (i = 0; i < 3; i++) {
-        if ((ret = read_len_table(s->len[i], &gb)) < 0)
+    if (s->version > 2)
+        count = 1 + s->alpha + 2*s->chroma;
+
+    for (i = 0; i < count; i++) {
+        if ((ret = read_len_table(s->len[i], &gb, s->vlc_n)) < 0)
             return ret;
-        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i])) < 0)
+        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i], s->vlc_n)) < 0)
             return ret;
         ff_free_vlc(&s->vlc[i]);
-        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
-                            s->bits[i], 4, 4, 0)) < 0)
+        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, s->vlc_n, s->len[i], 1, 1,
+                           s->bits[i], 4, 4, 0)) < 0)
             return ret;
     }
 
@@ -213,16 +239,14 @@ static int read_old_huffman_tables(HYuvContext *s)
     GetBitContext gb;
     int i, ret;
 
-    if ((ret = init_get_bits(&gb, classic_shift_luma,
-                             classic_shift_luma_table_size * 8)) < 0)
-        return ret;
-    if ((ret = read_len_table(s->len[0], &gb)) < 0)
+    init_get_bits(&gb, classic_shift_luma,
+                  classic_shift_luma_table_size * 8);
+    if ((ret = read_len_table(s->len[0], &gb, 256)) < 0)
         return ret;
 
-    if ((ret = init_get_bits(&gb, classic_shift_chroma,
-                             classic_shift_chroma_table_size * 8)) < 0)
-        return ret;
-    if ((ret = read_len_table(s->len[1], &gb)) < 0)
+    init_get_bits(&gb, classic_shift_chroma,
+                  classic_shift_chroma_table_size * 8);
+    if ((ret = read_len_table(s->len[1], &gb, 256)) < 0)
         return ret;
 
     for (i = 0; i < 256; i++)
@@ -237,7 +261,7 @@ static int read_old_huffman_tables(HYuvContext *s)
     memcpy(s->bits[2], s->bits[1], 256 * sizeof(uint32_t));
     memcpy(s->len[2], s->len[1], 256 * sizeof(uint8_t));
 
-    for (i = 0; i < 3; i++) {
+    for (i = 0; i < 4; i++) {
         ff_free_vlc(&s->vlc[i]);
         if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
                             s->bits[i], 4, 4, 0)) < 0)
@@ -250,28 +274,52 @@ static int read_old_huffman_tables(HYuvContext *s)
     return 0;
 }
 
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    HYuvContext *s = avctx->priv_data;
+    int i;
+
+    ff_huffyuv_common_end(s);
+    av_freep(&s->bitstream_buffer);
+
+    for (i = 0; i < 8; i++)
+        ff_free_vlc(&s->vlc[i]);
+
+    return 0;
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int ret;
 
-    ff_huffyuv_common_init(avctx);
-    ff_huffyuvdsp_init(&s->hdsp);
-    memset(s->vlc, 0, 3 * sizeof(VLC));
+    ret = av_image_check_size(avctx->width, avctx->height, 0, avctx);
+    if (ret < 0)
+        return ret;
+
+    ff_huffyuvdsp_init(&s->hdsp, avctx->pix_fmt);
+    ff_llviddsp_init(&s->llviddsp);
+    memset(s->vlc, 0, 4 * sizeof(VLC));
 
-    s->interlaced = s->height > 288;
+    s->interlaced = avctx->height > 288;
     s->bgr32      = 1;
 
     if (avctx->extradata_size) {
         if ((avctx->bits_per_coded_sample & 7) &&
             avctx->bits_per_coded_sample != 12)
             s->version = 1; // do such files exist at all?
-        else
+        else if (avctx->extradata_size > 3 && avctx->extradata[3] == 0)
             s->version = 2;
+        else
+            s->version = 3;
     } else
         s->version = 0;
 
-    if (s->version == 2) {
+    s->bps = 8;
+    s->n = 1<<s->bps;
+    s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+    s->chroma = 1;
+    if (s->version >= 2) {
         int method, interlace;
 
         if (avctx->extradata_size < 4)
@@ -280,16 +328,27 @@ static av_cold int decode_init(AVCodecContext *avctx)
         method           = avctx->extradata[0];
         s->decorrelate   = method & 64 ? 1 : 0;
         s->predictor     = method & 63;
-        s->bitstream_bpp = avctx->extradata[1];
-        if (s->bitstream_bpp == 0)
-            s->bitstream_bpp = avctx->bits_per_coded_sample & ~7;
+        if (s->version == 2) {
+            s->bitstream_bpp = avctx->extradata[1];
+            if (s->bitstream_bpp == 0)
+                s->bitstream_bpp = avctx->bits_per_coded_sample & ~7;
+        } else {
+            s->bps = (avctx->extradata[1] >> 4) + 1;
+            s->n = 1<<s->bps;
+            s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+            s->chroma_h_shift = avctx->extradata[1] & 3;
+            s->chroma_v_shift = (avctx->extradata[1] >> 2) & 3;
+            s->yuv   = !!(avctx->extradata[2] & 1);
+            s->chroma= !!(avctx->extradata[2] & 3);
+            s->alpha = !!(avctx->extradata[2] & 4);
+        }
         interlace     = (avctx->extradata[2] & 0x30) >> 4;
         s->interlaced = (interlace == 1) ? 1 : (interlace == 2) ? 0 : s->interlaced;
         s->context    = avctx->extradata[2] & 0x40 ? 1 : 0;
 
         if ((ret = read_huffman_tables(s, avctx->extradata + 4,
                                        avctx->extradata_size - 4)) < 0)
-            return ret;
+            goto error;
     } else {
         switch (avctx->bits_per_coded_sample & 7) {
         case 1:
@@ -317,55 +376,220 @@ static av_cold int decode_init(AVCodecContext *avctx)
         s->context       = 0;
 
         if ((ret = read_old_huffman_tables(s)) < 0)
-            return ret;
+            goto error;
     }
 
-    switch (s->bitstream_bpp) {
-    case 12:
-        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-        break;
-    case 16:
-        if (s->yuy2)
-            avctx->pix_fmt = AV_PIX_FMT_YUYV422;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-        break;
-    case 24:
-    case 32:
-        if (s->bgr32)
+    if (s->version <= 2) {
+        switch (s->bitstream_bpp) {
+        case 12:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            s->yuv = 1;
+            break;
+        case 16:
+            if (s->yuy2)
+                avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            s->yuv = 1;
+            break;
+        case 24:
+            if (s->bgr32)
+                avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_BGR24;
+            break;
+        case 32:
+            av_assert0(s->bgr32);
             avctx->pix_fmt = AV_PIX_FMT_RGB32;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_BGR24;
-        break;
-    default:
-        return AVERROR_INVALIDDATA;
+            s->alpha = 1;
+            break;
+        default:
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
+        av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                         &s->chroma_h_shift,
+                                         &s->chroma_v_shift);
+    } else {
+        switch ( (s->chroma<<10) | (s->yuv<<9) | (s->alpha<<8) | ((s->bps-1)<<4) | s->chroma_h_shift | (s->chroma_v_shift<<2)) {
+        case 0x070:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            break;
+        case 0x0F0:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            break;
+        case 0x170:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8A;
+            break;
+        case 0x470:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            break;
+        case 0x480:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP9;
+            break;
+        case 0x490:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+            break;
+        case 0x4B0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+            break;
+        case 0x4D0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP14;
+            break;
+        case 0x4F0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP16;
+            break;
+        case 0x570:
+            avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            break;
+        case 0x670:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            break;
+        case 0x680:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P9;
+            break;
+        case 0x690:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+            break;
+        case 0x6B0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P12;
+            break;
+        case 0x6D0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P14;
+            break;
+        case 0x6F0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
+            break;
+        case 0x671:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            break;
+        case 0x681:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P9;
+            break;
+        case 0x691:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            break;
+        case 0x6B1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+            break;
+        case 0x6D1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P14;
+            break;
+        case 0x6F1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
+            break;
+        case 0x672:
+            avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+            break;
+        case 0x674:
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+            break;
+        case 0x675:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            break;
+        case 0x685:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
+            break;
+        case 0x695:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
+            break;
+        case 0x6B5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P12;
+            break;
+        case 0x6D5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P14;
+            break;
+        case 0x6F5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
+            break;
+        case 0x67A:
+            avctx->pix_fmt = AV_PIX_FMT_YUV410P;
+            break;
+        case 0x770:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+            break;
+        case 0x780:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P9;
+            break;
+        case 0x790:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+            break;
+        case 0x7F0:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P16;
+            break;
+        case 0x771:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+            break;
+        case 0x781:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P9;
+            break;
+        case 0x791:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+            break;
+        case 0x7F1:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P16;
+            break;
+        case 0x775:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+            break;
+        case 0x785:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P9;
+            break;
+        case 0x795:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P10;
+            break;
+        case 0x7F5:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P16;
+            break;
+        default:
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
     }
 
+    ff_huffyuv_common_init(avctx);
+
+    if ((avctx->pix_fmt == AV_PIX_FMT_YUV422P || avctx->pix_fmt == AV_PIX_FMT_YUV420P) && avctx->width & 1) {
+        av_log(avctx, AV_LOG_ERROR, "width must be even for this colorspace\n");
+        ret = AVERROR_INVALIDDATA;
+        goto error;
+    }
     if (s->predictor == MEDIAN && avctx->pix_fmt == AV_PIX_FMT_YUV422P &&
         avctx->width % 4) {
-        av_log(avctx, AV_LOG_ERROR, "width must be multiple of 4 "
+        av_log(avctx, AV_LOG_ERROR, "width must be a multiple of 4 "
                "for this combination of colorspace and predictor type.\n");
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto error;
     }
 
-    if ((ret = ff_huffyuv_alloc_temp(s)) < 0)
-        return ret;
+    if ((ret = ff_huffyuv_alloc_temp(s)) < 0) {
+        ff_huffyuv_common_end(s);
+        goto error;
+    }
 
     return 0;
+  error:
+    decode_end(avctx);
+    return ret;
 }
 
+#if HAVE_THREADS
 static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i, ret;
 
-    if ((ret = ff_huffyuv_alloc_temp(s)) < 0)
+    s->avctx = avctx;
+
+    if ((ret = ff_huffyuv_alloc_temp(s)) < 0) {
+        ff_huffyuv_common_end(s);
         return ret;
+    }
 
-    for (i = 0; i < 6; i++)
+    for (i = 0; i < 8; i++)
         s->vlc[i].table = NULL;
 
-    if (s->version == 2) {
+    if (s->version >= 2) {
         if ((ret = read_huffman_tables(s, avctx->extradata + 4,
                                        avctx->extradata_size)) < 0)
             return ret;
@@ -376,49 +600,174 @@ static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
+
+/** Subset of GET_VLC for use in hand-roller VLC code */
+#define VLC_INTERN(dst, table, gb, name, bits, max_depth)   \
+    code = table[index][0];                                 \
+    n    = table[index][1];                                 \
+    if (max_depth > 1 && n < 0) {                           \
+        LAST_SKIP_BITS(name, gb, bits);                     \
+        UPDATE_CACHE(name, gb);                             \
+                                                            \
+        nb_bits = -n;                                       \
+        index   = SHOW_UBITS(name, gb, nb_bits) + code;     \
+        code    = table[index][0];                          \
+        n       = table[index][1];                          \
+        if (max_depth > 2 && n < 0) {                       \
+            LAST_SKIP_BITS(name, gb, nb_bits);              \
+            UPDATE_CACHE(name, gb);                         \
+                                                            \
+            nb_bits = -n;                                   \
+            index   = SHOW_UBITS(name, gb, nb_bits) + code; \
+            code    = table[index][0];                      \
+            n       = table[index][1];                      \
+        }                                                   \
+    }                                                       \
+    dst = code;                                             \
+    LAST_SKIP_BITS(name, gb, n)
+
+
+#define GET_VLC_DUAL(dst0, dst1, name, gb, dtable, table1, table2,  \
+                     bits, max_depth, OP)                           \
+    do {                                                            \
+        unsigned int index = SHOW_UBITS(name, gb, bits);            \
+        int          code, n = dtable[index][1];                    \
+                                                                    \
+        if (n<=0) {                                                 \
+            int nb_bits;                                            \
+            VLC_INTERN(dst0, table1, gb, name, bits, max_depth);    \
+                                                                    \
+            UPDATE_CACHE(re, gb);                                   \
+            index = SHOW_UBITS(name, gb, bits);                     \
+            VLC_INTERN(dst1, table2, gb, name, bits, max_depth);    \
+        } else {                                                    \
+            code = dtable[index][0];                                \
+            OP(dst0, dst1, code);                                   \
+            LAST_SKIP_BITS(name, gb, n);                            \
+        }                                                           \
+    } while (0)
+
+#define OP8bits(dst0, dst1, code) dst0 = code>>8; dst1 = code
 
-/* TODO instead of restarting the read when the code isn't in the first level
- * of the joint table, jump into the 2nd level of the individual table. */
 #define READ_2PIX(dst0, dst1, plane1)                                   \
-    {                                                                   \
-        uint16_t code = get_vlc2(&s->gb, s->vlc[3 + plane1].table,      \
-                                 VLC_BITS, 1);                          \
-        if (code != 0xffff) {                                           \
-            dst0 = code >> 8;                                           \
-            dst1 = code;                                                \
-        } else {                                                        \
-            dst0 = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);      \
-            dst1 = get_vlc2(&s->gb, s->vlc[plane1].table, VLC_BITS, 3); \
-        }                                                               \
-    }
+    UPDATE_CACHE(re, &s->gb);                                           \
+    GET_VLC_DUAL(dst0, dst1, re, &s->gb, s->vlc[4+plane1].table,        \
+                 s->vlc[0].table, s->vlc[plane1].table, VLC_BITS, 3, OP8bits)
 
 static void decode_422_bitstream(HYuvContext *s, int count)
 {
-    int i;
-
+    int i, icount;
+    OPEN_READER(re, &s->gb);
     count /= 2;
 
-    if (count >= (get_bits_left(&s->gb)) / (31 * 4)) {
-        for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+    icount = get_bits_left(&s->gb) / (32 * 4);
+    if (count >= icount) {
+        for (i = 0; i < icount; i++) {
             READ_2PIX(s->temp[0][2 * i],     s->temp[1][i], 1);
             READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
         }
+        for (; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+            READ_2PIX(s->temp[0][2 * i    ], s->temp[1][i], 1);
+            if (BITS_LEFT(re, &s->gb) <= 0) break;
+            READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
+        }
+        for (; i < count; i++)
+            s->temp[0][2 * i    ] = s->temp[1][i] =
+            s->temp[0][2 * i + 1] = s->temp[2][i] = 0;
     } else {
         for (i = 0; i < count; i++) {
             READ_2PIX(s->temp[0][2 * i],     s->temp[1][i], 1);
             READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
         }
     }
+    CLOSE_READER(re, &s->gb);
+}
+
+#define READ_2PIX_PLANE(dst0, dst1, plane, OP) \
+    UPDATE_CACHE(re, &s->gb); \
+    GET_VLC_DUAL(dst0, dst1, re, &s->gb, s->vlc[4+plane].table, \
+                 s->vlc[plane].table, s->vlc[plane].table, VLC_BITS, 3, OP)
+
+#define OP14bits(dst0, dst1, code) dst0 = code>>8; dst1 = sign_extend(code, 8)
+
+/* TODO instead of restarting the read when the code isn't in the first level
+ * of the joint table, jump into the 2nd level of the individual table. */
+#define READ_2PIX_PLANE16(dst0, dst1, plane){\
+    dst0 = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;\
+    dst0 += get_bits(&s->gb, 2);\
+    dst1 = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;\
+    dst1 += get_bits(&s->gb, 2);\
+}
+static void decode_plane_bitstream(HYuvContext *s, int width, int plane)
+{
+    int i, count = width/2;
+
+    if (s->bps <= 8) {
+        OPEN_READER(re, &s->gb);
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+                READ_2PIX_PLANE(s->temp[0][2 * i], s->temp[0][2 * i + 1], plane, OP8bits);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE(s->temp[0][2 * i], s->temp[0][2 * i + 1], plane, OP8bits);
+            }
+        }
+        if( width&1 && BITS_LEFT(re, &s->gb)>0 ) {
+            unsigned int index;
+            int nb_bits, code, n;
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp[0][width-1], s->vlc[plane].table,
+                       &s->gb, re, VLC_BITS, 3);
+        }
+        CLOSE_READER(re, &s->gb);
+    } else if (s->bps <= 14) {
+        OPEN_READER(re, &s->gb);
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+                READ_2PIX_PLANE(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane, OP14bits);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane, OP14bits);
+            }
+        }
+        if( width&1 && BITS_LEFT(re, &s->gb)>0 ) {
+            unsigned int index;
+            int nb_bits, code, n;
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp16[0][width-1], s->vlc[plane].table,
+                       &s->gb, re, VLC_BITS, 3);
+        }
+        CLOSE_READER(re, &s->gb);
+    } else {
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+                READ_2PIX_PLANE16(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE16(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane);
+            }
+        }
+        if( width&1 && get_bits_left(&s->gb)>0 ) {
+            int dst = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;
+            s->temp16[0][width-1] = dst + get_bits(&s->gb, 2);
+        }
+    }
 }
 
 static void decode_gray_bitstream(HYuvContext *s, int count)
 {
     int i;
-
+    OPEN_READER(re, &s->gb);
     count /= 2;
 
-    if (count >= (get_bits_left(&s->gb)) / (31 * 2)) {
-        for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+    if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+        for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
             READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
         }
     } else {
@@ -426,30 +775,65 @@ static void decode_gray_bitstream(HYuvContext *s, int count)
             READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
         }
     }
+    CLOSE_READER(re, &s->gb);
 }
 
 static av_always_inline void decode_bgr_1(HYuvContext *s, int count,
                                           int decorrelate, int alpha)
 {
     int i;
-    for (i = 0; i < count; i++) {
-        int code = get_vlc2(&s->gb, s->vlc[3].table, VLC_BITS, 1);
-        if (code != -1) {
+    OPEN_READER(re, &s->gb);
+
+    for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+        unsigned int index;
+        int code, n, nb_bits;
+
+        UPDATE_CACHE(re, &s->gb);
+        index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+        n     = s->vlc[4].table[index][1];
+
+        if (n>0) {
+            code  = s->vlc[4].table[index][0];
             *(uint32_t *) &s->temp[0][4 * i] = s->pix_bgr_map[code];
-        } else if (decorrelate) {
-            s->temp[0][4 * i + G] = get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
-            s->temp[0][4 * i + B] = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) +
-                                    s->temp[0][4 * i + G];
-            s->temp[0][4 * i + R] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) +
-                                    s->temp[0][4 * i + G];
+            LAST_SKIP_BITS(re, &s->gb, n);
         } else {
-            s->temp[0][4 * i + B] = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
-            s->temp[0][4 * i + G] = get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
-            s->temp[0][4 * i + R] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+            if (decorrelate) {
+                VLC_INTERN(s->temp[0][4 * i + G], s->vlc[1].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(code, s->vlc[0].table, &s->gb, re, VLC_BITS, 3);
+                s->temp[0][4 * i + B] = code + s->temp[0][4 * i + G];
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(code, s->vlc[2].table, &s->gb, re, VLC_BITS, 3);
+                s->temp[0][4 * i + R] = code + s->temp[0][4 * i + G];
+            } else {
+                VLC_INTERN(s->temp[0][4 * i + B], s->vlc[0].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(s->temp[0][4 * i + G], s->vlc[1].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(s->temp[0][4 * i + R], s->vlc[2].table,
+                           &s->gb, re, VLC_BITS, 3);
+            }
         }
-        if (alpha)
-            s->temp[0][4 * i + A] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+        if (alpha) {
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp[0][4 * i + A], s->vlc[2].table,
+                       &s->gb, re, VLC_BITS, 3);
+        } else
+            s->temp[0][4 * i + A] = 0;
     }
+    CLOSE_READER(re, &s->gb);
 }
 
 static void decode_bgr_bitstream(HYuvContext *s, int count)
@@ -495,55 +879,115 @@ static void draw_slice(HYuvContext *s, AVFrame *frame, int y)
     s->last_slice_end = y + h;
 }
 
-static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                        AVPacket *avpkt)
+static int left_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src, int w, int acc)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-    HYuvContext *s = avctx->priv_data;
-    const int width  = s->width;
-    const int width2 = s->width >> 1;
-    const int height = s->height;
-    int fake_ystride, fake_ustride, fake_vstride;
-    ThreadFrame frame = { .f = data };
-    AVFrame *const p = data;
-    int table_size = 0, ret;
-
-    av_fast_malloc(&s->bitstream_buffer,
-                   &s->bitstream_buffer_size,
-                   buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!s->bitstream_buffer)
-        return AVERROR(ENOMEM);
-
-    memset(s->bitstream_buffer + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-    s->bdsp.bswap_buf((uint32_t *) s->bitstream_buffer,
-                      (const uint32_t *) buf, buf_size / 4);
+    if (s->bps <= 8) {
+        return s->llviddsp.add_left_pred(dst, src, w, acc);
+    } else {
+        return s->llviddsp.add_left_pred_int16((      uint16_t *)dst, (const uint16_t *)src, s->n-1, w, acc);
+    }
+}
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+static void add_bytes(HYuvContext *s, uint8_t *dst, uint8_t *src, int w)
+{
+    if (s->bps <= 8) {
+        s->llviddsp.add_bytes(dst, src, w);
+    } else {
+        s->hdsp.add_int16((uint16_t*)dst, (const uint16_t*)src, s->n - 1, w);
     }
+}
 
-    if (s->context) {
-        table_size = read_huffman_tables(s, s->bitstream_buffer, buf_size);
-        if (table_size < 0)
-            return table_size;
+static void add_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src, const uint8_t *diff, int w, int *left, int *left_top)
+{
+    if (s->bps <= 8) {
+        s->llviddsp.add_median_pred(dst, src, diff, w, left, left_top);
+    } else {
+        s->hdsp.add_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src, (const uint16_t *)diff, s->n-1, w, left, left_top);
     }
+}
 
-    if ((unsigned) (buf_size - table_size) >= INT_MAX / 8)
-        return AVERROR_INVALIDDATA;
+static int decode_slice(AVCodecContext *avctx, AVFrame *p, int height,
+                        int buf_size, int y_offset, int table_size)
+{
+    HYuvContext *s = avctx->priv_data;
+    int fake_ystride, fake_ustride, fake_vstride;
+    const int width  = s->width;
+    const int width2 = s->width >> 1;
+    int ret;
 
-    if ((ret = init_get_bits(&s->gb, s->bitstream_buffer + table_size,
-                             (buf_size - table_size) * 8)) < 0)
+    if ((ret = init_get_bits8(&s->gb, s->bitstream_buffer + table_size, buf_size - table_size)) < 0)
         return ret;
 
     fake_ystride = s->interlaced ? p->linesize[0] * 2 : p->linesize[0];
     fake_ustride = s->interlaced ? p->linesize[1] * 2 : p->linesize[1];
     fake_vstride = s->interlaced ? p->linesize[2] * 2 : p->linesize[2];
 
-    s->last_slice_end = 0;
+    if (s->version > 2) {
+        int plane;
+        for(plane = 0; plane < 1 + 2*s->chroma + s->alpha; plane++) {
+            int left, lefttop, y;
+            int w = width;
+            int h = height;
+            int fake_stride = fake_ystride;
+
+            if (s->chroma && (plane == 1 || plane == 2)) {
+                w >>= s->chroma_h_shift;
+                h >>= s->chroma_v_shift;
+                fake_stride = plane == 1 ? fake_ustride : fake_vstride;
+            }
+
+            switch (s->predictor) {
+            case LEFT:
+            case PLANE:
+                decode_plane_bitstream(s, w, plane);
+                left = left_prediction(s, p->data[plane], s->temp[0], w, 0);
+
+                for (y = 1; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane]*y;
+
+                    decode_plane_bitstream(s, w, plane);
+                    left = left_prediction(s, dst, s->temp[0], w, left);
+                    if (s->predictor == PLANE) {
+                        if (y > s->interlaced) {
+                            add_bytes(s, dst, dst - fake_stride, w);
+                        }
+                    }
+                }
+
+                break;
+            case MEDIAN:
+                decode_plane_bitstream(s, w, plane);
+                left= left_prediction(s, p->data[plane], s->temp[0], w, 0);
 
-    if (s->bitstream_bpp < 24) {
+                y = 1;
+
+                /* second line is left predicted for interlaced case */
+                if (s->interlaced) {
+                    decode_plane_bitstream(s, w, plane);
+                    left = left_prediction(s, p->data[plane] + p->linesize[plane], s->temp[0], w, left);
+                    y++;
+                }
+
+                lefttop = p->data[plane][0];
+                decode_plane_bitstream(s, w, plane);
+                add_median_prediction(s, p->data[plane] + fake_stride, p->data[plane], s->temp[0], w, &left, &lefttop);
+                y++;
+
+                for (; y<h; y++) {
+                    uint8_t *dst;
+
+                    decode_plane_bitstream(s, w, plane);
+
+                    dst = p->data[plane] + p->linesize[plane] * y;
+
+                    add_median_prediction(s, dst, dst - fake_stride, s->temp[0], w, &left, &lefttop);
+                }
+
+                break;
+            }
+        }
+        draw_slice(s, p, height);
+    } else if (s->bitstream_bpp < 24) {
         int y, cy;
         int lefty, leftu, leftv;
         int lefttopy, lefttopu, lefttopv;
@@ -554,66 +998,67 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             p->data[0][1] = get_bits(&s->gb, 8);
             p->data[0][0] = get_bits(&s->gb, 8);
 
-            avpriv_report_missing_feature(avctx, "YUY2 output");
+            av_log(avctx, AV_LOG_ERROR,
+                   "YUY2 output is not implemented yet\n");
             return AVERROR_PATCHWELCOME;
         } else {
             leftv         =
-            p->data[2][0] = get_bits(&s->gb, 8);
+            p->data[2][0 + y_offset * p->linesize[2]] = get_bits(&s->gb, 8);
             lefty         =
-            p->data[0][1] = get_bits(&s->gb, 8);
+            p->data[0][1 + y_offset * p->linesize[0]] = get_bits(&s->gb, 8);
             leftu         =
-            p->data[1][0] = get_bits(&s->gb, 8);
-            p->data[0][0] = get_bits(&s->gb, 8);
+            p->data[1][0 + y_offset * p->linesize[1]] = get_bits(&s->gb, 8);
+            p->data[0][0 + y_offset * p->linesize[0]] = get_bits(&s->gb, 8);
 
             switch (s->predictor) {
             case LEFT:
             case PLANE:
                 decode_422_bitstream(s, width - 2);
-                lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + 2, s->temp[0],
+                lefty = s->llviddsp.add_left_pred(p->data[0] + p->linesize[0] * y_offset + 2, s->temp[0],
                                                    width - 2, lefty);
                 if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                    leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
-                    leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
+                    leftu = s->llviddsp.add_left_pred(p->data[1] + p->linesize[1] * y_offset + 1, s->temp[1], width2 - 1, leftu);
+                    leftv = s->llviddsp.add_left_pred(p->data[2] + p->linesize[2] * y_offset + 1, s->temp[2], width2 - 1, leftv);
                 }
 
-                for (cy = y = 1; y < s->height; y++, cy++) {
+                for (cy = y = 1; y < height; y++, cy++) {
                     uint8_t *ydst, *udst, *vdst;
 
                     if (s->bitstream_bpp == 12) {
                         decode_gray_bitstream(s, width);
 
-                        ydst = p->data[0] + p->linesize[0] * y;
+                        ydst = p->data[0] + p->linesize[0] * (y + y_offset);
 
-                        lefty = s->hdsp.add_hfyu_left_pred(ydst, s->temp[0],
+                        lefty = s->llviddsp.add_left_pred(ydst, s->temp[0],
                                                            width, lefty);
                         if (s->predictor == PLANE) {
                             if (y > s->interlaced)
-                                s->hdsp.add_bytes(ydst, ydst - fake_ystride, width);
+                                s->llviddsp.add_bytes(ydst, ydst - fake_ystride, width);
                         }
                         y++;
-                        if (y >= s->height)
+                        if (y >= height)
                             break;
                     }
 
                     draw_slice(s, p, y);
 
-                    ydst = p->data[0] + p->linesize[0] * y;
-                    udst = p->data[1] + p->linesize[1] * cy;
-                    vdst = p->data[2] + p->linesize[2] * cy;
+                    ydst = p->data[0] + p->linesize[0] * (y  + y_offset);
+                    udst = p->data[1] + p->linesize[1] * (cy + y_offset);
+                    vdst = p->data[2] + p->linesize[2] * (cy + y_offset);
 
                     decode_422_bitstream(s, width);
-                    lefty = s->hdsp.add_hfyu_left_pred(ydst, s->temp[0],
+                    lefty = s->llviddsp.add_left_pred(ydst, s->temp[0],
                                                        width, lefty);
                     if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                        leftu = s->hdsp.add_hfyu_left_pred(udst, s->temp[1], width2, leftu);
-                        leftv = s->hdsp.add_hfyu_left_pred(vdst, s->temp[2], width2, leftv);
+                        leftu = s->llviddsp.add_left_pred(udst, s->temp[1], width2, leftu);
+                        leftv = s->llviddsp.add_left_pred(vdst, s->temp[2], width2, leftv);
                     }
                     if (s->predictor == PLANE) {
                         if (cy > s->interlaced) {
-                            s->hdsp.add_bytes(ydst, ydst - fake_ystride, width);
+                            s->llviddsp.add_bytes(ydst, ydst - fake_ystride, width);
                             if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                                s->hdsp.add_bytes(udst, udst - fake_ustride, width2);
-                                s->hdsp.add_bytes(vdst, vdst - fake_vstride, width2);
+                                s->llviddsp.add_bytes(udst, udst - fake_ustride, width2);
+                                s->llviddsp.add_bytes(vdst, vdst - fake_vstride, width2);
                             }
                         }
                     }
@@ -624,11 +1069,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             case MEDIAN:
                 /* first line except first 2 pixels is left predicted */
                 decode_422_bitstream(s, width - 2);
-                lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + 2, s->temp[0],
+                lefty = s->llviddsp.add_left_pred(p->data[0] + 2, s->temp[0],
                                                    width - 2, lefty);
                 if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                    leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
-                    leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
+                    leftu = s->llviddsp.add_left_pred(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
+                    leftv = s->llviddsp.add_left_pred(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
                 }
 
                 cy = y = 1;
@@ -636,11 +1081,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 /* second line is left predicted for interlaced case */
                 if (s->interlaced) {
                     decode_422_bitstream(s, width);
-                    lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + p->linesize[0],
+                    lefty = s->llviddsp.add_left_pred(p->data[0] + p->linesize[0],
                                                        s->temp[0], width, lefty);
                     if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                        leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + p->linesize[2], s->temp[1], width2, leftu);
-                        leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + p->linesize[1], s->temp[2], width2, leftv);
+                        leftu = s->llviddsp.add_left_pred(p->data[1] + p->linesize[2], s->temp[1], width2, leftu);
+                        leftv = s->llviddsp.add_left_pred(p->data[2] + p->linesize[1], s->temp[2], width2, leftv);
                     }
                     y++;
                     cy++;
@@ -648,24 +1093,24 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
                 /* next 4 pixels are left predicted too */
                 decode_422_bitstream(s, 4);
-                lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + fake_ystride,
+                lefty = s->llviddsp.add_left_pred(p->data[0] + fake_ystride,
                                                    s->temp[0], 4, lefty);
                 if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                    leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + fake_ustride, s->temp[1], 2, leftu);
-                    leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + fake_vstride, s->temp[2], 2, leftv);
+                    leftu = s->llviddsp.add_left_pred(p->data[1] + fake_ustride, s->temp[1], 2, leftu);
+                    leftv = s->llviddsp.add_left_pred(p->data[2] + fake_vstride, s->temp[2], 2, leftv);
                 }
 
                 /* next line except the first 4 pixels is median predicted */
                 lefttopy = p->data[0][3];
                 decode_422_bitstream(s, width - 4);
-                s->hdsp.add_hfyu_median_pred(p->data[0] + fake_ystride + 4,
+                s->llviddsp.add_median_pred(p->data[0] + fake_ystride + 4,
                                              p->data[0] + 4, s->temp[0],
                                              width - 4, &lefty, &lefttopy);
                 if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
                     lefttopu = p->data[1][1];
                     lefttopv = p->data[2][1];
-                    s->hdsp.add_hfyu_median_pred(p->data[1] + fake_ustride + 2, p->data[1] + 2, s->temp[1], width2 - 2, &leftu, &lefttopu);
-                    s->hdsp.add_hfyu_median_pred(p->data[2] + fake_vstride + 2, p->data[2] + 2, s->temp[2], width2 - 2, &leftv, &lefttopv);
+                    s->llviddsp.add_median_pred(p->data[1] + fake_ustride + 2, p->data[1] + 2, s->temp[1], width2 - 2, &leftu, &lefttopu);
+                    s->llviddsp.add_median_pred(p->data[2] + fake_vstride + 2, p->data[2] + 2, s->temp[2], width2 - 2, &leftv, &lefttopv);
                 }
                 y++;
                 cy++;
@@ -677,7 +1122,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         while (2 * cy > y) {
                             decode_gray_bitstream(s, width);
                             ydst = p->data[0] + p->linesize[0] * y;
-                            s->hdsp.add_hfyu_median_pred(ydst, ydst - fake_ystride,
+                            s->llviddsp.add_median_pred(ydst, ydst - fake_ystride,
                                                          s->temp[0], width,
                                                          &lefty, &lefttopy);
                             y++;
@@ -693,12 +1138,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     udst = p->data[1] + p->linesize[1] * cy;
                     vdst = p->data[2] + p->linesize[2] * cy;
 
-                    s->hdsp.add_hfyu_median_pred(ydst, ydst - fake_ystride,
+                    s->llviddsp.add_median_pred(ydst, ydst - fake_ystride,
                                                  s->temp[0], width,
                                                  &lefty, &lefttopy);
                     if (!(s->flags & AV_CODEC_FLAG_GRAY)) {
-                        s->hdsp.add_hfyu_median_pred(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
-                        s->hdsp.add_hfyu_median_pred(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
+                        s->llviddsp.add_median_pred(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
+                        s->llviddsp.add_median_pred(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
                     }
                 }
 
@@ -708,19 +1153,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     } else {
         int y;
-        int leftr, leftg, leftb, lefta;
-        const int last_line = (height - 1) * p->linesize[0];
+        uint8_t left[4];
+        const int last_line = (y_offset + height - 1) * p->linesize[0];
 
         if (s->bitstream_bpp == 32) {
-            lefta = p->data[0][last_line + A] = get_bits(&s->gb, 8);
-            leftr = p->data[0][last_line + R] = get_bits(&s->gb, 8);
-            leftg = p->data[0][last_line + G] = get_bits(&s->gb, 8);
-            leftb = p->data[0][last_line + B] = get_bits(&s->gb, 8);
+            left[A] = p->data[0][last_line + A] = get_bits(&s->gb, 8);
+            left[R] = p->data[0][last_line + R] = get_bits(&s->gb, 8);
+            left[G] = p->data[0][last_line + G] = get_bits(&s->gb, 8);
+            left[B] = p->data[0][last_line + B] = get_bits(&s->gb, 8);
         } else {
-            leftr = p->data[0][last_line + R] = get_bits(&s->gb, 8);
-            leftg = p->data[0][last_line + G] = get_bits(&s->gb, 8);
-            leftb = p->data[0][last_line + B] = get_bits(&s->gb, 8);
-            lefta = p->data[0][last_line + A] = 255;
+            left[R] = p->data[0][last_line + R] = get_bits(&s->gb, 8);
+            left[G] = p->data[0][last_line + G] = get_bits(&s->gb, 8);
+            left[B] = p->data[0][last_line + B] = get_bits(&s->gb, 8);
+            left[A] = p->data[0][last_line + A] = 255;
             skip_bits(&s->gb, 8);
         }
 
@@ -730,23 +1175,20 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             case PLANE:
                 decode_bgr_bitstream(s, width - 1);
                 s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + last_line + 4,
-                                                 s->temp[0], width - 1, &leftr,
-                                                 &leftg, &leftb, &lefta);
+                                                 s->temp[0], width - 1, left);
 
-                for (y = s->height - 2; y >= 0; y--) { // Yes it is stored upside down.
+                for (y = height - 2; y >= 0; y--) { // Yes it is stored upside down.
                     decode_bgr_bitstream(s, width);
 
-                    s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + p->linesize[0] * y,
-                                                     s->temp[0], width, &leftr,
-                                                     &leftg, &leftb, &lefta);
+                    s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + p->linesize[0] * (y + y_offset),
+                                                     s->temp[0], width, left);
                     if (s->predictor == PLANE) {
                         if (s->bitstream_bpp != 32)
-                            lefta = 0;
-                        if ((y & s->interlaced) == 0 &&
-                            y < s->height - 1 - s->interlaced) {
-                            s->hdsp.add_bytes(p->data[0] + p->linesize[0] * y,
-                                              p->data[0] + p->linesize[0] * y +
-                                              fake_ystride, fake_ystride);
+                            left[A] = 0;
+                        if (y < height - 1 - s->interlaced) {
+                            s->llviddsp.add_bytes(p->data[0] + p->linesize[0] * (y + y_offset),
+                                              p->data[0] + p->linesize[0] * (y + y_offset) +
+                                              fake_ystride, 4 * width);
                         }
                     }
                 }
@@ -758,29 +1200,97 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        "prediction type not supported!\n");
             }
         } else {
-            avpriv_report_missing_feature(avctx, "BGR24 output");
+            av_log(avctx, AV_LOG_ERROR,
+                   "BGR24 output is not implemented yet\n");
             return AVERROR_PATCHWELCOME;
         }
     }
-    emms_c();
-
-    *got_frame = 1;
 
-    return (get_bits_count(&s->gb) + 31) / 32 * 4 + table_size;
+    return 0;
 }
 
-static av_cold int decode_end(AVCodecContext *avctx)
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
     HYuvContext *s = avctx->priv_data;
-    int i;
+    const int width  = s->width;
+    const int height = s->height;
+    ThreadFrame frame = { .f = data };
+    AVFrame *const p = data;
+    int slice, table_size = 0, ret, nb_slices;
+    unsigned slices_info_offset;
+    int slice_height;
 
-    ff_huffyuv_common_end(s);
-    av_freep(&s->bitstream_buffer);
+    if (buf_size < (width * height + 7)/8)
+        return AVERROR_INVALIDDATA;
 
-    for (i = 0; i < 6; i++)
-        ff_free_vlc(&s->vlc[i]);
+    av_fast_padded_malloc(&s->bitstream_buffer,
+                   &s->bitstream_buffer_size,
+                   buf_size);
+    if (!s->bitstream_buffer)
+        return AVERROR(ENOMEM);
 
-    return 0;
+    s->bdsp.bswap_buf((uint32_t *) s->bitstream_buffer,
+                      (const uint32_t *) buf, buf_size / 4);
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    if (s->context) {
+        table_size = read_huffman_tables(s, s->bitstream_buffer, buf_size);
+        if (table_size < 0)
+            return table_size;
+    }
+
+    if ((unsigned) (buf_size - table_size) >= INT_MAX / 8)
+        return AVERROR_INVALIDDATA;
+
+    s->last_slice_end = 0;
+
+    if (avctx->codec_id == AV_CODEC_ID_HYMT &&
+        (buf_size > 32 && AV_RL32(avpkt->data + buf_size - 16) == 0)) {
+        slices_info_offset = AV_RL32(avpkt->data + buf_size - 4);
+        slice_height = AV_RL32(avpkt->data + buf_size - 8);
+        nb_slices = AV_RL32(avpkt->data + buf_size - 12);
+        if (nb_slices * 8LL + slices_info_offset > buf_size - 16 ||
+            slice_height <= 0 || nb_slices * (uint64_t)slice_height > height)
+            return AVERROR_INVALIDDATA;
+    } else {
+        slice_height = height;
+        nb_slices = 1;
+    }
+
+    for (slice = 0; slice < nb_slices; slice++) {
+        int y_offset, slice_offset, slice_size;
+
+        if (nb_slices > 1) {
+            slice_offset = AV_RL32(avpkt->data + slices_info_offset + slice * 8);
+            slice_size = AV_RL32(avpkt->data + slices_info_offset + slice * 8 + 4);
+
+            if (slice_offset < 0 || slice_size <= 0 || (slice_offset&3) ||
+                slice_offset + (int64_t)slice_size > buf_size)
+                return AVERROR_INVALIDDATA;
+
+            y_offset = height - (slice + 1) * slice_height;
+            s->bdsp.bswap_buf((uint32_t *)s->bitstream_buffer,
+                              (const uint32_t *)(buf + slice_offset), slice_size / 4);
+        } else {
+            y_offset = 0;
+            slice_offset = 0;
+            slice_size = buf_size;
+        }
+
+        ret = decode_slice(avctx, p, slice_height, slice_size, y_offset, table_size);
+        emms_c();
+        if (ret < 0)
+            return ret;
+    }
+
+    *got_frame = 1;
+
+    return (get_bits_count(&s->gb) + 31) / 32 * 4 + table_size;
 }
 
 AVCodec ff_huffyuv_decoder = {
@@ -812,3 +1322,19 @@ AVCodec ff_ffvhuff_decoder = {
     .init_thread_copy = ONLY_IF_THREADS_ENABLED(decode_init_thread_copy),
 };
 #endif /* CONFIG_FFVHUFF_DECODER */
+
+#if CONFIG_HYMT_DECODER
+AVCodec ff_hymt_decoder = {
+    .name             = "hymt",
+    .long_name        = NULL_IF_CONFIG_SMALL("HuffYUV MT"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_HYMT,
+    .priv_data_size   = sizeof(HYuvContext),
+    .init             = decode_init,
+    .close            = decode_end,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DRAW_HORIZ_BAND |
+                        AV_CODEC_CAP_FRAME_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(decode_init_thread_copy),
+};
+#endif /* CONFIG_HYMT_DECODER */
diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c
index ff69b45..7b865fe 100644
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,36 +23,32 @@
 #include "mathops.h"
 #include "huffyuvdsp.h"
 
-// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
-#define pb_7f (~0UL / 255 * 0x7f)
-#define pb_80 (~0UL / 255 * 0x80)
+// 0x00010001 or 0x0001000100010001 or whatever, depending on the cpu's native arithmetic size
+#define pw_1 (ULONG_MAX / UINT16_MAX)
 
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
-{
+static void add_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w){
     long i;
-
-    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
-        long a = *(long *) (src + i);
-        long b = *(long *) (dst + i);
-        *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
+    unsigned long pw_lsb = (mask >> 1) * pw_1;
+    unsigned long pw_msb = pw_lsb +  pw_1;
+    for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
+        long a = *(long*)(src+i);
+        long b = *(long*)(dst+i);
+        *(long*)(dst+i) = ((a&pw_lsb) + (b&pw_lsb)) ^ ((a^b)&pw_msb);
     }
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
+    for(; i<w; i++)
+        dst[i] = (dst[i] + src[i]) & mask;
 }
 
-static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
-                                   const uint8_t *diff, int w,
-                                   int *left, int *left_top)
-{
+static void add_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top){
     int i;
-    uint8_t l, lt;
+    uint16_t l, lt;
 
     l  = *left;
     lt = *left_top;
 
-    for (i = 0; i < w; i++) {
-        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
-        lt     = src1[i];
+    for(i=0; i<w; i++){
+        l  = (mid_pred(l, src[i], (l + src[i] - lt) & mask) + diff[i]) & mask;
+        lt = src[i];
         dst[i] = l;
     }
 
@@ -60,43 +56,11 @@ static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
     *left_top = lt;
 }
 
-static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, int w,
-                                int acc)
-{
-    int i;
-
-    for (i = 0; i < w - 1; i++) {
-        acc   += src[i];
-        dst[i] = acc;
-        i++;
-        acc   += src[i];
-        dst[i] = acc;
-    }
-
-    for (; i < w; i++) {
-        acc   += src[i];
-        dst[i] = acc;
-    }
-
-    return acc;
-}
-
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
 static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
-                                       int w, int *red, int *green,
-                                       int *blue, int *alpha)
+                                       intptr_t w, uint8_t *left)
 {
-    int i, r = *red, g = *green, b = *blue, a = *alpha;
+    int i;
+    uint8_t r = left[R], g = left[G], b = left[B], a = left[A];
 
     for (i = 0; i < w; i++) {
         b += src[4 * i + B];
@@ -110,25 +74,18 @@ static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
         dst[4 * i + A] = a;
     }
 
-    *red   = r;
-    *green = g;
-    *blue  = b;
-    *alpha = a;
+    left[B] = b;
+    left[G] = g;
+    left[R] = r;
+    left[A] = a;
 }
-#undef B
-#undef G
-#undef R
-#undef A
 
-av_cold void ff_huffyuvdsp_init(HuffYUVDSPContext *c)
+av_cold void ff_huffyuvdsp_init(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
 {
-    c->add_bytes                = add_bytes_c;
-    c->add_hfyu_median_pred     = add_hfyu_median_pred_c;
-    c->add_hfyu_left_pred       = add_hfyu_left_pred_c;
+    c->add_int16 = add_int16_c;
+    c->add_hfyu_median_pred_int16 = add_hfyu_median_pred_int16_c;
     c->add_hfyu_left_pred_bgr32 = add_hfyu_left_pred_bgr32_c;
 
-    if (ARCH_PPC)
-        ff_huffyuvdsp_init_ppc(c);
     if (ARCH_X86)
-        ff_huffyuvdsp_init_x86(c);
+        ff_huffyuvdsp_init_x86(c, pix_fmt);
 }
diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h
index 5e84e3a..e5f5b05 100644
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,22 +20,33 @@
 #define AVCODEC_HUFFYUVDSP_H
 
 #include <stdint.h>
+#include "libavutil/pixfmt.h"
+#include "config.h"
+
+#if HAVE_BIGENDIAN
+#define B 3
+#define G 2
+#define R 1
+#define A 0
+#else
+#define B 0
+#define G 1
+#define R 2
+#define A 3
+#endif
 
 typedef struct HuffYUVDSPContext {
-    void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
-                      int w);
-    void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
-                                 const uint8_t *diff, int w,
-                                 int *left, int *left_top);
-    int (*add_hfyu_left_pred)(uint8_t *dst, const uint8_t *src,
-                              int w, int left);
+    void (*add_int16)(uint16_t *dst/*align 16*/, const uint16_t *src/*align 16*/,
+                      unsigned mask, int w);
+
+    void (*add_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *top,
+                                       const uint16_t *diff, unsigned mask,
+                                       int w, int *left, int *left_top);
     void (*add_hfyu_left_pred_bgr32)(uint8_t *dst, const uint8_t *src,
-                                     int w, int *red, int *green,
-                                     int *blue, int *alpha);
+                                     intptr_t w, uint8_t *left);
 } HuffYUVDSPContext;
 
-void ff_huffyuvdsp_init(HuffYUVDSPContext *c);
-void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c);
-void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c);
+void ff_huffyuvdsp_init(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt);
+void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt);
 
 #endif /* AVCODEC_HUFFYUVDSP_H */
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index a6ffd24..3662c17 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -1,24 +1,26 @@
 /*
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * yuva, gray, 4:4:4, 4:1:1, 4:1:0 and >8 bit per sample support sponsored by NOA
  */
 
 /**
@@ -26,50 +28,71 @@
  * huffyuv encoder
  */
 
-#include "libavutil/opt.h"
-
 #include "avcodec.h"
 #include "huffyuv.h"
 #include "huffman.h"
 #include "huffyuvencdsp.h"
 #include "internal.h"
+#include "lossless_videoencdsp.h"
 #include "put_bits.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+static inline void diff_bytes(HYuvContext *s, uint8_t *dst,
+                              const uint8_t *src0, const uint8_t *src1, int w)
+{
+    if (s->bps <= 8) {
+        s->llvidencdsp.diff_bytes(dst, src0, src1, w);
+    } else {
+        s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
+    }
+}
 
 static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
-                                      uint8_t *src, int w, int left)
+                                      const uint8_t *src, int w, int left)
 {
     int i;
-    if (w < 32) {
-        for (i = 0; i < w; i++) {
+    int min_width = FFMIN(w, 32);
+
+    if (s->bps <= 8) {
+        for (i = 0; i < min_width; i++) { /* scalar loop before dsp call */
             const int temp = src[i];
             dst[i] = temp - left;
             left   = temp;
         }
-        return left;
+        if (w < 32)
+            return left;
+        s->llvidencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
+        return src[w-1];
     } else {
-        for (i = 0; i < 16; i++) {
-            const int temp = src[i];
-            dst[i] = temp - left;
+        const uint16_t *src16 = (const uint16_t *)src;
+        uint16_t       *dst16 = (      uint16_t *)dst;
+        for (i = 0; i < min_width; i++) { /* scalar loop before dsp call */
+            const int temp = src16[i];
+            dst16[i] = temp - left;
             left   = temp;
         }
-        s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
-        return src[w-1];
+        if (w < 32)
+            return left;
+        s->hencdsp.diff_int16(dst16 + 32, src16 + 32, src16 + 31, s->n - 1, w - 32);
+        return src16[w-1];
     }
 }
 
 static inline void sub_left_prediction_bgr32(HYuvContext *s, uint8_t *dst,
-                                             uint8_t *src, int w,
+                                             const uint8_t *src, int w,
                                              int *red, int *green, int *blue,
                                              int *alpha)
 {
     int i;
     int r, g, b, a;
+    int min_width = FFMIN(w, 8);
     r = *red;
     g = *green;
     b = *blue;
     a = *alpha;
 
-    for (i = 0; i < FFMIN(w, 4); i++) {
+    for (i = 0; i < min_width; i++) {
         const int rt = src[i * 4 + R];
         const int gt = src[i * 4 + G];
         const int bt = src[i * 4 + B];
@@ -84,7 +107,7 @@ static inline void sub_left_prediction_bgr32(HYuvContext *s, uint8_t *dst,
         a = at;
     }
 
-    s->hencdsp.diff_bytes(dst + 16, src + 16, src + 12, w * 4 - 16);
+    s->llvidencdsp.diff_bytes(dst + 32, src + 32, src + 32 - 4, w * 4 - 32);
 
     *red   = src[(w - 1) * 4 + R];
     *green = src[(w - 1) * 4 + G];
@@ -113,27 +136,37 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
         b = bt;
     }
 
-    s->hencdsp.diff_bytes(dst + 48, src + 48, src + 48 - 3, w * 3 - 48);
+    s->llvidencdsp.diff_bytes(dst + 48, src + 48, src + 48 - 3, w * 3 - 48);
 
     *red   = src[(w - 1) * 3 + 0];
     *green = src[(w - 1) * 3 + 1];
     *blue  = src[(w - 1) * 3 + 2];
 }
 
+static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
+{
+    if (s->bps <= 8) {
+        s->llvidencdsp.sub_median_pred(dst, src1, src2, w , left, left_top);
+    } else {
+        s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
+    }
+}
+
 static int store_table(HYuvContext *s, const uint8_t *len, uint8_t *buf)
 {
     int i;
     int index = 0;
+    int n = s->vlc_n;
 
-    for (i = 0; i < 256;) {
+    for (i = 0; i < n;) {
         int val = len[i];
         int repeat = 0;
 
-        for (; i < 256 && len[i] == val && repeat < 255; i++)
+        for (; i < n && len[i] == val && repeat < 255; i++)
             repeat++;
 
-        assert(val < 32 && val >0 && repeat<256 && repeat>0);
-        if ( repeat > 7) {
+        av_assert0(val < 32 && val >0 && repeat < 256 && repeat>0);
+        if (repeat > 7) {
             buf[index++] = val;
             buf[index++] = repeat;
         } else {
@@ -144,19 +177,49 @@ static int store_table(HYuvContext *s, const uint8_t *len, uint8_t *buf)
     return index;
 }
 
+static int store_huffman_tables(HYuvContext *s, uint8_t *buf)
+{
+    int i, ret;
+    int size = 0;
+    int count = 3;
+
+    if (s->version > 2)
+        count = 1 + s->alpha + 2*s->chroma;
+
+    for (i = 0; i < count; i++) {
+        if ((ret = ff_huff_gen_len_table(s->len[i], s->stats[i], s->vlc_n, 0)) < 0)
+            return ret;
+
+        if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i], s->vlc_n) < 0) {
+            return -1;
+        }
+
+        size += store_table(s, s->len[i], buf + size);
+    }
+    return size;
+}
+
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i, j;
+    int ret;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     ff_huffyuv_common_init(avctx);
-    ff_huffyuvencdsp_init(&s->hencdsp);
-
-    avctx->extradata = av_mallocz(1024*30); // 256*3+4 == 772
-    avctx->stats_out = av_mallocz(1024*30); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
+    ff_huffyuvencdsp_init(&s->hencdsp, avctx);
+    ff_llvidencdsp_init(&s->llvidencdsp);
+
+    avctx->extradata = av_mallocz(3*MAX_N + 4);
+    if (s->flags&AV_CODEC_FLAG_PASS1) {
+#define STATS_OUT_SIZE 21*MAX_N*3 + 4
+        avctx->stats_out = av_mallocz(STATS_OUT_SIZE); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
+    }
     s->version = 2;
 
-    if (!avctx->extradata || !avctx->stats_out)
+    if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
 #if FF_API_CODED_FRAME
@@ -172,15 +235,66 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    s->bps = desc->comp[0].depth;
+    s->yuv = !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
+    s->chroma = desc->nb_components > 2;
+    s->alpha = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                     &s->chroma_h_shift,
+                                     &s->chroma_v_shift);
+
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
         if (s->width & 1) {
             av_log(avctx, AV_LOG_ERROR, "Width must be even for this colorspace.\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->bitstream_bpp = avctx->pix_fmt == AV_PIX_FMT_YUV420P ? 12 : 16;
         break;
+    case AV_PIX_FMT_YUV444P:
+    case AV_PIX_FMT_YUV410P:
+    case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUV440P:
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY16:
+    case AV_PIX_FMT_YUVA444P:
+    case AV_PIX_FMT_YUVA420P:
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GRAY8A:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV420P16:
+    case AV_PIX_FMT_YUV422P9:
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV422P16:
+    case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV444P14:
+    case AV_PIX_FMT_YUV444P16:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUVA420P16:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA444P9:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA444P16:
+        s->version = 3;
+        break;
     case AV_PIX_FMT_RGB32:
         s->bitstream_bpp = 32;
         break;
@@ -189,10 +303,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
+    s->n = 1<<s->bps;
+    s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+
     avctx->bits_per_coded_sample = s->bitstream_bpp;
-    s->decorrelate = s->bitstream_bpp >= 24;
+    s->decorrelate = s->bitstream_bpp >= 24 && !s->yuv && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->prediction_method)
@@ -205,7 +322,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR,
                    "context=1 is not compatible with "
                    "2 pass huffyuv encoding\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
     }
 
@@ -214,14 +331,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR,
                    "Error: YV12 is not supported by huffyuv; use "
                    "vcodec=ffvhuff or format=422p\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
 #if FF_API_PRIVATE_OPT
         if (s->context) {
             av_log(avctx, AV_LOG_ERROR,
                    "Error: per-frame huffman tables are not supported "
                    "by huffyuv; use vcodec=ffvhuff\n");
-            return -1;
+            return AVERROR(EINVAL);
+        }
+        if (s->version > 2) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error: ver>2 is not supported "
+                   "by huffyuv; use vcodec=ffvhuff\n");
+            return AVERROR(EINVAL);
         }
 #endif
         if (s->interlaced != ( s->height > 288 ))
@@ -229,32 +352,47 @@ FF_ENABLE_DEPRECATION_WARNINGS
                    "using huffyuv 2.2.0 or newer interlacing flag\n");
     }
 
-    if (s->bitstream_bpp >= 24 && s->predictor == MEDIAN) {
+    if (s->version > 3 && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_ERROR, "Ver > 3 is under development, files encoded with it may not be decodable with future versions!!!\n"
+               "Use vstrict=-2 / -strict -2 to use it anyway.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->bitstream_bpp >= 24 && s->predictor == MEDIAN && s->version <= 2) {
         av_log(avctx, AV_LOG_ERROR,
                "Error: RGB is incompatible with median predictor\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     ((uint8_t*)avctx->extradata)[0] = s->predictor | (s->decorrelate << 6);
-    ((uint8_t*)avctx->extradata)[1] = s->bitstream_bpp;
     ((uint8_t*)avctx->extradata)[2] = s->interlaced ? 0x10 : 0x20;
     if (s->context)
         ((uint8_t*)avctx->extradata)[2] |= 0x40;
-    ((uint8_t*)avctx->extradata)[3] = 0;
+    if (s->version < 3) {
+        ((uint8_t*)avctx->extradata)[1] = s->bitstream_bpp;
+        ((uint8_t*)avctx->extradata)[3] = 0;
+    } else {
+        ((uint8_t*)avctx->extradata)[1] = ((s->bps-1)<<4) | s->chroma_h_shift | (s->chroma_v_shift<<2);
+        if (s->chroma)
+            ((uint8_t*)avctx->extradata)[2] |= s->yuv ? 1 : 2;
+        if (s->alpha)
+            ((uint8_t*)avctx->extradata)[2] |= 4;
+        ((uint8_t*)avctx->extradata)[3] = 1;
+    }
     s->avctx->extradata_size = 4;
 
     if (avctx->stats_in) {
         char *p = avctx->stats_in;
 
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j] = 1;
 
         for (;;) {
-            for (i = 0; i < 3; i++) {
+            for (i = 0; i < 4; i++) {
                 char *next;
 
-                for (j = 0; j < 256; j++) {
+                for (j = 0; j < s->vlc_n; j++) {
                     s->stats[i][j] += strtol(p, &next, 0);
                     if (next == p) return -1;
                     p = next;
@@ -263,40 +401,37 @@ FF_ENABLE_DEPRECATION_WARNINGS
             if (p[0] == 0 || p[1] == 0 || p[2] == 0) break;
         }
     } else {
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++) {
-                int d = FFMIN(j, 256 - j);
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++) {
+                int d = FFMIN(j, s->vlc_n - j);
 
-                s->stats[i][j] = 100000000 / (d + 1);
+                s->stats[i][j] = 100000000 / (d*d + 1);
             }
     }
 
-    for (i = 0; i < 3; i++) {
-        ff_huff_gen_len_table(s->len[i], s->stats[i]);
-
-        if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i]) < 0) {
-            return -1;
-        }
-
-        s->avctx->extradata_size +=
-            store_table(s, s->len[i], &((uint8_t*)s->avctx->extradata)[s->avctx->extradata_size]);
-    }
+    ret = store_huffman_tables(s, s->avctx->extradata + s->avctx->extradata_size);
+    if (ret < 0)
+        return ret;
+    s->avctx->extradata_size += ret;
 
     if (s->context) {
-        for (i = 0; i < 3; i++) {
+        for (i = 0; i < 4; i++) {
             int pels = s->width * s->height / (i ? 40 : 10);
-            for (j = 0; j < 256; j++) {
-                int d = FFMIN(j, 256 - j);
-                s->stats[i][j] = pels/(d + 1);
+            for (j = 0; j < s->vlc_n; j++) {
+                int d = FFMIN(j, s->vlc_n - j);
+                s->stats[i][j] = pels/(d*d + 1);
             }
         }
     } else {
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j]= 0;
     }
 
-    ff_huffyuv_alloc_temp(s);
+    if (ff_huffyuv_alloc_temp(s)) {
+        ff_huffyuv_common_end(s);
+        return AVERROR(ENOMEM);
+    }
 
     s->picture_number=0;
 
@@ -357,6 +492,168 @@ static int encode_422_bitstream(HYuvContext *s, int offset, int count)
     return 0;
 }
 
+static int encode_plane_bitstream(HYuvContext *s, int width, int plane)
+{
+    int i, count = width/2;
+
+    if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) < count * s->bps / 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+        return -1;
+    }
+
+#define LOADEND\
+            int y0 = s->temp[0][width-1];
+#define LOADEND_14\
+            int y0 = s->temp16[0][width-1] & mask;
+#define LOADEND_16\
+            int y0 = s->temp16[0][width-1];
+#define STATEND\
+            s->stats[plane][y0]++;
+#define STATEND_16\
+            s->stats[plane][y0>>2]++;
+#define WRITEEND\
+            put_bits(&s->pb, s->len[plane][y0], s->bits[plane][y0]);
+#define WRITEEND_16\
+            put_bits(&s->pb, s->len[plane][y0>>2], s->bits[plane][y0>>2]);\
+            put_bits(&s->pb, 2, y0&3);
+
+#define LOAD2\
+            int y0 = s->temp[0][2 * i];\
+            int y1 = s->temp[0][2 * i + 1];
+#define LOAD2_14\
+            int y0 = s->temp16[0][2 * i] & mask;\
+            int y1 = s->temp16[0][2 * i + 1] & mask;
+#define LOAD2_16\
+            int y0 = s->temp16[0][2 * i];\
+            int y1 = s->temp16[0][2 * i + 1];
+#define STAT2\
+            s->stats[plane][y0]++;\
+            s->stats[plane][y1]++;
+#define STAT2_16\
+            s->stats[plane][y0>>2]++;\
+            s->stats[plane][y1>>2]++;
+#define WRITE2\
+            put_bits(&s->pb, s->len[plane][y0], s->bits[plane][y0]);\
+            put_bits(&s->pb, s->len[plane][y1], s->bits[plane][y1]);
+#define WRITE2_16\
+            put_bits(&s->pb, s->len[plane][y0>>2], s->bits[plane][y0>>2]);\
+            put_bits(&s->pb, 2, y0&3);\
+            put_bits(&s->pb, s->len[plane][y1>>2], s->bits[plane][y1>>2]);\
+            put_bits(&s->pb, 2, y1&3);
+
+    if (s->bps <= 8) {
+    if (s->flags & AV_CODEC_FLAG_PASS1) {
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            STAT2;
+        }
+        if (width&1) {
+            LOADEND;
+            STATEND;
+        }
+    }
+    if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+        return 0;
+
+    if (s->context) {
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            STAT2;
+            WRITE2;
+        }
+        if (width&1) {
+            LOADEND;
+            STATEND;
+            WRITEEND;
+        }
+    } else {
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            WRITE2;
+        }
+        if (width&1) {
+            LOADEND;
+            WRITEEND;
+        }
+    }
+    } else if (s->bps <= 14) {
+        int mask = s->n - 1;
+        if (s->flags & AV_CODEC_FLAG_PASS1) {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                STAT2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                STATEND;
+            }
+        }
+        if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+            return 0;
+
+        if (s->context) {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                STAT2;
+                WRITE2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                STATEND;
+                WRITEEND;
+            }
+        } else {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                WRITE2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                WRITEEND;
+            }
+        }
+    } else {
+        if (s->flags & AV_CODEC_FLAG_PASS1) {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                STAT2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                STATEND_16;
+            }
+        }
+        if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+            return 0;
+
+        if (s->context) {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                STAT2_16;
+                WRITE2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                STATEND_16;
+                WRITEEND_16;
+            }
+        } else {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                WRITE2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                WRITEEND_16;
+            }
+        }
+    }
+#undef LOAD2
+#undef STAT2
+#undef WRITE2
+    return 0;
+}
+
 static int encode_gray_bitstream(HYuvContext *s, int count)
 {
     int i;
@@ -414,8 +711,8 @@ static inline int encode_bgra_bitstream(HYuvContext *s, int count, int planes)
 
 #define LOAD_GBRA                                                       \
     int g = s->temp[0][planes == 3 ? 3 * i + 1 : 4 * i + G];            \
-    int b = s->temp[0][planes == 3 ? 3 * i + 2 : 4 * i + B] - g & 0xFF; \
-    int r = s->temp[0][planes == 3 ? 3 * i + 0 : 4 * i + R] - g & 0xFF; \
+    int b =(s->temp[0][planes == 3 ? 3 * i + 2 : 4 * i + B] - g) & 0xFF;\
+    int r =(s->temp[0][planes == 3 ? 3 * i + 0 : 4 * i + R] - g) & 0xFF;\
     int a = s->temp[0][planes * i + A];
 
 #define STAT_BGRA                                                       \
@@ -466,22 +763,16 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame * const p = pict;
     int i, j, size = 0, ret;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, width * height * 3 * 4 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error allocating output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, width * height * 3 * 4 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (s->context) {
-        for (i = 0; i < 3; i++) {
-            ff_huff_gen_len_table(s->len[i], s->stats[i]);
-            if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i]) < 0)
-                return -1;
-            size += store_table(s, s->len[i], &pkt->data[size]);
-        }
+        size = store_huffman_tables(s, pkt->data);
+        if (size < 0)
+            return size;
 
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j] >>= 1;
     }
 
@@ -523,9 +814,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             lefttopy = p->data[0][3];
             lefttopu = p->data[1][1];
             lefttopv = p->data[2][1];
-            s->hencdsp.sub_hfyu_median_pred(s->temp[0], p->data[0] + 4, p->data[0] + fake_ystride + 4, width  - 4, &lefty, &lefttopy);
-            s->hencdsp.sub_hfyu_median_pred(s->temp[1], p->data[1] + 2, p->data[1] + fake_ustride + 2, width2 - 2, &leftu, &lefttopu);
-            s->hencdsp.sub_hfyu_median_pred(s->temp[2], p->data[2] + 2, p->data[2] + fake_vstride + 2, width2 - 2, &leftv, &lefttopv);
+            s->llvidencdsp.sub_median_pred(s->temp[0], p->data[0] + 4, p->data[0] + fake_ystride + 4, width  - 4, &lefty, &lefttopy);
+            s->llvidencdsp.sub_median_pred(s->temp[1], p->data[1] + 2, p->data[1] + fake_ustride + 2, width2 - 2, &leftu, &lefttopu);
+            s->llvidencdsp.sub_median_pred(s->temp[2], p->data[2] + 2, p->data[2] + fake_vstride + 2, width2 - 2, &leftv, &lefttopv);
             encode_422_bitstream(s, 0, width - 4);
             y++; cy++;
 
@@ -535,7 +826,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                 if (s->bitstream_bpp == 12) {
                     while (2 * cy > y) {
                         ydst = p->data[0] + p->linesize[0] * y;
-                        s->hencdsp.sub_hfyu_median_pred(s->temp[0], ydst - fake_ystride, ydst, width, &lefty, &lefttopy);
+                        s->llvidencdsp.sub_median_pred(s->temp[0], ydst - fake_ystride, ydst, width, &lefty, &lefttopy);
                         encode_gray_bitstream(s, width);
                         y++;
                     }
@@ -545,9 +836,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                 udst = p->data[1] + p->linesize[1] * cy;
                 vdst = p->data[2] + p->linesize[2] * cy;
 
-                s->hencdsp.sub_hfyu_median_pred(s->temp[0], ydst - fake_ystride, ydst, width,  &lefty, &lefttopy);
-                s->hencdsp.sub_hfyu_median_pred(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
-                s->hencdsp.sub_hfyu_median_pred(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
+                s->llvidencdsp.sub_median_pred(s->temp[0], ydst - fake_ystride, ydst, width,  &lefty, &lefttopy);
+                s->llvidencdsp.sub_median_pred(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
+                s->llvidencdsp.sub_median_pred(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
 
                 encode_422_bitstream(s, 0, width);
             }
@@ -560,7 +851,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                     ydst = p->data[0] + p->linesize[0] * y;
 
                     if (s->predictor == PLANE && s->interlaced < y) {
-                        s->hencdsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
+                        s->llvidencdsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
 
                         lefty = sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
                     } else {
@@ -576,9 +867,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                 vdst = p->data[2] + p->linesize[2] * cy;
 
                 if (s->predictor == PLANE && s->interlaced < cy) {
-                    s->hencdsp.diff_bytes(s->temp[1],          ydst, ydst - fake_ystride, width);
-                    s->hencdsp.diff_bytes(s->temp[2],          udst, udst - fake_ustride, width2);
-                    s->hencdsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
+                    s->llvidencdsp.diff_bytes(s->temp[1],          ydst, ydst - fake_ystride, width);
+                    s->llvidencdsp.diff_bytes(s->temp[2],          udst, udst - fake_ustride, width2);
+                    s->llvidencdsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
 
                     lefty = sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
                     leftu = sub_left_prediction(s, s->temp[1], s->temp[2], width2, leftu);
@@ -611,7 +902,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         for (y = 1; y < s->height; y++) {
             uint8_t *dst = data + y*stride;
             if (s->predictor == PLANE && s->interlaced < y) {
-                s->hencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride, width * 4);
+                s->llvidencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride, width * 4);
                 sub_left_prediction_bgr32(s, s->temp[0], s->temp[1], width,
                                           &leftr, &leftg, &leftb, &lefta);
             } else {
@@ -639,7 +930,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         for (y = 1; y < s->height; y++) {
             uint8_t *dst = data + y * stride;
             if (s->predictor == PLANE && s->interlaced < y) {
-                s->hencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride,
+                s->llvidencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride,
                                       width * 3);
                 sub_left_prediction_rgb24(s, s->temp[0], s->temp[1], width,
                                           &leftr, &leftg, &leftb);
@@ -649,6 +940,59 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             }
             encode_bgra_bitstream(s, width, 3);
         }
+    } else if (s->version > 2) {
+        int plane;
+        for (plane = 0; plane < 1 + 2*s->chroma + s->alpha; plane++) {
+            int left, y;
+            int w = width;
+            int h = height;
+            int fake_stride = fake_ystride;
+
+            if (s->chroma && (plane == 1 || plane == 2)) {
+                w >>= s->chroma_h_shift;
+                h >>= s->chroma_v_shift;
+                fake_stride = plane == 1 ? fake_ustride : fake_vstride;
+            }
+
+            left = sub_left_prediction(s, s->temp[0], p->data[plane], w , 0);
+
+            encode_plane_bitstream(s, w, plane);
+
+            if (s->predictor==MEDIAN) {
+                int lefttop;
+                y = 1;
+                if (s->interlaced) {
+                    left = sub_left_prediction(s, s->temp[0], p->data[plane] + p->linesize[plane], w , left);
+
+                    encode_plane_bitstream(s, w, plane);
+                    y++;
+                }
+
+                lefttop = p->data[plane][0];
+
+                for (; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane] * y;
+
+                    sub_median_prediction(s, s->temp[0], dst - fake_stride, dst, w , &left, &lefttop);
+
+                    encode_plane_bitstream(s, w, plane);
+                }
+            } else {
+                for (y = 1; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane] * y;
+
+                    if (s->predictor == PLANE && s->interlaced < y) {
+                        diff_bytes(s, s->temp[1], dst, dst - fake_stride, w);
+
+                        left = sub_left_prediction(s, s->temp[0], s->temp[1], w , left);
+                    } else {
+                        left = sub_left_prediction(s, s->temp[0], dst, w , left);
+                    }
+
+                    encode_plane_bitstream(s, w, plane);
+                }
+            }
+        }
     } else {
         av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
     }
@@ -662,17 +1006,19 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if ((s->flags & AV_CODEC_FLAG_PASS1) && (s->picture_number & 31) == 0) {
         int j;
         char *p = avctx->stats_out;
-        char *end = p + 1024*30;
-        for (i = 0; i < 3; i++) {
-            for (j = 0; j < 256; j++) {
+        char *end = p + STATS_OUT_SIZE;
+        for (i = 0; i < 4; i++) {
+            for (j = 0; j < s->vlc_n; j++) {
                 snprintf(p, end-p, "%"PRIu64" ", s->stats[i][j]);
                 p += strlen(p);
                 s->stats[i][j]= 0;
             }
             snprintf(p, end-p, "\n");
             p++;
+            if (end <= p)
+                return AVERROR(ENOMEM);
         }
-    } else
+    } else if (avctx->stats_out)
         avctx->stats_out[0] = '\0';
     if (!(s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)) {
         flush_put_bits(&s->pb);
@@ -703,26 +1049,39 @@ static av_cold int encode_end(AVCodecContext *avctx)
 #define OFFSET(x) offsetof(HYuvContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 
-#define HUFF_CLASS(variant)                  \
-static const AVClass variant ## _class = {   \
-    .class_name = # variant,                 \
-    .item_name  = av_default_item_name,      \
-    .option     = variant ## _options,       \
-    .version    = LIBAVUTIL_VERSION_INT,     \
-}
+#define COMMON_OPTIONS \
+    { "non_deterministic", "Allow multithreading for e.g. context=1 at the expense of determinism", \
+      OFFSET(non_determ), AV_OPT_TYPE_BOOL, { .i64 = 1 }, \
+      0, 1, VE }, \
+    { "pred", "Prediction method", OFFSET(predictor), AV_OPT_TYPE_INT, { .i64 = LEFT }, LEFT, MEDIAN, VE, "pred" }, \
+        { "left",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT },   INT_MIN, INT_MAX, VE, "pred" }, \
+        { "plane",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PLANE },  INT_MIN, INT_MAX, VE, "pred" }, \
+        { "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN }, INT_MIN, INT_MAX, VE, "pred" }, \
+
+static const AVOption normal_options[] = {
+    COMMON_OPTIONS
+    { NULL },
+};
 
-#define FF_HUFFYUV_COMMON_OPTS \
-{ "pred", "Prediction method", OFFSET(predictor), AV_OPT_TYPE_INT, { .i64 = LEFT }, LEFT, MEDIAN, VE, "pred" }, \
-    { "left",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT },   INT_MIN, INT_MAX, VE, "pred" }, \
-    { "plane",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PLANE },  INT_MIN, INT_MAX, VE, "pred" }, \
-    { "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN }, INT_MIN, INT_MAX, VE, "pred" }
+static const AVOption ff_options[] = {
+    COMMON_OPTIONS
+    { "context", "Set per-frame huffman tables", OFFSET(context), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { NULL },
+};
 
-static const AVOption huffyuv_options[] = {
-    FF_HUFFYUV_COMMON_OPTS,
-    { NULL},
+static const AVClass normal_class = {
+    .class_name = "huffyuv",
+    .item_name  = av_default_item_name,
+    .option     = normal_options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-HUFF_CLASS(huffyuv);
+static const AVClass ff_class = {
+    .class_name = "ffvhuff",
+    .item_name  = av_default_item_name,
+    .option     = ff_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 AVCodec ff_huffyuv_encoder = {
     .name           = "huffyuv",
@@ -730,10 +1089,11 @@ AVCodec ff_huffyuv_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HUFFYUV,
     .priv_data_size = sizeof(HYuvContext),
-    .priv_class     = &huffyuv_class,
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &normal_class,
     .pix_fmts       = (const enum AVPixelFormat[]){
         AV_PIX_FMT_YUV422P, AV_PIX_FMT_RGB24,
         AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
@@ -743,26 +1103,33 @@ AVCodec ff_huffyuv_encoder = {
 };
 
 #if CONFIG_FFVHUFF_ENCODER
-static const AVOption ffhuffyuv_options[] = {
-    FF_HUFFYUV_COMMON_OPTS,
-    { "context", "Set per-frame huffman tables", OFFSET(context), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { NULL }
-};
-
-HUFF_CLASS(ffhuffyuv);
-
 AVCodec ff_ffvhuff_encoder = {
     .name           = "ffvhuff",
     .long_name      = NULL_IF_CONFIG_SMALL("Huffyuv FFmpeg variant"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFVHUFF,
     .priv_data_size = sizeof(HYuvContext),
-    .priv_class     = &ffhuffyuv_class,
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &ff_class,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_RGB24,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV440P,
+        AV_PIX_FMT_GBRP,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16,
+        AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P,
+        AV_PIX_FMT_GBRAP,
+        AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV422P16,
+        AV_PIX_FMT_YUV444P9, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV444P14, AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16,
+        AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P16,
+        AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P16,
+        AV_PIX_FMT_RGB24,
         AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
     },
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c
index 6c30877..427d118 100644
--- a/libavcodec/huffyuvencdsp.c
+++ b/libavcodec/huffyuvencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,64 +21,58 @@
 #include "huffyuvencdsp.h"
 #include "mathops.h"
 
-// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
-#define pb_7f (~0UL / 255 * 0x7f)
-#define pb_80 (~0UL / 255 * 0x80)
+// 0x00010001 or 0x0001000100010001 or whatever, depending on the cpu's native arithmetic size
+#define pw_1 (ULONG_MAX / UINT16_MAX)
 
-static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
-{
+static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
     long i;
-
 #if !HAVE_FAST_UNALIGNED
-    if ((long) src2 & (sizeof(long) - 1)) {
-        for (i = 0; i + 7 < w; i += 8) {
-            dst[i + 0] = src1[i + 0] - src2[i + 0];
-            dst[i + 1] = src1[i + 1] - src2[i + 1];
-            dst[i + 2] = src1[i + 2] - src2[i + 2];
-            dst[i + 3] = src1[i + 3] - src2[i + 3];
-            dst[i + 4] = src1[i + 4] - src2[i + 4];
-            dst[i + 5] = src1[i + 5] - src2[i + 5];
-            dst[i + 6] = src1[i + 6] - src2[i + 6];
-            dst[i + 7] = src1[i + 7] - src2[i + 7];
+    if((long)src2 & (sizeof(long)-1)){
+        for(i=0; i+3<w; i+=4){
+            dst[i+0] = (src1[i+0]-src2[i+0]) & mask;
+            dst[i+1] = (src1[i+1]-src2[i+1]) & mask;
+            dst[i+2] = (src1[i+2]-src2[i+2]) & mask;
+            dst[i+3] = (src1[i+3]-src2[i+3]) & mask;
         }
-    } else
+    }else
 #endif
-    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
-        long a = *(long *) (src1 + i);
-        long b = *(long *) (src2 + i);
-        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
-                              ((a ^ b ^ pb_80) & pb_80);
+    {
+        unsigned long pw_lsb = (mask >> 1) * pw_1;
+        unsigned long pw_msb = pw_lsb +  pw_1;
+
+        for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
+            long a = *(long*)(src1+i);
+            long b = *(long*)(src2+i);
+            *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb);
+        }
     }
-    for (; i < w; i++)
-        dst[i + 0] = src1[i + 0] - src2[i + 0];
+    for (; i<w; i++)
+        dst[i] = (src1[i] - src2[i]) & mask;
 }
 
-static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
-                                   const uint8_t *src2, int w,
-                                   int *left, int *left_top)
-{
+static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
     int i;
-    uint8_t l, lt;
+    uint16_t l, lt;
 
     l  = *left;
     lt = *left_top;
 
-    for (i = 0; i < w; i++) {
-        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
-        lt     = src1[i];
-        l      = src2[i];
-        dst[i] = l - pred;
+    for(i=0; i<w; i++){
+        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & mask);
+        lt = src1[i];
+        l  = src2[i];
+        dst[i] = (l - pred) & mask;
     }
 
     *left     = l;
     *left_top = lt;
 }
 
-av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c)
+av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
 {
-    c->diff_bytes           = diff_bytes_c;
-    c->sub_hfyu_median_pred = sub_hfyu_median_pred_c;
+    c->diff_int16           = diff_int16_c;
+    c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
 
     if (ARCH_X86)
-        ff_huffyuvencdsp_init_x86(c);
+        ff_huffyuvencdsp_init_x86(c, avctx);
 }
diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h
index 603c36f..603f9c8 100644
--- a/libavcodec/huffyuvencdsp.h
+++ b/libavcodec/huffyuvencdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,21 +21,20 @@
 
 #include <stdint.h>
 
+#include "avcodec.h"
+
 typedef struct HuffYUVEncDSPContext {
-    void (*diff_bytes)(uint8_t *dst /* align 16 */,
-                       uint8_t *src1 /* align 16 */,
-                       uint8_t *src2 /* align 1 */,
-                       int w);
-    /**
-     * Subtract HuffYUV's variant of median prediction.
-     * Note, this might read from src1[-1], src2[-1].
-     */
-    void (*sub_hfyu_median_pred)(uint8_t *dst, const uint8_t *src1,
-                                 const uint8_t *src2, int w,
-                                 int *left, int *left_top);
+    void (*diff_int16)(uint16_t *dst /* align 16 */,
+                       const uint16_t *src1 /* align 16 */,
+                       const uint16_t *src2 /* align 1 */,
+                       unsigned mask, int w);
+
+    void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1,
+                                       const uint16_t *src2, unsigned mask,
+                                       int w, int *left, int *left_top);
 } HuffYUVEncDSPContext;
 
-void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c);
-void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c);
+void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx);
+void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx);
 
 #endif /* AVCODEC_HUFFYUVENCDSP_H */
diff --git a/libavcodec/hwaccel.h b/libavcodec/hwaccel.h
index e215736..3aaa925 100644
--- a/libavcodec/hwaccel.h
+++ b/libavcodec/hwaccel.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,17 +64,21 @@ typedef struct AVCodecHWConfigInternal {
         .hwaccel         = NULL, \
     }
 
-#define HWACCEL_CUVID(codec) \
-    HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _cuvid_hwaccel)
 #define HWACCEL_DXVA2(codec) \
     HW_CONFIG_HWACCEL(1, 1, 1, DXVA2_VLD,    DXVA2,        ff_ ## codec ## _dxva2_hwaccel)
 #define HWACCEL_D3D11VA2(codec) \
     HW_CONFIG_HWACCEL(1, 1, 0, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
+#define HWACCEL_NVDEC(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _nvdec_hwaccel)
 #define HWACCEL_VAAPI(codec) \
     HW_CONFIG_HWACCEL(1, 1, 1, VAAPI,        VAAPI,        ff_ ## codec ## _vaapi_hwaccel)
 #define HWACCEL_VDPAU(codec) \
     HW_CONFIG_HWACCEL(1, 1, 1, VDPAU,        VDPAU,        ff_ ## codec ## _vdpau_hwaccel)
+#define HWACCEL_VIDEOTOOLBOX(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VIDEOTOOLBOX, VIDEOTOOLBOX, ff_ ## codec ## _videotoolbox_hwaccel)
 #define HWACCEL_D3D11VA(codec) \
     HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD,  NONE,         ff_ ## codec ## _d3d11va_hwaccel)
+#define HWACCEL_XVMC(codec) \
+    HW_CONFIG_HWACCEL(0, 0, 1, XVMC,         NONE,         ff_ ## codec ## _xvmc_hwaccel)
 
 #endif /* AVCODEC_HWACCEL_H */
diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
index afa86f1..7d73da8 100644
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,37 +22,56 @@
 #include "avcodec.h"
 
 extern const AVHWAccel ff_h263_vaapi_hwaccel;
-extern const AVHWAccel ff_h264_cuvid_hwaccel;
+extern const AVHWAccel ff_h263_videotoolbox_hwaccel;
 extern const AVHWAccel ff_h264_d3d11va_hwaccel;
 extern const AVHWAccel ff_h264_d3d11va2_hwaccel;
 extern const AVHWAccel ff_h264_dxva2_hwaccel;
+extern const AVHWAccel ff_h264_nvdec_hwaccel;
 extern const AVHWAccel ff_h264_vaapi_hwaccel;
-extern const AVHWAccel ff_h264_vda_hwaccel;
-extern const AVHWAccel ff_h264_vda_old_hwaccel;
 extern const AVHWAccel ff_h264_vdpau_hwaccel;
-extern const AVHWAccel ff_hevc_cuvid_hwaccel;
+extern const AVHWAccel ff_h264_videotoolbox_hwaccel;
 extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
 extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
 extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+extern const AVHWAccel ff_hevc_nvdec_hwaccel;
 extern const AVHWAccel ff_hevc_vaapi_hwaccel;
 extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+extern const AVHWAccel ff_mjpeg_nvdec_hwaccel;
+extern const AVHWAccel ff_mjpeg_vaapi_hwaccel;
+extern const AVHWAccel ff_mpeg1_nvdec_hwaccel;
 extern const AVHWAccel ff_mpeg1_vdpau_hwaccel;
+extern const AVHWAccel ff_mpeg1_videotoolbox_hwaccel;
+extern const AVHWAccel ff_mpeg1_xvmc_hwaccel;
 extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel;
 extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel;
+extern const AVHWAccel ff_mpeg2_nvdec_hwaccel;
 extern const AVHWAccel ff_mpeg2_dxva2_hwaccel;
 extern const AVHWAccel ff_mpeg2_vaapi_hwaccel;
 extern const AVHWAccel ff_mpeg2_vdpau_hwaccel;
+extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel;
+extern const AVHWAccel ff_mpeg2_xvmc_hwaccel;
+extern const AVHWAccel ff_mpeg4_nvdec_hwaccel;
 extern const AVHWAccel ff_mpeg4_vaapi_hwaccel;
 extern const AVHWAccel ff_mpeg4_vdpau_hwaccel;
+extern const AVHWAccel ff_mpeg4_videotoolbox_hwaccel;
 extern const AVHWAccel ff_vc1_d3d11va_hwaccel;
 extern const AVHWAccel ff_vc1_d3d11va2_hwaccel;
 extern const AVHWAccel ff_vc1_dxva2_hwaccel;
+extern const AVHWAccel ff_vc1_nvdec_hwaccel;
 extern const AVHWAccel ff_vc1_vaapi_hwaccel;
 extern const AVHWAccel ff_vc1_vdpau_hwaccel;
+extern const AVHWAccel ff_vp8_nvdec_hwaccel;
 extern const AVHWAccel ff_vp8_vaapi_hwaccel;
+extern const AVHWAccel ff_vp9_d3d11va_hwaccel;
+extern const AVHWAccel ff_vp9_d3d11va2_hwaccel;
+extern const AVHWAccel ff_vp9_dxva2_hwaccel;
+extern const AVHWAccel ff_vp9_nvdec_hwaccel;
+extern const AVHWAccel ff_vp9_vaapi_hwaccel;
 extern const AVHWAccel ff_wmv3_d3d11va_hwaccel;
 extern const AVHWAccel ff_wmv3_d3d11va2_hwaccel;
 extern const AVHWAccel ff_wmv3_dxva2_hwaccel;
+extern const AVHWAccel ff_wmv3_nvdec_hwaccel;
 extern const AVHWAccel ff_wmv3_vaapi_hwaccel;
 extern const AVHWAccel ff_wmv3_vdpau_hwaccel;
 
diff --git a/libavcodec/idcinvideo.c b/libavcodec/idcinvideo.c
index 67dcf1c..cff9ad3 100644
--- a/libavcodec/idcinvideo.c
+++ b/libavcodec/idcinvideo.c
@@ -2,20 +2,20 @@
  * id Quake II CIN Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,7 +75,7 @@ typedef struct IdcinContext {
     uint32_t pal[256];
 } IdcinContext;
 
-/*
+/**
  * Find the lowest probability node in a Huffman table, and mark it as
  * being assigned to a higher probability.
  * @return the node index of the lowest unused node, or -1 if all nodes
@@ -169,7 +169,7 @@ static av_cold int idcin_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
+static int idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
 {
     hnode *hnodes;
     long x, y;
@@ -188,7 +188,7 @@ static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
                 if(!bit_pos) {
                     if(dat_pos >= s->size) {
                         av_log(s->avctx, AV_LOG_ERROR, "Huffman decode error.\n");
-                        return;
+                        return -1;
                     }
                     bit_pos = 8;
                     v = s->buf[dat_pos++];
@@ -203,6 +203,8 @@ static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
             prev = node_num;
         }
     }
+
+    return 0;
 }
 
 static int idcin_decode_frame(AVCodecContext *avctx,
@@ -212,23 +214,25 @@ static int idcin_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     IdcinContext *s = avctx->priv_data;
-    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+    int pal_size;
+    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &pal_size);
     AVFrame *frame = data;
     int ret;
 
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  id CIN Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
-    idcin_decode_vlcs(s, frame);
+    if (idcin_decode_vlcs(s, frame))
+        return AVERROR_INVALIDDATA;
 
-    if (pal) {
+    if (pal && pal_size == AVPALETTE_SIZE) {
         frame->palette_has_changed = 1;
         memcpy(s->pal, pal, AVPALETTE_SIZE);
+    } else if (pal) {
+        av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", pal_size);
     }
     /* make the palette available on the way out */
     memcpy(frame->data[1], s->pal, AVPALETTE_SIZE);
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index 5a267e4..846ed0b 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include "faanidct.h"
 #include "idctdsp.h"
 #include "simple_idct.h"
+#include "xvididct.h"
 
 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                                const uint8_t *src_scantable)
@@ -79,11 +80,8 @@ av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
     }
 }
 
-void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
-void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
-
-static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 ptrdiff_t line_size)
+void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size)
 {
     int i;
 
@@ -103,8 +101,40 @@ static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
     }
 }
 
+static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+        pixels[2] = av_clip_uint8(block[2]);
+        pixels[3] = av_clip_uint8(block[3]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
 static void put_signed_pixels_clamped_c(const int16_t *block,
-                                        uint8_t *restrict pixels,
+                                        uint8_t *av_restrict pixels,
                                         ptrdiff_t line_size)
 {
     int i, j;
@@ -124,8 +154,8 @@ static void put_signed_pixels_clamped_c(const int16_t *block,
     }
 }
 
-static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 ptrdiff_t line_size)
+void ff_add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size)
 {
     int i;
 
@@ -144,47 +174,147 @@ static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
     }
 }
 
+static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                          int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
+        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                          int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void ff_jref_idct4_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    put_pixels_clamped4_c(block, dest, line_size);
+}
+static void ff_jref_idct4_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    add_pixels_clamped4_c(block, dest, line_size);
+}
+
+static void ff_jref_idct2_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    put_pixels_clamped2_c(block, dest, line_size);
+}
+static void ff_jref_idct2_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    add_pixels_clamped2_c(block, dest, line_size);
+}
+
+static void ff_jref_idct1_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8((block[0] + 4)>>3);
+}
+static void ff_jref_idct1_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
+}
+
 av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (avctx->bits_per_raw_sample == 10) {
-        c->idct_put  = ff_simple_idct_put_10;
-        c->idct_add  = ff_simple_idct_add_10;
-        c->idct      = ff_simple_idct_10;
+    if (avctx->lowres==1) {
+        c->idct_put  = ff_jref_idct4_put;
+        c->idct_add  = ff_jref_idct4_add;
+        c->idct      = ff_j_rev_dct4;
         c->perm_type = FF_IDCT_PERM_NONE;
-    } else if (avctx->idct_algo == FF_IDCT_INT) {
-        c->idct_put  = ff_jref_idct_put;
-        c->idct_add  = ff_jref_idct_add;
-        c->idct      = ff_j_rev_dct;
-        c->perm_type = FF_IDCT_PERM_LIBMPEG2;
-#if CONFIG_FAANIDCT
-    } else if (avctx->idct_algo == FF_IDCT_FAAN) {
-        c->idct_put  = ff_faanidct_put;
-        c->idct_add  = ff_faanidct_add;
-        c->idct      = ff_faanidct;
+    } else if (avctx->lowres==2) {
+        c->idct_put  = ff_jref_idct2_put;
+        c->idct_add  = ff_jref_idct2_add;
+        c->idct      = ff_j_rev_dct2;
         c->perm_type = FF_IDCT_PERM_NONE;
-#endif /* CONFIG_FAANIDCT */
-    } else { // accurate/default
-        c->idct_put  = ff_simple_idct_put_8;
-        c->idct_add  = ff_simple_idct_add_8;
-        c->idct      = ff_simple_idct_8;
+    } else if (avctx->lowres==3) {
+        c->idct_put  = ff_jref_idct1_put;
+        c->idct_add  = ff_jref_idct1_add;
+        c->idct      = ff_j_rev_dct1;
         c->perm_type = FF_IDCT_PERM_NONE;
+    } else {
+        if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
+            /* 10-bit MPEG-4 Simple Studio Profile requires a higher precision IDCT
+               However, it only uses idct_put */
+            if (c->mpeg4_studio_profile) {
+                c->idct_put              = ff_simple_idct_put_int32_10bit;
+                c->idct_add              = NULL;
+                c->idct                  = NULL;
+            } else {
+                c->idct_put              = ff_simple_idct_put_int16_10bit;
+                c->idct_add              = ff_simple_idct_add_int16_10bit;
+                c->idct                  = ff_simple_idct_int16_10bit;
+            }
+            c->perm_type             = FF_IDCT_PERM_NONE;
+        } else if (avctx->bits_per_raw_sample == 12) {
+            c->idct_put              = ff_simple_idct_put_int16_12bit;
+            c->idct_add              = ff_simple_idct_add_int16_12bit;
+            c->idct                  = ff_simple_idct_int16_12bit;
+            c->perm_type             = FF_IDCT_PERM_NONE;
+        } else {
+            if (avctx->idct_algo == FF_IDCT_INT) {
+                c->idct_put  = ff_jref_idct_put;
+                c->idct_add  = ff_jref_idct_add;
+                c->idct      = ff_j_rev_dct;
+                c->perm_type = FF_IDCT_PERM_LIBMPEG2;
+#if CONFIG_FAANIDCT
+            } else if (avctx->idct_algo == FF_IDCT_FAAN) {
+                c->idct_put  = ff_faanidct_put;
+                c->idct_add  = ff_faanidct_add;
+                c->idct      = ff_faanidct;
+                c->perm_type = FF_IDCT_PERM_NONE;
+#endif /* CONFIG_FAANIDCT */
+            } else { // accurate/default
+                /* Be sure FF_IDCT_NONE will select this one, since it uses FF_IDCT_PERM_NONE */
+                c->idct_put  = ff_simple_idct_put_int16_8bit;
+                c->idct_add  = ff_simple_idct_add_int16_8bit;
+                c->idct      = ff_simple_idct_int16_8bit;
+                c->perm_type = FF_IDCT_PERM_NONE;
+            }
+        }
     }
 
-    c->put_pixels_clamped        = put_pixels_clamped_c;
+    c->put_pixels_clamped        = ff_put_pixels_clamped_c;
     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
-    c->add_pixels_clamped        = add_pixels_clamped_c;
+    c->add_pixels_clamped        = ff_add_pixels_clamped_c;
 
-    ff_put_pixels_clamped = c->put_pixels_clamped;
-    ff_add_pixels_clamped = c->add_pixels_clamped;
+    if (CONFIG_MPEG4_DECODER && avctx->idct_algo == FF_IDCT_XVID)
+        ff_xvid_idct_init(c, avctx);
 
+    if (ARCH_AARCH64)
+        ff_idctdsp_init_aarch64(c, avctx, high_bit_depth);
+    if (ARCH_ALPHA)
+        ff_idctdsp_init_alpha(c, avctx, high_bit_depth);
     if (ARCH_ARM)
         ff_idctdsp_init_arm(c, avctx, high_bit_depth);
     if (ARCH_PPC)
         ff_idctdsp_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_idctdsp_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_idctdsp_init_mips(c, avctx, high_bit_depth);
 
     ff_init_scantable_permutation(c->idct_permutation,
                                   c->perm_type);
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
index c6b7aed..ca21a31 100644
--- a/libavcodec/idctdsp.h
+++ b/libavcodec/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,13 +53,13 @@ int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
 typedef struct IDCTDSPContext {
     /* pixel ops : interface with DCT */
     void (*put_pixels_clamped)(const int16_t *block /* align 16 */,
-                               uint8_t *restrict pixels /* align 8 */,
+                               uint8_t *av_restrict pixels /* align 8 */,
                                ptrdiff_t line_size);
     void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */,
-                                      uint8_t *restrict pixels /* align 8 */,
+                                      uint8_t *av_restrict pixels /* align 8 */,
                                       ptrdiff_t line_size);
     void (*add_pixels_clamped)(const int16_t *block /* align 16 */,
-                               uint8_t *restrict pixels /* align 8 */,
+                               uint8_t *av_restrict pixels /* align 8 */,
                                ptrdiff_t line_size);
 
     void (*idct)(int16_t *block /* align 16 */);
@@ -95,18 +95,28 @@ typedef struct IDCTDSPContext {
      */
     uint8_t idct_permutation[64];
     enum idct_permutation_type perm_type;
+
+    int mpeg4_studio_profile;
 } IDCTDSPContext;
 
-extern void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
-extern void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
+void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size);
+void ff_add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size);
 
 void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx);
 
+void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                             unsigned high_bit_depth);
+void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                           unsigned high_bit_depth);
 void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
+void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth);
 
 #endif /* AVCODEC_IDCTDSP_H */
diff --git a/libavcodec/iff.c b/libavcodec/iff.c
index a186e31..33cf2e3 100644
--- a/libavcodec/iff.c
+++ b/libavcodec/iff.c
@@ -1,28 +1,29 @@
 /*
- * IFF PBM/ILBM bitmap decoder
+ * IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN bitmap decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  * Copyright (c) 2010 Sebastian Vater <cdgs.basty@googlemail.com>
+ * Copyright (c) 2016 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * IFF PBM/ILBM bitmap decoder
+ * IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN bitmap decoder
  */
 
 #include <stdint.h>
@@ -32,12 +33,39 @@
 #include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
+#include "mathops.h"
+
+// TODO: masking bits
+typedef enum {
+    MASK_NONE,
+    MASK_HAS_MASK,
+    MASK_HAS_TRANSPARENT_COLOR,
+    MASK_LASSO
+} mask_type;
 
 typedef struct IffContext {
     AVFrame *frame;
     int planesize;
     uint8_t * planebuf;
+    uint8_t * ham_buf;      ///< temporary buffer for planar to chunky conversation
+    uint32_t *ham_palbuf;   ///< HAM decode table
+    uint32_t *mask_buf;     ///< temporary buffer for palette indices
+    uint32_t *mask_palbuf;  ///< masking palette table
+    unsigned  compression;  ///< delta compression method used
+    unsigned  is_short;     ///< short compression method used
+    unsigned  is_interlaced;///< video is interlaced
+    unsigned  is_brush;     ///< video is in ANBR format
+    unsigned  bpp;          ///< bits per plane to decode (differs from bits_per_coded_sample if HAM)
+    unsigned  ham;          ///< 0 if non-HAM or number of hold bits (6 for bpp > 6, 4 otherwise)
+    unsigned  flags;        ///< 1 for EHB, 0 is no extra half darkening
+    unsigned  transparency; ///< TODO: transparency color index in palette
+    unsigned  masking;      ///< TODO: masking method used
     int init; // 1 if buffer and palette data already initialized, 0 otherwise
+    int16_t   tvdc[16];     ///< TVDC lookup table
+    GetByteContext gb;
+    uint8_t *video[2];
+    unsigned video_size;
+    uint32_t *pal;
 } IffContext;
 
 #define LUT8_PART(plane, v)                             \
@@ -124,33 +152,228 @@ static av_always_inline uint32_t gray2rgb(const uint32_t x) {
  */
 static int cmap_read_palette(AVCodecContext *avctx, uint32_t *pal)
 {
+    IffContext *s = avctx->priv_data;
     int count, i;
+    const uint8_t *const palette = avctx->extradata + AV_RB16(avctx->extradata);
+    int palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
 
     if (avctx->bits_per_coded_sample > 8) {
-        av_log(avctx, AV_LOG_ERROR, "bit_per_coded_sample > 8 not supported\n");
+        av_log(avctx, AV_LOG_ERROR, "bits_per_coded_sample > 8 not supported\n");
         return AVERROR_INVALIDDATA;
     }
 
     count = 1 << avctx->bits_per_coded_sample;
     // If extradata is smaller than actually needed, fill the remaining with black.
-    count = FFMIN(avctx->extradata_size / 3, count);
+    count = FFMIN(palette_size / 3, count);
     if (count) {
         for (i = 0; i < count; i++)
-            pal[i] = 0xFF000000 | AV_RB24(avctx->extradata + i * 3);
+            pal[i] = 0xFF000000 | AV_RB24(palette + i*3);
+        if (s->flags && count >= 32) { // EHB
+            for (i = 0; i < 32; i++)
+                pal[i + 32] = 0xFF000000 | (AV_RB24(palette + i*3) & 0xFEFEFE) >> 1;
+            count = FFMAX(count, 64);
+        }
     } else { // Create gray-scale color palette for bps < 8
         count = 1 << avctx->bits_per_coded_sample;
 
         for (i = 0; i < count; i++)
             pal[i] = 0xFF000000 | gray2rgb((i * 255) >> avctx->bits_per_coded_sample);
     }
+    if (s->masking == MASK_HAS_MASK) {
+        memcpy(pal + (1 << avctx->bits_per_coded_sample), pal, count * 4);
+        for (i = 0; i < count; i++)
+            pal[i] &= 0xFFFFFF;
+    } else if (s->masking == MASK_HAS_TRANSPARENT_COLOR &&
+        s->transparency < 1 << avctx->bits_per_coded_sample)
+        pal[s->transparency] &= 0xFFFFFF;
+    return 0;
+}
+
+/**
+ * Extracts the IFF extra context and updates internal
+ * decoder structures.
+ *
+ * @param avctx the AVCodecContext where to extract extra context to
+ * @param avpkt the AVPacket to extract extra context from or NULL to use avctx
+ * @return >= 0 in case of success, a negative error code otherwise
+ */
+static int extract_header(AVCodecContext *const avctx,
+                          const AVPacket *const avpkt)
+{
+    IffContext *s = avctx->priv_data;
+    const uint8_t *buf;
+    unsigned buf_size = 0;
+    int i, palette_size;
+
+    if (avctx->extradata_size < 2) {
+        av_log(avctx, AV_LOG_ERROR, "not enough extradata\n");
+        return AVERROR_INVALIDDATA;
+    }
+    palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
+
+    if (avpkt && avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+        uint32_t chunk_id;
+        uint64_t data_size;
+        GetByteContext *gb = &s->gb;
+
+        bytestream2_skip(gb, 4);
+        while (bytestream2_get_bytes_left(gb) >= 1) {
+            chunk_id  = bytestream2_get_le32(gb);
+            data_size = bytestream2_get_be32(gb);
+
+            if (chunk_id == MKTAG('B', 'M', 'H', 'D')) {
+                bytestream2_skip(gb, data_size + (data_size & 1));
+            } else if (chunk_id == MKTAG('A', 'N', 'H', 'D')) {
+                unsigned extra;
+                if (data_size < 40)
+                    return AVERROR_INVALIDDATA;
+
+                s->compression = (bytestream2_get_byte(gb) << 8) | (s->compression & 0xFF);
+                bytestream2_skip(gb, 19);
+                extra = bytestream2_get_be32(gb);
+                s->is_short = !(extra & 1);
+                s->is_brush = extra == 2;
+                s->is_interlaced = !!(extra & 0x40);
+                data_size -= 24;
+                bytestream2_skip(gb, data_size + (data_size & 1));
+            } else if (chunk_id == MKTAG('D', 'L', 'T', 'A') ||
+                       chunk_id == MKTAG('B', 'O', 'D', 'Y')) {
+                if (chunk_id == MKTAG('B','O','D','Y'))
+                    s->compression &= 0xFF;
+                break;
+            } else if (chunk_id == MKTAG('C', 'M', 'A', 'P')) {
+                int count = data_size / 3;
+                uint32_t *pal = s->pal;
+
+                if (count > 256)
+                    return AVERROR_INVALIDDATA;
+                if (s->ham) {
+                    for (i = 0; i < count; i++)
+                        pal[i] = 0xFF000000 | bytestream2_get_le24(gb);
+                } else {
+                    for (i = 0; i < count; i++)
+                        pal[i] = 0xFF000000 | bytestream2_get_be24(gb);
+                }
+                bytestream2_skip(gb, data_size & 1);
+            } else {
+                bytestream2_skip(gb, data_size + (data_size&1));
+            }
+        }
+    } else if (!avpkt) {
+        buf = avctx->extradata;
+        buf_size = bytestream_get_be16(&buf);
+        if (buf_size <= 1 || palette_size < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid palette size received: %u -> palette data offset: %d\n",
+                   buf_size, palette_size);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (buf_size >= 41) {
+        s->compression  = bytestream_get_byte(&buf);
+        s->bpp          = bytestream_get_byte(&buf);
+        s->ham          = bytestream_get_byte(&buf);
+        s->flags        = bytestream_get_byte(&buf);
+        s->transparency = bytestream_get_be16(&buf);
+        s->masking      = bytestream_get_byte(&buf);
+        for (i = 0; i < 16; i++)
+            s->tvdc[i] = bytestream_get_be16(&buf);
+
+        if (s->masking == MASK_HAS_MASK) {
+            if (s->bpp >= 8 && !s->ham) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB32;
+                av_freep(&s->mask_buf);
+                av_freep(&s->mask_palbuf);
+                s->mask_buf = av_malloc((s->planesize * 32) + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!s->mask_buf)
+                    return AVERROR(ENOMEM);
+                if (s->bpp > 16) {
+                    av_log(avctx, AV_LOG_ERROR, "bpp %d too large for palette\n", s->bpp);
+                    av_freep(&s->mask_buf);
+                    return AVERROR(ENOMEM);
+                }
+                s->mask_palbuf = av_malloc((2 << s->bpp) * sizeof(uint32_t) + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!s->mask_palbuf) {
+                    av_freep(&s->mask_buf);
+                    return AVERROR(ENOMEM);
+                }
+            }
+            s->bpp++;
+        } else if (s->masking != MASK_NONE && s->masking != MASK_HAS_TRANSPARENT_COLOR) {
+            av_log(avctx, AV_LOG_ERROR, "Masking not supported\n");
+            return AVERROR_PATCHWELCOME;
+        }
+        if (!s->bpp || s->bpp > 32) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of bitplanes: %u\n", s->bpp);
+            return AVERROR_INVALIDDATA;
+        } else if (s->ham >= 8) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of hold bits for HAM: %u\n", s->ham);
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_freep(&s->ham_buf);
+        av_freep(&s->ham_palbuf);
+
+        if (s->ham) {
+            int i, count = FFMIN(palette_size / 3, 1 << s->ham);
+            int ham_count;
+            const uint8_t *const palette = avctx->extradata + AV_RB16(avctx->extradata);
+
+            s->ham_buf = av_malloc((s->planesize * 8) + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->ham_buf)
+                return AVERROR(ENOMEM);
+
+            ham_count = 8 * (1 << s->ham);
+            s->ham_palbuf = av_malloc((ham_count << !!(s->masking == MASK_HAS_MASK)) * sizeof (uint32_t) + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->ham_palbuf) {
+                av_freep(&s->ham_buf);
+                return AVERROR(ENOMEM);
+            }
+
+            if (count) { // HAM with color palette attached
+                // prefill with black and palette and set HAM take direct value mask to zero
+                memset(s->ham_palbuf, 0, (1 << s->ham) * 2 * sizeof (uint32_t));
+                for (i=0; i < count; i++) {
+                    s->ham_palbuf[i*2+1] = 0xFF000000 | AV_RL24(palette + i*3);
+                }
+                count = 1 << s->ham;
+            } else { // HAM with grayscale color palette
+                count = 1 << s->ham;
+                for (i=0; i < count; i++) {
+                    s->ham_palbuf[i*2]   = 0xFF000000; // take direct color value from palette
+                    s->ham_palbuf[i*2+1] = 0xFF000000 | av_le2ne32(gray2rgb((i * 255) >> s->ham));
+                }
+            }
+            for (i=0; i < count; i++) {
+                uint32_t tmp = i << (8 - s->ham);
+                tmp |= tmp >> s->ham;
+                s->ham_palbuf[(i+count)*2]     = 0xFF00FFFF; // just modify blue color component
+                s->ham_palbuf[(i+count*2)*2]   = 0xFFFFFF00; // just modify red color component
+                s->ham_palbuf[(i+count*3)*2]   = 0xFFFF00FF; // just modify green color component
+                s->ham_palbuf[(i+count)*2+1]   = 0xFF000000 | tmp << 16;
+                s->ham_palbuf[(i+count*2)*2+1] = 0xFF000000 | tmp;
+                s->ham_palbuf[(i+count*3)*2+1] = 0xFF000000 | tmp << 8;
+            }
+            if (s->masking == MASK_HAS_MASK) {
+                for (i = 0; i < ham_count; i++)
+                    s->ham_palbuf[(1 << s->bpp) + i] = s->ham_palbuf[i] | 0xFF000000;
+            }
+        }
+    }
+
     return 0;
 }
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     IffContext *s = avctx->priv_data;
-    av_frame_free(&s->frame);
     av_freep(&s->planebuf);
+    av_freep(&s->ham_buf);
+    av_freep(&s->ham_palbuf);
+    av_freep(&s->video[0]);
+    av_freep(&s->video[1]);
+    av_freep(&s->pal);
     return 0;
 }
 
@@ -160,11 +383,29 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int err;
 
     if (avctx->bits_per_coded_sample <= 8) {
-        avctx->pix_fmt = (avctx->bits_per_coded_sample < 8 ||
-                          avctx->extradata_size) ? AV_PIX_FMT_PAL8
-                                                 : AV_PIX_FMT_GRAY8;
+        int palette_size;
+
+        if (avctx->extradata_size >= 2)
+            palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
+        else
+            palette_size = 0;
+        avctx->pix_fmt = (avctx->bits_per_coded_sample < 8) ||
+                         (avctx->extradata_size >= 2 && palette_size) ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
     } else if (avctx->bits_per_coded_sample <= 32) {
-        avctx->pix_fmt = AV_PIX_FMT_BGR32;
+        if (avctx->codec_tag == MKTAG('R', 'G', 'B', '8')) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        } else if (avctx->codec_tag == MKTAG('R', 'G', 'B', 'N')) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB444;
+        } else if (avctx->codec_tag != MKTAG('D', 'E', 'E', 'P')) {
+            if (avctx->bits_per_coded_sample == 24) {
+                avctx->pix_fmt = AV_PIX_FMT_0BGR32;
+            } else if (avctx->bits_per_coded_sample == 32) {
+                avctx->pix_fmt = AV_PIX_FMT_BGR32;
+            } else {
+                avpriv_request_sample(avctx, "unknown bits_per_coded_sample");
+                return AVERROR_PATCHWELCOME;
+            }
+        }
     } else {
         return AVERROR_INVALIDDATA;
     }
@@ -172,16 +413,24 @@ static av_cold int decode_init(AVCodecContext *avctx)
     if ((err = av_image_check_size(avctx->width, avctx->height, 0, avctx)))
         return err;
     s->planesize = FFALIGN(avctx->width, 16) >> 3; // Align plane size in bits to word-boundary
-    s->planebuf  = av_malloc(s->planesize + AV_INPUT_BUFFER_PADDING_SIZE);
+    s->planebuf  = av_malloc(s->planesize * avctx->height + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!s->planebuf)
         return AVERROR(ENOMEM);
 
-    s->frame = av_frame_alloc();
-    if (!s->frame) {
-        decode_end(avctx);
-        return AVERROR(ENOMEM);
+    s->bpp = avctx->bits_per_coded_sample;
+
+    if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+        s->video_size = FFALIGN(avctx->width, 2) * avctx->height * s->bpp;
+        s->video[0] = av_calloc(FFALIGN(avctx->width, 2) * avctx->height, s->bpp);
+        s->video[1] = av_calloc(FFALIGN(avctx->width, 2) * avctx->height, s->bpp);
+        s->pal = av_calloc(256, sizeof(*s->pal));
+        if (!s->video[0] || !s->video[1] || !s->pal)
+            return AVERROR(ENOMEM);
     }
 
+    if ((err = extract_header(avctx, NULL)) < 0)
+        return err;
+
     return 0;
 }
 
@@ -195,6 +444,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static void decodeplane8(uint8_t *dst, const uint8_t *buf, int buf_size, int plane)
 {
     const uint64_t *lut = plane8_lut[plane];
+    if (plane >= 8) {
+        av_log(NULL, AV_LOG_WARNING, "Ignoring extra planes beyond 8\n");
+        return;
+    }
     do {
         uint64_t v = AV_RN64A(dst) | lut[*buf++];
         AV_WN64A(dst, v);
@@ -227,6 +480,47 @@ static void decodeplane32(uint32_t *dst, const uint8_t *buf, int buf_size, int p
     } while (--buf_size);
 }
 
+#define DECODE_HAM_PLANE32(x)       \
+    first       = buf[x] << 1;      \
+    second      = buf[(x)+1] << 1;  \
+    delta      &= pal[first++];     \
+    delta      |= pal[first];       \
+    dst[x]      = delta;            \
+    delta      &= pal[second++];    \
+    delta      |= pal[second];      \
+    dst[(x)+1]  = delta
+
+/**
+ * Converts one line of HAM6/8-encoded chunky buffer to 24bpp.
+ *
+ * @param dst the destination 24bpp buffer
+ * @param buf the source 8bpp chunky buffer
+ * @param pal the HAM decode table
+ * @param buf_size the plane size in bytes
+ */
+static void decode_ham_plane32(uint32_t *dst, const uint8_t  *buf,
+                               const uint32_t *const pal, unsigned buf_size)
+{
+    uint32_t delta = pal[1]; /* first palette entry */
+    do {
+        uint32_t first, second;
+        DECODE_HAM_PLANE32(0);
+        DECODE_HAM_PLANE32(2);
+        DECODE_HAM_PLANE32(4);
+        DECODE_HAM_PLANE32(6);
+        buf += 8;
+        dst += 8;
+    } while (--buf_size);
+}
+
+static void lookup_pal_indicies(uint32_t *dst, const uint32_t *buf,
+                         const uint32_t *const pal, unsigned width)
+{
+    do {
+        *dst++ = pal[*buf++];
+    } while (--width);
+}
+
 /**
  * Decode one complete byterun1 encoded line.
  *
@@ -237,162 +531,1354 @@ static void decodeplane32(uint32_t *dst, const uint8_t *buf, int buf_size, int p
  * @return number of consumed bytes in byterun1 compressed bitstream
  */
 static int decode_byterun(uint8_t *dst, int dst_size,
-                          const uint8_t *buf, const uint8_t *const buf_end)
+                          GetByteContext *gb)
 {
-    const uint8_t *const buf_start = buf;
     unsigned x;
-    for (x = 0; x < dst_size && buf < buf_end;) {
+    for (x = 0; x < dst_size && bytestream2_get_bytes_left(gb) > 0;) {
         unsigned length;
-        const int8_t value = *buf++;
+        const int8_t value = bytestream2_get_byte(gb);
         if (value >= 0) {
-            length = value + 1;
-            memcpy(dst + x, buf, FFMIN3(length, dst_size - x, buf_end - buf));
-            buf += length;
+            length = FFMIN3(value + 1, dst_size - x, bytestream2_get_bytes_left(gb));
+            bytestream2_get_buffer(gb, dst + x, length);
+            if (length < value + 1)
+                bytestream2_skip(gb, value + 1 - length);
         } else if (value > -128) {
-            length = -value + 1;
-            memset(dst + x, *buf++, FFMIN(length, dst_size - x));
+            length = FFMIN(-value + 1, dst_size - x);
+            memset(dst + x, bytestream2_get_byte(gb), length);
         } else { // noop
             continue;
         }
         x += length;
     }
-    return buf - buf_start;
+    if (x < dst_size) {
+        av_log(NULL, AV_LOG_WARNING, "decode_byterun ended before plane size\n");
+        memset(dst+x, 0, dst_size - x);
+    }
+    return bytestream2_tell(gb);
 }
 
-static int decode_frame_ilbm(AVCodecContext *avctx,
-                             void *data, int *got_frame,
-                             AVPacket *avpkt)
+static int decode_byterun2(uint8_t *dst, int height, int line_size,
+                           GetByteContext *gb)
 {
-    IffContext *s          = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
-    const uint8_t *buf_end = buf + buf_size;
-    int y, plane, res;
+    GetByteContext cmds;
+    unsigned count;
+    int i, y_pos = 0, x_pos = 0;
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
-        return res;
+    if (bytestream2_get_be32(gb) != MKBETAG('V', 'D', 'A', 'T'))
+        return 0;
 
-    if (!s->init && avctx->bits_per_coded_sample <= 8 &&
-        avctx->pix_fmt != AV_PIX_FMT_GRAY8) {
-        if ((res = cmap_read_palette(avctx, (uint32_t *)s->frame->data[1])) < 0)
-            return res;
+    bytestream2_skip(gb, 4);
+    count = bytestream2_get_be16(gb) - 2;
+    if (bytestream2_get_bytes_left(gb) < count)
+        return 0;
+
+    bytestream2_init(&cmds, gb->buffer, count);
+    bytestream2_skip(gb, count);
+
+    for (i = 0; i < count && x_pos < line_size; i++) {
+        int8_t cmd = bytestream2_get_byte(&cmds);
+        int l, r;
+
+        if (cmd == 0) {
+            l = bytestream2_get_be16(gb);
+            while (l-- > 0 && x_pos < line_size) {
+                dst[x_pos + y_pos   * line_size    ] = bytestream2_get_byte(gb);
+                dst[x_pos + y_pos++ * line_size + 1] = bytestream2_get_byte(gb);
+                if (y_pos >= height) {
+                    y_pos  = 0;
+                    x_pos += 2;
+                }
+            }
+        } else if (cmd < 0) {
+            l = -cmd;
+            while (l-- > 0 && x_pos < line_size) {
+                dst[x_pos + y_pos   * line_size    ] = bytestream2_get_byte(gb);
+                dst[x_pos + y_pos++ * line_size + 1] = bytestream2_get_byte(gb);
+                if (y_pos >= height) {
+                    y_pos  = 0;
+                    x_pos += 2;
+                }
+            }
+        } else if (cmd == 1) {
+            l = bytestream2_get_be16(gb);
+            r = bytestream2_get_be16(gb);
+            while (l-- > 0 && x_pos < line_size) {
+                dst[x_pos + y_pos   * line_size    ] = r >> 8;
+                dst[x_pos + y_pos++ * line_size + 1] = r & 0xFF;
+                if (y_pos >= height) {
+                    y_pos  = 0;
+                    x_pos += 2;
+                }
+            }
+        } else {
+            l = cmd;
+            r = bytestream2_get_be16(gb);
+            while (l-- > 0 && x_pos < line_size) {
+                dst[x_pos + y_pos   * line_size    ] = r >> 8;
+                dst[x_pos + y_pos++ * line_size + 1] = r & 0xFF;
+                if (y_pos >= height) {
+                    y_pos  = 0;
+                    x_pos += 2;
+                }
+            }
+        }
     }
-    s->init = 1;
 
-    if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
-        if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width);
-                for (plane = 0; plane < avctx->bits_per_coded_sample && buf < buf_end;
-                     plane++) {
-                    decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
-                    buf += s->planesize;
+    return bytestream2_tell(gb);
+}
+
+#define DECODE_RGBX_COMMON(type) \
+    if (!length) { \
+        length = bytestream2_get_byte(gb); \
+        if (!length) { \
+            length = bytestream2_get_be16(gb); \
+            if (!length) \
+                return; \
+        } \
+    } \
+    for (i = 0; i < length; i++) { \
+        *(type *)(dst + y*linesize + x * sizeof(type)) = pixel; \
+        x += 1; \
+        if (x >= width) { \
+            y += 1; \
+            if (y >= height) \
+                return; \
+            x = 0; \
+        } \
+    }
+
+/**
+ * Decode RGB8 buffer
+ * @param[out] dst Destination buffer
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_rgb8(GetByteContext *gb, uint8_t *dst, int width, int height, int linesize)
+{
+    int x = 0, y = 0, i, length;
+    while (bytestream2_get_bytes_left(gb) >= 4) {
+        uint32_t pixel = 0xFF000000 | bytestream2_get_be24(gb);
+        length = bytestream2_get_byte(gb) & 0x7F;
+        DECODE_RGBX_COMMON(uint32_t)
+    }
+}
+
+/**
+ * Decode RGBN buffer
+ * @param[out] dst Destination buffer
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_rgbn(GetByteContext *gb, uint8_t *dst, int width, int height, int linesize)
+{
+    int x = 0, y = 0, i, length;
+    while (bytestream2_get_bytes_left(gb) >= 2) {
+        uint32_t pixel = bytestream2_get_be16u(gb);
+        length = pixel & 0x7;
+        pixel >>= 4;
+        DECODE_RGBX_COMMON(uint16_t)
+    }
+}
+
+/**
+ * Decode DEEP RLE 32-bit buffer
+ * @param[out] dst Destination buffer
+ * @param[in] src Source buffer
+ * @param src_size Source buffer size (bytes)
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_deep_rle32(uint8_t *dst, const uint8_t *src, int src_size, int width, int height, int linesize)
+{
+    const uint8_t *src_end = src + src_size;
+    int x = 0, y = 0, i;
+    while (src + 5 <= src_end) {
+        int opcode;
+        opcode = *(int8_t *)src++;
+        if (opcode >= 0) {
+            int size = opcode + 1;
+            for (i = 0; i < size; i++) {
+                int length = FFMIN(size - i, width);
+                memcpy(dst + y*linesize + x * 4, src, length * 4);
+                src += length * 4;
+                x += length;
+                i += length;
+                if (x >= width) {
+                    x = 0;
+                    y += 1;
+                    if (y >= height)
+                        return;
                 }
             }
-        } else { // AV_PIX_FMT_BGR32
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width << 2);
-                for (plane = 0; plane < avctx->bits_per_coded_sample && buf < buf_end;
-                     plane++) {
-                    decodeplane32((uint32_t *)row, buf,
-                                  FFMIN(s->planesize, buf_end - buf), plane);
-                    buf += s->planesize;
+        } else {
+            int size = -opcode + 1;
+            uint32_t pixel = AV_RN32(src);
+            for (i = 0; i < size; i++) {
+                *(uint32_t *)(dst + y*linesize + x * 4) = pixel;
+                x += 1;
+                if (x >= width) {
+                    x = 0;
+                    y += 1;
+                    if (y >= height)
+                        return;
                 }
             }
+            src += 4;
+        }
+    }
+}
+
+/**
+ * Decode DEEP TVDC 32-bit buffer
+ * @param[out] dst Destination buffer
+ * @param[in] src Source buffer
+ * @param src_size Source buffer size (bytes)
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ * @param[int] tvdc TVDC lookup table
+ */
+static void decode_deep_tvdc32(uint8_t *dst, const uint8_t *src, int src_size, int width, int height, int linesize, const int16_t *tvdc)
+{
+    int x = 0, y = 0, plane = 0;
+    int8_t pixel = 0;
+    int i, j;
+
+    for (i = 0; i < src_size * 2;) {
+#define GETNIBBLE ((i & 1) ?  (src[i>>1] & 0xF) : (src[i>>1] >> 4))
+        int d = tvdc[GETNIBBLE];
+        i++;
+        if (d) {
+            pixel += d;
+            dst[y * linesize + x*4 + plane] = pixel;
+            x++;
+        } else {
+            if (i >= src_size * 2)
+                return;
+            d = GETNIBBLE + 1;
+            i++;
+            d = FFMIN(d, width - x);
+            for (j = 0; j < d; j++) {
+                dst[y * linesize + x*4 + plane] = pixel;
+                x++;
+            }
         }
-    } else if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) { // IFF-PBM
-        for (y = 0; y < avctx->height && buf < buf_end; y++) {
-            uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-            memcpy(row, buf, FFMIN(avctx->width, buf_end - buf));
-            buf += avctx->width + (avctx->width % 2); // padding if odd
+        if (x >= width) {
+            plane++;
+            if (plane >= 4) {
+                y++;
+                if (y >= height)
+                    return;
+                plane = 0;
+            }
+            x = 0;
+            pixel = 0;
+            i = (i + 1) & ~1;
         }
     }
+}
 
-    if ((res = av_frame_ref(data, s->frame)) < 0)
-        return res;
+static void decode_short_horizontal_delta(uint8_t *dst,
+                                          const uint8_t *buf, const uint8_t *buf_end,
+                                          int w, int bpp, int dst_size)
+{
+    int planepitch = FFALIGN(w, 16) >> 3;
+    int pitch = planepitch * bpp;
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    unsigned ofssrc, pos;
+    int i, k;
 
-    *got_frame = 1;
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
 
-    return buf_size;
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+        pos = 0;
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        while (bytestream2_peek_be16(&gb) != 0xFFFF && bytestream2_get_bytes_left(&gb) > 3) {
+            int16_t offset = bytestream2_get_be16(&gb);
+            unsigned noffset;
+
+            if (offset >= 0) {
+                unsigned data = bytestream2_get_be16(&gb);
+
+                pos += offset * 2;
+                noffset = (pos / planepitch) * pitch + (pos % planepitch) + k * planepitch;
+                bytestream2_seek_p(&pb, noffset, SEEK_SET);
+                bytestream2_put_be16(&pb, data);
+            } else {
+                uint16_t count = bytestream2_get_be16(&gb);
+
+                pos += 2 * -(offset + 2);
+                for (i = 0; i < count; i++) {
+                    uint16_t data = bytestream2_get_be16(&gb);
+
+                    pos += 2;
+                    noffset = (pos / planepitch) * pitch + (pos % planepitch) + k * planepitch;
+                    bytestream2_seek_p(&pb, noffset, SEEK_SET);
+                    bytestream2_put_be16(&pb, data);
+                }
+            }
+        }
+    }
+}
+
+static void decode_byte_vertical_delta(uint8_t *dst,
+                                       const uint8_t *buf, const uint8_t *buf_end,
+                                       int w, int xor, int bpp, int dst_size)
+{
+    int ncolumns = ((w + 15) / 16) * 2;
+    int dstpitch = ncolumns * bpp;
+    unsigned ofsdst, ofssrc, opcode, x;
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    int i, j, k;
+
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = j + k * ncolumns;
+
+            i = bytestream2_get_byte(&gb);
+            while (i > 0) {
+                opcode = bytestream2_get_byte(&gb);
+
+                if (opcode == 0) {
+                    opcode  = bytestream2_get_byte(&gb);
+                    x = bytestream2_get_byte(&gb);
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (xor && ofsdst < dst_size) {
+                            bytestream2_put_byte(&pb, dst[ofsdst] ^ x);
+                        } else {
+                            bytestream2_put_byte(&pb, x);
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x80) {
+                    ofsdst += opcode * dstpitch;
+                } else {
+                    opcode &= 0x7f;
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (xor && ofsdst < dst_size) {
+                            bytestream2_put_byte(&pb, dst[ofsdst] ^ bytestream2_get_byte(&gb));
+                        } else {
+                            bytestream2_put_byte(&pb, bytestream2_get_byte(&gb));
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_delta_j(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int h, int bpp, int dst_size)
+{
+    int32_t pitch;
+    uint8_t *ptr;
+    uint32_t type, flag, cols, groups, rows, bytes;
+    uint32_t offset;
+    int planepitch_byte = (w + 7) / 8;
+    int planepitch = ((w + 15) / 16) * 2;
+    int kludge_j, b, g, r, d;
+    GetByteContext gb;
+
+    pitch = planepitch * bpp;
+    kludge_j = w < 320 ? (320 - w) / 8 / 2 : 0;
+
+    bytestream2_init(&gb, buf, buf_end - buf);
+
+    while (bytestream2_get_bytes_left(&gb) >= 2) {
+        type = bytestream2_get_be16(&gb);
+
+        switch (type) {
+        case 0:
+            return;
+        case 1:
+            flag   = bytestream2_get_be16(&gb);
+            cols   = bytestream2_get_be16(&gb);
+            groups = bytestream2_get_be16(&gb);
+
+            for (g = 0; g < groups; g++) {
+                offset = bytestream2_get_be16(&gb);
+
+                if (cols * bpp == 0 || bytestream2_get_bytes_left(&gb) < cols * bpp) {
+                    av_log(NULL, AV_LOG_ERROR, "cols*bpp is invalid (%"PRId32"*%d)", cols, bpp);
+                    return;
+                }
+
+                if (kludge_j)
+                    offset = ((offset / (320 / 8)) * pitch) + (offset % (320 / 8)) - kludge_j;
+                else
+                    offset = ((offset / planepitch_byte) * pitch) + (offset % planepitch_byte);
+
+                for (b = 0; b < cols; b++) {
+                    for (d = 0; d < bpp; d++) {
+                        uint8_t value = bytestream2_get_byte(&gb);
+
+                        if (offset >= dst_size)
+                            return;
+                        ptr = dst + offset;
+
+                        if (flag)
+                            ptr[0] ^= value;
+                        else
+                            ptr[0]  = value;
+
+                        offset += planepitch;
+                    }
+                }
+                if ((cols * bpp) & 1)
+                    bytestream2_skip(&gb, 1);
+            }
+            break;
+        case 2:
+            flag   = bytestream2_get_be16(&gb);
+            rows   = bytestream2_get_be16(&gb);
+            bytes  = bytestream2_get_be16(&gb);
+            groups = bytestream2_get_be16(&gb);
+
+            for (g = 0; g < groups; g++) {
+                offset = bytestream2_get_be16(&gb);
+
+                if (kludge_j)
+                    offset = ((offset / (320 / 8)) * pitch) + (offset % (320/ 8)) - kludge_j;
+                else
+                    offset = ((offset / planepitch_byte) * pitch) + (offset % planepitch_byte);
+
+                for (r = 0; r < rows; r++) {
+                    for (d = 0; d < bpp; d++) {
+                        unsigned noffset = offset + (r * pitch) + d * planepitch;
+
+                        if (!bytes || bytestream2_get_bytes_left(&gb) < bytes) {
+                            av_log(NULL, AV_LOG_ERROR, "bytes %"PRId32" is invalid", bytes);
+                            return;
+                        }
+
+                        for (b = 0; b < bytes; b++) {
+                            uint8_t value = bytestream2_get_byte(&gb);
+
+                            if (noffset >= dst_size)
+                                return;
+                            ptr = dst + noffset;
+
+                            if (flag)
+                                ptr[0] ^= value;
+                            else
+                                ptr[0]  = value;
+
+                            noffset++;
+                        }
+                    }
+                }
+                if ((rows * bytes * bpp) & 1)
+                    bytestream2_skip(&gb, 1);
+            }
+            break;
+        default:
+            return;
+        }
+    }
 }
 
-static int decode_frame_byterun1(AVCodecContext *avctx,
-                                 void *data, int *got_frame,
-                                 AVPacket *avpkt)
+static void decode_short_vertical_delta(uint8_t *dst,
+                                        const uint8_t *buf, const uint8_t *buf_end,
+                                        int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 15) >> 4;
+    int dstpitch = ncolumns * bpp * 2;
+    unsigned ofsdst, ofssrc, ofsdata, opcode, x;
+    GetByteContext ptrs, gb, dptrs, dgb;
+    PutByteContext pb;
+    int i, j, k;
+
+    if (buf_end - buf <= 64)
+        return;
+
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init(&dptrs, buf + 32, (buf_end - buf) - 32);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+        ofsdata = bytestream2_get_be32(&dptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            return;
+
+        if (ofsdata >= buf_end - buf)
+            return;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        bytestream2_init(&dgb, buf + ofsdata, buf_end - (buf + ofsdata));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 2;
+
+            i = bytestream2_get_byte(&gb);
+            while (i > 0) {
+                opcode = bytestream2_get_byte(&gb);
+
+                if (opcode == 0) {
+                    opcode = bytestream2_get_byte(&gb);
+                    x = bytestream2_get_be16(&dgb);
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, x);
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x80) {
+                    ofsdst += opcode * dstpitch;
+                } else {
+                    opcode &= 0x7f;
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, bytestream2_get_be16(&dgb));
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_long_vertical_delta(uint8_t *dst,
+                                       const uint8_t *buf, const uint8_t *buf_end,
+                                       int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 31) >> 5;
+    int dstpitch = ((w + 15) / 16 * 2) * bpp;
+    unsigned ofsdst, ofssrc, ofsdata, opcode, x;
+    GetByteContext ptrs, gb, dptrs, dgb;
+    PutByteContext pb;
+    int i, j, k, h;
+
+    if (buf_end - buf <= 64)
+        return;
+
+    h = (((w + 15) / 16 * 2) != ((w + 31) / 32 * 4)) ? 1 : 0;
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init(&dptrs, buf + 32, (buf_end - buf) - 32);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+        ofsdata = bytestream2_get_be32(&dptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            return;
+
+        if (ofsdata >= buf_end - buf)
+            return;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        bytestream2_init(&dgb, buf + ofsdata, buf_end - (buf + ofsdata));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 4 - h * (2 * k);
+
+            i = bytestream2_get_byte(&gb);
+            while (i > 0) {
+                opcode = bytestream2_get_byte(&gb);
+
+                if (opcode == 0) {
+                    opcode = bytestream2_get_byte(&gb);
+                    if (h && (j == (ncolumns - 1))) {
+                        x = bytestream2_get_be16(&dgb);
+                        bytestream2_skip(&dgb, 2);
+                    } else {
+                        x = bytestream2_get_be32(&dgb);
+                    }
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == (ncolumns - 1))) {
+                            bytestream2_put_be16(&pb, x);
+                        } else {
+                            bytestream2_put_be32(&pb, x);
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x80) {
+                    ofsdst += opcode * dstpitch;
+                } else {
+                    opcode &= 0x7f;
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == (ncolumns - 1))) {
+                            bytestream2_put_be16(&pb, bytestream2_get_be16(&dgb));
+                            bytestream2_skip(&dgb, 2);
+                        } else {
+                            bytestream2_put_be32(&pb, bytestream2_get_be32(&dgb));
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_short_vertical_delta2(uint8_t *dst,
+                                         const uint8_t *buf, const uint8_t *buf_end,
+                                         int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 15) >> 4;
+    int dstpitch = ncolumns * bpp * 2;
+    unsigned ofsdst, ofssrc, opcode, x;
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    int i, j, k;
+
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 2;
+
+            i = bytestream2_get_be16(&gb);
+            while (i > 0 && bytestream2_get_bytes_left(&gb) > 4) {
+                opcode = bytestream2_get_be16(&gb);
+
+                if (opcode == 0) {
+                    opcode = bytestream2_get_be16(&gb);
+                    x = bytestream2_get_be16(&gb);
+
+                    while (opcode && bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, x);
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x8000) {
+                    ofsdst += opcode * dstpitch;
+                } else {
+                    opcode &= 0x7fff;
+
+                    while (opcode && bytestream2_get_bytes_left(&gb) > 1 &&
+                           bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, bytestream2_get_be16(&gb));
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_long_vertical_delta2(uint8_t *dst,
+                                        const uint8_t *buf, const uint8_t *buf_end,
+                                        int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 31) >> 5;
+    int dstpitch = ((w + 15) / 16 * 2) * bpp;
+    unsigned ofsdst, ofssrc, opcode, x;
+    unsigned skip = 0x80000000, mask = skip - 1;
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    int i, j, k, h;
+
+    h = (((w + 15) / 16 * 2) != ((w + 31) / 32 * 4)) ? 1 : 0;
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 4 - h * (2 * k);
+
+            if (h && (j == (ncolumns - 1))) {
+                skip = 0x8000;
+                mask = skip - 1;
+            }
+
+            i = bytestream2_get_be32(&gb);
+            while (i > 0 && bytestream2_get_bytes_left(&gb) > 4) {
+                opcode = bytestream2_get_be32(&gb);
+
+                if (opcode == 0) {
+                    if (h && (j == ncolumns - 1)) {
+                        opcode = bytestream2_get_be16(&gb);
+                        x = bytestream2_get_be16(&gb);
+                    } else {
+                        opcode = bytestream2_get_be32(&gb);
+                        x = bytestream2_get_be32(&gb);
+                    }
+
+                    while (opcode && bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == ncolumns - 1))
+                            bytestream2_put_be16(&pb, x);
+                        else
+                            bytestream2_put_be32(&pb, x);
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < skip) {
+                    ofsdst += opcode * dstpitch;
+                } else {
+                    opcode &= mask;
+
+                    while (opcode && bytestream2_get_bytes_left(&gb) > 1 &&
+                           bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == ncolumns - 1)) {
+                            bytestream2_put_be16(&pb, bytestream2_get_be16(&gb));
+                        } else {
+                            bytestream2_put_be32(&pb, bytestream2_get_be32(&gb));
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_delta_d(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int flag, int bpp, int dst_size)
+{
+    int planepitch = FFALIGN(w, 16) >> 3;
+    int pitch = planepitch * bpp;
+    int planepitch_byte = (w + 7) / 8;
+    unsigned entries, ofssrc;
+    GetByteContext gb, ptrs;
+    PutByteContext pb;
+    int k;
+
+    if (buf_end - buf <= 4 * bpp)
+        return;
+
+    bytestream2_init_writer(&pb, dst, dst_size);
+    bytestream2_init(&ptrs, buf, bpp * 4);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+
+        entries = bytestream2_get_be32(&gb);
+        while (entries && bytestream2_get_bytes_left(&gb) >= 8) {
+            int32_t opcode  = bytestream2_get_be32(&gb);
+            unsigned offset = bytestream2_get_be32(&gb);
+
+            bytestream2_seek_p(&pb, (offset / planepitch_byte) * pitch + (offset % planepitch_byte) + k * planepitch, SEEK_SET);
+            if (opcode >= 0) {
+                uint32_t x = bytestream2_get_be32(&gb);
+                while (opcode && bytestream2_get_bytes_left_p(&pb) > 0) {
+                    bytestream2_put_be32(&pb, x);
+                    bytestream2_skip_p(&pb, pitch - 4);
+                    opcode--;
+                }
+            } else {
+                opcode = -opcode;
+                while (opcode && bytestream2_get_bytes_left(&gb) > 0) {
+                    bytestream2_put_be32(&pb, bytestream2_get_be32(&gb));
+                    bytestream2_skip_p(&pb, pitch - 4);
+                    opcode--;
+                }
+            }
+            entries--;
+        }
+    }
+}
+
+static void decode_delta_e(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int flag, int bpp, int dst_size)
+{
+    int planepitch = FFALIGN(w, 16) >> 3;
+    int pitch = planepitch * bpp;
+    int planepitch_byte = (w + 7) / 8;
+    unsigned entries, ofssrc;
+    GetByteContext gb, ptrs;
+    PutByteContext pb;
+    int k;
+
+    if (buf_end - buf <= 4 * bpp)
+        return;
+
+    bytestream2_init_writer(&pb, dst, dst_size);
+    bytestream2_init(&ptrs, buf, bpp * 4);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs);
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+
+        entries = bytestream2_get_be16(&gb);
+        while (entries && bytestream2_get_bytes_left(&gb) >= 6) {
+            int16_t opcode  = bytestream2_get_be16(&gb);
+            unsigned offset = bytestream2_get_be32(&gb);
+
+            bytestream2_seek_p(&pb, (offset / planepitch_byte) * pitch + (offset % planepitch_byte) + k * planepitch, SEEK_SET);
+            if (opcode >= 0) {
+                uint16_t x = bytestream2_get_be16(&gb);
+                while (opcode && bytestream2_get_bytes_left_p(&pb) > 0) {
+                    bytestream2_put_be16(&pb, x);
+                    bytestream2_skip_p(&pb, pitch - 2);
+                    opcode--;
+                }
+            } else {
+                opcode = -opcode;
+                while (opcode && bytestream2_get_bytes_left(&gb) > 0) {
+                    bytestream2_put_be16(&pb, bytestream2_get_be16(&gb));
+                    bytestream2_skip_p(&pb, pitch - 2);
+                    opcode--;
+                }
+            }
+            entries--;
+        }
+    }
+}
+
+static void decode_delta_l(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int flag, int bpp, int dst_size)
+{
+    GetByteContext off0, off1, dgb, ogb;
+    PutByteContext pb;
+    unsigned poff0, poff1;
+    int i, k, dstpitch;
+    int planepitch_byte = (w + 7) / 8;
+    int planepitch = ((w + 15) / 16) * 2;
+    int pitch = planepitch * bpp;
+
+    if (buf_end - buf <= 64)
+        return;
+
+    bytestream2_init(&off0, buf, buf_end - buf);
+    bytestream2_init(&off1, buf + 32, buf_end - (buf + 32));
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    dstpitch = flag ? (((w + 7) / 8) * bpp): 2;
+
+    for (k = 0; k < bpp; k++) {
+        poff0 = bytestream2_get_be32(&off0);
+        poff1 = bytestream2_get_be32(&off1);
+
+        if (!poff0)
+            continue;
+
+        if (2LL * poff0 >= buf_end - buf)
+            return;
+
+        if (2LL * poff1 >= buf_end - buf)
+            return;
+
+        bytestream2_init(&dgb, buf + 2 * poff0, buf_end - (buf + 2 * poff0));
+        bytestream2_init(&ogb, buf + 2 * poff1, buf_end - (buf + 2 * poff1));
+
+        while (bytestream2_peek_be16(&ogb) != 0xFFFF && bytestream2_get_bytes_left(&ogb) >= 4) {
+            uint32_t offset = bytestream2_get_be16(&ogb);
+            int16_t cnt = bytestream2_get_be16(&ogb);
+            uint16_t data;
+
+            offset = ((2 * offset) / planepitch_byte) * pitch + ((2 * offset) % planepitch_byte) + k * planepitch;
+            if (cnt < 0) {
+                if (bytestream2_get_bytes_left(&dgb) < 2)
+                    break;
+                bytestream2_seek_p(&pb, offset, SEEK_SET);
+                cnt = -cnt;
+                data = bytestream2_get_be16(&dgb);
+                for (i = 0; i < cnt; i++) {
+                    bytestream2_put_be16(&pb, data);
+                    bytestream2_skip_p(&pb, dstpitch - 2);
+                }
+            } else {
+                if (bytestream2_get_bytes_left(&dgb) < 2*cnt)
+                    break;
+                bytestream2_seek_p(&pb, offset, SEEK_SET);
+                for (i = 0; i < cnt; i++) {
+                    data = bytestream2_get_be16(&dgb);
+                    bytestream2_put_be16(&pb, data);
+                    bytestream2_skip_p(&pb, dstpitch - 2);
+                }
+            }
+        }
+    }
+}
+
+static int unsupported(AVCodecContext *avctx)
+{
+    IffContext *s = avctx->priv_data;
+    avpriv_request_sample(avctx, "bitmap (compression 0x%0x, bpp %i, ham %i, interlaced %i)", s->compression, s->bpp, s->ham, s->is_interlaced);
+    return AVERROR_INVALIDDATA;
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
 {
     IffContext *s          = avctx->priv_data;
+    AVFrame *frame         = data;
     const uint8_t *buf     = avpkt->data;
     int buf_size           = avpkt->size;
     const uint8_t *buf_end = buf + buf_size;
     int y, plane, res;
+    GetByteContext *gb = &s->gb;
+    const AVPixFmtDescriptor *desc;
+
+    bytestream2_init(gb, avpkt->data, avpkt->size);
+
+    if ((res = extract_header(avctx, avpkt)) < 0)
+        return res;
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
+    s->frame = frame;
+
+    buf      += bytestream2_tell(gb);
+    buf_size -= bytestream2_tell(gb);
+    desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     if (!s->init && avctx->bits_per_coded_sample <= 8 &&
-        avctx->pix_fmt != AV_PIX_FMT_GRAY8) {
-        if ((res = cmap_read_palette(avctx, (uint32_t *)s->frame->data[1])) < 0)
+        avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        if ((res = cmap_read_palette(avctx, (uint32_t *)frame->data[1])) < 0)
+            return res;
+    } else if (!s->init && avctx->bits_per_coded_sample <= 8 &&
+               avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+        if ((res = cmap_read_palette(avctx, s->mask_palbuf)) < 0)
             return res;
     }
     s->init = 1;
 
-    if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
+    if (s->compression <= 0xff && (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M'))) {
+        if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+            memcpy(s->pal, s->frame->data[1], 256 * 4);
+    }
+
+    switch (s->compression) {
+    case 0x0:
+        if (avctx->codec_tag == MKTAG('A', 'C', 'B', 'M')) {
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                memset(frame->data[0], 0, avctx->height * frame->linesize[0]);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    for (y = 0; y < avctx->height && buf < buf_end; y++) {
+                        uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                memset(frame->data[0], 0, avctx->height * frame->linesize[0]);
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        const uint8_t * start = buf + (plane * avctx->height + y) * s->planesize;
+                        if (start >= buf_end)
+                            break;
+                        decodeplane8(s->ham_buf, start, FFMIN(s->planesize, buf_end - start), plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) {
+            int raw_width = avctx->width * (av_get_bits_per_pixel(desc) >> 3);
+            int x;
+            for (y = 0; y < avctx->height && buf < buf_end; y++) {
+                uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                memcpy(row, buf, FFMIN(raw_width, buf_end - buf));
+                buf += raw_width;
+                if (avctx->pix_fmt == AV_PIX_FMT_BGR32) {
+                    for (x = 0; x < avctx->width; x++)
+                        row[4 * x + 3] = row[4 * x + 3] & 0xF0 | (row[4 * x + 3] >> 4);
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M') || // interleaved
+                   avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+            if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M'))
+                memcpy(s->video[0], buf, FFMIN(buf_end - buf, s->video_size));
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane8(s->ham_buf, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else { // AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width << 2);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane32((uint32_t *)row, buf,
+                                      FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('P', 'B', 'M', ' ')) { // IFF-PBM
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height && buf_end > buf; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memcpy(row, buf, FFMIN(avctx->width, buf_end - buf));
+                    buf += avctx->width + (avctx->width % 2); // padding if odd
+                }
+            } else if (s->ham) { // IFF-PBM: HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height && buf_end > buf; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memcpy(s->ham_buf, buf, FFMIN(avctx->width, buf_end - buf));
+                    buf += avctx->width + (avctx->width & 1); // padding if odd
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else {
+            return unsupported(avctx);
+        }
+        break;
+    case 0x1:
+        if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M') || // interleaved
+            avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                uint8_t *video = s->video[0];
+
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+                            memcpy(video, s->planebuf, s->planesize);
+                            video += s->planesize;
+                        }
+                        decodeplane8(row, s->planebuf, s->planesize, plane);
+                    }
+                }
+            } else if (avctx->bits_per_coded_sample <= 8) { //8-bit (+ mask) to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->mask_buf, 0, avctx->width * sizeof(uint32_t));
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        decodeplane32(s->mask_buf, s->planebuf, s->planesize, plane);
+                    }
+                    lookup_pal_indicies((uint32_t *)row, s->mask_buf, s->mask_palbuf, avctx->width);
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                uint8_t *video = s->video[0];
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+                            memcpy(video, s->planebuf, s->planesize);
+                            video += s->planesize;
+                        }
+                        decodeplane8(s->ham_buf, s->planebuf, s->planesize, plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else { // AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width << 2);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        decodeplane32((uint32_t *)row, s->planebuf, s->planesize, plane);
+                    }
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('P', 'B', 'M', ' ')) { // IFF-PBM
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    buf += decode_byterun(row, avctx->width, gb);
+                }
+            } else if (s->ham) { // IFF-PBM: HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    buf += decode_byterun(s->ham_buf, avctx->width, gb);
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) { // IFF-DEEP
+            if (av_get_bits_per_pixel(desc) == 32)
+                decode_deep_rle32(frame->data[0], buf, buf_size, avctx->width, avctx->height, frame->linesize[0]);
+            else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('A', 'C', 'B', 'M')) {
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                memset(frame->data[0], 0, avctx->height * frame->linesize[0]);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    for (y = 0; y < avctx->height && buf < buf_end; y++) {
+                        uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                memset(frame->data[0], 0, avctx->height * frame->linesize[0]);
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        const uint8_t * start = buf + (plane * avctx->height + y) * s->planesize;
+                        if (start >= buf_end)
+                            break;
+                        decodeplane8(s->ham_buf, start, FFMIN(s->planesize, buf_end - start), plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else {
+                return unsupported(avctx);
+            }
+        } else {
+            return unsupported(avctx);
+        }
+        break;
+    case 0x2:
+        if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M') && avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            for (plane = 0; plane < s->bpp; plane++) {
+                decode_byterun2(s->planebuf, avctx->height, s->planesize, gb);
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    decodeplane8(row, s->planebuf + s->planesize * y, s->planesize, plane);
+                }
+            }
+        } else {
+            return unsupported(avctx);
+        }
+        break;
+    case 0x4:
+        if (avctx->codec_tag == MKTAG('R', 'G', 'B', '8') && avctx->pix_fmt == AV_PIX_FMT_RGB32)
+            decode_rgb8(gb, frame->data[0], avctx->width, avctx->height, frame->linesize[0]);
+        else if (avctx->codec_tag == MKTAG('R', 'G', 'B', 'N') && avctx->pix_fmt == AV_PIX_FMT_RGB444)
+            decode_rgbn(gb, frame->data[0], avctx->width, avctx->height, frame->linesize[0]);
+        else
+            return unsupported(avctx);
+        break;
+    case 0x5:
+        if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) {
+            if (av_get_bits_per_pixel(desc) == 32)
+                decode_deep_tvdc32(frame->data[0], buf, buf_size, avctx->width, avctx->height, frame->linesize[0], s->tvdc);
+            else
+                return unsupported(avctx);
+        } else
+            return unsupported(avctx);
+        break;
+    case 0x300:
+    case 0x301:
+        decode_short_horizontal_delta(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        break;
+    case 0x500:
+    case 0x501:
+        decode_byte_vertical_delta(s->video[0], buf, buf_end, avctx->width, s->is_brush, s->bpp, s->video_size);
+        break;
+    case 0x700:
+    case 0x701:
+        if (s->is_short)
+            decode_short_vertical_delta(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        else
+            decode_long_vertical_delta(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        break;
+    case 0x800:
+    case 0x801:
+        if (s->is_short)
+            decode_short_vertical_delta2(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        else
+            decode_long_vertical_delta2(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        break;
+    case 0x4a00:
+    case 0x4a01:
+        decode_delta_j(s->video[0], buf, buf_end, avctx->width, avctx->height, s->bpp, s->video_size);
+        break;
+    case 0x6400:
+    case 0x6401:
+        if (s->is_interlaced)
+            return unsupported(avctx);
+        decode_delta_d(s->video[0], buf, buf_end, avctx->width, s->is_interlaced, s->bpp, s->video_size);
+        break;
+    case 0x6500:
+    case 0x6501:
+        if (s->is_interlaced)
+            return unsupported(avctx);
+        decode_delta_e(s->video[0], buf, buf_end, avctx->width, s->is_interlaced, s->bpp, s->video_size);
+        break;
+    case 0x6c00:
+    case 0x6c01:
+        decode_delta_l(s->video[0], buf, buf_end, avctx->width, s->is_short, s->bpp, s->video_size);
+        break;
+    default:
+        return unsupported(avctx);
+    }
+
+    if (s->compression <= 0xff && (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M'))) {
+        memcpy(s->video[1], s->video[0], s->video_size);
+    }
+
+    if (s->compression > 0xff) {
         if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+            buf = s->video[0];
             for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                uint8_t *row = &frame->data[0][y * frame->linesize[0]];
                 memset(row, 0, avctx->width);
-                for (plane = 0; plane < avctx->bits_per_coded_sample; plane++) {
-                    buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
-                    decodeplane8(row, s->planebuf, s->planesize, plane);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    decodeplane8(row, buf, s->planesize, plane);
+                    buf += s->planesize;
                 }
             }
-        } else { // AV_PIX_FMT_BGR32
+            memcpy(frame->data[1], s->pal, 256 * 4);
+        } else if (s->ham) {
+            int i, count = 1 << s->ham;
+
+            buf = s->video[0];
+            memset(s->ham_palbuf, 0, (1 << s->ham) * 2 * sizeof(uint32_t));
+            for (i = 0; i < count; i++) {
+                s->ham_palbuf[i*2+1] = s->pal[i];
+            }
+            for (i = 0; i < count; i++) {
+                uint32_t tmp = i << (8 - s->ham);
+                tmp |= tmp >> s->ham;
+                s->ham_palbuf[(i+count)*2]     = 0xFF00FFFF;
+                s->ham_palbuf[(i+count*2)*2]   = 0xFFFFFF00;
+                s->ham_palbuf[(i+count*3)*2]   = 0xFFFF00FF;
+                s->ham_palbuf[(i+count)*2+1]   = 0xFF000000 | tmp << 16;
+                s->ham_palbuf[(i+count*2)*2+1] = 0xFF000000 | tmp;
+                s->ham_palbuf[(i+count*3)*2+1] = 0xFF000000 | tmp << 8;
+            }
+            if (s->masking == MASK_HAS_MASK) {
+                for (i = 0; i < 8 * (1 << s->ham); i++)
+                    s->ham_palbuf[(1 << s->bpp) + i] = s->ham_palbuf[i] | 0xFF000000;
+            }
             for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width << 2);
-                for (plane = 0; plane < avctx->bits_per_coded_sample; plane++) {
-                    buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
-                    decodeplane32((uint32_t *)row, s->planebuf, s->planesize, plane);
+                uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                memset(s->ham_buf, 0, s->planesize * 8);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    decodeplane8(s->ham_buf, buf, s->planesize, plane);
+                    buf += s->planesize;
                 }
+                decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
             }
+        } else {
+            return unsupported(avctx);
         }
-    } else {
-        for (y = 0; y < avctx->height; y++) {
-            uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-            buf += decode_byterun(row, avctx->width, buf, buf_end);
+
+        if (!s->is_brush) {
+            FFSWAP(uint8_t *, s->video[0], s->video[1]);
         }
     }
 
-    if ((res = av_frame_ref(data, s->frame)) < 0)
-        return res;
+    if (avpkt->flags & AV_PKT_FLAG_KEY) {
+        frame->key_frame = 1;
+        frame->pict_type = AV_PICTURE_TYPE_I;
+    } else {
+        frame->key_frame = 0;
+        frame->pict_type = AV_PICTURE_TYPE_P;
+    }
 
     *got_frame = 1;
 
     return buf_size;
 }
 
+#if CONFIG_IFF_ILBM_DECODER
 AVCodec ff_iff_ilbm_decoder = {
-    .name           = "iff_ilbm",
-    .long_name      = NULL_IF_CONFIG_SMALL("IFF ILBM"),
+    .name           = "iff",
+    .long_name      = NULL_IF_CONFIG_SMALL("IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_IFF_ILBM,
     .priv_data_size = sizeof(IffContext),
     .init           = decode_init,
     .close          = decode_end,
-    .decode         = decode_frame_ilbm,
-    .capabilities   = AV_CODEC_CAP_DR1,
-};
-
-AVCodec ff_iff_byterun1_decoder = {
-    .name           = "iff_byterun1",
-    .long_name      = NULL_IF_CONFIG_SMALL("IFF ByteRun1"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_IFF_BYTERUN1,
-    .priv_data_size = sizeof(IffContext),
-    .init           = decode_init,
-    .close          = decode_end,
-    .decode         = decode_frame_byterun1,
+    .decode         = decode_frame,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/iirfilter.c b/libavcodec/iirfilter.c
index 4116d5c..b202515 100644
--- a/libavcodec/iirfilter.c
+++ b/libavcodec/iirfilter.c
@@ -2,20 +2,20 @@
  * IIR filter
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,6 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
-#include "libavutil/log.h"
 
 #include "iirfilter.h"
 
@@ -198,7 +197,7 @@ av_cold struct FFIIRFilterCoeffs *ff_iir_filter_init_coeffs(void *avc,
         return c;
 
 init_fail:
-    ff_iir_filter_free_coeffs(c);
+    ff_iir_filter_free_coeffsp(&c);
     return NULL;
 }
 
@@ -305,16 +304,24 @@ void ff_iir_filter_flt(const struct FFIIRFilterCoeffs *c,
     }
 }
 
-av_cold void ff_iir_filter_free_state(struct FFIIRFilterState *state)
+av_cold void ff_iir_filter_free_statep(struct FFIIRFilterState **state)
 {
-    av_free(state);
+    av_freep(state);
 }
 
-av_cold void ff_iir_filter_free_coeffs(struct FFIIRFilterCoeffs *coeffs)
+av_cold void ff_iir_filter_free_coeffsp(struct FFIIRFilterCoeffs **coeffsp)
 {
+    struct FFIIRFilterCoeffs *coeffs = *coeffsp;
     if (coeffs) {
-        av_free(coeffs->cx);
-        av_free(coeffs->cy);
+        av_freep(&coeffs->cx);
+        av_freep(&coeffs->cy);
     }
-    av_free(coeffs);
+    av_freep(coeffsp);
+}
+
+void ff_iir_filter_init(FFIIRFilterContext *f) {
+    f->filter_flt = ff_iir_filter_flt;
+
+    if (HAVE_MIPSFPU)
+        ff_iir_filter_init_mips(f);
 }
diff --git a/libavcodec/iirfilter.h b/libavcodec/iirfilter.h
index 052a4b3..5ffa1ce 100644
--- a/libavcodec/iirfilter.h
+++ b/libavcodec/iirfilter.h
@@ -2,20 +2,20 @@
  * IIR filter
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,6 +48,29 @@ enum IIRFilterMode{
     FF_FILTER_MODE_BANDSTOP,
 };
 
+typedef struct FFIIRFilterContext {
+    /**
+    * Perform IIR filtering on floating-point input samples.
+    *
+    * @param coeffs pointer to filter coefficients
+    * @param state  pointer to filter state
+    * @param size   input length
+    * @param src    source samples
+    * @param sstep  source stride
+    * @param dst    filtered samples (destination may be the same as input)
+    * @param dstep  destination stride
+    */
+    void (*filter_flt)(const struct FFIIRFilterCoeffs *coeffs,
+                        struct FFIIRFilterState *state, int size,
+                        const float *src, ptrdiff_t sstep, float *dst, ptrdiff_t dstep);
+} FFIIRFilterContext;
+
+/**
+ * Initialize FFIIRFilterContext
+ */
+void ff_iir_filter_init(FFIIRFilterContext *f);
+void ff_iir_filter_init_mips(FFIIRFilterContext *f);
+
 /**
  * Initialize filter coefficients.
  *
@@ -82,14 +105,14 @@ struct FFIIRFilterState* ff_iir_filter_init_state(int order);
  *
  * @param coeffs pointer allocated with ff_iir_filter_init_coeffs()
  */
-void ff_iir_filter_free_coeffs(struct FFIIRFilterCoeffs *coeffs);
+void ff_iir_filter_free_coeffsp(struct FFIIRFilterCoeffs **coeffs);
 
 /**
- * Free filter state.
+ * Free and zero filter state.
  *
- * @param state pointer allocated with ff_iir_filter_init_state()
+ * @param state pointer to pointer allocated with ff_iir_filter_init_state()
  */
-void ff_iir_filter_free_state(struct FFIIRFilterState *state);
+void ff_iir_filter_free_statep(struct FFIIRFilterState **state);
 
 /**
  * Perform IIR filtering on signed 16-bit input samples.
diff --git a/libavcodec/ilbcdata.h b/libavcodec/ilbcdata.h
new file mode 100644
index 0000000..8d145bc
--- /dev/null
+++ b/libavcodec/ilbcdata.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2013, The WebRTC project authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *
+ *   * Neither the name of Google nor the names of its contributors may
+ *     be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef AVCODEC_ILBCDATA_H
+#define AVCODEC_ILBCDATA_H
+
+#include "libavutil/common.h"
+
+static const uint8_t lsf_dim_codebook[] = { 3, 3, 4 };
+static const uint8_t lsf_size_codebook[] = { 64, 128, 128 };
+static const int16_t lsf_weight_20ms[] = { 12288, 8192, 4096, 0 };
+static const int16_t lsf_weight_30ms[] = { 8192, 16384, 10923, 5461, 0, 0 };
+
+static const int16_t hp_out_coeffs[] = { 3849, -7699, 3849, 7918, -3833 };
+
+static const int16_t kPlcPfSlope[] = { 26667, 18729, 13653, 10258, 7901, 6214 };
+
+static const int16_t kPlcPitchFact[] = { 0, 5462, 10922, 16384, 21846, 27306 };
+
+static const int16_t kCbFiltersRev[] = {
+    -140, 446, -755, 3302, 2922, -590, 343, -138
+};
+
+static const int16_t kPlcPerSqr[] = { 839, 1343, 2048, 2998, 4247, 5849 };
+
+static const int16_t alpha[] = {
+    6554, 13107, 19661, 26214
+};
+
+static const int16_t kLpcChirpSyntDenum[] = {
+    32767, 29573, 26690, 24087, 21739, 19619, 17707, 15980, 14422, 13016, 11747
+};
+
+static const int16_t LpcChirpWeightDenum[] = {
+    32767, 13835, 5841, 2466, 1041, 440, 186, 78,  33,  14,  6
+};
+
+static const int16_t cos_tbl[64] = {
+    32767,  32729,  32610,  32413,  32138,  31786,  31357,   30853,
+    30274,  29622,  28899,  28106,  27246,  26320,  25330,   24279,
+    23170,  22006,  20788,  19520,  18205,  16846,  15447,   14010,
+    12540,  11039,   9512,   7962,   6393,   4808,   3212,    1608,
+    0,      -1608,  -3212,  -4808,  -6393,  -7962,  -9512,  -11039,
+    -12540, -14010, -15447, -16846, -18205, -19520, -20788, -22006,
+    -23170, -24279, -25330, -26320, -27246, -28106, -28899, -29622,
+    -30274, -30853, -31357, -31786, -32138, -32413, -32610, -32729,
+};
+
+static const int16_t cos_derivative_tbl[64] = {
+    -632,  -1893,  -3150,  -4399,  -5638,  -6863,  -8072,  -9261,
+    -10428, -11570, -12684, -13767, -14817, -15832, -16808, -17744,
+    -18637, -19486, -20287, -21039, -21741, -22390, -22986, -23526,
+    -24009, -24435, -24801, -25108, -25354, -25540, -25664, -25726,
+    -25726, -25664, -25540, -25354, -25108, -24801, -24435, -24009,
+    -23526, -22986, -22390, -21741, -21039, -20287, -19486, -18637,
+    -17744, -16808, -15832, -14817, -13767, -12684, -11570, -10428,
+    -9261,  -8072,  -6863,  -5638,  -4399,  -3150,  -1893,   -632
+};
+
+static const int16_t lsf_codebook[64 * 3 + 128 * 3 + 128 * 4] = {
+    1273, 2238, 3696, 3199, 5309, 8209, 3606, 5671, 7829,
+    2815, 5262, 8778, 2608, 4027, 5493, 1582, 3076, 5945,
+    2983, 4181, 5396, 2437, 4322, 6902, 1861, 2998, 4613,
+    2007, 3250, 5214, 1388, 2459, 4262, 2563, 3805, 5269,
+    2036, 3522, 5129, 1935, 4025, 6694, 2744, 5121, 7338,
+    2810, 4248, 5723, 3054, 5405, 7745, 1449, 2593, 4763,
+    3411, 5128, 6596, 2484, 4659, 7496, 1668, 2879, 4818,
+    1812, 3072, 5036, 1638, 2649, 3900, 2464, 3550, 4644,
+    1853, 2900, 4158, 2458, 4163, 5830, 2556, 4036, 6254,
+    2703, 4432, 6519, 3062, 4953, 7609, 1725, 3703, 6187,
+    2221, 3877, 5427, 2339, 3579, 5197, 2021, 4633, 7037,
+    2216, 3328, 4535, 2961, 4739, 6667, 2807, 3955, 5099,
+    2788, 4501, 6088, 1642, 2755, 4431, 3341, 5282, 7333,
+    2414, 3726, 5727, 1582, 2822, 5269, 2259, 3447, 4905,
+    3117, 4986, 7054, 1825, 3491, 5542, 3338, 5736, 8627,
+    1789, 3090, 5488, 2566, 3720, 4923, 2846, 4682, 7161,
+    1950, 3321, 5976, 1834, 3383, 6734, 3238, 4769, 6094,
+    2031, 3978, 5903, 1877, 4068, 7436, 2131, 4644, 8296,
+    2764, 5010, 8013, 2194, 3667, 6302, 2053, 3127, 4342,
+    3523, 6595, 10010, 3134, 4457, 5748, 3142, 5819, 9414,
+    2223, 4334, 6353, 2022, 3224, 4822, 2186, 3458, 5544,
+    2552, 4757, 6870, 10905, 12917, 14578, 9503, 11485, 14485,
+    9518, 12494, 14052, 6222, 7487, 9174, 7759, 9186, 10506,
+    8315, 12755, 14786, 9609, 11486, 13866, 8909, 12077, 13643,
+    7369, 9054, 11520, 9408, 12163, 14715, 6436, 9911, 12843,
+    7109, 9556, 11884, 7557, 10075, 11640, 6482, 9202, 11547,
+    6463, 7914, 10980, 8611, 10427, 12752, 7101, 9676, 12606,
+    7428, 11252, 13172, 10197, 12955, 15842, 7487, 10955, 12613,
+    5575, 7858, 13621, 7268, 11719, 14752, 7476, 11744, 13795,
+    7049, 8686, 11922, 8234, 11314, 13983, 6560, 11173, 14984,
+    6405, 9211, 12337, 8222, 12054, 13801, 8039, 10728, 13255,
+    10066, 12733, 14389, 6016, 7338, 10040, 6896, 8648, 10234,
+    7538, 9170, 12175, 7327, 12608, 14983, 10516, 12643, 15223,
+    5538, 7644, 12213, 6728, 12221, 14253, 7563, 9377, 12948,
+    8661, 11023, 13401, 7280, 8806, 11085, 7723, 9793, 12333,
+    12225, 14648, 16709, 8768, 13389, 15245, 10267, 12197, 13812,
+    5301, 7078, 11484, 7100, 10280, 11906, 8716, 12555, 14183,
+    9567, 12464, 15434, 7832, 12305, 14300, 7608, 10556, 12121,
+    8913, 11311, 12868, 7414, 9722, 11239, 8666, 11641, 13250,
+    9079, 10752, 12300, 8024, 11608, 13306, 10453, 13607, 16449,
+    8135, 9573, 10909, 6375, 7741, 10125, 10025, 12217, 14874,
+    6985, 11063, 14109, 9296, 13051, 14642, 8613, 10975, 12542,
+    6583, 10414, 13534, 6191, 9368, 13430, 5742, 6859, 9260,
+    7723, 9813, 13679, 8137, 11291, 12833, 6562, 8973, 10641,
+    6062, 8462, 11335, 6928, 8784, 12647, 7501, 8784, 10031,
+    8372, 10045, 12135, 8191, 9864, 12746, 5917, 7487, 10979,
+    5516, 6848, 10318, 6819, 9899, 11421, 7882, 12912, 15670,
+    9558, 11230, 12753, 7752, 9327, 11472, 8479, 9980, 11358,
+    11418, 14072, 16386, 7968, 10330, 14423, 8423, 10555, 12162,
+    6337, 10306, 14391, 8850, 10879, 14276, 6750, 11885, 15710,
+    7037, 8328, 9764, 6914, 9266, 13476, 9746, 13949, 15519,
+    11032, 14444, 16925, 8032, 10271, 11810, 10962, 13451, 15833,
+    10021, 11667, 13324, 6273, 8226, 12936, 8543, 10397, 13496,
+    7936, 10302, 12745, 6769, 8138, 10446, 6081, 7786, 11719,
+    8637, 11795, 14975, 8790, 10336, 11812, 7040, 8490, 10771,
+    7338, 10381, 13153, 6598, 7888, 9358, 6518, 8237, 12030,
+    9055, 10763, 12983, 6490, 10009, 12007, 9589, 12023, 13632,
+    6867, 9447, 10995, 7930, 9816, 11397, 10241, 13300, 14939,
+    5830, 8670, 12387, 9870, 11915, 14247, 9318, 11647, 13272,
+    6721, 10836, 12929, 6543, 8233, 9944, 8034, 10854, 12394,
+    9112, 11787, 14218, 9302, 11114, 13400, 9022, 11366, 13816,
+    6962, 10461, 12480, 11288, 13333, 15222, 7249, 8974, 10547,
+    10566, 12336, 14390, 6697, 11339, 13521, 11851, 13944, 15826,
+    6847, 8381, 11349, 7509, 9331, 10939, 8029, 9618, 11909,
+    13973, 17644, 19647, 22474, 14722, 16522, 20035, 22134, 16305, 18179, 21106, 23048,
+    15150, 17948, 21394, 23225, 13582, 15191, 17687, 22333, 11778, 15546, 18458, 21753,
+    16619, 18410, 20827, 23559, 14229, 15746, 17907, 22474, 12465, 15327, 20700, 22831,
+    15085, 16799, 20182, 23410, 13026, 16935, 19890, 22892, 14310, 16854, 19007, 22944,
+    14210, 15897, 18891, 23154, 14633, 18059, 20132, 22899, 15246, 17781, 19780, 22640,
+    16396, 18904, 20912, 23035, 14618, 17401, 19510, 21672, 15473, 17497, 19813, 23439,
+    18851, 20736, 22323, 23864, 15055, 16804, 18530, 20916, 16490, 18196, 19990, 21939,
+    11711, 15223, 21154, 23312, 13294, 15546, 19393, 21472, 12956, 16060, 20610, 22417,
+    11628, 15843, 19617, 22501, 14106, 16872, 19839, 22689, 15655, 18192, 20161, 22452,
+    12953, 15244, 20619, 23549, 15322, 17193, 19926, 21762, 16873, 18676, 20444, 22359,
+    14874, 17871, 20083, 21959, 11534, 14486, 19194, 21857, 17766, 19617, 21338, 23178,
+    13404, 15284, 19080, 23136, 15392, 17527, 19470, 21953, 14462, 16153, 17985, 21192,
+    17734, 19750, 21903, 23783, 16973, 19096, 21675, 23815, 16597, 18936, 21257, 23461,
+    15966, 17865, 20602, 22920, 15416, 17456, 20301, 22972, 18335, 20093, 21732, 23497,
+    15548, 17217, 20679, 23594, 15208, 16995, 20816, 22870, 13890, 18015, 20531, 22468,
+    13211, 15377, 19951, 22388, 12852, 14635, 17978, 22680, 16002, 17732, 20373, 23544,
+    11373, 14134, 19534, 22707, 17329, 19151, 21241, 23462, 15612, 17296, 19362, 22850,
+    15422, 19104, 21285, 23164, 13792, 17111, 19349, 21370, 15352, 17876, 20776, 22667,
+    15253, 16961, 18921, 22123, 14108, 17264, 20294, 23246, 15785, 17897, 20010, 21822,
+    17399, 19147, 20915, 22753, 13010, 15659, 18127, 20840, 16826, 19422, 22218, 24084,
+    18108, 20641, 22695, 24237, 18018, 20273, 22268, 23920, 16057, 17821, 21365, 23665,
+    16005, 17901, 19892, 23016, 13232, 16683, 21107, 23221, 13280, 16615, 19915, 21829,
+    14950, 18575, 20599, 22511, 16337, 18261, 20277, 23216, 14306, 16477, 21203, 23158,
+    12803, 17498, 20248, 22014, 14327, 17068, 20160, 22006, 14402, 17461, 21599, 23688,
+    16968, 18834, 20896, 23055, 15070, 17157, 20451, 22315, 15419, 17107, 21601, 23946,
+    16039, 17639, 19533, 21424, 16326, 19261, 21745, 23673, 16489, 18534, 21658, 23782,
+    16594, 18471, 20549, 22807, 18973, 21212, 22890, 24278, 14264, 18674, 21123, 23071,
+    15117, 16841, 19239, 23118, 13762, 15782, 20478, 23230, 14111, 15949, 20058, 22354,
+    14990, 16738, 21139, 23492, 13735, 16971, 19026, 22158, 14676, 17314, 20232, 22807,
+    16196, 18146, 20459, 22339, 14747, 17258, 19315, 22437, 14973, 17778, 20692, 23367,
+    15715, 17472, 20385, 22349, 15702, 18228, 20829, 23410, 14428, 16188, 20541, 23630,
+    16824, 19394, 21365, 23246, 13069, 16392, 18900, 21121, 12047, 16640, 19463, 21689,
+    14757, 17433, 19659, 23125, 15185, 16930, 19900, 22540, 16026, 17725, 19618, 22399,
+    16086, 18643, 21179, 23472, 15462, 17248, 19102, 21196, 17368, 20016, 22396, 24096,
+    12340, 14475, 19665, 23362, 13636, 16229, 19462, 22728, 14096, 16211, 19591, 21635,
+    12152, 14867, 19943, 22301, 14492, 17503, 21002, 22728, 14834, 16788, 19447, 21411,
+    14650, 16433, 19326, 22308, 14624, 16328, 19659, 23204, 13888, 16572, 20665, 22488,
+    12977, 16102, 18841, 22246, 15523, 18431, 21757, 23738, 14095, 16349, 18837, 20947,
+    13266, 17809, 21088, 22839, 15427, 18190, 20270, 23143, 11859, 16753, 20935, 22486,
+    12310, 17667, 21736, 23319, 14021, 15926, 18702, 22002, 12286, 15299, 19178, 21126,
+    15703, 17491, 21039, 23151, 12272, 14018, 18213, 22570, 14817, 16364, 18485, 22598,
+    17109, 19683, 21851, 23677, 12657, 14903, 19039, 22061, 14713, 16487, 20527, 22814,
+    14635, 16726, 18763, 21715, 15878, 18550, 20718, 22906
+};
+
+static const int16_t gain3[9]={
+    -16384, -10813, -5407, 0, 4096, 8192, 12288, 16384, 32767
+};
+
+static const int16_t gain4[17]={
+    -17203, -14746, -12288, -9830, -7373, -4915, -2458, 0, 2458, 4915, 7373, 9830,
+    12288, 14746, 17203, 19661, 32767
+};
+
+static const int16_t gain5[33]={
+    614,   1229,  1843,  2458,  3072,  3686,
+    4301,  4915,  5530,  6144,  6758,  7373,
+    7987,  8602,  9216,  9830,  10445, 11059,
+    11674, 12288, 12902, 13517, 14131, 14746,
+    15360, 15974, 16589, 17203, 17818, 18432,
+    19046, 19661, 32767
+};
+
+static const int16_t *const ilbc_gain[] = {
+    gain5, gain4, gain3,
+};
+
+static const int16_t ilbc_state[8] = {
+   -30473, -17838, -9257, -2537, 3639, 10893, 19958, 32636
+};
+
+static const int16_t frg_quant_mod[64] = {
+    /* First 37 values in Q8 */
+    569, 671, 786, 916, 1077, 1278,
+    1529, 1802, 2109, 2481, 2898, 3440,
+    3943, 4535, 5149, 5778, 6464, 7208,
+    7904, 8682, 9397, 10285, 11240, 12246,
+    13313, 14382, 15492, 16735, 18131, 19693,
+    21280, 22912, 24624, 26544, 28432, 30488,
+    32720,
+    /* 22 values in Q5 */
+    4383, 4684, 5012, 5363, 5739, 6146,
+    6603, 7113, 7679, 8285, 9040, 9850,
+    10838, 11882, 13103, 14467, 15950, 17669,
+    19712, 22016, 24800, 28576,
+    /* 5 values in Q3 */
+    8240, 9792, 12040, 15440, 22472
+};
+
+#endif /* AVCODEC_ILBCDATA_H */
diff --git a/libavcodec/ilbcdec.c b/libavcodec/ilbcdec.c
new file mode 100644
index 0000000..bba83a5
--- /dev/null
+++ b/libavcodec/ilbcdec.c
@@ -0,0 +1,1489 @@
+/*
+ * Copyright (c) 2013, The WebRTC project authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *
+ *   * Neither the name of Google nor the names of its contributors may
+ *     be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "ilbcdata.h"
+
+#define LPC_N_20MS            1
+#define LPC_N_30MS            2
+#define LPC_N_MAX             2
+#define LSF_NSPLIT            3
+#define NASUB_MAX             4
+#define LPC_FILTERORDER       10
+#define NSUB_MAX              6
+#define SUBL                  40
+
+#define ST_MEM_L_TBL          85
+#define MEM_LF_TBL            147
+#define STATE_SHORT_LEN_20MS  57
+#define STATE_SHORT_LEN_30MS  58
+
+#define BLOCKL_MAX            240
+#define CB_MEML               147
+#define CB_NSTAGES            3
+#define CB_HALFFILTERLEN      4
+#define CB_FILTERLEN          8
+
+#define ENH_NBLOCKS_TOT 8
+#define ENH_BLOCKL     80
+#define ENH_BUFL     (ENH_NBLOCKS_TOT)*ENH_BLOCKL
+#define ENH_BUFL_FILTEROVERHEAD  3
+#define BLOCKL_MAX      240
+#define NSUB_20MS         4
+#define NSUB_30MS         6
+#define NSUB_MAX          6
+#define NASUB_20MS        2
+#define NASUB_30MS        4
+#define NASUB_MAX         4
+#define STATE_LEN        80
+#define STATE_SHORT_LEN_30MS  58
+#define STATE_SHORT_LEN_20MS  57
+
+#define SPL_MUL_16_16(a, b) ((int32_t) (((int16_t)(a)) * ((int16_t)(b))))
+#define SPL_MUL_16_16_RSFT(a, b, c) (SPL_MUL_16_16(a, b) >> (c))
+
+typedef struct ILBCFrame {
+    int16_t  lsf[LSF_NSPLIT*LPC_N_MAX];
+    int16_t  cb_index[CB_NSTAGES*(NASUB_MAX + 1)];
+    int16_t  gain_index[CB_NSTAGES*(NASUB_MAX + 1)];
+    int16_t  ifm;
+    int16_t  state_first;
+    int16_t  idx[STATE_SHORT_LEN_30MS];
+    int16_t  firstbits;
+    int16_t  start;
+} ILBCFrame;
+
+typedef struct ILBCContext {
+    AVClass         *class;
+    int              enhancer;
+
+    int              mode;
+    GetBitContext    gb;
+    ILBCFrame        frame;
+
+    int              prev_enh_pl;
+    int              consPLICount;
+    int              last_lag;
+    int              state_short_len;
+    int              lpc_n;
+    int16_t          nasub;
+    int16_t          nsub;
+    int              block_samples;
+    int16_t          no_of_words;
+    int16_t          no_of_bytes;
+    int16_t          lsfdeq[LPC_FILTERORDER*LPC_N_MAX];
+    int16_t          lsfold[LPC_FILTERORDER];
+    int16_t          syntMem[LPC_FILTERORDER];
+    int16_t          lsfdeqold[LPC_FILTERORDER];
+    int16_t          weightdenum[(LPC_FILTERORDER + 1) * NSUB_MAX];
+    int16_t          syntdenum[NSUB_MAX * (LPC_FILTERORDER + 1)];
+    int16_t          old_syntdenum[NSUB_MAX * (LPC_FILTERORDER + 1)];
+    int16_t          enh_buf[ENH_BUFL+ENH_BUFL_FILTEROVERHEAD];
+    int16_t          enh_period[ENH_NBLOCKS_TOT];
+    int16_t          prevResidual[NSUB_MAX*SUBL];
+    int16_t          decresidual[BLOCKL_MAX];
+    int16_t          plc_residual[BLOCKL_MAX + LPC_FILTERORDER];
+    int16_t          seed;
+    int16_t          prevPLI;
+    int16_t          prevScale;
+    int16_t          prevLag;
+    int16_t          per_square;
+    int16_t          prev_lpc[LPC_FILTERORDER + 1];
+    int16_t          plc_lpc[LPC_FILTERORDER + 1];
+    int16_t          hpimemx[2];
+    int16_t          hpimemy[4];
+} ILBCContext;
+
+static int unpack_frame(ILBCContext *s)
+{
+    ILBCFrame *frame = &s->frame;
+    GetBitContext *gb = &s->gb;
+    int j;
+
+    frame->lsf[0] = get_bits(gb, 6);
+    frame->lsf[1] = get_bits(gb, 7);
+    frame->lsf[2] = get_bits(gb, 7);
+
+    if (s->mode == 20) {
+        frame->start          = get_bits(gb, 2);
+        frame->state_first    = get_bits1(gb);
+        frame->ifm            = get_bits(gb, 6);
+        frame->cb_index[0]    = get_bits(gb, 6) << 1;
+        frame->gain_index[0]  = get_bits(gb, 2) << 3;
+        frame->gain_index[1]  = get_bits1(gb) << 3;
+        frame->cb_index[3]    = get_bits(gb, 7) << 1;
+        frame->gain_index[3]  = get_bits1(gb) << 4;
+        frame->gain_index[4]  = get_bits1(gb) << 3;
+        frame->gain_index[6]  = get_bits1(gb) << 4;
+    } else {
+        frame->lsf[3]         = get_bits(gb, 6);
+        frame->lsf[4]         = get_bits(gb, 7);
+        frame->lsf[5]         = get_bits(gb, 7);
+        frame->start          = get_bits(gb, 3);
+        frame->state_first    = get_bits1(gb);
+        frame->ifm            = get_bits(gb, 6);
+        frame->cb_index[0]    = get_bits(gb, 4) << 3;
+        frame->gain_index[0]  = get_bits1(gb) << 4;
+        frame->gain_index[1]  = get_bits1(gb) << 3;
+        frame->cb_index[3]    = get_bits(gb, 6) << 2;
+        frame->gain_index[3]  = get_bits1(gb) << 4;
+        frame->gain_index[4]  = get_bits1(gb) << 3;
+    }
+
+    for (j = 0; j < 48; j++)
+        frame->idx[j] = get_bits1(gb) << 2;
+
+    if (s->mode == 20) {
+        for (; j < 57; j++)
+            frame->idx[j] = get_bits1(gb) << 2;
+
+        frame->gain_index[1] |= get_bits1(gb) << 2;
+        frame->gain_index[3] |= get_bits(gb, 2) << 2;
+        frame->gain_index[4] |= get_bits1(gb) << 2;
+        frame->gain_index[6] |= get_bits1(gb) << 3;
+        frame->gain_index[7]  = get_bits(gb, 2) << 2;
+    } else {
+        for (; j < 58; j++)
+            frame->idx[j] = get_bits1(gb) << 2;
+
+        frame->cb_index[0]    |= get_bits(gb, 2) << 1;
+        frame->gain_index[0]  |= get_bits1(gb) << 3;
+        frame->gain_index[1]  |= get_bits1(gb) << 2;
+        frame->cb_index[3]    |= get_bits1(gb) << 1;
+        frame->cb_index[6]     = get_bits1(gb) << 7;
+        frame->cb_index[6]    |= get_bits(gb, 6) << 1;
+        frame->cb_index[9]     = get_bits(gb, 7) << 1;
+        frame->cb_index[12]    = get_bits(gb, 3) << 5;
+        frame->cb_index[12]   |= get_bits(gb, 4) << 1;
+        frame->gain_index[3]  |= get_bits(gb, 2) << 2;
+        frame->gain_index[4]  |= get_bits(gb, 2) << 1;
+        frame->gain_index[6]   = get_bits(gb, 2) << 3;
+        frame->gain_index[7]   = get_bits(gb, 2) << 2;
+        frame->gain_index[9]   = get_bits1(gb) << 4;
+        frame->gain_index[10]  = get_bits1(gb) << 3;
+        frame->gain_index[12]  = get_bits1(gb) << 4;
+        frame->gain_index[13]  = get_bits1(gb) << 3;
+    }
+
+    for (j = 0; j < 56; j++)
+        frame->idx[j] |= get_bits(gb, 2);
+
+    if (s->mode == 20) {
+        frame->idx[56]        |= get_bits(gb, 2);
+        frame->cb_index[0]    |= get_bits1(gb);
+        frame->cb_index[1]     = get_bits(gb, 7);
+        frame->cb_index[2]     = get_bits(gb, 6) << 1;
+        frame->cb_index[2]    |= get_bits1(gb);
+        frame->gain_index[0]  |= get_bits(gb, 3);
+        frame->gain_index[1]  |= get_bits(gb, 2);
+        frame->gain_index[2]   = get_bits(gb, 3);
+        frame->cb_index[3]    |= get_bits1(gb);
+        frame->cb_index[4]     = get_bits(gb, 6) << 1;
+        frame->cb_index[4]    |= get_bits1(gb);
+        frame->cb_index[5]     = get_bits(gb, 7);
+        frame->cb_index[6]     = get_bits(gb, 8);
+        frame->cb_index[7]     = get_bits(gb, 8);
+        frame->cb_index[8]     = get_bits(gb, 8);
+        frame->gain_index[3]  |= get_bits(gb, 2);
+        frame->gain_index[4]  |= get_bits(gb, 2);
+        frame->gain_index[5]   = get_bits(gb, 3);
+        frame->gain_index[6]  |= get_bits(gb, 3);
+        frame->gain_index[7]  |= get_bits(gb, 2);
+        frame->gain_index[8]   = get_bits(gb, 3);
+    } else {
+        frame->idx[56]        |= get_bits(gb, 2);
+        frame->idx[57]        |= get_bits(gb, 2);
+        frame->cb_index[0]    |= get_bits1(gb);
+        frame->cb_index[1]     = get_bits(gb, 7);
+        frame->cb_index[2]     = get_bits(gb, 4) << 3;
+        frame->cb_index[2]    |= get_bits(gb, 3);
+        frame->gain_index[0]  |= get_bits(gb, 3);
+        frame->gain_index[1]  |= get_bits(gb, 2);
+        frame->gain_index[2]   = get_bits(gb, 3);
+        frame->cb_index[3]    |= get_bits1(gb);
+        frame->cb_index[4]     = get_bits(gb, 4) << 3;
+        frame->cb_index[4]    |= get_bits(gb, 3);
+        frame->cb_index[5]     = get_bits(gb, 7);
+        frame->cb_index[6]    |= get_bits1(gb);
+        frame->cb_index[7]     = get_bits(gb, 5) << 3;
+        frame->cb_index[7]    |= get_bits(gb, 3);
+        frame->cb_index[8]     = get_bits(gb, 8);
+        frame->cb_index[9]    |= get_bits1(gb);
+        frame->cb_index[10]    = get_bits(gb, 4) << 4;
+        frame->cb_index[10]   |= get_bits(gb, 4);
+        frame->cb_index[11]    = get_bits(gb, 8);
+        frame->cb_index[12]   |= get_bits1(gb);
+        frame->cb_index[13]    = get_bits(gb, 3) << 5;
+        frame->cb_index[13]   |= get_bits(gb, 5);
+        frame->cb_index[14]    = get_bits(gb, 8);
+        frame->gain_index[3]  |= get_bits(gb, 2);
+        frame->gain_index[4]  |= get_bits1(gb);
+        frame->gain_index[5]   = get_bits(gb, 3);
+        frame->gain_index[6]  |= get_bits(gb, 3);
+        frame->gain_index[7]  |= get_bits(gb, 2);
+        frame->gain_index[8]   = get_bits(gb, 3);
+        frame->gain_index[9]  |= get_bits(gb, 4);
+        frame->gain_index[10] |= get_bits1(gb) << 2;
+        frame->gain_index[10] |= get_bits(gb, 2);
+        frame->gain_index[11]  = get_bits(gb, 3);
+        frame->gain_index[12] |= get_bits(gb, 4);
+        frame->gain_index[13] |= get_bits(gb, 3);
+        frame->gain_index[14]  = get_bits(gb, 3);
+    }
+
+    return get_bits1(gb);
+}
+
+static void index_conv(int16_t *index)
+{
+    int k;
+
+    for (k = 4; k < 6; k++) {
+        if (index[k] >= 44 && index[k] < 108) {
+            index[k] += 64;
+        } else if (index[k] >= 108 && index[k] < 128) {
+            index[k] += 128;
+        }
+    }
+}
+
+static void lsf_dequantization(int16_t *lsfdeq, int16_t *index, int16_t lpc_n)
+{
+    int i, j, pos = 0, cb_pos = 0;
+
+    for (i = 0; i < LSF_NSPLIT; i++) {
+        for (j = 0; j < lsf_dim_codebook[i]; j++) {
+            lsfdeq[pos + j] = lsf_codebook[cb_pos + index[i] * lsf_dim_codebook[i] + j];
+        }
+
+        pos    += lsf_dim_codebook[i];
+        cb_pos += lsf_size_codebook[i] * lsf_dim_codebook[i];
+    }
+
+    if (lpc_n > 1) {
+        pos = 0;
+        cb_pos = 0;
+        for (i = 0; i < LSF_NSPLIT; i++) {
+            for (j = 0; j < lsf_dim_codebook[i]; j++) {
+                lsfdeq[LPC_FILTERORDER + pos + j] = lsf_codebook[cb_pos +
+                    index[LSF_NSPLIT + i] * lsf_dim_codebook[i] + j];
+            }
+
+            pos    += lsf_dim_codebook[i];
+            cb_pos += lsf_size_codebook[i] * lsf_dim_codebook[i];
+        }
+    }
+}
+
+static void lsf_check_stability(int16_t *lsf, int dim, int nb_vectors)
+{
+    for (int n = 0; n < 2; n++) {
+        for (int m = 0; m < nb_vectors; m++) {
+            for (int k = 0; k < dim - 1; k++) {
+                int i = m * dim + k;
+
+                if ((lsf[i + 1] - lsf[i]) < 319) {
+                    if (lsf[i + 1] < lsf[i]) {
+                        lsf[i + 1] = lsf[i] + 160;
+                        lsf[i]     = lsf[i + 1] - 160;
+                    } else {
+                        lsf[i]     -= 160;
+                        lsf[i + 1] += 160;
+                    }
+                }
+
+                lsf[i] = av_clip(lsf[i], 82, 25723);
+            }
+        }
+    }
+}
+
+static void lsf_interpolate(int16_t *out, int16_t *in1,
+                            int16_t *in2, int16_t coef,
+                            int size)
+{
+    int invcoef = 16384 - coef, i;
+
+    for (i = 0; i < size; i++)
+        out[i] = (coef * in1[i] + invcoef * in2[i] + 8192) >> 14;
+}
+
+static void lsf2lsp(int16_t *lsf, int16_t *lsp, int order)
+{
+    int16_t diff, freq;
+    int32_t tmp;
+    int i, k;
+
+    for (i = 0; i < order; i++) {
+        freq = (lsf[i] * 20861) >> 15;
+        /* 20861: 1.0/(2.0*PI) in Q17 */
+        /*
+           Upper 8 bits give the index k and
+           Lower 8 bits give the difference, which needs
+           to be approximated linearly
+         */
+        k = FFMIN(freq >> 8, 63);
+        diff = freq & 0xFF;
+
+        /* Calculate linear approximation */
+        tmp = cos_derivative_tbl[k] * diff;
+        lsp[i] = cos_tbl[k] + (tmp >> 12);
+    }
+}
+
+static void get_lsp_poly(int16_t *lsp, int32_t *f)
+{
+    int16_t high, low;
+    int i, j, k, l;
+    int32_t tmp;
+
+    f[0] = 16777216;
+    f[1] = lsp[0] * -1024;
+
+    for (i = 2, k = 2, l = 2; i <= 5; i++, k += 2) {
+        f[l] = f[l - 2];
+
+        for (j = i; j > 1; j--, l--) {
+            high = f[l - 1] >> 16;
+            low = (f[l - 1] - (high * (1 << 16))) >> 1;
+
+            tmp = ((high * lsp[k]) * 4) + (((low * lsp[k]) >> 15) * 4);
+
+            f[l] += f[l - 2];
+            f[l] -= (unsigned)tmp;
+        }
+
+        f[l] -= lsp[k] * (1 << 10);
+        l += i;
+    }
+}
+
+static void lsf2poly(int16_t *a, int16_t *lsf)
+{
+    int32_t f[2][6];
+    int16_t lsp[10];
+    int32_t tmp;
+    int i;
+
+    lsf2lsp(lsf, lsp, LPC_FILTERORDER);
+
+    get_lsp_poly(&lsp[0], f[0]);
+    get_lsp_poly(&lsp[1], f[1]);
+
+    for (i = 5; i > 0; i--) {
+        f[0][i] += (unsigned)f[0][i - 1];
+        f[1][i] -= (unsigned)f[1][i - 1];
+    }
+
+    a[0] = 4096;
+    for (i = 5; i > 0; i--) {
+        tmp = f[0][6 - i] + (unsigned)f[1][6 - i] + 4096;
+        a[6 - i] = tmp >> 13;
+
+        tmp = f[0][6 - i] - (unsigned)f[1][6 - i] + 4096;
+        a[5 + i] = tmp >> 13;
+    }
+}
+
+static void lsp_interpolate2polydec(int16_t *a, int16_t *lsf1,
+                                   int16_t *lsf2, int coef, int length)
+{
+    int16_t lsftmp[LPC_FILTERORDER];
+
+    lsf_interpolate(lsftmp, lsf1, lsf2, coef, length);
+    lsf2poly(a, lsftmp);
+}
+
+static void bw_expand(int16_t *out, const int16_t *in, const int16_t *coef, int length)
+{
+    int i;
+
+    out[0] = in[0];
+    for (i = 1; i < length; i++)
+        out[i] = (coef[i] * in[i] + 16384) >> 15;
+}
+
+static void lsp_interpolate(int16_t *syntdenum, int16_t *weightdenum,
+                            int16_t *lsfdeq, int16_t length,
+                            ILBCContext *s)
+{
+    int16_t lp[LPC_FILTERORDER + 1], *lsfdeq2;
+    int i, pos, lp_length;
+
+    lsfdeq2 = lsfdeq + length;
+    lp_length = length + 1;
+
+    if (s->mode == 30) {
+        lsp_interpolate2polydec(lp, (*s).lsfdeqold, lsfdeq, lsf_weight_30ms[0], length);
+        memcpy(syntdenum, lp, lp_length * 2);
+        bw_expand(weightdenum, lp, kLpcChirpSyntDenum, lp_length);
+
+        pos = lp_length;
+        for (i = 1; i < 6; i++) {
+            lsp_interpolate2polydec(lp, lsfdeq, lsfdeq2,
+                                                 lsf_weight_30ms[i],
+                                                 length);
+            memcpy(syntdenum + pos, lp, lp_length * 2);
+            bw_expand(weightdenum + pos, lp, kLpcChirpSyntDenum, lp_length);
+            pos += lp_length;
+        }
+    } else {
+        pos = 0;
+        for (i = 0; i < s->nsub; i++) {
+            lsp_interpolate2polydec(lp, s->lsfdeqold, lsfdeq,
+                                    lsf_weight_20ms[i], length);
+            memcpy(syntdenum + pos, lp, lp_length * 2);
+            bw_expand(weightdenum + pos, lp, kLpcChirpSyntDenum, lp_length);
+            pos += lp_length;
+        }
+    }
+
+    if (s->mode == 30) {
+        memcpy(s->lsfdeqold, lsfdeq2, length * 2);
+    } else {
+        memcpy(s->lsfdeqold, lsfdeq, length * 2);
+    }
+}
+
+static void filter_mafq12(int16_t *in_ptr, int16_t *out_ptr,
+                          int16_t *B, int16_t B_length,
+                          int16_t length)
+{
+    int o, i, j;
+
+    for (i = 0; i < length; i++) {
+        const int16_t *b_ptr = &B[0];
+        const int16_t *x_ptr = &in_ptr[i];
+
+        o = 0;
+        for (j = 0; j < B_length; j++)
+            o += b_ptr[j] * *x_ptr--;
+
+        o = av_clip(o, -134217728, 134215679);
+
+        out_ptr[i] = ((o + 2048) >> 12);
+    }
+}
+
+static void filter_arfq12(const int16_t *data_in,
+                          int16_t *data_out,
+                          const int16_t *coefficients,
+                          int coefficients_length,
+                          int data_length)
+{
+    int i, j;
+
+    for (i = 0; i < data_length; i++) {
+        int output = 0, sum = 0;
+
+        for (j = coefficients_length - 1; j > 0; j--) {
+            sum += (unsigned)(coefficients[j] * data_out[i - j]);
+        }
+
+        output = coefficients[0] * data_in[i] - (unsigned)sum;
+        output = av_clip(output, -134217728, 134215679);
+
+        data_out[i] = (output + 2048) >> 12;
+    }
+}
+
+static void state_construct(int16_t ifm, int16_t *idx,
+                           int16_t *synt_denum, int16_t *Out_fix,
+                           int16_t len)
+{
+    int k;
+    int16_t maxVal;
+    int16_t *tmp1, *tmp2, *tmp3;
+    /* Stack based */
+    int16_t numerator[1 + LPC_FILTERORDER];
+    int16_t sampleValVec[2 * STATE_SHORT_LEN_30MS + LPC_FILTERORDER];
+    int16_t sampleMaVec[2 * STATE_SHORT_LEN_30MS + LPC_FILTERORDER];
+    int16_t *sampleVal = &sampleValVec[LPC_FILTERORDER];
+    int16_t *sampleMa = &sampleMaVec[LPC_FILTERORDER];
+    int16_t *sampleAr = &sampleValVec[LPC_FILTERORDER];
+
+    /* initialization of coefficients */
+
+    for (k = 0; k < LPC_FILTERORDER + 1; k++) {
+        numerator[k] = synt_denum[LPC_FILTERORDER - k];
+    }
+
+    /* decoding of the maximum value */
+
+    maxVal = frg_quant_mod[ifm];
+
+    /* decoding of the sample values */
+    tmp1 = sampleVal;
+    tmp2 = &idx[len - 1];
+
+    if (ifm < 37) {
+        for (k = 0; k < len; k++) {
+            /*the shifting is due to the Q13 in sq4_fixQ13[i], also the adding of 2097152 (= 0.5 << 22)
+               maxVal is in Q8 and result is in Q(-1) */
+            (*tmp1) = (int16_t) ((SPL_MUL_16_16(maxVal, ilbc_state[(*tmp2)]) + 2097152) >> 22);
+            tmp1++;
+            tmp2--;
+        }
+    } else if (ifm < 59) {
+        for (k = 0; k < len; k++) {
+            /*the shifting is due to the Q13 in sq4_fixQ13[i], also the adding of 262144 (= 0.5 << 19)
+               maxVal is in Q5 and result is in Q(-1) */
+            (*tmp1) = (int16_t) ((SPL_MUL_16_16(maxVal, ilbc_state[(*tmp2)]) + 262144) >> 19);
+            tmp1++;
+            tmp2--;
+        }
+    } else {
+        for (k = 0; k < len; k++) {
+            /*the shifting is due to the Q13 in sq4_fixQ13[i], also the adding of 65536 (= 0.5 << 17)
+               maxVal is in Q3 and result is in Q(-1) */
+            (*tmp1) = (int16_t) ((SPL_MUL_16_16(maxVal, ilbc_state[(*tmp2)]) + 65536) >> 17);
+            tmp1++;
+            tmp2--;
+        }
+    }
+
+    /* Set the rest of the data to zero */
+    memset(&sampleVal[len], 0, len * 2);
+
+    /* circular convolution with all-pass filter */
+
+    /* Set the state to zero */
+    memset(sampleValVec, 0, LPC_FILTERORDER * 2);
+
+    /* Run MA filter + AR filter */
+    filter_mafq12(sampleVal, sampleMa, numerator, LPC_FILTERORDER + 1, len + LPC_FILTERORDER);
+    memset(&sampleMa[len + LPC_FILTERORDER], 0, (len - LPC_FILTERORDER) * 2);
+    filter_arfq12(sampleMa, sampleAr, synt_denum, LPC_FILTERORDER + 1, 2 * len);
+
+    tmp1 = &sampleAr[len - 1];
+    tmp2 = &sampleAr[2 * len - 1];
+    tmp3 = Out_fix;
+    for (k = 0; k < len; k++) {
+        (*tmp3) = (*tmp1) + (*tmp2);
+        tmp1--;
+        tmp2--;
+        tmp3++;
+    }
+}
+
+static int16_t gain_dequantization(int index, int max_in, int stage)
+{
+    int16_t scale = FFMAX(1638, FFABS(max_in));
+
+    return ((scale * ilbc_gain[stage][index]) + 8192) >> 14;
+}
+
+static void vector_rmultiplication(int16_t *out, const int16_t *in,
+                                   const int16_t *win,
+                                   int length, int shift)
+{
+    for (int i = 0; i < length; i++)
+        out[i] = (in[i] * win[-i]) >> shift;
+}
+
+static void vector_multiplication(int16_t *out, const int16_t *in,
+                                  const int16_t *win, int length,
+                                  int shift)
+{
+    for (int i = 0; i < length; i++)
+        out[i] = (in[i] * win[i]) >> shift;
+}
+
+static void add_vector_and_shift(int16_t *out, const int16_t *in1,
+                                 const int16_t *in2, int length,
+                                 int shift)
+{
+    for (int i = 0; i < length; i++)
+        out[i] = (in1[i] + in2[i]) >> shift;
+}
+
+static void create_augmented_vector(int index, int16_t *buffer, int16_t *cbVec)
+{
+    int16_t cbVecTmp[4];
+    int interpolation_length = FFMIN(4, index);
+    int16_t ilow = index - interpolation_length;
+
+    memcpy(cbVec, buffer - index, index * 2);
+
+    vector_multiplication(&cbVec[ilow], buffer - index - interpolation_length, alpha, interpolation_length, 15);
+    vector_rmultiplication(cbVecTmp, buffer - interpolation_length, &alpha[interpolation_length - 1], interpolation_length, 15);
+    add_vector_and_shift(&cbVec[ilow], &cbVec[ilow], cbVecTmp, interpolation_length, 0);
+
+    memcpy(cbVec + index, buffer - index, FFMIN(SUBL - index, index) * sizeof(*cbVec));
+}
+
+static void get_codebook(int16_t * cbvec,   /* (o) Constructed codebook vector */
+                     int16_t * mem,     /* (i) Codebook buffer */
+                     int16_t index,     /* (i) Codebook index */
+                     int16_t lMem,      /* (i) Length of codebook buffer */
+                     int16_t cbveclen   /* (i) Codebook vector length */
+)
+{
+    int16_t k, base_size;
+    int16_t lag;
+    /* Stack based */
+    int16_t tempbuff2[SUBL + 5];
+
+    /* Determine size of codebook sections */
+    base_size = lMem - cbveclen + 1;
+
+    if (cbveclen == SUBL) {
+        base_size += cbveclen / 2;
+    }
+
+    /* No filter -> First codebook section */
+    if (index < lMem - cbveclen + 1) {
+        /* first non-interpolated vectors */
+
+        k = index + cbveclen;
+        /* get vector */
+        memcpy(cbvec, mem + lMem - k, cbveclen * 2);
+    } else if (index < base_size) {
+
+        /* Calculate lag */
+
+        k = (int16_t) SPL_MUL_16_16(2, (index - (lMem - cbveclen + 1))) + cbveclen;
+
+        lag = k / 2;
+
+        create_augmented_vector(lag, mem + lMem, cbvec);
+    } else {
+        int16_t memIndTest;
+
+        /* first non-interpolated vectors */
+
+        if (index - base_size < lMem - cbveclen + 1) {
+
+            /* Set up filter memory, stuff zeros outside memory buffer */
+
+            memIndTest = lMem - (index - base_size + cbveclen);
+
+            memset(mem - CB_HALFFILTERLEN, 0, CB_HALFFILTERLEN * 2);
+            memset(mem + lMem, 0, CB_HALFFILTERLEN * 2);
+
+            /* do filtering to get the codebook vector */
+
+            filter_mafq12(&mem[memIndTest + 4], cbvec, (int16_t *) kCbFiltersRev, CB_FILTERLEN, cbveclen);
+        } else {
+            /* interpolated vectors */
+            /* Stuff zeros outside memory buffer  */
+            memIndTest = lMem - cbveclen - CB_FILTERLEN;
+            memset(mem + lMem, 0, CB_HALFFILTERLEN * 2);
+
+            /* do filtering */
+            filter_mafq12(&mem[memIndTest + 7], tempbuff2, (int16_t *) kCbFiltersRev, CB_FILTERLEN, (int16_t) (cbveclen + 5));
+
+            /* Calculate lag index */
+            lag = (cbveclen << 1) - 20 + index - base_size - lMem - 1;
+
+            create_augmented_vector(lag, tempbuff2 + SUBL + 5, cbvec);
+        }
+    }
+}
+
+static void construct_vector (
+    int16_t *decvector,   /* (o) Decoded vector */
+    int16_t *index,       /* (i) Codebook indices */
+    int16_t *gain_index,  /* (i) Gain quantization indices */
+    int16_t *mem,         /* (i) Buffer for codevector construction */
+    int16_t lMem,         /* (i) Length of buffer */
+    int16_t veclen)
+{
+    int16_t gain[CB_NSTAGES];
+    int16_t cbvec0[SUBL];
+    int16_t cbvec1[SUBL];
+    int16_t cbvec2[SUBL];
+    int32_t a32;
+    int16_t *gainPtr;
+    int j;
+
+    /* gain de-quantization */
+
+    gain[0] = gain_dequantization(gain_index[0], 16384, 0);
+    gain[1] = gain_dequantization(gain_index[1], gain[0], 1);
+    gain[2] = gain_dequantization(gain_index[2], gain[1], 2);
+
+    /* codebook vector construction and construction of total vector */
+
+    /* Stack based */
+    get_codebook(cbvec0, mem, index[0], lMem, veclen);
+    get_codebook(cbvec1, mem, index[1], lMem, veclen);
+    get_codebook(cbvec2, mem, index[2], lMem, veclen);
+
+    gainPtr = &gain[0];
+    for (j = 0; j < veclen; j++) {
+        a32 = SPL_MUL_16_16(*gainPtr++, cbvec0[j]);
+        a32 += SPL_MUL_16_16(*gainPtr++, cbvec1[j]);
+        a32 += (unsigned)SPL_MUL_16_16(*gainPtr, cbvec2[j]);
+        gainPtr -= 2;
+        decvector[j] = (a32 + 8192) >> 14;
+    }
+}
+
+static void reverse_memcpy(int16_t *dest, int16_t *source, int length)
+{
+    int16_t* destPtr = dest;
+    int16_t* sourcePtr = source;
+    int j;
+
+    for (j = 0; j < length; j++)
+        *destPtr-- = *sourcePtr++;
+}
+
+static void decode_residual(ILBCContext *s,
+                            ILBCFrame *encbits,
+                            int16_t *decresidual,
+                            int16_t *syntdenum)
+{
+    int16_t meml_gotten, Nfor, Nback, diff, start_pos;
+    int16_t subcount, subframe;
+    int16_t *reverseDecresidual = s->enh_buf;        /* Reversed decoded data, used for decoding backwards in time (reuse memory in state) */
+    int16_t *memVec = s->prevResidual;
+    int16_t *mem = &memVec[CB_HALFFILTERLEN];   /* Memory for codebook */
+
+    diff = STATE_LEN - s->state_short_len;
+
+    if (encbits->state_first == 1) {
+        start_pos = (encbits->start - 1) * SUBL;
+    } else {
+        start_pos = (encbits->start - 1) * SUBL + diff;
+    }
+
+    /* decode scalar part of start state */
+
+    state_construct(encbits->ifm, encbits->idx, &syntdenum[(encbits->start - 1) * (LPC_FILTERORDER + 1)], &decresidual[start_pos], s->state_short_len);
+
+    if (encbits->state_first) { /* put adaptive part in the end */
+        /* setup memory */
+        memset(mem, 0, (int16_t) (CB_MEML - s->state_short_len) * 2);
+        memcpy(mem + CB_MEML - s->state_short_len, decresidual + start_pos, s->state_short_len * 2);
+
+        /* construct decoded vector */
+
+        construct_vector(&decresidual[start_pos + s->state_short_len], encbits->cb_index, encbits->gain_index, mem + CB_MEML - ST_MEM_L_TBL, ST_MEM_L_TBL, (int16_t) diff);
+
+    } else { /* put adaptive part in the beginning */
+        /* setup memory */
+        meml_gotten = s->state_short_len;
+        reverse_memcpy(mem + CB_MEML - 1, decresidual + start_pos, meml_gotten);
+        memset(mem, 0, (int16_t) (CB_MEML - meml_gotten) * 2);
+
+        /* construct decoded vector */
+        construct_vector(reverseDecresidual, encbits->cb_index, encbits->gain_index, mem + CB_MEML - ST_MEM_L_TBL, ST_MEM_L_TBL, diff);
+
+        /* get decoded residual from reversed vector */
+        reverse_memcpy(&decresidual[start_pos - 1], reverseDecresidual, diff);
+    }
+
+    /* counter for predicted subframes */
+    subcount = 1;
+
+    /* forward prediction of subframes */
+    Nfor = s->nsub - encbits->start - 1;
+
+    if (Nfor > 0) {
+        /* setup memory */
+        memset(mem, 0, (CB_MEML - STATE_LEN) * 2);
+        memcpy(mem + CB_MEML - STATE_LEN, decresidual + (encbits->start - 1) * SUBL, STATE_LEN * 2);
+
+        /* loop over subframes to encode */
+        for (subframe = 0; subframe < Nfor; subframe++) {
+            /* construct decoded vector */
+            construct_vector(&decresidual[(encbits->start + 1 + subframe) * SUBL], encbits->cb_index + subcount * CB_NSTAGES, encbits->gain_index + subcount * CB_NSTAGES, mem, MEM_LF_TBL, SUBL);
+
+            /* update memory */
+            memmove(mem, mem + SUBL, (CB_MEML - SUBL) * sizeof(*mem));
+            memcpy(mem + CB_MEML - SUBL, &decresidual[(encbits->start + 1 + subframe) * SUBL], SUBL * 2);
+
+            subcount++;
+        }
+
+    }
+
+    /* backward prediction of subframes */
+    Nback = encbits->start - 1;
+
+    if (Nback > 0) {
+        /* setup memory */
+        meml_gotten = SUBL * (s->nsub + 1 - encbits->start);
+        if (meml_gotten > CB_MEML) {
+            meml_gotten = CB_MEML;
+        }
+
+        reverse_memcpy(mem + CB_MEML - 1, decresidual + (encbits->start - 1) * SUBL, meml_gotten);
+        memset(mem, 0, (int16_t) (CB_MEML - meml_gotten) * 2);
+
+        /* loop over subframes to decode */
+        for (subframe = 0; subframe < Nback; subframe++) {
+            /* construct decoded vector */
+            construct_vector(&reverseDecresidual[subframe * SUBL], encbits->cb_index + subcount * CB_NSTAGES,
+                        encbits->gain_index + subcount * CB_NSTAGES, mem, MEM_LF_TBL, SUBL);
+
+            /* update memory */
+            memmove(mem, mem + SUBL, (CB_MEML - SUBL) * sizeof(*mem));
+            memcpy(mem + CB_MEML - SUBL, &reverseDecresidual[subframe * SUBL], SUBL * 2);
+
+            subcount++;
+        }
+
+        /* get decoded residual from reversed vector */
+        reverse_memcpy(decresidual + SUBL * Nback - 1, reverseDecresidual, SUBL * Nback);
+    }
+}
+
+static int16_t max_abs_value_w16(const int16_t* vector, int length)
+{
+    int i = 0, absolute = 0, maximum = 0;
+
+    if (vector == NULL || length <= 0) {
+        return -1;
+    }
+
+    for (i = 0; i < length; i++) {
+        absolute = FFABS(vector[i]);
+        if (absolute > maximum)
+            maximum = absolute;
+    }
+
+    // Guard the case for abs(-32768).
+    return FFMIN(maximum, INT16_MAX);
+}
+
+static int16_t get_size_in_bits(uint32_t n)
+{
+    int16_t bits;
+
+    if (0xFFFF0000 & n) {
+        bits = 16;
+    } else {
+        bits = 0;
+    }
+
+    if (0x0000FF00 & (n >> bits)) bits += 8;
+    if (0x000000F0 & (n >> bits)) bits += 4;
+    if (0x0000000C & (n >> bits)) bits += 2;
+    if (0x00000002 & (n >> bits)) bits += 1;
+    if (0x00000001 & (n >> bits)) bits += 1;
+
+    return bits;
+}
+
+static int32_t scale_dot_product(const int16_t *v1, const int16_t *v2, int length, int scaling)
+{
+    int64_t sum = 0;
+
+    for (int i = 0; i < length; i++)
+        sum += (v1[i] * v2[i]) >> scaling;
+
+    return av_clipl_int32(sum);
+}
+
+static void correlation(int32_t *corr, int32_t *ener, int16_t *buffer,
+                        int16_t lag, int16_t blen, int16_t srange, int16_t scale)
+{
+    int16_t *w16ptr;
+
+    w16ptr = &buffer[blen - srange - lag];
+
+    *corr = scale_dot_product(&buffer[blen - srange], w16ptr, srange, scale);
+    *ener = scale_dot_product(w16ptr, w16ptr, srange, scale);
+
+    if (*ener == 0) {
+        *corr = 0;
+        *ener = 1;
+    }
+}
+
+#define SPL_SHIFT_W32(x, c) (((c) >= 0) ? ((x) << (c)) : ((x) >> (-(c))))
+
+static int16_t norm_w32(int32_t a)
+{
+    if (a == 0) {
+        return 0;
+    } else if (a < 0) {
+        a = ~a;
+    }
+
+    return ff_clz(a);
+}
+
+static int32_t div_w32_w16(int32_t num, int16_t den)
+{
+    if (den != 0)
+        return num / den;
+    else
+        return 0x7FFFFFFF;
+}
+
+static void do_plc(int16_t *plc_residual,      /* (o) concealed residual */
+                   int16_t *plc_lpc,           /* (o) concealed LP parameters */
+                   int16_t PLI,                /* (i) packet loss indicator
+                                                      0 - no PL, 1 = PL */
+                   int16_t *decresidual,       /* (i) decoded residual */
+                   int16_t *lpc,               /* (i) decoded LPC (only used for no PL) */
+                   int16_t inlag,              /* (i) pitch lag */
+                   ILBCContext *s)             /* (i/o) decoder instance */
+{
+    int16_t i, pick;
+    int32_t cross, ener, cross_comp, ener_comp = 0;
+    int32_t measure, max_measure, energy;
+    int16_t max, cross_square_max, cross_square;
+    int16_t j, lag, tmp1, tmp2, randlag;
+    int16_t shift1, shift2, shift3, shift_max;
+    int16_t scale3;
+    int16_t corrLen;
+    int32_t tmpW32, tmp2W32;
+    int16_t use_gain;
+    int16_t tot_gain;
+    int16_t max_perSquare;
+    int16_t scale1, scale2;
+    int16_t totscale;
+    int32_t nom;
+    int16_t denom;
+    int16_t pitchfact;
+    int16_t use_lag;
+    int ind;
+    int16_t randvec[BLOCKL_MAX];
+
+    /* Packet Loss */
+    if (PLI == 1) {
+
+        s->consPLICount += 1;
+
+        /* if previous frame not lost,
+           determine pitch pred. gain */
+
+        if (s->prevPLI != 1) {
+
+            /* Maximum 60 samples are correlated, preserve as high accuracy
+               as possible without getting overflow */
+            max = max_abs_value_w16(s->prevResidual, s->block_samples);
+            scale3 = (get_size_in_bits(max) << 1) - 25;
+            if (scale3 < 0) {
+                scale3 = 0;
+            }
+
+            /* Store scale for use when interpolating between the
+             * concealment and the received packet */
+            s->prevScale = scale3;
+
+            /* Search around the previous lag +/-3 to find the
+               best pitch period */
+            lag = inlag - 3;
+
+            /* Guard against getting outside the frame */
+            corrLen = FFMIN(60, s->block_samples - (inlag + 3));
+
+            correlation(&cross, &ener, s->prevResidual, lag, s->block_samples, corrLen, scale3);
+
+            /* Normalize and store cross^2 and the number of shifts */
+            shift_max = get_size_in_bits(FFABS(cross)) - 15;
+            cross_square_max = (int16_t) SPL_MUL_16_16_RSFT(SPL_SHIFT_W32(cross, -shift_max), SPL_SHIFT_W32(cross, -shift_max), 15);
+
+            for (j = inlag - 2; j <= inlag + 3; j++) {
+                correlation(&cross_comp, &ener_comp, s->prevResidual, j, s->block_samples, corrLen, scale3);
+
+                /* Use the criteria (corr*corr)/energy to compare if
+                   this lag is better or not. To avoid the division,
+                   do a cross multiplication */
+                shift1 = get_size_in_bits(FFABS(cross_comp)) - 15;
+                cross_square = (int16_t) SPL_MUL_16_16_RSFT(SPL_SHIFT_W32(cross_comp, -shift1), SPL_SHIFT_W32(cross_comp, -shift1), 15);
+
+                shift2 = get_size_in_bits(ener) - 15;
+                measure = SPL_MUL_16_16(SPL_SHIFT_W32(ener, -shift2), cross_square);
+
+                shift3 = get_size_in_bits(ener_comp) - 15;
+                max_measure = SPL_MUL_16_16(SPL_SHIFT_W32(ener_comp, -shift3), cross_square_max);
+
+                /* Calculate shift value, so that the two measures can
+                   be put in the same Q domain */
+                if (((shift_max << 1) + shift3) > ((shift1 << 1) + shift2)) {
+                    tmp1 = FFMIN(31, (shift_max << 1) + shift3 - (shift1 << 1) - shift2);
+                    tmp2 = 0;
+                } else {
+                    tmp1 = 0;
+                    tmp2 = FFMIN(31, (shift1 << 1) + shift2 - (shift_max << 1) - shift3);
+                }
+
+                if ((measure >> tmp1) > (max_measure >> tmp2)) {
+                    /* New lag is better => record lag, measure and domain */
+                    lag = j;
+                    cross_square_max = cross_square;
+                    cross = cross_comp;
+                    shift_max = shift1;
+                    ener = ener_comp;
+                }
+            }
+
+            /* Calculate the periodicity for the lag with the maximum correlation.
+
+               Definition of the periodicity:
+               abs(corr(vec1, vec2))/(sqrt(energy(vec1))*sqrt(energy(vec2)))
+
+               Work in the Square domain to simplify the calculations
+               max_perSquare is less than 1 (in Q15)
+             */
+            tmp2W32 = scale_dot_product(&s->prevResidual[s->block_samples - corrLen], &s->prevResidual[s->block_samples - corrLen], corrLen, scale3);
+
+            if ((tmp2W32 > 0) && (ener_comp > 0)) {
+                /* norm energies to int16_t, compute the product of the energies and
+                   use the upper int16_t as the denominator */
+
+                scale1 = norm_w32(tmp2W32) - 16;
+                tmp1 = SPL_SHIFT_W32(tmp2W32, scale1);
+
+                scale2 = norm_w32(ener) - 16;
+                tmp2 =  SPL_SHIFT_W32(ener, scale2);
+                denom = SPL_MUL_16_16_RSFT(tmp1, tmp2, 16);    /* denom in Q(scale1+scale2-16) */
+
+                /* Square the cross correlation and norm it such that max_perSquare
+                   will be in Q15 after the division */
+
+                totscale = scale1 + scale2 - 1;
+                tmp1 = SPL_SHIFT_W32(cross, (totscale >> 1));
+                tmp2 = SPL_SHIFT_W32(cross, totscale - (totscale >> 1));
+
+                nom = SPL_MUL_16_16(tmp1, tmp2);
+                max_perSquare = div_w32_w16(nom, denom);
+            } else {
+                max_perSquare = 0;
+            }
+        } else {
+            /* previous frame lost, use recorded lag and gain */
+            lag = s->prevLag;
+            max_perSquare = s->per_square;
+        }
+
+        /* Attenuate signal and scale down pitch pred gain if
+           several frames lost consecutively */
+
+        use_gain = 32767;       /* 1.0 in Q15 */
+
+        if (s->consPLICount * s->block_samples > 320) {
+            use_gain = 29491;   /* 0.9 in Q15 */
+        } else if (s->consPLICount * s->block_samples > 640) {
+            use_gain = 22938;   /* 0.7 in Q15 */
+        } else if (s->consPLICount * s->block_samples > 960) {
+            use_gain = 16384;   /* 0.5 in Q15 */
+        } else if (s->consPLICount * s->block_samples > 1280) {
+            use_gain = 0;       /* 0.0 in Q15 */
+        }
+
+        /* Compute mixing factor of picth repeatition and noise:
+           for max_per>0.7 set periodicity to 1.0
+           0.4<max_per<0.7 set periodicity to (maxper-0.4)/0.7-0.4)
+           max_per<0.4 set periodicity to 0.0
+         */
+
+        if (max_perSquare > 7868) {     /* periodicity > 0.7  (0.7^4=0.2401 in Q15) */
+            pitchfact = 32767;
+        } else if (max_perSquare > 839) {       /* 0.4 < periodicity < 0.7 (0.4^4=0.0256 in Q15) */
+            /* find best index and interpolate from that */
+            ind = 5;
+            while ((max_perSquare < kPlcPerSqr[ind]) && (ind > 0)) {
+                ind--;
+            }
+            /* pitch fact is approximated by first order */
+            tmpW32 = kPlcPitchFact[ind] + SPL_MUL_16_16_RSFT(kPlcPfSlope[ind], (max_perSquare - kPlcPerSqr[ind]), 11);
+
+            pitchfact = FFMIN(tmpW32, 32767); /* guard against overflow */
+
+        } else {                /* periodicity < 0.4 */
+            pitchfact = 0;
+        }
+
+        /* avoid repetition of same pitch cycle (buzzyness) */
+        use_lag = lag;
+        if (lag < 80) {
+            use_lag = 2 * lag;
+        }
+
+        /* compute concealed residual */
+        energy = 0;
+
+        for (i = 0; i < s->block_samples; i++) {
+            /* noise component -  52 < randlagFIX < 117 */
+            s->seed = SPL_MUL_16_16(s->seed, 31821) + 13849;
+            randlag = 53 + (s->seed & 63);
+
+            pick = i - randlag;
+
+            if (pick < 0) {
+                randvec[i] = s->prevResidual[s->block_samples + pick];
+            } else {
+                randvec[i] = s->prevResidual[pick];
+            }
+
+            /* pitch repeatition component */
+            pick = i - use_lag;
+
+            if (pick < 0) {
+                plc_residual[i] = s->prevResidual[s->block_samples + pick];
+            } else {
+                plc_residual[i] = plc_residual[pick];
+            }
+
+            /* Attinuate total gain for each 10 ms */
+            if (i < 80) {
+                tot_gain = use_gain;
+            } else if (i < 160) {
+                tot_gain = SPL_MUL_16_16_RSFT(31130, use_gain, 15);    /* 0.95*use_gain */
+            } else {
+                tot_gain = SPL_MUL_16_16_RSFT(29491, use_gain, 15);    /* 0.9*use_gain */
+            }
+
+            /* mix noise and pitch repeatition */
+            plc_residual[i] = SPL_MUL_16_16_RSFT(tot_gain, (pitchfact * plc_residual[i] + (32767 - pitchfact) * randvec[i] + 16384) >> 15, 15);
+
+            /* Shifting down the result one step extra to ensure that no overflow
+               will occur */
+            energy += SPL_MUL_16_16_RSFT(plc_residual[i], plc_residual[i], (s->prevScale + 1));
+
+        }
+
+        /* less than 30 dB, use only noise */
+        if (energy < SPL_SHIFT_W32(s->block_samples * 900, -s->prevScale - 1)) {
+            energy = 0;
+            for (i = 0; i < s->block_samples; i++) {
+                plc_residual[i] = randvec[i];
+            }
+        }
+
+        /* use the old LPC */
+        memcpy(plc_lpc, (*s).prev_lpc, (LPC_FILTERORDER + 1) * 2);
+
+        /* Update state in case there are multiple frame losses */
+        s->prevLag = lag;
+        s->per_square = max_perSquare;
+    } else { /* no packet loss, copy input */
+        memcpy(plc_residual, decresidual, s->block_samples * 2);
+        memcpy(plc_lpc, lpc, (LPC_FILTERORDER + 1) * 2);
+        s->consPLICount = 0;
+    }
+
+    /* update state */
+    s->prevPLI = PLI;
+    memcpy(s->prev_lpc, plc_lpc, (LPC_FILTERORDER + 1) * 2);
+    memcpy(s->prevResidual, plc_residual, s->block_samples * 2);
+
+    return;
+}
+
+static int xcorr_coeff(int16_t *target, int16_t *regressor,
+                       int16_t subl, int16_t searchLen,
+                       int16_t offset, int16_t step)
+{
+    int16_t maxlag;
+    int16_t pos;
+    int16_t max;
+    int16_t cross_corr_scale, energy_scale;
+    int16_t cross_corr_sg_mod, cross_corr_sg_mod_max;
+    int32_t cross_corr, energy;
+    int16_t cross_corr_mod, energy_mod, enery_mod_max;
+    int16_t *tp, *rp;
+    int16_t *rp_beg, *rp_end;
+    int16_t totscale, totscale_max;
+    int16_t scalediff;
+    int32_t new_crit, max_crit;
+    int shifts;
+    int k;
+
+    /* Initializations, to make sure that the first one is selected */
+    cross_corr_sg_mod_max = 0;
+    enery_mod_max = INT16_MAX;
+    totscale_max = -500;
+    maxlag = 0;
+    pos = 0;
+
+    /* Find scale value and start position */
+    if (step == 1) {
+        max = max_abs_value_w16(regressor, (int16_t) (subl + searchLen - 1));
+        rp_beg = regressor;
+        rp_end = &regressor[subl];
+    } else {                    /* step== -1 */
+        max = max_abs_value_w16(&regressor[-searchLen], (int16_t) (subl + searchLen - 1));
+        rp_beg = &regressor[-1];
+        rp_end = &regressor[subl - 1];
+    }
+
+    /* Introduce a scale factor on the energy in int32_t in
+       order to make sure that the calculation does not
+       overflow */
+
+    if (max > 5000) {
+        shifts = 2;
+    } else {
+        shifts = 0;
+    }
+
+    /* Calculate the first energy, then do a +/- to get the other energies */
+    energy = scale_dot_product(regressor, regressor, subl, shifts);
+
+    for (k = 0; k < searchLen; k++) {
+        tp = target;
+        rp = &regressor[pos];
+
+        cross_corr = scale_dot_product(tp, rp, subl, shifts);
+
+        if ((energy > 0) && (cross_corr > 0)) {
+            /* Put cross correlation and energy on 16 bit word */
+            cross_corr_scale = norm_w32(cross_corr) - 16;
+            cross_corr_mod = (int16_t) SPL_SHIFT_W32(cross_corr, cross_corr_scale);
+            energy_scale = norm_w32(energy) - 16;
+            energy_mod = (int16_t) SPL_SHIFT_W32(energy, energy_scale);
+
+            /* Square cross correlation and store upper int16_t */
+            cross_corr_sg_mod = (int16_t) SPL_MUL_16_16_RSFT(cross_corr_mod, cross_corr_mod, 16);
+
+            /* Calculate the total number of (dynamic) right shifts that have
+               been performed on (cross_corr*cross_corr)/energy
+             */
+            totscale = energy_scale - (cross_corr_scale * 2);
+
+            /* Calculate the shift difference in order to be able to compare the two
+               (cross_corr*cross_corr)/energy in the same domain
+             */
+            scalediff = totscale - totscale_max;
+            scalediff = FFMIN(scalediff, 31);
+            scalediff = FFMAX(scalediff, -31);
+
+            /* Compute the cross multiplication between the old best criteria
+               and the new one to be able to compare them without using a
+               division */
+
+            if (scalediff < 0) {
+                new_crit = ((int32_t) cross_corr_sg_mod * enery_mod_max) >> (-scalediff);
+                max_crit = ((int32_t) cross_corr_sg_mod_max * energy_mod);
+            } else {
+                new_crit = ((int32_t) cross_corr_sg_mod * enery_mod_max);
+                max_crit = ((int32_t) cross_corr_sg_mod_max * energy_mod) >> scalediff;
+            }
+
+            /* Store the new lag value if the new criteria is larger
+               than previous largest criteria */
+
+            if (new_crit > max_crit) {
+                cross_corr_sg_mod_max = cross_corr_sg_mod;
+                enery_mod_max = energy_mod;
+                totscale_max = totscale;
+                maxlag = k;
+            }
+        }
+        pos += step;
+
+        /* Do a +/- to get the next energy */
+        energy += (unsigned)step * ((*rp_end * *rp_end - *rp_beg * *rp_beg) >> shifts);
+
+        rp_beg += step;
+        rp_end += step;
+    }
+
+    return maxlag + offset;
+}
+
+static void hp_output(int16_t *signal, const int16_t *ba, int16_t *y,
+                      int16_t *x, int16_t len)
+{
+    int32_t tmp;
+
+    for (int i = 0; i < len; i++) {
+        tmp = SPL_MUL_16_16(y[1], ba[3]);     /* (-a[1])*y[i-1] (low part) */
+        tmp += SPL_MUL_16_16(y[3], ba[4]);    /* (-a[2])*y[i-2] (low part) */
+        tmp = (tmp >> 15);
+        tmp += SPL_MUL_16_16(y[0], ba[3]);    /* (-a[1])*y[i-1] (high part) */
+        tmp += SPL_MUL_16_16(y[2], ba[4]);    /* (-a[2])*y[i-2] (high part) */
+        tmp = (tmp * 2);
+
+        tmp += SPL_MUL_16_16(signal[i], ba[0]);       /* b[0]*x[0] */
+        tmp += SPL_MUL_16_16(x[0], ba[1]);    /* b[1]*x[i-1] */
+        tmp += SPL_MUL_16_16(x[1], ba[2]);    /* b[2]*x[i-2] */
+
+        /* Update state (input part) */
+        x[1] = x[0];
+        x[0] = signal[i];
+
+        /* Convert back to Q0 and multiply with 2 */
+        signal[i] = av_clip_intp2(tmp + 1024, 26) >> 11;
+
+        /* Update state (filtered part) */
+        y[2] = y[0];
+        y[3] = y[1];
+
+        /* upshift tmp by 3 with saturation */
+        if (tmp > 268435455) {
+            tmp = INT32_MAX;
+        } else if (tmp < -268435456) {
+            tmp = INT32_MIN;
+        } else {
+            tmp = tmp * 8;
+        }
+
+        y[0] = tmp >> 16;
+        y[1] = (tmp - (y[0] * (1 << 16))) >> 1;
+    }
+}
+
+static int ilbc_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    AVFrame *frame     = data;
+    ILBCContext *s     = avctx->priv_data;
+    int mode = s->mode, ret;
+    int16_t *plc_data = &s->plc_residual[LPC_FILTERORDER];
+
+    if ((ret = init_get_bits8(&s->gb, buf, avpkt->size)) < 0)
+        return ret;
+    memset(&s->frame, 0, sizeof(ILBCFrame));
+
+    frame->nb_samples = s->block_samples;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    if (unpack_frame(s))
+        mode = 0;
+    if (s->frame.start < 1 || s->frame.start > 5)
+        mode = 0;
+
+    if (mode) {
+        index_conv(s->frame.cb_index);
+
+        lsf_dequantization(s->lsfdeq, s->frame.lsf, s->lpc_n);
+        lsf_check_stability(s->lsfdeq, LPC_FILTERORDER, s->lpc_n);
+        lsp_interpolate(s->syntdenum, s->weightdenum,
+                        s->lsfdeq, LPC_FILTERORDER, s);
+        decode_residual(s, &s->frame, s->decresidual, s->syntdenum);
+
+        do_plc(s->plc_residual, s->plc_lpc, 0,
+                               s->decresidual, s->syntdenum + (LPC_FILTERORDER + 1) * (s->nsub - 1),
+                               s->last_lag, s);
+
+        memcpy(s->decresidual, s->plc_residual, s->block_samples * 2);
+    }
+
+    if (s->enhancer) {
+        /* TODO */
+    } else {
+        int16_t lag, i;
+
+        /* Find last lag (since the enhancer is not called to give this info) */
+        if (s->mode == 20) {
+            lag = xcorr_coeff(&s->decresidual[s->block_samples-60], &s->decresidual[s->block_samples-80],
+                              60, 80, 20, -1);
+        } else {
+            lag = xcorr_coeff(&s->decresidual[s->block_samples-ENH_BLOCKL],
+                              &s->decresidual[s->block_samples-ENH_BLOCKL-20],
+                              ENH_BLOCKL, 100, 20, -1);
+        }
+
+        /* Store lag (it is needed if next packet is lost) */
+        s->last_lag = lag;
+
+        /* copy data and run synthesis filter */
+        memcpy(plc_data, s->decresidual, s->block_samples * 2);
+
+        /* Set up the filter state */
+        memcpy(&plc_data[-LPC_FILTERORDER], s->syntMem, LPC_FILTERORDER * 2);
+
+        for (i = 0; i < s->nsub; i++) {
+            filter_arfq12(plc_data+i*SUBL, plc_data+i*SUBL,
+                                      s->syntdenum + i*(LPC_FILTERORDER + 1),
+                                      LPC_FILTERORDER + 1, SUBL);
+        }
+
+        /* Save the filter state */
+        memcpy(s->syntMem, &plc_data[s->block_samples-LPC_FILTERORDER], LPC_FILTERORDER * 2);
+    }
+
+    memcpy(frame->data[0], plc_data, s->block_samples * 2);
+
+    hp_output((int16_t *)frame->data[0], hp_out_coeffs,
+              s->hpimemy, s->hpimemx, s->block_samples);
+
+    memcpy(s->old_syntdenum, s->syntdenum, s->nsub*(LPC_FILTERORDER + 1) * 2);
+
+    s->prev_enh_pl = 0;
+    if (mode == 0)
+        s->prev_enh_pl = 1;
+
+    *got_frame_ptr = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int ilbc_decode_init(AVCodecContext *avctx)
+{
+    ILBCContext *s  = avctx->priv_data;
+
+    if (avctx->block_align == 38)
+        s->mode = 20;
+    else if (avctx->block_align == 50)
+        s->mode = 30;
+    else if (avctx->bit_rate > 0)
+        s->mode = avctx->bit_rate <= 14000 ? 30 : 20;
+    else
+        return AVERROR_INVALIDDATA;
+
+    avctx->channels       = 1;
+    avctx->channel_layout = AV_CH_LAYOUT_MONO;
+    avctx->sample_rate    = 8000;
+    avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
+
+    if (s->mode == 30) {
+        s->block_samples = 240;
+        s->nsub = NSUB_30MS;
+        s->nasub = NASUB_30MS;
+        s->lpc_n = LPC_N_30MS;
+        s->state_short_len = STATE_SHORT_LEN_30MS;
+    } else {
+        s->block_samples = 160;
+        s->nsub = NSUB_20MS;
+        s->nasub = NASUB_20MS;
+        s->lpc_n = LPC_N_20MS;
+        s->state_short_len = STATE_SHORT_LEN_20MS;
+    }
+
+    return 0;
+}
+
+AVCodec ff_ilbc_decoder = {
+    .name           = "ilbc",
+    .long_name      = NULL_IF_CONFIG_SMALL("iLBC (Internet Low Bitrate Codec)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ILBC,
+    .init           = ilbc_decode_init,
+    .decode         = ilbc_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(ILBCContext),
+};
diff --git a/libavcodec/imc.c b/libavcodec/imc.c
index 100e6f8..82a9081 100644
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,16 +35,15 @@
 #include <stdio.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/internal.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bswapdsp.h"
+#include "get_bits.h"
 #include "fft.h"
 #include "internal.h"
 #include "sinewin.h"
-#include "vlc.h"
 
 #include "imcdata.h"
 
@@ -93,10 +92,10 @@ typedef struct IMCContext {
     //@}
 
     float sqrt_tab[30];
-    BitstreamContext bc;
+    GetBitContext gb;
 
     BswapDSPContext bdsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext fft;
     DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS / 2];
     float *out_samples;
@@ -105,6 +104,8 @@ typedef struct IMCContext {
 
     int8_t cyclTab[32], cyclTab2[32];
     float  weights1[31], weights2[31];
+
+    AVCodecContext *avctx;
 } IMCContext;
 
 static VLC huffman_vlc[4][4];
@@ -137,8 +138,8 @@ static av_cold void iac_generate_tabs(IMCContext *q, int sampling_rate)
 
         if (i > 0) {
             tb = bark - prev_bark;
-            q->weights1[i - 1] = pow(10.0, -1.0 * tb);
-            q->weights2[i - 1] = pow(10.0, -2.7 * tb);
+            q->weights1[i - 1] = ff_exp10(-1.0 * tb);
+            q->weights2[i - 1] = ff_exp10(-2.7 * tb);
         }
         prev_bark = bark;
 
@@ -180,6 +181,14 @@ static av_cold int imc_decode_init(AVCodecContext *avctx)
     IMCContext *q = avctx->priv_data;
     double r1, r2;
 
+    if (avctx->codec_id == AV_CODEC_ID_IAC && avctx->sample_rate > 96000) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Strange sample rate of %i, file likely corrupt or "
+               "needing a new table derivation method.\n",
+               avctx->sample_rate);
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (avctx->codec_id == AV_CODEC_ID_IMC)
         avctx->channels = 1;
 
@@ -248,7 +257,13 @@ static av_cold int imc_decode_init(AVCodecContext *avctx)
         return ret;
     }
     ff_bswapdsp_init(&q->bdsp);
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!q->fdsp) {
+        ff_fft_end(&q->fft);
+
+        return AVERROR(ENOMEM);
+    }
+
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
     avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO
                                                  : AV_CH_LAYOUT_STEREO;
@@ -330,12 +345,12 @@ static void imc_read_level_coeffs(IMCContext *q, int stream_format_code,
     if (stream_format_code & 4)
         start = 1;
     if (start)
-        levlCoeffs[0] = bitstream_read(&q->bc, 7);
+        levlCoeffs[0] = get_bits(&q->gb, 7);
     for (i = start; i < BANDS; i++) {
-        levlCoeffs[i] = bitstream_read_vlc(&q->bc, hufftab[cb_sel[i]]->table,
-                                           hufftab[cb_sel[i]]->bits, 2);
+        levlCoeffs[i] = get_vlc2(&q->gb, hufftab[cb_sel[i]]->table,
+                                 hufftab[cb_sel[i]]->bits, 2);
         if (levlCoeffs[i] == 17)
-            levlCoeffs[i] += bitstream_read(&q->bc, 4);
+            levlCoeffs[i] += get_bits(&q->gb, 4);
     }
 }
 
@@ -344,10 +359,10 @@ static void imc_read_level_coeffs_raw(IMCContext *q, int stream_format_code,
 {
     int i;
 
-    q->coef0_pos  = bitstream_read(&q->bc, 5);
-    levlCoeffs[0] = bitstream_read(&q->bc, 7);
+    q->coef0_pos  = get_bits(&q->gb, 5);
+    levlCoeffs[0] = get_bits(&q->gb, 7);
     for (i = 1; i < BANDS; i++)
-        levlCoeffs[i] = bitstream_read(&q->bc, 4);
+        levlCoeffs[i] = get_bits(&q->gb, 4);
 }
 
 static void imc_decode_level_coefficients(IMCContext *q, int *levlCoeffBuf,
@@ -357,7 +372,7 @@ static void imc_decode_level_coefficients(IMCContext *q, int *levlCoeffBuf,
     float tmp, tmp2;
     // maybe some frequency division thingy
 
-    flcoeffs1[0] = 20000.0 / pow (2, levlCoeffBuf[0] * 0.18945); // 0.18945 = log2(10) * 0.05703125
+    flcoeffs1[0] = 20000.0 / exp2 (levlCoeffBuf[0] * 0.18945); // 0.18945 = log2(10) * 0.05703125
     flcoeffs2[0] = log2f(flcoeffs1[0]);
     tmp  = flcoeffs1[0];
     tmp2 = flcoeffs2[0];
@@ -451,8 +466,13 @@ static int bit_allocation(IMCContext *q, IMCChannel *chctx,
     for (i = 0; i < BANDS; i++)
         highest = FFMAX(highest, chctx->flcoeffs1[i]);
 
-    for (i = 0; i < BANDS - 1; i++)
+    for (i = 0; i < BANDS - 1; i++) {
+        if (chctx->flcoeffs5[i] <= 0) {
+            av_log(q->avctx, AV_LOG_ERROR, "flcoeffs5 %f invalid\n", chctx->flcoeffs5[i]);
+            return AVERROR_INVALIDDATA;
+        }
         chctx->flcoeffs4[i] = chctx->flcoeffs3[i] - log2f(chctx->flcoeffs5[i]);
+    }
     chctx->flcoeffs4[BANDS - 1] = limit;
 
     highest = highest * 0.25;
@@ -614,19 +634,19 @@ static void imc_get_skip_coeff(IMCContext *q, IMCChannel *chctx)
             chctx->skipFlagBits[i] = band_tab[i + 1] - band_tab[i];
 
             for (j = band_tab[i]; j < band_tab[i + 1]; j++) {
-                chctx->skipFlags[j] = bitstream_read_bit(&q->bc);
+                chctx->skipFlags[j] = get_bits1(&q->gb);
                 if (chctx->skipFlags[j])
                     chctx->skipFlagCount[i]++;
             }
         } else {
             for (j = band_tab[i]; j < band_tab[i + 1] - 1; j += 2) {
-                if (!bitstream_read_bit(&q->bc)) { // 0
+                if (!get_bits1(&q->gb)) { // 0
                     chctx->skipFlagBits[i]++;
                     chctx->skipFlags[j]      = 1;
                     chctx->skipFlags[j + 1]  = 1;
                     chctx->skipFlagCount[i] += 2;
                 } else {
-                    if (bitstream_read_bit(&q->bc)) { // 11
+                    if (get_bits1(&q->gb)) { // 11
                         chctx->skipFlagBits[i] += 2;
                         chctx->skipFlags[j]     = 0;
                         chctx->skipFlags[j + 1] = 1;
@@ -634,7 +654,7 @@ static void imc_get_skip_coeff(IMCContext *q, IMCChannel *chctx)
                     } else {
                         chctx->skipFlagBits[i] += 3;
                         chctx->skipFlags[j + 1] = 0;
-                        if (!bitstream_read_bit(&q->bc)) { // 100
+                        if (!get_bits1(&q->gb)) { // 100
                             chctx->skipFlags[j] = 1;
                             chctx->skipFlagCount[i]++;
                         } else { // 101
@@ -646,7 +666,7 @@ static void imc_get_skip_coeff(IMCContext *q, IMCChannel *chctx)
 
             if (j < band_tab[i + 1]) {
                 chctx->skipFlagBits[i]++;
-                if ((chctx->skipFlags[j] = bitstream_read_bit(&q->bc)))
+                if ((chctx->skipFlags[j] = get_bits1(&q->gb)))
                     chctx->skipFlagCount[i]++;
             }
         }
@@ -771,7 +791,8 @@ static int inverse_quant_coeff(IMCContext *q, IMCChannel *chctx,
 }
 
 
-static int imc_get_coeffs(IMCContext *q, IMCChannel *chctx)
+static void imc_get_coeffs(AVCodecContext *avctx,
+                           IMCContext *q, IMCChannel *chctx)
 {
     int i, j, cw_len, cw;
 
@@ -783,19 +804,19 @@ static int imc_get_coeffs(IMCContext *q, IMCChannel *chctx)
                 cw_len = chctx->CWlengthT[j];
                 cw = 0;
 
-                if (bitstream_tell(&q->bc) + cw_len > 512) {
-                    ff_dlog(NULL, "Band %i coeff %i cw_len %i\n", i, j, cw_len);
-                    return AVERROR_INVALIDDATA;
+                if (cw_len && (!chctx->bandFlagsBuf[i] || !chctx->skipFlags[j])) {
+                    if (get_bits_count(&q->gb) + cw_len > 512) {
+                        av_log(avctx, AV_LOG_WARNING,
+                            "Potential problem on band %i, coefficient %i"
+                            ": cw_len=%i\n", i, j, cw_len);
+                    } else
+                        cw = get_bits(&q->gb, cw_len);
                 }
 
-                if (cw_len && (!chctx->bandFlagsBuf[i] || !chctx->skipFlags[j]))
-                    cw = bitstream_read(&q->bc, cw_len);
-
                 chctx->codewords[j] = cw;
             }
         }
     }
-    return 0;
 }
 
 static void imc_refine_bit_allocation(IMCContext *q, IMCChannel *chctx)
@@ -809,7 +830,7 @@ static void imc_refine_bit_allocation(IMCContext *q, IMCChannel *chctx)
         for (j = band_tab[i]; j < band_tab[i + 1]; j++)
             chctx->sumLenArr[i] += chctx->CWlengthT[j];
         if (chctx->bandFlagsBuf[i])
-            if ((((band_tab[i + 1] - band_tab[i]) * 1.5) > chctx->sumLenArr[i]) && (chctx->sumLenArr[i] > 0))
+            if (((int)((band_tab[i + 1] - band_tab[i]) * 1.5) > chctx->sumLenArr[i]) && (chctx->sumLenArr[i] > 0))
                 chctx->skipFlagRaw[i] = 1;
     }
 
@@ -853,13 +874,13 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
 
 
     /* Check the frame header */
-    imc_hdr = bitstream_read(&q->bc, 9);
+    imc_hdr = get_bits(&q->gb, 9);
     if (imc_hdr & 0x18) {
         av_log(avctx, AV_LOG_ERROR, "frame header check failed!\n");
         av_log(avctx, AV_LOG_ERROR, "got %X.\n", imc_hdr);
         return AVERROR_INVALIDDATA;
     }
-    stream_format_code = bitstream_read(&q->bc, 3);
+    stream_format_code = get_bits(&q->gb, 3);
 
     if (stream_format_code & 0x04)
         chctx->decoder_reset = 1;
@@ -872,7 +893,7 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
         chctx->decoder_reset = 0;
     }
 
-    flag = bitstream_read_bit(&q->bc);
+    flag = get_bits1(&q->gb);
     if (stream_format_code & 0x1)
         imc_read_level_coeffs_raw(q, stream_format_code, chctx->levlCoeffBuf);
     else
@@ -888,6 +909,13 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
         imc_decode_level_coefficients2(q, chctx->levlCoeffBuf, chctx->old_floor,
                                        chctx->flcoeffs1, chctx->flcoeffs2);
 
+    for(i=0; i<BANDS; i++) {
+        if(chctx->flcoeffs1[i] > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "scalefactor out of range\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     memcpy(chctx->old_floor, chctx->flcoeffs1, 32 * sizeof(float));
 
     counter = 0;
@@ -910,7 +938,7 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
         memset(chctx->bandFlagsBuf, 0, BANDS * sizeof(int));
         for (i = 0; i < BANDS - 1; i++)
             if (chctx->bandWidthT[i])
-                chctx->bandFlagsBuf[i] = bitstream_read_bit(&q->bc);
+                chctx->bandFlagsBuf[i] = get_bits1(&q->gb);
 
         imc_calculate_coeffs(q, chctx->flcoeffs1, chctx->flcoeffs2,
                              chctx->bandWidthT, chctx->flcoeffs3,
@@ -945,7 +973,7 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
     }
 
     if ((ret = bit_allocation(q, chctx, stream_format_code,
-                              512 - bitscount - bitstream_tell(&q->bc),
+                              512 - bitscount - get_bits_count(&q->gb),
                               flag)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Bit allocations failed\n");
         chctx->decoder_reset = 1;
@@ -969,11 +997,7 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
 
     memset(chctx->codewords, 0, sizeof(chctx->codewords));
 
-    if (imc_get_coeffs(q, chctx) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Read coefficients failed\n");
-        chctx->decoder_reset = 1;
-        return AVERROR_INVALIDDATA;
-    }
+    imc_get_coeffs(avctx, q, chctx);
 
     if (inverse_quant_coeff(q, chctx, stream_format_code) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Inverse quantization of coefficients failed\n");
@@ -1000,6 +1024,8 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
 
     LOCAL_ALIGNED_16(uint16_t, buf16, [(IMC_BLOCK_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / 2]);
 
+    q->avctx = avctx;
+
     if (buf_size < IMC_BLOCK_SIZE * avctx->channels) {
         av_log(avctx, AV_LOG_ERROR, "frame too small!\n");
         return AVERROR_INVALIDDATA;
@@ -1007,17 +1033,15 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = COEFFS;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (i = 0; i < avctx->channels; i++) {
         q->out_samples = (float *)frame->extended_data[i];
 
         q->bdsp.bswap16_buf(buf16, (const uint16_t *) buf, IMC_BLOCK_SIZE / 2);
 
-        bitstream_init8(&q->bc, (const uint8_t *)buf16, IMC_BLOCK_SIZE);
+        init_get_bits(&q->gb, (const uint8_t*)buf16, IMC_BLOCK_SIZE * 8);
 
         buf += IMC_BLOCK_SIZE;
 
@@ -1026,7 +1050,7 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (avctx->channels == 2) {
-        q->fdsp.butterflies_float((float *)frame->extended_data[0],
+        q->fdsp->butterflies_float((float *)frame->extended_data[0],
                                   (float *)frame->extended_data[1], COEFFS);
     }
 
@@ -1035,17 +1059,25 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
     return IMC_BLOCK_SIZE * avctx->channels;
 }
 
-
 static av_cold int imc_decode_close(AVCodecContext * avctx)
 {
     IMCContext *q = avctx->priv_data;
 
     ff_fft_end(&q->fft);
+    av_freep(&q->fdsp);
 
     return 0;
 }
 
+static av_cold void flush(AVCodecContext *avctx)
+{
+    IMCContext *q = avctx->priv_data;
 
+    q->chctx[0].decoder_reset =
+    q->chctx[1].decoder_reset = 1;
+}
+
+#if CONFIG_IMC_DECODER
 AVCodec ff_imc_decoder = {
     .name           = "imc",
     .long_name      = NULL_IF_CONFIG_SMALL("IMC (Intel Music Coder)"),
@@ -1055,11 +1087,13 @@ AVCodec ff_imc_decoder = {
     .init           = imc_decode_init,
     .close          = imc_decode_close,
     .decode         = imc_decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_IAC_DECODER
 AVCodec ff_iac_decoder = {
     .name           = "iac",
     .long_name      = NULL_IF_CONFIG_SMALL("IAC (Indeo Audio Coder)"),
@@ -1069,7 +1103,9 @@ AVCodec ff_iac_decoder = {
     .init           = imc_decode_init,
     .close          = imc_decode_close,
     .decode         = imc_decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/imcdata.h b/libavcodec/imcdata.h
index 8e99391..64e7c71 100644
--- a/libavcodec/imcdata.h
+++ b/libavcodec/imcdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/imdct15.c b/libavcodec/imdct15.c
deleted file mode 100644
index e02e9ce..0000000
--- a/libavcodec/imdct15.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2013-2014 Mozilla Corporation
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Celt non-power of 2 iMDCT
- */
-
-#include <float.h>
-#include <math.h>
-#include <stddef.h>
-
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/common.h"
-
-#include "avfft.h"
-#include "imdct15.h"
-#include "opus.h"
-
-// minimal iMDCT size to make SIMD opts easier
-#define CELT_MIN_IMDCT_SIZE 120
-
-// complex c = a * b
-#define CMUL3(cre, cim, are, aim, bre, bim)          \
-do {                                                 \
-    cre = are * bre - aim * bim;                     \
-    cim = are * bim + aim * bre;                     \
-} while (0)
-
-#define CMUL(c, a, b) CMUL3((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
-
-// complex c = a * b
-//         d = a * conjugate(b)
-#define CMUL2(c, d, a, b)                            \
-do {                                                 \
-    float are = (a).re;                              \
-    float aim = (a).im;                              \
-    float bre = (b).re;                              \
-    float bim = (b).im;                              \
-    float rr  = are * bre;                           \
-    float ri  = are * bim;                           \
-    float ir  = aim * bre;                           \
-    float ii  = aim * bim;                           \
-    (c).re =  rr - ii;                               \
-    (c).im =  ri + ir;                               \
-    (d).re =  rr + ii;                               \
-    (d).im = -ri + ir;                               \
-} while (0)
-
-av_cold void ff_imdct15_uninit(IMDCT15Context **ps)
-{
-    IMDCT15Context *s = *ps;
-    int i;
-
-    if (!s)
-        return;
-
-    for (i = 0; i < FF_ARRAY_ELEMS(s->exptab); i++)
-        av_freep(&s->exptab[i]);
-
-    av_freep(&s->twiddle_exptab);
-
-    av_freep(&s->tmp);
-
-    av_freep(ps);
-}
-
-static void imdct15_half(IMDCT15Context *s, float *dst, const float *src,
-                         ptrdiff_t stride, float scale);
-
-av_cold int ff_imdct15_init(IMDCT15Context **ps, int N)
-{
-    IMDCT15Context *s;
-    int len2 = 15 * (1 << N);
-    int len  = 2 * len2;
-    int i, j;
-
-    if (len2 > CELT_MAX_FRAME_SIZE || len2 < CELT_MIN_IMDCT_SIZE)
-        return AVERROR(EINVAL);
-
-    s = av_mallocz(sizeof(*s));
-    if (!s)
-        return AVERROR(ENOMEM);
-
-    s->fft_n = N - 1;
-    s->len4 = len2 / 2;
-    s->len2 = len2;
-
-    s->tmp  = av_malloc(len * 2 * sizeof(*s->tmp));
-    if (!s->tmp)
-        goto fail;
-
-    s->twiddle_exptab  = av_malloc(s->len4 * sizeof(*s->twiddle_exptab));
-    if (!s->twiddle_exptab)
-        goto fail;
-
-    for (i = 0; i < s->len4; i++) {
-        s->twiddle_exptab[i].re = cos(2 * M_PI * (i + 0.125 + s->len4) / len);
-        s->twiddle_exptab[i].im = sin(2 * M_PI * (i + 0.125 + s->len4) / len);
-    }
-
-    for (i = 0; i < FF_ARRAY_ELEMS(s->exptab); i++) {
-        int N = 15 * (1 << i);
-        s->exptab[i] = av_malloc(sizeof(*s->exptab[i]) * FFMAX(N, 19));
-        if (!s->exptab[i])
-            goto fail;
-
-        for (j = 0; j < N; j++) {
-            s->exptab[i][j].re = cos(2 * M_PI * j / N);
-            s->exptab[i][j].im = sin(2 * M_PI * j / N);
-        }
-    }
-
-    // wrap around to simplify fft15
-    for (j = 15; j < 19; j++)
-        s->exptab[0][j] = s->exptab[0][j - 15];
-
-    s->imdct_half = imdct15_half;
-
-    if (ARCH_AARCH64)
-        ff_imdct15_init_aarch64(s);
-
-    *ps = s;
-
-    return 0;
-
-fail:
-    ff_imdct15_uninit(&s);
-    return AVERROR(ENOMEM);
-}
-
-static void fft5(FFTComplex *out, const FFTComplex *in, ptrdiff_t stride)
-{
-    // [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
-    static const FFTComplex fact[] = { { 0.30901699437494745,  0.95105651629515353 },
-                                       { -0.80901699437494734, 0.58778525229247325 } };
-
-    FFTComplex z[4][4];
-
-    CMUL2(z[0][0], z[0][3], in[1 * stride], fact[0]);
-    CMUL2(z[0][1], z[0][2], in[1 * stride], fact[1]);
-    CMUL2(z[1][0], z[1][3], in[2 * stride], fact[0]);
-    CMUL2(z[1][1], z[1][2], in[2 * stride], fact[1]);
-    CMUL2(z[2][0], z[2][3], in[3 * stride], fact[0]);
-    CMUL2(z[2][1], z[2][2], in[3 * stride], fact[1]);
-    CMUL2(z[3][0], z[3][3], in[4 * stride], fact[0]);
-    CMUL2(z[3][1], z[3][2], in[4 * stride], fact[1]);
-
-    out[0].re = in[0].re + in[stride].re + in[2 * stride].re + in[3 * stride].re + in[4 * stride].re;
-    out[0].im = in[0].im + in[stride].im + in[2 * stride].im + in[3 * stride].im + in[4 * stride].im;
-
-    out[1].re = in[0].re + z[0][0].re + z[1][1].re + z[2][2].re + z[3][3].re;
-    out[1].im = in[0].im + z[0][0].im + z[1][1].im + z[2][2].im + z[3][3].im;
-
-    out[2].re = in[0].re + z[0][1].re + z[1][3].re + z[2][0].re + z[3][2].re;
-    out[2].im = in[0].im + z[0][1].im + z[1][3].im + z[2][0].im + z[3][2].im;
-
-    out[3].re = in[0].re + z[0][2].re + z[1][0].re + z[2][3].re + z[3][1].re;
-    out[3].im = in[0].im + z[0][2].im + z[1][0].im + z[2][3].im + z[3][1].im;
-
-    out[4].re = in[0].re + z[0][3].re + z[1][2].re + z[2][1].re + z[3][0].re;
-    out[4].im = in[0].im + z[0][3].im + z[1][2].im + z[2][1].im + z[3][0].im;
-}
-
-static void fft15(IMDCT15Context *s, FFTComplex *out, const FFTComplex *in,
-                  ptrdiff_t stride)
-{
-    const FFTComplex *exptab = s->exptab[0];
-    FFTComplex tmp[5];
-    FFTComplex tmp1[5];
-    FFTComplex tmp2[5];
-    int k;
-
-    fft5(tmp,  in,              stride * 3);
-    fft5(tmp1, in +     stride, stride * 3);
-    fft5(tmp2, in + 2 * stride, stride * 3);
-
-    for (k = 0; k < 5; k++) {
-        FFTComplex t1, t2;
-
-        CMUL(t1, tmp1[k], exptab[k]);
-        CMUL(t2, tmp2[k], exptab[2 * k]);
-        out[k].re = tmp[k].re + t1.re + t2.re;
-        out[k].im = tmp[k].im + t1.im + t2.im;
-
-        CMUL(t1, tmp1[k], exptab[k + 5]);
-        CMUL(t2, tmp2[k], exptab[2 * (k + 5)]);
-        out[k + 5].re = tmp[k].re + t1.re + t2.re;
-        out[k + 5].im = tmp[k].im + t1.im + t2.im;
-
-        CMUL(t1, tmp1[k], exptab[k + 10]);
-        CMUL(t2, tmp2[k], exptab[2 * k + 5]);
-        out[k + 10].re = tmp[k].re + t1.re + t2.re;
-        out[k + 10].im = tmp[k].im + t1.im + t2.im;
-    }
-}
-
-/*
- * FFT of the length 15 * (2^N)
- */
-static void fft_calc(IMDCT15Context *s, FFTComplex *out, const FFTComplex *in,
-                     int N, ptrdiff_t stride)
-{
-    if (N) {
-        const FFTComplex *exptab = s->exptab[N];
-        const int len2 = 15 * (1 << (N - 1));
-        int k;
-
-        fft_calc(s, out,        in,          N - 1, stride * 2);
-        fft_calc(s, out + len2, in + stride, N - 1, stride * 2);
-
-        for (k = 0; k < len2; k++) {
-            FFTComplex t;
-
-            CMUL(t, out[len2 + k], exptab[k]);
-
-            out[len2 + k].re = out[k].re - t.re;
-            out[len2 + k].im = out[k].im - t.im;
-
-            out[k].re += t.re;
-            out[k].im += t.im;
-        }
-    } else
-        fft15(s, out, in, stride);
-}
-
-static void imdct15_half(IMDCT15Context *s, float *dst, const float *src,
-                         ptrdiff_t stride, float scale)
-{
-    FFTComplex *z = (FFTComplex *)dst;
-    const int len8 = s->len4 / 2;
-    const float *in1 = src;
-    const float *in2 = src + (s->len2 - 1) * stride;
-    int i;
-
-    for (i = 0; i < s->len4; i++) {
-        FFTComplex tmp = { *in2, *in1 };
-        CMUL(s->tmp[i], tmp, s->twiddle_exptab[i]);
-        in1 += 2 * stride;
-        in2 -= 2 * stride;
-    }
-
-    fft_calc(s, z, s->tmp, s->fft_n, 1);
-
-    for (i = 0; i < len8; i++) {
-        float r0, i0, r1, i1;
-
-        CMUL3(r0, i1, z[len8 - i - 1].im, z[len8 - i - 1].re,  s->twiddle_exptab[len8 - i - 1].im, s->twiddle_exptab[len8 - i - 1].re);
-        CMUL3(r1, i0, z[len8 + i].im,     z[len8 + i].re,      s->twiddle_exptab[len8 + i].im,     s->twiddle_exptab[len8 + i].re);
-        z[len8 - i - 1].re = scale * r0;
-        z[len8 - i - 1].im = scale * i0;
-        z[len8 + i].re     = scale * r1;
-        z[len8 + i].im     = scale * i1;
-    }
-}
diff --git a/libavcodec/imdct15.h b/libavcodec/imdct15.h
deleted file mode 100644
index ed3f003..0000000
--- a/libavcodec/imdct15.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_IMDCT15_H
-#define AVCODEC_IMDCT15_H
-
-#include <stddef.h>
-
-#include "avfft.h"
-
-typedef struct IMDCT15Context {
-    int fft_n;
-    int len2;
-    int len4;
-
-    FFTComplex *tmp;
-
-    FFTComplex *twiddle_exptab;
-
-    FFTComplex *exptab[6];
-
-    /**
-     * Calculate the middle half of the iMDCT
-     */
-    void (*imdct_half)(struct IMDCT15Context *s, float *dst, const float *src,
-                       ptrdiff_t src_stride, float scale);
-} IMDCT15Context;
-
-/**
- * Init an iMDCT of the length 2 * 15 * (2^N)
- */
-int ff_imdct15_init(IMDCT15Context **s, int N);
-
-/**
- * Free an iMDCT.
- */
-void ff_imdct15_uninit(IMDCT15Context **s);
-
-
-void ff_imdct15_init_aarch64(IMDCT15Context *s);
-
-#endif /* AVCODEC_IMDCT15_H */
diff --git a/libavcodec/imgconvert.c b/libavcodec/imgconvert.c
index 9c8fab5..1fd636c 100644
--- a/libavcodec/imgconvert.c
+++ b/libavcodec/imgconvert.c
@@ -2,20 +2,20 @@
  * Misc image conversion routines
  * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,123 +27,58 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
+#include "libavutil/avassert.h"
 #include "libavutil/colorspace.h"
 #include "libavutil/common.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/internal.h"
 #include "libavutil/imgutils.h"
 
-static int is_gray(const AVPixFmtDescriptor *desc)
+#if FF_API_GETCHROMA
+void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift)
 {
-    return desc->nb_components - (desc->flags & AV_PIX_FMT_FLAG_ALPHA) == 1;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(desc);
+    *h_shift = desc->log2_chroma_w;
+    *v_shift = desc->log2_chroma_h;
 }
+#endif
 
 int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt,
                              enum AVPixelFormat src_pix_fmt,
                              int has_alpha)
 {
-    const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
-    const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
-    int loss, i, nb_components = FFMIN(src_desc->nb_components,
-                                       dst_desc->nb_components);
-
-    /* compute loss */
-    loss = 0;
-
-    if (dst_pix_fmt == src_pix_fmt)
-        return 0;
-
-    for (i = 0; i < nb_components; i++)
-        if (src_desc->comp[i].depth > dst_desc->comp[i].depth)
-            loss |= FF_LOSS_DEPTH;
-
-    if (dst_desc->log2_chroma_w > src_desc->log2_chroma_w ||
-        dst_desc->log2_chroma_h > src_desc->log2_chroma_h)
-        loss |= FF_LOSS_RESOLUTION;
-
-    if ((src_desc->flags & AV_PIX_FMT_FLAG_RGB) != (dst_desc->flags & AV_PIX_FMT_FLAG_RGB))
-        loss |= FF_LOSS_COLORSPACE;
-
-    if (has_alpha && !(dst_desc->flags & AV_PIX_FMT_FLAG_ALPHA) &&
-         (src_desc->flags & AV_PIX_FMT_FLAG_ALPHA))
-        loss |= FF_LOSS_ALPHA;
-
-    if (dst_pix_fmt == AV_PIX_FMT_PAL8 && !is_gray(src_desc))
-        return loss | FF_LOSS_COLORQUANT;
-
-    if (src_desc->nb_components > dst_desc->nb_components)
-        if (is_gray(dst_desc))
-            loss |= FF_LOSS_CHROMA;
-
-    return loss;
+    return av_get_pix_fmt_loss(dst_pix_fmt, src_pix_fmt, has_alpha);
 }
 
-static enum AVPixelFormat avcodec_find_best_pix_fmt1(enum AVPixelFormat *pix_fmt_list,
-                                      enum AVPixelFormat src_pix_fmt,
-                                      int has_alpha,
-                                      int loss_mask)
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr)
 {
-    int dist, i, loss, min_dist;
-    enum AVPixelFormat dst_pix_fmt;
-
-    /* find exact color match with smallest size */
-    dst_pix_fmt = AV_PIX_FMT_NONE;
-    min_dist = 0x7fffffff;
-    i = 0;
-    while (pix_fmt_list[i] != AV_PIX_FMT_NONE) {
-        enum AVPixelFormat pix_fmt = pix_fmt_list[i];
-
-        if (i > AV_PIX_FMT_NB) {
-            av_log(NULL, AV_LOG_ERROR, "Pixel format list longer than expected, "
-                   "it is either not properly terminated or contains duplicates\n");
-            return AV_PIX_FMT_NONE;
-        }
+    return av_find_best_pix_fmt_of_2(dst_pix_fmt1, dst_pix_fmt2, src_pix_fmt, has_alpha, loss_ptr);
+}
 
-        loss = avcodec_get_pix_fmt_loss(pix_fmt, src_pix_fmt, has_alpha) & loss_mask;
-        if (loss == 0) {
-            dist = av_get_bits_per_pixel(av_pix_fmt_desc_get(pix_fmt));
-            if (dist < min_dist) {
-                min_dist = dist;
-                dst_pix_fmt = pix_fmt;
-            }
-        }
-        i++;
-    }
-    return dst_pix_fmt;
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr)
+{
+    return avcodec_find_best_pix_fmt_of_2(dst_pix_fmt1, dst_pix_fmt2, src_pix_fmt, has_alpha, loss_ptr);
 }
 
-enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat *pix_fmt_list,
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
                                             enum AVPixelFormat src_pix_fmt,
-                                            int has_alpha, int *loss_ptr)
-{
-    enum AVPixelFormat dst_pix_fmt;
-    int loss_mask, i;
-    static const int loss_mask_order[] = {
-        ~0, /* no loss first */
-        ~FF_LOSS_ALPHA,
-        ~FF_LOSS_RESOLUTION,
-        ~(FF_LOSS_COLORSPACE | FF_LOSS_RESOLUTION),
-        ~FF_LOSS_COLORQUANT,
-        ~FF_LOSS_DEPTH,
-        0,
-    };
-
-    /* try with successive loss */
-    i = 0;
-    for(;;) {
-        loss_mask = loss_mask_order[i++];
-        dst_pix_fmt = avcodec_find_best_pix_fmt1(pix_fmt_list, src_pix_fmt,
-                                                 has_alpha, loss_mask);
-        if (dst_pix_fmt >= 0)
-            goto found;
-        if (loss_mask == 0)
-            break;
+                                            int has_alpha, int *loss_ptr){
+    int i;
+
+    enum AVPixelFormat best = AV_PIX_FMT_NONE;
+    int loss;
+
+    for (i=0; pix_fmt_list[i] != AV_PIX_FMT_NONE; i++) {
+        loss = loss_ptr ? *loss_ptr : 0;
+        best = avcodec_find_best_pix_fmt_of_2(best, pix_fmt_list[i], src_pix_fmt, has_alpha, &loss);
     }
-    return AV_PIX_FMT_NONE;
- found:
+
     if (loss_ptr)
-        *loss_ptr = avcodec_get_pix_fmt_loss(dst_pix_fmt, src_pix_fmt, has_alpha);
-    return dst_pix_fmt;
+        *loss_ptr = loss;
+    return best;
 }
 
 #if FF_API_AVPICTURE
@@ -151,8 +86,22 @@ FF_DISABLE_DEPRECATION_WARNINGS
 /* return true if yuv planar */
 static inline int is_yuv_planar(const AVPixFmtDescriptor *desc)
 {
-    return (!(desc->flags & AV_PIX_FMT_FLAG_RGB) &&
-             (desc->flags & AV_PIX_FMT_FLAG_PLANAR));
+    int i;
+    int planes[4] = { 0 };
+
+    if (     desc->flags & AV_PIX_FMT_FLAG_RGB
+        || !(desc->flags & AV_PIX_FMT_FLAG_PLANAR))
+        return 0;
+
+    /* set the used planes */
+    for (i = 0; i < desc->nb_components; i++)
+        planes[desc->comp[i].plane] = 1;
+
+    /* if there is an unused plane, the format is not planar */
+    for (i = 0; i < desc->nb_components; i++)
+        if (!planes[i])
+            return 0;
+    return 1;
 }
 
 int av_picture_crop(AVPicture *dst, const AVPicture *src,
@@ -161,16 +110,24 @@ int av_picture_crop(AVPicture *dst, const AVPicture *src,
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     int y_shift;
     int x_shift;
+    int max_step[4];
 
-    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB || !is_yuv_planar(desc))
+    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB)
         return -1;
 
     y_shift = desc->log2_chroma_h;
     x_shift = desc->log2_chroma_w;
+    av_image_fill_max_pixsteps(max_step, NULL, desc);
 
+    if (is_yuv_planar(desc)) {
     dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + left_band;
     dst->data[1] = src->data[1] + ((top_band >> y_shift) * src->linesize[1]) + (left_band >> x_shift);
     dst->data[2] = src->data[2] + ((top_band >> y_shift) * src->linesize[2]) + (left_band >> x_shift);
+    } else{
+        if(top_band % (1<<y_shift) || left_band % (1<<x_shift))
+            return -1;
+        dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + (left_band * max_step[0]);
+    }
 
     dst->linesize[0] = src->linesize[0];
     dst->linesize[1] = src->linesize[1];
@@ -188,9 +145,41 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
     int x_shift;
     int yheight;
     int i, y;
+    int max_step[4];
+
+    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB)
+        return -1;
+
+    if (!is_yuv_planar(desc)) {
+        if (src)
+            return -1; //TODO: Not yet implemented
+
+        av_image_fill_max_pixsteps(max_step, NULL, desc);
+
+        if (padtop || padleft) {
+            memset(dst->data[0], color[0],
+                    dst->linesize[0] * padtop + (padleft * max_step[0]));
+        }
+
+        if (padleft || padright) {
+            optr = dst->data[0] + dst->linesize[0] * padtop +
+                    (dst->linesize[0] - (padright * max_step[0]));
+            yheight = height - 1 - (padtop + padbottom);
+            for (y = 0; y < yheight; y++) {
+                memset(optr, color[0], (padleft + padright) * max_step[0]);
+                optr += dst->linesize[0];
+            }
+        }
 
-    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB ||
-        !is_yuv_planar(desc)) return -1;
+        if (padbottom || padright) {
+            optr = dst->data[0] + dst->linesize[0] * (height - padbottom) -
+                    (padright * max_step[0]);
+            memset(optr, color[0], dst->linesize[0] * padbottom +
+                    (padright * max_step[0]));
+        }
+
+        return 0;
+    }
 
     for (i = 0; i < 3; i++) {
         x_shift = i ? desc->log2_chroma_w : 0;
@@ -236,8 +225,8 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
                 (padbottom >> y_shift) + (padright >> x_shift));
         }
     }
+
     return 0;
 }
-
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif /* FF_API_AVPICTURE */
diff --git a/libavcodec/imm4.c b/libavcodec/imm4.c
new file mode 100644
index 0000000..1a4d0de
--- /dev/null
+++ b/libavcodec/imm4.c
@@ -0,0 +1,549 @@
+/*
+ * Infinity IMM4 decoder
+ *
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/thread.h"
+
+#include "avcodec.h"
+#include "bswapdsp.h"
+#include "copy_block.h"
+#include "get_bits.h"
+#include "idctdsp.h"
+#include "internal.h"
+
+typedef struct IMM4Context {
+    BswapDSPContext bdsp;
+    GetBitContext  gb;
+
+    AVFrame *prev_frame;
+    uint8_t *bitstream;
+    int bitstream_size;
+
+    int factor;
+    unsigned lo;
+    unsigned hi;
+
+    ScanTable intra_scantable;
+    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    IDCTDSPContext idsp;
+} IMM4Context;
+
+static const uint8_t intra_cb[] = {
+    24, 18, 12
+};
+
+static const uint8_t inter_cb[] = {
+    30, 20, 15
+};
+
+static const uint8_t cbplo_symbols[] = {
+    3, 4, 19, 20, 35, 36, 51, 52
+};
+
+static const uint8_t cbplo_bits[] = {
+    1, 4, 3, 6, 3, 6, 3, 6
+};
+
+static const uint8_t cbplo_codes[] = {
+    1, 1, 1, 1, 2, 2, 3, 3
+};
+
+static const uint8_t cbphi_bits[] = {
+    4, 5, 5, 4, 5, 4, 6, 4, 5, 6, 4, 4, 4, 4, 4, 2
+};
+
+static const uint8_t cbphi_codes[] = {
+    3, 5, 4, 9, 3, 7, 2, 11, 2, 3, 5, 10, 4, 8, 6, 3
+};
+
+static const uint8_t blktype_symbols[] = {
+    0, 1, 2, 3, 4, 16, 17, 18, 19, 20, 32, 33, 34, 35, 48, 50, 51, 52
+};
+
+static const uint8_t blktype_bits[] = {
+    1, 3, 3, 5, 6, 4, 7, 7, 8, 9, 4, 7, 7, 8, 6, 8, 7, 9
+};
+
+static const uint8_t blktype_codes[] = {
+    1, 3, 2, 3, 4, 3, 7, 5, 4, 4, 2, 6, 4, 3, 5, 5, 3, 2
+};
+
+static const uint16_t block_symbols[] = {
+    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x81, 0x82, 0x83,
+    0x84, 0x85, 0x86, 0x101, 0x102, 0x103, 0x104, 0x181, 0x182, 0x183, 0x201, 0x202,
+    0x203, 0x281, 0x282, 0x283, 0x301, 0x302, 0x303, 0x381, 0x382, 0x401, 0x402,
+    0x481, 0x482, 0x501, 0x502, 0x581, 0x601, 0x681, 0x701, 0x781, 0x801, 0x881,
+    0x901, 0x981, 0xA01, 0xA81, 0xB01, 0xB81, 0xC01, 0xC81, 0xD01, 0x4001, 0x4002,
+    0x4003, 0x4081, 0x4082, 0x4101, 0x4181, 0x4201, 0x4281, 0x4301, 0x4381, 0x4401,
+    0x4481, 0x4501, 0x4581, 0x4601, 0x4681, 0x4701, 0x4781, 0x4801, 0x4881, 0x4901,
+    0x4981, 0x4A01, 0x4A81, 0x4B01, 0x4B81, 0x4C01, 0x4C81, 0x4D01, 0x4D81, 0x4E01,
+    0x4E81, 0x4F01, 0x4F81, 0x5001, 0x5081, 0x5101, 0x5181, 0x5201, 0x5281, 0x5301,
+    0x5381, 0x5401
+};
+
+static const uint8_t block_bits[] = {
+    7, 2, 4, 6, 7, 8, 9, 9, 10, 10, 11, 11, 11, 3, 6, 8, 10, 11, 12, 4, 8,
+    10, 12, 5, 9, 10, 5, 9, 12, 5, 10, 12, 6, 10, 12, 6, 10, 6, 10, 6,
+    10, 7, 12, 7, 7, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 11, 11, 12, 12, 4, 9,
+    11, 6, 11, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9,
+    9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+    12, 12
+};
+
+static const uint8_t block_codes[] = {
+    3, 2, 15, 21, 23, 31, 37, 36, 33, 32, 7, 6, 32, 6, 20, 30, 15, 33, 80,
+    14, 29, 14, 81, 13, 35, 13, 12, 34, 82, 11, 12, 83, 19, 11, 84, 18,
+    10, 17, 9, 16, 8, 22, 85, 21, 20, 28, 27, 33, 32, 31, 30, 29, 28,
+    27, 26, 34, 35, 86, 87, 7, 25, 5, 15, 4, 14, 13, 12, 19, 18, 17, 16,
+    26, 25, 24, 23, 22, 21, 20, 19, 24, 23, 22, 21, 20, 19, 18, 17, 7,
+    6, 5, 4, 36, 37, 38, 39, 88, 89, 90, 91, 92, 93, 94, 95
+};
+
+static VLC cbplo_tab;
+static VLC cbphi_tab;
+static VLC blktype_tab;
+static VLC block_tab;
+
+static int get_cbphi(GetBitContext *gb, int x)
+{
+    int value;
+
+    value = get_vlc2(gb, cbphi_tab.table, cbphi_tab.bits, 1);
+    if (value < 0)
+        return AVERROR_INVALIDDATA;
+
+    return x ? value : 15 - value;
+}
+
+static int decode_block(AVCodecContext *avctx, GetBitContext *gb,
+                        int block, int factor, int flag, int offset, int flag2)
+{
+    IMM4Context *s = avctx->priv_data;
+    const uint8_t *scantable = s->intra_scantable.permutated;
+    int i, last, len, factor2;
+
+    for (i = !flag; i < 64; i++) {
+        int value;
+
+        value = get_vlc2(gb, block_tab.table, block_tab.bits, 1);
+        if (value < 0)
+            return AVERROR_INVALIDDATA;
+        if (value == 0) {
+            last = get_bits1(gb);
+            len = get_bits(gb, 6);
+            factor2 = get_sbits(gb, 8);
+        } else {
+            factor2 = value & 0x7F;
+            last = (value >> 14) & 1;
+            len = (value >> 7) & 0x3F;
+            if (get_bits1(gb))
+                factor2 = -factor2;
+        }
+        i += len;
+        if (i >= 64)
+            break;
+        s->block[block][scantable[i]] = offset * (factor2 < 0 ? -1 : 1) + factor * factor2;
+        if (last)
+            break;
+    }
+
+    if (s->hi == 2 && flag2 && block < 4) {
+        if (flag)
+            s->block[block][scantable[0]]  *= 2;
+        s->block[block][scantable[1]]  *= 2;
+        s->block[block][scantable[8]]  *= 2;
+        s->block[block][scantable[16]] *= 2;
+    }
+
+    return 0;
+}
+
+static int decode_blocks(AVCodecContext *avctx, GetBitContext *gb,
+                         unsigned cbp, int flag, int offset, unsigned flag2)
+{
+    IMM4Context *s = avctx->priv_data;
+    const uint8_t *scantable = s->intra_scantable.permutated;
+    int ret, i;
+
+    memset(s->block, 0, sizeof(s->block));
+
+    for (i = 0; i < 6; i++) {
+        if (!flag) {
+            int x = get_bits(gb, 8);
+
+            if (x == 255)
+                x = 128;
+            x *= 8;
+
+            s->block[i][scantable[0]] = x;
+        }
+
+        if (cbp & (1 << (5 - i))) {
+            ret = decode_block(avctx, gb, i, s->factor, flag, offset, flag2);
+            if (ret < 0)
+                return ret;
+        }
+    }
+
+    return 0;
+}
+
+static int decode_intra(AVCodecContext *avctx, GetBitContext *gb, AVFrame *frame)
+{
+    IMM4Context *s = avctx->priv_data;
+    int ret, x, y, offset = 0;
+
+    if (s->hi == 0) {
+        if (s->lo > 2)
+            return AVERROR_INVALIDDATA;
+        s->factor = intra_cb[s->lo];
+    } else {
+        s->factor = s->lo * 2;
+    }
+
+    if (s->hi) {
+        offset = s->factor;
+        offset >>= 1;
+        if (!(offset & 1))
+            offset--;
+    }
+
+    for (y = 0; y < avctx->height; y += 16) {
+        for (x = 0; x < avctx->width; x += 16) {
+            unsigned flag, cbphi, cbplo;
+
+            cbplo = get_vlc2(gb, cbplo_tab.table, cbplo_tab.bits, 1) >> 4;
+            flag = get_bits1(gb);
+
+            cbphi = get_cbphi(gb, 1);
+
+            ret = decode_blocks(avctx, gb, cbplo | (cbphi << 2), 0, offset, flag);
+            if (ret < 0)
+                return ret;
+
+            s->idsp.idct_put(frame->data[0] + y * frame->linesize[0] + x,
+                             frame->linesize[0], s->block[0]);
+            s->idsp.idct_put(frame->data[0] + y * frame->linesize[0] + x + 8,
+                             frame->linesize[0], s->block[1]);
+            s->idsp.idct_put(frame->data[0] + (y + 8) * frame->linesize[0] + x,
+                             frame->linesize[0], s->block[2]);
+            s->idsp.idct_put(frame->data[0] + (y + 8) * frame->linesize[0] + x + 8,
+                             frame->linesize[0], s->block[3]);
+            s->idsp.idct_put(frame->data[1] + (y >> 1) * frame->linesize[1] + (x >> 1),
+                             frame->linesize[1], s->block[4]);
+            s->idsp.idct_put(frame->data[2] + (y >> 1) * frame->linesize[2] + (x >> 1),
+                             frame->linesize[2], s->block[5]);
+        }
+    }
+
+    return 0;
+}
+
+static int decode_inter(AVCodecContext *avctx, GetBitContext *gb,
+                        AVFrame *frame, AVFrame *prev)
+{
+    IMM4Context *s = avctx->priv_data;
+    int ret, x, y, offset = 0;
+
+    if (s->hi == 0) {
+        if (s->lo > 2)
+            return AVERROR_INVALIDDATA;
+        s->factor = inter_cb[s->lo];
+    } else {
+        s->factor = s->lo * 2;
+    }
+
+    if (s->hi) {
+        offset = s->factor;
+        offset >>= 1;
+        if (!(offset & 1))
+            offset--;
+    }
+
+    for (y = 0; y < avctx->height; y += 16) {
+        for (x = 0; x < avctx->width; x += 16) {
+            int reverse, intra_block, value;
+            unsigned cbphi, cbplo, flag2 = 0;
+
+            if (get_bits1(gb)) {
+                copy_block16(frame->data[0] + y * frame->linesize[0] + x,
+                             prev->data[0] + y * prev->linesize[0] + x,
+                             frame->linesize[0], prev->linesize[0], 16);
+                copy_block8(frame->data[1] + (y >> 1) * frame->linesize[1] + (x >> 1),
+                            prev->data[1] + (y >> 1) * prev->linesize[1] + (x >> 1),
+                            frame->linesize[1], prev->linesize[1], 8);
+                copy_block8(frame->data[2] + (y >> 1) * frame->linesize[2] + (x >> 1),
+                            prev->data[2] + (y >> 1) * prev->linesize[2] + (x >> 1),
+                            frame->linesize[2], prev->linesize[2], 8);
+                continue;
+            }
+
+            value = get_vlc2(gb, blktype_tab.table, blktype_tab.bits, 1);
+            if (value < 0)
+                return AVERROR_INVALIDDATA;
+
+            intra_block = value & 0x07;
+            reverse = intra_block == 3;
+            if (reverse)
+                flag2 = get_bits1(gb);
+
+            cbplo = value >> 4;
+            cbphi = get_cbphi(gb, reverse);
+            if (intra_block) {
+                ret = decode_blocks(avctx, gb, cbplo | (cbphi << 2), 0, offset, flag2);
+                if (ret < 0)
+                    return ret;
+
+                s->idsp.idct_put(frame->data[0] + y * frame->linesize[0] + x,
+                                 frame->linesize[0], s->block[0]);
+                s->idsp.idct_put(frame->data[0] + y * frame->linesize[0] + x + 8,
+                                 frame->linesize[0], s->block[1]);
+                s->idsp.idct_put(frame->data[0] + (y + 8) * frame->linesize[0] + x,
+                                 frame->linesize[0], s->block[2]);
+                s->idsp.idct_put(frame->data[0] + (y + 8) * frame->linesize[0] + x + 8,
+                                 frame->linesize[0], s->block[3]);
+                s->idsp.idct_put(frame->data[1] + (y >> 1) * frame->linesize[1] + (x >> 1),
+                                 frame->linesize[1], s->block[4]);
+                s->idsp.idct_put(frame->data[2] + (y >> 1) * frame->linesize[2] + (x >> 1),
+                                 frame->linesize[2], s->block[5]);
+            } else {
+                flag2 = get_bits1(gb);
+                skip_bits1(gb);
+                ret = decode_blocks(avctx, gb, cbplo | (cbphi << 2), 1, offset, flag2);
+                if (ret < 0)
+                    return ret;
+
+                copy_block16(frame->data[0] + y * frame->linesize[0] + x,
+                             prev->data[0] + y * prev->linesize[0] + x,
+                             frame->linesize[0], prev->linesize[0], 16);
+                copy_block8(frame->data[1] + (y >> 1) * frame->linesize[1] + (x >> 1),
+                            prev->data[1] + (y >> 1) * prev->linesize[1] + (x >> 1),
+                            frame->linesize[1], prev->linesize[1], 8);
+                copy_block8(frame->data[2] + (y >> 1) * frame->linesize[2] + (x >> 1),
+                            prev->data[2] + (y >> 1) * prev->linesize[2] + (x >> 1),
+                            frame->linesize[2], prev->linesize[2], 8);
+
+                s->idsp.idct_add(frame->data[0] + y * frame->linesize[0] + x,
+                                 frame->linesize[0], s->block[0]);
+                s->idsp.idct_add(frame->data[0] + y * frame->linesize[0] + x + 8,
+                                 frame->linesize[0], s->block[1]);
+                s->idsp.idct_add(frame->data[0] + (y + 8) * frame->linesize[0] + x,
+                                 frame->linesize[0], s->block[2]);
+                s->idsp.idct_add(frame->data[0] + (y + 8) * frame->linesize[0] + x + 8,
+                                 frame->linesize[0], s->block[3]);
+                s->idsp.idct_add(frame->data[1] + (y >> 1) * frame->linesize[1] + (x >> 1),
+                                 frame->linesize[1], s->block[4]);
+                s->idsp.idct_add(frame->data[2] + (y >> 1) * frame->linesize[2] + (x >> 1),
+                                 frame->linesize[2], s->block[5]);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    IMM4Context *s = avctx->priv_data;
+    GetBitContext *gb = &s->gb;
+    AVFrame *frame = data;
+    int width, height;
+    unsigned type;
+    int ret, scaled;
+
+    if (avpkt->size <= 32)
+        return AVERROR_INVALIDDATA;
+
+    av_fast_padded_malloc(&s->bitstream, &s->bitstream_size,
+                          FFALIGN(avpkt->size, 4));
+    if (!s->bitstream)
+        return AVERROR(ENOMEM);
+
+    s->bdsp.bswap_buf((uint32_t *)s->bitstream,
+                      (uint32_t *)avpkt->data,
+                      (avpkt->size + 3) >> 2);
+
+    if ((ret = init_get_bits8(gb, s->bitstream, FFALIGN(avpkt->size, 4))) < 0)
+        return ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+    avctx->color_range = AVCOL_RANGE_JPEG;
+
+    width = avctx->width;
+    height = avctx->height;
+
+    scaled = avpkt->data[8];
+    if (scaled < 2) {
+        int mode = avpkt->data[10];
+
+        switch (mode) {
+        case 1:
+            width = 352;
+            height = 240;
+            break;
+        case 2:
+            width = 704;
+            height = 240;
+            break;
+        case 4:
+            width = 480;
+            height = 704;
+            break;
+        case 17:
+            width = 352;
+            height = 288;
+            break;
+        case 18:
+            width = 704;
+            height = 288;
+            break;
+        default:
+            width = 704;
+            height = 576;
+            break;
+        }
+    }
+
+    skip_bits_long(gb, 24 * 8);
+    type = get_bits_long(gb, 32);
+    s->hi = get_bits(gb, 16);
+    s->lo = get_bits(gb, 16);
+
+    switch (type) {
+    case 0x19781977:
+        frame->key_frame = 1;
+        frame->pict_type = AV_PICTURE_TYPE_I;
+        break;
+    case 0x12250926:
+        frame->key_frame = 0;
+        frame->pict_type = AV_PICTURE_TYPE_P;
+        break;
+    default:
+        avpriv_request_sample(avctx, "type %X", type);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (avctx->width  != width ||
+        avctx->height != height) {
+        if (!frame->key_frame) {
+            av_log(avctx, AV_LOG_ERROR, "Frame size change is unsupported.\n");
+            return AVERROR_INVALIDDATA;
+        }
+        av_frame_unref(s->prev_frame);
+    }
+
+    ret = ff_set_dimensions(avctx, width, height);
+    if (ret < 0)
+        return ret;
+
+    if ((ret = ff_get_buffer(avctx, frame, frame->key_frame ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
+        return ret;
+
+    if (frame->key_frame) {
+        ret = decode_intra(avctx, gb, frame);
+        if (ret < 0)
+            return ret;
+
+        av_frame_unref(s->prev_frame);
+        if ((ret = av_frame_ref(s->prev_frame, frame)) < 0)
+            return ret;
+    } else {
+        if (!s->prev_frame->data[0]) {
+            av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ret = decode_inter(avctx, gb, frame, s->prev_frame);
+        if (ret < 0)
+            return ret;
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold void imm4_init_static_data(void)
+{
+    INIT_VLC_SPARSE_STATIC(&cbplo_tab, 9, FF_ARRAY_ELEMS(cbplo_bits),
+                           cbplo_bits, 1, 1, cbplo_codes, 1, 1, cbplo_symbols, 1, 1, 512);
+
+    INIT_VLC_SPARSE_STATIC(&cbphi_tab, 6, FF_ARRAY_ELEMS(cbphi_bits),
+                           cbphi_bits, 1, 1, cbphi_codes, 1, 1, NULL, 0, 0, 64);
+
+    INIT_VLC_SPARSE_STATIC(&blktype_tab, 9, FF_ARRAY_ELEMS(blktype_bits),
+                           blktype_bits, 1, 1, blktype_codes, 1, 1, blktype_symbols, 1, 1, 512);
+
+    INIT_VLC_SPARSE_STATIC(&block_tab, 12, FF_ARRAY_ELEMS(block_bits),
+                           block_bits, 1, 1, block_codes, 1, 1, block_symbols, 2, 2, 4096);
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    static AVOnce init_static_once = AV_ONCE_INIT;
+    IMM4Context *s = avctx->priv_data;
+    uint8_t table[64];
+
+    for (int i = 0; i < 64; i++)
+        table[i] = i;
+
+    ff_bswapdsp_init(&s->bdsp);
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, table);
+
+    s->prev_frame = av_frame_alloc();
+    if (!s->prev_frame)
+        return AVERROR(ENOMEM);
+
+    ff_thread_once(&init_static_once, imm4_init_static_data);
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    IMM4Context *s = avctx->priv_data;
+
+    av_frame_free(&s->prev_frame);
+    av_freep(&s->bitstream);
+    s->bitstream_size = 0;
+
+    return 0;
+}
+
+AVCodec ff_imm4_decoder = {
+    .name             = "imm4",
+    .long_name        = NULL_IF_CONFIG_SMALL("Infinity IMM4"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_IMM4,
+    .priv_data_size   = sizeof(IMM4Context),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/imx_dump_header_bsf.c b/libavcodec/imx_dump_header_bsf.c
index 71bda02..9a9de05 100644
--- a/libavcodec/imx_dump_header_bsf.c
+++ b/libavcodec/imx_dump_header_bsf.c
@@ -2,20 +2,20 @@
  * imx dump header bitstream filter
  * Copyright (c) 2007 Baptiste Coudurier
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo2.c b/libavcodec/indeo2.c
index 64e92d7..4971b84 100644
--- a/libavcodec/indeo2.c
+++ b/libavcodec/indeo2.c
@@ -2,20 +2,20 @@
  * Intel Indeo 2 codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,16 +28,15 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "indeo2data.h"
 #include "internal.h"
 #include "mathops.h"
-#include "vlc.h"
 
 typedef struct Ir2Context{
     AVCodecContext *avctx;
     AVFrame *picture;
-    BitstreamContext bc;
+    GetBitContext gb;
     int decode_delta;
 } Ir2Context;
 
@@ -45,9 +44,9 @@ typedef struct Ir2Context{
 static VLC ir2_vlc;
 
 /* Indeo 2 codes are in range 0x01..0x7F and 0x81..0x90 */
-static inline int ir2_get_code(BitstreamContext *bc)
+static inline int ir2_get_code(GetBitContext *gb)
 {
-    return bitstream_read_vlc(bc, ir2_vlc.table, CODE_VLC_BITS, 1) + 1;
+    return get_vlc2(gb, ir2_vlc.table, CODE_VLC_BITS, 1) + 1;
 }
 
 static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst,
@@ -56,15 +55,13 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
     int i;
     int j;
     int out = 0;
-    int c;
-    int t;
 
     if (width & 1)
         return AVERROR_INVALIDDATA;
 
     /* first line contain absolute values, other lines contain deltas */
     while (out < width) {
-        c = ir2_get_code(&ctx->bc);
+        int c = ir2_get_code(&ctx->gb);
         if (c >= 0x80) { /* we have a run */
             c -= 0x7F;
             if (out + c*2 > width)
@@ -72,6 +69,8 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
             for (i = 0; i < c * 2; i++)
                 dst[out++] = 0x80;
         } else { /* copy two values from table */
+            if (c <= 0)
+                return AVERROR_INVALIDDATA;
             dst[out++] = table[c * 2];
             dst[out++] = table[(c * 2) + 1];
         }
@@ -80,8 +79,10 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
 
     for (j = 1; j < height; j++) {
         out = 0;
+        if (get_bits_left(&ctx->gb) <= 0)
+            return AVERROR_INVALIDDATA;
         while (out < width) {
-            c = ir2_get_code(&ctx->bc);
+            int c = ir2_get_code(&ctx->gb);
             if (c >= 0x80) { /* we have a skip */
                 c -= 0x7F;
                 if (out + c*2 > width)
@@ -91,6 +92,9 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
                     out++;
                 }
             } else { /* add two deltas from table */
+                int t;
+                if (c <= 0)
+                    return AVERROR_INVALIDDATA;
                 t        = dst[out - pitch] + (table[c * 2] - 128);
                 t        = av_clip_uint8(t);
                 dst[out] = t;
@@ -119,12 +123,16 @@ static int ir2_decode_plane_inter(Ir2Context *ctx, int width, int height, uint8_
 
     for (j = 0; j < height; j++) {
         out = 0;
+        if (get_bits_left(&ctx->gb) <= 0)
+            return AVERROR_INVALIDDATA;
         while (out < width) {
-            c = ir2_get_code(&ctx->bc);
+            c = ir2_get_code(&ctx->gb);
             if (c >= 0x80) { /* we have a skip */
                 c   -= 0x7F;
                 out += c * 2;
             } else { /* add two deltas from table */
+                if (c <= 0)
+                    return AVERROR_INVALIDDATA;
                 t        = dst[out] + (((table[c * 2] - 128)*3) >> 2);
                 t        = av_clip_uint8(t);
                 dst[out] = t;
@@ -152,10 +160,8 @@ static int ir2_decode_frame(AVCodecContext *avctx,
     int start, ret;
     int ltab, ctab;
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
 
     start = 48; /* hardcoded for now */
 
@@ -172,10 +178,17 @@ static int ir2_decode_frame(AVCodecContext *avctx,
         buf[i] = ff_reverse[buf[i]];
 #endif
 
-    bitstream_init8(&s->bc, buf + start, buf_size - start);
+    if ((ret = init_get_bits8(&s->gb, buf + start, buf_size - start)) < 0)
+        return ret;
 
     ltab = buf[0x22] & 3;
     ctab = buf[0x22] >> 2;
+
+    if (ctab > 3) {
+        av_log(avctx, AV_LOG_ERROR, "ctab %d is invalid\n", ctab);
+        return AVERROR_INVALIDDATA;
+    }
+
     if (s->decode_delta) { /* intraframe */
         if ((ret = ir2_decode_plane(s, avctx->width, avctx->height,
                                     p->data[0], p->linesize[0],
diff --git a/libavcodec/indeo2data.h b/libavcodec/indeo2data.h
index 3d7411d..bfdb0a6 100644
--- a/libavcodec/indeo2data.h
+++ b/libavcodec/indeo2data.h
@@ -2,20 +2,20 @@
  * Intel Indeo 2 codec
  * copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo3.c b/libavcodec/indeo3.c
index 22a072d..71d478c 100644
--- a/libavcodec/indeo3.c
+++ b/libavcodec/indeo3.c
@@ -2,20 +2,20 @@
  * Indeo Video v3 compatible decoder
  * Copyright (c) 2009 - 2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,10 +31,10 @@
 
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "copy_block.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "hpeldsp.h"
 #include "internal.h"
 
@@ -84,7 +84,7 @@ typedef struct Indeo3DecodeContext {
     AVCodecContext *avctx;
     HpelDSPContext  hdsp;
 
-    BitstreamContext bc;
+    GetBitContext   gb;
     int             need_resync;
     int             skip_bits;
     const uint8_t   *next_cell_data;
@@ -94,7 +94,7 @@ typedef struct Indeo3DecodeContext {
 
     int16_t         width, height;
     uint32_t        frame_num;      ///< current frame number (zero-based)
-    uint32_t        data_size;      ///< size of the frame data in bytes
+    int             data_size;      ///< size of the frame data in bytes
     uint16_t        frame_flags;    ///< frame properties
     uint8_t         cb_offset;      ///< needed for selecting VQ tables
     uint8_t         buf_sel;        ///< active frame buffer: 0 - primary, 1 -secondary
@@ -148,16 +148,27 @@ static av_cold void build_requant_tab(void)
 }
 
 
+static av_cold void free_frame_buffers(Indeo3DecodeContext *ctx)
+{
+    int p;
+
+    ctx->width = ctx->height = 0;
+
+    for (p = 0; p < 3; p++) {
+        av_freep(&ctx->planes[p].buffers[0]);
+        av_freep(&ctx->planes[p].buffers[1]);
+        ctx->planes[p].pixels[0] = ctx->planes[p].pixels[1] = 0;
+    }
+}
+
+
 static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
-                                          AVCodecContext *avctx)
+                                          AVCodecContext *avctx, int luma_width, int luma_height)
 {
-    int p, luma_width, luma_height, chroma_width, chroma_height;
+    int p, chroma_width, chroma_height;
     int luma_size, chroma_size;
     ptrdiff_t luma_pitch, chroma_pitch;
 
-    luma_width  = ctx->width;
-    luma_height = ctx->height;
-
     if (luma_width  < 16 || luma_width  > 640 ||
         luma_height < 16 || luma_height > 480 ||
         luma_width  &  3 || luma_height &   3) {
@@ -166,6 +177,9 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
         return AVERROR_INVALIDDATA;
     }
 
+    ctx->width  = luma_width ;
+    ctx->height = luma_height;
+
     chroma_width  = FFALIGN(luma_width  >> 2, 4);
     chroma_height = FFALIGN(luma_height >> 2, 4);
 
@@ -189,6 +203,11 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
         ctx->planes[p].buffers[0] = av_malloc(!p ? luma_size : chroma_size);
         ctx->planes[p].buffers[1] = av_malloc(!p ? luma_size : chroma_size);
 
+        if (!ctx->planes[p].buffers[0] || !ctx->planes[p].buffers[1]) {
+            free_frame_buffers(ctx);
+            return AVERROR(ENOMEM);
+        }
+
         /* fill the INTRA prediction lines with the middle pixel value = 64 */
         memset(ctx->planes[p].buffers[0], 0x40, ctx->planes[p].pitch);
         memset(ctx->planes[p].buffers[1], 0x40, ctx->planes[p].pitch);
@@ -203,19 +222,6 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
     return 0;
 }
 
-
-static av_cold void free_frame_buffers(Indeo3DecodeContext *ctx)
-{
-    int p;
-
-    for (p = 0; p < 3; p++) {
-        av_freep(&ctx->planes[p].buffers[0]);
-        av_freep(&ctx->planes[p].buffers[1]);
-        ctx->planes[p].pixels[0] = ctx->planes[p].pixels[1] = 0;
-    }
-}
-
-
 /**
  *  Copy pixels of the cell(x + mv_x, y + mv_y) from the previous frame into
  *  the cell(x, y) in the current frame.
@@ -232,8 +238,11 @@ static int copy_cell(Indeo3DecodeContext *ctx, Plane *plane, Cell *cell)
     /* setup output and reference pointers */
     offset_dst  = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
     dst         = plane->pixels[ctx->buf_sel] + offset_dst;
+    if(cell->mv_ptr){
     mv_y        = cell->mv_ptr[0];
     mv_x        = cell->mv_ptr[1];
+    }else
+        mv_x= mv_y= 0;
 
     /* -1 because there is an extra line on top for prediction */
     if ((cell->ypos << 2) + mv_y < -1 || (cell->xpos << 2) + mv_x < 0 ||
@@ -335,7 +344,7 @@ if (*data_ptr >= last_ptr) \
 
 #define RLE_BLOCK_COPY \
     if (cell->mv_ptr || !skip_flag) \
-        ctx->hdsp.put_pixels_tab[2][0](dst, ref, row_offset, 4 << v_zoom)
+        copy_block4(dst, ref, row_offset, row_offset, 4 << v_zoom)
 
 #define RLE_BLOCK_COPY_8 \
     pix64 = AV_RN64(ref);\
@@ -347,7 +356,7 @@ if (*data_ptr >= last_ptr) \
         fill_64(dst, pix64, 8, row_offset)
 
 #define RLE_LINES_COPY \
-    ctx->hdsp.put_pixels_tab[2][0](dst, ref, row_offset, num_lines << v_zoom)
+    copy_block4(dst, ref, row_offset, row_offset, num_lines << v_zoom)
 
 #define RLE_LINES_COPY_M10 \
     pix64 = AV_RN64(ref);\
@@ -590,6 +599,7 @@ static int decode_cell(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     /* setup output and reference pointers */
     offset = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
     block  =  plane->pixels[ctx->buf_sel] + offset;
+
     if (!cell->mv_ptr) {
         /* use previous line as reference for INTRA cells */
         ref_block = block - plane->pitch;
@@ -644,7 +654,7 @@ static int decode_cell(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     /* of the predicted cell in order to avoid overflows. */
     if (vq_index >= 8 && ref_block) {
         for (x = 0; x < cell->width << 2; x++)
-            ref_block[x] = requant_tab[vq_index & 7][ref_block[x]];
+            ref_block[x] = requant_tab[vq_index & 7][ref_block[x] & 127];
     }
 
     error = IV3_NOERR;
@@ -726,8 +736,8 @@ enum {
     ctx->need_resync = 1
 
 #define RESYNC_BITSTREAM \
-    if (ctx->need_resync && !(bitstream_tell(&ctx->bc) & 7)) { \
-        bitstream_skip(&ctx->bc, ctx->skip_bits);              \
+    if (ctx->need_resync && !(get_bits_count(&ctx->gb) & 7)) { \
+        skip_bits_long(&ctx->gb, ctx->skip_bits);              \
         ctx->skip_bits   = 0;                                  \
         ctx->need_resync = 0;                                  \
     }
@@ -772,9 +782,9 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
             return AVERROR_INVALIDDATA;
     }
 
-    while (1) { /* loop until return */
+    while (get_bits_left(&ctx->gb) >= 2) { /* loop until return */
         RESYNC_BITSTREAM;
-        switch (code = bitstream_read(&ctx->bc, 2)) {
+        switch (code = get_bits(&ctx->gb, 2)) {
         case H_SPLIT:
         case V_SPLIT:
             if (parse_bintree(ctx, avctx, plane, code, &curr_cell, depth - 1, strip_width))
@@ -786,7 +796,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 curr_cell.tree   = 1; /* enter the VQ tree */
             } else { /* VQ tree NULL code */
                 RESYNC_BITSTREAM;
-                code = bitstream_read(&ctx->bc, 2);
+                code = get_bits(&ctx->gb, 2);
                 if (code >= 2) {
                     av_log(avctx, AV_LOG_ERROR, "Invalid VQ_NULL code: %d\n", code);
                     return AVERROR_INVALIDDATA;
@@ -797,6 +807,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 CHECK_CELL
                 if (!curr_cell.mv_ptr)
                     return AVERROR_INVALIDDATA;
+
                 ret = copy_cell(ctx, plane, &curr_cell);
                 return ret;
             }
@@ -806,7 +817,11 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 unsigned mv_idx;
                 /* get motion vector index and setup the pointer to the mv set */
                 if (!ctx->need_resync)
-                    ctx->next_cell_data = &ctx->bc.buffer[(bitstream_tell(&ctx->bc) + 7) >> 3];
+                    ctx->next_cell_data = &ctx->gb.buffer[(get_bits_count(&ctx->gb) + 7) >> 3];
+                if (ctx->next_cell_data >= ctx->last_byte) {
+                    av_log(avctx, AV_LOG_ERROR, "motion vector out of array\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 mv_idx = *(ctx->next_cell_data++);
                 if (mv_idx >= ctx->num_vectors) {
                     av_log(avctx, AV_LOG_ERROR, "motion vector index out of range\n");
@@ -817,7 +832,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 UPDATE_BITPOS(8);
             } else { /* VQ tree DATA code */
                 if (!ctx->need_resync)
-                    ctx->next_cell_data = &ctx->bc.buffer[(bitstream_tell(&ctx->bc) + 7) >> 3];
+                    ctx->next_cell_data = &ctx->gb.buffer[(get_bits_count(&ctx->gb) + 7) >> 3];
 
                 CHECK_CELL
                 bytes_used = decode_cell(ctx, avctx, plane, &curr_cell,
@@ -832,6 +847,8 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
             break;
         }
     }//while
+
+    return AVERROR_INVALIDDATA;
 }
 
 
@@ -844,24 +861,24 @@ static int decode_plane(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
 
     /* each plane data starts with mc_vector_count field, */
     /* an optional array of motion vectors followed by the vq data */
-    num_vectors = bytestream_get_le32(&data);
+    num_vectors = bytestream_get_le32(&data); data_size -= 4;
     if (num_vectors > 256) {
         av_log(ctx->avctx, AV_LOG_ERROR,
                "Read invalid number of motion vectors %d\n", num_vectors);
         return AVERROR_INVALIDDATA;
     }
-    if (num_vectors * 2 >= data_size)
+    if (num_vectors * 2 > data_size)
         return AVERROR_INVALIDDATA;
 
     ctx->num_vectors = num_vectors;
     ctx->mc_vectors  = num_vectors ? data : 0;
 
     /* init the bitreader */
-    bitstream_init8(&ctx->bc, &data[num_vectors * 2], data_size - num_vectors * 2);
+    init_get_bits(&ctx->gb, &data[num_vectors * 2], (data_size - num_vectors * 2) << 3);
     ctx->skip_bits   = 0;
     ctx->need_resync = 0;
 
-    ctx->last_byte = data + data_size - 1;
+    ctx->last_byte = data + data_size;
 
     /* initialize the 1st cell and set its dimensions to whole plane */
     curr_cell.xpos   = curr_cell.ypos = 0;
@@ -882,7 +899,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     GetByteContext gb;
     const uint8_t   *bs_hdr;
     uint32_t        frame_num, word2, check_sum, data_size;
-    uint32_t        y_offset, u_offset, v_offset, starts[3], ends[3];
+    int             y_offset, u_offset, v_offset;
+    uint32_t        starts[3], ends[3];
     uint16_t        height, width;
     int             i, j;
 
@@ -936,12 +954,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                    "Invalid picture dimensions: %d x %d!\n", width, height);
             return AVERROR_INVALIDDATA;
         }
-
-        ctx->width  = width;
-        ctx->height = height;
-
         free_frame_buffers(ctx);
-        if ((res = allocate_frame_buffers(ctx, avctx)) < 0)
+        if ((res = allocate_frame_buffers(ctx, avctx, width, height)) < 0)
              return res;
         if ((res = ff_set_dimensions(avctx, width, height)) < 0)
             return res;
@@ -968,7 +982,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     ctx->y_data_size = ends[0] - starts[0];
     ctx->v_data_size = ends[1] - starts[1];
     ctx->u_data_size = ends[2] - starts[2];
-    if (FFMAX3(y_offset, v_offset, u_offset) >= ctx->data_size - 16 ||
+    if (FFMIN3(y_offset, v_offset, u_offset) < 0 ||
+        FFMAX3(y_offset, v_offset, u_offset) >= ctx->data_size - 16 ||
         FFMIN3(y_offset, v_offset, u_offset) < gb.buffer - bs_hdr + 16 ||
         FFMIN3(ctx->y_data_size, ctx->v_data_size, ctx->u_data_size) <= 0) {
         av_log(avctx, AV_LOG_ERROR, "One of the y/u/v offsets is invalid\n");
@@ -1039,17 +1054,13 @@ static av_cold int decode_init(AVCodecContext *avctx)
     Indeo3DecodeContext *ctx = avctx->priv_data;
 
     ctx->avctx     = avctx;
-    ctx->width     = avctx->width;
-    ctx->height    = avctx->height;
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
 
     build_requant_tab();
 
     ff_hpeldsp_init(&ctx->hdsp, avctx->flags);
 
-    allocate_frame_buffers(ctx, avctx);
-
-    return 0;
+    return allocate_frame_buffers(ctx, avctx, avctx->width, avctx->height);
 }
 
 
@@ -1085,6 +1096,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     /* use BS_BUFFER flag for buffer switching */
     ctx->buf_sel = (ctx->frame_flags >> BS_BUFFER) & 1;
 
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
+        return res;
+
     /* decode luma plane */
     if ((res = decode_plane(ctx, avctx, ctx->planes, ctx->y_data_ptr, ctx->y_data_size, 40)))
         return res;
@@ -1096,11 +1110,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if ((res = decode_plane(ctx, avctx, &ctx->planes[2], ctx->v_data_ptr, ctx->v_data_size, 10)))
         return res;
 
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return res;
-    }
-
     output_plane(&ctx->planes[0], ctx->buf_sel,
                  frame->data[0], frame->linesize[0],
                  avctx->height);
diff --git a/libavcodec/indeo3data.h b/libavcodec/indeo3data.h
index 41a29e5..fbe76af 100644
--- a/libavcodec/indeo3data.h
+++ b/libavcodec/indeo3data.h
@@ -2,20 +2,20 @@
  * Indeo Video v3 compatible decoder
  * Copyright (c) 2009 - 2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo4.c b/libavcodec/indeo4.c
index 37b5da8..4bfc6cd 100644
--- a/libavcodec/indeo4.c
+++ b/libavcodec/indeo4.c
@@ -2,20 +2,20 @@
  * Indeo Video Interactive v4 compatible decoder
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,12 +29,12 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
+#include "libavutil/imgutils.h"
 #include "indeo4data.h"
 #include "internal.h"
 #include "ivi.h"
 #include "ivi_dsp.h"
-#include "vlc.h"
 
 #define IVI4_PIC_SIZE_ESC   7
 
@@ -71,19 +71,19 @@ static const struct {
  *  - 4 wavelet bands per plane, size factor 1:4, code pattern: 2,3,3,3,3
  *  Anything else is either unsupported or corrupt.
  *
- *  @param[in,out] bc    the Bitstream context
+ *  @param[in,out] gb    the GetBit context
  *  @return        number of wavelet bands or 0 on error
  */
-static int decode_plane_subdivision(BitstreamContext *bc)
+static int decode_plane_subdivision(GetBitContext *gb)
 {
     int i;
 
-    switch (bitstream_read(bc, 2)) {
+    switch (get_bits(gb, 2)) {
     case 3:
         return 1;
     case 2:
         for (i = 0; i < 4; i++)
-            if (bitstream_read(bc, 2) != 3)
+            if (get_bits(gb, 2) != 3)
                 return 0;
         return 4;
     default:
@@ -108,13 +108,13 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
     int             pic_size_indx, i, p;
     IVIPicConfig    pic_conf;
 
-    if (bitstream_read(&ctx->bc, 18) != 0x3FFF8) {
+    if (get_bits(&ctx->gb, 18) != 0x3FFF8) {
         av_log(avctx, AV_LOG_ERROR, "Invalid picture start code!\n");
         return AVERROR_INVALIDDATA;
     }
 
     ctx->prev_frame_type = ctx->frame_type;
-    ctx->frame_type      = bitstream_read(&ctx->bc, 3);
+    ctx->frame_type      = get_bits(&ctx->gb, 3);
     if (ctx->frame_type == 7) {
         av_log(avctx, AV_LOG_ERROR, "Invalid frame type: %d\n", ctx->frame_type);
         return AVERROR_INVALIDDATA;
@@ -123,15 +123,15 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
     if (ctx->frame_type == IVI4_FRAMETYPE_BIDIR)
         ctx->has_b_frames = 1;
 
-    ctx->has_transp = bitstream_read_bit(&ctx->bc);
+    ctx->has_transp = get_bits1(&ctx->gb);
 
     /* unknown bit: Mac decoder ignores this bit, XANIM returns error */
-    if (bitstream_read_bit(&ctx->bc)) {
+    if (get_bits1(&ctx->gb)) {
         av_log(avctx, AV_LOG_ERROR, "Sync bit is set!\n");
         return AVERROR_INVALIDDATA;
     }
 
-    ctx->data_size = bitstream_read_bit(&ctx->bc) ? bitstream_read(&ctx->bc, 24) : 0;
+    ctx->data_size = get_bits1(&ctx->gb) ? get_bits(&ctx->gb, 24) : 0;
 
     /* null frames don't contain anything else so we just return */
     if (ctx->frame_type >= IVI4_FRAMETYPE_NULL_FIRST) {
@@ -142,32 +142,32 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
     /* Check key lock status. If enabled - ignore lock word.         */
     /* Usually we have to prompt the user for the password, but      */
     /* we don't do that because Indeo 4 videos can be decoded anyway */
-    if (bitstream_read_bit(&ctx->bc)) {
-        bitstream_skip(&ctx->bc, 32);
+    if (get_bits1(&ctx->gb)) {
+        skip_bits_long(&ctx->gb, 32);
         ff_dlog(avctx, "Password-protected clip!\n");
     }
 
-    pic_size_indx = bitstream_read(&ctx->bc, 3);
+    pic_size_indx = get_bits(&ctx->gb, 3);
     if (pic_size_indx == IVI4_PIC_SIZE_ESC) {
-        pic_conf.pic_height = bitstream_read(&ctx->bc, 16);
-        pic_conf.pic_width  = bitstream_read(&ctx->bc, 16);
+        pic_conf.pic_height = get_bits(&ctx->gb, 16);
+        pic_conf.pic_width  = get_bits(&ctx->gb, 16);
     } else {
         pic_conf.pic_height = ivi4_common_pic_sizes[pic_size_indx * 2 + 1];
         pic_conf.pic_width  = ivi4_common_pic_sizes[pic_size_indx * 2    ];
     }
 
     /* Decode tile dimensions. */
-    ctx->uses_tiling = bitstream_read_bit(&ctx->bc);
+    ctx->uses_tiling = get_bits1(&ctx->gb);
     if (ctx->uses_tiling) {
-        pic_conf.tile_height = scale_tile_size(pic_conf.pic_height, bitstream_read(&ctx->bc, 4));
-        pic_conf.tile_width  = scale_tile_size(pic_conf.pic_width,  bitstream_read(&ctx->bc, 4));
+        pic_conf.tile_height = scale_tile_size(pic_conf.pic_height, get_bits(&ctx->gb, 4));
+        pic_conf.tile_width  = scale_tile_size(pic_conf.pic_width,  get_bits(&ctx->gb, 4));
     } else {
         pic_conf.tile_height = pic_conf.pic_height;
         pic_conf.tile_width  = pic_conf.pic_width;
     }
 
     /* Decode chroma subsampling. We support only 4:4 aka YVU9. */
-    if (bitstream_read(&ctx->bc, 2)) {
+    if (get_bits(&ctx->gb, 2)) {
         av_log(avctx, AV_LOG_ERROR, "Only YVU9 picture format is supported!\n");
         return AVERROR_INVALIDDATA;
     }
@@ -175,9 +175,17 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
     pic_conf.chroma_width  = (pic_conf.pic_width  + 3) >> 2;
 
     /* decode subdivision of the planes */
-    pic_conf.luma_bands = decode_plane_subdivision(&ctx->bc);
+    pic_conf.luma_bands = decode_plane_subdivision(&ctx->gb);
+    pic_conf.chroma_bands = 0;
     if (pic_conf.luma_bands)
-        pic_conf.chroma_bands = decode_plane_subdivision(&ctx->bc);
+        pic_conf.chroma_bands = decode_plane_subdivision(&ctx->gb);
+
+    if (av_image_check_size2(pic_conf.pic_width, pic_conf.pic_height, avctx->max_pixels, AV_PIX_FMT_YUV410P, 0, avctx) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "picture dimensions %d %d cannot be decoded\n",
+               pic_conf.pic_width, pic_conf.pic_height);
+        return AVERROR_INVALIDDATA;
+    }
+
     ctx->is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
     if (ctx->is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
         av_log(avctx, AV_LOG_ERROR, "Scalability: unsupported subdivision! Luma bands: %d, chroma bands: %d\n",
@@ -187,7 +195,7 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
 
     /* check if picture layout was changed and reallocate buffers */
     if (ivi_pic_config_cmp(&pic_conf, &ctx->pic_conf)) {
-        if (ff_ivi_init_planes(ctx->planes, &pic_conf, 1)) {
+        if (ff_ivi_init_planes(avctx, ctx->planes, &pic_conf, 1)) {
             av_log(avctx, AV_LOG_ERROR, "Couldn't reallocate color planes!\n");
             ctx->pic_conf.luma_bands = 0;
             return AVERROR(ENOMEM);
@@ -211,40 +219,42 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
         }
     }
 
-    ctx->frame_num = bitstream_read_bit(&ctx->bc) ? bitstream_read(&ctx->bc, 20) : 0;
+    ctx->frame_num = get_bits1(&ctx->gb) ? get_bits(&ctx->gb, 20) : 0;
 
     /* skip decTimeEst field if present */
-    if (bitstream_read_bit(&ctx->bc))
-        bitstream_skip(&ctx->bc, 8);
+    if (get_bits1(&ctx->gb))
+        skip_bits(&ctx->gb, 8);
 
     /* decode macroblock and block huffman codebooks */
-    if (ff_ivi_dec_huff_desc(&ctx->bc, bitstream_read_bit(&ctx->bc), IVI_MB_HUFF,  &ctx->mb_vlc,  avctx) ||
-        ff_ivi_dec_huff_desc(&ctx->bc, bitstream_read_bit(&ctx->bc), IVI_BLK_HUFF, &ctx->blk_vlc, avctx))
+    if (ff_ivi_dec_huff_desc(&ctx->gb, get_bits1(&ctx->gb), IVI_MB_HUFF,  &ctx->mb_vlc,  avctx) ||
+        ff_ivi_dec_huff_desc(&ctx->gb, get_bits1(&ctx->gb), IVI_BLK_HUFF, &ctx->blk_vlc, avctx))
         return AVERROR_INVALIDDATA;
 
-    ctx->rvmap_sel = bitstream_read_bit(&ctx->bc) ? bitstream_read(&ctx->bc, 3) : 8;
+    ctx->rvmap_sel = get_bits1(&ctx->gb) ? get_bits(&ctx->gb, 3) : 8;
 
-    ctx->in_imf = bitstream_read_bit(&ctx->bc);
-    ctx->in_q   = bitstream_read_bit(&ctx->bc);
+    ctx->in_imf = get_bits1(&ctx->gb);
+    ctx->in_q   = get_bits1(&ctx->gb);
 
-    ctx->pic_glob_quant = bitstream_read(&ctx->bc, 5);
+    ctx->pic_glob_quant = get_bits(&ctx->gb, 5);
 
     /* TODO: ignore this parameter if unused */
-    ctx->unknown1 = bitstream_read_bit(&ctx->bc) ? bitstream_read(&ctx->bc, 3) : 0;
+    ctx->unknown1 = get_bits1(&ctx->gb) ? get_bits(&ctx->gb, 3) : 0;
 
-    ctx->checksum = bitstream_read_bit(&ctx->bc) ? bitstream_read(&ctx->bc, 16) : 0;
+    ctx->checksum = get_bits1(&ctx->gb) ? get_bits(&ctx->gb, 16) : 0;
 
     /* skip picture header extension if any */
-    while (bitstream_read_bit(&ctx->bc)) {
+    while (get_bits1(&ctx->gb)) {
         ff_dlog(avctx, "Pic hdr extension encountered!\n");
-        bitstream_skip(&ctx->bc, 8);
+        if (get_bits_left(&ctx->gb) < 10)
+            return AVERROR_INVALIDDATA;
+        skip_bits(&ctx->gb, 8);
     }
 
-    if (bitstream_read_bit(&ctx->bc)) {
+    if (get_bits1(&ctx->gb)) {
         av_log(avctx, AV_LOG_ERROR, "Bad blocks bits encountered!\n");
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return 0;
 }
@@ -258,28 +268,31 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
  *  @param[in]     avctx     pointer to the AVCodecContext
  *  @return        result code: 0 = OK, negative number = error
  */
-static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
+static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *arg_band,
                            AVCodecContext *avctx)
 {
     int plane, band_num, indx, transform_id, scan_indx;
     int i;
+    int quant_mat;
+    IVIBandDesc temp_band, *band = &temp_band;
+    memcpy(&temp_band, arg_band, sizeof(temp_band));
 
-    plane    = bitstream_read(&ctx->bc, 2);
-    band_num = bitstream_read(&ctx->bc, 4);
+    plane    = get_bits(&ctx->gb, 2);
+    band_num = get_bits(&ctx->gb, 4);
     if (band->plane != plane || band->band_num != band_num) {
         av_log(avctx, AV_LOG_ERROR, "Invalid band header sequence!\n");
         return AVERROR_INVALIDDATA;
     }
 
-    band->is_empty = bitstream_read_bit(&ctx->bc);
+    band->is_empty = get_bits1(&ctx->gb);
     if (!band->is_empty) {
         int old_blk_size = band->blk_size;
         /* skip header size
          * If header size is not given, header size is 4 bytes. */
-        if (bitstream_read_bit(&ctx->bc))
-            bitstream_skip(&ctx->bc, 16);
+        if (get_bits1(&ctx->gb))
+            skip_bits(&ctx->gb, 16);
 
-        band->is_halfpel = bitstream_read(&ctx->bc, 2);
+        band->is_halfpel = get_bits(&ctx->gb, 2);
         if (band->is_halfpel >= 2) {
             av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported mv resolution: %d!\n",
                    band->is_halfpel);
@@ -288,11 +301,11 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
         if (!band->is_halfpel)
             ctx->uses_fullpel = 1;
 
-        band->checksum_present = bitstream_read_bit(&ctx->bc);
+        band->checksum_present = get_bits1(&ctx->gb);
         if (band->checksum_present)
-            band->checksum = bitstream_read(&ctx->bc, 16);
+            band->checksum = get_bits(&ctx->gb, 16);
 
-        indx = bitstream_read(&ctx->bc, 2);
+        indx = get_bits(&ctx->gb, 2);
         if (indx == 3) {
             av_log(avctx, AV_LOG_ERROR, "Invalid block size!\n");
             return AVERROR_INVALIDDATA;
@@ -300,13 +313,13 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
         band->mb_size  = 16 >> indx;
         band->blk_size = 8 >> (indx >> 1);
 
-        band->inherit_mv     = bitstream_read_bit(&ctx->bc);
-        band->inherit_qdelta = bitstream_read_bit(&ctx->bc);
+        band->inherit_mv     = get_bits1(&ctx->gb);
+        band->inherit_qdelta = get_bits1(&ctx->gb);
 
-        band->glob_quant = bitstream_read(&ctx->bc, 5);
+        band->glob_quant = get_bits(&ctx->gb, 5);
 
-        if (!bitstream_read_bit(&ctx->bc) || ctx->frame_type == IVI4_FRAMETYPE_INTRA) {
-            transform_id = bitstream_read(&ctx->bc, 5);
+        if (!get_bits1(&ctx->gb) || ctx->frame_type == IVI4_FRAMETYPE_INTRA) {
+            transform_id = get_bits(&ctx->gb, 5);
             if (transform_id >= FF_ARRAY_ELEMS(transforms) ||
                 !transforms[transform_id].inv_trans) {
                 avpriv_request_sample(avctx, "Transform %d", transform_id);
@@ -318,45 +331,56 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                 return AVERROR_PATCHWELCOME;
             }
 
+            if (transform_id < 10 && band->blk_size < 8) {
+                av_log(avctx, AV_LOG_ERROR, "wrong transform size!\n");
+                return AVERROR_INVALIDDATA;
+            }
             if ((transform_id >= 0 && transform_id <= 2) || transform_id == 10)
                 ctx->uses_haar = 1;
 
             band->inv_transform = transforms[transform_id].inv_trans;
             band->dc_transform  = transforms[transform_id].dc_trans;
             band->is_2d_trans   = transforms[transform_id].is_2d_trans;
+
             if (transform_id < 10)
                 band->transform_size = 8;
             else
                 band->transform_size = 4;
 
-            if (band->blk_size != band->transform_size)
+            if (band->blk_size != band->transform_size) {
+                av_log(avctx, AV_LOG_ERROR, "transform and block size mismatch (%d != %d)\n", band->transform_size, band->blk_size);
                 return AVERROR_INVALIDDATA;
+            }
 
-            scan_indx = bitstream_read(&ctx->bc, 4);
+            scan_indx = get_bits(&ctx->gb, 4);
             if (scan_indx == 15) {
                 av_log(avctx, AV_LOG_ERROR, "Custom scan pattern encountered!\n");
                 return AVERROR_INVALIDDATA;
             }
             if (scan_indx > 4 && scan_indx < 10) {
-                if (band->blk_size != 4)
+                if (band->blk_size != 4) {
+                    av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
                     return AVERROR_INVALIDDATA;
-            } else if (band->blk_size != 8)
+                }
+            } else if (band->blk_size != 8) {
+                av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
                 return AVERROR_INVALIDDATA;
+            }
 
             band->scan = scan_index_to_tab[scan_indx];
+            band->scan_size = band->blk_size;
 
-            band->quant_mat = bitstream_read(&ctx->bc, 5);
-            if (band->quant_mat >= FF_ARRAY_ELEMS(quant_index_to_tab)) {
-
-                if (band->quant_mat == 31)
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Custom quant matrix encountered!\n");
-                else
-                    avpriv_request_sample(avctx, "Quantization matrix %d",
-                                          band->quant_mat);
-                band->quant_mat = -1;
+            quant_mat = get_bits(&ctx->gb, 5);
+            if (quant_mat == 31) {
+                av_log(avctx, AV_LOG_ERROR, "Custom quant matrix encountered!\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (quant_mat >= FF_ARRAY_ELEMS(quant_index_to_tab)) {
+                avpriv_request_sample(avctx, "Quantization matrix %d",
+                                      quant_mat);
                 return AVERROR_INVALIDDATA;
             }
+            band->quant_mat = quant_mat;
         } else {
             if (old_blk_size != band->blk_size) {
                 av_log(avctx, AV_LOG_ERROR,
@@ -364,27 +388,36 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                        "inherited\n");
                 return AVERROR_INVALIDDATA;
             }
-            if (band->quant_mat < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Invalid quant_mat inherited\n");
-                return AVERROR_INVALIDDATA;
-            }
+        }
+        if (quant_index_to_tab[band->quant_mat] > 4 && band->blk_size == 4) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid quant matrix for 4x4 block encountered!\n");
+            band->quant_mat = 0;
+            return AVERROR_INVALIDDATA;
+        }
+        if (band->scan_size != band->blk_size) {
+            av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (band->transform_size == 8 && band->blk_size < 8) {
+            av_log(avctx, AV_LOG_ERROR, "mismatching transform_size!\n");
+            return AVERROR_INVALIDDATA;
         }
 
         /* decode block huffman codebook */
-        if (!bitstream_read_bit(&ctx->bc))
-            band->blk_vlc.tab = ctx->blk_vlc.tab;
+        if (!get_bits1(&ctx->gb))
+            arg_band->blk_vlc.tab = ctx->blk_vlc.tab;
         else
-            if (ff_ivi_dec_huff_desc(&ctx->bc, 1, IVI_BLK_HUFF,
-                                     &band->blk_vlc, avctx))
+            if (ff_ivi_dec_huff_desc(&ctx->gb, 1, IVI_BLK_HUFF,
+                                     &arg_band->blk_vlc, avctx))
                 return AVERROR_INVALIDDATA;
 
         /* select appropriate rvmap table for this band */
-        band->rvmap_sel = bitstream_read_bit(&ctx->bc) ? bitstream_read(&ctx->bc, 3) : 8;
+        band->rvmap_sel = get_bits1(&ctx->gb) ? get_bits(&ctx->gb, 3) : 8;
 
         /* decode rvmap probability corrections if any */
         band->num_corr = 0; /* there is no corrections */
-        if (bitstream_read_bit(&ctx->bc)) {
-            band->num_corr = bitstream_read(&ctx->bc, 8); /* get number of correction pairs */
+        if (get_bits1(&ctx->gb)) {
+            band->num_corr = get_bits(&ctx->gb, 8); /* get number of correction pairs */
             if (band->num_corr > 61) {
                 av_log(avctx, AV_LOG_ERROR, "Too many corrections: %d\n",
                        band->num_corr);
@@ -393,7 +426,7 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
 
             /* read correction pairs */
             for (i = 0; i < band->num_corr * 2; i++)
-                band->corr[i] = bitstream_read(&ctx->bc, 8);
+                band->corr[i] = get_bits(&ctx->gb, 8);
         }
     }
 
@@ -409,7 +442,15 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
     band->intra_scale = NULL;
     band->inter_scale = NULL;
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
+
+    if (!band->scan) {
+        av_log(avctx, AV_LOG_ERROR, "band->scan not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    band->blk_vlc = arg_band->blk_vlc;
+    memcpy(arg_band, band, sizeof(*arg_band));
 
     return 0;
 }
@@ -429,7 +470,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                           IVITile *tile, AVCodecContext *avctx)
 {
     int         x, y, mv_x, mv_y, mv_delta, offs, mb_offset, blks_per_mb,
-                mv_scale, mb_type_bits;
+                mv_scale, mb_type_bits, s;
     IVIMbInfo   *mb, *ref_mb;
     int         row_offset = band->mb_size * band->pitch;
 
@@ -444,6 +485,11 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
     mv_scale = (ctx->planes[0].bands[0].mb_size >> 3) - (band->mb_size >> 3);
     mv_x = mv_y = 0;
 
+    if (((tile->width + band->mb_size-1)/band->mb_size) * ((tile->height + band->mb_size-1)/band->mb_size) != tile->num_MBs) {
+        av_log(avctx, AV_LOG_ERROR, "num_MBs mismatch %d %d %d %d\n", tile->width, tile->height, band->mb_size, tile->num_MBs);
+        return -1;
+    }
+
     for (y = tile->ypos; y < tile->ypos + tile->height; y += band->mb_size) {
         mb_offset = offs;
 
@@ -454,7 +500,12 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
             mb->b_mv_x   =
             mb->b_mv_y   = 0;
 
-            if (bitstream_read_bit(&ctx->bc)) {
+            if (get_bits_left(&ctx->gb) < 1) {
+                av_log(avctx, AV_LOG_ERROR, "Insufficient input for mb info\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (get_bits1(&ctx->gb)) {
                 if (ctx->frame_type == IVI4_FRAMETYPE_INTRA) {
                     av_log(avctx, AV_LOG_ERROR, "Empty macroblock in an INTRA picture!\n");
                     return AVERROR_INVALIDDATA;
@@ -464,9 +515,8 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
 
                 mb->q_delta = 0;
                 if (!band->plane && !band->band_num && ctx->in_q) {
-                    mb->q_delta = bitstream_read_vlc(&ctx->bc,
-                                                     ctx->mb_vlc.tab->table,
-                                                     IVI_VLC_BITS, 1);
+                    mb->q_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                           IVI_VLC_BITS, 1);
                     mb->q_delta = IVI_TOSIGNED(mb->q_delta);
                 }
 
@@ -484,26 +534,27 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
             } else {
                 if (band->inherit_mv) {
                     /* copy mb_type from corresponding reference mb */
-                    if (!ref_mb)
+                    if (!ref_mb) {
+                        av_log(avctx, AV_LOG_ERROR, "ref_mb unavailable\n");
                         return AVERROR_INVALIDDATA;
+                    }
                     mb->type = ref_mb->type;
                 } else if (ctx->frame_type == IVI4_FRAMETYPE_INTRA ||
                            ctx->frame_type == IVI4_FRAMETYPE_INTRA1) {
                     mb->type = 0; /* mb_type is always INTRA for intra-frames */
                 } else {
-                    mb->type = bitstream_read(&ctx->bc, mb_type_bits);
+                    mb->type = get_bits(&ctx->gb, mb_type_bits);
                 }
 
-                mb->cbp = bitstream_read(&ctx->bc, blks_per_mb);
+                mb->cbp = get_bits(&ctx->gb, blks_per_mb);
 
                 mb->q_delta = 0;
                 if (band->inherit_qdelta) {
                     if (ref_mb) mb->q_delta = ref_mb->q_delta;
                 } else if (mb->cbp || (!band->plane && !band->band_num &&
                            ctx->in_q)) {
-                    mb->q_delta = bitstream_read_vlc(&ctx->bc,
-                                                     ctx->mb_vlc.tab->table,
-                                                     IVI_VLC_BITS, 1);
+                    mb->q_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                           IVI_VLC_BITS, 1);
                     mb->q_delta = IVI_TOSIGNED(mb->q_delta);
                 }
 
@@ -522,24 +573,22 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                             }
                     } else {
                         /* decode motion vector deltas */
-                        mv_delta = bitstream_read_vlc(&ctx->bc,
-                                                      ctx->mb_vlc.tab->table,
-                                                      IVI_VLC_BITS, 1);
+                        mv_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                            IVI_VLC_BITS, 1);
                         mv_y += IVI_TOSIGNED(mv_delta);
-                        mv_delta = bitstream_read_vlc(&ctx->bc,
-                                                      ctx->mb_vlc.tab->table,
-                                                      IVI_VLC_BITS, 1);
+                        mv_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                            IVI_VLC_BITS, 1);
                         mv_x += IVI_TOSIGNED(mv_delta);
                         mb->mv_x = mv_x;
                         mb->mv_y = mv_y;
                         if (mb->type == 3) {
-                            mv_delta = bitstream_read_vlc(&ctx->bc,
-                                                          ctx->mb_vlc.tab->table,
-                                                          IVI_VLC_BITS, 1);
+                            mv_delta = get_vlc2(&ctx->gb,
+                                                ctx->mb_vlc.tab->table,
+                                                IVI_VLC_BITS, 1);
                             mv_y += IVI_TOSIGNED(mv_delta);
-                            mv_delta = bitstream_read_vlc(&ctx->bc,
-                                                          ctx->mb_vlc.tab->table,
-                                                          IVI_VLC_BITS, 1);
+                            mv_delta = get_vlc2(&ctx->gb,
+                                                ctx->mb_vlc.tab->table,
+                                                IVI_VLC_BITS, 1);
                             mv_x += IVI_TOSIGNED(mv_delta);
                             mb->b_mv_x = -mv_x;
                             mb->b_mv_y = -mv_y;
@@ -554,6 +603,15 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
             }
 
+            s= band->is_halfpel;
+            if (mb->type)
+            if ( x +  (mb->mv_x   >>s) +                 (y+               (mb->mv_y   >>s))*band->pitch < 0 ||
+                 x + ((mb->mv_x+s)>>s) + band->mb_size - 1
+                   + (y+band->mb_size - 1 +((mb->mv_y+s)>>s))*band->pitch > band->bufsize -1) {
+                av_log(avctx, AV_LOG_ERROR, "motion vector %d %d outside reference\n", x*s + mb->mv_x, y*s + mb->mv_y);
+                return AVERROR_INVALIDDATA;
+            }
+
             mb++;
             if (ref_mb)
                 ref_mb++;
@@ -563,7 +621,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
         offs += row_offset;
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return 0;
 }
diff --git a/libavcodec/indeo4data.h b/libavcodec/indeo4data.h
index 35ff404..cc497c2 100644
--- a/libavcodec/indeo4data.h
+++ b/libavcodec/indeo4data.h
@@ -2,20 +2,20 @@
  * Indeo Video Interactive 4 compatible decoder
  * Copyright (c) 2009-2010 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo5.c b/libavcodec/indeo5.c
index 38907eb..7b9da53 100644
--- a/libavcodec/indeo5.c
+++ b/libavcodec/indeo5.c
@@ -2,20 +2,20 @@
  * Indeo Video Interactive v5 compatible decoder
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,10 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "ivi.h"
 #include "ivi_dsp.h"
 #include "indeo5data.h"
-#include "vlc.h"
 
 /**
  *  Indeo5 frame types.
@@ -59,19 +58,19 @@ enum {
  */
 static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
 {
-    int             result, i, p, tile_size, pic_size_indx, mb_size, blk_size;
+    int             result, i, p, tile_size, pic_size_indx, mb_size, blk_size, is_scalable;
     int             quant_mat, blk_size_changed = 0;
     IVIBandDesc     *band, *band1, *band2;
     IVIPicConfig    pic_conf;
 
-    ctx->gop_flags = bitstream_read(&ctx->bc, 8);
+    ctx->gop_flags = get_bits(&ctx->gb, 8);
 
-    ctx->gop_hdr_size = (ctx->gop_flags & 1) ? bitstream_read(&ctx->bc, 16) : 0;
+    ctx->gop_hdr_size = (ctx->gop_flags & 1) ? get_bits(&ctx->gb, 16) : 0;
 
     if (ctx->gop_flags & IVI5_IS_PROTECTED)
-        ctx->lock_word = bitstream_read(&ctx->bc, 32);
+        ctx->lock_word = get_bits_long(&ctx->gb, 32);
 
-    tile_size = (ctx->gop_flags & 0x40) ? 64 << bitstream_read(&ctx->bc, 2) : 0;
+    tile_size = (ctx->gop_flags & 0x40) ? 64 << get_bits(&ctx->gb, 2) : 0;
     if (tile_size > 256) {
         av_log(avctx, AV_LOG_ERROR, "Invalid tile size: %d\n", tile_size);
         return AVERROR_INVALIDDATA;
@@ -79,19 +78,19 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
 
     /* decode number of wavelet bands */
     /* num_levels * 3 + 1 */
-    pic_conf.luma_bands   = bitstream_read(&ctx->bc, 2)  * 3 + 1;
-    pic_conf.chroma_bands = bitstream_read_bit(&ctx->bc) * 3 + 1;
-    ctx->is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
-    if (ctx->is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
+    pic_conf.luma_bands   = get_bits(&ctx->gb, 2) * 3 + 1;
+    pic_conf.chroma_bands = get_bits1(&ctx->gb)   * 3 + 1;
+    is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
+    if (is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
         av_log(avctx, AV_LOG_ERROR, "Scalability: unsupported subdivision! Luma bands: %d, chroma bands: %d\n",
                pic_conf.luma_bands, pic_conf.chroma_bands);
         return AVERROR_INVALIDDATA;
     }
 
-    pic_size_indx = bitstream_read(&ctx->bc, 4);
+    pic_size_indx = get_bits(&ctx->gb, 4);
     if (pic_size_indx == IVI5_PIC_SIZE_ESC) {
-        pic_conf.pic_height = bitstream_read(&ctx->bc, 13);
-        pic_conf.pic_width  = bitstream_read(&ctx->bc, 13);
+        pic_conf.pic_height = get_bits(&ctx->gb, 13);
+        pic_conf.pic_width  = get_bits(&ctx->gb, 13);
     } else {
         pic_conf.pic_height = ivi5_common_pic_sizes[pic_size_indx * 2 + 1] << 2;
         pic_conf.pic_width  = ivi5_common_pic_sizes[pic_size_indx * 2    ] << 2;
@@ -114,12 +113,13 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
 
     /* check if picture layout was changed and reallocate buffers */
     if (ivi_pic_config_cmp(&pic_conf, &ctx->pic_conf) || ctx->gop_invalid) {
-        result = ff_ivi_init_planes(ctx->planes, &pic_conf, 0);
+        result = ff_ivi_init_planes(avctx, ctx->planes, &pic_conf, 0);
         if (result < 0) {
             av_log(avctx, AV_LOG_ERROR, "Couldn't reallocate color planes!\n");
             return result;
         }
         ctx->pic_conf = pic_conf;
+        ctx->is_scalable = is_scalable;
         blk_size_changed = 1; /* force reallocation of the internal structures */
     }
 
@@ -127,19 +127,24 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
         for (i = 0; i < (!p ? pic_conf.luma_bands : pic_conf.chroma_bands); i++) {
             band = &ctx->planes[p].bands[i];
 
-            band->is_halfpel = bitstream_read_bit(&ctx->bc);
+            band->is_halfpel = get_bits1(&ctx->gb);
 
-            mb_size  = bitstream_read_bit(&ctx->bc);
-            blk_size = 8 >> bitstream_read_bit(&ctx->bc);
+            mb_size  = get_bits1(&ctx->gb);
+            blk_size = 8 >> get_bits1(&ctx->gb);
             mb_size  = blk_size << !mb_size;
 
+            if (p==0 && blk_size==4) {
+                av_log(avctx, AV_LOG_ERROR, "4x4 luma blocks are unsupported!\n");
+                return AVERROR_PATCHWELCOME;
+            }
+
             blk_size_changed = mb_size != band->mb_size || blk_size != band->blk_size;
             if (blk_size_changed) {
                 band->mb_size  = mb_size;
                 band->blk_size = blk_size;
             }
 
-            if (bitstream_read_bit(&ctx->bc)) {
+            if (get_bits1(&ctx->gb)) {
                 avpriv_report_missing_feature(avctx, "Extended transform info");
                 return AVERROR_PATCHWELCOME;
             }
@@ -185,8 +190,10 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             band->is_2d_trans = band->inv_transform == ff_ivi_inverse_slant_8x8 ||
                                 band->inv_transform == ff_ivi_inverse_slant_4x4;
 
-            if (band->transform_size != band->blk_size)
+            if (band->transform_size != band->blk_size) {
+                av_log(avctx, AV_LOG_ERROR, "transform and block size mismatch (%d != %d)\n", band->transform_size, band->blk_size);
                 return AVERROR_INVALIDDATA;
+            }
 
             /* select dequant matrix according to plane and band number */
             if (!p) {
@@ -196,6 +203,10 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             }
 
             if (band->blk_size == 8) {
+                if(quant_mat >= 5){
+                    av_log(avctx, AV_LOG_ERROR, "quant_mat %d too large!\n", quant_mat);
+                    return -1;
+                }
                 band->intra_base  = &ivi5_base_quant_8x8_intra[quant_mat][0];
                 band->inter_base  = &ivi5_base_quant_8x8_inter[quant_mat][0];
                 band->intra_scale = &ivi5_scale_quant_8x8_intra[quant_mat][0];
@@ -207,7 +218,7 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
                 band->inter_scale = ivi5_scale_quant_4x4_inter;
             }
 
-            if (bitstream_read(&ctx->bc, 2)) {
+            if (get_bits(&ctx->gb, 2)) {
                 av_log(avctx, AV_LOG_ERROR, "End marker missing!\n");
                 return AVERROR_INVALIDDATA;
             }
@@ -232,6 +243,7 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
         band2->inv_transform = band1->inv_transform;
         band2->dc_transform  = band1->dc_transform;
         band2->is_2d_trans   = band1->is_2d_trans;
+        band2->transform_size= band1->transform_size;
     }
 
     /* reallocate internal structures if needed */
@@ -246,27 +258,27 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
     }
 
     if (ctx->gop_flags & 8) {
-        if (bitstream_read(&ctx->bc, 3)) {
+        if (get_bits(&ctx->gb, 3)) {
             av_log(avctx, AV_LOG_ERROR, "Alignment bits are not zero!\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if (bitstream_read_bit(&ctx->bc))
-            bitstream_skip(&ctx->bc, 24); /* skip transparency fill color */
+        if (get_bits1(&ctx->gb))
+            skip_bits_long(&ctx->gb, 24); /* skip transparency fill color */
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
-    bitstream_skip(&ctx->bc, 23); /* FIXME: unknown meaning */
+    skip_bits(&ctx->gb, 23); /* FIXME: unknown meaning */
 
     /* skip GOP extension if any */
-    if (bitstream_read_bit(&ctx->bc)) {
+    if (get_bits1(&ctx->gb)) {
         do {
-            i = bitstream_read(&ctx->bc, 16);
+            i = get_bits(&ctx->gb, 16);
         } while (i & 0x8000);
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return 0;
 }
@@ -275,17 +287,20 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
 /**
  *  Skip a header extension.
  *
- *  @param[in,out]  bc  the Bitstream context
+ *  @param[in,out]  gb  the GetBit context
  */
-static inline void skip_hdr_extension(BitstreamContext *bc)
+static inline int skip_hdr_extension(GetBitContext *gb)
 {
     int i, len;
 
     do {
-        len = bitstream_read(bc, 8);
-        for (i = 0; i < len; i++)
-            bitstream_skip(bc, 8);
+        len = get_bits(gb, 8);
+        if (8*len > get_bits_left(gb))
+            return AVERROR_INVALIDDATA;
+        for (i = 0; i < len; i++) skip_bits(gb, 8);
     } while(len);
+
+    return 0;
 }
 
 
@@ -300,19 +315,20 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
 {
     int ret;
 
-    if (bitstream_read(&ctx->bc, 5) != 0x1F) {
+    if (get_bits(&ctx->gb, 5) != 0x1F) {
         av_log(avctx, AV_LOG_ERROR, "Invalid picture start code!\n");
         return AVERROR_INVALIDDATA;
     }
 
     ctx->prev_frame_type = ctx->frame_type;
-    ctx->frame_type      = bitstream_read(&ctx->bc, 3);
+    ctx->frame_type      = get_bits(&ctx->gb, 3);
     if (ctx->frame_type >= 5) {
         av_log(avctx, AV_LOG_ERROR, "Invalid frame type: %d \n", ctx->frame_type);
+        ctx->frame_type = FRAMETYPE_INTRA;
         return AVERROR_INVALIDDATA;
     }
 
-    ctx->frame_num = bitstream_read(&ctx->bc, 8);
+    ctx->frame_num = get_bits(&ctx->gb, 8);
 
     if (ctx->frame_type == FRAMETYPE_INTRA) {
         if ((ret = decode_gop_header(ctx, avctx)) < 0) {
@@ -323,27 +339,33 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
         ctx->gop_invalid = 0;
     }
 
+    if (ctx->frame_type == FRAMETYPE_INTER_SCAL && !ctx->is_scalable) {
+        av_log(avctx, AV_LOG_ERROR, "Scalable inter frame in non scalable stream\n");
+        ctx->frame_type = FRAMETYPE_INTER;
+        return AVERROR_INVALIDDATA;
+    }
+
     if (ctx->frame_type != FRAMETYPE_NULL) {
-        ctx->frame_flags = bitstream_read(&ctx->bc, 8);
+        ctx->frame_flags = get_bits(&ctx->gb, 8);
 
-        ctx->pic_hdr_size = (ctx->frame_flags & 1) ? bitstream_read(&ctx->bc, 24) : 0;
+        ctx->pic_hdr_size = (ctx->frame_flags & 1) ? get_bits_long(&ctx->gb, 24) : 0;
 
-        ctx->checksum = (ctx->frame_flags & 0x10) ? bitstream_read(&ctx->bc, 16) : 0;
+        ctx->checksum = (ctx->frame_flags & 0x10) ? get_bits(&ctx->gb, 16) : 0;
 
         /* skip unknown extension if any */
         if (ctx->frame_flags & 0x20)
-            skip_hdr_extension(&ctx->bc); /* XXX: untested */
+            skip_hdr_extension(&ctx->gb); /* XXX: untested */
 
         /* decode macroblock huffman codebook */
-        ret = ff_ivi_dec_huff_desc(&ctx->bc, ctx->frame_flags & 0x40,
+        ret = ff_ivi_dec_huff_desc(&ctx->gb, ctx->frame_flags & 0x40,
                                    IVI_MB_HUFF, &ctx->mb_vlc, avctx);
         if (ret < 0)
             return ret;
 
-        bitstream_skip(&ctx->bc, 3); /* FIXME: unknown meaning! */
+        skip_bits(&ctx->gb, 3); /* FIXME: unknown meaning! */
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return 0;
 }
@@ -363,14 +385,14 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
     int         i, ret;
     uint8_t     band_flags;
 
-    band_flags = bitstream_read(&ctx->bc, 8);
+    band_flags = get_bits(&ctx->gb, 8);
 
     if (band_flags & 1) {
         band->is_empty = 1;
         return 0;
     }
 
-    band->data_size = (ctx->frame_flags & 0x80) ? bitstream_read(&ctx->bc, 24) : 0;
+    band->data_size = (ctx->frame_flags & 0x80) ? get_bits_long(&ctx->gb, 24) : 0;
 
     band->inherit_mv     = band_flags & 2;
     band->inherit_qdelta = band_flags & 8;
@@ -380,7 +402,7 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
     /* decode rvmap probability corrections if any */
     band->num_corr = 0; /* there are no corrections */
     if (band_flags & 0x10) {
-        band->num_corr = bitstream_read(&ctx->bc, 8); /* get number of correction pairs */
+        band->num_corr = get_bits(&ctx->gb, 8); /* get number of correction pairs */
         if (band->num_corr > 61) {
             av_log(avctx, AV_LOG_ERROR, "Too many corrections: %d\n",
                    band->num_corr);
@@ -389,31 +411,31 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
 
         /* read correction pairs */
         for (i = 0; i < band->num_corr * 2; i++)
-            band->corr[i] = bitstream_read(&ctx->bc, 8);
+            band->corr[i] = get_bits(&ctx->gb, 8);
     }
 
     /* select appropriate rvmap table for this band */
-    band->rvmap_sel = (band_flags & 0x40) ? bitstream_read(&ctx->bc, 3) : 8;
+    band->rvmap_sel = (band_flags & 0x40) ? get_bits(&ctx->gb, 3) : 8;
 
     /* decode block huffman codebook */
-    ret = ff_ivi_dec_huff_desc(&ctx->bc, band_flags & 0x80, IVI_BLK_HUFF,
+    ret = ff_ivi_dec_huff_desc(&ctx->gb, band_flags & 0x80, IVI_BLK_HUFF,
                                &band->blk_vlc, avctx);
     if (ret < 0)
         return ret;
 
-    band->checksum_present = bitstream_read_bit(&ctx->bc);
+    band->checksum_present = get_bits1(&ctx->gb);
     if (band->checksum_present)
-        band->checksum = bitstream_read(&ctx->bc, 16);
+        band->checksum = get_bits(&ctx->gb, 16);
 
-    band->glob_quant = bitstream_read(&ctx->bc, 5);
+    band->glob_quant = get_bits(&ctx->gb, 5);
 
     /* skip unknown extension if any */
     if (band_flags & 0x20) { /* XXX: untested */
-        bitstream_align(&ctx->bc);
-        skip_hdr_extension(&ctx->bc);
+        align_get_bits(&ctx->gb);
+        skip_hdr_extension(&ctx->gb);
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return 0;
 }
@@ -433,7 +455,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                           IVITile *tile, AVCodecContext *avctx)
 {
     int         x, y, mv_x, mv_y, mv_delta, offs, mb_offset,
-                mv_scale, blks_per_mb;
+                mv_scale, blks_per_mb, s;
     IVIMbInfo   *mb, *ref_mb;
     int         row_offset = band->mb_size * band->pitch;
 
@@ -463,7 +485,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
             mb->ypos     = y;
             mb->buf_offs = mb_offset;
 
-            if (bitstream_read_bit(&ctx->bc)) {
+            if (get_bits1(&ctx->gb)) {
                 if (ctx->frame_type == FRAMETYPE_INTRA) {
                     av_log(avctx, AV_LOG_ERROR, "Empty macroblock in an INTRA picture!\n");
                     return AVERROR_INVALIDDATA;
@@ -473,14 +495,13 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
 
                 mb->q_delta = 0;
                 if (!band->plane && !band->band_num && (ctx->frame_flags & 8)) {
-                    mb->q_delta = bitstream_read_vlc(&ctx->bc,
-                                                     ctx->mb_vlc.tab->table,
-                                                     IVI_VLC_BITS, 1);
+                    mb->q_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                           IVI_VLC_BITS, 1);
                     mb->q_delta = IVI_TOSIGNED(mb->q_delta);
                 }
 
                 mb->mv_x = mb->mv_y = 0; /* no motion vector coded */
-                if (band->inherit_mv){
+                if (band->inherit_mv && ref_mb){
                     /* motion vector inheritance */
                     if (mv_scale) {
                         mb->mv_x = ivi_scale_mv(ref_mb->mv_x, mv_scale);
@@ -491,16 +512,16 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                     }
                 }
             } else {
-                if (band->inherit_mv) {
+                if (band->inherit_mv && ref_mb) {
                     mb->type = ref_mb->type; /* copy mb_type from corresponding reference mb */
                 } else if (ctx->frame_type == FRAMETYPE_INTRA) {
                     mb->type = 0; /* mb_type is always INTRA for intra-frames */
                 } else {
-                    mb->type = bitstream_read_bit(&ctx->bc);
+                    mb->type = get_bits1(&ctx->gb);
                 }
 
                 blks_per_mb = band->mb_size != band->blk_size ? 4 : 1;
-                mb->cbp = bitstream_read(&ctx->bc, blks_per_mb);
+                mb->cbp = get_bits(&ctx->gb, blks_per_mb);
 
                 mb->q_delta = 0;
                 if (band->qdelta_present) {
@@ -508,9 +529,8 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                         if (ref_mb) mb->q_delta = ref_mb->q_delta;
                     } else if (mb->cbp || (!band->plane && !band->band_num &&
                                            (ctx->frame_flags & 8))) {
-                        mb->q_delta = bitstream_read_vlc(&ctx->bc,
-                                                         ctx->mb_vlc.tab->table,
-                                                         IVI_VLC_BITS, 1);
+                        mb->q_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                               IVI_VLC_BITS, 1);
                         mb->q_delta = IVI_TOSIGNED(mb->q_delta);
                     }
                 }
@@ -518,7 +538,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 if (!mb->type) {
                     mb->mv_x = mb->mv_y = 0; /* there is no motion vector in intra-macroblocks */
                 } else {
-                    if (band->inherit_mv){
+                    if (band->inherit_mv && ref_mb){
                         /* motion vector inheritance */
                         if (mv_scale) {
                             mb->mv_x = ivi_scale_mv(ref_mb->mv_x, mv_scale);
@@ -529,13 +549,11 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                         }
                     } else {
                         /* decode motion vector deltas */
-                        mv_delta = bitstream_read_vlc(&ctx->bc,
-                                                      ctx->mb_vlc.tab->table,
-                                                      IVI_VLC_BITS, 1);
+                        mv_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                            IVI_VLC_BITS, 1);
                         mv_y += IVI_TOSIGNED(mv_delta);
-                        mv_delta = bitstream_read_vlc(&ctx->bc,
-                                                      ctx->mb_vlc.tab->table,
-                                                      IVI_VLC_BITS, 1);
+                        mv_delta = get_vlc2(&ctx->gb, ctx->mb_vlc.tab->table,
+                                            IVI_VLC_BITS, 1);
                         mv_x += IVI_TOSIGNED(mv_delta);
                         mb->mv_x = mv_x;
                         mb->mv_y = mv_y;
@@ -543,6 +561,15 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
             }
 
+            s= band->is_halfpel;
+            if (mb->type)
+            if ( x +  (mb->mv_x   >>s) +                 (y+               (mb->mv_y   >>s))*band->pitch < 0 ||
+                 x + ((mb->mv_x+s)>>s) + band->mb_size - 1
+                   + (y+band->mb_size - 1 +((mb->mv_y+s)>>s))*band->pitch > band->bufsize - 1) {
+                av_log(avctx, AV_LOG_ERROR, "motion vector %d %d outside reference\n", x*s + mb->mv_x, y*s + mb->mv_y);
+                return AVERROR_INVALIDDATA;
+            }
+
             mb++;
             if (ref_mb)
                 ref_mb++;
@@ -552,7 +579,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
         offs += row_offset;
     }
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return 0;
 }
@@ -615,6 +642,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
     IVI45DecContext  *ctx = avctx->priv_data;
     int             result;
 
+    ctx->gop_invalid = 1;
+
     ff_ivi_init_static_vlc();
 
     /* copy rvmap tables in our context so we can apply changes to them */
@@ -631,7 +660,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ctx->pic_conf.tile_height   = avctx->height;
     ctx->pic_conf.luma_bands    = ctx->pic_conf.chroma_bands = 1;
 
-    result = ff_ivi_init_planes(ctx->planes, &ctx->pic_conf, 0);
+    result = ff_ivi_init_planes(avctx, ctx->planes, &ctx->pic_conf, 0);
     if (result) {
         av_log(avctx, AV_LOG_ERROR, "Couldn't allocate color planes!\n");
         return AVERROR_INVALIDDATA;
@@ -653,7 +682,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-
 AVCodec ff_indeo5_decoder = {
     .name           = "indeo5",
     .long_name      = NULL_IF_CONFIG_SMALL("Intel Indeo Video Interactive 5"),
diff --git a/libavcodec/indeo5data.h b/libavcodec/indeo5data.h
index f4252b5..a6217d0 100644
--- a/libavcodec/indeo5data.h
+++ b/libavcodec/indeo5data.h
@@ -2,20 +2,20 @@
  * Indeo Video Interactive 5 compatible decoder
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intelh263dec.c b/libavcodec/intelh263dec.c
index cd1971f..d321dd4 100644
--- a/libavcodec/intelh263dec.c
+++ b/libavcodec/intelh263dec.c
@@ -1,20 +1,20 @@
 /*
  * H.263i decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,8 +39,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     }
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
 
-    if (get_bits1(&s->gb) != 1) {
-        av_log(s->avctx, AV_LOG_ERROR, "Bad marker\n");
+    if (check_marker(s->avctx, &s->gb, "after picture_number") != 1) {
         return -1;      /* marker */
     }
     if (get_bits1(&s->gb) != 0) {
@@ -60,14 +59,14 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
 
     s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb);
 
-    s->unrestricted_mv = get_bits1(&s->gb);
-    s->h263_long_vectors = s->unrestricted_mv;
+    s->h263_long_vectors = get_bits1(&s->gb);
 
     if (get_bits1(&s->gb) != 0) {
         av_log(s->avctx, AV_LOG_ERROR, "SAC not supported\n");
         return -1;      /* SAC: off */
     }
     s->obmc= get_bits1(&s->gb);
+    s->unrestricted_mv = s->obmc || s->h263_long_vectors;
     s->pb_frame = get_bits1(&s->gb);
 
     if (format < 6) {
@@ -83,7 +82,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
         }
         if(get_bits(&s->gb, 2))
             av_log(s->avctx, AV_LOG_ERROR, "Bad value for reserved field\n");
-        s->loop_filter = get_bits1(&s->gb);
+        s->loop_filter = get_bits1(&s->gb) * !s->avctx->lowres;
         if(get_bits1(&s->gb))
             av_log(s->avctx, AV_LOG_ERROR, "Bad value for reserved field\n");
         if(get_bits1(&s->gb))
@@ -96,7 +95,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     if(format == 6){
         int ar = get_bits(&s->gb, 4);
         skip_bits(&s->gb, 9); // display width
-        skip_bits1(&s->gb);
+        check_marker(s->avctx, &s->gb, "in dimensions");
         skip_bits(&s->gb, 9); // display height
         if(ar == 15){
             s->avctx->sample_aspect_ratio.num = get_bits(&s->gb, 8); // aspect ratio - width
@@ -117,9 +116,8 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     }
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0) {
-        skip_bits(&s->gb, 8);
-    }
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
     s->f_code = 1;
 
     s->y_dc_scale_table=
@@ -140,6 +138,7 @@ AVCodec ff_h263i_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index a619e97..f2e6f00 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,36 +48,54 @@
 #define FF_CODEC_CAP_INIT_CLEANUP           (1 << 1)
 /**
  * Decoders marked with FF_CODEC_CAP_SETS_PKT_DTS want to set
- * AVFrame.pkt_dts manually. If the flag is set, utils.c won't overwrite
- * this field. If it's unset, utils.c tries to guess the pkt_dts field
+ * AVFrame.pkt_dts manually. If the flag is set, decode.c won't overwrite
+ * this field. If it's unset, decode.c tries to guess the pkt_dts field
  * from the input AVPacket.
  */
 #define FF_CODEC_CAP_SETS_PKT_DTS           (1 << 2)
 /**
+ * The decoder extracts and fills its parameters even if the frame is
+ * skipped due to the skip_frame setting.
+ */
+#define FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM  (1 << 3)
+/**
  * The decoder sets the cropping fields in the output frames manually.
  * If this cap is set, the generic code will initialize output frame
  * dimensions to coded rather than display values.
  */
-#define FF_CODEC_CAP_EXPORTS_CROPPING       (1 << 3)
-
-#ifdef DEBUG
-#   define ff_dlog(ctx, ...) av_log(ctx, AV_LOG_DEBUG, __VA_ARGS__)
-#else
-#   define ff_dlog(ctx, ...) do { } while (0)
-#endif
+#define FF_CODEC_CAP_EXPORTS_CROPPING       (1 << 4)
+/**
+ * Codec initializes slice-based threading with a main function
+ */
+#define FF_CODEC_CAP_SLICE_THREAD_HAS_MF    (1 << 5)
 
 #ifdef TRACE
 #   define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__)
 #else
-#   define ff_tlog(ctx, ...) do { } while (0)
+#   define ff_tlog(ctx, ...) do { } while(0)
 #endif
 
 
 #define FF_DEFAULT_QUANT_BIAS 999999
 
-#define FF_SANE_NB_CHANNELS 63U
+#define FF_QSCALE_TYPE_MPEG1 0
+#define FF_QSCALE_TYPE_MPEG2 1
+#define FF_QSCALE_TYPE_H264  2
+#define FF_QSCALE_TYPE_VP56  3
+
+#define FF_SANE_NB_CHANNELS 256U
+
+#define FF_SIGNBIT(x) ((x) >> CHAR_BIT * sizeof(x) - 1)
 
-#define FF_SIGNBIT(x) (x >> CHAR_BIT * sizeof(x) - 1)
+#if HAVE_SIMD_ALIGN_64
+#   define STRIDE_ALIGN 64 /* AVX-512 */
+#elif HAVE_SIMD_ALIGN_32
+#   define STRIDE_ALIGN 32
+#elif HAVE_SIMD_ALIGN_16
+#   define STRIDE_ALIGN 16
+#else
+#   define STRIDE_ALIGN 8
+#endif
 
 typedef struct FramePool {
     /**
@@ -154,6 +172,19 @@ typedef struct AVCodecInternal {
     AVPacket *last_pkt_props;
 
     /**
+     * temporary buffer used for encoders to store their bitstream
+     */
+    uint8_t *byte_buffer;
+    unsigned int byte_buffer_size;
+
+    void *frame_thread_encoder;
+
+    /**
+     * Number of audio samples to skip at the start of the next decoded frame
+     */
+    int skip_samples;
+
+    /**
      * hwaccel-specific private data
      */
     void *hwaccel_priv_data;
@@ -180,6 +211,13 @@ typedef struct AVCodecInternal {
      * of the packet (that should be submitted in the next decode call */
     size_t compat_decode_partial_size;
     AVFrame *compat_decode_frame;
+
+    int showed_multi_packet_warning;
+
+    int skip_samples_multiplier;
+
+    /* to prevent infinite loop on errors when draining */
+    int nb_draining_errors;
 } AVCodecInternal;
 
 struct AVCodecDefault {
@@ -187,6 +225,8 @@ struct AVCodecDefault {
     const uint8_t *value;
 };
 
+extern const uint8_t ff_log2_run[41];
+
 /**
  * Return the index into tab at which {a,b} match elements {[0],[1]} of tab.
  * If there is no such matching pair then size is returned.
@@ -195,8 +235,7 @@ int ff_match_2uint16(const uint16_t (*tab)[2], int size, int a, int b);
 
 unsigned int avpriv_toupper4(unsigned int x);
 
-int avpriv_lock_avformat(void);
-int avpriv_unlock_avformat(void);
+void ff_color_frame(AVFrame *frame, const int color[4]);
 
 /**
  * Maximum size in bytes of extradata.
@@ -212,6 +251,7 @@ int avpriv_unlock_avformat(void);
  * ensure the output packet data is large enough, whether provided by the user
  * or allocated in this function.
  *
+ * @param avctx   the AVCodecContext of the encoder
  * @param avpkt   the AVPacket
  *                If avpkt->data is already set, avpkt->size is checked
  *                to ensure it is large enough.
@@ -219,9 +259,20 @@ int avpriv_unlock_avformat(void);
  *                avpkt->size is set to the specified size.
  *                All other AVPacket fields will be reset with av_init_packet().
  * @param size    the minimum required packet size
- * @return        0 on success, negative error code on failure
+ * @param min_size This is a hint to the allocation algorithm, which indicates
+ *                to what minimal size the caller might later shrink the packet
+ *                to. Encoders often allocate packets which are larger than the
+ *                amount of data that is written into them as the exact amount is
+ *                not known at the time of allocation. min_size represents the
+ *                size a packet might be shrunk to by the caller. Can be set to
+ *                0. setting this roughly correctly allows the allocation code
+ *                to choose between several allocation strategies to improve
+ *                speed slightly.
+ * @return        non negative on success, negative error code on failure
  */
-int ff_alloc_packet(AVPacket *avpkt, int size);
+int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size);
+
+attribute_deprecated int ff_alloc_packet(AVPacket *avpkt, int size);
 
 /**
  * Rescale from sample rate to AVCodecContext.time_base.
@@ -229,11 +280,32 @@ int ff_alloc_packet(AVPacket *avpkt, int size);
 static av_always_inline int64_t ff_samples_to_time_base(AVCodecContext *avctx,
                                                         int64_t samples)
 {
+    if(samples == AV_NOPTS_VALUE)
+        return AV_NOPTS_VALUE;
     return av_rescale_q(samples, (AVRational){ 1, avctx->sample_rate },
                         avctx->time_base);
 }
 
 /**
+ * 2^(x) for integer x
+ * @return correctly rounded float
+ */
+static av_always_inline float ff_exp2fi(int x) {
+    /* Normal range */
+    if (-126 <= x && x <= 128)
+        return av_int2float((x+127) << 23);
+    /* Too large */
+    else if (x > 128)
+        return INFINITY;
+    /* Subnormal numbers */
+    else if (x > -150)
+        return av_int2float(1 << (x+149));
+    /* Negligibly small */
+    else
+        return 0;
+}
+
+/**
  * Get a buffer for a frame. This is a wrapper around
  * AVCodecContext.get_buffer() and should be used instead calling get_buffer()
  * directly.
@@ -246,9 +318,27 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags);
  */
 int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame);
 
-const uint8_t *avpriv_find_start_code(const uint8_t *restrict p,
+int ff_thread_can_start_frame(AVCodecContext *avctx);
+
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx);
+
+/**
+ * Call avcodec_open2 recursively by decrementing counter, unlocking mutex,
+ * calling the function and then restoring again. Assumes the mutex is
+ * already locked
+ */
+int ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options);
+
+/**
+ * Finalize buf into extradata and set its size appropriately.
+ */
+int avpriv_bprint_to_extradata(AVCodecContext *avctx, struct AVBPrint *buf);
+
+const uint8_t *avpriv_find_start_code(const uint8_t *p,
                                       const uint8_t *end,
-                                      uint32_t *restrict state);
+                                      uint32_t *state);
+
+int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec);
 
 /**
  * Check that the provided frame dimensions are valid and set them on the codec
@@ -291,6 +381,41 @@ int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame);
  */
 AVCPBProperties *ff_add_cpb_side_data(AVCodecContext *avctx);
 
+int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type);
+
+/**
+ * Check AVFrame for A53 side data and allocate and fill SEI message with A53 info
+ *
+ * @param frame      Raw frame to get A53 side data from
+ * @param prefix_len Number of bytes to allocate before SEI message
+ * @param data       Pointer to a variable to store allocated memory
+ *                   Upon return the variable will hold NULL on error or if frame has no A53 info.
+ *                   Otherwise it will point to prefix_len uninitialized bytes followed by
+ *                   *sei_size SEI message
+ * @param sei_size   Pointer to a variable to store generated SEI message length
+ * @return           Zero on success, negative error code on failure
+ */
+int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len,
+                     void **data, size_t *sei_size);
+
+/**
+ * Get an estimated video bitrate based on frame size, frame rate and coded
+ * bits per pixel.
+ */
+int64_t ff_guess_coded_bitrate(AVCodecContext *avctx);
+
+/**
+ * Check if a value is in the list. If not, return the default value
+ *
+ * @param ctx                Context for the log msg
+ * @param val_name           Name of the checked value, for log msg
+ * @param array_valid_values Array of valid int, ended with INT_MAX
+ * @param default_value      Value return if checked value is not in the array
+ * @return                   Value or default_value.
+ */
+int ff_int_from_list_or_default(void *ctx, const char * val_name, int val,
+                                const int * array_valid_values, int default_value);
+
 #if defined(_WIN32) && CONFIG_SHARED && !defined(BUILDING_avcodec)
 #    define av_export_avcodec __declspec(dllimport)
 #else
diff --git a/libavcodec/interplayacm.c b/libavcodec/interplayacm.c
new file mode 100644
index 0000000..5639d8d
--- /dev/null
+++ b/libavcodec/interplayacm.c
@@ -0,0 +1,634 @@
+/*
+ * Interplay ACM decoder
+ *
+ * Copyright (c) 2004-2008 Marko Kreen
+ * Copyright (c) 2008 Adam Gashlin
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "libavutil/intreadwrite.h"
+
+#define BITSTREAM_READER_LE
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+
+static const int8_t map_1bit[]      = { -1, +1 };
+static const int8_t map_2bit_near[] = { -2, -1, +1, +2 };
+static const int8_t map_2bit_far[]  = { -3, -2, +2, +3 };
+static const int8_t map_3bit[]      = { -4, -3, -2, -1, +1, +2, +3, +4 };
+
+static int mul_3x3 [3 * 3 * 3];
+static int mul_3x5 [5 * 5 * 5];
+static int mul_2x11[11  *  11];
+
+typedef struct InterplayACMContext {
+    GetBitContext gb;
+    uint8_t *bitstream;
+    int max_framesize;
+    int bitstream_size;
+    int bitstream_index;
+
+    int level;
+    int rows;
+    int cols;
+    int wrapbuf_len;
+    int block_len;
+    int skip;
+
+    int *block;
+    int *wrapbuf;
+    int *ampbuf;
+    int *midbuf;
+} InterplayACMContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    InterplayACMContext *s = avctx->priv_data;
+    int x1, x2, x3;
+
+    if (avctx->extradata_size < 14)
+        return AVERROR_INVALIDDATA;
+
+    if (avctx->channels <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels: %d\n", avctx->channels);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->level = AV_RL16(avctx->extradata + 12) & 0xf;
+    s->rows  = AV_RL16(avctx->extradata + 12) >>  4;
+    s->cols  = 1 << s->level;
+    s->wrapbuf_len = 2 * s->cols - 2;
+    s->block_len = s->rows * s->cols;
+    s->max_framesize = s->block_len;
+
+    s->block   = av_calloc(s->block_len, sizeof(int));
+    s->wrapbuf = av_calloc(s->wrapbuf_len, sizeof(int));
+    s->ampbuf  = av_calloc(0x10000, sizeof(int));
+    s->bitstream = av_calloc(s->max_framesize + AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*s->bitstream) + 1, sizeof(*s->bitstream));
+    if (!s->block || !s->wrapbuf || !s->ampbuf || !s->bitstream)
+        return AVERROR(ENOMEM);
+
+    s->midbuf  = s->ampbuf + 0x8000;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+
+    for (x3 = 0; x3 < 3; x3++)
+        for (x2 = 0; x2 < 3; x2++)
+            for (x1 = 0; x1 < 3; x1++)
+                mul_3x3[x1 + x2 * 3 + x3* 3 * 3] = x1 + (x2 << 4) + (x3 << 8);
+    for (x3 = 0; x3 < 5; x3++)
+        for (x2 = 0; x2 < 5; x2++)
+            for (x1 = 0; x1 < 5; x1++)
+                mul_3x5[x1 + x2 * 5 + x3 * 5 * 5] = x1 + (x2 << 4) + (x3 << 8);
+    for (x2 = 0; x2 < 11; x2++)
+        for (x1 = 0; x1 < 11; x1++)
+            mul_2x11[x1 + x2 * 11] = x1 + (x2 << 4);
+
+    return 0;
+}
+
+#define set_pos(s, r, c, idx) do {               \
+        unsigned pos = ((r) << s->level) + (c);  \
+        s->block[pos] = s->midbuf[(idx)];        \
+    } while (0)
+
+static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    unsigned i;
+
+    for (i = 0; i < s->rows; i++)
+        set_pos(s, i, col, 0);
+    return 0;
+}
+
+static int bad(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    return AVERROR_INVALIDDATA;
+}
+
+static int linear(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned int i;
+    int b, middle = 1 << (ind - 1);
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits(gb, ind);
+        set_pos(s, i, col, b - middle);
+    }
+    return 0;
+}
+
+static int k13(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i++, col, 0);
+            if (i >= s->rows)
+                break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+        b = get_bits1(gb);
+        set_pos(s, i, col, map_1bit[b]);
+    }
+    return 0;
+}
+
+static int k12(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        set_pos(s, i, col, map_1bit[b]);
+    }
+    return 0;
+}
+
+static int k24(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i++, col, 0);
+            if (i >= s->rows) break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_near[b]);
+    }
+    return 0;
+}
+
+static int k23(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_near[b]);
+    }
+    return 0;
+}
+
+static int k35(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i++, col, 0);
+            if (i >= s->rows)
+                break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            b = get_bits1(gb);
+            set_pos(s, i, col, map_1bit[b]);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_far[b]);
+    }
+    return 0;
+}
+
+static int k34(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            b = get_bits1(gb);
+            set_pos(s, i, col, map_1bit[b]);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_far[b]);
+    }
+    return 0;
+}
+
+static int k45(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0); i++;
+            if (i >= s->rows)
+                break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 3);
+        set_pos(s, i, col, map_3bit[b]);
+    }
+    return 0;
+}
+
+static int k44(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 3);
+        set_pos(s, i, col, map_3bit[b]);
+    }
+    return 0;
+}
+
+static int t15(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    int n1, n2, n3;
+
+    for (i = 0; i < s->rows; i++) {
+        /* b = (x1) + (x2 * 3) + (x3 * 9) */
+        b = get_bits(gb, 5);
+        if (b > 26) {
+            av_log(NULL, AV_LOG_ERROR, "Too large b = %d > 26\n", b);
+            return AVERROR_INVALIDDATA;
+        }
+
+        n1 =  (mul_3x3[b] & 0x0F) - 1;
+        n2 = ((mul_3x3[b] >> 4) & 0x0F) - 1;
+        n3 = ((mul_3x3[b] >> 8) & 0x0F) - 1;
+
+        set_pos(s, i++, col, n1);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i++, col, n2);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i, col, n3);
+    }
+    return 0;
+}
+
+static int t27(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    int n1, n2, n3;
+
+    for (i = 0; i < s->rows; i++) {
+        /* b = (x1) + (x2 * 5) + (x3 * 25) */
+        b = get_bits(gb, 7);
+        if (b > 124) {
+            av_log(NULL, AV_LOG_ERROR, "Too large b = %d > 124\n", b);
+            return AVERROR_INVALIDDATA;
+        }
+
+        n1 =  (mul_3x5[b] & 0x0F) - 2;
+        n2 = ((mul_3x5[b] >> 4) & 0x0F) - 2;
+        n3 = ((mul_3x5[b] >> 8) & 0x0F) - 2;
+
+        set_pos(s, i++, col, n1);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i++, col, n2);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i, col, n3);
+    }
+    return 0;
+}
+
+static int t37(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    int n1, n2;
+    for (i = 0; i < s->rows; i++) {
+        /* b = (x1) + (x2 * 11) */
+        b = get_bits(gb, 7);
+        if (b > 120) {
+            av_log(NULL, AV_LOG_ERROR, "Too large b = %d > 120\n", b);
+            return AVERROR_INVALIDDATA;
+        }
+
+        n1 =  (mul_2x11[b] & 0x0F) - 5;
+        n2 = ((mul_2x11[b] >> 4) & 0x0F) - 5;
+
+        set_pos(s, i++, col, n1);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i, col, n2);
+    }
+    return 0;
+}
+
+typedef int (*filler)(InterplayACMContext *s, unsigned ind, unsigned col);
+
+static const filler filler_list[] = {
+    zero,   bad,    bad,    linear,
+    linear, linear, linear, linear,
+    linear, linear, linear, linear,
+    linear, linear, linear, linear,
+    linear, k13,    k12,    t15,
+    k24,    k23,    t27,    k35,
+    k34,    bad,    k45,    k44,
+    bad,    t37,    bad,    bad,
+};
+
+static int fill_block(InterplayACMContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, ind;
+    int ret;
+
+    for (i = 0; i < s->cols; i++) {
+        ind = get_bits(gb, 5);
+        ret = filler_list[ind](s, ind, i);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
+}
+
+static void juggle(int *wrap_p, int *block_p, unsigned sub_len, unsigned sub_count)
+{
+    unsigned i, j;
+    int *p, r0, r1, r2, r3;
+
+    for (i = 0; i < sub_len; i++) {
+        p = block_p;
+        r0 = wrap_p[0];
+        r1 = wrap_p[1];
+        for (j = 0; j < sub_count/2; j++) {
+            r2 = *p;
+            *p = r1 * 2 + (r0 + r2);
+            p += sub_len;
+            r3 = *p;
+            *p = r2 * 2 - (r1 + r3);
+            p += sub_len;
+            r0 = r2;
+            r1 = r3;
+        }
+
+        *wrap_p++ = r0;
+        *wrap_p++ = r1;
+        block_p++;
+    }
+}
+
+static void juggle_block(InterplayACMContext *s)
+{
+    unsigned sub_count, sub_len, todo_count, step_subcount, i;
+    int *wrap_p, *block_p, *p;
+
+    /* juggle only if subblock_len > 1 */
+    if (s->level == 0)
+        return;
+
+    /* 2048 / subblock_len */
+    if (s->level > 9)
+        step_subcount = 1;
+    else
+        step_subcount = (2048 >> s->level) - 2;
+
+    /* Apply juggle()  (rows)x(cols)
+     * from (step_subcount * 2)            x (subblock_len/2)
+     * to   (step_subcount * subblock_len) x (1)
+     */
+    todo_count = s->rows;
+    block_p = s->block;
+    while (1) {
+        wrap_p = s->wrapbuf;
+        sub_count = step_subcount;
+        if (sub_count > todo_count)
+            sub_count = todo_count;
+
+        sub_len = s->cols / 2;
+        sub_count *= 2;
+
+        juggle(wrap_p, block_p, sub_len, sub_count);
+        wrap_p += sub_len * 2;
+
+        for (i = 0, p = block_p; i < sub_count; i++) {
+            p[0]++;
+            p += sub_len;
+        }
+
+        while (sub_len > 1) {
+            sub_len /= 2;
+            sub_count *= 2;
+            juggle(wrap_p, block_p, sub_len, sub_count);
+            wrap_p += sub_len * 2;
+        }
+
+        if (todo_count <= step_subcount)
+            break;
+
+        todo_count -= step_subcount;
+        block_p += step_subcount << s->level;
+    }
+}
+
+static int decode_block(InterplayACMContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    int pwr, count, val, i, x, ret;
+
+    pwr = get_bits(gb, 4);
+    val = get_bits(gb, 16);
+
+    count = 1 << pwr;
+
+    for (i = 0, x = 0; i < count; i++) {
+        s->midbuf[i] = x;
+        x += val;
+    }
+
+    for (i = 1, x = -val; i <= count; i++) {
+        s->midbuf[-i] = x;
+        x -= val;
+    }
+
+    ret = fill_block(s);
+    if (ret < 0)
+        return ret;
+
+    juggle_block(s);
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *pkt)
+{
+    InterplayACMContext *s = avctx->priv_data;
+    GetBitContext *gb = &s->gb;
+    AVFrame *frame = data;
+    const uint8_t *buf;
+    int16_t *samples;
+    int ret, n, buf_size, input_buf_size;
+
+    if (!pkt->size && !s->bitstream_size) {
+        *got_frame_ptr = 0;
+        return 0;
+    }
+
+    buf_size = FFMIN(pkt->size, s->max_framesize - s->bitstream_size);
+    input_buf_size = buf_size;
+    if (s->bitstream_index + s->bitstream_size + buf_size > s->max_framesize) {
+        memmove(s->bitstream, &s->bitstream[s->bitstream_index], s->bitstream_size);
+        s->bitstream_index = 0;
+    }
+    if (pkt->data)
+        memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], pkt->data, buf_size);
+    buf                = &s->bitstream[s->bitstream_index];
+    buf_size          += s->bitstream_size;
+    s->bitstream_size  = buf_size;
+    if (buf_size < s->max_framesize && pkt->data) {
+        *got_frame_ptr = 0;
+        return input_buf_size;
+    }
+
+    if ((ret = init_get_bits8(gb, buf, buf_size)) < 0)
+        return ret;
+
+    frame->nb_samples = s->block_len / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    skip_bits(gb, s->skip);
+    ret = decode_block(s);
+    if (ret < 0)
+        return ret;
+
+    samples = (int16_t *)frame->data[0];
+    for (n = 0; n < frame->nb_samples * avctx->channels; n++) {
+        int val = s->block[n] >> s->level;
+        *samples++ = val;
+    }
+
+    *got_frame_ptr = 1;
+    s->skip = get_bits_count(gb) - 8 * (get_bits_count(gb) / 8);
+    n = get_bits_count(gb) / 8;
+
+    if (n > buf_size && pkt->data) {
+        s->bitstream_size = 0;
+        s->bitstream_index = 0;
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->bitstream_size) {
+        s->bitstream_index += n;
+        s->bitstream_size  -= n;
+        return input_buf_size;
+    }
+    return n;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    InterplayACMContext *s = avctx->priv_data;
+
+    av_freep(&s->block);
+    av_freep(&s->wrapbuf);
+    av_freep(&s->ampbuf);
+    av_freep(&s->bitstream);
+    s->bitstream_size = 0;
+
+    return 0;
+}
+
+AVCodec ff_interplay_acm_decoder = {
+    .name           = "interplayacm",
+    .long_name      = NULL_IF_CONFIG_SMALL("Interplay ACM"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_INTERPLAY_ACM,
+    .init           = decode_init,
+    .close          = decode_close,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
+    .priv_data_size = sizeof(InterplayACMContext),
+};
diff --git a/libavcodec/interplayvideo.c b/libavcodec/interplayvideo.c
index f5593d3..deaa09c 100644
--- a/libavcodec/interplayvideo.c
+++ b/libavcodec/interplayvideo.c
@@ -2,20 +2,20 @@
  * Interplay MVE Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,10 +38,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "libavutil/intreadwrite.h"
+
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "hpeldsp.h"
 #include "internal.h"
 
@@ -53,8 +55,15 @@ typedef struct IpvideoContext {
     HpelDSPContext hdsp;
     AVFrame *second_last_frame;
     AVFrame *last_frame;
+
+    /* For format 0x10 */
+    AVFrame *cur_decode_frame;
+    AVFrame *prev_decode_frame;
+
     const unsigned char *decoding_map;
     int decoding_map_size;
+    const unsigned char *skip_map;
+    int skip_map_size;
 
     int is_16bpp;
     GetByteContext stream_ptr, mv_ptr;
@@ -72,10 +81,10 @@ static int copy_from(IpvideoContext *s, AVFrame *src, AVFrame *dst, int delta_x,
     int motion_offset = current_offset + delta_y * dst->linesize[0]
                        + delta_x * (1 + s->is_16bpp);
     if (motion_offset < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, " Interplay video: motion offset < 0 (%d)\n", motion_offset);
+        av_log(s->avctx, AV_LOG_ERROR, "motion offset < 0 (%d)\n", motion_offset);
         return AVERROR_INVALIDDATA;
     } else if (motion_offset > s->upper_motion_limit_offset) {
-        av_log(s->avctx, AV_LOG_ERROR, " Interplay video: motion offset above limit (%d >= %d)\n",
+        av_log(s->avctx, AV_LOG_ERROR, "motion offset above limit (%d >= %d)\n",
             motion_offset, s->upper_motion_limit_offset);
         return AVERROR_INVALIDDATA;
     }
@@ -118,7 +127,7 @@ static int ipvideo_decode_block_opcode_0x2(IpvideoContext *s, AVFrame *frame)
         y =   8 + ((B - 56) / 29);
     }
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, s->second_last_frame, frame, x, y);
 }
 
@@ -144,7 +153,7 @@ static int ipvideo_decode_block_opcode_0x3(IpvideoContext *s, AVFrame *frame)
         y = -(  8 + ((B - 56) / 29));
     }
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, frame, frame, x, y);
 }
 
@@ -165,7 +174,7 @@ static int ipvideo_decode_block_opcode_0x4(IpvideoContext *s, AVFrame *frame)
     x = -8 + BL;
     y = -8 + BH;
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, s->last_frame, frame, x, y);
 }
 
@@ -178,14 +187,14 @@ static int ipvideo_decode_block_opcode_0x5(IpvideoContext *s, AVFrame *frame)
     x = bytestream2_get_byte(&s->stream_ptr);
     y = bytestream2_get_byte(&s->stream_ptr);
 
-    ff_dlog(NULL, "    motion bytes = %d, %d\n", x, y);
+    ff_tlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, s->last_frame, frame, x, y);
 }
 
 static int ipvideo_decode_block_opcode_0x6(IpvideoContext *s, AVFrame *frame)
 {
     /* mystery opcode? skip multiple blocks? */
-    av_log(s->avctx, AV_LOG_ERROR, "  Interplay video: Help! Mystery opcode 0x6 seen\n");
+    av_log(s->avctx, AV_LOG_ERROR, "Help! Mystery opcode 0x6 seen\n");
 
     /* report success */
     return 0;
@@ -197,6 +206,11 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s, AVFrame *frame)
     unsigned char P[2];
     unsigned int flags;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x7\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 2-color encoding */
     P[0] = bytestream2_get_byte(&s->stream_ptr);
     P[1] = bytestream2_get_byte(&s->stream_ptr);
@@ -236,6 +250,11 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s, AVFrame *frame)
     unsigned char P[4];
     unsigned int flags = 0;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 12) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x8\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on
      * either top and bottom or left and right halves */
     P[0] = bytestream2_get_byte(&s->stream_ptr);
@@ -308,6 +327,11 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s, AVFrame *frame)
     int x, y;
     unsigned char P[4];
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 8) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x9\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 4-color encoding */
     bytestream2_get_buffer(&s->stream_ptr, P, 4);
 
@@ -374,6 +398,11 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s, AVFrame *frame)
     unsigned char P[8];
     int flags = 0;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 16) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0xA\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     bytestream2_get_buffer(&s->stream_ptr, P, 4);
 
     /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on
@@ -467,6 +496,11 @@ static int ipvideo_decode_block_opcode_0xD(IpvideoContext *s, AVFrame *frame)
     int y;
     unsigned char P[2];
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0xD\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 4-color block encoding: each 4x4 block is a different color */
     for (y = 0; y < 8; y++) {
         if (!(y & 3)) {
@@ -528,7 +562,7 @@ static int ipvideo_decode_block_opcode_0x6_16(IpvideoContext *s, AVFrame *frame)
     x = bytestream2_get_byte(&s->stream_ptr);
     y = bytestream2_get_byte(&s->stream_ptr);
 
-    ff_dlog(NULL, "    motion bytes = %d, %d\n", x, y);
+    ff_tlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, s->second_last_frame, frame, x, y);
 }
 
@@ -876,12 +910,198 @@ static int (* const ipvideo_decode_block16[])(IpvideoContext *s, AVFrame *frame)
     ipvideo_decode_block_opcode_0xE_16, ipvideo_decode_block_opcode_0x1,
 };
 
-static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
+static void ipvideo_format_06_firstpass(IpvideoContext *s, AVFrame *frame, int16_t opcode)
+{
+    int line;
+
+    if (!opcode) {
+        for (line = 0; line < 8; ++line) {
+            bytestream2_get_buffer(&s->stream_ptr, s->pixel_ptr, 8);
+            s->pixel_ptr += s->stride;
+        }
+    } else {
+        /* Don't try to copy second_last_frame data on the first frames */
+        if (s->avctx->frame_number > 2)
+            copy_from(s, s->second_last_frame, frame, 0, 0);
+    }
+}
+
+static void ipvideo_format_06_secondpass(IpvideoContext *s, AVFrame *frame, int16_t opcode)
+{
+    int off_x, off_y;
+
+    if (opcode < 0) {
+        off_x = ((uint16_t)opcode - 0xC000) % frame->linesize[0];
+        off_y = ((uint16_t)opcode - 0xC000) / frame->linesize[0];
+        copy_from(s, s->last_frame, frame, off_x, off_y);
+    } else if (opcode > 0) {
+        off_x = ((uint16_t)opcode - 0x4000) % frame->linesize[0];
+        off_y = ((uint16_t)opcode - 0x4000) / frame->linesize[0];
+        copy_from(s, frame, frame, off_x, off_y);
+    }
+}
+
+static void (* const ipvideo_format_06_passes[])(IpvideoContext *s, AVFrame *frame, int16_t op) = {
+    ipvideo_format_06_firstpass, ipvideo_format_06_secondpass,
+};
+
+static void ipvideo_decode_format_06_opcodes(IpvideoContext *s, AVFrame *frame)
+{
+    int pass, x, y;
+    int16_t opcode;
+    GetByteContext decoding_map_ptr;
+
+    /* this is PAL8, so make the palette available */
+    memcpy(frame->data[1], s->pal, AVPALETTE_SIZE);
+    s->stride = frame->linesize[0];
+
+    s->line_inc = s->stride - 8;
+    s->upper_motion_limit_offset = (s->avctx->height - 8) * frame->linesize[0]
+                                  + (s->avctx->width - 8) * (1 + s->is_16bpp);
+
+    bytestream2_init(&decoding_map_ptr, s->decoding_map, s->decoding_map_size);
+
+    for (pass = 0; pass < 2; ++pass) {
+        bytestream2_seek(&decoding_map_ptr, 0, SEEK_SET);
+        for (y = 0; y < s->avctx->height; y += 8) {
+            for (x = 0; x < s->avctx->width; x += 8) {
+                opcode = bytestream2_get_le16(&decoding_map_ptr);
+
+                ff_tlog(s->avctx,
+                        "  block @ (%3d, %3d): opcode 0x%X, data ptr offset %d\n",
+                        x, y, opcode, bytestream2_tell(&s->stream_ptr));
+
+                s->pixel_ptr = frame->data[0] + x + y * frame->linesize[0];
+                ipvideo_format_06_passes[pass](s, frame, opcode);
+            }
+        }
+    }
+
+    if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) {
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "decode finished with %d bytes left over\n",
+               bytestream2_get_bytes_left(&s->stream_ptr));
+    }
+}
+
+static void ipvideo_format_10_firstpass(IpvideoContext *s, AVFrame *frame, int16_t opcode)
+{
+    int line;
+
+    if (!opcode) {
+        for (line = 0; line < 8; ++line) {
+            bytestream2_get_buffer(&s->stream_ptr, s->pixel_ptr, 8);
+            s->pixel_ptr += s->stride;
+        }
+    }
+}
+
+static void ipvideo_format_10_secondpass(IpvideoContext *s, AVFrame *frame, int16_t opcode)
+{
+    int off_x, off_y;
+
+    if (opcode < 0) {
+        off_x = ((uint16_t)opcode - 0xC000) % s->cur_decode_frame->linesize[0];
+        off_y = ((uint16_t)opcode - 0xC000) / s->cur_decode_frame->linesize[0];
+        copy_from(s, s->prev_decode_frame, s->cur_decode_frame, off_x, off_y);
+    } else if (opcode > 0) {
+        off_x = ((uint16_t)opcode - 0x4000) % s->cur_decode_frame->linesize[0];
+        off_y = ((uint16_t)opcode - 0x4000) / s->cur_decode_frame->linesize[0];
+        copy_from(s, s->cur_decode_frame, s->cur_decode_frame, off_x, off_y);
+    }
+}
+
+static void (* const ipvideo_format_10_passes[])(IpvideoContext *s, AVFrame *frame, int16_t op) = {
+    ipvideo_format_10_firstpass, ipvideo_format_10_secondpass,
+};
+
+static void ipvideo_decode_format_10_opcodes(IpvideoContext *s, AVFrame *frame)
+{
+    int pass, x, y, changed_block;
+    int16_t opcode, skip;
+    GetByteContext decoding_map_ptr;
+    GetByteContext skip_map_ptr;
+
+    bytestream2_skip(&s->stream_ptr, 14); /* data starts 14 bytes in */
+
+    /* this is PAL8, so make the palette available */
+    memcpy(frame->data[1], s->pal, AVPALETTE_SIZE);
+    s->stride = frame->linesize[0];
+
+    s->line_inc = s->stride - 8;
+    s->upper_motion_limit_offset = (s->avctx->height - 8) * frame->linesize[0]
+                                  + (s->avctx->width - 8) * (1 + s->is_16bpp);
+
+    bytestream2_init(&decoding_map_ptr, s->decoding_map, s->decoding_map_size);
+    bytestream2_init(&skip_map_ptr, s->skip_map, s->skip_map_size);
+
+    for (pass = 0; pass < 2; ++pass) {
+        bytestream2_seek(&decoding_map_ptr, 0, SEEK_SET);
+        bytestream2_seek(&skip_map_ptr, 0, SEEK_SET);
+        skip = bytestream2_get_le16(&skip_map_ptr);
+
+        for (y = 0; y < s->avctx->height; y += 8) {
+            for (x = 0; x < s->avctx->width; x += 8) {
+                s->pixel_ptr = s->cur_decode_frame->data[0] + x + y * s->cur_decode_frame->linesize[0];
+
+                while (skip <= 0)  {
+                    if (skip != -0x8000 && skip) {
+                        opcode = bytestream2_get_le16(&decoding_map_ptr);
+                        ipvideo_format_10_passes[pass](s, frame, opcode);
+                        break;
+                    }
+                    if (bytestream2_get_bytes_left(&skip_map_ptr) < 2)
+                        return;
+                    skip = bytestream2_get_le16(&skip_map_ptr);
+                }
+                skip *= 2;
+            }
+        }
+    }
+
+    bytestream2_seek(&skip_map_ptr, 0, SEEK_SET);
+    skip = bytestream2_get_le16(&skip_map_ptr);
+    for (y = 0; y < s->avctx->height; y += 8) {
+        for (x = 0; x < s->avctx->width; x += 8) {
+            changed_block = 0;
+            s->pixel_ptr = frame->data[0] + x + y*frame->linesize[0];
+
+            while (skip <= 0)  {
+                if (skip != -0x8000 && skip) {
+                    changed_block = 1;
+                    break;
+                }
+                if (bytestream2_get_bytes_left(&skip_map_ptr) < 2)
+                    return;
+                skip = bytestream2_get_le16(&skip_map_ptr);
+            }
+
+            if (changed_block) {
+                copy_from(s, s->cur_decode_frame, frame, 0, 0);
+            } else {
+                /* Don't try to copy last_frame data on the first frame */
+                if (s->avctx->frame_number)
+                    copy_from(s, s->last_frame, frame, 0, 0);
+            }
+            skip *= 2;
+        }
+    }
+
+    FFSWAP(AVFrame*, s->prev_decode_frame, s->cur_decode_frame);
+
+    if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) {
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "decode finished with %d bytes left over\n",
+               bytestream2_get_bytes_left(&s->stream_ptr));
+    }
+}
+
+static void ipvideo_decode_format_11_opcodes(IpvideoContext *s, AVFrame *frame)
 {
     int x, y;
     unsigned char opcode;
     int ret;
-    BitstreamContext bc;
+    GetBitContext gb;
 
     bytestream2_skip(&s->stream_ptr, 14); /* data starts 14 bytes in */
     if (!s->is_16bpp) {
@@ -898,12 +1118,14 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
     s->upper_motion_limit_offset = (s->avctx->height - 8) * frame->linesize[0]
                                   + (s->avctx->width - 8) * (1 + s->is_16bpp);
 
-    bitstream_init8(&bc, s->decoding_map, s->decoding_map_size);
+    init_get_bits(&gb, s->decoding_map, s->decoding_map_size * 8);
     for (y = 0; y < s->avctx->height; y += 8) {
         for (x = 0; x < s->avctx->width; x += 8) {
-            opcode = bitstream_read(&bc, 4);
+            if (get_bits_left(&gb) < 4)
+                return;
+            opcode = get_bits(&gb, 4);
 
-            ff_dlog(s->avctx,
+            ff_tlog(s->avctx,
                     "  block @ (%3d, %3d): encoding 0x%X, data ptr offset %d\n",
                     x, y, opcode, bytestream2_tell(&s->stream_ptr));
 
@@ -917,15 +1139,15 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
                 ret = ipvideo_decode_block16[opcode](s, frame);
             }
             if (ret != 0) {
-                av_log(s->avctx, AV_LOG_ERROR, " Interplay video: decode problem on frame %d, @ block (%d, %d)\n",
+                av_log(s->avctx, AV_LOG_ERROR, "decode problem on frame %d, @ block (%d, %d)\n",
                        s->avctx->frame_number, x, y);
                 return;
             }
         }
     }
     if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "Interplay video: decode finished with %d bytes left over\n",
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "decode finished with %d bytes left over\n",
                bytestream2_get_bytes_left(&s->stream_ptr));
     }
 }
@@ -933,6 +1155,7 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
 static av_cold int ipvideo_decode_init(AVCodecContext *avctx)
 {
     IpvideoContext *s = avctx->priv_data;
+    int ret;
 
     s->avctx = avctx;
 
@@ -943,13 +1166,36 @@ static av_cold int ipvideo_decode_init(AVCodecContext *avctx)
 
     s->last_frame        = av_frame_alloc();
     s->second_last_frame = av_frame_alloc();
-    if (!s->last_frame || !s->second_last_frame) {
-        av_frame_free(&s->last_frame);
-        av_frame_free(&s->second_last_frame);
-        return AVERROR(ENOMEM);
+    s->cur_decode_frame  = av_frame_alloc();
+    s->prev_decode_frame = av_frame_alloc();
+    if (!s->last_frame || !s->second_last_frame ||
+        !s->cur_decode_frame || !s->prev_decode_frame) {
+        ret = AVERROR(ENOMEM);
+        goto error;
     }
 
+    s->cur_decode_frame->width   = avctx->width;
+    s->prev_decode_frame->width  = avctx->width;
+    s->cur_decode_frame->height  = avctx->height;
+    s->prev_decode_frame->height = avctx->height;
+    s->cur_decode_frame->format  = avctx->pix_fmt;
+    s->prev_decode_frame->format = avctx->pix_fmt;
+
+    ret = ff_get_buffer(avctx, s->cur_decode_frame, 0);
+    if (ret < 0)
+        goto error;
+
+    ret = ff_get_buffer(avctx, s->prev_decode_frame, 0);
+    if (ret < 0)
+        goto error;
+
     return 0;
+error:
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->second_last_frame);
+    av_frame_free(&s->cur_decode_frame);
+    av_frame_free(&s->prev_decode_frame);
+    return ret;
 }
 
 static int ipvideo_decode_frame(AVCodecContext *avctx,
@@ -961,35 +1207,150 @@ static int ipvideo_decode_frame(AVCodecContext *avctx,
     IpvideoContext *s = avctx->priv_data;
     AVFrame *frame = data;
     int ret;
+    int send_buffer;
+    int frame_format;
+    int video_data_size;
+
+    if (av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, NULL)) {
+        av_frame_unref(s->last_frame);
+        av_frame_unref(s->second_last_frame);
+        av_frame_unref(s->cur_decode_frame);
+        av_frame_unref(s->prev_decode_frame);
+    }
+
+    if (!s->cur_decode_frame->data[0]) {
+        ret = ff_get_buffer(avctx, s->cur_decode_frame, 0);
+        if (ret < 0)
+            return ret;
+
+        ret = ff_get_buffer(avctx, s->prev_decode_frame, 0);
+        if (ret < 0) {
+            av_frame_unref(s->cur_decode_frame);
+            return ret;
+        }
+    }
 
-    /* decoding map contains 4 bits of information per 8x8 block */
-    s->decoding_map_size = avctx->width * avctx->height / (8 * 8 * 2);
+    if (buf_size < 8)
+        return AVERROR_INVALIDDATA;
 
-    /* compressed buffer needs to be large enough to at least hold an entire
-     * decoding map */
-    if (buf_size < s->decoding_map_size)
-        return buf_size;
+    frame_format         = AV_RL8(buf);
+    send_buffer          = AV_RL8(buf + 1);
+    video_data_size      = AV_RL16(buf + 2);
+    s->decoding_map_size = AV_RL16(buf + 4);
+    s->skip_map_size     = AV_RL16(buf + 6);
+
+    switch(frame_format) {
+        case 0x06:
+            if (s->decoding_map_size) {
+                av_log(avctx, AV_LOG_ERROR, "Decoding map for format 0x06\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-    s->decoding_map = buf;
-    bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size,
-                     buf_size - s->decoding_map_size);
+            if (s->skip_map_size) {
+                av_log(avctx, AV_LOG_ERROR, "Skip map for format 0x06\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  Interplay Video: get_buffer() failed\n");
-        return ret;
+            if (s->is_16bpp) {
+                av_log(avctx, AV_LOG_ERROR, "Video format 0x06 does not support 16bpp movies\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            /* Decoding map for 0x06 frame format is at the top of pixeldata */
+            s->decoding_map_size = ((s->avctx->width / 8) * (s->avctx->height / 8)) * 2;
+            s->decoding_map = buf + 8 + 14; /* 14 bits of op data */
+            video_data_size -= s->decoding_map_size + 14;
+            if (video_data_size <= 0)
+                return AVERROR_INVALIDDATA;
+
+            if (buf_size < 8 + s->decoding_map_size + 14 + video_data_size)
+                return AVERROR_INVALIDDATA;
+
+            bytestream2_init(&s->stream_ptr, buf + 8 + s->decoding_map_size + 14, video_data_size);
+
+            break;
+
+        case 0x10:
+            if (! s->decoding_map_size) {
+                av_log(avctx, AV_LOG_ERROR, "Empty decoding map for format 0x10\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (! s->skip_map_size) {
+                av_log(avctx, AV_LOG_ERROR, "Empty skip map for format 0x10\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (s->is_16bpp) {
+                av_log(avctx, AV_LOG_ERROR, "Video format 0x10 does not support 16bpp movies\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (buf_size < 8 + video_data_size + s->decoding_map_size + s->skip_map_size)
+                return AVERROR_INVALIDDATA;
+
+            bytestream2_init(&s->stream_ptr, buf + 8, video_data_size);
+            s->decoding_map = buf + 8 + video_data_size;
+            s->skip_map = buf + 8 + video_data_size + s->decoding_map_size;
+
+            break;
+
+        case 0x11:
+            if (! s->decoding_map_size) {
+                av_log(avctx, AV_LOG_ERROR, "Empty decoding map for format 0x11\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (s->skip_map_size) {
+                av_log(avctx, AV_LOG_ERROR, "Skip map for format 0x11\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (buf_size < 8 + video_data_size + s->decoding_map_size)
+                return AVERROR_INVALIDDATA;
+
+            bytestream2_init(&s->stream_ptr, buf + 8, video_data_size);
+            s->decoding_map = buf + 8 + video_data_size;
+
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Frame type 0x%02X unsupported\n", frame_format);
+    }
+
+    /* ensure we can't overread the packet */
+    if (buf_size < 8 + s->decoding_map_size + video_data_size + s->skip_map_size) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid IP packet size\n");
+        return AVERROR_INVALIDDATA;
     }
 
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+
     if (!s->is_16bpp) {
-        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
-        if (pal) {
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
+        if (pal && size == AVPALETTE_SIZE) {
             frame->palette_has_changed = 1;
             memcpy(s->pal, pal, AVPALETTE_SIZE);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
     }
 
-    ipvideo_decode_opcodes(s, frame);
+    switch(frame_format) {
+        case 0x06:
+            ipvideo_decode_format_06_opcodes(s, frame);
+            break;
+        case 0x10:
+            ipvideo_decode_format_10_opcodes(s, frame);
+            break;
+        case 0x11:
+            ipvideo_decode_format_11_opcodes(s, frame);
+            break;
+    }
 
-    *got_frame = 1;
+    *got_frame = send_buffer;
 
     /* shuffle frames */
     av_frame_unref(s->second_last_frame);
@@ -1007,6 +1368,8 @@ static av_cold int ipvideo_decode_end(AVCodecContext *avctx)
 
     av_frame_free(&s->last_frame);
     av_frame_free(&s->second_last_frame);
+    av_frame_free(&s->cur_decode_frame);
+    av_frame_free(&s->prev_decode_frame);
 
     return 0;
 }
diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c
index d32bb05..d46f97c 100644
--- a/libavcodec/intrax8.c
+++ b/libavcodec/intrax8.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
  * @brief IntraX8 (J-Frame) subdecoder, used by WMV2 and VC-1
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "idctdsp.h"
@@ -115,7 +116,7 @@ static av_cold int x8_vlc_init(void)
 #undef init_or_vlc
 
     if (offset != sizeof(table) / sizeof(VLC_TYPE) / 2) {
-        av_log(NULL, AV_LOG_ERROR, "table size %zd does not match needed %i\n",
+        av_log(NULL, AV_LOG_ERROR, "table size %"SIZE_SPECIFIER" does not match needed %i\n",
                sizeof(table) / sizeof(VLC_TYPE) / 2, offset);
         return AVERROR_INVALIDDATA;
     }
@@ -134,7 +135,7 @@ static inline void x8_select_ac_table(IntraX8Context *const w, int mode)
 {
     int table_index;
 
-    assert(mode < 4);
+    av_assert2(mode < 4);
 
     if (w->j_ac_vlc[mode])
         return;
@@ -142,8 +143,7 @@ static inline void x8_select_ac_table(IntraX8Context *const w, int mode)
     table_index       = get_bits(w->gb, 3);
     // 2 modes use same tables
     w->j_ac_vlc[mode] = &j_ac_vlc[w->quant < 13][mode >> 1][table_index];
-
-    assert(w->j_ac_vlc[mode]);
+    av_assert2(w->j_ac_vlc[mode]);
 }
 
 static inline int x8_get_orient_vlc(IntraX8Context *w)
@@ -152,8 +152,6 @@ static inline int x8_get_orient_vlc(IntraX8Context *w)
         int table_index = get_bits(w->gb, 1 + (w->quant < 13));
         w->j_orient_vlc = &j_orient_vlc[w->quant < 13][table_index];
     }
-    assert(w->j_orient_vlc);
-    assert(w->j_orient_vlc->table);
 
     return get_vlc2(w->gb, w->j_orient_vlc->table, OR_VLC_BITS, OR_VLC_MTD);
 }
@@ -290,14 +288,12 @@ static int x8_get_dc_rlf(IntraX8Context *const w, const int mode,
 {
     int i, e, c;
 
-    assert(mode < 3);
+    av_assert2(mode < 3);
     if (!w->j_dc_vlc[mode]) {
         int table_index = get_bits(w->gb, 3);
         // 4 modes, same table
         w->j_dc_vlc[mode] = &j_dc_vlc[w->quant < 13][table_index];
     }
-    assert(w->j_dc_vlc);
-    assert(w->j_dc_vlc[mode]->table);
 
     i = get_vlc2(w->gb, w->j_dc_vlc[mode]->table, DC_VLC_BITS, DC_VLC_MTD);
 
@@ -354,7 +350,7 @@ static int x8_setup_spatial_predictor(IntraX8Context *const w, const int chroma)
     if (chroma)
         return 0;
 
-    assert(w->orient < 3);
+    av_assert2(w->orient < 3);
     if (range < 2 * w->quant) {
         if ((w->edges & 3) == 0) {
             if (w->orient == 1)
@@ -374,9 +370,9 @@ static int x8_setup_spatial_predictor(IntraX8Context *const w, const int chroma)
         w->raw_orient = x8_get_orient_vlc(w);
         if (w->raw_orient < 0)
             return -1;
-        assert(w->raw_orient < 12);
-        assert(w->orient < 3);
-        w->orient = prediction_table[w->orient][w->raw_orient];
+        av_assert2(w->raw_orient < 12);
+        av_assert2(w->orient < 3);
+        w->orient=prediction_table[w->orient][w->raw_orient];
     }
     return 0;
 }
@@ -480,7 +476,7 @@ static void x8_ac_compensation(IntraX8Context *const w, const int direction,
                                const int dc_level)
 {
     int t;
-#define B(x, y) w->block[0][w->idsp.idct_permutation[(x) + (y) * 8]]
+#define B(x,y)  w->block[0][w->idct_permutation[(x) + (y) * 8]]
 #define T(x)  ((x) * dc_level + 0x8000) >> 16;
     switch (direction) {
     case 0:
@@ -578,7 +574,7 @@ static int x8_decode_intra_mb(IntraX8Context *const w, const int chroma)
     int use_quant_matrix;
     int sign;
 
-    assert(w->orient < 12);
+    av_assert2(w->orient < 12);
     w->bdsp.clear_block(w->block[0]);
 
     if (chroma)
@@ -690,7 +686,7 @@ static int x8_decode_intra_mb(IntraX8Context *const w, const int chroma)
                                                w->frame->linesize[!!chroma]);
     }
     if (!zeros_only)
-        w->idsp.idct_add(w->dest[chroma],
+        w->wdsp.idct_add(w->dest[chroma],
                          w->frame->linesize[!!chroma],
                          w->block[0]);
 
@@ -751,15 +747,20 @@ av_cold int ff_intrax8_common_init(AVCodecContext *avctx,
     if (!w->prediction_table)
         return AVERROR(ENOMEM);
 
-    ff_init_scantable(w->idsp.idct_permutation, &w->scantable[0],
+    ff_wmv2dsp_init(&w->wdsp);
+
+    ff_init_scantable_permutation(w->idct_permutation,
+                                  w->wdsp.idct_perm);
+
+    ff_init_scantable(w->idct_permutation, &w->scantable[0],
                       ff_wmv1_scantable[0]);
-    ff_init_scantable(w->idsp.idct_permutation, &w->scantable[1],
+    ff_init_scantable(w->idct_permutation, &w->scantable[1],
                       ff_wmv1_scantable[2]);
-    ff_init_scantable(w->idsp.idct_permutation, &w->scantable[2],
+    ff_init_scantable(w->idct_permutation, &w->scantable[2],
                       ff_wmv1_scantable[3]);
 
     ff_intrax8dsp_init(&w->dsp);
-    ff_blockdsp_init(&w->bdsp);
+    ff_blockdsp_init(&w->bdsp, avctx);
 
     return 0;
 }
diff --git a/libavcodec/intrax8.h b/libavcodec/intrax8.h
index ad172b1..5b8946e 100644
--- a/libavcodec/intrax8.h
+++ b/libavcodec/intrax8.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 #include "get_bits.h"
 #include "idctdsp.h"
 #include "intrax8dsp.h"
+#include "wmv2dsp.h"
 #include "mpegpicture.h"
 
 typedef struct IntraX8Context {
@@ -35,6 +36,8 @@ typedef struct IntraX8Context {
     // set by ff_intrax8_common_init
     uint8_t *prediction_table; // 2 * (mb_w * 2)
     ScanTable scantable[3];
+    WMV2DSPContext wdsp;
+    uint8_t idct_permutation[64];
     AVCodecContext *avctx;
     int *block_last_index;  ///< last nonzero coefficient in block
     int16_t (*block)[64];
@@ -96,6 +99,7 @@ void ff_intrax8_common_end(IntraX8Context *w);
 
 /**
  * Decode single IntraX8 frame.
+ * lowres decoding is theoretically impossible.
  * @param w pointer to IntraX8Context
  * @param pict the output Picture containing an AVFrame
  * @param gb open bitstream reader
diff --git a/libavcodec/intrax8dsp.c b/libavcodec/intrax8dsp.c
index bb74a68..80c3929 100644
--- a/libavcodec/intrax8dsp.c
+++ b/libavcodec/intrax8dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intrax8dsp.h b/libavcodec/intrax8dsp.h
index 27e71e6..bf42698 100644
--- a/libavcodec/intrax8dsp.h
+++ b/libavcodec/intrax8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intrax8huf.h b/libavcodec/intrax8huf.h
index 684fdb7..558d0e5 100644
--- a/libavcodec/intrax8huf.h
+++ b/libavcodec/intrax8huf.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c
index ca0dd2c..1b57e53 100644
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
  * H.263 decoder.
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
 #include <limits.h>
 
 #include "libavutil/attributes.h"
@@ -40,7 +41,7 @@
 #include "internal.h"
 #include "mathops.h"
 #include "mpegutils.h"
-#include "unary_legacy.h"
+#include "unary.h"
 #include "flv.h"
 #include "rv10.h"
 #include "mpeg4video.h"
@@ -106,11 +107,9 @@ static VLC cbpc_b_vlc;
 /* XXX: find a better solution to handle static init */
 av_cold void ff_h263_decode_init_vlc(void)
 {
-    static int done = 0;
+    static volatile int done = 0;
 
     if (!done) {
-        done = 1;
-
         INIT_VLC_STATIC(&ff_h263_intra_MCBPC_vlc, INTRA_MCBPC_VLC_BITS, 9,
                  ff_h263_intra_MCBPC_bits, 1, 1,
                  ff_h263_intra_MCBPC_code, 1, 1, 72);
@@ -133,6 +132,7 @@ av_cold void ff_h263_decode_init_vlc(void)
         INIT_VLC_STATIC(&cbpc_b_vlc, CBPC_B_VLC_BITS, 4,
                  &ff_cbpc_b_tab[0][1], 2, 1,
                  &ff_cbpc_b_tab[0][0], 2, 1, 8);
+        done = 1;
     }
 }
 
@@ -167,7 +167,8 @@ static int h263_decode_gob_header(MpegEncContext *s)
         /* We have a GBSC probably with GSTUFF */
     skip_bits(&s->gb, 16); /* Drop the zeros */
     left= get_bits_left(&s->gb);
-    //MN: we must check the bits left or we might end in a infinite loop (or segfault)
+    left = FFMIN(left, 32);
+    //MN: we must check the bits left or we might end in an infinite loop (or segfault)
     for(;left>13; left--){
         if(get_bits1(&s->gb)) break; /* Seek the '1' bit */
     }
@@ -175,17 +176,17 @@ static int h263_decode_gob_header(MpegEncContext *s)
         return -1;
 
     if(s->h263_slice_structured){
-        if(get_bits1(&s->gb)==0)
+        if(check_marker(s->avctx, &s->gb, "before MBA")==0)
             return -1;
 
         ff_h263_decode_mba(s);
 
         if(s->mb_num > 1583)
-            if(get_bits1(&s->gb)==0)
+            if(check_marker(s->avctx, &s->gb, "after MBA")==0)
                 return -1;
 
         s->qscale = get_bits(&s->gb, 5); /* SQUANT */
-        if(get_bits1(&s->gb)==0)
+        if(check_marker(s->avctx, &s->gb, "after SQUANT")==0)
             return -1;
         skip_bits(&s->gb, 2); /* GFID */
     }else{
@@ -206,33 +207,27 @@ static int h263_decode_gob_header(MpegEncContext *s)
 }
 
 /**
- * Find the next resync_marker.
- * @param p pointer to buffer to scan
- * @param end pointer to the end of the buffer
- * @return pointer to the next resync_marker, or end if none was found
- */
-const uint8_t *ff_h263_find_resync_marker(const uint8_t *restrict p, const uint8_t * restrict end)
-{
-    assert(p < end);
-
-    end-=2;
-    p++;
-    for(;p<end; p+=2){
-        if(!*p){
-            if     (!p[-1] && p[1]) return p - 1;
-            else if(!p[ 1] && p[2]) return p;
-        }
-    }
-    return end+2;
-}
-
-/**
- * Decode the group of blocks / video packet header.
+ * Decode the group of blocks / video packet header / slice header (MPEG-4 Studio).
  * @return bit position of the resync_marker, or <0 if none was found
  */
 int ff_h263_resync(MpegEncContext *s){
     int left, pos, ret;
 
+    /* In MPEG-4 studio mode look for a new slice startcode
+     * and decode slice header */
+    if(s->codec_id==AV_CODEC_ID_MPEG4 && s->studio_profile) {
+        align_get_bits(&s->gb);
+
+        while (get_bits_left(&s->gb) >= 32 && show_bits_long(&s->gb, 32) != SLICE_START_CODE) {
+            get_bits(&s->gb, 8);
+        }
+
+        if (show_bits_long(&s->gb, 32) == SLICE_START_CODE)
+            return get_bits_count(&s->gb);
+        else
+            return -1;
+    }
+
     if(s->codec_id==AV_CODEC_ID_MPEG4){
         skip_bits1(&s->gb);
         align_get_bits(&s->gb);
@@ -323,18 +318,22 @@ static int h263p_decode_umotion(MpegEncContext * s, int pred)
    {
       code <<= 1;
       code += get_bits1(&s->gb);
+      if (code >= 32768) {
+          avpriv_request_sample(s->avctx, "Huge DMV");
+          return 0xffff;
+      }
    }
    sign = code & 1;
    code >>= 1;
 
    code = (sign) ? (pred - code) : (pred + code);
-   ff_dlog(s->avctx,"H.263+ UMV Motion = %d\n", code);
+   ff_tlog(s->avctx,"H.263+ UMV Motion = %d\n", code);
    return code;
 
 }
 
 /**
- * read the next MVs for OBMC. yes this is a ugly hack, feel free to send a patch :)
+ * read the next MVs for OBMC. yes this is an ugly hack, feel free to send a patch :)
  */
 static void preview_obmc(MpegEncContext *s){
     GetBitContext gb= s->gb;
@@ -350,7 +349,7 @@ static void preview_obmc(MpegEncContext *s){
         s->block_index[i]+= 1;
     s->mb_x++;
 
-    assert(s->pict_type == AV_PICTURE_TYPE_P);
+    av_assert2(s->pict_type == AV_PICTURE_TYPE_P);
 
     do{
         if (get_bits1(&s->gb)) {
@@ -384,14 +383,14 @@ static void preview_obmc(MpegEncContext *s){
                 /* 16x16 motion prediction */
                 mot_val= ff_h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                 if (s->umvplus)
-                   mx = h263p_decode_umotion(s, pred_x);
+                    mx = h263p_decode_umotion(s, pred_x);
                 else
-                   mx = ff_h263_decode_motion(s, pred_x, 1);
+                    mx = ff_h263_decode_motion(s, pred_x, 1);
 
                 if (s->umvplus)
-                   my = h263p_decode_umotion(s, pred_y);
+                    my = h263p_decode_umotion(s, pred_y);
                 else
-                   my = ff_h263_decode_motion(s, pred_y, 1);
+                    my = ff_h263_decode_motion(s, pred_y, 1);
 
                 mot_val[0       ]= mot_val[2       ]=
                 mot_val[0+stride]= mot_val[2+stride]= mx;
@@ -402,16 +401,16 @@ static void preview_obmc(MpegEncContext *s){
             for(i=0;i<4;i++) {
                 mot_val = ff_h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                 if (s->umvplus)
-                  mx = h263p_decode_umotion(s, pred_x);
+                    mx = h263p_decode_umotion(s, pred_x);
                 else
-                  mx = ff_h263_decode_motion(s, pred_x, 1);
+                    mx = ff_h263_decode_motion(s, pred_x, 1);
 
                 if (s->umvplus)
-                  my = h263p_decode_umotion(s, pred_y);
+                    my = h263p_decode_umotion(s, pred_y);
                 else
-                  my = ff_h263_decode_motion(s, pred_y, 1);
+                    my = ff_h263_decode_motion(s, pred_y, 1);
                 if (s->umvplus && (mx - pred_x) == 1 && (my - pred_y) == 1)
-                  skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
+                    skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
                 mot_val[0] = mx;
                 mot_val[1] = my;
             }
@@ -444,7 +443,7 @@ static void h263_decode_dquant(MpegEncContext *s){
 static int h263_decode_block(MpegEncContext * s, int16_t * block,
                              int n, int coded)
 {
-    int code, level, i, j, last, run;
+    int level, i, j, run;
     RLTable *rl = &ff_h263_rl_inter;
     const uint8_t *scan_table;
     GetBitContext gb= s->gb;
@@ -485,7 +484,7 @@ static int h263_decode_block(MpegEncContext * s, int16_t * block,
             level = get_bits(&s->gb, 8);
             if((level&0x7F) == 0){
                 av_log(s->avctx, AV_LOG_ERROR, "illegal dc %d at %d %d\n", level, s->mb_x, s->mb_y);
-                if (s->avctx->err_recognition & AV_EF_BITSTREAM)
+                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))
                     return -1;
             }
             if (level == 255)
@@ -503,39 +502,67 @@ static int h263_decode_block(MpegEncContext * s, int16_t * block,
         return 0;
     }
 retry:
+    {
+    OPEN_READER(re, &s->gb);
+    i--; // offset by -1 to allow direct indexing of scan_table
     for(;;) {
-        code = get_vlc2(&s->gb, rl->vlc.table, TEX_VLC_BITS, 2);
-        if (code < 0){
-            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
-            return -1;
-        }
-        if (code == rl->n) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+        if (run == 66) {
+            if (level){
+                CLOSE_READER(re, &s->gb);
+                av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
             /* escape */
             if (CONFIG_FLV_DECODER && s->h263_flv > 1) {
-                ff_flv2_decode_ac_esc(&s->gb, &level, &run, &last);
+                int is11 = SHOW_UBITS(re, &s->gb, 1);
+                SKIP_CACHE(re, &s->gb, 1);
+                run = SHOW_UBITS(re, &s->gb, 7) + 1;
+                if (is11) {
+                    SKIP_COUNTER(re, &s->gb, 1 + 7);
+                    UPDATE_CACHE(re, &s->gb);
+                    level = SHOW_SBITS(re, &s->gb, 11);
+                    SKIP_COUNTER(re, &s->gb, 11);
+                } else {
+                    SKIP_CACHE(re, &s->gb, 7);
+                    level = SHOW_SBITS(re, &s->gb, 7);
+                    SKIP_COUNTER(re, &s->gb, 1 + 7 + 7);
+                }
             } else {
-                last = get_bits1(&s->gb);
-                run = get_bits(&s->gb, 6);
-                level = (int8_t)get_bits(&s->gb, 8);
+                run = SHOW_UBITS(re, &s->gb, 7) + 1;
+                SKIP_CACHE(re, &s->gb, 7);
+                level = (int8_t)SHOW_UBITS(re, &s->gb, 8);
+                SKIP_COUNTER(re, &s->gb, 7 + 8);
                 if(level == -128){
+                    UPDATE_CACHE(re, &s->gb);
                     if (s->codec_id == AV_CODEC_ID_RV10) {
                         /* XXX: should patch encoder too */
-                        level = get_sbits(&s->gb, 12);
+                        level = SHOW_SBITS(re, &s->gb, 12);
+                        SKIP_COUNTER(re, &s->gb, 12);
                     }else{
-                        level = get_bits(&s->gb, 5);
-                        level |= get_sbits(&s->gb, 6)<<5;
+                        level = SHOW_UBITS(re, &s->gb, 5);
+                        SKIP_CACHE(re, &s->gb, 5);
+                        level |= SHOW_SBITS(re, &s->gb, 6) * (1<<5);
+                        SKIP_COUNTER(re, &s->gb, 5 + 6);
                     }
                 }
             }
         } else {
-            run = rl->table_run[code];
-            level = rl->table_level[code];
-            last = code >= rl->last;
-            if (get_bits1(&s->gb))
+            if (SHOW_UBITS(re, &s->gb, 1))
                 level = -level;
+            SKIP_COUNTER(re, &s->gb, 1);
         }
         i += run;
         if (i >= 64){
+            CLOSE_READER(re, &s->gb);
+            // redo update without last flag, revert -1 offset
+            i = i - run + ((run-1)&63) + 1;
+            if (i < 64) {
+                // only last marker, no overrun
+                block[scan_table[i]] = level;
+                break;
+            }
             if(s->alt_inter_vlc && rl == &ff_h263_rl_inter && !s->mb_intra){
                 //Looks like a hack but no, it's the way it is supposed to work ...
                 rl = &ff_rl_intra_aic;
@@ -549,9 +576,7 @@ retry:
         }
         j = scan_table[i];
         block[j] = level;
-        if (last)
-            break;
-        i++;
+    }
     }
 not_coded:
     if (s->mb_intra && s->h263_aic) {
@@ -564,13 +589,15 @@ not_coded:
 
 static int h263_skip_b_part(MpegEncContext *s, int cbp)
 {
-    LOCAL_ALIGNED_16(int16_t, dblock, [64]);
+    LOCAL_ALIGNED_32(int16_t, dblock, [64]);
     int i, mbi;
+    int bli[6];
 
     /* we have to set s->mb_intra to zero to decode B-part of PB-frame correctly
      * but real value should be restored in order to be used later (in OBMC condition)
      */
     mbi = s->mb_intra;
+    memcpy(bli, s->block_last_index, sizeof(bli));
     s->mb_intra = 0;
     for (i = 0; i < 6; i++) {
         if (h263_decode_block(s, dblock, i, cbp&32) < 0)
@@ -578,6 +605,7 @@ static int h263_skip_b_part(MpegEncContext *s, int cbp)
         cbp+=cbp;
     }
     s->mb_intra = mbi;
+    memcpy(s->block_last_index, bli, sizeof(bli));
     return 0;
 }
 
@@ -599,6 +627,73 @@ static int h263_get_modb(GetBitContext *gb, int pb_frame, int *cbpb)
     return mv;
 }
 
+#define tab_size ((signed)FF_ARRAY_ELEMS(s->direct_scale_mv[0]))
+#define tab_bias (tab_size / 2)
+static inline void set_one_direct_mv(MpegEncContext *s, Picture *p, int i)
+{
+    int xy           = s->block_index[i];
+    uint16_t time_pp = s->pp_time;
+    uint16_t time_pb = s->pb_time;
+    int p_mx, p_my;
+
+    p_mx = p->motion_val[0][xy][0];
+    if ((unsigned)(p_mx + tab_bias) < tab_size) {
+        s->mv[0][i][0] = s->direct_scale_mv[0][p_mx + tab_bias];
+        s->mv[1][i][0] = s->direct_scale_mv[1][p_mx + tab_bias];
+    } else {
+        s->mv[0][i][0] = p_mx * time_pb / time_pp;
+        s->mv[1][i][0] = p_mx * (time_pb - time_pp) / time_pp;
+    }
+    p_my = p->motion_val[0][xy][1];
+    if ((unsigned)(p_my + tab_bias) < tab_size) {
+        s->mv[0][i][1] = s->direct_scale_mv[0][p_my + tab_bias];
+        s->mv[1][i][1] = s->direct_scale_mv[1][p_my + tab_bias];
+    } else {
+        s->mv[0][i][1] = p_my * time_pb / time_pp;
+        s->mv[1][i][1] = p_my * (time_pb - time_pp) / time_pp;
+    }
+}
+
+/**
+ * @return the mb_type
+ */
+static int set_direct_mv(MpegEncContext *s)
+{
+    const int mb_index = s->mb_x + s->mb_y * s->mb_stride;
+    Picture *p = &s->next_picture;
+    int colocated_mb_type = p->mb_type[mb_index];
+    int i;
+
+    if (s->codec_tag == AV_RL32("U263") && p->f->pict_type == AV_PICTURE_TYPE_I) {
+        p = &s->last_picture;
+        colocated_mb_type = p->mb_type[mb_index];
+    }
+
+    if (IS_8X8(colocated_mb_type)) {
+        s->mv_type = MV_TYPE_8X8;
+        for (i = 0; i < 4; i++)
+            set_one_direct_mv(s, p, i);
+        return MB_TYPE_DIRECT2 | MB_TYPE_8x8 | MB_TYPE_L0L1;
+    } else {
+        set_one_direct_mv(s, p, 0);
+        s->mv[0][1][0] =
+        s->mv[0][2][0] =
+        s->mv[0][3][0] = s->mv[0][0][0];
+        s->mv[0][1][1] =
+        s->mv[0][2][1] =
+        s->mv[0][3][1] = s->mv[0][0][1];
+        s->mv[1][1][0] =
+        s->mv[1][2][0] =
+        s->mv[1][3][0] = s->mv[1][0][0];
+        s->mv[1][1][1] =
+        s->mv[1][2][1] =
+        s->mv[1][3][1] = s->mv[1][0][1];
+        s->mv_type = MV_TYPE_8X8;
+        // Note see prev line
+        return MB_TYPE_DIRECT2 | MB_TYPE_16x16 | MB_TYPE_L0L1;
+    }
+}
+
 int ff_h263_decode_mb(MpegEncContext *s,
                       int16_t block[6][64])
 {
@@ -607,7 +702,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
     const int xy= s->mb_x + s->mb_y * s->mb_stride;
     int cbpb = 0, pb_mv_count = 0;
 
-    assert(!s->h263_pred);
+    av_assert2(!s->h263_pred);
 
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         do{
@@ -627,7 +722,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
             if (cbpc < 0){
                 av_log(s->avctx, AV_LOG_ERROR, "cbpc damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
+                return SLICE_ERROR;
             }
         }while(cbpc == 20);
 
@@ -641,6 +736,11 @@ int ff_h263_decode_mb(MpegEncContext *s,
             pb_mv_count = h263_get_modb(&s->gb, s->pb_frame, &cbpb);
         cbpy = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1);
 
+        if (cbpy < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
+            return SLICE_ERROR;
+        }
+
         if(s->alt_inter_vlc==0 || (cbpc & 3)!=3)
             cbpy ^= 0xF;
 
@@ -661,7 +761,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
                mx = ff_h263_decode_motion(s, pred_x, 1);
 
             if (mx >= 0xffff)
-                return -1;
+                return SLICE_ERROR;
 
             if (s->umvplus)
                my = h263p_decode_umotion(s, pred_y);
@@ -669,7 +769,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
                my = ff_h263_decode_motion(s, pred_y, 1);
 
             if (my >= 0xffff)
-                return -1;
+                return SLICE_ERROR;
             s->mv[0][0][0] = mx;
             s->mv[0][0][1] = my;
 
@@ -681,18 +781,18 @@ int ff_h263_decode_mb(MpegEncContext *s,
             for(i=0;i<4;i++) {
                 mot_val = ff_h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                 if (s->umvplus)
-                  mx = h263p_decode_umotion(s, pred_x);
+                    mx = h263p_decode_umotion(s, pred_x);
                 else
-                  mx = ff_h263_decode_motion(s, pred_x, 1);
+                    mx = ff_h263_decode_motion(s, pred_x, 1);
                 if (mx >= 0xffff)
-                    return -1;
+                    return SLICE_ERROR;
 
                 if (s->umvplus)
-                  my = h263p_decode_umotion(s, pred_y);
+                    my = h263p_decode_umotion(s, pred_y);
                 else
-                  my = ff_h263_decode_motion(s, pred_y, 1);
+                    my = ff_h263_decode_motion(s, pred_y, 1);
                 if (my >= 0xffff)
-                    return -1;
+                    return SLICE_ERROR;
                 s->mv[0][i][0] = mx;
                 s->mv[0][i][1] = my;
                 if (s->umvplus && (mx - pred_x) == 1 && (my - pred_y) == 1)
@@ -718,7 +818,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             mb_type= get_vlc2(&s->gb, h263_mbtype_b_vlc.table, H263_MBTYPE_B_VLC_BITS, 2);
             if (mb_type < 0){
                 av_log(s->avctx, AV_LOG_ERROR, "b mb_type damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
+                return SLICE_ERROR;
             }
 
             mb_type= h263_mb_type_b_map[ mb_type ];
@@ -737,7 +837,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
 
             if (cbpy < 0){
                 av_log(s->avctx, AV_LOG_ERROR, "b cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
+                return SLICE_ERROR;
             }
 
             if(s->alt_inter_vlc==0 || (cbpc & 3)!=3)
@@ -747,28 +847,40 @@ int ff_h263_decode_mb(MpegEncContext *s,
         }else
             cbp=0;
 
-        assert(!s->mb_intra);
+        av_assert2(!s->mb_intra);
 
         if(IS_QUANT(mb_type)){
             h263_decode_dquant(s);
         }
 
         if(IS_DIRECT(mb_type)){
-            if (!s->pp_time)
-                return AVERROR_INVALIDDATA;
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
-            mb_type |= ff_mpeg4_set_direct_mv(s, 0, 0);
+            mb_type |= set_direct_mv(s);
         }else{
             s->mv_dir = 0;
             s->mv_type= MV_TYPE_16X16;
 //FIXME UMV
 
             if(USES_LIST(mb_type, 0)){
-                int16_t *mot_val= ff_h263_pred_motion(s, 0, 0, &mx, &my);
+                int16_t *mot_val= ff_h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                 s->mv_dir = MV_DIR_FORWARD;
 
-                mx = ff_h263_decode_motion(s, mx, 1);
-                my = ff_h263_decode_motion(s, my, 1);
+                if (s->umvplus)
+                    mx = h263p_decode_umotion(s, pred_x);
+                else
+                    mx = ff_h263_decode_motion(s, pred_x, 1);
+                if (mx >= 0xffff)
+                    return SLICE_ERROR;
+
+                if (s->umvplus)
+                    my = h263p_decode_umotion(s, pred_y);
+                else
+                    my = ff_h263_decode_motion(s, pred_y, 1);
+                if (my >= 0xffff)
+                    return SLICE_ERROR;
+
+                if (s->umvplus && (mx - pred_x) == 1 && (my - pred_y) == 1)
+                    skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
 
                 s->mv[0][0][0] = mx;
                 s->mv[0][0][1] = my;
@@ -777,11 +889,25 @@ int ff_h263_decode_mb(MpegEncContext *s,
             }
 
             if(USES_LIST(mb_type, 1)){
-                int16_t *mot_val= ff_h263_pred_motion(s, 0, 1, &mx, &my);
+                int16_t *mot_val= ff_h263_pred_motion(s, 0, 1, &pred_x, &pred_y);
                 s->mv_dir |= MV_DIR_BACKWARD;
 
-                mx = ff_h263_decode_motion(s, mx, 1);
-                my = ff_h263_decode_motion(s, my, 1);
+                if (s->umvplus)
+                    mx = h263p_decode_umotion(s, pred_x);
+                else
+                    mx = ff_h263_decode_motion(s, pred_x, 1);
+                if (mx >= 0xffff)
+                    return SLICE_ERROR;
+
+                if (s->umvplus)
+                    my = h263p_decode_umotion(s, pred_y);
+                else
+                    my = ff_h263_decode_motion(s, pred_y, 1);
+                if (my >= 0xffff)
+                    return SLICE_ERROR;
+
+                if (s->umvplus && (mx - pred_x) == 1 && (my - pred_y) == 1)
+                    skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
 
                 s->mv[1][0][0] = mx;
                 s->mv[1][0][1] = my;
@@ -796,7 +922,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             cbpc = get_vlc2(&s->gb, ff_h263_intra_MCBPC_vlc.table, INTRA_MCBPC_VLC_BITS, 2);
             if (cbpc < 0){
                 av_log(s->avctx, AV_LOG_ERROR, "I cbpc damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
+                return SLICE_ERROR;
             }
         }while(cbpc == 8);
 
@@ -821,7 +947,7 @@ intra:
         cbpy = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1);
         if(cbpy<0){
             av_log(s->avctx, AV_LOG_ERROR, "I cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
-            return -1;
+            return SLICE_ERROR;
         }
         cbp = (cbpc & 3) | (cbpy << 2);
         if (dquant) {
@@ -851,6 +977,9 @@ intra:
     }
 end:
 
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
         /* per-MB end of slice check */
     {
         int v= show_bits(&s->gb, 16);
@@ -874,6 +1003,10 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
 
     align_get_bits(&s->gb);
 
+    if (show_bits(&s->gb, 2) == 2 && s->avctx->frame_number == 0) {
+         av_log(s->avctx, AV_LOG_WARNING, "Header looks like RTP instead of H.263\n");
+    }
+
     startcode= get_bits(&s->gb, 22-8);
 
     for(i= get_bits_left(&s->gb); i>24; i-=8) {
@@ -889,14 +1022,13 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
     /* temporal reference */
     i = get_bits(&s->gb, 8); /* picture timestamp */
-    if( (s->picture_number&~0xFF)+i < s->picture_number)
-        i+= 256;
+
+    i -= (i - (s->picture_number & 0xFF) + 128) & ~0xFF;
+
     s->picture_number= (s->picture_number&~0xFF) + i;
 
     /* PTYPE starts here */
-    if (get_bits1(&s->gb) != 1) {
-        /* marker */
-        av_log(s->avctx, AV_LOG_ERROR, "Bad marker\n");
+    if (check_marker(s->avctx, &s->gb, "in PTYPE") != 1) {
         return -1;
     }
     if (get_bits1(&s->gb) != 0) {
@@ -920,6 +1052,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
         /* H.263v1 */
         width = ff_h263_format[format][0];
         height = ff_h263_format[format][1];
+        if (!width)
+            return -1;
 
         s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb);
 
@@ -961,6 +1095,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
             s->h263_aic = get_bits1(&s->gb); /* Advanced Intra Coding (AIC) */
             s->loop_filter= get_bits1(&s->gb);
             s->unrestricted_mv = s->umvplus || s->obmc || s->loop_filter;
+            if(s->avctx->lowres)
+                s->loop_filter = 0;
 
             s->h263_slice_structured= get_bits1(&s->gb);
             if (get_bits1(&s->gb) != 0) {
@@ -1013,7 +1149,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                 6-14 - reserved
                 */
                 width = (get_bits(&s->gb, 9) + 1) * 4;
-                skip_bits1(&s->gb);
+                check_marker(s->avctx, &s->gb, "in dimensions");
                 height = get_bits(&s->gb, 9) * 4;
                 ff_dlog(s->avctx, "\nH.263+ Custom picture: %dx%d\n",width,height);
                 if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
@@ -1028,6 +1164,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                 height = ff_h263_format[format][1];
                 s->avctx->sample_aspect_ratio= (AVRational){12,11};
             }
+            s->avctx->sample_aspect_ratio.den <<= s->ehc_mode;
             if ((width == 0) || (height == 0))
                 return -1;
             s->width = width;
@@ -1067,6 +1204,12 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                     av_log(s->avctx, AV_LOG_ERROR, "unordered slices not supported\n");
                 }
             }
+            if (s->pict_type == AV_PICTURE_TYPE_B) {
+                skip_bits(&s->gb, 4); //ELNUM
+                if (ufep == 1) {
+                    skip_bits(&s->gb, 4); // RLNUM
+                }
+            }
         }
 
         s->qscale = get_bits(&s->gb, 5);
@@ -1103,25 +1246,25 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0) {
-        skip_bits(&s->gb, 8);
-    }
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     if(s->h263_slice_structured){
-        if (get_bits1(&s->gb) != 1) {
-            av_log(s->avctx, AV_LOG_ERROR, "SEPB1 marker missing\n");
+        if (check_marker(s->avctx, &s->gb, "SEPB1") != 1) {
             return -1;
         }
 
         ff_h263_decode_mba(s);
 
-        if (get_bits1(&s->gb) != 1) {
-            av_log(s->avctx, AV_LOG_ERROR, "SEPB2 marker missing\n");
+        if (check_marker(s->avctx, &s->gb, "SEPB2") != 1) {
             return -1;
         }
     }
     s->f_code = 1;
 
+    if (s->pict_type == AV_PICTURE_TYPE_B)
+        s->low_delay = 0;
+
     if(s->h263_aic){
          s->y_dc_scale_table=
          s->c_dc_scale_table= ff_aic_dc_scale_table;
@@ -1131,7 +1274,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
 
         ff_h263_show_pict_info(s);
-    if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO")){
+    if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO") && get_bits_left(&s->gb) >= 85 + 13*3*16 + 50){
         int i,j;
         for(i=0; i<85; i++) av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb));
         av_log(s->avctx, AV_LOG_DEBUG, "\n");
diff --git a/libavcodec/ituh263enc.c b/libavcodec/ituh263enc.c
index 313f5f3..ee09f29 100644
--- a/libavcodec/ituh263enc.c
+++ b/libavcodec/ituh263enc.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,7 +44,7 @@
 /**
  * Table of number of bits a motion vector component needs.
  */
-static uint8_t mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
+static uint8_t mv_penalty[MAX_FCODE+1][MAX_DMV*2+1];
 
 /**
  * Minimal fcode that a motion vector component would need.
@@ -89,7 +89,7 @@ static const uint8_t wrong_run[102] = {
 av_const int ff_h263_aspect_to_info(AVRational aspect){
     int i;
 
-    if(aspect.num==0) aspect= (AVRational){1,1};
+    if(aspect.num==0 || aspect.den==0) aspect= (AVRational){1,1};
 
     for(i=1; i<6; i++){
         if(av_cmp_q(ff_h263_pixel_aspect[i], aspect) == 0){
@@ -227,19 +227,11 @@ void ff_h263_encode_picture_header(MpegEncContext * s, int picture_number)
     if(s->h263_slice_structured){
         put_bits(&s->pb, 1, 1);
 
-        assert(s->mb_x == 0 && s->mb_y == 0);
+        av_assert1(s->mb_x == 0 && s->mb_y == 0);
         ff_h263_encode_mba(s);
 
         put_bits(&s->pb, 1, 1);
     }
-
-    if(s->h263_aic){
-         s->y_dc_scale_table=
-         s->c_dc_scale_table= ff_aic_dc_scale_table;
-    }else{
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
-    }
 }
 
 /**
@@ -394,7 +386,7 @@ static void h263_encode_block(MpegEncContext * s, int16_t * block, int n)
                 put_bits(&s->pb, 1, last);
                 put_bits(&s->pb, 6, run);
 
-                assert(slevel != 0);
+                av_assert2(slevel != 0);
 
                 if(level < 128)
                     put_sbits(&s->pb, 8, slevel);
@@ -415,7 +407,7 @@ static void h263_encode_block(MpegEncContext * s, int16_t * block, int n)
 }
 
 /* Encode MV differences on H.263+ with Unrestricted MV mode */
-static void h263p_encode_umotion(MpegEncContext * s, int val)
+static void h263p_encode_umotion(PutBitContext *pb, int val)
 {
     short sval = 0;
     short i = 0;
@@ -425,11 +417,11 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
     int tcode;
 
     if ( val == 0)
-        put_bits(&s->pb, 1, 1);
+        put_bits(pb, 1, 1);
     else if (val == 1)
-        put_bits(&s->pb, 3, 0);
+        put_bits(pb, 3, 0);
     else if (val == -1)
-        put_bits(&s->pb, 3, 2);
+        put_bits(pb, 3, 2);
     else {
 
         sval = ((val < 0) ? (short)(-val):(short)val);
@@ -448,7 +440,7 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
             i--;
         }
         code = ((code << 1) | (val < 0)) << 1;
-        put_bits(&s->pb, (2*n_bits)+1, code);
+        put_bits(pb, (2*n_bits)+1, code);
     }
 }
 
@@ -505,8 +497,8 @@ void ff_h263_encode_mb(MpegEncContext * s,
                                                 motion_y - pred_y, 1);
             }
             else {
-                h263p_encode_umotion(s, motion_x - pred_x);
-                h263p_encode_umotion(s, motion_y - pred_y);
+                h263p_encode_umotion(&s->pb, motion_x - pred_x);
+                h263p_encode_umotion(&s->pb, motion_y - pred_y);
                 if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
                     /* To prevent Start Code emulation */
                     put_bits(&s->pb,1,1);
@@ -534,8 +526,8 @@ void ff_h263_encode_mb(MpegEncContext * s,
                                                     motion_y - pred_y, 1);
                 }
                 else {
-                    h263p_encode_umotion(s, motion_x - pred_x);
-                    h263p_encode_umotion(s, motion_y - pred_y);
+                    h263p_encode_umotion(&s->pb, motion_x - pred_x);
+                    h263p_encode_umotion(&s->pb, motion_y - pred_y);
                     if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
                         /* To prevent Start Code emulation */
                         put_bits(&s->pb,1,1);
@@ -547,7 +539,7 @@ void ff_h263_encode_mb(MpegEncContext * s,
             s->mv_bits+= get_bits_diff(s);
         }
     } else {
-        assert(s->mb_intra);
+        av_assert2(s->mb_intra);
 
         cbp = 0;
         if (s->h263_aic) {
@@ -651,14 +643,14 @@ void ff_h263_encode_mb(MpegEncContext * s,
     }
 }
 
-void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
+void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code)
 {
     int range, bit_size, sign, code, bits;
 
     if (val == 0) {
         /* zero vector */
         code = 0;
-        put_bits(&s->pb, ff_mvtab[code][1], ff_mvtab[code][0]);
+        put_bits(pb, ff_mvtab[code][1], ff_mvtab[code][0]);
     } else {
         bit_size = f_code - 1;
         range = 1 << bit_size;
@@ -672,9 +664,9 @@ void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
         code = (val >> bit_size) + 1;
         bits = val & (range - 1);
 
-        put_bits(&s->pb, ff_mvtab[code][1] + 1, (ff_mvtab[code][0] << 1) | sign);
+        put_bits(pb, ff_mvtab[code][1] + 1, (ff_mvtab[code][0] << 1) | sign);
         if (bit_size > 0) {
-            put_bits(&s->pb, bit_size, bits);
+            put_bits(pb, bit_size, bits);
         }
     }
 }
@@ -685,7 +677,7 @@ static av_cold void init_mv_penalty_and_fcode(MpegEncContext *s)
     int mv;
 
     for(f_code=1; f_code<=MAX_FCODE; f_code++){
-        for(mv=-MAX_MV; mv<=MAX_MV; mv++){
+        for(mv=-MAX_DMV; mv<=MAX_DMV; mv++){
             int len;
 
             if(mv==0) len= ff_mvtab[0][1];
@@ -706,7 +698,7 @@ static av_cold void init_mv_penalty_and_fcode(MpegEncContext *s)
                 }
             }
 
-            mv_penalty[f_code][mv+MAX_MV]= len;
+            mv_penalty[f_code][mv+MAX_DMV]= len;
         }
     }
 
@@ -726,8 +718,8 @@ static av_cold void init_uni_h263_rl_tab(RLTable *rl, uint32_t *bits_tab,
 {
     int slevel, run, last;
 
-    assert(MAX_LEVEL >= 64);
-    assert(MAX_RUN   >= 63);
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN   >= 63);
 
     for(slevel=-64; slevel<64; slevel++){
         if(slevel==0) continue;
@@ -816,12 +808,15 @@ av_cold void ff_h263_encode_init(MpegEncContext *s)
             s->min_qcoeff= -127;
             s->max_qcoeff=  127;
         }
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
         break;
     default: //nothing needed - default table already set in mpegvideo.c
         s->min_qcoeff= -127;
         s->max_qcoeff=  127;
+    }
+    if(s->h263_aic){
+         s->y_dc_scale_table=
+         s->c_dc_scale_table= ff_aic_dc_scale_table;
+    }else{
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
     }
diff --git a/libavcodec/ivi.c b/libavcodec/ivi.c
index 2ff4824..71bf0e6 100644
--- a/libavcodec/ivi.c
+++ b/libavcodec/ivi.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,12 @@
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/timer.h"
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
 #include "ivi.h"
@@ -88,12 +89,9 @@ static int ivi_mc(IVIBandDesc *band, ivi_mc_func mc, ivi_mc_avg_func mc_avg,
     int ref_size = (mc_type > 1) * band->pitch + (mc_type & 1);
 
     if (mc_type != -1) {
-        if (offs < 0 || ref_offs < 0 || !band->ref_buf)
-            return AVERROR_INVALIDDATA;
-        if (buf_size - min_size < offs)
-            return AVERROR_INVALIDDATA;
-        if (buf_size - min_size - ref_size < ref_offs)
-            return AVERROR_INVALIDDATA;
+        av_assert0(offs >= 0 && ref_offs >= 0 && band->ref_buf);
+        av_assert0(buf_size - min_size >= offs);
+        av_assert0(buf_size - min_size - ref_size >= ref_offs);
     }
 
     if (mc_type2 == -1) {
@@ -137,7 +135,7 @@ static uint16_t inv_bits(uint16_t val, int nbits)
 
 /*
  *  Generate a huffman codebook from the given descriptor
- *  and convert it into the Libav VLC table.
+ *  and convert it into the FFmpeg VLC table.
  *
  *  @param[in]   cb    pointer to codebook descriptor
  *  @param[out]  vlc   where to place the generated VLC table
@@ -225,7 +223,7 @@ static int ivi_huff_desc_cmp(const IVIHuffDesc *desc1,
            memcmp(desc1->xbits, desc2->xbits, desc1->num_rows);
 }
 
-int ff_ivi_dec_huff_desc(BitstreamContext *bc, int desc_coded, int which_tab,
+int ff_ivi_dec_huff_desc(GetBitContext *gb, int desc_coded, int which_tab,
                          IVIHuffTab *huff_tab, AVCodecContext *avctx)
 {
     int i, result;
@@ -238,20 +236,20 @@ int ff_ivi_dec_huff_desc(BitstreamContext *bc, int desc_coded, int which_tab,
         return 0;
     }
 
-    huff_tab->tab_sel = bitstream_read(bc, 3);
+    huff_tab->tab_sel = get_bits(gb, 3);
     if (huff_tab->tab_sel == 7) {
         /* custom huffman table (explicitly encoded) */
-        new_huff.num_rows = bitstream_read(bc, 4);
+        new_huff.num_rows = get_bits(gb, 4);
         if (!new_huff.num_rows) {
             av_log(avctx, AV_LOG_ERROR, "Empty custom Huffman table!\n");
             return AVERROR_INVALIDDATA;
         }
 
         for (i = 0; i < new_huff.num_rows; i++)
-            new_huff.xbits[i] = bitstream_read(bc, 4);
+            new_huff.xbits[i] = get_bits(gb, 4);
 
         /* Have we got the same custom table? Rebuild if not. */
-        if (ivi_huff_desc_cmp(&new_huff, &huff_tab->cust_desc)) {
+        if (ivi_huff_desc_cmp(&new_huff, &huff_tab->cust_desc) || !huff_tab->cust_tab.table) {
             ivi_huff_desc_copy(&huff_tab->cust_desc, &new_huff);
 
             if (huff_tab->cust_tab.table)
@@ -286,6 +284,7 @@ static av_cold void ivi_free_buffers(IVIPlaneDesc *planes)
     int p, b, t;
 
     for (p = 0; p < 3; p++) {
+        if (planes[p].bands)
         for (b = 0; b < planes[p].num_bands; b++) {
             av_freep(&planes[p].bands[b].bufs[0]);
             av_freep(&planes[p].bands[b].bufs[1]);
@@ -303,7 +302,7 @@ static av_cold void ivi_free_buffers(IVIPlaneDesc *planes)
     }
 }
 
-av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
+av_cold int ff_ivi_init_planes(AVCodecContext *avctx, IVIPlaneDesc *planes, const IVIPicConfig *cfg,
                                int is_indeo4)
 {
     int p, b;
@@ -313,7 +312,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
 
     ivi_free_buffers(planes);
 
-    if (cfg->pic_width < 1 || cfg->pic_height < 1 ||
+    if (av_image_check_size2(cfg->pic_width, cfg->pic_height, avctx->max_pixels, AV_PIX_FMT_YUV410P, 0, avctx) < 0 ||
         cfg->luma_bands < 1 || cfg->chroma_bands < 1)
         return AVERROR_INVALIDDATA;
 
@@ -328,7 +327,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
     planes[1].num_bands = planes[2].num_bands = cfg->chroma_bands;
 
     for (p = 0; p < 3; p++) {
-        planes[p].bands = av_mallocz(planes[p].num_bands * sizeof(IVIBandDesc));
+        planes[p].bands = av_mallocz_array(planes[p].num_bands, sizeof(IVIBandDesc));
         if (!planes[p].bands)
             return AVERROR(ENOMEM);
 
@@ -357,6 +356,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
             band->aheight  = height_aligned;
             band->bufs[0]  = av_mallocz(buf_size);
             band->bufs[1]  = av_mallocz(buf_size);
+            band->bufsize  = buf_size/2;
             if (!band->bufs[0] || !band->bufs[1])
                 return AVERROR(ENOMEM);
 
@@ -398,14 +398,16 @@ static int ivi_init_tiles(IVIBandDesc *band, IVITile *ref_tile,
                                               band->mb_size);
 
             av_freep(&tile->mbs);
-            tile->mbs = av_malloc(tile->num_MBs * sizeof(IVIMbInfo));
+            tile->mbs = av_mallocz_array(tile->num_MBs, sizeof(IVIMbInfo));
             if (!tile->mbs)
                 return AVERROR(ENOMEM);
 
             tile->ref_mbs = 0;
             if (p || b) {
-                if (tile->num_MBs != ref_tile->num_MBs)
+                if (tile->num_MBs != ref_tile->num_MBs) {
+                    av_log(NULL, AV_LOG_DEBUG, "ref_tile mismatch\n");
                     return AVERROR_INVALIDDATA;
+                }
                 tile->ref_mbs = ref_tile->mbs;
                 ref_tile++;
             }
@@ -430,15 +432,25 @@ av_cold int ff_ivi_init_tiles(IVIPlaneDesc *planes,
             t_width  >>= 1;
             t_height >>= 1;
         }
+        if(t_width<=0 || t_height<=0)
+            return AVERROR(EINVAL);
 
         for (b = 0; b < planes[p].num_bands; b++) {
             band = &planes[p].bands[b];
+
+            if (band->tiles) {
+                int t;
+                for (t = 0; t < band->num_tiles; t++) {
+                    av_freep(&band->tiles[t].mbs);
+                }
+            }
+
             x_tiles = IVI_NUM_TILES(band->width, t_width);
             y_tiles = IVI_NUM_TILES(band->height, t_height);
             band->num_tiles = x_tiles * y_tiles;
 
             av_freep(&band->tiles);
-            band->tiles = av_mallocz(band->num_tiles * sizeof(IVITile));
+            band->tiles = av_mallocz_array(band->num_tiles, sizeof(IVITile));
             if (!band->tiles)
                 return AVERROR(ENOMEM);
 
@@ -461,22 +473,22 @@ av_cold int ff_ivi_init_tiles(IVIPlaneDesc *planes,
  *  if (tile_data_size >= 255) than this field four is byte long: 0xFF X1 X2 X3
  *  where X1-X3 is size of the tile data
  *
- *  @param[in,out]  bc  the Bitstream context
+ *  @param[in,out]  gb  the GetBit context
  *  @return     size of the tile data in bytes
  */
-static int ivi_dec_tile_data_size(BitstreamContext *bc)
+static int ivi_dec_tile_data_size(GetBitContext *gb)
 {
     int    len;
 
     len = 0;
-    if (bitstream_read_bit(bc)) {
-        len = bitstream_read(bc, 8);
+    if (get_bits1(gb)) {
+        len = get_bits(gb, 8);
         if (len == 255)
-            len = bitstream_read(bc, 24);
+            len = get_bits_long(gb, 24);
     }
 
     /* align the bitstream reader on the byte boundary */
-    bitstream_align(bc);
+    align_get_bits(gb);
 
     return len;
 }
@@ -487,10 +499,6 @@ static int ivi_dc_transform(IVIBandDesc *band, int *prev_dc, int buf_offs,
     int buf_size = band->pitch * band->aheight - buf_offs;
     int min_size = (blk_size - 1) * band->pitch + blk_size;
 
-    if (!band->dc_transform)
-        return 0;
-
-
     if (min_size > buf_size)
         return AVERROR_INVALIDDATA;
 
@@ -500,7 +508,7 @@ static int ivi_dc_transform(IVIBandDesc *band, int *prev_dc, int buf_offs,
     return 0;
 }
 
-static int ivi_decode_coded_blocks(BitstreamContext *bc, IVIBandDesc *band,
+static int ivi_decode_coded_blocks(GetBitContext *gb, IVIBandDesc *band,
                                    ivi_mc_func mc, ivi_mc_avg_func mc_avg,
                                    int mv_x, int mv_y,
                                    int mv_x2, int mv_y2,
@@ -536,15 +544,16 @@ static int ivi_decode_coded_blocks(BitstreamContext *bc, IVIBandDesc *band,
     /* zero column flags */
     memset(col_flags, 0, sizeof(col_flags));
     while (scan_pos <= num_coeffs) {
-        sym = bitstream_read_vlc(bc, band->blk_vlc.tab->table, IVI_VLC_BITS, 1);
+        sym = get_vlc2(gb, band->blk_vlc.tab->table,
+                       IVI_VLC_BITS, 1);
         if (sym == rvmap->eob_sym)
             break; /* End of block */
 
         /* Escape - run/val explicitly coded using 3 vlc codes */
         if (sym == rvmap->esc_sym) {
-            run = bitstream_read_vlc(bc, band->blk_vlc.tab->table, IVI_VLC_BITS, 1) + 1;
-            lo  = bitstream_read_vlc(bc, band->blk_vlc.tab->table, IVI_VLC_BITS, 1);
-            hi  = bitstream_read_vlc(bc, band->blk_vlc.tab->table, IVI_VLC_BITS, 1);
+            run = get_vlc2(gb, band->blk_vlc.tab->table, IVI_VLC_BITS, 1) + 1;
+            lo  = get_vlc2(gb, band->blk_vlc.tab->table, IVI_VLC_BITS, 1);
+            hi  = get_vlc2(gb, band->blk_vlc.tab->table, IVI_VLC_BITS, 1);
             /* merge them and convert into signed val */
             val = IVI_TOSIGNED((hi << 6) | lo);
         } else {
@@ -583,6 +592,11 @@ static int ivi_decode_coded_blocks(BitstreamContext *bc, IVIBandDesc *band,
         col_flags[0] |= !!*prev_dc;
     }
 
+    if(band->transform_size > band->blk_size){
+        av_log(NULL, AV_LOG_ERROR, "Too large transform\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* apply inverse transform */
     band->inv_transform(trvec, band->buf + offs,
                         band->pitch, col_flags);
@@ -600,12 +614,12 @@ static int ivi_decode_coded_blocks(BitstreamContext *bc, IVIBandDesc *band,
  *  dequantize them, apply inverse transform and motion compensation
  *  in order to reconstruct the picture.
  *
- *  @param[in,out]  bc    the Bitstream context
+ *  @param[in,out]  gb    the GetBit context
  *  @param[in]      band  pointer to the band descriptor
  *  @param[in]      tile  pointer to the tile descriptor
  *  @return     result code: 0 - OK, -1 = error (corrupted blocks data)
  */
-static int ivi_decode_blocks(BitstreamContext *bc, IVIBandDesc *band,
+static int ivi_decode_blocks(GetBitContext *gb, IVIBandDesc *band,
                              IVITile *tile, AVCodecContext *avctx)
 {
     int mbn, blk, num_blocks, blk_size, ret, is_intra;
@@ -642,7 +656,7 @@ static int ivi_decode_blocks(BitstreamContext *bc, IVIBandDesc *band,
 
         quant = band->glob_quant + mb->q_delta;
         if (avctx->codec_id == AV_CODEC_ID_INDEO4)
-            quant = av_clip(quant, 0, 31);
+            quant = av_clip_uintp2(quant, 5);
         else
             quant = av_clip(quant, 0, 23);
 
@@ -709,7 +723,7 @@ static int ivi_decode_blocks(BitstreamContext *bc, IVIBandDesc *band,
             }
 
             if (cbp & 1) { /* block coded ? */
-                ret = ivi_decode_coded_blocks(bc, band, mc_with_delta_func,
+                ret = ivi_decode_coded_blocks(gb, band, mc_with_delta_func,
                                               mc_avg_with_delta_func,
                                               mv_x, mv_y, mv_x2, mv_y2,
                                               &prev_dc, is_intra,
@@ -738,7 +752,7 @@ static int ivi_decode_blocks(BitstreamContext *bc, IVIBandDesc *band,
         }// for blk
     }// for mbn
 
-    bitstream_align(bc);
+    align_get_bits(gb);
 
     return 0;
 }
@@ -805,6 +819,22 @@ static int ivi_process_empty_tile(AVCodecContext *avctx, IVIBandDesc *band,
                     mb->mv_y = ref_mb->mv_y;
                 }
                 need_mc |= mb->mv_x || mb->mv_y; /* tracking non-zero motion vectors */
+                {
+                    int dmv_x, dmv_y, cx, cy;
+
+                    dmv_x = mb->mv_x >> band->is_halfpel;
+                    dmv_y = mb->mv_y >> band->is_halfpel;
+                    cx    = mb->mv_x &  band->is_halfpel;
+                    cy    = mb->mv_y &  band->is_halfpel;
+
+                    if (   mb->xpos + dmv_x < 0
+                        || mb->xpos + dmv_x + band->mb_size + cx > band->pitch
+                        || mb->ypos + dmv_y < 0
+                        || mb->ypos + dmv_y + band->mb_size + cy > band->aheight) {
+                        av_log(avctx, AV_LOG_ERROR, "MV out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
             }
 
             mb++;
@@ -891,8 +921,16 @@ static void ivi_output_plane(IVIPlaneDesc *plane, uint8_t *dst, ptrdiff_t dst_pi
         return;
 
     for (y = 0; y < plane->height; y++) {
-        for (x = 0; x < plane->width; x++)
-            dst[x] = av_clip_uint8(src[x] + 128);
+        int m = 0;
+        int w = plane->width;
+        for (x = 0; x < w; x++) {
+            int t = src[x] + 128;
+            dst[x] = t;
+            m |= t;
+        }
+        if (m & ~255)
+            for (x = 0; x < w; x++)
+                dst[x] = av_clip_uint8(src[x] + 128);
         src += pitch;
         dst += dst_pitch;
     }
@@ -924,7 +962,7 @@ static int decode_band(IVI45DecContext *ctx,
         band->ref_buf   = band->bufs[ctx->ref_buf];
         band->b_ref_buf = 0;
     }
-    band->data_ptr  = ctx->frame_data + (bitstream_tell(&ctx->bc) >> 3);
+    band->data_ptr  = ctx->frame_data + (get_bits_count(&ctx->gb) >> 3);
 
     result = ctx->decode_band_hdr(ctx, band, avctx);
     if (result) {
@@ -946,9 +984,13 @@ static int decode_band(IVI45DecContext *ctx,
         idx2 = band->corr[i * 2 + 1];
         FFSWAP(uint8_t, band->rv_map->runtab[idx1], band->rv_map->runtab[idx2]);
         FFSWAP(int16_t, band->rv_map->valtab[idx1], band->rv_map->valtab[idx2]);
+        if (idx1 == band->rv_map->eob_sym || idx2 == band->rv_map->eob_sym)
+            band->rv_map->eob_sym ^= idx1 ^ idx2;
+        if (idx1 == band->rv_map->esc_sym || idx2 == band->rv_map->esc_sym)
+            band->rv_map->esc_sym ^= idx1 ^ idx2;
     }
 
-    pos = bitstream_tell(&ctx->bc);
+    pos = get_bits_count(&ctx->gb);
 
     for (t = 0; t < band->num_tiles; t++) {
         tile = &band->tiles[t];
@@ -958,7 +1000,7 @@ static int decode_band(IVI45DecContext *ctx,
                    band->mb_size, tile->mb_size);
             return AVERROR_INVALIDDATA;
         }
-        tile->is_empty = bitstream_read_bit(&ctx->bc);
+        tile->is_empty = get_bits1(&ctx->gb);
         if (tile->is_empty) {
             result = ivi_process_empty_tile(avctx, band, tile,
                                       (ctx->planes[0].bands[0].mb_size >> 3) - (band->mb_size >> 3));
@@ -966,24 +1008,25 @@ static int decode_band(IVI45DecContext *ctx,
                 break;
             ff_dlog(avctx, "Empty tile encountered!\n");
         } else {
-            tile->data_size = ivi_dec_tile_data_size(&ctx->bc);
+            tile->data_size = ivi_dec_tile_data_size(&ctx->gb);
             if (!tile->data_size) {
                 av_log(avctx, AV_LOG_ERROR, "Tile data size is zero!\n");
-                return AVERROR_INVALIDDATA;
+                result = AVERROR_INVALIDDATA;
+                break;
             }
 
             result = ctx->decode_mb_info(ctx, band, tile, avctx);
             if (result < 0)
                 break;
 
-            result = ivi_decode_blocks(&ctx->bc, band, tile, avctx);
+            result = ivi_decode_blocks(&ctx->gb, band, tile, avctx);
             if (result < 0) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Corrupted tile data encountered!\n");
                 break;
             }
 
-            if (((bitstream_tell(&ctx->bc) - pos) >> 3) != tile->data_size) {
+            if (((get_bits_count(&ctx->gb) - pos) >> 3) != tile->data_size) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Tile data_size mismatch!\n");
                 result = AVERROR_INVALIDDATA;
@@ -1001,6 +1044,10 @@ static int decode_band(IVI45DecContext *ctx,
         idx2 = band->corr[i*2+1];
         FFSWAP(uint8_t, band->rv_map->runtab[idx1], band->rv_map->runtab[idx2]);
         FFSWAP(int16_t, band->rv_map->valtab[idx1], band->rv_map->valtab[idx2]);
+        if (idx1 == band->rv_map->eob_sym || idx2 == band->rv_map->eob_sym)
+            band->rv_map->eob_sym ^= idx1 ^ idx2;
+        if (idx1 == band->rv_map->esc_sym || idx2 == band->rv_map->esc_sym)
+            band->rv_map->esc_sym ^= idx1 ^ idx2;
     }
 
 #ifdef DEBUG
@@ -1015,7 +1062,7 @@ static int decode_band(IVI45DecContext *ctx,
     }
 #endif
 
-    bitstream_align(&ctx->bc);
+    align_get_bits(&ctx->gb);
 
     return result;
 }
@@ -1029,7 +1076,9 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int             buf_size = avpkt->size;
     int             result, p, b;
 
-    bitstream_init8(&ctx->bc, buf, buf_size);
+    result = init_get_bits8(&ctx->gb, buf, buf_size);
+    if (result < 0)
+        return result;
     ctx->frame_data = buf;
     ctx->frame_size = buf_size;
 
@@ -1068,6 +1117,7 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     //{ START_TIMER;
 
     if (ctx->is_nonnull_frame(ctx)) {
+        ctx->buf_invalid[ctx->dst_buf] = 1;
         for (p = 0; p < 3; p++) {
             for (b = 0; b < ctx->planes[p].num_bands; b++) {
                 result = decode_band(ctx, &ctx->planes[p].bands[b], avctx);
@@ -1078,6 +1128,7 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 }
             }
         }
+        ctx->buf_invalid[ctx->dst_buf] = 0;
     } else {
         if (ctx->is_scalable)
             return AVERROR_INVALIDDATA;
@@ -1087,17 +1138,20 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 return AVERROR_INVALIDDATA;
         }
     }
+    if (ctx->buf_invalid[ctx->dst_buf])
+        return -1;
 
     //STOP_TIMER("decode_planes"); }
 
+    if (!ctx->is_nonnull_frame(ctx))
+        return buf_size;
+
     result = ff_set_dimensions(avctx, ctx->planes[0].width, ctx->planes[0].height);
     if (result < 0)
         return result;
 
-    if ((result = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((result = ff_get_buffer(avctx, frame, 0)) < 0)
         return result;
-    }
 
     if (ctx->is_scalable) {
         if (ctx->is_indeo4)
@@ -1121,14 +1175,18 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (ctx->is_indeo4 && ctx->frame_type == IVI4_FRAMETYPE_INTRA) {
         int left;
 
-        while (bitstream_read(&ctx->bc, 8)); // skip version string
-        left = bitstream_tell(&ctx->bc) & 0x18;
-        bitstream_skip(&ctx->bc, 64 - left);
-        if (bitstream_bits_left(&ctx->bc) > 18 &&
-            bitstream_peek(&ctx->bc, 21) == 0xBFFF8) { // syncheader + inter type
+            // skip version string
+        while (get_bits(&ctx->gb, 8)) {
+            if (get_bits_left(&ctx->gb) < 8)
+                return AVERROR_INVALIDDATA;
+        }
+        left = get_bits_count(&ctx->gb) & 0x18;
+        skip_bits_long(&ctx->gb, 64 - left);
+        if (get_bits_left(&ctx->gb) > 18 &&
+            show_bits_long(&ctx->gb, 21) == 0xBFFF8) { // syncheader + inter type
             AVPacket pkt;
-            pkt.data = avpkt->data + (bitstream_tell(&ctx->bc) >> 3);
-            pkt.size = bitstream_bits_left(&ctx->bc) >> 3;
+            pkt.data = avpkt->data + (get_bits_count(&ctx->gb) >> 3);
+            pkt.size = get_bits_left(&ctx->gb) >> 3;
             ff_ivi_decode_frame(avctx, ctx->p_frame, &ctx->got_p_frame, &pkt);
         }
     }
@@ -1164,6 +1222,9 @@ av_cold int ff_ivi_decode_close(AVCodecContext *avctx)
     if (ctx->mb_vlc.cust_tab.table)
         ff_free_vlc(&ctx->mb_vlc.cust_tab);
 
+    if (ctx->blk_vlc.cust_tab.table)
+        ff_free_vlc(&ctx->blk_vlc.cust_tab);
+
     av_frame_free(&ctx->p_frame);
 
     return 0;
diff --git a/libavcodec/ivi.h b/libavcodec/ivi.h
index 6fde8a6..1427535 100644
--- a/libavcodec/ivi.h
+++ b/libavcodec/ivi.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,9 @@
 #ifndef AVCODEC_IVI_H
 #define AVCODEC_IVI_H
 
-#include <stdint.h>
-
 #include "avcodec.h"
-#include "bitstream.h"
-#include "vlc.h"
+#include "get_bits.h"
+#include <stdint.h>
 
 /**
  *  Indeo 4 frame types.
@@ -166,6 +164,7 @@ typedef struct IVIBandDesc {
     int             quant_mat;      ///< dequant matrix index
     int             glob_quant;     ///< quant base for this band
     const uint8_t   *scan;          ///< ptr to the scan pattern
+    int             scan_size;      ///< size of the scantable
 
     IVIHuffTab      blk_vlc;        ///< vlc table for decoding block data
 
@@ -212,7 +211,7 @@ typedef struct IVIPicConfig {
 } IVIPicConfig;
 
 typedef struct IVI45DecContext {
-    BitstreamContext bc;
+    GetBitContext   gb;
     RVMapDesc       rvmap_tabs[9];   ///< local corrected copy of the static rvmap tables
 
     uint32_t        frame_num;
@@ -263,6 +262,7 @@ typedef struct IVI45DecContext {
     int             (*is_nonnull_frame)(struct IVI45DecContext *ctx);
 
     int gop_invalid;
+    int buf_invalid[4];
 
     int is_indeo4;
 
@@ -304,14 +304,14 @@ void ff_ivi_init_static_vlc(void);
  *  Decode a huffman codebook descriptor from the bitstream
  *  and select specified huffman table.
  *
- *  @param[in,out]  bc          the Bitstream context
+ *  @param[in,out]  gb          the GetBit context
  *  @param[in]      desc_coded  flag signalling if table descriptor was coded
  *  @param[in]      which_tab   codebook purpose (IVI_MB_HUFF or IVI_BLK_HUFF)
  *  @param[out]     huff_tab    pointer to the descriptor of the selected table
  *  @param[in]      avctx       AVCodecContext pointer
  *  @return             zero on success, negative value otherwise
  */
-int  ff_ivi_dec_huff_desc(BitstreamContext *bc, int desc_coded, int which_tab,
+int  ff_ivi_dec_huff_desc(GetBitContext *gb, int desc_coded, int which_tab,
                           IVIHuffTab *huff_tab, AVCodecContext *avctx);
 
 /**
@@ -322,8 +322,8 @@ int  ff_ivi_dec_huff_desc(BitstreamContext *bc, int desc_coded, int which_tab,
  *  @param[in]      is_indeo4  flag signalling if it is Indeo 4 or not
  *  @return             result code: 0 - OK
  */
-int  ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
-                        int is_indeo4);
+int  ff_ivi_init_planes(AVCodecContext *avctx, IVIPlaneDesc *planes,
+                        const IVIPicConfig *cfg, int is_indeo4);
 
 /**
  *  Initialize tile and macroblock descriptors.
diff --git a/libavcodec/ivi_dsp.c b/libavcodec/ivi_dsp.c
index b8a476d..4aedf17 100644
--- a/libavcodec/ivi_dsp.c
+++ b/libavcodec/ivi_dsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
     int32_t         b0_1, b0_2, b1_1, b1_2, b1_3, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6;
     int32_t         b3_1, b3_2, b3_3, b3_4, b3_5, b3_6, b3_7, b3_8, b3_9;
     ptrdiff_t       pitch, back_pitch;
-    const short    *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
+    const short     *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
     const int       num_bands = 4;
 
     /* all bands should have the same pitch */
@@ -54,6 +54,9 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
     b3_ptr = plane->bands[3].buf;
 
     for (y = 0; y < plane->height; y += 2) {
+
+        if (y+2 >= plane->height)
+            pitch= 0;
         /* load storage variables with values */
         if (num_bands > 0) {
             b0_1 = b0_ptr[0];
@@ -83,6 +86,13 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
         }
 
         for (x = 0, indx = 0; x < plane->width; x+=2, indx++) {
+            if (x+2 >= plane->width) {
+                b0_ptr --;
+                b1_ptr --;
+                b2_ptr --;
+                b3_ptr --;
+            }
+
             /* some values calculated in the previous iterations can */
             /* be reused in the next ones, so do appropriate copying */
             b2_1 = b2_2; // b2[x-1,y  ] = b2[x,  y  ]
@@ -106,10 +116,10 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
                 b0_2 = b0_ptr[pitch+indx+1];
                 tmp1 = tmp0 + b0_1;
 
-                p0 =  tmp0 << 4;
-                p1 =  tmp1 << 3;
-                p2 = (tmp0 + tmp2) << 3;
-                p3 = (tmp1 + tmp2 + b0_2) << 2;
+                p0 =  tmp0 * 16;
+                p1 =  tmp1 * 8;
+                p2 = (tmp0 + tmp2) * 8;
+                p3 = (tmp1 + tmp2 + b0_2) * 4;
             }
 
             /* process the HL-band by applying HPF vertically and LPF horizontally */
@@ -122,10 +132,10 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
                 tmp2 = tmp1 - tmp0*6 + b1_3;
                 b1_3 = b1_1 - b1_2*6 + b1_ptr[pitch+indx+1];
 
-                p0 += (tmp0 + tmp1) << 3;
-                p1 += (tmp0 + tmp1 + b1_1 + b1_2) << 2;
-                p2 +=  tmp2 << 2;
-                p3 += (tmp2 + b1_3) << 1;
+                p0 += (tmp0 + tmp1) * 8;
+                p1 += (tmp0 + tmp1 + b1_1 + b1_2) * 4;
+                p2 +=  tmp2 * 4;
+                p3 += (tmp2 + b1_3) * 2;
             }
 
             /* process the LH-band by applying LPF vertically and HPF horizontally */
@@ -136,10 +146,10 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
                 tmp0 = b2_1 + b2_2;
                 tmp1 = b2_1 - b2_2*6 + b2_3;
 
-                p0 += tmp0 << 3;
-                p1 += tmp1 << 2;
-                p2 += (tmp0 + b2_4 + b2_5) << 2;
-                p3 += (tmp1 + b2_4 - b2_5*6 + b2_6) << 1;
+                p0 += tmp0 * 8;
+                p1 += tmp1 * 4;
+                p2 += (tmp0 + b2_4 + b2_5) * 4;
+                p3 += (tmp1 + b2_4 - b2_5*6 + b2_6) * 2;
             }
 
             /* process the HH-band by applying HPF both vertically and horizontally */
@@ -153,9 +163,9 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
 
                 b3_9 = b3_3 - b3_6*6 + b3_ptr[pitch+indx+1];
 
-                p0 += (tmp0 + tmp1) << 2;
-                p1 += (tmp0 - tmp1*6 + tmp2) << 1;
-                p2 += (b3_7 + b3_8) << 1;
+                p0 += (tmp0 + tmp1) * 4;
+                p1 += (tmp0 - tmp1*6 + tmp2) * 2;
+                p2 += (b3_7 + b3_8) * 2;
                 p3 +=  b3_7 - b3_8*6 + b3_9;
             }
 
@@ -170,10 +180,10 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
 
         back_pitch = -pitch;
 
-        b0_ptr += pitch;
-        b1_ptr += pitch;
-        b2_ptr += pitch;
-        b3_ptr += pitch;
+        b0_ptr += pitch + 1;
+        b1_ptr += pitch + 1;
+        b2_ptr += pitch + 1;
+        b3_ptr += pitch + 1;
     }
 }
 
@@ -181,7 +191,7 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
                            const ptrdiff_t dst_pitch)
 {
     int             x, y, indx, b0, b1, b2, b3, p0, p1, p2, p3;
-    const short    *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
+    const short     *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
     ptrdiff_t       pitch;
 
     /* all bands should have the same pitch */
@@ -225,15 +235,15 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
 
 /** butterfly operation for the inverse Haar transform */
 #define IVI_HAAR_BFLY(s1, s2, o1, o2, t) \
-    t  = (s1 - s2) >> 1;\
-    o1 = (s1 + s2) >> 1;\
-    o2 = t;\
+    t  = ((s1) - (s2)) >> 1;\
+    o1 = ((s1) + (s2)) >> 1;\
+    o2 = (t);\
 
 /** inverse 8-point Haar transform */
 #define INV_HAAR8(s1, s5, s3, s7, s2, s4, s6, s8,\
                   d1, d2, d3, d4, d5, d6, d7, d8,\
                   t0, t1, t2, t3, t4, t5, t6, t7, t8) {\
-    t1 = s1 << 1; t5 = s5 << 1;\
+    t1 = (s1) * 2; t5 = (s5) * 2;\
     IVI_HAAR_BFLY(t1, t5, t1, t5, t0); IVI_HAAR_BFLY(t1, s3, t1, t3, t0);\
     IVI_HAAR_BFLY(t5, s7, t5, t7, t0); IVI_HAAR_BFLY(t1, s2, t1, t2, t0);\
     IVI_HAAR_BFLY(t3, s4, t3, t4, t0); IVI_HAAR_BFLY(t5, s6, t5, t6, t0);\
@@ -274,10 +284,10 @@ void ff_ivi_inverse_haar_8x8(const int32_t *in, int16_t *out, ptrdiff_t pitch,
         if (flags[i]) {
             /* pre-scaling */
             shift = !(i & 4);
-            sp1 = src[ 0] << shift;
-            sp2 = src[ 8] << shift;
-            sp3 = src[16] << shift;
-            sp4 = src[24] << shift;
+            sp1 = src[ 0] * (1 << shift);
+            sp2 = src[ 8] * (1 << shift);
+            sp3 = src[16] * (1 << shift);
+            sp4 = src[24] * (1 << shift);
             INV_HAAR8(    sp1,     sp2,     sp3,     sp4,
                       src[32], src[40], src[48], src[56],
                       dst[ 0], dst[ 8], dst[16], dst[24],
@@ -383,8 +393,8 @@ void ff_ivi_inverse_haar_4x4(const int32_t *in, int16_t *out, ptrdiff_t pitch,
         if (flags[i]) {
             /* pre-scaling */
             shift = !(i & 2);
-            sp1 = src[0] << shift;
-            sp2 = src[4] << shift;
+            sp1 = src[0] * (1 << shift);
+            sp2 = src[4] * (1 << shift);
             INV_HAAR4(   sp1,    sp2, src[8], src[12],
                       dst[0], dst[4], dst[8], dst[12],
                       t0, t1, t2, t3, t4);
@@ -475,21 +485,21 @@ void ff_ivi_dc_haar_2d(const int32_t *in, int16_t *out, ptrdiff_t pitch,
 
 /** butterfly operation for the inverse slant transform */
 #define IVI_SLANT_BFLY(s1, s2, o1, o2, t) \
-    t  = s1 - s2;\
-    o1 = s1 + s2;\
-    o2 = t;\
+    t  = (s1) - (s2);\
+    o1 = (s1) + (s2);\
+    o2 = (t);\
 
 /** This is a reflection a,b = 1/2, 5/4 for the inverse slant transform */
 #define IVI_IREFLECT(s1, s2, o1, o2, t) \
-    t  = ((s1 + s2*2 + 2) >> 2) + s1;\
-    o2 = ((s1*2 - s2 + 2) >> 2) - s2;\
-    o1 = t;\
+    t  = (((s1) + (s2)*2 + 2) >> 2) + (s1);\
+    o2 = (((s1)*2 - (s2) + 2) >> 2) - (s2);\
+    o1 = (t);\
 
 /** This is a reflection a,b = 1/2, 7/8 for the inverse slant transform */
 #define IVI_SLANT_PART4(s1, s2, o1, o2, t) \
-    t  = s2 + ((s1*4  - s2 + 4) >> 3);\
-    o2 = s1 + ((-s1 - s2*4 + 4) >> 3);\
-    o1 = t;\
+    t  = (s2) + (((s1)*4  - (s2) + 4) >> 3);\
+    o2 = (s1) + ((-(s1) - (s2)*4 + 4) >> 3);\
+    o1 = (t);\
 
 /** inverse slant8 transform */
 #define IVI_INV_SLANT8(s1, s4, s8, s5, s2, s6, s3, s7,\
@@ -547,7 +557,7 @@ void ff_ivi_inverse_slant_8x8(const int32_t *in, int16_t *out, ptrdiff_t pitch,
     }
 #undef COMPENSATE
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     src = tmp;
     for (i = 0; i < 8; i++) {
         if (!src[0] && !src[1] && !src[2] && !src[3] && !src[4] && !src[5] && !src[6] && !src[7]) {
@@ -587,7 +597,7 @@ void ff_ivi_inverse_slant_4x4(const int32_t *in, int16_t *out, ptrdiff_t pitch,
     }
 #undef COMPENSATE
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     src = tmp;
     for (i = 0; i < 4; i++) {
         if (!src[0] && !src[1] && !src[2] && !src[3]) {
@@ -621,7 +631,7 @@ void ff_ivi_row_slant8(const int32_t *in, int16_t *out, ptrdiff_t pitch, const u
     int     i;
     int     t0, t1, t2, t3, t4, t5, t6, t7, t8;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 8; i++) {
         if (!in[0] && !in[1] && !in[2] && !in[3] && !in[4] && !in[5] && !in[6] && !in[7]) {
             memset(out, 0, 8*sizeof(out[0]));
@@ -663,7 +673,7 @@ void ff_ivi_col_slant8(const int32_t *in, int16_t *out, ptrdiff_t pitch, const u
     row4 = pitch << 2;
     row8 = pitch << 3;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 8; i++) {
         if (flags[i]) {
             IVI_INV_SLANT8(in[0], in[8], in[16], in[24], in[32], in[40], in[48], in[56],
@@ -700,7 +710,7 @@ void ff_ivi_row_slant4(const int32_t *in, int16_t *out, ptrdiff_t pitch, const u
     int     i;
     int     t0, t1, t2, t3, t4;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 4; i++) {
         if (!in[0] && !in[1] && !in[2] && !in[3]) {
             memset(out, 0, 4*sizeof(out[0]));
@@ -722,7 +732,7 @@ void ff_ivi_col_slant4(const int32_t *in, int16_t *out, ptrdiff_t pitch, const u
 
     row2 = pitch << 1;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 4; i++) {
         if (flags[i]) {
             IVI_INV_SLANT4(in[0], in[4], in[8], in[12],
diff --git a/libavcodec/ivi_dsp.h b/libavcodec/ivi_dsp.h
index d9d3d17..2704d2b 100644
--- a/libavcodec/ivi_dsp.h
+++ b/libavcodec/ivi_dsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,6 +66,10 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
  */
 void ff_ivi_inverse_haar_8x8(const int32_t *in, int16_t *out, ptrdiff_t pitch,
                              const uint8_t *flags);
+void ff_ivi_inverse_haar_8x1(const int32_t *in, int16_t *out, uint32_t pitch,
+                             const uint8_t *flags);
+void ff_ivi_inverse_haar_1x8(const int32_t *in, int16_t *out, uint32_t pitch,
+                             const uint8_t *flags);
 
 /**
  *  one-dimensional inverse 8-point Haar transform on rows for Indeo 4
diff --git a/libavcodec/j2kenc.c b/libavcodec/j2kenc.c
new file mode 100644
index 0000000..e91d932
--- /dev/null
+++ b/libavcodec/j2kenc.c
@@ -0,0 +1,1261 @@
+/*
+ * JPEG2000 image encoder
+ * Copyright (c) 2007 Kamil Nowosad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * **********************************************************************************************************************
+ *
+ *
+ *
+ * This source code incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium
+ * Copyright (c) 2002-2007, Professor Benoit Macq
+ * Copyright (c) 2001-2003, David Janssens
+ * Copyright (c) 2002-2003, Yannick Verschueren
+ * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe
+ * Copyright (c) 2005, Herve Drolon, FreeImage Team
+ * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/**
+ * JPEG2000 image encoder
+ * @file
+ * @author Kamil Nowosad
+ */
+
+#include <float.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "jpeg2000.h"
+#include "libavutil/common.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+#include "libavutil/intreadwrite.h"
+
+#define NMSEDEC_BITS 7
+#define NMSEDEC_FRACBITS (NMSEDEC_BITS-1)
+#define WMSEDEC_SHIFT 13 ///< must be >= 13
+#define LAMBDA_SCALE (100000000LL << (WMSEDEC_SHIFT - 13))
+
+#define CODEC_JP2 1
+#define CODEC_J2K 0
+
+static int lut_nmsedec_ref [1<<NMSEDEC_BITS],
+           lut_nmsedec_ref0[1<<NMSEDEC_BITS],
+           lut_nmsedec_sig [1<<NMSEDEC_BITS],
+           lut_nmsedec_sig0[1<<NMSEDEC_BITS];
+
+static const int dwt_norms[2][4][10] = { // [dwt_type][band][rlevel] (multiplied by 10000)
+    {{10000, 19650, 41770,  84030, 169000, 338400,  676900, 1353000, 2706000, 5409000},
+     {20220, 39890, 83550, 170400, 342700, 686300, 1373000, 2746000, 5490000},
+     {20220, 39890, 83550, 170400, 342700, 686300, 1373000, 2746000, 5490000},
+     {20800, 38650, 83070, 171800, 347100, 695900, 1393000, 2786000, 5572000}},
+
+    {{10000, 15000, 27500, 53750, 106800, 213400, 426700, 853300, 1707000, 3413000},
+     {10380, 15920, 29190, 57030, 113300, 226400, 452500, 904800, 1809000},
+     {10380, 15920, 29190, 57030, 113300, 226400, 452500, 904800, 1809000},
+     { 7186,  9218, 15860, 30430,  60190, 120100, 240000, 479700,  959300}}
+};
+
+typedef struct {
+   Jpeg2000Component *comp;
+} Jpeg2000Tile;
+
+typedef struct {
+    AVClass *class;
+    AVCodecContext *avctx;
+    const AVFrame *picture;
+
+    int width, height; ///< image width and height
+    uint8_t cbps[4]; ///< bits per sample in particular components
+    int chroma_shift[2];
+    uint8_t planar;
+    int ncomponents;
+    int tile_width, tile_height; ///< tile size
+    int numXtiles, numYtiles;
+
+    uint8_t *buf_start;
+    uint8_t *buf;
+    uint8_t *buf_end;
+    int bit_index;
+
+    int64_t lambda;
+
+    Jpeg2000CodingStyle codsty;
+    Jpeg2000QuantStyle  qntsty;
+
+    Jpeg2000Tile *tile;
+
+    int format;
+    int pred;
+} Jpeg2000EncoderContext;
+
+
+/* debug */
+#if 0
+#undef ifprintf
+#undef printf
+
+static void nspaces(FILE *fd, int n)
+{
+    while(n--) putc(' ', fd);
+}
+
+static void printcomp(Jpeg2000Component *comp)
+{
+    int i;
+    for (i = 0; i < comp->y1 - comp->y0; i++)
+        ff_jpeg2000_printv(comp->i_data + i * (comp->x1 - comp->x0), comp->x1 - comp->x0);
+}
+
+static void dump(Jpeg2000EncoderContext *s, FILE *fd)
+{
+    int tileno, compno, reslevelno, bandno, precno;
+    fprintf(fd, "XSiz = %d, YSiz = %d, tile_width = %d, tile_height = %d\n"
+                "numXtiles = %d, numYtiles = %d, ncomponents = %d\n"
+                "tiles:\n",
+            s->width, s->height, s->tile_width, s->tile_height,
+            s->numXtiles, s->numYtiles, s->ncomponents);
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        nspaces(fd, 2);
+        fprintf(fd, "tile %d:\n", tileno);
+        for(compno = 0; compno < s->ncomponents; compno++){
+            Jpeg2000Component *comp = tile->comp + compno;
+            nspaces(fd, 4);
+            fprintf(fd, "component %d:\n", compno);
+            nspaces(fd, 4);
+            fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d\n",
+                        comp->x0, comp->x1, comp->y0, comp->y1);
+            for(reslevelno = 0; reslevelno < s->nreslevels; reslevelno++){
+                Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+                nspaces(fd, 6);
+                fprintf(fd, "reslevel %d:\n", reslevelno);
+                nspaces(fd, 6);
+                fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d, nbands = %d\n",
+                        reslevel->x0, reslevel->x1, reslevel->y0,
+                        reslevel->y1, reslevel->nbands);
+                for(bandno = 0; bandno < reslevel->nbands; bandno++){
+                    Jpeg2000Band *band = reslevel->band + bandno;
+                    nspaces(fd, 8);
+                    fprintf(fd, "band %d:\n", bandno);
+                    nspaces(fd, 8);
+                    fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d,"
+                                "codeblock_width = %d, codeblock_height = %d cblknx = %d cblkny = %d\n",
+                                band->x0, band->x1,
+                                band->y0, band->y1,
+                                band->codeblock_width, band->codeblock_height,
+                                band->cblknx, band->cblkny);
+                    for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                        Jpeg2000Prec *prec = band->prec + precno;
+                        nspaces(fd, 10);
+                        fprintf(fd, "prec %d:\n", precno);
+                        nspaces(fd, 10);
+                        fprintf(fd, "xi0 = %d, xi1 = %d, yi0 = %d, yi1 = %d\n",
+                                     prec->xi0, prec->xi1, prec->yi0, prec->yi1);
+                    }
+                }
+            }
+        }
+    }
+}
+#endif
+
+/* bitstream routines */
+
+/** put n times val bit */
+static void put_bits(Jpeg2000EncoderContext *s, int val, int n) // TODO: optimize
+{
+    while (n-- > 0){
+        if (s->bit_index == 8)
+        {
+            s->bit_index = *s->buf == 0xff;
+            *(++s->buf) = 0;
+        }
+        *s->buf |= val << (7 - s->bit_index++);
+    }
+}
+
+/** put n least significant bits of a number num */
+static void put_num(Jpeg2000EncoderContext *s, int num, int n)
+{
+    while(--n >= 0)
+        put_bits(s, (num >> n) & 1, 1);
+}
+
+/** flush the bitstream */
+static void j2k_flush(Jpeg2000EncoderContext *s)
+{
+    if (s->bit_index){
+        s->bit_index = 0;
+        s->buf++;
+    }
+}
+
+/* tag tree routines */
+
+/** code the value stored in node */
+static void tag_tree_code(Jpeg2000EncoderContext *s, Jpeg2000TgtNode *node, int threshold)
+{
+    Jpeg2000TgtNode *stack[30];
+    int sp = 1, curval = 0;
+    stack[0] = node;
+
+    node = node->parent;
+    while(node){
+        if (node->vis){
+            curval = node->val;
+            break;
+        }
+        node->vis++;
+        stack[sp++] = node;
+        node = node->parent;
+    }
+    while(--sp >= 0){
+        if (stack[sp]->val >= threshold){
+            put_bits(s, 0, threshold - curval);
+            break;
+        }
+        put_bits(s, 0, stack[sp]->val - curval);
+        put_bits(s, 1, 1);
+        curval = stack[sp]->val;
+    }
+}
+
+/** update the value in node */
+static void tag_tree_update(Jpeg2000TgtNode *node)
+{
+    int lev = 0;
+    while (node->parent){
+        if (node->parent->val <= node->val)
+            break;
+        node->parent->val = node->val;
+        node = node->parent;
+        lev++;
+    }
+}
+
+static int put_siz(Jpeg2000EncoderContext *s)
+{
+    int i;
+
+    if (s->buf_end - s->buf < 40 + 3 * s->ncomponents)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_SIZ);
+    bytestream_put_be16(&s->buf, 38 + 3 * s->ncomponents); // Lsiz
+    bytestream_put_be16(&s->buf, 0); // Rsiz
+    bytestream_put_be32(&s->buf, s->width); // width
+    bytestream_put_be32(&s->buf, s->height); // height
+    bytestream_put_be32(&s->buf, 0); // X0Siz
+    bytestream_put_be32(&s->buf, 0); // Y0Siz
+
+    bytestream_put_be32(&s->buf, s->tile_width); // XTSiz
+    bytestream_put_be32(&s->buf, s->tile_height); // YTSiz
+    bytestream_put_be32(&s->buf, 0); // XT0Siz
+    bytestream_put_be32(&s->buf, 0); // YT0Siz
+    bytestream_put_be16(&s->buf, s->ncomponents); // CSiz
+
+    for (i = 0; i < s->ncomponents; i++){ // Ssiz_i XRsiz_i, YRsiz_i
+        bytestream_put_byte(&s->buf, 7);
+        bytestream_put_byte(&s->buf, i?1<<s->chroma_shift[0]:1);
+        bytestream_put_byte(&s->buf, i?1<<s->chroma_shift[1]:1);
+    }
+    return 0;
+}
+
+static int put_cod(Jpeg2000EncoderContext *s)
+{
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    if (s->buf_end - s->buf < 14)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_COD);
+    bytestream_put_be16(&s->buf, 12); // Lcod
+    bytestream_put_byte(&s->buf, 0);  // Scod
+    // SGcod
+    bytestream_put_byte(&s->buf, 0); // progression level
+    bytestream_put_be16(&s->buf, 1); // num of layers
+    if(s->avctx->pix_fmt == AV_PIX_FMT_YUV444P){
+        bytestream_put_byte(&s->buf, 0); // unspecified
+    }else{
+        bytestream_put_byte(&s->buf, 0); // unspecified
+    }
+    // SPcod
+    bytestream_put_byte(&s->buf, codsty->nreslevels - 1); // num of decomp. levels
+    bytestream_put_byte(&s->buf, codsty->log2_cblk_width-2); // cblk width
+    bytestream_put_byte(&s->buf, codsty->log2_cblk_height-2); // cblk height
+    bytestream_put_byte(&s->buf, 0); // cblk style
+    bytestream_put_byte(&s->buf, codsty->transform == FF_DWT53); // transformation
+    return 0;
+}
+
+static int put_qcd(Jpeg2000EncoderContext *s, int compno)
+{
+    int i, size;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    if (qntsty->quantsty == JPEG2000_QSTY_NONE)
+        size = 4 + 3 * (codsty->nreslevels-1);
+    else // QSTY_SE
+        size = 5 + 6 * (codsty->nreslevels-1);
+
+    if (s->buf_end - s->buf < size + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_QCD);
+    bytestream_put_be16(&s->buf, size);  // LQcd
+    bytestream_put_byte(&s->buf, (qntsty->nguardbits << 5) | qntsty->quantsty);  // Sqcd
+    if (qntsty->quantsty == JPEG2000_QSTY_NONE)
+        for (i = 0; i < codsty->nreslevels * 3 - 2; i++)
+            bytestream_put_byte(&s->buf, qntsty->expn[i] << 3);
+    else // QSTY_SE
+        for (i = 0; i < codsty->nreslevels * 3 - 2; i++)
+            bytestream_put_be16(&s->buf, (qntsty->expn[i] << 11) | qntsty->mant[i]);
+    return 0;
+}
+
+static int put_com(Jpeg2000EncoderContext *s, int compno)
+{
+    int size = 4 + strlen(LIBAVCODEC_IDENT);
+
+    if (s->avctx->flags & AV_CODEC_FLAG_BITEXACT)
+        return 0;
+
+    if (s->buf_end - s->buf < size + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_COM);
+    bytestream_put_be16(&s->buf, size);
+    bytestream_put_be16(&s->buf, 1); // General use (ISO/IEC 8859-15 (Latin) values)
+
+    bytestream_put_buffer(&s->buf, LIBAVCODEC_IDENT, strlen(LIBAVCODEC_IDENT));
+
+    return 0;
+}
+
+static uint8_t *put_sot(Jpeg2000EncoderContext *s, int tileno)
+{
+    uint8_t *psotptr;
+
+    if (s->buf_end - s->buf < 12)
+        return NULL;
+
+    bytestream_put_be16(&s->buf, JPEG2000_SOT);
+    bytestream_put_be16(&s->buf, 10); // Lsot
+    bytestream_put_be16(&s->buf, tileno); // Isot
+
+    psotptr = s->buf;
+    bytestream_put_be32(&s->buf, 0); // Psot (filled in later)
+
+    bytestream_put_byte(&s->buf, 0); // TPsot
+    bytestream_put_byte(&s->buf, 1); // TNsot
+    return psotptr;
+}
+
+/**
+ * compute the sizes of tiles, resolution levels, bands, etc.
+ * allocate memory for them
+ * divide the input image into tile-components
+ */
+static int init_tiles(Jpeg2000EncoderContext *s)
+{
+    int tileno, tilex, tiley, compno;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    s->numXtiles = ff_jpeg2000_ceildiv(s->width, s->tile_width);
+    s->numYtiles = ff_jpeg2000_ceildiv(s->height, s->tile_height);
+
+    s->tile = av_malloc_array(s->numXtiles, s->numYtiles * sizeof(Jpeg2000Tile));
+    if (!s->tile)
+        return AVERROR(ENOMEM);
+    for (tileno = 0, tiley = 0; tiley < s->numYtiles; tiley++)
+        for (tilex = 0; tilex < s->numXtiles; tilex++, tileno++){
+            Jpeg2000Tile *tile = s->tile + tileno;
+
+            tile->comp = av_mallocz_array(s->ncomponents, sizeof(Jpeg2000Component));
+            if (!tile->comp)
+                return AVERROR(ENOMEM);
+            for (compno = 0; compno < s->ncomponents; compno++){
+                Jpeg2000Component *comp = tile->comp + compno;
+                int ret, i, j;
+
+                comp->coord[0][0] = comp->coord_o[0][0] = tilex * s->tile_width;
+                comp->coord[0][1] = comp->coord_o[0][1] = FFMIN((tilex+1)*s->tile_width, s->width);
+                comp->coord[1][0] = comp->coord_o[1][0] = tiley * s->tile_height;
+                comp->coord[1][1] = comp->coord_o[1][1] = FFMIN((tiley+1)*s->tile_height, s->height);
+                if (compno > 0)
+                    for (i = 0; i < 2; i++)
+                        for (j = 0; j < 2; j++)
+                            comp->coord[i][j] = comp->coord_o[i][j] = ff_jpeg2000_ceildivpow2(comp->coord[i][j], s->chroma_shift[i]);
+
+                if ((ret = ff_jpeg2000_init_component(comp,
+                                                codsty,
+                                                qntsty,
+                                                s->cbps[compno],
+                                                compno?1<<s->chroma_shift[0]:1,
+                                                compno?1<<s->chroma_shift[1]:1,
+                                                s->avctx
+                                               )) < 0)
+                    return ret;
+            }
+        }
+    return 0;
+}
+
+static void copy_frame(Jpeg2000EncoderContext *s)
+{
+    int tileno, compno, i, y, x;
+    uint8_t *line;
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        if (s->planar){
+            for (compno = 0; compno < s->ncomponents; compno++){
+                Jpeg2000Component *comp = tile->comp + compno;
+                int *dst = comp->i_data;
+                line = s->picture->data[compno]
+                       + comp->coord[1][0] * s->picture->linesize[compno]
+                       + comp->coord[0][0];
+                for (y = comp->coord[1][0]; y < comp->coord[1][1]; y++){
+                    uint8_t *ptr = line;
+                    for (x = comp->coord[0][0]; x < comp->coord[0][1]; x++)
+                        *dst++ = *ptr++ - (1 << 7);
+                    line += s->picture->linesize[compno];
+                }
+            }
+        } else{
+            line = s->picture->data[0] + tile->comp[0].coord[1][0] * s->picture->linesize[0]
+                   + tile->comp[0].coord[0][0] * s->ncomponents;
+
+            i = 0;
+            for (y = tile->comp[0].coord[1][0]; y < tile->comp[0].coord[1][1]; y++){
+                uint8_t *ptr = line;
+                for (x = tile->comp[0].coord[0][0]; x < tile->comp[0].coord[0][1]; x++, i++){
+                    for (compno = 0; compno < s->ncomponents; compno++){
+                        tile->comp[compno].i_data[i] = *ptr++  - (1 << 7);
+                    }
+                }
+                line += s->picture->linesize[0];
+            }
+        }
+    }
+}
+
+static void init_quantization(Jpeg2000EncoderContext *s)
+{
+    int compno, reslevelno, bandno;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (compno = 0; compno < s->ncomponents; compno++){
+        int gbandno = 0;
+        for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+            int nbands, lev = codsty->nreslevels - reslevelno - 1;
+            nbands = reslevelno ? 3 : 1;
+            for (bandno = 0; bandno < nbands; bandno++, gbandno++){
+                int expn, mant = 0;
+
+                if (codsty->transform == FF_DWT97_INT){
+                    int bandpos = bandno + (reslevelno>0),
+                        ss = 81920000 / dwt_norms[0][bandpos][lev],
+                        log = av_log2(ss);
+                    mant = (11 - log < 0 ? ss >> log - 11 : ss << 11 - log) & 0x7ff;
+                    expn = s->cbps[compno] - log + 13;
+                } else
+                    expn = ((bandno&2)>>1) + (reslevelno>0) + s->cbps[compno];
+
+                qntsty->expn[gbandno] = expn;
+                qntsty->mant[gbandno] = mant;
+            }
+        }
+    }
+}
+
+static void init_luts(void)
+{
+    int i, a,
+        mask = ~((1<<NMSEDEC_FRACBITS)-1);
+
+    for (i = 0; i < (1 << NMSEDEC_BITS); i++){
+        lut_nmsedec_sig[i]  = FFMAX(6*i - (9<<NMSEDEC_FRACBITS-1) << 12-NMSEDEC_FRACBITS, 0);
+        lut_nmsedec_sig0[i] = FFMAX((i*i + (1<<NMSEDEC_FRACBITS-1) & mask) << 1, 0);
+
+        a = (i >> (NMSEDEC_BITS-2)&2) + 1;
+        lut_nmsedec_ref[i]  = FFMAX((-2*i + (1<<NMSEDEC_FRACBITS) + a*i - (a*a<<NMSEDEC_FRACBITS-2))
+                                    << 13-NMSEDEC_FRACBITS, 0);
+        lut_nmsedec_ref0[i] = FFMAX(((i*i + (1-4*i << NMSEDEC_FRACBITS-1) + (1<<2*NMSEDEC_FRACBITS)) & mask)
+                                    << 1, 0);
+    }
+}
+
+/* tier-1 routines */
+static int getnmsedec_sig(int x, int bpno)
+{
+    if (bpno > NMSEDEC_FRACBITS)
+        return lut_nmsedec_sig[(x >> (bpno - NMSEDEC_FRACBITS)) & ((1 << NMSEDEC_BITS) - 1)];
+    return lut_nmsedec_sig0[x & ((1 << NMSEDEC_BITS) - 1)];
+}
+
+static int getnmsedec_ref(int x, int bpno)
+{
+    if (bpno > NMSEDEC_FRACBITS)
+        return lut_nmsedec_ref[(x >> (bpno - NMSEDEC_FRACBITS)) & ((1 << NMSEDEC_BITS) - 1)];
+    return lut_nmsedec_ref0[x & ((1 << NMSEDEC_BITS) - 1)];
+}
+
+static void encode_sigpass(Jpeg2000T1Context *t1, int width, int height, int bandno, int *nmsedec, int bpno)
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++)
+            for (y = y0; y < height && y < y0+4; y++){
+                if (!(t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG) && (t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG_NB)){
+                    int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno),
+                        bit = t1->data[(y) * t1->stride + x] & mask ? 1 : 0;
+                    ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, bit);
+                    if (bit){
+                        int xorbit;
+                        int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                        ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                        *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                        ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_VIS;
+                }
+            }
+}
+
+static void encode_refpass(Jpeg2000T1Context *t1, int width, int height, int *nmsedec, int bpno)
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++)
+            for (y = y0; y < height && y < y0+4; y++)
+                if ((t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG){
+                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[(y+1) * t1->stride + x+1]);
+                    *nmsedec += getnmsedec_ref(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                    ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                    t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_REF;
+                }
+}
+
+static void encode_clnpass(Jpeg2000T1Context *t1, int width, int height, int bandno, int *nmsedec, int bpno)
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++){
+            if (y0 + 3 < height && !(
+            (t1->flags[(y0+1) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+2) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+3) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+4) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG))))
+            {
+                // aggregation mode
+                int rlen;
+                for (rlen = 0; rlen < 4; rlen++)
+                    if (t1->data[(y0+rlen) * t1->stride + x] & mask)
+                        break;
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_RL, rlen != 4);
+                if (rlen == 4)
+                    continue;
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_UNI, rlen >> 1);
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_UNI, rlen & 1);
+                for (y = y0 + rlen; y < y0 + 4; y++){
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))){
+                        int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno);
+                        if (y > y0 + rlen)
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                        if (t1->data[(y) * t1->stride + x] & mask){ // newly significant
+                            int xorbit;
+                            int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                            *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                            ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                        }
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] &= ~JPEG2000_T1_VIS;
+                }
+            } else{
+                for (y = y0; y < y0 + 4 && y < height; y++){
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))){
+                        int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno);
+                        ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                        if (t1->data[(y) * t1->stride + x] & mask){ // newly significant
+                            int xorbit;
+                            int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                            *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                            ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                        }
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] &= ~JPEG2000_T1_VIS;
+                }
+            }
+        }
+}
+
+static void encode_cblk(Jpeg2000EncoderContext *s, Jpeg2000T1Context *t1, Jpeg2000Cblk *cblk, Jpeg2000Tile *tile,
+                        int width, int height, int bandpos, int lev)
+{
+    int pass_t = 2, passno, x, y, max=0, nmsedec, bpno;
+    int64_t wmsedec = 0;
+
+    memset(t1->flags, 0, t1->stride * (height + 2) * sizeof(*t1->flags));
+
+    for (y = 0; y < height; y++){
+        for (x = 0; x < width; x++){
+            if (t1->data[(y) * t1->stride + x] < 0){
+                t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_SGN;
+                t1->data[(y) * t1->stride + x] = -t1->data[(y) * t1->stride + x];
+            }
+            max = FFMAX(max, t1->data[(y) * t1->stride + x]);
+        }
+    }
+
+    if (max == 0){
+        cblk->nonzerobits = 0;
+        bpno = 0;
+    } else{
+        cblk->nonzerobits = av_log2(max) + 1 - NMSEDEC_FRACBITS;
+        bpno = cblk->nonzerobits - 1;
+    }
+
+    cblk->data[0] = 0;
+    ff_mqc_initenc(&t1->mqc, cblk->data + 1);
+
+    for (passno = 0; bpno >= 0; passno++){
+        nmsedec=0;
+
+        switch(pass_t){
+            case 0: encode_sigpass(t1, width, height, bandpos, &nmsedec, bpno);
+                    break;
+            case 1: encode_refpass(t1, width, height, &nmsedec, bpno);
+                    break;
+            case 2: encode_clnpass(t1, width, height, bandpos, &nmsedec, bpno);
+                    break;
+        }
+
+        cblk->passes[passno].rate = ff_mqc_flush_to(&t1->mqc, cblk->passes[passno].flushed, &cblk->passes[passno].flushed_len);
+        wmsedec += (int64_t)nmsedec << (2*bpno);
+        cblk->passes[passno].disto = wmsedec;
+
+        if (++pass_t == 3){
+            pass_t = 0;
+            bpno--;
+        }
+    }
+    cblk->npasses = passno;
+    cblk->ninclpasses = passno;
+
+    if (passno)
+        cblk->passes[passno-1].rate = ff_mqc_flush_to(&t1->mqc, cblk->passes[passno-1].flushed, &cblk->passes[passno-1].flushed_len);
+}
+
+/* tier-2 routines: */
+
+static void putnumpasses(Jpeg2000EncoderContext *s, int n)
+{
+    if (n == 1)
+        put_num(s, 0, 1);
+    else if (n == 2)
+        put_num(s, 2, 2);
+    else if (n <= 5)
+        put_num(s, 0xc | (n-3), 4);
+    else if (n <= 36)
+        put_num(s, 0x1e0 | (n-6), 9);
+    else
+        put_num(s, 0xff80 | (n-37), 16);
+}
+
+
+static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, int precno,
+                          uint8_t *expn, int numgbits)
+{
+    int bandno, empty = 1;
+
+    // init bitstream
+    *s->buf = 0;
+    s->bit_index = 0;
+
+    // header
+
+    // is the packet empty?
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        if (rlevel->band[bandno].coord[0][0] < rlevel->band[bandno].coord[0][1]
+        &&  rlevel->band[bandno].coord[1][0] < rlevel->band[bandno].coord[1][1]){
+            empty = 0;
+            break;
+        }
+    }
+
+    put_bits(s, !empty, 1);
+    if (empty){
+        j2k_flush(s);
+        return 0;
+    }
+
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        Jpeg2000Band *band = rlevel->band + bandno;
+        Jpeg2000Prec *prec = band->prec + precno;
+        int yi, xi, pos;
+        int cblknw = prec->nb_codeblocks_width;
+
+        if (band->coord[0][0] == band->coord[0][1]
+        ||  band->coord[1][0] == band->coord[1][1])
+            continue;
+
+        for (pos=0, yi = 0; yi < prec->nb_codeblocks_height; yi++){
+            for (xi = 0; xi < cblknw; xi++, pos++){
+                prec->cblkincl[pos].val = prec->cblk[yi * cblknw + xi].ninclpasses == 0;
+                tag_tree_update(prec->cblkincl + pos);
+                prec->zerobits[pos].val = expn[bandno] + numgbits - 1 - prec->cblk[yi * cblknw + xi].nonzerobits;
+                tag_tree_update(prec->zerobits + pos);
+            }
+        }
+
+        for (pos=0, yi = 0; yi < prec->nb_codeblocks_height; yi++){
+            for (xi = 0; xi < cblknw; xi++, pos++){
+                int pad = 0, llen, length;
+                Jpeg2000Cblk *cblk = prec->cblk + yi * cblknw + xi;
+
+                if (s->buf_end - s->buf < 20) // approximately
+                    return -1;
+
+                // inclusion information
+                tag_tree_code(s, prec->cblkincl + pos, 1);
+                if (!cblk->ninclpasses)
+                    continue;
+                // zerobits information
+                tag_tree_code(s, prec->zerobits + pos, 100);
+                // number of passes
+                putnumpasses(s, cblk->ninclpasses);
+
+                length = cblk->passes[cblk->ninclpasses-1].rate;
+                llen = av_log2(length) - av_log2(cblk->ninclpasses) - 2;
+                if (llen < 0){
+                    pad = -llen;
+                    llen = 0;
+                }
+                // length of code block
+                put_bits(s, 1, llen);
+                put_bits(s, 0, 1);
+                put_num(s, length, av_log2(length)+1+pad);
+            }
+        }
+    }
+    j2k_flush(s);
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        Jpeg2000Band *band = rlevel->band + bandno;
+        Jpeg2000Prec *prec = band->prec + precno;
+        int yi, cblknw = prec->nb_codeblocks_width;
+        for (yi =0; yi < prec->nb_codeblocks_height; yi++){
+            int xi;
+            for (xi = 0; xi < cblknw; xi++){
+                Jpeg2000Cblk *cblk = prec->cblk + yi * cblknw + xi;
+                if (cblk->ninclpasses){
+                    if (s->buf_end - s->buf < cblk->passes[cblk->ninclpasses-1].rate)
+                        return -1;
+                    bytestream_put_buffer(&s->buf, cblk->data + 1,   cblk->passes[cblk->ninclpasses-1].rate
+                                                               - cblk->passes[cblk->ninclpasses-1].flushed_len);
+                    bytestream_put_buffer(&s->buf, cblk->passes[cblk->ninclpasses-1].flushed,
+                                                   cblk->passes[cblk->ninclpasses-1].flushed_len);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int encode_packets(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile, int tileno)
+{
+    int compno, reslevelno, ret;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "tier2\n");
+    // lay-rlevel-comp-pos progression
+    for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+        for (compno = 0; compno < s->ncomponents; compno++){
+            int precno;
+            Jpeg2000ResLevel *reslevel = s->tile[tileno].comp[compno].reslevel + reslevelno;
+            for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                if ((ret = encode_packet(s, reslevel, precno, qntsty->expn + (reslevelno ? 3*reslevelno-2 : 0),
+                              qntsty->nguardbits)) < 0)
+                    return ret;
+            }
+        }
+    }
+    av_log(s->avctx, AV_LOG_DEBUG, "after tier2\n");
+    return 0;
+}
+
+static int getcut(Jpeg2000Cblk *cblk, int64_t lambda, int dwt_norm)
+{
+    int passno, res = 0;
+    for (passno = 0; passno < cblk->npasses; passno++){
+        int dr;
+        int64_t dd;
+
+        dr = cblk->passes[passno].rate
+           - (res ? cblk->passes[res-1].rate:0);
+        dd = cblk->passes[passno].disto
+           - (res ? cblk->passes[res-1].disto:0);
+
+        if (((dd * dwt_norm) >> WMSEDEC_SHIFT) * dwt_norm >= dr * lambda)
+            res = passno+1;
+    }
+    return res;
+}
+
+static void truncpasses(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile)
+{
+    int precno, compno, reslevelno, bandno, cblkno, lev;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (compno = 0; compno < s->ncomponents; compno++){
+        Jpeg2000Component *comp = tile->comp + compno;
+
+        for (reslevelno = 0, lev = codsty->nreslevels-1; reslevelno < codsty->nreslevels; reslevelno++, lev--){
+            Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+
+            for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                for (bandno = 0; bandno < reslevel->nbands ; bandno++){
+                    int bandpos = bandno + (reslevelno > 0);
+                    Jpeg2000Band *band = reslevel->band + bandno;
+                    Jpeg2000Prec *prec = band->prec + precno;
+
+                    for (cblkno = 0; cblkno < prec->nb_codeblocks_height * prec->nb_codeblocks_width; cblkno++){
+                        Jpeg2000Cblk *cblk = prec->cblk + cblkno;
+
+                        cblk->ninclpasses = getcut(cblk, s->lambda,
+                                (int64_t)dwt_norms[codsty->transform == FF_DWT53][bandpos][lev] * (int64_t)band->i_stepsize >> 15);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static int encode_tile(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile, int tileno)
+{
+    int compno, reslevelno, bandno, ret;
+    Jpeg2000T1Context t1;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    for (compno = 0; compno < s->ncomponents; compno++){
+        Jpeg2000Component *comp = s->tile[tileno].comp + compno;
+
+        t1.stride = (1<<codsty->log2_cblk_width) + 2;
+
+        av_log(s->avctx, AV_LOG_DEBUG,"dwt\n");
+        if ((ret = ff_dwt_encode(&comp->dwt, comp->i_data)) < 0)
+            return ret;
+        av_log(s->avctx, AV_LOG_DEBUG,"after dwt -> tier1\n");
+
+        for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+            Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+
+            for (bandno = 0; bandno < reslevel->nbands ; bandno++){
+                Jpeg2000Band *band = reslevel->band + bandno;
+                Jpeg2000Prec *prec = band->prec; // we support only 1 precinct per band ATM in the encoder
+                int cblkx, cblky, cblkno=0, xx0, x0, xx1, y0, yy0, yy1, bandpos;
+                yy0 = bandno == 0 ? 0 : comp->reslevel[reslevelno-1].coord[1][1] - comp->reslevel[reslevelno-1].coord[1][0];
+                y0 = yy0;
+                yy1 = FFMIN(ff_jpeg2000_ceildivpow2(band->coord[1][0] + 1, band->log2_cblk_height) << band->log2_cblk_height,
+                            band->coord[1][1]) - band->coord[1][0] + yy0;
+
+                if (band->coord[0][0] == band->coord[0][1] || band->coord[1][0] == band->coord[1][1])
+                    continue;
+
+                bandpos = bandno + (reslevelno > 0);
+
+                for (cblky = 0; cblky < prec->nb_codeblocks_height; cblky++){
+                    if (reslevelno == 0 || bandno == 1)
+                        xx0 = 0;
+                    else
+                        xx0 = comp->reslevel[reslevelno-1].coord[0][1] - comp->reslevel[reslevelno-1].coord[0][0];
+                    x0 = xx0;
+                    xx1 = FFMIN(ff_jpeg2000_ceildivpow2(band->coord[0][0] + 1, band->log2_cblk_width) << band->log2_cblk_width,
+                                band->coord[0][1]) - band->coord[0][0] + xx0;
+
+                    for (cblkx = 0; cblkx < prec->nb_codeblocks_width; cblkx++, cblkno++){
+                        int y, x;
+                        if (codsty->transform == FF_DWT53){
+                            for (y = yy0; y < yy1; y++){
+                                int *ptr = t1.data + (y-yy0)*t1.stride;
+                                for (x = xx0; x < xx1; x++){
+                                    *ptr++ = comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * y + x] << NMSEDEC_FRACBITS;
+                                }
+                            }
+                        } else{
+                            for (y = yy0; y < yy1; y++){
+                                int *ptr = t1.data + (y-yy0)*t1.stride;
+                                for (x = xx0; x < xx1; x++){
+                                    *ptr = (comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * y + x]);
+                                    *ptr = (int64_t)*ptr * (int64_t)(16384 * 65536 / band->i_stepsize) >> 15 - NMSEDEC_FRACBITS;
+                                    ptr++;
+                                }
+                            }
+                        }
+                        if (!prec->cblk[cblkno].data)
+                            prec->cblk[cblkno].data = av_malloc(1 + 8192);
+                        if (!prec->cblk[cblkno].passes)
+                            prec->cblk[cblkno].passes = av_malloc_array(JPEG2000_MAX_PASSES, sizeof (*prec->cblk[cblkno].passes));
+                        if (!prec->cblk[cblkno].data || !prec->cblk[cblkno].passes)
+                            return AVERROR(ENOMEM);
+                        encode_cblk(s, &t1, prec->cblk + cblkno, tile, xx1 - xx0, yy1 - yy0,
+                                    bandpos, codsty->nreslevels - reslevelno - 1);
+                        xx0 = xx1;
+                        xx1 = FFMIN(xx1 + (1 << band->log2_cblk_width), band->coord[0][1] - band->coord[0][0] + x0);
+                    }
+                    yy0 = yy1;
+                    yy1 = FFMIN(yy1 + (1 << band->log2_cblk_height), band->coord[1][1] - band->coord[1][0] + y0);
+                }
+            }
+        }
+        av_log(s->avctx, AV_LOG_DEBUG, "after tier1\n");
+    }
+
+    av_log(s->avctx, AV_LOG_DEBUG, "rate control\n");
+    truncpasses(s, tile);
+    if ((ret = encode_packets(s, tile, tileno)) < 0)
+        return ret;
+    av_log(s->avctx, AV_LOG_DEBUG, "after rate control\n");
+    return 0;
+}
+
+static void cleanup(Jpeg2000EncoderContext *s)
+{
+    int tileno, compno;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        for (compno = 0; compno < s->ncomponents; compno++){
+            Jpeg2000Component *comp = s->tile[tileno].comp + compno;
+            ff_jpeg2000_cleanup(comp, codsty);
+        }
+        av_freep(&s->tile[tileno].comp);
+    }
+    av_freep(&s->tile);
+}
+
+static void reinit(Jpeg2000EncoderContext *s)
+{
+    int tileno, compno;
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        for (compno = 0; compno < s->ncomponents; compno++)
+            ff_jpeg2000_reinit(tile->comp + compno, &s->codsty);
+    }
+}
+
+static void update_size(uint8_t *size, const uint8_t *end)
+{
+    AV_WB32(size, end-size);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
+    int tileno, ret;
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+    uint8_t *chunkstart, *jp2cstart, *jp2hstart;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*9 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    // init:
+    s->buf = s->buf_start = pkt->data;
+    s->buf_end = pkt->data + pkt->size;
+
+    s->picture = pict;
+
+    s->lambda = s->picture->quality * LAMBDA_SCALE;
+
+    copy_frame(s);
+    reinit(s);
+
+    if (s->format == CODEC_JP2) {
+        av_assert0(s->buf == pkt->data);
+
+        bytestream_put_be32(&s->buf, 0x0000000C);
+        bytestream_put_be32(&s->buf, 0x6A502020);
+        bytestream_put_be32(&s->buf, 0x0D0A870A);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "ftyp", 4);
+        bytestream_put_buffer(&s->buf, "jp2\040\040", 4);
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2\040", 4);
+        update_size(chunkstart, s->buf);
+
+        jp2hstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2h", 4);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "ihdr", 4);
+        bytestream_put_be32(&s->buf, avctx->height);
+        bytestream_put_be32(&s->buf, avctx->width);
+        bytestream_put_be16(&s->buf, s->ncomponents);
+        bytestream_put_byte(&s->buf, s->cbps[0]);
+        bytestream_put_byte(&s->buf, 7);
+        bytestream_put_byte(&s->buf, 0);
+        bytestream_put_byte(&s->buf, 0);
+        update_size(chunkstart, s->buf);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "colr", 4);
+        bytestream_put_byte(&s->buf, 1);
+        bytestream_put_byte(&s->buf, 0);
+        bytestream_put_byte(&s->buf, 0);
+        if (avctx->pix_fmt == AV_PIX_FMT_RGB24 || avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            bytestream_put_be32(&s->buf, 16);
+        } else if (s->ncomponents == 1) {
+            bytestream_put_be32(&s->buf, 17);
+        } else {
+            bytestream_put_be32(&s->buf, 18);
+        }
+        update_size(chunkstart, s->buf);
+        if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            int i;
+            uint8_t *palette = pict->data[1];
+            chunkstart = s->buf;
+            bytestream_put_be32(&s->buf, 0);
+            bytestream_put_buffer(&s->buf, "pclr", 4);
+            bytestream_put_be16(&s->buf, AVPALETTE_COUNT);
+            bytestream_put_byte(&s->buf, 3); // colour channels
+            bytestream_put_be24(&s->buf, 0x070707); //colour depths
+            for (i = 0; i < AVPALETTE_COUNT; i++) {
+                bytestream_put_be24(&s->buf, HAVE_BIGENDIAN ? AV_RB24(palette + 1) : AV_RL24(palette));
+                palette += 4;
+            }
+            update_size(chunkstart, s->buf);
+            chunkstart = s->buf;
+            bytestream_put_be32(&s->buf, 0);
+            bytestream_put_buffer(&s->buf, "cmap", 4);
+            for (i = 0; i < 3; i++) {
+                bytestream_put_be16(&s->buf, 0); // component
+                bytestream_put_byte(&s->buf, 1); // palette mapping
+                bytestream_put_byte(&s->buf, i); // index
+            }
+            update_size(chunkstart, s->buf);
+        }
+        update_size(jp2hstart, s->buf);
+
+        jp2cstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2c", 4);
+    }
+
+    if (s->buf_end - s->buf < 2)
+        return -1;
+    bytestream_put_be16(&s->buf, JPEG2000_SOC);
+    if ((ret = put_siz(s)) < 0)
+        return ret;
+    if ((ret = put_cod(s)) < 0)
+        return ret;
+    if ((ret = put_qcd(s, 0)) < 0)
+        return ret;
+    if ((ret = put_com(s, 0)) < 0)
+        return ret;
+
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        uint8_t *psotptr;
+        if (!(psotptr = put_sot(s, tileno)))
+            return -1;
+        if (s->buf_end - s->buf < 2)
+            return -1;
+        bytestream_put_be16(&s->buf, JPEG2000_SOD);
+        if ((ret = encode_tile(s, s->tile + tileno, tileno)) < 0)
+            return ret;
+        bytestream_put_be32(&psotptr, s->buf - psotptr + 6);
+    }
+    if (s->buf_end - s->buf < 2)
+        return -1;
+    bytestream_put_be16(&s->buf, JPEG2000_EOC);
+
+    if (s->format == CODEC_JP2)
+        update_size(jp2cstart, s->buf);
+
+    av_log(s->avctx, AV_LOG_DEBUG, "end\n");
+    pkt->size = s->buf - s->buf_start;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int j2kenc_init(AVCodecContext *avctx)
+{
+    int i, ret;
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    s->avctx = avctx;
+    av_log(s->avctx, AV_LOG_DEBUG, "init\n");
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->prediction_method)
+        s->pred = avctx->prediction_method;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8 && (s->pred != FF_DWT97_INT || s->format != CODEC_JP2)) {
+        av_log(s->avctx, AV_LOG_WARNING, "Forcing lossless jp2 for pal8\n");
+        s->pred = FF_DWT97_INT;
+        s->format = CODEC_JP2;
+    }
+
+    // defaults:
+    // TODO: implement setting non-standard precinct size
+    memset(codsty->log2_prec_widths , 15, sizeof(codsty->log2_prec_widths ));
+    memset(codsty->log2_prec_heights, 15, sizeof(codsty->log2_prec_heights));
+    codsty->nreslevels2decode=
+    codsty->nreslevels       = 7;
+    codsty->log2_cblk_width  = 4;
+    codsty->log2_cblk_height = 4;
+    codsty->transform        = s->pred ? FF_DWT53 : FF_DWT97_INT;
+
+    qntsty->nguardbits       = 1;
+
+    if ((s->tile_width  & (s->tile_width -1)) ||
+        (s->tile_height & (s->tile_height-1))) {
+        av_log(avctx, AV_LOG_WARNING, "Tile dimension not a power of 2\n");
+    }
+
+    if (codsty->transform == FF_DWT53)
+        qntsty->quantsty = JPEG2000_QSTY_NONE;
+    else
+        qntsty->quantsty = JPEG2000_QSTY_SE;
+
+    s->width = avctx->width;
+    s->height = avctx->height;
+
+    for (i = 0; i < 3; i++)
+        s->cbps[i] = 8;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24){
+        s->ncomponents = 3;
+    } else if (avctx->pix_fmt == AV_PIX_FMT_GRAY8 || avctx->pix_fmt == AV_PIX_FMT_PAL8){
+        s->ncomponents = 1;
+    } else{ // planar YUV
+        s->planar = 1;
+        s->ncomponents = 3;
+        ret = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                               s->chroma_shift, s->chroma_shift + 1);
+        if (ret)
+            return ret;
+    }
+
+    ff_jpeg2000_init_tier1_luts();
+    ff_mqc_init_context_tables();
+    init_luts();
+
+    init_quantization(s);
+    if ((ret=init_tiles(s)) < 0)
+        return ret;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "after init\n");
+
+    return 0;
+}
+
+static int j2kenc_destroy(AVCodecContext *avctx)
+{
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+
+    cleanup(s);
+    return 0;
+}
+
+// taken from the libopenjpeg wraper so it matches
+
+#define OFFSET(x) offsetof(Jpeg2000EncoderContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = CODEC_JP2   }, CODEC_J2K, CODEC_JP2,   VE, "format"      },
+    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_J2K   }, 0,         0,           VE, "format"      },
+    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_JP2   }, 0,         0,           VE, "format"      },
+    { "tile_width",    "Tile Width",        OFFSET(tile_width),    AV_OPT_TYPE_INT,   { .i64 = 256         }, 1,     1<<30,           VE, },
+    { "tile_height",   "Tile Height",       OFFSET(tile_height),   AV_OPT_TYPE_INT,   { .i64 = 256         }, 1,     1<<30,           VE, },
+    { "pred",          "DWT Type",          OFFSET(pred),          AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE, "pred"        },
+    { "dwt97int",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = 0           }, INT_MIN, INT_MAX,       VE, "pred"        },
+    { "dwt53",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = 0           }, INT_MIN, INT_MAX,       VE, "pred"        },
+
+    { NULL }
+};
+
+static const AVClass j2k_class = {
+    .class_name = "jpeg 2000 encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_jpeg2000_encoder = {
+    .name           = "jpeg2000",
+    .long_name      = NULL_IF_CONFIG_SMALL("JPEG 2000"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_JPEG2000,
+    .priv_data_size = sizeof(Jpeg2000EncoderContext),
+    .init           = j2kenc_init,
+    .encode2        = encode_frame,
+    .close          = j2kenc_destroy,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_YUV444P, AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &j2k_class,
+};
diff --git a/libavcodec/jacosub.h b/libavcodec/jacosub.h
new file mode 100644
index 0000000..c3665ae
--- /dev/null
+++ b/libavcodec/jacosub.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * JACOsub shared utils
+ */
+
+#ifndef AVCODEC_JACOSUB_H
+#define AVCODEC_JACOSUB_H
+
+#include "libavutil/common.h"
+
+#define JSS_MAX_LINESIZE 512
+
+static av_always_inline int jss_whitespace(char c)
+{
+    return c == ' ' || (c >= '\t' && c <= '\r');
+}
+
+static av_always_inline const char *jss_skip_whitespace(const char *p)
+{
+    while (jss_whitespace(*p))
+        p++;
+    return p;
+}
+
+#endif /* AVCODEC_JACOSUB_H */
diff --git a/libavcodec/jacosubdec.c b/libavcodec/jacosubdec.c
new file mode 100644
index 0000000..cdb372a
--- /dev/null
+++ b/libavcodec/jacosubdec.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * JACOsub subtitle decoder
+ * @see http://unicorn.us.com/jacosub/jscripts.html
+ */
+
+#include <time.h>
+#include "ass.h"
+#include "jacosub.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "libavutil/time_internal.h"
+
+#undef time
+
+static int insert_text(AVBPrint *dst, const char *in, const char *arg)
+{
+    av_bprintf(dst, "%s", arg);
+    return 0;
+}
+
+static int insert_datetime(AVBPrint *dst, const char *in, const char *arg)
+{
+    char buf[16] = {0};
+    time_t now = time(0);
+    struct tm ltime;
+
+    localtime_r(&now, &ltime);
+    if (strftime(buf, sizeof(buf), arg, &ltime))
+        av_bprintf(dst, "%s", buf);
+    return 0;
+}
+
+static int insert_color(AVBPrint *dst, const char *in, const char *arg)
+{
+    return 1; // skip id
+}
+
+static int insert_font(AVBPrint *dst, const char *in, const char *arg)
+{
+    return 1; // skip id
+}
+
+static const struct {
+    const char *from;
+    const char *arg;
+    int (*func)(AVBPrint *dst, const char *in, const char *arg);
+} ass_codes_map[] = {
+    {"\\~", "~",        insert_text},       // tilde doesn't need escaping
+    {"~",   "{\\h}",    insert_text},       // hard space
+    {"\\n", "\\N",      insert_text},       // newline
+    {"\\D", "%d %b %Y", insert_datetime},   // current date
+    {"\\T", "%H:%M",    insert_datetime},   // current time
+    {"\\N", "{\\r}",    insert_text},       // reset to default style
+    {"\\I", "{\\i1}",   insert_text},       // italic on
+    {"\\i", "{\\i0}",   insert_text},       // italic off
+    {"\\B", "{\\b1}",   insert_text},       // bold on
+    {"\\b", "{\\b0}",   insert_text},       // bold off
+    {"\\U", "{\\u1}",   insert_text},       // underline on
+    {"\\u", "{\\u0}",   insert_text},       // underline off
+    {"\\C", "",         insert_color},      // TODO: color
+    {"\\F", "",         insert_font},       // TODO: font
+};
+
+enum {
+    ALIGN_VB = 1<<0, // vertical bottom, default
+    ALIGN_VM = 1<<1, // vertical middle
+    ALIGN_VT = 1<<2, // vertical top
+    ALIGN_JC = 1<<3, // justify center, default
+    ALIGN_JL = 1<<4, // justify left
+    ALIGN_JR = 1<<5, // justify right
+};
+
+static void jacosub_to_ass(AVCodecContext *avctx, AVBPrint *dst, const char *src)
+{
+    int i, valign = 0, halign = 0;
+    char c = av_toupper(*src);
+    char directives[128] = {0};
+
+    /* extract the optional directives */
+    if ((c >= 'A' && c <= 'Z') || c == '[') {
+        char *p    = directives;
+        char *pend = directives + sizeof(directives) - 1;
+
+        do *p++ = av_toupper(*src++);
+        while (*src && !jss_whitespace(*src) && p < pend);
+        *p = 0;
+        src = jss_skip_whitespace(src);
+    }
+
+    /* handle directives (TODO: handle more of them, and more reliably) */
+    if      (strstr(directives, "VB")) valign = ALIGN_VB;
+    else if (strstr(directives, "VM")) valign = ALIGN_VM;
+    else if (strstr(directives, "VT")) valign = ALIGN_VT;
+    if      (strstr(directives, "JC")) halign = ALIGN_JC;
+    else if (strstr(directives, "JL")) halign = ALIGN_JL;
+    else if (strstr(directives, "JR")) halign = ALIGN_JR;
+    if (valign || halign) {
+        if (!valign) valign = ALIGN_VB;
+        if (!halign) halign = ALIGN_JC;
+        switch (valign | halign) {
+        case ALIGN_VB | ALIGN_JL: av_bprintf(dst, "{\\an1}"); break; // bottom left
+        case ALIGN_VB | ALIGN_JC: av_bprintf(dst, "{\\an2}"); break; // bottom center
+        case ALIGN_VB | ALIGN_JR: av_bprintf(dst, "{\\an3}"); break; // bottom right
+        case ALIGN_VM | ALIGN_JL: av_bprintf(dst, "{\\an4}"); break; // middle left
+        case ALIGN_VM | ALIGN_JC: av_bprintf(dst, "{\\an5}"); break; // middle center
+        case ALIGN_VM | ALIGN_JR: av_bprintf(dst, "{\\an6}"); break; // middle right
+        case ALIGN_VT | ALIGN_JL: av_bprintf(dst, "{\\an7}"); break; // top left
+        case ALIGN_VT | ALIGN_JC: av_bprintf(dst, "{\\an8}"); break; // top center
+        case ALIGN_VT | ALIGN_JR: av_bprintf(dst, "{\\an9}"); break; // top right
+        }
+    }
+
+    /* process timed line */
+    while (*src && *src != '\n') {
+
+        /* text continue on the next line */
+        if (src[0] == '\\' && src[1] == '\n') {
+            src += 2;
+            while (jss_whitespace(*src))
+                src++;
+            continue;
+        }
+
+        /* special character codes */
+        for (i = 0; i < FF_ARRAY_ELEMS(ass_codes_map); i++) {
+            const char *from = ass_codes_map[i].from;
+            const char *arg  = ass_codes_map[i].arg;
+            size_t codemap_len = strlen(from);
+
+            if (!strncmp(src, from, codemap_len)) {
+                src += codemap_len;
+                src += ass_codes_map[i].func(dst, src, arg);
+                break;
+            }
+        }
+
+        /* simple char copy */
+        if (i == FF_ARRAY_ELEMS(ass_codes_map))
+            av_bprintf(dst, "%c", *src++);
+    }
+}
+
+static int jacosub_decode_frame(AVCodecContext *avctx,
+                                void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    FFASSDecoderContext *s = avctx->priv_data;
+
+    if (avpkt->size <= 0)
+        goto end;
+
+    if (*ptr) {
+        AVBPrint buffer;
+
+        // skip timers
+        ptr = jss_skip_whitespace(ptr);
+        ptr = strchr(ptr, ' '); if (!ptr) goto end; ptr++;
+        ptr = strchr(ptr, ' '); if (!ptr) goto end; ptr++;
+
+        av_bprint_init(&buffer, JSS_MAX_LINESIZE, JSS_MAX_LINESIZE);
+        jacosub_to_ass(avctx, &buffer, ptr);
+        ret = ff_ass_add_rect(sub, buffer.str, s->readorder++, 0, NULL, NULL);
+        av_bprint_finalize(&buffer, NULL);
+        if (ret < 0)
+            return ret;
+    }
+
+end:
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_jacosub_decoder = {
+    .name           = "jacosub",
+    .long_name      = NULL_IF_CONFIG_SMALL("JACOsub subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_JACOSUB,
+    .init           = ff_ass_subtitle_header_default,
+    .decode         = jacosub_decode_frame,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/jfdctint.c b/libavcodec/jfdctint.c
index ed6b7ff..6a39578 100644
--- a/libavcodec/jfdctint.c
+++ b/libavcodec/jfdctint.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jni.c b/libavcodec/jni.c
new file mode 100644
index 0000000..85dcf2a
--- /dev/null
+++ b/libavcodec/jni.c
@@ -0,0 +1,79 @@
+/*
+ * JNI public API functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "libavutil/error.h"
+#include "jni.h"
+
+#if CONFIG_JNI
+#include <jni.h>
+#include <pthread.h>
+
+#include "libavutil/log.h"
+#include "ffjni.h"
+
+void *java_vm;
+pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+int av_jni_set_java_vm(void *vm, void *log_ctx)
+{
+    int ret = 0;
+
+    pthread_mutex_lock(&lock);
+    if (java_vm == NULL) {
+        java_vm = vm;
+    } else if (java_vm != vm) {
+        ret = AVERROR(EINVAL);
+        av_log(log_ctx, AV_LOG_ERROR, "A Java virtual machine has already been set");
+    }
+    pthread_mutex_unlock(&lock);
+
+    return ret;
+}
+
+void *av_jni_get_java_vm(void *log_ctx)
+{
+    void *vm;
+
+    pthread_mutex_lock(&lock);
+    vm = java_vm;
+    pthread_mutex_unlock(&lock);
+
+    return vm;
+}
+
+#else
+
+int av_jni_set_java_vm(void *vm, void *log_ctx)
+{
+    return AVERROR(ENOSYS);
+}
+
+void *av_jni_get_java_vm(void *log_ctx)
+{
+    return NULL;
+}
+
+#endif
diff --git a/libavcodec/jni.h b/libavcodec/jni.h
new file mode 100644
index 0000000..dd99e92
--- /dev/null
+++ b/libavcodec/jni.h
@@ -0,0 +1,46 @@
+/*
+ * JNI public API functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_JNI_H
+#define AVCODEC_JNI_H
+
+/*
+ * Manually set a Java virtual machine which will be used to retrieve the JNI
+ * environment. Once a Java VM is set it cannot be changed afterwards, meaning
+ * you can call multiple times av_jni_set_java_vm with the same Java VM pointer
+ * however it will error out if you try to set a different Java VM.
+ *
+ * @param vm Java virtual machine
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int av_jni_set_java_vm(void *vm, void *log_ctx);
+
+/*
+ * Get the Java virtual machine which has been set with av_jni_set_java_vm.
+ *
+ * @param vm Java virtual machine
+ * @return a pointer to the Java virtual machine
+ */
+void *av_jni_get_java_vm(void *log_ctx);
+
+#endif /* AVCODEC_JNI_H */
diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index ef5ffa6..8e90980 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,12 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
+#include "internal.h"
 #include "jpeg2000.h"
 
 #define SHL(a, n) ((n) >= 0 ? (a) << (n) : (a) >> -(n))
@@ -36,13 +39,12 @@
 /* tag tree routines */
 
 /* allocate the memory for tag tree */
-static int32_t tag_tree_size(uint16_t w, uint16_t h)
+static int32_t tag_tree_size(int w, int h)
 {
-    uint32_t res = 0;
+    int64_t res = 0;
     while (w > 1 || h > 1) {
-        res += w * h;
-        if (res + 1 >= INT32_MAX)
-            return -1;
+        res += w * (int64_t)h;
+        av_assert0(res + 1 < INT32_MAX);
         w = (w + 1) >> 1;
         h = (h + 1) >> 1;
     }
@@ -56,8 +58,6 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
     int32_t tt_size;
 
     tt_size = tag_tree_size(w, h);
-    if (tt_size == -1)
-        return NULL;
 
     t = res = av_mallocz_array(tt_size, sizeof(*t));
     if (!res)
@@ -82,6 +82,16 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
     return res;
 }
 
+static void tag_tree_zero(Jpeg2000TgtNode *t, int w, int h)
+{
+    int i, siz = tag_tree_size(w, h);
+
+    for (i = 0; i < siz; i++) {
+        t[i].val = 0;
+        t[i].vis = 0;
+    }
+}
+
 uint8_t ff_jpeg2000_sigctxno_lut[256][4];
 
 static int getsigctxno(int flag, int bandno)
@@ -96,45 +106,33 @@ static int getsigctxno(int flag, int bandno)
         ((flag & JPEG2000_T1_SIG_NW) ? 1 : 0) +
         ((flag & JPEG2000_T1_SIG_SE) ? 1 : 0) +
         ((flag & JPEG2000_T1_SIG_SW) ? 1 : 0);
+
     if (bandno < 3) {
         if (bandno == 1)
             FFSWAP(int, h, v);
-        if (h == 2)
-            return 8;
+        if (h == 2) return 8;
         if (h == 1) {
-            if (v >= 1)
-                return 7;
-            if (d >= 1)
-                return 6;
+            if (v >= 1) return 7;
+            if (d >= 1) return 6;
             return 5;
         }
-        if (v == 2)
-            return 4;
-        if (v == 1)
-            return 3;
-        if (d >= 2)
-            return 2;
-        if (d == 1)
-            return 1;
+        if (v == 2) return 4;
+        if (v == 1) return 3;
+        if (d >= 2) return 2;
+        if (d == 1) return 1;
     } else {
-        if (d >= 3)
-            return 8;
+        if (d >= 3) return 8;
         if (d == 2) {
-            if (h + v >= 1)
-                return 7;
+            if (h+v >= 1) return 7;
             return 6;
         }
         if (d == 1) {
-            if (h + v >= 2)
-                return 5;
-            if (h + v == 1)
-                return 4;
+            if (h+v >= 2) return 5;
+            if (h+v == 1) return 4;
             return 3;
         }
-        if (h + v >= 2)
-            return 2;
-        if (h + v == 1)
-            return 1;
+        if (h+v >= 2) return 2;
+        if (h+v == 1) return 1;
     }
     return 0;
 }
@@ -175,25 +173,25 @@ void ff_jpeg2000_set_significance(Jpeg2000T1Context *t1, int x, int y,
 {
     x++;
     y++;
-    t1->flags[y][x] |= JPEG2000_T1_SIG;
+    t1->flags[(y) * t1->stride + x] |= JPEG2000_T1_SIG;
     if (negative) {
-        t1->flags[y][x + 1] |= JPEG2000_T1_SIG_W | JPEG2000_T1_SGN_W;
-        t1->flags[y][x - 1] |= JPEG2000_T1_SIG_E | JPEG2000_T1_SGN_E;
-        t1->flags[y + 1][x] |= JPEG2000_T1_SIG_N | JPEG2000_T1_SGN_N;
-        t1->flags[y - 1][x] |= JPEG2000_T1_SIG_S | JPEG2000_T1_SGN_S;
+        t1->flags[(y) * t1->stride + x + 1] |= JPEG2000_T1_SIG_W | JPEG2000_T1_SGN_W;
+        t1->flags[(y) * t1->stride + x - 1] |= JPEG2000_T1_SIG_E | JPEG2000_T1_SGN_E;
+        t1->flags[(y + 1) * t1->stride + x] |= JPEG2000_T1_SIG_N | JPEG2000_T1_SGN_N;
+        t1->flags[(y - 1) * t1->stride + x] |= JPEG2000_T1_SIG_S | JPEG2000_T1_SGN_S;
     } else {
-        t1->flags[y][x + 1] |= JPEG2000_T1_SIG_W;
-        t1->flags[y][x - 1] |= JPEG2000_T1_SIG_E;
-        t1->flags[y + 1][x] |= JPEG2000_T1_SIG_N;
-        t1->flags[y - 1][x] |= JPEG2000_T1_SIG_S;
+        t1->flags[(y) * t1->stride + x + 1] |= JPEG2000_T1_SIG_W;
+        t1->flags[(y) * t1->stride + x - 1] |= JPEG2000_T1_SIG_E;
+        t1->flags[(y + 1) * t1->stride + x] |= JPEG2000_T1_SIG_N;
+        t1->flags[(y - 1) * t1->stride + x] |= JPEG2000_T1_SIG_S;
     }
-    t1->flags[y + 1][x + 1] |= JPEG2000_T1_SIG_NW;
-    t1->flags[y + 1][x - 1] |= JPEG2000_T1_SIG_NE;
-    t1->flags[y - 1][x + 1] |= JPEG2000_T1_SIG_SW;
-    t1->flags[y - 1][x - 1] |= JPEG2000_T1_SIG_SE;
+    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_SIG_NW;
+    t1->flags[(y + 1) * t1->stride + x - 1] |= JPEG2000_T1_SIG_NE;
+    t1->flags[(y - 1) * t1->stride + x + 1] |= JPEG2000_T1_SIG_SW;
+    t1->flags[(y - 1) * t1->stride + x - 1] |= JPEG2000_T1_SIG_SE;
 }
 
-static const uint8_t lut_gain[2][4] = { { 0, 0, 0, 0 }, { 0, 1, 1, 2 } };
+// static const uint8_t lut_gain[2][4] = { { 0, 0, 0, 0 }, { 0, 1, 1, 2 } }; (unused)
 
 static void init_band_stepsize(AVCodecContext *avctx,
                                Jpeg2000Band *band,
@@ -206,29 +204,25 @@ static void init_band_stepsize(AVCodecContext *avctx,
      * see ISO/IEC 15444-1:2002 E.1 and A.6.4. */
     switch (qntsty->quantsty) {
         uint8_t gain;
-        int numbps;
     case JPEG2000_QSTY_NONE:
         /* TODO: to verify. No quantization in this case */
         band->f_stepsize = 1;
         break;
     case JPEG2000_QSTY_SI:
         /*TODO: Compute formula to implement. */
-        numbps = cbps +
-                 lut_gain[codsty->transform == FF_DWT53][bandno + (reslevelno > 0)];
-        band->f_stepsize = SHL(2048 + qntsty->mant[gbandno],
-                               2 + numbps - qntsty->expn[gbandno]);
-        break;
+//         numbps = cbps +
+//                  lut_gain[codsty->transform == FF_DWT53][bandno + (reslevelno > 0)];
+//         band->f_stepsize = SHL(2048 + qntsty->mant[gbandno],
+//                                2 + numbps - qntsty->expn[gbandno]);
+//         break;
     case JPEG2000_QSTY_SE:
         /* Exponent quantization step.
          * Formula:
          * delta_b = 2 ^ (R_b - expn_b) * (1 + (mant_b / 2 ^ 11))
          * R_b = R_I + log2 (gain_b )
          * see ISO/IEC 15444-1:2002 E.1.1 eqn. E-3 and E-4 */
-        /* TODO/WARN: value of log2 (gain_b ) not taken into account
-         * but it works (compared to OpenJPEG). Why?
-         * Further investigation needed. */
         gain            = cbps;
-        band->f_stepsize  = pow(2.0, gain - qntsty->expn[gbandno]);
+        band->f_stepsize  = ff_exp2fi(gain - qntsty->expn[gbandno]);
         band->f_stepsize *= qntsty->mant[gbandno] / 2048.0 + 1.0;
         break;
     default:
@@ -236,12 +230,29 @@ static void init_band_stepsize(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Unknown quantization format\n");
         break;
     }
-    /* FIXME: In OpenJPEG code stespize = stepsize * 0.5. Why?
+    if (codsty->transform != FF_DWT53) {
+        int lband = 0;
+        switch (bandno + (reslevelno > 0)) {
+            case 1:
+            case 2:
+                band->f_stepsize *= F_LFTG_X * 2;
+                lband = 1;
+                break;
+            case 3:
+                band->f_stepsize *= F_LFTG_X * F_LFTG_X * 4;
+                break;
+        }
+        if (codsty->transform == FF_DWT97) {
+            band->f_stepsize *= pow(F_LFTG_K, 2*(codsty->nreslevels2decode - reslevelno) + lband - 2);
+        }
+    }
+
+    band->i_stepsize = band->f_stepsize * (1 << 15);
+
+    /* FIXME: In OpenJPEG code stepsize = stepsize * 0.5. Why?
      * If not set output of entropic decoder is not correct. */
     if (!av_codec_is_encoder(avctx->codec))
         band->f_stepsize *= 0.5;
-
-    band->i_stepsize = band->f_stepsize * (1 << 16);
 }
 
 static int init_prec(Jpeg2000Band *band,
@@ -254,37 +265,40 @@ static int init_prec(Jpeg2000Band *band,
     Jpeg2000Prec *prec = band->prec + precno;
     int nb_codeblocks, cblkno;
 
+    prec->decoded_layers = 0;
+
     /* TODO: Explain formula for JPEG200 DCINEMA. */
     /* TODO: Verify with previous count of codeblocks per band */
 
     /* Compute P_x0 */
-    prec->coord[0][0] = (precno % reslevel->num_precincts_x) *
+    prec->coord[0][0] = ((band->coord[0][0] >> log2_band_prec_width) + precno % reslevel->num_precincts_x) *
                         (1 << log2_band_prec_width);
-    prec->coord[0][0] = FFMAX(prec->coord[0][0], band->coord[0][0]);
 
     /* Compute P_y0 */
-    prec->coord[1][0] = (precno / reslevel->num_precincts_x) *
+    prec->coord[1][0] = ((band->coord[1][0] >> log2_band_prec_height) + precno / reslevel->num_precincts_x) *
                         (1 << log2_band_prec_height);
-    prec->coord[1][0] = FFMAX(prec->coord[1][0], band->coord[1][0]);
 
     /* Compute P_x1 */
     prec->coord[0][1] = prec->coord[0][0] +
                         (1 << log2_band_prec_width);
+    prec->coord[0][0] = FFMAX(prec->coord[0][0], band->coord[0][0]);
     prec->coord[0][1] = FFMIN(prec->coord[0][1], band->coord[0][1]);
 
     /* Compute P_y1 */
     prec->coord[1][1] = prec->coord[1][0] +
                         (1 << log2_band_prec_height);
+    prec->coord[1][0] = FFMAX(prec->coord[1][0], band->coord[1][0]);
     prec->coord[1][1] = FFMIN(prec->coord[1][1], band->coord[1][1]);
 
     prec->nb_codeblocks_width =
-        ff_jpeg2000_ceildivpow2(prec->coord[0][1] -
-                                prec->coord[0][0],
-                                band->log2_cblk_width);
+        ff_jpeg2000_ceildivpow2(prec->coord[0][1],
+                                band->log2_cblk_width)
+        - (prec->coord[0][0] >> band->log2_cblk_width);
     prec->nb_codeblocks_height =
-        ff_jpeg2000_ceildivpow2(prec->coord[1][1] -
-                                prec->coord[1][0],
-                                band->log2_cblk_height);
+        ff_jpeg2000_ceildivpow2(prec->coord[1][1],
+                                band->log2_cblk_height)
+        - (prec->coord[1][0] >> band->log2_cblk_height);
+
 
     /* Tag trees initialization */
     prec->cblkincl =
@@ -299,22 +313,26 @@ static int init_prec(Jpeg2000Band *band,
     if (!prec->zerobits)
         return AVERROR(ENOMEM);
 
+    if (prec->nb_codeblocks_width * (uint64_t)prec->nb_codeblocks_height > INT_MAX) {
+        prec->cblk = NULL;
+        return AVERROR(ENOMEM);
+    }
     nb_codeblocks = prec->nb_codeblocks_width * prec->nb_codeblocks_height;
     prec->cblk = av_mallocz_array(nb_codeblocks, sizeof(*prec->cblk));
     if (!prec->cblk)
         return AVERROR(ENOMEM);
     for (cblkno = 0; cblkno < nb_codeblocks; cblkno++) {
         Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-        uint16_t Cx0, Cy0;
+        int Cx0, Cy0;
 
         /* Compute coordinates of codeblocks */
         /* Compute Cx0*/
-        Cx0 = (prec->coord[0][0] >> band->log2_cblk_width) << band->log2_cblk_width;
+        Cx0 = ((prec->coord[0][0]) >> band->log2_cblk_width) << band->log2_cblk_width;
         Cx0 = Cx0 + ((cblkno % prec->nb_codeblocks_width)  << band->log2_cblk_width);
         cblk->coord[0][0] = FFMAX(Cx0, prec->coord[0][0]);
 
         /* Compute Cy0*/
-        Cy0 = (prec->coord[1][0] >> band->log2_cblk_height) << band->log2_cblk_height;
+        Cy0 = ((prec->coord[1][0]) >> band->log2_cblk_height) << band->log2_cblk_height;
         Cy0 = Cy0 + ((cblkno / prec->nb_codeblocks_width)   << band->log2_cblk_height);
         cblk->coord[1][0] = FFMAX(Cy0, prec->coord[1][0]);
 
@@ -339,10 +357,8 @@ static int init_prec(Jpeg2000Band *band,
                                  comp->reslevel[reslevelno-1].coord[1][0];
         }
 
-        cblk->zero      = 0;
         cblk->lblock    = 3;
         cblk->length    = 0;
-        cblk->lengthinc = 0;
         cblk->npasses   = 0;
     }
 
@@ -366,7 +382,6 @@ static int init_band(AVCodecContext *avctx,
 
     init_band_stepsize(avctx, band, codsty, qntsty, bandno, gbandno, reslevelno, cbps);
 
-
     /* computation of tbx_0, tbx_1, tby_0, tby_1
      * see ISO/IEC 15444-1:2002 B.5 eq. B-15 and tbl B.1
      * codeblock width and height is computed for
@@ -376,7 +391,7 @@ static int init_band(AVCodecContext *avctx,
         for (i = 0; i < 2; i++)
             for (j = 0; j < 2; j++)
                 band->coord[i][j] =
-                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] - comp->coord_o[i][0],
+                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j],
                                             declvl - 1);
         log2_band_prec_width  = reslevel->log2_prec_width;
         log2_band_prec_height = reslevel->log2_prec_height;
@@ -392,8 +407,8 @@ static int init_band(AVCodecContext *avctx,
             for (j = 0; j < 2; j++)
                 /* Formula example for tbx_0 = ceildiv((tcx_0 - 2 ^ (declvl - 1) * x0_b) / declvl) */
                 band->coord[i][j] =
-                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] - comp->coord_o[i][0] -
-                                            (((bandno + 1 >> i) & 1) << declvl - 1),
+                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] -
+                                            (((bandno + 1 >> i) & 1LL) << declvl - 1),
                                             declvl);
         /* TODO: Manage case of 3 band offsets here or
          * in coding/decoding function? */
@@ -408,11 +423,10 @@ static int init_band(AVCodecContext *avctx,
         log2_band_prec_height = reslevel->log2_prec_height - 1;
     }
 
-    for (j = 0; j < 2; j++)
-        band->coord[0][j] = ff_jpeg2000_ceildiv(band->coord[0][j], dx);
-    for (j = 0; j < 2; j++)
-        band->coord[1][j] = ff_jpeg2000_ceildiv(band->coord[1][j], dy);
-
+    if (reslevel->num_precincts_x * (uint64_t)reslevel->num_precincts_y > INT_MAX) {
+        band->prec = NULL;
+        return AVERROR(ENOMEM);
+    }
     nb_precincts = reslevel->num_precincts_x * reslevel->num_precincts_y;
     band->prec = av_mallocz_array(nb_precincts, sizeof(*band->prec));
     if (!band->prec)
@@ -438,8 +452,8 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     int reslevelno, bandno, gbandno = 0, ret, i, j;
     uint32_t csize;
 
-    if (!codsty->nreslevels2decode) {
-        av_log(avctx, AV_LOG_ERROR, "nreslevels2decode uninitialized\n");
+    if (codsty->nreslevels2decode <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "nreslevels2decode %d invalid or uninitialized\n", codsty->nreslevels2decode);
         return AVERROR_INVALIDDATA;
     }
 
@@ -447,18 +461,28 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                    codsty->nreslevels2decode - 1,
                                    codsty->transform))
         return ret;
-    // component size comp->coord is uint16_t so ir cannot overflow
+
+    if (av_image_check_size(comp->coord[0][1] - comp->coord[0][0],
+                            comp->coord[1][1] - comp->coord[1][0], 0, avctx))
+        return AVERROR_INVALIDDATA;
     csize = (comp->coord[0][1] - comp->coord[0][0]) *
             (comp->coord[1][1] - comp->coord[1][0]);
+    if (comp->coord[0][1] - comp->coord[0][0] > 32768 ||
+        comp->coord[1][1] - comp->coord[1][0] > 32768) {
+        av_log(avctx, AV_LOG_ERROR, "component size too large\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     if (codsty->transform == FF_DWT97) {
+        csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->f_data);
         comp->i_data = NULL;
-        comp->f_data = av_malloc_array(csize, sizeof(*comp->f_data));
+        comp->f_data = av_mallocz_array(csize, sizeof(*comp->f_data));
         if (!comp->f_data)
             return AVERROR(ENOMEM);
     } else {
+        csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->i_data);
         comp->f_data = NULL;
-        comp->i_data = av_malloc_array(csize, sizeof(*comp->i_data));
+        comp->i_data = av_mallocz_array(csize, sizeof(*comp->i_data));
         if (!comp->i_data)
             return AVERROR(ENOMEM);
     }
@@ -480,6 +504,9 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
         // update precincts size: 2^n value
         reslevel->log2_prec_width  = codsty->log2_prec_widths[reslevelno];
         reslevel->log2_prec_height = codsty->log2_prec_heights[reslevelno];
+        if (!reslevel->log2_prec_width || !reslevel->log2_prec_height) {
+            return AVERROR_INVALIDDATA;
+        }
 
         /* Number of bands for each resolution level */
         if (reslevelno == 0)
@@ -514,6 +541,9 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
         if (!reslevel->band)
             return AVERROR(ENOMEM);
 
+        if (reslevel->num_precincts_x * (uint64_t)reslevel->num_precincts_y * reslevel->nbands > avctx->max_pixels / sizeof(*reslevel->band->prec))
+            return AVERROR(ENOMEM);
+
         for (bandno = 0; bandno < reslevel->nbands; bandno++, gbandno++) {
             ret = init_band(avctx, reslevel,
                             comp, codsty, qntsty,
@@ -526,6 +556,27 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     return 0;
 }
 
+void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
+{
+    int reslevelno, bandno, cblkno, precno;
+    for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
+        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+        for (bandno = 0; bandno < rlevel->nbands; bandno++) {
+            Jpeg2000Band *band = rlevel->band + bandno;
+            for(precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++) {
+                Jpeg2000Prec *prec = band->prec + precno;
+                tag_tree_zero(prec->zerobits, prec->nb_codeblocks_width, prec->nb_codeblocks_height);
+                tag_tree_zero(prec->cblkincl, prec->nb_codeblocks_width, prec->nb_codeblocks_height);
+                for (cblkno = 0; cblkno < prec->nb_codeblocks_width * prec->nb_codeblocks_height; cblkno++) {
+                    Jpeg2000Cblk *cblk = prec->cblk + cblkno;
+                    cblk->length = 0;
+                    cblk->lblock = 3;
+                }
+            }
+        }
+    }
+}
+
 void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 {
     int reslevelno, bandno, precno;
@@ -546,16 +597,24 @@ void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 
             band = reslevel->band + bandno;
             for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++) {
-                Jpeg2000Prec *prec;
-
-                if (!band->prec)
-                    continue;
-
-                prec = band->prec + precno;
-                av_freep(&prec->zerobits);
-                av_freep(&prec->cblkincl);
-                av_freep(&prec->cblk);
-
+                if (band->prec) {
+                    Jpeg2000Prec *prec = band->prec + precno;
+                    int nb_code_blocks = prec->nb_codeblocks_height * prec->nb_codeblocks_width;
+
+                    av_freep(&prec->zerobits);
+                    av_freep(&prec->cblkincl);
+                    if (prec->cblk) {
+                        int cblkno;
+                        for (cblkno = 0; cblkno < nb_code_blocks; cblkno ++) {
+                            Jpeg2000Cblk *cblk = &prec->cblk[cblkno];
+                            av_freep(&cblk->data);
+                            av_freep(&cblk->passes);
+                            av_freep(&cblk->lengthinc);
+                            av_freep(&cblk->data_start);
+                        }
+                        av_freep(&prec->cblk);
+                    }
+                }
             }
 
             av_freep(&band->prec);
diff --git a/libavcodec/jpeg2000.h b/libavcodec/jpeg2000.h
index b96b7e2..c429ca5 100644
--- a/libavcodec/jpeg2000.h
+++ b/libavcodec/jpeg2000.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,19 +58,20 @@ enum Jpeg2000Markers {
     JPEG2000_EOC = 0xffd9, // end of codestream
 };
 
+#define JPEG2000_SOP_FIXED_BYTES 0xFF910004
+#define JPEG2000_SOP_BYTE_LENGTH 6
+
 enum Jpeg2000Quantsty { // quantization style
     JPEG2000_QSTY_NONE, // no quantization
     JPEG2000_QSTY_SI,   // scalar derived
     JPEG2000_QSTY_SE    // scalar expounded
 };
 
-#define JPEG2000_MAX_CBLKW 64
-#define JPEG2000_MAX_CBLKH 64
-
-
-#define JPEG2000_MAX_DECLEVELS 32
+#define JPEG2000_MAX_DECLEVELS 33
 #define JPEG2000_MAX_RESLEVELS (JPEG2000_MAX_DECLEVELS + 1)
 
+#define JPEG2000_MAX_PASSES 100
+
 // T1 flags
 // flags determining significance of neighbor coefficients
 #define JPEG2000_T1_SIG_N  0x0001
@@ -118,9 +119,10 @@ enum Jpeg2000Quantsty { // quantization style
 #define JPEG2000_PGOD_CPRL      0x04  // Component-position-resolution level-layer progression
 
 typedef struct Jpeg2000T1Context {
-    int data[JPEG2000_MAX_CBLKW][JPEG2000_MAX_CBLKH];
-    int flags[JPEG2000_MAX_CBLKW + 2][JPEG2000_MAX_CBLKH + 2];
+    int data[6144];
+    uint16_t flags[6156];
     MqcState mqc;
+    int stride;
 } Jpeg2000T1Context;
 
 typedef struct Jpeg2000TgtNode {
@@ -130,8 +132,8 @@ typedef struct Jpeg2000TgtNode {
 } Jpeg2000TgtNode;
 
 typedef struct Jpeg2000CodingStyle {
-    uint8_t nreslevels;       // number of resolution levels
-    uint8_t nreslevels2decode; // number of resolution levels to decode
+    int nreslevels;           // number of resolution levels
+    int nreslevels2decode;    // number of resolution levels to decode
     uint8_t log2_cblk_width,
             log2_cblk_height; // exponent of codeblock size
     uint8_t transform;        // DWT type
@@ -146,34 +148,47 @@ typedef struct Jpeg2000CodingStyle {
 
 typedef struct Jpeg2000QuantStyle {
     uint8_t expn[JPEG2000_MAX_DECLEVELS * 3];  // quantization exponent
-    uint32_t mant[JPEG2000_MAX_DECLEVELS * 3]; // quantization mantissa
+    uint16_t mant[JPEG2000_MAX_DECLEVELS * 3]; // quantization mantissa
     uint8_t quantsty;      // quantization style
     uint8_t nguardbits;    // number of guard bits
 } Jpeg2000QuantStyle;
 
+typedef struct Jpeg2000Pass {
+    uint16_t rate;
+    int64_t disto;
+    uint8_t flushed[4];
+    int flushed_len;
+} Jpeg2000Pass;
+
 typedef struct Jpeg2000Cblk {
     uint8_t npasses;
     uint8_t ninclpasses; // number coding of passes included in codestream
     uint8_t nonzerobits;
     uint16_t length;
-    uint16_t lengthinc;
+    uint16_t *lengthinc;
+    uint8_t nb_lengthinc;
     uint8_t lblock;
-    uint8_t zero;
-    uint8_t data[8192];
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    uint8_t *data;
+    size_t data_allocated;
+    int nb_terminations;
+    int nb_terminationsinc;
+    int *data_start;
+    Jpeg2000Pass *passes;
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Cblk; // code block
 
 typedef struct Jpeg2000Prec {
-    uint16_t nb_codeblocks_width;
-    uint16_t nb_codeblocks_height;
+    int nb_codeblocks_width;
+    int nb_codeblocks_height;
     Jpeg2000TgtNode *zerobits;
     Jpeg2000TgtNode *cblkincl;
     Jpeg2000Cblk *cblk;
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int decoded_layers;
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Prec; // precinct
 
 typedef struct Jpeg2000Band {
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
     uint16_t log2_cblk_width, log2_cblk_height;
     int i_stepsize; // quantization stepsize
     float f_stepsize; // quantization stepsize
@@ -182,8 +197,8 @@ typedef struct Jpeg2000Band {
 
 typedef struct Jpeg2000ResLevel {
     uint8_t nbands;
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
-    uint16_t num_precincts_x, num_precincts_y; // number of precincts in x/y direction
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int num_precincts_x, num_precincts_y; // number of precincts in x/y direction
     uint8_t log2_prec_width, log2_prec_height; // exponent of precinct size
     Jpeg2000Band *band;
 } Jpeg2000ResLevel; // resolution level
@@ -193,19 +208,19 @@ typedef struct Jpeg2000Component {
     DWTContext dwt;
     float *f_data;
     int *i_data;
-    uint16_t coord[2][2];   // border coordinates {{x0, x1}, {y0, y1}} -- can be reduced with lowres option
-    uint16_t coord_o[2][2]; // border coordinates {{x0, x1}, {y0, y1}} -- original values from jpeg2000 headers
+    int coord[2][2];   // border coordinates {{x0, x1}, {y0, y1}} -- can be reduced with lowres option
+    int coord_o[2][2]; // border coordinates {{x0, x1}, {y0, y1}} -- original values from jpeg2000 headers
 } Jpeg2000Component;
 
 /* misc tools */
 static inline int ff_jpeg2000_ceildivpow2(int a, int b)
 {
-    return (a + (1 << b) - 1) >> b;
+    return -((-(int64_t)a) >> b);
 }
 
 static inline int ff_jpeg2000_ceildiv(int a, int b)
 {
-    return (a + b - 1) / b;
+    return (a + (int64_t)b - 1) / b;
 }
 
 /* TIER-1 routines */
@@ -252,6 +267,25 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                int cbps, int dx, int dy,
                                AVCodecContext *ctx);
 
+void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
+
 void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
 
+static inline int needs_termination(int style, int passno) {
+    if (style & JPEG2000_CBLK_BYPASS) {
+        int type = passno % 3;
+        passno /= 3;
+        if (type == 0 && passno > 2)
+            return 2;
+        if (type == 2 && passno > 2)
+            return 1;
+        if (style & JPEG2000_CBLK_TERMALL) {
+            return passno > 2 ? 2 : 1;
+        }
+    }
+    if (style & JPEG2000_CBLK_TERMALL)
+        return 1;
+    return 0;
+}
+
 #endif /* AVCODEC_JPEG2000_H */
diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index deab1e8..a4291bc 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,10 +26,15 @@
  */
 
 #include <inttypes.h>
+#include <math.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/thread.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -41,10 +46,28 @@
 #define JP2_SIG_TYPE    0x6A502020
 #define JP2_SIG_VALUE   0x0D0A870A
 #define JP2_CODESTREAM  0x6A703263
+#define JP2_HEADER      0x6A703268
 
 #define HAD_COC 0x01
 #define HAD_QCC 0x02
 
+#define MAX_POCS 32
+
+typedef struct Jpeg2000POCEntry {
+    uint16_t LYEpoc;
+    uint16_t CSpoc;
+    uint16_t CEpoc;
+    uint8_t RSpoc;
+    uint8_t REpoc;
+    uint8_t Ppoc;
+} Jpeg2000POCEntry;
+
+typedef struct Jpeg2000POC {
+    Jpeg2000POCEntry poc[MAX_POCS];
+    int nb_poc;
+    int is_default;
+} Jpeg2000POC;
+
 typedef struct Jpeg2000TilePart {
     uint8_t tile_index;                 // Tile index who refers the tile-part
     const uint8_t *tp_end;
@@ -58,14 +81,16 @@ typedef struct Jpeg2000Tile {
     uint8_t             properties[4];
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
-    Jpeg2000TilePart    tile_part[3];
+    Jpeg2000POC         poc;
+    Jpeg2000TilePart    tile_part[32];
     uint16_t tp_idx;                    // Tile-part index
+    int coord[2][2];                    // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Tile;
 
 typedef struct Jpeg2000DecoderContext {
     AVClass         *class;
     AVCodecContext  *avctx;
-    GetByteContext g;
+    GetByteContext  g;
 
     int             width, height;
     int             image_offset_x, image_offset_y;
@@ -76,16 +101,23 @@ typedef struct Jpeg2000DecoderContext {
     int             cdx[4], cdy[4];
     int             precision;
     int             ncomponents;
+    int             colour_space;
+    uint32_t        palette[256];
+    int8_t          pal8;
+    int             cdef[4];
     int             tile_width, tile_height;
     unsigned        numXtiles, numYtiles;
     int             maxtilelen;
+    AVRational      sar;
 
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
+    Jpeg2000POC         poc;
 
     int             bit_index;
 
-    int16_t         curtileno;
+    int             curtileno;
+
     Jpeg2000Tile    *tile;
     Jpeg2000DSPContext dsp;
 
@@ -100,6 +132,7 @@ typedef struct Jpeg2000DecoderContext {
 static int get_bits(Jpeg2000DecoderContext *s, int n)
 {
     int res = 0;
+
     while (--n >= 0) {
         res <<= 1;
         if (s->bit_index == 0) {
@@ -125,8 +158,10 @@ static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
     Jpeg2000TgtNode *stack[30];
     int sp = -1, curval = 0;
 
-    if (!node)
+    if (!node) {
+        av_log(s->avctx, AV_LOG_ERROR, "missing node\n");
         return AVERROR_INVALIDDATA;
+    }
 
     while (node && !node->vis) {
         stack[++sp] = node;
@@ -157,15 +192,83 @@ static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
     return curval;
 }
 
+static int pix_fmt_match(enum AVPixelFormat pix_fmt, int components,
+                         int bpc, uint32_t log2_chroma_wh, int pal8)
+{
+    int match = 1;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+
+    av_assert2(desc);
+
+    if (desc->nb_components != components) {
+        return 0;
+    }
+
+    switch (components) {
+    case 4:
+        match = match && desc->comp[3].depth >= bpc &&
+                         (log2_chroma_wh >> 14 & 3) == 0 &&
+                         (log2_chroma_wh >> 12 & 3) == 0;
+    case 3:
+        match = match && desc->comp[2].depth >= bpc &&
+                         (log2_chroma_wh >> 10 & 3) == desc->log2_chroma_w &&
+                         (log2_chroma_wh >>  8 & 3) == desc->log2_chroma_h;
+    case 2:
+        match = match && desc->comp[1].depth >= bpc &&
+                         (log2_chroma_wh >>  6 & 3) == desc->log2_chroma_w &&
+                         (log2_chroma_wh >>  4 & 3) == desc->log2_chroma_h;
+
+    case 1:
+        match = match && desc->comp[0].depth >= bpc &&
+                         (log2_chroma_wh >>  2 & 3) == 0 &&
+                         (log2_chroma_wh       & 3) == 0 &&
+                         (desc->flags & AV_PIX_FMT_FLAG_PAL) == pal8 * AV_PIX_FMT_FLAG_PAL;
+    }
+    return match;
+}
+
+// pix_fmts with lower bpp have to be listed before
+// similar pix_fmts with higher bpp.
+#define RGB_PIXEL_FORMATS   AV_PIX_FMT_PAL8,AV_PIX_FMT_RGB24,AV_PIX_FMT_RGBA,AV_PIX_FMT_RGB48,AV_PIX_FMT_RGBA64
+#define GRAY_PIXEL_FORMATS  AV_PIX_FMT_GRAY8,AV_PIX_FMT_GRAY8A,AV_PIX_FMT_GRAY16,AV_PIX_FMT_YA16
+#define YUV_PIXEL_FORMATS   AV_PIX_FMT_YUV410P,AV_PIX_FMT_YUV411P,AV_PIX_FMT_YUVA420P, \
+                            AV_PIX_FMT_YUV420P,AV_PIX_FMT_YUV422P,AV_PIX_FMT_YUVA422P, \
+                            AV_PIX_FMT_YUV440P,AV_PIX_FMT_YUV444P,AV_PIX_FMT_YUVA444P, \
+                            AV_PIX_FMT_YUV420P9,AV_PIX_FMT_YUV422P9,AV_PIX_FMT_YUV444P9, \
+                            AV_PIX_FMT_YUVA420P9,AV_PIX_FMT_YUVA422P9,AV_PIX_FMT_YUVA444P9, \
+                            AV_PIX_FMT_YUV420P10,AV_PIX_FMT_YUV422P10,AV_PIX_FMT_YUV444P10, \
+                            AV_PIX_FMT_YUVA420P10,AV_PIX_FMT_YUVA422P10,AV_PIX_FMT_YUVA444P10, \
+                            AV_PIX_FMT_YUV420P12,AV_PIX_FMT_YUV422P12,AV_PIX_FMT_YUV444P12, \
+                            AV_PIX_FMT_YUV420P14,AV_PIX_FMT_YUV422P14,AV_PIX_FMT_YUV444P14, \
+                            AV_PIX_FMT_YUV420P16,AV_PIX_FMT_YUV422P16,AV_PIX_FMT_YUV444P16, \
+                            AV_PIX_FMT_YUVA420P16,AV_PIX_FMT_YUVA422P16,AV_PIX_FMT_YUVA444P16
+#define XYZ_PIXEL_FORMATS   AV_PIX_FMT_XYZ12
+
+static const enum AVPixelFormat rgb_pix_fmts[]  = {RGB_PIXEL_FORMATS};
+static const enum AVPixelFormat gray_pix_fmts[] = {GRAY_PIXEL_FORMATS};
+static const enum AVPixelFormat yuv_pix_fmts[]  = {YUV_PIXEL_FORMATS};
+static const enum AVPixelFormat xyz_pix_fmts[]  = {XYZ_PIXEL_FORMATS,
+                                                   YUV_PIXEL_FORMATS};
+static const enum AVPixelFormat all_pix_fmts[]  = {RGB_PIXEL_FORMATS,
+                                                   GRAY_PIXEL_FORMATS,
+                                                   YUV_PIXEL_FORMATS,
+                                                   XYZ_PIXEL_FORMATS};
+
 /* marker segments */
 /* get sizes and offsets of image, tiles; number of components */
 static int get_siz(Jpeg2000DecoderContext *s)
 {
     int i;
     int ncomponents;
+    uint32_t log2_chroma_wh = 0;
+    const enum AVPixelFormat *possible_fmts = NULL;
+    int possible_fmts_nb = 0;
+    int ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 36)
+    if (bytestream2_get_bytes_left(&s->g) < 36) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for SIZ\n");
         return AVERROR_INVALIDDATA;
+    }
 
     s->avctx->profile = bytestream2_get_be16u(&s->g); // Rsiz
     s->width          = bytestream2_get_be32u(&s->g); // Width
@@ -178,6 +281,15 @@ static int get_siz(Jpeg2000DecoderContext *s)
     s->tile_offset_y  = bytestream2_get_be32u(&s->g); // YT0Siz
     ncomponents       = bytestream2_get_be16u(&s->g); // CSiz
 
+    if (s->image_offset_x || s->image_offset_y) {
+        avpriv_request_sample(s->avctx, "Support for image offsets");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (av_image_check_size2(s->width, s->height, s->avctx->max_pixels, AV_PIX_FMT_NONE, 0, s->avctx)) {
+        avpriv_request_sample(s->avctx, "Large Dimensions");
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (ncomponents <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid number of components: %d\n",
                s->ncomponents);
@@ -186,21 +298,32 @@ static int get_siz(Jpeg2000DecoderContext *s)
 
     if (ncomponents > 4) {
         avpriv_request_sample(s->avctx, "Support for %d components",
-                              s->ncomponents);
+                              ncomponents);
         return AVERROR_PATCHWELCOME;
     }
 
+    if (s->tile_offset_x < 0 || s->tile_offset_y < 0 ||
+        s->image_offset_x < s->tile_offset_x ||
+        s->image_offset_y < s->tile_offset_y ||
+        s->tile_width  + (int64_t)s->tile_offset_x <= s->image_offset_x ||
+        s->tile_height + (int64_t)s->tile_offset_y <= s->image_offset_y
+    ) {
+        av_log(s->avctx, AV_LOG_ERROR, "Tile offsets are invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     s->ncomponents = ncomponents;
 
-    if (s->tile_width <= 0 || s->tile_height <= 0 ||
-        s->tile_width > s->width || s->tile_height > s->height) {
+    if (s->tile_width <= 0 || s->tile_height <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid tile dimension %dx%d.\n",
                s->tile_width, s->tile_height);
         return AVERROR_INVALIDDATA;
     }
 
-    if (bytestream2_get_bytes_left(&s->g) < 3 * s->ncomponents)
+    if (bytestream2_get_bytes_left(&s->g) < 3 * s->ncomponents) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for %d components in SIZ\n", s->ncomponents);
         return AVERROR_INVALIDDATA;
+    }
 
     for (i = 0; i < s->ncomponents; i++) { // Ssiz_i XRsiz_i, YRsiz_i
         uint8_t x    = bytestream2_get_byteu(&s->g);
@@ -209,21 +332,25 @@ static int get_siz(Jpeg2000DecoderContext *s)
         s->sgnd[i]   = !!(x & 0x80);
         s->cdx[i]    = bytestream2_get_byteu(&s->g);
         s->cdy[i]    = bytestream2_get_byteu(&s->g);
-
-        if (s->cdx[i] != 1 || s->cdy[i] != 1) {
-            avpriv_request_sample(s->avctx,
-                                  "CDxy values %d %d for component %d",
-                                  s->cdx[i], s->cdy[i], i);
-            if (!s->cdx[i] || !s->cdy[i])
-                return AVERROR_INVALIDDATA;
-            else
-                return AVERROR_PATCHWELCOME;
+        if (   !s->cdx[i] || s->cdx[i] == 3 || s->cdx[i] > 4
+            || !s->cdy[i] || s->cdy[i] == 3 || s->cdy[i] > 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid sample separation %d/%d\n", s->cdx[i], s->cdy[i]);
+            return AVERROR_INVALIDDATA;
         }
+        log2_chroma_wh |= s->cdy[i] >> 1 << i * 4 | s->cdx[i] >> 1 << i * 4 + 2;
     }
 
     s->numXtiles = ff_jpeg2000_ceildiv(s->width  - s->tile_offset_x, s->tile_width);
     s->numYtiles = ff_jpeg2000_ceildiv(s->height - s->tile_offset_y, s->tile_height);
 
+    // There must be at least a SOT and SOD per tile, their minimum size is 14
+    if (s->numXtiles * (uint64_t)s->numYtiles > INT_MAX/sizeof(*s->tile) ||
+        s->numXtiles * s->numYtiles * 14LL > bytestream2_size(&s->g)
+    ) {
+        s->numXtiles = s->numYtiles = 0;
+        return AVERROR(EINVAL);
+    }
+
     s->tile = av_mallocz_array(s->numXtiles * s->numYtiles, sizeof(*s->tile));
     if (!s->tile) {
         s->numXtiles = s->numYtiles = 0;
@@ -239,41 +366,82 @@ static int get_siz(Jpeg2000DecoderContext *s)
     }
 
     /* compute image size with reduction factor */
-    s->avctx->width  = ff_jpeg2000_ceildivpow2(s->width  - s->image_offset_x,
-                                               s->reduction_factor);
-    s->avctx->height = ff_jpeg2000_ceildivpow2(s->height - s->image_offset_y,
-                                               s->reduction_factor);
+    ret = ff_set_dimensions(s->avctx,
+            ff_jpeg2000_ceildivpow2(s->width  - s->image_offset_x,
+                                               s->reduction_factor),
+            ff_jpeg2000_ceildivpow2(s->height - s->image_offset_y,
+                                               s->reduction_factor));
+    if (ret < 0)
+        return ret;
 
-    switch (s->ncomponents) {
-    case 1:
-        if (s->precision > 8)
-            s->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
-        else
-            s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-        break;
-    case 3:
-        switch (s->avctx->profile) {
-        case FF_PROFILE_JPEG2000_DCINEMA_2K:
-        case FF_PROFILE_JPEG2000_DCINEMA_4K:
-            /* XYZ color-space for digital cinema profiles */
-            s->avctx->pix_fmt = AV_PIX_FMT_XYZ12;
+    if (s->avctx->profile == FF_PROFILE_JPEG2000_DCINEMA_2K ||
+        s->avctx->profile == FF_PROFILE_JPEG2000_DCINEMA_4K) {
+        possible_fmts = xyz_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(xyz_pix_fmts);
+    } else {
+        switch (s->colour_space) {
+        case 16:
+            possible_fmts = rgb_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(rgb_pix_fmts);
+            break;
+        case 17:
+            possible_fmts = gray_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(gray_pix_fmts);
+            break;
+        case 18:
+            possible_fmts = yuv_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(yuv_pix_fmts);
             break;
         default:
-            if (s->precision > 8)
-                s->avctx->pix_fmt = AV_PIX_FMT_RGB48;
-            else
-                s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            possible_fmts = all_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(all_pix_fmts);
             break;
         }
-        break;
-    case 4:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGBA;
-        break;
-    default:
-        /* pixel format can not be identified */
-        s->avctx->pix_fmt = AV_PIX_FMT_NONE;
-        break;
     }
+    for (i = 0; i < possible_fmts_nb; ++i) {
+        if (pix_fmt_match(possible_fmts[i], ncomponents, s->precision, log2_chroma_wh, s->pal8)) {
+            s->avctx->pix_fmt = possible_fmts[i];
+            break;
+        }
+    }
+
+    if (i == possible_fmts_nb) {
+        if (ncomponents == 4 &&
+            s->cdy[0] == 1 && s->cdx[0] == 1 &&
+            s->cdy[1] == 1 && s->cdx[1] == 1 &&
+            s->cdy[2] == s->cdy[3] && s->cdx[2] == s->cdx[3]) {
+            if (s->precision == 8 && s->cdy[2] == 2 && s->cdx[2] == 2 && !s->pal8) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+                s->cdef[0] = 0;
+                s->cdef[1] = 1;
+                s->cdef[2] = 2;
+                s->cdef[3] = 3;
+                i = 0;
+            }
+        }
+    }
+
+
+    if (i == possible_fmts_nb) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Unknown pix_fmt, profile: %d, colour_space: %d, "
+               "components: %d, precision: %d\n"
+               "cdx[0]: %d, cdy[0]: %d\n"
+               "cdx[1]: %d, cdy[1]: %d\n"
+               "cdx[2]: %d, cdy[2]: %d\n"
+               "cdx[3]: %d, cdy[3]: %d\n",
+               s->avctx->profile, s->colour_space, ncomponents, s->precision,
+               s->cdx[0],
+               s->cdy[0],
+               ncomponents > 1 ? s->cdx[1] : 0,
+               ncomponents > 1 ? s->cdy[1] : 0,
+               ncomponents > 2 ? s->cdx[2] : 0,
+               ncomponents > 2 ? s->cdy[2] : 0,
+               ncomponents > 3 ? s->cdx[3] : 0,
+               ncomponents > 3 ? s->cdy[3] : 0);
+        return AVERROR_PATCHWELCOME;
+    }
+    s->avctx->bits_per_raw_sample = s->precision;
     return 0;
 }
 
@@ -282,24 +450,34 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
 {
     uint8_t byte;
 
-    if (bytestream2_get_bytes_left(&s->g) < 5)
+    if (bytestream2_get_bytes_left(&s->g) < 5) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COX\n");
         return AVERROR_INVALIDDATA;
+    }
 
     /*  nreslevels = number of resolution levels
                    = number of decomposition level +1 */
     c->nreslevels = bytestream2_get_byteu(&s->g) + 1;
-
-    if (c->nreslevels > JPEG2000_MAX_RESLEVELS)
+    if (c->nreslevels >= JPEG2000_MAX_RESLEVELS) {
+        av_log(s->avctx, AV_LOG_ERROR, "nreslevels %d is invalid\n", c->nreslevels);
         return AVERROR_INVALIDDATA;
+    }
+
+    if (c->nreslevels <= s->reduction_factor) {
+        /* we are forced to update reduction_factor as its requested value is
+           not compatible with this bitstream, and as we might have used it
+           already in setup earlier we have to fail this frame until
+           reinitialization is implemented */
+        av_log(s->avctx, AV_LOG_ERROR, "reduction_factor too large for this bitstream, max is %d\n", c->nreslevels - 1);
+        s->reduction_factor = c->nreslevels - 1;
+        return AVERROR(EINVAL);
+    }
 
     /* compute number of resolution levels to decode */
-    if (c->nreslevels < s->reduction_factor)
-        c->nreslevels2decode = 1;
-    else
-        c->nreslevels2decode = c->nreslevels - s->reduction_factor;
+    c->nreslevels2decode = c->nreslevels - s->reduction_factor;
 
-    c->log2_cblk_width  = bytestream2_get_byteu(&s->g) + 2; // cblk width
-    c->log2_cblk_height = bytestream2_get_byteu(&s->g) + 2; // cblk height
+    c->log2_cblk_width  = (bytestream2_get_byteu(&s->g) & 15) + 2; // cblk width
+    c->log2_cblk_height = (bytestream2_get_byteu(&s->g) & 15) + 2; // cblk height
 
     if (c->log2_cblk_width > 10 || c->log2_cblk_height > 10 ||
         c->log2_cblk_width + c->log2_cblk_height > 12) {
@@ -309,13 +487,17 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
 
     c->cblk_style = bytestream2_get_byteu(&s->g);
     if (c->cblk_style != 0) { // cblk style
-        avpriv_request_sample(s->avctx, "Support for extra cblk styles");
-        return AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "extra cblk styles %X\n", c->cblk_style);
+        if (c->cblk_style & JPEG2000_CBLK_BYPASS)
+            av_log(s->avctx, AV_LOG_WARNING, "Selective arithmetic coding bypass\n");
     }
     c->transform = bytestream2_get_byteu(&s->g); // DWT transformation type
     /* set integer 9/7 DWT in case of BITEXACT flag */
     if ((s->avctx->flags & AV_CODEC_FLAG_BITEXACT) && (c->transform == FF_DWT97))
         c->transform = FF_DWT97_INT;
+    else if (c->transform == FF_DWT53) {
+        s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
+    }
 
     if (c->csty & JPEG2000_CSTY_PREC) {
         int i;
@@ -323,6 +505,13 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
             byte = bytestream2_get_byte(&s->g);
             c->log2_prec_widths[i]  =  byte       & 0x0F;    // precinct PPx
             c->log2_prec_heights[i] = (byte >> 4) & 0x0F;    // precinct PPy
+            if (i)
+                if (c->log2_prec_widths[i] == 0 || c->log2_prec_heights[i] == 0) {
+                    av_log(s->avctx, AV_LOG_ERROR, "PPx %d PPy %d invalid\n",
+                           c->log2_prec_widths[i], c->log2_prec_heights[i]);
+                    c->log2_prec_widths[i] = c->log2_prec_heights[i] = 1;
+                    return AVERROR_INVALIDDATA;
+                }
         }
     } else {
         memset(c->log2_prec_widths , 15, sizeof(c->log2_prec_widths ));
@@ -338,8 +527,10 @@ static int get_cod(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
     Jpeg2000CodingStyle tmp;
     int compno, ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 5)
+    if (bytestream2_get_bytes_left(&s->g) < 5) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COD\n");
         return AVERROR_INVALIDDATA;
+    }
 
     tmp.csty = bytestream2_get_byteu(&s->g);
 
@@ -372,8 +563,10 @@ static int get_coc(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
 {
     int compno, ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 2)
+    if (bytestream2_get_bytes_left(&s->g) < 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COC\n");
         return AVERROR_INVALIDDATA;
+    }
 
     compno = bytestream2_get_byteu(&s->g);
 
@@ -410,7 +603,7 @@ static int get_qcx(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q)
     if (q->quantsty == JPEG2000_QSTY_NONE) {
         n -= 3;
         if (bytestream2_get_bytes_left(&s->g) < n ||
-            n > JPEG2000_MAX_DECLEVELS)
+            n > JPEG2000_MAX_DECLEVELS*3)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < n; i++)
             q->expn[i] = bytestream2_get_byteu(&s->g) >> 3;
@@ -428,7 +621,7 @@ static int get_qcx(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q)
     } else {
         n = (n - 3) >> 1;
         if (bytestream2_get_bytes_left(&s->g) < 2 * n ||
-            n > JPEG2000_MAX_DECLEVELS)
+            n > JPEG2000_MAX_DECLEVELS*3)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < n; i++) {
             x          = bytestream2_get_be16u(&s->g);
@@ -446,6 +639,8 @@ static int get_qcd(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q,
     Jpeg2000QuantStyle tmp;
     int compno, ret;
 
+    memset(&tmp, 0, sizeof(tmp));
+
     if ((ret = get_qcx(s, n, &tmp)) < 0)
         return ret;
     for (compno = 0; compno < s->ncomponents; compno++)
@@ -477,38 +672,100 @@ static int get_qcc(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q,
     return get_qcx(s, n - 1, q + compno);
 }
 
+static int get_poc(Jpeg2000DecoderContext *s, int size, Jpeg2000POC *p)
+{
+    int i;
+    int elem_size = s->ncomponents <= 257 ? 7 : 9;
+    Jpeg2000POC tmp = {{{0}}};
+
+    if (bytestream2_get_bytes_left(&s->g) < 5 || size < 2 + elem_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for POC\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (elem_size > 7) {
+        avpriv_request_sample(s->avctx, "Fat POC not supported");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    tmp.nb_poc = (size - 2) / elem_size;
+    if (tmp.nb_poc > MAX_POCS) {
+        avpriv_request_sample(s->avctx, "Too many POCs (%d)", tmp.nb_poc);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    for (i = 0; i<tmp.nb_poc; i++) {
+        Jpeg2000POCEntry *e = &tmp.poc[i];
+        e->RSpoc  = bytestream2_get_byteu(&s->g);
+        e->CSpoc  = bytestream2_get_byteu(&s->g);
+        e->LYEpoc = bytestream2_get_be16u(&s->g);
+        e->REpoc  = bytestream2_get_byteu(&s->g);
+        e->CEpoc  = bytestream2_get_byteu(&s->g);
+        e->Ppoc   = bytestream2_get_byteu(&s->g);
+        if (!e->CEpoc)
+            e->CEpoc = 256;
+        if (e->CEpoc > s->ncomponents)
+            e->CEpoc = s->ncomponents;
+        if (   e->RSpoc >= e->REpoc || e->REpoc > 33
+            || e->CSpoc >= e->CEpoc || e->CEpoc > s->ncomponents
+            || !e->LYEpoc) {
+            av_log(s->avctx, AV_LOG_ERROR, "POC Entry %d is invalid (%d, %d, %d, %d, %d, %d)\n", i,
+                e->RSpoc, e->CSpoc, e->LYEpoc, e->REpoc, e->CEpoc, e->Ppoc
+            );
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (!p->nb_poc || p->is_default) {
+        *p = tmp;
+    } else {
+        if (p->nb_poc + tmp.nb_poc > MAX_POCS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for POC\n");
+            return AVERROR_INVALIDDATA;
+        }
+        memcpy(p->poc + p->nb_poc, tmp.poc, tmp.nb_poc * sizeof(tmp.poc[0]));
+        p->nb_poc += tmp.nb_poc;
+    }
+
+    p->is_default = 0;
+
+    return 0;
+}
+
+
 /* Get start of tile segment. */
 static int get_sot(Jpeg2000DecoderContext *s, int n)
 {
     Jpeg2000TilePart *tp;
     uint16_t Isot;
     uint32_t Psot;
-    uint8_t TPsot;
+    unsigned TPsot;
 
     if (bytestream2_get_bytes_left(&s->g) < 8)
         return AVERROR_INVALIDDATA;
 
+    s->curtileno = 0;
     Isot = bytestream2_get_be16u(&s->g);        // Isot
     if (Isot >= s->numXtiles * s->numYtiles)
         return AVERROR_INVALIDDATA;
 
-    if (Isot) {
-        avpriv_request_sample(s->avctx, "Support for more than one tile");
-        return AVERROR_PATCHWELCOME;
-    }
+    s->curtileno = Isot;
     Psot  = bytestream2_get_be32u(&s->g);       // Psot
     TPsot = bytestream2_get_byteu(&s->g);       // TPsot
 
     /* Read TNSot but not used */
     bytestream2_get_byteu(&s->g);               // TNsot
 
-    if (Psot > bytestream2_get_bytes_left(&s->g) + n + 2) {
+    if (!Psot)
+        Psot = bytestream2_get_bytes_left(&s->g) - 2 + n + 2;
+
+    if (Psot > bytestream2_get_bytes_left(&s->g) - 2 + n + 2) {
         av_log(s->avctx, AV_LOG_ERROR, "Psot %"PRIu32" too big\n", Psot);
         return AVERROR_INVALIDDATA;
     }
 
     if (TPsot >= FF_ARRAY_ELEMS(s->tile[Isot].tile_part)) {
-        avpriv_request_sample(s->avctx, "Support for %"PRIu8" components", TPsot);
+        avpriv_request_sample(s->avctx, "Too many tile parts");
         return AVERROR_PATCHWELCOME;
     }
 
@@ -523,6 +780,8 @@ static int get_sot(Jpeg2000DecoderContext *s, int n)
         /* copy defaults */
         memcpy(tile->codsty, s->codsty, s->ncomponents * sizeof(Jpeg2000CodingStyle));
         memcpy(tile->qntsty, s->qntsty, s->ncomponents * sizeof(Jpeg2000QuantStyle));
+        memcpy(&tile->poc  , &s->poc  , sizeof(tile->poc));
+        tile->poc.is_default = 1;
     }
 
     return 0;
@@ -570,6 +829,22 @@ static uint8_t get_tlm(Jpeg2000DecoderContext *s, int n)
     return 0;
 }
 
+static uint8_t get_plt(Jpeg2000DecoderContext *s, int n)
+{
+    int i;
+
+    av_log(s->avctx, AV_LOG_DEBUG,
+            "PLT marker at pos 0x%X\n", bytestream2_tell(&s->g) - 4);
+
+    /*Zplt =*/ bytestream2_get_byte(&s->g);
+
+    for (i = 0; i < n - 3; i++) {
+        bytestream2_get_byte(&s->g);
+    }
+
+    return 0;
+}
+
 static int init_tile(Jpeg2000DecoderContext *s, int tileno)
 {
     int compno;
@@ -580,16 +855,27 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
     if (!tile->comp)
         return AVERROR(ENOMEM);
 
+    tile->coord[0][0] = av_clip(tilex       * (int64_t)s->tile_width  + s->tile_offset_x, s->image_offset_x, s->width);
+    tile->coord[0][1] = av_clip((tilex + 1) * (int64_t)s->tile_width  + s->tile_offset_x, s->image_offset_x, s->width);
+    tile->coord[1][0] = av_clip(tiley       * (int64_t)s->tile_height + s->tile_offset_y, s->image_offset_y, s->height);
+    tile->coord[1][1] = av_clip((tiley + 1) * (int64_t)s->tile_height + s->tile_offset_y, s->image_offset_y, s->height);
+
     for (compno = 0; compno < s->ncomponents; compno++) {
         Jpeg2000Component *comp = tile->comp + compno;
         Jpeg2000CodingStyle *codsty = tile->codsty + compno;
         Jpeg2000QuantStyle  *qntsty = tile->qntsty + compno;
         int ret; // global bandno
 
-        comp->coord_o[0][0] = FFMAX(tilex       * s->tile_width  + s->tile_offset_x, s->image_offset_x);
-        comp->coord_o[0][1] = FFMIN((tilex + 1) * s->tile_width  + s->tile_offset_x, s->width);
-        comp->coord_o[1][0] = FFMAX(tiley       * s->tile_height + s->tile_offset_y, s->image_offset_y);
-        comp->coord_o[1][1] = FFMIN((tiley + 1) * s->tile_height + s->tile_offset_y, s->height);
+        comp->coord_o[0][0] = tile->coord[0][0];
+        comp->coord_o[0][1] = tile->coord[0][1];
+        comp->coord_o[1][0] = tile->coord[1][0];
+        comp->coord_o[1][1] = tile->coord[1][1];
+        if (compno) {
+            comp->coord_o[0][0] /= s->cdx[compno];
+            comp->coord_o[0][1] /= s->cdx[compno];
+            comp->coord_o[1][0] /= s->cdy[compno];
+            comp->coord_o[1][1] /= s->cdy[compno];
+        }
 
         comp->coord[0][0] = ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], s->reduction_factor);
         comp->coord[0][1] = ff_jpeg2000_ceildivpow2(comp->coord_o[0][1], s->reduction_factor);
@@ -631,12 +917,26 @@ static int getlblockinc(Jpeg2000DecoderContext *s)
     return res;
 }
 
-static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
+static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile, int *tp_index,
                                   Jpeg2000CodingStyle *codsty,
                                   Jpeg2000ResLevel *rlevel, int precno,
                                   int layno, uint8_t *expn, int numgbits)
 {
     int bandno, cblkno, ret, nb_code_blocks;
+    int cwsno;
+
+    if (layno < rlevel->band[0].prec[precno].decoded_layers)
+        return 0;
+    rlevel->band[0].prec[precno].decoded_layers = layno + 1;
+
+    if (bytestream2_get_bytes_left(&s->g) == 0 && s->bit_index == 8) {
+        if (*tp_index < FF_ARRAY_ELEMS(tile->tile_part) - 1) {
+            s->g = tile->tile_part[++(*tp_index)].tpg;
+        }
+    }
+
+    if (bytestream2_peek_be32(&s->g) == JPEG2000_SOP_FIXED_BYTES)
+        bytestream2_skip(&s->g, JPEG2000_SOP_BYTE_LENGTH);
 
     if (!(ret = get_bits(s, 1))) {
         jpeg2000_flush(s);
@@ -656,6 +956,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         for (cblkno = 0; cblkno < nb_code_blocks; cblkno++) {
             Jpeg2000Cblk *cblk = prec->cblk + cblkno;
             int incl, newpasses, llen;
+            void *tmp;
 
             if (cblk->npasses)
                 incl = get_bits(s, 1);
@@ -669,28 +970,71 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
             if (!cblk->npasses) {
                 int v = expn[bandno] + numgbits - 1 -
                         tag_tree_decode(s, prec->zerobits + cblkno, 100);
-                if (v < 0) {
+                if (v < 0 || v > 30) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "nonzerobits %d invalid\n", v);
+                           "nonzerobits %d invalid or unsupported\n", v);
                     return AVERROR_INVALIDDATA;
                 }
                 cblk->nonzerobits = v;
             }
             if ((newpasses = getnpasses(s)) < 0)
                 return newpasses;
+            av_assert2(newpasses > 0);
+            if (cblk->npasses + newpasses >= JPEG2000_MAX_PASSES) {
+                avpriv_request_sample(s->avctx, "Too many passes");
+                return AVERROR_PATCHWELCOME;
+            }
             if ((llen = getlblockinc(s)) < 0)
                 return llen;
-            cblk->lblock += llen;
-            if ((ret = get_bits(s, av_log2(newpasses) + cblk->lblock)) < 0)
-                return ret;
-            if (ret > sizeof(cblk->data)) {
+            if (cblk->lblock + llen + av_log2(newpasses) > 16) {
                 avpriv_request_sample(s->avctx,
-                                      "Block with lengthinc greater than %zu",
-                                      sizeof(cblk->data));
+                                      "Block with length beyond 16 bits");
                 return AVERROR_PATCHWELCOME;
             }
-            cblk->lengthinc = ret;
-            cblk->npasses  += newpasses;
+
+            cblk->lblock += llen;
+
+            cblk->nb_lengthinc = 0;
+            cblk->nb_terminationsinc = 0;
+            av_free(cblk->lengthinc);
+            cblk->lengthinc  = av_mallocz_array(newpasses    , sizeof(*cblk->lengthinc));
+            if (!cblk->lengthinc)
+                return AVERROR(ENOMEM);
+            tmp = av_realloc_array(cblk->data_start, cblk->nb_terminations + newpasses + 1, sizeof(*cblk->data_start));
+            if (!tmp)
+                return AVERROR(ENOMEM);
+            cblk->data_start = tmp;
+            do {
+                int newpasses1 = 0;
+
+                while (newpasses1 < newpasses) {
+                    newpasses1 ++;
+                    if (needs_termination(codsty->cblk_style, cblk->npasses + newpasses1 - 1)) {
+                        cblk->nb_terminationsinc ++;
+                        break;
+                    }
+                }
+
+                if ((ret = get_bits(s, av_log2(newpasses1) + cblk->lblock)) < 0)
+                    return ret;
+                if (ret > cblk->data_allocated) {
+                    size_t new_size = FFMAX(2*cblk->data_allocated, ret);
+                    void *new = av_realloc(cblk->data, new_size);
+                    if (new) {
+                        cblk->data = new;
+                        cblk->data_allocated = new_size;
+                    }
+                }
+                if (ret > cblk->data_allocated) {
+                    avpriv_request_sample(s->avctx,
+                                        "Block with lengthinc greater than %"SIZE_SPECIFIER"",
+                                        cblk->data_allocated);
+                    return AVERROR_PATCHWELCOME;
+                }
+                cblk->lengthinc[cblk->nb_lengthinc++] = ret;
+                cblk->npasses  += newpasses1;
+                newpasses -= newpasses1;
+            } while(newpasses);
         }
     }
     jpeg2000_flush(s);
@@ -699,7 +1043,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         if (bytestream2_peek_be16(&s->g) == JPEG2000_EPH)
             bytestream2_skip(&s->g, 2);
         else
-            av_log(s->avctx, AV_LOG_ERROR, "EPH marker not found.\n");
+            av_log(s->avctx, AV_LOG_ERROR, "EPH marker not found. instead %X\n", bytestream2_peek_be32(&s->g));
     }
 
     for (bandno = 0; bandno < rlevel->nbands; bandno++) {
@@ -709,148 +1053,348 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         nb_code_blocks = prec->nb_codeblocks_height * prec->nb_codeblocks_width;
         for (cblkno = 0; cblkno < nb_code_blocks; cblkno++) {
             Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-            if (bytestream2_get_bytes_left(&s->g) < cblk->lengthinc)
-                return AVERROR_INVALIDDATA;
-            /* Code-block data can be empty. In that case initialize data
-             * with 0xFFFF. */
-            if (cblk->lengthinc > 0) {
-                bytestream2_get_bufferu(&s->g, cblk->data, cblk->lengthinc);
-            } else {
-                cblk->data[0] = 0xFF;
-                cblk->data[1] = 0xFF;
-            }
-            cblk->length   += cblk->lengthinc;
-            cblk->lengthinc = 0;
+            if (!cblk->nb_terminationsinc && !cblk->lengthinc)
+                continue;
+            for (cwsno = 0; cwsno < cblk->nb_lengthinc; cwsno ++) {
+                if (cblk->data_allocated < cblk->length + cblk->lengthinc[cwsno] + 4) {
+                    size_t new_size = FFMAX(2*cblk->data_allocated, cblk->length + cblk->lengthinc[cwsno] + 4);
+                    void *new = av_realloc(cblk->data, new_size);
+                    if (new) {
+                        cblk->data = new;
+                        cblk->data_allocated = new_size;
+                    }
+                }
+                if (   bytestream2_get_bytes_left(&s->g) < cblk->lengthinc[cwsno]
+                    || cblk->data_allocated < cblk->length + cblk->lengthinc[cwsno] + 4
+                ) {
+                    av_log(s->avctx, AV_LOG_ERROR,
+                        "Block length %"PRIu16" or lengthinc %d is too large, left %d\n",
+                        cblk->length, cblk->lengthinc[cwsno], bytestream2_get_bytes_left(&s->g));
+                    return AVERROR_INVALIDDATA;
+                }
 
-            if (cblk->length > sizeof(cblk->data)) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "Block length %"PRIu16" > data size %zd\n",
-                       cblk->length, sizeof(cblk->data));
-                return AVERROR_INVALIDDATA;
+                bytestream2_get_bufferu(&s->g, cblk->data + cblk->length, cblk->lengthinc[cwsno]);
+                cblk->length   += cblk->lengthinc[cwsno];
+                cblk->lengthinc[cwsno] = 0;
+                if (cblk->nb_terminationsinc) {
+                    cblk->nb_terminationsinc--;
+                    cblk->nb_terminations++;
+                    cblk->data[cblk->length++] = 0xFF;
+                    cblk->data[cblk->length++] = 0xFF;
+                    cblk->data_start[cblk->nb_terminations] = cblk->length;
+                }
             }
+            av_freep(&cblk->lengthinc);
         }
     }
     return 0;
 }
 
-static int decode_pgod_lrcp(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+static int jpeg2000_decode_packets_po_iteration(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
+                                             int RSpoc, int CSpoc,
+                                             int LYEpoc, int REpoc, int CEpoc,
+                                             int Ppoc, int *tp_index)
 {
+    int ret = 0;
     int layno, reslevelno, compno, precno, ok_reslevel;
-    int ret;
+    int x, y;
+    int step_x, step_y;
 
-    for (layno = 0; layno < tile->codsty[0].nlayers; layno++) {
+    switch (Ppoc) {
+    case JPEG2000_PGOD_RLCP:
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order RLCP\n");
         ok_reslevel = 1;
-        for (reslevelno = 0; ok_reslevel; reslevelno++) {
+        for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
             ok_reslevel = 0;
-            for (compno = 0; compno < s->ncomponents; compno++) {
-                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-                Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
-                if (reslevelno < codsty->nreslevels) {
-                    Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
-                                               reslevelno;
-                    ok_reslevel = 1;
-                    for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
-                        if ((ret = jpeg2000_decode_packet(s,
-                                                          codsty, rlevel,
-                                                          precno, layno,
-                                                          qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
-                                                          qntsty->nguardbits)) < 0)
-                            return ret;
+            for (layno = 0; layno < LYEpoc; layno++) {
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    if (reslevelno < codsty->nreslevels) {
+                        Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
+                                                reslevelno;
+                        ok_reslevel = 1;
+                        for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                              codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                    }
                 }
             }
         }
-    }
-
-    return 0;
-}
-
-static int decode_pgod_cprl(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
-{
-    int layno, reslevelno, compno, precno;
-    int ret, x, y;
-
-    for (compno = 0; compno < s->ncomponents; compno++) {
-        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-        Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
-
-        /* Set bit stream buffer address according to tile-part.
-         * For DCinema one tile-part per component, so can be
-         * indexed by component. */
-        s->g = tile->tile_part[compno].tpg;
-
-        /* Position loop (y axis)
-         * TODO: Automate computing of step 256.
-         * Fixed here, but to be computed before entering here. */
-        for (y = 0; y < s->height; y += 256) {
-            /* Position loop (y axis)
-             * TODO: automate computing of step 256.
-             * Fixed here, but to be computed before entering here. */
-            for (x = 0; x < s->width; x += 256) {
-                for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
-                    uint16_t prcx, prcy;
-                    uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
-                    Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel + reslevelno;
-
-                    if (!((y % (1 << (rlevel->log2_prec_height + reducedresno)) == 0) ||
-                          (y == 0))) // TODO: 2nd condition simplified as try0 always =0 for dcinema
-                        continue;
-
-                    if (!((x % (1 << (rlevel->log2_prec_width + reducedresno)) == 0) ||
-                          (x == 0))) // TODO: 2nd condition simplified as try0 always =0 for dcinema
-                        continue;
+        break;
 
-                    // check if a precinct exists
-                    prcx   = ff_jpeg2000_ceildivpow2(x, reducedresno) >> rlevel->log2_prec_width;
-                    prcy   = ff_jpeg2000_ceildivpow2(y, reducedresno) >> rlevel->log2_prec_height;
-                    precno = prcx + rlevel->num_precincts_x * prcy;
-                    for (layno = 0; layno < tile->codsty[0].nlayers; layno++) {
-                        if ((ret = jpeg2000_decode_packet(s, codsty, rlevel,
-                                                          precno, layno,
-                                                          qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
-                                                          qntsty->nguardbits)) < 0)
-                            return ret;
+    case JPEG2000_PGOD_LRCP:
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order LRCP\n");
+        for (layno = 0; layno < LYEpoc; layno++) {
+            ok_reslevel = 1;
+            for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
+                ok_reslevel = 0;
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    if (reslevelno < codsty->nreslevels) {
+                        Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
+                                                reslevelno;
+                        ok_reslevel = 1;
+                        for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                              codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
                     }
                 }
             }
         }
-    }
-
-    return 0;
-}
-
-static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
-{
-    int ret = 0;
-
-    s->bit_index = 8;
-    switch (tile->codsty[0].prog_order) {
-    case JPEG2000_PGOD_LRCP:
-        ret = decode_pgod_lrcp(s, tile);
         break;
 
     case JPEG2000_PGOD_CPRL:
-        ret = decode_pgod_cprl(s, tile);
-        break;
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order CPRL\n");
+        for (compno = CSpoc; compno < CEpoc; compno++) {
+            Jpeg2000Component *comp     = tile->comp + compno;
+            Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+            Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+            step_x = 32;
+            step_y = 32;
+
+            if (RSpoc >= FFMIN(codsty->nreslevels, REpoc))
+                continue;
 
-    case JPEG2000_PGOD_RLCP:
-        avpriv_request_sample(s->avctx, "Progression order RLCP");
-        ret = AVERROR_PATCHWELCOME;
+            for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+            }
+            av_assert0(step_x < 32 && step_y < 32);
+            step_x = 1<<step_x;
+            step_y = 1<<step_y;
+
+            for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+                for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                    for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                        unsigned prcx, prcy;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                        int xc = x / s->cdx[compno];
+                        int yc = y / s->cdy[compno];
+
+                        if (yc % (1LL << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1LL << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                        for (layno = 0; layno < LYEpoc; layno++) {
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index, codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                        }
+                    }
+                }
+            }
+        }
         break;
 
     case JPEG2000_PGOD_RPCL:
-        avpriv_request_sample(s->avctx, "Progression order RPCL");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "Progression order RPCL\n");
+        ok_reslevel = 1;
+        for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
+            ok_reslevel = 0;
+            step_x = 30;
+            step_y = 30;
+            for (compno = CSpoc; compno < CEpoc; compno++) {
+                Jpeg2000Component *comp     = tile->comp + compno;
+                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+                if (reslevelno < codsty->nreslevels) {
+                    uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                    Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                    step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                    step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+                }
+            }
+            step_x = 1<<step_x;
+            step_y = 1<<step_y;
+
+            for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+                for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                    for (compno = CSpoc; compno < CEpoc; compno++) {
+                        Jpeg2000Component *comp     = tile->comp + compno;
+                        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                        Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                        unsigned prcx, prcy;
+
+                        int xc = x / s->cdx[compno];
+                        int yc = y / s->cdy[compno];
+
+                        if (reslevelno >= codsty->nreslevels)
+                            continue;
+
+                        if (yc % (1LL << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1LL << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        ok_reslevel = 1;
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                            for (layno = 0; layno < LYEpoc; layno++) {
+                                if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                                codsty, rlevel,
+                                                                precno, layno,
+                                                                qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                                qntsty->nguardbits)) < 0)
+                                    return ret;
+                            }
+                    }
+                }
+            }
+        }
         break;
 
     case JPEG2000_PGOD_PCRL:
-        avpriv_request_sample(s->avctx, "Progression order PCRL");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "Progression order PCRL\n");
+        step_x = 32;
+        step_y = 32;
+        for (compno = CSpoc; compno < CEpoc; compno++) {
+            Jpeg2000Component *comp     = tile->comp + compno;
+            Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+            for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+            }
+        }
+        if (step_x >= 31 || step_y >= 31){
+            avpriv_request_sample(s->avctx, "PCRL with large step");
+            return AVERROR_PATCHWELCOME;
+        }
+        step_x = 1<<step_x;
+        step_y = 1<<step_y;
+
+        for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+            for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000Component *comp     = tile->comp + compno;
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    int xc = x / s->cdx[compno];
+                    int yc = y / s->cdy[compno];
+
+                    for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                        unsigned prcx, prcy;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+
+                        if (yc % (1LL << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1LL << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                        for (layno = 0; layno < LYEpoc; layno++) {
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index, codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                        }
+                    }
+                }
+            }
+        }
         break;
 
     default:
         break;
     }
 
+    return ret;
+}
+
+static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+{
+    int ret = AVERROR_BUG;
+    int i;
+    int tp_index = 0;
+
+    s->bit_index = 8;
+    if (tile->poc.nb_poc) {
+        for (i=0; i<tile->poc.nb_poc; i++) {
+            Jpeg2000POCEntry *e = &tile->poc.poc[i];
+            ret = jpeg2000_decode_packets_po_iteration(s, tile,
+                e->RSpoc, e->CSpoc,
+                FFMIN(e->LYEpoc, tile->codsty[0].nlayers),
+                e->REpoc,
+                FFMIN(e->CEpoc, s->ncomponents),
+                e->Ppoc, &tp_index
+                );
+            if (ret < 0)
+                return ret;
+        }
+    } else {
+        ret = jpeg2000_decode_packets_po_iteration(s, tile,
+            0, 0,
+            tile->codsty[0].nlayers,
+            33,
+            s->ncomponents,
+            tile->codsty[0].prog_order,
+            &tp_index
+        );
+    }
     /* EOC marker reached */
     bytestream2_skip(&s->g, 2);
 
@@ -859,7 +1403,7 @@ static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
 
 /* TIER-1 routines */
 static void decode_sigpass(Jpeg2000T1Context *t1, int width, int height,
-                           int bpno, int bandno, int bpass_csty_symbol,
+                           int bpno, int bandno,
                            int vert_causal_ctx_csty_symbol)
 {
     int mask = 3 << (bpno - 1), y0, x, y;
@@ -867,29 +1411,29 @@ static void decode_sigpass(Jpeg2000T1Context *t1, int width, int height,
     for (y0 = 0; y0 < height; y0 += 4)
         for (x = 0; x < width; x++)
             for (y = y0; y < height && y < y0 + 4; y++) {
-                if ((t1->flags[y+1][x+1] & JPEG2000_T1_SIG_NB)
-                && !(t1->flags[y+1][x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
-                    int flags_mask = -1;
-                    if (vert_causal_ctx_csty_symbol && y == y0 + 3)
-                        flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE);
-                    if (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[y+1][x+1] & flags_mask, bandno))) {
-                        int xorbit, ctxno = ff_jpeg2000_getsgnctxno(t1->flags[y+1][x+1], &xorbit);
-                        if (bpass_csty_symbol)
-                             t1->data[y][x] = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ? -mask : mask;
+                int flags_mask = -1;
+                if (vert_causal_ctx_csty_symbol && y == y0 + 3)
+                    flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
+                if ((t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG_NB & flags_mask)
+                && !(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
+                    if (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask, bandno))) {
+                        int xorbit, ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask, &xorbit);
+                        if (t1->mqc.raw)
+                             t1->data[(y) * t1->stride + x] = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ? -mask : mask;
                         else
-                             t1->data[y][x] = (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ^ xorbit) ?
+                             t1->data[(y) * t1->stride + x] = (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ^ xorbit) ?
                                                -mask : mask;
 
                         ff_jpeg2000_set_significance(t1, x, y,
-                                                     t1->data[y][x] < 0);
+                                                     t1->data[(y) * t1->stride + x] < 0);
                     }
-                    t1->flags[y + 1][x + 1] |= JPEG2000_T1_VIS;
+                    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_VIS;
                 }
             }
 }
 
 static void decode_refpass(Jpeg2000T1Context *t1, int width, int height,
-                           int bpno)
+                           int bpno, int vert_causal_ctx_csty_symbol)
 {
     int phalf, nhalf;
     int y0, x, y;
@@ -900,13 +1444,15 @@ static void decode_refpass(Jpeg2000T1Context *t1, int width, int height,
     for (y0 = 0; y0 < height; y0 += 4)
         for (x = 0; x < width; x++)
             for (y = y0; y < height && y < y0 + 4; y++)
-                if ((t1->flags[y + 1][x + 1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG) {
-                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[y + 1][x + 1]);
+                if ((t1->flags[(y + 1) * t1->stride + x + 1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG) {
+                    int flags_mask = (vert_causal_ctx_csty_symbol && y == y0 + 3) ?
+                        ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S) : -1;
+                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[(y + 1) * t1->stride + x + 1] & flags_mask);
                     int r     = ff_mqc_decode(&t1->mqc,
                                               t1->mqc.cx_states + ctxno)
                                 ? phalf : nhalf;
-                    t1->data[y][x]          += t1->data[y][x] < 0 ? -r : r;
-                    t1->flags[y + 1][x + 1] |= JPEG2000_T1_REF;
+                    t1->data[(y) * t1->stride + x]          += t1->data[(y) * t1->stride + x] < 0 ? -r : r;
+                    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_REF;
                 }
 }
 
@@ -918,11 +1464,14 @@ static void decode_clnpass(Jpeg2000DecoderContext *s, Jpeg2000T1Context *t1,
 
     for (y0 = 0; y0 < height; y0 += 4) {
         for (x = 0; x < width; x++) {
+            int flags_mask = -1;
+            if (vert_causal_ctx_csty_symbol)
+                flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
             if (y0 + 3 < height &&
-                !((t1->flags[y0 + 1][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 2][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 3][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 4][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)))) {
+                !((t1->flags[(y0 + 1) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 2) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 3) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 4) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG) & flags_mask))) {
                 if (!ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + MQC_CX_RL))
                     continue;
                 runlen = ff_mqc_decode(&t1->mqc,
@@ -937,27 +1486,27 @@ static void decode_clnpass(Jpeg2000DecoderContext *s, Jpeg2000T1Context *t1,
             }
 
             for (y = y0 + runlen; y < y0 + 4 && y < height; y++) {
+                int flags_mask = -1;
+                if (vert_causal_ctx_csty_symbol && y == y0 + 3)
+                    flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
                 if (!dec) {
-                    if (!(t1->flags[y+1][x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
-                        int flags_mask = -1;
-                        if (vert_causal_ctx_csty_symbol && y == y0 + 3)
-                            flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE);
-                        dec = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[y+1][x+1] & flags_mask,
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
+                        dec = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask,
                                                                                              bandno));
                     }
                 }
                 if (dec) {
                     int xorbit;
-                    int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[y + 1][x + 1],
+                    int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y + 1) * t1->stride + x + 1] & flags_mask,
                                                         &xorbit);
-                    t1->data[y][x] = (ff_mqc_decode(&t1->mqc,
+                    t1->data[(y) * t1->stride + x] = (ff_mqc_decode(&t1->mqc,
                                                     t1->mqc.cx_states + ctxno) ^
                                       xorbit)
                                      ? -mask : mask;
-                    ff_jpeg2000_set_significance(t1, x, y, t1->data[y][x] < 0);
+                    ff_jpeg2000_set_significance(t1, x, y, t1->data[(y) * t1->stride + x] < 0);
                 }
                 dec = 0;
-                t1->flags[y + 1][x + 1] &= ~JPEG2000_T1_VIS;
+                t1->flags[(y + 1) * t1->stride + x + 1] &= ~JPEG2000_T1_VIS;
             }
         }
     }
@@ -977,53 +1526,78 @@ static int decode_cblk(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *codsty,
                        Jpeg2000T1Context *t1, Jpeg2000Cblk *cblk,
                        int width, int height, int bandpos)
 {
-    int passno = cblk->npasses, pass_t = 2, bpno = cblk->nonzerobits - 1, y;
-    int clnpass_cnt = 0;
-    int bpass_csty_symbol           = codsty->cblk_style & JPEG2000_CBLK_BYPASS;
+    int passno = cblk->npasses, pass_t = 2, bpno = cblk->nonzerobits - 1;
+    int pass_cnt = 0;
     int vert_causal_ctx_csty_symbol = codsty->cblk_style & JPEG2000_CBLK_VSC;
+    int term_cnt = 0;
+    int coder_type;
 
-    for (y = 0; y < height; y++)
-        memset(t1->data[y], 0, width * sizeof(**t1->data));
+    av_assert0(width <= 1024U && height <= 1024U);
+    av_assert0(width*height <= 4096);
+
+    memset(t1->data, 0, t1->stride * height * sizeof(*t1->data));
 
     /* If code-block contains no compressed data: nothing to do. */
     if (!cblk->length)
         return 0;
-    for (y = 0; y < height + 2; y++)
-        memset(t1->flags[y], 0, (width + 2) * sizeof(**t1->flags));
 
-    ff_mqc_initdec(&t1->mqc, cblk->data);
-    cblk->data[cblk->length]     = 0xff;
-    cblk->data[cblk->length + 1] = 0xff;
+    memset(t1->flags, 0, t1->stride * (height + 2) * sizeof(*t1->flags));
+
+    cblk->data[cblk->length] = 0xff;
+    cblk->data[cblk->length+1] = 0xff;
+    ff_mqc_initdec(&t1->mqc, cblk->data, 0, 1);
 
     while (passno--) {
-        switch (pass_t) {
+        if (bpno < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "bpno became negative\n");
+            return AVERROR_INVALIDDATA;
+        }
+        switch(pass_t) {
         case 0:
             decode_sigpass(t1, width, height, bpno + 1, bandpos,
-                           bpass_csty_symbol && (clnpass_cnt >= 4),
                            vert_causal_ctx_csty_symbol);
             break;
         case 1:
-            decode_refpass(t1, width, height, bpno + 1);
-            if (bpass_csty_symbol && clnpass_cnt >= 4)
-                ff_mqc_initdec(&t1->mqc, cblk->data);
+            decode_refpass(t1, width, height, bpno + 1, vert_causal_ctx_csty_symbol);
             break;
         case 2:
+            av_assert2(!t1->mqc.raw);
             decode_clnpass(s, t1, width, height, bpno + 1, bandpos,
                            codsty->cblk_style & JPEG2000_CBLK_SEGSYM,
                            vert_causal_ctx_csty_symbol);
-            clnpass_cnt = clnpass_cnt + 1;
-            if (bpass_csty_symbol && clnpass_cnt >= 4)
-                ff_mqc_initdec(&t1->mqc, cblk->data);
             break;
         }
+        if (codsty->cblk_style & JPEG2000_CBLK_RESET) // XXX no testcase for just this
+            ff_mqc_init_contexts(&t1->mqc);
+
+        if (passno && (coder_type = needs_termination(codsty->cblk_style, pass_cnt))) {
+            if (term_cnt >= cblk->nb_terminations) {
+                av_log(s->avctx, AV_LOG_ERROR, "Missing needed termination \n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (FFABS(cblk->data + cblk->data_start[term_cnt + 1] - 2 - t1->mqc.bp) > 0) {
+                av_log(s->avctx, AV_LOG_WARNING, "Mid mismatch %"PTRDIFF_SPECIFIER" in pass %d of %d\n",
+                    cblk->data + cblk->data_start[term_cnt + 1] - 2 - t1->mqc.bp,
+                    pass_cnt, cblk->npasses);
+            }
+
+            ff_mqc_initdec(&t1->mqc, cblk->data + cblk->data_start[++term_cnt], coder_type == 2, 0);
+        }
 
         pass_t++;
         if (pass_t == 3) {
             bpno--;
             pass_t = 0;
         }
+        pass_cnt ++;
     }
-    return 0;
+
+    if (cblk->data + cblk->length - 2*(term_cnt < cblk->nb_terminations) != t1->mqc.bp) {
+        av_log(s->avctx, AV_LOG_WARNING, "End mismatch %"PTRDIFF_SPECIFIER"\n",
+               cblk->data + cblk->length - 2*(term_cnt < cblk->nb_terminations) - t1->mqc.bp);
+    }
+
+    return 1;
 }
 
 /* TODO: Verify dequantization for lossless case
@@ -1041,7 +1615,7 @@ static void dequantization_float(int x, int y, Jpeg2000Cblk *cblk,
     int w = cblk->coord[0][1] - cblk->coord[0][0];
     for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
         float *datap = &comp->f_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
-        int *src = t1->data[j];
+        int *src = t1->data + j*t1->stride;
         for (i = 0; i < w; ++i)
             datap[i] = src[i] * band->f_stepsize;
     }
@@ -1056,9 +1630,29 @@ static void dequantization_int(int x, int y, Jpeg2000Cblk *cblk,
     int w = cblk->coord[0][1] - cblk->coord[0][0];
     for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
         int32_t *datap = &comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
-        int *src = t1->data[j];
+        int *src = t1->data + j*t1->stride;
+        if (band->i_stepsize == 32768) {
+            for (i = 0; i < w; ++i)
+                datap[i] = src[i] / 2;
+        } else {
+            // This should be VERY uncommon
+            for (i = 0; i < w; ++i)
+                datap[i] = (src[i] * (int64_t)band->i_stepsize) / 65536;
+        }
+    }
+}
+
+static void dequantization_int_97(int x, int y, Jpeg2000Cblk *cblk,
+                               Jpeg2000Component *comp,
+                               Jpeg2000T1Context *t1, Jpeg2000Band *band)
+{
+    int i, j;
+    int w = cblk->coord[0][1] - cblk->coord[0][0];
+    for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
+        int32_t *datap = &comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
+        int *src = t1->data + j*t1->stride;
         for (i = 0; i < w; ++i)
-            datap[i] = (src[i] * band->i_stepsize + (1 << 15)) >> 16;
+            datap[i] = (src[i] * (int64_t)band->i_stepsize + (1<<15)) >> 16;
     }
 }
 
@@ -1067,6 +1661,17 @@ static inline void mct_decode(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
     int i, csize = 1;
     void *src[3];
 
+    for (i = 1; i < 3; i++) {
+        if (tile->codsty[0].transform != tile->codsty[i].transform) {
+            av_log(s->avctx, AV_LOG_ERROR, "Transforms mismatch, MCT not supported\n");
+            return;
+        }
+        if (memcmp(tile->comp[0].coord, tile->comp[i].coord, sizeof(tile->comp[0].coord))) {
+            av_log(s->avctx, AV_LOG_ERROR, "Coords mismatch, MCT not supported\n");
+            return;
+        }
+    }
+
     for (i = 0; i < 3; i++)
         if (tile->codsty[0].transform == FF_DWT97)
             src[i] = tile->comp[i].f_data;
@@ -1086,18 +1691,22 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
     int compno, reslevelno, bandno;
 
     /* Loop on tile components */
-
     for (compno = 0; compno < s->ncomponents; compno++) {
         Jpeg2000Component *comp     = tile->comp + compno;
         Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+        int coded = 0;
+
+        t1.stride = (1<<codsty->log2_cblk_width) + 2;
+
         /* Loop on resolution levels */
         for (reslevelno = 0; reslevelno < codsty->nreslevels2decode; reslevelno++) {
             Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
             /* Loop on bands */
             for (bandno = 0; bandno < rlevel->nbands; bandno++) {
-                uint16_t nb_precincts, precno;
+                int nb_precincts, precno;
                 Jpeg2000Band *band = rlevel->band + bandno;
                 int cblkno = 0, bandpos;
+
                 bandpos = bandno + (reslevelno > 0);
 
                 if (band->coord[0][0] == band->coord[0][1] ||
@@ -1115,16 +1724,19 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
                          cblkno++) {
                         int x, y;
                         Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-                        decode_cblk(s, codsty, &t1, cblk,
+                        int ret = decode_cblk(s, codsty, &t1, cblk,
                                     cblk->coord[0][1] - cblk->coord[0][0],
                                     cblk->coord[1][1] - cblk->coord[1][0],
                                     bandpos);
-
-                        x = cblk->coord[0][0];
-                        y = cblk->coord[1][0];
+                        if (ret)
+                            coded = 1;
+                        x = cblk->coord[0][0] - band->coord[0][0];
+                        y = cblk->coord[1][0] - band->coord[1][0];
 
                         if (codsty->transform == FF_DWT97)
                             dequantization_float(x, y, cblk, comp, &t1, band);
+                        else if (codsty->transform == FF_DWT97_INT)
+                            dequantization_int_97(x, y, cblk, comp, &t1, band);
                         else
                             dequantization_int(x, y, cblk, comp, &t1, band);
                    } /* end cblk */
@@ -1133,15 +1745,20 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
         } /* end reslevel */
 
         /* inverse DWT */
-        ff_dwt_decode(&comp->dwt, codsty->transform == FF_DWT97 ? (void*)comp->f_data : (void*)comp->i_data);
+        if (coded)
+            ff_dwt_decode(&comp->dwt, codsty->transform == FF_DWT97 ? (void*)comp->f_data : (void*)comp->i_data);
+
     } /*end comp */
 }
 
 #define WRITE_FRAME(D, PIXEL)                                                                     \
     static inline void write_frame_ ## D(Jpeg2000DecoderContext * s, Jpeg2000Tile * tile,         \
-                                         AVFrame * picture)                                       \
+                                         AVFrame * picture, int precision)                        \
     {                                                                                             \
-        int linesize = picture->linesize[0] / sizeof(PIXEL);                                      \
+        const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->avctx->pix_fmt);               \
+        int planar    = !!(pixdesc->flags & AV_PIX_FMT_FLAG_PLANAR);                              \
+        int pixelsize = planar ? 1 : pixdesc->nb_components;                                      \
+                                                                                                  \
         int compno;                                                                               \
         int x, y;                                                                                 \
                                                                                                   \
@@ -1153,35 +1770,39 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
             int32_t *i_datap = comp->i_data;                                                      \
             int cbps         = s->cbps[compno];                                                   \
             int w            = tile->comp[compno].coord[0][1] - s->image_offset_x;                \
+            int plane        = 0;                                                                 \
+                                                                                                  \
+            if (planar)                                                                           \
+                plane = s->cdef[compno] ? s->cdef[compno]-1 : (s->ncomponents-1);                 \
                                                                                                   \
-            y    = tile->comp[compno].coord[1][0] - s->image_offset_y;                            \
-            line = (PIXEL *)picture->data[0] + y * linesize;                                      \
-            for (; y < tile->comp[compno].coord[1][1] - s->image_offset_y; y += s->cdy[compno]) { \
+            y    = tile->comp[compno].coord[1][0] - s->image_offset_y / s->cdy[compno];           \
+            line = (PIXEL *)picture->data[plane] + y * (picture->linesize[plane] / sizeof(PIXEL));\
+            for (; y < tile->comp[compno].coord[1][1] - s->image_offset_y; y++) {                 \
                 PIXEL *dst;                                                                       \
                                                                                                   \
-                x   = tile->comp[compno].coord[0][0] - s->image_offset_x;                         \
-                dst = line + x * s->ncomponents + compno;                                         \
+                x   = tile->comp[compno].coord[0][0] - s->image_offset_x / s->cdx[compno];        \
+                dst = line + x * pixelsize + compno*!planar;                                      \
                                                                                                   \
                 if (codsty->transform == FF_DWT97) {                                              \
-                    for (; x < w; x += s->cdx[compno]) {                                          \
+                    for (; x < w; x++) {                                                          \
                         int val = lrintf(*datap) + (1 << (cbps - 1));                             \
                         /* DC level shift and clip see ISO 15444-1:2002 G.1.2 */                  \
                         val  = av_clip(val, 0, (1 << cbps) - 1);                                  \
-                        *dst = val << (8 * sizeof(PIXEL) - cbps);                                 \
+                        *dst = val << (precision - cbps);                                         \
                         datap++;                                                                  \
-                        dst += s->ncomponents;                                                    \
+                        dst += pixelsize;                                                         \
                     }                                                                             \
                 } else {                                                                          \
-                    for (; x < w; x += s->cdx[compno]) {                                          \
+                    for (; x < w; x++) {                                                          \
                         int val = *i_datap + (1 << (cbps - 1));                                   \
                         /* DC level shift and clip see ISO 15444-1:2002 G.1.2 */                  \
                         val  = av_clip(val, 0, (1 << cbps) - 1);                                  \
-                        *dst = val << (8 * sizeof(PIXEL) - cbps);                                 \
+                        *dst = val << (precision - cbps);                                         \
                         i_datap++;                                                                \
-                        dst += s->ncomponents;                                                    \
+                        dst += pixelsize;                                                         \
                     }                                                                             \
                 }                                                                                 \
-                line += linesize;                                                                 \
+                line += picture->linesize[plane] / sizeof(PIXEL);                                 \
             }                                                                                     \
         }                                                                                         \
                                                                                                   \
@@ -1192,19 +1813,40 @@ WRITE_FRAME(16, uint16_t)
 
 #undef WRITE_FRAME
 
-static int jpeg2000_decode_tile(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
-                                AVFrame *picture)
+static int jpeg2000_decode_tile(AVCodecContext *avctx, void *td,
+                                int jobnr, int threadnr)
 {
+    Jpeg2000DecoderContext *s = avctx->priv_data;
+    AVFrame *picture = td;
+    Jpeg2000Tile *tile = s->tile + jobnr;
+    int x;
+
     tile_codeblocks(s, tile);
 
     /* inverse MCT transformation */
     if (tile->codsty[0].mct)
         mct_decode(s, tile);
 
+    for (x = 0; x < s->ncomponents; x++) {
+        if (s->cdef[x] < 0) {
+            for (x = 0; x < s->ncomponents; x++) {
+                s->cdef[x] = x + 1;
+            }
+            if ((s->ncomponents & 1) == 0)
+                s->cdef[s->ncomponents-1] = 0;
+            break;
+        }
+    }
+
     if (s->precision <= 8) {
-        write_frame_8(s, tile, picture);
+        write_frame_8(s, tile, picture, 8);
     } else {
-        write_frame_16(s, tile, picture);
+        int precision = picture->format == AV_PIX_FMT_XYZ12 ||
+                        picture->format == AV_PIX_FMT_RGB48 ||
+                        picture->format == AV_PIX_FMT_RGBA64 ||
+                        picture->format == AV_PIX_FMT_GRAY16 ? 16 : s->precision;
+
+        write_frame_16(s, tile, picture, precision);
     }
 
     return 0;
@@ -1214,22 +1856,30 @@ static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
 {
     int tileno, compno;
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
-        for (compno = 0; compno < s->ncomponents; compno++) {
-            Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
-            Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
+        if (s->tile[tileno].comp) {
+            for (compno = 0; compno < s->ncomponents; compno++) {
+                Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
+                Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
 
-            ff_jpeg2000_cleanup(comp, codsty);
+                ff_jpeg2000_cleanup(comp, codsty);
+            }
+            av_freep(&s->tile[tileno].comp);
         }
-        av_freep(&s->tile[tileno].comp);
     }
     av_freep(&s->tile);
+    memset(s->codsty, 0, sizeof(s->codsty));
+    memset(s->qntsty, 0, sizeof(s->qntsty));
+    memset(s->properties, 0, sizeof(s->properties));
+    memset(&s->poc  , 0, sizeof(s->poc));
     s->numXtiles = s->numYtiles = 0;
+    s->ncomponents = 0;
 }
 
 static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
 {
     Jpeg2000CodingStyle *codsty = s->codsty;
     Jpeg2000QuantStyle *qntsty  = s->qntsty;
+    Jpeg2000POC         *poc    = &s->poc;
     uint8_t *properties         = s->properties;
 
     for (;;) {
@@ -1249,17 +1899,21 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
             Jpeg2000Tile *tile;
             Jpeg2000TilePart *tp;
 
-            if (s->curtileno < 0) {
-                av_log(s->avctx, AV_LOG_ERROR, "Missing SOT\n");
-                return AVERROR_INVALIDDATA;
-            }
             if (!s->tile) {
                 av_log(s->avctx, AV_LOG_ERROR, "Missing SIZ\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (s->curtileno < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "Missing SOT\n");
+                return AVERROR_INVALIDDATA;
+            }
 
             tile = s->tile + s->curtileno;
             tp = tile->tile_part + tile->tp_idx;
+            if (tp->tp_end < s->g.buffer) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tpend\n");
+                return AVERROR_INVALIDDATA;
+            }
             bytestream2_init(&tp->tpg, s->g.buffer, tp->tp_end - s->g.buffer);
             bytestream2_skip(&s->g, tp->tp_end - s->g.buffer);
 
@@ -1268,13 +1922,21 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
         if (marker == JPEG2000_EOC)
             break;
 
-        len = bytestream2_get_be16u(&s->g);
-        if (len < 2 || bytestream2_get_bytes_left(&s->g) < len - 2)
+        len = bytestream2_get_be16(&s->g);
+        if (len < 2 || bytestream2_get_bytes_left(&s->g) < len - 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid len %d left=%d\n", len, bytestream2_get_bytes_left(&s->g));
             return AVERROR_INVALIDDATA;
+        }
 
         switch (marker) {
         case JPEG2000_SIZ:
+            if (s->ncomponents) {
+                av_log(s->avctx, AV_LOG_ERROR, "Duplicate SIZ\n");
+                return AVERROR_INVALIDDATA;
+            }
             ret = get_siz(s);
+            if (!s->tile)
+                s->numXtiles = s->numYtiles = 0;
             break;
         case JPEG2000_COC:
             ret = get_coc(s, codsty, properties);
@@ -1288,15 +1950,18 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
         case JPEG2000_QCD:
             ret = get_qcd(s, len, qntsty, properties);
             break;
+        case JPEG2000_POC:
+            ret = get_poc(s, len, poc);
+            break;
         case JPEG2000_SOT:
             if (!(ret = get_sot(s, len))) {
+                av_assert1(s->curtileno >= 0);
                 codsty = s->tile[s->curtileno].codsty;
                 qntsty = s->tile[s->curtileno].qntsty;
+                poc    = &s->tile[s->curtileno].poc;
                 properties = s->tile[s->curtileno].properties;
             }
             break;
-        case JPEG2000_PLT:
-            // the PLT marker is ignored
         case JPEG2000_PLM:
             // the PLM marker is ignored
         case JPEG2000_COM:
@@ -1307,6 +1972,10 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
             // Tile-part lengths
             ret = get_tlm(s, len);
             break;
+        case JPEG2000_PLT:
+            // Packet length, tile-part header
+            ret = get_plt(s, len);
+            break;
         default:
             av_log(s->avctx, AV_LOG_ERROR,
                    "unsupported marker 0x%.4"PRIX16" at pos 0x%X\n",
@@ -1333,11 +2002,11 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
         Jpeg2000Tile *tile = s->tile + tileno;
 
-        if (ret = init_tile(s, tileno))
+        if ((ret = init_tile(s, tileno)) < 0)
             return ret;
 
         s->g = tile->tile_part[0].tpg;
-        if (ret = jpeg2000_decode_packets(s, tile))
+        if ((ret = jpeg2000_decode_packets(s, tile)) < 0)
             return ret;
     }
 
@@ -1346,33 +2015,159 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
 
 static int jp2_find_codestream(Jpeg2000DecoderContext *s)
 {
-    uint32_t atom_size, atom;
-    int found_codestream = 0, search_range = 10;
+    uint32_t atom_size, atom, atom_end;
+    int search_range = 10;
 
-    while(!found_codestream && search_range
-          &&
-          bytestream2_get_bytes_left(&s->g) >= 8) {
+    while (search_range
+           &&
+           bytestream2_get_bytes_left(&s->g) >= 8) {
         atom_size = bytestream2_get_be32u(&s->g);
         atom      = bytestream2_get_be32u(&s->g);
-        if (atom == JP2_CODESTREAM) {
-            found_codestream = 1;
-        } else {
-            if (bytestream2_get_bytes_left(&s->g) < atom_size - 8)
+        if (atom_size == 1) {
+            if (bytestream2_get_be32u(&s->g)) {
+                avpriv_request_sample(s->avctx, "Huge atom");
                 return 0;
-            bytestream2_skipu(&s->g, atom_size - 8);
+            }
+            atom_size = bytestream2_get_be32u(&s->g);
+            atom_end  = bytestream2_tell(&s->g) + atom_size - 16;
+        } else {
+            atom_end  = bytestream2_tell(&s->g) + atom_size -  8;
+        }
+
+        if (atom == JP2_CODESTREAM)
+            return 1;
+
+        if (bytestream2_get_bytes_left(&s->g) < atom_size || atom_end < atom_size)
+            return 0;
+
+        if (atom == JP2_HEADER &&
+                   atom_size >= 16) {
+            uint32_t atom2_size, atom2, atom2_end;
+            do {
+                atom2_size = bytestream2_get_be32u(&s->g);
+                atom2      = bytestream2_get_be32u(&s->g);
+                atom2_end  = bytestream2_tell(&s->g) + atom2_size - 8;
+                if (atom2_size < 8 || atom2_end > atom_end || atom2_end < atom2_size)
+                    break;
+                atom2_size -= 8;
+                if (atom2 == JP2_CODESTREAM) {
+                    return 1;
+                } else if (atom2 == MKBETAG('c','o','l','r') && atom2_size >= 7) {
+                    int method = bytestream2_get_byteu(&s->g);
+                    bytestream2_skipu(&s->g, 2);
+                    if (method == 1) {
+                        s->colour_space = bytestream2_get_be32u(&s->g);
+                    }
+                } else if (atom2 == MKBETAG('p','c','l','r') && atom2_size >= 6) {
+                    int i, size, colour_count, colour_channels, colour_depth[3];
+                    colour_count = bytestream2_get_be16u(&s->g);
+                    colour_channels = bytestream2_get_byteu(&s->g);
+                    // FIXME: Do not ignore channel_sign
+                    colour_depth[0] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    colour_depth[1] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    colour_depth[2] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    size = (colour_depth[0] + 7 >> 3) * colour_count +
+                           (colour_depth[1] + 7 >> 3) * colour_count +
+                           (colour_depth[2] + 7 >> 3) * colour_count;
+                    if (colour_count > AVPALETTE_COUNT ||
+                        colour_channels != 3 ||
+                        colour_depth[0] > 16 ||
+                        colour_depth[1] > 16 ||
+                        colour_depth[2] > 16 ||
+                        atom2_size < size) {
+                        avpriv_request_sample(s->avctx, "Unknown palette");
+                        bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+                        continue;
+                    }
+                    s->pal8 = 1;
+                    for (i = 0; i < colour_count; i++) {
+                        uint32_t r, g, b;
+                        if (colour_depth[0] <= 8) {
+                            r = bytestream2_get_byteu(&s->g) << 8 - colour_depth[0];
+                            r |= r >> colour_depth[0];
+                        } else {
+                            r = bytestream2_get_be16u(&s->g) >> colour_depth[0] - 8;
+                        }
+                        if (colour_depth[1] <= 8) {
+                            g = bytestream2_get_byteu(&s->g) << 8 - colour_depth[1];
+                            g |= g >> colour_depth[1];
+                        } else {
+                            g = bytestream2_get_be16u(&s->g) >> colour_depth[1] - 8;
+                        }
+                        if (colour_depth[2] <= 8) {
+                            b = bytestream2_get_byteu(&s->g) << 8 - colour_depth[2];
+                            b |= b >> colour_depth[2];
+                        } else {
+                            b = bytestream2_get_be16u(&s->g) >> colour_depth[2] - 8;
+                        }
+                        s->palette[i] = 0xffu << 24 | r << 16 | g << 8 | b;
+                    }
+                } else if (atom2 == MKBETAG('c','d','e','f') && atom2_size >= 2) {
+                    int n = bytestream2_get_be16u(&s->g);
+                    for (; n>0; n--) {
+                        int cn   = bytestream2_get_be16(&s->g);
+                        int av_unused typ  = bytestream2_get_be16(&s->g);
+                        int asoc = bytestream2_get_be16(&s->g);
+                        if (cn < 4 && asoc < 4)
+                            s->cdef[cn] = asoc;
+                    }
+                } else if (atom2 == MKBETAG('r','e','s',' ') && atom2_size >= 18) {
+                    int64_t vnum, vden, hnum, hden, vexp, hexp;
+                    uint32_t resx;
+                    bytestream2_skip(&s->g, 4);
+                    resx = bytestream2_get_be32u(&s->g);
+                    if (resx != MKBETAG('r','e','s','c') && resx != MKBETAG('r','e','s','d')) {
+                        bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+                        continue;
+                    }
+                    vnum = bytestream2_get_be16u(&s->g);
+                    vden = bytestream2_get_be16u(&s->g);
+                    hnum = bytestream2_get_be16u(&s->g);
+                    hden = bytestream2_get_be16u(&s->g);
+                    vexp = bytestream2_get_byteu(&s->g);
+                    hexp = bytestream2_get_byteu(&s->g);
+                    if (!vnum || !vden || !hnum || !hden) {
+                        bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+                        av_log(s->avctx, AV_LOG_WARNING, "RES box invalid\n");
+                        continue;
+                    }
+                    if (vexp > hexp) {
+                        vexp -= hexp;
+                        hexp = 0;
+                    } else {
+                        hexp -= vexp;
+                        vexp = 0;
+                    }
+                    if (   INT64_MAX / (hnum * vden) > pow(10, hexp)
+                        && INT64_MAX / (vnum * hden) > pow(10, vexp))
+                        av_reduce(&s->sar.den, &s->sar.num,
+                                  hnum * vden * pow(10, hexp),
+                                  vnum * hden * pow(10, vexp),
+                                  INT32_MAX);
+                }
+                bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+            } while (atom_end - atom2_end >= 8);
+        } else {
             search_range--;
         }
+        bytestream2_seek(&s->g, atom_end, SEEK_SET);
     }
 
-    if (found_codestream)
-        return 1;
     return 0;
 }
 
+static av_cold void jpeg2000_init_static_data(void)
+{
+    ff_jpeg2000_init_tier1_luts();
+    ff_mqc_init_context_tables();
+}
+
 static av_cold int jpeg2000_decode_init(AVCodecContext *avctx)
 {
+    static AVOnce init_static_once = AV_ONCE_INIT;
     Jpeg2000DecoderContext *s = avctx->priv_data;
 
+    ff_thread_once(&init_static_once, jpeg2000_init_static_data);
     ff_jpeg2000dsp_init(&s->dsp);
 
     return 0;
@@ -1384,11 +2179,12 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
     Jpeg2000DecoderContext *s = avctx->priv_data;
     ThreadFrame frame = { .f = data };
     AVFrame *picture = data;
-    int tileno, ret;
+    int ret;
 
     s->avctx     = avctx;
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
-    s->curtileno = 0; // TODO: only one tile in DCI JP2K. to implement for more tiles
+    s->curtileno = -1;
+    memset(s->cdef, -1, sizeof(s->cdef));
 
     if (bytestream2_get_bytes_left(&s->g) < 2) {
         ret = AVERROR_INVALIDDATA;
@@ -1410,6 +2206,9 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
         bytestream2_seek(&s->g, 0, SEEK_SET);
     }
 
+    while (bytestream2_get_bytes_left(&s->g) >= 3 && bytestream2_peek_be16(&s->g) != JPEG2000_SOC)
+        bytestream2_skip(&s->g, 1);
+
     if (bytestream2_get_be16u(&s->g) != JPEG2000_SOC) {
         av_log(avctx, AV_LOG_ERROR, "SOC marker not present\n");
         ret = AVERROR_INVALIDDATA;
@@ -1419,23 +2218,26 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
         goto end;
 
     /* get picture buffer */
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed.\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         goto end;
-    }
     picture->pict_type = AV_PICTURE_TYPE_I;
     picture->key_frame = 1;
 
     if (ret = jpeg2000_read_bitstream_packets(s))
         goto end;
-    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++)
-        if (ret = jpeg2000_decode_tile(s, s->tile + tileno, picture))
-            goto end;
+
+    avctx->execute2(avctx, jpeg2000_decode_tile, picture, NULL, s->numXtiles * s->numYtiles);
 
     jpeg2000_dec_cleanup(s);
 
     *got_frame = 1;
 
+    if (s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(picture->data[1], s->palette, 256 * sizeof(uint32_t));
+    if (s->sar.num && s->sar.den)
+        avctx->sample_aspect_ratio = s->sar;
+    s->sar.num = s->sar.den = 0;
+
     return bytestream2_tell(&s->g);
 
 end:
@@ -1443,12 +2245,6 @@ end:
     return ret;
 }
 
-static av_cold void jpeg2000_init_static_data(AVCodec *codec)
-{
-    ff_jpeg2000_init_tier1_luts();
-    ff_mqc_init_context_tables();
-}
-
 #define OFFSET(x) offsetof(Jpeg2000DecoderContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
@@ -1458,7 +2254,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass jpeg2000_class = {
     .class_name = "jpeg2000",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -1470,11 +2266,11 @@ AVCodec ff_jpeg2000_decoder = {
     .long_name        = NULL_IF_CONFIG_SMALL("JPEG 2000"),
     .type             = AVMEDIA_TYPE_VIDEO,
     .id               = AV_CODEC_ID_JPEG2000,
-    .capabilities     = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_DR1,
+    .capabilities     = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_DR1,
     .priv_data_size   = sizeof(Jpeg2000DecoderContext),
-    .init_static_data = jpeg2000_init_static_data,
     .init             = jpeg2000_decode_init,
     .decode           = jpeg2000_decode_frame,
-    .priv_class       = &class,
+    .priv_class       = &jpeg2000_class,
+    .max_lowres       = 5,
     .profiles         = NULL_IF_CONFIG_SMALL(ff_jpeg2000_profiles)
 };
diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c
index 6e04c3a..90e73b1 100644
--- a/libavcodec/jpeg2000dsp.c
+++ b/libavcodec/jpeg2000dsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,10 +64,10 @@ static void ict_int(void *_src0, void *_src1, void *_src2, int csize)
     int i;
 
     for (i = 0; i < csize; i++) {
-        i0 = *src0 + (((i_ict_params[0] * *src2) + (1 << 15)) >> 16);
-        i1 = *src0 - (((i_ict_params[1] * *src1) + (1 << 15)) >> 16)
-                   - (((i_ict_params[2] * *src2) + (1 << 15)) >> 16);
-        i2 = *src0 + (((i_ict_params[3] * *src1) + (1 << 15)) >> 16);
+        i0 = *src0 + *src2 + ((int)((26345U * *src2) + (1 << 15)) >> 16);
+        i1 = *src0 - ((int)(((unsigned)i_ict_params[1] * *src1) + (1 << 15)) >> 16)
+                   - ((int)(((unsigned)i_ict_params[2] * *src2) + (1 << 15)) >> 16);
+        i2 = *src0 + (2 * *src1) + ((int)((-14942U * *src1) + (1 << 15)) >> 16);
         *src0++ = i0;
         *src1++ = i1;
         *src2++ = i2;
@@ -95,4 +95,7 @@ av_cold void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c)
     c->mct_decode[FF_DWT97]     = ict_float;
     c->mct_decode[FF_DWT53]     = rct_int;
     c->mct_decode[FF_DWT97_INT] = ict_int;
+
+    if (ARCH_X86)
+        ff_jpeg2000dsp_init_x86(c);
 }
diff --git a/libavcodec/jpeg2000dsp.h b/libavcodec/jpeg2000dsp.h
index 45a32c0..1ae5b95 100644
--- a/libavcodec/jpeg2000dsp.h
+++ b/libavcodec/jpeg2000dsp.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,5 +31,6 @@ typedef struct Jpeg2000DSPContext {
 } Jpeg2000DSPContext;
 
 void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c);
+void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c);
 
 #endif /* AVCODEC_JPEG2000DSP_H */
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 6642a53..badf0f8 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * Discrete wavelet transform
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/mem.h"
 #include "jpeg2000dwt.h"
@@ -36,21 +37,16 @@
 #define F_LFTG_BETA   0.052980118572961f
 #define F_LFTG_GAMMA  0.882911075530934f
 #define F_LFTG_DELTA  0.443506852043971f
-#define F_LFTG_K      1.230174104914001f
-#define F_LFTG_X      1.625732422f
-/* FIXME: Why use 1.625732422 instead of 1/F_LFTG_K?
- * Incorrect value in JPEG2000 norm.
- * see (ISO/IEC 15444:1 (version 2002) F.3.8.2 */
 
 /* Lifting parameters in integer format.
  * Computed as param = (float param) * (1 << 16) */
-#define I_LFTG_ALPHA  103949
-#define I_LFTG_BETA     3472
-#define I_LFTG_GAMMA   57862
-#define I_LFTG_DELTA   29066
-#define I_LFTG_K       80621
-#define I_LFTG_X      106544
-
+#define I_LFTG_ALPHA  103949ll
+#define I_LFTG_BETA     3472ll
+#define I_LFTG_GAMMA   57862ll
+#define I_LFTG_DELTA   29066ll
+#define I_LFTG_K       80621ll
+#define I_LFTG_X       53274ll
+#define I_PRESHIFT 8
 
 static inline void extend53(int *p, int i0, int i1)
 {
@@ -80,19 +76,251 @@ static inline void extend97_int(int32_t *p, int i0, int i1)
     }
 }
 
-static void sr_1d53(int *p, int i0, int i1)
+static void sd_1d53(int *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] <<= 1;
         return;
+    }
 
     extend53(p, i0, i1);
 
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
-        p[2 * i] -= (p[2 * i - 1] + p[2 * i + 1] + 2) >> 2;
-    for (i = i0 / 2; i < i1 / 2; i++)
-        p[2 * i + 1] += (p[2 * i] + p[2 * i + 2]) >> 1;
+    for (i = ((i0+1)>>1) - 1; i < (i1+1)>>1; i++)
+        p[2*i+1] -= (p[2*i] + p[2*i+2]) >> 1;
+    for (i = ((i0+1)>>1); i < (i1+1)>>1; i++)
+        p[2*i] += (p[2*i-1] + p[2*i+1] + 2) >> 2;
+}
+
+static void dwt_encode53(DWTContext *s, int *t)
+{
+    int lev,
+        w = s->linelen[s->ndeclevels-1][0];
+    int *line = s->i_linebuf;
+    line += 3;
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        int *l;
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++)
+                l[i] = t[w*i + lp];
+
+            sd_1d53(line, mv, mv + lv);
+
+            // copy back and deinterleave
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+
+        // HOR_SD
+        l = line + mh;
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++)
+                l[i] = t[w*lp + i];
+
+            sd_1d53(line, mh, mh + lh);
+
+            // copy back and deinterleave
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+    }
+}
+static void sd_1d97_float(float *p, int i0, int i1)
+{
+    int i;
+
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] *= F_LFTG_X * 2;
+        else
+            p[0] *= F_LFTG_K;
+        return;
+    }
+
+    extend97_float(p, i0, i1);
+    i0++; i1++;
+
+    for (i = (i0>>1) - 2; i < (i1>>1) + 1; i++)
+        p[2*i+1] -= 1.586134 * (p[2*i] + p[2*i+2]);
+    for (i = (i0>>1) - 1; i < (i1>>1) + 1; i++)
+        p[2*i] -= 0.052980 * (p[2*i-1] + p[2*i+1]);
+    for (i = (i0>>1) - 1; i < (i1>>1); i++)
+        p[2*i+1] += 0.882911 * (p[2*i] + p[2*i+2]);
+    for (i = (i0>>1); i < (i1>>1); i++)
+        p[2*i] += 0.443506 * (p[2*i-1] + p[2*i+1]);
+}
+
+static void dwt_encode97_float(DWTContext *s, float *t)
+{
+    int lev,
+        w = s->linelen[s->ndeclevels-1][0];
+    float *line = s->f_linebuf;
+    line += 5;
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        float *l;
+
+        // HOR_SD
+        l = line + mh;
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++)
+                l[i] = t[w*lp + i];
+
+            sd_1d97_float(line, mh, mh + lh);
+
+            // copy back and deinterleave
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++)
+                l[i] = t[w*i + lp];
+
+            sd_1d97_float(line, mv, mv + lv);
+
+            // copy back and deinterleave
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+    }
+}
+
+static void sd_1d97_int(int *p, int i0, int i1)
+{
+    int i;
+
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] = (p[1] * I_LFTG_X + (1<<14)) >> 15;
+        else
+            p[0] = (p[0] * I_LFTG_K + (1<<15)) >> 16;
+        return;
+    }
+
+    extend97_int(p, i0, i1);
+    i0++; i1++;
+
+    for (i = (i0>>1) - 2; i < (i1>>1) + 1; i++)
+        p[2 * i + 1] -= (I_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0>>1) - 1; i < (i1>>1) + 1; i++)
+        p[2 * i]     -= (I_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+    for (i = (i0>>1) - 1; i < (i1>>1); i++)
+        p[2 * i + 1] += (I_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0>>1); i < (i1>>1); i++)
+        p[2 * i]     += (I_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+}
+
+static void dwt_encode97_int(DWTContext *s, int *t)
+{
+    int lev;
+    int w = s->linelen[s->ndeclevels-1][0];
+    int h = s->linelen[s->ndeclevels-1][1];
+    int i;
+    int *line = s->i_linebuf;
+    line += 5;
+
+    for (i = 0; i < w * h; i++)
+        t[i] <<= I_PRESHIFT;
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        int *l;
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++)
+                l[i] = t[w*i + lp];
+
+            sd_1d97_int(line, mv, mv + lv);
+
+            // copy back and deinterleave
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = ((l[i] * I_LFTG_X) + (1 << 15)) >> 16;
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+
+        // HOR_SD
+        l = line + mh;
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++)
+                l[i] = t[w*lp + i];
+
+            sd_1d97_int(line, mh, mh + lh);
+
+            // copy back and deinterleave
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = ((l[i] * I_LFTG_X) + (1 << 15)) >> 16;
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+
+    }
+
+    for (i = 0; i < w * h; i++)
+        t[i] = (t[i] + ((1<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
+}
+
+static void sr_1d53(unsigned *p, int i0, int i1)
+{
+    int i;
+
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] = (int)p[1] >> 1;
+        return;
+    }
+
+    extend53(p, i0, i1);
+
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
+        p[2 * i] -= (int)(p[2 * i - 1] + p[2 * i + 1] + 2) >> 2;
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
+        p[2 * i + 1] += (int)(p[2 * i] + p[2 * i + 2]) >> 1;
 }
 
 static void dwt_decode53(DWTContext *s, int *t)
@@ -148,21 +376,26 @@ static void sr_1d97_float(float *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] *= F_LFTG_K/2;
+        else
+            p[0] *= F_LFTG_X;
         return;
+    }
 
     extend97_float(p, i0, i1);
 
-    for (i = i0 / 2 - 1; i < i1 / 2 + 2; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 2; i++)
         p[2 * i]     -= F_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]);
     /* step 4 */
-    for (i = i0 / 2 - 1; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 1; i++)
         p[2 * i + 1] -= F_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]);
     /*step 5*/
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i]     += F_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]);
     /* step 6 */
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += F_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]);
 }
 
@@ -188,9 +421,9 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             int i, j = 0;
             // copy with interleaving
             for (i = mh; i < lh; i += 2, j++)
-                l[i] = data[w * lp + j] * F_LFTG_K;
+                l[i] = data[w * lp + j];
             for (i = 1 - mh; i < lh; i += 2, j++)
-                l[i] = data[w * lp + j] * F_LFTG_X;
+                l[i] = data[w * lp + j];
 
             sr_1d97_float(line, mh, mh + lh);
 
@@ -204,9 +437,9 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             int i, j = 0;
             // copy with interleaving
             for (i = mv; i < lv; i += 2, j++)
-                l[i] = data[w * j + lp] * F_LFTG_K;
+                l[i] = data[w * j + lp];
             for (i = 1 - mv; i < lv; i += 2, j++)
-                l[i] = data[w * j + lp] * F_LFTG_X;
+                l[i] = data[w * j + lp];
 
             sr_1d97_float(line, mv, mv + lv);
 
@@ -220,33 +453,43 @@ static void sr_1d97_int(int32_t *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] = (p[1] * I_LFTG_K + (1<<16)) >> 17;
+        else
+            p[0] = (p[0] * I_LFTG_X + (1<<15)) >> 16;
         return;
+    }
 
     extend97_int(p, i0, i1);
 
-    for (i = i0 / 2 - 1; i < i1 / 2 + 2; i++)
-        p[2 * i]     -= (I_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 2; i++)
+        p[2 * i]     -= (I_LFTG_DELTA * (p[2 * i - 1] + (int64_t)p[2 * i + 1]) + (1 << 15)) >> 16;
     /* step 4 */
-    for (i = i0 / 2 - 1; i < i1 / 2 + 1; i++)
-        p[2 * i + 1] -= (I_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 1; i++)
+        p[2 * i + 1] -= (I_LFTG_GAMMA * (p[2 * i]     + (int64_t)p[2 * i + 2]) + (1 << 15)) >> 16;
     /*step 5*/
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
-        p[2 * i]     += (I_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
+        p[2 * i]     += (I_LFTG_BETA  * (p[2 * i - 1] + (int64_t)p[2 * i + 1]) + (1 << 15)) >> 16;
     /* step 6 */
-    for (i = i0 / 2; i < i1 / 2; i++)
-        p[2 * i + 1] += (I_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
+        p[2 * i + 1] += (I_LFTG_ALPHA * (p[2 * i]     + (int64_t)p[2 * i + 2]) + (1 << 15)) >> 16;
 }
 
 static void dwt_decode97_int(DWTContext *s, int32_t *t)
 {
     int lev;
     int w       = s->linelen[s->ndeclevels - 1][0];
+    int h       = s->linelen[s->ndeclevels - 1][1];
+    int i;
     int32_t *line = s->i_linebuf;
     int32_t *data = t;
     /* position at index O of line range [0-5,w+5] cf. extend function */
     line += 5;
 
+    for (i = 0; i < w * h; i++)
+        data[i] *= 1LL << I_PRESHIFT;
+
     for (lev = 0; lev < s->ndeclevels; lev++) {
         int lh = s->linelen[lev][0],
             lv = s->linelen[lev][1],
@@ -262,7 +505,7 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = mh; i < lh; i += 2, j++)
                 l[i] = ((data[w * lp + j] * I_LFTG_K) + (1 << 15)) >> 16;
             for (i = 1 - mh; i < lh; i += 2, j++)
-                l[i] = ((data[w * lp + j] * I_LFTG_X) + (1 << 15)) >> 16;
+                l[i] = data[w * lp + j];
 
             sr_1d97_int(line, mh, mh + lh);
 
@@ -278,7 +521,7 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = mv; i < lv; i += 2, j++)
                 l[i] = ((data[w * j + lp] * I_LFTG_K) + (1 << 15)) >> 16;
             for (i = 1 - mv; i < lv; i += 2, j++)
-                l[i] = ((data[w * j + lp] * I_LFTG_X) + (1 << 15)) >> 16;
+                l[i] = data[w * j + lp];
 
             sr_1d97_int(line, mv, mv + lv);
 
@@ -286,9 +529,12 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
                 data[w * i + lp] = l[i];
         }
     }
+
+    for (i = 0; i < w * h; i++)
+        data[i] = (data[i] + ((1LL<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
 }
 
-int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
+int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
                          int decomp_levels, int type)
 {
     int i, j, lev = decomp_levels, maxlen,
@@ -312,17 +558,17 @@ int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
         }
     switch (type) {
     case FF_DWT97:
-        s->f_linebuf = av_malloc((maxlen + 12) * sizeof(*s->f_linebuf));
+        s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
      case FF_DWT97_INT:
-        s->i_linebuf = av_malloc((maxlen + 12) * sizeof(*s->i_linebuf));
+        s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
     case FF_DWT53:
-        s->i_linebuf = av_malloc((maxlen +  6) * sizeof(*s->i_linebuf));
+        s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
@@ -332,8 +578,29 @@ int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
     return 0;
 }
 
+int ff_dwt_encode(DWTContext *s, void *t)
+{
+    if (s->ndeclevels == 0)
+        return 0;
+
+    switch(s->type){
+        case FF_DWT97:
+            dwt_encode97_float(s, t); break;
+        case FF_DWT97_INT:
+            dwt_encode97_int(s, t); break;
+        case FF_DWT53:
+            dwt_encode53(s, t); break;
+        default:
+            return -1;
+    }
+    return 0;
+}
+
 int ff_dwt_decode(DWTContext *s, void *t)
 {
+    if (s->ndeclevels == 0)
+        return 0;
+
     switch (s->type) {
     case FF_DWT97:
         dwt_decode97_float(s, t);
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index f08340d..718d183 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -2,20 +2,20 @@
  * Discrete wavelet transform
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 #include <stdint.h>
 
 #define FF_DWT_MAX_DECLVLS 32 ///< max number of decomposition levels
+#define F_LFTG_K      1.230174104914001f
+#define F_LFTG_X      0.812893066115961f
 
 enum DWTType {
     FF_DWT97,
@@ -40,7 +42,7 @@ enum DWTType {
 
 typedef struct DWTContext {
     /// line lengths { horizontal, vertical } in consecutive decomposition levels
-    uint16_t linelen[FF_DWT_MAX_DECLVLS][2];
+    int linelen[FF_DWT_MAX_DECLVLS][2];
     uint8_t mod[FF_DWT_MAX_DECLVLS][2];  ///< coordinates (x0, y0) of decomp. levels mod 2
     uint8_t ndeclevels;                  ///< number of decomposition levels
     uint8_t type;                        ///< 0 for 9/7; 1 for 5/3
@@ -55,9 +57,10 @@ typedef struct DWTContext {
  * @param decomp_levels     number of decomposition levels
  * @param type              0 for DWT 9/7; 1 for DWT 5/3
  */
-int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
+int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
                          int decomp_levels, int type);
 
+int ff_dwt_encode(DWTContext *s, void *t);
 int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
diff --git a/libavcodec/jpegls.c b/libavcodec/jpegls.c
index 19d461f..7f9fa8d 100644
--- a/libavcodec/jpegls.c
+++ b/libavcodec/jpegls.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,10 +39,8 @@ void ff_jpegls_init_state(JLSState *state)
     for (state->qbpp = 0; (1 << state->qbpp) < state->range; state->qbpp++)
         ;
 
-    if (state->bpp < 8)
-        state->limit = 2 * state->bpp - state->qbpp + 16;
-    else
-        state->limit = 4 * state->bpp - state->qbpp;
+    state->bpp   = FFMAX(av_log2(state->maxval) + 1, 2);
+    state->limit = 2*(state->bpp + FFMAX(state->bpp, 8)) - state->qbpp;
 
     for (i = 0; i < 367; i++) {
         state->A[i] = FFMAX(state->range + 32 >> 6, 2);
diff --git a/libavcodec/jpegls.h b/libavcodec/jpegls.h
index eae3943..6b89b2a 100644
--- a/libavcodec/jpegls.h
+++ b/libavcodec/jpegls.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,9 @@
 
 #include "libavutil/common.h"
 #include "avcodec.h"
+#include "internal.h"
+
+#undef near /* This file uses struct member 'near' which in windows.h is defined as empty. */
 
 typedef struct JpeglsContext {
     AVCodecContext *avctx;
@@ -40,11 +43,9 @@ typedef struct JLSState {
     int A[367], B[367], C[365], N[367];
     int limit, reset, bpp, qbpp, maxval, range;
     int near, twonear;
-    int run_index[3];
+    int run_index[4];
 } JLSState;
 
-extern const uint8_t ff_log2_run[32];
-
 /**
  * Calculate initial JPEG-LS parameters
  */
@@ -98,6 +99,8 @@ static inline void ff_jpegls_downscale_state(JLSState *state, int Q)
 static inline int ff_jpegls_update_state_regular(JLSState *state,
                                                  int Q, int err)
 {
+    if(FFABS(err) > 0xFFFF)
+        return -0x10000;
     state->A[Q] += FFABS(err);
     err         *= state->twonear;
     state->B[Q] += err;
diff --git a/libavcodec/jpeglsdec.c b/libavcodec/jpeglsdec.c
index 8d1e763..5308b74 100644
--- a/libavcodec/jpeglsdec.c
+++ b/libavcodec/jpeglsdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 #include "avcodec.h"
 #include "get_bits.h"
-#include "golomb_legacy.h"
+#include "golomb.h"
 #include "internal.h"
 #include "mathops.h"
 #include "mjpeg.h"
@@ -40,7 +40,7 @@
  * (or test broken JPEG-LS decoder) and slow down ordinary decoding a bit.
  *
  * There is no Golomb code with length >= 32 bits possible, so check and
- * avoid situation of 32 zeros, Libav Golomb decoder is painfully slow
+ * avoid situation of 32 zeros, FFmpeg Golomb decoder is painfully slow
  * on this errors.
  */
 //#define JLS_BROKEN
@@ -51,27 +51,87 @@
 int ff_jpegls_decode_lse(MJpegDecodeContext *s)
 {
     int id;
+    int tid, wt, maxtab, i, j;
 
-    skip_bits(&s->gb, 16);  /* length: FIXME: verify field validity */
+    int len = get_bits(&s->gb, 16);
     id = get_bits(&s->gb, 8);
 
     switch (id) {
     case 1:
+        if (len < 13)
+            return AVERROR_INVALIDDATA;
+
         s->maxval = get_bits(&s->gb, 16);
         s->t1     = get_bits(&s->gb, 16);
         s->t2     = get_bits(&s->gb, 16);
         s->t3     = get_bits(&s->gb, 16);
         s->reset  = get_bits(&s->gb, 16);
 
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
+            av_log(s->avctx, AV_LOG_DEBUG, "Coding parameters maxval:%d T1:%d T2:%d T3:%d reset:%d\n",
+                   s->maxval, s->t1, s->t2, s->t3, s->reset);
+        }
+
 //        ff_jpegls_reset_coding_parameters(s, 0);
         //FIXME quant table?
         break;
     case 2:
+        s->palette_index = 0;
     case 3:
-        av_log(s->avctx, AV_LOG_ERROR, "palette not supported\n");
-        return AVERROR(ENOSYS);
+        tid= get_bits(&s->gb, 8);
+        wt = get_bits(&s->gb, 8);
+
+        if (len < 5)
+            return AVERROR_INVALIDDATA;
+
+        if (wt < 1 || wt > MAX_COMPONENTS) {
+            avpriv_request_sample(s->avctx, "wt %d", wt);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        if (!s->maxval)
+            maxtab = 255;
+        else if ((5 + wt*(s->maxval+1)) < 65535)
+            maxtab = s->maxval;
+        else
+            maxtab = 65530/wt - 1;
+
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
+            av_log(s->avctx, AV_LOG_DEBUG, "LSE palette %d tid:%d wt:%d maxtab:%d\n", id, tid, wt, maxtab);
+        }
+        if (maxtab >= 256) {
+            avpriv_request_sample(s->avctx, ">8bit palette");
+            return AVERROR_PATCHWELCOME;
+        }
+        maxtab = FFMIN(maxtab, (len - 5) / wt + s->palette_index);
+
+        if (s->palette_index > maxtab)
+            return AVERROR_INVALIDDATA;
+
+        if ((s->avctx->pix_fmt == AV_PIX_FMT_GRAY8 || s->avctx->pix_fmt == AV_PIX_FMT_PAL8) &&
+            (s->picture_ptr->format == AV_PIX_FMT_GRAY8 || s->picture_ptr->format == AV_PIX_FMT_PAL8)) {
+            uint32_t *pal = (uint32_t *)s->picture_ptr->data[1];
+            int shift = 0;
+
+            if (s->avctx->bits_per_raw_sample > 0 && s->avctx->bits_per_raw_sample < 8) {
+                maxtab = FFMIN(maxtab, (1<<s->avctx->bits_per_raw_sample)-1);
+                shift = 8 - s->avctx->bits_per_raw_sample;
+            }
+
+            s->picture_ptr->format =
+            s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            for (i=s->palette_index; i<=maxtab; i++) {
+                uint8_t k = i << shift;
+                pal[k] = 0;
+                for (j=0; j<wt; j++) {
+                    pal[k] |= get_bits(&s->gb, 8) << (8*(wt-j-1));
+                }
+            }
+            s->palette_index = i;
+        }
+        break;
     case 4:
-        av_log(s->avctx, AV_LOG_ERROR, "oversize image not supported\n");
+        avpriv_request_sample(s->avctx, "oversize image");
         return AVERROR(ENOSYS);
     default:
         av_log(s->avctx, AV_LOG_ERROR, "invalid id %d\n", id);
@@ -149,6 +209,8 @@ static inline int ls_get_code_runterm(GetBitContext *gb, JLSState *state,
         ret = ret >> 1;
     }
 
+    if(FFABS(ret) > 0xFFFF)
+        return -0x10000;
     /* update state */
     state->A[Q] += FFABS(ret) - RItype;
     ret         *= state->twonear;
@@ -171,6 +233,9 @@ static inline void ls_decode_line(JLSState *state, MJpegDecodeContext *s,
     while (x < w) {
         int err, pred;
 
+        if (get_bits_left(&s->gb) <= 0)
+            return;
+
         /* compute gradients */
         Ra = x ? R(dst, x - stride) : R(last, x);
         Rb = R(last, x);
@@ -208,11 +273,20 @@ static inline void ls_decode_line(JLSState *state, MJpegDecodeContext *s,
             r = ff_log2_run[state->run_index[comp]];
             if (r)
                 r = get_bits_long(&s->gb, r);
+            if (x + r * stride > w) {
+                r = (w - x) / stride;
+            }
             for (i = 0; i < r; i++) {
                 W(dst, x, Ra);
                 x += stride;
             }
 
+            if (x >= w) {
+                av_log(NULL, AV_LOG_ERROR, "run overflow\n");
+                av_assert0(x <= w);
+                return;
+            }
+
             /* decode run termination value */
             Rb     = R(last, x);
             RItype = (FFABS(Ra - Rb) <= state->near) ? 1 : 0;
@@ -304,21 +378,32 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
     else
         shift = point_transform + (16 - s->bits);
 
-    ff_dlog(s->avctx,
-            "JPEG-LS params: %ix%i NEAR=%i MV=%i T(%i,%i,%i) "
-            "RESET=%i, LIMIT=%i, qbpp=%i, RANGE=%i\n",
-            s->width, s->height, state->near, state->maxval,
-            state->T1, state->T2, state->T3,
-            state->reset, state->limit, state->qbpp, state->range);
-    ff_dlog(s->avctx, "JPEG params: ILV=%i Pt=%i BPP=%i, scan = %i\n",
-            ilv, point_transform, s->bits, s->cur_scan);
+    if (shift >= 16) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "JPEG-LS params: %ix%i NEAR=%i MV=%i T(%i,%i,%i) "
+               "RESET=%i, LIMIT=%i, qbpp=%i, RANGE=%i\n",
+                s->width, s->height, state->near, state->maxval,
+                state->T1, state->T2, state->T3,
+                state->reset, state->limit, state->qbpp, state->range);
+        av_log(s->avctx, AV_LOG_DEBUG, "JPEG params: ILV=%i Pt=%i BPP=%i, scan = %i\n",
+                ilv, point_transform, s->bits, s->cur_scan);
+    }
+    if (get_bits_left(&s->gb) < s->height) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
     if (ilv == 0) { /* separate planes */
         if (s->cur_scan > s->nb_components) {
             ret = AVERROR_INVALIDDATA;
             goto end;
         }
-        off    = s->cur_scan - 1;
         stride = (s->nb_components > 1) ? 3 : 1;
+        off    = av_clip(s->cur_scan - 1, 0, stride - 1);
         width  = s->width * stride;
         cur   += off;
         for (i = 0; i < s->height; i++) {
@@ -340,12 +425,13 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
     } else if (ilv == 1) { /* line interleaving */
         int j;
         int Rc[3] = { 0, 0, 0 };
+        stride = (s->nb_components > 1) ? 3 : 1;
         memset(cur, 0, s->picture_ptr->linesize[0]);
-        width = s->width * 3;
+        width = s->width * stride;
         for (i = 0; i < s->height; i++) {
-            for (j = 0; j < 3; j++) {
+            for (j = 0; j < stride; j++) {
                 ls_decode_line(state, s, last + j, cur + j,
-                               Rc[j], width, 3, j, 8);
+                               Rc[j], width, stride, j, 8);
                 Rc[j] = last[j];
 
                 if (s->restart_interval && !--s->restart_count) {
@@ -360,6 +446,57 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
         avpriv_report_missing_feature(s->avctx, "Sample interleaved images");
         ret = AVERROR_PATCHWELCOME;
         goto end;
+    } else { /* unknown interleaving */
+        avpriv_report_missing_feature(s->avctx, "Unknown interleaved images");
+        ret = AVERROR_PATCHWELCOME;
+        goto end;
+    }
+
+    if (s->xfrm && s->nb_components == 3) {
+        int x, w;
+
+        w = s->width * s->nb_components;
+
+        if (s->bits <= 8) {
+            uint8_t *src = s->picture_ptr->data[0];
+
+            for (i = 0; i < s->height; i++) {
+                switch(s->xfrm) {
+                case 1:
+                    for (x = off; x < w; x += 3) {
+                        src[x  ] += src[x+1] + 128;
+                        src[x+2] += src[x+1] + 128;
+                    }
+                    break;
+                case 2:
+                    for (x = off; x < w; x += 3) {
+                        src[x  ] += src[x+1] + 128;
+                        src[x+2] += ((src[x  ] + src[x+1])>>1) + 128;
+                    }
+                    break;
+                case 3:
+                    for (x = off; x < w; x += 3) {
+                        int g = src[x+0] - ((src[x+2]+src[x+1])>>2) + 64;
+                        src[x+0] = src[x+2] + g + 128;
+                        src[x+2] = src[x+1] + g + 128;
+                        src[x+1] = g;
+                    }
+                    break;
+                case 4:
+                    for (x = off; x < w; x += 3) {
+                        int r    = src[x+0] - ((                       359 * (src[x+2]-128) + 490) >> 8);
+                        int g    = src[x+0] - (( 88 * (src[x+1]-128) - 183 * (src[x+2]-128) +  30) >> 8);
+                        int b    = src[x+0] + ((454 * (src[x+1]-128)                        + 574) >> 8);
+                        src[x+0] = av_clip_uint8(r);
+                        src[x+1] = av_clip_uint8(g);
+                        src[x+2] = av_clip_uint8(b);
+                    }
+                    break;
+                }
+                src += s->picture_ptr->linesize[0];
+            }
+        }else
+            avpriv_report_missing_feature(s->avctx, "16bit xfrm");
     }
 
     if (shift) { /* we need to do point transform or normalize samples */
diff --git a/libavcodec/jpeglsdec.h b/libavcodec/jpeglsdec.h
index d60a87b..0cafaba 100644
--- a/libavcodec/jpeglsdec.h
+++ b/libavcodec/jpeglsdec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jpeglsenc.c b/libavcodec/jpeglsenc.c
index fb3c69f..1208cda 100644
--- a/libavcodec/jpeglsenc.c
+++ b/libavcodec/jpeglsenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,8 @@
 
 #include "avcodec.h"
 #include "get_bits.h"
-#include "golomb_legacy.h"
+#include "put_bits.h"
+#include "golomb.h"
 #include "internal.h"
 #include "mathops.h"
 #include "mjpeg.h"
@@ -263,7 +264,7 @@ static int encode_picture_ls(AVCodecContext *avctx, AVPacket *pkt,
     uint8_t *zero = NULL;
     uint8_t *cur  = NULL;
     uint8_t *last = NULL;
-    JLSState *state;
+    JLSState *state = NULL;
     int i, size, ret;
     int comps;
 
@@ -280,11 +281,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else
         comps = 3;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height * comps * 4 +
-                               AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width  *avctx->height * comps * 4 +
+                                AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     buf2 = av_malloc(pkt->size);
     if (!buf2)
@@ -330,7 +329,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     ls_store_lse(state, &pb);
 
-    zero = last = av_mallocz(p->linesize[0]);
+    zero = last = av_mallocz(FFABS(p->linesize[0]));
     if (!zero)
         goto memfail;
 
@@ -472,6 +471,7 @@ AVCodec ff_jpegls_encoder = {
     .priv_data_size = sizeof(JPEGLSContext),
     .priv_class     = &jpegls_class,
     .init           = encode_init_ls,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .encode2        = encode_picture_ls,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_BGR24, AV_PIX_FMT_RGB24,
diff --git a/libavcodec/jpegtables.c b/libavcodec/jpegtables.c
index ce2bae2..cbe5523 100644
--- a/libavcodec/jpegtables.c
+++ b/libavcodec/jpegtables.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-const unsigned char std_luminance_quant_tbl[64] = {
+static const unsigned char std_luminance_quant_tbl[64] = {
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
     14,  13,  16,  24,  40,  57,  69,  56,
@@ -48,7 +48,7 @@ const unsigned char std_luminance_quant_tbl[64] = {
     49,  64,  78,  87, 103, 121, 120, 101,
     72,  92,  95,  98, 112, 100, 103,  99
 };
-const unsigned char std_chrominance_quant_tbl[64] = {
+static const unsigned char std_chrominance_quant_tbl[64] = {
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
     24,  26,  56,  99,  99,  99,  99,  99,
diff --git a/libavcodec/jpegtables.h b/libavcodec/jpegtables.h
index 44c2aca..aa38df4 100644
--- a/libavcodec/jpegtables.h
+++ b/libavcodec/jpegtables.h
@@ -1,20 +1,20 @@
 /*
  * JPEG-related tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jrevdct.c b/libavcodec/jrevdct.c
index 808f583..a1a0f57 100644
--- a/libavcodec/jrevdct.c
+++ b/libavcodec/jrevdct.c
@@ -63,6 +63,7 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 #include "dct.h"
 #include "idctdsp.h"
@@ -234,7 +235,7 @@ void ff_j_rev_dct(DCTBLOCK data)
      * row DCT calculations can be simplified this way.
      */
 
-    register int *idataptr = (int*)dataptr;
+    register uint8_t *idataptr = (uint8_t*)dataptr;
 
     /* WARNING: we do the same permutation as MMX idct to simplify the
        video core */
@@ -254,10 +255,10 @@ void ff_j_rev_dct(DCTBLOCK data)
           int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
           register int v = (dcval & 0xffff) | ((dcval * (1 << 16)) & 0xffff0000);
 
-          idataptr[0] = v;
-          idataptr[1] = v;
-          idataptr[2] = v;
-          idataptr[3] = v;
+          AV_WN32A(&idataptr[ 0], v);
+          AV_WN32A(&idataptr[ 4], v);
+          AV_WN32A(&idataptr[ 8], v);
+          AV_WN32A(&idataptr[12], v);
       }
 
       dataptr += DCTSIZE;       /* advance pointer to next row */
@@ -943,14 +944,227 @@ void ff_j_rev_dct(DCTBLOCK data)
   }
 }
 
+#undef DCTSIZE
+#define DCTSIZE 4
+#define DCTSTRIDE 8
+
+void ff_j_rev_dct4(DCTBLOCK data)
+{
+  int32_t tmp0, tmp1, tmp2, tmp3;
+  int32_t tmp10, tmp11, tmp12, tmp13;
+  int32_t z1;
+  int32_t d0, d2, d4, d6;
+  register int16_t *dataptr;
+  int rowctr;
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  data[0] += 4;
+
+  dataptr = data;
+
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any row in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * row DCT calculations can be simplified this way.
+     */
+
+    register uint8_t *idataptr = (uint8_t*)dataptr;
+
+    d0 = dataptr[0];
+    d2 = dataptr[1];
+    d4 = dataptr[2];
+    d6 = dataptr[3];
+
+    if ((d2 | d4 | d6) == 0) {
+      /* AC terms all zero */
+      if (d0) {
+          /* Compute a 32 bit value to assign. */
+          int16_t dcval = (int16_t) (d0 << PASS1_BITS);
+          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
+
+          AV_WN32A(&idataptr[0], v);
+          AV_WN32A(&idataptr[4], v);
+      }
+
+      dataptr += DCTSTRIDE;     /* advance pointer to next row */
+      continue;
+    }
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
+                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
+                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            }
+    } else {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
+                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
+                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
+                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+            }
+      }
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSTRIDE;       /* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  dataptr = data;
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Columns of zeroes can be exploited in the same way as we did with rows.
+     * However, the row calculation has created many nonzero AC terms, so the
+     * simplification applies less often (typically 5% to 10% of the time).
+     * On machines with very fast multiplication, it's possible that the
+     * test takes more time than it's worth.  In that case this section
+     * may be commented out.
+     */
+
+    d0 = dataptr[DCTSTRIDE*0];
+    d2 = dataptr[DCTSTRIDE*1];
+    d4 = dataptr[DCTSTRIDE*2];
+    d6 = dataptr[DCTSTRIDE*3];
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
+                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
+                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            }
+    } else {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
+                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
+                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
+                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+            }
+    }
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
+
+    dataptr++;                  /* advance pointer to next column */
+  }
+}
+
+void ff_j_rev_dct2(DCTBLOCK data){
+  int d00, d01, d10, d11;
+
+  data[0] += 4;
+  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
+  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
+  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
+  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
+
+  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
+  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
+  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
+  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
+}
+
+void ff_j_rev_dct1(DCTBLOCK data){
+  data[0] = (data[0] + 4)>>3;
+}
+
+#undef FIX
+#undef CONST_BITS
+
 void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
-    ff_put_pixels_clamped(block, dest, line_size);
+    ff_put_pixels_clamped_c(block, dest, line_size);
 }
 
 void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
-    ff_add_pixels_clamped(block, dest, line_size);
+    ff_add_pixels_clamped_c(block, dest, line_size);
 }
diff --git a/libavcodec/jvdec.c b/libavcodec/jvdec.c
index 3a92b58..4337d56 100644
--- a/libavcodec/jvdec.c
+++ b/libavcodec/jvdec.c
@@ -2,20 +2,20 @@
  * Bitmap Brothers JV video decoder
  * Copyright (c) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,8 @@
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
+#include "get_bits.h"
 #include "internal.h"
 
 typedef struct JvContext {
@@ -55,91 +55,91 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
-    ff_blockdsp_init(&s->bdsp);
+    ff_blockdsp_init(&s->bdsp, avctx);
     return 0;
 }
 
 /**
  * Decode 2x2 block
  */
-static inline void decode2x2(BitstreamContext *bc, uint8_t *dst, int linesize)
+static inline void decode2x2(GetBitContext *gb, uint8_t *dst, int linesize)
 {
     int i, j, v[2];
 
-    switch (bitstream_read(bc, 2)) {
+    switch (get_bits(gb, 2)) {
     case 1:
-        v[0] = bitstream_read(bc, 8);
+        v[0] = get_bits(gb, 8);
         for (j = 0; j < 2; j++)
             memset(dst + j * linesize, v[0], 2);
         break;
     case 2:
-        v[0] = bitstream_read(bc, 8);
-        v[1] = bitstream_read(bc, 8);
+        v[0] = get_bits(gb, 8);
+        v[1] = get_bits(gb, 8);
         for (j = 0; j < 2; j++)
             for (i = 0; i < 2; i++)
-                dst[j * linesize + i] = v[bitstream_read_bit(bc)];
+                dst[j * linesize + i] = v[get_bits1(gb)];
         break;
     case 3:
         for (j = 0; j < 2; j++)
             for (i = 0; i < 2; i++)
-                dst[j * linesize + i] = bitstream_read(bc, 8);
+                dst[j * linesize + i] = get_bits(gb, 8);
     }
 }
 
 /**
  * Decode 4x4 block
  */
-static inline void decode4x4(BitstreamContext *bc, uint8_t *dst, int linesize)
+static inline void decode4x4(GetBitContext *gb, uint8_t *dst, int linesize)
 {
     int i, j, v[2];
 
-    switch (bitstream_read(bc, 2)) {
+    switch (get_bits(gb, 2)) {
     case 1:
-        v[0] = bitstream_read(bc, 8);
+        v[0] = get_bits(gb, 8);
         for (j = 0; j < 4; j++)
             memset(dst + j * linesize, v[0], 4);
         break;
     case 2:
-        v[0] = bitstream_read(bc, 8);
-        v[1] = bitstream_read(bc, 8);
+        v[0] = get_bits(gb, 8);
+        v[1] = get_bits(gb, 8);
         for (j = 2; j >= 0; j -= 2) {
             for (i = 0; i < 4; i++)
-                dst[j * linesize + i] = v[bitstream_read_bit(bc)];
+                dst[j * linesize + i] = v[get_bits1(gb)];
             for (i = 0; i < 4; i++)
-                dst[(j + 1) * linesize + i] = v[bitstream_read_bit(bc)];
+                dst[(j + 1) * linesize + i] = v[get_bits1(gb)];
         }
         break;
     case 3:
         for (j = 0; j < 4; j += 2)
             for (i = 0; i < 4; i += 2)
-                decode2x2(bc, dst + j * linesize + i, linesize);
+                decode2x2(gb, dst + j * linesize + i, linesize);
     }
 }
 
 /**
  * Decode 8x8 block
  */
-static inline void decode8x8(BitstreamContext *bc, uint8_t *dst, int linesize,
+static inline void decode8x8(GetBitContext *gb, uint8_t *dst, int linesize,
                              BlockDSPContext *bdsp)
 {
     int i, j, v[2];
 
-    switch (bitstream_read(bc, 2)) {
+    switch (get_bits(gb, 2)) {
     case 1:
-        v[0] = bitstream_read(bc, 8);
+        v[0] = get_bits(gb, 8);
         bdsp->fill_block_tab[1](dst, v[0], linesize, 8);
         break;
     case 2:
-        v[0] = bitstream_read(bc, 8);
-        v[1] = bitstream_read(bc, 8);
+        v[0] = get_bits(gb, 8);
+        v[1] = get_bits(gb, 8);
         for (j = 7; j >= 0; j--)
             for (i = 0; i < 8; i++)
-                dst[j * linesize + i] = v[bitstream_read_bit(bc)];
+                dst[j * linesize + i] = v[get_bits1(gb)];
         break;
     case 3:
         for (j = 0; j < 8; j += 4)
             for (i = 0; i < 8; i += 4)
-                decode4x4(bc, dst + j * linesize + i, linesize);
+                decode4x4(gb, dst + j * linesize + i, linesize);
     }
 }
 
@@ -147,39 +147,46 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
     JvContext *s = avctx->priv_data;
-    int buf_size = avpkt->size;
     const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = buf + buf_size;
+    const uint8_t *buf_end = buf + avpkt->size;
     int video_size, video_type, i, j, ret;
 
+    if (avpkt->size < 6)
+        return AVERROR_INVALIDDATA;
+
     video_size = AV_RL32(buf);
     video_type = buf[4];
     buf += 5;
 
     if (video_size) {
-        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return ret;
+        if (video_size < 0 || video_size > avpkt->size - 5) {
+            av_log(avctx, AV_LOG_ERROR, "video size %d invalid\n", video_size);
+            return AVERROR_INVALIDDATA;
         }
+        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+            return ret;
 
         if (video_type == 0 || video_type == 1) {
-            BitstreamContext bc;
-            bitstream_init8(&bc, buf, FFMIN(video_size, buf_end - buf));
+            GetBitContext gb;
+            init_get_bits(&gb, buf, 8 * video_size);
+
+            if (avctx->height/8 * (avctx->width/8) > 4 * video_size) {
+                av_log(avctx, AV_LOG_ERROR, "Insufficient input data for dimensions\n");
+                return AVERROR_INVALIDDATA;
+            }
 
             for (j = 0; j < avctx->height; j += 8)
                 for (i = 0; i < avctx->width; i += 8)
-                    decode8x8(&bc,
+                    decode8x8(&gb,
                               s->frame->data[0] + j * s->frame->linesize[0] + i,
                               s->frame->linesize[0], &s->bdsp);
 
             buf += video_size;
         } else if (video_type == 2) {
-            if (buf + 1 <= buf_end) {
-                int v = *buf++;
-                for (j = 0; j < avctx->height; j++)
-                    memset(s->frame->data[0] + j * s->frame->linesize[0],
-                           v, avctx->width);
-            }
+            int v = *buf++;
+            for (j = 0; j < avctx->height; j++)
+                memset(s->frame->data[0] + j * s->frame->linesize[0],
+                       v, avctx->width);
         } else {
             av_log(avctx, AV_LOG_WARNING,
                    "unsupported frame type %i\n", video_type);
@@ -187,9 +194,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
-    if (buf < buf_end) {
-        for (i = 0; i < AVPALETTE_COUNT && buf + 3 <= buf_end; i++) {
-            s->palette[i] = AV_RB24(buf) << 2;
+    if (buf_end - buf >= AVPALETTE_COUNT * 3) {
+        for (i = 0; i < AVPALETTE_COUNT; i++) {
+            uint32_t pal = AV_RB24(buf);
+            s->palette[i] = 0xFFU << 24 | pal << 2 | ((pal >> 4) & 0x30303);
             buf += 3;
         }
         s->palette_has_changed = 1;
@@ -207,7 +215,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         *got_frame = 1;
     }
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_close(AVCodecContext *avctx)
diff --git a/libavcodec/kbdwin.c b/libavcodec/kbdwin.c
index 1b7313d..bf32aeb 100644
--- a/libavcodec/kbdwin.c
+++ b/libavcodec/kbdwin.c
@@ -1,22 +1,22 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
+#include "libavutil/avassert.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/attributes.h"
 #include "kbdwin.h"
@@ -30,7 +30,7 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
    double local_window[FF_KBD_WINDOW_MAX];
    double alpha2 = (alpha * M_PI / n) * (alpha * M_PI / n);
 
-   assert(n <= FF_KBD_WINDOW_MAX);
+   av_assert0(n <= FF_KBD_WINDOW_MAX);
 
    for (i = 0; i < n; i++) {
        tmp = i * (n - i) * alpha2;
@@ -45,3 +45,13 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
    for (i = 0; i < n; i++)
        window[i] = sqrt(local_window[i] / sum);
 }
+
+av_cold void ff_kbd_window_init_fixed(int32_t *window, float alpha, int n)
+{
+    int i;
+    float local_window[FF_KBD_WINDOW_MAX];
+
+    ff_kbd_window_init(local_window, alpha, n);
+    for (i = 0; i < n; i++)
+        window[i] = (int)floor(2147483647.0 * local_window[i] + 0.5);
+}
diff --git a/libavcodec/kbdwin.h b/libavcodec/kbdwin.h
index 89b569a..4185c42 100644
--- a/libavcodec/kbdwin.h
+++ b/libavcodec/kbdwin.h
@@ -1,24 +1,26 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_KBDWIN_H
 #define AVCODEC_KBDWIN_H
 
+#include <stdint.h>
+
 /**
  * Maximum window size for ff_kbd_window_init.
  */
@@ -31,5 +33,6 @@
  * @param   n       size of half window, max FF_KBD_WINDOW_MAX
  */
 void ff_kbd_window_init(float *window, float alpha, int n);
+void ff_kbd_window_init_fixed(int32_t *window, float alpha, int n);
 
 #endif /* AVCODEC_KBDWIN_H */
diff --git a/libavcodec/kgv1dec.c b/libavcodec/kgv1dec.c
index 0bf322e..a6bd940 100644
--- a/libavcodec/kgv1dec.c
+++ b/libavcodec/kgv1dec.c
@@ -2,20 +2,20 @@
  * Kega Game Video (KGV1) decoder
  * Copyright (c) 2010 Daniel Verkamp
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,6 @@
 #include "internal.h"
 
 typedef struct KgvContext {
-    AVCodecContext *avctx;
     uint16_t *frame_buffer;
     uint16_t *last_frame_buffer;
 } KgvContext;
@@ -52,7 +51,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     const uint8_t *buf_end = buf + avpkt->size;
     KgvContext * const c = avctx->priv_data;
     int offsets[8];
-    uint16_t *out, *prev;
+    uint8_t *out, *prev;
     int outcnt = 0, maxcnt;
     int w, h, i, res;
 
@@ -63,6 +62,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     h = (buf[1] + 1) * 8;
     buf += 2;
 
+    if (avpkt->size < 2 + w*h / 513)
+        return AVERROR_INVALIDDATA;
+
     if (w != avctx->width || h != avctx->height) {
         av_freep(&c->frame_buffer);
         av_freep(&c->last_frame_buffer);
@@ -83,22 +85,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    out  = c->frame_buffer;
-    prev = c->last_frame_buffer;
+    out  = (uint8_t*)c->frame_buffer;
+    prev = (uint8_t*)c->last_frame_buffer;
 
     for (i = 0; i < 8; i++)
         offsets[i] = -1;
 
-    while (outcnt < maxcnt && buf_end - 2 > buf) {
+    while (outcnt < maxcnt && buf_end - 2 >= buf) {
         int code = AV_RL16(buf);
         buf += 2;
 
         if (!(code & 0x8000)) {
-            out[outcnt++] = code; // rgb555 pixel coded directly
+            AV_WN16A(&out[2 * outcnt], code); // rgb555 pixel coded directly
+            outcnt++;
         } else {
             int count;
-            int inp_off;
-            uint16_t *inp;
 
             if ((code & 0x6000) == 0x6000) {
                 // copy from previous frame
@@ -116,7 +117,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
                 start = (outcnt + offsets[oidx]) % maxcnt;
 
-                if (maxcnt - start < count)
+                if (maxcnt - start < count || maxcnt - outcnt < count)
                     break;
 
                 if (!prev) {
@@ -125,8 +126,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     break;
                 }
 
-                inp = prev;
-                inp_off = start;
+                memcpy(out + 2 * outcnt, prev + 2 * start, 2 * count);
             } else {
                 // copy from earlier in this frame
                 int offset = (code & 0x1FFF) + 1;
@@ -141,19 +141,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     count = 4 + *buf++;
                 }
 
-                if (outcnt < offset)
+                if (outcnt < offset || maxcnt - outcnt < count)
                     break;
 
-                inp = out;
-                inp_off = outcnt - offset;
-            }
-
-            if (maxcnt - outcnt < count)
-                break;
-
-            for (i = inp_off; i < count + inp_off; i++) {
-                out[outcnt++] = inp[i];
+                av_memcpy_backptr(out + 2 * outcnt, 2 * offset, 2 * count);
             }
+            outcnt += count;
         }
     }
 
@@ -172,9 +165,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    KgvContext * const c = avctx->priv_data;
-
-    c->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_RGB555;
 
     return 0;
diff --git a/libavcodec/kmvc.c b/libavcodec/kmvc.c
index ca6b79f..ffe6a14 100644
--- a/libavcodec/kmvc.c
+++ b/libavcodec/kmvc.c
@@ -2,20 +2,20 @@
  * KMVC decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -107,6 +107,10 @@ static int kmvc_decode_intra_8x8(KmvcContext * ctx, int w, int h)
                             val = bytestream2_get_byte(&ctx->g);
                             mx = val & 0xF;
                             my = val >> 4;
+                            if ((l0x-mx) + 320*(l0y-my) < 0 || (l0x-mx) + 320*(l0y-my) > 320*197 - 4) {
+                                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                return AVERROR_INVALIDDATA;
+                            }
                             for (j = 0; j < 16; j++)
                                 BLK(ctx->cur, l0x + (j & 3), l0y + (j >> 2)) =
                                     BLK(ctx->cur, l0x + (j & 3) - mx, l0y + (j >> 2) - my);
@@ -128,6 +132,10 @@ static int kmvc_decode_intra_8x8(KmvcContext * ctx, int w, int h)
                                     val = bytestream2_get_byte(&ctx->g);
                                     mx = val & 0xF;
                                     my = val >> 4;
+                                    if ((l1x-mx) + 320*(l1y-my) < 0 || (l1x-mx) + 320*(l1y-my) > 320*199 - 2) {
+                                        av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
                                     BLK(ctx->cur, l1x, l1y) = BLK(ctx->cur, l1x - mx, l1y - my);
                                     BLK(ctx->cur, l1x + 1, l1y) =
                                         BLK(ctx->cur, l1x + 1 - mx, l1y - my);
@@ -199,6 +207,10 @@ static int kmvc_decode_inter_8x8(KmvcContext * ctx, int w, int h)
                             val = bytestream2_get_byte(&ctx->g);
                             mx = (val & 0xF) - 8;
                             my = (val >> 4) - 8;
+                            if ((l0x+mx) + 320*(l0y+my) < 0 || (l0x+mx) + 320*(l0y+my) > 320*197 - 4) {
+                                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                return AVERROR_INVALIDDATA;
+                            }
                             for (j = 0; j < 16; j++)
                                 BLK(ctx->cur, l0x + (j & 3), l0y + (j >> 2)) =
                                     BLK(ctx->prev, l0x + (j & 3) + mx, l0y + (j >> 2) + my);
@@ -220,6 +232,10 @@ static int kmvc_decode_inter_8x8(KmvcContext * ctx, int w, int h)
                                     val = bytestream2_get_byte(&ctx->g);
                                     mx = (val & 0xF) - 8;
                                     my = (val >> 4) - 8;
+                                    if ((l1x+mx) + 320*(l1y+my) < 0 || (l1x+mx) + 320*(l1y+my) > 320*199 - 2) {
+                                        av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
                                     BLK(ctx->cur, l1x, l1y) = BLK(ctx->prev, l1x + mx, l1y + my);
                                     BLK(ctx->cur, l1x + 1, l1y) =
                                         BLK(ctx->prev, l1x + 1 + mx, l1y + my);
@@ -252,14 +268,13 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
     int i, ret;
     int header;
     int blocksize;
-    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+    int pal_size;
+    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &pal_size);
 
     bytestream2_init(&ctx->g, avpkt->data, avpkt->size);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     header = bytestream2_get_byte(&ctx->g);
 
@@ -267,7 +282,7 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
     if (bytestream2_peek_byte(&ctx->g) == 127) {
         bytestream2_skip(&ctx->g, 3);
         for (i = 0; i < 127; i++) {
-            ctx->pal[i + (header & 0x81)] = bytestream2_get_be24(&ctx->g);
+            ctx->pal[i + (header & 0x81)] = 0xFFU << 24 | bytestream2_get_be24(&ctx->g);
             bytestream2_skip(&ctx->g, 1);
         }
         bytestream2_seek(&ctx->g, -127 * 4 - 3, SEEK_CUR);
@@ -285,13 +300,15 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
         frame->palette_has_changed = 1;
         // palette starts from index 1 and has 127 entries
         for (i = 1; i <= ctx->palsize; i++) {
-            ctx->pal[i] = bytestream2_get_be24(&ctx->g);
+            ctx->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&ctx->g);
         }
     }
 
-    if (pal) {
+    if (pal && pal_size == AVPALETTE_SIZE) {
         frame->palette_has_changed = 1;
         memcpy(ctx->pal, pal, AVPALETTE_SIZE);
+    } else if (pal) {
+        av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", pal_size);
     }
 
     if (ctx->setpal) {
@@ -369,7 +386,7 @@ static av_cold int decode_init(AVCodecContext * avctx)
     c->prev = c->frm1;
 
     for (i = 0; i < 256; i++) {
-        c->pal[i] = i * 0x10101;
+        c->pal[i] = 0xFFU << 24 | i * 0x10101;
     }
 
     if (avctx->extradata_size < 12) {
@@ -378,7 +395,8 @@ static av_cold int decode_init(AVCodecContext * avctx)
         c->palsize = 127;
     } else {
         c->palsize = AV_RL16(avctx->extradata + 10);
-        if (c->palsize >= MAX_PALSIZE) {
+        if (c->palsize >= (unsigned)MAX_PALSIZE) {
+            c->palsize = 127;
             av_log(avctx, AV_LOG_ERROR, "KMVC palette too large\n");
             return AVERROR_INVALIDDATA;
         }
diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index 95e6aba..59169be 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -2,20 +2,20 @@
  * Lagarith lossless decoder
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,10 @@
 #include <inttypes.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "mathops.h"
-#include "huffyuvdsp.h"
 #include "lagarithrac.h"
+#include "lossless_videodsp.h"
 #include "thread.h"
 
 enum LagarithFrameType {
@@ -50,12 +50,9 @@ enum LagarithFrameType {
 
 typedef struct LagarithContext {
     AVCodecContext *avctx;
-    HuffYUVDSPContext hdsp;
+    LLVidDSPContext llviddsp;
     int zeros;                  /**< number of consecutive zero bytes encountered */
     int zeros_rem;              /**< number of zero bytes remaining to output */
-    uint8_t *rgb_planes;
-    int      rgb_planes_allocated;
-    int rgb_stride;
 } LagarithContext;
 
 /**
@@ -91,17 +88,17 @@ static uint32_t softfloat_mul(uint32_t x, uint64_t mantissa)
     uint64_t h = x * (mantissa >> 32);
     h += l >> 32;
     l &= 0xffffffff;
-    l += 1 << av_log2(h >> 21);
+    l += 1LL << av_log2(h >> 21);
     h += l >> 32;
     return h >> 20;
 }
 
 static uint8_t lag_calc_zero_run(int8_t x)
 {
-    return (x << 1) ^ (x >> 7);
+    return (x * 2) ^ (x >> 7);
 }
 
-static int lag_decode_prob(BitstreamContext *bc, uint32_t *value)
+static int lag_decode_prob(GetBitContext *gb, uint32_t *value)
 {
     static const uint8_t series[] = { 1, 2, 3, 5, 8, 13, 21 };
     int i;
@@ -114,7 +111,7 @@ static int lag_decode_prob(BitstreamContext *bc, uint32_t *value)
         if (prevbit && bit)
             break;
         prevbit = bit;
-        bit = bitstream_read_bit(bc);
+        bit = get_bits1(gb);
         if (bit && !prevbit)
             bits += series[i];
     }
@@ -127,26 +124,27 @@ static int lag_decode_prob(BitstreamContext *bc, uint32_t *value)
         return 0;
     }
 
-    val  = bitstream_read(bc, bits);
-    val |= 1 << bits;
+    val  = get_bits_long(gb, bits);
+    val |= 1U << bits;
 
     *value = val - 1;
 
     return 0;
 }
 
-static int lag_read_prob_header(lag_rac *rac, BitstreamContext *bc)
+static int lag_read_prob_header(lag_rac *rac, GetBitContext *gb)
 {
     int i, j, scale_factor;
     unsigned prob, cumulative_target;
     unsigned cumul_prob = 0;
     unsigned scaled_cumul_prob = 0;
+    int nnz = 0;
 
     rac->prob[0] = 0;
     rac->prob[257] = UINT_MAX;
     /* Read probabilities from bitstream */
     for (i = 1; i < 257; i++) {
-        if (lag_decode_prob(bc, &rac->prob[i]) < 0) {
+        if (lag_decode_prob(gb, &rac->prob[i]) < 0) {
             av_log(rac->avctx, AV_LOG_ERROR, "Invalid probability encountered.\n");
             return -1;
         }
@@ -156,14 +154,16 @@ static int lag_read_prob_header(lag_rac *rac, BitstreamContext *bc)
         }
         cumul_prob += rac->prob[i];
         if (!rac->prob[i]) {
-            if (lag_decode_prob(bc, &prob)) {
+            if (lag_decode_prob(gb, &prob)) {
                 av_log(rac->avctx, AV_LOG_ERROR, "Invalid probability run encountered.\n");
                 return -1;
             }
-            if (prob > 257 - i)
-                prob = 257 - i;
+            if (prob > 256 - i)
+                prob = 256 - i;
             for (j = 0; j < prob; j++)
                 rac->prob[++i] = 0;
+        }else {
+            nnz++;
         }
     }
 
@@ -172,18 +172,32 @@ static int lag_read_prob_header(lag_rac *rac, BitstreamContext *bc)
         return -1;
     }
 
+    if (nnz == 1 && (show_bits_long(gb, 32) & 0xFFFFFF)) {
+        return AVERROR_INVALIDDATA;
+    }
+
     /* Scale probabilities so cumulative probability is an even power of 2. */
     scale_factor = av_log2(cumul_prob);
 
     if (cumul_prob & (cumul_prob - 1)) {
         uint64_t mul = softfloat_reciprocal(cumul_prob);
-        for (i = 1; i < 257; i++) {
+        for (i = 1; i <= 128; i++) {
+            rac->prob[i] = softfloat_mul(rac->prob[i], mul);
+            scaled_cumul_prob += rac->prob[i];
+        }
+        if (scaled_cumul_prob <= 0) {
+            av_log(rac->avctx, AV_LOG_ERROR, "Scaled probabilities invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (; i < 257; i++) {
             rac->prob[i] = softfloat_mul(rac->prob[i], mul);
             scaled_cumul_prob += rac->prob[i];
         }
 
         scale_factor++;
-        cumulative_target = 1 << scale_factor;
+        if (scale_factor >= 32U)
+            return AVERROR_INVALIDDATA;
+        cumulative_target = 1U << scale_factor;
 
         if (scaled_cumul_prob > cumulative_target) {
             av_log(rac->avctx, AV_LOG_ERROR,
@@ -251,11 +265,8 @@ static void lag_pred_line(LagarithContext *l, uint8_t *buf,
     int L, TL;
 
     if (!line) {
-        int i, align_width = (width - 1) & ~31;
         /* Left prediction only for first line */
-        L = l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
-        for (i = align_width + 1; i < width; i++)
-            buf[i] += buf[i - 1];
+        L = l->llviddsp.add_left_pred(buf, buf, width, 0);
     } else {
         /* Left pixel is actually prev_row[width] */
         L = buf[width - stride - 1];
@@ -281,18 +292,12 @@ static void lag_pred_line_yuy2(LagarithContext *l, uint8_t *buf,
     int L, TL;
 
     if (!line) {
-        int i, align_width;
-        if (is_luma) {
-            buf++;
-            width--;
-        }
-
-        align_width = (width - 1) & ~31;
-        l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
-
-        for (i = align_width + 1; i < width; i++)
-            buf[i] += buf[i - 1];
-
+        L= buf[0];
+        if (is_luma)
+            buf[0] = 0;
+        l->llviddsp.add_left_pred(buf, buf, width, 0);
+        if (is_luma)
+            buf[0] = L;
         return;
     }
     if (line == 1) {
@@ -313,7 +318,7 @@ static void lag_pred_line_yuy2(LagarithContext *l, uint8_t *buf,
     } else {
         TL = buf[width - (2 * stride) - 1];
         L  = buf[width - stride - 1];
-        l->hdsp.add_hfyu_median_pred(buf, buf - stride, buf, width, &L, &TL);
+        l->llviddsp.add_median_pred(buf, buf - stride, buf, width, &L, &TL);
     }
 }
 
@@ -371,6 +376,10 @@ static int lag_decode_zero_run_line(LagarithContext *l, uint8_t *dst,
     uint8_t mask2 = -(esc_count < 3);
     uint8_t *end = dst + (width - 2);
 
+    avpriv_request_sample(l->avctx, "zero_run_line");
+
+    memset(dst, 0, width);
+
 output_zeros:
     if (l->zeros_rem) {
         count = FFMIN(l->zeros_rem, width - i);
@@ -388,7 +397,7 @@ output_zeros:
         i = 0;
         while (!zero_run && dst + i < end) {
             i++;
-            if (src + i >= src_end)
+            if (i+2 >= src_end - src)
                 return AVERROR_INVALIDDATA;
             zero_run =
                 !(src[i] | (src[i + 1] & mask1) | (src[i + 2] & mask2));
@@ -408,7 +417,7 @@ output_zeros:
             dst += i;
         }
     }
-    return src_start - src;
+    return  src - src_start;
 }
 
 
@@ -421,31 +430,41 @@ static int lag_decode_arith_plane(LagarithContext *l, uint8_t *dst,
     int read = 0;
     uint32_t length;
     uint32_t offset = 1;
-    int esc_count = src[0];
-    BitstreamContext bc;
+    int esc_count;
+    GetBitContext gb;
     lag_rac rac;
     const uint8_t *src_end = src + src_size;
+    int ret;
 
     rac.avctx = l->avctx;
     l->zeros = 0;
 
+    if(src_size < 2)
+        return AVERROR_INVALIDDATA;
+
+    esc_count = src[0];
     if (esc_count < 4) {
         length = width * height;
+        if(src_size < 5)
+            return AVERROR_INVALIDDATA;
         if (esc_count && AV_RL32(src + 1) < length) {
             length = AV_RL32(src + 1);
             offset += 4;
         }
 
-        bitstream_init8(&bc, src + offset, src_size);
+        if ((ret = init_get_bits8(&gb, src + offset, src_size - offset)) < 0)
+            return ret;
 
-        if (lag_read_prob_header(&rac, &bc) < 0)
+        if (lag_read_prob_header(&rac, &gb) < 0)
             return -1;
 
-        ff_lag_rac_init(&rac, &bc, length - stride);
-
-        for (i = 0; i < height; i++)
+        ff_lag_rac_init(&rac, &gb, length - stride);
+        for (i = 0; i < height; i++) {
+            if (rac.overread > MAX_OVERREAD)
+                return AVERROR_INVALIDDATA;
             read += lag_decode_line(l, &rac, dst + (i * stride), width,
                                     stride, esc_count);
+        }
 
         if (read > length)
             av_log(l->avctx, AV_LOG_WARNING,
@@ -453,6 +472,8 @@ static int lag_decode_arith_plane(LagarithContext *l, uint8_t *dst,
                    length);
     } else if (esc_count < 8) {
         esc_count -= 4;
+        src ++;
+        src_size --;
         if (esc_count > 0) {
             /* Zero run coding only, no range coding. */
             for (i = 0; i < height; i++) {
@@ -513,17 +534,19 @@ static int lag_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_frame, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
+    unsigned int buf_size = avpkt->size;
     LagarithContext *l = avctx->priv_data;
     ThreadFrame frame = { .f = data };
     AVFrame *const p  = data;
-    uint8_t frametype = 0;
+    uint8_t frametype;
     uint32_t offset_gu = 0, offset_bv = 0, offset_ry = 9;
     uint32_t offs[4];
-    uint8_t *srcs[4], *dst;
+    uint8_t *srcs[4];
     int i, j, planes = 3;
+    int ret;
 
     p->key_frame = 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
 
     frametype = buf[0];
 
@@ -532,93 +555,96 @@ static int lag_decode_frame(AVCodecContext *avctx,
 
     switch (frametype) {
     case FRAME_SOLID_RGBA:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+    case FRAME_SOLID_GRAY:
+        if (frametype == FRAME_SOLID_GRAY)
+            if (avctx->bits_per_coded_sample == 24) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+                planes = 4;
+            }
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
+
+        if (frametype == FRAME_SOLID_RGBA) {
+            for (i = 0; i < avctx->height; i++) {
+                memset(p->data[0] + i * p->linesize[0], buf[2], avctx->width);
+                memset(p->data[1] + i * p->linesize[1], buf[1], avctx->width);
+                memset(p->data[2] + i * p->linesize[2], buf[3], avctx->width);
+                memset(p->data[3] + i * p->linesize[3], buf[4], avctx->width);
+            }
+        } else {
+            for (i = 0; i < avctx->height; i++) {
+                for (j = 0; j < planes; j++)
+                    memset(p->data[j] + i * p->linesize[j], buf[1], avctx->width);
+            }
+        }
+        break;
+    case FRAME_SOLID_COLOR:
+        if (avctx->bits_per_coded_sample == 24) {
+            avctx->pix_fmt = AV_PIX_FMT_GBRP;
+        } else {
+            avctx->pix_fmt = AV_PIX_FMT_GBRAP;
         }
 
-        dst = p->data[0];
-        for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++)
-                AV_WN32(dst + i * 4, offset_gu);
-            dst += p->linesize[0];
+        if ((ret = ff_thread_get_buffer(avctx, &frame,0)) < 0)
+            return ret;
+
+        for (i = 0; i < avctx->height; i++) {
+            memset(p->data[0] + i * p->linesize[0], buf[2], avctx->width);
+            memset(p->data[1] + i * p->linesize[1], buf[1], avctx->width);
+            memset(p->data[2] + i * p->linesize[2], buf[3], avctx->width);
+            if (avctx->pix_fmt == AV_PIX_FMT_GBRAP)
+                memset(p->data[3] + i * p->linesize[3], 0xFFu, avctx->width);
         }
         break;
     case FRAME_ARITH_RGBA:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP;
         planes = 4;
         offset_ry += 4;
         offs[3] = AV_RL32(buf + 9);
     case FRAME_ARITH_RGB24:
     case FRAME_U_RGB24:
         if (frametype == FRAME_ARITH_RGB24 || frametype == FRAME_U_RGB24)
-            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            avctx->pix_fmt = AV_PIX_FMT_GBRP;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         offs[0] = offset_bv;
         offs[1] = offset_gu;
         offs[2] = offset_ry;
 
-        l->rgb_stride = FFALIGN(avctx->width, 16);
-        av_fast_malloc(&l->rgb_planes, &l->rgb_planes_allocated,
-                       l->rgb_stride * avctx->height * planes + 1);
-        if (!l->rgb_planes) {
-            av_log(avctx, AV_LOG_ERROR, "cannot allocate temporary buffer\n");
-            return AVERROR(ENOMEM);
-        }
         for (i = 0; i < planes; i++)
-            srcs[i] = l->rgb_planes + (i + 1) * l->rgb_stride * avctx->height - l->rgb_stride;
-        if (offset_ry >= buf_size ||
-            offset_gu >= buf_size ||
-            offset_bv >= buf_size ||
-            (planes == 4 && offs[3] >= buf_size)) {
-            av_log(avctx, AV_LOG_ERROR,
-                    "Invalid frame offsets\n");
-            return AVERROR_INVALIDDATA;
-        }
+            srcs[i] = p->data[i] + (avctx->height - 1) * p->linesize[i];
+        for (i = 0; i < planes; i++)
+            if (buf_size <= offs[i]) {
+                av_log(avctx, AV_LOG_ERROR,
+                        "Invalid frame offsets\n");
+                return AVERROR_INVALIDDATA;
+            }
+
         for (i = 0; i < planes; i++)
             lag_decode_arith_plane(l, srcs[i],
                                    avctx->width, avctx->height,
-                                   -l->rgb_stride, buf + offs[i],
+                                   -p->linesize[i], buf + offs[i],
                                    buf_size - offs[i]);
-        dst = p->data[0];
-        for (i = 0; i < planes; i++)
-            srcs[i] = l->rgb_planes + i * l->rgb_stride * avctx->height;
-        for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++) {
-                uint8_t r, g, b, a;
-                r = srcs[0][i];
-                g = srcs[1][i];
-                b = srcs[2][i];
-                r += g;
-                b += g;
-                if (frametype == FRAME_ARITH_RGBA) {
-                    a = srcs[3][i];
-                    AV_WN32(dst + i * 4, MKBETAG(a, r, g, b));
-                } else {
-                    dst[i * 3 + 0] = r;
-                    dst[i * 3 + 1] = g;
-                    dst[i * 3 + 2] = b;
-                }
-            }
-            dst += p->linesize[0];
-            for (i = 0; i < planes; i++)
-                srcs[i] += l->rgb_stride;
+        for (i = 0; i < avctx->height; i++) {
+            l->llviddsp.add_bytes(p->data[0] + i * p->linesize[0], p->data[1] + i * p->linesize[1], avctx->width);
+            l->llviddsp.add_bytes(p->data[2] + i * p->linesize[2], p->data[1] + i * p->linesize[1], avctx->width);
         }
+        FFSWAP(uint8_t*, p->data[0], p->data[1]);
+        FFSWAP(int, p->linesize[0], p->linesize[1]);
+        FFSWAP(uint8_t*, p->data[2], p->data[1]);
+        FFSWAP(int, p->linesize[2], p->linesize[1]);
         break;
     case FRAME_ARITH_YUY2:
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         if (offset_ry >= buf_size ||
             offset_gu >= buf_size ||
@@ -631,20 +657,18 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
+        lag_decode_arith_plane(l, p->data[1], (avctx->width + 1) / 2,
                                avctx->height, p->linesize[1],
                                buf + offset_gu, buf_size - offset_gu);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
+        lag_decode_arith_plane(l, p->data[2], (avctx->width + 1) / 2,
                                avctx->height, p->linesize[2],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     case FRAME_ARITH_YV12:
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         if (offset_ry >= buf_size ||
             offset_gu >= buf_size ||
@@ -657,17 +681,17 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
-                               avctx->height / 2, p->linesize[2],
+        lag_decode_arith_plane(l, p->data[2], (avctx->width + 1) / 2,
+                               (avctx->height + 1) / 2, p->linesize[2],
                                buf + offset_gu, buf_size - offset_gu);
-        lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
-                               avctx->height / 2, p->linesize[1],
+        lag_decode_arith_plane(l, p->data[1], (avctx->width + 1) / 2,
+                               (avctx->height + 1) / 2, p->linesize[1],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported Lagarith frame type: %#"PRIx8"\n", frametype);
-        return -1;
+        return AVERROR_PATCHWELCOME;
     }
 
     *got_frame = 1;
@@ -680,19 +704,20 @@ static av_cold int lag_decode_init(AVCodecContext *avctx)
     LagarithContext *l = avctx->priv_data;
     l->avctx = avctx;
 
-    ff_huffyuvdsp_init(&l->hdsp);
+    ff_llviddsp_init(&l->llviddsp);
 
     return 0;
 }
 
-static av_cold int lag_decode_end(AVCodecContext *avctx)
+#if HAVE_THREADS
+static av_cold int lag_decode_init_thread_copy(AVCodecContext *avctx)
 {
     LagarithContext *l = avctx->priv_data;
-
-    av_freep(&l->rgb_planes);
+    l->avctx = avctx;
 
     return 0;
 }
+#endif
 
 AVCodec ff_lagarith_decoder = {
     .name           = "lagarith",
@@ -701,7 +726,7 @@ AVCodec ff_lagarith_decoder = {
     .id             = AV_CODEC_ID_LAGARITH,
     .priv_data_size = sizeof(LagarithContext),
     .init           = lag_decode_init,
-    .close          = lag_decode_end,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(lag_decode_init_thread_copy),
     .decode         = lag_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/lagarithrac.c b/libavcodec/lagarithrac.c
index 7441dc7..cdda67f 100644
--- a/libavcodec/lagarithrac.c
+++ b/libavcodec/lagarithrac.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,33 +27,31 @@
  * @author David Conrad
  */
 
-#include "bitstream.h"
+#include "get_bits.h"
 #include "lagarithrac.h"
 
-void ff_lag_rac_init(lag_rac *l, BitstreamContext *bc, int length)
+void ff_lag_rac_init(lag_rac *l, GetBitContext *gb, int length)
 {
     int i, j, left;
 
     /* According to reference decoder "1st byte is garbage",
-     * however, it gets skipped by the call to bitstream_align()
+     * however, it gets skipped by the call to align_get_bits()
      */
-    bitstream_align(bc);
-    left                = bitstream_bits_left(bc) >> 3;
+    align_get_bits(gb);
+    left                = get_bits_left(gb) >> 3;
     l->bytestream_start =
-    l->bytestream       = bc->buffer + bitstream_tell(bc) / 8;
-    l->bytestream_end   = l->bytestream_start + FFMIN(length, left);
+    l->bytestream       = gb->buffer + get_bits_count(gb) / 8;
+    l->bytestream_end   = l->bytestream_start + left;
 
     l->range        = 0x80;
     l->low          = *l->bytestream >> 1;
-    l->hash_shift   = FFMAX(l->scale, 8) - 8;
+    l->hash_shift   = FFMAX(l->scale, 10) - 10;
+    l->overread     = 0;
 
-    for (i = j = 0; i < 256; i++) {
+    for (i = j = 0; i < 1024; i++) {
         unsigned r = i << l->hash_shift;
         while (l->prob[j + 1] <= r)
             j++;
         l->range_hash[i] = j;
     }
-
-    /* Add conversion factor to hash_shift so we don't have to in lag_get_rac. */
-    l->hash_shift += 23;
 }
diff --git a/libavcodec/lagarithrac.h b/libavcodec/lagarithrac.h
index 3b30b15..ee836d0 100644
--- a/libavcodec/lagarithrac.h
+++ b/libavcodec/lagarithrac.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,12 +31,10 @@
 #define AVCODEC_LAGARITHRAC_H
 
 #include <stdint.h>
-
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 
 typedef struct lag_rac {
     AVCodecContext *avctx;
@@ -49,11 +47,14 @@ typedef struct lag_rac {
     const uint8_t *bytestream;        /**< Current position in input bytestream. */
     const uint8_t *bytestream_end;    /**< End position of input bytestream. */
 
+    int overread;
+#define MAX_OVERREAD 4
+
     uint32_t prob[258];         /**< Table of cumulative probability for each symbol. */
-    uint8_t  range_hash[256];   /**< Hash table mapping upper byte to approximate symbol. */
+    uint8_t  range_hash[1024];   /**< Hash table mapping upper byte to approximate symbol. */
 } lag_rac;
 
-void ff_lag_rac_init(lag_rac *l, BitstreamContext *bc, int length);
+void ff_lag_rac_init(lag_rac *l, GetBitContext *gb, int length);
 
 /* TODO: Optimize */
 static inline void lag_rac_refill(lag_rac *l)
@@ -64,6 +65,8 @@ static inline void lag_rac_refill(lag_rac *l)
         l->low |= 0xff & (AV_RB16(l->bytestream) >> 1);
         if (l->bytestream < l->bytestream_end)
             l->bytestream++;
+        else
+            l->overread++;
     }
 }
 
@@ -74,9 +77,8 @@ static inline void lag_rac_refill(lag_rac *l)
  */
 static inline uint8_t lag_get_rac(lag_rac *l)
 {
-    unsigned range_scaled, low_scaled, div;
+    unsigned range_scaled, low_scaled;
     int val;
-    uint8_t shift;
 
     lag_rac_refill(l);
 
@@ -87,18 +89,9 @@ static inline uint8_t lag_get_rac(lag_rac *l)
         if (l->low < range_scaled * l->prob[1]) {
             val = 0;
         } else {
-            /* FIXME __builtin_clz is ~20% faster here, but not allowed in generic code. */
-            shift = 30 - av_log2(range_scaled);
-            div = ((range_scaled << shift) + (1 << 23) - 1) >> 23;
-            /* low>>24 ensures that any cases too big for exact FASTDIV are
-             * under- rather than over-estimated
-             */
-            low_scaled = FASTDIV(l->low - (l->low >> 24), div);
-            shift -= l->hash_shift;
-            shift &= 31;
-            low_scaled = (low_scaled << shift) | (low_scaled >> (32 - shift));
-            /* low_scaled is now a lower bound of low/range_scaled */
-            val = l->range_hash[(uint8_t) low_scaled];
+            low_scaled = l->low / (range_scaled<<(l->hash_shift));
+
+            val = l->range_hash[low_scaled];
             while (l->low >= range_scaled * l->prob[val + 1])
                 val++;
         }
diff --git a/libavcodec/latm_parser.c b/libavcodec/latm_parser.c
index 6fdb897..3820f58 100644
--- a/libavcodec/latm_parser.c
+++ b/libavcodec/latm_parser.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Paul Kendall <paul@kcbbs.gen.nz>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,6 @@ static int latm_find_frame_end(AVCodecParserContext *s1, const uint8_t *buf,
     pic_found = pc->frame_start_found;
     state     = pc->state;
 
-    i = 0;
     if (!pic_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state<<8) | buf[i];
diff --git a/libavcodec/lcl.h b/libavcodec/lcl.h
index 4e7e170..b60c0e9 100644
--- a/libavcodec/lcl.h
+++ b/libavcodec/lcl.h
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lcldec.c b/libavcodec/lcldec.c
index dffc20a..104defa 100644
--- a/libavcodec/lcldec.c
+++ b/libavcodec/lcldec.c
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,10 +41,12 @@
 #include <stdlib.h>
 
 #include "libavutil/mem.h"
+#include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lcl.h"
+#include "thread.h"
 
 #if CONFIG_ZLIB_DECODER
 #include <zlib.h>
@@ -91,7 +93,13 @@ static unsigned int mszh_decomp(const unsigned char * srcptr, int srclen, unsign
             ofs = FFMIN(ofs, destptr - destptr_bak);
             cnt *= 4;
             cnt = FFMIN(cnt, destptr_end - destptr);
-            av_memcpy_backptr(destptr, ofs, cnt);
+            if (ofs) {
+                av_memcpy_backptr(destptr, ofs, cnt);
+            } else {
+                // Not known what the correct behaviour is, but
+                // this at least avoids uninitialized data.
+                memset(destptr, 0, cnt);
+            }
             destptr += cnt;
         }
         maskbit >>= 1;
@@ -128,7 +136,7 @@ static int zlib_decomp(AVCodecContext *avctx, const uint8_t *src, int src_len, i
         av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", zret);
         return AVERROR_UNKNOWN;
     }
-    c->zstream.next_in = src;
+    c->zstream.next_in = (uint8_t *)src;
     c->zstream.avail_in = src_len;
     c->zstream.next_out = c->decomp_buf + offset;
     c->zstream.avail_out = c->decomp_size - offset;
@@ -150,12 +158,13 @@ static int zlib_decomp(AVCodecContext *avctx, const uint8_t *src, int src_len, i
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     AVFrame *frame = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     LclDecContext * const c = avctx->priv_data;
     unsigned int pixel_ptr;
     int row, col;
-    unsigned char *encoded, *outptr;
+    unsigned char *encoded = avpkt->data, *outptr;
     uint8_t *y_out, *u_out, *v_out;
     unsigned int width = avctx->width; // Real image width
     unsigned int height = avctx->height; // Real image height
@@ -164,11 +173,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int uqvq, ret;
     unsigned int mthread_inlen, mthread_outlen;
     unsigned int len = buf_size;
+    int linesize;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
     outptr = frame->data[0]; // Output image pointer
 
@@ -177,8 +185,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case AV_CODEC_ID_MSZH:
         switch (c->compression) {
         case COMP_MSZH:
-            if (c->flags & FLAG_MULTITHREAD) {
+            if (c->imgtype == IMGTYPE_RGB24 && len == FFALIGN(width * 3, 4) * height ||
+                c->imgtype == IMGTYPE_YUV111 && len == width * height * 3) {
+                ;
+            } else if (c->flags & FLAG_MULTITHREAD) {
                 mthread_inlen = AV_RL32(buf);
+                if (len < 8) {
+                    av_log(avctx, AV_LOG_ERROR, "len %d is too small\n", len);
+                    return AVERROR_INVALIDDATA;
+                }
                 mthread_inlen = FFMIN(mthread_inlen, len - 8);
                 mthread_outlen = AV_RL32(buf + 4);
                 mthread_outlen = FFMIN(mthread_outlen, c->decomp_size);
@@ -390,10 +405,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         }
         break;
     case IMGTYPE_RGB24:
+        linesize = len < FFALIGN(3 * width, 4) * height ? 3 * width : FFALIGN(3 * width, 4);
         for (row = height - 1; row >= 0; row--) {
             pixel_ptr = row * frame->linesize[0];
             memcpy(outptr + pixel_ptr, encoded, 3 * width);
-            encoded += 3 * width;
+            encoded += linesize;
         }
         break;
     case IMGTYPE_YUV411:
@@ -444,6 +460,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         return AVERROR_INVALIDDATA;
     }
 
+    frame->key_frame = 1;
+    frame->pict_type = AV_PICTURE_TYPE_I;
+
     *got_frame = 1;
 
     /* always report that the buffer was completely consumed */
@@ -457,6 +476,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     unsigned int max_basesize = FFALIGN(avctx->width,  4) *
                                 FFALIGN(avctx->height, 4);
     unsigned int max_decomp_size;
+    int subsample_h, subsample_v;
 
     if (avctx->extradata_size < 8) {
         av_log(avctx, AV_LOG_ERROR, "Extradata size too small.\n");
@@ -482,6 +502,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         max_decomp_size = max_basesize * 2;
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
         av_log(avctx, AV_LOG_DEBUG, "Image type is YUV 4:2:2.\n");
+        if (avctx->width % 4) {
+            avpriv_request_sample(avctx, "Unsupported dimensions");
+            return AVERROR_INVALIDDATA;
+        }
         break;
     case IMGTYPE_RGB24:
         c->decomp_size = basesize * 3;
@@ -512,6 +536,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &subsample_h, &subsample_v);
+    if (avctx->width % (1<<subsample_h) || avctx->height % (1<<subsample_v)) {
+        avpriv_request_sample(avctx, "Unsupported dimensions");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* Detect compression method */
     c->compression = (int8_t)avctx->extradata[5];
     switch (avctx->codec_id) {
@@ -593,6 +623,13 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    return decode_init(avctx);
+}
+#endif
+
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     LclDecContext * const c = avctx->priv_data;
@@ -614,9 +651,10 @@ AVCodec ff_mszh_decoder = {
     .id             = AV_CODEC_ID_MSZH,
     .priv_data_size = sizeof(LclDecContext),
     .init           = decode_init,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
     .close          = decode_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
 #endif
@@ -629,9 +667,10 @@ AVCodec ff_zlib_decoder = {
     .id             = AV_CODEC_ID_ZLIB,
     .priv_data_size = sizeof(LclDecContext),
     .init           = decode_init,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
     .close          = decode_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
 #endif
diff --git a/libavcodec/lclenc.c b/libavcodec/lclenc.c
index 2e00807..357313d 100644
--- a/libavcodec/lclenc.c
+++ b/libavcodec/lclenc.c
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "lcl.h"
@@ -62,19 +63,15 @@ typedef struct LclEncContext {
 } LclEncContext;
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+                        const AVFrame *p, int *got_packet)
 {
     LclEncContext *c = avctx->priv_data;
-    const AVFrame * const p = pict;
     int i, ret;
     int zret; // Zlib return code
     int max_size = deflateBound(&c->zstream, avctx->width * avctx->height * 3);
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, max_size)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error allocating packet of size %d.\n", max_size);
-            return ret;
-    }
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_size, 0)) < 0)
+        return ret;
 
     if(avctx->pix_fmt != AV_PIX_FMT_BGR24){
         av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
@@ -118,9 +115,9 @@ static av_cold int encode_init(AVCodecContext *avctx)
 
     c->avctx= avctx;
 
-    assert(avctx->width && avctx->height);
+    av_assert0(avctx->width && avctx->height);
 
-    avctx->extradata= av_mallocz(8);
+    avctx->extradata = av_mallocz(8 + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
@@ -131,8 +128,9 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    // Will be user settable someday
-    c->compression = 6;
+    c->compression = avctx->compression_level == FF_COMPRESSION_DEFAULT ?
+                            COMP_ZLIB_NORMAL :
+                            av_clip(avctx->compression_level, 0, 9);
     c->flags = 0;
     c->imgtype = IMGTYPE_RGB24;
     avctx->bits_per_coded_sample= 24;
@@ -178,6 +176,7 @@ AVCodec ff_zlib_encoder = {
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_BGR24, AV_PIX_FMT_NONE },
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
                       FF_CODEC_CAP_INIT_CLEANUP,
diff --git a/libavcodec/libaom.c b/libavcodec/libaom.c
deleted file mode 100644
index bfc25eb..0000000
--- a/libavcodec/libaom.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <aom/aom_image.h>
-
-#include "libaom.h"
-
-#define HIGH_DEPTH(fmt)                      \
-case AOM_IMG_FMT_I ## fmt ## 16:             \
-    switch (depth) {                         \
-    case  8:                                 \
-        return AV_PIX_FMT_YUV ## fmt ## P;   \
-    case 10:                                 \
-        return AV_PIX_FMT_YUV ## fmt ## P10; \
-    case 12:                                 \
-        return AV_PIX_FMT_YUV ## fmt ## P12; \
-    default:                                 \
-        return AV_PIX_FMT_NONE;              \
-    }
-
-enum AVPixelFormat ff_aom_imgfmt_to_pixfmt(aom_img_fmt_t img, int depth)
-{
-    switch (img) {
-    case AOM_IMG_FMT_I420:
-        return AV_PIX_FMT_YUV420P;
-    case AOM_IMG_FMT_I422:
-        return AV_PIX_FMT_YUV422P;
-    case AOM_IMG_FMT_I444:
-        return AV_PIX_FMT_YUV444P;
-    HIGH_DEPTH(420)
-    HIGH_DEPTH(422)
-    HIGH_DEPTH(444)
-    default:
-        return AV_PIX_FMT_NONE;
-    }
-}
-
-#undef HIGH_DEPTH
-
-aom_img_fmt_t ff_aom_pixfmt_to_imgfmt(enum AVPixelFormat pix)
-{
-    switch (pix) {
-    case AV_PIX_FMT_YUV420P:
-        return AOM_IMG_FMT_I420;
-    case AV_PIX_FMT_YUV422P:
-        return AOM_IMG_FMT_I422;
-    case AV_PIX_FMT_YUV444P:
-        return AOM_IMG_FMT_I444;
-    case AV_PIX_FMT_YUV420P10:
-        return AOM_IMG_FMT_I42016;
-    case AV_PIX_FMT_YUV422P10:
-        return AOM_IMG_FMT_I42216;
-    case AV_PIX_FMT_YUV444P10:
-        return AOM_IMG_FMT_I44416;
-    case AV_PIX_FMT_YUV420P12:
-        return AOM_IMG_FMT_I42016;
-    case AV_PIX_FMT_YUV422P12:
-        return AOM_IMG_FMT_I42216;
-    case AV_PIX_FMT_YUV444P12:
-        return AOM_IMG_FMT_I44416;
-    default:
-        return AOM_IMG_FMT_NONE;
-    }
-}
diff --git a/libavcodec/libaomdec.c b/libavcodec/libaomdec.c
index 859c772..a72ac98 100644
--- a/libavcodec/libaomdec.c
+++ b/libavcodec/libaomdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,20 +31,19 @@
 
 #include "avcodec.h"
 #include "internal.h"
-#include "libaom.h"
+#include "profiles.h"
 
 typedef struct AV1DecodeContext {
     struct aom_codec_ctx decoder;
 } AV1DecodeContext;
 
-static av_cold int aom_init(AVCodecContext *avctx)
+static av_cold int aom_init(AVCodecContext *avctx,
+                            const struct aom_codec_iface *iface)
 {
     AV1DecodeContext *ctx           = avctx->priv_data;
     struct aom_codec_dec_cfg deccfg = {
-        /* token partitions+1 would be a decent choice */
-        .threads = FFMIN(avctx->thread_count, 16)
+        .threads = FFMIN(avctx->thread_count ? avctx->thread_count : av_cpu_count(), 16)
     };
-    const struct aom_codec_iface *iface = &aom_codec_av1_dx_algo;
 
     av_log(avctx, AV_LOG_INFO, "%s\n", aom_codec_version_str());
     av_log(avctx, AV_LOG_VERBOSE, "%s\n", aom_codec_build_config());
@@ -61,9 +60,10 @@ static av_cold int aom_init(AVCodecContext *avctx)
 
 static void image_copy_16_to_8(AVFrame *pic, struct aom_image *img)
 {
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pic->format);
     int i;
 
-    for (i = 0; i < 3; i++) {
+    for (i = 0; i < desc->nb_components; i++) {
         int w = img->d_w;
         int h = img->d_h;
         int x, y;
@@ -82,6 +82,77 @@ static void image_copy_16_to_8(AVFrame *pic, struct aom_image *img)
     }
 }
 
+// returns 0 on success, AVERROR_INVALIDDATA otherwise
+static int set_pix_fmt(AVCodecContext *avctx, struct aom_image *img)
+{
+    static const enum AVColorRange color_ranges[] = {
+        AVCOL_RANGE_MPEG, AVCOL_RANGE_JPEG
+    };
+    avctx->color_range = color_ranges[img->range];
+    avctx->color_primaries = img->cp;
+    avctx->colorspace  = img->mc;
+    avctx->color_trc   = img->tc;
+
+    switch (img->fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_I42016:
+        if (img->bit_depth == 8) {
+            avctx->pix_fmt = img->monochrome ?
+                             AV_PIX_FMT_GRAY8 : AV_PIX_FMT_YUV420P;
+            avctx->profile = FF_PROFILE_AV1_MAIN;
+            return 0;
+        } else if (img->bit_depth == 10) {
+            avctx->pix_fmt = img->monochrome ?
+                             AV_PIX_FMT_GRAY10 : AV_PIX_FMT_YUV420P10;
+            avctx->profile = FF_PROFILE_AV1_MAIN;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = img->monochrome ?
+                             AV_PIX_FMT_GRAY12 : AV_PIX_FMT_YUV420P12;
+            avctx->profile = FF_PROFILE_AV1_PROFESSIONAL;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42216:
+        if (img->bit_depth == 8) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            avctx->profile = FF_PROFILE_AV1_PROFESSIONAL;
+            return 0;
+        } else if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            avctx->profile = FF_PROFILE_AV1_PROFESSIONAL;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+            avctx->profile = FF_PROFILE_AV1_PROFESSIONAL;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case AOM_IMG_FMT_I444:
+    case AOM_IMG_FMT_I44416:
+        if (img->bit_depth == 8) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            avctx->profile = FF_PROFILE_AV1_HIGH;
+            return 0;
+        } else if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+            avctx->profile = FF_PROFILE_AV1_HIGH;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P12;
+            avctx->profile = FF_PROFILE_AV1_PROFESSIONAL;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+}
 
 static int aom_decode(AVCodecContext *avctx, void *data, int *got_frame,
                       AVPacket *avpkt)
@@ -105,11 +176,16 @@ static int aom_decode(AVCodecContext *avctx, void *data, int *got_frame,
     }
 
     if ((img = aom_codec_get_frame(&ctx->decoder, &iter))) {
-        avctx->pix_fmt = ff_aom_imgfmt_to_pixfmt(img->fmt, img->bit_depth);
-        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
-            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (0x%02x %dbits)\n",
+        if (img->d_w > img->w || img->d_h > img->h) {
+            av_log(avctx, AV_LOG_ERROR, "Display dimensions %dx%d exceed storage %dx%d\n",
+                   img->d_w, img->d_h, img->w, img->h);
+            return AVERROR_EXTERNAL;
+        }
+
+        if ((ret = set_pix_fmt(avctx, img)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d) / bit_depth (%d)\n",
                    img->fmt, img->bit_depth);
-            return AVERROR_INVALIDDATA;
+            return ret;
         }
 
         if ((int)img->d_w != avctx->width || (int)img->d_h != avctx->height) {
@@ -126,14 +202,6 @@ static int aom_decode(AVCodecContext *avctx, void *data, int *got_frame,
         else
             av_image_copy(picture->data, picture->linesize, (const uint8_t **)img->planes,
                           img->stride, avctx->pix_fmt, img->d_w, img->d_h);
-        switch (img->range) {
-        case AOM_CR_STUDIO_RANGE:
-            picture->color_range = AVCOL_RANGE_MPEG;
-            break;
-        case AOM_CR_FULL_RANGE:
-            picture->color_range = AVCOL_RANGE_JPEG;
-            break;
-        }
         *got_frame = 1;
     }
     return avpkt->size;
@@ -146,15 +214,21 @@ static av_cold int aom_free(AVCodecContext *avctx)
     return 0;
 }
 
+static av_cold int av1_init(AVCodecContext *avctx)
+{
+    return aom_init(avctx, &aom_codec_av1_dx_algo);
+}
+
 AVCodec ff_libaom_av1_decoder = {
     .name           = "libaom-av1",
     .long_name      = NULL_IF_CONFIG_SMALL("libaom AV1"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_AV1,
     .priv_data_size = sizeof(AV1DecodeContext),
-    .init           = aom_init,
+    .init           = av1_init,
     .close          = aom_free,
     .decode         = aom_decode,
     .capabilities   = AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_DR1,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
     .wrapper_name   = "libaom",
 };
diff --git a/libavcodec/libaomenc.c b/libavcodec/libaomenc.c
index 3219bd3..faec61c 100644
--- a/libavcodec/libaomenc.c
+++ b/libavcodec/libaomenc.c
@@ -1,54 +1,65 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * AV1 encoder support via libaom
+ */
+
 #define AOM_DISABLE_CTRL_TYPECHECKS 1
 #include <aom/aom_encoder.h>
 #include <aom/aomcx.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/base64.h"
 #include "libavutil/common.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 
+#include "av1.h"
 #include "avcodec.h"
 #include "internal.h"
-#include "libaom.h"
+#include "profiles.h"
 
 /*
  * Portion of struct aom_codec_cx_pkt from aom_encoder.h.
  * One encoded frame returned from the library.
  */
 struct FrameListData {
-    void *buf;                       /* compressed data buffer */
-    size_t sz;                       /* length of compressed data */
-    int64_t pts;                     /* time stamp to show frame
-                                      * (in timebase units) */
-    unsigned long duration;          /* duration to show frame
-                                      * (in timebase units) */
-    uint32_t flags;                  /* flags for this frame */
+    void *buf;                       /**< compressed data buffer */
+    size_t sz;                       /**< length of compressed data */
+    int64_t pts;                     /**< time stamp to show frame
+                                          (in timebase units) */
+    unsigned long duration;          /**< duration to show frame
+                                          (in timebase units) */
+    uint32_t flags;                  /**< flags for this frame */
+    uint64_t sse[4];
+    int have_sse;                    /**< true if we have pending sse[] */
+    uint64_t frame_number;
     struct FrameListData *next;
 };
 
 typedef struct AOMEncoderContext {
     AVClass *class;
+    AVBSFContext *bsf;
     struct aom_codec_ctx encoder;
     struct aom_image rawimg;
     struct aom_fixed_buf twopass_stats;
@@ -60,7 +71,14 @@ typedef struct AOMEncoderContext {
     int crf;
     int static_thresh;
     int drop_threshold;
-    int noise_sensitivity;
+    uint64_t sse[4];
+    int have_sse; /**< true if we have pending sse[] */
+    uint64_t frame_number;
+    int tile_cols, tile_rows;
+    int tile_cols_log2, tile_rows_log2;
+    aom_superblock_size_t superblock_size;
+    int uniform_tiles;
+    int row_mt;
 } AOMContext;
 
 static const char *const ctlidstr[] = {
@@ -68,8 +86,16 @@ static const char *const ctlidstr[] = {
     [AOME_SET_CQ_LEVEL]         = "AOME_SET_CQ_LEVEL",
     [AOME_SET_ENABLEAUTOALTREF] = "AOME_SET_ENABLEAUTOALTREF",
     [AOME_SET_STATIC_THRESHOLD] = "AOME_SET_STATIC_THRESHOLD",
-    [AV1E_SET_CHROMA_SUBSAMPLING_X] = "AV1E_SET_CHROMA_SUBSAMPLING_X",
-    [AV1E_SET_CHROMA_SUBSAMPLING_Y] = "AV1E_SET_CHROMA_SUBSAMPLING_Y",
+    [AV1E_SET_COLOR_RANGE]      = "AV1E_SET_COLOR_RANGE",
+    [AV1E_SET_COLOR_PRIMARIES]  = "AV1E_SET_COLOR_PRIMARIES",
+    [AV1E_SET_MATRIX_COEFFICIENTS] = "AV1E_SET_MATRIX_COEFFICIENTS",
+    [AV1E_SET_TRANSFER_CHARACTERISTICS] = "AV1E_SET_TRANSFER_CHARACTERISTICS",
+    [AV1E_SET_SUPERBLOCK_SIZE]  = "AV1E_SET_SUPERBLOCK_SIZE",
+    [AV1E_SET_TILE_COLUMNS]     = "AV1E_SET_TILE_COLUMNS",
+    [AV1E_SET_TILE_ROWS]        = "AV1E_SET_TILE_ROWS",
+#ifdef AOM_CTRL_AV1E_SET_ROW_MT
+    [AV1E_SET_ROW_MT]           = "AV1E_SET_ROW_MT",
+#endif
 };
 
 static av_cold void log_encoder_error(AVCodecContext *avctx, const char *desc)
@@ -92,22 +118,25 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
     av_log(avctx, level, "aom_codec_enc_cfg\n");
     av_log(avctx, level, "generic settings\n"
                          "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
+                         "  %*s%u\n  %*s%u\n"
                          "  %*s{%u/%u}\n  %*s%u\n  %*s%d\n  %*s%u\n",
-           width, "g_usage:", cfg->g_usage,
-           width, "g_threads:", cfg->g_threads,
-           width, "g_profile:", cfg->g_profile,
-           width, "g_w:", cfg->g_w,
-           width, "g_h:", cfg->g_h,
-           width, "g_timebase:", cfg->g_timebase.num, cfg->g_timebase.den,
+           width, "g_usage:",           cfg->g_usage,
+           width, "g_threads:",         cfg->g_threads,
+           width, "g_profile:",         cfg->g_profile,
+           width, "g_w:",               cfg->g_w,
+           width, "g_h:",               cfg->g_h,
+           width, "g_bit_depth:",       cfg->g_bit_depth,
+           width, "g_input_bit_depth:", cfg->g_input_bit_depth,
+           width, "g_timebase:",        cfg->g_timebase.num, cfg->g_timebase.den,
            width, "g_error_resilient:", cfg->g_error_resilient,
-           width, "g_pass:", cfg->g_pass,
-           width, "g_lag_in_frames:", cfg->g_lag_in_frames);
+           width, "g_pass:",            cfg->g_pass,
+           width, "g_lag_in_frames:",   cfg->g_lag_in_frames);
     av_log(avctx, level, "rate control settings\n"
-                         "  %*s%u\n  %*s%d\n  %*s%p(%zu)\n  %*s%u\n",
+                         "  %*s%u\n  %*s%d\n  %*s%p(%"SIZE_SPECIFIER")\n  %*s%u\n",
            width, "rc_dropframe_thresh:", cfg->rc_dropframe_thresh,
-           width, "rc_end_usage:", cfg->rc_end_usage,
+           width, "rc_end_usage:",        cfg->rc_end_usage,
            width, "rc_twopass_stats_in:", cfg->rc_twopass_stats_in.buf, cfg->rc_twopass_stats_in.sz,
-           width, "rc_target_bitrate:", cfg->rc_target_bitrate);
+           width, "rc_target_bitrate:",   cfg->rc_target_bitrate);
     av_log(avctx, level, "quantizer settings\n"
                          "  %*s%u\n  %*s%u\n",
            width, "rc_min_quantizer:", cfg->rc_min_quantizer,
@@ -115,22 +144,26 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
     av_log(avctx, level, "bitrate tolerance\n"
                          "  %*s%u\n  %*s%u\n",
            width, "rc_undershoot_pct:", cfg->rc_undershoot_pct,
-           width, "rc_overshoot_pct:", cfg->rc_overshoot_pct);
+           width, "rc_overshoot_pct:",  cfg->rc_overshoot_pct);
     av_log(avctx, level, "decoder buffer model\n"
                          "  %*s%u\n  %*s%u\n  %*s%u\n",
-           width, "rc_buf_sz:", cfg->rc_buf_sz,
+           width, "rc_buf_sz:",         cfg->rc_buf_sz,
            width, "rc_buf_initial_sz:", cfg->rc_buf_initial_sz,
            width, "rc_buf_optimal_sz:", cfg->rc_buf_optimal_sz);
     av_log(avctx, level, "2 pass rate control settings\n"
                          "  %*s%u\n  %*s%u\n  %*s%u\n",
-           width, "rc_2pass_vbr_bias_pct:", cfg->rc_2pass_vbr_bias_pct,
+           width, "rc_2pass_vbr_bias_pct:",       cfg->rc_2pass_vbr_bias_pct,
            width, "rc_2pass_vbr_minsection_pct:", cfg->rc_2pass_vbr_minsection_pct,
            width, "rc_2pass_vbr_maxsection_pct:", cfg->rc_2pass_vbr_maxsection_pct);
     av_log(avctx, level, "keyframing settings\n"
                          "  %*s%d\n  %*s%u\n  %*s%u\n",
-           width, "kf_mode:", cfg->kf_mode,
+           width, "kf_mode:",     cfg->kf_mode,
            width, "kf_min_dist:", cfg->kf_min_dist,
            width, "kf_max_dist:", cfg->kf_max_dist);
+    av_log(avctx, level, "tile settings\n"
+                         "  %*s%d\n  %*s%d\n",
+           width, "tile_width_count:",  cfg->tile_width_count,
+           width, "tile_height_count:", cfg->tile_height_count);
     av_log(avctx, level, "\n");
 }
 
@@ -162,7 +195,12 @@ static av_cold void free_frame_list(struct FrameListData *list)
 }
 
 static av_cold int codecctl_int(AVCodecContext *avctx,
-                                enum aome_enc_control_id id, int val)
+#ifdef UENUM1BYTE
+                                aome_enc_control_id id,
+#else
+                                enum aome_enc_control_id id,
+#endif
+                                int val)
 {
     AOMContext *ctx = avctx->priv_data;
     char buf[80];
@@ -191,16 +229,265 @@ static av_cold int aom_free(AVCodecContext *avctx)
     av_freep(&ctx->twopass_stats.buf);
     av_freep(&avctx->stats_out);
     free_frame_list(ctx->coded_frame_list);
+    av_bsf_free(&ctx->bsf);
     return 0;
 }
 
-static av_cold int aom_init(AVCodecContext *avctx)
+static int set_pix_fmt(AVCodecContext *avctx, aom_codec_caps_t codec_caps,
+                       struct aom_codec_enc_cfg *enccfg, aom_codec_flags_t *flags,
+                       aom_img_fmt_t *img_fmt)
+{
+    AOMContext av_unused *ctx = avctx->priv_data;
+    enccfg->g_bit_depth = enccfg->g_input_bit_depth = 8;
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        enccfg->g_profile = FF_PROFILE_AV1_MAIN;
+        *img_fmt = AOM_IMG_FMT_I420;
+        return 0;
+    case AV_PIX_FMT_YUV422P:
+        enccfg->g_profile = FF_PROFILE_AV1_PROFESSIONAL;
+        *img_fmt = AOM_IMG_FMT_I422;
+        return 0;
+    case AV_PIX_FMT_YUV444P:
+        enccfg->g_profile = FF_PROFILE_AV1_HIGH;
+        *img_fmt = AOM_IMG_FMT_I444;
+        return 0;
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
+        if (codec_caps & AOM_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV420P10 ? 10 : 12;
+            enccfg->g_profile =
+                enccfg->g_bit_depth == 10 ? FF_PROFILE_AV1_MAIN : FF_PROFILE_AV1_PROFESSIONAL;
+            *img_fmt = AOM_IMG_FMT_I42016;
+            *flags |= AOM_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
+        if (codec_caps & AOM_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV422P10 ? 10 : 12;
+            enccfg->g_profile = FF_PROFILE_AV1_PROFESSIONAL;
+            *img_fmt = AOM_IMG_FMT_I42216;
+            *flags |= AOM_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+        if (codec_caps & AOM_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV444P10 ? 10 : 12;
+            enccfg->g_profile =
+                enccfg->g_bit_depth == 10 ? FF_PROFILE_AV1_HIGH : FF_PROFILE_AV1_PROFESSIONAL;
+            *img_fmt = AOM_IMG_FMT_I44416;
+            *flags |= AOM_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    default:
+        break;
+    }
+    av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format.\n");
+    return AVERROR_INVALIDDATA;
+}
+
+static void set_color_range(AVCodecContext *avctx)
+{
+    aom_color_range_t aom_cr;
+    switch (avctx->color_range) {
+    case AVCOL_RANGE_UNSPECIFIED:
+    case AVCOL_RANGE_MPEG:       aom_cr = AOM_CR_STUDIO_RANGE; break;
+    case AVCOL_RANGE_JPEG:       aom_cr = AOM_CR_FULL_RANGE;   break;
+    default:
+        av_log(avctx, AV_LOG_WARNING, "Unsupported color range (%d)\n",
+               avctx->color_range);
+        return;
+    }
+
+    codecctl_int(avctx, AV1E_SET_COLOR_RANGE, aom_cr);
+}
+
+static int count_uniform_tiling(int dim, int sb_size, int tiles_log2)
+{
+    int sb_dim   = (dim + sb_size - 1) / sb_size;
+    int tile_dim = (sb_dim + (1 << tiles_log2) - 1) >> tiles_log2;
+    av_assert0(tile_dim > 0);
+    return (sb_dim + tile_dim - 1) / tile_dim;
+}
+
+static int choose_tiling(AVCodecContext *avctx,
+                         struct aom_codec_enc_cfg *enccfg)
+{
+    AOMContext *ctx = avctx->priv_data;
+    int sb_128x128_possible, sb_size, sb_width, sb_height;
+    int uniform_rows, uniform_cols;
+    int uniform_64x64_possible, uniform_128x128_possible;
+    int tile_size, rounding, i;
+
+    if (ctx->tile_cols_log2 >= 0)
+        ctx->tile_cols = 1 << ctx->tile_cols_log2;
+    if (ctx->tile_rows_log2 >= 0)
+        ctx->tile_rows = 1 << ctx->tile_rows_log2;
+
+    if (ctx->tile_cols == 0) {
+        ctx->tile_cols = (avctx->width + AV1_MAX_TILE_WIDTH - 1) /
+            AV1_MAX_TILE_WIDTH;
+        if (ctx->tile_cols > 1) {
+            av_log(avctx, AV_LOG_DEBUG, "Automatically using %d tile "
+                   "columns to fill width.\n", ctx->tile_cols);
+        }
+    }
+    av_assert0(ctx->tile_cols > 0);
+    if (ctx->tile_rows == 0) {
+        int max_tile_width =
+            FFALIGN((FFALIGN(avctx->width, 128) +
+                     ctx->tile_cols - 1) / ctx->tile_cols, 128);
+        ctx->tile_rows =
+            (max_tile_width * FFALIGN(avctx->height, 128) +
+             AV1_MAX_TILE_AREA - 1) / AV1_MAX_TILE_AREA;
+        if (ctx->tile_rows > 1) {
+            av_log(avctx, AV_LOG_DEBUG, "Automatically using %d tile "
+                   "rows to fill area.\n", ctx->tile_rows);
+        }
+    }
+    av_assert0(ctx->tile_rows > 0);
+
+    if ((avctx->width  + 63) / 64 < ctx->tile_cols ||
+        (avctx->height + 63) / 64 < ctx->tile_rows) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid tile sizing: frame not "
+               "large enough to fit specified tile arrangement.\n");
+        return AVERROR(EINVAL);
+    }
+    if (ctx->tile_cols > AV1_MAX_TILE_COLS ||
+        ctx->tile_rows > AV1_MAX_TILE_ROWS) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid tile sizing: AV1 does "
+               "not allow more than %dx%d tiles.\n",
+               AV1_MAX_TILE_COLS, AV1_MAX_TILE_ROWS);
+        return AVERROR(EINVAL);
+    }
+    if (avctx->width / ctx->tile_cols > AV1_MAX_TILE_WIDTH) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid tile sizing: AV1 does "
+               "not allow tiles of width greater than %d.\n",
+               AV1_MAX_TILE_WIDTH);
+        return AVERROR(EINVAL);
+    }
+
+    ctx->superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
+
+    if (ctx->tile_cols == 1 && ctx->tile_rows == 1) {
+        av_log(avctx, AV_LOG_DEBUG, "Using a single tile.\n");
+        return 0;
+    }
+
+    sb_128x128_possible =
+        (avctx->width  + 127) / 128 >= ctx->tile_cols &&
+        (avctx->height + 127) / 128 >= ctx->tile_rows;
+
+    ctx->tile_cols_log2 = ctx->tile_cols == 1 ? 0 :
+        av_log2(ctx->tile_cols - 1) + 1;
+    ctx->tile_rows_log2 = ctx->tile_rows == 1 ? 0 :
+        av_log2(ctx->tile_rows - 1) + 1;
+
+    uniform_cols = count_uniform_tiling(avctx->width,
+                                        64, ctx->tile_cols_log2);
+    uniform_rows = count_uniform_tiling(avctx->height,
+                                        64, ctx->tile_rows_log2);
+    av_log(avctx, AV_LOG_DEBUG, "Uniform with 64x64 superblocks "
+           "-> %dx%d tiles.\n", uniform_cols, uniform_rows);
+    uniform_64x64_possible = uniform_cols == ctx->tile_cols &&
+                             uniform_rows == ctx->tile_rows;
+
+    if (sb_128x128_possible) {
+        uniform_cols = count_uniform_tiling(avctx->width,
+                                            128, ctx->tile_cols_log2);
+        uniform_rows = count_uniform_tiling(avctx->height,
+                                            128, ctx->tile_rows_log2);
+        av_log(avctx, AV_LOG_DEBUG, "Uniform with 128x128 superblocks "
+               "-> %dx%d tiles.\n", uniform_cols, uniform_rows);
+        uniform_128x128_possible = uniform_cols == ctx->tile_cols &&
+                                   uniform_rows == ctx->tile_rows;
+    } else {
+        av_log(avctx, AV_LOG_DEBUG, "128x128 superblocks not possible.\n");
+        uniform_128x128_possible = 0;
+    }
+
+    ctx->uniform_tiles = 1;
+    if (uniform_64x64_possible && uniform_128x128_possible) {
+        av_log(avctx, AV_LOG_DEBUG, "Using uniform tiling with dynamic "
+               "superblocks (tile_cols_log2 = %d, tile_rows_log2 = %d).\n",
+               ctx->tile_cols_log2, ctx->tile_rows_log2);
+        return 0;
+    }
+    if (uniform_64x64_possible && !sb_128x128_possible) {
+        av_log(avctx, AV_LOG_DEBUG, "Using uniform tiling with 64x64 "
+               "superblocks (tile_cols_log2 = %d, tile_rows_log2 = %d).\n",
+               ctx->tile_cols_log2, ctx->tile_rows_log2);
+        ctx->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+        return 0;
+    }
+    if (uniform_128x128_possible) {
+        av_log(avctx, AV_LOG_DEBUG, "Using uniform tiling with 128x128 "
+               "superblocks (tile_cols_log2 = %d, tile_rows_log2 = %d).\n",
+               ctx->tile_cols_log2, ctx->tile_rows_log2);
+        ctx->superblock_size = AOM_SUPERBLOCK_SIZE_128X128;
+        return 0;
+    }
+    ctx->uniform_tiles = 0;
+
+    if (sb_128x128_possible) {
+        sb_size = 128;
+        ctx->superblock_size = AOM_SUPERBLOCK_SIZE_128X128;
+    } else {
+        sb_size = 64;
+        ctx->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+    }
+    av_log(avctx, AV_LOG_DEBUG, "Using fixed tiling with %dx%d "
+           "superblocks (tile_cols = %d, tile_rows = %d).\n",
+           sb_size, sb_size, ctx->tile_cols, ctx->tile_rows);
+
+    enccfg->tile_width_count  = ctx->tile_cols;
+    enccfg->tile_height_count = ctx->tile_rows;
+
+    sb_width  = (avctx->width  + sb_size - 1) / sb_size;
+    sb_height = (avctx->height + sb_size - 1) / sb_size;
+
+    tile_size = sb_width / ctx->tile_cols;
+    rounding  = sb_width % ctx->tile_cols;
+    for (i = 0; i < ctx->tile_cols; i++) {
+        enccfg->tile_widths[i] = tile_size +
+            (i < rounding / 2 ||
+             i > ctx->tile_cols - 1 - (rounding + 1) / 2);
+    }
+
+    tile_size = sb_height / ctx->tile_rows;
+    rounding  = sb_height % ctx->tile_rows;
+    for (i = 0; i < ctx->tile_rows; i++) {
+        enccfg->tile_heights[i] = tile_size +
+            (i < rounding / 2 ||
+             i > ctx->tile_rows - 1 - (rounding + 1) / 2);
+    }
+
+    return 0;
+}
+
+static av_cold int aom_init(AVCodecContext *avctx,
+                            const struct aom_codec_iface *iface)
 {
     AOMContext *ctx = avctx->priv_data;
     struct aom_codec_enc_cfg enccfg = { 0 };
+#ifdef AOM_FRAME_IS_INTRAONLY
+    aom_codec_flags_t flags =
+        (avctx->flags & AV_CODEC_FLAG_PSNR) ? AOM_CODEC_USE_PSNR : 0;
+#else
+    aom_codec_flags_t flags = 0;
+#endif
     AVCPBProperties *cpb_props;
-    int res, h_shift, v_shift;
-    const struct aom_codec_iface *iface = &aom_codec_av1_cx_algo;
+    int res;
+    aom_img_fmt_t img_fmt;
+    aom_codec_caps_t codec_caps = aom_codec_get_caps(iface);
 
     av_log(avctx, AV_LOG_INFO, "%s\n", aom_codec_version_str());
     av_log(avctx, AV_LOG_VERBOSE, "%s\n", aom_codec_build_config());
@@ -210,13 +497,24 @@ static av_cold int aom_init(AVCodecContext *avctx)
                aom_codec_err_to_string(res));
         return AVERROR(EINVAL);
     }
+
+    if (set_pix_fmt(avctx, codec_caps, &enccfg, &flags, &img_fmt))
+        return AVERROR(EINVAL);
+
+    if(!avctx->bit_rate)
+        if(avctx->rc_max_rate || avctx->rc_buffer_size || avctx->rc_initial_buffer_occupancy) {
+            av_log( avctx, AV_LOG_ERROR, "Rate control parameters set without a bitrate\n");
+            return AVERROR(EINVAL);
+        }
+
     dump_enc_cfg(avctx, &enccfg);
 
     enccfg.g_w            = avctx->width;
     enccfg.g_h            = avctx->height;
     enccfg.g_timebase.num = avctx->time_base.num;
     enccfg.g_timebase.den = avctx->time_base.den;
-    enccfg.g_threads      = avctx->thread_count;
+    enccfg.g_threads      =
+        FFMIN(avctx->thread_count ? avctx->thread_count : av_cpu_count(), 64);
 
     if (ctx->lag_in_frames >= 0)
         enccfg.g_lag_in_frames = ctx->lag_in_frames;
@@ -228,29 +526,50 @@ static av_cold int aom_init(AVCodecContext *avctx)
     else
         enccfg.g_pass = AOM_RC_ONE_PASS;
 
-    if (!avctx->bit_rate)
-        avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
-    else
+    if (avctx->rc_min_rate == avctx->rc_max_rate &&
+        avctx->rc_min_rate == avctx->bit_rate && avctx->bit_rate) {
+        enccfg.rc_end_usage = AOM_CBR;
+    } else if (ctx->crf >= 0) {
+        enccfg.rc_end_usage = AOM_CQ;
+        if (!avctx->bit_rate)
+            enccfg.rc_end_usage = AOM_Q;
+    }
+
+    if (avctx->bit_rate) {
         enccfg.rc_target_bitrate = av_rescale_rnd(avctx->bit_rate, 1, 1000,
                                                   AV_ROUND_NEAR_INF);
+    } else if (enccfg.rc_end_usage != AOM_Q) {
+        if (enccfg.rc_end_usage == AOM_CQ) {
+            enccfg.rc_target_bitrate = 1000000;
+        } else {
+            avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
+            av_log(avctx, AV_LOG_WARNING,
+                   "Neither bitrate nor constrained quality specified, using default bitrate of %dkbit/sec\n",
+                   enccfg.rc_target_bitrate);
+        }
+    }
 
-    if (ctx->crf)
-        enccfg.rc_end_usage = AOM_CQ;
-    else if (avctx->rc_min_rate == avctx->rc_max_rate &&
-             avctx->rc_min_rate == avctx->bit_rate)
-        enccfg.rc_end_usage = AOM_CBR;
-
-    if (avctx->qmin > 0)
+    if (avctx->qmin >= 0)
         enccfg.rc_min_quantizer = avctx->qmin;
-    if (avctx->qmax > 0)
+    if (avctx->qmax >= 0)
         enccfg.rc_max_quantizer = avctx->qmax;
 
+    if (enccfg.rc_end_usage == AOM_CQ || enccfg.rc_end_usage == AOM_Q) {
+        if (ctx->crf < enccfg.rc_min_quantizer || ctx->crf > enccfg.rc_max_quantizer) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "CQ level %d must be between minimum and maximum quantizer value (%d-%d)\n",
+                   ctx->crf, enccfg.rc_min_quantizer, enccfg.rc_max_quantizer);
+            return AVERROR(EINVAL);
+        }
+    }
+
     enccfg.rc_dropframe_thresh = ctx->drop_threshold;
 
     // 0-100 (0 => CBR, 100 => VBR)
     enccfg.rc_2pass_vbr_bias_pct       = round(avctx->qcompress * 100);
-    enccfg.rc_2pass_vbr_minsection_pct =
-        avctx->rc_min_rate * 100LL / avctx->bit_rate;
+    if (avctx->bit_rate)
+        enccfg.rc_2pass_vbr_minsection_pct =
+            avctx->rc_min_rate * 100LL / avctx->bit_rate;
     if (avctx->rc_max_rate)
         enccfg.rc_2pass_vbr_maxsection_pct =
             avctx->rc_max_rate * 100LL / avctx->bit_rate;
@@ -283,8 +602,9 @@ static av_cold int aom_init(AVCodecContext *avctx)
         ret                   = av_reallocp(&ctx->twopass_stats.buf, ctx->twopass_stats.sz);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "Stat buffer alloc (%zu bytes) failed\n",
+                   "Stat buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                    ctx->twopass_stats.sz);
+            ctx->twopass_stats.sz = 0;
             return ret;
         }
         decode_size = av_base64_decode(ctx->twopass_stats.buf, avctx->stats_in,
@@ -303,22 +623,16 @@ static av_cold int aom_init(AVCodecContext *avctx)
      * quality. */
     if (avctx->profile != FF_PROFILE_UNKNOWN)
         enccfg.g_profile = avctx->profile;
-    else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-             avctx->pix_fmt == AV_PIX_FMT_YUV420P10)
-        avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_MAIN;
-    else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P ||
-             avctx->pix_fmt == AV_PIX_FMT_YUV444P10)
-        avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_HIGH;
-    else {
-        avctx->profile = enccfg.g_profile = FF_PROFILE_AV1_PROFESSIONAL;
-    }
-
 
     enccfg.g_error_resilient = ctx->error_resilient;
 
+    res = choose_tiling(avctx, &enccfg);
+    if (res < 0)
+        return res;
+
     dump_enc_cfg(avctx, &enccfg);
     /* Construct Encoder Context */
-    res = aom_codec_enc_init(&ctx->encoder, iface, &enccfg, 0);
+    res = aom_codec_enc_init(&ctx->encoder, iface, &enccfg, flags);
     if (res != AOM_CODEC_OK) {
         log_encoder_error(avctx, "Failed to initialize encoder");
         return AVERROR(EINVAL);
@@ -326,50 +640,60 @@ static av_cold int aom_init(AVCodecContext *avctx)
 
     // codec control failures are currently treated only as warnings
     av_log(avctx, AV_LOG_DEBUG, "aom_codec_control\n");
-    if (ctx->cpu_used != INT_MIN)
-        codecctl_int(avctx, AOME_SET_CPUUSED, ctx->cpu_used);
+    codecctl_int(avctx, AOME_SET_CPUUSED, ctx->cpu_used);
     if (ctx->auto_alt_ref >= 0)
         codecctl_int(avctx, AOME_SET_ENABLEAUTOALTREF, ctx->auto_alt_ref);
 
     codecctl_int(avctx, AOME_SET_STATIC_THRESHOLD, ctx->static_thresh);
-    codecctl_int(avctx, AOME_SET_CQ_LEVEL, ctx->crf);
-
-    res = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &h_shift, &v_shift);
-    if (res < 0)
-        return res;
+    if (ctx->crf >= 0)
+        codecctl_int(avctx, AOME_SET_CQ_LEVEL,          ctx->crf);
+
+    codecctl_int(avctx, AV1E_SET_COLOR_PRIMARIES, avctx->color_primaries);
+    codecctl_int(avctx, AV1E_SET_MATRIX_COEFFICIENTS, avctx->colorspace);
+    codecctl_int(avctx, AV1E_SET_TRANSFER_CHARACTERISTICS, avctx->color_trc);
+    set_color_range(avctx);
+
+    codecctl_int(avctx, AV1E_SET_SUPERBLOCK_SIZE, ctx->superblock_size);
+    if (ctx->uniform_tiles) {
+        codecctl_int(avctx, AV1E_SET_TILE_COLUMNS, ctx->tile_cols_log2);
+        codecctl_int(avctx, AV1E_SET_TILE_ROWS,    ctx->tile_rows_log2);
+    }
 
-    codecctl_int(avctx, AV1E_SET_CHROMA_SUBSAMPLING_X, h_shift);
-    codecctl_int(avctx, AV1E_SET_CHROMA_SUBSAMPLING_Y, v_shift);
+#ifdef AOM_CTRL_AV1E_SET_ROW_MT
+    codecctl_int(avctx, AV1E_SET_ROW_MT, ctx->row_mt);
+#endif
 
     // provide dummy value to initialize wrapper, values will be updated each _encode()
-    aom_img_wrap(&ctx->rawimg, ff_aom_pixfmt_to_imgfmt(avctx->pix_fmt),
-                 avctx->width, avctx->height, 1, (unsigned char *)1);
+    aom_img_wrap(&ctx->rawimg, img_fmt, avctx->width, avctx->height, 1,
+                 (unsigned char*)1);
+
+    if (codec_caps & AOM_CODEC_CAP_HIGHBITDEPTH)
+        ctx->rawimg.bit_depth = enccfg.g_bit_depth;
 
     cpb_props = ff_add_cpb_side_data(avctx);
     if (!cpb_props)
         return AVERROR(ENOMEM);
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
-        aom_fixed_buf_t *seq = aom_codec_get_global_headers(&ctx->encoder);
-        if (!seq)
-            return AVERROR_UNKNOWN;
-
-        avctx->extradata = av_malloc(seq->sz + AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata) {
-            free(seq->buf);
-            free(seq);
-            return AVERROR(ENOMEM);
+        const AVBitStreamFilter *filter = av_bsf_get_by_name("extract_extradata");
+        int ret;
+
+        if (!filter) {
+            av_log(avctx, AV_LOG_ERROR, "extract_extradata bitstream filter "
+                   "not found. This is a bug, please report it.\n");
+            return AVERROR_BUG;
         }
-        avctx->extradata_size = seq->sz;
-        memcpy(avctx->extradata, seq->buf, seq->sz);
-        memset(avctx->extradata + seq->sz, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        ret = av_bsf_alloc(filter, &ctx->bsf);
+        if (ret < 0)
+            return ret;
+
+        ret = avcodec_parameters_from_context(ctx->bsf->par_in, avctx);
+        if (ret < 0)
+           return ret;
 
-        /* Doxy says: "The caller owns the memory associated with this buffer.
-         *             Memory is allocated using malloc(), and should be freed
-         *             via call to free()"
-         */
-        free(seq->buf);
-        free(seq);
+        ret = av_bsf_init(ctx->bsf);
+        if (ret < 0)
+           return ret;
     }
 
     if (enccfg.rc_end_usage == AOM_CBR ||
@@ -383,7 +707,8 @@ static av_cold int aom_init(AVCodecContext *avctx)
     return 0;
 }
 
-static inline void cx_pktcpy(struct FrameListData *dst,
+static inline void cx_pktcpy(AOMContext *ctx,
+                             struct FrameListData *dst,
                              const struct aom_codec_cx_pkt *src)
 {
     dst->pts      = src->data.frame.pts;
@@ -391,6 +716,17 @@ static inline void cx_pktcpy(struct FrameListData *dst,
     dst->flags    = src->data.frame.flags;
     dst->sz       = src->data.frame.sz;
     dst->buf      = src->data.frame.buf;
+#ifdef AOM_FRAME_IS_INTRAONLY
+    dst->have_sse = 0;
+    dst->frame_number = ++ctx->frame_number;
+    dst->have_sse = ctx->have_sse;
+    if (ctx->have_sse) {
+        /* associate last-seen SSE to the frame. */
+        /* Transfers ownership from ctx to dst. */
+        memcpy(dst->sse, ctx->sse, sizeof(dst->sse));
+        ctx->have_sse = 0;
+    }
+#endif
 }
 
 /**
@@ -403,17 +739,54 @@ static inline void cx_pktcpy(struct FrameListData *dst,
 static int storeframe(AVCodecContext *avctx, struct FrameListData *cx_frame,
                       AVPacket *pkt)
 {
-    int ret = ff_alloc_packet(pkt, cx_frame->sz);
+    AOMContext *ctx = avctx->priv_data;
+    int pict_type;
+    int ret = ff_alloc_packet2(avctx, pkt, cx_frame->sz, 0);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR,
-               "Error getting output packet of size %zu.\n", cx_frame->sz);
+               "Error getting output packet of size %"SIZE_SPECIFIER".\n", cx_frame->sz);
         return ret;
     }
     memcpy(pkt->data, cx_frame->buf, pkt->size);
     pkt->pts = pkt->dts = cx_frame->pts;
 
-    if (!!(cx_frame->flags & AOM_FRAME_IS_KEY))
+    if (!!(cx_frame->flags & AOM_FRAME_IS_KEY)) {
         pkt->flags |= AV_PKT_FLAG_KEY;
+#ifdef AOM_FRAME_IS_INTRAONLY
+        pict_type = AV_PICTURE_TYPE_I;
+    } else if (cx_frame->flags & AOM_FRAME_IS_INTRAONLY) {
+        pict_type = AV_PICTURE_TYPE_I;
+    } else {
+        pict_type = AV_PICTURE_TYPE_P;
+    }
+
+    ff_side_data_set_encoder_stats(pkt, 0, cx_frame->sse + 1,
+                                   cx_frame->have_sse ? 3 : 0, pict_type);
+
+    if (cx_frame->have_sse) {
+        int i;
+        for (i = 0; i < 3; ++i) {
+            avctx->error[i] += cx_frame->sse[i + 1];
+        }
+        cx_frame->have_sse = 0;
+#endif
+    }
+
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+        ret = av_bsf_send_packet(ctx->bsf, pkt);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "extract_extradata filter "
+                   "failed to send input packet\n");
+            return ret;
+        }
+        ret = av_bsf_receive_packet(ctx->bsf, pkt);
+
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "extract_extradata filter "
+                   "failed to receive output packet\n");
+            return ret;
+        }
+    }
     return pkt->size;
 }
 
@@ -452,8 +825,8 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
                 /* avoid storing the frame when the list is empty and we haven't yet
                  * provided a frame for output */
-                assert(!ctx->coded_frame_list);
-                cx_pktcpy(&cx_frame, pkt);
+                av_assert0(!ctx->coded_frame_list);
+                cx_pktcpy(ctx, &cx_frame, pkt);
                 size = storeframe(avctx, &cx_frame, pkt_out);
                 if (size < 0)
                     return size;
@@ -466,12 +839,12 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
                            "Frame queue element alloc failed\n");
                     return AVERROR(ENOMEM);
                 }
-                cx_pktcpy(cx_frame, pkt);
+                cx_pktcpy(ctx, cx_frame, pkt);
                 cx_frame->buf = av_malloc(cx_frame->sz);
 
                 if (!cx_frame->buf) {
                     av_log(avctx, AV_LOG_ERROR,
-                           "Data buffer alloc (%zu bytes) failed\n",
+                           "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                            cx_frame->sz);
                     av_freep(&cx_frame);
                     return AVERROR(ENOMEM);
@@ -496,7 +869,18 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
             stats->sz += pkt->data.twopass_stats.sz;
             break;
         }
-        case AOM_CODEC_PSNR_PKT: // FIXME add support for AV_CODEC_FLAG_PSNR
+#ifdef AOM_FRAME_IS_INTRAONLY
+        case AOM_CODEC_PSNR_PKT:
+        {
+            av_assert0(!ctx->have_sse);
+            ctx->sse[0] = pkt->data.psnr.sse[0];
+            ctx->sse[1] = pkt->data.psnr.sse[1];
+            ctx->sse[2] = pkt->data.psnr.sse[2];
+            ctx->sse[3] = pkt->data.psnr.sse[3];
+            ctx->have_sse = 1;
+            break;
+        }
+#endif
         case AOM_CODEC_CUSTOM_PKT:
             // ignore unsupported/unrecognized packet types
             break;
@@ -550,7 +934,7 @@ static int aom_encode(AVCodecContext *avctx, AVPacket *pkt,
 
         avctx->stats_out = av_malloc(b64_size);
         if (!avctx->stats_out) {
-            av_log(avctx, AV_LOG_ERROR, "Stat buffer alloc (%zu bytes) failed\n",
+            av_log(avctx, AV_LOG_ERROR, "Stat buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                    b64_size);
             return AVERROR(ENOMEM);
         }
@@ -562,24 +946,62 @@ static int aom_encode(AVCodecContext *avctx, AVPacket *pkt,
     return 0;
 }
 
+static const enum AVPixelFormat av1_pix_fmts[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat av1_pix_fmts_highbd[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV420P10,
+    AV_PIX_FMT_YUV422P10,
+    AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_YUV420P12,
+    AV_PIX_FMT_YUV422P12,
+    AV_PIX_FMT_YUV444P12,
+    AV_PIX_FMT_NONE
+};
+
+static av_cold void av1_init_static(AVCodec *codec)
+{
+    aom_codec_caps_t codec_caps = aom_codec_get_caps(aom_codec_av1_cx());
+    if (codec_caps & AOM_CODEC_CAP_HIGHBITDEPTH)
+        codec->pix_fmts = av1_pix_fmts_highbd;
+    else
+        codec->pix_fmts = av1_pix_fmts;
+}
+
+static av_cold int av1_init(AVCodecContext *avctx)
+{
+    return aom_init(avctx, aom_codec_av1_cx());
+}
+
 #define OFFSET(x) offsetof(AOMContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "cpu-used",        "Quality/Speed ratio modifier",           OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, INT_MIN, INT_MAX, VE},
+    { "cpu-used",        "Quality/Speed ratio modifier",           OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, 0, 8, VE},
     { "auto-alt-ref",    "Enable use of alternate reference "
-                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1},      -1,      1,       VE},
+                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1},      -1,      2,       VE},
     { "lag-in-frames",   "Number of frames to look ahead at for "
                          "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
     { "error-resilience", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"},
     { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = AOM_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"},
-    { "crf",              "Select the quality for constant quality mode", offsetof(AOMContext, crf), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 63, VE },
+    { "crf",              "Select the quality for constant quality mode", offsetof(AOMContext, crf), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, VE },
     { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     { "drop-threshold",   "Frame drop threshold", offsetof(AOMContext, drop_threshold), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, VE },
-    { "noise-sensitivity", "Noise sensitivity", OFFSET(noise_sensitivity), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 4, VE},
+    { "tiles",            "Tile columns x rows", OFFSET(tile_cols), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, VE },
+    { "tile-columns",     "Log2 of number of tile columns to use", OFFSET(tile_cols_log2), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 6, VE},
+    { "tile-rows",        "Log2 of number of tile rows to use",    OFFSET(tile_rows_log2), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 6, VE},
+    { "row-mt",           "Enable row based multi-threading",      OFFSET(row_mt),         AV_OPT_TYPE_BOOL, {.i64 = 0},  0, 1, VE},
     { NULL }
 };
 
 static const AVCodecDefault defaults[] = {
+    { "b",          "256*1000" },
     { "qmin",             "-1" },
     { "qmax",             "-1" },
     { "g",                "-1" },
@@ -588,7 +1010,7 @@ static const AVCodecDefault defaults[] = {
 };
 
 static const AVClass class_aom = {
-    .class_name = "libaom encoder",
+    .class_name = "libaom-av1 encoder",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
@@ -600,12 +1022,13 @@ AVCodec ff_libaom_av1_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_AV1,
     .priv_data_size = sizeof(AOMContext),
-    .init           = aom_init,
+    .init           = av1_init,
     .encode2        = aom_encode,
     .close          = aom_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_EXPERIMENTAL,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
     .priv_class     = &class_aom,
     .defaults       = defaults,
+    .init_static_data = av1_init_static,
     .wrapper_name   = "libaom",
 };
diff --git a/libavcodec/libaribb24.c b/libavcodec/libaribb24.c
new file mode 100644
index 0000000..3a59938
--- /dev/null
+++ b/libavcodec/libaribb24.c
@@ -0,0 +1,395 @@
+/*
+ * ARIB STD-B24 caption decoder using the libaribb24 library
+ * Copyright (c) 2019 Jan Ekström
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "libavcodec/ass.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+
+#include <aribb24/aribb24.h>
+#include <aribb24/parser.h>
+#include <aribb24/decoder.h>
+
+typedef struct Libaribb24Context {
+    AVClass *class;
+
+    arib_instance_t *lib_instance;
+    arib_parser_t *parser;
+    arib_decoder_t *decoder;
+
+    int read_order;
+
+    char        *aribb24_base_path;
+    unsigned int aribb24_skip_ruby;
+} Libaribb24Context;
+
+static unsigned int get_profile_font_size(int profile)
+{
+    switch (profile) {
+    case FF_PROFILE_ARIB_PROFILE_A:
+        return 36;
+    case FF_PROFILE_ARIB_PROFILE_C:
+        return 18;
+    default:
+        return 0;
+    }
+}
+
+static void libaribb24_log(void *p, const char *msg)
+{
+    av_log((AVCodecContext *)p, AV_LOG_INFO, "%s\n", msg);
+}
+
+static int libaribb24_generate_ass_header(AVCodecContext *avctx)
+{
+    unsigned int plane_width = 0;
+    unsigned int plane_height = 0;
+    unsigned int font_size = 0;
+
+    switch (avctx->profile) {
+    case FF_PROFILE_ARIB_PROFILE_A:
+        plane_width = 960;
+        plane_height = 540;
+        font_size = get_profile_font_size(avctx->profile);
+        break;
+    case FF_PROFILE_ARIB_PROFILE_C:
+        plane_width = 320;
+        plane_height = 180;
+        font_size = get_profile_font_size(avctx->profile);
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown or unsupported profile set!\n");
+        return AVERROR(EINVAL);
+    }
+
+    avctx->subtitle_header = av_asprintf(
+             "[Script Info]\r\n"
+             "; Script generated by FFmpeg/Lavc%s\r\n"
+             "ScriptType: v4.00+\r\n"
+             "PlayResX: %d\r\n"
+             "PlayResY: %d\r\n"
+             "\r\n"
+             "[V4+ Styles]\r\n"
+
+             /* ASSv4 header */
+             "Format: Name, "
+             "Fontname, Fontsize, "
+             "PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
+             "Bold, Italic, Underline, StrikeOut, "
+             "ScaleX, ScaleY, "
+             "Spacing, Angle, "
+             "BorderStyle, Outline, Shadow, "
+             "Alignment, MarginL, MarginR, MarginV, "
+             "Encoding\r\n"
+
+             "Style: "
+             "Default,"             /* Name */
+             "%s,%d,"               /* Font{name,size} */
+             "&H%x,&H%x,&H%x,&H%x," /* {Primary,Secondary,Outline,Back}Colour */
+             "%d,%d,%d,0,"          /* Bold, Italic, Underline, StrikeOut */
+             "100,100,"             /* Scale{X,Y} */
+             "0,0,"                 /* Spacing, Angle */
+             "%d,1,0,"              /* BorderStyle, Outline, Shadow */
+             "%d,10,10,10,"         /* Alignment, Margin[LRV] */
+             "0\r\n"                /* Encoding */
+
+             "\r\n"
+             "[Events]\r\n"
+             "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n",
+             !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? AV_STRINGIFY(LIBAVCODEC_VERSION) : "",
+             plane_width, plane_height,
+             ASS_DEFAULT_FONT, font_size, ASS_DEFAULT_COLOR,
+             ASS_DEFAULT_COLOR, ASS_DEFAULT_BACK_COLOR, ASS_DEFAULT_BACK_COLOR,
+             -ASS_DEFAULT_BOLD, -ASS_DEFAULT_ITALIC, -ASS_DEFAULT_UNDERLINE,
+             ASS_DEFAULT_BORDERSTYLE, ASS_DEFAULT_ALIGNMENT);
+
+    if (!avctx->subtitle_header)
+        return AVERROR(ENOMEM);
+
+    avctx->subtitle_header_size = strlen(avctx->subtitle_header);
+
+    return 0;
+}
+
+static int libaribb24_init(AVCodecContext *avctx)
+{
+    Libaribb24Context *b24 = avctx->priv_data;
+    void(* arib_dec_init)(arib_decoder_t* decoder) = NULL;
+    int ret_code = AVERROR_EXTERNAL;
+
+    if (!(b24->lib_instance = arib_instance_new(avctx))) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to initialize libaribb24!\n");
+        goto init_fail;
+    }
+
+    if (b24->aribb24_base_path) {
+        av_log(avctx, AV_LOG_INFO, "Setting the libaribb24 base path to '%s'\n",
+               b24->aribb24_base_path);
+        arib_set_base_path(b24->lib_instance, b24->aribb24_base_path);
+    }
+
+    arib_register_messages_callback(b24->lib_instance, libaribb24_log);
+
+    if (!(b24->parser = arib_get_parser(b24->lib_instance))) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to initialize libaribb24 PES parser!\n");
+        goto init_fail;
+    }
+    if (!(b24->decoder = arib_get_decoder(b24->lib_instance))) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to initialize libaribb24 decoder!\n");
+        goto init_fail;
+    }
+
+    switch (avctx->profile) {
+    case FF_PROFILE_ARIB_PROFILE_A:
+        arib_dec_init = arib_initialize_decoder_a_profile;
+        break;
+    case FF_PROFILE_ARIB_PROFILE_C:
+        arib_dec_init = arib_initialize_decoder_c_profile;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown or unsupported profile set!\n");
+        ret_code = AVERROR(EINVAL);
+        goto init_fail;
+    }
+
+    arib_dec_init(b24->decoder);
+
+    if (libaribb24_generate_ass_header(avctx) < 0) {
+        ret_code = AVERROR(ENOMEM);
+        goto init_fail;
+    }
+
+    return 0;
+
+init_fail:
+    if (b24->decoder)
+        arib_finalize_decoder(b24->decoder);
+
+    if (b24->lib_instance)
+        arib_instance_destroy(b24->lib_instance);
+
+    return ret_code;
+}
+
+static int libaribb24_close(AVCodecContext *avctx)
+{
+    Libaribb24Context *b24 = avctx->priv_data;
+
+    if (b24->decoder)
+        arib_finalize_decoder(b24->decoder);
+
+    if (b24->lib_instance)
+        arib_instance_destroy(b24->lib_instance);
+
+    return 0;
+}
+
+#define RGB_TO_BGR(c) (((c) & 0xff) << 16 | ((c) & 0xff00) | (((c) >> 16) & 0xff))
+
+static int libaribb24_handle_regions(AVCodecContext *avctx, AVSubtitle *sub)
+{
+    Libaribb24Context *b24 = avctx->priv_data;
+    const arib_buf_region_t *region = arib_decoder_get_regions(b24->decoder);
+    unsigned int profile_font_size = get_profile_font_size(avctx->profile);
+    AVBPrint buf = { 0 };
+    int ret = 0;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    while (region) {
+        ptrdiff_t region_length = region->p_end - region->p_start;
+        unsigned int ruby_region =
+            region->i_fontheight == (profile_font_size / 2);
+
+        // ASS requires us to make the colors BGR, so we convert here
+        int foreground_bgr_color = RGB_TO_BGR(region->i_foreground_color);
+        int background_bgr_color = RGB_TO_BGR(region->i_background_color);
+
+        if (region_length < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid negative region length!\n");
+            ret = AVERROR_INVALIDDATA;
+            break;
+        }
+
+        if (region_length == 0 || (ruby_region && b24->aribb24_skip_ruby)) {
+            goto next_region;
+        }
+
+        // color and alpha
+        if (foreground_bgr_color != ASS_DEFAULT_COLOR)
+            av_bprintf(&buf, "{\\1c&H%06x&}", foreground_bgr_color);
+
+        if (region->i_foreground_alpha != 0)
+            av_bprintf(&buf, "{\\1a&H%02x&}", region->i_foreground_alpha);
+
+        if (background_bgr_color != ASS_DEFAULT_BACK_COLOR)
+            av_bprintf(&buf, "{\\3c&H%06x&}", background_bgr_color);
+
+        if (region->i_background_alpha != 0)
+            av_bprintf(&buf, "{\\3a&H%02x&}", region->i_background_alpha);
+
+        // font size
+        if (region->i_fontwidth  != profile_font_size ||
+            region->i_fontheight != profile_font_size) {
+            av_bprintf(&buf, "{\\fscx%"PRId64"\\fscy%"PRId64"}",
+                       av_rescale(region->i_fontwidth, 100,
+                                  profile_font_size),
+                       av_rescale(region->i_fontheight, 100,
+                                  profile_font_size));
+        }
+
+        // TODO: positioning
+
+        av_bprint_append_data(&buf, region->p_start, region_length);
+
+        av_bprintf(&buf, "{\\r}");
+
+next_region:
+        region = region->p_next;
+    }
+
+    if (!av_bprint_is_complete(&buf))
+        ret = AVERROR(ENOMEM);
+
+    if (ret == 0) {
+        av_log(avctx, AV_LOG_DEBUG, "Styled ASS line: %s\n",
+               buf.str);
+
+        ret = ff_ass_add_rect(sub, buf.str, b24->read_order++,
+                              0, NULL, NULL);
+    }
+
+    av_bprint_finalize(&buf, NULL);
+
+    return ret;
+}
+
+static int libaribb24_decode(AVCodecContext *avctx, void *data, int *got_sub_ptr, AVPacket *pkt)
+{
+    Libaribb24Context *b24 = avctx->priv_data;
+    AVSubtitle *sub = data;
+    size_t parsed_data_size = 0;
+    size_t decoded_subtitle_size = 0;
+    const unsigned char *parsed_data = NULL;
+    char *decoded_subtitle = NULL;
+    time_t subtitle_duration = 0;
+    int ret = 0;
+
+    if (pkt->size <= 0)
+        return pkt->size;
+
+    arib_parse_pes(b24->parser, pkt->data, pkt->size);
+
+    parsed_data = arib_parser_get_data(b24->parser,
+                                       &parsed_data_size);
+    if (!parsed_data || !parsed_data_size) {
+        av_log(avctx, AV_LOG_DEBUG, "No decode'able data was received from "
+                                    "packet (dts: %"PRId64", pts: %"PRId64").\n",
+               pkt->dts, pkt->pts);
+        return pkt->size;
+    }
+
+    decoded_subtitle_size = parsed_data_size * 4;
+    if (!(decoded_subtitle = av_mallocz(decoded_subtitle_size + 1))) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Failed to allocate buffer for decoded subtitle!\n");
+        return AVERROR(ENOMEM);
+    }
+
+    decoded_subtitle_size = arib_decode_buffer(b24->decoder,
+                                               parsed_data,
+                                               parsed_data_size,
+                                               decoded_subtitle,
+                                               decoded_subtitle_size);
+
+    subtitle_duration = arib_decoder_get_time(b24->decoder);
+
+    if (avctx->pkt_timebase.num && pkt->pts != AV_NOPTS_VALUE)
+        sub->pts = av_rescale_q(pkt->pts,
+                                avctx->pkt_timebase, AV_TIME_BASE_Q);
+
+    sub->end_display_time = subtitle_duration ?
+                            av_rescale_q(subtitle_duration,
+                                         AV_TIME_BASE_Q,
+                                         (AVRational){1, 1000}) :
+                            UINT32_MAX;
+
+    av_log(avctx, AV_LOG_DEBUG,
+           "Result: '%s' (size: %zu, pkt_pts: %"PRId64", sub_pts: %"PRId64" "
+           "duration: %"PRIu32", pkt_timebase: %d/%d, time_base: %d/%d')\n",
+           decoded_subtitle ? decoded_subtitle : "<no subtitle>",
+           decoded_subtitle_size,
+           pkt->pts, sub->pts,
+           sub->end_display_time,
+           avctx->pkt_timebase.num, avctx->pkt_timebase.den,
+           avctx->time_base.num, avctx->time_base.den);
+
+    if (decoded_subtitle)
+        ret = libaribb24_handle_regions(avctx, sub);
+
+    *got_sub_ptr = sub->num_rects > 0;
+
+    av_free(decoded_subtitle);
+
+    // flush the region buffers, otherwise the linked list keeps getting
+    // longer and longer...
+    arib_finalize_decoder(b24->decoder);
+
+    return ret < 0 ? ret : pkt->size;
+}
+
+static void libaribb24_flush(AVCodecContext *avctx)
+{
+    Libaribb24Context *b24 = avctx->priv_data;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        b24->read_order = 0;
+}
+
+#define OFFSET(x) offsetof(Libaribb24Context, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "aribb24-base-path", "set the base path for the libaribb24 library",
+      OFFSET(aribb24_base_path), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "aribb24-skip-ruby-text", "skip ruby text blocks during decoding",
+      OFFSET(aribb24_skip_ruby), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, SD },
+    { NULL }
+};
+
+static const AVClass aribb24_class = {
+    .class_name = "libaribb24 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libaribb24_decoder = {
+    .name      = "libaribb24",
+    .long_name = NULL_IF_CONFIG_SMALL("libaribb24 ARIB STD-B24 caption decoder"),
+    .type      = AVMEDIA_TYPE_SUBTITLE,
+    .id        = AV_CODEC_ID_ARIB_CAPTION,
+    .priv_data_size = sizeof(Libaribb24Context),
+    .init      = libaribb24_init,
+    .close     = libaribb24_close,
+    .decode    = libaribb24_decode,
+    .flush     = libaribb24_flush,
+    .priv_class= &aribb24_class,
+    .wrapper_name = "libaribb24",
+};
diff --git a/libavcodec/libcelt_dec.c b/libavcodec/libcelt_dec.c
new file mode 100644
index 0000000..75b438b
--- /dev/null
+++ b/libavcodec/libcelt_dec.c
@@ -0,0 +1,141 @@
+/*
+ * Xiph CELT decoder using libcelt
+ * Copyright (c) 2011 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <celt/celt.h>
+#include <celt/celt_header.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+struct libcelt_context {
+    CELTMode *mode;
+    CELTDecoder *dec;
+    int discard;
+};
+
+static int ff_celt_error_to_averror(int err)
+{
+    switch (err) {
+        case CELT_BAD_ARG:          return AVERROR(EINVAL);
+#ifdef CELT_BUFFER_TOO_SMALL
+        case CELT_BUFFER_TOO_SMALL: return AVERROR(ENOBUFS);
+#endif
+        case CELT_INTERNAL_ERROR:   return AVERROR(EFAULT);
+        case CELT_CORRUPTED_DATA:   return AVERROR_INVALIDDATA;
+        case CELT_UNIMPLEMENTED:    return AVERROR(ENOSYS);
+#ifdef ENOTRECOVERABLE
+        case CELT_INVALID_STATE:    return AVERROR(ENOTRECOVERABLE);
+#endif
+        case CELT_ALLOC_FAIL:       return AVERROR(ENOMEM);
+        default:                    return AVERROR(EINVAL);
+    }
+}
+
+static int ff_celt_bitstream_version_hack(CELTMode *mode)
+{
+    CELTHeader header = { .version_id = 0 };
+    celt_header_init(&header, mode, 960, 2);
+    return header.version_id;
+}
+
+static av_cold int libcelt_dec_init(AVCodecContext *c)
+{
+    struct libcelt_context *celt = c->priv_data;
+    int err;
+
+    if (!c->channels || !c->frame_size ||
+        c->frame_size > INT_MAX / sizeof(int16_t) / c->channels)
+        return AVERROR(EINVAL);
+    celt->mode = celt_mode_create(c->sample_rate, c->frame_size, &err);
+    if (!celt->mode)
+        return ff_celt_error_to_averror(err);
+    celt->dec = celt_decoder_create_custom(celt->mode, c->channels, &err);
+    if (!celt->dec) {
+        celt_mode_destroy(celt->mode);
+        return ff_celt_error_to_averror(err);
+    }
+    if (c->extradata_size >= 4) {
+        celt->discard = AV_RL32(c->extradata);
+        if (celt->discard < 0 || celt->discard >= c->frame_size) {
+            av_log(c, AV_LOG_WARNING,
+                   "Invalid overlap (%d), ignored.\n", celt->discard);
+            celt->discard = 0;
+        }
+    }
+    if (c->extradata_size >= 8) {
+        unsigned version = AV_RL32(c->extradata + 4);
+        unsigned lib_version = ff_celt_bitstream_version_hack(celt->mode);
+        if (version != lib_version)
+            av_log(c, AV_LOG_WARNING,
+                   "CELT bitstream version 0x%x may be "
+                   "improperly decoded by libcelt for version 0x%x.\n",
+                   version, lib_version);
+    }
+    c->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+}
+
+static av_cold int libcelt_dec_close(AVCodecContext *c)
+{
+    struct libcelt_context *celt = c->priv_data;
+
+    celt_decoder_destroy(celt->dec);
+    celt_mode_destroy(celt->mode);
+    return 0;
+}
+
+static int libcelt_dec_decode(AVCodecContext *c, void *data,
+                              int *got_frame_ptr, AVPacket *pkt)
+{
+    struct libcelt_context *celt = c->priv_data;
+    AVFrame *frame = data;
+    int err;
+    int16_t *pcm;
+
+    frame->nb_samples = c->frame_size;
+    if ((err = ff_get_buffer(c, frame, 0)) < 0)
+        return err;
+    pcm = (int16_t *)frame->data[0];
+    err = celt_decode(celt->dec, pkt->data, pkt->size, pcm, c->frame_size);
+    if (err < 0)
+        return ff_celt_error_to_averror(err);
+    if (celt->discard) {
+        frame->nb_samples -= celt->discard;
+        memmove(pcm, pcm + celt->discard * c->channels,
+                frame->nb_samples * c->channels * sizeof(int16_t));
+        celt->discard = 0;
+    }
+    *got_frame_ptr = 1;
+    return pkt->size;
+}
+
+AVCodec ff_libcelt_decoder = {
+    .name           = "libcelt",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xiph CELT decoder using libcelt"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_CELT,
+    .priv_data_size = sizeof(struct libcelt_context),
+    .init           = libcelt_dec_init,
+    .close          = libcelt_dec_close,
+    .decode         = libcelt_dec_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .wrapper_name   = "libcelt",
+};
diff --git a/libavcodec/libcodec2.c b/libavcodec/libcodec2.c
new file mode 100644
index 0000000..1d6bed0
--- /dev/null
+++ b/libavcodec/libcodec2.c
@@ -0,0 +1,213 @@
+/*
+ * codec2 encoder/decoder using libcodec2
+ * Copyright (c) 2017 Tomas Härdin
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <codec2/codec2.h>
+#include "avcodec.h"
+#include "libavutil/opt.h"
+#include "internal.h"
+#include "codec2utils.h"
+
+typedef struct {
+    const AVClass *class;
+    struct CODEC2 *codec;
+    int mode;
+} LibCodec2Context;
+
+static const AVOption options[] = {
+    //not AV_OPT_FLAG_DECODING_PARAM since mode should come from the demuxer
+    //1300 (aka FreeDV 1600) is the most common mode on-the-air, default to it here as well
+    AVPRIV_CODEC2_AVOPTIONS("codec2 mode", LibCodec2Context, 0, 4 /*CODEC2_MODE_1300*/, AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_ENCODING_PARAM),
+    { NULL },
+};
+
+static const AVClass libcodec2_enc_class = {
+    .class_name = "libcodec2 encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass libcodec2_dec_class = {
+    .class_name = "libcodec2 decoder",
+    .item_name  = av_default_item_name,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static av_cold int libcodec2_init_common(AVCodecContext *avctx, int mode)
+{
+    LibCodec2Context *c2 = avctx->priv_data;
+    //Grab mode name from options, unless it's some weird number.
+    const char *modename = mode >= 0 && mode <= AVPRIV_CODEC2_MODE_MAX ? options[mode+1].name : "?";
+
+    c2->codec = codec2_create(mode);
+    if (!c2->codec) {
+        //Out of memory or unsupported mode. The latter seems most likely,
+        //but we can't tell for sure with the current API.
+        goto libcodec2_init_common_error;
+    }
+
+    avctx->frame_size = codec2_samples_per_frame(c2->codec);
+    avctx->block_align = (codec2_bits_per_frame(c2->codec) + 7) / 8;
+
+    if (avctx->frame_size <= 0 || avctx->block_align <= 0) {
+        //codec2_create() may succeed for some modes but still fail at codec2_samples_per_frame()
+        //example is -mode 700C on libcodec2 0.4
+        codec2_destroy(c2->codec);
+        c2->codec = NULL;
+        goto libcodec2_init_common_error;
+    }
+
+    codec2_set_natural_or_gray(c2->codec, 1);
+
+    return 0;
+
+libcodec2_init_common_error:
+    av_log(avctx, AV_LOG_ERROR,
+        "Mode %i (%s) not supported with the linked version of libcodec2\n",
+        mode, modename);
+    return AVERROR(EINVAL);
+}
+
+static av_cold int libcodec2_init_decoder(AVCodecContext *avctx)
+{
+    avctx->sample_rate      = 8000;
+    avctx->channels         = 1;
+    avctx->sample_fmt       = AV_SAMPLE_FMT_S16;
+    avctx->channel_layout   = AV_CH_LAYOUT_MONO;
+
+    if (avctx->extradata_size != AVPRIV_CODEC2_EXTRADATA_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "must have exactly %i bytes of extradata (got %i)\n",
+               AVPRIV_CODEC2_EXTRADATA_SIZE, avctx->extradata_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return libcodec2_init_common(avctx, avpriv_codec2_mode_from_extradata(avctx->extradata));
+}
+
+static av_cold int libcodec2_init_encoder(AVCodecContext *avctx)
+{
+    LibCodec2Context *c2 = avctx->priv_data;
+
+    //will need to be smarter once we get wideband support
+    if (avctx->sample_rate != 8000 ||
+        avctx->channels != 1 ||
+        avctx->sample_fmt != AV_SAMPLE_FMT_S16) {
+        av_log(avctx, AV_LOG_ERROR, "only 8 kHz 16-bit mono allowed\n");
+        return AVERROR(EINVAL);
+    }
+
+    avctx->extradata = av_mallocz(AVPRIV_CODEC2_EXTRADATA_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata) {
+        return AVERROR(ENOMEM);
+    }
+
+    avctx->extradata_size = AVPRIV_CODEC2_EXTRADATA_SIZE;
+    avpriv_codec2_make_extradata(avctx->extradata, c2->mode);
+
+    return libcodec2_init_common(avctx, c2->mode);
+}
+
+static av_cold int libcodec2_close(AVCodecContext *avctx)
+{
+    LibCodec2Context *c2 = avctx->priv_data;
+
+    codec2_destroy(c2->codec);
+    return 0;
+}
+
+static int libcodec2_decode(AVCodecContext *avctx, void *data,
+                            int *got_frame_ptr, AVPacket *pkt)
+{
+    LibCodec2Context *c2 = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret, nframes, i;
+    uint8_t *input;
+    int16_t *output;
+
+    nframes           = pkt->size / avctx->block_align;
+    frame->nb_samples = avctx->frame_size * nframes;
+
+    ret = ff_get_buffer(avctx, frame, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    input  = pkt->data;
+    output = (int16_t *)frame->data[0];
+
+    for (i = 0; i < nframes; i++) {
+        codec2_decode(c2->codec, output, input);
+        input  += avctx->block_align;
+        output += avctx->frame_size;
+    }
+
+    *got_frame_ptr = nframes > 0;
+    return nframes * avctx->block_align;
+}
+
+static int libcodec2_encode(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet_ptr)
+{
+    LibCodec2Context *c2 = avctx->priv_data;
+    int16_t *samples = (int16_t *)frame->data[0];
+
+    int ret = ff_alloc_packet2(avctx, avpkt, avctx->block_align, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    codec2_encode(c2->codec, avpkt->data, samples);
+    *got_packet_ptr = 1;
+
+    return 0;
+}
+
+AVCodec ff_libcodec2_decoder = {
+    .name                   = "libcodec2",
+    .long_name              = NULL_IF_CONFIG_SMALL("codec2 decoder using libcodec2"),
+    .type                   = AVMEDIA_TYPE_AUDIO,
+    .id                     = AV_CODEC_ID_CODEC2,
+    .priv_data_size         = sizeof(LibCodec2Context),
+    .init                   = libcodec2_init_decoder,
+    .close                  = libcodec2_close,
+    .decode                 = libcodec2_decode,
+    .capabilities           = 0,
+    .supported_samplerates  = (const int[]){ 8000, 0 },
+    .sample_fmts            = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .channel_layouts        = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
+    .priv_class             = &libcodec2_dec_class,
+};
+
+AVCodec ff_libcodec2_encoder = {
+    .name                   = "libcodec2",
+    .long_name              = NULL_IF_CONFIG_SMALL("codec2 encoder using libcodec2"),
+    .type                   = AVMEDIA_TYPE_AUDIO,
+    .id                     = AV_CODEC_ID_CODEC2,
+    .priv_data_size         = sizeof(LibCodec2Context),
+    .init                   = libcodec2_init_encoder,
+    .close                  = libcodec2_close,
+    .encode2                = libcodec2_encode,
+    .capabilities           = 0,
+    .supported_samplerates  = (const int[]){ 8000, 0 },
+    .sample_fmts            = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .channel_layouts        = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
+    .priv_class             = &libcodec2_enc_class,
+};
diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c
index c6ccc38..8c8584f 100644
--- a/libavcodec/libdav1d.c
+++ b/libavcodec/libdav1d.c
@@ -2,28 +2,28 @@
  * Copyright (c) 2018 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (c) 2018 James Almer <jamrial gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <dav1d/dav1d.h>
 
 #include "libavutil/avassert.h"
-#include "libavutil/common.h"
-#include "libavutil/internal.h"
+#include "libavutil/mastering_display_metadata.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
@@ -33,12 +33,81 @@
 typedef struct Libdav1dContext {
     AVClass *class;
     Dav1dContext *c;
+    AVBufferPool *pool;
+    int pool_size;
 
     Dav1dData data;
     int tile_threads;
     int apply_grain;
 } Libdav1dContext;
 
+static const enum AVPixelFormat pix_fmt[][3] = {
+    [DAV1D_PIXEL_LAYOUT_I400] = { AV_PIX_FMT_GRAY8,   AV_PIX_FMT_GRAY10,    AV_PIX_FMT_GRAY12 },
+    [DAV1D_PIXEL_LAYOUT_I420] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12 },
+    [DAV1D_PIXEL_LAYOUT_I422] = { AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12 },
+    [DAV1D_PIXEL_LAYOUT_I444] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12 },
+};
+
+static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl)
+{
+    AVCodecContext *c = opaque;
+
+    av_vlog(c, AV_LOG_ERROR, fmt, vl);
+}
+
+static int libdav1d_picture_allocator(Dav1dPicture *p, void *cookie)
+{
+    Libdav1dContext *dav1d = cookie;
+    enum AVPixelFormat format = pix_fmt[p->p.layout][p->seq_hdr->hbd];
+    int ret, linesize[4], h = FFALIGN(p->p.h, 128);
+    uint8_t *aligned_ptr, *data[4];
+    AVBufferRef *buf;
+
+    ret = av_image_fill_arrays(data, linesize, NULL, format, FFALIGN(p->p.w, 128),
+                               h, DAV1D_PICTURE_ALIGNMENT);
+    if (ret < 0)
+        return ret;
+
+    if (ret != dav1d->pool_size) {
+        av_buffer_pool_uninit(&dav1d->pool);
+        // Use twice the amount of required padding bytes for aligned_ptr below.
+        dav1d->pool = av_buffer_pool_init(ret + DAV1D_PICTURE_ALIGNMENT * 2, NULL);
+        if (!dav1d->pool)
+            return AVERROR(ENOMEM);
+        dav1d->pool_size = ret;
+    }
+    buf = av_buffer_pool_get(dav1d->pool);
+    if (!buf)
+        return AVERROR(ENOMEM);
+
+    // libdav1d requires DAV1D_PICTURE_ALIGNMENT aligned buffers, which av_malloc()
+    // doesn't guarantee for example when AVX is disabled at configure time.
+    // Use the extra DAV1D_PICTURE_ALIGNMENT padding bytes in the buffer to align it
+    // if required.
+    aligned_ptr = (uint8_t *)FFALIGN((uintptr_t)buf->data, DAV1D_PICTURE_ALIGNMENT);
+    ret = av_image_fill_pointers(data, format, h, aligned_ptr, linesize);
+    if (ret < 0) {
+        av_buffer_unref(&buf);
+        return ret;
+    }
+
+    p->data[0] = data[0];
+    p->data[1] = data[1];
+    p->data[2] = data[2];
+    p->stride[0] = linesize[0];
+    p->stride[1] = linesize[1];
+    p->allocator_data = buf;
+
+    return 0;
+}
+
+static void libdav1d_picture_release(Dav1dPicture *p, void *cookie)
+{
+    AVBufferRef *buf = p->allocator_data;
+
+    av_buffer_unref(&buf);
+}
+
 static av_cold int libdav1d_init(AVCodecContext *c)
 {
     Libdav1dContext *dav1d = c->priv_data;
@@ -48,6 +117,11 @@ static av_cold int libdav1d_init(AVCodecContext *c)
     av_log(c, AV_LOG_INFO, "libdav1d %s\n", dav1d_version());
 
     dav1d_default_settings(&s);
+    s.logger.cookie = c;
+    s.logger.callback = libdav1d_log_callback;
+    s.allocator.cookie = dav1d;
+    s.allocator.alloc_picture_callback = libdav1d_picture_allocator;
+    s.allocator.release_picture_callback = libdav1d_picture_release;
     s.n_tile_threads = dav1d->tile_threads;
     s.apply_grain = dav1d->apply_grain;
     s.n_frame_threads = FFMIN(c->thread_count ? c->thread_count : av_cpu_count(), DAV1D_MAX_FRAME_THREADS);
@@ -80,13 +154,6 @@ static void libdav1d_frame_free(void *opaque, uint8_t *data) {
     av_free(p);
 }
 
-static const enum AVPixelFormat pix_fmt[][3] = {
-    [DAV1D_PIXEL_LAYOUT_I400] = { AV_PIX_FMT_GRAY8,   AV_PIX_FMT_GRAY10,    AV_PIX_FMT_GRAY12 },
-    [DAV1D_PIXEL_LAYOUT_I420] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12 },
-    [DAV1D_PIXEL_LAYOUT_I422] = { AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12 },
-    [DAV1D_PIXEL_LAYOUT_I444] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12 },
-};
-
 static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
 {
     Libdav1dContext *dav1d = c->priv_data;
@@ -109,6 +176,8 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
             }
 
             data->m.timestamp = pkt.pts;
+            data->m.offset = pkt.pos;
+            data->m.duration = pkt.duration;
 
             pkt.buf = NULL;
             av_packet_unref(&pkt);
@@ -117,9 +186,9 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
 
     res = dav1d_send_data(dav1d->c, data);
     if (res < 0) {
-        if (res == -EINVAL)
+        if (res == AVERROR(EINVAL))
             res = AVERROR_INVALIDDATA;
-        if (res != -EAGAIN)
+        if (res != AVERROR(EAGAIN))
             return res;
     }
 
@@ -129,9 +198,9 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
 
     res = dav1d_get_picture(dav1d->c, p);
     if (res < 0) {
-        if (res == -EINVAL)
+        if (res == AVERROR(EINVAL))
             res = AVERROR_INVALIDDATA;
-        else if (res == -EAGAIN && c->internal->draining)
+        else if (res == AVERROR(EAGAIN) && c->internal->draining)
             res = AVERROR_EOF;
 
         av_free(p);
@@ -162,7 +231,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
     if (c->width != p->p.w || c->height != p->p.h) {
         res = ff_set_dimensions(c, p->p.w, p->p.h);
         if (res < 0)
-            return res;
+            goto fail;
     }
 
     switch (p->seq_hdr->chr) {
@@ -179,13 +248,16 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
     frame->color_range = c->color_range = p->seq_hdr->color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
 
     // match timestamps and packet size
-    frame->pts = p->m.timestamp;
+    frame->pts = frame->best_effort_timestamp = p->m.timestamp;
 #if FF_API_PKT_PTS
 FF_DISABLE_DEPRECATION_WARNINGS
     frame->pkt_pts = p->m.timestamp;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     frame->pkt_dts = p->m.timestamp;
+    frame->pkt_pos = p->m.offset;
+    frame->pkt_size = p->m.size;
+    frame->pkt_duration = p->m.duration;
     frame->key_frame = p->frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY;
 
     switch (p->frame_hdr->frame_type) {
@@ -200,16 +272,52 @@ FF_ENABLE_DEPRECATION_WARNINGS
         frame->pict_type = AV_PICTURE_TYPE_SP;
         break;
     default:
-        return AVERROR_INVALIDDATA;
+        res = AVERROR_INVALIDDATA;
+        goto fail;
     }
 
-    return 0;
+    if (p->mastering_display) {
+        AVMasteringDisplayMetadata *mastering = av_mastering_display_metadata_create_side_data(frame);
+        if (!mastering) {
+            res = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for (int i = 0; i < 3; i++) {
+            mastering->display_primaries[i][0] = av_make_q(p->mastering_display->primaries[i][0], 1 << 16);
+            mastering->display_primaries[i][1] = av_make_q(p->mastering_display->primaries[i][1], 1 << 16);
+        }
+        mastering->white_point[0] = av_make_q(p->mastering_display->white_point[0], 1 << 16);
+        mastering->white_point[1] = av_make_q(p->mastering_display->white_point[1], 1 << 16);
+
+        mastering->max_luminance = av_make_q(p->mastering_display->max_luminance, 1 << 8);
+        mastering->min_luminance = av_make_q(p->mastering_display->min_luminance, 1 << 14);
+
+        mastering->has_primaries = 1;
+        mastering->has_luminance = 1;
+    }
+    if (p->content_light) {
+        AVContentLightMetadata *light = av_content_light_metadata_create_side_data(frame);
+        if (!light) {
+            res = AVERROR(ENOMEM);
+            goto fail;
+        }
+        light->MaxCLL = p->content_light->max_content_light_level;
+        light->MaxFALL = p->content_light->max_frame_average_light_level;
+    }
+
+    res = 0;
+fail:
+    if (res < 0)
+        av_frame_unref(frame);
+    return res;
 }
 
 static av_cold int libdav1d_close(AVCodecContext *c)
 {
     Libdav1dContext *dav1d = c->priv_data;
 
+    av_buffer_pool_uninit(&dav1d->pool);
     dav1d_data_unref(&dav1d->data);
     dav1d_close(&dav1d->c);
 
@@ -220,7 +328,7 @@ static av_cold int libdav1d_close(AVCodecContext *c)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption libdav1d_options[] = {
     { "tilethreads", "Tile threads", OFFSET(tile_threads), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, DAV1D_MAX_TILE_THREADS, VD },
-    { "filmgrain", "Apply Film Grain", OFFSET(apply_grain), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VD },
+    { "filmgrain", "Apply Film Grain", OFFSET(apply_grain), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VD },
     { NULL }
 };
 
diff --git a/libavcodec/libdavs2.c b/libavcodec/libdavs2.c
new file mode 100644
index 0000000..0808721
--- /dev/null
+++ b/libavcodec/libdavs2.c
@@ -0,0 +1,212 @@
+/*
+ * AVS2 decoding using the davs2 library
+ *
+ * Copyright (C) 2018 Yiqun Xu, <yiqun.xu@vipl.ict.ac.cn>
+ *                    Falei Luo, <falei.luo@gmail.com>
+ *                    Huiwen Ren, <hwrenx@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "davs2.h"
+
+typedef struct DAVS2Context {
+    void *decoder;
+
+    AVFrame *frame;
+    davs2_param_t    param;      // decoding parameters
+    davs2_packet_t   packet;     // input bitstream
+
+    davs2_picture_t  out_frame;  // output data, frame data
+    davs2_seq_info_t headerset;  // output data, sequence header
+
+}DAVS2Context;
+
+static av_cold int davs2_init(AVCodecContext *avctx)
+{
+    DAVS2Context *cad = avctx->priv_data;
+    int cpu_flags = av_get_cpu_flags();
+
+    /* init the decoder */
+    cad->param.threads      = avctx->thread_count;
+    cad->param.info_level   = 0;
+    cad->param.disable_avx  = !(cpu_flags & AV_CPU_FLAG_AVX &&
+                                cpu_flags & AV_CPU_FLAG_AVX2);
+    cad->decoder            = davs2_decoder_open(&cad->param);
+
+    if (!cad->decoder) {
+        av_log(avctx, AV_LOG_ERROR, "decoder created error.");
+        return AVERROR_EXTERNAL;
+    }
+
+    av_log(avctx, AV_LOG_VERBOSE, "decoder created. %p\n", cad->decoder);
+    return 0;
+}
+
+static int davs2_dump_frames(AVCodecContext *avctx, davs2_picture_t *pic, int *got_frame,
+                             davs2_seq_info_t *headerset, int ret_type, AVFrame *frame)
+{
+    DAVS2Context *cad    = avctx->priv_data;
+    int bytes_per_sample = pic->bytes_per_sample;
+    int plane = 0;
+    int line  = 0;
+
+    if (!headerset) {
+        *got_frame = 0;
+        return 0;
+    }
+
+    if (!pic || ret_type == DAVS2_GOT_HEADER) {
+        avctx->width     = headerset->width;
+        avctx->height    = headerset->height;
+        avctx->pix_fmt   = headerset->output_bit_depth == 10 ?
+                           AV_PIX_FMT_YUV420P10 : AV_PIX_FMT_YUV420P;
+
+        avctx->framerate = av_d2q(headerset->frame_rate,4096);
+        *got_frame = 0;
+        return 0;
+    }
+
+    switch (pic->type) {
+    case DAVS2_PIC_I:
+    case DAVS2_PIC_G:
+        frame->pict_type = AV_PICTURE_TYPE_I;
+        break;
+    case DAVS2_PIC_P:
+    case DAVS2_PIC_S:
+        frame->pict_type = AV_PICTURE_TYPE_P;
+        break;
+    case DAVS2_PIC_B:
+        frame->pict_type = AV_PICTURE_TYPE_B;
+        break;
+    case DAVS2_PIC_F:
+        frame->pict_type = AV_PICTURE_TYPE_S;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Decoder error: unknown frame type\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    for (plane = 0; plane < 3; ++plane) {
+        int size_line = pic->widths[plane] * bytes_per_sample;
+        frame->buf[plane]  = av_buffer_alloc(size_line * pic->lines[plane]);
+
+        if (!frame->buf[plane]){
+            av_log(avctx, AV_LOG_ERROR, "dump error: alloc failed.\n");
+            return AVERROR(ENOMEM);
+        }
+
+        frame->data[plane]     = frame->buf[plane]->data;
+        frame->linesize[plane] = size_line;
+
+        for (line = 0; line < pic->lines[plane]; ++line)
+            memcpy(frame->data[plane] + line * size_line,
+                   pic->planes[plane] + line * pic->strides[plane],
+                   pic->widths[plane] * bytes_per_sample);
+    }
+
+    frame->width     = cad->headerset.width;
+    frame->height    = cad->headerset.height;
+    frame->pts       = cad->out_frame.pts;
+    frame->format    = avctx->pix_fmt;
+
+    *got_frame = 1;
+    return 0;
+}
+
+static int send_delayed_frame(AVCodecContext *avctx, AVFrame *frame, int *got_frame)
+{
+    DAVS2Context *cad      = avctx->priv_data;
+    int           ret      = DAVS2_DEFAULT;
+
+    ret = davs2_decoder_flush(cad->decoder, &cad->headerset, &cad->out_frame);
+    if (ret == DAVS2_ERROR) {
+        av_log(avctx, AV_LOG_ERROR, "Decoder error: can't flush delayed frame\n");
+        return AVERROR_EXTERNAL;
+    }
+    if (ret == DAVS2_GOT_FRAME) {
+        ret = davs2_dump_frames(avctx, &cad->out_frame, got_frame, &cad->headerset, ret, frame);
+        davs2_decoder_frame_unref(cad->decoder, &cad->out_frame);
+    }
+    return ret;
+}
+
+static av_cold int davs2_end(AVCodecContext *avctx)
+{
+    DAVS2Context *cad = avctx->priv_data;
+
+    /* close the decoder */
+    if (cad->decoder) {
+        davs2_decoder_close(cad->decoder);
+        cad->decoder = NULL;
+    }
+
+    return 0;
+}
+
+static int davs2_decode_frame(AVCodecContext *avctx, void *data,
+                              int *got_frame, AVPacket *avpkt)
+{
+    DAVS2Context *cad      = avctx->priv_data;
+    int           buf_size = avpkt->size;
+    uint8_t      *buf_ptr  = avpkt->data;
+    AVFrame      *frame    = data;
+    int           ret      = DAVS2_DEFAULT;
+
+    /* end of stream, output what is still in the buffers */
+    if (!buf_size) {
+        return send_delayed_frame(avctx, frame, got_frame);
+    }
+
+    cad->packet.data = buf_ptr;
+    cad->packet.len  = buf_size;
+    cad->packet.pts  = avpkt->pts;
+    cad->packet.dts  = avpkt->dts;
+
+    ret = davs2_decoder_send_packet(cad->decoder, &cad->packet);
+
+
+    if (ret == DAVS2_ERROR) {
+        av_log(avctx, AV_LOG_ERROR, "Decoder error: can't read packet\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    ret = davs2_decoder_recv_frame(cad->decoder, &cad->headerset, &cad->out_frame);
+
+    if (ret != DAVS2_DEFAULT) {
+        ret = davs2_dump_frames(avctx, &cad->out_frame, got_frame, &cad->headerset, ret, frame);
+        davs2_decoder_frame_unref(cad->decoder, &cad->out_frame);
+    }
+
+    return ret == 0 ? buf_size : ret;
+}
+
+AVCodec ff_libdavs2_decoder = {
+    .name           = "libdavs2",
+    .long_name      = NULL_IF_CONFIG_SMALL("libdavs2 AVS2-P2/IEEE1857.4"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVS2,
+    .priv_data_size = sizeof(DAVS2Context),
+    .init           = davs2_init,
+    .close          = davs2_end,
+    .decode         = davs2_decode_frame,
+    .capabilities   =  AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
+                                                     AV_PIX_FMT_NONE },
+    .wrapper_name   = "libdavs2",
+};
diff --git a/libavcodec/libdcadec.c b/libavcodec/libdcadec.c
deleted file mode 100644
index d44f4d9..0000000
--- a/libavcodec/libdcadec.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * libdcadec decoder wrapper
- * Copyright (C) 2015 Hendrik Leppkes
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <libdcadec/dca_context.h>
-
-#include "libavutil/channel_layout.h"
-#include "libavutil/common.h"
-#include "libavutil/opt.h"
-
-#include "avcodec.h"
-#include "dca.h"
-#include "dca_syncwords.h"
-#include "internal.h"
-
-typedef struct DCADecContext {
-    struct dcadec_context *ctx;
-    uint8_t *buffer;
-    int buffer_size;
-} DCADecContext;
-
-static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
-                               int *got_frame_ptr, AVPacket *avpkt)
-{
-    DCADecContext *s = avctx->priv_data;
-    AVFrame *frame = data;
-    int ret, i, k;
-    int **samples, nsamples, channel_mask, sample_rate, bits_per_sample, profile;
-    uint32_t mrk;
-    uint8_t *input = avpkt->data;
-    int input_size = avpkt->size;
-
-    /* convert bytestream syntax to RAW BE format if required */
-    if (input_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Input size too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-    mrk = AV_RB32(input);
-    if (mrk != DCA_SYNCWORD_CORE_BE && mrk != DCA_SYNCWORD_SUBSTREAM) {
-        s->buffer = av_fast_realloc(s->buffer, &s->buffer_size, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!s->buffer)
-            return AVERROR(ENOMEM);
-
-        if ((ret = ff_dca_convert_bitstream(avpkt->data, avpkt->size, s->buffer, s->buffer_size)) < 0)
-            return ret;
-
-        input      = s->buffer;
-        input_size = ret;
-    }
-
-    if ((ret = dcadec_context_parse(s->ctx, input, input_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "dcadec_context_parse() failed: %d (%s)\n", -ret, dcadec_strerror(ret));
-        return AVERROR_UNKNOWN;
-    }
-    if ((ret = dcadec_context_filter(s->ctx, &samples, &nsamples, &channel_mask,
-                                     &sample_rate, &bits_per_sample, &profile)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "dcadec_context_filter() failed: %d (%s)\n", -ret, dcadec_strerror(ret));
-        return AVERROR_UNKNOWN;
-    }
-
-    avctx->channels       = av_get_channel_layout_nb_channels(channel_mask);
-    avctx->channel_layout = channel_mask;
-    avctx->sample_rate    = sample_rate;
-
-    if (bits_per_sample == 16)
-        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
-    else if (bits_per_sample <= 24)
-        avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-    else {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported number of bits per sample: %d\n",
-               bits_per_sample);
-        return AVERROR(ENOSYS);
-    }
-
-    avctx->bits_per_raw_sample = bits_per_sample;
-
-    switch (profile) {
-    case DCADEC_PROFILE_DS:
-        avctx->profile = FF_PROFILE_DTS;
-        break;
-    case DCADEC_PROFILE_DS_96_24:
-        avctx->profile = FF_PROFILE_DTS_96_24;
-        break;
-    case DCADEC_PROFILE_DS_ES:
-        avctx->profile = FF_PROFILE_DTS_ES;
-        break;
-    case DCADEC_PROFILE_HD_HRA:
-        avctx->profile = FF_PROFILE_DTS_HD_HRA;
-        break;
-    case DCADEC_PROFILE_HD_MA:
-        avctx->profile = FF_PROFILE_DTS_HD_MA;
-        break;
-    case DCADEC_PROFILE_EXPRESS:
-        avctx->profile = FF_PROFILE_DTS_EXPRESS;
-        break;
-    case DCADEC_PROFILE_UNKNOWN:
-    default:
-        avctx->profile = FF_PROFILE_UNKNOWN;
-        break;
-    }
-
-    /* bitrate is only meaningful if there are no HD extensions, as they distort the bitrate */
-    if (profile == DCADEC_PROFILE_DS || profile == DCADEC_PROFILE_DS_96_24 || profile == DCADEC_PROFILE_DS_ES) {
-        struct dcadec_core_info *info = dcadec_context_get_core_info(s->ctx);
-        avctx->bit_rate = info->bit_rate;
-        dcadec_context_free_core_info(info);
-    } else
-        avctx->bit_rate = 0;
-
-    frame->nb_samples = nsamples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
-        return ret;
-
-    for (i = 0; i < avctx->channels; i++) {
-        if (frame->format == AV_SAMPLE_FMT_S16P) {
-            int16_t *plane = (int16_t *)frame->extended_data[i];
-            for (k = 0; k < nsamples; k++)
-                plane[k] = samples[i][k];
-        } else {
-            int32_t *plane = (int32_t *)frame->extended_data[i];
-            int shift = 32 - bits_per_sample;
-            for (k = 0; k < nsamples; k++)
-                plane[k] = samples[i][k] << shift;
-        }
-    }
-
-    *got_frame_ptr = 1;
-
-    return avpkt->size;
-}
-
-static av_cold void dcadec_flush(AVCodecContext *avctx)
-{
-    DCADecContext *s = avctx->priv_data;
-    dcadec_context_clear(s->ctx);
-}
-
-static av_cold int dcadec_close(AVCodecContext *avctx)
-{
-    DCADecContext *s = avctx->priv_data;
-
-    dcadec_context_destroy(s->ctx);
-    s->ctx = NULL;
-
-    av_freep(&s->buffer);
-
-    return 0;
-}
-
-static av_cold int dcadec_init(AVCodecContext *avctx)
-{
-    DCADecContext *s = avctx->priv_data;
-
-    s->ctx = dcadec_context_create(0);
-    if (!s->ctx)
-        return AVERROR(ENOMEM);
-
-    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-    avctx->bits_per_raw_sample = 24;
-
-    return 0;
-}
-
-static const AVProfile profiles[] = {
-    { FF_PROFILE_DTS,         "DTS"         },
-    { FF_PROFILE_DTS_ES,      "DTS-ES"      },
-    { FF_PROFILE_DTS_96_24,   "DTS 96/24"   },
-    { FF_PROFILE_DTS_HD_HRA,  "DTS-HD HRA"  },
-    { FF_PROFILE_DTS_HD_MA,   "DTS-HD MA"   },
-    { FF_PROFILE_DTS_EXPRESS, "DTS Express" },
-    { FF_PROFILE_UNKNOWN },
-};
-
-AVCodec ff_libdcadec_decoder = {
-    .name           = "libdcadec",
-    .long_name      = NULL_IF_CONFIG_SMALL("dcadec DCA decoder"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_DTS,
-    .priv_data_size = sizeof(DCADecContext),
-    .init           = dcadec_init,
-    .decode         = dcadec_decode_frame,
-    .close          = dcadec_close,
-    .flush          = dcadec_flush,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_CHANNEL_CONF,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S32P, AV_SAMPLE_FMT_S16P,
-                                                      AV_SAMPLE_FMT_NONE },
-    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
-    .wrapper_name   = "libdcadec",
-};
diff --git a/libavcodec/libfaac.c b/libavcodec/libfaac.c
deleted file mode 100644
index db04a37..0000000
--- a/libavcodec/libfaac.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Interface to libfaac for aac encoding
- * Copyright (c) 2002 Gildas Bazin <gbazin@netcourrier.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * Interface to libfaac for aac encoding.
- */
-
-#include <faac.h>
-
-#include "libavutil/channel_layout.h"
-#include "libavutil/common.h"
-#include "avcodec.h"
-#include "audio_frame_queue.h"
-#include "internal.h"
-
-
-/* libfaac has an encoder delay of 1024 samples */
-#define FAAC_DELAY_SAMPLES 1024
-
-typedef struct FaacAudioContext {
-    faacEncHandle faac_handle;
-    AudioFrameQueue afq;
-} FaacAudioContext;
-
-
-static av_cold int Faac_encode_close(AVCodecContext *avctx)
-{
-    FaacAudioContext *s = avctx->priv_data;
-
-    av_freep(&avctx->extradata);
-    ff_af_queue_close(&s->afq);
-
-    if (s->faac_handle)
-        faacEncClose(s->faac_handle);
-
-    return 0;
-}
-
-static const int channel_maps[][6] = {
-    { 2, 0, 1 },          //< C L R
-    { 2, 0, 1, 3 },       //< C L R Cs
-    { 2, 0, 1, 3, 4 },    //< C L R Ls Rs
-    { 2, 0, 1, 4, 5, 3 }, //< C L R Ls Rs LFE
-};
-
-static av_cold int Faac_encode_init(AVCodecContext *avctx)
-{
-    FaacAudioContext *s = avctx->priv_data;
-    faacEncConfigurationPtr faac_cfg;
-    unsigned long samples_input, max_bytes_output;
-    int ret;
-
-    /* number of channels */
-    if (avctx->channels < 1 || avctx->channels > 6) {
-        av_log(avctx, AV_LOG_ERROR, "encoding %d channel(s) is not allowed\n", avctx->channels);
-        ret = AVERROR(EINVAL);
-        goto error;
-    }
-
-    s->faac_handle = faacEncOpen(avctx->sample_rate,
-                                 avctx->channels,
-                                 &samples_input, &max_bytes_output);
-    if (!s->faac_handle) {
-        av_log(avctx, AV_LOG_ERROR, "error in faacEncOpen()\n");
-        ret = AVERROR_UNKNOWN;
-        goto error;
-    }
-
-    /* check faac version */
-    faac_cfg = faacEncGetCurrentConfiguration(s->faac_handle);
-    if (faac_cfg->version != FAAC_CFG_VERSION) {
-        av_log(avctx, AV_LOG_ERROR, "wrong libfaac version (compiled for: %d, using %d)\n", FAAC_CFG_VERSION, faac_cfg->version);
-        ret = AVERROR(EINVAL);
-        goto error;
-    }
-
-    /* put the options in the configuration struct */
-    switch(avctx->profile) {
-        case FF_PROFILE_AAC_MAIN:
-            faac_cfg->aacObjectType = MAIN;
-            break;
-        case FF_PROFILE_UNKNOWN:
-        case FF_PROFILE_AAC_LOW:
-            faac_cfg->aacObjectType = LOW;
-            break;
-        case FF_PROFILE_AAC_SSR:
-            faac_cfg->aacObjectType = SSR;
-            break;
-        case FF_PROFILE_AAC_LTP:
-            faac_cfg->aacObjectType = LTP;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "invalid AAC profile\n");
-            ret = AVERROR(EINVAL);
-            goto error;
-    }
-    faac_cfg->mpegVersion = MPEG4;
-    faac_cfg->useTns = 0;
-    faac_cfg->allowMidside = 1;
-    faac_cfg->bitRate = avctx->bit_rate / avctx->channels;
-    faac_cfg->bandWidth = avctx->cutoff;
-    if(avctx->flags & AV_CODEC_FLAG_QSCALE) {
-        faac_cfg->bitRate = 0;
-        faac_cfg->quantqual = avctx->global_quality / FF_QP2LAMBDA;
-    }
-    faac_cfg->outputFormat = 1;
-    faac_cfg->inputFormat = FAAC_INPUT_16BIT;
-    if (avctx->channels > 2)
-        memcpy(faac_cfg->channel_map, channel_maps[avctx->channels-3],
-               avctx->channels * sizeof(int));
-
-    avctx->frame_size = samples_input / avctx->channels;
-
-    /* Set decoder specific info */
-    avctx->extradata_size = 0;
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
-
-        unsigned char *buffer = NULL;
-        unsigned long decoder_specific_info_size;
-
-        if (!faacEncGetDecoderSpecificInfo(s->faac_handle, &buffer,
-                                           &decoder_specific_info_size)) {
-            avctx->extradata = av_malloc(decoder_specific_info_size + AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!avctx->extradata) {
-                ret = AVERROR(ENOMEM);
-                goto error;
-            }
-            avctx->extradata_size = decoder_specific_info_size;
-            memcpy(avctx->extradata, buffer, avctx->extradata_size);
-            faac_cfg->outputFormat = 0;
-        }
-        free(buffer);
-    }
-
-    if (!faacEncSetConfiguration(s->faac_handle, faac_cfg)) {
-        av_log(avctx, AV_LOG_ERROR, "libfaac doesn't support this output format!\n");
-        ret = AVERROR(EINVAL);
-        goto error;
-    }
-
-    avctx->initial_padding = FAAC_DELAY_SAMPLES;
-    ff_af_queue_init(avctx, &s->afq);
-
-    return 0;
-error:
-    Faac_encode_close(avctx);
-    return ret;
-}
-
-static int Faac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
-                             const AVFrame *frame, int *got_packet_ptr)
-{
-    FaacAudioContext *s = avctx->priv_data;
-    int bytes_written, ret;
-    int num_samples  = frame ? frame->nb_samples : 0;
-    void *samples    = frame ? frame->data[0]    : NULL;
-
-    if ((ret = ff_alloc_packet(avpkt, (7 + 768) * avctx->channels))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
-        return ret;
-    }
-
-    bytes_written = faacEncEncode(s->faac_handle, samples,
-                                  num_samples * avctx->channels,
-                                  avpkt->data, avpkt->size);
-    if (bytes_written < 0) {
-        av_log(avctx, AV_LOG_ERROR, "faacEncEncode() error\n");
-        return bytes_written;
-    }
-
-    /* add current frame to the queue */
-    if (frame) {
-        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
-            return ret;
-    }
-
-    if (!bytes_written)
-        return 0;
-
-    /* Get the next frame pts/duration */
-    ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
-                       &avpkt->duration);
-
-    avpkt->size = bytes_written;
-    *got_packet_ptr = 1;
-    return 0;
-}
-
-static const AVProfile profiles[] = {
-    { FF_PROFILE_AAC_MAIN, "Main" },
-    { FF_PROFILE_AAC_LOW,  "LC"   },
-    { FF_PROFILE_AAC_SSR,  "SSR"  },
-    { FF_PROFILE_AAC_LTP,  "LTP"  },
-    { FF_PROFILE_UNKNOWN },
-};
-
-static const uint64_t faac_channel_layouts[] = {
-    AV_CH_LAYOUT_MONO,
-    AV_CH_LAYOUT_STEREO,
-    AV_CH_LAYOUT_SURROUND,
-    AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0_BACK,
-    AV_CH_LAYOUT_5POINT1_BACK,
-    0
-};
-
-AVCodec ff_libfaac_encoder = {
-    .name           = "libfaac",
-    .long_name      = NULL_IF_CONFIG_SMALL("libfaac AAC (Advanced Audio Coding)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_AAC,
-    .priv_data_size = sizeof(FaacAudioContext),
-    .init           = Faac_encode_init,
-    .encode2        = Faac_encode_frame,
-    .close          = Faac_encode_close,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
-    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                     AV_SAMPLE_FMT_NONE },
-    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
-    .channel_layouts = faac_channel_layouts,
-    .wrapper_name   = "libfaac",
-};
diff --git a/libavcodec/libfdk-aacdec.c b/libavcodec/libfdk-aacdec.c
index 6385623..1abe1d8 100644
--- a/libavcodec/libfdk-aacdec.c
+++ b/libavcodec/libfdk-aacdec.c
@@ -2,7 +2,7 @@
  * AAC decoder wrapper
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -50,7 +50,7 @@ typedef struct FDKAACDecContext {
     uint8_t *decoder_buffer;
     int decoder_buffer_size;
     uint8_t *anc_buffer;
-    enum ConcealMethod conceal_method;
+    int conceal_method;
     int drc_level;
     int drc_boost;
     int drc_heavy;
@@ -93,7 +93,7 @@ static const AVClass fdk_aac_dec_class = {
     .class_name = "libfdk-aac decoder",
     .item_name  = av_default_item_name,
     .option     = fdk_aac_dec_options,
-    .version    = LIBAVUTIL_VERSION_INT
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static int get_stream_info(AVCodecContext *avctx)
@@ -213,8 +213,8 @@ static av_cold int fdk_aac_decode_close(AVCodecContext *avctx)
 
     if (s->handle)
         aacDecoder_Close(s->handle);
-    av_free(s->decoder_buffer);
-    av_free(s->anc_buffer);
+    av_freep(&s->decoder_buffer);
+    av_freep(&s->anc_buffer);
 
     return 0;
 }
@@ -364,10 +364,9 @@ static int fdk_aac_decode_frame(AVCodecContext *avctx, void *data,
         goto end;
     frame->nb_samples = avctx->frame_size;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         goto end;
-    }
+
     memcpy(frame->extended_data[0], s->decoder_buffer,
            avctx->channels * avctx->frame_size *
            av_get_bytes_per_sample(avctx->sample_fmt));
diff --git a/libavcodec/libfdk-aacenc.c b/libavcodec/libfdk-aacenc.c
index 3b492ef..5620bb5 100644
--- a/libavcodec/libfdk-aacenc.c
+++ b/libavcodec/libfdk-aacenc.c
@@ -2,7 +2,7 @@
  * AAC encoder wrapper
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -251,7 +251,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
         }
         if ((err = aacEncoder_SetParam(s->handle, AACENC_BITRATE,
                                        avctx->bit_rate)) != AACENC_OK) {
-            av_log(avctx, AV_LOG_ERROR, "Unable to set the bitrate %d: %s\n",
+            av_log(avctx, AV_LOG_ERROR, "Unable to set the bitrate %"PRId64": %s\n",
                    avctx->bit_rate, aac_get_error(err));
             goto error;
         }
@@ -390,10 +390,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     in_buf.bufElSizes        = &in_buffer_element_size;
 
     /* The maximum packet size is 6144 bits aka 768 bytes per channel. */
-    if ((ret = ff_alloc_packet(avpkt, FFMAX(8192, 768 * avctx->channels)))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FFMAX(8192, 768 * avctx->channels), 0)) < 0)
         return ret;
-    }
 
     out_ptr                   = avpkt->data;
     out_buffer_size           = avpkt->size;
diff --git a/libavcodec/libgsmdec.c b/libavcodec/libgsmdec.c
index 419f364..89e1de0 100644
--- a/libavcodec/libgsmdec.c
+++ b/libavcodec/libgsmdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Alban Bedel <albeu@free.fr>
  * Copyright (c) 2006, 2007 Michel Bardiaux <mbardiaux@mediaxim.be>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,12 @@
 
 // The idiosyncrasies of GSM-in-WAV are explained at http://kbs.cs.tu-berlin.de/~jutta/toast.html
 
+#include "config.h"
+#if HAVE_GSM_H
 #include <gsm.h>
+#else
+#include <gsm/gsm.h>
+#endif
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/common.h"
@@ -45,7 +50,8 @@ static av_cold int libgsm_decode_init(AVCodecContext *avctx) {
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 8000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
 
     s->state = gsm_create();
@@ -91,10 +97,8 @@ static int libgsm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avctx->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     for (i = 0; i < avctx->frame_size / GSM_FRAME_SIZE; i++) {
@@ -119,6 +123,7 @@ static void libgsm_flush(AVCodecContext *avctx) {
         gsm_option(s->state, GSM_OPT_WAV49, &one);
 }
 
+#if CONFIG_LIBGSM_DECODER
 AVCodec ff_libgsm_decoder = {
     .name           = "libgsm",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM"),
@@ -132,7 +137,8 @@ AVCodec ff_libgsm_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
     .wrapper_name   = "libgsm",
 };
-
+#endif
+#if CONFIG_LIBGSM_MS_DECODER
 AVCodec ff_libgsm_ms_decoder = {
     .name           = "libgsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM Microsoft variant"),
@@ -146,3 +152,4 @@ AVCodec ff_libgsm_ms_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
     .wrapper_name   = "libgsm",
 };
+#endif
diff --git a/libavcodec/libgsmenc.c b/libavcodec/libgsmenc.c
index 1d039b1..fdb11c7 100644
--- a/libavcodec/libgsmenc.c
+++ b/libavcodec/libgsmenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Alban Bedel <albeu@free.fr>
  * Copyright (c) 2006, 2007 Michel Bardiaux <mbardiaux@mediaxim.be>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,12 @@
 
 // The idiosyncrasies of GSM-in-WAV are explained at http://kbs.cs.tu-berlin.de/~jutta/toast.html
 
+#include "config.h"
+#if HAVE_GSM_H
 #include <gsm.h>
+#else
+#include <gsm/gsm.h>
+#endif
 
 #include "libavutil/common.h"
 
@@ -35,6 +40,12 @@
 #include "internal.h"
 #include "gsm.h"
 
+static av_cold int libgsm_encode_close(AVCodecContext *avctx) {
+    gsm_destroy(avctx->priv_data);
+    avctx->priv_data = NULL;
+    return 0;
+}
+
 static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     if (avctx->channels > 1) {
         av_log(avctx, AV_LOG_ERROR, "Mono required for GSM, got %d channels\n",
@@ -51,13 +62,15 @@ static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     if (avctx->bit_rate != 13000 /* Official */ &&
         avctx->bit_rate != 13200 /* Very common */ &&
         avctx->bit_rate != 0 /* Unknown; a.o. mov does not set bitrate when decoding */ ) {
-        av_log(avctx, AV_LOG_ERROR, "Bitrate 13000bps required for GSM, got %dbps\n",
+        av_log(avctx, AV_LOG_ERROR, "Bitrate 13000bps required for GSM, got %"PRId64"bps\n",
                avctx->bit_rate);
         if (avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL)
             return -1;
     }
 
     avctx->priv_data = gsm_create();
+    if (!avctx->priv_data)
+        goto error;
 
     switch(avctx->codec_id) {
     case AV_CODEC_ID_GSM:
@@ -73,12 +86,9 @@ static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     }
 
     return 0;
-}
-
-static av_cold int libgsm_encode_close(AVCodecContext *avctx) {
-    gsm_destroy(avctx->priv_data);
-    avctx->priv_data = NULL;
-    return 0;
+error:
+    libgsm_encode_close(avctx);
+    return -1;
 }
 
 static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
@@ -88,10 +98,8 @@ static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     gsm_signal *samples = (gsm_signal *)frame->data[0];
     struct gsm_state *state = avctx->priv_data;
 
-    if ((ret = ff_alloc_packet(avpkt, avctx->block_align))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, avctx->block_align, 0)) < 0)
         return ret;
-    }
 
     switch(avctx->codec_id) {
     case AV_CODEC_ID_GSM:
@@ -106,7 +114,12 @@ static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     return 0;
 }
 
+static const AVCodecDefault libgsm_defaults[] = {
+    { "b",                "13000" },
+    { NULL },
+};
 
+#if CONFIG_LIBGSM_ENCODER
 AVCodec ff_libgsm_encoder = {
     .name           = "libgsm",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM"),
@@ -115,11 +128,14 @@ AVCodec ff_libgsm_encoder = {
     .init           = libgsm_encode_init,
     .encode2        = libgsm_encode_frame,
     .close          = libgsm_encode_close,
+    .defaults       = libgsm_defaults,
+    .channel_layouts= (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
     .wrapper_name   = "libgsm",
 };
-
+#endif
+#if CONFIG_LIBGSM_MS_ENCODER
 AVCodec ff_libgsm_ms_encoder = {
     .name           = "libgsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM Microsoft variant"),
@@ -128,7 +144,10 @@ AVCodec ff_libgsm_ms_encoder = {
     .init           = libgsm_encode_init,
     .encode2        = libgsm_encode_frame,
     .close          = libgsm_encode_close,
+    .defaults       = libgsm_defaults,
+    .channel_layouts= (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
     .wrapper_name   = "libgsm",
 };
+#endif
diff --git a/libavcodec/libilbc.c b/libavcodec/libilbc.c
index 4b16199..9a56cc8 100644
--- a/libavcodec/libilbc.c
+++ b/libavcodec/libilbc.c
@@ -2,20 +2,20 @@
  * iLBC decoder/encoder stub
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -93,10 +93,8 @@ static int ilbc_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = s->decoder.blockl;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     WebRtcIlbcfix_DecodeImpl((int16_t *) frame->data[0], (const uint16_t *) buf, &s->decoder, 1);
 
@@ -168,10 +166,8 @@ static int ilbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ILBCEncContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, 50))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 50, 0)) < 0)
         return ret;
-    }
 
     WebRtcIlbcfix_EncodeImpl((uint16_t *) avpkt->data, (const int16_t *) frame->data[0], &s->encoder);
 
diff --git a/libavcodec/libkvazaar.c b/libavcodec/libkvazaar.c
index fa64bf5..a89ca7f 100644
--- a/libavcodec/libkvazaar.c
+++ b/libavcodec/libkvazaar.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2015 Tampere University of Technology
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/dict.h"
 #include "libavutil/error.h"
 #include "libavutil/imgutils.h"
@@ -54,12 +55,6 @@ static av_cold int libkvazaar_init(AVCodecContext *avctx)
     kvz_config *cfg = NULL;
     kvz_encoder *enc = NULL;
 
-    if (avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Set -strict experimental to use this encoder.\n");
-        return AVERROR_EXPERIMENTAL;
-    }
-
     /* Kvazaar requires width and height to be multiples of eight. */
     if (avctx->width % 8 || avctx->height % 8) {
         av_log(avctx, AV_LOG_ERROR,
@@ -84,8 +79,23 @@ static av_cold int libkvazaar_init(AVCodecContext *avctx)
     cfg->width  = avctx->width;
     cfg->height = avctx->height;
 
-    cfg->framerate_num   = avctx->time_base.den;
-    cfg->framerate_denom = avctx->time_base.num * avctx->ticks_per_frame;
+    if (avctx->framerate.num > 0 && avctx->framerate.den > 0) {
+        if (avctx->ticks_per_frame > INT_MAX / avctx->framerate.den) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Could not set framerate for kvazaar: integer overflow\n");
+            return AVERROR(EINVAL);
+        }
+        cfg->framerate_num   = avctx->framerate.num;
+        cfg->framerate_denom = avctx->time_base.den * avctx->ticks_per_frame;
+    } else {
+        if (avctx->ticks_per_frame > INT_MAX / avctx->time_base.num) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Could not set framerate for kvazaar: integer overflow\n");
+            return AVERROR(EINVAL);
+        }
+        cfg->framerate_num   = avctx->time_base.den;
+        cfg->framerate_denom = avctx->time_base.num * avctx->ticks_per_frame;
+    }
     cfg->target_bitrate = avctx->bit_rate;
     cfg->vui.sar_width  = avctx->sample_aspect_ratio.num;
     cfg->vui.sar_height = avctx->sample_aspect_ratio.den;
@@ -143,8 +153,8 @@ static av_cold int libkvazaar_close(AVCodecContext *avctx)
     LibkvazaarContext *ctx = avctx->priv_data;
 
     if (ctx->api) {
-      ctx->api->encoder_close(ctx->encoder);
-      ctx->api->config_destroy(ctx->config);
+        ctx->api->encoder_close(ctx->encoder);
+        ctx->api->config_destroy(ctx->config);
     }
 
     if (avctx->extradata)
@@ -166,9 +176,11 @@ static int libkvazaar_encode(AVCodecContext *avctx,
     uint32_t len_out = 0;
     int retval = 0;
 
+    *got_packet_ptr = 0;
+
     if (frame) {
         if (frame->width != ctx->config->width ||
-                frame->height != ctx->config->height) {
+            frame->height != ctx->config->height) {
             av_log(avctx, AV_LOG_ERROR,
                    "Changing video dimensions during encoding is not supported. "
                    "(changed from %dx%d to %dx%d)\n",
@@ -205,7 +217,7 @@ static int libkvazaar_encode(AVCodecContext *avctx,
               0
             };
             av_image_copy(input_pic->data, dst_linesizes,
-                          frame->data, frame->linesize,
+                          (const uint8_t **)frame->data, frame->linesize,
                           frame->format, frame->width, frame->height);
         }
 
@@ -221,19 +233,21 @@ static int libkvazaar_encode(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Failed to encode frame.\n");
         retval = AVERROR_INVALIDDATA;
         goto done;
-    }
+    } else
+        retval = 0; /* kvazaar returns 1 on success */
 
     if (data_out) {
         kvz_data_chunk *chunk = NULL;
         uint64_t written = 0;
 
-        retval = ff_alloc_packet(avpkt, len_out);
+        retval = ff_alloc_packet2(avctx, avpkt, len_out, len_out);
         if (retval < 0) {
             av_log(avctx, AV_LOG_ERROR, "Failed to allocate output packet.\n");
             goto done;
         }
 
         for (chunk = data_out; chunk != NULL; chunk = chunk->next) {
+            av_assert0(written + chunk->len <= len_out);
             memcpy(avpkt->data + written, chunk->data, chunk->len);
             written += chunk->len;
         }
@@ -244,7 +258,7 @@ static int libkvazaar_encode(AVCodecContext *avctx,
         // IRAP VCL NAL unit types span the range
         // [BLA_W_LP (16), RSV_IRAP_VCL23 (23)].
         if (frame_info.nal_unit_type >= KVZ_NAL_BLA_W_LP &&
-                frame_info.nal_unit_type <= KVZ_NAL_RSV_IRAP_VCL23) {
+            frame_info.nal_unit_type <= KVZ_NAL_RSV_IRAP_VCL23) {
             avpkt->flags |= AV_PKT_FLAG_KEY;
         }
 
@@ -268,7 +282,6 @@ static const enum AVPixelFormat pix_fmts[] = {
 static const AVOption options[] = {
     { "kvazaar-params", "Set kvazaar parameters as a comma-separated list of key=value pairs.",
         OFFSET(kvz_params), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VE },
-
     { NULL },
 };
 
@@ -289,7 +302,7 @@ AVCodec ff_libkvazaar_encoder = {
     .long_name        = NULL_IF_CONFIG_SMALL("libkvazaar H.265 / HEVC"),
     .type             = AVMEDIA_TYPE_VIDEO,
     .id               = AV_CODEC_ID_HEVC,
-    .capabilities     = AV_CODEC_CAP_DELAY,
+    .capabilities     = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
     .pix_fmts         = pix_fmts,
 
     .priv_class       = &class,
diff --git a/libavcodec/libmp3lame.c b/libavcodec/libmp3lame.c
index 1fe26d1..ecdd2e3 100644
--- a/libavcodec/libmp3lame.c
+++ b/libavcodec/libmp3lame.c
@@ -2,20 +2,20 @@
  * Interface to libmp3lame for mp3 encoding
  * Copyright (c) 2002 Lennert Buytenhek <buytenh@gnu.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
 #include "mpegaudio.h"
 #include "mpegaudiodecheader.h"
 
-#define BUFFER_SIZE (7200 + 2 * MPA_FRAME_SIZE + MPA_FRAME_SIZE / 4)
+#define BUFFER_SIZE (7200 + 2 * MPA_FRAME_SIZE + MPA_FRAME_SIZE / 4+1000) // FIXME: Buffer size to small? Adding 1000 to make up for it.
 
 typedef struct LAMEContext {
     AVClass *class;
@@ -50,9 +50,10 @@ typedef struct LAMEContext {
     int reservoir;
     int joint_stereo;
     int abr;
+    int delay_sent;
     float *samples_flt[2];
     AudioFrameQueue afq;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 } LAMEContext;
 
 
@@ -79,6 +80,7 @@ static av_cold int mp3lame_encode_close(AVCodecContext *avctx)
     av_freep(&s->samples_flt[0]);
     av_freep(&s->samples_flt[1]);
     av_freep(&s->buffer);
+    av_freep(&s->fdsp);
 
     ff_af_queue_close(&s->afq);
 
@@ -97,6 +99,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (!(s->gfp = lame_init()))
         return AVERROR(ENOMEM);
 
+
     lame_set_num_channels(s->gfp, avctx->channels);
     lame_set_mode(s->gfp, avctx->channels > 1 ? s->joint_stereo ? JOINT_STEREO : STEREO : MONO);
 
@@ -105,9 +108,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     lame_set_out_samplerate(s->gfp, avctx->sample_rate);
 
     /* algorithmic quality */
-    if (avctx->compression_level == FF_COMPRESSION_DEFAULT)
-        lame_set_quality(s->gfp, 5);
-    else
+    if (avctx->compression_level != FF_COMPRESSION_DEFAULT)
         lame_set_quality(s->gfp, avctx->compression_level);
 
     /* rate control */
@@ -124,6 +125,10 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
         }
     }
 
+    /* lowpass cutoff frequency */
+    if (avctx->cutoff)
+        lame_set_lowpassfreq(s->gfp, avctx->cutoff);
+
     /* do not get a Xing VBR header frame from LAME */
     lame_set_bWriteVbrTag(s->gfp,0);
 
@@ -146,7 +151,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (avctx->sample_fmt == AV_SAMPLE_FMT_FLTP) {
         int ch;
         for (ch = 0; ch < avctx->channels; ch++) {
-            s->samples_flt[ch] = av_malloc(avctx->frame_size *
+            s->samples_flt[ch] = av_malloc_array(avctx->frame_size,
                                            sizeof(*s->samples_flt[ch]));
             if (!s->samples_flt[ch]) {
                 ret = AVERROR(ENOMEM);
@@ -159,7 +164,12 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (ret < 0)
         goto error;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
 
     return 0;
 error:
@@ -180,7 +190,7 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 {
     LAMEContext *s = avctx->priv_data;
     MPADecodeHeader hdr;
-    int len, ret, ch;
+    int len, ret, ch, discard_padding;
     int lame_result;
     uint32_t h;
 
@@ -198,7 +208,7 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                 return AVERROR(EINVAL);
             }
             for (ch = 0; ch < avctx->channels; ch++) {
-                s->fdsp.vector_fmul_scalar(s->samples_flt[ch],
+                s->fdsp->vector_fmul_scalar(s->samples_flt[ch],
                                            (const float *)frame->data[ch],
                                            32768.0f,
                                            FFALIGN(frame->nb_samples, 8));
@@ -208,6 +218,8 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         default:
             return AVERROR_BUG;
         }
+    } else if (!s->afq.frame_alloc) {
+        lame_result = 0;
     } else {
         lame_result = lame_encode_flush(s->gfp, s->buffer + s->buffer_index,
                                         s->buffer_size - s->buffer_index);
@@ -252,10 +264,8 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ff_dlog(avctx, "in:%d packet-len:%d index:%d\n", avctx->frame_size, len,
             s->buffer_index);
     if (len <= s->buffer_index) {
-        if ((ret = ff_alloc_packet(avpkt, len))) {
-            av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        if ((ret = ff_alloc_packet2(avctx, avpkt, len, 0)) < 0)
             return ret;
-        }
         memcpy(avpkt->data, s->buffer, len);
         s->buffer_index -= len;
         memmove(s->buffer, s->buffer + len, s->buffer_index);
@@ -264,6 +274,30 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
                            &avpkt->duration);
 
+        discard_padding = avctx->frame_size - avpkt->duration;
+        // Check if subtraction resulted in an overflow
+        if ((discard_padding < avctx->frame_size) != (avpkt->duration > 0)) {
+            av_log(avctx, AV_LOG_ERROR, "discard padding overflow\n");
+            av_packet_unref(avpkt);
+            av_free(avpkt);
+            return AVERROR(EINVAL);
+        }
+        if ((!s->delay_sent && avctx->initial_padding > 0) || discard_padding > 0) {
+            uint8_t* side_data = av_packet_new_side_data(avpkt,
+                                                         AV_PKT_DATA_SKIP_SAMPLES,
+                                                         10);
+            if(!side_data) {
+                av_packet_unref(avpkt);
+                av_free(avpkt);
+                return AVERROR(ENOMEM);
+            }
+            if (!s->delay_sent) {
+                AV_WL32(side_data, avctx->initial_padding);
+                s->delay_sent = 1;
+            }
+            AV_WL32(side_data + 4, discard_padding);
+        }
+
         avpkt->size = len;
         *got_packet_ptr = 1;
     }
@@ -273,9 +307,9 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 #define OFFSET(x) offsetof(LAMEContext, x)
 #define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "reservoir", "Use bit reservoir.", OFFSET(reservoir), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AE },
-    { "joint_stereo", "Use joint stereo.", OFFSET(joint_stereo), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AE },
-    { "abr", "Use ABR", OFFSET(abr), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AE },
+    { "reservoir",    "use bit reservoir", OFFSET(reservoir),    AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AE },
+    { "joint_stereo", "use joint stereo",  OFFSET(joint_stereo), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AE },
+    { "abr",          "use ABR",           OFFSET(abr),          AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AE },
     { NULL },
 };
 
diff --git a/libavcodec/libopencore-amr.c b/libavcodec/libopencore-amr.c
index 8ce5a71..516f625 100644
--- a/libavcodec/libopencore-amr.c
+++ b/libavcodec/libopencore-amr.c
@@ -2,20 +2,20 @@
  * AMR Audio decoder stub
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,8 @@ static int amr_decode_fix_avctx(AVCodecContext *avctx)
 {
     const int is_amr_wb = 1 + (avctx->codec_id == AV_CODEC_ID_AMR_WB);
 
-    avctx->sample_rate = 8000 * is_amr_wb;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000 * is_amr_wb;
 
     if (avctx->channels > 1) {
         avpriv_report_missing_feature(avctx, "multi-channel AMR");
@@ -105,10 +106,8 @@ static int amr_nb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 160;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     dec_mode    = (buf[0] >> 3) & 0x000F;
     packet_size = block_size[dec_mode] + 1;
@@ -183,7 +182,7 @@ static const AVOption options[] = {
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass amrnb_class = {
     .class_name = "libopencore_amrnb",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -194,7 +193,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx)
 {
     AMRContext *s = avctx->priv_data;
 
-    if (avctx->sample_rate != 8000) {
+    if (avctx->sample_rate != 8000 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
         av_log(avctx, AV_LOG_ERROR, "Only 8000Hz sample rate supported\n");
         return AVERROR(ENOSYS);
     }
@@ -242,14 +241,12 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         s->enc_bitrate = avctx->bit_rate;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, 32))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 32, 0)) < 0)
         return ret;
-    }
 
     if (frame) {
         if (frame->nb_samples < avctx->frame_size) {
-            flush_buf = av_mallocz(avctx->frame_size * sizeof(*flush_buf));
+            flush_buf = av_mallocz_array(avctx->frame_size, sizeof(*flush_buf));
             if (!flush_buf)
                 return AVERROR(ENOMEM);
             memcpy(flush_buf, samples, frame->nb_samples * sizeof(*flush_buf));
@@ -264,7 +261,7 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     } else {
         if (s->enc_last_frame < 0)
             return 0;
-        flush_buf = av_mallocz(avctx->frame_size * sizeof(*flush_buf));
+        flush_buf = av_mallocz_array(avctx->frame_size, sizeof(*flush_buf));
         if (!flush_buf)
             return AVERROR(ENOMEM);
         samples = flush_buf;
@@ -273,8 +270,8 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, samples,
                                        avpkt->data, 0);
-    ff_dlog(avctx, "amr_nb_encode_frame encoded %d bytes, bitrate %d, first byte was %#02"PRIx8"\n",
-            written, s->enc_mode, *frame->data[0]);
+    ff_dlog(avctx, "amr_nb_encode_frame encoded %u bytes, bitrate %u, first byte was %#02x\n",
+            written, s->enc_mode, avpkt->data[0]);
 
     /* Get the next frame pts/duration */
     ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
@@ -298,7 +295,7 @@ AVCodec ff_libopencore_amrnb_encoder = {
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &amrnb_class,
 };
 #endif /* CONFIG_LIBOPENCORE_AMRNB_ENCODER */
 
@@ -340,10 +337,8 @@ static int amr_wb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 320;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     mode        = (buf[0] >> 3) & 0x000F;
     packet_size = block_size[mode];
@@ -353,6 +348,10 @@ static int amr_wb_decode_frame(AVCodecContext *avctx, void *data,
                buf_size, packet_size + 1);
         return AVERROR_INVALIDDATA;
     }
+    if (!packet_size) {
+        av_log(avctx, AV_LOG_ERROR, "amr packet_size invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     D_IF_decode(s->state, buf, (short *)frame->data[0], _good_frame);
 
diff --git a/libavcodec/libopenh264.c b/libavcodec/libopenh264.c
index 6252cfd..59c61a3 100644
--- a/libavcodec/libopenh264.c
+++ b/libavcodec/libopenh264.c
@@ -2,20 +2,20 @@
  * OpenH264 shared utils
  * Copyright (C) 2014 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,8 @@
 
 #include "libopenh264.h"
 
-// Convert libopenh264 log level to equivalent libav log level.
-static int libopenh264_to_libav_log_level(int libopenh264_log_level)
+// Convert libopenh264 log level to equivalent ffmpeg log level.
+static int libopenh264_to_ffmpeg_log_level(int libopenh264_log_level)
 {
     if      (libopenh264_log_level >= WELS_LOG_DETAIL)  return AV_LOG_TRACE;
     else if (libopenh264_log_level >= WELS_LOG_DEBUG)   return AV_LOG_DEBUG;
@@ -40,10 +40,10 @@ static int libopenh264_to_libav_log_level(int libopenh264_log_level)
 
 void ff_libopenh264_trace_callback(void *ctx, int level, const char *msg)
 {
-    // The message will be logged only if the requested EQUIVALENT libav log level is
-    // less than or equal to the current libav log level.
-    int equiv_libav_log_level = libopenh264_to_libav_log_level(level);
-    av_log(ctx, equiv_libav_log_level, "%s\n", msg);
+    // The message will be logged only if the requested EQUIVALENT ffmpeg log level is
+    // less than or equal to the current ffmpeg log level.
+    int equiv_ffmpeg_log_level = libopenh264_to_ffmpeg_log_level(level);
+    av_log(ctx, equiv_ffmpeg_log_level, "%s\n", msg);
 }
 
 int ff_libopenh264_check_version(void *logctx)
diff --git a/libavcodec/libopenh264.h b/libavcodec/libopenh264.h
index 7c69481..dbb9c5d 100644
--- a/libavcodec/libopenh264.h
+++ b/libavcodec/libopenh264.h
@@ -2,20 +2,20 @@
  * OpenH264 shared utils
  * Copyright (C) 2014 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopenh264dec.c b/libavcodec/libopenh264dec.c
index 6adf984..c7aa7fa 100644
--- a/libavcodec/libopenh264dec.c
+++ b/libavcodec/libopenh264dec.c
@@ -2,20 +2,20 @@
  * OpenH264 video decoder
  * Copyright (C) 2016 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -92,7 +92,7 @@ static int svc_decode_frame(AVCodecContext *avctx, void *data,
     SVCContext *s = avctx->priv_data;
     SBufferInfo info = { 0 };
     uint8_t* ptrs[3];
-    int linesize[3];
+    int ret, linesize[3];
     AVFrame *avframe = data;
     DECODING_STATE state;
 #if OPENH264_VER_AT_LEAST(1, 7)
@@ -128,7 +128,9 @@ static int svc_decode_frame(AVCodecContext *avctx, void *data,
         return avpkt->size;
     }
 
-    ff_set_dimensions(avctx, info.UsrData.sSystemBuffer.iWidth, info.UsrData.sSystemBuffer.iHeight);
+    ret = ff_set_dimensions(avctx, info.UsrData.sSystemBuffer.iWidth, info.UsrData.sSystemBuffer.iHeight);
+    if (ret < 0)
+        return ret;
     // The decoder doesn't (currently) support decoding into a user
     // provided buffer, so do a copy instead.
     if (ff_get_buffer(avctx, avframe, 0) < 0) {
diff --git a/libavcodec/libopenh264enc.c b/libavcodec/libopenh264enc.c
index d075cb0..ae6d17c 100644
--- a/libavcodec/libopenh264enc.c
+++ b/libavcodec/libopenh264enc.c
@@ -2,20 +2,20 @@
  * OpenH264 video encoder
  * Copyright (C) 2014 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,23 +53,23 @@ typedef struct SVCContext {
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
 #if OPENH264_VER_AT_LEAST(1, 6)
-    { "slice_mode", "Slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_FIXEDSLCNUM_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
+    { "slice_mode", "set slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_FIXEDSLCNUM_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
 #else
-    { "slice_mode", "Slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_AUTO_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
+    { "slice_mode", "set slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_AUTO_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
 #endif
-    { "fixed", "A fixed number of slices", 0, AV_OPT_TYPE_CONST, { .i64 = SM_FIXEDSLCNUM_SLICE }, 0, 0, VE, "slice_mode" },
+        { "fixed", "a fixed number of slices", 0, AV_OPT_TYPE_CONST, { .i64 = SM_FIXEDSLCNUM_SLICE }, 0, 0, VE, "slice_mode" },
 #if OPENH264_VER_AT_LEAST(1, 6)
-    { "dyn", "Size limited (compatibility name)", 0, AV_OPT_TYPE_CONST, { .i64 = SM_SIZELIMITED_SLICE }, 0, 0, VE, "slice_mode" },
-    { "sizelimited", "Size limited", 0, AV_OPT_TYPE_CONST, { .i64 = SM_SIZELIMITED_SLICE }, 0, 0, VE, "slice_mode" },
+        { "dyn", "Size limited (compatibility name)", 0, AV_OPT_TYPE_CONST, { .i64 = SM_SIZELIMITED_SLICE }, 0, 0, VE, "slice_mode" },
+        { "sizelimited", "Size limited", 0, AV_OPT_TYPE_CONST, { .i64 = SM_SIZELIMITED_SLICE }, 0, 0, VE, "slice_mode" },
 #else
-    { "rowmb", "One slice per row of macroblocks", 0, AV_OPT_TYPE_CONST, { .i64 = SM_ROWMB_SLICE }, 0, 0, VE, "slice_mode" },
-    { "auto", "Automatic number of slices according to number of threads", 0, AV_OPT_TYPE_CONST, { .i64 = SM_AUTO_SLICE }, 0, 0, VE, "slice_mode" },
-    { "dyn", "Dynamic slicing", 0, AV_OPT_TYPE_CONST, { .i64 = SM_DYN_SLICE }, 0, 0, VE, "slice_mode" },
+        { "rowmb", "one slice per row of macroblocks", 0, AV_OPT_TYPE_CONST, { .i64 = SM_ROWMB_SLICE }, 0, 0, VE, "slice_mode" },
+        { "auto", "automatic number of slices according to number of threads", 0, AV_OPT_TYPE_CONST, { .i64 = SM_AUTO_SLICE }, 0, 0, VE, "slice_mode" },
+        { "dyn", "Dynamic slicing", 0, AV_OPT_TYPE_CONST, { .i64 = SM_DYN_SLICE }, 0, 0, VE, "slice_mode" },
 #endif
-    { "loopfilter", "Enable loop filter", OFFSET(loopfilter), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
-    { "profile", "Set profile restrictions", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VE },
-    { "max_nal_size", "Set maximum NAL size in bytes", OFFSET(max_nal_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
-    { "allow_skip_frames", "Allow skipping frames to hit the target bitrate", OFFSET(skip_frames), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "loopfilter", "enable loop filter", OFFSET(loopfilter), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+    { "profile", "set profile restrictions", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VE },
+    { "max_nal_size", "set maximum NAL size in bytes", OFFSET(max_nal_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "allow_skip_frames", "allow skipping frames to hit the target bitrate", OFFSET(skip_frames), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "cabac", "Enable cabac", OFFSET(cabac), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { NULL }
 };
@@ -129,7 +129,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    param.fMaxFrameRate              = avctx->time_base.den / avctx->time_base.num;
+    param.fMaxFrameRate              = 1/av_q2d(avctx->time_base);
     param.iPicWidth                  = avctx->width;
     param.iPicHeight                 = avctx->height;
     param.iTargetBitrate             = avctx->bit_rate;
@@ -164,6 +164,47 @@ FF_ENABLE_DEPRECATION_WARNINGS
     param.sSpatialLayers[0].iSpatialBitrate     = param.iTargetBitrate;
     param.sSpatialLayers[0].iMaxSpatialBitrate  = param.iMaxBitrate;
 
+#if OPENH264_VER_AT_LEAST(1, 7)
+    if (avctx->sample_aspect_ratio.num && avctx->sample_aspect_ratio.den) {
+        // Table E-1.
+        static const AVRational sar_idc[] = {
+            {   0,  0 }, // Unspecified (never written here).
+            {   1,  1 }, {  12, 11 }, {  10, 11 }, {  16, 11 },
+            {  40, 33 }, {  24, 11 }, {  20, 11 }, {  32, 11 },
+            {  80, 33 }, {  18, 11 }, {  15, 11 }, {  64, 33 },
+            { 160, 99 }, // Last 3 are unknown to openh264: {   4,  3 }, {   3,  2 }, {   2,  1 },
+        };
+        static const ESampleAspectRatio asp_idc[] = {
+            ASP_UNSPECIFIED,
+            ASP_1x1,      ASP_12x11,   ASP_10x11,   ASP_16x11,
+            ASP_40x33,    ASP_24x11,   ASP_20x11,   ASP_32x11,
+            ASP_80x33,    ASP_18x11,   ASP_15x11,   ASP_64x33,
+            ASP_160x99,
+        };
+        int num, den, i;
+
+        av_reduce(&num, &den, avctx->sample_aspect_ratio.num,
+                  avctx->sample_aspect_ratio.den, 65535);
+
+        for (i = 1; i < FF_ARRAY_ELEMS(sar_idc); i++) {
+            if (num == sar_idc[i].num &&
+                den == sar_idc[i].den)
+                break;
+        }
+        if (i == FF_ARRAY_ELEMS(sar_idc)) {
+            param.sSpatialLayers[0].eAspectRatio = ASP_EXT_SAR;
+            param.sSpatialLayers[0].sAspectRatioExtWidth = num;
+            param.sSpatialLayers[0].sAspectRatioExtHeight = den;
+        } else {
+            param.sSpatialLayers[0].eAspectRatio = asp_idc[i];
+        }
+        param.sSpatialLayers[0].bAspectRatioPresent = true;
+    }
+    else {
+        param.sSpatialLayers[0].bAspectRatioPresent = false;
+    }
+#endif
+
     if ((avctx->slices > 1) && (s->max_nal_size)) {
         av_log(avctx, AV_LOG_ERROR,
                "Invalid combination -slices %d and -max_nal_size %d.\n",
@@ -246,6 +287,10 @@ static int svc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     sp.iPicWidth  = avctx->width;
     sp.iPicHeight = avctx->height;
 
+    if (frame->pict_type == AV_PICTURE_TYPE_I) {
+        (*s->encoder)->ForceIntraFrame(s->encoder, true);
+    }
+
     encoded = (*s->encoder)->EncodeFrame(s->encoder, &sp, &fbi);
     if (encoded != cmResultSuccess) {
         av_log(avctx, AV_LOG_ERROR, "EncodeFrame failed\n");
@@ -271,7 +316,7 @@ static int svc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
     av_log(avctx, AV_LOG_DEBUG, "%d slices\n", fbi.sLayerInfo[fbi.iLayerNum - 1].iNalCount);
 
-    if ((ret = ff_alloc_packet(avpkt, size))) {
+    if ((ret = ff_alloc_packet2(avctx, avpkt, size, size))) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
         return ret;
     }
diff --git a/libavcodec/libopenjpegdec.c b/libavcodec/libopenjpegdec.c
index 798b33f..344c5ba 100644
--- a/libavcodec/libopenjpegdec.c
+++ b/libavcodec/libopenjpegdec.c
@@ -2,20 +2,20 @@
  * JPEG 2000 decoding support via OpenJPEG
  * Copyright (c) 2009 Jaikrishnan Menon <realityman@gmx.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,6 @@
  * JPEG 2000 decoder using libopenjpeg
  */
 
-#define  OPJ_STATIC
-#include <openjpeg.h>
-
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
@@ -37,6 +34,8 @@
 #include "internal.h"
 #include "thread.h"
 
+#include <openjpeg.h>
+
 #define JP2_SIG_TYPE    0x6A502020
 #define JP2_SIG_VALUE   0x0D0A870A
 
@@ -46,72 +45,145 @@
                            AV_PIX_FMT_RGB48, AV_PIX_FMT_RGBA64
 
 #define GRAY_PIXEL_FORMATS AV_PIX_FMT_GRAY8, AV_PIX_FMT_YA8,                  \
-                           AV_PIX_FMT_GRAY16
-
-#define YUV_PIXEL_FORMATS  AV_PIX_FMT_YUV410P,   AV_PIX_FMT_YUV411P,          \
-                           AV_PIX_FMT_YUVA420P,                               \
-                           AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,          \
-                           AV_PIX_FMT_YUV440P,   AV_PIX_FMT_YUV444P,          \
-                           AV_PIX_FMT_YUV420P9,  AV_PIX_FMT_YUV422P9,         \
-                           AV_PIX_FMT_YUV444P9,                               \
-                           AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10,        \
-                           AV_PIX_FMT_YUV444P10,                              \
-                           AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16,        \
-                           AV_PIX_FMT_YUV444P16
+                           AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, AV_PIX_FMT_GRAY14, \
+                           AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA16
+
+#define YUV_PIXEL_FORMATS  AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUVA420P, \
+                           AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA422P, \
+                           AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA444P, \
+                           AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9, \
+                           AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA444P9, \
+                           AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10, \
+                           AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA444P10, \
+                           AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12, \
+                           AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14, \
+                           AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, \
+                           AV_PIX_FMT_YUVA420P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA444P16
 
 #define XYZ_PIXEL_FORMATS  AV_PIX_FMT_XYZ12
 
-static const enum AVPixelFormat rgb_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_rgb_pix_fmts[]  = {
     RGB_PIXEL_FORMATS
 };
-static const enum AVPixelFormat gray_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_gray_pix_fmts[] = {
     GRAY_PIXEL_FORMATS
 };
-static const enum AVPixelFormat yuv_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_yuv_pix_fmts[]  = {
     YUV_PIXEL_FORMATS
 };
-static const enum AVPixelFormat any_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_all_pix_fmts[]  = {
     RGB_PIXEL_FORMATS, GRAY_PIXEL_FORMATS, YUV_PIXEL_FORMATS, XYZ_PIXEL_FORMATS
 };
 
 typedef struct LibOpenJPEGContext {
     AVClass *class;
     opj_dparameters_t dec_params;
-    int lowres;
     int lowqual;
 } LibOpenJPEGContext;
 
-static int libopenjpeg_matches_pix_fmt(const opj_image_t *img,
-                                       enum AVPixelFormat pix_fmt)
+static void error_callback(const char *msg, void *data)
+{
+    av_log(data, AV_LOG_ERROR, "%s", msg);
+}
+
+static void warning_callback(const char *msg, void *data)
+{
+    av_log(data, AV_LOG_WARNING, "%s", msg);
+}
+
+static void info_callback(const char *msg, void *data)
+{
+    av_log(data, AV_LOG_DEBUG, "%s", msg);
+}
+
+typedef struct BufferReader {
+    int pos;
+    int size;
+    const uint8_t *buffer;
+} BufferReader;
+
+static OPJ_SIZE_T stream_read(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data)
+{
+    BufferReader *reader = user_data;
+    int remaining;
+
+    if (reader->pos == reader->size) {
+        return (OPJ_SIZE_T)-1;
+    }
+    remaining = reader->size - reader->pos;
+    if (nb_bytes > remaining) {
+        nb_bytes = remaining;
+    }
+    memcpy(out_buffer, reader->buffer + reader->pos, nb_bytes);
+    reader->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    BufferReader *reader = user_data;
+    if (nb_bytes < 0) {
+        if (reader->pos == 0) {
+            return (OPJ_SIZE_T)-1;
+        }
+        if (nb_bytes + reader->pos < 0) {
+            nb_bytes = -reader->pos;
+        }
+    } else {
+        int remaining;
+
+        if (reader->pos == reader->size) {
+            return (OPJ_SIZE_T)-1;
+        }
+        remaining = reader->size - reader->pos;
+        if (nb_bytes > remaining) {
+            nb_bytes = remaining;
+        }
+    }
+    reader->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    BufferReader *reader = user_data;
+    if (nb_bytes < 0 || nb_bytes > reader->size) {
+        return OPJ_FALSE;
+    }
+    reader->pos = (int)nb_bytes;
+    return OPJ_TRUE;
+}
+
+static inline int libopenjpeg_matches_pix_fmt(const opj_image_t *image, enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     int match = 1;
 
-    if (desc->nb_components != img->numcomps) {
+    if (desc->nb_components != image->numcomps) {
         return 0;
     }
 
     switch (desc->nb_components) {
     case 4:
         match = match &&
-                desc->comp[3].depth >= img->comps[3].prec &&
-                1 == img->comps[3].dx &&
-                1 == img->comps[3].dy;
+                desc->comp[3].depth >= image->comps[3].prec &&
+                1 == image->comps[3].dx &&
+                1 == image->comps[3].dy;
     case 3:
         match = match &&
-                desc->comp[2].depth >= img->comps[2].prec &&
-                1 << desc->log2_chroma_w == img->comps[2].dx &&
-                1 << desc->log2_chroma_h == img->comps[2].dy;
+                desc->comp[2].depth >= image->comps[2].prec &&
+                1 << desc->log2_chroma_w == image->comps[2].dx &&
+                1 << desc->log2_chroma_h == image->comps[2].dy;
     case 2:
         match = match &&
-                desc->comp[1].depth >= img->comps[1].prec &&
-                1 << desc->log2_chroma_w == img->comps[1].dx &&
-                1 << desc->log2_chroma_h == img->comps[1].dy;
+                desc->comp[1].depth >= image->comps[1].prec &&
+                1 << desc->log2_chroma_w == image->comps[1].dx &&
+                1 << desc->log2_chroma_h == image->comps[1].dy;
     case 1:
         match = match &&
-                desc->comp[0].depth >= img->comps[0].prec &&
-                1 == img->comps[0].dx &&
-                1 == img->comps[0].dy;
+                desc->comp[0].depth >= image->comps[0].prec &&
+                1 == image->comps[0].dx &&
+                1 == image->comps[0].dy;
     default:
         break;
     }
@@ -119,28 +191,27 @@ static int libopenjpeg_matches_pix_fmt(const opj_image_t *img,
     return match;
 }
 
-static enum AVPixelFormat libopenjpeg_guess_pix_fmt(const opj_image_t *image)
-{
+static inline enum AVPixelFormat libopenjpeg_guess_pix_fmt(const opj_image_t *image) {
     int index;
     const enum AVPixelFormat *possible_fmts = NULL;
     int possible_fmts_nb = 0;
 
     switch (image->color_space) {
-    case CLRSPC_SRGB:
-        possible_fmts    = rgb_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(rgb_pix_fmts);
+    case OPJ_CLRSPC_SRGB:
+        possible_fmts    = libopenjpeg_rgb_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_rgb_pix_fmts);
         break;
-    case CLRSPC_GRAY:
-        possible_fmts    = gray_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(gray_pix_fmts);
+    case OPJ_CLRSPC_GRAY:
+        possible_fmts    = libopenjpeg_gray_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_gray_pix_fmts);
         break;
-    case CLRSPC_SYCC:
-        possible_fmts    = yuv_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(yuv_pix_fmts);
+    case OPJ_CLRSPC_SYCC:
+        possible_fmts    = libopenjpeg_yuv_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_yuv_pix_fmts);
         break;
     default:
-        possible_fmts    = any_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(any_pix_fmts);
+        possible_fmts    = libopenjpeg_all_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_all_pix_fmts);
         break;
     }
 
@@ -167,40 +238,37 @@ static inline int libopenjpeg_ispacked(enum AVPixelFormat pix_fmt)
     return 1;
 }
 
-static void libopenjpeg_copy_to_packed8(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copy_to_packed8(AVFrame *picture, opj_image_t *image) {
     uint8_t *img_ptr;
     int index, x, y, c;
-
     for (y = 0; y < picture->height; y++) {
         index   = y * picture->width;
         img_ptr = picture->data[0] + y * picture->linesize[0];
         for (x = 0; x < picture->width; x++, index++)
             for (c = 0; c < image->numcomps; c++)
-                *img_ptr++ = image->comps[c].data[index];
+                *img_ptr++ = 0x80 * image->comps[c].sgnd + image->comps[c].data[index];
     }
 }
 
-static void libopenjpeg_copy_to_packed16(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copy_to_packed16(AVFrame *picture, opj_image_t *image) {
     uint16_t *img_ptr;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(picture->format);
     int index, x, y, c;
     int adjust[4];
-
     for (x = 0; x < image->numcomps; x++)
-        adjust[x] = FFMAX(FFMIN(16 - image->comps[x].prec, 8), 0);
+        adjust[x] = FFMAX(FFMIN(desc->comp[x].depth - image->comps[x].prec, 8), 0) + desc->comp[x].shift;
 
     for (y = 0; y < picture->height; y++) {
         index   = y * picture->width;
         img_ptr = (uint16_t *) (picture->data[0] + y * picture->linesize[0]);
         for (x = 0; x < picture->width; x++, index++)
             for (c = 0; c < image->numcomps; c++)
-                *img_ptr++ = image->comps[c].data[index] << adjust[c];
+                *img_ptr++ = (1 << image->comps[c].prec - 1) * image->comps[c].sgnd +
+                             (unsigned)image->comps[c].data[index] << adjust[c];
     }
 }
 
-static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image) {
     int *comp_data;
     uint8_t *img_ptr;
     int index, x, y;
@@ -210,7 +278,7 @@ static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
         for (y = 0; y < image->comps[index].h; y++) {
             img_ptr = picture->data[index] + y * picture->linesize[index];
             for (x = 0; x < image->comps[index].w; x++) {
-                *img_ptr = (uint8_t) *comp_data;
+                *img_ptr = 0x80 * image->comps[index].sgnd + *comp_data;
                 img_ptr++;
                 comp_data++;
             }
@@ -218,18 +286,22 @@ static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
     }
 }
 
-static void libopenjpeg_copyto16(AVFrame *p, opj_image_t *image)
-{
+static inline void libopenjpeg_copyto16(AVFrame *picture, opj_image_t *image) {
     int *comp_data;
     uint16_t *img_ptr;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(picture->format);
     int index, x, y;
+    int adjust[4];
+    for (x = 0; x < image->numcomps; x++)
+        adjust[x] = FFMAX(FFMIN(desc->comp[x].depth - image->comps[x].prec, 8), 0) + desc->comp[x].shift;
 
     for (index = 0; index < image->numcomps; index++) {
         comp_data = image->comps[index].data;
         for (y = 0; y < image->comps[index].h; y++) {
-            img_ptr = (uint16_t *)(p->data[index] + y * p->linesize[index]);
+            img_ptr = (uint16_t *)(picture->data[index] + y * picture->linesize[index]);
             for (x = 0; x < image->comps[index].w; x++) {
-                *img_ptr = *comp_data;
+                *img_ptr = (1 << image->comps[index].prec - 1) * image->comps[index].sgnd +
+                           (unsigned)*comp_data << adjust[index];
                 img_ptr++;
                 comp_data++;
             }
@@ -255,13 +327,14 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     ThreadFrame frame       = { .f = data };
     AVFrame *picture        = data;
     const AVPixFmtDescriptor *desc;
-    opj_dinfo_t *dec;
-    opj_cio_t *stream;
-    opj_image_t *image;
     int width, height, ret;
     int pixel_size = 0;
     int ispacked   = 0;
     int i;
+    opj_image_t *image = NULL;
+    BufferReader reader = {0, avpkt->size, avpkt->data};
+    opj_codec_t *dec = NULL;
+    opj_stream_t *stream = NULL;
 
     *got_frame = 0;
 
@@ -269,53 +342,61 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     if ((AV_RB32(buf) == 12) &&
         (AV_RB32(buf + 4) == JP2_SIG_TYPE) &&
         (AV_RB32(buf + 8) == JP2_SIG_VALUE)) {
-        dec = opj_create_decompress(CODEC_JP2);
+        dec = opj_create_decompress(OPJ_CODEC_JP2);
     } else {
         /* If the AVPacket contains a jp2c box, then skip to
          * the starting byte of the codestream. */
         if (AV_RB32(buf + 4) == AV_RB32("jp2c"))
             buf += 8;
-        dec = opj_create_decompress(CODEC_J2K);
+        dec = opj_create_decompress(OPJ_CODEC_J2K);
     }
 
     if (!dec) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing decoder.\n");
-        return AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
-    opj_set_event_mgr((opj_common_ptr) dec, NULL, NULL);
 
-    ctx->dec_params.cp_limit_decoding = LIMIT_TO_MAIN_HEADER;
-    ctx->dec_params.cp_reduce         = ctx->lowres;
-    ctx->dec_params.cp_layer          = ctx->lowqual;
+    if (!opj_set_error_handler(dec, error_callback, avctx) ||
+        !opj_set_warning_handler(dec, warning_callback, avctx) ||
+        !opj_set_info_handler(dec, info_callback, avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting decoder handlers.\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    ctx->dec_params.cp_layer = ctx->lowqual;
+    ctx->dec_params.cp_reduce = avctx->lowres;
+
     // Tie decoder with decoding parameters
     opj_setup_decoder(dec, &ctx->dec_params);
-    stream = opj_cio_open((opj_common_ptr) dec, buf, buf_size);
+
+    stream = opj_stream_default_create(OPJ_STREAM_READ);
 
     if (!stream) {
         av_log(avctx, AV_LOG_ERROR,
                "Codestream could not be opened for reading.\n");
-        opj_destroy_decompress(dec);
-        return AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
 
+    opj_stream_set_read_function(stream, stream_read);
+    opj_stream_set_skip_function(stream, stream_skip);
+    opj_stream_set_seek_function(stream, stream_seek);
+    opj_stream_set_user_data(stream, &reader, NULL);
+    opj_stream_set_user_data_length(stream, avpkt->size);
     // Decode the header only.
-    image = opj_decode_with_info(dec, stream, NULL);
-    opj_cio_close(stream);
+    ret = !opj_read_header(stream, dec, &image);
 
-    if (!image) {
-        av_log(avctx, AV_LOG_ERROR, "Error decoding codestream.\n");
-        opj_destroy_decompress(dec);
-        return AVERROR_UNKNOWN;
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "Error decoding codestream header.\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
 
     width  = image->x1 - image->x0;
     height = image->y1 - image->y0;
 
-    if (ctx->lowres) {
-        width  = (width + (1 << ctx->lowres) - 1) >> ctx->lowres;
-        height = (height + (1 << ctx->lowres) - 1) >> ctx->lowres;
-    }
-
     ret = ff_set_dimensions(avctx, width, height);
     if (ret < 0)
         goto done;
@@ -328,42 +409,34 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
         avctx->pix_fmt = libopenjpeg_guess_pix_fmt(image);
 
     if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to determine pixel format\n");
-        ret = AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_ERROR, "Unable to determine pixel format.\n");
+        ret = AVERROR_UNKNOWN;
         goto done;
     }
-
     for (i = 0; i < image->numcomps; i++)
         if (image->comps[i].prec > avctx->bits_per_raw_sample)
             avctx->bits_per_raw_sample = image->comps[i].prec;
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         goto done;
-    }
 
-    ctx->dec_params.cp_limit_decoding = NO_LIMITATION;
-    // Tie decoder with decoding parameters.
-    opj_setup_decoder(dec, &ctx->dec_params);
-    stream = opj_cio_open((opj_common_ptr) dec, buf, buf_size);
-    if (!stream) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Codestream could not be opened for reading.\n");
-        ret = AVERROR_UNKNOWN;
-        goto done;
-    }
-
-    opj_image_destroy(image);
-    // Decode the codestream
-    image = opj_decode_with_info(dec, stream, NULL);
-    opj_cio_close(stream);
+    ret = !opj_decode(dec, stream, image);
 
-    if (!image) {
+    if (ret) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding codestream.\n");
-        ret = AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
         goto done;
     }
 
+    for (i = 0; i < image->numcomps; i++) {
+        if (!image->comps[i].data) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Image component %d contains no data.\n", i);
+            ret = AVERROR_INVALIDDATA;
+            goto done;
+        }
+    }
+
     desc       = av_pix_fmt_desc_get(avctx->pix_fmt);
     pixel_size = desc->comp[0].step;
     ispacked   = libopenjpeg_ispacked(avctx->pix_fmt);
@@ -402,11 +475,14 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     }
 
     *got_frame = 1;
+    picture->pict_type = AV_PICTURE_TYPE_I;
+    picture->key_frame = 1;
     ret        = buf_size;
 
 done:
     opj_image_destroy(image);
-    opj_destroy_decompress(dec);
+    opj_stream_destroy(stream);
+    opj_destroy_codec(dec);
     return ret;
 }
 
@@ -416,12 +492,10 @@ done:
 static const AVOption options[] = {
     { "lowqual", "Limit the number of layers used for decoding",
         OFFSET(lowqual), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VD },
-    { "lowres",  "Lower the decoding resolution by a power of two",
-        OFFSET(lowres),  AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VD },
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass openjpeg_class = {
     .class_name = "libopenjpeg",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -437,6 +511,7 @@ AVCodec ff_libopenjpeg_decoder = {
     .init           = libopenjpeg_decode_init,
     .decode         = libopenjpeg_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
-    .priv_class     = &class,
+    .max_lowres     = 31,
+    .priv_class     = &openjpeg_class,
     .wrapper_name   = "libopenjpeg",
 };
diff --git a/libavcodec/libopenjpegenc.c b/libavcodec/libopenjpegenc.c
index 953d733..1998008 100644
--- a/libavcodec/libopenjpegenc.c
+++ b/libavcodec/libopenjpegenc.c
@@ -1,21 +1,21 @@
 /*
  * JPEG 2000 encoding support via OpenJPEG
- * Copyright (c) 2011 Michael Bradshaw <mbradshaw@sorensonmedia.com>
+ * Copyright (c) 2011 Michael Bradshaw <mjbshaw gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,30 +24,25 @@
  * JPEG 2000 encoder using libopenjpeg
  */
 
-#define  OPJ_STATIC
-#include <openjpeg.h>
-
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
+#include <openjpeg.h>
 
 typedef struct LibOpenJPEGContext {
     AVClass *avclass;
-    opj_image_t *image;
     opj_cparameters_t enc_params;
-    opj_cinfo_t *compress;
-    opj_event_mgr_t event_mgr;
     int format;
     int profile;
     int prog_order;
     int cinema_mode;
     int numresolution;
-    int numlayers;
+    int irreversible;
     int disto_alloc;
-    int fixed_alloc;
     int fixed_quality;
 } LibOpenJPEGContext;
 
@@ -66,38 +61,153 @@ static void info_callback(const char *msg, void *data)
     av_log(data, AV_LOG_DEBUG, "%s\n", msg);
 }
 
-static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
-                                             opj_cparameters_t *parameters)
+typedef struct PacketWriter {
+    int pos;
+    AVPacket *packet;
+} PacketWriter;
+
+static OPJ_SIZE_T stream_write(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data)
+{
+    PacketWriter *writer = user_data;
+    AVPacket *packet = writer->packet;
+    int remaining = packet->size - writer->pos;
+    if (nb_bytes > remaining) {
+        OPJ_SIZE_T needed = nb_bytes - remaining;
+        int max_growth = INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - packet->size;
+        if (needed > max_growth) {
+            return (OPJ_SIZE_T)-1;
+        }
+        if (av_grow_packet(packet, (int)needed)) {
+            return (OPJ_SIZE_T)-1;
+        }
+    }
+    memcpy(packet->data + writer->pos, out_buffer, nb_bytes);
+    writer->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    PacketWriter *writer = user_data;
+    AVPacket *packet = writer->packet;
+    if (nb_bytes < 0) {
+        if (writer->pos == 0) {
+            return (OPJ_SIZE_T)-1;
+        }
+        if (nb_bytes + writer->pos < 0) {
+            nb_bytes = -writer->pos;
+        }
+    } else {
+        int remaining = packet->size - writer->pos;
+        if (nb_bytes > remaining) {
+            OPJ_SIZE_T needed = nb_bytes - remaining;
+            int max_growth = INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - packet->size;
+            if (needed > max_growth) {
+                return (OPJ_SIZE_T)-1;
+            }
+            if (av_grow_packet(packet, (int)needed)) {
+                return (OPJ_SIZE_T)-1;
+            }
+        }
+    }
+    writer->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    PacketWriter *writer = user_data;
+    AVPacket *packet = writer->packet;
+    if (nb_bytes < 0) {
+        return OPJ_FALSE;
+    }
+    if (nb_bytes > packet->size) {
+        if (nb_bytes > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE ||
+            av_grow_packet(packet, (int)nb_bytes - packet->size)) {
+            return OPJ_FALSE;
+        }
+    }
+    writer->pos = (int)nb_bytes;
+    return OPJ_TRUE;
+}
+
+static void cinema_parameters(opj_cparameters_t *p)
+{
+    p->tile_size_on = 0;
+    p->cp_tdx = 1;
+    p->cp_tdy = 1;
+
+    /* Tile part */
+    p->tp_flag = 'C';
+    p->tp_on = 1;
+
+    /* Tile and Image shall be at (0, 0) */
+    p->cp_tx0 = 0;
+    p->cp_ty0 = 0;
+    p->image_offset_x0 = 0;
+    p->image_offset_y0 = 0;
+
+    /* Codeblock size= 32 * 32 */
+    p->cblockw_init = 32;
+    p->cblockh_init = 32;
+    p->csty |= 0x01;
+
+    /* The progression order shall be CPRL */
+    p->prog_order = OPJ_CPRL;
+
+    /* No ROI */
+    p->roi_compno = -1;
+
+    /* No subsampling */
+    p->subsampling_dx = 1;
+    p->subsampling_dy = 1;
+
+    /* 9-7 transform */
+    p->irreversible = 1;
+
+    p->tcp_mct = 1;
+}
+
+static opj_image_t *mj2_create_image(AVCodecContext *avctx, opj_cparameters_t *parameters)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
-    opj_image_cmptparm_t *cmptparm;
-    OPJ_COLOR_SPACE color_space;
+    opj_image_cmptparm_t cmptparm[4] = {{0}};
     opj_image_t *img;
     int i;
     int sub_dx[4];
     int sub_dy[4];
-    int numcomps = desc->nb_components;
+    int numcomps;
+    OPJ_COLOR_SPACE color_space = OPJ_CLRSPC_UNKNOWN;
+
+    sub_dx[0] = sub_dx[3] = 1;
+    sub_dy[0] = sub_dy[3] = 1;
+    sub_dx[1] = sub_dx[2] = 1 << desc->log2_chroma_w;
+    sub_dy[1] = sub_dy[2] = 1 << desc->log2_chroma_h;
 
-    sub_dx[0] =
-    sub_dx[3] = 1;
-    sub_dy[0] =
-    sub_dy[3] = 1;
-    sub_dx[1] =
-    sub_dx[2] = 1 << desc->log2_chroma_w;
-    sub_dy[1] =
-    sub_dy[2] = 1 << desc->log2_chroma_h;
+    numcomps = desc->nb_components;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GRAY8:
-    case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YA8:
-        color_space = CLRSPC_GRAY;
+    case AV_PIX_FMT_GRAY10:
+    case AV_PIX_FMT_GRAY12:
+    case AV_PIX_FMT_GRAY14:
+    case AV_PIX_FMT_GRAY16:
+    case AV_PIX_FMT_YA16:
+        color_space = OPJ_CLRSPC_GRAY;
         break;
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
-        color_space = CLRSPC_SRGB;
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_XYZ12:
+        color_space = OPJ_CLRSPC_SRGB;
         break;
     case AV_PIX_FMT_YUV410P:
     case AV_PIX_FMT_YUV411P:
@@ -106,16 +216,33 @@ static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUVA420P:
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_YUVA444P:
     case AV_PIX_FMT_YUV420P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA444P9:
     case AV_PIX_FMT_YUV420P10:
     case AV_PIX_FMT_YUV422P10:
     case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV444P14:
     case AV_PIX_FMT_YUV420P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV444P16:
-        color_space = CLRSPC_SYCC;
+    case AV_PIX_FMT_YUVA420P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA444P16:
+        color_space = OPJ_CLRSPC_SYCC;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
@@ -124,183 +251,357 @@ static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
         return NULL;
     }
 
-    cmptparm = av_mallocz(numcomps * sizeof(*cmptparm));
-    if (!cmptparm) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough memory");
-        return NULL;
-    }
-
     for (i = 0; i < numcomps; i++) {
         cmptparm[i].prec = desc->comp[i].depth;
         cmptparm[i].bpp  = desc->comp[i].depth;
         cmptparm[i].sgnd = 0;
-        cmptparm[i].dx   = sub_dx[i];
-        cmptparm[i].dy   = sub_dy[i];
-        cmptparm[i].w    = avctx->width / sub_dx[i];
-        cmptparm[i].h    = avctx->height / sub_dy[i];
+        cmptparm[i].dx = sub_dx[i];
+        cmptparm[i].dy = sub_dy[i];
+        cmptparm[i].w = (avctx->width + sub_dx[i] - 1) / sub_dx[i];
+        cmptparm[i].h = (avctx->height + sub_dy[i] - 1) / sub_dy[i];
     }
 
     img = opj_image_create(numcomps, cmptparm, color_space);
-    av_freep(&cmptparm);
+
+    if (!img)
+        return NULL;
+
+    // x0, y0 is the top left corner of the image
+    // x1, y1 is the width, height of the reference grid
+    img->x0 = 0;
+    img->y0 = 0;
+    img->x1 = (avctx->width  - 1) * parameters->subsampling_dx + 1;
+    img->y1 = (avctx->height - 1) * parameters->subsampling_dy + 1;
+
     return img;
 }
 
 static av_cold int libopenjpeg_encode_init(AVCodecContext *avctx)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
-    int err = AVERROR(ENOMEM);
+    int err = 0;
 
     opj_set_default_encoder_parameters(&ctx->enc_params);
 
-    ctx->enc_params.cp_rsiz          = ctx->profile;
-    ctx->enc_params.mode             = !!avctx->global_quality;
-    ctx->enc_params.cp_cinema        = ctx->cinema_mode;
-    ctx->enc_params.prog_order       = ctx->prog_order;
-    ctx->enc_params.numresolution    = ctx->numresolution;
-    ctx->enc_params.cp_disto_alloc   = ctx->disto_alloc;
-    ctx->enc_params.cp_fixed_alloc   = ctx->fixed_alloc;
-    ctx->enc_params.cp_fixed_quality = ctx->fixed_quality;
-    ctx->enc_params.tcp_numlayers    = ctx->numlayers;
-    ctx->enc_params.tcp_rates[0]     = FFMAX(avctx->compression_level, 0) * 2;
+    switch (ctx->cinema_mode) {
+    case OPJ_CINEMA2K_24:
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_2K;
+        ctx->enc_params.max_cs_size = OPJ_CINEMA_24_CS;
+        ctx->enc_params.max_comp_size = OPJ_CINEMA_24_COMP;
+        break;
+    case OPJ_CINEMA2K_48:
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_2K;
+        ctx->enc_params.max_cs_size = OPJ_CINEMA_48_CS;
+        ctx->enc_params.max_comp_size = OPJ_CINEMA_48_COMP;
+        break;
+    case OPJ_CINEMA4K_24:
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_4K;
+        ctx->enc_params.max_cs_size = OPJ_CINEMA_24_CS;
+        ctx->enc_params.max_comp_size = OPJ_CINEMA_24_COMP;
+        break;
+    }
 
-    ctx->compress = opj_create_compress(ctx->format);
-    if (!ctx->compress) {
-        av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
-        return AVERROR(ENOMEM);
+    switch (ctx->profile) {
+    case OPJ_CINEMA2K:
+        if (ctx->enc_params.rsiz == OPJ_PROFILE_CINEMA_4K) {
+            err = AVERROR(EINVAL);
+            break;
+        }
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_2K;
+        break;
+    case OPJ_CINEMA4K:
+        if (ctx->enc_params.rsiz == OPJ_PROFILE_CINEMA_2K) {
+            err = AVERROR(EINVAL);
+            break;
+        }
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_4K;
+        break;
     }
 
-    ctx->image = libopenjpeg_create_image(avctx, &ctx->enc_params);
-    if (!ctx->image) {
-        av_log(avctx, AV_LOG_ERROR, "Error creating the mj2 image\n");
-        err = AVERROR(EINVAL);
-        goto fail;
+    if (err) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid parameter pairing: cinema_mode and profile conflict.\n");
+        return err;
     }
 
-    ctx->event_mgr.info_handler    = info_callback;
-    ctx->event_mgr.error_handler   = error_callback;
-    ctx->event_mgr.warning_handler = warning_callback;
-    opj_set_event_mgr((opj_common_ptr) ctx->compress, &ctx->event_mgr, avctx);
+    if (!ctx->numresolution) {
+        ctx->numresolution = 6;
+        while (FFMIN(avctx->width, avctx->height) >> ctx->numresolution < 1)
+            ctx->numresolution --;
+    }
 
-    return 0;
+    ctx->enc_params.prog_order = ctx->prog_order;
+    ctx->enc_params.numresolution = ctx->numresolution;
+    ctx->enc_params.irreversible = ctx->irreversible;
+    ctx->enc_params.cp_disto_alloc = ctx->disto_alloc;
+    ctx->enc_params.cp_fixed_quality = ctx->fixed_quality;
+    ctx->enc_params.tcp_numlayers = 1;
+    ctx->enc_params.tcp_rates[0] = FFMAX(avctx->compression_level, 0) * 2;
+
+    if (ctx->cinema_mode > 0) {
+        cinema_parameters(&ctx->enc_params);
+    }
 
-fail:
-    av_freep(&ctx->compress);
-    return err;
+    return 0;
 }
 
-static void libopenjpeg_copy_packed8(AVCodecContext *avctx,
-                                     const AVFrame *frame, opj_image_t *image)
+static int libopenjpeg_copy_packed8(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
 
-    for (compno = 0; compno < numcomps; ++compno)
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         for (y = 0; y < avctx->height; ++y) {
-            image_index = y * avctx->width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * frame->linesize[0] + compno;
             for (x = 0; x < avctx->width; ++x) {
-                image->comps[compno].data[image_index++] =
-                    frame->data[0][frame_index];
+                image_line[x] = frame->data[0][frame_index];
                 frame_index += numcomps;
             }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
         }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - (int)image->comps[compno].w];
+            }
+        }
+    }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_packed16(AVCodecContext *avctx,
-                                      const AVFrame *frame, opj_image_t *image)
+// for XYZ 12 bit
+static int libopenjpeg_copy_packed12(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
     int x, y;
-    int image_index, frame_index;
+    int *image_line;
+    int frame_index;
     const int numcomps  = image->numcomps;
     uint16_t *frame_ptr = (uint16_t *)frame->data[0];
 
-    for (compno = 0; compno < numcomps; ++compno)
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         for (y = 0; y < avctx->height; ++y) {
-            image_index = y * avctx->width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * (frame->linesize[0] / 2) + compno;
             for (x = 0; x < avctx->width; ++x) {
-                image->comps[compno].data[image_index++] =
-                    frame_ptr[frame_index];
+                image_line[x] = frame_ptr[frame_index] >> 4;
                 frame_index += numcomps;
             }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
         }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - (int)image->comps[compno].w];
+            }
+        }
+    }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_unpacked8(AVCodecContext *avctx,
-                                       const AVFrame *frame, opj_image_t *image)
+static int libopenjpeg_copy_packed16(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int width, height;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
+    uint16_t *frame_ptr = (uint16_t*)frame->data[0];
 
     for (compno = 0; compno < numcomps; ++compno) {
-        width  = avctx->width / image->comps[compno].dx;
-        height = avctx->height / image->comps[compno].dy;
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        for (y = 0; y < avctx->height; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            frame_index = y * (frame->linesize[0] / 2) + compno;
+            for (x = 0; x < avctx->width; ++x) {
+                image_line[x] = frame_ptr[frame_index];
+                frame_index += numcomps;
+            }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - (int)image->comps[compno].w];
+            }
+        }
+    }
+
+    return 1;
+}
+
+static int libopenjpeg_copy_unpacked8(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
+{
+    int compno;
+    int x;
+    int y;
+    int width;
+    int height;
+    int *image_line;
+    int frame_index;
+    const int numcomps = image->numcomps;
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[compno]) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        width  = (avctx->width + image->comps[compno].dx - 1) / image->comps[compno].dx;
+        height = (avctx->height + image->comps[compno].dy - 1) / image->comps[compno].dy;
         for (y = 0; y < height; ++y) {
-            image_index = y * width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * frame->linesize[compno];
             for (x = 0; x < width; ++x)
-                image->comps[compno].data[image_index++] =
-                    frame->data[compno][frame_index++];
+                image_line[x] = frame->data[compno][frame_index++];
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - (int)image->comps[compno].w];
+            }
         }
     }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_unpacked16(AVCodecContext *avctx,
-                                        const AVFrame *frame,
-                                        opj_image_t *image)
+static int libopenjpeg_copy_unpacked16(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int width, height;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int width;
+    int height;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
     uint16_t *frame_ptr;
 
     for (compno = 0; compno < numcomps; ++compno) {
-        width     = avctx->width / image->comps[compno].dx;
-        height    = avctx->height / image->comps[compno].dy;
+        if (image->comps[compno].w > frame->linesize[compno]) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        width     = (avctx->width + image->comps[compno].dx - 1) / image->comps[compno].dx;
+        height    = (avctx->height + image->comps[compno].dy - 1) / image->comps[compno].dy;
         frame_ptr = (uint16_t *)frame->data[compno];
         for (y = 0; y < height; ++y) {
-            image_index = y * width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * (frame->linesize[compno] / 2);
             for (x = 0; x < width; ++x)
-                image->comps[compno].data[image_index++] =
-                    frame_ptr[frame_index++];
+                image_line[x] = frame_ptr[frame_index++];
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - (int)image->comps[compno].w];
+            }
         }
     }
+
+    return 1;
 }
 
 static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                     const AVFrame *frame, int *got_packet)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
-    opj_cinfo_t *compress   = ctx->compress;
-    opj_image_t *image      = ctx->image;
-    opj_cio_t *stream;
-    int ret, len;
-
-    // x0, y0 is the top left corner of the image
-    // x1, y1 is the width, height of the reference grid
-    image->x0 = 0;
-    image->y0 = 0;
-    image->x1 = (avctx->width - 1) * ctx->enc_params.subsampling_dx + 1;
-    image->y1 = (avctx->height - 1) * ctx->enc_params.subsampling_dy + 1;
+    int ret;
+    AVFrame *gbrframe;
+    int cpyresult = 0;
+    PacketWriter writer     = { 0 };
+    opj_codec_t *compress   = NULL;
+    opj_stream_t *stream    = NULL;
+    opj_image_t *image      = mj2_create_image(avctx, &ctx->enc_params);
+    if (!image) {
+        av_log(avctx, AV_LOG_ERROR, "Error creating the mj2 image\n");
+        ret = AVERROR(EINVAL);
+        goto done;
+    }
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_YA8:
-        libopenjpeg_copy_packed8(avctx, frame, image);
+        cpyresult = libopenjpeg_copy_packed8(avctx, frame, image);
+        break;
+    case AV_PIX_FMT_XYZ12:
+        cpyresult = libopenjpeg_copy_packed12(avctx, frame, image);
         break;
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
-        libopenjpeg_copy_packed16(avctx, frame, image);
+    case AV_PIX_FMT_YA16:
+        cpyresult = libopenjpeg_copy_packed16(avctx, frame, image);
+        break;
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+        gbrframe = av_frame_clone(frame);
+        if (!gbrframe) {
+            ret = AVERROR(ENOMEM);
+            goto done;
+        }
+        gbrframe->data[0] = frame->data[2]; // swap to be rgb
+        gbrframe->data[1] = frame->data[0];
+        gbrframe->data[2] = frame->data[1];
+        gbrframe->linesize[0] = frame->linesize[2];
+        gbrframe->linesize[1] = frame->linesize[0];
+        gbrframe->linesize[2] = frame->linesize[1];
+        if (avctx->pix_fmt == AV_PIX_FMT_GBR24P) {
+            cpyresult = libopenjpeg_copy_unpacked8(avctx, gbrframe, image);
+        } else {
+            cpyresult = libopenjpeg_copy_unpacked16(avctx, gbrframe, image);
+        }
+        av_frame_free(&gbrframe);
         break;
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_YUV410P:
@@ -310,93 +611,144 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUVA420P:
-        libopenjpeg_copy_unpacked8(avctx, frame, image);
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_YUVA444P:
+        cpyresult = libopenjpeg_copy_unpacked8(avctx, frame, image);
         break;
+    case AV_PIX_FMT_GRAY10:
+    case AV_PIX_FMT_GRAY12:
+    case AV_PIX_FMT_GRAY14:
     case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YUV420P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA444P9:
     case AV_PIX_FMT_YUV444P10:
     case AV_PIX_FMT_YUV422P10:
     case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV444P14:
     case AV_PIX_FMT_YUV444P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV420P16:
-        libopenjpeg_copy_unpacked16(avctx, frame, image);
+    case AV_PIX_FMT_YUVA444P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA420P16:
+        cpyresult = libopenjpeg_copy_unpacked16(avctx, frame, image);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
                "The frame's pixel format '%s' is not supported\n",
                av_get_pix_fmt_name(avctx->pix_fmt));
-        return AVERROR(EINVAL);
+        ret = AVERROR(EINVAL);
+        goto done;
         break;
     }
 
-    opj_setup_encoder(compress, &ctx->enc_params, image);
-    stream = opj_cio_open((opj_common_ptr) compress, NULL, 0);
+    if (!cpyresult) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Could not copy the frame data to the internal image buffer\n");
+        ret = -1;
+        goto done;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 1024, 0)) < 0) {
+        goto done;
+    }
+
+    compress = opj_create_compress(ctx->format);
+    if (!compress) {
+        av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
+        ret = AVERROR(ENOMEM);
+        goto done;
+    }
+
+    if (!opj_set_error_handler(compress, error_callback, avctx) ||
+        !opj_set_warning_handler(compress, warning_callback, avctx) ||
+        !opj_set_info_handler(compress, info_callback, avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting the compressor handlers\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (!opj_setup_encoder(compress, &ctx->enc_params, image)) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting up the compressor\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+    stream = opj_stream_default_create(OPJ_STREAM_WRITE);
+
     if (!stream) {
         av_log(avctx, AV_LOG_ERROR, "Error creating the cio stream\n");
-        return AVERROR(ENOMEM);
+        ret = AVERROR(ENOMEM);
+        goto done;
     }
 
-    if (!opj_encode(compress, stream, image, NULL)) {
-        opj_cio_close(stream);
+    writer.packet = pkt;
+    opj_stream_set_write_function(stream, stream_write);
+    opj_stream_set_skip_function(stream, stream_skip);
+    opj_stream_set_seek_function(stream, stream_seek);
+    opj_stream_set_user_data(stream, &writer, NULL);
+
+    if (!opj_start_compress(compress, image, stream) ||
+        !opj_encode(compress, stream) ||
+        !opj_end_compress(compress, stream)) {
         av_log(avctx, AV_LOG_ERROR, "Error during the opj encode\n");
-        return -1;
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
 
-    len = cio_tell(stream);
-    if ((ret = ff_alloc_packet(pkt, len)) < 0) {
-        opj_cio_close(stream);
-        return ret;
-    }
+    av_shrink_packet(pkt, writer.pos);
 
-    memcpy(pkt->data, stream->buffer, len);
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
-    opj_cio_close(stream);
-    return 0;
-}
+    ret = 0;
 
-static av_cold int libopenjpeg_encode_close(AVCodecContext *avctx)
-{
-    LibOpenJPEGContext *ctx = avctx->priv_data;
-
-    opj_destroy_compress(ctx->compress);
-    opj_image_destroy(ctx->image);
-    return 0;
+done:
+    opj_stream_destroy(stream);
+    opj_destroy_codec(compress);
+    opj_image_destroy(image);
+    return ret;
 }
 
 #define OFFSET(x) offsetof(LibOpenJPEGContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = CODEC_JP2   }, CODEC_J2K, CODEC_JP2,   VE, "format"      },
-    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_J2K   }, 0,         0,           VE, "format"      },
-    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_JP2   }, 0,         0,           VE, "format"      },
-    { "profile",       NULL,                OFFSET(profile),       AV_OPT_TYPE_INT,   { .i64 = STD_RSIZ    }, STD_RSIZ,  CINEMA4K,    VE, "profile"     },
-    { "jpeg2000",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = STD_RSIZ    }, 0,         0,           VE, "profile"     },
-    { "cinema2k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA2K    }, 0,         0,           VE, "profile"     },
-    { "cinema4k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA4K    }, 0,         0,           VE, "profile"     },
-    { "cinema_mode",   "Digital Cinema",    OFFSET(cinema_mode),   AV_OPT_TYPE_INT,   { .i64 = OFF         }, OFF,       CINEMA4K_24, VE, "cinema_mode" },
-    { "off",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OFF         }, 0,         0,           VE, "cinema_mode" },
-    { "2k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA2K_24 }, 0,         0,           VE, "cinema_mode" },
-    { "2k_48",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA2K_48 }, 0,         0,           VE, "cinema_mode" },
-    { "4k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA4K_24 }, 0,         0,           VE, "cinema_mode" },
-    { "prog_order",    "Progression Order", OFFSET(prog_order),    AV_OPT_TYPE_INT,   { .i64 = LRCP        }, LRCP,      CPRL,        VE, "prog_order"  },
-    { "lrcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = LRCP        }, 0,         0,           VE, "prog_order"  },
-    { "rlcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = RLCP        }, 0,         0,           VE, "prog_order"  },
-    { "rpcl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = RPCL        }, 0,         0,           VE, "prog_order"  },
-    { "pcrl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = PCRL        }, 0,         0,           VE, "prog_order"  },
-    { "cprl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CPRL        }, 0,         0,           VE, "prog_order"  },
-    { "numresolution", NULL,                OFFSET(numresolution), AV_OPT_TYPE_INT,   { .i64 = 6           }, 1,         10,          VE },
-    { "numlayers",     NULL,                OFFSET(numlayers),     AV_OPT_TYPE_INT,   { .i64 = 1           }, 1,         10,          VE },
-    { "disto_alloc",   NULL,                OFFSET(disto_alloc),   AV_OPT_TYPE_INT,   { .i64 = 1           }, 0,         1,           VE },
-    { "fixed_alloc",   NULL,                OFFSET(fixed_alloc),   AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE },
-    { "fixed_quality", NULL,                OFFSET(fixed_quality), AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE },
+    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = OPJ_CODEC_JP2   }, OPJ_CODEC_J2K, OPJ_CODEC_JP2,   VE, "format"      },
+    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CODEC_J2K   }, 0,         0,           VE, "format"      },
+    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CODEC_JP2   }, 0,         0,           VE, "format"      },
+    { "profile",       NULL,                OFFSET(profile),       AV_OPT_TYPE_INT,   { .i64 = OPJ_STD_RSIZ    }, OPJ_STD_RSIZ,  OPJ_CINEMA4K,    VE, "profile"     },
+    { "jpeg2000",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_STD_RSIZ    }, 0,         0,           VE, "profile"     },
+    { "cinema2k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CINEMA2K    }, 0,         0,           VE, "profile"     },
+    { "cinema4k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CINEMA4K    }, 0,         0,           VE, "profile"     },
+    { "cinema_mode",   "Digital Cinema",    OFFSET(cinema_mode),   AV_OPT_TYPE_INT,   { .i64 = OPJ_OFF         }, OPJ_OFF,       OPJ_CINEMA4K_24, VE, "cinema_mode" },
+    { "off",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_OFF         }, 0,         0,           VE, "cinema_mode" },
+    { "2k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CINEMA2K_24 }, 0,         0,           VE, "cinema_mode" },
+    { "2k_48",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CINEMA2K_48 }, 0,         0,           VE, "cinema_mode" },
+    { "4k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CINEMA4K_24 }, 0,         0,           VE, "cinema_mode" },
+    { "prog_order",    "Progression Order", OFFSET(prog_order),    AV_OPT_TYPE_INT,   { .i64 = OPJ_LRCP    }, OPJ_LRCP,  OPJ_CPRL,    VE, "prog_order"  },
+    { "lrcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_LRCP    }, 0,         0,           VE, "prog_order"  },
+    { "rlcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_RLCP    }, 0,         0,           VE, "prog_order"  },
+    { "rpcl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_RPCL    }, 0,         0,           VE, "prog_order"  },
+    { "pcrl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_PCRL    }, 0,         0,           VE, "prog_order"  },
+    { "cprl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ_CPRL    }, 0,         0,           VE, "prog_order"  },
+    { "numresolution", NULL,                OFFSET(numresolution), AV_OPT_TYPE_INT,   { .i64 = 6            }, 0,         33,          VE                },
+    { "irreversible",  NULL,                OFFSET(irreversible),  AV_OPT_TYPE_INT,   { .i64 = 0            }, 0,         1,           VE                },
+    { "disto_alloc",   NULL,                OFFSET(disto_alloc),   AV_OPT_TYPE_INT,   { .i64 = 1            }, 0,         1,           VE                },
+    { "fixed_quality", NULL,                OFFSET(fixed_quality), AV_OPT_TYPE_INT,   { .i64 = 0            }, 0,         1,           VE                },
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass openjpeg_class = {
     .class_name = "libopenjpeg",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -411,20 +763,27 @@ AVCodec ff_libopenjpeg_encoder = {
     .priv_data_size = sizeof(LibOpenJPEGContext),
     .init           = libopenjpeg_encode_init,
     .encode2        = libopenjpeg_encode_frame,
-    .close          = libopenjpeg_encode_close,
-    .capabilities   = 0,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_RGB48,
-        AV_PIX_FMT_RGBA64,
-        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA8,
+        AV_PIX_FMT_RGBA64, AV_PIX_FMT_GBR24P,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_YA8, AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA16,
+        AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, AV_PIX_FMT_GRAY14,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA420P,
-        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P,
+        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA422P,
+        AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUVA444P,
         AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9,
+        AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA444P9,
         AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA444P10,
+        AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
+        AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
         AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA444P16,
+        AV_PIX_FMT_XYZ12,
         AV_PIX_FMT_NONE
     },
-    .priv_class     = &class,
+    .priv_class     = &openjpeg_class,
     .wrapper_name   = "libopenjpeg",
 };
diff --git a/libavcodec/libopus.c b/libavcodec/libopus.c
index 9a0d5b0..16395c7 100644
--- a/libavcodec/libopus.c
+++ b/libavcodec/libopus.c
@@ -2,20 +2,20 @@
  * libopus encoder/decoder common code
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopus.h b/libavcodec/libopus.h
index 2334c84..a8223d1 100644
--- a/libavcodec/libopus.h
+++ b/libavcodec/libopus.h
@@ -2,20 +2,20 @@
  * libopus encoder/decoder common code
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopusdec.c b/libavcodec/libopusdec.c
index 1dac1a0..1724a49 100644
--- a/libavcodec/libopusdec.c
+++ b/libavcodec/libopusdec.c
@@ -2,27 +2,30 @@
  * Opus decoder using libopus
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <opus.h>
 #include <opus_multistream.h>
 
+#include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/ffmath.h"
+#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
@@ -31,7 +34,15 @@
 #include "libopus.h"
 
 struct libopus_context {
+    AVClass *class;
     OpusMSDecoder *dec;
+    int pre_skip;
+#ifndef OPUS_SET_GAIN
+    union { int i; double d; } gain;
+#endif
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    int apply_phase_inv;
+#endif
 };
 
 #define OPUS_HEAD_SIZE 19
@@ -42,12 +53,6 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
     int ret, channel_map = 0, gain_db = 0, nb_streams, nb_coupled;
     uint8_t mapping_arr[8] = { 0, 1 }, *mapping;
 
-    if (avc->channels <= 0) {
-        av_log(avc, AV_LOG_WARNING,
-               "Invalid number of channels %d, defaulting to stereo\n", avc->channels);
-        avc->channels = 2;
-    }
-
     avc->channels = avc->extradata_size >= 10 ? avc->extradata[9] : (avc->channels == 1) ? 1 : 2;
     if (avc->channels <= 0) {
         av_log(avc, AV_LOG_WARNING,
@@ -62,6 +67,7 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
                           ff_vorbis_channel_layouts[avc->channels - 1];
 
     if (avc->extradata_size >= OPUS_HEAD_SIZE) {
+        opus->pre_skip = AV_RL16(avc->extradata + 10);
         gain_db     = sign_extend(AV_RL16(avc->extradata + 16), 16);
         channel_map = AV_RL8 (avc->extradata + 18);
     }
@@ -86,7 +92,7 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
         const uint8_t *vorbis_offset = ff_vorbis_channel_layout_offsets[avc->channels - 1];
         int ch;
 
-        /* Remap channels from Vorbis order to libav order */
+        /* Remap channels from Vorbis order to ffmpeg order */
         for (ch = 0; ch < avc->channels; ch++)
             mapping_arr[ch] = mapping[vorbis_offset[ch]];
         mapping = mapping_arr;
@@ -101,12 +107,32 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
         return ff_opus_error_to_averror(ret);
     }
 
+#ifdef OPUS_SET_GAIN
     ret = opus_multistream_decoder_ctl(opus->dec, OPUS_SET_GAIN(gain_db));
     if (ret != OPUS_OK)
         av_log(avc, AV_LOG_WARNING, "Failed to set gain: %s\n",
                opus_strerror(ret));
+#else
+    {
+        double gain_lin = ff_exp10(gain_db / (20.0 * 256));
+        if (avc->sample_fmt == AV_SAMPLE_FMT_FLT)
+            opus->gain.d = gain_lin;
+        else
+            opus->gain.i = FFMIN(gain_lin * 65536, INT_MAX);
+    }
+#endif
+
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    ret = opus_multistream_decoder_ctl(opus->dec,
+                                       OPUS_SET_PHASE_INVERSION_DISABLED(!opus->apply_phase_inv));
+    if (ret != OPUS_OK)
+        av_log(avc, AV_LOG_WARNING,
+               "Unable to set phase inversion: %s\n",
+               opus_strerror(ret));
+#endif
 
-    avc->delay = 3840;  /* Decoder delay (in samples) at 48kHz */
+    /* Decoder delay (in samples) at 48kHz */
+    avc->delay = avc->internal->skip_samples = opus->pre_skip;
 
     return 0;
 }
@@ -115,7 +141,10 @@ static av_cold int libopus_decode_close(AVCodecContext *avc)
 {
     struct libopus_context *opus = avc->priv_data;
 
-    opus_multistream_decoder_destroy(opus->dec);
+    if (opus->dec) {
+        opus_multistream_decoder_destroy(opus->dec);
+        opus->dec = NULL;
+    }
     return 0;
 }
 
@@ -129,11 +158,8 @@ static int libopus_decode(AVCodecContext *avc, void *data,
     int ret, nb_samples;
 
     frame->nb_samples = MAX_FRAME_SIZE;
-    ret = ff_get_buffer(avc, frame, 0);
-    if (ret < 0) {
-        av_log(avc, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avc, frame, 0)) < 0)
         return ret;
-    }
 
     if (avc->sample_fmt == AV_SAMPLE_FMT_S16)
         nb_samples = opus_multistream_decode(opus->dec, pkt->data, pkt->size,
@@ -150,6 +176,21 @@ static int libopus_decode(AVCodecContext *avc, void *data,
         return ff_opus_error_to_averror(nb_samples);
     }
 
+#ifndef OPUS_SET_GAIN
+    {
+        int i = avc->channels * nb_samples;
+        if (avc->sample_fmt == AV_SAMPLE_FMT_FLT) {
+            float *pcm = (float *)frame->data[0];
+            for (; i > 0; i--, pcm++)
+                *pcm = av_clipf(*pcm * opus->gain.d, -1, 1);
+        } else {
+            int16_t *pcm = (int16_t *)frame->data[0];
+            for (; i > 0; i--, pcm++)
+                *pcm = av_clip_int16(((int64_t)opus->gain.i * *pcm) >> 16);
+        }
+    }
+#endif
+
     frame->nb_samples = nb_samples;
     *got_frame_ptr    = 1;
 
@@ -161,8 +202,29 @@ static void libopus_flush(AVCodecContext *avc)
     struct libopus_context *opus = avc->priv_data;
 
     opus_multistream_decoder_ctl(opus->dec, OPUS_RESET_STATE);
+    /* The stream can have been extracted by a tool that is not Opus-aware.
+       Therefore, any packet can become the first of the stream. */
+    avc->internal->skip_samples = opus->pre_skip;
 }
 
+
+#define OFFSET(x) offsetof(struct libopus_context, x)
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption libopusdec_options[] = {
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+#endif
+    { NULL },
+};
+
+static const AVClass libopusdec_class = {
+    .class_name = "libopusdec",
+    .item_name  = av_default_item_name,
+    .option     = libopusdec_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+
 AVCodec ff_libopus_decoder = {
     .name           = "libopus",
     .long_name      = NULL_IF_CONFIG_SMALL("libopus Opus"),
@@ -174,8 +236,10 @@ AVCodec ff_libopus_decoder = {
     .decode         = libopus_decode,
     .flush          = libopus_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLT,
                                                      AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &libopusdec_class,
     .wrapper_name   = "libopus",
 };
diff --git a/libavcodec/libopusenc.c b/libavcodec/libopusenc.c
index 823cab1..7c025a6 100644
--- a/libavcodec/libopusenc.c
+++ b/libavcodec/libopusenc.c
@@ -2,20 +2,20 @@
  * Opus encoder using libopus
  * Copyright (c) 2012 Nathan Caldwell
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,10 @@ typedef struct LibopusEncOpts {
     float frame_duration;
     int packet_size;
     int max_bandwidth;
+    int mapping_family;
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    int apply_phase_inv;
+#endif
 } LibopusEncOpts;
 
 typedef struct LibopusEncContext {
@@ -47,6 +51,7 @@ typedef struct LibopusEncContext {
     uint8_t *samples;
     LibopusEncOpts opts;
     AudioFrameQueue afq;
+    const uint8_t *encoder_channel_map;
 } LibopusEncContext;
 
 static const uint8_t opus_coupled_streams[8] = {
@@ -65,8 +70,8 @@ static const uint8_t opus_vorbis_channel_map[8][8] = {
     { 0, 6, 1, 2, 3, 4, 5, 7 },
 };
 
-/* libav to libopus channel order mapping, passed to libopus */
-static const uint8_t libav_libopus_channel_map[8][8] = {
+/* libavcodec to libopus channel order mapping, passed to libopus */
+static const uint8_t libavcodec_libopus_channel_map[8][8] = {
     { 0 },
     { 0, 1 },
     { 0, 1, 2 },
@@ -79,6 +84,7 @@ static const uint8_t libav_libopus_channel_map[8][8] = {
 
 static void libopus_write_header(AVCodecContext *avctx, int stream_count,
                                  int coupled_stream_count,
+                                 int mapping_family,
                                  const uint8_t *channel_mapping)
 {
     uint8_t *p   = avctx->extradata;
@@ -92,13 +98,11 @@ static void libopus_write_header(AVCodecContext *avctx, int stream_count,
     bytestream_put_le16(&p, 0); /* Gain of 0dB is recommended. */
 
     /* Channel mapping */
-    if (channels > 2) {
-        bytestream_put_byte(&p, channels <= 8 ? 1 : 255);
+    bytestream_put_byte(&p, mapping_family);
+    if (mapping_family != 0) {
         bytestream_put_byte(&p, stream_count);
         bytestream_put_byte(&p, coupled_stream_count);
         bytestream_put_buffer(&p, channel_mapping, channels);
-    } else {
-        bytestream_put_byte(&p, 0);
     }
 }
 
@@ -107,6 +111,13 @@ static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
 {
     int ret;
 
+    if (avctx->global_quality) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Quality-based encoding not supported, "
+               "please specify a bitrate and VBR setting.\n");
+        return AVERROR(EINVAL);
+    }
+
     ret = opus_multistream_encoder_ctl(enc, OPUS_SET_BITRATE(avctx->bit_rate));
     if (ret != OPUS_OK) {
         av_log(avctx, AV_LOG_ERROR,
@@ -146,44 +157,104 @@ static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
                    "Unable to set maximum bandwidth: %s\n", opus_strerror(ret));
     }
 
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_PHASE_INVERSION_DISABLED(!opts->apply_phase_inv));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set phase inversion: %s\n",
+               opus_strerror(ret));
+#endif
     return OPUS_OK;
 }
 
-static int av_cold libopus_encode_init(AVCodecContext *avctx)
-{
-    LibopusEncContext *opus = avctx->priv_data;
-    const uint8_t *channel_mapping;
-    OpusMSEncoder *enc;
-    int ret = OPUS_OK;
-    int coupled_stream_count, header_size, frame_size;
-
-    coupled_stream_count = opus_coupled_streams[avctx->channels - 1];
-    opus->stream_count   = avctx->channels - coupled_stream_count;
-    channel_mapping      = libav_libopus_channel_map[avctx->channels - 1];
-
-    /* FIXME: Opus can handle up to 255 channels. However, the mapping for
-     * anything greater than 8 is undefined. */
-    if (avctx->channels > 8) {
-        avpriv_report_missing_feature(avctx,
-                                      "Undefined channel layout for %d channels",
-                                      avctx->channels);
-        return AVERROR_PATCHWELCOME;
+static int libopus_check_max_channels(AVCodecContext *avctx,
+                                      int max_channels) {
+    if (avctx->channels > max_channels) {
+        av_log(avctx, AV_LOG_ERROR, "Opus mapping family undefined for %d channels.\n",
+               avctx->channels);
+        return AVERROR(EINVAL);
     }
-    if (!avctx->bit_rate) {
-        /* Sane default copied from opusenc */
-        avctx->bit_rate = 64000 * opus->stream_count +
-                          32000 * coupled_stream_count;
+
+    return 0;
+}
+
+static int libopus_check_vorbis_layout(AVCodecContext *avctx, int mapping_family) {
+    av_assert2(avctx->channels < FF_ARRAY_ELEMS(ff_vorbis_channel_layouts));
+
+    if (!avctx->channel_layout) {
         av_log(avctx, AV_LOG_WARNING,
-               "No bit rate set. Defaulting to %d bps.\n", avctx->bit_rate);
-    }
+               "No channel layout specified. Opus encoder will use Vorbis "
+               "channel layout for %d channels.\n", avctx->channels);
+    } else if (avctx->channel_layout != ff_vorbis_channel_layouts[avctx->channels - 1]) {
+        char name[32];
+        av_get_channel_layout_string(name, sizeof(name), avctx->channels,
+                                     avctx->channel_layout);
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid channel layout %s for specified mapping family %d.\n",
+               name, mapping_family);
 
-    if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * avctx->channels) {
-        av_log(avctx, AV_LOG_ERROR, "The bit rate %d bps is unsupported. "
-               "Please choose a value between 500 and %d.\n", avctx->bit_rate,
-               256000 * avctx->channels);
         return AVERROR(EINVAL);
     }
 
+    return 0;
+}
+
+static int libopus_validate_layout_and_get_channel_map(
+        AVCodecContext *avctx,
+        int mapping_family,
+        const uint8_t ** channel_map_result)
+{
+    const uint8_t * channel_map = NULL;
+    int ret;
+
+    switch (mapping_family) {
+    case -1:
+        ret = libopus_check_max_channels(avctx, 8);
+        if (ret == 0) {
+            ret = libopus_check_vorbis_layout(avctx, mapping_family);
+            /* Channels do not need to be reordered. */
+        }
+
+        break;
+    case 0:
+        ret = libopus_check_max_channels(avctx, 2);
+        if (ret == 0) {
+            ret = libopus_check_vorbis_layout(avctx, mapping_family);
+        }
+        break;
+    case 1:
+        /* Opus expects channels to be in Vorbis order. */
+        ret = libopus_check_max_channels(avctx, 8);
+        if (ret == 0) {
+            ret = libopus_check_vorbis_layout(avctx, mapping_family);
+            channel_map = ff_vorbis_channel_layout_offsets[avctx->channels - 1];
+        }
+        break;
+    case 255:
+        ret = libopus_check_max_channels(avctx, 254);
+        break;
+    default:
+        av_log(avctx, AV_LOG_WARNING,
+               "Unknown channel mapping family %d. Output channel layout may be invalid.\n",
+               mapping_family);
+        ret = 0;
+    }
+
+    *channel_map_result = channel_map;
+    return ret;
+}
+
+static av_cold int libopus_encode_init(AVCodecContext *avctx)
+{
+    LibopusEncContext *opus = avctx->priv_data;
+    OpusMSEncoder *enc;
+    uint8_t libopus_channel_mapping[255];
+    int ret = OPUS_OK;
+    int av_ret;
+    int coupled_stream_count, header_size, frame_size;
+    int mapping_family;
+
     frame_size = opus->opts.frame_duration * 48000 / 1000;
     switch (frame_size) {
     case 120:
@@ -200,12 +271,22 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
     case 960:
     case 1920:
     case 2880:
+#ifdef OPUS_FRAMESIZE_120_MS
+    case 3840:
+    case 4800:
+    case 5760:
+#endif
         opus->opts.packet_size =
         avctx->frame_size      = frame_size * avctx->sample_rate / 48000;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Invalid frame duration: %g.\n"
-               "Frame duration must be exactly one of: 2.5, 5, 10, 20, 40 or 60.\n",
+               "Frame duration must be exactly one of: 2.5, 5, 10, 20, 40"
+#ifdef OPUS_FRAMESIZE_120_MS
+               ", 60, 80, 100 or 120.\n",
+#else
+               " or 60.\n",
+#endif
                opus->opts.frame_duration);
         return AVERROR(EINVAL);
     }
@@ -245,24 +326,71 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
         }
     }
 
-    enc = opus_multistream_encoder_create(avctx->sample_rate, avctx->channels,
-                                          opus->stream_count,
-                                          coupled_stream_count,
-                                          channel_mapping,
-                                          opus->opts.application, &ret);
+    /* Channels may need to be reordered to match opus mapping. */
+    av_ret = libopus_validate_layout_and_get_channel_map(avctx, opus->opts.mapping_family,
+                                                         &opus->encoder_channel_map);
+    if (av_ret) {
+        return av_ret;
+    }
+
+    if (opus->opts.mapping_family == -1) {
+        /* By default, use mapping family 1 for the header but use the older
+         * libopus multistream API to avoid surround masking. */
+
+        /* Set the mapping family so that the value is correct in the header */
+        mapping_family = avctx->channels > 2 ? 1 : 0;
+        coupled_stream_count = opus_coupled_streams[avctx->channels - 1];
+        opus->stream_count   = avctx->channels - coupled_stream_count;
+        memcpy(libopus_channel_mapping,
+               opus_vorbis_channel_map[avctx->channels - 1],
+               avctx->channels * sizeof(*libopus_channel_mapping));
+
+        enc = opus_multistream_encoder_create(
+            avctx->sample_rate, avctx->channels, opus->stream_count,
+            coupled_stream_count,
+            libavcodec_libopus_channel_map[avctx->channels - 1],
+            opus->opts.application, &ret);
+    } else {
+        /* Use the newer multistream API. The encoder will set the channel
+         * mapping and coupled stream counts to its internal defaults and will
+         * use surround masking analysis to save bits. */
+        mapping_family = opus->opts.mapping_family;
+        enc = opus_multistream_surround_encoder_create(
+            avctx->sample_rate, avctx->channels, mapping_family,
+            &opus->stream_count, &coupled_stream_count, libopus_channel_mapping,
+            opus->opts.application, &ret);
+    }
+
     if (ret != OPUS_OK) {
         av_log(avctx, AV_LOG_ERROR,
                "Failed to create encoder: %s\n", opus_strerror(ret));
         return ff_opus_error_to_averror(ret);
     }
 
+    if (!avctx->bit_rate) {
+        /* Sane default copied from opusenc */
+        avctx->bit_rate = 64000 * opus->stream_count +
+                          32000 * coupled_stream_count;
+        av_log(avctx, AV_LOG_WARNING,
+               "No bit rate set. Defaulting to %"PRId64" bps.\n", avctx->bit_rate);
+    }
+
+    if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * avctx->channels) {
+        av_log(avctx, AV_LOG_ERROR, "The bit rate %"PRId64" bps is unsupported. "
+               "Please choose a value between 500 and %d.\n", avctx->bit_rate,
+               256000 * avctx->channels);
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
     ret = libopus_configure_encoder(avctx, enc, &opus->opts);
     if (ret != OPUS_OK) {
         ret = ff_opus_error_to_averror(ret);
         goto fail;
     }
 
-    header_size = 19 + (avctx->channels > 2 ? 2 + avctx->channels : 0);
+    /* Header includes channel mapping table if and only if mapping family is NOT 0 */
+    header_size = 19 + (mapping_family == 0 ? 0 : 2 + avctx->channels);
     avctx->extradata = av_malloc(header_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate extradata.\n");
@@ -271,7 +399,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
     }
     avctx->extradata_size = header_size;
 
-    opus->samples = av_mallocz(frame_size * avctx->channels *
+    opus->samples = av_mallocz_array(frame_size, avctx->channels *
                                av_get_bytes_per_sample(avctx->sample_fmt));
     if (!opus->samples) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate samples buffer.\n");
@@ -286,7 +414,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
                opus_strerror(ret));
 
     libopus_write_header(avctx, opus->stream_count, coupled_stream_count,
-                         opus_vorbis_channel_map[avctx->channels - 1]);
+                         mapping_family, libopus_channel_mapping);
 
     ff_af_queue_init(avctx, &opus->afq);
 
@@ -300,38 +428,56 @@ fail:
     return ret;
 }
 
+static void libopus_copy_samples_with_channel_map(
+    uint8_t *dst, const uint8_t *src, const uint8_t *channel_map,
+    int nb_channels, int nb_samples, int bytes_per_sample) {
+    int sample, channel;
+    for (sample = 0; sample < nb_samples; ++sample) {
+        for (channel = 0; channel < nb_channels; ++channel) {
+            const size_t src_pos = bytes_per_sample * (nb_channels * sample + channel);
+            const size_t dst_pos = bytes_per_sample * (nb_channels * sample + channel_map[channel]);
+
+            memcpy(&dst[dst_pos], &src[src_pos], bytes_per_sample);
+        }
+    }
+}
+
 static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
                           const AVFrame *frame, int *got_packet_ptr)
 {
     LibopusEncContext *opus = avctx->priv_data;
-    const int sample_size   = avctx->channels *
-                              av_get_bytes_per_sample(avctx->sample_fmt);
+    const int bytes_per_sample = av_get_bytes_per_sample(avctx->sample_fmt);
+    const int sample_size      = avctx->channels * bytes_per_sample;
     uint8_t *audio;
     int ret;
+    int discard_padding;
 
     if (frame) {
         ret = ff_af_queue_add(&opus->afq, frame);
         if (ret < 0)
             return ret;
-        if (frame->nb_samples < opus->opts.packet_size) {
+        if (opus->encoder_channel_map != NULL) {
+            audio = opus->samples;
+            libopus_copy_samples_with_channel_map(
+                audio, frame->data[0], opus->encoder_channel_map,
+                avctx->channels, frame->nb_samples, bytes_per_sample);
+        } else if (frame->nb_samples < opus->opts.packet_size) {
             audio = opus->samples;
             memcpy(audio, frame->data[0], frame->nb_samples * sample_size);
         } else
             audio = frame->data[0];
     } else {
-        if (!opus->afq.remaining_samples)
+        if (!opus->afq.remaining_samples || (!opus->afq.frame_alloc && !opus->afq.frame_count))
             return 0;
         audio = opus->samples;
         memset(audio, 0, opus->opts.packet_size * sample_size);
     }
 
-    /* Maximum packet size taken from opusenc in opus-tools. 60ms packets
-     * consist of 3 frames in one packet. The maximum frame size is 1275
+    /* Maximum packet size taken from opusenc in opus-tools. 120ms packets
+     * consist of 6 frames in one packet. The maximum frame size is 1275
      * bytes along with the largest possible packet header of 7 bytes. */
-    if (ret = ff_alloc_packet(avpkt, (1275 * 3 + 7) * opus->stream_count)) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, (1275 * 6 + 7) * opus->stream_count, 0)) < 0)
         return ret;
-    }
 
     if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT)
         ret = opus_multistream_encode_float(opus->enc, (float *)audio,
@@ -353,12 +499,31 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
     ff_af_queue_remove(&opus->afq, opus->opts.packet_size,
                        &avpkt->pts, &avpkt->duration);
 
+    discard_padding = opus->opts.packet_size - avpkt->duration;
+    // Check if subtraction resulted in an overflow
+    if ((discard_padding < opus->opts.packet_size) != (avpkt->duration > 0)) {
+        av_packet_unref(avpkt);
+        av_free(avpkt);
+        return AVERROR(EINVAL);
+    }
+    if (discard_padding > 0) {
+        uint8_t* side_data = av_packet_new_side_data(avpkt,
+                                                     AV_PKT_DATA_SKIP_SAMPLES,
+                                                     10);
+        if(!side_data) {
+            av_packet_unref(avpkt);
+            av_free(avpkt);
+            return AVERROR(ENOMEM);
+        }
+        AV_WL32(side_data + 4, discard_padding);
+    }
+
     *got_packet_ptr = 1;
 
     return 0;
 }
 
-static int av_cold libopus_encode_close(AVCodecContext *avctx)
+static av_cold int libopus_encode_close(AVCodecContext *avctx)
 {
     LibopusEncContext *opus = avctx->priv_data;
 
@@ -379,12 +544,16 @@ static const AVOption libopus_options[] = {
         { "voip",           "Favor improved speech intelligibility",   0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_VOIP },                0, 0, FLAGS, "application" },
         { "audio",          "Favor faithfulness to the input",         0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_AUDIO },               0, 0, FLAGS, "application" },
         { "lowdelay",       "Restrict to only the lowest delay modes", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_RESTRICTED_LOWDELAY }, 0, 0, FLAGS, "application" },
-    { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 60.0, FLAGS },
+    { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS },
     { "packet_loss",    "Expected packet loss percentage",     OFFSET(packet_loss),    AV_OPT_TYPE_INT,   { .i64 = 0 },    0,   100,  FLAGS },
     { "vbr",            "Variable bit rate mode",              OFFSET(vbr),            AV_OPT_TYPE_INT,   { .i64 = 1 },    0,   2,    FLAGS, "vbr" },
         { "off",            "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" },
         { "on",             "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" },
         { "constrained",    "Use constrained VBR",   0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" },
+    { "mapping_family", "Channel Mapping Family",              OFFSET(mapping_family), AV_OPT_TYPE_INT,   { .i64 = -1 },   -1,  255,  FLAGS, "mapping_family" },
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+#endif
     { NULL },
 };
 
@@ -418,7 +587,6 @@ AVCodec ff_libopus_encoder = {
     .sample_fmts     = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                       AV_SAMPLE_FMT_FLT,
                                                       AV_SAMPLE_FMT_NONE },
-    .channel_layouts = ff_vorbis_channel_layouts,
     .supported_samplerates = libopus_sample_rates,
     .priv_class      = &libopus_class,
     .defaults        = libopus_defaults,
diff --git a/libavcodec/librsvgdec.c b/libavcodec/librsvgdec.c
new file mode 100644
index 0000000..6697785
--- /dev/null
+++ b/libavcodec/librsvgdec.c
@@ -0,0 +1,130 @@
+/*
+ * Librsvg rasterization wrapper
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/opt.h"
+#include "librsvg-2.0/librsvg/rsvg.h"
+
+typedef struct LibRSVGContext {
+    AVClass *class;
+
+    int width;
+    int height;
+    int keep_ar;
+} LibRSVGContext;
+
+static int librsvg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *pkt)
+{
+    int ret;
+    LibRSVGContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+
+    RsvgHandle *handle;
+    RsvgDimensionData unscaled_dimensions, dimensions;
+    cairo_surface_t *image;
+    cairo_t *crender = NULL;
+    GError *error = NULL;
+
+    *got_frame = 0;
+
+    handle = rsvg_handle_new_from_data(pkt->data, pkt->size, &error);
+    if (error) {
+        av_log(avctx, AV_LOG_ERROR, "Error parsing svg!\n");
+        g_error_free(error);
+        return AVERROR_INVALIDDATA;
+    }
+
+    rsvg_handle_get_dimensions(handle, &dimensions);
+    rsvg_handle_get_dimensions(handle, &unscaled_dimensions);
+    dimensions.width  = s->width  ? s->width  : dimensions.width;
+    dimensions.height = s->height ? s->height : dimensions.height;
+    if (s->keep_ar && (s->width || s->height)) {
+        double default_ar = unscaled_dimensions.width/(double)unscaled_dimensions.height;
+        if (!s->width)
+            dimensions.width  = lrintf(dimensions.height * default_ar);
+        else
+            dimensions.height = lrintf(dimensions.width  / default_ar);
+    }
+
+    if ((ret = ff_set_dimensions(avctx, dimensions.width, dimensions.height)))
+        return ret;
+    avctx->pix_fmt = AV_PIX_FMT_RGB32;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)))
+        return ret;
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
+    image = cairo_image_surface_create_for_data(frame->data[0], CAIRO_FORMAT_ARGB32,
+                                                frame->width, frame->height,
+                                                frame->linesize[0]);
+    if (cairo_surface_status(image) != CAIRO_STATUS_SUCCESS)
+        return AVERROR_INVALIDDATA;
+
+    crender = cairo_create(image);
+
+    cairo_save(crender);
+    cairo_set_operator(crender, CAIRO_OPERATOR_CLEAR);
+    cairo_paint(crender);
+    cairo_restore(crender);
+
+    cairo_scale(crender, dimensions.width / (double)unscaled_dimensions.width,
+                dimensions.height / (double)unscaled_dimensions.height);
+
+    rsvg_handle_render_cairo(handle, crender);
+
+    cairo_destroy(crender);
+    cairo_surface_destroy(image);
+    g_object_unref(handle);
+
+    *got_frame = 1;
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(LibRSVGContext, x)
+#define DEC (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption options[] = {
+    { "width", "Width to render to (0 for default)", OFFSET(width), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, DEC },
+    { "height", "Height to render to (0 for default)", OFFSET(height), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, DEC },
+    { "keep_ar", "Keep aspect ratio with custom width/height", OFFSET(keep_ar), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, DEC },
+    { NULL },
+};
+
+static const AVClass librsvg_decoder_class = {
+    .class_name = "Librsvg",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_librsvg_decoder = {
+    .name           = "librsvg",
+    .long_name      = NULL_IF_CONFIG_SMALL("Librsvg rasterizer"),
+    .priv_class     = &librsvg_decoder_class,
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SVG,
+    .decode         = librsvg_decode_frame,
+    .priv_data_size = sizeof(LibRSVGContext),
+    .capabilities   = AV_CODEC_CAP_LOSSLESS | AV_CODEC_CAP_DR1,
+    .wrapper_name    = "librsvg",
+};
diff --git a/libavcodec/libschroedinger.c b/libavcodec/libschroedinger.c
deleted file mode 100644
index af3000f..0000000
--- a/libavcodec/libschroedinger.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
-* @file
-* function definitions common to libschroedinger decoder and encoder
-*/
-
-#include "libavutil/attributes.h"
-#include "libavutil/mem.h"
-#include "libschroedinger.h"
-#include "internal.h"
-
-static const SchroVideoFormatInfo ff_schro_video_format_info[] = {
-    { 640,  480,  24000, 1001},
-    { 176,  120,  15000, 1001},
-    { 176,  144,  25,    2   },
-    { 352,  240,  15000, 1001},
-    { 352,  288,  25,    2   },
-    { 704,  480,  15000, 1001},
-    { 704,  576,  25,    2   },
-    { 720,  480,  30000, 1001},
-    { 720,  576,  25,    1   },
-    { 1280, 720,  60000, 1001},
-    { 1280, 720,  50,    1   },
-    { 1920, 1080, 30000, 1001},
-    { 1920, 1080, 25,    1   },
-    { 1920, 1080, 60000, 1001},
-    { 1920, 1080, 50,    1   },
-    { 2048, 1080, 24,    1   },
-    { 4096, 2160, 24,    1   },
-};
-
-static unsigned int get_video_format_idx(AVCodecContext *avctx)
-{
-    unsigned int ret_idx = 0;
-    unsigned int idx;
-    unsigned int num_formats = sizeof(ff_schro_video_format_info) /
-                               sizeof(ff_schro_video_format_info[0]);
-
-    for (idx = 1; idx < num_formats; ++idx) {
-        const SchroVideoFormatInfo *vf = &ff_schro_video_format_info[idx];
-        if (avctx->width  == vf->width &&
-            avctx->height == vf->height) {
-            ret_idx = idx;
-            if (avctx->time_base.den == vf->frame_rate_num &&
-                avctx->time_base.num == vf->frame_rate_denom)
-                return idx;
-        }
-    }
-    return ret_idx;
-}
-
-av_cold void ff_schro_queue_init(FFSchroQueue *queue)
-{
-    queue->p_head = queue->p_tail = NULL;
-    queue->size = 0;
-}
-
-void ff_schro_queue_free(FFSchroQueue *queue, void (*free_func)(void *))
-{
-    while (queue->p_head)
-        free_func(ff_schro_queue_pop(queue));
-}
-
-int ff_schro_queue_push_back(FFSchroQueue *queue, void *p_data)
-{
-    FFSchroQueueElement *p_new = av_mallocz(sizeof(FFSchroQueueElement));
-
-    if (!p_new)
-        return -1;
-
-    p_new->data = p_data;
-
-    if (!queue->p_head)
-        queue->p_head = p_new;
-    else
-        queue->p_tail->next = p_new;
-    queue->p_tail = p_new;
-
-    ++queue->size;
-    return 0;
-}
-
-void *ff_schro_queue_pop(FFSchroQueue *queue)
-{
-    FFSchroQueueElement *top = queue->p_head;
-
-    if (top) {
-        void *data = top->data;
-        queue->p_head = queue->p_head->next;
-        --queue->size;
-        av_freep(&top);
-        return data;
-    }
-
-    return NULL;
-}
-
-/**
-* Schroedinger video preset table. Ensure that this tables matches up correctly
-* with the ff_schro_video_format_info table.
-*/
-static const SchroVideoFormatEnum ff_schro_video_formats[]={
-    SCHRO_VIDEO_FORMAT_CUSTOM     ,
-    SCHRO_VIDEO_FORMAT_QSIF       ,
-    SCHRO_VIDEO_FORMAT_QCIF       ,
-    SCHRO_VIDEO_FORMAT_SIF        ,
-    SCHRO_VIDEO_FORMAT_CIF        ,
-    SCHRO_VIDEO_FORMAT_4SIF       ,
-    SCHRO_VIDEO_FORMAT_4CIF       ,
-    SCHRO_VIDEO_FORMAT_SD480I_60  ,
-    SCHRO_VIDEO_FORMAT_SD576I_50  ,
-    SCHRO_VIDEO_FORMAT_HD720P_60  ,
-    SCHRO_VIDEO_FORMAT_HD720P_50  ,
-    SCHRO_VIDEO_FORMAT_HD1080I_60 ,
-    SCHRO_VIDEO_FORMAT_HD1080I_50 ,
-    SCHRO_VIDEO_FORMAT_HD1080P_60 ,
-    SCHRO_VIDEO_FORMAT_HD1080P_50 ,
-    SCHRO_VIDEO_FORMAT_DC2K_24    ,
-    SCHRO_VIDEO_FORMAT_DC4K_24    ,
-};
-
-SchroVideoFormatEnum ff_get_schro_video_format_preset(AVCodecContext *avctx)
-{
-    unsigned int num_formats = sizeof(ff_schro_video_formats) /
-                               sizeof(ff_schro_video_formats[0]);
-
-    unsigned int idx = get_video_format_idx(avctx);
-
-    return (idx < num_formats) ? ff_schro_video_formats[idx] :
-                                 SCHRO_VIDEO_FORMAT_CUSTOM;
-}
-
-int ff_get_schro_frame_format (SchroChromaFormat schro_pix_fmt,
-                               SchroFrameFormat  *schro_frame_fmt)
-{
-    unsigned int num_formats = sizeof(schro_pixel_format_map) /
-                               sizeof(schro_pixel_format_map[0]);
-
-    int idx;
-
-    for (idx = 0; idx < num_formats; ++idx) {
-        if (schro_pixel_format_map[idx].schro_pix_fmt == schro_pix_fmt) {
-            *schro_frame_fmt = schro_pixel_format_map[idx].schro_frame_fmt;
-            return 0;
-        }
-    }
-    return -1;
-}
-
-static void free_schro_frame(SchroFrame *frame, void *priv)
-{
-    AVFrame *p_pic = priv;
-    av_frame_free(&p_pic);
-}
-
-SchroFrame *ff_create_schro_frame(AVCodecContext *avctx,
-                                  SchroFrameFormat schro_frame_fmt)
-{
-    AVFrame *p_pic;
-    SchroFrame *p_frame;
-    int y_width, uv_width;
-    int y_height, uv_height;
-    int i;
-
-    y_width   = avctx->width;
-    y_height  = avctx->height;
-    uv_width  = y_width  >> (SCHRO_FRAME_FORMAT_H_SHIFT(schro_frame_fmt));
-    uv_height = y_height >> (SCHRO_FRAME_FORMAT_V_SHIFT(schro_frame_fmt));
-
-    p_pic = av_frame_alloc();
-    if (!p_pic)
-        return NULL;
-
-    if (ff_get_buffer(avctx, p_pic, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_frame_free(&p_pic);
-        av_log(avctx, AV_LOG_ERROR, "Unable to allocate buffer\n");
-        return NULL;
-    }
-
-    p_frame         = schro_frame_new();
-    p_frame->format = schro_frame_fmt;
-    p_frame->width  = y_width;
-    p_frame->height = y_height;
-    schro_frame_set_free_callback(p_frame, free_schro_frame, p_pic);
-
-    for (i = 0; i < 3; ++i) {
-        p_frame->components[i].width  = i ? uv_width : y_width;
-        p_frame->components[i].stride = p_pic->linesize[i];
-        p_frame->components[i].height = i ? uv_height : y_height;
-        p_frame->components[i].length =
-                 p_frame->components[i].stride * p_frame->components[i].height;
-        p_frame->components[i].data   = p_pic->data[i];
-
-        if (i) {
-            p_frame->components[i].v_shift =
-                SCHRO_FRAME_FORMAT_V_SHIFT(p_frame->format);
-            p_frame->components[i].h_shift =
-                SCHRO_FRAME_FORMAT_H_SHIFT(p_frame->format);
-        }
-    }
-
-    return p_frame;
-}
diff --git a/libavcodec/libschroedinger.h b/libavcodec/libschroedinger.h
deleted file mode 100644
index 5481f92..0000000
--- a/libavcodec/libschroedinger.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
-* @file
-* data structures common to libschroedinger decoder and encoder
-*/
-
-#ifndef AVCODEC_LIBSCHROEDINGER_H
-#define AVCODEC_LIBSCHROEDINGER_H
-
-#include <schroedinger/schrobitstream.h>
-#include <schroedinger/schroframe.h>
-
-#include "avcodec.h"
-
-typedef struct SchroVideoFormatInfo {
-    uint16_t width;
-    uint16_t height;
-    uint16_t frame_rate_num;
-    uint16_t frame_rate_denom;
-} SchroVideoFormatInfo;
-
-/**
-* contains a single encoded frame returned from Dirac or Schroedinger
-*/
-typedef struct FFSchroEncodedFrame {
-    /** encoded frame data */
-    uint8_t *p_encbuf;
-
-    /** encoded frame size */
-    uint32_t size;
-
-    /** encoded frame number. Will be used as pts */
-    uint32_t frame_num;
-
-    /** key frame flag. 1 : is key frame , 0 : in not key frame */
-    uint16_t key_frame;
-} FFSchroEncodedFrame;
-
-/**
-* queue element
-*/
-typedef struct FFSchroQueueElement {
-    /** Data to be stored in queue*/
-    void *data;
-    /** Pointer to next element queue */
-    struct FFSchroQueueElement *next;
-} FFSchroQueueElement;
-
-
-/**
-* A simple queue implementation used in libschroedinger
-*/
-typedef struct FFSchroQueue {
-    /** Pointer to head of queue */
-    FFSchroQueueElement *p_head;
-    /** Pointer to tail of queue */
-    FFSchroQueueElement *p_tail;
-    /** Queue size*/
-    int size;
-} FFSchroQueue;
-
-/**
-* Initialise the queue
-*/
-void ff_schro_queue_init(FFSchroQueue *queue);
-
-/**
-* Add an element to the end of the queue
-*/
-int ff_schro_queue_push_back(FFSchroQueue *queue, void *p_data);
-
-/**
-* Return the first element in the queue
-*/
-void *ff_schro_queue_pop(FFSchroQueue *queue);
-
-/**
-* Free the queue resources. free_func is a function supplied by the caller to
-* free any resources allocated by the caller. The data field of the queue
-* element is passed to it.
-*/
-void ff_schro_queue_free(FFSchroQueue *queue, void (*free_func)(void *));
-
-static const struct {
-    enum AVPixelFormat  ff_pix_fmt;
-    SchroChromaFormat schro_pix_fmt;
-    SchroFrameFormat  schro_frame_fmt;
-} schro_pixel_format_map[] = {
-    { AV_PIX_FMT_YUV420P, SCHRO_CHROMA_420, SCHRO_FRAME_FORMAT_U8_420 },
-    { AV_PIX_FMT_YUV422P, SCHRO_CHROMA_422, SCHRO_FRAME_FORMAT_U8_422 },
-    { AV_PIX_FMT_YUV444P, SCHRO_CHROMA_444, SCHRO_FRAME_FORMAT_U8_444 },
-};
-
-/**
-* Returns the video format preset matching the input video dimensions and
-* time base.
-*/
-SchroVideoFormatEnum ff_get_schro_video_format_preset (AVCodecContext *avctx);
-
-/**
-* Sets the Schroedinger frame format corresponding to the Schro chroma format
-* passed. Returns 0 on success, -1 on failure.
-*/
-int ff_get_schro_frame_format(SchroChromaFormat schro_chroma_fmt,
-                              SchroFrameFormat  *schro_frame_fmt);
-
-/**
-* Create a Schro frame based on the dimensions and frame format
-* passed. Returns a pointer to a frame on success, NULL on failure.
-*/
-SchroFrame *ff_create_schro_frame(AVCodecContext *avctx,
-                                  SchroFrameFormat schro_frame_fmt);
-
-#endif /* AVCODEC_LIBSCHROEDINGER_H */
diff --git a/libavcodec/libschroedingerdec.c b/libavcodec/libschroedingerdec.c
deleted file mode 100644
index 246ac48..0000000
--- a/libavcodec/libschroedingerdec.c
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * Dirac decoder support via Schroedinger libraries
- * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
-* @file
-* Dirac decoder support via libschroedinger-1.0 libraries. More details about
-* the Schroedinger project can be found at http://www.diracvideo.org/.
-* The library implements Dirac Specification Version 2.2.
-* (http://dirac.sourceforge.net/specification.html).
-*/
-
-#include <string.h>
-
-#include "libavutil/imgutils.h"
-#include "libavutil/internal.h"
-#include "libavutil/intreadwrite.h"
-#include "libavutil/mem.h"
-#include "avcodec.h"
-#include "internal.h"
-#include "libschroedinger.h"
-
-#include <schroedinger/schro.h>
-#include <schroedinger/schrodebug.h>
-#include <schroedinger/schrovideoformat.h>
-
-/** SchroFrame and Pts relation */
-typedef struct LibSchroFrameContext {
-     SchroFrame *frame;
-     int64_t pts;
-} LibSchroFrameContext;
-
-/** libschroedinger decoder private data */
-typedef struct SchroDecoderParams {
-    /** Schroedinger video format */
-    SchroVideoFormat *format;
-
-    /** Schroedinger frame format */
-    SchroFrameFormat frame_format;
-
-    /** decoder handle */
-    SchroDecoder* decoder;
-
-    /** queue storing decoded frames */
-    FFSchroQueue dec_frame_queue;
-
-    /** end of sequence signalled */
-    int eos_signalled;
-
-    /** end of sequence pulled */
-    int eos_pulled;
-} SchroDecoderParams;
-
-typedef struct SchroParseUnitContext {
-    const uint8_t *buf;
-    int           buf_size;
-} SchroParseUnitContext;
-
-
-static void libschroedinger_decode_buffer_free(SchroBuffer *schro_buf,
-                                               void *priv)
-{
-    av_freep(&priv);
-}
-
-static void parse_context_init(SchroParseUnitContext *parse_ctx,
-                               const uint8_t *buf, int buf_size)
-{
-    parse_ctx->buf           = buf;
-    parse_ctx->buf_size      = buf_size;
-}
-
-static SchroBuffer *find_next_parse_unit(SchroParseUnitContext *parse_ctx)
-{
-    SchroBuffer *enc_buf = NULL;
-    int next_pu_offset = 0;
-    unsigned char *in_buf;
-
-    if (parse_ctx->buf_size < 13 ||
-        parse_ctx->buf[0] != 'B' ||
-        parse_ctx->buf[1] != 'B' ||
-        parse_ctx->buf[2] != 'C' ||
-        parse_ctx->buf[3] != 'D')
-        return NULL;
-
-    next_pu_offset = (parse_ctx->buf[5] << 24) +
-                     (parse_ctx->buf[6] << 16) +
-                     (parse_ctx->buf[7] <<  8) +
-                      parse_ctx->buf[8];
-
-    if (next_pu_offset == 0 &&
-        SCHRO_PARSE_CODE_IS_END_OF_SEQUENCE(parse_ctx->buf[4]))
-        next_pu_offset = 13;
-
-    if (next_pu_offset <= 0 || parse_ctx->buf_size < next_pu_offset)
-        return NULL;
-
-    in_buf = av_malloc(next_pu_offset);
-    if (!in_buf) {
-        av_log(parse_ctx, AV_LOG_ERROR, "Unable to allocate input buffer\n");
-        return NULL;
-    }
-
-    memcpy(in_buf, parse_ctx->buf, next_pu_offset);
-    enc_buf       = schro_buffer_new_with_data(in_buf, next_pu_offset);
-    enc_buf->free = libschroedinger_decode_buffer_free;
-    enc_buf->priv = in_buf;
-
-    parse_ctx->buf      += next_pu_offset;
-    parse_ctx->buf_size -= next_pu_offset;
-
-    return enc_buf;
-}
-
-/**
-* Returns Libav chroma format.
-*/
-static enum AVPixelFormat get_chroma_format(SchroChromaFormat schro_pix_fmt)
-{
-    int num_formats = sizeof(schro_pixel_format_map) /
-                      sizeof(schro_pixel_format_map[0]);
-    int idx;
-
-    for (idx = 0; idx < num_formats; ++idx)
-        if (schro_pixel_format_map[idx].schro_pix_fmt == schro_pix_fmt)
-            return schro_pixel_format_map[idx].ff_pix_fmt;
-    return AV_PIX_FMT_NONE;
-}
-
-static av_cold int libschroedinger_decode_init(AVCodecContext *avctx)
-{
-
-    SchroDecoderParams *p_schro_params = avctx->priv_data;
-    /* First of all, initialize our supporting libraries. */
-    schro_init();
-
-    schro_debug_set_level(avctx->debug);
-    p_schro_params->decoder = schro_decoder_new();
-    schro_decoder_set_skip_ratio(p_schro_params->decoder, 1);
-
-    if (!p_schro_params->decoder)
-        return -1;
-
-    /* Initialize the decoded frame queue. */
-    ff_schro_queue_init(&p_schro_params->dec_frame_queue);
-    return 0;
-}
-
-static void libschroedinger_decode_frame_free(void *frame)
-{
-    schro_frame_unref(frame);
-}
-
-static void libschroedinger_handle_first_access_unit(AVCodecContext *avctx)
-{
-    SchroDecoderParams *p_schro_params = avctx->priv_data;
-    SchroDecoder *decoder = p_schro_params->decoder;
-
-    p_schro_params->format = schro_decoder_get_video_format(decoder);
-
-    /* Tell Libav about sequence details. */
-    if (av_image_check_size(p_schro_params->format->width,
-                            p_schro_params->format->height, 0, avctx) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "invalid dimensions (%dx%d)\n",
-               p_schro_params->format->width, p_schro_params->format->height);
-        avctx->height = avctx->width = 0;
-        return;
-    }
-    avctx->height  = p_schro_params->format->height;
-    avctx->width   = p_schro_params->format->width;
-    avctx->pix_fmt = get_chroma_format(p_schro_params->format->chroma_format);
-
-    if (ff_get_schro_frame_format(p_schro_params->format->chroma_format,
-                                  &p_schro_params->frame_format) == -1) {
-        av_log(avctx, AV_LOG_ERROR,
-               "This codec currently only supports planar YUV 4:2:0, 4:2:2 "
-               "and 4:4:4 formats.\n");
-        return;
-    }
-
-    avctx->framerate.num = p_schro_params->format->frame_rate_numerator;
-    avctx->framerate.den = p_schro_params->format->frame_rate_denominator;
-}
-
-static int libschroedinger_decode_frame(AVCodecContext *avctx,
-                                        void *data, int *got_frame,
-                                        AVPacket *avpkt)
-{
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
-    int64_t pts  = avpkt->pts;
-    SchroTag *tag;
-
-    SchroDecoderParams *p_schro_params = avctx->priv_data;
-    SchroDecoder *decoder = p_schro_params->decoder;
-    SchroBuffer *enc_buf;
-    SchroFrame* frame;
-    AVFrame *avframe = data;
-    int state;
-    int go = 1;
-    int outer = 1;
-    SchroParseUnitContext parse_ctx;
-    LibSchroFrameContext *framewithpts = NULL;
-    int ret;
-
-    *got_frame = 0;
-
-    parse_context_init(&parse_ctx, buf, buf_size);
-    if (!buf_size) {
-        if (!p_schro_params->eos_signalled) {
-            state = schro_decoder_push_end_of_stream(decoder);
-            p_schro_params->eos_signalled = 1;
-        }
-    }
-
-    /* Loop through all the individual parse units in the input buffer */
-    do {
-        if ((enc_buf = find_next_parse_unit(&parse_ctx))) {
-            /* Set Schrotag with the pts to be recovered after decoding*/
-            enc_buf->tag = schro_tag_new(av_malloc(sizeof(int64_t)), av_free);
-            if (!enc_buf->tag->value) {
-                av_log(avctx, AV_LOG_ERROR, "Unable to allocate SchroTag\n");
-                return AVERROR(ENOMEM);
-            }
-            AV_WN(64, enc_buf->tag->value, pts);
-            /* Push buffer into decoder. */
-            if (SCHRO_PARSE_CODE_IS_PICTURE(enc_buf->data[4]) &&
-                SCHRO_PARSE_CODE_NUM_REFS(enc_buf->data[4]) > 0)
-                avctx->has_b_frames = 1;
-            state = schro_decoder_push(decoder, enc_buf);
-            if (state == SCHRO_DECODER_FIRST_ACCESS_UNIT)
-                libschroedinger_handle_first_access_unit(avctx);
-            go = 1;
-        } else
-            outer = 0;
-
-        while (go) {
-            /* Parse data and process result. */
-            state = schro_decoder_wait(decoder);
-            switch (state) {
-            case SCHRO_DECODER_FIRST_ACCESS_UNIT:
-                libschroedinger_handle_first_access_unit(avctx);
-                break;
-
-            case SCHRO_DECODER_NEED_BITS:
-                /* Need more input data - stop iterating over what we have. */
-                go = 0;
-                break;
-
-            case SCHRO_DECODER_NEED_FRAME:
-                /* Decoder needs a frame - create one and push it in. */
-                frame = ff_create_schro_frame(avctx,
-                                              p_schro_params->frame_format);
-                if (!frame)
-                    return AVERROR(ENOMEM);
-                schro_decoder_add_output_picture(decoder, frame);
-                break;
-
-            case SCHRO_DECODER_OK:
-                /* Pull a frame out of the decoder. */
-                tag   = schro_decoder_get_picture_tag(decoder);
-                frame = schro_decoder_pull(decoder);
-
-                if (frame) {
-                    /* Add relation between schroframe and pts. */
-                    framewithpts = av_malloc(sizeof(LibSchroFrameContext));
-                    if (!framewithpts) {
-                        av_log(avctx, AV_LOG_ERROR, "Unable to allocate FrameWithPts\n");
-                        return AVERROR(ENOMEM);
-                    }
-                    framewithpts->frame = frame;
-                    framewithpts->pts   = AV_RN64(tag->value);
-                    ff_schro_queue_push_back(&p_schro_params->dec_frame_queue,
-                                             framewithpts);
-                }
-                break;
-            case SCHRO_DECODER_EOS:
-                go = 0;
-                p_schro_params->eos_pulled = 1;
-                schro_decoder_reset(decoder);
-                outer = 0;
-                break;
-
-            case SCHRO_DECODER_ERROR:
-                return -1;
-                break;
-            }
-        }
-    } while (outer);
-
-    /* Grab next frame to be returned from the top of the queue. */
-    framewithpts = ff_schro_queue_pop(&p_schro_params->dec_frame_queue);
-
-    if (framewithpts && framewithpts->frame && framewithpts->frame->components[0].stride) {
-        if ((ret = ff_get_buffer(avctx, avframe, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Unable to allocate buffer\n");
-            goto end;
-        }
-
-        memcpy(avframe->data[0],
-               framewithpts->frame->components[0].data,
-               framewithpts->frame->components[0].length);
-
-        memcpy(avframe->data[1],
-               framewithpts->frame->components[1].data,
-               framewithpts->frame->components[1].length);
-
-        memcpy(avframe->data[2],
-               framewithpts->frame->components[2].data,
-               framewithpts->frame->components[2].length);
-
-        /* Fill frame with current buffer data from Schroedinger. */
-        avframe->pts = framewithpts->pts;
-#if FF_API_PKT_PTS
-FF_DISABLE_DEPRECATION_WARNINGS
-        avframe->pkt_pts = avframe->pts;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-        avframe->linesize[0] = framewithpts->frame->components[0].stride;
-        avframe->linesize[1] = framewithpts->frame->components[1].stride;
-        avframe->linesize[2] = framewithpts->frame->components[2].stride;
-
-        *got_frame      = 1;
-    } else {
-        data       = NULL;
-        *got_frame = 0;
-    }
-    ret = buf_size;
-end:
-    /* Now free the frame resources. */
-    if (framewithpts && framewithpts->frame)
-        libschroedinger_decode_frame_free(framewithpts->frame);
-    av_freep(&framewithpts);
-    return ret;
-}
-
-
-static av_cold int libschroedinger_decode_close(AVCodecContext *avctx)
-{
-    SchroDecoderParams *p_schro_params = avctx->priv_data;
-    /* Free the decoder. */
-    schro_decoder_free(p_schro_params->decoder);
-    av_freep(&p_schro_params->format);
-
-    /* Free data in the output frame queue. */
-    ff_schro_queue_free(&p_schro_params->dec_frame_queue,
-                        libschroedinger_decode_frame_free);
-
-    return 0;
-}
-
-static void libschroedinger_flush(AVCodecContext *avctx)
-{
-    /* Got a seek request. Free the decoded frames queue and then reset
-     * the decoder */
-    SchroDecoderParams *p_schro_params = avctx->priv_data;
-
-    /* Free data in the output frame queue. */
-    ff_schro_queue_free(&p_schro_params->dec_frame_queue,
-                        libschroedinger_decode_frame_free);
-
-    ff_schro_queue_init(&p_schro_params->dec_frame_queue);
-    schro_decoder_reset(p_schro_params->decoder);
-    p_schro_params->eos_pulled = 0;
-    p_schro_params->eos_signalled = 0;
-}
-
-AVCodec ff_libschroedinger_decoder = {
-    .name           = "libschroedinger",
-    .long_name      = NULL_IF_CONFIG_SMALL("libschroedinger Dirac 2.2"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_DIRAC,
-    .priv_data_size = sizeof(SchroDecoderParams),
-    .init           = libschroedinger_decode_init,
-    .close          = libschroedinger_decode_close,
-    .decode         = libschroedinger_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
-    .flush          = libschroedinger_flush,
-    .wrapper_name   = "libschroedinger",
-};
diff --git a/libavcodec/libschroedingerenc.c b/libavcodec/libschroedingerenc.c
deleted file mode 100644
index 4a1be94..0000000
--- a/libavcodec/libschroedingerenc.c
+++ /dev/null
@@ -1,485 +0,0 @@
-/*
- * Dirac encoder support via Schroedinger libraries
- * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
-* @file
-* Dirac encoder support via libschroedinger-1.0 libraries. More details about
-* the Schroedinger project can be found at http://www.diracvideo.org/.
-* The library implements Dirac Specification Version 2.2
-* (http://dirac.sourceforge.net/specification.html).
-*/
-
-#include <schroedinger/schro.h>
-#include <schroedinger/schrodebug.h>
-#include <schroedinger/schrovideoformat.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/imgutils.h"
-#include "libavutil/opt.h"
-
-#include "avcodec.h"
-#include "internal.h"
-#include "libschroedinger.h"
-#include "bytestream.h"
-
-
-/** libschroedinger encoder private data */
-typedef struct SchroEncoderParams {
-    /** Schroedinger video format */
-    SchroVideoFormat *format;
-
-    /** Schroedinger frame format */
-    SchroFrameFormat frame_format;
-
-    /** frame size */
-    int frame_size;
-
-    /** Schroedinger encoder handle*/
-    SchroEncoder* encoder;
-
-    /** buffer to store encoder output before writing it to the frame queue*/
-    unsigned char *enc_buf;
-
-    /** Size of encoder buffer*/
-    int enc_buf_size;
-
-    /** queue storing encoded frames */
-    FFSchroQueue enc_frame_queue;
-
-    /** end of sequence signalled */
-    int eos_signalled;
-
-    /** end of sequence pulled */
-    int eos_pulled;
-
-    /* counter for frames submitted to encoder, used as dts */
-    int64_t dts;
-
-    /** enable noarith */
-    int noarith;
-} SchroEncoderParams;
-
-/**
-* Works out Schro-compatible chroma format.
-*/
-static int set_chroma_format(AVCodecContext *avctx)
-{
-    int num_formats = sizeof(schro_pixel_format_map) /
-                      sizeof(schro_pixel_format_map[0]);
-    int idx;
-
-    SchroEncoderParams *p_schro_params = avctx->priv_data;
-
-    for (idx = 0; idx < num_formats; ++idx) {
-        if (schro_pixel_format_map[idx].ff_pix_fmt == avctx->pix_fmt) {
-            p_schro_params->format->chroma_format =
-                            schro_pixel_format_map[idx].schro_pix_fmt;
-            return 0;
-        }
-    }
-
-    av_log(avctx, AV_LOG_ERROR,
-           "This codec currently only supports planar YUV 4:2:0, 4:2:2"
-           " and 4:4:4 formats.\n");
-
-    return -1;
-}
-
-static av_cold int libschroedinger_encode_init(AVCodecContext *avctx)
-{
-    SchroEncoderParams *p_schro_params = avctx->priv_data;
-    SchroVideoFormatEnum preset;
-
-    /* Initialize the libraries that libschroedinger depends on. */
-    schro_init();
-
-    /* Create an encoder object. */
-    p_schro_params->encoder = schro_encoder_new();
-
-    if (!p_schro_params->encoder) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Unrecoverable Error: schro_encoder_new failed. ");
-        return -1;
-    }
-
-    /* Initialize the format. */
-    preset = ff_get_schro_video_format_preset(avctx);
-    p_schro_params->format =
-                    schro_encoder_get_video_format(p_schro_params->encoder);
-    schro_video_format_set_std_video_format(p_schro_params->format, preset);
-    p_schro_params->format->width  = avctx->width;
-    p_schro_params->format->height = avctx->height;
-
-    if (set_chroma_format(avctx) == -1)
-        return -1;
-
-    if (avctx->color_primaries == AVCOL_PRI_BT709) {
-        p_schro_params->format->colour_primaries = SCHRO_COLOUR_PRIMARY_HDTV;
-    } else if (avctx->color_primaries == AVCOL_PRI_BT470BG) {
-        p_schro_params->format->colour_primaries = SCHRO_COLOUR_PRIMARY_SDTV_625;
-    } else if (avctx->color_primaries == AVCOL_PRI_SMPTE170M) {
-        p_schro_params->format->colour_primaries = SCHRO_COLOUR_PRIMARY_SDTV_525;
-    }
-
-    if (avctx->colorspace == AVCOL_SPC_BT709) {
-        p_schro_params->format->colour_matrix = SCHRO_COLOUR_MATRIX_HDTV;
-    } else if (avctx->colorspace == AVCOL_SPC_BT470BG) {
-        p_schro_params->format->colour_matrix = SCHRO_COLOUR_MATRIX_SDTV;
-    }
-
-    if (avctx->color_trc == AVCOL_TRC_BT709) {
-        p_schro_params->format->transfer_function = SCHRO_TRANSFER_CHAR_TV_GAMMA;
-    }
-
-    if (ff_get_schro_frame_format(p_schro_params->format->chroma_format,
-                                  &p_schro_params->frame_format) == -1) {
-        av_log(avctx, AV_LOG_ERROR,
-               "This codec currently supports only planar YUV 4:2:0, 4:2:2"
-               " and 4:4:4 formats.\n");
-        return -1;
-    }
-
-    p_schro_params->format->frame_rate_numerator   = avctx->time_base.den;
-    p_schro_params->format->frame_rate_denominator = avctx->time_base.num;
-
-    p_schro_params->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
-                                                          avctx->width,
-                                                          avctx->height, 1);
-
-    if (!avctx->gop_size) {
-        schro_encoder_setting_set_double(p_schro_params->encoder,
-                                         "gop_structure",
-                                         SCHRO_ENCODER_GOP_INTRA_ONLY);
-
-#if FF_API_CODER_TYPE
-FF_DISABLE_DEPRECATION_WARNINGS
-        if (avctx->coder_type != FF_CODER_TYPE_VLC)
-            p_schro_params->noarith = 0;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-        schro_encoder_setting_set_double(p_schro_params->encoder,
-                                         "enable_noarith",
-                                         p_schro_params->noarith);
-    } else {
-        schro_encoder_setting_set_double(p_schro_params->encoder,
-                                         "au_distance", avctx->gop_size);
-        avctx->has_b_frames = 1;
-        p_schro_params->dts = -1;
-    }
-
-    /* FIXME - Need to handle SCHRO_ENCODER_RATE_CONTROL_LOW_DELAY. */
-    if (avctx->flags & AV_CODEC_FLAG_QSCALE) {
-        if (!avctx->global_quality) {
-            /* lossless coding */
-            schro_encoder_setting_set_double(p_schro_params->encoder,
-                                             "rate_control",
-                                             SCHRO_ENCODER_RATE_CONTROL_LOSSLESS);
-        } else {
-            int quality;
-            schro_encoder_setting_set_double(p_schro_params->encoder,
-                                             "rate_control",
-                                             SCHRO_ENCODER_RATE_CONTROL_CONSTANT_QUALITY);
-
-            quality = avctx->global_quality / FF_QP2LAMBDA;
-            if (quality > 10)
-                quality = 10;
-            schro_encoder_setting_set_double(p_schro_params->encoder,
-                                             "quality", quality);
-        }
-    } else {
-        schro_encoder_setting_set_double(p_schro_params->encoder,
-                                         "rate_control",
-                                         SCHRO_ENCODER_RATE_CONTROL_CONSTANT_BITRATE);
-
-        schro_encoder_setting_set_double(p_schro_params->encoder,
-                                         "bitrate", avctx->bit_rate);
-    }
-
-    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_ME)
-        /* All material can be coded as interlaced or progressive
-           irrespective of the type of source material. */
-        schro_encoder_setting_set_double(p_schro_params->encoder,
-                                         "interlaced_coding", 1);
-
-    schro_encoder_setting_set_double(p_schro_params->encoder, "open_gop",
-                                     !(avctx->flags & AV_CODEC_FLAG_CLOSED_GOP));
-
-    /* FIXME: Signal range hardcoded to 8-bit data until both libschroedinger
-     * and libdirac support other bit-depth data. */
-    schro_video_format_set_std_signal_range(p_schro_params->format,
-                                            SCHRO_SIGNAL_RANGE_8BIT_VIDEO);
-
-    /* Set the encoder format. */
-    schro_encoder_set_video_format(p_schro_params->encoder,
-                                   p_schro_params->format);
-
-    /* Set the debug level. */
-    schro_debug_set_level(avctx->debug);
-
-    schro_encoder_start(p_schro_params->encoder);
-
-    /* Initialize the encoded frame queue. */
-    ff_schro_queue_init(&p_schro_params->enc_frame_queue);
-    return 0;
-}
-
-static SchroFrame *libschroedinger_frame_from_data(AVCodecContext *avctx,
-                                                   const AVFrame *frame)
-{
-    SchroEncoderParams *p_schro_params = avctx->priv_data;
-    SchroFrame *in_frame = ff_create_schro_frame(avctx,
-                                                 p_schro_params->frame_format);
-
-    if (in_frame) {
-        /* Copy input data to SchroFrame buffers (they match the ones
-         * referenced by the AVFrame stored in priv) */
-        if (av_frame_copy(in_frame->priv, frame) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Failed to copy input data\n");
-            return NULL;
-        }
-    }
-
-    return in_frame;
-}
-
-static void libschroedinger_free_frame(void *data)
-{
-    FFSchroEncodedFrame *enc_frame = data;
-
-    av_freep(&enc_frame->p_encbuf);
-    av_free(enc_frame);
-}
-
-static int libschroedinger_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                                        const AVFrame *frame, int *got_packet)
-{
-    int enc_size = 0;
-    SchroEncoderParams *p_schro_params = avctx->priv_data;
-    SchroEncoder *encoder = p_schro_params->encoder;
-    struct FFSchroEncodedFrame *p_frame_output = NULL;
-    int go = 1;
-    SchroBuffer *enc_buf;
-    int presentation_frame;
-    int parse_code;
-    int last_frame_in_sequence = 0;
-    int pkt_size, ret;
-
-    if (!frame) {
-        /* Push end of sequence if not already signalled. */
-        if (!p_schro_params->eos_signalled) {
-            schro_encoder_end_of_stream(encoder);
-            p_schro_params->eos_signalled = 1;
-        }
-    } else {
-        /* Allocate frame data to schro input buffer. */
-        SchroFrame *in_frame = libschroedinger_frame_from_data(avctx, frame);
-        if (!in_frame)
-            return AVERROR(ENOMEM);
-        /* Load next frame. */
-        schro_encoder_push_frame(encoder, in_frame);
-    }
-
-    if (p_schro_params->eos_pulled)
-        go = 0;
-
-    /* Now check to see if we have any output from the encoder. */
-    while (go) {
-        int err;
-        SchroStateEnum state;
-        state = schro_encoder_wait(encoder);
-        switch (state) {
-        case SCHRO_STATE_HAVE_BUFFER:
-        case SCHRO_STATE_END_OF_STREAM:
-            enc_buf = schro_encoder_pull(encoder, &presentation_frame);
-            if (enc_buf->length <= 0)
-                return AVERROR_BUG;
-            parse_code = enc_buf->data[4];
-
-            /* All non-frame data is prepended to actual frame data to
-             * be able to set the pts correctly. So we don't write data
-             * to the frame output queue until we actually have a frame
-             */
-            if ((err = av_reallocp(&p_schro_params->enc_buf,
-                                   p_schro_params->enc_buf_size +
-                                   enc_buf->length)) < 0) {
-                p_schro_params->enc_buf_size = 0;
-                return err;
-            }
-
-            memcpy(p_schro_params->enc_buf + p_schro_params->enc_buf_size,
-                   enc_buf->data, enc_buf->length);
-            p_schro_params->enc_buf_size += enc_buf->length;
-
-
-            if (state == SCHRO_STATE_END_OF_STREAM) {
-                p_schro_params->eos_pulled = 1;
-                go = 0;
-            }
-
-            if (!SCHRO_PARSE_CODE_IS_PICTURE(parse_code)) {
-                schro_buffer_unref(enc_buf);
-                break;
-            }
-
-            /* Create output frame. */
-            p_frame_output = av_mallocz(sizeof(FFSchroEncodedFrame));
-            if (!p_frame_output)
-                return AVERROR(ENOMEM);
-            /* Set output data. */
-            p_frame_output->size     = p_schro_params->enc_buf_size;
-            p_frame_output->p_encbuf = p_schro_params->enc_buf;
-            if (SCHRO_PARSE_CODE_IS_INTRA(parse_code) &&
-                SCHRO_PARSE_CODE_IS_REFERENCE(parse_code))
-                p_frame_output->key_frame = 1;
-
-            /* Parse the coded frame number from the bitstream. Bytes 14
-             * through 17 represent the frame number. */
-            p_frame_output->frame_num = AV_RB32(enc_buf->data + 13);
-
-            ff_schro_queue_push_back(&p_schro_params->enc_frame_queue,
-                                     p_frame_output);
-            p_schro_params->enc_buf_size = 0;
-            p_schro_params->enc_buf      = NULL;
-
-            schro_buffer_unref(enc_buf);
-
-            break;
-
-        case SCHRO_STATE_NEED_FRAME:
-            go = 0;
-            break;
-
-        case SCHRO_STATE_AGAIN:
-            break;
-
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unknown Schro Encoder state\n");
-            return -1;
-        }
-    }
-
-    /* Copy 'next' frame in queue. */
-
-    if (p_schro_params->enc_frame_queue.size == 1 &&
-        p_schro_params->eos_pulled)
-        last_frame_in_sequence = 1;
-
-    p_frame_output = ff_schro_queue_pop(&p_schro_params->enc_frame_queue);
-
-    if (!p_frame_output)
-        return 0;
-
-    pkt_size = p_frame_output->size;
-    if (last_frame_in_sequence && p_schro_params->enc_buf_size > 0)
-        pkt_size += p_schro_params->enc_buf_size;
-    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", pkt_size);
-        goto error;
-    }
-
-    memcpy(pkt->data, p_frame_output->p_encbuf, p_frame_output->size);
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = p_frame_output->key_frame;
-    avctx->coded_frame->pts = p_frame_output->frame_num;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-    /* Use the frame number of the encoded frame as the pts. It is OK to
-     * do so since Dirac is a constant frame rate codec. It expects input
-     * to be of constant frame rate. */
-    pkt->pts = p_frame_output->frame_num;
-    pkt->dts = p_schro_params->dts++;
-    enc_size = p_frame_output->size;
-
-    /* Append the end of sequence information to the last frame in the
-     * sequence. */
-    if (last_frame_in_sequence && p_schro_params->enc_buf_size > 0) {
-        memcpy(pkt->data + enc_size, p_schro_params->enc_buf,
-               p_schro_params->enc_buf_size);
-        enc_size += p_schro_params->enc_buf_size;
-        av_freep(&p_schro_params->enc_buf);
-        p_schro_params->enc_buf_size = 0;
-    }
-
-    if (p_frame_output->key_frame)
-        pkt->flags |= AV_PKT_FLAG_KEY;
-    *got_packet = 1;
-
-error:
-    /* free frame */
-    libschroedinger_free_frame(p_frame_output);
-    return ret;
-}
-
-
-static int libschroedinger_encode_close(AVCodecContext *avctx)
-{
-    SchroEncoderParams *p_schro_params = avctx->priv_data;
-
-    /* Close the encoder. */
-    schro_encoder_free(p_schro_params->encoder);
-
-    /* Free data in the output frame queue. */
-    ff_schro_queue_free(&p_schro_params->enc_frame_queue,
-                        libschroedinger_free_frame);
-
-
-    /* Free the encoder buffer. */
-    if (p_schro_params->enc_buf_size)
-        av_freep(&p_schro_params->enc_buf);
-
-    /* Free the video format structure. */
-    av_freep(&p_schro_params->format);
-
-    return 0;
-}
-
-#define OFFSET(x) offsetof(SchroEncoderParams, x)
-#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "noarith", "Enable noarith", OFFSET(noarith), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
-
-    { NULL },
-};
-
-static const AVClass libschroedinger_class = {
-    .class_name = "libschroedinger",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_libschroedinger_encoder = {
-    .name           = "libschroedinger",
-    .long_name      = NULL_IF_CONFIG_SMALL("libschroedinger Dirac 2.2"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_DIRAC,
-    .priv_data_size = sizeof(SchroEncoderParams),
-    .priv_class     = &libschroedinger_class,
-    .init           = libschroedinger_encode_init,
-    .encode2        = libschroedinger_encode_frame,
-    .close          = libschroedinger_encode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY,
-    .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE
-    },
-    .wrapper_name   = "libschroedinger",
-};
diff --git a/libavcodec/libshine.c b/libavcodec/libshine.c
new file mode 100644
index 0000000..7056fcd
--- /dev/null
+++ b/libavcodec/libshine.c
@@ -0,0 +1,150 @@
+/*
+ * Interface to libshine for mp3 encoding
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <shine/layer3.h>
+
+#include "libavutil/intreadwrite.h"
+#include "audio_frame_queue.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "mpegaudio.h"
+#include "mpegaudiodecheader.h"
+
+#define BUFFER_SIZE (4096 * 20)
+
+typedef struct SHINEContext {
+    shine_config_t  config;
+    shine_t         shine;
+    uint8_t         buffer[BUFFER_SIZE];
+    int             buffer_index;
+    AudioFrameQueue afq;
+} SHINEContext;
+
+static av_cold int libshine_encode_init(AVCodecContext *avctx)
+{
+    SHINEContext *s = avctx->priv_data;
+
+    if (avctx->channels <= 0 || avctx->channels > 2){
+        av_log(avctx, AV_LOG_ERROR, "only mono or stereo is supported\n");
+        return AVERROR(EINVAL);
+    }
+
+    shine_set_config_mpeg_defaults(&s->config.mpeg);
+    if (avctx->bit_rate)
+        s->config.mpeg.bitr = avctx->bit_rate / 1000;
+    s->config.mpeg.mode = avctx->channels == 2 ? STEREO : MONO;
+    s->config.wave.samplerate = avctx->sample_rate;
+    s->config.wave.channels   = avctx->channels == 2 ? PCM_STEREO : PCM_MONO;
+    if (shine_check_config(s->config.wave.samplerate, s->config.mpeg.bitr) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid configuration\n");
+        return AVERROR(EINVAL);
+    }
+    s->shine = shine_initialise(&s->config);
+    if (!s->shine)
+        return AVERROR(ENOMEM);
+    avctx->frame_size = shine_samples_per_pass(s->shine);
+    ff_af_queue_init(avctx, &s->afq);
+    return 0;
+}
+
+static int libshine_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                 const AVFrame *frame, int *got_packet_ptr)
+{
+    SHINEContext *s = avctx->priv_data;
+    MPADecodeHeader hdr;
+    unsigned char *data;
+    int written;
+    int ret, len;
+
+    if (frame)
+        data = shine_encode_buffer(s->shine, (int16_t **)frame->data, &written);
+    else
+        data = shine_flush(s->shine, &written);
+    if (written < 0)
+        return -1;
+    if (written > 0) {
+        if (s->buffer_index + written > BUFFER_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "internal buffer too small\n");
+            return AVERROR_BUG;
+        }
+        memcpy(s->buffer + s->buffer_index, data, written);
+        s->buffer_index += written;
+    }
+    if (frame) {
+        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
+            return ret;
+    }
+
+    if (s->buffer_index < 4 || !s->afq.frame_count)
+        return 0;
+    if (avpriv_mpegaudio_decode_header(&hdr, AV_RB32(s->buffer))) {
+        av_log(avctx, AV_LOG_ERROR, "free format output not supported\n");
+        return -1;
+    }
+
+    len = hdr.frame_size;
+    if (len <= s->buffer_index) {
+        if ((ret = ff_alloc_packet2(avctx, avpkt, len, 0)))
+            return ret;
+        memcpy(avpkt->data, s->buffer, len);
+        s->buffer_index -= len;
+        memmove(s->buffer, s->buffer + len, s->buffer_index);
+
+        ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
+                           &avpkt->duration);
+
+        avpkt->size = len;
+        *got_packet_ptr = 1;
+    }
+    return 0;
+}
+
+static av_cold int libshine_encode_close(AVCodecContext *avctx)
+{
+    SHINEContext *s = avctx->priv_data;
+
+    ff_af_queue_close(&s->afq);
+    shine_close(s->shine);
+    return 0;
+}
+
+static const int libshine_sample_rates[] = {
+    44100, 48000, 32000, 0
+};
+
+AVCodec ff_libshine_encoder = {
+    .name                  = "libshine",
+    .long_name             = NULL_IF_CONFIG_SMALL("libshine MP3 (MPEG audio layer 3)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP3,
+    .priv_data_size        = sizeof(SHINEContext),
+    .init                  = libshine_encode_init,
+    .encode2               = libshine_encode_frame,
+    .close                 = libshine_encode_close,
+    .capabilities          = AV_CODEC_CAP_DELAY,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16P,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = libshine_sample_rates,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO,
+                                                  0 },
+    .wrapper_name          = "libshine",
+};
diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index e8775fd..d67c68c 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@ typedef struct LibSpeexContext {
     SpeexStereoState stereo;
     void *dec_state;
     int frame_size;
+    int pktsize;
 } LibSpeexContext;
 
 
@@ -43,14 +44,30 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
     SpeexHeader *header = NULL;
     int spx_mode;
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
     if (avctx->extradata && avctx->extradata_size >= 80) {
         header = speex_packet_to_header(avctx->extradata,
                                         avctx->extradata_size);
         if (!header)
             av_log(avctx, AV_LOG_WARNING, "Invalid Speex header\n");
     }
-    if (header) {
+    if (avctx->codec_tag == MKTAG('S', 'P', 'X', 'N')) {
+        int quality;
+        if (!avctx->extradata || avctx->extradata && avctx->extradata_size < 47) {
+            av_log(avctx, AV_LOG_ERROR, "Missing or invalid extradata.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        quality = avctx->extradata[37];
+        if (quality > 10) {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported quality mode %d.\n", quality);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        s->pktsize = ((const int[]){5,10,15,20,20,28,28,38,38,46,62})[quality];
+
+        spx_mode           = 0;
+    } else if (header) {
+        avctx->sample_rate = header->rate;
         avctx->channels    = header->nb_channels;
         spx_mode           = header->mode;
         speex_header_free(header);
@@ -73,8 +90,9 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", spx_mode);
         return AVERROR_INVALIDDATA;
     }
-    avctx->sample_rate = 8000 << spx_mode;
     s->frame_size      =  160 << spx_mode;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000 << spx_mode;
 
     if (avctx->channels < 1 || avctx->channels > 2) {
         /* libspeex can handle mono or stereo if initialized as stereo */
@@ -113,13 +131,12 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame     = data;
     int16_t *output;
     int ret, consumed = 0;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
 
     /* get output buffer */
     frame->nb_samples = s->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output = (int16_t *)frame->data[0];
 
     /* if there is not enough data left for the smallest possible frame or the
@@ -133,9 +150,11 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
             *got_frame_ptr = 0;
             return buf_size;
         }
+        if (s->pktsize && buf_size == 62)
+            buf_size = s->pktsize;
         /* set new buffer */
         speex_bits_read_from(&s->bits, buf, buf_size);
-        consumed = buf_size;
+        consumed = avpkt->size;
     }
 
     /* decode a single frame */
@@ -149,6 +168,8 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
+    if (!avctx->bit_rate)
+        speex_decoder_ctl(s->dec_state, SPEEX_GET_BITRATE, &avctx->bit_rate);
     return consumed;
 }
 
diff --git a/libavcodec/libspeexenc.c b/libavcodec/libspeexenc.c
index b17761f..6a37dbc 100644
--- a/libavcodec/libspeexenc.c
+++ b/libavcodec/libspeexenc.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2009 Justin Ruggles
  * Copyright (c) 2009 Xuggle Incorporated
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,7 +76,7 @@
  *     encodes them with just enough bits to reproduce the background noise.
  *
  * Discontinuous Transmission (DTX)
- *     DTX is an addition to VAD/VBR operation, that allows to stop transmitting
+ *     DTX is an addition to VAD/VBR operation, that makes it possible to stop transmitting
  *     completely when the background noise is stationary.
  *     In file-based operation only 5 bits are used for such frames.
  */
@@ -92,6 +92,7 @@
 #include "internal.h"
 #include "audio_frame_queue.h"
 
+/* TODO: Think about converting abr, vad, dtx and such flags to a bit field */
 typedef struct LibSpeexEncContext {
     AVClass *class;             ///< AVClass for private options
     SpeexBits bits;             ///< libspeex bitwriter context
@@ -124,10 +125,10 @@ static av_cold void print_enc_params(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, "  quality: %f\n", s->vbr_quality);
     } else if (s->abr) {
         av_log(avctx, AV_LOG_DEBUG, "rate control: ABR\n");
-        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %d bps\n", avctx->bit_rate);
+        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %"PRId64" bps\n", avctx->bit_rate);
     } else {
         av_log(avctx, AV_LOG_DEBUG, "rate control: CBR\n");
-        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %d bps\n", avctx->bit_rate);
+        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %"PRId64" bps\n", avctx->bit_rate);
     }
     av_log(avctx, AV_LOG_DEBUG, "complexity: %d\n",
            avctx->compression_level);
@@ -293,10 +294,8 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     /* write output if all frames for the packet have been encoded */
     if (s->pkt_frame_count == s->frames_per_packet) {
         s->pkt_frame_count = 0;
-        if ((ret = ff_alloc_packet(avpkt, speex_bits_nbytes(&s->bits)))) {
-            av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        if ((ret = ff_alloc_packet2(avctx, avpkt, speex_bits_nbytes(&s->bits), 0)) < 0)
             return ret;
-        }
         ret = speex_bits_write(&s->bits, avpkt->data, avpkt->size);
         speex_bits_reset(&s->bits);
 
@@ -335,7 +334,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass speex_class = {
     .class_name = "libspeex",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -364,7 +363,7 @@ AVCodec ff_libspeex_encoder = {
                                            AV_CH_LAYOUT_STEREO,
                                            0 },
     .supported_samplerates = (const int[]){ 8000, 16000, 32000, 0 },
-    .priv_class     = &class,
+    .priv_class     = &speex_class,
     .defaults       = defaults,
     .wrapper_name   = "libspeex",
 };
diff --git a/libavcodec/libtheoraenc.c b/libavcodec/libtheoraenc.c
index 2676df4..16966ed 100644
--- a/libavcodec/libtheoraenc.c
+++ b/libavcodec/libtheoraenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Paul Richards <paul.richards@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
  * and o_ prefixes on variables which are libogg types.
  */
 
-/* Libav includes */
+/* FFmpeg includes */
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/pixdesc.h"
@@ -96,13 +96,14 @@ static int get_stats(AVCodecContext *avctx, int eos)
     bytes = th_encode_ctl(h->t_state, TH_ENCCTL_2PASS_OUT, &buf, sizeof(buf));
     if (bytes < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting first pass stats\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
     if (!eos) {
-        h->stats = av_fast_realloc(h->stats, &h->stats_size,
+        void *tmp = av_fast_realloc(h->stats, &h->stats_size,
                                    h->stats_offset + bytes);
-        if (!h->stats)
+        if (!tmp)
             return AVERROR(ENOMEM);
+        h->stats = tmp;
         memcpy(h->stats + h->stats_offset, buf, bytes);
         h->stats_offset += bytes;
     } else {
@@ -117,7 +118,7 @@ static int get_stats(AVCodecContext *avctx, int eos)
     return 0;
 #else
     av_log(avctx, AV_LOG_ERROR, "libtheora too old to support 2pass\n");
-    return -1;
+    return AVERROR(ENOSUP);
 #endif
 }
 
@@ -131,12 +132,14 @@ static int submit_stats(AVCodecContext *avctx)
     if (!h->stats) {
         if (!avctx->stats_in) {
             av_log(avctx, AV_LOG_ERROR, "No statsfile for second pass\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         h->stats_size = strlen(avctx->stats_in) * 3/4;
         h->stats      = av_malloc(h->stats_size);
-        if (!h->stats)
+        if (!h->stats) {
+            h->stats_size = 0;
             return AVERROR(ENOMEM);
+        }
         h->stats_size = av_base64_decode(h->stats, avctx->stats_in, h->stats_size);
     }
     while (h->stats_size - h->stats_offset > 0) {
@@ -145,7 +148,7 @@ static int submit_stats(AVCodecContext *avctx)
                               h->stats_size - h->stats_offset);
         if (bytes < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error submitting stats\n");
-            return -1;
+            return AVERROR_EXTERNAL;
         }
         if (!bytes)
             return 0;
@@ -154,7 +157,7 @@ static int submit_stats(AVCodecContext *avctx)
     return 0;
 #else
     av_log(avctx, AV_LOG_ERROR, "libtheora too old to support 2pass\n");
-    return -1;
+    return AVERROR(ENOSUP);
 #endif
 }
 
@@ -166,6 +169,7 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     unsigned int offset;
     TheoraContext *h = avc_context->priv_data;
     uint32_t gop_size = avc_context->gop_size;
+    int ret;
 
     /* Set up the theora_info struct */
     th_info_init(&t_info);
@@ -202,17 +206,18 @@ static av_cold int encode_init(AVCodecContext* avc_context)
         t_info.pixel_fmt = TH_PF_444;
     else {
         av_log(avc_context, AV_LOG_ERROR, "Unsupported pix_fmt\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
-    av_pix_fmt_get_chroma_sub_sample(avc_context->pix_fmt,
-                                     &h->uv_hshift, &h->uv_vshift);
+    ret = av_pix_fmt_get_chroma_sub_sample(avc_context->pix_fmt, &h->uv_hshift, &h->uv_vshift);
+    if (ret)
+        return ret;
 
     if (avc_context->flags & AV_CODEC_FLAG_QSCALE) {
-        /* to be constant with the libvorbis implementation, clip global_quality to 0 - 10
-           Theora accepts a quality parameter p, which is:
-                * 0 <= p <=63
-                * an int value
-         */
+        /* Clip global_quality in QP units to the [0 - 10] range
+           to be consistent with the libvorbis implementation.
+           Theora accepts a quality parameter which is an int value in
+           the [0 - 63] range.
+        */
         t_info.quality        = av_clipf(avc_context->global_quality / (float)FF_QP2LAMBDA, 0, 10) * 6.3;
         t_info.target_bitrate = 0;
     } else {
@@ -224,7 +229,7 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     h->t_state = th_encode_alloc(&t_info);
     if (!h->t_state) {
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_init failed\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     h->keyframe_mask = (1 << t_info.keyframe_granule_shift) - 1;
@@ -234,16 +239,16 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     if (th_encode_ctl(h->t_state, TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
                       &gop_size, sizeof(gop_size))) {
         av_log(avc_context, AV_LOG_ERROR, "Error setting GOP size\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     // need to enable 2 pass (via TH_ENCCTL_2PASS_) before encoding headers
     if (avc_context->flags & AV_CODEC_FLAG_PASS1) {
-        if (get_stats(avc_context, 0))
-            return -1;
+        if ((ret = get_stats(avc_context, 0)) < 0)
+            return ret;
     } else if (avc_context->flags & AV_CODEC_FLAG_PASS2) {
-        if (submit_stats(avc_context))
-            return -1;
+        if ((ret = submit_stats(avc_context)) < 0)
+            return ret;
     }
 
     /*
@@ -259,8 +264,8 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     th_comment_init(&t_comment);
 
     while (th_encode_flushheader(h->t_state, &t_comment, &o_packet))
-        if (concatenate_packet(&offset, avc_context, &o_packet))
-            return -1;
+        if ((ret = concatenate_packet(&offset, avc_context, &o_packet)) < 0)
+            return ret;
 
     th_comment_clear(&t_comment);
 
@@ -279,8 +284,8 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
     if (!frame) {
         th_encode_packetout(h->t_state, 1, &o_packet);
         if (avc_context->flags & AV_CODEC_FLAG_PASS1)
-            if (get_stats(avc_context, 1))
-                return -1;
+            if ((ret = get_stats(avc_context, 1)) < 0)
+                return ret;
         return 0;
     }
 
@@ -293,8 +298,8 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
     }
 
     if (avc_context->flags & AV_CODEC_FLAG_PASS2)
-        if (submit_stats(avc_context))
-            return -1;
+        if ((ret = submit_stats(avc_context)) < 0)
+            return ret;
 
     /* Now call into theora_encode_YUVin */
     result = th_encode_ycbcr_in(h->t_state, t_yuv_buffer);
@@ -312,12 +317,12 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
             break;
         }
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_YUVin failed (%s) [%d]\n", message, result);
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     if (avc_context->flags & AV_CODEC_FLAG_PASS1)
-        if (get_stats(avc_context, 0))
-            return -1;
+        if ((ret = get_stats(avc_context, 0)) < 0)
+            return ret;
 
     /* Pick up returned ogg_packet */
     result = th_encode_packetout(h->t_state, 0, &o_packet);
@@ -330,14 +335,12 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
         break;
     default:
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_packetout failed [%d]\n", result);
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     /* Copy ogg_packet content out to buffer */
-    if ((ret = ff_alloc_packet(pkt, o_packet.bytes)) < 0) {
-        av_log(avc_context, AV_LOG_ERROR, "Error getting output packet of size %ld.\n", o_packet.bytes);
+    if ((ret = ff_alloc_packet2(avc_context, pkt, o_packet.bytes, 0)) < 0)
         return ret;
-    }
     memcpy(pkt->data, o_packet.packet, o_packet.bytes);
 
     // HACK: assumes no encoder delay, this is true until libtheora becomes
diff --git a/libavcodec/libtwolame.c b/libavcodec/libtwolame.c
index 9b0fe16..030f888 100644
--- a/libavcodec/libtwolame.c
+++ b/libavcodec/libtwolame.c
@@ -2,20 +2,20 @@
  * Interface to libtwolame for mp2 encoding
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,6 +77,10 @@ static av_cold int twolame_encode_init(AVCodecContext *avctx)
     twolame_set_num_channels(s->glopts, avctx->channels);
     twolame_set_in_samplerate(s->glopts, avctx->sample_rate);
     twolame_set_out_samplerate(s->glopts, avctx->sample_rate);
+
+    if (!avctx->bit_rate)
+        avctx->bit_rate = avctx->sample_rate < 28000 ? 160000 : 384000;
+
     if (avctx->flags & AV_CODEC_FLAG_QSCALE || !avctx->bit_rate) {
         twolame_set_VBR(s->glopts, TRUE);
         twolame_set_VBR_level(s->glopts,
@@ -102,7 +106,7 @@ static int twolame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     TWOLAMEContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, MPA_MAX_CODED_FRAME_SIZE)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MPA_MAX_CODED_FRAME_SIZE, 0)) < 0)
         return ret;
 
     if (frame) {
@@ -190,7 +194,7 @@ static const AVClass twolame_class = {
 };
 
 static const AVCodecDefault twolame_defaults[] = {
-    { "b", "384000" },
+    { "b", "0" },
     { NULL },
 };
 
diff --git a/libavcodec/libvo-aacenc.c b/libavcodec/libvo-aacenc.c
deleted file mode 100644
index ae3b54e..0000000
--- a/libavcodec/libvo-aacenc.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * AAC encoder wrapper
- * Copyright (c) 2010 Martin Storsjo
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <vo-aacenc/voAAC.h>
-#include <vo-aacenc/cmnMemory.h>
-
-#include "avcodec.h"
-#include "audio_frame_queue.h"
-#include "internal.h"
-#include "mpeg4audio.h"
-
-#define FRAME_SIZE 1024
-#define ENC_DELAY  1600
-
-typedef struct AACContext {
-    VO_AUDIO_CODECAPI codec_api;
-    VO_HANDLE handle;
-    VO_MEM_OPERATOR mem_operator;
-    VO_CODEC_INIT_USERDATA user_data;
-    VO_PBYTE end_buffer;
-    AudioFrameQueue afq;
-    int last_frame;
-    int last_samples;
-} AACContext;
-
-
-static int aac_encode_close(AVCodecContext *avctx)
-{
-    AACContext *s = avctx->priv_data;
-
-    s->codec_api.Uninit(s->handle);
-    av_freep(&avctx->extradata);
-    ff_af_queue_close(&s->afq);
-    av_freep(&s->end_buffer);
-
-    return 0;
-}
-
-static av_cold int aac_encode_init(AVCodecContext *avctx)
-{
-    AACContext *s = avctx->priv_data;
-    AACENC_PARAM params = { 0 };
-    int index, ret;
-
-    avctx->frame_size = FRAME_SIZE;
-    avctx->initial_padding = ENC_DELAY;
-    s->last_frame     = 2;
-    ff_af_queue_init(avctx, &s->afq);
-
-    s->end_buffer = av_mallocz(avctx->frame_size * avctx->channels * 2);
-    if (!s->end_buffer) {
-        ret = AVERROR(ENOMEM);
-        goto error;
-    }
-
-    voGetAACEncAPI(&s->codec_api);
-
-    s->mem_operator.Alloc = cmnMemAlloc;
-    s->mem_operator.Copy = cmnMemCopy;
-    s->mem_operator.Free = cmnMemFree;
-    s->mem_operator.Set = cmnMemSet;
-    s->mem_operator.Check = cmnMemCheck;
-    s->user_data.memflag = VO_IMF_USERMEMOPERATOR;
-    s->user_data.memData = &s->mem_operator;
-    s->codec_api.Init(&s->handle, VO_AUDIO_CodingAAC, &s->user_data);
-
-    params.sampleRate = avctx->sample_rate;
-    params.bitRate    = avctx->bit_rate;
-    params.nChannels  = avctx->channels;
-    params.adtsUsed   = !(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
-    if (s->codec_api.SetParam(s->handle, VO_PID_AAC_ENCPARAM, &params)
-        != VO_ERR_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to set encoding parameters\n");
-        ret = AVERROR(EINVAL);
-        goto error;
-    }
-
-    for (index = 0; index < 16; index++)
-        if (avctx->sample_rate == avpriv_mpeg4audio_sample_rates[index])
-            break;
-    if (index == 16) {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d\n",
-                                    avctx->sample_rate);
-        ret = AVERROR(ENOSYS);
-        goto error;
-    }
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
-        avctx->extradata_size = 2;
-        avctx->extradata      = av_mallocz(avctx->extradata_size +
-                                           AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata) {
-            ret = AVERROR(ENOMEM);
-            goto error;
-        }
-
-        avctx->extradata[0] = 0x02 << 3 | index >> 1;
-        avctx->extradata[1] = (index & 0x01) << 7 | avctx->channels << 3;
-    }
-    return 0;
-error:
-    aac_encode_close(avctx);
-    return ret;
-}
-
-static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
-                            const AVFrame *frame, int *got_packet_ptr)
-{
-    AACContext *s = avctx->priv_data;
-    VO_CODECBUFFER input = { 0 }, output = { 0 };
-    VO_AUDIO_OUTPUTINFO output_info = { { 0 } };
-    VO_PBYTE samples;
-    int ret;
-
-    /* handle end-of-stream small frame and flushing */
-    if (!frame) {
-        if (s->last_frame <= 0)
-            return 0;
-        if (s->last_samples > 0 && s->last_samples < ENC_DELAY - FRAME_SIZE) {
-            s->last_samples = 0;
-            s->last_frame--;
-        }
-        s->last_frame--;
-        memset(s->end_buffer, 0, 2 * avctx->channels * avctx->frame_size);
-        samples = s->end_buffer;
-    } else {
-        if (frame->nb_samples < avctx->frame_size) {
-            s->last_samples = frame->nb_samples;
-            memcpy(s->end_buffer, frame->data[0], 2 * avctx->channels * frame->nb_samples);
-            samples = s->end_buffer;
-        } else {
-            samples = (VO_PBYTE)frame->data[0];
-        }
-        /* add current frame to the queue */
-        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
-            return ret;
-    }
-
-    if ((ret = ff_alloc_packet(avpkt, FFMAX(8192, 768 * avctx->channels)))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
-        return ret;
-    }
-
-    input.Buffer  = samples;
-    input.Length  = 2 * avctx->channels * avctx->frame_size;
-    output.Buffer = avpkt->data;
-    output.Length = avpkt->size;
-
-    s->codec_api.SetInputData(s->handle, &input);
-    if (s->codec_api.GetOutputData(s->handle, &output, &output_info)
-        != VO_ERR_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to encode frame\n");
-        return AVERROR(EINVAL);
-    }
-
-    /* Get the next frame pts/duration */
-    ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
-                       &avpkt->duration);
-
-    avpkt->size = output.Length;
-    *got_packet_ptr = 1;
-    return 0;
-}
-
-AVCodec ff_libvo_aacenc_encoder = {
-    .name           = "libvo_aacenc",
-    .long_name      = NULL_IF_CONFIG_SMALL("Android VisualOn AAC (Advanced Audio Coding)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_AAC,
-    .priv_data_size = sizeof(AACContext),
-    .init           = aac_encode_init,
-    .encode2        = aac_encode_frame,
-    .close          = aac_encode_close,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
-    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                     AV_SAMPLE_FMT_NONE },
-    .wrapper_name   = "libvo_aacenc",
-};
diff --git a/libavcodec/libvo-amrwbenc.c b/libavcodec/libvo-amrwbenc.c
index 7be14c2..77d0cce 100644
--- a/libavcodec/libvo-amrwbenc.c
+++ b/libavcodec/libvo-amrwbenc.c
@@ -2,20 +2,20 @@
  * AMR Audio encoder stub
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,7 @@ static const AVOption options[] = {
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass amrwb_class = {
     .class_name = "libvo_amrwbenc",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -82,7 +82,7 @@ static av_cold int amr_wb_encode_init(AVCodecContext *avctx)
 {
     AMRWBContext *s = avctx->priv_data;
 
-    if (avctx->sample_rate != 16000) {
+    if (avctx->sample_rate != 16000 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
         av_log(avctx, AV_LOG_ERROR, "Only 16000Hz sample rate supported\n");
         return AVERROR(ENOSYS);
     }
@@ -118,10 +118,8 @@ static int amr_wb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     const int16_t *samples = (const int16_t *)frame->data[0];
     int size, ret;
 
-    if ((ret = ff_alloc_packet(avpkt, MAX_PACKET_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MAX_PACKET_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (s->last_bitrate != avctx->bit_rate) {
         s->mode         = get_wb_bitrate_mode(avctx->bit_rate, avctx);
@@ -153,6 +151,6 @@ AVCodec ff_libvo_amrwbenc_encoder = {
     .close          = amr_wb_encode_close,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &amrwb_class,
     .wrapper_name   = "libvo_amrwbenc",
 };
diff --git a/libavcodec/libvorbisdec.c b/libavcodec/libvorbisdec.c
new file mode 100644
index 0000000..ecf690a
--- /dev/null
+++ b/libavcodec/libvorbisdec.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <vorbis/vorbisenc.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct OggVorbisDecContext {
+    vorbis_info vi;                     /**< vorbis_info used during init   */
+    vorbis_dsp_state vd;                /**< DSP state used for analysis    */
+    vorbis_block vb;                    /**< vorbis_block used for analysis */
+    vorbis_comment vc;                  /**< VorbisComment info             */
+    ogg_packet op;                      /**< ogg packet                     */
+} OggVorbisDecContext;
+
+static int oggvorbis_decode_close(AVCodecContext *avccontext);
+
+static int oggvorbis_decode_init(AVCodecContext *avccontext) {
+    OggVorbisDecContext *context = avccontext->priv_data ;
+    uint8_t *p= avccontext->extradata;
+    int i, hsizes[3], ret;
+    unsigned char *headers[3], *extradata = avccontext->extradata;
+
+    if(! avccontext->extradata_size || ! p) {
+        av_log(avccontext, AV_LOG_ERROR, "vorbis extradata absent\n");
+        return AVERROR(EINVAL);
+    }
+
+    vorbis_info_init(&context->vi) ;
+    vorbis_comment_init(&context->vc) ;
+
+    if(p[0] == 0 && p[1] == 30) {
+        for(i = 0; i < 3; i++){
+            hsizes[i] = bytestream_get_be16((const uint8_t **)&p);
+            headers[i] = p;
+            p += hsizes[i];
+        }
+    } else if(*p == 2) {
+        unsigned int offset = 1;
+        p++;
+        for(i=0; i<2; i++) {
+            hsizes[i] = 0;
+            while((*p == 0xFF) && (offset < avccontext->extradata_size)) {
+                hsizes[i] += 0xFF;
+                offset++;
+                p++;
+            }
+            if(offset >= avccontext->extradata_size - 1) {
+                av_log(avccontext, AV_LOG_ERROR,
+                       "vorbis header sizes damaged\n");
+                ret = AVERROR_INVALIDDATA;
+                goto error;
+            }
+            hsizes[i] += *p;
+            offset++;
+            p++;
+        }
+        hsizes[2] = avccontext->extradata_size - hsizes[0]-hsizes[1]-offset;
+#if 0
+        av_log(avccontext, AV_LOG_DEBUG,
+               "vorbis header sizes: %d, %d, %d, / extradata_len is %d \n",
+               hsizes[0], hsizes[1], hsizes[2], avccontext->extradata_size);
+#endif
+        headers[0] = extradata + offset;
+        headers[1] = extradata + offset + hsizes[0];
+        headers[2] = extradata + offset + hsizes[0] + hsizes[1];
+    } else {
+        av_log(avccontext, AV_LOG_ERROR,
+               "vorbis initial header len is wrong: %d\n", *p);
+        ret = AVERROR_INVALIDDATA;
+        goto error;
+    }
+
+    for(i=0; i<3; i++){
+        context->op.b_o_s= i==0;
+        context->op.bytes = hsizes[i];
+        context->op.packet = headers[i];
+        if(vorbis_synthesis_headerin(&context->vi, &context->vc, &context->op)<0){
+            av_log(avccontext, AV_LOG_ERROR, "%d. vorbis header damaged\n", i+1);
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
+    }
+
+    avccontext->channels = context->vi.channels;
+    avccontext->sample_rate = context->vi.rate;
+    avccontext->sample_fmt = AV_SAMPLE_FMT_S16;
+    avccontext->time_base= (AVRational){1, avccontext->sample_rate};
+
+    vorbis_synthesis_init(&context->vd, &context->vi);
+    vorbis_block_init(&context->vd, &context->vb);
+
+    return 0 ;
+
+  error:
+    oggvorbis_decode_close(avccontext);
+    return ret;
+}
+
+
+static inline int conv(int samples, float **pcm, char *buf, int channels) {
+    int i, j;
+    ogg_int16_t *ptr, *data = (ogg_int16_t*)buf ;
+    float *mono ;
+
+    for(i = 0 ; i < channels ; i++){
+        ptr = &data[i];
+        mono = pcm[i] ;
+
+        for(j = 0 ; j < samples ; j++) {
+            *ptr = av_clip_int16(mono[j] * 32767.f);
+            ptr += channels;
+        }
+    }
+
+    return 0 ;
+}
+
+static int oggvorbis_decode_frame(AVCodecContext *avccontext, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    OggVorbisDecContext *context = avccontext->priv_data ;
+    AVFrame *frame = data;
+    float **pcm ;
+    ogg_packet *op= &context->op;
+    int samples, total_samples, total_bytes;
+    int ret;
+    int16_t *output;
+
+    if(!avpkt->size){
+    //FIXME flush
+        return 0;
+    }
+
+    frame->nb_samples = 8192*4;
+    if ((ret = ff_get_buffer(avccontext, frame, 0)) < 0)
+        return ret;
+    output = (int16_t *)frame->data[0];
+
+
+    op->packet = avpkt->data;
+    op->bytes  = avpkt->size;
+
+//    av_log(avccontext, AV_LOG_DEBUG, "%d %d %d %"PRId64" %"PRId64" %d %d\n", op->bytes, op->b_o_s, op->e_o_s, op->granulepos, op->packetno, buf_size, context->vi.rate);
+
+/*    for(i=0; i<op->bytes; i++)
+      av_log(avccontext, AV_LOG_DEBUG, "%02X ", op->packet[i]);
+    av_log(avccontext, AV_LOG_DEBUG, "\n");*/
+
+    if(vorbis_synthesis(&context->vb, op) == 0)
+        vorbis_synthesis_blockin(&context->vd, &context->vb) ;
+
+    total_samples = 0 ;
+    total_bytes = 0 ;
+
+    while((samples = vorbis_synthesis_pcmout(&context->vd, &pcm)) > 0) {
+        conv(samples, pcm, (char*)output + total_bytes, context->vi.channels) ;
+        total_bytes += samples * 2 * context->vi.channels ;
+        total_samples += samples ;
+        vorbis_synthesis_read(&context->vd, samples) ;
+    }
+
+    frame->nb_samples = total_samples;
+    *got_frame_ptr   = total_samples > 0;
+    return avpkt->size;
+}
+
+
+static int oggvorbis_decode_close(AVCodecContext *avccontext) {
+    OggVorbisDecContext *context = avccontext->priv_data ;
+
+    vorbis_block_clear(&context->vb);
+    vorbis_dsp_clear(&context->vd);
+    vorbis_info_clear(&context->vi) ;
+    vorbis_comment_clear(&context->vc) ;
+
+    return 0 ;
+}
+
+
+AVCodec ff_libvorbis_decoder = {
+    .name           = "libvorbis",
+    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_VORBIS,
+    .priv_data_size = sizeof(OggVorbisDecContext),
+    .init           = oggvorbis_decode_init,
+    .decode         = oggvorbis_decode_frame,
+    .close          = oggvorbis_decode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+};
diff --git a/libavcodec/libvorbis.c b/libavcodec/libvorbisenc.c
index 972ca6a..f78f872 100644
--- a/libavcodec/libvorbis.c
+++ b/libavcodec/libvorbisenc.c
@@ -1,42 +1,34 @@
 /*
- * copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/**
- * @file
- * Vorbis encoding support via libvorbisenc.
- * @author Mark Hills <mark@pogo.org.uk>
- */
-
 #include <vorbis/vorbisenc.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "audio_frame_queue.h"
-#include "bytestream.h"
 #include "internal.h"
 #include "vorbis.h"
 #include "vorbis_parser.h"
 
-#undef NDEBUG
-#include <assert.h>
 
 /* Number of samples the user should send in each call.
  * This value is used because it is the LCD of all possible frame sizes, so
@@ -47,7 +39,7 @@
 
 #define BUFFER_SIZE (1024 * 64)
 
-typedef struct LibvorbisContext {
+typedef struct LibvorbisEncContext {
     AVClass *av_class;                  /**< class for AVOptions            */
     vorbis_info vi;                     /**< vorbis_info used during init   */
     vorbis_dsp_state vd;                /**< DSP state used for analysis    */
@@ -56,14 +48,13 @@ typedef struct LibvorbisContext {
     int eof;                            /**< end-of-file flag               */
     int dsp_initialized;                /**< vd has been initialized        */
     vorbis_comment vc;                  /**< VorbisComment info             */
-    ogg_packet op;                      /**< ogg packet                     */
     double iblock;                      /**< impulse block bias option      */
     AVVorbisParseContext *vp;           /**< parse context to get durations */
     AudioFrameQueue afq;                /**< frame queue for timestamps     */
-} LibvorbisContext;
+} LibvorbisEncContext;
 
 static const AVOption options[] = {
-    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisEncContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
     { NULL }
 };
 
@@ -72,14 +63,13 @@ static const AVCodecDefault defaults[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass vorbis_class = {
     .class_name = "libvorbis",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-
 static int vorbis_error_to_averror(int ov_err)
 {
     switch (ov_err) {
@@ -92,7 +82,7 @@ static int vorbis_error_to_averror(int ov_err)
 
 static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     double cfreq;
     int ret;
 
@@ -122,14 +112,14 @@ static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
         /* variable bitrate by estimate, disable slow rate management */
         if (minrate == -1 && maxrate == -1)
             if ((ret = vorbis_encode_ctl(vi, OV_ECTL_RATEMANAGE2_SET, NULL)))
-                goto error;
+                goto error; /* should not happen */
     }
 
     /* cutoff frequency */
     if (avctx->cutoff > 0) {
         cfreq = avctx->cutoff / 1000.0;
         if ((ret = vorbis_encode_ctl(vi, OV_ECTL_LOWPASS_SET, &cfreq)))
-            goto error;
+            goto error; /* should not happen */
     }
 
     /* impulse block bias */
@@ -138,6 +128,35 @@ static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
             goto error;
     }
 
+    if (avctx->channels == 3 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER) ||
+        avctx->channels == 4 &&
+            avctx->channel_layout != AV_CH_LAYOUT_2_2 &&
+            avctx->channel_layout != AV_CH_LAYOUT_QUAD ||
+        avctx->channels == 5 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0_BACK ||
+        avctx->channels == 6 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1_BACK ||
+        avctx->channels == 7 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_5POINT1|AV_CH_BACK_CENTER) ||
+        avctx->channels == 8 &&
+            avctx->channel_layout != AV_CH_LAYOUT_7POINT1) {
+        if (avctx->channel_layout) {
+            char name[32];
+            av_get_channel_layout_string(name, sizeof(name), avctx->channels,
+                                         avctx->channel_layout);
+            av_log(avctx, AV_LOG_ERROR, "%s not supported by Vorbis: "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n", name);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Vorbis channel layout for "
+                                               "%d channels.\n", avctx->channels);
+        }
+    }
+
     if ((ret = vorbis_encode_setup_init(vi)))
         goto error;
 
@@ -154,7 +173,7 @@ static int xiph_len(int l)
 
 static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
 
     /* notify vorbisenc this is EOF */
     if (s->dsp_initialized)
@@ -164,7 +183,7 @@ static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
     vorbis_dsp_clear(&s->vd);
     vorbis_info_clear(&s->vi);
 
-    av_fifo_free(s->pkt_fifo);
+    av_fifo_freep(&s->pkt_fifo);
     ff_af_queue_close(&s->afq);
     av_freep(&avctx->extradata);
 
@@ -175,7 +194,7 @@ static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
 
 static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     ogg_packet header, header_comm, header_code;
     uint8_t *p;
     unsigned int offset;
@@ -199,7 +218,8 @@ static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
     }
 
     vorbis_comment_init(&s->vc);
-    vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
+    if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT))
+        vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
 
     if ((ret = vorbis_analysis_headerout(&s->vd, &s->vc, &header, &header_comm,
                                          &header_code))) {
@@ -226,7 +246,7 @@ static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
     offset += header_comm.bytes;
     memcpy(&p[offset], header_code.packet, header_code.bytes);
     offset += header_code.bytes;
-    assert(offset == avctx->extradata_size);
+    av_assert0(offset == avctx->extradata_size);
 
     s->vp = av_vorbis_parse_init(avctx->extradata, avctx->extradata_size);
     if (!s->vp) {
@@ -254,7 +274,7 @@ error:
 static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                   const AVFrame *frame, int *got_packet_ptr)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     ogg_packet op;
     int ret, duration;
 
@@ -278,7 +298,7 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
             return ret;
     } else {
-        if (!s->eof)
+        if (!s->eof && s->afq.frame_alloc)
             if ((ret = vorbis_analysis_wrote(&s->vd, 0)) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
                 return vorbis_error_to_averror(ret);
@@ -296,7 +316,7 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         /* add any available packets to the output packet buffer */
         while ((ret = vorbis_bitrate_flushpacket(&s->vd, &op)) == 1) {
             if (av_fifo_space(s->pkt_fifo) < sizeof(ogg_packet) + op.bytes) {
-                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small");
+                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small\n");
                 return AVERROR_BUG;
             }
             av_fifo_generic_write(s->pkt_fifo, &op, sizeof(ogg_packet), NULL);
@@ -318,10 +338,8 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     av_fifo_generic_read(s->pkt_fifo, &op, sizeof(ogg_packet), NULL);
 
-    if ((ret = ff_alloc_packet(avpkt, op.bytes))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, op.bytes, 0)) < 0)
         return ret;
-    }
     av_fifo_generic_read(s->pkt_fifo, avpkt->data, op.bytes, NULL);
 
     avpkt->pts = ff_samples_to_time_base(avctx, op.granulepos);
@@ -330,9 +348,12 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     if (duration > 0) {
         /* we do not know encoder delay until we get the first packet from
          * libvorbis, so we have to update the AudioFrameQueue counts */
-        if (!avctx->initial_padding) {
+        if (!avctx->initial_padding && s->afq.frames) {
             avctx->initial_padding    = duration;
-            s->afq.remaining_delay   += duration;
+            av_assert0(!s->afq.remaining_delay);
+            s->afq.frames->duration  += duration;
+            if (s->afq.frames->pts != AV_NOPTS_VALUE)
+                s->afq.frames->pts       -= duration;
             s->afq.remaining_samples += duration;
         }
         ff_af_queue_remove(&s->afq, duration, &avpkt->pts, &avpkt->duration);
@@ -344,17 +365,17 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
 AVCodec ff_libvorbis_encoder = {
     .name           = "libvorbis",
-    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis Vorbis"),
+    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis"),
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_VORBIS,
-    .priv_data_size = sizeof(LibvorbisContext),
+    .priv_data_size = sizeof(LibvorbisEncContext),
     .init           = libvorbis_encode_init,
     .encode2        = libvorbis_encode_frame,
     .close          = libvorbis_encode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &vorbis_class,
     .defaults       = defaults,
     .wrapper_name   = "libvorbis",
 };
diff --git a/libavcodec/libvpx.c b/libavcodec/libvpx.c
index 49f966d..cc055a0 100644
--- a/libavcodec/libvpx.c
+++ b/libavcodec/libvpx.c
@@ -1,79 +1,80 @@
 /*
  * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <vpx/vpx_codec.h>
-
 #include "libvpx.h"
+#include "config.h"
 
-enum AVPixelFormat ff_vpx_imgfmt_to_pixfmt(vpx_img_fmt_t img)
-{
-    switch (img) {
-    case VPX_IMG_FMT_RGB24:     return AV_PIX_FMT_RGB24;
-    case VPX_IMG_FMT_RGB565:    return AV_PIX_FMT_RGB565BE;
-    case VPX_IMG_FMT_RGB555:    return AV_PIX_FMT_RGB555BE;
-    case VPX_IMG_FMT_UYVY:      return AV_PIX_FMT_UYVY422;
-    case VPX_IMG_FMT_YUY2:      return AV_PIX_FMT_YUYV422;
-    case VPX_IMG_FMT_YVYU:      return AV_PIX_FMT_YVYU422;
-    case VPX_IMG_FMT_BGR24:     return AV_PIX_FMT_BGR24;
-    case VPX_IMG_FMT_ARGB:      return AV_PIX_FMT_ARGB;
-    case VPX_IMG_FMT_ARGB_LE:   return AV_PIX_FMT_BGRA;
-    case VPX_IMG_FMT_RGB565_LE: return AV_PIX_FMT_RGB565LE;
-    case VPX_IMG_FMT_RGB555_LE: return AV_PIX_FMT_RGB555LE;
-    case VPX_IMG_FMT_I420:      return AV_PIX_FMT_YUV420P;
-    case VPX_IMG_FMT_I422:      return AV_PIX_FMT_YUV422P;
-    case VPX_IMG_FMT_I444:      return AV_PIX_FMT_YUV444P;
-    case VPX_IMG_FMT_444A:      return AV_PIX_FMT_YUVA444P;
-#if VPX_IMAGE_ABI_VERSION >= 3
-    case VPX_IMG_FMT_I440:      return AV_PIX_FMT_YUV440P;
-    case VPX_IMG_FMT_I42016:    return AV_PIX_FMT_YUV420P16BE;
-    case VPX_IMG_FMT_I42216:    return AV_PIX_FMT_YUV422P16BE;
-    case VPX_IMG_FMT_I44416:    return AV_PIX_FMT_YUV444P16BE;
+#if CONFIG_LIBVPX_VP9_ENCODER
+#include <vpx/vpx_encoder.h>
+#include <vpx/vp8cx.h>
 #endif
-    default:                    return AV_PIX_FMT_NONE;
-    }
-}
 
-vpx_img_fmt_t ff_vpx_pixfmt_to_imgfmt(enum AVPixelFormat pix)
-{
-    switch (pix) {
-    case AV_PIX_FMT_RGB24:        return VPX_IMG_FMT_RGB24;
-    case AV_PIX_FMT_RGB565BE:     return VPX_IMG_FMT_RGB565;
-    case AV_PIX_FMT_RGB555BE:     return VPX_IMG_FMT_RGB555;
-    case AV_PIX_FMT_UYVY422:      return VPX_IMG_FMT_UYVY;
-    case AV_PIX_FMT_YUYV422:      return VPX_IMG_FMT_YUY2;
-    case AV_PIX_FMT_YVYU422:      return VPX_IMG_FMT_YVYU;
-    case AV_PIX_FMT_BGR24:        return VPX_IMG_FMT_BGR24;
-    case AV_PIX_FMT_ARGB:         return VPX_IMG_FMT_ARGB;
-    case AV_PIX_FMT_BGRA:         return VPX_IMG_FMT_ARGB_LE;
-    case AV_PIX_FMT_RGB565LE:     return VPX_IMG_FMT_RGB565_LE;
-    case AV_PIX_FMT_RGB555LE:     return VPX_IMG_FMT_RGB555_LE;
-    case AV_PIX_FMT_YUV420P:      return VPX_IMG_FMT_I420;
-    case AV_PIX_FMT_YUV422P:      return VPX_IMG_FMT_I422;
-    case AV_PIX_FMT_YUV444P:      return VPX_IMG_FMT_I444;
-    case AV_PIX_FMT_YUVA444P:     return VPX_IMG_FMT_444A;
-#if VPX_IMAGE_ABI_VERSION >= 3
-    case AV_PIX_FMT_YUV440P:      return VPX_IMG_FMT_I440;
-    case AV_PIX_FMT_YUV420P16BE:  return VPX_IMG_FMT_I42016;
-    case AV_PIX_FMT_YUV422P16BE:  return VPX_IMG_FMT_I42216;
-    case AV_PIX_FMT_YUV444P16BE:  return VPX_IMG_FMT_I44416;
+static const enum AVPixelFormat vp9_pix_fmts_def[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_NONE
+};
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static const enum AVPixelFormat vp9_pix_fmts_highcol[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV440P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat vp9_pix_fmts_highbd[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV440P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV420P10,
+    AV_PIX_FMT_YUV422P10,
+    AV_PIX_FMT_YUV440P10,
+    AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_YUV420P12,
+    AV_PIX_FMT_YUV422P12,
+    AV_PIX_FMT_YUV440P12,
+    AV_PIX_FMT_YUV444P12,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_GBRP10,
+    AV_PIX_FMT_GBRP12,
+    AV_PIX_FMT_NONE
+};
 #endif
-    default:                      return VPX_IMG_FMT_NONE;
+
+av_cold void ff_vp9_init_static(AVCodec *codec)
+{
+    codec->pix_fmts = vp9_pix_fmts_def;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    {
+        vpx_codec_caps_t codec_caps = vpx_codec_get_caps(vpx_codec_vp9_cx());
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH)
+            codec->pix_fmts = vp9_pix_fmts_highbd;
+        else
+            codec->pix_fmts = vp9_pix_fmts_highcol;
     }
+#endif
 }
diff --git a/libavcodec/libvpx.h b/libavcodec/libvpx.h
index b437f37..22b697f 100644
--- a/libavcodec/libvpx.h
+++ b/libavcodec/libvpx.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,10 @@
 
 #include "avcodec.h"
 
+void ff_vp9_init_static(AVCodec *codec);
+#if 0
 enum AVPixelFormat ff_vpx_imgfmt_to_pixfmt(vpx_img_fmt_t img);
 vpx_img_fmt_t ff_vpx_pixfmt_to_imgfmt(enum AVPixelFormat pix);
+#endif
 
 #endif /* AVCODEC_LIBVPX_H */
diff --git a/libavcodec/libvpxdec.c b/libavcodec/libvpxdec.c
index 8480670..164dbda 100644
--- a/libavcodec/libvpxdec.c
+++ b/libavcodec/libvpxdec.c
@@ -1,26 +1,26 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * VP8 decoder support via libvpx
+ * VP8/9 decoder support via libvpx
  */
 
 #define VPX_CODEC_DISABLE_COMPAT 1
@@ -29,27 +29,33 @@
 
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "libvpx.h"
+#include "profiles.h"
 
-typedef struct VP8DecoderContext {
+typedef struct VPxDecoderContext {
     struct vpx_codec_ctx decoder;
-} VP8Context;
+    struct vpx_codec_ctx decoder_alpha;
+    int has_alpha_channel;
+} VPxContext;
 
 static av_cold int vpx_init(AVCodecContext *avctx,
-                            const struct vpx_codec_iface *iface)
+                            const struct vpx_codec_iface *iface,
+                            int is_alpha_decoder)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     struct vpx_codec_dec_cfg deccfg = {
-        /* token partitions+1 would be a decent choice */
-        .threads = FFMIN(avctx->thread_count, 16)
+        .threads = FFMIN(avctx->thread_count ? avctx->thread_count : av_cpu_count(), 16)
     };
 
     av_log(avctx, AV_LOG_INFO, "%s\n", vpx_codec_version_str());
     av_log(avctx, AV_LOG_VERBOSE, "%s\n", vpx_codec_build_config());
 
-    if (vpx_codec_dec_init(&ctx->decoder, iface, &deccfg, 0) != VPX_CODEC_OK) {
+    if (vpx_codec_dec_init(
+            is_alpha_decoder ? &ctx->decoder_alpha : &ctx->decoder,
+            iface, &deccfg, 0) != VPX_CODEC_OK) {
         const char *error = vpx_codec_error(&ctx->decoder);
         av_log(avctx, AV_LOG_ERROR, "Failed to initialize decoder: %s\n",
                error);
@@ -59,33 +65,175 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     return 0;
 }
 
-static int vp8_decode(AVCodecContext *avctx,
-                      void *data, int *got_frame, AVPacket *avpkt)
+// returns 0 on success, AVERROR_INVALIDDATA otherwise
+static int set_pix_fmt(AVCodecContext *avctx, struct vpx_image *img,
+                       int has_alpha_channel)
 {
-    VP8Context *ctx = avctx->priv_data;
-    AVFrame *picture = data;
-    const void *iter = NULL;
-    struct vpx_image *img;
-    int ret;
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+#if VPX_IMAGE_ABI_VERSION >= 4
+    static const enum AVColorRange color_ranges[] = {
+        AVCOL_RANGE_MPEG, AVCOL_RANGE_JPEG
+    };
+    avctx->color_range = color_ranges[img->range];
+#endif
+    avctx->colorspace = colorspaces[img->cs];
+    if (avctx->codec_id == AV_CODEC_ID_VP8 && img->fmt != VPX_IMG_FMT_I420)
+        return AVERROR_INVALIDDATA;
+    switch (img->fmt) {
+    case VPX_IMG_FMT_I420:
+        if (avctx->codec_id == AV_CODEC_ID_VP9)
+            avctx->profile = FF_PROFILE_VP9_0;
+        avctx->pix_fmt =
+            has_alpha_channel ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
+        return 0;
+#if CONFIG_LIBVPX_VP9_DECODER
+    case VPX_IMG_FMT_I422:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        return 0;
+    case VPX_IMG_FMT_I440:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+        return 0;
+    case VPX_IMG_FMT_I444:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                         AV_PIX_FMT_GBRP : AV_PIX_FMT_YUV444P;
+        return 0;
+    case VPX_IMG_FMT_I42016:
+        avctx->profile = FF_PROFILE_VP9_2;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case VPX_IMG_FMT_I42216:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case VPX_IMG_FMT_I44016:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case VPX_IMG_FMT_I44416:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                             AV_PIX_FMT_GBRP10 : AV_PIX_FMT_YUV444P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                             AV_PIX_FMT_GBRP12 : AV_PIX_FMT_YUV444P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#endif
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+}
 
-    if (vpx_codec_decode(&ctx->decoder, avpkt->data, avpkt->size, NULL, 0) !=
-        VPX_CODEC_OK) {
-        const char *error  = vpx_codec_error(&ctx->decoder);
-        const char *detail = vpx_codec_error_detail(&ctx->decoder);
+static int decode_frame(AVCodecContext *avctx, vpx_codec_ctx_t *decoder,
+                        uint8_t *data, uint32_t data_sz)
+{
+    if (vpx_codec_decode(decoder, data, data_sz, NULL, 0) != VPX_CODEC_OK) {
+        const char *error  = vpx_codec_error(decoder);
+        const char *detail = vpx_codec_error_detail(decoder);
 
         av_log(avctx, AV_LOG_ERROR, "Failed to decode frame: %s\n", error);
-        if (detail)
+        if (detail) {
             av_log(avctx, AV_LOG_ERROR, "  Additional information: %s\n",
                    detail);
+        }
         return AVERROR_INVALIDDATA;
     }
+    return 0;
+}
 
-    if ((img = vpx_codec_get_frame(&ctx->decoder, &iter))) {
-        avctx->pix_fmt = ff_vpx_imgfmt_to_pixfmt(img->fmt);
-        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
-            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d)\n",
-                   img->fmt);
-            return AVERROR_INVALIDDATA;
+static int vpx_decode(AVCodecContext *avctx,
+                      void *data, int *got_frame, AVPacket *avpkt)
+{
+    VPxContext *ctx = avctx->priv_data;
+    AVFrame *picture = data;
+    const void *iter = NULL;
+    const void *iter_alpha = NULL;
+    struct vpx_image *img, *img_alpha;
+    int ret;
+    uint8_t *side_data = NULL;
+    int side_data_size = 0;
+
+    ret = decode_frame(avctx, &ctx->decoder, avpkt->data, avpkt->size);
+    if (ret)
+        return ret;
+
+    side_data = av_packet_get_side_data(avpkt,
+                                        AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+                                        &side_data_size);
+    if (side_data_size > 1) {
+        const uint64_t additional_id = AV_RB64(side_data);
+        side_data += 8;
+        side_data_size -= 8;
+        if (additional_id == 1) {  // 1 stands for alpha channel data.
+            if (!ctx->has_alpha_channel) {
+                ctx->has_alpha_channel = 1;
+                ret = vpx_init(avctx,
+#if CONFIG_LIBVPX_VP8_DECODER && CONFIG_LIBVPX_VP9_DECODER
+                               (avctx->codec_id == AV_CODEC_ID_VP8) ?
+                               &vpx_codec_vp8_dx_algo : &vpx_codec_vp9_dx_algo,
+#elif CONFIG_LIBVPX_VP8_DECODER
+                               &vpx_codec_vp8_dx_algo,
+#else
+                               &vpx_codec_vp9_dx_algo,
+#endif
+                               1);
+                if (ret)
+                    return ret;
+            }
+            ret = decode_frame(avctx, &ctx->decoder_alpha, side_data,
+                               side_data_size);
+            if (ret)
+                return ret;
+        }
+    }
+
+    if ((img = vpx_codec_get_frame(&ctx->decoder, &iter)) &&
+        (!ctx->has_alpha_channel ||
+         (img_alpha = vpx_codec_get_frame(&ctx->decoder_alpha, &iter_alpha)))) {
+        uint8_t *planes[4];
+        int linesizes[4];
+
+        if (img->d_w > img->w || img->d_h > img->h) {
+            av_log(avctx, AV_LOG_ERROR, "Display dimensions %dx%d exceed storage %dx%d\n",
+                   img->d_w, img->d_h, img->w, img->h);
+            return AVERROR_EXTERNAL;
+        }
+
+        if ((ret = set_pix_fmt(avctx, img, ctx->has_alpha_channel)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d) / bit_depth (%d)\n",
+                   img->fmt, img->bit_depth);
+            return ret;
         }
 
         if ((int) img->d_w != avctx->width || (int) img->d_h != avctx->height) {
@@ -97,34 +245,37 @@ static int vp8_decode(AVCodecContext *avctx,
         }
         if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
             return ret;
-        av_image_copy(picture->data, picture->linesize, (const uint8_t **) img->planes,
-                      img->stride, avctx->pix_fmt, img->d_w, img->d_h);
-#if VPX_IMAGE_ABI_VERSION >= 4
-        switch (img->range) {
-        case VPX_CR_STUDIO_RANGE:
-            picture->color_range = AVCOL_RANGE_MPEG;
-            break;
-        case VPX_CR_FULL_RANGE:
-            picture->color_range = AVCOL_RANGE_JPEG;
-            break;
-        }
-#endif
+
+        planes[0] = img->planes[VPX_PLANE_Y];
+        planes[1] = img->planes[VPX_PLANE_U];
+        planes[2] = img->planes[VPX_PLANE_V];
+        planes[3] =
+            ctx->has_alpha_channel ? img_alpha->planes[VPX_PLANE_Y] : NULL;
+        linesizes[0] = img->stride[VPX_PLANE_Y];
+        linesizes[1] = img->stride[VPX_PLANE_U];
+        linesizes[2] = img->stride[VPX_PLANE_V];
+        linesizes[3] =
+            ctx->has_alpha_channel ? img_alpha->stride[VPX_PLANE_Y] : 0;
+        av_image_copy(picture->data, picture->linesize, (const uint8_t**)planes,
+                      linesizes, avctx->pix_fmt, img->d_w, img->d_h);
         *got_frame           = 1;
     }
     return avpkt->size;
 }
 
-static av_cold int vp8_free(AVCodecContext *avctx)
+static av_cold int vpx_free(AVCodecContext *avctx)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     vpx_codec_destroy(&ctx->decoder);
+    if (ctx->has_alpha_channel)
+        vpx_codec_destroy(&ctx->decoder_alpha);
     return 0;
 }
 
 #if CONFIG_LIBVPX_VP8_DECODER
 static av_cold int vp8_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp8_dx_algo);
+    return vpx_init(avctx, &vpx_codec_vp8_dx_algo, 0);
 }
 
 AVCodec ff_libvpx_vp8_decoder = {
@@ -132,10 +283,10 @@ AVCodec ff_libvpx_vp8_decoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("libvpx VP8"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VP8,
-    .priv_data_size = sizeof(VP8Context),
+    .priv_data_size = sizeof(VPxContext),
     .init           = vp8_init,
-    .close          = vp8_free,
-    .decode         = vp8_decode,
+    .close          = vpx_free,
+    .decode         = vpx_decode,
     .capabilities   = AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_DR1,
     .wrapper_name   = "libvpx",
 };
@@ -144,7 +295,7 @@ AVCodec ff_libvpx_vp8_decoder = {
 #if CONFIG_LIBVPX_VP9_DECODER
 static av_cold int vp9_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp9_dx_algo);
+    return vpx_init(avctx, &vpx_codec_vp9_dx_algo, 0);
 }
 
 AVCodec ff_libvpx_vp9_decoder = {
@@ -152,11 +303,13 @@ AVCodec ff_libvpx_vp9_decoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("libvpx VP9"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VP9,
-    .priv_data_size = sizeof(VP8Context),
+    .priv_data_size = sizeof(VPxContext),
     .init           = vp9_init,
-    .close          = vp8_free,
-    .decode         = vp8_decode,
-    .capabilities   = AV_CODEC_CAP_AUTO_THREADS,
+    .close          = vpx_free,
+    .decode         = vpx_decode,
+    .capabilities   = AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_DR1,
+    .init_static_data = ff_vp9_init_static,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
     .wrapper_name   = "libvpx",
 };
 #endif /* CONFIG_LIBVPX_VP9_DECODER */
diff --git a/libavcodec/libvpxenc.c b/libavcodec/libvpxenc.c
index 74d5a0c..c823b8a 100644
--- a/libavcodec/libvpxenc.c
+++ b/libavcodec/libvpxenc.c
@@ -1,26 +1,26 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * VP8 encoder support via libvpx
+ * VP8/9 encoder support via libvpx
  */
 
 #define VPX_DISABLE_CTRL_TYPECHECKS 1
@@ -30,9 +30,14 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libvpx.h"
+#include "profiles.h"
+#include "libavutil/avstring.h"
 #include "libavutil/base64.h"
 #include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
 
@@ -43,50 +48,119 @@
 struct FrameListData {
     void *buf;                       /**< compressed data buffer */
     size_t sz;                       /**< length of compressed data */
+    void *buf_alpha;
+    size_t sz_alpha;
     int64_t pts;                     /**< time stamp to show frame
                                           (in timebase units) */
     unsigned long duration;          /**< duration to show frame
                                           (in timebase units) */
     uint32_t flags;                  /**< flags for this frame */
+    uint64_t sse[4];
+    int have_sse;                    /**< true if we have pending sse[] */
+    uint64_t frame_number;
     struct FrameListData *next;
 };
 
-typedef struct VP8EncoderContext {
+typedef struct VPxEncoderContext {
     AVClass *class;
     struct vpx_codec_ctx encoder;
     struct vpx_image rawimg;
+    struct vpx_codec_ctx encoder_alpha;
+    struct vpx_image rawimg_alpha;
+    uint8_t is_alpha;
     struct vpx_fixed_buf twopass_stats;
-    unsigned long deadline; //i.e., RT/GOOD/BEST
+    int deadline; //i.e., RT/GOOD/BEST
+    uint64_t sse[4];
+    int have_sse; /**< true if we have pending sse[] */
+    uint64_t frame_number;
     struct FrameListData *coded_frame_list;
+
     int cpu_used;
+    int sharpness;
+    /**
+     * VP8 specific flags, see VP8F_* below.
+     */
+    int flags;
+#define VP8F_ERROR_RESILIENT 0x00000001 ///< Enable measures appropriate for streaming over lossy links
+#define VP8F_AUTO_ALT_REF    0x00000002 ///< Enable automatic alternate reference frame generation
+
     int auto_alt_ref;
+
     int arnr_max_frames;
     int arnr_strength;
     int arnr_type;
+
+    int tune;
+
     int lag_in_frames;
     int error_resilient;
     int crf;
     int static_thresh;
+    int max_intra_rate;
+    int rc_undershoot_pct;
+    int rc_overshoot_pct;
+
+    char *vp8_ts_parameters;
+
+    // VP9-only
+    int lossless;
+    int tile_columns;
+    int tile_rows;
+    int frame_parallel;
+    int aq_mode;
     int drop_threshold;
     int noise_sensitivity;
-} VP8Context;
+    int vpx_cs;
+    float level;
+    int row_mt;
+    int tune_content;
+    int corpus_complexity;
+    int tpl_model;
+} VPxContext;
 
 /** String mappings for enum vp8e_enc_control_id */
 static const char *const ctlidstr[] = {
-    [VP8E_SET_ARNR_MAXFRAMES]    = "VP8E_SET_ARNR_MAXFRAMES",
-    [VP8E_SET_ARNR_STRENGTH]     = "VP8E_SET_ARNR_STRENGTH",
-    [VP8E_SET_ARNR_TYPE]         = "VP8E_SET_ARNR_TYPE",
     [VP8E_SET_CPUUSED]           = "VP8E_SET_CPUUSED",
-    [VP8E_SET_CQ_LEVEL]          = "VP8E_SET_CQ_LEVEL",
     [VP8E_SET_ENABLEAUTOALTREF]  = "VP8E_SET_ENABLEAUTOALTREF",
     [VP8E_SET_NOISE_SENSITIVITY] = "VP8E_SET_NOISE_SENSITIVITY",
     [VP8E_SET_STATIC_THRESHOLD]  = "VP8E_SET_STATIC_THRESHOLD",
     [VP8E_SET_TOKEN_PARTITIONS]  = "VP8E_SET_TOKEN_PARTITIONS",
+    [VP8E_SET_ARNR_MAXFRAMES]    = "VP8E_SET_ARNR_MAXFRAMES",
+    [VP8E_SET_ARNR_STRENGTH]     = "VP8E_SET_ARNR_STRENGTH",
+    [VP8E_SET_ARNR_TYPE]         = "VP8E_SET_ARNR_TYPE",
+    [VP8E_SET_TUNING]            = "VP8E_SET_TUNING",
+    [VP8E_SET_CQ_LEVEL]          = "VP8E_SET_CQ_LEVEL",
+    [VP8E_SET_MAX_INTRA_BITRATE_PCT] = "VP8E_SET_MAX_INTRA_BITRATE_PCT",
+    [VP8E_SET_SHARPNESS]               = "VP8E_SET_SHARPNESS",
+#if CONFIG_LIBVPX_VP9_ENCODER
+    [VP9E_SET_LOSSLESS]                = "VP9E_SET_LOSSLESS",
+    [VP9E_SET_TILE_COLUMNS]            = "VP9E_SET_TILE_COLUMNS",
+    [VP9E_SET_TILE_ROWS]               = "VP9E_SET_TILE_ROWS",
+    [VP9E_SET_FRAME_PARALLEL_DECODING] = "VP9E_SET_FRAME_PARALLEL_DECODING",
+    [VP9E_SET_AQ_MODE]                 = "VP9E_SET_AQ_MODE",
+    [VP9E_SET_COLOR_SPACE]             = "VP9E_SET_COLOR_SPACE",
+#if VPX_ENCODER_ABI_VERSION >= 11
+    [VP9E_SET_COLOR_RANGE]             = "VP9E_SET_COLOR_RANGE",
+#endif
+#if VPX_ENCODER_ABI_VERSION >= 12
+    [VP9E_SET_TARGET_LEVEL]            = "VP9E_SET_TARGET_LEVEL",
+    [VP9E_GET_LEVEL]                   = "VP9E_GET_LEVEL",
+#endif
+#ifdef VPX_CTRL_VP9E_SET_ROW_MT
+    [VP9E_SET_ROW_MT]                  = "VP9E_SET_ROW_MT",
+#endif
+#ifdef VPX_CTRL_VP9E_SET_TUNE_CONTENT
+    [VP9E_SET_TUNE_CONTENT]            = "VP9E_SET_TUNE_CONTENT",
+#endif
+#ifdef VPX_CTRL_VP9E_SET_TPL
+    [VP9E_SET_TPL]                     = "VP9E_SET_TPL",
+#endif
+#endif
 };
 
 static av_cold void log_encoder_error(AVCodecContext *avctx, const char *desc)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     const char *error  = vpx_codec_error(&ctx->encoder);
     const char *detail = vpx_codec_error_detail(&ctx->encoder);
 
@@ -100,23 +174,31 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
 {
     int width = -30;
     int level = AV_LOG_DEBUG;
+    int i;
 
     av_log(avctx, level, "vpx_codec_enc_cfg\n");
     av_log(avctx, level, "generic settings\n"
            "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
+#if CONFIG_LIBVPX_VP9_ENCODER
+           "  %*s%u\n  %*s%u\n"
+#endif
            "  %*s{%u/%u}\n  %*s%u\n  %*s%d\n  %*s%u\n",
            width, "g_usage:",           cfg->g_usage,
            width, "g_threads:",         cfg->g_threads,
            width, "g_profile:",         cfg->g_profile,
            width, "g_w:",               cfg->g_w,
            width, "g_h:",               cfg->g_h,
+#if CONFIG_LIBVPX_VP9_ENCODER
+           width, "g_bit_depth:",       cfg->g_bit_depth,
+           width, "g_input_bit_depth:", cfg->g_input_bit_depth,
+#endif
            width, "g_timebase:",        cfg->g_timebase.num, cfg->g_timebase.den,
            width, "g_error_resilient:", cfg->g_error_resilient,
            width, "g_pass:",            cfg->g_pass,
            width, "g_lag_in_frames:",   cfg->g_lag_in_frames);
     av_log(avctx, level, "rate control settings\n"
            "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
-           "  %*s%d\n  %*s%p(%zu)\n  %*s%u\n",
+           "  %*s%d\n  %*s%p(%"SIZE_SPECIFIER")\n  %*s%u\n",
            width, "rc_dropframe_thresh:",   cfg->rc_dropframe_thresh,
            width, "rc_resize_allowed:",     cfg->rc_resize_allowed,
            width, "rc_resize_up_thresh:",   cfg->rc_resize_up_thresh,
@@ -132,6 +214,25 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
            "  %*s%u\n  %*s%u\n",
            width, "rc_undershoot_pct:", cfg->rc_undershoot_pct,
            width, "rc_overshoot_pct:",  cfg->rc_overshoot_pct);
+    av_log(avctx, level, "temporal layering settings\n"
+           "  %*s%u\n", width, "ts_number_layers:", cfg->ts_number_layers);
+    av_log(avctx, level,
+           "\n  %*s", width, "ts_target_bitrate:");
+    for (i = 0; i < VPX_TS_MAX_LAYERS; i++)
+        av_log(avctx, level, "%u ", cfg->ts_target_bitrate[i]);
+    av_log(avctx, level, "\n");
+    av_log(avctx, level,
+           "\n  %*s", width, "ts_rate_decimator:");
+    for (i = 0; i < VPX_TS_MAX_LAYERS; i++)
+        av_log(avctx, level, "%u ", cfg->ts_rate_decimator[i]);
+    av_log(avctx, level, "\n");
+    av_log(avctx, level,
+           "\n  %*s%u\n", width, "ts_periodicity:", cfg->ts_periodicity);
+    av_log(avctx, level,
+           "\n  %*s", width, "ts_layer_id:");
+    for (i = 0; i < VPX_TS_MAX_PERIODICITY; i++)
+        av_log(avctx, level, "%u ", cfg->ts_layer_id[i]);
+    av_log(avctx, level, "\n");
     av_log(avctx, level, "decoder buffer model\n"
             "  %*s%u\n  %*s%u\n  %*s%u\n",
             width, "rc_buf_sz:",         cfg->rc_buf_sz,
@@ -142,6 +243,10 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
            width, "rc_2pass_vbr_bias_pct:",       cfg->rc_2pass_vbr_bias_pct,
            width, "rc_2pass_vbr_minsection_pct:", cfg->rc_2pass_vbr_minsection_pct,
            width, "rc_2pass_vbr_maxsection_pct:", cfg->rc_2pass_vbr_maxsection_pct);
+#if VPX_ENCODER_ABI_VERSION >= 14
+    av_log(avctx, level, "  %*s%u\n",
+           width, "rc_2pass_vbr_corpus_complexity:", cfg->rc_2pass_vbr_corpus_complexity);
+#endif
     av_log(avctx, level, "keyframing settings\n"
            "  %*s%d\n  %*s%u\n  %*s%u\n",
            width, "kf_mode:",     cfg->kf_mode,
@@ -163,6 +268,8 @@ static void coded_frame_add(void *list, struct FrameListData *cx_frame)
 static av_cold void free_coded_frame(struct FrameListData *cx_frame)
 {
     av_freep(&cx_frame->buf);
+    if (cx_frame->buf_alpha)
+        av_freep(&cx_frame->buf_alpha);
     av_freep(&cx_frame);
 }
 
@@ -180,7 +287,7 @@ static av_cold void free_frame_list(struct FrameListData *list)
 static av_cold int codecctl_int(AVCodecContext *avctx,
                                 enum vp8e_enc_control_id id, int val)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     char buf[80];
     int width = -30;
     int res;
@@ -198,43 +305,259 @@ static av_cold int codecctl_int(AVCodecContext *avctx,
     return res == VPX_CODEC_OK ? 0 : AVERROR(EINVAL);
 }
 
-static av_cold int vp8_free(AVCodecContext *avctx)
+#if VPX_ENCODER_ABI_VERSION >= 12
+static av_cold int codecctl_intp(AVCodecContext *avctx,
+                                 enum vp8e_enc_control_id id, int *val)
+{
+    VPxContext *ctx = avctx->priv_data;
+    char buf[80];
+    int width = -30;
+    int res;
+
+    snprintf(buf, sizeof(buf), "%s:", ctlidstr[id]);
+    av_log(avctx, AV_LOG_DEBUG, "  %*s%d\n", width, buf, *val);
+
+    res = vpx_codec_control(&ctx->encoder, id, val);
+    if (res != VPX_CODEC_OK) {
+        snprintf(buf, sizeof(buf), "Failed to set %s codec control",
+                 ctlidstr[id]);
+        log_encoder_error(avctx, buf);
+    }
+
+    return res == VPX_CODEC_OK ? 0 : AVERROR(EINVAL);
+}
+#endif
+
+static av_cold int vpx_free(AVCodecContext *avctx)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
+
+#if VPX_ENCODER_ABI_VERSION >= 12
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && ctx->level >= 0 &&
+        !(avctx->flags & AV_CODEC_FLAG_PASS1)) {
+        int level_out = 0;
+        if (!codecctl_intp(avctx, VP9E_GET_LEVEL, &level_out))
+            av_log(avctx, AV_LOG_INFO, "Encoded level %.1f\n", level_out * 0.1);
+    }
+#endif
 
     vpx_codec_destroy(&ctx->encoder);
+    if (ctx->is_alpha)
+        vpx_codec_destroy(&ctx->encoder_alpha);
     av_freep(&ctx->twopass_stats.buf);
     av_freep(&avctx->stats_out);
     free_frame_list(ctx->coded_frame_list);
     return 0;
 }
 
+static void vp8_ts_parse_int_array(int *dest, char *value, size_t value_len, int max_entries)
+{
+    int dest_idx = 0;
+    char *saveptr = NULL;
+    char *token = av_strtok(value, ",", &saveptr);
+
+    while (token && dest_idx < max_entries) {
+        dest[dest_idx++] = strtoul(token, NULL, 10);
+        token = av_strtok(NULL, ",", &saveptr);
+    }
+}
+
+static int vp8_ts_param_parse(struct vpx_codec_enc_cfg *enccfg, char *key, char *value)
+{
+    size_t value_len = strlen(value);
+
+    if (!value_len)
+        return -1;
+
+    if (!strcmp(key, "ts_number_layers"))
+        enccfg->ts_number_layers = strtoul(value, &value, 10);
+    else if (!strcmp(key, "ts_target_bitrate"))
+        vp8_ts_parse_int_array(enccfg->ts_target_bitrate, value, value_len, VPX_TS_MAX_LAYERS);
+    else if (!strcmp(key, "ts_rate_decimator"))
+      vp8_ts_parse_int_array(enccfg->ts_rate_decimator, value, value_len, VPX_TS_MAX_LAYERS);
+    else if (!strcmp(key, "ts_periodicity"))
+        enccfg->ts_periodicity = strtoul(value, &value, 10);
+    else if (!strcmp(key, "ts_layer_id"))
+        vp8_ts_parse_int_array(enccfg->ts_layer_id, value, value_len, VPX_TS_MAX_PERIODICITY);
+
+    return 0;
+}
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static int set_pix_fmt(AVCodecContext *avctx, vpx_codec_caps_t codec_caps,
+                       struct vpx_codec_enc_cfg *enccfg, vpx_codec_flags_t *flags,
+                       vpx_img_fmt_t *img_fmt)
+{
+    VPxContext av_unused *ctx = avctx->priv_data;
+    enccfg->g_bit_depth = enccfg->g_input_bit_depth = 8;
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUVA420P:
+        enccfg->g_profile = 0;
+        *img_fmt = VPX_IMG_FMT_I420;
+        return 0;
+    case AV_PIX_FMT_YUV422P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I422;
+        return 0;
+    case AV_PIX_FMT_YUV440P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I440;
+        return 0;
+    case AV_PIX_FMT_GBRP:
+        ctx->vpx_cs = VPX_CS_SRGB;
+    case AV_PIX_FMT_YUV444P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I444;
+        return 0;
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV420P10 ? 10 : 12;
+            enccfg->g_profile = 2;
+            *img_fmt = VPX_IMG_FMT_I42016;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV422P10 ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I42216;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_YUV440P10:
+    case AV_PIX_FMT_YUV440P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV440P10 ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I44016;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+        ctx->vpx_cs = VPX_CS_SRGB;
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV444P10 ||
+                avctx->pix_fmt == AV_PIX_FMT_GBRP10 ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I44416;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    default:
+        break;
+    }
+    av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format.\n");
+    return AVERROR_INVALIDDATA;
+}
+
+static void set_colorspace(AVCodecContext *avctx)
+{
+    enum vpx_color_space vpx_cs;
+    VPxContext *ctx = avctx->priv_data;
+
+    if (ctx->vpx_cs) {
+        vpx_cs = ctx->vpx_cs;
+    } else {
+        switch (avctx->colorspace) {
+        case AVCOL_SPC_RGB:         vpx_cs = VPX_CS_SRGB;      break;
+        case AVCOL_SPC_BT709:       vpx_cs = VPX_CS_BT_709;    break;
+        case AVCOL_SPC_UNSPECIFIED: vpx_cs = VPX_CS_UNKNOWN;   break;
+        case AVCOL_SPC_RESERVED:    vpx_cs = VPX_CS_RESERVED;  break;
+        case AVCOL_SPC_BT470BG:     vpx_cs = VPX_CS_BT_601;    break;
+        case AVCOL_SPC_SMPTE170M:   vpx_cs = VPX_CS_SMPTE_170; break;
+        case AVCOL_SPC_SMPTE240M:   vpx_cs = VPX_CS_SMPTE_240; break;
+        case AVCOL_SPC_BT2020_NCL:  vpx_cs = VPX_CS_BT_2020;   break;
+        default:
+            av_log(avctx, AV_LOG_WARNING, "Unsupported colorspace (%d)\n",
+                   avctx->colorspace);
+            return;
+        }
+    }
+    codecctl_int(avctx, VP9E_SET_COLOR_SPACE, vpx_cs);
+}
+
+#if VPX_ENCODER_ABI_VERSION >= 11
+static void set_color_range(AVCodecContext *avctx)
+{
+    enum vpx_color_range vpx_cr;
+    switch (avctx->color_range) {
+    case AVCOL_RANGE_UNSPECIFIED:
+    case AVCOL_RANGE_MPEG:       vpx_cr = VPX_CR_STUDIO_RANGE; break;
+    case AVCOL_RANGE_JPEG:       vpx_cr = VPX_CR_FULL_RANGE;   break;
+    default:
+        av_log(avctx, AV_LOG_WARNING, "Unsupported color range (%d)\n",
+               avctx->color_range);
+        return;
+    }
+
+    codecctl_int(avctx, VP9E_SET_COLOR_RANGE, vpx_cr);
+}
+#endif
+#endif
+
 static av_cold int vpx_init(AVCodecContext *avctx,
                             const struct vpx_codec_iface *iface)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     struct vpx_codec_enc_cfg enccfg = { 0 };
+    struct vpx_codec_enc_cfg enccfg_alpha;
+    vpx_codec_flags_t flags = (avctx->flags & AV_CODEC_FLAG_PSNR) ? VPX_CODEC_USE_PSNR : 0;
     AVCPBProperties *cpb_props;
     int res;
+    vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    vpx_codec_caps_t codec_caps = vpx_codec_get_caps(iface);
+#endif
 
     av_log(avctx, AV_LOG_INFO, "%s\n", vpx_codec_version_str());
     av_log(avctx, AV_LOG_VERBOSE, "%s\n", vpx_codec_build_config());
 
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P)
+        ctx->is_alpha = 1;
+
     if ((res = vpx_codec_enc_config_default(iface, &enccfg, 0)) != VPX_CODEC_OK) {
         av_log(avctx, AV_LOG_ERROR, "Failed to get config: %s\n",
                vpx_codec_err_to_string(res));
         return AVERROR(EINVAL);
     }
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+        if (set_pix_fmt(avctx, codec_caps, &enccfg, &flags, &img_fmt))
+            return AVERROR(EINVAL);
+    }
+#endif
+
+    if(!avctx->bit_rate)
+        if(avctx->rc_max_rate || avctx->rc_buffer_size || avctx->rc_initial_buffer_occupancy) {
+            av_log( avctx, AV_LOG_ERROR, "Rate control parameters set without a bitrate\n");
+            return AVERROR(EINVAL);
+        }
+
     dump_enc_cfg(avctx, &enccfg);
 
     enccfg.g_w            = avctx->width;
     enccfg.g_h            = avctx->height;
     enccfg.g_timebase.num = avctx->time_base.num;
     enccfg.g_timebase.den = avctx->time_base.den;
-    enccfg.g_threads      = avctx->thread_count;
-
-    if (ctx->lag_in_frames >= 0)
-        enccfg.g_lag_in_frames = ctx->lag_in_frames;
+    enccfg.g_threads      =
+        FFMIN(avctx->thread_count ? avctx->thread_count : av_cpu_count(), 16);
+    enccfg.g_lag_in_frames= ctx->lag_in_frames;
 
     if (avctx->flags & AV_CODEC_FLAG_PASS1)
         enccfg.g_pass = VPX_RC_FIRST_PASS;
@@ -243,22 +566,56 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     else
         enccfg.g_pass = VPX_RC_ONE_PASS;
 
-    if (!avctx->bit_rate)
-        avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
-    else
+    if (avctx->rc_min_rate == avctx->rc_max_rate &&
+        avctx->rc_min_rate == avctx->bit_rate && avctx->bit_rate) {
+        enccfg.rc_end_usage = VPX_CBR;
+    } else if (ctx->crf >= 0) {
+        enccfg.rc_end_usage = VPX_CQ;
+#if CONFIG_LIBVPX_VP9_ENCODER
+        if (!avctx->bit_rate && avctx->codec_id == AV_CODEC_ID_VP9)
+            enccfg.rc_end_usage = VPX_Q;
+#endif
+    }
+
+    if (avctx->bit_rate) {
         enccfg.rc_target_bitrate = av_rescale_rnd(avctx->bit_rate, 1, 1000,
-                                              AV_ROUND_NEAR_INF);
+                                                  AV_ROUND_NEAR_INF);
+#if CONFIG_LIBVPX_VP9_ENCODER
+    } else if (enccfg.rc_end_usage == VPX_Q) {
+#endif
+    } else {
+        if (enccfg.rc_end_usage == VPX_CQ) {
+            enccfg.rc_target_bitrate = 1000000;
+        } else {
+            avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
+            av_log(avctx, AV_LOG_WARNING,
+                   "Neither bitrate nor constrained quality specified, using default bitrate of %dkbit/sec\n",
+                   enccfg.rc_target_bitrate);
+        }
+    }
 
-    if (ctx->crf)
-        enccfg.rc_end_usage = VPX_CQ;
-    else if (avctx->rc_min_rate == avctx->rc_max_rate &&
-             avctx->rc_min_rate == avctx->bit_rate)
-        enccfg.rc_end_usage = VPX_CBR;
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && ctx->lossless == 1) {
+        enccfg.rc_min_quantizer =
+        enccfg.rc_max_quantizer = 0;
+    } else {
+        if (avctx->qmin >= 0)
+            enccfg.rc_min_quantizer = avctx->qmin;
+        if (avctx->qmax >= 0)
+            enccfg.rc_max_quantizer = avctx->qmax;
+    }
 
-    if (avctx->qmin > 0)
-        enccfg.rc_min_quantizer = avctx->qmin;
-    if (avctx->qmax > 0)
-        enccfg.rc_max_quantizer = avctx->qmax;
+    if (enccfg.rc_end_usage == VPX_CQ
+#if CONFIG_LIBVPX_VP9_ENCODER
+        || enccfg.rc_end_usage == VPX_Q
+#endif
+       ) {
+        if (ctx->crf < enccfg.rc_min_quantizer || ctx->crf > enccfg.rc_max_quantizer) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "CQ level %d must be between minimum and maximum quantizer value (%d-%d)\n",
+                   ctx->crf, enccfg.rc_min_quantizer, enccfg.rc_max_quantizer);
+            return AVERROR(EINVAL);
+        }
+    }
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -269,12 +626,21 @@ FF_ENABLE_DEPRECATION_WARNINGS
     enccfg.rc_dropframe_thresh = ctx->drop_threshold;
 
     //0-100 (0 => CBR, 100 => VBR)
-    enccfg.rc_2pass_vbr_bias_pct           = round(avctx->qcompress * 100);
-    enccfg.rc_2pass_vbr_minsection_pct     =
-        avctx->rc_min_rate * 100LL / avctx->bit_rate;
+    enccfg.rc_2pass_vbr_bias_pct           = lrint(avctx->qcompress * 100);
+    if (avctx->bit_rate)
+        enccfg.rc_2pass_vbr_minsection_pct =
+            avctx->rc_min_rate * 100LL / avctx->bit_rate;
     if (avctx->rc_max_rate)
         enccfg.rc_2pass_vbr_maxsection_pct =
             avctx->rc_max_rate * 100LL / avctx->bit_rate;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+#if VPX_ENCODER_ABI_VERSION >= 14
+        if (ctx->corpus_complexity >= 0)
+            enccfg.rc_2pass_vbr_corpus_complexity = ctx->corpus_complexity;
+#endif
+    }
+#endif
 
     if (avctx->rc_buffer_size)
         enccfg.rc_buf_sz         =
@@ -283,6 +649,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
         enccfg.rc_buf_initial_sz =
             avctx->rc_initial_buffer_occupancy * 1000LL / avctx->bit_rate;
     enccfg.rc_buf_optimal_sz     = enccfg.rc_buf_sz * 5 / 6;
+    if (ctx->rc_undershoot_pct >= 0)
+        enccfg.rc_undershoot_pct = ctx->rc_undershoot_pct;
+    if (ctx->rc_overshoot_pct >= 0)
+        enccfg.rc_overshoot_pct = ctx->rc_overshoot_pct;
 
     //_enc_init() will balk if kf_min_dist differs from max w/VPX_KF_AUTO
     if (avctx->keyint_min >= 0 && avctx->keyint_min == avctx->gop_size)
@@ -304,8 +674,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
         ret = av_reallocp(&ctx->twopass_stats.buf, ctx->twopass_stats.sz);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "Stat buffer alloc (%zu bytes) failed\n",
+                   "Stat buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                    ctx->twopass_stats.sz);
+            ctx->twopass_stats.sz = 0;
             return ret;
         }
         decode_size = av_base64_decode(ctx->twopass_stats.buf, avctx->stats_in,
@@ -324,35 +695,68 @@ FF_ENABLE_DEPRECATION_WARNINGS
        quality. */
     if (avctx->profile != FF_PROFILE_UNKNOWN)
         enccfg.g_profile = avctx->profile;
-    else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
-        avctx->profile = enccfg.g_profile = FF_PROFILE_VP9_0;
-    else
-        avctx->profile = enccfg.g_profile = FF_PROFILE_VP9_1;
 
-    enccfg.g_error_resilient = ctx->error_resilient;
+    enccfg.g_error_resilient = ctx->error_resilient || ctx->flags & VP8F_ERROR_RESILIENT;
+
+    if (CONFIG_LIBVPX_VP8_ENCODER && avctx->codec_id == AV_CODEC_ID_VP8 && ctx->vp8_ts_parameters) {
+        AVDictionary *dict    = NULL;
+        AVDictionaryEntry* en = NULL;
+
+        if (!av_dict_parse_string(&dict, ctx->vp8_ts_parameters, "=", ":", 0)) {
+            while ((en = av_dict_get(dict, "", en, AV_DICT_IGNORE_SUFFIX))) {
+                if (vp8_ts_param_parse(&enccfg, en->key, en->value) < 0)
+                    av_log(avctx, AV_LOG_WARNING,
+                           "Error parsing option '%s = %s'.\n",
+                           en->key, en->value);
+            }
+
+            av_dict_free(&dict);
+        }
+    }
 
     dump_enc_cfg(avctx, &enccfg);
     /* Construct Encoder Context */
-    res = vpx_codec_enc_init(&ctx->encoder, iface, &enccfg, 0);
+    res = vpx_codec_enc_init(&ctx->encoder, iface, &enccfg, flags);
     if (res != VPX_CODEC_OK) {
         log_encoder_error(avctx, "Failed to initialize encoder");
         return AVERROR(EINVAL);
     }
 
+    if (ctx->is_alpha) {
+        enccfg_alpha = enccfg;
+        res = vpx_codec_enc_init(&ctx->encoder_alpha, iface, &enccfg_alpha, flags);
+        if (res != VPX_CODEC_OK) {
+            log_encoder_error(avctx, "Failed to initialize alpha encoder");
+            return AVERROR(EINVAL);
+        }
+    }
+
     //codec control failures are currently treated only as warnings
     av_log(avctx, AV_LOG_DEBUG, "vpx_codec_control\n");
-    if (ctx->cpu_used != INT_MIN)
-        codecctl_int(avctx, VP8E_SET_CPUUSED,          ctx->cpu_used);
+    codecctl_int(avctx, VP8E_SET_CPUUSED,          ctx->cpu_used);
+    if (ctx->flags & VP8F_AUTO_ALT_REF)
+        ctx->auto_alt_ref = 1;
     if (ctx->auto_alt_ref >= 0)
-        codecctl_int(avctx, VP8E_SET_ENABLEAUTOALTREF, ctx->auto_alt_ref);
+        codecctl_int(avctx, VP8E_SET_ENABLEAUTOALTREF,
+                     avctx->codec_id == AV_CODEC_ID_VP8 ? !!ctx->auto_alt_ref : ctx->auto_alt_ref);
     if (ctx->arnr_max_frames >= 0)
         codecctl_int(avctx, VP8E_SET_ARNR_MAXFRAMES,   ctx->arnr_max_frames);
     if (ctx->arnr_strength >= 0)
         codecctl_int(avctx, VP8E_SET_ARNR_STRENGTH,    ctx->arnr_strength);
     if (ctx->arnr_type >= 0)
         codecctl_int(avctx, VP8E_SET_ARNR_TYPE,        ctx->arnr_type);
+    if (ctx->tune >= 0)
+        codecctl_int(avctx, VP8E_SET_TUNING,           ctx->tune);
 
-    if (CONFIG_LIBVPX_VP8_ENCODER && iface == &vpx_codec_vp8_cx_algo) {
+    if (ctx->auto_alt_ref && ctx->is_alpha && avctx->codec_id == AV_CODEC_ID_VP8) {
+        av_log(avctx, AV_LOG_ERROR, "Transparency encoding with auto_alt_ref does not work\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (ctx->sharpness >= 0)
+        codecctl_int(avctx, VP8E_SET_SHARPNESS, ctx->sharpness);
+
+    if (CONFIG_LIBVPX_VP8_ENCODER && avctx->codec_id == AV_CODEC_ID_VP8) {
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
         if (avctx->noise_reduction)
@@ -363,11 +767,58 @@ FF_ENABLE_DEPRECATION_WARNINGS
         codecctl_int(avctx, VP8E_SET_TOKEN_PARTITIONS,  av_log2(avctx->slices));
     }
     codecctl_int(avctx, VP8E_SET_STATIC_THRESHOLD,  ctx->static_thresh);
-    codecctl_int(avctx, VP8E_SET_CQ_LEVEL,          ctx->crf);
+    if (ctx->crf >= 0)
+        codecctl_int(avctx, VP8E_SET_CQ_LEVEL,          ctx->crf);
+    if (ctx->max_intra_rate >= 0)
+        codecctl_int(avctx, VP8E_SET_MAX_INTRA_BITRATE_PCT, ctx->max_intra_rate);
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+        if (ctx->lossless >= 0)
+            codecctl_int(avctx, VP9E_SET_LOSSLESS, ctx->lossless);
+        if (ctx->tile_columns >= 0)
+            codecctl_int(avctx, VP9E_SET_TILE_COLUMNS, ctx->tile_columns);
+        if (ctx->tile_rows >= 0)
+            codecctl_int(avctx, VP9E_SET_TILE_ROWS, ctx->tile_rows);
+        if (ctx->frame_parallel >= 0)
+            codecctl_int(avctx, VP9E_SET_FRAME_PARALLEL_DECODING, ctx->frame_parallel);
+        if (ctx->aq_mode >= 0)
+            codecctl_int(avctx, VP9E_SET_AQ_MODE, ctx->aq_mode);
+        set_colorspace(avctx);
+#if VPX_ENCODER_ABI_VERSION >= 11
+        set_color_range(avctx);
+#endif
+#if VPX_ENCODER_ABI_VERSION >= 12
+        codecctl_int(avctx, VP9E_SET_TARGET_LEVEL, ctx->level < 0 ? 255 : lrint(ctx->level * 10));
+#endif
+#ifdef VPX_CTRL_VP9E_SET_ROW_MT
+        if (ctx->row_mt >= 0)
+            codecctl_int(avctx, VP9E_SET_ROW_MT, ctx->row_mt);
+#endif
+#ifdef VPX_CTRL_VP9E_SET_TUNE_CONTENT
+        if (ctx->tune_content >= 0)
+            codecctl_int(avctx, VP9E_SET_TUNE_CONTENT, ctx->tune_content);
+#endif
+#ifdef VPX_CTRL_VP9E_SET_TPL
+        if (ctx->tpl_model >= 0)
+            codecctl_int(avctx, VP9E_SET_TPL, ctx->tpl_model);
+#endif
+    }
+#endif
+
+    av_log(avctx, AV_LOG_DEBUG, "Using deadline: %d\n", ctx->deadline);
 
     //provide dummy value to initialize wrapper, values will be updated each _encode()
-    vpx_img_wrap(&ctx->rawimg, ff_vpx_pixfmt_to_imgfmt(avctx->pix_fmt),
-                 avctx->width, avctx->height, 1, (unsigned char *)1);
+    vpx_img_wrap(&ctx->rawimg, img_fmt, avctx->width, avctx->height, 1,
+                 (unsigned char*)1);
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH))
+        ctx->rawimg.bit_depth = enccfg.g_bit_depth;
+#endif
+
+    if (ctx->is_alpha)
+        vpx_img_wrap(&ctx->rawimg_alpha, VPX_IMG_FMT_I420, avctx->width, avctx->height, 1,
+                     (unsigned char*)1);
 
     cpb_props = ff_add_cpb_side_data(avctx);
     if (!cpb_props)
@@ -385,13 +836,38 @@ FF_ENABLE_DEPRECATION_WARNINGS
 }
 
 static inline void cx_pktcpy(struct FrameListData *dst,
-                             const struct vpx_codec_cx_pkt *src)
+                             const struct vpx_codec_cx_pkt *src,
+                             const struct vpx_codec_cx_pkt *src_alpha,
+                             VPxContext *ctx)
 {
     dst->pts      = src->data.frame.pts;
     dst->duration = src->data.frame.duration;
     dst->flags    = src->data.frame.flags;
     dst->sz       = src->data.frame.sz;
     dst->buf      = src->data.frame.buf;
+    dst->have_sse = 0;
+    /* For alt-ref frame, don't store PSNR or increment frame_number */
+    if (!(dst->flags & VPX_FRAME_IS_INVISIBLE)) {
+        dst->frame_number = ++ctx->frame_number;
+        dst->have_sse = ctx->have_sse;
+        if (ctx->have_sse) {
+            /* associate last-seen SSE to the frame. */
+            /* Transfers ownership from ctx to dst. */
+            /* WARNING! This makes the assumption that PSNR_PKT comes
+               just before the frame it refers to! */
+            memcpy(dst->sse, ctx->sse, sizeof(dst->sse));
+            ctx->have_sse = 0;
+        }
+    } else {
+        dst->frame_number = -1;   /* sanity marker */
+    }
+    if (src_alpha) {
+        dst->buf_alpha = src_alpha->data.frame.buf;
+        dst->sz_alpha = src_alpha->data.frame.sz;
+    } else {
+        dst->buf_alpha = NULL;
+        dst->sz_alpha = 0;
+    }
 }
 
 /**
@@ -404,8 +880,10 @@ static inline void cx_pktcpy(struct FrameListData *dst,
 static int storeframe(AVCodecContext *avctx, struct FrameListData *cx_frame,
                       AVPacket *pkt)
 {
-    int ret = ff_alloc_packet(pkt, cx_frame->sz);
+    int ret = ff_alloc_packet2(avctx, pkt, cx_frame->sz, 0);
+    uint8_t *side_data;
     if (ret >= 0) {
+        int pict_type;
         memcpy(pkt->data, cx_frame->buf, pkt->size);
         pkt->pts = pkt->dts = cx_frame->pts;
 #if FF_API_CODED_FRAME
@@ -416,22 +894,54 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
         if (!!(cx_frame->flags & VPX_FRAME_IS_KEY)) {
+            pict_type = AV_PICTURE_TYPE_I;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+            avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
             pkt->flags |= AV_PKT_FLAG_KEY;
         } else {
+            pict_type = AV_PICTURE_TYPE_P;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+            avctx->coded_frame->pict_type = pict_type;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+        }
+
+        ff_side_data_set_encoder_stats(pkt, 0, cx_frame->sse + 1,
+                                       cx_frame->have_sse ? 3 : 0, pict_type);
+
+        if (cx_frame->have_sse) {
+            int i;
+            /* Beware of the Y/U/V/all order! */
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+            avctx->coded_frame->error[0] = cx_frame->sse[1];
+            avctx->coded_frame->error[1] = cx_frame->sse[2];
+            avctx->coded_frame->error[2] = cx_frame->sse[3];
+            avctx->coded_frame->error[3] = 0;    // alpha
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+            for (i = 0; i < 3; ++i) {
+                avctx->error[i] += cx_frame->sse[i + 1];
+            }
+            cx_frame->have_sse = 0;
+        }
+        if (cx_frame->sz_alpha > 0) {
+            side_data = av_packet_new_side_data(pkt,
+                                                AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+                                                cx_frame->sz_alpha + 8);
+            if(!side_data) {
+                av_packet_unref(pkt);
+                av_free(pkt);
+                return AVERROR(ENOMEM);
+            }
+            AV_WB64(side_data, 1);
+            memcpy(side_data + 8, cx_frame->buf_alpha, cx_frame->sz_alpha);
         }
     } else {
-        av_log(avctx, AV_LOG_ERROR,
-               "Error getting output packet of size %zu.\n", cx_frame->sz);
         return ret;
     }
     return pkt->size;
@@ -447,9 +957,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
  */
 static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     const struct vpx_codec_cx_pkt *pkt;
+    const struct vpx_codec_cx_pkt *pkt_alpha = NULL;
     const void *iter = NULL;
+    const void *iter_alpha = NULL;
     int size = 0;
 
     if (ctx->coded_frame_list) {
@@ -464,7 +976,9 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
     /* consume all available output from the encoder before returning. buffers
        are only good through the next vpx_codec call */
-    while ((pkt = vpx_codec_get_cx_data(&ctx->encoder, &iter))) {
+    while ((pkt = vpx_codec_get_cx_data(&ctx->encoder, &iter)) &&
+           (!ctx->is_alpha ||
+            (ctx->is_alpha && (pkt_alpha = vpx_codec_get_cx_data(&ctx->encoder_alpha, &iter_alpha))))) {
         switch (pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT:
             if (!size) {
@@ -472,8 +986,8 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
                 /* avoid storing the frame when the list is empty and we haven't yet
                    provided a frame for output */
-                assert(!ctx->coded_frame_list);
-                cx_pktcpy(&cx_frame, pkt);
+                av_assert0(!ctx->coded_frame_list);
+                cx_pktcpy(&cx_frame, pkt, pkt_alpha, ctx);
                 size = storeframe(avctx, &cx_frame, pkt_out);
                 if (size < 0)
                     return size;
@@ -486,17 +1000,28 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
                            "Frame queue element alloc failed\n");
                     return AVERROR(ENOMEM);
                 }
-                cx_pktcpy(cx_frame, pkt);
+                cx_pktcpy(cx_frame, pkt, pkt_alpha, ctx);
                 cx_frame->buf = av_malloc(cx_frame->sz);
 
                 if (!cx_frame->buf) {
                     av_log(avctx, AV_LOG_ERROR,
-                           "Data buffer alloc (%zu bytes) failed\n",
+                           "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                            cx_frame->sz);
                     av_freep(&cx_frame);
                     return AVERROR(ENOMEM);
                 }
                 memcpy(cx_frame->buf, pkt->data.frame.buf, pkt->data.frame.sz);
+                if (ctx->is_alpha) {
+                    cx_frame->buf_alpha = av_malloc(cx_frame->sz_alpha);
+                    if (!cx_frame->buf_alpha) {
+                        av_log(avctx, AV_LOG_ERROR,
+                               "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
+                               cx_frame->sz_alpha);
+                        av_free(cx_frame);
+                        return AVERROR(ENOMEM);
+                    }
+                    memcpy(cx_frame->buf_alpha, pkt_alpha->data.frame.buf, pkt_alpha->data.frame.sz);
+                }
                 coded_frame_add(&ctx->coded_frame_list, cx_frame);
             }
             break;
@@ -515,7 +1040,14 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
             stats->sz += pkt->data.twopass_stats.sz;
             break;
         }
-        case VPX_CODEC_PSNR_PKT: //FIXME add support for AV_CODEC_FLAG_PSNR
+        case VPX_CODEC_PSNR_PKT:
+            av_assert0(!ctx->have_sse);
+            ctx->sse[0] = pkt->data.psnr.sse[0];
+            ctx->sse[1] = pkt->data.psnr.sse[1];
+            ctx->sse[2] = pkt->data.psnr.sse[2];
+            ctx->sse[3] = pkt->data.psnr.sse[3];
+            ctx->have_sse = 1;
+            break;
         case VPX_CODEC_CUSTOM_PKT:
             //ignore unsupported/unrecognized packet types
             break;
@@ -525,11 +1057,12 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
     return size;
 }
 
-static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
+static int vpx_encode(AVCodecContext *avctx, AVPacket *pkt,
                       const AVFrame *frame, int *got_packet)
 {
-    VP8Context *ctx = avctx->priv_data;
+    VPxContext *ctx = avctx->priv_data;
     struct vpx_image *rawimg = NULL;
+    struct vpx_image *rawimg_alpha = NULL;
     int64_t timestamp = 0;
     int res, coded_size;
     vpx_enc_frame_flags_t flags = 0;
@@ -542,6 +1075,25 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
         rawimg->stride[VPX_PLANE_Y] = frame->linesize[0];
         rawimg->stride[VPX_PLANE_U] = frame->linesize[1];
         rawimg->stride[VPX_PLANE_V] = frame->linesize[2];
+        if (ctx->is_alpha) {
+            uint8_t *u_plane, *v_plane;
+            rawimg_alpha = &ctx->rawimg_alpha;
+            rawimg_alpha->planes[VPX_PLANE_Y] = frame->data[3];
+            u_plane = av_malloc(frame->linesize[1] * frame->height);
+            v_plane = av_malloc(frame->linesize[2] * frame->height);
+            if (!u_plane || !v_plane) {
+                av_free(u_plane);
+                av_free(v_plane);
+                return AVERROR(ENOMEM);
+            }
+            memset(u_plane, 0x80, frame->linesize[1] * frame->height);
+            rawimg_alpha->planes[VPX_PLANE_U] = u_plane;
+            memset(v_plane, 0x80, frame->linesize[2] * frame->height);
+            rawimg_alpha->planes[VPX_PLANE_V] = v_plane;
+            rawimg_alpha->stride[VPX_PLANE_Y] = frame->linesize[0];
+            rawimg_alpha->stride[VPX_PLANE_U] = frame->linesize[1];
+            rawimg_alpha->stride[VPX_PLANE_V] = frame->linesize[2];
+        }
         timestamp                   = frame->pts;
 #if VPX_IMAGE_ABI_VERSION >= 4
         switch (frame->color_range) {
@@ -555,6 +1107,12 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
 #endif
         if (frame->pict_type == AV_PICTURE_TYPE_I)
             flags |= VPX_EFLAG_FORCE_KF;
+        if (CONFIG_LIBVPX_VP8_ENCODER && avctx->codec_id == AV_CODEC_ID_VP8 && frame->metadata) {
+            AVDictionaryEntry* en = av_dict_get(frame->metadata, "vp8-flags", NULL, 0);
+            if (en) {
+                flags |= strtoul(en->value, NULL, 10);
+            }
+        }
     }
 
     res = vpx_codec_encode(&ctx->encoder, rawimg, timestamp,
@@ -563,6 +1121,16 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
         log_encoder_error(avctx, "Error encoding frame");
         return AVERROR_INVALIDDATA;
     }
+
+    if (ctx->is_alpha) {
+        res = vpx_codec_encode(&ctx->encoder_alpha, rawimg_alpha, timestamp,
+                               avctx->ticks_per_frame, flags, ctx->deadline);
+        if (res != VPX_CODEC_OK) {
+            log_encoder_error(avctx, "Error encoding alpha frame");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     coded_size = queue_frames(avctx, pkt);
 
     if (!frame && avctx->flags & AV_CODEC_FLAG_PASS1) {
@@ -578,42 +1146,124 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
                          ctx->twopass_stats.sz);
     }
 
+    if (rawimg_alpha) {
+        av_freep(&rawimg_alpha->planes[VPX_PLANE_U]);
+        av_freep(&rawimg_alpha->planes[VPX_PLANE_V]);
+    }
+
     *got_packet = !!coded_size;
     return 0;
 }
 
-#define OFFSET(x) offsetof(VP8Context, x)
+#define OFFSET(x) offsetof(VPxContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "cpu-used",        "Quality/Speed ratio modifier",           OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, INT_MIN, INT_MAX, VE},
+
+#define COMMON_OPTIONS \
+    { "lag-in-frames",   "Number of frames to look ahead for " \
+                         "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-maxframes",  "altref noise reduction max frame count", OFFSET(arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-strength",   "altref noise reduction filter strength", OFFSET(arnr_strength),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-type",       "altref noise reduction filter type",     OFFSET(arnr_type),       AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "arnr_type"}, \
+    { "backward",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "arnr_type" }, \
+    { "forward",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "arnr_type" }, \
+    { "centered",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "arnr_type" }, \
+    { "tune",            "Tune the encoding to a specific scenario", OFFSET(tune),          AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "tune"}, \
+    { "psnr",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VP8_TUNE_PSNR}, 0, 0, VE, "tune"}, \
+    { "ssim",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VP8_TUNE_SSIM}, 0, 0, VE, "tune"}, \
+    { "deadline",        "Time to spend encoding, in microseconds.", OFFSET(deadline),      AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"}, \
+    { "best",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_BEST_QUALITY}, 0, 0, VE, "quality"}, \
+    { "good",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_GOOD_QUALITY}, 0, 0, VE, "quality"}, \
+    { "realtime",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_REALTIME},     0, 0, VE, "quality"}, \
+    { "error-resilient", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"}, \
+    { "max-intra-rate",  "Maximum I-frame bitrate (pct) 0=unlimited",  OFFSET(max_intra_rate),  AV_OPT_TYPE_INT,  {.i64 = -1}, -1,      INT_MAX, VE}, \
+    { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"}, \
+    { "partitions",      "The frame partitions are independently decodable " \
+                         "by the bool decoder, meaning that partitions can be decoded even " \
+                         "though earlier partitions have been lost. Note that intra predicition" \
+                         " is still done over the partition boundary.",       0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_PARTITIONS}, 0, 0, VE, "er"}, \
+    { "crf",              "Select the quality for constant quality mode", offsetof(VPxContext, crf), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, VE }, \
+    { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE }, \
+    { "drop-threshold",   "Frame drop threshold", offsetof(VPxContext, drop_threshold), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, VE }, \
+    { "noise-sensitivity", "Noise sensitivity", OFFSET(noise_sensitivity), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 4, VE}, \
+    { "undershoot-pct",  "Datarate undershoot (min) target (%)", OFFSET(rc_undershoot_pct), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 100, VE }, \
+    { "overshoot-pct",   "Datarate overshoot (max) target (%)", OFFSET(rc_overshoot_pct), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1000, VE }, \
+
+#define LEGACY_OPTIONS \
+    {"speed", "", offsetof(VPxContext, cpu_used), AV_OPT_TYPE_INT, {.i64 = 1}, -16, 16, VE}, \
+    {"quality", "", offsetof(VPxContext, deadline), AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"}, \
+    {"vp8flags", "", offsetof(VPxContext, flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, UINT_MAX, VE, "flags"}, \
+    {"error_resilient", "enable error resilience", 0, AV_OPT_TYPE_CONST, {.i64 = VP8F_ERROR_RESILIENT}, INT_MIN, INT_MAX, VE, "flags"}, \
+    {"altref", "enable use of alternate reference frames (VP8/2-pass only)", 0, AV_OPT_TYPE_CONST, {.i64 = VP8F_AUTO_ALT_REF}, INT_MIN, INT_MAX, VE, "flags"}, \
+    {"arnr_max_frames", "altref noise reduction max frame count", offsetof(VPxContext, arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 15, VE}, \
+    {"arnr_strength", "altref noise reduction filter strength", offsetof(VPxContext, arnr_strength), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 6, VE}, \
+    {"arnr_type", "altref noise reduction filter type", offsetof(VPxContext, arnr_type), AV_OPT_TYPE_INT, {.i64 = 3}, 1, 3, VE}, \
+    {"rc_lookahead", "Number of frames to look ahead for alternate reference frame selection", offsetof(VPxContext, lag_in_frames), AV_OPT_TYPE_INT, {.i64 = 25}, 0, 25, VE}, \
+    {"sharpness", "Increase sharpness at the expense of lower PSNR", offsetof(VPxContext, sharpness), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 7, VE},
+
+#if CONFIG_LIBVPX_VP8_ENCODER
+static const AVOption vp8_options[] = {
+    COMMON_OPTIONS
+    { "auto-alt-ref",    "Enable use of alternate reference "
+                         "frames (2-pass only)",                        OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1}, -1,  2, VE},
+    { "cpu-used",        "Quality/Speed ratio modifier",                OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, -16, 16, VE},
+    { "ts-parameters",   "Temporal scaling configuration using a "
+                         ":-separated list of key=value parameters",    OFFSET(vp8_ts_parameters), AV_OPT_TYPE_STRING, {.str=NULL},  0,  0, VE},
+    LEGACY_OPTIONS
+    { NULL }
+};
+#endif
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static const AVOption vp9_options[] = {
+    COMMON_OPTIONS
     { "auto-alt-ref",    "Enable use of alternate reference "
-                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1},      -1,      1,       VE},
-    { "lag-in-frames",   "Number of frames to look ahead for "
-                         "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-maxframes",  "altref noise reduction max frame count", OFFSET(arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-strength",   "altref noise reduction filter strength", OFFSET(arnr_strength),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-type",       "altref noise reduction filter type",     OFFSET(arnr_type),       AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "arnr_type"},
-    { "backward",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "arnr_type" },
-    { "forward",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "arnr_type" },
-    { "centered",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "arnr_type" },
-    { "deadline",        "Time to spend encoding, in microseconds.", OFFSET(deadline),      AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"},
-    { "best",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_BEST_QUALITY}, 0, 0, VE, "quality"},
-    { "good",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_GOOD_QUALITY}, 0, 0, VE, "quality"},
-    { "realtime",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_REALTIME},     0, 0, VE, "quality"},
-    { "error-resilient", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"},
-#ifdef VPX_ERROR_RESILIENT_DEFAULT
-    { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"},
-    { "partitions",      "The frame partitions are independently decodable "
-                         "by the bool decoder, meaning that partitions can be decoded even "
-                         "though earlier partitions have been lost. Note that intra predicition"
-                         " is still done over the partition boundary.",       0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_PARTITIONS}, 0, 0, VE, "er"},
-#endif
-    { "crf",              "Select the quality for constant quality mode", offsetof(VP8Context, crf), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 63, VE },
-    { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
-    { "drop-threshold",   "Frame drop threshold", offsetof(VP8Context, drop_threshold), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, VE },
-    { "noise-sensitivity", "Noise sensitivity", OFFSET(noise_sensitivity), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 4, VE},
+                         "frames (2-pass only)",                        OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1}, -1, 6, VE},
+    { "cpu-used",        "Quality/Speed ratio modifier",                OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1},  -8, 8, VE},
+    { "lossless",        "Lossless mode",                               OFFSET(lossless),        AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE},
+    { "tile-columns",    "Number of tile columns to use, log2",         OFFSET(tile_columns),    AV_OPT_TYPE_INT, {.i64 = -1}, -1, 6, VE},
+    { "tile-rows",       "Number of tile rows to use, log2",            OFFSET(tile_rows),       AV_OPT_TYPE_INT, {.i64 = -1}, -1, 2, VE},
+    { "frame-parallel",  "Enable frame parallel decodability features", OFFSET(frame_parallel),  AV_OPT_TYPE_BOOL,{.i64 = -1}, -1, 1, VE},
+#if VPX_ENCODER_ABI_VERSION >= 12
+    { "aq-mode",         "adaptive quantization mode",                  OFFSET(aq_mode),         AV_OPT_TYPE_INT, {.i64 = -1}, -1, 4, VE, "aq_mode"},
+#else
+    { "aq-mode",         "adaptive quantization mode",                  OFFSET(aq_mode),         AV_OPT_TYPE_INT, {.i64 = -1}, -1, 3, VE, "aq_mode"},
+#endif
+    { "none",            "Aq not used",         0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 0, VE, "aq_mode" },
+    { "variance",        "Variance based Aq",   0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "aq_mode" },
+    { "complexity",      "Complexity based Aq", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "aq_mode" },
+    { "cyclic",          "Cyclic Refresh Aq",   0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "aq_mode" },
+#if VPX_ENCODER_ABI_VERSION >= 12
+    { "equator360",      "360 video Aq",        0, AV_OPT_TYPE_CONST, {.i64 = 4}, 0, 0, VE, "aq_mode" },
+    {"level", "Specify level", OFFSET(level), AV_OPT_TYPE_FLOAT, {.dbl=-1}, -1, 6.2, VE},
+#endif
+#ifdef VPX_CTRL_VP9E_SET_ROW_MT
+    {"row-mt", "Row based multi-threading", OFFSET(row_mt), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, VE},
+#endif
+#ifdef VPX_CTRL_VP9E_SET_TUNE_CONTENT
+#if VPX_ENCODER_ABI_VERSION >= 14
+    { "tune-content",    "Tune content type", OFFSET(tune_content), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 2, VE, "tune_content" },
+#else
+    { "tune-content",    "Tune content type", OFFSET(tune_content), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE, "tune_content" },
+#endif
+    { "default",         "Regular video content",                  0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 0, VE, "tune_content" },
+    { "screen",          "Screen capture content",                 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "tune_content" },
+#if VPX_ENCODER_ABI_VERSION >= 14
+    { "film",            "Film content; improves grain retention", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "tune_content" },
+#endif
+#endif
+#if VPX_ENCODER_ABI_VERSION >= 14
+    { "corpus-complexity", "corpus vbr complexity midpoint", OFFSET(corpus_complexity), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 10000, VE },
+#endif
+#ifdef VPX_CTRL_VP9E_SET_TPL
+    { "enable-tpl",      "Enable temporal dependency model", OFFSET(tpl_model), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, VE },
+#endif
+    LEGACY_OPTIONS
     { NULL }
 };
+#endif
+
+#undef COMMON_OPTIONS
+#undef LEGACY_OPTIONS
 
 static const AVCodecDefault defaults[] = {
     { "qmin",             "-1" },
@@ -626,13 +1276,13 @@ static const AVCodecDefault defaults[] = {
 #if CONFIG_LIBVPX_VP8_ENCODER
 static av_cold int vp8_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp8_cx_algo);
+    return vpx_init(avctx, vpx_codec_vp8_cx());
 }
 
 static const AVClass class_vp8 = {
-    .class_name = "libvpx encoder",
+    .class_name = "libvpx-vp8 encoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = vp8_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
@@ -641,12 +1291,12 @@ AVCodec ff_libvpx_vp8_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("libvpx VP8"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VP8,
-    .priv_data_size = sizeof(VP8Context),
+    .priv_data_size = sizeof(VPxContext),
     .init           = vp8_init,
-    .encode2        = vp8_encode,
-    .close          = vp8_free,
+    .encode2        = vpx_encode,
+    .close          = vpx_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P, AV_PIX_FMT_NONE },
     .priv_class     = &class_vp8,
     .defaults       = defaults,
     .wrapper_name   = "libvpx",
@@ -656,46 +1306,30 @@ AVCodec ff_libvpx_vp8_encoder = {
 #if CONFIG_LIBVPX_VP9_ENCODER
 static av_cold int vp9_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp9_cx_algo);
+    return vpx_init(avctx, vpx_codec_vp9_cx());
 }
 
 static const AVClass class_vp9 = {
-    .class_name = "libvpx encoder",
+    .class_name = "libvpx-vp9 encoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = vp9_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static const AVProfile profiles[] = {
-    { FF_PROFILE_VP9_0, "Profile 0" },
-    { FF_PROFILE_VP9_1, "Profile 1" },
-    { FF_PROFILE_VP9_2, "Profile 2" },
-    { FF_PROFILE_VP9_3, "Profile 3" },
-    { FF_PROFILE_UNKNOWN },
-};
-
 AVCodec ff_libvpx_vp9_encoder = {
     .name           = "libvpx-vp9",
     .long_name      = NULL_IF_CONFIG_SMALL("libvpx VP9"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VP9,
-    .priv_data_size = sizeof(VP8Context),
+    .priv_data_size = sizeof(VPxContext),
     .init           = vp9_init,
-    .encode2        = vp8_encode,
-    .close          = vp8_free,
+    .encode2        = vpx_encode,
+    .close          = vpx_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_YUV420P,
-#if VPX_IMAGE_ABI_VERSION >= 3
-        AV_PIX_FMT_YUV422P,
-        AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV440P,
-#endif
-        AV_PIX_FMT_NONE,
-    },
-    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
     .priv_class     = &class_vp9,
     .defaults       = defaults,
+    .init_static_data = ff_vp9_init_static,
     .wrapper_name   = "libvpx",
 };
 #endif /* CONFIG_LIBVPX_VP9_ENCODER */
diff --git a/libavcodec/libwavpackenc.c b/libavcodec/libwavpackenc.c
index 3f4fc7c..e84b074 100644
--- a/libavcodec/libwavpackenc.c
+++ b/libavcodec/libwavpackenc.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libwebpenc.c b/libavcodec/libwebpenc.c
index 29683d7..48f45b6 100644
--- a/libavcodec/libwebpenc.c
+++ b/libavcodec/libwebpenc.c
@@ -2,213 +2,48 @@
  * WebP encoding support via libwebp
  * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * WebP encoder using libwebp
+ * WebP encoder using libwebp (WebPEncode API)
  */
 
-#include <webp/encode.h>
+#include "libwebpenc_common.h"
 
-#include "libavutil/common.h"
-#include "libavutil/frame.h"
-#include "libavutil/imgutils.h"
-#include "libavutil/opt.h"
-#include "avcodec.h"
-#include "internal.h"
-
-typedef struct LibWebPContext {
-    AVClass *class;         // class for AVOptions
-    float quality;          // lossy quality 0 - 100
-    int lossless;           // use lossless encoding
-    int preset;             // configuration preset
-    int chroma_warning;     // chroma linesize mismatch warning has been printed
-    int conversion_warning; // pixel format conversion warning has been printed
-    WebPConfig config;      // libwebp configuration
-} LibWebPContext;
-
-static int libwebp_error_to_averror(int err)
-{
-    switch (err) {
-    case VP8_ENC_ERROR_OUT_OF_MEMORY:
-    case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
-        return AVERROR(ENOMEM);
-    case VP8_ENC_ERROR_NULL_PARAMETER:
-    case VP8_ENC_ERROR_INVALID_CONFIGURATION:
-    case VP8_ENC_ERROR_BAD_DIMENSION:
-        return AVERROR(EINVAL);
-    }
-    return AVERROR_UNKNOWN;
-}
+typedef LibWebPContextCommon LibWebPContext;
 
 static av_cold int libwebp_encode_init(AVCodecContext *avctx)
 {
-    LibWebPContext *s = avctx->priv_data;
-    int ret;
-
-    if (avctx->global_quality < 0)
-        avctx->global_quality = 75 * FF_QP2LAMBDA;
-    s->quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
-                          0.0f, 100.0f);
-
-    if (avctx->compression_level < 0 || avctx->compression_level > 6) {
-        av_log(avctx, AV_LOG_WARNING, "invalid compression level: %d\n",
-               avctx->compression_level);
-        avctx->compression_level = av_clip(avctx->compression_level, 0, 6);
-    }
-
-    if (s->preset >= WEBP_PRESET_DEFAULT) {
-        ret = WebPConfigPreset(&s->config, s->preset, s->quality);
-        if (!ret)
-            return AVERROR_UNKNOWN;
-        s->lossless              = s->config.lossless;
-        s->quality               = s->config.quality;
-        avctx->compression_level = s->config.method;
-    } else {
-        ret = WebPConfigInit(&s->config);
-        if (!ret)
-            return AVERROR_UNKNOWN;
-
-        s->config.lossless = s->lossless;
-        s->config.quality  = s->quality;
-        s->config.method   = avctx->compression_level;
-
-        ret = WebPValidateConfig(&s->config);
-        if (!ret)
-            return AVERROR(EINVAL);
-    }
-
-    av_log(avctx, AV_LOG_DEBUG, "%s - quality=%.1f method=%d\n",
-           s->lossless ? "Lossless" : "Lossy", s->quality,
-           avctx->compression_level);
-
-    return 0;
+    return ff_libwebp_encode_init_common(avctx);
 }
 
 static int libwebp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                 const AVFrame *frame, int *got_packet)
 {
     LibWebPContext *s  = avctx->priv_data;
-    AVFrame *alt_frame = NULL;
     WebPPicture *pic = NULL;
+    AVFrame *alt_frame = NULL;
     WebPMemoryWriter mw = { 0 };
-    int ret;
 
-    if (avctx->width > WEBP_MAX_DIMENSION || avctx->height > WEBP_MAX_DIMENSION) {
-        av_log(avctx, AV_LOG_ERROR, "Picture size is too large. Max is %dx%d.\n",
-               WEBP_MAX_DIMENSION, WEBP_MAX_DIMENSION);
-        return AVERROR(EINVAL);
-    }
-
-    pic = av_malloc(sizeof(*pic));
-    if (!pic)
-        return AVERROR(ENOMEM);
-
-    ret = WebPPictureInit(pic);
-    if (!ret) {
-        ret = AVERROR_UNKNOWN;
+    int ret = ff_libwebp_get_frame(avctx, s, frame, &alt_frame, &pic);
+    if (ret < 0)
         goto end;
-    }
-    pic->width  = avctx->width;
-    pic->height = avctx->height;
-
-    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
-        if (!s->lossless) {
-            /* libwebp will automatically convert RGB input to YUV when
-               encoding lossy. */
-            if (!s->conversion_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Using libwebp for RGB-to-YUV conversion. You may want "
-                       "to consider passing in YUV instead for lossy "
-                       "encoding.\n");
-                s->conversion_warning = 1;
-            }
-        }
-        pic->use_argb    = 1;
-        pic->argb        = (uint32_t *)frame->data[0];
-        pic->argb_stride = frame->linesize[0] / 4;
-    } else {
-        if (frame->linesize[1] != frame->linesize[2]) {
-            if (!s->chroma_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Copying frame due to differing chroma linesizes.\n");
-                s->chroma_warning = 1;
-            }
-            alt_frame = av_frame_alloc();
-            if (!alt_frame) {
-                ret = AVERROR(ENOMEM);
-                goto end;
-            }
-            alt_frame->width  = frame->width;
-            alt_frame->height = frame->height;
-            alt_frame->format = frame->format;
-            ret = av_frame_get_buffer(alt_frame, 32);
-            if (ret < 0)
-                goto end;
-            av_frame_copy(alt_frame, frame);
-            frame = alt_frame;
-        }
-        pic->use_argb  = 0;
-        pic->y         = frame->data[0];
-        pic->u         = frame->data[1];
-        pic->v         = frame->data[2];
-        pic->y_stride  = frame->linesize[0];
-        pic->uv_stride = frame->linesize[1];
-        if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
-            pic->colorspace = WEBP_YUV420A;
-            pic->a          = frame->data[3];
-            pic->a_stride   = frame->linesize[3];
-        } else {
-            pic->colorspace = WEBP_YUV420;
-        }
-
-        if (s->lossless) {
-            /* We do not have a way to automatically prioritize RGB over YUV
-               in automatic pixel format conversion based on whether we're
-               encoding lossless or lossy, so we do conversion with libwebp as
-               a convenience. */
-            if (!s->conversion_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Using libwebp for YUV-to-RGB conversion. You may want "
-                       "to consider passing in RGB instead for lossless "
-                       "encoding.\n");
-                s->conversion_warning = 1;
-            }
-
-#if (WEBP_ENCODER_ABI_VERSION <= 0x201)
-            /* libwebp should do the conversion automatically, but there is a
-               bug that causes it to return an error instead, so a work-around
-               is required.
-               See https://code.google.com/p/webp/issues/detail?id=178 */
-            pic->memory_ = (void*)1;  /* something non-null */
-            ret = WebPPictureYUVAToARGB(pic);
-            if (!ret) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "WebPPictureYUVAToARGB() failed with error: %d\n",
-                       pic->error_code);
-                ret = libwebp_error_to_averror(pic->error_code);
-                goto end;
-            }
-            pic->memory_ = NULL;  /* restore pointer */
-#endif
-        }
-    }
 
     WebPMemoryWriterInit(&mw);
     pic->custom_ptr = &mw;
@@ -218,11 +53,11 @@ static int libwebp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (!ret) {
         av_log(avctx, AV_LOG_ERROR, "WebPEncode() failed with error: %d\n",
                pic->error_code);
-        ret = libwebp_error_to_averror(pic->error_code);
+        ret = ff_libwebp_error_to_averror(pic->error_code);
         goto end;
     }
 
-    ret = ff_alloc_packet(pkt, mw.size);
+    ret = ff_alloc_packet2(avctx, pkt, mw.size, mw.size);
     if (ret < 0)
         goto end;
     memcpy(pkt->data, mw.mem, mw.size);
@@ -243,20 +78,13 @@ end:
     return ret;
 }
 
-#define OFFSET(x) offsetof(LibWebPContext, x)
-#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "lossless",   "Use lossless mode",       OFFSET(lossless), AV_OPT_TYPE_INT,   { .i64 =  0 },  0, 1,                           VE           },
-    { "preset",     "Configuration preset",    OFFSET(preset),   AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, WEBP_PRESET_TEXT,            VE, "preset" },
-    { "none",       "do not use a preset",                              0, AV_OPT_TYPE_CONST, { .i64 = -1                  }, 0, 0, VE, "preset" },
-    { "default",    "default preset",                                   0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "picture",    "digital picture, like portrait, inner shot",       0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PICTURE }, 0, 0, VE, "preset" },
-    { "photo",      "outdoor photograph, with natural lighting",        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PHOTO   }, 0, 0, VE, "preset" },
-    { "drawing",    "hand or line drawing, with high-contrast details", 0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DRAWING }, 0, 0, VE, "preset" },
-    { "icon",       "small-sized colorful images",                      0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_ICON    }, 0, 0, VE, "preset" },
-    { "text",       "text-like",                                        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_TEXT    }, 0, 0, VE, "preset" },
-    { NULL },
-};
+static int libwebp_encode_close(AVCodecContext *avctx)
+{
+    LibWebPContextCommon *s  = avctx->priv_data;
+    av_frame_free(&s->ref);
+
+    return 0;
+}
 
 static const AVClass class = {
     .class_name = "libwebp",
@@ -265,12 +93,6 @@ static const AVClass class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static const AVCodecDefault libwebp_defaults[] = {
-    { "compression_level",  "4"  },
-    { "global_quality",     "-1" },
-    { NULL },
-};
-
 AVCodec ff_libwebp_encoder = {
     .name           = "libwebp",
     .long_name      = NULL_IF_CONFIG_SMALL("libwebp WebP image"),
@@ -279,6 +101,7 @@ AVCodec ff_libwebp_encoder = {
     .priv_data_size = sizeof(LibWebPContext),
     .init           = libwebp_encode_init,
     .encode2        = libwebp_encode_frame,
+    .close          = libwebp_encode_close,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB32,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P,
diff --git a/libavcodec/libwebpenc_animencoder.c b/libavcodec/libwebpenc_animencoder.c
new file mode 100644
index 0000000..7f35a0b
--- /dev/null
+++ b/libavcodec/libwebpenc_animencoder.c
@@ -0,0 +1,152 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2015 Urvang Joshi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp (WebPAnimEncoder API)
+ */
+
+#include "config.h"
+#include "libwebpenc_common.h"
+
+#include <webp/mux.h>
+
+typedef struct LibWebPAnimContext {
+    LibWebPContextCommon cc;
+    WebPAnimEncoder *enc;     // the main AnimEncoder object
+    int64_t prev_frame_pts;   // pts of the previously encoded frame.
+    int done;                 // If true, we have assembled the bitstream already
+} LibWebPAnimContext;
+
+static av_cold int libwebp_anim_encode_init(AVCodecContext *avctx)
+{
+    int ret = ff_libwebp_encode_init_common(avctx);
+    if (!ret) {
+        LibWebPAnimContext *s = avctx->priv_data;
+        WebPAnimEncoderOptions enc_options = { { 0 } };
+        WebPAnimEncoderOptionsInit(&enc_options);
+        enc_options.verbose = av_log_get_level() >= AV_LOG_VERBOSE;
+        // TODO(urvang): Expose some options on command-line perhaps.
+        s->enc = WebPAnimEncoderNew(avctx->width, avctx->height, &enc_options);
+        if (!s->enc)
+            return AVERROR(EINVAL);
+        s->prev_frame_pts = -1;
+        s->done = 0;
+    }
+    return ret;
+}
+
+static int libwebp_anim_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                     const AVFrame *frame, int *got_packet) {
+    LibWebPAnimContext *s = avctx->priv_data;
+    int ret;
+
+    if (!frame) {
+        if (s->done) {  // Second flush: return empty package to denote finish.
+            *got_packet = 0;
+            return 0;
+        } else {  // First flush: assemble bitstream and return it.
+            WebPData assembled_data = { 0 };
+            ret = WebPAnimEncoderAssemble(s->enc, &assembled_data);
+            if (ret) {
+                ret = ff_alloc_packet2(avctx, pkt, assembled_data.size, assembled_data.size);
+                if (ret < 0)
+                    return ret;
+                memcpy(pkt->data, assembled_data.bytes, assembled_data.size);
+                s->done = 1;
+                pkt->flags |= AV_PKT_FLAG_KEY;
+                pkt->pts = pkt->dts = s->prev_frame_pts + 1;
+                *got_packet = 1;
+                return 0;
+            } else {
+                av_log(s, AV_LOG_ERROR,
+                       "WebPAnimEncoderAssemble() failed with error: %d\n",
+                       VP8_ENC_ERROR_OUT_OF_MEMORY);
+                return AVERROR(ENOMEM);
+            }
+        }
+    } else {
+        int timestamp_ms;
+        WebPPicture *pic = NULL;
+        AVFrame *alt_frame = NULL;
+        ret = ff_libwebp_get_frame(avctx, &s->cc, frame, &alt_frame, &pic);
+        if (ret < 0)
+            goto end;
+
+        timestamp_ms =
+            avctx->time_base.num * frame->pts * 1000 / avctx->time_base.den;
+        ret = WebPAnimEncoderAdd(s->enc, pic, timestamp_ms, &s->cc.config);
+        if (!ret) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Encoding WebP frame failed with error: %d\n",
+                   pic->error_code);
+            ret = ff_libwebp_error_to_averror(pic->error_code);
+            goto end;
+        }
+
+        pkt->pts = pkt->dts = frame->pts;
+        s->prev_frame_pts = frame->pts;  // Save for next frame.
+        ret = 0;
+        *got_packet = 1;
+
+end:
+        WebPPictureFree(pic);
+        av_freep(&pic);
+        av_frame_free(&alt_frame);
+        return ret;
+    }
+}
+
+static int libwebp_anim_encode_close(AVCodecContext *avctx)
+{
+    LibWebPAnimContext *s = avctx->priv_data;
+    av_frame_free(&s->cc.ref);
+    WebPAnimEncoderDelete(s->enc);
+
+    return 0;
+}
+
+static const AVClass class = {
+    .class_name = "libwebp_anim",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libwebp_anim_encoder = {
+    .name           = "libwebp_anim",
+    .long_name      = NULL_IF_CONFIG_SMALL("libwebp WebP image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WEBP,
+    .priv_data_size = sizeof(LibWebPAnimContext),
+    .init           = libwebp_anim_encode_init,
+    .encode2        = libwebp_anim_encode_frame,
+    .close          = libwebp_anim_encode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB32,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &class,
+    .defaults       = libwebp_defaults,
+    .wrapper_name   = "libwebp",
+};
diff --git a/libavcodec/libwebpenc_common.c b/libavcodec/libwebpenc_common.c
new file mode 100644
index 0000000..21d7ada
--- /dev/null
+++ b/libavcodec/libwebpenc_common.c
@@ -0,0 +1,254 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp: common structs and methods.
+ */
+
+#include "libwebpenc_common.h"
+
+int ff_libwebp_error_to_averror(int err)
+{
+    switch (err) {
+    case VP8_ENC_ERROR_OUT_OF_MEMORY:
+    case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
+        return AVERROR(ENOMEM);
+    case VP8_ENC_ERROR_NULL_PARAMETER:
+    case VP8_ENC_ERROR_INVALID_CONFIGURATION:
+    case VP8_ENC_ERROR_BAD_DIMENSION:
+        return AVERROR(EINVAL);
+    }
+    return AVERROR_UNKNOWN;
+}
+
+av_cold int ff_libwebp_encode_init_common(AVCodecContext *avctx)
+{
+    LibWebPContextCommon *s = avctx->priv_data;
+    int ret;
+
+    if (avctx->global_quality >= 0)
+        s->quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
+                              0.0f, 100.0f);
+
+    if (avctx->compression_level < 0 || avctx->compression_level > 6) {
+        av_log(avctx, AV_LOG_WARNING, "invalid compression level: %d\n",
+               avctx->compression_level);
+        avctx->compression_level = av_clip(avctx->compression_level, 0, 6);
+    }
+
+    if (s->preset >= WEBP_PRESET_DEFAULT) {
+        ret = WebPConfigPreset(&s->config, s->preset, s->quality);
+        if (!ret)
+            return AVERROR_UNKNOWN;
+        s->lossless              = s->config.lossless;
+        s->quality               = s->config.quality;
+        avctx->compression_level = s->config.method;
+    } else {
+        ret = WebPConfigInit(&s->config);
+        if (!ret)
+            return AVERROR_UNKNOWN;
+
+        s->config.lossless = s->lossless;
+        s->config.quality  = s->quality;
+        s->config.method   = avctx->compression_level;
+
+        ret = WebPValidateConfig(&s->config);
+        if (!ret)
+            return AVERROR(EINVAL);
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "%s - quality=%.1f method=%d\n",
+           s->lossless ? "Lossless" : "Lossy", s->quality,
+           avctx->compression_level);
+
+    return 0;
+}
+
+int ff_libwebp_get_frame(AVCodecContext *avctx, LibWebPContextCommon *s,
+                         const AVFrame *frame, AVFrame **alt_frame_ptr,
+                         WebPPicture **pic_ptr) {
+    int ret;
+    WebPPicture *pic = NULL;
+    AVFrame *alt_frame = NULL;
+
+    if (avctx->width > WEBP_MAX_DIMENSION || avctx->height > WEBP_MAX_DIMENSION) {
+        av_log(avctx, AV_LOG_ERROR, "Picture size is too large. Max is %dx%d.\n",
+               WEBP_MAX_DIMENSION, WEBP_MAX_DIMENSION);
+        return AVERROR(EINVAL);
+    }
+
+    *pic_ptr = av_malloc(sizeof(*pic));
+    pic = *pic_ptr;
+    if (!pic)
+        return AVERROR(ENOMEM);
+
+    ret = WebPPictureInit(pic);
+    if (!ret) {
+        ret = AVERROR_UNKNOWN;
+        goto end;
+    }
+    pic->width  = avctx->width;
+    pic->height = avctx->height;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+        if (!s->lossless) {
+            /* libwebp will automatically convert RGB input to YUV when
+               encoding lossy. */
+            if (!s->conversion_warning) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Using libwebp for RGB-to-YUV conversion. You may want "
+                       "to consider passing in YUV instead for lossy "
+                       "encoding.\n");
+                s->conversion_warning = 1;
+            }
+        }
+        pic->use_argb    = 1;
+        pic->argb        = (uint32_t *)frame->data[0];
+        pic->argb_stride = frame->linesize[0] / 4;
+    } else {
+        if (frame->linesize[1] != frame->linesize[2] || s->cr_threshold) {
+            if (!s->chroma_warning && !s->cr_threshold) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Copying frame due to differing chroma linesizes.\n");
+                s->chroma_warning = 1;
+            }
+            *alt_frame_ptr = av_frame_alloc();
+            alt_frame = *alt_frame_ptr;
+            if (!alt_frame) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            alt_frame->width  = frame->width;
+            alt_frame->height = frame->height;
+            alt_frame->format = frame->format;
+            if (s->cr_threshold)
+                alt_frame->format = AV_PIX_FMT_YUVA420P;
+            ret = av_frame_get_buffer(alt_frame, 32);
+            if (ret < 0)
+                goto end;
+            alt_frame->format = frame->format;
+            av_frame_copy(alt_frame, frame);
+            frame = alt_frame;
+            if (s->cr_threshold) {
+                int x,y, x2, y2, p;
+                int bs = s->cr_size;
+
+                if (!s->ref) {
+                    s->ref = av_frame_clone(frame);
+                    if (!s->ref) {
+                        ret = AVERROR(ENOMEM);
+                        goto end;
+                    }
+                }
+
+                alt_frame->format = AV_PIX_FMT_YUVA420P;
+                for (y = 0; y < frame->height; y+= bs) {
+                    for (x = 0; x < frame->width; x+= bs) {
+                        int skip;
+                        int sse = 0;
+                        for (p = 0; p < 3; p++) {
+                            int bs2 = bs >> !!p;
+                            int w = AV_CEIL_RSHIFT(frame->width , !!p);
+                            int h = AV_CEIL_RSHIFT(frame->height, !!p);
+                            int xs = x >> !!p;
+                            int ys = y >> !!p;
+                            for (y2 = ys; y2 < FFMIN(ys + bs2, h); y2++) {
+                                for (x2 = xs; x2 < FFMIN(xs + bs2, w); x2++) {
+                                    int diff =  frame->data[p][frame->linesize[p] * y2 + x2]
+                                              -s->ref->data[p][frame->linesize[p] * y2 + x2];
+                                    sse += diff*diff;
+                                }
+                            }
+                        }
+                        skip = sse < s->cr_threshold && frame->data[3] != s->ref->data[3];
+                        if (!skip)
+                            for (p = 0; p < 3; p++) {
+                                int bs2 = bs >> !!p;
+                                int w = AV_CEIL_RSHIFT(frame->width , !!p);
+                                int h = AV_CEIL_RSHIFT(frame->height, !!p);
+                                int xs = x >> !!p;
+                                int ys = y >> !!p;
+                                for (y2 = ys; y2 < FFMIN(ys + bs2, h); y2++) {
+                                    memcpy(&s->ref->data[p][frame->linesize[p] * y2 + xs],
+                                            & frame->data[p][frame->linesize[p] * y2 + xs], FFMIN(bs2, w-xs));
+                                }
+                            }
+                        for (y2 = y; y2 < FFMIN(y+bs, frame->height); y2++) {
+                            memset(&frame->data[3][frame->linesize[3] * y2 + x],
+                                    skip ? 0 : 255,
+                                    FFMIN(bs, frame->width-x));
+                        }
+                    }
+                }
+            }
+        }
+
+        pic->use_argb  = 0;
+        pic->y         = frame->data[0];
+        pic->u         = frame->data[1];
+        pic->v         = frame->data[2];
+        pic->y_stride  = frame->linesize[0];
+        pic->uv_stride = frame->linesize[1];
+        if (frame->format == AV_PIX_FMT_YUVA420P) {
+            pic->colorspace = WEBP_YUV420A;
+            pic->a          = frame->data[3];
+            pic->a_stride   = frame->linesize[3];
+            if (alt_frame)
+                WebPCleanupTransparentArea(pic);
+        } else {
+            pic->colorspace = WEBP_YUV420;
+        }
+
+        if (s->lossless) {
+            /* We do not have a way to automatically prioritize RGB over YUV
+               in automatic pixel format conversion based on whether we're
+               encoding lossless or lossy, so we do conversion with libwebp as
+               a convenience. */
+            if (!s->conversion_warning) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Using libwebp for YUV-to-RGB conversion. You may want "
+                       "to consider passing in RGB instead for lossless "
+                       "encoding.\n");
+                s->conversion_warning = 1;
+            }
+
+#if (WEBP_ENCODER_ABI_VERSION <= 0x201)
+            /* libwebp should do the conversion automatically, but there is a
+               bug that causes it to return an error instead, so a work-around
+               is required.
+               See https://code.google.com/p/webp/issues/detail?id=178 */
+            pic->memory_ = (void*)1;  /* something non-null */
+            ret = WebPPictureYUVAToARGB(pic);
+            if (!ret) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "WebPPictureYUVAToARGB() failed with error: %d\n",
+                       pic->error_code);
+                ret = libwebp_error_to_averror(pic->error_code);
+                goto end;
+            }
+            pic->memory_ = NULL;  /* restore pointer */
+#endif
+        }
+    }
+end:
+    return ret;
+}
diff --git a/libavcodec/libwebpenc_common.h b/libavcodec/libwebpenc_common.h
new file mode 100644
index 0000000..e74e579
--- /dev/null
+++ b/libavcodec/libwebpenc_common.h
@@ -0,0 +1,84 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp: common structs and methods.
+ */
+
+#ifndef AVCODEC_LIBWEBPENC_COMMON_H
+#define AVCODEC_LIBWEBPENC_COMMON_H
+
+#include <webp/encode.h>
+
+#include "libavutil/common.h"
+#include "libavutil/frame.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+
+typedef struct LibWebPContextCommon {
+    AVClass *class;         // class for AVOptions
+    float quality;          // lossy quality 0 - 100
+    int lossless;           // use lossless encoding
+    int preset;             // configuration preset
+    int chroma_warning;     // chroma linesize mismatch warning has been printed
+    int conversion_warning; // pixel format conversion warning has been printed
+    WebPConfig config;      // libwebp configuration
+    AVFrame *ref;
+    int cr_size;
+    int cr_threshold;
+} LibWebPContextCommon;
+
+int ff_libwebp_error_to_averror(int err);
+
+av_cold int ff_libwebp_encode_init_common(AVCodecContext *avctx);
+
+int ff_libwebp_get_frame(AVCodecContext *avctx, LibWebPContextCommon *s,
+                         const AVFrame *frame, AVFrame **alt_frame_ptr,
+                         WebPPicture **pic_ptr);
+
+#define OFFSET(x) offsetof(LibWebPContextCommon, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "lossless",   "Use lossless mode",       OFFSET(lossless), AV_OPT_TYPE_INT,   { .i64 =  0 },  0, 1,                           VE           },
+    { "preset",     "Configuration preset",    OFFSET(preset),   AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, WEBP_PRESET_TEXT,            VE, "preset" },
+    { "none",       "do not use a preset",                              0, AV_OPT_TYPE_CONST, { .i64 = -1                  }, 0, 0, VE, "preset" },
+    { "default",    "default preset",                                   0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DEFAULT }, 0, 0, VE, "preset" },
+    { "picture",    "digital picture, like portrait, inner shot",       0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PICTURE }, 0, 0, VE, "preset" },
+    { "photo",      "outdoor photograph, with natural lighting",        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PHOTO   }, 0, 0, VE, "preset" },
+    { "drawing",    "hand or line drawing, with high-contrast details", 0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DRAWING }, 0, 0, VE, "preset" },
+    { "icon",       "small-sized colorful images",                      0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_ICON    }, 0, 0, VE, "preset" },
+    { "text",       "text-like",                                        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_TEXT    }, 0, 0, VE, "preset" },
+    { "cr_threshold","Conditional replenishment threshold",     OFFSET(cr_threshold), AV_OPT_TYPE_INT, { .i64 =  0  },  0, INT_MAX, VE           },
+    { "cr_size"     ,"Conditional replenishment block size",    OFFSET(cr_size)     , AV_OPT_TYPE_INT, { .i64 =  16 },  0, 256,     VE           },
+    { "quality"     ,"Quality",                OFFSET(quality),  AV_OPT_TYPE_FLOAT, { .dbl =  75 }, 0, 100,                         VE           },
+    { NULL },
+};
+
+static const AVCodecDefault libwebp_defaults[] = {
+    { "compression_level",  "4"  },
+    { "global_quality",     "-1" },
+    { NULL },
+};
+
+#endif /* AVCODEC_LIBWEBPENC_COMMON_H */
diff --git a/libavcodec/libx264.c b/libavcodec/libx264.c
index f30a4cc..a3493f3 100644
--- a/libavcodec/libx264.c
+++ b/libavcodec/libx264.c
@@ -2,28 +2,30 @@
  * H.264 encoding using the x264 library
  * Copyright (C) 2005  Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/eval.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/stereo3d.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
 
@@ -38,6 +40,10 @@
 #include <stdlib.h>
 #include <string.h>
 
+// from x264.h, for quant_offsets, Macroblocks are 16x16
+// blocks of pixels (with respect to the luma plane)
+#define MB_SIZE 16
+
 typedef struct X264Context {
     AVClass        *class;
     x264_param_t    params;
@@ -48,7 +54,10 @@ typedef struct X264Context {
     char *preset;
     char *tune;
     char *profile;
+    char *level;
     int fastfirstpass;
+    char *wpredp;
+    char *x264opts;
     float crf;
     float crf_max;
     int cqp;
@@ -76,9 +85,11 @@ typedef struct X264Context {
     int slice_max_size;
     char *stats;
     int nal_hrd;
+    int avcintra_class;
     int motion_est;
     int forced_idr;
     int coder;
+    int a53_cc;
     int b_frame_strategy;
     int chroma_offset;
     int scenechange_threshold;
@@ -107,7 +118,7 @@ static void X264_log(void *p, int level, const char *fmt, va_list args)
 
 
 static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
-                       x264_nal_t *nals, int nnal)
+                       const x264_nal_t *nals, int nnal)
 {
     X264Context *x4 = ctx->priv_data;
     uint8_t *p;
@@ -119,16 +130,21 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         size += nals[i].i_payload;
 
-    if ((ret = ff_alloc_packet(pkt, size)) < 0)
+    if ((ret = ff_alloc_packet2(ctx, pkt, size, 0)) < 0)
         return ret;
 
     p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
     if (x4->sei_size > 0 && nnal > 0) {
+        if (x4->sei_size > size) {
+            av_log(ctx, AV_LOG_ERROR, "Error: nal buffer is too small\n");
+            return -1;
+        }
         memcpy(p, x4->sei, x4->sei_size);
         p += x4->sei_size;
         x4->sei_size = 0;
+        av_freep(&x4->sei);
     }
 
     for (i = 0; i < nnal; i++){
@@ -139,18 +155,41 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     return 1;
 }
 
+static int avfmt2_num_planes(int avfmt)
+{
+    switch (avfmt) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUVJ420P:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV444P:
+        return 3;
+
+    case AV_PIX_FMT_BGR0:
+    case AV_PIX_FMT_BGR24:
+    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY10:
+        return 1;
+
+    default:
+        return 3;
+    }
+}
+
 static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
 {
     X264Context *x4 = ctx->priv_data;
     AVFrameSideData *side_data;
 
 
-    if (x4->params.b_tff != frame->top_field_first) {
+  if (x4->avcintra_class < 0) {
+    if (x4->params.b_interlaced && x4->params.b_tff != frame->top_field_first) {
+
         x4->params.b_tff = frame->top_field_first;
         x264_encoder_reconfig(x4->enc, &x4->params);
     }
-    if (x4->params.vui.i_sar_height != ctx->sample_aspect_ratio.den ||
-        x4->params.vui.i_sar_width  != ctx->sample_aspect_ratio.num) {
+    if (x4->params.vui.i_sar_height*ctx->sample_aspect_ratio.num != ctx->sample_aspect_ratio.den * x4->params.vui.i_sar_width) {
         x4->params.vui.i_sar_height = ctx->sample_aspect_ratio.den;
         x4->params.vui.i_sar_width  = ctx->sample_aspect_ratio.num;
         x264_encoder_reconfig(x4->enc, &x4->params);
@@ -177,6 +216,7 @@ static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
     }
 
     if (x4->params.rc.i_rc_method == X264_RC_CQP &&
+        x4->cqp >= 0 &&
         x4->params.rc.i_qp_constant != x4->cqp) {
         x4->params.rc.i_qp_constant = x4->cqp;
         x264_encoder_reconfig(x4->enc, &x4->params);
@@ -187,6 +227,7 @@ static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
         x4->params.rc.f_rf_constant_max = x4->crf_max;
         x264_encoder_reconfig(x4->enc, &x4->params);
     }
+  }
 
     side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_STEREO3D);
     if (side_data) {
@@ -242,8 +283,10 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
     X264Context *x4 = ctx->priv_data;
     x264_nal_t *nal;
     int nnal, i, ret;
-    x264_picture_t pic_out;
+    x264_picture_t pic_out = {0};
+    int pict_type;
     int64_t *out_opaque;
+    AVFrameSideData *sd;
 
     x264_picture_init( &x4->pic );
     x4->pic.img.i_csp   = x4->params.i_csp;
@@ -253,10 +296,10 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
     if (x264_bit_depth > 8)
 #endif
         x4->pic.img.i_csp |= X264_CSP_HIGH_DEPTH;
-    x4->pic.img.i_plane = 3;
+    x4->pic.img.i_plane = avfmt2_num_planes(ctx->pix_fmt);
 
     if (frame) {
-        for (i = 0; i < 3; i++) {
+        for (i = 0; i < x4->pic.img.i_plane; i++) {
             x4->pic.img.plane[i]    = frame->data[i];
             x4->pic.img.i_stride[i] = frame->linesize[i];
         }
@@ -270,8 +313,8 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
 
         switch (frame->pict_type) {
         case AV_PICTURE_TYPE_I:
-            x4->pic.i_type = x4->forced_idr ? X264_TYPE_IDR
-                                            : X264_TYPE_KEYFRAME;
+            x4->pic.i_type = x4->forced_idr > 0 ? X264_TYPE_IDR
+                                                : X264_TYPE_KEYFRAME;
             break;
         case AV_PICTURE_TYPE_P:
             x4->pic.i_type = X264_TYPE_P;
@@ -284,10 +327,91 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
             break;
         }
         reconfig_encoder(ctx, frame);
+
+        if (x4->a53_cc) {
+            void *sei_data;
+            size_t sei_size;
+
+            ret = ff_alloc_a53_sei(frame, 0, &sei_data, &sei_size);
+            if (ret < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+            } else if (sei_data) {
+                x4->pic.extra_sei.payloads = av_mallocz(sizeof(x4->pic.extra_sei.payloads[0]));
+                if (x4->pic.extra_sei.payloads == NULL) {
+                    av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+                    av_free(sei_data);
+                } else {
+                    x4->pic.extra_sei.sei_free = av_free;
+
+                    x4->pic.extra_sei.payloads[0].payload_size = sei_size;
+                    x4->pic.extra_sei.payloads[0].payload = sei_data;
+                    x4->pic.extra_sei.num_payloads = 1;
+                    x4->pic.extra_sei.payloads[0].payload_type = 4;
+                }
+            }
+        }
+
+        sd = av_frame_get_side_data(frame, AV_FRAME_DATA_REGIONS_OF_INTEREST);
+        if (sd) {
+            if (x4->params.rc.i_aq_mode == X264_AQ_NONE) {
+                av_log(ctx, AV_LOG_WARNING, "Adaptive quantization must be enabled to use ROI encoding, skipping ROI.\n");
+            } else {
+                if (frame->interlaced_frame == 0) {
+                    int mbx = (frame->width + MB_SIZE - 1) / MB_SIZE;
+                    int mby = (frame->height + MB_SIZE - 1) / MB_SIZE;
+                    int nb_rois;
+                    AVRegionOfInterest* roi;
+                    float* qoffsets;
+                    qoffsets = av_mallocz_array(mbx * mby, sizeof(*qoffsets));
+                    if (!qoffsets)
+                        return AVERROR(ENOMEM);
+
+                    nb_rois = sd->size / sizeof(AVRegionOfInterest);
+                    roi = (AVRegionOfInterest*)sd->data;
+                    for (int count = 0; count < nb_rois; count++) {
+                        int starty = FFMIN(mby, roi->top / MB_SIZE);
+                        int endy   = FFMIN(mby, (roi->bottom + MB_SIZE - 1)/ MB_SIZE);
+                        int startx = FFMIN(mbx, roi->left / MB_SIZE);
+                        int endx   = FFMIN(mbx, (roi->right + MB_SIZE - 1)/ MB_SIZE);
+                        float qoffset;
+
+                        if (roi->qoffset.den == 0) {
+                            av_free(qoffsets);
+                            av_log(ctx, AV_LOG_ERROR, "AVRegionOfInterest.qoffset.den should not be zero.\n");
+                            return AVERROR(EINVAL);
+                        }
+                        qoffset = roi->qoffset.num * 1.0f / roi->qoffset.den;
+                        qoffset = av_clipf(qoffset, -1.0f, 1.0f);
+
+                        // 25 is a number that I think it is a possible proper scale value.
+                        qoffset = qoffset * 25;
+
+                        for (int y = starty; y < endy; y++) {
+                            for (int x = startx; x < endx; x++) {
+                                qoffsets[x + y*mbx] = qoffset;
+                            }
+                        }
+
+                        if (roi->self_size == 0) {
+                            av_free(qoffsets);
+                            av_log(ctx, AV_LOG_ERROR, "AVRegionOfInterest.self_size should be set to sizeof(AVRegionOfInterest).\n");
+                            return AVERROR(EINVAL);
+                        }
+                        roi = (AVRegionOfInterest*)((char*)roi + roi->self_size);
+                    }
+
+                    x4->pic.prop.quant_offsets = qoffsets;
+                    x4->pic.prop.quant_offsets_free = av_free;
+                } else {
+                    av_log(ctx, AV_LOG_WARNING, "interlaced_frame not supported for ROI encoding yet, skipping ROI.\n");
+                }
+            }
+        }
     }
+
     do {
         if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0)
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
 
         ret = encode_nals(ctx, pkt, nal, nnal);
         if (ret < 0)
@@ -306,31 +430,30 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
         ctx->reordered_opaque = 0;
     }
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
     switch (pic_out.i_type) {
     case X264_TYPE_IDR:
     case X264_TYPE_I:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case X264_TYPE_P:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case X264_TYPE_B:
     case X264_TYPE_BREF:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
+    default:
+        pict_type = AV_PICTURE_TYPE_NONE;
     }
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    ctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     pkt->flags |= AV_PKT_FLAG_KEY*pic_out.b_keyframe;
     if (ret) {
-        uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                              sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
+        ff_side_data_set_encoder_stats(pkt, (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -359,6 +482,20 @@ static av_cold int X264_close(AVCodecContext *avctx)
     return 0;
 }
 
+#define OPT_STR(opt, param)                                                   \
+    do {                                                                      \
+        int ret;                                                              \
+        if ((ret = x264_param_parse(&x4->params, opt, param)) < 0) { \
+            if(ret == X264_PARAM_BAD_NAME)                                    \
+                av_log(avctx, AV_LOG_ERROR,                                   \
+                        "bad option '%s': '%s'\n", opt, param);               \
+            else                                                              \
+                av_log(avctx, AV_LOG_ERROR,                                   \
+                        "bad value for '%s': '%s'\n", opt, param);            \
+            return -1;                                                        \
+        }                                                                     \
+    } while (0)
+
 static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
 {
     switch (pix_fmt) {
@@ -373,12 +510,25 @@ static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
     case AV_PIX_FMT_YUVJ444P:
     case AV_PIX_FMT_YUV444P9:
     case AV_PIX_FMT_YUV444P10: return X264_CSP_I444;
+#if CONFIG_LIBX264RGB_ENCODER
+    case AV_PIX_FMT_BGR0:
+        return X264_CSP_BGRA;
+    case AV_PIX_FMT_BGR24:
+        return X264_CSP_BGR;
+
+    case AV_PIX_FMT_RGB24:
+        return X264_CSP_RGB;
+#endif
     case AV_PIX_FMT_NV12:      return X264_CSP_NV12;
     case AV_PIX_FMT_NV16:
     case AV_PIX_FMT_NV20:      return X264_CSP_NV16;
 #ifdef X264_CSP_NV21
     case AV_PIX_FMT_NV21:      return X264_CSP_NV21;
 #endif
+#ifdef X264_CSP_I400
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY10:    return X264_CSP_I400;
+#endif
     };
     return 0;
 }
@@ -393,21 +543,33 @@ static av_cold int X264_init(AVCodecContext *avctx)
 {
     X264Context *x4 = avctx->priv_data;
     AVCPBProperties *cpb_props;
+    int sw,sh;
+
+    if (avctx->global_quality > 0)
+        av_log(avctx, AV_LOG_WARNING, "-qscale is ignored, -crf is recommended.\n");
 
 #if CONFIG_LIBX262_ENCODER
     if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
         x4->params.b_mpeg2 = 1;
         x264_param_default_mpeg2(&x4->params);
     } else
-#else
-    x264_param_default(&x4->params);
 #endif
+    x264_param_default(&x4->params);
 
     x4->params.b_deblocking_filter         = avctx->flags & AV_CODEC_FLAG_LOOP_FILTER;
 
     if (x4->preset || x4->tune)
         if (x264_param_default_preset(&x4->params, x4->preset, x4->tune) < 0) {
+            int i;
             av_log(avctx, AV_LOG_ERROR, "Error setting preset/tune %s/%s.\n", x4->preset, x4->tune);
+            av_log(avctx, AV_LOG_INFO, "Possible presets:");
+            for (i = 0; x264_preset_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_preset_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
+            av_log(avctx, AV_LOG_INFO, "Possible tunes:");
+            for (i = 0; x264_tune_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_tune_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
             return AVERROR(EINVAL);
         }
 
@@ -422,6 +584,8 @@ static av_cold int X264_init(AVCodecContext *avctx)
     x4->params.i_bitdepth           = av_pix_fmt_desc_get(avctx->pix_fmt)->comp[0].depth;
 #endif
 
+    PARSE_X264_OPT("weightp", wpredp);
+
     if (avctx->bit_rate) {
         x4->params.rc.i_bitrate   = avctx->bit_rate / 1000;
         x4->params.rc.i_rc_method = X264_RC_ABR;
@@ -450,9 +614,12 @@ static av_cold int X264_init(AVCodecContext *avctx)
             (float)avctx->rc_initial_buffer_occupancy / avctx->rc_buffer_size;
     }
 
+    PARSE_X264_OPT("level", level);
+
     if (avctx->i_quant_factor > 0)
         x4->params.rc.f_ip_factor         = 1 / fabs(avctx->i_quant_factor);
-    x4->params.rc.f_pb_factor             = avctx->b_quant_factor;
+    if (avctx->b_quant_factor > 0)
+        x4->params.rc.f_pb_factor         = avctx->b_quant_factor;
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -489,6 +656,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
         x4->params.rc.f_qcompress       = avctx->qcompress; /* 0.0 => cbr, 1.0 => constant qp */
     if (avctx->refs >= 0)
         x4->params.i_frame_reference    = avctx->refs;
+    else if (x4->level) {
+        int i;
+        int mbn = AV_CEIL_RSHIFT(avctx->width, 4) * AV_CEIL_RSHIFT(avctx->height, 4);
+        int level_id = -1;
+        char *tail;
+        int scale = X264_BUILD < 129 ? 384 : 1;
+
+        if (!strcmp(x4->level, "1b")) {
+            level_id = 9;
+        } else if (strlen(x4->level) <= 3){
+            level_id = av_strtod(x4->level, &tail) * 10 + 0.5;
+            if (*tail)
+                level_id = -1;
+        }
+        if (level_id <= 0)
+            av_log(avctx, AV_LOG_WARNING, "Failed to parse level\n");
+
+        for (i = 0; i<x264_levels[i].level_idc; i++)
+            if (x264_levels[i].level_idc == level_id)
+                x4->params.i_frame_reference = av_clip(x264_levels[i].dpb / mbn / scale, 1, x4->params.i_frame_reference);
+    }
+
     if (avctx->trellis >= 0)
         x4->params.analyse.i_trellis    = avctx->trellis;
     if (avctx->me_range >= 0)
@@ -547,6 +736,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         x4->params.b_bluray_compat = x4->bluray_compat;
         x4->params.b_vfr_input = 0;
     }
+    if (x4->avcintra_class >= 0)
+#if X264_BUILD >= 142
+        x4->params.i_avcintra_class = x4->avcintra_class;
+#else
+        av_log(avctx, AV_LOG_ERROR,
+               "x264 too old for AVC Intra, at least version 142 needed\n");
+#endif
     if (x4->b_bias != INT_MIN)
         x4->params.i_bframe_bias              = x4->b_bias;
     if (x4->b_pyramid >= 0)
@@ -570,6 +766,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (x4->fastfirstpass)
         x264_param_apply_fastfirstpass(&x4->params);
 
+    /* Allow specifying the x264 profile through AVCodecContext. */
+    if (!x4->profile)
+        switch (avctx->profile) {
+        case FF_PROFILE_H264_BASELINE:
+            x4->profile = av_strdup("baseline");
+            break;
+        case FF_PROFILE_H264_HIGH:
+            x4->profile = av_strdup("high");
+            break;
+        case FF_PROFILE_H264_HIGH_10:
+            x4->profile = av_strdup("high10");
+            break;
+        case FF_PROFILE_H264_HIGH_422:
+            x4->profile = av_strdup("high422");
+            break;
+        case FF_PROFILE_H264_HIGH_444:
+            x4->profile = av_strdup("high444");
+            break;
+        case FF_PROFILE_H264_MAIN:
+            x4->profile = av_strdup("main");
+            break;
+        default:
+            break;
+        }
+
     if (x4->nal_hrd >= 0)
         x4->params.i_nal_hrd = x4->nal_hrd;
 
@@ -584,16 +805,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (x4->profile)
         if (x264_param_apply_profile(&x4->params, x4->profile) < 0) {
+            int i;
             av_log(avctx, AV_LOG_ERROR, "Error setting profile %s.\n", x4->profile);
+            av_log(avctx, AV_LOG_INFO, "Possible profiles:");
+            for (i = 0; x264_profile_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_profile_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
             return AVERROR(EINVAL);
         }
 
     x4->params.i_width          = avctx->width;
     x4->params.i_height         = avctx->height;
-    x4->params.vui.i_sar_width  = avctx->sample_aspect_ratio.num;
-    x4->params.vui.i_sar_height = avctx->sample_aspect_ratio.den;
-    x4->params.i_fps_num = x4->params.i_timebase_den = avctx->time_base.den;
-    x4->params.i_fps_den = x4->params.i_timebase_num = avctx->time_base.num;
+    av_reduce(&sw, &sh, avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den, 4096);
+    x4->params.vui.i_sar_width  = sw;
+    x4->params.vui.i_sar_height = sh;
+    x4->params.i_timebase_den = avctx->time_base.den;
+    x4->params.i_timebase_num = avctx->time_base.num;
+    x4->params.i_fps_num = avctx->time_base.den;
+    x4->params.i_fps_den = avctx->time_base.num * avctx->ticks_per_frame;
 
     x4->params.analyse.b_psnr = avctx->flags & AV_CODEC_FLAG_PSNR;
 
@@ -612,14 +841,29 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                  avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
                                  avctx->color_range == AVCOL_RANGE_JPEG;
 
-    // x264 validates the values internally
-    x4->params.vui.i_colorprim = avctx->color_primaries;
-    x4->params.vui.i_transfer  = avctx->color_trc;
-    x4->params.vui.i_colmatrix = avctx->colorspace;
+    if (avctx->colorspace != AVCOL_SPC_UNSPECIFIED)
+        x4->params.vui.i_colmatrix = avctx->colorspace;
+    if (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED)
+        x4->params.vui.i_colorprim = avctx->color_primaries;
+    if (avctx->color_trc != AVCOL_TRC_UNSPECIFIED)
+        x4->params.vui.i_transfer  = avctx->color_trc;
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)
         x4->params.b_repeat_headers = 0;
 
+    if(x4->x264opts){
+        const char *p= x4->x264opts;
+        while(p){
+            char param[4096]={0}, val[4096]={0};
+            if(sscanf(p, "%4095[^:=]=%4095[^:]", param, val) == 1){
+                OPT_STR(param, "1");
+            }else
+                OPT_STR(param, val);
+            p= strchr(p, ':');
+            p+=!!p;
+        }
+    }
+
     if (x4->x264_params) {
         AVDictionary *dict    = NULL;
         AVDictionaryEntry *en = NULL;
@@ -646,7 +890,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     x4->enc = x264_encoder_open(&x4->params);
     if (!x4->enc)
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
         x264_nal_t *nal;
@@ -693,7 +937,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
-#if X264_BUILD < 153
 static const enum AVPixelFormat pix_fmts_8bit[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_YUVJ420P,
@@ -720,7 +963,6 @@ static const enum AVPixelFormat pix_fmts_10bit[] = {
     AV_PIX_FMT_NV20,
     AV_PIX_FMT_NONE
 };
-#else
 static const enum AVPixelFormat pix_fmts_all[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_YUVJ420P,
@@ -730,11 +972,24 @@ static const enum AVPixelFormat pix_fmts_all[] = {
     AV_PIX_FMT_YUVJ444P,
     AV_PIX_FMT_NV12,
     AV_PIX_FMT_NV16,
+#ifdef X264_CSP_NV21
     AV_PIX_FMT_NV21,
+#endif
     AV_PIX_FMT_YUV420P10,
     AV_PIX_FMT_YUV422P10,
     AV_PIX_FMT_YUV444P10,
     AV_PIX_FMT_NV20,
+#ifdef X264_CSP_I400
+    AV_PIX_FMT_GRAY8,
+    AV_PIX_FMT_GRAY10,
+#endif
+    AV_PIX_FMT_NONE
+};
+#if CONFIG_LIBX264RGB_ENCODER
+static const enum AVPixelFormat pix_fmts_8bit_rgb[] = {
+    AV_PIX_FMT_BGR0,
+    AV_PIX_FMT_BGR24,
+    AV_PIX_FMT_RGB24,
     AV_PIX_FMT_NONE
 };
 #endif
@@ -759,36 +1014,44 @@ static const AVOption options[] = {
     { "preset",        "Set the encoding preset (cf. x264 --fullhelp)",   OFFSET(preset),        AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE},
     { "tune",          "Tune the encoding params (cf. x264 --fullhelp)",  OFFSET(tune),          AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE},
     { "profile",       "Set profile restrictions (cf. x264 --fullhelp) ", OFFSET(profile),       AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE},
-    { "fastfirstpass", "Use fast settings when encoding first pass",      OFFSET(fastfirstpass), AV_OPT_TYPE_INT,    { .i64 = 1 }, 0, 1, VE},
+    { "fastfirstpass", "Use fast settings when encoding first pass",      OFFSET(fastfirstpass), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE},
+    {"level", "Specify level (as defined by Annex A)", OFFSET(level), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"passlogfile", "Filename for 2 pass stats", OFFSET(stats), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"wpredp", "Weighted prediction for P-frames", OFFSET(wpredp), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"a53cc",          "Use A53 Closed Captions (if available)",          OFFSET(a53_cc),        AV_OPT_TYPE_BOOL,   {.i64 = 1}, 0, 1, VE},
+    {"x264opts", "x264 options", OFFSET(x264opts), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
     { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE },
     { "crf_max",       "In CRF mode, prevents VBV from lowering quality beyond this point.",OFFSET(crf_max), AV_OPT_TYPE_FLOAT, {.dbl = -1 }, -1, FLT_MAX, VE },
     { "qp",            "Constant quantization parameter rate control method",OFFSET(cqp),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE },
     { "aq-mode",       "AQ method",                                       OFFSET(aq_mode),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "aq_mode"},
     { "none",          NULL,                              0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_NONE},         INT_MIN, INT_MAX, VE, "aq_mode" },
     { "variance",      "Variance AQ (complexity mask)",   0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_VARIANCE},     INT_MIN, INT_MAX, VE, "aq_mode" },
-    { "autovariance",  "Auto-variance AQ (experimental)", 0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE}, INT_MIN, INT_MAX, VE, "aq_mode" },
+    { "autovariance",  "Auto-variance AQ",                0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE}, INT_MIN, INT_MAX, VE, "aq_mode" },
+#if X264_BUILD >= 144
+    { "autovariance-biased", "Auto-variance AQ with bias to dark scenes", 0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE_BIASED}, INT_MIN, INT_MAX, VE, "aq_mode" },
+#endif
     { "aq-strength",   "AQ strength. Reduces blocking and blurring in flat and textured areas.", OFFSET(aq_strength), AV_OPT_TYPE_FLOAT, {.dbl = -1}, -1, FLT_MAX, VE},
-    { "psy",           "Use psychovisual optimizations.",                 OFFSET(psy),           AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
+    { "psy",           "Use psychovisual optimizations.",                 OFFSET(psy),           AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "psy-rd",        "Strength of psychovisual optimization, in <psy-rd>:<psy-trellis> format.", OFFSET(psy_rd), AV_OPT_TYPE_STRING,  {0 }, 0, 0, VE},
     { "rc-lookahead",  "Number of frames to look ahead for frametype and ratecontrol", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
-    { "weightb",       "Weighted prediction for B-frames.",               OFFSET(weightb),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
+    { "weightb",       "Weighted prediction for B-frames.",               OFFSET(weightb),       AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "weightp",       "Weighted prediction analysis method.",            OFFSET(weightp),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "weightp" },
     { "none",          NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_WEIGHTP_NONE},   INT_MIN, INT_MAX, VE, "weightp" },
     { "simple",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_WEIGHTP_SIMPLE}, INT_MIN, INT_MAX, VE, "weightp" },
     { "smart",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_WEIGHTP_SMART},  INT_MIN, INT_MAX, VE, "weightp" },
-    { "ssim",          "Calculate and print SSIM stats.",                 OFFSET(ssim),          AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
-    { "intra-refresh", "Use Periodic Intra Refresh instead of IDR frames.",OFFSET(intra_refresh),AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
-    { "bluray-compat", "Bluray compatibility workarounds.",               OFFSET(bluray_compat) ,AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
+    { "ssim",          "Calculate and print SSIM stats.",                 OFFSET(ssim),          AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
+    { "intra-refresh", "Use Periodic Intra Refresh instead of IDR frames.",OFFSET(intra_refresh),AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
+    { "bluray-compat", "Bluray compatibility workarounds.",               OFFSET(bluray_compat) ,AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "b-bias",        "Influences how often B-frames are used",          OFFSET(b_bias),        AV_OPT_TYPE_INT,    { .i64 = INT_MIN}, INT_MIN, INT_MAX, VE },
     { "b-pyramid",     "Keep some B-frames as references.",               OFFSET(b_pyramid),     AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "b_pyramid" },
     { "none",          NULL,                                  0, AV_OPT_TYPE_CONST, {.i64 = X264_B_PYRAMID_NONE},   INT_MIN, INT_MAX, VE, "b_pyramid" },
     { "strict",        "Strictly hierarchical pyramid",       0, AV_OPT_TYPE_CONST, {.i64 = X264_B_PYRAMID_STRICT}, INT_MIN, INT_MAX, VE, "b_pyramid" },
     { "normal",        "Non-strict (not Blu-ray compatible)", 0, AV_OPT_TYPE_CONST, {.i64 = X264_B_PYRAMID_NORMAL}, INT_MIN, INT_MAX, VE, "b_pyramid" },
-    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_INT, { .i64 = -1}, -1, 1, VE },
-    { "8x8dct",        "High profile 8x8 transform.",                     OFFSET(dct8x8),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
-    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
-    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
-    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
+    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_BOOL, { .i64 = -1}, -1, 1, VE },
+    { "8x8dct",        "High profile 8x8 transform.",                     OFFSET(dct8x8),        AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
+    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
+    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
+    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
     { "deblock",       "Loop filter parameters, in <alpha:beta> form.",   OFFSET(deblock),       AV_OPT_TYPE_STRING, { 0 },  0, 0, VE},
     { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE},
     { "partitions",    "A comma-separated list of partitions to consider. "
@@ -805,17 +1068,21 @@ static const AVOption options[] = {
     { "none",          NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_NONE}, INT_MIN, INT_MAX, VE, "nal-hrd" },
     { "vbr",           NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_VBR},  INT_MIN, INT_MAX, VE, "nal-hrd" },
     { "cbr",           NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_CBR},  INT_MIN, INT_MAX, VE, "nal-hrd" },
+    { "avcintra-class","AVC-Intra class 50/100/200",                      OFFSET(avcintra_class),AV_OPT_TYPE_INT,     { .i64 = -1 }, -1, 200   , VE},
+    { "me_method",    "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, X264_ME_TESA, VE, "motion-est"},
     { "motion-est",   "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, X264_ME_TESA, VE, "motion-est"},
     { "dia",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_DIA },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "hex",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_HEX },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "umh",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_UMH },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "esa",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_ESA },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "tesa",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_TESA }, INT_MIN, INT_MAX, VE, "motion-est" },
-    { "forced-idr",   "If forwarding iframes, require them to be IDR frames.", OFFSET(forced_idr),  AV_OPT_TYPE_INT,    { .i64 = 0 }, 0, 1, VE },
+    { "forced-idr",   "If forcing keyframes, force them as IDR frames.",                                  OFFSET(forced_idr),  AV_OPT_TYPE_BOOL,   { .i64 = 0 }, -1, 1, VE },
     { "coder",    "Coder type",                                           OFFSET(coder), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE, "coder" },
     { "default",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 }, INT_MIN, INT_MAX, VE, "coder" },
     { "cavlc",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 },  INT_MIN, INT_MAX, VE, "coder" },
     { "cabac",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 },  INT_MIN, INT_MAX, VE, "coder" },
+    { "vlc",              NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 },  INT_MIN, INT_MAX, VE, "coder" },
+    { "ac",               NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 },  INT_MIN, INT_MAX, VE, "coder" },
     { "b_strategy",   "Strategy to choose between I/P/B-frames",          OFFSET(b_frame_strategy), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 2, VE },
     { "chromaoffset", "QP difference between chroma and luma",            OFFSET(chroma_offset), AV_OPT_TYPE_INT, { .i64 = -1 }, INT_MIN, INT_MAX, VE },
     { "sc_threshold", "Scene change threshold",                           OFFSET(scenechange_threshold), AV_OPT_TYPE_INT, { .i64 = -1 }, INT_MIN, INT_MAX, VE },
@@ -828,13 +1095,16 @@ static const AVOption options[] = {
 static const AVCodecDefault x264_defaults[] = {
     { "b",                "0" },
     { "bf",               "-1" },
+    { "flags2",           "0" },
     { "g",                "-1" },
     { "i_qfactor",        "-1" },
+    { "b_qfactor",        "-1" },
     { "qmin",             "-1" },
     { "qmax",             "-1" },
     { "qdiff",            "-1" },
     { "qblur",            "-1" },
     { "qcomp",            "-1" },
+//     { "rc_lookahead",     "-1" },
     { "refs",             "-1" },
 #if FF_API_PRIVATE_OPT
     { "sc_threshold",     "-1" },
@@ -861,7 +1131,7 @@ static const AVCodecDefault x264_defaults[] = {
 };
 
 #if CONFIG_LIBX264_ENCODER
-static const AVClass class = {
+static const AVClass x264_class = {
     .class_name = "libx264",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -879,15 +1149,40 @@ AVCodec ff_libx264_encoder = {
     .close            = X264_close,
     .capabilities     = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS |
                         AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
-    .priv_class       = &class,
+    .priv_class       = &x264_class,
     .defaults         = x264_defaults,
     .init_static_data = X264_init_static,
-    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
-                        FF_CODEC_CAP_INIT_CLEANUP,
+    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
     .wrapper_name     = "libx264",
 };
 #endif
 
+#if CONFIG_LIBX264RGB_ENCODER
+static const AVClass rgbclass = {
+    .class_name = "libx264rgb",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libx264rgb_encoder = {
+    .name           = "libx264rgb",
+    .long_name      = NULL_IF_CONFIG_SMALL("libx264 H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 RGB"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(X264Context),
+    .init           = X264_init,
+    .encode2        = X264_frame,
+    .close          = X264_close,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS |
+                      AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
+    .priv_class     = &rgbclass,
+    .defaults       = x264_defaults,
+    .pix_fmts       = pix_fmts_8bit_rgb,
+    .wrapper_name   = "libx264",
+};
+#endif
+
 #if CONFIG_LIBX262_ENCODER
 static const AVClass X262_class = {
     .class_name = "libx262",
@@ -910,8 +1205,7 @@ AVCodec ff_libx262_encoder = {
     .priv_class       = &X262_class,
     .defaults         = x264_defaults,
     .pix_fmts         = pix_fmts_8bit,
-    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
-                        FF_CODEC_CAP_INIT_CLEANUP,
+    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
     .wrapper_name     = "libx264",
 };
 #endif
diff --git a/libavcodec/libx265.c b/libavcodec/libx265.c
index 8f1d60b..fe39f45 100644
--- a/libavcodec/libx265.c
+++ b/libavcodec/libx265.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013-2014 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,6 +45,7 @@ typedef struct libx265Context {
     int   forced_idr;
     char *preset;
     char *tune;
+    char *profile;
     char *x265_opts;
 } libx265Context;
 
@@ -78,19 +79,12 @@ static av_cold int libx265_encode_close(AVCodecContext *avctx)
 static av_cold int libx265_encode_init(AVCodecContext *avctx)
 {
     libx265Context *ctx = avctx->priv_data;
+    AVCPBProperties *cpb_props = NULL;
 
     ctx->api = x265_api_get(av_pix_fmt_desc_get(avctx->pix_fmt)->comp[0].depth);
     if (!ctx->api)
         ctx->api = x265_api_get(0);
 
-    if (avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL &&
-        !av_pix_fmt_desc_get(avctx->pix_fmt)->log2_chroma_w) {
-        av_log(avctx, AV_LOG_ERROR,
-               "4:2:2 and 4:4:4 support is not fully defined for HEVC yet. "
-               "Set -strict experimental to encode anyway.\n");
-        return AVERROR(ENOSYS);
-    }
-
     ctx->params = ctx->api->param_alloc();
     if (!ctx->params) {
         av_log(avctx, AV_LOG_ERROR, "Could not allocate x265 param structure.\n");
@@ -121,6 +115,7 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
     ctx->params->sourceWidth     = avctx->width;
     ctx->params->sourceHeight    = avctx->height;
     ctx->params->bEnablePsnr     = !!(avctx->flags & AV_CODEC_FLAG_PSNR);
+    ctx->params->bOpenGOP        = !(avctx->flags & AV_CODEC_FLAG_CLOSED_GOP);
 
     /* Tune the CTU size based on input resolution. */
     if (ctx->params->sourceWidth < 64 || ctx->params->sourceHeight < 64)
@@ -133,11 +128,11 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    if ((avctx->color_primaries <= AVCOL_PRI_BT2020 &&
+    if ((avctx->color_primaries <= AVCOL_PRI_SMPTE432 &&
          avctx->color_primaries != AVCOL_PRI_UNSPECIFIED) ||
-        (avctx->color_trc <= AVCOL_TRC_BT2020_12 &&
+        (avctx->color_trc <= AVCOL_TRC_ARIB_STD_B67 &&
          avctx->color_trc != AVCOL_TRC_UNSPECIFIED) ||
-        (avctx->colorspace <= AVCOL_SPC_BT2020_CL &&
+        (avctx->colorspace <= AVCOL_SPC_ICTCP &&
          avctx->colorspace != AVCOL_SPC_UNSPECIFIED)) {
 
         ctx->params->vui.bEnableVideoSignalTypePresentFlag  = 1;
@@ -166,16 +161,36 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
         ctx->params->internalCsp = X265_CSP_I420;
         break;
     case AV_PIX_FMT_YUV422P:
     case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
         ctx->params->internalCsp = X265_CSP_I422;
         break;
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+        ctx->params->vui.matrixCoeffs = AVCOL_SPC_RGB;
+        ctx->params->vui.bEnableVideoSignalTypePresentFlag  = 1;
+        ctx->params->vui.bEnableColorDescriptionPresentFlag = 1;
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
         ctx->params->internalCsp = X265_CSP_I444;
         break;
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY10:
+    case AV_PIX_FMT_GRAY12:
+        if (ctx->api->api_build_number < 85) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "libx265 version is %d, must be at least 85 for gray encoding.\n",
+                   ctx->api->api_build_number);
+            return AVERROR_INVALIDDATA;
+        }
+        ctx->params->internalCsp = X265_CSP_I400;
+        break;
     }
 
     if (ctx->crf >= 0) {
@@ -191,6 +206,16 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
         ctx->params->rc.rateControlMode = X265_RC_ABR;
     }
 
+    ctx->params->rc.vbvBufferSize = avctx->rc_buffer_size / 1000;
+    ctx->params->rc.vbvMaxBitrate = avctx->rc_max_rate    / 1000;
+
+    cpb_props = ff_add_cpb_side_data(avctx);
+    if (!cpb_props)
+        return AVERROR(ENOMEM);
+    cpb_props->buffer_size = ctx->params->rc.vbvBufferSize * 1000;
+    cpb_props->max_bitrate = ctx->params->rc.vbvMaxBitrate * 1000;
+    cpb_props->avg_bitrate = ctx->params->rc.bitrate       * 1000;
+
     if (!(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER))
         ctx->params->bRepeatHeaders = 1;
 
@@ -219,6 +244,23 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
         }
     }
 
+    if (ctx->params->rc.vbvBufferSize && avctx->rc_initial_buffer_occupancy > 1000 &&
+        ctx->params->rc.vbvBufferInit == 0.9) {
+        ctx->params->rc.vbvBufferInit = (float)avctx->rc_initial_buffer_occupancy / 1000;
+    }
+
+    if (ctx->profile) {
+        if (ctx->api->param_apply_profile(ctx->params, ctx->profile) < 0) {
+            int i;
+            av_log(avctx, AV_LOG_ERROR, "Invalid or incompatible profile set: %s.\n", ctx->profile);
+            av_log(avctx, AV_LOG_INFO, "Possible profiles:");
+            for (i = 0; x265_profile_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x265_profile_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
+            return AVERROR(EINVAL);
+        }
+    }
+
     ctx->encoder = ctx->api->encoder_open(ctx->params);
     if (!ctx->encoder) {
         av_log(avctx, AV_LOG_ERROR, "Cannot open libx265 encoder.\n");
@@ -251,6 +293,65 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static av_cold int libx265_encode_set_roi(libx265Context *ctx, const AVFrame *frame, x265_picture* pic)
+{
+    AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_REGIONS_OF_INTEREST);
+    if (sd) {
+        if (ctx->params->rc.aqMode == X265_AQ_NONE) {
+            av_log(ctx, AV_LOG_WARNING, "Adaptive quantization must be enabled to use ROI encoding, skipping ROI.\n");
+        } else {
+            /* 8x8 block when qg-size is 8, 16*16 block otherwise. */
+            int mb_size = (ctx->params->rc.qgSize == 8) ? 8 : 16;
+            int mbx = (frame->width + mb_size - 1) / mb_size;
+            int mby = (frame->height + mb_size - 1) / mb_size;
+            int nb_rois;
+            AVRegionOfInterest *roi;
+            float *qoffsets;         /* will be freed after encode is called. */
+            qoffsets = av_mallocz_array(mbx * mby, sizeof(*qoffsets));
+            if (!qoffsets)
+                return AVERROR(ENOMEM);
+
+            nb_rois = sd->size / sizeof(AVRegionOfInterest);
+            roi = (AVRegionOfInterest*)sd->data;
+            for (int count = 0; count < nb_rois; count++) {
+                int starty = FFMIN(mby, roi->top / mb_size);
+                int endy   = FFMIN(mby, (roi->bottom + mb_size - 1)/ mb_size);
+                int startx = FFMIN(mbx, roi->left / mb_size);
+                int endx   = FFMIN(mbx, (roi->right + mb_size - 1)/ mb_size);
+                float qoffset;
+
+                if (roi->self_size == 0) {
+                    av_free(qoffsets);
+                    av_log(ctx, AV_LOG_ERROR, "AVRegionOfInterest.self_size must be set to sizeof(AVRegionOfInterest).\n");
+                    return AVERROR(EINVAL);
+                }
+
+                if (roi->qoffset.den == 0) {
+                    av_free(qoffsets);
+                    av_log(ctx, AV_LOG_ERROR, "AVRegionOfInterest.qoffset.den must not be zero.\n");
+                    return AVERROR(EINVAL);
+                }
+                qoffset = roi->qoffset.num * 1.0f / roi->qoffset.den;
+                qoffset = av_clipf(qoffset, -1.0f, 1.0f);
+
+                /* qp range of x265 is from 0 to 51, just choose 25 as the scale value,
+                 * so the range of final qoffset is [-25.0, 25.0].
+                 */
+                qoffset = qoffset * 25;
+
+                for (int y = starty; y < endy; y++)
+                    for (int x = startx; x < endx; x++)
+                        qoffsets[x + y*mbx] = qoffset;
+
+                roi = (AVRegionOfInterest*)((char*)roi + roi->self_size);
+            }
+
+            pic->quantOffsets = qoffsets;
+        }
+    }
+    return 0;
+}
+
 static int libx265_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                 const AVFrame *pic, int *got_packet)
 {
@@ -280,12 +381,19 @@ static int libx265_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             pic->pict_type == AV_PICTURE_TYPE_P ? X265_TYPE_P :
                             pic->pict_type == AV_PICTURE_TYPE_B ? X265_TYPE_B :
                             X265_TYPE_AUTO;
+
+        ret = libx265_encode_set_roi(ctx, pic, &x265pic);
+        if (ret < 0)
+            return ret;
     }
 
     ret = ctx->api->encoder_encode(ctx->encoder, &nal, &nnal,
                                    pic ? &x265pic : NULL, &x265pic_out);
+
+    av_freep(&x265pic.quantOffsets);
+
     if (ret < 0)
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
 
     if (!nnal)
         return 0;
@@ -293,7 +401,7 @@ static int libx265_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         payload += nal[i].sizeBytes;
 
-    ret = ff_alloc_packet(pkt, payload);
+    ret = ff_alloc_packet2(avctx, pkt, payload, payload);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
         return ret;
@@ -328,6 +436,13 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+#if X265_BUILD >= 130
+    if (x265pic_out.sliceType == X265_TYPE_B)
+#else
+    if (x265pic_out.frameData.sliceType == 'b')
+#endif
+        pkt->flags |= AV_PKT_FLAG_DISPOSABLE;
+
     *got_packet = 1;
     return 0;
 }
@@ -336,6 +451,22 @@ static const enum AVPixelFormat x265_csp_eight[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_YUV422P,
     AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_GRAY8,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat x265_csp_ten[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_YUV420P10,
+    AV_PIX_FMT_YUV422P10,
+    AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_GBRP10,
+    AV_PIX_FMT_GRAY8,
+    AV_PIX_FMT_GRAY10,
     AV_PIX_FMT_NONE
 };
 
@@ -343,27 +474,39 @@ static const enum AVPixelFormat x265_csp_twelve[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_YUV422P,
     AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
     AV_PIX_FMT_YUV420P10,
     AV_PIX_FMT_YUV422P10,
     AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_GBRP10,
+    AV_PIX_FMT_YUV420P12,
+    AV_PIX_FMT_YUV422P12,
+    AV_PIX_FMT_YUV444P12,
+    AV_PIX_FMT_GBRP12,
+    AV_PIX_FMT_GRAY8,
+    AV_PIX_FMT_GRAY10,
+    AV_PIX_FMT_GRAY12,
     AV_PIX_FMT_NONE
 };
 
 static av_cold void libx265_encode_init_csp(AVCodec *codec)
 {
-    if (x265_max_bit_depth == 8)
-        codec->pix_fmts = x265_csp_eight;
-    else if (x265_max_bit_depth == 12)
+    if (x265_api_get(12))
         codec->pix_fmts = x265_csp_twelve;
+    else if (x265_api_get(10))
+        codec->pix_fmts = x265_csp_ten;
+    else if (x265_api_get(8))
+        codec->pix_fmts = x265_csp_eight;
 }
 
 #define OFFSET(x) offsetof(libx265Context, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
     { "crf",         "set the x265 crf",                                                            OFFSET(crf),       AV_OPT_TYPE_FLOAT,  { .dbl = -1 }, -1, FLT_MAX, VE },
-    { "forced-idr",  "if forcing keyframes, force them as IDR frames",                              OFFSET(forced_idr),AV_OPT_TYPE_INT,    { .i64 =  0 },  0,       1, VE },
+    { "forced-idr",  "if forcing keyframes, force them as IDR frames",                              OFFSET(forced_idr),AV_OPT_TYPE_BOOL,   { .i64 =  0 },  0,       1, VE },
     { "preset",      "set the x265 preset",                                                         OFFSET(preset),    AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { "tune",        "set the x265 tune parameter",                                                 OFFSET(tune),      AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "profile",     "set the x265 profile",                                                        OFFSET(profile),   AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { "x265-params", "set the x265 configuration using a :-separated list of key=value parameters", OFFSET(x265_opts), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { NULL }
 };
diff --git a/libavcodec/libxavs.c b/libavcodec/libxavs.c
index 1a80b1a..801a05d 100644
--- a/libavcodec/libxavs.c
+++ b/libavcodec/libxavs.c
@@ -2,20 +2,20 @@
  * AVS encoding using the xavs library
  * Copyright (C) 2010 Amanda, Y.N. Wu <amanda11192003@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -93,10 +93,8 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         size += nals[i].i_payload;
 
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", size);
+    if ((ret = ff_alloc_packet2(ctx, pkt, size, 0)) < 0)
         return ret;
-    }
     p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
@@ -124,7 +122,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
     xavs_nal_t *nal;
     int nnal, i, ret;
     xavs_picture_t pic_out;
-    uint8_t *sd;
+    int pict_type;
 
     x4->pic.img.i_csp   = XAVS_CSP_I420;
     x4->pic.img.i_plane = 3;
@@ -151,7 +149,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     if (!ret) {
         if (!frame && !(x4->end_of_stream)) {
-            if ((ret = ff_alloc_packet(pkt, 4)) < 0)
+            if ((ret = ff_alloc_packet2(avctx, pkt, 4, 0)) < 0)
                 return ret;
 
             pkt->data[0] = 0x0;
@@ -159,7 +157,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
             pkt->data[2] = 0x01;
             pkt->data[3] = 0xb1;
             pkt->dts = 2*x4->pts_buffer[(x4->out_frame_count-1)%(avctx->max_b_frames+1)] -
-                       x4->pts_buffer[(x4->out_frame_count-2)%(avctx->max_b_frames+1)];
+                         x4->pts_buffer[(x4->out_frame_count-2)%(avctx->max_b_frames+1)];
             x4->end_of_stream = END_OF_STREAM;
             *got_packet = 1;
         }
@@ -180,21 +178,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
     } else
         pkt->dts = pkt->pts;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
     switch (pic_out.i_type) {
     case XAVS_TYPE_IDR:
     case XAVS_TYPE_I:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case XAVS_TYPE_P:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case XAVS_TYPE_B:
     case XAVS_TYPE_BREF:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
+    default:
+        pict_type = AV_PICTURE_TYPE_NONE;
     }
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
@@ -215,10 +216,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
+    ff_side_data_set_encoder_stats(pkt, (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
 
     x4->out_frame_count++;
     *got_packet = ret;
@@ -230,7 +228,7 @@ static av_cold int XAVS_close(AVCodecContext *avctx)
     XavsContext *x4 = avctx->priv_data;
 
     av_freep(&avctx->extradata);
-    av_free(x4->sei);
+    av_freep(&x4->sei);
     av_freep(&x4->pts_buffer);
 
     if (x4->enc)
@@ -393,12 +391,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (!x4->enc)
         return -1;
 
-    if (!(x4->pts_buffer = av_mallocz((avctx->max_b_frames+1) * sizeof(*x4->pts_buffer))))
+    if (!(x4->pts_buffer = av_mallocz_array((avctx->max_b_frames+1), sizeof(*x4->pts_buffer))))
         return AVERROR(ENOMEM);
 
     /* TAG: Do we have GLOBAL HEADER in AVS */
     /* We Have PPS and SPS in AVS */
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER && 0) {
         xavs_nal_t *nal;
         int nnal, s, i, size;
         uint8_t *p;
@@ -428,20 +426,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define OFFSET(x) offsetof(XavsContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {-1 }, -1, FLT_MAX, VE },
+    { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE },
     { "qp",            "Constant quantization parameter rate control method",OFFSET(cqp),        AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, INT_MAX, VE },
     { "b-bias",        "Influences how often B-frames are used",          OFFSET(b_bias),        AV_OPT_TYPE_INT,    {.i64 = INT_MIN}, INT_MIN, INT_MAX, VE },
-    { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {-1 }, -1, FLT_MAX, VE},
+    { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE},
     { "direct-pred",   "Direct MV prediction mode",                       OFFSET(direct_pred),   AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, INT_MAX, VE, "direct-pred" },
     { "none",          NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_NONE },     0, 0, VE, "direct-pred" },
     { "spatial",       NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_SPATIAL },  0, 0, VE, "direct-pred" },
     { "temporal",      NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_TEMPORAL }, 0, 0, VE, "direct-pred" },
     { "auto",          NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_AUTO },     0, 0, VE, "direct-pred" },
-    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, 1, VE},
-    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, 1, VE},
-    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE },
-    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, 1, VE},
-    { "motion-est",   "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, XAVS_ME_TESA, VE, "motion-est"},
+    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_BOOL,    {.i64 = -1 }, -1, 1, VE},
+    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_BOOL,    {.i64 = -1 }, -1, 1, VE},
+    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, VE },
+    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_BOOL,    {.i64 = -1 }, -1, 1, VE},
+    { "motion-est",   "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = XAVS_ME_DIA }, -1, XAVS_ME_TESA, VE, "motion-est"},
     { "dia",           NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_ME_DIA },               INT_MIN, INT_MAX, VE, "motion-est" },
     { "hex",           NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_ME_HEX },               INT_MIN, INT_MAX, VE, "motion-est" },
     { "umh",           NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_ME_UMH },               INT_MIN, INT_MAX, VE, "motion-est" },
@@ -455,7 +453,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass xavs_class = {
     .class_name = "libxavs",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -478,7 +476,7 @@ AVCodec ff_libxavs_encoder = {
     .close          = XAVS_close,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &xavs_class,
     .defaults       = xavs_defaults,
     .wrapper_name   = "libxavs",
 };
diff --git a/libavcodec/libxavs2.c b/libavcodec/libxavs2.c
new file mode 100644
index 0000000..d5c4557
--- /dev/null
+++ b/libavcodec/libxavs2.c
@@ -0,0 +1,293 @@
+/*
+ * AVS2 encoding using the xavs2 library
+ *
+ * Copyright (C) 2018 Yiqun Xu,   <yiqun.xu@vipl.ict.ac.cn>
+ *                    Falei Luo,  <falei.luo@gmail.com>
+ *                    Huiwen Ren, <hwrenx@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xavs2.h"
+#include "mpeg12.h"
+#include "libavutil/avstring.h"
+
+#define xavs2_opt_set2(name, format, ...) do{ \
+    char opt_str[16] = {0}; \
+    int err; \
+    av_strlcatf(opt_str, sizeof(opt_str), format, __VA_ARGS__); \
+    err = cae->api->opt_set2(cae->param, name, opt_str); \
+    if (err) {\
+        av_log(avctx, AV_LOG_WARNING, "Invalid value for %s: %s\n", name, opt_str);\
+    }\
+} while(0);
+
+typedef struct XAVS2EContext {
+    AVClass *class;
+
+    int lcu_row_threads;
+    int initial_qp;
+    int qp;
+    int max_qp;
+    int min_qp;
+    int preset_level;
+    int log_level;
+
+    void *encoder;
+    char *xavs2_opts;
+
+    xavs2_outpacket_t packet;
+    xavs2_param_t *param;
+
+    const xavs2_api_t *api;
+
+} XAVS2EContext;
+
+static av_cold int xavs2_init(AVCodecContext *avctx)
+{
+    XAVS2EContext *cae= avctx->priv_data;
+    int bit_depth, code;
+
+    bit_depth = avctx->pix_fmt == AV_PIX_FMT_YUV420P ? 8 : 10;
+
+    /* get API handler */
+    cae->api = xavs2_api_get(bit_depth);
+    if (!cae->api) {
+        av_log(avctx, AV_LOG_ERROR, "api get failed\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    cae->param = cae->api->opt_alloc();
+    if (!cae->param) {
+        av_log(avctx, AV_LOG_ERROR, "param alloc failed\n");
+        return AVERROR(ENOMEM);
+    }
+
+    xavs2_opt_set2("Width",     "%d", avctx->width);
+    xavs2_opt_set2("Height",    "%d", avctx->height);
+    xavs2_opt_set2("BFrames",   "%d", avctx->max_b_frames);
+    xavs2_opt_set2("BitDepth",  "%d", bit_depth);
+    xavs2_opt_set2("Log",       "%d", cae->log_level);
+    xavs2_opt_set2("Preset",    "%d", cae->preset_level);
+
+    xavs2_opt_set2("IntraPeriodMax",    "%d", avctx->gop_size);
+    xavs2_opt_set2("IntraPeriodMin",    "%d", avctx->gop_size);
+
+    xavs2_opt_set2("ThreadFrames",      "%d", avctx->thread_count);
+    xavs2_opt_set2("ThreadRows",        "%d", cae->lcu_row_threads);
+
+    xavs2_opt_set2("OpenGOP",  "%d", !(avctx->flags & AV_CODEC_FLAG_CLOSED_GOP));
+
+    if (cae->xavs2_opts) {
+        AVDictionary *dict    = NULL;
+        AVDictionaryEntry *en = NULL;
+
+        if (!av_dict_parse_string(&dict, cae->xavs2_opts, "=", ":", 0)) {
+            while ((en = av_dict_get(dict, "", en, AV_DICT_IGNORE_SUFFIX))) {
+                xavs2_opt_set2(en->key, "%s", en->value);
+            }
+            av_dict_free(&dict);
+        }
+    }
+
+    /* Rate control */
+    if (avctx->bit_rate > 0) {
+        xavs2_opt_set2("RateControl",   "%d", 1);
+        xavs2_opt_set2("TargetBitRate", "%"PRId64"", avctx->bit_rate);
+        xavs2_opt_set2("InitialQP",     "%d", cae->initial_qp);
+        xavs2_opt_set2("MaxQP",         "%d", avctx->qmax >= 0 ? avctx->qmax : cae->max_qp);
+        xavs2_opt_set2("MinQP",         "%d", avctx->qmin >= 0 ? avctx->qmin : cae->min_qp);
+    } else {
+        xavs2_opt_set2("InitialQP",     "%d", cae->qp);
+    }
+
+
+    ff_mpeg12_find_best_frame_rate(avctx->framerate, &code, NULL, NULL, 0);
+
+    xavs2_opt_set2("FrameRate",   "%d", code);
+
+    cae->encoder = cae->api->encoder_create(cae->param);
+
+    if (!cae->encoder) {
+        av_log(avctx,AV_LOG_ERROR, "Can not create encoder. Null pointer returned\n");
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static void xavs2_copy_frame_with_shift(xavs2_picture_t *pic, const AVFrame *frame, const int shift_in)
+{
+    int j, k;
+    for (k = 0; k < 3; k++) {
+        int i_stride = pic->img.i_stride[k];
+        for (j = 0; j < pic->img.i_lines[k]; j++) {
+            uint16_t *p_plane = (uint16_t *)&pic->img.img_planes[k][j * i_stride];
+            int i;
+            uint8_t *p_buffer = frame->data[k] + frame->linesize[k] * j;
+            memset(p_plane, 0, i_stride);
+            for (i = 0; i < pic->img.i_width[k]; i++) {
+                p_plane[i] = p_buffer[i] << shift_in;
+            }
+        }
+    }
+}
+
+static void xavs2_copy_frame(xavs2_picture_t *pic, const AVFrame *frame)
+{
+    int j, k;
+    for (k = 0; k < 3; k++) {
+        for (j = 0; j < pic->img.i_lines[k]; j++) {
+            memcpy( pic->img.img_planes[k] + pic->img.i_stride[k] * j,
+                    frame->data[k]+frame->linesize[k] * j,
+                    pic->img.i_width[k] * pic->img.in_sample_size);
+        }
+    }
+}
+
+static int xavs2_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *frame, int *got_packet)
+{
+    XAVS2EContext *cae = avctx->priv_data;
+    xavs2_picture_t pic;
+    int ret;
+
+    /* create the XAVS2 video encoder */
+    /* read frame data and send to the XAVS2 video encoder */
+    if (cae->api->encoder_get_buffer(cae->encoder, &pic) < 0) {
+        av_log(avctx,AV_LOG_ERROR, "failed to get frame buffer\n");
+        return AVERROR_EXTERNAL;
+    }
+    if (frame) {
+        switch (frame->format) {
+        case AV_PIX_FMT_YUV420P:
+            if (pic.img.in_sample_size == pic.img.enc_sample_size) {
+                xavs2_copy_frame(&pic, frame);
+            } else {
+                const int shift_in = atoi(cae->api->opt_get(cae->param, "SampleShift"));
+                xavs2_copy_frame_with_shift(&pic, frame, shift_in);
+            }
+            break;
+        case AV_PIX_FMT_YUV420P10:
+            if (pic.img.in_sample_size == pic.img.enc_sample_size) {
+                xavs2_copy_frame(&pic, frame);
+                break;
+            }
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format\n");
+            return AVERROR(EINVAL);
+            break;
+        }
+
+        pic.i_state = 0;
+        pic.i_pts   = frame->pts;
+        pic.i_type  = XAVS2_TYPE_AUTO;
+
+        ret = cae->api->encoder_encode(cae->encoder, &pic, &cae->packet);
+
+        if (ret) {
+            av_log(avctx, AV_LOG_ERROR, "encode failed\n");
+            return AVERROR_EXTERNAL;
+        }
+
+    } else {
+        cae->api->encoder_encode(cae->encoder, NULL, &cae->packet);
+    }
+
+    if ((cae->packet.len) && (cae->packet.state != XAVS2_STATE_FLUSH_END)){
+
+        if (av_new_packet(pkt, cae->packet.len) < 0){
+            av_log(avctx, AV_LOG_ERROR, "packet alloc failed\n");
+            cae->api->encoder_packet_unref(cae->encoder, &cae->packet);
+            return AVERROR(ENOMEM);
+        }
+
+        pkt->pts = cae->packet.pts;
+        pkt->dts = cae->packet.dts;
+
+        memcpy(pkt->data, cae->packet.stream, cae->packet.len);
+        pkt->size = cae->packet.len;
+
+        cae->api->encoder_packet_unref(cae->encoder, &cae->packet);
+
+        *got_packet = 1;
+    } else {
+        *got_packet = 0;
+    }
+
+    return 0;
+}
+
+static av_cold int xavs2_close(AVCodecContext *avctx)
+{
+    XAVS2EContext *cae = avctx->priv_data;
+    /* destroy the encoder */
+    if (cae->api) {
+        cae->api->encoder_destroy(cae->encoder);
+
+        if (cae->param) {
+            cae->api->opt_destroy(cae->param);
+        }
+    }
+    return 0;
+}
+
+#define OFFSET(x) offsetof(XAVS2EContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+static const AVOption options[] = {
+    { "lcu_row_threads" ,   "number of parallel threads for rows" ,     OFFSET(lcu_row_threads) , AV_OPT_TYPE_INT, {.i64 =  0 },  0, INT_MAX,  VE },
+    { "initial_qp"      ,   "Quantization initial parameter"      ,     OFFSET(initial_qp)      , AV_OPT_TYPE_INT, {.i64 = 34 },  1,      63,  VE },
+    { "qp"              ,   "Quantization parameter"  ,                 OFFSET(qp)              , AV_OPT_TYPE_INT, {.i64 = 34 },  1,      63,  VE },
+    { "max_qp"          ,   "max qp for rate control" ,                 OFFSET(max_qp)          , AV_OPT_TYPE_INT, {.i64 = 55 },  0,      63,  VE },
+    { "min_qp"          ,   "min qp for rate control" ,                 OFFSET(min_qp)          , AV_OPT_TYPE_INT, {.i64 = 20 },  0,      63,  VE },
+    { "speed_level"     ,   "Speed level, higher is better but slower", OFFSET(preset_level)    , AV_OPT_TYPE_INT, {.i64 =  0 },  0,       9,  VE },
+    { "log_level"       ,   "log level: -1: none, 0: error, 1: warning, 2: info, 3: debug", OFFSET(log_level)    , AV_OPT_TYPE_INT, {.i64 =  0 },  -1,       3,  VE },
+    { "xavs2-params"    ,   "set the xavs2 configuration using a :-separated list of key=value parameters", OFFSET(xavs2_opts), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { NULL },
+};
+
+static const AVClass libxavs2 = {
+    .class_name = "XAVS2EContext",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVCodecDefault xavs2_defaults[] = {
+    { "b",                "0" },
+    { "g",                "48"},
+    { "bf",               "7" },
+    { NULL },
+};
+
+AVCodec ff_libxavs2_encoder = {
+    .name           = "libxavs2",
+    .long_name      = NULL_IF_CONFIG_SMALL("libxavs2 AVS2-P2/IEEE1857.4"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVS2,
+    .priv_data_size = sizeof(XAVS2EContext),
+    .init           = xavs2_init,
+    .encode2        = xavs2_encode_frame,
+    .close          = xavs2_close,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
+                                                     AV_PIX_FMT_NONE },
+    .priv_class     = &libxavs2,
+    .defaults       = xavs2_defaults,
+    .wrapper_name   = "libxavs2",
+} ;
diff --git a/libavcodec/libxvid.c b/libavcodec/libxvid.c
index ab2e676..cdaae20 100644
--- a/libavcodec/libxvid.c
+++ b/libavcodec/libxvid.c
@@ -2,20 +2,20 @@
  * Interface to xvidcore for MPEG-4 encoding
  * Copyright (c) 2004 Adam Thayer <krevnik@comcast.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,12 +26,12 @@
  */
 
 #include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
-#include <unistd.h>
 #include <xvid.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
+#include "libavutil/file.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
@@ -40,8 +40,17 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libxvid.h"
 #include "mpegutils.h"
 
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if HAVE_IO_H
+#include <io.h>
+#endif
+
 /**
  * Buffer management macros.
  */
@@ -54,7 +63,7 @@
  * This stores all the private context for the codec.
  */
 struct xvid_context {
-    AVClass *class;                /**< Handle for Xvid encoder */
+    AVClass *class;
     void *encoder_handle;          /**< Handle for Xvid encoder */
     int xsize;                     /**< Frame x size */
     int ysize;                     /**< Frame y size */
@@ -66,6 +75,7 @@ struct xvid_context {
     char *twopassbuffer;           /**< Character buffer for two-pass */
     char *old_twopassbuffer;       /**< Old character buffer (two-pass) */
     char *twopassfile;             /**< second pass temp file name */
+    int twopassfd;
     unsigned char *intra_matrix;   /**< P-Frame Quant Matrix */
     unsigned char *inter_matrix;   /**< I-Frame Quant Matrix */
     int lumi_aq;                   /**< Lumi masking as an aq method */
@@ -85,6 +95,7 @@ struct xvid_ff_pass1 {
     struct xvid_context *context;   /**< Pointer to private context */
 };
 
+static int xvid_encode_close(AVCodecContext *avctx);
 static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                              const AVFrame *picture, int *got_packet);
 
@@ -119,7 +130,7 @@ static int xvid_ff_2pass_create(xvid_plg_create_t *param, void **handle)
     /* This is because we can safely prevent a buffer overflow */
     log[0] = 0;
     snprintf(log, BUFFER_REMAINING(log),
-             "# avconv 2-pass log file, using xvid codec\n");
+             "# ffmpeg 2-pass log file, using xvid codec\n");
     snprintf(BUFFER_CAT(log), BUFFER_REMAINING(log),
              "# Do not modify. libxvidcore version: %d.%d.%d\n\n",
              XVID_VERSION_MAJOR(XVID_VERSION),
@@ -359,61 +370,36 @@ static void xvid_correct_framerate(AVCodecContext *avctx)
     }
 }
 
-/* Create temporary file using mkstemp(), tries /tmp first, if possible.
- * *prefix can be a character constant; *filename will be allocated internally.
- * Return file descriptor of opened file (or error code on error)
- * and opened file name in **filename. */
-static int xvid_tempfile(AVCodecContext *avctx, const char *prefix,
-                         char **filename)
-{
-    int fd = -1;
-    size_t len = strlen(prefix) + 12; /* room for "/tmp/" and "XXXXXX\0" */
-    *filename  = av_malloc(len);
-    if (!(*filename)) {
-        av_log(avctx, AV_LOG_ERROR, "xvid_tempfile: Cannot allocate file name\n");
-        return AVERROR(ENOMEM);
-    }
-    snprintf(*filename, len, "/tmp/%sXXXXXX", prefix);
-    fd = mkstemp(*filename);
-    if (fd < 0) {
-        snprintf(*filename, len, "./%sXXXXXX", prefix);
-        fd = mkstemp(*filename);
-    }
-    if (fd < 0) {
-        av_log(avctx, AV_LOG_ERROR, "xvid_tempfile: Cannot open temporary file %s\n", *filename);
-        return AVERROR(EIO);
-    }
-    return fd; /* success */
-}
-
 static av_cold int xvid_encode_init(AVCodecContext *avctx)
 {
-    int xerr, i;
+    int xerr, i, ret = -1;
     int xvid_flags = avctx->flags;
     struct xvid_context *x = avctx->priv_data;
     uint16_t *intra, *inter;
     int fd;
 
-    xvid_plugin_single_t single         = { 0 };
-    struct xvid_ff_pass1 rc2pass1       = { 0 };
-    xvid_plugin_2pass2_t rc2pass2       = { 0 };
-    xvid_plugin_lumimasking_t masking_l = { 0 }; /* For lumi masking */
-    xvid_plugin_lumimasking_t masking_v = { 0 }; /* For variance AQ */
-    xvid_plugin_ssim_t ssim             = { 0 };
-    xvid_gbl_init_t xvid_gbl_init       = { 0 };
-    xvid_enc_create_t xvid_enc_create   = { 0 };
-    xvid_enc_plugin_t plugins[7];
-
-    /* Bring in VOP flags from avconv command-line */
-    x->vop_flags = XVID_VOP_HALFPEL; /* Bare minimum quality */
+    xvid_plugin_single_t      single          = { 0 };
+    struct xvid_ff_pass1      rc2pass1        = { 0 };
+    xvid_plugin_2pass2_t      rc2pass2        = { 0 };
+    xvid_plugin_lumimasking_t masking_l       = { 0 }; /* For lumi masking */
+    xvid_plugin_lumimasking_t masking_v       = { 0 }; /* For variance AQ */
+    xvid_plugin_ssim_t        ssim            = { 0 };
+    xvid_gbl_init_t           xvid_gbl_init   = { 0 };
+    xvid_enc_create_t         xvid_enc_create = { 0 };
+    xvid_enc_plugin_t         plugins[4];
+
+    x->twopassfd = -1;
+
+    /* Bring in VOP flags from ffmpeg command-line */
+    x->vop_flags = XVID_VOP_HALFPEL;              /* Bare minimum quality */
     if (xvid_flags & AV_CODEC_FLAG_4MV)
-        x->vop_flags |= XVID_VOP_INTER4V; /* Level 3 */
+        x->vop_flags    |= XVID_VOP_INTER4V;      /* Level 3 */
     if (avctx->trellis)
-        x->vop_flags |= XVID_VOP_TRELLISQUANT; /* Level 5 */
+        x->vop_flags    |= XVID_VOP_TRELLISQUANT; /* Level 5 */
     if (xvid_flags & AV_CODEC_FLAG_AC_PRED)
-        x->vop_flags |= XVID_VOP_HQACPRED; /* Level 6 */
+        x->vop_flags    |= XVID_VOP_HQACPRED;     /* Level 6 */
     if (xvid_flags & AV_CODEC_FLAG_GRAY)
-        x->vop_flags |= XVID_VOP_GREYSCALE;
+        x->vop_flags    |= XVID_VOP_GREYSCALE;
 
     /* Decide which ME quality setting to use */
     x->me_flags = 0;
@@ -451,6 +437,7 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)
         break;
     }
 
+    /* Bring in VOL flags from ffmpeg command-line */
     x->vol_flags = 0;
     if (x->gmc) {
         x->vol_flags |= XVID_VOL_GMC;
@@ -487,6 +474,18 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)
     xvid_enc_create.num_zones = 0;
 
     xvid_enc_create.num_threads = avctx->thread_count;
+#if (XVID_VERSION <= 0x010303) && (XVID_VERSION >= 0x010300)
+    /* workaround for a bug in libxvidcore */
+    if (avctx->height <= 16) {
+        if (avctx->thread_count < 2) {
+            xvid_enc_create.num_threads = 0;
+        } else {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Too small height for threads > 1.");
+            return AVERROR(EINVAL);
+        }
+    }
+#endif
 
     xvid_enc_create.plugins     = plugins;
     xvid_enc_create.num_plugins = 0;
@@ -516,26 +515,29 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)
         rc2pass2.version = XVID_VERSION;
         rc2pass2.bitrate = avctx->bit_rate;
 
-        fd = xvid_tempfile(avctx, "xvidff.", &x->twopassfile);
+        fd = avpriv_tempfile("xvidff.", &x->twopassfile, 0, avctx);
         if (fd < 0) {
             av_log(avctx, AV_LOG_ERROR, "Xvid: Cannot write 2-pass pipe\n");
             return fd;
         }
+        x->twopassfd = fd;
 
         if (!avctx->stats_in) {
             av_log(avctx, AV_LOG_ERROR,
                    "Xvid: No 2-pass information loaded for second pass\n");
-            return AVERROR_INVALIDDATA;
+            return AVERROR(EINVAL);
         }
 
-        if (strlen(avctx->stats_in) >
-            write(fd, avctx->stats_in, strlen(avctx->stats_in))) {
-            close(fd);
+        ret = write(fd, avctx->stats_in, strlen(avctx->stats_in));
+        if (ret == -1)
+            ret = AVERROR(errno);
+        else if (strlen(avctx->stats_in) > ret) {
             av_log(avctx, AV_LOG_ERROR, "Xvid: Cannot write to 2-pass pipe\n");
-            return AVERROR(EIO);
+            ret = AVERROR(EIO);
         }
+        if (ret < 0)
+            return ret;
 
-        close(fd);
         rc2pass2.filename                          = x->twopassfile;
         plugins[xvid_enc_create.num_plugins].func  = xvid_plugin_2pass2;
         plugins[xvid_enc_create.num_plugins].param = &rc2pass2;
@@ -553,12 +555,6 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)
     if (avctx->lumi_masking != 0.0)
         x->lumi_aq = 1;
 
-    if (x->lumi_aq && x->variance_aq) {
-        x->variance_aq = 0;
-        av_log(avctx, AV_LOG_WARNING,
-               "variance_aq is ignored when lumi_aq is set.\n");
-    }
-
     /* Luminance Masking */
     if (x->lumi_aq) {
         masking_l.method                          = 0;
@@ -579,6 +575,11 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)
         xvid_enc_create.num_plugins++;
     }
 
+    if (x->lumi_aq && x->variance_aq )
+        av_log(avctx, AV_LOG_INFO,
+               "Both lumi_aq and variance_aq are enabled. The resulting quality"
+               "will be the worse one of the two effects made by the AQ.\n");
+
     /* SSIM */
     if (x->ssim) {
         plugins[xvid_enc_create.num_plugins].func  = xvid_plugin_ssim;
@@ -678,11 +679,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->max_b_frames > 0 && !x->quicktime_format)
         xvid_enc_create.global |= XVID_GLOBAL_PACKED;
 
+    av_assert0(xvid_enc_create.num_plugins + (!!x->ssim) + (!!x->variance_aq) + (!!x->lumi_aq) <= FF_ARRAY_ELEMS(plugins));
+
     /* Encode a dummy frame to get the extradata immediately */
     if (x->quicktime_format) {
         AVFrame *picture;
-        AVPacket packet;
-        int got_packet, ret;
+        AVPacket packet = {0};
+        int size, got_packet, ret;
 
         av_init_packet(&packet);
 
@@ -691,26 +694,26 @@ FF_ENABLE_DEPRECATION_WARNINGS
             return AVERROR(ENOMEM);
 
         xerr = xvid_encore(NULL, XVID_ENC_CREATE, &xvid_enc_create, NULL);
-        if (xerr) {
+        if( xerr ) {
             av_frame_free(&picture);
             av_log(avctx, AV_LOG_ERROR, "Xvid: Could not create encoder reference\n");
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
         }
         x->encoder_handle = xvid_enc_create.handle;
-
-        picture->width  = avctx->width;
-        picture->height = avctx->height;
-        picture->format = avctx->pix_fmt;
-
-        if ((ret = av_frame_get_buffer(picture, 32)) < 0) {
-            xvid_encore(x->encoder_handle, XVID_ENC_DESTROY, NULL, NULL);
+        size = ((avctx->width + 1) & ~1) * ((avctx->height + 1) & ~1);
+        picture->data[0] = av_malloc(size + size / 2);
+        if (!picture->data[0]) {
             av_frame_free(&picture);
-            return ret;
+            return AVERROR(ENOMEM);
         }
-
+        picture->data[1] = picture->data[0] + size;
+        picture->data[2] = picture->data[1] + size / 4;
+        memset(picture->data[0], 0, size);
+        memset(picture->data[1], 128, size / 2);
         ret = xvid_encode_frame(avctx, &packet, picture, &got_packet);
         if (!ret && got_packet)
             av_packet_unref(&packet);
+        av_free(picture->data[0]);
         av_frame_free(&picture);
         xvid_encore(x->encoder_handle, XVID_ENC_DESTROY, NULL, NULL);
     }
@@ -719,7 +722,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     xerr = xvid_encore(NULL, XVID_ENC_CREATE, &xvid_enc_create, NULL);
     if (xerr) {
         av_log(avctx, AV_LOG_ERROR, "Xvid: Could not create encoder reference\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     x->encoder_handle  = xvid_enc_create.handle;
@@ -739,11 +742,8 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     xvid_enc_frame_t xvid_enc_frame = { 0 };
     xvid_enc_stats_t xvid_enc_stats = { 0 };
 
-    if (!user_packet &&
-        (ret = av_new_packet(pkt, mb_width * mb_height * MAX_MB_BYTES + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, mb_width*(int64_t)mb_height*MAX_MB_BYTES + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     /* Start setting up the frame */
     xvid_enc_frame.version = XVID_VERSION;
@@ -757,7 +757,7 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (avctx->pix_fmt != AV_PIX_FMT_YUV420P) {
         av_log(avctx, AV_LOG_ERROR,
                "Xvid: Color spaces other than 420P not supported\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     xvid_enc_frame.input.csp = XVID_CSP_PLANAR; /* YUV420P */
@@ -778,11 +778,13 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                                   XVID_TYPE_AUTO;
 
     /* Pixel aspect ratio setting */
-    if (avctx->sample_aspect_ratio.num < 1 || avctx->sample_aspect_ratio.num > 255 ||
-        avctx->sample_aspect_ratio.den < 1 || avctx->sample_aspect_ratio.den > 255) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid pixel aspect ratio %i/%i\n",
+    if (avctx->sample_aspect_ratio.num < 0 || avctx->sample_aspect_ratio.num > 255 ||
+        avctx->sample_aspect_ratio.den < 0 || avctx->sample_aspect_ratio.den > 255) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid pixel aspect ratio %i/%i, limit is 255/255 reducing\n",
                avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den);
-        return -1;
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den, 255);
     }
     xvid_enc_frame.par        = XVID_PAR_EXT;
     xvid_enc_frame.par_width  = avctx->sample_aspect_ratio.num;
@@ -815,27 +817,28 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     if (xerr > 0) {
-        uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                              sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = xvid_enc_stats.quant * FF_QP2LAMBDA;
+        int pict_type;
 
         *got_packet = 1;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-        avctx->coded_frame->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
         if (xvid_enc_stats.type == XVID_TYPE_PVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+            pict_type = AV_PICTURE_TYPE_P;
         else if (xvid_enc_stats.type == XVID_TYPE_BVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+            pict_type = AV_PICTURE_TYPE_B;
         else if (xvid_enc_stats.type == XVID_TYPE_SVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_S;
+            pict_type = AV_PICTURE_TYPE_S;
         else
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+            pict_type = AV_PICTURE_TYPE_I;
+
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+        avctx->coded_frame->pict_type = pict_type;
+        avctx->coded_frame->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+
+        ff_side_data_set_encoder_stats(pkt, xvid_enc_stats.quant * FF_QP2LAMBDA, NULL, 0, pict_type);
+
         if (xvid_enc_frame.out_flags & XVID_KEYFRAME) {
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -864,7 +867,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             return 0;
         av_log(avctx, AV_LOG_ERROR,
                "Xvid: Encoding Error Occurred: %i\n", xerr);
-        return xerr;
+        return AVERROR_EXTERNAL;
     }
 }
 
@@ -879,12 +882,18 @@ static av_cold int xvid_encode_close(AVCodecContext *avctx)
 
     av_freep(&avctx->extradata);
     if (x->twopassbuffer) {
-        av_free(x->twopassbuffer);
-        av_free(x->old_twopassbuffer);
+        av_freep(&x->twopassbuffer);
+        av_freep(&x->old_twopassbuffer);
+        avctx->stats_out = NULL;
+    }
+    if (x->twopassfd>=0) {
+        unlink(x->twopassfile);
+        close(x->twopassfd);
+        x->twopassfd = -1;
     }
-    av_free(x->twopassfile);
-    av_free(x->intra_matrix);
-    av_free(x->inter_matrix);
+    av_freep(&x->twopassfile);
+    av_freep(&x->intra_matrix);
+    av_freep(&x->inter_matrix);
 
     return 0;
 }
@@ -900,7 +909,7 @@ static const AVOption options[] = {
     { "frame",       NULL,                                                0, AV_OPT_TYPE_CONST, { .i64 = 2 }, INT_MIN, INT_MAX, VE, "ssim" },
     { "ssim_acc",    "SSIM accuracy",                   OFFSET(ssim_acc),    AV_OPT_TYPE_INT,   { .i64 = 2 },       0,       4, VE         },
     { "gmc",         "use GMC",                         OFFSET(gmc),         AV_OPT_TYPE_INT,   { .i64 = 0 },       0,       1, VE         },
-    { "me_quality",  "Motion estimation quality",       OFFSET(me_quality),  AV_OPT_TYPE_INT,   { .i64 = 0 },       0,       6, VE         },
+    { "me_quality",  "Motion estimation quality",       OFFSET(me_quality),  AV_OPT_TYPE_INT,   { .i64 = 4 },       0,       6, VE         },
     { "mpeg_quant",  "Use MPEG quantizers instead of H.263", OFFSET(mpeg_quant), AV_OPT_TYPE_INT, { .i64 = 0 },     0,       1, VE         },
     { NULL },
 };
diff --git a/libavcodec/libxvid.h b/libavcodec/libxvid.h
new file mode 100644
index 0000000..58bef61
--- /dev/null
+++ b/libavcodec/libxvid.h
@@ -0,0 +1,36 @@
+/*
+ * copyright (C) 2006 Corey Hickey
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LIBXVID_H
+#define AVCODEC_LIBXVID_H
+
+/**
+ * @file
+ * common functions for use with the Xvid wrappers
+ */
+
+struct MpegEncContext;
+
+/* rate control */
+int ff_xvid_rate_control_init(struct MpegEncContext *s);
+void ff_xvid_rate_control_uninit(struct MpegEncContext *s);
+float ff_xvid_rate_estimate_qscale(struct MpegEncContext *s, int dry_run);
+
+#endif /* AVCODEC_LIBXVID_H */
diff --git a/libavcodec/libzvbi-teletextdec.c b/libavcodec/libzvbi-teletextdec.c
new file mode 100644
index 0000000..3515f33
--- /dev/null
+++ b/libavcodec/libzvbi-teletextdec.c
@@ -0,0 +1,829 @@
+/*
+ * Teletext decoding for ffmpeg
+ * Copyright (c) 2005-2010, 2012 Wolfram Gloger
+ * Copyright (c) 2013 Marton Balint
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "libavcodec/ass.h"
+#include "libavcodec/dvbtxt.h"
+#include "libavutil/opt.h"
+#include "libavutil/bprint.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+#include "libavutil/common.h"
+
+#include <libzvbi.h>
+
+#define TEXT_MAXSZ    (25 * (56 + 1) * 4 + 2)
+#define VBI_NB_COLORS 40
+#define VBI_TRANSPARENT_BLACK 8
+#define RGBA(r,g,b,a) (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
+#define VBI_R(rgba)   (((rgba) >> 0) & 0xFF)
+#define VBI_G(rgba)   (((rgba) >> 8) & 0xFF)
+#define VBI_B(rgba)   (((rgba) >> 16) & 0xFF)
+#define VBI_A(rgba)   (((rgba) >> 24) & 0xFF)
+#define MAX_BUFFERED_PAGES 25
+#define BITMAP_CHAR_WIDTH  12
+#define BITMAP_CHAR_HEIGHT 10
+#define MAX_SLICES 64
+
+typedef struct TeletextPage
+{
+    AVSubtitleRect *sub_rect;
+    int pgno;
+    int subno;
+    int64_t pts;
+} TeletextPage;
+
+typedef struct TeletextContext
+{
+    AVClass        *class;
+    char           *pgno;
+    int             x_offset;
+    int             y_offset;
+    int             format_id; /* 0 = bitmap, 1 = text/ass, 2 = ass */
+    int             chop_top;
+    int             sub_duration; /* in msec */
+    int             transparent_bg;
+    int             opacity;
+    int             chop_spaces;
+
+    int             lines_processed;
+    TeletextPage    *pages;
+    int             nb_pages;
+    int64_t         pts;
+    int             handler_ret;
+
+    vbi_decoder *   vbi;
+    vbi_sliced      sliced[MAX_SLICES];
+
+    int             readorder;
+    uint8_t         subtitle_map[2048];
+    int             last_pgno;
+    int             last_p5;
+    int             last_ass_alignment;
+} TeletextContext;
+
+static int my_ass_subtitle_header(AVCodecContext *avctx)
+{
+    int ret = ff_ass_subtitle_header_default(avctx);
+    char *new_header;
+    uint8_t *event_pos;
+
+    if (ret < 0)
+        return ret;
+
+    event_pos = strstr(avctx->subtitle_header, "\r\n[Events]\r\n");
+    if (!event_pos)
+        return AVERROR_BUG;
+
+    new_header = av_asprintf("%.*s%s%s",
+        (int)(event_pos - avctx->subtitle_header), avctx->subtitle_header,
+        "Style: "
+        "Teletext,"            /* Name */
+        "Monospace,11,"        /* Font{name,size} */
+        "&Hffffff,&Hffffff,&H0,&H0," /* {Primary,Secondary,Outline,Back}Colour */
+        "0,0,0,0,"             /* Bold, Italic, Underline, StrikeOut */
+        "160,100,"             /* Scale{X,Y} */
+        "0,0,"                 /* Spacing, Angle */
+        "3,0.1,0,"             /* BorderStyle, Outline, Shadow */
+        "5,1,1,1,"             /* Alignment, Margin[LRV] */
+        "0\r\n"                /* Encoding */
+        "Style: "
+        "Subtitle,"            /* Name */
+        "Monospace,16,"        /* Font{name,size} */
+        "&Hffffff,&Hffffff,&H0,&H0," /* {Primary,Secondary,Outline,Back}Colour */
+        "0,0,0,0,"             /* Bold, Italic, Underline, StrikeOut */
+        "100,100,"             /* Scale{X,Y} */
+        "0,0,"                 /* Spacing, Angle */
+        "1,1,1,"               /* BorderStyle, Outline, Shadow */
+        "8,48,48,20,"          /* Alignment, Margin[LRV] */
+        "0\r\n"                /* Encoding */
+        , event_pos);
+
+    if (!new_header)
+        return AVERROR(ENOMEM);
+
+    av_free(avctx->subtitle_header);
+    avctx->subtitle_header = new_header;
+    avctx->subtitle_header_size = strlen(new_header);
+    return 0;
+}
+
+static int chop_spaces_utf8(const unsigned char* t, int len)
+{
+    t += len;
+    while (len > 0) {
+        if (*--t != ' ' || (len-1 > 0 && *(t-1) & 0x80))
+            break;
+        --len;
+    }
+    return len;
+}
+
+static void subtitle_rect_free(AVSubtitleRect **sub_rect)
+{
+    av_freep(&(*sub_rect)->data[0]);
+    av_freep(&(*sub_rect)->data[1]);
+    av_freep(&(*sub_rect)->ass);
+    av_freep(sub_rect);
+}
+
+static char *create_ass_text(TeletextContext *ctx, const char *text)
+{
+    char *dialog;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    ff_ass_bprint_text_event(&buf, text, strlen(text), "", 0);
+    if (!av_bprint_is_complete(&buf)) {
+        av_bprint_finalize(&buf, NULL);
+        return NULL;
+    }
+    dialog = ff_ass_get_dialog(ctx->readorder++, 0, NULL, NULL, buf.str);
+    av_bprint_finalize(&buf, NULL);
+    return dialog;
+}
+
+/* Draw a page as text */
+static int gen_sub_text(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    const char *in;
+    AVBPrint buf;
+    char *vbi_text = av_malloc(TEXT_MAXSZ);
+    int sz;
+
+    if (!vbi_text)
+        return AVERROR(ENOMEM);
+
+    sz = vbi_print_page_region(page, vbi_text, TEXT_MAXSZ-1, "UTF-8",
+                                   /*table mode*/ TRUE, FALSE,
+                                   0,             chop_top,
+                                   page->columns, page->rows-chop_top);
+    if (sz <= 0) {
+        av_log(ctx, AV_LOG_ERROR, "vbi_print error\n");
+        av_free(vbi_text);
+        return AVERROR_EXTERNAL;
+    }
+    vbi_text[sz] = '\0';
+    in  = vbi_text;
+    av_bprint_init(&buf, 0, TEXT_MAXSZ);
+
+    if (ctx->chop_spaces) {
+        for (;;) {
+            int nl, sz;
+
+            // skip leading spaces and newlines
+            in += strspn(in, " \n");
+            // compute end of row
+            for (nl = 0; in[nl]; ++nl)
+                if (in[nl] == '\n' && (nl==0 || !(in[nl-1] & 0x80)))
+                    break;
+            if (!in[nl])
+                break;
+            // skip trailing spaces
+            sz = chop_spaces_utf8(in, nl);
+            av_bprint_append_data(&buf, in, sz);
+            av_bprintf(&buf, "\n");
+            in += nl;
+        }
+    } else {
+        av_bprintf(&buf, "%s\n", vbi_text);
+    }
+    av_free(vbi_text);
+
+    if (!av_bprint_is_complete(&buf)) {
+        av_bprint_finalize(&buf, NULL);
+        return AVERROR(ENOMEM);
+    }
+
+    if (buf.len) {
+        sub_rect->type = SUBTITLE_ASS;
+        sub_rect->ass = create_ass_text(ctx, buf.str);
+
+        if (!sub_rect->ass) {
+            av_bprint_finalize(&buf, NULL);
+            return AVERROR(ENOMEM);
+        }
+        av_log(ctx, AV_LOG_DEBUG, "subtext:%s:txetbus\n", sub_rect->ass);
+    } else {
+        sub_rect->type = SUBTITLE_NONE;
+    }
+    av_bprint_finalize(&buf, NULL);
+    return 0;
+}
+
+static void bprint_color(const char *type, AVBPrint *buf, vbi_page *page, unsigned ci)
+{
+    int r = VBI_R(page->color_map[ci]);
+    int g = VBI_G(page->color_map[ci]);
+    int b = VBI_B(page->color_map[ci]);
+    av_bprintf(buf, "{\\%s&H%02X%02X%02X&}", type, b, g, r);
+}
+
+#define IS_TXT_SPACE(ch) ((ch).unicode < 0x0020 || (ch).unicode >= 0xe000 || (ch).unicode == 0x00a0 ||\
+                          (ch).size > VBI_DOUBLE_SIZE || (ch).opacity == VBI_TRANSPARENT_SPACE)
+
+static void get_trim_info(vbi_page *page, vbi_char *row, int *leading, int *trailing, int *olen)
+{
+    int i, len = 0;
+    int char_seen = 0;
+
+    *leading = 0;
+
+    for (i = 0; i < page->columns; i++) {
+        uint16_t out = IS_TXT_SPACE(row[i]) ? 32 : row[i].unicode;
+
+        if (out == 32 && !char_seen)
+            (*leading)++;
+        else if (out != 32)
+            char_seen = 1, len = i - (*leading) + 1;
+    }
+
+    *olen = len;
+    *trailing = len > 0 ? page->columns - *leading - len : page->columns;
+}
+
+static void decode_string(vbi_page *page, vbi_char *row, AVBPrint *buf,
+                          int start, int end, vbi_color *cur_color, vbi_color *cur_back_color)
+{
+    int i;
+
+    for (i = start; i < end; i++) {
+        uint16_t out = IS_TXT_SPACE(row[i]) ? 32 : row[i].unicode;
+
+        if (*cur_color != row[i].foreground) {
+            bprint_color("c", buf, page, row[i].foreground);
+            *cur_color = row[i].foreground;
+        }
+        if (*cur_back_color != row[i].background) {
+            bprint_color("3c", buf, page, row[i].background);
+            *cur_back_color = row[i].background;
+        }
+
+        if (out == 32) {
+            av_bprintf(buf, "\\h");
+        } else if (out == '\\' || out == '{' || out == '}') {
+            av_bprintf(buf, "\\%c", (char)out);
+        } else {
+            char tmp;
+            /* convert to utf-8 */
+            PUT_UTF8(out, tmp, av_bprint_chars(buf, tmp, 1););
+        }
+    }
+}
+
+/* Draw a page as ass formatted text */
+static int gen_sub_ass(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    int i;
+    int leading, trailing, len;
+    int last_trailing = -1, last_leading = -1;
+    int min_trailing = page->columns, min_leading = page->columns;
+    int alignment = 2;
+    int vertical_align = -1;
+    int can_align_left = 1, can_align_right = 1, can_align_center = 1;
+    int is_subtitle_page = ctx->subtitle_map[page->pgno & 0x7ff];
+    int empty_lines = 0;
+    vbi_color cur_color = VBI_WHITE;
+    vbi_color cur_back_color = VBI_BLACK;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = chop_top; i < page->rows; i++) {
+        vbi_char *row = page->text + i * page->columns;
+
+        get_trim_info(page, row, &leading, &trailing, &len);
+
+        if (len) {
+            if (last_leading != -1 && last_leading != leading || leading > 5)
+                can_align_left = 0;
+            if (last_trailing != -1 && last_trailing != trailing || trailing > 2)
+                can_align_right = 0;
+            if (last_trailing != -1 && (FFABS((trailing - leading) - (last_trailing - last_leading)) > 1) || trailing - leading > 4)
+                can_align_center = 0;
+            last_leading = leading;
+            last_trailing = trailing;
+            min_leading = FFMIN(leading, min_leading);
+            min_trailing = FFMIN(trailing, min_trailing);
+        }
+    }
+
+    if (!can_align_right && can_align_left && !can_align_center) {
+        ctx->last_ass_alignment = alignment = 1;
+    } else if (!can_align_right && !can_align_left && can_align_center) {
+        ctx->last_ass_alignment = alignment = 2;
+    } else if (can_align_right && !can_align_left && !can_align_center) {
+        ctx->last_ass_alignment = alignment = 3;
+    } else {
+        if (ctx->last_ass_alignment == 1 && can_align_left ||
+            ctx->last_ass_alignment == 2 && can_align_center ||
+            ctx->last_ass_alignment == 3 && can_align_right)
+            alignment = ctx->last_ass_alignment;
+    }
+
+    for (i = chop_top; i < page->rows; i++) {
+        int j;
+        vbi_char *row = page->text + i * page->columns;
+        int is_transparent_line;
+
+        for (j = 0; j < page->columns; j++)
+            if (row[j].opacity != VBI_TRANSPARENT_SPACE)
+                break;
+        is_transparent_line = (j == page->columns);
+
+        len = is_transparent_line ? 0 : page->columns;
+        leading = trailing = is_transparent_line ? page->columns : 0;
+
+        if (is_subtitle_page) {
+            if (!is_transparent_line)
+                get_trim_info(page, row, &leading, &trailing, &len);
+
+            if (vertical_align == -1 && len) {
+                vertical_align = (2 - (av_clip(i + 1, 0, 23) / 8));
+                av_bprintf(&buf, "{\\an%d}", alignment + vertical_align * 3);
+                if (vertical_align != 2)
+                    empty_lines = 0;
+            }
+
+            if (len && empty_lines > 1)
+                for (empty_lines /= 2; empty_lines > 0; empty_lines--)
+                    av_bprintf(&buf, " \\N");
+
+            if (alignment == 1 || alignment == 2 && !can_align_center)
+                leading = min_leading;
+            if (alignment == 3 || alignment == 2 && !can_align_center)
+                trailing = min_trailing;
+        }
+
+        if (len || !is_subtitle_page) {
+            decode_string(page, row, &buf, leading, page->columns - trailing, &cur_color, &cur_back_color);
+            av_bprintf(&buf, " \\N");
+            empty_lines = 0;
+        } else {
+            empty_lines++;
+        }
+    }
+
+    if (vertical_align == 0)
+        for (empty_lines = (empty_lines - 1) / 2; empty_lines > 0; empty_lines--)
+            av_bprintf(&buf, " \\N");
+
+    if (!av_bprint_is_complete(&buf)) {
+        av_bprint_finalize(&buf, NULL);
+        return AVERROR(ENOMEM);
+    }
+
+    if (buf.len) {
+        sub_rect->type = SUBTITLE_ASS;
+        sub_rect->ass = ff_ass_get_dialog(ctx->readorder++, 0, is_subtitle_page ? "Subtitle" : "Teletext", NULL, buf.str);
+
+        if (!sub_rect->ass) {
+            av_bprint_finalize(&buf, NULL);
+            return AVERROR(ENOMEM);
+        }
+        av_log(ctx, AV_LOG_DEBUG, "subtext:%s:txetbus\n", sub_rect->ass);
+    } else {
+        sub_rect->type = SUBTITLE_NONE;
+    }
+    av_bprint_finalize(&buf, NULL);
+    return 0;
+}
+
+static void fix_transparency(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page,
+                             int chop_top, int resx, int resy)
+{
+    int iy;
+
+    // Hack for transparency, inspired by VLC code...
+    for (iy = 0; iy < resy; iy++) {
+        uint8_t *pixel = sub_rect->data[0] + iy * sub_rect->linesize[0];
+        vbi_char *vc = page->text + (iy / BITMAP_CHAR_HEIGHT + chop_top) * page->columns;
+        vbi_char *vcnext = vc + page->columns;
+        for (; vc < vcnext; vc++) {
+            uint8_t *pixelnext = pixel + BITMAP_CHAR_WIDTH;
+            switch (vc->opacity) {
+                case VBI_TRANSPARENT_SPACE:
+                    memset(pixel, VBI_TRANSPARENT_BLACK, BITMAP_CHAR_WIDTH);
+                    break;
+                case VBI_OPAQUE:
+                    if (!ctx->transparent_bg)
+                        break;
+                case VBI_SEMI_TRANSPARENT:
+                    if (ctx->opacity > 0) {
+                        if (ctx->opacity < 255)
+                            for(; pixel < pixelnext; pixel++)
+                                if (*pixel == vc->background)
+                                    *pixel += VBI_NB_COLORS;
+                        break;
+                    }
+                case VBI_TRANSPARENT_FULL:
+                    for(; pixel < pixelnext; pixel++)
+                        if (*pixel == vc->background)
+                            *pixel = VBI_TRANSPARENT_BLACK;
+                    break;
+            }
+            pixel = pixelnext;
+        }
+    }
+}
+
+/* Draw a page as bitmap */
+static int gen_sub_bitmap(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    int resx = page->columns * BITMAP_CHAR_WIDTH;
+    int resy = (page->rows - chop_top) * BITMAP_CHAR_HEIGHT;
+    uint8_t ci;
+    vbi_char *vc = page->text + (chop_top * page->columns);
+    vbi_char *vcend = page->text + (page->rows * page->columns);
+
+    for (; vc < vcend; vc++) {
+        if (vc->opacity != VBI_TRANSPARENT_SPACE)
+            break;
+    }
+
+    if (vc >= vcend) {
+        av_log(ctx, AV_LOG_DEBUG, "dropping empty page %3x\n", page->pgno);
+        sub_rect->type = SUBTITLE_NONE;
+        return 0;
+    }
+
+    sub_rect->data[0] = av_mallocz(resx * resy);
+    sub_rect->linesize[0] = resx;
+    if (!sub_rect->data[0])
+        return AVERROR(ENOMEM);
+
+    vbi_draw_vt_page_region(page, VBI_PIXFMT_PAL8,
+                            sub_rect->data[0], sub_rect->linesize[0],
+                            0, chop_top, page->columns, page->rows - chop_top,
+                            /*reveal*/ 1, /*flash*/ 1);
+
+    fix_transparency(ctx, sub_rect, page, chop_top, resx, resy);
+    sub_rect->x = ctx->x_offset;
+    sub_rect->y = ctx->y_offset + chop_top * BITMAP_CHAR_HEIGHT;
+    sub_rect->w = resx;
+    sub_rect->h = resy;
+    sub_rect->nb_colors = ctx->opacity > 0 && ctx->opacity < 255 ? 2 * VBI_NB_COLORS : VBI_NB_COLORS;
+    sub_rect->data[1] = av_mallocz(AVPALETTE_SIZE);
+    if (!sub_rect->data[1]) {
+        av_freep(&sub_rect->data[0]);
+        return AVERROR(ENOMEM);
+    }
+    for (ci = 0; ci < VBI_NB_COLORS; ci++) {
+        int r, g, b, a;
+
+        r = VBI_R(page->color_map[ci]);
+        g = VBI_G(page->color_map[ci]);
+        b = VBI_B(page->color_map[ci]);
+        a = VBI_A(page->color_map[ci]);
+        ((uint32_t *)sub_rect->data[1])[ci] = RGBA(r, g, b, a);
+        ((uint32_t *)sub_rect->data[1])[ci + VBI_NB_COLORS] = RGBA(r, g, b, ctx->opacity);
+        ff_dlog(ctx, "palette %0x\n", ((uint32_t *)sub_rect->data[1])[ci]);
+    }
+    ((uint32_t *)sub_rect->data[1])[VBI_TRANSPARENT_BLACK] = RGBA(0, 0, 0, 0);
+    ((uint32_t *)sub_rect->data[1])[VBI_TRANSPARENT_BLACK + VBI_NB_COLORS] = RGBA(0, 0, 0, 0);
+    sub_rect->type = SUBTITLE_BITMAP;
+    return 0;
+}
+
+static void handler(vbi_event *ev, void *user_data)
+{
+    TeletextContext *ctx = user_data;
+    TeletextPage *new_pages;
+    vbi_page page;
+    int res;
+    char pgno_str[12];
+    int chop_top;
+    int is_subtitle_page = ctx->subtitle_map[ev->ev.ttx_page.pgno & 0x7ff];
+
+    snprintf(pgno_str, sizeof pgno_str, "%03x", ev->ev.ttx_page.pgno);
+    av_log(ctx, AV_LOG_DEBUG, "decoded page %s.%02x\n",
+           pgno_str, ev->ev.ttx_page.subno & 0xFF);
+
+    if (strcmp(ctx->pgno, "*") && (strcmp(ctx->pgno, "subtitle") || !is_subtitle_page) && !strstr(ctx->pgno, pgno_str))
+        return;
+    if (ctx->handler_ret < 0)
+        return;
+
+    res = vbi_fetch_vt_page(ctx->vbi, &page,
+                            ev->ev.ttx_page.pgno,
+                            ev->ev.ttx_page.subno,
+                            VBI_WST_LEVEL_3p5, 25, TRUE);
+
+    if (!res)
+        return;
+
+    chop_top = ctx->chop_top || ((page.rows > 1) && is_subtitle_page);
+
+    av_log(ctx, AV_LOG_DEBUG, "%d x %d page chop:%d\n",
+           page.columns, page.rows, chop_top);
+
+    if (ctx->nb_pages < MAX_BUFFERED_PAGES) {
+        if ((new_pages = av_realloc_array(ctx->pages, ctx->nb_pages + 1, sizeof(TeletextPage)))) {
+            TeletextPage *cur_page = new_pages + ctx->nb_pages;
+            ctx->pages = new_pages;
+            cur_page->sub_rect = av_mallocz(sizeof(*cur_page->sub_rect));
+            cur_page->pts = ctx->pts;
+            cur_page->pgno = ev->ev.ttx_page.pgno;
+            cur_page->subno = ev->ev.ttx_page.subno;
+            if (cur_page->sub_rect) {
+                switch (ctx->format_id) {
+                    case 0:
+                        res = gen_sub_bitmap(ctx, cur_page->sub_rect, &page, chop_top);
+                        break;
+                    case 1:
+                        res = gen_sub_text(ctx, cur_page->sub_rect, &page, chop_top);
+                        break;
+                    case 2:
+                        res = gen_sub_ass(ctx, cur_page->sub_rect, &page, chop_top);
+                        break;
+                    default:
+                        res = AVERROR_BUG;
+                        break;
+                }
+                if (res < 0) {
+                    av_freep(&cur_page->sub_rect);
+                    ctx->handler_ret = res;
+                } else {
+                    ctx->pages[ctx->nb_pages++] = *cur_page;
+                }
+            } else {
+                ctx->handler_ret = AVERROR(ENOMEM);
+            }
+        } else {
+            ctx->handler_ret = AVERROR(ENOMEM);
+        }
+    } else {
+        //TODO: If multiple packets contain more than one page, pages may got queued up, and this may happen...
+        av_log(ctx, AV_LOG_ERROR, "Buffered too many pages, dropping page %s.\n", pgno_str);
+        ctx->handler_ret = AVERROR(ENOSYS);
+    }
+
+    vbi_unref_page(&page);
+}
+
+static int slice_to_vbi_lines(TeletextContext *ctx, uint8_t* buf, int size)
+{
+    int lines = 0;
+    while (size >= 2 && lines < MAX_SLICES) {
+        int data_unit_id     = buf[0];
+        int data_unit_length = buf[1];
+        if (data_unit_length + 2 > size)
+            return AVERROR_INVALIDDATA;
+        if (ff_data_unit_id_is_teletext(data_unit_id)) {
+            if (data_unit_length != 0x2c)
+                return AVERROR_INVALIDDATA;
+            else {
+                int line_offset  = buf[2] & 0x1f;
+                int field_parity = buf[2] & 0x20;
+                uint8_t *p = ctx->sliced[lines].data;
+                int i, pmag;
+                ctx->sliced[lines].id = VBI_SLICED_TELETEXT_B;
+                ctx->sliced[lines].line = (line_offset > 0 ? (line_offset + (field_parity ? 0 : 313)) : 0);
+                for (i = 0; i < 42; i++)
+                    p[i] = vbi_rev8(buf[4 + i]);
+                /* Unfortunately libzvbi does not expose page flags, and
+                 * vbi_classify_page only checks MIP, so we have to manually
+                 * decode the page flags and store the results. */
+                pmag = vbi_unham16p(p);
+                if (pmag >= 0 && pmag >> 3 == 0) {   // We found a row 0 header
+                    int page = vbi_unham16p(p + 2);
+                    int flags1 = vbi_unham16p(p + 6);
+                    int flags2 = vbi_unham16p(p + 8);
+                    if (page >= 0 && flags1 >= 0 && flags2 >= 0) {
+                        int pgno = ((pmag & 7) << 8) + page;
+                        // Check for disabled NEWSFLASH flag and enabled SUBTITLE and SUPRESS_HEADER flags
+                        ctx->subtitle_map[pgno] = (!(flags1 & 0x40) && flags1 & 0x80 && flags2 & 0x01);
+                        // Propagate ERASE_PAGE flag for repeated page headers to work around a libzvbi bug
+                        if (ctx->subtitle_map[pgno] && pgno == ctx->last_pgno) {
+                            int last_byte9 = vbi_unham8(ctx->last_p5);
+                            if (last_byte9 >= 0 && last_byte9 & 0x8) {
+                                int byte9 = vbi_unham8(p[5]);
+                                if (byte9 >= 0)
+                                    p[5] = vbi_ham8(byte9 | 0x8);
+                            }
+                        }
+                        ctx->last_pgno = pgno;
+                        ctx->last_p5 = p[5];
+                    }
+                }
+                lines++;
+            }
+        }
+        size -= data_unit_length + 2;
+        buf += data_unit_length + 2;
+    }
+    if (size)
+        av_log(ctx, AV_LOG_WARNING, "%d bytes remained after slicing data\n", size);
+    return lines;
+}
+
+static int teletext_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *pkt)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    AVSubtitle      *sub = data;
+    int             ret = 0;
+    int j;
+
+    if (!ctx->vbi) {
+        if (!(ctx->vbi = vbi_decoder_new()))
+            return AVERROR(ENOMEM);
+        if (!vbi_event_handler_register(ctx->vbi, VBI_EVENT_TTX_PAGE, handler, ctx)) {
+            vbi_decoder_delete(ctx->vbi);
+            ctx->vbi = NULL;
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    if (avctx->pkt_timebase.num && pkt->pts != AV_NOPTS_VALUE)
+        ctx->pts = av_rescale_q(pkt->pts, avctx->pkt_timebase, AV_TIME_BASE_Q);
+
+    if (pkt->size) {
+        int lines;
+        const int full_pes_size = pkt->size + 45; /* PES header is 45 bytes */
+
+        // We allow unreasonably big packets, even if the standard only allows a max size of 1472
+        if (full_pes_size < 184 || full_pes_size > 65504 || full_pes_size % 184 != 0)
+            return AVERROR_INVALIDDATA;
+
+        ctx->handler_ret = pkt->size;
+
+        if (ff_data_identifier_is_teletext(*pkt->data)) {
+            if ((lines = slice_to_vbi_lines(ctx, pkt->data + 1, pkt->size - 1)) < 0)
+                return lines;
+            ff_dlog(avctx, "ctx=%p buf_size=%d lines=%u pkt_pts=%7.3f\n",
+                    ctx, pkt->size, lines, (double)pkt->pts/90000.0);
+            if (lines > 0) {
+                vbi_decode(ctx->vbi, ctx->sliced, lines, 0.0);
+                ctx->lines_processed += lines;
+            }
+        }
+        ctx->pts = AV_NOPTS_VALUE;
+        ret = ctx->handler_ret;
+    }
+
+    if (ret < 0)
+        return ret;
+
+    // is there a subtitle to pass?
+    if (ctx->nb_pages) {
+        int i;
+        sub->format = !!ctx->format_id;
+        sub->start_display_time = 0;
+        sub->end_display_time = ctx->sub_duration;
+        sub->num_rects = 0;
+        sub->pts = ctx->pages->pts;
+
+        if (ctx->pages->sub_rect->type != SUBTITLE_NONE) {
+            sub->rects = av_malloc(sizeof(*sub->rects));
+            if (sub->rects) {
+                sub->num_rects = 1;
+                sub->rects[0] = ctx->pages->sub_rect;
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+                for (j = 0; j < 4; j++) {
+                    sub->rects[0]->pict.data[j] = sub->rects[0]->data[j];
+                    sub->rects[0]->pict.linesize[j] = sub->rects[0]->linesize[j];
+                }
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+            } else {
+                ret = AVERROR(ENOMEM);
+            }
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "sending empty sub\n");
+            sub->rects = NULL;
+        }
+        if (!sub->rects) // no rect was passed
+            subtitle_rect_free(&ctx->pages->sub_rect);
+
+        for (i = 0; i < ctx->nb_pages - 1; i++)
+            ctx->pages[i] = ctx->pages[i + 1];
+        ctx->nb_pages--;
+
+        if (ret >= 0)
+            *data_size = 1;
+    } else
+        *data_size = 0;
+
+    return ret;
+}
+
+static int teletext_init_decoder(AVCodecContext *avctx)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    unsigned int maj, min, rev;
+
+    vbi_version(&maj, &min, &rev);
+    if (!(maj > 0 || min > 2 || min == 2 && rev >= 26)) {
+        av_log(avctx, AV_LOG_ERROR, "decoder needs zvbi version >= 0.2.26.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    if (ctx->format_id == 0) {
+        avctx->width  = 41 * BITMAP_CHAR_WIDTH;
+        avctx->height = 25 * BITMAP_CHAR_HEIGHT;
+    }
+
+    ctx->vbi = NULL;
+    ctx->pts = AV_NOPTS_VALUE;
+    ctx->last_pgno = -1;
+    ctx->last_ass_alignment = 2;
+
+    if (ctx->opacity == -1)
+        ctx->opacity = ctx->transparent_bg ? 0 : 255;
+
+    av_log(avctx, AV_LOG_VERBOSE, "page filter: %s\n", ctx->pgno);
+
+    switch (ctx->format_id) {
+        case 0:
+            return 0;
+        case 1:
+            return ff_ass_subtitle_header_default(avctx);
+        case 2:
+            return my_ass_subtitle_header(avctx);
+    }
+    return AVERROR_BUG;
+}
+
+static int teletext_close_decoder(AVCodecContext *avctx)
+{
+    TeletextContext *ctx = avctx->priv_data;
+
+    ff_dlog(avctx, "lines_total=%u\n", ctx->lines_processed);
+    while (ctx->nb_pages)
+        subtitle_rect_free(&ctx->pages[--ctx->nb_pages].sub_rect);
+    av_freep(&ctx->pages);
+
+    vbi_decoder_delete(ctx->vbi);
+    ctx->vbi = NULL;
+    ctx->pts = AV_NOPTS_VALUE;
+    ctx->last_pgno = -1;
+    ctx->last_ass_alignment = 2;
+    memset(ctx->subtitle_map, 0, sizeof(ctx->subtitle_map));
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        ctx->readorder = 0;
+    return 0;
+}
+
+static void teletext_flush(AVCodecContext *avctx)
+{
+    teletext_close_decoder(avctx);
+}
+
+#define OFFSET(x) offsetof(TeletextContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    {"txt_page",        "page numbers to decode, subtitle for subtitles, * for all", OFFSET(pgno),   AV_OPT_TYPE_STRING, {.str = "*"},      0, 0,        SD},
+    {"txt_chop_top",    "discards the top teletext line",                    OFFSET(chop_top),       AV_OPT_TYPE_INT,    {.i64 = 1},        0, 1,        SD},
+    {"txt_format",      "format of the subtitles (bitmap or text or ass)",   OFFSET(format_id),      AV_OPT_TYPE_INT,    {.i64 = 0},        0, 2,        SD,  "txt_format"},
+    {"bitmap",          NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 0},        0, 0,        SD,  "txt_format"},
+    {"text",            NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 1},        0, 0,        SD,  "txt_format"},
+    {"ass",             NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 2},        0, 0,        SD,  "txt_format"},
+    {"txt_left",        "x offset of generated bitmaps",                     OFFSET(x_offset),       AV_OPT_TYPE_INT,    {.i64 = 0},        0, 65535,    SD},
+    {"txt_top",         "y offset of generated bitmaps",                     OFFSET(y_offset),       AV_OPT_TYPE_INT,    {.i64 = 0},        0, 65535,    SD},
+    {"txt_chop_spaces", "chops leading and trailing spaces from text",       OFFSET(chop_spaces),    AV_OPT_TYPE_INT,    {.i64 = 1},        0, 1,        SD},
+    {"txt_duration",    "display duration of teletext pages in msecs",       OFFSET(sub_duration),   AV_OPT_TYPE_INT,    {.i64 = -1},      -1, 86400000, SD},
+    {"txt_transparent", "force transparent background of the teletext",      OFFSET(transparent_bg), AV_OPT_TYPE_INT,    {.i64 = 0},        0, 1,        SD},
+    {"txt_opacity",     "set opacity of the transparent background",         OFFSET(opacity),        AV_OPT_TYPE_INT,    {.i64 = -1},      -1, 255,      SD},
+    { NULL },
+};
+
+static const AVClass teletext_class = {
+    .class_name = "libzvbi_teletextdec",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libzvbi_teletext_decoder = {
+    .name      = "libzvbi_teletextdec",
+    .long_name = NULL_IF_CONFIG_SMALL("Libzvbi DVB teletext decoder"),
+    .type      = AVMEDIA_TYPE_SUBTITLE,
+    .id        = AV_CODEC_ID_DVB_TELETEXT,
+    .priv_data_size = sizeof(TeletextContext),
+    .init      = teletext_init_decoder,
+    .close     = teletext_close_decoder,
+    .decode    = teletext_decode_frame,
+    .capabilities = AV_CODEC_CAP_DELAY,
+    .flush     = teletext_flush,
+    .priv_class= &teletext_class,
+    .wrapper_name = "libzvbi",
+};
diff --git a/libavcodec/ljpegenc.c b/libavcodec/ljpegenc.c
index b6d73a4..924d2e2 100644
--- a/libavcodec/ljpegenc.c
+++ b/libavcodec/ljpegenc.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,6 @@
 #include "jpegtables.h"
 #include "mjpegenc_common.h"
 #include "mjpeg.h"
-#include "mjpegenc.h"
 
 typedef struct LJpegEncContext {
     AVClass *class;
@@ -48,8 +47,8 @@ typedef struct LJpegEncContext {
     ScanTable scantable;
     uint16_t matrix[64];
 
-    int vsample[3];
-    int hsample[3];
+    int vsample[4];
+    int hsample[4];
 
     uint16_t huff_code_dc_luminance[12];
     uint16_t huff_code_dc_chrominance[12];
@@ -68,7 +67,7 @@ static int ljpeg_encode_bgr(AVCodecContext *avctx, PutBitContext *pb,
     const int height      = frame->height;
     const int linesize    = frame->linesize[0];
     uint16_t (*buffer)[4] = s->scratch;
-    int left[3], top[3], topleft[3];
+    int left[4], top[4], topleft[4];
     int x, y, i;
 
 #if FF_API_PRIVATE_OPT
@@ -78,27 +77,35 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    for (i = 0; i < 3; i++)
+    for (i = 0; i < 4; i++)
         buffer[0][i] = 1 << (9 - 1);
 
     for (y = 0; y < height; y++) {
         const int modified_predictor = y ? s->pred : 1;
         uint8_t *ptr = frame->data[0] + (linesize * y);
 
-        if (pb->buf_end - pb->buf - (put_bits_count(pb) >> 3) < width * 3 * 3) {
+        if (pb->buf_end - pb->buf - (put_bits_count(pb) >> 3) < width * 4 * 4) {
             av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
             return -1;
         }
 
-        for (i = 0; i < 3; i++)
+        for (i = 0; i < 4; i++)
             top[i]= left[i]= topleft[i]= buffer[0][i];
 
         for (x = 0; x < width; x++) {
-            buffer[x][1] =  ptr[3 * x + 0] -     ptr[3 * x + 1] + 0x100;
-            buffer[x][2] =  ptr[3 * x + 2] -     ptr[3 * x + 1] + 0x100;
-            buffer[x][0] = (ptr[3 * x + 0] + 2 * ptr[3 * x + 1] + ptr[3 * x + 2]) >> 2;
+            if(avctx->pix_fmt == AV_PIX_FMT_BGR24){
+                buffer[x][1] =  ptr[3 * x + 0] -     ptr[3 * x + 1] + 0x100;
+                buffer[x][2] =  ptr[3 * x + 2] -     ptr[3 * x + 1] + 0x100;
+                buffer[x][0] = (ptr[3 * x + 0] + 2 * ptr[3 * x + 1] + ptr[3 * x + 2]) >> 2;
+            }else{
+                buffer[x][1] =  ptr[4 * x + 0] -     ptr[4 * x + 1] + 0x100;
+                buffer[x][2] =  ptr[4 * x + 2] -     ptr[4 * x + 1] + 0x100;
+                buffer[x][0] = (ptr[4 * x + 0] + 2 * ptr[4 * x + 1] + ptr[4 * x + 2]) >> 2;
+                if (avctx->pix_fmt == AV_PIX_FMT_BGRA)
+                    buffer[x][3] =  ptr[4 * x + 3];
+            }
 
-            for (i = 0; i < 3; i++) {
+            for (i = 0; i < 3 + (avctx->pix_fmt == AV_PIX_FMT_BGRA); i++) {
                 int pred, diff;
 
                 PREDICT(pred, topleft[i], top[i], left[i], modified_predictor);
@@ -110,7 +117,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
                 diff       = ((left[i] - pred + 0x100) & 0x1FF) - 0x100;
 
-                if (i == 0)
+                if (i == 0 || i == 3)
                     ff_mjpeg_encode_dc(pb, diff, s->huff_size_dc_luminance, s->huff_code_dc_luminance); //FIXME ugly
                 else
                     ff_mjpeg_encode_dc(pb, diff, s->huff_size_dc_chrominance, s->huff_code_dc_chrominance);
@@ -227,25 +234,29 @@ static int ljpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int max_pkt_size = AV_INPUT_BUFFER_MIN_SIZE;
     int ret, header_bits;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_BGR24)
-        max_pkt_size += width * height * 3 * 3;
+    if(    avctx->pix_fmt == AV_PIX_FMT_BGR0
+        || avctx->pix_fmt == AV_PIX_FMT_BGR24)
+        max_pkt_size += width * height * 3 * 4;
+    else if(avctx->pix_fmt == AV_PIX_FMT_BGRA)
+        max_pkt_size += width * height * 4 * 4;
     else {
         max_pkt_size += mb_width * mb_height * 3 * 4
                         * s->hsample[0] * s->vsample[0];
     }
-    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_pkt_size, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&pb, pkt->data, pkt->size);
 
     ff_mjpeg_encode_picture_header(avctx, &pb, &s->scantable,
-                                   s->pred, s->matrix);
+                                   s->pred, s->matrix, s->matrix);
 
     header_bits = put_bits_count(&pb);
 
-    if (avctx->pix_fmt == AV_PIX_FMT_BGR24)
+    if(    avctx->pix_fmt == AV_PIX_FMT_BGR0
+        || avctx->pix_fmt == AV_PIX_FMT_BGRA
+        || avctx->pix_fmt == AV_PIX_FMT_BGR24)
         ret = ljpeg_encode_bgr(avctx, &pb, pict);
     else
         ret = ljpeg_encode_yuv(avctx, &pb, pict);
@@ -254,6 +265,7 @@ static int ljpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     emms_c();
 
+    ff_mjpeg_escape_FF(&pb, header_bits >> 3);
     ff_mjpeg_encode_picture_trailer(&pb, header_bits);
 
     flush_put_bits(&pb);
@@ -276,7 +288,6 @@ static av_cold int ljpeg_encode_close(AVCodecContext *avctx)
 static av_cold int ljpeg_encode_init(AVCodecContext *avctx)
 {
     LJpegEncContext *s = avctx->priv_data;
-    int chroma_v_shift, chroma_h_shift;
 
     if ((avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
          avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
@@ -297,26 +308,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     s->scratch = av_malloc_array(avctx->width + 1, sizeof(*s->scratch));
+    if (!s->scratch)
+        goto fail;
 
     ff_idctdsp_init(&s->idsp, avctx);
     ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
                       ff_zigzag_direct);
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
-                                     &chroma_v_shift);
-
-    if (avctx->pix_fmt   == AV_PIX_FMT_BGR24) {
-        s->vsample[0] = s->hsample[0] =
-        s->vsample[1] = s->hsample[1] =
-        s->vsample[2] = s->hsample[2] = 1;
-    } else {
-        s->vsample[0] = 2;
-        s->vsample[1] = 2 >> chroma_v_shift;
-        s->vsample[2] = 2 >> chroma_v_shift;
-        s->hsample[0] = 2;
-        s->hsample[1] = 2 >> chroma_h_shift;
-        s->hsample[2] = 2 >> chroma_h_shift;
-    }
+    ff_mjpeg_init_hvsample(avctx, s->hsample, s->vsample);
 
     ff_mjpeg_build_huffman_codes(s->huff_size_dc_luminance,
                                  s->huff_code_dc_luminance,
@@ -328,6 +327,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                  avpriv_mjpeg_val_dc);
 
     return 0;
+fail:
+    ljpeg_encode_close(avctx);
+    return AVERROR(ENOMEM);
 }
 
 #define OFFSET(x) offsetof(LJpegEncContext, x)
@@ -358,12 +360,10 @@ AVCodec ff_ljpeg_encoder = {
     .init           = ljpeg_encode_init,
     .encode2        = ljpeg_encode_frame,
     .close          = ljpeg_encode_close,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVJ420P,
-                                                    AV_PIX_FMT_YUVJ422P,
-                                                    AV_PIX_FMT_YUVJ444P,
-                                                    AV_PIX_FMT_BGR24,
-                                                    AV_PIX_FMT_YUV420P,
-                                                    AV_PIX_FMT_YUV422P,
-                                                    AV_PIX_FMT_YUV444P,
-                                                    AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_BGR24   , AV_PIX_FMT_BGRA    , AV_PIX_FMT_BGR0,
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
+        AV_PIX_FMT_YUV420P , AV_PIX_FMT_YUV444P , AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_NONE},
 };
diff --git a/libavcodec/loco.c b/libavcodec/loco.c
index fa4c5ed..741db3b 100644
--- a/libavcodec/loco.c
+++ b/libavcodec/loco.c
@@ -2,20 +2,20 @@
  * LOCO codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "mathops.h"
@@ -46,11 +46,11 @@ enum LOCO_MODE {
 typedef struct LOCOContext {
     AVCodecContext *avctx;
     int lossy;
-    int mode;
+    enum LOCO_MODE mode;
 } LOCOContext;
 
 typedef struct RICEContext {
-    BitstreamContext bc;
+    GetBitContext gb;
     int save, run, run2; /* internal rice decoder state */
     int sum, count; /* sum and count for getting rice parameter */
     int lossy;
@@ -88,11 +88,11 @@ static inline int loco_get_rice(RICEContext *r)
         loco_update_rice_param(r, 0);
         return 0;
     }
-    v = get_ur_golomb_jpegls(&r->bc, loco_get_rice_param(r), INT_MAX, 0);
+    v = get_ur_golomb_jpegls(&r->gb, loco_get_rice_param(r), INT_MAX, 0);
     loco_update_rice_param(r, (v + 1) >> 1);
     if (!v) {
         if (r->save >= 0) {
-            r->run = get_ur_golomb_jpegls(&r->bc, 2, INT_MAX, 0);
+            r->run = get_ur_golomb_jpegls(&r->gb, 2, INT_MAX, 0);
             if (r->run > 1)
                 r->save += r->run + 1;
             else
@@ -114,25 +114,31 @@ static inline int loco_get_rice(RICEContext *r)
 }
 
 /* LOCO main predictor - LOCO-I/JPEG-LS predictor */
-static inline int loco_predict(uint8_t* data, int stride, int step)
+static inline int loco_predict(uint8_t* data, int stride)
 {
     int a, b, c;
 
     a = data[-stride];
-    b = data[-step];
-    c = data[-stride - step];
+    b = data[-1];
+    c = data[-stride - 1];
 
     return mid_pred(a, a + b - c, b);
 }
 
 static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int height,
-                             int stride, const uint8_t *buf, int buf_size, int step)
+                             int stride, const uint8_t *buf, int buf_size)
 {
     RICEContext rc;
     int val;
+    int ret;
     int i, j;
 
-    bitstream_init8(&rc.bc, buf, buf_size);
+    if(buf_size<=0)
+        return -1;
+
+    if ((ret = init_get_bits8(&rc.gb, buf, buf_size)) < 0)
+        return ret;
+
     rc.save  = 0;
     rc.run   = 0;
     rc.run2  = 0;
@@ -147,7 +153,7 @@ static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int heigh
     /* restore top line */
     for (i = 1; i < width; i++) {
         val = loco_get_rice(&rc);
-        data[i * step] = data[i * step - step] + val;
+        data[i] = data[i - 1] + val;
     }
     data += stride;
     for (j = 1; j < height; j++) {
@@ -157,12 +163,28 @@ static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int heigh
         /* restore all other pixels */
         for (i = 1; i < width; i++) {
             val = loco_get_rice(&rc);
-            data[i * step] = loco_predict(&data[i * step], stride, step) + val;
+            data[i] = loco_predict(&data[i], stride) + val;
         }
         data += stride;
     }
 
-    return (bitstream_tell(&rc.bc) + 7) >> 3;
+    return (get_bits_count(&rc.gb) + 7) >> 3;
+}
+
+static void rotate_faulty_loco(uint8_t *data, int width, int height, int stride)
+{
+    int y;
+
+    for (y=1; y<height; y++) {
+        if (width>=y) {
+            memmove(data + y*stride,
+                    data + y*(stride + 1),
+                    (width-y));
+            if (y+1 < height)
+                memmove(data + y*stride + (width-y),
+                        data + (y+1)*stride, y);
+        }
+    }
 }
 
 static int decode_frame(AVCodecContext *avctx,
@@ -175,88 +197,75 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p     = data;
     int decoded, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->key_frame = 1;
 
+#define ADVANCE_BY_DECODED do { \
+    if (decoded < 0 || decoded >= buf_size) goto buf_too_small; \
+    buf += decoded; buf_size -= decoded; \
+} while(0)
     switch(l->mode) {
     case LOCO_CYUY2: case LOCO_YUY2: case LOCO_UYVY:
         decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+                                    p->linesize[0], buf, buf_size);
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height,
-                                    p->linesize[1], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+                                    p->linesize[1], buf, buf_size);
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height,
-                                    p->linesize[2], buf, buf_size, 1);
+                                    p->linesize[2], buf, buf_size);
         break;
     case LOCO_CYV12: case LOCO_YV12:
         decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+                                    p->linesize[0], buf, buf_size);
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height / 2,
-                                    p->linesize[2], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+                                    p->linesize[2], buf, buf_size);
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height / 2,
-                                    p->linesize[1], buf, buf_size, 1);
+                                    p->linesize[1], buf, buf_size);
         break;
     case LOCO_CRGB: case LOCO_RGB:
+        decoded = loco_decode_plane(l, p->data[1] + p->linesize[1]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[1], buf, buf_size);
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
-                                    -p->linesize[0], buf, buf_size, 3);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 1, avctx->width, avctx->height,
-                                    -p->linesize[0], buf, buf_size, 3);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 2, avctx->width, avctx->height,
-                                    -p->linesize[0], buf, buf_size, 3);
+                                    -p->linesize[0], buf, buf_size);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[2] + p->linesize[2]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[2], buf, buf_size);
+        if (avctx->width & 1) {
+            rotate_faulty_loco(p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height, -p->linesize[0]);
+            rotate_faulty_loco(p->data[1] + p->linesize[1]*(avctx->height-1), avctx->width, avctx->height, -p->linesize[1]);
+            rotate_faulty_loco(p->data[2] + p->linesize[2]*(avctx->height-1), avctx->width, avctx->height, -p->linesize[2]);
+        }
         break;
+    case LOCO_CRGBA:
     case LOCO_RGBA:
-        decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 1, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 2, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 3, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
+        decoded = loco_decode_plane(l, p->data[1] + p->linesize[1]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[1], buf, buf_size);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[2] + p->linesize[2]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[2], buf, buf_size);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[3] + p->linesize[3]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[3], buf, buf_size);
         break;
+    default:
+        av_assert0(0);
     }
 
+    if (decoded < 0 || decoded > buf_size)
+        goto buf_too_small;
+    buf_size -= decoded;
+
     *got_frame      = 1;
 
-    return buf_size;
+    return avpkt->size - buf_size;
 buf_too_small:
     av_log(avctx, AV_LOG_ERROR, "Input data too small.\n");
     return AVERROR(EINVAL);
@@ -295,7 +304,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case LOCO_CRGB:
     case LOCO_RGB:
-        avctx->pix_fmt = AV_PIX_FMT_BGR24;
+        avctx->pix_fmt = AV_PIX_FMT_GBRP;
         break;
     case LOCO_CYV12:
     case LOCO_YV12:
@@ -303,7 +312,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case LOCO_CRGBA:
     case LOCO_RGBA:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "Unknown colorspace, index = %i\n", l->mode);
diff --git a/libavcodec/lossless_audiodsp.c b/libavcodec/lossless_audiodsp.c
new file mode 100644
index 0000000..3a9f9b2
--- /dev/null
+++ b/libavcodec/lossless_audiodsp.c
@@ -0,0 +1,67 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ *  based upon libdemac from Dave Chapman.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "lossless_audiodsp.h"
+
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    do {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    } while (order-=2);
+    return res;
+}
+
+static int32_t scalarproduct_and_madd_int32_c(int16_t *v1, const int32_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    do {
+        res   += *v1 * (uint32_t)*v2++;
+        *v1++ += mul * *v3++;
+        res   += *v1 * (uint32_t)*v2++;
+        *v1++ += mul * *v3++;
+    } while (order-=2);
+    return res;
+}
+
+av_cold void ff_llauddsp_init(LLAudDSPContext *c)
+{
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+    c->scalarproduct_and_madd_int32 = scalarproduct_and_madd_int32_c;
+
+    if (ARCH_ARM)
+        ff_llauddsp_init_arm(c);
+    if (ARCH_PPC)
+        ff_llauddsp_init_ppc(c);
+    if (ARCH_X86)
+        ff_llauddsp_init_x86(c);
+}
diff --git a/libavcodec/apedsp.h b/libavcodec/lossless_audiodsp.h
index 64e2749..eea5d49 100644
--- a/libavcodec/apedsp.h
+++ b/libavcodec/lossless_audiodsp.h
@@ -3,42 +3,49 @@
  * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_APEDSP_H
-#define AVCODEC_APEDSP_H
+#ifndef AVCODEC_LOSSLESS_AUDIODSP_H
+#define AVCODEC_LOSSLESS_AUDIODSP_H
 
 #include <stdint.h>
 
-typedef struct APEDSPContext {
+typedef struct LLAudDSPContext {
     /**
      * Calculate scalar product of v1 and v2,
      * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
+     * @param len length of vectors, should be multiple of 16,
+     *            or padd v3 and v1 or v2 with zeros.
      */
     int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
                                             const int16_t *v2,
                                             const int16_t *v3,
                                             int len, int mul);
-} APEDSPContext;
 
-void ff_apedsp_init_arm(APEDSPContext *c);
-void ff_apedsp_init_ppc(APEDSPContext *c);
-void ff_apedsp_init_x86(APEDSPContext *c);
+    int32_t (*scalarproduct_and_madd_int32)(int16_t *v1 /* align 16 */,
+                                            const int32_t *v2,
+                                            const int16_t *v3,
+                                            int len, int mul);
+} LLAudDSPContext;
+
+void ff_llauddsp_init(LLAudDSPContext *c);
+void ff_llauddsp_init_arm(LLAudDSPContext *c);
+void ff_llauddsp_init_ppc(LLAudDSPContext *c);
+void ff_llauddsp_init_x86(LLAudDSPContext *c);
 
-#endif /* AVCODEC_APEDSP_H */
+#endif /* AVCODEC_LOSSLESS_AUDIODSP_H */
diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
new file mode 100644
index 0000000..cff94c2
--- /dev/null
+++ b/libavcodec/lossless_videodsp.c
@@ -0,0 +1,125 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "lossless_videodsp.h"
+#include "libavcodec/mathops.h"
+
+// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
+#define pb_7f (~0UL / 255 * 0x7f)
+#define pb_80 (~0UL / 255 * 0x80)
+
+static void add_bytes_c(uint8_t *dst, uint8_t *src, ptrdiff_t w)
+{
+    long i;
+
+    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
+        long a = *(long *) (src + i);
+        long b = *(long *) (dst + i);
+        *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
+    }
+    for (; i < w; i++)
+        dst[i + 0] += src[i + 0];
+}
+
+static void add_median_pred_c(uint8_t *dst, const uint8_t *src1,
+                              const uint8_t *diff, ptrdiff_t w,
+                              int *left, int *left_top)
+{
+    int i;
+    uint8_t l, lt;
+
+    l  = *left;
+    lt = *left_top;
+
+    for (i = 0; i < w; i++) {
+        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
+        lt     = src1[i];
+        dst[i] = l;
+    }
+
+    *left     = l;
+    *left_top = lt;
+}
+
+static int add_left_pred_c(uint8_t *dst, const uint8_t *src, ptrdiff_t w,
+                           int acc)
+{
+    int i;
+
+    for (i = 0; i < w - 1; i++) {
+        acc   += src[i];
+        dst[i] = acc;
+        i++;
+        acc   += src[i];
+        dst[i] = acc;
+    }
+
+    for (; i < w; i++) {
+        acc   += src[i];
+        dst[i] = acc;
+    }
+
+    return acc;
+}
+
+static int add_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc){
+    int i;
+
+    for(i=0; i<w-1; i++){
+        acc+= src[i];
+        dst[i]= acc &= mask;
+        i++;
+        acc+= src[i];
+        dst[i]= acc &= mask;
+    }
+
+    for(; i<w; i++){
+        acc+= src[i];
+        dst[i]= acc &= mask;
+    }
+
+    return acc;
+}
+
+static void add_gradient_pred_c(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width){
+    int A, B, C, i;
+
+    for (i = 0; i < width; i++) {
+        A = src[i - stride];
+        B = src[i - (stride + 1)];
+        C = src[i - 1];
+        src[i] = (A - B + C + src[i]) & 0xFF;
+    }
+}
+
+void ff_llviddsp_init(LLVidDSPContext *c)
+{
+    c->add_bytes                  = add_bytes_c;
+    c->add_median_pred            = add_median_pred_c;
+    c->add_left_pred              = add_left_pred_c;
+
+    c->add_left_pred_int16        = add_left_pred_int16_c;
+    c->add_gradient_pred          = add_gradient_pred_c;
+
+    if (ARCH_PPC)
+        ff_llviddsp_init_ppc(c);
+    if (ARCH_X86)
+        ff_llviddsp_init_x86(c);
+}
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
new file mode 100644
index 0000000..8077898
--- /dev/null
+++ b/libavcodec/lossless_videodsp.h
@@ -0,0 +1,49 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVCODEC_LOSSLESS_VIDEODSP_H
+#define AVCODEC_LOSSLESS_VIDEODSP_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "avcodec.h"
+#include "libavutil/cpu.h"
+
+typedef struct LLVidDSPContext {
+    void (*add_bytes)(uint8_t *dst /* align 32 */, uint8_t *src /* align 32 */,
+                      ptrdiff_t w);
+    void (*add_median_pred)(uint8_t *dst, const uint8_t *top,
+                            const uint8_t *diff, ptrdiff_t w,
+                            int *left, int *left_top);
+    int (*add_left_pred)(uint8_t *dst, const uint8_t *src,
+                         ptrdiff_t w, int left);
+
+    int  (*add_left_pred_int16)(uint16_t *dst, const uint16_t *src,
+                                unsigned mask, ptrdiff_t w, unsigned left);
+    void (*add_gradient_pred)(uint8_t *src /* align 32 */, const ptrdiff_t stride, const ptrdiff_t width);
+} LLVidDSPContext;
+
+void ff_llviddsp_init(LLVidDSPContext *llviddsp);
+void ff_llviddsp_init_x86(LLVidDSPContext *llviddsp);
+void ff_llviddsp_init_ppc(LLVidDSPContext *llviddsp);
+
+#endif //AVCODEC_LOSSLESS_VIDEODSP_H
diff --git a/libavcodec/lossless_videoencdsp.c b/libavcodec/lossless_videoencdsp.c
new file mode 100644
index 0000000..ed70329
--- /dev/null
+++ b/libavcodec/lossless_videoencdsp.c
@@ -0,0 +1,99 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "lossless_videoencdsp.h"
+#include "mathops.h"
+
+// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
+#define pb_7f (~0UL / 255 * 0x7f)
+#define pb_80 (~0UL / 255 * 0x80)
+
+static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w)
+{
+    long i;
+
+#if !HAVE_FAST_UNALIGNED
+    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
+        for (i = 0; i + 7 < w; i += 8) {
+            dst[i + 0] = src1[i + 0] - src2[i + 0];
+            dst[i + 1] = src1[i + 1] - src2[i + 1];
+            dst[i + 2] = src1[i + 2] - src2[i + 2];
+            dst[i + 3] = src1[i + 3] - src2[i + 3];
+            dst[i + 4] = src1[i + 4] - src2[i + 4];
+            dst[i + 5] = src1[i + 5] - src2[i + 5];
+            dst[i + 6] = src1[i + 6] - src2[i + 6];
+            dst[i + 7] = src1[i + 7] - src2[i + 7];
+        }
+    } else
+#endif
+    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
+        long a = *(long *) (src1 + i);
+        long b = *(long *) (src2 + i);
+        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
+                              ((a ^ b ^ pb_80) & pb_80);
+    }
+    for (; i < w; i++)
+        dst[i + 0] = src1[i + 0] - src2[i + 0];
+}
+
+static void sub_median_pred_c(uint8_t *dst, const uint8_t *src1,
+                              const uint8_t *src2, intptr_t w,
+                              int *left, int *left_top)
+{
+    int i;
+    uint8_t l, lt;
+
+    l  = *left;
+    lt = *left_top;
+
+    for (i = 0; i < w; i++) {
+        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
+        lt     = src1[i];
+        l      = src2[i];
+        dst[i] = l - pred;
+    }
+
+    *left     = l;
+    *left_top = lt;
+}
+
+static void sub_left_predict_c(uint8_t *dst, uint8_t *src,
+                               ptrdiff_t stride, ptrdiff_t width, int height)
+{
+    int i, j;
+    uint8_t prev = 0x80; /* Set the initial value */
+    for (j = 0; j < height; j++) {
+        for (i = 0; i < width; i++) {
+            *dst++ = src[i] - prev;
+            prev   = src[i];
+        }
+        src += stride;
+    }
+}
+
+av_cold void ff_llvidencdsp_init(LLVidEncDSPContext *c)
+{
+    c->diff_bytes      = diff_bytes_c;
+    c->sub_median_pred = sub_median_pred_c;
+    c->sub_left_predict = sub_left_predict_c;
+
+    if (ARCH_X86)
+        ff_llvidencdsp_init_x86(c);
+}
diff --git a/libavcodec/lossless_videoencdsp.h b/libavcodec/lossless_videoencdsp.h
new file mode 100644
index 0000000..faa6c32
--- /dev/null
+++ b/libavcodec/lossless_videoencdsp.h
@@ -0,0 +1,46 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOSSLESS_VIDEOENCDSP_H
+#define AVCODEC_LOSSLESS_VIDEOENCDSP_H
+
+#include <stdint.h>
+
+#include "avcodec.h"
+
+typedef struct LLVidEncDSPContext {
+    void (*diff_bytes)(uint8_t *dst /* align 16 */,
+                       const uint8_t *src1 /* align 16 */,
+                       const uint8_t *src2 /* align 1 */,
+                       intptr_t w);
+    /**
+     * Subtract HuffYUV's variant of median prediction.
+     * Note, this might read from src1[-1], src2[-1].
+     */
+    void (*sub_median_pred)(uint8_t *dst, const uint8_t *src1,
+                            const uint8_t *src2, intptr_t w,
+                            int *left, int *left_top);
+
+    void (*sub_left_predict)(uint8_t *dst, uint8_t *src,
+                          ptrdiff_t stride, ptrdiff_t width, int height);
+} LLVidEncDSPContext;
+
+void ff_llvidencdsp_init(LLVidEncDSPContext *c);
+void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c);
+
+#endif /* AVCODEC_LOSSLESS_VIDEOENCDSP_H */
diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index 1482e57..f8da1e1 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -2,20 +2,20 @@
  * LPC utility code
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #define LPC_USE_DOUBLE
 #include "lpc.h"
+#include "libavutil/avassert.h"
 
 
 /**
@@ -36,13 +37,19 @@ static void lpc_apply_welch_window_c(const int32_t *data, int len,
     double w;
     double c;
 
-    /* The optimization in commit fa4ed8c does not support odd len.
-     * If someone wants odd len extend that change. */
-    assert(!(len & 1));
-
     n2 = (len >> 1);
     c = 2.0 / (len - 1.0);
 
+    if (len & 1) {
+        for(i=0; i<n2; i++) {
+            w = c - i - 1.0;
+            w = 1.0 - (w * w);
+            w_data[i] = data[i] * w;
+            w_data[len-1-i] = data[len-1-i] * w;
+        }
+        return;
+    }
+
     w_data+=n2;
       data+=n2;
     for(i=0; i<n2; i++) {
@@ -86,7 +93,8 @@ static void lpc_compute_autocorr_c(const double *data, int len, int lag,
  * Quantize LPC coefficients
  */
 static void quantize_lpc_coefs(double *lpc_in, int order, int precision,
-                               int32_t *lpc_out, int *shift, int max_shift, int zero_shift)
+                               int32_t *lpc_out, int *shift, int min_shift,
+                               int max_shift, int zero_shift)
 {
     int i;
     double cmax, error;
@@ -111,7 +119,7 @@ static void quantize_lpc_coefs(double *lpc_in, int order, int precision,
 
     /* calculate level shift which scales max coeff to available bits */
     sh = max_shift;
-    while((cmax * (1 << sh) > qmax) && (sh > 0)) {
+    while((cmax * (1 << sh) > qmax) && (sh > min_shift)) {
         sh--;
     }
 
@@ -160,6 +168,29 @@ int ff_lpc_calc_ref_coefs(LPCContext *s,
     return order;
 }
 
+double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float *samples, int len,
+                               int order, double *ref)
+{
+    int i;
+    double signal = 0.0f, avg_err = 0.0f;
+    double autoc[MAX_LPC_ORDER+1] = {0}, error[MAX_LPC_ORDER+1] = {0};
+    const double a = 0.5f, b = 1.0f - a;
+
+    /* Apply windowing */
+    for (i = 0; i <= len / 2; i++) {
+        double weight = a - b*cos((2*M_PI*i)/(len - 1));
+        s->windowed_samples[i] = weight*samples[i];
+        s->windowed_samples[len-1-i] = weight*samples[len-1-i];
+    }
+
+    s->lpc_compute_autocorr(s->windowed_samples, len, order, autoc);
+    signal = autoc[0];
+    compute_ref_coefs(autoc, order, ref, error);
+    for (i = 0; i < order; i++)
+        avg_err = (avg_err + error[i])/2.0f;
+    return signal/avg_err;
+}
+
 /**
  * Calculate LPC coefficients for multiple orders
  *
@@ -171,7 +202,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
                       int max_order, int precision,
                       int32_t coefs[][MAX_LPC_ORDER], int *shift,
                       enum FFLPCType lpc_type, int lpc_passes,
-                      int omethod, int max_shift, int zero_shift)
+                      int omethod, int min_shift, int max_shift, int zero_shift)
 {
     double autoc[MAX_LPC_ORDER+1];
     double ref[MAX_LPC_ORDER] = { 0 };
@@ -179,8 +210,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
     int i, j, pass = 0;
     int opt_order;
 
-    assert(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
+    av_assert2(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
            lpc_type > FF_LPC_TYPE_FIXED);
+    av_assert0(lpc_type == FF_LPC_TYPE_CHOLESKY || lpc_type == FF_LPC_TYPE_LEVINSON);
 
     /* reinit LPC context if parameters have changed */
     if (blocksize != s->blocksize || max_order != s->max_order ||
@@ -189,6 +221,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
         ff_lpc_init(s, blocksize, max_order, lpc_type);
     }
 
+    if(lpc_passes <= 0)
+        lpc_passes = 2;
+
     if (lpc_type == FF_LPC_TYPE_LEVINSON || (lpc_type == FF_LPC_TYPE_CHOLESKY && lpc_passes > 1)) {
         s->lpc_apply_welch_window(samples, blocksize, s->windowed_samples);
 
@@ -203,7 +238,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
     }
 
     if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
-        LLSModel m[2];
+        LLSModel *m = s->lls_models;
         LOCAL_ALIGNED(32, double, var, [FFALIGN(MAX_LPC_ORDER+1,4)]);
         double av_uninit(weight);
         memset(var, 0, FFALIGN(MAX_LPC_ORDER+1,4)*sizeof(*var));
@@ -244,15 +279,18 @@ int ff_lpc_calc_coefs(LPCContext *s,
         for(i=max_order-1; i>0; i--)
             ref[i] = ref[i-1] - ref[i];
     }
+
     opt_order = max_order;
 
     if(omethod == ORDER_METHOD_EST) {
         opt_order = estimate_best_order(ref, min_order, max_order);
         i = opt_order-1;
-        quantize_lpc_coefs(lpc[i], i+1, precision, coefs[i], &shift[i], max_shift, zero_shift);
+        quantize_lpc_coefs(lpc[i], i+1, precision, coefs[i], &shift[i],
+                           min_shift, max_shift, zero_shift);
     } else {
         for(i=min_order-1; i<max_order; i++) {
-            quantize_lpc_coefs(lpc[i], i+1, precision, coefs[i], &shift[i], max_shift, zero_shift);
+            quantize_lpc_coefs(lpc[i], i+1, precision, coefs[i], &shift[i],
+                               min_shift, max_shift, zero_shift);
         }
     }
 
diff --git a/libavcodec/lpc.h b/libavcodec/lpc.h
index 642854c..88ca247 100644
--- a/libavcodec/lpc.h
+++ b/libavcodec/lpc.h
@@ -2,20 +2,20 @@
  * LPC utility code
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,9 @@
 #define AVCODEC_LPC_H
 
 #include <stdint.h>
+#include "libavutil/avassert.h"
+#include "libavutil/lls.h"
+#include "aac_defines.h"
 
 #define ORDER_METHOD_EST     0
 #define ORDER_METHOD_2LEVEL  1
@@ -66,7 +69,7 @@ typedef struct LPCContext {
     /**
      * Perform autocorrelation on input samples with delay of 0 to lag.
      * @param data  input samples.
-     *              constraints: no alignment needed, but must have have at
+     *              constraints: no alignment needed, but must have at
      *              least lag*sizeof(double) valid bytes preceding it, and
      *              size must be at least (len+1)*sizeof(double) if data is
      *              16-byte aligned or (len+2)*sizeof(double) if data is
@@ -78,6 +81,9 @@ typedef struct LPCContext {
      */
     void (*lpc_compute_autocorr)(const double *data, int len, int lag,
                                  double *autoc);
+
+    // TODO: these should be allocated to reduce ABI compatibility issues
+    LLSModel lls_models[2];
 } LPCContext;
 
 
@@ -89,11 +95,14 @@ int ff_lpc_calc_coefs(LPCContext *s,
                       int max_order, int precision,
                       int32_t coefs[][MAX_LPC_ORDER], int *shift,
                       enum FFLPCType lpc_type, int lpc_passes,
-                      int omethod, int max_shift, int zero_shift);
+                      int omethod, int min_shift, int max_shift, int zero_shift);
 
 int ff_lpc_calc_ref_coefs(LPCContext *s,
                           const int32_t *samples, int order, double *ref);
 
+double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float *samples, int len,
+                               int order, double *ref);
+
 /**
  * Initialize LPCContext.
  */
@@ -106,11 +115,18 @@ void ff_lpc_init_x86(LPCContext *s);
  */
 void ff_lpc_end(LPCContext *s);
 
+#if USE_FIXED
+typedef int LPC_TYPE;
+typedef unsigned LPC_TYPE_U;
+#else
 #ifdef LPC_USE_DOUBLE
-#define LPC_TYPE double
+typedef double LPC_TYPE;
+typedef double LPC_TYPE_U;
 #else
-#define LPC_TYPE float
+typedef float LPC_TYPE;
+typedef float LPC_TYPE_U;
 #endif
+#endif // USE_FIXED
 
 /**
  * Schur recursion.
@@ -147,7 +163,7 @@ static inline void compute_ref_coefs(const LPC_TYPE *autoc, int max_order,
  * Levinson-Durbin recursion.
  * Produce LPC coefficients from autocorrelation data.
  */
-static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
+static inline int AAC_RENAME(compute_lpc_coefs)(const LPC_TYPE *autoc, int max_order,
                                     LPC_TYPE *lpc, int lpc_stride, int fail,
                                     int normalize)
 {
@@ -155,6 +171,8 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
     LPC_TYPE err = 0;
     LPC_TYPE *lpc_last = lpc;
 
+    av_assert2(normalize || !fail);
+
     if (normalize)
         err = *autoc++;
 
@@ -162,14 +180,14 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
         return -1;
 
     for(i=0; i<max_order; i++) {
-        LPC_TYPE r = -autoc[i];
+        LPC_TYPE r = AAC_SRA_R(-autoc[i], 5);
 
         if (normalize) {
             for(j=0; j<i; j++)
                 r -= lpc_last[j] * autoc[i-j-1];
 
             r /= err;
-            err *= 1.0 - (r * r);
+            err *= FIXR(1.0) - (r * r);
         }
 
         lpc[i] = r;
@@ -177,8 +195,8 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
         for(j=0; j < (i+1)>>1; j++) {
             LPC_TYPE f = lpc_last[    j];
             LPC_TYPE b = lpc_last[i-1-j];
-            lpc[    j] = f + r * b;
-            lpc[i-1-j] = b + r * f;
+            lpc[    j] = f + (LPC_TYPE_U)AAC_MUL26(r, b);
+            lpc[i-1-j] = b + (LPC_TYPE_U)AAC_MUL26(r, f);
         }
 
         if (fail && err < 0)
diff --git a/libavcodec/lsp.c b/libavcodec/lsp.c
index 982c87e..9aba020 100644
--- a/libavcodec/lsp.c
+++ b/libavcodec/lsp.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet (QCELP decoder)
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@
 #define FRAC_BITS 14
 #include "mathops.h"
 #include "lsp.h"
+#include "libavcodec/mips/lsp_mips.h"
+#include "libavutil/avassert.h"
 
 void ff_acelp_reorder_lsf(int16_t* lsfq, int lsfq_min_distance, int lsfq_min, int lsfq_max, int lp_order)
 {
@@ -73,7 +75,7 @@ static int16_t ff_cos(uint16_t arg)
     uint8_t offset= arg;
     uint8_t ind = arg >> 8;
 
-    assert(arg <= 0x3fff);
+    av_assert2(arg <= 0x3fff);
 
     return tab_cos[ind] + (offset * (tab_cos[ind+1] - tab_cos[ind]) >> 8);
 }
@@ -173,7 +175,11 @@ void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd
 
     /* LSP values for first subframe (3.2.5 of G.729, Equation 24)*/
     for(i=0; i<lp_order; i++)
+#ifdef G729_BITEXACT
+        lsp_1st[i] = (lsp_2nd[i] >> 1) + (lsp_prev[i] >> 1);
+#else
         lsp_1st[i] = (lsp_2nd[i] + lsp_prev[i]) >> 1;
+#endif
 
     ff_acelp_lsp2lpc(lp_1st, lsp_1st, lp_order >> 1);
 
@@ -181,6 +187,7 @@ void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd
     ff_acelp_lsp2lpc(lp_2nd, lsp_2nd, lp_order >> 1);
 }
 
+#ifndef ff_lsp2polyf
 void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
 {
     int i, j;
@@ -197,13 +204,14 @@ void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
         f[1] += val;
     }
 }
+#endif /* ff_lsp2polyf */
 
 void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
 {
     double pa[MAX_LP_HALF_ORDER+1], qa[MAX_LP_HALF_ORDER+1];
     float *lpc2 = lpc + (lp_half_order << 1) - 1;
 
-    assert(lp_half_order <= MAX_LP_HALF_ORDER);
+    av_assert2(lp_half_order <= MAX_LP_HALF_ORDER);
 
     ff_lsp2polyf(lsp,     pa, lp_half_order);
     ff_lsp2polyf(lsp + 1, qa, lp_half_order);
diff --git a/libavcodec/lsp.h b/libavcodec/lsp.h
index 1f9481c..621ebea 100644
--- a/libavcodec/lsp.h
+++ b/libavcodec/lsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzf.c b/libavcodec/lzf.c
index 0329fe0..5b7526e 100644
--- a/libavcodec/lzf.c
+++ b/libavcodec/lzf.c
@@ -2,20 +2,20 @@
  * lzf decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzf.h b/libavcodec/lzf.h
index 4951f25..0ad73d9 100644
--- a/libavcodec/lzf.h
+++ b/libavcodec/lzf.h
@@ -2,20 +2,20 @@
  * lzf decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzw.c b/libavcodec/lzw.c
index fae5687..e26e482 100644
--- a/libavcodec/lzw.c
+++ b/libavcodec/lzw.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,6 +71,9 @@ static int lzw_get_code(struct LZWState * s)
 {
     int c;
 
+    if (s->bbits < s->cursize && bytestream2_get_bytes_left(&s->gb) <= 0)
+        return s->end_code;
+
     if(s->mode == FF_LZW_GIF) {
         while (s->bbits < s->cursize) {
             if (!s->bs) {
@@ -93,13 +96,7 @@ static int lzw_get_code(struct LZWState * s)
     return c & s->curmask;
 }
 
-int ff_lzw_size_read(LZWState *p)
-{
-    struct LZWState *s = p;
-    return bytestream2_tell(&s->gb);
-}
-
-void ff_lzw_decode_tail(LZWState *p)
+int ff_lzw_decode_tail(LZWState *p)
 {
     struct LZWState *s = (struct LZWState *)p;
 
@@ -110,6 +107,7 @@ void ff_lzw_decode_tail(LZWState *p)
         }
     }else
         bytestream2_skip(&s->gb, bytestream2_get_bytes_left(&s->gb));
+    return bytestream2_tell(&s->gb);
 }
 
 av_cold void ff_lzw_decode_open(LZWState **p)
diff --git a/libavcodec/lzw.h b/libavcodec/lzw.h
index d925d35..6af8a6b 100644
--- a/libavcodec/lzw.h
+++ b/libavcodec/lzw.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,8 +47,7 @@ void ff_lzw_decode_open(LZWState **p);
 void ff_lzw_decode_close(LZWState **p);
 int ff_lzw_decode_init(LZWState *s, int csize, const uint8_t *buf, int buf_size, int mode);
 int ff_lzw_decode(LZWState *s, uint8_t *buf, int len);
-int ff_lzw_size_read(LZWState *lzw);
-void ff_lzw_decode_tail(LZWState *lzw);
+int ff_lzw_decode_tail(LZWState *lzw);
 
 /** LZW encode state */
 struct LZWEncodeState;
diff --git a/libavcodec/lzwenc.c b/libavcodec/lzwenc.c
index 7c37bf2..03080ee 100644
--- a/libavcodec/lzwenc.c
+++ b/libavcodec/lzwenc.c
@@ -2,20 +2,20 @@
  * LZW encoder
  * Copyright (c) 2007 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,7 +77,7 @@ static inline int hash(int head, const int add)
     head ^= (add << LZW_HASH_SHIFT);
     if (head >= LZW_HASH_SIZE)
         head -= LZW_HASH_SIZE;
-    assert(head >= 0 && head < LZW_HASH_SIZE);
+    av_assert2(head >= 0 && head < LZW_HASH_SIZE);
     return head;
 }
 
@@ -112,7 +112,7 @@ static inline int hashOffset(const int head)
  */
 static inline void writeCode(LZWEncodeState * s, int c)
 {
-    assert(0 <= c && c < 1 << s->bits);
+    av_assert2(0 <= c && c < 1 << s->bits);
     s->put_bits(&s->pb, s->bits, c);
 }
 
@@ -208,7 +208,7 @@ void ff_lzw_encode_init(LZWEncodeState *s, uint8_t *outbuf, int outsize,
     s->maxbits = maxbits;
     init_put_bits(&s->pb, outbuf, outsize);
     s->bufsize = outsize;
-    assert(s->maxbits >= 9 && s->maxbits <= LZW_MAXBITS);
+    av_assert0(s->maxbits >= 9 && s->maxbits <= LZW_MAXBITS);
     s->maxcode = 1 << s->maxbits;
     s->output_bytes = 0;
     s->last_code = LZW_PREFIX_EMPTY;
@@ -263,6 +263,9 @@ int ff_lzw_encode_flush(LZWEncodeState *s,
     if (s->last_code != -1)
         writeCode(s, s->last_code);
     writeCode(s, s->end_code);
+    if (s->mode == FF_LZW_GIF)
+        s->put_bits(&s->pb, 1, 0);
+
     lzw_flush_put_bits(&s->pb);
     s->last_code = -1;
 
diff --git a/libavcodec/m101.c b/libavcodec/m101.c
new file mode 100644
index 0000000..d254966
--- /dev/null
+++ b/libavcodec/m101.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+
+#include "avcodec.h"
+#include "internal.h"
+
+
+static av_cold int m101_decode_init(AVCodecContext *avctx)
+{
+    if (avctx->extradata_size < 6*4) {
+        avpriv_request_sample(avctx, "Missing or too small extradata (size %d)", avctx->extradata_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (avctx->extradata[2*4] == 10)
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+    else if (avctx->extradata[2*4] == 8) {
+        avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+    } else {
+        avpriv_request_sample(avctx, "BPS %d", avctx->extradata[2*4]);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int m101_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                      AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int stride, ret;
+    int x, y;
+    int min_stride = 2 * avctx->width;
+    int bits = avctx->extradata[2*4];
+    AVFrame *frame = data;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
+    stride = AV_RL32(avctx->extradata + 5*4);
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV422P10)
+        min_stride = (avctx->width + 15) / 16 * 20;
+
+    if (stride < min_stride || avpkt->size < stride * (uint64_t)avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "stride (%d) is invalid for packet sized %d\n",
+               stride, avpkt->size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    frame->interlaced_frame = ((avctx->extradata[3*4] & 3) != 3);
+    if (frame->interlaced_frame)
+        frame->top_field_first = avctx->extradata[3*4] & 1;
+
+    for (y = 0; y < avctx->height; y++) {
+        int src_y = y;
+        if (frame->interlaced_frame)
+            src_y = ((y&1)^frame->top_field_first) ? y/2 : (y/2 + avctx->height/2);
+        if (bits == 8) {
+            uint8_t *line = frame->data[0] + y*frame->linesize[0];
+            memcpy(line, buf + src_y*stride, 2*avctx->width);
+        } else {
+            int block;
+            uint16_t *luma = (uint16_t*)&frame->data[0][y*frame->linesize[0]];
+            uint16_t *cb   = (uint16_t*)&frame->data[1][y*frame->linesize[1]];
+            uint16_t *cr   = (uint16_t*)&frame->data[2][y*frame->linesize[2]];
+            for (block = 0; 16*block < avctx->width; block ++) {
+                const uint8_t *buf_src = buf + src_y*stride + 40*block;
+                for (x = 0; x < 16 && x + 16*block < avctx->width; x++) {
+                    int xd = x + 16*block;
+                    if (x&1) {
+                        luma [xd] = (4*buf_src[2*x + 0]) + ((buf_src[32 + (x>>1)]>>4)&3);
+                    } else {
+                        luma [xd] = (4*buf_src[2*x + 0]) +  (buf_src[32 + (x>>1)]    &3);
+                        cb[xd>>1] = (4*buf_src[2*x + 1]) + ((buf_src[32 + (x>>1)]>>2)&3);
+                        cr[xd>>1] = (4*buf_src[2*x + 3]) +  (buf_src[32 + (x>>1)]>>6);
+                    }
+                }
+            }
+        }
+    }
+
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+AVCodec ff_m101_decoder = {
+    .name           = "m101",
+    .long_name      = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_M101,
+    .init           = m101_decode_init,
+    .decode         = m101_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/mace.c b/libavcodec/mace.c
index c6eddc0..e332a72 100644
--- a/libavcodec/mace.c
+++ b/libavcodec/mace.c
@@ -2,20 +2,20 @@
  * MACE decoder
  * Copyright (c) 2002 Laszlo Torok <torokl@alpha.dfmk.hu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -244,12 +244,17 @@ static int mace_decode_frame(AVCodecContext *avctx, void *data,
     int i, j, k, l, ret;
     int is_mace3 = (avctx->codec_id == AV_CODEC_ID_MACE3);
 
+    if (buf_size % (avctx->channels << is_mace3)) {
+        av_log(avctx, AV_LOG_ERROR, "buffer size %d is odd\n", buf_size);
+        buf_size -= buf_size % (avctx->channels << is_mace3);
+        if (!buf_size)
+            return AVERROR_INVALIDDATA;
+    }
+
     /* get output buffer */
     frame->nb_samples = 3 * (buf_size << (1 - is_mace3)) / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t **)frame->extended_data;
 
     for(i = 0; i < avctx->channels; i++) {
diff --git a/libavcodec/magicyuv.c b/libavcodec/magicyuv.c
index 027143f..1a129c2 100644
--- a/libavcodec/magicyuv.c
+++ b/libavcodec/magicyuv.c
@@ -2,20 +2,20 @@
  * MagicYUV decoder
  * Copyright (c) 2016 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,14 +23,15 @@
 #include <string.h>
 
 #include "libavutil/pixdesc.h"
+#include "libavutil/qsort.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "huffyuvdsp.h"
 #include "internal.h"
+#include "lossless_videodsp.h"
 #include "thread.h"
-#include "vlc.h"
 
 typedef struct Slice {
     uint32_t start;
@@ -44,26 +45,33 @@ typedef enum Prediction {
 } Prediction;
 
 typedef struct HuffEntry {
-    uint8_t  sym;
+    uint16_t sym;
     uint8_t  len;
     uint32_t code;
 } HuffEntry;
 
 typedef struct MagicYUVContext {
     AVFrame          *p;
+    int               max;
+    int               bps;
     int               slice_height;
     int               nb_slices;
     int               planes;         // number of encoded planes in bitstream
     int               decorrelate;    // postprocessing work
+    int               color_matrix;   // video color matrix
+    int               flags;
     int               interlaced;     // video is interlaced
     uint8_t          *buf;            // pointer to AVPacket->data
     int               hshift[4];
     int               vshift[4];
     Slice            *slices[4];      // slice bitstream positions for each plane
     unsigned int      slices_size[4]; // slice sizes for each plane
-    uint8_t           len[4][256];    // table of code lengths for each plane
+    uint8_t           len[4][4096];   // table of code lengths for each plane
     VLC               vlc[4];         // VLC for each plane
-    HuffYUVDSPContext hdsp;
+    int (*huff_build)(VLC *vlc, uint8_t *len);
+    int (*magy_decode_slice)(AVCodecContext *avctx, void *tdata,
+                             int j, int threadnr);
+    LLVidDSPContext   llviddsp;
 } MagicYUVContext;
 
 static int huff_cmp_len(const void *a, const void *b)
@@ -72,6 +80,82 @@ static int huff_cmp_len(const void *a, const void *b)
     return (aa->len - bb->len) * 256 + aa->sym - bb->sym;
 }
 
+static int huff_cmp_len10(const void *a, const void *b)
+{
+    const HuffEntry *aa = a, *bb = b;
+    return (aa->len - bb->len) * 1024 + aa->sym - bb->sym;
+}
+
+static int huff_cmp_len12(const void *a, const void *b)
+{
+    const HuffEntry *aa = a, *bb = b;
+    return (aa->len - bb->len) * 4096 + aa->sym - bb->sym;
+}
+
+static int huff_build10(VLC *vlc, uint8_t *len)
+{
+    HuffEntry he[1024];
+    uint32_t codes[1024];
+    uint8_t bits[1024];
+    uint16_t syms[1024];
+    uint32_t code;
+    int i;
+
+    for (i = 0; i < 1024; i++) {
+        he[i].sym = 1023 - i;
+        he[i].len = len[i];
+        if (len[i] == 0 || len[i] > 32)
+            return AVERROR_INVALIDDATA;
+    }
+    AV_QSORT(he, 1024, HuffEntry, huff_cmp_len10);
+
+    code = 1;
+    for (i = 1023; i >= 0; i--) {
+        codes[i] = code >> (32 - he[i].len);
+        bits[i]  = he[i].len;
+        syms[i]  = he[i].sym;
+        code += 0x80000000u >> (he[i].len - 1);
+    }
+
+    ff_free_vlc(vlc);
+    return ff_init_vlc_sparse(vlc, FFMIN(he[1023].len, 12), 1024,
+                              bits,  sizeof(*bits),  sizeof(*bits),
+                              codes, sizeof(*codes), sizeof(*codes),
+                              syms,  sizeof(*syms),  sizeof(*syms), 0);
+}
+
+static int huff_build12(VLC *vlc, uint8_t *len)
+{
+    HuffEntry he[4096];
+    uint32_t codes[4096];
+    uint8_t bits[4096];
+    uint16_t syms[4096];
+    uint32_t code;
+    int i;
+
+    for (i = 0; i < 4096; i++) {
+        he[i].sym = 4095 - i;
+        he[i].len = len[i];
+        if (len[i] == 0 || len[i] > 32)
+            return AVERROR_INVALIDDATA;
+    }
+    AV_QSORT(he, 4096, HuffEntry, huff_cmp_len12);
+
+    code = 1;
+    for (i = 4095; i >= 0; i--) {
+        codes[i] = code >> (32 - he[i].len);
+        bits[i]  = he[i].len;
+        syms[i]  = he[i].sym;
+        code += 0x80000000u >> (he[i].len - 1);
+    }
+
+    ff_free_vlc(vlc);
+    return ff_init_vlc_sparse(vlc, FFMIN(he[4095].len, 14), 4096,
+                              bits,  sizeof(*bits),  sizeof(*bits),
+                              codes, sizeof(*codes), sizeof(*codes),
+                              syms,  sizeof(*syms),  sizeof(*syms), 0);
+}
+
 static int huff_build(VLC *vlc, uint8_t *len)
 {
     HuffEntry he[256];
@@ -84,8 +168,10 @@ static int huff_build(VLC *vlc, uint8_t *len)
     for (i = 0; i < 256; i++) {
         he[i].sym = 255 - i;
         he[i].len = len[i];
+        if (len[i] == 0 || len[i] > 32)
+            return AVERROR_INVALIDDATA;
     }
-    qsort(he, 256, sizeof(HuffEntry), huff_cmp_len);
+    AV_QSORT(he, 256, HuffEntry, huff_cmp_len);
 
     code = 1;
     for (i = 255; i >= 0; i--) {
@@ -102,38 +188,193 @@ static int huff_build(VLC *vlc, uint8_t *len)
                               syms,  sizeof(*syms),  sizeof(*syms), 0);
 }
 
+static void magicyuv_median_pred16(uint16_t *dst, const uint16_t *src1,
+                                   const uint16_t *diff, intptr_t w,
+                                   int *left, int *left_top, int max)
+{
+    int i;
+    uint16_t l, lt;
+
+    l  = *left;
+    lt = *left_top;
+
+    for (i = 0; i < w; i++) {
+        l      = mid_pred(l, src1[i], (l + src1[i] - lt)) + diff[i];
+        l     &= max;
+        lt     = src1[i];
+        dst[i] = l;
+    }
+
+    *left     = l;
+    *left_top = lt;
+}
+
+static int magy_decode_slice10(AVCodecContext *avctx, void *tdata,
+                               int j, int threadnr)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    int interlaced = s->interlaced;
+    const int bps = s->bps;
+    const int max = s->max - 1;
+    AVFrame *p = s->p;
+    int i, k, x;
+    GetBitContext gb;
+    uint16_t *dst;
+
+    for (i = 0; i < s->planes; i++) {
+        int left, lefttop, top;
+        int height = AV_CEIL_RSHIFT(FFMIN(s->slice_height, avctx->coded_height - j * s->slice_height), s->vshift[i]);
+        int width = AV_CEIL_RSHIFT(avctx->coded_width, s->hshift[i]);
+        int sheight = AV_CEIL_RSHIFT(s->slice_height, s->vshift[i]);
+        ptrdiff_t fake_stride = (p->linesize[i] / 2) * (1 + interlaced);
+        ptrdiff_t stride = p->linesize[i] / 2;
+        int flags, pred;
+        int ret = init_get_bits8(&gb, s->buf + s->slices[i][j].start,
+                                 s->slices[i][j].size);
+
+        if (ret < 0)
+            return ret;
+
+        flags = get_bits(&gb, 8);
+        pred  = get_bits(&gb, 8);
+
+        dst = (uint16_t *)p->data[i] + j * sheight * stride;
+        if (flags & 1) {
+            if (get_bits_left(&gb) < bps * width * height)
+                return AVERROR_INVALIDDATA;
+            for (k = 0; k < height; k++) {
+                for (x = 0; x < width; x++)
+                    dst[x] = get_bits(&gb, bps);
+
+                dst += stride;
+            }
+        } else {
+            for (k = 0; k < height; k++) {
+                for (x = 0; x < width; x++) {
+                    int pix;
+                    if (get_bits_left(&gb) <= 0)
+                        return AVERROR_INVALIDDATA;
+
+                    pix = get_vlc2(&gb, s->vlc[i].table, s->vlc[i].bits, 3);
+                    if (pix < 0)
+                        return AVERROR_INVALIDDATA;
+
+                    dst[x] = max - pix;
+                }
+                dst += stride;
+            }
+        }
+
+        switch (pred) {
+        case LEFT:
+            dst = (uint16_t *)p->data[i] + j * sheight * stride;
+            s->llviddsp.add_left_pred_int16(dst, dst, max, width, 0);
+            dst += stride;
+            if (interlaced) {
+                s->llviddsp.add_left_pred_int16(dst, dst, max, width, 0);
+                dst += stride;
+            }
+            for (k = 1 + interlaced; k < height; k++) {
+                s->llviddsp.add_left_pred_int16(dst, dst, max, width, dst[-fake_stride]);
+                dst += stride;
+            }
+            break;
+        case GRADIENT:
+            dst = (uint16_t *)p->data[i] + j * sheight * stride;
+            s->llviddsp.add_left_pred_int16(dst, dst, max, width, 0);
+            dst += stride;
+            if (interlaced) {
+                s->llviddsp.add_left_pred_int16(dst, dst, max, width, 0);
+                dst += stride;
+            }
+            for (k = 1 + interlaced; k < height; k++) {
+                top = dst[-fake_stride];
+                left = top + dst[0];
+                dst[0] = left & max;
+                for (x = 1; x < width; x++) {
+                    top = dst[x - fake_stride];
+                    lefttop = dst[x - (fake_stride + 1)];
+                    left += top - lefttop + dst[x];
+                    dst[x] = left & max;
+                }
+                dst += stride;
+            }
+            break;
+        case MEDIAN:
+            dst = (uint16_t *)p->data[i] + j * sheight * stride;
+            s->llviddsp.add_left_pred_int16(dst, dst, max, width, 0);
+            dst += stride;
+            if (interlaced) {
+                s->llviddsp.add_left_pred_int16(dst, dst, max, width, 0);
+                dst += stride;
+            }
+            lefttop = left = dst[0];
+            for (k = 1 + interlaced; k < height; k++) {
+                magicyuv_median_pred16(dst, dst - fake_stride, dst, width, &left, &lefttop, max);
+                lefttop = left = dst[0];
+                dst += stride;
+            }
+            break;
+        default:
+            avpriv_request_sample(avctx, "Unknown prediction: %d", pred);
+        }
+    }
+
+    if (s->decorrelate) {
+        int height = FFMIN(s->slice_height, avctx->coded_height - j * s->slice_height);
+        int width = avctx->coded_width;
+        uint16_t *r = (uint16_t *)p->data[0] + j * s->slice_height * p->linesize[0] / 2;
+        uint16_t *g = (uint16_t *)p->data[1] + j * s->slice_height * p->linesize[1] / 2;
+        uint16_t *b = (uint16_t *)p->data[2] + j * s->slice_height * p->linesize[2] / 2;
+
+        for (i = 0; i < height; i++) {
+            for (k = 0; k < width; k++) {
+                b[k] = (b[k] + g[k]) & max;
+                r[k] = (r[k] + g[k]) & max;
+            }
+            b += p->linesize[0] / 2;
+            g += p->linesize[1] / 2;
+            r += p->linesize[2] / 2;
+        }
+    }
+
+    return 0;
+}
+
 static int magy_decode_slice(AVCodecContext *avctx, void *tdata,
                              int j, int threadnr)
 {
     MagicYUVContext *s = avctx->priv_data;
     int interlaced = s->interlaced;
     AVFrame *p = s->p;
-    int i, k, x;
-    BitstreamContext bc;
+    int i, k, x, min_width;
+    GetBitContext gb;
     uint8_t *dst;
 
     for (i = 0; i < s->planes; i++) {
         int left, lefttop, top;
-        int height = AV_CEIL_RSHIFT(FFMIN(s->slice_height, avctx->height - j * s->slice_height), s->vshift[i]);
-        int width = AV_CEIL_RSHIFT(avctx->width, s->hshift[i]);
+        int height = AV_CEIL_RSHIFT(FFMIN(s->slice_height, avctx->coded_height - j * s->slice_height), s->vshift[i]);
+        int width = AV_CEIL_RSHIFT(avctx->coded_width, s->hshift[i]);
         int sheight = AV_CEIL_RSHIFT(s->slice_height, s->vshift[i]);
         ptrdiff_t fake_stride = p->linesize[i] * (1 + interlaced);
         ptrdiff_t stride = p->linesize[i];
         int flags, pred;
-        int ret = bitstream_init8(&bc, s->buf + s->slices[i][j].start,
-                                  s->slices[i][j].size);
+        int ret = init_get_bits8(&gb, s->buf + s->slices[i][j].start,
+                                 s->slices[i][j].size);
 
         if (ret < 0)
             return ret;
 
-        flags = bitstream_read(&bc, 8);
-        pred  = bitstream_read(&bc, 8);
+        flags = get_bits(&gb, 8);
+        pred  = get_bits(&gb, 8);
 
         dst = p->data[i] + j * sheight * stride;
         if (flags & 1) {
+            if (get_bits_left(&gb) < 8* width * height)
+                return AVERROR_INVALIDDATA;
             for (k = 0; k < height; k++) {
                 for (x = 0; x < width; x++)
-                    dst[x] = bitstream_read(&bc, 8);
+                    dst[x] = get_bits(&gb, 8);
 
                 dst += stride;
             }
@@ -141,10 +382,10 @@ static int magy_decode_slice(AVCodecContext *avctx, void *tdata,
             for (k = 0; k < height; k++) {
                 for (x = 0; x < width; x++) {
                     int pix;
-                    if (bitstream_bits_left(&bc) <= 0)
+                    if (get_bits_left(&gb) <= 0)
                         return AVERROR_INVALIDDATA;
 
-                    pix = bitstream_read_vlc(&bc, s->vlc[i].table, s->vlc[i].bits, 3);
+                    pix = get_vlc2(&gb, s->vlc[i].table, s->vlc[i].bits, 3);
                     if (pix < 0)
                         return AVERROR_INVALIDDATA;
 
@@ -157,52 +398,52 @@ static int magy_decode_slice(AVCodecContext *avctx, void *tdata,
         switch (pred) {
         case LEFT:
             dst = p->data[i] + j * sheight * stride;
-            s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+            s->llviddsp.add_left_pred(dst, dst, width, 0);
             dst += stride;
             if (interlaced) {
-                s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+                s->llviddsp.add_left_pred(dst, dst, width, 0);
                 dst += stride;
             }
             for (k = 1 + interlaced; k < height; k++) {
-                s->hdsp.add_hfyu_left_pred(dst, dst, width, dst[-fake_stride]);
+                s->llviddsp.add_left_pred(dst, dst, width, dst[-fake_stride]);
                 dst += stride;
             }
             break;
         case GRADIENT:
             dst = p->data[i] + j * sheight * stride;
-            s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
-            left = lefttop = 0;
+            s->llviddsp.add_left_pred(dst, dst, width, 0);
             dst += stride;
             if (interlaced) {
-                s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
-                left = lefttop = 0;
+                s->llviddsp.add_left_pred(dst, dst, width, 0);
                 dst += stride;
             }
+            min_width = FFMIN(width, 32);
             for (k = 1 + interlaced; k < height; k++) {
                 top = dst[-fake_stride];
                 left = top + dst[0];
                 dst[0] = left;
-                for (x = 1; x < width; x++) {
+                for (x = 1; x < min_width; x++) { /* dsp need aligned 32 */
                     top = dst[x - fake_stride];
                     lefttop = dst[x - (fake_stride + 1)];
                     left += top - lefttop + dst[x];
                     dst[x] = left;
                 }
+                if (width > 32)
+                    s->llviddsp.add_gradient_pred(dst + 32, fake_stride, width - 32);
                 dst += stride;
             }
             break;
         case MEDIAN:
             dst = p->data[i] + j * sheight * stride;
-            lefttop = left = dst[0];
-            s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+            s->llviddsp.add_left_pred(dst, dst, width, 0);
             dst += stride;
             if (interlaced) {
-                lefttop = left = dst[0];
-                s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+                s->llviddsp.add_left_pred(dst, dst, width, 0);
                 dst += stride;
             }
+            lefttop = left = dst[0];
             for (k = 1 + interlaced; k < height; k++) {
-                s->hdsp.add_hfyu_median_pred(dst, dst - fake_stride,
+                s->llviddsp.add_median_pred(dst, dst - fake_stride,
                                              dst, width, &left, &lefttop);
                 lefttop = left = dst[0];
                 dst += stride;
@@ -214,15 +455,15 @@ static int magy_decode_slice(AVCodecContext *avctx, void *tdata,
     }
 
     if (s->decorrelate) {
-        int height = FFMIN(s->slice_height, avctx->height - j * s->slice_height);
-        int width = avctx->width;
+        int height = FFMIN(s->slice_height, avctx->coded_height - j * s->slice_height);
+        int width = avctx->coded_width;
         uint8_t *b = p->data[0] + j * s->slice_height * p->linesize[0];
         uint8_t *g = p->data[1] + j * s->slice_height * p->linesize[1];
         uint8_t *r = p->data[2] + j * s->slice_height * p->linesize[2];
 
         for (i = 0; i < height; i++) {
-            s->hdsp.add_bytes(b, g, width);
-            s->hdsp.add_bytes(r, g, width);
+            s->llviddsp.add_bytes(b, g, width);
+            s->llviddsp.add_bytes(r, g, width);
             b += p->linesize[0];
             g += p->linesize[1];
             r += p->linesize[2];
@@ -232,6 +473,46 @@ static int magy_decode_slice(AVCodecContext *avctx, void *tdata,
     return 0;
 }
 
+static int build_huffman(AVCodecContext *avctx, GetBitContext *gbit, int max)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    int i = 0, j = 0, k;
+
+    memset(s->len, 0, sizeof(s->len));
+    while (get_bits_left(gbit) >= 8) {
+        int b = get_bits(gbit, 1);
+        int x = get_bits(gbit, 7);
+        int l = get_bitsz(gbit, b * 8) + 1;
+
+        for (k = 0; k < l; k++)
+            if (j + k < max)
+                s->len[i][j + k] = x;
+
+        j += l;
+        if (j == max) {
+            j = 0;
+            if (s->huff_build(&s->vlc[i], s->len[i])) {
+                av_log(avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
+                return AVERROR_INVALIDDATA;
+            }
+            i++;
+            if (i == s->planes) {
+                break;
+            }
+        } else if (j > max) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid Huffman codes\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (i != s->planes) {
+        av_log(avctx, AV_LOG_ERROR, "Huffman tables too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
 static int magy_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
@@ -239,10 +520,10 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
     ThreadFrame frame = { .f = data };
     AVFrame *p = data;
     GetByteContext gbyte;
-    BitstreamContext bc;
+    GetBitContext gbit;
     uint32_t first_offset, offset, next_offset, header_size, slice_width;
     int width, height, format, version, table_size;
-    int ret, i, j, k;
+    int ret, i, j;
 
     bytestream2_init(&gbyte, avpkt->data, avpkt->size);
     if (bytestream2_get_le32(&gbyte) != MKTAG('M', 'A', 'G', 'Y'))
@@ -266,6 +547,10 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
     s->hshift[2] =
     s->vshift[2] = 0;
     s->decorrelate = 0;
+    s->max = 256;
+    s->bps = 8;
+    s->huff_build = huff_build;
+    s->magy_decode_slice = magy_decode_slice;
 
     format = bytestream2_get_byte(&gbyte);
     switch (format) {
@@ -298,14 +583,64 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
     case 0x6b:
         avctx->pix_fmt = AV_PIX_FMT_GRAY8;
         break;
+    case 0x6c:
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        s->hshift[1] =
+        s->hshift[2] = 1;
+        s->max = 1024;
+        s->huff_build = huff_build10;
+        s->magy_decode_slice = magy_decode_slice10;
+        s->bps = 10;
+        break;
+    case 0x6d:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        s->decorrelate = 1;
+        s->max = 1024;
+        s->huff_build = huff_build10;
+        s->magy_decode_slice = magy_decode_slice10;
+        s->bps = 10;
+        break;
+    case 0x6e:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        s->decorrelate = 1;
+        s->max = 1024;
+        s->huff_build = huff_build10;
+        s->magy_decode_slice = magy_decode_slice10;
+        s->bps = 10;
+        break;
+    case 0x6f:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        s->decorrelate = 1;
+        s->max = 4096;
+        s->huff_build = huff_build12;
+        s->magy_decode_slice = magy_decode_slice10;
+        s->bps = 12;
+        break;
+    case 0x70:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP12;
+        s->decorrelate = 1;
+        s->max = 4096;
+        s->huff_build = huff_build12;
+        s->magy_decode_slice = magy_decode_slice10;
+        s->bps = 12;
+        break;
+    case 0x73:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY10;
+        s->max = 1024;
+        s->huff_build = huff_build10;
+        s->magy_decode_slice = magy_decode_slice10;
+        s->bps = 10;
+        break;
     default:
         avpriv_request_sample(avctx, "Format 0x%X", format);
         return AVERROR_PATCHWELCOME;
     }
     s->planes = av_pix_fmt_count_planes(avctx->pix_fmt);
 
-    bytestream2_skip(&gbyte, 2);
-    s->interlaced = !!(bytestream2_get_byte(&gbyte) & 2);
+    bytestream2_skip(&gbyte, 1);
+    s->color_matrix = bytestream2_get_byte(&gbyte);
+    s->flags        = bytestream2_get_byte(&gbyte);
+    s->interlaced   = !!(s->flags & 2);
     bytestream2_skip(&gbyte, 3);
 
     width  = bytestream2_get_le32(&gbyte);
@@ -315,12 +650,12 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
         return ret;
 
     slice_width = bytestream2_get_le32(&gbyte);
-    if (slice_width != width) {
+    if (slice_width != avctx->coded_width) {
         avpriv_request_sample(avctx, "Slice width %"PRIu32, slice_width);
         return AVERROR_PATCHWELCOME;
     }
     s->slice_height = bytestream2_get_le32(&gbyte);
-    if (s->slice_height <= 0 || s->slice_height > INT_MAX - height) {
+    if (s->slice_height <= 0 || s->slice_height > INT_MAX - avctx->coded_height) {
         av_log(avctx, AV_LOG_ERROR,
                "invalid slice height: %d\n", s->slice_height);
         return AVERROR_INVALIDDATA;
@@ -328,7 +663,7 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
 
     bytestream2_skip(&gbyte, 4);
 
-    s->nb_slices = (height + s->slice_height - 1) / s->slice_height;
+    s->nb_slices = (avctx->coded_height + s->slice_height - 1) / s->slice_height;
     if (s->nb_slices > INT_MAX / sizeof(Slice)) {
         av_log(avctx, AV_LOG_ERROR,
                "invalid number of slices: %d\n", s->nb_slices);
@@ -371,41 +706,13 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
     if (table_size < 2)
         return AVERROR_INVALIDDATA;
 
-    ret = bitstream_init8(&bc, avpkt->data + bytestream2_tell(&gbyte), table_size);
+    ret = init_get_bits8(&gbit, avpkt->data + bytestream2_tell(&gbyte), table_size);
     if (ret < 0)
         return ret;
 
-    memset(s->len, 0, sizeof(s->len));
-    j = i = 0;
-    while (bitstream_bits_left(&bc) >= 8) {
-        int b = bitstream_read(&bc, 4);
-        int x = bitstream_read(&bc, 4);
-        int l = bitstream_read(&bc, b) + 1;
-
-        for (k = 0; k < l; k++)
-            if (j + k < 256)
-                s->len[i][j + k] = x;
-
-        j += l;
-        if (j == 256) {
-            j = 0;
-            if (huff_build(&s->vlc[i], s->len[i])) {
-                av_log(avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
-                return AVERROR_INVALIDDATA;
-            }
-            i++;
-            if (i == s->planes) {
-                break;
-            }
-        } else if (j > 256) {
-            return AVERROR_INVALIDDATA;
-        }
-    }
-
-    if (i != s->planes) {
-        av_log(avctx, AV_LOG_ERROR, "Huffman tables too short\n");
-        return AVERROR_INVALIDDATA;
-    }
+    ret = build_huffman(avctx, &gbit, s->max);
+    if (ret < 0)
+        return ret;
 
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
@@ -415,12 +722,26 @@ static int magy_decode_frame(AVCodecContext *avctx, void *data,
 
     s->buf = avpkt->data;
     s->p = p;
-    avctx->execute2(avctx, magy_decode_slice, NULL, NULL, s->nb_slices);
-
-    if (avctx->pix_fmt == AV_PIX_FMT_GBRP ||
-        avctx->pix_fmt == AV_PIX_FMT_GBRAP) {
+    avctx->execute2(avctx, s->magy_decode_slice, NULL, NULL, s->nb_slices);
+
+    if (avctx->pix_fmt == AV_PIX_FMT_GBRP   ||
+        avctx->pix_fmt == AV_PIX_FMT_GBRAP  ||
+        avctx->pix_fmt == AV_PIX_FMT_GBRP10 ||
+        avctx->pix_fmt == AV_PIX_FMT_GBRAP10||
+        avctx->pix_fmt == AV_PIX_FMT_GBRAP12||
+        avctx->pix_fmt == AV_PIX_FMT_GBRP12) {
         FFSWAP(uint8_t*, p->data[0], p->data[1]);
         FFSWAP(int, p->linesize[0], p->linesize[1]);
+    } else {
+        switch (s->color_matrix) {
+        case 1:
+            p->colorspace = AVCOL_SPC_BT470BG;
+            break;
+        case 2:
+            p->colorspace = AVCOL_SPC_BT709;
+            break;
+        }
+        p->color_range = (s->flags & 4) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
     }
 
     *got_frame = 1;
@@ -446,7 +767,7 @@ static int magy_init_thread_copy(AVCodecContext *avctx)
 static av_cold int magy_decode_init(AVCodecContext *avctx)
 {
     MagicYUVContext *s = avctx->priv_data;
-    ff_huffyuvdsp_init(&s->hdsp);
+    ff_llviddsp_init(&s->llviddsp);
     return 0;
 }
 
diff --git a/libavcodec/magicyuvenc.c b/libavcodec/magicyuvenc.c
new file mode 100644
index 0000000..16e9a1c
--- /dev/null
+++ b/libavcodec/magicyuvenc.c
@@ -0,0 +1,590 @@
+/*
+ * MagicYUV encoder
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/qsort.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "put_bits.h"
+#include "internal.h"
+#include "thread.h"
+#include "lossless_videoencdsp.h"
+
+typedef enum Prediction {
+    LEFT = 1,
+    GRADIENT,
+    MEDIAN,
+} Prediction;
+
+typedef struct HuffEntry {
+    uint8_t  sym;
+    uint8_t  len;
+    uint32_t code;
+} HuffEntry;
+
+typedef struct PTable {
+    int     value;  ///< input value
+    int64_t prob;   ///< number of occurences of this value in input
+} PTable;
+
+typedef struct MagicYUVContext {
+    const AVClass       *class;
+    int                  frame_pred;
+    PutBitContext        pb;
+    int                  planes;
+    uint8_t              format;
+    AVFrame             *p;
+    int                  slice_height;
+    int                  nb_slices;
+    int                  correlate;
+    int                  hshift[4];
+    int                  vshift[4];
+    uint8_t             *slices[4];
+    unsigned             slice_pos[4];
+    unsigned             tables_size;
+    HuffEntry            he[4][256];
+    LLVidEncDSPContext   llvidencdsp;
+    void (*predict)(struct MagicYUVContext *s, uint8_t *src, uint8_t *dst,
+                    ptrdiff_t stride, int width, int height);
+} MagicYUVContext;
+
+static void left_predict(MagicYUVContext *s,
+                         uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                         int width, int height)
+{
+    uint8_t prev = 0;
+    int i, j;
+
+    for (i = 0; i < width; i++) {
+        dst[i] = src[i] - prev;
+        prev   = src[i];
+    }
+    dst += width;
+    src += stride;
+    for (j = 1; j < height; j++) {
+        prev = src[-stride];
+        for (i = 0; i < width; i++) {
+            dst[i] = src[i] - prev;
+            prev   = src[i];
+        }
+        dst += width;
+        src += stride;
+    }
+}
+
+static void gradient_predict(MagicYUVContext *s,
+                             uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                             int width, int height)
+{
+    int left = 0, top, lefttop;
+    int i, j;
+
+    for (i = 0; i < width; i++) {
+        dst[i] = src[i] - left;
+        left   = src[i];
+    }
+    dst += width;
+    src += stride;
+    for (j = 1; j < height; j++) {
+        top = src[-stride];
+        left = src[0] - top;
+        dst[0] = left;
+        for (i = 1; i < width; i++) {
+            top = src[i - stride];
+            lefttop = src[i - (stride + 1)];
+            left = src[i-1];
+            dst[i] = (src[i] - top) - left + lefttop;
+        }
+        dst += width;
+        src += stride;
+    }
+}
+
+static void median_predict(MagicYUVContext *s,
+                           uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                           int width, int height)
+{
+    int left = 0, lefttop;
+    int i, j;
+
+    for (i = 0; i < width; i++) {
+        dst[i] = src[i] - left;
+        left   = src[i];
+    }
+    dst += width;
+    src += stride;
+    for (j = 1; j < height; j++) {
+        left = lefttop = src[-stride];
+        s->llvidencdsp.sub_median_pred(dst, src - stride, src, width, &left, &lefttop);
+        dst += width;
+        src += stride;
+    }
+}
+
+static av_cold int magy_encode_init(AVCodecContext *avctx)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    PutByteContext pb;
+    int i;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_GBRP:
+        avctx->codec_tag = MKTAG('M', '8', 'R', 'G');
+        s->correlate = 1;
+        s->format = 0x65;
+        break;
+    case AV_PIX_FMT_GBRAP:
+        avctx->codec_tag = MKTAG('M', '8', 'R', 'A');
+        s->correlate = 1;
+        s->format = 0x66;
+        break;
+    case AV_PIX_FMT_YUV420P:
+        avctx->codec_tag = MKTAG('M', '8', 'Y', '0');
+        s->hshift[1] =
+        s->vshift[1] =
+        s->hshift[2] =
+        s->vshift[2] = 1;
+        s->format = 0x69;
+        break;
+    case AV_PIX_FMT_YUV422P:
+        avctx->codec_tag = MKTAG('M', '8', 'Y', '2');
+        s->hshift[1] =
+        s->hshift[2] = 1;
+        s->format = 0x68;
+        break;
+    case AV_PIX_FMT_YUV444P:
+        avctx->codec_tag = MKTAG('M', '8', 'Y', '4');
+        s->format = 0x67;
+        break;
+    case AV_PIX_FMT_YUVA444P:
+        avctx->codec_tag = MKTAG('M', '8', 'Y', 'A');
+        s->format = 0x6a;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->codec_tag = MKTAG('M', '8', 'G', '0');
+        s->format = 0x6b;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format: %d\n",
+               avctx->pix_fmt);
+        return AVERROR_INVALIDDATA;
+    }
+
+    ff_llvidencdsp_init(&s->llvidencdsp);
+
+    s->planes = av_pix_fmt_count_planes(avctx->pix_fmt);
+
+    s->nb_slices = 1;
+
+    for (i = 0; i < s->planes; i++) {
+        s->slices[i] = av_malloc(avctx->width * (avctx->height + 2) +
+                                 AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!s->slices[i]) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    switch (s->frame_pred) {
+    case LEFT:     s->predict = left_predict;     break;
+    case GRADIENT: s->predict = gradient_predict; break;
+    case MEDIAN:   s->predict = median_predict;   break;
+    }
+
+    avctx->extradata_size = 32;
+
+    avctx->extradata = av_mallocz(avctx->extradata_size +
+                                  AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if (!avctx->extradata) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate extradata.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    bytestream2_init_writer(&pb, avctx->extradata, avctx->extradata_size);
+    bytestream2_put_le32(&pb, MKTAG('M', 'A', 'G', 'Y'));
+    bytestream2_put_le32(&pb, 32);
+    bytestream2_put_byte(&pb, 7);
+    bytestream2_put_byte(&pb, s->format);
+    bytestream2_put_byte(&pb, 12);
+    bytestream2_put_byte(&pb, 0);
+
+    bytestream2_put_byte(&pb, 0);
+    bytestream2_put_byte(&pb, 0);
+    bytestream2_put_byte(&pb, 32);
+    bytestream2_put_byte(&pb, 0);
+
+    bytestream2_put_le32(&pb, avctx->width);
+    bytestream2_put_le32(&pb, avctx->height);
+    bytestream2_put_le32(&pb, avctx->width);
+    bytestream2_put_le32(&pb, avctx->height);
+
+    return 0;
+}
+
+static int magy_huff_cmp_len(const void *a, const void *b)
+{
+    const HuffEntry *aa = a, *bb = b;
+    return (aa->len - bb->len) * 256 + aa->sym - bb->sym;
+}
+
+static int huff_cmp_sym(const void *a, const void *b)
+{
+    const HuffEntry *aa = a, *bb = b;
+    return bb->sym - aa->sym;
+}
+
+static void calculate_codes(HuffEntry *he)
+{
+    uint32_t code;
+    int i;
+
+    AV_QSORT(he, 256, HuffEntry, magy_huff_cmp_len);
+
+    code = 1;
+    for (i = 255; i >= 0; i--) {
+        he[i].code  = code >> (32 - he[i].len);
+        code       += 0x80000000u >> (he[i].len - 1);
+    }
+
+    AV_QSORT(he, 256, HuffEntry, huff_cmp_sym);
+}
+
+static void count_usage(uint8_t *src, int width,
+                        int height, PTable *counts)
+{
+    int i, j;
+
+    for (j = 0; j < height; j++) {
+        for (i = 0; i < width; i++) {
+            counts[src[i]].prob++;
+        }
+        src += width;
+    }
+}
+
+typedef struct PackageMergerList {
+    int nitems;             ///< number of items in the list and probability      ex. 4
+    int item_idx[515];      ///< index range for each item in items                   0, 2, 5, 9, 13
+    int probability[514];   ///< probability of each item                             3, 8, 18, 46
+    int items[257 * 16];    ///< chain of all individual values that make up items    A, B, A, B, C, A, B, C, D, C, D, D, E
+} PackageMergerList;
+
+static int compare_by_prob(const void *a, const void *b)
+{
+    PTable a_val = *(PTable *)a;
+    PTable b_val = *(PTable *)b;
+    return a_val.prob - b_val.prob;
+}
+
+static void magy_huffman_compute_bits(PTable *prob_table, HuffEntry *distincts,
+                                      int size, int max_length)
+{
+    PackageMergerList list_a, list_b, *to = &list_a, *from = &list_b, *temp;
+    int times, i, j, k;
+    int nbits[257] = {0};
+    int min;
+
+    av_assert0(max_length > 0);
+
+    to->nitems = 0;
+    from->nitems = 0;
+    to->item_idx[0] = 0;
+    from->item_idx[0] = 0;
+    AV_QSORT(prob_table, size, PTable, compare_by_prob);
+
+    for (times = 0; times <= max_length; times++) {
+        to->nitems = 0;
+        to->item_idx[0] = 0;
+
+        j = 0;
+        k = 0;
+
+        if (times < max_length) {
+            i = 0;
+        }
+        while (i < size || j + 1 < from->nitems) {
+            to->nitems++;
+            to->item_idx[to->nitems] = to->item_idx[to->nitems - 1];
+            if (i < size &&
+                (j + 1 >= from->nitems ||
+                 prob_table[i].prob <
+                     from->probability[j] + from->probability[j + 1])) {
+                to->items[to->item_idx[to->nitems]++] = prob_table[i].value;
+                to->probability[to->nitems - 1] = prob_table[i].prob;
+                i++;
+            } else {
+                for (k = from->item_idx[j]; k < from->item_idx[j + 2]; k++) {
+                    to->items[to->item_idx[to->nitems]++] = from->items[k];
+                }
+                to->probability[to->nitems - 1] =
+                    from->probability[j] + from->probability[j + 1];
+                j += 2;
+            }
+        }
+        temp = to;
+        to = from;
+        from = temp;
+    }
+
+    min = (size - 1 < from->nitems) ? size - 1 : from->nitems;
+    for (i = 0; i < from->item_idx[min]; i++) {
+        nbits[from->items[i]]++;
+    }
+
+    for (i = 0; i < size; i++) {
+        distincts[i].sym = i;
+        distincts[i].len = nbits[i];
+    }
+}
+
+static int encode_table(AVCodecContext *avctx, uint8_t *dst,
+                        int width, int height,
+                        PutBitContext *pb, HuffEntry *he)
+{
+    PTable counts[256] = { {0} };
+    int i;
+
+    count_usage(dst, width, height, counts);
+
+    for (i = 0; i < 256; i++) {
+        counts[i].prob++;
+        counts[i].value = 255 - i;
+    }
+
+    magy_huffman_compute_bits(counts, he, 256, 12);
+
+    calculate_codes(he);
+
+    for (i = 0; i < 256; i++) {
+        put_bits(pb, 1, 0);
+        put_bits(pb, 7, he[i].len);
+    }
+
+    return 0;
+}
+
+static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size,
+                        int width, int height, HuffEntry *he, int prediction)
+{
+    PutBitContext pb;
+    int i, j;
+    int count;
+
+    init_put_bits(&pb, dst, dst_size);
+
+    put_bits(&pb, 8, 0);
+    put_bits(&pb, 8, prediction);
+
+    for (j = 0; j < height; j++) {
+        for (i = 0; i < width; i++) {
+            const int idx = src[i];
+            put_bits(&pb, he[idx].len, he[idx].code);
+        }
+
+        src += width;
+    }
+
+    count = put_bits_count(&pb) & 0x1F;
+
+    if (count)
+        put_bits(&pb, 32 - count, 0);
+
+    count = put_bits_count(&pb);
+
+    flush_put_bits(&pb);
+
+    return count >> 3;
+}
+
+static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *frame, int *got_packet)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    PutByteContext pb;
+    const int width = avctx->width, height = avctx->height;
+    int pos, slice, i, j, ret = 0;
+
+    ret = ff_alloc_packet2(avctx, pkt, (256 + 4 * s->nb_slices + width * height) *
+                           s->planes + 256, 0);
+    if (ret < 0)
+        return ret;
+
+    bytestream2_init_writer(&pb, pkt->data, pkt->size);
+    bytestream2_put_le32(&pb, MKTAG('M', 'A', 'G', 'Y'));
+    bytestream2_put_le32(&pb, 32); // header size
+    bytestream2_put_byte(&pb, 7);  // version
+    bytestream2_put_byte(&pb, s->format);
+    bytestream2_put_byte(&pb, 12); // max huffman length
+    bytestream2_put_byte(&pb, 0);
+
+    bytestream2_put_byte(&pb, 0);
+    bytestream2_put_byte(&pb, 0);
+    bytestream2_put_byte(&pb, 32); // coder type
+    bytestream2_put_byte(&pb, 0);
+
+    bytestream2_put_le32(&pb, avctx->width);
+    bytestream2_put_le32(&pb, avctx->height);
+    bytestream2_put_le32(&pb, avctx->width);
+    bytestream2_put_le32(&pb, avctx->height);
+    bytestream2_put_le32(&pb, 0);
+
+    for (i = 0; i < s->planes; i++) {
+        bytestream2_put_le32(&pb, 0);
+        for (j = 1; j < s->nb_slices; j++) {
+            bytestream2_put_le32(&pb, 0);
+        }
+    }
+
+    bytestream2_put_byte(&pb, s->planes);
+
+    for (i = 0; i < s->planes; i++) {
+        for (slice = 0; slice < s->nb_slices; slice++) {
+            bytestream2_put_byte(&pb, i);
+        }
+    }
+
+    if (s->correlate) {
+        uint8_t *r, *g, *b;
+        AVFrame *p = av_frame_clone(frame);
+
+        g = p->data[0];
+        b = p->data[1];
+        r = p->data[2];
+
+        for (i = 0; i < height; i++) {
+            s->llvidencdsp.diff_bytes(b, b, g, width);
+            s->llvidencdsp.diff_bytes(r, r, g, width);
+            g += p->linesize[0];
+            b += p->linesize[1];
+            r += p->linesize[2];
+        }
+
+        FFSWAP(uint8_t*, p->data[0], p->data[1]);
+        FFSWAP(int, p->linesize[0], p->linesize[1]);
+
+        for (i = 0; i < s->planes; i++) {
+            for (slice = 0; slice < s->nb_slices; slice++) {
+                s->predict(s, p->data[i], s->slices[i], p->linesize[i],
+                               p->width, p->height);
+            }
+        }
+
+        av_frame_free(&p);
+    } else {
+        for (i = 0; i < s->planes; i++) {
+            for (slice = 0; slice < s->nb_slices; slice++) {
+                s->predict(s, frame->data[i], s->slices[i], frame->linesize[i],
+                           AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
+                           AV_CEIL_RSHIFT(frame->height, s->vshift[i]));
+            }
+        }
+    }
+
+    init_put_bits(&s->pb, pkt->data + bytestream2_tell_p(&pb), bytestream2_get_bytes_left_p(&pb));
+
+    for (i = 0; i < s->planes; i++) {
+        encode_table(avctx, s->slices[i],
+                     AV_CEIL_RSHIFT(frame->width,  s->hshift[i]),
+                     AV_CEIL_RSHIFT(frame->height, s->vshift[i]),
+                     &s->pb, s->he[i]);
+    }
+    s->tables_size = (put_bits_count(&s->pb) + 7) >> 3;
+    bytestream2_skip_p(&pb, s->tables_size);
+
+    for (i = 0; i < s->planes; i++) {
+        unsigned slice_size;
+
+        s->slice_pos[i] = bytestream2_tell_p(&pb);
+        slice_size = encode_slice(s->slices[i], pkt->data + bytestream2_tell_p(&pb),
+                                  bytestream2_get_bytes_left_p(&pb),
+                                  AV_CEIL_RSHIFT(frame->width,  s->hshift[i]),
+                                  AV_CEIL_RSHIFT(frame->height, s->vshift[i]),
+                                  s->he[i], s->frame_pred);
+        bytestream2_skip_p(&pb, slice_size);
+    }
+
+    pos = bytestream2_tell_p(&pb);
+    bytestream2_seek_p(&pb, 32, SEEK_SET);
+    bytestream2_put_le32(&pb, s->slice_pos[0] - 32);
+    for (i = 0; i < s->planes; i++) {
+        bytestream2_put_le32(&pb, s->slice_pos[i] - 32);
+    }
+    bytestream2_seek_p(&pb, pos, SEEK_SET);
+
+    pkt->size   = bytestream2_tell_p(&pb);
+    pkt->flags |= AV_PKT_FLAG_KEY;
+
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int magy_encode_close(AVCodecContext *avctx)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->planes; i++)
+        av_freep(&s->slices[i]);
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(MagicYUVContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "pred", "Prediction method", OFFSET(frame_pred), AV_OPT_TYPE_INT, {.i64=LEFT}, LEFT, MEDIAN, VE, "pred" },
+    { "left",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT },     0, 0, VE, "pred" },
+    { "gradient", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = GRADIENT }, 0, 0, VE, "pred" },
+    { "median",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN },   0, 0, VE, "pred" },
+    { NULL},
+};
+
+static const AVClass magicyuv_class = {
+    .class_name = "magicyuv",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_magicyuv_encoder = {
+    .name             = "magicyuv",
+    .long_name        = NULL_IF_CONFIG_SMALL("MagicYUV video"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_MAGICYUV,
+    .priv_data_size   = sizeof(MagicYUVContext),
+    .priv_class       = &magicyuv_class,
+    .init             = magy_encode_init,
+    .close            = magy_encode_close,
+    .encode2          = magy_encode_frame,
+    .capabilities     = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts         = (const enum AVPixelFormat[]) {
+                          AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP, AV_PIX_FMT_YUV422P,
+                          AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA444P, AV_PIX_FMT_GRAY8,
+                          AV_PIX_FMT_NONE
+                      },
+};
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index 0afc82a..1c35664 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2001, 2002 Fabrice Bellard
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_MATHOPS_H
@@ -25,23 +25,21 @@
 #include <stdint.h>
 
 #include "libavutil/common.h"
+#include "libavutil/reverse.h"
 #include "config.h"
 
 #define MAX_NEG_CROP 1024
 
 extern const uint32_t ff_inverse[257];
-extern const uint8_t  ff_reverse[256];
 extern const uint8_t ff_sqrt_tab[256];
 extern const uint8_t ff_crop_tab[256 + 2 * MAX_NEG_CROP];
 extern const uint8_t ff_zigzag_direct[64];
-extern const uint8_t ff_zigzag_scan[16];
+extern const uint8_t ff_zigzag_scan[16+1];
 
 #if   ARCH_ARM
 #   include "arm/mathops.h"
 #elif ARCH_AVR32
 #   include "avr32/mathops.h"
-#elif ARCH_BFIN
-#   include "bfin/mathops.h"
 #elif ARCH_MIPS
 #   include "mips/mathops.h"
 #elif ARCH_PPC
@@ -114,6 +112,20 @@ static inline av_const int mid_pred(int a, int b, int c)
 }
 #endif
 
+#ifndef median4
+#define median4 median4
+static inline av_const int median4(int a, int b, int c, int d)
+{
+    if (a < b) {
+        if (c < d) return (FFMIN(b, d) + FFMAX(a, c)) / 2;
+        else       return (FFMIN(b, c) + FFMAX(a, d)) / 2;
+    } else {
+        if (c < d) return (FFMIN(a, d) + FFMAX(b, c)) / 2;
+        else       return (FFMIN(a, c) + FFMAX(b, d)) / 2;
+    }
+}
+#endif
+
 #ifndef sign_extend
 static inline av_const int sign_extend(int val, unsigned bits)
 {
@@ -190,6 +202,8 @@ if ((y) < (x)) {\
 #   define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32))
 #endif /* FASTDIV */
 
+#ifndef ff_sqrt
+#define ff_sqrt ff_sqrt
 static inline av_const unsigned int ff_sqrt(unsigned int a)
 {
     unsigned int b;
@@ -209,6 +223,12 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)
 
     return b - (a < b * b);
 }
+#endif
+
+static inline av_const float ff_sqrf(float a)
+{
+    return a*a;
+}
 
 static inline int8_t ff_u8_to_s8(uint8_t a)
 {
@@ -220,4 +240,12 @@ static inline int8_t ff_u8_to_s8(uint8_t a)
     return b.s8;
 }
 
+static av_always_inline uint32_t bitswap_32(uint32_t x)
+{
+    return (uint32_t)ff_reverse[ x        & 0xFF] << 24 |
+           (uint32_t)ff_reverse[(x >> 8)  & 0xFF] << 16 |
+           (uint32_t)ff_reverse[(x >> 16) & 0xFF] << 8  |
+           (uint32_t)ff_reverse[ x >> 24];
+}
+
 #endif /* AVCODEC_MATHOPS_H */
diff --git a/libavcodec/mathtables.c b/libavcodec/mathtables.c
index d198225..81eabc7 100644
--- a/libavcodec/mathtables.c
+++ b/libavcodec/mathtables.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,25 +71,6 @@ const uint8_t ff_sqrt_tab[256]={
 240,240,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255
 };
 
-const uint8_t ff_reverse[256] = {
-0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
-0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
-0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
-0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
-0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
-0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
-0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
-0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
-0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
-0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
-0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
-0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
-0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
-0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
-0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
-0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF,
-};
-
 #define times4(x) x, x, x, x
 #define times256(x) times4(times4(times4(times4(times4(x)))))
 
@@ -123,7 +106,7 @@ const uint8_t ff_zigzag_direct[64] = {
     53, 60, 61, 54, 47, 55, 62, 63
 };
 
-const uint8_t ff_zigzag_scan[16] = {
+const uint8_t ff_zigzag_scan[16+1] = {
     0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4,
     1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
     1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4,
diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
new file mode 100644
index 0000000..6f35059
--- /dev/null
+++ b/libavcodec/mdct15.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Celt non-power of 2 iMDCT
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stddef.h>
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+
+#include "mdct15.h"
+
+#define FFT_FLOAT 1
+#include "fft-internal.h"
+
+#define CMUL3(c, a, b) CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
+
+av_cold void ff_mdct15_uninit(MDCT15Context **ps)
+{
+    MDCT15Context *s = *ps;
+
+    if (!s)
+        return;
+
+    ff_fft_end(&s->ptwo_fft);
+
+    av_freep(&s->pfa_prereindex);
+    av_freep(&s->pfa_postreindex);
+    av_freep(&s->twiddle_exptab);
+    av_freep(&s->tmp);
+
+    av_freep(ps);
+}
+
+static inline int init_pfa_reindex_tabs(MDCT15Context *s)
+{
+    int i, j;
+    const int b_ptwo = s->ptwo_fft.nbits; /* Bits for the power of two FFTs */
+    const int l_ptwo = 1 << b_ptwo; /* Total length for the power of two FFTs */
+    const int inv_1 = l_ptwo << ((4 - b_ptwo) & 3); /* (2^b_ptwo)^-1 mod 15 */
+    const int inv_2 = 0xeeeeeeef & ((1U << b_ptwo) - 1); /* 15^-1 mod 2^b_ptwo */
+
+    s->pfa_prereindex = av_malloc_array(15 * l_ptwo, sizeof(*s->pfa_prereindex));
+    if (!s->pfa_prereindex)
+        return 1;
+
+    s->pfa_postreindex = av_malloc_array(15 * l_ptwo, sizeof(*s->pfa_postreindex));
+    if (!s->pfa_postreindex)
+        return 1;
+
+    /* Pre/Post-reindex */
+    for (i = 0; i < l_ptwo; i++) {
+        for (j = 0; j < 15; j++) {
+            const int q_pre = ((l_ptwo * j)/15 + i) >> b_ptwo;
+            const int q_post = (((j*inv_1)/15) + (i*inv_2)) >> b_ptwo;
+            const int k_pre = 15*i + (j - q_pre*15)*(1 << b_ptwo);
+            const int k_post = i*inv_2*15 + j*inv_1 - 15*q_post*l_ptwo;
+            s->pfa_prereindex[i*15 + j] = k_pre << 1;
+            s->pfa_postreindex[k_post] = l_ptwo*j + i;
+        }
+    }
+
+    return 0;
+}
+
+/* Stride is hardcoded to 3 */
+static inline void fft5(FFTComplex *out, FFTComplex *in, FFTComplex exptab[2])
+{
+    FFTComplex z0[4], t[6];
+
+    t[0].re = in[3].re + in[12].re;
+    t[0].im = in[3].im + in[12].im;
+    t[1].im = in[3].re - in[12].re;
+    t[1].re = in[3].im - in[12].im;
+    t[2].re = in[6].re + in[ 9].re;
+    t[2].im = in[6].im + in[ 9].im;
+    t[3].im = in[6].re - in[ 9].re;
+    t[3].re = in[6].im - in[ 9].im;
+
+    out[0].re = in[0].re + in[3].re + in[6].re + in[9].re + in[12].re;
+    out[0].im = in[0].im + in[3].im + in[6].im + in[9].im + in[12].im;
+
+    t[4].re = exptab[0].re * t[2].re - exptab[1].re * t[0].re;
+    t[4].im = exptab[0].re * t[2].im - exptab[1].re * t[0].im;
+    t[0].re = exptab[0].re * t[0].re - exptab[1].re * t[2].re;
+    t[0].im = exptab[0].re * t[0].im - exptab[1].re * t[2].im;
+    t[5].re = exptab[0].im * t[3].re - exptab[1].im * t[1].re;
+    t[5].im = exptab[0].im * t[3].im - exptab[1].im * t[1].im;
+    t[1].re = exptab[0].im * t[1].re + exptab[1].im * t[3].re;
+    t[1].im = exptab[0].im * t[1].im + exptab[1].im * t[3].im;
+
+    z0[0].re = t[0].re - t[1].re;
+    z0[0].im = t[0].im - t[1].im;
+    z0[1].re = t[4].re + t[5].re;
+    z0[1].im = t[4].im + t[5].im;
+
+    z0[2].re = t[4].re - t[5].re;
+    z0[2].im = t[4].im - t[5].im;
+    z0[3].re = t[0].re + t[1].re;
+    z0[3].im = t[0].im + t[1].im;
+
+    out[1].re = in[0].re + z0[3].re;
+    out[1].im = in[0].im + z0[0].im;
+    out[2].re = in[0].re + z0[2].re;
+    out[2].im = in[0].im + z0[1].im;
+    out[3].re = in[0].re + z0[1].re;
+    out[3].im = in[0].im + z0[2].im;
+    out[4].re = in[0].re + z0[0].re;
+    out[4].im = in[0].im + z0[3].im;
+}
+
+static void fft15_c(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride)
+{
+    int k;
+    FFTComplex tmp1[5], tmp2[5], tmp3[5];
+
+    fft5(tmp1, in + 0, exptab + 19);
+    fft5(tmp2, in + 1, exptab + 19);
+    fft5(tmp3, in + 2, exptab + 19);
+
+    for (k = 0; k < 5; k++) {
+        FFTComplex t[2];
+
+        CMUL3(t[0], tmp2[k], exptab[k]);
+        CMUL3(t[1], tmp3[k], exptab[2 * k]);
+        out[stride*k].re = tmp1[k].re + t[0].re + t[1].re;
+        out[stride*k].im = tmp1[k].im + t[0].im + t[1].im;
+
+        CMUL3(t[0], tmp2[k], exptab[k + 5]);
+        CMUL3(t[1], tmp3[k], exptab[2 * (k + 5)]);
+        out[stride*(k + 5)].re = tmp1[k].re + t[0].re + t[1].re;
+        out[stride*(k + 5)].im = tmp1[k].im + t[0].im + t[1].im;
+
+        CMUL3(t[0], tmp2[k], exptab[k + 10]);
+        CMUL3(t[1], tmp3[k], exptab[2 * k + 5]);
+        out[stride*(k + 10)].re = tmp1[k].re + t[0].re + t[1].re;
+        out[stride*(k + 10)].im = tmp1[k].im + t[0].im + t[1].im;
+    }
+}
+
+static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride)
+{
+    int i, j;
+    const int len4 = s->len4, len3 = len4 * 3, len8 = len4 >> 1;
+    const int l_ptwo = 1 << s->ptwo_fft.nbits;
+    FFTComplex fft15in[15];
+
+    /* Folding and pre-reindexing */
+    for (i = 0; i < l_ptwo; i++) {
+        for (j = 0; j < 15; j++) {
+            const int k = s->pfa_prereindex[i*15 + j];
+            FFTComplex tmp, exp = s->twiddle_exptab[k >> 1];
+            if (k < len4) {
+                tmp.re = -src[ len4 + k] + src[1*len4 - 1 - k];
+                tmp.im = -src[ len3 + k] - src[1*len3 - 1 - k];
+            } else {
+                tmp.re = -src[ len4 + k] - src[5*len4 - 1 - k];
+                tmp.im =  src[-len4 + k] - src[1*len3 - 1 - k];
+            }
+            CMUL(fft15in[j].im, fft15in[j].re, tmp.re, tmp.im, exp.re, exp.im);
+        }
+        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
+    }
+
+    /* Then a 15xN FFT (where N is a power of two) */
+    for (i = 0; i < 15; i++)
+        s->ptwo_fft.fft_calc(&s->ptwo_fft, s->tmp + l_ptwo*i);
+
+    /* Reindex again, apply twiddles and output */
+    for (i = 0; i < len8; i++) {
+        const int i0 = len8 + i, i1 = len8 - i - 1;
+        const int s0 = s->pfa_postreindex[i0], s1 = s->pfa_postreindex[i1];
+
+        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], s->tmp[s0].re, s->tmp[s0].im,
+             s->twiddle_exptab[i0].im, s->twiddle_exptab[i0].re);
+        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], s->tmp[s1].re, s->tmp[s1].im,
+             s->twiddle_exptab[i1].im, s->twiddle_exptab[i1].re);
+    }
+}
+
+static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
+                         ptrdiff_t stride)
+{
+    FFTComplex fft15in[15];
+    FFTComplex *z = (FFTComplex *)dst;
+    int i, j, len8 = s->len4 >> 1, l_ptwo = 1 << s->ptwo_fft.nbits;
+    const float *in1 = src, *in2 = src + (s->len2 - 1) * stride;
+
+    /* Reindex input, putting it into a buffer and doing an Nx15 FFT */
+    for (i = 0; i < l_ptwo; i++) {
+        for (j = 0; j < 15; j++) {
+            const int k = s->pfa_prereindex[i*15 + j];
+            FFTComplex tmp = { in2[-k*stride], in1[k*stride] };
+            CMUL3(fft15in[j], tmp, s->twiddle_exptab[k >> 1]);
+        }
+        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
+    }
+
+    /* Then a 15xN FFT (where N is a power of two) */
+    for (i = 0; i < 15; i++)
+        s->ptwo_fft.fft_calc(&s->ptwo_fft, s->tmp + l_ptwo*i);
+
+    /* Reindex again, apply twiddles and output */
+    s->postreindex(z, s->tmp, s->twiddle_exptab, s->pfa_postreindex, len8);
+}
+
+static void postrotate_c(FFTComplex *out, FFTComplex *in, FFTComplex *exp,
+                         int *lut, ptrdiff_t len8)
+{
+    int i;
+
+    /* Reindex again, apply twiddles and output */
+    for (i = 0; i < len8; i++) {
+        const int i0 = len8 + i, i1 = len8 - i - 1;
+        const int s0 = lut[i0], s1 = lut[i1];
+
+        CMUL(out[i1].re, out[i0].im, in[s1].im, in[s1].re, exp[i1].im, exp[i1].re);
+        CMUL(out[i0].re, out[i1].im, in[s0].im, in[s0].re, exp[i0].im, exp[i0].re);
+    }
+}
+
+av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
+{
+    MDCT15Context *s;
+    double alpha, theta;
+    int len2 = 15 * (1 << N);
+    int len  = 2 * len2;
+    int i;
+
+    /* Tested and verified to work on everything in between */
+    if ((N < 2) || (N > 13))
+        return AVERROR(EINVAL);
+
+    s = av_mallocz(sizeof(*s));
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    s->fft_n       = N - 1;
+    s->len4        = len2 / 2;
+    s->len2        = len2;
+    s->inverse     = inverse;
+    s->fft15       = fft15_c;
+    s->mdct        = mdct15;
+    s->imdct_half  = imdct15_half;
+    s->postreindex = postrotate_c;
+
+    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
+        goto fail;
+
+    if (init_pfa_reindex_tabs(s))
+        goto fail;
+
+    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
+    if (!s->tmp)
+        goto fail;
+
+    s->twiddle_exptab = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
+    if (!s->twiddle_exptab)
+        goto fail;
+
+    theta = 0.125f + (scale < 0 ? s->len4 : 0);
+    scale = sqrt(fabs(scale));
+    for (i = 0; i < s->len4; i++) {
+        alpha = 2 * M_PI * (i + theta) / len;
+        s->twiddle_exptab[i].re = cosf(alpha) * scale;
+        s->twiddle_exptab[i].im = sinf(alpha) * scale;
+    }
+
+    /* 15-point FFT exptab */
+    for (i = 0; i < 19; i++) {
+        if (i < 15) {
+            double theta = (2.0f * M_PI * i) / 15.0f;
+            if (!s->inverse)
+                theta *= -1;
+            s->exptab[i].re = cosf(theta);
+            s->exptab[i].im = sinf(theta);
+        } else { /* Wrap around to simplify fft15 */
+            s->exptab[i] = s->exptab[i - 15];
+        }
+    }
+
+    /* 5-point FFT exptab */
+    s->exptab[19].re = cosf(2.0f * M_PI / 5.0f);
+    s->exptab[19].im = sinf(2.0f * M_PI / 5.0f);
+    s->exptab[20].re = cosf(1.0f * M_PI / 5.0f);
+    s->exptab[20].im = sinf(1.0f * M_PI / 5.0f);
+
+    /* Invert the phase for an inverse transform, do nothing for a forward transform */
+    if (s->inverse) {
+        s->exptab[19].im *= -1;
+        s->exptab[20].im *= -1;
+    }
+
+    if (ARCH_X86)
+        ff_mdct15_init_x86(s);
+
+    *ps = s;
+
+    return 0;
+
+fail:
+    ff_mdct15_uninit(&s);
+    return AVERROR(ENOMEM);
+}
diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
new file mode 100644
index 0000000..42e60f3
--- /dev/null
+++ b/libavcodec/mdct15.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MDCT15_H
+#define AVCODEC_MDCT15_H
+
+#include <stddef.h>
+
+#include "fft.h"
+
+typedef struct MDCT15Context {
+    int fft_n;
+    int len2;
+    int len4;
+    int inverse;
+    int *pfa_prereindex;
+    int *pfa_postreindex;
+
+    FFTContext ptwo_fft;
+    FFTComplex *tmp;
+    FFTComplex *twiddle_exptab;
+
+    DECLARE_ALIGNED(32, FFTComplex, exptab)[64];
+
+    /* 15-point FFT */
+    void (*fft15)(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
+
+    /* PFA postrotate and exptab */
+    void (*postreindex)(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
+
+    /* Calculate a full 2N -> N MDCT */
+    void (*mdct)(struct MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
+
+    /* Calculate the middle half of the iMDCT */
+    void (*imdct_half)(struct MDCT15Context *s, float *dst, const float *src,
+                       ptrdiff_t stride);
+} MDCT15Context;
+
+/* Init an (i)MDCT of the length 2 * 15 * (2^N) */
+int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
+void ff_mdct15_uninit(MDCT15Context **ps);
+
+void ff_mdct15_init_x86(MDCT15Context *s);
+
+#endif /* AVCODEC_MDCT15_H */
diff --git a/libavcodec/mdct_fixed.c b/libavcodec/mdct_fixed.c
index 9e06861..aabf0c8 100644
--- a/libavcodec/mdct_fixed.c
+++ b/libavcodec/mdct_fixed.c
@@ -1,22 +1,23 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define FFT_FIXED_32 0
 #include "mdct_template.c"
 
 /* same as ff_mdct_calcw_c with double-width unscaled output */
@@ -38,13 +39,13 @@ void ff_mdct_calcw_c(FFTContext *s, FFTDouble *out, const FFTSample *input)
 
     /* pre rotation */
     for(i=0;i<n8;i++) {
-        re = RSCALE(-input[2*i+n3] - input[n3-1-2*i]);
-        im = RSCALE(-input[n4+2*i] + input[n4-1-2*i]);
+        re = RSCALE(-input[2*i+n3], - input[n3-1-2*i]);
+        im = RSCALE(-input[n4+2*i], + input[n4-1-2*i]);
         j = revtab[i];
         CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]);
 
-        re = RSCALE( input[2*i]    - input[n2-1-2*i]);
-        im = RSCALE(-input[n2+2*i] - input[ n-1-2*i]);
+        re = RSCALE( input[2*i]   , - input[n2-1-2*i]);
+        im = RSCALE(-input[n2+2*i], - input[ n-1-2*i]);
         j = revtab[n8 + i];
         CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
     }
diff --git a/libavcodec/mdct_fixed_32.c b/libavcodec/mdct_fixed_32.c
new file mode 100644
index 0000000..5a34dfe
--- /dev/null
+++ b/libavcodec/mdct_fixed_32.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "mdct_template.c"
diff --git a/libavcodec/mdct_float.c b/libavcodec/mdct_float.c
index a0a62b3..cff2d21 100644
--- a/libavcodec/mdct_float.c
+++ b/libavcodec/mdct_float.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 1
+#define FFT_FIXED_32 0
 #include "mdct_template.c"
diff --git a/libavcodec/mdct_template.c b/libavcodec/mdct_template.c
index 5b3a6ff..e0ad9f1 100644
--- a/libavcodec/mdct_template.c
+++ b/libavcodec/mdct_template.c
@@ -2,26 +2,27 @@
  * MDCT/IMDCT transforms
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
 #include <string.h>
 #include "libavutil/common.h"
+#include "libavutil/libm.h"
 #include "libavutil/mathematics.h"
 #include "fft.h"
 #include "fft-internal.h"
@@ -32,9 +33,13 @@
  */
 
 #if FFT_FLOAT
-#   define RSCALE(x) (x)
+#   define RSCALE(x, y) ((x) + (y))
 #else
-#   define RSCALE(x) ((x) >> 1)
+#if FFT_FIXED_32
+#   define RSCALE(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)
+#else /* FFT_FIXED_32 */
+#   define RSCALE(x, y) ((int)((x) + (unsigned)(y)) >> 1)
+#endif /* FFT_FIXED_32 */
 #endif
 
 /**
@@ -56,27 +61,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
         goto fail;
 
-    s->imdct_calc  = ff_imdct_calc_c;
-    s->imdct_half  = ff_imdct_half_c;
-    s->mdct_calc   = ff_mdct_calc_c;
-
-#if FFT_FLOAT
-    if (ARCH_AARCH64)
-        ff_mdct_init_aarch64(s);
-    if (ARCH_ARM)
-        ff_mdct_init_arm(s);
-    if (ARCH_PPC)
-        ff_mdct_init_ppc(s);
-    if (ARCH_X86)
-        ff_mdct_init_x86(s);
-    s->mdct_calcw  = s->mdct_calc;
-#else
-    s->mdct_calcw  = ff_mdct_calcw_c;
-    if (ARCH_ARM)
-        ff_mdct_fixed_init_arm(s);
-#endif
-
-    s->tcos = av_malloc(n/2 * sizeof(FFTSample));
+    s->tcos = av_malloc_array(n/2, sizeof(FFTSample));
     if (!s->tcos)
         goto fail;
 
@@ -97,8 +82,13 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     scale = sqrt(fabs(scale));
     for(i=0;i<n4;i++) {
         alpha = 2 * M_PI * (i + theta) / n;
+#if FFT_FIXED_32
+        s->tcos[i*tstep] = lrint(-cos(alpha) * 2147483648.0);
+        s->tsin[i*tstep] = lrint(-sin(alpha) * 2147483648.0);
+#else
         s->tcos[i*tstep] = FIX15(-cos(alpha) * scale);
         s->tsin[i*tstep] = FIX15(-sin(alpha) * scale);
+#endif
     }
     return 0;
  fail:
@@ -191,13 +181,13 @@ void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
 
     /* pre rotation */
     for(i=0;i<n8;i++) {
-        re = RSCALE(-input[2*i+n3] - input[n3-1-2*i]);
-        im = RSCALE(-input[n4+2*i] + input[n4-1-2*i]);
+        re = RSCALE(-input[2*i+n3], - input[n3-1-2*i]);
+        im = RSCALE(-input[n4+2*i], + input[n4-1-2*i]);
         j = revtab[i];
         CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]);
 
-        re = RSCALE( input[2*i]    - input[n2-1-2*i]);
-        im = RSCALE(-input[n2+2*i] - input[ n-1-2*i]);
+        re = RSCALE( input[2*i]   , - input[n2-1-2*i]);
+        im = RSCALE(-input[n2+2*i], - input[ n-1-2*i]);
         j = revtab[n8 + i];
         CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
     }
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index c0db368..330b761 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -4,20 +4,20 @@
  *
  * based upon code from Sebastian Jedruszkiewicz <elf@frogger.rules.pl>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "bswapdsp.h"
 #include "idctdsp.h"
 #include "mpeg12.h"
 #include "thread.h"
@@ -36,6 +37,7 @@
 typedef struct MDECContext {
     AVCodecContext *avctx;
     BlockDSPContext bdsp;
+    BswapDSPContext bbdsp;
     IDCTDSPContext idsp;
     ThreadFrame frame;
     GetBitContext gb;
@@ -46,7 +48,8 @@ typedef struct MDECContext {
     int mb_width;
     int mb_height;
     int mb_x, mb_y;
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64];
     uint8_t *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     int block_last_index[6];
@@ -59,7 +62,7 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
     int component;
     RLTable *rl = &ff_rl_mpeg1;
     uint8_t * const scantable = a->scantable.permutated;
-    const uint16_t *quant_matrix = ff_mpeg1_default_intra_matrix;
+    const uint16_t *quant_matrix = a->quant_matrix;
     const int qscale = a->qscale;
 
     /* DC coefficient */
@@ -71,7 +74,7 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
         if (diff >= 0xffff)
             return AVERROR_INVALIDDATA;
         a->last_dc[component] += diff;
-        block[0] = a->last_dc[component] << 3;
+        block[0] = a->last_dc[component] * (1 << 3);
     }
 
     i = 0;
@@ -109,11 +112,11 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
                 j = scantable[i];
                 if (level < 0) {
                     level = -level;
-                    level = (level * qscale * quant_matrix[j]) >> 3;
+                    level = (level * (unsigned)qscale * quant_matrix[j]) >> 3;
                     level = (level - 1) | 1;
                     level = -level;
                 } else {
-                    level = (level * qscale * quant_matrix[j]) >> 3;
+                    level = (level * (unsigned)qscale * quant_matrix[j]) >> 3;
                     level = (level - 1) | 1;
                 }
             }
@@ -171,23 +174,19 @@ static int decode_frame(AVCodecContext *avctx,
     const uint8_t *buf    = avpkt->data;
     int buf_size          = avpkt->size;
     ThreadFrame frame     = { .f = data };
-    int i, ret;
+    int ret;
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
     frame.f->pict_type = AV_PICTURE_TYPE_I;
     frame.f->key_frame = 1;
 
-    av_fast_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size);
     if (!a->bitstream_buffer)
         return AVERROR(ENOMEM);
-    for (i = 0; i < buf_size; i += 2) {
-        a->bitstream_buffer[i]     = buf[i + 1];
-        a->bitstream_buffer[i + 1] = buf[i];
-    }
-    init_get_bits(&a->gb, a->bitstream_buffer, buf_size * 8);
+    a->bbdsp.bswap16_buf((uint16_t *)a->bitstream_buffer, (uint16_t *)buf, (buf_size + 1) / 2);
+    if ((ret = init_get_bits8(&a->gb, a->bitstream_buffer, buf_size)) < 0)
+        return ret;
 
     /* skip over 4 preamble bytes in stream (typically 0xXX 0xXX 0x00 0x38) */
     skip_bits(&a->gb, 32);
@@ -214,26 +213,34 @@ static int decode_frame(AVCodecContext *avctx,
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     MDECContext * const a = avctx->priv_data;
+    int i;
 
     a->mb_width  = (avctx->coded_width  + 15) / 16;
     a->mb_height = (avctx->coded_height + 15) / 16;
 
     a->avctx           = avctx;
 
-    ff_blockdsp_init(&a->bdsp);
+    ff_blockdsp_init(&a->bdsp, avctx);
+    ff_bswapdsp_init(&a->bbdsp);
     ff_idctdsp_init(&a->idsp, avctx);
     ff_mpeg12_init_vlcs();
     ff_init_scantable(a->idsp.idct_permutation, &a->scantable,
                       ff_zigzag_direct);
 
-    if (avctx->idct_algo == FF_IDCT_AUTO)
-        avctx->idct_algo = FF_IDCT_SIMPLE;
     avctx->pix_fmt  = AV_PIX_FMT_YUVJ420P;
     avctx->color_range = AVCOL_RANGE_JPEG;
 
+    /* init q matrix */
+    for (i = 0; i < 64; i++) {
+        int j = a->idsp.idct_permutation[i];
+
+        a->quant_matrix[j] = ff_mpeg1_default_intra_matrix[i];
+    }
+
     return 0;
 }
 
+#if HAVE_THREADS
 static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 {
     MDECContext * const a = avctx->priv_data;
@@ -242,6 +249,7 @@ static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index eb98a72..ae248c5 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1,22 +1,27 @@
 /*
- * This file is part of Libav.
+ * DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "avcodec.h"
 #include "copy_block.h"
 #include "simple_idct.h"
@@ -24,13 +29,47 @@
 #include "mpegvideo.h"
 #include "config.h"
 
-uint32_t ff_square_tab[512] = { 0, };
+/* (i - 256) * (i - 256) */
+const uint32_t ff_square_tab[512] = {
+    65536, 65025, 64516, 64009, 63504, 63001, 62500, 62001, 61504, 61009, 60516, 60025, 59536, 59049, 58564, 58081,
+    57600, 57121, 56644, 56169, 55696, 55225, 54756, 54289, 53824, 53361, 52900, 52441, 51984, 51529, 51076, 50625,
+    50176, 49729, 49284, 48841, 48400, 47961, 47524, 47089, 46656, 46225, 45796, 45369, 44944, 44521, 44100, 43681,
+    43264, 42849, 42436, 42025, 41616, 41209, 40804, 40401, 40000, 39601, 39204, 38809, 38416, 38025, 37636, 37249,
+    36864, 36481, 36100, 35721, 35344, 34969, 34596, 34225, 33856, 33489, 33124, 32761, 32400, 32041, 31684, 31329,
+    30976, 30625, 30276, 29929, 29584, 29241, 28900, 28561, 28224, 27889, 27556, 27225, 26896, 26569, 26244, 25921,
+    25600, 25281, 24964, 24649, 24336, 24025, 23716, 23409, 23104, 22801, 22500, 22201, 21904, 21609, 21316, 21025,
+    20736, 20449, 20164, 19881, 19600, 19321, 19044, 18769, 18496, 18225, 17956, 17689, 17424, 17161, 16900, 16641,
+    16384, 16129, 15876, 15625, 15376, 15129, 14884, 14641, 14400, 14161, 13924, 13689, 13456, 13225, 12996, 12769,
+    12544, 12321, 12100, 11881, 11664, 11449, 11236, 11025, 10816, 10609, 10404, 10201, 10000,  9801,  9604,  9409,
+     9216,  9025,  8836,  8649,  8464,  8281,  8100,  7921,  7744,  7569,  7396,  7225,  7056,  6889,  6724,  6561,
+     6400,  6241,  6084,  5929,  5776,  5625,  5476,  5329,  5184,  5041,  4900,  4761,  4624,  4489,  4356,  4225,
+     4096,  3969,  3844,  3721,  3600,  3481,  3364,  3249,  3136,  3025,  2916,  2809,  2704,  2601,  2500,  2401,
+     2304,  2209,  2116,  2025,  1936,  1849,  1764,  1681,  1600,  1521,  1444,  1369,  1296,  1225,  1156,  1089,
+     1024,   961,   900,   841,   784,   729,   676,   625,   576,   529,   484,   441,   400,   361,   324,   289,
+      256,   225,   196,   169,   144,   121,   100,    81,    64,    49,    36,    25,    16,     9,     4,     1,
+        0,     1,     4,     9,    16,    25,    36,    49,    64,    81,   100,   121,   144,   169,   196,   225,
+      256,   289,   324,   361,   400,   441,   484,   529,   576,   625,   676,   729,   784,   841,   900,   961,
+     1024,  1089,  1156,  1225,  1296,  1369,  1444,  1521,  1600,  1681,  1764,  1849,  1936,  2025,  2116,  2209,
+     2304,  2401,  2500,  2601,  2704,  2809,  2916,  3025,  3136,  3249,  3364,  3481,  3600,  3721,  3844,  3969,
+     4096,  4225,  4356,  4489,  4624,  4761,  4900,  5041,  5184,  5329,  5476,  5625,  5776,  5929,  6084,  6241,
+     6400,  6561,  6724,  6889,  7056,  7225,  7396,  7569,  7744,  7921,  8100,  8281,  8464,  8649,  8836,  9025,
+     9216,  9409,  9604,  9801, 10000, 10201, 10404, 10609, 10816, 11025, 11236, 11449, 11664, 11881, 12100, 12321,
+    12544, 12769, 12996, 13225, 13456, 13689, 13924, 14161, 14400, 14641, 14884, 15129, 15376, 15625, 15876, 16129,
+    16384, 16641, 16900, 17161, 17424, 17689, 17956, 18225, 18496, 18769, 19044, 19321, 19600, 19881, 20164, 20449,
+    20736, 21025, 21316, 21609, 21904, 22201, 22500, 22801, 23104, 23409, 23716, 24025, 24336, 24649, 24964, 25281,
+    25600, 25921, 26244, 26569, 26896, 27225, 27556, 27889, 28224, 28561, 28900, 29241, 29584, 29929, 30276, 30625,
+    30976, 31329, 31684, 32041, 32400, 32761, 33124, 33489, 33856, 34225, 34596, 34969, 35344, 35721, 36100, 36481,
+    36864, 37249, 37636, 38025, 38416, 38809, 39204, 39601, 40000, 40401, 40804, 41209, 41616, 42025, 42436, 42849,
+    43264, 43681, 44100, 44521, 44944, 45369, 45796, 46225, 46656, 47089, 47524, 47961, 48400, 48841, 49284, 49729,
+    50176, 50625, 51076, 51529, 51984, 52441, 52900, 53361, 53824, 54289, 54756, 55225, 55696, 56169, 56644, 57121,
+    57600, 58081, 58564, 59049, 59536, 60025, 60516, 61009, 61504, 62001, 62500, 63001, 63504, 64009, 64516, 65025,
+};
 
 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h)
 {
     int s = 0, i;
-    uint32_t *sq = ff_square_tab + 256;
+    const uint32_t *sq = ff_square_tab + 256;
 
     for (i = 0; i < h; i++) {
         s    += sq[pix1[0] - pix2[0]];
@@ -47,7 +86,7 @@ static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h)
 {
     int s = 0, i;
-    uint32_t *sq = ff_square_tab + 256;
+    const uint32_t *sq = ff_square_tab + 256;
 
     for (i = 0; i < h; i++) {
         s    += sq[pix1[0] - pix2[0]];
@@ -68,7 +107,7 @@ static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h)
 {
     int s = 0, i;
-    uint32_t *sq = ff_square_tab + 256;
+    const uint32_t *sq = ff_square_tab + 256;
 
     for (i = 0; i < h; i++) {
         s += sq[pix1[0]  - pix2[0]];
@@ -103,8 +142,8 @@ static int sum_abs_dctelem_c(int16_t *block)
     return sum;
 }
 
-#define avg2(a, b) ((a + b + 1) >> 1)
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
+#define avg2(a, b) (((a) + (b) + 1) >> 1)
+#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
 
 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h)
@@ -134,6 +173,45 @@ static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     return s;
 }
 
+static inline int pix_median_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             ptrdiff_t stride, int h)
+{
+    int s = 0, i, j;
+
+#define V(x) (pix1[x] - pix2[x])
+
+    s    += abs(V(0));
+    s    += abs(V(1) - V(0));
+    s    += abs(V(2) - V(1));
+    s    += abs(V(3) - V(2));
+    s    += abs(V(4) - V(3));
+    s    += abs(V(5) - V(4));
+    s    += abs(V(6) - V(5));
+    s    += abs(V(7) - V(6));
+    s    += abs(V(8) - V(7));
+    s    += abs(V(9) - V(8));
+    s    += abs(V(10) - V(9));
+    s    += abs(V(11) - V(10));
+    s    += abs(V(12) - V(11));
+    s    += abs(V(13) - V(12));
+    s    += abs(V(14) - V(13));
+    s    += abs(V(15) - V(14));
+
+    pix1 += stride;
+    pix2 += stride;
+
+    for (i = 1; i < h; i++) {
+        s    += abs(V(0) - V(-stride));
+        for (j = 1; j < 16; j++)
+            s    += abs(V(j) - mid_pred(V(j-stride), V(j-1), V(j-stride) + V(j-1) - V(j-stride-1)));
+        pix1 += stride;
+        pix2 += stride;
+
+    }
+#undef V
+    return s;
+}
+
 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h)
 {
@@ -242,6 +320,37 @@ static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     return s;
 }
 
+static inline int pix_median_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             ptrdiff_t stride, int h)
+{
+    int s = 0, i, j;
+
+#define V(x) (pix1[x] - pix2[x])
+
+    s    += abs(V(0));
+    s    += abs(V(1) - V(0));
+    s    += abs(V(2) - V(1));
+    s    += abs(V(3) - V(2));
+    s    += abs(V(4) - V(3));
+    s    += abs(V(5) - V(4));
+    s    += abs(V(6) - V(5));
+    s    += abs(V(7) - V(6));
+
+    pix1 += stride;
+    pix2 += stride;
+
+    for (i = 1; i < h; i++) {
+        s    += abs(V(0) - V(-stride));
+        for (j = 1; j < 8; j++)
+            s    += abs(V(j) - mid_pred(V(j-stride), V(j-1), V(j-stride) + V(j-1) - V(j-stride-1)));
+        pix1 += stride;
+        pix2 += stride;
+
+    }
+#undef V
+    return s;
+}
+
 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
@@ -373,6 +482,9 @@ void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
         case FF_CMP_SAD:
             cmp[i] = c->sad[i];
             break;
+        case FF_CMP_MEDIAN_SAD:
+            cmp[i] = c->median_sad[i];
+            break;
         case FF_CMP_SATD:
             cmp[i] = c->hadamard8_diff[i];
             break;
@@ -409,6 +521,14 @@ void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
         case FF_CMP_NSSE:
             cmp[i] = c->nsse[i];
             break;
+#if CONFIG_DWT
+        case FF_CMP_W53:
+            cmp[i]= c->w53[i];
+            break;
+        case FF_CMP_W97:
+            cmp[i]= c->w97[i];
+            break;
+#endif
         default:
             av_log(NULL, AV_LOG_ERROR,
                    "internal error in cmp function selection\n");
@@ -436,7 +556,7 @@ static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
 {
     int i, temp[64], sum = 0;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     for (i = 0; i < 8; i++) {
         // FIXME: try pointer walks
@@ -488,7 +608,7 @@ static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
 {
     int i, temp[64], sum = 0;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     for (i = 0; i < 8; i++) {
         // FIXME: try pointer walks
@@ -540,9 +660,9 @@ static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
 {
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
+    s->pdsp.diff_pixels_unaligned(temp, src1, src2, stride);
     s->fdsp.fdct(temp);
     return s->mecc.sum_abs_dctelem(temp);
 }
@@ -582,7 +702,7 @@ static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
     int16_t dct[8][8];
     int i, sum = 0;
 
-    s->pdsp.diff_pixels(dct[0], src1, src2, stride);
+    s->pdsp.diff_pixels_unaligned(dct[0], src1, src2, stride);
 
 #define SRC(x) dct[i][x]
 #define DST(x, v) dct[i][x] = v
@@ -607,9 +727,9 @@ static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
     int sum = 0, i;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
+    s->pdsp.diff_pixels_unaligned(temp, src1, src2, stride);
     s->fdsp.fdct(temp);
 
     for (i = 0; i < 64; i++)
@@ -625,17 +745,17 @@ static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
     int16_t *const bak = temp + 64;
     int sum = 0, i;
 
-    assert(h == 8);
+    av_assert2(h == 8);
     s->mb_intra = 0;
 
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
+    s->pdsp.diff_pixels_unaligned(temp, src1, src2, stride);
 
     memcpy(bak, temp, 64 * sizeof(int16_t));
 
     s->block_last_index[0 /* FIXME */] =
         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
     s->dct_unquantize_inter(s, temp, 0, s->qscale);
-    ff_simple_idct_8(temp); // FIXME
+    ff_simple_idct_int16_8bit(temp); // FIXME
 
     for (i = 0; i < 64; i++)
         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
@@ -654,7 +774,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
     const int esc_length = s->ac_esc_length;
     uint8_t *length, *last_length;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     copy_block8(lsrc1, src1, 8, stride, 8);
     copy_block8(lsrc2, src2, 8, stride, 8);
@@ -698,7 +818,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
 
         level = temp[i] + 64;
 
-        assert(level - 64);
+        av_assert2(level - 64);
 
         if ((level & (~127)) == 0) {
             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
@@ -729,9 +849,9 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
     const int esc_length = s->ac_esc_length;
     uint8_t *length, *last_length;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
+    s->pdsp.diff_pixels_unaligned(temp, src1, src2, stride);
 
     s->block_last_index[0 /* FIXME */] =
     last                               =
@@ -770,7 +890,7 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
 
         level = temp[i] + 64;
 
-        assert(level - 64);
+        av_assert2(level - 64);
 
         if ((level & (~127)) == 0)
             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
@@ -803,20 +923,24 @@ static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
 VSAD_INTRA(8)
 VSAD_INTRA(16)
 
-static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    ptrdiff_t stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
+#define VSAD(size)                                                             \
+static int vsad ## size ## _c(MpegEncContext *c,                               \
+                              uint8_t *s1, uint8_t *s2,                        \
+                              ptrdiff_t stride, int h)                               \
+{                                                                              \
+    int score = 0, x, y;                                                       \
+                                                                               \
+    for (y = 1; y < h; y++) {                                                  \
+        for (x = 0; x < size; x++)                                             \
+            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
+        s1 += stride;                                                          \
+        s2 += stride;                                                          \
+    }                                                                          \
+                                                                               \
+    return score;                                                              \
 }
+VSAD(8)
+VSAD(16)
 
 #define SQ(a) ((a) * (a))
 #define VSSE_INTRA(size)                                                \
@@ -841,20 +965,23 @@ static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
 VSSE_INTRA(8)
 VSSE_INTRA(16)
 
-static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    ptrdiff_t stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
+#define VSSE(size)                                                             \
+static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
+                              ptrdiff_t stride, int h)                         \
+{                                                                              \
+    int score = 0, x, y;                                                       \
+                                                                               \
+    for (y = 1; y < h; y++) {                                                  \
+        for (x = 0; x < size; x++)                                             \
+            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
+        s1 += stride;                                                          \
+        s2 += stride;                                                          \
+    }                                                                          \
+                                                                               \
+    return score;                                                              \
 }
+VSSE(8)
+VSSE(16)
 
 #define WRAPPER8_16_SQ(name8, name16)                                   \
 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
@@ -884,16 +1011,31 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
 
-av_cold void ff_me_cmp_init_static(void)
+int ff_check_alignment(void)
 {
-    int i;
+    static int did_fail = 0;
+    LOCAL_ALIGNED_16(int, aligned, [4]);
 
-    for (i = 0; i < 512; i++)
-        ff_square_tab[i] = (i - 256) * (i - 256);
+    if ((intptr_t)aligned & 15) {
+        if (!did_fail) {
+#if HAVE_MMX || HAVE_ALTIVEC
+            av_log(NULL, AV_LOG_ERROR,
+                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
+                "and may be very slow or crash. This is not a bug in libavcodec,\n"
+                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
+                "Do not report crashes to FFmpeg developers.\n");
+#endif
+            did_fail=1;
+        }
+        return -1;
+    }
+    return 0;
 }
 
 av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
 {
+    ff_check_alignment();
+
     c->sum_abs_dctelem = sum_abs_dctelem_c;
 
     /* TODO [0] 16  [1] 8 */
@@ -927,18 +1069,30 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     SET_CMP_FUNC(rd)
     SET_CMP_FUNC(bit)
     c->vsad[0] = vsad16_c;
+    c->vsad[1] = vsad8_c;
     c->vsad[4] = vsad_intra16_c;
     c->vsad[5] = vsad_intra8_c;
     c->vsse[0] = vsse16_c;
+    c->vsse[1] = vsse8_c;
     c->vsse[4] = vsse_intra16_c;
     c->vsse[5] = vsse_intra8_c;
     c->nsse[0] = nsse16_c;
     c->nsse[1] = nsse8_c;
+#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
+    ff_dsputil_init_dwt(c);
+#endif
 
+    if (ARCH_ALPHA)
+        ff_me_cmp_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
     if (ARCH_PPC)
         ff_me_cmp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_me_cmp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_me_cmp_init_mips(c, avctx);
+
+    c->median_sad[0] = pix_median_abs16_c;
+    c->median_sad[1] = pix_median_abs8_c;
 }
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index 725f9b2..0a589e3 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,7 +23,21 @@
 
 #include "avcodec.h"
 
-extern uint32_t ff_square_tab[512];
+extern const uint32_t ff_square_tab[512];
+
+
+/* minimum alignment rules ;)
+ * If you notice errors in the align stuff, need more alignment for some ASM code
+ * for some CPU or need to use a function with less aligned data then send a mail
+ * to the ffmpeg-devel mailing list, ...
+ *
+ * !warning These alignments might not match reality, (missing attribute((align))
+ * stuff somewhere possible).
+ * I (Michael) did not check them, these are just the alignments which I think
+ * could be reached easily ...
+ *
+ * !future video codecs might need functions with less strict alignment
+ */
 
 struct MpegEncContext;
 /* Motion estimation:
@@ -49,6 +63,8 @@ typedef struct MECmpContext {
     me_cmp_func vsad[6];
     me_cmp_func vsse[6];
     me_cmp_func nsse[6];
+    me_cmp_func w53[6];
+    me_cmp_func w97[6];
     me_cmp_func dct_max[6];
     me_cmp_func dct264_sad[6];
 
@@ -60,15 +76,20 @@ typedef struct MECmpContext {
     me_cmp_func frame_skip_cmp[6]; // only width 8 used
 
     me_cmp_func pix_abs[2][4];
+    me_cmp_func median_sad[6];
 } MECmpContext;
 
-void ff_me_cmp_init_static(void);
+int ff_check_alignment(void);
 
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
 
 void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
 
+void ff_dsputil_init_dwt(MECmpContext *c);
+
 #endif /* AVCODEC_ME_CMP_H */
diff --git a/libavcodec/mediacodec.c b/libavcodec/mediacodec.c
new file mode 100644
index 0000000..aa14624
--- /dev/null
+++ b/libavcodec/mediacodec.c
@@ -0,0 +1,149 @@
+/*
+ * Android MediaCodec public API functions
+ *
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/error.h"
+
+#include "mediacodec.h"
+
+#if CONFIG_MEDIACODEC
+
+#include <jni.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavutil/mem.h"
+
+#include "ffjni.h"
+#include "mediacodecdec_common.h"
+#include "version.h"
+
+AVMediaCodecContext *av_mediacodec_alloc_context(void)
+{
+    return av_mallocz(sizeof(AVMediaCodecContext));
+}
+
+int av_mediacodec_default_init(AVCodecContext *avctx, AVMediaCodecContext *ctx, void *surface)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    env = ff_jni_get_env(avctx);
+    if (!env) {
+        return AVERROR_EXTERNAL;
+    }
+
+    ctx->surface = (*env)->NewGlobalRef(env, surface);
+    if (ctx->surface) {
+        avctx->hwaccel_context = ctx;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Could not create new global reference\n");
+        ret = AVERROR_EXTERNAL;
+    }
+
+    return ret;
+}
+
+void av_mediacodec_default_free(AVCodecContext *avctx)
+{
+    JNIEnv *env = NULL;
+
+    AVMediaCodecContext *ctx = avctx->hwaccel_context;
+
+    if (!ctx) {
+        return;
+    }
+
+    env = ff_jni_get_env(avctx);
+    if (!env) {
+        return;
+    }
+
+    if (ctx->surface) {
+        (*env)->DeleteGlobalRef(env, ctx->surface);
+        ctx->surface = NULL;
+    }
+
+    av_freep(&avctx->hwaccel_context);
+}
+
+int av_mediacodec_release_buffer(AVMediaCodecBuffer *buffer, int render)
+{
+    MediaCodecDecContext *ctx = buffer->ctx;
+    int released = atomic_fetch_add(&buffer->released, 1);
+
+    if (!released && (ctx->delay_flush || buffer->serial == atomic_load(&ctx->serial))) {
+        atomic_fetch_sub(&ctx->hw_buffer_count, 1);
+        av_log(ctx->avctx, AV_LOG_DEBUG,
+               "Releasing output buffer %zd (%p) ts=%"PRId64" with render=%d [%d pending]\n",
+               buffer->index, buffer, buffer->pts, render, atomic_load(&ctx->hw_buffer_count));
+        return ff_AMediaCodec_releaseOutputBuffer(ctx->codec, buffer->index, render);
+    }
+
+    return 0;
+}
+
+int av_mediacodec_render_buffer_at_time(AVMediaCodecBuffer *buffer, int64_t time)
+{
+    MediaCodecDecContext *ctx = buffer->ctx;
+    int released = atomic_fetch_add(&buffer->released, 1);
+
+    if (!released && (ctx->delay_flush || buffer->serial == atomic_load(&ctx->serial))) {
+        atomic_fetch_sub(&ctx->hw_buffer_count, 1);
+        av_log(ctx->avctx, AV_LOG_DEBUG,
+               "Rendering output buffer %zd (%p) ts=%"PRId64" with time=%"PRId64" [%d pending]\n",
+               buffer->index, buffer, buffer->pts, time, atomic_load(&ctx->hw_buffer_count));
+        return ff_AMediaCodec_releaseOutputBufferAtTime(ctx->codec, buffer->index, time);
+    }
+
+    return 0;
+}
+
+#else
+
+#include <stdlib.h>
+
+AVMediaCodecContext *av_mediacodec_alloc_context(void)
+{
+    return NULL;
+}
+
+int av_mediacodec_default_init(AVCodecContext *avctx, AVMediaCodecContext *ctx, void *surface)
+{
+    return AVERROR(ENOSYS);
+}
+
+void av_mediacodec_default_free(AVCodecContext *avctx)
+{
+}
+
+int av_mediacodec_release_buffer(AVMediaCodecBuffer *buffer, int render)
+{
+    return AVERROR(ENOSYS);
+}
+
+int av_mediacodec_render_buffer_at_time(AVMediaCodecBuffer *buffer, int64_t time)
+{
+    return AVERROR(ENOSYS);
+}
+
+#endif
diff --git a/libavcodec/mediacodec.h b/libavcodec/mediacodec.h
new file mode 100644
index 0000000..4c8545d
--- /dev/null
+++ b/libavcodec/mediacodec.h
@@ -0,0 +1,101 @@
+/*
+ * Android MediaCodec public API
+ *
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_H
+#define AVCODEC_MEDIACODEC_H
+
+#include "libavcodec/avcodec.h"
+
+/**
+ * This structure holds a reference to a android/view/Surface object that will
+ * be used as output by the decoder.
+ *
+ */
+typedef struct AVMediaCodecContext {
+
+    /**
+     * android/view/Surface object reference.
+     */
+    void *surface;
+
+} AVMediaCodecContext;
+
+/**
+ * Allocate and initialize a MediaCodec context.
+ *
+ * When decoding with MediaCodec is finished, the caller must free the
+ * MediaCodec context with av_mediacodec_default_free.
+ *
+ * @return a pointer to a newly allocated AVMediaCodecContext on success, NULL otherwise
+ */
+AVMediaCodecContext *av_mediacodec_alloc_context(void);
+
+/**
+ * Convenience function that sets up the MediaCodec context.
+ *
+ * @param avctx codec context
+ * @param ctx MediaCodec context to initialize
+ * @param surface reference to an android/view/Surface
+ * @return 0 on success, < 0 otherwise
+ */
+int av_mediacodec_default_init(AVCodecContext *avctx, AVMediaCodecContext *ctx, void *surface);
+
+/**
+ * This function must be called to free the MediaCodec context initialized with
+ * av_mediacodec_default_init().
+ *
+ * @param avctx codec context
+ */
+void av_mediacodec_default_free(AVCodecContext *avctx);
+
+/**
+ * Opaque structure representing a MediaCodec buffer to render.
+ */
+typedef struct MediaCodecBuffer AVMediaCodecBuffer;
+
+/**
+ * Release a MediaCodec buffer and render it to the surface that is associated
+ * with the decoder. This function should only be called once on a given
+ * buffer, once released the underlying buffer returns to the codec, thus
+ * subsequent calls to this function will have no effect.
+ *
+ * @param buffer the buffer to render
+ * @param render 1 to release and render the buffer to the surface or 0 to
+ * discard the buffer
+ * @return 0 on success, < 0 otherwise
+ */
+int av_mediacodec_release_buffer(AVMediaCodecBuffer *buffer, int render);
+
+/**
+ * Release a MediaCodec buffer and render it at the given time to the surface
+ * that is associated with the decoder. The timestamp must be within one second
+ * of the current java/lang/System#nanoTime() (which is implemented using
+ * CLOCK_MONOTONIC on Android). See the Android MediaCodec documentation
+ * of android/media/MediaCodec#releaseOutputBuffer(int,long) for more details.
+ *
+ * @param buffer the buffer to render
+ * @param time timestamp in nanoseconds of when to render the buffer
+ * @return 0 on success, < 0 otherwise
+ */
+int av_mediacodec_render_buffer_at_time(AVMediaCodecBuffer *buffer, int64_t time);
+
+#endif /* AVCODEC_MEDIACODEC_H */
diff --git a/libavcodec/mediacodec_surface.c b/libavcodec/mediacodec_surface.c
new file mode 100644
index 0000000..aada1ec
--- /dev/null
+++ b/libavcodec/mediacodec_surface.c
@@ -0,0 +1,56 @@
+/*
+ * Android MediaCodec Surface functions
+ *
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <jni.h>
+
+#include "ffjni.h"
+#include "mediacodec_surface.h"
+
+void *ff_mediacodec_surface_ref(void *surface, void *log_ctx)
+{
+    JNIEnv *env = NULL;
+
+    void *reference = NULL;
+
+    env = ff_jni_get_env(log_ctx);
+    if (!env) {
+        return NULL;
+    }
+
+    reference = (*env)->NewGlobalRef(env, surface);
+
+    return reference;
+}
+
+int ff_mediacodec_surface_unref(void *surface, void *log_ctx)
+{
+    JNIEnv *env = NULL;
+
+    env = ff_jni_get_env(log_ctx);
+    if (!env) {
+        return AVERROR_EXTERNAL;
+    }
+
+    (*env)->DeleteGlobalRef(env, surface);
+
+    return 0;
+}
diff --git a/libavcodec/mediacodec_surface.h b/libavcodec/mediacodec_surface.h
new file mode 100644
index 0000000..0178b8a
--- /dev/null
+++ b/libavcodec/mediacodec_surface.h
@@ -0,0 +1,31 @@
+/*
+ * Android MediaCodec Surface functions
+ *
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_SURFACE_H
+#define AVCODEC_MEDIACODEC_SURFACE_H
+
+#include "libavcodec/avcodec.h"
+
+void *ff_mediacodec_surface_ref(void *surface, void *log_ctx);
+int ff_mediacodec_surface_unref(void *surface, void *log_ctx);
+
+#endif /* AVCODEC_MEDIACODEC_SURFACE_H */
diff --git a/libavcodec/mediacodec_sw_buffer.c b/libavcodec/mediacodec_sw_buffer.c
new file mode 100644
index 0000000..92428e8
--- /dev/null
+++ b/libavcodec/mediacodec_sw_buffer.c
@@ -0,0 +1,339 @@
+/*
+ * Android MediaCodec software buffer copy functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include <sys/types.h>
+
+#include "libavutil/frame.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "mediacodec_wrapper.h"
+#include "mediacodec_sw_buffer.h"
+#include "mediacodecdec_common.h"
+
+#define QCOM_TILE_WIDTH 64
+#define QCOM_TILE_HEIGHT 32
+#define QCOM_TILE_SIZE (QCOM_TILE_WIDTH * QCOM_TILE_HEIGHT)
+#define QCOM_TILE_GROUP_SIZE (4 * QCOM_TILE_SIZE)
+
+/**
+ * The code handling the various YUV color formats is taken from the
+ * GStreamer project.
+ *
+ * Gstreamer reference:
+ * https://cgit.freedesktop.org/gstreamer/gst-plugins-bad/tree/sys/androidmedia/
+ *
+ * Copyright (C) 2012, Collabora Ltd.
+ *   Author: Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * Copyright (C) 2012, Rafaël Carré <funman@videolanorg>
+ *
+ * Copyright (C) 2015, Sebastian Dröge <sebastian@centricular.com>
+ *
+ * Copyright (C) 2014-2015, Collabora Ltd.
+ *   Author: Matthieu Bouron <matthieu.bouron@gcollabora.com>
+ *
+ * Copyright (C) 2015, Edward Hervey
+ *   Author: Edward Hervey <bilboed@gmail.com>
+ *
+ * Copyright (C) 2015, Matthew Waters <matthew@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+void ff_mediacodec_sw_buffer_copy_yuv420_planar(AVCodecContext *avctx,
+                                                MediaCodecDecContext *s,
+                                                uint8_t *data,
+                                                size_t size,
+                                                FFAMediaCodecBufferInfo *info,
+                                                AVFrame *frame)
+{
+    int i;
+    uint8_t *src = NULL;
+
+    for (i = 0; i < 3; i++) {
+        int stride = s->stride;
+        int height;
+
+        src = data + info->offset;
+        if (i == 0) {
+            height = avctx->height;
+
+            src += s->crop_top * s->stride;
+            src += s->crop_left;
+        } else {
+            height = avctx->height / 2;
+            stride = (s->stride + 1) / 2;
+
+            src += s->slice_height * s->stride;
+
+            if (i == 2) {
+                src += ((s->slice_height + 1) / 2) * stride;
+            }
+
+            src += s->crop_top * stride;
+            src += (s->crop_left / 2);
+        }
+
+        if (frame->linesize[i] == stride) {
+            memcpy(frame->data[i], src, height * stride);
+        } else {
+            int j, width;
+            uint8_t *dst = frame->data[i];
+
+            if (i == 0) {
+                width = avctx->width;
+            } else if (i >= 1) {
+                width = FFMIN(frame->linesize[i], FFALIGN(avctx->width, 2) / 2);
+            }
+
+            for (j = 0; j < height; j++) {
+                memcpy(dst, src, width);
+                src += stride;
+                dst += frame->linesize[i];
+            }
+        }
+    }
+}
+
+void ff_mediacodec_sw_buffer_copy_yuv420_semi_planar(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame)
+{
+    int i;
+    uint8_t *src = NULL;
+
+    for (i = 0; i < 2; i++) {
+        int height;
+
+        src = data + info->offset;
+        if (i == 0) {
+            height = avctx->height;
+
+            src += s->crop_top * s->stride;
+            src += s->crop_left;
+        } else if (i == 1) {
+            height = avctx->height / 2;
+
+            src += s->slice_height * s->stride;
+            src += s->crop_top * s->stride;
+            src += s->crop_left;
+        }
+
+        if (frame->linesize[i] == s->stride) {
+            memcpy(frame->data[i], src, height * s->stride);
+        } else {
+            int j, width;
+            uint8_t *dst = frame->data[i];
+
+            if (i == 0) {
+                width = avctx->width;
+            } else if (i == 1) {
+                width = FFMIN(frame->linesize[i], FFALIGN(avctx->width, 2));
+            }
+
+            for (j = 0; j < height; j++) {
+                memcpy(dst, src, width);
+                src += s->stride;
+                dst += frame->linesize[i];
+            }
+        }
+    }
+}
+
+
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar(AVCodecContext *avctx,
+                                                            MediaCodecDecContext *s,
+                                                            uint8_t *data,
+                                                            size_t size,
+                                                            FFAMediaCodecBufferInfo *info,
+                                                            AVFrame *frame)
+{
+    int i;
+    uint8_t *src = NULL;
+
+    for (i = 0; i < 2; i++) {
+        int height;
+
+        src = data + info->offset;
+        if (i == 0) {
+            height = avctx->height;
+        } else if (i == 1) {
+            height = avctx->height / 2;
+
+            src += (s->slice_height - s->crop_top / 2) * s->stride;
+
+            src += s->crop_top * s->stride;
+            src += s->crop_left;
+        }
+
+        if (frame->linesize[i] == s->stride) {
+            memcpy(frame->data[i], src, height * s->stride);
+        } else {
+            int j, width;
+            uint8_t *dst = frame->data[i];
+
+            if (i == 0) {
+                width = avctx->width;
+            } else if (i == 1) {
+                width = FFMIN(frame->linesize[i], FFALIGN(avctx->width, 2));
+            }
+
+            for (j = 0; j < height; j++) {
+                memcpy(dst, src, width);
+                src += s->stride;
+                dst += frame->linesize[i];
+            }
+        }
+    }
+}
+
+/**
+ * The code handling the QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka
+ * color format is taken from the VLC project.
+ *
+ * VLC reference:
+ * http://git.videolan.org/?p=vlc.git;a=blob;f=modules/codec/omxil/qcom.c;hb=HEAD
+ *
+ * VLC copyright notice:
+ *
+ *****************************************************************************
+ * qcom.c : pixel format translation for Qualcomm tiled nv12
+ *****************************************************************************
+ * Copyright © 2012 Rafaël Carré
+ *
+ * Authors: Rafaël Carré <funman@videolanorg>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *
+ */
+
+static size_t qcom_tile_pos(size_t x, size_t y, size_t w, size_t h)
+{
+  size_t flim = x + (y & ~1) * w;
+
+  if (y & 1) {
+    flim += (x & ~3) + 2;
+  } else if ((h & 1) == 0 || y != (h - 1)) {
+    flim += (x + 2) & ~3;
+  }
+
+  return flim;
+}
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar_64x32Tile2m8ka(AVCodecContext *avctx,
+                                                                           MediaCodecDecContext *s,
+                                                                           uint8_t *data,
+                                                                           size_t size,
+                                                                           FFAMediaCodecBufferInfo *info,
+                                                                           AVFrame *frame)
+{
+    size_t width = frame->width;
+    size_t linesize = frame->linesize[0];
+    size_t height = frame->height;
+
+    const size_t tile_w = (width - 1) / QCOM_TILE_WIDTH + 1;
+    const size_t tile_w_align = (tile_w + 1) & ~1;
+    const size_t tile_h_luma = (height - 1) / QCOM_TILE_HEIGHT + 1;
+    const size_t tile_h_chroma = (height / 2 - 1) / QCOM_TILE_HEIGHT + 1;
+
+    size_t luma_size = tile_w_align * tile_h_luma * QCOM_TILE_SIZE;
+    if((luma_size % QCOM_TILE_GROUP_SIZE) != 0)
+        luma_size = (((luma_size - 1) / QCOM_TILE_GROUP_SIZE) + 1) * QCOM_TILE_GROUP_SIZE;
+
+    for(size_t y = 0; y < tile_h_luma; y++) {
+        size_t row_width = width;
+        for(size_t x = 0; x < tile_w; x++) {
+            size_t tile_width = row_width;
+            size_t tile_height = height;
+            /* dest luma memory index for this tile */
+            size_t luma_idx = y * QCOM_TILE_HEIGHT * linesize + x * QCOM_TILE_WIDTH;
+            /* dest chroma memory index for this tile */
+            /* XXX: remove divisions */
+            size_t chroma_idx = (luma_idx / linesize) * linesize / 2 + (luma_idx % linesize);
+
+            /* luma source pointer for this tile */
+            const uint8_t *src_luma  = data
+                + qcom_tile_pos(x, y,tile_w_align, tile_h_luma) * QCOM_TILE_SIZE;
+
+            /* chroma source pointer for this tile */
+            const uint8_t *src_chroma = data + luma_size
+                + qcom_tile_pos(x, y/2, tile_w_align, tile_h_chroma) * QCOM_TILE_SIZE;
+            if (y & 1)
+                src_chroma += QCOM_TILE_SIZE/2;
+
+            /* account for right columns */
+            if (tile_width > QCOM_TILE_WIDTH)
+                tile_width = QCOM_TILE_WIDTH;
+
+            /* account for bottom rows */
+            if (tile_height > QCOM_TILE_HEIGHT)
+                tile_height = QCOM_TILE_HEIGHT;
+
+            tile_height /= 2;
+            while (tile_height--) {
+                memcpy(frame->data[0] + luma_idx, src_luma, tile_width);
+                src_luma += QCOM_TILE_WIDTH;
+                luma_idx += linesize;
+
+                memcpy(frame->data[0] + luma_idx, src_luma, tile_width);
+                src_luma += QCOM_TILE_WIDTH;
+                luma_idx += linesize;
+
+                memcpy(frame->data[1] + chroma_idx, src_chroma, tile_width);
+                src_chroma += QCOM_TILE_WIDTH;
+                chroma_idx += linesize;
+            }
+            row_width -= QCOM_TILE_WIDTH;
+        }
+        height -= QCOM_TILE_HEIGHT;
+    }
+}
diff --git a/libavcodec/mediacodec_sw_buffer.h b/libavcodec/mediacodec_sw_buffer.h
new file mode 100644
index 0000000..574fb52
--- /dev/null
+++ b/libavcodec/mediacodec_sw_buffer.h
@@ -0,0 +1,62 @@
+/*
+ * Android MediaCodec software buffer copy functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_SW_BUFFER_H
+#define AVCODEC_MEDIACODEC_SW_BUFFER_H
+
+#include <sys/types.h>
+
+#include "libavutil/frame.h"
+
+#include "avcodec.h"
+#include "mediacodec_wrapper.h"
+#include "mediacodecdec_common.h"
+
+void ff_mediacodec_sw_buffer_copy_yuv420_planar(AVCodecContext *avctx,
+                                                MediaCodecDecContext *s,
+                                                uint8_t *data,
+                                                size_t size,
+                                                FFAMediaCodecBufferInfo *info,
+                                                AVFrame *frame);
+
+void ff_mediacodec_sw_buffer_copy_yuv420_semi_planar(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame);
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame);
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar_64x32Tile2m8ka(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame);
+
+#endif /* AVCODEC_MEDIACODEC_SW_BUFFER_H */
diff --git a/libavcodec/mediacodec_wrapper.c b/libavcodec/mediacodec_wrapper.c
new file mode 100644
index 0000000..a024e3b
--- /dev/null
+++ b/libavcodec/mediacodec_wrapper.c
@@ -0,0 +1,1704 @@
+/*
+ * Android MediaCodec Wrapper
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <jni.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/avstring.h"
+
+#include "avcodec.h"
+#include "ffjni.h"
+#include "version.h"
+#include "mediacodec_wrapper.h"
+
+struct JNIAMediaCodecListFields {
+
+    jclass mediacodec_list_class;
+    jmethodID init_id;
+    jmethodID find_decoder_for_format_id;
+
+    jmethodID get_codec_count_id;
+    jmethodID get_codec_info_at_id;
+
+    jclass mediacodec_info_class;
+    jmethodID get_name_id;
+    jmethodID get_codec_capabilities_id;
+    jmethodID get_supported_types_id;
+    jmethodID is_encoder_id;
+
+    jclass codec_capabilities_class;
+    jfieldID color_formats_id;
+    jfieldID profile_levels_id;
+
+    jclass codec_profile_level_class;
+    jfieldID profile_id;
+    jfieldID level_id;
+
+    jfieldID avc_profile_baseline_id;
+    jfieldID avc_profile_main_id;
+    jfieldID avc_profile_extended_id;
+    jfieldID avc_profile_high_id;
+    jfieldID avc_profile_high10_id;
+    jfieldID avc_profile_high422_id;
+    jfieldID avc_profile_high444_id;
+
+    jfieldID hevc_profile_main_id;
+    jfieldID hevc_profile_main10_id;
+    jfieldID hevc_profile_main10_hdr10_id;
+
+};
+
+static const struct FFJniField jni_amediacodeclist_mapping[] = {
+    { "android/media/MediaCodecList", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, mediacodec_list_class), 1 },
+        { "android/media/MediaCodecList", "<init>", "(I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, init_id), 0 },
+        { "android/media/MediaCodecList", "findDecoderForFormat", "(Landroid/media/MediaFormat;)Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, find_decoder_for_format_id), 0 },
+
+        { "android/media/MediaCodecList", "getCodecCount", "()I", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_count_id), 1 },
+        { "android/media/MediaCodecList", "getCodecInfoAt", "(I)Landroid/media/MediaCodecInfo;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_info_at_id), 1 },
+
+    { "android/media/MediaCodecInfo", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, mediacodec_info_class), 1 },
+        { "android/media/MediaCodecInfo", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_name_id), 1 },
+        { "android/media/MediaCodecInfo", "getCapabilitiesForType", "(Ljava/lang/String;)Landroid/media/MediaCodecInfo$CodecCapabilities;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_capabilities_id), 1 },
+        { "android/media/MediaCodecInfo", "getSupportedTypes", "()[Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_supported_types_id), 1 },
+        { "android/media/MediaCodecInfo", "isEncoder", "()Z", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, is_encoder_id), 1 },
+
+    { "android/media/MediaCodecInfo$CodecCapabilities", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, codec_capabilities_class), 1 },
+        { "android/media/MediaCodecInfo$CodecCapabilities", "colorFormats", "[I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, color_formats_id), 1 },
+        { "android/media/MediaCodecInfo$CodecCapabilities", "profileLevels", "[Landroid/media/MediaCodecInfo$CodecProfileLevel;", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, profile_levels_id), 1 },
+
+    { "android/media/MediaCodecInfo$CodecProfileLevel", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, codec_profile_level_class), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "profile", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, profile_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "level", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, level_id), 1 },
+
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileBaseline", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_baseline_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileMain", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_main_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileExtended", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_extended_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh10", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high10_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh422", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high422_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh444", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high444_id), 1 },
+
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "HEVCProfileMain", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, hevc_profile_main_id), 0 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "HEVCProfileMain10", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, hevc_profile_main10_id), 0 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "HEVCProfileMain10HDR10", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, hevc_profile_main10_hdr10_id), 0 },
+
+    { NULL }
+};
+
+struct JNIAMediaFormatFields {
+
+    jclass mediaformat_class;
+
+    jmethodID init_id;
+
+    jmethodID contains_key_id;
+
+    jmethodID get_integer_id;
+    jmethodID get_long_id;
+    jmethodID get_float_id;
+    jmethodID get_bytebuffer_id;
+    jmethodID get_string_id;
+
+    jmethodID set_integer_id;
+    jmethodID set_long_id;
+    jmethodID set_float_id;
+    jmethodID set_bytebuffer_id;
+    jmethodID set_string_id;
+
+    jmethodID to_string_id;
+
+};
+
+static const struct FFJniField jni_amediaformat_mapping[] = {
+    { "android/media/MediaFormat", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaFormatFields, mediaformat_class), 1 },
+
+        { "android/media/MediaFormat", "<init>", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, init_id), 1 },
+
+        { "android/media/MediaFormat", "containsKey", "(Ljava/lang/String;)Z", FF_JNI_METHOD,offsetof(struct JNIAMediaFormatFields, contains_key_id), 1 },
+
+        { "android/media/MediaFormat", "getInteger", "(Ljava/lang/String;)I", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_integer_id), 1 },
+        { "android/media/MediaFormat", "getLong", "(Ljava/lang/String;)J", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_long_id), 1 },
+        { "android/media/MediaFormat", "getFloat", "(Ljava/lang/String;)F", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_float_id), 1 },
+        { "android/media/MediaFormat", "getByteBuffer", "(Ljava/lang/String;)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_bytebuffer_id), 1 },
+        { "android/media/MediaFormat", "getString", "(Ljava/lang/String;)Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_string_id), 1 },
+
+        { "android/media/MediaFormat", "setInteger", "(Ljava/lang/String;I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_integer_id), 1 },
+        { "android/media/MediaFormat", "setLong", "(Ljava/lang/String;J)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_long_id), 1 },
+        { "android/media/MediaFormat", "setFloat", "(Ljava/lang/String;F)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_float_id), 1 },
+        { "android/media/MediaFormat", "setByteBuffer", "(Ljava/lang/String;Ljava/nio/ByteBuffer;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_bytebuffer_id), 1 },
+        { "android/media/MediaFormat", "setString", "(Ljava/lang/String;Ljava/lang/String;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_string_id), 1 },
+
+        { "android/media/MediaFormat", "toString", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, to_string_id), 1 },
+
+    { NULL }
+};
+
+static const AVClass amediaformat_class = {
+    .class_name = "amediaformat",
+    .item_name  = av_default_item_name,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+struct FFAMediaFormat {
+
+    const AVClass *class;
+    struct JNIAMediaFormatFields jfields;
+    jobject object;
+};
+
+struct JNIAMediaCodecFields {
+
+    jclass mediacodec_class;
+
+    jfieldID info_try_again_later_id;
+    jfieldID info_output_buffers_changed_id;
+    jfieldID info_output_format_changed_id;
+
+    jfieldID buffer_flag_codec_config_id;
+    jfieldID buffer_flag_end_of_stream_id;
+    jfieldID buffer_flag_key_frame_id;
+
+    jfieldID configure_flag_encode_id;
+
+    jmethodID create_by_codec_name_id;
+    jmethodID create_decoder_by_type_id;
+    jmethodID create_encoder_by_type_id;
+
+    jmethodID get_name_id;
+
+    jmethodID configure_id;
+    jmethodID start_id;
+    jmethodID flush_id;
+    jmethodID stop_id;
+    jmethodID release_id;
+
+    jmethodID get_output_format_id;
+
+    jmethodID dequeue_input_buffer_id;
+    jmethodID queue_input_buffer_id;
+    jmethodID get_input_buffer_id;
+    jmethodID get_input_buffers_id;
+
+    jmethodID dequeue_output_buffer_id;
+    jmethodID get_output_buffer_id;
+    jmethodID get_output_buffers_id;
+    jmethodID release_output_buffer_id;
+    jmethodID release_output_buffer_at_time_id;
+
+    jclass mediainfo_class;
+
+    jmethodID init_id;
+
+    jfieldID flags_id;
+    jfieldID offset_id;
+    jfieldID presentation_time_us_id;
+    jfieldID size_id;
+
+};
+
+static const struct FFJniField jni_amediacodec_mapping[] = {
+    { "android/media/MediaCodec", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecFields, mediacodec_class), 1 },
+
+        { "android/media/MediaCodec", "INFO_TRY_AGAIN_LATER", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_try_again_later_id), 1 },
+        { "android/media/MediaCodec", "INFO_OUTPUT_BUFFERS_CHANGED", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_output_buffers_changed_id), 1 },
+        { "android/media/MediaCodec", "INFO_OUTPUT_FORMAT_CHANGED", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_output_format_changed_id), 1 },
+
+        { "android/media/MediaCodec", "BUFFER_FLAG_CODEC_CONFIG", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_codec_config_id), 1 },
+        { "android/media/MediaCodec", "BUFFER_FLAG_END_OF_STREAM", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_end_of_stream_id), 1 },
+        { "android/media/MediaCodec", "BUFFER_FLAG_KEY_FRAME", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_key_frame_id), 0 },
+
+        { "android/media/MediaCodec", "CONFIGURE_FLAG_ENCODE", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, configure_flag_encode_id), 1 },
+
+        { "android/media/MediaCodec", "createByCodecName", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_by_codec_name_id), 1 },
+        { "android/media/MediaCodec", "createDecoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_decoder_by_type_id), 1 },
+        { "android/media/MediaCodec", "createEncoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_encoder_by_type_id), 1 },
+
+        { "android/media/MediaCodec", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_name_id), 1 },
+
+        { "android/media/MediaCodec", "configure", "(Landroid/media/MediaFormat;Landroid/view/Surface;Landroid/media/MediaCrypto;I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, configure_id), 1 },
+        { "android/media/MediaCodec", "start", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, start_id), 1 },
+        { "android/media/MediaCodec", "flush", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, flush_id), 1 },
+        { "android/media/MediaCodec", "stop", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, stop_id), 1 },
+        { "android/media/MediaCodec", "release", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_id), 1 },
+
+        { "android/media/MediaCodec", "getOutputFormat", "()Landroid/media/MediaFormat;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_format_id), 1 },
+
+        { "android/media/MediaCodec", "dequeueInputBuffer", "(J)I", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, dequeue_input_buffer_id), 1 },
+        { "android/media/MediaCodec", "queueInputBuffer", "(IIIJI)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, queue_input_buffer_id), 1 },
+        { "android/media/MediaCodec", "getInputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_input_buffer_id), 0 },
+        { "android/media/MediaCodec", "getInputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_input_buffers_id), 1 },
+
+        { "android/media/MediaCodec", "dequeueOutputBuffer", "(Landroid/media/MediaCodec$BufferInfo;J)I", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, dequeue_output_buffer_id), 1 },
+        { "android/media/MediaCodec", "getOutputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_buffer_id), 0 },
+        { "android/media/MediaCodec", "getOutputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_buffers_id), 1 },
+        { "android/media/MediaCodec", "releaseOutputBuffer", "(IZ)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_output_buffer_id), 1 },
+        { "android/media/MediaCodec", "releaseOutputBuffer", "(IJ)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_output_buffer_at_time_id), 0 },
+
+    { "android/media/MediaCodec$BufferInfo", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecFields, mediainfo_class), 1 },
+
+        { "android/media/MediaCodec.BufferInfo", "<init>", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, init_id), 1 },
+        { "android/media/MediaCodec.BufferInfo", "flags", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, flags_id), 1 },
+        { "android/media/MediaCodec.BufferInfo", "offset", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, offset_id), 1 },
+        { "android/media/MediaCodec.BufferInfo", "presentationTimeUs", "J", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, presentation_time_us_id), 1 },
+        { "android/media/MediaCodec.BufferInfo", "size", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, size_id), 1 },
+
+    { NULL }
+};
+
+static const AVClass amediacodec_class = {
+    .class_name = "amediacodec",
+    .item_name  = av_default_item_name,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+struct FFAMediaCodec {
+
+    const AVClass *class;
+
+    struct JNIAMediaCodecFields jfields;
+
+    jobject object;
+    jobject buffer_info;
+
+    jobject input_buffers;
+    jobject output_buffers;
+
+    int INFO_TRY_AGAIN_LATER;
+    int INFO_OUTPUT_BUFFERS_CHANGED;
+    int INFO_OUTPUT_FORMAT_CHANGED;
+
+    int BUFFER_FLAG_CODEC_CONFIG;
+    int BUFFER_FLAG_END_OF_STREAM;
+    int BUFFER_FLAG_KEY_FRAME;
+
+    int CONFIGURE_FLAG_ENCODE;
+
+    int has_get_i_o_buffer;
+};
+
+#define JNI_GET_ENV_OR_RETURN(env, log_ctx, ret) do {              \
+    (env) = ff_jni_get_env(log_ctx);                               \
+    if (!(env)) {                                                  \
+        return ret;                                                \
+    }                                                              \
+} while (0)
+
+#define JNI_GET_ENV_OR_RETURN_VOID(env, log_ctx) do {              \
+    (env) = ff_jni_get_env(log_ctx);                               \
+    if (!(env)) {                                                  \
+        return;                                                    \
+    }                                                              \
+} while (0)
+
+int ff_AMediaCodecProfile_getProfileFromAVCodecContext(AVCodecContext *avctx)
+{
+    int ret = -1;
+
+    JNIEnv *env = NULL;
+    struct JNIAMediaCodecListFields jfields = { 0 };
+    jfieldID field_id = 0;
+
+    JNI_GET_ENV_OR_RETURN(env, avctx, -1);
+
+    if (ff_jni_init_jfields(env, &jfields, jni_amediacodeclist_mapping, 0, avctx) < 0) {
+        goto done;
+    }
+
+    if (avctx->codec_id == AV_CODEC_ID_H264) {
+        switch(avctx->profile) {
+        case FF_PROFILE_H264_BASELINE:
+        case FF_PROFILE_H264_CONSTRAINED_BASELINE:
+            field_id = jfields.avc_profile_baseline_id;
+            break;
+        case FF_PROFILE_H264_MAIN:
+            field_id = jfields.avc_profile_main_id;
+            break;
+        case FF_PROFILE_H264_EXTENDED:
+            field_id = jfields.avc_profile_extended_id;
+            break;
+        case FF_PROFILE_H264_HIGH:
+            field_id = jfields.avc_profile_high_id;
+            break;
+        case FF_PROFILE_H264_HIGH_10:
+        case FF_PROFILE_H264_HIGH_10_INTRA:
+            field_id = jfields.avc_profile_high10_id;
+            break;
+        case FF_PROFILE_H264_HIGH_422:
+        case FF_PROFILE_H264_HIGH_422_INTRA:
+            field_id = jfields.avc_profile_high422_id;
+            break;
+        case FF_PROFILE_H264_HIGH_444:
+        case FF_PROFILE_H264_HIGH_444_INTRA:
+        case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
+            field_id = jfields.avc_profile_high444_id;
+            break;
+        }
+    } else if (avctx->codec_id == AV_CODEC_ID_HEVC) {
+        switch (avctx->profile) {
+        case FF_PROFILE_HEVC_MAIN:
+        case FF_PROFILE_HEVC_MAIN_STILL_PICTURE:
+            field_id = jfields.hevc_profile_main_id;
+            break;
+        case FF_PROFILE_HEVC_MAIN_10:
+            field_id = jfields.hevc_profile_main10_id;
+            break;
+        }
+    }
+
+        if (field_id) {
+            ret = (*env)->GetStaticIntField(env, jfields.codec_profile_level_class, field_id);
+            if (ff_jni_exception_check(env, 1, avctx) < 0) {
+                ret = -1;
+                goto done;
+            }
+        }
+
+done:
+    ff_jni_reset_jfields(env, &jfields, jni_amediacodeclist_mapping, 0, avctx);
+
+    return ret;
+}
+
+char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int encoder, void *log_ctx)
+{
+    int ret;
+    int i;
+    int codec_count;
+    int found_codec = 0;
+    char *name = NULL;
+    char *supported_type = NULL;
+
+    JNIEnv *env = NULL;
+    struct JNIAMediaCodecListFields jfields = { 0 };
+    struct JNIAMediaFormatFields mediaformat_jfields = { 0 };
+
+    jobject format = NULL;
+    jobject codec = NULL;
+    jobject codec_name = NULL;
+
+    jobject info = NULL;
+    jobject type = NULL;
+    jobjectArray types = NULL;
+
+    jobject capabilities = NULL;
+    jobject profile_level = NULL;
+    jobjectArray profile_levels = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, log_ctx, NULL);
+
+    if ((ret = ff_jni_init_jfields(env, &jfields, jni_amediacodeclist_mapping, 0, log_ctx)) < 0) {
+        goto done;
+    }
+
+    if ((ret = ff_jni_init_jfields(env, &mediaformat_jfields, jni_amediaformat_mapping, 0, log_ctx)) < 0) {
+        goto done;
+    }
+
+    codec_count = (*env)->CallStaticIntMethod(env, jfields.mediacodec_list_class, jfields.get_codec_count_id);
+    if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+        goto done;
+    }
+
+    for(i = 0; i < codec_count; i++) {
+        int j;
+        int type_count;
+        int is_encoder;
+
+        info = (*env)->CallStaticObjectMethod(env, jfields.mediacodec_list_class, jfields.get_codec_info_at_id, i);
+        if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+            goto done;
+        }
+
+        types = (*env)->CallObjectMethod(env, info, jfields.get_supported_types_id);
+        if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+            goto done;
+        }
+
+        is_encoder = (*env)->CallBooleanMethod(env, info, jfields.is_encoder_id);
+        if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+            goto done;
+        }
+
+        if (is_encoder != encoder) {
+            goto done_with_info;
+        }
+
+        type_count = (*env)->GetArrayLength(env, types);
+        for (j = 0; j < type_count; j++) {
+            int k;
+            int profile_count;
+
+            type = (*env)->GetObjectArrayElement(env, types, j);
+            if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                goto done;
+            }
+
+            supported_type = ff_jni_jstring_to_utf_chars(env, type, log_ctx);
+            if (!supported_type) {
+                goto done;
+            }
+
+            if (!av_strcasecmp(supported_type, mime)) {
+                codec_name = (*env)->CallObjectMethod(env, info, jfields.get_name_id);
+                if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                    goto done;
+                }
+
+                name = ff_jni_jstring_to_utf_chars(env, codec_name, log_ctx);
+                if (!name) {
+                    goto done;
+                }
+
+                /* Skip software decoders */
+                if (
+                    strstr(name, "OMX.google") ||
+                    strstr(name, "OMX.ffmpeg") ||
+                    (strstr(name, "OMX.SEC") && strstr(name, ".sw.")) ||
+                    !strcmp(name, "OMX.qcom.video.decoder.hevcswvdec")) {
+                    av_freep(&name);
+                    goto done_with_type;
+                }
+
+                capabilities = (*env)->CallObjectMethod(env, info, jfields.get_codec_capabilities_id, type);
+                if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                    goto done;
+                }
+
+                profile_levels = (*env)->GetObjectField(env, capabilities, jfields.profile_levels_id);
+                if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                    goto done;
+                }
+
+                profile_count = (*env)->GetArrayLength(env, profile_levels);
+                if (!profile_count) {
+                    found_codec = 1;
+                }
+                for (k = 0; k < profile_count; k++) {
+                    int supported_profile = 0;
+
+                    if (profile < 0) {
+                        found_codec = 1;
+                        break;
+                    }
+
+                    profile_level = (*env)->GetObjectArrayElement(env, profile_levels, k);
+                    if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                        goto done;
+                    }
+
+                    supported_profile = (*env)->GetIntField(env, profile_level, jfields.profile_id);
+                    if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                        goto done;
+                    }
+
+                    found_codec = profile == supported_profile;
+
+                    if (profile_level) {
+                        (*env)->DeleteLocalRef(env, profile_level);
+                        profile_level = NULL;
+                    }
+
+                    if (found_codec) {
+                        break;
+                    }
+                }
+            }
+
+done_with_type:
+            if (profile_levels) {
+                (*env)->DeleteLocalRef(env, profile_levels);
+                profile_levels = NULL;
+            }
+
+            if (capabilities) {
+                (*env)->DeleteLocalRef(env, capabilities);
+                capabilities = NULL;
+            }
+
+            if (type) {
+                (*env)->DeleteLocalRef(env, type);
+                type = NULL;
+            }
+
+            av_freep(&supported_type);
+
+            if (found_codec) {
+                break;
+            }
+
+            av_freep(&name);
+        }
+
+done_with_info:
+        if (info) {
+            (*env)->DeleteLocalRef(env, info);
+            info = NULL;
+        }
+
+        if (types) {
+            (*env)->DeleteLocalRef(env, types);
+            types = NULL;
+        }
+
+        if (found_codec) {
+            break;
+        }
+    }
+
+done:
+    if (format) {
+        (*env)->DeleteLocalRef(env, format);
+    }
+
+    if (codec) {
+        (*env)->DeleteLocalRef(env, codec);
+    }
+
+    if (codec_name) {
+        (*env)->DeleteLocalRef(env, codec_name);
+    }
+
+    if (info) {
+        (*env)->DeleteLocalRef(env, info);
+    }
+
+    if (type) {
+        (*env)->DeleteLocalRef(env, type);
+    }
+
+    if (types) {
+        (*env)->DeleteLocalRef(env, types);
+    }
+
+    if (capabilities) {
+        (*env)->DeleteLocalRef(env, capabilities);
+    }
+
+    if (profile_level) {
+        (*env)->DeleteLocalRef(env, profile_level);
+    }
+
+    if (profile_levels) {
+        (*env)->DeleteLocalRef(env, profile_levels);
+    }
+
+    av_freep(&supported_type);
+
+    ff_jni_reset_jfields(env, &jfields, jni_amediacodeclist_mapping, 0, log_ctx);
+    ff_jni_reset_jfields(env, &mediaformat_jfields, jni_amediaformat_mapping, 0, log_ctx);
+
+    if (!found_codec) {
+        av_freep(&name);
+    }
+
+    return name;
+}
+
+FFAMediaFormat *ff_AMediaFormat_new(void)
+{
+    JNIEnv *env = NULL;
+    FFAMediaFormat *format = NULL;
+    jobject object = NULL;
+
+    format = av_mallocz(sizeof(FFAMediaFormat));
+    if (!format) {
+        return NULL;
+    }
+    format->class = &amediaformat_class;
+
+    env = ff_jni_get_env(format);
+    if (!env) {
+        av_freep(&format);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format) < 0) {
+        goto fail;
+    }
+
+    object = (*env)->NewObject(env, format->jfields.mediaformat_class, format->jfields.init_id);
+    if (!object) {
+        goto fail;
+    }
+
+    format->object = (*env)->NewGlobalRef(env, object);
+    if (!format->object) {
+        goto fail;
+    }
+
+fail:
+    if (object) {
+        (*env)->DeleteLocalRef(env, object);
+    }
+
+    if (!format->object) {
+        ff_jni_reset_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format);
+        av_freep(&format);
+    }
+
+    return format;
+}
+
+static FFAMediaFormat *ff_AMediaFormat_newFromObject(void *object)
+{
+    JNIEnv *env = NULL;
+    FFAMediaFormat *format = NULL;
+
+    format = av_mallocz(sizeof(FFAMediaFormat));
+    if (!format) {
+        return NULL;
+    }
+    format->class = &amediaformat_class;
+
+    env = ff_jni_get_env(format);
+    if (!env) {
+        av_freep(&format);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format) < 0) {
+        goto fail;
+    }
+
+    format->object = (*env)->NewGlobalRef(env, object);
+    if (!format->object) {
+        goto fail;
+    }
+
+    return format;
+fail:
+    ff_jni_reset_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format);
+
+    av_freep(&format);
+
+    return NULL;
+}
+
+int ff_AMediaFormat_delete(FFAMediaFormat* format)
+{
+    int ret = 0;
+
+    JNIEnv *env = NULL;
+
+    if (!format) {
+        return 0;
+    }
+
+    JNI_GET_ENV_OR_RETURN(env, format, AVERROR_EXTERNAL);
+
+    (*env)->DeleteGlobalRef(env, format->object);
+    format->object = NULL;
+
+    ff_jni_reset_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format);
+
+    av_freep(&format);
+
+    return ret;
+}
+
+char* ff_AMediaFormat_toString(FFAMediaFormat* format)
+{
+    char *ret = NULL;
+
+    JNIEnv *env = NULL;
+    jstring description = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN(env, format, NULL);
+
+    description = (*env)->CallObjectMethod(env, format->object, format->jfields.to_string_id);
+    if (ff_jni_exception_check(env, 1, NULL) < 0) {
+        goto fail;
+    }
+
+    ret = ff_jni_jstring_to_utf_chars(env, description, format);
+fail:
+    if (description) {
+        (*env)->DeleteLocalRef(env, description);
+    }
+
+    return ret;
+}
+
+int ff_AMediaFormat_getInt32(FFAMediaFormat* format, const char *name, int32_t *out)
+{
+    int ret = 1;
+
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jboolean contains_key;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN(env, format, 0);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        ret = 0;
+        goto fail;
+    }
+
+    contains_key = (*env)->CallBooleanMethod(env, format->object, format->jfields.contains_key_id, key);
+    if (!contains_key || (ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    *out = (*env)->CallIntMethod(env, format->object, format->jfields.get_integer_id, key);
+    if ((ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    ret = 1;
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    return ret;
+}
+
+int ff_AMediaFormat_getInt64(FFAMediaFormat* format, const char *name, int64_t *out)
+{
+    int ret = 1;
+
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jboolean contains_key;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN(env, format, 0);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        ret = 0;
+        goto fail;
+    }
+
+    contains_key = (*env)->CallBooleanMethod(env, format->object, format->jfields.contains_key_id, key);
+    if (!contains_key || (ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    *out = (*env)->CallLongMethod(env, format->object, format->jfields.get_long_id, key);
+    if ((ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    ret = 1;
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    return ret;
+}
+
+int ff_AMediaFormat_getFloat(FFAMediaFormat* format, const char *name, float *out)
+{
+    int ret = 1;
+
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jboolean contains_key;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN(env, format, 0);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        ret = 0;
+        goto fail;
+    }
+
+    contains_key = (*env)->CallBooleanMethod(env, format->object, format->jfields.contains_key_id, key);
+    if (!contains_key || (ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    *out = (*env)->CallFloatMethod(env, format->object, format->jfields.get_float_id, key);
+    if ((ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    ret = 1;
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    return ret;
+}
+
+int ff_AMediaFormat_getBuffer(FFAMediaFormat* format, const char *name, void** data, size_t *size)
+{
+    int ret = 1;
+
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jboolean contains_key;
+    jobject result = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN(env, format, 0);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        ret = 0;
+        goto fail;
+    }
+
+    contains_key = (*env)->CallBooleanMethod(env, format->object, format->jfields.contains_key_id, key);
+    if (!contains_key || (ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    result = (*env)->CallObjectMethod(env, format->object, format->jfields.get_bytebuffer_id, key);
+    if ((ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    *data = (*env)->GetDirectBufferAddress(env, result);
+    *size = (*env)->GetDirectBufferCapacity(env, result);
+
+    if (*data && *size) {
+        void *src = *data;
+        *data = av_malloc(*size);
+        if (!*data) {
+            ret = 0;
+            goto fail;
+        }
+
+        memcpy(*data, src, *size);
+    }
+
+    ret = 1;
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    if (result) {
+        (*env)->DeleteLocalRef(env, result);
+    }
+
+    return ret;
+}
+
+int ff_AMediaFormat_getString(FFAMediaFormat* format, const char *name, const char **out)
+{
+    int ret = 1;
+
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jboolean contains_key;
+    jstring result = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN(env, format, 0);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        ret = 0;
+        goto fail;
+    }
+
+    contains_key = (*env)->CallBooleanMethod(env, format->object, format->jfields.contains_key_id, key);
+    if (!contains_key || (ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    result = (*env)->CallObjectMethod(env, format->object, format->jfields.get_string_id, key);
+    if ((ret = ff_jni_exception_check(env, 1, format)) < 0) {
+        ret = 0;
+        goto fail;
+    }
+
+    *out = ff_jni_jstring_to_utf_chars(env, result, format);
+    if (!*out) {
+        ret = 0;
+        goto fail;
+    }
+
+    ret = 1;
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    if (result) {
+        (*env)->DeleteLocalRef(env, result);
+    }
+
+    return ret;
+}
+
+void ff_AMediaFormat_setInt32(FFAMediaFormat* format, const char* name, int32_t value)
+{
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN_VOID(env, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_integer_id, key, value);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+}
+
+void ff_AMediaFormat_setInt64(FFAMediaFormat* format, const char* name, int64_t value)
+{
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN_VOID(env, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_long_id, key, value);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+}
+
+void ff_AMediaFormat_setFloat(FFAMediaFormat* format, const char* name, float value)
+{
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN_VOID(env, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_float_id, key, value);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+}
+
+void ff_AMediaFormat_setString(FFAMediaFormat* format, const char* name, const char* value)
+{
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jstring string = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN_VOID(env, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    string = ff_jni_utf_chars_to_jstring(env, value, format);
+    if (!string) {
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_string_id, key, string);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    if (string) {
+        (*env)->DeleteLocalRef(env, string);
+    }
+}
+
+void ff_AMediaFormat_setBuffer(FFAMediaFormat* format, const char* name, void* data, size_t size)
+{
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jobject buffer = NULL;
+    void *buffer_data = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_GET_ENV_OR_RETURN_VOID(env, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    if (!data || !size) {
+        goto fail;
+    }
+
+    buffer_data = av_malloc(size);
+    if (!buffer_data) {
+        goto fail;
+    }
+
+    memcpy(buffer_data, data, size);
+
+    buffer = (*env)->NewDirectByteBuffer(env, buffer_data, size);
+    if (!buffer) {
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_bytebuffer_id, key, buffer);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    if (buffer) {
+        (*env)->DeleteLocalRef(env, buffer);
+    }
+}
+
+static int codec_init_static_fields(FFAMediaCodec *codec)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    codec->INFO_TRY_AGAIN_LATER = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_try_again_later_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->BUFFER_FLAG_CODEC_CONFIG = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.buffer_flag_codec_config_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->BUFFER_FLAG_END_OF_STREAM = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.buffer_flag_end_of_stream_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    if (codec->jfields.buffer_flag_key_frame_id) {
+        codec->BUFFER_FLAG_KEY_FRAME = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.buffer_flag_key_frame_id);
+        if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+            goto fail;
+        }
+    }
+
+    codec->CONFIGURE_FLAG_ENCODE = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.configure_flag_encode_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->INFO_TRY_AGAIN_LATER = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_try_again_later_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->INFO_OUTPUT_BUFFERS_CHANGED = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_output_buffers_changed_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->INFO_OUTPUT_FORMAT_CHANGED = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_output_format_changed_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+fail:
+
+    return ret;
+}
+
+#define CREATE_CODEC_BY_NAME   0
+#define CREATE_DECODER_BY_TYPE 1
+#define CREATE_ENCODER_BY_TYPE 2
+
+static inline FFAMediaCodec *codec_create(int method, const char *arg)
+{
+    int ret = -1;
+    JNIEnv *env = NULL;
+    FFAMediaCodec *codec = NULL;
+    jstring jarg = NULL;
+    jobject object = NULL;
+    jobject buffer_info = NULL;
+    jmethodID create_id = NULL;
+
+    codec = av_mallocz(sizeof(FFAMediaCodec));
+    if (!codec) {
+        return NULL;
+    }
+    codec->class = &amediacodec_class;
+
+    env = ff_jni_get_env(codec);
+    if (!env) {
+        av_freep(&codec);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec) < 0) {
+        goto fail;
+    }
+
+    jarg = ff_jni_utf_chars_to_jstring(env, arg, codec);
+    if (!jarg) {
+        goto fail;
+    }
+
+    switch (method) {
+    case CREATE_CODEC_BY_NAME:   create_id = codec->jfields.create_by_codec_name_id;   break;
+    case CREATE_DECODER_BY_TYPE: create_id = codec->jfields.create_decoder_by_type_id; break;
+    case CREATE_ENCODER_BY_TYPE: create_id = codec->jfields.create_encoder_by_type_id; break;
+    default:
+        av_assert0(0);
+    }
+
+    object = (*env)->CallStaticObjectMethod(env,
+                                            codec->jfields.mediacodec_class,
+                                            create_id,
+                                            jarg);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    codec->object = (*env)->NewGlobalRef(env, object);
+    if (!codec->object) {
+        goto fail;
+    }
+
+    if (codec_init_static_fields(codec) < 0) {
+        goto fail;
+    }
+
+    if (codec->jfields.get_input_buffer_id && codec->jfields.get_output_buffer_id) {
+        codec->has_get_i_o_buffer = 1;
+    }
+
+    buffer_info = (*env)->NewObject(env, codec->jfields.mediainfo_class, codec->jfields.init_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    codec->buffer_info = (*env)->NewGlobalRef(env, buffer_info);
+    if (!codec->buffer_info) {
+        goto fail;
+    }
+
+    ret = 0;
+fail:
+    if (jarg) {
+        (*env)->DeleteLocalRef(env, jarg);
+    }
+
+    if (object) {
+        (*env)->DeleteLocalRef(env, object);
+    }
+
+    if (buffer_info) {
+        (*env)->DeleteLocalRef(env, buffer_info);
+    }
+
+    if (ret < 0) {
+        if (codec->object) {
+            (*env)->DeleteGlobalRef(env, codec->object);
+        }
+
+        if (codec->buffer_info) {
+            (*env)->DeleteGlobalRef(env, codec->buffer_info);
+        }
+
+        ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
+        av_freep(&codec);
+    }
+
+    return codec;
+}
+
+#define DECLARE_FF_AMEDIACODEC_CREATE_FUNC(name, method) \
+FFAMediaCodec *ff_AMediaCodec_##name(const char *arg)    \
+{                                                        \
+    return codec_create(method, arg);                    \
+}                                                        \
+
+DECLARE_FF_AMEDIACODEC_CREATE_FUNC(createCodecByName,   CREATE_CODEC_BY_NAME)
+DECLARE_FF_AMEDIACODEC_CREATE_FUNC(createDecoderByType, CREATE_DECODER_BY_TYPE)
+DECLARE_FF_AMEDIACODEC_CREATE_FUNC(createEncoderByType, CREATE_ENCODER_BY_TYPE)
+
+int ff_AMediaCodec_delete(FFAMediaCodec* codec)
+{
+    int ret = 0;
+
+    JNIEnv *env = NULL;
+
+    if (!codec) {
+        return 0;
+    }
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.release_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+    }
+
+    (*env)->DeleteGlobalRef(env, codec->object);
+    codec->object = NULL;
+
+    (*env)->DeleteGlobalRef(env, codec->buffer_info);
+    codec->buffer_info = NULL;
+
+    ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
+
+    av_freep(&codec);
+
+    return ret;
+}
+
+char *ff_AMediaCodec_getName(FFAMediaCodec *codec)
+{
+    char *ret = NULL;
+    JNIEnv *env = NULL;
+    jobject *name = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, NULL);
+
+    name = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_name_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    ret = ff_jni_jstring_to_utf_chars(env, name, codec);
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_configure(FFAMediaCodec* codec, const FFAMediaFormat* format, void* surface, void *crypto, uint32_t flags)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.configure_id, format->object, surface, NULL, flags);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_start(FFAMediaCodec* codec)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.start_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_stop(FFAMediaCodec* codec)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.stop_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_flush(FFAMediaCodec* codec)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.flush_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_releaseOutputBuffer(FFAMediaCodec* codec, size_t idx, int render)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.release_output_buffer_id, (jint)idx, (jboolean)render);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_releaseOutputBufferAtTime(FFAMediaCodec *codec, size_t idx, int64_t timestampNs)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.release_output_buffer_at_time_id, (jint)idx, (jlong)timestampNs);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+ssize_t ff_AMediaCodec_dequeueInputBuffer(FFAMediaCodec* codec, int64_t timeoutUs)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    ret = (*env)->CallIntMethod(env, codec->object, codec->jfields.dequeue_input_buffer_id, timeoutUs);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+int ff_AMediaCodec_queueInputBuffer(FFAMediaCodec* codec, size_t idx, off_t offset, size_t size, uint64_t time, uint32_t flags)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.queue_input_buffer_id, (jint)idx, (jint)offset, (jint)size, time, flags);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    return ret;
+}
+
+ssize_t ff_AMediaCodec_dequeueOutputBuffer(FFAMediaCodec* codec, FFAMediaCodecBufferInfo *info, int64_t timeoutUs)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, AVERROR_EXTERNAL);
+
+    ret = (*env)->CallIntMethod(env, codec->object, codec->jfields.dequeue_output_buffer_id, codec->buffer_info, timeoutUs);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        return AVERROR_EXTERNAL;
+    }
+
+    info->flags = (*env)->GetIntField(env, codec->buffer_info, codec->jfields.flags_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        return AVERROR_EXTERNAL;
+    }
+
+    info->offset = (*env)->GetIntField(env, codec->buffer_info, codec->jfields.offset_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        return AVERROR_EXTERNAL;
+    }
+
+    info->presentationTimeUs = (*env)->GetLongField(env, codec->buffer_info, codec->jfields.presentation_time_us_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        return AVERROR_EXTERNAL;
+    }
+
+    info->size = (*env)->GetIntField(env, codec->buffer_info, codec->jfields.size_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        return AVERROR_EXTERNAL;
+    }
+
+    return ret;
+}
+
+uint8_t* ff_AMediaCodec_getInputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size)
+{
+    uint8_t *ret = NULL;
+    JNIEnv *env = NULL;
+
+    jobject buffer = NULL;
+    jobject input_buffers = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, NULL);
+
+    if (codec->has_get_i_o_buffer) {
+        buffer = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_input_buffer_id, (jint)idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    } else {
+        if (!codec->input_buffers) {
+            input_buffers = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_input_buffers_id);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+
+            codec->input_buffers = (*env)->NewGlobalRef(env, input_buffers);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+        }
+
+        buffer = (*env)->GetObjectArrayElement(env, codec->input_buffers, idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    }
+
+    ret = (*env)->GetDirectBufferAddress(env, buffer);
+    *out_size = (*env)->GetDirectBufferCapacity(env, buffer);
+fail:
+    if (buffer) {
+        (*env)->DeleteLocalRef(env, buffer);
+    }
+
+    if (input_buffers) {
+        (*env)->DeleteLocalRef(env, input_buffers);
+    }
+
+    return ret;
+}
+
+uint8_t* ff_AMediaCodec_getOutputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size)
+{
+    uint8_t *ret = NULL;
+    JNIEnv *env = NULL;
+
+    jobject buffer = NULL;
+    jobject output_buffers = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, NULL);
+
+    if (codec->has_get_i_o_buffer) {
+        buffer = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_output_buffer_id, (jint)idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    } else {
+        if (!codec->output_buffers) {
+            output_buffers = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_output_buffers_id);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+
+            codec->output_buffers = (*env)->NewGlobalRef(env, output_buffers);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+        }
+
+        buffer = (*env)->GetObjectArrayElement(env, codec->output_buffers, idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    }
+
+    ret = (*env)->GetDirectBufferAddress(env, buffer);
+    *out_size = (*env)->GetDirectBufferCapacity(env, buffer);
+fail:
+    if (buffer) {
+        (*env)->DeleteLocalRef(env, buffer);
+    }
+
+    if (output_buffers) {
+        (*env)->DeleteLocalRef(env, output_buffers);
+    }
+
+    return ret;
+}
+
+FFAMediaFormat* ff_AMediaCodec_getOutputFormat(FFAMediaCodec* codec)
+{
+    FFAMediaFormat *ret = NULL;
+    JNIEnv *env = NULL;
+
+    jobject mediaformat = NULL;
+
+    JNI_GET_ENV_OR_RETURN(env, codec, NULL);
+
+    mediaformat = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_output_format_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    ret = ff_AMediaFormat_newFromObject(mediaformat);
+fail:
+    if (mediaformat) {
+        (*env)->DeleteLocalRef(env, mediaformat);
+    }
+
+    return ret;
+}
+
+int ff_AMediaCodec_infoTryAgainLater(FFAMediaCodec *codec, ssize_t idx)
+{
+    return idx == codec->INFO_TRY_AGAIN_LATER;
+}
+
+int ff_AMediaCodec_infoOutputBuffersChanged(FFAMediaCodec *codec, ssize_t idx)
+{
+    return idx == codec->INFO_OUTPUT_BUFFERS_CHANGED;
+}
+
+int ff_AMediaCodec_infoOutputFormatChanged(FFAMediaCodec *codec, ssize_t idx)
+{
+    return idx == codec->INFO_OUTPUT_FORMAT_CHANGED;
+}
+
+int ff_AMediaCodec_getBufferFlagCodecConfig(FFAMediaCodec *codec)
+{
+    return codec->BUFFER_FLAG_CODEC_CONFIG;
+}
+
+int ff_AMediaCodec_getBufferFlagEndOfStream(FFAMediaCodec *codec)
+{
+    return codec->BUFFER_FLAG_END_OF_STREAM;
+}
+
+int ff_AMediaCodec_getBufferFlagKeyFrame(FFAMediaCodec *codec)
+{
+    return codec->BUFFER_FLAG_KEY_FRAME;
+}
+
+int ff_AMediaCodec_getConfigureFlagEncode(FFAMediaCodec *codec)
+{
+    return codec->CONFIGURE_FLAG_ENCODE;
+}
+
+int ff_AMediaCodec_cleanOutputBuffers(FFAMediaCodec *codec)
+{
+    int ret = 0;
+
+    if (!codec->has_get_i_o_buffer) {
+        if (codec->output_buffers) {
+            JNIEnv *env = NULL;
+
+            env = ff_jni_get_env(codec);
+            if (!env) {
+                ret = AVERROR_EXTERNAL;
+                goto fail;
+            }
+
+            (*env)->DeleteGlobalRef(env, codec->output_buffers);
+            codec->output_buffers = NULL;
+        }
+    }
+
+fail:
+    return ret;
+}
+
+int ff_Build_SDK_INT(AVCodecContext *avctx)
+{
+    int ret = -1;
+    JNIEnv *env = NULL;
+    jclass versionClass;
+    jfieldID sdkIntFieldID;
+    JNI_GET_ENV_OR_RETURN(env, avctx, -1);
+
+    versionClass = (*env)->FindClass(env, "android/os/Build$VERSION");
+    sdkIntFieldID = (*env)->GetStaticFieldID(env, versionClass, "SDK_INT", "I");
+    ret = (*env)->GetStaticIntField(env, versionClass, sdkIntFieldID);
+    (*env)->DeleteLocalRef(env, versionClass);
+    return ret;
+}
diff --git a/libavcodec/mediacodec_wrapper.h b/libavcodec/mediacodec_wrapper.h
new file mode 100644
index 0000000..f0de16d
--- /dev/null
+++ b/libavcodec/mediacodec_wrapper.h
@@ -0,0 +1,129 @@
+/*
+ * Android MediaCodec Wrapper
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_WRAPPER_H
+#define AVCODEC_MEDIACODEC_WRAPPER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+/**
+ * The following API around MediaCodec and MediaFormat is based on the
+ * NDK one provided by Google since Android 5.0.
+ *
+ * Differences from the NDK API:
+ *
+ * Buffers returned by ff_AMediaFormat_toString and ff_AMediaFormat_getString
+ * are newly allocated buffer and must be freed by the user after use.
+ *
+ * The MediaCrypto API is not implemented.
+ *
+ * ff_AMediaCodec_infoTryAgainLater, ff_AMediaCodec_infoOutputBuffersChanged,
+ * ff_AMediaCodec_infoOutputFormatChanged, ff_AMediaCodec_cleanOutputBuffers
+ * ff_AMediaCodec_getName and ff_AMediaCodec_getBufferFlagEndOfStream are not
+ * part of the original NDK API and are convenience functions to hide JNI
+ * implementation.
+ *
+ * The API around MediaCodecList is not part of the NDK (and is lacking as
+ * we still need to retrieve the codec name to work around faulty decoders
+ * and encoders).
+ *
+ * For documentation, please refers to NdkMediaCodec.h NdkMediaFormat.h and
+ * http://developer.android.com/reference/android/media/MediaCodec.html.
+ *
+ */
+
+int ff_AMediaCodecProfile_getProfileFromAVCodecContext(AVCodecContext *avctx);
+
+char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int encoder, void *log_ctx);
+
+struct FFAMediaFormat;
+typedef struct FFAMediaFormat FFAMediaFormat;
+
+FFAMediaFormat *ff_AMediaFormat_new(void);
+int ff_AMediaFormat_delete(FFAMediaFormat* format);
+
+char* ff_AMediaFormat_toString(FFAMediaFormat* format);
+
+int ff_AMediaFormat_getInt32(FFAMediaFormat* format, const char *name, int32_t *out);
+int ff_AMediaFormat_getInt64(FFAMediaFormat* format, const char *name, int64_t *out);
+int ff_AMediaFormat_getFloat(FFAMediaFormat* format, const char *name, float *out);
+int ff_AMediaFormat_getBuffer(FFAMediaFormat* format, const char *name, void** data, size_t *size);
+int ff_AMediaFormat_getString(FFAMediaFormat* format, const char *name, const char **out);
+
+void ff_AMediaFormat_setInt32(FFAMediaFormat* format, const char* name, int32_t value);
+void ff_AMediaFormat_setInt64(FFAMediaFormat* format, const char* name, int64_t value);
+void ff_AMediaFormat_setFloat(FFAMediaFormat* format, const char* name, float value);
+void ff_AMediaFormat_setString(FFAMediaFormat* format, const char* name, const char* value);
+void ff_AMediaFormat_setBuffer(FFAMediaFormat* format, const char* name, void* data, size_t size);
+
+struct FFAMediaCodec;
+typedef struct FFAMediaCodec FFAMediaCodec;
+typedef struct FFAMediaCodecCryptoInfo FFAMediaCodecCryptoInfo;
+
+struct FFAMediaCodecBufferInfo {
+    int32_t offset;
+    int32_t size;
+    int64_t presentationTimeUs;
+    uint32_t flags;
+};
+typedef struct FFAMediaCodecBufferInfo FFAMediaCodecBufferInfo;
+
+char *ff_AMediaCodec_getName(FFAMediaCodec *codec);
+
+FFAMediaCodec* ff_AMediaCodec_createCodecByName(const char *name);
+FFAMediaCodec* ff_AMediaCodec_createDecoderByType(const char *mime_type);
+FFAMediaCodec* ff_AMediaCodec_createEncoderByType(const char *mime_type);
+
+int ff_AMediaCodec_configure(FFAMediaCodec* codec, const FFAMediaFormat* format, void* surface, void *crypto, uint32_t flags);
+int ff_AMediaCodec_start(FFAMediaCodec* codec);
+int ff_AMediaCodec_stop(FFAMediaCodec* codec);
+int ff_AMediaCodec_flush(FFAMediaCodec* codec);
+int ff_AMediaCodec_delete(FFAMediaCodec* codec);
+
+uint8_t* ff_AMediaCodec_getInputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size);
+uint8_t* ff_AMediaCodec_getOutputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size);
+
+ssize_t ff_AMediaCodec_dequeueInputBuffer(FFAMediaCodec* codec, int64_t timeoutUs);
+int ff_AMediaCodec_queueInputBuffer(FFAMediaCodec* codec, size_t idx, off_t offset, size_t size, uint64_t time, uint32_t flags);
+
+ssize_t ff_AMediaCodec_dequeueOutputBuffer(FFAMediaCodec* codec, FFAMediaCodecBufferInfo *info, int64_t timeoutUs);
+FFAMediaFormat* ff_AMediaCodec_getOutputFormat(FFAMediaCodec* codec);
+
+int ff_AMediaCodec_releaseOutputBuffer(FFAMediaCodec* codec, size_t idx, int render);
+int ff_AMediaCodec_releaseOutputBufferAtTime(FFAMediaCodec *codec, size_t idx, int64_t timestampNs);
+
+int ff_AMediaCodec_infoTryAgainLater(FFAMediaCodec *codec, ssize_t idx);
+int ff_AMediaCodec_infoOutputBuffersChanged(FFAMediaCodec *codec, ssize_t idx);
+int ff_AMediaCodec_infoOutputFormatChanged(FFAMediaCodec *codec, ssize_t indx);
+
+int ff_AMediaCodec_getBufferFlagCodecConfig (FFAMediaCodec *codec);
+int ff_AMediaCodec_getBufferFlagEndOfStream(FFAMediaCodec *codec);
+int ff_AMediaCodec_getBufferFlagKeyFrame(FFAMediaCodec *codec);
+
+int ff_AMediaCodec_getConfigureFlagEncode(FFAMediaCodec *codec);
+
+int ff_AMediaCodec_cleanOutputBuffers(FFAMediaCodec *codec);
+
+int ff_Build_SDK_INT(AVCodecContext *avctx);
+
+#endif /* AVCODEC_MEDIACODEC_WRAPPER_H */
diff --git a/libavcodec/mediacodecdec.c b/libavcodec/mediacodecdec.c
new file mode 100644
index 0000000..3a4240a
--- /dev/null
+++ b/libavcodec/mediacodecdec.c
@@ -0,0 +1,554 @@
+/*
+ * Android MediaCodec MPEG-2 / H.264 / H.265 / MPEG-4 / VP8 / VP9 decoders
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/internal.h"
+
+#include "avcodec.h"
+#include "decode.h"
+#include "h264_parse.h"
+#include "hevc_parse.h"
+#include "hwaccel.h"
+#include "internal.h"
+#include "mediacodec_wrapper.h"
+#include "mediacodecdec_common.h"
+
+typedef struct MediaCodecH264DecContext {
+
+    AVClass *avclass;
+
+    MediaCodecDecContext *ctx;
+
+    AVPacket buffered_pkt;
+
+    int delay_flush;
+    int amlogic_mpeg2_api23_workaround;
+
+} MediaCodecH264DecContext;
+
+static av_cold int mediacodec_decode_close(AVCodecContext *avctx)
+{
+    MediaCodecH264DecContext *s = avctx->priv_data;
+
+    ff_mediacodec_dec_close(avctx, s->ctx);
+    s->ctx = NULL;
+
+    av_packet_unref(&s->buffered_pkt);
+
+    return 0;
+}
+
+#if CONFIG_H264_MEDIACODEC_DECODER || CONFIG_HEVC_MEDIACODEC_DECODER
+static int h2645_ps_to_nalu(const uint8_t *src, int src_size, uint8_t **out, int *out_size)
+{
+    int i;
+    int ret = 0;
+    uint8_t *p = NULL;
+    static const uint8_t nalu_header[] = { 0x00, 0x00, 0x00, 0x01 };
+
+    if (!out || !out_size) {
+        return AVERROR(EINVAL);
+    }
+
+    p = av_malloc(sizeof(nalu_header) + src_size);
+    if (!p) {
+        return AVERROR(ENOMEM);
+    }
+
+    *out = p;
+    *out_size = sizeof(nalu_header) + src_size;
+
+    memcpy(p, nalu_header, sizeof(nalu_header));
+    memcpy(p + sizeof(nalu_header), src, src_size);
+
+    /* Escape 0x00, 0x00, 0x0{0-3} pattern */
+    for (i = 4; i < *out_size; i++) {
+        if (i < *out_size - 3 &&
+            p[i + 0] == 0 &&
+            p[i + 1] == 0 &&
+            p[i + 2] <= 3) {
+            uint8_t *new;
+
+            *out_size += 1;
+            new = av_realloc(*out, *out_size);
+            if (!new) {
+                ret = AVERROR(ENOMEM);
+                goto done;
+            }
+            *out = p = new;
+
+            i = i + 2;
+            memmove(p + i + 1, p + i, *out_size - (i + 1));
+            p[i] = 0x03;
+        }
+    }
+done:
+    if (ret < 0) {
+        av_freep(out);
+        *out_size = 0;
+    }
+
+    return ret;
+}
+#endif
+
+#if CONFIG_H264_MEDIACODEC_DECODER
+static int h264_set_extradata(AVCodecContext *avctx, FFAMediaFormat *format)
+{
+    int i;
+    int ret;
+
+    H264ParamSets ps;
+    const PPS *pps = NULL;
+    const SPS *sps = NULL;
+    int is_avc = 0;
+    int nal_length_size = 0;
+
+    memset(&ps, 0, sizeof(ps));
+
+    ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
+                                   &ps, &is_avc, &nal_length_size, 0, avctx);
+    if (ret < 0) {
+        goto done;
+    }
+
+    for (i = 0; i < MAX_PPS_COUNT; i++) {
+        if (ps.pps_list[i]) {
+            pps = (const PPS*)ps.pps_list[i]->data;
+            break;
+        }
+    }
+
+    if (pps) {
+        if (ps.sps_list[pps->sps_id]) {
+            sps = (const SPS*)ps.sps_list[pps->sps_id]->data;
+        }
+    }
+
+    if (pps && sps) {
+        uint8_t *data = NULL;
+        int data_size = 0;
+
+        if ((ret = h2645_ps_to_nalu(sps->data, sps->data_size, &data, &data_size)) < 0) {
+            goto done;
+        }
+        ff_AMediaFormat_setBuffer(format, "csd-0", (void*)data, data_size);
+        av_freep(&data);
+
+        if ((ret = h2645_ps_to_nalu(pps->data, pps->data_size, &data, &data_size)) < 0) {
+            goto done;
+        }
+        ff_AMediaFormat_setBuffer(format, "csd-1", (void*)data, data_size);
+        av_freep(&data);
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Could not extract PPS/SPS from extradata");
+        ret = AVERROR_INVALIDDATA;
+    }
+
+done:
+    ff_h264_ps_uninit(&ps);
+
+    return ret;
+}
+#endif
+
+#if CONFIG_HEVC_MEDIACODEC_DECODER
+static int hevc_set_extradata(AVCodecContext *avctx, FFAMediaFormat *format)
+{
+    int i;
+    int ret;
+
+    HEVCParamSets ps;
+    HEVCSEI sei;
+
+    const HEVCVPS *vps = NULL;
+    const HEVCPPS *pps = NULL;
+    const HEVCSPS *sps = NULL;
+    int is_nalff = 0;
+    int nal_length_size = 0;
+
+    uint8_t *vps_data = NULL;
+    uint8_t *sps_data = NULL;
+    uint8_t *pps_data = NULL;
+    int vps_data_size = 0;
+    int sps_data_size = 0;
+    int pps_data_size = 0;
+
+    memset(&ps, 0, sizeof(ps));
+    memset(&sei, 0, sizeof(sei));
+
+    ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
+                                   &ps, &sei, &is_nalff, &nal_length_size, 0, 1, avctx);
+    if (ret < 0) {
+        goto done;
+    }
+
+    for (i = 0; i < HEVC_MAX_VPS_COUNT; i++) {
+        if (ps.vps_list[i]) {
+            vps = (const HEVCVPS*)ps.vps_list[i]->data;
+            break;
+        }
+    }
+
+    for (i = 0; i < HEVC_MAX_PPS_COUNT; i++) {
+        if (ps.pps_list[i]) {
+            pps = (const HEVCPPS*)ps.pps_list[i]->data;
+            break;
+        }
+    }
+
+    if (pps) {
+        if (ps.sps_list[pps->sps_id]) {
+            sps = (const HEVCSPS*)ps.sps_list[pps->sps_id]->data;
+        }
+    }
+
+    if (vps && pps && sps) {
+        uint8_t *data;
+        int data_size;
+
+        if ((ret = h2645_ps_to_nalu(vps->data, vps->data_size, &vps_data, &vps_data_size)) < 0 ||
+            (ret = h2645_ps_to_nalu(sps->data, sps->data_size, &sps_data, &sps_data_size)) < 0 ||
+            (ret = h2645_ps_to_nalu(pps->data, pps->data_size, &pps_data, &pps_data_size)) < 0) {
+            goto done;
+        }
+
+        data_size = vps_data_size + sps_data_size + pps_data_size;
+        data = av_mallocz(data_size);
+        if (!data) {
+            ret = AVERROR(ENOMEM);
+            goto done;
+        }
+
+        memcpy(data                                , vps_data, vps_data_size);
+        memcpy(data + vps_data_size                , sps_data, sps_data_size);
+        memcpy(data + vps_data_size + sps_data_size, pps_data, pps_data_size);
+
+        ff_AMediaFormat_setBuffer(format, "csd-0", data, data_size);
+
+        av_freep(&data);
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Could not extract VPS/PPS/SPS from extradata");
+        ret = AVERROR_INVALIDDATA;
+    }
+
+done:
+    ff_hevc_ps_uninit(&ps);
+
+    av_freep(&vps_data);
+    av_freep(&sps_data);
+    av_freep(&pps_data);
+
+    return ret;
+}
+#endif
+
+#if CONFIG_MPEG2_MEDIACODEC_DECODER || \
+    CONFIG_MPEG4_MEDIACODEC_DECODER || \
+    CONFIG_VP8_MEDIACODEC_DECODER   || \
+    CONFIG_VP9_MEDIACODEC_DECODER
+static int common_set_extradata(AVCodecContext *avctx, FFAMediaFormat *format)
+{
+    int ret = 0;
+
+    if (avctx->extradata) {
+        ff_AMediaFormat_setBuffer(format, "csd-0", avctx->extradata, avctx->extradata_size);
+    }
+
+    return ret;
+}
+#endif
+
+static av_cold int mediacodec_decode_init(AVCodecContext *avctx)
+{
+    int ret;
+    int sdk_int;
+
+    const char *codec_mime = NULL;
+
+    FFAMediaFormat *format = NULL;
+    MediaCodecH264DecContext *s = avctx->priv_data;
+
+    format = ff_AMediaFormat_new();
+    if (!format) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create media format\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    switch (avctx->codec_id) {
+#if CONFIG_H264_MEDIACODEC_DECODER
+    case AV_CODEC_ID_H264:
+        codec_mime = "video/avc";
+
+        ret = h264_set_extradata(avctx, format);
+        if (ret < 0)
+            goto done;
+        break;
+#endif
+#if CONFIG_HEVC_MEDIACODEC_DECODER
+    case AV_CODEC_ID_HEVC:
+        codec_mime = "video/hevc";
+
+        ret = hevc_set_extradata(avctx, format);
+        if (ret < 0)
+            goto done;
+        break;
+#endif
+#if CONFIG_MPEG2_MEDIACODEC_DECODER
+    case AV_CODEC_ID_MPEG2VIDEO:
+        codec_mime = "video/mpeg2";
+
+        ret = common_set_extradata(avctx, format);
+        if (ret < 0)
+            goto done;
+        break;
+#endif
+#if CONFIG_MPEG4_MEDIACODEC_DECODER
+    case AV_CODEC_ID_MPEG4:
+        codec_mime = "video/mp4v-es",
+
+        ret = common_set_extradata(avctx, format);
+        if (ret < 0)
+            goto done;
+        break;
+#endif
+#if CONFIG_VP8_MEDIACODEC_DECODER
+    case AV_CODEC_ID_VP8:
+        codec_mime = "video/x-vnd.on2.vp8";
+
+        ret = common_set_extradata(avctx, format);
+        if (ret < 0)
+            goto done;
+        break;
+#endif
+#if CONFIG_VP9_MEDIACODEC_DECODER
+    case AV_CODEC_ID_VP9:
+        codec_mime = "video/x-vnd.on2.vp9";
+
+        ret = common_set_extradata(avctx, format);
+        if (ret < 0)
+            goto done;
+        break;
+#endif
+    default:
+        av_assert0(0);
+    }
+
+    ff_AMediaFormat_setString(format, "mime", codec_mime);
+    ff_AMediaFormat_setInt32(format, "width", avctx->width);
+    ff_AMediaFormat_setInt32(format, "height", avctx->height);
+
+    s->ctx = av_mallocz(sizeof(*s->ctx));
+    if (!s->ctx) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate MediaCodecDecContext\n");
+        ret = AVERROR(ENOMEM);
+        goto done;
+    }
+
+    s->ctx->delay_flush = s->delay_flush;
+
+    if ((ret = ff_mediacodec_dec_init(avctx, s->ctx, codec_mime, format)) < 0) {
+        s->ctx = NULL;
+        goto done;
+    }
+
+    av_log(avctx, AV_LOG_INFO,
+           "MediaCodec started successfully: codec = %s, ret = %d\n",
+           s->ctx->codec_name, ret);
+
+    sdk_int = ff_Build_SDK_INT(avctx);
+    if (sdk_int <= 23 &&
+        strcmp(s->ctx->codec_name, "OMX.amlogic.mpeg2.decoder.awesome") == 0) {
+        av_log(avctx, AV_LOG_INFO, "Enabling workaround for %s on API=%d\n",
+               s->ctx->codec_name, sdk_int);
+        s->amlogic_mpeg2_api23_workaround = 1;
+    }
+
+done:
+    if (format) {
+        ff_AMediaFormat_delete(format);
+    }
+
+    if (ret < 0) {
+        mediacodec_decode_close(avctx);
+    }
+
+    return ret;
+}
+
+static int mediacodec_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    MediaCodecH264DecContext *s = avctx->priv_data;
+    int ret;
+    ssize_t index;
+
+    /* In delay_flush mode, wait until the user has released or rendered
+       all retained frames. */
+    if (s->delay_flush && ff_mediacodec_dec_is_flushing(avctx, s->ctx)) {
+        if (!ff_mediacodec_dec_flush(avctx, s->ctx)) {
+            return AVERROR(EAGAIN);
+        }
+    }
+
+    /* poll for new frame */
+    ret = ff_mediacodec_dec_receive(avctx, s->ctx, frame, false);
+    if (ret != AVERROR(EAGAIN))
+        return ret;
+
+    /* feed decoder */
+    while (1) {
+        if (s->ctx->current_input_buffer < 0) {
+            /* poll for input space */
+            index = ff_AMediaCodec_dequeueInputBuffer(s->ctx->codec, 0);
+            if (index < 0) {
+                /* no space, block for an output frame to appear */
+                return ff_mediacodec_dec_receive(avctx, s->ctx, frame, true);
+            }
+            s->ctx->current_input_buffer = index;
+        }
+
+        /* try to flush any buffered packet data */
+        if (s->buffered_pkt.size > 0) {
+            ret = ff_mediacodec_dec_send(avctx, s->ctx, &s->buffered_pkt, false);
+            if (ret >= 0) {
+                s->buffered_pkt.size -= ret;
+                s->buffered_pkt.data += ret;
+                if (s->buffered_pkt.size <= 0)
+                    av_packet_unref(&s->buffered_pkt);
+            } else if (ret < 0 && ret != AVERROR(EAGAIN)) {
+                return ret;
+            }
+
+            if (s->amlogic_mpeg2_api23_workaround && s->buffered_pkt.size <= 0) {
+                /* fallthrough to fetch next packet regardless of input buffer space */
+            } else {
+                /* poll for space again */
+                continue;
+            }
+        }
+
+        /* fetch new packet or eof */
+        ret = ff_decode_get_packet(avctx, &s->buffered_pkt);
+        if (ret == AVERROR_EOF) {
+            AVPacket null_pkt = { 0 };
+            ret = ff_mediacodec_dec_send(avctx, s->ctx, &null_pkt, true);
+            if (ret < 0)
+                return ret;
+        } else if (ret == AVERROR(EAGAIN) && s->ctx->current_input_buffer < 0) {
+            return ff_mediacodec_dec_receive(avctx, s->ctx, frame, true);
+        } else if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return AVERROR(EAGAIN);
+}
+
+static void mediacodec_decode_flush(AVCodecContext *avctx)
+{
+    MediaCodecH264DecContext *s = avctx->priv_data;
+
+    av_packet_unref(&s->buffered_pkt);
+
+    ff_mediacodec_dec_flush(avctx, s->ctx);
+}
+
+static const AVCodecHWConfigInternal *mediacodec_hw_configs[] = {
+    &(const AVCodecHWConfigInternal) {
+        .public          = {
+            .pix_fmt     = AV_PIX_FMT_MEDIACODEC,
+            .methods     = AV_CODEC_HW_CONFIG_METHOD_AD_HOC |
+                           AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX,
+            .device_type = AV_HWDEVICE_TYPE_MEDIACODEC,
+        },
+        .hwaccel         = NULL,
+    },
+    NULL
+};
+
+#define OFFSET(x) offsetof(MediaCodecH264DecContext, x)
+#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption ff_mediacodec_vdec_options[] = {
+    { "delay_flush", "Delay flush until hw output buffers are returned to the decoder",
+                     OFFSET(delay_flush), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, VD },
+    { NULL }
+};
+
+#define DECLARE_MEDIACODEC_VCLASS(short_name)                   \
+static const AVClass ff_##short_name##_mediacodec_dec_class = { \
+    .class_name = #short_name "_mediacodec",                    \
+    .item_name  = av_default_item_name,                         \
+    .option     = ff_mediacodec_vdec_options,                   \
+    .version    = LIBAVUTIL_VERSION_INT,                        \
+};
+
+#define DECLARE_MEDIACODEC_VDEC(short_name, full_name, codec_id, bsf)                          \
+DECLARE_MEDIACODEC_VCLASS(short_name)                                                          \
+AVCodec ff_##short_name##_mediacodec_decoder = {                                               \
+    .name           = #short_name "_mediacodec",                                               \
+    .long_name      = NULL_IF_CONFIG_SMALL(full_name " Android MediaCodec decoder"),           \
+    .type           = AVMEDIA_TYPE_VIDEO,                                                      \
+    .id             = codec_id,                                                                \
+    .priv_class     = &ff_##short_name##_mediacodec_dec_class,                                 \
+    .priv_data_size = sizeof(MediaCodecH264DecContext),                                        \
+    .init           = mediacodec_decode_init,                                                  \
+    .receive_frame  = mediacodec_receive_frame,                                                \
+    .flush          = mediacodec_decode_flush,                                                 \
+    .close          = mediacodec_decode_close,                                                 \
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HARDWARE, \
+    .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS,                                               \
+    .bsfs           = bsf,                                                                     \
+    .hw_configs     = mediacodec_hw_configs,                                                   \
+    .wrapper_name   = "mediacodec",                                                            \
+};                                                                                             \
+
+#if CONFIG_H264_MEDIACODEC_DECODER
+DECLARE_MEDIACODEC_VDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb")
+#endif
+
+#if CONFIG_HEVC_MEDIACODEC_DECODER
+DECLARE_MEDIACODEC_VDEC(hevc, "H.265", AV_CODEC_ID_HEVC, "hevc_mp4toannexb")
+#endif
+
+#if CONFIG_MPEG2_MEDIACODEC_DECODER
+DECLARE_MEDIACODEC_VDEC(mpeg2, "MPEG-2", AV_CODEC_ID_MPEG2VIDEO, NULL)
+#endif
+
+#if CONFIG_MPEG4_MEDIACODEC_DECODER
+DECLARE_MEDIACODEC_VDEC(mpeg4, "MPEG-4", AV_CODEC_ID_MPEG4, NULL)
+#endif
+
+#if CONFIG_VP8_MEDIACODEC_DECODER
+DECLARE_MEDIACODEC_VDEC(vp8, "VP8", AV_CODEC_ID_VP8, NULL)
+#endif
+
+#if CONFIG_VP9_MEDIACODEC_DECODER
+DECLARE_MEDIACODEC_VDEC(vp9, "VP9", AV_CODEC_ID_VP9, NULL)
+#endif
diff --git a/libavcodec/mediacodecdec_common.c b/libavcodec/mediacodecdec_common.c
new file mode 100644
index 0000000..7c2661f
--- /dev/null
+++ b/libavcodec/mediacodecdec_common.c
@@ -0,0 +1,807 @@
+/*
+ * Android MediaCodec decoder
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include <sys/types.h>
+
+#include "libavutil/common.h"
+#include "libavutil/hwcontext_mediacodec.h"
+#include "libavutil/mem.h"
+#include "libavutil/log.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/time.h"
+#include "libavutil/timestamp.h"
+
+#include "avcodec.h"
+#include "internal.h"
+
+#include "mediacodec.h"
+#include "mediacodec_surface.h"
+#include "mediacodec_sw_buffer.h"
+#include "mediacodec_wrapper.h"
+#include "mediacodecdec_common.h"
+
+/**
+ * OMX.k3.video.decoder.avc, OMX.NVIDIA.* OMX.SEC.avc.dec and OMX.google
+ * codec workarounds used in various place are taken from the Gstreamer
+ * project.
+ *
+ * Gstreamer references:
+ * https://cgit.freedesktop.org/gstreamer/gst-plugins-bad/tree/sys/androidmedia/
+ *
+ * Gstreamer copyright notice:
+ *
+ * Copyright (C) 2012, Collabora Ltd.
+ *   Author: Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * Copyright (C) 2012, Rafaël Carré <funman@videolanorg>
+ *
+ * Copyright (C) 2015, Sebastian Dröge <sebastian@centricular.com>
+ *
+ * Copyright (C) 2014-2015, Collabora Ltd.
+ *   Author: Matthieu Bouron <matthieu.bouron@gcollabora.com>
+ *
+ * Copyright (C) 2015, Edward Hervey
+ *   Author: Edward Hervey <bilboed@gmail.com>
+ *
+ * Copyright (C) 2015, Matthew Waters <matthew@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#define INPUT_DEQUEUE_TIMEOUT_US 8000
+#define OUTPUT_DEQUEUE_TIMEOUT_US 8000
+#define OUTPUT_DEQUEUE_BLOCK_TIMEOUT_US 1000000
+
+enum {
+    COLOR_FormatYUV420Planar                              = 0x13,
+    COLOR_FormatYUV420SemiPlanar                          = 0x15,
+    COLOR_FormatYCbYCr                                    = 0x19,
+    COLOR_FormatAndroidOpaque                             = 0x7F000789,
+    COLOR_QCOM_FormatYUV420SemiPlanar                     = 0x7fa30c00,
+    COLOR_QCOM_FormatYUV420SemiPlanar32m                  = 0x7fa30c04,
+    COLOR_QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka = 0x7fa30c03,
+    COLOR_TI_FormatYUV420PackedSemiPlanar                 = 0x7f000100,
+    COLOR_TI_FormatYUV420PackedSemiPlanarInterlaced       = 0x7f000001,
+};
+
+static const struct {
+
+    int color_format;
+    enum AVPixelFormat pix_fmt;
+
+} color_formats[] = {
+
+    { COLOR_FormatYUV420Planar,                              AV_PIX_FMT_YUV420P },
+    { COLOR_FormatYUV420SemiPlanar,                          AV_PIX_FMT_NV12    },
+    { COLOR_QCOM_FormatYUV420SemiPlanar,                     AV_PIX_FMT_NV12    },
+    { COLOR_QCOM_FormatYUV420SemiPlanar32m,                  AV_PIX_FMT_NV12    },
+    { COLOR_QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka, AV_PIX_FMT_NV12    },
+    { COLOR_TI_FormatYUV420PackedSemiPlanar,                 AV_PIX_FMT_NV12    },
+    { COLOR_TI_FormatYUV420PackedSemiPlanarInterlaced,       AV_PIX_FMT_NV12    },
+    { 0 }
+};
+
+static enum AVPixelFormat mcdec_map_color_format(AVCodecContext *avctx,
+                                                 MediaCodecDecContext *s,
+                                                 int color_format)
+{
+    int i;
+    enum AVPixelFormat ret = AV_PIX_FMT_NONE;
+
+    if (s->surface) {
+        return AV_PIX_FMT_MEDIACODEC;
+    }
+
+    if (!strcmp(s->codec_name, "OMX.k3.video.decoder.avc") && color_format == COLOR_FormatYCbYCr) {
+        s->color_format = color_format = COLOR_TI_FormatYUV420PackedSemiPlanar;
+    }
+
+    for (i = 0; i < FF_ARRAY_ELEMS(color_formats); i++) {
+        if (color_formats[i].color_format == color_format) {
+            return color_formats[i].pix_fmt;
+        }
+    }
+
+    av_log(avctx, AV_LOG_ERROR, "Output color format 0x%x (value=%d) is not supported\n",
+        color_format, color_format);
+
+    return ret;
+}
+
+static void ff_mediacodec_dec_ref(MediaCodecDecContext *s)
+{
+    atomic_fetch_add(&s->refcount, 1);
+}
+
+static void ff_mediacodec_dec_unref(MediaCodecDecContext *s)
+{
+    if (!s)
+        return;
+
+    if (atomic_fetch_sub(&s->refcount, 1) == 1) {
+        if (s->codec) {
+            ff_AMediaCodec_delete(s->codec);
+            s->codec = NULL;
+        }
+
+        if (s->format) {
+            ff_AMediaFormat_delete(s->format);
+            s->format = NULL;
+        }
+
+        if (s->surface) {
+            ff_mediacodec_surface_unref(s->surface, NULL);
+            s->surface = NULL;
+        }
+
+        av_freep(&s->codec_name);
+        av_freep(&s);
+    }
+}
+
+static void mediacodec_buffer_release(void *opaque, uint8_t *data)
+{
+    AVMediaCodecBuffer *buffer = opaque;
+    MediaCodecDecContext *ctx = buffer->ctx;
+    int released = atomic_load(&buffer->released);
+
+    if (!released && (ctx->delay_flush || buffer->serial == atomic_load(&ctx->serial))) {
+        atomic_fetch_sub(&ctx->hw_buffer_count, 1);
+        av_log(ctx->avctx, AV_LOG_DEBUG,
+               "Releasing output buffer %zd (%p) ts=%"PRId64" on free() [%d pending]\n",
+               buffer->index, buffer, buffer->pts, atomic_load(&ctx->hw_buffer_count));
+        ff_AMediaCodec_releaseOutputBuffer(ctx->codec, buffer->index, 0);
+    }
+
+    if (ctx->delay_flush)
+        ff_mediacodec_dec_unref(ctx);
+    av_freep(&buffer);
+}
+
+static int mediacodec_wrap_hw_buffer(AVCodecContext *avctx,
+                                  MediaCodecDecContext *s,
+                                  ssize_t index,
+                                  FFAMediaCodecBufferInfo *info,
+                                  AVFrame *frame)
+{
+    int ret = 0;
+    int status = 0;
+    AVMediaCodecBuffer *buffer = NULL;
+
+    frame->buf[0] = NULL;
+    frame->width = avctx->width;
+    frame->height = avctx->height;
+    frame->format = avctx->pix_fmt;
+    frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
+
+    if (avctx->pkt_timebase.num && avctx->pkt_timebase.den) {
+        frame->pts = av_rescale_q(info->presentationTimeUs,
+                                      AV_TIME_BASE_Q,
+                                      avctx->pkt_timebase);
+    } else {
+        frame->pts = info->presentationTimeUs;
+    }
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+    frame->pkt_pts = frame->pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    frame->pkt_dts = AV_NOPTS_VALUE;
+
+    buffer = av_mallocz(sizeof(AVMediaCodecBuffer));
+    if (!buffer) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    atomic_init(&buffer->released, 0);
+
+    frame->buf[0] = av_buffer_create(NULL,
+                                     0,
+                                     mediacodec_buffer_release,
+                                     buffer,
+                                     AV_BUFFER_FLAG_READONLY);
+
+    if (!frame->buf[0]) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+
+    }
+
+    buffer->ctx = s;
+    buffer->serial = atomic_load(&s->serial);
+    if (s->delay_flush)
+        ff_mediacodec_dec_ref(s);
+
+    buffer->index = index;
+    buffer->pts = info->presentationTimeUs;
+
+    frame->data[3] = (uint8_t *)buffer;
+
+    atomic_fetch_add(&s->hw_buffer_count, 1);
+    av_log(avctx, AV_LOG_DEBUG,
+            "Wrapping output buffer %zd (%p) ts=%"PRId64" [%d pending]\n",
+            buffer->index, buffer, buffer->pts, atomic_load(&s->hw_buffer_count));
+
+    return 0;
+fail:
+    av_freep(buffer);
+    av_buffer_unref(&frame->buf[0]);
+    status = ff_AMediaCodec_releaseOutputBuffer(s->codec, index, 0);
+    if (status < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to release output buffer\n");
+        ret = AVERROR_EXTERNAL;
+    }
+
+    return ret;
+}
+
+static int mediacodec_wrap_sw_buffer(AVCodecContext *avctx,
+                                  MediaCodecDecContext *s,
+                                  uint8_t *data,
+                                  size_t size,
+                                  ssize_t index,
+                                  FFAMediaCodecBufferInfo *info,
+                                  AVFrame *frame)
+{
+    int ret = 0;
+    int status = 0;
+
+    frame->width = avctx->width;
+    frame->height = avctx->height;
+    frame->format = avctx->pix_fmt;
+
+    /* MediaCodec buffers needs to be copied to our own refcounted buffers
+     * because the flush command invalidates all input and output buffers.
+     */
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer\n");
+        goto done;
+    }
+
+    /* Override frame->pkt_pts as ff_get_buffer will override its value based
+     * on the last avpacket received which is not in sync with the frame:
+     *   * N avpackets can be pushed before 1 frame is actually returned
+     *   * 0-sized avpackets are pushed to flush remaining frames at EOS */
+    if (avctx->pkt_timebase.num && avctx->pkt_timebase.den) {
+        frame->pts = av_rescale_q(info->presentationTimeUs,
+                                      AV_TIME_BASE_Q,
+                                      avctx->pkt_timebase);
+    } else {
+        frame->pts = info->presentationTimeUs;
+    }
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+    frame->pkt_pts = frame->pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    frame->pkt_dts = AV_NOPTS_VALUE;
+
+    av_log(avctx, AV_LOG_TRACE,
+            "Frame: width=%d stride=%d height=%d slice-height=%d "
+            "crop-top=%d crop-bottom=%d crop-left=%d crop-right=%d encoder=%s "
+            "destination linesizes=%d,%d,%d\n" ,
+            avctx->width, s->stride, avctx->height, s->slice_height,
+            s->crop_top, s->crop_bottom, s->crop_left, s->crop_right, s->codec_name,
+            frame->linesize[0], frame->linesize[1], frame->linesize[2]);
+
+    switch (s->color_format) {
+    case COLOR_FormatYUV420Planar:
+        ff_mediacodec_sw_buffer_copy_yuv420_planar(avctx, s, data, size, info, frame);
+        break;
+    case COLOR_FormatYUV420SemiPlanar:
+    case COLOR_QCOM_FormatYUV420SemiPlanar:
+    case COLOR_QCOM_FormatYUV420SemiPlanar32m:
+        ff_mediacodec_sw_buffer_copy_yuv420_semi_planar(avctx, s, data, size, info, frame);
+        break;
+    case COLOR_TI_FormatYUV420PackedSemiPlanar:
+    case COLOR_TI_FormatYUV420PackedSemiPlanarInterlaced:
+        ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar(avctx, s, data, size, info, frame);
+        break;
+    case COLOR_QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka:
+        ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar_64x32Tile2m8ka(avctx, s, data, size, info, frame);
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported color format 0x%x (value=%d)\n",
+            s->color_format, s->color_format);
+        ret = AVERROR(EINVAL);
+        goto done;
+    }
+
+    ret = 0;
+done:
+    status = ff_AMediaCodec_releaseOutputBuffer(s->codec, index, 0);
+    if (status < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to release output buffer\n");
+        ret = AVERROR_EXTERNAL;
+    }
+
+    return ret;
+}
+
+#define AMEDIAFORMAT_GET_INT32(name, key, mandatory) do {                              \
+    int32_t value = 0;                                                                 \
+    if (ff_AMediaFormat_getInt32(s->format, key, &value)) {                            \
+        (name) = value;                                                                \
+    } else if (mandatory) {                                                            \
+        av_log(avctx, AV_LOG_ERROR, "Could not get %s from format %s\n", key, format); \
+        ret = AVERROR_EXTERNAL;                                                        \
+        goto fail;                                                                     \
+    }                                                                                  \
+} while (0)                                                                            \
+
+static int mediacodec_dec_parse_format(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    int ret = 0;
+    int width = 0;
+    int height = 0;
+    char *format = NULL;
+
+    if (!s->format) {
+        av_log(avctx, AV_LOG_ERROR, "Output MediaFormat is not set\n");
+        return AVERROR(EINVAL);
+    }
+
+    format = ff_AMediaFormat_toString(s->format);
+    if (!format) {
+        return AVERROR_EXTERNAL;
+    }
+    av_log(avctx, AV_LOG_DEBUG, "Parsing MediaFormat %s\n", format);
+
+    /* Mandatory fields */
+    AMEDIAFORMAT_GET_INT32(s->width,  "width", 1);
+    AMEDIAFORMAT_GET_INT32(s->height, "height", 1);
+
+    AMEDIAFORMAT_GET_INT32(s->stride, "stride", 0);
+    s->stride = s->stride > 0 ? s->stride : s->width;
+
+    AMEDIAFORMAT_GET_INT32(s->slice_height, "slice-height", 0);
+
+    if (strstr(s->codec_name, "OMX.Nvidia.") && s->slice_height == 0) {
+        s->slice_height = FFALIGN(s->height, 16);
+    } else if (strstr(s->codec_name, "OMX.SEC.avc.dec")) {
+        s->slice_height = avctx->height;
+        s->stride = avctx->width;
+    } else if (s->slice_height == 0) {
+        s->slice_height = s->height;
+    }
+
+    AMEDIAFORMAT_GET_INT32(s->color_format, "color-format", 1);
+    avctx->pix_fmt = mcdec_map_color_format(avctx, s, s->color_format);
+    if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
+        av_log(avctx, AV_LOG_ERROR, "Output color format is not supported\n");
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    /* Optional fields */
+    AMEDIAFORMAT_GET_INT32(s->crop_top,    "crop-top",    0);
+    AMEDIAFORMAT_GET_INT32(s->crop_bottom, "crop-bottom", 0);
+    AMEDIAFORMAT_GET_INT32(s->crop_left,   "crop-left",   0);
+    AMEDIAFORMAT_GET_INT32(s->crop_right,  "crop-right",  0);
+
+    width = s->crop_right + 1 - s->crop_left;
+    height = s->crop_bottom + 1 - s->crop_top;
+
+    AMEDIAFORMAT_GET_INT32(s->display_width,  "display-width",  0);
+    AMEDIAFORMAT_GET_INT32(s->display_height, "display-height", 0);
+
+    if (s->display_width && s->display_height) {
+        AVRational sar = av_div_q(
+            (AVRational){ s->display_width, s->display_height },
+            (AVRational){ width, height });
+        ff_set_sar(avctx, sar);
+    }
+
+    av_log(avctx, AV_LOG_INFO,
+        "Output crop parameters top=%d bottom=%d left=%d right=%d, "
+        "resulting dimensions width=%d height=%d\n",
+        s->crop_top, s->crop_bottom, s->crop_left, s->crop_right,
+        width, height);
+
+    av_freep(&format);
+    return ff_set_dimensions(avctx, width, height);
+fail:
+    av_freep(&format);
+    return ret;
+}
+
+static int mediacodec_dec_flush_codec(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    FFAMediaCodec *codec = s->codec;
+    int status;
+
+    s->output_buffer_count = 0;
+
+    s->draining = 0;
+    s->flushing = 0;
+    s->eos = 0;
+    atomic_fetch_add(&s->serial, 1);
+    atomic_init(&s->hw_buffer_count, 0);
+    s->current_input_buffer = -1;
+
+    status = ff_AMediaCodec_flush(codec);
+    if (status < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to flush codec\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+int ff_mediacodec_dec_init(AVCodecContext *avctx, MediaCodecDecContext *s,
+                           const char *mime, FFAMediaFormat *format)
+{
+    int ret = 0;
+    int status;
+    int profile;
+
+    enum AVPixelFormat pix_fmt;
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_MEDIACODEC,
+        AV_PIX_FMT_NONE,
+    };
+
+    s->avctx = avctx;
+    atomic_init(&s->refcount, 1);
+    atomic_init(&s->hw_buffer_count, 0);
+    atomic_init(&s->serial, 1);
+    s->current_input_buffer = -1;
+
+    pix_fmt = ff_get_format(avctx, pix_fmts);
+    if (pix_fmt == AV_PIX_FMT_MEDIACODEC) {
+        AVMediaCodecContext *user_ctx = avctx->hwaccel_context;
+
+        if (avctx->hw_device_ctx) {
+            AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)(avctx->hw_device_ctx->data);
+            if (device_ctx->type == AV_HWDEVICE_TYPE_MEDIACODEC) {
+                if (device_ctx->hwctx) {
+                    AVMediaCodecDeviceContext *mediacodec_ctx = (AVMediaCodecDeviceContext *)device_ctx->hwctx;
+                    s->surface = ff_mediacodec_surface_ref(mediacodec_ctx->surface, avctx);
+                    av_log(avctx, AV_LOG_INFO, "Using surface %p\n", s->surface);
+                }
+            }
+        }
+
+        if (!s->surface && user_ctx && user_ctx->surface) {
+            s->surface = ff_mediacodec_surface_ref(user_ctx->surface, avctx);
+            av_log(avctx, AV_LOG_INFO, "Using surface %p\n", s->surface);
+        }
+    }
+
+    profile = ff_AMediaCodecProfile_getProfileFromAVCodecContext(avctx);
+    if (profile < 0) {
+        av_log(avctx, AV_LOG_WARNING, "Unsupported or unknown profile\n");
+    }
+
+    s->codec_name = ff_AMediaCodecList_getCodecNameByType(mime, profile, 0, avctx);
+    if (!s->codec_name) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "Found decoder %s\n", s->codec_name);
+    s->codec = ff_AMediaCodec_createCodecByName(s->codec_name);
+    if (!s->codec) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create media decoder for type %s and name %s\n", mime, s->codec_name);
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    status = ff_AMediaCodec_configure(s->codec, format, s->surface, NULL, 0);
+    if (status < 0) {
+        char *desc = ff_AMediaFormat_toString(format);
+        av_log(avctx, AV_LOG_ERROR,
+            "Failed to configure codec (status = %d) with format %s\n",
+            status, desc);
+        av_freep(&desc);
+
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    status = ff_AMediaCodec_start(s->codec);
+    if (status < 0) {
+        char *desc = ff_AMediaFormat_toString(format);
+        av_log(avctx, AV_LOG_ERROR,
+            "Failed to start codec (status = %d) with format %s\n",
+            status, desc);
+        av_freep(&desc);
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    s->format = ff_AMediaCodec_getOutputFormat(s->codec);
+    if (s->format) {
+        if ((ret = mediacodec_dec_parse_format(avctx, s)) < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                "Failed to configure context\n");
+            goto fail;
+        }
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "MediaCodec %p started successfully\n", s->codec);
+
+    return 0;
+
+fail:
+    av_log(avctx, AV_LOG_ERROR, "MediaCodec %p failed to start\n", s->codec);
+    ff_mediacodec_dec_close(avctx, s);
+    return ret;
+}
+
+int ff_mediacodec_dec_send(AVCodecContext *avctx, MediaCodecDecContext *s,
+                           AVPacket *pkt, bool wait)
+{
+    int offset = 0;
+    int need_draining = 0;
+    uint8_t *data;
+    ssize_t index = s->current_input_buffer;
+    size_t size;
+    FFAMediaCodec *codec = s->codec;
+    int status;
+    int64_t input_dequeue_timeout_us = wait ? INPUT_DEQUEUE_TIMEOUT_US : 0;
+    int64_t pts;
+
+    if (s->flushing) {
+        av_log(avctx, AV_LOG_ERROR, "Decoder is flushing and cannot accept new buffer "
+                                    "until all output buffers have been released\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    if (pkt->size == 0) {
+        need_draining = 1;
+    }
+
+    if (s->draining && s->eos) {
+        return AVERROR_EOF;
+    }
+
+    while (offset < pkt->size || (need_draining && !s->draining)) {
+        if (index < 0) {
+            index = ff_AMediaCodec_dequeueInputBuffer(codec, input_dequeue_timeout_us);
+            if (ff_AMediaCodec_infoTryAgainLater(codec, index)) {
+                av_log(avctx, AV_LOG_TRACE, "No input buffer available, try again later\n");
+                break;
+            }
+
+            if (index < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to dequeue input buffer (status=%zd)\n", index);
+                return AVERROR_EXTERNAL;
+            }
+        }
+        s->current_input_buffer = -1;
+
+        data = ff_AMediaCodec_getInputBuffer(codec, index, &size);
+        if (!data) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to get input buffer\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        pts = pkt->pts;
+        if (pts != AV_NOPTS_VALUE && avctx->pkt_timebase.num && avctx->pkt_timebase.den) {
+            pts = av_rescale_q(pts, avctx->pkt_timebase, AV_TIME_BASE_Q);
+        }
+
+        if (need_draining) {
+            uint32_t flags = ff_AMediaCodec_getBufferFlagEndOfStream(codec);
+
+            av_log(avctx, AV_LOG_DEBUG, "Sending End Of Stream signal\n");
+
+            status = ff_AMediaCodec_queueInputBuffer(codec, index, 0, 0, pts, flags);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to queue input empty buffer (status = %d)\n", status);
+                return AVERROR_EXTERNAL;
+            }
+
+            av_log(avctx, AV_LOG_TRACE,
+                   "Queued input buffer %zd size=%zd ts=%"PRIi64"\n", index, size, pts);
+
+            s->draining = 1;
+            break;
+        } else {
+            size = FFMIN(pkt->size - offset, size);
+            memcpy(data, pkt->data + offset, size);
+            offset += size;
+
+            status = ff_AMediaCodec_queueInputBuffer(codec, index, 0, size, pts, 0);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to queue input buffer (status = %d)\n", status);
+                return AVERROR_EXTERNAL;
+            }
+
+            av_log(avctx, AV_LOG_TRACE,
+                   "Queued input buffer %zd size=%zd ts=%"PRIi64"\n", index, size, pts);
+        }
+    }
+
+    if (offset == 0)
+        return AVERROR(EAGAIN);
+    return offset;
+}
+
+int ff_mediacodec_dec_receive(AVCodecContext *avctx, MediaCodecDecContext *s,
+                              AVFrame *frame, bool wait)
+{
+    int ret;
+    uint8_t *data;
+    ssize_t index;
+    size_t size;
+    FFAMediaCodec *codec = s->codec;
+    FFAMediaCodecBufferInfo info = { 0 };
+    int status;
+    int64_t output_dequeue_timeout_us = OUTPUT_DEQUEUE_TIMEOUT_US;
+
+    if (s->draining && s->eos) {
+        return AVERROR_EOF;
+    }
+
+    if (s->draining) {
+        /* If the codec is flushing or need to be flushed, block for a fair
+         * amount of time to ensure we got a frame */
+        output_dequeue_timeout_us = OUTPUT_DEQUEUE_BLOCK_TIMEOUT_US;
+    } else if (s->output_buffer_count == 0 || !wait) {
+        /* If the codec hasn't produced any frames, do not block so we
+         * can push data to it as fast as possible, and get the first
+         * frame */
+        output_dequeue_timeout_us = 0;
+    }
+
+    index = ff_AMediaCodec_dequeueOutputBuffer(codec, &info, output_dequeue_timeout_us);
+    if (index >= 0) {
+        av_log(avctx, AV_LOG_TRACE, "Got output buffer %zd"
+                " offset=%" PRIi32 " size=%" PRIi32 " ts=%" PRIi64
+                " flags=%" PRIu32 "\n", index, info.offset, info.size,
+                info.presentationTimeUs, info.flags);
+
+        if (info.flags & ff_AMediaCodec_getBufferFlagEndOfStream(codec)) {
+            s->eos = 1;
+        }
+
+        if (info.size) {
+            if (s->surface) {
+                if ((ret = mediacodec_wrap_hw_buffer(avctx, s, index, &info, frame)) < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "Failed to wrap MediaCodec buffer\n");
+                    return ret;
+                }
+            } else {
+                data = ff_AMediaCodec_getOutputBuffer(codec, index, &size);
+                if (!data) {
+                    av_log(avctx, AV_LOG_ERROR, "Failed to get output buffer\n");
+                    return AVERROR_EXTERNAL;
+                }
+
+                if ((ret = mediacodec_wrap_sw_buffer(avctx, s, data, size, index, &info, frame)) < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "Failed to wrap MediaCodec buffer\n");
+                    return ret;
+                }
+            }
+
+            s->output_buffer_count++;
+            return 0;
+        } else {
+            status = ff_AMediaCodec_releaseOutputBuffer(codec, index, 0);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to release output buffer\n");
+            }
+        }
+
+    } else if (ff_AMediaCodec_infoOutputFormatChanged(codec, index)) {
+        char *format = NULL;
+
+        if (s->format) {
+            status = ff_AMediaFormat_delete(s->format);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to delete MediaFormat %p\n", s->format);
+            }
+        }
+
+        s->format = ff_AMediaCodec_getOutputFormat(codec);
+        if (!s->format) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to get output format\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        format = ff_AMediaFormat_toString(s->format);
+        if (!format) {
+            return AVERROR_EXTERNAL;
+        }
+        av_log(avctx, AV_LOG_INFO, "Output MediaFormat changed to %s\n", format);
+        av_freep(&format);
+
+        if ((ret = mediacodec_dec_parse_format(avctx, s)) < 0) {
+            return ret;
+        }
+
+    } else if (ff_AMediaCodec_infoOutputBuffersChanged(codec, index)) {
+        ff_AMediaCodec_cleanOutputBuffers(codec);
+    } else if (ff_AMediaCodec_infoTryAgainLater(codec, index)) {
+        if (s->draining) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to dequeue output buffer within %" PRIi64 "ms "
+                                        "while draining remaining frames, output will probably lack frames\n",
+                                        output_dequeue_timeout_us / 1000);
+        } else {
+            av_log(avctx, AV_LOG_TRACE, "No output buffer available, try again later\n");
+        }
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Failed to dequeue output buffer (status=%zd)\n", index);
+        return AVERROR_EXTERNAL;
+    }
+
+    return AVERROR(EAGAIN);
+}
+
+/*
+* ff_mediacodec_dec_flush returns 0 if the flush cannot be performed on
+* the codec (because the user retains frames). The codec stays in the
+* flushing state.
+*
+* ff_mediacodec_dec_flush returns 1 if the flush can actually be
+* performed on the codec. The codec leaves the flushing state and can
+* process again packets.
+*
+* ff_mediacodec_dec_flush returns a negative value if an error has
+* occurred.
+*/
+int ff_mediacodec_dec_flush(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    if (!s->surface || atomic_load(&s->refcount) == 1) {
+        int ret;
+
+        /* No frames (holding a reference to the codec) are retained by the
+         * user, thus we can flush the codec and returns accordingly */
+        if ((ret = mediacodec_dec_flush_codec(avctx, s)) < 0) {
+            return ret;
+        }
+
+        return 1;
+    }
+
+    s->flushing = 1;
+    return 0;
+}
+
+int ff_mediacodec_dec_close(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    ff_mediacodec_dec_unref(s);
+
+    return 0;
+}
+
+int ff_mediacodec_dec_is_flushing(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    return s->flushing;
+}
diff --git a/libavcodec/mediacodecdec_common.h b/libavcodec/mediacodecdec_common.h
new file mode 100644
index 0000000..0b21129
--- /dev/null
+++ b/libavcodec/mediacodecdec_common.h
@@ -0,0 +1,109 @@
+/*
+ * Android MediaCodec decoder
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODECDEC_COMMON_H
+#define AVCODEC_MEDIACODECDEC_COMMON_H
+
+#include <stdint.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "libavutil/frame.h"
+#include "libavutil/pixfmt.h"
+
+#include "avcodec.h"
+#include "mediacodec_wrapper.h"
+
+typedef struct MediaCodecDecContext {
+
+    AVCodecContext *avctx;
+    atomic_int refcount;
+    atomic_int hw_buffer_count;
+
+    char *codec_name;
+
+    FFAMediaCodec *codec;
+    FFAMediaFormat *format;
+
+    void *surface;
+
+    int started;
+    int draining;
+    int flushing;
+    int eos;
+
+    int width;
+    int height;
+    int stride;
+    int slice_height;
+    int color_format;
+    int crop_top;
+    int crop_bottom;
+    int crop_left;
+    int crop_right;
+    int display_width;
+    int display_height;
+
+    uint64_t output_buffer_count;
+    ssize_t current_input_buffer;
+
+    bool delay_flush;
+    atomic_int serial;
+
+} MediaCodecDecContext;
+
+int ff_mediacodec_dec_init(AVCodecContext *avctx,
+                           MediaCodecDecContext *s,
+                           const char *mime,
+                           FFAMediaFormat *format);
+
+int ff_mediacodec_dec_send(AVCodecContext *avctx,
+                           MediaCodecDecContext *s,
+                           AVPacket *pkt,
+                           bool wait);
+
+int ff_mediacodec_dec_receive(AVCodecContext *avctx,
+                              MediaCodecDecContext *s,
+                              AVFrame *frame,
+                              bool wait);
+
+int ff_mediacodec_dec_flush(AVCodecContext *avctx,
+                            MediaCodecDecContext *s);
+
+int ff_mediacodec_dec_close(AVCodecContext *avctx,
+                            MediaCodecDecContext *s);
+
+int ff_mediacodec_dec_is_flushing(AVCodecContext *avctx,
+                                  MediaCodecDecContext *s);
+
+typedef struct MediaCodecBuffer {
+
+    MediaCodecDecContext *ctx;
+    ssize_t index;
+    int64_t pts;
+    atomic_int released;
+    int serial;
+
+} MediaCodecBuffer;
+
+#endif /* AVCODEC_MEDIACODECDEC_COMMON_H */
diff --git a/libavcodec/metasound.c b/libavcodec/metasound.c
index 4cd9051..87cd7cb 100644
--- a/libavcodec/metasound.c
+++ b/libavcodec/metasound.c
@@ -4,20 +4,20 @@
  * based on TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,8 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "fft.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "lsp.h"
 #include "sinewin.h"
@@ -149,7 +149,7 @@ static void dec_bark_env(TwinVQContext *tctx, const uint8_t *in, int use_hist,
         }
 }
 
-static void read_cb_data(TwinVQContext *tctx, BitstreamContext *bc,
+static void read_cb_data(TwinVQContext *tctx, GetBitContext *gb,
                          uint8_t *dst, enum TwinVQFrameType ftype)
 {
     int i;
@@ -157,8 +157,8 @@ static void read_cb_data(TwinVQContext *tctx, BitstreamContext *bc,
     for (i = 0; i < tctx->n_div[ftype]; i++) {
         int bs_second_part = (i >= tctx->bits_main_spec_change[ftype]);
 
-        *dst++ = bitstream_read(bc, tctx->bits_main_spec[0][ftype][bs_second_part]);
-        *dst++ = bitstream_read(bc, tctx->bits_main_spec[1][ftype][bs_second_part]);
+        *dst++ = get_bits(gb, tctx->bits_main_spec[0][ftype][bs_second_part]);
+        *dst++ = get_bits(gb, tctx->bits_main_spec[1][ftype][bs_second_part]);
     }
 }
 
@@ -169,16 +169,17 @@ static int metasound_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
     const TwinVQModeTab *mtab = tctx->mtab;
     int channels              = tctx->avctx->channels;
     int sub;
-    BitstreamContext bc;
-    int i, j, k;
+    GetBitContext gb;
+    int i, j, k, ret;
 
-    bitstream_init8(&bc, buf, buf_size);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
     for (tctx->cur_frame = 0; tctx->cur_frame < tctx->frames_per_packet;
          tctx->cur_frame++) {
         bits = tctx->bits + tctx->cur_frame;
 
-        bits->window_type = bitstream_read(&bc, TWINVQ_WINDOW_TYPE_BITS);
+        bits->window_type = get_bits(&gb, TWINVQ_WINDOW_TYPE_BITS);
 
         if (bits->window_type > 8) {
             av_log(avctx, AV_LOG_ERROR, "Invalid window type, broken sample?\n");
@@ -190,54 +191,54 @@ static int metasound_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
         sub = mtab->fmode[bits->ftype].sub;
 
         if (bits->ftype != TWINVQ_FT_SHORT && !tctx->is_6kbps)
-            bitstream_read(&bc, 2);
+            get_bits(&gb, 2);
 
-        read_cb_data(tctx, &bc, bits->main_coeffs, bits->ftype);
+        read_cb_data(tctx, &gb, bits->main_coeffs, bits->ftype);
 
         for (i = 0; i < channels; i++)
             for (j = 0; j < sub; j++)
                 for (k = 0; k < mtab->fmode[bits->ftype].bark_n_coef; k++)
                     bits->bark1[i][j][k] =
-                        bitstream_read(&bc, mtab->fmode[bits->ftype].bark_n_bit);
+                        get_bits(&gb, mtab->fmode[bits->ftype].bark_n_bit);
 
         for (i = 0; i < channels; i++)
             for (j = 0; j < sub; j++)
-                bits->bark_use_hist[i][j] = bitstream_read_bit(&bc);
+                bits->bark_use_hist[i][j] = get_bits1(&gb);
 
         if (bits->ftype == TWINVQ_FT_LONG) {
             for (i = 0; i < channels; i++)
-                bits->gain_bits[i] = bitstream_read(&bc, TWINVQ_GAIN_BITS);
+                bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS);
         } else {
             for (i = 0; i < channels; i++) {
-                bits->gain_bits[i] = bitstream_read(&bc, TWINVQ_GAIN_BITS);
+                bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS);
                 for (j = 0; j < sub; j++)
                     bits->sub_gain_bits[i * sub + j] =
-                        bitstream_read(&bc, TWINVQ_SUB_GAIN_BITS);
+                        get_bits(&gb, TWINVQ_SUB_GAIN_BITS);
             }
         }
 
         for (i = 0; i < channels; i++) {
-            bits->lpc_hist_idx[i] = bitstream_read(&bc, mtab->lsp_bit0);
-            bits->lpc_idx1[i]     = bitstream_read(&bc, mtab->lsp_bit1);
+            bits->lpc_hist_idx[i] = get_bits(&gb, mtab->lsp_bit0);
+            bits->lpc_idx1[i]     = get_bits(&gb, mtab->lsp_bit1);
 
             for (j = 0; j < mtab->lsp_split; j++)
-                bits->lpc_idx2[i][j] = bitstream_read(&bc, mtab->lsp_bit2);
+                bits->lpc_idx2[i][j] = get_bits(&gb, mtab->lsp_bit2);
         }
 
         if (bits->ftype == TWINVQ_FT_LONG) {
-            read_cb_data(tctx, &bc, bits->ppc_coeffs, 3);
+            read_cb_data(tctx, &gb, bits->ppc_coeffs, 3);
             for (i = 0; i < channels; i++) {
-                bits->p_coef[i] = bitstream_read(&bc, mtab->ppc_period_bit);
-                bits->g_coef[i] = bitstream_read(&bc, mtab->pgain_bit);
+                bits->p_coef[i] = get_bits(&gb, mtab->ppc_period_bit);
+                bits->g_coef[i] = get_bits(&gb, mtab->pgain_bit);
             }
         }
 
         // subframes are aligned to nibbles
-        if (bitstream_tell(&bc) & 3)
-            bitstream_skip(&bc, 4 - (bitstream_tell(&bc) & 3));
+        if (get_bits_count(&gb) & 3)
+            skip_bits(&gb, 4 - (get_bits_count(&gb) & 3));
     }
 
-    return 0;
+    return (get_bits_count(&gb) + 7) / 8;
 }
 
 typedef struct MetasoundProps {
diff --git a/libavcodec/metasound_data.c b/libavcodec/metasound_data.c
index b399b75..da2548a 100644
--- a/libavcodec/metasound_data.c
+++ b/libavcodec/metasound_data.c
@@ -2,20 +2,20 @@
  * MetaSound decoder
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -11208,6 +11208,14 @@ static const int16_t fcb16m[] = {
       -688,   -209,    915,    622,  -1038,   -474,   -343,    -91,
       -173,   -104,    255,     96,   1547,    773,   -625,   2272,
        -90,   -509,   -527,   -247,   -147,   -234,    -45,    166,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
 };
 
 static const int16_t fcb16sl[] = {
diff --git a/libavcodec/metasound_data.h b/libavcodec/metasound_data.h
index 4925516..5c33411 100644
--- a/libavcodec/metasound_data.h
+++ b/libavcodec/metasound_data.h
@@ -2,20 +2,20 @@
  * MetaSound decoder
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/microdvddec.c b/libavcodec/microdvddec.c
new file mode 100644
index 0000000..dad0ec8
--- /dev/null
+++ b/libavcodec/microdvddec.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * MicroDVD subtitle decoder
+ *
+ * Based on the specifications found here:
+ * https://trac.videolan.org/vlc/ticket/1825#comment:6
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/bprint.h"
+#include "avcodec.h"
+#include "ass.h"
+
+static int indexof(const char *s, int c)
+{
+    char *f = strchr(s, c);
+    return f ? (f - s) : -1;
+}
+
+struct microdvd_tag {
+    char key;
+    int persistent;
+    uint32_t data1;
+    uint32_t data2;
+    char *data_string;
+    int data_string_len;
+};
+
+#define MICRODVD_PERSISTENT_OFF     0
+#define MICRODVD_PERSISTENT_ON      1
+#define MICRODVD_PERSISTENT_OPENED  2
+
+// Color, Font, Size, cHarset, stYle, Position, cOordinate
+#define MICRODVD_TAGS "cfshyYpo"
+
+static void microdvd_set_tag(struct microdvd_tag *tags, struct microdvd_tag tag)
+{
+    int tag_index = indexof(MICRODVD_TAGS, tag.key);
+
+    if (tag_index < 0)
+        return;
+    memcpy(&tags[tag_index], &tag, sizeof(tag));
+}
+
+// italic, bold, underline, strike-through
+#define MICRODVD_STYLES "ibus"
+
+/* some samples have lines that start with a / indicating non persistent italic
+ * marker */
+static char *check_for_italic_slash_marker(struct microdvd_tag *tags, char *s)
+{
+    if (*s == '/') {
+        struct microdvd_tag tag = tags[indexof(MICRODVD_TAGS, 'y')];
+        tag.key = 'y';
+        tag.data1 |= 1 << 0 /* 'i' position in MICRODVD_STYLES */;
+        microdvd_set_tag(tags, tag);
+        s++;
+    }
+    return s;
+}
+
+static char *microdvd_load_tags(struct microdvd_tag *tags, char *s)
+{
+    s = check_for_italic_slash_marker(tags, s);
+
+    while (*s == '{') {
+        char *start = s;
+        char tag_char = *(s + 1);
+        struct microdvd_tag tag = {0};
+
+        if (!tag_char || *(s + 2) != ':')
+            break;
+        s += 3;
+
+        switch (tag_char) {
+
+        /* Style */
+        case 'Y':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 'y':
+            while (*s && *s != '}' && s - start < 256) {
+                int style_index = indexof(MICRODVD_STYLES, *s);
+
+                if (style_index >= 0)
+                    tag.data1 |= (1 << style_index);
+                s++;
+            }
+            if (*s != '}')
+                break;
+            /* We must distinguish persistent and non-persistent styles
+             * to handle this kind of style tags: {y:ib}{Y:us} */
+            tag.key = tag_char;
+            break;
+
+        /* Color */
+        case 'C':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 'c':
+            while (*s == '$' || *s == '#')
+                s++;
+            tag.data1 = strtol(s, &s, 16) & 0x00ffffff;
+            if (*s != '}')
+                break;
+            tag.key = 'c';
+            break;
+
+        /* Font name */
+        case 'F':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 'f': {
+            int len = indexof(s, '}');
+            if (len < 0)
+                break;
+            tag.data_string = s;
+            tag.data_string_len = len;
+            s += len;
+            tag.key = 'f';
+            break;
+        }
+
+        /* Font size */
+        case 'S':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 's':
+            tag.data1 = strtol(s, &s, 10);
+            if (*s != '}')
+                break;
+            tag.key = 's';
+            break;
+
+        /* Charset */
+        case 'H': {
+            //TODO: not yet handled, just parsed.
+            int len = indexof(s, '}');
+            if (len < 0)
+                break;
+            tag.data_string = s;
+            tag.data_string_len = len;
+            s += len;
+            tag.key = 'h';
+            break;
+        }
+
+        /* Position */
+        case 'P':
+            if (!*s)
+                break;
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+            tag.data1 = (*s++ == '1');
+            if (*s != '}')
+                break;
+            tag.key = 'p';
+            break;
+
+        /* Coordinates */
+        case 'o':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+            tag.data1 = strtol(s, &s, 10);
+            if (*s != ',')
+                break;
+            s++;
+            tag.data2 = strtol(s, &s, 10);
+            if (*s != '}')
+                break;
+            tag.key = 'o';
+            break;
+
+        default:    /* Unknown tag, we consider it's text */
+            break;
+        }
+
+        if (tag.key == 0)
+            return start;
+
+        microdvd_set_tag(tags, tag);
+        s++;
+    }
+    return check_for_italic_slash_marker(tags, s);
+}
+
+static void microdvd_open_tags(AVBPrint *new_line, struct microdvd_tag *tags)
+{
+    int i, sidx;
+    for (i = 0; i < sizeof(MICRODVD_TAGS) - 1; i++) {
+        if (tags[i].persistent == MICRODVD_PERSISTENT_OPENED)
+            continue;
+        switch (tags[i].key) {
+        case 'Y':
+        case 'y':
+            for (sidx = 0; sidx < sizeof(MICRODVD_STYLES) - 1; sidx++)
+                if (tags[i].data1 & (1 << sidx))
+                    av_bprintf(new_line, "{\\%c1}", MICRODVD_STYLES[sidx]);
+            break;
+
+        case 'c':
+            av_bprintf(new_line, "{\\c&H%06"PRIX32"&}", tags[i].data1);
+            break;
+
+        case 'f':
+            av_bprintf(new_line, "{\\fn%.*s}",
+                       tags[i].data_string_len, tags[i].data_string);
+            break;
+
+        case 's':
+            av_bprintf(new_line, "{\\fs%"PRId32"}", tags[i].data1);
+            break;
+
+        case 'p':
+            if (tags[i].data1 == 0)
+                av_bprintf(new_line, "{\\an8}");
+            break;
+
+        case 'o':
+            av_bprintf(new_line, "{\\pos(%"PRId32",%"PRId32")}",
+                       tags[i].data1, tags[i].data2);
+            break;
+        }
+        if (tags[i].persistent == MICRODVD_PERSISTENT_ON)
+            tags[i].persistent = MICRODVD_PERSISTENT_OPENED;
+    }
+}
+
+static void microdvd_close_no_persistent_tags(AVBPrint *new_line,
+                                              struct microdvd_tag *tags)
+{
+    int i, sidx;
+
+    for (i = sizeof(MICRODVD_TAGS) - 2; i >= 0; i--) {
+        if (tags[i].persistent != MICRODVD_PERSISTENT_OFF)
+            continue;
+        switch (tags[i].key) {
+
+        case 'y':
+            for (sidx = sizeof(MICRODVD_STYLES) - 2; sidx >= 0; sidx--)
+                if (tags[i].data1 & (1 << sidx))
+                    av_bprintf(new_line, "{\\%c0}", MICRODVD_STYLES[sidx]);
+            break;
+
+        case 'c':
+            av_bprintf(new_line, "{\\c}");
+            break;
+
+        case 'f':
+            av_bprintf(new_line, "{\\fn}");
+            break;
+
+        case 's':
+            av_bprintf(new_line, "{\\fs}");
+            break;
+        }
+        tags[i].key = 0;
+    }
+}
+
+static int microdvd_decode_frame(AVCodecContext *avctx,
+                                 void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    AVBPrint new_line;
+    char *line = avpkt->data;
+    char *end = avpkt->data + avpkt->size;
+    FFASSDecoderContext *s = avctx->priv_data;
+    struct microdvd_tag tags[sizeof(MICRODVD_TAGS) - 1] = {{0}};
+
+    if (avpkt->size <= 0)
+        return avpkt->size;
+
+    av_bprint_init(&new_line, 0, 2048);
+
+    // subtitle content
+    while (line < end && *line) {
+
+        // parse MicroDVD tags, and open them in ASS
+        line = microdvd_load_tags(tags, line);
+        microdvd_open_tags(&new_line, tags);
+
+        // simple copy until EOL or forced carriage return
+        while (line < end && *line && *line != '|') {
+            av_bprint_chars(&new_line, *line, 1);
+            line++;
+        }
+
+        // line split
+        if (line < end && *line == '|') {
+            microdvd_close_no_persistent_tags(&new_line, tags);
+            av_bprintf(&new_line, "\\N");
+            line++;
+        }
+    }
+    if (new_line.len) {
+        int ret = ff_ass_add_rect(sub, new_line.str, s->readorder++, 0, NULL, NULL);
+        av_bprint_finalize(&new_line, NULL);
+        if (ret < 0)
+            return ret;
+    }
+
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static int microdvd_init(AVCodecContext *avctx)
+{
+    int i, sidx;
+    AVBPrint font_buf;
+    int font_size    = ASS_DEFAULT_FONT_SIZE;
+    int color        = ASS_DEFAULT_COLOR;
+    int bold         = ASS_DEFAULT_BOLD;
+    int italic       = ASS_DEFAULT_ITALIC;
+    int underline    = ASS_DEFAULT_UNDERLINE;
+    int alignment    = ASS_DEFAULT_ALIGNMENT;
+    struct microdvd_tag tags[sizeof(MICRODVD_TAGS) - 1] = {{0}};
+
+    av_bprint_init(&font_buf, 0, AV_BPRINT_SIZE_AUTOMATIC);
+    av_bprintf(&font_buf, "%s", ASS_DEFAULT_FONT);
+
+    if (avctx->extradata) {
+        microdvd_load_tags(tags, avctx->extradata);
+        for (i = 0; i < sizeof(MICRODVD_TAGS) - 1; i++) {
+            switch (av_tolower(tags[i].key)) {
+            case 'y':
+                for (sidx = 0; sidx < sizeof(MICRODVD_STYLES) - 1; sidx++) {
+                    if (tags[i].data1 & (1 << sidx)) {
+                        switch (MICRODVD_STYLES[sidx]) {
+                        case 'i': italic    = 1; break;
+                        case 'b': bold      = 1; break;
+                        case 'u': underline = 1; break;
+                        }
+                    }
+                }
+                break;
+
+            case 'c': color     = tags[i].data1; break;
+            case 's': font_size = tags[i].data1; break;
+            case 'p': alignment =             8; break;
+
+            case 'f':
+                av_bprint_clear(&font_buf);
+                av_bprintf(&font_buf, "%.*s",
+                           tags[i].data_string_len, tags[i].data_string);
+                break;
+            }
+        }
+    }
+    return ff_ass_subtitle_header(avctx, font_buf.str, font_size, color,
+                                  ASS_DEFAULT_BACK_COLOR, bold, italic,
+                                  underline, ASS_DEFAULT_BORDERSTYLE,
+                                  alignment);
+}
+
+AVCodec ff_microdvd_decoder = {
+    .name         = "microdvd",
+    .long_name    = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_MICRODVD,
+    .init         = microdvd_init,
+    .decode       = microdvd_decode_frame,
+    .flush        = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index da8289e..1d463e9 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2005  Ole André Vadla Ravnås <oleavr@gmail.com>
  * Copyright (C) 2008  Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,15 +24,14 @@
 #include <stdint.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "bytestream.h"
 #include "bswapdsp.h"
 #include "hpeldsp.h"
 #include "idctdsp.h"
 #include "thread.h"
-#include "vlc.h"
 
 #define MIMIC_HEADER_SIZE   20
 
@@ -50,9 +49,9 @@ typedef struct MimicContext {
 
     ThreadFrame     frames     [16];
 
-    DECLARE_ALIGNED(16, int16_t, dct_block)[64];
+    DECLARE_ALIGNED(32, int16_t, dct_block)[64];
 
-    BitstreamContext bc;
+    GetBitContext   gb;
     ScanTable       scantable;
     BlockDSPContext bdsp;
     BswapDSPContext bbdsp;
@@ -120,7 +119,8 @@ static av_cold int mimic_decode_end(AVCodecContext *avctx)
     MimicContext *ctx = avctx->priv_data;
     int i;
 
-    av_free(ctx->swap_buf);
+    av_freep(&ctx->swap_buf);
+    ctx->swap_buf_size = 0;
 
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->frames); i++) {
         if (ctx->frames[i].f)
@@ -149,7 +149,7 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "error initializing vlc table\n");
         return ret;
     }
-    ff_blockdsp_init(&ctx->bdsp);
+    ff_blockdsp_init(&ctx->bdsp, avctx);
     ff_bswapdsp_init(&ctx->bbdsp);
     ff_hpeldsp_init(&ctx->hdsp, avctx->flags);
     ff_idctdsp_init(&ctx->idsp, avctx);
@@ -166,6 +166,7 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCodecContext *avctx_from)
 {
     MimicContext *dst = avctx->priv_data, *src = avctx_from->priv_data;
@@ -179,7 +180,7 @@ static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCod
 
     for (i = 0; i < FF_ARRAY_ELEMS(dst->frames); i++) {
         ff_thread_release_buffer(avctx, &dst->frames[i]);
-        if (src->frames[i].f->data[0]) {
+        if (i != src->next_cur_index && src->frames[i].f->data[0]) {
             ret = ff_thread_ref_frame(&dst->frames[i], &src->frames[i]);
             if (ret < 0)
                 return ret;
@@ -188,6 +189,7 @@ static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCod
 
     return 0;
 }
+#endif
 
 static const int8_t vlcdec_lookup[9][64] = {
     {    0, },
@@ -233,14 +235,14 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale)
 
     ctx->bdsp.clear_block(block);
 
-    block[0] = bitstream_read(&ctx->bc, 8) << 3;
+    block[0] = get_bits(&ctx->gb, 8) << 3;
 
     for (pos = 1; pos < num_coeffs; pos++) {
         uint32_t vlc, num_bits;
         int value;
         int coeff;
 
-        vlc = bitstream_read_vlc(&ctx->bc, ctx->vlc.table, ctx->vlc.bits, 3);
+        vlc = get_vlc2(&ctx->gb, ctx->vlc.table, ctx->vlc.bits, 3);
         if (!vlc) /* end-of-block code */
             return 0;
         if (vlc == -1)
@@ -253,14 +255,14 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale)
         if (pos >= 64)
             return AVERROR_INVALIDDATA;
 
-        value = bitstream_read(&ctx->bc, num_bits);
+        value = get_bits(&ctx->gb, num_bits);
 
-        /* Libav's IDCT behaves somewhat different from the original code, so
+        /* FFmpeg's IDCT behaves somewhat different from the original code, so
          * a factor of 4 was added to the input */
 
-        coeff = vlcdec_lookup[num_bits][value];
+        coeff = ((int8_t*)vlcdec_lookup[num_bits])[value];
         if (pos < 3)
-            coeff <<= 4;
+            coeff *= 16;
         else /* TODO Use >> 10 instead of / 1001 */
             coeff = (coeff * qscale) / 1001;
 
@@ -287,13 +289,13 @@ static int decode(MimicContext *ctx, int quality, int num_coeffs,
             for (x = 0; x < ctx->num_hblocks[plane]; x++) {
                 /* Check for a change condition in the current block.
                  * - iframes always change.
-                 * - Luma plane changes on bitstream_read_bit == 0
-                 * - Chroma planes change on bitstream_read_bit == 1 */
-                if (is_iframe || bitstream_read_bit(&ctx->bc) == is_chroma) {
+                 * - Luma plane changes on get_bits1 == 0
+                 * - Chroma planes change on get_bits1 == 1 */
+                if (is_iframe || get_bits1(&ctx->gb) == is_chroma) {
                     /* Luma planes may use a backreference from the 15 last
-                     * frames preceding the previous. (bitstream_read_bit == 1)
+                     * frames preceding the previous. (get_bits1 == 1)
                      * Chroma planes don't use backreferences. */
-                    if (is_chroma || is_iframe || !bitstream_read_bit(&ctx->bc)) {
+                    if (is_chroma || is_iframe || !get_bits1(&ctx->gb)) {
                         if ((ret = vlc_decode_block(ctx, num_coeffs,
                                                     qscale)) < 0) {
                             av_log(ctx->avctx, AV_LOG_ERROR, "Error decoding "
@@ -302,7 +304,7 @@ static int decode(MimicContext *ctx, int quality, int num_coeffs,
                         }
                         ctx->idsp.idct_put(dst, stride, ctx->dct_block);
                     } else {
-                        unsigned int backref = bitstream_read(&ctx->bc, 4);
+                        unsigned int backref = get_bits(&ctx->gb, 4);
                         int index            = (ctx->cur_index + backref) & 15;
                         uint8_t *p           = ctx->frames[index].f->data[0];
 
@@ -388,9 +390,11 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
             return AVERROR_INVALIDDATA;
         }
 
+        res = ff_set_dimensions(avctx, width, height);
+        if (res < 0)
+            return res;
+
         ctx->avctx     = avctx;
-        avctx->width   = width;
-        avctx->height  = height;
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
         for (i = 0; i < 3; i++) {
             ctx->num_vblocks[i] = AV_CEIL_RSHIFT(height,   3 + !!i);
@@ -410,10 +414,8 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     ctx->frames[ctx->cur_index].f->pict_type = is_pframe ? AV_PICTURE_TYPE_P :
                                                            AV_PICTURE_TYPE_I;
     if ((res = ff_thread_get_buffer(avctx, &ctx->frames[ctx->cur_index],
-                                    AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                                    AV_GET_BUFFER_FLAG_REF)) < 0)
         return res;
-    }
 
     ctx->next_prev_index = ctx->cur_index;
     ctx->next_cur_index  = (ctx->cur_index - 1) & 15;
@@ -427,7 +429,7 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     ctx->bbdsp.bswap_buf(ctx->swap_buf,
                          (const uint32_t *) (buf + MIMIC_HEADER_SIZE),
                          swap_buf_size >> 2);
-    bitstream_init8(&ctx->bc, ctx->swap_buf, swap_buf_size);
+    init_get_bits(&ctx->gb, ctx->swap_buf, swap_buf_size << 3);
 
     res = decode(ctx, quality, num_coeffs, !is_pframe);
     ff_thread_report_progress(&ctx->frames[ctx->cur_index], INT_MAX, 0);
@@ -449,6 +451,7 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     return buf_size;
 }
 
+#if HAVE_THREADS
 static av_cold int mimic_init_thread_copy(AVCodecContext *avctx)
 {
     MimicContext *ctx = avctx->priv_data;
@@ -464,6 +467,7 @@ static av_cold int mimic_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 AVCodec ff_mimic_decoder = {
     .name                  = "mimic",
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
new file mode 100644
index 0000000..c5b54d5
--- /dev/null
+++ b/libavcodec/mips/Makefile
@@ -0,0 +1,91 @@
+MIPSFPU-OBJS-$(CONFIG_AMRNB_DECODER)      += mips/acelp_filters_mips.o     \
+                                             mips/celp_filters_mips.o      \
+                                             mips/celp_math_mips.o         \
+                                             mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER)      += mips/acelp_filters_mips.o     \
+                                             mips/celp_filters_mips.o      \
+                                             mips/amrwbdec_mips.o          \
+                                             mips/celp_math_mips.o         \
+                                             mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_float.o
+MIPSDSP-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_fixed.o
+MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
+MIPSFPU-OBJS-$(CONFIG_FMTCONVERT)         += mips/fmtconvert_mips.o
+OBJS-$(CONFIG_AC3DSP)                     += mips/ac3dsp_mips.o
+OBJS-$(CONFIG_AAC_DECODER)                += mips/aacdec_mips.o            \
+                                             mips/aacsbr_mips.o            \
+                                             mips/sbrdsp_mips.o            \
+                                             mips/aacpsdsp_mips.o
+MIPSDSP-OBJS-$(CONFIG_AAC_ENCODER)        += mips/aaccoder_mips.o
+MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER)        += mips/iirfilter_mips.o
+OBJS-$(CONFIG_HEVC_DECODER)               += mips/hevcdsp_init_mips.o      \
+                                             mips/hevcpred_init_mips.o
+OBJS-$(CONFIG_VP9_DECODER)                += mips/vp9dsp_init_mips.o
+OBJS-$(CONFIG_VP8_DECODER)                += mips/vp8dsp_init_mips.o
+OBJS-$(CONFIG_VP3DSP)                     += mips/vp3dsp_init_mips.o
+OBJS-$(CONFIG_H264DSP)                    += mips/h264dsp_init_mips.o
+OBJS-$(CONFIG_H264QPEL)                   += mips/h264qpel_init_mips.o
+OBJS-$(CONFIG_H264CHROMA)                 += mips/h264chroma_init_mips.o
+OBJS-$(CONFIG_H264PRED)                   += mips/h264pred_init_mips.o
+OBJS-$(CONFIG_H263DSP)                    += mips/h263dsp_init_mips.o
+OBJS-$(CONFIG_QPELDSP)                    += mips/qpeldsp_init_mips.o
+OBJS-$(CONFIG_HPELDSP)                    += mips/hpeldsp_init_mips.o
+OBJS-$(CONFIG_BLOCKDSP)                   += mips/blockdsp_init_mips.o
+OBJS-$(CONFIG_PIXBLOCKDSP)                += mips/pixblockdsp_init_mips.o
+OBJS-$(CONFIG_IDCTDSP)                    += mips/idctdsp_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEO)                  += mips/mpegvideo_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEOENC)               += mips/mpegvideoencdsp_init_mips.o
+OBJS-$(CONFIG_ME_CMP)                     += mips/me_cmp_init_mips.o
+OBJS-$(CONFIG_MPEG4_DECODER)              += mips/xvididct_init_mips.o
+OBJS-$(CONFIG_VC1DSP)                     += mips/vc1dsp_init_mips.o
+OBJS-$(CONFIG_WMV2DSP)                    += mips/wmv2dsp_init_mips.o
+OBJS-$(CONFIG_VIDEODSP)                   += mips/videodsp_init.o
+MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
+                                             mips/hevc_mc_uni_msa.o        \
+                                             mips/hevc_mc_uniw_msa.o       \
+                                             mips/hevc_mc_bi_msa.o         \
+                                             mips/hevc_mc_biw_msa.o        \
+                                             mips/hevc_idct_msa.o          \
+                                             mips/hevc_lpf_sao_msa.o       \
+                                             mips/hevcpred_msa.o
+MSA-OBJS-$(CONFIG_VP9_DECODER)            += mips/vp9_mc_msa.o             \
+                                             mips/vp9_lpf_msa.o            \
+                                             mips/vp9_idct_msa.o           \
+                                             mips/vp9_intra_msa.o
+MSA-OBJS-$(CONFIG_VP8_DECODER)            += mips/vp8_mc_msa.o             \
+                                             mips/vp8_idct_msa.o           \
+                                             mips/vp8_lpf_msa.o
+MSA-OBJS-$(CONFIG_VP3DSP)                 += mips/vp3dsp_idct_msa.o
+MSA-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_msa.o            \
+                                             mips/h264idct_msa.o
+MSA-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_msa.o
+MSA-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_msa.o
+MSA-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_msa.o
+MSA-OBJS-$(CONFIG_H263DSP)                += mips/h263dsp_msa.o
+MSA-OBJS-$(CONFIG_QPELDSP)                += mips/qpeldsp_msa.o
+MSA-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_msa.o
+MSA-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_msa.o
+MSA-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_msa.o
+MSA-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_msa.o           \
+                                             mips/simple_idct_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEOENC)           += mips/mpegvideoencdsp_msa.o
+MSA-OBJS-$(CONFIG_ME_CMP)                 += mips/me_cmp_msa.o
+MMI-OBJS                                  += mips/constants.o
+MMI-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_mmi.o
+MMI-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_mmi.o
+MMI-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_mmi.o
+MMI-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_mmi.o           \
+                                             mips/simple_idct_mmi.o
+MMI-OBJS-$(CONFIG_MPEG4_DECODER)          += mips/xvid_idct_mmi.o
+MMI-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_mmi.o
+MMI-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_mmi.o
+MMI-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_mmi.o
+MMI-OBJS-$(CONFIG_VP8_DECODER)            += mips/vp8dsp_mmi.o
+MMI-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_mmi.o
+MMI-OBJS-$(CONFIG_VC1_DECODER)            += mips/vc1dsp_mmi.o
+MMI-OBJS-$(CONFIG_WMV2DSP)                += mips/wmv2dsp_mmi.o
+MMI-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_mmi.o
+MMI-OBJS-$(CONFIG_VP3DSP)                 += mips/vp3dsp_idct_mmi.o
+MMI-OBJS-$(CONFIG_VP9_DECODER)            += mips/vp9_mc_mmi.o
diff --git a/libavcodec/mips/aaccoder_mips.c b/libavcodec/mips/aaccoder_mips.c
new file mode 100644
index 0000000..d690c8c
--- /dev/null
+++ b/libavcodec/mips/aaccoder_mips.c
@@ -0,0 +1,2502 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ *          Szabolcs Pal     (sabolc@mips.com)
+ *
+ * AAC coefficients encoder optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aaccoder.c
+ */
+
+#include "libavutil/libm.h"
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/put_bits.h"
+#include "libavcodec/aac.h"
+#include "libavcodec/aacenc.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/aacenctab.h"
+#include "libavcodec/aacenc_utils.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+typedef struct BandCodingPath {
+    int prev_idx;
+    float cost;
+    int run;
+} BandCodingPath;
+
+static const uint8_t uquad_sign_bits[81] = {
+    0, 1, 1, 1, 2, 2, 1, 2, 2,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    2, 3, 3, 3, 4, 4, 3, 4, 4
+};
+
+static const uint8_t upair7_sign_bits[64] = {
+    0, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const uint8_t upair12_sign_bits[169] = {
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+static const uint8_t esc_sign_bits[289] = {
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+/**
+ * Functions developed from template function and optimized for quantizing and encoding band
+ */
+static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
+                                                     PutBitContext *pb, const float *in, float *out,
+                                                     const float *scaled, int size, int scale_idx,
+                                                     int cb, const float lambda, const float uplim,
+                                                     int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        put_bits(pb, p_bits[curidx], p_codes[curidx]);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = vec[0] * IQ;
+            e2 = vec[1] * IQ;
+            e3 = vec[2] * IQ;
+            e4 = vec[3] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
+                                                     PutBitContext *pb, const float *in, float *out,
+                                                     const float *scaled, int size, int scale_idx,
+                                                     int cb, const float lambda, const float uplim,
+                                                     int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx, sign, count;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      2       \n\t"
+            "ori    %[sign],    $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign],    %[t0],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc2]  \n\t"
+            "slt    %[t4],      $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count],   $zero,      %[qc3]  \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t2]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count],   %[count],   %[t4]   \n\t"
+            "addu   %[count],   %[count],   %[t1]   \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc4]  \n\t"
+            "addu   %[count],   %[count],   %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign]"=&r"(sign), [count]"=&r"(count),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
+        v_bits  = p_bits[curidx] + count;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = copysignf(vec[0] * IQ, in[i+0]);
+            e2 = copysignf(vec[1] * IQ, in[i+1]);
+            e3 = copysignf(vec[2] * IQ, in[i+2]);
+            e4 = copysignf(vec[3] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
+                                                     PutBitContext *pb, const float *in, float *out,
+                                                     const float *scaled, int size, int scale_idx,
+                                                     int cb, const float lambda, const float uplim,
+                                                     int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  4       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2 = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
+        v_bits  = p_bits[curidx] + p_bits[curidx2];
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx*2 ];
+            vec2 = &p_vec[curidx2*2];
+            e1 = vec1[0] * IQ;
+            e2 = vec1[1] * IQ;
+            e3 = vec2[0] * IQ;
+            e4 = vec2[1] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
+                                                      PutBitContext *pb, const float *in, float *out,
+                                                      const float *scaled, int size, int scale_idx,
+                                                      int cb, const float lambda, const float uplim,
+                                                      int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx1, curidx2, sign1, count1, sign2, count2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      7       \n\t"
+            "ori    %[sign1],   $zero,      0       \n\t"
+            "ori    %[sign2],   $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "sll    %[t0],      %[sign1],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign2],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count1],  %[count1],  %[t1]   \n\t"
+            "addu   %[count2],  %[count2],  %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "t0", "t1", "t2", "t3", "t4",
+              "memory"
+        );
+
+        curidx1  = 8 * qc1;
+        curidx1 += qc2;
+
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
+        put_bits(pb, v_bits, v_codes);
+
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
+
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
+                                                       PutBitContext *pb, const float *in, float *out,
+                                                       const float *scaled, int size, int scale_idx,
+                                                       int cb, const float lambda, const float uplim,
+                                                       int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx1, curidx2, sign1, count1, sign2, count2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      12      \n\t"
+            "ori    %[sign1],   $zero,      0       \n\t"
+            "ori    %[sign2],   $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "sll    %[t0],      %[sign1],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign2],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count1],  %[count1],  %[t1]   \n\t"
+            "addu   %[count2],  %[count2],  %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx1  = 13 * qc1;
+        curidx1 += qc2;
+
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
+        put_bits(pb, v_bits, v_codes);
+
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
+
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
+                                                   PutBitContext *pb, const float *in, float *out,
+                                                   const float *scaled, int size, int scale_idx,
+                                                   int cb, const float lambda, const float uplim,
+                                                   int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+
+    if (cb < 11) {
+        for (i = 0; i < size; i += 4) {
+            int curidx, curidx2, sign1, count1, sign2, count2;
+            int *in_int = (int *)&in[i];
+            uint8_t v_bits;
+            unsigned int v_codes;
+            int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
+
+            qc1 = scaled[i  ] * Q34 + ROUNDING;
+            qc2 = scaled[i+1] * Q34 + ROUNDING;
+            qc3 = scaled[i+2] * Q34 + ROUNDING;
+            qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+            __asm__ volatile (
+                ".set push                                  \n\t"
+                ".set noreorder                             \n\t"
+
+                "ori        %[t4],      $zero,      16      \n\t"
+                "ori        %[sign1],   $zero,      0       \n\t"
+                "ori        %[sign2],   $zero,      0       \n\t"
+                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
+                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
+                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
+                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
+                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
+                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
+                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
+                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
+                "lw         %[t0],      0(%[in_int])        \n\t"
+                "lw         %[t1],      4(%[in_int])        \n\t"
+                "lw         %[t2],      8(%[in_int])        \n\t"
+                "lw         %[t3],      12(%[in_int])       \n\t"
+                "slt        %[t0],      %[t0],      $zero   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
+                "slt        %[t2],      %[t2],      $zero   \n\t"
+                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
+                "slt        %[t1],      %[t1],      $zero   \n\t"
+                "sll        %[t0],      %[sign1],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t1]   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
+                "slt        %[t3],      %[t3],      $zero   \n\t"
+                "sll        %[t0],      %[sign2],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t3]   \n\t"
+                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
+                "slt        %[t1],      $zero,      %[qc2]  \n\t"
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
+                "slt        %[t2],      $zero,      %[qc4]  \n\t"
+                "addu       %[count1],  %[count1],  %[t1]   \n\t"
+                "addu       %[count2],  %[count2],  %[t2]   \n\t"
+
+                ".set pop                                   \n\t"
+
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+                  [t4]"=&r"(t4)
+                : [in_int]"r"(in_int)
+                : "memory"
+            );
+
+            curidx = 17 * qc1;
+            curidx += qc2;
+            curidx2 = 17 * qc3;
+            curidx2 += qc4;
+
+            v_codes = (p_codes[curidx] << count1) | sign1;
+            v_bits  = p_bits[curidx] + count1;
+            put_bits(pb, v_bits, v_codes);
+
+            v_codes = (p_codes[curidx2] << count2) | sign2;
+            v_bits  = p_bits[curidx2] + count2;
+            put_bits(pb, v_bits, v_codes);
+
+            if (out || energy) {
+                float e1,e2,e3,e4;
+                vec1 = &p_vectors[curidx*2 ];
+                vec2 = &p_vectors[curidx2*2];
+                e1 = copysignf(vec1[0] * IQ, in[i+0]);
+                e2 = copysignf(vec1[1] * IQ, in[i+1]);
+                e3 = copysignf(vec2[0] * IQ, in[i+2]);
+                e4 = copysignf(vec2[1] * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+            }
+        }
+    } else {
+        for (i = 0; i < size; i += 4) {
+            int curidx, curidx2, sign1, count1, sign2, count2;
+            int *in_int = (int *)&in[i];
+            uint8_t v_bits;
+            unsigned int v_codes;
+            int c1, c2, c3, c4;
+            int t0, t1, t2, t3, t4;
+
+            qc1 = scaled[i  ] * Q34 + ROUNDING;
+            qc2 = scaled[i+1] * Q34 + ROUNDING;
+            qc3 = scaled[i+2] * Q34 + ROUNDING;
+            qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+            __asm__ volatile (
+                ".set push                                  \n\t"
+                ".set noreorder                             \n\t"
+
+                "ori        %[t4],      $zero,      16      \n\t"
+                "ori        %[sign1],   $zero,      0       \n\t"
+                "ori        %[sign2],   $zero,      0       \n\t"
+                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
+                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
+                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
+                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
+                "srl        %[c1],      %[c1],      18      \n\t"
+                "srl        %[c2],      %[c2],      18      \n\t"
+                "srl        %[c3],      %[c3],      18      \n\t"
+                "srl        %[c4],      %[c4],      18      \n\t"
+                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
+                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
+                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
+                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
+                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
+                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
+                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
+                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
+                "lw         %[t0],      0(%[in_int])        \n\t"
+                "lw         %[t1],      4(%[in_int])        \n\t"
+                "lw         %[t2],      8(%[in_int])        \n\t"
+                "lw         %[t3],      12(%[in_int])       \n\t"
+                "slt        %[t0],      %[t0],      $zero   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
+                "slt        %[t2],      %[t2],      $zero   \n\t"
+                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
+                "slt        %[t1],      %[t1],      $zero   \n\t"
+                "sll        %[t0],      %[sign1],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t1]   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
+                "slt        %[t3],      %[t3],      $zero   \n\t"
+                "sll        %[t0],      %[sign2],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t3]   \n\t"
+                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
+                "slt        %[t1],      $zero,      %[qc2]  \n\t"
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
+                "slt        %[t2],      $zero,      %[qc4]  \n\t"
+                "addu       %[count1],  %[count1],  %[t1]   \n\t"
+                "addu       %[count2],  %[count2],  %[t2]   \n\t"
+
+                ".set pop                                   \n\t"
+
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+                  [c1]"=&r"(c1), [c2]"=&r"(c2),
+                  [c3]"=&r"(c3), [c4]"=&r"(c4),
+                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+                  [t4]"=&r"(t4)
+                : [in_int]"r"(in_int)
+                : "memory"
+            );
+
+            curidx = 17 * qc1;
+            curidx += qc2;
+
+            curidx2 = 17 * qc3;
+            curidx2 += qc4;
+
+            v_codes = (p_codes[curidx] << count1) | sign1;
+            v_bits  = p_bits[curidx] + count1;
+            put_bits(pb, v_bits, v_codes);
+
+            if (p_vectors[curidx*2  ] == 64.0f) {
+                int len = av_log2(c1);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
+                put_bits(pb, len * 2 - 3, v_codes);
+            }
+            if (p_vectors[curidx*2+1] == 64.0f) {
+                int len = av_log2(c2);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
+                put_bits(pb, len*2-3, v_codes);
+            }
+
+            v_codes = (p_codes[curidx2] << count2) | sign2;
+            v_bits  = p_bits[curidx2] + count2;
+            put_bits(pb, v_bits, v_codes);
+
+            if (p_vectors[curidx2*2  ] == 64.0f) {
+                int len = av_log2(c3);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
+                put_bits(pb, len* 2 - 3, v_codes);
+            }
+            if (p_vectors[curidx2*2+1] == 64.0f) {
+                int len = av_log2(c4);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
+                put_bits(pb, len * 2 - 3, v_codes);
+            }
+
+            if (out || energy) {
+                float e1, e2, e3, e4;
+                e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+                e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+                e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+                e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+            }
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, float *energy, const float ROUNDING) {
+    av_assert0(0);
+}
+
+static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, float *energy, const float ROUNDING) {
+    int i;
+    if (bits)
+        *bits = 0;
+    if (out) {
+        for (i = 0; i < size; i += 4) {
+           out[i  ] = 0.0f;
+           out[i+1] = 0.0f;
+           out[i+2] = 0.0f;
+           out[i+3] = 0.0f;
+        }
+    }
+    if (energy)
+        *energy = 0.0f;
+}
+
+static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, float *energy, const float ROUNDING) = {
+    quantize_and_encode_band_cost_ZERO_mips,
+    quantize_and_encode_band_cost_SQUAD_mips,
+    quantize_and_encode_band_cost_SQUAD_mips,
+    quantize_and_encode_band_cost_UQUAD_mips,
+    quantize_and_encode_band_cost_UQUAD_mips,
+    quantize_and_encode_band_cost_SPAIR_mips,
+    quantize_and_encode_band_cost_SPAIR_mips,
+    quantize_and_encode_band_cost_UPAIR7_mips,
+    quantize_and_encode_band_cost_UPAIR7_mips,
+    quantize_and_encode_band_cost_UPAIR12_mips,
+    quantize_and_encode_band_cost_UPAIR12_mips,
+    quantize_and_encode_band_cost_ESC_mips,
+    quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
+    quantize_and_encode_band_cost_ZERO_mips,
+    quantize_and_encode_band_cost_ZERO_mips,
+    quantize_and_encode_band_cost_ZERO_mips,
+};
+
+#define quantize_and_encode_band_cost(                                       \
+                                s, pb, in, out, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy, ROUNDING)       \
+    quantize_and_encode_band_cost_arr[cb](                                   \
+                                s, pb, in, out, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy, ROUNDING)
+
+static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
+                                          const float *in, float *out, int size, int scale_idx,
+                                          int cb, const float lambda, int rtz)
+{
+    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
+                                  INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
+}
+
+/**
+ * Functions developed from template function and optimized for getting the number of bits
+ */
+static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    return 0;
+}
+
+static float get_band_numbits_NONE_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    av_assert0(0);
+    return 0;
+}
+
+static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        curbits += p_bits[curidx];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int curbits = 0;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  2       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += uquad_sign_bits[curidx];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  4       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx  = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2  = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        curbits += p_bits[curidx] + p_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
+                                          PutBitContext *pb, const float *in,
+                                          const float *scaled, int size, int scale_idx,
+                                          int cb, const float lambda, const float uplim,
+                                          int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  7       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx  = 8 * qc1;
+        curidx += qc2;
+
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx] +
+                   upair7_sign_bits[curidx] +
+                   p_bits[curidx2] +
+                   upair7_sign_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
+                                           PutBitContext *pb, const float *in,
+                                           const float *scaled, int size, int scale_idx,
+                                           int cb, const float lambda, const float uplim,
+                                           int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  12      \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx  = 13 * qc1;
+        curidx += qc2;
+
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx] +
+                   p_bits[curidx2] +
+                   upair12_sign_bits[curidx] +
+                   upair12_sign_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_ESC_mips(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size, int scale_idx,
+                                       int cb, const float lambda, const float uplim,
+                                       int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int cond0, cond1, cond2, cond3;
+        int c1, c2, c3, c4;
+        int t4, t5;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t4],      $zero,  15          \n\t"
+            "ori        %[t5],      $zero,  16          \n\t"
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
+            "srl        %[c1],      %[c1],  18          \n\t"
+            "srl        %[c2],      %[c2],  18          \n\t"
+            "srl        %[c3],      %[c3],  18          \n\t"
+            "srl        %[c4],      %[c4],  18          \n\t"
+            "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
+            "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
+            "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
+            "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
+            "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
+            "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
+            "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
+            "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
+            "ori        %[t5],      $zero,  31          \n\t"
+            "clz        %[c1],      %[c1]               \n\t"
+            "clz        %[c2],      %[c2]               \n\t"
+            "clz        %[c3],      %[c3]               \n\t"
+            "clz        %[c4],      %[c4]               \n\t"
+            "subu       %[c1],      %[t5],  %[c1]       \n\t"
+            "subu       %[c2],      %[t5],  %[c2]       \n\t"
+            "subu       %[c3],      %[t5],  %[c3]       \n\t"
+            "subu       %[c4],      %[t5],  %[c4]       \n\t"
+            "sll        %[c1],      %[c1],  1           \n\t"
+            "sll        %[c2],      %[c2],  1           \n\t"
+            "sll        %[c3],      %[c3],  1           \n\t"
+            "sll        %[c4],      %[c4],  1           \n\t"
+            "addiu      %[c1],      %[c1],  -3          \n\t"
+            "addiu      %[c2],      %[c2],  -3          \n\t"
+            "addiu      %[c3],      %[c3],  -3          \n\t"
+            "addiu      %[c4],      %[c4],  -3          \n\t"
+            "subu       %[cond0],   $zero,  %[cond0]    \n\t"
+            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
+            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
+            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
+            "and        %[c1],      %[c1],  %[cond0]    \n\t"
+            "and        %[c2],      %[c2],  %[cond1]    \n\t"
+            "and        %[c3],      %[c3],  %[cond2]    \n\t"
+            "and        %[c4],      %[c4],  %[cond3]    \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
+              [c3]"=&r"(c3), [c4]"=&r"(c4),
+              [t4]"=&r"(t4), [t5]"=&r"(t5)
+        );
+
+        curidx = 17 * qc1;
+        curidx += qc2;
+
+        curidx2 = 17 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += esc_sign_bits[curidx];
+        curbits += p_bits[curidx2];
+        curbits += esc_sign_bits[curidx2];
+
+        curbits += c1;
+        curbits += c2;
+        curbits += c3;
+        curbits += c4;
+    }
+    return curbits;
+}
+
+static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
+                                             PutBitContext *pb, const float *in,
+                                             const float *scaled, int size, int scale_idx,
+                                             int cb, const float lambda, const float uplim,
+                                             int *bits) = {
+    get_band_numbits_ZERO_mips,
+    get_band_numbits_SQUAD_mips,
+    get_band_numbits_SQUAD_mips,
+    get_band_numbits_UQUAD_mips,
+    get_band_numbits_UQUAD_mips,
+    get_band_numbits_SPAIR_mips,
+    get_band_numbits_SPAIR_mips,
+    get_band_numbits_UPAIR7_mips,
+    get_band_numbits_UPAIR7_mips,
+    get_band_numbits_UPAIR12_mips,
+    get_band_numbits_UPAIR12_mips,
+    get_band_numbits_ESC_mips,
+    get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
+    get_band_numbits_ZERO_mips,
+    get_band_numbits_ZERO_mips,
+    get_band_numbits_ZERO_mips,
+};
+
+#define get_band_numbits(                                  \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)                    \
+    get_band_numbits_arr[cb](                              \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)
+
+static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, float *energy, int rtz)
+{
+    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+}
+
+/**
+ * Functions developed from template function and optimized for getting the band cost
+ */
+#if HAVE_MIPSFPU
+static float get_band_cost_ZERO_mips(struct AACEncContext *s,
+                                     PutBitContext *pb, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, float *energy)
+{
+    int i;
+    float cost = 0;
+
+    for (i = 0; i < size; i += 4) {
+        cost += in[i  ] * in[i  ];
+        cost += in[i+1] * in[i+1];
+        cost += in[i+2] * in[i+2];
+        cost += in[i+3] * in[i+3];
+    }
+    if (bits)
+        *bits = 0;
+    if (energy)
+        *energy = 0.0f;
+    return cost * lambda;
+}
+
+static float get_band_cost_NONE_mips(struct AACEncContext *s,
+                                     PutBitContext *pb, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, float *energy)
+{
+    av_assert0(0);
+    return 0;
+}
+
+static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec;
+        int curidx;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "slt        %[qc1], $zero,  %[qc1]          \n\t"
+            "slt        %[qc2], $zero,  %[qc2]          \n\t"
+            "slt        %[qc3], $zero,  %[qc3]          \n\t"
+            "slt        %[qc4], $zero,  %[qc4]          \n\t"
+            "lw         %[t0],  0(%[in_int])            \n\t"
+            "lw         %[t1],  4(%[in_int])            \n\t"
+            "lw         %[t2],  8(%[in_int])            \n\t"
+            "lw         %[t3],  12(%[in_int])           \n\t"
+            "srl        %[t0],  %[t0],  31              \n\t"
+            "srl        %[t1],  %[t1],  31              \n\t"
+            "srl        %[t2],  %[t2],  31              \n\t"
+            "srl        %[t3],  %[t3],  31              \n\t"
+            "subu       %[t4],  $zero,  %[qc1]          \n\t"
+            "subu       %[t5],  $zero,  %[qc2]          \n\t"
+            "subu       %[t6],  $zero,  %[qc3]          \n\t"
+            "subu       %[t7],  $zero,  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t5],  %[t1]           \n\t"
+            "movn       %[qc3], %[t6],  %[t2]           \n\t"
+            "movn       %[qc4], %[t7],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        curbits += p_bits[curidx];
+        vec     = &p_codes[curidx*4];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
+            "lwc1       $f1,    0(%[vec])               \n\t"
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
+            "lwc1       $f3,    4(%[vec])               \n\t"
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
+            "lwc1       $f5,    8(%[vec])               \n\t"
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
+            "lwc1       $f7,    12(%[vec])              \n\t"
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "$f4", "$f5", "$f6", "$f7",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int curbits = 0;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec;
+        int curidx;
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t4],  $zero,  2               \n\t"
+            "slt        %[t0],  %[t4],  %[qc1]          \n\t"
+            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
+            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
+            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t4],  %[t1]           \n\t"
+            "movn       %[qc3], %[t4],  %[t2]           \n\t"
+            "movn       %[qc4], %[t4],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += uquad_sign_bits[curidx];
+        vec     = &p_codes[curidx*4];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "lwc1       %[di0], 0(%[in_pos])            \n\t"
+            "lwc1       %[di1], 4(%[in_pos])            \n\t"
+            "lwc1       %[di2], 8(%[in_pos])            \n\t"
+            "lwc1       %[di3], 12(%[in_pos])           \n\t"
+            "abs.s      %[di0], %[di0]                  \n\t"
+            "abs.s      %[di1], %[di1]                  \n\t"
+            "abs.s      %[di2], %[di2]                  \n\t"
+            "abs.s      %[di3], %[di3]                  \n\t"
+            "lwc1       $f0,    0(%[vec])               \n\t"
+            "lwc1       $f1,    4(%[vec])               \n\t"
+            "lwc1       $f2,    8(%[vec])               \n\t"
+            "lwc1       $f3,    12(%[vec])              \n\t"
+            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t4],  $zero,  4               \n\t"
+            "slt        %[t0],  %[t4],  %[qc1]          \n\t"
+            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
+            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
+            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t4],  %[t1]           \n\t"
+            "movn       %[qc3], %[t4],  %[t2]           \n\t"
+            "movn       %[qc4], %[t4],  %[t3]           \n\t"
+            "lw         %[t0],  0(%[in_int])            \n\t"
+            "lw         %[t1],  4(%[in_int])            \n\t"
+            "lw         %[t2],  8(%[in_int])            \n\t"
+            "lw         %[t3],  12(%[in_int])           \n\t"
+            "srl        %[t0],  %[t0],  31              \n\t"
+            "srl        %[t1],  %[t1],  31              \n\t"
+            "srl        %[t2],  %[t2],  31              \n\t"
+            "srl        %[t3],  %[t3],  31              \n\t"
+            "subu       %[t4],  $zero,  %[qc1]          \n\t"
+            "subu       %[t5],  $zero,  %[qc2]          \n\t"
+            "subu       %[t6],  $zero,  %[qc3]          \n\t"
+            "subu       %[t7],  $zero,  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t5],  %[t1]           \n\t"
+            "movn       %[qc3], %[t6],  %[t2]           \n\t"
+            "movn       %[qc4], %[t7],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2 = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        curbits += p_bits[curidx];
+        curbits += p_bits[curidx2];
+
+        vec     = &p_codes[curidx*2];
+        vec2    = &p_codes[curidx2*2];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
+            "lwc1       $f1,    0(%[vec])               \n\t"
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
+            "lwc1       $f3,    4(%[vec])               \n\t"
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
+            "lwc1       $f5,    0(%[vec2])              \n\t"
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
+            "lwc1       $f7,    4(%[vec2])              \n\t"
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "$f4", "$f5", "$f6", "$f7",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size, int scale_idx,
+                                       int cb, const float lambda, const float uplim,
+                                       int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2, sign1, count1, sign2, count2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "ori        %[t4],      $zero,      7               \n\t"
+            "ori        %[sign1],   $zero,      0               \n\t"
+            "ori        %[sign2],   $zero,      0               \n\t"
+            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
+            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
+            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
+            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
+            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
+            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
+            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
+            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
+            "lw         %[t0],      0(%[in_int])                \n\t"
+            "lw         %[t1],      4(%[in_int])                \n\t"
+            "lw         %[t2],      8(%[in_int])                \n\t"
+            "lw         %[t3],      12(%[in_int])               \n\t"
+            "slt        %[t0],      %[t0],      $zero           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
+            "slt        %[t2],      %[t2],      $zero           \n\t"
+            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
+            "slt        %[t1],      %[t1],      $zero           \n\t"
+            "sll        %[t0],      %[sign1],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t1]           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
+            "slt        %[t3],      %[t3],      $zero           \n\t"
+            "sll        %[t0],      %[sign2],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t3]           \n\t"
+            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
+            "slt        %[t1],      $zero,      %[qc2]          \n\t"
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
+            "slt        %[t2],      $zero,      %[qc4]          \n\t"
+            "addu       %[count1],  %[count1],  %[t1]           \n\t"
+            "addu       %[count2],  %[count2],  %[t2]           \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 8 * qc1;
+        curidx += qc2;
+
+        curidx2 = 8 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += upair7_sign_bits[curidx];
+        vec     = &p_codes[curidx*2];
+
+        curbits += p_bits[curidx2];
+        curbits += upair7_sign_bits[curidx2];
+        vec2    = &p_codes[curidx2*2];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
+            "abs.s      %[di0],     %[di0]                      \n\t"
+            "abs.s      %[di1],     %[di1]                      \n\t"
+            "abs.s      %[di2],     %[di2]                      \n\t"
+            "abs.s      %[di3],     %[di3]                      \n\t"
+            "lwc1       $f0,        0(%[vec])                   \n\t"
+            "lwc1       $f1,        4(%[vec])                   \n\t"
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        int sign1, count1, sign2, count2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "ori        %[t4],      $zero,      12              \n\t"
+            "ori        %[sign1],   $zero,      0               \n\t"
+            "ori        %[sign2],   $zero,      0               \n\t"
+            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
+            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
+            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
+            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
+            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
+            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
+            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
+            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
+            "lw         %[t0],      0(%[in_int])                \n\t"
+            "lw         %[t1],      4(%[in_int])                \n\t"
+            "lw         %[t2],      8(%[in_int])                \n\t"
+            "lw         %[t3],      12(%[in_int])               \n\t"
+            "slt        %[t0],      %[t0],      $zero           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
+            "slt        %[t2],      %[t2],      $zero           \n\t"
+            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
+            "slt        %[t1],      %[t1],      $zero           \n\t"
+            "sll        %[t0],      %[sign1],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t1]           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
+            "slt        %[t3],      %[t3],      $zero           \n\t"
+            "sll        %[t0],      %[sign2],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t3]           \n\t"
+            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
+            "slt        %[t1],      $zero,      %[qc2]          \n\t"
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
+            "slt        %[t2],      $zero,      %[qc4]          \n\t"
+            "addu       %[count1],  %[count1],  %[t1]           \n\t"
+            "addu       %[count2],  %[count2],  %[t2]           \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 13 * qc1;
+        curidx += qc2;
+
+        curidx2 = 13 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += p_bits[curidx2];
+        curbits += upair12_sign_bits[curidx];
+        curbits += upair12_sign_bits[curidx2];
+        vec     = &p_codes[curidx*2];
+        vec2    = &p_codes[curidx2*2];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
+            "abs.s      %[di0],     %[di0]                      \n\t"
+            "abs.s      %[di1],     %[di1]                      \n\t"
+            "abs.s      %[di2],     %[di2]                      \n\t"
+            "abs.s      %[di3],     %[di3]                      \n\t"
+            "lwc1       $f0,        0(%[vec])                   \n\t"
+            "lwc1       $f1,        4(%[vec])                   \n\t"
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_ESC_mips(struct AACEncContext *s,
+                                    PutBitContext *pb, const float *in,
+                                    const float *scaled, int size, int scale_idx,
+                                    int cb, const float lambda, const float uplim,
+                                    int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    const float CLIPPED_ESCAPE = 165140.0f * IQ;
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        float t1, t2, t3, t4, V;
+        float di1, di2, di3, di4;
+        int cond0, cond1, cond2, cond3;
+        int c1, c2, c3, c4;
+        int t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t6],      $zero,  15          \n\t"
+            "ori        %[t7],      $zero,  16          \n\t"
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
+            "srl        %[c1],      %[c1],  18          \n\t"
+            "srl        %[c2],      %[c2],  18          \n\t"
+            "srl        %[c3],      %[c3],  18          \n\t"
+            "srl        %[c4],      %[c4],  18          \n\t"
+            "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
+            "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
+            "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
+            "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
+            "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
+            "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
+            "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
+            "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
+              [c3]"=&r"(c3), [c4]"=&r"(c4),
+              [t6]"=&r"(t6), [t7]"=&r"(t7)
+        );
+
+        curidx = 17 * qc1;
+        curidx += qc2;
+
+        curidx2 = 17 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += esc_sign_bits[curidx];
+        vec     = &p_codes[curidx*2];
+
+        curbits += p_bits[curidx2];
+        curbits += esc_sign_bits[curidx2];
+        vec2     = &p_codes[curidx2*2];
+
+        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
+        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
+        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
+        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
+
+        t1 = fabsf(in[i  ]);
+        t2 = fabsf(in[i+1]);
+        t3 = fabsf(in[i+2]);
+        t4 = fabsf(in[i+3]);
+
+        if (cond0) {
+            if (t1 >= CLIPPED_ESCAPE) {
+                di1 = t1 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di1 = t1 - (V = vec[0] * IQ);
+            qenergy += V*V;
+        }
+
+        if (cond1) {
+            if (t2 >= CLIPPED_ESCAPE) {
+                di2 = t2 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di2 = t2 - (V = vec[1] * IQ);
+            qenergy += V*V;
+        }
+
+        if (cond2) {
+            if (t3 >= CLIPPED_ESCAPE) {
+                di3 = t3 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di3 = t3 - (V = vec2[0] * IQ);
+            qenergy += V*V;
+        }
+
+        if (cond3) {
+            if (t4 >= CLIPPED_ESCAPE) {
+                di4 = t4 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di4 = t4 - (V = vec2[1]*IQ);
+            qenergy += V*V;
+        }
+
+        cost += di1 * di1 + di2 * di2
+                + di3 * di3 + di4 * di4;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float (*const get_band_cost_arr[])(struct AACEncContext *s,
+                                          PutBitContext *pb, const float *in,
+                                          const float *scaled, int size, int scale_idx,
+                                          int cb, const float lambda, const float uplim,
+                                          int *bits, float *energy) = {
+    get_band_cost_ZERO_mips,
+    get_band_cost_SQUAD_mips,
+    get_band_cost_SQUAD_mips,
+    get_band_cost_UQUAD_mips,
+    get_band_cost_UQUAD_mips,
+    get_band_cost_SPAIR_mips,
+    get_band_cost_SPAIR_mips,
+    get_band_cost_UPAIR7_mips,
+    get_band_cost_UPAIR7_mips,
+    get_band_cost_UPAIR12_mips,
+    get_band_cost_UPAIR12_mips,
+    get_band_cost_ESC_mips,
+    get_band_cost_NONE_mips, /* cb 12 doesn't exist */
+    get_band_cost_ZERO_mips,
+    get_band_cost_ZERO_mips,
+    get_band_cost_ZERO_mips,
+};
+
+#define get_band_cost(                                  \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy)            \
+    get_band_cost_arr[cb](                              \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy)
+
+static float quantize_band_cost(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{
+    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
+}
+
+#include "libavcodec/aacenc_quantization_misc.h"
+
+#include "libavcodec/aaccoder_twoloop.h"
+
+static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
+{
+    int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
+    uint8_t nextband0[128], nextband1[128];
+    float M[128], S[128];
+    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    if (!cpe->common_window)
+        return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce0, nextband0);
+    ff_init_nextband_map(sce1, nextband1);
+
+    prev_mid = sce0->sf_idx[0];
+    prev_side = sce1->sf_idx[0];
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            if (!cpe->is_mask[w*16+g])
+                cpe->ms_mask[w*16+g] = 0;
+            if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                        S[i] =  M[i]
+                              - sce1->coeffs[start+(w+w2)*128+i];
+                    }
+                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
+
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
+                        && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
+                            || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+
+                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[w*16+g],
+                                                    sce0->band_type[w*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[w*16+g],
+                                                    sce1->band_type[w*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    mididx,
+                                                    midcb,
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sididx,
+                                                    sidcb,
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= b1+b2;
+                        dist2 -= b3+b4;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
+                    if (cpe->ms_mask[w*16+g]) {
+                        if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
+                            /* ms_mask unneeded, and it confuses some decoders */
+                            cpe->ms_mask[w*16+g] = 0;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
+                    }
+                }
+            }
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                prev_mid = sce0->sf_idx[w*16+g];
+            if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                prev_side = sce1->sf_idx[w*16+g];
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+}
+#endif /*HAVE_MIPSFPU */
+
+#include "libavcodec/aaccoder_trellis.h"
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aac_coder_init_mips(AACEncContext *c) {
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    AACCoefficientsEncoder *e = c->coder;
+    int option = c->options.coder;
+
+    if (option == 2) {
+        e->quantize_and_encode_band = quantize_and_encode_band_mips;
+        e->encode_window_bands_info = codebook_trellis_rate;
+#if HAVE_MIPSFPU
+        e->search_for_quantizers    = search_for_quantizers_twoloop;
+#endif /* HAVE_MIPSFPU */
+    }
+#if HAVE_MIPSFPU
+    e->search_for_ms            = search_for_ms_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.c b/libavcodec/mips/aacdec_mips.c
new file mode 100644
index 0000000..253cdeb
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#include "libavcodec/aac.h"
+#include "aacdec_mips.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/sinewin.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static av_always_inline void float_copy(float *dst, const float *src, int count)
+{
+    // Copy 'count' floats from src to dst
+    const float *loop_end = src + count;
+    int temp[8];
+
+    // count must be a multiple of 8
+    av_assert2(count % 8 == 0);
+
+    // loop unrolled 8 times
+    __asm__ volatile (
+        ".set push                               \n\t"
+        ".set noreorder                          \n\t"
+    "1:                                          \n\t"
+        "lw      %[temp0],    0(%[src])          \n\t"
+        "lw      %[temp1],    4(%[src])          \n\t"
+        "lw      %[temp2],    8(%[src])          \n\t"
+        "lw      %[temp3],    12(%[src])         \n\t"
+        "lw      %[temp4],    16(%[src])         \n\t"
+        "lw      %[temp5],    20(%[src])         \n\t"
+        "lw      %[temp6],    24(%[src])         \n\t"
+        "lw      %[temp7],    28(%[src])         \n\t"
+        PTR_ADDIU "%[src],    %[src],      32    \n\t"
+        "sw      %[temp0],    0(%[dst])          \n\t"
+        "sw      %[temp1],    4(%[dst])          \n\t"
+        "sw      %[temp2],    8(%[dst])          \n\t"
+        "sw      %[temp3],    12(%[dst])         \n\t"
+        "sw      %[temp4],    16(%[dst])         \n\t"
+        "sw      %[temp5],    20(%[dst])         \n\t"
+        "sw      %[temp6],    24(%[dst])         \n\t"
+        "sw      %[temp7],    28(%[dst])         \n\t"
+        "bne     %[src],      %[loop_end], 1b    \n\t"
+        PTR_ADDIU "%[dst],    %[dst],      32    \n\t"
+        ".set pop                                \n\t"
+
+        : [temp0]"=&r"(temp[0]), [temp1]"=&r"(temp[1]),
+          [temp2]"=&r"(temp[2]), [temp3]"=&r"(temp[3]),
+          [temp4]"=&r"(temp[4]), [temp5]"=&r"(temp[5]),
+          [temp6]"=&r"(temp[6]), [temp7]"=&r"(temp[7]),
+          [src]"+r"(src), [dst]"+r"(dst)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+}
+
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+static void imdct_and_windowing_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *in    = sce->coeffs;
+    float *out   = sce->ret;
+    float *saved = sce->saved;
+    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float *buf  = ac->buf_mdct;
+    int i;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        float_copy(out, saved, 448);
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            {
+                float wi;
+                float wj;
+                int i;
+                float temp0, temp1, temp2, temp3;
+                float *dst0 = out + 448 + 0*128;
+                float *dst1 = dst0 + 64 + 63;
+                float *dst2 = saved + 63;
+                float *win0 = (float*)swindow;
+                float *win1 = win0 + 64 + 63;
+                float *win0_prev = (float*)swindow_prev;
+                float *win1_prev = win0_prev + 64 + 63;
+                float *src0_prev = saved + 448;
+                float *src1_prev = buf + 0*128 + 63;
+                float *src0 = buf + 0*128 + 64;
+                float *src1 = buf + 1*128 + 63;
+
+                for(i = 0; i < 64; i++)
+                {
+                    temp0 = src0_prev[0];
+                    temp1 = src1_prev[0];
+                    wi = *win0_prev;
+                    wj = *win1_prev;
+                    temp2 = src0[0];
+                    temp3 = src1[0];
+                    dst0[0] = temp0 * wj - temp1 * wi;
+                    dst1[0] = temp0 * wi + temp1 * wj;
+
+                    wi = *win0;
+                    wj = *win1;
+
+                    temp0 = src0[128];
+                    temp1 = src1[128];
+                    dst0[128] = temp2 * wj - temp3 * wi;
+                    dst1[128] = temp2 * wi + temp3 * wj;
+
+                    temp2 = src0[256];
+                    temp3 = src1[256];
+                    dst0[256] = temp0 * wj - temp1 * wi;
+                    dst1[256] = temp0 * wi + temp1 * wj;
+                    dst0[384] = temp2 * wj - temp3 * wi;
+                    dst1[384] = temp2 * wi + temp3 * wj;
+
+                    temp0 = src0[384];
+                    temp1 = src1[384];
+                    dst0[512] = temp0 * wj - temp1 * wi;
+                    dst2[0] = temp0 * wi + temp1 * wj;
+
+                    src0++;
+                    src1--;
+                    src0_prev++;
+                    src1_prev--;
+                    win0++;
+                    win1--;
+                    win0_prev++;
+                    win1_prev--;
+                    dst0++;
+                    dst1--;
+                    dst2--;
+                }
+            }
+        } else {
+            ac->fdsp->vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            float_copy(out + 576, buf + 64, 448);
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        ac->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        float_copy(saved + 448, buf + 7*128 + 64, 64);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float_copy(saved, buf + 512, 448);
+        float_copy(saved + 448, buf + 7*128 + 64, 64);
+    } else { // LONG_STOP or ONLY_LONG
+        float_copy(saved, buf + 512, 512);
+    }
+}
+
+static void apply_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, sfb;
+    int j, k;
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        float *predTime = sce->ret;
+        float *predFreq = ac->buf_mdct;
+        float *p_predTime;
+        int16_t num_samples = 2048;
+
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024;
+            j = (2048 - num_samples) >> 2;
+            k = (2048 - num_samples) & 3;
+            p_predTime = &predTime[num_samples];
+
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
+        for (i = 0; i < j; i++) {
+
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                "sw      $0,              4(%[p_predTime])        \n\t"
+                "sw      $0,              8(%[p_predTime])        \n\t"
+                "sw      $0,              12(%[p_predTime])       \n\t"
+                PTR_ADDIU "%[p_predTime], %[p_predTime],     16   \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+        for (i = 0; i < k; i++) {
+
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                PTR_ADDIU "%[p_predTime], %[p_predTime],     4    \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += predFreq[i];
+    }
+}
+
+#if HAVE_MIPSFPU
+static av_always_inline void fmul_and_reverse(float *dst, const float *src0, const float *src1, int count)
+{
+    /* Multiply 'count' floats in src0 by src1 and store the results in dst in reverse */
+    /* This should be equivalent to a normal fmul, followed by reversing dst */
+
+    // count must be a multiple of 4
+    av_assert2(count % 4 == 0);
+
+    // move src0 and src1 to the last element of their arrays
+    src0 += count - 1;
+    src1 += count - 1;
+
+    for (; count > 0; count -= 4){
+        float temp[12];
+
+        /* loop unrolled 4 times */
+        __asm__ volatile (
+            "lwc1    %[temp0],    0(%[ptr2])                \n\t"
+            "lwc1    %[temp1],    -4(%[ptr2])               \n\t"
+            "lwc1    %[temp2],    -8(%[ptr2])               \n\t"
+            "lwc1    %[temp3],    -12(%[ptr2])              \n\t"
+            "lwc1    %[temp4],    0(%[ptr3])                \n\t"
+            "lwc1    %[temp5],    -4(%[ptr3])               \n\t"
+            "lwc1    %[temp6],    -8(%[ptr3])               \n\t"
+            "lwc1    %[temp7],    -12(%[ptr3])              \n\t"
+            "mul.s   %[temp8],    %[temp0],     %[temp4]    \n\t"
+            "mul.s   %[temp9],    %[temp1],     %[temp5]    \n\t"
+            "mul.s   %[temp10],   %[temp2],     %[temp6]    \n\t"
+            "mul.s   %[temp11],   %[temp3],     %[temp7]    \n\t"
+            "swc1    %[temp8],    0(%[ptr1])                \n\t"
+            "swc1    %[temp9],    4(%[ptr1])                \n\t"
+            "swc1    %[temp10],   8(%[ptr1])                \n\t"
+            "swc1    %[temp11],   12(%[ptr1])               \n\t"
+            PTR_ADDIU "%[ptr1],   %[ptr1],      16          \n\t"
+            PTR_ADDIU "%[ptr2],   %[ptr2],      -16         \n\t"
+            PTR_ADDIU "%[ptr3],   %[ptr3],      -16         \n\t"
+
+            : [temp0]"=&f"(temp[0]), [temp1]"=&f"(temp[1]),
+              [temp2]"=&f"(temp[2]), [temp3]"=&f"(temp[3]),
+              [temp4]"=&f"(temp[4]), [temp5]"=&f"(temp[5]),
+              [temp6]"=&f"(temp[6]), [temp7]"=&f"(temp[7]),
+              [temp8]"=&f"(temp[8]), [temp9]"=&f"(temp[9]),
+              [temp10]"=&f"(temp[10]), [temp11]"=&f"(temp[11]),
+              [ptr1]"+r"(dst), [ptr2]"+r"(src0), [ptr3]"+r"(src1)
+            :
+            : "memory"
+        );
+    }
+}
+
+static void update_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *saved     = sce->saved;
+    float *saved_ltp = sce->coeffs;
+    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        float *p_saved_ltp = saved_ltp + 576;
+        float *loop_end1 = p_saved_ltp + 448;
+
+        float_copy(saved_ltp, saved, 512);
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+        "1:                                                   \n\t"
+            "sw     $0,              0(%[p_saved_ltp])        \n\t"
+            "sw     $0,              4(%[p_saved_ltp])        \n\t"
+            "sw     $0,              8(%[p_saved_ltp])        \n\t"
+            "sw     $0,              12(%[p_saved_ltp])       \n\t"
+            "sw     $0,              16(%[p_saved_ltp])       \n\t"
+            "sw     $0,              20(%[p_saved_ltp])       \n\t"
+            "sw     $0,              24(%[p_saved_ltp])       \n\t"
+            "sw     $0,              28(%[p_saved_ltp])       \n\t"
+            PTR_ADDIU "%[p_saved_ltp],%[p_saved_ltp],    32   \n\t"
+            "bne    %[p_saved_ltp],  %[loop_end1],       1b   \n\t"
+
+            : [p_saved_ltp]"+r"(p_saved_ltp)
+            : [loop_end1]"r"(loop_end1)
+            : "memory"
+        );
+
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float *buff0 = saved;
+        float *buff1 = saved_ltp;
+        float *loop_end = saved + 448;
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+        "1:                                             \n\t"
+            "lw      %[temp0],    0(%[src])             \n\t"
+            "lw      %[temp1],    4(%[src])             \n\t"
+            "lw      %[temp2],    8(%[src])             \n\t"
+            "lw      %[temp3],    12(%[src])            \n\t"
+            "lw      %[temp4],    16(%[src])            \n\t"
+            "lw      %[temp5],    20(%[src])            \n\t"
+            "lw      %[temp6],    24(%[src])            \n\t"
+            "lw      %[temp7],    28(%[src])            \n\t"
+            PTR_ADDIU "%[src],    %[src],         32    \n\t"
+            "sw      %[temp0],    0(%[dst])             \n\t"
+            "sw      %[temp1],    4(%[dst])             \n\t"
+            "sw      %[temp2],    8(%[dst])             \n\t"
+            "sw      %[temp3],    12(%[dst])            \n\t"
+            "sw      %[temp4],    16(%[dst])            \n\t"
+            "sw      %[temp5],    20(%[dst])            \n\t"
+            "sw      %[temp6],    24(%[dst])            \n\t"
+            "sw      %[temp7],    28(%[dst])            \n\t"
+            "sw      $0,          2304(%[dst])          \n\t"
+            "sw      $0,          2308(%[dst])          \n\t"
+            "sw      $0,          2312(%[dst])          \n\t"
+            "sw      $0,          2316(%[dst])          \n\t"
+            "sw      $0,          2320(%[dst])          \n\t"
+            "sw      $0,          2324(%[dst])          \n\t"
+            "sw      $0,          2328(%[dst])          \n\t"
+            "sw      $0,          2332(%[dst])          \n\t"
+            "bne     %[src],      %[loop_end],    1b    \n\t"
+            PTR_ADDIU "%[dst],    %[dst],         32    \n\t"
+            ".set pop                                   \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [src]"+r"(buff0), [dst]"+r"(buff1)
+            : [loop_end]"r"(loop_end)
+            : "memory"
+        );
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64);
+    } else { // LONG_STOP or ONLY_LONG
+        ac->fdsp->vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 512, lwindow, 512);
+    }
+
+    float_copy(sce->ltp_state, sce->ltp_state + 1024, 1024);
+    float_copy(sce->ltp_state + 1024, sce->ret, 1024);
+    float_copy(sce->ltp_state + 2048, saved_ltp, 1024);
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aacdec_init_mips(AACContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->imdct_and_windowing         = imdct_and_windowing_mips;
+    c->apply_ltp                   = apply_ltp_mips;
+#if HAVE_MIPSFPU
+    c->update_ltp                  = update_ltp_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.h b/libavcodec/mips/aacdec_mips.h
new file mode 100644
index 0000000..758266f
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC Spectral Band Replication decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#ifndef AVCODEC_MIPS_AACDEC_MIPS_H
+#define AVCODEC_MIPS_AACDEC_MIPS_H
+
+#include "libavcodec/aac.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static inline float *VMUL2_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    float temp0, temp1, temp2;
+    int temp3, temp4;
+    float *ret;
+
+    __asm__ volatile(
+        "andi    %[temp3],  %[idx],       0x0F         \n\t"
+        "andi    %[temp4],  %[idx],       0xF0         \n\t"
+        "sll     %[temp3],  %[temp3],     2            \n\t"
+        "srl     %[temp4],  %[temp4],     2            \n\t"
+        "lwc1    %[temp2],  0(%[scale])                \n\t"
+        "lwxc1   %[temp0],  %[temp3](%[v])             \n\t"
+        "lwxc1   %[temp1],  %[temp4](%[v])             \n\t"
+        "mul.s   %[temp0],  %[temp0],     %[temp2]     \n\t"
+        "mul.s   %[temp1],  %[temp1],     %[temp2]     \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       8            \n\t"
+        "swc1    %[temp0],  0(%[dst])                  \n\t"
+        "swc1    %[temp1],  4(%[dst])                  \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    int temp0, temp1, temp2, temp3;
+    float temp4, temp5, temp6, temp7, temp8;
+    float *ret;
+
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       0x03        \n\t"
+        "andi    %[temp1],  %[idx],       0x0C        \n\t"
+        "andi    %[temp2],  %[idx],       0x30        \n\t"
+        "andi    %[temp3],  %[idx],       0xC0        \n\t"
+        "sll     %[temp0],  %[temp0],     2           \n\t"
+        "srl     %[temp2],  %[temp2],     2           \n\t"
+        "srl     %[temp3],  %[temp3],     4           \n\t"
+        "lwc1    %[temp4],  0(%[scale])               \n\t"
+        "lwxc1   %[temp5],  %[temp0](%[v])            \n\t"
+        "lwxc1   %[temp6],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp7],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp8],  %[temp3](%[v])            \n\t"
+        "mul.s   %[temp5],  %[temp5],     %[temp4]    \n\t"
+        "mul.s   %[temp6],  %[temp6],     %[temp4]    \n\t"
+        "mul.s   %[temp7],  %[temp7],     %[temp4]    \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp4]    \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       16          \n\t"
+        "swc1    %[temp5],  0(%[dst])                 \n\t"
+        "swc1    %[temp6],  4(%[dst])                 \n\t"
+        "swc1    %[temp7],  8(%[dst])                 \n\t"
+        "swc1    %[temp8],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL2S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9;
+    float *ret;
+
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       0x0F       \n\t"
+        "andi    %[temp1],  %[idx],       0xF0       \n\t"
+        "lw      %[temp4],  0(%[scale])              \n\t"
+        "srl     %[temp2],  %[sign],      1          \n\t"
+        "sll     %[temp3],  %[sign],      31         \n\t"
+        "sll     %[temp2],  %[temp2],     31         \n\t"
+        "sll     %[temp0],  %[temp0],     2          \n\t"
+        "srl     %[temp1],  %[temp1],     2          \n\t"
+        "lwxc1   %[temp8],  %[temp0](%[v])           \n\t"
+        "lwxc1   %[temp9],  %[temp1](%[v])           \n\t"
+        "xor     %[temp5],  %[temp4],     %[temp2]   \n\t"
+        "xor     %[temp4],  %[temp4],     %[temp3]   \n\t"
+        "mtc1    %[temp5],  %[temp6]                 \n\t"
+        "mtc1    %[temp4],  %[temp7]                 \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp6]   \n\t"
+        "mul.s   %[temp9],  %[temp9],     %[temp7]   \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       8          \n\t"
+        "swc1    %[temp8],  0(%[dst])                \n\t"
+        "swc1    %[temp9],  4(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+          [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [sign]"r"(sign)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4;
+    float temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+    float *ret;
+    unsigned int mask = 1U << 31;
+
+    __asm__ volatile(
+        "lw      %[temp0],   0(%[scale])               \n\t"
+        "andi    %[temp1],  %[idx],       0x03         \n\t"
+        "andi    %[temp2],  %[idx],       0x0C         \n\t"
+        "andi    %[temp3],  %[idx],       0x30         \n\t"
+        "andi    %[temp4],  %[idx],       0xC0         \n\t"
+        "sll     %[temp1],  %[temp1],     2            \n\t"
+        "srl     %[temp3],  %[temp3],     2            \n\t"
+        "srl     %[temp4],  %[temp4],     4            \n\t"
+        "lwxc1   %[temp10],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp11],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp12],  %[temp3](%[v])            \n\t"
+        "lwxc1   %[temp13],  %[temp4](%[v])            \n\t"
+        "and     %[temp1],   %[sign],      %[mask]     \n\t"
+        "srl     %[temp2],   %[idx],       12          \n\t"
+        "srl     %[temp3],   %[idx],       13          \n\t"
+        "srl     %[temp4],   %[idx],       14          \n\t"
+        "andi    %[temp2],   %[temp2],     1           \n\t"
+        "andi    %[temp3],   %[temp3],     1           \n\t"
+        "andi    %[temp4],   %[temp4],     1           \n\t"
+        "sllv    %[sign],    %[sign],      %[temp2]    \n\t"
+        "xor     %[temp1],   %[temp0],     %[temp1]    \n\t"
+        "and     %[temp2],   %[sign],      %[mask]     \n\t"
+        "mtc1    %[temp1],   %[temp14]                 \n\t"
+        "xor     %[temp2],   %[temp0],     %[temp2]    \n\t"
+        "sllv    %[sign],    %[sign],      %[temp3]    \n\t"
+        "mtc1    %[temp2],   %[temp15]                 \n\t"
+        "and     %[temp3],   %[sign],      %[mask]     \n\t"
+        "sllv    %[sign],    %[sign],      %[temp4]    \n\t"
+        "xor     %[temp3],   %[temp0],     %[temp3]    \n\t"
+        "and     %[temp4],   %[sign],      %[mask]     \n\t"
+        "mtc1    %[temp3],   %[temp16]                 \n\t"
+        "xor     %[temp4],   %[temp0],     %[temp4]    \n\t"
+        "mtc1    %[temp4],   %[temp17]                 \n\t"
+        "mul.s   %[temp10],  %[temp10],    %[temp14]   \n\t"
+        "mul.s   %[temp11],  %[temp11],    %[temp15]   \n\t"
+        "mul.s   %[temp12],  %[temp12],    %[temp16]   \n\t"
+        "mul.s   %[temp13],  %[temp13],    %[temp17]   \n\t"
+        PTR_ADDIU "%[ret],   %[dst],       16          \n\t"
+        "swc1    %[temp10],  0(%[dst])                 \n\t"
+        "swc1    %[temp11],  4(%[dst])                 \n\t"
+        "swc1    %[temp12],  8(%[dst])                 \n\t"
+        "swc1    %[temp13],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp10]"=&f"(temp10),
+          [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+          [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+          [temp15]"=&f"(temp15), [temp16]"=&f"(temp16),
+          [temp17]"=&f"(temp17), [ret]"=&r"(ret),
+          [sign]"+r"(sign)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [mask]"r"(mask)
+        : "memory"
+    );
+    return ret;
+}
+
+#define VMUL2 VMUL2_mips
+#define VMUL4 VMUL4_mips
+#define VMUL2S VMUL2S_mips
+#define VMUL4S VMUL4S_mips
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+
+#endif /* AVCODEC_MIPS_AACDEC_MIPS_H */
diff --git a/libavcodec/mips/aacpsdsp_mips.c b/libavcodec/mips/aacpsdsp_mips.c
new file mode 100644
index 0000000..83fdc2f
--- /dev/null
+++ b/libavcodec/mips/aacpsdsp_mips.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacpsdsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/aacpsdsp.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void ps_hybrid_analysis_ileave_mips(float (*out)[32][2], float L[2][38][64],
+                                        int i, int len)
+{
+    int temp0, temp1, temp2, temp3;
+    int temp4, temp5, temp6, temp7;
+    float *out1=&out[i][0][0];
+    float *L1=&L[0][0][i];
+    float *j=out1+ len*2;
+
+    for (; i < 64; i++) {
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+        "1:                                          \n\t"
+            "lw      %[temp0],   0(%[L1])            \n\t"
+            "lw      %[temp1],   9728(%[L1])         \n\t"
+            "lw      %[temp2],   256(%[L1])          \n\t"
+            "lw      %[temp3],   9984(%[L1])         \n\t"
+            "lw      %[temp4],   512(%[L1])          \n\t"
+            "lw      %[temp5],   10240(%[L1])        \n\t"
+            "lw      %[temp6],   768(%[L1])          \n\t"
+            "lw      %[temp7],   10496(%[L1])        \n\t"
+            "sw      %[temp0],   0(%[out1])          \n\t"
+            "sw      %[temp1],   4(%[out1])          \n\t"
+            "sw      %[temp2],   8(%[out1])          \n\t"
+            "sw      %[temp3],   12(%[out1])         \n\t"
+            "sw      %[temp4],   16(%[out1])         \n\t"
+            "sw      %[temp5],   20(%[out1])         \n\t"
+            "sw      %[temp6],   24(%[out1])         \n\t"
+            "sw      %[temp7],   28(%[out1])         \n\t"
+            PTR_ADDIU "%[out1],  %[out1],      32    \n\t"
+            PTR_ADDIU "%[L1],    %[L1],        1024  \n\t"
+            "bne     %[out1],    %[j],         1b    \n\t"
+
+            : [out1]"+r"(out1), [L1]"+r"(L1), [j]"+r"(j),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            : [len]"r"(len)
+            : "memory"
+        );
+        out1-=(len<<1)-64;
+        L1-=(len<<6)-1;
+        j+=len*2;
+    }
+}
+
+static void ps_hybrid_synthesis_deint_mips(float out[2][38][64],
+                                        float (*in)[32][2],
+                                        int i, int len)
+{
+    int n;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    float *out1 = (float*)out + i;
+    float *out2 = (float*)out + 2432 + i;
+    float *in1 = (float*)in + 64 * i;
+    float *in2 = (float*)in + 64 * i + 1;
+
+    for (; i < 64; i++) {
+        for (n = 0; n < 7; n++) {
+
+            /* loop unrolled 8 times */
+            __asm__ volatile (
+                 "lw      %[temp0],   0(%[in1])               \n\t"
+                 "lw      %[temp1],   0(%[in2])               \n\t"
+                 "lw      %[temp2],   8(%[in1])               \n\t"
+                 "lw      %[temp3],   8(%[in2])               \n\t"
+                 "lw      %[temp4],   16(%[in1])              \n\t"
+                 "lw      %[temp5],   16(%[in2])              \n\t"
+                 "lw      %[temp6],   24(%[in1])              \n\t"
+                 "lw      %[temp7],   24(%[in2])              \n\t"
+                 PTR_ADDIU "%[out1],  %[out1],         1024   \n\t"
+                 PTR_ADDIU "%[out2],  %[out2],         1024   \n\t"
+                 PTR_ADDIU "%[in1],   %[in1],          32     \n\t"
+                 PTR_ADDIU "%[in2],   %[in2],          32     \n\t"
+                 "sw      %[temp0],   -1024(%[out1])          \n\t"
+                 "sw      %[temp1],   -1024(%[out2])          \n\t"
+                 "sw      %[temp2],   -768(%[out1])           \n\t"
+                 "sw      %[temp3],   -768(%[out2])           \n\t"
+                 "sw      %[temp4],   -512(%[out1])           \n\t"
+                 "sw      %[temp5],   -512(%[out2])           \n\t"
+                 "sw      %[temp6],   -256(%[out1])           \n\t"
+                 "sw      %[temp7],   -256(%[out2])           \n\t"
+
+                 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                   [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                   [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                   [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                   [out1]"+r"(out1), [out2]"+r"(out2),
+                   [in1]"+r"(in1), [in2]"+r"(in2)
+                 :
+                 : "memory"
+            );
+        }
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+            "lw      %[temp0],   0(%[in1])               \n\t"
+            "lw      %[temp1],   0(%[in2])               \n\t"
+            "lw      %[temp2],   8(%[in1])               \n\t"
+            "lw      %[temp3],   8(%[in2])               \n\t"
+            "lw      %[temp4],   16(%[in1])              \n\t"
+            "lw      %[temp5],   16(%[in2])              \n\t"
+            "lw      %[temp6],   24(%[in1])              \n\t"
+            "lw      %[temp7],   24(%[in2])              \n\t"
+            PTR_ADDIU "%[out1],  %[out1],        -7164   \n\t"
+            PTR_ADDIU "%[out2],  %[out2],        -7164   \n\t"
+            PTR_ADDIU "%[in1],   %[in1],         32      \n\t"
+            PTR_ADDIU "%[in2],   %[in2],         32      \n\t"
+            "sw      %[temp0],   7164(%[out1])           \n\t"
+            "sw      %[temp1],   7164(%[out2])           \n\t"
+            "sw      %[temp2],   7420(%[out1])           \n\t"
+            "sw      %[temp3],   7420(%[out2])           \n\t"
+            "sw      %[temp4],   7676(%[out1])           \n\t"
+            "sw      %[temp5],   7676(%[out2])           \n\t"
+            "sw      %[temp6],   7932(%[out1])           \n\t"
+            "sw      %[temp7],   7932(%[out2])           \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [out1]"+r"(out1), [out2]"+r"(out2),
+              [in1]"+r"(in1), [in2]"+r"(in2)
+            :
+            : "memory"
+        );
+    }
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ps_add_squares_mips(float *dst, const float (*src)[2], int n)
+{
+    int i;
+    float temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9, temp10, temp11;
+    float *src0 = (float*)&src[0][0];
+    float *dst0 = &dst[0];
+
+    for (i = 0; i < 8; i++) {
+        /* loop unrolled 4 times */
+        __asm__ volatile (
+            "lwc1     %[temp0],    0(%[src0])                          \n\t"
+            "lwc1     %[temp1],    4(%[src0])                          \n\t"
+            "lwc1     %[temp2],    8(%[src0])                          \n\t"
+            "lwc1     %[temp3],    12(%[src0])                         \n\t"
+            "lwc1     %[temp4],    16(%[src0])                         \n\t"
+            "lwc1     %[temp5],    20(%[src0])                         \n\t"
+            "lwc1     %[temp6],    24(%[src0])                         \n\t"
+            "lwc1     %[temp7],    28(%[src0])                         \n\t"
+            "lwc1     %[temp8],    0(%[dst0])                          \n\t"
+            "lwc1     %[temp9],    4(%[dst0])                          \n\t"
+            "lwc1     %[temp10],   8(%[dst0])                          \n\t"
+            "lwc1     %[temp11],   12(%[dst0])                         \n\t"
+            "mul.s    %[temp1],    %[temp1],    %[temp1]               \n\t"
+            "mul.s    %[temp3],    %[temp3],    %[temp3]               \n\t"
+            "mul.s    %[temp5],    %[temp5],    %[temp5]               \n\t"
+            "mul.s    %[temp7],    %[temp7],    %[temp7]               \n\t"
+            "madd.s   %[temp0],    %[temp1],    %[temp0],   %[temp0]   \n\t"
+            "madd.s   %[temp2],    %[temp3],    %[temp2],   %[temp2]   \n\t"
+            "madd.s   %[temp4],    %[temp5],    %[temp4],   %[temp4]   \n\t"
+            "madd.s   %[temp6],    %[temp7],    %[temp6],   %[temp6]   \n\t"
+            "add.s    %[temp0],    %[temp8],    %[temp0]               \n\t"
+            "add.s    %[temp2],    %[temp9],    %[temp2]               \n\t"
+            "add.s    %[temp4],    %[temp10],   %[temp4]               \n\t"
+            "add.s    %[temp6],    %[temp11],   %[temp6]               \n\t"
+            "swc1     %[temp0],    0(%[dst0])                          \n\t"
+            "swc1     %[temp2],    4(%[dst0])                          \n\t"
+            "swc1     %[temp4],    8(%[dst0])                          \n\t"
+            "swc1     %[temp6],    12(%[dst0])                         \n\t"
+            PTR_ADDIU "%[dst0],    %[dst0],     16                     \n\t"
+            PTR_ADDIU "%[src0],    %[src0],     32                     \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [dst0]"+r"(dst0), [src0]"+r"(src0),
+              [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
+            :
+            : "memory"
+        );
+   }
+}
+
+static void ps_mul_pair_single_mips(float (*dst)[2], float (*src0)[2], float *src1,
+                                 int n)
+{
+    float temp0, temp1, temp2;
+    float *p_d, *p_s0, *p_s1, *end;
+    p_d = &dst[0][0];
+    p_s0 = &src0[0][0];
+    p_s1 = &src1[0];
+    end = p_s1 + n;
+
+    __asm__ volatile(
+        ".set push                                      \n\t"
+        ".set noreorder                                 \n\t"
+        "1:                                             \n\t"
+        "lwc1     %[temp2],   0(%[p_s1])                \n\t"
+        "lwc1     %[temp0],   0(%[p_s0])                \n\t"
+        "lwc1     %[temp1],   4(%[p_s0])                \n\t"
+        PTR_ADDIU "%[p_d],    %[p_d],       8           \n\t"
+        "mul.s    %[temp0],   %[temp0],     %[temp2]    \n\t"
+        "mul.s    %[temp1],   %[temp1],     %[temp2]    \n\t"
+        PTR_ADDIU "%[p_s0],   %[p_s0],      8           \n\t"
+        "swc1     %[temp0],   -8(%[p_d])                \n\t"
+        "swc1     %[temp1],   -4(%[p_d])                \n\t"
+        "bne      %[p_s1],    %[end],       1b          \n\t"
+        PTR_ADDIU "%[p_s1],   %[p_s1],      4           \n\t"
+        ".set pop                                       \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [p_d]"+r"(p_d),
+          [p_s0]"+r"(p_s0), [p_s1]"+r"(p_s1)
+        : [end]"r"(end)
+        : "memory"
+    );
+}
+
+static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2],
+                             float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
+                             const float phi_fract[2], const float (*Q_fract)[2],
+                             const float *transient_gain,
+                             float g_decay_slope,
+                             int len)
+{
+    float *p_delay = &delay[0][0];
+    float *p_out = &out[0][0];
+    float *p_ap_delay = &ap_delay[0][0][0];
+    const float *p_t_gain = transient_gain;
+    const float *p_Q_fract = &Q_fract[0][0];
+    float ag0, ag1, ag2;
+    float phi_fract0 = phi_fract[0];
+    float phi_fract1 = phi_fract[1];
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+
+    float *p_delay_end = (p_delay + (len << 1));
+
+    /* merged 2 loops */
+    __asm__ volatile(
+        ".set    push                                                    \n\t"
+        ".set    noreorder                                               \n\t"
+        "li.s    %[ag0],        0.65143905753106                         \n\t"
+        "li.s    %[ag1],        0.56471812200776                         \n\t"
+        "li.s    %[ag2],        0.48954165955695                         \n\t"
+        "mul.s   %[ag0],        %[ag0],        %[g_decay_slope]          \n\t"
+        "mul.s   %[ag1],        %[ag1],        %[g_decay_slope]          \n\t"
+        "mul.s   %[ag2],        %[ag2],        %[g_decay_slope]          \n\t"
+    "1:                                                                  \n\t"
+        "lwc1    %[temp0],      0(%[p_delay])                            \n\t"
+        "lwc1    %[temp1],      4(%[p_delay])                            \n\t"
+        "lwc1    %[temp4],      16(%[p_ap_delay])                        \n\t"
+        "lwc1    %[temp5],      20(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp3],      %[temp0],      %[phi_fract1]             \n\t"
+        "lwc1    %[temp6],      0(%[p_Q_fract])                          \n\t"
+        "mul.s   %[temp2],      %[temp1],      %[phi_fract1]             \n\t"
+        "lwc1    %[temp7],      4(%[p_Q_fract])                          \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[temp1], %[phi_fract0]   \n\t"
+        "msub.s  %[temp2],      %[temp2],      %[temp0], %[phi_fract0]   \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "lwc1    %[temp7],      12(%[p_Q_fract])                         \n\t"
+        "mul.s   %[temp0],      %[ag0],        %[temp2]                  \n\t"
+        "mul.s   %[temp1],      %[ag0],        %[temp3]                  \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "lwc1    %[temp4],      304(%[p_ap_delay])                       \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "lwc1    %[temp5],      308(%[p_ap_delay])                       \n\t"
+        "sub.s   %[temp0],      %[temp8],      %[temp0]                  \n\t"
+        "sub.s   %[temp1],      %[temp9],      %[temp1]                  \n\t"
+        "madd.s  %[temp2],      %[temp2],      %[ag0],   %[temp0]        \n\t"
+        "lwc1    %[temp6],      8(%[p_Q_fract])                          \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[ag0],   %[temp1]        \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "lwc1    %[temp7],      20(%[p_Q_fract])                         \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "swc1    %[temp2],      40(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp2],      %[ag1],        %[temp0]                  \n\t"
+        "swc1    %[temp3],      44(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp3],      %[ag1],        %[temp1]                  \n\t"
+        "lwc1    %[temp4],      592(%[p_ap_delay])                       \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "lwc1    %[temp5],      596(%[p_ap_delay])                       \n\t"
+        "sub.s   %[temp2],      %[temp8],      %[temp2]                  \n\t"
+        "sub.s   %[temp3],      %[temp9],      %[temp3]                  \n\t"
+        "lwc1    %[temp6],      16(%[p_Q_fract])                         \n\t"
+        "madd.s  %[temp0],      %[temp0],      %[ag1],   %[temp2]        \n\t"
+        "madd.s  %[temp1],      %[temp1],      %[ag1],   %[temp3]        \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "swc1    %[temp0],      336(%[p_ap_delay])                       \n\t"
+        "mul.s   %[temp0],      %[ag2],        %[temp2]                  \n\t"
+        "swc1    %[temp1],      340(%[p_ap_delay])                       \n\t"
+        "mul.s   %[temp1],      %[ag2],        %[temp3]                  \n\t"
+        "lwc1    %[temp4],      0(%[p_t_gain])                           \n\t"
+        "sub.s   %[temp0],      %[temp8],      %[temp0]                  \n\t"
+        PTR_ADDIU "%[p_ap_delay], %[p_ap_delay], 8                       \n\t"
+        "sub.s   %[temp1],      %[temp9],      %[temp1]                  \n\t"
+        PTR_ADDIU "%[p_t_gain], %[p_t_gain],   4                         \n\t"
+        "madd.s  %[temp2],      %[temp2],      %[ag2],   %[temp0]        \n\t"
+        PTR_ADDIU "%[p_delay],  %[p_delay],    8                         \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[ag2],   %[temp1]        \n\t"
+        PTR_ADDIU "%[p_out],    %[p_out],      8                         \n\t"
+        "mul.s   %[temp5],      %[temp4],      %[temp0]                  \n\t"
+        "mul.s   %[temp6],      %[temp4],      %[temp1]                  \n\t"
+        "swc1    %[temp2],      624(%[p_ap_delay])                       \n\t"
+        "swc1    %[temp3],      628(%[p_ap_delay])                       \n\t"
+        "swc1    %[temp5],      -8(%[p_out])                             \n\t"
+        "swc1    %[temp6],      -4(%[p_out])                             \n\t"
+        "bne     %[p_delay],    %[p_delay_end],1b                        \n\t"
+        " swc1   %[temp6],      -4(%[p_out])                             \n\t"
+        ".set    pop                                                     \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+          [temp9]"=&f"(temp9), [p_delay]"+r"(p_delay), [p_ap_delay]"+r"(p_ap_delay),
+          [p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out),
+          [ag0]"=&f"(ag0), [ag1]"=&f"(ag1), [ag2]"=&f"(ag2)
+        : [phi_fract0]"f"(phi_fract0), [phi_fract1]"f"(phi_fract1),
+          [p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope)
+        : "memory"
+    );
+}
+
+static void ps_stereo_interpolate_mips(float (*l)[2], float (*r)[2],
+                                    float h[2][4], float h_step[2][4],
+                                    int len)
+{
+    float h0 = h[0][0];
+    float h1 = h[0][1];
+    float h2 = h[0][2];
+    float h3 = h[0][3];
+    float hs0 = h_step[0][0];
+    float hs1 = h_step[0][1];
+    float hs2 = h_step[0][2];
+    float hs3 = h_step[0][3];
+    float temp0, temp1, temp2, temp3;
+    float l_re, l_im, r_re, r_im;
+
+    float *l_end = ((float *)l + (len << 1));
+
+    __asm__ volatile(
+        ".set    push                                     \n\t"
+        ".set    noreorder                                \n\t"
+    "1:                                                   \n\t"
+        "add.s   %[h0],     %[h0],     %[hs0]             \n\t"
+        "lwc1    %[l_re],   0(%[l])                       \n\t"
+        "add.s   %[h1],     %[h1],     %[hs1]             \n\t"
+        "lwc1    %[r_re],   0(%[r])                       \n\t"
+        "add.s   %[h2],     %[h2],     %[hs2]             \n\t"
+        "lwc1    %[l_im],   4(%[l])                       \n\t"
+        "add.s   %[h3],     %[h3],     %[hs3]             \n\t"
+        "lwc1    %[r_im],   4(%[r])                       \n\t"
+        "mul.s   %[temp0],  %[h0],     %[l_re]            \n\t"
+        PTR_ADDIU "%[l],    %[l],      8                  \n\t"
+        "mul.s   %[temp2],  %[h1],     %[l_re]            \n\t"
+        PTR_ADDIU "%[r],    %[r],      8                  \n\t"
+        "madd.s  %[temp0],  %[temp0],  %[h2],   %[r_re]   \n\t"
+        "madd.s  %[temp2],  %[temp2],  %[h3],   %[r_re]   \n\t"
+        "mul.s   %[temp1],  %[h0],     %[l_im]            \n\t"
+        "mul.s   %[temp3],  %[h1],     %[l_im]            \n\t"
+        "madd.s  %[temp1],  %[temp1],  %[h2],   %[r_im]   \n\t"
+        "madd.s  %[temp3],  %[temp3],  %[h3],   %[r_im]   \n\t"
+        "swc1    %[temp0],  -8(%[l])                      \n\t"
+        "swc1    %[temp2],  -8(%[r])                      \n\t"
+        "swc1    %[temp1],  -4(%[l])                      \n\t"
+        "bne     %[l],      %[l_end],  1b                 \n\t"
+        " swc1   %[temp3],  -4(%[r])                      \n\t"
+        ".set    pop                                      \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+          [h0]"+f"(h0), [h1]"+f"(h1), [h2]"+f"(h2),
+          [h3]"+f"(h3), [l]"+r"(l), [r]"+r"(r),
+          [l_re]"=&f"(l_re), [l_im]"=&f"(l_im),
+          [r_re]"=&f"(r_re), [r_im]"=&f"(r_im)
+        : [hs0]"f"(hs0), [hs1]"f"(hs1), [hs2]"f"(hs2),
+          [hs3]"f"(hs3), [l_end]"r"(l_end)
+        : "memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_psdsp_init_mips(PSDSPContext *s)
+{
+#if HAVE_INLINE_ASM
+    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_mips;
+    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_mips;
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->add_squares            = ps_add_squares_mips;
+    s->mul_pair_single        = ps_mul_pair_single_mips;
+    s->decorrelate            = ps_decorrelate_mips;
+    s->stereo_interpolate[0]  = ps_stereo_interpolate_mips;
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacpsy_mips.h b/libavcodec/mips/aacpsy_mips.h
new file mode 100644
index 0000000..a1fe5cc
--- /dev/null
+++ b/libavcodec/mips/aacpsy_mips.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic   (bojan@mips.com)
+ *
+ * AAC encoder psychoacoustic model routines optimized
+ * for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacpsy.c
+ */
+
+#ifndef AVCODEC_MIPS_AACPSY_MIPS_H
+#define AVCODEC_MIPS_AACPSY_MIPS_H
+
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU && ( PSY_LAME_FIR_LEN == 21 )
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void calc_thr_3gpp_mips(const FFPsyWindowInfo *wi, const int num_bands,
+                               AacPsyChannel *pch, const uint8_t *band_sizes,
+                               const float *coefs, const int cutoff)
+{
+    int i, w, g;
+    int start = 0, wstart = 0;
+    for (w = 0; w < wi->num_windows*16; w += 16) {
+        wstart = 0;
+        for (g = 0; g < num_bands; g++) {
+            AacPsyBand *band = &pch->band[w+g];
+
+            float form_factor = 0.0f;
+            float Temp;
+            band->energy = 0.0f;
+            if (wstart < cutoff) {
+                for (i = 0; i < band_sizes[g]; i+=4) {
+                    float a, b, c, d;
+                    float ax, bx, cx, dx;
+                    float *cf = (float *)&coefs[start+i];
+
+                    __asm__ volatile (
+                        "lwc1   %[a],   0(%[cf])                \n\t"
+                        "lwc1   %[b],   4(%[cf])                \n\t"
+                        "lwc1   %[c],   8(%[cf])                \n\t"
+                        "lwc1   %[d],   12(%[cf])               \n\t"
+                        "abs.s  %[a],   %[a]                    \n\t"
+                        "abs.s  %[b],   %[b]                    \n\t"
+                        "abs.s  %[c],   %[c]                    \n\t"
+                        "abs.s  %[d],   %[d]                    \n\t"
+                        "sqrt.s %[ax],  %[a]                    \n\t"
+                        "sqrt.s %[bx],  %[b]                    \n\t"
+                        "sqrt.s %[cx],  %[c]                    \n\t"
+                        "sqrt.s %[dx],  %[d]                    \n\t"
+                        "madd.s %[e],   %[e],   %[a],   %[a]    \n\t"
+                        "madd.s %[e],   %[e],   %[b],   %[b]    \n\t"
+                        "madd.s %[e],   %[e],   %[c],   %[c]    \n\t"
+                        "madd.s %[e],   %[e],   %[d],   %[d]    \n\t"
+                        "add.s  %[f],   %[f],   %[ax]           \n\t"
+                        "add.s  %[f],   %[f],   %[bx]           \n\t"
+                        "add.s  %[f],   %[f],   %[cx]           \n\t"
+                        "add.s  %[f],   %[f],   %[dx]           \n\t"
+
+                        : [a]"=&f"(a), [b]"=&f"(b),
+                          [c]"=&f"(c), [d]"=&f"(d),
+                          [e]"+f"(band->energy), [f]"+f"(form_factor),
+                          [ax]"=&f"(ax), [bx]"=&f"(bx),
+                          [cx]"=&f"(cx), [dx]"=&f"(dx)
+                        : [cf]"r"(cf)
+                        : "memory"
+                    );
+                }
+            }
+
+            Temp = sqrtf((float)band_sizes[g] / band->energy);
+            band->thr      = band->energy * 0.001258925f;
+            band->nz_lines = form_factor * sqrtf(Temp);
+            start += band_sizes[g];
+            wstart += band_sizes[g];
+        }
+    }
+}
+
+static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float * psy_fir_coeffs)
+{
+    float sum1, sum2, sum3, sum4;
+    float *fb = (float*)firbuf;
+    float *fb_end = fb + AAC_BLOCK_SIZE_LONG;
+    float *hp = hpfsmpl;
+
+    float coeff0 = psy_fir_coeffs[1];
+    float coeff1 = psy_fir_coeffs[3];
+    float coeff2 = psy_fir_coeffs[5];
+    float coeff3 = psy_fir_coeffs[7];
+    float coeff4 = psy_fir_coeffs[9];
+
+    __asm__ volatile (
+        ".set push                                          \n\t"
+        ".set noreorder                                     \n\t"
+
+        "li.s   $f12,       32768                           \n\t"
+        "1:                                                 \n\t"
+        "lwc1   $f0,        40(%[fb])                       \n\t"
+        "lwc1   $f1,        4(%[fb])                        \n\t"
+        "lwc1   $f2,        80(%[fb])                       \n\t"
+        "lwc1   $f3,        44(%[fb])                       \n\t"
+        "lwc1   $f4,        8(%[fb])                        \n\t"
+        "madd.s %[sum1],    $f0,        $f1,    %[coeff0]   \n\t"
+        "lwc1   $f5,        84(%[fb])                       \n\t"
+        "lwc1   $f6,        48(%[fb])                       \n\t"
+        "madd.s %[sum2],    $f3,        $f4,    %[coeff0]   \n\t"
+        "lwc1   $f7,        12(%[fb])                       \n\t"
+        "madd.s %[sum1],    %[sum1],    $f2,    %[coeff0]   \n\t"
+        "lwc1   $f8,        88(%[fb])                       \n\t"
+        "lwc1   $f9,        52(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f5,    %[coeff0]   \n\t"
+        "madd.s %[sum3],    $f6,        $f7,    %[coeff0]   \n\t"
+        "lwc1   $f10,       16(%[fb])                       \n\t"
+        "lwc1   $f11,       92(%[fb])                       \n\t"
+        "madd.s %[sum1],    %[sum1],    $f7,    %[coeff1]   \n\t"
+        "lwc1   $f1,        72(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff0]   \n\t"
+        "madd.s %[sum4],    $f9,        $f10,   %[coeff0]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f10,   %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f1,    %[coeff1]   \n\t"
+        "lwc1   $f4,        76(%[fb])                       \n\t"
+        "lwc1   $f8,        20(%[fb])                       \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff0]   \n\t"
+        "lwc1   $f11,       24(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f4,    %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f8,    %[coeff2]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff1]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff1]   \n\t"
+        "lwc1   $f7,        64(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f11,   %[coeff2]   \n\t"
+        "lwc1   $f10,       68(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f2,    %[coeff1]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f5,    %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f7,    %[coeff2]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f10,   %[coeff2]   \n\t"
+        "lwc1   $f2,        28(%[fb])                       \n\t"
+        "lwc1   $f5,        32(%[fb])                       \n\t"
+        "lwc1   $f8,        56(%[fb])                       \n\t"
+        "lwc1   $f11,       60(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f2,    %[coeff2]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f5,    %[coeff2]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f2,    %[coeff3]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f5,    %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f1,    %[coeff2]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f4,    %[coeff2]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f8,    %[coeff3]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f11,   %[coeff3]   \n\t"
+        "lwc1   $f1,        36(%[fb])                       \n\t"
+        PTR_ADDIU "%[fb],   %[fb],      16                  \n\t"
+        "madd.s %[sum4],    %[sum4],    $f0,    %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f1,    %[coeff3]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f1,    %[coeff4]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f0,    %[coeff4]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f10,   %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f7,    %[coeff3]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f6,    %[coeff4]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f9,    %[coeff4]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f6,    %[coeff4]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f3,    %[coeff4]   \n\t"
+        "mul.s  %[sum1],    %[sum1],    $f12                \n\t"
+        "mul.s  %[sum2],    %[sum2],    $f12                \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff4]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff4]   \n\t"
+        "swc1   %[sum1],    0(%[hp])                        \n\t"
+        "swc1   %[sum2],    4(%[hp])                        \n\t"
+        "mul.s  %[sum4],    %[sum4],    $f12                \n\t"
+        "mul.s  %[sum3],    %[sum3],    $f12                \n\t"
+        "swc1   %[sum4],    12(%[hp])                       \n\t"
+        "swc1   %[sum3],    8(%[hp])                        \n\t"
+        "bne    %[fb],      %[fb_end],  1b                  \n\t"
+        PTR_ADDIU "%[hp],   %[hp],      16                  \n\t"
+
+        ".set pop                                           \n\t"
+
+        : [sum1]"=&f"(sum1), [sum2]"=&f"(sum2),
+          [sum3]"=&f"(sum3), [sum4]"=&f"(sum4),
+          [fb]"+r"(fb), [hp]"+r"(hp)
+        : [coeff0]"f"(coeff0), [coeff1]"f"(coeff1),
+          [coeff2]"f"(coeff2), [coeff3]"f"(coeff3),
+          [coeff4]"f"(coeff4), [fb_end]"r"(fb_end)
+        : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6",
+          "$f7", "$f8", "$f9", "$f10", "$f11", "$f12",
+          "memory"
+    );
+}
+
+#define calc_thr_3gpp calc_thr_3gpp_mips
+#define psy_hp_filter psy_hp_filter_mips
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+#endif /* AVCODEC_MIPS_AACPSY_MIPS_H */
diff --git a/libavcodec/mips/aacsbr_mips.c b/libavcodec/mips/aacsbr_mips.c
new file mode 100644
index 0000000..56aa4e8
--- /dev/null
+++ b/libavcodec/mips/aacsbr_mips.c
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacsbr.c
+ */
+
+#include "libavcodec/aac.h"
+#include "libavcodec/aacsbr.h"
+#include "libavutil/mips/asmdefs.h"
+
+#define ENVELOPE_ADJUSTMENT_OFFSET 2
+
+#if HAVE_INLINE_ASM
+static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
+                      float X_low[32][40][2], const float W[2][32][32][2],
+                      int buf_idx)
+{
+    int i, k;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    float *p_x_low = &X_low[0][8][0];
+    float *p_w = (float*)&W[buf_idx][0][0][0];
+    float *p_x1_low = &X_low[0][0][0];
+    float *p_w1 = (float*)&W[1-buf_idx][24][0][0];
+
+    float *loop_end=p_x1_low + 2560;
+
+    /* loop unrolled 8 times */
+    __asm__ volatile (
+    "1:                                                 \n\t"
+        "sw     $0,            0(%[p_x1_low])           \n\t"
+        "sw     $0,            4(%[p_x1_low])           \n\t"
+        "sw     $0,            8(%[p_x1_low])           \n\t"
+        "sw     $0,            12(%[p_x1_low])          \n\t"
+        "sw     $0,            16(%[p_x1_low])          \n\t"
+        "sw     $0,            20(%[p_x1_low])          \n\t"
+        "sw     $0,            24(%[p_x1_low])          \n\t"
+        "sw     $0,            28(%[p_x1_low])          \n\t"
+        PTR_ADDIU "%[p_x1_low],%[p_x1_low],      32     \n\t"
+        "bne    %[p_x1_low],   %[loop_end],      1b     \n\t"
+        PTR_ADDIU "%[p_x1_low],%[p_x1_low],      -10240 \n\t"
+
+        : [p_x1_low]"+r"(p_x1_low)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = 0; i < 32; i+=4) {
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "lw     %[temp0],   0(%[p_w])               \n\t"
+                "lw     %[temp1],   4(%[p_w])               \n\t"
+                "lw     %[temp2],   256(%[p_w])             \n\t"
+                "lw     %[temp3],   260(%[p_w])             \n\t"
+                "lw     %[temp4],   512(%[p_w])             \n\t"
+                "lw     %[temp5],   516(%[p_w])             \n\t"
+                "lw     %[temp6],   768(%[p_w])             \n\t"
+                "lw     %[temp7],   772(%[p_w])             \n\t"
+                "sw     %[temp0],   0(%[p_x_low])           \n\t"
+                "sw     %[temp1],   4(%[p_x_low])           \n\t"
+                "sw     %[temp2],   8(%[p_x_low])           \n\t"
+                "sw     %[temp3],   12(%[p_x_low])          \n\t"
+                "sw     %[temp4],   16(%[p_x_low])          \n\t"
+                "sw     %[temp5],   20(%[p_x_low])          \n\t"
+                "sw     %[temp6],   24(%[p_x_low])          \n\t"
+                "sw     %[temp7],   28(%[p_x_low])          \n\t"
+                PTR_ADDIU "%[p_x_low], %[p_x_low],  32      \n\t"
+                PTR_ADDIU "%[p_w],     %[p_w],      1024    \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
+                :
+                : "memory"
+            );
+        }
+        p_x_low += 16;
+        p_w -= 2046;
+    }
+
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < 2; i++) {
+
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "lw     %[temp0],    0(%[p_w1])             \n\t"
+                "lw     %[temp1],    4(%[p_w1])             \n\t"
+                "lw     %[temp2],    256(%[p_w1])           \n\t"
+                "lw     %[temp3],    260(%[p_w1])           \n\t"
+                "lw     %[temp4],    512(%[p_w1])           \n\t"
+                "lw     %[temp5],    516(%[p_w1])           \n\t"
+                "lw     %[temp6],    768(%[p_w1])           \n\t"
+                "lw     %[temp7],    772(%[p_w1])           \n\t"
+                "sw     %[temp0],    0(%[p_x1_low])         \n\t"
+                "sw     %[temp1],    4(%[p_x1_low])         \n\t"
+                "sw     %[temp2],    8(%[p_x1_low])         \n\t"
+                "sw     %[temp3],    12(%[p_x1_low])        \n\t"
+                "sw     %[temp4],    16(%[p_x1_low])        \n\t"
+                "sw     %[temp5],    20(%[p_x1_low])        \n\t"
+                "sw     %[temp6],    24(%[p_x1_low])        \n\t"
+                "sw     %[temp7],    28(%[p_x1_low])        \n\t"
+                PTR_ADDIU "%[p_x1_low], %[p_x1_low], 32     \n\t"
+                PTR_ADDIU "%[p_w1],     %[p_w1],     1024   \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
+                :
+                : "memory"
+            );
+        }
+        p_x1_low += 64;
+        p_w1 -= 510;
+    }
+    return 0;
+}
+
+static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
+                     const float Y0[38][64][2], const float Y1[38][64][2],
+                     const float X_low[32][40][2], int ch)
+{
+    int k, i;
+    const int i_f = 32;
+    int temp0, temp1, temp2, temp3;
+    const float *X_low1, *Y01, *Y11;
+    float *x1=&X[0][0][0];
+    float *j=x1+4864;
+    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
+
+    /* loop unrolled 8 times */
+    __asm__ volatile (
+    "1:                                       \n\t"
+        "sw     $0,      0(%[x1])             \n\t"
+        "sw     $0,      4(%[x1])             \n\t"
+        "sw     $0,      8(%[x1])             \n\t"
+        "sw     $0,      12(%[x1])            \n\t"
+        "sw     $0,      16(%[x1])            \n\t"
+        "sw     $0,      20(%[x1])            \n\t"
+        "sw     $0,      24(%[x1])            \n\t"
+        "sw     $0,      28(%[x1])            \n\t"
+        PTR_ADDIU "%[x1],%[x1],      32       \n\t"
+        "bne    %[x1],   %[j],       1b       \n\t"
+        PTR_ADDIU "%[x1],%[x1],      -19456   \n\t"
+
+        : [x1]"+r"(x1)
+        : [j]"r"(j)
+        : "memory"
+    );
+
+    if (i_Temp != 0) {
+
+        X_low1=&X_low[0][2][0];
+
+        for (k = 0; k < sbr->kx[0]; k++) {
+
+            __asm__ volatile (
+                "move    %[i],        $zero                  \n\t"
+            "2:                                              \n\t"
+                "lw      %[temp0],    0(%[X_low1])           \n\t"
+                "lw      %[temp1],    4(%[X_low1])           \n\t"
+                "sw      %[temp0],    0(%[x1])               \n\t"
+                "sw      %[temp1],    9728(%[x1])            \n\t"
+                PTR_ADDIU "%[x1],     %[x1],         256     \n\t"
+                PTR_ADDIU "%[X_low1], %[X_low1],     8       \n\t"
+                "addiu   %[i],        %[i],          1       \n\t"
+                "bne     %[i],        %[i_Temp],     2b      \n\t"
+
+                : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
+                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+                : [i_Temp]"r"(i_Temp)
+                : "memory"
+            );
+            x1-=(i_Temp<<6)-1;
+            X_low1-=(i_Temp<<1)-80;
+        }
+
+        x1=&X[0][0][k];
+        Y01=(float*)&Y0[32][k][0];
+
+        for (; k < sbr->kx[0] + sbr->m[0]; k++) {
+            __asm__ volatile (
+                "move    %[i],       $zero               \n\t"
+            "3:                                          \n\t"
+                "lw      %[temp0],   0(%[Y01])           \n\t"
+                "lw      %[temp1],   4(%[Y01])           \n\t"
+                "sw      %[temp0],   0(%[x1])            \n\t"
+                "sw      %[temp1],   9728(%[x1])         \n\t"
+                PTR_ADDIU "%[x1],    %[x1],      256     \n\t"
+                PTR_ADDIU "%[Y01],   %[Y01],     512     \n\t"
+                "addiu   %[i],       %[i],       1       \n\t"
+                "bne     %[i],       %[i_Temp],  3b      \n\t"
+
+                : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
+                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+                : [i_Temp]"r"(i_Temp)
+                : "memory"
+            );
+            x1 -=(i_Temp<<6)-1;
+            Y01 -=(i_Temp<<7)-2;
+        }
+    }
+
+    x1=&X[0][i_Temp][0];
+    X_low1=&X_low[0][i_Temp+2][0];
+    temp3=38;
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+
+        __asm__ volatile (
+            "move    %[i],       %[i_Temp]              \n\t"
+        "4:                                             \n\t"
+            "lw      %[temp0],   0(%[X_low1])           \n\t"
+            "lw      %[temp1],   4(%[X_low1])           \n\t"
+            "sw      %[temp0],   0(%[x1])               \n\t"
+            "sw      %[temp1],   9728(%[x1])            \n\t"
+            PTR_ADDIU "%[x1],    %[x1],         256     \n\t"
+            PTR_ADDIU "%[X_low1],%[X_low1],     8       \n\t"
+            "addiu   %[i],       %[i],          1       \n\t"
+            "bne     %[i],       %[temp3],      4b      \n\t"
+
+            : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2)
+            : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
+            : "memory"
+        );
+        x1 -= ((38-i_Temp)<<6)-1;
+        X_low1 -= ((38-i_Temp)<<1)- 80;
+    }
+
+    x1=&X[0][i_Temp][k];
+    Y11=&Y1[i_Temp][k][0];
+    temp2=32;
+
+    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
+
+        __asm__ volatile (
+           "move    %[i],       %[i_Temp]               \n\t"
+        "5:                                             \n\t"
+           "lw      %[temp0],   0(%[Y11])               \n\t"
+           "lw      %[temp1],   4(%[Y11])               \n\t"
+           "sw      %[temp0],   0(%[x1])                \n\t"
+           "sw      %[temp1],   9728(%[x1])             \n\t"
+           PTR_ADDIU "%[x1],    %[x1],          256     \n\t"
+           PTR_ADDIU "%[Y11],   %[Y11],         512     \n\t"
+           "addiu   %[i],       %[i],           1       \n\t"
+           "bne     %[i],       %[temp2],       5b      \n\t"
+
+           : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
+             [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+           : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
+             [temp2]"r"(temp2)
+           : "memory"
+        );
+
+        x1 -= ((32-i_Temp)<<6)-1;
+        Y11 -= ((32-i_Temp)<<7)-2;
+   }
+      return 0;
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void sbr_hf_assemble_mips(float Y1[38][64][2],
+                            const float X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2])
+{
+    int e, i, j, m;
+    const int h_SL = 4 * !sbr->bs_smoothing_mode;
+    const int kx = sbr->kx[1];
+    const int m_max = sbr->m[1];
+    static const float h_smooth[5] = {
+        0.33333333333333,
+        0.30150283239582,
+        0.21816949906249,
+        0.11516383427084,
+        0.03183050093751,
+    };
+
+    float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
+    int indexnoise = ch_data->f_indexnoise;
+    int indexsine  = ch_data->f_indexsine;
+    float *g_temp1, *q_temp1, *pok, *pok1;
+    float temp1, temp2, temp3, temp4;
+    int size = m_max;
+
+    if (sbr->reset) {
+        for (i = 0; i < h_SL; i++) {
+            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    } else if (h_SL) {
+        memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
+        memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            g_temp1 = g_temp[h_SL + i];
+            pok = sbr->gain[e];
+            q_temp1 = q_temp[h_SL + i];
+            pok1 = sbr->q_m[e];
+
+            /* loop unrolled 4 times */
+            for (j=0; j<(size>>2); j++) {
+                __asm__ volatile (
+                    "lw      %[temp1],   0(%[pok])               \n\t"
+                    "lw      %[temp2],   4(%[pok])               \n\t"
+                    "lw      %[temp3],   8(%[pok])               \n\t"
+                    "lw      %[temp4],   12(%[pok])              \n\t"
+                    "sw      %[temp1],   0(%[g_temp1])           \n\t"
+                    "sw      %[temp2],   4(%[g_temp1])           \n\t"
+                    "sw      %[temp3],   8(%[g_temp1])           \n\t"
+                    "sw      %[temp4],   12(%[g_temp1])          \n\t"
+                    "lw      %[temp1],   0(%[pok1])              \n\t"
+                    "lw      %[temp2],   4(%[pok1])              \n\t"
+                    "lw      %[temp3],   8(%[pok1])              \n\t"
+                    "lw      %[temp4],   12(%[pok1])             \n\t"
+                    "sw      %[temp1],   0(%[q_temp1])           \n\t"
+                    "sw      %[temp2],   4(%[q_temp1])           \n\t"
+                    "sw      %[temp3],   8(%[q_temp1])           \n\t"
+                    "sw      %[temp4],   12(%[q_temp1])          \n\t"
+                    PTR_ADDIU "%[pok],     %[pok],         16    \n\t"
+                    PTR_ADDIU "%[g_temp1], %[g_temp1],     16    \n\t"
+                    PTR_ADDIU "%[pok1],    %[pok1],        16    \n\t"
+                    PTR_ADDIU "%[q_temp1], %[q_temp1],     16    \n\t"
+
+                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
+                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
+                    :
+                    : "memory"
+                );
+            }
+
+            for (j=0; j<(size&3); j++) {
+                __asm__ volatile (
+                    "lw      %[temp1],   0(%[pok])              \n\t"
+                    "lw      %[temp2],   0(%[pok1])             \n\t"
+                    "sw      %[temp1],   0(%[g_temp1])          \n\t"
+                    "sw      %[temp2],   0(%[q_temp1])          \n\t"
+                    PTR_ADDIU "%[pok],     %[pok],        4     \n\t"
+                    PTR_ADDIU "%[g_temp1], %[g_temp1],    4     \n\t"
+                    PTR_ADDIU "%[pok1],    %[pok1],       4     \n\t"
+                    PTR_ADDIU "%[q_temp1], %[q_temp1],    4     \n\t"
+
+                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
+                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
+                    :
+                    : "memory"
+                );
+            }
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
+            LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
+            float *g_filt, *q_filt;
+
+            if (h_SL && e != e_a[0] && e != e_a[1]) {
+                g_filt = g_filt_tab;
+                q_filt = q_filt_tab;
+
+                for (m = 0; m < m_max; m++) {
+                    const int idx1 = i + h_SL;
+                    g_filt[m] = 0.0f;
+                    q_filt[m] = 0.0f;
+
+                    for (j = 0; j <= h_SL; j++) {
+                        g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
+                        q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
+                    }
+                }
+            } else {
+                g_filt = g_temp[i + h_SL];
+                q_filt = q_temp[i];
+            }
+
+            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
+                               i + ENVELOPE_ADJUSTMENT_OFFSET);
+
+            if (e != e_a[0] && e != e_a[1]) {
+                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
+                                                   q_filt, indexnoise,
+                                                   kx, m_max);
+            } else {
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                float *out = &Y1[i][kx][idx];
+                float *in  = sbr->s_m[e];
+                float temp0, temp1, temp2, temp3, temp4, temp5;
+                float A_f = (float)A;
+                float B_f = (float)B;
+
+                for (m = 0; m+1 < m_max; m+=2) {
+
+                    temp2 = out[0];
+                    temp3 = out[2];
+
+                    __asm__ volatile(
+                        "lwc1    %[temp0],  0(%[in])                     \n\t"
+                        "lwc1    %[temp1],  4(%[in])                     \n\t"
+                        "madd.s  %[temp4],  %[temp2],  %[temp0], %[A_f]  \n\t"
+                        "madd.s  %[temp5],  %[temp3],  %[temp1], %[B_f]  \n\t"
+                        "swc1    %[temp4],  0(%[out])                    \n\t"
+                        "swc1    %[temp5],  8(%[out])                    \n\t"
+                        PTR_ADDIU "%[in],   %[in],     8                 \n\t"
+                        PTR_ADDIU "%[out],  %[out],    16                \n\t"
+
+                        : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
+                          [temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
+                          [in]"+r"(in), [out]"+r"(out)
+                        : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
+                          [temp3]"f"(temp3)
+                        : "memory"
+                    );
+                }
+                if(m_max&1)
+                    out[2*m  ] += in[m  ] * A;
+            }
+            indexnoise = (indexnoise + m_max) & 0x1ff;
+            indexsine = (indexsine + 1) & 3;
+        }
+    }
+    ch_data->f_indexnoise = indexnoise;
+    ch_data->f_indexsine  = indexsine;
+}
+
+static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
+                                  float (*alpha0)[2], float (*alpha1)[2],
+                                  const float X_low[32][40][2], int k0)
+{
+    int k;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
+    float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;
+
+    c = 1.000001f;
+
+    for (k = 0; k < k0; k++) {
+        LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
+        float dk;
+        phi1 = &phi[0][0][0];
+        alpha_1 = &alpha1[k][0];
+        alpha_0 = &alpha0[k][0];
+        dsp->autocorrelate(X_low[k], phi);
+
+        __asm__ volatile (
+            "lwc1    %[temp0],  40(%[phi1])                       \n\t"
+            "lwc1    %[temp1],  16(%[phi1])                       \n\t"
+            "lwc1    %[temp2],  24(%[phi1])                       \n\t"
+            "lwc1    %[temp3],  28(%[phi1])                       \n\t"
+            "mul.s   %[dk],     %[temp0],    %[temp1]             \n\t"
+            "lwc1    %[temp4],  0(%[phi1])                        \n\t"
+            "mul.s   %[res2],   %[temp2],    %[temp2]             \n\t"
+            "lwc1    %[temp5],  4(%[phi1])                        \n\t"
+            "madd.s  %[res2],   %[res2],     %[temp3],  %[temp3]  \n\t"
+            "lwc1    %[temp6],  8(%[phi1])                        \n\t"
+            "div.s   %[res2],   %[res2],     %[c]                 \n\t"
+            "lwc1    %[temp0],  12(%[phi1])                       \n\t"
+            "sub.s   %[dk],     %[dk],       %[res2]              \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
+            : [phi1]"r"(phi1), [c]"f"(c)
+            : "memory"
+        );
+
+        if (!dk) {
+            alpha_1[0] = 0;
+            alpha_1[1] = 0;
+        } else {
+            __asm__ volatile (
+                "mul.s   %[temp_real], %[temp4],     %[temp2]            \n\t"
+                "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3]  \n\t"
+                "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1]  \n\t"
+                "mul.s   %[temp_im],   %[temp4],     %[temp3]            \n\t"
+                "madd.s  %[temp_im],   %[temp_im],   %[temp5], %[temp2]  \n\t"
+                "nmsub.s %[temp_im],   %[temp_im],   %[temp0], %[temp1]  \n\t"
+                "div.s   %[temp_real], %[temp_real], %[dk]               \n\t"
+                "div.s   %[temp_im],   %[temp_im],   %[dk]               \n\t"
+                "swc1    %[temp_real], 0(%[alpha_1])                     \n\t"
+                "swc1    %[temp_im],   4(%[alpha_1])                     \n\t"
+
+                : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
+                : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
+                  [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
+                  [temp5]"f"(temp5), [temp6]"f"(temp6),
+                  [alpha_1]"r"(alpha_1), [dk]"f"(dk)
+                : "memory"
+            );
+        }
+
+        if (!phi1[4]) {
+            alpha_0[0] = 0;
+            alpha_0[1] = 0;
+        } else {
+            __asm__ volatile (
+                "lwc1    %[temp6],     0(%[alpha_1])                     \n\t"
+                "lwc1    %[temp7],     4(%[alpha_1])                     \n\t"
+                "mul.s   %[temp_real], %[temp6],     %[temp2]            \n\t"
+                "add.s   %[temp_real], %[temp_real], %[temp4]            \n\t"
+                "madd.s  %[temp_real], %[temp_real], %[temp7], %[temp3]  \n\t"
+                "mul.s   %[temp_im],   %[temp7],     %[temp2]            \n\t"
+                "add.s   %[temp_im],   %[temp_im],   %[temp5]            \n\t"
+                "nmsub.s %[temp_im],   %[temp_im],   %[temp6], %[temp3]  \n\t"
+                "div.s   %[temp_real], %[temp_real], %[temp1]            \n\t"
+                "div.s   %[temp_im],   %[temp_im],   %[temp1]            \n\t"
+                "neg.s   %[temp_real], %[temp_real]                      \n\t"
+                "neg.s   %[temp_im],   %[temp_im]                        \n\t"
+                "swc1    %[temp_real], 0(%[alpha_0])                     \n\t"
+                "swc1    %[temp_im],   4(%[alpha_0])                     \n\t"
+
+                : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+                  [res1]"=&f"(res1), [res2]"=&f"(res2)
+                : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
+                  [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
+                  [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
+                : "memory"
+            );
+        }
+
+        __asm__ volatile (
+            "lwc1    %[temp1],      0(%[alpha_1])                           \n\t"
+            "lwc1    %[temp2],      4(%[alpha_1])                           \n\t"
+            "lwc1    %[temp_real],  0(%[alpha_0])                           \n\t"
+            "lwc1    %[temp_im],    4(%[alpha_0])                           \n\t"
+            "mul.s   %[res1],       %[temp1],      %[temp1]                 \n\t"
+            "madd.s  %[res1],       %[res1],       %[temp2],    %[temp2]    \n\t"
+            "mul.s   %[res2],       %[temp_real],  %[temp_real]             \n\t"
+            "madd.s  %[res2],       %[res2],       %[temp_im],  %[temp_im]  \n\t"
+
+            : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
+              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [res1]"=&f"(res1), [res2]"=&f"(res2)
+            : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
+            : "memory"
+        );
+
+        if (res1 >= 16.0f || res2 >= 16.0f) {
+            alpha_1[0] = 0;
+            alpha_1[1] = 0;
+            alpha_0[0] = 0;
+            alpha_0[1] = 0;
+        }
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->sbr_lf_gen            = sbr_lf_gen_mips;
+    c->sbr_x_gen             = sbr_x_gen_mips;
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
+    c->sbr_hf_assemble       = sbr_hf_assemble_mips;
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacsbr_mips.h b/libavcodec/mips/aacsbr_mips.h
new file mode 100644
index 0000000..4461e76
--- /dev/null
+++ b/libavcodec/mips/aacsbr_mips.h
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacsbr.c
+ */
+
+#ifndef AVCODEC_MIPS_AACSBR_MIPS_H
+#define AVCODEC_MIPS_AACSBR_MIPS_H
+
+#include "libavcodec/aac.h"
+#include "libavcodec/sbr.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void sbr_qmf_analysis_mips(AVFloatDSPContext *fdsp, FFTContext *mdct,
+                             SBRDSPContext *sbrdsp, const float *in, float *x,
+                             float z[320], float W[2][32][32][2], int buf_idx)
+{
+    int i;
+    float *w0;
+    float *w1;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    w0 = x;
+    w1 = x + 1024;
+    for(i = 0; i < 36; i++)
+    {
+        /* loop unrolled 8 times */
+        __asm__ volatile(
+            "lw      %[temp0],   0(%[w1])         \n\t"
+            "lw      %[temp1],   4(%[w1])         \n\t"
+            "lw      %[temp2],   8(%[w1])         \n\t"
+            "lw      %[temp3],   12(%[w1])        \n\t"
+            "lw      %[temp4],   16(%[w1])        \n\t"
+            "lw      %[temp5],   20(%[w1])        \n\t"
+            "lw      %[temp6],   24(%[w1])        \n\t"
+            "lw      %[temp7],   28(%[w1])        \n\t"
+            "sw      %[temp0],   0(%[w0])         \n\t"
+            "sw      %[temp1],   4(%[w0])         \n\t"
+            "sw      %[temp2],   8(%[w0])         \n\t"
+            "sw      %[temp3],   12(%[w0])        \n\t"
+            "sw      %[temp4],   16(%[w0])        \n\t"
+            "sw      %[temp5],   20(%[w0])        \n\t"
+            "sw      %[temp6],   24(%[w0])        \n\t"
+            "sw      %[temp7],   28(%[w0])        \n\t"
+            PTR_ADDIU " %[w0],      %[w0],     32 \n\t"
+            PTR_ADDIU " %[w1],      %[w1],     32 \n\t"
+
+            : [w0]"+r"(w0), [w1]"+r"(w1),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            :
+            : "memory"
+        );
+    }
+
+    w0 = x + 288;
+    w1 = (float*)in;
+    for(i = 0; i < 128; i++)
+    {
+        /* loop unrolled 8 times */
+        __asm__ volatile(
+            "lw       %[temp0],    0(%[w1])        \n\t"
+            "lw       %[temp1],    4(%[w1])        \n\t"
+            "lw       %[temp2],    8(%[w1])        \n\t"
+            "lw       %[temp3],    12(%[w1])       \n\t"
+            "lw       %[temp4],    16(%[w1])       \n\t"
+            "lw       %[temp5],    20(%[w1])       \n\t"
+            "lw       %[temp6],    24(%[w1])       \n\t"
+            "lw       %[temp7],    28(%[w1])       \n\t"
+            "sw       %[temp0],    0(%[w0])        \n\t"
+            "sw       %[temp1],    4(%[w0])        \n\t"
+            "sw       %[temp2],    8(%[w0])        \n\t"
+            "sw       %[temp3],    12(%[w0])       \n\t"
+            "sw       %[temp4],    16(%[w0])       \n\t"
+            "sw       %[temp5],    20(%[w0])       \n\t"
+            "sw       %[temp6],    24(%[w0])       \n\t"
+            "sw       %[temp7],    28(%[w0])       \n\t"
+            PTR_ADDIU "  %[w0],       %[w0],    32 \n\t"
+            PTR_ADDIU "  %[w1],       %[w1],    32 \n\t"
+
+            : [w0]"+r"(w0), [w1]"+r"(w1),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            :
+            : "memory"
+        );
+    }
+
+    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
+                               // are not supported
+        fdsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
+        sbrdsp->sum64x5(z);
+        sbrdsp->qmf_pre_shuffle(z);
+        mdct->imdct_half(mdct, z, z+64);
+        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
+        x += 32;
+    }
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void sbr_qmf_synthesis_mips(FFTContext *mdct,
+                              SBRDSPContext *sbrdsp, AVFloatDSPContext *fdsp,
+                              float *out, float X[2][38][64],
+                              float mdct_buf[2][64],
+                              float *v0, int *v_off, const unsigned int div)
+{
+    int i, n;
+    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
+    const int step = 128 >> div;
+    float *v;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+    float temp14, temp15, temp16, temp17, temp18, temp19;
+    float *vv0, *s0, *dst;
+    dst = out;
+
+    for (i = 0; i < 32; i++) {
+        if (*v_off < step) {
+            int saved_samples = (1280 - 128) >> div;
+            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float));
+            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
+        } else {
+            *v_off -= step;
+        }
+        v = v0 + *v_off;
+        if (div) {
+            for (n = 0; n < 32; n++) {
+                X[0][i][   n] = -X[0][i][n];
+                X[0][i][32+n] =  X[1][i][31-n];
+            }
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
+        } else {
+            sbrdsp->neg_odd_64(X[1][i]);
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
+            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
+        }
+
+        if(div == 0)
+        {
+            float *v0_end;
+            vv0 = v;
+            v0_end = v + 60;
+            s0 = (float*)sbr_qmf_window;
+
+            /* 10 calls of function vector_fmul_add merged into one loop
+               and loop unrolled 4 times */
+            __asm__ volatile(
+                ".set    push                                           \n\t"
+                ".set    noreorder                                      \n\t"
+                "lwc1    %[temp4],   0(%[v0])                           \n\t"
+                "lwc1    %[temp5],   0(%[s0])                           \n\t"
+                "lwc1    %[temp6],   4(%[v0])                           \n\t"
+                "lwc1    %[temp7],   4(%[s0])                           \n\t"
+                "lwc1    %[temp8],   8(%[v0])                           \n\t"
+                "lwc1    %[temp9],   8(%[s0])                           \n\t"
+                "lwc1    %[temp10],  12(%[v0])                          \n\t"
+                "lwc1    %[temp11],  12(%[s0])                          \n\t"
+                "lwc1    %[temp12],  768(%[v0])                         \n\t"
+                "lwc1    %[temp13],  256(%[s0])                         \n\t"
+                "lwc1    %[temp14],  772(%[v0])                         \n\t"
+                "lwc1    %[temp15],  260(%[s0])                         \n\t"
+                "lwc1    %[temp16],  776(%[v0])                         \n\t"
+                "lwc1    %[temp17],  264(%[s0])                         \n\t"
+                "lwc1    %[temp18],  780(%[v0])                         \n\t"
+                "lwc1    %[temp19],  268(%[s0])                         \n\t"
+            "1:                                                         \n\t"
+                "mul.s   %[temp0],   %[temp4],   %[temp5]               \n\t"
+                "lwc1    %[temp4],   1024(%[v0])                        \n\t"
+                "mul.s   %[temp1],   %[temp6],   %[temp7]               \n\t"
+                "lwc1    %[temp5],   512(%[s0])                         \n\t"
+                "mul.s   %[temp2],   %[temp8],   %[temp9]               \n\t"
+                "lwc1    %[temp6],   1028(%[v0])                        \n\t"
+                "mul.s   %[temp3],   %[temp10],  %[temp11]              \n\t"
+                "lwc1    %[temp7],   516(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   1032(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   520(%[s0])                         \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  1036(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  524(%[s0])                         \n\t"
+                "lwc1    %[temp12],  1792(%[v0])                        \n\t"
+                "lwc1    %[temp13],  768(%[s0])                         \n\t"
+                "lwc1    %[temp14],  1796(%[v0])                        \n\t"
+                "lwc1    %[temp15],  772(%[s0])                         \n\t"
+                "lwc1    %[temp16],  1800(%[v0])                        \n\t"
+                "lwc1    %[temp17],  776(%[s0])                         \n\t"
+                "lwc1    %[temp18],  1804(%[v0])                        \n\t"
+                "lwc1    %[temp19],  780(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   2048(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1024(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   2052(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1028(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   2056(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1032(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  2060(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1036(%[s0])                        \n\t"
+                "lwc1    %[temp12],  2816(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1280(%[s0])                        \n\t"
+                "lwc1    %[temp14],  2820(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1284(%[s0])                        \n\t"
+                "lwc1    %[temp16],  2824(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1288(%[s0])                        \n\t"
+                "lwc1    %[temp18],  2828(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1292(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   3072(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1536(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   3076(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1540(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   3080(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1544(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  3084(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1548(%[s0])                        \n\t"
+                "lwc1    %[temp12],  3840(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1792(%[s0])                        \n\t"
+                "lwc1    %[temp14],  3844(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1796(%[s0])                        \n\t"
+                "lwc1    %[temp16],  3848(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1800(%[s0])                        \n\t"
+                "lwc1    %[temp18],  3852(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1804(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   4096(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   2048(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4100(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   2052(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   4104(%[v0])                        \n\t"
+                PTR_ADDIU "%[dst],     %[dst],      16                  \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   2056(%[s0])                        \n\t"
+                PTR_ADDIU " %[s0],      %[s0],      16                  \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  4108(%[v0])                        \n\t"
+                PTR_ADDIU " %[v0],      %[v0],      16                  \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  2044(%[s0])                        \n\t"
+                "lwc1    %[temp12],  4848(%[v0])                        \n\t"
+                "lwc1    %[temp13],  2288(%[s0])                        \n\t"
+                "lwc1    %[temp14],  4852(%[v0])                        \n\t"
+                "lwc1    %[temp15],  2292(%[s0])                        \n\t"
+                "lwc1    %[temp16],  4856(%[v0])                        \n\t"
+                "lwc1    %[temp17],  2296(%[s0])                        \n\t"
+                "lwc1    %[temp18],  4860(%[v0])                        \n\t"
+                "lwc1    %[temp19],  2300(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   0(%[v0])                           \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   0(%[s0])                           \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4(%[v0])                           \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   4(%[s0])                           \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   8(%[v0])                           \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   8(%[s0])                           \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  12(%[v0])                          \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  12(%[s0])                          \n\t"
+                "lwc1    %[temp12],  768(%[v0])                         \n\t"
+                "lwc1    %[temp13],  256(%[s0])                         \n\t"
+                "lwc1    %[temp14],  772(%[v0])                         \n\t"
+                "lwc1    %[temp15],  260(%[s0])                         \n\t"
+                "lwc1    %[temp16],  776(%[v0])                         \n\t"
+                "lwc1    %[temp17],  264(%[s0])                         \n\t"
+                "lwc1    %[temp18],  780(%[v0])                         \n\t"
+                "lwc1    %[temp19],  268(%[s0])                         \n\t"
+                "swc1    %[temp0],   -16(%[dst])                        \n\t"
+                "swc1    %[temp1],   -12(%[dst])                        \n\t"
+                "swc1    %[temp2],   -8(%[dst])                         \n\t"
+                "bne     %[v0],      %[v0_end],  1b                     \n\t"
+                " swc1   %[temp3],   -4(%[dst])                         \n\t"
+                "mul.s   %[temp0],   %[temp4],   %[temp5]               \n\t"
+                "lwc1    %[temp4],   1024(%[v0])                        \n\t"
+                "mul.s   %[temp1],   %[temp6],   %[temp7]               \n\t"
+                "lwc1    %[temp5],   512(%[s0])                         \n\t"
+                "mul.s   %[temp2],   %[temp8],   %[temp9]               \n\t"
+                "lwc1    %[temp6],   1028(%[v0])                        \n\t"
+                "mul.s   %[temp3],   %[temp10],  %[temp11]              \n\t"
+                "lwc1    %[temp7],   516(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   1032(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   520(%[s0])                         \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  1036(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  524(%[s0])                         \n\t"
+                "lwc1    %[temp12],  1792(%[v0])                        \n\t"
+                "lwc1    %[temp13],  768(%[s0])                         \n\t"
+                "lwc1    %[temp14],  1796(%[v0])                        \n\t"
+                "lwc1    %[temp15],  772(%[s0])                         \n\t"
+                "lwc1    %[temp16],  1800(%[v0])                        \n\t"
+                "lwc1    %[temp17],  776(%[s0])                         \n\t"
+                "lwc1    %[temp18],  1804(%[v0])                        \n\t"
+                "lwc1    %[temp19],  780(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   2048(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1024(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   2052(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1028(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   2056(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1032(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  2060(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1036(%[s0])                        \n\t"
+                "lwc1    %[temp12],  2816(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1280(%[s0])                        \n\t"
+                "lwc1    %[temp14],  2820(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1284(%[s0])                        \n\t"
+                "lwc1    %[temp16],  2824(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1288(%[s0])                        \n\t"
+                "lwc1    %[temp18],  2828(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1292(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   3072(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1536(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   3076(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1540(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   3080(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1544(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  3084(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1548(%[s0])                        \n\t"
+                "lwc1    %[temp12],  3840(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1792(%[s0])                        \n\t"
+                "lwc1    %[temp14],  3844(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1796(%[s0])                        \n\t"
+                "lwc1    %[temp16],  3848(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1800(%[s0])                        \n\t"
+                "lwc1    %[temp18],  3852(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1804(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   4096(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   2048(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4100(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   2052(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   4104(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   2056(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  4108(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  2060(%[s0])                        \n\t"
+                "lwc1    %[temp12],  4864(%[v0])                        \n\t"
+                "lwc1    %[temp13],  2304(%[s0])                        \n\t"
+                "lwc1    %[temp14],  4868(%[v0])                        \n\t"
+                "lwc1    %[temp15],  2308(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp16],  4872(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp17],  2312(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp18],  4876(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp19],  2316(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                PTR_ADDIU "%[dst],     %[dst],     16                   \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "swc1    %[temp0],   -16(%[dst])                        \n\t"
+                "swc1    %[temp1],   -12(%[dst])                        \n\t"
+                "swc1    %[temp2],   -8(%[dst])                         \n\t"
+                "swc1    %[temp3],   -4(%[dst])                         \n\t"
+                ".set    pop                                            \n\t"
+
+                : [dst]"+r"(dst), [v0]"+r"(vv0), [s0]"+r"(s0),
+                  [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+                  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+                  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+                  [temp12]"=&f"(temp12), [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+                  [temp15]"=&f"(temp15), [temp16]"=&f"(temp16), [temp17]"=&f"(temp17),
+                  [temp18]"=&f"(temp18), [temp19]"=&f"(temp19)
+                : [v0_end]"r"(v0_end)
+                : "memory"
+            );
+        }
+        else
+        {
+            fdsp->vector_fmul   (out, v                , sbr_qmf_window                       , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
+            out += 64 >> div;
+        }
+    }
+}
+
+#define sbr_qmf_analysis sbr_qmf_analysis_mips
+#define sbr_qmf_synthesis sbr_qmf_synthesis_mips
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_AACSBR_MIPS_H */
diff --git a/libavcodec/mips/ac3dsp_mips.c b/libavcodec/mips/ac3dsp_mips.c
new file mode 100644
index 0000000..e5cee16
--- /dev/null
+++ b/libavcodec/mips/ac3dsp_mips.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Branimir Vasic (bvasic@mips.com)
+ *           Nedeljko Babic (nbabic@mips.com)
+ *
+ * Various AC-3 DSP Utils optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/ac3dsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/ac3dsp.h"
+#include "libavcodec/ac3.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSDSP
+static void ac3_bit_alloc_calc_bap_mips(int16_t *mask, int16_t *psd,
+                                        int start, int end,
+                                        int snr_offset, int floor,
+                                        const uint8_t *bap_tab, uint8_t *bap)
+{
+    int band, band_end, cond;
+    int m, address1, address2;
+    int16_t *psd1, *psd_end;
+    uint8_t *bap1;
+
+    if (snr_offset == -960) {
+        memset(bap, 0, AC3_MAX_COEFS);
+        return;
+    }
+
+    psd1 = &psd[start];
+    bap1 = &bap[start];
+    band = ff_ac3_bin_to_band_tab[start];
+
+    do {
+        m = (FFMAX(mask[band] - snr_offset - floor, 0) & 0x1FE0) + floor;
+        band_end = ff_ac3_band_start_tab[++band];
+        band_end = FFMIN(band_end, end);
+        psd_end = psd + band_end - 1;
+
+        __asm__ volatile (
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t"
+            "beqz       %[cond],        1f                          \n\t"
+            "2:                                                     \n\t"
+            "lh         %[address1],    0(%[psd1])                  \n\t"
+            "lh         %[address2],    2(%[psd1])                  \n\t"
+            PTR_ADDIU " %[psd1],        %[psd1],        4           \n\t"
+            "subu       %[address1],    %[address1],    %[m]        \n\t"
+            "sra        %[address1],    %[address1],    5           \n\t"
+            "addiu      %[address1],    %[address1],    -32         \n\t"
+            "shll_s.w   %[address1],    %[address1],    26          \n\t"
+            "subu       %[address2],    %[address2],    %[m]        \n\t"
+            "sra        %[address2],    %[address2],    5           \n\t"
+            "sra        %[address1],    %[address1],    26          \n\t"
+            "addiu      %[address1],    %[address1],    32          \n\t"
+            "lbux       %[address1],    %[address1](%[bap_tab])     \n\t"
+            "addiu      %[address2],    %[address2],    -32         \n\t"
+            "shll_s.w   %[address2],    %[address2],    26          \n\t"
+            "sb         %[address1],    0(%[bap1])                  \n\t"
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t"
+            "sra        %[address2],    %[address2],    26          \n\t"
+            "addiu      %[address2],    %[address2],    32          \n\t"
+            "lbux       %[address2],    %[address2](%[bap_tab])     \n\t"
+            "sb         %[address2],    1(%[bap1])                  \n\t"
+            PTR_ADDIU " %[bap1],        %[bap1],        2           \n\t"
+            "bnez       %[cond],        2b                          \n\t"
+            PTR_ADDIU " %[psd_end],     %[psd_end],     2           \n\t"
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t"
+            "beqz       %[cond],        3f                          \n\t"
+            "1:                                                     \n\t"
+            "lh         %[address1],    0(%[psd1])                  \n\t"
+            PTR_ADDIU " %[psd1],        %[psd1],        2           \n\t"
+            "subu       %[address1],    %[address1],    %[m]        \n\t"
+            "sra        %[address1],    %[address1],    5           \n\t"
+            "addiu      %[address1],    %[address1],    -32         \n\t"
+            "shll_s.w   %[address1],    %[address1],    26          \n\t"
+            "sra        %[address1],    %[address1],    26          \n\t"
+            "addiu      %[address1],    %[address1],    32          \n\t"
+            "lbux       %[address1],    %[address1](%[bap_tab])     \n\t"
+            "sb         %[address1],    0(%[bap1])                  \n\t"
+            PTR_ADDIU " %[bap1],        %[bap1],        1           \n\t"
+            "3:                                                     \n\t"
+
+            : [address1]"=&r"(address1), [address2]"=&r"(address2),
+              [cond]"=&r"(cond), [bap1]"+r"(bap1),
+              [psd1]"+r"(psd1), [psd_end]"+r"(psd_end)
+            : [m]"r"(m), [bap_tab]"r"(bap_tab)
+            : "memory"
+        );
+    } while (end > band_end);
+}
+
+static void ac3_update_bap_counts_mips(uint16_t mant_cnt[16], uint8_t *bap,
+                                       int len)
+{
+    void *temp0, *temp2, *temp4, *temp5, *temp6, *temp7;
+    int temp1, temp3;
+
+    __asm__ volatile (
+        "andi   %[temp3],   %[len],         3               \n\t"
+        PTR_ADDU "%[temp2], %[bap],         %[len]          \n\t"
+        PTR_ADDU "%[temp4], %[bap],         %[temp3]        \n\t"
+        "beq    %[temp2],   %[temp4],       4f              \n\t"
+        "1:                                                 \n\t"
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        "lbu    %[temp5],   -2(%[temp2])                    \n\t"
+        "lbu    %[temp6],   -3(%[temp2])                    \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t"
+        PTR_ADDU "%[temp0], %[mant_cnt],    %[temp0]        \n\t"
+        "sll    %[temp5],   %[temp5],       1               \n\t"
+        PTR_ADDU "%[temp5], %[mant_cnt],    %[temp5]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t"
+        "sll    %[temp6],   %[temp6],       1               \n\t"
+        PTR_ADDU "%[temp6], %[mant_cnt],    %[temp6]        \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t"
+        "lhu    %[temp1],   0(%[temp5])                     \n\t"
+        "lbu    %[temp7],   -4(%[temp2])                    \n\t"
+        PTR_ADDIU "%[temp2],%[temp2],       -4              \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp5])                     \n\t"
+        "lhu    %[temp1],   0(%[temp6])                     \n\t"
+        "sll    %[temp7],   %[temp7],       1               \n\t"
+        PTR_ADDU "%[temp7], %[mant_cnt],    %[temp7]        \n\t"
+        "addiu  %[temp1],   %[temp1],1                      \n\t"
+        "sh     %[temp1],   0(%[temp6])                     \n\t"
+        "lhu    %[temp1],   0(%[temp7])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp7])                     \n\t"
+        "bne    %[temp2],   %[temp4],       1b              \n\t"
+        "4:                                                 \n\t"
+        "beqz   %[temp3],   2f                              \n\t"
+        "3:                                                 \n\t"
+        "addiu  %[temp3],   %[temp3],       -1              \n\t"
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        PTR_ADDIU "%[temp2],%[temp2],       -1              \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t"
+        PTR_ADDU "%[temp0], %[mant_cnt],    %[temp0]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t"
+        "bgtz   %[temp3],   3b                              \n\t"
+        "2:                                                 \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
+          [temp6] "=&r" (temp6), [temp7] "=&r" (temp7)
+        : [len] "r" (len), [bap] "r" (bap),
+          [mant_cnt] "r" (mant_cnt)
+        : "memory"
+    );
+}
+#endif
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void float_to_fixed24_mips(int32_t *dst, const float *src, unsigned int len)
+{
+    const float scale = 1 << 24;
+    float src0, src1, src2, src3, src4, src5, src6, src7;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    do {
+        __asm__ volatile (
+            "lwc1       %[src0],    0(%[src])               \n\t"
+            "lwc1       %[src1],    4(%[src])               \n\t"
+            "lwc1       %[src2],    8(%[src])               \n\t"
+            "lwc1       %[src3],    12(%[src])              \n\t"
+            "lwc1       %[src4],    16(%[src])              \n\t"
+            "lwc1       %[src5],    20(%[src])              \n\t"
+            "lwc1       %[src6],    24(%[src])              \n\t"
+            "lwc1       %[src7],    28(%[src])              \n\t"
+            "mul.s      %[src0],    %[src0],    %[scale]    \n\t"
+            "mul.s      %[src1],    %[src1],    %[scale]    \n\t"
+            "mul.s      %[src2],    %[src2],    %[scale]    \n\t"
+            "mul.s      %[src3],    %[src3],    %[scale]    \n\t"
+            "mul.s      %[src4],    %[src4],    %[scale]    \n\t"
+            "mul.s      %[src5],    %[src5],    %[scale]    \n\t"
+            "mul.s      %[src6],    %[src6],    %[scale]    \n\t"
+            "mul.s      %[src7],    %[src7],    %[scale]    \n\t"
+            "cvt.w.s    %[src0],    %[src0]                 \n\t"
+            "cvt.w.s    %[src1],    %[src1]                 \n\t"
+            "cvt.w.s    %[src2],    %[src2]                 \n\t"
+            "cvt.w.s    %[src3],    %[src3]                 \n\t"
+            "cvt.w.s    %[src4],    %[src4]                 \n\t"
+            "cvt.w.s    %[src5],    %[src5]                 \n\t"
+            "cvt.w.s    %[src6],    %[src6]                 \n\t"
+            "cvt.w.s    %[src7],    %[src7]                 \n\t"
+            "mfc1       %[temp0],   %[src0]                 \n\t"
+            "mfc1       %[temp1],   %[src1]                 \n\t"
+            "mfc1       %[temp2],   %[src2]                 \n\t"
+            "mfc1       %[temp3],   %[src3]                 \n\t"
+            "mfc1       %[temp4],   %[src4]                 \n\t"
+            "mfc1       %[temp5],   %[src5]                 \n\t"
+            "mfc1       %[temp6],   %[src6]                 \n\t"
+            "mfc1       %[temp7],   %[src7]                 \n\t"
+            "sw         %[temp0],   0(%[dst])               \n\t"
+            "sw         %[temp1],   4(%[dst])               \n\t"
+            "sw         %[temp2],   8(%[dst])               \n\t"
+            "sw         %[temp3],   12(%[dst])              \n\t"
+            "sw         %[temp4],   16(%[dst])              \n\t"
+            "sw         %[temp5],   20(%[dst])              \n\t"
+            "sw         %[temp6],   24(%[dst])              \n\t"
+            "sw         %[temp7],   28(%[dst])              \n\t"
+
+            : [dst] "+r" (dst), [src] "+r" (src),
+              [src0] "=&f" (src0), [src1] "=&f" (src1),
+              [src2] "=&f" (src2), [src3] "=&f" (src3),
+              [src4] "=&f" (src4), [src5] "=&f" (src5),
+              [src6] "=&f" (src6), [src7] "=&f" (src7),
+              [temp0] "=r" (temp0), [temp1] "=r" (temp1),
+              [temp2] "=r" (temp2), [temp3] "=r" (temp3),
+              [temp4] "=r" (temp4), [temp5] "=r" (temp5),
+              [temp6] "=r" (temp6), [temp7] "=r" (temp7)
+            : [scale] "f" (scale)
+            : "memory"
+        );
+        src = src + 8;
+        dst = dst + 8;
+        len -= 8;
+    } while (len > 0);
+}
+
+static void ac3_downmix_mips(float **samples, float (*matrix)[2],
+                          int out_ch, int in_ch, int len)
+{
+    int i, j, i1, i2, i3;
+    float v0, v1, v2, v3;
+    float v4, v5, v6, v7;
+    float samples0, samples1, samples2, samples3, matrix_j, matrix_j2;
+    float *samples_p, *samples_sw, *matrix_p, **samples_x, **samples_end;
+
+    __asm__ volatile(
+        ".set   push                                                \n\t"
+        ".set   noreorder                                           \n\t"
+
+        "li     %[i1],          2                                   \n\t"
+        "sll    %[len],         2                                   \n\t"
+        "move   %[i],           $zero                               \n\t"
+        "sll    %[j],           %[in_ch],             " PTRLOG "    \n\t"
+
+        "bne    %[out_ch],      %[i1],                  3f          \n\t"   // if (out_ch == 2)
+        " li    %[i2],          1                                   \n\t"
+
+        "2:                                                         \n\t"   // start of the for loop (for (i = 0; i < len; i+=4))
+        "move   %[matrix_p],    %[matrix]                           \n\t"
+        "move   %[samples_x],   %[samples]                          \n\t"
+        "mtc1   $zero,          %[v0]                               \n\t"
+        "mtc1   $zero,          %[v1]                               \n\t"
+        "mtc1   $zero,          %[v2]                               \n\t"
+        "mtc1   $zero,          %[v3]                               \n\t"
+        "mtc1   $zero,          %[v4]                               \n\t"
+        "mtc1   $zero,          %[v5]                               \n\t"
+        "mtc1   $zero,          %[v6]                               \n\t"
+        "mtc1   $zero,          %[v7]                               \n\t"
+        "addiu  %[i1],          %[i],                  4            \n\t"
+        "addiu  %[i2],          %[i],                  8            \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+        "addiu  %[i3],          %[i],                  12           \n\t"
+        PTR_ADDU "%[samples_end],%[samples_x],         %[j]         \n\t"
+        "move   %[samples_sw],  %[samples_p]                        \n\t"
+
+        "1:                                                         \n\t"   // start of the inner for loop (for (j = 0; j < in_ch; j++))
+        "lwc1   %[matrix_j],    0(%[matrix_p])                      \n\t"
+        "lwc1   %[matrix_j2],   4(%[matrix_p])                      \n\t"
+        "lwxc1  %[samples0],    %[i](%[samples_p])                  \n\t"
+        "lwxc1  %[samples1],    %[i1](%[samples_p])                 \n\t"
+        "lwxc1  %[samples2],    %[i2](%[samples_p])                 \n\t"
+        "lwxc1  %[samples3],    %[i3](%[samples_p])                 \n\t"
+        PTR_ADDIU "%[matrix_p], 8                                   \n\t"
+        PTR_ADDIU "%[samples_x]," PTRSIZE "                         \n\t"
+        "madd.s %[v0],          %[v0],  %[samples0],    %[matrix_j] \n\t"
+        "madd.s %[v1],          %[v1],  %[samples1],    %[matrix_j] \n\t"
+        "madd.s %[v2],          %[v2],  %[samples2],    %[matrix_j] \n\t"
+        "madd.s %[v3],          %[v3],  %[samples3],    %[matrix_j] \n\t"
+        "madd.s %[v4],          %[v4],  %[samples0],    %[matrix_j2]\n\t"
+        "madd.s %[v5],          %[v5],  %[samples1],    %[matrix_j2]\n\t"
+        "madd.s %[v6],          %[v6],  %[samples2],    %[matrix_j2]\n\t"
+        "madd.s %[v7],          %[v7],  %[samples3],    %[matrix_j2]\n\t"
+        "bne    %[samples_x],   %[samples_end],         1b          \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+
+        PTR_L " %[samples_p],  " PTRSIZE "(%[samples])              \n\t"
+        "swxc1  %[v0],          %[i](%[samples_sw])                 \n\t"
+        "swxc1  %[v1],          %[i1](%[samples_sw])                \n\t"
+        "swxc1  %[v2],          %[i2](%[samples_sw])                \n\t"
+        "swxc1  %[v3],          %[i3](%[samples_sw])                \n\t"
+        "swxc1  %[v4],          %[i](%[samples_p])                  \n\t"
+        "addiu  %[i],           16                                  \n\t"
+        "swxc1  %[v5],          %[i1](%[samples_p])                 \n\t"
+        "swxc1  %[v6],          %[i2](%[samples_p])                 \n\t"
+        "bne    %[i],           %[len],                 2b          \n\t"
+        " swxc1 %[v7],          %[i3](%[samples_p])                 \n\t"
+
+        "3:                                                         \n\t"
+        "bne    %[out_ch],      %[i2],                  6f          \n\t"   // if (out_ch == 1)
+        " nop                                                       \n\t"
+
+        "5:                                                         \n\t"   // start of the outer for loop (for (i = 0; i < len; i+=4))
+        "move   %[matrix_p],    %[matrix]                           \n\t"
+        "move   %[samples_x],   %[samples]                          \n\t"
+        "mtc1   $zero,          %[v0]                               \n\t"
+        "mtc1   $zero,          %[v1]                               \n\t"
+        "mtc1   $zero,          %[v2]                               \n\t"
+        "mtc1   $zero,          %[v3]                               \n\t"
+        "addiu  %[i1],          %[i],                  4            \n\t"
+        "addiu  %[i2],          %[i],                  8            \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+        "addiu  %[i3],          %[i],                  12           \n\t"
+        PTR_ADDU "%[samples_end],%[samples_x],         %[j]         \n\t"
+        "move   %[samples_sw],  %[samples_p]                        \n\t"
+
+        "4:                                                         \n\t"   // start of the inner for loop (for (j = 0; j < in_ch; j++))
+        "lwc1   %[matrix_j],    0(%[matrix_p])                      \n\t"
+        "lwxc1  %[samples0],    %[i](%[samples_p])                  \n\t"
+        "lwxc1  %[samples1],    %[i1](%[samples_p])                 \n\t"
+        "lwxc1  %[samples2],    %[i2](%[samples_p])                 \n\t"
+        "lwxc1  %[samples3],    %[i3](%[samples_p])                 \n\t"
+        PTR_ADDIU "%[matrix_p], 8                                   \n\t"
+        PTR_ADDIU "%[samples_x]," PTRSIZE "                         \n\t"
+        "madd.s %[v0],          %[v0],  %[samples0],    %[matrix_j] \n\t"
+        "madd.s %[v1],          %[v1],  %[samples1],    %[matrix_j] \n\t"
+        "madd.s %[v2],          %[v2],  %[samples2],    %[matrix_j] \n\t"
+        "madd.s %[v3],          %[v3],  %[samples3],    %[matrix_j] \n\t"
+        "bne    %[samples_x],   %[samples_end],         4b          \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+
+        "swxc1  %[v0],          %[i](%[samples_sw])                 \n\t"
+        "addiu  %[i],           16                                  \n\t"
+        "swxc1  %[v1],          %[i1](%[samples_sw])                \n\t"
+        "swxc1  %[v2],          %[i2](%[samples_sw])                \n\t"
+        "bne    %[i],           %[len],                 5b          \n\t"
+        " swxc1 %[v3],          %[i3](%[samples_sw])                \n\t"
+        "6:                                                         \n\t"
+
+        ".set   pop"
+        :[samples_p]"=&r"(samples_p), [matrix_j]"=&f"(matrix_j), [matrix_j2]"=&f"(matrix_j2),
+         [samples0]"=&f"(samples0), [samples1]"=&f"(samples1),
+         [samples2]"=&f"(samples2), [samples3]"=&f"(samples3),
+         [v0]"=&f"(v0), [v1]"=&f"(v1), [v2]"=&f"(v2), [v3]"=&f"(v3),
+         [v4]"=&f"(v4), [v5]"=&f"(v5), [v6]"=&f"(v6), [v7]"=&f"(v7),
+         [samples_x]"=&r"(samples_x), [matrix_p]"=&r"(matrix_p),
+         [samples_end]"=&r"(samples_end), [samples_sw]"=&r"(samples_sw),
+         [i1]"=&r"(i1), [i2]"=&r"(i2), [i3]"=&r"(i3), [i]"=&r"(i),
+         [j]"=&r"(j), [len]"+r"(len)
+        :[samples]"r"(samples), [matrix]"r"(matrix),
+         [in_ch]"r"(in_ch), [out_ch]"r"(out_ch)
+        :"memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact) {
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSDSP
+    c->bit_alloc_calc_bap = ac3_bit_alloc_calc_bap_mips;
+    c->update_bap_counts  = ac3_update_bap_counts_mips;
+#endif
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->float_to_fixed24 = float_to_fixed24_mips;
+    //c->downmix          = ac3_downmix_mips;
+#endif
+#endif
+
+#endif
+}
diff --git a/libavcodec/mips/acelp_filters_mips.c b/libavcodec/mips/acelp_filters_mips.c
new file mode 100644
index 0000000..478db85
--- /dev/null
+++ b/libavcodec/mips/acelp_filters_mips.c
@@ -0,0 +1,221 @@
+ /*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * various filters for ACELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/acelp_filters.c
+ */
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/acelp_filters.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ff_acelp_interpolatef_mips(float *out, const float *in,
+                           const float *filter_coeffs, int precision,
+                           int frac_pos, int filter_length, int length)
+{
+    int n, i;
+    int prec = precision * 4;
+    int fc_offset = precision - frac_pos;
+    float in_val_p, in_val_m, fc_val_p, fc_val_m;
+
+    for (n = 0; n < length; n++) {
+        /**
+        * four pointers are defined in order to minimize number of
+        * computations done in inner loop
+        */
+        const float *p_in_p = &in[n];
+        const float *p_in_m = &in[n-1];
+        const float *p_filter_coeffs_p = &filter_coeffs[frac_pos];
+        const float *p_filter_coeffs_m = filter_coeffs + fc_offset;
+        float v = 0;
+
+        for (i = 0; i < filter_length;i++) {
+            __asm__ volatile (
+                "lwc1   %[in_val_p],           0(%[p_in_p])                    \n\t"
+                "lwc1   %[fc_val_p],           0(%[p_filter_coeffs_p])         \n\t"
+                "lwc1   %[in_val_m],           0(%[p_in_m])                    \n\t"
+                "lwc1   %[fc_val_m],           0(%[p_filter_coeffs_m])         \n\t"
+                PTR_ADDIU "%[p_in_p],          %[p_in_p],              4       \n\t"
+                "madd.s %[v],%[v],             %[in_val_p],%[fc_val_p]         \n\t"
+                PTR_ADDIU "%[p_in_m],          %[p_in_m],              -4      \n\t"
+                PTR_ADDU "%[p_filter_coeffs_p],%[p_filter_coeffs_p],   %[prec] \n\t"
+                PTR_ADDU "%[p_filter_coeffs_m],%[p_filter_coeffs_m],   %[prec] \n\t"
+                "madd.s %[v],%[v],%[in_val_m], %[fc_val_m]                     \n\t"
+
+                : [v] "+&f" (v),[p_in_p] "+r" (p_in_p), [p_in_m] "+r" (p_in_m),
+                  [p_filter_coeffs_p] "+r" (p_filter_coeffs_p),
+                  [in_val_p] "=&f" (in_val_p), [in_val_m] "=&f" (in_val_m),
+                  [fc_val_p] "=&f" (fc_val_p), [fc_val_m] "=&f" (fc_val_m),
+                  [p_filter_coeffs_m] "+r" (p_filter_coeffs_m)
+                : [prec] "r" (prec)
+                : "memory"
+            );
+        }
+        out[n] = v;
+    }
+}
+
+static void ff_acelp_apply_order_2_transfer_function_mips(float *out, const float *in,
+                                              const float zero_coeffs[2],
+                                              const float pole_coeffs[2],
+                                              float gain, float mem[2], int n)
+{
+    /**
+    * loop is unrolled eight times
+    */
+
+    __asm__ volatile (
+        "lwc1   $f0,    0(%[mem])                                              \n\t"
+        "blez   %[n],   ff_acelp_apply_order_2_transfer_function_end%=         \n\t"
+        "lwc1   $f1,    4(%[mem])                                              \n\t"
+        "lwc1   $f2,    0(%[pole_coeffs])                                      \n\t"
+        "lwc1   $f3,    4(%[pole_coeffs])                                      \n\t"
+        "lwc1   $f4,    0(%[zero_coeffs])                                      \n\t"
+        "lwc1   $f5,    4(%[zero_coeffs])                                      \n\t"
+
+        "ff_acelp_apply_order_2_transfer_function_madd%=:                      \n\t"
+
+        "lwc1   $f6,    0(%[in])                                               \n\t"
+        "mul.s  $f9,    $f3,      $f1                                          \n\t"
+        "mul.s  $f7,    $f2,      $f0                                          \n\t"
+        "msub.s $f7,    $f7,      %[gain], $f6                                 \n\t"
+        "sub.s  $f7,    $f7,      $f9                                          \n\t"
+        "madd.s $f8,    $f7,      $f4,     $f0                                 \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f1                                 \n\t"
+        "lwc1   $f11,   4(%[in])                                               \n\t"
+        "mul.s  $f12,   $f3,      $f0                                          \n\t"
+        "mul.s  $f13,   $f2,      $f7                                          \n\t"
+        "msub.s $f13,   $f13,     %[gain], $f11                                \n\t"
+        "sub.s  $f13,   $f13,     $f12                                         \n\t"
+        "madd.s $f14,   $f13,     $f4,     $f7                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f0                                 \n\t"
+        "swc1   $f8,    0(%[out])                                              \n\t"
+        "lwc1   $f6,    8(%[in])                                               \n\t"
+        "mul.s  $f9,    $f3,      $f7                                          \n\t"
+        "mul.s  $f15,   $f2,      $f13                                         \n\t"
+        "msub.s $f15,   $f15,     %[gain], $f6                                 \n\t"
+        "sub.s  $f15,   $f15,     $f9                                          \n\t"
+        "madd.s $f8,    $f15,     $f4,     $f13                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f7                                 \n\t"
+        "swc1   $f14,   4(%[out])                                              \n\t"
+        "lwc1   $f11,   12(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f13                                         \n\t"
+        "mul.s  $f16,   $f2,      $f15                                         \n\t"
+        "msub.s $f16,   $f16,     %[gain], $f11                                \n\t"
+        "sub.s  $f16,   $f16,     $f12                                         \n\t"
+        "madd.s $f14,   $f16,     $f4,     $f15                                \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f13                                \n\t"
+        "swc1   $f8,    8(%[out])                                              \n\t"
+        "lwc1   $f6,    16(%[in])                                              \n\t"
+        "mul.s  $f9,    $f3,      $f15                                         \n\t"
+        "mul.s  $f7,    $f2,      $f16                                         \n\t"
+        "msub.s $f7,    $f7,      %[gain], $f6                                 \n\t"
+        "sub.s  $f7,    $f7,      $f9                                          \n\t"
+        "madd.s $f8,    $f7,      $f4,     $f16                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f15                                \n\t"
+        "swc1   $f14,   12(%[out])                                             \n\t"
+        "lwc1   $f11,   20(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f16                                         \n\t"
+        "mul.s  $f13,   $f2,      $f7                                          \n\t"
+        "msub.s $f13,   $f13,     %[gain], $f11                                \n\t"
+        "sub.s  $f13,   $f13,     $f12                                         \n\t"
+        "madd.s $f14,   $f13,     $f4,     $f7                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f16                                \n\t"
+        "swc1   $f8,    16(%[out])                                             \n\t"
+        "lwc1   $f6,    24(%[in])                                              \n\t"
+        "mul.s  $f9,    $f3,      $f7                                          \n\t"
+        "mul.s  $f15,   $f2,      $f13                                         \n\t"
+        "msub.s $f15,   $f15,     %[gain], $f6                                 \n\t"
+        "sub.s  $f1,    $f15,     $f9                                          \n\t"
+        "madd.s $f8,    $f1,      $f4,     $f13                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f7                                 \n\t"
+        "swc1   $f14,   20(%[out])                                             \n\t"
+        "lwc1   $f11,   28(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f13                                         \n\t"
+        "mul.s  $f16,   $f2,      $f1                                          \n\t"
+        "msub.s $f16,   $f16,     %[gain], $f11                                \n\t"
+        "sub.s  $f0,    $f16,     $f12                                         \n\t"
+        "madd.s $f14,   $f0,      $f4,     $f1                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f13                                \n\t"
+        "swc1   $f8,    24(%[out])                                             \n\t"
+        PTR_ADDIU "%[out], 32                                                  \n\t"
+        PTR_ADDIU "%[in],  32                                                  \n\t"
+        "addiu  %[n],   -8                                                     \n\t"
+        "swc1   $f14,   -4(%[out])                                             \n\t"
+        "bnez   %[n],   ff_acelp_apply_order_2_transfer_function_madd%=        \n\t"
+        "swc1   $f1,    4(%[mem])                                              \n\t"
+        "swc1   $f0,    0(%[mem])                                              \n\t"
+
+        "ff_acelp_apply_order_2_transfer_function_end%=:                       \n\t"
+
+         : [out] "+r" (out),
+           [in] "+r" (in), [gain] "+f" (gain),
+           [n] "+r" (n), [mem] "+r" (mem)
+         : [zero_coeffs] "r" (zero_coeffs),
+           [pole_coeffs] "r" (pole_coeffs)
+         : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
+           "$f6", "$f7",  "$f8", "$f9", "$f10", "$f11",
+           "$f12", "$f13", "$f14", "$f15", "$f16", "memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_acelp_filter_init_mips(ACELPFContext *c)
+{
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->acelp_interpolatef                      = ff_acelp_interpolatef_mips;
+    c->acelp_apply_order_2_transfer_function   = ff_acelp_apply_order_2_transfer_function_mips;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/acelp_vectors_mips.c b/libavcodec/mips/acelp_vectors_mips.c
new file mode 100644
index 0000000..0ab2b6a
--- /dev/null
+++ b/libavcodec/mips/acelp_vectors_mips.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * adaptive and fixed codebook vector operations for ACELP-based codecs
+ * optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/acelp_vectors.c
+ */
+#include "config.h"
+#include "libavcodec/acelp_vectors.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ff_weighted_vector_sumf_mips(
+                  float *out, const float *in_a, const float *in_b,
+                  float weight_coeff_a, float weight_coeff_b, int length)
+{
+    const float *a_end = in_a + length;
+
+    /* loop unrolled two times */
+    __asm__ volatile (
+        "blez   %[length], ff_weighted_vector_sumf_end%=                     \n\t"
+
+        "ff_weighted_vector_sumf_madd%=:                                     \n\t"
+        "lwc1   $f0,       0(%[in_a])                                        \n\t"
+        "lwc1   $f3,       4(%[in_a])                                        \n\t"
+        "lwc1   $f1,       0(%[in_b])                                        \n\t"
+        "lwc1   $f4,       4(%[in_b])                                        \n\t"
+        "mul.s  $f2,       %[weight_coeff_a], $f0                            \n\t"
+        "mul.s  $f5,       %[weight_coeff_a], $f3                            \n\t"
+        "madd.s $f2,       $f2,               %[weight_coeff_b], $f1         \n\t"
+        "madd.s $f5,       $f5,               %[weight_coeff_b], $f4         \n\t"
+        PTR_ADDIU "%[in_a],8                                                 \n\t"
+        PTR_ADDIU "%[in_b],8                                                 \n\t"
+        "swc1   $f2,       0(%[out])                                         \n\t"
+        "swc1   $f5,       4(%[out])                                         \n\t"
+        PTR_ADDIU "%[out], 8                                                 \n\t"
+        "bne   %[in_a],    %[a_end],          ff_weighted_vector_sumf_madd%= \n\t"
+
+        "ff_weighted_vector_sumf_end%=:                                      \n\t"
+
+        : [out] "+r" (out), [in_a] "+r" (in_a),   [in_b] "+r" (in_b)
+        : [weight_coeff_a] "f" (weight_coeff_a),
+          [weight_coeff_b] "f" (weight_coeff_b),
+          [length] "r" (length), [a_end]"r"(a_end)
+        : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_acelp_vectors_init_mips(ACELPVContext *c)
+{
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->weighted_vector_sumf = ff_weighted_vector_sumf_mips;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/amrwbdec_mips.c b/libavcodec/mips/amrwbdec_mips.c
new file mode 100644
index 0000000..5dc0543
--- /dev/null
+++ b/libavcodec/mips/amrwbdec_mips.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/amrwbdec.c
+ */
+#include "libavutil/avutil.h"
+#include "libavcodec/amrwbdata.h"
+#include "amrwbdec_mips.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+void ff_hb_fir_filter_mips(float *out, const float fir_coef[HB_FIR_SIZE + 1],
+                          float mem[HB_FIR_SIZE], const float *in)
+{
+    int i;
+    float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples
+
+    memcpy(data, mem, HB_FIR_SIZE * sizeof(float));
+    memcpy(data + HB_FIR_SIZE, in, AMRWB_SFR_SIZE_16k * sizeof(float));
+
+    for (i = 0; i < AMRWB_SFR_SIZE_16k; i++) {
+        float output;
+        float * p_data = (data+i);
+
+        /**
+        * inner loop is entirely unrolled and instructions are scheduled
+        * to minimize pipeline stall
+        */
+        __asm__ volatile(
+            "mtc1       $zero,     %[output]                      \n\t"
+            "lwc1       $f0,       0(%[p_data])                   \n\t"
+            "lwc1       $f1,       0(%[fir_coef])                 \n\t"
+            "lwc1       $f2,       4(%[p_data])                   \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f3,       4(%[fir_coef])                 \n\t"
+            "lwc1       $f4,       8(%[p_data])                   \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       8(%[fir_coef])                 \n\t"
+
+            "lwc1       $f0,       12(%[p_data])                  \n\t"
+            "lwc1       $f1,       12(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f2,       16(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f3,       16(%[fir_coef])                \n\t"
+            "lwc1       $f4,       20(%[p_data])                  \n\t"
+            "lwc1       $f5,       20(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       24(%[p_data])                  \n\t"
+            "lwc1       $f1,       24(%[fir_coef])                \n\t"
+            "lwc1       $f2,       28(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       28(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       32(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       32(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+
+            "lwc1       $f0,       36(%[p_data])                  \n\t"
+            "lwc1       $f1,       36(%[fir_coef])                \n\t"
+            "lwc1       $f2,       40(%[p_data])                  \n\t"
+            "lwc1       $f3,       40(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       44(%[p_data])                  \n\t"
+            "lwc1       $f5,       44(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       48(%[p_data])                  \n\t"
+            "lwc1       $f1,       48(%[fir_coef])                \n\t"
+            "lwc1       $f2,       52(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       52(%[fir_coef])                \n\t"
+            "lwc1       $f4,       56(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       56(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       60(%[p_data])                  \n\t"
+            "lwc1       $f1,       60(%[fir_coef])                \n\t"
+            "lwc1       $f2,       64(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       64(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       68(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       68(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+
+            "lwc1       $f0,       72(%[p_data])                  \n\t"
+            "lwc1       $f1,       72(%[fir_coef])                \n\t"
+            "lwc1       $f2,       76(%[p_data])                  \n\t"
+            "lwc1       $f3,       76(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       80(%[p_data])                  \n\t"
+            "lwc1       $f5,       80(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       84(%[p_data])                  \n\t"
+            "lwc1       $f1,       84(%[fir_coef])                \n\t"
+            "lwc1       $f2,       88(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       88(%[fir_coef])                \n\t"
+            "lwc1       $f4,       92(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       92(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       96(%[p_data])                  \n\t"
+            "lwc1       $f1,       96(%[fir_coef])                \n\t"
+            "lwc1       $f2,       100(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       100(%[fir_coef])               \n\t"
+            "lwc1       $f4,       104(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       104(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       108(%[p_data])                 \n\t"
+            "lwc1       $f1,       108(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f2,       112(%[p_data])                 \n\t"
+            "lwc1       $f3,       112(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       116(%[p_data])                 \n\t"
+            "lwc1       $f5,       116(%[fir_coef])               \n\t"
+            "lwc1       $f0,       120(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f1,       120(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+
+            : [output]"=&f"(output)
+            : [fir_coef]"r"(fir_coef), [p_data]"r"(p_data)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory"
+        );
+        out[i] = output;
+    }
+    memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/mips/amrwbdec_mips.h b/libavcodec/mips/amrwbdec_mips.h
new file mode 100644
index 0000000..a9f66fe
--- /dev/null
+++ b/libavcodec/mips/amrwbdec_mips.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/amrwbdec.c
+ */
+#ifndef AVCODEC_MIPS_AMRWBDEC_MIPS_H
+#define AVCODEC_MIPS_AMRWBDEC_MIPS_H
+#include "config.h"
+
+#if HAVE_MIPSFPU && HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+void ff_hb_fir_filter_mips(float *out, const float fir_coef[],
+                          float mem[], const float *in);
+#define hb_fir_filter ff_hb_fir_filter_mips
+#endif
+#endif
+
+#endif /* AVCODEC_MIPS_AMRWBDEC_MIPS_H  */
diff --git a/libavcodec/mips/blockdsp_init_mips.c b/libavcodec/mips/blockdsp_init_mips.c
new file mode 100644
index 0000000..55ac1c3
--- /dev/null
+++ b/libavcodec/mips/blockdsp_init_mips.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "blockdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void blockdsp_init_msa(BlockDSPContext *c)
+{
+    c->clear_block = ff_clear_block_msa;
+    c->clear_blocks = ff_clear_blocks_msa;
+
+    c->fill_block_tab[0] = ff_fill_block16_msa;
+    c->fill_block_tab[1] = ff_fill_block8_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void blockdsp_init_mmi(BlockDSPContext *c)
+{
+    c->clear_block = ff_clear_block_mmi;
+    c->clear_blocks = ff_clear_blocks_mmi;
+
+    c->fill_block_tab[0] = ff_fill_block16_mmi;
+    c->fill_block_tab[1] = ff_fill_block8_mmi;
+}
+#endif /* HAVE_MMI */
+
+void ff_blockdsp_init_mips(BlockDSPContext *c)
+{
+#if HAVE_MMI
+    blockdsp_init_mmi(c);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    blockdsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/blockdsp_mips.h b/libavcodec/mips/blockdsp_mips.h
new file mode 100644
index 0000000..1742b12
--- /dev/null
+++ b/libavcodec/mips/blockdsp_mips.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_BLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_fill_block16_msa(uint8_t *src, uint8_t val, ptrdiff_t stride, int height);
+void ff_fill_block8_msa(uint8_t *src, uint8_t val, ptrdiff_t stride, int height);
+void ff_clear_block_msa(int16_t *block);
+void ff_clear_blocks_msa(int16_t *block);
+
+void ff_fill_block16_mmi(uint8_t *block, uint8_t value, ptrdiff_t line_size, int h);
+void ff_fill_block8_mmi(uint8_t *block, uint8_t value, ptrdiff_t line_size, int h);
+void ff_clear_block_mmi(int16_t *block);
+void ff_clear_blocks_mmi(int16_t *block);
+
+#endif  // #ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H
diff --git a/libavcodec/mips/blockdsp_mmi.c b/libavcodec/mips/blockdsp_mmi.c
new file mode 100644
index 0000000..68641e2
--- /dev/null
+++ b/libavcodec/mips/blockdsp_mmi.c
@@ -0,0 +1,159 @@
+/*
+ * Loongson SIMD optimized blockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "blockdsp_mips.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_fill_block16_mmi(uint8_t *block, uint8_t value, ptrdiff_t line_size, int h)
+{
+    double ftmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "mtc1       %[value],   %[ftmp0]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [h]"+&r"(h)
+        : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_fill_block8_mmi(uint8_t *block, uint8_t value, ptrdiff_t line_size, int h)
+{
+    double ftmp0;
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "mtc1       %[value],   %[ftmp0]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp0),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [h]"+&r"(h)
+        : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_clear_block_mmi(int16_t *block)
+{
+    double ftmp[2];
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x00)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x10)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x20)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x30)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x40)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x50)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x60)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x70)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
+        : [block]"r"(block)
+        : "memory"
+    );
+}
+
+void ff_clear_blocks_mmi(int16_t *block)
+{
+    double ftmp[2];
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x00)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x10)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x20)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x30)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x40)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x50)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x60)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x70)
+
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x80)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x90)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xa0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xb0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xc0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xd0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xe0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xf0)
+
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x100)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x110)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x120)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x130)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x140)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x150)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x160)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x170)
+
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x180)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x190)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1a0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1b0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1c0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1d0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1e0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1f0)
+
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x200)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x210)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x220)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x230)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x240)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x250)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x260)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x270)
+
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x280)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x290)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2a0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2b0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2c0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2d0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2e0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2f0)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
+        : [block]"r"((uint64_t *)block)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/blockdsp_msa.c b/libavcodec/mips/blockdsp_msa.c
new file mode 100644
index 0000000..2b78c28
--- /dev/null
+++ b/libavcodec/mips/blockdsp_msa.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "blockdsp_mips.h"
+
+static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val,
+                                       int32_t src_stride, int32_t height)
+{
+    int32_t cnt;
+    uint64_t dst0;
+    v16u8 val0;
+
+    val0 = (v16u8) __msa_fill_b(val);
+    dst0 = __msa_copy_u_d((v2i64) val0, 0);
+
+    for (cnt = (height >> 2); cnt--;) {
+        SD4(dst0, dst0, dst0, dst0, src, src_stride);
+        src += (4 * src_stride);
+    }
+}
+
+static void copy_8bit_value_width16_msa(uint8_t *src, uint8_t val,
+                                        int32_t src_stride, int32_t height)
+{
+    int32_t cnt;
+    v16u8 val0;
+
+    val0 = (v16u8) __msa_fill_b(val);
+
+    for (cnt = (height >> 3); cnt--;) {
+        ST_UB8(val0, val0, val0, val0, val0, val0, val0, val0, src, src_stride);
+        src += (8 * src_stride);
+    }
+}
+
+static void memset_zero_16width_msa(uint8_t *src, int32_t stride,
+                                    int32_t height)
+{
+    int8_t cnt;
+    v16u8 zero = { 0 };
+
+    for (cnt = (height / 2); cnt--;) {
+        ST_UB(zero, src);
+        src += stride;
+        ST_UB(zero, src);
+        src += stride;
+    }
+}
+
+void ff_fill_block16_msa(uint8_t *src, uint8_t val, ptrdiff_t stride, int height)
+{
+    copy_8bit_value_width16_msa(src, val, stride, height);
+}
+
+void ff_fill_block8_msa(uint8_t *src, uint8_t val, ptrdiff_t stride, int height)
+{
+    copy_8bit_value_width8_msa(src, val, stride, height);
+}
+
+void ff_clear_block_msa(int16_t *block)
+{
+    memset_zero_16width_msa((uint8_t *) block, 16, 8);
+}
+
+void ff_clear_blocks_msa(int16_t *block)
+{
+    memset_zero_16width_msa((uint8_t *) block, 16, 8 * 6);
+}
diff --git a/libavcodec/mips/cabac.h b/libavcodec/mips/cabac.h
new file mode 100644
index 0000000..82cee29
--- /dev/null
+++ b/libavcodec/mips/cabac.h
@@ -0,0 +1,119 @@
+/*
+ * Loongson SIMD optimized h264chroma
+ *
+ * Copyright (c) 2018 Loongson Technology Corporation Limited
+ * Copyright (c) 2018 Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_CABAC_H
+#define AVCODEC_MIPS_CABAC_H
+
+#include "libavcodec/cabac.h"
+#include "libavutil/mips/mmiutils.h"
+#include "config.h"
+
+#define get_cabac_inline get_cabac_inline_mips
+static av_always_inline int get_cabac_inline(CABACContext *c,
+                                             uint8_t * const state){
+    mips_reg tmp0, tmp1, tmp2, bit;
+
+    __asm__ volatile (
+        "lbu          %[bit],        0(%[state])                   \n\t"
+        "and          %[tmp0],       %[c_range],     0xC0          \n\t"
+        PTR_ADDU     "%[tmp0],       %[tmp0],        %[tmp0]       \n\t"
+        PTR_ADDU     "%[tmp0],       %[tmp0],        %[tables]     \n\t"
+        PTR_ADDU     "%[tmp0],       %[tmp0],        %[bit]        \n\t"
+        /* tmp1: RangeLPS */
+        "lbu          %[tmp1],       %[lps_off](%[tmp0])           \n\t"
+
+        PTR_SUBU     "%[c_range],    %[c_range],     %[tmp1]       \n\t"
+        PTR_SLL      "%[tmp0],       %[c_range],     0x11          \n\t"
+        PTR_SUBU     "%[tmp0],       %[tmp0],        %[c_low]      \n\t"
+
+        /* tmp2: lps_mask */
+        PTR_SRA      "%[tmp2],       %[tmp0],        0x1F          \n\t"
+        /* If tmp0 < 0, lps_mask ==  0xffffffff*/
+        /* If tmp0 >= 0, lps_mask ==  0x00000000*/
+        "beqz         %[tmp2],       1f                            \n\t"
+        PTR_SLL      "%[tmp0],       %[c_range],     0x11          \n\t"
+        PTR_SUBU     "%[c_low],      %[c_low],       %[tmp0]       \n\t"
+        PTR_SUBU     "%[tmp0],       %[tmp1],        %[c_range]    \n\t"
+        PTR_ADDU     "%[c_range],    %[c_range],     %[tmp0]       \n\t"
+        "xor          %[bit],        %[bit],         %[tmp2]       \n\t"
+
+        "1:                                                        \n\t"
+        /* tmp1: *state */
+        PTR_ADDU     "%[tmp0],       %[tables],      %[bit]        \n\t"
+        "lbu          %[tmp1],       %[mlps_off](%[tmp0])          \n\t"
+        /* tmp2: lps_mask */
+        PTR_ADDU     "%[tmp0],       %[tables],      %[c_range]    \n\t"
+        "lbu          %[tmp2],       %[norm_off](%[tmp0])          \n\t"
+
+        "sb           %[tmp1],       0(%[state])                   \n\t"
+        "and          %[bit],        %[bit],         0x01          \n\t"
+        PTR_SLL      "%[c_range],    %[c_range],     %[tmp2]       \n\t"
+        PTR_SLL      "%[c_low],      %[c_low],       %[tmp2]       \n\t"
+
+        "and          %[tmp0],       %[c_low],       %[cabac_mask] \n\t"
+        "bnez         %[tmp0],       1f                            \n\t"
+        PTR_ADDI     "%[tmp0],       %[c_low],       -0X01         \n\t"
+        "xor          %[tmp0],       %[c_low],       %[tmp0]       \n\t"
+        PTR_SRA      "%[tmp0],       %[tmp0],        0x0f          \n\t"
+        PTR_ADDU     "%[tmp0],       %[tmp0],        %[tables]     \n\t"
+        "lbu          %[tmp2],       %[norm_off](%[tmp0])          \n\t"
+#if CABAC_BITS == 16
+        "lbu          %[tmp0],       0(%[c_bytestream])            \n\t"
+        "lbu          %[tmp1],       1(%[c_bytestream])            \n\t"
+        PTR_SLL      "%[tmp0],       %[tmp0],        0x09          \n\t"
+        PTR_SLL      "%[tmp1],       %[tmp1],        0x01          \n\t"
+        PTR_ADDU     "%[tmp0],       %[tmp0],        %[tmp1]       \n\t"
+#else
+        "lbu          %[tmp0],       0(%[c_bytestream])            \n\t"
+        PTR_SLL      "%[tmp0],       %[tmp0],        0x01          \n\t"
+#endif
+        PTR_SUBU     "%[tmp0],       %[tmp0],        %[cabac_mask] \n\t"
+
+        "li           %[tmp1],       0x07                          \n\t"
+        PTR_SUBU     "%[tmp1],       %[tmp1],        %[tmp2]       \n\t"
+        PTR_SLL      "%[tmp0],       %[tmp0],        %[tmp1]       \n\t"
+        PTR_ADDU     "%[c_low],      %[c_low],       %[tmp0]       \n\t"
+
+#if !UNCHECKED_BITSTREAM_READER
+        "bge          %[c_bytestream], %[c_bytestream_end], 1f     \n\t"
+#endif
+        PTR_ADDIU    "%[c_bytestream], %[c_bytestream],     0X02   \n\t"
+        "1:                                                        \n\t"
+    : [bit]"=&r"(bit), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2),
+      [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+      [c_bytestream]"+&r"(c->bytestream)
+    : [state]"r"(state), [tables]"r"(ff_h264_cabac_tables),
+#if !UNCHECKED_BITSTREAM_READER
+      [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+      [lps_off]"i"(H264_LPS_RANGE_OFFSET),
+      [mlps_off]"i"(H264_MLPS_STATE_OFFSET + 128),
+      [norm_off]"i"(H264_NORM_SHIFT_OFFSET),
+      [cabac_mask]"i"(CABAC_MASK)
+    : "memory"
+    );
+
+    return bit;
+}
+
+#endif /* AVCODEC_MIPS_CABAC_H */
diff --git a/libavcodec/mips/celp_filters_mips.c b/libavcodec/mips/celp_filters_mips.c
new file mode 100644
index 0000000..926f1cb
--- /dev/null
+++ b/libavcodec/mips/celp_filters_mips.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * various filters for CELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/celp_filters.c
+ */
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavcodec/celp_filters.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ff_celp_lp_synthesis_filterf_mips(float *out,
+                                  const float *filter_coeffs,
+                                  const float* in, int buffer_length,
+                                  int filter_length)
+{
+    int i,n;
+
+    float out0, out1, out2, out3;
+    float old_out0, old_out1, old_out2, old_out3;
+    float a,b,c;
+    const float *p_filter_coeffs;
+    float *p_out;
+
+    a = filter_coeffs[0];
+    b = filter_coeffs[1];
+    c = filter_coeffs[2];
+    b -= filter_coeffs[0] * filter_coeffs[0];
+    c -= filter_coeffs[1] * filter_coeffs[0];
+    c -= filter_coeffs[0] * b;
+
+    old_out0 = out[-4];
+    old_out1 = out[-3];
+    old_out2 = out[-2];
+    old_out3 = out[-1];
+    for (n = 0; n <= buffer_length - 4; n+=4) {
+        p_filter_coeffs = filter_coeffs;
+        p_out = out;
+
+        out0 = in[0];
+        out1 = in[1];
+        out2 = in[2];
+        out3 = in[3];
+
+        __asm__ volatile(
+            "lwc1       $f2,     8(%[filter_coeffs])                        \n\t"
+            "lwc1       $f1,     4(%[filter_coeffs])                        \n\t"
+            "lwc1       $f0,     0(%[filter_coeffs])                        \n\t"
+            "nmsub.s    %[out0], %[out0],             $f2, %[old_out1]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f2, %[old_out2]      \n\t"
+            "nmsub.s    %[out2], %[out2],             $f2, %[old_out3]      \n\t"
+            "lwc1       $f3,     12(%[filter_coeffs])                       \n\t"
+            "nmsub.s    %[out0], %[out0],             $f1, %[old_out2]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f1, %[old_out3]      \n\t"
+            "nmsub.s    %[out2], %[out2],             $f3, %[old_out2]      \n\t"
+            "nmsub.s    %[out0], %[out0],             $f0, %[old_out3]      \n\t"
+            "nmsub.s    %[out3], %[out3],             $f3, %[old_out3]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f3, %[old_out1]      \n\t"
+            "nmsub.s    %[out0], %[out0],             $f3, %[old_out0]      \n\t"
+
+            : [out0]"+f"(out0), [out1]"+f"(out1),
+              [out2]"+f"(out2), [out3]"+f"(out3)
+            : [old_out0]"f"(old_out0), [old_out1]"f"(old_out1),
+              [old_out2]"f"(old_out2), [old_out3]"f"(old_out3),
+              [filter_coeffs]"r"(filter_coeffs)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "memory"
+        );
+
+        for (i = 5; i <= filter_length; i += 2) {
+            __asm__ volatile(
+                "lwc1    %[old_out3], -20(%[p_out])                         \n\t"
+                "lwc1    $f5,         16(%[p_filter_coeffs])                \n\t"
+                PTR_ADDIU "%[p_out],  -8                                    \n\t"
+                PTR_ADDIU "%[p_filter_coeffs], 8                            \n\t"
+                "nmsub.s %[out1],     %[out1],      $f5, %[old_out0]        \n\t"
+                "nmsub.s %[out3],     %[out3],      $f5, %[old_out2]        \n\t"
+                "lwc1    $f4,         12(%[p_filter_coeffs])                \n\t"
+                "lwc1    %[old_out2], -16(%[p_out])                         \n\t"
+                "nmsub.s %[out0],     %[out0],      $f5, %[old_out3]        \n\t"
+                "nmsub.s %[out2],     %[out2],      $f5, %[old_out1]        \n\t"
+                "nmsub.s %[out1],     %[out1],      $f4, %[old_out3]        \n\t"
+                "nmsub.s %[out3],     %[out3],      $f4, %[old_out1]        \n\t"
+                "mov.s   %[old_out1], %[old_out3]                           \n\t"
+                "nmsub.s %[out0],     %[out0],      $f4, %[old_out2]        \n\t"
+                "nmsub.s %[out2],     %[out2],      $f4, %[old_out0]        \n\t"
+
+                : [out0]"+f"(out0), [out1]"+f"(out1),
+                  [out2]"+f"(out2), [out3]"+f"(out3), [old_out0]"+f"(old_out0),
+                  [old_out1]"+f"(old_out1), [old_out2]"+f"(old_out2),
+                  [old_out3]"+f"(old_out3),[p_filter_coeffs]"+r"(p_filter_coeffs),
+                  [p_out]"+r"(p_out)
+                :
+                : "$f4", "$f5", "memory"
+            );
+            FFSWAP(float, old_out0, old_out2);
+        }
+
+        __asm__ volatile(
+            "nmsub.s    %[out3], %[out3], %[a], %[out2]                     \n\t"
+            "nmsub.s    %[out2], %[out2], %[a], %[out1]                     \n\t"
+            "nmsub.s    %[out3], %[out3], %[b], %[out1]                     \n\t"
+            "nmsub.s    %[out1], %[out1], %[a], %[out0]                     \n\t"
+            "nmsub.s    %[out2], %[out2], %[b], %[out0]                     \n\t"
+            "nmsub.s    %[out3], %[out3], %[c], %[out0]                     \n\t"
+
+            : [out0]"+f"(out0), [out1]"+f"(out1),
+              [out2]"+f"(out2), [out3]"+f"(out3)
+            : [a]"f"(a), [b]"f"(b), [c]"f"(c)
+        );
+
+        out[0] = out0;
+        out[1] = out1;
+        out[2] = out2;
+        out[3] = out3;
+
+        old_out0 = out0;
+        old_out1 = out1;
+        old_out2 = out2;
+        old_out3 = out3;
+
+        out += 4;
+        in  += 4;
+    }
+
+    out -= n;
+    in -= n;
+    for (; n < buffer_length; n++) {
+        float out_val, out_val_i, fc_val;
+        p_filter_coeffs = filter_coeffs;
+        p_out = &out[n];
+        out_val = in[n];
+        for (i = 1; i <= filter_length; i++) {
+            __asm__ volatile(
+                "lwc1    %[fc_val],          0(%[p_filter_coeffs])                        \n\t"
+                "lwc1    %[out_val_i],       -4(%[p_out])                                 \n\t"
+                PTR_ADDIU "%[p_filter_coeffs], 4                                          \n\t"
+                PTR_ADDIU "%[p_out],         -4                                           \n\t"
+                "nmsub.s %[out_val],         %[out_val],          %[fc_val], %[out_val_i] \n\t"
+
+                : [fc_val]"=&f"(fc_val), [out_val]"+f"(out_val),
+                  [out_val_i]"=&f"(out_val_i), [p_out]"+r"(p_out),
+                  [p_filter_coeffs]"+r"(p_filter_coeffs)
+                :
+                : "memory"
+            );
+        }
+        out[n] = out_val;
+    }
+}
+
+static void ff_celp_lp_zero_synthesis_filterf_mips(float *out,
+                                       const float *filter_coeffs,
+                                       const float *in, int buffer_length,
+                                       int filter_length)
+{
+    int i,n;
+    float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val;
+    float sum_out3, sum_out2, sum_out1;
+    const float *p_filter_coeffs, *p_in;
+
+    for (n = 0; n < buffer_length; n+=8) {
+        p_in = &in[n];
+        p_filter_coeffs = filter_coeffs;
+        sum_out8 = in[n+7];
+        sum_out7 = in[n+6];
+        sum_out6 = in[n+5];
+        sum_out5 = in[n+4];
+        sum_out4 = in[n+3];
+        sum_out3 = in[n+2];
+        sum_out2 = in[n+1];
+        sum_out1 = in[n];
+        i = filter_length;
+
+        /* i is always greater than 0
+        * outer loop is unrolled eight times so there is less memory access
+        * inner loop is unrolled two times
+        */
+        __asm__ volatile(
+            "filt_lp_inner%=:                                               \n\t"
+            "lwc1   %[fc_val],   0(%[p_filter_coeffs])                      \n\t"
+            "lwc1   $f7,         6*4(%[p_in])                               \n\t"
+            "lwc1   $f6,         5*4(%[p_in])                               \n\t"
+            "lwc1   $f5,         4*4(%[p_in])                               \n\t"
+            "lwc1   $f4,         3*4(%[p_in])                               \n\t"
+            "lwc1   $f3,         2*4(%[p_in])                               \n\t"
+            "lwc1   $f2,         4(%[p_in])                                 \n\t"
+            "lwc1   $f1,         0(%[p_in])                                 \n\t"
+            "lwc1   $f0,         -4(%[p_in])                                \n\t"
+            "addiu  %[i],        -2                                         \n\t"
+            "madd.s %[sum_out8], %[sum_out8],          %[fc_val], $f7       \n\t"
+            "madd.s %[sum_out7], %[sum_out7],          %[fc_val], $f6       \n\t"
+            "madd.s %[sum_out6], %[sum_out6],          %[fc_val], $f5       \n\t"
+            "madd.s %[sum_out5], %[sum_out5],          %[fc_val], $f4       \n\t"
+            "madd.s %[sum_out4], %[sum_out4],          %[fc_val], $f3       \n\t"
+            "madd.s %[sum_out3], %[sum_out3],          %[fc_val], $f2       \n\t"
+            "madd.s %[sum_out2], %[sum_out2],          %[fc_val], $f1       \n\t"
+            "madd.s %[sum_out1], %[sum_out1],          %[fc_val], $f0       \n\t"
+            "lwc1   %[fc_val],   4(%[p_filter_coeffs])                      \n\t"
+            "lwc1   $f7,         -8(%[p_in])                                \n\t"
+            PTR_ADDIU "%[p_filter_coeffs], 8                                \n\t"
+            PTR_ADDIU "%[p_in],  -8                                         \n\t"
+            "madd.s %[sum_out8], %[sum_out8],          %[fc_val], $f6       \n\t"
+            "madd.s %[sum_out7], %[sum_out7],          %[fc_val], $f5       \n\t"
+            "madd.s %[sum_out6], %[sum_out6],          %[fc_val], $f4       \n\t"
+            "madd.s %[sum_out5], %[sum_out5],          %[fc_val], $f3       \n\t"
+            "madd.s %[sum_out4], %[sum_out4],          %[fc_val], $f2       \n\t"
+            "madd.s %[sum_out3], %[sum_out3],          %[fc_val], $f1       \n\t"
+            "madd.s %[sum_out2], %[sum_out2],          %[fc_val], $f0       \n\t"
+            "madd.s %[sum_out1], %[sum_out1],          %[fc_val], $f7       \n\t"
+            "bgtz   %[i],        filt_lp_inner%=                            \n\t"
+
+            : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7),
+              [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5),
+              [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3),
+              [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1),
+              [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs),
+              [p_in]"+r"(p_in), [i]"+r"(i)
+            :
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "memory"
+        );
+
+        out[n+7] = sum_out8;
+        out[n+6] = sum_out7;
+        out[n+5] = sum_out6;
+        out[n+4] = sum_out5;
+        out[n+3] = sum_out4;
+        out[n+2] = sum_out3;
+        out[n+1] = sum_out2;
+        out[n] = sum_out1;
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_celp_filter_init_mips(CELPFContext *c)
+{
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->celp_lp_synthesis_filterf        = ff_celp_lp_synthesis_filterf_mips;
+    c->celp_lp_zero_synthesis_filterf   = ff_celp_lp_zero_synthesis_filterf_mips;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/celp_math_mips.c b/libavcodec/mips/celp_math_mips.c
new file mode 100644
index 0000000..ce711bd
--- /dev/null
+++ b/libavcodec/mips/celp_math_mips.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * Math operations optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/celp_math.c
+ */
+#include "config.h"
+#include "libavcodec/celp_math.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static float ff_dot_productf_mips(const float* a, const float* b,
+                                              int length)
+{
+    float sum;
+    const float* a_end = a + length;
+
+    __asm__ volatile (
+        "mtc1   $zero,      %[sum]                              \n\t"
+        "blez   %[length],  ff_dot_productf_end%=               \n\t"
+        "ff_dot_productf_madd%=:                                \n\t"
+        "lwc1   $f2,        0(%[a])                             \n\t"
+        "lwc1   $f1,        0(%[b])                             \n\t"
+        PTR_ADDIU "%[a],    %[a],      4                        \n\t"
+        PTR_ADDIU "%[b],    %[b],      4                        \n\t"
+        "madd.s %[sum],     %[sum],    $f1, $f2                 \n\t"
+        "bne   %[a],        %[a_end],  ff_dot_productf_madd%=   \n\t"
+        "ff_dot_productf_end%=:                                 \n\t"
+
+        : [sum] "=&f" (sum), [a] "+r" (a), [b] "+r" (b)
+        : [a_end]"r"(a_end), [length] "r" (length)
+        : "$f1", "$f2", "memory"
+    );
+    return sum;
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_celp_math_init_mips(CELPMContext *c)
+{
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->dot_productf = ff_dot_productf_mips;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/compute_antialias_fixed.h b/libavcodec/mips/compute_antialias_fixed.h
new file mode 100644
index 0000000..a967f67
--- /dev/null
+++ b/libavcodec/mips/compute_antialias_fixed.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * Compute antialias function optimised for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodec.c
+ */
+
+#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H
+#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H
+
+#if HAVE_INLINE_ASM
+static void compute_antialias_mips_fixed(MPADecodeContext *s,
+                                        GranuleDef *g)
+{
+    int32_t *ptr, *csa;
+    int n, i;
+    int MAX_lo = 0xffffffff;
+
+    /* we antialias only "long" bands */
+    if (g->block_type == 2) {
+        if (!g->switch_point)
+            return;
+        /* XXX: check this for 8000Hz case */
+        n = 1;
+    } else {
+        n = SBLIMIT - 1;
+    }
+
+
+    ptr = g->sb_hybrid + 18;
+
+    for(i = n;i > 0;i--) {
+        int tmp0, tmp1, tmp2, tmp00, tmp11;
+        int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
+        csa = &csa_table[0][0];
+
+        /**
+         * instructions are scheduled to minimize pipeline stall.
+         */
+        __asm__ volatile (
+            "lw   %[tmp0],      -1*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      0*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 0*4(%[csa])                             \n\t"
+            "lw   %[temp_reg2], 2*4(%[csa])                             \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 3*4(%[csa])                             \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "lw   %[tmp00],     -2*4(%[ptr])                            \n\t"
+            "lw   %[tmp11],     1*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg4], 4*4(%[csa])                             \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 6*4(%[csa])                             \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg6], 7*4(%[csa])                             \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sw   %[temp_reg1], -1*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "sw   %[temp_reg2], 0*4(%[ptr])                             \n\t"
+            "lw   %[tmp0],      -3*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      2*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 8*4(%[csa])                             \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sw   %[temp_reg4], -2*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg5], 1*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg2], 10*4(%[csa])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "lw   %[temp_reg3], 11*4(%[csa])                            \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "lw   %[tmp00],     -4*4(%[ptr])                            \n\t"
+            "lw   %[tmp11],     3*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[temp_reg4], 12*4(%[csa])                            \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg5], 14*4(%[csa])                            \n\t"
+            "lw   %[temp_reg6], 15*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "sw   %[temp_reg1], -3*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "sw   %[temp_reg2], 2*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "lw   %[tmp0],      -5*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      4*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 16*4(%[csa])                            \n\t"
+            "lw   %[temp_reg2], 18*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 19*4(%[csa])                            \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -4*4(%[ptr])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "sw   %[temp_reg5], 3*4(%[ptr])                             \n\t"
+            "lw   %[tmp00],     -6*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[tmp11],     5*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg4], 20*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 22*4(%[csa])                            \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg6], 23*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sw   %[temp_reg1], -5*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg2], 4*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "lw   %[tmp0],      -7*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "lw   %[tmp1],      6*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 24*4(%[csa])                            \n\t"
+            "lw   %[temp_reg2], 26*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 27*4(%[csa])                            \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -6*4(%[ptr])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "sw   %[temp_reg5], 5*4(%[ptr])                             \n\t"
+            "lw   %[tmp00],     -8*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[tmp11],     7*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg4], 28*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 30*4(%[csa])                            \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg6], 31*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sw   %[temp_reg1], -7*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg2], 6*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -8*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg5], 7*4(%[ptr])                             \n\t"
+
+            : [tmp0] "=&r" (tmp0), [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+              [tmp00] "=&r" (tmp00), [tmp11] "=&r" (tmp11),
+              [temp_reg1] "=&r" (temp_reg1), [temp_reg2] "=&r" (temp_reg2),
+              [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
+              [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6)
+            : [csa] "r" (csa), [ptr] "r" (ptr),
+              [MAX_lo] "r" (MAX_lo)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+         );
+
+        ptr += 18;
+    }
+}
+#define compute_antialias compute_antialias_mips_fixed
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H */
diff --git a/libavcodec/mips/compute_antialias_float.h b/libavcodec/mips/compute_antialias_float.h
new file mode 100644
index 0000000..e2b4f29
--- /dev/null
+++ b/libavcodec/mips/compute_antialias_float.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * Compute antialias function optimised for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodec.c
+ */
+
+#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H
+#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H
+
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void compute_antialias_mips_float(MPADecodeContext *s,
+                                        GranuleDef *g)
+{
+    float *ptr, *ptr_end;
+    float *csa = &csa_table[0][0];
+    /* temporary variables */
+    float in1, in2, in3, in4, in5, in6, in7, in8;
+    float out1, out2, out3, out4;
+
+    ptr = g->sb_hybrid + 18;
+    /* we antialias only "long" bands */
+    if (g->block_type == 2) {
+        if (!g->switch_point)
+            return;
+        /* XXX: check this for 8000Hz case */
+        ptr_end = ptr + 18;
+    } else {
+        ptr_end = ptr + 558;
+    }
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    */
+
+    __asm__ volatile (
+        "compute_antialias_float_loop%=:                                \t\n"
+        "lwc1    %[in1],  -1*4(%[ptr])                                  \t\n"
+        "lwc1    %[in2],  0(%[csa])                                     \t\n"
+        "lwc1    %[in3],  1*4(%[csa])                                   \t\n"
+        "lwc1    %[in4],  0(%[ptr])                                     \t\n"
+        "lwc1    %[in5],  -2*4(%[ptr])                                  \t\n"
+        "lwc1    %[in6],  4*4(%[csa])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in7],  5*4(%[csa])                                   \t\n"
+        "lwc1    %[in8],  1*4(%[ptr])                                   \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "lwc1    %[in1],  -3*4(%[ptr])                                  \t\n"
+        "swc1    %[out1], -1*4(%[ptr])                                  \t\n"
+        "swc1    %[out2], 0(%[ptr])                                     \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "lwc1    %[in2],  8*4(%[csa])                                   \t\n"
+        "swc1    %[out3], -2*4(%[ptr])                                  \t\n"
+        "swc1    %[out4], 1*4(%[ptr])                                   \t\n"
+        "lwc1    %[in3],  9*4(%[csa])                                   \t\n"
+        "lwc1    %[in4],  2*4(%[ptr])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "lwc1    %[in5],  -4*4(%[ptr])                                  \t\n"
+        "lwc1    %[in6],  12*4(%[csa])                                  \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in7],  13*4(%[csa])                                  \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "lwc1    %[in8],  3*4(%[ptr])                                   \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "swc1    %[out1], -3*4(%[ptr])                                  \t\n"
+        "lwc1    %[in1],  -5*4(%[ptr])                                  \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "swc1    %[out2], 2*4(%[ptr])                                   \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "lwc1    %[in2],  16*4(%[csa])                                  \t\n"
+        "lwc1    %[in3],  17*4(%[csa])                                  \t\n"
+        "swc1    %[out3], -4*4(%[ptr])                                  \t\n"
+        "lwc1    %[in4],  4*4(%[ptr])                                   \t\n"
+        "swc1    %[out4], 3*4(%[ptr])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in5],  -6*4(%[ptr])                                  \t\n"
+        "lwc1    %[in6],  20*4(%[csa])                                  \t\n"
+        "lwc1    %[in7],  21*4(%[csa])                                  \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "lwc1    %[in8],  5*4(%[ptr])                                   \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "swc1    %[out1], -5*4(%[ptr])                                  \t\n"
+        "swc1    %[out2], 4*4(%[ptr])                                   \t\n"
+        "lwc1    %[in1],  -7*4(%[ptr])                                  \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "lwc1    %[in2],  24*4(%[csa])                                  \t\n"
+        "lwc1    %[in3],  25*4(%[csa])                                  \t\n"
+        "lwc1    %[in4],  6*4(%[ptr])                                   \t\n"
+        "swc1    %[out3], -6*4(%[ptr])                                  \t\n"
+        "swc1    %[out4], 5*4(%[ptr])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "lwc1    %[in5],  -8*4(%[ptr])                                  \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in6],  28*4(%[csa])                                  \t\n"
+        "lwc1    %[in7],  29*4(%[csa])                                  \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "lwc1    %[in8],  7*4(%[ptr])                                   \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "swc1    %[out1], -7*4(%[ptr])                                  \t\n"
+        "swc1    %[out2], 6*4(%[ptr])                                   \t\n"
+        PTR_ADDIU "%[ptr],%[ptr],  72                                   \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "swc1    %[out3], -26*4(%[ptr])                                 \t\n"
+        "swc1    %[out4], -11*4(%[ptr])                                 \t\n"
+        "bne     %[ptr],  %[ptr_end],  compute_antialias_float_loop%=   \t\n"
+
+        : [ptr] "+r" (ptr),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [in7] "=&f" (in7), [in8] "=&f" (in8),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4)
+        : [csa] "r" (csa), [ptr_end] "r" (ptr_end)
+        : "memory"
+    );
+}
+#define compute_antialias compute_antialias_mips_float
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H */
diff --git a/libavcodec/mips/constants.c b/libavcodec/mips/constants.c
new file mode 100644
index 0000000..a7c4a5c
--- /dev/null
+++ b/libavcodec/mips/constants.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/mem.h"
+#include "constants.h"
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1) =       {0x0001000100010001ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_2) =       {0x0002000200020002ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_3) =       {0x0003000300030003ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_4) =       {0x0004000400040004ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_5) =       {0x0005000500050005ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_6) =       {0x0006000600060006ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_8) =       {0x0008000800080008ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_9) =       {0x0009000900090009ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_10) =      {0x000A000A000A000AULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_12) =      {0x000C000C000C000CULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) =      {0x000F000F000F000FULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_16) =      {0x0010001000100010ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_17) =      {0x0011001100110011ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_18) =      {0x0012001200120012ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) =      {0x0014001400140014ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_22) =      {0x0016001600160016ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_28) =      {0x001C001C001C001CULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_32) =      {0x0020002000200020ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) =      {0x0035003500350035ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_64) =      {0x0040004000400040ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) =     {0x0080008000800080ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_512) =     {0x0200020002000200ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_m8tom5) =  {0xFFFBFFFAFFF9FFF8ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_m4tom1) =  {0xFFFFFFFEFFFDFFFCULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1to4) =    {0x0004000300020001ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_5to8) =    {0x0008000700060005ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_0to3) =    {0x0003000200010000ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_4to7) =    {0x0007000600050004ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_8tob) =    {0x000b000a00090008ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_ctof) =    {0x000f000e000d000cULL};
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_1) =       {0x0101010101010101ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_3) =       {0x0303030303030303ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_80) =      {0x8080808080808080ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1) =      {0xA1A1A1A1A1A1A1A1ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FE) =      {0xFEFEFEFEFEFEFEFEULL};
+
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd) =        {0x0004000400040004ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd2) =       {0x0040004000400040ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd3) =       {0x0020002000200020ULL};
+
+DECLARE_ALIGNED(8, const uint64_t, ff_wm1010) =     {0xFFFF0000FFFF0000ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_d40000) =     {0x0000000000040000ULL};
diff --git a/libavcodec/mips/constants.h b/libavcodec/mips/constants.h
new file mode 100644
index 0000000..2604559
--- /dev/null
+++ b/libavcodec/mips/constants.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_CONSTANTS_H
+#define AVCODEC_MIPS_CONSTANTS_H
+
+#include <stdint.h>
+
+extern const uint64_t ff_pw_1;
+extern const uint64_t ff_pw_2;
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const uint64_t ff_pw_5;
+extern const uint64_t ff_pw_6;
+extern const uint64_t ff_pw_8;
+extern const uint64_t ff_pw_9;
+extern const uint64_t ff_pw_10;
+extern const uint64_t ff_pw_12;
+extern const uint64_t ff_pw_15;
+extern const uint64_t ff_pw_16;
+extern const uint64_t ff_pw_17;
+extern const uint64_t ff_pw_18;
+extern const uint64_t ff_pw_20;
+extern const uint64_t ff_pw_22;
+extern const uint64_t ff_pw_28;
+extern const uint64_t ff_pw_32;
+extern const uint64_t ff_pw_53;
+extern const uint64_t ff_pw_64;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_512;
+extern const uint64_t ff_pw_m8tom5;
+extern const uint64_t ff_pw_m4tom1;
+extern const uint64_t ff_pw_1to4;
+extern const uint64_t ff_pw_5to8;
+extern const uint64_t ff_pw_0to3;
+extern const uint64_t ff_pw_4to7;
+extern const uint64_t ff_pw_8tob;
+extern const uint64_t ff_pw_ctof;
+
+extern const uint64_t ff_pb_1;
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_80;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FE;
+
+extern const uint64_t ff_rnd;
+extern const uint64_t ff_rnd2;
+extern const uint64_t ff_rnd3;
+
+extern const uint64_t ff_wm1010;
+extern const uint64_t ff_d40000;
+
+#endif /* AVCODEC_MIPS_CONSTANTS_H */
diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
new file mode 100644
index 0000000..03dcbad
--- /dev/null
+++ b/libavcodec/mips/fft_mips.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ * Author:  Zoran Lukic (zoranl@mips.com)
+ *
+ * Optimized MDCT/IMDCT and FFT transforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft_table.h"
+#include "libavutil/mips/asmdefs.h"
+
+/**
+ * FFT transform
+ */
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
+{
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    float w_re, w_im;
+    float *w_re_ptr, *w_im_ptr;
+    const int fft_size = (1 << s->nbits);
+    float pom,  pom1,  pom2,  pom3;
+    float temp, temp1, temp3, temp4;
+    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
+    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
+
+    num_transforms = (21845 >> (17 - s->nbits)) | 1;
+
+    for (n=0; n<num_transforms; n++) {
+        offset = ff_fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + tmpz[1].re;
+        tmp5 = tmpz[2].re + tmpz[3].re;
+        tmp2 = tmpz[0].im + tmpz[1].im;
+        tmp6 = tmpz[2].im + tmpz[3].im;
+        tmp3 = tmpz[0].re - tmpz[1].re;
+        tmp8 = tmpz[2].im - tmpz[3].im;
+        tmp4 = tmpz[0].im - tmpz[1].im;
+        tmp7 = tmpz[2].re - tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+
+    }
+
+    if (fft_size < 8)
+        return;
+
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++) {
+        offset = ff_fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        __asm__ volatile (
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
+            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
+            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
+            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
+            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
+            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
+            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
+            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
+            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
+            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
+            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
+            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
+            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
+            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
+            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
+            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
+            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
+            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
+            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
+            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
+            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
+            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
+            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
+            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
+            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
+            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
+            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
+            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
+            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
+            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
+            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
+            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
+            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
+            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
+            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
+            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
+            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
+            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
+            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
+            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
+            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
+            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
+            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
+            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
+            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
+            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
+            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
+            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
+            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
+            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
+            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
+            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
+            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
+            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
+            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
+            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
+            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
+            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
+            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
+            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
+            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
+            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
+            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
+            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
+            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
+            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
+            : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
+              [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5),  [tmp7]"=&f"(tmp7),
+              [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
+            : [tmpz]"r"(tmpz)
+            : "memory"
+        );
+    }
+
+    step = 1 << (MAX_LOG2_NFFT - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++) {
+        num_transforms = (num_transforms >> 1) | 1;
+        n2  = 2 * n4;
+        n34 = 3 * n4;
+
+        for (n=0; n<num_transforms; n++) {
+            offset = ff_fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
+            tmpz_n2  = tmpz +  n2;
+            tmpz_n4  = tmpz +  n4;
+            tmpz_n34 = tmpz +  n34;
+
+            __asm__ volatile (
+                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
+                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
+                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
+                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
+                "lwc1  %[temp1],0(%[tmpz])               \n\t"
+                "lwc1  %[temp3],4(%[tmpz])               \n\t"
+                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
+                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
+                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
+                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
+                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
+                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
+                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
+                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
+                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
+                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
+                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
+                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
+                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
+                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
+                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
+                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
+                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
+                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
+                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
+                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
+                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
+                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
+                : [tmp5]"=&f"(tmp5),
+                  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
+                  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6),          [pom3]"=&f"(pom3),
+                  [temp]"=&f"(temp), [temp1]"=&f"(temp1),     [temp3]"=&f"(temp3),       [temp4]"=&f"(temp4)
+                : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
+                : "memory"
+            );
+
+            w_re_ptr = (float*)(ff_cos_131072 + step);
+            w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step);
+
+            for (i=1; i<n4; i++) {
+                w_re = w_re_ptr[0];
+                w_im = w_im_ptr[0];
+                tmpz_n2_i = tmpz_n2  + i;
+                tmpz_n4_i = tmpz_n4  + i;
+                tmpz_n34_i= tmpz_n34 + i;
+                tmpz_i    = tmpz     + i;
+
+                __asm__ volatile (
+                    "lwc1     %[temp],  0(%[tmpz_n2_i])               \n\t"
+                    "lwc1     %[temp1], 4(%[tmpz_n2_i])               \n\t"
+                    "lwc1     %[pom],   0(%[tmpz_n34_i])              \n\t"
+                    "lwc1     %[pom1],  4(%[tmpz_n34_i])              \n\t"
+                    "mul.s    %[temp3], %[w_im],    %[temp]           \n\t"
+                    "mul.s    %[temp4], %[w_im],    %[temp1]          \n\t"
+                    "mul.s    %[pom2],  %[w_im],    %[pom1]           \n\t"
+                    "mul.s    %[pom3],  %[w_im],    %[pom]            \n\t"
+                    "msub.s   %[tmp2],  %[temp3],   %[w_re], %[temp1] \n\t"  // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
+                    "madd.s   %[tmp1],  %[temp4],   %[w_re], %[temp]  \n\t"  // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
+                    "msub.s   %[tmp3],  %[pom2],    %[w_re], %[pom]   \n\t"  // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
+                    "madd.s   %[tmp4],  %[pom3],    %[w_re], %[pom1]  \n\t"  // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
+                    "lwc1     %[temp],  0(%[tmpz_i])                  \n\t"
+                    "lwc1     %[pom],   4(%[tmpz_i])                  \n\t"
+                    "add.s    %[tmp5],  %[tmp1],    %[tmp3]           \n\t"  // tmp5 = tmp1 + tmp3;
+                    "sub.s    %[tmp1],  %[tmp1],    %[tmp3]           \n\t"  // tmp1 = tmp1 - tmp3;
+                    "add.s    %[tmp6],  %[tmp2],    %[tmp4]           \n\t"  // tmp6 = tmp2 + tmp4;
+                    "sub.s    %[tmp2],  %[tmp2],    %[tmp4]           \n\t"  // tmp2 = tmp2 - tmp4;
+                    "sub.s    %[temp1], %[temp],    %[tmp5]           \n\t"
+                    "add.s    %[temp],  %[temp],    %[tmp5]           \n\t"
+                    "sub.s    %[pom1],  %[pom],     %[tmp6]           \n\t"
+                    "add.s    %[pom],   %[pom],     %[tmp6]           \n\t"
+                    "lwc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"
+                    "lwc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"
+                    "swc1     %[temp1], 0(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                    "swc1     %[temp],  0(%[tmpz_i])                  \n\t"  // tmpz[    i].re = tmpz[   i].re + tmp5;
+                    "swc1     %[pom1],  4(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                    "swc1     %[pom] ,  4(%[tmpz_i])                  \n\t"  // tmpz[    i].im = tmpz[   i].im + tmp6;
+                    "sub.s    %[temp4], %[temp3],   %[tmp2]           \n\t"
+                    "add.s    %[pom3],  %[pom2],    %[tmp1]           \n\t"
+                    "add.s    %[temp3], %[temp3],   %[tmp2]           \n\t"
+                    "sub.s    %[pom2],  %[pom2],    %[tmp1]           \n\t"
+                    "swc1     %[temp4], 0(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                    "swc1     %[pom3],  4(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                    "swc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                    "swc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+                    : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
+                      [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
+                      [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+                      [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
+                    : [w_re]"f"(w_re), [w_im]"f"(w_im),
+                      [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
+                      [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
+                    : "memory"
+                );
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+/**
+ * MDCT/IMDCT transforms.
+ */
+
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k, n8, n4, n2, n, j;
+    const uint16_t *revtab = s->revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2, *in3, *in4;
+    FFTComplex *z = (FFTComplex *)output;
+
+    int j1;
+    const float *tcos1, *tsin1, *tcos2, *tsin2;
+    float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+        temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+    FFTComplex *z1, *z2;
+
+    n = 1 << s->mdct_bits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation */
+    in1 = input;
+    in2 = input + n2 - 1;
+    in3 = input + 2;
+    in4 = input + n2 - 3;
+
+    tcos1 = tcos;
+    tsin1 = tsin;
+
+    /* n4 = 64 or 128 */
+    for(k = 0; k < n4; k += 2) {
+        j  = revtab[k    ];
+        j1 = revtab[k + 1];
+
+        __asm__ volatile (
+            "lwc1           %[temp1],       0(%[in2])                           \t\n"
+            "lwc1           %[temp2],       0(%[tcos1])                         \t\n"
+            "lwc1           %[temp3],       0(%[tsin1])                         \t\n"
+            "lwc1           %[temp4],       0(%[in1])                           \t\n"
+            "lwc1           %[temp5],       0(%[in4])                           \t\n"
+            "mul.s          %[temp9],       %[temp1],   %[temp2]                \t\n"
+            "mul.s          %[temp10],      %[temp1],   %[temp3]                \t\n"
+            "lwc1           %[temp6],       4(%[tcos1])                         \t\n"
+            "lwc1           %[temp7],       4(%[tsin1])                         \t\n"
+            "nmsub.s        %[temp9],       %[temp9],   %[temp4],   %[temp3]    \t\n"
+            "madd.s         %[temp10],      %[temp10],  %[temp4],   %[temp2]    \t\n"
+            "mul.s          %[temp11],      %[temp5],   %[temp6]                \t\n"
+            "mul.s          %[temp12],      %[temp5],   %[temp7]                \t\n"
+            "lwc1           %[temp8],       0(%[in3])                           \t\n"
+            PTR_ADDIU "     %[tcos1],       %[tcos1],   8                       \t\n"
+            PTR_ADDIU "     %[tsin1],       %[tsin1],   8                       \t\n"
+            PTR_ADDIU "     %[in1],         %[in1],     16                      \t\n"
+            "nmsub.s        %[temp11],      %[temp11],  %[temp8],   %[temp7]    \t\n"
+            "madd.s         %[temp12],      %[temp12],  %[temp8],   %[temp6]    \t\n"
+            PTR_ADDIU "     %[in2],         %[in2],     -16                     \t\n"
+            PTR_ADDIU "     %[in3],         %[in3],     16                      \t\n"
+            PTR_ADDIU "     %[in4],         %[in4],     -16                     \t\n"
+
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
+              [in1]"+r"(in1), [in2]"+r"(in2),
+              [in3]"+r"(in3), [in4]"+r"(in4)
+            :
+            : "memory"
+        );
+
+        z[j ].re = temp9;
+        z[j ].im = temp10;
+        z[j1].re = temp11;
+        z[j1].im = temp12;
+    }
+
+    s->fft_calc(s, z);
+
+    /* post rotation + reordering */
+    /* n8 = 32 or 64 */
+    for(k = 0; k < n8; k += 2) {
+        tcos1 = &tcos[n8 - k - 2];
+        tsin1 = &tsin[n8 - k - 2];
+        tcos2 = &tcos[n8 + k];
+        tsin2 = &tsin[n8 + k];
+        z1 = &z[n8 - k - 2];
+        z2 = &z[n8 + k    ];
+
+        __asm__ volatile (
+            "lwc1       %[temp1],   12(%[z1])                           \t\n"
+            "lwc1       %[temp2],   4(%[tsin1])                         \t\n"
+            "lwc1       %[temp3],   4(%[tcos1])                         \t\n"
+            "lwc1       %[temp4],   8(%[z1])                            \t\n"
+            "lwc1       %[temp5],   4(%[z1])                            \t\n"
+            "mul.s      %[temp9],   %[temp1],   %[temp2]                \t\n"
+            "mul.s      %[temp10],  %[temp1],   %[temp3]                \t\n"
+            "lwc1       %[temp6],   0(%[tsin1])                         \t\n"
+            "lwc1       %[temp7],   0(%[tcos1])                         \t\n"
+            "nmsub.s    %[temp9],   %[temp9],   %[temp4],   %[temp3]    \t\n"
+            "madd.s     %[temp10],  %[temp10],  %[temp4],   %[temp2]    \t\n"
+            "mul.s      %[temp11],  %[temp5],   %[temp6]                \t\n"
+            "mul.s      %[temp12],  %[temp5],   %[temp7]                \t\n"
+            "lwc1       %[temp8],   0(%[z1])                            \t\n"
+            "lwc1       %[temp1],   4(%[z2])                            \t\n"
+            "lwc1       %[temp2],   0(%[tsin2])                         \t\n"
+            "lwc1       %[temp3],   0(%[tcos2])                         \t\n"
+            "nmsub.s    %[temp11],  %[temp11],  %[temp8],   %[temp7]    \t\n"
+            "madd.s     %[temp12],  %[temp12],  %[temp8],   %[temp6]    \t\n"
+            "mul.s      %[temp13],  %[temp1],   %[temp2]                \t\n"
+            "mul.s      %[temp14],  %[temp1],   %[temp3]                \t\n"
+            "lwc1       %[temp4],   0(%[z2])                            \t\n"
+            "lwc1       %[temp5],   12(%[z2])                           \t\n"
+            "lwc1       %[temp6],   4(%[tsin2])                         \t\n"
+            "lwc1       %[temp7],   4(%[tcos2])                         \t\n"
+            "nmsub.s    %[temp13],  %[temp13],  %[temp4],   %[temp3]    \t\n"
+            "madd.s     %[temp14],  %[temp14],  %[temp4],   %[temp2]    \t\n"
+            "mul.s      %[temp15],  %[temp5],   %[temp6]                \t\n"
+            "mul.s      %[temp16],  %[temp5],   %[temp7]                \t\n"
+            "lwc1       %[temp8],   8(%[z2])                            \t\n"
+            "nmsub.s    %[temp15],  %[temp15],  %[temp8],   %[temp7]    \t\n"
+            "madd.s     %[temp16],  %[temp16],  %[temp8],   %[temp6]    \t\n"
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+              [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
+            : [z1]"r"(z1), [z2]"r"(z2),
+              [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
+              [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
+            : "memory"
+        );
+
+        z1[1].re = temp9;
+        z1[1].im = temp14;
+        z2[0].re = temp13;
+        z2[0].im = temp10;
+
+        z1[0].re = temp11;
+        z1[0].im = temp16;
+        z2[1].re = temp15;
+        z2[1].im = temp12;
+    }
+}
+
+/**
+ * Compute inverse MDCT of size N = 2^nbits
+ * @param output N samples
+ * @param input N/2 samples
+ */
+static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k;
+    int n = 1 << s->mdct_bits;
+    int n2 = n >> 1;
+    int n4 = n >> 2;
+
+    ff_imdct_half_mips(s, output+n4, input);
+
+    for(k = 0; k < n4; k+=4) {
+        output[k] = -output[n2-k-1];
+        output[k+1] = -output[n2-k-2];
+        output[k+2] = -output[n2-k-3];
+        output[k+3] = -output[n2-k-4];
+
+        output[n-k-1] = output[n2+k];
+        output[n-k-2] = output[n2+k+1];
+        output[n-k-3] = output[n2+k+2];
+        output[n-k-4] = output[n2+k+3];
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_fft_init_mips(FFTContext *s)
+{
+    int n=0;
+
+    ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n);
+    ff_init_ff_cos_tabs(17);
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->fft_calc     = ff_fft_calc_mips;
+#if CONFIG_MDCT
+    s->imdct_calc   = ff_imdct_calc_mips;
+    s->imdct_half   = ff_imdct_half_mips;
+#endif
+#endif
+#endif
+}
diff --git a/libavcodec/mips/fmtconvert_mips.c b/libavcodec/mips/fmtconvert_mips.c
new file mode 100644
index 0000000..9909584
--- /dev/null
+++ b/libavcodec/mips/fmtconvert_mips.c
@@ -0,0 +1,141 @@
+/*
+ * Format Conversion Utils for MIPS
+ *
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of is
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Zoran Lukic (zoranl@mips.com)
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
+        float mul, int len)
+{
+    /*
+     * variables used in inline assembler
+     */
+    float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15;
+
+    int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23;
+    const int *src_end = src + len;
+    /*
+     * loop is 8 times unrolled in assembler in order to achieve better performance
+     */
+    __asm__ volatile (
+        "i32tf_lp%=:                                    \n\t"
+        "lw       %[rpom11],     0(%[src])              \n\t"
+        "lw       %[rpom21],     4(%[src])              \n\t"
+        "lw       %[rpom1],      8(%[src])              \n\t"
+        "lw       %[rpom2],      12(%[src])             \n\t"
+        "mtc1     %[rpom11],     %[temp1]               \n\t"
+        "mtc1     %[rpom21],     %[temp3]               \n\t"
+        "mtc1     %[rpom1],      %[temp5]               \n\t"
+        "mtc1     %[rpom2],      %[temp7]               \n\t"
+
+        "lw       %[rpom13],     16(%[src])             \n\t"
+        "lw       %[rpom23],     20(%[src])             \n\t"
+        "lw       %[rpom12],     24(%[src])             \n\t"
+        "lw       %[rpom22],     28(%[src])             \n\t"
+        "mtc1     %[rpom13],     %[temp9]               \n\t"
+        "mtc1     %[rpom23],     %[temp11]              \n\t"
+        "mtc1     %[rpom12],     %[temp13]              \n\t"
+        "mtc1     %[rpom22],     %[temp15]              \n\t"
+
+        PTR_ADDIU "%[src],       32                     \n\t"
+        "cvt.s.w  %[temp1],      %[temp1]               \n\t"
+        "cvt.s.w  %[temp3],      %[temp3]               \n\t"
+        "cvt.s.w  %[temp5],      %[temp5]               \n\t"
+        "cvt.s.w  %[temp7],      %[temp7]               \n\t"
+
+        "cvt.s.w  %[temp9],      %[temp9]               \n\t"
+        "cvt.s.w  %[temp11],     %[temp11]              \n\t"
+        "cvt.s.w  %[temp13],     %[temp13]              \n\t"
+        "cvt.s.w  %[temp15],     %[temp15]              \n\t"
+
+        "mul.s   %[temp1],       %[temp1],    %[mul]    \n\t"
+        "mul.s   %[temp3],       %[temp3],    %[mul]    \n\t"
+        "mul.s   %[temp5],       %[temp5],    %[mul]    \n\t"
+        "mul.s   %[temp7],       %[temp7],    %[mul]    \n\t"
+
+        "mul.s   %[temp9],       %[temp9],    %[mul]    \n\t"
+        "mul.s   %[temp11],      %[temp11],   %[mul]    \n\t"
+        "mul.s   %[temp13],      %[temp13],   %[mul]    \n\t"
+        "mul.s   %[temp15],      %[temp15],   %[mul]    \n\t"
+
+        "swc1    %[temp1],       0(%[dst])              \n\t" /*dst[i] = src[i] * mul;    */
+        "swc1    %[temp3],       4(%[dst])              \n\t" /*dst[i+1] = src[i+1] * mul;*/
+        "swc1    %[temp5],       8(%[dst])              \n\t" /*dst[i+2] = src[i+2] * mul;*/
+        "swc1    %[temp7],       12(%[dst])             \n\t" /*dst[i+3] = src[i+3] * mul;*/
+
+        "swc1    %[temp9],       16(%[dst])             \n\t" /*dst[i+4] = src[i+4] * mul;*/
+        "swc1    %[temp11],      20(%[dst])             \n\t" /*dst[i+5] = src[i+5] * mul;*/
+        "swc1    %[temp13],      24(%[dst])             \n\t" /*dst[i+6] = src[i+6] * mul;*/
+        "swc1    %[temp15],      28(%[dst])             \n\t" /*dst[i+7] = src[i+7] * mul;*/
+        PTR_ADDIU "%[dst],       32                     \n\t"
+        "bne     %[src],        %[src_end], i32tf_lp%=  \n\t"
+        : [temp1]"=&f"(temp1),   [temp11]"=&f"(temp11),
+          [temp13]"=&f"(temp13), [temp15]"=&f"(temp15),
+          [temp3]"=&f"(temp3),   [temp5]"=&f"(temp5),
+          [temp7]"=&f"(temp7),   [temp9]"=&f"(temp9),
+          [rpom1]"=&r"(rpom1),   [rpom2]"=&r"(rpom2),
+          [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21),
+          [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22),
+          [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23),
+          [dst]"+r"(dst),       [src]"+r"(src)
+        : [mul]"f"(mul),        [src_end]"r"(src_end)
+        : "memory"
+    );
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips;
+#endif
+}
diff --git a/libavcodec/mips/h263dsp_init_mips.c b/libavcodec/mips/h263dsp_init_mips.c
new file mode 100644
index 0000000..09bd937
--- /dev/null
+++ b/libavcodec/mips/h263dsp_init_mips.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h263dsp_init_msa(H263DSPContext *c)
+{
+    c->h263_h_loop_filter = ff_h263_h_loop_filter_msa;
+    c->h263_v_loop_filter = ff_h263_v_loop_filter_msa;
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_h263dsp_init_mips(H263DSPContext *c)
+{
+#if HAVE_MSA
+    h263dsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h
new file mode 100644
index 0000000..99a43cd
--- /dev/null
+++ b/libavcodec/mips/h263dsp_mips.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H263DSP_MIPS_H
+#define AVCODEC_MIPS_H263DSP_MIPS_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale);
+void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale);
+void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block,
+                                       int32_t index, int32_t q_scale);
+void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
+                                      int32_t index, int32_t q_scale);
+void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
+                                      int32_t index, int32_t q_scale);
+int ff_pix_sum_msa(uint8_t *pix, int line_size);
+
+#endif  // #ifndef AVCODEC_MIPS_H263DSP_MIPS_H
diff --git a/libavcodec/mips/h263dsp_msa.c b/libavcodec/mips/h263dsp_msa.c
new file mode 100644
index 0000000..472bcbd
--- /dev/null
+++ b/libavcodec/mips/h263dsp_msa.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h263dsp_mips.h"
+
+static const uint8_t h263_loop_filter_strength_msa[32] = {
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7,
+    7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
+};
+
+static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
+{
+    int32_t strength = h263_loop_filter_strength_msa[qscale];
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 temp0, temp1, temp2;
+    v8i16 diff0, diff2, diff4, diff6, diff8;
+    v8i16 d0, a_d0, str_x2, str;
+
+    src -= 2;
+    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in3, in2, in1);
+
+    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
+    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0);
+    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
+    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2);
+    temp2 <<= 2;
+    diff0 = a_d0 + temp2;
+    diff2 = -(-diff0 >> 3);
+    str_x2 = __msa_fill_h(-(strength << 1));
+    temp0 = (str_x2 <= diff2);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff2;
+    str = __msa_fill_h(-strength);
+    temp0 = (diff2 < str);
+    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0);
+    diff4 = diff0 >> 3;
+    str_x2 = __msa_fill_h(strength << 1);
+    temp0 = (diff4 <= str_x2);
+    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff4;
+    str = __msa_fill_h(strength);
+    temp0 = (str < diff4);
+    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(diff0, 0);
+    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff2 = -diff2 >> 1;
+    diff4 >>= 1;
+    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff6 = (-a_d0) >> 2;
+    diff6 = -(diff6);
+    temp2 = -diff8;
+    temp0 = (diff6 < temp2);
+    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0);
+    diff2 = a_d0 >> 2;
+    temp0 = (diff2 <= diff8);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(a_d0, 0);
+    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0);
+    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
+    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
+    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
+    in3 = __msa_xori_b(in3, 128);
+    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
+    in3 = __msa_xori_b(in3, 128);
+    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
+    ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
+    in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
+    in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
+    src += 4 * stride;
+    ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
+    src += 4 * stride;
+}
+
+static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
+{
+    int32_t strength = h263_loop_filter_strength_msa[qscale];
+    uint64_t res0, res1, res2, res3;
+    v16u8 in0, in1, in2, in3;
+    v8i16 temp0, temp2, diff0, diff2, diff4, diff6, diff8;
+    v8i16 d0, a_d0, str_x2, str;
+
+    src -= 2 * stride;
+    LD_UB4(src, stride, in0, in3, in2, in1);
+    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
+    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0);
+    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
+    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2);
+    temp2 <<= 2;
+    diff0 = a_d0 + temp2;
+    diff2 = -(-diff0 >> 3);
+    str_x2 = __msa_fill_h(-(strength << 1));
+    temp0 = (str_x2 <= diff2);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff2;
+    str = __msa_fill_h(-strength);
+    temp0 = (diff2 < str);
+    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0);
+    diff4 = diff0 >> 3;
+    str_x2 = __msa_fill_h(strength << 1);
+    temp0 = (diff4 <= str_x2);
+    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff4;
+    str = __msa_fill_h(strength);
+    temp0 = (str < diff4);
+    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(diff0, 0);
+    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff2 = -diff2 >> 1;
+    diff4 >>= 1;
+    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff6 = (-a_d0) >> 2;
+    diff6 = -(diff6);
+    temp2 = -diff8;
+    temp0 = (diff6 < temp2);
+    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0);
+    diff2 = a_d0 >> 2;
+    temp0 = (diff2 <= diff8);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(a_d0, 0);
+    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0);
+    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
+    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
+    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
+    in3 = __msa_xori_b(in3, 128);
+    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
+    in3 = __msa_xori_b(in3, 128);
+    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
+    res0 = __msa_copy_u_d((v2i64) in0, 0);
+    res1 = __msa_copy_u_d((v2i64) in3, 0);
+    res2 = __msa_copy_u_d((v2i64) in2, 0);
+    res3 = __msa_copy_u_d((v2i64) in1, 0);
+    SD4(res0, res1, res2, res3, src, stride);
+}
+
+void ff_h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
+{
+    h263_h_loop_filter_msa(src, stride, q_scale);
+}
+
+void ff_h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
+{
+    h263_v_loop_filter_msa(src, stride, q_scale);
+}
diff --git a/libavcodec/mips/h264chroma_init_mips.c b/libavcodec/mips/h264chroma_init_mips.c
new file mode 100644
index 0000000..ae817e4
--- /dev/null
+++ b/libavcodec/mips/h264chroma_init_mips.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264chroma_init_msa(H264ChromaContext *c, int bit_depth)
+{
+    const int high_bit_depth = bit_depth > 8;
+
+    if (!high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_msa;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_msa;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_msa;
+
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_msa;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_msa;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
+{
+    int high_bit_depth = bit_depth > 8;
+
+    if (!high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmi;
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmi;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmi;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_MMI
+    h264chroma_init_mmi(c, bit_depth);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    h264chroma_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h264chroma_mips.h b/libavcodec/mips/h264chroma_mips.h
new file mode 100644
index 0000000..996384d
--- /dev/null
+++ b/libavcodec/mips/h264chroma_mips.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264CHROMA_MIPS_H
+#define AVCODEC_MIPS_H264CHROMA_MIPS_H
+
+#include "libavcodec/h264dec.h"
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int height, int x, int y);
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int height, int x, int y);
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int height, int x, int y);
+
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+
+#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */
diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c
new file mode 100644
index 0000000..739dd7d
--- /dev/null
+++ b/libavcodec/mips/h264chroma_mmi.c
@@ -0,0 +1,752 @@
+/*
+ * Loongson SIMD optimized h264chroma
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+#include "constants.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y)
+{
+    int A = 64, B, C, D, E;
+    double ftmp[12];
+    uint64_t tmp[1];
+
+    if (!(x || y)) {
+        /* x=0, y=0, A=64 */
+        __asm__ volatile (
+            "1:                                                        \n\t"
+            MMI_ULDC1(%[ftmp0], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp2], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp3], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+
+            "addi       %[h],       %[h],           -0x04              \n\t"
+
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            MMI_SDC1(%[ftmp2], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            MMI_SDC1(%[ftmp3], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            "bnez       %[h],       1b                                 \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride)
+            : "memory"
+        );
+    } else if (x && y) {
+        /* x!=0, y!=0 */
+        D = x * y;
+        B = (x << 3) - D;
+        C = (y << 3) - D;
+        A = 64 - D - B - C;
+
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]           \n\t"
+            "dli        %[tmp0],    0x06                               \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]           \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]           \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                           \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]           \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]           \n\t"
+
+            "1:                                                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp3], %[src], 0x00)
+            MMI_ULDC1(%[ftmp4], %[src], 0x01)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp10], %[src], 0x00)
+            MMI_ULDC1(%[ftmp11], %[src], 0x01)
+            "addi       %[h],       %[h],           -0x02              \n\t"
+
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]           \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]               \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[B]               \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]           \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[A]               \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[B]               \n\t"
+            "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]           \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]           \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]               \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[D]               \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]           \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[C]               \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[D]               \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp8]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]        \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]           \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]           \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]           \n\t"
+
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]           \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]               \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[B]               \n\t"
+            "paddh      %[ftmp3],   %[ftmp5],       %[ftmp7]           \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[A]               \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[B]               \n\t"
+            "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]           \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp10],      %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp10],      %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp11],      %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp11],      %[ftmp0]           \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]               \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[D]               \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]           \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[C]               \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[D]               \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp8]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]           \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_32]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_32]        \n\t"
+            "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp9]           \n\t"
+            "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]           \n\t"
+            "packushb   %[ftmp3],   %[ftmp3],       %[ftmp4]           \n\t"
+
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            MMI_SDC1(%[ftmp3], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            "bnez       %[h],       1b                                 \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (x) {
+        /* x!=0, y==0 */
+        E = x << 3;
+        A = 64 - E;
+
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]           \n\t"
+            "dli        %[tmp0],    0x06                               \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]           \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]           \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                           \n\t"
+
+            "1:                                                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            "addi       %[h],       %[h],           -0x01              \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]           \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]               \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]               \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]           \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]               \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]               \n\t"
+            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]           \n\t"
+
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]        \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]           \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]           \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]           \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            "bnez       %[h],       1b                                 \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else {
+        /* x==0, y!=0 */
+        E = y << 3;
+        A = 64 - E;
+
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]           \n\t"
+            "dli        %[tmp0],    0x06                               \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]           \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]           \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                           \n\t"
+
+            "1:                                                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp2], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]          \n\t"
+            MMI_ULDC1(%[ftmp8], %[src], 0x00)
+            "addi       %[h],       %[h],           -0x02              \n\t"
+
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]           \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]               \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]               \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]           \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]               \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]               \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_32]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_32]        \n\t"
+            "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp7]           \n\t"
+            "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp7]           \n\t"
+            "packushb   %[ftmp1],   %[ftmp3],       %[ftmp4]           \n\t"
+
+            "punpcklbh  %[ftmp3],   %[ftmp2],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]           \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp8],       %[ftmp0]           \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp8],       %[ftmp0]           \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]               \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]               \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]           \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]               \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]               \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_32]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_32]        \n\t"
+            "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp7]           \n\t"
+            "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp7]           \n\t"
+            "packushb   %[ftmp2],   %[ftmp3],       %[ftmp4]           \n\t"
+
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            MMI_SDC1(%[ftmp2], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]          \n\t"
+            "bnez       %[h],       1b                                 \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    }
+}
+
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y)
+{
+    int A = 64, B, C, D, E;
+    double ftmp[10];
+    uint64_t tmp[1];
+
+    if(!(x || y)){
+        /* x=0, y=0, A=64 */
+        __asm__ volatile (
+            "1:                                                         \n\t"
+            MMI_ULDC1(%[ftmp0], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            MMI_LDC1(%[ftmp3], %[dst], 0x00)
+            PTR_SUBU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride)
+            : "memory"
+        );
+    } else if (x && y) {
+        /* x!=0, y!=0 */
+        D = x * y;
+        B = (x << 3) - D;
+        C = (y << 3) - D;
+        A = 64 - D - B - C;
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]       \n\t"
+            "dli        %[tmp0],    0x06                           \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]       \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]       \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                       \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]       \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]       \n\t"
+
+            "1:                                                    \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            PTR_ADDU   "%[src],     %[src],         %[stride]      \n\t"
+            MMI_ULDC1(%[ftmp3], %[src], 0x00)
+            MMI_ULDC1(%[ftmp4], %[src], 0x01)
+            "addi       %[h],       %[h],           -0x01          \n\t"
+
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]       \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]       \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]           \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[B]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]       \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[A]           \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[B]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]       \n\t"
+
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]       \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]       \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]           \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[D]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp5],       %[ftmp7]       \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[C]           \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[D]           \n\t"
+            "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]       \n\t"
+
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]       \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]       \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]    \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]    \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]       \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]       \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]      \n\t"
+            "bnez       %[h],       1b                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (x) {
+        /* x!=0, y==0 */
+        E = x << 3;
+        A = 64 - E;
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]       \n\t"
+            "dli        %[tmp0],    0x06                           \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]       \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]       \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                       \n\t"
+
+            "1:                                                    \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            PTR_ADDU   "%[src],     %[src],         %[stride]      \n\t"
+            "addi       %[h],       %[h],           -0x01          \n\t"
+
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]       \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]       \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]           \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]       \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]           \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]       \n\t"
+
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]    \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]    \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]       \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]       \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]      \n\t"
+            "bnez       %[h],       1b                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else {
+        /* x==0, y!=0 */
+        E = y << 3;
+        A = 64 - E;
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]       \n\t"
+            "dli        %[tmp0],    0x06                           \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]       \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]       \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                       \n\t"
+
+            "1:                                                    \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]      \n\t"
+            MMI_ULDC1(%[ftmp2], %[src], 0x00)
+            "addi       %[h],       %[h],           -0x01          \n\t"
+
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]       \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]       \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]       \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]           \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]       \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]           \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]       \n\t"
+
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]    \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]    \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]       \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]       \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]      \n\t"
+            "bnez       %[h],       1b                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    }
+}
+
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) * y;
+    const int D = x * y;
+    const int E = B + C;
+    double ftmp[8];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_LOW32;
+
+    if (D) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
+            "1:                                                         \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[src], 0x01)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
+            MMI_ULWC1(%[ftmp4], %[src], 0x01)
+
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[B]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[D]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp5],       %[ftmp6]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (E) {
+        const int step = C ? stride : 1;
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+
+            "1:                                                         \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
+            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[E]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else {
+        __asm__ volatile (
+            "1:                                                         \n\t"
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            MMI_SWC1(%[ftmp0], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              RESTRICT_ASM_LOW32
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride)
+            : "memory"
+        );
+    }
+}
+
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) *(8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) * y;
+    const int D = x * y;
+    const int E = B + C;
+    double ftmp[8];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_LOW32;
+
+    if (D) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
+            "1:                                                         \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[src], 0x01)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
+            MMI_ULWC1(%[ftmp4], %[src], 0x01)
+
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[B]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[D]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp5],       %[ftmp6]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (E) {
+        const int step = C ? stride : 1;
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+
+            "1:                                                         \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
+            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[E]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else {
+        __asm__ volatile (
+            "1:                                                         \n\t"
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            MMI_SWC1(%[ftmp0], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              RESTRICT_ASM_LOW32
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride)
+            : "memory"
+        );
+    }
+}
diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
new file mode 100644
index 0000000..4c25761
--- /dev/null
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -0,0 +1,2018 @@
+/*
+ * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264chroma_mips.h"
+
+static const uint8_t chroma_mask_arr[16 * 5] = {
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    v16i8 src0, src1;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, stride, src0, src1);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    out0 = __msa_copy_u_h(res, 0);
+    out1 = __msa_copy_u_h(res, 2);
+
+    SH(out0, dst);
+    dst += stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[64]);
+
+    LD_UB4(src, stride, src0, src1, src2, src3);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+    res_r = __msa_dotp_u_h(src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, stride);
+}
+
+static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16i8 src0, src1;
+    v8u16 res_r;
+    v4i32 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, stride, src0, src1);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST4x2_UB(res, dst, stride);
+}
+
+static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, out;
+    v8u16 res0_r, res1_r;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB4(src, stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+    res0_r <<= 3;
+    res1_r <<= 3;
+    SRARI_H2_UH(res0_r, res1_r, 6);
+    SAT_UH2_UH(res0_r, res1_r, 7);
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
+    v16i8 mask;
+    v8u16 res0, res1, res2, res3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
+    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    ST4x8_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+    LD_UB4(src, stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 out0, out1, out2, out3;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res4, res5, res6, res7);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SLLI_4V(res4, res5, res6, res7, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SRARI_H4_UH(res4, res5, res6, res7, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    SAT_UH4_UH(res4, res5, res6, res7, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
+                                      int32_t stride, uint32_t coeff0,
+                                      uint32_t coeff1, int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, stride, src0, src1, src2, src3);
+        src += (4 * stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);
+        SRARI_H4_UH(res0, res1, res2, res3, 6);
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+        ST8x4_UB(out0, out1, dst, stride);
+        dst += (4 * stride);
+    }
+
+    if (0 != (height % 4)) {
+        for (row = (height % 4); row--;) {
+            src0 = LD_UB(src);
+            src += stride;
+
+            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+
+            res0 = __msa_dotp_u_h(src0, coeff_vec);
+            res0 <<= 3;
+            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
+            res0 = __msa_sat_u_h(res0, 7);
+            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+
+            ST8x1_UB(res0, dst);
+            dst += stride;
+        }
+    }
+}
+
+static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
+    } else {
+        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
+    }
+}
+
+static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    v16i8 src0, src1, src2;
+    v16u8 tmp0, tmp1;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_SB3(src, stride, src0, src1, src2);
+
+    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    out0 = __msa_copy_u_h(res, 0);
+    out1 = __msa_copy_u_h(res, 2);
+
+    SH(out0, dst);
+    dst += stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, stride);
+}
+
+static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2;
+    v16u8 tmp0, tmp1;
+    v4i32 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB3(src, stride, src0, src1, src2);
+    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST4x2_UB(res, dst, stride);
+}
+
+static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 out;
+    v8u16 res0_r, res1_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
+               tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+    res0_r <<= 3;
+    res1_r <<= 3;
+    SRARI_H2_UH(res0_r, res1_r, 6);
+    SAT_UH2_UH(res0_r, res1_r, 7);
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
+               tmp3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
+               tmp7);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
+    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
+    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    ST4x8_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
+               src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 out0, out1, out2, out3;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
+               src3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
+               src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res4, res5, res6, res7);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SLLI_4V(res4, res5, res6, res7, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SRARI_H4_UH(res4, res5, res6, res7, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    uint16_t out0, out1;
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res_vert;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB3(src, stride, src0, src1, src2);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    out0 = __msa_copy_u_h(res_vert, 0);
+    out1 = __msa_copy_u_h(res_vert, 1);
+
+    SH(out0, dst);
+    dst += stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, stride);
+}
+
+static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 mask;
+    v4i32 res;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+    LD_UB3(src, stride, src0, src1, src2);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST4x2_UB(res, dst, stride);
+}
+
+static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    v4i32 res0, res1;
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                res_hz3);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    SRARI_H2_UH(res_vt0, res_vt1, 6);
+    SAT_UH2_UH(res_vt0, res_vt1, 7);
+    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
+}
+
+static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
+    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
+         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
+    ST4x8_UB(res0, res1, dst, stride);
+}
+
+static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    src0 = LD_UB(src);
+    src += stride;
+
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+    LD_UB4(src, stride, src1, src2, src3, src4);
+    src += (4 * stride);
+
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
+         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 out0, out1, out2, out3;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
+    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                res_hz4);
+    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+         res_vt3);
+    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
+         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
+         res_vt7);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+    res_vt4 += (res_hz4 * coeff_vt_vec1);
+    res_vt5 += (res_hz5 * coeff_vt_vec1);
+    res_vt6 += (res_hz6 * coeff_vt_vec1);
+    res_vt7 += (res_hz7 * coeff_vt_vec1);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    v16i8 src0, src1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16u8 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, stride, src0, src1);
+
+    out0 = LH(dst);
+    out1 = LH(dst + stride);
+
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b(res, dst_data);
+
+    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+    SH(out0, dst);
+    dst += stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint16_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst_data = { 0 };
+    v8u16 res_r;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[64]);
+
+    LD_UB4(src, stride, src0, src1, src2, src3);
+    tp0 = LH(dst);
+    tp1 = LH(dst + stride);
+    tp2 = LH(dst + 2 * stride);
+    tp3 = LH(dst + 3 * stride);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+    res_r = __msa_dotp_u_h(src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst0 = __msa_aver_u_b(dst0, dst_data);
+
+    ST2x4_UB(dst0, 0, dst, stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride, uint32_t coeff0,
+                                              uint32_t coeff1, int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16i8 res, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, stride, src0, src1);
+
+    LW2(dst, stride, load0, load1);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+
+    ST4x2_UB(dst_data, dst, stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 out, dst_data = { 0 };
+    v16i8 mask;
+    v8u16 res0_r, res1_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB4(src, stride, src0, src1, src2, src3);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+    res0_r <<= 3;
+    res1_r <<= 3;
+    SRARI_H2_UH(res0_r, res1_r, 6);
+    SAT_UH2_UH(res0_r, res1_r, 7);
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
+    out = __msa_aver_u_b(out, dst_data);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v16i8 mask;
+    v8u16 res0, res1, res2, res3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
+    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    ST4x8_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride, uint32_t coeff0,
+                                              uint32_t coeff1, int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, out0, out1;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v8u16 res0, res1, res2, res3;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+    LD_UB4(src, stride, src0, src1, src2, src3);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 out0, out1, out2, out3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res4, res5, res6, res7);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SLLI_4V(res4, res5, res6, res7, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SRARI_H4_UH(res4, res5, res6, res7, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    SAT_UH4_UH(res4, res5, res6, res7, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride, uint32_t coeff0,
+                                              uint32_t coeff1, int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    v16i8 src0, src1, src2, tmp0, tmp1, res;
+    v16u8 dst_data = { 0 };
+    v8i16 out;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_SB3(src, stride, src0, src1, src2);
+    out0 = LH(dst);
+    out1 = LH(dst + stride);
+
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
+
+    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+    out0 = __msa_copy_u_h(out, 0);
+    out1 = __msa_copy_u_h(out, 2);
+
+    SH(out0, dst);
+    dst += stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint16_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    v16u8 dst_data = { 0 };
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+
+    tp0 = LH(dst);
+    tp1 = LH(dst + stride);
+    tp2 = LH(dst + 2 * stride);
+    tp3 = LH(dst + 3 * stride);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+
+    ST2x4_UB(res, 0, dst, stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride, uint32_t coeff0,
+                                              uint32_t coeff1, int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16u8 src0, src1, src2, tmp0, tmp1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16u8 res;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB3(src, stride, src0, src1, src2);
+
+    LW2(dst, stride, load0, load1);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = __msa_aver_u_b(res, dst_data);
+
+    ST4x2_UB(res, dst, stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0 = { 0 };
+    v8u16 res0_r, res1_r;
+    v16u8 out;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
+               tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+    res0_r <<= 3;
+    res1_r <<= 3;
+    SRARI_H2_UH(res0_r, res1_r, 6);
+    SAT_UH2_UH(res0_r, res1_r, 7);
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
+    out = __msa_aver_u_b(out, dst0);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v8u16 res0, res1, res2, res3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
+               tmp3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
+               tmp7);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
+    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
+    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    ST4x8_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride, uint32_t coeff0,
+                                              uint32_t coeff1, int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src0, src1, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride, uint32_t coeff0,
+                                               uint32_t coeff1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 out0, out1, out2, out3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src0, src1, src2, src3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+               src4, src5, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res0, res1, res2, res3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
+                coeff_vec, res4, res5, res6, res7);
+    SLLI_4V(res0, res1, res2, res3, 3);
+    SLLI_4V(res4, res5, res6, res7, 3);
+    SRARI_H4_UH(res0, res1, res2, res3, 6);
+    SRARI_H4_UH(res4, res5, res6, res7, 6);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    SAT_UH4_UH(res0, res1, res2, res3, 7);
+    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride, uint32_t coeff0,
+                                              uint32_t coeff1, int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint16_t out0, out1;
+    v16u8 dst0 = { 0 };
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB3(src, stride, src0, src1, src2);
+    out0 = LH(dst);
+    out1 = LH(dst + stride);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+    out0 = __msa_copy_u_h((v8i16) dst0, 0);
+    out1 = __msa_copy_u_h((v8i16) dst0, 1);
+
+    SH(out0, dst);
+    dst += stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint16_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0 = { 0 };
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    tp0 = LH(dst);
+    tp1 = LH(dst + stride);
+    tp2 = LH(dst + 2 * stride);
+    tp3 = LH(dst + 3 * stride);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint32_t tp0, tp1;
+    v16u8 src0, src1, src2;
+    v16u8 dst0, dst_data = { 0 };
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB3(src, stride, src0, src1, src2);
+    LW2(dst, stride, tp0, tp1);
+    INSERT_W2_UB(tp0, tp1, dst_data);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = __msa_aver_u_b(dst0, dst_data);
+
+    ST4x2_UB(dst0, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 out, dst_data = { 0 };
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                res_hz3);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    SRARI_H2_UH(res_vt0, res_vt1, 6);
+    SAT_UH2_UH(res_vt0, res_vt1, 7);
+    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
+    out = __msa_aver_u_b(out, dst_data);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
+    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
+         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST4x8_UB(res0, res1, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res_hz0, res_hz1, res_hz2;
+    v8u16 res_hz3, res_hz4;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    src0 = LD_UB(src);
+    src += stride;
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    LD_UB4(src, stride, src1, src2, src3, src4);
+    src += (4 * stride);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
+         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 out0, out1, out2, out3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
+    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                res_hz4);
+    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+         res_vt3);
+    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
+         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
+         res_vt7);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+    res_vt4 += (res_hz4 * coeff_vt_vec1);
+    res_vt5 += (res_hz5 * coeff_vt_vec1);
+    res_vt6 += (res_hz6 * coeff_vt_vec1);
+    res_vt7 += (res_hz7 * coeff_vt_vec1);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                            int32_t height)
+{
+    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+
+    if (8 == height) {
+        LW4(src, stride, tp0, tp1, tp2, tp3);
+        src += 4 * stride;
+        LW4(src, stride, tp4, tp5, tp6, tp7);
+        SW4(tp0, tp1, tp2, tp3, dst, stride);
+        dst += 4 * stride;
+        SW4(tp4, tp5, tp6, tp7, dst, stride);
+    } else if (4 == height) {
+        LW4(src, stride, tp0, tp1, tp2, tp3);
+        SW4(tp0, tp1, tp2, tp3, dst, stride);
+    } else if (2 == height) {
+        LW2(src, stride, tp0, tp1);
+        SW(tp0, dst);
+        dst += stride;
+        SW(tp1, dst);
+    }
+}
+
+static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                            int32_t height)
+{
+    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (8 == height) {
+        LD4(src, stride, src0, src1, src2, src3);
+        src += 4 * stride;
+        LD4(src, stride, src4, src5, src6, src7);
+        SD4(src0, src1, src2, src3, dst, stride);
+        dst += 4 * stride;
+        SD4(src4, src5, src6, src7, dst, stride);
+    } else if (4 == height) {
+        LD4(src, stride, src0, src1, src2, src3);
+        SD4(src0, src1, src2, src3, dst, stride);
+    }
+}
+
+static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                           int32_t height)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
+
+    if (8 == height) {
+        LW4(src, stride, tp0, tp1, tp2, tp3);
+        src += 4 * stride;
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+        LW4(src, stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
+        LW4(dst, stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+        LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, stride);
+    } else if (4 == height) {
+        LW4(src, stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+        LW4(dst, stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+        dst0 = __msa_aver_u_b(src0, dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    } else if (2 == height) {
+        LW2(src, stride, tp0, tp1);
+        INSERT_W2_UB(tp0, tp1, src0);
+        LW2(dst, stride, tp0, tp1);
+        INSERT_W2_UB(tp0, tp1, dst0);
+        dst0 = __msa_aver_u_b(src0, dst0);
+        ST4x2_UB(dst0, dst, stride);
+    }
+}
+
+static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                           int32_t height)
+{
+    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+
+    if (8 == height) {
+        LD4(src, stride, tp0, tp1, tp2, tp3);
+        src += 4 * stride;
+        LD4(src, stride, tp4, tp5, tp6, tp7);
+        INSERT_D2_UB(tp0, tp1, src0);
+        INSERT_D2_UB(tp2, tp3, src1);
+        INSERT_D2_UB(tp4, tp5, src2);
+        INSERT_D2_UB(tp6, tp7, src3);
+        LD4(dst, stride, tp0, tp1, tp2, tp3);
+        LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        INSERT_D2_UB(tp4, tp5, dst2);
+        INSERT_D2_UB(tp6, tp7, dst3);
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                    dst2, dst3);
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    } else if (4 == height) {
+        LD4(src, stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, src0);
+        INSERT_D2_UB(tp2, tp3, src1);
+        LD4(dst, stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, stride);
+    }
+}
+
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
+                                ptrdiff_t stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
+    } else {
+        copy_width8_msa(src, dst, stride, height);
+    }
+}
+
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
+                                ptrdiff_t stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
+    } else {
+        copy_width4_msa(src, dst, stride, height);
+    }
+}
+
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
+                                ptrdiff_t stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
+    } else {
+        for (cnt = height; cnt--;) {
+            *((uint16_t *) dst) = *((uint16_t *) src);
+
+            src += stride;
+            dst += stride;
+        }
+    }
+}
+
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
+                                ptrdiff_t stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+
+    if (x && y) {
+        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
+                                          (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, (8 - y), height);
+    } else {
+        avg_width8_msa(src, dst, stride, height);
+    }
+}
+
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
+                                ptrdiff_t stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
+                                          (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, (8 - y), height);
+    } else {
+        avg_width4_msa(src, dst, stride, height);
+    }
+}
+
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
+                                ptrdiff_t stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
+                                          (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, (8 - y), height);
+    } else {
+        for (cnt = height; cnt--;) {
+            dst[0] = (dst[0] + src[0] + 1) >> 1;
+            dst[1] = (dst[1] + src[1] + 1) >> 1;
+
+            src += stride;
+            dst += stride;
+        }
+    }
+}
diff --git a/libavcodec/mips/h264dsp_init_mips.c b/libavcodec/mips/h264dsp_init_mips.c
new file mode 100644
index 0000000..dc08a25
--- /dev/null
+++ b/libavcodec/mips/h264dsp_init_mips.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264dsp_init_msa(H264DSPContext *c,
+                                     const int bit_depth,
+                                     const int chroma_format_idc)
+{
+    if (8 == bit_depth) {
+        c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_inter_msa;
+        c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_inter_msa;
+        c->h264_h_loop_filter_luma_mbaff =
+            ff_h264_h_loop_filter_luma_mbaff_msa;
+        c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_msa;
+        c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_msa;
+        c->h264_h_loop_filter_luma_mbaff_intra =
+            ff_h264_h_loop_filter_luma_mbaff_intra_msa;
+        c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_inter_msa;
+
+        if (chroma_format_idc <= 1)
+            c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_inter_msa;
+        else
+            c->h264_h_loop_filter_chroma =
+                ff_h264_h_loop_filter_chroma422_msa;
+
+        if (chroma_format_idc > 1)
+            c->h264_h_loop_filter_chroma_mbaff =
+                ff_h264_h_loop_filter_chroma422_mbaff_msa;
+
+        c->h264_v_loop_filter_chroma_intra =
+            ff_h264_v_lpf_chroma_intra_msa;
+
+        if (chroma_format_idc <= 1)
+            c->h264_h_loop_filter_chroma_intra =
+                ff_h264_h_lpf_chroma_intra_msa;
+
+        /* Weighted MC */
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_msa;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_msa;
+        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_msa;
+
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_msa;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_msa;
+        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_msa;
+
+        c->h264_idct_add = ff_h264_idct_add_msa;
+        c->h264_idct8_add = ff_h264_idct8_addblk_msa;
+        c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_msa;
+        c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_msa;
+        c->h264_idct_add16 = ff_h264_idct_add16_msa;
+        c->h264_idct8_add4 = ff_h264_idct8_add4_msa;
+
+        if (chroma_format_idc <= 1)
+            c->h264_idct_add8 = ff_h264_idct_add8_msa;
+        else
+            c->h264_idct_add8 = ff_h264_idct_add8_422_msa;
+
+        c->h264_idct_add16intra = ff_h264_idct_add16_intra_msa;
+        c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_msa;
+    }  // if (8 == bit_depth)
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264dsp_init_mmi(H264DSPContext * c, const int bit_depth,
+        const int chroma_format_idc)
+{
+    if (bit_depth == 8) {
+        c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_mmi;
+        c->h264_idct_add = ff_h264_idct_add_8_mmi;
+        c->h264_idct8_add = ff_h264_idct8_add_8_mmi;
+        c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmi;
+        c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmi;
+        c->h264_idct_add16 = ff_h264_idct_add16_8_mmi;
+        c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmi;
+        c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmi;
+
+        if (chroma_format_idc <= 1)
+            c->h264_idct_add8 = ff_h264_idct_add8_8_mmi;
+        else
+            c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmi;
+
+        c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_mmi;
+
+        if (chroma_format_idc <= 1)
+            c->h264_chroma_dc_dequant_idct =
+                ff_h264_chroma_dc_dequant_idct_8_mmi;
+        else
+            c->h264_chroma_dc_dequant_idct =
+                ff_h264_chroma422_dc_dequant_idct_8_mmi;
+
+        c->weight_h264_pixels_tab[0] = ff_h264_weight_pixels16_8_mmi;
+        c->weight_h264_pixels_tab[1] = ff_h264_weight_pixels8_8_mmi;
+        c->weight_h264_pixels_tab[2] = ff_h264_weight_pixels4_8_mmi;
+
+        c->biweight_h264_pixels_tab[0] = ff_h264_biweight_pixels16_8_mmi;
+        c->biweight_h264_pixels_tab[1] = ff_h264_biweight_pixels8_8_mmi;
+        c->biweight_h264_pixels_tab[2] = ff_h264_biweight_pixels4_8_mmi;
+
+        c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmi;
+        c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmi;
+
+        if (chroma_format_idc <= 1) {
+            c->h264_h_loop_filter_chroma =
+                ff_deblock_h_chroma_8_mmi;
+            c->h264_h_loop_filter_chroma_intra =
+                ff_deblock_h_chroma_intra_8_mmi;
+        }
+
+        c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmi;
+        c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmi;
+        c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmi;
+        c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
+                                  const int chroma_format_idc)
+{
+#if HAVE_MMI
+    h264dsp_init_mmi(c, bit_depth, chroma_format_idc);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    h264dsp_init_msa(c, bit_depth, chroma_format_idc);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h264dsp_mips.h b/libavcodec/mips/h264dsp_mips.h
new file mode 100644
index 0000000..21b7de0
--- /dev/null
+++ b/libavcodec/mips/h264dsp_mips.h
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+                      Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264DSP_MIPS_H
+#define AVCODEC_MIPS_H264DSP_MIPS_H
+
+#include "libavcodec/h264dec.h"
+#include "constants.h"
+
+void ff_h264_h_lpf_luma_inter_msa(uint8_t *src, int stride,
+                                  int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_msa(uint8_t *src, int stride,
+                                  int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_inter_msa(uint8_t *src, int stride,
+                                    int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_inter_msa(uint8_t *src, int stride,
+                                    int alpha, int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
+                                         int32_t alpha, int32_t beta,
+                                         int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
+                                               int32_t alpha, int32_t beta,
+                                               int8_t *tc0);
+void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, int32_t stride,
+                                          int32_t alpha, int32_t beta,
+                                          int8_t *tc0);
+
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride);
+void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                  int32_t de_q_val);
+void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t stride,
+                            const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset,
+                                  int16_t *block, int32_t dst_stride,
+                                  const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset,
+                           int16_t *block, int32_t dst_stride,
+                           const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nnzc[15 * 8]);
+void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                 int32_t dst_stride);
+void ff_h264_idct8_add4_msa(uint8_t *dst, const int *blk_offset,
+                            int16_t *blk, int dst_stride,
+                            const uint8_t nnzc[15 * 8]);
+
+void ff_h264_h_lpf_luma_intra_msa(uint8_t *src, int stride,
+                                  int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_msa(uint8_t *src, int stride,
+                                  int alpha, int beta);
+void ff_h264_h_lpf_chroma_intra_msa(uint8_t *src, int stride,
+                                    int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_msa(uint8_t *src, int stride,
+                                    int alpha, int beta);
+void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int stride,
+                                                int alpha, int beta);
+
+void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride, int height, int log2_denom,
+                                     int weightd, int weights, int offset);
+void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height, int log2_denom,
+                                    int weightd, int weights, int offset);
+void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height, int log2_denom,
+                                    int weightd, int weights, int offset);
+void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride, int height,
+                                   int log2_denom, int weight, int offset);
+void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride, int height,
+                                  int log2_denom, int weight, int offset);
+void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride, int height,
+                                  int log2_denom, int weight, int offset);
+
+void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+
+void ff_h264_add_pixels4_8_mmi(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
+        int qmul);
+void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul);
+void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul);
+
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+
+void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+#endif  // #ifndef AVCODEC_MIPS_H264DSP_MIPS_H
diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
new file mode 100644
index 0000000..ac65a20
--- /dev/null
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -0,0 +1,2744 @@
+/*
+ * Loongson SIMD optimized h264dsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *                    Heiher <r@hev.cc>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h264dsp_mips.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
+{
+    double ftmp[9];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x08)
+        MMI_LDC1(%[ftmp3], %[src], 0x10)
+        MMI_LDC1(%[ftmp4], %[src], 0x18)
+        MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
+        MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
+        MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
+        MMI_ULWC1(%[ftmp8], %[dst3], 0x00)
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+
+        /* memset(src, 0, 32); */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_LOW32
+          [ftmp8]"=&f"(ftmp[8])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [src]"r"(src)
+        : "memory"
+    );
+
+}
+
+void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    double ftmp[12];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x01                                    \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        "dli        %[tmp0],    0x06                                    \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp10],  %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp11],  %[ftmp5],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp2],   %[ftmp10],      %[ftmp5]                \n\t"
+        "paddh      %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp11],      %[ftmp10]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp11],      %[ftmp10]               \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp10],  %[ftmp1],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_32]             \n\t"
+        "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp10],      %[ftmp5]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp7], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], %[block], 0x08)
+        MMI_SDC1(%[ftmp7], %[block], 0x10)
+        MMI_SDC1(%[ftmp7], %[block], 0x18)
+        MMI_ULWC1(%[ftmp2], %[dst], 0x00)
+        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        MMI_SWC1(%[ftmp2], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp2], %[dst], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        MMI_SWC1(%[ftmp2], %[dst], 0x00)
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+
+        /* memset(block, 0, 32) */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
+          [tmp0]"=&r"(tmp[0])
+        : [dst]"r"(dst),                    [block]"r"(block),
+          [stride]"r"((mips_reg)stride),    [ff_pw_32]"f"(ff_pw_32)
+        : "memory"
+    );
+
+}
+
+void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    double ftmp[16];
+    uint64_t tmp[7];
+    mips_reg addr[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "lhu        %[tmp0],    0x00(%[block])                          \n\t"
+        PTR_ADDI   "$29,        $29,            -0x20                   \n\t"
+        PTR_ADDIU  "%[tmp0],    %[tmp0],        0x20                    \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x10)
+        "sh         %[tmp0],    0x00(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x20)
+        "dli        %[tmp0],    0x01                                    \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x30)
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp5], %[block], 0x50)
+        MMI_LDC1(%[ftmp6], %[block], 0x60)
+        MMI_LDC1(%[ftmp7], %[block], 0x70)
+        "mov.d      %[ftmp0],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp5],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp4],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp6]                                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x00)
+        MMI_LDC1(%[ftmp5], %[block], 0x40)
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        MMI_SDC1(%[ftmp6], %[block], 0x00)
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], $29, 0x00)
+        MMI_SDC1(%[ftmp1], $29, 0x10)
+        "dmfc1      %[tmp1],    %[ftmp6]                                \n\t"
+        "dmfc1      %[tmp3],    %[ftmp3]                                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp5],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp5],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp5], $29, 0x08)
+        MMI_SDC1(%[ftmp0], $29, 0x18)
+        "dmfc1      %[tmp2],    %[ftmp3]                                \n\t"
+        "dmfc1      %[tmp4],    %[ftmp4]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x18)
+        MMI_LDC1(%[ftmp6], %[block], 0x28)
+        MMI_LDC1(%[ftmp2], %[block], 0x38)
+        MMI_LDC1(%[ftmp0], %[block], 0x58)
+        MMI_LDC1(%[ftmp3], %[block], 0x68)
+        MMI_LDC1(%[ftmp4], %[block], 0x78)
+        "mov.d      %[ftmp7],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp5],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "mov.d      %[ftmp0],   %[ftmp3]                                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp6], %[block], 0x08)
+        MMI_LDC1(%[ftmp0], %[block], 0x48)
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhhw  %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp4],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp7], %[block], 0x08)
+        "dmfc1      %[tmp5],    %[ftmp4]                                \n\t"
+        "mov.d      %[ftmp10],  %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp12],  %[ftmp3]                                \n\t"
+        "mov.d      %[ftmp14],  %[ftmp2]                                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp5],       %[ftmp7]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "punpckhwd  %[ftmp7],   %[ftmp0],       %[ftmp5]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "dmfc1      %[tmp6],    %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp11],  %[ftmp7]                                \n\t"
+        "mov.d      %[ftmp13],  %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp15],  %[ftmp5]                                \n\t"
+        PTR_ADDIU  "%[addr0],   %[dst],         0x04                    \n\t"
+        "mov.d      %[ftmp7],   %[ftmp10]                               \n\t"
+        "dmtc1      %[tmp3],    %[ftmp6]                                \n\t"
+        MMI_LDC1(%[ftmp1], $29, 0x10)
+        "dmtc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp14]               \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp14]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp14]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp14],      %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp12]                               \n\t"
+        "psrah      %[ftmp2],   %[ftmp12],      %[ftmp8]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp3], $29, 0x00)
+        "dmtc1      %[tmp5],    %[ftmp7]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp3], $29, 0x00)
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_SDC1(%[ftmp0], $29, 0x10)
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp2], %[block], 0x00)
+        MMI_SDC1(%[ftmp2], %[block], 0x08)
+        MMI_SDC1(%[ftmp2], %[block], 0x10)
+        MMI_SDC1(%[ftmp2], %[block], 0x18)
+        MMI_SDC1(%[ftmp2], %[block], 0x20)
+        MMI_SDC1(%[ftmp2], %[block], 0x28)
+        MMI_SDC1(%[ftmp2], %[block], 0x30)
+        MMI_SDC1(%[ftmp2], %[block], 0x38)
+        MMI_SDC1(%[ftmp2], %[block], 0x40)
+        MMI_SDC1(%[ftmp2], %[block], 0x48)
+        MMI_SDC1(%[ftmp2], %[block], 0x50)
+        MMI_SDC1(%[ftmp2], %[block], 0x58)
+        MMI_SDC1(%[ftmp2], %[block], 0x60)
+        MMI_SDC1(%[ftmp2], %[block], 0x68)
+        MMI_SDC1(%[ftmp2], %[block], 0x70)
+        MMI_SDC1(%[ftmp2], %[block], 0x78)
+        "dli        %[tmp3],    0x06                                    \n\t"
+        "mtc1       %[tmp3],    %[ftmp10]                               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        MMI_LDC1(%[ftmp5], $29, 0x00)
+        MMI_LDC1(%[ftmp4], $29, 0x10)
+        "dmtc1      %[tmp1],    %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "dmtc1      %[tmp4],    %[ftmp1]                                \n\t"
+        "dmtc1      %[tmp2],    %[ftmp6]                                \n\t"
+        MMI_LDC1(%[ftmp4], $29, 0x18)
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp11],      %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp11]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp11],      %[ftmp1]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp15]               \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp15]               \n\t"
+        "psrah      %[ftmp2],   %[ftmp15],      %[ftmp8]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "mov.d      %[ftmp2],   %[ftmp4]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp13]                               \n\t"
+        "psrah      %[ftmp0],   %[ftmp13],      %[ftmp8]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        MMI_LDC1(%[ftmp6], $29, 0x08)
+        "dmtc1      %[tmp6],    %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp6], $29, 0x08)
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp7], $29, 0x18)
+        "dmfc1      %[tmp2],    %[ftmp0]                                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp2], $29, 0x08)
+        MMI_LDC1(%[ftmp5], $29, 0x18)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "dmtc1      %[tmp2],    %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
+
+        /* memset(block, 0, 128) */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x20(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x30(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x40(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x50(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x60(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x70(%[block])          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
+          [tmp6]"=&r"(tmp[6]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [dst]"r"(dst),                    [block]"r"(block),
+          [stride]"r"((mips_reg)stride)
+        : "$29","memory"
+    );
+
+}
+
+void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    int dc = (block[0] + 32) >> 6;
+    double ftmp[6];
+    DECLARE_VAR_LOW32;
+
+    block[0] = 0;
+
+    __asm__ volatile (
+        "mtc1       %[dc],      %[ftmp5]                                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        MMI_ULWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_ULWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_ULWC1(%[ftmp4], %[dst3], 0x00)
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_LOW32
+          [ftmp5]"=&f"(ftmp[5])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dc]"r"(dc)
+        : "memory"
+    );
+}
+
+void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    int dc = (block[0] + 32) >> 6;
+    double ftmp[10];
+    DECLARE_VAR_ALL64;
+
+    block[0] = 0;
+
+    __asm__ volatile (
+        "mtc1       %[dc],      %[ftmp5]                                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp1], %[dst0], 0x00)
+        MMI_LDC1(%[ftmp2], %[dst1], 0x00)
+        MMI_LDC1(%[ftmp3], %[dst2], 0x00)
+        MMI_LDC1(%[ftmp4], %[dst3], 0x00)
+        "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        MMI_SDC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SDC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SDC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst3], 0x00)
+
+        MMI_LDC1(%[ftmp1], %[dst4], 0x00)
+        MMI_LDC1(%[ftmp2], %[dst5], 0x00)
+        MMI_LDC1(%[ftmp3], %[dst6], 0x00)
+        MMI_LDC1(%[ftmp4], %[dst7], 0x00)
+        "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        MMI_SDC1(%[ftmp1], %[dst4], 0x00)
+        MMI_SDC1(%[ftmp2], %[dst5], 0x00)
+        MMI_SDC1(%[ftmp3], %[dst6], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst7], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          [ftmp9]"=&f"(ftmp[9])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dst4]"r"(dst+4*stride),          [dst5]"r"(dst+5*stride),
+          [dst6]"r"(dst+6*stride),          [dst7]"r"(dst+7*stride),
+          [dc]"r"(dc)
+        : "memory"
+    );
+}
+
+void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i;
+    for(i=0; i<16; i++){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && ((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
+                        stride);
+            else
+                ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16,
+                        stride);
+        }
+    }
+}
+
+void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i;
+    for(i=0; i<16; i++){
+        if(nnzc[ scan8[i] ])
+            ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride);
+        else if(((int16_t*)block)[i*16])
+            ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
+                    stride);
+    }
+}
+
+void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i;
+    for(i=0; i<16; i+=4){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && ((int16_t*)block)[i*16])
+                ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i],
+                        block + i*16, stride);
+            else
+                ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16,
+                        stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i, j;
+    for(j=1; j<3; j++){
+        for(i=j*16; i<j*16+4; i++){
+            if(nnzc[ scan8[i] ])
+                ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+            else if(((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i, j;
+
+    for(j=1; j<3; j++){
+        for(i=j*16; i<j*16+4; i++){
+            if(nnzc[ scan8[i] ])
+                ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+            else if(((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+        }
+    }
+
+    for(j=1; j<3; j++){
+        for(i=j*16+4; i<j*16+8; i++){
+            if(nnzc[ scan8[i+4] ])
+                ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4],
+                        block + i*16, stride);
+            else if(((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4],
+                        block + i*16, stride);
+        }
+    }
+}
+
+void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
+        int qmul)
+{
+    double ftmp[10];
+    uint64_t tmp[2];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        ".set       noreorder                                           \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        MMI_LDC1(%[ftmp3], %[input], 0x18)
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp2], %[input], 0x10)
+        "dli        %[tmp0],    0x20                                    \n\t"
+        MMI_LDC1(%[ftmp1], %[input], 0x08)
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        MMI_LDC1(%[ftmp0], %[input], 0x00)
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "mov.d      %[ftmp0],   %[ftmp4]                                \n\t"
+        "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp4]                                \n\t"
+        "daddi      %[tmp0],    %[qmul],        -0x7fff                 \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "bgtz       %[tmp0],    1f                                      \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "ori        %[tmp0],    $0,             0x80                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp0],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
+        "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp0]                                \n\t"
+        "sh         %[tmp1],    0x00(%[output])                         \n\t"
+        "sh         %[input],   0x80(%[output])                         \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x20(%[output])                         \n\t"
+        "sh         %[input],   0xa0(%[output])                         \n\t"
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp2]                                \n\t"
+        "sh         %[tmp1],    0x40(%[output])                         \n\t"
+        "sh         %[input],   0xc0(%[output])                         \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x60(%[output])                         \n\t"
+        "sh         %[input],   0xe0(%[output])                         \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp3]                                \n\t"
+        "sh         %[tmp1],    0x100(%[output])                        \n\t"
+        "sh         %[input],   0x180(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x120(%[output])                        \n\t"
+        "sh         %[input],   0x1a0(%[output])                        \n\t"
+        "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
+        "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp4]                                \n\t"
+        "sh         %[tmp1],    0x140(%[output])                        \n\t"
+        "sh         %[input],   0x1c0(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x160(%[output])                        \n\t"
+        "j          2f                                                  \n\t"
+        "sh         %[input],   0x1e0(%[output])                        \n\t"
+        "1:                                                             \n\t"
+        "ori        %[tmp0],    $0,             0x1f                    \n\t"
+#if HAVE_LOONGSON3
+        "clz        %[tmp1],    %[qmul]                                 \n\t"
+#elif HAVE_LOONGSON2
+#endif
+        "ori        %[input],   $0,             0x07                    \n\t"
+        "dsubu      %[tmp1],    %[tmp0],        %[tmp1]                 \n\t"
+        "ori        %[tmp0],    $0,             0x80                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
+        "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t"
+        "dsubu      %[tmp0],    %[tmp1],        %[input]                \n\t"
+        "movn       %[tmp1],    %[input],       %[tmp0]                 \n\t"
+        PTR_ADDIU  "%[input],   %[input],       0x01                    \n\t"
+        "andi       %[tmp0],    %[tmp1],        0xff                    \n\t"
+        "srlv       %[qmul],    %[qmul],        %[tmp0]                 \n\t"
+        PTR_SUBU   "%[input],   %[input],       %[tmp1]                 \n\t"
+        "mtc1       %[input],   %[ftmp6]                                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
+        "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "sh         %[tmp1],    0x00(%[output])                         \n\t"
+        "mfc1       %[input],   %[ftmp0]                                \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        "sh         %[input],   0x80(%[output])                         \n\t"
+        "sh         %[tmp1],    0x20(%[output])                         \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "sh         %[input],   0xa0(%[output])                         \n\t"
+        "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
+        "sh         %[tmp1],    0x40(%[output])                         \n\t"
+        "mfc1       %[input],   %[ftmp2]                                \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        "sh         %[input],   0xc0(%[output])                         \n\t"
+        "sh         %[tmp1],    0x60(%[output])                         \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[input],   0xe0(%[output])                         \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp3]                                \n\t"
+        "sh         %[tmp1],    0x100(%[output])                        \n\t"
+        "sh         %[input],   0x180(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x120(%[output])                        \n\t"
+        "sh         %[input],   0x1a0(%[output])                        \n\t"
+        "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
+        "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp4]                                \n\t"
+        "sh         %[tmp1],    0x140(%[output])                        \n\t"
+        "sh         %[input],   0x1c0(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x160(%[output])                        \n\t"
+        "sh         %[input],   0x1e0(%[output])                        \n\t"
+        "2:                                                             \n\t"
+        ".set       reorder                                             \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
+          [output]"+&r"(output),            [input]"+&r"(input),
+          [qmul]"+&r"(qmul)
+        : [ff_pw_1]"f"(ff_pw_1)
+        : "memory"
+    );
+}
+
+void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
+{
+    int temp[8];
+    int t[8];
+
+    temp[0] = block[0] + block[16];
+    temp[1] = block[0] - block[16];
+    temp[2] = block[32] + block[48];
+    temp[3] = block[32] - block[48];
+    temp[4] = block[64] + block[80];
+    temp[5] = block[64] - block[80];
+    temp[6] = block[96] + block[112];
+    temp[7] = block[96] - block[112];
+
+    t[0] = temp[0] + temp[4] + temp[2] + temp[6];
+    t[1] = temp[0] - temp[4] + temp[2] - temp[6];
+    t[2] = temp[0] - temp[4] - temp[2] + temp[6];
+    t[3] = temp[0] + temp[4] - temp[2] - temp[6];
+    t[4] = temp[1] + temp[5] + temp[3] + temp[7];
+    t[5] = temp[1] - temp[5] + temp[3] - temp[7];
+    t[6] = temp[1] - temp[5] - temp[3] + temp[7];
+    t[7] = temp[1] + temp[5] - temp[3] - temp[7];
+
+    block[  0]= (t[0]*qmul + 128) >> 8;
+    block[ 32]= (t[1]*qmul + 128) >> 8;
+    block[ 64]= (t[2]*qmul + 128) >> 8;
+    block[ 96]= (t[3]*qmul + 128) >> 8;
+    block[ 16]= (t[4]*qmul + 128) >> 8;
+    block[ 48]= (t[5]*qmul + 128) >> 8;
+    block[ 80]= (t[6]*qmul + 128) >> 8;
+    block[112]= (t[7]*qmul + 128) >> 8;
+}
+
+void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
+{
+    int a,b,c,d;
+
+    d = block[0] - block[16];
+    a = block[0] + block[16];
+    b = block[32] - block[48];
+    c = block[32] + block[48];
+    block[0] = ((a+c)*qmul) >> 7;
+    block[16]= ((d+b)*qmul) >> 7;
+    block[32]= ((a-c)*qmul) >> 7;
+    block[48]= ((d-b)*qmul) >> 7;
+}
+
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+    double ftmp[8];
+    DECLARE_VAR_ALL64;
+
+    offset <<= log2_denom;
+
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            MMI_LDC1(%[ftmp1], %[block0], 0x00)
+            MMI_LDC1(%[ftmp2], %[block1], 0x00)
+            "mtc1       %[weight],  %[ftmp3]                            \n\t"
+            "mtc1       %[offset],  %[ftmp4]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            MMI_SDC1(%[ftmp1], %[block0], 0x00)
+            MMI_SDC1(%[ftmp2], %[block1], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),
+              RESTRICT_ASM_ALL64
+              [ftmp7]"=&f"(ftmp[7])
+            : [block0]"r"(block),           [block1]"r"(block+8),
+              [weight]"r"(weight),          [offset]"r"(offset),
+              [log2_denom]"r"(log2_denom)
+            : "memory"
+        );
+    }
+}
+
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
+{
+    int y;
+    double ftmp[9];
+    DECLARE_VAR_ALL64;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            MMI_LDC1(%[ftmp1], %[src0], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst0], 0x00)
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            MMI_SDC1(%[ftmp1], %[dst0], 0x00)
+            MMI_LDC1(%[ftmp1], %[src1], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst1], 0x00)
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            MMI_SDC1(%[ftmp1], %[dst1], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
+              [ftmp8]"=&f"(ftmp[8])
+            : [dst0]"r"(dst),               [dst1]"r"(dst+8),
+              [src0]"r"(src),               [src1]"r"(src+8),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory"
+        );
+    }
+}
+
+void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+    double ftmp[6];
+    DECLARE_VAR_ALL64;
+
+    offset <<= log2_denom;
+
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            MMI_LDC1(%[ftmp1], %[block], 0x00)
+            "mtc1       %[weight],  %[ftmp2]                            \n\t"
+            "mtc1       %[offset],  %[ftmp3]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
+            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            MMI_SDC1(%[ftmp1], %[block], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),
+              RESTRICT_ASM_ALL64
+              [ftmp5]"=&f"(ftmp[5])
+            : [block]"r"(block),            [weight]"r"(weight),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
+            : "memory"
+        );
+    }
+}
+
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
+{
+    int y;
+    double ftmp[9];
+    DECLARE_VAR_ALL64;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            MMI_LDC1(%[ftmp1], %[src], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
+              [ftmp8]"=&f"(ftmp[8])
+            : [dst]"r"(dst),                [src]"r"(src),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory"
+        );
+    }
+}
+
+void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+    double ftmp[5];
+    DECLARE_VAR_LOW32;
+
+    offset <<= log2_denom;
+
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            MMI_ULWC1(%[ftmp1], %[block], 0x00)
+            "mtc1       %[weight],  %[ftmp2]                            \n\t"
+            "mtc1       %[offset],  %[ftmp3]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp4]            \n\t"
+            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            MMI_SWC1(%[ftmp1], %[block], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              RESTRICT_ASM_LOW32
+              [ftmp4]"=&f"(ftmp[4])
+            : [block]"r"(block),            [weight]"r"(weight),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
+            : "memory"
+        );
+    }
+}
+
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
+{
+    int y;
+    double ftmp[7];
+    DECLARE_VAR_LOW32;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[dst], 0x00)
+            "mtc1       %[weight],  %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              RESTRICT_ASM_LOW32
+              [ftmp6]"=&f"(ftmp[6])
+            : [dst]"r"(dst),                [src]"r"(src),
+              [weight]"r"(weights),         [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory"
+        );
+    }
+}
+
+void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    double ftmp[12];
+    mips_reg addr[2];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        PTR_ADDU   "%[addr1],   %[stride],      %[addr0]                \n\t"
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        PTR_SUBU   "%[addr1],   $0,             %[addr1]                \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[pix]                  \n\t"
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp1], %[addr1], %[stride], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        MMI_ULWC1(%[ftmp5], %[tc0], 0x00)
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp5]                \n\t"
+        "pcmpgtb    %[ftmp5],   %[ftmp9],       %[ftmp4]                \n\t"
+        MMI_LDC1(%[ftmp4], %[addr1], 0x00)
+        "and        %[ftmp10],  %[ftmp5],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp4],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "and        %[ftmp5],   %[ftmp10],      %[ftmp9]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp5],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp2],       %[ftmp3]                \n\t"
+        MMI_LDC1(%[ftmp11], %[addr1], 0x00)
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp11]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddusb    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "pmaxub     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        MMI_SDXC1(%[ftmp4], %[addr1], %[stride], 0x00)
+        MMI_LDXC1(%[ftmp5], %[pix], %[addr0], 0x00)
+        "psubusb    %[ftmp4],   %[ftmp5],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp6],   %[ftmp9],       %[ftmp7]                \n\t"
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+        "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+        MMI_LDXC1(%[ftmp11], %[pix], %[addr0], 0x00)
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ff_pb_1]              \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "pmaxub     %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pminub     %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
+        "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t"
+        "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [alpha]"r"((mips_reg)alpha),      [beta]"r"((mips_reg)beta),
+          [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
+          [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
+    );
+}
+
+static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]);
+    double ftmp[16];
+    uint64_t tmp[1];
+    mips_reg addr[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "ori        %[tmp0],    $0,             0x01                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
+        PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
+        PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
+        PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
+        "bltz       %[alpha],   1f                                      \n\t"
+        PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
+        PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
+        "bltz       %[beta],    1f                                      \n\t"
+        PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp1], %[addr0], %[addr2], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[addr1], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp5], %[stack], 0x10)
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp5], %[stack], 0x10)
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
+        MMI_SDC1(%[ftmp8], %[stack], 0x20)
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp15], %[stack], 0x20)
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+        MMI_LDXC1(%[ftmp15], %[addr0], %[stride], 0x00)
+        "psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_LDXC1(%[ftmp14], %[pix], %[addr2], 0x00)
+        MMI_SDC1(%[ftmp5], %[stack], 0x30)
+        "psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp5], %[stack], 0x40)
+        "pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDC1(%[ftmp6], %[stack], 0x10)
+        "paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[stack], 0x00)
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
+        "psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x10)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+        "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x30)
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x20)
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[addr1], 0x00)
+        MMI_LDC1(%[ftmp6], %[addr0], 0x00)
+        "paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x30)
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        MMI_SDXC1(%[ftmp5], %[addr0], %[addr2], 0x00)
+        MMI_SDXC1(%[ftmp6], %[addr0], %[stride], 0x00)
+        "pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDC1(%[ftmp6], %[stack], 0x10)
+        "paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[stack], 0x00)
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x10)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x40)
+        "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x20)
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp6], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp6], %[pix], %[addr1], 0x00)
+        "paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x40)
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
+        MMI_SDXC1(%[ftmp6], %[pix], %[addr2], 0x00)
+        "1:                                                             \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [stack]"r"(stack),                [ff_pb_1]"m"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha,
+        int beta, int8_t *tc0)
+{
+    double ftmp[9];
+    mips_reg addr[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        "or         %[addr0],   $0,             %[pix]                  \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        MMI_ULWC1(%[ftmp7], %[tc0], 0x00)
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t"
+        "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+
+        MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [alpha]"r"(alpha),                [beta]"r"(beta),
+          [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
+          [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
+    );
+}
+
+void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    double ftmp[9];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        "or         %[addr0],   $0,             %[pix]                  \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp3]                                \n\t"
+        "xor        %[ftmp5],   %[ftmp2],       %[ftmp4]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "xor        %[ftmp5],   %[ftmp3],       %[ftmp1]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "and        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "and        %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "paddb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+
+        MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [alpha]"r"(alpha),                [beta]"r"(beta),
+          [ff_pb_1]"f"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    double ftmp[11];
+    mips_reg addr[6];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t"
+        PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
+        "or         %[addr5],   $0,             %[pix]                  \n\t"
+        PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
+        MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
+        MMI_ULWC1(%[ftmp3], %[pix], 0x00)
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
+        MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
+        MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "mov.d      %[ftmp9],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp10],  %[ftmp3]                                \n\t"
+
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "mtc1       %[alpha],   %[ftmp4]                                \n\t"
+        "mtc1       %[beta],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        MMI_ULWC1(%[ftmp6], %[tc0], 0x00)
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "xor        %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ff_pb_A1],    %[ftmp3]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ff_pb_A1]             \n\t"
+        "pminub     %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "pminub     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+
+        "punpckhwd  %[ftmp4],   %[ftmp9],       %[ftmp9]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp9],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp10],      %[ftmp10]               \n\t"
+        MMI_USWC1(%[ftmp0], %[pix], 0x00)
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr4], 0x00)
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
+        MMI_USWC1(%[ftmp9], %[addr3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          RESTRICT_ASM_LOW32
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [pix]"+&r"(pix)
+        : [alpha]"r"(alpha),                [beta]"r"(beta),
+          [stride]"r"((mips_reg)stride),    [tc0]"r"(tc0),
+          [ff_pb_1]"f"(ff_pb_1),            [ff_pb_3]"f"(ff_pb_3),
+          [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
+    );
+}
+
+void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    double ftmp[11];
+    mips_reg addr[6];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t"
+        PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
+        "or         %[addr5],   $0,             %[pix]                  \n\t"
+        PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
+        MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
+        MMI_ULWC1(%[ftmp3], %[pix], 0x00)
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
+        MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
+        MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "mtc1       %[alpha],   %[ftmp4]                                \n\t"
+        "mtc1       %[beta],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp2]                                \n\t"
+        "xor        %[ftmp4],   %[ftmp1],       %[ftmp3]                \n\t"
+        "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "xor        %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "and        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "and        %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        MMI_USWC1(%[ftmp0], %[pix], 0x00)
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        MMI_USWC1(%[ftmp4], %[addr4], 0x00)
+        "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
+        MMI_USWC1(%[ftmp9], %[addr3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          RESTRICT_ASM_LOW32
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [pix]"+&r"(pix)
+        : [alpha]"r"(alpha),                [beta]"r"(beta),
+          [stride]"r"((mips_reg)stride),    [ff_pb_1]"f"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    if ((tc0[0] & tc0[1]) >= 0)
+        ff_deblock_v8_luma_8_mmi(pix + 0, stride, alpha, beta, tc0);
+    if ((tc0[2] & tc0[3]) >= 0)
+        ff_deblock_v8_luma_8_mmi(pix + 8, stride, alpha, beta, tc0 + 2);
+}
+
+void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    deblock_v8_luma_intra_8_mmi(pix + 0, stride, alpha, beta);
+    deblock_v8_luma_intra_8_mmi(pix + 8, stride, alpha, beta);
+}
+
+void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    DECLARE_ALIGNED(8, const uint64_t, stack[0x0d]);
+    double ftmp[9];
+    mips_reg addr[8];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x4                    \n\t"
+        PTR_ADDU   "%[addr2],   %[stride],      %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x10)
+        MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
+        PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        MMI_LDC1(%[ftmp8], %[stack], 0x10)
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp0], %[stack], 0x00)
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x10)
+        MMI_SDC1(%[ftmp3], %[stack], 0x20)
+        MMI_SDC1(%[ftmp7], %[stack], 0x30)
+        MMI_SDC1(%[ftmp5], %[stack], 0x40)
+        MMI_SDC1(%[ftmp6], %[stack], 0x50)
+        PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x18)
+        MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        MMI_LDC1(%[ftmp8], %[stack], 0x18)
+        MMI_SDC1(%[ftmp0], %[stack], 0x08)
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x18)
+        MMI_SDC1(%[ftmp3], %[stack], 0x28)
+        MMI_SDC1(%[ftmp7], %[stack], 0x38)
+        MMI_SDC1(%[ftmp5], %[stack], 0x48)
+        MMI_SDC1(%[ftmp6], %[stack], 0x58)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6]),            [addr7]"=&r"(addr[7])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [stack]"r"(stack)
+        : "memory"
+    );
+
+    ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x02                   \n\t"
+        PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        MMI_LDC1(%[ftmp0], %[stack], 0x10)
+        MMI_LDC1(%[ftmp1], %[stack], 0x20)
+        MMI_LDC1(%[ftmp2], %[stack], 0x30)
+        MMI_LDC1(%[ftmp3], %[stack], 0x40)
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr1], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr5], 0x00)
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
+        MMI_USWC1(%[ftmp4], %[addr3], 0x00)
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
+        MMI_LDC1(%[ftmp0], %[stack], 0x18)
+        MMI_LDC1(%[ftmp1], %[stack], 0x28)
+        MMI_LDC1(%[ftmp2], %[stack], 0x38)
+        MMI_LDC1(%[ftmp3], %[stack], 0x48)
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
+        PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr1], 0x00)
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr5], 0x00)
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr5], 0x00)
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        MMI_USWC1(%[ftmp4], %[addr3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6]),            [addr7]"=&r"(addr[7])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [stack]"r"(stack)
+        : "memory"
+    );
+}
+
+void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    DECLARE_ALIGNED(8, const uint64_t, ptmp[0x11]);
+    DECLARE_ALIGNED(8, const uint64_t, pdat[0x04]);
+    double ftmp[9];
+    mips_reg addr[7];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x04                   \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr3],   %[addr0],       %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
+        MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x00)
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_SDC1(%[ftmp2], %[ptmp], 0x20)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x00)
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x00)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x10)
+        MMI_SDC1(%[ftmp7], %[ptmp], 0x40)
+        MMI_SDC1(%[ftmp4], %[ptmp], 0x50)
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x20)
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x20)
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x30)
+        MMI_SDC1(%[ftmp6], %[ptmp], 0x60)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x70)
+        PTR_ADDU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
+        MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x08)
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_SDC1(%[ftmp2], %[ptmp], 0x28)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x08)
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x08)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x18)
+        MMI_SDC1(%[ftmp7], %[ptmp], 0x48)
+        MMI_SDC1(%[ftmp4], %[ptmp], 0x58)
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x28)
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x28)
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x38)
+        MMI_SDC1(%[ftmp6], %[ptmp], 0x68)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x78)
+        PTR_S      "%[addr1],   0x00(%[pdat])                           \n\t"
+        PTR_S      "%[addr2],   0x08(%[pdat])                           \n\t"
+        PTR_S      "%[addr0],   0x10(%[pdat])                           \n\t"
+        PTR_S      "%[addr3],   0x18(%[pdat])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [ptmp]"r"(ptmp),                  [pdat]"r"(pdat)
+        : "memory"
+    );
+
+    ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta);
+
+    __asm__ volatile (
+        PTR_L      "%[addr1],   0x00(%[pdat])                           \n\t"
+        PTR_L      "%[addr2],   0x08(%[pdat])                           \n\t"
+        PTR_L      "%[addr0],   0x10(%[pdat])                           \n\t"
+        PTR_L      "%[addr3],   0x18(%[pdat])                           \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        MMI_LDC1(%[ftmp0], %[ptmp], 0x08)
+        MMI_LDC1(%[ftmp1], %[ptmp], 0x18)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x28)
+        MMI_LDC1(%[ftmp3], %[ptmp], 0x38)
+        MMI_LDC1(%[ftmp4], %[ptmp], 0x48)
+        MMI_LDC1(%[ftmp5], %[ptmp], 0x58)
+        MMI_LDC1(%[ftmp6], %[ptmp], 0x68)
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x78)
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        MMI_USDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_USDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        MMI_USDC1(%[ftmp0], %[addr1], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
+        MMI_USDC1(%[ftmp5], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        MMI_USDC1(%[ftmp7], %[addr6], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        MMI_USDC1(%[ftmp4], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        MMI_USDC1(%[ftmp3], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        MMI_USDC1(%[ftmp0], %[addr4], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
+        MMI_USDC1(%[ftmp6], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
+        MMI_USDC1(%[ftmp5], %[addr6], 0x00)
+        PTR_SUBU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
+        PTR_SUBU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
+        MMI_LDC1(%[ftmp0], %[ptmp], 0x00)
+        MMI_LDC1(%[ftmp1], %[ptmp], 0x10)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x20)
+        MMI_LDC1(%[ftmp3], %[ptmp], 0x30)
+        MMI_LDC1(%[ftmp4], %[ptmp], 0x40)
+        MMI_LDC1(%[ftmp5], %[ptmp], 0x50)
+        MMI_LDC1(%[ftmp6], %[ptmp], 0x60)
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x70)
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        MMI_USDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_USDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        MMI_USDC1(%[ftmp0], %[addr1], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
+        MMI_USDC1(%[ftmp5], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        MMI_USDC1(%[ftmp7], %[addr6], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        MMI_USDC1(%[ftmp4], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        MMI_USDC1(%[ftmp3], %[addr5], 0x00)
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        MMI_USDC1(%[ftmp0], %[addr4], 0x00)
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
+        MMI_USDC1(%[ftmp6], %[addr5], 0x00)
+        MMI_USDC1(%[ftmp5], %[addr6], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [ptmp]"r"(ptmp),                  [pdat]"r"(pdat)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
new file mode 100644
index 0000000..e50f5ca
--- /dev/null
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -0,0 +1,2600 @@
+/*
+ * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
+                            int32_t log2_denom, int32_t src_weight,
+                            int32_t offset_in)
+{
+    uint32_t tp0, tp1, offset_val;
+    v16u8 zero = { 0 };
+    v16u8 src0 = { 0 };
+    v8i16 src0_r, tmp0, wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    LW2(data, stride, tp0, tp1);
+    INSERT_W2_UB(tp0, tp1, src0);
+    src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
+    tmp0 = wgt * src0_r;
+    tmp0 = __msa_adds_s_h(tmp0, offset);
+    tmp0 = __msa_maxi_s_h(tmp0, 0);
+    tmp0 = __msa_srlr_h(tmp0, denom);
+    tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
+    src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+    ST4x2_UB(src0, data, stride);
+}
+
+static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+                            int32_t src_weight, int32_t offset_in)
+{
+    uint32_t tp0, tp1, tp2, tp3, offset_val;
+    v16u8 src0 = { 0 };
+    v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    LW4(data, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    UNPCK_UB_SH(src0, src0_r, src1_r);
+    MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
+    ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
+    MAXI_SH2_SH(tmp0, tmp1, 0);
+    tmp0 = __msa_srlr_h(tmp0, denom);
+    tmp1 = __msa_srlr_h(tmp1, denom);
+    SAT_UH2_SH(tmp0, tmp1, 7);
+    src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
+}
+
+static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+                            int32_t src_weight, int32_t offset_in)
+{
+    uint32_t tp0, tp1, tp2, tp3, offset_val;
+    v16u8 src0 = { 0 }, src1 = { 0 };
+    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
+    v8i16 wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    LW4(data, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
+    UNPCK_UB_SH(src0, src0_r, src1_r);
+    UNPCK_UB_SH(src1, src2_r, src3_r);
+    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
+         tmp3);
+    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+                tmp1, tmp2, tmp3);
+    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
+    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
+    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    ST4x8_UB(src0, src1, data, stride);
+}
+
+static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+                            int32_t src_weight, int32_t offset_in)
+{
+    uint32_t offset_val;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, src1 = { 0 };
+    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
+    v8i16 wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    LD4(data, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    UNPCK_UB_SH(src0, src0_r, src1_r);
+    UNPCK_UB_SH(src1, src2_r, src3_r);
+    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
+         tmp3);
+    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+                tmp1, tmp2, tmp3);
+    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
+    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
+    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    ST8x4_UB(src0, src1, data, stride);
+}
+
+static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+                            int32_t src_weight, int32_t offset_in)
+{
+    uint32_t offset_val;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    LD4(data, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, src2);
+    INSERT_D2_UB(tp2, tp3, src3);
+    UNPCK_UB_SH(src0, src0_r, src1_r);
+    UNPCK_UB_SH(src1, src2_r, src3_r);
+    UNPCK_UB_SH(src2, src4_r, src5_r);
+    UNPCK_UB_SH(src3, src6_r, src7_r);
+    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
+         tmp3);
+    MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
+         tmp7);
+    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+                tmp1, tmp2, tmp3);
+    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
+                tmp5, tmp6, tmp7);
+    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
+                src2, src3);
+    ST8x8_UB(src0, src1, src2, src3, data, stride);
+}
+
+static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
+                             int32_t src_weight, int32_t offset_in)
+{
+    uint32_t offset_val, cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    for (cnt = 2; cnt--;) {
+        LD4(data, stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, src0);
+        INSERT_D2_UB(tp2, tp3, src1);
+        LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, src2);
+        INSERT_D2_UB(tp2, tp3, src3);
+        UNPCK_UB_SH(src0, src0_r, src1_r);
+        UNPCK_UB_SH(src1, src2_r, src3_r);
+        UNPCK_UB_SH(src2, src4_r, src5_r);
+        UNPCK_UB_SH(src3, src6_r, src7_r);
+        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
+             tmp2, tmp3);
+        MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
+             tmp6, tmp7);
+        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
+                    tmp0, tmp1, tmp2, tmp3);
+        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
+                    tmp4, tmp5, tmp6, tmp7);
+        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
+                    src2, src3);
+        ST8x8_UB(src0, src1, src2, src3, data, stride);
+        data += 8 * stride;
+    }
+}
+
+static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint32_t tp0, tp1;
+    v16i8 src_wgt, dst_wgt, wgt, vec0;
+    v16u8 src0 = { 0 }, dst0 = { 0 };
+    v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (src_weight + dst_weight));
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    LW2(src, stride, tp0, tp1);
+    INSERT_W2_UB(tp0, tp1, src0);
+    LW2(dst, stride, tp0, tp1);
+    INSERT_W2_UB(tp0, tp1, dst0);
+    XORI_B2_128_UB(src0, dst0);
+    vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
+    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+    tmp0 >>= denom;
+    tmp0 = __msa_maxi_s_h(tmp0, 0);
+    tmp0 = __msa_min_s_h(max255, tmp0);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+    ST4x2_UB(dst0, dst, stride);
+}
+
+static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
+    v16u8 src0, dst0;
+    v8i16 tmp0, tmp1, denom, offset;
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (src_weight + dst_weight));
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    LW4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    XORI_B2_128_UB(src0, dst0);
+    ILVRL_B2_SB(dst0, src0, vec0, vec1);
+    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+    tmp0 >>= denom;
+    tmp1 >>= denom;
+    CLIP_SH2_0_255(tmp0, tmp1);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
+    v16u8 src0, src1, dst0, dst1;
+    v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (src_weight + dst_weight));
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    LW4(src, stride, tp0, tp1, tp2, tp3);
+    src += 4 * stride;
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    LW4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    XORI_B4_128_UB(src0, src1, dst0, dst1);
+    ILVRL_B2_SB(dst0, src0, vec0, vec1);
+    ILVRL_B2_SB(dst1, src1, vec2, vec3);
+    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+    ST4x8_UB(dst0, dst1, dst, stride);
+}
+
+static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
+    v16u8 src0, src1, dst0, dst1;
+    v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (src_weight + dst_weight));
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    LD4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    XORI_B4_128_UB(src0, src1, dst0, dst1);
+    ILVRL_B2_SB(dst0, src0, vec0, vec1);
+    ILVRL_B2_SB(dst1, src1, vec2, vec3);
+    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+}
+
+static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (src_weight + dst_weight));
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    LD4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, src2);
+    INSERT_D2_UB(tp2, tp3, src3);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
+    ILVRL_B2_SB(dst0, src0, vec0, vec1);
+    ILVRL_B2_SB(dst1, src1, vec2, vec3);
+    ILVRL_B2_SB(dst2, src2, vec4, vec5);
+    ILVRL_B2_SB(dst3, src3, vec6, vec7);
+    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+    PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                               int32_t log2_denom, int32_t src_weight,
+                               int32_t dst_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 denom, offset;
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (src_weight + dst_weight));
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    for (cnt = 2; cnt--;) {
+        LD4(src, stride, tp0, tp1, tp2, tp3);
+        src += 4 * stride;
+        INSERT_D2_UB(tp0, tp1, src0);
+        INSERT_D2_UB(tp2, tp3, src1);
+        LD4(src, stride, tp0, tp1, tp2, tp3);
+        src += 4 * stride;
+        INSERT_D2_UB(tp0, tp1, src2);
+        INSERT_D2_UB(tp2, tp3, src3);
+        LD4(dst, stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst2);
+        INSERT_D2_UB(tp2, tp3, dst3);
+        XORI_B4_128_UB(src0, src1, src2, src3);
+        XORI_B4_128_UB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   vec0, vec2, vec4, vec6);
+        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   vec1, vec3, vec5, vec7);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        SRA_4V(temp4, temp5, temp6, temp7, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                    dst0, dst1, dst2, dst3);
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+        dst += 8 * stride;
+    }
+}
+
+#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
+                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
+                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
+                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
+{                                                                           \
+    v8i16 threshold;                                                        \
+    v8i16 const3 = __msa_ldi_h(3);                                          \
+                                                                            \
+    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
+    threshold += (p1_or_q1_org_in);                                         \
+                                                                            \
+    (p0_or_q0_out) = threshold << 1;                                        \
+    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
+    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
+    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
+                                                                            \
+    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
+    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
+                                                                            \
+    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
+    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
+    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
+    (p2_or_q2_out) += threshold;                                            \
+    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
+}
+
+/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
+#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
+                         p1_or_q1_org_in, p0_or_q0_out)      \
+{                                                            \
+    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
+    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
+    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
+    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
+}
+
+#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
+                         p1_or_q1_org_in, p2_or_q2_org_in,    \
+                         negate_tc_in, tc_in, p1_or_q1_out)   \
+{                                                             \
+    v8i16 clip3, temp;                                        \
+                                                              \
+    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
+                                   (v8u16) q0_or_p0_org_in);  \
+    temp = p1_or_q1_org_in << 1;                              \
+    clip3 = clip3 - temp;                                     \
+    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
+    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);              \
+    p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
+}
+
+#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
+                     p1_or_q1_org_in, q1_or_p1_org_in,          \
+                     negate_threshold_in, threshold_in,         \
+                     p0_or_q0_out, q0_or_p0_out)                \
+{                                                               \
+    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
+                                                                \
+    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
+    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
+    q0_sub_p0 <<= 2;                                            \
+    p1_sub_q1 += 4;                                             \
+    delta = q0_sub_p0 + p1_sub_q1;                              \
+    delta >>= 3;                                                \
+                                                                \
+    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+                                                                \
+    p0_or_q0_out = p0_or_q0_org_in + delta;                     \
+    q0_or_p0_out = q0_or_p0_org_in - delta;                     \
+                                                                \
+    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
+}
+
+#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
+{                                                                        \
+    uint32_t load0, load1, load2, load3;                                 \
+    v16u8 src0 = { 0 };                                                  \
+    v16u8 src1 = { 0 };                                                  \
+    v16u8 src2 = { 0 };                                                  \
+    v16u8 src3 = { 0 };                                                  \
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
+    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
+    v8i16 res0_r, res1_r;                                                \
+    v16i8 zeros = { 0 };                                                 \
+    v16u8 res0, res1;                                                    \
+                                                                         \
+    LW4((src - 2), stride, load0, load1, load2, load3);                  \
+    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
+    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
+    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
+    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
+                                                                         \
+    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
+                                                                         \
+    p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
+    p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
+    q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
+                                                                         \
+    tc = __msa_fill_h(tc_val);                                           \
+                                                                         \
+    is_less_than_alpha = (p0_asub_q0 < alpha);                           \
+    is_less_than_beta = (p1_asub_p0 < beta);                             \
+    is_less_than = is_less_than_alpha & is_less_than_beta;               \
+    is_less_than_beta = (q1_asub_q0 < beta);                             \
+    is_less_than = is_less_than_beta & is_less_than;                     \
+                                                                         \
+    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
+    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
+                                                                         \
+    q0_sub_p0 <<= 2;                                                     \
+    delta = q0_sub_p0 + p1_sub_q1;                                       \
+    delta = __msa_srari_h(delta, 3);                                     \
+                                                                         \
+    delta = CLIP_SH(delta, -tc, tc);                                     \
+                                                                         \
+    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
+                                                                         \
+    res0_r += delta;                                                     \
+    res1_r -= delta;                                                     \
+                                                                         \
+    CLIP_SH2_0_255(res0_r, res1_r);                                      \
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
+                                                                         \
+    res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
+    res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
+                                                                         \
+    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
+}
+
+#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
+{                                                            \
+    v16i8 zero_m = { 0 };                                    \
+                                                             \
+    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
+    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
+    SLDI_B2_0_UB(out1, out2, out2, out3, 2);                 \
+}
+
+#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
+{                                                                          \
+    uint32_t load0, load1;                                                 \
+    v16u8 src0 = { 0 };                                                    \
+    v16u8 src1 = { 0 };                                                    \
+    v16u8 src2 = { 0 };                                                    \
+    v16u8 src3 = { 0 };                                                    \
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
+    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
+    v16i8 zeros = { 0 };                                                   \
+    v16u8 res0, res1;                                                      \
+                                                                           \
+    load0 = LW(src - 2);                                                   \
+    load1 = LW(src - 2 + stride);                                          \
+                                                                           \
+    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
+    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
+                                                                           \
+    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
+                                                                           \
+    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
+    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
+    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
+                                                                           \
+    tc = __msa_fill_h(tc_val);                                             \
+                                                                           \
+    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
+    is_less_than_beta = (p1_asub_p0 < beta);                               \
+    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
+    is_less_than_beta = (q1_asub_q0 < beta);                               \
+    is_less_than = is_less_than_beta & is_less_than;                       \
+                                                                           \
+    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
+    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
+                                                                           \
+    q0_sub_p0 <<= 2;                                                       \
+    delta = q0_sub_p0 + p1_sub_q1;                                         \
+    delta = __msa_srari_h(delta, 3);                                       \
+    delta = CLIP_SH(delta, -tc, tc);                                       \
+                                                                           \
+    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
+                                                                           \
+    res0_r += delta;                                                       \
+    res1_r -= delta;                                                       \
+                                                                           \
+    CLIP_SH2_0_255(res0_r, res1_r);                                        \
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
+                                                                           \
+    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
+    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
+                                                                           \
+    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
+}
+
+static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+
+    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
+
+    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha_in);
+    is_less_than_beta = (p1_asub_p0 < beta_in);
+    is_less_than = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta = (q1_asub_q0 < beta_in);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
+        v8i16 p0_r = { 0 };
+        v8i16 q0_r = { 0 };
+        v8i16 p0_l = { 0 };
+        v8i16 q0_l = { 0 };
+        v16i8 zero = { 0 };
+        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+        v16u8 q2_org = LD_UB(data + (2 * img_width));
+        v16u8 p2_org = LD_UB(data - (3 * img_width));
+        v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
+
+        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+        tmp_flag = (p0_asub_q0 < tmp_flag);
+
+        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+        is_less_than_beta = (p2_asub_p0 < beta_in);
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+        q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+
+        /* combine and store */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v8i16 p3_org_l, p3_org_r;
+            v16u8 p3_org = LD_UB(data - (img_width << 2));
+            v16u8 p2, p1;
+            v8i16 p2_r = { 0 };
+            v8i16 p2_l = { 0 };
+            v8i16 p1_r = { 0 };
+            v8i16 p1_l = { 0 };
+
+            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
+                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);
+
+            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
+                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);
+
+            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
+            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
+
+            ST_UB(p1_org, data - (2 * img_width));
+            ST_UB(p2_org, data - (3 * img_width));
+        }
+
+        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
+        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
+
+        /* combine */
+        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
+        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
+
+        ST_UB(p0_org, data - img_width);
+
+        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
+        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+        is_less_than_beta = (q2_asub_q0 < beta_in);
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* combine and store */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v8i16 q3_org_r, q3_org_l;
+            v16u8 q3_org = LD_UB(data + (3 * img_width));
+            v16u8 q1, q2;
+            v8i16 q2_r = { 0 };
+            v8i16 q2_l = { 0 };
+            v8i16 q1_r = { 0 };
+            v8i16 q1_l = { 0 };
+
+            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
+                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);
+
+            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
+                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);
+
+            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
+            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
+
+            ST_UB(q1_org, data + img_width);
+            ST_UB(q2_org, data + 2 * img_width);
+        }
+
+        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
+        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
+
+        /* combine */
+        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
+        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
+
+        ST_UB(q0_org, data);
+    }
+}
+
+static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    uint8_t *src = data - 4;
+    v16u8 alpha, beta, p0_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v16u8 p1_asub_p0, q1_asub_q0;
+
+
+    {
+        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+
+        LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
+        LD_UB8(src + (8 * img_width), img_width,
+               row8, row9, row10, row11, row12, row13, row14, row15);
+
+        TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
+                            row4, row5, row6, row7,
+                            row8, row9, row10, row11,
+                            row12, row13, row14, row15,
+                            p3_org, p2_org, p1_org, p0_org,
+                            q0_org, q1_org, q2_org, q3_org);
+    }
+
+    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        v8i16 p0_r = { 0 };
+        v8i16 q0_r = { 0 };
+        v8i16 p0_l = { 0 };
+        v8i16 q0_l = { 0 };
+        v16i8 zero = { 0 };
+        v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
+        v16u8 negate_is_less_than_beta;
+        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+
+        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+        UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
+
+        tmp_flag = alpha >> 2;
+        tmp_flag = tmp_flag + 2;
+        tmp_flag = (p0_asub_q0 < tmp_flag);
+
+        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+        is_less_than_beta = (p2_asub_p0 < beta);
+        is_less_than_beta = tmp_flag & is_less_than_beta;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v16u8 p2, p1;
+            v8i16 p3_org_r, p3_org_l;
+            v8i16 p2_l = { 0 };
+            v8i16 p2_r = { 0 };
+            v8i16 p1_l = { 0 };
+            v8i16 p1_r = { 0 };
+
+            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
+                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);
+
+            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
+                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
+
+            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
+            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
+        }
+
+        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
+        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
+
+        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
+        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
+
+        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+        is_less_than_beta = (q2_asub_q0 < beta);
+
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v16u8 q1, q2;
+            v8i16 q3_org_r, q3_org_l;
+            v8i16 q1_l = { 0 };
+            v8i16 q1_r = { 0 };
+            v8i16 q2_l = { 0 };
+            v8i16 q2_r = { 0 };
+
+            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
+                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);
+
+            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
+            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
+                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);
+
+            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
+            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
+        }
+
+        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
+        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
+
+        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
+        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
+
+    {
+        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+        ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
+        ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
+        ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
+
+        ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
+        ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
+
+        src = data - 3;
+        ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp2, 0, src + 4, img_width);
+        src += 4 * img_width;
+        ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp2, 4, src + 4, img_width);
+        src += 4 * img_width;
+
+        ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp5, 0, src + 4, img_width);
+        src += 4 * img_width;
+        ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp5, 4, src + 4, img_width);
+    }
+    }
+}
+
+static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
+                                                   int32_t alpha_in,
+                                                   int32_t beta_in)
+{
+    uint64_t load0, load1;
+    uint32_t out0, out2;
+    uint16_t out1, out3;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
+    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
+    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 alpha, beta;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
+    v16u8 is_less_than_beta1, is_less_than_beta2;
+    v16i8 src0 = { 0 };
+    v16i8 src1 = { 0 };
+    v16i8 src2 = { 0 };
+    v16i8 src3 = { 0 };
+    v16i8 src4 = { 0 };
+    v16i8 src5 = { 0 };
+    v16i8 src6 = { 0 };
+    v16i8 src7 = { 0 };
+    v16i8 zeros = { 0 };
+
+    load0 = LD(src - 4);
+    load1 = LD(src + stride - 4);
+    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
+    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
+
+    load0 = LD(src + (2 * stride) - 4);
+    load1 = LD(src + (3 * stride) - 4);
+    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
+    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
+
+    load0 = LD(src + (4 * stride) - 4);
+    load1 = LD(src + (5 * stride) - 4);
+    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
+    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
+
+    load0 = LD(src + (6 * stride) - 4);
+    load1 = LD(src + (7 * stride) - 4);
+    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
+    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+               src0, src1, src2, src3);
+
+    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
+    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
+
+    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
+    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
+    SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
+
+    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
+    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
+    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_alpha & is_less_than_beta;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than & is_less_than_beta;
+
+    alpha >>= 2;
+    alpha += 2;
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+
+    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
+    is_less_than_beta1 = (p2_asub_p0 < beta);
+    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
+    is_less_than_beta2 = (q2_asub_q0 < beta);
+
+    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
+               src4_r, src5_r, src6_r, src7_r);
+
+    dst2_x_r = src1_r + src2_r + src3_r;
+    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
+    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
+    dst1_r = src0_r + src1_r + src2_r + src3_r;
+    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
+
+    dst0_r = (2 * src6_r) + (3 * src0_r);
+    dst0_r += src1_r + src2_r + src3_r;
+    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
+    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
+    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
+
+    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
+    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
+
+    dst3_x_r = src2_r + src3_r + src4_r;
+    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
+    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
+    dst4_r = src2_r + src3_r + src4_r + src5_r;
+    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
+
+    dst5_r = (2 * src7_r) + (3 * src5_r);
+    dst5_r += src4_r + src3_r + src2_r;
+    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
+    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
+    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
+
+    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
+    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
+
+    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
+    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
+    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
+    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
+
+    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
+
+    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
+    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
+    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
+    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
+
+    is_less_than = is_less_than_alpha & is_less_than;
+    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
+    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
+    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
+
+    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
+    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
+    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
+    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
+    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
+    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
+
+    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
+    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
+    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
+    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
+    SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
+    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
+    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
+    SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
+
+    out0 = __msa_copy_u_w((v4i32) dst0, 0);
+    out1 = __msa_copy_u_h((v8i16) dst0, 2);
+    out2 = __msa_copy_u_w((v4i32) dst1, 0);
+    out3 = __msa_copy_u_h((v8i16) dst1, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+
+    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
+    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
+    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
+    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+
+    out0 = __msa_copy_u_w((v4i32) dst4, 0);
+    out1 = __msa_copy_u_h((v8i16) dst4, 2);
+    out2 = __msa_copy_u_w((v4i32) dst5, 0);
+    out3 = __msa_copy_u_h((v8i16) dst5, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+
+    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
+    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
+    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
+    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+}
+
+static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16u8 alpha, beta;
+    v16u8 is_less_than;
+    v8i16 p0_or_q0, q0_or_p0;
+    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
+    v16i8 zero = { 0 };
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than_beta;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
+           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
+
+    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
+    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
+    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
+                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
+        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
+        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
+
+        p0_or_q0_org =
+            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
+        q0_or_p0_org =
+            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
+
+        ST_UB(q0_or_p0_org, data_cb_or_cr);
+        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
+    }
+}
+
+static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v8i16 tmp1;
+    v16u8 alpha, beta, is_less_than;
+    v8i16 p0_or_q0, q0_or_p0;
+    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
+    v16i8 zero = { 0 };
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than_beta;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+
+    {
+        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+        LD_UB8((data_cb_or_cr - 2), img_width,
+               row0, row1, row2, row3, row4, row5, row6, row7);
+
+        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                           p1_or_q1_org, p0_or_q0_org,
+                           q0_or_p0_org, q1_or_p1_org);
+    }
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
+    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
+    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
+                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
+        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
+
+        /* convert 16 bit output into 8 bit output */
+        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
+
+        p0_or_q0_org =
+            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
+        q0_or_p0_org =
+            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
+        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
+
+        data_cb_or_cr -= 1;
+        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
+        data_cb_or_cr += 4 * img_width;
+        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
+    }
+}
+
+static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
+                                                   uint8_t bs0, uint8_t bs1,
+                                                   uint8_t bs2, uint8_t bs3,
+                                                   uint8_t tc0, uint8_t tc1,
+                                                   uint8_t tc2, uint8_t tc3,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    v16u8 tmp_vec, bs = { 0 };
+
+    tmp_vec = (v16u8) __msa_fill_b(bs0);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs1);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs2);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs3);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
+
+    if (!__msa_test_bz_v(bs)) {
+        uint8_t *src = data - 4;
+        v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
+        v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
+        v16u8 is_bs_greater_than0;
+        v16u8 tc = { 0 };
+        v16i8 zero = { 0 };
+
+        tmp_vec = (v16u8) __msa_fill_b(tc0);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc1);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc2);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc3);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
+
+        is_bs_greater_than0 = (zero < bs);
+
+        {
+            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+
+            LD_UB8(src, img_width,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * img_width);
+            LD_UB8(src, img_width,
+                   row8, row9, row10, row11, row12, row13, row14, row15);
+
+            TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                                row8, row9, row10, row11,
+                                row12, row13, row14, row15,
+                                p3_org, p2_org, p1_org, p0_org,
+                                q0_org, q1_org, q2_org, q3_org);
+        }
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+        is_less_than = is_less_than & is_bs_greater_than0;
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            v16i8 negate_tc, sign_negate_tc;
+            v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
+            v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
+            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+            v8i16 p0_r, q0_r, p0_l, q0_l;
+
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
+
+            UNPCK_UB_SH(tc, tc_r, tc_l);
+            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+            is_less_than_beta = (p2_asub_p0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                v16u8 p1;
+                v8i16 p1_r = { 0 };
+                v8i16 p1_l = { 0 };
+                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
+                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
+
+                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
+                                 negate_tc_r, tc_r, p1_r);
+                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
+                                 i16_negatetc_l, tc_l, p1_l);
+
+                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
+                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + is_less_than_beta;
+            }
+
+            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+            is_less_than_beta = (q2_asub_q0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                v16u8 q1;
+                v8i16 q1_r = { 0 };
+                v8i16 q1_l = { 0 };
+                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
+                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
+
+                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
+                                 negate_tc_r, tc_r, q1_r);
+                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
+                                 i16_negatetc_l, tc_l, q1_l);
+
+                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
+                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + is_less_than_beta;
+            }
+
+            {
+                v8i16 threshold_r, negate_thresh_r;
+                v8i16 threshold_l, negate_thresh_l;
+                v16i8 negate_thresh, sign_negate_thresh;
+
+                negate_thresh = zero - (v16i8) tc;
+                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
+
+                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
+                           threshold_r, negate_thresh_r);
+
+                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
+                             negate_thresh_r, threshold_r, p0_r, q0_r);
+
+                threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
+                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
+                                                       negate_thresh);
+
+                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
+                             negate_thresh_l, threshold_l, p0_l, q0_l);
+            }
+
+            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+
+        {
+            v16i8 tp0, tp1, tp2, tp3;
+            v8i16 tmp2, tmp5;
+            v4i32 tmp3, tmp4, tmp6, tmp7;
+            uint32_t out0, out2;
+            uint16_t out1, out3;
+
+            src = data - 3;
+
+            ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
+            ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
+            ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
+
+            ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
+            ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
+
+            out0 = __msa_copy_u_w(tmp3, 0);
+            out1 = __msa_copy_u_h(tmp2, 0);
+            out2 = __msa_copy_u_w(tmp3, 1);
+            out3 = __msa_copy_u_h(tmp2, 1);
+
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp3, 2);
+            out1 = __msa_copy_u_h(tmp2, 2);
+            out2 = __msa_copy_u_w(tmp3, 3);
+            out3 = __msa_copy_u_h(tmp2, 3);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp4, 0);
+            out1 = __msa_copy_u_h(tmp2, 4);
+            out2 = __msa_copy_u_w(tmp4, 1);
+            out3 = __msa_copy_u_h(tmp2, 5);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp4, 2);
+            out1 = __msa_copy_u_h(tmp2, 6);
+            out2 = __msa_copy_u_w(tmp4, 3);
+            out3 = __msa_copy_u_h(tmp2, 7);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp6, 0);
+            out1 = __msa_copy_u_h(tmp5, 0);
+            out2 = __msa_copy_u_w(tmp6, 1);
+            out3 = __msa_copy_u_h(tmp5, 1);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp6, 2);
+            out1 = __msa_copy_u_h(tmp5, 2);
+            out2 = __msa_copy_u_w(tmp6, 3);
+            out3 = __msa_copy_u_h(tmp5, 3);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp7, 0);
+            out1 = __msa_copy_u_h(tmp5, 4);
+            out2 = __msa_copy_u_w(tmp7, 1);
+            out3 = __msa_copy_u_h(tmp5, 5);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp7, 2);
+            out1 = __msa_copy_u_h(tmp5, 6);
+            out2 = __msa_copy_u_w(tmp7, 3);
+            out3 = __msa_copy_u_h(tmp5, 7);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+        }
+        }
+    }
+}
+
+static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
+                                                   uint8_t bs0, uint8_t bs1,
+                                                   uint8_t bs2, uint8_t bs3,
+                                                   uint8_t tc0, uint8_t tc1,
+                                                   uint8_t tc2, uint8_t tc3,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t image_width)
+{
+    v16u8 tmp_vec;
+    v16u8 bs = { 0 };
+
+    tmp_vec = (v16u8) __msa_fill_b(bs0);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs1);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs2);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs3);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
+
+    if (!__msa_test_bz_v(bs)) {
+        v16u8 alpha, beta, is_less_than, is_less_than_beta;
+        v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
+        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+        v16u8 is_less_than_alpha, is_bs_greater_than0;
+        v8i16 p0_r, q0_r, p0_l, q0_l;
+        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+        v16i8 zero = { 0 };
+        v16i8 tc = { 0 };
+
+        tmp_vec = (v16u8) __msa_fill_b(tc0);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc1);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc2);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc3);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        LD_UB5(data - (3 * image_width), image_width,
+               p2_org, p1_org, p0_org, q0_org, q1_org);
+
+        is_bs_greater_than0 = ((v16u8) zero < bs);
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+        is_less_than = is_less_than & is_bs_greater_than0;
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            v16i8 sign_negate_tc, negate_tc;
+            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
+            v16u8 p2_asub_p0, q2_asub_q0;
+
+            q2_org = LD_UB(data + (2 * image_width));
+            negate_tc = zero - tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
+
+            UNPCK_UB_SH(tc, tc_r, tc_l);
+            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+            is_less_than_beta = (p2_asub_p0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                v16u8 p1;
+                v8i16 p1_r = { 0 };
+                v8i16 p1_l = { 0 };
+                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
+                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
+
+                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
+                                 negate_tc_r, tc_r, p1_r);
+                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
+                                 i16_negatetc_l, tc_l, p1_l);
+
+                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
+                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+                ST_UB(p1_org, data - (2 * image_width));
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + (v16i8) is_less_than_beta;
+            }
+
+            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+            is_less_than_beta = (q2_asub_q0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                v16u8 q1;
+                v8i16 q1_r = { 0 };
+                v8i16 q1_l = { 0 };
+                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
+                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
+
+                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
+                                 negate_tc_r, tc_r, q1_r);
+                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
+                                 i16_negatetc_l, tc_l, q1_l);
+
+                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
+                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+                ST_UB(q1_org, data + image_width);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + (v16i8) is_less_than_beta;
+            }
+            {
+                v16i8 negate_thresh, sign_negate_thresh;
+                v8i16 threshold_r, threshold_l;
+                v8i16 negate_thresh_l, negate_thresh_r;
+
+                negate_thresh = zero - tc;
+                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
+
+                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
+                           threshold_r, negate_thresh_r);
+                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
+                             negate_thresh_r, threshold_r, p0_r, q0_r);
+
+                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
+                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
+                                                       negate_thresh);
+                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
+                             negate_thresh_l, threshold_l, p0_l, q0_l);
+            }
+
+            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+
+            ST_UB(p0_org, (data - image_width));
+            ST_UB(q0_org, data);
+        }
+    }
+}
+
+static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
+                                             int32_t alpha_in, int32_t beta_in,
+                                             int8_t *tc0)
+{
+    uint8_t *data = in;
+    uint32_t out0, out1, out2, out3;
+    uint64_t load;
+    uint32_t tc_val;
+    v16u8 alpha, beta;
+    v16i8 inp0 = { 0 };
+    v16i8 inp1 = { 0 };
+    v16i8 inp2 = { 0 };
+    v16i8 inp3 = { 0 };
+    v16i8 inp4 = { 0 };
+    v16i8 inp5 = { 0 };
+    v16i8 inp6 = { 0 };
+    v16i8 inp7 = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 src4, src5, src6, src7;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
+    v16u8 is_less_than_beta1, is_less_than_beta2;
+    v8i16 tc, tc_orig_r, tc_plus1;
+    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
+    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
+    v8u16 src2_r, src3_r;
+    v8i16 p2_r, p1_r, q2_r, q1_r;
+    v16u8 p2, q2, p0, q0;
+    v4i32 dst0, dst1;
+    v16i8 zeros = { 0 };
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    if (tc0[0] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
+        load = LD(data - 3 + stride);
+        inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[1] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
+        load = LD(data - 3 + stride);
+        inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[2] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
+        load = LD(data - 3 + stride);
+        inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[3] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
+        load = LD(data - 3 + stride);
+        inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
+        data += (2 * stride);
+    }
+
+    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
+               src0, src1, src2, src3);
+
+    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
+    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
+
+    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
+    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
+    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
+    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
+    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
+    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
+
+    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
+    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
+    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
+    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
+    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_alpha & is_less_than_beta;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    is_less_than_beta1 = (p2_asub_p0 < beta);
+    is_less_than_beta2 = (q2_asub_q0 < beta);
+
+    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
+    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
+    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
+
+    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
+    p2_r += p0_add_q0;
+    p2_r >>= 1;
+    p2_r -= p1_r;
+    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
+    q2_r += p0_add_q0;
+    q2_r >>= 1;
+    q2_r -= q1_r;
+
+    tc_val = LW(tc0);
+    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
+    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
+    is_tc_orig1 = tc_orig;
+    is_tc_orig2 = tc_orig;
+    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
+    tc = tc_orig_r;
+
+    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
+    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
+
+    p2_r += p1_r;
+    q2_r += q1_r;
+
+    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
+
+    is_tc_orig1 = (zeros < is_tc_orig1);
+    is_tc_orig2 = is_tc_orig1;
+    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
+    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
+    is_tc_orig1 = is_less_than & is_tc_orig1;
+    is_tc_orig2 = is_less_than & is_tc_orig2;
+
+    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
+    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
+
+    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
+    q0_sub_p0 <<= 2;
+    p1_sub_q1 = p1_r - q1_r;
+    q0_sub_p0 += p1_sub_q1;
+    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
+
+    tc_plus1 = tc + 1;
+    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
+                                              (v16i8) is_less_than_beta1);
+    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
+    tc_plus1 = tc + 1;
+    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
+                                              (v16i8) is_less_than_beta2);
+    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
+
+    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
+
+    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
+    src2_r += q0_sub_p0;
+    src3_r -= q0_sub_p0;
+
+    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
+    src3_r = (v8u16) CLIP_SH_0_255(src3_r);
+
+    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
+
+    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
+    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
+
+    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
+
+    ILVRL_H2_SW(q2, p2, dst0, dst1);
+
+    data = in;
+
+    out0 = __msa_copy_u_w(dst0, 0);
+    out1 = __msa_copy_u_w(dst0, 1);
+    out2 = __msa_copy_u_w(dst0, 2);
+    out3 = __msa_copy_u_w(dst0, 3);
+
+    if (tc0[0] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out0, (data - 2));
+        data += stride;
+        SW(out1, (data - 2));
+        data += stride;
+    }
+
+    if (tc0[1] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out2, (data - 2));
+        data += stride;
+        SW(out3, (data - 2));
+        data += stride;
+    }
+
+    out0 = __msa_copy_u_w(dst1, 0);
+    out1 = __msa_copy_u_w(dst1, 1);
+    out2 = __msa_copy_u_w(dst1, 2);
+    out3 = __msa_copy_u_w(dst1, 3);
+
+    if (tc0[2] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out0, (data - 2));
+        data += stride;
+        SW(out1, (data - 2));
+        data += stride;
+    }
+
+    if (tc0[3] >= 0) {
+        SW(out2, (data - 2));
+        data += stride;
+        SW(out3, (data - 2));
+    }
+}
+
+static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
+                                                       uint8_t bs0, uint8_t bs1,
+                                                       uint8_t bs2, uint8_t bs3,
+                                                       uint8_t tc0, uint8_t tc1,
+                                                       uint8_t tc2, uint8_t tc3,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16u8 alpha, beta;
+    v8i16 tmp_vec;
+    v8i16 bs = { 0 };
+    v8i16 tc = { 0 };
+    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than;
+    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
+    v8i16 p0_r, q0_r;
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v16i8 negate_tc, sign_negate_tc;
+    v8i16 tc_r, negate_tc_r;
+    v16i8 zero = { 0 };
+
+    tmp_vec = (v8i16) __msa_fill_b(bs0);
+    bs = __msa_insve_h(bs, 0, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs1);
+    bs = __msa_insve_h(bs, 1, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs2);
+    bs = __msa_insve_h(bs, 2, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs3);
+    bs = __msa_insve_h(bs, 3, tmp_vec);
+
+    if (!__msa_test_bz_v((v16u8) bs)) {
+        tmp_vec = (v8i16) __msa_fill_b(tc0);
+        tc = __msa_insve_h(tc, 0, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc1);
+        tc = __msa_insve_h(tc, 1, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc2);
+        tc = __msa_insve_h(tc, 2, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc3);
+        tc = __msa_insve_h(tc, 3, tmp_vec);
+
+        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        LD_UB4(data - (img_width << 1), img_width,
+               p1_org, p0_org, q0_org, q1_org);
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+        is_less_than = is_less_than & is_bs_greater_than0;
+
+        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
+
+            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
+                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
+                         tc_r, p0_r, q0_r);
+
+            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+
+            ST_UB(q0_org, data);
+            ST_UB(p0_org, (data - img_width));
+        }
+    }
+}
+
+static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
+                                                       uint8_t bs0, uint8_t bs1,
+                                                       uint8_t bs2, uint8_t bs3,
+                                                       uint8_t tc0, uint8_t tc1,
+                                                       uint8_t tc2, uint8_t tc3,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    uint8_t *src;
+    v16u8 alpha, beta;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
+    v16u8 p0, q0;
+    v8i16 p0_r = { 0 };
+    v8i16 q0_r = { 0 };
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v16u8 is_bs_greater_than0;
+    v8i16 tc_r, negate_tc_r;
+    v16i8 negate_tc, sign_negate_tc;
+    v16i8 zero = { 0 };
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v8i16 tmp1, tmp_vec, bs = { 0 };
+    v8i16 tc = { 0 };
+
+    tmp_vec = (v8i16) __msa_fill_b(bs0);
+    bs = __msa_insve_h(bs, 0, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs1);
+    bs = __msa_insve_h(bs, 1, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs2);
+    bs = __msa_insve_h(bs, 2, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs3);
+    bs = __msa_insve_h(bs, 3, tmp_vec);
+
+    if (!__msa_test_bz_v((v16u8) bs)) {
+        tmp_vec = (v8i16) __msa_fill_b(tc0);
+        tc = __msa_insve_h(tc, 0, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc1);
+        tc = __msa_insve_h(tc, 1, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc2);
+        tc = __msa_insve_h(tc, 2, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc3);
+        tc = __msa_insve_h(tc, 3, tmp_vec);
+
+        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
+
+        LD_UB8((data - 2), img_width,
+               row0, row1, row2, row3, row4, row5, row6, row7);
+
+        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
+                           row4, row5, row6, row7,
+                           p1_org, p0_org, q0_org, q1_org);
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+        is_less_than = is_bs_greater_than0 & is_less_than;
+
+        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
+                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
+
+            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
+                         tc_r, p0_r, q0_r);
+
+            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
+            src = data - 1;
+            ST2x4_UB(tmp1, 0, src, img_width);
+            src += 4 * img_width;
+            ST2x4_UB(tmp1, 4, src, img_width);
+        }
+    }
+}
+
+static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
+                                            int32_t alpha_in, int32_t beta_in,
+                                            int8_t *tc0)
+{
+    int32_t col, tc_val;
+    v16u8 alpha, beta, res;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    for (col = 0; col < 4; col++) {
+        tc_val = (tc0[col] - 1) + 1;
+
+        if (tc_val <= 0) {
+            src += (4 * stride);
+            continue;
+        }
+
+        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
+        ST2x4_UB(res, 0, (src - 1), stride);
+        src += (4 * stride);
+    }
+}
+
+static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
+                                                  int32_t alpha_in,
+                                                  int32_t beta_in,
+                                                  int8_t *tc0)
+{
+    int32_t col, tc_val;
+    int16_t out0, out1;
+    v16u8 alpha, beta, res;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    for (col = 0; col < 4; col++) {
+        tc_val = (tc0[col] - 1) + 1;
+
+        if (tc_val <= 0) {
+            src += 4 * stride;
+            continue;
+        }
+
+        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
+
+        out0 = __msa_copy_s_h((v8i16) res, 0);
+        out1 = __msa_copy_s_h((v8i16) res, 1);
+
+        SH(out0, (src - 1));
+        src += stride;
+        SH(out1, (src - 1));
+        src += stride;
+    }
+}
+
+void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
+                                           tc[0], tc[1], tc[2], tc[3],
+                                           alpha, beta, img_width);
+}
+
+void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta, int8_t *tc)
+{
+
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
+                                           tc[0], tc[1], tc[2], tc[3],
+                                           alpha, beta, img_width);
+}
+
+void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
+                                               tc[0], tc[1], tc[2], tc[3],
+                                               alpha, beta, img_width);
+}
+
+void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
+                                               tc[0], tc[1], tc[2], tc[3],
+                                               alpha, beta, img_width);
+}
+
+void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta)
+{
+    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
+                                           (uint8_t) beta,
+                                           (unsigned int) img_width);
+}
+
+void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta)
+{
+    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
+                                           (uint8_t) beta,
+                                           (unsigned int) img_width);
+}
+
+void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta)
+{
+    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
+                                               (uint8_t) beta,
+                                               (unsigned int) img_width);
+}
+
+void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta)
+{
+    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
+                                               (uint8_t) beta,
+                                               (unsigned int) img_width);
+}
+
+void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
+                                         int32_t ystride,
+                                         int32_t alpha, int32_t beta,
+                                         int8_t *tc0)
+{
+    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
+}
+
+void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
+                                               int32_t ystride,
+                                               int32_t alpha,
+                                               int32_t beta,
+                                               int8_t *tc0)
+{
+    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
+}
+
+void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
+                                          int32_t ystride,
+                                          int32_t alpha,
+                                          int32_t beta,
+                                          int8_t *tc0)
+{
+    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
+}
+
+void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
+                                                int32_t ystride,
+                                                int32_t alpha,
+                                                int32_t beta)
+{
+    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
+}
+
+void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
+                                   int height, int log2_denom,
+                                   int weight_src, int offset_in)
+{
+    uint32_t offset_val;
+    v16i8 zero = { 0 };
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
+    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+    v8i16 wgt, denom, offset;
+
+    offset_val = (unsigned) offset_in << log2_denom;
+
+    wgt = __msa_fill_h(weight_src);
+    offset = __msa_fill_h(offset_val);
+    denom = __msa_fill_h(log2_denom);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
+               src2_r, src3_r);
+    ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
+               src2_l, src3_l);
+    ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
+               src6_r, src7_r);
+    ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
+               src6_l, src7_l);
+    MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
+         tmp3);
+    MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
+         tmp7);
+    MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
+         tmp11);
+    MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
+         tmp14, tmp15);
+    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
+                tmp1, tmp2, tmp3);
+    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
+                tmp5, tmp6, tmp7);
+    ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
+                tmp9, tmp10, tmp11);
+    ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
+                tmp12, tmp13, tmp14, tmp15);
+    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+    MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
+    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+    SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
+    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+    SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
+    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+                dst2, dst3);
+    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+                dst5, dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
+    src += 8 * stride;
+
+    if (16 == height) {
+        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
+                   src1_r, src2_r, src3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
+                   src1_l, src2_l, src3_l);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
+                   src5_r, src6_r, src7_r);
+        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
+                   src5_l, src6_l, src7_l);
+        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
+             tmp2, tmp3);
+        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
+             tmp6, tmp7);
+        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
+             tmp10, tmp11);
+        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
+             tmp14, tmp15);
+        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
+                    tmp0, tmp1, tmp2, tmp3);
+        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
+                    tmp4, tmp5, tmp6, tmp7);
+        ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
+                    tmp8, tmp9, tmp10, tmp11);
+        ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
+                    tmp12, tmp13, tmp14, tmp15);
+        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
+        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
+        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
+        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
+        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
+        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
+        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+                    dst2, dst3);
+        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+                    dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
+    }
+}
+
+void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset)
+{
+    if (4 == height) {
+        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
+    } else if (8 == height) {
+        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
+    } else {
+        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
+    }
+}
+
+void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset)
+{
+    if (2 == height) {
+        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
+    } else if (4 == height) {
+        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
+    } else {
+        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
+    }
+}
+
+void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride, int height,
+                                     int log2_denom, int weight_dst,
+                                     int weight_src, int offset_in)
+{
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+    v8i16 denom, offset;
+
+    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+    offset_in += (128 * (weight_src + weight_dst));
+
+    src_wgt = __msa_fill_b(weight_src);
+    dst_wgt = __msa_fill_b(weight_dst);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += 8 * stride;
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
+               vec6);
+    ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
+               vec7);
+    ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
+               vec12, vec14);
+    ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
+               vec13, vec15);
+    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+    tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
+    tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
+    tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
+    tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
+    tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
+    tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
+    tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
+    tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
+    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
+    SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
+    SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
+    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+    CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
+    CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
+    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+                dst2, dst3);
+    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+                dst5, dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+    dst += 8 * stride;
+
+    if (16 == height) {
+        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
+        XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
+                   vec4, vec6);
+        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
+                   vec5, vec7);
+        ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
+                   vec12, vec14);
+        ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
+                   vec13, vec15);
+        tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+        tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+        tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+        tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+        tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+        tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+        tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+        tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+        tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
+        tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
+        tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
+        tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
+        tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
+        tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
+        tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
+        tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
+        SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+        SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
+        SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
+        SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
+        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+        CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
+        CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
+        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+                    dst2, dst3);
+        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+                    dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+    }
+}
+
+void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset)
+{
+    if (4 == height) {
+        avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+                          offset);
+    } else if (8 == height) {
+        avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+                          offset);
+    } else {
+        avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+                           offset);
+    }
+}
+
+void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset)
+{
+    if (2 == height) {
+        avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+                          offset);
+    } else if (4 == height) {
+        avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+                          offset);
+    } else {
+        avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+                          offset);
+    }
+}
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
new file mode 100644
index 0000000..1e1a5c8
--- /dev/null
+++ b/libavcodec/mips/h264idct_msa.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+
+#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)          \
+{                                                                         \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+                                                                          \
+    tmp0_m = in0 + in2;                                                   \
+    tmp1_m = in0 - in2;                                                   \
+    tmp2_m = in1 >> 1;                                                    \
+    tmp2_m = tmp2_m - in3;                                                \
+    tmp3_m = in3 >> 1;                                                    \
+    tmp3_m = in1 + tmp3_m;                                                \
+                                                                          \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3);  \
+}
+
+static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                     int32_t de_q_val)
+{
+#define DC_DEST_STRIDE 16
+    int16_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 src1, src3;
+    v8i16 vec0, vec1, vec2, vec3;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 hres0, hres1, hres2, hres3;
+    v8i16 vres0, vres1, vres2, vres3;
+    v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
+    const v4i32 de_q_vec = __msa_fill_w(de_q_val);
+    const v8i16 src0 = LD_SH(src);
+    const v8i16 src2 = LD_SH(src + 8);
+
+    ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
+    TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+    BUTTERFLY_4(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
+    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+    BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
+    UNPCK_R_SH_SW(vres0, vres0_r);
+    UNPCK_R_SH_SW(vres1, vres1_r);
+    UNPCK_R_SH_SW(vres2, vres2_r);
+    UNPCK_R_SH_SW(vres3, vres3_r);
+
+    vres0_r *= de_q_vec;
+    vres1_r *= de_q_vec;
+    vres2_r *= de_q_vec;
+    vres3_r *= de_q_vec;
+
+    SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8);
+    PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);
+
+    out0 = __msa_copy_s_h(vec0, 0);
+    out1 = __msa_copy_s_h(vec0, 1);
+    out2 = __msa_copy_s_h(vec0, 2);
+    out3 = __msa_copy_s_h(vec0, 3);
+    out4 = __msa_copy_s_h(vec0, 4);
+    out5 = __msa_copy_s_h(vec0, 5);
+    out6 = __msa_copy_s_h(vec0, 6);
+    out7 = __msa_copy_s_h(vec0, 7);
+    SH(out0, (dst + 0  * DC_DEST_STRIDE));
+    SH(out1, (dst + 2  * DC_DEST_STRIDE));
+    SH(out2, (dst + 8  * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    SH(out4, (dst + 1  * DC_DEST_STRIDE));
+    SH(out5, (dst + 3  * DC_DEST_STRIDE));
+    SH(out6, (dst + 9  * DC_DEST_STRIDE));
+    SH(out7, (dst + 11 * DC_DEST_STRIDE));
+
+    out0 = __msa_copy_s_h(vec1, 0);
+    out1 = __msa_copy_s_h(vec1, 1);
+    out2 = __msa_copy_s_h(vec1, 2);
+    out3 = __msa_copy_s_h(vec1, 3);
+    out4 = __msa_copy_s_h(vec1, 4);
+    out5 = __msa_copy_s_h(vec1, 5);
+    out6 = __msa_copy_s_h(vec1, 6);
+    out7 = __msa_copy_s_h(vec1, 7);
+    SH(out0, (dst + 4  * DC_DEST_STRIDE));
+    SH(out1, (dst + 6  * DC_DEST_STRIDE));
+    SH(out2, (dst + 12 * DC_DEST_STRIDE));
+    SH(out3, (dst + 14 * DC_DEST_STRIDE));
+    SH(out4, (dst + 5  * DC_DEST_STRIDE));
+    SH(out5, (dst + 7  * DC_DEST_STRIDE));
+    SH(out6, (dst + 13 * DC_DEST_STRIDE));
+    SH(out7, (dst + 15 * DC_DEST_STRIDE));
+
+#undef DC_DEST_STRIDE
+}
+
+static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
+{
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 vec0, vec1, vec2, vec3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
+    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
+    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
+    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
+    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
+    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 zeros = { 0 };
+
+    src[0] += 32;
+
+    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+    ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);
+
+    vec0 = src0 + src4;
+    vec1 = src0 - src4;
+    vec2 = src2 >> 1;
+    vec2 = vec2 - src6;
+    vec3 = src6 >> 1;
+    vec3 = src2 + vec3;
+
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
+
+    vec0 = src7 >> 1;
+    vec0 = src5 - vec0 - src3 - src7;
+    vec1 = src3 >> 1;
+    vec1 = src1 - vec1 + src7 - src3;
+    vec2 = src5 >> 1;
+    vec2 = vec2 - src1 + src7 + src5;
+    vec3 = src1 >> 1;
+    vec3 = vec3 + src3 + src5 + src1;
+    tmp4 = vec3 >> 2;
+    tmp4 += vec0;
+    tmp5 = vec2 >> 2;
+    tmp5 += vec1;
+    tmp6 = vec1 >> 2;
+    tmp6 -= vec2;
+    tmp7 = vec0 >> 2;
+    tmp7 = vec3 - tmp7;
+
+    BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                res0, res1, res2, res3, res4, res5, res6, res7);
+    TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7,
+                       res0, res1, res2, res3, res4, res5, res6, res7);
+    UNPCK_SH_SW(res0, tmp0_r, tmp0_l);
+    UNPCK_SH_SW(res1, tmp1_r, tmp1_l);
+    UNPCK_SH_SW(res2, tmp2_r, tmp2_l);
+    UNPCK_SH_SW(res3, tmp3_r, tmp3_l);
+    UNPCK_SH_SW(res4, tmp4_r, tmp4_l);
+    UNPCK_SH_SW(res5, tmp5_r, tmp5_l);
+    UNPCK_SH_SW(res6, tmp6_r, tmp6_l);
+    UNPCK_SH_SW(res7, tmp7_r, tmp7_l);
+    BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);
+
+    vec2_r = tmp2_r >> 1;
+    vec2_l = tmp2_l >> 1;
+    vec2_r -= tmp6_r;
+    vec2_l -= tmp6_l;
+    vec3_r = tmp6_r >> 1;
+    vec3_l = tmp6_l >> 1;
+    vec3_r += tmp2_r;
+    vec3_l += tmp2_l;
+
+    BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
+    BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);
+
+    vec0_r = tmp7_r >> 1;
+    vec0_l = tmp7_l >> 1;
+    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
+    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
+    vec1_r = tmp3_r >> 1;
+    vec1_l = tmp3_l >> 1;
+    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
+    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
+    vec2_r = tmp5_r >> 1;
+    vec2_l = tmp5_l >> 1;
+    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
+    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
+    vec3_r = tmp1_r >> 1;
+    vec3_l = tmp1_l >> 1;
+    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
+    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
+    tmp1_r = vec3_r >> 2;
+    tmp1_l = vec3_l >> 2;
+    tmp1_r += vec0_r;
+    tmp1_l += vec0_l;
+    tmp3_r = vec2_r >> 2;
+    tmp3_l = vec2_l >> 2;
+    tmp3_r += vec1_r;
+    tmp3_l += vec1_l;
+    tmp5_r = vec1_r >> 2;
+    tmp5_l = vec1_l >> 2;
+    tmp5_r -= vec2_r;
+    tmp5_l -= vec2_l;
+    tmp7_r = vec0_r >> 2;
+    tmp7_l = vec0_l >> 2;
+    tmp7_r = vec3_r - tmp7_r;
+    tmp7_l = vec3_l - tmp7_l;
+
+    BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
+    BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
+    BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
+    BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
+    SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
+    SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
+    SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
+    SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
+    PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
+                res0, res1, res2, res3);
+    PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
+                res4, res5, res6, res7);
+    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
+               tmp4, tmp5, tmp6, tmp7);
+    ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
+         res0, res1, res2, res3);
+    ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
+         res4, res5, res6, res7);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    CLIP_SH4_0_255(res4, res5, res6, res7);
+    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                dst0, dst1, dst2, dst3);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x4_UB(dst2, dst3, dst, dst_stride);
+}
+
+static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                    int32_t dst_stride)
+{
+    int32_t dc_val;
+    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dc;
+    v16i8 zeros = { 0 };
+
+    dc_val = (src[0] + 32) >> 6;
+    dc = __msa_fill_h(dc_val);
+
+    src[0] = 0;
+
+    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
+               dst0_r, dst1_r, dst2_r, dst3_r);
+    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
+               dst4_r, dst5_r, dst6_r, dst7_r);
+    ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
+         dst0_r, dst1_r, dst2_r, dst3_r);
+    ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
+         dst4_r, dst5_r, dst6_r, dst7_r);
+    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
+    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
+    PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
+                dst0, dst1, dst2, dst3);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x4_UB(dst2, dst3, dst, dst_stride);
+}
+
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
+{
+    uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
+    v16i8 dst0_m = { 0 };
+    v16i8 dst1_m = { 0 };
+    v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
+    v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
+    const v8i16 src0 = LD_SH(src);
+    const v8i16 src2 = LD_SH(src + 8);
+    const v8i16 zero = { 0 };
+    const uint8_t *dst1 = dst + dst_stride;
+    const uint8_t *dst2 = dst + 2 * dst_stride;
+    const uint8_t *dst3 = dst + 3 * dst_stride;
+
+    ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
+    ST_SH2(zero, zero, src, 8);
+    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
+    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
+    src0_m = LW(dst);
+    src1_m = LW(dst1);
+    SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
+    src2_m = LW(dst2);
+    src3_m = LW(dst3);
+    ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
+    INSERT_W2_SB(src0_m, src1_m, dst0_m);
+    INSERT_W2_SB(src2_m, src3_m, dst1_m);
+    ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
+    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
+    CLIP_SH2_0_255(res0_m, res1_m);
+    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
+    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
+    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
+    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
+    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
+    SW(out0_m, dst);
+    SW(out1_m, dst1);
+    SW(out2_m, dst2);
+    SW(out3_m, dst3);
+}
+
+void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
+                              int32_t dst_stride)
+{
+    avc_idct8_addblk_msa(dst, src, dst_stride);
+}
+
+void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride)
+{
+    v16u8 pred = { 0 };
+    v16i8 out;
+    v8i16 pred_r, pred_l;
+    const uint32_t src0 = LW(dst);
+    const uint32_t src1 = LW(dst + dst_stride);
+    const uint32_t src2 = LW(dst + 2 * dst_stride);
+    const uint32_t src3 = LW(dst + 3 * dst_stride);
+    const int16_t dc = (src[0] + 32) >> 6;
+    const v8i16 input_dc = __msa_fill_h(dc);
+
+    src[0] = 0;
+    INSERT_W4_UB(src0, src1, src2, src3, pred);
+    UNPCK_UB_SH(pred, pred_r, pred_l);
+    ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
+    CLIP_SH2_0_255(pred_r, pred_l);
+    out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                 int32_t dst_stride)
+{
+    avc_idct8_dc_addblk_msa(dst, src, dst_stride);
+}
+
+void ff_h264_idct_add16_msa(uint8_t *dst,
+                            const int32_t *blk_offset,
+                            int16_t *block, int32_t dst_stride,
+                            const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        int32_t nnz = nzc[scan8[i]];
+
+        if (nnz) {
+            if (nnz == 1 && ((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+            else
+                ff_h264_idct_add_msa(dst + blk_offset[i],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t dst_stride,
+                            const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz) {
+            if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
+                ff_h264_idct8_dc_addblk_msa(dst + blk_offset[cnt],
+                                            block + cnt * 16 * sizeof(pixel),
+                                            dst_stride);
+            else
+                ff_h264_idct8_addblk_msa(dst + blk_offset[cnt],
+                                         block + cnt * 16 * sizeof(pixel),
+                                         dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_msa(uint8_t **dst,
+                           const int32_t *blk_offset,
+                           int16_t *block, int32_t dst_stride,
+                           const uint8_t nzc[15 * 8])
+{
+    int32_t i, j;
+
+    for (j = 1; j < 3; j++) {
+        for (i = (j * 16); i < (j * 16 + 4); i++) {
+            if (nzc[scan8[i]])
+                ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+            else if (((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_422_msa(uint8_t **dst,
+                               const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8])
+{
+    int32_t i, j;
+
+    for (j = 1; j < 3; j++) {
+        for (i = (j * 16); i < (j * 16 + 4); i++) {
+            if (nzc[scan8[i]])
+                ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+            else if (((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+        }
+    }
+
+    for (j = 1; j < 3; j++) {
+        for (i = (j * 16 + 4); i < (j * 16 + 8); i++) {
+            if (nzc[scan8[i + 4]])
+                ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i + 4],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+            else if (((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i + 4],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct_add16_intra_msa(uint8_t *dst,
+                                  const int32_t *blk_offset,
+                                  int16_t *block,
+                                  int32_t dst_stride,
+                                  const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_msa(dst + blk_offset[i],
+                                 block + i * 16 * sizeof(pixel), dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
+                                          block + i * 16 * sizeof(pixel),
+                                          dst_stride);
+    }
+}
+
+void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                  int32_t de_qval)
+{
+    avc_deq_idct_luma_dc_msa(dst, src, de_qval);
+}
diff --git a/libavcodec/mips/h264pred_init_mips.c b/libavcodec/mips/h264pred_init_mips.c
new file mode 100644
index 0000000..63637b8
--- /dev/null
+++ b/libavcodec/mips/h264pred_init_mips.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "h264dsp_mips.h"
+#include "h264pred_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264_pred_init_msa(H264PredContext *h, int codec_id,
+                                       const int bit_depth,
+                                       const int chroma_format_idc)
+{
+    if (8 == bit_depth) {
+        if (chroma_format_idc == 1) {
+            h->pred8x8[VERT_PRED8x8] = ff_h264_intra_pred_vert_8x8_msa;
+            h->pred8x8[HOR_PRED8x8] = ff_h264_intra_pred_horiz_8x8_msa;
+        }
+
+        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {
+            if (chroma_format_idc == 1) {
+                h->pred8x8[PLANE_PRED8x8] = ff_h264_intra_predict_plane_8x8_msa;
+            }
+        }
+        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7
+            && codec_id != AV_CODEC_ID_VP8) {
+            if (chroma_format_idc == 1) {
+                h->pred8x8[DC_PRED8x8] = ff_h264_intra_predict_dc_4blk_8x8_msa;
+                h->pred8x8[LEFT_DC_PRED8x8] =
+                    ff_h264_intra_predict_hor_dc_8x8_msa;
+                h->pred8x8[TOP_DC_PRED8x8] =
+                    ff_h264_intra_predict_vert_dc_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa;
+            }
+        } else {
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+                h->pred8x8[7] = ff_vp8_pred8x8_127_dc_8_msa;
+                h->pred8x8[8] = ff_vp8_pred8x8_129_dc_8_msa;
+            }
+        }
+
+        if (chroma_format_idc == 1) {
+            h->pred8x8[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_8x8_msa;
+        }
+
+        h->pred16x16[DC_PRED8x8] = ff_h264_intra_pred_dc_16x16_msa;
+        h->pred16x16[VERT_PRED8x8] = ff_h264_intra_pred_vert_16x16_msa;
+        h->pred16x16[HOR_PRED8x8] = ff_h264_intra_pred_horiz_16x16_msa;
+
+        switch (codec_id) {
+        case AV_CODEC_ID_SVQ3:
+            ;
+            break;
+        case AV_CODEC_ID_RV40:
+            ;
+            break;
+        case AV_CODEC_ID_VP7:
+        case AV_CODEC_ID_VP8:
+            h->pred16x16[7] = ff_vp8_pred16x16_127_dc_8_msa;
+            h->pred16x16[8] = ff_vp8_pred16x16_129_dc_8_msa;
+            break;
+        default:
+            h->pred16x16[PLANE_PRED8x8] =
+                ff_h264_intra_predict_plane_16x16_msa;
+            break;
+        }
+
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_h264_intra_pred_dc_left_16x16_msa;
+        h->pred16x16[TOP_DC_PRED8x8] = ff_h264_intra_pred_dc_top_16x16_msa;
+        h->pred16x16[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_16x16_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264_pred_init_mmi(H264PredContext *h, int codec_id,
+        const int bit_depth, const int chroma_format_idc)
+{
+    if (bit_depth == 8) {
+        if (chroma_format_idc == 1) {
+            h->pred8x8  [VERT_PRED8x8       ] = ff_pred8x8_vertical_8_mmi;
+            h->pred8x8  [HOR_PRED8x8        ] = ff_pred8x8_horizontal_8_mmi;
+        } else {
+            h->pred8x8  [VERT_PRED8x8       ] = ff_pred8x16_vertical_8_mmi;
+            h->pred8x8  [HOR_PRED8x8        ] = ff_pred8x16_horizontal_8_mmi;
+        }
+
+        h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_8_mmi;
+        h->pred16x16[VERT_PRED8x8           ] = ff_pred16x16_vertical_8_mmi;
+        h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_8_mmi;
+        h->pred8x8l [TOP_DC_PRED            ] = ff_pred8x8l_top_dc_8_mmi;
+        h->pred8x8l [DC_PRED                ] = ff_pred8x8l_dc_8_mmi;
+
+#if ARCH_MIPS64
+        switch (codec_id) {
+        case AV_CODEC_ID_SVQ3:
+            h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_plane_svq3_8_mmi;
+            break;
+        case AV_CODEC_ID_RV40:
+            h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_plane_rv40_8_mmi;
+            break;
+        case AV_CODEC_ID_VP7:
+        case AV_CODEC_ID_VP8:
+            break;
+        default:
+            h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_plane_h264_8_mmi;
+            break;
+        }
+#endif
+
+        if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+            if (chroma_format_idc == 1) {
+                h->pred8x8[TOP_DC_PRED8x8   ] = ff_pred8x8_top_dc_8_mmi;
+                h->pred8x8[DC_PRED8x8       ] = ff_pred8x8_dc_8_mmi;
+            }
+        }
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                                    int bit_depth,
+                                    const int chroma_format_idc)
+{
+#if HAVE_MMI
+    h264_pred_init_mmi(h, codec_id, bit_depth, chroma_format_idc);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    h264_pred_init_msa(h, codec_id, bit_depth, chroma_format_idc);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h264pred_mips.h b/libavcodec/mips/h264pred_mips.h
new file mode 100644
index 0000000..136e291
--- /dev/null
+++ b/libavcodec/mips/h264pred_mips.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264PRED_MIPS_H
+#define AVCODEC_MIPS_H264PRED_MIPS_H
+
+#include "constants.h"
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride);
+void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride);
+void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride);
+void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
+        ptrdiff_t stride);
+void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+
+#endif  /* AVCODEC_MIPS_H264PRED_MIPS_H */
diff --git a/libavcodec/mips/h264pred_mmi.c b/libavcodec/mips/h264pred_mmi.c
new file mode 100644
index 0000000..f4fe091
--- /dev/null
+++ b/libavcodec/mips/h264pred_mmi.c
@@ -0,0 +1,985 @@
+/*
+ * Loongson SIMD optimized h264pred
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264pred_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/mmiutils.h"
+#include "constants.h"
+
+void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[2];
+    uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x08                                    \n\t"
+        MMI_LDC1(%[ftmp0], %[srcA], 0x00)
+        MMI_LDC1(%[ftmp1], %[srcA], 0x08)
+
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp1], %[src], 0x08)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp1], %[src], 0x08)
+
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src]"+&r"(src)
+        : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride))
+        : "memory"
+    );
+}
+
+void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[3];
+    mips_reg addr[2];
+
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "dli        %[tmp2],    0x08                                    \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp1],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x00(%[addr1])                          \n\t"
+        "swl        %[tmp1],    0x0f(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x08(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp1],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x00(%[addr1])                          \n\t"
+        "swl        %[tmp1],    0x0f(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x08(%[addr1])                          \n\t"
+        "daddi      %[tmp2],    %[tmp2],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "bnez       %[tmp2],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[4];
+    mips_reg addr[2];
+
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "xor        %[tmp3],    %[tmp3],        %[tmp3]                 \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+
+        "dli        %[tmp0],    0x08                                    \n\t"
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
+        "2:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x01                    \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x01                    \n\t"
+        "bnez       %[tmp0],    2b                                      \n\t"
+
+        "daddiu     %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsra       %[tmp3],    0x05                                    \n\t"
+        "dmul       %[tmp2],    %[tmp3],        %[ff_pb_1]              \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         $0                      \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "3:                                                             \n\t"
+        "swl        %[tmp2],    0x07(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr0])                          \n\t"
+        "swl        %[tmp2],    0x0f(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x08(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "swl        %[tmp2],    0x07(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr0])                          \n\t"
+        "swl        %[tmp2],    0x0f(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x08(%[addr0])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "bnez       %[tmp0],    3b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    uint32_t dc;
+    double ftmp[11];
+    mips_reg tmp[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_ULDC1(%[ftmp10], %[srcA], 0x00)
+        MMI_ULDC1(%[ftmp9], %[src0], 0x00)
+        MMI_ULDC1(%[ftmp8], %[src1], 0x00)
+
+        "punpcklbh  %[ftmp7],   %[ftmp10],      %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp10],      %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp9],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp9],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp8],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp2],   %[ftmp8],       %[ftmp0]                \n\t"
+        "bnez       %[has_topleft],             1f                      \n\t"
+        "pinsrh_0   %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+
+        "1:                                                             \n\t"
+        "bnez       %[has_topright],            2f                      \n\t"
+        "pinsrh_3   %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+
+        "2:                                                             \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_2]              \n\t"
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_2]              \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ff_pw_2]              \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ff_pw_2]              \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp9],   %[ftmp7],       %[ftmp6]                \n\t"
+        "biadd      %[ftmp10],  %[ftmp9]                                \n\t"
+        "mfc1       %[tmp1],    %[ftmp10]                               \n\t"
+        "addiu      %[tmp1],    %[tmp1],        0x04                    \n\t"
+        "srl        %[tmp1],    %[tmp1],        0x03                    \n\t"
+        "mul        %[dc],      %[tmp1],        %[ff_pb_1]              \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
+          [dc]"=r"(dc)
+        : [srcA]"r"((mips_reg)(src-stride-1)),
+          [src0]"r"((mips_reg)(src-stride)),
+          [src1]"r"((mips_reg)(src-stride+1)),
+          [has_topleft]"r"(has_topleft),    [has_topright]"r"(has_topright),
+          [ff_pb_1]"r"(ff_pb_1),            [ff_pw_2]"f"(ff_pw_2)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [src]"+&r"(src)
+        : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride)
+{
+    uint32_t dc, dc1, dc2;
+    double ftmp[14];
+    mips_reg tmp[1];
+
+    const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2;
+    const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2;
+    const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
+    const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2;
+    const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2;
+    const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2;
+    const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
+    const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
+
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        MMI_ULDC1(%[ftmp4], %[srcA], 0x00)
+        MMI_ULDC1(%[ftmp5], %[src0], 0x00)
+        MMI_ULDC1(%[ftmp6], %[src1], 0x00)
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x03                                    \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp11],  %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp12],  %[ftmp6],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp8],       %[ftmp1]                \n\t"
+        "pshufh     %[ftmp13],  %[ftmp12],      %[ftmp1]                \n\t"
+        "pinsrh_3   %[ftmp8],   %[ftmp8],       %[ftmp13]               \n\t"
+        "pinsrh_3   %[ftmp12],  %[ftmp12],      %[ftmp3]                \n\t"
+        "bnez       %[has_topleft],             1f                      \n\t"
+        "pinsrh_0   %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+
+        "1:                                                             \n\t"
+        "bnez       %[has_topright],            2f                      \n\t"
+        "pshufh     %[ftmp13],  %[ftmp10],      %[ftmp1]                \n\t"
+        "pinsrh_3   %[ftmp8],   %[ftmp8],       %[ftmp13]               \n\t"
+
+        "2:                                                             \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp9],   %[ftmp9],       %[ftmp2]                \n\t"
+        "pmullh     %[ftmp10],  %[ftmp10],      %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp12]               \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp8],   %[ftmp8],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp7],       %[ftmp8]                \n\t"
+        "biadd      %[ftmp4],   %[ftmp5]                                \n\t"
+        "mfc1       %[dc2],     %[ftmp4]                                \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dc2]"=r"(dc2)
+        : [srcA]"r"((mips_reg)(src-stride-1)),
+          [src0]"r"((mips_reg)(src-stride)),
+          [src1]"r"((mips_reg)(src-stride+1)),
+          [has_topleft]"r"(has_topleft),    [has_topright]"r"(has_topright)
+        : "memory"
+    );
+
+    dc1 = l0+l1+l2+l3+l4+l5+l6+l7;
+    dc = ((dc1+dc2+8)>>4)*0x01010101U;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [src]"+&r"(src)
+        : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    double ftmp[12];
+    mips_reg tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp3], %[srcA], 0x00)
+        MMI_LDC1(%[ftmp4], %[src0], 0x00)
+        MMI_LDC1(%[ftmp5], %[src1], 0x00)
+        "punpcklbh  %[ftmp6],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp11],  %[ftmp5],       %[ftmp0]                \n\t"
+        "bnez       %[has_topleft],             1f                      \n\t"
+        "pinsrh_0   %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+
+        "1:                                                             \n\t"
+        "bnez       %[has_topright],            2f                      \n\t"
+        "pinsrh_3   %[ftmp11],  %[ftmp11],      %[ftmp9]                \n\t"
+
+        "2:                                                             \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "pmullh     %[ftmp9],   %[ftmp9],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp4], %[src], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src]"=r"(src)
+        : [srcA]"r"((mips_reg)(src-stride-1)),
+          [src0]"r"((mips_reg)(src-stride)),
+          [src1]"r"((mips_reg)(src-stride+1)),
+          [has_topleft]"r"(has_topleft),    [has_topright]"r"(has_topright)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src]"+&r"(src)
+        : [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
+        ptrdiff_t stride)
+{
+    const int dc = (src[-stride] + src[1-stride] + src[2-stride]
+                 + src[3-stride] + src[-1+0*stride] + src[-1+1*stride]
+                 + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+    uint64_t tmp[2];
+    mips_reg addr[1];
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[tmp0],    %[dc],          $0                      \n\t"
+        "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
+        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [dc]"r"(dc),                      [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[2];
+    mips_reg addr[2];
+
+    __asm__ volatile (
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "ldl        %[tmp0],    0x07(%[addr0])                          \n\t"
+        "ldr        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dli        %[tmp1],    0x04                                    \n\t"
+        "1:                                                             \n\t"
+        "sdl        %[tmp0],    0x07(%[addr1])                          \n\t"
+        "sdr        %[tmp0],    0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr1],   %[stride]                               \n\t"
+        "sdl        %[tmp0],    0x07(%[addr1])                          \n\t"
+        "sdr        %[tmp0],    0x00(%[addr1])                          \n\t"
+        "daddi      %[tmp1],    -0x01                                   \n\t"
+        PTR_ADDU   "%[addr1],   %[stride]                               \n\t"
+        "bnez       %[tmp1],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[3];
+    mips_reg addr[2];
+
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "dli        %[tmp0],    0x04                                    \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[4];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "biadd      %[ftmp2],   %[ftmp2]                                \n\t"
+        "biadd      %[ftmp3],   %[ftmp3]                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "paddush    %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),
+          [src]"+&r"(src)
+        : [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[5];
+    mips_reg addr[7];
+
+    __asm__ volatile (
+        "negu       %[addr0],   %[stride]                               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[src]                  \n\t"
+        PTR_ADDIU  "%[addr1],   %[addr0],       0x04                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   $0,             %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   $0,             %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "dli        %[addr2],  -0x01                                    \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[src]                  \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   $0,             %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr5],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr5],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr5],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   $0,             %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr6],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr6],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr6],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr5]                \n\t"
+        PTR_ADDIU  "%[addr3],   %[addr3],       0x04                    \n\t"
+        PTR_ADDIU  "%[addr4],   %[addr4],       0x02                    \n\t"
+        PTR_ADDIU  "%[addr1],   %[addr6],       0x02                    \n\t"
+        PTR_ADDU   "%[addr2],   %[addr4],       %[addr1]                \n\t"
+        PTR_SRL    "%[addr3],   0x03                                    \n\t"
+        PTR_SRL    "%[addr4],   0x02                                    \n\t"
+        PTR_SRL    "%[addr1],   0x02                                    \n\t"
+        PTR_SRL    "%[addr2],   0x03                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dmtc1      %[addr3],   %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "dmtc1      %[addr4],   %[ftmp2]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "dmtc1      %[addr1],   %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "dmtc1      %[addr2],   %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp3],       %[ftmp4]                \n\t"
+        PTR_ADDU   "%[addr0],   $0,             %[src]                  \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[1];
+    uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        MMI_LDC1(%[ftmp0], %[srcA], 0x00)
+        "dli        %[tmp0],    0x04                                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src]"+&r"(src)
+        : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride))
+        : "memory"
+    );
+}
+
+void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[3];
+    mips_reg addr[2];
+
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
+        const int svq3, const int rv40)
+{
+    double ftmp[11];
+    uint64_t tmp[6];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile(
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
+        "dli        %[tmp0],    0x20                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr0], -0x01)
+        MMI_ULDC1(%[ftmp2], %[addr0],  0x08)
+        "dsrl       %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pmullh     %[ftmp0],   %[ftmp0],       %[ff_pw_m8tom5]         \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_m4tom1]         \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_1to4]           \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5to8]           \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "dli        %[tmp0],    0x0e                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp5],   %[ftmp0],       %[ftmp1]                \n\t"
+
+        PTR_ADDIU  "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        "lbu        %[tmp5],    0x10(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp0]                                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp1]                                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp2]                                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp5],    %[tmp5],        %[tmp0]                 \n\t"
+        "daddiu     %[tmp5],    %[tmp5],        0x01                    \n\t"
+        "dsll       %[tmp5],    %[tmp5],        0x04                    \n\t"
+
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp3]                                \n\t"
+
+        "pmullh     %[ftmp0],   %[ftmp0],       %[ff_pw_m8tom5]         \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_m4tom1]         \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_1to4]           \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5to8]           \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "dli        %[tmp0],    0x0e                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp0],       %[ftmp1]                \n\t"
+
+        "dmfc1      %[tmp0],    %[ftmp5]                                \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "dsra       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "dmfc1      %[tmp1],    %[ftmp6]                                \n\t"
+        "dsll       %[tmp1],    %[tmp1],        0x30                    \n\t"
+        "dsra       %[tmp1],    %[tmp1],        0x30                    \n\t"
+
+        "beqz       %[svq3],    1f                                      \n\t"
+        "dli        %[tmp2],    0x04                                    \n\t"
+        "ddiv       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "ddiv       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "dli        %[tmp2],    0x05                                    \n\t"
+        "dmul       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "dmul       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "dli        %[tmp2],    0x10                                    \n\t"
+        "ddiv       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "ddiv       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "daddu      %[tmp2],    %[tmp0],        $0                      \n\t"
+        "daddu      %[tmp0],    %[tmp1],        $0                      \n\t"
+        "daddu      %[tmp1],    %[tmp2],        $0                      \n\t"
+        "b          2f                                                  \n\t"
+
+        "1:                                                             \n\t"
+        "beqz       %[rv40],    1f                                      \n\t"
+        "dsra       %[tmp2],    %[tmp0],        0x02                    \n\t"
+        "daddu      %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "dsra       %[tmp2],    %[tmp1],        0x02                    \n\t"
+        "daddu      %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "dsra       %[tmp0],    %[tmp0],        0x04                    \n\t"
+        "dsra       %[tmp1],    %[tmp1],        0x04                    \n\t"
+        "b          2f                                                  \n\t"
+
+        "1:                                                             \n\t"
+        "dli        %[tmp2],    0x05                                    \n\t"
+        "dmul       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "dmul       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "daddiu     %[tmp0],    %[tmp0],        0x20                    \n\t"
+        "daddiu     %[tmp1],    %[tmp1],        0x20                    \n\t"
+        "dsra       %[tmp0],    %[tmp0],        0x06                    \n\t"
+        "dsra       %[tmp1],    %[tmp1],        0x06                    \n\t"
+
+        "2:                                                             \n\t"
+        "daddu      %[tmp3],    %[tmp0],        %[tmp1]                 \n\t"
+        "dli        %[tmp2],    0x07                                    \n\t"
+        "dmul       %[tmp3],    %[tmp3],        %[tmp2]                 \n\t"
+        "dsubu      %[tmp5],    %[tmp5],        %[tmp3]                 \n\t"
+
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp0]                                \n\t"
+        "pshufh     %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "dmtc1      %[tmp1],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "dmtc1      %[tmp5],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "dli        %[tmp0],    0x05                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
+        "pmullh     %[ftmp1],   %[ff_pw_0to3],  %[ftmp0]                \n\t"
+        "dmtc1      %[ff_pw_4to7],              %[ftmp2]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "dmtc1      %[ff_pw_8tob],              %[ftmp3]                \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "dmtc1      %[ff_pw_ctof],              %[ftmp4]                \n\t"
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+
+        "dli        %[tmp0],    0x10                                    \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         $0                      \n\t"
+        "1:                                                             \n\t"
+        "paddsh     %[ftmp8],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp9],   %[ftmp9],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp8],       %[ftmp9]                \n\t"
+        MMI_SDC1(%[ftmp0], %[addr0], 0x00)
+
+        "paddsh     %[ftmp8],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp9],   %[ftmp9],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp8],       %[ftmp9]                \n\t"
+        MMI_SDC1(%[ftmp0], %[addr0], 0x08)
+
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "daddiu     %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0])
+        : [src]"r"(src),                    [stride]"r"((mips_reg)stride),
+          [svq3]"r"(svq3),                  [rv40]"r"(rv40),
+          [ff_pw_m8tom5]"f"(ff_pw_m8tom5),  [ff_pw_m4tom1]"f"(ff_pw_m4tom1),
+          [ff_pw_1to4]"f"(ff_pw_1to4),      [ff_pw_5to8]"f"(ff_pw_5to8),
+          [ff_pw_0to3]"f"(ff_pw_0to3),      [ff_pw_4to7]"r"(ff_pw_4to7),
+          [ff_pw_8tob]"r"(ff_pw_8tob),      [ff_pw_ctof]"r"(ff_pw_ctof)
+        : "memory"
+    );
+}
+
+void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    pred16x16_plane_compat_mmi(src, stride, 0, 0);
+}
+
+void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    pred16x16_plane_compat_mmi(src, stride, 1, 0);
+}
+
+void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    pred16x16_plane_compat_mmi(src, stride, 0, 1);
+}
diff --git a/libavcodec/mips/h264pred_msa.c b/libavcodec/mips/h264pred_msa.c
new file mode 100644
index 0000000..b9990c1
--- /dev/null
+++ b/libavcodec/mips/h264pred_msa.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride)
+{
+    uint64_t out = LD(src);
+
+    SD4(out, out, out, out, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 out = LD_UB(src);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
+                                        uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    out0 = src[0 * src_stride] * 0x0101010101010101;
+    out1 = src[1 * src_stride] * 0x0101010101010101;
+    out2 = src[2 * src_stride] * 0x0101010101010101;
+    out3 = src[3 * src_stride] * 0x0101010101010101;
+    out4 = src[4 * src_stride] * 0x0101010101010101;
+    out5 = src[5 * src_stride] * 0x0101010101010101;
+    out6 = src[6 * src_stride] * 0x0101010101010101;
+    out7 = src[7 * src_stride] * 0x0101010101010101;
+
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    uint8_t inp0, inp1, inp2, inp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    inp0 = src[0 * src_stride];
+    inp1 = src[1 * src_stride];
+    inp2 = src[2 * src_stride];
+    inp3 = src[3 * src_stride];
+    src0 = (v16u8) __msa_fill_b(inp0);
+    src1 = (v16u8) __msa_fill_b(inp1);
+    src2 = (v16u8) __msa_fill_b(inp2);
+    src3 = (v16u8) __msa_fill_b(inp3);
+    inp0 = src[4 * src_stride];
+    inp1 = src[5 * src_stride];
+    inp2 = src[6 * src_stride];
+    inp3 = src[7 * src_stride];
+    src4 = (v16u8) __msa_fill_b(inp0);
+    src5 = (v16u8) __msa_fill_b(inp1);
+    src6 = (v16u8) __msa_fill_b(inp2);
+    src7 = (v16u8) __msa_fill_b(inp3);
+    inp0 = src[ 8 * src_stride];
+    inp1 = src[ 9 * src_stride];
+    inp2 = src[10 * src_stride];
+    inp3 = src[11 * src_stride];
+    src8 = (v16u8) __msa_fill_b(inp0);
+    src9 = (v16u8) __msa_fill_b(inp1);
+    src10 = (v16u8) __msa_fill_b(inp2);
+    src11 = (v16u8) __msa_fill_b(inp3);
+    inp0 = src[12 * src_stride];
+    inp1 = src[13 * src_stride];
+    inp2 = src[14 * src_stride];
+    inp3 = src[15 * src_stride];
+    src12 = (v16u8) __msa_fill_b(inp0);
+    src13 = (v16u8) __msa_fill_b(inp1);
+    src14 = (v16u8) __msa_fill_b(inp2);
+    src15 = (v16u8) __msa_fill_b(inp3);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+           dst, dst_stride);
+}
+
+#define INTRA_PREDICT_VALDC_8X8_MSA(val)                                       \
+static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, int32_t dst_stride)  \
+{                                                                              \
+    v16i8 store = __msa_fill_b(val);                                           \
+    uint64_t out = __msa_copy_u_d((v2i64) store, 0);                           \
+                                                                               \
+    SD4(out, out, out, out, dst, dst_stride);                                  \
+    dst += (4 * dst_stride);                                                   \
+    SD4(out, out, out, out, dst, dst_stride);                                  \
+}
+
+INTRA_PREDICT_VALDC_8X8_MSA(127);
+INTRA_PREDICT_VALDC_8X8_MSA(129);
+
+#define INTRA_PREDICT_VALDC_16X16_MSA(val)                            \
+static void intra_predict_##val##dc_16x16_msa(uint8_t *dst,           \
+                                              int32_t dst_stride)     \
+{                                                                     \
+    v16u8 out = (v16u8) __msa_fill_b(val);                            \
+                                                                      \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+    dst += (8 * dst_stride);                                          \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+}
+
+INTRA_PREDICT_VALDC_16X16_MSA(127);
+INTRA_PREDICT_VALDC_16X16_MSA(129);
+
+static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lpcnt;
+    int32_t res, res0, res1, res2, res3;
+    uint64_t out0, out1;
+    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
+    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
+    v4i32 int_multiplier = { 0, 1, 2, 3 };
+    v16u8 src_top;
+    v8i16 vec9, vec10, vec11;
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
+    v2i64 sum;
+
+    src_top = LD_UB(src - (stride + 1));
+    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
+
+    vec9 = __msa_hsub_u_h(src_top, src_top);
+    vec9 *= short_multiplier;
+    vec8 = __msa_hadd_s_w(vec9, vec9);
+    sum = __msa_hadd_s_d(vec8, vec8);
+
+    res0 = __msa_copy_s_w((v4i32) sum, 0);
+
+    res1 = (src[4 * stride - 1] - src[2 * stride - 1]) +
+        2 * (src[5 * stride - 1] - src[stride - 1]) +
+        3 * (src[6 * stride - 1] - src[-1]) +
+        4 * (src[7 * stride - 1] - src[-stride - 1]);
+
+    res0 *= 17;
+    res1 *= 17;
+    res0 = (res0 + 16) >> 5;
+    res1 = (res1 + 16) >> 5;
+
+    res3 = 3 * (res0 + res1);
+    res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1);
+    res = res2 - res3;
+
+    vec8 = __msa_fill_w(res0);
+    vec4 = __msa_fill_w(res);
+    vec2 = __msa_fill_w(res1);
+    vec5 = vec8 * int_multiplier;
+    vec3 = vec8 * 4;
+
+    for (lpcnt = 4; lpcnt--;) {
+        vec0 = vec5;
+        vec0 += vec4;
+        vec1 = vec0 + vec3;
+        vec6 = vec5;
+        vec4 += vec2;
+        vec6 += vec4;
+        vec7 = vec6 + vec3;
+
+        SRA_4V(vec0, vec1, vec6, vec7, 5);
+        PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11);
+        CLIP_SH2_0_255(vec10, vec11);
+        PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11);
+
+        out0 = __msa_copy_s_d((v2i64) vec10, 0);
+        out1 = __msa_copy_s_d((v2i64) vec11, 0);
+        SD(out0, src);
+        src += stride;
+        SD(out1, src);
+        src += stride;
+
+        vec4 += vec2;
+    }
+}
+
+static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lpcnt;
+    int32_t res0, res1, res2, res3;
+    uint64_t load0, load1;
+    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
+    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v4i32 int_multiplier = { 0, 1, 2, 3 };
+    v16u8 src_top = { 0 };
+    v16u8 store0, store1;
+    v8i16 vec9, vec10, vec11, vec12;
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
+    v4i32 reg0, reg1, reg2, reg3;
+
+    load0 = LD(src - (stride + 1));
+    load1 = LD(src - (stride + 1) + 9);
+
+    INSERT_D2_UB(load0, load1, src_top);
+
+    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
+
+    vec9 = __msa_hsub_u_h(src_top, src_top);
+    vec9 *= short_multiplier;
+    vec8 = __msa_hadd_s_w(vec9, vec9);
+    res_add = (v4i32) __msa_hadd_s_d(vec8, vec8);
+
+    res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2);
+
+    res1 = (src[8 * stride - 1] - src[6 * stride - 1]) +
+        2 * (src[9 * stride - 1] - src[5 * stride - 1]) +
+        3 * (src[10 * stride - 1] - src[4 * stride - 1]) +
+        4 * (src[11 * stride - 1] - src[3 * stride - 1]) +
+        5 * (src[12 * stride - 1] - src[2 * stride - 1]) +
+        6 * (src[13 * stride - 1] - src[stride - 1]) +
+        7 * (src[14 * stride - 1] - src[-1]) +
+        8 * (src[15 * stride - 1] - src[-1 * stride - 1]);
+
+    res0 *= 5;
+    res1 *= 5;
+    res0 = (res0 + 32) >> 6;
+    res1 = (res1 + 32) >> 6;
+
+    res3 = 7 * (res0 + res1);
+    res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1);
+    res2 -= res3;
+
+    vec8 = __msa_fill_w(res0);
+    vec4 = __msa_fill_w(res2);
+    vec5 = __msa_fill_w(res1);
+    vec6 = vec8 * 4;
+    vec7 = vec8 * int_multiplier;
+
+    for (lpcnt = 8; lpcnt--;) {
+        vec0 = vec7;
+        reg0 = vec7;
+        vec0 += vec4;
+        vec4 += vec5;
+        reg0 += vec4;
+        vec1 = vec0 + vec6;
+        reg1 = reg0 + vec6;
+        vec2 = vec1 + vec6;
+        reg2 = reg1 + vec6;
+        vec3 = vec2 + vec6;
+        reg3 = reg2 + vec6;
+
+        SRA_4V(vec0, vec1, vec2, vec3, 5);
+        SRA_4V(reg0, reg1, reg2, reg3, 5);
+        PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10);
+        PCKEV_H2_SH(reg1, reg0, reg3, reg2, vec11, vec12);
+        CLIP_SH2_0_255(vec9, vec10);
+        CLIP_SH2_0_255(vec11, vec12);
+        PCKEV_B2_UB(vec10, vec9, vec12, vec11, store0, store1);
+        ST_UB2(store0, store1, src, stride);
+        src += 2 * stride;
+
+        vec4 += vec5;
+    }
+}
+
+static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t src0, src1, src3, src2;
+    uint32_t out0, out1, out2, out3;
+    uint64_t store0, store1;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h((v16u8) src_top, (v16u8) src_top);
+    sum = __msa_hadd_u_w(add, add);
+    src0 = __msa_copy_u_w((v4i32) sum, 0);
+    src1 = __msa_copy_u_w((v4i32) sum, 1);
+    src0 += src[0 * stride - 1];
+    src0 += src[1 * stride - 1];
+    src0 += src[2 * stride - 1];
+    src0 += src[3 * stride - 1];
+    src2  = src[4 * stride - 1];
+    src2 += src[5 * stride - 1];
+    src2 += src[6 * stride - 1];
+    src2 += src[7 * stride - 1];
+    src0 = (src0 + 4) >> 3;
+    src3 = (src1 + src2 + 4) >> 3;
+    src1 = (src1 + 2) >> 2;
+    src2 = (src2 + 2) >> 2;
+    out0 = src0 * 0x01010101;
+    out1 = src1 * 0x01010101;
+    out2 = src2 * 0x01010101;
+    out3 = src3 * 0x01010101;
+    store0 = ((uint64_t) out1 << 32) | out0;
+    store1 = ((uint64_t) out3 << 32) | out2;
+
+    SD4(store0, store0, store0, store0, src, stride);
+    src += (4 * stride);
+    SD4(store1, store1, store1, store1, src, stride);
+}
+
+static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t src0, src1;
+    uint64_t out0, out1;
+
+    src0  = src[0 * stride - 1];
+    src0 += src[1 * stride - 1];
+    src0 += src[2 * stride - 1];
+    src0 += src[3 * stride - 1];
+    src1  = src[4 * stride - 1];
+    src1 += src[5 * stride - 1];
+    src1 += src[6 * stride - 1];
+    src1 += src[7 * stride - 1];
+    src0 = (src0 + 2) >> 2;
+    src1 = (src1 + 2) >> 2;
+    out0 = src0 * 0x0101010101010101;
+    out1 = src1 * 0x0101010101010101;
+
+    SD4(out0, out0, out0, out0, src, stride);
+    src += (4 * stride);
+    SD4(out1, out1, out1, out1, src, stride);
+}
+
+static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint64_t out0;
+    v16i8 mask = { 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 src_top, res0;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h(src_top, src_top);
+    sum = __msa_hadd_u_w(add, add);
+    sum = (v4u32) __msa_srari_w((v4i32) sum, 2);
+    res0 = (v16u8) __msa_vshf_b(mask, (v16i8) sum, (v16i8) sum);
+    out0 = __msa_copy_u_d((v2i64) res0, 0);
+
+    SD4(out0, out0, out0, out0, src, stride);
+    src += (4 * stride);
+    SD4(out0, out0, out0, out0, src, stride);
+}
+
+static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t src0, src1, src2;
+    uint32_t out0, out1, out2;
+    uint64_t store0, store1;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h(src_top, src_top);
+    sum = __msa_hadd_u_w(add, add);
+    src0 = __msa_copy_u_w((v4i32) sum, 0);
+    src1 = __msa_copy_u_w((v4i32) sum, 1);
+
+    src2  = src[0 * stride - 1];
+    src2 += src[1 * stride - 1];
+    src2 += src[2 * stride - 1];
+    src2 += src[3 * stride - 1];
+    src2 = (src0 + src2 + 4) >> 3;
+    src0 = (src0 + 2) >> 2;
+    src1 = (src1 + 2) >> 2;
+    out0 = src0 * 0x01010101;
+    out1 = src1 * 0x01010101;
+    out2 = src2 * 0x01010101;
+    store1 = ((uint64_t) out1 << 32);
+    store0 = store1 | ((uint64_t) out2);
+    store1 = store1 | ((uint64_t) out0);
+
+    SD4(store0, store0, store0, store0, src, stride);
+    src += (4 * stride);
+    SD4(store1, store1, store1, store1, src, stride);
+}
+
+static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t src0, src1, src2, src3;
+    uint32_t out0, out1, out2, out3;
+    uint64_t store0, store1;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h(src_top, src_top);
+    sum = __msa_hadd_u_w(add, add);
+    src0 = __msa_copy_u_w((v4i32) sum, 0);
+    src1 = __msa_copy_u_w((v4i32) sum, 1);
+
+    src2  = src[4 * stride - 1];
+    src2 += src[5 * stride - 1];
+    src2 += src[6 * stride - 1];
+    src2 += src[7 * stride - 1];
+    src0 = (src0 + 2) >> 2;
+    src3 = (src1 + src2 + 4) >> 3;
+    src1 = (src1 + 2) >> 2;
+    src2 = (src2 + 2) >> 2;
+
+    out0 = src0 * 0x01010101;
+    out1 = src1 * 0x01010101;
+    out2 = src2 * 0x01010101;
+    out3 = src3 * 0x01010101;
+    store0 = ((uint64_t) out1 << 32) | out0;
+    store1 = ((uint64_t) out3 << 32) | out2;
+
+    SD4(store0, store0, store0, store0, src, stride);
+    src += (4 * stride);
+    SD4(store1, store1, store1, store1, src, stride);
+}
+
+static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t src0;
+    uint64_t out0, out1;
+
+    src0  = src[0 * stride - 1];
+    src0 += src[1 * stride - 1];
+    src0 += src[2 * stride - 1];
+    src0 += src[3 * stride - 1];
+    src0 = (src0 + 2) >> 2;
+    out0 = src0 * 0x0101010101010101;
+    out1 = 0x8080808080808080;
+
+    SD4(out0, out0, out0, out0, src, stride);
+    src += (4 * stride);
+    SD4(out1, out1, out1, out1, src, stride);
+}
+
+static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t src0;
+    uint64_t out0, out1;
+
+    src0  = src[4 * stride - 1];
+    src0 += src[5 * stride - 1];
+    src0 += src[6 * stride - 1];
+    src0 += src[7 * stride - 1];
+    src0 = (src0 + 2) >> 2;
+
+    out0 = 0x8080808080808080;
+    out1 = src0 * 0x0101010101010101;
+
+    SD4(out0, out0, out0, out0, src, stride);
+    src += (4 * stride);
+    SD4(out1, out1, out1, out1, src, stride);
+}
+
+void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_plane_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_dc_4blk_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_hor_dc_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_vert_dc_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_l0t_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_0lt_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_l00_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_0l0_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_plane_16x16_msa(src, stride);
+}
+
+void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_vert_8x8_msa(src - stride, dst, stride);
+}
+
+void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_horiz_8x8_msa(src - 1, stride, dst, stride);
+}
+
+void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+    uint32_t addition = 0;
+    v16u8 src_above, out;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    src_above = LD_UB(src_top);
+
+    sum_above = __msa_hadd_u_h(src_above, src_above);
+    sum_top = __msa_hadd_u_w(sum_above, sum_above);
+    sum = __msa_hadd_u_d(sum_top, sum_top);
+    sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = __msa_hadd_u_d(sum_top, sum_top);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    addition += src_left[ 0 * stride];
+    addition += src_left[ 1 * stride];
+    addition += src_left[ 2 * stride];
+    addition += src_left[ 3 * stride];
+    addition += src_left[ 4 * stride];
+    addition += src_left[ 5 * stride];
+    addition += src_left[ 6 * stride];
+    addition += src_left[ 7 * stride];
+    addition += src_left[ 8 * stride];
+    addition += src_left[ 9 * stride];
+    addition += src_left[10 * stride];
+    addition += src_left[11 * stride];
+    addition += src_left[12 * stride];
+    addition += src_left[13 * stride];
+    addition += src_left[14 * stride];
+    addition += src_left[15 * stride];
+    addition = (addition + 16) >> 5;
+    out = (v16u8) __msa_fill_b(addition);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
+    dst += (8 * stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
+}
+
+void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_vert_16x16_msa(src - stride, dst, stride);
+}
+
+void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_horiz_16x16_msa(src - 1, stride, dst, stride);
+}
+
+void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+    uint32_t addition;
+    v16u8 out;
+
+    addition  = src_left[ 0 * stride];
+    addition += src_left[ 1 * stride];
+    addition += src_left[ 2 * stride];
+    addition += src_left[ 3 * stride];
+    addition += src_left[ 4 * stride];
+    addition += src_left[ 5 * stride];
+    addition += src_left[ 6 * stride];
+    addition += src_left[ 7 * stride];
+    addition += src_left[ 8 * stride];
+    addition += src_left[ 9 * stride];
+    addition += src_left[10 * stride];
+    addition += src_left[11 * stride];
+    addition += src_left[12 * stride];
+    addition += src_left[13 * stride];
+    addition += src_left[14 * stride];
+    addition += src_left[15 * stride];
+
+    addition = (addition + 8) >> 4;
+    out = (v16u8) __msa_fill_b(addition);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
+    dst += (8 * stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
+}
+
+void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *dst = src;
+    v16u8 src_above, out;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    src_above = LD_UB(src_top);
+
+    sum_above = __msa_hadd_u_h(src_above, src_above);
+    sum_top = __msa_hadd_u_w(sum_above, sum_above);
+    sum = __msa_hadd_u_d(sum_top, sum_top);
+    sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = __msa_hadd_u_d(sum_top, sum_top);
+    sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
+    out = (v16u8) __msa_splati_b((v16i8) sum, 0);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
+    dst += (8 * stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
+}
+
+void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t out;
+    v16u8 store;
+
+    store = (v16u8) __msa_fill_b(128);
+    out = __msa_copy_u_d((v2i64) store, 0);
+
+    SD4(out, out, out, out, src, stride);
+    src += (4 * stride);
+    SD4(out, out, out, out, src, stride);
+}
+
+void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    v16u8 out;
+
+    out = (v16u8) __msa_fill_b(128);
+
+    ST_UB8(out, out, out, out, out, out, out, out, src, stride);
+    src += (8 * stride);
+    ST_UB8(out, out, out, out, out, out, out, out, src, stride);
+}
+
+void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_127dc_8x8_msa(src, stride);
+}
+
+void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_129dc_8x8_msa(src, stride);
+}
+
+void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_127dc_16x16_msa(src, stride);
+}
+
+void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_129dc_16x16_msa(src, stride);
+}
diff --git a/libavcodec/mips/h264qpel_init_mips.c b/libavcodec/mips/h264qpel_init_mips.c
new file mode 100644
index 0000000..33bae30
--- /dev/null
+++ b/libavcodec/mips/h264qpel_init_mips.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264qpel_init_msa(H264QpelContext *c, int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_msa;
+        c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_msa;
+        c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_msa;
+        c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_msa;
+        c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_msa;
+        c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_msa;
+        c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_msa;
+        c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_msa;
+        c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_msa;
+        c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_msa;
+        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_msa;
+        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_msa;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_msa;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_msa;
+        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_msa;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_msa;
+
+        c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_msa;
+        c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_msa;
+        c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_msa;
+        c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_msa;
+        c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_msa;
+        c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_msa;
+        c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_msa;
+        c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_msa;
+        c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_msa;
+        c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_msa;
+        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_msa;
+        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_msa;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_msa;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_msa;
+        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_msa;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_msa;
+
+        c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_msa;
+        c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_msa;
+        c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_msa;
+        c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_msa;
+        c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_msa;
+        c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_msa;
+        c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_msa;
+        c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_msa;
+        c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_msa;
+        c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_msa;
+        c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_msa;
+        c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_msa;
+        c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_msa;
+        c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_msa;
+        c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_msa;
+
+        c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_msa;
+        c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_msa;
+        c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_msa;
+        c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_msa;
+        c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_msa;
+        c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_msa;
+        c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_msa;
+        c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_msa;
+        c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_msa;
+        c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_msa;
+        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_msa;
+        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_msa;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_msa;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_msa;
+        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_msa;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_msa;
+
+        c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_msa;
+        c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_msa;
+        c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_msa;
+        c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_msa;
+        c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_msa;
+        c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_msa;
+        c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_msa;
+        c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_msa;
+        c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_msa;
+        c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_msa;
+        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_msa;
+        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_msa;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_msa;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_msa;
+        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_msa;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_msa;
+
+        c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_msa;
+        c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_msa;
+        c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_msa;
+        c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_msa;
+        c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_msa;
+        c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_msa;
+        c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_msa;
+        c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_msa;
+        c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_msa;
+        c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_msa;
+        c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_msa;
+        c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_msa;
+        c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_msa;
+        c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_msa;
+        c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_msa;
+        c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264qpel_init_mmi(H264QpelContext *c, int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_mmi;
+
+        c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_mmi;
+
+        c->put_h264_qpel_pixels_tab[2][0] = ff_put_h264_qpel4_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_mmi;
+
+        c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_mmi;
+
+        c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_mmi;
+
+        c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_MMI
+    h264qpel_init_mmi(c, bit_depth);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    h264qpel_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h264qpel_mmi.c b/libavcodec/mips/h264qpel_mmi.c
new file mode 100644
index 0000000..13fbebf
--- /dev/null
+++ b/libavcodec/mips/h264qpel_mmi.c
@@ -0,0 +1,3134 @@
+/*
+ * Loongson SIMD optimized h264qpel
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+#include "hpeldsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/mmiutils.h"
+
+static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{
+    double ftmp[1];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp0], %[src], 0x00)
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        "addi       %[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          RESTRICT_ASM_LOW32
+          [h]"+&r"(h)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride)
+        : "memory"
+    );
+}
+
+static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{
+    double ftmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        "addi       %[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride)
+        : "memory"
+    );
+}
+
+static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{
+    double ftmp[1];
+    uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[src], 0x00)
+        "ldl        %[tmp0],    0x0f(%[src])                            \n\t"
+        "ldr        %[tmp0],    0x08(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        "sdl        %[tmp0],    0x0f(%[dst])                            \n\t"
+        "sdr        %[tmp0],    0x08(%[dst])                            \n\t"
+        "addi       %[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride)
+        : "memory"
+    );
+}
+
+#define op2_avg(a, b)  a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
+#define op2_put(a, b)  a = CLIP(((b) + 512)>>10)
+static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[10];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x04                                    \n\t"
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
+
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
+        "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
+        "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp9], %[dst],  0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[11];
+    uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], -0x02)
+        MMI_ULDC1(%[ftmp2], %[src], -0x01)
+        MMI_ULDC1(%[ftmp3], %[src],  0x00)
+        MMI_ULDC1(%[ftmp4], %[src],  0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
+        MMI_ULDC1(%[ftmp6], %[src],  0x03)
+        "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
+        "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp9], %[dst],  0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[11];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x04                                    \n\t"
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
+        "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
+        "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
+        MMI_LWC1(%[ftmp10], %[dst],  0x00)
+        "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
+        MMI_SWC1(%[ftmp9], %[dst],  0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[11];
+    uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], -0x02)
+        MMI_ULDC1(%[ftmp2], %[src], -0x01)
+        MMI_ULDC1(%[ftmp3], %[src],  0x00)
+        MMI_ULDC1(%[ftmp4], %[src],  0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
+        MMI_ULDC1(%[ftmp6], %[src],  0x03)
+        "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
+        "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_LDC1(%[ftmp10], %[dst], 0x00)
+        "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
+        MMI_SDC1(%[ftmp9], %[dst], 0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[12];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    src -= 2 * srcStride;
+
+    __asm__ volatile (
+        ".set       push                                                \n\t"
+        ".set       noreorder                                           \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
+        "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "dli        %[tmp0],    0x05                                    \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
+        "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp3], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp4], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp5], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        MMI_LWC1(%[ftmp6], %[src], 0x00)
+        "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
+        "paddh      %[ftmp7],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
+        "paddh      %[ftmp7],   %[ftmp5],       %[ftmp6]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        MMI_LWC1(%[ftmp3], %[src], 0x00)
+        "paddh      %[ftmp7],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        ".set       pop                                                 \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int w = 2;
+    int h = 8;
+    double ftmp[10];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    src -= 2 * srcStride;
+
+    while (w--) {
+        __asm__ volatile (
+            ".set       push                                            \n\t"
+            ".set       noreorder                                       \n\t"
+            "dli        %[tmp0],    0x02                                \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "dli        %[tmp0],    0x05                                \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3] ,  %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "bne        %[h],       0x10,           2f                  \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "2:                                                         \n\t"
+            ".set       pop                                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
+              [src]"+&r"(src),              [dst]"+&r"(dst),
+              [h]"+&r"(h)
+            : [dstStride]"r"((mips_reg)dstStride),
+              [srcStride]"r"((mips_reg)srcStride),
+              [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+            : "memory"
+        );
+
+        src += 4 - (h + 5) * srcStride;
+        dst += 4 - h * dstStride;
+    }
+}
+
+static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[10];
+    uint64_t tmp[1];
+
+    src -= 2 * srcStride;
+
+    __asm__ volatile (
+        ".set       push                                                \n\t"
+        ".set       noreorder                                           \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "dli        %[tmp0],    0x05                                    \n\t"
+        MMI_LWC1(%[ftmp0], %[src], 0x00)
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp3], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        MMI_LWC1(%[ftmp4], %[src], 0x00)
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        MMI_LWC1(%[ftmp5], %[src], 0x00)
+        "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        MMI_LWC1(%[ftmp0], %[dst], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        MMI_LWC1(%[ftmp0], %[src], 0x00)
+        "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        MMI_LWC1(%[ftmp1], %[dst], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
+        "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        MMI_LWC1(%[ftmp2], %[dst], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
+        "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        MMI_LWC1(%[ftmp3], %[dst], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        ".set       pop                                                 \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src),              [dst]"+&r"(dst)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int w = 2;
+    int h = 8;
+    double ftmp[10];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    src -= 2 * srcStride;
+
+    while (w--) {
+        __asm__ volatile (
+            ".set       push                                            \n\t"
+            ".set       noreorder                                       \n\t"
+            "dli        %[tmp0],    0x02                                \n\t"
+            "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
+            "dli        %[tmp0],    0x05                                \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp0], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp1], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp4], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp5], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp0], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp1], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            "bne        %[h],       0x10,           2f                  \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp4], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp5], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp0], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp1], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "2:                                                         \n\t"
+            ".set       pop                                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
+              [src]"+&r"(src),              [dst]"+&r"(dst),
+              [h]"+&r"(h)
+            : [dstStride]"r"((mips_reg)dstStride),
+              [srcStride]"r"((mips_reg)srcStride),
+              [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+            : "memory"
+        );
+
+        src += 4 - (h + 5) * srcStride;
+        dst += 4 - h * dstStride;
+    }
+}
+
+static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    INIT_CLIP
+    int i;
+    int16_t _tmp[36];
+    int16_t *tmp = _tmp;
+    double ftmp[10];
+    uint64_t tmp0;
+    DECLARE_VAR_LOW32;
+
+    src -= 2*srcStride;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x09                                    \n\t"
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        MMI_SDC1(%[ftmp9], %[tmp], 0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp0),
+          RESTRICT_ASM_LOW32
+          [tmp]"+&r"(tmp),                  [src]"+&r"(src)
+        : [tmpStride]"r"(8),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
+        : "memory"
+    );
+
+    tmp -= 28;
+
+    for (i=0; i<4; i++) {
+        const int16_t tmpB= tmp[-8];
+        const int16_t tmpA= tmp[-4];
+        const int16_t tmp0= tmp[ 0];
+        const int16_t tmp1= tmp[ 4];
+        const int16_t tmp2= tmp[ 8];
+        const int16_t tmp3= tmp[12];
+        const int16_t tmp4= tmp[16];
+        const int16_t tmp5= tmp[20];
+        const int16_t tmp6= tmp[24];
+        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        dst++;
+        tmp++;
+    }
+}
+
+static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
+        const uint8_t *src, ptrdiff_t tmpStride, ptrdiff_t srcStride, int size)
+{
+    int w = (size + 8) >> 2;
+    double ftmp[11];
+    uint64_t tmp0;
+    DECLARE_VAR_LOW32;
+
+    src -= 2 * srcStride + 2;
+
+    while (w--) {
+        __asm__ volatile (
+            "dli        %[tmp0],    0x02                                \n\t"
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
+            "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            MMI_ULWC1(%[ftmp4], %[src], 0x00)
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            MMI_ULWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x00)
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x30)
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x60)
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x90)
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0xc0)
+            MMI_ULWC1(%[ftmp4], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0xf0)
+            MMI_ULWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x120)
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x150)
+            "bne        %[size],    0x10,           2f                  \n\t"
+
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x180)
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x1b0)
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x1e0)
+            MMI_ULWC1(%[ftmp4], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x210)
+            MMI_ULWC1(%[ftmp5], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x240)
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x270)
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x2a0)
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x2d0)
+            "2:                                                         \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [ftmp10]"=&f"(ftmp[10]),
+              [tmp0]"=&r"(tmp0),
+              RESTRICT_ASM_LOW32
+              [src]"+&r"(src)
+            : [tmp]"r"(tmp),                [size]"r"(size),
+              [srcStride]"r"((mips_reg)srcStride),
+              [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+            : "memory"
+        );
+
+        tmp += 4;
+        src += 4 - (size + 5) * srcStride;
+    }
+}
+
+static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
+        int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
+{
+    int w = size >> 4;
+    double ftmp[10];
+    uint64_t tmp0;
+    DECLARE_VAR_ALL64;
+
+    do {
+        int h = size;
+
+        __asm__ volatile (
+            "dli        %[tmp0],    0x02                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
+            "1:                                                         \n\t"
+            MMI_LDC1(%[ftmp0], %[tmp], 0x00)
+            MMI_LDC1(%[ftmp3], %[tmp], 0x08)
+            MMI_LDC1(%[ftmp6], %[tmp], 0x10)
+            MMI_ULDC1(%[ftmp1], %[tmp], 0x02)
+            MMI_ULDC1(%[ftmp4], %[tmp], 0x0a)
+            MMI_ULDC1(%[ftmp5], %[tmp], 0x12)
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"
+            MMI_ULDC1(%[ftmp2], %[tmp], 0x04)
+            MMI_ULDC1(%[ftmp6], %[tmp], 0x06)
+            MMI_ULDC1(%[ftmp5], %[tmp], 0x0c)
+            MMI_ULDC1(%[ftmp7], %[tmp], 0x0e)
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp3] ,  %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            PTR_ADDIU  "%[tmp],     %[tmp],         0x30                \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp0),
+              RESTRICT_ASM_ALL64
+              [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
+              [h]"+&r"(h)
+            : [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+
+        tmp += 8 - size * 24;
+        dst += 8 - size * dstStride;
+    } while (w--);
+}
+
+static void put_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride, int size)
+{
+    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size);
+    put_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size);
+}
+
+static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{
+    put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
+            srcStride, 8);
+}
+
+static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{
+    put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
+            srcStride, 16);
+}
+
+static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    int h = 8;
+    double ftmp[9];
+    uint64_t tmp[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
+        "dli        %[tmp0],    0x05                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], 0x00)
+        MMI_ULDC1(%[ftmp3], %[src], 0x01)
+        "punpckhbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psllh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        MMI_ULDC1(%[ftmp3], %[src], -0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
+        "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
+        MMI_ULWC1(%[ftmp3], %[src], -0x02)
+        MMI_ULWC1(%[ftmp6], %[src], 0x07)
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp5], %[src2],  0x00)
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[h],       %[h],           -0x01                   \n\t"
+        MMI_SDC1(%[ftmp1], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
+        "bgtz       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [src]"+&r"(src),                  [dst]"+&r"(dst),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [src2Stride]"r"((mips_reg)src2Stride),
+          [dstStride]"r"((mips_reg)dstStride),
+          [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
+{
+    double ftmp[7];
+    uint64_t tmp0;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    do {
+        __asm__ volatile (
+            "dli        %[tmp0],    0x05                                \n\t"
+            MMI_ULDC1(%[ftmp0], %[src16], 0x00)
+            "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
+            MMI_ULDC1(%[ftmp1], %[src16], 0x08)
+            MMI_ULDC1(%[ftmp2], %[src16], 0x30)
+            MMI_ULDC1(%[ftmp3], %[src16], 0x38)
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            MMI_LDC1(%[ftmp5], %[src8], 0x00)
+            MMI_LDXC1(%[ftmp4], %[src8], %[src8Stride], 0x00)
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            MMI_SDXC1(%[ftmp2], %[dst], %[dstStride], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),
+              RESTRICT_ASM_ALL64
+              RESTRICT_ASM_ADDRT
+              [tmp0]"=&r"(tmp0)
+            : [src8]"r"(src8),              [src16]"r"(src16),
+              [dst]"r"(dst),
+              [src8Stride]"r"((mips_reg)src8Stride),
+              [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+
+        src8  += 2 * src8Stride;
+        src16 += 48;
+        dst   += 2 * dstStride;
+    } while (h -= 2);
+}
+
+static void put_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride);
+    put_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride,
+            src2Stride);
+
+    src += 8 * dstStride;
+    dst += 8 * dstStride;
+    src2 += 8 * src2Stride;
+
+    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride);
+    put_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride,
+            src2Stride);
+}
+
+static void put_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
+{
+    put_pixels8_l2_shift5_mmi(dst, src16, src8, dstStride, src8Stride, h);
+    put_pixels8_l2_shift5_mmi(dst + 8, src16 + 8, src8 + 8, dstStride,
+            src8Stride, h);
+}
+
+static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    INIT_CLIP
+    int i;
+    int16_t _tmp[36];
+    int16_t *tmp = _tmp;
+    double ftmp[10];
+    uint64_t tmp0;
+    DECLARE_VAR_LOW32;
+
+    src -= 2*srcStride;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x09                                    \n\t"
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        MMI_SDC1(%[ftmp9], %[tmp], 0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp0),
+          RESTRICT_ASM_LOW32
+          [tmp]"+&r"(tmp),                  [src]"+&r"(src)
+        : [tmpStride]"r"(8),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
+        : "memory"
+    );
+
+    tmp -= 28;
+
+    for (i=0; i<4; i++) {
+        const int16_t tmpB= tmp[-8];
+        const int16_t tmpA= tmp[-4];
+        const int16_t tmp0= tmp[ 0];
+        const int16_t tmp1= tmp[ 4];
+        const int16_t tmp2= tmp[ 8];
+        const int16_t tmp3= tmp[12];
+        const int16_t tmp4= tmp[16];
+        const int16_t tmp5= tmp[20];
+        const int16_t tmp6= tmp[24];
+        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        dst++;
+        tmp++;
+    }
+}
+
+static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
+        int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
+{
+    int w = size >> 4;
+    double ftmp[11];
+    uint64_t tmp0;
+    DECLARE_VAR_ALL64;
+
+    do {
+        int h = size;
+        __asm__ volatile (
+            "dli        %[tmp0],    0x02                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
+            "1:                                                         \n\t"
+            MMI_LDC1(%[ftmp0], %[tmp], 0x00)
+            MMI_LDC1(%[ftmp3], %[tmp], 0x08)
+            MMI_ULDC1(%[ftmp1], %[tmp], 0x02)
+            MMI_ULDC1(%[ftmp4], %[tmp], 0x0a)
+            MMI_LDC1(%[ftmp7], %[tmp], 0x10)
+            MMI_ULDC1(%[ftmp8], %[tmp], 0x12)
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            MMI_ULDC1(%[ftmp2], %[tmp], 0x04)
+            MMI_ULDC1(%[ftmp5], %[tmp], 0x0c)
+            MMI_ULDC1(%[ftmp7], %[tmp], 0x06)
+            MMI_ULDC1(%[ftmp8], %[tmp], 0x0e)
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+            MMI_LDC1(%[ftmp6], %[dst], 0x00)
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            PTR_ADDI   "%[tmp],     %[tmp],         0x30                \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [ftmp10]"=&f"(ftmp[10]),
+              [tmp0]"=&r"(tmp0),
+              RESTRICT_ASM_ALL64
+              [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
+              [h]"+&r"(h)
+            : [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+
+        tmp += 8 - size * 24;
+        dst += 8 - size * dstStride;
+    } while (w--);
+}
+
+static void avg_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride, int size)
+{
+    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size);
+    avg_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size);
+}
+
+static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{
+    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
+            srcStride, 8);
+}
+
+static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{
+    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
+            srcStride, 16);
+}
+
+static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    double ftmp[10];
+    uint64_t tmp[2];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "dli        %[tmp1],    0x02                                    \n\t"
+        "ori        %[tmp0],    $0,             0x8                     \n\t"
+        "mtc1       %[tmp1],    %[ftmp7]                                \n\t"
+        "dli        %[tmp1],    0x05                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp1],    %[ftmp8]                                \n\t"
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src], 0x01)
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psllh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        MMI_ULDC1(%[ftmp2], %[src], -0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
+        "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
+        MMI_ULWC1(%[ftmp2], %[src], -0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x07)
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp5], %[src2], 0x00)
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_LDC1(%[ftmp9], %[dst], 0x00)
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
+        MMI_SDC1(%[ftmp1], %[dst], 0x00)
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
+        "bgtz       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [src2]"+&r"(src2)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [src2Stride]"r"((mips_reg)src2Stride),
+          [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void avg_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride);
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride,
+            src2Stride);
+
+    src += 8 * dstStride;
+    dst += 8 * dstStride;
+    src2 += 8 * src2Stride;
+
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, src2, dstStride, src2Stride);
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst + 8, src + 8, src2 + 8, dstStride,
+            src2Stride);
+}
+
+static void avg_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
+{
+    double ftmp[8];
+    uint64_t tmp0;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    do {
+        __asm__ volatile (
+            "dli        %[tmp0],    0x05                                \n\t"
+            MMI_ULDC1(%[ftmp0], %[src16], 0x00)
+            "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
+            MMI_ULDC1(%[ftmp1], %[src16], 0x08)
+            MMI_ULDC1(%[ftmp2], %[src16], 0x30)
+            MMI_ULDC1(%[ftmp3], %[src16], 0x38)
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            MMI_LDC1(%[ftmp4], %[src8], 0x00)
+            MMI_LDXC1(%[ftmp5], %[src8], %[src8Stride], 0x00)
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
+            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
+            MMI_LDC1(%[ftmp7], %[dst], 0x00)
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            MMI_LDXC1(%[ftmp7], %[dst], %[dstStride], 0x00)
+            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            MMI_SDXC1(%[ftmp2], %[dst], %[dstStride], 0x00)
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
+              RESTRICT_ASM_ADDRT
+              [tmp0]"=&r"(tmp0)
+            : [src8]"r"(src8),              [src16]"r"(src16),
+              [dst]"r"(dst),
+              [src8Stride]"r"((mips_reg)src8Stride),
+              [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+
+        src8  += 2 * src8Stride;
+        src16 += 48;
+        dst   += 2 * dstStride;
+    } while (b -= 2);
+}
+
+static void avg_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
+{
+    avg_pixels8_l2_shift5_mmi(dst, src16, src8, dstStride, src8Stride, b);
+    avg_pixels8_l2_shift5_mmi(dst + 8, src16 + 8, src8 + 8, dstStride,
+            src8Stride, b);
+}
+
+//DEF_H264_MC_MMI(put_, 4)
+void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    ff_put_pixels4_8_mmi(dst, src, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, src, half, stride, stride, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, src+1, half, stride, stride, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, full_mid, half, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, full_mid+4, half, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+//DEF_H264_MC_MMI(avg_, 4)
+void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    ff_avg_pixels4_8_mmi(dst, src, stride, 4);
+}
+
+void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, src, half, stride, stride, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, src+1, half, stride, stride, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, full_mid, half, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    avg_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4);
+}
+
+void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, full_mid+4, half, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+//DEF_H264_MC_MMI(put_, 8)
+void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    ff_put_pixels8_8_mmi(dst, src, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    ff_put_pixels8_l2_8_mmi(dst, src, half, stride, stride, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    ff_put_pixels8_l2_8_mmi(dst, src+1, half, stride, stride, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, full_mid, half, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, full_mid+8, half, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) temp[192];
+
+    put_h264_qpel8_hv_lowpass_mmi(dst, temp, src, stride, 8, stride);
+}
+
+void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    put_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    put_pixels8_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    put_pixels8_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 8, 8);
+}
+
+//DEF_H264_MC_MMI(avg_, 8)
+void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    ff_avg_pixels8_l2_8_mmi(dst, src, half, stride, stride, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    ff_avg_pixels8_l2_8_mmi(dst, src+1, half, stride, stride, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, full_mid, half, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    avg_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, full_mid+8, half, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) temp[192];
+
+    avg_h264_qpel8_hv_lowpass_mmi(dst, temp, src, stride, 8, stride);
+}
+
+void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    avg_pixels8_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[448];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 64);
+
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, halfV, src, 8, 8, stride);
+    avg_pixels8_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 8, 8);
+}
+
+//DEF_H264_MC_MMI(put_, 16)
+void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    ff_put_pixels16_8_mmi(dst, src, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    ff_put_pixels16_l2_8_mmi(dst, src, half, stride, stride, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    ff_put_pixels16_l2_8_mmi(dst, src+1, half, stride, stride, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, full_mid, half, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, full_mid+16, half, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) temp[384];
+
+    put_h264_qpel16_hv_lowpass_mmi(dst, temp, src, stride, 16, stride);
+}
+
+void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    put_h264_qpel16_h_lowpass_l2_mmi(dst, src, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    put_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    put_pixels16_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    put_pixels16_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 16, 16);
+}
+
+//DEF_H264_MC_MMI(avg_, 16)
+void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    ff_avg_pixels16_l2_8_mmi(dst, src, half, stride, stride, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    ff_avg_pixels16_l2_8_mmi(dst, src+1, half, stride, stride, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, full_mid, half, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    avg_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, full_mid+16, half, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) temp[384];
+
+    avg_h264_qpel16_hv_lowpass_mmi(dst, temp, src, stride, 16, stride);
+}
+
+void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_pixels16_l2_shift5_mmi(dst, halfV + 2, halfHV, stride, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) temp[1024];
+    uint8_t *const halfHV = temp;
+    int16_t *const halfV = (int16_t *) (temp + 256);
+
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, halfV, src, 16, 16, stride);
+    avg_pixels16_l2_shift5_mmi(dst, halfV + 3, halfHV, stride, 16, 16);
+}
+
+#undef op2_avg
+#undef op2_put
diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
new file mode 100644
index 0000000..9c779bd
--- /dev/null
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -0,0 +1,5789 @@
+/*
+ * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
+    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
+    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+
+    /* 4 width cases */
+    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
+    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
+    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
+};
+
+#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
+                                        out1, out2)                          \
+{                                                                            \
+    v16i8 tmp0_m, tmp1_m;                                                    \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
+                                                                             \
+    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
+    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \
+    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
+    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
+    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
+    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \
+}
+
+#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
+( {                                                        \
+    v8i16 out0_m;                                          \
+    v16i8 tmp0_m;                                          \
+    v16i8 minus5b = __msa_ldi_b(-5);                       \
+    v16i8 plus20b = __msa_ldi_b(20);                       \
+                                                           \
+    tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);        \
+    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);               \
+                                                           \
+    tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);        \
+    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);     \
+                                                           \
+    tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);        \
+    out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m);     \
+                                                           \
+    out0_m;                                                \
+} )
+
+#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
+( {                                                                 \
+    v8i16 out0_m;                                                   \
+                                                                    \
+    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
+    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
+    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
+                                                                    \
+    out0_m;                                                         \
+} )
+
+#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)       \
+( {                                                                 \
+    v4i32 out0_m;                                                   \
+                                                                    \
+    out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);           \
+    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1);  \
+    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2);  \
+    out0_m = __msa_srari_w(out0_m, 10);                             \
+    out0_m = __msa_sat_s_w(out0_m, 7);                              \
+    out0_m;                                                         \
+} )
+
+static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                    uint8_t *dst, int32_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
+    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
+    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * stride);
+
+    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
+
+    SRARI_H2_SH(hz_out0, hz_out1, 5);
+    SAT_SH2_SH(hz_out0, hz_out1, 7);
+
+    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
+
+    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
+    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
+
+    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
+    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
+    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
+                             filt2);
+    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
+                             filt2);
+    SRARI_H2_SH(vt_out0, vt_out1, 5);
+    SAT_SH2_SH(vt_out0, vt_out1, 7);
+
+    out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+    out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                    uint8_t *dst, int32_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out0, out1;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+    v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
+    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
+    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
+    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
+    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * stride);
+
+    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+
+    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+    src_x += (4 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
+
+    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
+    src_y += (4 * stride);
+    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
+
+    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
+               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
+    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
+               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
+    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
+                             filt2);
+    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
+                             filt2);
+    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
+                             filt2);
+    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
+                             filt2);
+    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
+    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
+
+    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
+    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
+
+    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    ST8x4_UB(out0, out1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
+    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
+
+    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
+               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
+               src_vt1211_r);
+    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
+                             filt2);
+    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
+                             filt2);
+    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
+                             filt2);
+    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
+                             filt1, filt2);
+    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
+    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
+
+    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
+    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
+
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    ST8x4_UB(out0, out1, dst, stride);
+    dst += (4 * stride);
+}
+
+static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
+                                      const uint8_t *src_y, uint8_t *dst,
+                                      int32_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    const uint8_t *src_x_tmp = src_x;
+    const uint8_t *src_y_tmp = src_y;
+    uint8_t *dst_tmp = dst;
+    uint32_t multiple8_cnt, loop_cnt;
+    v16u8 tmp0, tmp1;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+    v16i8 src_vt7, src_vt8;
+    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
+    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
+    v8i16 vt_out3, out0, out1, out2, out3;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        src_x = src_x_tmp;
+        src_y = src_y_tmp;
+        dst = dst_tmp;
+
+        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+        src_y += (5 * stride);
+
+        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+            src_x += (4 * stride);
+
+            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
+            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
+            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
+            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
+            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
+            src_y += (4 * stride);
+
+            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
+            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
+                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
+                       src_vt43_r);
+            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
+                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
+                       src_vt87_r);
+            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
+                                     filt1, filt2);
+            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
+                                     filt1, filt2);
+            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
+                                     filt1, filt2);
+            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
+                                     filt1, filt2);
+            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
+            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
+
+            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
+            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
+
+            SAT_SH4_SH(out0, out1, out2, out3, 7);
+            tmp0 = PCKEV_XORI128_UB(out0, out1);
+            tmp1 = PCKEV_XORI128_UB(out2, out3);
+            ST8x4_UB(tmp0, tmp1, dst, stride);
+            dst += (4 * stride);
+
+            src_vt0 = src_vt4;
+            src_vt1 = src_vt5;
+            src_vt2 = src_vt6;
+            src_vt3 = src_vt7;
+            src_vt4 = src_vt8;
+        }
+
+        src_x_tmp += 8;
+        src_y_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
+                                                 const uint8_t *src_y,
+                                                 uint8_t *dst,
+                                                 int32_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 res, dst0 = { 0 };
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+    v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
+    v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
+    v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * stride);
+
+    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
+
+    SRARI_H2_SH(hz_out0, hz_out1, 5);
+    SAT_SH2_SH(hz_out0, hz_out1, 7);
+
+    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
+
+    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
+    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
+
+    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+    ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
+    ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
+    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
+                             filt2);
+    vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
+                             filt2);
+    SRARI_H2_SH(vt_out0, vt_out1, 5);
+    SAT_SH2_SH(vt_out0, vt_out1, 7);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+
+    res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+    res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
+                                                 const uint8_t *src_y,
+                                                 uint8_t *dst,
+                                                 int32_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
+    v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
+    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
+    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
+    v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
+    v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * stride);
+
+    XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+
+    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+    src_x += (4 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
+
+    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+    LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
+    src_y += (4 * stride);
+    XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
+
+    ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
+               src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
+    ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
+               src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
+    vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
+                             filt2);
+    vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
+                             filt2);
+    vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
+                             filt2);
+    vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
+                             filt2);
+    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
+    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
+
+    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
+    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
+
+    LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
+    XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
+
+    SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+    SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+    ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
+               src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
+               src_vt1211_r);
+    vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
+                             filt2);
+    vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
+                             filt2);
+    vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
+                             filt2);
+    vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
+                             filt1, filt2);
+    SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
+    SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
+
+    tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+    tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+    tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
+    tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+    dst += (4 * stride);
+}
+
+static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
+                                                   const uint8_t *src_y,
+                                                   uint8_t *dst,
+                                                   int32_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    const uint8_t *src_x_tmp = src_x;
+    const uint8_t *src_y_tmp = src_y;
+    uint8_t *dst_tmp = dst;
+    uint32_t multiple8_cnt, loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
+    v16i8 src_vt7, src_vt8;
+    v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
+    v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
+    v8i16 vt_out3, out0, out1, out2, out3;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        src_x = src_x_tmp;
+        src_y = src_y_tmp;
+        dst = dst_tmp;
+
+        LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+        src_y += (5 * stride);
+
+        XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
+            XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+            src_x += (4 * stride);
+
+            hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
+            hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
+            hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
+            hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
+            SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+            SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+            LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
+            src_y += (4 * stride);
+
+            XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
+            ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
+                       src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
+                       src_vt43_r);
+            ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
+                       src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
+                       src_vt87_r);
+            vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
+                                     filt1, filt2);
+            vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
+                                     filt1, filt2);
+            vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
+                                     filt1, filt2);
+            vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
+                                     filt1, filt2);
+            SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
+            SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
+
+            out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
+            out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
+            out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
+            out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
+
+            LD4(dst, stride, tp0, tp1, tp2, tp3);
+            INSERT_D2_UB(tp0, tp1, dst0);
+            INSERT_D2_UB(tp2, tp3, dst1);
+
+            SAT_SH4_SH(out0, out1, out2, out3, 7);
+            tmp0 = PCKEV_XORI128_UB(out0, out1);
+            tmp1 = PCKEV_XORI128_UB(out2, out3);
+            AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
+            ST8x4_UB(dst0, dst1, dst, stride);
+            dst += (4 * stride);
+
+            src_vt0 = src_vt4;
+            src_vt1 = src_vt5;
+            src_vt2 = src_vt6;
+            src_vt3 = src_vt7;
+            src_vt4 = src_vt8;
+        }
+
+        src_x_tmp += 8;
+        src_y_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
+    dst += (8 * stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
+}
+
+void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD4(src, stride, src0, src1, src2, src3);
+    src += 4 * stride;
+    LD4(src, stride, src4, src5, src6, src7);
+    SD4(src0, src1, src2, src3, dst, stride);
+    dst += 4 * stride;
+    SD4(src4, src5, src6, src7, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+    dst += (8 * stride);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+
+    LD4(src, stride, tp0, tp1, tp2, tp3);
+    src += 4 * stride;
+    LD4(src, stride, tp4, tp5, tp6, tp7);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    INSERT_D2_UB(tp4, tp5, src2);
+    INSERT_D2_UB(tp6, tp7, src3);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    INSERT_D2_UB(tp4, tp5, dst2);
+    INSERT_D2_UB(tp6, tp7, dst3);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, dst0 = { 0 };
+
+    LW4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+
+    dst0 = __msa_aver_u_b(src0, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t loop_cnt;
+    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    mask3 = mask0 + 8;
+    mask4 = mask1 + 8;
+    mask5 = mask2 + 8;
+    src -= 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += stride;
+        LD_SB2(src, 16, src2, src3);
+        src += stride;
+        LD_SB2(src, 16, src4, src5);
+        src += stride;
+        LD_SB2(src, 16, src6, src7);
+        src += stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
+        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
+        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
+        dst0 = __msa_aver_s_b(dst0, src0);
+        dst1 = __msa_aver_s_b(dst1, src2);
+        dst2 = __msa_aver_s_b(dst2, src4);
+        dst3 = __msa_aver_s_b(dst3, src6);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t loop_cnt;
+    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    mask3 = mask0 + 8;
+    mask4 = mask1 + 8;
+    mask5 = mask2 + 8;
+    src -= 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += stride;
+        LD_SB2(src, 16, src2, src3);
+        src += stride;
+        LD_SB2(src, 16, src4, src5);
+        src += stride;
+        LD_SB2(src, 16, src6, src7);
+        src += stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
+        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
+        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
+        dst0 = __msa_aver_s_b(dst0, src0);
+        dst1 = __msa_aver_s_b(dst1, src2);
+        dst2 = __msa_aver_s_b(dst2, src4);
+        dst3 = __msa_aver_s_b(dst3, src6);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res4, res5, res6, res7);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
+    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
+    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
+    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
+    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
+    SRARI_H4_SH(res0, res1, res2, res3, 5);
+    SRARI_H4_SH(res4, res5, res6, res7, 5);
+    SAT_SH4_SH(res0, res1, res2, res3, 7);
+    SAT_SH4_SH(res4, res5, res6, res7, 7);
+    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
+    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
+    tmp0 = __msa_aver_s_b(tmp0, src0);
+    tmp1 = __msa_aver_s_b(tmp1, src1);
+    tmp2 = __msa_aver_s_b(tmp2, src4);
+    tmp3 = __msa_aver_s_b(tmp3, src5);
+    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
+    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
+}
+
+void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res4, res5, res6, res7);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
+    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
+    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
+    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
+    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
+    SRARI_H4_SH(res0, res1, res2, res3, 5);
+    SRARI_H4_SH(res4, res5, res6, res7, 5);
+    SAT_SH4_SH(res0, res1, res2, res3, 7);
+    SAT_SH4_SH(res4, res5, res6, res7, 7);
+    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
+    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
+    tmp0 = __msa_aver_s_b(tmp0, src0);
+    tmp1 = __msa_aver_s_b(tmp1, src1);
+    tmp2 = __msa_aver_s_b(tmp2, src4);
+    tmp3 = __msa_aver_s_b(tmp3, src5);
+    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
+    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 res0, res1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src - 2, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+    SRARI_H2_SH(res0, res1, 5);
+    SAT_SH2_SH(res0, res1, 7);
+    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
+    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
+    res = __msa_aver_s_b(res, src0);
+    res = (v16i8) __msa_xori_b((v16u8) res, 128);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 res0, res1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src - 2, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+    SRARI_H2_SH(res0, res1, 5);
+    SAT_SH2_SH(res0, res1, 7);
+    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
+    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
+    res = __msa_aver_s_b(res, src0);
+    res = (v16i8) __msa_xori_b((v16u8) res, 128);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    src -= 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += stride;
+        LD_SB2(src, 8, src2, src3);
+        src += stride;
+        LD_SB2(src, 8, src4, src5);
+        src += stride;
+        LD_SB2(src, 8, src6, src7);
+        src += stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
+                    vec2, vec3);
+        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
+        ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                 plus20b, res0, res1, res2, res3);
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                 plus20b, res4, res5, res6, res7);
+    SRARI_H4_SH(res0, res1, res2, res3, 5);
+    SRARI_H4_SH(res4, res5, res6, res7, 5);
+    SAT_SH4_SH(res0, res1, res2, res3, 7);
+    SAT_SH4_SH(res4, res5, res6, res7, 7);
+    out0 = PCKEV_XORI128_UB(res0, res1);
+    out1 = PCKEV_XORI128_UB(res2, res3);
+    out2 = PCKEV_XORI128_UB(res4, res5);
+    out3 = PCKEV_XORI128_UB(res6, res7);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 res0, res1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src - 2, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+    SRARI_H2_SH(res0, res1, 5);
+    SAT_SH2_SH(res0, res1, 7);
+    out = PCKEV_XORI128_UB(res0, res1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res0, res1, res2, res3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, stride, src5, src6, src7, src8);
+        src += (4 * stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
+        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
+        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
+        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, stride);
+        dst += (4 * stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res0, res1, res2, res3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, stride, src5, src6, src7, src8);
+        src += (4 * stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
+        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
+        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
+        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, stride);
+        dst += (4 * stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
+    v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
+    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
+               src109_r, src1110_r, src1211_r);
+    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
+    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
+    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
+    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
+    PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
+    PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
+    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
+    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
+    out0 = __msa_aver_s_b(out0, tmp0);
+    out1 = __msa_aver_s_b(out1, tmp1);
+    out2 = __msa_aver_s_b(out2, tmp2);
+    out3 = __msa_aver_s_b(out3, tmp3);
+    XORI_B4_128_SB(out0, out1, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
+    v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
+               src109_r, src1110_r, src1211_r);
+    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
+    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
+    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
+    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
+    PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
+    PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
+    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
+    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
+    out0 = __msa_aver_s_b(out0, tmp0);
+    out1 = __msa_aver_s_b(out1, tmp1);
+    out2 = __msa_aver_s_b(out2, tmp2);
+    out3 = __msa_aver_s_b(out3, tmp3);
+    XORI_B4_128_SB(out0, out1, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v8i16 out10, out32;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
+    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+    out = __msa_aver_u_b(out, (v16u8) src32_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v8i16 out10, out32;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
+    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
+    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+    out = __msa_aver_u_b(out, (v16u8) src32_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride);
+}
+
+void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst,
+                              stride);
+}
+
+void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst,
+                              stride);
+}
+
+void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride);
+}
+
+void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride);
+}
+
+void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride);
+}
+
+void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst,
+                            stride);
+}
+
+
+void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride);
+}
+
+void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride);
+}
+
+void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst,
+                            stride);
+}
+
+void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t *dst_tmp = dst;
+    const uint8_t *src_tmp = src - (2 * stride) - 2;
+    uint32_t multiple8_cnt, loop_cnt;
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
+    v16i8 mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
+    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
+    v8i16 hz_out87_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        dst = dst_tmp;
+        src = src_tmp;
+
+        LD_SB5(src, stride, src0, src1, src2, src3, src4);
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+        src += (5 * stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB4(src, stride, src5, src6, src7, src8);
+            src += (4 * stride);
+
+            XORI_B4_128_SB(src5, src6, src7, src8);
+
+            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+
+            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
+                       hz_out43_r);
+            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
+                       hz_out43_l);
+            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
+                       hz_out87_r);
+            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
+                       hz_out87_l);
+
+            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
+                                  filt1, filt2);
+            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
+                                  filt1, filt2);
+            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
+                                  filt1, filt2);
+            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
+                                  filt1, filt2);
+            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+            dst1 = __msa_srari_h(hz_out2, 5);
+            dst3 = __msa_srari_h(hz_out3, 5);
+            dst5 = __msa_srari_h(hz_out4, 5);
+            dst7 = __msa_srari_h(hz_out5, 5);
+            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
+
+            dst0 = __msa_aver_s_h(dst0, dst1);
+            dst1 = __msa_aver_s_h(dst2, dst3);
+            dst2 = __msa_aver_s_h(dst4, dst5);
+            dst3 = __msa_aver_s_h(dst6, dst7);
+
+            out0 = PCKEV_XORI128_UB(dst0, dst1);
+            out1 = PCKEV_XORI128_UB(dst2, dst3);
+            ST8x4_UB(out0, out1, dst, stride);
+            dst += (4 * stride);
+
+            hz_out0 = hz_out4;
+            hz_out1 = hz_out5;
+            hz_out2 = hz_out6;
+            hz_out3 = hz_out7;
+            hz_out4 = hz_out8;
+        }
+
+        src_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t *dst_tmp = dst;
+    const uint8_t *src_tmp = src - (2 * stride) - 2;
+    uint32_t multiple8_cnt, loop_cnt;
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
+    v16i8 mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
+    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
+    v8i16 hz_out87_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        dst = dst_tmp;
+        src = src_tmp;
+
+        LD_SB5(src, stride, src0, src1, src2, src3, src4);
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+        src += (5 * stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB4(src, stride, src5, src6, src7, src8);
+            src += (4 * stride);
+
+            XORI_B4_128_SB(src5, src6, src7, src8);
+
+            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+
+            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
+                       hz_out43_r);
+            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
+                       hz_out43_l);
+            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
+                       hz_out87_r);
+            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
+                       hz_out87_l);
+
+            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
+                                  filt1, filt2);
+            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
+                                  filt1, filt2);
+            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
+                                  filt1, filt2);
+            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
+                                  filt1, filt2);
+            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+            dst1 = __msa_srari_h(hz_out3, 5);
+            dst3 = __msa_srari_h(hz_out4, 5);
+            dst5 = __msa_srari_h(hz_out5, 5);
+            dst7 = __msa_srari_h(hz_out6, 5);
+            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
+
+            dst0 = __msa_aver_s_h(dst0, dst1);
+            dst1 = __msa_aver_s_h(dst2, dst3);
+            dst2 = __msa_aver_s_h(dst4, dst5);
+            dst3 = __msa_aver_s_h(dst6, dst7);
+
+            out0 = PCKEV_XORI128_UB(dst0, dst1);
+            out1 = PCKEV_XORI128_UB(dst2, dst3);
+            ST8x4_UB(out0, out1, dst, stride);
+            dst += (4 * stride);
+
+            hz_out0 = hz_out4;
+            hz_out1 = hz_out5;
+            hz_out2 = hz_out6;
+            hz_out3 = hz_out7;
+            hz_out4 = hz_out8;
+        }
+
+        src_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
+    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
+    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
+    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
+    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    src += (4 * stride);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
+                          filt2);
+    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
+                          filt2);
+    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
+    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out2);
+    dst1 = __msa_aver_s_h(dst1, hz_out3);
+    dst2 = __msa_aver_s_h(dst2, hz_out4);
+    dst3 = __msa_aver_s_h(dst3, hz_out5);
+
+    out0 = PCKEV_XORI128_UB(dst0, dst1);
+    out1 = PCKEV_XORI128_UB(dst2, dst3);
+    ST8x4_UB(out0, out1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src, stride, src9, src10, src11, src12);
+    XORI_B4_128_SB(src9, src10, src11, src12);
+    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
+    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
+    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
+    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
+               hz_out1211_r);
+    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
+               hz_out1211_l);
+    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
+                          filt2);
+    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
+                          filt2);
+    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
+    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out6);
+    dst1 = __msa_aver_s_h(dst1, hz_out7);
+    dst2 = __msa_aver_s_h(dst2, hz_out8);
+    dst3 = __msa_aver_s_h(dst3, hz_out9);
+
+    out0 = PCKEV_XORI128_UB(dst0, dst1);
+    out1 = PCKEV_XORI128_UB(dst2, dst3);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
+    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
+    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
+    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
+    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    src += (4 * stride);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
+                          filt2);
+    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
+                          filt2);
+    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
+    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out3);
+    dst1 = __msa_aver_s_h(dst1, hz_out4);
+    dst2 = __msa_aver_s_h(dst2, hz_out5);
+    dst3 = __msa_aver_s_h(dst3, hz_out6);
+
+    out0 = PCKEV_XORI128_UB(dst0, dst1);
+    out1 = PCKEV_XORI128_UB(dst2, dst3);
+    ST8x4_UB(out0, out1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src, stride, src9, src10, src11, src12);
+    XORI_B4_128_SB(src9, src10, src11, src12);
+    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
+    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
+    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
+    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
+               hz_out1211_r);
+    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
+               hz_out1211_l);
+    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
+                          filt2);
+    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
+                          filt2);
+    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
+    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out7);
+    dst1 = __msa_aver_s_h(dst1, hz_out8);
+    dst2 = __msa_aver_s_h(dst2, hz_out9);
+    dst3 = __msa_aver_s_h(dst3, hz_out10);
+
+    out0 = PCKEV_XORI128_UB(dst0, dst1);
+    out1 = PCKEV_XORI128_UB(dst2, dst3);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 res;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    SRARI_H2_SH(hz_out2, hz_out4, 5);
+    SAT_SH2_SH(hz_out2, hz_out4, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out2);
+    dst1 = __msa_aver_s_h(dst1, hz_out4);
+
+    res = PCKEV_XORI128_UB(dst0, dst1);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 res;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
+    SRARI_H2_SH(hz_out0, hz_out1, 5);
+    SAT_SH2_SH(hz_out0, hz_out1, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out0);
+    dst1 = __msa_aver_s_h(dst1, hz_out1);
+
+    res = PCKEV_XORI128_UB(dst0, dst1);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res0, res1, res2, res3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, stride, src5, src6, src7, src8);
+        src += (4 * stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, stride);
+        dst += (4 * stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
+    v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
+    v16i8 filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_SB5(src, stride, src8, src9, src10, src11, src12);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
+               src98_r, src109_r);
+    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
+               src910_r, src1110_r, src1211_r);
+    XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
+    XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
+    XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
+    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
+    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
+    out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
+    out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
+    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
+    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v8i16 out10, out32;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+               src76_r, src2110, src4332, src6554, src8776);
+    XORI_B4_128_SB(src2110, src4332, src6554, src8776);
+    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t row;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src7, src8, src9, src10, src11);
+
+    for (row = 16; row--;) {
+        LD_SB2(src, 8, src5, src6);
+        src += stride;
+        XORI_B2_128_SB(src5, src6);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        dst0 = __msa_srari_h(shf_vec2, 5);
+        dst1 = __msa_srari_h(shf_vec5, 5);
+        dst2 = __msa_srari_h(shf_vec8, 5);
+        dst3 = __msa_srari_h(shf_vec11, 5);
+        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
+        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
+        dst0 = __msa_aver_s_h(dst2, dst0);
+        dst1 = __msa_aver_s_h(dst3, dst1);
+        out = PCKEV_XORI128_UB(dst0, dst1);
+        ST_UB(out, dst);
+        dst += stride;
+
+        src0 = src1;
+        src1 = src2;
+        src2 = src3;
+        src3 = src4;
+        src4 = src5;
+        src7 = src8;
+        src8 = src9;
+        src9 = src10;
+        src10 = src11;
+        src11 = src6;
+    }
+}
+
+void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t row;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src7, src8, src9, src10, src11);
+
+    for (row = 16; row--;) {
+        LD_SB2(src, 8, src5, src6);
+        src += stride;
+        XORI_B2_128_SB(src5, src6);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        dst0 = __msa_srari_h(shf_vec2, 5);
+        dst1 = __msa_srari_h(shf_vec5, 5);
+        dst2 = __msa_srari_h(shf_vec8, 5);
+        dst3 = __msa_srari_h(shf_vec11, 5);
+        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
+        dst0 = __msa_pckod_h(dst2, dst0);
+        dst1 = __msa_pckod_h(dst3, dst1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
+        dst0 = __msa_aver_s_h(dst2, dst0);
+        dst1 = __msa_aver_s_h(dst3, dst1);
+        out = PCKEV_XORI128_UB(dst0, dst1);
+        ST_UB(out, dst);
+        dst += stride;
+
+        src0 = src1;
+        src1 = src2;
+        src2 = src3;
+        src3 = src4;
+        src4 = src5;
+        src7 = src8;
+        src8 = src9;
+        src9 = src10;
+        src10 = src11;
+        src11 = src6;
+    }
+}
+
+void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t row;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
+    v8i16 mask3, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = 4; row--;) {
+        LD_SB2(src, stride, src5, src6);
+        src += (2 * stride);
+        XORI_B2_128_SB(src5, src6);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        dst0 = __msa_srari_h(shf_vec2, 5);
+        dst1 = __msa_srari_h(shf_vec5, 5);
+        dst2 = __msa_srari_h(shf_vec8, 5);
+        dst3 = __msa_srari_h(shf_vec11, 5);
+        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
+        PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
+        dst0 = __msa_aver_s_h(dst2, dst0);
+        dst1 = __msa_aver_s_h(dst3, dst1);
+        out = PCKEV_XORI128_UB(dst0, dst1);
+        ST8x2_UB(out, dst, stride);
+        dst += (2 * stride);
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t row;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
+    v8i16 mask3, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = 4; row--;) {
+        LD_SB2(src, stride, src5, src6);
+        src += (2 * stride);
+        XORI_B2_128_SB(src5, src6);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        dst0 = __msa_srari_h(shf_vec2, 5);
+        dst1 = __msa_srari_h(shf_vec5, 5);
+        dst2 = __msa_srari_h(shf_vec8, 5);
+        dst3 = __msa_srari_h(shf_vec11, 5);
+        SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
+        dst0 = __msa_pckod_h(dst2, dst0);
+        dst1 = __msa_pckod_h(dst3, dst1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
+        dst0 = __msa_aver_s_h(dst2, dst0);
+        dst1 = __msa_aver_s_h(dst3, dst1);
+        out = PCKEV_XORI128_UB(dst0, dst1);
+        ST8x2_UB(out, dst, stride);
+        dst += (2 * stride);
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
+    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
+    v16i8 src76_l, src87_l, filt0, filt1, filt2;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
+               src76_l, src87_l);
+    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
+    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
+    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
+
+    SRARI_W2_SW(hz_res0, hz_res1, 10);
+    SAT_SW2_SW(hz_res0, hz_res1, 7);
+    SRARI_W2_SW(hz_res2, hz_res3, 10);
+    SAT_SW2_SW(hz_res2, hz_res3, 7);
+
+    dst0 = __msa_srari_h(shf_vec2, 5);
+    dst1 = __msa_srari_h(shf_vec5, 5);
+    dst2 = __msa_srari_h(shf_vec6, 5);
+    dst3 = __msa_srari_h(shf_vec7, 5);
+
+    SAT_SH2_SH(dst0, dst1, 7);
+    SAT_SH2_SH(dst2, dst3, 7);
+    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
+    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
+
+    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
+    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
+    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
+    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
+
+    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
+    out = PCKEV_XORI128_UB(dst0, dst2);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
+    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
+    v16i8 src76_l, src87_l, filt0, filt1, filt2;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
+               src76_l, src87_l);
+
+    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
+    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
+    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
+
+    SRARI_W2_SW(hz_res0, hz_res1, 10);
+    SAT_SW2_SW(hz_res0, hz_res1, 7);
+    SRARI_W2_SW(hz_res2, hz_res3, 10);
+    SAT_SW2_SW(hz_res2, hz_res3, 7);
+
+    dst0 = __msa_srari_h(shf_vec2, 5);
+    dst1 = __msa_srari_h(shf_vec5, 5);
+    dst2 = __msa_srari_h(shf_vec6, 5);
+    dst3 = __msa_srari_h(shf_vec7, 5);
+
+    SAT_SH2_SH(dst0, dst1, 7);
+    SAT_SH2_SH(dst2, dst3, 7);
+
+    dst0 = __msa_ilvod_h(zeros, dst0);
+    dst1 = __msa_ilvod_h(zeros, dst1);
+    dst2 = __msa_ilvod_h(zeros, dst2);
+    dst3 = __msa_ilvod_h(zeros, dst3);
+
+    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
+    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
+    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
+    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
+
+    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
+    out = PCKEV_XORI128_UB(dst0, dst2);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    const uint8_t *src_tmp = src - (2 * stride) - 2;
+    uint8_t *dst_tmp = dst;
+    uint32_t multiple8_cnt, loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
+    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
+    v8i16 hz_out87_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        src = src_tmp;
+        dst = dst_tmp;
+
+        LD_SB5(src, stride, src0, src1, src2, src3, src4);
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+        src += (5 * stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB4(src, stride, src0, src1, src2, src3);
+            XORI_B4_128_SB(src0, src1, src2, src3);
+            src += (4 * stride);
+
+            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+
+            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
+                       hz_out43_r);
+            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
+                       hz_out43_l);
+            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
+                       hz_out87_r);
+            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
+                       hz_out87_l);
+
+            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
+                                  filt1, filt2);
+            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
+                                  filt1, filt2);
+            dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
+                                  filt1, filt2);
+            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
+                                  filt1, filt2);
+            dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+            out0 = PCKEV_XORI128_UB(dst0, dst1);
+            out1 = PCKEV_XORI128_UB(dst2, dst3);
+            ST8x4_UB(out0, out1, dst, stride);
+            dst += (4 * stride);
+
+            hz_out0 = hz_out4;
+            hz_out1 = hz_out5;
+            hz_out2 = hz_out6;
+            hz_out3 = hz_out7;
+            hz_out4 = hz_out8;
+        }
+
+        src_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
+    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
+    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
+    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
+    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    src -= ((2 * stride) + 2);
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+    LD_SB4(src, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * stride);
+    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
+                          filt2);
+    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
+                          filt2);
+    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    out0 = PCKEV_XORI128_UB(dst0, dst1);
+    out1 = PCKEV_XORI128_UB(dst2, dst3);
+    ST8x4_UB(out0, out1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
+               hz_out1211_r);
+    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
+               hz_out1211_l);
+    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
+                          filt2);
+    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
+                          filt2);
+    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    out0 = PCKEV_XORI128_UB(dst0, dst1);
+    out1 = PCKEV_XORI128_UB(dst2, dst3);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 res;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    res = PCKEV_XORI128_UB(dst0, dst1);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t loop_cnt;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    mask3 = mask0 + 8;
+    mask4 = mask1 + 8;
+    mask5 = mask2 + 8;
+    src -= 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += stride;
+        LD_SB2(src, 16, src2, src3);
+        src += stride;
+        LD_SB2(src, 16, src4, src5);
+        src += stride;
+        LD_SB2(src, 16, src6, src7);
+        src += stride;
+
+        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
+        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
+        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
+        out0 = __msa_aver_s_b(out0, src0);
+        out1 = __msa_aver_s_b(out1, src2);
+        out2 = __msa_aver_s_b(out2, src4);
+        out3 = __msa_aver_s_b(out3, src6);
+        XORI_B4_128_SB(out0, out1, out2, out3);
+        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t loop_cnt;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    mask3 = mask0 + 8;
+    mask4 = mask1 + 8;
+    mask5 = mask2 + 8;
+    src -= 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += stride;
+        LD_SB2(src, 16, src2, src3);
+        src += stride;
+        LD_SB2(src, 16, src4, src5);
+        src += stride;
+        LD_SB2(src, 16, src6, src7);
+        src += stride;
+
+        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
+        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
+        PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
+        out0 = __msa_aver_s_b(out0, src0);
+        out1 = __msa_aver_s_b(out1, src2);
+        out2 = __msa_aver_s_b(out2, src4);
+        out3 = __msa_aver_s_b(out3, src6);
+        XORI_B4_128_SB(out0, out1, out2, out3);
+        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+        AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res4, res5, res6, res7);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
+    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
+    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
+    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
+    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
+    SRARI_H4_SH(res0, res1, res2, res3, 5);
+    SRARI_H4_SH(res4, res5, res6, res7, 5);
+    SAT_SH4_SH(res0, res1, res2, res3, 7);
+    SAT_SH4_SH(res4, res5, res6, res7, 7);
+    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
+    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
+    tmp0 = __msa_aver_s_b(tmp0, src0);
+    tmp1 = __msa_aver_s_b(tmp1, src1);
+    tmp2 = __msa_aver_s_b(tmp2, src4);
+    tmp3 = __msa_aver_s_b(tmp3, src5);
+    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
+    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res4, res5, res6, res7);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
+    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
+    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
+    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
+    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
+    SRARI_H4_SH(res0, res1, res2, res3, 5);
+    SRARI_H4_SH(res4, res5, res6, res7, 5);
+    SAT_SH4_SH(res0, res1, res2, res3, 7);
+    SAT_SH4_SH(res4, res5, res6, res7, 7);
+    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
+    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
+    tmp0 = __msa_aver_s_b(tmp0, src0);
+    tmp1 = __msa_aver_s_b(tmp1, src1);
+    tmp2 = __msa_aver_s_b(tmp2, src4);
+    tmp3 = __msa_aver_s_b(tmp3, src5);
+    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
+    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 mask0, mask1, mask2;
+    v8i16 out0, out1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src - 2, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
+    SRARI_H2_SH(out0, out1, 5);
+    SAT_SH2_SH(out0, out1, 7);
+    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
+    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
+    res = __msa_aver_s_b(res, src0);
+    res = (v16i8) __msa_xori_b((v16u8) res, 128);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 mask0, mask1, mask2;
+    v8i16 out0, out1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src - 2, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
+    SRARI_H2_SH(out0, out1, 5);
+    SAT_SH2_SH(out0, out1, 7);
+    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
+    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
+    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
+    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
+    res = __msa_aver_s_b(res, src0);
+    res = (v16i8) __msa_xori_b((v16u8) res, 128);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t loop_cnt;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    src -= 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += stride;
+        LD_SB2(src, 8, src2, src3);
+        src += stride;
+        LD_SB2(src, 8, src4, src5);
+        src += stride;
+        LD_SB2(src, 8, src6, src7);
+        src += stride;
+
+        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
+                    vec2, vec3);
+        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
+        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
+        AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
+    v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res0, res1, res2, res3);
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                 res4, res5, res6, res7);
+    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
+    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
+                 res4, res5, res6, res7);
+    SRARI_H4_SH(res0, res1, res2, res3, 5);
+    SRARI_H4_SH(res4, res5, res6, res7, 5);
+    SAT_SH4_SH(res0, res1, res2, res3, 7);
+    SAT_SH4_SH(res4, res5, res6, res7, 7);
+    out0 = PCKEV_XORI128_UB(res0, res1);
+    out1 = PCKEV_XORI128_UB(res2, res3);
+    out4 = PCKEV_XORI128_UB(res4, res5);
+    out5 = PCKEV_XORI128_UB(res6, res7);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, out2);
+    INSERT_D2_UB(tp2, tp3, out3);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, out6);
+    INSERT_D2_UB(tp2, tp3, out7);
+    AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
+    AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
+    ST8x8_UB(out0, out1, out4, out5, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 res, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 mask0, mask1, mask2;
+    v8i16 res0, res1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src - 2, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+    SRARI_H2_SH(res0, res1, 5);
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    res = __msa_aver_u_b(res, dst0);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, stride, src5, src6, src7, src8);
+        src += (4 * stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
+        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
+        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
+        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
+        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, stride, src5, src6, src7, src8);
+        src += (4 * stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
+        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
+        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
+        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
+        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
+    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
+    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
+    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+    PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
+    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
+               src21_r, src32_r, src43_r);
+    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
+    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
+    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
+    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
+    PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+
+    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
+    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
+    out0 = __msa_aver_s_b(out0, tmp0);
+    out1 = __msa_aver_s_b(out1, tmp1);
+    out2 = __msa_aver_s_b(out2, tmp2);
+    out3 = __msa_aver_s_b(out3, tmp3);
+    XORI_B4_128_SB(out0, out1, out2, out3);
+    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
+                dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
+    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
+    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
+    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+    PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
+    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
+               src21_r, src32_r, src43_r);
+    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
+    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
+    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
+    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
+    PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+
+    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
+    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
+    out0 = __msa_aver_s_b(out0, tmp0);
+    out1 = __msa_aver_s_b(out1, tmp1);
+    out2 = __msa_aver_s_b(out2, tmp2);
+    out3 = __msa_aver_s_b(out3, tmp3);
+    XORI_B4_128_SB(out0, out1, out2, out3);
+    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
+                dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v8i16 out10, out32;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
+    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    res = PCKEV_XORI128_UB(out10, out32);
+    res = __msa_aver_u_b(res, (v16u8) src32_r);
+    dst0 = __msa_aver_u_b(res, dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v8i16 out10, out32;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    res = PCKEV_XORI128_UB(out10, out32);
+    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
+    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
+    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+    res = __msa_aver_u_b(res, (v16u8) src32_r);
+    dst0 = __msa_aver_u_b(res, dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
+                                           src - (stride * 2),
+                                           dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
+                                           src - (stride * 2) +
+                                           sizeof(uint8_t),
+                                           dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
+                                           src - (stride * 2),
+                                           dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
+                                           src - (stride * 2) +
+                                           sizeof(uint8_t),
+                                           dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
+                                         src - (stride * 2),
+                                         dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
+                                         src - (stride * 2),
+                                         dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), dst, stride);
+}
+
+
+void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
+                                         src - (stride * 2),
+                                         dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
+                                         src - (stride * 2),
+                                         dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    uint8_t *dst_tmp = dst;
+    const uint8_t *src_tmp = src - (2 * stride) - 2;
+    uint32_t multiple8_cnt, loop_cnt;
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
+    v16i8 mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
+    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
+    v8i16 hz_out87_l, filt0, filt1, filt2;
+    v4i32 tmp0_w, tmp1_w;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        dst = dst_tmp;
+        src = src_tmp;
+
+        LD_SB5(src, stride, src0, src1, src2, src3, src4);
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+        src += (5 * stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB2(src, stride, src5, src6);
+            src += (2 * stride);
+
+            XORI_B2_128_SB(src5, src6);
+            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
+                       hz_out43_r);
+            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
+                       hz_out43_l);
+            ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
+                       hz_out65_r);
+            ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
+                       hz_out65_l);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
+                                    filt1, filt2);
+            tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
+                                    filt1, filt2);
+            tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+            tmp1 = __msa_srari_h(hz_out2, 5);
+            tmp3 = __msa_srari_h(hz_out3, 5);
+            SAT_SH2_SH(tmp1, tmp3, 7);
+
+            tmp0 = __msa_aver_s_h(tmp0, tmp1);
+            tmp1 = __msa_aver_s_h(tmp2, tmp3);
+
+            LD2(dst, stride, tp0, tp1);
+            INSERT_D2_UB(tp0, tp1, dst0);
+
+            out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+            dst0 = __msa_aver_u_b(out0, dst0);
+            ST8x2_UB(dst0, dst, stride);
+            dst += (2 * stride);
+
+            LD_SB2(src, stride, src7, src8);
+            src += (2 * stride);
+
+            XORI_B2_128_SB(src7, src8);
+            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+            ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
+                       hz_out87_r);
+            ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
+                       hz_out87_l);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
+                                    filt1, filt2);
+            tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
+                                    filt1, filt2);
+            tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+            tmp5 = __msa_srari_h(hz_out4, 5);
+            tmp7 = __msa_srari_h(hz_out5, 5);
+            SAT_SH2_SH(tmp5, tmp7, 7);
+
+            tmp2 = __msa_aver_s_h(tmp4, tmp5);
+            tmp3 = __msa_aver_s_h(tmp6, tmp7);
+
+            LD2(dst, stride, tp2, tp3);
+            INSERT_D2_UB(tp2, tp3, dst1);
+
+            out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+            dst1 = __msa_aver_u_b(out1, dst1);
+            ST8x2_UB(dst1, dst, stride);
+            dst += (2 * stride);
+
+            hz_out0 = hz_out4;
+            hz_out1 = hz_out5;
+            hz_out2 = hz_out6;
+            hz_out3 = hz_out7;
+            hz_out4 = hz_out8;
+        }
+
+        src_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    uint8_t *dst_tmp = dst;
+    const uint8_t *src_tmp = src - (2 * stride) - 2;
+    uint32_t multiple8_cnt, loop_cnt;
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
+    v16i8 mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
+    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
+    v8i16 hz_out87_l, filt0, filt1, filt2;
+    v4i32 tmp0_w, tmp1_w;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        dst = dst_tmp;
+        src = src_tmp;
+
+        LD_SB5(src, stride, src0, src1, src2, src3, src4);
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+        src += (5 * stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB2(src, stride, src5, src6);
+            src += (2 * stride);
+
+            XORI_B2_128_SB(src5, src6);
+            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
+                       hz_out43_r);
+            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
+                       hz_out43_l);
+            ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
+            ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
+
+            tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
+                                    filt1, filt2);
+            tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
+                                    filt1, filt2);
+            tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+            tmp1 = __msa_srari_h(hz_out3, 5);
+            tmp3 = __msa_srari_h(hz_out4, 5);
+            SAT_SH2_SH(tmp1, tmp3, 7);
+
+            tmp0 = __msa_aver_s_h(tmp0, tmp1);
+            tmp1 = __msa_aver_s_h(tmp2, tmp3);
+
+            LD2(dst, stride, tp0, tp1);
+            INSERT_D2_UB(tp0, tp1, dst0);
+            out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+            dst0 = __msa_aver_u_b(out0, dst0);
+            ST8x2_UB(dst0, dst, stride);
+            dst += (2 * stride);
+
+            LD_SB2(src, stride, src7, src8);
+            src += (2 * stride);
+
+            XORI_B2_128_SB(src7, src8);
+            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+            ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
+                       hz_out87_r);
+            ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
+                       hz_out87_l);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
+                                    filt1, filt2);
+            tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+            tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
+                                    filt1, filt2);
+            tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
+                                    filt1, filt2);
+            tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+            tmp5 = __msa_srari_h(hz_out5, 5);
+            tmp7 = __msa_srari_h(hz_out6, 5);
+            SAT_SH2_SH(tmp5, tmp7, 7);
+
+            tmp2 = __msa_aver_s_h(tmp4, tmp5);
+            tmp3 = __msa_aver_s_h(tmp6, tmp7);
+
+            LD2(dst, stride, tp2, tp3);
+            INSERT_D2_UB(tp2, tp3, dst1);
+            out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+            dst1 = __msa_aver_u_b(out1, dst1);
+            ST8x2_UB(dst1, dst, stride);
+            dst += (2 * stride);
+
+            hz_out0 = hz_out4;
+            hz_out1 = hz_out5;
+            hz_out2 = hz_out6;
+            hz_out3 = hz_out7;
+            hz_out4 = hz_out8;
+        }
+
+        src_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
+    v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
+    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
+    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
+    v4i32 tmp0_w, tmp1_w;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    src += (4 * stride);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
+
+    tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
+                            filt2);
+    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
+                            filt2);
+    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
+                            filt2);
+    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
+                            filt2);
+    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
+    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+
+    tmp0 = __msa_aver_s_h(tmp0, hz_out2);
+    tmp1 = __msa_aver_s_h(tmp1, hz_out3);
+    tmp2 = __msa_aver_s_h(tmp2, hz_out4);
+    tmp3 = __msa_aver_s_h(tmp3, hz_out5);
+
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src, stride, src9, src10, src11, src12);
+    XORI_B4_128_SB(src9, src10, src11, src12);
+    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
+    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
+    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
+    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
+               hz_out1211_r);
+    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
+               hz_out1211_l);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
+                            filt2);
+    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
+                            filt2);
+    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
+                            filt2);
+    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
+                            filt2);
+    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
+    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+
+    tmp0 = __msa_aver_s_h(tmp0, hz_out6);
+    tmp1 = __msa_aver_s_h(tmp1, hz_out7);
+    tmp2 = __msa_aver_s_h(tmp2, hz_out8);
+    tmp3 = __msa_aver_s_h(tmp3, hz_out9);
+
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
+    v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
+    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
+    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
+    v4i32 tmp0_w, tmp1_w;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    src += (4 * stride);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
+    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
+
+    tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
+                            filt2);
+    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
+                            filt2);
+    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
+                            filt2);
+    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
+                            filt2);
+    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
+    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+
+    tmp0 = __msa_aver_s_h(tmp0, hz_out3);
+    tmp1 = __msa_aver_s_h(tmp1, hz_out4);
+    tmp2 = __msa_aver_s_h(tmp2, hz_out5);
+    tmp3 = __msa_aver_s_h(tmp3, hz_out6);
+
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src, stride, src9, src10, src11, src12);
+    XORI_B4_128_SB(src9, src10, src11, src12);
+    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
+    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
+    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
+    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
+               hz_out1211_r);
+    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
+               hz_out1211_l);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
+                            filt2);
+    tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
+                            filt2);
+    tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
+                            filt2);
+    tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+    tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
+                            filt2);
+    tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
+                            filt2);
+    tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
+
+    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
+    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+
+    tmp0 = __msa_aver_s_h(tmp0, hz_out7);
+    tmp1 = __msa_aver_s_h(tmp1, hz_out8);
+    tmp2 = __msa_aver_s_h(tmp2, hz_out9);
+    tmp3 = __msa_aver_s_h(tmp3, hz_out10);
+
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    v16u8 res, out = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    SRARI_H2_SH(hz_out2, hz_out4, 5);
+    SAT_SH2_SH(hz_out2, hz_out4, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out2);
+    dst1 = __msa_aver_s_h(dst1, hz_out4);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
+    res = PCKEV_XORI128_UB(dst0, dst1);
+    res = __msa_aver_u_b(res, out);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 res, out = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
+
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
+    SRARI_H2_SH(hz_out0, hz_out1, 5);
+    SAT_SH2_SH(hz_out0, hz_out1, 7);
+
+    dst0 = __msa_aver_s_h(dst0, hz_out0);
+    dst1 = __msa_aver_s_h(dst1, hz_out1);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
+    res = PCKEV_XORI128_UB(dst0, dst1);
+    res = __msa_aver_u_b(res, out);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, stride, src5, src6, src7, src8);
+        src += (4 * stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+        AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, stride);
+        dst += (4 * stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 filt0, filt1, filt2;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+
+    LD_SB4(src, stride, src7, src8, src9, src10);
+    src += (4 * stride);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+
+    LD_SB4(src, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
+               src21_r, src32_r, src43_r);
+    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
+    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
+    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
+    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
+    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
+    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
+    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
+                dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 res, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v8i16 out10, out32;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= (stride * 2);
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    res = PCKEV_XORI128_UB(out10, out32);
+    dst0 = __msa_aver_u_b(res, dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t row;
+    v16u8 out, dst0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src7, src8, src9, src10, src11);
+
+    for (row = 16; row--;) {
+        LD_SB2(src, 8, src5, src6);
+        src += stride;
+        XORI_B2_128_SB(src5, src6);
+        dst0 = LD_UB(dst);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        tmp0 = __msa_srari_h(shf_vec2, 5);
+        tmp1 = __msa_srari_h(shf_vec5, 5);
+        tmp2 = __msa_srari_h(shf_vec8, 5);
+        tmp3 = __msa_srari_h(shf_vec11, 5);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
+        tmp0 = __msa_aver_s_h(tmp2, tmp0);
+        tmp1 = __msa_aver_s_h(tmp3, tmp1);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        out = __msa_aver_u_b(out, dst0);
+        ST_UB(out, dst);
+        dst += stride;
+
+        src0 = src1;
+        src1 = src2;
+        src2 = src3;
+        src3 = src4;
+        src4 = src5;
+        src7 = src8;
+        src8 = src9;
+        src9 = src10;
+        src10 = src11;
+        src11 = src6;
+    }
+}
+
+void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint32_t row;
+    v16u8 out, dst0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src7, src8, src9, src10, src11);
+
+    for (row = 16; row--;) {
+        LD_SB2(src, 8, src5, src6);
+        src += stride;
+        XORI_B2_128_SB(src5, src6);
+        dst0 = LD_UB(dst);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        tmp0 = __msa_srari_h(shf_vec2, 5);
+        tmp1 = __msa_srari_h(shf_vec5, 5);
+        tmp2 = __msa_srari_h(shf_vec8, 5);
+        tmp3 = __msa_srari_h(shf_vec11, 5);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        tmp0 = __msa_pckod_h(tmp2, tmp0);
+        tmp1 = __msa_pckod_h(tmp3, tmp1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
+        tmp0 = __msa_aver_s_h(tmp2, tmp0);
+        tmp1 = __msa_aver_s_h(tmp3, tmp1);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        out = __msa_aver_u_b(out, dst0);
+        ST_UB(out, dst);
+        dst += stride;
+
+        src0 = src1;
+        src1 = src2;
+        src2 = src3;
+        src3 = src4;
+        src4 = src5;
+        src7 = src8;
+        src8 = src9;
+        src9 = src10;
+        src10 = src11;
+        src11 = src6;
+    }
+}
+
+void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t row;
+    uint64_t tp0, tp1;
+    v16u8 out, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
+    v8i16 mask3, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = 4; row--;) {
+        LD_SB2(src, stride, src5, src6);
+        src += (2 * stride);
+        XORI_B2_128_SB(src5, src6);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        tmp0 = __msa_srari_h(shf_vec2, 5);
+        tmp1 = __msa_srari_h(shf_vec5, 5);
+        tmp2 = __msa_srari_h(shf_vec8, 5);
+        tmp3 = __msa_srari_h(shf_vec11, 5);
+        LD2(dst, stride, tp0, tp1);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
+        tmp0 = __msa_aver_s_h(tmp2, tmp0);
+        tmp1 = __msa_aver_s_h(tmp3, tmp1);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        out = __msa_aver_u_b(out, dst0);
+        ST8x2_UB(out, dst, stride);
+        dst += (2 * stride);
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t row;
+    uint64_t tp0, tp1;
+    v16u8 out, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
+    v8i16 mask3, mask4, mask5;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+
+    mask3 = mask0 + 4;
+    mask4 = mask1 + 4;
+    mask5 = mask2 + 4;
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = 4; row--;) {
+        LD_SB2(src, stride, src5, src6);
+        src += (2 * stride);
+        XORI_B2_128_SB(src5, src6);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+                   mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+                   mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
+                   mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
+                   mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
+        hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+        DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
+        DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
+        SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
+        SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
+        tmp0 = __msa_srari_h(shf_vec2, 5);
+        tmp1 = __msa_srari_h(shf_vec5, 5);
+        tmp2 = __msa_srari_h(shf_vec8, 5);
+        tmp3 = __msa_srari_h(shf_vec11, 5);
+        LD2(dst, stride, tp0, tp1);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        tmp0 = __msa_pckod_h(tmp2, tmp0);
+        tmp1 = __msa_pckod_h(tmp3, tmp1);
+        PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
+        tmp0 = __msa_aver_s_h(tmp2, tmp0);
+        tmp1 = __msa_aver_s_h(tmp3, tmp1);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        out = __msa_aver_u_b(out, dst0);
+        ST8x2_UB(out, dst, stride);
+        dst += (2 * stride);
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out, dstv = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
+    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
+    v16i8 src76_l, src87_l, filt0, filt1, filt2;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
+               src76_l, src87_l);
+    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
+    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
+    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
+
+    SRARI_W2_SW(hz_res0, hz_res1, 10);
+    SAT_SW2_SW(hz_res0, hz_res1, 7);
+    SRARI_W2_SW(hz_res2, hz_res3, 10);
+    SAT_SW2_SW(hz_res2, hz_res3, 7);
+
+    dst0 = __msa_srari_h(shf_vec2, 5);
+    dst1 = __msa_srari_h(shf_vec5, 5);
+    dst2 = __msa_srari_h(shf_vec6, 5);
+    dst3 = __msa_srari_h(shf_vec7, 5);
+
+    SAT_SH2_SH(dst0, dst1, 7);
+    SAT_SH2_SH(dst2, dst3, 7);
+    ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
+    ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
+
+    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
+    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
+    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
+    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
+
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
+    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
+    out = PCKEV_XORI128_UB(dst0, dst2);
+    out = __msa_aver_u_b(out, dstv);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    const int16_t filt_const0 = 0xfb01;
+    const int16_t filt_const1 = 0x1414;
+    const int16_t filt_const2 = 0x1fb;
+    v16u8 out, dstv = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
+    v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
+    v16i8 src76_l, src87_l, filt0, filt1, filt2;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
+    v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
+               src32_l, src43_l);
+    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
+               src76_l, src87_l);
+    vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+    hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+    hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+    vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
+               mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
+    VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
+               mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
+    hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+    DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
+    hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+    DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
+
+    SRARI_W2_SW(hz_res0, hz_res1, 10);
+    SAT_SW2_SW(hz_res0, hz_res1, 7);
+    SRARI_W2_SW(hz_res2, hz_res3, 10);
+    SAT_SW2_SW(hz_res2, hz_res3, 7);
+
+    dst0 = __msa_srari_h(shf_vec2, 5);
+    dst1 = __msa_srari_h(shf_vec5, 5);
+    dst2 = __msa_srari_h(shf_vec6, 5);
+    dst3 = __msa_srari_h(shf_vec7, 5);
+
+    SAT_SH2_SH(dst0, dst1, 7);
+    SAT_SH2_SH(dst2, dst3, 7);
+
+    dst0 = __msa_ilvod_h(zeros, dst0);
+    dst1 = __msa_ilvod_h(zeros, dst1);
+    dst2 = __msa_ilvod_h(zeros, dst2);
+    dst3 = __msa_ilvod_h(zeros, dst3);
+
+    hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
+    hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
+    hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
+    hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
+
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
+    PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
+    out = PCKEV_XORI128_UB(dst0, dst2);
+    out = __msa_aver_u_b(out, dstv);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    const uint8_t *src_tmp = src - (2 * stride) - 2;
+    uint8_t *dst_tmp = dst;
+    uint64_t tp0, tp1, tp2, tp3;
+    uint32_t multiple8_cnt, loop_cnt;
+    v16u8 dst0, dst1, out0, out1;
+    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
+    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
+    v8i16 hz_out87_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        src = src_tmp;
+        dst = dst_tmp;
+
+        LD_SB5(src, stride, src0, src1, src2, src3, src4);
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+        src += (5 * stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+        for (loop_cnt = 4; loop_cnt--;) {
+            LD_SB4(src, stride, src0, src1, src2, src3);
+            XORI_B4_128_SB(src0, src1, src2, src3);
+            src += (4 * stride);
+
+            hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+            hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+            hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+            hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
+                       hz_out43_r);
+            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
+                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
+                       hz_out43_l);
+            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
+                       hz_out87_r);
+            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
+                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
+                       hz_out87_l);
+
+            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
+                                  filt1, filt2);
+            res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
+                                  filt1, filt2);
+            res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
+                                  filt1, filt2);
+            res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
+                                  filt1, filt2);
+            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
+                                  filt1, filt2);
+            res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+
+            LD4(dst, stride, tp0, tp1, tp2, tp3);
+            INSERT_D2_UB(tp0, tp1, dst0);
+            INSERT_D2_UB(tp2, tp3, dst1);
+            out0 = PCKEV_XORI128_UB(res0, res1);
+            out1 = PCKEV_XORI128_UB(res2, res3);
+            AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+            ST8x4_UB(out0, out1, dst, stride);
+            dst += (4 * stride);
+
+            hz_out0 = hz_out4;
+            hz_out1 = hz_out5;
+            hz_out2 = hz_out6;
+            hz_out3 = hz_out7;
+            hz_out4 = hz_out8;
+        }
+
+        src_tmp += 8;
+        dst_tmp += 8;
+    }
+}
+
+void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
+    v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
+    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
+    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
+    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
+    v4i32 tmp0, tmp1;
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    src -= ((2 * stride) + 2);
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
+
+    LD_SB4(src, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * stride);
+    hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
+                          filt2);
+    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
+                          filt2);
+    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
+                          filt2);
+    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
+                          filt2);
+    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    out0 = PCKEV_XORI128_UB(res0, res1);
+    out1 = PCKEV_XORI128_UB(res2, res3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+    dst += (4 * stride);
+
+    LD_SB4(src, stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
+    hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
+    hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
+    hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
+    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
+               hz_out1211_r);
+    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
+               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
+               hz_out1211_l);
+    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
+                          filt2);
+    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
+                          filt2);
+    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
+                          filt2);
+    res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
+                          filt2);
+    res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    out0 = PCKEV_XORI128_UB(res0, res1);
+    out1 = PCKEV_XORI128_UB(res2, res3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+    ST8x4_UB(dst0, dst1, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t filt_const0 = 0xfffb0001;
+    const int32_t filt_const1 = 0x140014;
+    const int32_t filt_const2 = 0x1fffb;
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 res, dst0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
+    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
+    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
+    v4i32 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    filt0 = (v8i16) __msa_fill_w(filt_const0);
+    filt1 = (v8i16) __msa_fill_w(filt_const1);
+    filt2 = (v8i16) __msa_fill_w(filt_const2);
+
+    src -= ((2 * stride) + 2);
+
+    LD_SB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_SB4(src, stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
+    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
+    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
+    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
+               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
+    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
+               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
+
+    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
+                          filt2);
+    res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
+                          filt2);
+    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
+                          filt2);
+    res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    res = PCKEV_XORI128_UB(res0, res1);
+    res = __msa_aver_u_b(res, dst0);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+}
diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c
new file mode 100644
index 0000000..0943119
--- /dev/null
+++ b/libavcodec/mips/hevc_idct_msa.c
@@ -0,0 +1,1026 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,          \
+                         sum0, sum1, sum2, sum3, shift)       \
+{                                                             \
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5;                 \
+    v4i32 cnst64 = __msa_ldi_w(64);                           \
+    v4i32 cnst83 = __msa_ldi_w(83);                           \
+    v4i32 cnst36 = __msa_ldi_w(36);                           \
+                                                              \
+    DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64,   \
+                cnst83, cnst36, vec0, vec2, vec1, vec3);      \
+    DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5);    \
+                                                              \
+    sum0 = vec0 + vec2;                                       \
+    sum1 = vec0 - vec2;                                       \
+    sum3 = sum0;                                              \
+    sum2 = sum1;                                              \
+                                                              \
+    vec1 += vec3;                                             \
+    vec4 -= vec5;                                             \
+                                                              \
+    sum0 += vec1;                                             \
+    sum1 += vec4;                                             \
+    sum2 -= vec4;                                             \
+    sum3 -= vec1;                                             \
+                                                              \
+    SRARI_W4_SW(sum0, sum1, sum2, sum3, shift);               \
+    SAT_SW4_SW(sum0, sum1, sum2, sum3, 15);                   \
+}
+
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
+{                                                                        \
+    v8i16 src0_r, src1_r, src2_r, src3_r;                                \
+    v8i16 src0_l, src1_l, src2_l, src3_l;                                \
+    v8i16 filt0, filter0, filter1, filter2, filter3;                     \
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;          \
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;          \
+    v4i32 sum0_r, sum1_r, sum2_r, sum3_r;                                \
+    v4i32 sum0_l, sum1_l, sum2_l, sum3_l;                                \
+                                                                         \
+    ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
+               src0_r, src1_r, src2_r, src3_r);                          \
+    ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
+               src0_l, src1_l, src2_l, src3_l);                          \
+                                                                         \
+    filt0 = LD_SH(filter);                                               \
+    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
+    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0,        \
+                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
+                                                                         \
+    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
+                sum1_l, sum1_r);                                         \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l,  filter2, filter2,       \
+                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
+                                                                         \
+    temp2_r += temp3_r;                                                  \
+    temp2_l += temp3_l;                                                  \
+    sum0_r += temp2_r;                                                   \
+    sum0_l += temp2_l;                                                   \
+    sum3_r -= temp2_r;                                                   \
+    sum3_l -= temp2_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
+    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
+    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7);               \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l,  filter3, filter3,       \
+                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
+                                                                         \
+    temp4_r -= temp5_r;                                                  \
+    temp4_l -= temp5_l;                                                  \
+    sum1_r += temp4_r;                                                   \
+    sum1_l += temp4_l;                                                   \
+    sum2_r -= temp4_r;                                                   \
+    sum2_l -= temp4_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
+    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
+    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4);               \
+                                                                         \
+    filt0 = LD_SH(filter + 8);                                           \
+    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
+    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l,  filter0, filter0,       \
+                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
+                                                                         \
+    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
+                sum1_l, sum1_r);                                         \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2,        \
+                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
+                                                                         \
+    temp2_r += temp3_r;                                                  \
+    temp2_l += temp3_l;                                                  \
+    sum0_r += temp2_r;                                                   \
+    sum0_l += temp2_l;                                                   \
+    sum3_r -= temp2_r;                                                   \
+    sum3_l -= temp2_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
+    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
+    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6);               \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3,        \
+                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
+                                                                         \
+    temp4_r -= temp5_r;                                                  \
+    temp4_l -= temp5_l;                                                  \
+    sum1_r -= temp4_r;                                                   \
+    sum1_l -= temp4_l;                                                   \
+    sum2_r += temp4_r;                                                   \
+    sum2_l += temp4_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
+    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
+    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5);               \
+}
+
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                \
+                           src4_r, src5_r, src6_r, src7_r,                \
+                           src0_l, src1_l, src2_l, src3_l,                \
+                           src4_l, src5_l, src6_l, src7_l, shift)         \
+{                                                                         \
+    int16_t *ptr0, *ptr1;                                                 \
+    v8i16 filt0, filt1, dst0, dst1;                                       \
+    v8i16 filter0, filter1, filter2, filter3;                             \
+    v4i32 temp0_r, temp1_r, temp0_l, temp1_l;                             \
+    v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;         \
+    v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l;                         \
+                                                                          \
+    ptr0 = (buf_ptr + 112);                                               \
+    ptr1 = (buf_ptr + 128);                                               \
+    k = -1;                                                               \
+                                                                          \
+    for (j = 0; j < 4; j++)                                               \
+    {                                                                     \
+        LD_SH2(filter, 8, filt0, filt1)                                   \
+        filter += 16;                                                     \
+        SPLATI_W2_SH(filt0, 0, filter0, filter1);                         \
+        SPLATI_W2_SH(filt1, 0, filter2, filter3);                         \
+        DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l,  filter0, filter0,    \
+                    filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l);    \
+        DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l);    \
+        DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l,  filter1, filter1,   \
+                     filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l);   \
+        DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l);   \
+                                                                          \
+        sum1_r = sum0_r;                                                  \
+        sum1_l = sum0_l;                                                  \
+                                                                          \
+        SPLATI_W2_SH(filt0, 2, filter0, filter1);                         \
+        SPLATI_W2_SH(filt1, 2, filter2, filter3);                         \
+        DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l);  \
+        DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l);   \
+        DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l);  \
+                                                                          \
+        sum0_r += temp0_r;                                                \
+        sum0_l += temp0_l;                                                \
+        sum1_r -= temp0_r;                                                \
+        sum1_l -= temp0_l;                                                \
+                                                                          \
+        sum3_r = temp1_r - sum3_r;                                        \
+        sum3_l = temp1_l - sum3_l;                                        \
+                                                                          \
+        DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l);  \
+        DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3,    \
+                     filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l);   \
+                                                                          \
+        sum0_r += temp0_r;                                                \
+        sum0_l += temp0_l;                                                \
+        sum1_r -= temp0_r;                                                \
+        sum1_l -= temp0_l;                                                \
+                                                                          \
+        BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,       \
+                    res1_l, res1_r);                                      \
+        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
+        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
+        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
+        ST_SH(dst0, buf_ptr);                                             \
+        ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16)));                   \
+                                                                          \
+        BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,       \
+                    res1_l, res1_r);                                      \
+        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
+        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
+        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
+        ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16)));           \
+        ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16)));           \
+                                                                          \
+        k *= -1;                                                          \
+        buf_ptr += 16;                                                    \
+    }                                                                     \
+}
+
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
+{                                                                     \
+    LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l);                  \
+    tmp1_r = sum0_r;                                                  \
+    tmp1_l = sum0_l;                                                  \
+    sum0_r += tmp0_r;                                                 \
+    sum0_l += tmp0_l;                                                 \
+    ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4);                \
+    tmp1_r -= tmp0_r;                                                 \
+    tmp1_l -= tmp0_l;                                                 \
+    ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4);               \
+}
+
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,     \
+                              res0, res1, res2, res3, shift)  \
+{                                                             \
+    v4i32 vec0, vec1, vec2, vec3;                             \
+    v4i32 cnst74 = __msa_ldi_w(74);                           \
+    v4i32 cnst55 = __msa_ldi_w(55);                           \
+    v4i32 cnst29 = __msa_ldi_w(29);                           \
+                                                              \
+    vec0 = in_r0 + in_r1;                                     \
+    vec2 = in_r0 - in_l1;                                     \
+    res0 = vec0 * cnst29;                                     \
+    res1 = vec2 * cnst55;                                     \
+    res2 = in_r0 - in_r1;                                     \
+    vec1 = in_r1 + in_l1;                                     \
+    res2 += in_l1;                                            \
+    vec3 = in_l0 * cnst74;                                    \
+    res3 = vec0 * cnst55;                                     \
+                                                              \
+    res0 += vec1 * cnst55;                                    \
+    res1 -= vec1 * cnst29;                                    \
+    res2 *= cnst74;                                           \
+    res3 += vec2 * cnst29;                                    \
+                                                              \
+    res0 += vec3;                                             \
+    res1 += vec3;                                             \
+    res3 -= vec3;                                             \
+                                                              \
+    SRARI_W4_SW(res0, res1, res2, res3, shift);               \
+    SAT_SW4_SW(res0, res1, res2, res3, 15);                   \
+}
+
+static void hevc_idct_4x4_msa(int16_t *coeffs)
+{
+    v8i16 in0, in1;
+    v4i32 in_r0, in_l0, in_r1, in_l1;
+    v4i32 sum0, sum1, sum2, sum3;
+    v8i16 zeros = { 0 };
+
+    LD_SH2(coeffs, 8, in0, in1);
+    ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
+    ILVRL_H2_SW(zeros, in1, in_r1, in_l1);
+
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+
+    /* Pack and transpose */
+    PCKEV_H2_SH(sum2, sum0, sum3, sum1, in0, in1);
+    ILVRL_H2_SW(in1, in0, sum0, sum1);
+    ILVRL_W2_SH(sum1, sum0, in0, in1);
+
+    ST_SH2(in0, in1, coeffs, 8);
+}
+
+static void hevc_idct_8x8_msa(int16_t *coeffs)
+{
+    const int16_t *filter = &gt8x8_cnst[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
+}
+
+static void hevc_idct_16x16_msa(int16_t *coeffs)
+{
+    int16_t i, j, k;
+    int16_t buf[256];
+    int16_t *buf_ptr = &buf[0];
+    int16_t *src = coeffs;
+    const int16_t *filter = &gt16x16_cnst[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+    for (i = 2; i--;) {
+        LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
+                in8, in9, in10, in11, in12, in13, in14, in15);
+
+        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_r, src5_r, src6_r, src7_r);
+        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_l, src1_l, src2_l, src3_l);
+        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 7);
+
+        src += 8;
+        buf_ptr = (&buf[0] + 8);
+        filter = &gt16x16_cnst[0];
+    }
+
+    src = &buf[0];
+    buf_ptr = coeffs;
+    filter = &gt16x16_cnst[0];
+
+    for (i = 2; i--;) {
+        LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
+                in4, in12, in5, in13, in6, in14, in7, in15);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                           in8, in9, in10, in11, in12, in13, in14, in15);
+        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_r, src5_r, src6_r, src7_r);
+        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_l, src1_l, src2_l, src3_l);
+        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 12);
+
+        src += 128;
+        buf_ptr = coeffs + 8;
+        filter = &gt16x16_cnst[0];
+    }
+
+    LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);
+
+    LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
+    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);
+
+    LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
+}
+
+static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
+                                      uint8_t round)
+{
+    uint8_t i;
+    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
+    int16_t *src0 = (coeffs + buf_pitch);
+    int16_t *src1 = (coeffs + 2 * buf_pitch);
+    int16_t *src2 = (coeffs + 4 * buf_pitch);
+    int16_t *src3 = (coeffs);
+    int32_t cnst0, cnst1;
+    int32_t tmp_buf[8 * 32 + 15];
+    int32_t *tmp_buf_ptr = tmp_buf + 15;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+    v8i16 filt0, filter0, filter1, filter2, filter3;
+    v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+    /* Align pointer to 64 byte boundary */
+    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+    /* process coeff 4, 12, 20, 28 */
+    LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
+    ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
+    ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);
+
+    LD_SH2(src3, 16 * buf_pitch, in4, in6);
+    LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7);
+    ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r);
+    ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 2; i++) {
+        /* processing single column of constants */
+        cnst0 = LW(filter_ptr2);
+        cnst1 = LW(filter_ptr2 + 2);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
+        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4);
+
+        /* processing single column of constants */
+        cnst0 = LW(filter_ptr2 + 4);
+        cnst1 = LW(filter_ptr2 + 6);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
+        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4);
+
+        filter_ptr2 += 8;
+    }
+
+    /* process coeff 0, 8, 16, 24 */
+    /* loop for all columns of constants */
+    for (i = 0; i < 2; i++) {
+        /* processing first column of filter constants */
+        cnst0 = LW(filter_ptr3);
+        cnst1 = LW(filter_ptr3 + 2);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1,
+                    filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+
+        sum1_r = sum0_r - tmp1_r;
+        sum1_l = sum0_l - tmp1_l;
+        sum0_r = sum0_r + tmp1_r;
+        sum0_l = sum0_l + tmp1_l;
+
+        HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
+        HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));
+
+        filter_ptr3 += 8;
+    }
+
+    /* process coeff 2 6 10 14 18 22 26 30 */
+    LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_l, src1_l, src2_l, src3_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 8; i++) {
+        /* processing single column of constants */
+        filt0 = LD_SH(filter_ptr1);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
+
+        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r += sum0_r;
+        tmp0_l += sum0_l;
+        ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
+        tmp1_r -= sum0_r;
+        tmp1_l -= sum0_l;
+        ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);
+
+        filter_ptr1 += 8;
+    }
+
+    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    src0 += 16 * buf_pitch;
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_l, src1_l, src2_l, src3_l);
+
+    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src4_r, src5_r, src6_r, src7_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src4_l, src5_l, src6_l, src7_l);
+
+    /* loop for all columns of filter constants */
+    for (i = 0; i < 16; i++) {
+        /* processing single column of constants */
+        filt0 = LD_SH(filter_ptr0);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
+
+        tmp1_r = sum0_r;
+        tmp1_l = sum0_l;
+
+        filt0 = LD_SH(filter_ptr0 + 8);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);
+
+        sum0_r += tmp1_r;
+        sum0_l += tmp1_l;
+
+        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r += sum0_r;
+        tmp0_l += sum0_l;
+        sum1_r = __msa_fill_w(round);
+        SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
+        SAT_SW2_SW(tmp0_r, tmp0_l, 15);
+        in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
+        ST_SH(in0, (coeffs + i * buf_pitch));
+        tmp1_r -= sum0_r;
+        tmp1_l -= sum0_l;
+        SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
+        SAT_SW2_SW(tmp1_r, tmp1_l, 15);
+        in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
+        ST_SH(in0, (coeffs + (31 - i) * buf_pitch));
+
+        filter_ptr0 += 16;
+    }
+}
+
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+    uint8_t i;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8);
+    }
+}
+
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+    uint8_t i;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
+    }
+}
+
+static void hevc_idct_32x32_msa(int16_t *coeffs)
+{
+    uint8_t row_cnt, col_cnt;
+    int16_t *src = coeffs;
+    int16_t tmp_buf[8 * 32 + 31];
+    int16_t *tmp_buf_ptr = tmp_buf + 31;
+    uint8_t round;
+    uint8_t buf_pitch;
+
+    /* Align pointer to 64 byte boundary */
+    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+    /* column transform */
+    round = 7;
+    buf_pitch = 32;
+    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+        /* process 8x32 blocks */
+        hevc_idct_8x32_column_msa((coeffs + col_cnt * 8), buf_pitch, round);
+    }
+
+    /* row transform */
+    round = 12;
+    buf_pitch = 8;
+    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+        /* process 32x8 blocks */
+        src = (coeffs + 32 * 8 * row_cnt);
+
+        hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+        hevc_idct_8x32_column_msa(tmp_buf_ptr, buf_pitch, round);
+        hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+    }
+}
+
+static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
+{
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    ST_SH2(dst, dst, coeffs, 8);
+}
+
+static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
+{
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
+}
+
+static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
+{
+    uint8_t loop;
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    for (loop = 4; loop--;) {
+        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
+        coeffs += 8 * 8;
+    }
+}
+
+static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
+{
+    uint8_t loop;
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    for (loop = 16; loop--;) {
+        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
+        coeffs += 8 * 8;
+    }
+}
+
+static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint32_t dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, in0, in1;
+    v4i32 dst_vec = { 0 };
+    v16u8 zeros = { 0 };
+
+    LD_SH2(coeffs, 8, in0, in1);
+    LW4(dst, stride, dst0, dst1, dst2, dst3);
+    INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
+    ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
+    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
+    CLIP_SH2_0_255(dst_r0, dst_l0);
+    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
+    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t *temp_dst = dst;
+    uint64_t dst0, dst1, dst2, dst3;
+    v2i64 dst_vec0 = { 0 };
+    v2i64 dst_vec1 = { 0 };
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 zeros = { 0 };
+
+    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
+    temp_dst += (4 * stride);
+
+    INSERT_D2_SD(dst0, dst1, dst_vec0);
+    INSERT_D2_SD(dst2, dst3, dst_vec1);
+    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
+    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
+    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
+         dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
+    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+    dst += (4 * stride);
+
+    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
+    INSERT_D2_SD(dst0, dst1, dst_vec0);
+    INSERT_D2_SD(dst2, dst3, dst_vec1);
+    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
+    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
+    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
+         dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
+    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+}
+
+static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t loop_cnt;
+    uint8_t *temp_dst = dst;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    /* Pre-load for next iteration */
+    LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
+    temp_dst += (4 * stride);
+    LD_SH4(coeffs, 16, in0, in2, in4, in6);
+    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+    coeffs += 64;
+
+    for (loop_cnt = 3; loop_cnt--;) {
+        UNPCK_UB_SH(dst4, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst5, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst6, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst7, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        /* Pre-load for next iteration */
+        LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
+        temp_dst += (4 * stride);
+        LD_SH4(coeffs, 16, in0, in2, in4, in6);
+        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+        coeffs += 64;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+
+    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
+    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
+    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
+    UNPCK_UB_SH(dst7, dst_r3, dst_l3);
+
+    dst_r0 += in0;
+    dst_l0 += in1;
+    dst_r1 += in2;
+    dst_l1 += in3;
+    dst_r2 += in4;
+    dst_l2 += in5;
+    dst_r3 += in6;
+    dst_l3 += in7;
+
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                dst_r3, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+}
+
+static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t loop_cnt;
+    uint8_t *temp_dst = dst;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    /* Pre-load for next iteration */
+    LD_UB2(temp_dst, 16, dst4, dst5);
+    temp_dst += stride;
+    LD_UB2(temp_dst, 16, dst6, dst7);
+    temp_dst += stride;
+    LD_SH4(coeffs, 16, in0, in2, in4, in6);
+    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+    coeffs += 64;
+
+    for (loop_cnt = 14; loop_cnt--;) {
+        UNPCK_UB_SH(dst4, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst5, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst6, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst7, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        /* Pre-load for next iteration */
+        LD_UB2(temp_dst, 16, dst4, dst5);
+        temp_dst += stride;
+        LD_UB2(temp_dst, 16, dst6, dst7);
+        temp_dst += stride;
+        LD_SH4(coeffs, 16, in0, in2, in4, in6);
+        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+        coeffs += 64;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+        ST_UB2(dst0, dst1, dst, 16);
+        dst += stride;
+        ST_UB2(dst2, dst3, dst, 16);
+        dst += stride;
+    }
+
+    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
+    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
+    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
+    UNPCK_UB_SH(dst7, dst_r3, dst_l3);
+
+    dst_r0 += in0;
+    dst_l0 += in1;
+    dst_r1 += in2;
+    dst_l1 += in3;
+    dst_r2 += in4;
+    dst_l2 += in5;
+    dst_r3 += in6;
+    dst_l3 += in7;
+
+    /* Pre-load for next iteration */
+    LD_UB2(temp_dst, 16, dst4, dst5);
+    temp_dst += stride;
+    LD_UB2(temp_dst, 16, dst6, dst7);
+    temp_dst += stride;
+    LD_SH4(coeffs, 16, in0, in2, in4, in6);
+    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                dst_r3, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst1, dst, 16);
+    dst += stride;
+    ST_UB2(dst2, dst3, dst, 16);
+    dst += stride;
+
+    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
+    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
+    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
+    UNPCK_UB_SH(dst7, dst_r3, dst_l3);
+
+    dst_r0 += in0;
+    dst_l0 += in1;
+    dst_r1 += in2;
+    dst_l1 += in3;
+    dst_r2 += in4;
+    dst_l2 += in5;
+    dst_r3 += in6;
+    dst_l3 += in7;
+
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                dst_r3, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst1, dst, 16);
+    dst += stride;
+    ST_UB2(dst2, dst3, dst, 16);
+}
+
+static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
+{
+    v8i16 in0, in1, dst0, dst1;
+    v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;
+
+    LD_SH2(coeffs, 8, in0, in1);
+    UNPCK_SH_SW(in0, in_r0, in_l0);
+    UNPCK_SH_SW(in1, in_r1, in_l1);
+    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
+                          7);
+    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
+                          12);
+
+    /* Pack and transpose */
+    PCKEV_H2_SH(res2, res0, res3, res1, dst0, dst1);
+    ILVRL_H2_SW(dst1, dst0, res0, res1);
+    ILVRL_W2_SH(res1, res0, dst0, dst1);
+
+    ST_SH2(dst0, dst1, coeffs, 8);
+}
+
+void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_4x4_msa(coeffs);
+}
+
+void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_8x8_msa(coeffs);
+}
+
+void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_16x16_msa(coeffs);
+}
+
+void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_32x32_msa(coeffs);
+}
+
+void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_4x4_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_8x8_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_16x16_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_32x32_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_4x4_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_8x8_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_16x16_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_32x32_msa(coeffs);
+}
+
+void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
+{
+    hevc_idct_luma_4x4_msa(coeffs);
+}
diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
new file mode 100644
index 0000000..adcafde
--- /dev/null
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -0,0 +1,2748 @@
+/*
+ * Copyright (c) 2015 -2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
+                                         int32_t beta, int32_t *tc,
+                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+    uint8_t *p3 = src - (stride << 2);
+    uint8_t *p2 = src - ((stride << 1) + stride);
+    uint8_t *p1 = src - (stride << 1);
+    uint8_t *p0 = src - stride;
+    uint8_t *q0 = src;
+    uint8_t *q1 = src + stride;
+    uint8_t *q2 = src + (stride << 1);
+    uint8_t *q3 = src + (stride << 1) + stride;
+    uint8_t flag0, flag1;
+    int32_t dp00, dq00, dp30, dq30, d00, d30;
+    int32_t d0030, d0434;
+    int32_t dp04, dq04, dp34, dq34, d04, d34;
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+    uint64_t dst_val0, dst_val1;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
+    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    v2i64 cmp3;
+    v8u16 temp0, temp1;
+    v8i16 temp2;
+    v8i16 tc_pos, tc_neg;
+    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    v16i8 zero = { 0 };
+    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+
+    p_is_pcm0 = p_is_pcm[0];
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm0 = q_is_pcm[0];
+    q_is_pcm4 = q_is_pcm[1];
+
+    cmp0 = __msa_fill_d(p_is_pcm0);
+    cmp1 = __msa_fill_d(p_is_pcm4);
+    p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+    p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+    d0030 = (d00 + d30) >= beta;
+    d0434 = (d04 + d34) >= beta;
+
+    cmp0 = (v2i64) __msa_fill_w(d0030);
+    cmp1 = (v2i64) __msa_fill_w(d0434);
+    cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+    cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
+
+    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+        (!d0030 || !d0434)) {
+        p3_src = LD_UH(p3);
+        p2_src = LD_UH(p2);
+        p1_src = LD_UH(p1);
+        p0_src = LD_UH(p0);
+
+        cmp0 = __msa_fill_d(q_is_pcm0);
+        cmp1 = __msa_fill_d(q_is_pcm4);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+        tc0 = tc[0];
+        beta30 = beta >> 3;
+        beta20 = beta >> 2;
+        tc250 = ((tc0 * 5 + 1) >> 1);
+        tc4 = tc[1];
+        tc254 = ((tc4 * 5 + 1) >> 1);
+
+        cmp0 = (v2i64) __msa_fill_h(tc0);
+        cmp1 = (v2i64) __msa_fill_h(tc4);
+
+        ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+                   p3_src, p2_src, p1_src, p0_src);
+        q0_src = LD_UH(q0);
+        q1_src = LD_UH(q1);
+        q2_src = LD_UH(q2);
+        q3_src = LD_UH(q3);
+
+        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+                abs(p0[0] - q0[0]) < tc250;
+        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+                (d30 << 1) < beta20);
+
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+                   q0_src, q1_src, q2_src, q3_src);
+        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+                abs(p0[4] - q0[4]) < tc254;
+        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+                (d34 << 1) < beta20);
+
+        cmp0 = (v2i64) __msa_fill_w(flag0);
+        cmp1 = (v2i64) __msa_fill_w(flag1);
+        cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+        cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
+
+        if (flag0 && flag1) { /* strong only */
+            /* strong filter */
+            tc_pos <<= 1;
+            tc_neg = -tc_pos;
+
+            /* p part */
+            temp0 = (p1_src + p0_src + q0_src);
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+            /* q part */
+            temp0 = (q1_src + p0_src + q0_src);
+
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+
+            /* pack results to 8 bit */
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+
+            /* pack src to 8 bit */
+            PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
+            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
+
+            dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
+            dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
+            dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
+
+            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
+            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
+
+            ST8x4_UB(dst0, dst1, p2, stride);
+            p2 += (4 * stride);
+            SD(dst_val0, p2);
+            p2 += stride;
+            SD(dst_val1, p2);
+            /* strong filter ends */
+        } else if (flag0 == flag1) { /* weak only */
+            /* weak filter */
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (diff0 << 3) + diff0;
+            diff1 = (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+
+            temp0 = (v8u16) (delta0 + p0_src);
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) (q0_src - delta0);
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+
+            p_is_pcm_vec = ~p_is_pcm_vec;
+            q_is_pcm_vec = ~q_is_pcm_vec;
+            tmp = (beta + (beta >> 1)) >> 3;
+            cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
+            cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
+            cmp0 = __msa_ilvev_d(cmp1, cmp0);
+            cmp0 = __msa_ceqi_d(cmp0, 0);
+            p_is_pcm_vec = p_is_pcm_vec | cmp0;
+
+            cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
+            cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
+            cmp0 = __msa_ilvev_d(cmp1, cmp0);
+            cmp0 = __msa_ceqi_d(cmp0, 0);
+            q_is_pcm_vec = q_is_pcm_vec | cmp0;
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+
+            dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                       (v16u8) abs_delta0);
+            dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                       (v16u8) abs_delta0);
+            dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                       (v16u8) abs_delta0);
+            dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                       (v16u8) abs_delta0);
+            /* pack results to 8 bit */
+            PCKEV_B2_UB(dst2, dst1, dst4, dst3, dst0, dst1);
+
+            /* pack src to 8 bit */
+            PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);
+
+            dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
+            dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
+
+            p2 += stride;
+            ST8x4_UB(dst0, dst1, p2, stride);
+            /* weak filter ends */
+        } else { /* strong + weak */
+            /* strong filter */
+            tc_pos <<= 1;
+            tc_neg = -tc_pos;
+
+            /* p part */
+            temp0 = (p1_src + p0_src + q0_src);
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+            /* q part */
+            temp0 = (q1_src + p0_src + q0_src);
+
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+
+            /* pack strong results to 8 bit */
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+            /* strong filter ends */
+
+            /* weak filter */
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (diff0 << 3) + diff0;
+            diff1 = (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+
+            temp0 = (v8u16) (delta0 + p0_src);
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) (q0_src - delta0);
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+
+            p_is_pcm_vec = ~p_is_pcm_vec;
+            q_is_pcm_vec = ~q_is_pcm_vec;
+            tmp = (beta + (beta >> 1)) >> 3;
+            cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
+            cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
+            cmp0 = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
+
+            cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
+            cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
+            cmp0 = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+
+            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                         (v16u8) abs_delta0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) abs_delta0);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) abs_delta0);
+            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                         (v16u8) abs_delta0);
+            /* weak filter ends */
+
+            /* pack weak results to 8 bit */
+            PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
+            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);
+
+            /* select between weak or strong */
+            dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
+            dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
+            dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);
+
+            /* pack src to 8 bit */
+            PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
+            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
+
+            dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
+            dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
+            dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
+
+            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
+            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
+
+            ST8x4_UB(dst0, dst1, p2, stride);
+            p2 += (4 * stride);
+            SD(dst_val0, p2);
+            p2 += stride;
+            SD(dst_val1, p2);
+        }
+    }
+}
+
+static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
+                                         int32_t beta, int32_t *tc,
+                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+    uint8_t *p3 = src;
+    uint8_t *p2 = src + 3 * stride;
+    uint8_t *p1 = src + (stride << 2);
+    uint8_t *p0 = src + 7 * stride;
+    uint8_t flag0, flag1;
+    uint16_t tmp0, tmp1;
+    uint32_t tmp2, tmp3;
+    int32_t dp00, dq00, dp30, dq30, d00, d30;
+    int32_t d0030, d0434;
+    int32_t dp04, dq04, dp34, dq34, d04, d34;
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    v2i64 cmp3;
+    v8u16 temp0, temp1;
+    v8i16 temp2;
+    v8i16 tc_pos, tc_neg;
+    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    v16i8 zero = { 0 };
+    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    p_is_pcm0 = p_is_pcm[0];
+    q_is_pcm0 = q_is_pcm[0];
+
+    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm4 = q_is_pcm[1];
+
+    cmp0 = __msa_fill_d(p_is_pcm0);
+    cmp1 = __msa_fill_d(p_is_pcm4);
+    p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+    p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+    d0030 = (d00 + d30) >= beta;
+    d0434 = (d04 + d34) >= beta;
+
+    cmp0 = __msa_fill_d(d0030);
+    cmp1 = __msa_fill_d(d0434);
+    cmp3 = __msa_ilvev_d(cmp1, cmp0);
+    cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);
+
+    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+        (!d0030 || !d0434)) {
+        src -= 4;
+        LD_UH8(src, stride, p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+               q2_src, q3_src);
+
+        cmp0 = __msa_fill_d(q_is_pcm0);
+        cmp1 = __msa_fill_d(q_is_pcm4);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+        tc0 = tc[0];
+        beta30 = beta >> 3;
+        beta20 = beta >> 2;
+        tc250 = ((tc0 * 5 + 1) >> 1);
+
+        tc4 = tc[1];
+        tc254 = ((tc4 * 5 + 1) >> 1);
+        cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
+        cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+
+        TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+                           q0_src, q1_src, q2_src, q3_src);
+
+        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+                abs(p3[-1] - p3[0]) < tc250;
+        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
+                (d30 << 1) < beta20);
+        cmp0 = __msa_fill_d(flag0);
+        ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+                   p3_src, p2_src, p1_src, p0_src);
+
+        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+                abs(p1[-1] - p1[0]) < tc254;
+        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
+                (d34 << 1) < beta20);
+        ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+                   q0_src, q1_src, q2_src, q3_src);
+
+        cmp1 = __msa_fill_d(flag1);
+        cmp2 = __msa_ilvev_d(cmp1, cmp0);
+        cmp2 = __msa_ceqi_d(cmp2, 0);
+
+        if (flag0 && flag1) { /* strong only */
+            /* strong filter */
+            tc_neg = -tc_pos;
+
+            /* p part */
+            temp0 = (p1_src + p0_src + q0_src);
+
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+            /* q part */
+            temp0 = (q1_src + p0_src + q0_src);
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+            /* strong filter ends */
+        } else if (flag0 == flag1) { /* weak only */
+            /* weak filter */
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (diff0 << 3) + diff0;
+            diff1 = (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            temp0 = (v8u16) (delta0 + p0_src);
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) (q0_src - delta0);
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+
+            tmp = ((beta + (beta >> 1)) >> 3);
+            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
+            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+
+            dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                               (v16u8) abs_delta0);
+            dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                               (v16u8) abs_delta0);
+            dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                               (v16u8) abs_delta0);
+            dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                               (v16u8) abs_delta0);
+            /* weak filter ends */
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
+            dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
+            dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);
+
+            PCKEV_B2_UB(dst2, dst0, dst3, dst1, dst0, dst1);
+
+            /* transpose */
+            ILVRL_B2_UB(dst1, dst0, dst4, dst5);
+            ILVRL_H2_UB(dst5, dst4, dst0, dst1);
+
+            src += 2;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
+            tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
+            SW(tmp2, src);
+            src += stride;
+            SW(tmp3, src);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
+            SW(tmp2, src);
+            src += stride;
+            SW(tmp3, src);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
+            tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
+            SW(tmp2, src);
+            src += stride;
+            SW(tmp3, src);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
+            SW(tmp2, src);
+            src += stride;
+            SW(tmp3, src);
+
+            return;
+        } else { /* strong + weak */
+            /* strong filter */
+            tc_neg = -tc_pos;
+
+            /* p part */
+            temp0 = (p1_src + p0_src + q0_src);
+
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+            /* q part */
+            temp0 = (q1_src + p0_src + q0_src);
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+            /* strong filter ends */
+
+            /* weak filter */
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (diff0 << 3) + diff0;
+            diff1 = (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+
+            temp0 = (v8u16) (delta0 + p0_src);
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) (q0_src - delta0);
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+
+            tmp = (beta + (beta >> 1)) >> 3;
+            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
+            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                         (v16u8) abs_delta0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) abs_delta0);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) abs_delta0);
+            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                         (v16u8) abs_delta0);
+            /* weak filter ends*/
+
+            /* select between weak or strong */
+            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
+            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
+            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
+            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
+            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
+            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+        }
+
+        dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
+        dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
+        dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
+        dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
+        dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
+        dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);
+
+        /* pack results to 8 bit */
+        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
+                    dst2, dst3);
+
+        /* transpose */
+        ILVRL_B2_UB(dst1, dst0, dst4, dst5);
+        ILVRL_B2_UB(dst3, dst2, dst6, dst7);
+        ILVRL_H2_UB(dst5, dst4, dst0, dst1);
+        ILVRL_H2_UB(dst7, dst6, dst2, dst3);
+
+        src += 1;
+
+        tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
+        tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
+        tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
+        tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
+        SW(tmp2, src);
+        SH(tmp0, src + 4);
+        src += stride;
+        SW(tmp3, src);
+        SH(tmp1, src + 4);
+        src += stride;
+
+        tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
+        tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
+        tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
+        tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
+        SW(tmp2, src);
+        SH(tmp0, src + 4);
+        src += stride;
+        SW(tmp3, src);
+        SH(tmp1, src + 4);
+        src += stride;
+
+        tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
+        tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
+        tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
+        tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
+        SW(tmp2, src);
+        SH(tmp0, src + 4);
+        src += stride;
+        SW(tmp3, src);
+        SH(tmp1, src + 4);
+        src += stride;
+
+        tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
+        tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
+        tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
+        tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
+        SW(tmp2, src);
+        SH(tmp0, src + 4);
+        src += stride;
+        SW(tmp3, src);
+        SH(tmp1, src + 4);
+    }
+}
+
+static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
+                                           int32_t *tc, uint8_t *p_is_pcm,
+                                           uint8_t *q_is_pcm)
+{
+    uint8_t *p1_ptr = src - (stride << 1);
+    uint8_t *p0_ptr = src - stride;
+    uint8_t *q0_ptr = src;
+    uint8_t *q1_ptr = src + stride;
+    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 p1, p0, q0, q1;
+    v8i16 tc_pos, tc_neg;
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, delta;
+
+    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+        cmp0 = (v2i64) __msa_fill_h(tc[0]);
+        cmp1 = (v2i64) __msa_fill_h(tc[1]);
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        tc_neg = -tc_pos;
+
+        cmp0 = __msa_fill_d(p_is_pcm[0]);
+        cmp1 = __msa_fill_d(p_is_pcm[1]);
+        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+        cmp0 = __msa_fill_d(q_is_pcm[0]);
+        cmp1 = __msa_fill_d(q_is_pcm[1]);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+        p1 = LD_UH(p1_ptr);
+        p0 = LD_UH(p0_ptr);
+        q0 = LD_UH(q0_ptr);
+        q1 = LD_UH(q1_ptr);
+
+        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
+
+        temp0 = (v8i16) (q0 - p0);
+        temp1 = (v8i16) (p1 - q1);
+        temp0 <<= 2;
+        temp0 += temp1;
+        delta = __msa_srari_h((v8i16) temp0, 3);
+        delta = CLIP_SH(delta, tc_neg, tc_pos);
+
+        temp0 = (v8i16) ((v8i16) p0 + delta);
+        temp0 = CLIP_SH_0_255(temp0);
+        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
+                                    (v16u8) p_is_pcm_vec);
+
+        temp1 = (v8i16) ((v8i16) q0 - delta);
+        temp1 = CLIP_SH_0_255(temp1);
+        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
+                                    (v16u8) q_is_pcm_vec);
+
+        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
+        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
+        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
+
+        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
+        ST8x2_UB(temp0, p0_ptr, stride);
+    }
+}
+
+static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
+                                           int32_t *tc, uint8_t *p_is_pcm,
+                                           uint8_t *q_is_pcm)
+{
+    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 p1, p0, q0, q1;
+    v8i16 tc_pos, tc_neg;
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, delta;
+
+    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+        cmp0 = (v2i64) __msa_fill_h(tc[0]);
+        cmp1 = (v2i64) __msa_fill_h(tc[1]);
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        tc_neg = -tc_pos;
+
+        cmp0 = __msa_fill_d(p_is_pcm[0]);
+        cmp1 = __msa_fill_d(p_is_pcm[1]);
+        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+        cmp0 = __msa_fill_d(q_is_pcm[0]);
+        cmp1 = __msa_fill_d(q_is_pcm[1]);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+        src -= 2;
+        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
+                           p1, p0, q0, q1);
+        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
+
+        temp0 = (v8i16) (q0 - p0);
+        temp1 = (v8i16) (p1 - q1);
+        temp0 <<= 2;
+        temp0 += temp1;
+        delta = __msa_srari_h((v8i16) temp0, 3);
+        delta = CLIP_SH(delta, tc_neg, tc_pos);
+
+        temp0 = (v8i16) ((v8i16) p0 + delta);
+        temp0 = CLIP_SH_0_255(temp0);
+        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
+                                    (v16u8) p_is_pcm_vec);
+
+        temp1 = (v8i16) ((v8i16) q0 - delta);
+        temp1 = CLIP_SH_0_255(temp1);
+        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
+                                    (v16u8) q_is_pcm_vec);
+
+        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
+        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
+        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
+
+        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
+
+        src += 1;
+        ST2x4_UB(temp0, 0, src, stride);
+        src += (4 * stride);
+        ST2x4_UB(temp0, 4, src, stride);
+    }
+}
+
+static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
+                                            uint8_t *src, int32_t src_stride,
+                                            int32_t sao_left_class,
+                                            int16_t *sao_offset_val,
+                                            int32_t height)
+{
+    v16u8 src0, src1, src2, src3;
+    v16i8 src0_r, src1_r;
+    v16i8 offset, offset_val, mask;
+    v16i8 dst0, offset0, offset1;
+    v16i8 zero = { 0 };
+
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+
+    /* load in advance. */
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+
+    for (height -= 4; height; height -= 4) {
+        src += (4 * src_stride);
+
+        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
+
+        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
+        mask = __msa_srli_b(src0_r, 3);
+        offset = __msa_vshf_b(mask, offset1, offset0);
+
+        src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
+        dst0 = __msa_adds_s_b(src0_r, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        /* load in advance. */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+        /* store results */
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+
+    ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
+
+    src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
+    mask = __msa_srli_b(src0_r, 3);
+    offset = __msa_vshf_b(mask, offset1, offset0);
+
+    src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
+    dst0 = __msa_adds_s_b(src0_r, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    /* store results */
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
+                                            uint8_t *src, int32_t src_stride,
+                                            int32_t sao_left_class,
+                                            int16_t *sao_offset_val,
+                                            int32_t height)
+{
+    v16u8 src0, src1, src2, src3;
+    v16i8 src0_r, src1_r, mask0, mask1;
+    v16i8 offset_mask0, offset_mask1, offset_val;
+    v16i8 offset0, offset1, dst0, dst1;
+    v16i8 zero = { 0 };
+
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+
+    /* load in advance. */
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+
+    for (height -= 4; height; height -= 4) {
+        src += src_stride << 2;
+
+        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);
+
+        mask0 = __msa_srli_b(src0_r, 3);
+        mask1 = __msa_srli_b(src1_r, 3);
+
+        offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
+        offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
+
+        /* load in advance. */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+        XORI_B2_128_SB(src0_r, src1_r);
+
+        dst0 = __msa_adds_s_b(src0_r, offset_mask0);
+        dst1 = __msa_adds_s_b(src1_r, offset_mask1);
+
+        XORI_B2_128_SB(dst0, dst1);
+
+        /* store results */
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += dst_stride << 2;
+    }
+
+    ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);
+
+    mask0 = __msa_srli_b(src0_r, 3);
+    mask1 = __msa_srli_b(src1_r, 3);
+
+    offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
+    offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
+
+    XORI_B2_128_SB(src0_r, src1_r);
+
+    dst0 = __msa_adds_s_b(src0_r, offset_mask0);
+    dst1 = __msa_adds_s_b(src1_r, offset_mask1);
+
+    XORI_B2_128_SB(dst0, dst1);
+
+    /* store results */
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+}
+
+static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
+                                                int32_t dst_stride,
+                                                uint8_t *src,
+                                                int32_t src_stride,
+                                                int32_t sao_left_class,
+                                                int16_t *sao_offset_val,
+                                                int32_t width, int32_t height)
+{
+    int32_t w_cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 out0, out1, out2, out3;
+    v16i8 mask0, mask1, mask2, mask3;
+    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
+    v16i8 offset0, offset1;
+    v16i8 zero = { 0 };
+
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+
+    while (height > 0) {
+        /* load in advance */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+        for (w_cnt = 16; w_cnt < width; w_cnt += 16) {
+            mask0 = __msa_srli_b((v16i8) src0, 3);
+            mask1 = __msa_srli_b((v16i8) src1, 3);
+            mask2 = __msa_srli_b((v16i8) src2, 3);
+            mask3 = __msa_srli_b((v16i8) src3, 3);
+
+            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
+                       tmp0, tmp1);
+            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
+                       tmp2, tmp3);
+            XORI_B4_128_UB(src0, src1, src2, src3);
+
+            out0 = __msa_adds_s_b((v16i8) src0, tmp0);
+            out1 = __msa_adds_s_b((v16i8) src1, tmp1);
+            out2 = __msa_adds_s_b((v16i8) src2, tmp2);
+            out3 = __msa_adds_s_b((v16i8) src3, tmp3);
+
+            /* load for next iteration */
+            LD_UB4(src + w_cnt, src_stride, src0, src1, src2, src3);
+
+            XORI_B4_128_SB(out0, out1, out2, out3);
+
+            ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
+        }
+
+        mask0 = __msa_srli_b((v16i8) src0, 3);
+        mask1 = __msa_srli_b((v16i8) src1, 3);
+        mask2 = __msa_srli_b((v16i8) src2, 3);
+        mask3 = __msa_srli_b((v16i8) src3, 3);
+
+        VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
+                   tmp1);
+        VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
+                   tmp3);
+        XORI_B4_128_UB(src0, src1, src2, src3);
+
+        out0 = __msa_adds_s_b((v16i8) src0, tmp0);
+        out1 = __msa_adds_s_b((v16i8) src1, tmp1);
+        out2 = __msa_adds_s_b((v16i8) src2, tmp2);
+        out3 = __msa_adds_s_b((v16i8) src3, tmp3);
+
+        XORI_B4_128_SB(out0, out1, out2, out3);
+
+        ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
+
+        src += src_stride << 2;
+        dst += dst_stride << 2;
+        height -= 4;
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{
+    uint32_t dst_val0, dst_val1;
+    v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16i8 sao_offset = LD_SB(sao_offset_val);
+    v16i8 src_plus10, offset, src0, dst0;
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    src -= 1;
+
+    /* load in advance */
+    LD_UB2(src, src_stride, src_minus10, src_minus11);
+
+    for (height -= 2; height; height -= 2) {
+        src += (2 * src_stride);
+
+        src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
+                                            (v2i64) src_minus10);
+
+        src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
+        src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
+
+        cmp_minus10 = ((v16u8) src0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
+        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
+
+        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
+
+        /* load in advance */
+        LD_UB2(src, src_stride, src_minus10, src_minus11);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+                   offset, offset);
+
+        src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
+        dst0 = __msa_adds_s_b(src0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+        dst += dst_stride;
+    }
+
+    src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
+                                        (v2i64) src_minus10);
+
+    src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
+    src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
+
+    cmp_minus10 = ((v16u8) src0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
+    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
+
+    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
+    dst0 = __msa_adds_s_b(src0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+
+    SW(dst_val0, dst);
+    dst += dst_stride;
+    SW(dst_val1, dst);
+}
+
+static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{
+    uint64_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, diff_minus10, diff_minus11;
+    v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
+    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    src -= 1;
+
+    /* load in advance */
+    LD_UB2(src, src_stride, src_minus10, src_minus11);
+
+    for (height -= 2; height; height -= 2) {
+        src += (src_stride << 1);
+
+        SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+
+        PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
+                    src_minus10, src_plus10);
+        src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);
+
+        cmp_minus10 = (src0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < src0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus10 = (src0 == src_plus10);
+        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_plus10 < src0);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
+
+        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
+
+        /* load in advance */
+        LD_UB2(src, src_stride, src_minus10, src_minus11);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+                   offset, offset);
+
+        src0 = __msa_xori_b(src0, 128);
+        dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
+        dst0 = __msa_xori_b(dst0, 128);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+    }
+
+    SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1);
+    SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+
+    PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
+                src_plus10);
+    src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);
+
+    cmp_minus10 = ((v16u8) src0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus10 = (src0 ==  src_plus10);
+    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_plus10 < src0);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
+
+    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    src0 = __msa_xori_b(src0, 128);
+    dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
+    dst0 = __msa_xori_b(dst0, 128);
+
+    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+    SD(dst_val0, dst);
+    dst += dst_stride;
+    SD(dst_val1, dst);
+}
+
+static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        uint8_t *src,
+                                                        int32_t src_stride,
+                                                        int16_t *sao_offset_val,
+                                                        int32_t width,
+                                                        int32_t height)
+{
+    uint8_t *dst_ptr, *src_minus1;
+    int32_t v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 sao_offset;
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13;
+    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
+    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
+    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (; height; height -= 4) {
+        src_minus1 = src - 1;
+        LD_UB4(src_minus1, src_stride,
+               src_minus10, src_minus11, src_minus12, src_minus13);
+
+        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+            src_minus1 += 16;
+            dst_ptr = dst + v_cnt;
+            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
+
+            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
+                       src_zero1, 1);
+            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
+                       src_zero3, 1);
+            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
+                       src_plus11, 2);
+            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
+                       src_plus13, 2);
+
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
+            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
+            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                       offset_mask0, offset_mask0, offset_mask0);
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                       offset_mask1, offset_mask1, offset_mask1);
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
+                       offset_mask2, offset_mask2, offset_mask2);
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
+                       offset_mask3, offset_mask3, offset_mask3);
+
+            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);
+
+            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
+            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
+            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
+            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
+
+            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
+
+            src_minus10 = src10;
+            ST_UB(dst0, dst_ptr);
+            src_minus11 = src11;
+            ST_UB(dst1, dst_ptr + dst_stride);
+            src_minus12 = src12;
+            ST_UB(dst2, dst_ptr + (dst_stride << 1));
+            src_minus13 = src13;
+            ST_UB(dst3, dst_ptr + (dst_stride * 3));
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint32_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 dst0;
+    v16i8 sao_offset = LD_SB(sao_offset_val);
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11, src10, src11;
+    v16i8 src_zero0, src_zero1;
+    v16i8 offset;
+    v8i16 offset_mask0, offset_mask1;
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    /* load in advance */
+    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+    LD_UB2(src + src_stride, src_stride, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src += (src_stride << 1);
+
+        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+                   offset, offset);
+
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+        dst0 = __msa_adds_s_b(dst0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        LD_UB2(src + src_stride, src_stride, src10, src11);
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+    }
+
+    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
+               offset, offset, offset);
+
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+    dst0 = __msa_adds_s_b(dst0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+    SW(dst_val0, dst);
+    dst += dst_stride;
+    SW(dst_val1, dst);
+}
+
+static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint64_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+    v16i8 src_zero0, src_zero1, dst0;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11, src10, src11;
+    v8i16 offset_mask0, offset_mask1;
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    /* load in advance */
+    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+    LD_UB2(src + src_stride, src_stride, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src += (src_stride << 1);
+
+        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
+                   offset, offset, offset);
+
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+        dst0 = __msa_adds_s_b(dst0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        LD_UB2(src + src_stride, src_stride, src10, src11);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+    }
+
+    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+    dst0 = __msa_adds_s_b(dst0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+    SD(dst_val0, dst);
+    dst += dst_stride;
+    SD(dst_val1, dst);
+}
+
+static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *
+                                                         sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13;
+    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
+    v16u8 src12, dst2, src13, dst3;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+        src = src_orig + v_cnt;
+        dst = dst_orig + v_cnt;
+
+        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+        for (h_cnt = (height >> 2); h_cnt--;) {
+            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);
+
+            cmp_minus10 = (src_minus11 == src_minus10);
+            cmp_plus10 = (src_minus11 == src10);
+            cmp_minus11 = (src10 == src_minus11);
+            cmp_plus11 = (src10 == src11);
+            cmp_minus12 = (src11 == src10);
+            cmp_plus12 = (src11 == src12);
+            cmp_minus13 = (src12 == src11);
+            cmp_plus13 = (src12 == src13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < src_minus11);
+            cmp_plus10 = (src10 < src_minus11);
+            cmp_minus11 = (src_minus11 < src10);
+            cmp_plus11 = (src11 < src10);
+            cmp_minus12 = (src10 < src11);
+            cmp_plus12 = (src12 < src11);
+            cmp_minus13 = (src11 < src12);
+            cmp_plus13 = (src13 < src12);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+
+            src_minus10 = src12;
+            XORI_B4_128_UB(src_minus11, src10, src11, src12);
+
+            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
+            dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
+            dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
+            dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);
+
+            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
+            src_minus11 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+
+            src += (src_stride << 2);
+            dst += (dst_stride << 2);
+        }
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    uint32_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus11, src10, src11;
+    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
+    v8i16 offset_mask0, offset_mask1;
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    src_orig = src - 1;
+
+    /* load in advance */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += (src_stride << 1);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+
+        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
+                   offset, offset, offset);
+
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+        dst0 = __msa_adds_s_b(dst0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+    }
+
+    SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+    SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+
+    ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
+               src_minus11);
+    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+               src_zero1);
+
+    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+    dst0 = __msa_adds_s_b(dst0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+    SW(dst_val0, dst);
+    dst += dst_stride;
+    SW(dst_val1, dst);
+}
+
+static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    uint64_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
+    v8i16 offset_mask0, offset_mask1;
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += (src_stride << 1);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);
+
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
+                   src_zero0, src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+                   offset, offset);
+
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+        dst0 = __msa_adds_s_b(dst0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+    }
+
+    SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+    SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);
+    ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
+               src_minus11);
+    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+               src_zero1);
+
+    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+    dst0 = __msa_adds_s_b(dst0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    src_minus10 = src10;
+    src_minus11 = src11;
+
+    /* load in advance */
+    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+    SD(dst_val0, dst);
+    dst += dst_stride;
+    SD(dst_val1, dst);
+}
+
+static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *
+                                                         sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13, src_minus14, src_plus13;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
+    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
+    v16i8 src_zero3, sao_offset;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (; height; height -= 4) {
+        src_orig = src - 1;
+        dst_orig = dst;
+        LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
+               src_minus14);
+
+        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+            src_minus10 = LD_UB(src_orig - src_stride);
+            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
+            src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
+            src_orig += 16;
+
+            SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
+                       src_zero1, 1);
+            SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
+                       src_zero3, 1);
+            SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
+                       src_plus11, 2);
+
+            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
+
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
+            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
+            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+
+            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);
+
+            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
+            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
+            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
+            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
+
+            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
+
+            src_minus11 = src10;
+            src_minus12 = src11;
+            src_minus13 = src12;
+            src_minus14 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
+            dst_orig += 16;
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    uint32_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+    v16i8 src_zero0, src_zero1, dst0;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v8i16 offset_mask0, offset_mask1;
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += (src_stride << 1);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+
+        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+                   offset, offset);
+
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+        dst0 = __msa_adds_s_b(dst0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+    }
+
+    SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+    SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+
+    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+               src_minus11);
+    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+               src_zero1);
+
+    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+    dst0 = __msa_adds_s_b(dst0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+
+    SW(dst_val0, dst);
+    dst += dst_stride;
+    SW(dst_val1, dst);
+    dst += dst_stride;
+}
+
+static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    uint64_t dst_val0, dst_val1;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v16i8 src_zero0, src_zero1, dst0;
+    v8i16 offset_mask0, offset_mask1;
+
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += (src_stride << 1);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+                   offset, offset);
+
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+        dst0 = __msa_adds_s_b(dst0, offset);
+        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+    }
+
+    SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+    SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+               src_minus11);
+    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+               src_zero1);
+
+    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
+
+    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
+               offset, offset);
+
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+    dst0 = __msa_adds_s_b(dst0, offset);
+    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
+
+    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
+
+    SD(dst_val0, dst);
+    dst += dst_stride;
+    SD(dst_val1, dst);
+    dst += dst_stride;
+}
+
+static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
+                                                          int32_t dst_stride,
+                                                          uint8_t *src,
+                                                          int32_t src_stride,
+                                                          int16_t *
+                                                          sao_offset_val,
+                                                          int32_t width,
+                                                          int32_t height)
+{
+    uint8_t *src_orig, *dst_orig;
+    int32_t v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
+    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (; height; height -= 4) {
+        src_orig = src - 1;
+        dst_orig = dst;
+
+        LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
+               src_plus12);
+
+        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+            src_minus10 = LD_UB(src_orig + 2 - src_stride);
+            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
+            src_plus13 = LD_UB(src_orig + (src_stride << 2));
+            src_orig += 16;
+
+            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
+
+            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
+            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
+                                               (v16i8) src_minus11, 2);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
+
+            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
+            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
+            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
+
+            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
+            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
+            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+
+            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);
+
+            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
+            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
+            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
+            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
+
+            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
+
+            src_minus11 = src10;
+            src_plus10 = src11;
+            src_plus11 = src12;
+            src_plus12 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
+            dst_orig += 16;
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q)
+{
+    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q)
+{
+    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q)
+{
+    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q)
+{
+    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
+}
+
+void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                     int16_t *sao_offset_val, int sao_left_class,
+                                     int width, int height)
+{
+    if (width >> 4) {
+        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
+                                            sao_left_class, sao_offset_val,
+                                            width - (width % 16), height);
+        dst += width - (width % 16);
+        src += width - (width % 16);
+        width %= 16;
+    }
+
+    if (width >> 3) {
+        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
+                                        sao_left_class, sao_offset_val, height);
+        dst += 8;
+        src += 8;
+        width %= 8;
+    }
+
+    if (width) {
+        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
+                                        sao_left_class, sao_offset_val, height);
+    }
+}
+
+void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height)
+{
+    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t);
+
+    switch (eo) {
+    case 0:
+        if (width >> 4) {
+            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
+                                                        src, stride_src,
+                                                        sao_offset_val,
+                                                        width - (width % 16),
+                                                        height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+        }
+        break;
+
+    case 1:
+        if (width >> 4) {
+            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width % 16),
+                                                         height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 2:
+        if (width >> 4) {
+            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width % 16),
+                                                         height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 3:
+        if (width >> 4) {
+            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
+                                                          src, stride_src,
+                                                          sao_offset_val,
+                                                          width - (width % 16),
+                                                          height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+        }
+        break;
+    }
+}
diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h
new file mode 100644
index 0000000..ea53812
--- /dev/null
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H
+#define AVCODEC_MIPS_HEVC_MACROS_MSA_H
+
+#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,                    \
+                          filt0, filt1, filt2, filt3)            \
+( {                                                              \
+    v8i16 out_m;                                                 \
+                                                                 \
+    out_m = __msa_dotp_s_h((v16i8) in0, (v16i8) filt0);          \
+    out_m = __msa_dpadd_s_h(out_m, (v16i8) in1, (v16i8) filt1);  \
+    DPADD_SB2_SH(in2, in3, filt2, filt3, out_m, out_m);          \
+    out_m;                                                       \
+} )
+
+#define HEVC_FILT_8TAP(in0, in1, in2, in3,                       \
+                       filt0, filt1, filt2, filt3)               \
+( {                                                              \
+    v4i32 out_m;                                                 \
+                                                                 \
+    out_m = __msa_dotp_s_w((v8i16) in0, (v8i16) filt0);          \
+    out_m = __msa_dpadd_s_w(out_m, (v8i16) in1, (v8i16) filt1);  \
+    DPADD_SH2_SW(in2, in3, filt2, filt3, out_m, out_m);          \
+    out_m;                                                       \
+} )
+
+#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)                \
+( {                                                              \
+    v8i16 out_m;                                                 \
+                                                                 \
+    out_m = __msa_dotp_s_h((v16i8) in0, (v16i8) filt0);          \
+    out_m = __msa_dpadd_s_h(out_m, (v16i8) in1, (v16i8) filt1);  \
+    out_m;                                                       \
+} )
+
+#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)           \
+( {                                                      \
+    v4i32 out_m;                                         \
+                                                         \
+    out_m = __msa_dotp_s_w(in0, (v8i16) filt0);          \
+    out_m = __msa_dpadd_s_w(out_m, in1, (v8i16) filt1);  \
+    out_m;                                               \
+} )
+
+#endif  /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
new file mode 100644
index 0000000..b555517
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -0,0 +1,5014 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
+{                                                                     \
+    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
+    SRARI_H2_SH(out0, out1, rnd_val);                                 \
+    CLIP_SH2_0_255(out0, out1);                                       \
+}
+
+#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
+                          vec0, vec1, vec2, vec3, rnd_val,         \
+                          out0, out1, out2, out3)                  \
+{                                                                  \
+    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
+    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
+}
+
+#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,  \
+                                   out0, out1)                     \
+{                                                                  \
+    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                 \
+    SRARI_H2_SH(out0, out1, rnd_val);                              \
+    CLIP_SH2_0_255_MAX_SATU(out0, out1);                           \
+}
+
+#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,    \
+                                   vec3,  rnd_val, out0, out1, out2, out3)  \
+{                                                                           \
+    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
+    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
+}
+
+static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
+    uint64_t tpd0, tpd1, tpd2, tpd3;
+    v16i8 src0 = { 0 }, src1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v8i16 dst0, dst1, dst2, dst3;
+
+    if (2 == height) {
+        LW2(src0_ptr, src_stride, tp0, tp1);
+        INSERT_W2_SB(tp0, tp1, src0);
+        LD2(src1_ptr, src2_stride, tpd0, tpd1);
+        INSERT_D2_SH(tpd0, tpd1, in0);
+
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+        dst0 += in0;
+        dst0 = __msa_srari_h(dst0, 7);
+        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+        ST4x2_UB(dst0, dst, dst_stride);
+    } else if (4 == height) {
+        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+        INSERT_D2_SH(tpd0, tpd1, in0);
+        INSERT_D2_SH(tpd2, tpd3, in1);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        SLLI_2V(dst0, dst1, 6);
+        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += 4 * src_stride;
+            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += 4 * src_stride;
+            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
+            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+            src1_ptr += (4 * src2_stride);
+            INSERT_D2_SH(tpd0, tpd1, in0);
+            INSERT_D2_SH(tpd2, tpd3, in1);
+            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+            src1_ptr += (4 * src2_stride);
+            INSERT_D2_SH(tpd0, tpd1, in2);
+            INSERT_D2_SH(tpd2, tpd3, in3);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
+                                       dst3, 7, dst0, dst1, dst2, dst3);
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST4x8_UB(dst0, dst1, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 out0, out1, out2, out3;
+    v16i8 zero = { 0 };
+    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        src0_ptr += (4 * src_stride);
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        src0_ptr += (4 * src_stride);
+        INSERT_D2_SB(tp0, tp1, src2);
+        INSERT_D2_SB(tp2, tp3, src3);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+                                   7, dst4, dst5, dst6, dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST6x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST6x4_UB(out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    if (2 == height) {
+        LD2(src0_ptr, src_stride, tp0, tp1);
+        INSERT_D2_SB(tp0, tp1, src0);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        SLLI_2V(dst0, dst1, 6);
+        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(out0, dst, dst_stride);
+    } else if (4 == height) {
+        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+    } else if (6 == height) {
+        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        src0_ptr += 4 * src_stride;
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        LD2(src0_ptr, src_stride, tp0, tp1);
+        INSERT_D2_SB(tp0, tp1, src2);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_2V(dst4, dst5, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(out2, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += 4 * src_stride;
+            INSERT_D2_SB(tp0, tp1, src0);
+            INSERT_D2_SB(tp2, tp3, src1);
+            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += 4 * src_stride;
+            INSERT_D2_SB(tp0, tp1, src2);
+            INSERT_D2_SB(tp2, tp3, src3);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            ILVRL_B2_SH(zero, src2, dst4, dst5);
+            ILVRL_B2_SH(zero, src3, dst6, dst7);
+            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
+                   in7);
+            src1_ptr += (8 * src2_stride);
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            SLLI_4V(dst4, dst5, dst6, dst7, 6);
+            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
+                                       dst3, 7, dst0, dst1, dst2, dst3);
+            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
+                                       dst7, 7, dst4, dst5, dst6, dst7);
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+            ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
+                   dst2, dst3);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        SLLI_2V(dst4, dst5, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v16i8 zero = { 0 };
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
+        ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
+        ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
+        ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
+        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
+                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
+                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
+        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
+        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
+        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
+        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
+        src1_ptr += (4 * src2_stride);
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
+        ILVRL_B2_SH(zero, src4, dst6, dst7);
+        ILVRL_B2_SH(zero, src5, dst8, dst9);
+        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        SLLI_4V(dst8, dst9, dst10, dst11, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
+                                   7, dst4, dst5, dst6, dst7);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
+                                   dst11, 7, dst8, dst9, dst10, dst11);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
+        ST8x4_UB(out2, out5, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 16, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
+        src1_ptr += src2_stride;
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+                                   7, dst4, dst5, dst6, dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST_UB2(out0, out1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out2, out3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB3(src0_ptr, 16, src0, src1, src2);
+        src0_ptr += src_stride;
+        LD_SB3(src0_ptr, 16, src3, src4, src5);
+        src0_ptr += src_stride;
+
+        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
+        src1_ptr += src2_stride;
+        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
+        src1_ptr += src2_stride;
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        ILVRL_B2_SH(zero, src4, dst8, dst9);
+        ILVRL_B2_SH(zero, src5, dst10, dst11);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        SLLI_4V(dst8, dst9, dst10, dst11, 6);
+
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+                                   7, dst4, dst5, dst6, dst7);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
+                                   dst11, 7, dst8, dst9, dst10, dst11);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+        ST_UB2(out0, out1, dst, 16);
+        ST_UB(out2, dst + 32);
+        dst += dst_stride;
+        ST_UB2(out3, out4, dst, 16);
+        ST_UB(out5, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += src2_stride;
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                                   7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
+                                   7, dst4, dst5, dst6, dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+
+        ST_UB4(out0, out1, out2, out3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+
+    src0_ptr -= 3;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
+               src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
+        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr -= 3;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t tmp0, tmp1;
+    int64_t tmp2, tmp3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 vec0, vec1, vec2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v8i16 dst0, dst1, dst2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+
+        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
+        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
+        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
+        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
+        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
+        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
+        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
+        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
+
+        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+        dst2 = __msa_adds_s_h(in2, dst2);
+        dst2 = __msa_srari_h(dst2, 7);
+        dst2 = CLIP_SH_0_255(dst2);
+        PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);
+
+        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
+        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
+        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
+        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
+        SD(tmp2, dst);
+        SW(tmp0, dst + 8);
+        dst += dst_stride;
+        SD(tmp3, dst);
+        SW(tmp1, dst + 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0;
+    v16i8 src0, src1, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2;
+    v8i16 in0, in1, in2;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr = src0_ptr - 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        src1_ptr += src2_stride;
+        XORI_B2_128_SB(src0, src1);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
+                     dst1, dst2, dst0);
+        VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
+                     dst2, dst0, dst1);
+        VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
+        VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
+                     dst0, dst1, dst2);
+
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+        dst2 = __msa_adds_s_h(dst2, in2);
+        dst2 = __msa_srari_h(dst2, 7);
+        dst2 = CLIP_SH_0_255(dst2);
+
+        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
+        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
+        ST_SB(tmp0, dst);
+        SD(dst_val0, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        ST_SB2(tmp0, tmp1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 tmp0, tmp1, tmp2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr -= 3;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = 64; loop_cnt--;) {
+        LD_SB3(src0_ptr, 16, src0, src1, src2);
+        src3 = LD_SB(src0_ptr + 40);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        ST_SB(tmp0, dst);
+        ST_SB(tmp1, dst + 16);
+
+        LD_SH2(src1_ptr + 32, 8, in4, in5);
+        src1_ptr += src2_stride;
+
+        dst4 = const_vec;
+        dst5 = const_vec;
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
+                     dst5, dst4, dst5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
+                     dst5, dst4, dst5);
+
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST_SB(tmp2, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 3;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        LD_SB2(src0_ptr + 32, 16, src3, src4);
+        src5 = LD_SB(src0_ptr + 56);
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7,
+                          dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        ST_SB2(tmp0, tmp1, dst, 16);
+
+        src0 = src3;
+        src1 = src4;
+        src2 = src5;
+
+        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7,
+                          dst0, dst1, dst2, dst3);
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        ST_SB2(tmp0, tmp1, dst + 32, 16);
+        src1_ptr += src2_stride;
+        src0_ptr += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst10, dst32, dst54, dst76, 7,
+                          dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
+        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+        dst0_l = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3,
+                     dst0_l, dst0_l, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3,
+                     dst1_l, dst1_l, dst1_l, dst1_l);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
+
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
+        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
+                                           int32_t src_stride,
+                                           int16_t *src1_ptr,
+                                           int32_t src2_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter,
+                                           int32_t height, int32_t width)
+{
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0_r, dst1_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            src0_ptr_tmp += (2 * src_stride);
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
+            src1_ptr_tmp += (2 * src2_stride);
+            XORI_B2_128_SB(src7, src8);
+
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+            dst0_r = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3,
+                         dst0_r, dst0_r, dst0_r, dst0_r);
+            dst1_r = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3,
+                         dst1_r, dst1_r, dst1_r, dst1_r);
+            dst0_l = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3,
+                         dst0_l, dst0_l, dst0_l, dst0_l);
+            dst1_l = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3,
+                         dst1_l, dst1_l, dst1_l, dst1_l);
+
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                              dst0_r, dst1_r, dst0_l, dst1_l);
+
+            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 16);
+}
+
+static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 16);
+    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                         dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 32);
+}
+
+static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 48);
+}
+
+static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 64);
+}
+
+static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0 = { 0 }, in1 = { 0 };
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 out0, out1;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
+    v4i32 dst0, dst1, dst2, dst3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src0_ptr -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in0);
+        src1_ptr += (2 * src2_stride);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in1);
+        src1_ptr += (2 * src2_stride);
+
+        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+
+        dst76 = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98 = __msa_ilvr_h(dst66, dst108);
+
+        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
+        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
+        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
+        SRARI_H2_SH(out0, out1, 7);
+        CLIP_SH2_0_255_MAX_SATU(out0, out1);
+        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10 = dst54;
+        dst32 = dst76;
+        dst54 = dst98;
+        dst21 = dst65;
+        dst43 = dst87;
+        dst65 = dst109;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height, int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, tmp;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 dst0_r, dst0_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+
+    src0_ptr -= ((3 * src_stride) + 3);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+
+        for (loop_cnt = height; loop_cnt--;) {
+            src7 = LD_SB(src0_ptr_tmp);
+            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
+            src0_ptr_tmp += src_stride;
+
+            in0 = LD_SH(src1_ptr_tmp);
+            src1_ptr_tmp += src2_stride;
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+            ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+            ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
+            tmp = __msa_srari_h(tmp, 7);
+            tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+            out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
+            ST8x1_UB(out, dst_tmp);
+            dst_tmp += dst_stride;
+
+            dst0 = dst1;
+            dst1 = dst2;
+            dst2 = dst3;
+            dst3 = dst4;
+            dst4 = dst5;
+            dst5 = dst6;
+            dst6 = dst7;
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 8);
+}
+
+static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src0_ptr_tmp, *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    uint64_t tp0, tp1;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
+
+    src0_ptr -= ((3 * src_stride) + 3);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    src0_ptr_tmp = src0_ptr;
+    dst_tmp = dst;
+    src1_ptr_tmp = src1_ptr;
+
+    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
+           src6);
+    src0_ptr_tmp += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+               vec3);
+    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
+               vec7);
+    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
+               vec15);
+    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                             filt3);
+    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                             filt3);
+    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                             filt2, filt3);
+    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+               vec3);
+    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
+               vec7);
+    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                             filt3);
+    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                             filt3);
+
+    for (loop_cnt = 16; loop_cnt--;) {
+        src7 = LD_SB(src0_ptr_tmp);
+        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
+        src0_ptr_tmp += src_stride;
+
+        in0 = LD_SH(src1_ptr_tmp);
+        src1_ptr_tmp += src2_stride;
+
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                 filt2, filt3);
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+        ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
+        tmp = __msa_srari_h(tmp, 7);
+        tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+        out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
+        ST8x1_UB(out, dst_tmp);
+        dst_tmp += dst_stride;
+
+        dst0 = dst1;
+        dst1 = dst2;
+        dst2 = dst3;
+        dst3 = dst4;
+        dst4 = dst5;
+        dst5 = dst6;
+        dst6 = dst7;
+    }
+
+    src0_ptr += 8;
+    dst += 8;
+    src1_ptr += 8;
+
+    mask4 = LD_SB(ff_hevc_mask_arr + 16);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
+               vec12, vec13, vec14, vec15);
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in0);
+        src1_ptr += (2 * src2_stride);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in1);
+        src1_ptr += (2 * src2_stride);
+
+        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
+                   vec7);
+        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+
+        dst76 = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98 = __msa_ilvr_h(dst66, dst108);
+
+        tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
+        PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
+        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
+        SRARI_H2_SH(out0, out1, 7);
+        CLIP_SH2_0_255_MAX_SATU(out0, out1);
+        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10 = dst54;
+        dst32 = dst76;
+        dst54 = dst98;
+        dst21 = dst65;
+        dst43 = dst87;
+        dst65 = dst109;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 16);
+}
+
+static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 24);
+}
+
+static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 32);
+}
+
+static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 48);
+}
+
+static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 64);
+}
+
+static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, dst0, vec0, vec1;
+    v8i16 in0, in1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    v16i8 mask1;
+    v8i16 tmp0;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    tmp0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+
+    tmp0 = __msa_adds_s_h(tmp0, in0);
+    tmp0 = __msa_srari_h(tmp0, 7);
+    tmp0 = CLIP_SH_0_255(tmp0);
+    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+
+    ST4x2_UB(dst0, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v16i8 vec2, vec3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    v16i8 mask1;
+    v8i16 tmp0, tmp1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    tmp0 = const_vec;
+    tmp1 = const_vec;
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
+                 tmp0, tmp1);
+    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
+    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 dst0, dst1;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    v16i8 mask1, vec0, vec1, vec2, vec3;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        tmp0 = const_vec;
+        tmp1 = const_vec;
+        tmp2 = const_vec;
+        tmp3 = const_vec;
+        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
+                     tmp1, tmp2, tmp3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
+                     tmp1, tmp2, tmp3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (4 == height) {
+        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (8 == height || 16 == height) {
+        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src0, src1);
+
+    dst0 = const_vec;
+    dst1 = const_vec;
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
+                 dst0, dst1);
+    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST8x2_UB(dst0, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    LD_SH2(src1_ptr, src2_stride, in4, in5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+
+    dst0 = const_vec;
+    dst1 = const_vec;
+    dst2 = const_vec;
+    dst3 = const_vec;
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
+                 dst2, dst3);
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
+                 dst2, dst3);
+    dst4 = const_vec;
+    dst5 = const_vec;
+
+    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
+                 dst4, dst5);
+
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (6 == height) {
+        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (0 == (height % 4)) {
+        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        dst4 = const_vec;
+        dst5 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
+    v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src0, src2);
+        LD_SB2(src0_ptr + 8, src_stride, src1, src3);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in2);
+        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
+        src1_ptr += (2 * src2_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    dst_tmp = dst + 16;
+    src1_ptr_tmp = src1_ptr + 16;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
+        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+
+        dst4 = const_vec;
+        dst5 = const_vec;
+        dst6 = const_vec;
+        dst7 = const_vec;
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
+                     dst5, dst6, dst7);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
+                     dst5, dst6, dst7);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
+                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+        src1_ptr_tmp += (4 * src2_stride);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v8i16 dst10;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst10 = __msa_adds_s_h(dst10, in0);
+    dst10 = __msa_srari_h(dst10, 7);
+    dst10 = CLIP_SH_0_255(dst10);
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
+    ST4x2_UB(dst10, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
+    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+        src0_ptr += (6 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src0_ptr, src_stride, src9, src2);
+        src0_ptr += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        dst76 = const_vec;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst10, dst32, dst54, dst76, 7,
+                          dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
+        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (4 == height) {
+        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else {
+        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    LD_SB2(src0_ptr, src_stride, src5, src6);
+    src0_ptr += (2 * src_stride);
+    LD_SB2(src0_ptr, src_stride, src7, src8);
+    src0_ptr += (2 * src_stride);
+    LD_SB2(src0_ptr, src_stride, src9, src10);
+    src0_ptr += (2 * src_stride);
+
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    XORI_B2_128_SB(src3, src4);
+    XORI_B2_128_SB(src5, src6);
+    XORI_B2_128_SB(src7, src8);
+    XORI_B2_128_SB(src9, src10);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+
+    dst2_r = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+    dst3_r = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                      dst0_r, dst1_r, dst2_r, dst3_r);
+
+    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
+
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
+
+    dst2_r = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+    dst3_r = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                      dst0_r, dst1_r, dst2_r, dst3_r);
+
+    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, dst0_r, dst1_r;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
+    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+    dst2_r = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+    dst3_r = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+    dst4_r = const_vec;
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
+    dst5_r = const_vec;
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                      dst0_r, dst1_r, dst2_r, dst3_r);
+    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
+
+    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (6 == height) {
+        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else {
+        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst0_l, dst1_l, filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (1 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SB2(src0_ptr, src_stride, src5, src6);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B2_128_SB(src3, src4);
+        XORI_B2_128_SB(src5, src6);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
+        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
+        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2 = src6;
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src2110 = src6554;
+    }
+}
+
+static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        /* 8width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        /* 16width */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        /* 8width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* next 16width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* 16width */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        /* 16width */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* next 16width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+        /* next 16width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
+        /* next 16width */
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
+                          dst2_r, dst3_r, dst2_l, dst3_l);
+
+        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
+        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y)
+{
+    uint64_t tp0, tp1;
+    v16u8 out;
+    v8i16 in0 = { 0 };
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
+    v4i32 dst0, dst1;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in0);
+    in0 = __msa_adds_s_h(in0, const_vec);
+
+    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
+
+    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
+    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
+
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst0 >>= 6;
+    dst1 >>= 6;
+    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+    tmp = __msa_adds_s_h(tmp, in0);
+    tmp = __msa_srari_h(tmp, 7);
+    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y)
+{
+    uint64_t tp0, tp1;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 tmp0, tmp1;
+    v8i16 in0 = { 0 }, in1 = { 0 };
+    v8i16 dst30, dst41, dst52, dst63;
+    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
+    v4i32 dst0, dst1, dst2, dst3;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    src1_ptr += 2 * src2_stride;
+    INSERT_D2_SH(tp0, tp1, in0);
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in1);
+
+    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
+
+    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
+
+    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
+    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
+    SRA_4V(dst0, dst1, dst2, dst3, 6);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+    ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
+    SRARI_H2_SH(tmp0, tmp1, 7);
+    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst98_r, dst109_r;
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
+
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in0);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in1);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in2);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in3);
+
+        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
+                    const_vec, in0, in1, in2, in3);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
+        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
+                    tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y);
+    } else if (0 == (height % 8)) {
+        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride,
+                                      filter_x, filter_y, height);
+    }
+}
+
+static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    uint32_t tpw0, tpw1, tpw2, tpw3;
+    uint64_t tp0, tp1;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
+    v8i16 dsth10, tmp4, tmp5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
+    v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
+    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v8i16 in4 = { 0 }, in5 = { 0 };
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    LD_SB8(src0_ptr, src_stride,
+           src3, src4, src5, src6, src7, src8, src9, src10);
+    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
+
+    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
+    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
+    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
+    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
+    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in0);
+    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in1);
+
+    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in2);
+    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in3);
+
+    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
+                in0, in1, in2, in3);
+    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
+                tmp3);
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST4x8_UB(out0, out1, dst, dst_stride);
+
+    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
+    src1_ptr += (4 * src2_stride);
+    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
+    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
+    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
+    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
+    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
+    SRARI_H2_SH(tmp4, tmp5, 7);
+    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST2x4_UB(out2, 0, dst + 4, dst_stride);
+    dst += 4 * dst_stride;
+    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1;
+    v8i16 in0, in1;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+    ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
+    SRARI_H2_SH(tmp0, tmp1, 7);
+    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
+                                      int32_t src_stride,
+                                      int16_t *src1_ptr,
+                                      int32_t src2_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t width8mult)
+{
+    uint32_t cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width8mult; cnt--;) {
+        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr += 8;
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += 8;
+        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
+                    const_vec, in0, in1, in2, in3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, tmp0, tmp1, tmp2, tmp3);
+        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += 8;
+    }
+}
+
+static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y)
+{
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+    src0_ptr += (5 * src_stride);
+    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
+                in0, in1, in2, in3);
+    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
+
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
+    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
+    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
+    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
+    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
+    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                tmp0, tmp1, tmp2, tmp3);
+    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    SRARI_H2_SH(tmp4, tmp5, 7);
+    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height,
+                                          int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+        src0_ptr_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            src1_ptr_tmp += (4 * src2_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
+                        const_vec, in0, in1, in2, in3);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                        dst3_r, tmp0, tmp1, tmp2, tmp3);
+            ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                        tmp0, tmp1, tmp2, tmp3);
+            SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y, 1);
+    } else if (6 == height) {
+        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y);
+    } else {
+        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride,
+                                      filter_x, filter_y, height, 8);
+    }
+}
+
+static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1;
+    uint8_t *src0_ptr_tmp, *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    src0_ptr_tmp = src0_ptr;
+    dst_tmp = dst;
+    src1_ptr_tmp = src1_ptr;
+
+    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+    src0_ptr_tmp += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+        src0_ptr_tmp += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+        src1_ptr_tmp += (4 * src2_stride);
+        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
+                    const_vec, in0, in1, in2, in3);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, tmp0, tmp1, tmp2, tmp3);
+        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dsth2 = dsth6;
+    }
+
+    src0_ptr += 8;
+    dst += 8;
+    src1_ptr += 8;
+
+    mask2 = LD_SB(ff_hevc_mask_arr + 16);
+    mask3 = mask2 + 2;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
+
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
+
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in0);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in1);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in2);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in3);
+
+        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
+                    const_vec, in0, in1, in2, in3);
+
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    tmp0, tmp1, tmp2, tmp3);
+        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    if (4 == height) {
+        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y, 2);
+    } else {
+        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                      src2_stride, dst, dst_stride, filter_x,
+                                      filter_y, height, 16);
+    }
+}
+
+static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 24);
+}
+
+static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 32);
+}
+
+#define BI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                   ptrdiff_t dst_stride,  \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int16_t *src_16bit,    \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
+                                dst, dst_stride, height);                 \
+}
+
+BI_MC_COPY(4);
+BI_MC_COPY(6);
+BI_MC_COPY(8);
+BI_MC_COPY(12);
+BI_MC_COPY(16);
+BI_MC_COPY(24);
+BI_MC_COPY(32);
+BI_MC_COPY(48);
+BI_MC_COPY(64);
+
+#undef BI_MC_COPY
+
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int16_t *src_16bit,    \
+                                                      int height,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)             \
+{                                                                            \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
+                                                                             \
+    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,     \
+                                             MAX_PB_SIZE, dst, dst_stride,   \
+                                             filter, height);                \
+}
+
+BI_MC(qpel, h, 4, 8, hz, mx);
+BI_MC(qpel, h, 8, 8, hz, mx);
+BI_MC(qpel, h, 12, 8, hz, mx);
+BI_MC(qpel, h, 16, 8, hz, mx);
+BI_MC(qpel, h, 24, 8, hz, mx);
+BI_MC(qpel, h, 32, 8, hz, mx);
+BI_MC(qpel, h, 48, 8, hz, mx);
+BI_MC(qpel, h, 64, 8, hz, mx);
+
+BI_MC(qpel, v, 4, 8, vt, my);
+BI_MC(qpel, v, 8, 8, vt, my);
+BI_MC(qpel, v, 12, 8, vt, my);
+BI_MC(qpel, v, 16, 8, vt, my);
+BI_MC(qpel, v, 24, 8, vt, my);
+BI_MC(qpel, v, 32, 8, vt, my);
+BI_MC(qpel, v, 48, 8, vt, my);
+BI_MC(qpel, v, 64, 8, vt, my);
+
+BI_MC(epel, h, 4, 4, hz, mx);
+BI_MC(epel, h, 8, 4, hz, mx);
+BI_MC(epel, h, 6, 4, hz, mx);
+BI_MC(epel, h, 12, 4, hz, mx);
+BI_MC(epel, h, 16, 4, hz, mx);
+BI_MC(epel, h, 24, 4, hz, mx);
+BI_MC(epel, h, 32, 4, hz, mx);
+
+BI_MC(epel, v, 4, 4, vt, my);
+BI_MC(epel, v, 8, 4, vt, my);
+BI_MC(epel, v, 6, 4, vt, my);
+BI_MC(epel, v, 12, 4, vt, my);
+BI_MC(epel, v, 16, 4, vt, my);
+BI_MC(epel, v, 24, 4, vt, my);
+BI_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_MC
+
+#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
+void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
+                                                   ptrdiff_t dst_stride,  \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int16_t *src_16bit,    \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
+                                                                          \
+    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
+                                       MAX_PB_SIZE, dst, dst_stride,      \
+                                       filter_x, filter_y, height);       \
+}
+
+BI_MC_HV(qpel, 4, 8);
+BI_MC_HV(qpel, 8, 8);
+BI_MC_HV(qpel, 12, 8);
+BI_MC_HV(qpel, 16, 8);
+BI_MC_HV(qpel, 24, 8);
+BI_MC_HV(qpel, 32, 8);
+BI_MC_HV(qpel, 48, 8);
+BI_MC_HV(qpel, 64, 8);
+
+BI_MC_HV(epel, 4, 4);
+BI_MC_HV(epel, 8, 4);
+BI_MC_HV(epel, 6, 4);
+BI_MC_HV(epel, 12, 4);
+BI_MC_HV(epel, 16, 4);
+BI_MC_HV(epel, 24, 4);
+BI_MC_HV(epel, 32, 4);
+
+#undef BI_MC_HV
diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
new file mode 100644
index 0000000..ea65f00
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -0,0 +1,6045 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
+                           out0, out1)                              \
+{                                                                   \
+    v4i32 out0_r, out1_r, out0_l, out1_l;                           \
+                                                                    \
+    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
+    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
+                                                                    \
+    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
+    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
+    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
+    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
+                                                                    \
+    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
+    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);        \
+    CLIP_SH2_0_255(out0, out1);                                     \
+}
+
+#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,       \
+                           wgt, rnd, offset, out0, out1, out2, out3)         \
+{                                                                            \
+    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1);  \
+    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3);  \
+}
+
+#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
+                                    offset, out0, out1)              \
+{                                                                    \
+    v4i32 out0_r, out1_r, out0_l, out1_l;                            \
+                                                                     \
+    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                \
+    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                \
+    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);   \
+    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);   \
+    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);   \
+    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
+    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
+    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
+    CLIP_SH2_0_255_MAX_SATU(out0, out1);                             \
+}
+
+#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
+                                    vec3, wgt, rnd, offset, out0, out1,    \
+                                    out2, out3)                            \
+{                                                                          \
+    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
+                                out0, out1);                               \
+    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
+                                out2, out3);                               \
+}
+
+static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
+                                   int32_t src_stride,
+                                   int16_t *src1_ptr,
+                                   int32_t src2_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   int32_t height,
+                                   int32_t weight0,
+                                   int32_t weight1,
+                                   int32_t offset0,
+                                   int32_t offset1,
+                                   int32_t rnd_val)
+{
+    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
+    uint64_t tpd0, tpd1, tpd2, tpd3;
+    int32_t offset, weight;
+    v16u8 out0, out1;
+    v16i8 zero = { 0 };
+    v16i8 src0 = { 0 }, src1 = { 0 };
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, weight_vec;
+    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    if (2 == height) {
+        LW2(src0_ptr, src_stride, tp0, tp1);
+        INSERT_W2_SB(tp0, tp1, src0);
+        LD2(src1_ptr, src2_stride, tpd0, tpd1);
+        INSERT_D2_SH(tpd0, tpd1, in0);
+
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+
+        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
+        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
+        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+        ST4x2_UB(out0, dst, dst_stride);
+    } else if (4 == height) {
+        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+        INSERT_D2_SH(tpd0, tpd1, in0);
+        INSERT_D2_SH(tpd2, tpd3, in1);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        SLLI_2V(dst0, dst1, 6);
+        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
+                                    offset_vec, dst0, dst1);
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += 4 * src_stride;
+            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += 4 * src_stride;
+            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
+            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+            src1_ptr += (4 * src2_stride);
+            INSERT_D2_SH(tpd0, tpd1, in0);
+            INSERT_D2_SH(tpd2, tpd3, in1);
+            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
+            src1_ptr += (4 * src2_stride);
+            INSERT_D2_SH(tpd0, tpd1, in2);
+            INSERT_D2_SH(tpd2, tpd3, in3);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
+                                        in3, weight_vec, rnd_vec, offset_vec,
+                                        dst0, dst1, dst2, dst3);
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            ST4x8_UB(out0, out1, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
+                                   int32_t src_stride,
+                                   int16_t *src1_ptr,
+                                   int32_t src2_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   int32_t height,
+                                   int32_t weight0,
+                                   int32_t weight1,
+                                   int32_t offset0,
+                                   int32_t offset1,
+                                   int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 out0, out1;
+    v16i8 zero = { 0 };
+    v16i8 src0 = { 0 }, src1 = { 0 };
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        src0_ptr += (4 * src_stride);
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
+                                    in0, in1, in2, in3,
+                                    weight_vec, rnd_vec, offset_vec,
+                                    dst0, dst1, dst2, dst3);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST6x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
+                                   int32_t src_stride,
+                                   int16_t *src1_ptr,
+                                   int32_t src2_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   int32_t height,
+                                   int32_t weight0,
+                                   int32_t weight1,
+                                   int32_t offset0,
+                                   int32_t offset1,
+                                   int32_t rnd_val)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2;
+    v16i8 zero = { 0 };
+    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    if (2 == height) {
+        LD2(src0_ptr, src_stride, tp0, tp1);
+        INSERT_D2_SB(tp0, tp1, src0);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        SLLI_2V(dst0, dst1, 6);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1);
+
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(out0, dst, dst_stride);
+    } else if (6 == height) {
+        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+        src0_ptr += 4 * src_stride;
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        LD2(src0_ptr, src_stride, tp0, tp1);
+        INSERT_D2_SB(tp0, tp1, src2);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_2V(dst4, dst5, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
+                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
+                                    dst2, dst3);
+        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
+                                    offset_vec, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(out2, dst, dst_stride);
+    } else if (0 == height % 4) {
+        uint32_t loop_cnt;
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
+            src0_ptr += (4 * src_stride);
+            INSERT_D2_SB(tp0, tp1, src0);
+            INSERT_D2_SB(tp2, tp3, src1);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
+                                        in3, weight_vec, rnd_vec, offset_vec,
+                                        dst0, dst1, dst2, dst3);
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            ST8x4_UB(out0, out1, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (16 >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
+                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
+                                    dst2, dst3);
+        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
+                                    offset_vec, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2, out3;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
+                   tmp2, tmp3);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
+                   tmp6, tmp7);
+        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
+                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
+                                    tmp4, tmp5);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
+                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
+                                    tmp6, tmp7);
+        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
+        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
+        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
+        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
+        src1_ptr += (4 * src2_stride);
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
+        ILVRL_B2_SH(zero, src4, dst6, dst7);
+        ILVRL_B2_SH(zero, src5, dst8, dst9);
+        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        SLLI_4V(dst8, dst9, dst10, dst11, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
+                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
+                                    dst2, dst3);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
+                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
+                                    dst6, dst7);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
+                                    in11, weight_vec, rnd_vec, offset_vec,
+                                    dst8, dst9, dst10, dst11);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
+        ST8x4_UB(out2, out5, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2, out3;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 16, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
+        src1_ptr += src2_stride;
+
+        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
+        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
+        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
+        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
+        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
+                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
+                                    tmp1, tmp5);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
+                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
+                                    tmp3, tmp7);
+        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
+        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
+        ST_UB2(out0, out1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out2, out3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = 64; loop_cnt--;) {
+        LD_SB3(src0_ptr, 16, src0, src1, src2);
+        src0_ptr += src_stride;
+        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
+        src1_ptr += src2_stride;
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_2V(dst4, dst5, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
+                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
+                                    dst2, dst3);
+        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
+                                    offset_vec, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST_UB2(out0, out1, dst, 16);
+        ST_UB(out2, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2, out3;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += src2_stride;
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
+                   tmp2, tmp3);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
+                   tmp6, tmp7);
+        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
+                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
+                                    tmp1, tmp5);
+        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
+                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
+                                    tmp3, tmp7);
+        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
+        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
+        ST_UB4(out0, out1, out2, out3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, out0, out1;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+
+    src0_ptr -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1);
+
+        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1, out2, out3);
+
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
+    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset = (offset0 + offset1) << rnd_val;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
+                           out3);
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
+                           offset_vec, out0, out1);
+        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1, out2, out3);
+
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST_SH2(out0, out1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1;
+    v8i16 in0, in1, in2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2;
+    v4i32 dst2_r, dst2_l;
+    v8i16 filter_vec, out0, out1, out2;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+
+    src0_ptr = src0_ptr - 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    LD_SB2(src0_ptr, 16, src0, src1);
+    src0_ptr += src_stride;
+    LD_SH2(src1_ptr, 8, in0, in1);
+    in2 = LD_SH(src1_ptr + 16);
+    src1_ptr += src2_stride;
+    XORI_B2_128_SB(src0, src1);
+
+    for (loop_cnt = 31; loop_cnt--;) {
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1);
+
+        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+        out2 = CLIP_SH_0_255(dst2_r);
+
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        src1_ptr += src2_stride;
+        XORI_B2_128_SB(src0, src1);
+        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
+        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
+        ST_SH(out0, dst);
+        SD(dst_val0, dst + 16);
+        dst += dst_stride;
+    }
+
+    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
+    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
+                       out0, out1);
+    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
+    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
+    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+    dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+    out2 = CLIP_SH_0_255(dst2_r);
+    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
+    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
+    ST_SH(out0, dst);
+    SD(dst_val0, dst + 16);
+    dst += dst_stride;
+}
+
+static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1, out2, out3);
+
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST_SH2(out0, out1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = 64; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        XORI_B3_128_SB(src0, src1, src2);
+        LD_SB2(src0_ptr + 32, 8, src3, src4);
+        src0_ptr += src_stride;
+        XORI_B2_128_SB(src3, src4);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1, out2, out3);
+
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST_SH2(out0, out1, dst, 16);
+
+        LD_SH2(src1_ptr + 32, 8, in2, in3);
+        src1_ptr += src2_stride;
+
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1);
+
+        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
+        ST_SH(out0, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint8_t *src0_ptr_tmp;
+    uint8_t *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        for (cnt = 2; cnt--;) {
+            LD_SB2(src0_ptr_tmp, 16, src0, src1);
+            src2 = LD_SB(src0_ptr_tmp + 24);
+            src0_ptr_tmp += 32;
+            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
+            src1_ptr_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);
+
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3);
+            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               out0, out1, out2, out3);
+
+            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+            ST_SH2(out0, out1, dst_tmp, 16);
+            dst_tmp += 32;
+        }
+
+        src0_ptr += src_stride;
+        src1_ptr += src2_stride;
+        dst += dst_stride;
+
+    }
+}
+
+static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src13, src14;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_w(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
+                    filt0, dst10, dst32, dst54, dst76);
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
+                     filt1, dst10, dst32, dst54, dst76);
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
+                     filt2, filt2, dst10, dst32, dst54, dst76);
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
+                     filt3, filt3, dst10, dst32, dst54, dst76);
+
+        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1, out2, out3);
+
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, out0, out1, out2, out3;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_w(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
+                    filt0, tmp0, tmp1, tmp2, tmp3);
+        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
+                     filt1, tmp0, tmp1, tmp2, tmp3);
+        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
+                     filt2, tmp0, tmp1, tmp2, tmp3);
+        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
+                     filt3, tmp0, tmp1, tmp2, tmp3);
+
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1, out2, out3);
+
+        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 out0, out1, out2, filter_vec;
+    v4i32 dst2_r, dst2_l;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_w(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src7, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
+        XORI_B2_128_SB(src7, src8);
+
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
+
+        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2);
+        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
+        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
+        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
+        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
+        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
+        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
+
+        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           out0, out1);
+
+        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+        out2 = CLIP_SH_0_255(dst2_r);
+        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
+        ST8x2_UB(out0, dst, dst_stride);
+        ST4x2_UB(out2, dst + 8, dst_stride);
+        dst += (2 * dst_stride);
+
+        src10_r = src32_r;
+        src32_r = src54_r;
+        src54_r = src76_r;
+        src21_r = src43_r;
+        src43_r = src65_r;
+        src65_r = src87_r;
+        src2110 = src4332;
+        src4332 = src6554;
+        src6554 = src8776;
+        src6 = src8;
+    }
+}
+
+static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
+                                              int32_t src_stride,
+                                              int16_t *src1_ptr,
+                                              int32_t src2_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight0,
+                                              int32_t weight1,
+                                              int32_t offset0,
+                                              int32_t offset1,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec;
+    v8i16 out0, out1, out2, out3;
+    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_w(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    weight1_vec = __msa_fill_w(weight1);
+    offset_vec += const_vec * weight1_vec;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            src0_ptr_tmp += (2 * src_stride);
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
+            src1_ptr_tmp += (2 * src2_stride);
+
+            XORI_B2_128_SB(src7, src8);
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
+                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
+            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
+                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
+            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
+                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
+            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
+                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);
+
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               out0, out1, out2, out3);
+
+            PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
+            ST_SH2(out0, out1, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 16);
+}
+
+static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 16);
+    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
+                            src1_ptr + 16, src2_stride,
+                            dst + 16, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val);
+}
+
+static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 32);
+}
+
+static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 48);
+}
+
+static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 64);
+}
+
+static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1;
+    int32_t offset, weight;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0 = { 0 }, in1 = { 0 };
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, weight_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
+    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src0_ptr -= ((3 * src_stride) + 3);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+    weight_vec = (v8i16) __msa_fill_w(weight);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in0);
+        src1_ptr += (2 * src2_stride);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in1);
+        src1_ptr += (2 * src2_stride);
+
+        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+
+        dst76 = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98 = __msa_ilvr_h(dst66, dst108);
+
+        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
+        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
+        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10 = dst54;
+        dst32 = dst76;
+        dst54 = dst98;
+        dst21 = dst65;
+        dst43 = dst87;
+        dst65 = dst109;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width8mult)
+{
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, weight_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= ((3 * src_stride) + 3);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+    weight_vec = (v8i16) __msa_fill_w(weight);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width8mult; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        /* row 4 row 5 row 6 */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src0_ptr_tmp += 2 * src_stride;
+
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            src1_ptr_tmp += (2 * src2_stride);
+
+            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
+                       dst32_r, dst54_r, dst21_r);
+            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
+                       dst32_l, dst54_l, dst21_l);
+            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 8 */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
+            ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
+            ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
+            dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+            dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+            dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+            dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+            SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
+            CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
+            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+            out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+            ST8x2_UB(out, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst0 = dst2;
+            dst1 = dst3;
+            dst2 = dst4;
+            dst3 = dst5;
+            dst4 = dst6;
+            dst5 = dst7;
+            dst6 = dst8;
+        }
+
+        src0_ptr += 8;
+        src1_ptr += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 1);
+}
+
+static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint8_t *src0_ptr_tmp, *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    int32_t offset, weight;
+    uint64_t tp0, tp1;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v8i16 in0 = { 0 }, in1 = { 0 };
+    v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
+    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
+    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
+
+    src0_ptr -= ((3 * src_stride) + 3);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+    weight_vec = (v8i16) __msa_fill_w(weight);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    src0_ptr_tmp = src0_ptr;
+    src1_ptr_tmp = src1_ptr;
+    dst_tmp = dst;
+
+    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr_tmp += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
+               vec15);
+    dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                              filt2, filt3);
+    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+        src0_ptr_tmp += (2 * src_stride);
+        XORI_B2_128_SB(src7, src8);
+
+        LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+        src1_ptr_tmp += (2 * src2_stride);
+
+        ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
+        ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
+
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+
+        ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+        dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                              filt_h1, filt_h2, filt_h3);
+        dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
+                              filt_h1, filt_h2, filt_h3);
+        dst0 >>= 6;
+        dst1 >>= 6;
+
+        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+
+        ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+        dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
+                              filt_h1, filt_h2, filt_h3);
+        dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
+                              filt_h1, filt_h2, filt_h3);
+        dst2 >>= 6;
+        dst3 >>= 6;
+
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
+        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
+        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
+        CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+        ST8x2_UB(out, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        dsth0 = dsth2;
+        dsth1 = dsth3;
+        dsth2 = dsth4;
+        dsth3 = dsth5;
+        dsth4 = dsth6;
+        dsth5 = dsth7;
+        dsth6 = dsth8;
+    }
+
+    src0_ptr += 8;
+    src1_ptr += 8;
+    dst += 8;
+
+    mask4 = LD_SB(ff_hevc_mask_arr + 16);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
+               vec15);
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in0);
+        src1_ptr += (2 * src2_stride);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        INSERT_D2_SH(tp0, tp1, in1);
+        src1_ptr += (2 * src2_stride);
+
+        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
+                   vec7);
+        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+
+        dst76 = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98 = __msa_ilvr_h(dst66, dst108);
+
+        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
+                              filt_h2, filt_h3);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
+        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
+        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
+        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10 = dst54;
+        dst32 = dst76;
+        dst54 = dst98;
+        dst21 = dst65;
+        dst43 = dst87;
+        dst65 = dst109;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 2);
+}
+
+static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 3);
+}
+
+static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 4);
+}
+
+static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 6);
+}
+
+static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 8);
+}
+
+static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 out0, filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+
+    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
+    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
+    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+    out0 = CLIP_SH_0_255(dst0_r);
+    out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
+    ST4x2_UB(out0, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    v16i8 mask1;
+    v8i16 dst0, dst1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0, dst1);
+
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t weight, offset, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (4 == height) {
+        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0, dst1);
+
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST8x2_UB(dst0, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    LD_SH2(src1_ptr, src2_stride, in4, in5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+    dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0, dst1, dst2, dst3);
+    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4, dst5);
+
+    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst3, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (6 == height) {
+        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 4)) {
+        hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4, dst5);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
+        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
+                           in4, in5, in6, in7,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = 16; loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src0, src2);
+        LD_SB2(src0_ptr + 16, src_stride, src1, src3);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in2);
+        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(dst0, (dst + 16), dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset, constant;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, dst10;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v4i32 dst10_r, dst10_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, out;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    src1_ptr += (2 * src2_stride);
+
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+
+    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+
+    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
+    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
+    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
+    dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
+    out = CLIP_SH_0_255(dst10_r);
+    out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset, constant;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    src0_ptr += (4 * src_stride);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+
+    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+    dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
+
+    HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst10, dst32);
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
+    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t weight, offset, constant;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+        src0_ptr += (6 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+        dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
+        dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
+
+        LD_SB2(src0_ptr, src_stride, src9, src2);
+        src0_ptr += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+        dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
+        ST4x8_UB(dst10, dst32, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (4 == height) {
+        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+
+        LD_SB2(src0_ptr, src_stride, src1, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST6x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, tmp0, tmp1;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       tmp0, tmp1);
+
+    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(tmp0, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+    tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+    tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+    tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
+    tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
+    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       tmp0, tmp1, tmp2, tmp3);
+    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       tmp4, tmp5);
+
+    PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+    tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(tmp3, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+
+        LD_SB2(src0_ptr, src_stride, src1, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (6 == height) {
+        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else {
+        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (1 * src_stride);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B2_128_SB(src3, src4);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+
+        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
+        tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp2, tmp3);
+        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp4, tmp5);
+
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+        ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        dst += (2 * dst_stride);
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
+        tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width */
+        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+        /* 8width */
+        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp4, tmp5);
+        /* 8width */
+        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp2, tmp3);
+        /* 16width */
+        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
+        /* 8width */
+        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        ST8x2_UB(tmp2, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width */
+        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
+        tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
+        tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
+        /* 8width */
+        tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp4, tmp5);
+        /* 8width */
+        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp2, tmp3);
+        /* 16width */
+        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
+
+        /* 8width */
+        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        ST8x2_UB(tmp2, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;
+    int32_t offset, weight, constant;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    constant = 128 * weight1;
+    constant <<= 6;
+    offset += constant;
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* next 16width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 16width */
+        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp0, tmp1, tmp4, tmp5);
+        /* 16width */
+        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
+        ST_SH2(tmp0, tmp1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        /* next 16width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+        /* next 16width */
+        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
+        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
+        /* next 16width */
+        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                           in4, in5, in6, in7,
+                           weight_vec, rnd_vec, offset_vec,
+                           tmp2, tmp3, tmp6, tmp7);
+
+        /* next 16width */
+        PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
+        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint64_t tp0, tp1;
+    int32_t offset, weight;
+    v8i16 in0 = { 0 };
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, tmp, weight_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
+    v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
+
+    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
+    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
+
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst0 >>= 6;
+    dst1 >>= 6;
+    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in0);
+
+    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    SRAR_W2_SW(dst0, dst1, rnd_vec);
+    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint64_t tp0, tp1;
+    int32_t offset, weight;
+    v16u8 out;
+    v8i16 in0 = { 0 }, in1 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, weight_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst30, dst41, dst52, dst63;
+    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
+    v4i32 offset_vec, rnd_vec, const_vec;
+    v4i32 dst0, dst1, dst2, dst3;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
+
+    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
+    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
+    SRA_4V(dst0, dst1, dst2, dst3, 6);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
+
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in0);
+    src1_ptr += (2 * src2_stride);
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in1);
+
+    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
+
+    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1;
+    int32_t offset, weight;
+    v16u8 out0, out1;
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, weight_vec;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst98_r, dst109_r;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
+
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in0);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in1);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in2);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in3);
+
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
+                    dst2, dst3);
+        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
+                    tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (4 == height) {
+        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter_x, filter_y,
+                                         height, weight0, weight1,
+                                         offset0, offset1, rnd_val);
+    }
+}
+
+static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t tpw0, tpw1, tpw2, tpw3;
+    uint64_t tp0, tp1;
+    int32_t offset, weight;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v8i16 in4 = { 0 }, in5 = { 0 };
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
+    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
+    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
+           src10);
+    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
+
+    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
+    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
+    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
+    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
+    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
+    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
+
+    LD2(src1_ptr, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in0);
+    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in1);
+
+    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in2);
+    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
+    INSERT_D2_SH(tp0, tp1, in3);
+
+    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
+                tmp2, tmp3);
+    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST4x8_UB(out0, out1, dst, dst_stride);
+
+    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
+
+    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
+    src1_ptr += (4 * src2_stride);
+    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
+    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
+    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
+
+    ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
+    ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
+
+    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
+
+    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST2x4_UB(out2, 0, dst + 4, dst_stride);
+    dst += 4 * dst_stride;
+    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, weight_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v8i16 in0, in1;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
+
+    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
+
+    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
+                                         int32_t src_stride,
+                                         int16_t *src1_ptr,
+                                         int32_t src2_stride,
+                                         uint8_t *dst,
+                                         int32_t dst_stride,
+                                         const int8_t *filter_x,
+                                         const int8_t *filter_y,
+                                         int32_t weight0,
+                                         int32_t weight1,
+                                         int32_t offset0,
+                                         int32_t offset1,
+                                         int32_t rnd_val,
+                                         int32_t width8mult)
+{
+    int32_t weight, offset;
+    uint32_t cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+    weight_vec = (v8i16) __msa_fill_w(weight);
+
+    for (cnt = width8mult; cnt--;) {
+        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr += 8;
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += 8;
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, dst0, dst1, dst2, dst3);
+
+        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += 8;
+    }
+}
+
+static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t offset, weight;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, weight_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
+    src0_ptr += (5 * src_stride);
+    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
+
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
+    dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
+    dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
+    dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
+    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
+                dst0, dst1, dst2, dst3);
+
+    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+
+    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
+    ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
+    ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
+    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
+    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    int32_t offset, weight;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = (v8i16) __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+        src0_ptr_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            src1_ptr_tmp += (4 * src2_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+            dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+            dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+            dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+            dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+            ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+            ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                        dst3_r, dst0, dst1, dst2, dst3);
+            ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+            ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+            ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+            ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+            dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+            dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+            dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+            dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+            dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+            dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+            dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+            dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+            SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+            SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                        tmp0, tmp1, tmp2, tmp3);
+            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dsth2 = dsth6;
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (4 == height) {
+        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
+                                     src2_stride, dst, dst_stride, filter_x,
+                                     filter_y, weight0, weight1, offset0,
+                                     offset1, rnd_val, 1);
+    } else if (6 == height) {
+        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 4)) {
+        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter_x, filter_y,
+                                         height, weight0,
+                                         weight1, offset0, offset1, rnd_val, 8);
+    }
+}
+
+static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1;
+    int32_t offset, weight;
+    uint8_t *src0_ptr_tmp, *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
+    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 offset_vec, rnd_vec, const_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_fill_w((128 * weight1));
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    offset_vec += const_vec;
+    weight_vec = (v8i16) __msa_fill_w(weight);
+
+    src0_ptr_tmp = src0_ptr;
+    dst_tmp = dst;
+    src1_ptr_tmp = src1_ptr;
+
+    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+    src0_ptr_tmp += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+        src0_ptr_tmp += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+        src1_ptr_tmp += (4 * src2_stride);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, dst0, dst1, dst2, dst3);
+        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dsth2 = dsth6;
+    }
+
+    src0_ptr += 8;
+    dst += 8;
+    src1_ptr += 8;
+
+    mask2 = LD_SB(ff_hevc_mask_arr + 16);
+    mask3 = mask2 + 2;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
+
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
+               src10);
+        src0_ptr += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
+
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in0);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in1);
+
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in2);
+        LD2(src1_ptr, src2_stride, tp0, tp1);
+        src1_ptr += 2 * src2_stride;
+        INSERT_D2_SH(tp0, tp1, in3);
+
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    dst0, dst1, dst2, dst3);
+        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
+        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
+        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
+        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
+        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
+        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
+        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
+        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
+        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
+        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
+        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
+        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    tmp0, tmp1, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    if (4 == height) {
+        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
+                                     src2_stride, dst, dst_stride, filter_x,
+                                     filter_y, weight0, weight1, offset0,
+                                     offset1, rnd_val, 2);
+    } else {
+        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                         src2_stride, dst, dst_stride,
+                                         filter_x, filter_y, height, weight0,
+                                         weight1, offset0, offset1, rnd_val, 16);
+    }
+}
+
+static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 24);
+}
+
+static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 32);
+}
+
+#define BI_W_MC_COPY(WIDTH)                                                  \
+void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,           \
+                                                     ptrdiff_t dst_stride,   \
+                                                     uint8_t *src,           \
+                                                     ptrdiff_t src_stride,   \
+                                                     int16_t *src_16bit,     \
+                                                     int height,             \
+                                                     int denom,              \
+                                                     int weight0,            \
+                                                     int weight1,            \
+                                                     int offset0,            \
+                                                     int offset1,            \
+                                                     intptr_t mx,            \
+                                                     intptr_t my,            \
+                                                     int width)              \
+{                                                                            \
+    int shift = 14 + 1 - 8;                                                  \
+    int log2Wd = denom + shift - 1;                                          \
+                                                                             \
+    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
+                                   dst, dst_stride, height,                  \
+                                   weight0, weight1, offset0,                \
+                                   offset1, log2Wd);                         \
+}
+
+BI_W_MC_COPY(4);
+BI_W_MC_COPY(6);
+BI_W_MC_COPY(8);
+BI_W_MC_COPY(12);
+BI_W_MC_COPY(16);
+BI_W_MC_COPY(24);
+BI_W_MC_COPY(32);
+BI_W_MC_COPY(48);
+BI_W_MC_COPY(64);
+
+#undef BI_W_MC_COPY
+
+#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \
+                                                        ptrdiff_t             \
+                                                        dst_stride,           \
+                                                        uint8_t *src,         \
+                                                        ptrdiff_t             \
+                                                        src_stride,           \
+                                                        int16_t *src_16bit,   \
+                                                        int height,           \
+                                                        int denom,            \
+                                                        int weight0,          \
+                                                        int weight1,          \
+                                                        int offset0,          \
+                                                        int offset1,          \
+                                                        intptr_t mx,          \
+                                                        intptr_t my,          \
+                                                        int width)            \
+{                                                                             \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
+    int log2Wd = denom + 14 - 8;                                              \
+                                                                              \
+    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
+                                                MAX_PB_SIZE, dst, dst_stride, \
+                                                filter, height, weight0,      \
+                                                weight1, offset0, offset1,    \
+                                                log2Wd);                      \
+}
+
+BI_W_MC(qpel, h, 4, 8, hz, mx);
+BI_W_MC(qpel, h, 8, 8, hz, mx);
+BI_W_MC(qpel, h, 12, 8, hz, mx);
+BI_W_MC(qpel, h, 16, 8, hz, mx);
+BI_W_MC(qpel, h, 24, 8, hz, mx);
+BI_W_MC(qpel, h, 32, 8, hz, mx);
+BI_W_MC(qpel, h, 48, 8, hz, mx);
+BI_W_MC(qpel, h, 64, 8, hz, mx);
+
+BI_W_MC(qpel, v, 4, 8, vt, my);
+BI_W_MC(qpel, v, 8, 8, vt, my);
+BI_W_MC(qpel, v, 12, 8, vt, my);
+BI_W_MC(qpel, v, 16, 8, vt, my);
+BI_W_MC(qpel, v, 24, 8, vt, my);
+BI_W_MC(qpel, v, 32, 8, vt, my);
+BI_W_MC(qpel, v, 48, 8, vt, my);
+BI_W_MC(qpel, v, 64, 8, vt, my);
+
+BI_W_MC(epel, h, 4, 4, hz, mx);
+BI_W_MC(epel, h, 8, 4, hz, mx);
+BI_W_MC(epel, h, 6, 4, hz, mx);
+BI_W_MC(epel, h, 12, 4, hz, mx);
+BI_W_MC(epel, h, 16, 4, hz, mx);
+BI_W_MC(epel, h, 24, 4, hz, mx);
+BI_W_MC(epel, h, 32, 4, hz, mx);
+
+BI_W_MC(epel, v, 4, 4, vt, my);
+BI_W_MC(epel, v, 8, 4, vt, my);
+BI_W_MC(epel, v, 6, 4, vt, my);
+BI_W_MC(epel, v, 12, 4, vt, my);
+BI_W_MC(epel, v, 16, 4, vt, my);
+BI_W_MC(epel, v, 24, 4, vt, my);
+BI_W_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_W_MC
+
+#define BI_W_MC_HV(PEL, WIDTH, TAP)                                         \
+void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
+                                                     ptrdiff_t dst_stride,  \
+                                                     uint8_t *src,          \
+                                                     ptrdiff_t src_stride,  \
+                                                     int16_t *src_16bit,    \
+                                                     int height,            \
+                                                     int denom,             \
+                                                     int weight0,           \
+                                                     int weight1,           \
+                                                     int offset0,           \
+                                                     int offset1,           \
+                                                     intptr_t mx,           \
+                                                     intptr_t my,           \
+                                                     int width)             \
+{                                                                           \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];               \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];               \
+    int log2Wd = denom + 14 - 8;                                            \
+                                                                            \
+    hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
+                                          MAX_PB_SIZE, dst, dst_stride,     \
+                                          filter_x, filter_y, height,       \
+                                          weight0, weight1, offset0,        \
+                                          offset1, log2Wd);                 \
+}
+
+BI_W_MC_HV(qpel, 4, 8);
+BI_W_MC_HV(qpel, 8, 8);
+BI_W_MC_HV(qpel, 12, 8);
+BI_W_MC_HV(qpel, 16, 8);
+BI_W_MC_HV(qpel, 24, 8);
+BI_W_MC_HV(qpel, 32, 8);
+BI_W_MC_HV(qpel, 48, 8);
+BI_W_MC_HV(qpel, 64, 8);
+
+BI_W_MC_HV(epel, 4, 4);
+BI_W_MC_HV(epel, 8, 4);
+BI_W_MC_HV(epel, 6, 4);
+BI_W_MC_HV(epel, 12, 4);
+BI_W_MC_HV(epel, 16, 4);
+BI_W_MC_HV(epel, 24, 4);
+BI_W_MC_HV(epel, 32, 4);
+
+#undef BI_W_MC_HV
diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
new file mode 100644
index 0000000..740c970
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -0,0 +1,4147 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1)                              \
+{                                                                           \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
+                                                                            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                  \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                 \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
+    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                 \
+    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
+    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                 \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2, mask3,                \
+                                   filt0, filt1, filt2, filt3,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,  \
+                 out0, out1, out2, out3);                                     \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    if (2 == height) {
+        LD2(src, src_stride, out0, out1);
+        SD(out0, dst);
+        dst += dst_stride;
+        SD(out1, dst);
+    } else if (6 == height) {
+        LD4(src, src_stride, out0, out1, out2, out3);
+        src += (4 * src_stride);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        LD2(src, src_stride, out0, out1);
+        SD(out0, dst);
+        dst += dst_stride;
+        SD(out1, dst);
+    } else if (0 == (height % 8)) {
+        for (cnt = (height >> 3); cnt--;) {
+            LD4(src, src_stride, out0, out1, out2, out3);
+            src += (4 * src_stride);
+            LD4(src, src_stride, out4, out5, out6, out7);
+            src += (4 * src_stride);
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 4)) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD4(src, src_stride, out0, out1, out2, out3);
+            src += (4 * src_stride);
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width12_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+}
+
+static void copy_width16_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (12 == height) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        dst += (8 * dst_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    } else if (0 == (height % 8)) {
+        for (cnt = (height >> 3); cnt--;) {
+            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
+                   src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
+                   dst_stride);
+            dst += (8 * dst_stride);
+        }
+    } else if (0 == (height % 4)) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width24_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    for (cnt = 4; cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD4(src + 16, src_stride, out0, out1, out2, out3);
+        src += (4 * src_stride);
+        LD4(src + 16, src_stride, out4, out5, out6, out7);
+        src += (4 * src_stride);
+
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void copy_width32_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+        src += (4 * src_stride);
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void copy_width48_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
+        src += (4 * src_stride);
+
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void copy_width64_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, 16, src0, src1, src2, src3);
+        src += src_stride;
+        LD_UB4(src, 16, src4, src5, src6, src7);
+        src += src_stride;
+        LD_UB4(src, 16, src8, src9, src10, src11);
+        src += src_stride;
+        LD_UB4(src, 16, src12, src13, src14, src15);
+        src += src_stride;
+
+        ST_UB4(src0, src1, src2, src3, dst, 16);
+        dst += dst_stride;
+        ST_UB4(src4, src5, src6, src7, dst, 16);
+        dst += dst_stride;
+        ST_UB4(src8, src9, src10, src11, dst, 16);
+        dst += dst_stride;
+        ST_UB4(src12, src13, src14, src15, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    SRARI_H2_SH(out0, out1, 6);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (16 == height) {
+        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
+        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
+        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
+                     out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
+        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
+        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
+                     out0, out1, out2, out3);
+
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
+    v16u8 tmp0, tmp1, tmp2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3, out4, out5;
+
+    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
+    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
+
+    src = src - 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask00 + 2;
+    mask2 = mask00 + 4;
+    mask3 = mask00 + 6;
+    mask4 = mask0 + 2;
+    mask5 = mask0 + 4;
+    mask6 = mask0 + 6;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        /* 8 width */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        /* 4 width */
+        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(src4, src5, src6, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
+                    out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
+                     out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
+                     out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
+                     out1, out2, out3);
+
+        /* 4 width */
+        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
+        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
+        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
+        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
+        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);
+
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SRARI_H2_SH(out4, out5, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH2_SH(out4, out5, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        tmp2 = PCKEV_XORI128_UB(out4, out5);
+
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        src += (2 * src_stride);
+
+        LD_SB2(src, src_stride, src4, src6);
+        LD_SB2(src + 8, src_stride, src5, src7);
+        src += (2 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(src4, src5, src6, src7);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+
+        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 out0, out1, out2, out3, out8, out9, filt;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = 16; loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 16, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
+        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
+        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
+                    out8, out2, out9);
+        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
+        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
+        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
+                     out0, out8, out2, out9);
+        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
+        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
+                     out0, out8, out2, out9);
+        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
+        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
+                     out0, out8, out2, out9);
+        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
+        SRARI_H4_SH(out0, out8, out2, out9, 6);
+        SRARI_H2_SH(out1, out3, 6);
+        SAT_SH4_SH(out0, out8, out2, out9, 7);
+        SAT_SH2_SH(out1, out3, 7);
+        out = PCKEV_XORI128_UB(out8, out9);
+        ST8x2_UB(out, dst + 16, dst_stride);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src1 = LD_SB(src + 8);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        src4 = LD_SB(src);
+        src5 = LD_SB(src + 8);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B4_128_SB(src4, src5, src6, src7);
+
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+
+        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
+    v16i8 src4;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = 64; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src1 = LD_SB(src + 8);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 32);
+        src4 = LD_SB(src + 40);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
+
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
+
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt3);
+
+        SRARI_H2_SH(out0, out1, 6);
+        out3 = __msa_srari_h(out2, 6);
+        SAT_SH3_SH(out0, out1, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+
+        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
+        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
+        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
+        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt3);
+
+        SRARI_H2_SH(out0, out1, 6);
+        out2 = __msa_srari_h(out2, 6);
+        SAT_SH3_SH(out0, out1, out2, 7);
+        out = PCKEV_XORI128_UB(out3, out0);
+        ST_UB(out, dst + 16);
+        out = PCKEV_XORI128_UB(out1, out2);
+        ST_UB(out, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 filt0, filt1, filt2, filt3;
+    v8i16 res0, res1, res2, res3, filt;
+
+    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += src_stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
+                    res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
+                     res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
+                     res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
+                     res1, res2, res3);
+
+        SRARI_H4_SH(res0, res1, res2, res3, 6);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        out = PCKEV_XORI128_UB(res0, res1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(res2, res3);
+        ST_UB(out, dst + 16);
+
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
+                    res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
+                     res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
+                     res1, res2, res3);
+        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
+                     res1, res2, res3);
+
+        SRARI_H4_SH(res0, res1, res2, res3, 6);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        out = PCKEV_XORI128_UB(res0, res1);
+        ST_UB(out, dst + 32);
+        out = PCKEV_XORI128_UB(res2, res3);
+        ST_UB(out, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v8i16 filt, out10, out32, out54, out76;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        LD_SB4(src, src_stride, src11, src12, src13, src14);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
+                   src12111110, src14131312);
+        XORI_B2_128_SB(src8776, src10998);
+        XORI_B2_128_SB(src12111110, src14131312);
+
+        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
+        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
+        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
+        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
+        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
+        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
+        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
+        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
+        SRARI_H2_SH(out10, out32, 6);
+        SRARI_H2_SH(out54, out76, 6);
+        SAT_SH2_SH(out10, out32, 7);
+        SAT_SH2_SH(out54, out76, 7);
+        out0 = PCKEV_XORI128_UB(out10, out32);
+        out1 = PCKEV_XORI128_UB(out54, out76);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
+                    filt0, out0_r, out1_r, out2_r, out3_r);
+        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
+                     filt1, out0_r, out1_r, out2_r, out3_r);
+        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
+                     filt2, out0_r, out1_r, out2_r, out3_r);
+        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
+                     filt3, out0_r, out1_r, out2_r, out3_r);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    uint32_t out2, out3;
+    uint64_t out0, out1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+
+        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
+        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
+        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
+        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
+        SD(out0, dst);
+        SW(out2, (dst + 8));
+        dst += dst_stride;
+        SD(out1, dst);
+        SW(out3, (dst + 8));
+        dst += dst_stride;
+        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
+        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
+        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
+        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
+        SD(out0, dst);
+        SW(out2, (dst + 8));
+        dst += dst_stride;
+        SD(out1, dst);
+        SW(out3, (dst + 8));
+        dst += dst_stride;
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      int32_t width)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
+                                       filt0, filt1, filt2, filt3);
+            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
+                                       filt0, filt1, filt2, filt3);
+            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
+                                       filt0, filt1, filt2, filt3);
+            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
+                                       filt0, filt1, filt2, filt3);
+            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
+                                       filt0, filt1, filt2, filt3);
+            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
+                                       filt0, filt1, filt2, filt3);
+            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
+                                       filt0, filt1, filt2, filt3);
+            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
+                                       filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              16);
+
+    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
+                        height);
+}
+
+static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              32);
+}
+
+static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              48);
+}
+
+static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              64);
+}
+
+static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src9, src10, src11, src12, src13, src14;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+
+    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
+               src14);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
+
+        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+
+        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                   filt3);
+        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
+                                   filt2, filt3);
+        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                   filt2, filt3);
+
+        dst76_r = __msa_ilvr_h(dst117, dst66);
+        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
+        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
+        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
+        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
+        dst1110_r = __msa_ilvr_h(dst117, dst1410);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
+        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
+        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
+        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
+        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst32_r = dst1110_r;
+        dst54_r = dst1312_r;
+        dst21_r = dst109_r;
+        dst43_r = dst1211_r;
+        dst65_r = dst1413_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
+    }
+}
+
+static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter_x,
+                                           const int8_t *filter_y,
+                                           int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= ((3 * src_stride) + 3);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src_tmp += 2 * src_stride;
+
+            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                       dst10_r, dst32_r, dst54_r, dst21_r);
+            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                       dst10_l, dst32_l, dst54_l, dst21_l);
+            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
+
+            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
+            out = PCKEV_XORI128_UB(dst0, dst1);
+            ST8x2_UB(out, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst0 = dst2;
+            dst1 = dst3;
+            dst2 = dst4;
+            dst3 = dst5;
+            dst4 = dst6;
+            dst5 = dst7;
+            dst6 = dst8;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src13, src14;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
+    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
+    v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
+    v8i16 dst1413_r, dst87_l, filter_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l;
+
+    src -= ((3 * src_stride) + 3);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
+               vec15);
+    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                             filt3);
+    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                             filt3);
+    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                             filt2, filt3);
+
+    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                             filt3);
+    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                             filt3);
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB2(src_tmp, src_stride, src7, src8);
+        XORI_B2_128_SB(src7, src8);
+        src_tmp += 2 * src_stride;
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
+                   dst32_r, dst54_r, dst21_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
+                   dst32_l, dst54_l, dst21_l);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+
+        ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+        SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
+
+        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
+        out0 = PCKEV_XORI128_UB(dst0, dst1);
+        ST8x2_UB(out0, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        dst0 = dst2;
+        dst1 = dst3;
+        dst2 = dst4;
+        dst3 = dst5;
+        dst4 = dst6;
+        dst5 = dst7;
+        dst6 = dst8;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask4 = LD_SB(ff_hevc_mask_arr + 16);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
+               vec15);
+
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+
+    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
+               src14);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
+
+        VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
+                   vec7);
+        VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
+                   vec11);
+        VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
+                   vec14, vec15);
+
+        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                   filt3);
+        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
+                                   filt2, filt3);
+        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                   filt2, filt3);
+
+        dst76_r = __msa_ilvr_h(dst117, dst66);
+        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
+        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
+        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
+        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
+        dst1110_r = __msa_ilvr_h(dst117, dst1410);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
+        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
+        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
+        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
+        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst32_r = dst1110_r;
+        dst54_r = dst1312_r;
+        dst21_r = dst109_r;
+        dst43_r = dst1211_r;
+        dst65_r = dst1413_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
+    }
+}
+
+static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 64);
+}
+
+static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
+    v16u8 out;
+    v8i16 filt, res0;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    res0 = __msa_srari_h(res0, 6);
+    res0 = __msa_sat_s_h(res0, 7);
+    out = PCKEV_XORI128_UB(res0, res0);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    SRARI_H2_SH(out0, out1, 6);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (2 == height) {
+        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (4 == height) {
+        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (16 == height) {
+        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out4, out5;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out4 = PCKEV_XORI128_UB(out0, out1);
+    out5 = PCKEV_XORI128_UB(out2, out3);
+    ST6x4_UB(out4, out5, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 6);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out4 = PCKEV_XORI128_UB(out0, out1);
+    out5 = PCKEV_XORI128_UB(out2, out3);
+    ST6x4_UB(out4, out5, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, vec0, vec1, vec2, vec3;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    src -= 1;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
+        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
+        SRARI_H2_SH(vec0, vec1, 6);
+        SAT_SH2_SH(vec0, vec1, 7);
+        out = PCKEV_XORI128_UB(vec0, vec1);
+        ST8x2_UB(out, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if ((2 == height) || (6 == height)) {
+        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    } else {
+        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask2 = LD_SB(&ff_hevc_mask_arr[32]);
+
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
+        SRARI_H2_SH(out0, out1, 6);
+        SAT_SH2_SH(out0, out1, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
+        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out2, out3, out4, out5);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
+                     out2, out3, out4, out5);
+        SRARI_H4_SH(out2, out3, out4, out5, 6);
+        SAT_SH4_SH(out2, out3, out4, out5, 7);
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        tmp1 = PCKEV_XORI128_UB(out4, out5);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
+        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
+        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
+        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
+        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
+                     out4, out5, out6, out7);
+        SRARI_H4_SH(out4, out5, out6, out7, 6);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint8_t *dst1 = dst + 16;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
+    v8i16 filt, out0, out1, out2, out3;
+    v16u8 tmp0, tmp1;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask00 = mask0 + 8;
+    mask11 = mask0 + 10;
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
+        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
+
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
+        dst1 += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src1 = LD_SB(src + 8);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src5 = LD_SB(src + 8);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
+        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
+        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
+        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
+        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
+                     out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 6);
+        SRARI_H4_SH(out4, out5, out6, out7, 6);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v16u8 out;
+    v8i16 filt, out10;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    LD_SB2(src, src_stride, src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+    out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+    out10 = __msa_srari_h(out10, 6);
+    out10 = __msa_sat_s_h(out10, 7);
+    out = PCKEV_XORI128_UB(out10, out10);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride,
+                                         const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+        out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+
+        src2 = LD_SB(src);
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
+        SRARI_H2_SH(out10, out32, 6);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (2 == height) {
+        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
+                                     height);
+    }
+}
+
+static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
+
+    src -= src_stride;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+
+    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+
+    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
+    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
+    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
+    ST6x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
+
+    dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
+    dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+
+    dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+    dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+
+    SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
+    out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
+    out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
+    ST6x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
+    v16u8 out;
+
+    src -= src_stride;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
+    tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
+    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
+    tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
+    SRARI_H2_SH(tmp0, tmp1, 6);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    uint32_t loop_cnt;
+    uint64_t out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
+    v8i16 filt, filt0, filt1;
+
+    src -= src_stride;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+
+        XORI_B3_128_SB(src3, src4, src5);
+        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
+        tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
+        SRARI_H2_SH(tmp0, tmp1, 6);
+        tmp2 = __msa_srari_h(tmp2, 6);
+        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
+        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
+        XORI_B2_128_SH(tmp0, tmp2);
+
+        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
+        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
+        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
+        SD(out0, dst);
+        dst += dst_stride;
+        SD(out1, dst);
+        dst += dst_stride;
+        SD(out2, dst);
+        dst += dst_stride;
+
+        src2 = src5;
+        vec0 = vec3;
+        vec2 = vec4;
+    }
+}
+
+static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r);
+        out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
+        out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
+        out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
+        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (2 == height) {
+        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (6 == height) {
+        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
+                                 filter, height);
+    }
+}
+
+static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16u8 out0, out1;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
+    v8i16 filter_vec;
+
+    src -= (1 * src_stride);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
+        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+        dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+        dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+        dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
+
+        SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRARI_H2_SH(dst0_l, dst1_l, 6);
+        SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
+        SAT_SH2_SH(dst0_l, dst1_l, 7);
+        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
+        out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2 = src6;
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src2110 = src6554;
+    }
+}
+
+static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+        out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+        out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
+        out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, filt0, filt1;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
+    v16u8 out;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    /* 16 width */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* 8 width */
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        /* 16 width */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8 width */
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        /* 16 width */
+        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+
+        /* 8 width */
+        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+
+        /* 16 + 8 width */
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SRARI_H2_SH(out0_l, out1_l, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH2_SH(out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
+        XORI_B2_128_SH(out2_r, out3_r);
+        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
+        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
+        SD(out0, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst);
+        SD(out1, dst + 16);
+        dst += dst_stride;
+
+        /* 16 width */
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        /* 8 width */
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        /* 16 width */
+        out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
+        out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
+        out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
+        out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
+
+        /* 8 width */
+        out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
+        out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
+
+        /* 16 + 8 width */
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        SRARI_H2_SH(out0_l, out1_l, 6);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH2_SH(out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2_r, out2_r);
+        ST8x1_UB(out, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out3_r, out3_r);
+        ST8x1_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt;
+    v16i8 filt0, filt1;
+    v16u8 out;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    /* 16 width */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* next 16 width */
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16 width */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 16 width */
+        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+
+        /* 16 width */
+        SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
+        SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst + dst_stride);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        /* next 16 width */
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+        /* next 16 width */
+        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
+        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
+
+        /* next 16 width */
+        SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
+        SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
+        out = PCKEV_XORI128_UB(out2_r, out2_l);
+        ST_UB(out, dst + 16);
+        out = PCKEV_XORI128_UB(out3_r, out3_l);
+        ST_UB(out, dst + 16 + dst_stride);
+
+        dst += 2 * dst_stride;
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, tmp;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
+    v4i32 dst0, dst1;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
+
+    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
+    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
+
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst0 >>= 6;
+    dst1 >>= 6;
+    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+    tmp = __msa_srari_h(tmp, 6);
+    tmp = __msa_sat_s_h(tmp, 7);
+    out = PCKEV_XORI128_UB(tmp, tmp);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filter_vec, tmp0, tmp1;
+    v8i16 dst30, dst41, dst52, dst63;
+    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
+    v4i32 dst0, dst1, dst2, dst3;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
+
+    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
+    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
+    SRA_4V(dst0, dst1, dst2, dst3, 6);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+    SRARI_H2_SH(tmp0, tmp1, 6);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter_x,
+                                           const int8_t *filter_y,
+                                           int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst98_r, dst109_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
+
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y);
+    } else if (0 == (height % 8)) {
+        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                       filter_x, filter_y, height);
+    }
+}
+
+static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
+    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
+    v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
+    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
+
+    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
+    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
+
+    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
+    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
+    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+    SRARI_H2_SH(tmp4, tmp5, 6);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3,7);
+    SAT_SH2_SH(tmp4, tmp5,7);
+    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    out2 = PCKEV_XORI128_UB(tmp4, tmp5);
+    ST4x8_UB(out0, out1, dst, dst_stride);
+    ST2x4_UB(out2, 0, dst + 4, dst_stride);
+    dst += 4 * dst_stride;
+    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+}
+
+static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
+    SRARI_H2_SH(out0_r, out1_r, 6);
+    SAT_SH2_SH(out0_r, out1_r, 7);
+    out = PCKEV_XORI128_UB(out0_r, out1_r);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t width8mult)
+{
+    uint32_t cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    for (cnt = width8mult; cnt--;) {
+        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src += 8;
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y)
+{
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
+
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
+    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
+    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
+    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
+    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
+    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+    SRARI_H2_SH(out4_r, out5_r, 6);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH2_SH(out4_r, out5_r, 7);
+    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
+
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter_x,
+                                           const int8_t *filter_y,
+                                           int32_t height,
+                                           int32_t width8mult)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    for (cnt = width8mult; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r,
+                        out0_r, out1_r, out2_r, out3_r);
+
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
+            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
+            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, 1);
+    } else if (6 == height) {
+        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y);
+    } else if (0 == (height % 4)) {
+        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter_x, filter_y, height, 1);
+    }
+}
+
+static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB3(src_tmp, src_stride, src0, src1, src2);
+    src_tmp += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+        src_tmp += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dsth2 = dsth6;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask2 = LD_SB(ff_hevc_mask_arr + 16);
+    mask3 = mask2 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
+
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
+
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    if (4 == height) {
+        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
+                                   filter_y, 2);
+    } else {
+        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter_x, filter_y, height, 2);
+    }
+}
+
+static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 4);
+}
+
+#define UNI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                    ptrdiff_t dst_stride,  \
+                                                    uint8_t *src,          \
+                                                    ptrdiff_t src_stride,  \
+                                                    int height,            \
+                                                    intptr_t mx,           \
+                                                    intptr_t my,           \
+                                                    int width)             \
+{                                                                          \
+    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
+}
+
+UNI_MC_COPY(8);
+UNI_MC_COPY(12);
+UNI_MC_COPY(16);
+UNI_MC_COPY(24);
+UNI_MC_COPY(32);
+UNI_MC_COPY(48);
+UNI_MC_COPY(64);
+
+#undef UNI_MC_COPY
+
+#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
+                                                       ptrdiff_t dst_stride,   \
+                                                       uint8_t *src,           \
+                                                       ptrdiff_t src_stride,   \
+                                                       int height,             \
+                                                       intptr_t mx,            \
+                                                       intptr_t my,            \
+                                                       int width)              \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+                                                                               \
+    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
+                                            filter, height);                   \
+}
+
+UNI_MC(qpel, h, 4, 8, hz, mx);
+UNI_MC(qpel, h, 8, 8, hz, mx);
+UNI_MC(qpel, h, 12, 8, hz, mx);
+UNI_MC(qpel, h, 16, 8, hz, mx);
+UNI_MC(qpel, h, 24, 8, hz, mx);
+UNI_MC(qpel, h, 32, 8, hz, mx);
+UNI_MC(qpel, h, 48, 8, hz, mx);
+UNI_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_MC(qpel, v, 4, 8, vt, my);
+UNI_MC(qpel, v, 8, 8, vt, my);
+UNI_MC(qpel, v, 12, 8, vt, my);
+UNI_MC(qpel, v, 16, 8, vt, my);
+UNI_MC(qpel, v, 24, 8, vt, my);
+UNI_MC(qpel, v, 32, 8, vt, my);
+UNI_MC(qpel, v, 48, 8, vt, my);
+UNI_MC(qpel, v, 64, 8, vt, my);
+
+UNI_MC(epel, h, 4, 4, hz, mx);
+UNI_MC(epel, h, 6, 4, hz, mx);
+UNI_MC(epel, h, 8, 4, hz, mx);
+UNI_MC(epel, h, 12, 4, hz, mx);
+UNI_MC(epel, h, 16, 4, hz, mx);
+UNI_MC(epel, h, 24, 4, hz, mx);
+UNI_MC(epel, h, 32, 4, hz, mx);
+
+UNI_MC(epel, v, 4, 4, vt, my);
+UNI_MC(epel, v, 6, 4, vt, my);
+UNI_MC(epel, v, 8, 4, vt, my);
+UNI_MC(epel, v, 12, 4, vt, my);
+UNI_MC(epel, v, 16, 4, vt, my);
+UNI_MC(epel, v, 24, 4, vt, my);
+UNI_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_MC
+
+#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
+void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
+                                                    ptrdiff_t dst_stride,  \
+                                                    uint8_t *src,          \
+                                                    ptrdiff_t src_stride,  \
+                                                    int height,            \
+                                                    intptr_t mx,           \
+                                                    intptr_t my,           \
+                                                    int width)             \
+{                                                                          \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
+                                                                           \
+    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
+                                        filter_x, filter_y, height);       \
+}
+
+UNI_MC_HV(qpel, 4, 8);
+UNI_MC_HV(qpel, 8, 8);
+UNI_MC_HV(qpel, 12, 8);
+UNI_MC_HV(qpel, 16, 8);
+UNI_MC_HV(qpel, 24, 8);
+UNI_MC_HV(qpel, 32, 8);
+UNI_MC_HV(qpel, 48, 8);
+UNI_MC_HV(qpel, 64, 8);
+
+UNI_MC_HV(epel, 4, 4);
+UNI_MC_HV(epel, 6, 4);
+UNI_MC_HV(epel, 8, 4);
+UNI_MC_HV(epel, 12, 4);
+UNI_MC_HV(epel, 16, 4);
+UNI_MC_HV(epel, 24, 4);
+UNI_MC_HV(epel, 32, 4);
+
+#undef UNI_MC_HV
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
new file mode 100644
index 0000000..f9ecb41
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -0,0 +1,5368 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
+                                       out0_h, out1_h)                        \
+{                                                                             \
+    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \
+                                                                              \
+    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                              \
+    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                              \
+    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,      \
+                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                   \
+    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
+    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
+    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
+    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \
+}
+
+#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
+                                       offset_h, rnd_w, out0_h, out1_h,    \
+                                       out2_h, out3_h)                     \
+{                                                                          \
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
+                                   out0_h, out1_h);                        \
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
+                                   out2_h, out3_h);                        \
+}
+
+static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
+    v16i8 zero = { 0 };
+    v16u8 out0, out1;
+    v16i8 src0 = { 0 }, src1 = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, offset_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    if (2 == height) {
+        v4i32 dst0_r, dst0_l;
+
+        LW2(src, src_stride, tp0, tp1);
+        INSERT_W2_SB(tp0, tp1, src0);
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+
+        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+        dst0 += offset_vec;
+        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+        ST4x2_UB(out0, dst, dst_stride);
+    } else if (4 == height) {
+        LW4(src, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        SLLI_2V(dst0, dst1, 6);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
+                                       rnd_vec, dst0, dst1);
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == (height % 8)) {
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LW4(src, src_stride, tp0, tp1, tp2, tp3);
+            src += 4 * src_stride;
+            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
+            LW4(src, src_stride, tp0, tp1, tp2, tp3);
+            src += 4 * src_stride;
+            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                           offset_vec, rnd_vec, dst0, dst1,
+                                           dst2, dst3);
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            ST4x8_UB(out0, out1, dst, dst_stride);
+            dst += 8 * dst_stride;
+        }
+    }
+}
+
+static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 zero = { 0 };
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD4(src, src_stride, tp0, tp1, tp2, tp3);
+        src += (4 * src_stride);
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        LD4(src, src_stride, tp0, tp1, tp2, tp3);
+        src += (4 * src_stride);
+        INSERT_D2_SB(tp0, tp1, src2);
+        INSERT_D2_SB(tp2, tp3, src3);
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+
+        ST6x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST6x4_UB(out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v16i8 zero = { 0 };
+    v16u8 out0, out1, out2, out3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    if (2 == height) {
+        LD2(src, src_stride, tp0, tp1);
+        INSERT_D2_SB(tp0, tp1, src0);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        SLLI_2V(dst0, dst1, 6);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
+                                       rnd_vec, dst0, dst1);
+        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(out0, dst, dst_stride);
+    } else if (4 == height) {
+        LD4(src, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+    } else if (6 == height) {
+        LD4(src, src_stride, tp0, tp1, tp2, tp3);
+        src += 4 * src_stride;
+        INSERT_D2_SB(tp0, tp1, src0);
+        INSERT_D2_SB(tp2, tp3, src1);
+        LD2(src, src_stride, tp0, tp1);
+        INSERT_D2_SB(tp0, tp1, src2);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_2V(dst4, dst5, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(out2, dst, dst_stride);
+    } else if (0 == height % 8) {
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD4(src, src_stride, tp0, tp1, tp2, tp3);
+            src += 4 * src_stride;
+            INSERT_D2_SB(tp0, tp1, src0);
+            INSERT_D2_SB(tp2, tp3, src1);
+            LD4(src, src_stride, tp0, tp1, tp2, tp3);
+            src += 4 * src_stride;
+            INSERT_D2_SB(tp0, tp1, src2);
+            INSERT_D2_SB(tp2, tp3, src3);
+
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            ILVRL_B2_SH(zero, src2, dst4, dst5);
+            ILVRL_B2_SH(zero, src3, dst6, dst7);
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            SLLI_4V(dst4, dst5, dst6, dst7, 6);
+            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                           offset_vec, rnd_vec, dst0, dst1,
+                                           dst2, dst3);
+            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                           offset_vec, rnd_vec, dst4, dst5,
+                                           dst6, dst7);
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+            ST8x4_UB(out0, out1, dst, dst_stride);
+            dst += (4 * dst_stride);
+            ST8x4_UB(out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 offset_vec;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_2V(dst4, dst5, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
+    v8i16 dst8, dst9, dst10, dst11;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src4, src5);
+        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
+        src += (4 * src_stride);
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
+        ILVRL_B2_SH(zero, src4, dst6, dst7);
+        ILVRL_B2_SH(zero, src5, dst8, dst9);
+        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        SLLI_4V(dst8, dst9, dst10, dst11, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
+                                       offset_vec, rnd_vec, dst8, dst9, dst10,
+                                       dst11);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
+        ST8x4_UB(out2, out5, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        LD_SB2(src + 16, src_stride, src2, src3);
+        src += (2 * src_stride);
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST_UB2(out0, out1, dst, dst_stride);
+        ST_UB2(out2, out3, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
+    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB3(src, 16, src0, src1, src2);
+        src += src_stride;
+        LD_SB3(src, 16, src3, src4, src5);
+        src += src_stride;
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        ILVRL_B2_SH(zero, src4, dst8, dst9);
+        ILVRL_B2_SH(zero, src5, dst10, dst11);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        SLLI_4V(dst8, dst9, dst10, dst11, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
+                                       offset_vec, rnd_vec, dst8, dst9, dst10,
+                                       dst11);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+        ST_UB2(out0, out1, dst, 16);
+        ST_UB(out2, dst + 32);
+        dst += dst_stride;
+        ST_UB2(out3, out4, dst, 16);
+        ST_UB(out5, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
+    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+    v4i32 weight_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_h(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB4(src, 16, src0, src1, src2, src3);
+        src += src_stride;
+        LD_SB4(src, 16, src4, src5, src6, src7);
+        src += src_stride;
+
+        ILVRL_B2_SH(zero, src0, dst0, dst1);
+        ILVRL_B2_SH(zero, src1, dst2, dst3);
+        ILVRL_B2_SH(zero, src2, dst4, dst5);
+        ILVRL_B2_SH(zero, src3, dst6, dst7);
+        ILVRL_B2_SH(zero, src4, dst8, dst9);
+        ILVRL_B2_SH(zero, src5, dst10, dst11);
+        ILVRL_B2_SH(zero, src6, dst12, dst13);
+        ILVRL_B2_SH(zero, src7, dst14, dst15);
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        SLLI_4V(dst8, dst9, dst10, dst11, 6);
+        SLLI_4V(dst12, dst13, dst14, dst15, 6);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
+                                       offset_vec, rnd_vec, dst8, dst9, dst10,
+                                       dst11);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
+                                       offset_vec, rnd_vec, dst12, dst13, dst14,
+                                       dst15);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
+        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
+        ST_UB4(out0, out1, out2, out3, dst, 16);
+        dst += dst_stride;
+        ST_UB4(out4, out5, out6, out7, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
+    v8i16 filter_vec, dst01, dst23, dst45, dst67;
+    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                  filt3);
+        dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                  filt3);
+        dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                  filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 filter_vec;
+    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
+        src += (4 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
+                   vec4, vec5, vec6, vec7);
+        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                  filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        src += (2 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST_UB2(out0, out1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = 16; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 16, src2, src3);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
+        ST_UB2(out0, out1, dst, dst_stride);
+        ST8x2_UB(out2, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 filter_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB4(src, 8, src0, src1, src2, src3);
+        src += src_stride;
+        LD_SB4(src, 8, src4, src5, src6, src7);
+        src += src_stride;
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST_UB2(out0, out1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out2, out3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = 64; loop_cnt--;) {
+        LD_SB3(src, 16, src0, src1, src2);
+        src3 = LD_SB(src + 40);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST_UB2(out0, out1, dst, 16);
+        ST_UB(out2, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 3;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (cnt = 2; cnt--;) {
+            LD_SB2(src_tmp, 16, src0, src1);
+            src2 = LD_SB(src_tmp + 24);
+            src_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);
+
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec4, vec5, vec6, vec7);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec8, vec9, vec10, vec11);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec12, vec13, vec14, vec15);
+            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+            dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
+                                     filt2, filt3);
+            dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
+                                     filt2, filt3);
+            dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                     filt2, filt3);
+
+            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                           offset_vec, rnd_vec, dst0, dst1,
+                                           dst2, dst3);
+
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            ST_UB2(out0, out1, dst_tmp, 16);
+            dst_tmp += 32;
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src9, src10, src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 filter_vec, dst01, dst23, dst45, dst67;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    ILVR_D3_SB(src21_r, src10_r, src43_r,
+               src32_r, src65_r, src54_r, src2110, src4332, src6554);
+
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src += (8 * src_stride);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
+                                  filt1, filt2, filt3);
+        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
+                                  filt1, filt2, filt3);
+        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
+                                  filt0, filt1, filt2, filt3);
+        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
+                                  filt0, filt1, filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec;
+    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
+                                 filt1, filt2, filt3);
+        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
+                                 filt1, filt2, filt3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               const int8_t *filter,
+                                               int32_t height,
+                                               int32_t weight,
+                                               int32_t offset,
+                                               int32_t rnd_val,
+                                               int32_t weightmul16)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    int32_t loop_cnt, cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v16i8 src98_r, src109_r, src98_l, src109_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = weightmul16; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+
+            ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                       src10_r, src32_r, src54_r, src21_r);
+            ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+            ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                       src10_l, src32_l, src54_l, src21_l);
+            ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                       src76_r, src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                       src76_l, src87_l, src98_l, src109_l);
+
+            dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+            dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
+                                     filt1, filt2, filt3);
+            dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+            dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
+                                     filt1, filt2, filt3);
+            dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+            dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
+                                     filt1, filt2, filt3);
+            dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+            dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
+                                     filt1, filt2, filt3);
+
+            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                           offset_vec, rnd_vec, dst0, dst1,
+                                           dst2, dst3);
+            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                           offset_vec, rnd_vec, dst4, dst5,
+                                           dst6, dst7);
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+            ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src0 = src4;
+            src1 = src5;
+            src2 = src6;
+            src3 = src7;
+            src4 = src8;
+            src5 = src9;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 1);
+}
+
+static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, 32, weight,
+                                       offset, rnd_val, 1);
+
+    hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                             filter, 32, weight, offset, rnd_val);
+}
+
+static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 2);
+}
+
+static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, 64, weight,
+                                       offset, rnd_val, 3);
+}
+
+static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 4);
+}
+
+static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    denom_vec = rnd_vec - 6;
+
+    const_128 = __msa_ldi_w(128);
+    const_128 *= weight_vec;
+    offset_vec += __msa_srar_w(const_128, denom_vec);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+
+    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+
+        dst76_r = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98_r = __msa_ilvr_h(dst66, dst108);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
+        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
+        CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
+        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst32_r = dst76_r;
+        dst54_r = dst98_r;
+        dst21_r = dst65_r;
+        dst43_r = dst87_r;
+        dst65_r = dst109_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= ((3 * src_stride) + 3);
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    denom_vec = rnd_vec - 6;
+
+    const_128 = __msa_ldi_w(128);
+    const_128 *= weight_vec;
+    offset_vec += __msa_srar_w(const_128, denom_vec);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                                 filt2, filt3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                 filt3);
+        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                                 filt3);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            src_tmp += 2 * src_stride;
+            XORI_B2_128_SB(src7, src8);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 8 */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
+                                     filt2, filt3);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
+            MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
+            SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
+            ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+            ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
+            CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst0_l, dst1_l);
+
+            PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+            dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 8);
+}
+
+static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
+    v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+    v8i16 dst76_l, filter_vec;
+    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
+    v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
+
+    src -= ((3 * src_stride) + 3);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    denom_vec = rnd_vec - 6;
+
+    const_128 = __msa_ldi_w(128);
+    const_128 *= weight_vec;
+    offset_vec += __msa_srar_w(const_128, denom_vec);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
+               vec15);
+    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                             filt3);
+    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                             filt3);
+    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
+                             filt2, filt3);
+    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                             filt3);
+    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                             filt3);
+    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                             filt3);
+
+    for (loop_cnt = 16; loop_cnt--;) {
+        src7 = LD_SB(src_tmp);
+        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
+        src_tmp += src_stride;
+
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                 filt3);
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+        CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
+        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+        out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+        ST8x1_UB(out, dst_tmp);
+        dst_tmp += dst_stride;
+
+        dst0 = dst1;
+        dst1 = dst2;
+        dst2 = dst3;
+        dst3 = dst4;
+        dst4 = dst5;
+        dst5 = dst6;
+        dst6 = dst7;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask4 = LD_SB(ff_hevc_mask_arr + 16);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
+               vec15);
+    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                              filt3);
+    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                              filt3);
+    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
+                              filt3);
+    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
+                              filt3);
+    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
+                   vec7);
+        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+                                  filt3);
+        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
+                                   filt3);
+
+        dst76_r = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98_r = __msa_ilvr_h(dst66, dst108);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+        SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
+        ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
+        CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
+        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst32_r = dst76_r;
+        dst54_r = dst98_r;
+        dst21_r = dst65_r;
+        dst43_r = dst87_r;
+        dst65_r = dst109_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 16);
+}
+
+static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 24);
+}
+
+static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 32);
+}
+
+static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 48);
+}
+
+static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 64);
+}
+
+static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, vec0, vec1;
+    v16i8 mask1;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+
+    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+    dst0 = __msa_adds_s_h(dst0, offset_vec);
+    dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+    ST4x2_UB(out, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+
+    src -= 1;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1);
+
+    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec;
+    v8i16 weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (4 == height) {
+        hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (8 == height || 16 == height) {
+        hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight,
+                                          offset, rnd_val);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    v16u8 out0, out1, out2, out3;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
+    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1, dst2, dst3);
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst4, dst5, dst6, dst7);
+
+    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+    PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+    ST6x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST6x4_UB(out2, out3, dst, dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v8i16 filt0, filt1, dst0, dst1;
+    v16i8 src0, src1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1);
+
+    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1, dst2, dst3);
+
+    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out0, out1, out2;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec11;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+
+    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1, dst2, dst3);
+
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                                   dst4, dst5);
+
+    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2, dst, dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
+        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst4, dst5, dst6, dst7);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (4 == height) {
+        hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (6 == height) {
+        hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else {
+        hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v16i8 mask3, vec11;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
+        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst4, dst5, dst6, dst7);
+
+        PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    out0, out1, out2, out3);
+
+        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1;
+    v16i8 mask0, mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = 16; loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 16, src_stride, src1, src3);
+        src += (2 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
+        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST_UB2(out0, out1, dst, dst_stride);
+        ST8x2_UB(out2, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+        LD_SB2(src, 16, src3, src4);
+        src5 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
+        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst4, dst5, dst6, dst7);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST_UB2(out0, out1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out2, out3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+    dst0 = __msa_adds_s_h(dst0, offset_vec);
+    dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
+    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst0, dst1;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+    dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1);
+
+    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v16i8 src10998;
+    v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src109_r, src98_r, src4332, src6554, src8776, src10998);
+        XORI_B4_128_SB(src4332, src6554, src8776, src10998);
+        dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
+
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                       weight_vec, offset_vec, rnd_vec,
+                                       dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2 = src10;
+        src2110 = src10998;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (4 == height) {
+        hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    XORI_B3_128_SB(src0, src1, src2);
+    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
+    dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+    dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1, dst2, dst3);
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst4, dst5, dst6, dst7);
+
+    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+    PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+    ST6x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST6x4_UB(out2, out3, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0, dst1;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                                   dst0, dst1);
+
+    out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src5, src6, src54_r, src65_r;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (3 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                   offset_vec, rnd_vec, dst0, dst1, dst2,
+                                   dst3);
+    PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
+    HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                   offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
+    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                                   dst4, dst5);
+    PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height,
+                                          int32_t weight,
+                                          int32_t offset,
+                                          int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
+        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
+        ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2 = src10;
+        src10_r = src98_r;
+        src21_r = src109_r;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (4 == height) {
+        hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else if (6 == height) {
+        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter, weight, offset, rnd_val);
+    } else {
+        hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter, height, weight, offset,
+                                      rnd_val);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
+    v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (1 * src_stride);
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
+        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
+        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
+        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
+                                       rnd_vec, dst4, dst5);
+        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
+        ST12x4_UB(out0, out1, out2, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVRL_B2_SB(src7, src6, src76_r, src76_l);
+        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
+        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
+        ILVRL_B2_SB(src10, src9, src109_r, src109_l);
+        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
+        src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
+        dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
+        dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
+        dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
+                                       offset_vec, rnd_vec, dst6, dst7, dst8,
+                                       dst9);
+        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
+                                       rnd_vec, dst10, dst11);
+        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
+        ST12x4_UB(out3, out4, out5, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2 = src10;
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2110 = src10998;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v16i8 src54_r, src54_l, src65_r, src65_l, src6;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVRL_B2_SB(src3, src2, src32_r, src32_l);
+        ILVRL_B2_SB(src4, src3, src43_r, src43_l);
+        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
+        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
+        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
+                    out2, out3);
+        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2 = src6;
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3, out4, out5;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    LD_SB3(src + 16, src_stride, src7, src8, src9);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    XORI_B3_128_SB(src7, src8, src9);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        XORI_B4_128_SB(src10, src11, src12, src13);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        ILVRL_B2_SB(src5, src4, src54_r, src54_l);
+        ILVRL_B2_SB(src6, src5, src65_r, src65_l);
+        ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
+        ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
+        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
+        dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
+        dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
+        dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
+        dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
+                                       offset_vec, rnd_vec, dst8, dst9, dst10,
+                                       dst11);
+        PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
+                    out2, out3);
+        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
+        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
+        ST8x4_UB(out4, out5, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2 = src6;
+        src9 = src13;
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src87_r = src1211_r;
+        src98_r = src1312_r;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1, out2, out3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= src_stride;
+
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    weight *= 128;
+    rnd_val -= 6;
+
+    weight_vec_h = __msa_fill_h(weight);
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val);
+
+    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
+    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    LD_SB3(src + 16, src_stride, src5, src6, src7);
+    src += (3 * src_stride);
+    XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
+    ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        LD_SB2(src + 16, src_stride, src8, src9);
+        src += (2 * src_stride);
+        XORI_B4_128_SB(src3, src4, src8, src9);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        ILVRL_B2_SB(src8, src7, src87_r, src87_l);
+        ILVRL_B2_SB(src9, src8, src98_r, src98_l);
+        dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
+        dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
+        dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
+                                       offset_vec, rnd_vec, dst0, dst1, dst2,
+                                       dst3);
+        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
+                                       offset_vec, rnd_vec, dst4, dst5, dst6,
+                                       dst7);
+        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
+                    out2, out3);
+        ST_UB2(out0, out2, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out1, out3, dst, 16);
+        dst += dst_stride;
+
+        src2 = src4;
+        src7 = src9;
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src65_r = src87_r;
+        src76_r = src98_r;
+        src65_l = src87_l;
+        src76_l = src98_l;
+    }
+}
+
+static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filt_h0, filt_h1, filter_vec, tmp;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 dst0, dst1, weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
+    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
+    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst0 >>= 6;
+    dst1 >>= 6;
+    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
+    SRAR_W2_SW(dst0, dst1, rnd_vec);
+    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+    tmp += offset_vec;
+    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
+    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
+    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
+    SRA_4V(dst0, dst1, dst2, dst3, 6);
+    MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
+    MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
+    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
+    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
+        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
+        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
+        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
+                    tmp2, tmp3);
+        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, weight,
+                                  offset, rnd_val);
+    } else if (4 == height) {
+        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_x,filter_y, weight,
+                                  offset, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                          filter_x, filter_y, height, weight,
+                                          offset, rnd_val);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
+    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
+    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
+    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
+    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
+    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
+    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
+    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
+    MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
+    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
+    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
+    SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
+    SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
+    SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
+    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
+    ST4x8_UB(out0, out1, dst, dst_stride);
+    ST2x4_UB(out2, 0, dst + 4, dst_stride);
+    dst += 4 * dst_stride;
+    ST2x4_UB(out2, 4, dst + 4, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
+    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t width8mult,
+                                          int32_t weight,
+                                          int32_t offset,
+                                          int32_t rnd_val)
+{
+    uint32_t cnt;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    for (cnt = width8mult; cnt--;) {
+        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src += 8;
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
+        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
+        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
+        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, tmp0, tmp1, tmp2, tmp3);
+        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16u8 out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 offset_vec, const_128, denom_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
+    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
+    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
+    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
+    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
+    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
+    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
+    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
+    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
+    MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
+    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
+    SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
+    SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
+    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
+    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
+    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
+    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width8mult)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1, filter_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    for (cnt = width8mult; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+            MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+            MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+            MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
+            MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
+            SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
+            SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                        dst3_r, tmp0, tmp1, tmp2, tmp3);
+            ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+            ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+
+    if (2 == height) {
+        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, weight,
+                                  offset, rnd_val);
+    } else if (4 == height) {
+        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, 1, weight,
+                                      offset, rnd_val);
+    } else if (6 == height) {
+        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, weight,
+                                  offset, rnd_val);
+    } else if (0 == (height % 4)) {
+        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                          filter_x, filter_y, height, weight,
+                                          offset, rnd_val, 1);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v8i16 offset_vec, const_128, denom_vec;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    offset_vec = __msa_fill_h(offset);
+    denom_vec = __msa_fill_h(rnd_val - 6);
+    const_128 = __msa_fill_h((128 * weight));
+    offset_vec += __msa_srar_h(const_128, denom_vec);
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB3(src_tmp, src_stride, src0, src1, src2);
+    src_tmp += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+        src_tmp += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
+        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
+        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
+        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
+        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, tmp0, tmp1, tmp2, tmp3);
+        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dsth2 = dsth6;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask2 = LD_SB(ff_hevc_mask_arr + 16);
+    mask3 = mask2 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
+    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
+               src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
+        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
+        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
+        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
+        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
+        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
+        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
+        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
+                    tmp2, tmp3);
+        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
+        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
+        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
+        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST4x8_UB(out0, out1, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    if (4 == height) {
+        hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, 2, weight, offset,
+                                      rnd_val);
+    } else {
+        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                          filter_x, filter_y, height, weight,
+                                          offset, rnd_val, 2);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 3);
+}
+
+static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 4);
+}
+
+#define UNIWGT_MC_COPY(WIDTH)                                                \
+void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int height,            \
+                                                      int denom,             \
+                                                      int weight,            \
+                                                      int offset,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)             \
+{                                                                            \
+    int shift = denom + 14 - 8;                                              \
+    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
+                                    height, weight, offset, shift);          \
+}
+
+UNIWGT_MC_COPY(4);
+UNIWGT_MC_COPY(6);
+UNIWGT_MC_COPY(8);
+UNIWGT_MC_COPY(12);
+UNIWGT_MC_COPY(16);
+UNIWGT_MC_COPY(24);
+UNIWGT_MC_COPY(32);
+UNIWGT_MC_COPY(48);
+UNIWGT_MC_COPY(64);
+
+#undef UNIWGT_MC_COPY
+
+#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
+                                                         ptrdiff_t            \
+                                                         dst_stride,          \
+                                                         uint8_t *src,        \
+                                                         ptrdiff_t            \
+                                                         src_stride,          \
+                                                         int height,          \
+                                                         int denom,           \
+                                                         int weight,          \
+                                                         int offset,          \
+                                                         intptr_t mx,         \
+                                                         intptr_t my,         \
+                                                         int width)           \
+{                                                                             \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
+    int shift = denom + 14 - 8;                                               \
+                                                                              \
+    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \
+                                                 dst_stride, filter, height,  \
+                                                 weight, offset, shift);      \
+}
+
+UNI_W_MC(qpel, h, 4, 8, hz, mx);
+UNI_W_MC(qpel, h, 8, 8, hz, mx);
+UNI_W_MC(qpel, h, 12, 8, hz, mx);
+UNI_W_MC(qpel, h, 16, 8, hz, mx);
+UNI_W_MC(qpel, h, 24, 8, hz, mx);
+UNI_W_MC(qpel, h, 32, 8, hz, mx);
+UNI_W_MC(qpel, h, 48, 8, hz, mx);
+UNI_W_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_W_MC(qpel, v, 4, 8, vt, my);
+UNI_W_MC(qpel, v, 8, 8, vt, my);
+UNI_W_MC(qpel, v, 12, 8, vt, my);
+UNI_W_MC(qpel, v, 16, 8, vt, my);
+UNI_W_MC(qpel, v, 24, 8, vt, my);
+UNI_W_MC(qpel, v, 32, 8, vt, my);
+UNI_W_MC(qpel, v, 48, 8, vt, my);
+UNI_W_MC(qpel, v, 64, 8, vt, my);
+
+UNI_W_MC(epel, h, 4, 4, hz, mx);
+UNI_W_MC(epel, h, 6, 4, hz, mx);
+UNI_W_MC(epel, h, 8, 4, hz, mx);
+UNI_W_MC(epel, h, 12, 4, hz, mx);
+UNI_W_MC(epel, h, 16, 4, hz, mx);
+UNI_W_MC(epel, h, 24, 4, hz, mx);
+UNI_W_MC(epel, h, 32, 4, hz, mx);
+
+UNI_W_MC(epel, v, 4, 4, vt, my);
+UNI_W_MC(epel, v, 6, 4, vt, my);
+UNI_W_MC(epel, v, 8, 4, vt, my);
+UNI_W_MC(epel, v, 12, 4, vt, my);
+UNI_W_MC(epel, v, 16, 4, vt, my);
+UNI_W_MC(epel, v, 24, 4, vt, my);
+UNI_W_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_W_MC
+
+#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                          \
+void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,           \
+                                                      ptrdiff_t dst_stride,   \
+                                                      uint8_t *src,           \
+                                                      ptrdiff_t src_stride,   \
+                                                      int height,             \
+                                                      int denom,              \
+                                                      int weight,             \
+                                                      int offset,             \
+                                                      intptr_t mx,            \
+                                                      intptr_t my,            \
+                                                      int width)              \
+{                                                                             \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
+    int shift = denom + 14 - 8;                                               \
+                                                                              \
+    hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
+                                           filter_x, filter_y,  height,       \
+                                           weight, offset, shift);            \
+}
+
+UNI_W_MC_HV(qpel, 4, 8);
+UNI_W_MC_HV(qpel, 8, 8);
+UNI_W_MC_HV(qpel, 12, 8);
+UNI_W_MC_HV(qpel, 16, 8);
+UNI_W_MC_HV(qpel, 24, 8);
+UNI_W_MC_HV(qpel, 32, 8);
+UNI_W_MC_HV(qpel, 48, 8);
+UNI_W_MC_HV(qpel, 64, 8);
+
+UNI_W_MC_HV(epel, 4, 4);
+UNI_W_MC_HV(epel, 6, 4);
+UNI_W_MC_HV(epel, 8, 4);
+UNI_W_MC_HV(epel, 12, 4);
+UNI_W_MC_HV(epel, 16, 4);
+UNI_W_MC_HV(epel, 24, 4);
+UNI_W_MC_HV(epel, 32, 4);
+
+#undef UNI_W_MC_HV
diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
new file mode 100644
index 0000000..88337f4
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+#if HAVE_MMI
+static av_cold void hevc_dsp_init_mmi(HEVCDSPContext *c,
+                                      const int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_mmi;
+        c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_mmi;
+        c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_mmi;
+        c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_mmi;
+        c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_mmi;
+        c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_mmi;
+        c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_mmi;
+        c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_mmi;
+
+        c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_mmi;
+        c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_mmi;
+        c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_mmi;
+        c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_mmi;
+        c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_mmi;
+        c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_mmi;
+        c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_mmi;
+        c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_mmi;
+
+        c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_mmi;
+        c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_mmi;
+        c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_mmi;
+        c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_mmi;
+        c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h24_8_mmi;
+        c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_qpel_bi_h32_8_mmi;
+        c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_qpel_bi_h48_8_mmi;
+        c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h64_8_mmi;
+
+        c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_qpel_bi_hv4_8_mmi;
+        c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_qpel_bi_hv8_8_mmi;
+        c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_qpel_bi_hv12_8_mmi;
+        c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_qpel_bi_hv16_8_mmi;
+        c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_qpel_bi_hv24_8_mmi;
+        c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_qpel_bi_hv32_8_mmi;
+        c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_qpel_bi_hv48_8_mmi;
+        c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_qpel_bi_hv64_8_mmi;
+
+        c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_pel_bi_pixels8_8_mmi;
+        c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_pel_bi_pixels16_8_mmi;
+        c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_pel_bi_pixels24_8_mmi;
+        c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_pel_bi_pixels32_8_mmi;
+        c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_pel_bi_pixels48_8_mmi;
+        c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_pel_bi_pixels64_8_mmi;
+
+        c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_pel_bi_pixels8_8_mmi;
+        c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_pel_bi_pixels16_8_mmi;
+        c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_pel_bi_pixels24_8_mmi;
+        c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_pel_bi_pixels32_8_mmi;
+
+        c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_epel_bi_hv4_8_mmi;
+        c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_epel_bi_hv8_8_mmi;
+        c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_epel_bi_hv12_8_mmi;
+        c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_epel_bi_hv16_8_mmi;
+        c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_epel_bi_hv24_8_mmi;
+        c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_epel_bi_hv32_8_mmi;
+
+        c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_qpel_uni_hv4_8_mmi;
+        c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_qpel_uni_hv8_8_mmi;
+        c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_qpel_uni_hv12_8_mmi;
+        c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_qpel_uni_hv16_8_mmi;
+        c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_qpel_uni_hv24_8_mmi;
+        c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_qpel_uni_hv32_8_mmi;
+        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_qpel_uni_hv48_8_mmi;
+        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_qpel_uni_hv64_8_mmi;
+    }
+}
+#endif // #if HAVE_MMI
+
+#if HAVE_MSA
+static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
+                                      const int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+        c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+        c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+        c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+        c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+        c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+        c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+        c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_msa;
+        c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_msa;
+        c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_msa;
+        c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_msa;
+        c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_msa;
+        c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_msa;
+        c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_msa;
+        c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_msa;
+        c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_msa;
+
+        c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_msa;
+        c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_msa;
+        c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_msa;
+        c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_msa;
+        c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_msa;
+        c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_msa;
+        c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_msa;
+        c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_msa;
+
+        c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_msa;
+        c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_msa;
+        c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_msa;
+        c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_msa;
+        c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_msa;
+        c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_msa;
+        c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_msa;
+        c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_msa;
+
+        c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+        c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+        c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+        c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+        c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+        c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+        c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+
+        c->put_hevc_epel[1][0][1] = ff_hevc_put_hevc_epel_h4_8_msa;
+        c->put_hevc_epel[2][0][1] = ff_hevc_put_hevc_epel_h6_8_msa;
+        c->put_hevc_epel[3][0][1] = ff_hevc_put_hevc_epel_h8_8_msa;
+        c->put_hevc_epel[4][0][1] = ff_hevc_put_hevc_epel_h12_8_msa;
+        c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_8_msa;
+        c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_8_msa;
+        c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_msa;
+
+        c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_msa;
+        c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_msa;
+        c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_msa;
+        c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_msa;
+        c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_msa;
+        c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_msa;
+        c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_msa;
+
+        c->put_hevc_epel[1][1][1] = ff_hevc_put_hevc_epel_hv4_8_msa;
+        c->put_hevc_epel[2][1][1] = ff_hevc_put_hevc_epel_hv6_8_msa;
+        c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_msa;
+        c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_msa;
+        c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_msa;
+        c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_msa;
+        c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+        c->put_hevc_qpel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+        c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+        c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+        c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+        c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_msa;
+        c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_msa;
+        c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_msa;
+        c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_msa;
+        c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_msa;
+        c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_msa;
+        c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_msa;
+        c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_msa;
+        c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_uni[1][1][0] = ff_hevc_put_hevc_uni_qpel_v4_8_msa;
+        c->put_hevc_qpel_uni[3][1][0] = ff_hevc_put_hevc_uni_qpel_v8_8_msa;
+        c->put_hevc_qpel_uni[4][1][0] = ff_hevc_put_hevc_uni_qpel_v12_8_msa;
+        c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_8_msa;
+        c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_msa;
+        c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
+        c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
+        c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
+        c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
+        c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
+        c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
+        c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
+        c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
+        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
+        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+        c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+        c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+        c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+        c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
+        c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
+        c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
+        c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
+        c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
+        c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
+        c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
+
+        c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
+        c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
+        c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
+        c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
+        c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
+        c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
+        c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
+
+        c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
+        c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
+        c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
+        c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
+        c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
+        c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
+        c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+        c->put_hevc_qpel_uni_w[3][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+        c->put_hevc_qpel_uni_w[4][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+        c->put_hevc_qpel_uni_w[5][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+        c->put_hevc_qpel_uni_w[6][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+        c->put_hevc_qpel_uni_w[7][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+        c->put_hevc_qpel_uni_w[8][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
+        c->put_hevc_qpel_uni_w[9][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
+        c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
+        c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
+        c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
+        c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
+        c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
+        c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
+        c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
+        c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
+        c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
+        c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
+        c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
+        c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
+        c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
+        c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
+        c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
+        c->put_hevc_qpel_uni_w[4][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
+        c->put_hevc_qpel_uni_w[5][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
+        c->put_hevc_qpel_uni_w[6][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
+        c->put_hevc_qpel_uni_w[7][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
+        c->put_hevc_qpel_uni_w[8][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
+        c->put_hevc_qpel_uni_w[9][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_uni_w[1][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+        c->put_hevc_epel_uni_w[2][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
+        c->put_hevc_epel_uni_w[3][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+        c->put_hevc_epel_uni_w[4][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+        c->put_hevc_epel_uni_w[5][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+        c->put_hevc_epel_uni_w[6][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+        c->put_hevc_epel_uni_w[7][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
+        c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
+        c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
+        c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
+        c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
+        c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
+        c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
+
+        c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
+        c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
+        c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
+        c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
+        c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
+        c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
+        c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
+
+        c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
+        c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
+        c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
+        c->put_hevc_epel_uni_w[4][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
+        c->put_hevc_epel_uni_w[5][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
+        c->put_hevc_epel_uni_w[6][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
+        c->put_hevc_epel_uni_w[7][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+        c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+        c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+        c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+        c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+        c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+        c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
+        c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
+        c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
+        c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
+        c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
+        c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
+        c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
+        c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
+        c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
+        c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
+        c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
+        c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
+        c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
+        c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
+        c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
+        c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
+        c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
+        c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
+        c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
+        c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
+        c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
+        c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
+        c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+        c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
+        c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+        c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+        c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+        c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+        c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
+        c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
+        c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
+        c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
+        c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
+        c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
+        c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
+
+        c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
+        c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
+        c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
+        c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
+        c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
+        c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
+        c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
+
+        c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
+        c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
+        c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
+        c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
+        c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
+        c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
+        c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+        c->put_hevc_qpel_bi_w[3][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+        c->put_hevc_qpel_bi_w[4][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+        c->put_hevc_qpel_bi_w[5][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+        c->put_hevc_qpel_bi_w[6][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+        c->put_hevc_qpel_bi_w[7][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+        c->put_hevc_qpel_bi_w[8][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels48_8_msa;
+        c->put_hevc_qpel_bi_w[9][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_qpel_h4_8_msa;
+        c->put_hevc_qpel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_qpel_h8_8_msa;
+        c->put_hevc_qpel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_qpel_h12_8_msa;
+        c->put_hevc_qpel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_qpel_h16_8_msa;
+        c->put_hevc_qpel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_qpel_h24_8_msa;
+        c->put_hevc_qpel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_qpel_h32_8_msa;
+        c->put_hevc_qpel_bi_w[8][0][1] = ff_hevc_put_hevc_bi_w_qpel_h48_8_msa;
+        c->put_hevc_qpel_bi_w[9][0][1] = ff_hevc_put_hevc_bi_w_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_qpel_v4_8_msa;
+        c->put_hevc_qpel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_qpel_v8_8_msa;
+        c->put_hevc_qpel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_qpel_v12_8_msa;
+        c->put_hevc_qpel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_qpel_v16_8_msa;
+        c->put_hevc_qpel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_qpel_v24_8_msa;
+        c->put_hevc_qpel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_qpel_v32_8_msa;
+        c->put_hevc_qpel_bi_w[8][1][0] = ff_hevc_put_hevc_bi_w_qpel_v48_8_msa;
+        c->put_hevc_qpel_bi_w[9][1][0] = ff_hevc_put_hevc_bi_w_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv4_8_msa;
+        c->put_hevc_qpel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv8_8_msa;
+        c->put_hevc_qpel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv12_8_msa;
+        c->put_hevc_qpel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv16_8_msa;
+        c->put_hevc_qpel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv24_8_msa;
+        c->put_hevc_qpel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv32_8_msa;
+        c->put_hevc_qpel_bi_w[8][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv48_8_msa;
+        c->put_hevc_qpel_bi_w[9][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_bi_w[1][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+        c->put_hevc_epel_bi_w[2][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels6_8_msa;
+        c->put_hevc_epel_bi_w[3][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+        c->put_hevc_epel_bi_w[4][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+        c->put_hevc_epel_bi_w[5][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+        c->put_hevc_epel_bi_w[6][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+        c->put_hevc_epel_bi_w[7][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_epel_h4_8_msa;
+        c->put_hevc_epel_bi_w[2][0][1] = ff_hevc_put_hevc_bi_w_epel_h6_8_msa;
+        c->put_hevc_epel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_epel_h8_8_msa;
+        c->put_hevc_epel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_epel_h12_8_msa;
+        c->put_hevc_epel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_epel_h16_8_msa;
+        c->put_hevc_epel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_epel_h24_8_msa;
+        c->put_hevc_epel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_epel_h32_8_msa;
+
+        c->put_hevc_epel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_epel_v4_8_msa;
+        c->put_hevc_epel_bi_w[2][1][0] = ff_hevc_put_hevc_bi_w_epel_v6_8_msa;
+        c->put_hevc_epel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_epel_v8_8_msa;
+        c->put_hevc_epel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_epel_v12_8_msa;
+        c->put_hevc_epel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_epel_v16_8_msa;
+        c->put_hevc_epel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_epel_v24_8_msa;
+        c->put_hevc_epel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_epel_v32_8_msa;
+
+        c->put_hevc_epel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_epel_hv4_8_msa;
+        c->put_hevc_epel_bi_w[2][1][1] = ff_hevc_put_hevc_bi_w_epel_hv6_8_msa;
+        c->put_hevc_epel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_epel_hv8_8_msa;
+        c->put_hevc_epel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_epel_hv12_8_msa;
+        c->put_hevc_epel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_epel_hv16_8_msa;
+        c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
+        c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
+
+        c->sao_band_filter[0] =
+        c->sao_band_filter[1] =
+        c->sao_band_filter[2] =
+        c->sao_band_filter[3] =
+        c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
+
+        c->sao_edge_filter[0] =
+        c->sao_edge_filter[1] =
+        c->sao_edge_filter[2] =
+        c->sao_edge_filter[3] =
+        c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
+
+        c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
+        c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
+
+        c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
+        c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
+
+        c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
+        c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
+
+        c->hevc_h_loop_filter_chroma_c =
+            ff_hevc_loop_filter_chroma_h_8_msa;
+        c->hevc_v_loop_filter_chroma_c =
+            ff_hevc_loop_filter_chroma_v_8_msa;
+
+        c->idct[0] = ff_hevc_idct_4x4_msa;
+        c->idct[1] = ff_hevc_idct_8x8_msa;
+        c->idct[2] = ff_hevc_idct_16x16_msa;
+        c->idct[3] = ff_hevc_idct_32x32_msa;
+        c->idct_dc[0] = ff_hevc_idct_dc_4x4_msa;
+        c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
+        c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
+        c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
+        c->add_residual[0] = ff_hevc_addblk_4x4_msa;
+        c->add_residual[1] = ff_hevc_addblk_8x8_msa;
+        c->add_residual[2] = ff_hevc_addblk_16x16_msa;
+        c->add_residual[3] = ff_hevc_addblk_32x32_msa;
+        c->transform_4x4_luma = ff_hevc_idct_luma_4x4_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth)
+{
+#if HAVE_MMI
+    hevc_dsp_init_mmi(c, bit_depth);
+#endif  // #if HAVE_MMI
+#if HAVE_MSA
+    hevc_dsp_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h
new file mode 100644
index 0000000..c84e08d
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H
+#define AVCODEC_MIPS_HEVCDSP_MIPS_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define MC(PEL, DIR, WIDTH)                                                 \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,            \
+                                                     uint8_t *src,          \
+                                                     ptrdiff_t src_stride,  \
+                                                     int height,            \
+                                                     intptr_t mx,           \
+                                                     intptr_t my,           \
+                                                     int width)
+
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 4);
+MC(epel, h, 6);
+MC(epel, h, 8);
+MC(epel, h, 12);
+MC(epel, h, 16);
+MC(epel, h, 24);
+MC(epel, h, 32);
+MC(epel, h, 48);
+MC(epel, h, 64);
+
+MC(epel, v, 4);
+MC(epel, v, 6);
+MC(epel, v, 8);
+MC(epel, v, 12);
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+MC(epel, v, 48);
+MC(epel, v, 64);
+
+MC(epel, hv, 4);
+MC(epel, hv, 6);
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+MC(epel, hv, 48);
+MC(epel, hv, 64);
+
+#undef MC
+
+#define UNI_MC(PEL, DIR, WIDTH)                                                \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
+                                                         ptrdiff_t dst_stride, \
+                                                         uint8_t *src,         \
+                                                         ptrdiff_t src_stride, \
+                                                         int height,           \
+                                                         intptr_t mx,          \
+                                                         intptr_t my,          \
+                                                         int width)
+
+UNI_MC(pel, pixels, 4);
+UNI_MC(pel, pixels, 6);
+UNI_MC(pel, pixels, 8);
+UNI_MC(pel, pixels, 12);
+UNI_MC(pel, pixels, 16);
+UNI_MC(pel, pixels, 24);
+UNI_MC(pel, pixels, 32);
+UNI_MC(pel, pixels, 48);
+UNI_MC(pel, pixels, 64);
+
+UNI_MC(qpel, h, 4);
+UNI_MC(qpel, h, 8);
+UNI_MC(qpel, h, 12);
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 4);
+UNI_MC(qpel, v, 8);
+UNI_MC(qpel, v, 12);
+UNI_MC(qpel, v, 16);
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 4);
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 12);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, h, 4);
+UNI_MC(epel, h, 6);
+UNI_MC(epel, h, 8);
+UNI_MC(epel, h, 12);
+UNI_MC(epel, h, 16);
+UNI_MC(epel, h, 24);
+UNI_MC(epel, h, 32);
+UNI_MC(epel, h, 48);
+UNI_MC(epel, h, 64);
+
+UNI_MC(epel, v, 4);
+UNI_MC(epel, v, 6);
+UNI_MC(epel, v, 8);
+UNI_MC(epel, v, 12);
+UNI_MC(epel, v, 16);
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+UNI_MC(epel, v, 48);
+UNI_MC(epel, v, 64);
+
+UNI_MC(epel, hv, 4);
+UNI_MC(epel, hv, 6);
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+UNI_MC(epel, hv, 48);
+UNI_MC(epel, hv, 64);
+
+#undef UNI_MC
+
+#define UNI_W_MC(PEL, DIR, WIDTH)                                         \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,    \
+                                                           ptrdiff_t      \
+                                                           dst_stride,    \
+                                                           uint8_t *src,  \
+                                                           ptrdiff_t      \
+                                                           src_stride,    \
+                                                           int height,    \
+                                                           int denom,     \
+                                                           int weight,    \
+                                                           int offset,    \
+                                                           intptr_t mx,   \
+                                                           intptr_t my,   \
+                                                           int width)
+
+UNI_W_MC(pel, pixels, 4);
+UNI_W_MC(pel, pixels, 6);
+UNI_W_MC(pel, pixels, 8);
+UNI_W_MC(pel, pixels, 12);
+UNI_W_MC(pel, pixels, 16);
+UNI_W_MC(pel, pixels, 24);
+UNI_W_MC(pel, pixels, 32);
+UNI_W_MC(pel, pixels, 48);
+UNI_W_MC(pel, pixels, 64);
+
+UNI_W_MC(qpel, h, 4);
+UNI_W_MC(qpel, h, 8);
+UNI_W_MC(qpel, h, 12);
+UNI_W_MC(qpel, h, 16);
+UNI_W_MC(qpel, h, 24);
+UNI_W_MC(qpel, h, 32);
+UNI_W_MC(qpel, h, 48);
+UNI_W_MC(qpel, h, 64);
+
+UNI_W_MC(qpel, v, 4);
+UNI_W_MC(qpel, v, 8);
+UNI_W_MC(qpel, v, 12);
+UNI_W_MC(qpel, v, 16);
+UNI_W_MC(qpel, v, 24);
+UNI_W_MC(qpel, v, 32);
+UNI_W_MC(qpel, v, 48);
+UNI_W_MC(qpel, v, 64);
+
+UNI_W_MC(qpel, hv, 4);
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 12);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+UNI_W_MC(epel, h, 4);
+UNI_W_MC(epel, h, 6);
+UNI_W_MC(epel, h, 8);
+UNI_W_MC(epel, h, 12);
+UNI_W_MC(epel, h, 16);
+UNI_W_MC(epel, h, 24);
+UNI_W_MC(epel, h, 32);
+UNI_W_MC(epel, h, 48);
+UNI_W_MC(epel, h, 64);
+
+UNI_W_MC(epel, v, 4);
+UNI_W_MC(epel, v, 6);
+UNI_W_MC(epel, v, 8);
+UNI_W_MC(epel, v, 12);
+UNI_W_MC(epel, v, 16);
+UNI_W_MC(epel, v, 24);
+UNI_W_MC(epel, v, 32);
+UNI_W_MC(epel, v, 48);
+UNI_W_MC(epel, v, 64);
+
+UNI_W_MC(epel, hv, 4);
+UNI_W_MC(epel, hv, 6);
+UNI_W_MC(epel, hv, 8);
+UNI_W_MC(epel, hv, 12);
+UNI_W_MC(epel, hv, 16);
+UNI_W_MC(epel, hv, 24);
+UNI_W_MC(epel, hv, 32);
+UNI_W_MC(epel, hv, 48);
+UNI_W_MC(epel, hv, 64);
+
+#undef UNI_W_MC
+
+#define BI_MC(PEL, DIR, WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 4);
+BI_MC(qpel, h, 8);
+BI_MC(qpel, h, 12);
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 4);
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 12);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 4);
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 12);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 4);
+BI_MC(epel, h, 6);
+BI_MC(epel, h, 8);
+BI_MC(epel, h, 12);
+BI_MC(epel, h, 16);
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
+
+BI_MC(epel, v, 4);
+BI_MC(epel, v, 6);
+BI_MC(epel, v, 8);
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+BI_MC(epel, v, 48);
+BI_MC(epel, v, 64);
+
+BI_MC(epel, hv, 4);
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 12);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+BI_MC(epel, hv, 48);
+BI_MC(epel, hv, 64);
+
+#undef BI_MC
+
+#define BI_W_MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
+                                                          ptrdiff_t            \
+                                                          dst_stride,          \
+                                                          uint8_t *src,        \
+                                                          ptrdiff_t            \
+                                                          src_stride,          \
+                                                          int16_t *src_16bit,  \
+                                                          int height,          \
+                                                          int denom,           \
+                                                          int weight0,         \
+                                                          int weight1,         \
+                                                          int offset0,         \
+                                                          int offset1,         \
+                                                          intptr_t mx,         \
+                                                          intptr_t my,         \
+                                                          int width)
+
+BI_W_MC(pel, pixels, 4);
+BI_W_MC(pel, pixels, 6);
+BI_W_MC(pel, pixels, 8);
+BI_W_MC(pel, pixels, 12);
+BI_W_MC(pel, pixels, 16);
+BI_W_MC(pel, pixels, 24);
+BI_W_MC(pel, pixels, 32);
+BI_W_MC(pel, pixels, 48);
+BI_W_MC(pel, pixels, 64);
+
+BI_W_MC(qpel, h, 4);
+BI_W_MC(qpel, h, 8);
+BI_W_MC(qpel, h, 12);
+BI_W_MC(qpel, h, 16);
+BI_W_MC(qpel, h, 24);
+BI_W_MC(qpel, h, 32);
+BI_W_MC(qpel, h, 48);
+BI_W_MC(qpel, h, 64);
+
+BI_W_MC(qpel, v, 4);
+BI_W_MC(qpel, v, 8);
+BI_W_MC(qpel, v, 12);
+BI_W_MC(qpel, v, 16);
+BI_W_MC(qpel, v, 24);
+BI_W_MC(qpel, v, 32);
+BI_W_MC(qpel, v, 48);
+BI_W_MC(qpel, v, 64);
+
+BI_W_MC(qpel, hv, 4);
+BI_W_MC(qpel, hv, 8);
+BI_W_MC(qpel, hv, 12);
+BI_W_MC(qpel, hv, 16);
+BI_W_MC(qpel, hv, 24);
+BI_W_MC(qpel, hv, 32);
+BI_W_MC(qpel, hv, 48);
+BI_W_MC(qpel, hv, 64);
+
+BI_W_MC(epel, h, 4);
+BI_W_MC(epel, h, 6);
+BI_W_MC(epel, h, 8);
+BI_W_MC(epel, h, 12);
+BI_W_MC(epel, h, 16);
+BI_W_MC(epel, h, 24);
+BI_W_MC(epel, h, 32);
+BI_W_MC(epel, h, 48);
+BI_W_MC(epel, h, 64);
+
+BI_W_MC(epel, v, 4);
+BI_W_MC(epel, v, 6);
+BI_W_MC(epel, v, 8);
+BI_W_MC(epel, v, 12);
+BI_W_MC(epel, v, 16);
+BI_W_MC(epel, v, 24);
+BI_W_MC(epel, v, 32);
+BI_W_MC(epel, v, 48);
+BI_W_MC(epel, v, 64);
+
+BI_W_MC(epel, hv, 4);
+BI_W_MC(epel, hv, 6);
+BI_W_MC(epel, hv, 8);
+BI_W_MC(epel, hv, 12);
+BI_W_MC(epel, hv, 16);
+BI_W_MC(epel, hv, 24);
+BI_W_MC(epel, hv, 32);
+BI_W_MC(epel, hv, 48);
+BI_W_MC(epel, hv, 64);
+
+#undef BI_W_MC
+
+void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q);
+
+void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q);
+
+void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q);
+
+void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q);
+
+void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                     int16_t *sao_offset_val, int sao_left_class,
+                                     int width, int height);
+
+void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height);
+
+void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs);
+void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                            ptrdiff_t stride);
+void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                            ptrdiff_t stride);
+void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                              ptrdiff_t stride);
+void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                              ptrdiff_t stride);
+void ff_hevc_idct_luma_4x4_msa(int16_t *pi16Coeffs);
+
+/* Loongson optimization */
+#define L_MC(PEL, DIR, WIDTH, TYPE)                                          \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_##TYPE(int16_t *dst,          \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int height,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)
+L_MC(qpel, h, 4, mmi);
+L_MC(qpel, h, 8, mmi);
+L_MC(qpel, h, 12, mmi);
+L_MC(qpel, h, 16, mmi);
+L_MC(qpel, h, 24, mmi);
+L_MC(qpel, h, 32, mmi);
+L_MC(qpel, h, 48, mmi);
+L_MC(qpel, h, 64, mmi);
+
+L_MC(qpel, hv, 4, mmi);
+L_MC(qpel, hv, 8, mmi);
+L_MC(qpel, hv, 12, mmi);
+L_MC(qpel, hv, 16, mmi);
+L_MC(qpel, hv, 24, mmi);
+L_MC(qpel, hv, 32, mmi);
+L_MC(qpel, hv, 48, mmi);
+L_MC(qpel, hv, 64, mmi);
+
+#define L_BI_MC(PEL, DIR, WIDTH, TYPE)                                          \
+void ff_hevc_put_hevc_##PEL##_bi_##DIR##WIDTH##_8_##TYPE(uint8_t *dst,          \
+                                                         ptrdiff_t dst_stride,  \
+                                                         uint8_t *src,          \
+                                                         ptrdiff_t src_stride,  \
+                                                         int16_t *src2,         \
+                                                         int height,            \
+                                                         intptr_t mx,           \
+                                                         intptr_t my,           \
+                                                         int width)
+
+L_BI_MC(pel, pixels, 8, mmi);
+L_BI_MC(pel, pixels, 16, mmi);
+L_BI_MC(pel, pixels, 24, mmi);
+L_BI_MC(pel, pixels, 32, mmi);
+L_BI_MC(pel, pixels, 48, mmi);
+L_BI_MC(pel, pixels, 64, mmi);
+
+L_BI_MC(qpel, hv, 4, mmi);
+L_BI_MC(qpel, hv, 8, mmi);
+L_BI_MC(qpel, hv, 12, mmi);
+L_BI_MC(qpel, hv, 16, mmi);
+L_BI_MC(qpel, hv, 24, mmi);
+L_BI_MC(qpel, hv, 32, mmi);
+L_BI_MC(qpel, hv, 48, mmi);
+L_BI_MC(qpel, hv, 64, mmi);
+
+L_BI_MC(qpel, h, 4, mmi);
+L_BI_MC(qpel, h, 8, mmi);
+L_BI_MC(qpel, h, 12, mmi);
+L_BI_MC(qpel, h, 16, mmi);
+L_BI_MC(qpel, h, 24, mmi);
+L_BI_MC(qpel, h, 32, mmi);
+L_BI_MC(qpel, h, 48, mmi);
+L_BI_MC(qpel, h, 64, mmi);
+
+L_BI_MC(epel, hv, 4, mmi);
+L_BI_MC(epel, hv, 8, mmi);
+L_BI_MC(epel, hv, 12, mmi);
+L_BI_MC(epel, hv, 16, mmi);
+L_BI_MC(epel, hv, 24, mmi);
+L_BI_MC(epel, hv, 32, mmi);
+#undef L_BI_MC
+
+#define L_UNI_MC(PEL, DIR, WIDTH, TYPE)                                         \
+void ff_hevc_put_hevc_##PEL##_uni_##DIR##WIDTH##_8_##TYPE(uint8_t *dst,         \
+                                                          ptrdiff_t dst_stride, \
+                                                          uint8_t *src,         \
+                                                          ptrdiff_t src_stride, \
+                                                          int height,           \
+                                                          intptr_t mx,          \
+                                                          intptr_t my,          \
+                                                          int width)
+
+L_UNI_MC(qpel, hv, 4, mmi);
+L_UNI_MC(qpel, hv, 8, mmi);
+L_UNI_MC(qpel, hv, 12, mmi);
+L_UNI_MC(qpel, hv, 16, mmi);
+L_UNI_MC(qpel, hv, 24, mmi);
+L_UNI_MC(qpel, hv, 32, mmi);
+L_UNI_MC(qpel, hv, 48, mmi);
+L_UNI_MC(qpel, hv, 64, mmi);
+#undef L_UNI_MC
+
+#endif  // #ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H
diff --git a/libavcodec/mips/hevcdsp_mmi.c b/libavcodec/mips/hevcdsp_mmi.c
new file mode 100644
index 0000000..aa83e1f
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_mmi.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2019 Shiyou Yin (yinshiyou-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevcdec.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavutil/mips/mmiutils.h"
+
+#define PUT_HEVC_QPEL_H(w, x_step, src_step, dst_step)                   \
+void ff_hevc_put_hevc_qpel_h##w##_8_mmi(int16_t *dst, uint8_t *_src,     \
+                                        ptrdiff_t _srcstride,            \
+                                        int height, intptr_t mx,         \
+                                        intptr_t my, int width)          \
+{                                                                        \
+    int x, y;                                                            \
+    pixel *src = (pixel*)_src - 3;                                       \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                    \
+    uint64_t ftmp[15];                                                   \
+    uint64_t rtmp[1];                                                    \
+    const int8_t *filter = ff_hevc_qpel_filters[mx - 1];                 \
+                                                                         \
+    x = x_step;                                                          \
+    y = height;                                                          \
+    __asm__ volatile(                                                    \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
+        "li           %[rtmp0],      0x08                       \n\t"    \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"    \
+                                                                         \
+        "1:                                                     \n\t"    \
+        "2:                                                     \n\t"    \
+        "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"    \
+        "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"    \
+        "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"    \
+        "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
+                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])            \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
+        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
+        "gssdlc1      %[ftmp3],      0x07(%[dst])               \n\t"    \
+        "gssdrc1      %[ftmp3],      0x00(%[dst])               \n\t"    \
+                                                                         \
+        "daddi        %[x],          %[x],         -0x01        \n\t"    \
+        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"    \
+        PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"    \
+        "bnez         %[x],          2b                         \n\t"    \
+                                                                         \
+        "daddi        %[y],          %[y],         -0x01        \n\t"    \
+        "li           %[x],        " #x_step "                  \n\t"    \
+        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"    \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"    \
+        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"    \
+        PTR_ADDIU    "%[dst],        %[dst],        0x80        \n\t"    \
+        "bnez         %[y],          1b                         \n\t"    \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
+          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),                \
+          [src]"+&r"(src), [dst]"+&r"(dst), [y]"+&r"(y),                 \
+          [x]"+&r"(x)                                                    \
+        : [filter]"r"(filter), [stride]"r"(srcstride)                    \
+        : "memory"                                                       \
+    );                                                                   \
+}
+
+PUT_HEVC_QPEL_H(4, 1, -4, -8);
+PUT_HEVC_QPEL_H(8, 2, -8, -16);
+PUT_HEVC_QPEL_H(12, 3, -12, -24);
+PUT_HEVC_QPEL_H(16, 4, -16, -32);
+PUT_HEVC_QPEL_H(24, 6, -24, -48);
+PUT_HEVC_QPEL_H(32, 8, -32, -64);
+PUT_HEVC_QPEL_H(48, 12, -48, -96);
+PUT_HEVC_QPEL_H(64, 16, -64, -128);
+
+#define PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step)                  \
+void ff_hevc_put_hevc_qpel_hv##w##_8_mmi(int16_t *dst, uint8_t *_src,    \
+                                     ptrdiff_t _srcstride,               \
+                                     int height, intptr_t mx,            \
+                                     intptr_t my, int width)             \
+{                                                                        \
+    int x, y;                                                            \
+    const int8_t *filter;                                                \
+    pixel *src = (pixel*)_src;                                           \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                    \
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];         \
+    int16_t *tmp = tmp_array;                                            \
+    uint64_t ftmp[15];                                                   \
+    uint64_t rtmp[1];                                                    \
+                                                                         \
+    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                        \
+    filter = ff_hevc_qpel_filters[mx - 1];                               \
+    x = x_step;                                                          \
+    y = height + QPEL_EXTRA;                                             \
+    __asm__ volatile(                                                    \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
+        "li           %[rtmp0],      0x08                       \n\t"    \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"    \
+                                                                         \
+        "1:                                                     \n\t"    \
+        "2:                                                     \n\t"    \
+        "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"    \
+        "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"    \
+        "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"    \
+        "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"    \
+        "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"    \
+        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"    \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"    \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
+                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])            \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
+        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
+        "gssdlc1      %[ftmp3],      0x07(%[tmp])               \n\t"    \
+        "gssdrc1      %[ftmp3],      0x00(%[tmp])               \n\t"    \
+                                                                         \
+        "daddi        %[x],          %[x],         -0x01        \n\t"    \
+        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"    \
+        "bnez         %[x],          2b                         \n\t"    \
+                                                                         \
+        "daddi        %[y],          %[y],         -0x01        \n\t"    \
+        "li           %[x],        " #x_step "                  \n\t"    \
+        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #dst_step "  \n\t"    \
+        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "bnez         %[y],          1b                         \n\t"    \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
+          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),                \
+          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                 \
+          [x]"+&r"(x)                                                    \
+        : [filter]"r"(filter), [stride]"r"(srcstride)                    \
+        : "memory"                                                       \
+    );                                                                   \
+                                                                         \
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * 4 -12;                      \
+    filter = ff_hevc_qpel_filters[my - 1];                               \
+    x = x_step;                                                          \
+    y = height;                                                          \
+    __asm__ volatile(                                                    \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
+        "li           %[rtmp0],      0x08                       \n\t"    \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
+        "li           %[rtmp0],      0x06                       \n\t"    \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
+                                                                         \
+        "1:                                                     \n\t"    \
+        "2:                                                     \n\t"    \
+        "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp7],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp7],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp8],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp8],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp9],      0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp9],      0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "gsldlc1      %[ftmp10],     0x07(%[tmp])               \n\t"    \
+        "gsldrc1      %[ftmp10],     0x00(%[tmp])               \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"    \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
+                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])         \
+        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],            \
+                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])         \
+        "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"    \
+        "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"    \
+        "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"    \
+        "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"    \
+        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"    \
+        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"    \
+        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])           \
+        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"    \
+        "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"    \
+        "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"    \
+        "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"    \
+        "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"    \
+        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"    \
+        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"    \
+        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])           \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"    \
+        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
+        "gssdlc1      %[ftmp3],      0x07(%[dst])               \n\t"    \
+        "gssdrc1      %[ftmp3],      0x00(%[dst])               \n\t"    \
+                                                                         \
+        "daddi        %[x],          %[x],         -0x01        \n\t"    \
+        PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"    \
+        "bnez         %[x],          2b                         \n\t"    \
+                                                                         \
+        "daddi        %[y],          %[y],         -0x01        \n\t"    \
+        "li           %[x],        " #x_step "                  \n\t"    \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #dst_step "  \n\t"    \
+        PTR_ADDIU    "%[dst],        %[dst],        0x80        \n\t"    \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
+        "bnez         %[y],          1b                         \n\t"    \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
+          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),              \
+          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),              \
+          [ftmp14]"=&f"(ftmp[14]), [rtmp0]"=&r"(rtmp[0]),                \
+          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y),                 \
+          [x]"+&r"(x)                                                    \
+        : [filter]"r"(filter), [stride]"r"(srcstride)                    \
+        : "memory"                                                       \
+    );                                                                   \
+}
+
+PUT_HEVC_QPEL_HV(4, 1, -4, -8);
+PUT_HEVC_QPEL_HV(8, 2, -8, -16);
+PUT_HEVC_QPEL_HV(12, 3, -12, -24);
+PUT_HEVC_QPEL_HV(16, 4, -16, -32);
+PUT_HEVC_QPEL_HV(24, 6, -24, -48);
+PUT_HEVC_QPEL_HV(32, 8, -32, -64);
+PUT_HEVC_QPEL_HV(48, 12, -48, -96);
+PUT_HEVC_QPEL_HV(64, 16, -64, -128);
+
+#define PUT_HEVC_QPEL_BI_H(w, x_step, src_step, src2_step, dst_step)    \
+void ff_hevc_put_hevc_qpel_bi_h##w##_8_mmi(uint8_t *_dst,               \
+                                           ptrdiff_t _dststride,        \
+                                           uint8_t *_src,               \
+                                           ptrdiff_t _srcstride,        \
+                                           int16_t *src2, int height,   \
+                                           intptr_t mx, intptr_t my,    \
+                                           int width)                   \
+{                                                                       \
+    int x, y;                                                           \
+    pixel        *src       = (pixel*)_src - 3;                         \
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);               \
+    pixel *dst          = (pixel *)_dst;                                \
+    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];             \
+    uint64_t ftmp[20];                                                  \
+    uint64_t rtmp[1];                                                   \
+    int shift = 7;                                                      \
+    int offset = 64;                                                    \
+                                                                        \
+    x = width >> 2;                                                     \
+    y = height;                                                         \
+    __asm__ volatile(                                                   \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
+        "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"   \
+        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        "2:                                                     \n\t"   \
+        "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
+                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
+        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp3],      %[offset]   \n\t"   \
+        "gsldlc1      %[ftmp4],      0x07(%[src2])              \n\t"   \
+        "gsldrc1      %[ftmp4],      0x00(%[src2])              \n\t"   \
+        "li           %[rtmp0],      0x10                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
+        "punpcklhw    %[ftmp5],      %[ftmp0],      %[ftmp3]    \n\t"   \
+        "punpckhhw    %[ftmp6],      %[ftmp0],      %[ftmp3]    \n\t"   \
+        "punpckhhw    %[ftmp3],      %[ftmp0],      %[ftmp4]    \n\t"   \
+        "punpcklhw    %[ftmp4],      %[ftmp0],      %[ftmp4]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
+        "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
+        "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
+        "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "and          %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
+        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
+        "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
+        "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
+        PTR_ADDU     "%[src],        %[src],    %[src_stride]   \n\t"   \
+        PTR_ADDU     "%[dst],        %[dst],    %[dst_stride]   \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
+          [ftmp12]"=&f"(ftmp[12]), [src2]"+&r"(src2),                   \
+          [dst]"+&r"(dst), [src]"+&r"(src), [y]"+&r"(y), [x]"=&r"(x),   \
+          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
+        : [src_stride]"r"(srcstride), [dst_stride]"r"(dststride),       \
+          [filter]"r"(filter), [shift]"f"(shift)                        \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+PUT_HEVC_QPEL_BI_H(4, 1, -4, -8, -4);
+PUT_HEVC_QPEL_BI_H(8, 2, -8, -16, -8);
+PUT_HEVC_QPEL_BI_H(12, 3, -12, -24, -12);
+PUT_HEVC_QPEL_BI_H(16, 4, -16, -32, -16);
+PUT_HEVC_QPEL_BI_H(24, 6, -24, -48, -24);
+PUT_HEVC_QPEL_BI_H(32, 8, -32, -64, -32);
+PUT_HEVC_QPEL_BI_H(48, 12, -48, -96, -48);
+PUT_HEVC_QPEL_BI_H(64, 16, -64, -128, -64);
+
+#define PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)   \
+void ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(uint8_t *_dst,              \
+                                            ptrdiff_t _dststride,       \
+                                            uint8_t *_src,              \
+                                            ptrdiff_t _srcstride,       \
+                                            int16_t *src2, int height,  \
+                                            intptr_t mx, intptr_t my,   \
+                                            int width)                  \
+{                                                                       \
+    int x, y;                                                           \
+    const int8_t *filter;                                               \
+    pixel *src = (pixel*)_src;                                          \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
+    pixel *dst          = (pixel *)_dst;                                \
+    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];        \
+    int16_t *tmp = tmp_array;                                           \
+    uint64_t ftmp[20];                                                  \
+    uint64_t rtmp[1];                                                   \
+    int shift = 7;                                                      \
+    int offset = 64;                                                    \
+                                                                        \
+    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                       \
+    filter = ff_hevc_qpel_filters[mx - 1];                              \
+    x = width >> 2;                                                     \
+    y = height + QPEL_EXTRA;                                            \
+    __asm__ volatile(                                                   \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "2:                                                     \n\t"   \
+        "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
+                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
+        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
+        "gssdlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
+        "gssdrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
+        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),               \
+          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
+          [x]"+&r"(x)                                                   \
+        : [filter]"r"(filter), [stride]"r"(srcstride)                   \
+        : "memory"                                                      \
+    );                                                                  \
+                                                                        \
+    tmp    = tmp_array;                                                 \
+    filter = ff_hevc_qpel_filters[my - 1];                              \
+    x = width >> 2;                                                     \
+    y = height;                                                         \
+    __asm__ volatile(                                                   \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
+        "li           %[rtmp0],      0x06                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        "2:                                                     \n\t"   \
+        "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp7],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp7],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp8],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp8],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp9],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp9],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp10],     0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp10],     0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"   \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
+                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
+        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],           \
+                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
+        "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"   \
+        "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
+        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
+        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])          \
+        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"   \
+        "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"   \
+        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
+        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
+        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])          \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
+        "gsldlc1      %[ftmp4],      0x07(%[src2])              \n\t"   \
+        "gsldrc1      %[ftmp4],      0x00(%[src2])              \n\t"   \
+        "xor          %[ftmp7],      %[ftmp7],      %[ftmp7]    \n\t"   \
+        "li           %[rtmp0],      0x10                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
+        "punpcklhw    %[ftmp5],      %[ftmp7],      %[ftmp3]    \n\t"   \
+        "punpckhhw    %[ftmp6],      %[ftmp7],      %[ftmp3]    \n\t"   \
+        "punpckhhw    %[ftmp3],      %[ftmp7],      %[ftmp4]    \n\t"   \
+        "punpcklhw    %[ftmp4],      %[ftmp7],      %[ftmp4]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
+        "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
+        "paddw        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"   \
+        "paddw        %[ftmp6],      %[ftmp6],      %[offset]   \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
+        "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
+        "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp7]    \n\t"   \
+        "and          %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
+        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
+        "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
+        "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
+        PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
+          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),             \
+          [ftmp14]"=&f"(ftmp[14]), [src2]"+&r"(src2),                   \
+          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
+          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
+        : [filter]"r"(filter), [stride]"r"(dststride),                  \
+          [shift]"f"(shift)                                             \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+PUT_HEVC_QPEL_BI_HV(4, 1, -4, -8, -4);
+PUT_HEVC_QPEL_BI_HV(8, 2, -8, -16, -8);
+PUT_HEVC_QPEL_BI_HV(12, 3, -12, -24, -12);
+PUT_HEVC_QPEL_BI_HV(16, 4, -16, -32, -16);
+PUT_HEVC_QPEL_BI_HV(24, 6, -24, -48, -24);
+PUT_HEVC_QPEL_BI_HV(32, 8, -32, -64, -32);
+PUT_HEVC_QPEL_BI_HV(48, 12, -48, -96, -48);
+PUT_HEVC_QPEL_BI_HV(64, 16, -64, -128, -64);
+
+#define PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)   \
+void ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(uint8_t *_dst,              \
+                                            ptrdiff_t _dststride,       \
+                                            uint8_t *_src,              \
+                                            ptrdiff_t _srcstride,       \
+                                            int16_t *src2, int height,  \
+                                            intptr_t mx, intptr_t my,   \
+                                            int width)                  \
+{                                                                       \
+    int x, y;                                                           \
+    pixel *src = (pixel *)_src;                                         \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
+    pixel *dst          = (pixel *)_dst;                                \
+    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];                \
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];        \
+    int16_t *tmp = tmp_array;                                           \
+    uint64_t ftmp[12];                                                  \
+    uint64_t rtmp[1];                                                   \
+    int shift = 7;                                                      \
+    int offset = 64;                                                    \
+                                                                        \
+    src -= (EPEL_EXTRA_BEFORE * srcstride + 1);                         \
+    x = width >> 2;                                                     \
+    y = height + EPEL_EXTRA;                                            \
+    __asm__ volatile(                                                   \
+        MMI_LWC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "2:                                                     \n\t"   \
+        "gslwlc1      %[ftmp2],      0x03(%[src])               \n\t"   \
+        "gslwrc1      %[ftmp2],      0x00(%[src])               \n\t"   \
+        "gslwlc1      %[ftmp3],      0x04(%[src])               \n\t"   \
+        "gslwrc1      %[ftmp3],      0x01(%[src])               \n\t"   \
+        "gslwlc1      %[ftmp4],      0x05(%[src])               \n\t"   \
+        "gslwrc1      %[ftmp4],      0x02(%[src])               \n\t"   \
+        "gslwlc1      %[ftmp5],      0x06(%[src])               \n\t"   \
+        "gslwrc1      %[ftmp5],      0x03(%[src])               \n\t"   \
+        "punpcklbh    %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp2],      %[ftmp2],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp3],      %[ftmp3],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp4],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp4],      %[ftmp4],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp5],      %[ftmp5],      %[ftmp1]    \n\t"   \
+        TRANSPOSE_4H(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],            \
+                     %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9])            \
+        "paddh        %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"   \
+        "paddh        %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"   \
+        "paddh        %[ftmp2],      %[ftmp2],      %[ftmp4]    \n\t"   \
+        "gssdlc1      %[ftmp2],      0x07(%[tmp])               \n\t"   \
+        "gssdrc1      %[ftmp2],      0x00(%[tmp])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
+        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [rtmp0]"=&r"(rtmp[0]),                                        \
+          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
+          [x]"+&r"(x)                                                   \
+        : [filter]"r"(filter), [stride]"r"(srcstride)                   \
+        : "memory"                                                      \
+    );                                                                  \
+                                                                        \
+    tmp      = tmp_array;                                               \
+    filter = ff_hevc_epel_filters[my - 1];                              \
+    x = width >> 2;                                                     \
+    y = height;                                                         \
+    __asm__ volatile(                                                   \
+        MMI_LWC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "li           %[rtmp0],      0x06                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
+        "xor          %[ftmp2],      %[ftmp2],      %[ftmp2]    \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        "2:                                                     \n\t"   \
+        "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],       -0x180       \n\t"   \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
+                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
+        "pmaddhw      %[ftmp7],      %[ftmp3],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp8],      %[ftmp4],      %[ftmp1]    \n\t"   \
+        TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp3], %[ftmp4])            \
+        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmaddhw      %[ftmp7],      %[ftmp5],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp8],      %[ftmp6],      %[ftmp1]    \n\t"   \
+        TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp5], %[ftmp6])            \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
+        "gsldlc1      %[ftmp4],      0x07(%[src2])              \n\t"   \
+        "gsldrc1      %[ftmp4],      0x00(%[src2])              \n\t"   \
+        "li           %[rtmp0],      0x10                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
+        "punpcklhw    %[ftmp5],      %[ftmp2],      %[ftmp3]    \n\t"   \
+        "punpckhhw    %[ftmp6],      %[ftmp2],      %[ftmp3]    \n\t"   \
+        "punpckhhw    %[ftmp3],      %[ftmp2],      %[ftmp4]    \n\t"   \
+        "punpcklhw    %[ftmp4],      %[ftmp2],      %[ftmp4]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
+        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
+        "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
+        "paddw        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"   \
+        "paddw        %[ftmp6],      %[ftmp6],      %[offset]   \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
+        "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
+        "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp2]    \n\t"   \
+        "and          %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
+        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
+        "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
+        "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
+        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
+        PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [ftmp10]"=&f"(ftmp[10]), [src2]"+&r"(src2),                   \
+          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
+          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
+        : [filter]"r"(filter), [stride]"r"(dststride),                  \
+          [shift]"f"(shift)                                             \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+PUT_HEVC_EPEL_BI_HV(4, 1, -4, -8, -4);
+PUT_HEVC_EPEL_BI_HV(8, 2, -8, -16, -8);
+PUT_HEVC_EPEL_BI_HV(12, 3, -12, -24, -12);
+PUT_HEVC_EPEL_BI_HV(16, 4, -16, -32, -16);
+PUT_HEVC_EPEL_BI_HV(24, 6, -24, -48, -24);
+PUT_HEVC_EPEL_BI_HV(32, 8, -32, -64, -32);
+
+#define PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step)  \
+void ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(uint8_t *_dst,             \
+                                               ptrdiff_t _dststride,      \
+                                               uint8_t *_src,             \
+                                               ptrdiff_t _srcstride,      \
+                                               int16_t *src2, int height, \
+                                               intptr_t mx, intptr_t my,  \
+                                               int width)                 \
+{                                                                         \
+    int x, y;                                                             \
+    pixel *src          = (pixel *)_src;                                  \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                     \
+    pixel *dst          = (pixel *)_dst;                                  \
+    ptrdiff_t dststride = _dststride / sizeof(pixel);                     \
+    uint64_t ftmp[12];                                                    \
+    uint64_t rtmp[1];                                                     \
+    int shift = 7;                                                        \
+                                                                          \
+    y = height;                                                           \
+    x = width >> 3;                                                       \
+    __asm__ volatile(                                                     \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"     \
+        "li           %[rtmp0],      0x06                       \n\t"     \
+        "dmtc1        %[rtmp0],      %[ftmp1]                   \n\t"     \
+        "li           %[rtmp0],      0x10                       \n\t"     \
+        "dmtc1        %[rtmp0],      %[ftmp10]                  \n\t"     \
+        "li           %[rtmp0],      0x40                       \n\t"     \
+        "dmtc1        %[rtmp0],      %[offset]                  \n\t"     \
+        "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"     \
+        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"     \
+                                                                          \
+        "1:                                                     \n\t"     \
+        "2:                                                     \n\t"     \
+        "gsldlc1      %[ftmp5],      0x07(%[src])               \n\t"     \
+        "gsldrc1      %[ftmp5],      0x00(%[src])               \n\t"     \
+        "gsldlc1      %[ftmp2],      0x07(%[src2])              \n\t"     \
+        "gsldrc1      %[ftmp2],      0x00(%[src2])              \n\t"     \
+        "gsldlc1      %[ftmp3],      0x0f(%[src2])              \n\t"     \
+        "gsldrc1      %[ftmp3],      0x08(%[src2])              \n\t"     \
+        "punpcklbh    %[ftmp4],      %[ftmp5],      %[ftmp0]    \n\t"     \
+        "punpckhbh    %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"     \
+        "psllh        %[ftmp4],      %[ftmp4],      %[ftmp1]    \n\t"     \
+        "psllh        %[ftmp5],      %[ftmp5],      %[ftmp1]    \n\t"     \
+        "paddh        %[ftmp4],      %[ftmp4],      %[offset]   \n\t"     \
+        "paddh        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"     \
+        "punpcklhw    %[ftmp6],      %[ftmp4],      %[ftmp0]    \n\t"     \
+        "punpckhhw    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"     \
+        "punpcklhw    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"     \
+        "punpckhhw    %[ftmp9],      %[ftmp5],      %[ftmp0]    \n\t"     \
+        "punpcklhw    %[ftmp4],      %[ftmp0],      %[ftmp3]    \n\t"     \
+        "punpckhhw    %[ftmp5],      %[ftmp0],      %[ftmp3]    \n\t"     \
+        "punpckhhw    %[ftmp3],      %[ftmp0],      %[ftmp2]    \n\t"     \
+        "punpcklhw    %[ftmp2],      %[ftmp0],      %[ftmp2]    \n\t"     \
+        "psraw        %[ftmp2],      %[ftmp2],      %[ftmp10]   \n\t"     \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp10]   \n\t"     \
+        "psraw        %[ftmp4],      %[ftmp4],      %[ftmp10]   \n\t"     \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp10]   \n\t"     \
+        "paddw        %[ftmp2],      %[ftmp2],      %[ftmp6]    \n\t"     \
+        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp7]    \n\t"     \
+        "paddw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"     \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp9]    \n\t"     \
+        "psraw        %[ftmp2],      %[ftmp2],      %[shift]    \n\t"     \
+        "psraw        %[ftmp3],      %[ftmp3],      %[shift]    \n\t"     \
+        "psraw        %[ftmp4],      %[ftmp4],      %[shift]    \n\t"     \
+        "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"     \
+        "packsswh     %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"     \
+        "packsswh     %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"     \
+        "pcmpgth      %[ftmp3],      %[ftmp2],      %[ftmp0]    \n\t"     \
+        "pcmpgth      %[ftmp5],      %[ftmp4],      %[ftmp0]    \n\t"     \
+        "and          %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"     \
+        "and          %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"     \
+        "packushb     %[ftmp2],      %[ftmp2],      %[ftmp4]    \n\t"     \
+        "gssdlc1      %[ftmp2],      0x07(%[dst])               \n\t"     \
+        "gssdrc1      %[ftmp2],      0x00(%[dst])               \n\t"     \
+                                                                          \
+        "daddi        %[x],          %[x],         -0x01        \n\t"     \
+        PTR_ADDIU    "%[src],        %[src],        0x08        \n\t"     \
+        PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"     \
+        PTR_ADDIU    "%[src2],       %[src2],       0x10        \n\t"     \
+        "bnez         %[x],          2b                         \n\t"     \
+                                                                          \
+        PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"     \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"     \
+        PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"     \
+        "li           %[x],        " #x_step "                  \n\t"     \
+        "daddi        %[y],          %[y],         -0x01        \n\t"     \
+        PTR_ADDU     "%[src],        %[src],       %[srcstride] \n\t"     \
+        PTR_ADDU     "%[dst],        %[dst],       %[dststride] \n\t"     \
+        PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"     \
+        "bnez         %[y],          1b                         \n\t"     \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                   \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                   \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                   \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                   \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                   \
+          [ftmp10]"=&f"(ftmp[10]), [offset]"=&f"(ftmp[11]),               \
+          [src2]"+&r"(src2), [dst]"+&r"(dst), [src]"+&r"(src),            \
+          [x]"+&r"(x), [y]"+&r"(y), [rtmp0]"=&r"(rtmp[0])                 \
+        : [dststride]"r"(dststride), [shift]"f"(shift),                   \
+          [srcstride]"r"(srcstride)                                       \
+        : "memory"                                                        \
+    );                                                                    \
+}                                                                         \
+
+PUT_HEVC_PEL_BI_PIXELS(8, 1, -8, -8, -16);
+PUT_HEVC_PEL_BI_PIXELS(16, 2, -16, -16, -32);
+PUT_HEVC_PEL_BI_PIXELS(24, 3, -24, -24, -48);
+PUT_HEVC_PEL_BI_PIXELS(32, 4, -32, -32, -64);
+PUT_HEVC_PEL_BI_PIXELS(48, 6, -48, -48, -96);
+PUT_HEVC_PEL_BI_PIXELS(64, 8, -64, -64, -128);
+
+#define PUT_HEVC_QPEL_UNI_HV(w, x_step, src_step, dst_step, tmp_step)   \
+void ff_hevc_put_hevc_qpel_uni_hv##w##_8_mmi(uint8_t *_dst,             \
+                                             ptrdiff_t _dststride,      \
+                                             uint8_t *_src,             \
+                                             ptrdiff_t _srcstride,      \
+                                             int height,                \
+                                             intptr_t mx, intptr_t my,  \
+                                             int width)                 \
+{                                                                       \
+    int x, y;                                                           \
+    const int8_t *filter;                                               \
+    pixel *src = (pixel*)_src;                                          \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
+    pixel *dst          = (pixel *)_dst;                                \
+    ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];        \
+    int16_t *tmp = tmp_array;                                           \
+    uint64_t ftmp[20];                                                  \
+    uint64_t rtmp[1];                                                   \
+    int shift = 6;                                                      \
+    int offset = 32;                                                    \
+                                                                        \
+    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                       \
+    filter = ff_hevc_qpel_filters[mx - 1];                              \
+    x = width >> 2;                                                     \
+    y = height + QPEL_EXTRA;                                            \
+    __asm__ volatile(                                                   \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
+        "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "2:                                                     \n\t"   \
+        "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"   \
+        "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"   \
+        "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
+        "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
+        "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
+        "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
+                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
+        "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
+        "gssdlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
+        "gssdrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],      " #tmp_step " \n\t"   \
+        PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),               \
+          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
+          [x]"+&r"(x)                                                   \
+        : [filter]"r"(filter), [stride]"r"(srcstride)                   \
+        : "memory"                                                      \
+    );                                                                  \
+                                                                        \
+    tmp    = tmp_array;                                                 \
+    filter = ff_hevc_qpel_filters[my - 1];                              \
+    x = width >> 2;                                                     \
+    y = height;                                                         \
+    __asm__ volatile(                                                   \
+        MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
+        "li           %[rtmp0],      0x08                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
+        "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
+        "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
+        "li           %[rtmp0],      0x06                       \n\t"   \
+        "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
+        "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"   \
+        "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
+                                                                        \
+        "1:                                                     \n\t"   \
+        "li           %[x],        " #x_step "                  \n\t"   \
+        "2:                                                     \n\t"   \
+        "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp7],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp7],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp8],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp8],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp9],      0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp9],      0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "gsldlc1      %[ftmp10],     0x07(%[tmp])               \n\t"   \
+        "gsldrc1      %[ftmp10],     0x00(%[tmp])               \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"   \
+        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
+                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
+        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],           \
+                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
+        "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"   \
+        "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"   \
+        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
+        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
+        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])          \
+        "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
+        "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
+        "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"   \
+        "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"   \
+        "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"   \
+        "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
+        "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
+        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])          \
+        "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
+        "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
+        "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
+        "paddh        %[ftmp3],      %[ftmp3],      %[offset]   \n\t"   \
+        "psrah        %[ftmp3],      %[ftmp3],      %[shift]    \n\t"   \
+        "xor          %[ftmp7],      %[ftmp7],      %[ftmp7]    \n\t"   \
+        "pcmpgth      %[ftmp7],      %[ftmp3],      %[ftmp7]    \n\t"   \
+        "and          %[ftmp3],      %[ftmp3],      %[ftmp7]    \n\t"   \
+        "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
+        "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
+        "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
+                                                                        \
+        "daddi        %[x],          %[x],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
+        "bnez         %[x],          2b                         \n\t"   \
+                                                                        \
+        "daddi        %[y],          %[y],         -0x01        \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],     " #tmp_step "  \n\t"   \
+        PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
+        PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
+        PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
+        "bnez         %[y],          1b                         \n\t"   \
+        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
+          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
+          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
+          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
+          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
+          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),             \
+          [ftmp14]"=&f"(ftmp[14]),                                      \
+          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
+          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
+        : [filter]"r"(filter), [stride]"r"(dststride),                  \
+          [shift]"f"(shift)                                             \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+PUT_HEVC_QPEL_UNI_HV(4, 1, -4, -4, -8);
+PUT_HEVC_QPEL_UNI_HV(8, 2, -8, -8, -16);
+PUT_HEVC_QPEL_UNI_HV(12, 3, -12, -12, -24);
+PUT_HEVC_QPEL_UNI_HV(16, 4, -16, -16, -32);
+PUT_HEVC_QPEL_UNI_HV(24, 6, -24, -24, -48);
+PUT_HEVC_QPEL_UNI_HV(32, 8, -32, -32, -64);
+PUT_HEVC_QPEL_UNI_HV(48, 12, -48, -48, -96);
+PUT_HEVC_QPEL_UNI_HV(64, 16, -64, -64, -128);
diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
new file mode 100644
index 0000000..81db62b
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -0,0 +1,4325 @@
+/*
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0;
+
+        LD_SB2(src, src_stride, src0, src1);
+
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        in0 = (v8i16) __msa_ilvr_b(zero, src0);
+        in0 <<= 6;
+        ST8x2_UB(in0, dst, 2 * dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST8x4_UB(in0, in1, dst, 2 * dst_stride);
+    } else if (0 == height % 8) {
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3;
+        uint32_t loop_cnt;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0, in1, in2, in3);
+            SLLI_4V(in0, in1, in2, in3, 6);
+            ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in4, in5, in6, in7);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        SLLI_4V(in4, in5, in6, in7, 6);
+        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1;
+
+        LD_SB2(src, src_stride, src0, src1);
+
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH2(in0, in1, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+
+        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        in4 <<= 6;
+        in5 <<= 6;
+        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0, in1, in2, in3);
+            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in4, in5, in6, in7);
+            SLLI_4V(in0, in1, in2, in3, 6);
+            SLLI_4V(in4, in5, in6, in7, 6);
+            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0_r, in1_r, in2_r, in3_r;
+        v8i16 in0_l, in1_l, in2_l, in3_l;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+    } else if (12 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v16i8 src8, src9, src10, src11;
+        v8i16 in0_r, in1_r, in2_r, in3_r;
+        v8i16 in0_l, in1_l, in2_l, in3_l;
+
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_SB4(src, src_stride, src8, src9, src10, src11);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+    } else if (0 == (height % 8)) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
+                   src7);
+            src += (8 * src_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
+                       in1_r, in2_r, in3_r);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
+                       in1_l, in2_l, in3_l);
+            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+            dst += (4 * dst_stride);
+
+            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
+                       in1_r, in2_r, in3_r);
+            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
+                       in1_l, in2_l, in3_l);
+            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
+        src += (4 * src_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
+                   in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
+                   in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
+                   in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
+                   in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
+                   in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
+        dst += dst_stride;
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
+                   in2_r, in3_r);
+        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
+                   in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 src8, src9, src10, src11;
+    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
+    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB3(src, 16, src0, src1, src2);
+        src += src_stride;
+        LD_SB3(src, 16, src3, src4, src5);
+        src += src_stride;
+        LD_SB3(src, 16, src6, src7, src8);
+        src += src_stride;
+        LD_SB3(src, 16, src9, src10, src11);
+        src += src_stride;
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
+        ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
+        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
+        dst += dst_stride;
+
+        ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
+                   in0_l, in1_l, in2_l, in3_l);
+        ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
+        ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
+        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB4(src, 16, src0, src1, src2, src3);
+        src += src_stride;
+        LD_SB4(src, 16, src4, src5, src6, src7);
+        src += src_stride;
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
+        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
+        dst += dst_stride;
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
+        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int64_t res0, res1, res2, res3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = LD_SB(ff_hevc_mask_arr + 16);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
+        src += (4 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        dst4 = const_vec;
+        dst5 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
+        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
+
+        res0 = __msa_copy_s_d((v2i64) dst4, 0);
+        res1 = __msa_copy_s_d((v2i64) dst4, 1);
+        res2 = __msa_copy_s_d((v2i64) dst5, 0);
+        res3 = __msa_copy_s_d((v2i64) dst5, 1);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        src += (2 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+
+        ST_SH2(dst0, dst2, dst, dst_stride);
+        ST_SH2(dst1, dst3, dst + 8, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 16, src2, src3);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        dst4 = const_vec;
+        dst5 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
+        VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
+        VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
+        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
+        VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        ST_SH(dst2, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst3, dst4, dst, 8);
+        ST_SH(dst5, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src, 16, src0, src1, src2);
+        src3 = LD_SB(src + 40);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        dst4 = const_vec;
+        dst5 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
+                     dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
+        ST_SH2(dst4, dst5, (dst + 32), 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+
+    src -= 3;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src, 16, src0, src1, src2, src3);
+        src4 = LD_SB(src + 56);
+        src += src_stride;
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        ST_SH(dst0, dst);
+
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        ST_SH(dst1, dst + 8);
+
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        ST_SH(dst2, dst + 16);
+
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        ST_SH(dst3, dst + 24);
+
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        ST_SH(dst4, dst + 32);
+
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        ST_SH(dst5, dst + 40);
+
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+        ST_SH(dst6, dst + 48);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst7 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst7, dst7, dst7, dst7);
+        ST_SH(dst7, dst + 56);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src9, src10, src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src -= (3 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src += (8 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
+                   src1211_r, src1110_r, src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+        dst0_l = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3,
+                     dst0_l, dst0_l, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3,
+                     dst1_l, dst1_l, dst1_l, dst1_l);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
+                                        int32_t src_stride,
+                                        int16_t *dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter,
+                                        int32_t height,
+                                        int32_t width)
+{
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = width >> 4; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                       src76_r, src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                       src76_l, src87_l, src98_l, src109_l);
+
+            dst0_r = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3,
+                         dst0_r, dst0_r, dst0_r, dst0_r);
+            dst1_r = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3,
+                         dst1_r, dst1_r, dst1_r, dst1_r);
+            dst2_r = const_vec;
+            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                         filt0, filt1, filt2, filt3,
+                         dst2_r, dst2_r, dst2_r, dst2_r);
+            dst3_r = const_vec;
+            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                         filt0, filt1, filt2, filt3,
+                         dst3_r, dst3_r, dst3_r, dst3_r);
+            dst0_l = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3,
+                         dst0_l, dst0_l, dst0_l, dst0_l);
+            dst1_l = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3,
+                         dst1_l, dst1_l, dst1_l, dst1_l);
+            dst2_l = const_vec;
+            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
+                         filt0, filt1, filt2, filt3,
+                         dst2_l, dst2_l, dst2_l, dst2_l);
+            dst3_l = const_vec;
+            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
+                         filt0, filt1, filt2, filt3,
+                         dst3_l, dst3_l, dst3_l, dst3_l);
+
+            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
+            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 16);
+    hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                      filter, height);
+}
+
+static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 64);
+}
+
+static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t dst_stride_in_bytes = 2 * dst_stride;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        dst97 = const_vec;
+        dst108 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst97, dst97, dst97, dst97);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst108, dst108, dst108, dst108);
+
+        dst76_r = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98_r = __msa_ilvr_h(dst66, dst108);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
+        ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
+        dst += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst32_r = dst76_r;
+        dst54_r = dst98_r;
+        dst21_r = dst65_r;
+        dst43_r = dst87_r;
+        dst65_r = dst109_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_8t_8multx1mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 dst0_r, dst0_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        /* row 4 row 5 row 6 */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        for (loop_cnt = height; loop_cnt--;) {
+            src7 = LD_SB(src_tmp);
+            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
+            src_tmp += src_stride;
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+            ST_SW(dst0_r, dst_tmp);
+            dst_tmp += dst_stride;
+
+            dst0 = dst1;
+            dst1 = dst2;
+            dst2 = dst3;
+            dst3 = dst4;
+            dst4 = dst5;
+            dst5 = dst6;
+            dst6 = dst7;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t dst_stride_in_bytes = 2 * dst_stride;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+    v8i16 filter_vec, const_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
+               vec15);
+    dst0 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,
+                 dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,
+                 dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,
+                 dst2, dst2, dst2);
+    dst3 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,
+                 dst3, dst3, dst3);
+
+    /* row 4 row 5 row 6 */
+    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
+               vec11);
+    dst4 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,
+                 dst4, dst4);
+    dst5 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,
+                 dst5, dst5);
+    dst6 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,
+                 dst6, dst6, dst6);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src7 = LD_SB(src_tmp);
+        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
+        src_tmp += src_stride;
+
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
+                   vec3);
+        dst7 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,
+                     dst7, dst7, dst7);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+        ST_SW(dst0_r, dst_tmp);
+        dst_tmp += dst_stride;
+
+        dst0 = dst1;
+        dst1 = dst2;
+        dst2 = dst3;
+        dst3 = dst4;
+        dst4 = dst5;
+        dst5 = dst6;
+        dst6 = dst7;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask4 = LD_SB(ff_hevc_mask_arr + 16);
+    mask5 = mask4 + 2;
+    mask6 = mask4 + 4;
+    mask7 = mask4 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
+               vec11);
+    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
+               vec15);
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
+                 dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
+                 dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
+                 dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
+                 dst63, dst63, dst63);
+
+    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
+    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
+    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
+                   vec3);
+        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
+                   vec7);
+        dst97 = const_vec;
+        dst108 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
+                     dst97, dst97, dst97);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
+                     dst108, dst108, dst108);
+
+        dst76_r = __msa_ilvr_h(dst97, dst66);
+        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
+        dst98_r = __msa_ilvr_h(dst66, dst108);
+
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
+                                filt_h1, filt_h2, filt_h3);
+        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
+        ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
+        dst += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst32_r = dst76_r;
+        dst54_r = dst98_r;
+        dst21_r = dst65_r;
+        dst43_r = dst87_r;
+        dst65_r = dst109_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
+    }
+}
+
+static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    ST8x2_UB(dst0, dst, 2 * dst_stride);
+}
+
+static void hevc_hz_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
+}
+
+static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_4w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (4 == height) {
+        hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (0 == height % 8) {
+        hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_hz_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
+
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
+
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, dst0, dst1;
+    v16i8 src0, src1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_8w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height || 6 == height) {
+        hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    } else {
+        hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_hz_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask3;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
+        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    int16_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1, mask00, mask11;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask00 = mask0 + 8;
+    mask11 = mask0 + 10;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16 width */
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_32w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        dst3 = const_vec;
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
+                     dst1, dst2, dst3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
+                     dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332;
+    v8i16 dst10;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+
+    ST8x2_UB(dst10, dst, 2 * dst_stride);
+}
+
+static void hevc_vt_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+
+    ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
+}
+
+static void hevc_vt_4t_4x8_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    src += (8 * src_stride);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+               src76_r, src87_r, src98_r, src109_r);
+    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
+               src98_r, src4332, src6554, src8776, src10998);
+    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
+    dst10 = const_vec;
+    dst32 = const_vec;
+    dst54 = const_vec;
+    dst76 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
+    ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+    dst += (8 * dst_stride);
+}
+
+static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                int16_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998;
+    v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    src += (8 * src_stride);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
+               src54_r, src65_r);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
+               src98_r, src4332, src6554, src8776, src10998);
+    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
+
+    dst10 = const_vec;
+    dst32 = const_vec;
+    dst54 = const_vec;
+    dst76 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
+    ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+    dst += (8 * dst_stride);
+
+    src2 = src10;
+    src2110 = src10998;
+
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    src += (8 * src_stride);
+
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
+               src54_r, src65_r);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
+               src98_r, src4332, src6554, src8776, src10998);
+    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
+
+    dst10 = const_vec;
+    dst32 = const_vec;
+    dst54 = const_vec;
+    dst76 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
+    ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+    dst += (8 * dst_stride);
+}
+
+static void hevc_vt_4t_4w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (4 == height) {
+        hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
+    } else if (8 == height) {
+        hevc_vt_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, height);
+    } else if (16 == height) {
+        hevc_vt_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    int32_t loop_cnt;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
+
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
+
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_8x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_vt_4t_8x6_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_SB2(src, src_stride, src1, src2);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src1, src2);
+
+    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+        dst0_r = const_vec;
+        dst1_r = const_vec;
+        dst2_r = const_vec;
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2 = src6;
+        src10_r = src54_r;
+        src21_r = src65_r;
+    }
+}
+
+static void hevc_vt_4t_8w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (6 == height) {
+        hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_vt_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v16i8 src54_r, src65_r, src6554;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= (1 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        XORI_B2_128_SB(src5, src6);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
+        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
+        dst += (4 * dst_stride);
+
+        src2 = src6;
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src2110 = src6554;
+    }
+}
+
+static void hevc_vt_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        ST_SH(dst2_r, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        ST_SH(dst3_r, dst + 16);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        ST_SH(dst2_r, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        ST_SH(dst3_r, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_32w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
+
+        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);
+
+        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hv_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    int32_t dst_stride_in_bytes = 2 * dst_stride;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
+    v4i32 dst0, dst1;
+
+    src -= (src_stride + 1);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
+
+    dst20 = const_vec;
+    dst31 = const_vec;
+    dst42 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31);
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42);
+    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
+    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
+
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst0 >>= 6;
+    dst1 >>= 6;
+    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
+    ST8x2_UB(dst0, dst, dst_stride_in_bytes);
+}
+
+static void hevc_hv_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    int32_t dst_stride_in_bytes = 2 * dst_stride;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filter_vec, const_vec;
+    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
+    v4i32 dst0, dst1, dst2, dst3;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
+
+    dst30 = const_vec;
+    dst41 = const_vec;
+    dst52 = const_vec;
+    dst63 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41);
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52);
+    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63);
+
+    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
+    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
+    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
+
+    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
+    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
+    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
+    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
+    SRA_4V(dst0, dst1, dst2, dst3, 6);
+    PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
+    ST8x4_UB(dst0, dst2, dst, dst_stride_in_bytes);
+}
+
+
+static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src -= (src_stride + 1);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
+    dst10 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
+    dst21 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
+
+        dst73 = const_vec;
+        dst84 = const_vec;
+        dst95 = const_vec;
+        dst106 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
+        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+        SRA_4V(dst0, dst1, dst2, dst3, 6);
+        SRA_4V(dst4, dst5, dst6, dst7, 6);
+        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                    dst0, dst1, dst2, dst3);
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_4t_4w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (0 == (height % 8)) {
+        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height);
+    }
+}
+
+static void hevc_hv_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+    int32_t dst_stride_in_bytes = 2 * dst_stride;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
+    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
+    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src -= (src_stride + 1);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dsth0 = const_vec;
+    dsth1 = const_vec;
+    dsth2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1);
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2);
+
+    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
+    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+    dsth3 = const_vec;
+    dsth4 = const_vec;
+    dsth5 = const_vec;
+    dsth6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4);
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5);
+    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6);
+
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
+
+    dsth7 = const_vec;
+    dsth8 = const_vec;
+    dsth9 = const_vec;
+    dsth10 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8);
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9);
+    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);
+
+    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
+    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
+    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
+
+    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
+    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
+    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
+    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride_in_bytes);
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
+    dst += 4 * dst_stride;
+    ST8x4_UB(tmp2, tmp3, dst, dst_stride_in_bytes);
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
+}
+
+static void hevc_hv_4t_8x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride,
+                                   int16_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t width8mult)
+{
+    int32_t cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width8mult; cnt--;) {
+        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src += 8;
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = const_vec;
+        dst1 = const_vec;
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+        dst3 = const_vec;
+        dst4 = const_vec;
+        dst5 = const_vec;
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
+        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
+
+        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        dst += 8;
+    }
+}
+
+static void hevc_hv_4t_8x6_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    XORI_B4_128_SB(src5, src6, src7, src8);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+
+    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
+
+    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
+
+    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
+}
+
+static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width8mult)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v8i16 filt_h0, filt_h1;
+    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width8mult; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+            dst3 = const_vec;
+            dst4 = const_vec;
+            dst5 = const_vec;
+            dst6 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+            DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
+            DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
+            DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+
+            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r,
+                        dst0_r, dst1_r, dst2_r, dst3_r);
+
+            ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_4t_8w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+
+    if (2 == height) {
+        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, 1);
+    } else if (6 == height) {
+        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (0 == (height % 4)) {
+        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 1);
+    }
+}
+
+static void hevc_hv_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 mask0, mask1, mask2, mask3;
+    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
+    v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+    mask0 = LD_SB(ff_hevc_mask_arr);
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB3(src_tmp, src_stride, src0, src1, src2);
+    src_tmp += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    dst1 = const_vec;
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+        src_tmp += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+        dst3 = const_vec;
+        dst4 = const_vec;
+        dst5 = const_vec;
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
+        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+                    dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dst2 = dst6;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask2 = LD_SB(ff_hevc_mask_arr + 16);
+    mask3 = mask2 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
+    dst10 = const_vec;
+    dst21 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
+    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
+               src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
+        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
+        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
+        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
+
+        dst73 = const_vec;
+        dst84 = const_vec;
+        dst95 = const_vec;
+        dst106 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
+        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
+
+        dst32_r = __msa_ilvr_h(dst73, dst22);
+        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+        dst76_r = __msa_ilvr_h(dst22, dst106);
+
+        tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+        tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+
+        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
+        SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
+        PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
+                    tmp2, tmp3);
+        ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+    }
+}
+
+static void hevc_hv_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    if (4 == height) {
+        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, 2);
+    } else {
+        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 2);
+    }
+}
+
+static void hevc_hv_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 4);
+}
+
+#define MC_COPY(WIDTH)                                                    \
+void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
+                                                uint8_t *src,             \
+                                                ptrdiff_t src_stride,     \
+                                                int height,               \
+                                                intptr_t mx,              \
+                                                intptr_t my,              \
+                                                int width)                \
+{                                                                         \
+    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
+}
+
+MC_COPY(4);
+MC_COPY(6);
+MC_COPY(8);
+MC_COPY(12);
+MC_COPY(16);
+MC_COPY(24);
+MC_COPY(32);
+MC_COPY(48);
+MC_COPY(64);
+
+#undef MC_COPY
+
+#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
+                                                                          \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
+                                          MAX_PB_SIZE, filter, height);   \
+}
+
+MC(qpel, h, 4, 8, hz, mx);
+MC(qpel, h, 8, 8, hz, mx);
+MC(qpel, h, 12, 8, hz, mx);
+MC(qpel, h, 16, 8, hz, mx);
+MC(qpel, h, 24, 8, hz, mx);
+MC(qpel, h, 32, 8, hz, mx);
+MC(qpel, h, 48, 8, hz, mx);
+MC(qpel, h, 64, 8, hz, mx);
+
+MC(qpel, v, 4, 8, vt, my);
+MC(qpel, v, 8, 8, vt, my);
+MC(qpel, v, 12, 8, vt, my);
+MC(qpel, v, 16, 8, vt, my);
+MC(qpel, v, 24, 8, vt, my);
+MC(qpel, v, 32, 8, vt, my);
+MC(qpel, v, 48, 8, vt, my);
+MC(qpel, v, 64, 8, vt, my);
+
+MC(epel, h, 4, 4, hz, mx);
+MC(epel, h, 6, 4, hz, mx);
+MC(epel, h, 8, 4, hz, mx);
+MC(epel, h, 12, 4, hz, mx);
+MC(epel, h, 16, 4, hz, mx);
+MC(epel, h, 24, 4, hz, mx);
+MC(epel, h, 32, 4, hz, mx);
+
+MC(epel, v, 4, 4, vt, my);
+MC(epel, v, 6, 4, vt, my);
+MC(epel, v, 8, 4, vt, my);
+MC(epel, v, 12, 4, vt, my);
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, WIDTH, TAP)                                          \
+void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst,           \
+                                                uint8_t *src,           \
+                                                ptrdiff_t src_stride,   \
+                                                int height,             \
+                                                intptr_t mx,            \
+                                                intptr_t my,            \
+                                                int width)              \
+{                                                                       \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
+                                                                        \
+    hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,  \
+                                          filter_x, filter_y, height);  \
+}
+
+MC_HV(qpel, 4, 8);
+MC_HV(qpel, 8, 8);
+MC_HV(qpel, 12, 8);
+MC_HV(qpel, 16, 8);
+MC_HV(qpel, 24, 8);
+MC_HV(qpel, 32, 8);
+MC_HV(qpel, 48, 8);
+MC_HV(qpel, 64, 8);
+
+MC_HV(epel, 4, 4);
+MC_HV(epel, 6, 4);
+MC_HV(epel, 8, 4);
+MC_HV(epel, 12, 4);
+MC_HV(epel, 16, 4);
+MC_HV(epel, 24, 4);
+MC_HV(epel, 32, 4);
+
+#undef MC_HV
diff --git a/libavcodec/mips/hevcpred_init_mips.c b/libavcodec/mips/hevcpred_init_mips.c
new file mode 100644
index 0000000..e987698
--- /dev/null
+++ b/libavcodec/mips/hevcpred_init_mips.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/mips/hevcpred_mips.h"
+
+#if HAVE_MSA
+static av_cold void hevc_pred_init_msa(HEVCPredContext *c, const int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->intra_pred[2] = ff_intra_pred_8_16x16_msa;
+        c->intra_pred[3] = ff_intra_pred_8_32x32_msa;
+        c->pred_planar[0] = ff_hevc_intra_pred_planar_0_msa;
+        c->pred_planar[1] = ff_hevc_intra_pred_planar_1_msa;
+        c->pred_planar[2] = ff_hevc_intra_pred_planar_2_msa;
+        c->pred_planar[3] = ff_hevc_intra_pred_planar_3_msa;
+        c->pred_dc = ff_hevc_intra_pred_dc_msa;
+        c->pred_angular[0] = ff_pred_intra_pred_angular_0_msa;
+        c->pred_angular[1] = ff_pred_intra_pred_angular_1_msa;
+        c->pred_angular[2] = ff_pred_intra_pred_angular_2_msa;
+        c->pred_angular[3] = ff_pred_intra_pred_angular_3_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+void ff_hevc_pred_init_mips(HEVCPredContext *c, const int bit_depth)
+{
+#if HAVE_MSA
+    hevc_pred_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcpred_mips.h b/libavcodec/mips/hevcpred_mips.h
new file mode 100644
index 0000000..f22feff
--- /dev/null
+++ b/libavcodec/mips/hevcpred_mips.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
+#define AVCODEC_MIPS_HEVCPRED_MIPS_H
+
+#include "libavcodec/hevcpred.h"
+
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx);
+
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_intra_pred_8_16x16_msa(struct HEVCContext *s, int x0, int y0, int c_idx);
+void ff_intra_pred_8_32x32_msa(struct HEVCContext *s, int x0, int y0, int c_idx);
+
+#endif  // #ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
diff --git a/libavcodec/mips/hevcpred_msa.c b/libavcodec/mips/hevcpred_msa.c
new file mode 100644
index 0000000..963c64c
--- /dev/null
+++ b/libavcodec/mips/hevcpred_msa.c
@@ -0,0 +1,3084 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevcdec.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "hevcpred_mips.h"
+
+static const int8_t intra_pred_angle_up[17] = {
+    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+};
+
+static const int8_t intra_pred_angle_low[16] = {
+    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
+};
+
+#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
+                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
+                              res0, res1, mul_val_b0, mul_val_b1, round)       \
+{                                                                              \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
+                                                                               \
+    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
+         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
+                                                                               \
+    res0_m += mul_val_h1 * tmp0;                                               \
+    res1_m += mul_val_h3 * tmp0;                                               \
+    res2_m += mul_val_h1 * tmp0;                                               \
+    res3_m += mul_val_h3 * tmp0;                                               \
+                                                                               \
+    res0_m += mul_val_b0 * src0_r;                                             \
+    res1_m += mul_val_b0 * src0_l;                                             \
+    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
+    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
+                                                                               \
+    res0_m += mul_val_b1 * tmp1;                                               \
+    res1_m += mul_val_b1 * tmp1;                                               \
+    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
+    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
+                                                                               \
+    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
+    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
+}
+
+static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint32_t col;
+    uint32_t src_data;
+    v8i16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+
+    src_data = LW(src_top);
+    SW4(src_data, src_data, src_data, src_data, dst, stride);
+
+    if (0 == flag) {
+        src_data = LW(src_left);
+
+        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+
+        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
+        vec2 -= vec0;
+        vec2 >>= 1;
+        vec2 += vec1;
+        vec2 = CLIP_SH_0_255(vec2);
+
+        for (col = 0; col < 4; col++) {
+            dst[stride * col] = (uint8_t) vec2[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    uint16_t val0, val1, val2, val3;
+    uint64_t src_data1;
+    v8i16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+
+    src_data1 = LD(src_top);
+
+    for (row = 8; row--;) {
+        SD(src_data1, tmp_dst);
+        tmp_dst += stride;
+    }
+
+    if (0 == flag) {
+        src_data1 = LD(src_left);
+
+        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+
+        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
+        vec2 -= vec0;
+        vec2 >>= 1;
+        vec2 += vec1;
+        vec2 = CLIP_SH_0_255(vec2);
+
+        val0 = vec2[0];
+        val1 = vec2[1];
+        val2 = vec2[2];
+        val3 = vec2[3];
+
+        dst[0] = val0;
+        dst[stride] = val1;
+        dst[2 * stride] = val2;
+        dst[3 * stride] = val3;
+
+        val0 = vec2[4];
+        val1 = vec2[5];
+        val2 = vec2[6];
+        val3 = vec2[7];
+
+        dst[4 * stride] = val0;
+        dst[5 * stride] = val1;
+        dst[6 * stride] = val2;
+        dst[7 * stride] = val3;
+    }
+}
+
+static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
+                                           const uint8_t *src_left,
+                                           uint8_t *dst, int32_t stride,
+                                           int32_t flag)
+{
+    int32_t col;
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    v16u8 src;
+    v8i16 vec0, vec1, vec2, vec3;
+
+    src = LD_UB(src_top);
+
+    for (row = 16; row--;) {
+        ST_UB(src, tmp_dst);
+        tmp_dst += stride;
+    }
+
+    if (0 == flag) {
+        src = LD_UB(src_left);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+
+        UNPCK_UB_SH(src, vec2, vec3);
+        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
+
+        vec2 >>= 1;
+        vec3 >>= 1;
+
+        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
+        CLIP_SH2_0_255(vec2, vec3);
+
+        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
+
+        for (col = 0; col < 16; col++) {
+            dst[stride * col] = src[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          int32_t flag)
+{
+    uint32_t val0, val1, val2, val3;
+    v16i8 src0;
+    v8i16 src0_r, src_top_val, src_left_val;
+    v16i8 zero = { 0 };
+
+    val0 = src_left[0] * 0x01010101;
+    val1 = src_left[1] * 0x01010101;
+    val2 = src_left[2] * 0x01010101;
+    val3 = src_left[3] * 0x01010101;
+    SW4(val0, val1, val2, val3, dst, stride);
+
+    if (0 == flag) {
+        val0 = LW(src_top);
+        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
+        src_top_val = __msa_fill_h(src_top[-1]);
+        src_left_val = __msa_fill_h(src_left[0]);
+
+        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
+
+        src0_r -= src_top_val;
+        src0_r >>= 1;
+        src0_r += src_left_val;
+        src0_r = CLIP_SH_0_255(src0_r);
+        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
+        val0 = __msa_copy_s_w((v4i32) src0, 0);
+        SW(val0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          int32_t flag)
+{
+    uint64_t val0, val1, val2, val3;
+    v16i8 src0;
+    v8i16 src0_r, src_top_val, src_left_val;
+    v16i8 zero = { 0 };
+
+    val0 = src_left[0] * 0x0101010101010101;
+    val1 = src_left[1] * 0x0101010101010101;
+    val2 = src_left[2] * 0x0101010101010101;
+    val3 = src_left[3] * 0x0101010101010101;
+    SD4(val0, val1, val2, val3, dst, stride);
+
+    val0 = src_left[4] * 0x0101010101010101;
+    val1 = src_left[5] * 0x0101010101010101;
+    val2 = src_left[6] * 0x0101010101010101;
+    val3 = src_left[7] * 0x0101010101010101;
+    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
+
+    if (0 == flag) {
+        val0 = LD(src_top);
+        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
+        src_top_val = __msa_fill_h(src_top[-1]);
+        src_left_val = __msa_fill_h(src_left[0]);
+
+        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
+
+        src0_r -= src_top_val;
+        src0_r >>= 1;
+        src0_r += src_left_val;
+        src0_r = CLIP_SH_0_255(src0_r);
+        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
+        val0 = __msa_copy_s_d((v2i64) src0, 0);
+        SD(val0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride,
+                                            int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16i8 src0, src1, src2, src3;
+    v8i16 src0_r, src0_l, src_left_val, src_top_val;
+
+    src_left_val = __msa_fill_h(src_left[0]);
+
+    for (row = 4; row--;) {
+        inp0 = src_left[0];
+        inp1 = src_left[1];
+        inp2 = src_left[2];
+        inp3 = src_left[3];
+        src_left += 4;
+
+        src0 = __msa_fill_b(inp0);
+        src1 = __msa_fill_b(inp1);
+        src2 = __msa_fill_b(inp2);
+        src3 = __msa_fill_b(inp3);
+
+        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
+        tmp_dst += (4 * stride);
+    }
+
+    if (0 == flag) {
+        src0 = LD_SB(src_top);
+        src_top_val = __msa_fill_h(src_top[-1]);
+
+        UNPCK_UB_SH(src0, src0_r, src0_l);
+        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
+
+        src0_r >>= 1;
+        src0_l >>= 1;
+
+        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
+        CLIP_SH2_0_255(src0_r, src0_l);
+        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
+        ST_SB(src0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16i8 src0, src1, src2, src3;
+
+    for (row = 0; row < 8; row++) {
+        inp0 = src_left[row * 4];
+        inp1 = src_left[row * 4 + 1];
+        inp2 = src_left[row * 4 + 2];
+        inp3 = src_left[row * 4 + 3];
+
+        src0 = __msa_fill_b(inp0);
+        src1 = __msa_fill_b(inp1);
+        src2 = __msa_fill_b(inp2);
+        src3 = __msa_fill_b(inp3);
+
+        ST_SB2(src0, src0, dst, 16);
+        dst += stride;
+        ST_SB2(src1, src1, dst, 16);
+        dst += stride;
+        ST_SB2(src2, src2, dst, 16);
+        dst += stride;
+        ST_SB2(src3, src3, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t stride,
+                                       int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t addition = 0;
+    uint32_t val0, val1, val2;
+    v16i8 src = { 0 };
+    v16u8 store;
+    v16i8 zero = { 0 };
+    v8u16 sum, vec0, vec1;
+
+    val0 = LW(src_top);
+    val1 = LW(src_left);
+    INSERT_W2_SB(val0, val1, src);
+    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+    val0 = __msa_copy_u_w((v4i32) store, 0);
+    SW4(val0, val0, val0, val0, dst, stride)
+
+        if (0 == flag) {
+        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
+
+        vec1 += vec0;
+        vec0 += vec0;
+        vec1 += vec0;
+
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
+        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
+        val0 = __msa_copy_u_w((v4i32) store, 0);
+        SW(val0, tmp_dst);
+
+        val0 = src_left[1];
+        val1 = src_left[2];
+        val2 = src_left[3];
+
+        addition *= 3;
+
+        ADD2(val0, addition, val1, addition, val0, val1);
+        val2 += addition;
+
+        val0 += 2;
+        val1 += 2;
+        val2 += 2;
+        val0 >>= 2;
+        val1 >>= 2;
+        val2 >>= 2;
+
+        tmp_dst[stride * 1] = val0;
+        tmp_dst[stride * 2] = val1;
+        tmp_dst[stride * 3] = val2;
+    }
+}
+
+static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t stride,
+                                       int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row, col, val;
+    uint32_t addition = 0;
+    uint64_t val0, val1;
+    v16u8 src = { 0 };
+    v16u8 store;
+    v8u16 sum, vec0, vec1;
+    v16i8 zero = { 0 };
+
+    val0 = LD(src_top);
+    val1 = LD(src_left);
+    INSERT_D2_UB(val0, val1, src);
+    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+    val0 = __msa_copy_u_d((v2i64) store, 0);
+
+    for (row = 8; row--;) {
+        SD(val0, dst);
+        dst += stride;
+    }
+
+    if (0 == flag) {
+        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
+
+        vec1 += vec0;
+        vec0 += vec0;
+        vec1 += vec0;
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
+        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
+        val0 = __msa_copy_u_d((v2i64) store, 0);
+        SD(val0, tmp_dst);
+
+        val0 = LD(src_left);
+        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
+        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
+        vec0 = (v8u16) __msa_fill_h(addition);
+        vec0 *= 3;
+        vec1 += vec0;
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+
+        for (col = 1; col < 8; col++) {
+            tmp_dst[stride * col] = vec1[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row, col, val;
+    uint32_t addition = 0;
+    v16u8 src_above1, store, src_left1;
+    v8u16 sum, sum_above, sum_left;
+    v8u16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+
+    src_above1 = LD_UB(src_top);
+    src_left1 = LD_UB(src_left);
+
+    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
+    sum = sum_above + sum_left;
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+
+    for (row = 16; row--;) {
+        ST_UB(store, dst);
+        dst += stride;
+    }
+
+    if (0 == flag) {
+        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
+        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        vec0 += vec0;
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        SRARI_H2_UH(vec1, vec2, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
+        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
+        ST_UB(store, tmp_dst);
+
+        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
+        vec0 = (v8u16) __msa_fill_h(addition);
+        vec0 *= 3;
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        SRARI_H2_UH(vec1, vec2, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
+
+        for (col = 1; col < 16; col++) {
+            tmp_dst[stride * col] = store[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride)
+{
+    uint32_t row;
+    v16u8 src_above1, src_above2, store, src_left1, src_left2;
+    v8u16 sum_above1, sum_above2;
+    v8u16 sum_left1, sum_left2;
+    v8u16 sum, sum_above, sum_left;
+
+    LD_UB2(src_top, 16, src_above1, src_above2);
+    LD_UB2(src_left, 16, src_left1, src_left2);
+    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
+    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
+    sum_above = sum_above1 + sum_above2;
+    sum_left = sum_left1 + sum_left2;
+    sum = sum_above + sum_left;
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
+    store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+
+    for (row = 16; row--;) {
+        ST_UB2(store, store, dst, 16);
+        dst += stride;
+        ST_UB2(store, store, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride)
+{
+    uint32_t src0, src1;
+    v16i8 src_vec0, src_vec1;
+    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
+    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
+    v16i8 zero = { 0 };
+
+    src0 = LW(src_top);
+    src1 = LW(src_left);
+
+    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
+
+    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
+    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
+
+    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
+    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
+
+    tmp0 = __msa_fill_h(src_top[4]);
+    tmp1 = __msa_fill_h(src_left[4]);
+
+    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
+         res0, res1, res2, res3);
+
+    res0 += mul_val1 * tmp0;
+    res1 += mul_val1 * tmp0;
+    res2 += mul_val1 * tmp0;
+    res3 += mul_val1 * tmp0;
+
+    res0 += 3 * src_vec0_r;
+    res1 += 2 * src_vec0_r;
+    res2 += src_vec0_r;
+    res0 += tmp1;
+    res1 += 2 * tmp1;
+    res2 += 3 * tmp1;
+    res3 += 4 * tmp1;
+
+    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
+    SRARI_H2_SH(res0, res1, 3);
+    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
+    ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride)
+{
+    uint64_t src0, src1;
+    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
+    v8i16 src_vec0_r, src_vec1_r;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v8i16 tmp0, tmp1, tmp2;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16i8 zero = { 0 };
+
+    src0 = LD(src_top);
+    src1 = LD(src_left);
+
+    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
+    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
+
+    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
+    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
+    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
+
+    tmp0 = __msa_fill_h(src_top[8]);
+    tmp1 = __msa_fill_h(src_left[8]);
+
+    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
+         res0, res1, res2, res3);
+    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
+         res4, res5, res6, res7);
+
+    tmp2 = mul_val1 * tmp0;
+    res0 += tmp2;
+    res1 += tmp2;
+    res2 += tmp2;
+    res3 += tmp2;
+    res4 += tmp2;
+    res5 += tmp2;
+    res6 += tmp2;
+    res7 += tmp2;
+
+    res0 += 7 * src_vec0_r;
+    res1 += 6 * src_vec0_r;
+    res2 += 5 * src_vec0_r;
+    res3 += 4 * src_vec0_r;
+    res4 += 3 * src_vec0_r;
+    res5 += 2 * src_vec0_r;
+    res6 += src_vec0_r;
+
+    res0 += tmp1;
+    res1 += 2 * tmp1;
+    res2 += 3 * tmp1;
+    res3 += 4 * tmp1;
+    res4 += 5 * tmp1;
+    res5 += 6 * tmp1;
+    res6 += 7 * tmp1;
+    res7 += 8 * tmp1;
+
+    SRARI_H4_SH(res0, res1, res2, res3, 4);
+    SRARI_H4_SH(res4, res5, res6, res7, 4);
+    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                src_vec0, src_vec1, src_vec2, src_vec3);
+
+    ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
+}
+
+static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    v16u8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1;
+    v8i16 res0, res1, tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
+
+    src0 = LD_UB(src_top);
+    src1 = LD_UB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    tmp0 = __msa_fill_h(src_top[16]);
+    tmp1 = __msa_fill_h(src_left[16]);
+
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 15, 1, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 13, 3, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 11, 5, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 9, 7, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 7, 9, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 5, 11, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 3, 13, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 1, 15, 5);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void process_intra_upper_16x16_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          uint8_t offset)
+{
+    v16i8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1, res0, res1;
+    v8i16 tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
+
+    tmp0 = __msa_fill_h(src_top[32 - offset]);
+    tmp1 = __msa_fill_h(src_left[32]);
+
+    src0 = LD_SB(src_top);
+    src1 = LD_SB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+
+    mul_val1 += offset;
+    mul_val0 -= offset;
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 31, 1, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 29, 3, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 27, 5, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 25, 7, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 23, 9, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 21, 11, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 19, 13, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 17, 15, 6);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void process_intra_lower_16x16_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          uint8_t offset)
+{
+    v16i8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
+
+    tmp0 = __msa_fill_h(src_top[32 - offset]);
+    tmp1 = __msa_fill_h(src_left[16]);
+
+    src0 = LD_SB(src_top);
+    src1 = LD_SB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+
+    mul_val1 += offset;
+    mul_val0 -= offset;
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 15, 17, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 13, 19, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 11, 21, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 9, 23, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 7, 25, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 5, 27, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 3, 29, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 1, 31, 6);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
+    process_intra_upper_16x16_msa((src_top + 16), src_left,
+                                  (dst + 16), stride, 16);
+    dst += (16 * stride);
+    src_left += 16;
+
+    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
+    process_intra_lower_16x16_msa((src_top + 16), src_left,
+                                  (dst + 16), stride, 16);
+}
+
+static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 4;
+    const uint8_t *ref;
+    int32_t last;
+    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop;
+    int32_t inv_angle_val, offset;
+    uint64_t tmp0;
+    v16i8 top0, top1, top2, top3;
+    v16i8 dst_val0;
+    v16i8 zero = { 0 };
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = (angle) >> 3;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (angle < 0 && last < -1) {
+        inv_angle_val = inv_angle[mode - 18];
+
+        tmp0 = LD(ref);
+        SD(tmp0, ref_tmp);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
+            ref_tmp[h_cnt] = src_left[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    idx0 = angle_loop >> 5;
+    fact_val0 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx1 = angle_loop >> 5;
+    fact_val1 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx2 = angle_loop >> 5;
+    fact_val2 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx3 = angle_loop >> 5;
+    fact_val3 = angle_loop & 31;
+
+    top0 = LD_SB(ref + idx0 + 1);
+    top1 = LD_SB(ref + idx1 + 1);
+    top2 = LD_SB(ref + idx2 + 1);
+    top3 = LD_SB(ref + idx3 + 1);
+
+    fact0 = __msa_fill_h(fact_val0);
+    fact1 = __msa_fill_h(32 - fact_val0);
+
+    fact2 = __msa_fill_h(fact_val1);
+    fact3 = __msa_fill_h(32 - fact_val1);
+
+    fact4 = __msa_fill_h(fact_val2);
+    fact5 = __msa_fill_h(32 - fact_val2);
+
+    fact6 = __msa_fill_h(fact_val3);
+    fact7 = __msa_fill_h(32 - fact_val3);
+
+    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
+    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
+    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
+               diff0, diff2, diff4, diff6);
+    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
+    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
+    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
+    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
+
+    diff1 += diff0 * fact1;
+    diff3 += diff2 * fact3;
+
+    SRARI_H2_SH(diff1, diff3, 5);
+    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
+    ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 8;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t last, offset;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    int32_t tmp0, tmp1, tmp2;
+    v16i8 top0, top1, top2, top3;
+    v16u8 dst_val0, dst_val1;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = (angle) >> 2;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val_loop = inv_angle_val * last;
+
+        tmp0 = LW(ref);
+        tmp1 = LW(ref + 4);
+        tmp2 = LW(ref + 8);
+        SW(tmp0, ref_tmp);
+        SW(tmp1, ref_tmp + 4);
+        SW(tmp2, ref_tmp + 8);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx2 = (angle_loop) >> 5;
+        fact_val2 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx3 = (angle_loop) >> 5;
+        fact_val3 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top1 = LD_SB(ref + idx1 + 1);
+        top2 = LD_SB(ref + idx2 + 1);
+        top3 = LD_SB(ref + idx3 + 1);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+
+        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
+        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
+        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
+             diff1, diff3, diff5, diff7);
+
+        diff1 += diff0 * fact1;
+        diff3 += diff2 * fact3;
+        diff5 += diff4 * fact5;
+        diff7 += diff6 * fact7;
+
+        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
+        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
+        ST8x4_UB(dst_val0, dst_val1, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t tmp0;
+    int32_t angle, angle_loop, offset;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 16;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t last;
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = angle >> 1;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val_loop = inv_angle_val * last;
+
+        top0 = LD_UB(ref);
+        tmp0 = LW(ref + 16);
+        ST_UB(top0, ref_tmp);
+        SW(tmp0, ref_tmp + 16);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 4; v_cnt--;) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx2 = (angle_loop) >> 5;
+        fact_val2 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx3 = (angle_loop) >> 5;
+        fact_val3 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        LD_UB2(ref + idx0 + 1, 16, top0, top1);
+        LD_UB2(ref + idx1 + 1, 16, top2, top3);
+        LD_UB2(ref + idx2 + 1, 16, top4, top5);
+        LD_UB2(ref + idx3 + 1, 16, top6, top7);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact3;
+        diff7 += diff5 * fact3;
+        diff10 += diff8 * fact5;
+        diff11 += diff9 * fact5;
+        diff14 += diff12 * fact7;
+        diff15 += diff13 * fact7;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t tmp0, tmp1, tmp2, tmp3;
+    int32_t angle, angle_loop;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    int32_t last, offset;
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 fact0, fact1, fact2, fact3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+
+    ref_tmp = ref_array + 32;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = angle;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val_loop = inv_angle_val * last;
+        LD_UB2(ref, 16, top0, top1);
+        tmp0 = ref[32];
+        tmp1 = ref[33];
+        tmp2 = ref[34];
+        tmp3 = ref[35];
+
+        ST_UB2(top0, top1, ref_tmp, 16);
+        ref_tmp[32] = tmp0;
+        ref_tmp[33] = tmp1;
+        ref_tmp[34] = tmp2;
+        ref_tmp[35] = tmp3;
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 16; v_cnt--;) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        top0 = LD_UB(ref + idx0 + 1);
+        top4 = LD_UB(ref + idx1 + 1);
+        top1 = LD_UB(ref + idx0 + 17);
+        top5 = LD_UB(ref + idx1 + 17);
+        top3 = LD_UB(ref + idx0 + 33);
+        top7 = LD_UB(ref + idx1 + 33);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+
+        top2 = top1;
+        top6 = top5;
+
+        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact1;
+        diff7 += diff5 * fact1;
+        diff10 += diff8 * fact3;
+        diff11 += diff9 * fact3;
+        diff14 += diff12 * fact3;
+        diff15 += diff13 * fact3;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst0, dst1, dst2, dst3);
+
+        ST_SB2(dst0, dst1, dst, 16);
+        dst += stride;
+        ST_SB2(dst2, dst3, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 4;
+    const uint8_t *ref;
+    int32_t last, offset;
+    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop, inv_angle_val;
+    uint64_t tmp0;
+    v16i8 dst_val0, dst_val1;
+    v16u8 top0, top1, top2, top3;
+    v16u8 zero = { 0 };
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = angle >> 3;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        tmp0 = LD(ref);
+        SD(tmp0, ref_tmp);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
+            ref_tmp[h_cnt] = src_top[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    idx0 = angle_loop >> 5;
+    fact_val0 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx1 = angle_loop >> 5;
+    fact_val1 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx2 = angle_loop >> 5;
+    fact_val2 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx3 = angle_loop >> 5;
+    fact_val3 = angle_loop & 31;
+
+    top0 = LD_UB(ref + idx0 + 1);
+    top1 = LD_UB(ref + idx1 + 1);
+    top2 = LD_UB(ref + idx2 + 1);
+    top3 = LD_UB(ref + idx3 + 1);
+
+    fact0 = __msa_fill_h(fact_val0);
+    fact1 = __msa_fill_h(32 - fact_val0);
+    fact2 = __msa_fill_h(fact_val1);
+    fact3 = __msa_fill_h(32 - fact_val1);
+    fact4 = __msa_fill_h(fact_val2);
+    fact5 = __msa_fill_h(32 - fact_val2);
+    fact6 = __msa_fill_h(fact_val3);
+    fact7 = __msa_fill_h(32 - fact_val3);
+
+    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
+    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
+    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
+               diff0, diff2, diff4, diff6);
+    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
+    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
+    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
+    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
+
+    diff1 += diff0 * fact1;
+    diff3 += diff2 * fact3;
+
+    SRARI_H2_SH(diff1, diff3, 5);
+    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
+
+    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
+    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
+
+    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
+
+    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
+    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
+
+    ST4x2_UB(dst_val0, dst, stride);
+    dst += (2 * stride);
+    ST4x2_UB(dst_val1, dst, stride);
+}
+
+static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 8;
+    const uint8_t *ref;
+    const uint8_t *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last, offset, tmp0, tmp1, tmp2;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop, inv_angle_val;
+    v16i8 top0, top1, top2, top3;
+    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = (angle) >> 2;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        tmp0 = LW(ref);
+        tmp1 = LW(ref + 4);
+        tmp2 = LW(ref + 8);
+        SW(tmp0, ref_tmp);
+        SW(tmp1, ref_tmp + 4);
+        SW(tmp2, ref_tmp + 8);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
+        dst_org = dst;
+
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx2 = angle_loop >> 5;
+        fact_val2 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx3 = angle_loop >> 5;
+        fact_val3 = angle_loop & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top1 = LD_SB(ref + idx1 + 1);
+        top2 = LD_SB(ref + idx2 + 1);
+        top3 = LD_SB(ref + idx3 + 1);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
+        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
+        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
+             diff1, diff3, diff5, diff7);
+
+        diff1 += diff0 * fact1;
+        diff3 += diff2 * fact3;
+        diff5 += diff4 * fact5;
+        diff7 += diff6 * fact7;
+
+        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
+        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
+        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
+        ST4x8_UB(diff3, diff4, dst_org, stride);
+        dst += 4;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
+    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
+    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+    int32_t angle, angle_loop, inv_angle_val, offset;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 16;
+    const uint8_t *ref, *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = (angle) >> 1;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        top0 = LD_SB(ref);
+        tmp0 = LW(ref + 16);
+        ST_SB(top0, ref_tmp);
+        SW(tmp0, ref_tmp + 16);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
+        dst_org = dst;
+
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx2 = angle_loop >> 5;
+        fact_val2 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx3 = angle_loop >> 5;
+        fact_val3 = angle_loop & 31;
+        angle_loop += angle;
+
+        LD_SB2(ref + idx0 + 1, 16, top0, top1);
+        LD_SB2(ref + idx1 + 1, 16, top2, top3);
+        LD_SB2(ref + idx2 + 1, 16, top4, top5);
+        LD_SB2(ref + idx3 + 1, 16, top6, top7);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact3;
+        diff7 += diff5 * fact3;
+        diff10 += diff8 * fact5;
+        diff11 += diff9 * fact5;
+        diff14 += diff12 * fact7;
+        diff15 += diff13 * fact7;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
+        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
+        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
+        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
+        ST4x8_UB(diff4, diff5, dst_org, stride);
+        dst_org += (8 * stride);
+        ST4x8_UB(diff6, diff7, dst_org, stride);
+        dst += 4;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
+    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
+    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
+    v8i16 fact0, fact1, fact2, fact3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+    int32_t angle, angle_loop, inv_angle_val, offset;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 32;
+    const uint8_t *ref, *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = angle;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        LD_SB2(ref, 16, top0, top1);
+        tmp0 = LW(ref + 32);
+        ST_SB2(top0, top1, ref_tmp, 16);
+        SW(tmp0, ref_tmp + 32);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
+        dst_org = dst;
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top4 = LD_SB(ref + idx1 + 1);
+        top1 = LD_SB(ref + idx0 + 17);
+        top5 = LD_SB(ref + idx1 + 17);
+        top3 = LD_SB(ref + idx0 + 33);
+        top7 = LD_SB(ref + idx1 + 33);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+
+        top2 = top1;
+        top6 = top5;
+
+        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact1;
+        diff7 += diff5 * fact1;
+        diff10 += diff8 * fact3;
+        diff11 += diff9 * fact3;
+        diff14 += diff12 * fact3;
+        diff15 += diff13 * fact3;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
+        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
+
+        ST2x4_UB(diff0, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff0, 4, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff1, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff1, 4, dst_org, stride);
+        dst_org += (4 * stride);
+
+        ST2x4_UB(diff2, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff2, 4, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff3, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff3, 4, dst_org, stride);
+        dst_org += (4 * stride);
+
+        dst += 2;
+    }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    uint32_t row;
+    v16u8 src1, src2;
+
+    src1 = LD_UB(src);
+    src2 = LD_UB(src + 16);
+
+    for (row = 32; row--;) {
+        ST_UB2(src1, src2, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx)
+{
+    switch (log2) {
+    case 2:
+        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
+        break;
+
+    case 3:
+        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
+        break;
+
+    case 4:
+        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
+        break;
+
+    case 5:
+        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
+        break;
+    }
+}
+
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode == 26) {
+        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    }
+}
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode == 26) {
+        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    }
+}
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode == 26) {
+        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    }
+}
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
+    } else if (mode == 26) {
+        intra_predict_vert_32x32_msa(src_top, dst, stride);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    }
+}
+
+void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
+{
+    v16u8 vec0;
+    HEVCLocalContext *lc = s->HEVClc;
+    int i;
+    int hshift = s->ps.sps->hshift[c_idx];
+    int vshift = s->ps.sps->vshift[c_idx];
+    int size_in_luma_h = 16 << hshift;
+    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = 16 << vshift;
+    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
+    int x = x0 >> hshift;
+    int y = y0 >> vshift;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
+    int cur_tb_addr =
+        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
+
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
+    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
+
+    int min_pu_width = s->ps.sps->min_pu_width;
+
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+        lc->tu.intra_pred_mode;
+    uint32_t a;
+    uint8_t left_array[2 * 32 + 1];
+    uint8_t filtered_left_array[2 * 32 + 1];
+    uint8_t top_array[2 * 32 + 1];
+    uint8_t filtered_top_array[2 * 32 + 1];
+
+    uint8_t *left = left_array + 1;
+    uint8_t *top = top_array + 1;
+    uint8_t *filtered_left = filtered_left_array + 1;
+    uint8_t *filtered_top = filtered_top_array + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
+                               (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
+    int cand_left = lc->na.cand_left;
+    int cand_up_left = lc->na.cand_up_left;
+    int cand_up = lc->na.cand_up;
+    int cand_up_right = lc->na.cand_up_right
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
+                               ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
+
+    int bottom_left_size =
+        (((y0 + 2 * size_in_luma_v) >
+          (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
+                                                 2 * size_in_luma_v)) -
+         (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size =
+        (((x0 + 2 * size_in_luma_h) >
+          (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
+         (x0 + size_in_luma_h)) >> hshift;
+
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+        int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_bottom_pu =
+                ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_bottom_pu) ? (s->ps.sps->min_pu_height -
+                                  y_bottom_pu) : (size_in_luma_pu_v));
+            cand_bottom_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_bottom_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_left_pu) ? (s->ps.sps->min_pu_height -
+                                y_left_pu) : (size_in_luma_pu_v));
+            cand_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_left_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            cand_up_left =
+                (s->ref->tab_mvf[(x_left_pu) +
+                                 (y_top_pu) * min_pu_width]).pred_flag ==
+                PF_INTRA;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_top_pu) ? (s->ps.sps->min_pu_width -
+                               x_top_pu) : (size_in_luma_pu_h));
+            cand_up = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up |=
+                    ((s->ref->tab_mvf[(x_top_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int x_right_pu =
+                ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_right_pu) ? (s->ps.sps->min_pu_width -
+                                 x_right_pu) : (size_in_luma_pu_h));
+            cand_up_right = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |=
+                    ((s->ref->tab_mvf[(x_right_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+
+        vec0 = (v16u8) __msa_ldi_b(128);
+
+        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+        ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+
+        top[-1] = 128;
+    }
+    if (cand_up_left) {
+        left[-1] = src[(-1) + stride * (-1)];
+        top[-1] = left[-1];
+    }
+    if (cand_up) {
+        vec0 = LD_UB(src - stride);
+        ST_UB(vec0, top);
+    }
+    if (cand_up_right) {
+        vec0 = LD_UB(src - stride + 16);
+        ST_UB(vec0, (top + 16));
+
+        do {
+            uint32_t pix =
+                ((src[(16 + top_right_size - 1) + stride * (-1)]) *
+                 0x01010101U);
+            for (i = 0; i < (16 - top_right_size); i += 4)
+                ((((union unaligned_32 *) (top + 16 + top_right_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    if (cand_left)
+        for (i = 0; i < 16; i++)
+            left[i] = src[(-1) + stride * (i)];
+    if (cand_bottom_left) {
+        for (i = 16; i < 16 + bottom_left_size; i++)
+            left[i] = src[(-1) + stride * (i)];
+        do {
+            uint32_t pix =
+                ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
+                 0x01010101U);
+            for (i = 0; i < (16 - bottom_left_size); i += 4)
+                ((((union unaligned_32 *) (left + 16 + bottom_left_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up
+            || cand_up_right) {
+            int size_max_x =
+                x0 + ((2 * 16) << hshift) <
+                s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
+            int size_max_y =
+                y0 + ((2 * 16) << vshift) <
+                s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
+            int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
+            if (!cand_up_right) {
+                size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
+                    16 : (s->ps.sps->width - x0) >> hshift;
+            }
+            if (!cand_bottom_left) {
+                size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
+                    16 : (s->ps.sps->height - y0) >> vshift;
+            }
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                while (j > -1
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((j) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j--;
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((j)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA)) {
+                    j = 0;
+                    while (j < size_max_x
+                           &&
+                           !((s->ref->tab_mvf[(((x0 +
+                                                 ((j) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                        j++;
+                    for (i = j; i > (j) - (j + 1); i--)
+                        if (!
+                            ((s->ref->tab_mvf[(((x0 +
+                                                 ((i -
+                                                   1) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                            top[i - 1] = top[i];
+                    left[-1] = top[-1];
+                }
+            } else {
+                j = 0;
+                while (j < size_max_x
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((j) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j++;
+                if (j > 0)
+                    if (x0 > 0) {
+                        for (i = j; i > (j) - (j + 1); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                    } else {
+                        for (i = j; i > (j) - (j); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                        top[-1] = top[0];
+                    }
+                left[-1] = top[-1];
+            }
+            left[-1] = top[-1];
+            if (cand_bottom_left || cand_left) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_y); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
+                    else
+                        a = ((left[i + 3]) * 0x01010101U);
+            }
+            if (!cand_left) {
+                vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+                ST_UB(vec0, left);
+            }
+            if (!cand_bottom_left) {
+
+                vec0 = (v16u8) __msa_fill_b(left[15]);
+
+                ST_UB(vec0, (left + 16));
+            }
+            if (x0 != 0 && y0 != 0) {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((-1)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA))
+                    left[-1] = left[0];
+            } else if (x0 == 0) {
+                do {
+                    uint32_t pix = ((0) * 0x01010101U);
+                    for (i = 0; i < (size_max_y); i += 4)
+                        ((((union unaligned_32 *) (left + i))->l) = (pix));
+                } while (0);
+            } else {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_x); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((i) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
+                    else
+                        a = ((top[i + 3]) * 0x01010101U);
+            }
+        }
+    }
+
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            vec0 = (v16u8) __msa_fill_b(left[15]);
+
+            ST_UB(vec0, (left + 16));
+
+        } else if (cand_up_left) {
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            vec0 = (v16u8) __msa_fill_b(top[16]);
+
+            ST_UB(vec0, top);
+
+            left[-1] = top[16];
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else {
+            left[-1] = 128;
+            vec0 = (v16u8) __msa_ldi_b(128);
+
+            ST_UB2(vec0, vec0, top, 16);
+            ST_UB2(vec0, vec0, left, 16);
+        }
+    }
+
+    if (!cand_left) {
+        vec0 = (v16u8) __msa_fill_b(left[16]);
+        ST_UB(vec0, left);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        vec0 = (v16u8) __msa_fill_b(left[-1]);
+        ST_UB(vec0, top);
+    }
+    if (!cand_up_right) {
+        vec0 = (v16u8) __msa_fill_b(top[15]);
+        ST_UB(vec0, (top + 16));
+    }
+
+    top[-1] = left[-1];
+
+
+    if (!s->ps.sps->intra_smoothing_disabled_flag
+        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && 16 != 4) {
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor =
+                (((((int) (mode - 26U)) >=
+                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
+                 ((((int) (mode - 10U)) >=
+                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 ? ((((int) (mode - 10U)) >=
+                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 : ((((int) (mode - 26U)) >=
+                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
+                filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
+                filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
+                for (i = 2 * 16 - 2; i >= 0; i--)
+                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                        left[i - 1] + 2) >> 2;
+                filtered_top[-1] =
+                    filtered_left[-1] =
+                    (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                for (i = 2 * 16 - 2; i >= 0; i--)
+                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                       top[i - 1] + 2) >> 2;
+                left = filtered_left;
+                top = filtered_top;
+            }
+        }
+    }
+
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
+                                   (uint8_t *) left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
+                       (uint8_t *) left, stride, 4, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
+                                    (uint8_t *) left, stride, c_idx, mode);
+        break;
+    }
+}
+
+void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
+{
+    v16u8 vec0, vec1;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 res0, res1, res2, res3;
+    v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    HEVCLocalContext *lc = s->HEVClc;
+    int i;
+    int hshift = s->ps.sps->hshift[c_idx];
+    int vshift = s->ps.sps->vshift[c_idx];
+    int size_in_luma_h = 32 << hshift;
+    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = 32 << vshift;
+    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
+    int x = x0 >> hshift;
+    int y = y0 >> vshift;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
+    int cur_tb_addr =
+        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
+
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
+    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
+
+    int min_pu_width = s->ps.sps->min_pu_width;
+
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+        lc->tu.intra_pred_mode;
+    uint32_t a;
+    uint8_t left_array[2 * 32 + 1];
+    uint8_t filtered_left_array[2 * 32 + 1];
+    uint8_t top_array[2 * 32 + 1];
+    uint8_t filtered_top_array[2 * 32 + 1];
+
+    uint8_t *left = left_array + 1;
+    uint8_t *top = top_array + 1;
+    uint8_t *filtered_left = filtered_left_array + 1;
+    uint8_t *filtered_top = filtered_top_array + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
+                               (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
+    int cand_left = lc->na.cand_left;
+    int cand_up_left = lc->na.cand_up_left;
+    int cand_up = lc->na.cand_up;
+    int cand_up_right = lc->na.cand_up_right
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
+                               ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
+
+    int bottom_left_size =
+        (((y0 + 2 * size_in_luma_v) >
+          (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
+                                                 2 * size_in_luma_v)) -
+         (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size =
+        (((x0 + 2 * size_in_luma_h) >
+          (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
+         (x0 + size_in_luma_h)) >> hshift;
+
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+        int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_bottom_pu =
+                ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_bottom_pu) ? (s->ps.sps->min_pu_height -
+                                  y_bottom_pu) : (size_in_luma_pu_v));
+            cand_bottom_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_bottom_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_left_pu) ? (s->ps.sps->min_pu_height -
+                                y_left_pu) : (size_in_luma_pu_v));
+            cand_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_left_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            cand_up_left =
+                (s->ref->tab_mvf[(x_left_pu) +
+                                 (y_top_pu) * min_pu_width]).pred_flag ==
+                PF_INTRA;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_top_pu) ? (s->ps.sps->min_pu_width -
+                               x_top_pu) : (size_in_luma_pu_h));
+            cand_up = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up |=
+                    ((s->ref->tab_mvf[(x_top_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int x_right_pu =
+                ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_right_pu) ? (s->ps.sps->min_pu_width -
+                                 x_right_pu) : (size_in_luma_pu_h));
+            cand_up_right = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |=
+                    ((s->ref->tab_mvf[(x_right_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        vec0 = (v16u8) __msa_ldi_b(128);
+
+        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+        ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+
+        top[-1] = 128;
+    }
+    if (cand_up_left) {
+        left[-1] = src[(-1) + stride * (-1)];
+        top[-1] = left[-1];
+    }
+    if (cand_up) {
+        LD_UB2(src - stride, 16, vec0, vec1);
+        ST_UB2(vec0, vec1, top, 16);
+    }
+
+    if (cand_up_right) {
+        LD_UB2(src - stride + 32, 16, vec0, vec1);
+        ST_UB2(vec0, vec1, (top + 32), 16);
+        do {
+            uint32_t pix =
+                ((src[(32 + top_right_size - 1) + stride * (-1)]) *
+                 0x01010101U);
+            for (i = 0; i < (32 - top_right_size); i += 4)
+                ((((union unaligned_32 *) (top + 32 + top_right_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    if (cand_left)
+        for (i = 0; i < 32; i++)
+            left[i] = src[(-1) + stride * (i)];
+    if (cand_bottom_left) {
+        for (i = 32; i < 32 + bottom_left_size; i++)
+            left[i] = src[(-1) + stride * (i)];
+        do {
+            uint32_t pix =
+                ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
+                 0x01010101U);
+            for (i = 0; i < (32 - bottom_left_size); i += 4)
+                ((((union unaligned_32 *) (left + 32 + bottom_left_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up
+            || cand_up_right) {
+            int size_max_x =
+                x0 + ((2 * 32) << hshift) <
+                s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
+            int size_max_y =
+                y0 + ((2 * 32) << vshift) <
+                s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
+            int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
+            if (!cand_up_right) {
+                size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
+                    32 : (s->ps.sps->width - x0) >> hshift;
+            }
+            if (!cand_bottom_left) {
+                size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
+                    32 : (s->ps.sps->height - y0) >> vshift;
+            }
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                while (j > -1
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((j) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j--;
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((j)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA)) {
+                    j = 0;
+                    while (j < size_max_x
+                           &&
+                           !((s->ref->tab_mvf[(((x0 +
+                                                 ((j) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                        j++;
+                    for (i = j; i > (j) - (j + 1); i--)
+                        if (!
+                            ((s->ref->tab_mvf[(((x0 +
+                                                 ((i -
+                                                   1) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                            top[i - 1] = top[i];
+                    left[-1] = top[-1];
+                }
+            } else {
+                j = 0;
+                while (j < size_max_x
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((j) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j++;
+                if (j > 0)
+                    if (x0 > 0) {
+                        for (i = j; i > (j) - (j + 1); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                    } else {
+                        for (i = j; i > (j) - (j); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                        top[-1] = top[0];
+                    }
+                left[-1] = top[-1];
+            }
+            left[-1] = top[-1];
+            if (cand_bottom_left || cand_left) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_y); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
+                    else
+                        a = ((left[i + 3]) * 0x01010101U);
+            }
+            if (!cand_left) {
+                vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+                ST_UB2(vec0, vec0, left, 16);
+            }
+            if (!cand_bottom_left) {
+                vec0 = (v16u8) __msa_fill_b(left[31]);
+
+                ST_UB2(vec0, vec0, (left + 32), 16);
+            }
+            if (x0 != 0 && y0 != 0) {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((-1)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA))
+                    left[-1] = left[0];
+            } else if (x0 == 0) {
+                do {
+                    uint32_t pix = ((0) * 0x01010101U);
+                    for (i = 0; i < (size_max_y); i += 4)
+                        ((((union unaligned_32 *) (left + i))->l) = (pix));
+                } while (0);
+            } else {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_x); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((i) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
+                    else
+                        a = ((top[i + 3]) * 0x01010101U);
+            }
+        }
+    }
+
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            vec0 = (v16u8) __msa_fill_b(left[31]);
+
+            ST_UB2(vec0, vec0, (left + 32), 16);
+        } else if (cand_up_left) {
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            vec0 = (v16u8) __msa_fill_b(top[32]);
+
+            ST_UB2(vec0, vec0, top, 16);
+
+            left[-1] = top[32];
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else {
+            left[-1] = 128;
+
+            vec0 = (v16u8) __msa_ldi_b(128);
+
+            ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+        }
+    }
+
+    if (!cand_left) {
+        vec0 = (v16u8) __msa_fill_b(left[32]);
+
+        ST_UB2(vec0, vec0, left, 16);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+        ST_UB2(vec0, vec0, top, 16);
+    }
+    if (!cand_up_right) {
+        vec0 = (v16u8) __msa_fill_b(top[31]);
+
+        ST_UB2(vec0, vec0, (top + 32), 16);
+    }
+
+    top[-1] = left[-1];
+
+
+    if (!s->ps.sps->intra_smoothing_disabled_flag
+        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && 32 != 4) {
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor =
+                (((((int) (mode - 26U)) >=
+                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
+                 ((((int) (mode - 10U)) >=
+                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 ? ((((int) (mode - 10U)) >=
+                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 : ((((int) (mode - 26U)) >=
+                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
+                int threshold = 1 << (8 - 5);
+                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
+                    && c_idx == 0
+                    && ((top[-1] + top[63] - 2 * top[31]) >=
+                        0 ? (top[-1] + top[63] -
+                             2 * top[31]) : (-(top[-1] + top[63] -
+                                               2 * top[31]))) < threshold
+                    && ((left[-1] + left[63] - 2 * left[31]) >=
+                        0 ? (left[-1] + left[63] -
+                             2 * left[31]) : (-(left[-1] + left[63] -
+                                                2 * left[31]))) < threshold) {
+
+
+                    filtered_top[-1] = top[-1];
+                    filtered_top[63] = top[63];
+
+
+                    for (i = 0; i < 63; i++) {
+                        filtered_top[i] =
+                            ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
+                    }
+
+                    tmp0 = __msa_fill_h(top[-1]);
+                    tmp1 = __msa_fill_h(top[63]);
+
+                    tmp2 = mul_val0 - 8;
+                    tmp3 = mul_val0 - 16;
+                    tmp4 = mul_val0 - 24;
+                    tmp5 = mul_val1 + 8;
+                    tmp6 = mul_val1 + 16;
+                    tmp7 = mul_val1 + 24;
+
+                    res0 = mul_val0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res3 = tmp4 * tmp0;
+                    res0 += mul_val1 * tmp1;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, filtered_top, 16);
+
+                    res0 = mul_val0 - 32;
+                    tmp2 = mul_val0 - 40;
+                    tmp3 = mul_val0 - 48;
+                    tmp4 = mul_val0 - 56;
+                    res3 = mul_val1 + 32;
+                    tmp5 = mul_val1 + 40;
+                    tmp6 = mul_val1 + 48;
+                    tmp7 = mul_val1 + 56;
+
+                    res0 = res0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res0 += res3 * tmp1;
+                    res3 = tmp4 * tmp0;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, (filtered_top + 32), 16);
+
+                    filtered_top[63] = top[63];
+
+                    tmp0 = __msa_fill_h(left[-1]);
+                    tmp1 = __msa_fill_h(left[63]);
+
+                    tmp2 = mul_val0 - 8;
+                    tmp3 = mul_val0 - 16;
+                    tmp4 = mul_val0 - 24;
+                    tmp5 = mul_val1 + 8;
+                    tmp6 = mul_val1 + 16;
+                    tmp7 = mul_val1 + 24;
+
+                    res0 = mul_val0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res3 = tmp4 * tmp0;
+                    res0 += mul_val1 * tmp1;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, left, 16);
+
+                    res0 = mul_val0 - 32;
+                    tmp2 = mul_val0 - 40;
+                    tmp3 = mul_val0 - 48;
+                    tmp4 = mul_val0 - 56;
+                    res3 = mul_val1 + 32;
+                    tmp5 = mul_val1 + 40;
+                    tmp6 = mul_val1 + 48;
+                    tmp7 = mul_val1 + 56;
+
+                    res0 = res0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res0 += res3 * tmp1;
+                    res3 = tmp4 * tmp0;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, (left + 32), 16);
+
+                    left[63] = tmp1[0];
+
+                    top = filtered_top;
+                } else {
+                    filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
+                    filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
+                    for (i = 2 * 32 - 2; i >= 0; i--)
+                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                            left[i - 1] + 2) >> 2;
+                    filtered_top[-1] =
+                        filtered_left[-1] =
+                        (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                    for (i = 2 * 32 - 2; i >= 0; i--)
+                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                           top[i - 1] + 2) >> 2;
+                    left = filtered_left;
+                    top = filtered_top;
+                }
+            }
+        }
+    }
+
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
+                               (uint8_t *) left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
+                       (uint8_t *) left, stride, 5, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
+                                (uint8_t *) left, stride, c_idx, mode);
+        break;
+    }
+}
diff --git a/libavcodec/mips/hpeldsp_init_mips.c b/libavcodec/mips/hpeldsp_init_mips.c
new file mode 100644
index 0000000..d6f7a97
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_init_mips.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../hpeldsp.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+#if HAVE_MSA
+static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_msa;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_msa;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_msa;
+
+    c->put_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_msa;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_msa;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_msa;
+
+    c->put_pixels_tab[2][1] = ff_put_pixels4_x2_msa;
+    c->put_pixels_tab[2][2] = ff_put_pixels4_y2_msa;
+    c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_msa;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_msa;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_msa;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_msa;
+
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_msa;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_msa;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_msa;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_msa;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_msa;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_msa;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_msa;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_msa;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_msa;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_msa;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_msa;
+
+    c->avg_pixels_tab[2][0] = ff_avg_pixels4_msa;
+    c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_msa;
+    c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_msa;
+    c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static void ff_hpeldsp_init_mmi(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_8_mmi;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_8_mmi;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_8_mmi;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_8_mmi;
+
+    c->put_pixels_tab[1][0] = ff_put_pixels8_8_mmi;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_8_mmi;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_8_mmi;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_8_mmi;
+
+    c->put_pixels_tab[2][0] = ff_put_pixels4_8_mmi;
+    c->put_pixels_tab[2][1] = ff_put_pixels4_x2_8_mmi;
+    c->put_pixels_tab[2][2] = ff_put_pixels4_y2_8_mmi;
+    c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_8_mmi;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_8_mmi;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_8_mmi;
+
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_8_mmi;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_8_mmi;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_8_mmi;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_8_mmi;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_8_mmi;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_8_mmi;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_8_mmi;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_8_mmi;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_8_mmi;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_8_mmi;
+
+    c->avg_pixels_tab[2][0] = ff_avg_pixels4_8_mmi;
+    c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_8_mmi;
+    c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_8_mmi;
+    c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_8_mmi;
+}
+#endif  // #if HAVE_MMI
+
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags)
+{
+#if HAVE_MMI
+    ff_hpeldsp_init_mmi(c, flags);
+#endif  // #if HAVE_MMI
+#if HAVE_MSA
+    ff_hpeldsp_init_msa(c, flags);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hpeldsp_mips.h b/libavcodec/mips/hpeldsp_mips.h
new file mode 100644
index 0000000..f527c1d
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_mips.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
+#define AVCODEC_MIPS_HPELDSP_MIPS_H
+
+#include "libavcodec/bit_depth_template.c"
+
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+
+void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_no_rnd_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+
+void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+
+#endif  // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c
new file mode 100644
index 0000000..e69b2bd
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_mmi.c
@@ -0,0 +1,1117 @@
+/*
+ * Loongson SIMD optimized qpeldsp
+ *
+ * Copyright (c) 2016 Loongson Technology Corporation Limited
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hpeldsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/mmiutils.h"
+#include "constants.h"
+
+void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
+
+        MMI_SWC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1(%[ftmp1], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_LOW32
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp1], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp2], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[8];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                            \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"
+
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp2], %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1(%[ftmp1], %[block], 0x00)
+        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1(%[ftmp4], %[block], 0x00)
+        MMI_SDC1(%[ftmp6], %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1(%[ftmp5], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+
+        "bnez       %[h],       1b                                     \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];
+    mips_reg addr[2];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        MMI_ULWC1(%[ftmp2], %[block], 0x00)
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
+
+        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
+
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SWC1(%[ftmp0], %[block], 0x00)
+        MMI_SWC1(%[ftmp1], %[addr1], 0x00)
+        PTR_ADDU   "%[pixels],  %[addr0],       %[line_size]            \n\t"
+        PTR_ADDU   "%[block],   %[addr1],       %[line_size]            \n\t"
+
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_LOW32
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];
+    mips_reg addr[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[8];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp6], %[block], 0x08)
+        PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp4], %[block], 0x08)
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
+        PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"
+
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp6], %[block], 0x08)
+        PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp4], %[block], 0x08)
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
+        PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"
+
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[4];
+    mips_reg addr[5];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
+        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
+
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
+        MMI_SWC1(%[ftmp1], %[dst], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
+
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[4];
+    mips_reg addr[5];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[8];
+    mips_reg addr[5];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst], 0x08)
+        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst], 0x08)
+        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[6];
+    mips_reg addr[6];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
+        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr2],   %[dst],         %[dst_stride]           \n\t"
+        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULWC1(%[ftmp5], %[addr2], 0x00)
+        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        MMI_SWC1(%[ftmp1], %[addr2], 0x00)
+        PTR_ADDU   "%[dst],     %[addr2],       %[dst_stride]           \n\t"
+
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_LOW32
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[6];
+    mips_reg addr[6];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
+            src_stride2, h);
+    ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
+            src_stride1, src_stride2, h);
+}
+
+void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h);
+    ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[5];
+    mips_reg addr[5];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size,
+            line_size, line_size, h);
+}
+
+void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h);
+    ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h);
+    ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size,
+            line_size, line_size, line_size, h);
+}
+
+void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_no_rnd_pixels8_y2_8_mmi(block, pixels, line_size, h);
+    ff_put_no_rnd_pixels8_y2_8_mmi(block + 8 , pixels + 8, line_size, h);
+}
+
+void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    int i;
+    const uint32_t a = AV_RN32(pixels);
+    const uint32_t b = AV_RN32(pixels + 1);
+    uint32_t l0 = (a & 0x03030303UL) +
+                  (b & 0x03030303UL) +
+                       0x02020202UL;
+    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                  ((b & 0xFCFCFCFCUL) >> 2);
+    uint32_t l1, h1;
+
+    pixels += line_size;
+    for (i = 0; i < h; i += 2) {
+        uint32_t a = AV_RN32(pixels);
+        uint32_t b = AV_RN32(pixels + 1);
+        l1 = (a & 0x03030303UL) +
+             (b & 0x03030303UL);
+        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+             ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block  += line_size;
+        a  = AV_RN32(pixels);
+        b  = AV_RN32(pixels + 1);
+        l0 = (a & 0x03030303UL) +
+             (b & 0x03030303UL) +
+                  0x02020202UL;
+        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+             ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block  += line_size;
+    }
+}
+
+void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+#if 1
+    double ftmp[10];
+    mips_reg addr[2];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "dli        %[addr0],   0x0f                                    \n\t"
+        "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
+        "dli        %[addr0],   0x01                                    \n\t"
+        "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+
+        "dli        %[addr0],   0x02                                    \n\t"
+        "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        ".p2align   3                                                   \n\t"
+
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
+        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
+        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
+        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
+        PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [h]"+&r"(h),                      [pixels]"+&r"(pixels)
+        : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+#else
+    /* FIXME HIGH BIT DEPTH */
+    int j;
+
+    for (j = 0; j < 2; j++) {
+        int i;
+        const uint32_t a = AV_RN32(pixels);
+        const uint32_t b = AV_RN32(pixels + 1);
+        uint32_t l0 = (a & 0x03030303UL) +
+                      (b & 0x03030303UL) +
+                           0x02020202UL;
+        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                      ((b & 0xFCFCFCFCUL) >> 2);
+        uint32_t l1, h1;
+
+        pixels += line_size;
+        for (i = 0; i < h; i += 2) {
+            uint32_t a = AV_RN32(pixels);
+            uint32_t b = AV_RN32(pixels + 1);
+            l1 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL);
+            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+            a  = AV_RN32(pixels);
+            b  = AV_RN32(pixels + 1);
+            l0 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL) +
+                      0x02020202UL;
+            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+        }
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+#endif
+}
+
+void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_pixels8_xy2_8_mmi(block, pixels, line_size, h);
+    ff_put_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    int i;
+    const uint32_t a = AV_RN32(pixels);
+    const uint32_t b = AV_RN32(pixels + 1);
+    uint32_t l0 = (a & 0x03030303UL) +
+                  (b & 0x03030303UL) +
+                       0x02020202UL;
+    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                  ((b & 0xFCFCFCFCUL) >> 2);
+    uint32_t l1, h1;
+
+    pixels += line_size;
+    for (i = 0; i < h; i += 2) {
+        uint32_t a = AV_RN32(pixels);
+        uint32_t b = AV_RN32(pixels + 1);
+        l1 = (a & 0x03030303UL) +
+             (b & 0x03030303UL);
+        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+             ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
+        pixels += line_size;
+        block  += line_size;
+        a  = AV_RN32(pixels);
+        b  = AV_RN32(pixels + 1);
+        l0 = (a & 0x03030303UL) +
+             (b & 0x03030303UL) +
+                  0x02020202UL;
+        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+             ((b & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
+        pixels += line_size;
+        block  += line_size;
+    }
+}
+
+void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    int j;
+
+    for (j = 0; j < 2; j++) {
+        int i;
+        const uint32_t a = AV_RN32(pixels);
+        const uint32_t b = AV_RN32(pixels + 1);
+        uint32_t l0 = (a & 0x03030303UL) +
+                      (b & 0x03030303UL) +
+                           0x02020202UL;
+        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                      ((b & 0xFCFCFCFCUL) >> 2);
+        uint32_t l1, h1;
+
+        pixels += line_size;
+        for (i = 0; i < h; i += 2) {
+            uint32_t a = AV_RN32(pixels);
+            uint32_t b = AV_RN32(pixels + 1);
+            l1 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL);
+            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
+            pixels += line_size;
+            block  += line_size;
+            a  = AV_RN32(pixels);
+            b  = AV_RN32(pixels + 1);
+            l0 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL) +
+                      0x02020202UL;
+            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
+            pixels += line_size;
+            block  += line_size;
+        }
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+}
+
+void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_avg_pixels8_xy2_8_mmi(block, pixels, line_size, h);
+    ff_avg_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    int j;
+
+    for (j = 0; j < 2; j++) {
+        int i;
+        const uint32_t a = AV_RN32(pixels);
+        const uint32_t b = AV_RN32(pixels + 1);
+        uint32_t l0 = (a & 0x03030303UL) +
+                      (b & 0x03030303UL) +
+                           0x01010101UL;
+        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                      ((b & 0xFCFCFCFCUL) >> 2);
+        uint32_t l1, h1;
+
+        pixels += line_size;
+        for (i = 0; i < h; i += 2) {
+            uint32_t a = AV_RN32(pixels);
+            uint32_t b = AV_RN32(pixels + 1);
+            l1 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL);
+            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+            a  = AV_RN32(pixels);
+            b  = AV_RN32(pixels + 1);
+            l0 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL) +
+                      0x01010101UL;
+            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+        }
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+}
+
+void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    ff_put_no_rnd_pixels8_xy2_8_mmi(block, pixels, line_size, h);
+    ff_put_no_rnd_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
diff --git a/libavcodec/mips/hpeldsp_msa.c b/libavcodec/mips/hpeldsp_msa.c
new file mode 100644
index 0000000..40a0dca
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_msa.c
@@ -0,0 +1,1498 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
+{                                                             \
+    v16u8 tmp_m;                                              \
+                                                              \
+    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
+    ST_UB(tmp_m, (pdst));                                     \
+}
+
+#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                           \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
+                                                                            \
+    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
+    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride)                                \
+{                                                                       \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
+                                                                        \
+    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
+    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \
+}
+
+static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1;
+    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
+        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        SLDI_B4_0_SB(src0, src1, src2, src3,
+                     src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
+                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src8, src9, src10, src11, src12, src13, src14, src15);
+        src += (8 * src_stride);
+
+        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    SLDI_B4_0_SB(src0, src1, src2, src3,
+                 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+    SLDI_B4_0_SB(src4, src5, src6, src7,
+                 src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1);
+
+    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
+                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
+                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
+}
+
+static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    SLDI_B4_0_SB(src0, src1, src2, src3,
+                 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
+                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
+}
+
+static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+    src += (8 * src_stride);
+
+    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
+    src += (4 * src_stride);
+
+    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_UB4(src, src_stride, src4, src5, src6, src7);
+    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
+    src += (4 * src_stride);
+
+    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                  dst, dst_stride);
+}
+
+static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+
+    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                  dst, dst_stride);
+}
+
+static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t dst0, dst1, out0, out1;
+    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
+    v16u8 tmp0 = { 0 };
+    v16u8 tmp1 = { 0 };
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
+
+        dst0 = LW(dst);
+        dst1 = LW(dst + dst_stride);
+        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
+        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
+
+        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
+        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        SLDI_B4_0_SB(src0, src1, src2, src3,
+                     src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+
+        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
+                          src3, src3_sld1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src8, src9, src10, src11, src12, src13, src14, src15);
+        src += (8 * src_stride);
+
+        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                           dst, dst_stride);
+        dst += (4 * dst_stride);
+        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                           dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1;
+    v16u8 src0, src1, src2, res0, res1;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+
+        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+
+        src0 = src2;
+    }
+}
+
+static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                      dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+    }
+}
+
+static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+
+    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                 dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                 dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4;
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                 dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(src, src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+    src += (8 * src_stride);
+    src16 = LD_UB(src);
+
+    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src12, src13, src13, src14,
+                  src14, src15, src15, src16, dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+
+    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                  dst, dst_stride);
+}
+
+static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1, dst0, dst1;
+    v16u8 src0, src1, src2;
+    v16u8 tmp0 = { 0 };
+    v16u8 tmp1 = { 0 };
+    v16u8 res0, res1;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        dst0 = LW(dst);
+        dst1 = LW(dst + dst_stride);
+        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
+        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
+        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
+        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+        src0 = src2;
+    }
+}
+
+static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                          dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                    res0, res1, res2, res3);
+        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                    res4, res5, res6, res7);
+
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
+                    res0, res1, res2, res3);
+        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
+                    res4, res5, res6, res7);
+        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t res0, res1;
+    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
+    v16u8 src0_r, src1_r, src2_r, res;
+    v8u16 add0, add1, add2, sum0, sum1;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
+                   src0_r, src1_r, src2_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        ADD2(add0, add1, add1, add2, sum0, sum1);
+        SRARI_H2_UH(sum0, sum1, 2);
+        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
+        res0 = __msa_copy_u_w((v4i32) res, 0);
+        res1 = __msa_copy_u_w((v4i32) res, 2);
+        SW(res0, dst);
+        dst += dst_stride;
+        SW(res1, dst);
+        dst += dst_stride;
+
+        src0 = src2;
+    }
+}
+
+static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
+    v8u16 add0, add1, add2, add3, add4;
+    v8u16 sum0, sum1, sum2, sum3;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB2_UH(src3_r, src4_r, add3, add4);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
+             sum0, sum1, sum2, sum3);
+        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
+        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
+        ST8x4_UB(src0, src1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src9, src10, src11, src12, src13, src14, src15, src16);
+        src += (8 * src_stride);
+
+        src8 = LD_UB(src);
+        src17 = LD_UB(src + 1);
+
+        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
+             sum0_r, sum1_r, sum2_r, sum3_r);
+        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
+             sum4_r, sum5_r, sum6_r, sum7_r);
+        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
+             sum0_l, sum1_l, sum2_l, sum3_l);
+        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
+             sum4_l, sum5_l, sum6_l, sum7_l);
+        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
+                     sum3_l, sum3_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
+                     sum7_l, sum7_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
+    v8u16 src0_r, src1_r, src2_r, src3_r;
+    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
+    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
+    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+    v16i8 out0, out1;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+
+    SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1,
+                 src3_sld1, 1);
+    SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1);
+    SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1);
+    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
+               src3, src0_r, src1_r, src2_r, src3_r);
+    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
+               src5_r, src6_r);
+    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
+    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
+
+    sum0 = add0 + add1 + 1;
+    sum1 = add1 + add2 + 1;
+    sum2 = add2 + add3 + 1;
+    sum3 = add3 + add4 + 1;
+    sum4 = add4 + add5 + 1;
+    sum5 = add5 + add6 + 1;
+    sum6 = add6 + add7 + 1;
+    sum7 = add7 + add8 + 1;
+
+    SRA_4V(sum0, sum1, sum2, sum3, 2);
+    SRA_4V(sum4, sum5, sum6, sum7, 2);
+    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
+    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
+    v8u16 add0, add1, add2, add3, add4;
+    v8u16 sum0, sum1, sum2, sum3;
+    v16i8 out0, out1;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    src4 = LD_SB(src);
+
+    SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+    SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+               src1_r, src2_r);
+    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
+    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+    HADD_UB2_UH(src3_r, src4_r, add3, add4);
+
+    sum0 = add0 + add1 + 1;
+    sum1 = add1 + add2 + 1;
+    sum2 = add2 + add3 + 1;
+    sum3 = add3 + add4 + 1;
+
+    SRA_4V(sum0, sum1, sum2, sum3, 2);
+    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+}
+
+static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1;
+    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
+    v16u8 src0_r, src1_r, src2_r;
+    v8u16 add0, add1, add2, sum0, sum1;
+    v16u8 dst0, dst1, res0, res1;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        ADD2(add0, add1, add1, add2, sum0, sum1);
+        SRARI_H2_UH(sum0, sum1, 2);
+        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+
+        src0 = src2;
+    }
+}
+
+static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
+    v8u16 add0, add1, add2, add3, add4;
+    v8u16 sum0, sum1, sum2, sum3;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB2_UH(src3_r, src4_r, add3, add4);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
+             sum0, sum1, sum2, sum3);
+        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
+        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
+                           sum2, dst2, sum3, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11, src12, src13, src14, src15, src16, src17;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v16u8 src7_l, src8_l;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src9, src10, src11, src12, src13, src14, src15, src16);
+        src += (8 * src_stride);
+
+        src8 = LD_UB(src);
+        src17 = LD_UB(src + 1);
+
+        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
+        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
+        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
+        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
+        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
+        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
+        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
+        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
+        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
+        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
+             sum2_r, sum3_r);
+        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
+             sum6_r, sum7_r);
+        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
+        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
+        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
+             sum2_l, sum3_l);
+        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
+             sum6_l, sum7_l);
+        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
+        dst += dst_stride;
+    }
+}
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            out2 = __msa_copy_u_w((v4i32) dst2, 0);
+            out3 = __msa_copy_u_w((v4i32) dst3, 0);
+            SW4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 2)) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, dst0, dst1);
+
+            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            SW(out0, dst);
+            dst += dst_stride;
+            SW(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{
+    copy_width16_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h)
+{
+    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    copy_width8_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 16) {
+        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    } else if (h == 8) {
+        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 16) {
+        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    } else if (h == 8) {
+        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
+                                    const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h)
+{
+    if (h == 16) {
+        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    } else if (h == 8) {
+        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int h)
+{
+    if (h == 8) {
+        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    } else if (h == 4) {
+        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int h)
+{
+    if (h == 8) {
+        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    } else if (h == 4) {
+        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 8) {
+        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    } else if (h == 4) {
+        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{
+    avg_width16_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h)
+{
+    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    avg_width8_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    avg_width4_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
diff --git a/libavcodec/mips/idctdsp_init_mips.c b/libavcodec/mips/idctdsp_init_mips.c
new file mode 100644
index 0000000..85b76ca
--- /dev/null
+++ b/libavcodec/mips/idctdsp_init_mips.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "xvididct_mips.h"
+
+#if HAVE_MSA
+static av_cold void idctdsp_init_msa(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
+        (avctx->bits_per_raw_sample != 10) &&
+        (avctx->bits_per_raw_sample != 12) &&
+        (avctx->idct_algo == FF_IDCT_AUTO)) {
+                c->idct_put = ff_simple_idct_put_msa;
+                c->idct_add = ff_simple_idct_add_msa;
+                c->idct = ff_simple_idct_msa;
+                c->perm_type = FF_IDCT_PERM_NONE;
+    }
+
+    c->put_pixels_clamped = ff_put_pixels_clamped_msa;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_msa;
+    c->add_pixels_clamped = ff_add_pixels_clamped_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void idctdsp_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+    if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
+        (avctx->bits_per_raw_sample != 10) &&
+        (avctx->bits_per_raw_sample != 12) &&
+        ((avctx->idct_algo == FF_IDCT_AUTO) || (avctx->idct_algo == FF_IDCT_SIMPLE))) {
+                c->idct_put = ff_simple_idct_put_8_mmi;
+                c->idct_add = ff_simple_idct_add_8_mmi;
+                c->idct = ff_simple_idct_8_mmi;
+                c->perm_type = FF_IDCT_PERM_NONE;
+    }
+
+    c->put_pixels_clamped = ff_put_pixels_clamped_mmi;
+    c->add_pixels_clamped = ff_add_pixels_clamped_mmi;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth)
+{
+#if HAVE_MMI
+    idctdsp_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    idctdsp_init_msa(c, avctx, high_bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/idctdsp_mips.h b/libavcodec/mips/idctdsp_mips.h
new file mode 100644
index 0000000..829efeb
--- /dev/null
+++ b/libavcodec/mips/idctdsp_mips.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
+#define AVCODEC_MIPS_IDCTDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_put_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_msa(const int16_t *block,
+                                      uint8_t *av_restrict pixels,
+                                      ptrdiff_t line_size);
+void ff_add_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size);
+void ff_j_rev_dct_msa(int16_t *data);
+void ff_jref_idct_put_msa(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_jref_idct_add_msa(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_simple_idct_msa(int16_t *block);
+void ff_simple_idct_put_msa(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+void ff_simple_idct_add_msa(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+
+void ff_put_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_add_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_simple_idct_8_mmi(int16_t *block);
+void ff_simple_idct_put_8_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_8_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+#endif  // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c
new file mode 100644
index 0000000..a96dac4
--- /dev/null
+++ b/libavcodec/mips/idctdsp_mmi.c
@@ -0,0 +1,193 @@
+/*
+ * Loongson SIMD optimized idctdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "constants.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_put_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{
+    double ftmp[8];
+
+    __asm__ volatile (
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        MMI_LDC1(%[ftmp4], %[block], 0x20)
+        MMI_LDC1(%[ftmp5], %[block], 0x28)
+        MMI_LDC1(%[ftmp6], %[block], 0x30)
+        MMI_LDC1(%[ftmp7], %[block], 0x38)
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp0], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp4], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp6], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+
+        MMI_LDC1(%[ftmp0], %[block], 0x40)
+        MMI_LDC1(%[ftmp1], %[block], 0x48)
+        MMI_LDC1(%[ftmp2], %[block], 0x50)
+        MMI_LDC1(%[ftmp3], %[block], 0x58)
+        MMI_LDC1(%[ftmp4], %[block], 0x60)
+        MMI_LDC1(%[ftmp5], %[block], 0x68)
+        MMI_LDC1(%[ftmp6], %[block], 0x70)
+        MMI_LDC1(%[ftmp7], %[block], 0x78)
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp0], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp4], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp6], %[pixels], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [pixels]"+&r"(pixels)
+        : [line_size]"r"((mips_reg)line_size),
+          [block]"r"(block)
+        : "memory"
+    );
+}
+
+void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
+    uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{
+    double ftmp[5];
+
+    __asm__ volatile (
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp0], %[block], 0x08)
+        "packsshb   %[ftmp1],       %[ftmp1],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp0], %[block], 0x18)
+        "packsshb   %[ftmp2],       %[ftmp2],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x20)
+        MMI_LDC1(%[ftmp0], %[block], 0x28)
+        "packsshb   %[ftmp3],       %[ftmp3],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp4], %[block], 0x30)
+        MMI_LDC1(%[ftmp0], %[block], 0x38)
+        "packsshb   %[ftmp4],       %[ftmp4],       %[ftmp0]            \n\t"
+        "paddb      %[ftmp1],       %[ftmp1],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp2],       %[ftmp2],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp3],       %[ftmp3],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp4],       %[ftmp4],       %[ff_pb_80]         \n\t"
+        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+        MMI_SDC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+        MMI_SDC1(%[ftmp3], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+        MMI_SDC1(%[ftmp4], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+
+        MMI_LDC1(%[ftmp1], %[block], 0x40)
+        MMI_LDC1(%[ftmp0], %[block], 0x48)
+        "packsshb   %[ftmp1],       %[ftmp1],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x50)
+        MMI_LDC1(%[ftmp0], %[block], 0x58)
+        "packsshb   %[ftmp2],       %[ftmp2],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x60)
+        MMI_LDC1(%[ftmp0], %[block], 0x68)
+        "packsshb   %[ftmp3],       %[ftmp3],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp4], %[block], 0x70)
+        MMI_LDC1(%[ftmp0], %[block], 0x78)
+        "packsshb   %[ftmp4],       %[ftmp4],       %[ftmp0]            \n\t"
+        "paddb      %[ftmp1],       %[ftmp1],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp2],       %[ftmp2],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp3],       %[ftmp3],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp4],       %[ftmp4],       %[ff_pb_80]         \n\t"
+        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+        MMI_SDC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+        MMI_SDC1(%[ftmp3], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],      %[pixels],      %[line_size]        \n\t"
+        MMI_SDC1(%[ftmp4], %[pixels], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [pixels]"+&r"(pixels)
+        : [block]"r"(block),
+          [line_size]"r"((mips_reg)line_size),
+          [ff_pb_80]"f"(ff_pb_80)
+        : "memory"
+    );
+}
+
+void ff_add_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{
+    double ftmp[9];
+    uint64_t tmp[1];
+    __asm__ volatile (
+        "li         %[tmp0],    0x04                           \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]           \n\t"
+        "1:                                                    \n\t"
+        MMI_LDC1(%[ftmp5], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_LDC1(%[ftmp6], %[pixels], 0x00)
+        PTR_SUBU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp2], %[block], 0x08)
+        MMI_LDC1(%[ftmp3], %[block], 0x10)
+        MMI_LDC1(%[ftmp4], %[block], 0x18)
+        PTR_ADDIU  "%[block],   %[block],   0x20               \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp5],   %[ftmp0]           \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]           \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp6],   %[ftmp0]           \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]           \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp5]           \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp7]           \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp6]           \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp8]           \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp2]           \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp4]           \n\t"
+        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_SDC1(%[ftmp3], %[pixels], 0x00)
+        "addi       %[tmp0],    %[tmp0],    -0x01              \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        "bnez       %[tmp0],    1b                             \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [tmp0]"=&r"(tmp[0]),
+          [pixels]"+&r"(pixels),            [block]"+&r"(block)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/idctdsp_msa.c b/libavcodec/mips/idctdsp_msa.c
new file mode 100644
index 0000000..b29e420
--- /dev/null
+++ b/libavcodec/mips/idctdsp_msa.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "idctdsp_mips.h"
+
+static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                   int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride);
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride);
+}
+
+static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                          int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+    in0 += 128;
+    in1 += 128;
+    in2 += 128;
+    in3 += 128;
+    in4 += 128;
+    in5 += 128;
+    in6 += 128;
+    in7 += 128;
+
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride);
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride);
+}
+
+static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                   int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 pix_in0, pix_in1, pix_in2, pix_in3;
+    v16u8 pix_in4, pix_in5, pix_in6, pix_in7;
+    v8u16 pix0, pix1, pix2, pix3, pix4, pix5, pix6, pix7;
+    v8i16 zero = { 0 };
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    LD_UB8(pixels, stride, pix_in0, pix_in1, pix_in2,
+           pix_in3, pix_in4, pix_in5, pix_in6, pix_in7);
+
+    ILVR_B4_UH(zero, pix_in0, zero, pix_in1, zero, pix_in2, zero, pix_in3,
+               pix0, pix1, pix2, pix3);
+    ILVR_B4_UH(zero, pix_in4, zero, pix_in5, zero, pix_in6, zero, pix_in7,
+               pix4, pix5, pix6, pix7);
+
+    in0 += (v8i16) pix0;
+    in1 += (v8i16) pix1;
+    in2 += (v8i16) pix2;
+    in3 += (v8i16) pix3;
+    in4 += (v8i16) pix4;
+    in5 += (v8i16) pix5;
+    in6 += (v8i16) pix6;
+    in7 += (v8i16) pix7;
+
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride);
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride);
+}
+
+void ff_put_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size)
+{
+    put_pixels_clamped_msa(block, pixels, line_size);
+}
+
+void ff_put_signed_pixels_clamped_msa(const int16_t *block,
+                                      uint8_t *av_restrict pixels,
+                                      ptrdiff_t line_size)
+{
+    put_signed_pixels_clamped_msa(block, pixels, line_size);
+}
+
+void ff_add_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size)
+{
+    add_pixels_clamped_msa(block, pixels, line_size);
+}
diff --git a/libavcodec/mips/iirfilter_mips.c b/libavcodec/mips/iirfilter_mips.c
new file mode 100644
index 0000000..3a1352a
--- /dev/null
+++ b/libavcodec/mips/iirfilter_mips.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * IIR filter optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ /**
+ * @file
+ * Reference: libavcodec/iirfilter.c
+ */
+
+#include "config.h"
+#include "libavcodec/iirfilter.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+typedef struct FFIIRFilterCoeffs {
+    int   order;
+    float gain;
+    int   *cx;
+    float *cy;
+} FFIIRFilterCoeffs;
+
+typedef struct FFIIRFilterState {
+    float x[1];
+} FFIIRFilterState;
+
+static void iir_filter_flt_mips(const struct FFIIRFilterCoeffs *c,
+                                struct FFIIRFilterState *s, int size,
+                                const float *src, ptrdiff_t sstep, float *dst, ptrdiff_t dstep)
+{
+    if (c->order == 2) {
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        for (i = 0; i < size; i++) {
+            float in = *src0 * c->gain  + s->x[0] * c->cy[0] + s->x[1] * c->cy[1];
+            *dst0 = s->x[0] + in + s->x[1] * c->cx[1];
+            s->x[0] = s->x[1];
+            s->x[1] = in;
+            src0 += sstep;
+            dst0 += dstep;
+        }
+    } else if (c->order == 4) {
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        float four = 4.0;
+        float six  = 6.0;
+        for (i = 0; i < size; i += 4) {
+            float in1, in2, in3, in4;
+            float res1, res2, res3, res4;
+            float *x  = s->x;
+            float *cy = c->cy;
+            float gain = c->gain;
+            float src0_0 = src0[0      ];
+            float src0_1 = src0[sstep  ];
+            float src0_2 = src0[2*sstep];
+            float src0_3 = src0[3*sstep];
+
+            __asm__ volatile (
+                "lwc1   $f0,        0(%[cy])                    \n\t"
+                "lwc1   $f4,        0(%[x])                     \n\t"
+                "lwc1   $f5,        4(%[x])                     \n\t"
+                "lwc1   $f6,        8(%[x])                     \n\t"
+                "lwc1   $f7,        12(%[x])                    \n\t"
+                "mul.s  %[in1],     %[src0_0],  %[gain]         \n\t"
+                "mul.s  %[in2],     %[src0_1],  %[gain]         \n\t"
+                "mul.s  %[in3],     %[src0_2],  %[gain]         \n\t"
+                "mul.s  %[in4],     %[src0_3],  %[gain]         \n\t"
+                "lwc1   $f1,        4(%[cy])                    \n\t"
+                "madd.s %[in1],     %[in1],     $f0,    $f4     \n\t"
+                "madd.s %[in2],     %[in2],     $f0,    $f5     \n\t"
+                "madd.s %[in3],     %[in3],     $f0,    $f6     \n\t"
+                "madd.s %[in4],     %[in4],     $f0,    $f7     \n\t"
+                "lwc1   $f2,        8(%[cy])                    \n\t"
+                "madd.s %[in1],     %[in1],     $f1,    $f5     \n\t"
+                "madd.s %[in2],     %[in2],     $f1,    $f6     \n\t"
+                "madd.s %[in3],     %[in3],     $f1,    $f7     \n\t"
+                "lwc1   $f3,        12(%[cy])                   \n\t"
+                "add.s  $f8,        $f5,        $f7             \n\t"
+                "madd.s %[in1],     %[in1],     $f2,    $f6     \n\t"
+                "madd.s %[in2],     %[in2],     $f2,    $f7     \n\t"
+                "mul.s  $f9,        $f6,        %[six]          \n\t"
+                "mul.s  $f10,       $f7,        %[six]          \n\t"
+                "madd.s %[in1],     %[in1],     $f3,    $f7     \n\t"
+                "madd.s %[in2],     %[in2],     $f3,    %[in1]  \n\t"
+                "madd.s %[in3],     %[in3],     $f2,    %[in1]  \n\t"
+                "madd.s %[in4],     %[in4],     $f1,    %[in1]  \n\t"
+                "add.s  %[res1],    $f4,        %[in1]          \n\t"
+                "swc1   %[in1],     0(%[x])                     \n\t"
+                "add.s  $f0,        $f6,        %[in1]          \n\t"
+                "madd.s %[in3],     %[in3],     $f3,    %[in2]  \n\t"
+                "madd.s %[in4],     %[in4],     $f2,    %[in2]  \n\t"
+                "add.s  %[res2],    $f5,        %[in2]          \n\t"
+                "madd.s %[res1],    %[res1],    $f8,    %[four] \n\t"
+                "add.s  $f8,        $f7,        %[in2]          \n\t"
+                "swc1   %[in2],     4(%[x])                     \n\t"
+                "madd.s %[in4],     %[in4],     $f3,    %[in3]  \n\t"
+                "add.s  %[res3],    $f6,        %[in3]          \n\t"
+                "add.s  %[res1],    %[res1],    $f9             \n\t"
+                "madd.s %[res2],    %[res2],    $f0,    %[four] \n\t"
+                "swc1   %[in3],     8(%[x])                     \n\t"
+                "add.s  %[res4],    $f7,        %[in4]          \n\t"
+                "madd.s %[res3],    %[res3],    $f8,    %[four] \n\t"
+                "swc1   %[in4],     12(%[x])                    \n\t"
+                "add.s  %[res2],    %[res2],    $f10            \n\t"
+                "add.s  $f8,        %[in1],     %[in3]          \n\t"
+                "madd.s %[res3],    %[res3],    %[in1], %[six]  \n\t"
+                "madd.s %[res4],    %[res4],    $f8,    %[four] \n\t"
+                "madd.s %[res4],    %[res4],    %[in2], %[six]  \n\t"
+
+                : [in1]"=&f"(in1), [in2]"=&f"(in2),
+                  [in3]"=&f"(in3), [in4]"=&f"(in4),
+                  [res1]"=&f"(res1), [res2]"=&f"(res2),
+                  [res3]"=&f"(res3), [res4]"=&f"(res4)
+                : [src0_0]"f"(src0_0), [src0_1]"f"(src0_1),
+                  [src0_2]"f"(src0_2), [src0_3]"f"(src0_3),
+                  [gain]"f"(gain), [x]"r"(x), [cy]"r"(cy),
+                  [four]"f"(four), [six]"f"(six)
+                : "$f0", "$f1", "$f2", "$f3",
+                  "$f4", "$f5", "$f6", "$f7",
+                  "$f8", "$f9", "$f10",
+                  "memory"
+            );
+
+            dst0[0      ] = res1;
+            dst0[sstep  ] = res2;
+            dst0[2*sstep] = res3;
+            dst0[3*sstep] = res4;
+
+            src0 += 4*sstep;
+            dst0 += 4*dstep;
+        }
+    } else {
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        for (i = 0; i < size; i++) {
+            int j;
+            float in, res;
+            in = *src0 * c->gain;
+            for(j = 0; j < c->order; j++)
+                in += c->cy[j] * s->x[j];
+            res = s->x[0] + in + s->x[c->order >> 1] * c->cx[c->order >> 1];
+            for(j = 1; j < c->order >> 1; j++)
+                res += (s->x[j] + s->x[c->order - j]) * c->cx[j];
+            for(j = 0; j < c->order - 1; j++)
+                s->x[j] = s->x[j + 1];
+            *dst0 = res;
+            s->x[c->order - 1] = in;
+            src0 += sstep;
+            dst0 += dstep;
+        }
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_iir_filter_init_mips(FFIIRFilterContext *f) {
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    f->filter_flt = iir_filter_flt_mips;
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/lsp_mips.h b/libavcodec/mips/lsp_mips.h
new file mode 100644
index 0000000..6219c5a
--- /dev/null
+++ b/libavcodec/mips/lsp_mips.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * LSP routines for ACELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/lsp.c
+ */
+#ifndef AVCODEC_MIPS_LSP_MIPS_H
+#define AVCODEC_MIPS_LSP_MIPS_H
+
+#if HAVE_MIPSFPU && HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+#include "libavutil/mips/asmdefs.h"
+
+static av_always_inline void ff_lsp2polyf_mips(const double *lsp, double *f, int lp_half_order)
+{
+    int i, j = 0;
+    double * p_fi = f;
+    double * p_f = 0;
+
+    f[0] = 1.0;
+    f[1] = -2 * lsp[0];
+    lsp -= 2;
+
+    for(i=2; i<=lp_half_order; i++)
+    {
+        double tmp, f_j_2, f_j_1, f_j;
+        double val = lsp[2*i];
+
+        __asm__ volatile(
+            "move   %[p_f],     %[p_fi]                         \n\t"
+            "add.d  %[val],     %[val],     %[val]              \n\t"
+            PTR_ADDIU "%[p_fi], 8                               \n\t"
+            "ldc1   %[f_j_1],   0(%[p_f])                       \n\t"
+            "ldc1   %[f_j],     8(%[p_f])                       \n\t"
+            "neg.d  %[val],     %[val]                          \n\t"
+            "add.d  %[tmp],     %[f_j_1],   %[f_j_1]            \n\t"
+            "madd.d %[tmp],     %[tmp],     %[f_j], %[val]      \n\t"
+            "addiu  %[j],       %[i], -2                        \n\t"
+            "ldc1   %[f_j_2],   -8(%[p_f])                      \n\t"
+            "sdc1   %[tmp],     16(%[p_f])                      \n\t"
+            "beqz   %[j],       ff_lsp2polyf_lp_j_end%=         \n\t"
+            "ff_lsp2polyf_lp_j%=:                               \n\t"
+            "add.d  %[tmp],     %[f_j],     %[f_j_2]            \n\t"
+            "madd.d %[tmp],     %[tmp],     %[f_j_1], %[val]    \n\t"
+            "mov.d  %[f_j],     %[f_j_1]                        \n\t"
+            "addiu  %[j],       -1                              \n\t"
+            "mov.d  %[f_j_1],   %[f_j_2]                        \n\t"
+            "ldc1   %[f_j_2],   -16(%[p_f])                     \n\t"
+            "sdc1   %[tmp],     8(%[p_f])                       \n\t"
+            PTR_ADDIU "%[p_f], -8                              \n\t"
+            "bgtz   %[j],       ff_lsp2polyf_lp_j%=             \n\t"
+            "ff_lsp2polyf_lp_j_end%=:                           \n\t"
+
+            : [f_j_2]"=&f"(f_j_2), [f_j_1]"=&f"(f_j_1), [val]"+f"(val),
+              [tmp]"=&f"(tmp), [f_j]"=&f"(f_j), [p_f]"+r"(p_f),
+              [j]"+r"(j), [p_fi]"+r"(p_fi)
+            : [i]"r"(i)
+            : "memory"
+        );
+        f[1] += val;
+    }
+}
+#define ff_lsp2polyf ff_lsp2polyf_mips
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU && HAVE_INLINE_ASM */
+#endif /* AVCODEC_MIPS_LSP_MIPS_H */
diff --git a/libavcodec/mips/mathops.h b/libavcodec/mips/mathops.h
index 573d325..bb9dc83 100644
--- a/libavcodec/mips/mathops.h
+++ b/libavcodec/mips/mathops.h
@@ -1,20 +1,21 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,58 +28,39 @@
 
 #if HAVE_INLINE_ASM
 
-#if HAVE_LOONGSON
-#if ARCH_MIPS64
+#if HAVE_LOONGSON3
 
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
+#define MULH MULH
+static inline av_const int MULH(int a, int b)
 {
-    int64_t m;
-    __asm__ ("dmult %2, %3     \n\t"
-             "mflo  %1         \n\t"
-             "daddu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
+    int c;
+    __asm__ ("dmult %1, %2      \n\t"
+             "mflo %0           \n\t"
+             "dsrl %0, %0, 32   \n\t"
+             : "=r"(c)
+             : "r"(a),"r"(b)
              : "hi", "lo");
-    return d;
+    return c;
 }
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
 
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
 {
-    int64_t m;
-    __asm__ ("dmult %2, %3     \n\t"
-             "mflo  %1         \n\t"
-             "dsubu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
-             : "hi", "lo");
-    return d;
-}
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
-
-#else
-
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
-{
-    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
-             "daddu   %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
-    return d;
+    int t = b;
+    __asm__ ("sgt $8, %1, %2    \n\t"
+             "movn %0, %1, $8   \n\t"
+             "movn %1, %2, $8   \n\t"
+             "sgt $8, %1, %3    \n\t"
+             "movz %1, %3, $8   \n\t"
+             "sgt $8, %0, %1    \n\t"
+             "movn %0, %1, $8   \n\t"
+             : "+&r"(t),"+&r"(a)
+             : "r"(b),"r"(c)
+             : "$8");
+    return t;
 }
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
-
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
-{
-    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
-             "dsubu   %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
-    return d;
-}
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
-
-#endif
 
-#endif /* HAVE_LOONGSON */
+#endif /* HAVE_LOONGSON3 */
 
 #endif /* HAVE_INLINE_ASM */
 
diff --git a/libavcodec/mips/me_cmp_init_mips.c b/libavcodec/mips/me_cmp_init_mips.c
new file mode 100644
index 0000000..219a0dc
--- /dev/null
+++ b/libavcodec/mips/me_cmp_init_mips.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "me_cmp_mips.h"
+
+#if HAVE_MSA
+static av_cold void me_cmp_msa(MECmpContext *c, AVCodecContext *avctx)
+{
+#if BIT_DEPTH == 8
+    c->pix_abs[0][0] = ff_pix_abs16_msa;
+    c->pix_abs[0][1] = ff_pix_abs16_x2_msa;
+    c->pix_abs[0][2] = ff_pix_abs16_y2_msa;
+    c->pix_abs[0][3] = ff_pix_abs16_xy2_msa;
+    c->pix_abs[1][0] = ff_pix_abs8_msa;
+    c->pix_abs[1][1] = ff_pix_abs8_x2_msa;
+    c->pix_abs[1][2] = ff_pix_abs8_y2_msa;
+    c->pix_abs[1][3] = ff_pix_abs8_xy2_msa;
+
+    c->hadamard8_diff[0] = ff_hadamard8_diff16_msa;
+    c->hadamard8_diff[1] = ff_hadamard8_diff8x8_msa;
+
+    c->hadamard8_diff[4] = ff_hadamard8_intra16_msa;
+    c->hadamard8_diff[5] = ff_hadamard8_intra8x8_msa;
+
+    c->sad[0] = ff_pix_abs16_msa;
+    c->sad[1] = ff_pix_abs8_msa;
+    c->sse[0] = ff_sse16_msa;
+    c->sse[1] = ff_sse8_msa;
+    c->sse[2] = ff_sse4_msa;
+#endif
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx)
+{
+#if HAVE_MSA
+    me_cmp_msa(c, avctx);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/me_cmp_mips.h b/libavcodec/mips/me_cmp_mips.h
new file mode 100644
index 0000000..e0d0f51
--- /dev/null
+++ b/libavcodec/mips/me_cmp_mips.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
+#define AVCODEC_MIPS_ME_CMP_MIPS_H
+
+#include "../mpegvideo.h"
+#include "libavcodec/bit_depth_template.c"
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h);
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h);
+int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                            ptrdiff_t stride, int h);
+int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h);
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h);
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_sse16_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                 ptrdiff_t stride, int i32Height);
+int ff_sse8_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                ptrdiff_t stride, int i32Height);
+int ff_sse4_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                ptrdiff_t stride, int i32Height);
+void ff_add_pixels8_msa(uint8_t *av_restrict pixels, int16_t *block,
+                        ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
diff --git a/libavcodec/mips/me_cmp_msa.c b/libavcodec/mips/me_cmp_msa.c
new file mode 100644
index 0000000..0e3165c
--- /dev/null
+++ b/libavcodec/mips/me_cmp_msa.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "me_cmp_mips.h"
+
+static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
+                               uint8_t *ref, int32_t ref_stride,
+                               int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                    src0, src1, ref0, ref1);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *ref, int32_t ref_stride,
+                                int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, ref0, ref1;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        ref += (2 * ref_stride);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        ref += (2 * ref_stride);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *ref,
+                                                     int32_t ref_stride,
+                                                     int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
+        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
+        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *ref,
+                                                      int32_t ref_stride,
+                                                      int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
+        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
+        ref += (4 * ref_stride);
+
+        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
+        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
+        ref += (4 * ref_stride);
+
+        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *ref,
+                                                    int32_t ref_stride,
+                                                    int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
+        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
+        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *ref,
+                                                     int32_t ref_stride,
+                                                     int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
+        ref += (5 * ref_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+        ref4 = ref3;
+
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (3 * ref_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *ref,
+                                                  int32_t ref_stride,
+                                                  int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v8u16 comp0, comp1, comp2, comp3;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+
+        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp0 += comp1;
+        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
+        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
+
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
+        comp2 = __msa_hadd_u_h(temp0, temp0);
+        comp1 += comp2;
+        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
+        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
+        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
+        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
+        comp3 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp3;
+        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
+        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
+
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp3 += comp0;
+        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
+        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
+        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
+        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *ref,
+                                                   int32_t ref_stride,
+                                                   int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp, diff;
+    v16u8 temp0, temp1, temp2, temp3;
+    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
+    v8u16 comp0, comp1, comp2, comp3;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
+        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
+        ref += (5 * ref_stride);
+
+        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src0, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src1, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src2, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src3, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
+        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
+        ref += (3 * ref_stride);
+
+        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src0, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src1, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src2, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src3, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+#define CALC_MSE_B(src, ref, var)                                    \
+{                                                                    \
+    v16u8 src_l0_m, src_l1_m;                                        \
+    v8i16 res_l0_m, res_l1_m;                                        \
+                                                                     \
+    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+}
+
+static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
+                               uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height)
+{
+    int32_t ht_cnt;
+    uint32_t sse;
+    uint32_t src0, src1, src2, src3;
+    uint32_t ref0, ref1, ref2, ref3;
+    v16u8 src = { 0 };
+    v16u8 ref = { 0 };
+    v4i32 var = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LW4(src_ptr, src_stride, src0, src1, src2, src3);
+        src_ptr += (4 * src_stride);
+        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+        ref_ptr += (4 * ref_stride);
+
+        INSERT_W4_UB(src0, src1, src2, src3, src);
+        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+        CALC_MSE_B(src, ref, var);
+    }
+
+    sse = HADD_SW_S32(var);
+
+    return sse;
+}
+
+static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
+                               uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height)
+{
+    int32_t ht_cnt;
+    uint32_t sse;
+    v16u8 src0, src1, src2, src3;
+    v16u8 ref0, ref1, ref2, ref3;
+    v4i32 var = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+        src_ptr += (4 * src_stride);
+        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+        ref_ptr += (4 * ref_stride);
+
+        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                    src0, src1, ref0, ref1);
+        CALC_MSE_B(src0, ref0, var);
+        CALC_MSE_B(src1, ref1, var);
+    }
+
+    sse = HADD_SW_S32(var);
+
+    return sse;
+}
+
+static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
+                                uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height)
+{
+    int32_t ht_cnt;
+    uint32_t sse;
+    v16u8 src, ref;
+    v4i32 var = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+    }
+
+    sse = HADD_SW_S32(var);
+
+    return sse;
+}
+
+static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *ref, int32_t ref_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 sum = { 0 };
+    v8i16 zero = { 0 };
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
+               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
+               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
+    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
+    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
+                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
+    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
+    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
+         diff0, diff1, diff2, diff3);
+    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
+    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+    sum += __msa_add_a_h((v8i16) diff0, zero);
+    sum += __msa_add_a_h((v8i16) diff1, zero);
+    sum += __msa_add_a_h((v8i16) diff2, zero);
+    sum += __msa_add_a_h((v8i16) diff3, zero);
+
+    return (HADD_UH_U32(sum));
+}
+
+static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *ref, int32_t ref_stride)
+{
+    int32_t sum_res = 0;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 sum = { 0 };
+    v16i8 zero = { 0 };
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
+                       src0, src1, src2, src3, src4, src5, src6, src7);
+    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+               zero, src4, zero, src5, zero, src6, zero, src7,
+               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
+    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
+    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
+         diff0, diff1, diff2, diff3);
+    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
+    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
+    sum_res = (HADD_UH_U32(sum));
+    sum_res -= abs(temp0[0] + temp4[0]);
+
+    return sum_res;
+}
+
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                     ptrdiff_t stride, int height)
+{
+    return sad_16width_msa(src, stride, ref, stride, height);
+}
+
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                    ptrdiff_t stride, int height)
+{
+    return sad_8width_msa(src, stride, ref, stride, height);
+}
+
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h)
+{
+    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{
+    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{
+    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                 ptrdiff_t stride, int height)
+{
+    return sse_16width_msa(src, stride, ref, stride, height);
+}
+
+int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{
+    return sse_8width_msa(src, stride, ref, stride, height);
+}
+
+int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{
+    return sse_4width_msa(src, stride, ref, stride, height);
+}
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h)
+{
+    return hadamard_diff_8x8_msa(src, stride, dst, stride);
+}
+
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h)
+{
+    return hadamard_intra_8x8_msa(src, stride, dst, stride);
+}
+
+/* Hadamard Transform functions */
+#define WRAPPER8_16_SQ(name8, name16)                      \
+int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
+           ptrdiff_t stride, int h)                        \
+{                                                          \
+    int score = 0;                                         \
+    score += name8(s, dst, src, stride, 8);                \
+    score += name8(s, dst + 8, src + 8, stride, 8);        \
+    if(h == 16) {                                          \
+        dst += 8 * stride;                                 \
+        src += 8 * stride;                                 \
+        score +=name8(s, dst, src, stride, 8);             \
+        score +=name8(s, dst + 8, src + 8, stride, 8);     \
+    }                                                      \
+    return score;                                          \
+}
+
+WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
+WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);
diff --git a/libavcodec/mips/mpegaudiodsp_mips_fixed.c b/libavcodec/mips/mpegaudiodsp_mips_fixed.c
new file mode 100644
index 0000000..1c9c68d
--- /dev/null
+++ b/libavcodec/mips/mpegaudiodsp_mips_fixed.c
@@ -0,0 +1,918 @@
+    /*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * MPEG Audio decoder optimized for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodsp_template.c
+ */
+
+#include <string.h>
+
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+
+static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window,
+                               int *dither_state, int16_t *samples, ptrdiff_t incr)
+{
+    register const int32_t *w, *w2, *p;
+    int j;
+    int16_t *samples2;
+    int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2;
+    int w2_asm, w2_asm1, *p_temp1, *p_temp2;
+    int sum1 = 0;
+    int const min_asm = -32768, max_asm = 32767;
+    int temp1, temp2 = 0, temp3 = 0;
+    int64_t sum;
+
+    /* copy to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+    samples2 = samples + 31 * incr;
+    w = window;
+    w2 = window + 31;
+    sum = *dither_state;
+    p = synth_buf + 16;
+    p_temp1 = synth_buf + 16;
+    p_temp2 = synth_buf + 48;
+    temp1 = sum;
+
+    /**
+    * use of round_sample function from the original code is eliminated,
+    * changed with appropriate assembly instructions.
+    */
+    __asm__ volatile (
+         "mthi   $zero                                                    \n\t"
+         "mtlo   %[temp1]                                                 \n\t"
+         "lw     %[w_asm],  0(%[w])                                       \n\t"
+         "lw     %[p_asm],  0(%[p])                                       \n\t"
+         "lw     %[w_asm1], 64*4(%[w])                                    \n\t"
+         "lw     %[p_asm1], 64*4(%[p])                                    \n\t"
+         "lw     %[w_asm2], 128*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 128*4(%[p])                                   \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "madd   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  192*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  192*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 256*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 256*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 320*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 320*4(%[p])                                   \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "madd   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  384*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  384*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 448*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 448*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 32*4(%[w])                                    \n\t"
+         "lw     %[p_asm2], 32*4(%[p])                                    \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  96*4(%[w])                                    \n\t"
+         "lw     %[p_asm],  96*4(%[p])                                    \n\t"
+         "lw     %[w_asm1], 160*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 160*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 224*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 224*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "msub   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  288*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  288*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 352*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 352*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "lw     %[w_asm],  480*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  480*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 416*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 416*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "msub   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+
+         /*round_sample function from the original code is eliminated,
+          * changed with appropriate assembly instructions
+          * code example:
+
+         "extr.w  %[sum1],$ac0,24                                       \n\t"
+         "mflo %[temp3],  $ac0                                          \n\t"
+         "and  %[temp1],  %[temp3],  0x00ffffff                         \n\t"
+         "slt  %[temp2],  %[sum1],   %[min_asm]                         \n\t"
+         "movn %[sum1],   %[min_asm],%[temp2]                           \n\t"
+         "slt  %[temp2],  %[max_asm],%[sum1]                            \n\t"
+         "movn %[sum1],   %[max_asm],%[temp2]                           \n\t"
+         "sh   %[sum1],   0(%[samples])                                 \n\t"
+         */
+
+         "extr.w %[sum1],   $ac0,       24                                \n\t"
+         "mflo   %[temp3]                                                 \n\t"
+         PTR_ADDIU "%[w],   %[w],       4                                 \n\t"
+         "and    %[temp1],  %[temp3],   0x00ffffff                        \n\t"
+         "slt    %[temp2],  %[sum1],    %[min_asm]                        \n\t"
+         "movn   %[sum1],   %[min_asm], %[temp2]                          \n\t"
+         "slt    %[temp2],  %[max_asm], %[sum1]                           \n\t"
+         "movn   %[sum1],   %[max_asm], %[temp2]                          \n\t"
+         "sh     %[sum1],   0(%[samples])                                 \n\t"
+
+        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2),
+          [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3)
+        : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
+          [max_asm] "r" (max_asm)
+        : "memory", "hi","lo"
+     );
+
+     samples += incr;
+
+    /* we calculate two samples at the same time to avoid one memory
+       access per two sample */
+
+    for(j = 1; j < 16; j++) {
+        __asm__ volatile (
+             "mthi   $0,         $ac1                                      \n\t"
+             "mtlo   $0,         $ac1                                      \n\t"
+             "mthi   $0                                                    \n\t"
+             "mtlo   %[temp1]                                              \n\t"
+             PTR_ADDIU "%[p_temp1], %[p_temp1],    4                       \n\t"
+             "lw     %[w_asm],   0(%[w])                                   \n\t"
+             "lw     %[p_asm],   0(%[p_temp1])                             \n\t"
+             "lw     %[w2_asm],  0(%[w2])                                  \n\t"
+             "lw     %[w_asm1],  64*4(%[w])                                \n\t"
+             "lw     %[p_asm1],  64*4(%[p_temp1])                          \n\t"
+             "lw     %[w2_asm1], 64*4(%[w2])                               \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   128*4(%[w])                               \n\t"
+             "lw     %[p_asm],   128*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  128*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  192*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  192*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 192*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   256*4(%[w])                               \n\t"
+             "lw     %[p_asm],   256*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  256*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  320*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  320*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 320*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   384*4(%[w])                               \n\t"
+             "lw     %[p_asm],   384*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  384*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  448*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  448*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 448*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             PTR_ADDIU "%[p_temp2], %[p_temp2],   -4                      \n\t"
+             "lw     %[w_asm],   32*4(%[w])                                \n\t"
+             "lw     %[p_asm],   0(%[p_temp2])                             \n\t"
+             "lw     %[w2_asm],  32*4(%[w2])                               \n\t"
+             "lw     %[w_asm1],  96*4(%[w])                                \n\t"
+             "lw     %[p_asm1],  64*4(%[p_temp2])                          \n\t"
+             "lw     %[w2_asm1], 96*4(%[w2])                               \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   160*4(%[w])                               \n\t"
+             "lw     %[p_asm],   128*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  160*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  224*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  192*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 224*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   288*4(%[w])                               \n\t"
+             "lw     %[p_asm],   256*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  288*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  352*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  320*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 352*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   416*4(%[w])                               \n\t"
+             "lw     %[p_asm],   384*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  416*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  480*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  448*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 480*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             PTR_ADDIU "%[w],    %[w],             4                       \n\t"
+             PTR_ADDIU "%[w2],   %[w2],            -4                      \n\t"
+             "mflo   %[temp2]                                              \n\t"
+             "extr.w %[sum1],    $ac0,             24                      \n\t"
+             "li     %[temp3],   1                                         \n\t"
+             "and    %[temp1],   %[temp2],         0x00ffffff              \n\t"
+             "madd   $ac1,       %[temp1],         %[temp3]                \n\t"
+             "slt    %[temp2],   %[sum1],          %[min_asm]              \n\t"
+             "movn   %[sum1],    %[min_asm],       %[temp2]                \n\t"
+             "slt    %[temp2],   %[max_asm],       %[sum1]                 \n\t"
+             "movn   %[sum1],    %[max_asm],       %[temp2]                \n\t"
+             "sh     %[sum1],    0(%[samples])                             \n\t"
+             "mflo   %[temp3],   $ac1                                      \n\t"
+             "extr.w %[sum1],    $ac1,             24                      \n\t"
+             "and    %[temp1],   %[temp3],         0x00ffffff              \n\t"
+             "slt    %[temp2],   %[sum1],          %[min_asm]              \n\t"
+             "movn   %[sum1],    %[min_asm],       %[temp2]                \n\t"
+             "slt    %[temp2],   %[max_asm],       %[sum1]                 \n\t"
+             "movn   %[sum1],    %[max_asm],       %[temp2]                \n\t"
+             "sh     %[sum1],    0(%[samples2])                            \n\t"
+
+            : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+              [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1),
+              [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+              [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1),
+              [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples),
+              [samples2] "+r" (samples2), [temp3] "+r" (temp3)
+            : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+        );
+
+        samples += incr;
+        samples2 -= incr;
+    }
+
+    p = synth_buf + 32;
+
+    __asm__ volatile (
+        "mthi   $0                                                        \n\t"
+        "mtlo   %[temp1]                                                  \n\t"
+        "lw     %[w_asm],  32*4(%[w])                                     \n\t"
+        "lw     %[p_asm],  0(%[p])                                        \n\t"
+        "lw     %[w_asm1], 96*4(%[w])                                     \n\t"
+        "lw     %[p_asm1], 64*4(%[p])                                     \n\t"
+        "lw     %[w_asm2], 160*4(%[w])                                    \n\t"
+        "lw     %[p_asm2], 128*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "msub   %[w_asm2], %[p_asm2]                                      \n\t"
+        "lw     %[w_asm],  224*4(%[w])                                    \n\t"
+        "lw     %[p_asm],  192*4(%[p])                                    \n\t"
+        "lw     %[w_asm1], 288*4(%[w])                                    \n\t"
+        "lw     %[p_asm1], 256*4(%[p])                                    \n\t"
+        "lw     %[w_asm2], 352*4(%[w])                                    \n\t"
+        "lw     %[p_asm2], 320*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "msub   %[w_asm2], %[p_asm2]                                      \n\t"
+        "lw     %[w_asm],  416*4(%[w])                                    \n\t"
+        "lw     %[p_asm],  384*4(%[p])                                    \n\t"
+        "lw     %[w_asm1], 480*4(%[w])                                    \n\t"
+        "lw     %[p_asm1], 448*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "extr.w %[sum1],   $ac0,       24                                 \n\t"
+        "mflo   %[temp2]                                                  \n\t"
+        "and    %[temp1],  %[temp2],   0x00ffffff                         \n\t"
+        "slt    %[temp2],  %[sum1],    %[min_asm]                         \n\t"
+        "movn   %[sum1],   %[min_asm], %[temp2]                           \n\t"
+        "slt    %[temp2],  %[max_asm], %[sum1]                            \n\t"
+        "movn   %[sum1],   %[max_asm], %[temp2]                           \n\t"
+        "sh     %[sum1],   0(%[samples])                                  \n\t"
+
+        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1)
+        : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
+          [max_asm] "r" (max_asm)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+     );
+
+    *dither_state= temp1;
+}
+
+static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win)
+{
+    int j;
+    int t0, t1, t2, t3, s0, s1, s2, s3;
+    int tmp[18], *tmp1, *in1;
+    /* temporary variables */
+    int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
+    int t4, t5, t6, t8, t7;
+
+   /* values defined in macros and tables are
+    * eliminated - they are directly loaded in appropriate variables
+    */
+    int const C_1  =  4229717092; /* cos(pi*1/18)*2  */
+    int const C_2  =  4035949074; /* cos(pi*2/18)*2  */
+    int const C_3  =  575416510;  /* -cos(pi*3/18)*2 */
+    int const C_3A =  3719550786; /* cos(pi*3/18)*2  */
+    int const C_4  =  1004831466; /* -cos(pi*4/18)*2 */
+    int const C_5  =  1534215534; /* -cos(pi*5/18)*2 */
+    int const C_7  = -1468965330; /* -cos(pi*7/18)*2 */
+    int const C_8  = -745813244;  /* -cos(pi*8/18)*2 */
+
+   /*
+    * instructions of the first two loops are reorganized and loops are unrolled,
+    * in order to eliminate unnecessary readings and writings in array
+    */
+
+    __asm__ volatile (
+        "lw   %[t1], 17*4(%[in])                                         \n\t"
+        "lw   %[t2], 16*4(%[in])                                         \n\t"
+        "lw   %[t3], 15*4(%[in])                                         \n\t"
+        "lw   %[t4], 14*4(%[in])                                         \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "addu %[t2], %[t2],      %[t3]                                   \n\t"
+        "addu %[t3], %[t3],      %[t4]                                   \n\t"
+        "lw   %[t5], 13*4(%[in])                                         \n\t"
+        "addu %[t1], %[t1],      %[t3]                                   \n\t"
+        "sw   %[t2], 16*4(%[in])                                         \n\t"
+        "lw   %[t6], 12*4(%[in])                                         \n\t"
+        "sw   %[t1], 17*4(%[in])                                         \n\t"
+        "addu %[t4], %[t4],      %[t5]                                   \n\t"
+        "addu %[t5], %[t5],      %[t6]                                   \n\t"
+        "lw   %[t7], 11*4(%[in])                                         \n\t"
+        "addu %[t3], %[t3],      %[t5]                                   \n\t"
+        "sw   %[t4], 14*4(%[in])                                         \n\t"
+        "lw   %[t8], 10*4(%[in])                                         \n\t"
+        "sw   %[t3], 15*4(%[in])                                         \n\t"
+        "addu %[t6], %[t6],      %[t7]                                   \n\t"
+        "addu %[t7], %[t7],      %[t8]                                   \n\t"
+        "sw   %[t6], 12*4(%[in])                                         \n\t"
+        "addu %[t5], %[t5],      %[t7]                                   \n\t"
+        "lw   %[t1], 9*4(%[in])                                          \n\t"
+        "lw   %[t2], 8*4(%[in])                                          \n\t"
+        "sw   %[t5], 13*4(%[in])                                         \n\t"
+        "addu %[t8], %[t8],      %[t1]                                   \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "sw   %[t8], 10*4(%[in])                                         \n\t"
+        "addu %[t7], %[t7],      %[t1]                                   \n\t"
+        "lw   %[t3], 7*4(%[in])                                          \n\t"
+        "lw   %[t4], 6*4(%[in])                                          \n\t"
+        "sw   %[t7], 11*4(%[in])                                         \n\t"
+        "addu %[t2], %[t2],      %[t3]                                   \n\t"
+        "addu %[t3], %[t3],      %[t4]                                   \n\t"
+        "sw   %[t2], 8*4(%[in])                                          \n\t"
+        "addu %[t1], %[t1],      %[t3]                                   \n\t"
+        "lw   %[t5], 5*4(%[in])                                          \n\t"
+        "lw   %[t6], 4*4(%[in])                                          \n\t"
+        "sw   %[t1], 9*4(%[in])                                          \n\t"
+        "addu %[t4], %[t4],      %[t5]                                   \n\t"
+        "addu %[t5], %[t5],      %[t6]                                   \n\t"
+        "sw   %[t4], 6*4(%[in])                                          \n\t"
+        "addu %[t3], %[t3],      %[t5]                                   \n\t"
+        "lw   %[t7], 3*4(%[in])                                          \n\t"
+        "lw   %[t8], 2*4(%[in])                                          \n\t"
+        "sw   %[t3], 7*4(%[in])                                          \n\t"
+        "addu %[t6], %[t6],      %[t7]                                   \n\t"
+        "addu %[t7], %[t7],      %[t8]                                   \n\t"
+        "sw   %[t6], 4*4(%[in])                                          \n\t"
+        "addu %[t5], %[t5],      %[t7]                                   \n\t"
+        "lw   %[t1], 1*4(%[in])                                          \n\t"
+        "lw   %[t2], 0*4(%[in])                                          \n\t"
+        "sw   %[t5], 5*4(%[in])                                          \n\t"
+        "addu %[t8], %[t8],      %[t1]                                   \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "sw   %[t8], 2*4(%[in])                                          \n\t"
+        "addu %[t7], %[t7],      %[t1]                                   \n\t"
+        "sw   %[t7], 3*4(%[in])                                          \n\t"
+        "sw   %[t1], 1*4(%[in])                                          \n\t"
+
+        : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
+          [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6),
+          [t7] "=&r" (t7), [t8] "=&r" (t8)
+        :
+        : "memory"
+    );
+
+    for(j = 0; j < 2; j++) {
+
+        tmp1 = tmp + j;
+        in1 = in + j;
+
+         /**
+         *  Original constants are multiplied by two in advanced
+         *  for assembly optimization (e.g. C_2 = 2 * C2).
+         *  That can lead to overflow in operations where they are used.
+         *
+         *  Example of the solution:
+         *
+         *  in original code:
+         *  t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2))>>32
+         *
+         *  in assembly:
+         *  C_2 = 2 * C2;
+         *   .
+         *   .
+         *  "lw   %[t7],       4*4(%[in1])                               \n\t"
+         *  "lw   %[t8],       8*4(%[in1])                               \n\t"
+         *  "addu %[temp_reg2],%[t7],       %[t8]                        \n\t"
+         *  "multu %[C_2],     %[temp_reg2]                              \n\t"
+         *  "mfhi %[temp_reg1]                                           \n\t"
+         *  "sra  %[temp_reg2],%[temp_reg2],31                           \n\t"
+         *  "move %[t0],       $0                                        \n\t"
+         *  "movn %[t0],       %[C_2],      %[temp_reg2]                 \n\t"
+         *  "sub  %[t0],       %[temp_reg1],%[t0]                        \n\t"
+         */
+
+        __asm__ volatile (
+            "lw    %[t7],        4*4(%[in1])                               \n\t"
+            "lw    %[t8],        8*4(%[in1])                               \n\t"
+            "lw    %[t6],        16*4(%[in1])                              \n\t"
+            "lw    %[t4],        0*4(%[in1])                               \n\t"
+            "addu  %[temp_reg2], %[t7],        %[t8]                       \n\t"
+            "addu  %[t2],        %[t6],        %[t8]                       \n\t"
+            "multu %[C_2],       %[temp_reg2]                              \n\t"
+            "lw    %[t5],        12*4(%[in1])                              \n\t"
+            "sub   %[t2],        %[t2],        %[t7]                       \n\t"
+            "sub   %[t1],        %[t4],        %[t5]                       \n\t"
+            "sra   %[t3],        %[t5],        1                           \n\t"
+            "sra   %[temp_reg1], %[t2],        1                           \n\t"
+            "addu  %[t3],        %[t3],        %[t4]                       \n\t"
+            "sub   %[temp_reg1], %[t1],        %[temp_reg1]                \n\t"
+            "sra   %[temp_reg2], %[temp_reg2], 31                          \n\t"
+            "sw    %[temp_reg1], 6*4(%[tmp1])                              \n\t"
+            "move  %[t0],        $0                                        \n\t"
+            "movn  %[t0],        %[C_2],       %[temp_reg2]                \n\t"
+            "mfhi  %[temp_reg1]                                            \n\t"
+            "addu  %[t1],        %[t1],        %[t2]                       \n\t"
+            "sw    %[t1],        16*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg4], %[t8],        %[t6]                       \n\t"
+            "add   %[temp_reg2], %[t7],        %[t6]                       \n\t"
+            "mult  $ac1,         %[C_8],       %[temp_reg4]                \n\t"
+            "multu $ac2,         %[C_4],       %[temp_reg2]                \n\t"
+            "sub   %[t0],        %[temp_reg1], %[t0]                       \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "move  %[t2],        $0                                        \n\t"
+            "movn  %[t2],        %[C_4],       %[temp_reg1]                \n\t"
+            "mfhi  %[t1],        $ac1                                      \n\t"
+            "mfhi  %[temp_reg1], $ac2                                      \n\t"
+            "lw    %[t6],        10*4(%[in1])                              \n\t"
+            "lw    %[t8],        14*4(%[in1])                              \n\t"
+            "lw    %[t7],        2*4(%[in1])                               \n\t"
+            "lw    %[t4],        6*4(%[in1])                               \n\t"
+            "sub   %[temp_reg3], %[t3],        %[t0]                       \n\t"
+            "add   %[temp_reg4], %[t3],        %[t0]                       \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "add   %[temp_reg4], %[temp_reg4], %[t1]                       \n\t"
+            "sub   %[t2],        %[temp_reg1], %[t2]                       \n\t"
+            "sw    %[temp_reg4], 2*4(%[tmp1])                              \n\t"
+            "sub   %[temp_reg3], %[temp_reg3], %[t2]                       \n\t"
+            "add   %[temp_reg1], %[t3],        %[t2]                       \n\t"
+            "sw    %[temp_reg3], 10*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[t1]                       \n\t"
+            "addu  %[temp_reg2], %[t6],        %[t8]                       \n\t"
+            "sw    %[temp_reg1], 14*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg2], %[temp_reg2], %[t7]                       \n\t"
+            "addu  %[temp_reg3], %[t7],        %[t6]                       \n\t"
+            "multu $ac3,         %[C_3],       %[temp_reg2]                \n\t"
+            "multu %[C_1],       %[temp_reg3]                              \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "move  %[t1],        $0                                        \n\t"
+            "sra   %[temp_reg3], %[temp_reg3], 31                          \n\t"
+            "movn  %[t1],        %[C_3],       %[temp_reg1]                \n\t"
+            "mfhi  %[temp_reg1], $ac3                                      \n\t"
+            "mfhi  %[temp_reg4]                                            \n\t"
+            "move  %[t2],        $0                                        \n\t"
+            "movn  %[t2],        %[C_1],       %[temp_reg3]                \n\t"
+            "sub   %[temp_reg3], %[t6],        %[t8]                       \n\t"
+            "sub   %[t2],        %[temp_reg4], %[t2]                       \n\t"
+            "multu $ac1,         %[C_7],       %[temp_reg3]                \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "sra   %[temp_reg4], %[temp_reg3], 31                          \n\t"
+            "sub   %[t1],        %[temp_reg1], %[t1]                       \n\t"
+            "move  %[t3],        $0                                        \n\t"
+            "sw    %[t1],        4*4(%[tmp1])                              \n\t"
+            "movn  %[t3],        %[C_7],       %[temp_reg4]                \n\t"
+            "multu $ac2,         %[C_3A],      %[t4]                       \n\t"
+            "add   %[temp_reg2], %[t7],        %[t8]                       \n\t"
+            "move  %[t1],        $0                                        \n\t"
+            "mfhi  %[temp_reg4], $ac1                                      \n\t"
+            "multu $ac3,%[C_5],  %[temp_reg2]                              \n\t"
+            "move  %[t0],        $0                                        \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "movn  %[t1],%[C_5], %[temp_reg1]                              \n\t"
+            "sub   %[temp_reg4], %[temp_reg4], %[temp_reg3]                \n\t"
+            "mfhi  %[temp_reg1], $ac3                                      \n\t"
+            "sra   %[temp_reg3], %[t4],        31                          \n\t"
+            "movn  %[t0],        %[C_3A],      %[temp_reg3]                \n\t"
+            "mfhi  %[temp_reg3], $ac2                                      \n\t"
+            "sub   %[t3],        %[temp_reg4], %[t3]                       \n\t"
+            "add   %[temp_reg4], %[t3],        %[t2]                       \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "sub   %[t1],        %[temp_reg1], %[t1]                       \n\t"
+            "sub   %[t0],        %[temp_reg3], %[t0]                       \n\t"
+            "add   %[temp_reg1], %[t2],        %[t1]                       \n\t"
+            "add   %[temp_reg4], %[temp_reg4], %[t0]                       \n\t"
+            "sub   %[temp_reg2], %[t3],        %[t1]                       \n\t"
+            "sw    %[temp_reg4], 0*4(%[tmp1])                              \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[t0]                       \n\t"
+            "sub   %[temp_reg2], %[temp_reg2], %[t0]                       \n\t"
+            "sw    %[temp_reg1], 12*4(%[tmp1])                             \n\t"
+            "sw    %[temp_reg2], 8*4(%[tmp1])                              \n\t"
+
+            : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1),
+              [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4),
+              [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0),
+              [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r"(t6), [t2] "=&r" (t2),
+              [t3] "=&r" (t3), [t1] "=&r" (t1)
+            : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8),
+              [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7),
+              [C_3A] "r" (C_3A), [C_5] "r" (C_5)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+         );
+    }
+
+    /**
+    * loop is unrolled four times
+    *
+    * values defined in tables(icos36[] and icos36h[]) are not loaded from
+    * these tables - they are directly loaded in appropriate registers
+    *
+    */
+
+    __asm__ volatile (
+        "lw     %[t2],        1*4(%[tmp])                                  \n\t"
+        "lw     %[t3],        3*4(%[tmp])                                  \n\t"
+        "lw     %[t0],        0*4(%[tmp])                                  \n\t"
+        "lw     %[t1],        2*4(%[tmp])                                  \n\t"
+        "addu   %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0x807D2B1E                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg2], %[temp_reg1]                                 \n\t"
+        "sra    %[temp_reg1], %[temp_reg1], 31                             \n\t"
+        "movn   %[s1],        %[temp_reg2], %[temp_reg1]                   \n\t"
+        "sub    %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x2de5151                                    \n\t"
+        "mfhi   %[temp_reg2]                                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg5], 9*4(%[win])                                  \n\t"
+        "mult   $ac1,         %[temp_reg4], %[temp_reg3]                   \n\t"
+        "lw     %[temp_reg6], 4*9*4(%[buf])                                \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg3], 29*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg2], %[s1]                          \n\t"
+        "lw     %[temp_reg4], 28*4(%[win])                                 \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "extr.w %[s3],        $ac1,23                                      \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg3]                   \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "lw     %[temp_reg1], 4*8*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg5]                                 \n\t"
+        "lw     %[temp_reg2], 8*4(%[win])                                  \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg4]                   \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "sw     %[temp_reg3], 4*9*4(%[buf])                                \n\t"
+        "mfhi   %[temp_reg4], $ac3                                         \n\t"
+        "lw     %[temp_reg3], 37*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "lw     %[temp_reg6], 17*4(%[win])                                 \n\t"
+        "sw     %[temp_reg5], 32*9*4(%[out])                               \n\t"
+        "sw     %[temp_reg4], 4*8*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg6]                                 \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg2], 0*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 4*17*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 8*32*4(%[out])                               \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg4], 20*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 0(%[buf])                                    \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg3]                   \n\t"
+        "mult   %[t0],        %[temp_reg4]                                 \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "lw     %[t0],        4*4(%[tmp])                                  \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg4]                                               \n\t"
+        "sw     %[temp_reg5], 17*32*4(%[out])                              \n\t"
+        "lw     %[t1],        6*4(%[tmp])                                  \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[t2],        5*4(%[tmp])                                  \n\t"
+        "sw     %[temp_reg1], 0*32*4(%[out])                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg3], 4*17*4(%[buf])                               \n\t"
+        "lw     %[t3],        7*4(%[tmp])                                  \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg4], 0(%[buf])                                    \n\t"
+        "addu   %[temp_reg5], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg6], 0x8483EE0C                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg6], %[temp_reg5]                                 \n\t"
+        "sub    %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0xf746ea                                     \n\t"
+        "sra    %[temp_reg5], %[temp_reg5], 31                             \n\t"
+        "mult   $ac1,         %[temp_reg2], %[temp_reg1]                   \n\t"
+        "movn   %[s1],        %[temp_reg6], %[temp_reg5]                   \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "lw     %[temp_reg3], 10*4(%[win])                                 \n\t"
+        "lw     %[temp_reg4], 4*10*4(%[buf])                               \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg1], 4*7*4(%[buf])                                \n\t"
+        "lw     %[temp_reg2], 7*4(%[win])                                  \n\t"
+        "lw     %[temp_reg6], 30*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg5], %[s1]                          \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg3]                   \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   %[t0],        %[temp_reg6]                                 \n\t"
+        "lw     %[temp_reg5], 27*4(%[win])                                 \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg5]                   \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg2], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[temp_reg4], 16*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg5], $ac1                                         \n\t"
+        "sw     %[temp_reg3], 32*10*4(%[out])                              \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg3], 4*16*4(%[buf])                               \n\t"
+        "sw     %[temp_reg6], 4*10*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 7*32*4(%[out])                               \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg4]                   \n\t"
+        "sw     %[temp_reg5], 4*7*4(%[buf])                                \n\t"
+        "lw     %[temp_reg6], 1*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 4*1*4(%[buf])                                \n\t"
+        "lw     %[temp_reg1], 36*4(%[win])                                 \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg6]                   \n\t"
+        "lw     %[temp_reg2], 21*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg4], $ac2                                         \n\t"
+        "mult   %[t0],        %[temp_reg1]                                 \n\t"
+        "mult   $ac1,         %[t0],%[temp_reg2]                           \n\t"
+        "lw     %[t0],        8*4(%[tmp])                                  \n\t"
+        "mfhi   %[temp_reg6], $ac3                                         \n\t"
+        "lw     %[t1],        10*4(%[tmp])                                 \n\t"
+        "lw     %[t3],        11*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg1]                                               \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[t2],        9*4(%[tmp])                                  \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "sw     %[temp_reg3], 16*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg5], 1*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg1], 4*16*4(%[buf])                               \n\t"
+        "addu   %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x8D3B7CD6                                   \n\t"
+        "sw     %[temp_reg2], 4*1*4(%[buf])                                \n\t"
+        "multu  %[temp_reg4],%[temp_reg3]                                  \n\t"
+        "sra    %[temp_reg3], %[temp_reg3], 31                             \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "movn   %[s1],        %[temp_reg4], %[temp_reg3]                   \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "mfhi   %[temp_reg3]                                               \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "sub    %[temp_reg5], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg6], 0x976fd9                                     \n\t"
+        "lw     %[temp_reg2], 11*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 4*11*4(%[buf])                               \n\t"
+        "mult   $ac1,         %[temp_reg6], %[temp_reg5]                   \n\t"
+        "subu   %[s1],        %[temp_reg3], %[s1]                          \n\t"
+        "lw     %[temp_reg5], 31*4(%[win])                                 \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   %[t0],        %[temp_reg5]                                 \n\t"
+        "lw     %[temp_reg4], 6*4(%[win])                                  \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg3], 4*6*4(%[buf])                                \n\t"
+        "mfhi   %[temp_reg2], $ac2                                         \n\t"
+        "lw     %[temp_reg6], 26*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg4]                   \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg6]                   \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg2], %[temp_reg2], %[temp_reg1]                   \n\t"
+        "mfhi   %[temp_reg4], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6], $ac1                                         \n\t"
+        "sw     %[temp_reg5], 4*11*4(%[buf])                               \n\t"
+        "sw     %[temp_reg2], 32*11*4(%[out])                              \n\t"
+        "lw     %[temp_reg1], 4*15*4(%[buf])                               \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[temp_reg2], 15*4(%[win])                                 \n\t"
+        "sw     %[temp_reg3], 6*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg6], 4*6*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg2]                                 \n\t"
+        "lw     %[temp_reg3], 2*4(%[win])                                  \n\t"
+        "lw     %[temp_reg4], 4*2*4(%[buf])                                \n\t"
+        "lw     %[temp_reg5], 35*4(%[win])                                 \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg3]                   \n\t"
+        "mfhi   %[temp_reg2]                                               \n\t"
+        "lw     %[temp_reg6], 22*4(%[win])                                 \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg5]                   \n\t"
+        "lw     %[t1],        14*4(%[tmp])                                 \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg6]                   \n\t"
+        "lw     %[t0],        12*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg3], $ac1                                         \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "mfhi   %[temp_reg5], $ac2                                         \n\t"
+        "sw     %[temp_reg1], 15*32*4(%[out])                              \n\t"
+        "mfhi   %[temp_reg6], $ac3                                         \n\t"
+        "lw     %[t2],        13*4(%[tmp])                                 \n\t"
+        "lw     %[t3],        15*4(%[tmp])                                 \n\t"
+        "add    %[temp_reg4], %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sw     %[temp_reg5], 4*15*4(%[buf])                               \n\t"
+        "addu   %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0x9C42577C                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg2], %[temp_reg1]                                 \n\t"
+        "sw     %[temp_reg4], 2*32*4(%[out])                               \n\t"
+        "sra    %[temp_reg1], %[temp_reg1], 31                             \n\t"
+        "movn   %[s1],        %[temp_reg2], %[temp_reg1]                   \n\t"
+        "sub    %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x6f94a2                                     \n\t"
+        "mfhi   %[temp_reg1]                                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg6], 4*2*4(%[buf])                                \n\t"
+        "mult   $ac1,         %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg5], 12*4(%[win])                                 \n\t"
+        "lw     %[temp_reg6], 4*12*4(%[buf])                               \n\t"
+        "subu   %[s1],        %[temp_reg1], %[s1]                          \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "lw     %[temp_reg3], 32*4(%[win])                                 \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg5]                   \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg2], 5*4(%[win])                                  \n\t"
+        "mult   %[t0],        %[temp_reg3]                                 \n\t"
+        "mfhi   %[temp_reg5], $ac2                                         \n\t"
+        "lw     %[temp_reg4], 25*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 4*5*4(%[buf])                                \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg4]                   \n\t"
+        "mfhi   %[temp_reg3]                                               \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg2], $ac3                                         \n\t"
+        "mfhi   %[temp_reg4], $ac1                                         \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "sw     %[temp_reg5], 32*12*4(%[out])                              \n\t"
+        "sw     %[temp_reg3], 4*12*4(%[buf])                               \n\t"
+        "lw     %[temp_reg6], 14*4(%[win])                                 \n\t"
+        "lw     %[temp_reg5], 4*14*4(%[buf])                               \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "sw     %[temp_reg4], 4*5*4(%[buf])                                \n\t"
+        "sw     %[temp_reg1], 5*32*4(%[out])                               \n\t"
+        "mult   %[t1],        %[temp_reg6]                                 \n\t"
+        "lw     %[temp_reg4], 34*4(%[win])                                 \n\t"
+        "lw     %[temp_reg2], 3*4(%[win])                                  \n\t"
+        "lw     %[temp_reg1], 4*3*4(%[buf])                                \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg4]                   \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg3], 23*4(%[win])                                 \n\t"
+        "lw     %[s0],        16*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg4], $ac2                                         \n\t"
+        "lw     %[t1],        17*4(%[tmp])                                 \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg3]                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "sw     %[temp_reg5], 14*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg4], 4*14*4(%[buf])                               \n\t"
+        "mfhi   %[temp_reg3], $ac3                                         \n\t"
+        "li     %[temp_reg5], 0xB504F334                                   \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "multu  %[temp_reg5], %[t1]                                        \n\t"
+        "lw     %[temp_reg2], 4*13*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 3*32*4(%[out])                               \n\t"
+        "sra    %[t1],        %[t1],        31                             \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "movn   %[s1],        %[temp_reg5], %[t1]                          \n\t"
+        "sw     %[temp_reg3], 4*3*4(%[buf])                                \n\t"
+        "lw     %[temp_reg1], 13*4(%[win])                                 \n\t"
+        "lw     %[temp_reg4], 4*4*4(%[buf])                                \n\t"
+        "lw     %[temp_reg3], 4*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 33*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg6], %[s1]                          \n\t"
+        "lw     %[temp_reg6], 24*4(%[win])                                 \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg1]                   \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg3]                   \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg5]                   \n\t"
+        "mult   %[t0],        %[temp_reg6]                                 \n\t"
+        "mfhi   %[temp_reg1], $ac1                                         \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg5], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "add    %[temp_reg2], %[temp_reg2], %[temp_reg1]                   \n\t"
+        "add    %[temp_reg4], %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sw     %[temp_reg2], 13*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg4], 4*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg5], 4*13*4(%[buf])                               \n\t"
+        "sw     %[temp_reg6], 4*4*4(%[buf])                                \n\t"
+
+        : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
+          [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1),
+          [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3),
+          [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
+          [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6),
+          [out] "+r" (out)
+        : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+          "$ac3hi", "$ac3lo"
+    );
+}
+
+static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in,
+                               int count, int switch_point, int block_type)
+{
+    int j;
+    for (j=0 ; j < count; j++) {
+        /* apply window & overlap with previous buffer */
+
+        /* select window */
+        int win_idx = (switch_point && j < 2) ? 0 : block_type;
+        int *win = ff_mdct_win_fixed[win_idx + (4 & -(j & 1))];
+
+        imdct36_mips_fixed(out, buf, in, win);
+
+        in  += 18;
+        buf += ((j&3) != 3 ? 1 : (72-3));
+        out++;
+    }
+}
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_mpadsp_init_mipsdsp(MPADSPContext *s)
+{
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->apply_window_fixed   = ff_mpadsp_apply_window_mips_fixed;
+    s->imdct36_blocks_fixed = ff_imdct36_blocks_mips_fixed;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/mpegaudiodsp_mips_float.c b/libavcodec/mips/mpegaudiodsp_mips_float.c
new file mode 100644
index 0000000..481b69c
--- /dev/null
+++ b/libavcodec/mips/mpegaudiodsp_mips_float.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * MPEG Audio decoder optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodsp_template.c
+ *            libavcodec/dct32.c
+ */
+
+#include <string.h>
+
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+
+static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window,
+                               int *dither_state, float *samples, ptrdiff_t incr)
+{
+    register const float *w, *w2, *p;
+    int j;
+    float *samples2;
+    float sum, sum2;
+    /* temporary variables */
+    int incr1 = incr << 2;
+    int t_sample;
+    float in1, in2, in3, in4, in5, in6, in7, in8;
+    float *p2;
+
+    /* copy to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    * use of round_sample function from the original code is
+    * changed with appropriate assembly instructions.
+    */
+
+    __asm__ volatile (
+        "lwc1    %[sum],      0(%[dither_state])                            \t\n"
+        "sll     %[t_sample], %[incr1],     5                               \t\n"
+        "sub     %[t_sample], %[t_sample],  %[incr1]                        \n\t"
+        "li      %[j],        4                                             \t\n"
+        "lwc1    %[in1],      0(%[window])                                  \t\n"
+        "lwc1    %[in2],      16*4(%[synth_buf])                            \t\n"
+        "sw      $zero,       0(%[dither_state])                            \t\n"
+        "lwc1    %[in3],      64*4(%[window])                               \t\n"
+        "lwc1    %[in4],      80*4(%[synth_buf])                            \t\n"
+        PTR_ADDU "%[samples2],%[samples],   %[t_sample]                     \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      128*4(%[window])                              \t\n"
+        "lwc1    %[in6],      144*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in7],      192*4(%[window])                              \t\n"
+        "madd.s  %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in8],      208*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in1],      256*4(%[window])                              \t\n"
+        "lwc1    %[in2],      272*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      320*4(%[window])                              \t\n"
+        "lwc1    %[in4],      336*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in5],      384*4(%[window])                              \t\n"
+        "madd.s  %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in6],      400*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in7],      448*4(%[window])                              \t\n"
+        "lwc1    %[in8],      464*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in1],      32*4(%[window])                               \t\n"
+        "lwc1    %[in2],      48*4(%[synth_buf])                            \t\n"
+        "madd.s  %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in3],      96*4(%[window])                               \t\n"
+        "lwc1    %[in4],      112*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in5],      160*4(%[window])                              \t\n"
+        "lwc1    %[in6],      176*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in7],      224*4(%[window])                              \t\n"
+        "lwc1    %[in8],      240*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in1],      288*4(%[window])                              \t\n"
+        "lwc1    %[in2],      304*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in3],      352*4(%[window])                              \t\n"
+        "lwc1    %[in4],      368*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in5],      416*4(%[window])                              \t\n"
+        "lwc1    %[in6],      432*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in7],      480*4(%[window])                              \t\n"
+        "lwc1    %[in8],      496*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        PTR_ADDU "%[w],       %[window],    4                               \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        PTR_ADDU "%[w2],      %[window],    124                             \t\n"
+        PTR_ADDIU "%[p],      %[synth_buf], 68                              \t\n"
+        PTR_ADDIU "%[p2],     %[synth_buf], 188                             \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+        PTR_ADDU "%[samples], %[samples],   %[incr1]                        \t\n"
+
+        /* calculate two samples at the same time to avoid one memory
+           access per two sample */
+
+        "ff_mpadsp_apply_window_loop%=:                                     \t\n"
+        "lwc1    %[in1],      0(%[w])                                       \t\n"
+        "lwc1    %[in2],      0(%[p])                                       \t\n"
+        "lwc1    %[in3],      0(%[w2])                                      \t\n"
+        "lwc1    %[in4],      64*4(%[w])                                    \t\n"
+        "lwc1    %[in5],      64*4(%[p])                                    \t\n"
+        "lwc1    %[in6],      64*4(%[w2])                                   \t\n"
+        "mul.s   %[sum],      %[in1],       %[in2]                          \t\n"
+        "mul.s   %[sum2],     %[in2],       %[in3]                          \t\n"
+        "lwc1    %[in1],      128*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      128*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmadd.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      128*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      192*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      192*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      192*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      256*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      256*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      256*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      320*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      320*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      320*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      384*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      384*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      384*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      448*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      448*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      448*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in1],      32*4(%[w])                                    \t\n"
+        "lwc1    %[in2],      0(%[p2])                                      \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      32*4(%[w2])                                   \t\n"
+        "lwc1    %[in4],      96*4(%[w])                                    \t\n"
+        "lwc1    %[in5],      64*4(%[p2])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      96*4(%[w2])                                   \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      160*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      128*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      160*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      224*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      192*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      224*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      288*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      256*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      288*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      352*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      320*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      352*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      416*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      384*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      416*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      480*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      448*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      480*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        PTR_ADDIU "%[w],      %[w],         4                               \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        PTR_ADDIU "%[w2],     %[w2],        -4                              \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "addu    %[j],        %[j],         4                               \t\n"
+        PTR_ADDIU "%[p],      4                                             \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+        PTR_ADDIU "%[p2],     -4                                            \t\n"
+        "swc1    %[sum2],     0(%[samples2])                                \t\n"
+        PTR_ADDU "%[samples], %[samples],   %[incr1]                        \t\n"
+        PTR_SUBU "%[samples2],%[samples2],  %[incr1]                        \t\n"
+        "bne     %[j],        64,           ff_mpadsp_apply_window_loop%=   \t\n"
+
+        "lwc1    %[in1],      48*4(%[window])                               \t\n"
+        "lwc1    %[in2],      32*4(%[synth_buf])                            \t\n"
+        "lwc1    %[in3],      112*4(%[window])                              \t\n"
+        "lwc1    %[in4],      96*4(%[synth_buf])                            \t\n"
+        "lwc1    %[in5],      176*4(%[window])                              \t\n"
+        "lwc1    %[in6],      160*4(%[synth_buf])                           \t\n"
+        "mul.s   %[sum],      %[in1],       %[in2]                          \t\n"
+        "lwc1    %[in7],      240*4(%[window])                              \t\n"
+        "lwc1    %[in8],      224*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in1],      304*4(%[window])                              \t\n"
+        "nmadd.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in2],      288*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in3],      368*4(%[window])                              \t\n"
+        "lwc1    %[in4],      352*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in5],      432*4(%[window])                              \t\n"
+        "lwc1    %[in6],      416*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in7],      496*4(%[window])                              \t\n"
+        "lwc1    %[in8],      480*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+
+        : [sum] "=&f" (sum), [sum2] "=&f" (sum2),
+          [w2] "=&r" (w2),   [w] "=&r" (w),
+          [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j),
+          [samples] "+r" (samples), [samples2] "=&r" (samples2),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [in7] "=&f" (in7), [in8] "=&f" (in8),
+          [t_sample] "=&r" (t_sample)
+        : [synth_buf] "r" (synth_buf), [window] "r" (window),
+          [dither_state] "r" (dither_state), [incr1] "r" (incr1)
+        : "memory"
+    );
+}
+
+static void ff_dct32_mips_float(float *out, const float *tab)
+{
+    float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7,
+          val8 , val9 , val10, val11, val12, val13, val14, val15,
+          val16, val17, val18, val19, val20, val21, val22, val23,
+          val24, val25, val26, val27, val28, val29, val30, val31;
+    float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8,
+          fTmp9, fTmp10, fTmp11;
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    */
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       0*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       31*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       15*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       16*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.50241928618815570551                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.50060299823519630134                  \n\t"
+        "li.s       %[fTmp11],      10.19000812354805681150                 \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val0],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val15],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       7*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       24*4(%[tab])                            \n\t"
+        "madd.s     %[val16],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val31],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val15],       %[val15],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       8*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp4],       23*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val31],       %[val31],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       5.10114861868916385802                  \n\t"
+        "li.s       %[fTmp10],      0.67480834145500574602                  \n\t"
+        "li.s       %[fTmp11],      0.74453627100229844977                  \n\t"
+        "add.s      %[val7],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val8],        %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.50979557910415916894                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val7]                 \n\t"
+        "mul.s      %[val8],        %[val8],        %[fTmp7]                \n\t"
+        "madd.s     %[val23],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val24],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "add.s      %[val0],        %[val0],        %[val7]                 \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val8]                 \n\t"
+        "add.s      %[val8],        %[val15],       %[val8]                 \n\t"
+        "mul.s      %[val24],       %[val24],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val16],       %[val23]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val23]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp4],       %[val31],       %[val24]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp3]                \n\t"
+        "add.s      %[val24],       %[val31],       %[val24]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp4]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val0]  "=f" (val0),  [val7]  "=f" (val7),
+          [val8]  "=f" (val8),  [val15] "=f" (val15),
+          [val16] "=f" (val16), [val23] "=f" (val23),
+          [val24] "=f" (val24), [val31] "=f" (val31)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       3*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       28*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       12*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       19*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.64682178335999012954                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.53104259108978417447                  \n\t"
+        "li.s       %[fTmp11],      1.48416461631416627724                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val3],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val12],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       4*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       27*4(%[tab])                            \n\t"
+        "madd.s     %[val19],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val28],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val12],       %[val12],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       11*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       20*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val28],       %[val28],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "li.s       %[fTmp7],       0.78815462345125022473                  \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.55310389603444452782                  \n\t"
+        "li.s       %[fTmp11],      1.16943993343288495515                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val4],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val11],       %[fTmp5],       %[fTmp6]                \n\t"
+        "li.s       %[fTmp1],       2.56291544774150617881                  \n\t"
+        "madd.s     %[val20],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val27],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val11],       %[val11],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp2],       %[val3],        %[val4]                 \n\t"
+        "add.s      %[val3],        %[val3],        %[val4]                 \n\t"
+        "sub.s      %[fTmp4],       %[val19],       %[val20]                \n\t"
+        "mul.s      %[val27],       %[val27],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val12],       %[val11]                \n\t"
+        "mul.s      %[val4],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val11],       %[val12],       %[val11]                \n\t"
+        "add.s      %[val19],       %[val19],       %[val20]                \n\t"
+        "mul.s      %[val20],       %[fTmp1],       %[fTmp4]                \n\t"
+        "mul.s      %[val12],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val28],       %[val27]                \n\t"
+        "add.s      %[val27],       %[val28],       %[val27]                \n\t"
+        "mul.s      %[val28],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val3]  "=f" (val3),  [val4]  "=f" (val4),
+          [val11] "=f" (val11), [val12] "=f" (val12),
+          [val19] "=f" (val19), [val20] "=f" (val20),
+          [val27] "=f" (val27), [val28] "=f" (val28)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       0.54119610014619698439                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val3]                 \n\t"
+        "add.s      %[val0],        %[val0],        %[val3]                 \n\t"
+        "sub.s      %[fTmp3],       %[val7],        %[val4]                 \n\t"
+        "add.s      %[val4],        %[val7],        %[val4]                 \n\t"
+        "sub.s      %[fTmp4],       %[val8],        %[val11]                \n\t"
+        "mul.s      %[val3],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val8],        %[val8],        %[val11]                \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val12]                \n\t"
+        "mul.s      %[val11],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val12],       %[val15],       %[val12]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [val0]  "+f" (val0),   [val3] "+f" (val3),
+          [val4]  "+f" (val4),   [val7] "+f" (val7),
+          [val8]  "+f" (val8),   [val11] "+f" (val11),
+          [val12] "+f" (val12),  [val15] "+f" (val15),
+          [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4)
+        :
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val16],       %[val19]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val19]                \n\t"
+        "sub.s      %[fTmp3],       %[val23],       %[val20]                \n\t"
+        "add.s      %[val20],       %[val23],       %[val20]                \n\t"
+        "sub.s      %[fTmp4],       %[val24],       %[val27]                \n\t"
+        "mul.s      %[val19],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val27]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val31],       %[val28]                \n\t"
+        "mul.s      %[val27],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val28],       %[val31],       %[val28]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20),
+          [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27),
+          [val28] "+f" (val28), [val31] "+f" (val31)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       1*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       30*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       14*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       17*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.52249861493968888062                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.50547095989754365998                  \n\t"
+        "li.s       %[fTmp11],      3.40760841846871878570                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val1],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val14],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       6*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       25*4(%[tab])                            \n\t"
+        "madd.s     %[val17],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val30],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val14],       %[val14],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       9*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp4],       22*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val30],       %[val30],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       1.72244709823833392782                  \n\t"
+        "li.s       %[fTmp10],      0.62250412303566481615                  \n\t"
+        "li.s       %[fTmp11],      0.83934964541552703873                  \n\t"
+        "add.s      %[val6],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val9],        %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.60134488693504528054                  \n\t"
+        "sub.s      %[fTmp2],       %[val1],        %[val6]                 \n\t"
+        "add.s      %[val1],        %[val1],        %[val6]                 \n\t"
+        "mul.s      %[val9],        %[val9],        %[fTmp7]                \n\t"
+        "madd.s     %[val22],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val25],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val6],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val14],       %[val9]                 \n\t"
+        "add.s      %[val9],        %[val14],       %[val9]                 \n\t"
+        "mul.s      %[val25],       %[val25],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val17],       %[val22]                \n\t"
+        "add.s      %[val17],       %[val17],       %[val22]                \n\t"
+        "mul.s      %[val14],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val30],       %[val25]                \n\t"
+        "mul.s      %[val22],       %[fTmp1],       %[fTmp3]                \n\t"
+        "add.s      %[val25],       %[val30],       %[val25]                \n\t"
+        "mul.s      %[val30],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val1]  "=f" (val1),  [val6]  "=f" (val6),
+          [val9]  "=f" (val9),  [val14] "=f" (val14),
+          [val17] "=f" (val17), [val22] "=f" (val22),
+          [val25] "=f" (val25), [val30] "=f" (val30)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       2*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       29*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       13*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       18*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.56694403481635770368                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.51544730992262454697                  \n\t"
+        "li.s       %[fTmp11],      2.05778100995341155085                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val2],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val13],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       5*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       26*4(%[tab])                            \n\t"
+        "madd.s     %[val18],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val29],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val13],       %[val13],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       10*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       21*4(%[tab])                            \n\t"
+        "mul.s      %[val29],       %[val29],       %[fTmp7]                \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       1.06067768599034747134                  \n\t"
+        "li.s       %[fTmp10],      0.58293496820613387367                  \n\t"
+        "li.s       %[fTmp11],      0.97256823786196069369                  \n\t"
+        "add.s      %[val5],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val10],       %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.89997622313641570463                  \n\t"
+        "sub.s      %[fTmp2],       %[val2],        %[val5]                 \n\t"
+        "mul.s      %[val10],       %[val10],       %[fTmp7]                \n\t"
+        "madd.s     %[val21],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val26],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "add.s      %[val2],        %[val2],        %[val5]                 \n\t"
+        "mul.s      %[val5],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp3],       %[val13],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val13],       %[val10]                \n\t"
+        "mul.s      %[val26],       %[val26],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp4],       %[val18],       %[val21]                \n\t"
+        "add.s      %[val18],       %[val18],       %[val21]                \n\t"
+        "mul.s      %[val13],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val29],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val29],       %[val26]                \n\t"
+        "mul.s      %[val21],       %[fTmp1],       %[fTmp4]                \n\t"
+        "mul.s      %[val29],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val2]  "=f" (val2),  [val5]  "=f" (val5),
+          [val10] "=f" (val10), [val13] "=f" (val13),
+          [val18] "=f" (val18), [val21] "=f" (val21),
+          [val26] "=f" (val26), [val29] "=f" (val29)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       1.30656296487637652785                  \n\t"
+        "sub.s      %[fTmp2],       %[val1],        %[val2]                 \n\t"
+        "add.s      %[val1],        %[val1],        %[val2]                 \n\t"
+        "sub.s      %[fTmp3],       %[val6],        %[val5]                 \n\t"
+        "add.s      %[val5],        %[val6],        %[val5]                 \n\t"
+        "sub.s      %[fTmp4],       %[val9],        %[val10]                \n\t"
+        "mul.s      %[val2],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val9],        %[val9],        %[val10]                \n\t"
+        "mul.s      %[val6],        %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val14],       %[val13]                \n\t"
+        "mul.s      %[val10],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val13],       %[val14],       %[val13]                \n\t"
+        "mul.s      %[val14],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val1]  "+f" (val1),  [val2]  "+f" (val2),
+          [val5]  "+f" (val5),  [val6]  "+f" (val6),
+          [val9]  "+f" (val9),  [val10] "+f" (val10),
+          [val13] "+f" (val13), [val14] "+f" (val14)
+        :
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val17],       %[val18]                \n\t"
+        "add.s      %[val17],       %[val17],       %[val18]                \n\t"
+        "sub.s      %[fTmp3],       %[val22],       %[val21]                \n\t"
+        "add.s      %[val21],       %[val22],       %[val21]                \n\t"
+        "sub.s      %[fTmp4],       %[val25],       %[val26]                \n\t"
+        "mul.s      %[val18],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val25],       %[val25],       %[val26]                \n\t"
+        "mul.s      %[val22],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val30],       %[val29]                \n\t"
+        "mul.s      %[val26],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val29],       %[val30],       %[val29]                \n\t"
+        "mul.s      %[val30],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21),
+          [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26),
+          [val29] "+f" (val29), [val30] "+f" (val30)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       0.70710678118654752439                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val1]                 \n\t"
+        "add.s      %[val0],        %[val0],        %[val1]                 \n\t"
+        "sub.s      %[fTmp3],       %[val3],        %[val2]                 \n\t"
+        "add.s      %[val2],        %[val3],        %[val2]                 \n\t"
+        "sub.s      %[fTmp4],       %[val4],        %[val5]                 \n\t"
+        "mul.s      %[val1],        %[fTmp1],       %[fTmp2]                \n\t"
+        "swc1       %[val0],        0(%[out])                               \n\t"
+        "mul.s      %[val3],        %[fTmp3],       %[fTmp1]                \n\t"
+        "add.s      %[val4],        %[val4],        %[val5]                 \n\t"
+        "mul.s      %[val5],        %[fTmp1],       %[fTmp4]                \n\t"
+        "swc1       %[val1],        16*4(%[out])                            \n\t"
+        "sub.s      %[fTmp2],       %[val7],        %[val6]                 \n\t"
+        "add.s      %[val2],        %[val2],        %[val3]                 \n\t"
+        "swc1       %[val3],        24*4(%[out])                            \n\t"
+        "add.s      %[val6],        %[val7],        %[val6]                 \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp2]                \n\t"
+        "swc1       %[val2],        8*4(%[out])                             \n\t"
+        "add.s      %[val6],        %[val6],        %[val7]                 \n\t"
+        "swc1       %[val7],        28*4(%[out])                            \n\t"
+        "add.s      %[val4],        %[val4],        %[val6]                 \n\t"
+        "add.s      %[val6],        %[val6],        %[val5]                 \n\t"
+        "add.s      %[val5],        %[val5],        %[val7]                 \n\t"
+        "swc1       %[val4],        4*4(%[out])                             \n\t"
+        "swc1       %[val5],        20*4(%[out])                            \n\t"
+        "swc1       %[val6],        12*4(%[out])                            \n\t"
+
+        : [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val0] "+f" (val0), [val1] "+f" (val1),
+          [val2] "+f" (val2), [val3] "+f" (val3),
+          [val4] "+f" (val4), [val5] "+f" (val5),
+          [val6] "+f" (val6), [val7] "+f" (val7)
+        : [out] "r" (out)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val8],        %[val9]                 \n\t"
+        "add.s      %[val8],        %[val8],        %[val9]                 \n\t"
+        "sub.s      %[fTmp3],       %[val11],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val11],       %[val10]                \n\t"
+        "sub.s      %[fTmp4],       %[val12],       %[val13]                \n\t"
+        "mul.s      %[val9],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val13]                \n\t"
+        "mul.s      %[val11],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val14]                \n\t"
+        "mul.s      %[val13],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val14],       %[val15],       %[val14]                \n\t"
+        "add.s      %[val10],       %[val10],       %[val11]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val15]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val14]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val13]                \n\t"
+        "add.s      %[val13],       %[val13],       %[val15]                \n\t"
+        "add.s      %[val8],        %[val8],        %[val12]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val10],       %[val14]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val9]                 \n\t"
+        "add.s      %[val9],        %[val9],        %[val13]                \n\t"
+        "add.s      %[val13],       %[val13],       %[val11]                \n\t"
+        "add.s      %[val11],       %[val11],       %[val15]                \n\t"
+        "swc1       %[val8],         2*4(%[out])                            \n\t"
+        "swc1       %[val9],        18*4(%[out])                            \n\t"
+        "swc1       %[val10],       10*4(%[out])                            \n\t"
+        "swc1       %[val11],       26*4(%[out])                            \n\t"
+        "swc1       %[val12],        6*4(%[out])                            \n\t"
+        "swc1       %[val13],       22*4(%[out])                            \n\t"
+        "swc1       %[val14],       14*4(%[out])                            \n\t"
+        "swc1       %[val15],       30*4(%[out])                            \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val8]  "+f" (val8),  [val9]  "+f" (val9),  [val10] "+f" (val10),
+          [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13),
+          [val14] "+f" (val14), [val15] "+f" (val15)
+        : [fTmp1] "f" (fTmp1), [out] "r" (out)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val16],       %[val17]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val17]                \n\t"
+        "sub.s      %[fTmp3],       %[val19],       %[val18]                \n\t"
+        "add.s      %[val18],       %[val19],       %[val18]                \n\t"
+        "sub.s      %[fTmp4],       %[val20],       %[val21]                \n\t"
+        "mul.s      %[val17],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val20],       %[val20],       %[val21]                \n\t"
+        "mul.s      %[val19],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val23],       %[val22]                \n\t"
+        "mul.s      %[val21],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val22],       %[val23],       %[val22]                \n\t"
+        "add.s      %[val18],       %[val18],       %[val19]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val22],       %[val22],       %[val23]                \n\t"
+        "add.s      %[val20],       %[val20],       %[val22]                \n\t"
+        "add.s      %[val22],       %[val22],       %[val21]                \n\t"
+        "add.s      %[val21],       %[val21],       %[val23]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18),
+          [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21),
+          [val22] "+f" (val22), [val23] "+f" (val23)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val24],       %[val25]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val25]                \n\t"
+        "sub.s      %[fTmp3],       %[val27],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val27],       %[val26]                \n\t"
+        "sub.s      %[fTmp4],       %[val28],       %[val29]                \n\t"
+        "mul.s      %[val25],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val29]                \n\t"
+        "mul.s      %[val27],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val31],       %[val30]                \n\t"
+        "mul.s      %[val29],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val30],       %[val31],       %[val30]                \n\t"
+        "add.s      %[val26],       %[val26],       %[val27]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val31]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val30]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val29]                \n\t"
+        "add.s      %[val29],       %[val29],       %[val31]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val28]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val26],       %[val30]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val25]                \n\t"
+        "add.s      %[val25],       %[val25],       %[val29]                \n\t"
+        "add.s      %[val29],       %[val29],       %[val27]                \n\t"
+        "add.s      %[val27],       %[val27],       %[val31]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26),
+          [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29),
+          [val30] "+f" (val30), [val31] "+f" (val31)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    out[ 1] = val16 + val24;
+    out[17] = val17 + val25;
+    out[ 9] = val18 + val26;
+    out[25] = val19 + val27;
+    out[ 5] = val20 + val28;
+    out[21] = val21 + val29;
+    out[13] = val22 + val30;
+    out[29] = val23 + val31;
+    out[ 3] = val24 + val20;
+    out[19] = val25 + val21;
+    out[11] = val26 + val22;
+    out[27] = val27 + val23;
+    out[ 7] = val28 + val18;
+    out[23] = val29 + val19;
+    out[15] = val30 + val17;
+    out[31] = val31;
+}
+
+static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
+{
+    float t0, t1, t2, t3, s0, s1, s2, s3;
+    float tmp[18];
+    /* temporary variables */
+    float in1, in2, in3, in4, in5, in6;
+    float out1, out2, out3, out4, out5;
+    float c1, c2, c3, c4, c5, c6, c7, c8, c9;
+
+    /**
+    * all loops are unrolled totally, and instructions are scheduled to
+    * minimize pipeline stall. instructions of the first two loops are
+    * reorganized, in order to eliminate unnecessary readings and
+    * writings into array. values defined in macros and tables are
+    * eliminated - they are directly loaded in appropriate variables
+    */
+
+    /* loop 1 and 2 */
+    __asm__ volatile (
+        "lwc1   %[in1],  17*4(%[in])                                    \t\n"
+        "lwc1   %[in2],  16*4(%[in])                                    \t\n"
+        "lwc1   %[in3],  15*4(%[in])                                    \t\n"
+        "lwc1   %[in4],  14*4(%[in])                                    \t\n"
+        "lwc1   %[in5],  13*4(%[in])                                    \t\n"
+        "lwc1   %[in6],  12*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[in1],  %[in2]                                \t\n"
+        "add.s  %[out2], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out3], %[in3],  %[in4]                                \t\n"
+        "add.s  %[out4], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out5], %[in5],  %[in6]                                \t\n"
+        "lwc1   %[in1],  11*4(%[in])                                    \t\n"
+        "swc1   %[out2], 16*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[out1], %[out3]                               \t\n"
+        "swc1   %[out4], 14*4(%[in])                                    \t\n"
+        "add.s  %[out3], %[out3], %[out5]                               \t\n"
+        "lwc1   %[in2],  10*4(%[in])                                    \t\n"
+        "lwc1   %[in3],  9*4(%[in])                                     \t\n"
+        "swc1   %[out1], 17*4(%[in])                                    \t\n"
+        "lwc1   %[in4],  8*4(%[in])                                     \t\n"
+        "swc1   %[out3], 15*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[in6],  %[in1]                                \t\n"
+        "add.s  %[out2], %[in1],  %[in2]                                \t\n"
+        "add.s  %[out3], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out4], %[in3],  %[in4]                                \t\n"
+        "lwc1   %[in5],  7*4(%[in])                                     \t\n"
+        "swc1   %[out1], 12*4(%[in])                                    \t\n"
+        "add.s  %[out5], %[out5], %[out2]                               \t\n"
+        "swc1   %[out3], 10*4(%[in])                                    \t\n"
+        "add.s  %[out2], %[out2], %[out4]                               \t\n"
+        "lwc1   %[in6],  6*4(%[in])                                     \t\n"
+        "lwc1   %[in1],  5*4(%[in])                                     \t\n"
+        "swc1   %[out5], 13*4(%[in])                                    \t\n"
+        "lwc1   %[in2],  4*4(%[in])                                     \t\n"
+        "swc1   %[out2], 11*4(%[in])                                    \t\n"
+        "add.s  %[out5], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out1], %[in5],  %[in6]                                \t\n"
+        "add.s  %[out2], %[in6],  %[in1]                                \t\n"
+        "add.s  %[out3], %[in1],  %[in2]                                \t\n"
+        "lwc1   %[in3],  3*4(%[in])                                     \t\n"
+        "swc1   %[out5], 8*4(%[in])                                     \t\n"
+        "add.s  %[out4], %[out4], %[out1]                               \t\n"
+        "swc1   %[out2], 6*4(%[in])                                     \t\n"
+        "add.s  %[out1], %[out1], %[out3]                               \t\n"
+        "lwc1   %[in4],  2*4(%[in])                                     \t\n"
+        "lwc1   %[in5],  1*4(%[in])                                     \t\n"
+        "swc1   %[out4], 9*4(%[in])                                     \t\n"
+        "lwc1   %[in6],  0(%[in])                                       \t\n"
+        "swc1   %[out1], 7*4(%[in])                                     \t\n"
+        "add.s  %[out4], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out5], %[in3],  %[in4]                                \t\n"
+        "add.s  %[out1], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out2], %[in5],  %[in6]                                \t\n"
+        "swc1   %[out4], 4*4(%[in])                                     \t\n"
+        "add.s  %[out3], %[out3], %[out5]                               \t\n"
+        "swc1   %[out1], 2*4(%[in])                                     \t\n"
+        "add.s  %[out5], %[out5], %[out2]                               \t\n"
+        "swc1   %[out2], 1*4(%[in])                                     \t\n"
+        "swc1   %[out3], 5*4(%[in])                                     \t\n"
+        "swc1   %[out5], 3*4(%[in])                                     \t\n"
+
+        : [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4),
+          [out5] "=&f" (out5)
+        : [in] "r" (in)
+        : "memory"
+    );
+
+    /* loop 3 */
+    __asm__ volatile (
+        "li.s    %[c1],   0.5                                           \t\n"
+        "lwc1    %[in1],  8*4(%[in])                                    \t\n"
+        "lwc1    %[in2],  16*4(%[in])                                   \t\n"
+        "lwc1    %[in3],  4*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  0(%[in])                                      \t\n"
+        "lwc1    %[in5],  12*4(%[in])                                   \t\n"
+        "li.s    %[c2],   0.93969262078590838405                        \t\n"
+        "add.s   %[t2],   %[in1],  %[in2]                               \t\n"
+        "add.s   %[t0],   %[in1],  %[in3]                               \t\n"
+        "li.s    %[c3],   -0.76604444311897803520                       \t\n"
+        "madd.s  %[t3],   %[in4],  %[in5], %[c1]                        \t\n"
+        "sub.s   %[t1],   %[in4],  %[in5]                               \t\n"
+        "sub.s   %[t2],   %[t2],   %[in3]                               \t\n"
+        "mul.s   %[t0],   %[t0],   %[c2]                                \t\n"
+        "li.s    %[c4],   -0.17364817766693034885                       \t\n"
+        "li.s    %[c5],   -0.86602540378443864676                       \t\n"
+        "li.s    %[c6],   0.98480775301220805936                        \t\n"
+        "nmsub.s %[out1], %[t1],   %[t2],  %[c1]                        \t\n"
+        "add.s   %[out2], %[t1],   %[t2]                                \t\n"
+        "add.s   %[t2],   %[in2],  %[in3]                               \t\n"
+        "sub.s   %[t1],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[t3],   %[t0]                                \t\n"
+        "swc1    %[out1], 6*4(%[tmp])                                   \t\n"
+        "swc1    %[out2], 16*4(%[tmp])                                  \t\n"
+        "mul.s   %[t2],   %[t2],   %[c3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c4]                                \t\n"
+        "add.s   %[out1], %[t3],   %[t0]                                \t\n"
+        "lwc1    %[in1],  10*4(%[in])                                   \t\n"
+        "lwc1    %[in2],  14*4(%[in])                                   \t\n"
+        "sub.s   %[out3], %[out3], %[t2]                                \t\n"
+        "add.s   %[out2], %[t3],   %[t2]                                \t\n"
+        "add.s   %[out1], %[out1], %[t1]                                \t\n"
+        "lwc1    %[in3],  2*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  6*4(%[in])                                    \t\n"
+        "swc1    %[out3], 10*4(%[tmp])                                  \t\n"
+        "sub.s   %[out2], %[out2], %[t1]                                \t\n"
+        "swc1    %[out1], 2*4(%[tmp])                                   \t\n"
+        "add.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "add.s   %[t2],   %[in1],  %[in3]                               \t\n"
+        "sub.s   %[t3],   %[in1],  %[in2]                               \t\n"
+        "swc1    %[out2], 14*4(%[tmp])                                  \t\n"
+        "li.s    %[c7],   -0.34202014332566873304                       \t\n"
+        "sub.s   %[out1], %[out1], %[in3]                               \t\n"
+        "mul.s   %[t2],   %[t2],   %[c6]                                \t\n"
+        "mul.s   %[t3],   %[t3],   %[c7]                                \t\n"
+        "li.s    %[c8],   0.86602540378443864676                        \t\n"
+        "mul.s   %[t0],   %[in4],  %[c8]                                \t\n"
+        "mul.s   %[out1], %[out1], %[c5]                                \t\n"
+        "add.s   %[t1],   %[in2],  %[in3]                               \t\n"
+        "li.s    %[c9],   -0.64278760968653932632                       \t\n"
+        "add.s   %[out2], %[t2],   %[t3]                                \t\n"
+        "lwc1    %[in1],  9*4(%[in])                                    \t\n"
+        "swc1    %[out1], 4*4(%[tmp])                                   \t\n"
+        "mul.s   %[t1],   %[t1],   %[c9]                                \t\n"
+        "lwc1    %[in2],  17*4(%[in])                                   \t\n"
+        "add.s   %[out2], %[out2], %[t0]                                \t\n"
+        "lwc1    %[in3],  5*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  1*4(%[in])                                    \t\n"
+        "add.s   %[out3], %[t2],   %[t1]                                \t\n"
+        "sub.s   %[out1], %[t3],   %[t1]                                \t\n"
+        "swc1    %[out2], 0(%[tmp])                                     \t\n"
+        "lwc1    %[in5],  13*4(%[in])                                   \t\n"
+        "add.s   %[t2],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[out3], %[t0]                                \t\n"
+        "sub.s   %[out1], %[out1], %[t0]                                \t\n"
+        "add.s   %[t0],   %[in1],  %[in3]                               \t\n"
+        "madd.s  %[t3],   %[in4],  %[in5], %[c1]                        \t\n"
+        "sub.s   %[t2],   %[t2],   %[in3]                               \t\n"
+        "swc1    %[out3], 12*4(%[tmp])                                  \t\n"
+        "swc1    %[out1], 8*4(%[tmp])                                   \t\n"
+        "sub.s   %[t1],   %[in4],  %[in5]                               \t\n"
+        "mul.s   %[t0],   %[t0],   %[c2]                                \t\n"
+        "nmsub.s %[out1], %[t1],   %[t2],  %[c1]                        \t\n"
+        "add.s   %[out2], %[t1],   %[t2]                                \t\n"
+        "add.s   %[t2],   %[in2],  %[in3]                               \t\n"
+        "sub.s   %[t1],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[t3],   %[t0]                                \t\n"
+        "swc1    %[out1], 7*4(%[tmp])                                   \t\n"
+        "swc1    %[out2], 17*4(%[tmp])                                  \t\n"
+        "mul.s   %[t2],   %[t2],   %[c3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c4]                                \t\n"
+        "add.s   %[out1], %[t3],   %[t0]                                \t\n"
+        "lwc1    %[in1],  11*4(%[in])                                   \t\n"
+        "lwc1    %[in2],  15*4(%[in])                                   \t\n"
+        "sub.s   %[out3], %[out3], %[t2]                                \t\n"
+        "add.s   %[out2], %[t3],   %[t2]                                \t\n"
+        "add.s   %[out1], %[out1], %[t1]                                \t\n"
+        "lwc1    %[in3],  3*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  7*4(%[in])                                    \t\n"
+        "swc1    %[out3], 11*4(%[tmp])                                  \t\n"
+        "sub.s   %[out2], %[out2], %[t1]                                \t\n"
+        "swc1    %[out1], 3*4(%[tmp])                                   \t\n"
+        "add.s   %[out3], %[in1],  %[in2]                               \t\n"
+        "add.s   %[t2],   %[in1],  %[in3]                               \t\n"
+        "sub.s   %[t3],   %[in1],  %[in2]                               \t\n"
+        "swc1    %[out2], 15*4(%[tmp])                                  \t\n"
+        "mul.s   %[t0],   %[in4],  %[c8]                                \t\n"
+        "sub.s   %[out3], %[out3], %[in3]                               \t\n"
+        "mul.s   %[t2],   %[t2],   %[c6]                                \t\n"
+        "mul.s   %[t3],   %[t3],   %[c7]                                \t\n"
+        "add.s   %[t1],   %[in2],  %[in3]                               \t\n"
+        "mul.s   %[out3], %[out3], %[c5]                                \t\n"
+        "add.s   %[out1], %[t2],   %[t3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c9]                                \t\n"
+        "swc1    %[out3], 5*4(%[tmp])                                   \t\n"
+        "add.s   %[out1], %[out1], %[t0]                                \t\n"
+        "add.s   %[out2], %[t2],   %[t1]                                \t\n"
+        "sub.s   %[out3], %[t3],   %[t1]                                \t\n"
+        "swc1    %[out1], 1*4(%[tmp])                                   \t\n"
+        "sub.s   %[out2], %[out2], %[t0]                                \t\n"
+        "sub.s   %[out3], %[out3], %[t0]                                \t\n"
+        "swc1    %[out2], 13*4(%[tmp])                                  \t\n"
+        "swc1    %[out3], 9*4(%[tmp])                                   \t\n"
+
+        : [t0] "=&f" (t0), [t1] "=&f" (t1),
+          [t2] "=&f" (t2), [t3] "=&f" (t3),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3),
+          [c1] "=&f" (c1), [c2] "=&f" (c2),
+          [c3] "=&f" (c3), [c4] "=&f" (c4),
+          [c5] "=&f" (c5), [c6] "=&f" (c6),
+          [c7] "=&f" (c7), [c8] "=&f" (c8),
+          [c9] "=&f" (c9)
+        : [in] "r" (in), [tmp] "r" (tmp)
+        : "memory"
+    );
+
+    /* loop 4 */
+    __asm__ volatile (
+        "lwc1   %[in1],  2*4(%[tmp])                                    \t\n"
+        "lwc1   %[in2],  0(%[tmp])                                      \t\n"
+        "lwc1   %[in3],  3*4(%[tmp])                                    \t\n"
+        "lwc1   %[in4],  1*4(%[tmp])                                    \t\n"
+        "li.s   %[c1],   0.50190991877167369479                         \t\n"
+        "li.s   %[c2],   5.73685662283492756461                         \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  9*4(%[win])                                    \t\n"
+        "lwc1   %[in2],  4*9*4(%[buf])                                  \t\n"
+        "lwc1   %[in3],  8*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*8*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  29*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  28*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "li.s   %[c1],   0.51763809020504152469                         \t\n"
+        "li.s   %[c2],   1.93185165257813657349                         \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out3], 4*9*4(%[buf])                                  \t\n"
+        "swc1   %[out1], 288*4(%[out])                                  \t\n"
+        "swc1   %[out2], 256*4(%[out])                                  \t\n"
+        "swc1   %[out4], 4*8*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  17*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*17*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  0(%[win])                                      \t\n"
+        "lwc1   %[in4],  0(%[buf])                                      \t\n"
+        "lwc1   %[in5],  37*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  20*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  6*4(%[tmp])                                    \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 544*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  4*4(%[tmp])                                    \t\n"
+        "swc1   %[out2], 0(%[out])                                      \t\n"
+        "swc1   %[out3], 4*17*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 0(%[buf])                                      \t\n"
+        "lwc1   %[in3],  7*4(%[tmp])                                    \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in4],  5*4(%[tmp])                                    \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  10*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*10*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  7*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[in4],  4*7*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  30*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  27*4(%[win])                                   \t\n"
+        "li.s   %[c1],   0.55168895948124587824                         \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out1], 320*4(%[out])                                  \t\n"
+        "swc1   %[out2], 224*4(%[out])                                  \t\n"
+        "swc1   %[out3], 4*10*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*7*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  16*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*16*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  1*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*1*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  36*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  21*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  10*4(%[tmp])                                   \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "swc1   %[out1], 512*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  8*4(%[tmp])                                    \t\n"
+        "swc1   %[out2], 32*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*16*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*1*4(%[buf])                                  \t\n"
+        "li.s   %[c2],   1.18310079157624925896                         \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in3],  11*4(%[tmp])                                   \t\n"
+        "lwc1   %[in4],  9*4(%[tmp])                                    \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  11*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*11*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  6*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*6*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  31*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  26*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "swc1   %[out3], 4*11*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*6*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out1], 352*4(%[out])                                  \t\n"
+        "swc1   %[out2], 192*4(%[out])                                  \t\n"
+        "lwc1   %[in1],  15*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*15*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  2*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*2*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  35*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  22*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  14*4(%[tmp])                                   \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 480*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  12*4(%[tmp])                                   \t\n"
+        "swc1   %[out2], 64*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*15*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*2*4(%[buf])                                  \t\n"
+        "lwc1   %[in3],  15*4(%[tmp])                                   \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in4],  13*4(%[tmp])                                   \t\n"
+        "li.s   %[c1],   0.61038729438072803416                         \t\n"
+        "li.s   %[c2],   0.87172339781054900991                         \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  12*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*12*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  5*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*5*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  32*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  25*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[s0],   16*4(%[tmp])                                   \t\n"
+        "lwc1   %[s1],   17*4(%[tmp])                                   \t\n"
+        "li.s   %[c1],   0.70710678118654752439                         \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out3], 4*12*4(%[buf])                                 \t\n"
+        "swc1   %[out1], 384*4(%[out])                                  \t\n"
+        "swc1   %[out2], 160*4(%[out])                                  \t\n"
+        "swc1   %[out4], 4*5*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  14*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*14*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  3*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*3*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  34*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  23*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "swc1   %[out1], 448*4(%[out])                                  \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "swc1   %[out2], 96*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*14*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*3*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[in1],  13*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*13*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  4*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*4*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  33*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  24*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 416*4(%[out])                                  \t\n"
+        "swc1   %[out2], 128*4(%[out])                                  \t\n"
+        "swc1   %[out3], 4*13*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*4*4(%[buf])                                  \t\n"
+
+        : [c1] "=&f" (c1), [c2] "=&f" (c2),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4),
+          [t0] "=&f" (t0), [t1] "=&f" (t1),
+          [t2] "=&f" (t2), [t3] "=&f" (t3),
+          [s0] "=&f" (s0), [s1] "=&f" (s1),
+          [s2] "=&f" (s2), [s3] "=&f" (s3)
+        : [tmp] "r" (tmp), [win] "r" (win),
+          [buf] "r" (buf), [out] "r" (out)
+        : "memory"
+    );
+}
+
+static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in,
+                               int count, int switch_point, int block_type)
+{
+    int j;
+    for (j=0 ; j < count; j++) {
+        /* apply window & overlap with previous buffer */
+
+        /* select window */
+        int win_idx = (switch_point && j < 2) ? 0 : block_type;
+        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];
+
+        imdct36_mips_float(out, buf, in, win);
+
+        in  += 18;
+        buf += ((j&3) != 3 ? 1 : (72-3));
+        out++;
+    }
+}
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+
+void ff_mpadsp_init_mipsfpu(MPADSPContext *s)
+{
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->apply_window_float   = ff_mpadsp_apply_window_mips_float;
+    s->imdct36_blocks_float = ff_imdct36_blocks_mips_float;
+    s->dct32_float          = ff_dct32_mips_float;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/mpegvideo_init_mips.c b/libavcodec/mips/mpegvideo_init_mips.c
new file mode 100644
index 0000000..be77308
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_init_mips.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
+
+#if HAVE_MSA
+static av_cold void dct_unquantize_init_msa(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_msa;
+    s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_msa;
+    if (!s->q_scale_type)
+        s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void dct_unquantize_init_mmi(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_mmi;
+    s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_mmi;
+    s->dct_unquantize_mpeg1_intra = ff_dct_unquantize_mpeg1_intra_mmi;
+    s->dct_unquantize_mpeg1_inter = ff_dct_unquantize_mpeg1_inter_mmi;
+
+    if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT))
+        if (!s->q_scale_type)
+            s->dct_unquantize_mpeg2_intra = ff_dct_unquantize_mpeg2_intra_mmi;
+
+    s->denoise_dct= ff_denoise_dct_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_mpv_common_init_mips(MpegEncContext *s)
+{
+#if HAVE_MMI
+    dct_unquantize_init_mmi(s);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    dct_unquantize_init_msa(s);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
new file mode 100644
index 0000000..760d7b3
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_MPEGVIDEO_MIPS_H
+#define AVCODEC_MIPS_MPEGVIDEO_MIPS_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block);
+
+#endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
new file mode 100644
index 0000000..18058e4
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -0,0 +1,506 @@
+/*
+ * Loongson SIMD optimized mpegvideo
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mpegvideo_mips.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t level, qmul, qadd, nCoeffs;
+    double ftmp[6];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+
+    qmul = qscale << 1;
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+
+    if (!s->h263_aic) {
+        if (n<4)
+            level = block[0] * s->y_dc_scale;
+        else
+            level = block[0] * s->c_dc_scale;
+        qadd = (qscale-1) | 1;
+    } else {
+        qadd = 0;
+        level = block[0];
+    }
+
+    if(s->ac_pred)
+        nCoeffs = 63;
+    else
+        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t"
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t"
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t"
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[qadd]                 \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        ".p2align   4                                                   \n\t"
+
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[block],       %[nCoeffs]              \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDC1(%[ftmp2], %[addr0], 0x08)
+        "mov.d      %[ftmp3],   %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[qmul]                 \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[qmul]                 \n\t"
+        "pcmpgth    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "pcmpeqh    %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pcmpeqh    %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pandn      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pandn      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        PTR_ADDIU  "%[nCoeffs], %[nCoeffs],     0x10                    \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp2], %[addr0], 0x08)
+        "blez       %[nCoeffs], 1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
+          [qmul]"f"(qmul),                  [qadd]"f"(qadd)
+        : "memory"
+    );
+
+    block[0] = level;
+}
+
+void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t qmul, qadd, nCoeffs;
+    double ftmp[6];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+
+    qmul = qscale << 1;
+    qadd = (qscale - 1) | 1;
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    __asm__ volatile (
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t"
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t"
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[qadd]                 \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        ".p2align   4                                                   \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[block],       %[nCoeffs]              \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDC1(%[ftmp2], %[addr0], 0x08)
+        "mov.d      %[ftmp3],   %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[qmul]                 \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[qmul]                 \n\t"
+        "pcmpgth    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "pcmpeqh    %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pcmpeqh    %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pandn      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pandn      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        PTR_ADDIU  "%[nCoeffs], %[nCoeffs],     0x10                    \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp2], %[addr0], 0x08)
+        "blez       %[nCoeffs], 1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
+          [qmul]"f"(qmul),                  [qadd]"f"(qadd)
+        : "memory"
+    );
+}
+
+void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    int block0;
+    double ftmp[10];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    av_assert2(s->block_last_index[n]>=0);
+    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
+
+    if (n<4)
+        block0 = block[0] * s->y_dc_scale;
+    else
+        block0 = block[0] * s->c_dc_scale;
+
+    /* XXX: only mpeg1 */
+    quant_matrix = s->intra_matrix;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x0f                                    \n\t"
+        "pcmpeqh    %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "dmtc1      %[qscale],  %[ftmp1]                                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "or         %[addr0],   %[nCoeffs],     $0                      \n\t"
+        ".p2align   4                                                   \n\t"
+
+        "1:                                                             \n\t"
+        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
+        MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp3]                                \n\t"
+        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
+        MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "pcmpgth    %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "pcmpgth    %[ftmp9],   %[ftmp9],       %[ftmp3]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "dli        %[tmp0],    0x03                                    \n\t"
+        "pcmpeqh    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "or         %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "or         %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "pandn      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t"
+        "bltz       %[addr0],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)),
+          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
+          [qscale]"r"(qscale)
+        : "memory"
+    );
+
+    block[0] = block0;
+}
+
+void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    double ftmp[10];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    av_assert2(s->block_last_index[n] >= 0);
+    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
+    quant_matrix = s->inter_matrix;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x0f                                    \n\t"
+        "pcmpeqh    %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "dmtc1      %[qscale],  %[ftmp1]                                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "or         %[addr0],   %[nCoeffs],     $0                      \n\t"
+        ".p2align   4                                                   \n\t"
+
+        "1:                                                             \n\t"
+        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
+        MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp3]                                \n\t"
+        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
+        MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "pcmpgth    %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "pcmpgth    %[ftmp9],   %[ftmp9],       %[ftmp3]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "dli        %[tmp0],    0x04                                    \n\t"
+        "pcmpeqh    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "or         %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "or         %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "pandn      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t"
+        "bltz       %[addr0],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)),
+          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
+          [qscale]"r"(qscale)
+        : "memory"
+    );
+}
+
+void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    uint64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    int block0;
+    double ftmp[10];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    assert(s->block_last_index[n]>=0);
+
+    if (s->alternate_scan)
+        nCoeffs = 63;
+    else
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
+
+    if (n < 4)
+        block0 = block[0] * s->y_dc_scale;
+    else
+        block0 = block[0] * s->c_dc_scale;
+
+    quant_matrix = s->intra_matrix;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x0f                                    \n\t"
+        "pcmpeqh    %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "mtc1       %[qscale],  %[ftmp9]                                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "packsswh   %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "packsswh   %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "or         %[addr0],   %[nCoeffs],     $0                      \n\t"
+        ".p2align   4                                                   \n\t"
+
+        "1:                                                             \n\t"
+        MMI_LDXC1(%[ftmp1], %[addr0], %[block], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x08)
+        "mov.d      %[ftmp3],   %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        MMI_LDXC1(%[ftmp5], %[addr0], %[quant], 0x00)
+        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x08)
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "pcmpgth    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "pcmpgth    %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "dli        %[tmp0],    0x03                                    \n\t"
+        "pcmpeqh    %[ftmp6] ,  %[ftmp6],       %[ftmp4]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "pandn      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t"
+        MMI_SDXC1(%[ftmp5], %[addr0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x08)
+        "blez       %[addr0],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)),
+          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
+          [qscale]"r"(qscale)
+        : "memory"
+    );
+
+    block[0]= block0;
+}
+
+void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
+{
+    const int intra = s->mb_intra;
+    int *sum = s->dct_error_sum[intra];
+    uint16_t *offset = s->dct_offset[intra];
+    double ftmp[8];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+
+    s->dct_count[intra]++;
+
+    __asm__ volatile(
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x08)
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "pcmpgth    %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_LDC1(%[ftmp6], %[offset], 0x00)
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
+        "psubush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        MMI_LDC1(%[ftmp6], %[offset], 0x08)
+        "mov.d      %[ftmp7],   %[ftmp3]                                \n\t"
+        "psubush    %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp1], %[block], 0x00)
+        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        "mov.d      %[ftmp1],   %[ftmp5]                                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp7]                                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x00)
+        "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x08)
+        "paddw      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x10)
+        "paddw      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x18)
+        "paddw      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp5], %[sum], 0x00)
+        MMI_SDC1(%[ftmp1], %[sum], 0x08)
+        MMI_SDC1(%[ftmp7], %[sum], 0x10)
+        MMI_SDC1(%[ftmp3], %[sum], 0x18)
+        PTR_ADDIU  "%[block],   %[block],       0x10                    \n\t"
+        PTR_ADDIU  "%[sum],     %[sum],         0x20                    \n\t"
+        PTR_SUBU   "%[addr0],   %[block1],      %[block]                \n\t"
+        PTR_ADDIU  "%[offset],  %[offset],      0x10                    \n\t"
+        "bgtz       %[addr0],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),
+          [block]"+&r"(block),              [sum]"+&r"(sum),
+          [offset]"+&r"(offset)
+        : [block1]"r"(block+64)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c
new file mode 100644
index 0000000..aa9ef77
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_msa.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h263dsp_mips.h"
+
+static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
+                                    int16_t qadd, int8_t n_coeffs,
+                                    uint8_t loop_start)
+{
+    int16_t *block_dup = block;
+    int32_t level, cnt;
+    v8i16 block_vec, qmul_vec, qadd_vec, sub;
+    v8i16 add, mask, mul, zero_mask;
+
+    qmul_vec = __msa_fill_h(qmul);
+    qadd_vec = __msa_fill_h(qadd);
+    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
+        block_vec = LD_SH(block_dup + loop_start);
+        mask = __msa_clti_s_h(block_vec, 0);
+        zero_mask = __msa_ceqi_h(block_vec, 0);
+        mul = block_vec * qmul_vec;
+        sub = mul - qadd_vec;
+        add = mul + qadd_vec;
+        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
+                                         (v16u8) zero_mask);
+        ST_SH(block_vec, block_dup + loop_start);
+        block_dup += 8;
+    }
+
+    cnt = ((n_coeffs >> 3) * 8) + loop_start;
+
+    for (; cnt <= n_coeffs; cnt++) {
+        level = block[cnt];
+        if (level) {
+            if (level < 0) {
+                level = level * qmul - qadd;
+            } else {
+                level = level * qmul + qadd;
+            }
+            block[cnt] = level;
+        }
+    }
+}
+
+static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
+                                              int32_t qscale,
+                                              const int16_t *quant_matrix)
+{
+    int32_t cnt, sum_res = -1;
+    v8i16 block_vec, block_neg, qscale_vec, mask;
+    v8i16 block_org0, block_org1, block_org2, block_org3;
+    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
+    v8i16 sum, mul, zero_mask;
+    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
+    v4i32 block_l, block_r, sad;
+
+    qscale_vec = __msa_fill_h(qscale);
+    for (cnt = 0; cnt < 2; cnt++) {
+        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
+        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
+        mask = __msa_clti_s_h(block_org0, 0);
+        zero_mask = __msa_ceqi_h(block_org0, 0);
+        block_neg = -block_org0;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org1, 0);
+        zero_mask = __msa_ceqi_h(block_org1, 0);
+        block_neg = - block_org1;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org2, 0);
+        zero_mask = __msa_ceqi_h(block_org2, 0);
+        block_neg = - block_org2;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org3, 0);
+        zero_mask = __msa_ceqi_h(block_org3, 0);
+        block_neg = - block_org3;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+    }
+
+    return sum_res;
+}
+
+void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
+                                      int16_t *block, int32_t index,
+                                      int32_t qscale)
+{
+    int32_t qmul, qadd;
+    int32_t nCoeffs;
+
+    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);
+
+    qmul = qscale << 1;
+
+    if (!s->h263_aic) {
+        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    } else {
+        qadd = 0;
+    }
+    if (s->ac_pred)
+        nCoeffs = 63;
+    else
+        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];
+
+    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
+}
+
+void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
+                                      int16_t *block, int32_t index,
+                                      int32_t qscale)
+{
+    int32_t qmul, qadd;
+    int32_t nCoeffs;
+
+    av_assert2(s->block_last_index[index] >= 0);
+
+    qadd = (qscale - 1) | 1;
+    qmul = qscale << 1;
+
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];
+
+    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
+}
+
+void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
+                                       int16_t *block, int32_t index,
+                                       int32_t qscale)
+{
+    const uint16_t *quant_matrix;
+    int32_t sum = -1;
+
+    quant_matrix = s->inter_matrix;
+
+    sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);
+
+    block[63] ^= sum & 1;
+}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
new file mode 100644
index 0000000..9bfe94e
--- /dev/null
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h263dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void mpegvideoencdsp_init_msa(MpegvideoEncDSPContext *c,
+                                             AVCodecContext *avctx)
+{
+#if BIT_DEPTH == 8
+    c->pix_sum = ff_pix_sum_msa;
+#endif
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
+                                          AVCodecContext *avctx)
+{
+#if HAVE_MSA
+    mpegvideoencdsp_init_msa(c, avctx);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/mpegvideoencdsp_msa.c b/libavcodec/mips/mpegvideoencdsp_msa.c
new file mode 100644
index 0000000..46473da
--- /dev/null
+++ b/libavcodec/mips/mpegvideoencdsp_msa.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+#include "libavutil/mips/generic_macros_msa.h"
+
+static int32_t sum_u8src_16width_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t sum = 0;
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 in8, in9, in10, in11, in12, in13, in14, in15;
+
+    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+    src += (8 * stride);
+    LD_UB8(src, stride, in8, in9, in10, in11, in12, in13, in14, in15);
+
+    HADD_UB4_UB(in0, in1, in2, in3, in0, in1, in2, in3);
+    HADD_UB4_UB(in4, in5, in6, in7, in4, in5, in6, in7);
+    HADD_UB4_UB(in8, in9, in10, in11, in8, in9, in10, in11);
+    HADD_UB4_UB(in12, in13, in14, in15, in12, in13, in14, in15);
+
+    sum = HADD_UH_U32(in0);
+    sum += HADD_UH_U32(in1);
+    sum += HADD_UH_U32(in2);
+    sum += HADD_UH_U32(in3);
+    sum += HADD_UH_U32(in4);
+    sum += HADD_UH_U32(in5);
+    sum += HADD_UH_U32(in6);
+    sum += HADD_UH_U32(in7);
+    sum += HADD_UH_U32(in8);
+    sum += HADD_UH_U32(in9);
+    sum += HADD_UH_U32(in10);
+    sum += HADD_UH_U32(in11);
+    sum += HADD_UH_U32(in12);
+    sum += HADD_UH_U32(in13);
+    sum += HADD_UH_U32(in14);
+    sum += HADD_UH_U32(in15);
+
+    return sum;
+}
+
+int ff_pix_sum_msa(uint8_t *pix, int line_size)
+{
+    return sum_u8src_16width_msa(pix, line_size);
+}
diff --git a/libavcodec/mips/pixblockdsp_init_mips.c b/libavcodec/mips/pixblockdsp_init_mips.c
new file mode 100644
index 0000000..fd0238d
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_init_mips.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void pixblockdsp_init_msa(PixblockDSPContext *c,
+                                         AVCodecContext *avctx,
+                                         unsigned high_bit_depth)
+{
+    c->diff_pixels = ff_diff_pixels_msa;
+
+    switch (avctx->bits_per_raw_sample) {
+    case 9:
+    case 10:
+    case 12:
+    case 14:
+        c->get_pixels = ff_get_pixels_16_msa;
+        break;
+    default:
+        if (avctx->bits_per_raw_sample <= 8 || avctx->codec_type !=
+            AVMEDIA_TYPE_VIDEO) {
+            c->get_pixels = ff_get_pixels_8_msa;
+        }
+        break;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void pixblockdsp_init_mmi(PixblockDSPContext *c,
+        AVCodecContext *avctx, unsigned high_bit_depth)
+{
+    c->diff_pixels = ff_diff_pixels_mmi;
+
+    if (!high_bit_depth || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
+        c->get_pixels = ff_get_pixels_8_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth)
+{
+#if HAVE_MMI
+    pixblockdsp_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    pixblockdsp_init_msa(c, avctx, high_bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/pixblockdsp_mips.h b/libavcodec/mips/pixblockdsp_mips.h
new file mode 100644
index 0000000..a12b1a6
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_mips.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, ptrdiff_t stride);
+void ff_get_pixels_16_msa(int16_t *restrict dst, const uint8_t *src,
+                          ptrdiff_t stride);
+void ff_get_pixels_8_msa(int16_t *restrict dst, const uint8_t *src,
+                         ptrdiff_t stride);
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
+                         ptrdiff_t stride);
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
diff --git a/libavcodec/mips/pixblockdsp_mmi.c b/libavcodec/mips/pixblockdsp_mmi.c
new file mode 100644
index 0000000..a915a3c
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_mmi.c
@@ -0,0 +1,135 @@
+/*
+ * Loongson SIMD optimized pixblockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
+                         ptrdiff_t stride)
+{
+    double ftmp[7];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x00)
+        MMI_SDC1(%[ftmp4], %[block], 0x08)
+        MMI_SDC1(%[ftmp5], %[block], 0x10)
+        MMI_SDC1(%[ftmp6], %[block], 0x18)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[stride_x2]            \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x20)
+        MMI_SDC1(%[ftmp4], %[block], 0x28)
+        MMI_SDC1(%[ftmp5], %[block], 0x30)
+        MMI_SDC1(%[ftmp6], %[block], 0x38)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[stride_x2]            \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x40)
+        MMI_SDC1(%[ftmp4], %[block], 0x48)
+        MMI_SDC1(%[ftmp5], %[block], 0x50)
+        MMI_SDC1(%[ftmp6], %[block], 0x58)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[stride_x2]            \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x60)
+        MMI_SDC1(%[ftmp4], %[block], 0x68)
+        MMI_SDC1(%[ftmp5], %[block], 0x70)
+        MMI_SDC1(%[ftmp6], %[block], 0x78)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [pixels]"+&r"(pixels)
+        : [block]"r"((mips_reg)block),      [stride]"r"((mips_reg)stride),
+          [stride_x2]"r"((mips_reg)(stride<<1))
+        : "memory"
+    );
+}
+
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
+        const uint8_t *src2, ptrdiff_t stride)
+{
+    double ftmp[5];
+    mips_reg tmp[1];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "li         %[tmp0],    0x08                                    \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "1:                                                             \n\t"
+        MMI_LDC1(%[ftmp0], %[src1], 0x00)
+        "or         %[ftmp1],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp2], %[src2], 0x00)
+        "or         %[ftmp3],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp1], %[block], 0x08)
+        PTR_ADDI   "%[tmp0],    %[tmp0], -0x01                          \n\t"
+        PTR_ADDIU  "%[block],   %[block], 0x10                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[stride]               \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[stride]               \n\t"
+        "bgtz       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [src1]"+&r"(src1),
+          [src2]"+&r"(src2)
+        : [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/pixblockdsp_msa.c b/libavcodec/mips/pixblockdsp_msa.c
new file mode 100644
index 0000000..86a4576
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_msa.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "pixblockdsp_mips.h"
+
+static void diff_pixels_msa(int16_t *block, const uint8_t *src1,
+                            const uint8_t *src2, int32_t stride)
+{
+    v16u8 in10, in11, in12, in13, in14, in15, in16, in17;
+    v16u8 in20, in21, in22, in23, in24, in25, in26, in27;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+
+    LD_UB8(src1, stride, in10, in11, in12, in13, in14, in15, in16, in17);
+    LD_UB8(src2, stride, in20, in21, in22, in23, in24, in25, in26, in27);
+    ILVR_B4_SH(in10, in20, in11, in21, in12, in22, in13, in23,
+               out0, out1, out2, out3);
+    ILVR_B4_SH(in14, in24, in15, in25, in16, in26, in17, in27,
+               out4, out5, out6, out7);
+    HSUB_UB4_SH(out0, out1, out2, out3, out0, out1, out2, out3);
+    HSUB_UB4_SH(out4, out5, out6, out7, out4, out5, out6, out7);
+    ST_SH8(out0, out1, out2, out3, out4, out5, out6, out7, block, 8);
+}
+
+static void copy_8bit_to_16bit_width8_msa(const uint8_t *src, int32_t src_stride,
+                                          int16_t *dst, int32_t dst_stride,
+                                          int32_t height)
+{
+    uint8_t *dst_ptr;
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+
+    dst_ptr = (uint8_t *) dst;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        ILVR_B4_UB(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0, src1, src2, src3);
+
+        ST_UB4(src0, src1, src2, src3, dst_ptr, (dst_stride * 2));
+        dst_ptr += (4 * 2 * dst_stride);
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src,
+                          ptrdiff_t stride)
+{
+    copy_width16_msa(src, stride, (uint8_t *) dest, 16, 8);
+}
+
+void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src,
+                         ptrdiff_t stride)
+{
+    copy_8bit_to_16bit_width8_msa(src, stride, dest, 8, 8);
+}
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, ptrdiff_t stride)
+{
+    diff_pixels_msa(block, src1, src2, stride);
+}
diff --git a/libavcodec/mips/qpeldsp_init_mips.c b/libavcodec/mips/qpeldsp_init_mips.c
new file mode 100644
index 0000000..140e8f8
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_init_mips.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "qpeldsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void qpeldsp_init_msa(QpelDSPContext *c)
+{
+    c->put_qpel_pixels_tab[0][0] = ff_copy_16x16_msa;
+    c->put_qpel_pixels_tab[0][1] = ff_horiz_mc_qpel_aver_src0_16width_msa;
+    c->put_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_16width_msa;
+    c->put_qpel_pixels_tab[0][3] = ff_horiz_mc_qpel_aver_src1_16width_msa;
+    c->put_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_aver_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][5] = ff_hv_mc_qpel_aver_hv_src00_16x16_msa;
+    c->put_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_aver_v_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][7] = ff_hv_mc_qpel_aver_hv_src10_16x16_msa;
+    c->put_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_16x16_msa;
+    c->put_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_aver_h_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_16x16_msa;
+    c->put_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_aver_h_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_aver_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][13] = ff_hv_mc_qpel_aver_hv_src01_16x16_msa;
+    c->put_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_aver_v_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][15] = ff_hv_mc_qpel_aver_hv_src11_16x16_msa;
+
+    c->put_qpel_pixels_tab[1][0] = ff_copy_8x8_msa;
+    c->put_qpel_pixels_tab[1][1] = ff_horiz_mc_qpel_aver_src0_8width_msa;
+    c->put_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_8width_msa;
+    c->put_qpel_pixels_tab[1][3] = ff_horiz_mc_qpel_aver_src1_8width_msa;
+    c->put_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_aver_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_aver_hv_src00_8x8_msa;
+    c->put_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_aver_v_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_aver_hv_src10_8x8_msa;
+    c->put_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_8x8_msa;
+    c->put_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_aver_h_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_8x8_msa;
+    c->put_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_aver_h_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_aver_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_aver_hv_src01_8x8_msa;
+    c->put_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_aver_v_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_aver_hv_src11_8x8_msa;
+
+    c->put_no_rnd_qpel_pixels_tab[0][0] = ff_copy_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][1] =
+        ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_no_rnd_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][3] =
+        ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][4] =
+        ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][5] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][6] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][7] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_no_rnd_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][9] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_no_rnd_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][11] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][12] =
+        ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][13] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][14] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][15] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa;
+
+    c->put_no_rnd_qpel_pixels_tab[1][0] = ff_copy_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][1] =
+        ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_no_rnd_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][3] =
+        ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][4] =
+        ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][5] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][6] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][7] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_no_rnd_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][9] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_no_rnd_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][11] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][12] =
+        ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][13] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][14] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][15] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa;
+
+    c->avg_qpel_pixels_tab[0][0] = ff_avg_width16_msa;
+    c->avg_qpel_pixels_tab[0][1] =
+        ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa;
+    c->avg_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_avg_dst_16width_msa;
+    c->avg_qpel_pixels_tab[0][3] =
+        ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa;
+    c->avg_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][5] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa;
+    c->avg_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][7] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa;
+    c->avg_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_avg_dst_16x16_msa;
+    c->avg_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_avg_dst_16x16_msa;
+    c->avg_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][13] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa;
+    c->avg_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][15] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa;
+
+    c->avg_qpel_pixels_tab[1][0] = ff_avg_width8_msa;
+    c->avg_qpel_pixels_tab[1][1] =
+        ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa;
+    c->avg_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_avg_dst_8width_msa;
+    c->avg_qpel_pixels_tab[1][3] =
+        ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa;
+    c->avg_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa;
+    c->avg_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa;
+    c->avg_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_avg_dst_8x8_msa;
+    c->avg_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_avg_dst_8x8_msa;
+    c->avg_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa;
+    c->avg_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa;
+}
+#endif  // #if HAVE_MSA
+
+void ff_qpeldsp_init_mips(QpelDSPContext *c)
+{
+#if HAVE_MSA
+    qpeldsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/qpeldsp_mips.h b/libavcodec/mips/qpeldsp_mips.h
new file mode 100644
index 0000000..704d221
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_mips.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_QPELDSP_MIPS_H
+#define AVCODEC_MIPS_QPELDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_copy_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_copy_16x16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_width8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_width16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dst, const uint8_t *src,
+                                            ptrdiff_t stride);
+void ff_horiz_mc_qpel_8width_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_horiz_mc_qpel_16width_msa(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dst, const uint8_t *src,
+                                            ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dst,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dst, const uint8_t *src,
+                                          ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dst,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride);
+
+void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_vert_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t stride);
+void ff_vert_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+
+void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t stride);
+void ff_hv_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_QPELDSP_MIPS_H
diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c
new file mode 100644
index 0000000..4710b3f
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_msa.c
@@ -0,0 +1,6518 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "qpeldsp_mips.h"
+
+#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)  \
+( {                                                                     \
+    v16u8 out, tmp0, tmp1;                                              \
+    v16u8 data0, data1, data2, data3, data4, data5;                     \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
+    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
+    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
+    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
+    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
+    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
+    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
+    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
+    sum0_r *= (v8u16) (coef0);                                          \
+    sum0_l *= (v8u16) (coef0);                                          \
+    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
+    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
+    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    SRARI_H2_SH(res_r, res_l, 5);                                       \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,                       \
+                                      mask0, mask1, mask2, mask3,       \
+                                      coef0, coef1, coef2)              \
+( {                                                                     \
+    v16u8 out;                                                          \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                               \
+    v8i16 res0_r, res1_r;                                               \
+                                                                        \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);   \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);   \
+    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                        \
+    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);          \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);   \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);   \
+    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);         \
+    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);         \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
+    res1_r = (v8i16) (sum4_r - sum7_r);                                 \
+    SRARI_H2_SH(res0_r, res1_r, 5);                                     \
+    CLIP_SH2_0_255(res0_r, res1_r);                                     \
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);        \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,                        \
+                                           mask0, mask1, mask2, mask3,  \
+                                           coef0, coef1, coef2)         \
+( {                                                                     \
+    v16u8 out;                                                          \
+    v8i16 res0_r;                                                       \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+                                                                        \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);   \
+    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);            \
+    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);             \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);   \
+    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);         \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
+    res0_r = __msa_srari_h(res0_r, 5);                                  \
+    res0_r = CLIP_SH_0_255(res0_r);                                     \
+    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);        \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,   \
+                                                    mask2, mask3, coef0,  \
+                                                    coef1, coef2)         \
+( {                                                                       \
+    v16u8 out;                                                            \
+    v8i16 res0_r;                                                         \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
+                                                                          \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);     \
+    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);              \
+    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);               \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);     \
+    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);           \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
+    res0_r += 15;                                                         \
+    res0_r >>= 5;                                                         \
+    res0_r = CLIP_SH_0_255(res0_r);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
+                                                                          \
+    out;                                                                  \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,              \
+                                         coef0, coef1, coef2)           \
+( {                                                                     \
+    v16u8 out, tmp0, tmp1;                                              \
+    v16u8 data0, data1, data2, data3, data4, data5;                     \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
+    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
+    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
+    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
+    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
+    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
+    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
+    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
+    sum0_r *= (v8u16) (coef0);                                          \
+    sum0_l *= (v8u16) (coef0);                                          \
+    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
+    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
+    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    res_r += 15;                                                        \
+    res_l += 15;                                                        \
+    res_r >>= 5;                                                        \
+    res_l >>= 5;                                                        \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1,                  \
+                                               mask0, mask1, mask2, mask3,  \
+                                               coef0, coef1, coef2)         \
+( {                                                                         \
+    v16u8 out;                                                              \
+    v8i16 res0_r, res1_r;                                                   \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                   \
+    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                                   \
+                                                                            \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);       \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);       \
+    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                            \
+    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);              \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);       \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);       \
+    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);             \
+    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);             \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                     \
+    res1_r = (v8i16) (sum4_r - sum7_r);                                     \
+    res0_r += 15;                                                           \
+    res1_r += 15;                                                           \
+    res0_r >>= 5;                                                           \
+    res1_r >>= 5;                                                           \
+    CLIP_SH2_0_255(res0_r, res1_r);                                         \
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);            \
+                                                                            \
+    out;                                                                    \
+} )
+
+#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3,                  \
+                               inp4, inp5, inp6, inp7,                  \
+                               coef0, coef1, coef2)                     \
+( {                                                                     \
+    v16u8 res;                                                          \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
+    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
+    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
+    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    SRARI_H2_SH(res_r, res_l, 5);                                       \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03,        \
+                                     inp04, inp05, inp06, inp07,        \
+                                     inp10, inp11, inp12, inp13,        \
+                                     inp14, inp15, inp16, inp17,        \
+                                     coef0, coef1, coef2)               \
+( {                                                                     \
+    v16u8 res;                                                          \
+    v8i16 val0, val1;                                                   \
+    v8u16 sum00, sum01, sum02, sum03;                                   \
+    v8u16 sum10, sum11, sum12, sum13;                                   \
+                                                                        \
+    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,  \
+               sum00, sum10, sum03, sum13);                             \
+    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);              \
+    HADD_UB2_UH(sum03, sum13, sum03, sum13);                            \
+    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,  \
+               sum02, sum12, sum01, sum11);                             \
+    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);             \
+    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);             \
+    val0 = (v8i16) (sum00 - sum03);                                     \
+    val1 = (v8i16) (sum10 - sum13);                                     \
+    SRARI_H2_SH(val0, val1, 5);                                         \
+    CLIP_SH2_0_255(val0, val1);                                         \
+    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);            \
+                                                                        \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3,         \
+                                        inp4, inp5, inp6, inp7,         \
+                                        coef0, coef1, coef2)            \
+( {                                                                     \
+    v16u8 res;                                                          \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
+    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
+    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
+    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    res_r += 15;                                                        \
+    res_l += 15;                                                        \
+    res_r >>= 5;                                                        \
+    res_l >>= 5;                                                        \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03,  \
+                                              inp04, inp05, inp06, inp07,  \
+                                              inp10, inp11, inp12, inp13,  \
+                                              inp14, inp15, inp16, inp17,  \
+                                              coef0, coef1, coef2)         \
+( {                                                                        \
+    v16u8 res;                                                             \
+    v8i16 val0, val1;                                                      \
+    v8u16 sum00, sum01, sum02, sum03;                                      \
+    v8u16 sum10, sum11, sum12, sum13;                                      \
+                                                                           \
+    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,     \
+               sum00, sum10, sum03, sum13);                                \
+    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);                 \
+    HADD_UB2_UH(sum03, sum13, sum03, sum13);                               \
+    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,     \
+               sum02, sum12, sum01, sum11);                                \
+    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);                \
+    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);                \
+    val0 = (v8i16) (sum00 - sum03);                                        \
+    val1 = (v8i16) (sum10 - sum13);                                        \
+    val0 += 15;                                                            \
+    val1 += 15;                                                            \
+    val0 >>= 5;                                                            \
+    val1 >>= 5;                                                            \
+    CLIP_SH2_0_255(val0, val1);                                            \
+    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);               \
+                                                                           \
+    res;                                                                   \
+} )
+
+static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride,
+                                                int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_8width_msa(const uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_16width_msa(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride,
+                                                int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        res0 = __msa_ave_u_b(inp0, res0);
+        res1 = __msa_ave_u_b(inp2, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
+                                            int32_t src_stride,
+                                            uint8_t *dst,
+                                            int32_t dst_stride,
+                                            int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        res0 = __msa_ave_u_b(inp0, res0);
+        res1 = __msa_ave_u_b(inp2, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
+                                                        int32_t src_stride,
+                                                        uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1;
+    v16u8 dst0, dst1;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1;
+    v16u8 dst0, dst1;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
+                                                        int32_t src_stride,
+                                                        uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+
+static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp0);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp8, inp9);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp10, inp11);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp12, inp13);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp14, inp15);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp15);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_8x8_msa(const uint8_t *src,
+                                 int32_t src_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_16x16_msa(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    inp4 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+}
+
+static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    inp4 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp16);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1,
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp0);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+}
+
+static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
+                                        int32_t src_stride,
+                                        uint8_t *dst,
+                                        int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1,
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1,
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp16);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp8, inp9);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp10, inp11);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp12, inp13);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp14, inp15);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
+                                         int32_t src_stride,
+                                         uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    res = __msa_ave_u_b(inp0, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    res = __msa_ave_u_b(inp1, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
+                                        int32_t src_stride,
+                                        uint8_t *dst,
+                                        int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_ave_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_ave_u_b(avg0, res0);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_ave_u_b(avg0, res0);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_ave_u_b(avg1, res1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_ave_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_ave_u_b(avg0, res0);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    res = __msa_aver_u_b(inp0, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
+                                            int32_t src_stride,
+                                            uint8_t *dst,
+                                            int32_t dst_stride,
+                                            int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    res = __msa_aver_u_b(inp1, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_16x16_msa(const uint8_t *src,
+                                 int32_t src_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
+    res0 = __msa_aver_u_b(avg0, res0);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                         mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+
+}
+
+static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                       uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
+                                        horiz6, horiz7, horiz8, horiz5, horiz4,
+                                        horiz3, horiz2, horiz6, horiz7, horiz8,
+                                        horiz8, const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
+                                        horiz8, horiz8, horiz7, horiz7, horiz6,
+                                        horiz5, horiz4, horiz8, horiz8, horiz7,
+                                        horiz6, const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
+                                        horiz2, horiz3, horiz4, horiz1, horiz0,
+                                        horiz0, horiz1, horiz2, horiz3, horiz4,
+                                        horiz5, const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
+                                        horiz4, horiz5, horiz6, horiz3, horiz2,
+                                        horiz1, horiz0, horiz4, horiz5, horiz6,
+                                        horiz7, const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
+                                        horiz6, horiz7, horiz8, horiz5, horiz4,
+                                        horiz3, horiz2, horiz6, horiz7, horiz8,
+                                        horiz8, const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
+                                        horiz8, horiz8, horiz7, horiz7, horiz6,
+                                        horiz5, horiz4, horiz8, horiz8, horiz7,
+                                        horiz6, const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t src0, src1;
+    int32_t loop_cnt;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        src0 = LD(src);
+        src += src_stride;
+        src1 = LD(src);
+        src += src_stride;
+
+        SD(src0, dst);
+        dst += dst_stride;
+        SD(src1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(src, src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+           dst, dst_stride);
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    copy_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    copy_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
+                                            const uint8_t *src,
+                                            ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
+                                  const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
+                                            const uint8_t *src,
+                                            ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
+                                        const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_width8_msa(src, stride, dest, stride, 8);
+}
+
+void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_width16_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
+                                          const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+
+void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
+                             ptrdiff_t stride)
+{
+    vert_mc_qpel_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
+                               ptrdiff_t stride)
+{
+    vert_mc_qpel_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
+                                    const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
+                                      const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
+                                     const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+/* HV cases */
+void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
+                             ptrdiff_t stride)
+{
+    hv_mc_qpel_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
+                           ptrdiff_t stride)
+{
+    hv_mc_qpel_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
+                                     const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
+                                   const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
+                                    const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
+                                  const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
diff --git a/libavcodec/mips/sbrdsp_mips.c b/libavcodec/mips/sbrdsp_mips.c
new file mode 100644
index 0000000..1b0a106
--- /dev/null
+++ b/libavcodec/mips/sbrdsp_mips.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC Spectral Band Replication decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/sbrdsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/sbrdsp.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void sbr_qmf_pre_shuffle_mips(float *z)
+{
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6;
+    float *z1 = &z[66];
+    float *z2 = &z[59];
+    float *z3 = &z[2];
+    float *z4 = z1 + 60;
+
+    /* loop unrolled 5 times */
+    __asm__ volatile (
+        "lui    %[Temp6],   0x8000                  \n\t"
+    "1:                                             \n\t"
+        "lw     %[Temp1],   0(%[z2])                \n\t"
+        "lw     %[Temp2],   4(%[z2])                \n\t"
+        "lw     %[Temp3],   8(%[z2])                \n\t"
+        "lw     %[Temp4],   12(%[z2])               \n\t"
+        "lw     %[Temp5],   16(%[z2])               \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
+        "xor    %[Temp2],   %[Temp2],   %[Temp6]    \n\t"
+        "xor    %[Temp3],   %[Temp3],   %[Temp6]    \n\t"
+        "xor    %[Temp4],   %[Temp4],   %[Temp6]    \n\t"
+        "xor    %[Temp5],   %[Temp5],   %[Temp6]    \n\t"
+        PTR_ADDIU "%[z2],   %[z2],      -20         \n\t"
+        "sw     %[Temp1],   32(%[z1])               \n\t"
+        "sw     %[Temp2],   24(%[z1])               \n\t"
+        "sw     %[Temp3],   16(%[z1])               \n\t"
+        "sw     %[Temp4],   8(%[z1])                \n\t"
+        "sw     %[Temp5],   0(%[z1])                \n\t"
+        "lw     %[Temp1],   0(%[z3])                \n\t"
+        "lw     %[Temp2],   4(%[z3])                \n\t"
+        "lw     %[Temp3],   8(%[z3])                \n\t"
+        "lw     %[Temp4],   12(%[z3])               \n\t"
+        "lw     %[Temp5],   16(%[z3])               \n\t"
+        "sw     %[Temp1],   4(%[z1])                \n\t"
+        "sw     %[Temp2],   12(%[z1])               \n\t"
+        "sw     %[Temp3],   20(%[z1])               \n\t"
+        "sw     %[Temp4],   28(%[z1])               \n\t"
+        "sw     %[Temp5],   36(%[z1])               \n\t"
+        PTR_ADDIU "%[z3],   %[z3],      20          \n\t"
+        PTR_ADDIU "%[z1],   %[z1],      40          \n\t"
+        "bne    %[z1],      %[z4],      1b          \n\t"
+        "lw     %[Temp1],   132(%[z])               \n\t"
+        "lw     %[Temp2],   128(%[z])               \n\t"
+        "lw     %[Temp3],   0(%[z])                 \n\t"
+        "lw     %[Temp4],   4(%[z])                 \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
+        "sw     %[Temp1],   504(%[z])               \n\t"
+        "sw     %[Temp2],   508(%[z])               \n\t"
+        "sw     %[Temp3],   256(%[z])               \n\t"
+        "sw     %[Temp4],   260(%[z])               \n\t"
+
+        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
+          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
+          [Temp5]"=&r"(Temp5), [Temp6]"=&r"(Temp6),
+          [z1]"+r"(z1), [z2]"+r"(z2), [z3]"+r"(z3)
+        : [z4]"r"(z4), [z]"r"(z)
+        : "memory"
+    );
+}
+
+static void sbr_qmf_post_shuffle_mips(float W[32][2], const float *z)
+{
+    int Temp1, Temp2, Temp3, Temp4, Temp5;
+    float *W_ptr = (float *)W;
+    float *z1    = (float *)z;
+    float *z2    = (float *)&z[60];
+    float *z_end = z1 + 32;
+
+     /* loop unrolled 4 times */
+    __asm__ volatile (
+        "lui    %[Temp5],   0x8000                  \n\t"
+    "1:                                             \n\t"
+        "lw     %[Temp1],   0(%[z2])                \n\t"
+        "lw     %[Temp2],   4(%[z2])                \n\t"
+        "lw     %[Temp3],   8(%[z2])                \n\t"
+        "lw     %[Temp4],   12(%[z2])               \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp5]    \n\t"
+        "xor    %[Temp2],   %[Temp2],   %[Temp5]    \n\t"
+        "xor    %[Temp3],   %[Temp3],   %[Temp5]    \n\t"
+        "xor    %[Temp4],   %[Temp4],   %[Temp5]    \n\t"
+        PTR_ADDIU "%[z2],   %[z2],      -16         \n\t"
+        "sw     %[Temp1],   24(%[W_ptr])            \n\t"
+        "sw     %[Temp2],   16(%[W_ptr])            \n\t"
+        "sw     %[Temp3],   8(%[W_ptr])             \n\t"
+        "sw     %[Temp4],   0(%[W_ptr])             \n\t"
+        "lw     %[Temp1],   0(%[z1])                \n\t"
+        "lw     %[Temp2],   4(%[z1])                \n\t"
+        "lw     %[Temp3],   8(%[z1])                \n\t"
+        "lw     %[Temp4],   12(%[z1])               \n\t"
+        "sw     %[Temp1],   4(%[W_ptr])             \n\t"
+        "sw     %[Temp2],   12(%[W_ptr])            \n\t"
+        "sw     %[Temp3],   20(%[W_ptr])            \n\t"
+        "sw     %[Temp4],   28(%[W_ptr])            \n\t"
+        PTR_ADDIU "%[z1],   %[z1],      16          \n\t"
+        PTR_ADDIU "%[W_ptr],%[W_ptr],   32          \n\t"
+        "bne    %[z1],      %[z_end],   1b          \n\t"
+
+        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
+          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
+          [Temp5]"=&r"(Temp5), [z1]"+r"(z1),
+          [z2]"+r"(z2), [W_ptr]"+r"(W_ptr)
+        : [z_end]"r"(z_end)
+        : "memory"
+    );
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void sbr_sum64x5_mips(float *z)
+{
+    int k;
+    float *z1;
+    float f1, f2, f3, f4, f5, f6, f7, f8;
+    for (k = 0; k < 64; k += 8) {
+
+        z1 = &z[k];
+
+         /* loop unrolled 8 times */
+        __asm__ volatile (
+            "lwc1   $f0,    0(%[z1])        \n\t"
+            "lwc1   $f1,    256(%[z1])      \n\t"
+            "lwc1   $f2,    4(%[z1])        \n\t"
+            "lwc1   $f3,    260(%[z1])      \n\t"
+            "lwc1   $f4,    8(%[z1])        \n\t"
+            "add.s  %[f1],  $f0,    $f1     \n\t"
+            "lwc1   $f5,    264(%[z1])      \n\t"
+            "add.s  %[f2],  $f2,    $f3     \n\t"
+            "lwc1   $f6,    12(%[z1])       \n\t"
+            "lwc1   $f7,    268(%[z1])      \n\t"
+            "add.s  %[f3],  $f4,    $f5     \n\t"
+            "lwc1   $f8,    16(%[z1])       \n\t"
+            "lwc1   $f9,    272(%[z1])      \n\t"
+            "add.s  %[f4],  $f6,    $f7     \n\t"
+            "lwc1   $f10,   20(%[z1])       \n\t"
+            "lwc1   $f11,   276(%[z1])      \n\t"
+            "add.s  %[f5],  $f8,    $f9     \n\t"
+            "lwc1   $f12,   24(%[z1])       \n\t"
+            "lwc1   $f13,   280(%[z1])      \n\t"
+            "add.s  %[f6],  $f10,   $f11    \n\t"
+            "lwc1   $f14,   28(%[z1])       \n\t"
+            "lwc1   $f15,   284(%[z1])      \n\t"
+            "add.s  %[f7],  $f12,   $f13    \n\t"
+            "lwc1   $f0,    512(%[z1])      \n\t"
+            "lwc1   $f1,    516(%[z1])      \n\t"
+            "add.s  %[f8],  $f14,   $f15    \n\t"
+            "lwc1   $f2,    520(%[z1])      \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    524(%[z1])      \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    528(%[z1])      \n\t"
+            "lwc1   $f5,    532(%[z1])      \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    536(%[z1])      \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    540(%[z1])      \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "lwc1   $f0,    768(%[z1])      \n\t"
+            "lwc1   $f1,    772(%[z1])      \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "lwc1   $f2,    776(%[z1])      \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    780(%[z1])      \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    784(%[z1])      \n\t"
+            "lwc1   $f5,    788(%[z1])      \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    792(%[z1])      \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    796(%[z1])      \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "lwc1   $f0,    1024(%[z1])     \n\t"
+            "lwc1   $f1,    1028(%[z1])     \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "lwc1   $f2,    1032(%[z1])     \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    1036(%[z1])     \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    1040(%[z1])     \n\t"
+            "lwc1   $f5,    1044(%[z1])     \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    1048(%[z1])     \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    1052(%[z1])     \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "swc1   %[f1],  0(%[z1])        \n\t"
+            "swc1   %[f2],  4(%[z1])        \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "swc1   %[f3],  8(%[z1])        \n\t"
+            "swc1   %[f4],  12(%[z1])       \n\t"
+            "swc1   %[f5],  16(%[z1])       \n\t"
+            "swc1   %[f6],  20(%[z1])       \n\t"
+            "swc1   %[f7],  24(%[z1])       \n\t"
+            "swc1   %[f8],  28(%[z1])       \n\t"
+
+            : [f1]"=&f"(f1), [f2]"=&f"(f2), [f3]"=&f"(f3),
+              [f4]"=&f"(f4), [f5]"=&f"(f5), [f6]"=&f"(f6),
+              [f7]"=&f"(f7), [f8]"=&f"(f8)
+            : [z1]"r"(z1)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
+              "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
+              "$f12", "$f13", "$f14", "$f15",
+              "memory"
+        );
+    }
+}
+
+static float sbr_sum_square_mips(float (*x)[2], int n)
+{
+    float sum0 = 0.0f, sum1 = 0.0f;
+    float *p_x;
+    float temp0, temp1, temp2, temp3;
+    float *loop_end;
+    p_x = &x[0][0];
+    loop_end = p_x + (n >> 1)*4 - 4;
+
+    __asm__ volatile (
+        ".set      push                                             \n\t"
+        ".set      noreorder                                        \n\t"
+        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
+        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
+        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
+        "lwc1      %[temp3],   12(%[p_x])                           \n\t"
+    "1:                                                             \n\t"
+        PTR_ADDIU "%[p_x],     %[p_x],       16                     \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
+        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
+        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
+        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
+        "bne       %[p_x],     %[loop_end],  1b                     \n\t"
+        " lwc1     %[temp3],   12(%[p_x])                           \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
+        ".set      pop                                              \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [sum0]"+f"(sum0), [sum1]"+f"(sum1),
+          [p_x]"+r"(p_x)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+    return sum0 + sum1;
+}
+
+static void sbr_qmf_deint_bfly_mips(float *v, const float *src0, const float *src1)
+{
+    int i;
+    float temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9, temp10, temp11;
+    float *v0 = v;
+    float *v1 = &v[127];
+    float *psrc0 = (float*)src0;
+    float *psrc1 = (float*)&src1[63];
+
+    for (i = 0; i < 4; i++) {
+
+         /* loop unrolled 16 times */
+        __asm__ volatile(
+            "lwc1       %[temp0],   0(%[src0])             \n\t"
+            "lwc1       %[temp1],   0(%[src1])             \n\t"
+            "lwc1       %[temp3],   4(%[src0])             \n\t"
+            "lwc1       %[temp4],   -4(%[src1])            \n\t"
+            "lwc1       %[temp6],   8(%[src0])             \n\t"
+            "lwc1       %[temp7],   -8(%[src1])            \n\t"
+            "lwc1       %[temp9],   12(%[src0])            \n\t"
+            "lwc1       %[temp10],  -12(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   0(%[v1])               \n\t"
+            "swc1       %[temp0],   0(%[v0])               \n\t"
+            "swc1       %[temp5],   -4(%[v1])              \n\t"
+            "swc1       %[temp3],   4(%[v0])               \n\t"
+            "swc1       %[temp8],   -8(%[v1])              \n\t"
+            "swc1       %[temp6],   8(%[v0])               \n\t"
+            "swc1       %[temp11],  -12(%[v1])             \n\t"
+            "swc1       %[temp9],   12(%[v0])              \n\t"
+            "lwc1       %[temp0],   16(%[src0])            \n\t"
+            "lwc1       %[temp1],   -16(%[src1])           \n\t"
+            "lwc1       %[temp3],   20(%[src0])            \n\t"
+            "lwc1       %[temp4],   -20(%[src1])           \n\t"
+            "lwc1       %[temp6],   24(%[src0])            \n\t"
+            "lwc1       %[temp7],   -24(%[src1])           \n\t"
+            "lwc1       %[temp9],   28(%[src0])            \n\t"
+            "lwc1       %[temp10],  -28(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -16(%[v1])             \n\t"
+            "swc1       %[temp0],   16(%[v0])              \n\t"
+            "swc1       %[temp5],   -20(%[v1])             \n\t"
+            "swc1       %[temp3],   20(%[v0])              \n\t"
+            "swc1       %[temp8],   -24(%[v1])             \n\t"
+            "swc1       %[temp6],   24(%[v0])              \n\t"
+            "swc1       %[temp11],  -28(%[v1])             \n\t"
+            "swc1       %[temp9],   28(%[v0])              \n\t"
+            "lwc1       %[temp0],   32(%[src0])            \n\t"
+            "lwc1       %[temp1],   -32(%[src1])           \n\t"
+            "lwc1       %[temp3],   36(%[src0])            \n\t"
+            "lwc1       %[temp4],   -36(%[src1])           \n\t"
+            "lwc1       %[temp6],   40(%[src0])            \n\t"
+            "lwc1       %[temp7],   -40(%[src1])           \n\t"
+            "lwc1       %[temp9],   44(%[src0])            \n\t"
+            "lwc1       %[temp10],  -44(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -32(%[v1])             \n\t"
+            "swc1       %[temp0],   32(%[v0])              \n\t"
+            "swc1       %[temp5],   -36(%[v1])             \n\t"
+            "swc1       %[temp3],   36(%[v0])              \n\t"
+            "swc1       %[temp8],   -40(%[v1])             \n\t"
+            "swc1       %[temp6],   40(%[v0])              \n\t"
+            "swc1       %[temp11],  -44(%[v1])             \n\t"
+            "swc1       %[temp9],   44(%[v0])              \n\t"
+            "lwc1       %[temp0],   48(%[src0])            \n\t"
+            "lwc1       %[temp1],   -48(%[src1])           \n\t"
+            "lwc1       %[temp3],   52(%[src0])            \n\t"
+            "lwc1       %[temp4],   -52(%[src1])           \n\t"
+            "lwc1       %[temp6],   56(%[src0])            \n\t"
+            "lwc1       %[temp7],   -56(%[src1])           \n\t"
+            "lwc1       %[temp9],   60(%[src0])            \n\t"
+            "lwc1       %[temp10],  -60(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -48(%[v1])             \n\t"
+            "swc1       %[temp0],   48(%[v0])              \n\t"
+            "swc1       %[temp5],   -52(%[v1])             \n\t"
+            "swc1       %[temp3],   52(%[v0])              \n\t"
+            "swc1       %[temp8],   -56(%[v1])             \n\t"
+            "swc1       %[temp6],   56(%[v0])              \n\t"
+            "swc1       %[temp11],  -60(%[v1])             \n\t"
+            "swc1       %[temp9],   60(%[v0])              \n\t"
+            PTR_ADDIU " %[src0],    %[src0],    64         \n\t"
+            PTR_ADDIU " %[src1],    %[src1],    -64        \n\t"
+            PTR_ADDIU " %[v0],      %[v0],      64         \n\t"
+            PTR_ADDIU " %[v1],      %[v1],      -64        \n\t"
+
+            : [v0]"+r"(v0), [v1]"+r"(v1), [src0]"+r"(psrc0), [src1]"+r"(psrc1),
+              [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
+            :
+            :"memory"
+        );
+    }
+}
+
+static void sbr_autocorrelate_mips(const float x[40][2], float phi[3][2][2])
+{
+    int i;
+    float real_sum_0 = 0.0f;
+    float real_sum_1 = 0.0f;
+    float real_sum_2 = 0.0f;
+    float imag_sum_1 = 0.0f;
+    float imag_sum_2 = 0.0f;
+    float *p_x, *p_phi;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+    float temp7, temp_r, temp_r1, temp_r2, temp_r3, temp_r4;
+    p_x = (float*)&x[0][0];
+    p_phi = &phi[0][0][0];
+
+    __asm__ volatile (
+        "lwc1    %[temp0],      8(%[p_x])                           \n\t"
+        "lwc1    %[temp1],      12(%[p_x])                          \n\t"
+        "lwc1    %[temp2],      16(%[p_x])                          \n\t"
+        "lwc1    %[temp3],      20(%[p_x])                          \n\t"
+        "lwc1    %[temp4],      24(%[p_x])                          \n\t"
+        "lwc1    %[temp5],      28(%[p_x])                          \n\t"
+        "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
+        "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
+        "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
+        "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
+        "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
+        "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
+        "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
+        "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
+        "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
+        "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
+        "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+        "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+        "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+        "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+        "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+        PTR_ADDIU "%[p_x],      %[p_x],        8                    \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+          [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
+          [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1), [temp_r2]"=&f"(temp_r2),
+          [temp_r3]"=&f"(temp_r3), [temp_r4]"=&f"(temp_r4),
+          [p_x]"+r"(p_x), [imag_sum_2]"+f"(imag_sum_2)
+        :
+        : "memory"
+    );
+
+    for (i = 0; i < 12; i++) {
+        __asm__ volatile (
+            "lwc1    %[temp0],      8(%[p_x])                           \n\t"
+            "lwc1    %[temp1],      12(%[p_x])                          \n\t"
+            "lwc1    %[temp2],      16(%[p_x])                          \n\t"
+            "lwc1    %[temp3],      20(%[p_x])                          \n\t"
+            "lwc1    %[temp4],      24(%[p_x])                          \n\t"
+            "lwc1    %[temp5],      28(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
+            "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
+            "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
+            "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
+            "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            "lwc1    %[temp0],      32(%[p_x])                          \n\t"
+            "lwc1    %[temp1],      36(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp3],      %[temp3]             \n\t"
+            "mul.s   %[temp_r1],    %[temp3],      %[temp5]             \n\t"
+            "mul.s   %[temp_r2],    %[temp3],      %[temp4]             \n\t"
+            "mul.s   %[temp_r3],    %[temp3],      %[temp1]             \n\t"
+            "mul.s   %[temp_r4],    %[temp3],      %[temp0]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp2],  %[temp2]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp2],  %[temp4]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp2],  %[temp5]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp2],  %[temp0]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp2],  %[temp1]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            "lwc1    %[temp2],      40(%[p_x])                          \n\t"
+            "lwc1    %[temp3],      44(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp5],      %[temp5]             \n\t"
+            "mul.s   %[temp_r1],    %[temp5],      %[temp1]             \n\t"
+            "mul.s   %[temp_r2],    %[temp5],      %[temp0]             \n\t"
+            "mul.s   %[temp_r3],    %[temp5],      %[temp3]             \n\t"
+            "mul.s   %[temp_r4],    %[temp5],      %[temp2]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp4],  %[temp4]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp4],  %[temp0]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp4],  %[temp1]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp4],  %[temp2]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp4],  %[temp3]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            PTR_ADDIU "%[p_x],      %[p_x],        24                   \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+              [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
+              [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1),
+              [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
+              [temp_r4]"=&f"(temp_r4), [p_x]"+r"(p_x),
+              [imag_sum_2]"+f"(imag_sum_2)
+            :
+            : "memory"
+        );
+    }
+    __asm__ volatile (
+        "lwc1    %[temp0],    -296(%[p_x])                        \n\t"
+        "lwc1    %[temp1],    -292(%[p_x])                        \n\t"
+        "lwc1    %[temp2],    8(%[p_x])                           \n\t"
+        "lwc1    %[temp3],    12(%[p_x])                          \n\t"
+        "lwc1    %[temp4],    -288(%[p_x])                        \n\t"
+        "lwc1    %[temp5],    -284(%[p_x])                        \n\t"
+        "lwc1    %[temp6],    -280(%[p_x])                        \n\t"
+        "lwc1    %[temp7],    -276(%[p_x])                        \n\t"
+        "madd.s  %[temp_r],   %[real_sum_0], %[temp0],  %[temp0]  \n\t"
+        "madd.s  %[temp_r1],  %[real_sum_0], %[temp2],  %[temp2]  \n\t"
+        "madd.s  %[temp_r2],  %[real_sum_1], %[temp0],  %[temp4]  \n\t"
+        "madd.s  %[temp_r3],  %[imag_sum_1], %[temp0],  %[temp5]  \n\t"
+        "madd.s  %[temp_r],   %[temp_r],     %[temp1],  %[temp1]  \n\t"
+        "madd.s  %[temp_r1],  %[temp_r1],    %[temp3],  %[temp3]  \n\t"
+        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp5]  \n\t"
+        "nmsub.s  %[temp_r3], %[temp_r3],    %[temp1],  %[temp4]  \n\t"
+        "lwc1    %[temp4],    16(%[p_x])                          \n\t"
+        "lwc1    %[temp5],    20(%[p_x])                          \n\t"
+        "swc1    %[temp_r],   40(%[p_phi])                        \n\t"
+        "swc1    %[temp_r1],  16(%[p_phi])                        \n\t"
+        "swc1    %[temp_r2],  24(%[p_phi])                        \n\t"
+        "swc1    %[temp_r3],  28(%[p_phi])                        \n\t"
+        "madd.s  %[temp_r],   %[real_sum_1], %[temp2],  %[temp4]  \n\t"
+        "madd.s  %[temp_r1],  %[imag_sum_1], %[temp2],  %[temp5]  \n\t"
+        "madd.s  %[temp_r2],  %[real_sum_2], %[temp0],  %[temp6]  \n\t"
+        "madd.s  %[temp_r3],  %[imag_sum_2], %[temp0],  %[temp7]  \n\t"
+        "madd.s  %[temp_r],   %[temp_r],     %[temp3],  %[temp5]  \n\t"
+        "nmsub.s %[temp_r1],  %[temp_r1],    %[temp3],  %[temp4]  \n\t"
+        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp7]  \n\t"
+        "nmsub.s %[temp_r3],  %[temp_r3],    %[temp1],  %[temp6]  \n\t"
+        "swc1    %[temp_r],   0(%[p_phi])                         \n\t"
+        "swc1    %[temp_r1],  4(%[p_phi])                         \n\t"
+        "swc1    %[temp_r2],  8(%[p_phi])                         \n\t"
+        "swc1    %[temp_r3],  12(%[p_phi])                        \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp_r]"=&f"(temp_r),
+          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+          [real_sum_2]"+f"(real_sum_2), [imag_sum_1]"+f"(imag_sum_1),
+          [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
+          [temp_r1]"=&f"(temp_r1), [p_phi]"+r"(p_phi),
+          [imag_sum_2]"+f"(imag_sum_2)
+        : [p_x]"r"(p_x)
+        : "memory"
+    );
+}
+
+static void sbr_hf_gen_mips(float (*X_high)[2], const float (*X_low)[2],
+                         const float alpha0[2], const float alpha1[2],
+                         float bw, int start, int end)
+{
+    float alpha[4];
+    int i;
+    float *p_x_low = (float*)&X_low[0][0] + 2*start;
+    float *p_x_high = &X_high[0][0] + 2*start;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+    float temp7, temp8, temp9, temp10, temp11, temp12;
+
+    alpha[0] = alpha1[0] * bw * bw;
+    alpha[1] = alpha1[1] * bw * bw;
+    alpha[2] = alpha0[0] * bw;
+    alpha[3] = alpha0[1] * bw;
+
+    for (i = start; i < end; i++) {
+        __asm__ volatile (
+            "lwc1    %[temp0],    -16(%[p_x_low])                        \n\t"
+            "lwc1    %[temp1],    -12(%[p_x_low])                        \n\t"
+            "lwc1    %[temp2],    -8(%[p_x_low])                         \n\t"
+            "lwc1    %[temp3],    -4(%[p_x_low])                         \n\t"
+            "lwc1    %[temp5],    0(%[p_x_low])                          \n\t"
+            "lwc1    %[temp6],    4(%[p_x_low])                          \n\t"
+            "lwc1    %[temp7],    0(%[alpha])                            \n\t"
+            "lwc1    %[temp8],    4(%[alpha])                            \n\t"
+            "lwc1    %[temp9],    8(%[alpha])                            \n\t"
+            "lwc1    %[temp10],   12(%[alpha])                           \n\t"
+            PTR_ADDIU "%[p_x_high], %[p_x_high],   8                     \n\t"
+            PTR_ADDIU "%[p_x_low],  %[p_x_low],    8                     \n\t"
+            "mul.s   %[temp11],   %[temp1],        %[temp8]              \n\t"
+            "msub.s  %[temp11],   %[temp11],       %[temp0],  %[temp7]   \n\t"
+            "madd.s  %[temp11],   %[temp11],       %[temp2],  %[temp9]   \n\t"
+            "nmsub.s %[temp11],   %[temp11],       %[temp3],  %[temp10]  \n\t"
+            "add.s   %[temp11],   %[temp11],       %[temp5]              \n\t"
+            "swc1    %[temp11],   -8(%[p_x_high])                        \n\t"
+            "mul.s   %[temp12],   %[temp1],        %[temp7]              \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp0],  %[temp8]   \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp3],  %[temp9]   \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp2],  %[temp10]  \n\t"
+            "add.s   %[temp12],   %[temp12],       %[temp6]              \n\t"
+            "swc1    %[temp12],   -4(%[p_x_high])                        \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+              [temp12]"=&f"(temp12), [p_x_high]"+r"(p_x_high),
+              [p_x_low]"+r"(p_x_low)
+            : [alpha]"r"(alpha)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_g_filt_mips(float (*Y)[2], const float (*X_high)[40][2],
+                            const float *g_filt, int m_max, intptr_t ixh)
+{
+    const float *p_x, *p_g, *loop_end;
+    float *p_y;
+    float temp0, temp1, temp2;
+
+    p_g = &g_filt[0];
+    p_y = &Y[0][0];
+    p_x = &X_high[0][ixh][0];
+    loop_end = p_g + m_max;
+
+    __asm__ volatile(
+        ".set    push                                \n\t"
+        ".set    noreorder                           \n\t"
+    "1:                                              \n\t"
+        "lwc1    %[temp0],   0(%[p_g])               \n\t"
+        "lwc1    %[temp1],   0(%[p_x])               \n\t"
+        "lwc1    %[temp2],   4(%[p_x])               \n\t"
+        "mul.s   %[temp1],   %[temp1],     %[temp0]  \n\t"
+        "mul.s   %[temp2],   %[temp2],     %[temp0]  \n\t"
+        PTR_ADDIU "%[p_g],   %[p_g],       4         \n\t"
+        PTR_ADDIU "%[p_x],   %[p_x],       320       \n\t"
+        "swc1    %[temp1],   0(%[p_y])               \n\t"
+        "swc1    %[temp2],   4(%[p_y])               \n\t"
+        "bne     %[p_g],     %[loop_end],  1b        \n\t"
+        PTR_ADDIU "%[p_y],   %[p_y],       8         \n\t"
+        ".set    pop                                 \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [p_x]"+r"(p_x),
+          [p_y]"+r"(p_y), [p_g]"+r"(p_g)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+}
+
+static void sbr_hf_apply_noise_0_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    int m;
+
+    for (m = 0; m < m_max; m++){
+
+        float *Y1=&Y[m][0];
+        float *ff_table;
+        float y0,y1, temp1, temp2, temp4, temp5;
+        int temp0, temp3;
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1    %[y0],       0(%[Y1])                                    \n\t"
+            "lwc1    %[temp1],    0(%[s_m1])                                  \n\t"
+            "addiu   %[noise],    %[noise],              1                    \n\t"
+            "andi    %[noise],    %[noise],              0x1ff                \n\t"
+            "sll     %[temp0],    %[noise], 3                                 \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]             \n\t"
+            "add.s   %[y0],       %[y0],                 %[temp1]             \n\t"
+            "mfc1    %[temp3],    %[temp1]                                    \n\t"
+            "bne     %[temp3],    $0,                    1f                   \n\t"
+            "lwc1    %[y1],       4(%[Y1])                                    \n\t"
+            "lwc1    %[temp2],    0(%[q_filt1])                               \n\t"
+            "lwc1    %[temp4],    0(%[ff_table])                              \n\t"
+            "lwc1    %[temp5],    4(%[ff_table])                              \n\t"
+            "madd.s  %[y0],       %[y0],                 %[temp2],  %[temp4]  \n\t"
+            "madd.s  %[y1],       %[y1],                 %[temp2],  %[temp5]  \n\t"
+            "swc1    %[y1],       4(%[Y1])                                    \n\t"
+        "1:                                                                   \n\t"
+            "swc1    %[y0],       0(%[Y1])                                    \n\t"
+
+            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
+              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_apply_noise_1_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    float y0,y1,temp1, temp2, temp4, temp5;
+    int temp0, temp3, m;
+    float phi_sign = 1 - 2 * (kx & 1);
+
+    for (m = 0; m < m_max; m++) {
+
+        float *ff_table;
+        float *Y1=&Y[m][0];
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1   %[y1],       4(%[Y1])                                     \n\t"
+            "lwc1   %[temp1],    0(%[s_m1])                                   \n\t"
+            "lw     %[temp3],    0(%[s_m1])                                   \n\t"
+            "addiu  %[noise],    %[noise],               1                    \n\t"
+            "andi   %[noise],    %[noise],               0x1ff                \n\t"
+            "sll    %[temp0],    %[noise],               3                    \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]              \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
+            "bne    %[temp3],    $0,                    1f                    \n\t"
+            "lwc1   %[y0],       0(%[Y1])                                     \n\t"
+            "lwc1   %[temp2],    0(%[q_filt1])                                \n\t"
+            "lwc1   %[temp4],    0(%[ff_table])                               \n\t"
+            "lwc1   %[temp5],    4(%[ff_table])                               \n\t"
+            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
+            "swc1   %[y0],       0(%[Y1])                                     \n\t"
+        "1:                                                                   \n\t"
+            "swc1   %[y1],       4(%[Y1])                                     \n\t"
+
+            : [ff_table] "=&r" (ff_table), [y0] "=&f" (y0), [y1] "=&f" (y1),
+              [temp0] "=&r" (temp0), [temp1] "=&f" (temp1), [temp2] "=&f" (temp2),
+              [temp3] "=&r" (temp3), [temp4] "=&f" (temp4), [temp5] "=&f" (temp5)
+            : [ff_sbr_noise_table] "r" (ff_sbr_noise_table), [noise] "r" (noise),
+              [Y1] "r" (Y1), [s_m1] "r" (s_m1), [q_filt1] "r" (q_filt1),
+              [phi_sign] "f" (phi_sign)
+            : "memory"
+        );
+        phi_sign = -phi_sign;
+    }
+}
+
+static void sbr_hf_apply_noise_2_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    int m;
+    float *ff_table;
+    float y0,y1, temp0, temp1, temp2, temp3, temp4, temp5;
+
+    for (m = 0; m < m_max; m++) {
+
+        float *Y1=&Y[m][0];
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1   %[y0],       0(%[Y1])                                  \n\t"
+            "lwc1   %[temp1],    0(%[s_m1])                                \n\t"
+            "addiu  %[noise],    %[noise],              1                  \n\t"
+            "andi   %[noise],    %[noise],              0x1ff              \n\t"
+            "sll    %[temp0],    %[noise],              3                  \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]           \n\t"
+            "sub.s  %[y0],       %[y0],                 %[temp1]           \n\t"
+            "mfc1   %[temp3],    %[temp1]                                  \n\t"
+            "bne    %[temp3],    $0,                    1f                 \n\t"
+            "lwc1   %[y1],       4(%[Y1])                                  \n\t"
+            "lwc1   %[temp2],    0(%[q_filt1])                             \n\t"
+            "lwc1   %[temp4],    0(%[ff_table])                            \n\t"
+            "lwc1   %[temp5],    4(%[ff_table])                            \n\t"
+            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4] \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5] \n\t"
+            "swc1   %[y1],       4(%[Y1])                                  \n\t"
+        "1:                                                                \n\t"
+            "swc1   %[y0],       0(%[Y1])                                  \n\t"
+
+            : [temp0]"=&r"(temp0), [ff_table]"=&r"(ff_table), [y0]"=&f"(y0),
+              [y1]"=&f"(y1), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_apply_noise_3_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    float phi_sign = 1 - 2 * (kx & 1);
+    int m;
+
+    for (m = 0; m < m_max; m++) {
+
+        float *Y1=&Y[m][0];
+        float *ff_table;
+        float y0,y1, temp1, temp2, temp4, temp5;
+        int temp0, temp3;
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1    %[y1],       4(%[Y1])                                     \n\t"
+            "lwc1    %[temp1],    0(%[s_m1])                                   \n\t"
+            "addiu   %[noise],    %[noise],              1                     \n\t"
+            "andi    %[noise],    %[noise],              0x1ff                 \n\t"
+            "sll     %[temp0],    %[noise],              3                     \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]              \n\t"
+            "nmsub.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
+            "mfc1    %[temp3],    %[temp1]                                     \n\t"
+            "bne     %[temp3],    $0,                    1f                    \n\t"
+            "lwc1    %[y0],       0(%[Y1])                                     \n\t"
+            "lwc1    %[temp2],    0(%[q_filt1])                                \n\t"
+            "lwc1    %[temp4],    0(%[ff_table])                               \n\t"
+            "lwc1    %[temp5],    4(%[ff_table])                               \n\t"
+            "madd.s  %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
+            "madd.s  %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
+            "swc1    %[y0],       0(%[Y1])                                     \n\t"
+            "1:                                                                \n\t"
+            "swc1    %[y1],       4(%[Y1])                                     \n\t"
+
+            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
+              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1),
+              [phi_sign]"f"(phi_sign)
+            : "memory"
+        );
+       phi_sign = -phi_sign;
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_sbrdsp_init_mips(SBRDSPContext *s)
+{
+#if HAVE_INLINE_ASM
+    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_mips;
+    s->qmf_post_shuffle = sbr_qmf_post_shuffle_mips;
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->sum64x5 = sbr_sum64x5_mips;
+    s->sum_square = sbr_sum_square_mips;
+    s->qmf_deint_bfly = sbr_qmf_deint_bfly_mips;
+    s->autocorrelate = sbr_autocorrelate_mips;
+    s->hf_gen = sbr_hf_gen_mips;
+    s->hf_g_filt = sbr_hf_g_filt_mips;
+
+    s->hf_apply_noise[0] = sbr_hf_apply_noise_0_mips;
+    s->hf_apply_noise[1] = sbr_hf_apply_noise_1_mips;
+    s->hf_apply_noise[2] = sbr_hf_apply_noise_2_mips;
+    s->hf_apply_noise[3] = sbr_hf_apply_noise_3_mips;
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c
new file mode 100644
index 0000000..7f4bb74
--- /dev/null
+++ b/libavcodec/mips/simple_idct_mmi.c
@@ -0,0 +1,423 @@
+/*
+ * Loongson SIMD optimized simple idct
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "constants.h"
+#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
+
+#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+#define DC_SHIFT 3
+
+DECLARE_ALIGNED(8, const int16_t, W_arr)[46] = {
+    W4,  W2,  W4,  W6,
+    W1,  W3,  W5,  W7,
+    W4,  W6, -W4, -W2,
+    W3, -W7, -W1, -W5,
+    W4, -W6, -W4,  W2,
+    W5, -W1,  W7,  W3,
+    W4, -W2,  W4, -W6,
+    W7, -W5,  W3, -W1,
+    1024, 0,  1024, 0, //ff_p32_1024 = 0x0000040000000400ULL
+    0,   -1,  -1,  -1, //mask = 0xffffffffffff0000ULL
+    32,  32,  32,  32  //ff_p16_32 = 0x0020002000200020ULL
+};
+
+void ff_simple_idct_8_mmi(int16_t *block)
+{
+    BACKUP_REG
+    __asm__ volatile (
+
+#define IDCT_ROW_COND_DC(src1, src2)                                  \
+        "dmfc1        $11,      "#src1"                         \n\t" \
+        "dmfc1        $12,      "#src2"                         \n\t" \
+        "and          $11,       $11,       $9                  \n\t" \
+        "or           $10,       $11,       $12                 \n\t" \
+        "beqz         $10,       1f                             \n\t" \
+                                                                      \
+        "punpcklhw    $f30,     "#src1",   "#src2"              \n\t" \
+        "punpckhhw    $f31,     "#src1",   "#src2"              \n\t" \
+        /* s6, s4, s2, s0 */                                          \
+        "punpcklhw   "#src1",    $f30,      $f31                \n\t" \
+        /* s7, s5, s3, s1 */                                          \
+        "punpckhhw   "#src2",    $f30,      $f31                \n\t" \
+                                                                      \
+        "pmaddhw      $f30,     "#src1",    $f18                \n\t" \
+        "pmaddhw      $f31,     "#src2",    $f19                \n\t" \
+        "paddw        $f28,      $f30,      $f31                \n\t" \
+        "psubw        $f29,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f28,      $f29                \n\t" \
+        "punpckhwd    $f31,      $f28,      $f29                \n\t" \
+        "paddw        $f26,      $f30,      $f31                \n\t" \
+        "paddw        $f26,      $f26,      $f16                \n\t" \
+        /* $f26: src[7], src[0] */                                    \
+        "psraw        $f26,      $f26,      $f17                \n\t" \
+                                                                      \
+        "pmaddhw      $f30,     "#src1",    $f20                \n\t" \
+        "pmaddhw      $f31,     "#src2",    $f21                \n\t" \
+        "paddw        $f28,      $f30,      $f31                \n\t" \
+        "psubw        $f29,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f28,      $f29                \n\t" \
+        "punpckhwd    $f31,      $f28,      $f29                \n\t" \
+        "paddw        $f27,      $f30,      $f31                \n\t" \
+        "paddw        $f27,      $f27,      $f16                \n\t" \
+        /* $f27: src[6], src[1] */                                    \
+        "psraw        $f27,      $f27,      $f17                \n\t" \
+                                                                      \
+        "pmaddhw      $f30,     "#src1",    $f22                \n\t" \
+        "pmaddhw      $f31,     "#src2",    $f23                \n\t" \
+        "paddw        $f28,      $f30,      $f31                \n\t" \
+        "psubw        $f29,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f28,      $f29                \n\t" \
+        "punpckhwd    $f31,      $f28,      $f29                \n\t" \
+        "paddw        $f28,      $f30,      $f31                \n\t" \
+        "paddw        $f28,      $f28,      $f16                \n\t" \
+        /* $f28: src[5], src[2] */                                    \
+        "psraw        $f28,      $f28,      $f17                \n\t" \
+                                                                      \
+        "pmaddhw      $f30,     "#src1",    $f24                \n\t" \
+        "pmaddhw      $f31,     "#src2",    $f25                \n\t" \
+        "paddw       "#src1",    $f30,      $f31                \n\t" \
+        "psubw       "#src2",    $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,     "#src1",   "#src2"              \n\t" \
+        "punpckhwd    $f31,     "#src1",   "#src2"              \n\t" \
+        "paddw        $f29,      $f30,      $f31                \n\t" \
+        "paddw        $f29,      $f29,      $f16                \n\t" \
+        /* $f29: src[4], src[3] */                                    \
+        "psraw        $f29,      $f29,      $f17                \n\t" \
+                                                                      \
+        "punpcklhw   "#src1",    $f26,      $f27                \n\t" \
+        "punpckhhw    $f30,      $f27,      $f26                \n\t" \
+        "punpcklhw    $f31,      $f28,      $f29                \n\t" \
+        "punpckhhw   "#src2",    $f29,      $f28                \n\t" \
+        /* src[3], src[2], src[1], src[0] */                          \
+        "punpcklwd   "#src1",   "#src1",    $f31                \n\t" \
+        /* src[7], src[6], src[5], src[4] */                          \
+        "punpcklwd   "#src2",   "#src2",    $f30                \n\t" \
+        "j                       2f                             \n\t" \
+                                                                      \
+        "1:                                                     \n\t" \
+        "li           $10,       3                              \n\t" \
+        "dmtc1        $10,       $f30                           \n\t" \
+        "psllh        $f28,     "#src1",    $f30                \n\t" \
+        "dmtc1        $9,        $f31                           \n\t" \
+        "punpcklhw    $f29,      $f28,      $f28                \n\t" \
+        "and          $f29,      $f29,      $f31                \n\t" \
+        "paddw        $f28,      $f28,      $f29                \n\t" \
+        "punpcklwd   "#src1",    $f28,      $f28                \n\t" \
+        "punpcklwd   "#src2",    $f28,      $f28                \n\t" \
+        "2:                                                     \n\t" \
+
+        /* idctRowCondDC row0~8 */
+
+        /* load W */
+        "gslqc1       $f19,      $f18,      0x00(%[w_arr])      \n\t"
+        "gslqc1       $f21,      $f20,      0x10(%[w_arr])      \n\t"
+        "gslqc1       $f23,      $f22,      0x20(%[w_arr])      \n\t"
+        "gslqc1       $f25,      $f24,      0x30(%[w_arr])      \n\t"
+        "gslqc1       $f17,      $f16,      0x40(%[w_arr])      \n\t"
+        /* load source in block */
+        "gslqc1       $f1,       $f0,       0x00(%[block])      \n\t"
+        "gslqc1       $f3,       $f2,       0x10(%[block])      \n\t"
+        "gslqc1       $f5,       $f4,       0x20(%[block])      \n\t"
+        "gslqc1       $f7,       $f6,       0x30(%[block])      \n\t"
+        "gslqc1       $f9,       $f8,       0x40(%[block])      \n\t"
+        "gslqc1       $f11,      $f10,      0x50(%[block])      \n\t"
+        "gslqc1       $f13,      $f12,      0x60(%[block])      \n\t"
+        "gslqc1       $f15,      $f14,      0x70(%[block])      \n\t"
+
+        /* $9: mask ; $f17: ROW_SHIFT */
+        "dmfc1        $9,        $f17                           \n\t"
+        "li           $10,       11                             \n\t"
+        "mtc1         $10,       $f17                           \n\t"
+        IDCT_ROW_COND_DC($f0,$f1)
+        IDCT_ROW_COND_DC($f2,$f3)
+        IDCT_ROW_COND_DC($f4,$f5)
+        IDCT_ROW_COND_DC($f6,$f7)
+        IDCT_ROW_COND_DC($f8,$f9)
+        IDCT_ROW_COND_DC($f10,$f11)
+        IDCT_ROW_COND_DC($f12,$f13)
+        IDCT_ROW_COND_DC($f14,$f15)
+
+#define IDCT_COL_CASE1(src, out1, out2)                               \
+        "pmaddhw      $f26,     "#src",     $f18                \n\t" \
+        "pmaddhw      $f27,     "#src",     $f20                \n\t" \
+        "pmaddhw      $f28,     "#src",     $f22                \n\t" \
+        "pmaddhw      $f29,     "#src",     $f24                \n\t" \
+                                                                      \
+        "punpcklwd    $f30,      $f26,      $f26                \n\t" \
+        "punpckhwd    $f31,      $f26,      $f26                \n\t" \
+        /* $f26: src[0], src[56] */                                   \
+        "paddw        $f26,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f27,      $f27                \n\t" \
+        "punpckhwd    $f31,      $f27,      $f27                \n\t" \
+        /* $f27: src[8], src[48] */                                   \
+        "paddw        $f27,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f28,      $f28                \n\t" \
+        "punpckhwd    $f31,      $f28,      $f28                \n\t" \
+        /* $f28: src[16], src[40] */                                  \
+        "paddw        $f28,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f29,      $f29                \n\t" \
+        "punpckhwd    $f31,      $f29,      $f29                \n\t" \
+        /* $f29: src[24], src[32] */                                  \
+        "paddw        $f29,      $f30,      $f31                \n\t" \
+                                                                      \
+        /* out1: src[24], src[16], src[8], src[0] */                  \
+        /* out2: src[56], src[48], src[40], src[32] */                \
+        "punpcklhw    $f30,      $f26,      $f27                \n\t" \
+        "punpcklhw    $f31,      $f28,      $f29                \n\t" \
+        "punpckhwd   "#out1",    $f30,      $f31                \n\t" \
+        "psrah       "#out1",   "#out1",    $f16                \n\t" \
+        "punpcklhw    $f30,      $f27,      $f26                \n\t" \
+        "punpcklhw    $f31,      $f29,      $f28                \n\t" \
+        "punpckhwd   "#out2",    $f31,      $f30                \n\t" \
+        "psrah       "#out2",   "#out2",    $f16                \n\t"
+
+#define IDCT_COL_CASE2(src1, src2, out1, out2)                        \
+        "pmaddhw      $f28,     "#src1",    $f18                \n\t" \
+        "pmaddhw      $f29,     "#src2",    $f19                \n\t" \
+        "paddw        $f30,      $f28,      $f29                \n\t" \
+        "psubw        $f31,      $f28,      $f29                \n\t" \
+        "punpcklwd    $f28,      $f30,      $f31                \n\t" \
+        "punpckhwd    $f29,      $f30,      $f31                \n\t" \
+        "pmaddhw      $f30,     "#src1",    $f20                \n\t" \
+        "pmaddhw      $f31,     "#src2",    $f21                \n\t" \
+        /* $f26: src[0], src[56] */                                   \
+        "paddw        $f26,      $f28,      $f29                \n\t" \
+        "paddw        $f28,      $f30,      $f31                \n\t" \
+        "psubw        $f29,      $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,      $f28,      $f29                \n\t" \
+        "punpckhwd    $f31,      $f28,      $f29                \n\t" \
+        "pmaddhw      $f28,     "#src1",    $f22                \n\t" \
+        "pmaddhw      $f29,     "#src2",    $f23                \n\t" \
+        /* $f27: src[8], src[48] */                                   \
+        "paddw        $f27,      $f30,      $f31                \n\t" \
+        "paddw        $f30,      $f28,      $f29                \n\t" \
+        "psubw        $f31,      $f28,      $f29                \n\t" \
+        "punpcklwd    $f28,      $f30,      $f31                \n\t" \
+        "punpckhwd    $f29,      $f30,      $f31                \n\t" \
+        "pmaddhw      $f30,     "#src1",    $f24                \n\t" \
+        "pmaddhw      $f31,     "#src2",    $f25                \n\t" \
+        /* $f28: src[16], src[40] */                                  \
+        "paddw        $f28,      $f28,      $f29                \n\t" \
+        "paddw       "#out1",    $f30,      $f31                \n\t" \
+        "psubw       "#out2",    $f30,      $f31                \n\t" \
+        "punpcklwd    $f30,     "#out1",   "#out2"              \n\t" \
+        "punpckhwd    $f31,     "#out1",   "#out2"              \n\t" \
+        /* $f29: src[24], src[32] */                                  \
+        "paddw        $f29,      $f30,      $f31                \n\t" \
+                                                                      \
+        /* out1: src[24], src[16], src[8], src[0] */                  \
+        /* out2: src[56], src[48], src[40], src[32] */                \
+        "punpcklhw   "#out1",    $f26,      $f27                \n\t" \
+        "punpckhhw   "#out2",    $f27,      $f26                \n\t" \
+        "punpcklhw    $f30,      $f28,      $f29                \n\t" \
+        "punpckhhw    $f31,      $f29,      $f28                \n\t" \
+        "punpckhwd   "#out1",   "#out1",    $f30                \n\t" \
+        "punpckhwd   "#out2",    $f31,     "#out2"              \n\t" \
+        "psrah       "#out1",   "#out1",    $f16                \n\t" \
+        "psrah       "#out2",   "#out2",    $f16                \n\t"
+
+
+        /* idctSparseCol col0~3 */
+
+        /* $f17: ff_p16_32; $f16: COL_SHIFT-16 */
+        "gsldlc1      $f17,      0x57(%[w_arr])                 \n\t"
+        "gsldrc1      $f17,      0x50(%[w_arr])                 \n\t"
+        "li           $10,       4                              \n\t"
+        "dmtc1        $10,       $f16                           \n\t"
+        "paddh        $f0,       $f0,       $f17                \n\t"
+        /* Transpose row[0,2,4,6] */
+        "punpcklhw    $f26,      $f0,       $f4                 \n\t"
+        "punpckhhw    $f27,      $f0,       $f4                 \n\t"
+        "punpcklhw    $f28,      $f8,       $f12                \n\t"
+        "punpckhhw    $f29,      $f8,       $f12                \n\t"
+        "punpcklwd    $f0,       $f26,      $f28                \n\t"
+        "punpckhwd    $f4,       $f26,      $f28                \n\t"
+        "punpcklwd    $f8,       $f27,      $f29                \n\t"
+        "punpckhwd    $f12,      $f27,      $f29                \n\t"
+
+        "or           $f26,      $f2,       $f6                 \n\t"
+        "or           $f26,      $f26,      $f10                \n\t"
+        "or           $f26,      $f26,      $f14                \n\t"
+        "dmfc1        $10,       $f26                           \n\t"
+        "bnez         $10,       1f                             \n\t"
+        /* case1: In this case, row[1,3,5,7] are all zero */
+        /* col0: $f0: col[24,16,8,0]; $f2: col[56,48,40,32] */
+        IDCT_COL_CASE1($f0, $f0, $f2)
+        /* col1: $f4: col[25,17,9,1]; $f6: col[57,49,41,33] */
+        IDCT_COL_CASE1($f4, $f4, $f6)
+        /* col2: $f8: col[26,18,10,2]; $f10: col[58,50,42,34] */
+        IDCT_COL_CASE1($f8, $f8, $f10)
+        /* col3: $f12: col[27,19,11,3]; $f14: col[59,51,43,35] */
+        IDCT_COL_CASE1($f12, $f12, $f14)
+        "j                                  2f                  \n\t"
+
+        "1:                                                     \n\t"
+        /* case2: row[1,3,5,7] are not all zero */
+        /* Transpose */
+        "punpcklhw    $f26,      $f2,       $f6                 \n\t"
+        "punpckhhw    $f27,      $f2,       $f6                 \n\t"
+        "punpcklhw    $f28,      $f10,      $f14                \n\t"
+        "punpckhhw    $f29,      $f10,      $f14                \n\t"
+        "punpcklwd    $f2,       $f26,      $f28                \n\t"
+        "punpckhwd    $f6,       $f26,      $f28                \n\t"
+        "punpcklwd    $f10,      $f27,      $f29                \n\t"
+        "punpckhwd    $f14,      $f27,      $f29                \n\t"
+
+        /* col0: $f0: col[24,16,8,0]; $f2: col[56,48,40,32] */
+        IDCT_COL_CASE2($f0, $f2, $f0, $f2)
+        /* col1: $f4: col[25,17,9,1]; $f6: col[57,49,41,33] */
+        IDCT_COL_CASE2($f4, $f6, $f4, $f6)
+        /* col2: $f8: col[26,18,10,2]; $f10: col[58,50,42,34] */
+        IDCT_COL_CASE2($f8, $f10, $f8, $f10)
+        /* col3: $f12: col[27,19,11,3]; $f14: col[59,51,43,35] */
+        IDCT_COL_CASE2($f12, $f14, $f12, $f14)
+
+        "2:                                                     \n\t"
+        /* Transpose */
+        "punpcklhw    $f26,      $f0,       $f4                 \n\t"
+        "punpckhhw    $f27,      $f0,       $f4                 \n\t"
+        "punpcklhw    $f28,      $f8,       $f12                \n\t"
+        "punpckhhw    $f29,      $f8,       $f12                \n\t"
+        "punpcklwd    $f0,       $f26,      $f28                \n\t"
+        "punpckhwd    $f4,       $f26,      $f28                \n\t"
+        "punpcklwd    $f8,       $f27,      $f29                \n\t"
+        "punpckhwd    $f12,      $f27,      $f29                \n\t"
+        /* Transpose */
+        "punpcklhw    $f26,      $f2,       $f6                 \n\t"
+        "punpckhhw    $f27,      $f2,       $f6                 \n\t"
+        "punpcklhw    $f28,      $f10,      $f14                \n\t"
+        "punpckhhw    $f29,      $f10,      $f14                \n\t"
+        "punpcklwd    $f2,       $f26,      $f28                \n\t"
+        "punpckhwd    $f6,       $f26,      $f28                \n\t"
+        "punpcklwd    $f10,      $f27,      $f29                \n\t"
+        "punpckhwd    $f14,      $f27,      $f29                \n\t"
+
+        /* idctSparseCol col4~7 */
+
+        "paddh        $f1,       $f1,       $f17                \n\t"
+        /* Transpose */
+        "punpcklhw    $f26,      $f1,       $f5                 \n\t"
+        "punpckhhw    $f27,      $f1,       $f5                 \n\t"
+        "punpcklhw    $f28,      $f9,       $f13                \n\t"
+        "punpckhhw    $f29,      $f9,       $f13                \n\t"
+        "punpcklwd    $f1,       $f26,      $f28                \n\t"
+        "punpckhwd    $f5,       $f26,      $f28                \n\t"
+        "punpcklwd    $f9,       $f27,      $f29                \n\t"
+        "punpckhwd    $f13,      $f27,      $f29                \n\t"
+
+        "or           $f26,      $f3,       $f7                 \n\t"
+        "or           $f26,      $f26,      $f11                \n\t"
+        "or           $f26,      $f26,      $f15                \n\t"
+        "dmfc1        $10,       $f26                           \n\t"
+        "bnez         $10,       1f                             \n\t"
+        /* case1: In this case, row[1,3,5,7] are all zero */
+        /* col4: $f1: col[24,16,8,0]; $f3: col[56,48,40,32] */
+        IDCT_COL_CASE1($f1, $f1, $f3)
+        /* col5: $f5: col[25,17,9,1]; $f7: col[57,49,41,33] */
+        IDCT_COL_CASE1($f5, $f5, $f7)
+        /* col6: $f9: col[26,18,10,2]; $f11: col[58,50,42,34] */
+        IDCT_COL_CASE1($f9, $f9, $f11)
+        /* col7: $f13: col[27,19,11,3]; $f15: col[59,51,43,35] */
+        IDCT_COL_CASE1($f13, $f13, $f15)
+        "j                                  2f                  \n\t"
+
+        "1:                                                     \n\t"
+        /* case2: row[1,3,5,7] are not all zero */
+        /* Transpose */
+        "punpcklhw    $f26,      $f3,       $f7                 \n\t"
+        "punpckhhw    $f27,      $f3,       $f7                 \n\t"
+        "punpcklhw    $f28,      $f11,      $f15                \n\t"
+        "punpckhhw    $f29,      $f11,      $f15                \n\t"
+        "punpcklwd    $f3,       $f26,      $f28                \n\t"
+        "punpckhwd    $f7,       $f26,      $f28                \n\t"
+        "punpcklwd    $f11,      $f27,      $f29                \n\t"
+        "punpckhwd    $f15,      $f27,      $f29                \n\t"
+
+        /* col4: $f1: col[24,16,8,0]; $f3: col[56,48,40,32] */
+        IDCT_COL_CASE2($f1, $f3, $f1, $f3)
+        /* col5: $f5: col[25,17,9,1]; $f7: col[57,49,41,33] */
+        IDCT_COL_CASE2($f5, $f7, $f5, $f7)
+        /* col6: $f9: col[26,18,10,2]; $f11: col[58,50,42,34] */
+        IDCT_COL_CASE2($f9, $f11, $f9, $f11)
+        /* col7: $f13: col[27,19,11,3]; $f15: col[59,51,43,35] */
+        IDCT_COL_CASE2($f13, $f15, $f13, $f15)
+
+        "2:                                                     \n\t"
+        /* Transpose */
+        "punpcklhw    $f26,      $f1,       $f5                 \n\t"
+        "punpckhhw    $f27,      $f1,       $f5                 \n\t"
+        "punpcklhw    $f28,      $f9,       $f13                \n\t"
+        "punpckhhw    $f29,      $f9,       $f13                \n\t"
+        "punpcklwd    $f1,       $f26,      $f28                \n\t"
+        "punpckhwd    $f5,       $f26,      $f28                \n\t"
+        "punpcklwd    $f9,       $f27,      $f29                \n\t"
+        "punpckhwd    $f13,      $f27,      $f29                \n\t"
+        /* Transpose */
+        "punpcklhw    $f26,      $f3,       $f7                 \n\t"
+        "punpckhhw    $f27,      $f3,       $f7                 \n\t"
+        "punpcklhw    $f28,      $f11,      $f15                \n\t"
+        "punpckhhw    $f29,      $f11,      $f15                \n\t"
+        "punpcklwd    $f3,       $f26,      $f28                \n\t"
+        "punpckhwd    $f7,       $f26,      $f28                \n\t"
+        "punpcklwd    $f11,      $f27,      $f29                \n\t"
+        "punpckhwd    $f15,      $f27,      $f29                \n\t"
+        /* Store */
+        "gssqc1       $f1,       $f0,       0x00(%[block])      \n\t"
+        "gssqc1       $f5,       $f4,       0x10(%[block])      \n\t"
+        "gssqc1       $f9,       $f8,       0x20(%[block])      \n\t"
+        "gssqc1       $f13,      $f12,      0x30(%[block])      \n\t"
+        "gssqc1       $f3,       $f2,       0x40(%[block])      \n\t"
+        "gssqc1       $f7,       $f6,       0x50(%[block])      \n\t"
+        "gssqc1       $f11,      $f10,      0x60(%[block])      \n\t"
+        "gssqc1       $f15,      $f14,      0x70(%[block])      \n\t"
+
+        : [block]"+&r"(block)
+        : [w_arr]"r"(W_arr)
+        : "memory"
+    );
+
+    RECOVER_REG
+}
+
+void ff_simple_idct_put_8_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_simple_idct_8_mmi(block);
+    ff_put_pixels_clamped_mmi(block, dest, line_size);
+}
+void ff_simple_idct_add_8_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_simple_idct_8_mmi(block);
+    ff_add_pixels_clamped_mmi(block, dest, line_size);
+}
diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c
new file mode 100644
index 0000000..8a72359
--- /dev/null
+++ b/libavcodec/mips/simple_idct_msa.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "idctdsp_mips.h"
+
+static void simple_idct_msa(int16_t *block)
+{
+    int32_t const_val;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    const_val0 <<= 10;
+    const_val = 16383 * ((1 << 19) / 16383);
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    temp = in0 << 3;
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp1_r, temp1_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp2_r, temp2_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
+         a1_r, a1_l, a2_r, a2_l);
+    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
+         a3_r, a3_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp1_r, temp1_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp2_r, temp2_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
+         a1_r, a1_l, a2_r, a2_l);
+    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
+         a3_r, a3_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
+           block, 8);
+}
+
+static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
+                                int16_t *block)
+{
+    int32_t const_val;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    const_val0 <<= 10;
+    const_val = 16383 * ((1 << 19) / 16383);
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    temp = in0 << 3;
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
+    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
+    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
+    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
+    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
+                temp2_r, temp2_r, temp3_r, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += 4 * dst_stride;
+    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
+    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
+    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
+    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
+    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
+                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
+    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
+    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += 4 * dst_stride;
+}
+
+static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
+                                int16_t *block)
+{
+    int32_t const_val;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+
+    const_val0 <<= 10;
+    const_val = 16383 * ((1 << 19) / 16383);
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0);
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    UNPCK_SH_SW(in4, temp4_r, temp4_l);
+    UNPCK_SH_SW(in6, temp7_r, temp7_l);
+    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
+    temp = in0 << 3;
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
+    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
+    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
+    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
+    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
+    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
+    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
+    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
+    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
+    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
+    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
+               temp0_l, temp1_l, temp2_l, temp3_l);
+    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
+    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
+    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
+    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
+    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
+    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
+    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
+    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
+    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
+                temp2_r, temp2_r, temp3_r, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
+               a3_l, a2_l, a1_l, a0_l);
+    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
+    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
+    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
+    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
+    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
+    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
+    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
+    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
+    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
+                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
+    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_simple_idct_msa(int16_t *block)
+{
+    simple_idct_msa(block);
+}
+
+void ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
+{
+    simple_idct_put_msa(dst, dst_stride, block);
+}
+
+void ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
+{
+    simple_idct_add_msa(dst, dst_stride, block);
+}
diff --git a/libavcodec/mips/vc1dsp_init_mips.c b/libavcodec/mips/vc1dsp_init_mips.c
new file mode 100644
index 0000000..4adc9e1
--- /dev/null
+++ b/libavcodec/mips/vc1dsp_init_mips.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/vc1dsp.h"
+#include "vc1dsp_mips.h"
+#include "config.h"
+
+#if HAVE_MMI
+static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
+{
+#if _MIPS_SIM != _ABIO32
+    dsp->vc1_inv_trans_8x8    = ff_vc1_inv_trans_8x8_mmi;
+    dsp->vc1_inv_trans_4x8    = ff_vc1_inv_trans_4x8_mmi;
+    dsp->vc1_inv_trans_8x4    = ff_vc1_inv_trans_8x4_mmi;
+#endif
+    dsp->vc1_inv_trans_4x4    = ff_vc1_inv_trans_4x4_mmi;
+    dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmi;
+    dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmi;
+    dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmi;
+    dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmi;
+
+    dsp->vc1_h_overlap        = ff_vc1_h_overlap_mmi;
+    dsp->vc1_v_overlap        = ff_vc1_v_overlap_mmi;
+    dsp->vc1_h_s_overlap      = ff_vc1_h_s_overlap_mmi;
+    dsp->vc1_v_s_overlap      = ff_vc1_v_s_overlap_mmi;
+
+    dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_mmi;
+    dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_mmi;
+    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_mmi;
+    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_mmi;
+    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi;
+    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi;
+
+#define FN_ASSIGN(OP, X, Y, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \
+    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
+
+    FN_ASSIGN(put_, 0, 0, _mmi);
+    FN_ASSIGN(put_, 0, 1, _mmi);
+    FN_ASSIGN(put_, 0, 2, _mmi);
+    FN_ASSIGN(put_, 0, 3, _mmi);
+
+    FN_ASSIGN(put_, 1, 0, _mmi);
+    //FN_ASSIGN(put_, 1, 1, _mmi);//FIXME
+    //FN_ASSIGN(put_, 1, 2, _mmi);//FIXME
+    //FN_ASSIGN(put_, 1, 3, _mmi);//FIXME
+
+    FN_ASSIGN(put_, 2, 0, _mmi);
+    //FN_ASSIGN(put_, 2, 1, _mmi);//FIXME
+    //FN_ASSIGN(put_, 2, 2, _mmi);//FIXME
+    //FN_ASSIGN(put_, 2, 3, _mmi);//FIXME
+
+    FN_ASSIGN(put_, 3, 0, _mmi);
+    //FN_ASSIGN(put_, 3, 1, _mmi);//FIXME
+    //FN_ASSIGN(put_, 3, 2, _mmi);//FIXME
+    //FN_ASSIGN(put_, 3, 3, _mmi);//FIXME
+
+    FN_ASSIGN(avg_, 0, 0, _mmi);
+    FN_ASSIGN(avg_, 0, 1, _mmi);
+    FN_ASSIGN(avg_, 0, 2, _mmi);
+    FN_ASSIGN(avg_, 0, 3, _mmi);
+
+    FN_ASSIGN(avg_, 1, 0, _mmi);
+    //FN_ASSIGN(avg_, 1, 1, _mmi);//FIXME
+    //FN_ASSIGN(avg_, 1, 2, _mmi);//FIXME
+    //FN_ASSIGN(avg_, 1, 3, _mmi);//FIXME
+
+    FN_ASSIGN(avg_, 2, 0, _mmi);
+    //FN_ASSIGN(avg_, 2, 1, _mmi);//FIXME
+    //FN_ASSIGN(avg_, 2, 2, _mmi);//FIXME
+    //FN_ASSIGN(avg_, 2, 3, _mmi);//FIXME
+
+    FN_ASSIGN(avg_, 3, 0, _mmi);
+    //FN_ASSIGN(avg_, 3, 1, _mmi);//FIXME
+    //FN_ASSIGN(avg_, 3, 2, _mmi);//FIXME
+    //FN_ASSIGN(avg_, 3, 3, _mmi);//FIXME
+
+    dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_no_rnd_vc1_chroma_mc8_mmi;
+    dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_no_rnd_vc1_chroma_mc8_mmi;
+    dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_no_rnd_vc1_chroma_mc4_mmi;
+    dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_no_rnd_vc1_chroma_mc4_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp)
+{
+#if HAVE_MMI
+    vc1dsp_init_mmi(dsp);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h
new file mode 100644
index 0000000..0db85fa
--- /dev/null
+++ b/libavcodec/mips/vc1dsp_mips.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VC1DSP_MIPS_H
+#define AVCODEC_MIPS_VC1DSP_MIPS_H
+
+#include "libavcodec/vc1dsp.h"
+
+void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc01_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc02_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc03_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc10_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc11_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc12_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc13_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc20_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc21_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc22_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc23_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc30_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc31_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc32_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc33_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+
+void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc01_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc02_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc03_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc10_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc11_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc12_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc13_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc20_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc21_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc22_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc23_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc30_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc31_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc32_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc33_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd);
+
+
+void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc01_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc02_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc03_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc10_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc11_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc12_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc13_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc20_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc21_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc22_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc23_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc30_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc31_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc32_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_put_vc1_mspel_mc33_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+
+void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc01_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc02_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc03_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc10_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc11_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc12_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc13_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc20_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc21_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc22_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc23_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc30_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc31_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc32_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+void ff_avg_vc1_mspel_mc33_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd);
+
+void ff_vc1_inv_trans_8x8_mmi(int16_t block[64]);
+void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+
+void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
+
+void ff_vc1_v_overlap_mmi(uint8_t *src, int stride);
+void ff_vc1_h_overlap_mmi(uint8_t *src, int stride);
+void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom);
+void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags);
+
+void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq);
+
+void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y);
+void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y);
+void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y);
+void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y);
+
+#endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */
diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c
new file mode 100644
index 0000000..db314de
--- /dev/null
+++ b/libavcodec/mips/vc1dsp_mmi.c
@@ -0,0 +1,2462 @@
+/*
+ * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
+ *
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavcodec/vc1dsp.h"
+#include "constants.h"
+#include "vc1dsp_mips.h"
+#include "hpeldsp_mips.h"
+#include "libavutil/mips/mmiutils.h"
+
+#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
+        "li         %[tmp0],    "#r1"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
+        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
+        "li         %[tmp0],    "#r2"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
+        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
+        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
+                                                                            \
+        "li         %[tmp0],    "#r3"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
+        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
+        "li         %[tmp0],    "#r4"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
+        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
+        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
+        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
+                                                                            \
+        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
+        "paddw      %[ftmp2],   %[ftmp2],   "#c0"                     \n\t" \
+        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
+        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
+        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
+        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
+        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
+        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
+        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
+        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
+        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
+        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
+        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
+        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
+        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
+        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
+
+#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
+        "li         %[tmp0],    "#r1"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
+        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
+        "li         %[tmp0],    "#r2"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
+        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
+        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
+                                                                            \
+        "li         %[tmp0],    "#r3"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
+        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
+        "li         %[tmp0],    "#r4"                                 \n\t" \
+        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
+        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
+        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
+        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
+        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
+        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
+                                                                            \
+        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
+        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
+        "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
+        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
+        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
+        "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
+        "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
+        "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
+        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
+        "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
+        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
+        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
+        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
+        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
+        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
+        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
+        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
+        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
+        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
+        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
+
+/* Do inverse transform on 8x8 block */
+void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int dc = block[0];
+    double ftmp[9];
+    mips_reg addr[1];
+    int count;
+
+    dc = (3 * dc +  1) >> 1;
+    dc = (3 * dc + 16) >> 5;
+
+    __asm__ volatile(
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
+        "li         %[count],   0x02                                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_LDC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
+        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
+        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
+        MMI_LDC1(%[ftmp4], %[addr0], 0x00)
+
+        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+
+        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
+        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
+
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+
+        MMI_SDC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
+        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
+        MMI_SDC1(%[ftmp4], %[addr0], 0x00)
+
+        "addiu      %[count],   %[count],       -0x01                   \n\t"
+        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
+        "bnez       %[count],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),
+          [count]"=&r"(count),          [dest]"+&r"(dest)
+        : [linesize]"r"((mips_reg)linesize),
+          [dc]"f"(dc)
+        : "memory"
+    );
+}
+
+#if _MIPS_SIM != _ABIO32
+void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
+{
+    DECLARE_ALIGNED(16, int16_t, temp[64]);
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
+    double ftmp[23];
+    uint64_t tmp[1];
+
+    __asm__ volatile (
+        /* 1st loop: start */
+        "li         %[tmp0],    0x03                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+
+       // 1st part
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp11], %[block], 0x10)
+        MMI_LDC1(%[ftmp2], %[block], 0x20)
+        MMI_LDC1(%[ftmp12], %[block], 0x30)
+        MMI_LDC1(%[ftmp3], %[block], 0x40)
+        MMI_LDC1(%[ftmp13], %[block], 0x50)
+        MMI_LDC1(%[ftmp4], %[block], 0x60)
+        MMI_LDC1(%[ftmp14], %[block], 0x70)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        "punpcklhw  %[ftmp9],  %[ftmp11],  %[ftmp12]                    \n\t"
+        "punpckhhw  %[ftmp10], %[ftmp11],  %[ftmp12]                    \n\t"
+        "punpcklhw  %[ftmp11], %[ftmp13],  %[ftmp14]                    \n\t"
+        "punpckhhw  %[ftmp12], %[ftmp13],  %[ftmp14]                    \n\t"
+
+        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
+                               0x000f0010, 0x00040009, %[ff_pw_4])
+
+        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
+                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])
+
+        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
+                               0xfff00009, 0x000f0004, %[ff_pw_4])
+
+        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
+                               0xfff70004, 0xfff0000f, %[ff_pw_4])
+
+        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
+
+        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
+
+        MMI_SDC1(%[ftmp15], %[temp], 0x00)
+        MMI_SDC1(%[ftmp19], %[temp], 0x08)
+        MMI_SDC1(%[ftmp16], %[temp], 0x10)
+        MMI_SDC1(%[ftmp20], %[temp], 0x18)
+        MMI_SDC1(%[ftmp17], %[temp], 0x20)
+        MMI_SDC1(%[ftmp21], %[temp], 0x28)
+        MMI_SDC1(%[ftmp18], %[temp], 0x30)
+        MMI_SDC1(%[ftmp22], %[temp], 0x38)
+
+       // 2nd part
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        MMI_LDC1(%[ftmp11], %[block], 0x18)
+        MMI_LDC1(%[ftmp2], %[block], 0x28)
+        MMI_LDC1(%[ftmp12], %[block], 0x38)
+        MMI_LDC1(%[ftmp3], %[block], 0x48)
+        MMI_LDC1(%[ftmp13], %[block], 0x58)
+        MMI_LDC1(%[ftmp4], %[block], 0x68)
+        MMI_LDC1(%[ftmp14], %[block], 0x78)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
+        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"
+
+        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
+                               0x000f0010, 0x00040009, %[ff_pw_4])
+
+        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
+                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])
+
+        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
+                               0xfff00009, 0x000f0004, %[ff_pw_4])
+
+        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
+        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
+                               0xfff70004, 0xfff0000f, %[ff_pw_4])
+
+        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
+
+        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
+
+        MMI_SDC1(%[ftmp19], %[temp], 0x48)
+        MMI_SDC1(%[ftmp20], %[temp], 0x58)
+        MMI_SDC1(%[ftmp21], %[temp], 0x68)
+        MMI_SDC1(%[ftmp22], %[temp], 0x78)
+        /* 1st loop: end */
+
+        /* 2nd loop: start */
+        "li         %[tmp0],    0x07                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+
+        // 1st part
+        MMI_LDC1(%[ftmp1], %[temp], 0x00)
+        MMI_LDC1(%[ftmp11], %[temp], 0x10)
+        MMI_LDC1(%[ftmp2], %[temp], 0x20)
+        MMI_LDC1(%[ftmp12], %[temp], 0x30)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]                   \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]                   \n\t"
+
+        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]                   \n\t"
+        "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]                   \n\t"
+
+        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
+                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
+                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
+                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
+                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
+
+        MMI_SDC1(%[ftmp15], %[block], 0x00)
+        MMI_SDC1(%[ftmp16], %[block], 0x10)
+        MMI_SDC1(%[ftmp17], %[block], 0x20)
+        MMI_SDC1(%[ftmp18], %[block], 0x30)
+        MMI_SDC1(%[ftmp19], %[block], 0x40)
+        MMI_SDC1(%[ftmp20], %[block], 0x50)
+        MMI_SDC1(%[ftmp21], %[block], 0x60)
+        MMI_SDC1(%[ftmp22], %[block], 0x70)
+
+       // 2nd part
+        MMI_LDC1(%[ftmp1], %[temp], 0x08)
+        MMI_LDC1(%[ftmp11], %[temp], 0x18)
+        MMI_LDC1(%[ftmp2], %[temp], 0x28)
+        MMI_LDC1(%[ftmp12], %[temp], 0x38)
+        MMI_LDC1(%[ftmp3], %[temp], 0x48)
+        MMI_LDC1(%[ftmp13], %[temp], 0x58)
+        MMI_LDC1(%[ftmp4], %[temp], 0x68)
+        MMI_LDC1(%[ftmp14], %[temp], 0x78)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
+        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"
+
+        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
+                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
+                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
+                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
+                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
+
+        MMI_SDC1(%[ftmp15], %[block], 0x08)
+        MMI_SDC1(%[ftmp16], %[block], 0x18)
+        MMI_SDC1(%[ftmp17], %[block], 0x28)
+        MMI_SDC1(%[ftmp18], %[block], 0x38)
+        MMI_SDC1(%[ftmp19], %[block], 0x48)
+        MMI_SDC1(%[ftmp20], %[block], 0x58)
+        MMI_SDC1(%[ftmp21], %[block], 0x68)
+        MMI_SDC1(%[ftmp22], %[block], 0x78)
+        /* 2nd loop: end */
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
+          [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
+          [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
+          [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
+          [ftmp22]"=&f"(ftmp[22]),
+          [tmp0]"=&r"(tmp[0])
+        : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
+          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
+          [temp]"r"(temp)
+        : "memory"
+    );
+}
+#endif
+
+/* Do inverse transform on 8x4 part of block */
+void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int dc = block[0];
+    double ftmp[9];
+
+    dc = ( 3 * dc +  1) >> 1;
+    dc = (17 * dc + 64) >> 7;
+
+    __asm__ volatile(
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
+
+        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
+        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
+        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
+        MMI_LDC1(%[ftmp4], %[dest3], 0x00)
+
+        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+
+        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
+        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
+
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+
+        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
+        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
+        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
+        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8])
+        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
+          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
+          [dc]"f"(dc)
+        : "memory"
+    );
+}
+
+#if _MIPS_SIM != _ABIO32
+void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int16_t *src = block;
+    int16_t *dst = block;
+    double ftmp[16];
+    uint32_t tmp[1];
+    int16_t count = 4;
+    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
+    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
+    int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
+                         12, 15,   6,  -4, -12, -16, -16,  -9,
+                         12,  9,  -6, -16, -12,   4,  16,  15,
+                         12,  4, -16,  -9,  12,  15,  -6, -16,
+                         12, -4, -16,   9,  12, -15,  -6,  16,
+                         12, -9,  -6,  16, -12,  -4,  16, -15,
+                         12, -15,  6,   4, -12,  16, -16,   9,
+                         12, -16, 16, -15,  12,  -9,   6,  -4};
+
+    // 1st loop
+    __asm__ volatile (
+        "li         %[tmp0],    0x03                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+
+        "1:                                                             \n\t"
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x08)
+
+        /* ftmp11: dst1,dst0 */
+        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
+        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
+        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
+        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"
+
+        /* ftmp12: dst3,dst2 */
+        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
+        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
+        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
+        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"
+
+        /* ftmp13: dst5,dst4 */
+        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
+        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
+        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
+        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"
+
+        /* ftmp14: dst7,dst6 */
+        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
+        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
+        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
+        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
+        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
+        "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"
+
+        /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
+        "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
+        "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
+        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
+        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
+        "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
+        "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
+        MMI_SDC1(%[ftmp9], %[dst], 0x00)
+        MMI_SDC1(%[ftmp10], %[dst], 0x08)
+
+        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
+        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
+        "addiu      %[count],   %[count],   -0x01                       \n\t"
+        "bnez       %[count],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
+        : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
+        : "memory"
+    );
+
+    src = block;
+
+    // 2nd loop
+    __asm__ volatile (
+        "li         %[tmp0],    0x44                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
+
+        // 1st part
+        "li         %[tmp0],    0x07                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x10)
+        MMI_LDC1(%[ftmp3], %[src], 0x20)
+        MMI_LDC1(%[ftmp4], %[src], 0x30)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        /* ftmp11: dst03,dst02,dst01,dst00 */
+        "li         %[tmp0],    0x00160011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0x000a0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp12: dst13,dst12,dst11,dst10 */
+        "li         %[tmp0],    0x000a0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0xffeaffef                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp13: dst23,dst22,dst21,dst20 */
+        "li         %[tmp0],    0xfff60011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0x0016ffef                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp14: dst33,dst32,dst31,dst30 */
+        "li         %[tmp0],    0xffea0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0xfff60011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        MMI_LWC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
+        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
+        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        MMI_SWC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
+
+        // 2nd part
+        "li         %[tmp0],    0x07                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[src], 0x08)
+        MMI_LDC1(%[ftmp2], %[src], 0x18)
+        MMI_LDC1(%[ftmp3], %[src], 0x28)
+        MMI_LDC1(%[ftmp4], %[src], 0x38)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        /* ftmp11: dst03,dst02,dst01,dst00 */
+        "li         %[tmp0],    0x00160011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0x000a0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp12: dst13,dst12,dst11,dst10 */
+        "li         %[tmp0],    0x000a0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0xffeaffef                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp13: dst23,dst22,dst21,dst20 */
+        "li         %[tmp0],    0xfff60011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0x0016ffef                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp14: dst33,dst32,dst31,dst30 */
+        "li         %[tmp0],    0xffea0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0xfff60011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        MMI_LWC1(%[ftmp1], %[dest], 0x04)
+        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
+        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
+        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        MMI_SWC1(%[ftmp1], %[dest], 0x04)
+        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
+
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
+          [tmp0]"=&r"(tmp[0])
+        : [ff_pw_64]"f"(ff_pw_64_local),
+          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
+        :"memory"
+    );
+}
+#endif
+
+/* Do inverse transform on 4x8 parts of block */
+void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int dc = block[0];
+    double ftmp[9];
+    DECLARE_VAR_LOW32;
+
+    dc = (17 * dc +  4) >> 3;
+    dc = (12 * dc + 64) >> 7;
+
+    __asm__ volatile(
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
+
+        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
+        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
+        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
+        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
+        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
+        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
+        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
+        MMI_LWC1(%[ftmp8], %[dest7], 0x00)
+
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+
+        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
+        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
+
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+
+        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
+        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
+        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
+        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
+        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_LOW32
+          [ftmp8]"=&f"(ftmp[8])
+        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
+          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
+          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
+          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
+          [dc]"f"(dc)
+        : "memory"
+    );
+}
+
+#if _MIPS_SIM != _ABIO32
+void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int16_t *src = block;
+    int16_t *dst = block;
+    double ftmp[23];
+    uint32_t count = 8, tmp[1];
+    int16_t coeff[16] = {17, 22, 17, 10,
+                         17, 10,-17,-22,
+                         17,-10,-17, 22,
+                         17,-22, 17,-10};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
+
+    // 1st loop
+    __asm__ volatile (
+
+        "li         %[tmp0],    0x03                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+
+        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
+        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
+        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
+        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
+        "1:                                                             \n\t"
+        /* ftmp8: dst3,dst2,dst1,dst0 */
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
+        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
+        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
+        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
+        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
+        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
+        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
+        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
+        MMI_SDC1(%[ftmp8], %[dst], 0x00)
+
+        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
+        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
+        "addiu      %[count],   %[count],   -0x01                       \n\t"
+        "bnez       %[count],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
+          [src]"+&r"(src),              [dst]"+&r"(dst)
+        : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
+        : "memory"
+    );
+
+    src = block;
+
+    // 2nd loop
+    __asm__ volatile (
+        "li         %[tmp0],    0x07                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x20)
+        MMI_LDC1(%[ftmp3], %[src], 0x40)
+        MMI_LDC1(%[ftmp4], %[src], 0x60)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        MMI_LDC1(%[ftmp1], %[src], 0x10)
+        MMI_LDC1(%[ftmp2], %[src], 0x30)
+        MMI_LDC1(%[ftmp3], %[src], 0x50)
+        MMI_LDC1(%[ftmp4], %[src], 0x70)
+        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
+
+        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
+                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
+                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
+                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
+
+        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
+        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
+                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
+
+        MMI_LWC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
+        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
+        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
+        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
+        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
+        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
+        PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
+
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
+
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
+
+        MMI_SWC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
+
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
+          [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
+          [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
+          [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
+          [ftmp22]"=&f"(ftmp[22]),
+          [tmp0]"=&r"(tmp[0])
+        : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
+          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
+        : "memory"
+    );
+}
+#endif
+
+/* Do inverse transform on 4x4 part of block */
+void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int dc = block[0];
+    double ftmp[5];
+    DECLARE_VAR_LOW32;
+
+    dc = (17 * dc +  4) >> 3;
+    dc = (17 * dc + 64) >> 7;
+
+    __asm__ volatile(
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
+
+        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
+        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
+        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
+        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
+
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+
+        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
+
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+
+        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_LOW32
+          [ftmp4]"=&f"(ftmp[4])
+        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
+          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
+          [dc]"f"(dc)
+        : "memory"
+    );
+}
+
+void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+{
+    int16_t *src = block;
+    int16_t *dst = block;
+    double ftmp[16];
+    uint32_t count = 4, tmp[1];
+    int16_t coeff[16] = {17, 22, 17, 10,
+                         17, 10,-17,-22,
+                         17,-10,-17, 22,
+                         17,-22, 17,-10};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
+    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
+    // 1st loop
+    __asm__ volatile (
+
+        "li         %[tmp0],    0x03                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
+        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
+        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
+        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
+        "1:                                                             \n\t"
+        /* ftmp8: dst3,dst2,dst1,dst0 */
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
+        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
+        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
+        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
+        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
+        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
+        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
+        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
+        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
+        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
+        MMI_SDC1(%[ftmp8], %[dst], 0x00)
+
+        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
+        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
+        "addiu      %[count],   %[count],   -0x01                       \n\t"
+        "bnez       %[count],   1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
+          [src]"+&r"(src),              [dst]"+&r"(dst)
+        : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
+        : "memory"
+    );
+
+    src = block;
+
+    // 2nd loop
+    __asm__ volatile (
+        "li         %[tmp0],    0x07                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
+        "li         %[tmp0],    0x44                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
+
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x10)
+        MMI_LDC1(%[ftmp3], %[src], 0x20)
+        MMI_LDC1(%[ftmp4], %[src], 0x30)
+        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
+        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
+
+        /* ftmp11: dst03,dst02,dst01,dst00 */
+        "li         %[tmp0],    0x00160011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0x000a0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp12: dst13,dst12,dst11,dst10 */
+        "li         %[tmp0],    0x000a0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0xffeaffef                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp13: dst23,dst22,dst21,dst20 */
+        "li         %[tmp0],    0xfff60011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0x0016ffef                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        /* ftmp14: dst33,dst32,dst31,dst30 */
+        "li         %[tmp0],    0xffea0011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
+        "li         %[tmp0],    0xfff60011                              \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
+        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
+        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
+        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
+        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
+        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
+
+        MMI_LWC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
+        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
+        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
+        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
+
+        MMI_SWC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
+        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
+        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
+
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
+          [tmp0]"=&r"(tmp[0])
+        : [ff_pw_64]"f"(ff_pw_64_local),
+          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
+        :"memory"
+    );
+}
+
+/* Apply overlap transform to horizontal edge */
+void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
+{
+    int i;
+    int a, b, c, d;
+    int d1, d2;
+    int rnd = 1;
+    for (i = 0; i < 8; i++) {
+        a  = src[-2];
+        b  = src[-1];
+        c  = src[0];
+        d  = src[1];
+        d1 = (a - d + 3 + rnd) >> 3;
+        d2 = (a - d + b - c + 4 - rnd) >> 3;
+
+        src[-2] = a - d1;
+        src[-1] = av_clip_uint8(b - d2);
+        src[0]  = av_clip_uint8(c + d2);
+        src[1]  = d + d1;
+        src    += stride;
+        rnd     = !rnd;
+    }
+}
+
+void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
+{
+    int i;
+    int a, b, c, d;
+    int d1, d2;
+    int rnd1 = flags & 2 ? 3 : 4;
+    int rnd2 = 7 - rnd1;
+    for (i = 0; i < 8; i++) {
+        a  = left[6];
+        b  = left[7];
+        c  = right[0];
+        d  = right[1];
+        d1 = a - d;
+        d2 = a - d + b - c;
+
+        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
+        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
+        right[0] = ((c << 3) + d2 + rnd1) >> 3;
+        right[1] = ((d << 3) + d1 + rnd2) >> 3;
+
+        right += right_stride;
+        left  += left_stride;
+        if (flags & 1) {
+            rnd2   = 7 - rnd2;
+            rnd1   = 7 - rnd1;
+        }
+    }
+}
+
+/* Apply overlap transform to vertical edge */
+void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
+{
+    int i;
+    int a, b, c, d;
+    int d1, d2;
+    int rnd = 1;
+    for (i = 0; i < 8; i++) {
+        a  = src[-2 * stride];
+        b  = src[-stride];
+        c  = src[0];
+        d  = src[stride];
+        d1 = (a - d + 3 + rnd) >> 3;
+        d2 = (a - d + b - c + 4 - rnd) >> 3;
+
+        src[-2 * stride] = a - d1;
+        src[-stride]     = av_clip_uint8(b - d2);
+        src[0]           = av_clip_uint8(c + d2);
+        src[stride]      = d + d1;
+        src++;
+        rnd = !rnd;
+    }
+}
+
+void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
+{
+    int i;
+    int a, b, c, d;
+    int d1, d2;
+    int rnd1 = 4, rnd2 = 3;
+    for (i = 0; i < 8; i++) {
+        a  = top[48];
+        b  = top[56];
+        c  = bottom[0];
+        d  = bottom[8];
+        d1 = a - d;
+        d2 = a - d + b - c;
+
+        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
+        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
+        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
+        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
+
+        bottom++;
+        top++;
+        rnd2 = 7 - rnd2;
+        rnd1 = 7 - rnd1;
+    }
+}
+
+/**
+ * VC-1 in-loop deblocking filter for one line
+ * @param src source block type
+ * @param stride block stride
+ * @param pq block quantizer
+ * @return whether other 3 pairs should be filtered or not
+ * @see 8.6
+ */
+static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
+{
+    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
+              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
+    int a0_sign = a0 >> 31;        /* Store sign */
+
+    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
+    if (a0 < pq) {
+        int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
+                        5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
+        int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
+                        5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
+        if (a1 < a0 || a2 < a0) {
+            int clip      = src[-1 * stride] - src[0 * stride];
+            int clip_sign = clip >> 31;
+
+            clip = ((clip ^ clip_sign) - clip_sign) >> 1;
+            if (clip) {
+                int a3     = FFMIN(a1, a2);
+                int d      = 5 * (a3 - a0);
+                int d_sign = (d >> 31);
+
+                d       = ((d ^ d_sign) - d_sign) >> 3;
+                d_sign ^= a0_sign;
+
+                if (d_sign ^ clip_sign)
+                    d = 0;
+                else {
+                    d = FFMIN(d, clip);
+                    d = (d ^ d_sign) - d_sign; /* Restore sign */
+                    src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
+                    src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
+                }
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * VC-1 in-loop deblocking filter
+ * @param src source block type
+ * @param step distance between horizontally adjacent elements
+ * @param stride distance between vertically adjacent elements
+ * @param len edge length to filter (4 or 8 pixels)
+ * @param pq block quantizer
+ * @see 8.6
+ */
+static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
+                                   int len, int pq)
+{
+    int i;
+    int filt3;
+
+    for (i = 0; i < len; i += 4) {
+        filt3 = vc1_filter_line(src + 2 * step, stride, pq);
+        if (filt3) {
+            vc1_filter_line(src + 0 * step, stride, pq);
+            vc1_filter_line(src + 1 * step, stride, pq);
+            vc1_filter_line(src + 3 * step, stride, pq);
+        }
+        src += step * 4;
+    }
+}
+
+void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
+{
+    vc1_loop_filter(src, 1, stride, 4, pq);
+}
+
+void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
+{
+    vc1_loop_filter(src, stride, 1, 4, pq);
+}
+
+void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
+{
+    vc1_loop_filter(src, 1, stride, 8, pq);
+}
+
+void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
+{
+    vc1_loop_filter(src, stride, 1, 8, pq);
+}
+
+void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
+{
+    vc1_loop_filter(src, 1, stride, 16, pq);
+}
+
+void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
+{
+    vc1_loop_filter(src, stride, 1, 16, pq);
+}
+
+void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd)
+{
+    ff_put_pixels8_8_mmi(dst, src, stride, 8);
+}
+void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd)
+{
+    ff_put_pixels16_8_mmi(dst, src, stride, 16);
+}
+void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
+}
+void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
+}
+
+#define OP_PUT(S, D)
+#define OP_AVG(S, D)                                                        \
+    "ldc1       $f16,   "#S"                        \n\t"                   \
+    "pavgb      "#D",   "#D",   $f16                \n\t"
+
+/** Add rounder from $f14 to $f6 and pack result at destination */
+#define NORMALIZE_MMI(SHIFT)                                                \
+    "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
+    "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
+    "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
+    "psrah      $f8,    $f8,    "SHIFT"             \n\t"
+
+#define TRANSFER_DO_PACK(OP)                                                \
+    "packushb   $f6,    $f6,    $f8                 \n\t"                   \
+    OP((%[dst]), $f6)                                                       \
+    "sdc1       $f6,    0x00(%[dst])                \n\t"
+
+#define TRANSFER_DONT_PACK(OP)                                              \
+     OP(0(%[dst]), $f6)                                                     \
+     OP(8(%[dst]), $f8)                                                     \
+     "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
+     "sdc1      $f8,    0x08(%[dst])                \n\t"
+
+/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
+#define DO_UNPACK(reg)                                                      \
+    "punpcklbh  "reg",  "reg",  $f0                 \n\t"
+#define DONT_UNPACK(reg)
+
+/** Compute the rounder 32-r or 8-r and unpacks it to $f14 */
+#define LOAD_ROUNDER_MMI(ROUND)                                             \
+    "lwc1       $f14,   "ROUND"                     \n\t"                   \
+    "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
+    "punpcklwd  $f14,   $f14,   $f14                \n\t"
+
+
+#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
+    "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
+    PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
+    MMI_ULWC1(R0, $9, 0x00)                                                 \
+    "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
+    "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
+    PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
+    MMI_ULWC1(R3, $9, 0x00)                                                 \
+    "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
+    "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
+    "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
+    "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
+    "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
+    MMI_SDC1(R1, %[dst], OFF)                                               \
+    PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
+
+/** Sacrificing $f12 makes it possible to pipeline loads from src */
+static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
+                                       const uint8_t *src, mips_reg stride,
+                                       int rnd, int64_t shift)
+{
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
+
+    __asm__ volatile(
+        "xor        $f0,    $f0,    $f0             \n\t"
+        "li         $8,     0x03                    \n\t"
+        LOAD_ROUNDER_MMI("%[rnd]")
+        "ldc1       $f12,   %[ff_pw_9]              \n\t"
+        "1:                                         \n\t"
+        MMI_ULWC1($f4, %[src], 0x00)
+        PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
+        MMI_ULWC1($f6, %[src], 0x00)
+        "punpcklbh  $f4,    $f4,    $f0             \n\t"
+        "punpcklbh  $f6,    $f6,    $f0             \n\t"
+        SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
+        SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
+        SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
+        SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
+        SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
+        SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
+        SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
+        SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
+        PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
+        PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
+        "addiu      $8,     $8,    -0x01            \n\t"
+        "bnez       $8,     1b                      \n\t"
+        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
+          [src]"+r"(src),               [dst]"+r"(dst)
+        : [stride]"r"(stride),          [stride1]"r"(-2*stride),
+          [shift]"f"(shift),            [rnd]"m"(rnd),
+          [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
+        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+          "$f14", "$f16", "memory"
+    );
+}
+
+/**
+ * Data is already unpacked, so some operations can directly be made from
+ * memory.
+ */
+#define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
+static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
+                                             const int16_t *src, int rnd)   \
+{                                                                           \
+    int h = 8;                                                              \
+    DECLARE_VAR_ALL64;                                                      \
+    DECLARE_VAR_ADDRT;                                                      \
+                                                                            \
+    src -= 1;                                                               \
+    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
+                                                                            \
+    __asm__ volatile(                                                       \
+        LOAD_ROUNDER_MMI("%[rnd]")                                          \
+        "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
+        "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
+        "1:                                         \n\t"                   \
+        MMI_ULDC1($f2, %[src], 0x00)                                        \
+        MMI_ULDC1($f4, %[src], 0x08)                                        \
+        MMI_ULDC1($f6, %[src], 0x02)                                        \
+        MMI_ULDC1($f8, %[src], 0x0a)                                        \
+        MMI_ULDC1($f0, %[src], 0x06)                                        \
+        "paddh      $f2,    $f2,    $f0             \n\t"                   \
+        MMI_ULDC1($f0, %[src], 0x0e)                                        \
+        "paddh      $f4,    $f4,    $f0             \n\t"                   \
+        MMI_ULDC1($f0, %[src], 0x04)                                        \
+        "paddh      $f6,    $f6,    $f0             \n\t"                   \
+        MMI_ULDC1($f0, %[src], 0x0b)                                        \
+        "paddh      $f8,    $f8,    $f0             \n\t"                   \
+        "pmullh     $f6,    $f6,    $f10            \n\t"                   \
+        "pmullh     $f8,    $f8,    $f10            \n\t"                   \
+        "psubh      $f6,    $f6,    $f2             \n\t"                   \
+        "psubh      $f8,    $f8,    $f4             \n\t"                   \
+        "li         $8,     0x07                    \n\t"                   \
+        "mtc1       $8,     $f16                    \n\t"                   \
+        NORMALIZE_MMI("$f16")                                               \
+        /* Remove bias */                                                   \
+        "paddh      $f6,    $f6,    $f12            \n\t"                   \
+        "paddh      $f8,    $f8,    $f12            \n\t"                   \
+        TRANSFER_DO_PACK(OP)                                                \
+        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
+        PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
+        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
+        "bnez       %[h],   1b                      \n\t"                   \
+        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
+          [h]"+r"(h),                                                       \
+          [src]"+r"(src),               [dst]"+r"(dst)                      \
+        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
+          [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
+        : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
+          "$f16", "memory"                                                  \
+    );                                                                      \
+}
+
+VC1_HOR_16B_SHIFT2(OP_PUT, put_)
+VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
+
+/**
+ * Purely vertical or horizontal 1/2 shift interpolation.
+ * Sacrify $f12 for *9 factor.
+ */
+#define VC1_SHIFT2(OP, OPNAME)\
+static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
+                                     mips_reg stride, int rnd,              \
+                                     mips_reg offset)                       \
+{                                                                           \
+    DECLARE_VAR_LOW32;                                                      \
+    DECLARE_VAR_ADDRT;                                                      \
+                                                                            \
+    rnd = 8 - rnd;                                                          \
+                                                                            \
+    __asm__ volatile(                                                       \
+        "xor        $f0,    $f0,    $f0             \n\t"                   \
+        "li         $10,    0x08                    \n\t"                   \
+        LOAD_ROUNDER_MMI("%[rnd]")                                          \
+        "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
+        "1:                                         \n\t"                   \
+        MMI_ULWC1($f6, %[src], 0x00)                                        \
+        MMI_ULWC1($f8, %[src], 0x04)                                        \
+        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
+        MMI_ULWC1($f2, $9, 0x00)                                            \
+        MMI_ULWC1($f4, $9, 0x04)                                            \
+        PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
+        "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
+        "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
+        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
+        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
+        "paddh      $f6,    $f6,    $f2             \n\t"                   \
+        "paddh      $f8,    $f8,    $f4             \n\t"                   \
+        PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
+        MMI_ULWC1($f2, $9, 0x00)                                            \
+        MMI_ULWC1($f4, $9, 0x04)                                            \
+        "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
+        "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
+        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
+        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
+        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
+        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
+        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
+        MMI_ULWC1($f2, $9, 0x00)                                            \
+        MMI_ULWC1($f4, $9, 0x04)                                            \
+        "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
+        "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
+        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
+        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
+        "li         $8,     0x04                    \n\t"                   \
+        "mtc1       $8,     $f16                    \n\t"                   \
+        NORMALIZE_MMI("$f16")                                               \
+        "packushb   $f6,    $f6,    $f8             \n\t"                   \
+        OP((%[dst]), $f6)                                                   \
+        "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
+        "addiu      $10,    $10,   -0x01            \n\t"                   \
+        PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
+        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
+        "bnez       $10,    1b                      \n\t"                   \
+        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
+          [src]"+r"(src),               [dst]"+r"(dst)                      \
+        : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
+          [stride]"r"(stride),          [rnd]"m"(rnd),                      \
+          [stride1]"r"(stride-offset),                                      \
+          [ff_pw_9]"m"(ff_pw_9)                                             \
+        : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
+          "$f12", "$f14", "$f16", "memory"                                  \
+    );                                                                      \
+}
+
+VC1_SHIFT2(OP_PUT, put_)
+VC1_SHIFT2(OP_AVG, avg_)
+
+/**
+ * Core of the 1/4 and 3/4 shift bicubic interpolation.
+ *
+ * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
+ * @param LOAD    "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked.
+ * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
+ * @param A1      Stride address of 1st tap (beware of unpacked/packed).
+ * @param A2      Stride address of 2nd tap
+ * @param A3      Stride address of 3rd tap
+ * @param A4      Stride address of 4th tap
+ */
+#define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
+    PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
+    LOAD($f2, $9, M*0)                                                      \
+    LOAD($f4, $9, M*4)                                                      \
+    UNPACK("$f2")                                                           \
+    UNPACK("$f4")                                                           \
+    "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
+    "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
+    PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
+    LOAD($f6, $9, M*0)                                                      \
+    LOAD($f8, $9, M*4)                                                      \
+    UNPACK("$f6")                                                           \
+    UNPACK("$f8")                                                           \
+    "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
+    "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
+    "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
+    "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
+    PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
+    LOAD($f2, $9, M*0)                                                      \
+    LOAD($f4, $9, M*4)                                                      \
+    UNPACK("$f2")                                                           \
+    UNPACK("$f4")                                                           \
+    "li         $8,     0x02                    \n\t"                       \
+    "mtc1       $8,     $f16                    \n\t"                       \
+    "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
+    "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
+    "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
+    "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
+    PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
+    LOAD($f2, $9, M*0)                                                      \
+    LOAD($f4, $9, M*4)                                                      \
+    UNPACK("$f2")                                                           \
+    UNPACK("$f4")                                                           \
+    "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
+    "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
+    "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
+    "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
+
+/**
+ * Macro to build the vertical 16bits version of vc1_put_shift[13].
+ * Here, offset=src_stride. Parameters passed A1 to A4 must use
+ * %3 (src_stride), %4 (2*src_stride) and %5 (3*src_stride).
+ *
+ * @param  NAME   Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
+static void                                                                 \
+vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
+                                 mips_reg src_stride,                       \
+                                 int rnd, int64_t shift)                    \
+{                                                                           \
+    int h = 8;                                                              \
+    DECLARE_VAR_LOW32;                                                      \
+    DECLARE_VAR_ADDRT;                                                      \
+                                                                            \
+    src -= src_stride;                                                      \
+                                                                            \
+    __asm__ volatile(                                                       \
+        "xor        $f0,    $f0,    $f0             \n\t"                   \
+        LOAD_ROUNDER_MMI("%[rnd]")                                          \
+        "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
+        "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
+        ".p2align 3                                 \n\t"                   \
+        "1:                                         \n\t"                   \
+        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
+        NORMALIZE_MMI("%[shift]")                                           \
+        TRANSFER_DONT_PACK(OP_PUT)                                          \
+        /* Last 3 (in fact 4) bytes on the line */                          \
+        PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
+        MMI_ULWC1($f2, $9, 0x08)                                            \
+        DO_UNPACK("$f2")                                                    \
+        "mov.d      $f6,    $f2                     \n\t"                   \
+        "paddh      $f2,    $f2,    $f2             \n\t"                   \
+        "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
+        PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
+        MMI_ULWC1($f6, $9, 0x08)                                            \
+        DO_UNPACK("$f6")                                                    \
+        "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
+        "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
+        PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
+        MMI_ULWC1($f2, $9, 0x08)                                            \
+        DO_UNPACK("$f2")                                                    \
+        "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
+        "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
+        PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
+        MMI_ULWC1($f2, $9, 0x08)                                            \
+        DO_UNPACK("$f2")                                                    \
+        "li         $8,     0x02                    \n\t"                   \
+        "mtc1       $8,     $f16                    \n\t"                   \
+        "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
+        "psubh      $f6,    $f6,    $f2             \n\t"                   \
+        "paddh      $f6,    $f6,    $f14            \n\t"                   \
+        "li         $8,     0x06                    \n\t"                   \
+        "mtc1       $8,     $f16                    \n\t"                   \
+        "psrah      $f6,    $f6,    $f16            \n\t"                   \
+        "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
+        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
+        PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
+        PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
+        "bnez       %[h],   1b                      \n\t"                   \
+        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
+          [h]"+r"(h),                                                       \
+          [src]"+r"(src),               [dst]"+r"(dst)                      \
+        : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
+          [stride_x3]"r"(3*src_stride),                                     \
+          [rnd]"m"(rnd),                [shift]"f"(shift),                  \
+          [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
+          [ff_pw_3]"f"(ff_pw_3)                                             \
+        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
+          "$f14", "$f16", "memory"                                          \
+    );                                                                      \
+}
+
+/**
+ * Macro to build the horizontal 16bits version of vc1_put_shift[13].
+ * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
+ *
+ * @param  NAME   Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
+static void                                                                 \
+OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
+                                       const int16_t *src, int rnd)         \
+{                                                                           \
+    int h = 8;                                                              \
+    DECLARE_VAR_ALL64;                                                      \
+    DECLARE_VAR_ADDRT;                                                      \
+                                                                            \
+    src -= 1;                                                               \
+    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
+                                                                            \
+    __asm__ volatile(                                                       \
+        "xor        $f0,    $f0,    $f0             \n\t"                   \
+        LOAD_ROUNDER_MMI("%[rnd]")                                          \
+        "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
+        "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
+        ".p2align 3                                 \n\t"                   \
+        "1:                                         \n\t"                   \
+        MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
+        "li         $8,     0x07                    \n\t"                   \
+        "mtc1       $8,     $f16                    \n\t"                   \
+        NORMALIZE_MMI("$f16")                                               \
+        /* Remove bias */                                                   \
+        "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
+        "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
+        TRANSFER_DO_PACK(OP)                                                \
+        "addiu      %[h],   %[h],  -0x01            \n\t"                   \
+        PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
+        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
+        "bnez       %[h],   1b                      \n\t"                   \
+        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
+          [h]"+r"(h),                                                       \
+          [src]"+r"(src),               [dst]"+r"(dst)                      \
+        : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
+          [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
+          [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
+        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
+          "$f14", "$f16", "memory"                                          \
+    );                                                                      \
+}
+
+/**
+ * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
+ * Here, offset=src_stride. Parameters passed A1 to A4 must use
+ * %3 (offset), %4 (2*offset) and %5 (3*offset).
+ *
+ * @param  NAME   Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
+static void                                                                 \
+OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
+                              mips_reg stride, int rnd, mips_reg offset)    \
+{                                                                           \
+    int h = 8;                                                              \
+    DECLARE_VAR_LOW32;                                                      \
+    DECLARE_VAR_ADDRT;                                                      \
+                                                                            \
+    src -= offset;                                                          \
+    rnd = 32-rnd;                                                           \
+                                                                            \
+    __asm__ volatile (                                                      \
+        "xor        $f0,    $f0,    $f0             \n\t"                   \
+        LOAD_ROUNDER_MMI("%[rnd]")                                          \
+        "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
+        "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
+        ".p2align 3                                 \n\t"                   \
+        "1:                                         \n\t"                   \
+        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
+        "li         $8,     0x06                    \n\t"                   \
+        "mtc1       $8,     $f16                    \n\t"                   \
+        NORMALIZE_MMI("$f16")                                               \
+        TRANSFER_DO_PACK(OP)                                                \
+        "addiu      %[h],   %[h],      -0x01        \n\t"                   \
+        PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
+        PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
+        "bnez       %[h],   1b                      \n\t"                   \
+        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
+          [h]"+r"(h),                                                       \
+          [src]"+r"(src),               [dst]"+r"(dst)                      \
+        : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
+          [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
+          [rnd]"m"(rnd),                                                    \
+          [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
+          [ff_pw_3]"f"(ff_pw_3)                                             \
+        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
+          "$f14", "$f16", "memory"                                          \
+    );                                                                      \
+}
+
+
+/** 1/4 shift bicubic interpolation */
+MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
+MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
+MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
+MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
+MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
+
+/** 3/4 shift bicubic interpolation */
+MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
+MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
+MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
+MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
+MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
+
+typedef void (*vc1_mspel_mc_filter_ver_16bits)
+             (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
+              int64_t shift);
+typedef void (*vc1_mspel_mc_filter_hor_16bits)
+             (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
+typedef void (*vc1_mspel_mc_filter_8bits)
+             (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
+              mips_reg offset);
+
+/**
+ * Interpolate fractional pel values by applying proper vertical then
+ * horizontal filter.
+ *
+ * @param  dst     Destination buffer for interpolated pels.
+ * @param  src     Source buffer.
+ * @param  stride  Stride for both src and dst buffers.
+ * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
+ * @param  hmode   Vertical filter.
+ * @param  rnd     Rounding bias.
+ */
+#define VC1_MSPEL_MC(OP)                                                    \
+static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
+                               int hmode, int vmode, int rnd)               \
+{                                                                           \
+    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
+         { NULL, vc1_put_ver_16b_shift1_mmi,                                \
+                 vc1_put_ver_16b_shift2_mmi,                                \
+                 vc1_put_ver_16b_shift3_mmi };                              \
+    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
+         { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
+                 OP ## vc1_hor_16b_shift2_mmi,                              \
+                 OP ## vc1_hor_16b_shift3_mmi };                            \
+    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
+         { NULL, OP ## vc1_shift1_mmi,                                      \
+                 OP ## vc1_shift2_mmi,                                      \
+                 OP ## vc1_shift3_mmi };                                    \
+                                                                            \
+    if (vmode) { /* Vertical filter to apply */                             \
+        if (hmode) { /* Horizontal filter to apply, output to tmp */        \
+            static const int shift_value[] = { 0, 5, 1, 5 };                \
+            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
+            int    r;                                                       \
+            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
+                                                                            \
+            r = (1<<(shift-1)) + rnd-1;                                     \
+            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
+                                                                            \
+            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
+            return;                                                         \
+        }                                                                   \
+        else { /* No horizontal filter, output 8 lines to dst */            \
+            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
+            return;                                                         \
+        }                                                                   \
+    }                                                                       \
+                                                                            \
+    /* Horizontal mode with no vertical mode */                             \
+    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
+}                                                                           \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
+                                  int stride, int hmode, int vmode, int rnd)\
+{                                                                           \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
+    dst += 8*stride; src += 8*stride;                                       \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
+}
+
+VC1_MSPEL_MC(put_)
+VC1_MSPEL_MC(avg_)
+
+/** Macro to ease bicubic filter interpolation functions declarations */
+#define DECLARE_FUNCTION(a, b)                                              \
+void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
+                                           const uint8_t *src,              \
+                                           ptrdiff_t stride,                \
+                                           int rnd)                         \
+{                                                                           \
+     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
+}                                                                           \
+void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
+                                           const uint8_t *src,              \
+                                           ptrdiff_t stride,                \
+                                           int rnd)                         \
+{                                                                           \
+     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
+}                                                                           \
+void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
+                                              const uint8_t *src,           \
+                                              ptrdiff_t stride,             \
+                                              int rnd)                      \
+{                                                                           \
+     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
+}                                                                           \
+void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
+                                              const uint8_t *src,           \
+                                              ptrdiff_t stride,             \
+                                              int rnd)                      \
+{                                                                           \
+     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
+}
+
+DECLARE_FUNCTION(0, 1)
+DECLARE_FUNCTION(0, 2)
+DECLARE_FUNCTION(0, 3)
+
+DECLARE_FUNCTION(1, 0)
+DECLARE_FUNCTION(1, 1)
+DECLARE_FUNCTION(1, 2)
+DECLARE_FUNCTION(1, 3)
+
+DECLARE_FUNCTION(2, 0)
+DECLARE_FUNCTION(2, 1)
+DECLARE_FUNCTION(2, 2)
+DECLARE_FUNCTION(2, 3)
+
+DECLARE_FUNCTION(3, 0)
+DECLARE_FUNCTION(3, 1)
+DECLARE_FUNCTION(3, 2)
+DECLARE_FUNCTION(3, 3)
+
+#define CHROMA_MC_8_MMI                                                     \
+        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
+        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
+        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
+        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
+                                                                            \
+        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
+        "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
+        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
+        "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
+                                                                            \
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
+                                                                            \
+        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
+        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
+        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
+        "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
+                                                                            \
+        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
+        "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
+
+
+#define CHROMA_MC_4_MMI                                                     \
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
+        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
+                                                                            \
+        "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
+        "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
+                                                                            \
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
+                                                                            \
+        "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
+
+
+void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B =     (x) * (8 - y);
+    const int C = (8 - x) *     (y);
+    const int D =     (x) *     (y);
+    double ftmp[10];
+    uint32_t tmp[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    __asm__ volatile(
+        "li         %[tmp0],    0x06                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
+        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
+        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
+        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src], 0x01)
+        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
+        MMI_ULDC1(%[ftmp3], %[src], 0x00)
+        MMI_ULDC1(%[ftmp4], %[src], 0x01)
+
+        CHROMA_MC_8_MMI
+
+        MMI_SDC1(%[ftmp1], %[dst], 0x00)
+        "addiu      %[h],       %[h],      -0x01                        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src),              [dst]"+&r"(dst),
+          [h]"+&r"(h)
+        : [stride]"r"((mips_reg)stride),
+          [A]"f"(A),                    [B]"f"(B),
+          [C]"f"(C),                    [D]"f"(D),
+          [ff_pw_28]"f"(ff_pw_28)
+        : "memory"
+    );
+}
+
+void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B =     (x) * (8 - y);
+    const int C = (8 - x) *     (y);
+    const int D =     (x) *     (y);
+    double ftmp[6];
+    uint32_t tmp[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    __asm__ volatile(
+        "li         %[tmp0],    0x06                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
+        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
+        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
+        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
+        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], 0x00)
+        MMI_ULWC1(%[ftmp2], %[src], 0x01)
+        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
+        MMI_ULWC1(%[ftmp3], %[src], 0x00)
+        MMI_ULWC1(%[ftmp4], %[src], 0x01)
+
+        CHROMA_MC_4_MMI
+
+        MMI_SWC1(%[ftmp1], %[dst], 0x00)
+        "addiu      %[h],       %[h],      -0x01                        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
+          [src]"+&r"(src),              [dst]"+&r"(dst),
+          [h]"+&r"(h)
+        : [stride]"r"((mips_reg)stride),
+          [A]"f"(A),                    [B]"f"(B),
+          [C]"f"(C),                    [D]"f"(D),
+          [ff_pw_28]"f"(ff_pw_28)
+        : "memory"
+    );
+}
+
+void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B =     (x) * (8 - y);
+    const int C = (8 - x) *     (y);
+    const int D =     (x) *     (y);
+    double ftmp[10];
+    uint32_t tmp[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    __asm__ volatile(
+        "li         %[tmp0],    0x06                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
+        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
+        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
+        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src], 0x01)
+        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
+        MMI_ULDC1(%[ftmp3], %[src], 0x00)
+        MMI_ULDC1(%[ftmp4], %[src], 0x01)
+
+        CHROMA_MC_8_MMI
+
+        MMI_LDC1(%[ftmp2], %[dst], 0x00)
+        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
+
+        MMI_SDC1(%[ftmp1], %[dst], 0x00)
+        "addiu      %[h],       %[h],      -0x01                        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [src]"+&r"(src),              [dst]"+&r"(dst),
+          [h]"+&r"(h)
+        : [stride]"r"((mips_reg)stride),
+          [A]"f"(A),                    [B]"f"(B),
+          [C]"f"(C),                    [D]"f"(D),
+          [ff_pw_28]"f"(ff_pw_28)
+        : "memory"
+    );
+}
+
+void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
+                                      uint8_t *src /* align 1 */,
+                                      int stride, int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B = (    x) * (8 - y);
+    const int C = (8 - x) * (    y);
+    const int D = (    x) * (    y);
+    double ftmp[6];
+    uint32_t tmp[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    __asm__ volatile(
+        "li         %[tmp0],    0x06                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
+        "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
+        "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
+        "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
+        "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
+
+        "1:                                                             \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], 0x00)
+        MMI_ULWC1(%[ftmp2], %[src], 0x01)
+        PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
+        MMI_ULWC1(%[ftmp3], %[src], 0x00)
+        MMI_ULWC1(%[ftmp4], %[src], 0x01)
+
+        CHROMA_MC_4_MMI
+
+        MMI_LWC1(%[ftmp2], %[dst], 0x00)
+        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
+
+        MMI_SWC1(%[ftmp1], %[dst], 0x00)
+        "addiu      %[h],       %[h],      -0x01                        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
+          [src]"+&r"(src),              [dst]"+&r"(dst),
+          [h]"+&r"(h)
+        : [stride]"r"((mips_reg)stride),
+          [A]"f"(A),                    [B]"f"(B),
+          [C]"f"(C),                    [D]"f"(D),
+          [ff_pw_28]"f"(ff_pw_28)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/videodsp_init.c b/libavcodec/mips/videodsp_init.c
new file mode 100644
index 0000000..8170404
--- /dev/null
+++ b/libavcodec/mips/videodsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 Kaustubh Raste (kaustubh.raste@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/videodsp.h"
+
+#if HAVE_MSA
+static void prefetch_mips(uint8_t *mem, ptrdiff_t stride, int h)
+{
+    register const uint8_t *p = mem;
+
+    __asm__ volatile (
+        "1:                                     \n\t"
+        "pref          4,  0(%[p])              \n\t"
+        "pref          4,  32(%[p])             \n\t"
+        PTR_ADDIU"  %[h],  %[h],     -1         \n\t"
+        PTR_ADDU "  %[p],  %[p],     %[stride]  \n\t"
+
+        "bnez       %[h],  1b                   \n\t"
+
+        : [p] "+r" (p), [h] "+r" (h)
+        : [stride] "r" (stride)
+    );
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc)
+{
+#if HAVE_MSA
+    ctx->prefetch = prefetch_mips;
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/vp3dsp_idct_mmi.c b/libavcodec/mips/vp3dsp_idct_mmi.c
new file mode 100644
index 0000000..c5c4cf3
--- /dev/null
+++ b/libavcodec/mips/vp3dsp_idct_mmi.c
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp3dsp_mips.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mips/mmiutils.h"
+#include "libavutil/common.h"
+#include "libavcodec/rnd_avg.h"
+
+#define LOAD_CONST(dst, value)                        \
+    "li     %[tmp1],      "#value"              \n\t" \
+    "dmtc1  %[tmp1],      "#dst"                \n\t" \
+    "pshufh "#dst",       "#dst",     %[ftmp10] \n\t"
+
+static void idct_row_mmi(int16_t *input)
+{
+    double ftmp[23];
+    uint64_t tmp[2];
+    __asm__ volatile (
+        "xor        %[ftmp10],      %[ftmp10],        %[ftmp10] \n\t"
+        LOAD_CONST(%[csth_1], 1)
+        "li         %[tmp0],        0x02                        \n\t"
+        "1:                                                     \n\t"
+        /* Load input */
+        "ldc1       %[ftmp0],       0x00(%[input])              \n\t"
+        "ldc1       %[ftmp1],       0x10(%[input])              \n\t"
+        "ldc1       %[ftmp2],       0x20(%[input])              \n\t"
+        "ldc1       %[ftmp3],       0x30(%[input])              \n\t"
+        "ldc1       %[ftmp4],       0x40(%[input])              \n\t"
+        "ldc1       %[ftmp5],       0x50(%[input])              \n\t"
+        "ldc1       %[ftmp6],       0x60(%[input])              \n\t"
+        "ldc1       %[ftmp7],       0x70(%[input])              \n\t"
+        LOAD_CONST(%[ftmp8], 64277)
+        LOAD_CONST(%[ftmp9], 12785)
+        "pmulhh     %[A],           %[ftmp9],         %[ftmp7]  \n\t"
+        "pcmpgth    %[C],           %[ftmp10],        %[ftmp1]  \n\t"
+        "or         %[mask],        %[C],             %[csth_1] \n\t"
+        "pmullh     %[B],           %[ftmp1],         %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],         %[B]      \n\t"
+        "pmullh     %[B],           %[B],             %[mask]   \n\t"
+        "paddh      %[A],           %[A],             %[B]      \n\t"
+        "paddh      %[A],           %[A],             %[C]      \n\t"
+        "pcmpgth    %[D],           %[ftmp10],        %[ftmp7]  \n\t"
+        "or         %[mask],        %[D],             %[csth_1] \n\t"
+        "pmullh     %[ftmp7],       %[ftmp7],         %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],         %[ftmp7]  \n\t"
+        "pmullh     %[B],           %[B],             %[mask]   \n\t"
+        "pmulhh     %[C],           %[ftmp9],         %[ftmp1]  \n\t"
+        "psubh      %[B],           %[C],             %[B]      \n\t"
+        "psubh      %[B],           %[B],             %[D]      \n\t"
+
+        LOAD_CONST(%[ftmp8], 54491)
+        LOAD_CONST(%[ftmp9], 36410)
+        "pcmpgth    %[Ad],          %[ftmp10],        %[ftmp5]  \n\t"
+        "or         %[mask],        %[Ad],            %[csth_1] \n\t"
+        "pmullh     %[ftmp1],       %[ftmp5],         %[mask]   \n\t"
+        "pmulhuh    %[C],           %[ftmp9],         %[ftmp1]  \n\t"
+        "pmullh     %[C],           %[C],             %[mask]   \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],        %[ftmp3]  \n\t"
+        "or         %[mask],        %[Bd],            %[csth_1] \n\t"
+        "pmullh     %[D],           %[ftmp3],         %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp8],         %[D]      \n\t"
+        "pmullh     %[D],           %[D],             %[mask]   \n\t"
+        "paddh      %[C],           %[C],             %[D]      \n\t"
+        "paddh      %[C],           %[C],             %[Ad]     \n\t"
+        "paddh      %[C],           %[C],             %[Bd]     \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],        %[ftmp3]  \n\t"
+        "or         %[mask],        %[Bd],            %[csth_1] \n\t"
+        "pmullh     %[ftmp1],       %[ftmp3],         %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp9],         %[ftmp1]  \n\t"
+        "pmullh     %[D],           %[D],             %[mask]   \n\t"
+        "pcmpgth    %[Ed],          %[ftmp10],        %[ftmp5]  \n\t"
+        "or         %[mask],        %[Ed],            %[csth_1] \n\t"
+        "pmullh     %[Ad],          %[ftmp5],         %[mask]   \n\t"
+        "pmulhuh    %[Ad],          %[ftmp8],         %[Ad]     \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]   \n\t"
+        "psubh      %[D],           %[Ad],            %[D]      \n\t"
+        "paddh      %[D],           %[D],             %[Ed]     \n\t"
+        "psubh      %[D],           %[D],             %[Bd]     \n\t"
+
+        LOAD_CONST(%[ftmp8], 46341)
+        "psubh      %[Ad],          %[A],             %[C]      \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],        %[Ad]     \n\t"
+        "or         %[mask],        %[Bd],            %[csth_1] \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]   \n\t"
+        "pmulhuh    %[Ad],          %[ftmp8],         %[Ad]     \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]   \n\t"
+        "paddh      %[Ad],          %[Ad],            %[Bd]     \n\t"
+        "psubh      %[Bd],          %[B],             %[D]      \n\t"
+        "pcmpgth    %[Cd],          %[ftmp10],        %[Bd]     \n\t"
+        "or         %[mask],        %[Cd],            %[csth_1] \n\t"
+        "pmullh     %[Bd],          %[Bd],            %[mask]   \n\t"
+        "pmulhuh    %[Bd],          %[ftmp8],         %[Bd]     \n\t"
+        "pmullh     %[Bd],          %[Bd],            %[mask]   \n\t"
+        "paddh      %[Bd],          %[Bd],            %[Cd]     \n\t"
+        "paddh      %[Cd],          %[A],             %[C]      \n\t"
+        "paddh      %[Dd],          %[B],             %[D]      \n\t"
+        "paddh      %[A],           %[ftmp0],         %[ftmp4]  \n\t"
+        "pcmpgth    %[B],           %[ftmp10],        %[A]      \n\t"
+        "or         %[mask],        %[B],             %[csth_1] \n\t"
+        "pmullh     %[A],           %[A],             %[mask]   \n\t"
+        "pmulhuh    %[A],           %[ftmp8],         %[A]      \n\t"
+        "pmullh     %[A],           %[A],             %[mask]   \n\t"
+        "paddh      %[A],           %[A],             %[B]      \n\t"
+        "psubh      %[B],           %[ftmp0],         %[ftmp4]  \n\t"
+        "pcmpgth    %[C],           %[ftmp10],        %[B]      \n\t"
+        "or         %[mask],        %[C],             %[csth_1] \n\t"
+        "pmullh     %[B],           %[B],             %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],         %[B]      \n\t"
+        "pmullh     %[B],           %[B],             %[mask]   \n\t"
+        "paddh      %[B],           %[B],             %[C]      \n\t"
+
+        LOAD_CONST(%[ftmp8], 60547)
+        LOAD_CONST(%[ftmp9], 25080)
+        "pmulhh     %[C],           %[ftmp9],         %[ftmp6]  \n\t"
+        "pcmpgth    %[D],           %[ftmp10],        %[ftmp2]  \n\t"
+        "or         %[mask],        %[D],             %[csth_1] \n\t"
+        "pmullh     %[Ed],          %[ftmp2],         %[mask]   \n\t"
+        "pmulhuh    %[Ed],          %[ftmp8],         %[Ed]     \n\t"
+        "pmullh     %[Ed],          %[Ed],            %[mask]   \n\t"
+        "paddh      %[C],           %[C],             %[Ed]     \n\t"
+        "paddh      %[C],           %[C],             %[D]      \n\t"
+        "pcmpgth    %[Ed],          %[ftmp10],        %[ftmp6]  \n\t"
+        "or         %[mask],        %[Ed],            %[csth_1] \n\t"
+        "pmullh     %[ftmp6],       %[ftmp6],         %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp8],         %[ftmp6]  \n\t"
+        "pmullh     %[D],           %[D],             %[mask]   \n\t"
+        "pmulhh     %[Gd],          %[ftmp9],         %[ftmp2]  \n\t"
+        "psubh      %[D],           %[Gd],            %[D]      \n\t"
+        "psubh      %[D],           %[D],             %[Ed]     \n\t"
+        "psubh      %[Ed],          %[A],             %[C]      \n\t"
+        "paddh      %[Gd],          %[A],             %[C]      \n\t"
+        "paddh      %[A],           %[B],             %[Ad]     \n\t"
+        "psubh      %[C],           %[B],             %[Ad]     \n\t"
+        "psubh      %[B],           %[Bd],            %[D]      \n\t"
+        "paddh      %[D],           %[Bd],            %[D]      \n\t"
+        /* Final sequence of operations over-write original inputs */
+        "paddh      %[ftmp0],       %[Gd],            %[Cd]     \n\t"
+        "paddh      %[ftmp1],       %[A],             %[D]      \n\t"
+        "psubh      %[ftmp2],       %[A],             %[D]      \n\t"
+        "paddh      %[ftmp3],       %[Ed],            %[Dd]     \n\t"
+        "psubh      %[ftmp4],       %[Ed],            %[Dd]     \n\t"
+        "paddh      %[ftmp5],       %[C],             %[B]      \n\t"
+        "psubh      %[ftmp6],       %[C],             %[B]      \n\t"
+        "psubh      %[ftmp7],       %[Gd],            %[Cd]     \n\t"
+        "sdc1       %[ftmp0],       0x00(%[input])              \n\t"
+        "sdc1       %[ftmp1],       0x10(%[input])              \n\t"
+        "sdc1       %[ftmp2],       0x20(%[input])              \n\t"
+        "sdc1       %[ftmp3],       0x30(%[input])              \n\t"
+        "sdc1       %[ftmp4],       0x40(%[input])              \n\t"
+        "sdc1       %[ftmp5],       0x50(%[input])              \n\t"
+        "sdc1       %[ftmp6],       0x60(%[input])              \n\t"
+        "sdc1       %[ftmp7],       0x70(%[input])              \n\t"
+        PTR_ADDU   "%[tmp0],        %[tmp0],          -0x01     \n\t"
+        PTR_ADDIU  "%[input],       %[input],         0x08      \n\t"
+        "bnez       %[tmp0],        1b                          \n\t"
+        : [input]"+&r"(input), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+          [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+          [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [mask]"=&f"(ftmp[11]),
+          [A]"=&f"(ftmp[12]), [B]"=&f"(ftmp[13]), [C]"=&f"(ftmp[14]),
+          [D]"=&f"(ftmp[15]), [Ad]"=&f"(ftmp[16]), [Bd]"=&f"(ftmp[17]),
+          [Cd]"=&f"(ftmp[18]), [Dd]"=&f"(ftmp[19]), [Ed]"=&f"(ftmp[20]),
+          [Gd]"=&f"(ftmp[21]), [csth_1]"=&f"(ftmp[22])
+        :
+        : "memory"
+    );
+}
+
+static void idct_column_true_mmi(uint8_t *dst, int stride, int16_t *input)
+{
+    uint8_t temp_value[8];
+    double ftmp[23];
+    uint64_t tmp[2];
+    for (int i = 0; i < 8; ++i)
+        temp_value[i] = av_clip_uint8(128 + ((46341 * input[i << 3] + (8 << 16)) >> 20));
+    __asm__ volatile (
+        "xor        %[ftmp10],      %[ftmp10],          %[ftmp10] \n\t"
+        "li         %[tmp0],        0x02                          \n\t"
+        "1:                                                       \n\t"
+        "ldc1       %[ftmp0],       0x00(%[input])                \n\t"
+        "ldc1       %[ftmp4],       0x08(%[input])                \n\t"
+        "ldc1       %[ftmp1],       0x10(%[input])                \n\t"
+        "ldc1       %[ftmp5],       0x18(%[input])                \n\t"
+        "ldc1       %[ftmp2],       0x20(%[input])                \n\t"
+        "ldc1       %[ftmp6],       0x28(%[input])                \n\t"
+        "ldc1       %[ftmp3],       0x30(%[input])                \n\t"
+        "ldc1       %[ftmp7],       0x38(%[input])                \n\t"
+        TRANSPOSE_4H(%[ftmp0], %[ftmp1], %[ftmp2], %[ftmp3],
+                     %[A], %[B], %[C], %[D])
+        TRANSPOSE_4H(%[ftmp4], %[ftmp5], %[ftmp6], %[ftmp7],
+                     %[A], %[B], %[C], %[D])
+        LOAD_CONST(%[ftmp8], 64277)
+        LOAD_CONST(%[ftmp9], 12785)
+        LOAD_CONST(%[Gd], 1)
+        "pmulhh     %[A],           %[ftmp9],           %[ftmp7]  \n\t"
+        "pcmpgth    %[C],           %[ftmp10],          %[ftmp1]  \n\t"
+        "or         %[mask],        %[C],               %[Gd]     \n\t"
+        "pmullh     %[B],           %[ftmp1],           %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],           %[B]      \n\t"
+        "pmullh     %[B],           %[B],               %[mask]   \n\t"
+        "paddh      %[A],           %[A],               %[B]      \n\t"
+        "paddh      %[A],           %[A],               %[C]      \n\t"
+        "pcmpgth    %[D],           %[ftmp10],          %[ftmp7]  \n\t"
+        "or         %[mask],        %[D],               %[Gd]     \n\t"
+        "pmullh     %[Ad],          %[ftmp7],           %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],           %[Ad]     \n\t"
+        "pmullh     %[B],           %[B],               %[mask]   \n\t"
+        "pmulhh     %[C],           %[ftmp9],           %[ftmp1]  \n\t"
+        "psubh      %[B],           %[C],               %[B]      \n\t"
+        "psubh      %[B],           %[B],               %[D]      \n\t"
+
+        LOAD_CONST(%[ftmp8], 54491)
+        LOAD_CONST(%[ftmp9], 36410)
+        "pcmpgth    %[Ad],          %[ftmp10],          %[ftmp5]  \n\t"
+        "or         %[mask],        %[Ad],              %[Gd]     \n\t"
+        "pmullh     %[Cd],          %[ftmp5],           %[mask]   \n\t"
+        "pmulhuh    %[C],           %[ftmp9],           %[Cd]     \n\t"
+        "pmullh     %[C],           %[C],               %[mask]   \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],          %[ftmp3]  \n\t"
+        "or         %[mask],        %[Bd],              %[Gd]     \n\t"
+        "pmullh     %[D],           %[ftmp3],           %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp8],           %[D]      \n\t"
+        "pmullh     %[D],           %[D],               %[mask]   \n\t"
+        "paddh      %[C],           %[C],               %[D]      \n\t"
+        "paddh      %[C],           %[C],               %[Ad]     \n\t"
+        "paddh      %[C],           %[C],               %[Bd]     \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],          %[ftmp3]  \n\t"
+        "or         %[mask],        %[Bd],              %[Gd]     \n\t"
+        "pmullh     %[Cd],          %[ftmp3],           %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp9],           %[Cd]     \n\t"
+        "pmullh     %[D],           %[D],               %[mask]   \n\t"
+        "pcmpgth    %[Ed],          %[ftmp10],          %[ftmp5]  \n\t"
+        "or         %[mask],        %[Ed],              %[Gd]     \n\t"
+        "pmullh     %[Ad],          %[ftmp5],           %[mask]   \n\t"
+        "pmulhuh    %[Ad],          %[ftmp8],           %[Ad]     \n\t"
+        "pmullh     %[Ad],          %[Ad],              %[mask]   \n\t"
+        "psubh      %[D],           %[Ad],              %[D]      \n\t"
+        "paddh      %[D],           %[D],               %[Ed]     \n\t"
+        "psubh      %[D],           %[D],               %[Bd]     \n\t"
+
+        LOAD_CONST(%[ftmp8], 46341)
+        "psubh      %[Ad],          %[A],             %[C]        \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],        %[Ad]       \n\t"
+        "or         %[mask],        %[Bd],            %[Gd]       \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]     \n\t"
+        "pmulhuh    %[Ad],          %[ftmp8],         %[Ad]       \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]     \n\t"
+        "paddh      %[Ad],          %[Ad],            %[Bd]       \n\t"
+        "psubh      %[Bd],          %[B],             %[D]        \n\t"
+        "pcmpgth    %[Cd],          %[ftmp10],        %[Bd]       \n\t"
+        "or         %[mask],        %[Cd],            %[Gd]       \n\t"
+        "pmullh     %[Bd],          %[Bd],            %[mask]     \n\t"
+        "pmulhuh    %[Bd],          %[ftmp8],         %[Bd]       \n\t"
+        "pmullh     %[Bd],          %[Bd],            %[mask]     \n\t"
+        "paddh      %[Bd],          %[Bd],            %[Cd]       \n\t"
+        "paddh      %[Cd],          %[A],             %[C]        \n\t"
+        "paddh      %[Dd],          %[B],             %[D]        \n\t"
+
+        LOAD_CONST(%[Ed], 2056)
+        "paddh      %[A],           %[ftmp0],         %[ftmp4]    \n\t"
+        "pcmpgth    %[B],           %[ftmp10],        %[A]        \n\t"
+        "or         %[mask],        %[B],             %[Gd]       \n\t"
+        "pmullh     %[A],           %[A],             %[mask]     \n\t"
+        "pmulhuh    %[A],           %[ftmp8],         %[A]        \n\t"
+        "pmullh     %[A],           %[A],             %[mask]     \n\t"
+        "paddh      %[A],           %[A],             %[B]        \n\t"
+        "paddh      %[A],           %[A],             %[Ed]       \n\t"
+        "psubh      %[B],           %[ftmp0],         %[ftmp4]    \n\t"
+        "pcmpgth    %[C],           %[ftmp10],        %[B]        \n\t"
+        "or         %[mask],        %[C],             %[Gd]       \n\t"
+        "pmullh     %[B],           %[B],             %[mask]     \n\t"
+        "pmulhuh    %[B],           %[ftmp8],         %[B]        \n\t"
+        "pmullh     %[B],           %[B],             %[mask]     \n\t"
+        "paddh      %[B],           %[B],             %[C]        \n\t"
+        "paddh      %[B],           %[B],             %[Ed]       \n\t"
+
+        LOAD_CONST(%[ftmp8], 60547)
+        LOAD_CONST(%[ftmp9], 25080)
+        "pmulhh     %[C],           %[ftmp9],         %[ftmp6]    \n\t"
+        "pcmpgth    %[D],           %[ftmp10],        %[ftmp2]    \n\t"
+        "or         %[mask],        %[D],             %[Gd]       \n\t"
+        "pmullh     %[Ed],          %[ftmp2],         %[mask]     \n\t"
+        "pmulhuh    %[Ed],          %[ftmp8],         %[Ed]       \n\t"
+        "pmullh     %[Ed],          %[Ed],            %[mask]     \n\t"
+        "paddh      %[C],           %[C],             %[Ed]       \n\t"
+        "paddh      %[C],           %[C],             %[D]        \n\t"
+        "pcmpgth    %[Ed],          %[ftmp10],        %[ftmp6]    \n\t"
+        "or         %[mask],        %[Ed],            %[Gd]       \n\t"
+        "pmullh     %[D],           %[ftmp6],         %[mask]     \n\t"
+        "pmulhuh    %[D],           %[ftmp8],         %[D]        \n\t"
+        "pmullh     %[D],           %[D],             %[mask]     \n\t"
+        "pmulhh     %[Gd],          %[ftmp9],         %[ftmp2]    \n\t"
+        "psubh      %[D],           %[Gd],            %[D]        \n\t"
+        "psubh      %[D],           %[D],             %[Ed]       \n\t"
+        "psubh      %[Ed],          %[A],             %[C]        \n\t"
+        "paddh      %[Gd],          %[A],             %[C]        \n\t"
+        "paddh      %[A],           %[B],             %[Ad]       \n\t"
+        "psubh      %[C],           %[B],             %[Ad]       \n\t"
+        "psubh      %[B],           %[Bd],            %[D]        \n\t"
+        "paddh      %[D],           %[Bd],            %[D]        \n\t"
+        "or         %[mask],        %[ftmp1],         %[ftmp2]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp3]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp4]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp5]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp6]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp7]    \n\t"
+        "pcmpeqh    %[mask],        %[mask],          %[ftmp10]   \n\t"
+        "packushb   %[mask],        %[mask],          %[ftmp10]   \n\t"
+        "li         %[tmp1],        0x04                          \n\t"
+        "dmtc1      %[tmp1],        %[ftmp8]                      \n\t"
+        "paddh      %[ftmp0],       %[Gd],            %[Cd]       \n\t"
+        "psrah      %[ftmp0],       %[ftmp0],         %[ftmp8]    \n\t"
+        "paddh      %[ftmp1],       %[A],             %[D]        \n\t"
+        "psrah      %[ftmp1],       %[ftmp1],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp2],       %[A],             %[D]        \n\t"
+        "psrah      %[ftmp2],       %[ftmp2],         %[ftmp8]    \n\t"
+        "paddh      %[ftmp3],       %[Ed],            %[Dd]       \n\t"
+        "psrah      %[ftmp3],       %[ftmp3],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp4],       %[Ed],            %[Dd]       \n\t"
+        "psrah      %[ftmp4],       %[ftmp4],         %[ftmp8]    \n\t"
+        "paddh      %[ftmp5],       %[C],             %[B]        \n\t"
+        "psrah      %[ftmp5],       %[ftmp5],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp6],       %[C],             %[B]        \n\t"
+        "psrah      %[ftmp6],       %[ftmp6],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp7],       %[Gd],            %[Cd]       \n\t"
+        "psrah      %[ftmp7],       %[ftmp7],         %[ftmp8]    \n\t"
+        "pmaxsh     %[ftmp0],       %[ftmp0],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp0],       %[ftmp0],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp1],       %[ftmp1],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp1],       %[ftmp1],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp2],       %[ftmp2],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp2],       %[ftmp2],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp3],       %[ftmp3],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp3],       %[ftmp3],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp4],       %[ftmp4],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp4],       %[ftmp4],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp5],       %[ftmp5],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp5],       %[ftmp5],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp6],       %[ftmp6],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp6],       %[ftmp6],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp7],       %[ftmp7],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp7],       %[ftmp7],         %[ftmp10]   \n\t"
+
+        "lwc1       %[Ed],          0x00(%[temp_value])           \n\t"
+        "and        %[Ed],          %[Ed],            %[mask]     \n\t"
+        "paddb      %[ftmp0],       %[ftmp0],         %[Ed]       \n\t"
+        "paddb      %[ftmp1],       %[ftmp1],         %[Ed]       \n\t"
+        "paddb      %[ftmp2],       %[ftmp2],         %[Ed]       \n\t"
+        "paddb      %[ftmp3],       %[ftmp3],         %[Ed]       \n\t"
+        "paddb      %[ftmp4],       %[ftmp4],         %[Ed]       \n\t"
+        "paddb      %[ftmp5],       %[ftmp5],         %[Ed]       \n\t"
+        "paddb      %[ftmp6],       %[ftmp6],         %[Ed]       \n\t"
+        "paddb      %[ftmp7],       %[ftmp7],         %[Ed]       \n\t"
+        "swc1       %[ftmp0],       0x00(%[dst])                  \n\t"
+        PTR_ADDU   "%[tmp1],        %[dst],           %[stride]   \n\t"
+        "swc1       %[ftmp1],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp2],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp3],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp4],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp5],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp6],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp7],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDIU  "%[dst],         %[dst],           0x04        \n\t"
+        PTR_ADDIU  "%[input],       %[input],         0x40        \n\t"
+        PTR_ADDIU  "%[temp_value],  %[temp_value],    0x04        \n\t"
+        PTR_ADDIU  "%[tmp0],        %[tmp0],          -0x01       \n\t"
+        "bnez       %[tmp0],        1b                            \n\t"
+        : [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+          [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+          [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [mask]"=&f"(ftmp[11]),
+          [A]"=&f"(ftmp[12]), [B]"=&f"(ftmp[13]), [C]"=&f"(ftmp[14]),
+          [D]"=&f"(ftmp[15]), [Ad]"=&f"(ftmp[16]), [Bd]"=&f"(ftmp[17]),
+          [Cd]"=&f"(ftmp[18]), [Dd]"=&f"(ftmp[19]), [Ed]"=&f"(ftmp[20]),
+          [Gd]"=&f"(ftmp[21]), [input]"+&r"(input)
+        : [stride]"r"(stride), [temp_value]"r"(temp_value)
+        : "memory"
+    );
+}
+
+static void idct_column_false_mmi(uint8_t *dst, int stride, int16_t *input)
+{
+    int16_t temp_value[8];
+    double ftmp[23];
+    uint64_t tmp[2];
+    for (int i = 0; i < 8; ++i)
+        temp_value[i] = (46341 * input[i << 3] + (8 << 16)) >> 20;
+    __asm__ volatile (
+        "xor        %[ftmp10],      %[ftmp10],          %[ftmp10] \n\t"
+        "li         %[tmp0],        0x02                          \n\t"
+        "1:                                                       \n\t"
+        "ldc1       %[ftmp0],       0x00(%[input])                \n\t"
+        "ldc1       %[ftmp4],       0x08(%[input])                \n\t"
+        "ldc1       %[ftmp1],       0x10(%[input])                \n\t"
+        "ldc1       %[ftmp5],       0x18(%[input])                \n\t"
+        "ldc1       %[ftmp2],       0x20(%[input])                \n\t"
+        "ldc1       %[ftmp6],       0x28(%[input])                \n\t"
+        "ldc1       %[ftmp3],       0x30(%[input])                \n\t"
+        "ldc1       %[ftmp7],       0x38(%[input])                \n\t"
+        TRANSPOSE_4H(%[ftmp0], %[ftmp1], %[ftmp2], %[ftmp3],
+                     %[A], %[B], %[C], %[D])
+        TRANSPOSE_4H(%[ftmp4], %[ftmp5], %[ftmp6], %[ftmp7],
+                     %[A], %[B], %[C], %[D])
+        LOAD_CONST(%[ftmp8], 64277)
+        LOAD_CONST(%[ftmp9], 12785)
+        LOAD_CONST(%[Gd], 1)
+        "pmulhh     %[A],           %[ftmp9],           %[ftmp7]  \n\t"
+        "pcmpgth    %[C],           %[ftmp10],          %[ftmp1]  \n\t"
+        "or         %[mask],        %[C],               %[Gd]     \n\t"
+        "pmullh     %[B],           %[ftmp1],           %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],           %[B]      \n\t"
+        "pmullh     %[B],           %[B],               %[mask]   \n\t"
+        "paddh      %[A],           %[A],               %[B]      \n\t"
+        "paddh      %[A],           %[A],               %[C]      \n\t"
+        "pcmpgth    %[D],           %[ftmp10],          %[ftmp7]  \n\t"
+        "or         %[mask],        %[D],               %[Gd]     \n\t"
+        "pmullh     %[Ad],          %[ftmp7],           %[mask]   \n\t"
+        "pmulhuh    %[B],           %[ftmp8],           %[Ad]     \n\t"
+        "pmullh     %[B],           %[B],               %[mask]   \n\t"
+        "pmulhh     %[C],           %[ftmp9],           %[ftmp1]  \n\t"
+        "psubh      %[B],           %[C],               %[B]      \n\t"
+        "psubh      %[B],           %[B],               %[D]      \n\t"
+
+        LOAD_CONST(%[ftmp8], 54491)
+        LOAD_CONST(%[ftmp9], 36410)
+        "pcmpgth    %[Ad],          %[ftmp10],          %[ftmp5]  \n\t"
+        "or         %[mask],        %[Ad],              %[Gd]     \n\t"
+        "pmullh     %[Cd],          %[ftmp5],           %[mask]   \n\t"
+        "pmulhuh    %[C],           %[ftmp9],           %[Cd]     \n\t"
+        "pmullh     %[C],           %[C],               %[mask]   \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],          %[ftmp3]  \n\t"
+        "or         %[mask],        %[Bd],              %[Gd]     \n\t"
+        "pmullh     %[D],           %[ftmp3],           %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp8],           %[D]      \n\t"
+        "pmullh     %[D],           %[D],               %[mask]   \n\t"
+        "paddh      %[C],           %[C],               %[D]      \n\t"
+        "paddh      %[C],           %[C],               %[Ad]     \n\t"
+        "paddh      %[C],           %[C],               %[Bd]     \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],          %[ftmp3]  \n\t"
+        "or         %[mask],        %[Bd],              %[Gd]     \n\t"
+        "pmullh     %[Cd],          %[ftmp3],           %[mask]   \n\t"
+        "pmulhuh    %[D],           %[ftmp9],           %[Cd]     \n\t"
+        "pmullh     %[D],           %[D],               %[mask]   \n\t"
+        "pcmpgth    %[Ed],          %[ftmp10],          %[ftmp5]  \n\t"
+        "or         %[mask],        %[Ed],              %[Gd]     \n\t"
+        "pmullh     %[Ad],          %[ftmp5],           %[mask]   \n\t"
+        "pmulhuh    %[Ad],          %[ftmp8],           %[Ad]     \n\t"
+        "pmullh     %[Ad],          %[Ad],              %[mask]   \n\t"
+        "psubh      %[D],           %[Ad],              %[D]      \n\t"
+        "paddh      %[D],           %[D],               %[Ed]     \n\t"
+        "psubh      %[D],           %[D],               %[Bd]     \n\t"
+
+        LOAD_CONST(%[ftmp8], 46341)
+        "psubh      %[Ad],          %[A],             %[C]        \n\t"
+        "pcmpgth    %[Bd],          %[ftmp10],        %[Ad]       \n\t"
+        "or         %[mask],        %[Bd],            %[Gd]       \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]     \n\t"
+        "pmulhuh    %[Ad],          %[ftmp8],         %[Ad]       \n\t"
+        "pmullh     %[Ad],          %[Ad],            %[mask]     \n\t"
+        "paddh      %[Ad],          %[Ad],            %[Bd]       \n\t"
+        "psubh      %[Bd],          %[B],             %[D]        \n\t"
+        "pcmpgth    %[Cd],          %[ftmp10],        %[Bd]       \n\t"
+        "or         %[mask],        %[Cd],            %[Gd]       \n\t"
+        "pmullh     %[Bd],          %[Bd],            %[mask]     \n\t"
+        "pmulhuh    %[Bd],          %[ftmp8],         %[Bd]       \n\t"
+        "pmullh     %[Bd],          %[Bd],            %[mask]     \n\t"
+        "paddh      %[Bd],          %[Bd],            %[Cd]       \n\t"
+        "paddh      %[Cd],          %[A],             %[C]        \n\t"
+        "paddh      %[Dd],          %[B],             %[D]        \n\t"
+
+        LOAD_CONST(%[Ed], 8)
+        "paddh      %[A],           %[ftmp0],         %[ftmp4]    \n\t"
+        "pcmpgth    %[B],           %[ftmp10],        %[A]        \n\t"
+        "or         %[mask],        %[B],             %[Gd]       \n\t"
+        "pmullh     %[A],           %[A],             %[mask]     \n\t"
+        "pmulhuh    %[A],           %[ftmp8],         %[A]        \n\t"
+        "pmullh     %[A],           %[A],             %[mask]     \n\t"
+        "paddh      %[A],           %[A],             %[B]        \n\t"
+        "paddh      %[A],           %[A],             %[Ed]       \n\t"
+        "psubh      %[B],           %[ftmp0],         %[ftmp4]    \n\t"
+        "pcmpgth    %[C],           %[ftmp10],        %[B]        \n\t"
+        "or         %[mask],        %[C],             %[Gd]       \n\t"
+        "pmullh     %[B],           %[B],             %[mask]     \n\t"
+        "pmulhuh    %[B],           %[ftmp8],         %[B]        \n\t"
+        "pmullh     %[B],           %[B],             %[mask]     \n\t"
+        "paddh      %[B],           %[B],             %[C]        \n\t"
+        "paddh      %[B],           %[B],             %[Ed]       \n\t"
+
+        LOAD_CONST(%[ftmp8], 60547)
+        LOAD_CONST(%[ftmp9], 25080)
+        "pmulhh     %[C],           %[ftmp9],         %[ftmp6]    \n\t"
+        "pcmpgth    %[D],           %[ftmp10],        %[ftmp2]    \n\t"
+        "or         %[mask],        %[D],             %[Gd]       \n\t"
+        "pmullh     %[Ed],          %[ftmp2],         %[mask]     \n\t"
+        "pmulhuh    %[Ed],          %[ftmp8],         %[Ed]       \n\t"
+        "pmullh     %[Ed],          %[Ed],            %[mask]     \n\t"
+        "paddh      %[C],           %[C],             %[Ed]       \n\t"
+        "paddh      %[C],           %[C],             %[D]        \n\t"
+        "pcmpgth    %[Ed],          %[ftmp10],        %[ftmp6]    \n\t"
+        "or         %[mask],        %[Ed],            %[Gd]       \n\t"
+        "pmullh     %[D],           %[ftmp6],         %[mask]     \n\t"
+        "pmulhuh    %[D],           %[ftmp8],         %[D]        \n\t"
+        "pmullh     %[D],           %[D],             %[mask]     \n\t"
+        "pmulhh     %[Gd],          %[ftmp9],         %[ftmp2]    \n\t"
+        "psubh      %[D],           %[Gd],            %[D]        \n\t"
+        "psubh      %[D],           %[D],             %[Ed]       \n\t"
+        "psubh      %[Ed],          %[A],             %[C]        \n\t"
+        "paddh      %[Gd],          %[A],             %[C]        \n\t"
+        "paddh      %[A],           %[B],             %[Ad]       \n\t"
+        "psubh      %[C],           %[B],             %[Ad]       \n\t"
+        "psubh      %[B],           %[Bd],            %[D]        \n\t"
+        "paddh      %[D],           %[Bd],            %[D]        \n\t"
+        "or         %[mask],        %[ftmp1],         %[ftmp2]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp3]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp4]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp5]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp6]    \n\t"
+        "or         %[mask],        %[mask],          %[ftmp7]    \n\t"
+        "pcmpeqh    %[mask],        %[mask],          %[ftmp10]   \n\t"
+        "li         %[tmp1],        0x04                          \n\t"
+        "dmtc1      %[tmp1],        %[ftmp8]                      \n\t"
+        "paddh      %[ftmp0],       %[Gd],            %[Cd]       \n\t"
+        "psrah      %[ftmp0],       %[ftmp0],         %[ftmp8]    \n\t"
+        "paddh      %[ftmp1],       %[A],             %[D]        \n\t"
+        "psrah      %[ftmp1],       %[ftmp1],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp2],       %[A],             %[D]        \n\t"
+        "psrah      %[ftmp2],       %[ftmp2],         %[ftmp8]    \n\t"
+        "paddh      %[ftmp3],       %[Ed],            %[Dd]       \n\t"
+        "psrah      %[ftmp3],       %[ftmp3],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp4],       %[Ed],            %[Dd]       \n\t"
+        "psrah      %[ftmp4],       %[ftmp4],         %[ftmp8]    \n\t"
+        "paddh      %[ftmp5],       %[C],             %[B]        \n\t"
+        "psrah      %[ftmp5],       %[ftmp5],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp6],       %[C],             %[B]        \n\t"
+        "psrah      %[ftmp6],       %[ftmp6],         %[ftmp8]    \n\t"
+        "psubh      %[ftmp7],       %[Gd],            %[Cd]       \n\t"
+        "psrah      %[ftmp7],       %[ftmp7],         %[ftmp8]    \n\t"
+
+        /* Load from dst */
+        "lwc1       %[A],           0x00(%[dst])                  \n\t"
+        PTR_ADDU   "%[tmp1],        %[dst],           %[stride]   \n\t"
+        "lwc1       %[B],           0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "lwc1       %[C],           0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "lwc1       %[D],           0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "lwc1       %[Ad],          0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "lwc1       %[Bd],          0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "lwc1       %[Cd],          0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "lwc1       %[Dd],          0x00(%[tmp1])                 \n\t"
+        "punpcklbh  %[A],           %[A],             %[ftmp10]   \n\t"
+        "punpcklbh  %[B],           %[B],             %[ftmp10]   \n\t"
+        "punpcklbh  %[C],           %[C],             %[ftmp10]   \n\t"
+        "punpcklbh  %[D],           %[D],             %[ftmp10]   \n\t"
+        "punpcklbh  %[Ad],          %[Ad],            %[ftmp10]   \n\t"
+        "punpcklbh  %[Bd],          %[Bd],            %[ftmp10]   \n\t"
+        "punpcklbh  %[Cd],          %[Cd],            %[ftmp10]   \n\t"
+        "punpcklbh  %[Dd],          %[Dd],            %[ftmp10]   \n\t"
+        "ldc1       %[Ed],          0x00(%[temp_value])           \n\t"
+        "and        %[Ed],          %[Ed],            %[mask]     \n\t"
+        "nor        %[mask],        %[mask],          %[mask]     \n\t"
+        "and        %[ftmp0],       %[ftmp0],         %[mask]     \n\t"
+        "and        %[ftmp1],       %[ftmp1],         %[mask]     \n\t"
+        "and        %[ftmp2],       %[ftmp2],         %[mask]     \n\t"
+        "and        %[ftmp3],       %[ftmp3],         %[mask]     \n\t"
+        "and        %[ftmp4],       %[ftmp4],         %[mask]     \n\t"
+        "and        %[ftmp5],       %[ftmp5],         %[mask]     \n\t"
+        "and        %[ftmp6],       %[ftmp6],         %[mask]     \n\t"
+        "and        %[ftmp7],       %[ftmp7],         %[mask]     \n\t"
+        "paddh      %[ftmp0],       %[ftmp0],         %[A]        \n\t"
+        "paddh      %[ftmp1],       %[ftmp1],         %[B]        \n\t"
+        "paddh      %[ftmp2],       %[ftmp2],         %[C]        \n\t"
+        "paddh      %[ftmp3],       %[ftmp3],         %[D]        \n\t"
+        "paddh      %[ftmp4],       %[ftmp4],         %[Ad]       \n\t"
+        "paddh      %[ftmp5],       %[ftmp5],         %[Bd]       \n\t"
+        "paddh      %[ftmp6],       %[ftmp6],         %[Cd]       \n\t"
+        "paddh      %[ftmp7],       %[ftmp7],         %[Dd]       \n\t"
+        "paddh      %[ftmp0],       %[ftmp0],         %[Ed]       \n\t"
+        "paddh      %[ftmp1],       %[ftmp1],         %[Ed]       \n\t"
+        "paddh      %[ftmp2],       %[ftmp2],         %[Ed]       \n\t"
+        "paddh      %[ftmp3],       %[ftmp3],         %[Ed]       \n\t"
+        "paddh      %[ftmp4],       %[ftmp4],         %[Ed]       \n\t"
+        "paddh      %[ftmp5],       %[ftmp5],         %[Ed]       \n\t"
+        "paddh      %[ftmp6],       %[ftmp6],         %[Ed]       \n\t"
+        "paddh      %[ftmp7],       %[ftmp7],         %[Ed]       \n\t"
+        "pmaxsh     %[ftmp0],       %[ftmp0],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp0],       %[ftmp0],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp1],       %[ftmp1],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp1],       %[ftmp1],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp2],       %[ftmp2],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp2],       %[ftmp2],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp3],       %[ftmp3],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp3],       %[ftmp3],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp4],       %[ftmp4],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp4],       %[ftmp4],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp5],       %[ftmp5],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp5],       %[ftmp5],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp6],       %[ftmp6],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp6],       %[ftmp6],         %[ftmp10]   \n\t"
+        "pmaxsh     %[ftmp7],       %[ftmp7],         %[ftmp10]   \n\t"
+        "packushb   %[ftmp7],       %[ftmp7],         %[ftmp10]   \n\t"
+        "swc1       %[ftmp0],       0x00(%[dst])                  \n\t"
+        PTR_ADDU   "%[tmp1],        %[dst],           %[stride]   \n\t"
+        "swc1       %[ftmp1],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp2],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp3],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp4],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp5],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp6],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDU   "%[tmp1],        %[tmp1],          %[stride]   \n\t"
+        "swc1       %[ftmp7],       0x00(%[tmp1])                 \n\t"
+        PTR_ADDIU  "%[dst],         %[dst],           0x04        \n\t"
+        PTR_ADDIU  "%[input],       %[input],         0x40        \n\t"
+        PTR_ADDIU  "%[temp_value],  %[temp_value],    0x08        \n\t"
+        PTR_ADDIU  "%[tmp0],        %[tmp0],          -0x01       \n\t"
+        "bnez       %[tmp0],        1b                            \n\t"
+        : [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+          [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+          [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [mask]"=&f"(ftmp[11]),
+          [A]"=&f"(ftmp[12]), [B]"=&f"(ftmp[13]), [C]"=&f"(ftmp[14]),
+          [D]"=&f"(ftmp[15]), [Ad]"=&f"(ftmp[16]), [Bd]"=&f"(ftmp[17]),
+          [Cd]"=&f"(ftmp[18]), [Dd]"=&f"(ftmp[19]), [Ed]"=&f"(ftmp[20]),
+          [Gd]"=&f"(ftmp[21]), [input]"+&r"(input)
+        : [stride]"r"(stride), [temp_value]"r"(temp_value)
+        : "memory"
+    );
+}
+static void idct_mmi(uint8_t *dst, int stride, int16_t *input, int type)
+{
+    idct_row_mmi(input);
+    if (type == 1)
+        idct_column_true_mmi(dst, stride, input);
+    else
+        idct_column_false_mmi(dst, stride, input);
+}
+
+void ff_vp3_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    idct_mmi(dest, line_size, block, 1);
+    memset(block, 0, sizeof(*block) << 6);
+}
+
+void ff_vp3_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    idct_mmi(dest, line_size, block, 2);
+    memset(block, 0, sizeof(*block) << 6);
+}
+void ff_vp3_idct_dc_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    int dc = (block[0] + 15) >> 5;
+
+    double ftmp[7];
+    uint64_t tmp;
+    __asm__ volatile (
+        "xor        %[ftmp0],     %[ftmp0],           %[ftmp0]      \n\t"
+        "mtc1       %[dc],        %[ftmp5]                          \n\t"
+        "pshufh     %[ftmp5],     %[ftmp5],           %[ftmp0]      \n\t"
+        "li         %[tmp0],      0x08                              \n\t"
+        "1:                                                         \n\t"
+        "ldc1       %[ftmp1],     0x00(%[dest])                     \n\t"
+        "punpcklbh  %[ftmp2],     %[ftmp1],           %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp3],     %[ftmp1],           %[ftmp0]      \n\t"
+        "paddh      %[ftmp4],     %[ftmp2],           %[ftmp5]      \n\t"
+        "paddh      %[ftmp6],     %[ftmp3],           %[ftmp5]      \n\t"
+        "packushb   %[ftmp4],     %[ftmp4],           %[ftmp0]      \n\t"
+        "packushb   %[ftmp6],     %[ftmp6],           %[ftmp0]      \n\t"
+        "swc1       %[ftmp4],     0x00(%[dest])                     \n\t"
+        "swc1       %[ftmp6],     0x04(%[dest])                     \n\t"
+        PTR_ADDU   "%[dest],      %[dest],            %[line_size]  \n\t"
+        PTR_ADDIU  "%[tmp0],      %[tmp0],            -0x01         \n\t"
+        "bnez       %[tmp0],      1b                                \n\t"
+        : [dest]"+&r"(dest), [block]"+&r"(block), [tmp0]"=&r"(tmp),
+          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+          [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6])
+        : [line_size]"r"(line_size), [dc]"r"(dc)
+        : "memory"
+    );
+    block[0] = 0;
+}
+
+void ff_put_no_rnd_pixels_l2_mmi(uint8_t *dst, const uint8_t *src1,
+                                 const uint8_t *src2, ptrdiff_t stride, int h)
+{
+    if (h == 8) {
+        double ftmp[6];
+        uint64_t tmp[2];
+        __asm__ volatile (
+            "li          %[tmp0],        0x08                            \n\t"
+            "li          %[tmp1],        0xfefefefe                      \n\t"
+            "dmtc1       %[tmp1],        %[ftmp4]                        \n\t"
+            "punpcklwd   %[ftmp4],       %[ftmp4],             %[ftmp4]  \n\t"
+            "li          %[tmp1],        0x01                            \n\t"
+            "dmtc1       %[tmp1],        %[ftmp5]                        \n\t"
+            "1:                                                          \n\t"
+            "gsldlc1     %[ftmp1],       0x07(%[src1])                   \n\t"
+            "gsldrc1     %[ftmp1],       0x00(%[src1])                   \n\t"
+            "gsldlc1     %[ftmp2],       0x07(%[src2])                   \n\t"
+            "gsldrc1     %[ftmp2],       0x00(%[src2])                   \n\t"
+            "xor         %[ftmp3],       %[ftmp1],             %[ftmp2]  \n\t"
+            "and         %[ftmp3],       %[ftmp3],             %[ftmp4]  \n\t"
+            "psrlw       %[ftmp3],       %[ftmp3],             %[ftmp5]  \n\t"
+            "and         %[ftmp6],       %[ftmp1],             %[ftmp2]  \n\t"
+            "paddw       %[ftmp3],       %[ftmp3],             %[ftmp6]  \n\t"
+            "sdc1        %[ftmp3],       0x00(%[dst])                    \n\t"
+            PTR_ADDU    "%[src1],        %[src1],              %[stride] \n\t"
+            PTR_ADDU    "%[src2],        %[src2],              %[stride] \n\t"
+            PTR_ADDU    "%[dst],         %[dst],               %[stride] \n\t"
+            PTR_ADDIU   "%[tmp0],        %[tmp0],              -0x01     \n\t"
+            "bnez        %[tmp0],        1b                              \n\t"
+            : [dst]"+&r"(dst), [src1]"+&r"(src1), [src2]"+&r"(src2),
+              [ftmp1]"=&f"(ftmp[0]), [ftmp2]"=&f"(ftmp[1]), [ftmp3]"=&f"(ftmp[2]),
+              [ftmp4]"=&f"(ftmp[3]), [ftmp5]"=&f"(ftmp[4]), [ftmp6]"=&f"(ftmp[5]),
+              [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1])
+            : [stride]"r"(stride)
+            : "memory"
+        );
+    } else {
+        int i;
+
+        for (i = 0; i < h; i++) {
+            uint32_t a, b;
+
+            a = AV_RN32(&src1[i * stride]);
+            b = AV_RN32(&src2[i * stride]);
+            AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
+            a = AV_RN32(&src1[i * stride + 4]);
+            b = AV_RN32(&src2[i * stride + 4]);
+            AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
+        }
+    }
+}
diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c
new file mode 100644
index 0000000..5427ac5
--- /dev/null
+++ b/libavcodec/mips/vp3dsp_idct_msa.c
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp3dsp_mips.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavutil/intreadwrite.h"
+#include "libavcodec/rnd_avg.h"
+
+static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
+{
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
+    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
+          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
+    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
+    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
+    v16u8 sign_l;
+    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
+    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
+    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
+    v4i32 sign_t;
+    v16i8 zero = {0};
+    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
+    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
+    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
+    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
+    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
+    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
+    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
+    v4i32 cnst8w = {8, 8, 8, 8};
+    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
+    v4i32 cnst128w = {128, 128, 128, 128};
+    int nstride = stride;
+
+    /* Extended input data */
+    LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
+    sign = __msa_clti_s_h(r0, 0);
+    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
+    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
+    sign = __msa_clti_s_h(r1, 0);
+    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
+    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
+    sign = __msa_clti_s_h(r2, 0);
+    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
+    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
+    sign = __msa_clti_s_h(r3, 0);
+    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
+    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
+    sign = __msa_clti_s_h(r4, 0);
+    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
+    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
+    sign = __msa_clti_s_h(r5, 0);
+    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
+    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
+    sign = __msa_clti_s_h(r6, 0);
+    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
+    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
+    sign = __msa_clti_s_h(r7, 0);
+    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
+    r7_l = (v4i32) __msa_ilvl_h(sign, r7);
+
+    /* Right part */
+    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
+    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
+    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
+    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
+    Ad = ((A - C) * cnst46341w) >> 16;
+    Bd = ((B - D) * cnst46341w) >> 16;
+    Cd = A + C;
+    Dd = B + D;
+    E = ((r0_r + r4_r) * cnst46341w) >> 16;
+    F = ((r0_r - r4_r) * cnst46341w) >> 16;
+    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
+    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
+    Ed = E - G;
+    Gd = E + G;
+    Add = F + Ad;
+    Bdd = Bd - H;
+    Fd = F - Ad;
+    Hd = Bd + H;
+    r0_r = Gd + Cd;
+    r7_r = Gd - Cd;
+    r1_r = Add + Hd;
+    r2_r = Add - Hd;
+    r3_r = Ed + Dd;
+    r4_r = Ed - Dd;
+    r5_r = Fd + Bdd;
+    r6_r = Fd - Bdd;
+
+    /* Left part */
+    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
+    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
+    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
+    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
+    Ad = ((A - C) * cnst46341w) >> 16;
+    Bd = ((B - D) * cnst46341w) >> 16;
+    Cd = A + C;
+    Dd = B + D;
+    E = ((r0_l + r4_l) * cnst46341w) >> 16;
+    F = ((r0_l - r4_l) * cnst46341w) >> 16;
+    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
+    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
+    Ed = E - G;
+    Gd = E + G;
+    Add = F + Ad;
+    Bdd = Bd - H;
+    Fd = F - Ad;
+    Hd = Bd + H;
+    r0_l = Gd + Cd;
+    r7_l = Gd - Cd;
+    r1_l = Add + Hd;
+    r2_l = Add - Hd;
+    r3_l = Ed + Dd;
+    r4_l = Ed - Dd;
+    r5_l = Fd + Bdd;
+    r6_l = Fd - Bdd;
+
+    /* Row 0 to 3 */
+    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
+                       r0_r, r1_r, r2_r, r3_r);
+    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
+                       r0_l, r1_l, r2_l, r3_l);
+    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
+    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
+    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
+    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
+    Ad = ((A - C) * cnst46341w) >> 16;
+    Bd = ((B - D) * cnst46341w) >> 16;
+    Cd = A + C;
+    Dd = B + D;
+    E = ((r0_r + r0_l) * cnst46341w) >> 16;
+    E += cnst8w;
+    F = ((r0_r - r0_l) * cnst46341w) >> 16;
+    F += cnst8w;
+    if (type == 1) { // HACK
+        E += cnst2048w;
+        F += cnst2048w;
+    }
+    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
+    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
+    Ed = E - G;
+    Gd = E + G;
+    Add = F + Ad;
+    Bdd = Bd - H;
+    Fd = F - Ad;
+    Hd = Bd + H;
+    A = (Gd + Cd) >> 4;
+    B = (Gd - Cd) >> 4;
+    C = (Add + Hd) >> 4;
+    D = (Add - Hd) >> 4;
+    E = (Ed + Dd) >> 4;
+    F = (Ed - Dd) >> 4;
+    G = (Fd + Bdd) >> 4;
+    H = (Fd - Bdd) >> 4;
+    if (type != 1) {
+        LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
+        ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
+                   f0, f1, f2, f3);
+        ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
+                   f4, f5, f6, f7);
+        ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
+                   c0, c1, c2, c3);
+        ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
+                   c4, c5, c6, c7);
+        A += c0;
+        B += c7;
+        C += c1;
+        D += c2;
+        E += c3;
+        F += c4;
+        G += c5;
+        H += c6;
+    }
+    A = CLIP_SW_0_255(A);
+    B = CLIP_SW_0_255(B);
+    C = CLIP_SW_0_255(C);
+    D = CLIP_SW_0_255(D);
+    E = CLIP_SW_0_255(E);
+    F = CLIP_SW_0_255(F);
+    G = CLIP_SW_0_255(G);
+    H = CLIP_SW_0_255(H);
+    sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
+    sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
+    sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
+    sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
+    sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
+    sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
+    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
+    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
+    if (type == 1) {
+        Bdd = Add + cnst128w;
+        Bdd = CLIP_SW_0_255(Bdd);
+        Ad = Bdd;
+        Bd = Bdd;
+        Cd = Bdd;
+        Dd = Bdd;
+        Ed = Bdd;
+        Fd = Bdd;
+        Gd = Bdd;
+        Hd = Bdd;
+    } else {
+        Ad = Add + c0;
+        Bd = Add + c1;
+        Cd = Add + c2;
+        Dd = Add + c3;
+        Ed = Add + c4;
+        Fd = Add + c5;
+        Gd = Add + c6;
+        Hd = Add + c7;
+        Ad = CLIP_SW_0_255(Ad);
+        Bd = CLIP_SW_0_255(Bd);
+        Cd = CLIP_SW_0_255(Cd);
+        Dd = CLIP_SW_0_255(Dd);
+        Ed = CLIP_SW_0_255(Ed);
+        Fd = CLIP_SW_0_255(Fd);
+        Gd = CLIP_SW_0_255(Gd);
+        Hd = CLIP_SW_0_255(Hd);
+    }
+    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
+    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
+    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
+    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
+    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
+    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
+    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
+    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
+    sign_t = __msa_ceqi_w(sign_t, 0);
+    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
+    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
+    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
+    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
+    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
+    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
+    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
+    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
+    r0_r = Ad + A;
+    r1_r = Bd + C;
+    r2_r = Cd + D;
+    r3_r = Dd + E;
+    r0_l = Ed + F;
+    r1_l = Fd + G;
+    r2_l = Gd + H;
+    r3_l = Hd + B;
+
+    /* Row 4 to 7 */
+    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
+                       r4_r, r5_r, r6_r, r7_r);
+    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
+                       r4_l, r5_l, r6_l, r7_l);
+    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
+    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
+    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
+    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
+    Ad = ((A - C) * cnst46341w) >> 16;
+    Bd = ((B - D) * cnst46341w) >> 16;
+    Cd = A + C;
+    Dd = B + D;
+    E = ((r4_r + r4_l) * cnst46341w) >> 16;
+    E += cnst8w;
+    F = ((r4_r - r4_l) * cnst46341w) >> 16;
+    F += cnst8w;
+    if (type == 1) { // HACK
+        E += cnst2048w;
+        F += cnst2048w;
+    }
+    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
+    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
+    Ed = E - G;
+    Gd = E + G;
+    Add = F + Ad;
+    Bdd = Bd - H;
+    Fd = F - Ad;
+    Hd = Bd + H;
+    A = (Gd + Cd) >> 4;
+    B = (Gd - Cd) >> 4;
+    C = (Add + Hd) >> 4;
+    D = (Add - Hd) >> 4;
+    E = (Ed + Dd) >> 4;
+    F = (Ed - Dd) >> 4;
+    G = (Fd + Bdd) >> 4;
+    H = (Fd - Bdd) >> 4;
+    if (type != 1) {
+        ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
+                   c0, c1, c2, c3);
+        ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
+                   c4, c5, c6, c7);
+        A += c0;
+        B += c7;
+        C += c1;
+        D += c2;
+        E += c3;
+        F += c4;
+        G += c5;
+        H += c6;
+    }
+    A = CLIP_SW_0_255(A);
+    B = CLIP_SW_0_255(B);
+    C = CLIP_SW_0_255(C);
+    D = CLIP_SW_0_255(D);
+    E = CLIP_SW_0_255(E);
+    F = CLIP_SW_0_255(F);
+    G = CLIP_SW_0_255(G);
+    H = CLIP_SW_0_255(H);
+    sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
+    sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
+    sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
+    sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
+    sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
+    sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
+    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
+    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
+    if (type == 1) {
+        Bdd = Add + cnst128w;
+        Bdd = CLIP_SW_0_255(Bdd);
+        Ad = Bdd;
+        Bd = Bdd;
+        Cd = Bdd;
+        Dd = Bdd;
+        Ed = Bdd;
+        Fd = Bdd;
+        Gd = Bdd;
+        Hd = Bdd;
+    } else {
+        Ad = Add + c0;
+        Bd = Add + c1;
+        Cd = Add + c2;
+        Dd = Add + c3;
+        Ed = Add + c4;
+        Fd = Add + c5;
+        Gd = Add + c6;
+        Hd = Add + c7;
+        Ad = CLIP_SW_0_255(Ad);
+        Bd = CLIP_SW_0_255(Bd);
+        Cd = CLIP_SW_0_255(Cd);
+        Dd = CLIP_SW_0_255(Dd);
+        Ed = CLIP_SW_0_255(Ed);
+        Fd = CLIP_SW_0_255(Fd);
+        Gd = CLIP_SW_0_255(Gd);
+        Hd = CLIP_SW_0_255(Hd);
+    }
+    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
+    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
+    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
+    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
+    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
+    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
+    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
+    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
+    sign_t = __msa_ceqi_w(sign_t, 0);
+    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
+    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
+    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
+    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
+    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
+    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
+    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
+    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
+    r4_r = Ad + A;
+    r5_r = Bd + C;
+    r6_r = Cd + D;
+    r7_r = Dd + E;
+    r4_l = Ed + F;
+    r5_l = Fd + G;
+    r6_l = Gd + H;
+    r7_l = Hd + B;
+    VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
+    VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
+    VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
+    VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
+
+    /* Final sequence of operations over-write original dst */
+    ST8x1_UB(d0, dst);
+    ST8x1_UB(d1, dst + nstride);
+    nstride += stride;
+    ST8x1_UB(d2, dst + nstride);
+    nstride += stride;
+    ST8x1_UB(d3, dst + nstride);
+    nstride += stride;
+    ST8x1_UB(d4, dst + nstride);
+    nstride += stride;
+    ST8x1_UB(d5, dst + nstride);
+    nstride += stride;
+    ST8x1_UB(d6, dst + nstride);
+    nstride += stride;
+    ST8x1_UB(d7, dst + nstride);
+}
+
+void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    idct_msa(dest, line_size, block, 1);
+    memset(block, 0, sizeof(*block) * 64);
+}
+
+void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    idct_msa(dest, line_size, block, 2);
+    memset(block, 0, sizeof(*block) * 64);
+}
+
+void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    int i = (block[0] + 15) >> 5;
+    v4i32 dc = {i, i, i, i};
+    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
+    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
+    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
+    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
+    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+    v16i8 zero = {0};
+    int nstride = line_size;
+
+    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
+    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
+               c0, c1, c2, c3);
+    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
+               c4, c5, c6, c7);
+    /* Right part */
+    ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
+               e0, e1, e2, e3);
+    ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
+               e4, e5, e6, e7);
+    e0 += dc;
+    e1 += dc;
+    e2 += dc;
+    e3 += dc;
+    e4 += dc;
+    e5 += dc;
+    e6 += dc;
+    e7 += dc;
+    e0 = CLIP_SW_0_255(e0);
+    e1 = CLIP_SW_0_255(e1);
+    e2 = CLIP_SW_0_255(e2);
+    e3 = CLIP_SW_0_255(e3);
+    e4 = CLIP_SW_0_255(e4);
+    e5 = CLIP_SW_0_255(e5);
+    e6 = CLIP_SW_0_255(e6);
+    e7 = CLIP_SW_0_255(e7);
+
+    /* Left part */
+    ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
+               r0, r1, r2, r3);
+    ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
+               r4, r5, r6, r7);
+    r0 += dc;
+    r1 += dc;
+    r2 += dc;
+    r3 += dc;
+    r4 += dc;
+    r5 += dc;
+    r6 += dc;
+    r7 += dc;
+    r0 = CLIP_SW_0_255(r0);
+    r1 = CLIP_SW_0_255(r1);
+    r2 = CLIP_SW_0_255(r2);
+    r3 = CLIP_SW_0_255(r3);
+    r4 = CLIP_SW_0_255(r4);
+    r5 = CLIP_SW_0_255(r5);
+    r6 = CLIP_SW_0_255(r6);
+    r7 = CLIP_SW_0_255(r7);
+    VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
+    VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
+    VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
+    VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
+
+    /* Final sequence of operations over-write original dst */
+    ST8x1_UB(d0, dest);
+    ST8x1_UB(d1, dest + nstride);
+    nstride += line_size;
+    ST8x1_UB(d2, dest + nstride);
+    nstride += line_size;
+    ST8x1_UB(d3, dest + nstride);
+    nstride += line_size;
+    ST8x1_UB(d4, dest + nstride);
+    nstride += line_size;
+    ST8x1_UB(d5, dest + nstride);
+    nstride += line_size;
+    ST8x1_UB(d6, dest + nstride);
+    nstride += line_size;
+    ST8x1_UB(d7, dest + nstride);
+
+    block[0] = 0;
+}
+
+void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
+                              int *bounding_values)
+{
+    int nstride = -stride;
+    v4i32 e0, e1, f0, f1, g0, g1;
+    v16i8 zero = {0};
+    v16i8 d0, d1, d2, d3;
+    v8i16 c0, c1, c2, c3;
+    v8i16 r0;
+    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
+          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
+    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+    int16_t temp_16[8];
+    int temp_32[8];
+
+    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
+    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
+               c0, c1, c2, c3);
+    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
+    r0 += cnst4h;
+    r0 = r0 >> 3;
+    /* Get filter_value from bounding_values one by one */
+    ST_SH(r0, temp_16);
+    for (int i = 0; i < 8; i++)
+        temp_32[i] = bounding_values[temp_16[i]];
+    LD_SW2(temp_32, 4, e0, e1);
+    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
+    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
+    f0 += e0;
+    f1 += e1;
+    g0 -= e0;
+    g1 -= e1;
+    f0 = CLIP_SW_0_255(f0);
+    f1 = CLIP_SW_0_255(f1);
+    g0 = CLIP_SW_0_255(g0);
+    g1 = CLIP_SW_0_255(g1);
+    VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
+
+    /* Final move to first_pixel */
+    ST8x1_UB(d1, first_pixel + nstride);
+    ST8x1_UB(d2, first_pixel);
+}
+
+void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
+                              int *bounding_values)
+{
+    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
+    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
+    v8i16 r0;
+    v4i32 e0, e1, f0, f1, g0, g1;
+    v16i8 zero = {0};
+    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
+          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
+    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+    int16_t temp_16[8];
+    int temp_32[8];
+
+    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
+    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
+               c0, c1, c2, c3);
+    ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
+               c4, c5, c6, c7);
+    TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
+                       c0, c1, c2, c3, c4, c5, c6, c7);
+    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
+    r0 += cnst4h;
+    r0 = r0 >> 3;
+
+    /* Get filter_value from bounding_values one by one */
+    ST_SH(r0, temp_16);
+    for (int i = 0; i < 8; i++)
+        temp_32[i] = bounding_values[temp_16[i]];
+    LD_SW2(temp_32, 4, e0, e1);
+    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
+    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
+    f0 += e0;
+    f1 += e1;
+    g0 -= e0;
+    g1 -= e1;
+    f0 = CLIP_SW_0_255(f0);
+    f1 = CLIP_SW_0_255(f1);
+    g0 = CLIP_SW_0_255(g0);
+    g1 = CLIP_SW_0_255(g1);
+    VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
+    /* Final move to first_pixel */
+    ST2x4_UB(d1, 0, first_pixel - 1, stride);
+    ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
+}
+
+void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
+                                 const uint8_t *src2, ptrdiff_t stride, int h)
+{
+    if (h == 8) {
+        v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
+        v16i8 c0, c1, c2, c3;
+        v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
+        v4i32 e0, e1, e2;
+        v4i32 f0, f1, f2;
+        v4u32 t0, t1, t2, t3;
+        v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
+        int32_t value = 0xfefefefe;
+        v4i32 fmask = {value, value, value, value};
+
+        LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
+        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
+        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
+        a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
+        a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
+        a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
+        a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
+
+        LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
+        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
+        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
+        b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
+        b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
+        b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
+        b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
+
+        e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
+        e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
+        t0 = ((v4u32)e0) >> 1;
+        e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);
+        t0 = t0 + (v4u32)e2;
+
+        e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
+        e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
+        t1 = ((v4u32)e1) >> 1;
+        e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);
+        t1 = t1 + (v4u32)e2;
+
+        f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
+        f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
+        t2 = ((v4u32)f0) >> 1;
+        f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);
+        t2 = t2 + (v4u32)f2;
+
+        f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
+        f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
+        t3 = ((v4u32)f1) >> 1;
+        f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
+        t3 = t3 + (v4u32)f2;
+
+        ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
+        ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
+        ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
+        ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
+    } else {
+        int i;
+
+        for (i = 0; i < h; i++) {
+            uint32_t a, b;
+
+            a = AV_RN32(&src1[i * stride]);
+            b = AV_RN32(&src2[i * stride]);
+            AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
+            a = AV_RN32(&src1[i * stride + 4]);
+            b = AV_RN32(&src2[i * stride + 4]);
+            AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
+        }
+    }
+}
diff --git a/libavcodec/mips/vp3dsp_init_mips.c b/libavcodec/mips/vp3dsp_init_mips.c
new file mode 100644
index 0000000..e183db3
--- /dev/null
+++ b/libavcodec/mips/vp3dsp_init_mips.c
@@ -0,0 +1,60 @@
+
+/*
+ * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vp3dsp.h"
+#include "vp3dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void vp3dsp_init_msa(VP3DSPContext *c, int flags)
+{
+    c->put_no_rnd_pixels_l2 = ff_put_no_rnd_pixels_l2_msa;
+
+    c->idct_add      = ff_vp3_idct_add_msa;
+    c->idct_put      = ff_vp3_idct_put_msa;
+    c->idct_dc_add   = ff_vp3_idct_dc_add_msa;
+    c->v_loop_filter = ff_vp3_v_loop_filter_msa;
+    c->h_loop_filter = ff_vp3_h_loop_filter_msa;
+}
+#endif /* HAVE_MSA */
+
+#if HAVE_MMI
+static av_cold void vp3dsp_init_mmi(VP3DSPContext *c, int flags)
+{
+    c->put_no_rnd_pixels_l2 = ff_put_no_rnd_pixels_l2_mmi;
+
+    c->idct_add      = ff_vp3_idct_add_mmi;
+    c->idct_put      = ff_vp3_idct_put_mmi;
+    c->idct_dc_add   = ff_vp3_idct_dc_add_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags)
+{
+#if HAVE_MMI
+    vp3dsp_init_mmi(c, flags);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    vp3dsp_init_msa(c, flags);
+#endif /* HAVE_MSA */
+}
diff --git a/libavcodec/mips/vp3dsp_mips.h b/libavcodec/mips/vp3dsp_mips.h
new file mode 100644
index 0000000..4685a82
--- /dev/null
+++ b/libavcodec/mips/vp3dsp_mips.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP3DSP_MIPS_H
+#define AVCODEC_MIPS_VP3DSP_MIPS_H
+
+#include "libavcodec/vp3dsp.h"
+#include <string.h>
+
+void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
+                              int *bounding_values);
+void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
+                                 const uint8_t *src2, ptrdiff_t stride, int h);
+void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
+                              int *bounding_values);
+
+void ff_vp3_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_vp3_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_vp3_idct_dc_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_put_no_rnd_pixels_l2_mmi(uint8_t *dst, const uint8_t *src1,
+                                 const uint8_t *src2, ptrdiff_t stride, int h);
+
+#endif /* #ifndef AVCODEC_MIPS_VP3DSP_MIPS_H */
diff --git a/libavcodec/mips/vp8_idct_msa.c b/libavcodec/mips/vp8_idct_msa.c
new file mode 100644
index 0000000..11ac9ff
--- /dev/null
+++ b/libavcodec/mips/vp8_idct_msa.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)    \
+{                                                                    \
+    v4i32 a1_m, b1_m, c1_m, d1_m;                                    \
+    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                    \
+    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;                 \
+                                                                     \
+    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);     \
+    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                      \
+    a1_m = in0 + in2;                                                \
+    b1_m = in0 - in2;                                                \
+    c_tmp1_m = ((in1) * sinpi8_sqrt2_m) >> 16;                       \
+    c_tmp2_m = in3 + (((in3) * const_cospi8sqrt2minus1_m) >> 16);    \
+    c1_m = c_tmp1_m - c_tmp2_m;                                      \
+    d_tmp1_m = (in1) + (((in1) * const_cospi8sqrt2minus1_m) >> 16);  \
+    d_tmp2_m = ((in3) * sinpi8_sqrt2_m) >> 16;                       \
+    d1_m = d_tmp1_m + d_tmp2_m;                                      \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);     \
+}
+
+void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v4i32 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1;
+    v16i8 mask = { 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    /* load short vector elements of 4x4 block */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    /* transpose the block */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    /* transpose the block */
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(dst, stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+               res0, res1, res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+               res0, res1, res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
+    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+
+    memset(input, 0, 4 * 4 * sizeof(*input));
+}
+
+void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride)
+{
+    v8i16 vec;
+    v8i16 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1;
+    v16i8 mask = { 0, 2, 4, 6, 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    vec = __msa_fill_h(in_dc[0]);
+    vec = __msa_srari_h(vec, 3);
+    LD_SB4(dst, stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+               res0, res1, res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
+    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+
+    in_dc[0] = 0;
+}
+
+void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t input[16])
+{
+    int16_t *mb_dq_coeff = &block[0][0][0];
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
+    v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+
+    /* load short vector elements of 4x4 block */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
+    /* transpose the block */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
+    ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
+    SRA_4V(vt0, vt1, vt2, vt3, 3);
+    mb_dq_coeff[0] = __msa_copy_s_h((v8i16) vt0, 0);
+    mb_dq_coeff[16] = __msa_copy_s_h((v8i16) vt1, 0);
+    mb_dq_coeff[32] = __msa_copy_s_h((v8i16) vt2, 0);
+    mb_dq_coeff[48] = __msa_copy_s_h((v8i16) vt3, 0);
+    mb_dq_coeff[64] = __msa_copy_s_h((v8i16) vt0, 2);
+    mb_dq_coeff[80] = __msa_copy_s_h((v8i16) vt1, 2);
+    mb_dq_coeff[96] = __msa_copy_s_h((v8i16) vt2, 2);
+    mb_dq_coeff[112] = __msa_copy_s_h((v8i16) vt3, 2);
+    mb_dq_coeff[128] = __msa_copy_s_h((v8i16) vt0, 4);
+    mb_dq_coeff[144] = __msa_copy_s_h((v8i16) vt1, 4);
+    mb_dq_coeff[160] = __msa_copy_s_h((v8i16) vt2, 4);
+    mb_dq_coeff[176] = __msa_copy_s_h((v8i16) vt3, 4);
+    mb_dq_coeff[192] = __msa_copy_s_h((v8i16) vt0, 6);
+    mb_dq_coeff[208] = __msa_copy_s_h((v8i16) vt1, 6);
+    mb_dq_coeff[224] = __msa_copy_s_h((v8i16) vt2, 6);
+    mb_dq_coeff[240] = __msa_copy_s_h((v8i16) vt3, 6);
+
+    memset(input, 0, 4 * 4 * sizeof(int16_t));
+}
+
+void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16],
+                              ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 8, &block[2][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 12, &block[3][0], stride);
+}
+
+void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16],
+                               ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + stride * 4, &block[2][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + stride * 4 + 4, &block[3][0], stride);
+}
diff --git a/libavcodec/mips/vp8_lpf_msa.c b/libavcodec/mips/vp8_lpf_msa.c
new file mode 100644
index 0000000..3590961
--- /dev/null
+++ b/libavcodec/mips/vp8_lpf_msa.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)           \
+{                                                                \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                              \
+                                                                 \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                        \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                        \
+    p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);      \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);             \
+    mask = ((v16u8) mask <= b_limit);                            \
+}
+
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
+                                                                        \
+    p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80);                       \
+    p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80);                       \
+    q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80);                       \
+    q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80);                       \
+                                                                        \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
+                                                                        \
+    filt = filt & (v16i8) hev_in;                                       \
+                                                                        \
+    q0_sub_p0 = q0_m - p0_m;                                            \
+    filt_sign = __msa_clti_s_b(filt, 0);                                \
+                                                                        \
+    cnst3h = __msa_ldi_h(3);                                            \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);           \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);  \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                     \
+    filt_r += q0_sub_p0_r;                                              \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
+                                                                        \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);           \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);  \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                     \
+    filt_l += q0_sub_p0_l;                                              \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
+                                                                        \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);               \
+    filt = filt & (v16i8) mask_in;                                      \
+                                                                        \
+    cnst4b = __msa_ldi_b(4);                                            \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
+    filt1 >>= 3;                                                        \
+                                                                        \
+    cnst3b = __msa_ldi_b(3);                                            \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
+    filt2 >>= 3;                                                        \
+                                                                        \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
+    q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80);                       \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
+    p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80);                       \
+                                                                        \
+    filt = __msa_srari_b(filt1, 1);                                     \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                        \
+    filt = filt & (v16i8) hev_in;                                       \
+                                                                        \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
+    q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80);                       \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
+    p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80);                       \
+}
+
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)           \
+{                                                                   \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;        \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;            \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;         \
+                                                                    \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                       \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                       \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                       \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                       \
+                                                                    \
+    filt = __msa_subs_s_b(p1_m, q1_m);                              \
+                                                                    \
+    q0_sub_p0 = q0_m - p0_m;                                        \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+                                                                    \
+    cnst3h = __msa_ldi_h(3);                                        \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                          \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
+    filt_r += q0_sub_p0_r;                                          \
+    filt_r = __msa_sat_s_h(filt_r, 7);                              \
+                                                                    \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                          \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
+    filt_l += q0_sub_p0_l;                                          \
+    filt_l = __msa_sat_s_h(filt_l, 7);                              \
+                                                                    \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
+    filt = filt & (v16i8) (mask);                                   \
+                                                                    \
+    cnst4b = __msa_ldi_b(4);                                        \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                           \
+    filt1 >>= 3;                                                    \
+                                                                    \
+    cnst3b = __msa_ldi_b(3);                                        \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                           \
+    filt2 >>= 3;                                                    \
+                                                                    \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
+    q0_in = __msa_xori_b((v16u8) q0_m, 0x80);                       \
+    p0_in = __msa_xori_b((v16u8) p0_m, 0x80);                       \
+}
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)             \
+{                                                                   \
+    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                       \
+    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                          \
+    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;               \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;       \
+    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                        \
+                                                                    \
+    cnst3h = __msa_ldi_h(3);                                        \
+                                                                    \
+    p2_m = (v16i8) __msa_xori_b(p2, 0x80);                          \
+    p1_m = (v16i8) __msa_xori_b(p1, 0x80);                          \
+    p0_m = (v16i8) __msa_xori_b(p0, 0x80);                          \
+    q0_m = (v16i8) __msa_xori_b(q0, 0x80);                          \
+    q1_m = (v16i8) __msa_xori_b(q1, 0x80);                          \
+    q2_m = (v16i8) __msa_xori_b(q2, 0x80);                          \
+                                                                    \
+    filt = __msa_subs_s_b(p1_m, q1_m);                              \
+    q0_sub_p0 = q0_m - p0_m;                                        \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+                                                                    \
+    /* right part */                                                \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                          \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
+    filt_r = filt_r + q0_sub_p0_r;                                  \
+    filt_r = __msa_sat_s_h(filt_r, 7);                              \
+                                                                    \
+    /* left part */                                                 \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                          \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
+    filt_l = filt_l + q0_sub_p0_l;                                  \
+    filt_l = __msa_sat_s_h(filt_l, 7);                              \
+                                                                    \
+    /* combine left and right part */                               \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
+    filt = filt & (v16i8) mask;                                     \
+    filt2 = filt & (v16i8) hev;                                     \
+                                                                    \
+    /* filt_val &= ~hev */                                          \
+    hev = __msa_xori_b(hev, 0xff);                                  \
+    filt = filt & (v16i8) hev;                                      \
+    cnst4b = __msa_ldi_b(4);                                        \
+    filt1 = __msa_adds_s_b(filt2, cnst4b);                          \
+    filt1 >>= 3;                                                    \
+    cnst3b = __msa_ldi_b(3);                                        \
+    filt2 = __msa_adds_s_b(filt2, cnst3b);                          \
+    filt2 >>= 3;                                                    \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
+                                                                    \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                   \
+                                                                    \
+    cnst27h = __msa_ldi_h(27);                                      \
+    cnst63h = __msa_ldi_h(63);                                      \
+                                                                    \
+    /* right part */                                                \
+    u_r = filt_r * cnst27h;                                         \
+    u_r += cnst63h;                                                 \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+    /* left part */                                                 \
+    u_l = filt_l * cnst27h;                                         \
+    u_l += cnst63h;                                                 \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part */                               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q0_m = __msa_subs_s_b(q0_m, u);                                 \
+    q0 = __msa_xori_b((v16u8) q0_m, 0x80);                          \
+    p0_m = __msa_adds_s_b(p0_m, u);                                 \
+    p0 = __msa_xori_b((v16u8) p0_m, 0x80);                          \
+    cnst18h = __msa_ldi_h(18);                                      \
+    u_r = filt_r * cnst18h;                                         \
+    u_r += cnst63h;                                                 \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+                                                                    \
+    /* left part */                                                 \
+    u_l = filt_l * cnst18h;                                         \
+    u_l += cnst63h;                                                 \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part */                               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q1_m = __msa_subs_s_b(q1_m, u);                                 \
+    q1 = __msa_xori_b((v16u8) q1_m, 0x80);                          \
+    p1_m = __msa_adds_s_b(p1_m, u);                                 \
+    p1 = __msa_xori_b((v16u8) p1_m, 0x80);                          \
+    u_r = filt_r << 3;                                              \
+    u_r += filt_r + cnst63h;                                        \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+                                                                    \
+    /* left part */                                                 \
+    u_l = filt_l << 3;                                              \
+    u_l += filt_l + cnst63h;                                        \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part */                               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q2_m = __msa_subs_s_b(q2_m, u);                                 \
+    q2 = __msa_xori_b((v16u8) q2_m, 0x80);                          \
+    p2_m = __msa_adds_s_b(p2_m, u);                                 \
+    p2 = __msa_xori_b((v16u8) p2_m, 0x80);                          \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    /* absolute subtraction of pixel values */                     \
+    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
+    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
+    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
+    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
+    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
+    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
+    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
+    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
+    /* calculation of hev */                                       \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = (thresh_in) < (v16u8) flat_out;                      \
+    /* calculation of mask */                                      \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    mask_out = (limit_in) < (v16u8) mask_out;                      \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
+{                                                               \
+    uint16_t tmp0_h;                                            \
+    uint32_t tmp0_w;                                            \
+                                                                \
+    tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx);              \
+    tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx);              \
+    SW(tmp0_w, pdst);                                           \
+    SH(tmp0_h, pdst + stride);                                  \
+}
+
+void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    /* load vector elements */
+    temp_src = src - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    /* store vector elements */
+    temp_src = src - 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+    temp_src += (4 * pitch);
+    ST_UB2(q1, q2, temp_src, pitch);
+}
+
+void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
+                                 int thresh_in)
+{
+    uint8_t *temp_src;
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+    /* rht 8 element of p3 are u pixel and left 8 element of p3 are v pixel */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 0);
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    q2_d = __msa_copy_u_d((v2i64) q2, 0);
+    src_u -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 1);
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    q2_d = __msa_copy_u_d((v2i64) q2, 1);
+    src_v -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+
+void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    temp_src = src - 4;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    temp_src = src - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
+                                 int thresh_in)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    src_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
+
+    src_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
+                                     int b_limit_ptr)
+{
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    /* load vector elements */
+    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ST_UB2(p0, q0, (src - pitch), pitch);
+}
+
+void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
+                                     int b_limit_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    temp_src = src - 2;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+
+    src -= 1;
+    ST2x4_UB(tmp1, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp1, 4, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 4, src, pitch);
+    src += 4 * pitch;
+}
+
+void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
+                                       ptrdiff_t pitch, int b_limit_in,
+                                       int limit_in, int thresh_in)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+
+    src_u = src_u - (pitch << 2);
+    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    src_u += (5 * pitch);
+    src_v = src_v - (pitch << 2);
+    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    src_v += (5 * pitch);
+
+    /* right 8 element of p3 are u pixel and
+       left 8 element of p3 are v pixel */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
+
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
+}
+
+void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
+                                       ptrdiff_t pitch, int b_limit_in,
+                                       int limit_in, int thresh_in)
+{
+    uint8_t *temp_src_u, *temp_src_v;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+    tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1);
+    tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
+    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+
+    temp_src_u = src_u - 2;
+    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
+    temp_src_u += 4 * pitch;
+    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
+
+    temp_src_v = src_v - 2;
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
+    temp_src_v += 4 * pitch;
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+}
+
+void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
+                                      int32_t e, int32_t i, int32_t h)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    thresh = (v16u8) __msa_fill_b(h);
+    b_limit = (v16u8) __msa_fill_b(e);
+    limit = (v16u8) __msa_fill_b(i);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
+                                      int32_t e, int32_t i, int32_t h)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(h);
+    b_limit = (v16u8) __msa_fill_b(e);
+    limit = (v16u8) __msa_fill_b(i);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+    src -= 2;
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
diff --git a/libavcodec/mips/vp8_mc_msa.c b/libavcodec/mips/vp8_mc_msa.c
new file mode 100644
index 0000000..2bf0abd
--- /dev/null
+++ b/libavcodec/mips/vp8_mc_msa.c
@@ -0,0 +1,2332 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t subpel_filters_msa[7][8] = {
+    {-6, 123, 12, -1, 0, 0, 0, 0},
+    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-9, 93, 50, -6, 0, 0, 0, 0},
+    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
+    {-6, 50, 93, -9, 0, 0, 0, 0},
+    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-1, 12, 123, -6, 0, 0, 0, 0},
+};
+
+static const int8_t bilinear_filters_msa[7][2] = {
+    {112, 16},
+    {96, 32},
+    {80, 48},
+    {64, 64},
+    {48, 80},
+    {32, 96},
+    {16, 112}
+};
+
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
+                        filt_h0, filt_h1, filt_h2)                       \
+( {                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m;                                        \
+    v8i16 hz_out_m;                                                      \
+                                                                         \
+    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
+               vec0_m, vec1_m, vec2_m);                                  \
+    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
+                            filt_h0, filt_h1, filt_h2);                  \
+                                                                         \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                         \
+    hz_out_m;                                                            \
+} )
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, mask2,                \
+                                   filt0, filt1, filt2,                \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);  \
+    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);            \
+}
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2,                       \
+                                   filt0, filt1, filt2,                       \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
+( {                                                             \
+    v8i16 tmp0;                                                 \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
+( {                                                                    \
+    v16i8 vec0_m, vec1_m;                                              \
+    v8i16 hz_out_m;                                                    \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);  \
+    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
+                                                                       \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                             \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                             \
+                                                                       \
+    hz_out_m;                                                          \
+} )
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (4 * src_stride);
+
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SRARI_H4_SH(out4, out5, out6, out7, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
+               src10_r, src32_r, src21_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
+               src32_r, src43_r, src21_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
+               src32_l, src43_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
+                              filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
+                              filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
+                              filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
+                              filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
+                              filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
+                              filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
+                              filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
+                              filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 tmp0, tmp1;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (2 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src7, src8);
+        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (2 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out7;
+        out3 = out5;
+        out4 = out6;
+    }
+}
+
+
+void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (16 == height) {
+        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SRARI_H4_SH(out4, out5, out6, out7, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+
+        src2 = LD_SB(src);
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (1 + 1 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B2_128_SB(src3, src4);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out0, out1;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (1 + 1 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        vec0 = vec4;
+        vec2 = vec1;
+    }
+}
+
+void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 res0, res1, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (2 + 1 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+        XORI_B2_128_UB(res0, res1);
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+    v16u8 out0, out1;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= (2 + src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v16u8 out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+
+    src -= (1 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 vec0, vec1;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= (1 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out6;
+        out3 = out5;
+        out4 = out7;
+    }
+}
+
+void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    loop_cnt = (height >> 2) - 1;
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, 7);
+    SRARI_H4_UH(out4, out5, out6, out7, 7);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+
+    for (; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    src8 = LD_SB(src);
+    src += src_stride;
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else if (8 == height) {
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          const int8_t *filter_horiz,
+                                          const int8_t *filter_vert,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, 7);
+        SAT_UH2_UH(tmp3, tmp4, 7);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert, height);
+    }
+}
+
+void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                  uint8_t *src, ptrdiff_t src_stride,
+                                  int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                            uint8_t *src, ptrdiff_t src_stride,
+                            int height, int mx, int my)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                            uint8_t *src, ptrdiff_t src_stride,
+                            int height, int mx, int my)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3;
+
+    if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
diff --git a/libavcodec/mips/vp8dsp_init_mips.c b/libavcodec/mips/vp8dsp_init_mips.c
new file mode 100644
index 0000000..7fd8fb0
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_init_mips.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * VP8 compatible video decoder
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp_mips.h"
+
+#define VP8_MC_MIPS_FUNC(IDX, SIZE)            \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][1] =  \
+        ff_put_vp8_epel##SIZE##_h4_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][2] =  \
+        ff_put_vp8_epel##SIZE##_h6_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][0] =  \
+        ff_put_vp8_epel##SIZE##_v4_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][1] =  \
+        ff_put_vp8_epel##SIZE##_h4v4_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][2] =  \
+        ff_put_vp8_epel##SIZE##_h6v4_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][0] =  \
+        ff_put_vp8_epel##SIZE##_v6_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][1] =  \
+        ff_put_vp8_epel##SIZE##_h4v6_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][2] =  \
+        ff_put_vp8_epel##SIZE##_h6v6_msa
+
+#define VP8_BILINEAR_MC_MIPS_FUNC(IDX, SIZE)       \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] =  \
+        ff_put_vp8_bilinear##SIZE##_h_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] =  \
+        ff_put_vp8_bilinear##SIZE##_h_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] =  \
+        ff_put_vp8_bilinear##SIZE##_v_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] =  \
+        ff_put_vp8_bilinear##SIZE##_v_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa
+
+#define VP8_MC_MIPS_COPY(IDX, SIZE)                \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][0] =      \
+        ff_put_vp8_pixels##SIZE##_msa;             \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] =  \
+        ff_put_vp8_pixels##SIZE##_msa;
+
+#if HAVE_MSA
+static av_cold void vp8dsp_init_msa(VP8DSPContext *dsp)
+{
+    dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_msa;
+    dsp->vp8_idct_add = ff_vp8_idct_add_msa;
+    dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_msa;
+    dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_msa;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_msa;
+
+    VP8_MC_MIPS_FUNC(0, 16);
+    VP8_MC_MIPS_FUNC(1, 8);
+    VP8_MC_MIPS_FUNC(2, 4);
+
+    VP8_BILINEAR_MC_MIPS_FUNC(0, 16);
+    VP8_BILINEAR_MC_MIPS_FUNC(1, 8);
+    VP8_BILINEAR_MC_MIPS_FUNC(2, 4);
+
+    VP8_MC_MIPS_COPY(0, 16);
+    VP8_MC_MIPS_COPY(1, 8);
+
+    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_msa;
+    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_msa;
+    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_msa;
+    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_msa;
+
+    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_msa;
+    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_msa;
+    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_msa;
+    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_msa;
+
+    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_msa;
+    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void vp8dsp_init_mmi(VP8DSPContext *dsp)
+{
+    dsp->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmi;
+    dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_mmi;
+    dsp->vp8_idct_add       = ff_vp8_idct_add_mmi;
+    dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmi;
+    dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmi;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmi;
+
+    dsp->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_mmi;
+    dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_mmi;
+
+    dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_mmi;
+    dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_mmi;
+
+    dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_mmi;
+    dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_mmi;
+
+    dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilinear16_h_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilinear16_h_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilinear16_v_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilinear16_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilinear16_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilinear16_v_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilinear16_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilinear16_hv_mmi;
+
+    dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilinear8_h_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilinear8_h_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilinear8_v_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilinear8_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilinear8_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilinear8_v_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilinear8_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilinear8_hv_mmi;
+
+    dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilinear4_h_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilinear4_h_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilinear4_v_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilinear4_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilinear4_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilinear4_v_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilinear4_hv_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilinear4_hv_mmi;
+
+    dsp->put_vp8_epel_pixels_tab[0][0][0]     = ff_put_vp8_pixels16_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmi;
+
+    dsp->put_vp8_epel_pixels_tab[1][0][0]     = ff_put_vp8_pixels8_mmi;
+    dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmi;
+
+    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_mmi;
+    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_mmi;
+    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mmi;
+    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mmi;
+
+    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_mmi;
+    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_mmi;
+    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmi;
+    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmi;
+
+    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmi;
+    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_vp8dsp_init_mips(VP8DSPContext *dsp)
+{
+#if HAVE_MMI
+    vp8dsp_init_mmi(dsp);
+#endif /* HAVE_MMI */
+#if HAVE_MSA
+    vp8dsp_init_msa(dsp);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/vp8dsp_mips.h b/libavcodec/mips/vp8dsp_mips.h
new file mode 100644
index 0000000..07666ab
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_mips.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP8DSP_MIPS_H
+#define AVCODEC_MIPS_VP8DSP_MIPS_H
+
+#include "libavutil/mem.h"
+#include "libavcodec/vp8dsp.h"
+#include "libavcodec/mathops.h"
+#include "constants.h"
+
+void ff_put_vp8_pixels4_msa(uint8_t *dst, ptrdiff_t dststride,
+                            uint8_t *src, ptrdiff_t srcstride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dststride,
+                            uint8_t *src, ptrdiff_t srcstride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int x, int y);
+
+void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+
+void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                  uint8_t *src, ptrdiff_t srcstride,
+                                  int h, int mx, int my);
+
+void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+
+void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+
+/* loop filter */
+void ff_vp8_h_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_v_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                       ptrdiff_t stride,
+                                       int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                       ptrdiff_t stride,
+                                       int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim);
+void ff_vp8_v_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim);
+
+/* Idct functions */
+void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_idct_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16],
+                               ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16],
+                              ptrdiff_t stride);
+
+void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
+        ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
+        ptrdiff_t stride);
+
+void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int x, int y);
+void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int x, int y);
+void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int x, int y);
+
+void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+
+void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+
+void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+
+void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+
+void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+
+void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int h, int mx, int my);
+
+// loop filter applied to edges between macroblocks
+void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
+        int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
+        int flim_I, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh);
+
+// loop filter applied to inner macroblock edges
+void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh);
+
+void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim);
+void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim);
+
+#endif  // #ifndef AVCODEC_MIPS_VP8DSP_MIPS_H
diff --git a/libavcodec/mips/vp8dsp_mmi.c b/libavcodec/mips/vp8dsp_mmi.c
new file mode 100644
index 0000000..bd80aa1
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_mmi.c
@@ -0,0 +1,3326 @@
+/*
+ * Loongson SIMD optimized vp8dsp
+ *
+ * Copyright (c) 2016 Loongson Technology Corporation Limited
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp8dsp_mips.h"
+#include "constants.h"
+#include "libavutil/mips/mmiutils.h"
+
+#define DECLARE_DOUBLE_1            double db_1
+#define DECLARE_DOUBLE_2            double db_2
+#define DECLARE_UINT32_T            uint32_t  it_1
+#define RESTRICT_ASM_DOUBLE_1       [db_1]"=&f"(db_1)
+#define RESTRICT_ASM_DOUBLE_2       [db_2]"=&f"(db_2)
+#define RESTRICT_ASM_UINT32_T       [it_1]"=&r"(it_1)
+
+#define MMI_PCMPGTUB(dst, src1, src2)                                       \
+        "pcmpeqb    %[db_1],    "#src1",        "#src2"             \n\t"   \
+        "pmaxub     %[db_2],    "#src1",        "#src2"             \n\t"   \
+        "pcmpeqb    %[db_2],    %[db_2],        "#src1"             \n\t"   \
+        "xor        "#dst",     %[db_2],        %[db_1]             \n\t"
+
+#define MMI_BTOH(dst_l, dst_r, src)                                         \
+        "xor        %[db_1],    %[db_1],        %[db_1]             \n\t"   \
+        "pcmpgtb    %[db_2],    %[db_1],        "#src"              \n\t"   \
+        "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
+        "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"
+
+#define MMI_VP8_LOOP_FILTER                                                 \
+        /* Calculation of hev */                                            \
+        "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "pasubub    %[ftmp0],   %[p1],          %[p0]               \n\t"   \
+        "pasubub    %[ftmp1],   %[q1],          %[q0]               \n\t"   \
+        "pmaxub     %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"   \
+        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                            \
+        /* Calculation of mask */                                           \
+        "pasubub    %[ftmp1],   %[p0],          %[q0]               \n\t"   \
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        "pasubub    %[ftmp2],   %[p1],          %[q1]               \n\t"   \
+        "li         %[tmp0],    0x09                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp3]                            \n\t"   \
+        PSRLB_MMI(%[ftmp2],  %[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp2])     \
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
+        "dmtc1      %[e],       %[ftmp3]                            \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                           \
+        "pmaxub     %[mask],    %[mask],        %[ftmp0]            \n\t"   \
+        "pasubub    %[ftmp1],   %[p3],          %[p2]               \n\t"   \
+        "pasubub    %[ftmp2],   %[p2],          %[p1]               \n\t"   \
+        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
+        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
+        "pasubub    %[ftmp1],   %[q3],          %[q2]               \n\t"   \
+        "pasubub    %[ftmp2],   %[q2],          %[q1]               \n\t"   \
+        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
+        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
+        "dmtc1      %[i],       %[ftmp3]                            \n\t"   \
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                            \
+        "pcmpeqw    %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
+        "xor        %[mask],    %[mask],        %[ftmp3]            \n\t"   \
+        /* VP8_MBFILTER */                                                  \
+        "li         %[tmp0],    0x80808080                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp7]                            \n\t"   \
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"   \
+        "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"   \
+        "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
+        "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
+        "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
+        "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
+        "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
+        "psubsb     %[ftmp4],   %[p1],          %[q1]               \n\t"   \
+        "psubb      %[ftmp5],   %[q0],          %[p0]               \n\t"   \
+        MMI_BTOH(%[ftmp1],  %[ftmp0],  %[ftmp5])                            \
+        MMI_BTOH(%[ftmp3],  %[ftmp2],  %[ftmp4])                            \
+        /* Right part */                                                    \
+        "paddh      %[ftmp5],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"   \
+        "paddh      %[ftmp0],   %[ftmp2],       %[ftmp0]            \n\t"   \
+        /* Left part */                                                     \
+        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"   \
+        "paddh      %[ftmp1],   %[ftmp3],       %[ftmp1]            \n\t"   \
+        /* Combine left and right part */                                   \
+        "packsshb   %[ftmp1],   %[ftmp0],       %[ftmp1]            \n\t"   \
+        "and        %[ftmp1],   %[ftmp1],       %[mask]             \n\t"   \
+        "and        %[ftmp2],   %[ftmp1],       %[hev]              \n\t"   \
+        "li         %[tmp0],    0x04040404                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "paddsb     %[ftmp3],   %[ftmp2],       %[ftmp0]            \n\t"   \
+        "li         %[tmp0],    0x0B                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp4]                            \n\t"   \
+        PSRAB_MMI(%[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp6],  %[ftmp3])     \
+        "li         %[tmp0],    0x03030303                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "paddsb     %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"   \
+        "li         %[tmp0],    0x0B                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
+        PSRAB_MMI(%[ftmp4],  %[ftmp2],  %[ftmp5],  %[ftmp6],  %[ftmp4])     \
+        "psubsb     %[q0],      %[q0],          %[ftmp3]            \n\t"   \
+        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
+        /* filt_val &= ~hev */                                              \
+        "pcmpeqw    %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        "xor        %[hev],     %[hev],         %[ftmp0]            \n\t"   \
+        "and        %[ftmp1],   %[ftmp1],       %[hev]              \n\t"   \
+        MMI_BTOH(%[ftmp5],  %[ftmp6],  %[ftmp1])                            \
+        "li         %[tmp0],    0x07                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
+        "li         %[tmp0],    0x001b001b                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        "li         %[tmp0],    0x003f003f                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
+        /* Right part */                                                    \
+        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+        /* Left part */                                                     \
+        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
+        /* Combine left and right part */                                   \
+        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "psubsb     %[q0],      %[q0],          %[ftmp4]            \n\t"   \
+        "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
+        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
+        "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
+        "li         %[tmp0],    0x00120012                          \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
+        /* Right part */                                                    \
+        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+        /* Left part */                                                     \
+        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
+        /* Combine left and right part */                                   \
+        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "psubsb     %[q1],      %[q1],          %[ftmp4]            \n\t"   \
+        "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
+        "paddsb     %[p1],      %[p1],          %[ftmp4]            \n\t"   \
+        "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
+        "li         %[tmp0],    0x03                                \n\t"   \
+        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
+        /* Right part */                                                    \
+        "psllh      %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"   \
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+        /* Left part */                                                     \
+        "psllh      %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"   \
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
+        /* Combine left and right part */                                   \
+        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "psubsb     %[q2],      %[q2],          %[ftmp4]            \n\t"   \
+        "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
+        "paddsb     %[p2],      %[p2],          %[ftmp4]            \n\t"   \
+        "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"
+
+#define PUT_VP8_EPEL4_H6_MMI(src, dst)                                      \
+        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, -0x02)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
+        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, 0x03)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
+                                                                            \
+        MMI_SWC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL4_H4_MMI(src, dst)                                      \
+        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
+                                                                            \
+        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        MMI_SWC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride)                     \
+        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
+                                                                            \
+        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
+        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
+                                                                            \
+        MMI_SWC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride)                     \
+        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
+                                                                            \
+        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
+                                                                            \
+        MMI_SWC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL8_H6_MMI(src, dst)                                      \
+        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, -0x02)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
+        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, 0x03)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
+                                                                            \
+        MMI_SDC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL8_H4_MMI(src, dst)                                      \
+        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
+                                                                            \
+        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
+        MMI_SDC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride)                     \
+        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
+                                                                            \
+        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
+        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
+                                                                            \
+        MMI_SDC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride)                     \
+        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
+                                                                            \
+        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
+        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
+        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
+        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
+                                                                            \
+        MMI_SDC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_BILINEAR8_H_MMI(src, dst)                                   \
+        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[a]                \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[a]                \n\t"   \
+                                                                            \
+        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[b]                \n\t"   \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
+                                                                            \
+        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
+        MMI_SDC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_BILINEAR4_H_MMI(src, dst)                                   \
+        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[a]                \n\t"   \
+                                                                            \
+        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
+                                                                            \
+        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        MMI_SWC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride)                    \
+        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp5],   %[ftmp2],       %[c]                \n\t"   \
+        "pmullh     %[ftmp6],   %[ftmp3],       %[c]                \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
+        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp3],       %[d]                \n\t"   \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
+                                                                            \
+        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
+        MMI_SDC1(%[ftmp1], dst, 0x00)
+
+
+#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride)                    \
+        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp3],   %[ftmp2],       %[c]                \n\t"   \
+                                                                            \
+        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
+        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
+        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
+                                                                            \
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
+                                                                            \
+        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
+        MMI_SWC1(%[ftmp1], dst, 0x00)
+
+
+DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
+   {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
+    0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
+
+   {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
+    0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
+
+   {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
+    0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
+
+   {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
+    0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
+
+   {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
+    0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
+
+   {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
+    0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
+
+   {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
+    0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
+};
+
+#if 0
+#define FILTER_6TAP(src, F, stride)                                           \
+    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
+        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
+        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
+
+#define FILTER_4TAP(src, F, stride)                                           \
+    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
+        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
+
+static const uint8_t subpel_filters[7][6] = {
+    { 0,  6, 123,  12,  1, 0 },
+    { 2, 11, 108,  36,  8, 1 },
+    { 0,  9,  93,  50,  6, 0 },
+    { 3, 16,  77,  77, 16, 3 },
+    { 0,  6,  50,  93,  9, 0 },
+    { 1,  8,  36, 108, 11, 2 },
+    { 0,  1,  12, 123,  6, 0 },
+};
+
+#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
+#define MUL_35468(a)  (((a) * 35468) >> 16)
+#endif
+
+#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
+static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
+        ptrdiff_t stride)
+{
+    int av_unused p1 = p[-2 * stride];
+    int av_unused p0 = p[-1 * stride];
+    int av_unused q0 = p[ 0 * stride];
+    int av_unused q1 = p[ 1 * stride];
+    int a, f1, f2;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+
+    a = 3 * (q0 - p0);
+    a += clip_int8(p1 - q1);
+    a = clip_int8(a);
+
+    // We deviate from the spec here with c(a+3) >> 3
+    // since that's what libvpx does.
+    f1 = FFMIN(a + 4, 127) >> 3;
+    f2 = FFMIN(a + 3, 127) >> 3;
+
+    // Despite what the spec says, we do need to clamp here to
+    // be bitexact with libvpx.
+    p[-1 * stride] = cm[p0 + f2];
+    p[ 0 * stride] = cm[q0 - f1];
+}
+
+static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
+        ptrdiff_t stride)
+{
+    int av_unused p1 = p[-2 * stride];
+    int av_unused p0 = p[-1 * stride];
+    int av_unused q0 = p[ 0 * stride];
+    int av_unused q1 = p[ 1 * stride];
+    int a, f1, f2;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+
+    a = 3 * (q0 - p0);
+    a = clip_int8(a);
+
+    // We deviate from the spec here with c(a+3) >> 3
+    // since that's what libvpx does.
+    f1 = FFMIN(a + 4, 127) >> 3;
+    f2 = FFMIN(a + 3, 127) >> 3;
+
+    // Despite what the spec says, we do need to clamp here to
+    // be bitexact with libvpx.
+    p[-1 * stride] = cm[p0 + f2];
+    p[ 0 * stride] = cm[q0 - f1];
+    a              = (f1 + 1) >> 1;
+    p[-2 * stride] = cm[p1 + a];
+    p[ 1 * stride] = cm[q1 - a];
+}
+
+static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
+        int flim)
+{
+    int av_unused p1 = p[-2 * stride];
+    int av_unused p0 = p[-1 * stride];
+    int av_unused q0 = p[ 0 * stride];
+    int av_unused q1 = p[ 1 * stride];
+
+    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
+}
+
+static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
+{
+    int av_unused p1 = p[-2 * stride];
+    int av_unused p0 = p[-1 * stride];
+    int av_unused q0 = p[ 0 * stride];
+    int av_unused q1 = p[ 1 * stride];
+
+    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
+}
+
+static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
+{
+    int a0, a1, a2, w;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+
+    int av_unused p2 = p[-3 * stride];
+    int av_unused p1 = p[-2 * stride];
+    int av_unused p0 = p[-1 * stride];
+    int av_unused q0 = p[ 0 * stride];
+    int av_unused q1 = p[ 1 * stride];
+    int av_unused q2 = p[ 2 * stride];
+
+    w = clip_int8(p1 - q1);
+    w = clip_int8(w + 3 * (q0 - p0));
+
+    a0 = (27 * w + 63) >> 7;
+    a1 = (18 * w + 63) >> 7;
+    a2 =  (9 * w + 63) >> 7;
+
+    p[-3 * stride] = cm[p2 + a2];
+    p[-2 * stride] = cm[p1 + a1];
+    p[-1 * stride] = cm[p0 + a0];
+    p[ 0 * stride] = cm[q0 - a0];
+    p[ 1 * stride] = cm[q1 - a1];
+    p[ 2 * stride] = cm[q2 - a2];
+}
+
+static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
+        int E, int I)
+{
+    int av_unused p3 = p[-4 * stride];
+    int av_unused p2 = p[-3 * stride];
+    int av_unused p1 = p[-2 * stride];
+    int av_unused p0 = p[-1 * stride];
+    int av_unused q0 = p[ 0 * stride];
+    int av_unused q1 = p[ 1 * stride];
+    int av_unused q2 = p[ 2 * stride];
+    int av_unused q3 = p[ 3 * stride];
+
+    return vp8_simple_limit(p, stride, E) &&
+           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
+           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
+           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
+}
+
+static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
+{
+    double ftmp[18];
+    uint32_t tmp[1];
+    DECLARE_DOUBLE_1;
+    DECLARE_DOUBLE_2;
+    DECLARE_UINT32_T;
+    __asm__ volatile(
+        /* Get data from dst */
+        "gsldlc1    %[q0],      0x07(%[dst])                      \n\t"
+        "gsldrc1    %[q0],      0x00(%[dst])                      \n\t"
+        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gsldlc1    %[p0],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p0],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[p1],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p1],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[p2],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p2],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[p3],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[p3],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gsldlc1    %[q1],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[q1],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[q2],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[q2],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gsldlc1    %[q3],      0x07(%[tmp0])                     \n\t"
+        "gsldrc1    %[q3],      0x00(%[tmp0])                     \n\t"
+        MMI_VP8_LOOP_FILTER
+        /* Move to dst */
+        "gssdlc1    %[q0],      0x07(%[dst])                      \n\t"
+        "gssdrc1    %[q0],      0x00(%[dst])                      \n\t"
+        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gssdlc1    %[p0],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[p0],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gssdlc1    %[p1],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[p1],      0x00(%[tmp0])                     \n\t"
+        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gssdlc1    %[p2],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[p2],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
+        "gssdlc1    %[q1],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[q1],      0x00(%[tmp0])                     \n\t"
+        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
+        "gssdlc1    %[q2],      0x07(%[tmp0])                     \n\t"
+        "gssdrc1    %[q2],      0x00(%[tmp0])                     \n\t"
+        : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
+          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
+          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
+          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
+          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
+          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
+          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
+          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
+          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
+          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
+          RESTRICT_ASM_UINT32_T
+        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
+          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
+{
+    int i;
+
+    for (i = 0; i < 8; i++)
+        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
+            int hv = hev(dst + i * 1, stride, hev_thresh);
+            if (hv)
+                vp8_filter_common_is4tap(dst + i * 1, stride);
+            else
+                vp8_filter_common_isnot4tap(dst + i * 1, stride);
+        }
+}
+
+static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
+{
+    double ftmp[18];
+    uint32_t tmp[1];
+    DECLARE_DOUBLE_1;
+    DECLARE_DOUBLE_2;
+    DECLARE_UINT32_T;
+    __asm__ volatile(
+        /* Get data from dst */
+        "gsldlc1    %[p3],        0x03(%[dst])                    \n\t"
+        "gsldrc1    %[p3],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[tmp0],     %[dst],           %[stride]     \n\t"
+        "gsldlc1    %[p2],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[p2],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[p1],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[p1],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[p0],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[p0],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q0],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q0],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q1],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q1],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q2],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q2],        -0x04(%[tmp0])                  \n\t"
+        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
+        "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
+        "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
+        /* Matrix transpose */
+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
+                     %[q0], %[q1], %[q2], %[q3],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
+        MMI_VP8_LOOP_FILTER
+        /* Matrix transpose */
+        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
+                     %[q0], %[q1], %[q2], %[q3],
+                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
+        /* Move to dst */
+        "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[p2],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p2],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[p1],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p1],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[p0],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[p0],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q0],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q0],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q1],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q1],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q2],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q2],        -0x04(%[dst])                   \n\t"
+        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
+        "gssdlc1    %[q3],        0x03(%[dst])                    \n\t"
+        "gssdrc1    %[q3],        -0x04(%[dst])                   \n\t"
+        : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
+          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
+          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
+          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
+          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
+          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
+          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
+          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
+          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
+          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
+          RESTRICT_ASM_UINT32_T
+        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
+          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
+{
+    int i;
+
+    for (i = 0; i < 8; i++)
+        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
+            int hv = hev(dst + i * stride, 1, hev_thresh);
+            if (hv)
+                vp8_filter_common_is4tap(dst + i * stride, 1);
+            else
+                vp8_filter_common_isnot4tap(dst + i * stride, 1);
+        }
+}
+
+void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
+{
+#if 1
+    double ftmp[8];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        MMI_LDC1(%[ftmp0], %[dc], 0x00)
+        MMI_LDC1(%[ftmp1], %[dc], 0x08)
+        MMI_LDC1(%[ftmp2], %[dc], 0x10)
+        MMI_LDC1(%[ftmp3], %[dc], 0x18)
+        "paddsh     %[ftmp4],   %[ftmp0],       %[ftmp3]            \n\t"
+        "psubsh     %[ftmp5],   %[ftmp0],       %[ftmp3]            \n\t"
+        "paddsh     %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+        "psubsh     %[ftmp7],   %[ftmp1],       %[ftmp2]            \n\t"
+        "paddsh     %[ftmp0],   %[ftmp4],       %[ftmp6]            \n\t"
+        "paddsh     %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+        "psubsh     %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
+        "psubsh     %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
+        MMI_SDC1(%[ftmp0], %[dc], 0x00)
+        MMI_SDC1(%[ftmp1], %[dc], 0x08)
+        MMI_SDC1(%[ftmp2], %[dc], 0x10)
+        MMI_SDC1(%[ftmp3], %[dc], 0x18)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),
+          RESTRICT_ASM_ALL64
+          [ftmp7]"=&f"(ftmp[7])
+        : [dc]"r"((uint8_t*)dc)
+        : "memory"
+    );
+
+    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
+    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
+    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
+    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
+
+    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
+    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
+    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
+    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
+
+    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
+    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
+    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
+    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
+
+    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
+    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
+    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
+    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        MMI_SDC1(%[ftmp0], %[dc], 0x00)
+        MMI_SDC1(%[ftmp0], %[dc], 0x08)
+        MMI_SDC1(%[ftmp0], %[dc], 0x10)
+        MMI_SDC1(%[ftmp0], %[dc], 0x18)
+        : RESTRICT_ASM_ALL64
+          [ftmp0]"=&f"(ftmp[0])
+        : [dc]"r"((uint8_t *)dc)
+        : "memory"
+    );
+#else
+    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
+
+    t00 = dc[0] + dc[12];
+    t10 = dc[1] + dc[13];
+    t20 = dc[2] + dc[14];
+    t30 = dc[3] + dc[15];
+
+    t03 = dc[0] - dc[12];
+    t13 = dc[1] - dc[13];
+    t23 = dc[2] - dc[14];
+    t33 = dc[3] - dc[15];
+
+    t01 = dc[4] + dc[ 8];
+    t11 = dc[5] + dc[ 9];
+    t21 = dc[6] + dc[10];
+    t31 = dc[7] + dc[11];
+
+    t02 = dc[4] - dc[ 8];
+    t12 = dc[5] - dc[ 9];
+    t22 = dc[6] - dc[10];
+    t32 = dc[7] - dc[11];
+
+    dc[ 0] = t00 + t01;
+    dc[ 1] = t10 + t11;
+    dc[ 2] = t20 + t21;
+    dc[ 3] = t30 + t31;
+
+    dc[ 4] = t03 + t02;
+    dc[ 5] = t13 + t12;
+    dc[ 6] = t23 + t22;
+    dc[ 7] = t33 + t32;
+
+    dc[ 8] = t00 - t01;
+    dc[ 9] = t10 - t11;
+    dc[10] = t20 - t21;
+    dc[11] = t30 - t31;
+
+    dc[12] = t03 - t02;
+    dc[13] = t13 - t12;
+    dc[14] = t23 - t22;
+    dc[15] = t33 - t32;
+
+    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
+    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
+    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
+    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
+
+    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
+    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
+    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
+    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
+
+    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
+    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
+    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
+    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
+
+    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
+    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
+    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
+    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
+
+    AV_ZERO64(dc + 0);
+    AV_ZERO64(dc + 4);
+    AV_ZERO64(dc + 8);
+    AV_ZERO64(dc + 12);
+#endif
+}
+
+void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
+{
+    int val = (dc[0] + 3) >> 3;
+
+    dc[0] = 0;
+
+    block[0][0][0] = val;
+    block[0][1][0] = val;
+    block[0][2][0] = val;
+    block[0][3][0] = val;
+    block[1][0][0] = val;
+    block[1][1][0] = val;
+    block[1][2][0] = val;
+    block[1][3][0] = val;
+    block[2][0][0] = val;
+    block[2][1][0] = val;
+    block[2][2][0] = val;
+    block[2][3][0] = val;
+    block[3][0][0] = val;
+    block[3][1][0] = val;
+    block[3][2][0] = val;
+    block[3][3][0] = val;
+}
+
+void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
+{
+#if 1
+    DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
+    DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
+    double ftmp[12];
+    uint32_t tmp[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp2], %[block], 0x08)
+        MMI_LDC1(%[ftmp3], %[block], 0x10)
+        MMI_LDC1(%[ftmp4], %[block], 0x18)
+
+        "li         %[tmp0],    0x02                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+
+        // block[0...3] + block[8...11]
+        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
+        // block[0...3] - block[8...11]
+        "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
+        // MUL_35468(block[12...15])
+        "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
+        "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+        // MUL_35468(block[4...7])
+        "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
+        "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+        // MUL_20091(block[4...7]
+        "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
+        "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
+        // MUL_20091(block[12...15])
+        "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
+        "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"
+
+        // tmp[0 4  8 12]
+        "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
+        // tmp[1 5  9 13]
+        "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
+        // tmp[2 6 10 14]
+        "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
+        // tmp[3 7 11 15]
+        "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
+
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp0], %[block], 0x08)
+        MMI_SDC1(%[ftmp0], %[block], 0x10)
+        MMI_SDC1(%[ftmp0], %[block], 0x18)
+
+        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
+
+        // t[0 4  8 12]
+        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
+        // t[1 5  9 13]
+        "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
+        // t[2 6 10 14]
+        "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
+        "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+        "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
+        "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
+        // t[3 7 11 15]
+        "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
+        "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+        "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
+        "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"
+
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+        "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_4]          \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+        "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_4]          \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+        "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+        "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_4]          \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
+                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
+
+        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
+        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
+        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
+        MMI_LWC1(%[ftmp8], %[dst3], 0x00)
+
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"
+
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
+
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [tmp0]"=&r"(tmp[0])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [block]"r"(block),                [ff_pw_4]"f"(ff_pw_4),
+          [ff_ph_4e7b]"f"(ff_ph_4e7b),      [ff_ph_22a3]"f"(ff_ph_22a3)
+        : "memory"
+    );
+#else
+    int i, t0, t1, t2, t3;
+    int16_t tmp[16];
+
+    for (i = 0; i < 4; i++) {
+        t0 = block[0 + i] + block[8 + i];
+        t1 = block[0 + i] - block[8 + i];
+        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
+        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
+        block[ 0 + i] = 0;
+        block[ 4 + i] = 0;
+        block[ 8 + i] = 0;
+        block[12 + i] = 0;
+
+        tmp[i * 4 + 0] = t0 + t3;
+        tmp[i * 4 + 1] = t1 + t2;
+        tmp[i * 4 + 2] = t1 - t2;
+        tmp[i * 4 + 3] = t0 - t3;
+    }
+
+    for (i = 0; i < 4; i++) {
+        t0 = tmp[0 + i] + tmp[8 + i];
+        t1 = tmp[0 + i] - tmp[8 + i];
+        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
+        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
+
+        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
+        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
+        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
+        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
+        dst   += stride;
+    }
+#endif
+}
+
+void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
+{
+#if 1
+    int dc = (block[0] + 4) >> 3;
+    double ftmp[6];
+    DECLARE_VAR_LOW32;
+
+    block[0] = 0;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "mtc1       %[dc],      %[ftmp5]                            \n\t"
+        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_LOW32
+          [ftmp5]"=&f"(ftmp[5])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dc]"r"(dc)
+        : "memory"
+    );
+#else
+    int i, dc = (block[0] + 4) >> 3;
+
+    block[0] = 0;
+
+    for (i = 0; i < 4; i++) {
+        dst[0] = av_clip_uint8(dst[0] + dc);
+        dst[1] = av_clip_uint8(dst[1] + dc);
+        dst[2] = av_clip_uint8(dst[2] + dc);
+        dst[3] = av_clip_uint8(dst[3] + dc);
+        dst   += stride;
+    }
+#endif
+}
+
+void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
+        ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
+    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
+    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
+    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
+}
+
+void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
+        ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
+    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
+    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
+    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
+}
+
+// loop filter applied to edges between macroblocks
+void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
+        int flim_I, int hev_thresh)
+{
+    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
+    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
+}
+
+void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
+        int flim_I, int hev_thresh)
+{
+    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
+    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
+                           hev_thresh);
+}
+
+void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh)
+{
+    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
+    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
+}
+
+void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh)
+{
+    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
+    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
+}
+
+// loop filter applied to inner macroblock edges
+void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
+            int hv = hev(dst + i * 1, stride, hev_thresh);
+            if (hv)
+                vp8_filter_common_is4tap(dst + i * 1, stride);
+            else
+                vp8_filter_common_isnot4tap(dst + i * 1, stride);
+        }
+}
+
+void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
+        int flim_E, int flim_I, int hev_thresh)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
+            int hv = hev(dst + i * stride, 1, hev_thresh);
+            if (hv)
+                vp8_filter_common_is4tap(dst + i * stride, 1);
+            else
+                vp8_filter_common_isnot4tap(dst + i * stride, 1);
+        }
+}
+
+void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
+{
+    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
+    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
+}
+
+void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
+        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
+{
+    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
+    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
+}
+
+void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+        if (vp8_simple_limit(dst + i, stride, flim))
+            vp8_filter_common_is4tap(dst + i, stride);
+}
+
+void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+        if (vp8_simple_limit(dst + i * stride, 1, flim))
+            vp8_filter_common_is4tap(dst + i * stride, 1);
+}
+
+void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int x, int y)
+{
+#if 1
+    double ftmp[2];
+    uint64_t tmp[2];
+    mips_reg addr[2];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                         \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
+        MMI_ULDC1(%[ftmp0], %[src], 0x00)
+        "ldl        %[tmp0],    0x0f(%[src])                        \n\t"
+        "ldr        %[tmp0],    0x08(%[src])                        \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        "ldl        %[tmp1],    0x0f(%[addr0])                      \n\t"
+        "ldr        %[tmp1],    0x08(%[addr0])                      \n\t"
+        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        "sdl        %[tmp0],    0x0f(%[dst])                        \n\t"
+        "sdr        %[tmp0],    0x08(%[dst])                        \n\t"
+        "addiu      %[h],       %[h],           -0x02               \n\t"
+        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
+        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
+        "sdl        %[tmp1],    0x0f(%[addr1])                      \n\t"
+        "sdr        %[tmp1],    0x08(%[addr1])                      \n\t"
+        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dststride]"r"((mips_reg)dststride),
+          [srcstride]"r"((mips_reg)srcstride)
+        : "memory"
+    );
+#else
+    int i;
+
+    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
+        memcpy(dst, src, 16);
+#endif
+}
+
+void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int x, int y)
+{
+#if 1
+    double ftmp[1];
+    uint64_t tmp[1];
+    mips_reg addr[2];
+    DECLARE_VAR_ALL64;
+
+    __asm__ volatile (
+        "1:                                                         \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
+        MMI_ULDC1(%[ftmp0], %[src], 0x00)
+        "ldl        %[tmp0],    0x07(%[addr0])                      \n\t"
+        "ldr        %[tmp0],    0x00(%[addr0])                      \n\t"
+        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        "addiu      %[h],       %[h],           -0x02               \n\t"
+        "sdl        %[tmp0],    0x07(%[addr1])                      \n\t"
+        "sdr        %[tmp0],    0x00(%[addr1])                      \n\t"
+        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dststride]"r"((mips_reg)dststride),
+          [srcstride]"r"((mips_reg)srcstride)
+        : "memory"
+    );
+#else
+    int i;
+
+    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
+        memcpy(dst, src, 8);
+#endif
+}
+
+void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int x, int y)
+{
+#if 1
+    double ftmp[1];
+    uint64_t tmp[1];
+    mips_reg addr[2];
+    DECLARE_VAR_LOW32;
+
+    __asm__ volatile (
+        "1:                                                         \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
+        MMI_LWC1(%[ftmp0], %[src], 0x00)
+        "lwl        %[tmp0],    0x03(%[addr0])                      \n\t"
+        "lwr        %[tmp0],    0x00(%[addr0])                      \n\t"
+        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        "addiu      %[h],       %[h],           -0x02               \n\t"
+        "swl        %[tmp0],    0x03(%[addr1])                      \n\t"
+        "swr        %[tmp0],    0x00(%[addr1])                      \n\t"
+        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dststride]"r"((mips_reg)dststride),
+          [srcstride]"r"((mips_reg)srcstride)
+        : "memory"
+    );
+#else
+    int i;
+
+    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
+        memcpy(dst, src, 4);
+#endif
+}
+
+void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    mips_reg src1, dst1;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
+    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
+    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
+    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
+    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
+    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
+    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
+    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
+
+    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
+    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
+    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
+    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
+    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
+    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
+    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
+    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        // 0 - 7
+        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
+        PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
+        PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
+        // 8 - 15
+        PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
+          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_4TAP(src, filter, 1);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
+    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
+    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
+    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
+    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
+    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
+    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
+    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
+          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_4TAP(src, filter, 1);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
+    double ftmp[6];
+    uint32_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    /*
+    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
+    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
+    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
+    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
+          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_4TAP(src, filter, 1);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    mips_reg src1, dst1;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
+    dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
+    dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
+    dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
+    dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
+    dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
+    dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
+    dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
+
+    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
+    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
+    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
+    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
+    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
+    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
+    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
+    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        // 0 - 7
+        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
+        PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
+        PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
+        // 8 - 15
+        PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
+          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
+          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_6TAP(src, filter, 1);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
+    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
+    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
+    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
+    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
+    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
+    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
+    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
+          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
+          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_6TAP(src, filter, 1);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
+    double ftmp[6];
+    uint32_t tmp[1];
+    DECLARE_VAR_LOW32;
+
+    /*
+    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
+    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
+    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
+    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
+          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
+          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_6TAP(src, filter, 1);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[my - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    mips_reg src0, src1, dst0;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
+    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
+    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
+    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
+    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
+    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
+    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
+    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
+
+    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
+    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
+    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
+    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
+    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
+    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
+    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
+    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        // 0 - 7
+        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
+        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
+        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
+        // 8 - 15
+        PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst], %[srcstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
+          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[my - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_4TAP(src, filter, srcstride);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[my - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    mips_reg src1;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
+    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
+    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
+    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
+    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
+    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
+    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
+    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
+          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[my - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_4TAP(src, filter, srcstride);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[my - 1];
+    double ftmp[6];
+    uint32_t tmp[1];
+    mips_reg src1;
+    DECLARE_VAR_LOW32;
+
+    /*
+    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
+    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
+    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
+    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
+          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[my - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_4TAP(src, filter, srcstride);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[my - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    mips_reg src0, src1, dst0;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
+    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
+    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
+    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
+    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
+    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
+    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
+    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
+
+    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
+    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
+    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
+    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
+    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
+    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
+    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
+    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        // 0 - 7
+        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
+        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
+        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
+        // 8 - 15
+        PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
+          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
+          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[my - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_6TAP(src, filter, srcstride);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[my - 1];
+    double ftmp[9];
+    uint32_t tmp[1];
+    mips_reg src1;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
+    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
+    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
+    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
+    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
+    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
+    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
+    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
+          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
+          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[my - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_6TAP(src, filter, srcstride);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    const uint64_t *filter = fourtap_subpel_filters[my - 1];
+    double ftmp[6];
+    uint32_t tmp[1];
+    mips_reg src1;
+    DECLARE_VAR_LOW32;
+
+    /*
+    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
+    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
+    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
+    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x07                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [ff_pw_64]"f"(ff_pw_64),
+          [srcstride]"r"((mips_reg)srcstride),
+          [dststride]"r"((mips_reg)dststride),
+          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
+          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
+          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
+        : "memory"
+    );
+#else
+    const uint8_t *filter = subpel_filters[my - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_6TAP(src, filter, srcstride);
+        dst += dststride;
+        src += srcstride;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
+    tmp = tmp_array + 16;
+    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[560];
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+
+    for (y = 0; y < h + 3; y++) {
+        for (x = 0; x < 16; x++)
+            tmp[x] = FILTER_4TAP(src, filter, 1);
+        tmp += 16;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 16;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_4TAP(tmp, filter, 16);
+        dst += dststride;
+        tmp += 16;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
+    tmp = tmp_array + 8;
+    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[152];
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+
+    for (y = 0; y < h + 3; y++) {
+        for (x = 0; x < 8; x++)
+            tmp[x] = FILTER_4TAP(src, filter, 1);
+        tmp += 8;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 8;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_4TAP(tmp, filter, 8);
+        dst += dststride;
+        tmp += 8;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
+    tmp = tmp_array + 4;
+    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[44];
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+
+    for (y = 0; y < h + 3; y++) {
+        for (x = 0; x < 4; x++)
+            tmp[x] = FILTER_4TAP(src, filter, 1);
+        tmp += 4;
+        src += srcstride;
+    }
+    tmp    = tmp_array + 4;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_4TAP(tmp, filter, 4);
+        dst += dststride;
+        tmp += 4;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+    ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
+    tmp    = tmp_array + 32;
+    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[592];
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+
+    for (y = 0; y < h + 5; y++) {
+        for (x = 0; x < 16; x++)
+            tmp[x] = FILTER_4TAP(src, filter, 1);
+        tmp += 16;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 32;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_6TAP(tmp, filter, 16);
+        dst += dststride;
+        tmp += 16;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+    ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
+    tmp    = tmp_array + 16;
+    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[168];
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+
+    for (y = 0; y < h + 5; y++) {
+        for (x = 0; x < 8; x++)
+            tmp[x] = FILTER_4TAP(src, filter, 1);
+        tmp += 8;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 16;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_6TAP(tmp, filter, 8);
+        dst += dststride;
+        tmp += 8;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+    ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
+    tmp    = tmp_array + 8;
+    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[52];
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+
+    for (y = 0; y < h + 5; y++) {
+        for (x = 0; x < 4; x++)
+            tmp[x] = FILTER_4TAP(src, filter, 1);
+        tmp += 4;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 8;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_6TAP(tmp, filter, 4);
+        dst += dststride;
+        tmp += 4;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
+    tmp    = tmp_array + 16;
+    ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[560];
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+
+    for (y = 0; y < h + 3; y++) {
+        for (x = 0; x < 16; x++)
+            tmp[x] = FILTER_6TAP(src, filter, 1);
+        tmp += 16;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 16;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_4TAP(tmp, filter, 16);
+        dst += dststride;
+        tmp += 16;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
+    tmp    = tmp_array + 8;
+    ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[152];
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+
+    for (y = 0; y < h + 3; y++) {
+        for (x = 0; x < 8; x++)
+            tmp[x] = FILTER_6TAP(src, filter, 1);
+        tmp += 8;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 8;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_4TAP(tmp, filter, 8);
+        dst += dststride;
+        tmp += 8;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
+    tmp    = tmp_array + 4;
+    ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[44];
+    uint8_t *tmp = tmp_array;
+
+    src -= srcstride;
+
+    for (y = 0; y < h + 3; y++) {
+        for (x = 0; x < 4; x++)
+            tmp[x] = FILTER_6TAP(src, filter, 1);
+        tmp += 4;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 4;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_4TAP(tmp, filter, 4);
+        dst += dststride;
+        tmp += 4;
+    }
+#endif
+}
+
+void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+    ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
+    tmp    = tmp_array + 32;
+    ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[592];
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+
+    for (y = 0; y < h + 5; y++) {
+        for (x = 0; x < 16; x++)
+            tmp[x] = FILTER_6TAP(src, filter, 1);
+        tmp += 16;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 32;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = FILTER_6TAP(tmp, filter, 16);
+        dst += dststride;
+        tmp += 16;
+    }
+#endif
+}
+
+void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+    ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
+    tmp    = tmp_array + 16;
+    ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[168];
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+
+    for (y = 0; y < h + 5; y++) {
+        for (x = 0; x < 8; x++)
+            tmp[x] = FILTER_6TAP(src, filter, 1);
+        tmp += 8;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 16;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = FILTER_6TAP(tmp, filter, 8);
+        dst += dststride;
+        tmp += 8;
+    }
+#endif
+}
+
+void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
+        ptrdiff_t srcstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+    ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
+    tmp    = tmp_array + 8;
+    ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
+#else
+    const uint8_t *filter = subpel_filters[mx - 1];
+    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
+    int x, y;
+    uint8_t tmp_array[52];
+    uint8_t *tmp = tmp_array;
+
+    src -= 2 * srcstride;
+
+    for (y = 0; y < h + 5; y++) {
+        for (x = 0; x < 4; x++)
+            tmp[x] = FILTER_6TAP(src, filter, 1);
+        tmp += 4;
+        src += srcstride;
+    }
+
+    tmp    = tmp_array + 8;
+    filter = subpel_filters[my - 1];
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = FILTER_6TAP(tmp, filter, 4);
+        dst += dststride;
+        tmp += 4;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    int a = 8 - mx, b = mx;
+    double ftmp[7];
+    uint32_t tmp[1];
+    mips_reg dst0, src0;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
+    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
+    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
+    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
+    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
+    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
+    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
+    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
+
+    dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
+    dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
+    dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
+    dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
+    dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
+    dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
+    dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
+    dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
+        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
+
+        "1:                                                         \n\t"
+        // 0 - 7
+        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
+        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
+        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
+        // 8 - 15
+        PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dst0]"=&r"(dst0),            [src0]"=&r"(src0),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),              [src]"+&r"(src),
+          [a]"+&f"(a),                  [b]"+&f"(b)
+        : [sstride]"r"((mips_reg)sstride),
+          [dstride]"r"((mips_reg)dstride),
+          [ff_pw_4]"f"(ff_pw_4)
+        : "memory"
+    );
+#else
+    int a = 8 - mx, b = mx;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
+        dst += dstride;
+        src += sstride;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    int c = 8 - my, d = my;
+    double ftmp[7];
+    uint32_t tmp[1];
+    mips_reg src0, src1, dst0;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
+    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
+    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
+    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
+    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
+    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
+    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
+    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
+        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
+
+        "1:                                                         \n\t"
+        // 0 - 7
+        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
+        PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
+        PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
+        // 8 - 15
+        PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src0]"=&r"(src0),            [dst0]"=&r"(dst0),
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),              [src]"+&r"(src),
+          [c]"+&f"(c),                  [d]"+&f"(d)
+        : [sstride]"r"((mips_reg)sstride),
+          [dstride]"r"((mips_reg)dstride),
+          [ff_pw_4]"f"(ff_pw_4)
+        : "memory"
+    );
+#else
+    int c = 8 - my, d = my;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
+        dst += dstride;
+        src += sstride;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
+    uint8_t *tmp = tmp_array;
+
+    ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
+    ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
+#else
+    int a = 8 - mx, b = mx;
+    int c = 8 - my, d = my;
+    int x, y;
+    uint8_t tmp_array[528];
+    uint8_t *tmp = tmp_array;
+
+    for (y = 0; y < h + 1; y++) {
+        for (x = 0; x < 16; x++)
+            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
+        tmp += 16;
+        src += sstride;
+    }
+
+    tmp = tmp_array;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 16; x++)
+            dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
+        dst += dstride;
+        tmp += 16;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    int a = 8 - mx, b = mx;
+    double ftmp[7];
+    uint32_t tmp[1];
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
+    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
+    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
+    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
+    dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
+    dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
+    dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
+    dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
+        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),              [src]"+&r"(src),
+          [a]"+&f"(a),                  [b]"+&f"(b)
+        : [sstride]"r"((mips_reg)sstride),
+          [dstride]"r"((mips_reg)dstride),
+          [ff_pw_4]"f"(ff_pw_4)
+        : "memory"
+    );
+#else
+    int a = 8 - mx, b = mx;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
+        dst += dstride;
+        src += sstride;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    int c = 8 - my, d = my;
+    double ftmp[7];
+    uint32_t tmp[1];
+    mips_reg src1;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
+    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
+    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
+    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
+    dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
+    dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
+    dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
+    dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
+        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),              [src]"+&r"(src),
+          [c]"+&f"(c),                  [d]"+&f"(d)
+        : [sstride]"r"((mips_reg)sstride),
+          [dstride]"r"((mips_reg)dstride),
+          [ff_pw_4]"f"(ff_pw_4)
+        : "memory"
+    );
+#else
+    int c = 8 - my, d = my;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
+        dst += dstride;
+        src += sstride;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
+    uint8_t *tmp = tmp_array;
+
+    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
+    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
+#else
+    int a = 8 - mx, b = mx;
+    int c = 8 - my, d = my;
+    int x, y;
+    uint8_t tmp_array[136];
+    uint8_t *tmp = tmp_array;
+
+    for (y = 0; y < h + 1; y++) {
+        for (x = 0; x < 8; x++)
+            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
+        tmp += 8;
+        src += sstride;
+    }
+
+    tmp = tmp_array;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 8; x++)
+            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
+        dst += dstride;
+        tmp += 8;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    int a = 8 - mx, b = mx;
+    double ftmp[5];
+    uint32_t tmp[1];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
+    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
+    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
+    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+        "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
+        "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),              [src]"+&r"(src),
+          [a]"+&f"(a),                  [b]"+&f"(b)
+        : [sstride]"r"((mips_reg)sstride),
+          [dstride]"r"((mips_reg)dstride),
+          [ff_pw_4]"f"(ff_pw_4)
+        : "memory"
+    );
+#else
+    int a = 8 - mx, b = mx;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
+        dst += dstride;
+        src += sstride;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    int c = 8 - my, d = my;
+    double ftmp[7];
+    uint32_t tmp[1];
+    mips_reg src1;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+
+    /*
+    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
+    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
+    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
+    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
+    */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+        "li         %[tmp0],    0x03                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+        "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
+        "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
+
+        "1:                                                         \n\t"
+        PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
+
+        "addiu      %[h],       %[h],           -0x01               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
+        "bnez       %[h],       1b                                  \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [src1]"=&r"(src1),
+          [h]"+&r"(h),
+          [dst]"+&r"(dst),              [src]"+&r"(src),
+          [c]"+&f"(c),                  [d]"+&f"(d)
+        : [sstride]"r"((mips_reg)sstride),
+          [dstride]"r"((mips_reg)dstride),
+          [ff_pw_4]"f"(ff_pw_4)
+        : "memory"
+    );
+#else
+    int c = 8 - my, d = my;
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
+        dst += dstride;
+        src += sstride;
+    }
+#endif
+}
+
+void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
+        ptrdiff_t sstride, int h, int mx, int my)
+{
+#if 1
+    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
+    uint8_t *tmp = tmp_array;
+
+    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
+    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
+#else
+    int a = 8 - mx, b = mx;
+    int c = 8 - my, d = my;
+    int x, y;
+    uint8_t tmp_array[36];
+    uint8_t *tmp = tmp_array;
+
+    for (y = 0; y < h + 1; y++) {
+        for (x = 0; x < 4; x++)
+            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
+        tmp += 4;
+        src += sstride;
+    }
+
+    tmp = tmp_array;
+
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
+        dst += dstride;
+        tmp += 4;
+    }
+#endif
+}
diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c
new file mode 100644
index 0000000..bd762f2
--- /dev/null
+++ b/libavcodec/mips/vp9_idct_msa.c
@@ -0,0 +1,2160 @@
+/*
+ * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define VP9_DCT_CONST_BITS   14
+#define ROUND_POWER_OF_TWO(value, n)  (((value) + (1 << ((n) - 1))) >> (n))
+
+static const int32_t cospi_1_64 = 16364;
+static const int32_t cospi_2_64 = 16305;
+static const int32_t cospi_3_64 = 16207;
+static const int32_t cospi_4_64 = 16069;
+static const int32_t cospi_5_64 = 15893;
+static const int32_t cospi_6_64 = 15679;
+static const int32_t cospi_7_64 = 15426;
+static const int32_t cospi_8_64 = 15137;
+static const int32_t cospi_9_64 = 14811;
+static const int32_t cospi_10_64 = 14449;
+static const int32_t cospi_11_64 = 14053;
+static const int32_t cospi_12_64 = 13623;
+static const int32_t cospi_13_64 = 13160;
+static const int32_t cospi_14_64 = 12665;
+static const int32_t cospi_15_64 = 12140;
+static const int32_t cospi_16_64 = 11585;
+static const int32_t cospi_17_64 = 11003;
+static const int32_t cospi_18_64 = 10394;
+static const int32_t cospi_19_64 = 9760;
+static const int32_t cospi_20_64 = 9102;
+static const int32_t cospi_21_64 = 8423;
+static const int32_t cospi_22_64 = 7723;
+static const int32_t cospi_23_64 = 7005;
+static const int32_t cospi_24_64 = 6270;
+static const int32_t cospi_25_64 = 5520;
+static const int32_t cospi_26_64 = 4756;
+static const int32_t cospi_27_64 = 3981;
+static const int32_t cospi_28_64 = 3196;
+static const int32_t cospi_29_64 = 2404;
+static const int32_t cospi_30_64 = 1606;
+static const int32_t cospi_31_64 = 804;
+
+//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const int32_t sinpi_1_9 = 5283;
+static const int32_t sinpi_2_9 = 9929;
+static const int32_t sinpi_3_9 = 13377;
+static const int32_t sinpi_4_9 = 15212;
+
+#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)  \
+{                                                                  \
+    v8i16 k0_m = __msa_fill_h(cnst0);                              \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                  \
+                                                                   \
+    s0_m = (v4i32) __msa_fill_h(cnst1);                            \
+    k0_m = __msa_ilvev_h((v8i16) s0_m, k0_m);                      \
+                                                                   \
+    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                        \
+    ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                           \
+    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);               \
+    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
+    out0 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
+                                                                   \
+    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);               \
+    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
+    out1 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
+}
+
+#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                                      dst0, dst1, dst2, dst3)              \
+{                                                                          \
+    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                               \
+    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                               \
+                                                                           \
+    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                    \
+                tp0_m, tp2_m, tp3_m, tp4_m);                               \
+    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                    \
+                tp5_m, tp6_m, tp7_m, tp8_m);                               \
+    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);   \
+    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);   \
+    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, VP9_DCT_CONST_BITS);           \
+    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, VP9_DCT_CONST_BITS);           \
+    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,    \
+                dst0, dst1, dst2, dst3);                                   \
+}
+
+#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)          \
+( {                                                       \
+    v8i16 dst_m;                                          \
+    v4i32 tp0_m, tp1_m;                                   \
+                                                          \
+    DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);        \
+    SRARI_W2_SW(tp1_m, tp0_m, VP9_DCT_CONST_BITS);        \
+    dst_m = __msa_pckev_h((v8i16) tp1_m, (v8i16) tp0_m);  \
+                                                          \
+    dst_m;                                                \
+} )
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                 \
+                  out0, out1, out2, out3, out4, out5, out6, out7)         \
+{                                                                         \
+    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
+    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
+    v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
+        cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };             \
+    v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
+        -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                  \
+                                                                          \
+    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
+    cnst2_m = -cnst0_m;                                                   \
+    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
+    cnst4_m = -cnst2_m;                                                   \
+    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                          \
+    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst1_m, cnst2_m, cnst3_m, in7, in0,        \
+                              in4, in3);                                  \
+                                                                          \
+    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
+    cnst2_m = -cnst0_m;                                                   \
+    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
+    cnst4_m = -cnst2_m;                                                   \
+    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                          \
+    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+                                                                          \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst1_m, cnst2_m, cnst3_m, in5, in2,        \
+                              in6, in1);                                  \
+    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
+    out7 = -s0_m;                                                         \
+    out0 = s1_m;                                                          \
+                                                                          \
+    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
+                 cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
+                                                                          \
+    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
+    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+    cnst1_m = cnst0_m;                                                    \
+                                                                          \
+    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst2_m, cnst3_m, cnst1_m, out1, out6,      \
+                              s0_m, s1_m);                                \
+                                                                          \
+    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
+    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+                                                                          \
+    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
+    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);            \
+    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);            \
+    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);            \
+    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);            \
+                                                                          \
+    out1 = -out1;                                                         \
+    out3 = -out3;                                                         \
+    out5 = -out5;                                                         \
+}
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1)                        \
+{                                                                         \
+    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                             \
+    v8i16 madd_s0_m, madd_s1_m;                                           \
+                                                                          \
+    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                            \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,               \
+                c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);      \
+    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, VP9_DCT_CONST_BITS);  \
+    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);          \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,       \
+                    out0, out1, out2, out3)                               \
+{                                                                         \
+    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
+                                                                          \
+    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
+    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+                cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+                m4_m, m5_m, tmp3_m, tmp2_m);                              \
+    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
+    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+                cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+                m4_m, m5_m, tmp3_m, tmp2_m);                              \
+    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
+    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
+}
+
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h)   \
+( {                                      \
+    v8i16 out0_m, r0_m, r1_m;            \
+                                         \
+    r0_m = __msa_fill_h(c0_h);           \
+    r1_m = __msa_fill_h(c1_h);           \
+    out0_m = __msa_ilvev_h(r1_m, r0_m);  \
+                                         \
+    out0_m;                              \
+} )
+
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)  \
+{                                                                 \
+    uint8_t *dst_m = (uint8_t *) (dst);                           \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                         \
+    v16i8 tmp0_m, tmp1_m;                                         \
+    v16i8 zero_m = { 0 };                                         \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                         \
+                                                                  \
+    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);    \
+    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,    \
+               zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);   \
+    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,      \
+         res0_m, res1_m, res2_m, res3_m);                         \
+    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);               \
+    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);  \
+    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                  \
+}
+
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
+{                                                                     \
+    v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
+    v8i16 step0_m, step1_m;                                           \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+                                                                      \
+    c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+    c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+    step0_m = __msa_ilvr_h(in2, in0);                                 \
+    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
+                                                                      \
+    c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+    c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+    step1_m = __msa_ilvr_h(in3, in1);                                 \
+    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);  \
+                                                                      \
+    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
+    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                  \
+    BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m,                       \
+                (v8i16) tmp2_m, (v8i16) tmp3_m,                       \
+                out0, out1, out2, out3);                              \
+}
+
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)      \
+{                                                                     \
+    v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
+    v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
+    v8i16 zero_m = { 0 };                                             \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+    v4i32 int0_m, int1_m, int2_m, int3_m;                             \
+    v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
+        sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                \
+        -sinpi_4_9 };                                                 \
+                                                                      \
+    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
+    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
+    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
+    int0_m = tmp2_m + tmp1_m;                                         \
+                                                                      \
+    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
+    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+    int1_m = tmp0_m + tmp1_m;                                         \
+                                                                      \
+    c0_m = __msa_splati_h(mask_m, 6);                                 \
+    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
+    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+    int2_m = tmp0_m + tmp1_m;                                         \
+                                                                      \
+    c0_m = __msa_splati_h(mask_m, 6);                                 \
+    c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
+                                                                      \
+    res0_m = __msa_ilvr_h((in1), (in3));                              \
+    tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
+    int3_m = tmp2_m + tmp0_m;                                         \
+                                                                      \
+    res0_m = __msa_ilvr_h((in2), (in3));                              \
+    c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
+                                                                      \
+    tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
+    res1_m = __msa_ilvr_h((in0), (in2));                              \
+    c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
+                                                                      \
+    tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
+    int3_m += tmp2_m;                                                 \
+    int3_m += tmp3_m;                                                 \
+                                                                      \
+    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, VP9_DCT_CONST_BITS);  \
+    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
+    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
+}
+
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                           out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                           \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                   \
+    v8i16 zero_m = { 0 };                                                   \
+                                                                            \
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                      \
+               tmp0_n, tmp1_n, tmp2_n, tmp3_n);                             \
+    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                            \
+    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                            \
+                                                                            \
+    out0 = (v8i16) __msa_ilvr_d((v2i64) tmp1_m, (v2i64) tmp0_m);            \
+    out1 = (v8i16) __msa_ilvl_d((v2i64) tmp1_m, (v2i64) tmp0_m);            \
+    out2 = (v8i16) __msa_ilvr_d((v2i64) tmp3_m, (v2i64) tmp2_m);            \
+    out3 = (v8i16) __msa_ilvl_d((v2i64) tmp3_m, (v2i64) tmp2_m);            \
+                                                                            \
+    out4 = zero_m;                                                          \
+    out5 = zero_m;                                                          \
+    out6 = zero_m;                                                          \
+    out7 = zero_m;                                                          \
+}
+
+static void vp9_idct4x4_1_add_msa(int16_t *input, uint8_t *dst,
+                                  int32_t dst_stride)
+{
+    int16_t out;
+    v8i16 vec;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO(out, 4);
+    vec = __msa_fill_h(out);
+    input[0] = 0;
+
+    ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
+}
+
+static void vp9_idct4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    ST_SH2(zero, zero, input, 8);
+    /* rows */
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+static void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    ST_SH2(zero, zero, input, 8);
+    /* rows */
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+static void vp9_iadst_idct_4x4_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    ST_SH2(zero, zero, input, 8);
+    /* cols */
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+static void vp9_idct_iadst_4x4_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    ST_SH2(zero, zero, input, 8);
+    /* cols */
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)     \
+( {                                                    \
+    v8i16 c0_m, c1_m;                                  \
+                                                       \
+    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m);  \
+    c0_m = __msa_ilvev_h(c1_m, c0_m);                  \
+                                                       \
+    c0_m;                                              \
+} )
+
+/* multiply and add macro */
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,          \
+                 out0, out1, out2, out3)                                  \
+{                                                                         \
+    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+                                                                          \
+    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                        \
+    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                        \
+    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
+                cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);      \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);              \
+    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
+                cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);      \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);              \
+}
+
+/* idct 8x8 macro */
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,                 \
+                       out0, out1, out2, out3, out4, out5, out6, out7)         \
+{                                                                              \
+    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;              \
+    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;              \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
+       cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };                 \
+                                                                               \
+    k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
+    k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
+    k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
+    k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
+    VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
+    SUB2(in1, in3, in7, in5, res0_m, res1_m);                                  \
+    k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
+    k1_m = __msa_splati_h(mask_m, 4);                                          \
+                                                                               \
+    ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                               \
+    DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,        \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                               \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);           \
+    tp4_m = in1 + in3;                                                         \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                 \
+    tp7_m = in7 + in5;                                                         \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
+    VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
+             in0, in4, in2, in6);                                              \
+    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);               \
+    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
+                out0, out1, out2, out3, out4, out5, out6, out7);               \
+}
+
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,              \
+                        out0, out1, out2, out3, out4, out5, out6, out7)      \
+{                                                                            \
+    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                    \
+    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                \
+    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;          \
+    v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
+        cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
+    v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
+        cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };    \
+    v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
+        -cospi_16_64, 0, 0, 0, 0 };                                          \
+                                                                             \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                \
+    ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                \
+    ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                     \
+    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                         \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                \
+    ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                \
+    ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                     \
+    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                         \
+    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                     \
+    BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);        \
+    k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                \
+    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                   \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                \
+    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                          \
+    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                           \
+    k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                \
+    ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                m0_m, m1_m, m2_m, m3_m);                                     \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                          \
+    ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                       \
+    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+                m0_m, m1_m, m2_m, m3_m);                                     \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                          \
+                                                                             \
+    out1 = -in1;                                                             \
+    out3 = -in3;                                                             \
+    out5 = -in5;                                                             \
+    out7 = -in7;                                                             \
+}
+
+static void vp9_idct8x8_1_add_msa(int16_t *input, uint8_t *dst,
+                                  int32_t dst_stride)
+{
+    int16_t out;
+    int32_t val;
+    v8i16 vec;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    val = ROUND_POWER_OF_TWO(out, 5);
+    vec = __msa_fill_h(val);
+    input[0] = 0;
+
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+}
+
+static void vp9_idct8x8_12_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+
+    /* stage1 */
+    ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+    k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+    DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+    SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+    PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+    BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+    /* stage2 */
+    ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+    DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+    SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+    PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+    BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+    /* stage3 */
+    s0 = __msa_ilvr_h(s6, s5);
+
+    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+    DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+    SRARI_W2_SW(tmp0, tmp1, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+    /* stage4 */
+    BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+static void vp9_idct8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 cnst0, cnst1, cnst2, cnst3, cnst4;
+    v8i16 temp0, temp1, temp2, temp3, s0, s1;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
+
+    /* 1D adst8x8 */
+    VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+              in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    cnst0 = __msa_fill_h(cospi_2_64);
+    cnst1 = __msa_fill_h(cospi_30_64);
+    cnst2 = -cnst0;
+    ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1);
+    cnst2 = __msa_fill_h(cospi_18_64);
+    cnst3 = __msa_fill_h(cospi_14_64);
+    cnst4 = -cnst2;
+    ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3);
+
+    ILVRL_H2_SH(in0, in7, temp1, temp0);
+    ILVRL_H2_SH(in4, in3, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2,
+                              cnst3, in7, in0, in4, in3);
+
+    cnst0 = __msa_fill_h(cospi_10_64);
+    cnst1 = __msa_fill_h(cospi_22_64);
+    cnst2 = -cnst0;
+    ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1);
+    cnst2 = __msa_fill_h(cospi_26_64);
+    cnst3 = __msa_fill_h(cospi_6_64);
+    cnst4 = -cnst2;
+    ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3);
+
+    ILVRL_H2_SH(in2, in5, temp1, temp0);
+    ILVRL_H2_SH(in6, in1, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2,
+                              cnst3, in5, in2, in6, in1);
+    BUTTERFLY_4(in7, in0, in2, in5, s1, s0, in2, in5);
+    out7 = -s0;
+    out0 = s1;
+    SRARI_H2_SH(out0, out7, 5);
+    dst0 = LD_UB(dst + 0 * dst_stride);
+    dst7 = LD_UB(dst + 7 * dst_stride);
+
+    res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0);
+    res0 += out0;
+    res0 = CLIP_SH_0_255(res0);
+    res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+    ST8x1_UB(res0, dst);
+
+    res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7);
+    res7 += out7;
+    res7 = CLIP_SH_0_255(res7);
+    res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
+    ST8x1_UB(res7, dst + 7 * dst_stride);
+
+    cnst1 = __msa_fill_h(cospi_24_64);
+    cnst0 = __msa_fill_h(cospi_8_64);
+    cnst3 = -cnst1;
+    cnst2 = -cnst0;
+
+    ILVEV_H2_SH(cnst3, cnst0, cnst1, cnst2, cnst3, cnst2);
+    cnst0 = __msa_ilvev_h(cnst1, cnst0);
+    cnst1 = cnst0;
+
+    ILVRL_H2_SH(in4, in3, temp1, temp0);
+    ILVRL_H2_SH(in6, in1, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst2, cnst3,
+                              cnst1, out1, out6, s0, s1);
+    out1 = -out1;
+    SRARI_H2_SH(out1, out6, 5);
+    dst1 = LD_UB(dst + 1 * dst_stride);
+    dst6 = LD_UB(dst + 6 * dst_stride);
+    ILVR_B2_SH(zero, dst1, zero, dst6, res1, res6);
+    ADD2(res1, out1, res6, out6, res1, res6);
+    CLIP_SH2_0_255(res1, res6);
+    PCKEV_B2_SH(res1, res1, res6, res6, res1, res6);
+    ST8x1_UB(res1, dst + dst_stride);
+    ST8x1_UB(res6, dst + 6 * dst_stride);
+
+    cnst0 = __msa_fill_h(cospi_16_64);
+    cnst1 = -cnst0;
+    cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+    ILVRL_H2_SH(in2, in5, temp1, temp0);
+    ILVRL_H2_SH(s0, s1, temp3, temp2);
+    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst0);
+    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst1);
+    out3 = -out3;
+    SRARI_H2_SH(out3, out4, 5);
+    dst3 = LD_UB(dst + 3 * dst_stride);
+    dst4 = LD_UB(dst + 4 * dst_stride);
+    ILVR_B2_SH(zero, dst3, zero, dst4, res3, res4);
+    ADD2(res3, out3, res4, out4, res3, res4);
+    CLIP_SH2_0_255(res3, res4);
+    PCKEV_B2_SH(res3, res3, res4, res4, res3, res4);
+    ST8x1_UB(res3, dst + 3 * dst_stride);
+    ST8x1_UB(res4, dst + 4 * dst_stride);
+
+    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0);
+    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1);
+    out5 = -out5;
+    SRARI_H2_SH(out2, out5, 5);
+    dst2 = LD_UB(dst + 2 * dst_stride);
+    dst5 = LD_UB(dst + 5 * dst_stride);
+    ILVR_B2_SH(zero, dst2, zero, dst5, res2, res5);
+    ADD2(res2, out2, res5, out5, res2, res5);
+    CLIP_SH2_0_255(res2, res5);
+    PCKEV_B2_SH(res2, res2, res5, res5, res2, res5);
+    ST8x1_UB(res2, dst + 2 * dst_stride);
+    ST8x1_UB(res5, dst + 5 * dst_stride);
+}
+
+static void vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in1, in6, in3, in4, in5, in2, in7, in0);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
+    /* 1D idct8x8 */
+    VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                    in0, in1, in2, in3, in4, in5, in6, in7);
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+static void vp9_idct_iadst_8x8_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 8);
+
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in1, in6, in3, in4, in5, in2, in7, in0);
+    /* 1D idct8x8 */
+    VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                    in0, in1, in2, in3, in4, in5, in6, in7);
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,          \
+                         r9, r10, r11, r12, r13, r14, r15,            \
+                         out0, out1, out2, out3, out4, out5,          \
+                         out6, out7, out8, out9, out10, out11,        \
+                         out12, out13, out14, out15)                  \
+{                                                                     \
+    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;             \
+    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;       \
+    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;             \
+    v8i16 h8_m, h9_m, h10_m, h11_m;                                   \
+    v8i16 k0_m, k1_m, k2_m, k3_m;                                     \
+                                                                      \
+    /* stage 1 */                                                     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);              \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);             \
+    VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,              \
+                g0_m, g1_m, g2_m, g3_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);              \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);             \
+    VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,             \
+                g4_m, g5_m, g6_m, g7_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);               \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);              \
+    VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,             \
+                g8_m, g9_m, g10_m, g11_m);                            \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);             \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);               \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);              \
+    VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,              \
+                g12_m, g13_m, g14_m, g15_m);                          \
+                                                                      \
+    /* stage 2 */                                                     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);              \
+    VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,      \
+                h0_m, h1_m, h2_m, h3_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);             \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);             \
+    VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,     \
+                h4_m, h5_m, h6_m, h7_m);                              \
+    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);    \
+    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
+                h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
+                                                                      \
+    /* stage 3 */                                                     \
+    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);  \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);              \
+    VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,       \
+                out4, out6, out5, out7);                              \
+    VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,       \
+                out12, out14, out13, out15);                          \
+                                                                      \
+    /* stage 4 */                                                     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);            \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+    k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);             \
+    VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);             \
+    VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);               \
+    VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);           \
+    VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);           \
+}
+
+static void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v8i16 loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+    v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+    v8i16 tmp5, tmp6, tmp7;
+    v8i16 zero = { 0 };
+
+    /* load up 8x16 */
+    LD_SH16(input, 16,
+            reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+            reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16);
+    input += 8 * 16;
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16);
+
+    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+    BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+    BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+    reg0 = reg2 - loc1;
+    reg2 = reg2 + loc1;
+    reg12 = reg14 - loc0;
+    reg14 = reg14 + loc0;
+    reg4 = reg6 - loc3;
+    reg6 = reg6 + loc3;
+    reg8 = reg10 - loc2;
+    reg10 = reg10 + loc2;
+
+    /* stage 2 */
+    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+    reg9 = reg1 - loc2;
+    reg1 = reg1 + loc2;
+    reg7 = reg15 - loc3;
+    reg15 = reg15 + loc3;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+    BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+    loc1 = reg15 + reg3;
+    reg3 = reg15 - reg3;
+    loc2 = reg2 + loc1;
+    reg15 = reg2 - loc1;
+
+    loc1 = reg1 + reg13;
+    reg13 = reg1 - reg13;
+    loc0 = reg0 + loc1;
+    loc1 = reg0 - loc1;
+    tmp6 = loc0;
+    tmp7 = loc1;
+    reg0 = loc2;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+    VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5,
+                        reg11);
+
+    loc0 = reg9 + reg5;
+    reg5 = reg9 - reg5;
+    reg2 = reg6 + loc0;
+    reg1 = reg6 - loc0;
+
+    loc0 = reg7 + reg11;
+    reg11 = reg7 - reg11;
+    loc1 = reg4 + loc0;
+    loc2 = reg4 - loc0;
+    tmp5 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+    BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+    reg10 = loc0;
+    reg11 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+    BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+    reg13 = loc2;
+
+    /* Transpose and store the output */
+    reg12 = tmp5;
+    reg14 = tmp6;
+    reg3 = tmp7;
+
+    SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+static void vp9_idct16_1d_columns_msa(int16_t *input, int16_t *output)
+{
+    v8i16 loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+    v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+    v8i16 tmp5, tmp6, tmp7;
+    v8i16 zero = { 0 };
+
+    /* load up 8x16 */
+    LD_SH16(input, 16,
+            reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+            reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16);
+    input += 16 * 8;
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16);
+
+    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+    BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+    BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+    reg0 = reg2 - loc1;
+    reg2 = reg2 + loc1;
+    reg12 = reg14 - loc0;
+    reg14 = reg14 + loc0;
+    reg4 = reg6 - loc3;
+    reg6 = reg6 + loc3;
+    reg8 = reg10 - loc2;
+    reg10 = reg10 + loc2;
+
+    /* stage 2 */
+    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+    reg9 = reg1 - loc2;
+    reg1 = reg1 + loc2;
+    reg7 = reg15 - loc3;
+    reg15 = reg15 + loc3;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+    BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+    loc1 = reg15 + reg3;
+    reg3 = reg15 - reg3;
+    loc2 = reg2 + loc1;
+    reg15 = reg2 - loc1;
+
+    loc1 = reg1 + reg13;
+    reg13 = reg1 - reg13;
+    loc0 = reg0 + loc1;
+    loc1 = reg0 - loc1;
+    tmp6 = loc0;
+    tmp7 = loc1;
+    reg0 = loc2;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+    VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5,
+                        reg11);
+
+    loc0 = reg9 + reg5;
+    reg5 = reg9 - reg5;
+    reg2 = reg6 + loc0;
+    reg1 = reg6 - loc0;
+
+    loc0 = reg7 + reg11;
+    reg11 = reg7 - reg11;
+    loc1 = reg4 + loc0;
+    loc2 = reg4 - loc0;
+
+    tmp5 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+    BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+    reg10 = loc0;
+    reg11 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+    BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+    reg13 = loc2;
+
+    /* Transpose and store the output */
+    reg12 = tmp5;
+    reg14 = tmp6;
+    reg3 = tmp7;
+
+    /* transpose block */
+    TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+                       reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+    ST_SH4(reg0, reg2, reg4, reg6, output, 16);
+    ST_SH4(reg8, reg10, reg12, reg14, (output + 4 * 16), 16);
+
+    /* transpose block */
+    TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+                       reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+    ST_SH4(reg3, reg13, reg11, reg5, (output + 8), 16);
+    ST_SH4(reg7, reg9, reg1, reg15, (output + 8 + 4 * 16), 16);
+}
+
+static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst,
+                                    int32_t dst_stride)
+{
+    uint8_t i;
+    int16_t out;
+    v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO(out, 6);
+    input[0] = 0;
+
+    vec = __msa_fill_h(out);
+
+    for (i = 4; i--;) {
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        UNPCK_UB_SH(dst0, res0, res4);
+        UNPCK_UB_SH(dst1, res1, res5);
+        UNPCK_UB_SH(dst2, res2, res6);
+        UNPCK_UB_SH(dst3, res3, res7);
+        ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2,
+             res3);
+        ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
+             res7);
+        CLIP_SH4_0_255(res0, res1, res2, res3);
+        CLIP_SH4_0_255(res4, res5, res6, res7);
+        PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                    tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void vp9_idct16x16_10_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+
+    /* transform rows */
+    vp9_idct16_1d_columns_msa(input, out);
+
+    /* short case just considers top 4 rows as valid output */
+    out += 4 * 16;
+    for (i = 12; i--;) {
+        __asm__ volatile (
+            "sw     $zero,   0(%[out])     \n\t"
+            "sw     $zero,   4(%[out])     \n\t"
+            "sw     $zero,   8(%[out])     \n\t"
+            "sw     $zero,  12(%[out])     \n\t"
+            "sw     $zero,  16(%[out])     \n\t"
+            "sw     $zero,  20(%[out])     \n\t"
+            "sw     $zero,  24(%[out])     \n\t"
+            "sw     $zero,  28(%[out])     \n\t"
+
+            :
+            : [out] "r" (out)
+        );
+
+        out += 16;
+    }
+
+    out = out_arr;
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+    }
+}
+
+static void vp9_idct16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                            int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_msa((input + (i << 3)), (out + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+    }
+}
+
+static void vp9_iadst16_1d_columns_msa(int16_t *input, int16_t *output)
+{
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+    v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+    v8i16 zero = { 0 };
+
+    /* load input data */
+    LD_SH16(input, 16,
+            l0, l1, l2, l3, l4, l5, l6, l7,
+            l8, l9, l10, l11, l12, l13, l14, l15);
+
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16);
+    input += 16 * 8;
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, input, 16);
+
+    /* ADST in horizontal */
+    VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+                     l8, l9, l10, l11, l12, l13, l14, l15,
+                     r0, r1, r2, r3, r4, r5, r6, r7,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+
+    l1 = -r8;
+    l3 = -r4;
+    l13 = -r13;
+    l15 = -r1;
+
+    TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+                       l0, l1, l2, l3, l4, l5, l6, l7);
+    ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+    TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+                       l8, l9, l10, l11, l12, l13, l14, l15);
+    ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+
+static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                              int32_t dst_stride)
+{
+    v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+    v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+    v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+    v16i8 zero = { 0 };
+
+    r0 = LD_SH(input + 0 * 16);
+    r3 = LD_SH(input + 3 * 16);
+    r4 = LD_SH(input + 4 * 16);
+    r7 = LD_SH(input + 7 * 16);
+    r8 = LD_SH(input + 8 * 16);
+    r11 = LD_SH(input + 11 * 16);
+    r12 = LD_SH(input + 12 * 16);
+    r15 = LD_SH(input + 15 * 16);
+
+    /* stage 1 */
+    k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+    VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+    k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+    VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+    BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+    k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+    VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+    r1 = LD_SH(input + 1 * 16);
+    r2 = LD_SH(input + 2 * 16);
+    r5 = LD_SH(input + 5 * 16);
+    r6 = LD_SH(input + 6 * 16);
+    r9 = LD_SH(input + 9 * 16);
+    r10 = LD_SH(input + 10 * 16);
+    r13 = LD_SH(input + 13 * 16);
+    r14 = LD_SH(input + 14 * 16);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+    VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+    k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+    VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+    BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+    BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+    out1 = -out1;
+    SRARI_H2_SH(out0, out1, 6);
+    dst0 = LD_UB(dst + 0 * dst_stride);
+    dst1 = LD_UB(dst + 15 * dst_stride);
+    ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+    ADD2(res0, out0, res1, out1, res0, res1);
+    CLIP_SH2_0_255(res0, res1);
+    PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+    ST8x1_UB(res0, dst);
+    ST8x1_UB(res1, dst + 15 * dst_stride);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+    k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+    VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+    BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+    out8 = -out8;
+
+    SRARI_H2_SH(out8, out9, 6);
+    dst8 = LD_UB(dst + 1 * dst_stride);
+    dst9 = LD_UB(dst + 14 * dst_stride);
+    ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+    ADD2(res8, out8, res9, out9, res8, res9);
+    CLIP_SH2_0_255(res8, res9);
+    PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+    ST8x1_UB(res8, dst + dst_stride);
+    ST8x1_UB(res9, dst + 14 * dst_stride);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+    VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+    out4 = -out4;
+    SRARI_H2_SH(out4, out5, 6);
+    dst4 = LD_UB(dst + 3 * dst_stride);
+    dst5 = LD_UB(dst + 12 * dst_stride);
+    ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+    ADD2(res4, out4, res5, out5, res4, res5);
+    CLIP_SH2_0_255(res4, res5);
+    PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+    ST8x1_UB(res4, dst + 3 * dst_stride);
+    ST8x1_UB(res5, dst + 12 * dst_stride);
+
+    VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+    out13 = -out13;
+    SRARI_H2_SH(out12, out13, 6);
+    dst12 = LD_UB(dst + 2 * dst_stride);
+    dst13 = LD_UB(dst + 13 * dst_stride);
+    ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+    ADD2(res12, out12, res13, out13, res12, res13);
+    CLIP_SH2_0_255(res12, res13);
+    PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+    ST8x1_UB(res12, dst + 2 * dst_stride);
+    ST8x1_UB(res13, dst + 13 * dst_stride);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+    k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+    VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+    SRARI_H2_SH(out6, out7, 6);
+    dst6 = LD_UB(dst + 4 * dst_stride);
+    dst7 = LD_UB(dst + 11 * dst_stride);
+    ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+    ADD2(res6, out6, res7, out7, res6, res7);
+    CLIP_SH2_0_255(res6, res7);
+    PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+    ST8x1_UB(res6, dst + 4 * dst_stride);
+    ST8x1_UB(res7, dst + 11 * dst_stride);
+
+    VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+    SRARI_H2_SH(out10, out11, 6);
+    dst10 = LD_UB(dst + 6 * dst_stride);
+    dst11 = LD_UB(dst + 9 * dst_stride);
+    ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+    ADD2(res10, out10, res11, out11, res10, res11);
+    CLIP_SH2_0_255(res10, res11);
+    PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+    ST8x1_UB(res10, dst + 6 * dst_stride);
+    ST8x1_UB(res11, dst + 9 * dst_stride);
+
+    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+    VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+    SRARI_H2_SH(out2, out3, 6);
+    dst2 = LD_UB(dst + 7 * dst_stride);
+    dst3 = LD_UB(dst + 8 * dst_stride);
+    ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+    ADD2(res2, out2, res3, out3, res2, res3);
+    CLIP_SH2_0_255(res2, res3);
+    PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+    ST8x1_UB(res2, dst + 7 * dst_stride);
+    ST8x1_UB(res3, dst + 8 * dst_stride);
+
+    VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+    SRARI_H2_SH(out14, out15, 6);
+    dst14 = LD_UB(dst + 5 * dst_stride);
+    dst15 = LD_UB(dst + 10 * dst_stride);
+    ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+    ADD2(res14, out14, res15, out15, res14, res15);
+    CLIP_SH2_0_255(res14, res15);
+    PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+    ST8x1_UB(res14, dst + 5 * dst_stride);
+    ST8x1_UB(res15, dst + 10 * dst_stride);
+}
+
+static void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+    int32_t i;
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 16 * 8 block */
+        vp9_iadst16_1d_columns_msa((input + (i << 3)), (out + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_iadst16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                          dst_stride);
+    }
+}
+
+static void vp9_iadst_idct_16x16_add_msa(int16_t *input, uint8_t *dst,
+                                         int32_t dst_stride, int32_t eob)
+{
+    int32_t i;
+    int16_t out[16 * 16];
+    int16_t *out_ptr = &out[0];
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_iadst16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                         (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct_iadst_16x16_add_msa(int16_t *input, uint8_t *dst,
+                                         int32_t dst_stride, int32_t eob)
+{
+    int32_t i;
+    int16_t out[16 * 16];
+    int16_t *out_ptr = &out[0];
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf,
+                                               int16_t *tmp_odd_buf,
+                                               int16_t *dst)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+    /* FINAL BUTTERFLY : Dependency on Even & Odd */
+    vec0 = LD_SH(tmp_odd_buf);
+    vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+    loc0 = LD_SH(tmp_eve_buf);
+    loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+
+    /* Transpose : 16 vectors */
+    /* 1st & 2nd 8x8 */
+    TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                       m0, n0, m1, n1, m2, n2, m3, n3);
+    ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+    ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+    TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                       m4, n4, m5, n5, m6, n6, m7, n7);
+    ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+    ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+    /* 3rd & 4th 8x8 */
+    LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+    LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+    TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                       m0, n0, m1, n1, m2, n2, m3, n3);
+    ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+    ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+    TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                       m4, n4, m5, n5, m6, n6, m7, n7);
+    ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+    ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                                   int16_t *tmp_eve_buf)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+    v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+    v8i16 zero = { 0 };
+
+    /* Even stage 1 */
+    LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, tmp_buf, (4 * 32));
+    tmp_buf += (2 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+    VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+    BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+    VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+    loc1 = vec3;
+    loc0 = vec1;
+
+    VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+    VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+    BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+    BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+    BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+    /* Even stage 2 */
+    /* Load 8 */
+    LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    ST_SH8(zero, zero, zero, zero, zero, zero, zero, zero, tmp_buf, (4 * 32));
+
+    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+    vec0 = reg0 + reg4;
+    reg0 = reg0 - reg4;
+    reg4 = reg6 + reg2;
+    reg6 = reg6 - reg2;
+    reg2 = reg1 + reg5;
+    reg1 = reg1 - reg5;
+    reg5 = reg7 + reg3;
+    reg7 = reg7 - reg3;
+    reg3 = vec0;
+
+    vec1 = reg2;
+    reg2 = reg3 + reg4;
+    reg3 = reg3 - reg4;
+    reg4 = reg5 - vec1;
+    reg5 = reg5 + vec1;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+    vec0 = reg0 - reg6;
+    reg0 = reg0 + reg6;
+    vec1 = reg7 - reg1;
+    reg7 = reg7 + reg1;
+
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+    /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+    /* Store 8 */
+    BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+    BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+    /* Store 8 */
+    BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+    BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
+static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                                  int16_t *tmp_odd_buf)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+    v8i16 zero = { 0 };
+
+    /* Odd stage 1 */
+    reg0 = LD_SH(tmp_buf + 32);
+    reg1 = LD_SH(tmp_buf + 7 * 32);
+    reg2 = LD_SH(tmp_buf + 9 * 32);
+    reg3 = LD_SH(tmp_buf + 15 * 32);
+    reg4 = LD_SH(tmp_buf + 17 * 32);
+    reg5 = LD_SH(tmp_buf + 23 * 32);
+    reg6 = LD_SH(tmp_buf + 25 * 32);
+    reg7 = LD_SH(tmp_buf + 31 * 32);
+
+    ST_SH(zero, tmp_buf + 32);
+    ST_SH(zero, tmp_buf + 7 * 32);
+    ST_SH(zero, tmp_buf + 9 * 32);
+    ST_SH(zero, tmp_buf + 15 * 32);
+    ST_SH(zero, tmp_buf + 17 * 32);
+    ST_SH(zero, tmp_buf + 23 * 32);
+    ST_SH(zero, tmp_buf + 25 * 32);
+    ST_SH(zero, tmp_buf + 31 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+    vec0 = reg0 + reg3;
+    reg0 = reg0 - reg3;
+    reg3 = reg7 + reg4;
+    reg7 = reg7 - reg4;
+    reg4 = reg1 + reg2;
+    reg1 = reg1 - reg2;
+    reg2 = reg6 + reg5;
+    reg6 = reg6 - reg5;
+    reg5 = vec0;
+
+    /* 4 Stores */
+    ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+    SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+    ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+    /* 4 Stores */
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+    BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+    VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+    ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+    /* Odd stage 2 */
+    /* 8 loads */
+    reg0 = LD_SH(tmp_buf + 3 * 32);
+    reg1 = LD_SH(tmp_buf + 5 * 32);
+    reg2 = LD_SH(tmp_buf + 11 * 32);
+    reg3 = LD_SH(tmp_buf + 13 * 32);
+    reg4 = LD_SH(tmp_buf + 19 * 32);
+    reg5 = LD_SH(tmp_buf + 21 * 32);
+    reg6 = LD_SH(tmp_buf + 27 * 32);
+    reg7 = LD_SH(tmp_buf + 29 * 32);
+
+    ST_SH(zero, tmp_buf + 3 * 32);
+    ST_SH(zero, tmp_buf + 5 * 32);
+    ST_SH(zero, tmp_buf + 11 * 32);
+    ST_SH(zero, tmp_buf + 13 * 32);
+    ST_SH(zero, tmp_buf + 19 * 32);
+    ST_SH(zero, tmp_buf + 21 * 32);
+    ST_SH(zero, tmp_buf + 27 * 32);
+    ST_SH(zero, tmp_buf + 29 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+    VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+    /* 4 Stores */
+    SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+         vec0, vec1, vec2, vec3);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+    BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+    /* 4 Stores */
+    ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7,
+         vec0, vec1, vec2, vec3);
+    BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+    ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+    VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+    ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+    /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+    /* Load 8 & Store 8 */
+    LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+    LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+    ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+         loc0, loc1, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+    SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+    SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+    /* Load 8 & Store 8 */
+    LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+    LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+    ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+         loc0, loc1, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+    SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+    SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                                 int16_t *tmp_odd_buf,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+    /* FINAL BUTTERFLY : Dependency on Even & Odd */
+    vec0 = LD_SH(tmp_odd_buf);
+    vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+    loc0 = LD_SH(tmp_eve_buf);
+    loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+    SRARI_H4_SH(m0, m2, m4, m6, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+    SRARI_H4_SH(m0, m2, m4, m6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
+                        m0, m2, m4, m6);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+    SRARI_H4_SH(m1, m3, m5, m7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
+                        m1, m3, m5, m7);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+    SRARI_H4_SH(m1, m3, m5, m7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
+                        m1, m3, m5, m7);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+    SRARI_H4_SH(n0, n2, n4, n6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
+                        n0, n2, n4, n6);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+    SRARI_H4_SH(n0, n2, n4, n6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
+                        n0, n2, n4, n6);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+    SRARI_H4_SH(n1, n3, n5, n7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
+                        n1, n3, n5, n7);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+    SRARI_H4_SH(n1, n3, n5, n7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
+                        n1, n3, n5, n7);
+}
+
+static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+
+    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+    vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
+                                         dst, dst_stride);
+}
+
+static void vp9_idct8x32_1d_columns_msa(int16_t *input, int16_t *output,
+                                        int16_t *tmp_buf)
+{
+    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+
+    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+    vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0],
+                                       &tmp_odd_buf[0], output);
+}
+
+static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst,
+                                    int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO(out, 6);
+    input[0] = 0;
+
+    vec = __msa_fill_h(out);
+
+    for (i = 16; i--;) {
+        LD_UB2(dst, 16, dst0, dst1);
+        LD_UB2(dst + dst_stride, 16, dst2, dst3);
+
+        UNPCK_UB_SH(dst0, res0, res4);
+        UNPCK_UB_SH(dst1, res1, res5);
+        UNPCK_UB_SH(dst2, res2, res6);
+        UNPCK_UB_SH(dst3, res3, res7);
+        ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2,
+             res3);
+        ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
+             res7);
+        CLIP_SH4_0_255(res0, res1, res2, res3);
+        CLIP_SH4_0_255(res4, res5, res6, res7);
+        PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                    tmp0, tmp1, tmp2, tmp3);
+
+        ST_UB2(tmp0, tmp1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(tmp2, tmp3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void vp9_idct32x32_34_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = out_arr;
+    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT);
+
+    for (i = 32; i--;) {
+        __asm__ volatile (
+            "sw     $zero,       (%[out_ptr])     \n\t"
+            "sw     $zero,      4(%[out_ptr])     \n\t"
+            "sw     $zero,      8(%[out_ptr])     \n\t"
+            "sw     $zero,     12(%[out_ptr])     \n\t"
+            "sw     $zero,     16(%[out_ptr])     \n\t"
+            "sw     $zero,     20(%[out_ptr])     \n\t"
+            "sw     $zero,     24(%[out_ptr])     \n\t"
+            "sw     $zero,     28(%[out_ptr])     \n\t"
+            "sw     $zero,     32(%[out_ptr])     \n\t"
+            "sw     $zero,     36(%[out_ptr])     \n\t"
+            "sw     $zero,     40(%[out_ptr])     \n\t"
+            "sw     $zero,     44(%[out_ptr])     \n\t"
+            "sw     $zero,     48(%[out_ptr])     \n\t"
+            "sw     $zero,     52(%[out_ptr])     \n\t"
+            "sw     $zero,     56(%[out_ptr])     \n\t"
+            "sw     $zero,     60(%[out_ptr])     \n\t"
+
+            :
+            : [out_ptr] "r" (out_ptr)
+        );
+
+        out_ptr += 32;
+    }
+
+    out_ptr = out_arr;
+
+    /* process 8*32 block */
+    vp9_idct8x32_1d_columns_msa(input, out_ptr, &tmp_buf[0]);
+
+    /* transform columns */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                           (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct32x32_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                            int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = out_arr;
+    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT);
+
+    /* transform rows */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 8)),
+                                    &tmp_buf[0]);
+    }
+
+    /* transform columns */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                           (dst + (i << 3)), dst_stride);
+    }
+}
+
+void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob)
+{
+    if (eob > 1) {
+        vp9_idct4x4_colcol_addblk_msa(block, dst, stride);
+    }
+    else {
+        vp9_idct4x4_1_add_msa(block, dst, stride);
+    }
+}
+
+void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob)
+{
+    if (eob == 1) {
+        vp9_idct8x8_1_add_msa(block, dst, stride);
+    }
+    else if (eob <= 12) {
+        vp9_idct8x8_12_colcol_addblk_msa(block, dst, stride);
+    }
+    else {
+        vp9_idct8x8_colcol_addblk_msa(block, dst, stride);
+    }
+}
+
+void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    if (eob == 1) {
+        /* DC only DCT coefficient. */
+        vp9_idct16x16_1_add_msa(block, dst, stride);
+    }
+    else if (eob <= 10) {
+        vp9_idct16x16_10_colcol_addblk_msa(block, dst, stride);
+    }
+    else {
+        vp9_idct16x16_colcol_addblk_msa(block, dst, stride);
+    }
+}
+
+void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    if (eob == 1) {
+        vp9_idct32x32_1_add_msa(block, dst, stride);
+    }
+    else if (eob <= 34) {
+        vp9_idct32x32_34_colcol_addblk_msa(block, dst, stride);
+    }
+    else {
+        vp9_idct32x32_colcol_addblk_msa(block, dst, stride);
+    }
+}
+
+void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    vp9_iadst4x4_colcol_addblk_msa(block, dst, stride);
+}
+
+void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    vp9_iadst8x8_colcol_addblk_msa(block, dst, stride);
+}
+
+void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                  int16_t *block, int eob)
+{
+    vp9_iadst16x16_colcol_addblk_msa(block, dst, stride);
+}
+
+void ff_idct_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_idct_iadst_4x4_add_msa(block, dst, stride, eob);
+}
+
+void ff_idct_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_idct_iadst_8x8_add_msa(block, dst, stride, eob);
+}
+
+void ff_idct_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob)
+{
+    vp9_idct_iadst_16x16_add_msa(block, dst, stride, eob);
+}
+
+void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_iadst_idct_4x4_add_msa(block, dst, stride, eob);
+}
+
+void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_iadst_idct_8x8_add_msa(block, dst, stride, eob);
+}
+
+void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob)
+{
+    vp9_iadst_idct_16x16_add_msa(block, dst, stride, eob);
+}
diff --git a/libavcodec/mips/vp9_intra_msa.c b/libavcodec/mips/vp9_intra_msa.c
new file mode 100644
index 0000000..54cf0ae
--- /dev/null
+++ b/libavcodec/mips/vp9_intra_msa.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \
+{                                                \
+    out0 = __msa_subs_u_h(out0, in0);            \
+    out1 = __msa_subs_u_h(out1, in1);            \
+}
+
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{
+    uint32_t row;
+    v16u8 src0;
+
+    src0 = LD_UB(src);
+
+    for (row = 16; row--;) {
+        ST_UB(src0, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{
+    uint32_t row;
+    v16u8 src1, src2;
+
+    src1 = LD_UB(src);
+    src2 = LD_UB(src + 16);
+
+    for (row = 32; row--;) {
+        ST_UB2(src1, src2, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{
+    uint32_t row, inp;
+    v16u8 src0, src1, src2, src3;
+
+    src += 12;
+    for (row = 4; row--;) {
+        inp = LW(src);
+        src -= 4;
+
+        src0 = (v16u8) __msa_fill_b(inp >> 24);
+        src1 = (v16u8) __msa_fill_b(inp >> 16);
+        src2 = (v16u8) __msa_fill_b(inp >> 8);
+        src3 = (v16u8) __msa_fill_b(inp);
+
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{
+    uint32_t row, inp;
+    v16u8 src0, src1, src2, src3;
+
+    src += 28;
+    for (row = 8; row--;) {
+        inp = LW(src);
+        src -= 4;
+
+        src0 = (v16u8) __msa_fill_b(inp >> 24);
+        src1 = (v16u8) __msa_fill_b(inp >> 16);
+        src2 = (v16u8) __msa_fill_b(inp >> 8);
+        src3 = (v16u8) __msa_fill_b(inp);
+
+        ST_UB2(src0, src0, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src1, src1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src2, src2, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src3, src3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{
+    uint32_t val0, val1;
+    v16i8 store, src = { 0 };
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    val0 = LW(src_top);
+    val1 = LW(src_left);
+    INSERT_W2_SB(val0, val1, src);
+    sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
+    store = __msa_splati_b((v16i8) sum_w, 0);
+    val0 = __msa_copy_u_w((v4i32) store, 0);
+
+    SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_4x4(dir)                                    \
+void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                           const uint8_t *left,                 \
+                           const uint8_t *top)                  \
+{                                                               \
+    uint32_t val0;                                              \
+    v16i8 store, data = { 0 };                                  \
+    v8u16 sum_h;                                                \
+    v4u32 sum_w;                                                \
+                                                                \
+    val0 = LW(dir);                                             \
+    data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);       \
+    sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);         \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);            \
+    store = __msa_splati_b((v16i8) sum_w, 0);                   \
+    val0 = __msa_copy_u_w((v4i32) store, 0);                    \
+                                                                \
+    SW4(val0, val0, val0, val0, dst, dst_stride);               \
+}
+INTRA_DC_TL_4x4(top);
+INTRA_DC_TL_4x4(left);
+
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{
+    uint64_t val0, val1;
+    v16i8 store;
+    v16u8 src = { 0 };
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    val0 = LD(src_top);
+    val1 = LD(src_left);
+    INSERT_D2_UB(val0, val1, src);
+    sum_h = __msa_hadd_u_h(src, src);
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
+    store = __msa_splati_b((v16i8) sum_w, 0);
+    val0 = __msa_copy_u_d((v2i64) store, 0);
+
+    SD4(val0, val0, val0, val0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_8x8(dir)                                    \
+void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                           const uint8_t *left,                 \
+                           const uint8_t *top)                  \
+{                                                               \
+    uint64_t val0;                                              \
+    v16i8 store;                                                \
+    v16u8 data = { 0 };                                         \
+    v8u16 sum_h;                                                \
+    v4u32 sum_w;                                                \
+    v2u64 sum_d;                                                \
+                                                                \
+    val0 = LD(dir);                                             \
+    data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);       \
+    sum_h = __msa_hadd_u_h(data, data);                         \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                       \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);            \
+    store = __msa_splati_b((v16i8) sum_w, 0);                   \
+    val0 = __msa_copy_u_d((v2i64) store, 0);                    \
+                                                                \
+    SD4(val0, val0, val0, val0, dst, dst_stride);               \
+    dst += (4 * dst_stride);                                    \
+    SD4(val0, val0, val0, val0, dst, dst_stride);               \
+}
+
+INTRA_DC_TL_8x8(top);
+INTRA_DC_TL_8x8(left);
+
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{
+    v16u8 top, left, out;
+    v8u16 sum_h, sum_top, sum_left;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    top = LD_UB(src_top);
+    left = LD_UB(src_left);
+    HADD_UB2_UH(top, left, sum_top, sum_left);
+    sum_h = sum_top + sum_left;
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_16x16(dir)                                        \
+void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,      \
+                             const uint8_t *left,                     \
+                             const uint8_t *top)                      \
+{                                                                     \
+    v16u8 data, out;                                                  \
+    v8u16 sum_h;                                                      \
+    v4u32 sum_w;                                                      \
+    v2u64 sum_d;                                                      \
+                                                                      \
+    data = LD_UB(dir);                                                \
+    sum_h = __msa_hadd_u_h(data, data);                               \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                             \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);      \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                  \
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                   \
+                                                                      \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+    dst += (8 * dst_stride);                                          \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+}
+INTRA_DC_TL_16x16(top);
+INTRA_DC_TL_16x16(left);
+
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{
+    uint32_t row;
+    v16u8 top0, top1, left0, left1, out;
+    v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    LD_UB2(src_top, 16, top0, top1);
+    LD_UB2(src_left, 16, left0, left1);
+    HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+    HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+    sum_h = sum_top0 + sum_top1;
+    sum_h += sum_left0 + sum_left1;
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
+
+    for (row = 16; row--;)
+    {
+        ST_UB2(out, out, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out, out, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+#define INTRA_DC_TL_32x32(dir)                                    \
+void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                             const uint8_t *left,                 \
+                             const uint8_t *top)                  \
+{                                                                 \
+    uint32_t row;                                                 \
+    v16u8 data0, data1, out;                                      \
+    v8u16 sum_h, sum_data0, sum_data1;                            \
+    v4u32 sum_w;                                                  \
+    v2u64 sum_d;                                                  \
+                                                                  \
+    LD_UB2(dir, 16, data0, data1);                                \
+    HADD_UB2_UH(data0, data1, sum_data0, sum_data1);              \
+    sum_h = sum_data0 + sum_data1;                                \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                         \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);              \
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);               \
+                                                                  \
+    for (row = 16; row--;)                                        \
+    {                                                             \
+        ST_UB2(out, out, dst, 16);                                \
+        dst += dst_stride;                                        \
+        ST_UB2(out, out, dst, 16);                                \
+        dst += dst_stride;                                        \
+    }                                                             \
+}
+INTRA_DC_TL_32x32(top);
+INTRA_DC_TL_32x32(left);
+
+#define INTRA_PREDICT_VALDC_16X16_MSA(val)                             \
+void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{                                                                      \
+    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
+                                                                       \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
+    dst += (8 * dst_stride);                                           \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
+}
+
+INTRA_PREDICT_VALDC_16X16_MSA(127);
+INTRA_PREDICT_VALDC_16X16_MSA(128);
+INTRA_PREDICT_VALDC_16X16_MSA(129);
+
+#define INTRA_PREDICT_VALDC_32X32_MSA(val)                             \
+void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{                                                                      \
+    uint32_t row;                                                      \
+    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
+                                                                       \
+    for (row = 16; row--;)                                             \
+    {                                                                  \
+        ST_UB2(out, out, dst, 16);                                     \
+        dst += dst_stride;                                             \
+        ST_UB2(out, out, dst, 16);                                     \
+        dst += dst_stride;                                             \
+    }                                                                  \
+}
+
+INTRA_PREDICT_VALDC_32X32_MSA(127);
+INTRA_PREDICT_VALDC_32X32_MSA(128);
+INTRA_PREDICT_VALDC_32X32_MSA(129);
+
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint32_t left;
+    uint8_t top_left = src_top_ptr[-1];
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+    v16u8 src0, src1, src2, src3;
+    v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+    src_top = LD_SB(src_top_ptr);
+    left = LW(src_left);
+    src_left0 = __msa_fill_b(left >> 24);
+    src_left1 = __msa_fill_b(left >> 16);
+    src_left2 = __msa_fill_b(left >> 8);
+    src_left3 = __msa_fill_b(left);
+
+    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+               src_left3, src_top, src0, src1, src2, src3);
+    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt, left;
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+    v8u16 src_top_left, vec0, vec1, vec2, vec3;
+    v16u8 src0, src1, src2, src3;
+
+    src_top = LD_SB(src_top_ptr);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+
+    src_left += 4;
+    for (loop_cnt = 2; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+
+        ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+                   src_left3, src_top, src0, src1, src2, src3);
+        HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+        SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt, left;
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+    v8u16 src_top_left, res_r, res_l;
+
+    src_top = LD_SB(src_top_ptr);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+
+    src_left += 12;
+    for (loop_cnt = 4; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+
+        ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+
+        ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+
+        ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+
+        ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt, left;
+    v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+    v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+    src_top0 = LD_SB(src_top_ptr);
+    src_top1 = LD_SB(src_top_ptr + 16);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+
+    src_left += 28;
+    for (loop_cnt = 8; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+
+        ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+
+        ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+
+        ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+
+        ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+    }
+}
diff --git a/libavcodec/mips/vp9_lpf_msa.c b/libavcodec/mips/vp9_lpf_msa.c
new file mode 100644
index 0000000..c82a9e9
--- /dev/null
+++ b/libavcodec/mips/vp9_lpf_msa.c
@@ -0,0 +1,2533 @@
+/*
+ * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out)               \
+{                                                                        \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2;         \
+    const v16i8 cnst4b = __msa_ldi_b(4);                                 \
+    const v16i8 cnst3b = __msa_ldi_b(3);                                 \
+                                                                         \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
+                                                                         \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
+                                                                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                              \
+    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
+    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
+    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
+    filt = filt & (v16i8) mask_in;                                       \
+                                                                         \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
+    filt1 >>= 3;                                                         \
+                                                                         \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
+    filt2 >>= 3;                                                         \
+                                                                         \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
+    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
+    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
+                                                                         \
+    filt = __msa_srari_b(filt1, 1);                                      \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
+    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
+    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
+}
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
+{                                                                      \
+    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
+    v16u8 zero_in = { 0 };                                             \
+                                                                       \
+    tmp = __msa_ori_b(zero_in, 1);                                     \
+    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
+    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
+    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
+    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
+                                                                       \
+    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
+    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
+    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
+    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
+                                                                       \
+    flat_out = (tmp < (v16u8) flat_out);                               \
+    flat_out = __msa_xori_b(flat_out, 0xff);                           \
+    flat_out = flat_out & (mask);                                      \
+}
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
+                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
+{                                                                   \
+    v16u8 tmp, zero_in = { 0 };                                     \
+    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
+    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
+                                                                    \
+    tmp = __msa_ori_b(zero_in, 1);                                  \
+    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
+    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
+    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
+    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
+    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
+    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
+    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
+    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
+                                                                    \
+    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
+    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
+    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
+    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
+                                                                    \
+    flat2_out = (tmp < (v16u8) flat2_out);                          \
+    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
+    flat2_out = flat2_out & flat_in;                                \
+}
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
+                    q0_in, q1_in, q2_in, q3_in,                \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
+                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
+{                                                              \
+    v8u16 tmp0, tmp1, tmp2;                                    \
+                                                               \
+    tmp2 = p2_in + p1_in + p0_in;                              \
+    tmp0 = p3_in << 1;                                         \
+                                                               \
+    tmp0 = tmp0 + tmp2 + q0_in;                                \
+    tmp1 = tmp0 + p3_in + p2_in;                               \
+    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp1 = tmp0 + p1_in + q1_in;                               \
+    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp1 = q2_in + q1_in + q0_in;                              \
+    tmp2 = tmp2 + tmp1;                                        \
+    tmp0 = tmp2 + (p0_in);                                     \
+    tmp0 = tmp0 + (p3_in);                                     \
+    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
+                                                               \
+    tmp0 = q2_in + q3_in;                                      \
+    tmp0 = p0_in + tmp1 + tmp0;                                \
+    tmp1 = q3_in + q3_in;                                      \
+    tmp1 = tmp1 + tmp0;                                        \
+    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp0 = tmp2 + q3_in;                                       \
+    tmp1 = tmp0 + q0_in;                                       \
+    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp1 = tmp0 - p2_in;                                       \
+    tmp0 = q1_in + q3_in;                                      \
+    tmp1 = tmp0 + tmp1;                                        \
+    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    /* absolute subtraction of pixel values */                     \
+    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+                                                                   \
+    /* calculation of hev */                                       \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = thresh_in < (v16u8) flat_out;                        \
+                                                                   \
+    /* calculation of mask */                                      \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+                                                                   \
+    mask_out = b_limit_in < p0_asub_q0_m;                          \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+                                                                   \
+    mask_out = limit_in < (v16u8) mask_out;                        \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
+void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
+
+void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
+    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
+
+    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
+    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
+
+    limit0 = (v16u8) __msa_fill_b(limit_ptr);
+    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                    q0_filter8);
+        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
+
+        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
+
+        src -= 3 * pitch;
+
+        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+        src += (4 * pitch);
+        SD(q1_d, src);
+        src += pitch;
+        SD(q2_d, src);
+    }
+}
+
+void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch);
+    }
+}
+
+void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
+                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
+                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch);
+    }
+}
+
+void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
+                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+                    q1_filt8_l, q2_filt8_l);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch);
+    }
+}
+
+static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
+                                        uint8_t *filter48,
+                                        int32_t b_limit_ptr,
+                                        int32_t limit_ptr,
+                                        int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+
+        return 0;
+    }
+}
+
+static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
+{
+    v16u8 flat, flat2, filter8;
+    v16i8 zero = { 0 };
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+    v8i16 l_out, r_out;
+
+    flat = LD_UB(filter48 + 96);
+
+    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        src -= 3 * pitch;
+        ST_UB4(p2, p1, p0, q0, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1, q2, src, pitch);
+    } else {
+        src -= 7 * pitch;
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+                   p5_l_in, p4_l_in);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+                   p1_l_in, p0_l_in);
+        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
+
+        tmp0_l = p7_l_in << 3;
+        tmp0_l -= p7_l_in;
+        tmp0_l += p6_l_in;
+        tmp0_l += q0_l_in;
+        tmp1_l = p6_l_in + p5_l_in;
+        tmp1_l += p4_l_in;
+        tmp1_l += p3_l_in;
+        tmp1_l += p2_l_in;
+        tmp1_l += p1_l_in;
+        tmp1_l += p0_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST_UB(p6, src);
+        src += pitch;
+
+        /* p5 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
+        tmp0_l = p5_l_in - p6_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST_UB(p5, src);
+        src += pitch;
+
+        /* p4 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
+        tmp0_l = p4_l_in - p5_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST_UB(p4, src);
+        src += pitch;
+
+        /* p3 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
+        tmp0_l = p3_l_in - p4_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST_UB(p3, src);
+        src += pitch;
+
+        /* p2 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
+        tmp0_l = p2_l_in - p3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* p1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
+        tmp0_l = p1_l_in - p2_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* p0 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
+        tmp0_l = p0_l_in - p1_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q0 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
+        tmp0_l = q7_l_in - p0_l_in;
+        tmp0_l += q0_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q1 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q0_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p6_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q2 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q1_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p5_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q2_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p4_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST_UB(q3, src);
+        src += pitch;
+
+        /* q4 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p3_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST_UB(q4, src);
+        src += pitch;
+
+        /* q5 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q4_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p2_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST_UB(q5, src);
+        src += pitch;
+
+        /* q6 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q5_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p1_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST_UB(q6, src);
+    }
+}
+
+void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t early_exit = 0;
+
+    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
+                                          b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) {
+        vp9_hz_lpf_t16_16w(src, pitch, filter48);
+    }
+}
+
+void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
+                               int32_t b_limit_ptr,
+                               int32_t limit_ptr,
+                               int32_t thresh_ptr)
+{
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    uint64_t dword0, dword1;
+    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 p0_filter16, p1_filter16;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+    v8u16 tmp0, tmp1, tmp2;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+    } else {
+        /* convert 8 bit input data into 16 bit */
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
+                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
+                   q1_r, q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
+                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
+                    q1_filter8, q2_filter8);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                    q0_filter8);
+        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                    q2_filter8);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
+
+        /* load 16 vector elements */
+        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+        /* if flat2 is zero for all pixels, then no need to calculate other filter */
+        if (__msa_test_bz_v(flat2)) {
+            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
+            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
+
+            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+            SD(q1_d, src + pitch);
+            SD(q2_d, src + 2 * pitch);
+        } else {
+            /* LSB(right) 8 pixel operation */
+            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
+                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
+                       q4_r, q5_r, q6_r, q7_r);
+
+            tmp0 = p7_r << 3;
+            tmp0 -= p7_r;
+            tmp0 += p6_r;
+            tmp0 += q0_r;
+
+            src -= 7 * pitch;
+
+            /* calculation of p6 and p5 */
+            tmp1 = p6_r + p5_r + p4_r + p3_r;
+            tmp1 += (p2_r + p1_r + p0_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp0 = p5_r - p6_r + q1_r - p7_r;
+            tmp1 += tmp0;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p4 and p3 */
+            tmp0 = p4_r - p5_r + q2_r - p7_r;
+            tmp2 = p3_r - p4_r + q3_r - p7_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p2 and p1 */
+            tmp0 = p2_r - p3_r + q4_r - p7_r;
+            tmp2 = p1_r - p2_r + q5_r - p7_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p0 and q0 */
+            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q1 and q2 */
+            tmp0 = q7_r - q0_r + q1_r - p6_r;
+            tmp2 = q7_r - q1_r + q2_r - p5_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q3 and q4 */
+            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q5 and q6 */
+            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+        }
+    }
+}
+
+void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat, limit, thresh, b_limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v8i16 vec0, vec1, vec2, vec3;
+
+    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                       p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+    src -= 2;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
+    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
+
+    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
+    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
+
+    limit0 = (v16u8) __msa_fill_b(limit_ptr);
+    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+    src -= 2;
+
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+
+void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    /* load vector elements */
+    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                       p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        /* Store 4 pixels p1-_q1 */
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+        src -= 2;
+        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+        src += 4 * pitch;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        /* Store 6 pixels p2-_q2 */
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
+
+        src -= 3;
+        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec4, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec4, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4;
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+    /* transpose 16x8 matrix into 8x16 */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2;
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+
+        /* filter8 */
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4;
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+    /* transpose 16x8 matrix into 8x16 */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2;
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
+                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
+                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4;
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+    /* transpose 16x8 matrix into 8x16 */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2;
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else {
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
+                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+                    q1_filt8_l, q2_filt8_l);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                       uint8_t *output, int32_t out_pitch)
+{
+    v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+    v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch,
+           p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
+    /* 8x8 transpose */
+    TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+    /* 8x8 transpose */
+    ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+    ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+    ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+    ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+    SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+    output += (8 * out_pitch);
+    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                       uint8_t *output, int32_t out_pitch)
+{
+    v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+    TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+    ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
+                                uint8_t *output, int32_t out_pitch)
+{
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+    v4i32 tmp2, tmp3;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    input += (8 * in_pitch);
+    LD_UB8(input, in_pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p7, p6, p5, p4, p3, p2, p1, p0);
+
+    /* transpose 16x8 matrix into 8x16 */
+    /* total 8 intermediate register and 32 instructions */
+    q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
+    q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
+    q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
+    q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
+    q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
+    q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
+    q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
+    q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
+
+    ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+    tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
+    tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
+
+    ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+    tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
+    tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
+
+    ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+    q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
+    tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
+    q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+    q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
+    tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
+    q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+    output += (8 * out_pitch);
+    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+                                       uint8_t *src_org, int32_t pitch_org,
+                                       int32_t b_limit_ptr,
+                                       int32_t limit_ptr,
+                                       int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16i8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3;
+
+    /* load vector elements */
+    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit */
+        p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
+        p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
+        p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
+        q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
+        q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
+        q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+
+        return 0;
+    }
+}
+
+static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
+                                 uint8_t *filter48)
+{
+    v16i8 zero = { 0 };
+    v16u8 filter8, flat, flat2;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 tmp0_r, tmp1_r;
+    v8i16 r_out;
+
+    flat = LD_UB(filter48 + 6 * 16);
+
+    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        v8i16 vec0, vec1, vec2, vec3, vec4;
+
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
+
+        src_org -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+        return 1;
+    } else {
+        src -= 7 * 16;
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST8x1_UB(p6, src);
+        src += 16;
+
+        /* p5 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST8x1_UB(p5, src);
+        src += 16;
+
+        /* p4 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST8x1_UB(p4, src);
+        src += 16;
+
+        /* p3 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST8x1_UB(p3, src);
+        src += 16;
+
+        /* p2 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* p1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* p0 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q0 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q1 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q2 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST8x1_UB(q3, src);
+        src += 16;
+
+        /* q4 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST8x1_UB(q4, src);
+        src += 16;
+
+        /* q5 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST8x1_UB(q5, src);
+        src += 16;
+
+        /* q6 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST8x1_UB(q6, src);
+
+        return 0;
+    }
+}
+
+void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
+                               int32_t b_limit_ptr,
+                               int32_t limit_ptr,
+                               int32_t thresh_ptr)
+{
+    uint8_t early_exit = 0;
+    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t *filter48 = &transposed_input[16 * 16];
+
+    vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+
+    early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+                                         &filter48[0], src, pitch,
+                                         b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) {
+        early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+                                       &filter48[0]);
+
+        if (0 == early_exit) {
+            vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+        }
+    }
+}
+
+static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+                                        uint8_t *src_org, ptrdiff_t pitch,
+                                        int32_t b_limit_ptr,
+                                        int32_t limit_ptr,
+                                        int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16i8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+    /* load vector elements */
+    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src_org -= 2;
+        ST4x8_UB(vec2, vec3, src_org, pitch);
+        src_org += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src_org, pitch);
+
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+
+        return 0;
+    }
+}
+
+static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
+                                  uint8_t *filter48)
+{
+    v16u8 flat, flat2, filter8;
+    v16i8 zero = { 0 };
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+    v8i16 l_out, r_out;
+
+    flat = LD_UB(filter48 + 6 * 16);
+
+    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src_org -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+        return 1;
+    } else {
+        src -= 7 * 16;
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+                   p5_l_in, p4_l_in);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+                   p1_l_in, p0_l_in);
+        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
+
+        tmp0_l = p7_l_in << 3;
+        tmp0_l -= p7_l_in;
+        tmp0_l += p6_l_in;
+        tmp0_l += q0_l_in;
+        tmp1_l = p6_l_in + p5_l_in;
+        tmp1_l += p4_l_in;
+        tmp1_l += p3_l_in;
+        tmp1_l += p2_l_in;
+        tmp1_l += p1_l_in;
+        tmp1_l += p0_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST_UB(p6, src);
+        src += 16;
+
+        /* p5 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
+        tmp0_l = p5_l_in - p6_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST_UB(p5, src);
+        src += 16;
+
+        /* p4 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
+        tmp0_l = p4_l_in - p5_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST_UB(p4, src);
+        src += 16;
+
+        /* p3 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
+        tmp0_l = p3_l_in - p4_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST_UB(p3, src);
+        src += 16;
+
+        /* p2 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
+        tmp0_l = p2_l_in - p3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* p1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
+        tmp0_l = p1_l_in - p2_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* p0 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
+        tmp0_l = p0_l_in - p1_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q0 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
+        tmp0_l = q7_l_in - p0_l_in;
+        tmp0_l += q0_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q1 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q0_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p6_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q2 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q1_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p5_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q2_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p4_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST_UB(q3, src);
+        src += 16;
+
+        /* q4 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p3_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST_UB(q4, src);
+        src += 16;
+
+        /* q5 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q4_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p2_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST_UB(q5, src);
+        src += 16;
+
+        /* q6 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q5_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p1_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST_UB(q6, src);
+
+        return 0;
+    }
+}
+
+void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t early_exit = 0;
+    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t *filter48 = &transposed_input[16 * 16];
+
+    vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+    early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+                                          &filter48[0], src, pitch,
+                                          b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) {
+        early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+                                        &filter48[0]);
+
+        if (0 == early_exit) {
+            vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
+        }
+    }
+}
diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c
new file mode 100644
index 0000000..e7a8387
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_mmi.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2019 gxw <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/mmiutils.h"
+#include "vp9dsp_mips.h"
+
+#define GET_DATA_H_MMI                                       \
+    "pmaddhw    %[ftmp4],    %[ftmp4],   %[filter1]    \n\t" \
+    "pmaddhw    %[ftmp5],    %[ftmp5],   %[filter2]    \n\t" \
+    "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
+    "punpckhwd  %[ftmp5],    %[ftmp4],   %[ftmp0]      \n\t" \
+    "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
+    "pmaddhw    %[ftmp6],    %[ftmp6],   %[filter1]    \n\t" \
+    "pmaddhw    %[ftmp7],    %[ftmp7],   %[filter2]    \n\t" \
+    "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
+    "punpckhwd  %[ftmp7],    %[ftmp6],   %[ftmp0]      \n\t" \
+    "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
+    "punpcklwd  %[srcl],     %[ftmp4],   %[ftmp6]      \n\t" \
+    "pmaddhw    %[ftmp8],    %[ftmp8],   %[filter1]    \n\t" \
+    "pmaddhw    %[ftmp9],    %[ftmp9],   %[filter2]    \n\t" \
+    "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
+    "punpckhwd  %[ftmp9],    %[ftmp8],   %[ftmp0]      \n\t" \
+    "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
+    "pmaddhw    %[ftmp10],   %[ftmp10],  %[filter1]    \n\t" \
+    "pmaddhw    %[ftmp11],   %[ftmp11],  %[filter2]    \n\t" \
+    "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
+    "punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]      \n\t" \
+    "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
+    "punpcklwd  %[srch],     %[ftmp8],   %[ftmp10]     \n\t"
+
+#define GET_DATA_V_MMI                                       \
+    "punpcklhw  %[srcl],     %[ftmp4],   %[ftmp5]      \n\t" \
+    "pmaddhw    %[srcl],     %[srcl],    %[filter10]   \n\t" \
+    "punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
+    "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+    "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+    "punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
+    "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+    "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+    "punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
+    "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+    "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+    "punpckhhw  %[srch],     %[ftmp4],   %[ftmp5]      \n\t" \
+    "pmaddhw    %[srch],     %[srch],    %[filter10]   \n\t" \
+    "punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
+    "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+    "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
+    "punpckhhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
+    "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+    "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
+    "punpckhhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
+    "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+    "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t"
+
+static void convolve_horiz_mmi(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const uint16_t *filter_x, int32_t w,
+                               int32_t h)
+{
+    double ftmp[15];
+    uint32_t tmp[2];
+    src -= 3;
+    src_stride -= w;
+    dst_stride -= w;
+    __asm__ volatile (
+        "move       %[tmp1],    %[width]                   \n\t"
+        "xor        %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+        "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
+        "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
+        "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
+        "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
+        "li         %[tmp0],    0x07                       \n\t"
+        "dmtc1      %[tmp0],    %[ftmp13]                  \n\t"
+        "punpcklwd  %[ftmp13],  %[ftmp13],   %[ftmp13]     \n\t"
+        "1:                                                \n\t"
+        /* Get 8 data per row */
+        "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
+        "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
+        "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
+        "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
+        "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
+        "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
+        "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
+        PTR_ADDIU  "%[width],   %[width],    -0x04         \n\t"
+        /* Get raw data */
+        GET_DATA_H_MMI
+        ROUND_POWER_OF_TWO_MMI(%[srcl], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        ROUND_POWER_OF_TWO_MMI(%[srch], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        "packsswh   %[srcl],    %[srcl],     %[srch]       \n\t"
+        "packushb   %[ftmp12],  %[srcl],     %[ftmp0]      \n\t"
+        "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
+        PTR_ADDIU  "%[dst],     %[dst],      0x04          \n\t"
+        PTR_ADDIU  "%[src],     %[src],      0x04          \n\t"
+        /* Loop count */
+        "bnez       %[width],   1b                         \n\t"
+        "move       %[width],   %[tmp1]                    \n\t"
+        PTR_ADDU   "%[src],     %[src],      %[src_stride] \n\t"
+        PTR_ADDU   "%[dst],     %[dst],      %[dst_stride] \n\t"
+        PTR_ADDIU  "%[height],  %[height],   -0x01         \n\t"
+        "bnez       %[height],  1b                         \n\t"
+        : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+          [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
+          [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
+          [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
+          [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
+          [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
+          [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
+          [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
+          [src]"+&r"(src),          [width]"+&r"(w),
+          [dst]"+&r"(dst),          [height]"+&r"(h),
+          [ftmp13]"=&f"(ftmp[14])
+        : [filter]"r"(filter_x),
+          [src_stride]"r"((mips_reg)src_stride),
+          [dst_stride]"r"((mips_reg)dst_stride)
+        : "memory"
+    );
+}
+
+static void convolve_vert_mmi(const uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride,
+                              const int16_t *filter_y, int32_t w,
+                              int32_t h)
+{
+    double ftmp[17];
+    uint32_t tmp[1];
+    ptrdiff_t addr = src_stride;
+    src_stride -= w;
+    dst_stride -= w;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+        "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
+        "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
+        "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
+        "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
+        "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
+        "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
+        "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
+        "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
+        "li         %[tmp0],     0x07                      \n\t"
+        "dmtc1      %[tmp0],     %[ftmp13]                 \n\t"
+        "punpcklwd  %[ftmp13],   %[ftmp13],  %[ftmp13]     \n\t"
+        "1:                                                \n\t"
+        /* Get 8 data per column */
+        "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
+        "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
+        PTR_ADDU   "%[tmp0],     %[src],     %[addr]       \n\t"
+        "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
+        "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
+        PTR_ADDIU  "%[width],    %[width],   -0x04         \n\t"
+        /* Get raw data */
+        GET_DATA_V_MMI
+        ROUND_POWER_OF_TWO_MMI(%[srcl], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        ROUND_POWER_OF_TWO_MMI(%[srch], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        "packsswh   %[srcl],     %[srcl],    %[srch]       \n\t"
+        "packushb   %[ftmp12],   %[srcl],    %[ftmp0]      \n\t"
+        "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
+        PTR_ADDIU  "%[dst],      %[dst],      0x04         \n\t"
+        PTR_ADDIU  "%[src],      %[src],      0x04         \n\t"
+        /* Loop count */
+        "bnez       %[width],    1b                        \n\t"
+        PTR_SUBU   "%[width],    %[addr],    %[src_stride] \n\t"
+        PTR_ADDU   "%[src],      %[src],     %[src_stride] \n\t"
+        PTR_ADDU   "%[dst],      %[dst],     %[dst_stride] \n\t"
+        PTR_ADDIU  "%[height],   %[height],  -0x01         \n\t"
+        "bnez       %[height],   1b                        \n\t"
+        : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+          [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+          [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+          [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
+          [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
+          [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
+          [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
+          [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
+          [src]"+&r"(src),          [dst]"+&r"(dst),
+          [width]"+&r"(w),          [height]"+&r"(h),
+          [tmp0]"=&r"(tmp[0]),      [ftmp13]"=&f"(ftmp[16])
+        : [filter]"r"(filter_y),
+          [src_stride]"r"((mips_reg)src_stride),
+          [dst_stride]"r"((mips_reg)dst_stride),
+          [addr]"r"((mips_reg)addr)
+        : "memory"
+    );
+}
+
+static void convolve_avg_horiz_mmi(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const uint16_t *filter_x, int32_t w,
+                                   int32_t h)
+{
+    double ftmp[15];
+    uint32_t tmp[2];
+    src -= 3;
+    src_stride -= w;
+    dst_stride -= w;
+
+    __asm__ volatile (
+        "move       %[tmp1],    %[width]                   \n\t"
+        "xor        %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+        "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
+        "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
+        "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
+        "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
+        "li         %[tmp0],    0x07                       \n\t"
+        "dmtc1      %[tmp0],    %[ftmp13]                  \n\t"
+        "punpcklwd  %[ftmp13],  %[ftmp13],   %[ftmp13]     \n\t"
+        "1:                                                \n\t"
+        /* Get 8 data per row */
+        "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
+        "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
+        "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
+        "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
+        "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
+        "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
+        "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
+        "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
+        PTR_ADDIU  "%[width],   %[width],    -0x04         \n\t"
+        /* Get raw data */
+        GET_DATA_H_MMI
+        ROUND_POWER_OF_TWO_MMI(%[srcl], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        ROUND_POWER_OF_TWO_MMI(%[srch], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        "packsswh   %[srcl],    %[srcl],     %[srch]       \n\t"
+        "packushb   %[ftmp12],  %[srcl],     %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[dst])               \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[dst])               \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],    %[ftmp0]      \n\t"
+        "paddh      %[ftmp12],  %[ftmp12],   %[ftmp4]      \n\t"
+        "li         %[tmp0],    0x10001                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp5]                   \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],    %[ftmp5]      \n\t"
+        "paddh      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
+        "psrah      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
+        "packushb   %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
+        "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
+        PTR_ADDIU  "%[dst],     %[dst],      0x04          \n\t"
+        PTR_ADDIU  "%[src],     %[src],      0x04          \n\t"
+        /* Loop count */
+        "bnez       %[width],   1b                         \n\t"
+        "move       %[width],   %[tmp1]                    \n\t"
+        PTR_ADDU   "%[src],     %[src],      %[src_stride] \n\t"
+        PTR_ADDU   "%[dst],     %[dst],      %[dst_stride] \n\t"
+        PTR_ADDIU  "%[height],  %[height],   -0x01         \n\t"
+        "bnez       %[height],  1b                         \n\t"
+        : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+          [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
+          [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
+          [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
+          [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
+          [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
+          [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
+          [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
+          [src]"+&r"(src),          [width]"+&r"(w),
+          [dst]"+&r"(dst),          [height]"+&r"(h),
+          [ftmp13]"=&f"(ftmp[14])
+        : [filter]"r"(filter_x),
+          [src_stride]"r"((mips_reg)src_stride),
+          [dst_stride]"r"((mips_reg)dst_stride)
+        : "memory"
+    );
+}
+
+static void convolve_avg_vert_mmi(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int16_t *filter_y, int32_t w,
+                                  int32_t h)
+{
+    double ftmp[17];
+    uint32_t tmp[1];
+    ptrdiff_t addr = src_stride;
+    src_stride -= w;
+    dst_stride -= w;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+        "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
+        "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
+        "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
+        "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
+        "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
+        "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
+        "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
+        "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
+        "li         %[tmp0],     0x07                      \n\t"
+        "dmtc1      %[tmp0],     %[ftmp13]                 \n\t"
+        "punpcklwd  %[ftmp13],   %[ftmp13],  %[ftmp13]     \n\t"
+        "1:                                                \n\t"
+        /* Get 8 data per column */
+        "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
+        "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
+        PTR_ADDU   "%[tmp0],     %[src],     %[addr]       \n\t"
+        "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
+        PTR_ADDU   "%[tmp0],     %[tmp0],    %[addr]       \n\t"
+        "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
+        "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
+        "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
+        PTR_ADDIU  "%[width],    %[width],   -0x04         \n\t"
+        /* Get raw data */
+        GET_DATA_V_MMI
+        ROUND_POWER_OF_TWO_MMI(%[srcl], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        ROUND_POWER_OF_TWO_MMI(%[srch], %[ftmp13], %[ftmp5],
+                               %[ftmp6], %[tmp0])
+        "packsswh   %[srcl],     %[srcl],    %[srch]       \n\t"
+        "packushb   %[ftmp12],   %[srcl],    %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
+        "gsldlc1    %[ftmp4],    0x07(%[dst])              \n\t"
+        "gsldrc1    %[ftmp4],    0x00(%[dst])              \n\t"
+        "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+        "paddh      %[ftmp12],   %[ftmp12],  %[ftmp4]      \n\t"
+        "li         %[tmp0],     0x10001                   \n\t"
+        "dmtc1      %[tmp0],     %[ftmp5]                  \n\t"
+        "punpcklhw  %[ftmp5],    %[ftmp5],   %[ftmp5]      \n\t"
+        "paddh      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
+        "psrah      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
+        "packushb   %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
+        "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
+        PTR_ADDIU  "%[dst],      %[dst],     0x04          \n\t"
+        PTR_ADDIU  "%[src],      %[src],     0x04          \n\t"
+        /* Loop count */
+        "bnez       %[width],    1b                        \n\t"
+        PTR_SUBU   "%[width],    %[addr],    %[src_stride] \n\t"
+        PTR_ADDU   "%[src],      %[src],     %[src_stride] \n\t"
+        PTR_ADDU   "%[dst],      %[dst],     %[dst_stride] \n\t"
+        PTR_ADDIU  "%[height],   %[height],  -0x01         \n\t"
+        "bnez       %[height],   1b                        \n\t"
+        : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+          [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+          [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+          [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
+          [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
+          [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
+          [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
+          [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
+          [src]"+&r"(src),          [dst]"+&r"(dst),
+          [width]"+&r"(w),          [height]"+&r"(h),
+          [tmp0]"=&r"(tmp[0]),      [ftmp13]"=&f"(ftmp[16])
+        : [filter]"r"(filter_y),
+          [src_stride]"r"((mips_reg)src_stride),
+          [dst_stride]"r"((mips_reg)dst_stride),
+          [addr]"r"((mips_reg)addr)
+        : "memory"
+    );
+}
+
+static void convolve_avg_mmi(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t w, int32_t h)
+{
+    double ftmp[4];
+    uint32_t tmp[2];
+    src_stride -= w;
+    dst_stride -= w;
+
+    __asm__ volatile (
+        "move       %[tmp1],    %[width]                  \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
+        "li         %[tmp0],    0x10001                   \n\t"
+        "dmtc1      %[tmp0],    %[ftmp3]                  \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]      \n\t"
+        "1:                                               \n\t"
+        "gslwlc1    %[ftmp1],   0x07(%[src])              \n\t"
+        "gslwrc1    %[ftmp1],   0x00(%[src])              \n\t"
+        "gslwlc1    %[ftmp2],   0x07(%[dst])              \n\t"
+        "gslwrc1    %[ftmp2],   0x00(%[dst])              \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]      \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]      \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
+        "swc1       %[ftmp1],   0x00(%[dst])              \n\t"
+        PTR_ADDIU  "%[width],   %[width],   -0x04         \n\t"
+        PTR_ADDIU  "%[dst],     %[dst],     0x04          \n\t"
+        PTR_ADDIU  "%[src],     %[src],     0x04          \n\t"
+        "bnez       %[width],   1b                        \n\t"
+        "move       %[width],   %[tmp1]                   \n\t"
+        PTR_ADDU   "%[dst],     %[dst],     %[dst_stride] \n\t"
+        PTR_ADDU   "%[src],     %[src],     %[src_stride] \n\t"
+        PTR_ADDIU  "%[height],  %[height],  -0x01         \n\t"
+        "bnez       %[height],  1b                        \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),  [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),  [ftmp3]"=&f"(ftmp[3]),
+          [tmp0]"=&r"(tmp[0]),    [tmp1]"=&r"(tmp[1]),
+          [src]"+&r"(src),        [dst]"+&r"(dst),
+          [width]"+&r"(w),        [height]"+&r"(h)
+        : [src_stride]"r"((mips_reg)src_stride),
+          [dst_stride]"r"((mips_reg)dst_stride)
+        : "memory"
+    );
+}
+
+static const int16_t vp9_subpel_filters_mmi[3][15][8] = {
+    [FILTER_8TAP_REGULAR] = {
+         {0, 1, -5, 126, 8, -3, 1, 0},
+         {-1, 3, -10, 122, 18, -6, 2, 0},
+         {-1, 4, -13, 118, 27, -9, 3, -1},
+         {-1, 4, -16, 112, 37, -11, 4, -1},
+         {-1, 5, -18, 105, 48, -14, 4, -1},
+         {-1, 5, -19, 97, 58, -16, 5, -1},
+         {-1, 6, -19, 88, 68, -18, 5, -1},
+         {-1, 6, -19, 78, 78, -19, 6, -1},
+         {-1, 5, -18, 68, 88, -19, 6, -1},
+         {-1, 5, -16, 58, 97, -19, 5, -1},
+         {-1, 4, -14, 48, 105, -18, 5, -1},
+         {-1, 4, -11, 37, 112, -16, 4, -1},
+         {-1, 3, -9, 27, 118, -13, 4, -1},
+         {0, 2, -6, 18, 122, -10, 3, -1},
+         {0, 1, -3, 8, 126, -5, 1, 0},
+    }, [FILTER_8TAP_SHARP] = {
+        {-1, 3, -7, 127, 8, -3, 1, 0},
+        {-2, 5, -13, 125, 17, -6, 3, -1},
+        {-3, 7, -17, 121, 27, -10, 5, -2},
+        {-4, 9, -20, 115, 37, -13, 6, -2},
+        {-4, 10, -23, 108, 48, -16, 8, -3},
+        {-4, 10, -24, 100, 59, -19, 9, -3},
+        {-4, 11, -24, 90, 70, -21, 10, -4},
+        {-4, 11, -23, 80, 80, -23, 11, -4},
+        {-4, 10, -21, 70, 90, -24, 11, -4},
+        {-3, 9, -19, 59, 100, -24, 10, -4},
+        {-3, 8, -16, 48, 108, -23, 10, -4},
+        {-2, 6, -13, 37, 115, -20, 9, -4},
+        {-2, 5, -10, 27, 121, -17, 7, -3},
+        {-1, 3, -6, 17, 125, -13, 5, -2},
+        {0, 1, -3, 8, 127, -7, 3, -1},
+    }, [FILTER_8TAP_SMOOTH] = {
+        {-3, -1, 32, 64, 38, 1, -3, 0},
+        {-2, -2, 29, 63, 41, 2, -3, 0},
+        {-2, -2, 26, 63, 43, 4, -4, 0},
+        {-2, -3, 24, 62, 46, 5, -4, 0},
+        {-2, -3, 21, 60, 49, 7, -4, 0},
+        {-1, -4, 18, 59, 51, 9, -4, 0},
+        {-1, -4, 16, 57, 53, 12, -4, -1},
+        {-1, -4, 14, 55, 55, 14, -4, -1},
+        {-1, -4, 12, 53, 57, 16, -4, -1},
+        {0, -4, 9, 51, 59, 18, -4, -1},
+        {0, -4, 7, 49, 60, 21, -3, -2},
+        {0, -4, 5, 46, 62, 24, -3, -2},
+        {0, -4, 4, 43, 63, 26, -2, -2},
+        {0, -3, 2, 41, 63, 29, -2, -2},
+        {0, -3, 1, 38, 64, 32, -1, -3},
+    }
+};
+
+#define VP9_8TAP_MIPS_MMI_FUNC(SIZE, TYPE, TYPE_IDX)                           \
+void ff_put_8tap_##TYPE##_##SIZE##h_mmi(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int16_t *filter = vp9_subpel_filters_mmi[TYPE_IDX][mx-1];            \
+                                                                               \
+    convolve_horiz_mmi(src, srcstride, dst, dststride, filter, SIZE, h);       \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##TYPE##_##SIZE##v_mmi(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int16_t *filter = vp9_subpel_filters_mmi[TYPE_IDX][my-1];            \
+                                                                               \
+    src -= (3 * srcstride);                                                    \
+    convolve_vert_mmi(src, srcstride, dst, dststride, filter, SIZE, h);        \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##TYPE##_##SIZE##hv_mmi(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const uint16_t *hfilter = vp9_subpel_filters_mmi[TYPE_IDX][mx-1];          \
+    const uint16_t *vfilter = vp9_subpel_filters_mmi[TYPE_IDX][my-1];          \
+                                                                               \
+    int tmp_h = h + 7;                                                         \
+    uint8_t temp[64 * 71];                                                     \
+    src -= (3 * srcstride);                                                    \
+    convolve_horiz_mmi(src, srcstride, temp, 64, hfilter, SIZE, tmp_h);        \
+    convolve_vert_mmi(temp, 64, dst, dststride, vfilter, SIZE, h);             \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##TYPE##_##SIZE##h_mmi(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int16_t *filter = vp9_subpel_filters_mmi[TYPE_IDX][mx-1];            \
+                                                                               \
+    convolve_avg_horiz_mmi(src, srcstride, dst, dststride, filter, SIZE, h);   \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##TYPE##_##SIZE##v_mmi(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int16_t *filter = vp9_subpel_filters_mmi[TYPE_IDX][my-1];            \
+                                                                               \
+    src -= (3 * srcstride);                                                    \
+    convolve_avg_vert_mmi(src, srcstride, dst, dststride, filter, SIZE, h);    \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##TYPE##_##SIZE##hv_mmi(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const uint16_t *hfilter = vp9_subpel_filters_mmi[TYPE_IDX][mx-1];          \
+    const uint16_t *vfilter = vp9_subpel_filters_mmi[TYPE_IDX][my-1];          \
+                                                                               \
+    uint8_t temp1[64 * 64];                                                    \
+    uint8_t temp2[64 * 71];                                                    \
+    int tmp_h = h + 7;                                                         \
+    src -= (3 * srcstride);                                                    \
+    convolve_horiz_mmi(src, srcstride, temp2, 64, hfilter, SIZE, tmp_h);       \
+    convolve_vert_mmi(temp2, 64, temp1, 64, vfilter, SIZE, h);                 \
+    convolve_avg_mmi(temp1, 64, dst, dststride, SIZE, h);                      \
+}
+
+VP9_8TAP_MIPS_MMI_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MMI_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MMI_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+#undef VP9_8TAP_MIPS_MMI_FUNC
diff --git a/libavcodec/mips/vp9_mc_msa.c b/libavcodec/mips/vp9_mc_msa.c
new file mode 100644
index 0000000..749e8cb
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_msa.c
@@ -0,0 +1,4447 @@
+/*
+ * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t vp9_bilinear_filters_msa[15][2] = {
+    {120, 8},
+    {112, 16},
+    {104, 24},
+    {96, 32},
+    {88, 40},
+    {80, 48},
+    {72, 56},
+    {64, 64},
+    {56, 72},
+    {48, 80},
+    {40, 88},
+    {32, 96},
+    {24, 104},
+    {16, 112},
+    {8, 120}
+};
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
+                            filt0, filt1, filt2, filt3)         \
+( {                                                             \
+    v8i16 tmp0, tmp1;                                           \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
+    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
+    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \
+                        filt_h0, filt_h1, filt_h2, filt_h3)              \
+( {                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
+    v8i16 hz_out_m;                                                      \
+                                                                         \
+    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
+               vec0_m, vec1_m, vec2_m, vec3_m);                          \
+    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
+                                   filt_h0, filt_h1, filt_h2, filt_h3);  \
+                                                                         \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                         \
+    hz_out_m;                                                            \
+} )
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1)                              \
+{                                                                           \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
+                                                                            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
+    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
+    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
+    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2, mask3,                \
+                                   filt0, filt1, filt2, filt3,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                res0_m, res1_m, res2_m, res3_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+                res4_m, res5_m, res6_m, res7_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+                 res0_m, res1_m, res2_m, res3_m);                             \
+    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+                 res4_m, res5_m, res6_m, res7_m);                             \
+    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+                res7_m, out0, out1, out2, out3);                              \
+}
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \
+{                                                     \
+    v16u8 tmp_m;                                      \
+                                                      \
+    tmp_m = PCKEV_XORI128_UB(in1, in0);               \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \
+    ST_UB(tmp_m, (pdst));                             \
+}
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
+{                                                             \
+    v16u8 tmp_m;                                              \
+                                                              \
+    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
+    ST_UB(tmp_m, (pdst));                                     \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3,  dst0, dst1,   \
+                           pdst, stride)                      \
+{                                                             \
+    v16u8 tmp0_m, tmp1_m;                                     \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
+                                                              \
+    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);          \
+    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                 \
+}
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+
+        src0 = LD_SB(src + 32);
+        src2 = LD_SB(src + 48);
+        src3 = LD_SB(src + 56);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst + 32);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                     filt1, filt2, filt3);
+        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                     filt1, filt2, filt3);
+        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                     filt1, filt2, filt3);
+        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                     filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      int32_t width)
+{
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              64);
+}
+
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
+        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out5 = hz_out9;
+        out0 = out2;
+        out1 = out3;
+        out2 = out4;
+    }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
+                                   filt_vt1, filt_vt2, filt_vt3);
+
+        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
+        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out6 = hz_out10;
+        out0 = out2;
+        out1 = out3;
+        out2 = out8;
+        out4 = out6;
+        out5 = out7;
+        out6 = out9;
+    }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 8; multiple8_cnt--;) {
+        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst0, res;
+    v16u8 mask0, mask1, mask2, mask3;
+    v8i16 filt, res0, res1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, res0, res1);
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    SRARI_H2_SH(res0, res1, 7);
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+    res = (v16u8) __msa_aver_u_b(res, dst0);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+    v16u8 dst0, dst1;
+    v8i16 filt, vec0, vec1, vec2, vec3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
+    SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
+    SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                res0, res1, res2, res3);
+    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+    XORI_B2_128_UB(res0, res2);
+    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
+    ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else if (8 == height) {
+        common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    int32_t loop_cnt;
+    int64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        src += (2 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                   vec12);
+        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                   vec13);
+        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                   vec14);
+        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                   vec15);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                    vec9, vec10, vec11);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                     vec1, vec2, vec3);
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                     vec8, vec9, vec10, vec11);
+        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                    out1, out2, out3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+        dst += dst_stride;
+        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                   vec12);
+        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                   vec13);
+        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                   vec14);
+        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                   vec15);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                    vec9, vec10, vec11);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                     vec1, vec2, vec3);
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                     vec8, vec9, vec10, vec11);
+        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                    out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        LD_UB2(dst, 16, dst1, dst2);
+        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        for (cnt = 0; cnt < 2; ++cnt) {
+            src0 = LD_SB(&src[cnt << 5]);
+            src2 = LD_SB(&src[16 + (cnt << 5)]);
+            src3 = LD_SB(&src[24 + (cnt << 5)]);
+            src1 = __msa_sldi_b(src2, src0, 8);
+
+            XORI_B4_128_SB(src0, src1, src2, src3);
+            VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                       vec12);
+            VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                       vec13);
+            VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
+                       vec10, vec14);
+            VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
+                       vec11, vec15);
+            DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                        vec0, vec1, vec2, vec3);
+            DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
+                        vec8, vec9, vec10, vec11);
+            DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                         vec0, vec1, vec2, vec3);
+            DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                         vec8, vec9, vec10, vec11);
+            ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                        out1, out2, out3);
+            SRARI_H4_SH(out0, out1, out2, out3, 7);
+            SAT_SH4_SH(out0, out1, out2, out3, 7);
+            LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+            PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+            PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    uint32_t loop_cnt;
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, out;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v8i16 filt, out10, out32;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        out = __msa_aver_u_b(out, dst0);
+
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter,
+                                                   int32_t height,
+                                                   int32_t width)
+{
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+
+            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                        dst0, dst1, dst2, dst3);
+            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 64);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter_horiz,
+                                                  const int8_t *filter_vert,
+                                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, res, mask0, mask1, mask2, mask3;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
+        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        SRARI_H2_SH(res0, res1, 7);
+        SAT_SH2_SH(res0, res1, 7);
+        res = PCKEV_XORI128_UB(res0, res1);
+        res = (v16u8) __msa_aver_u_b(res, dst0);
+        ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out5 = hz_out9;
+        vec0 = vec2;
+        vec1 = vec3;
+        vec2 = vec4;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter_horiz,
+                                                  const int8_t *filter_vert,
+                                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+    v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
+        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out6 = hz_out10;
+        out0 = out2;
+        out1 = out3;
+        out2 = out8;
+        out4 = out6;
+        out5 = out7;
+        out6 = out9;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 8; multiple8_cnt--;) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    loop_cnt = (height >> 2) - 1;
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, 7);
+    SRARI_H4_UH(out4, out5, out6, out7, 7);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+
+    for (; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src5 = __msa_sldi_b(src6, src4, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        PCKEV_ST_SB(out2, out3, dst + 16);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        PCKEV_ST_SB(out6, out7, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src4 = LD_SB(src + 32);
+        src6 = LD_SB(src + 48);
+        src7 = LD_SB(src + 56);
+        SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        PCKEV_ST_SB(out2, out3, dst + 16);
+        PCKEV_ST_SB(out4, out5, dst + 32);
+        PCKEV_ST_SB(out6, out7, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    src8 = LD_SB(src);
+    src += src_stride;
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
+void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src5 = LD_UB(src + 16);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+        src += (4 * src_stride);
+
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+        src5 = src9;
+    }
+}
+
+void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB4(src, 16, src0, src3, src6, src9);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        LD_UB2(src + 16, src_stride, src4, src5);
+        LD_UB2(src + 32, src_stride, src7, src8);
+        LD_UB2(src + 48, src_stride, src10, src11);
+        src += (2 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src3 = src5;
+        src6 = src8;
+        src9 = src11;
+    }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else if (8 == height) {
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_horiz, const int8_t *filter_vert,
+                                   int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, 7);
+        SAT_UH2_UH(tmp3, tmp4, 7);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert, height);
+    }
+}
+
+void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, vec0, vec1, res;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+
+    res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
+    res = (v16u8) __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v16u8 dst0, dst1;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+                vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+                res2, res3);
+    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+    AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
+    ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else if (8 == height) {
+        common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    int64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter,
+                                                  int32_t height)
+{
+    int64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
+    }
+}
+
+void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else {
+        common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                              filter, height);
+    }
+}
+
+void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, 7);
+    SRARI_H4_UH(res4, res5, res6, res7, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+    dst += dst_stride;
+
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
+                    res1, res2, res3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
+                    res5, res6, res7);
+        SRARI_H4_UH(res0, res1, res2, res3, 7);
+        SRARI_H4_UH(res4, res5, res6, res7, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src5 = __msa_sldi_b(src6, src4, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    res0, res1, res2, res3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    res4, res5, res6, res7);
+        SRARI_H4_UH(res0, res1, res2, res3, 7);
+        SRARI_H4_UH(res4, res5, res6, res7, 7);
+        LD_UB2(dst, 16, dst0, dst1);
+        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+        PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+        dst += dst_stride;
+        LD_UB2(dst, 16, dst2, dst3);
+        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+        PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src, 16, src0, src2, src4, src6);
+        src7 = LD_SB(src + 56);
+        SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+        PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+        PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+        PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, out, filt0, src2110, src4332;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    src4 = LD_SB(src);
+    src += src_stride;
+
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    out = __msa_aver_u_b(out, dst0);
+
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 dst0, dst1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16u8 src2110, src4332, src6554, src8776, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+    ST4x8_UB(src2110, src4332, dst, dst_stride);
+}
+
+void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else if (8 == height) {
+        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    int64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter,
+                                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    int64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst2);
+        INSERT_D2_UB(tp2, tp3, dst3);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else {
+        common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                              filter, height);
+    }
+}
+
+void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+    /* rearranging filter_y */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
+void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+    /* rearranging filter_y */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_UB2(src, 16, src0, src5);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+        LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+        src += (4 * src_stride);
+
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+
+        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+        src5 = src9;
+    }
+}
+
+void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5;
+    v16u8 src6, src7, src8, src9, src10, src11, filt0;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8u16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_UB4(src, 16, src0, src3, src6, src9);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        LD_UB2(src + 16, src_stride, src4, src5);
+        LD_UB2(dst + 16, dst_stride, dst2, dst3);
+        LD_UB2(src + 32, src_stride, src7, src8);
+        LD_UB2(dst + 32, dst_stride, dst4, dst5);
+        LD_UB2(src + 48, src_stride, src10, src11);
+        LD_UB2(dst + 48, dst_stride, dst6, dst7);
+        src += (2 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+
+        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src3 = src5;
+        src6 = src8;
+        src9 = src11;
+    }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v16u8 dst0, out;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    out = __msa_aver_u_b(out, dst0);
+
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+    v16u8 dst0, dst1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST4x8_UB(res0, res1, dst, dst_stride);
+}
+
+void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert);
+    } else if (8 == height) {
+        common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       const int8_t *filter_horiz,
+                                                       const int8_t *filter_vert,
+                                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t tp0, tp1, tp2, tp3;
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert);
+    } else {
+        common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride,
+                                                   dst, dst_stride,
+                                                   filter_horiz, filter_vert,
+                                                   height);
+    }
+}
+
+void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD4(src, src_stride, out0, out1, out2, out3);
+            src += (4 * src_stride);
+            LD4(src, src_stride, out4, out5, out6, out7);
+            src += (4 * src_stride);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD4(src, src_stride, out0, out1, out2, out3);
+            src += (4 * src_stride);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (8 == height) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    } else if (16 == height) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        dst += (8 * dst_stride);
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    } else if (32 == height) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        dst += (8 * dst_stride);
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        dst += (8 * dst_stride);
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+        dst += (8 * dst_stride);
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 8) {
+        for (cnt = (height >> 3); cnt--;) {
+            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+            LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
+                   src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
+                   dst_stride);
+            dst += (8 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, 16, src0, src1, src2, src3);
+        src += src_stride;
+        LD_UB4(src, 16, src4, src5, src6, src7);
+        src += src_stride;
+        LD_UB4(src, 16, src8, src9, src10, src11);
+        src += src_stride;
+        LD_UB4(src, 16, src12, src13, src14, src15);
+        src += src_stride;
+
+        ST_UB4(src0, src1, src2, src3, dst, 16);
+        dst += dst_stride;
+        ST_UB4(src4, src5, src6, src7, dst, 16);
+        dst += dst_stride;
+        ST_UB4(src8, src9, src10, src11, dst, 16);
+        dst += dst_stride;
+        ST_UB4(src12, src13, src14, src15, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
+
+    if (8 == height) {
+        LW4(src, src_stride, tp0, tp1, tp2, tp3);
+        src += 4 * src_stride;
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+        LW4(src, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
+        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+        LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+    } else if (4 == height) {
+        LW4(src, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+        dst0 = __msa_aver_u_b(src0, dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    if (0 == (height % 8)) {
+        for (cnt = (height >> 3); cnt--;) {
+            LD4(src, src_stride, tp0, tp1, tp2, tp3);
+            src += 4 * src_stride;
+            LD4(src, src_stride, tp4, tp5, tp6, tp7);
+            src += 4 * src_stride;
+            INSERT_D2_UB(tp0, tp1, src0);
+            INSERT_D2_UB(tp2, tp3, src1);
+            INSERT_D2_UB(tp4, tp5, src2);
+            INSERT_D2_UB(tp6, tp7, src3);
+            LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+            LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
+            INSERT_D2_UB(tp0, tp1, dst0);
+            INSERT_D2_UB(tp2, tp3, dst1);
+            INSERT_D2_UB(tp4, tp5, dst2);
+            INSERT_D2_UB(tp6, tp7, dst3);
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
+                        dst1, dst2, dst3);
+            ST8x8_UB(dst0, dst1, dst2, dst3, dst, dst_stride);
+            dst += 8 * dst_stride;
+        }
+    } else if (4 == height) {
+        LD4(src, src_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, src0);
+        INSERT_D2_UB(tp2, tp3, src1);
+        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+        INSERT_D2_UB(tp0, tp1, dst0);
+        INSERT_D2_UB(tp2, tp3, dst1);
+        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    if (0 == (height % 8)) {
+        for (cnt = (height / 8); cnt--;) {
+            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                        dst4, dst5, dst6, dst7);
+            ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    } else if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint8_t *dst_dup = dst;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+    if (0 == (height % 8)) {
+        for (cnt = (height / 8); cnt--;) {
+            LD_UB4(src, src_stride, src0, src2, src4, src6);
+            LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+            src += (4 * src_stride);
+            LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+            LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+            dst_dup += (4 * dst_stride);
+            LD_UB4(src, src_stride, src8, src10, src12, src14);
+            LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+            src += (4 * src_stride);
+            LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+            LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+            dst_dup += (4 * dst_stride);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                        dst4, dst5, dst6, dst7);
+            AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                        dst8, dst9, dst10, dst11);
+            AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                        dst12, dst13, dst14, dst15);
+
+            ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+            ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+            ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+            ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src2, src4, src6);
+            LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+            src += (4 * src_stride);
+            LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+            LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+            dst_dup += (4 * dst_stride);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+            AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                        dst4, dst5, dst6, dst7);
+
+            ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+            ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint8_t *dst_dup = dst;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, 16, src0, src1, src2, src3);
+        src += src_stride;
+        LD_UB4(src, 16, src4, src5, src6, src7);
+        src += src_stride;
+        LD_UB4(src, 16, src8, src9, src10, src11);
+        src += src_stride;
+        LD_UB4(src, 16, src12, src13, src14, src15);
+        src += src_stride;
+
+        LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+        dst_dup += dst_stride;
+        LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+        dst_dup += dst_stride;
+        LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+        dst_dup += dst_stride;
+        LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+        dst_dup += dst_stride;
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                    dst8, dst9, dst10, dst11);
+        AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                    dst12, dst13, dst14, dst15);
+
+        ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+        dst += dst_stride;
+        ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+        dst += dst_stride;
+        ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+        dst += dst_stride;
+        ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static const int8_t vp9_subpel_filters_msa[3][15][8] = {
+    [FILTER_8TAP_REGULAR] = {
+         {0, 1, -5, 126, 8, -3, 1, 0},
+         {-1, 3, -10, 122, 18, -6, 2, 0},
+         {-1, 4, -13, 118, 27, -9, 3, -1},
+         {-1, 4, -16, 112, 37, -11, 4, -1},
+         {-1, 5, -18, 105, 48, -14, 4, -1},
+         {-1, 5, -19, 97, 58, -16, 5, -1},
+         {-1, 6, -19, 88, 68, -18, 5, -1},
+         {-1, 6, -19, 78, 78, -19, 6, -1},
+         {-1, 5, -18, 68, 88, -19, 6, -1},
+         {-1, 5, -16, 58, 97, -19, 5, -1},
+         {-1, 4, -14, 48, 105, -18, 5, -1},
+         {-1, 4, -11, 37, 112, -16, 4, -1},
+         {-1, 3, -9, 27, 118, -13, 4, -1},
+         {0, 2, -6, 18, 122, -10, 3, -1},
+         {0, 1, -3, 8, 126, -5, 1, 0},
+    }, [FILTER_8TAP_SHARP] = {
+        {-1, 3, -7, 127, 8, -3, 1, 0},
+        {-2, 5, -13, 125, 17, -6, 3, -1},
+        {-3, 7, -17, 121, 27, -10, 5, -2},
+        {-4, 9, -20, 115, 37, -13, 6, -2},
+        {-4, 10, -23, 108, 48, -16, 8, -3},
+        {-4, 10, -24, 100, 59, -19, 9, -3},
+        {-4, 11, -24, 90, 70, -21, 10, -4},
+        {-4, 11, -23, 80, 80, -23, 11, -4},
+        {-4, 10, -21, 70, 90, -24, 11, -4},
+        {-3, 9, -19, 59, 100, -24, 10, -4},
+        {-3, 8, -16, 48, 108, -23, 10, -4},
+        {-2, 6, -13, 37, 115, -20, 9, -4},
+        {-2, 5, -10, 27, 121, -17, 7, -3},
+        {-1, 3, -6, 17, 125, -13, 5, -2},
+        {0, 1, -3, 8, 127, -7, 3, -1},
+    }, [FILTER_8TAP_SMOOTH] = {
+        {-3, -1, 32, 64, 38, 1, -3, 0},
+        {-2, -2, 29, 63, 41, 2, -3, 0},
+        {-2, -2, 26, 63, 43, 4, -4, 0},
+        {-2, -3, 24, 62, 46, 5, -4, 0},
+        {-2, -3, 21, 60, 49, 7, -4, 0},
+        {-1, -4, 18, 59, 51, 9, -4, 0},
+        {-1, -4, 16, 57, 53, 12, -4, -1},
+        {-1, -4, 14, 55, 55, 14, -4, -1},
+        {-1, -4, 12, 53, 57, 16, -4, -1},
+        {0, -4, 9, 51, 59, 18, -4, -1},
+        {0, -4, 7, 49, 60, 21, -3, -2},
+        {0, -4, 5, 46, 62, 24, -3, -2},
+        {0, -4, 4, 43, 63, 26, -2, -2},
+        {0, -3, 2, 41, 63, 29, -2, -2},
+        {0, -3, 1, 38, 64, 32, -1, -3},
+    }
+};
+
+#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \
+void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
+                                                                               \
+    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
+                                                                               \
+    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
+    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
+                                                                               \
+    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter,   \
+                                    vfilter, h);                               \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
+                                                                               \
+    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,               \
+                                            dststride, filter, h);             \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
+                                                                               \
+    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
+                                            filter, h);                        \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
+    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
+                                                                               \
+    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \
+                                                 dststride, hfilter,           \
+                                                 vfilter, h);                  \
+}
+
+#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
+void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my)                    \
+{                                                                  \
+                                                                   \
+    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
+}                                                                  \
+                                                                   \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my)                     \
+{                                                                  \
+                                                                   \
+    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);      \
+}
+
+#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                               \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                        const uint8_t *src, ptrdiff_t srcstride,  \
+                        int h, int mx, int my)                    \
+{                                                                 \
+                                                                  \
+    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
+}
+
+VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+VP9_COPY_AVG_MIPS_MSA_FUNC(64);
+VP9_COPY_AVG_MIPS_MSA_FUNC(32);
+VP9_COPY_AVG_MIPS_MSA_FUNC(16);
+VP9_COPY_AVG_MIPS_MSA_FUNC(8);
+VP9_AVG_MIPS_MSA_FUNC(4);
+
+#undef VP9_8TAP_MIPS_MSA_FUNC
+#undef VP9_COPY_AVG_MIPS_MSA_FUNC
+#undef VP9_AVG_MIPS_MSA_FUNC
diff --git a/libavcodec/mips/vp9dsp_init_mips.c b/libavcodec/mips/vp9dsp_init_mips.c
new file mode 100644
index 0000000..5990fa6
--- /dev/null
+++ b/libavcodec/mips/vp9dsp_init_mips.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/common.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void vp9dsp_intrapred_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_intra_pred_msa(tx, sz)                             \
+    dsp->intra_pred[tx][VERT_PRED]    = ff_vert_##sz##_msa;     \
+    dsp->intra_pred[tx][HOR_PRED]     = ff_hor_##sz##_msa;      \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_msa;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_128_PRED]  = ff_dc_128_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_127_PRED]  = ff_dc_127_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_129_PRED]  = ff_dc_129_##sz##_msa;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_msa;       \
+
+    init_intra_pred_msa(TX_16X16, 16x16);
+    init_intra_pred_msa(TX_32X32, 32x32);
+#undef init_intra_pred_msa
+
+#define init_intra_pred_msa(tx, sz)                             \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_msa;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_msa;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_msa;       \
+
+    init_intra_pred_msa(TX_4X4, 4x4);
+    init_intra_pred_msa(TX_8X8, 8x8);
+#undef init_intra_pred_msa
+    }
+}
+
+static av_cold void vp9dsp_itxfm_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_itxfm(tx, sz)                                         \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_idct_idct_##sz##_add_msa;   \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_iadst_idct_##sz##_add_msa;  \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_idct_iadst_##sz##_add_msa;  \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_iadst_iadst_##sz##_add_msa  \
+
+#define init_idct(tx, nm)                        \
+    dsp->itxfm_add[tx][DCT_DCT]   =              \
+    dsp->itxfm_add[tx][ADST_DCT]  =              \
+    dsp->itxfm_add[tx][DCT_ADST]  =              \
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_msa
+
+    init_itxfm(TX_4X4, 4x4);
+    init_itxfm(TX_8X8, 8x8);
+    init_itxfm(TX_16X16, 16x16);
+    init_idct(TX_32X32, ff_idct_idct_32x32);
+#undef init_itxfm
+#undef init_idct
+    }
+}
+
+static av_cold void vp9dsp_mc_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_fpel(idx1, idx2, sz, type)                                    \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_msa
+
+#define init_copy_avg(idx, sz)    \
+    init_fpel(idx, 0, sz, copy);  \
+    init_fpel(idx, 1, sz, avg)
+
+#define init_avg(idx, sz)  \
+    init_fpel(idx, 1, sz, avg)
+
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_avg(4,  4);
+
+#undef init_copy_avg
+#undef init_avg
+#undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
+        ff_##type##_bilin_##sz##dir##_msa;                   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_msa;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_msa;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_msa;
+
+#define init_subpel2(idx, idxh, idxv, dir, type)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type)         \
+    init_subpel2(idx, 1, 1, hv, type);  \
+    init_subpel2(idx, 0, 1, v, type);   \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+    }
+}
+
+static av_cold void vp9dsp_loopfilter_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+        dsp->loop_filter_8[0][0] = ff_loop_filter_h_4_8_msa;
+        dsp->loop_filter_8[0][1] = ff_loop_filter_v_4_8_msa;
+        dsp->loop_filter_8[1][0] = ff_loop_filter_h_8_8_msa;
+        dsp->loop_filter_8[1][1] = ff_loop_filter_v_8_8_msa;
+        dsp->loop_filter_8[2][0] = ff_loop_filter_h_16_8_msa;
+        dsp->loop_filter_8[2][1] = ff_loop_filter_v_16_8_msa;
+
+        dsp->loop_filter_16[0] = ff_loop_filter_h_16_16_msa;
+        dsp->loop_filter_16[1] = ff_loop_filter_v_16_16_msa;
+
+        dsp->loop_filter_mix2[0][0][0] = ff_loop_filter_h_44_16_msa;
+        dsp->loop_filter_mix2[0][0][1] = ff_loop_filter_v_44_16_msa;
+        dsp->loop_filter_mix2[0][1][0] = ff_loop_filter_h_48_16_msa;
+        dsp->loop_filter_mix2[0][1][1] = ff_loop_filter_v_48_16_msa;
+        dsp->loop_filter_mix2[1][0][0] = ff_loop_filter_h_84_16_msa;
+        dsp->loop_filter_mix2[1][0][1] = ff_loop_filter_v_84_16_msa;
+        dsp->loop_filter_mix2[1][1][0] = ff_loop_filter_h_88_16_msa;
+        dsp->loop_filter_mix2[1][1][1] = ff_loop_filter_v_88_16_msa;
+    }
+}
+
+static av_cold void vp9dsp_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    vp9dsp_intrapred_init_msa(dsp, bpp);
+    vp9dsp_itxfm_init_msa(dsp, bpp);
+    vp9dsp_mc_init_msa(dsp, bpp);
+    vp9dsp_loopfilter_init_msa(dsp, bpp);
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void vp9dsp_mc_init_mmi(VP9DSPContext *dsp)
+{
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_mmi;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_mmi;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_mmi;
+
+#define init_subpel2(idx, idxh, idxv, dir, type)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type)         \
+    init_subpel2(idx, 1, 1, hv, type);  \
+    init_subpel2(idx, 0, 1, v, type);   \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+}
+
+static av_cold void vp9dsp_init_mmi(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+        vp9dsp_mc_init_mmi(dsp);
+    }
+}
+#endif  // #if HAVE_MMI
+
+av_cold void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_MMI
+    vp9dsp_init_mmi(dsp, bpp);
+#endif  // #if HAVE_MMI
+#if HAVE_MSA
+    vp9dsp_init_msa(dsp, bpp);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/vp9dsp_mips.h b/libavcodec/mips/vp9dsp_mips.h
new file mode 100644
index 0000000..0b6ce7c
--- /dev/null
+++ b/libavcodec/mips/vp9dsp_mips.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
+#define AVCODEC_MIPS_VP9DSP_MIPS_H
+
+#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                         \
+void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);             \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);
+
+#define VP9_BILINEAR_MIPS_MSA_FUNC(SIZE)                                   \
+void ff_put_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_put_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_put_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                                 const uint8_t *src, ptrdiff_t srcstride,  \
+                                 int h, int mx, int my);                   \
+                                                                           \
+void ff_avg_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_avg_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_avg_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                                 const uint8_t *src, ptrdiff_t srcstride,  \
+                                 int h, int mx, int my);
+
+#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
+void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my);                   \
+                                                                   \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+VP9_BILINEAR_MIPS_MSA_FUNC(64);
+VP9_BILINEAR_MIPS_MSA_FUNC(32);
+VP9_BILINEAR_MIPS_MSA_FUNC(16);
+VP9_BILINEAR_MIPS_MSA_FUNC(8);
+VP9_BILINEAR_MIPS_MSA_FUNC(4);
+
+VP9_COPY_AVG_MIPS_MSA_FUNC(64);
+VP9_COPY_AVG_MIPS_MSA_FUNC(32);
+VP9_COPY_AVG_MIPS_MSA_FUNC(16);
+VP9_COPY_AVG_MIPS_MSA_FUNC(8);
+VP9_COPY_AVG_MIPS_MSA_FUNC(4);
+
+#undef VP9_8TAP_MIPS_MSA_FUNC
+#undef VP9_BILINEAR_MIPS_MSA_FUNC
+#undef VP9_COPY_AVG_MIPS_MSA_FUNC
+
+void ff_loop_filter_h_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_v_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_h_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                  int16_t *block, int eob);
+void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob);
+void ff_idct_iadst_4x4_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_idct_iadst_8x8_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_idct_iadst_16x16_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
+                                 int16_t *block, int eob);
+void ff_iwht_iwht_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+
+#define VP9_8TAP_MIPS_MMI_FUNC(SIZE, type, type_idx)                         \
+void ff_put_8tap_##type##_##SIZE##h_mmi(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##v_mmi(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##hv_mmi(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);             \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##h_mmi(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##v_mmi(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##hv_mmi(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);
+
+VP9_8TAP_MIPS_MMI_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MMI_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MMI_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MMI_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MMI_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MMI_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+#undef VP9_8TAP_MIPS_MMI_FUNC
+
+#endif  // #ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
diff --git a/libavcodec/mips/wmv2dsp_init_mips.c b/libavcodec/mips/wmv2dsp_init_mips.c
new file mode 100644
index 0000000..51dd207
--- /dev/null
+++ b/libavcodec/mips/wmv2dsp_init_mips.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "wmv2dsp_mips.h"
+
+#if HAVE_MMI
+static av_cold void wmv2dsp_init_mmi(WMV2DSPContext *c)
+{
+    c->idct_add  = ff_wmv2_idct_add_mmi;
+    c->idct_put  = ff_wmv2_idct_put_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_wmv2dsp_init_mips(WMV2DSPContext *c)
+{
+#if HAVE_MMI
+    wmv2dsp_init_mmi(c);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/wmv2dsp_mips.h b/libavcodec/mips/wmv2dsp_mips.h
new file mode 100644
index 0000000..22894c5
--- /dev/null
+++ b/libavcodec/mips/wmv2dsp_mips.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_WMV2DSP_MIPS_H
+#define AVCODEC_MIPS_WMV2DSP_MIPS_H
+
+#include "libavcodec/wmv2dsp.h"
+
+void ff_wmv2_idct_add_mmi(uint8_t *dest, int line_size, int16_t *block);
+void ff_wmv2_idct_put_mmi(uint8_t *dest, int line_size, int16_t *block);
+
+#endif /* AVCODEC_MIPS_WMV2DSP_MIPS_H */
diff --git a/libavcodec/mips/wmv2dsp_mmi.c b/libavcodec/mips/wmv2dsp_mmi.c
new file mode 100644
index 0000000..1f6ccb2
--- /dev/null
+++ b/libavcodec/mips/wmv2dsp_mmi.c
@@ -0,0 +1,278 @@
+/*
+ * WMV2 - DSP functions Loongson MMI-optimized
+ *
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "constants.h"
+#include "wmv2dsp_mips.h"
+#include "libavutil/mips/mmiutils.h"
+
+#define W0 2048
+#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
+#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
+#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
+#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
+#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
+#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
+#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
+
+static void wmv2_idct_row_mmi(short * b)
+{
+    int s1, s2;
+    int a0, a1, a2, a3, a4, a5, a6, a7;
+
+    /* step 1 */
+    a0 = W0 * b[0] + W0 * b[4];
+    a1 = W1 * b[1] + W7 * b[7];
+    a2 = W2 * b[2] + W6 * b[6];
+    a3 = W3 * b[5] - W5 * b[3];
+    a4 = W0 * b[0] - W0 * b[4];
+    a5 = W5 * b[5] + W3 * b[3];
+    a6 = W6 * b[2] - W2 * b[6];
+    a7 = W7 * b[1] - W1 * b[7];
+
+    /* step 2 */
+    s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8; // 1, 3, 5, 7
+    s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;
+
+    /* step 3 */
+    b[0] = (a0 + a2 + a1 + a5 + 128) >> 8;
+    b[1] = (a4 + a6 + s1      + 128) >> 8;
+    b[2] = (a4 - a6 + s2      + 128) >> 8;
+    b[3] = (a0 - a2 + a7 + a3 + 128) >> 8;
+    b[4] = (a0 - a2 - a7 - a3 + 128) >> 8;
+    b[5] = (a4 - a6 - s2      + 128) >> 8;
+    b[6] = (a4 + a6 - s1      + 128) >> 8;
+    b[7] = (a0 + a2 - a1 - a5 + 128) >> 8;
+}
+
+static void wmv2_idct_col_mmi(short * b)
+{
+    int s1, s2;
+    int a0, a1, a2, a3, a4, a5, a6, a7;
+
+    /* step 1, with extended precision */
+    a0 = (W0 * b[ 0] + W0 * b[32]    ) >> 3;
+    a1 = (W1 * b[ 8] + W7 * b[56] + 4) >> 3;
+    a2 = (W2 * b[16] + W6 * b[48] + 4) >> 3;
+    a3 = (W3 * b[40] - W5 * b[24] + 4) >> 3;
+    a4 = (W0 * b[ 0] - W0 * b[32]    ) >> 3;
+    a5 = (W5 * b[40] + W3 * b[24] + 4) >> 3;
+    a6 = (W6 * b[16] - W2 * b[48] + 4) >> 3;
+    a7 = (W7 * b[ 8] - W1 * b[56] + 4) >> 3;
+
+    /* step 2 */
+    s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8;
+    s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;
+
+    /* step 3 */
+    b[ 0] = (a0 + a2 + a1 + a5 + 8192) >> 14;
+    b[ 8] = (a4 + a6 + s1      + 8192) >> 14;
+    b[16] = (a4 - a6 + s2      + 8192) >> 14;
+    b[24] = (a0 - a2 + a7 + a3 + 8192) >> 14;
+
+    b[32] = (a0 - a2 - a7 - a3 + 8192) >> 14;
+    b[40] = (a4 - a6 - s2      + 8192) >> 14;
+    b[48] = (a4 + a6 - s1      + 8192) >> 14;
+    b[56] = (a0 + a2 - a1 - a5 + 8192) >> 14;
+}
+
+void ff_wmv2_idct_add_mmi(uint8_t *dest, int line_size, int16_t *block)
+{
+    int i;
+    double ftmp[11];
+
+    for (i = 0; i < 64; i += 8)
+        wmv2_idct_row_mmi(block + i);
+    for (i = 0; i < 8; i++)
+        wmv2_idct_col_mmi(block + i);
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
+
+        // low 4 loop
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp2], %[block], 0x08)
+        MMI_LDC1(%[ftmp3], %[block], 0x10)
+        MMI_LDC1(%[ftmp4], %[block], 0x18)
+        MMI_LDC1(%[ftmp5], %[block], 0x20)
+        MMI_LDC1(%[ftmp6], %[block], 0x28)
+        MMI_LDC1(%[ftmp7], %[block], 0x30)
+        MMI_LDC1(%[ftmp8], %[block], 0x38)
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
+        MMI_SDC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp4]                    \n\t"
+        MMI_SDC1(%[ftmp3], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp6]                    \n\t"
+        MMI_SDC1(%[ftmp5], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp8]                    \n\t"
+        MMI_SDC1(%[ftmp7], %[dest], 0x00)
+
+        PTR_ADDIU  "%[block],   %[block],   0x40                        \n\t"
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        // high 4 loop
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp2], %[block], 0x08)
+        MMI_LDC1(%[ftmp3], %[block], 0x10)
+        MMI_LDC1(%[ftmp4], %[block], 0x18)
+        MMI_LDC1(%[ftmp5], %[block], 0x20)
+        MMI_LDC1(%[ftmp6], %[block], 0x28)
+        MMI_LDC1(%[ftmp7], %[block], 0x30)
+        MMI_LDC1(%[ftmp8], %[block], 0x38)
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
+        MMI_SDC1(%[ftmp1], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp4]                    \n\t"
+        MMI_SDC1(%[ftmp3], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp6]                    \n\t"
+        MMI_SDC1(%[ftmp5], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        MMI_LDC1(%[ftmp9], %[dest], 0x00)
+        "punpckhbh  %[ftmp10],  %[ftmp9],   %[ftmp0]                    \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp9]                    \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp10]                   \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp8]                    \n\t"
+        MMI_SDC1(%[ftmp7], %[dest], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [block]"+&r"(block),              [dest]"+&r"(dest)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_wmv2_idct_put_mmi(uint8_t *dest, int line_size, int16_t *block)
+{
+    int i;
+    double ftmp[8];
+
+    for (i = 0; i < 64; i += 8)
+        wmv2_idct_row_mmi(block + i);
+    for (i = 0; i < 8; i++)
+        wmv2_idct_col_mmi(block + i);
+
+    __asm__ volatile (
+        // low 4 loop
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        MMI_LDC1(%[ftmp4], %[block], 0x20)
+        MMI_LDC1(%[ftmp5], %[block], 0x28)
+        MMI_LDC1(%[ftmp6], %[block], 0x30)
+        MMI_LDC1(%[ftmp7], %[block], 0x38)
+        "packushb   %[ftmp0],   %[ftmp0],   %[ftmp1]                    \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp3]                    \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp5]                    \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp7]                    \n\t"
+        MMI_SDC1(%[ftmp0], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+        MMI_SDC1(%[ftmp2], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+        MMI_SDC1(%[ftmp4], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+        MMI_SDC1(%[ftmp6], %[dest], 0x00)
+
+        PTR_ADDIU  "%[block],   %[block],   0x40                        \n\t"
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+
+        // high 4 loop
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        MMI_LDC1(%[ftmp4], %[block], 0x20)
+        MMI_LDC1(%[ftmp5], %[block], 0x28)
+        MMI_LDC1(%[ftmp6], %[block], 0x30)
+        MMI_LDC1(%[ftmp7], %[block], 0x38)
+        "packushb   %[ftmp0],   %[ftmp0],   %[ftmp1]                    \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp3]                    \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp5]                    \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp7]                    \n\t"
+        MMI_SDC1(%[ftmp0], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+        MMI_SDC1(%[ftmp2], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+        MMI_SDC1(%[ftmp4], %[dest], 0x00)
+        PTR_ADDU   "%[dest],    %[dest],    %[line_size]                \n\t"
+        MMI_SDC1(%[ftmp6], %[dest], 0x00)
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [block]"+&r"(block),              [dest]"+&r"(dest)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/xvid_idct_mmi.c b/libavcodec/mips/xvid_idct_mmi.c
new file mode 100644
index 0000000..d3f9acb
--- /dev/null
+++ b/libavcodec/mips/xvid_idct_mmi.c
@@ -0,0 +1,253 @@
+/*
+ * Loongson SIMD optimized xvid idct
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "xvididct_mips.h"
+
+#define BITS_INV_ACC    5                           // 4 or 5 for IEEE
+#define SHIFT_INV_ROW   (16 - BITS_INV_ACC)         //11
+#define SHIFT_INV_COL   (1 + BITS_INV_ACC)          //6
+#define RND_INV_ROW     (1024 * (6 - BITS_INV_ACC))
+#define RND_INV_COL     (16 * (BITS_INV_ACC - 3))
+#define RND_INV_CORR    (RND_INV_COL - 1)
+
+#define BITS_FRW_ACC    3                           // 2 or 3 for accuracy
+#define SHIFT_FRW_COL   BITS_FRW_ACC
+#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)
+#define RND_FRW_ROW     (262144*(BITS_FRW_ACC - 1))
+
+DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
+     13036, 13036, 13036, 13036,    //  tg * (2<<16) + 0.5
+     27146, 27146, 27146, 27146,    //  tg * (2<<16) + 0.5
+    -21746,-21746,-21746,-21746,    //  tg * (2<<16) + 0.5
+     23170, 23170, 23170, 23170     // cos * (2<<15) + 0.5
+};
+
+DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
+    65536,65536,
+     3597, 3597,
+     2260, 2260,
+     1203, 1203,
+        0,    0,
+      120,  120,
+      512,  512,
+      512,  512
+};
+
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmi)[32*4] = {
+     16384, 21407, 16384,  8867,    // w05 w04 w01 w00
+     16384,  8867,-16384,-21407,    // w07 w06 w03 w02
+     16384, -8867, 16384,-21407,    // w13 w12 w09 w08
+    -16384, 21407, 16384, -8867,    // w15 w14 w11 w10
+     22725, 19266, 19266, -4520,    // w21 w20 w17 w16
+     12873,  4520,-22725,-12873,    // w23 w22 w19 w18
+     12873,-22725,  4520,-12873,    // w29 w28 w25 w24
+      4520, 19266, 19266,-22725,    // w31 w30 w27 w26
+
+     22725, 29692, 22725, 12299,    // w05 w04 w01 w00
+     22725, 12299,-22725,-29692,    // w07 w06 w03 w02
+     22725,-12299, 22725,-29692,    // w13 w12 w09 w08
+    -22725, 29692, 22725,-12299,    // w15 w14 w11 w10
+     31521, 26722, 26722, -6270,    // w21 w20 w17 w16
+     17855,  6270,-31521,-17855,    // w23 w22 w19 w18
+     17855,-31521,  6270,-17855,    // w29 w28 w25 w24
+      6270, 26722, 26722,-31521,    // w31 w30 w27 w26
+
+     21407, 27969, 21407, 11585,    // w05 w04 w01 w00
+     21407, 11585,-21407,-27969,    // w07 w06 w03 w02
+     21407,-11585, 21407,-27969,    // w13 w12 w09 w08
+    -21407, 27969, 21407,-11585,    // w15 w14 w11 w10
+     29692, 25172, 25172, -5906,    // w21 w20 w17 w16
+     16819,  5906,-29692,-16819,    // w23 w22 w19 w18
+     16819,-29692,  5906,-16819,    // w29 w28 w25 w24
+      5906, 25172, 25172,-29692,    // w31 w30 w27 w26
+
+     19266, 25172, 19266, 10426,    // w05 w04 w01 w00
+     19266, 10426,-19266,-25172,    // w07 w06 w03 w02
+     19266,-10426, 19266,-25172,    // w13 w12 w09 w08
+    -19266, 25172, 19266,-10426,    // w15 w14 w11 w10
+     26722, 22654, 22654, -5315,    // w21 w20 w17 w16
+     15137,  5315,-26722,-15137,    // w23 w22 w19 w18
+     15137,-26722,  5315,-15137,    // w29 w28 w25 w24
+      5315, 22654, 22654,-26722,    // w31 w30 w27 w26
+};
+
+#define DCT_8_INV_ROW_MMI(A1,A2,A3,A4)                                      \
+    "dli $10, 0x88              \n\t"                                       \
+    "ldc1 $f4, "#A1"            \n\t" /* 0; x3 x2 x1 x0                   */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "ldc1 $f10, 8+"#A1"         \n\t" /* 1; x7 x6 x5 x4                   */\
+    "ldc1 $f6, "#A3"            \n\t" /* 3; w05 w04 w01 w00               */\
+    "pshufh $f0, $f4, $f16      \n\t" /* x2 x0 x2 x0                      */\
+    "ldc1 $f8, 8+"#A3"          \n\t" /* 4; w07 w06 w03 w02               */\
+    "ldc1 $f12, 32+"#A3"        \n\t" /* 6; w21 w20 w17 w16               */\
+    "pmaddhw $f6, $f6, $f0      \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00      */\
+    "dli $10, 0xdd              \n\t"                                       \
+    "pshufh $f2, $f10, $f16     \n\t" /* x6 x4 x6 x4                      */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "pmaddhw $f8, $f8, $f2      \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02      */\
+    "ldc1 $f14, 40+"#A3"        \n\t" /* 7; w23 w22 w19 w18               */\
+    "pshufh $f4, $f4, $f16      \n\t" /* x3 x1 x3 x1                      */\
+    "pmaddhw $f12, $f12, $f4    \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16      */\
+    "pshufh $f10, $f10, $f16    \n\t" /* x7 x5 x7 x5                      */\
+    "ldc1 $f18, "#A4"           \n\t"                                       \
+    "pmaddhw $f14, $f14, $f10   \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18      */\
+    "paddw $f6, $f6, $f18       \n\t" /* +%4                              */\
+    "ldc1 $f16, 16+"#A3"        \n\t"                                       \
+    "pmaddhw $f0, $f0, $f16     \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08      */\
+    "ldc1 $f16, 24+"#A3"        \n\t"                                       \
+    "paddw $f6, $f6, $f8        \n\t" /* 4; a1=sum(even1) a0=sum(even0)   */\
+    "pmaddhw $f2, $f2, $f16     \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10      */\
+    "ldc1 $f16, 48+"#A3"        \n\t"                                       \
+    "pmaddhw $f4, $f4, $f16     \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24      */\
+    "ldc1 $f16, 56+"#A3"        \n\t"                                       \
+    "paddw $f12, $f12, $f14     \n\t" /* 7; b1=sum(odd1) b0=sum(odd0)     */\
+    "dli $10, 11                \n\t"                                       \
+    "pmaddhw $f10, $f10, $f16   \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26      */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "psubw $f8, $f6, $f12       \n\t" /* 6; a1-b1 a0-b0                   */\
+    "paddw $f6, $f6, $f12       \n\t" /* a1+b1 a0+b0                      */\
+    "paddw $f0, $f0, $f18       \n\t" /* +%4                              */\
+    "psraw $f6, $f6, $f16       \n\t" /* y1=a1+b1 y0=a0+b0                */\
+    "paddw $f0, $f0, $f2        \n\t" /* 1; a3=sum(even3) a2=sum(even2)   */\
+    "paddw $f4, $f4, $f10       \n\t" /* 5; b3=sum(odd3) b2=sum(odd2)     */\
+    "psraw $f8, $f8, $f16       \n\t" /* y6=a1-b1 y7=a0-b0                */\
+    "psubw $f14, $f0, $f4       \n\t" /* 2; a3-b3 a2-b2                   */\
+    "paddw $f0, $f0, $f4        \n\t" /* a3+b3 a2+b2                      */\
+    "psraw $f0, $f0, $f16       \n\t" /* y3=a3+b3 y2=a2+b2                */\
+    "psraw $f14, $f14, $f16     \n\t" /* y4=a3-b3 y5=a2-b2                */\
+    "dli $10, 0xb1              \n\t"                                       \
+    "packsswh $f6, $f6, $f0     \n\t" /* 0; y3 y2 y1 y0                   */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "packsswh $f14, $f14, $f8   \n\t" /* 4; y6 y7 y4 y5                   */\
+    "sdc1 $f6, "#A2"            \n\t" /* 3; save y3 y2 y1 y0              */\
+    "pshufh $f14, $f14, $f16    \n\t" /* y7 y6 y5 y4                      */\
+    "sdc1 $f14, 8+"#A2"         \n\t" /* 7; save y7 y6 y5 y4              */\
+
+
+#define DCT_8_INV_COL(A1,A2)                                                \
+    "ldc1 $f2, 2*8(%3)          \n\t"                                       \
+    "ldc1 $f6, 16*3+"#A1"       \n\t"                                       \
+    "ldc1 $f10, 16*5+"#A1"      \n\t"                                       \
+    "pmulhh $f0, $f2, $f6       \n\t" /* x3*(tg_3_16-1)                   */\
+    "ldc1 $f4, 0(%3)            \n\t"                                       \
+    "pmulhh $f2, $f2, $f10      \n\t" /* x5*(tg_3_16-1)                   */\
+    "ldc1 $f14, 16*7+"#A1"      \n\t"                                       \
+    "ldc1 $f12, 16*1+"#A1"      \n\t"                                       \
+    "pmulhh $f8, $f4, $f14      \n\t" /* x7*tg_1_16                       */\
+    "paddsh $f0, $f0, $f6       \n\t" /* x3*tg_3_16                       */\
+    "pmulhh $f4, $f4, $f12      \n\t" /* x1*tg_1_16                       */\
+    "paddsh $f2, $f2, $f6       \n\t" /* x3+x5*(tg_3_16-1)                */\
+    "psubsh $f0, $f0, $f10      \n\t" /* x3*tg_3_16-x5 = tm35             */\
+    "ldc1 $f6, 3*8(%3)          \n\t"                                       \
+    "paddsh $f2, $f2, $f10      \n\t" /* x3+x5*tg_3_16 = tp35             */\
+    "paddsh $f8, $f8, $f12      \n\t" /* x1+tg_1_16*x7 = tp17             */\
+    "psubsh $f4, $f4, $f14      \n\t" /* x1*tg_1_16-x7 = tm17             */\
+    "paddsh $f10, $f8, $f2      \n\t" /* tp17+tp35 = b0                   */\
+    "psubsh $f12, $f4, $f0      \n\t" /* tm17-tm35 = b3                   */\
+    "psubsh $f8, $f8, $f2       \n\t" /* tp17-tp35 = t1                   */\
+    "paddsh $f4, $f4, $f0       \n\t" /* tm17+tm35 = t2                   */\
+    "ldc1 $f14, 1*8(%3)         \n\t"                                       \
+    "sdc1 $f10, 3*16+"#A2"      \n\t" /* save b0                          */\
+    "paddsh $f2, $f8, $f4       \n\t" /* t1+t2                            */\
+    "sdc1 $f12, 5*16+"#A2"      \n\t" /* save b3                          */\
+    "psubsh $f8, $f8, $f4       \n\t" /* t1-t2                            */\
+    "ldc1 $f10, 2*16+"#A1"      \n\t"                                       \
+    "ldc1 $f12, 6*16+"#A1"      \n\t"                                       \
+    "pmulhh $f0, $f14, $f10     \n\t" /* x2*tg_2_16                       */\
+    "pmulhh $f14, $f14, $f12    \n\t" /* x6*tg_2_16                       */\
+    "pmulhh $f2, $f2, $f6       \n\t" /* ocos_4_16*(t1+t2) = b1/2         */\
+    "ldc1 $f4, 0*16+"#A1"       \n\t"                                       \
+    "pmulhh $f8, $f8, $f6       \n\t" /* ocos_4_16*(t1-t2) = b2/2         */\
+    "psubsh $f0, $f0, $f12      \n\t" /* t2*tg_2_16-x6 = tm26             */\
+    "ldc1 $f12, 4*16+"#A1"      \n\t"                                       \
+    "paddsh $f14, $f14, $f10    \n\t" /* x2+x6*tg_2_16 = tp26             */\
+    "psubsh $f6, $f4, $f12      \n\t" /* x0-x4 = tm04                     */\
+    "paddsh $f4, $f4, $f12      \n\t" /* x0+x4 = tp04                     */\
+    "paddsh $f10, $f4, $f14     \n\t" /* tp04+tp26 = a0                   */\
+    "psubsh $f12, $f6, $f0      \n\t" /* tm04-tm26 = a2                   */\
+    "psubsh $f4, $f4, $f14      \n\t" /* tp04-tp26 = a3                   */\
+    "paddsh $f6, $f6, $f0       \n\t" /* tm04+tm26 = a1                   */\
+    "paddsh $f2, $f2, $f2       \n\t" /* b1                               */\
+    "paddsh $f8, $f8, $f8       \n\t" /* b2                               */\
+    "psubsh $f14, $f6, $f2      \n\t" /* a1-b1                            */\
+    "dli $10, 6                 \n\t"                                       \
+    "paddsh $f6, $f6, $f2       \n\t" /* a1+b1                            */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "psubsh $f0, $f12, $f8      \n\t" /* a2-b2                            */\
+    "paddsh $f12, $f12, $f8     \n\t" /* a2+b2                            */\
+    "psrah $f6, $f6, $f16       \n\t" /* dst1                             */\
+    "psrah $f12, $f12, $f16     \n\t" /* dst2                             */\
+    "ldc1 $f2, 3*16+"#A2"       \n\t" /* load b0                          */\
+    "psrah $f14, $f14, $f16     \n\t" /* dst6                             */\
+    "psrah $f0, $f0, $f16       \n\t" /* dst5                             */\
+    "sdc1 $f6, 1*16+"#A2"       \n\t"                                       \
+    "psubsh $f8, $f10, $f2      \n\t" /* a0-b0                            */\
+    "paddsh $f10, $f10, $f2     \n\t" /* a0+b0                            */\
+    "sdc1 $f12, 2*16+"#A2"      \n\t"                                       \
+    "ldc1 $f6, 5*16+"#A2"       \n\t" /* load b3                          */\
+    "psrah $f10, $f10, $f16     \n\t" /* dst0                             */\
+    "psrah $f8, $f8, $f16       \n\t" /* dst7                             */\
+    "sdc1 $f0, 5*16+"#A2"       \n\t"                                       \
+    "psubsh $f12, $f4, $f6      \n\t" /* a3-b3                            */\
+    "paddsh $f4, $f4, $f6       \n\t" /* a3+b3                            */\
+    "sdc1 $f14, 6*16+"#A2"      \n\t"                                       \
+    "sdc1 $f10, 0*16+"#A2"      \n\t"                                       \
+    "psrah $f4, $f4, $f16       \n\t" /* dst3                             */\
+    "sdc1 $f8, 7*16+"#A2"       \n\t"                                       \
+    "psrah $f12, $f12, $f16     \n\t" /* dst4                             */\
+    "sdc1 $f4, 3*16+"#A2"       \n\t"                                       \
+    "sdc1 $f12, 4*16+"#A2"      \n\t"                                       \
+
+
+void ff_xvid_idct_mmi(int16_t *block)
+{
+    __asm__ volatile (
+        //# Process each row
+        DCT_8_INV_ROW_MMI(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+        DCT_8_INV_ROW_MMI(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+        DCT_8_INV_ROW_MMI(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+        DCT_8_INV_ROW_MMI(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+        DCT_8_INV_ROW_MMI(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+        DCT_8_INV_ROW_MMI(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+        DCT_8_INV_ROW_MMI(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+        DCT_8_INV_ROW_MMI(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+        //# Process the columns (4 at a time)
+        DCT_8_INV_COL(0(%0), 0(%0))
+        DCT_8_INV_COL(8(%0), 8(%0))
+        ::"r"(block),"r"(rounder_0),"r"(tab_i_04_mmi),"r"(tg_1_16)
+        : "$10"
+    );
+}
+
+void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
+{
+    ff_xvid_idct_mmi(block);
+    ff_put_pixels_clamped_mmi(block, dest, line_size);
+}
+
+void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
+{
+    ff_xvid_idct_mmi(block);
+    ff_add_pixels_clamped_mmi(block, dest, line_size);
+}
diff --git a/libavcodec/mips/xvididct_init_mips.c b/libavcodec/mips/xvididct_init_mips.c
new file mode 100644
index 0000000..c1d82cc
--- /dev/null
+++ b/libavcodec/mips/xvididct_init_mips.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xvididct_mips.h"
+
+#if HAVE_MMI
+static av_cold void xvid_idct_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+    if (!high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_XVID) {
+            c->idct_put = ff_xvid_idct_put_mmi;
+            c->idct_add = ff_xvid_idct_add_mmi;
+            c->idct = ff_xvid_idct_mmi;
+            c->perm_type = FF_IDCT_PERM_NONE;
+        }
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+#if HAVE_MMI
+    xvid_idct_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/xvididct_mips.h b/libavcodec/mips/xvididct_mips.h
new file mode 100644
index 0000000..0768aaa
--- /dev/null
+++ b/libavcodec/mips/xvididct_mips.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_XVIDIDCT_MIPS_H
+#define AVCODEC_MIPS_XVIDIDCT_MIPS_H
+
+#include "libavcodec/xvididct.h"
+
+void ff_xvid_idct_mmi(int16_t *block);
+void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+
+#endif /* AVCODEC_MIPS_XVIDIDCT_MIPS_H */
diff --git a/libavcodec/mjpeg.h b/libavcodec/mjpeg.h
index 1ebe283..cd5d0af 100644
--- a/libavcodec/mjpeg.h
+++ b/libavcodec/mjpeg.h
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -117,6 +117,7 @@ enum JpegMarker {
 
 #define PREDICT(ret, topleft, top, left, predictor)\
     switch(predictor){\
+        case 0: ret= 0; break;\
         case 1: ret= left; break;\
         case 2: ret= top; break;\
         case 3: ret= topleft; break;\
diff --git a/libavcodec/mjpeg2jpeg_bsf.c b/libavcodec/mjpeg2jpeg_bsf.c
index eec3469..6f02bc0 100644
--- a/libavcodec/mjpeg2jpeg_bsf.c
+++ b/libavcodec/mjpeg2jpeg_bsf.c
@@ -2,20 +2,20 @@
  * MJPEG/AVI1 to JPEG/JFIF bitstream format filter
  * Copyright (c) 2010 Adrian Daerr and Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,12 @@
 
 #include "libavutil/error.h"
 #include "libavutil/mem.h"
+#include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
 #include "bsf.h"
 #include "jpegtables.h"
+#include "mjpeg.h"
 
 static const uint8_t jpeg_header[] = {
     0xff, 0xd8,                     // SOI
@@ -84,19 +86,24 @@ static int mjpeg2jpeg_filter(AVBSFContext *ctx, AVPacket *out)
     uint8_t *output;
 
     ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
 
     if (in->size < 12) {
         av_log(ctx, AV_LOG_ERROR, "input is truncated\n");
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
-    if (memcmp("AVI1", in->data + 6, 4)) {
-        av_log(ctx, AV_LOG_ERROR, "input is not MJPEG/AVI1\n");
+    if (AV_RB16(in->data) != 0xffd8) {
+        av_log(ctx, AV_LOG_ERROR, "input is not MJPEG\n");
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
-
-    input_skip = (in->data[4] << 8) + in->data[5] + 4;
+    if (in->data[2] == 0xff && in->data[3] == APP0) {
+        input_skip = (in->data[4] << 8) + in->data[5] + 4;
+    } else {
+        input_skip = 2;
+    }
     if (in->size < input_skip) {
         av_log(ctx, AV_LOG_ERROR, "input is truncated\n");
         ret = AVERROR_INVALIDDATA;
@@ -125,7 +132,12 @@ fail:
     return ret;
 }
 
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_MJPEG, AV_CODEC_ID_NONE,
+};
+
 const AVBitStreamFilter ff_mjpeg2jpeg_bsf = {
     .name           = "mjpeg2jpeg",
     .filter         = mjpeg2jpeg_filter,
+    .codec_ids      = codec_ids,
 };
diff --git a/libavcodec/mjpeg_parser.c b/libavcodec/mjpeg_parser.c
index ab65461..07a6b2b 100644
--- a/libavcodec/mjpeg_parser.c
+++ b/libavcodec/mjpeg_parser.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Alex Beregszaszi
  * Copyright (c) 2003-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,27 +28,44 @@
 
 #include "parser.h"
 
+typedef struct MJPEGParserContext{
+    ParseContext pc;
+    int size;
+}MJPEGParserContext;
 
 /**
  * Find the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
  */
-static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
+static int find_frame_end(MJPEGParserContext *m, const uint8_t *buf, int buf_size){
+    ParseContext *pc= &m->pc;
     int vop_found, i;
-    uint16_t state;
+    uint32_t state;
 
     vop_found= pc->frame_start_found;
     state= pc->state;
 
     i=0;
     if(!vop_found){
-        for(i=0; i<buf_size; i++){
+        for(i=0; i<buf_size;){
             state= (state<<8) | buf[i];
-            if(state == 0xFFD8){
-                i++;
-                vop_found=1;
-                break;
+            if(state>=0xFFC00000 && state<=0xFFFEFFFF){
+                if(state>=0xFFD80000 && state<=0xFFD8FFFF){
+                    i++;
+                    vop_found=1;
+                    break;
+                }else if(state<0xFFD00000 || state>0xFFD9FFFF){
+                    m->size= (state&0xFFFF)-1;
+                }
             }
+            if(m->size>0){
+                int size= FFMIN(buf_size-i, m->size);
+                i+=size;
+                m->size-=size;
+                state=0;
+                continue;
+            }else
+                i++;
         }
     }
 
@@ -56,13 +73,25 @@ static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
         /* EOF considered as end of frame */
         if (buf_size == 0)
             return 0;
-        for(; i<buf_size; i++){
+        for(; i<buf_size;){
             state= (state<<8) | buf[i];
-            if(state == 0xFFD8){
-                pc->frame_start_found=0;
-                pc->state=0;
-                return i-1;
+            if(state>=0xFFC00000 && state<=0xFFFEFFFF){
+                if(state>=0xFFD80000 && state<=0xFFD8FFFF){
+                    pc->frame_start_found=0;
+                    pc->state=0;
+                    return i-3;
+                } else if(state<0xFFD00000 || state>0xFFD9FFFF){
+                    m->size= (state&0xFFFF)-1;
+                }
             }
+            if(m->size>0){
+                int size= FFMIN(buf_size-i, m->size);
+                i+=size;
+                m->size-=size;
+                state=0;
+                continue;
+            }else
+                i++;
         }
     }
     pc->frame_start_found= vop_found;
@@ -75,13 +104,14 @@ static int jpeg_parse(AVCodecParserContext *s,
                       const uint8_t **poutbuf, int *poutbuf_size,
                       const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    MJPEGParserContext *m = s->priv_data;
+    ParseContext *pc = &m->pc;
     int next;
 
     if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
         next= buf_size;
     }else{
-        next= find_frame_end(pc, buf, buf_size);
+        next= find_frame_end(m, buf, buf_size);
 
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf = NULL;
@@ -97,8 +127,8 @@ static int jpeg_parse(AVCodecParserContext *s,
 
 
 AVCodecParser ff_mjpeg_parser = {
-    .codec_ids      = { AV_CODEC_ID_MJPEG },
-    .priv_data_size = sizeof(ParseContext),
+    .codec_ids      = { AV_CODEC_ID_MJPEG, AV_CODEC_ID_JPEGLS },
+    .priv_data_size = sizeof(MJPEGParserContext),
     .parser_parse   = jpeg_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/mjpega_dump_header_bsf.c b/libavcodec/mjpega_dump_header_bsf.c
index b3ce26a..ca5fb3a 100644
--- a/libavcodec/mjpega_dump_header_bsf.c
+++ b/libavcodec/mjpega_dump_header_bsf.c
@@ -2,20 +2,20 @@
  * MJPEG A dump header bitstream filter
  * Copyright (c) 2006 Baptiste Coudurier
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mjpegbdec.c b/libavcodec/mjpegbdec.c
index 3775aa3..37d7bb8 100644
--- a/libavcodec/mjpegbdec.c
+++ b/libavcodec/mjpegbdec.c
@@ -2,20 +2,20 @@
  * Apple MJPEG-B decoder
  * Copyright (c) 2002 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,6 +55,7 @@ static int mjpegb_decode_frame(AVCodecContext *avctx,
 
     buf_ptr = buf;
     buf_end = buf + buf_size;
+    s->got_picture = 0;
 
 read_header:
     /* reset on every SOI */
@@ -69,8 +70,7 @@ read_header:
 
     skip_bits(&hgb, 32); /* reserved zeros */
 
-    if (get_bits_long(&hgb, 32) != MKBETAG('m','j','p','g'))
-    {
+    if (get_bits_long(&hgb, 32) != MKBETAG('m','j','p','g')) {
         av_log(avctx, AV_LOG_WARNING, "not mjpeg-b (bad fourcc)\n");
         return AVERROR_INVALIDDATA;
     }
@@ -84,19 +84,17 @@ read_header:
 
     dqt_offs = read_offs(avctx, &hgb, buf_end - buf_ptr, "dqt is %d and size is %d\n");
     av_log(avctx, AV_LOG_DEBUG, "dqt offs: 0x%"PRIx32"\n", dqt_offs);
-    if (dqt_offs)
-    {
+    if (dqt_offs) {
         init_get_bits(&s->gb, buf_ptr+dqt_offs, (buf_end - (buf_ptr+dqt_offs))*8);
         s->start_code = DQT;
-        if (ff_mjpeg_decode_dqt(s) < 0 &&
-            (avctx->err_recognition & AV_EF_EXPLODE))
-          return AVERROR_INVALIDDATA;
+        ret = ff_mjpeg_decode_dqt(s);
+        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+            return ret;
     }
 
     dht_offs = read_offs(avctx, &hgb, buf_end - buf_ptr, "dht is %d and size is %d\n");
     av_log(avctx, AV_LOG_DEBUG, "dht offs: 0x%"PRIx32"\n", dht_offs);
-    if (dht_offs)
-    {
+    if (dht_offs) {
         init_get_bits(&s->gb, buf_ptr+dht_offs, (buf_end - (buf_ptr+dht_offs))*8);
         s->start_code = DHT;
         ff_mjpeg_decode_dht(s);
@@ -104,42 +102,43 @@ read_header:
 
     sof_offs = read_offs(avctx, &hgb, buf_end - buf_ptr, "sof is %d and size is %d\n");
     av_log(avctx, AV_LOG_DEBUG, "sof offs: 0x%"PRIx32"\n", sof_offs);
-    if (sof_offs)
-    {
+    if (sof_offs) {
         init_get_bits(&s->gb, buf_ptr+sof_offs, (buf_end - (buf_ptr+sof_offs))*8);
         s->start_code = SOF0;
-        if (ff_mjpeg_decode_sof(s) < 0)
-            return -1;
+        if ((ret = ff_mjpeg_decode_sof(s)) < 0)
+            return ret;
     }
 
     sos_offs = read_offs(avctx, &hgb, buf_end - buf_ptr, "sos is %d and size is %d\n");
     av_log(avctx, AV_LOG_DEBUG, "sos offs: 0x%"PRIx32"\n", sos_offs);
     sod_offs = read_offs(avctx, &hgb, buf_end - buf_ptr, "sof is %d and size is %d\n");
     av_log(avctx, AV_LOG_DEBUG, "sod offs: 0x%"PRIx32"\n", sod_offs);
-    if (sos_offs)
-    {
+    if (sos_offs) {
         init_get_bits(&s->gb, buf_ptr + sos_offs,
                       8 * FFMIN(field_size, buf_end - buf_ptr - sos_offs));
         s->mjpb_skiptosod = (sod_offs - sos_offs - show_bits(&s->gb, 16));
         s->start_code = SOS;
-        if (ff_mjpeg_decode_sos(s, NULL, NULL) < 0 &&
-            (avctx->err_recognition & AV_EF_EXPLODE))
-          return AVERROR_INVALIDDATA;
+        ret = ff_mjpeg_decode_sos(s, NULL, 0, NULL);
+        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+            return ret;
     }
 
     if (s->interlaced) {
         s->bottom_field ^= 1;
         /* if not bottom field, do not output image yet */
-        if (s->bottom_field != s->interlace_polarity && second_field_offs)
-        {
+        if (s->bottom_field != s->interlace_polarity && second_field_offs) {
             buf_ptr = buf + second_field_offs;
-            second_field_offs = 0;
             goto read_header;
-            }
+        }
     }
 
     //XXX FIXME factorize, this looks very similar to the EOI code
 
+    if(!s->got_picture) {
+        av_log(avctx, AV_LOG_WARNING, "no picture\n");
+        return buf_size;
+    }
+
     if ((ret = av_frame_ref(data, s->picture_ptr)) < 0)
         return ret;
     *got_frame = 1;
@@ -162,5 +161,6 @@ AVCodec ff_mjpegb_decoder = {
     .close          = ff_mjpeg_decode_end,
     .decode         = mjpegb_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 296ca59..e82c185 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,19 +30,24 @@
  * MJPEG decoder.
  */
 
-#include <assert.h>
-
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "copy_block.h"
+#include "hwaccel.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "jpegtables.h"
 #include "mjpeg.h"
 #include "mjpegdec.h"
 #include "jpeglsdec.h"
+#include "profiles.h"
 #include "put_bits.h"
+#include "tiff.h"
+#include "exif.h"
+#include "bytestream.h"
 
 
 static int build_vlc(VLC *vlc, const uint8_t *bits_table,
@@ -54,7 +59,7 @@ static int build_vlc(VLC *vlc, const uint8_t *bits_table,
     uint16_t huff_sym[256];
     int i;
 
-    assert(nb_codes <= 256);
+    av_assert0(nb_codes <= 256);
 
     ff_mjpeg_build_huffman_codes(huff_size, huff_code, bits_table, val_table);
 
@@ -68,36 +73,67 @@ static int build_vlc(VLC *vlc, const uint8_t *bits_table,
                               huff_code, 2, 2, huff_sym, 2, 2, use_static);
 }
 
-static int build_basic_mjpeg_vlc(MJpegDecodeContext *s)
+static int init_default_huffman_tables(MJpegDecodeContext *s)
 {
-    int ret;
-
-    if ((ret = build_vlc(&s->vlcs[0][0], avpriv_mjpeg_bits_dc_luminance,
-                         avpriv_mjpeg_val_dc, 12, 0, 0)) < 0)
-        return ret;
-
-    if ((ret = build_vlc(&s->vlcs[0][1], avpriv_mjpeg_bits_dc_chrominance,
-                         avpriv_mjpeg_val_dc, 12, 0, 0)) < 0)
-        return ret;
-
-    if ((ret = build_vlc(&s->vlcs[1][0], avpriv_mjpeg_bits_ac_luminance,
-                         avpriv_mjpeg_val_ac_luminance, 251, 0, 1)) < 0)
-        return ret;
+    static const struct {
+        int class;
+        int index;
+        const uint8_t *bits;
+        const uint8_t *values;
+        int codes;
+        int length;
+    } ht[] = {
+        { 0, 0, avpriv_mjpeg_bits_dc_luminance,
+                avpriv_mjpeg_val_dc, 12, 12 },
+        { 0, 1, avpriv_mjpeg_bits_dc_chrominance,
+                avpriv_mjpeg_val_dc, 12, 12 },
+        { 1, 0, avpriv_mjpeg_bits_ac_luminance,
+                avpriv_mjpeg_val_ac_luminance,   251, 162 },
+        { 1, 1, avpriv_mjpeg_bits_ac_chrominance,
+                avpriv_mjpeg_val_ac_chrominance, 251, 162 },
+        { 2, 0, avpriv_mjpeg_bits_ac_luminance,
+                avpriv_mjpeg_val_ac_luminance,   251, 162 },
+        { 2, 1, avpriv_mjpeg_bits_ac_chrominance,
+                avpriv_mjpeg_val_ac_chrominance, 251, 162 },
+    };
+    int i, ret;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(ht); i++) {
+        ret = build_vlc(&s->vlcs[ht[i].class][ht[i].index],
+                        ht[i].bits, ht[i].values, ht[i].codes,
+                        0, ht[i].class == 1);
+        if (ret < 0)
+            return ret;
 
-    if ((ret = build_vlc(&s->vlcs[1][1], avpriv_mjpeg_bits_ac_chrominance,
-                         avpriv_mjpeg_val_ac_chrominance, 251, 0, 1)) < 0)
-        return ret;
+        if (ht[i].class < 2) {
+            memcpy(s->raw_huffman_lengths[ht[i].class][ht[i].index],
+                   ht[i].bits + 1, 16);
+            memcpy(s->raw_huffman_values[ht[i].class][ht[i].index],
+                   ht[i].values, ht[i].length);
+        }
+    }
 
-    if ((ret = build_vlc(&s->vlcs[2][0], avpriv_mjpeg_bits_ac_luminance,
-                         avpriv_mjpeg_val_ac_luminance, 251, 0, 0)) < 0)
-        return ret;
+    return 0;
+}
 
-    if ((ret = build_vlc(&s->vlcs[2][1], avpriv_mjpeg_bits_ac_chrominance,
-                         avpriv_mjpeg_val_ac_chrominance, 251, 0, 0)) < 0)
-        return ret;
+static void parse_avid(MJpegDecodeContext *s, uint8_t *buf, int len)
+{
+    s->buggy_avid = 1;
+    if (len > 14 && buf[12] == 1) /* 1 - NTSC */
+        s->interlace_polarity = 1;
+    if (len > 14 && buf[12] == 2) /* 2 - PAL */
+        s->interlace_polarity = 0;
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(s->avctx, AV_LOG_INFO, "AVID: len:%d %d\n", len, len > 14 ? buf[12] : -1);
+}
 
+static void init_idct(AVCodecContext *avctx)
+{
+    MJpegDecodeContext *s = avctx->priv_data;
 
-    return 0;
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
+                      ff_zigzag_direct);
 }
 
 av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
@@ -113,36 +149,46 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
     }
 
     s->avctx = avctx;
-    ff_blockdsp_init(&s->bdsp);
+    ff_blockdsp_init(&s->bdsp, avctx);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
-    ff_idctdsp_init(&s->idsp, avctx);
-    ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
-                      ff_zigzag_direct);
+    init_idct(avctx);
     s->buffer_size   = 0;
     s->buffer        = NULL;
     s->start_code    = -1;
     s->first_picture = 1;
+    s->got_picture   = 0;
     s->org_height    = avctx->coded_height;
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     avctx->colorspace = AVCOL_SPC_BT470BG;
+    s->hwaccel_pix_fmt = s->hwaccel_sw_pix_fmt = AV_PIX_FMT_NONE;
 
-    if ((ret = build_basic_mjpeg_vlc(s)) < 0)
+    if ((ret = init_default_huffman_tables(s)) < 0)
         return ret;
 
     if (s->extern_huff) {
-        av_log(avctx, AV_LOG_INFO, "mjpeg: using external huffman table\n");
+        av_log(avctx, AV_LOG_INFO, "using external huffman table\n");
         if ((ret = init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size * 8)) < 0)
             return ret;
-        if ((ret = ff_mjpeg_decode_dht(s))) {
+        if (ff_mjpeg_decode_dht(s)) {
             av_log(avctx, AV_LOG_ERROR,
-                   "mjpeg: error using external huffman table\n");
-            return ret;
+                   "error using external huffman table, switching back to internal\n");
+            init_default_huffman_tables(s);
         }
     }
     if (avctx->field_order == AV_FIELD_BB) { /* quicktime icefloe 019 */
         s->interlace_polarity = 1;           /* bottom field first */
-        av_log(avctx, AV_LOG_DEBUG, "mjpeg bottom field first\n");
+        av_log(avctx, AV_LOG_DEBUG, "bottom field first\n");
+    } else if (avctx->field_order == AV_FIELD_UNKNOWN) {
+        if (avctx->codec_tag == AV_RL32("MJPG"))
+            s->interlace_polarity = 1;
+    }
+
+    if (   avctx->extradata_size > 8
+        && AV_RL32(avctx->extradata) == 0x2C
+        && AV_RL32(avctx->extradata+4) == 0x18) {
+        parse_avid(s, avctx->extradata, avctx->extradata_size);
     }
+
     if (avctx->codec->id == AV_CODEC_ID_AMV)
         s->flipped = 1;
 
@@ -153,15 +199,20 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
 /* quantize tables */
 int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
 {
-    int len, index, i, j;
+    int len, index, i;
 
     len = get_bits(&s->gb, 16) - 2;
 
+    if (8*len > get_bits_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "dqt: len %d is too large\n", len);
+        return AVERROR_INVALIDDATA;
+    }
+
     while (len >= 65) {
-        /* only 8-bit precision handled */
-        if (get_bits(&s->gb, 4) != 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "dqt: 16-bit precision\n");
-            return -1;
+        int pr = get_bits(&s->gb, 4);
+        if (pr > 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "dqt: invalid precision\n");
+            return AVERROR_INVALIDDATA;
         }
         index = get_bits(&s->gb, 4);
         if (index >= 4)
@@ -169,16 +220,19 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
         av_log(s->avctx, AV_LOG_DEBUG, "index=%d\n", index);
         /* read quant table */
         for (i = 0; i < 64; i++) {
-            j = s->scantable.permutated[i];
-            s->quant_matrixes[index][j] = get_bits(&s->gb, 8);
+            s->quant_matrixes[index][i] = get_bits(&s->gb, pr ? 16 : 8);
+            if (s->quant_matrixes[index][i] == 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "dqt: 0 quant value\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
 
         // XXX FIXME fine-tune, and perhaps add dc too
-        s->qscale[index] = FFMAX(s->quant_matrixes[index][s->scantable.permutated[1]],
-                                 s->quant_matrixes[index][s->scantable.permutated[8]]) >> 1;
+        s->qscale[index] = FFMAX(s->quant_matrixes[index][1],
+                                 s->quant_matrixes[index][8]) >> 1;
         av_log(s->avctx, AV_LOG_DEBUG, "qscale[%d]: %d\n",
                index, s->qscale[index]);
-        len -= 65;
+        len -= 1 + 64 * (1+pr);
     }
     return 0;
 }
@@ -193,6 +247,11 @@ int ff_mjpeg_decode_dht(MJpegDecodeContext *s)
 
     len = get_bits(&s->gb, 16) - 2;
 
+    if (8*len > get_bits_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "dht: len %d is too large\n", len);
+        return AVERROR_INVALIDDATA;
+    }
+
     while (len > 0) {
         if (len < 17)
             return AVERROR_INVALIDDATA;
@@ -234,27 +293,46 @@ int ff_mjpeg_decode_dht(MJpegDecodeContext *s)
                                  code_max + 1, 0, 0)) < 0)
                 return ret;
         }
+
+        for (i = 0; i < 16; i++)
+            s->raw_huffman_lengths[class][index][i] = bits_table[i + 1];
+        for (i = 0; i < 256; i++)
+            s->raw_huffman_values[class][index][i] = val_table[i];
     }
     return 0;
 }
 
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
 {
+    int len, nb_components, i, width, height, bits, ret, size_change;
+    unsigned pix_fmt_id;
     int h_count[MAX_COMPONENTS] = { 0 };
     int v_count[MAX_COMPONENTS] = { 0 };
-    int len, nb_components, i, width, height, bits, pix_fmt_id, ret;
 
-    /* XXX: verify len field validity */
+    s->cur_scan = 0;
+    memset(s->upscale_h, 0, sizeof(s->upscale_h));
+    memset(s->upscale_v, 0, sizeof(s->upscale_v));
+
     len     = get_bits(&s->gb, 16);
     bits    = get_bits(&s->gb, 8);
 
+    if (bits > 16 || bits < 1) {
+        av_log(s->avctx, AV_LOG_ERROR, "bits %d is invalid\n", bits);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->avctx->bits_per_raw_sample != bits) {
+        av_log(s->avctx, s->avctx->bits_per_raw_sample > 0 ? AV_LOG_INFO : AV_LOG_DEBUG, "Changing bps from %d to %d\n", s->avctx->bits_per_raw_sample, bits);
+        s->avctx->bits_per_raw_sample = bits;
+        init_idct(s->avctx);
+    }
     if (s->pegasus_rct)
         bits = 9;
     if (bits == 9 && !s->pegasus_rct)
         s->rct  = 1;    // FIXME ugly
 
-    if (bits != 8 && !s->lossless) {
-        av_log(s->avctx, AV_LOG_ERROR, "only 8 bits/component accepted\n");
+    if(s->lossless && s->avctx->lowres){
+        av_log(s->avctx, AV_LOG_ERROR, "lowres is not possible with lossless jpeg\n");
         return -1;
     }
 
@@ -268,6 +346,8 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
     av_log(s->avctx, AV_LOG_DEBUG, "sof0: picture: %dx%d\n", width, height);
     if (av_image_check_size(width, height, 0, s->avctx) < 0)
         return AVERROR_INVALIDDATA;
+    if (s->buf_size && (width + 7) / 8 * ((height + 7) / 8) > s->buf_size * 4LL)
+        return AVERROR_INVALIDDATA;
 
     nb_components = get_bits(&s->gb, 8);
     if (nb_components <= 0 ||
@@ -286,6 +366,11 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
                                       "bits/component or 16-bit gray");
         return AVERROR_PATCHWELCOME;
     }
+    if (len != 8 + 3 * nb_components) {
+        av_log(s->avctx, AV_LOG_ERROR, "decode_sof0: error, len(%d) mismatch %d components\n", len, nb_components);
+        return AVERROR_INVALIDDATA;
+    }
+
     s->nb_components = nb_components;
     s->h_max         = 1;
     s->v_max         = 1;
@@ -300,8 +385,10 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         if (v_count[i] > s->v_max)
             s->v_max = v_count[i];
         s->quant_index[i] = get_bits(&s->gb, 8);
-        if (s->quant_index[i] >= 4)
+        if (s->quant_index[i] >= 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "quant_index is invalid\n");
             return AVERROR_INVALIDDATA;
+        }
         if (!h_count[i] || !v_count[i]) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Invalid sampling factor in component %d %d:%d\n",
@@ -313,28 +400,36 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
                i, h_count[i], v_count[i],
                s->component_id[i], s->quant_index[i]);
     }
+    if (   nb_components == 4
+        && s->component_id[0] == 'C' - 1
+        && s->component_id[1] == 'M' - 1
+        && s->component_id[2] == 'Y' - 1
+        && s->component_id[3] == 'K' - 1)
+        s->adobe_transform = 0;
 
     if (s->ls && (s->h_max > 1 || s->v_max > 1)) {
         avpriv_report_missing_feature(s->avctx, "Subsampling in JPEG-LS");
         return AVERROR_PATCHWELCOME;
     }
 
-    if (s->v_max == 1 && s->h_max == 1 && s->lossless == 1)
-        s->rgb = 1;
 
     /* if different size, realloc/alloc picture */
     if (width != s->width || height != s->height || bits != s->bits ||
         memcmp(s->h_count, h_count, sizeof(h_count))                ||
         memcmp(s->v_count, v_count, sizeof(v_count))) {
+        size_change = 1;
+
         s->width      = width;
         s->height     = height;
         s->bits       = bits;
         memcpy(s->h_count, h_count, sizeof(h_count));
         memcpy(s->v_count, v_count, sizeof(v_count));
         s->interlaced = 0;
+        s->got_picture = 0;
 
         /* test interlaced mode */
         if (s->first_picture   &&
+            (s->multiscope != 2 || s->avctx->time_base.den >= 25 * s->avctx->time_base.num) &&
             s->org_height != 0 &&
             s->height < ((s->org_height * 3) / 4)) {
             s->interlaced                    = 1;
@@ -349,11 +444,22 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
             return ret;
 
         s->first_picture = 0;
+    } else {
+        size_change = 0;
     }
 
-    if (!(s->interlaced && (s->bottom_field == !s->interlace_polarity))) {
+    if (s->got_picture && s->interlaced && (s->bottom_field == !s->interlace_polarity)) {
+        if (s->progressive) {
+            avpriv_request_sample(s->avctx, "progressively coded interlaced picture");
+            return AVERROR_INVALIDDATA;
+        }
+    } else{
+        if (s->v_max == 1 && s->h_max == 1 && s->lossless==1 && (nb_components==3 || nb_components==4))
+            s->rgb = 1;
+        else if (!s->lossless)
+            s->rgb = 0;
     /* XXX: not complete test ! */
-    pix_fmt_id = (s->h_count[0] << 28) | (s->v_count[0] << 24) |
+    pix_fmt_id = ((unsigned)s->h_count[0] << 28) | (s->v_count[0] << 24) |
                  (s->h_count[1] << 20) | (s->v_count[1] << 16) |
                  (s->h_count[2] << 12) | (s->v_count[2] <<  8) |
                  (s->h_count[3] <<  4) |  s->v_count[3];
@@ -365,38 +471,197 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
     if (!(pix_fmt_id & 0x0D0D0D0D))
         pix_fmt_id -= (pix_fmt_id & 0x0F0F0F0F) >> 1;
 
+    for (i = 0; i < 8; i++) {
+        int j = 6 + (i&1) - (i&6);
+        int is = (pix_fmt_id >> (4*i)) & 0xF;
+        int js = (pix_fmt_id >> (4*j)) & 0xF;
+
+        if (is == 1 && js != 2 && (i < 2 || i > 5))
+            js = (pix_fmt_id >> ( 8 + 4*(i&1))) & 0xF;
+        if (is == 1 && js != 2 && (i < 2 || i > 5))
+            js = (pix_fmt_id >> (16 + 4*(i&1))) & 0xF;
+
+        if (is == 1 && js == 2) {
+            if (i & 1) s->upscale_h[j/2] = 1;
+            else       s->upscale_v[j/2] = 1;
+        }
+    }
+
     switch (pix_fmt_id) {
     case 0x11111100:
         if (s->rgb)
-            s->avctx->pix_fmt = AV_PIX_FMT_BGRA;
+            s->avctx->pix_fmt = s->bits <= 9 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_BGR48;
+        else {
+            if (   s->adobe_transform == 0
+                || s->component_id[0] == 'R' - 1 && s->component_id[1] == 'G' - 1 && s->component_id[2] == 'B' - 1) {
+                s->avctx->pix_fmt = s->bits <= 8 ? AV_PIX_FMT_GBRP : AV_PIX_FMT_GBRP16;
+            } else {
+                if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+                else              s->avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+            }
+        }
+        av_assert0(s->nb_components == 3);
+        break;
+    case 0x11111111:
+        if (s->rgb)
+            s->avctx->pix_fmt = s->bits <= 9 ? AV_PIX_FMT_ABGR : AV_PIX_FMT_RGBA64;
         else {
-            s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+            if (s->adobe_transform == 0 && s->bits <= 8) {
+                s->avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            } else {
+                s->avctx->pix_fmt = s->bits <= 8 ? AV_PIX_FMT_YUVA444P : AV_PIX_FMT_YUVA444P16;
+                s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+            }
+        }
+        av_assert0(s->nb_components == 4);
+        break;
+    case 0x22111122:
+    case 0x22111111:
+        if (s->adobe_transform == 0 && s->bits <= 8) {
+            s->avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+        } else if (s->adobe_transform == 2 && s->bits <= 8) {
+            s->avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        } else {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+            else              s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P16;
             s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         }
-        assert(s->nb_components == 3);
+        av_assert0(s->nb_components == 4);
+        break;
+    case 0x12121100:
+    case 0x22122100:
+    case 0x21211100:
+    case 0x22211200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        else
+            goto unk_pixfmt;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        break;
+    case 0x22221100:
+    case 0x22112200:
+    case 0x11222200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        else
+            goto unk_pixfmt;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     case 0x11000000:
-        s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+    case 0x13000000:
+    case 0x14000000:
+    case 0x31000000:
+    case 0x33000000:
+    case 0x34000000:
+    case 0x41000000:
+    case 0x43000000:
+    case 0x44000000:
+        if(s->bits <= 8)
+            s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        else
+            s->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
         break;
     case 0x12111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV440P : AV_PIX_FMT_YUVJ440P;
-        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+    case 0x14121200:
+    case 0x14111100:
+    case 0x22211100:
+    case 0x22112100:
+        if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            else
+                goto unk_pixfmt;
+            s->upscale_v[0] = s->upscale_v[1] = 1;
+        } else {
+            if (pix_fmt_id == 0x14111100)
+                s->upscale_v[1] = s->upscale_v[2] = 1;
+            if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV440P : AV_PIX_FMT_YUVJ440P;
+            else
+                goto unk_pixfmt;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        }
         break;
     case 0x21111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+        if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            else
+                goto unk_pixfmt;
+            s->upscale_h[0] = s->upscale_h[1] = 1;
+        } else {
+            if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+            else              s->avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        }
+        break;
+    case 0x31111100:
+        if (s->bits > 8)
+            goto unk_pixfmt;
+        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        s->upscale_h[1] = s->upscale_h[2] = 2;
+        break;
+    case 0x22121100:
+    case 0x22111200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+        else
+            goto unk_pixfmt;
         s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     case 0x22111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUVJ420P;
+    case 0x23111100:
+    case 0x42111100:
+    case 0x24111100:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUVJ420P;
+        else              s->avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        if (pix_fmt_id == 0x42111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+        } else if (pix_fmt_id == 0x24111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+        } else if (pix_fmt_id == 0x23111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_v[1] = s->upscale_v[2] = 2;
+        }
+        break;
+    case 0x41111100:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV411P : AV_PIX_FMT_YUVJ411P;
+        else
+            goto unk_pixfmt;
         s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     default:
-        avpriv_report_missing_feature(s->avctx, "Pixel format 0x%x", pix_fmt_id);
+unk_pixfmt:
+        avpriv_report_missing_feature(s->avctx, "Pixel format 0x%x bits:%d", pix_fmt_id, s->bits);
+        memset(s->upscale_h, 0, sizeof(s->upscale_h));
+        memset(s->upscale_v, 0, sizeof(s->upscale_v));
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((AV_RB32(s->upscale_h) || AV_RB32(s->upscale_v)) && s->avctx->lowres) {
+        avpriv_report_missing_feature(s->avctx, "Lowres for weird subsampling");
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((AV_RB32(s->upscale_h) || AV_RB32(s->upscale_v)) && s->progressive && s->avctx->pix_fmt == AV_PIX_FMT_GBRP) {
+        avpriv_report_missing_feature(s->avctx, "progressive for weird subsampling");
         return AVERROR_PATCHWELCOME;
     }
     if (s->ls) {
-        if (s->nb_components > 1)
+        memset(s->upscale_h, 0, sizeof(s->upscale_h));
+        memset(s->upscale_v, 0, sizeof(s->upscale_v));
+        if (s->nb_components == 3) {
             s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if (s->nb_components != 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported number of components %d\n", s->nb_components);
+            return AVERROR_PATCHWELCOME;
+        } else if (s->palette_index && s->bits <= 8)
+            s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
         else if (s->bits <= 8)
             s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
         else
@@ -409,24 +674,54 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         return AVERROR_BUG;
     }
 
+    if (s->avctx->pix_fmt == s->hwaccel_sw_pix_fmt && !size_change) {
+        s->avctx->pix_fmt = s->hwaccel_pix_fmt;
+    } else {
+        enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_MJPEG_NVDEC_HWACCEL
+            AV_PIX_FMT_CUDA,
+#endif
+#if CONFIG_MJPEG_VAAPI_HWACCEL
+            AV_PIX_FMT_VAAPI,
+#endif
+            s->avctx->pix_fmt,
+            AV_PIX_FMT_NONE,
+        };
+        s->hwaccel_pix_fmt = ff_get_format(s->avctx, pix_fmts);
+        if (s->hwaccel_pix_fmt < 0)
+            return AVERROR(EINVAL);
+
+        s->hwaccel_sw_pix_fmt = s->avctx->pix_fmt;
+        s->avctx->pix_fmt     = s->hwaccel_pix_fmt;
+    }
+
+    if (s->avctx->skip_frame == AVDISCARD_ALL) {
+        s->picture_ptr->pict_type = AV_PICTURE_TYPE_I;
+        s->picture_ptr->key_frame = 1;
+        s->got_picture            = 1;
+        return 0;
+    }
+
     av_frame_unref(s->picture_ptr);
-    if (ff_get_buffer(s->avctx, s->picture_ptr, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (ff_get_buffer(s->avctx, s->picture_ptr, AV_GET_BUFFER_FLAG_REF) < 0)
         return -1;
-    }
     s->picture_ptr->pict_type = AV_PICTURE_TYPE_I;
     s->picture_ptr->key_frame = 1;
     s->got_picture            = 1;
 
-    for (i = 0; i < 3; i++)
+    for (i = 0; i < 4; i++)
         s->linesize[i] = s->picture_ptr->linesize[i] << s->interlaced;
 
     ff_dlog(s->avctx, "%d %d %d %d %d %d\n",
             s->width, s->height, s->linesize[0], s->linesize[1],
             s->interlaced, s->avctx->height);
 
-    if (len != (8 + (3 * nb_components)))
-        av_log(s->avctx, AV_LOG_DEBUG, "decode_sof0: error, len(%d) mismatch\n", len);
+    }
+
+    if ((s->rgb && !s->lossless && !s->ls) ||
+        (!s->rgb && s->ls && s->nb_components > 1)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported coding and pixel format combination\n");
+        return AVERROR_PATCHWELCOME;
     }
 
     /* totally blank picture as progressive JPEG will only add details to it */
@@ -437,12 +732,27 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
             int size = bw * bh * s->h_count[i] * s->v_count[i];
             av_freep(&s->blocks[i]);
             av_freep(&s->last_nnz[i]);
-            s->blocks[i]       = av_malloc(size * sizeof(**s->blocks));
-            s->last_nnz[i]     = av_mallocz(size * sizeof(**s->last_nnz));
+            s->blocks[i]       = av_mallocz_array(size, sizeof(**s->blocks));
+            s->last_nnz[i]     = av_mallocz_array(size, sizeof(**s->last_nnz));
+            if (!s->blocks[i] || !s->last_nnz[i])
+                return AVERROR(ENOMEM);
             s->block_stride[i] = bw * s->h_count[i];
         }
         memset(s->coefs_finished, 0, sizeof(s->coefs_finished));
     }
+
+    if (s->avctx->hwaccel) {
+        s->hwaccel_picture_private =
+            av_mallocz(s->avctx->hwaccel->frame_priv_data_size);
+        if (!s->hwaccel_picture_private)
+            return AVERROR(ENOMEM);
+
+        ret = s->avctx->hwaccel->start_frame(s->avctx, s->raw_image_buffer,
+                                             s->raw_image_buffer_size);
+        if (ret < 0)
+            return ret;
+    }
+
     return 0;
 }
 
@@ -450,11 +760,11 @@ static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index)
 {
     int code;
     code = get_vlc2(&s->gb, s->vlcs[0][dc_index].table, 9, 2);
-    if (code < 0) {
+    if (code < 0 || code > 16) {
         av_log(s->avctx, AV_LOG_WARNING,
                "mjpeg_decode_dc: bad vlc: %d:%d (%p)\n",
                0, dc_index, &s->vlcs[0][dc_index]);
-        return 0xffff;
+        return 0xfffff;
     }
 
     if (code)
@@ -465,17 +775,18 @@ static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index)
 
 /* decode block and dequantize */
 static int decode_block(MJpegDecodeContext *s, int16_t *block, int component,
-                        int dc_index, int ac_index, int16_t *quant_matrix)
+                        int dc_index, int ac_index, uint16_t *quant_matrix)
 {
     int code, i, j, level, val;
 
     /* DC coef */
     val = mjpeg_decode_dc(s, dc_index);
-    if (val == 0xffff) {
+    if (val == 0xfffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
         return AVERROR_INVALIDDATA;
     }
-    val = val * quant_matrix[0] + s->last_dc[component];
+    val = val * (unsigned)quant_matrix[0] + s->last_dc[component];
+    val = av_clip_int16(val);
     s->last_dc[component] = val;
     block[0] = val;
     /* AC coefs */
@@ -504,7 +815,7 @@ static int decode_block(MJpegDecodeContext *s, int16_t *block, int component,
                 return AVERROR_INVALIDDATA;
             }
             j        = s->scantable.permutated[i];
-            block[j] = level * quant_matrix[j];
+            block[j] = level * quant_matrix[i];
         }
     } while (i < 63);
     CLOSE_READER(re, &s->gb);}
@@ -514,16 +825,16 @@ static int decode_block(MJpegDecodeContext *s, int16_t *block, int component,
 
 static int decode_dc_progressive(MJpegDecodeContext *s, int16_t *block,
                                  int component, int dc_index,
-                                 int16_t *quant_matrix, int Al)
+                                 uint16_t *quant_matrix, int Al)
 {
-    int val;
+    unsigned val;
     s->bdsp.clear_block(block);
     val = mjpeg_decode_dc(s, dc_index);
-    if (val == 0xffff) {
+    if (val == 0xfffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
         return AVERROR_INVALIDDATA;
     }
-    val = (val * quant_matrix[0] << Al) + s->last_dc[component];
+    val = (val * (quant_matrix[0] << Al)) + s->last_dc[component];
     s->last_dc[component] = val;
     block[0] = val;
     return 0;
@@ -532,10 +843,11 @@ static int decode_dc_progressive(MJpegDecodeContext *s, int16_t *block,
 /* decode block and dequantize - progressive JPEG version */
 static int decode_block_progressive(MJpegDecodeContext *s, int16_t *block,
                                     uint8_t *last_nnz, int ac_index,
-                                    int16_t *quant_matrix,
+                                    uint16_t *quant_matrix,
                                     int ss, int se, int Al, int *EOBRUN)
 {
-    int code, i, j, level, val, run;
+    int code, i, j, val, run;
+    unsigned level;
 
     if (*EOBRUN) {
         (*EOBRUN)--;
@@ -566,14 +878,14 @@ static int decode_block_progressive(MJpegDecodeContext *s, int16_t *block,
                 if (i >= se) {
                     if (i == se) {
                         j = s->scantable.permutated[se];
-                        block[j] = level * quant_matrix[j] << Al;
+                        block[j] = level * (quant_matrix[se] << Al);
                         break;
                     }
                     av_log(s->avctx, AV_LOG_ERROR, "error count: %d\n", i);
                     return AVERROR_INVALIDDATA;
                 }
                 j = s->scantable.permutated[i];
-                block[j] = level * quant_matrix[j] << Al;
+                block[j] = level * (quant_matrix[i] << Al);
             } else {
                 if (run == 0xF) {// ZRL - skip 15 coefficients
                     i += 15;
@@ -606,7 +918,7 @@ static int decode_block_progressive(MJpegDecodeContext *s, int16_t *block,
     UPDATE_CACHE(re, &s->gb);                                       \
     sign = block[j] >> 15;                                          \
     block[j] += SHOW_UBITS(re, &s->gb, 1) *                         \
-                ((quant_matrix[j] ^ sign) - sign) << Al;            \
+                ((quant_matrix[i] ^ sign) - sign) << Al;            \
     LAST_SKIP_BITS(re, &s->gb, 1);                                  \
 }
 
@@ -630,7 +942,7 @@ for (; ; i++) {                                                     \
 /* decode block and dequantize - progressive JPEG refinement pass */
 static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
                                    uint8_t *last_nnz,
-                                   int ac_index, int16_t *quant_matrix,
+                                   int ac_index, uint16_t *quant_matrix,
                                    int ss, int se, int Al, int *EOBRUN)
 {
     int code, i = ss, j, sign, val, run;
@@ -652,7 +964,7 @@ static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
                 ZERO_RUN;
                 j = s->scantable.permutated[i];
                 val--;
-                block[j] = ((quant_matrix[j]^val) - val) << Al;
+                block[j] = ((quant_matrix[i] << Al) ^ val) - val;
                 if (i == se) {
                     if (i > *last_nnz)
                         *last_nnz = i;
@@ -693,14 +1005,57 @@ static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
 #undef REFINE_BIT
 #undef ZERO_RUN
 
-static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
-                                 int point_transform)
+static int handle_rstn(MJpegDecodeContext *s, int nb_components)
+{
+    int i;
+    int reset = 0;
+
+    if (s->restart_interval) {
+        s->restart_count--;
+        if(s->restart_count == 0 && s->avctx->codec_id == AV_CODEC_ID_THP){
+            align_get_bits(&s->gb);
+            for (i = 0; i < nb_components; i++) /* reset dc */
+                s->last_dc[i] = (4 << s->bits);
+        }
+
+        i = 8 + ((-get_bits_count(&s->gb)) & 7);
+        /* skip RSTn */
+        if (s->restart_count == 0) {
+            if(   show_bits(&s->gb, i) == (1 << i) - 1
+               || show_bits(&s->gb, i) == 0xFF) {
+                int pos = get_bits_count(&s->gb);
+                align_get_bits(&s->gb);
+                while (get_bits_left(&s->gb) >= 8 && show_bits(&s->gb, 8) == 0xFF)
+                    skip_bits(&s->gb, 8);
+                if (get_bits_left(&s->gb) >= 8 && (get_bits(&s->gb, 8) & 0xF8) == 0xD0) {
+                    for (i = 0; i < nb_components; i++) /* reset dc */
+                        s->last_dc[i] = (4 << s->bits);
+                    reset = 1;
+                } else
+                    skip_bits_long(&s->gb, pos - get_bits_count(&s->gb));
+            }
+        }
+    }
+    return reset;
+}
+
+static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int nb_components, int predictor, int point_transform)
 {
     int i, mb_x, mb_y;
     uint16_t (*buffer)[4];
-    int left[3], top[3], topleft[3];
+    int left[4], top[4], topleft[4];
     const int linesize = s->linesize[0];
-    const int mask     = (1 << s->bits) - 1;
+    const int mask     = ((1 << s->bits) - 1) << point_transform;
+    int resync_mb_y = 0;
+    int resync_mb_x = 0;
+
+    if (s->nb_components != 3 && s->nb_components != 4)
+        return AVERROR_INVALIDDATA;
+    if (s->v_max != 1 || s->h_max != 1 || !s->lossless)
+        return AVERROR_INVALIDDATA;
+
+
+    s->restart_count = s->restart_interval;
 
     av_fast_malloc(&s->ljpeg_buffer, &s->ljpeg_buffer_size,
                    (unsigned)s->mb_width * 4 * sizeof(s->ljpeg_buffer[0][0]));
@@ -709,33 +1064,50 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
 
     buffer = s->ljpeg_buffer;
 
-    for (i = 0; i < 3; i++)
-        buffer[0][i] = 1 << (s->bits + point_transform - 1);
+    for (i = 0; i < 4; i++)
+        buffer[0][i] = 1 << (s->bits - 1);
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        const int modified_predictor = mb_y ? predictor : 1;
         uint8_t *ptr = s->picture_ptr->data[0] + (linesize * mb_y);
 
         if (s->interlaced && s->bottom_field)
             ptr += linesize >> 1;
 
-        for (i = 0; i < 3; i++)
+        for (i = 0; i < 4; i++)
             top[i] = left[i] = topleft[i] = buffer[0][i];
 
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-            if (s->restart_interval && !s->restart_count)
+            int modified_predictor = predictor;
+
+            if (get_bits_left(&s->gb) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "bitstream end in rgb_scan\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (s->restart_interval && !s->restart_count){
                 s->restart_count = s->restart_interval;
+                resync_mb_x = mb_x;
+                resync_mb_y = mb_y;
+                for(i=0; i<4; i++)
+                    top[i] = left[i]= topleft[i]= 1 << (s->bits - 1);
+            }
+            if (mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x || !mb_x)
+                modified_predictor = 1;
 
-            for (i = 0; i < 3; i++) {
-                int pred;
+            for (i=0;i<nb_components;i++) {
+                int pred, dc;
 
                 topleft[i] = top[i];
                 top[i]     = buffer[mb_x][i];
 
                 PREDICT(pred, topleft[i], top[i], left[i], modified_predictor);
 
+                dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                if(dc == 0xFFFFF)
+                    return -1;
+
                 left[i] = buffer[mb_x][i] =
-                    mask & (pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform));
+                    mask & (pred + (unsigned)(dc * (1 << point_transform)));
             }
 
             if (s->restart_interval && !--s->restart_count) {
@@ -743,24 +1115,54 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
                 skip_bits(&s->gb, 16); /* skip RSTn */
             }
         }
-
-        if (s->rct) {
+        if (s->rct && s->nb_components == 4) {
+            for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                ptr[4*mb_x + 2] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
+                ptr[4*mb_x + 1] = buffer[mb_x][1] + ptr[4*mb_x + 2];
+                ptr[4*mb_x + 3] = buffer[mb_x][2] + ptr[4*mb_x + 2];
+                ptr[4*mb_x + 0] = buffer[mb_x][3];
+            }
+        } else if (s->nb_components == 4) {
+            for(i=0; i<nb_components; i++) {
+                int c= s->comp_index[i];
+                if (s->bits <= 8) {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ptr[4*mb_x+3-c] = buffer[mb_x][i];
+                    }
+                } else if(s->bits == 9) {
+                    return AVERROR_PATCHWELCOME;
+                } else {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ((uint16_t*)ptr)[4*mb_x+c] = buffer[mb_x][i];
+                    }
+                }
+            }
+        } else if (s->rct) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
-                ptr[4 * mb_x + 0] = buffer[mb_x][1] + ptr[4 * mb_x + 1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][2] + ptr[4 * mb_x + 1];
+                ptr[3*mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
+                ptr[3*mb_x + 0] = buffer[mb_x][1] + ptr[3*mb_x + 1];
+                ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
             }
         } else if (s->pegasus_rct) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2]) >> 2);
-                ptr[4 * mb_x + 0] = buffer[mb_x][1] + ptr[4 * mb_x + 1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][2] + ptr[4 * mb_x + 1];
+                ptr[3*mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2]) >> 2);
+                ptr[3*mb_x + 0] = buffer[mb_x][1] + ptr[3*mb_x + 1];
+                ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
             }
         } else {
-            for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 0] = buffer[mb_x][2];
-                ptr[4 * mb_x + 1] = buffer[mb_x][1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][0];
+            for(i=0; i<nb_components; i++) {
+                int c= s->comp_index[i];
+                if (s->bits <= 8) {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ptr[3*mb_x+2-c] = buffer[mb_x][i];
+                    }
+                } else if(s->bits == 9) {
+                    return AVERROR_PATCHWELCOME;
+                } else {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ((uint16_t*)ptr)[3*mb_x+2-c] = buffer[mb_x][i];
+                    }
+                }
             }
         }
     }
@@ -770,48 +1172,95 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
 static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
                                  int point_transform, int nb_components)
 {
-    int i, mb_x, mb_y;
+    int i, mb_x, mb_y, mask;
+    int bits= (s->bits+7)&~7;
+    int resync_mb_y = 0;
+    int resync_mb_x = 0;
+
+    point_transform += bits - s->bits;
+    mask = ((1 << s->bits) - 1) << point_transform;
+
+    av_assert0(nb_components>=1 && nb_components<=4);
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-            if (s->restart_interval && !s->restart_count)
+            if (get_bits_left(&s->gb) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "bitstream end in yuv_scan\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (s->restart_interval && !s->restart_count){
                 s->restart_count = s->restart_interval;
+                resync_mb_x = mb_x;
+                resync_mb_y = mb_y;
+            }
 
-            if (mb_x == 0 || mb_y == 0 || s->interlaced) {
+            if(!mb_x || mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x || s->interlaced){
+                int toprow  = mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x;
+                int leftcol = !mb_x || mb_y == resync_mb_y && mb_x == resync_mb_x;
                 for (i = 0; i < nb_components; i++) {
                     uint8_t *ptr;
+                    uint16_t *ptr16;
                     int n, h, v, x, y, c, j, linesize;
-                    n        = s->nb_blocks[i];
-                    c        = s->comp_index[i];
-                    h        = s->h_scount[i];
-                    v        = s->v_scount[i];
-                    x        = 0;
-                    y        = 0;
-                    linesize = s->linesize[c];
-
-                    for (j = 0; j < n; j++) {
-                        int pred;
-                        // FIXME optimize this crap
-                        ptr = s->picture_ptr->data[c] +
-                              (linesize * (v * mb_y + y)) +
-                              (h * mb_x + x);
-                        if (y == 0 && mb_y == 0) {
-                            if (x == 0 && mb_x == 0)
-                                pred = 128 << point_transform;
-                            else
-                                pred = ptr[-1];
-                        } else {
-                            if (x == 0 && mb_x == 0)
-                                pred = ptr[-linesize];
-                            else
-                                PREDICT(pred, ptr[-linesize - 1],
-                                        ptr[-linesize], ptr[-1], predictor);
-                       }
+                    n = s->nb_blocks[i];
+                    c = s->comp_index[i];
+                    h = s->h_scount[i];
+                    v = s->v_scount[i];
+                    x = 0;
+                    y = 0;
+                    linesize= s->linesize[c];
+
+                    if(bits>8) linesize /= 2;
+
+                    for(j=0; j<n; j++) {
+                        int pred, dc;
+
+                        dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                        if(dc == 0xFFFFF)
+                            return -1;
+                        if (   h * mb_x + x >= s->width
+                            || v * mb_y + y >= s->height) {
+                            // Nothing to do
+                        } else if (bits<=8) {
+                            ptr = s->picture_ptr->data[c] + (linesize * (v * mb_y + y)) + (h * mb_x + x); //FIXME optimize this crap
+                            if(y==0 && toprow){
+                                if(x==0 && leftcol){
+                                    pred= 1 << (bits - 1);
+                                }else{
+                                    pred= ptr[-1];
+                                }
+                            }else{
+                                if(x==0 && leftcol){
+                                    pred= ptr[-linesize];
+                                }else{
+                                    PREDICT(pred, ptr[-linesize-1], ptr[-linesize], ptr[-1], predictor);
+                                }
+                            }
 
-                        if (s->interlaced && s->bottom_field)
-                            ptr += linesize >> 1;
-                        *ptr = pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform);
+                            if (s->interlaced && s->bottom_field)
+                                ptr += linesize >> 1;
+                            pred &= mask;
+                            *ptr= pred + ((unsigned)dc << point_transform);
+                        }else{
+                            ptr16 = (uint16_t*)(s->picture_ptr->data[c] + 2*(linesize * (v * mb_y + y)) + 2*(h * mb_x + x)); //FIXME optimize this crap
+                            if(y==0 && toprow){
+                                if(x==0 && leftcol){
+                                    pred= 1 << (bits - 1);
+                                }else{
+                                    pred= ptr16[-1];
+                                }
+                            }else{
+                                if(x==0 && leftcol){
+                                    pred= ptr16[-linesize];
+                                }else{
+                                    PREDICT(pred, ptr16[-linesize-1], ptr16[-linesize], ptr16[-1], predictor);
+                                }
+                            }
 
+                            if (s->interlaced && s->bottom_field)
+                                ptr16 += linesize >> 1;
+                            pred &= mask;
+                            *ptr16= pred + ((unsigned)dc << point_transform);
+                        }
                         if (++x == h) {
                             x = 0;
                             y++;
@@ -821,7 +1270,8 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
             } else {
                 for (i = 0; i < nb_components; i++) {
                     uint8_t *ptr;
-                    int n, h, v, x, y, c, j, linesize;
+                    uint16_t *ptr16;
+                    int n, h, v, x, y, c, j, linesize, dc;
                     n        = s->nb_blocks[i];
                     c        = s->comp_index[i];
                     h        = s->h_scount[i];
@@ -830,16 +1280,33 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
                     y        = 0;
                     linesize = s->linesize[c];
 
+                    if(bits>8) linesize /= 2;
+
                     for (j = 0; j < n; j++) {
                         int pred;
 
-                        // FIXME optimize this crap
-                        ptr = s->picture_ptr->data[c] +
+                        dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                        if(dc == 0xFFFFF)
+                            return -1;
+                        if (   h * mb_x + x >= s->width
+                            || v * mb_y + y >= s->height) {
+                            // Nothing to do
+                        } else if (bits<=8) {
+                            ptr = s->picture_ptr->data[c] +
                               (linesize * (v * mb_y + y)) +
-                              (h * mb_x + x);
-                        PREDICT(pred, ptr[-linesize - 1],
-                                ptr[-linesize], ptr[-1], predictor);
-                        *ptr = pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform);
+                              (h * mb_x + x); //FIXME optimize this crap
+                            PREDICT(pred, ptr[-linesize-1], ptr[-linesize], ptr[-1], predictor);
+
+                            pred &= mask;
+                            *ptr = pred + ((unsigned)dc << point_transform);
+                        }else{
+                            ptr16 = (uint16_t*)(s->picture_ptr->data[c] + 2*(linesize * (v * mb_y + y)) + 2*(h * mb_x + x)); //FIXME optimize this crap
+                            PREDICT(pred, ptr16[-linesize-1], ptr16[-linesize], ptr16[-1], predictor);
+
+                            pred &= mask;
+                            *ptr16= pred + ((unsigned)dc << point_transform);
+                        }
+
                         if (++x == h) {
                             x = 0;
                             y++;
@@ -856,18 +1323,63 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
     return 0;
 }
 
+static av_always_inline void mjpeg_copy_block(MJpegDecodeContext *s,
+                                              uint8_t *dst, const uint8_t *src,
+                                              int linesize, int lowres)
+{
+    switch (lowres) {
+    case 0: s->hdsp.put_pixels_tab[1][0](dst, src, linesize, 8);
+        break;
+    case 1: copy_block4(dst, src, linesize, linesize, 4);
+        break;
+    case 2: copy_block2(dst, src, linesize, linesize, 2);
+        break;
+    case 3: *dst = *src;
+        break;
+    }
+}
+
+static void shift_output(MJpegDecodeContext *s, uint8_t *ptr, int linesize)
+{
+    int block_x, block_y;
+    int size = 8 >> s->avctx->lowres;
+    if (s->bits > 8) {
+        for (block_y=0; block_y<size; block_y++)
+            for (block_x=0; block_x<size; block_x++)
+                *(uint16_t*)(ptr + 2*block_x + block_y*linesize) <<= 16 - s->bits;
+    } else {
+        for (block_y=0; block_y<size; block_y++)
+            for (block_x=0; block_x<size; block_x++)
+                *(ptr + block_x + block_y*linesize) <<= 8 - s->bits;
+    }
+}
+
 static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                              int Al, const uint8_t *mb_bitmask,
+                             int mb_bitmask_size,
                              const AVFrame *reference)
 {
-    int i, mb_x, mb_y;
+    int i, mb_x, mb_y, chroma_h_shift, chroma_v_shift, chroma_width, chroma_height;
     uint8_t *data[MAX_COMPONENTS];
     const uint8_t *reference_data[MAX_COMPONENTS];
     int linesize[MAX_COMPONENTS];
-    GetBitContext mb_bitmask_gb;
+    GetBitContext mb_bitmask_gb = {0}; // initialize to silence gcc warning
+    int bytes_per_pixel = 1 + (s->bits > 8);
 
-    if (mb_bitmask)
+    if (mb_bitmask) {
+        if (mb_bitmask_size != (s->mb_width * s->mb_height + 7)>>3) {
+            av_log(s->avctx, AV_LOG_ERROR, "mb_bitmask_size mismatches\n");
+            return AVERROR_INVALIDDATA;
+        }
         init_get_bits(&mb_bitmask_gb, mb_bitmask, s->mb_width * s->mb_height);
+    }
+
+    s->restart_count = 0;
+
+    av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt, &chroma_h_shift,
+                                     &chroma_v_shift);
+    chroma_width  = AV_CEIL_RSHIFT(s->width,  chroma_h_shift);
+    chroma_height = AV_CEIL_RSHIFT(s->height, chroma_v_shift);
 
     for (i = 0; i < nb_components; i++) {
         int c   = s->comp_index[i];
@@ -900,27 +1412,36 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                 x = 0;
                 y = 0;
                 for (j = 0; j < n; j++) {
-                    block_offset = ((linesize[c] * (v * mb_y + y) * 8) +
-                                    (h * mb_x + x) * 8);
+                    block_offset = (((linesize[c] * (v * mb_y + y) * 8) +
+                                     (h * mb_x + x) * 8 * bytes_per_pixel) >> s->avctx->lowres);
 
                     if (s->interlaced && s->bottom_field)
                         block_offset += linesize[c] >> 1;
-                    ptr = data[c] + block_offset;
+                    if (   8*(h * mb_x + x) < ((c == 1) || (c == 2) ? chroma_width  : s->width)
+                        && 8*(v * mb_y + y) < ((c == 1) || (c == 2) ? chroma_height : s->height)) {
+                        ptr = data[c] + block_offset;
+                    } else
+                        ptr = NULL;
                     if (!s->progressive) {
-                        if (copy_mb)
-                            s->hdsp.put_pixels_tab[1][0](ptr,
-                                reference_data[c] + block_offset,
-                                linesize[c], 8);
-                        else {
+                        if (copy_mb) {
+                            if (ptr)
+                                mjpeg_copy_block(s, ptr, reference_data[c] + block_offset,
+                                                linesize[c], s->avctx->lowres);
+
+                        } else {
                             s->bdsp.clear_block(s->block);
                             if (decode_block(s, s->block, i,
                                              s->dc_index[i], s->ac_index[i],
-                                             s->quant_matrixes[s->quant_index[c]]) < 0) {
+                                             s->quant_matrixes[s->quant_sindex[i]]) < 0) {
                                 av_log(s->avctx, AV_LOG_ERROR,
                                        "error y=%d x=%d\n", mb_y, mb_x);
                                 return AVERROR_INVALIDDATA;
                             }
-                            s->idsp.idct_put(ptr, linesize[c], s->block);
+                            if (ptr) {
+                                s->idsp.idct_put(ptr, linesize[c], s->block);
+                                if (s->bits & 7)
+                                    shift_output(s, ptr, linesize[c]);
+                            }
                         }
                     } else {
                         int block_idx  = s->block_stride[c] * (v * mb_y + y) +
@@ -928,9 +1449,9 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                         int16_t *block = s->blocks[c][block_idx];
                         if (Ah)
                             block[0] += get_bits1(&s->gb) *
-                                        s->quant_matrixes[s->quant_index[c]][0] << Al;
+                                        s->quant_matrixes[s->quant_sindex[i]][0] << Al;
                         else if (decode_dc_progressive(s, block, i, s->dc_index[i],
-                                                       s->quant_matrixes[s->quant_index[c]],
+                                                       s->quant_matrixes[s->quant_sindex[i]],
                                                        Al) < 0) {
                             av_log(s->avctx, AV_LOG_ERROR,
                                    "error y=%d x=%d\n", mb_y, mb_x);
@@ -948,74 +1469,45 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                 }
             }
 
-            if (s->restart_interval) {
-                s->restart_count--;
-                i = 8 + ((-get_bits_count(&s->gb)) & 7);
-                /* skip RSTn */
-                if (show_bits(&s->gb, i) == (1 << i) - 1) {
-                    int pos = get_bits_count(&s->gb);
-                    align_get_bits(&s->gb);
-                    while (get_bits_left(&s->gb) >= 8 && show_bits(&s->gb, 8) == 0xFF)
-                        skip_bits(&s->gb, 8);
-                    if ((get_bits(&s->gb, 8) & 0xF8) == 0xD0) {
-                        for (i = 0; i < nb_components; i++) /* reset dc */
-                            s->last_dc[i] = 1024;
-                    } else
-                        skip_bits_long(&s->gb, pos - get_bits_count(&s->gb));
-                }
-            }
+            handle_rstn(s, nb_components);
         }
     }
     return 0;
 }
 
 static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
-                                            int se, int Ah, int Al,
-                                            const uint8_t *mb_bitmask,
-                                            const AVFrame *reference)
+                                            int se, int Ah, int Al)
 {
     int mb_x, mb_y;
     int EOBRUN = 0;
     int c = s->comp_index[0];
-    uint8_t *data = s->picture_ptr->data[c];
-    const uint8_t *reference_data = reference ? reference->data[c] : NULL;
-    int linesize  = s->linesize[c];
-    int last_scan = 0;
-    int16_t *quant_matrix = s->quant_matrixes[s->quant_index[c]];
-    GetBitContext mb_bitmask_gb;
-
-    if (ss < 0  || ss >= 64 ||
-        se < ss || se >= 64 ||
-        Ah < 0  || Al < 0)
-        return AVERROR_INVALIDDATA;
+    uint16_t *quant_matrix = s->quant_matrixes[s->quant_sindex[0]];
 
-    if (mb_bitmask)
-        init_get_bits(&mb_bitmask_gb, mb_bitmask, s->mb_width * s->mb_height);
-
-    if (!Al) {
-        // s->coefs_finished is a bitmask for coefficients coded
-        // ss and se are parameters telling start and end coefficients
-        s->coefs_finished[c] |= (~0ULL >> (63 - (se - ss))) << ss;
-        last_scan = !~s->coefs_finished[c];
+    av_assert0(ss>=0 && Ah>=0 && Al>=0);
+    if (se < ss || se > 63) {
+        av_log(s->avctx, AV_LOG_ERROR, "SS/SE %d/%d is invalid\n", ss, se);
+        return AVERROR_INVALIDDATA;
     }
 
-    if (s->interlaced && s->bottom_field) {
-        int offset      = linesize >> 1;
-        data           += offset;
-        reference_data += offset;
-    }
+    // s->coefs_finished is a bitmask for coefficients coded
+    // ss and se are parameters telling start and end coefficients
+    s->coefs_finished[c] |= (2ULL << se) - (1ULL << ss);
+
+    s->restart_count = 0;
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        int block_offset = mb_y * linesize * 8;
-        uint8_t *ptr     = data + block_offset;
         int block_idx    = mb_y * s->block_stride[c];
         int16_t (*block)[64] = &s->blocks[c][block_idx];
         uint8_t *last_nnz    = &s->last_nnz[c][block_idx];
+        if (get_bits_left(&s->gb) <= 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "bitstream truncated in mjpeg_decode_scan_progressive_ac\n");
+            return AVERROR_INVALIDDATA;
+        }
         for (mb_x = 0; mb_x < s->mb_width; mb_x++, block++, last_nnz++) {
-            const int copy_mb = mb_bitmask && !get_bits1(&mb_bitmask_gb);
-
-            if (!copy_mb) {
                 int ret;
+                if (s->restart_interval && !s->restart_count)
+                    s->restart_count = s->restart_interval;
+
                 if (Ah)
                     ret = decode_block_refinement(s, *block, last_nnz, s->ac_index[0],
                                                   quant_matrix, ss, se, Al, &EOBRUN);
@@ -1027,31 +1519,72 @@ static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
                            "error y=%d x=%d\n", mb_y, mb_x);
                     return AVERROR_INVALIDDATA;
                 }
-            }
 
-            if (last_scan) {
-                if (copy_mb) {
-                    s->hdsp.put_pixels_tab[1][0](ptr,
-                                                 reference_data + block_offset,
-                                                 linesize, 8);
-                } else {
-                    s->idsp.idct_put(ptr, linesize, *block);
-                    ptr += 8;
-                }
-            }
+            if (handle_rstn(s, 0))
+                EOBRUN = 0;
         }
     }
     return 0;
 }
 
+static void mjpeg_idct_scan_progressive_ac(MJpegDecodeContext *s)
+{
+    int mb_x, mb_y;
+    int c;
+    const int bytes_per_pixel = 1 + (s->bits > 8);
+    const int block_size = s->lossless ? 1 : 8;
+
+    for (c = 0; c < s->nb_components; c++) {
+        uint8_t *data = s->picture_ptr->data[c];
+        int linesize  = s->linesize[c];
+        int h = s->h_max / s->h_count[c];
+        int v = s->v_max / s->v_count[c];
+        int mb_width     = (s->width  + h * block_size - 1) / (h * block_size);
+        int mb_height    = (s->height + v * block_size - 1) / (v * block_size);
+
+        if (~s->coefs_finished[c])
+            av_log(s->avctx, AV_LOG_WARNING, "component %d is incomplete\n", c);
+
+        if (s->interlaced && s->bottom_field)
+            data += linesize >> 1;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            uint8_t *ptr     = data + (mb_y * linesize * 8 >> s->avctx->lowres);
+            int block_idx    = mb_y * s->block_stride[c];
+            int16_t (*block)[64] = &s->blocks[c][block_idx];
+            for (mb_x = 0; mb_x < mb_width; mb_x++, block++) {
+                s->idsp.idct_put(ptr, linesize, *block);
+                if (s->bits & 7)
+                    shift_output(s, ptr, linesize);
+                ptr += bytes_per_pixel*8 >> s->avctx->lowres;
+            }
+        }
+    }
+}
+
 int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
-                        const AVFrame *reference)
+                        int mb_bitmask_size, const AVFrame *reference)
 {
     int len, nb_components, i, h, v, predictor, point_transform;
     int index, id, ret;
     const int block_size = s->lossless ? 1 : 8;
     int ilv, prev_shift;
 
+    if (!s->got_picture) {
+        av_log(s->avctx, AV_LOG_WARNING,
+                "Can not process SOS before SOF, skipping\n");
+        return -1;
+    }
+
+    if (reference) {
+        if (reference->width  != s->picture_ptr->width  ||
+            reference->height != s->picture_ptr->height ||
+            reference->format != s->picture_ptr->format) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference mismatching\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     /* XXX: verify len field validity */
     len = get_bits(&s->gb, 16);
     nb_components = get_bits(&s->gb, 8);
@@ -1082,27 +1615,33 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
             && nb_components == 3 && s->nb_components == 3 && i)
             index = 3 - i;
 
-        s->comp_index[i] = index;
-
+        s->quant_sindex[i] = s->quant_index[index];
         s->nb_blocks[i] = s->h_count[index] * s->v_count[index];
         s->h_scount[i]  = s->h_count[index];
         s->v_scount[i]  = s->v_count[index];
 
+        if((nb_components == 1 || nb_components == 3) && s->nb_components == 3 && s->avctx->pix_fmt == AV_PIX_FMT_GBR24P)
+            index = (index+2)%3;
+
+        s->comp_index[i] = index;
+
         s->dc_index[i] = get_bits(&s->gb, 4);
         s->ac_index[i] = get_bits(&s->gb, 4);
 
         if (s->dc_index[i] <  0 || s->ac_index[i] < 0 ||
             s->dc_index[i] >= 4 || s->ac_index[i] >= 4)
             goto out_of_range;
-        if (!s->vlcs[0][s->dc_index[i]].table ||
-            !s->vlcs[1][s->ac_index[i]].table)
+        if (!s->vlcs[0][s->dc_index[i]].table || !(s->progressive ? s->vlcs[2][s->ac_index[0]].table : s->vlcs[1][s->ac_index[i]].table))
             goto out_of_range;
     }
 
     predictor = get_bits(&s->gb, 8);       /* JPEG Ss / lossless JPEG predictor /JPEG-LS NEAR */
     ilv = get_bits(&s->gb, 8);             /* JPEG Se / JPEG-LS ILV */
-    prev_shift      = get_bits(&s->gb, 4); /* Ah */
-    point_transform = get_bits(&s->gb, 4); /* Al */
+    if(s->avctx->codec_tag != AV_RL32("CJPG")){
+        prev_shift      = get_bits(&s->gb, 4); /* Ah */
+        point_transform = get_bits(&s->gb, 4); /* Al */
+    }else
+        prev_shift = point_transform = 0;
 
     if (nb_components > 1) {
         /* interleaved stream */
@@ -1119,27 +1658,33 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
     }
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d ilv:%d bits:%d %s\n",
+        av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d ilv:%d bits:%d skip:%d %s comp:%d\n",
                s->lossless ? "lossless" : "sequential DCT", s->rgb ? "RGB" : "",
-               predictor, point_transform, ilv, s->bits,
-               s->pegasus_rct ? "PRCT" : (s->rct ? "RCT" : ""));
+               predictor, point_transform, ilv, s->bits, s->mjpb_skiptosod,
+               s->pegasus_rct ? "PRCT" : (s->rct ? "RCT" : ""), nb_components);
 
 
     /* mjpeg-b can have padding bytes between sos and image data, skip them */
     for (i = s->mjpb_skiptosod; i > 0; i--)
         skip_bits(&s->gb, 8);
 
-    if (s->lossless && s->rgb && nb_components != 3) {
-        avpriv_request_sample(s->avctx,
-                              "Lossless RGB image without 3 components");
-        return AVERROR_PATCHWELCOME;
-    }
-
 next_field:
     for (i = 0; i < nb_components; i++)
-        s->last_dc[i] = 1024;
+        s->last_dc[i] = (4 << s->bits);
 
-    if (s->lossless) {
+    if (s->avctx->hwaccel) {
+        int bytes_to_start = get_bits_count(&s->gb) / 8;
+        av_assert0(bytes_to_start >= 0 &&
+                   s->raw_scan_buffer_size >= bytes_to_start);
+
+        ret = s->avctx->hwaccel->decode_slice(s->avctx,
+                                              s->raw_scan_buffer      + bytes_to_start,
+                                              s->raw_scan_buffer_size - bytes_to_start);
+        if (ret < 0)
+            return ret;
+
+    } else if (s->lossless) {
+        av_assert0(s->picture_ptr == s->picture);
         if (CONFIG_JPEGLS_DECODER && s->ls) {
 //            for () {
 //            reset_ls_coding_parameters(s, 0);
@@ -1149,8 +1694,7 @@ next_field:
                 return ret;
         } else {
             if (s->rgb) {
-                if ((ret = ljpeg_decode_rgb_scan(s, predictor,
-                                                 point_transform)) < 0)
+                if ((ret = ljpeg_decode_rgb_scan(s, nb_components, predictor, point_transform)) < 0)
                     return ret;
             } else {
                 if ((ret = ljpeg_decode_yuv_scan(s, predictor,
@@ -1161,16 +1705,15 @@ next_field:
         }
     } else {
         if (s->progressive && predictor) {
+            av_assert0(s->picture_ptr == s->picture);
             if ((ret = mjpeg_decode_scan_progressive_ac(s, predictor,
                                                         ilv, prev_shift,
-                                                        point_transform,
-                                                        mb_bitmask,
-                                                        reference)) < 0)
+                                                        point_transform)) < 0)
                 return ret;
         } else {
             if ((ret = mjpeg_decode_scan(s, nb_components,
                                          prev_shift, point_transform,
-                                         mb_bitmask, reference)) < 0)
+                                         mb_bitmask, mb_bitmask_size, reference)) < 0)
                 return ret;
         }
     }
@@ -1181,7 +1724,7 @@ next_field:
         GetBitContext bak = s->gb;
         align_get_bits(&bak);
         if (show_bits(&bak, 16) == 0xFFD1) {
-            ff_dlog(s->avctx, "AVRn interlaced picture marker found\n");
+            av_log(s->avctx, AV_LOG_DEBUG, "AVRn interlaced picture marker found\n");
             s->gb = bak;
             skip_bits(&s->gb, 16);
             s->bottom_field ^= 1;
@@ -1214,22 +1757,22 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
     int len, id, i;
 
     len = get_bits(&s->gb, 16);
-    if (len < 5)
+    if (len < 6)
         return AVERROR_INVALIDDATA;
     if (8 * len > get_bits_left(&s->gb))
         return AVERROR_INVALIDDATA;
 
     id   = get_bits_long(&s->gb, 32);
-    id   = av_be2ne32(id);
     len -= 6;
 
     if (s->avctx->debug & FF_DEBUG_STARTCODE)
-        av_log(s->avctx, AV_LOG_DEBUG, "APPx %8X\n", id);
+        av_log(s->avctx, AV_LOG_DEBUG, "APPx (%s / %8X) len=%d\n",
+               av_fourcc2str(av_bswap32(id)), id, len);
 
     /* Buggy AVID, it puts EOI only at every 10th frame. */
     /* Also, this fourcc is used by non-avid files too, it holds some
        information, but it's always present in AVID-created files. */
-    if (id == AV_RL32("AVI1")) {
+    if (id == AV_RB32("AVI1")) {
         /* structure:
             4bytes      AVI1
             1bytes      polarity
@@ -1237,17 +1780,16 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
             4bytes      field_size
             4bytes      field_size_less_padding
         */
-        s->buggy_avid = 1;
-        i = get_bits(&s->gb, 8);
-        if (i == 2)
-            s->bottom_field = 1;
-        else if (i == 1)
-            s->bottom_field = 0;
+            s->buggy_avid = 1;
+        i = get_bits(&s->gb, 8); len--;
+        av_log(s->avctx, AV_LOG_DEBUG, "polarity %d\n", i);
         goto out;
     }
 
-    if (id == AV_RL32("JFIF")) {
+    if (id == AV_RB32("JFIF")) {
         int t_w, t_h, v1, v2;
+        if (len < 8)
+            goto out;
         skip_bits(&s->gb, 8); /* the trailing zero-byte */
         v1 = get_bits(&s->gb, 8);
         v2 = get_bits(&s->gb, 8);
@@ -1255,7 +1797,11 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
 
         s->avctx->sample_aspect_ratio.num = get_bits(&s->gb, 16);
         s->avctx->sample_aspect_ratio.den = get_bits(&s->gb, 16);
-        ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
+        if (   s->avctx->sample_aspect_ratio.num <= 0
+            || s->avctx->sample_aspect_ratio.den <= 0) {
+            s->avctx->sample_aspect_ratio.num = 0;
+            s->avctx->sample_aspect_ratio.den = 1;
+        }
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO)
             av_log(s->avctx, AV_LOG_INFO,
@@ -1264,29 +1810,38 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
                    s->avctx->sample_aspect_ratio.num,
                    s->avctx->sample_aspect_ratio.den);
 
-        t_w = get_bits(&s->gb, 8);
-        t_h = get_bits(&s->gb, 8);
-        if (t_w && t_h) {
-            /* skip thumbnail */
-            if (len -10 - (t_w * t_h * 3) > 0)
-                len -= t_w * t_h * 3;
+        len -= 8;
+        if (len >= 2) {
+            t_w = get_bits(&s->gb, 8);
+            t_h = get_bits(&s->gb, 8);
+            if (t_w && t_h) {
+                /* skip thumbnail */
+                if (len -10 - (t_w * t_h * 3) > 0)
+                    len -= t_w * t_h * 3;
+            }
+            len -= 2;
         }
-        len -= 10;
         goto out;
     }
 
-    if (id == AV_RL32("Adob") && (get_bits(&s->gb, 8) == 'e')) {
-        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
+    if (   id == AV_RB32("Adob")
+        && len >= 7
+        && show_bits(&s->gb, 8) == 'e'
+        && show_bits_long(&s->gb, 32) != AV_RB32("e_CM")) {
+        skip_bits(&s->gb,  8); /* 'e' */
         skip_bits(&s->gb, 16); /* version */
         skip_bits(&s->gb, 16); /* flags0 */
         skip_bits(&s->gb, 16); /* flags1 */
-        skip_bits(&s->gb,  8); /* transform */
+        s->adobe_transform = get_bits(&s->gb,  8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found, transform=%d\n", s->adobe_transform);
         len -= 7;
         goto out;
     }
 
-    if (id == AV_RL32("LJIF")) {
+    if (id == AV_RB32("LJIF")) {
+        int rgb = s->rgb;
+        int pegasus_rct = s->pegasus_rct;
         if (s->avctx->debug & FF_DEBUG_PICT_INFO)
             av_log(s->avctx, AV_LOG_INFO,
                    "Pegasus lossless jpeg header found\n");
@@ -1294,34 +1849,208 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
         skip_bits(&s->gb, 16); /* unknown always 0? */
         skip_bits(&s->gb, 16); /* unknown always 0? */
         skip_bits(&s->gb, 16); /* unknown always 0? */
-        switch (get_bits(&s->gb, 8)) {
+        switch (i=get_bits(&s->gb, 8)) {
         case 1:
-            s->rgb         = 1;
-            s->pegasus_rct = 0;
+            rgb         = 1;
+            pegasus_rct = 0;
             break;
         case 2:
-            s->rgb         = 1;
-            s->pegasus_rct = 1;
+            rgb         = 1;
+            pegasus_rct = 1;
             break;
         default:
-            av_log(s->avctx, AV_LOG_ERROR, "unknown colorspace\n");
+            av_log(s->avctx, AV_LOG_ERROR, "unknown colorspace %d\n", i);
         }
+
         len -= 9;
+        if (s->got_picture)
+            if (rgb != s->rgb || pegasus_rct != s->pegasus_rct) {
+                av_log(s->avctx, AV_LOG_WARNING, "Mismatching LJIF tag\n");
+                goto out;
+            }
+
+        s->rgb = rgb;
+        s->pegasus_rct = pegasus_rct;
+
+        goto out;
+    }
+    if (id == AV_RL32("colr") && len > 0) {
+        s->colr = get_bits(&s->gb, 8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "COLR %d\n", s->colr);
+        len --;
+        goto out;
+    }
+    if (id == AV_RL32("xfrm") && len > 0) {
+        s->xfrm = get_bits(&s->gb, 8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "XFRM %d\n", s->xfrm);
+        len --;
+        goto out;
+    }
+
+    /* JPS extension by VRex */
+    if (s->start_code == APP3 && id == AV_RB32("_JPS") && len >= 10) {
+        int flags, layout, type;
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "_JPSJPS_\n");
+
+        skip_bits(&s->gb, 32); len -= 4;  /* JPS_ */
+        skip_bits(&s->gb, 16); len -= 2;  /* block length */
+        skip_bits(&s->gb, 8);             /* reserved */
+        flags  = get_bits(&s->gb, 8);
+        layout = get_bits(&s->gb, 8);
+        type   = get_bits(&s->gb, 8);
+        len -= 4;
+
+        av_freep(&s->stereo3d);
+        s->stereo3d = av_stereo3d_alloc();
+        if (!s->stereo3d) {
+            goto out;
+        }
+        if (type == 0) {
+            s->stereo3d->type = AV_STEREO3D_2D;
+        } else if (type == 1) {
+            switch (layout) {
+            case 0x01:
+                s->stereo3d->type = AV_STEREO3D_LINES;
+                break;
+            case 0x02:
+                s->stereo3d->type = AV_STEREO3D_SIDEBYSIDE;
+                break;
+            case 0x03:
+                s->stereo3d->type = AV_STEREO3D_TOPBOTTOM;
+                break;
+            }
+            if (!(flags & 0x04)) {
+                s->stereo3d->flags = AV_STEREO3D_FLAG_INVERT;
+            }
+        }
+        goto out;
+    }
+
+    /* EXIF metadata */
+    if (s->start_code == APP1 && id == AV_RB32("Exif") && len >= 2) {
+        GetByteContext gbytes;
+        int ret, le, ifd_offset, bytes_read;
+        const uint8_t *aligned;
+
+        skip_bits(&s->gb, 16); // skip padding
+        len -= 2;
+
+        // init byte wise reading
+        aligned = align_get_bits(&s->gb);
+        bytestream2_init(&gbytes, aligned, len);
+
+        // read TIFF header
+        ret = ff_tdecode_header(&gbytes, &le, &ifd_offset);
+        if (ret) {
+            av_log(s->avctx, AV_LOG_ERROR, "mjpeg: invalid TIFF header in EXIF data\n");
+        } else {
+            bytestream2_seek(&gbytes, ifd_offset, SEEK_SET);
+
+            // read 0th IFD and store the metadata
+            // (return values > 0 indicate the presence of subimage metadata)
+            ret = ff_exif_decode_ifd(s->avctx, &gbytes, le, 0, &s->exif_metadata);
+            if (ret < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "mjpeg: error decoding EXIF data\n");
+            }
+        }
+
+        bytes_read = bytestream2_tell(&gbytes);
+        skip_bits(&s->gb, bytes_read << 3);
+        len -= bytes_read;
+
         goto out;
     }
 
     /* Apple MJPEG-A */
     if ((s->start_code == APP1) && (len > (0x28 - 8))) {
         id   = get_bits_long(&s->gb, 32);
-        id   = av_be2ne32(id);
         len -= 4;
         /* Apple MJPEG-A */
-        if (id == AV_RL32("mjpg")) {
+        if (id == AV_RB32("mjpg")) {
+            /* structure:
+                4bytes      field size
+                4bytes      pad field size
+                4bytes      next off
+                4bytes      quant off
+                4bytes      huff off
+                4bytes      image off
+                4bytes      scan off
+                4bytes      data off
+            */
             if (s->avctx->debug & FF_DEBUG_PICT_INFO)
                 av_log(s->avctx, AV_LOG_INFO, "mjpeg: Apple MJPEG-A header found\n");
         }
     }
 
+    if (s->start_code == APP2 && id == AV_RB32("ICC_") && len >= 10) {
+        int id2;
+        unsigned seqno;
+        unsigned nummarkers;
+
+        id   = get_bits_long(&s->gb, 32);
+        id2  = get_bits_long(&s->gb, 24);
+        len -= 7;
+        if (id != AV_RB32("PROF") || id2 != AV_RB24("ILE")) {
+            av_log(s->avctx, AV_LOG_WARNING, "Invalid ICC_PROFILE header in APP2\n");
+            goto out;
+        }
+
+        skip_bits(&s->gb, 8);
+        seqno  = get_bits(&s->gb, 8);
+        len   -= 2;
+        if (seqno == 0) {
+            av_log(s->avctx, AV_LOG_WARNING, "Invalid sequence number in APP2\n");
+            goto out;
+        }
+
+        nummarkers  = get_bits(&s->gb, 8);
+        len        -= 1;
+        if (nummarkers == 0) {
+            av_log(s->avctx, AV_LOG_WARNING, "Invalid number of markers coded in APP2\n");
+            goto out;
+        } else if (s->iccnum != 0 && nummarkers != s->iccnum) {
+            av_log(s->avctx, AV_LOG_WARNING, "Mistmatch in coded number of ICC markers between markers\n");
+            goto out;
+        } else if (seqno > nummarkers) {
+            av_log(s->avctx, AV_LOG_WARNING, "Mismatching sequence number and coded number of ICC markers\n");
+            goto out;
+        }
+
+        /* Allocate if this is the first APP2 we've seen. */
+        if (s->iccnum == 0) {
+            s->iccdata     = av_mallocz(nummarkers * sizeof(*(s->iccdata)));
+            s->iccdatalens = av_mallocz(nummarkers * sizeof(*(s->iccdatalens)));
+            if (!s->iccdata || !s->iccdatalens) {
+                av_log(s->avctx, AV_LOG_ERROR, "Could not allocate ICC data arrays\n");
+                return AVERROR(ENOMEM);
+            }
+            s->iccnum = nummarkers;
+        }
+
+        if (s->iccdata[seqno - 1]) {
+            av_log(s->avctx, AV_LOG_WARNING, "Duplicate ICC sequence number\n");
+            goto out;
+        }
+
+        s->iccdatalens[seqno - 1]  = len;
+        s->iccdata[seqno - 1]      = av_malloc(len);
+        if (!s->iccdata[seqno - 1]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Could not allocate ICC data buffer\n");
+            return AVERROR(ENOMEM);
+        }
+
+        memcpy(s->iccdata[seqno - 1], align_get_bits(&s->gb), len);
+        skip_bits(&s->gb, len << 3);
+        len = 0;
+        s->iccread++;
+
+        if (s->iccread > s->iccnum)
+            av_log(s->avctx, AV_LOG_WARNING, "Read more ICC markers than are supposed to be coded\n");
+    }
+
 out:
     /* slow but needed for extreme adobe jpegs */
     if (len < 0)
@@ -1350,16 +2079,20 @@ static int mjpeg_decode_com(MJpegDecodeContext *s)
             cbuf[i] = 0;
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-            av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
+            av_log(s->avctx, AV_LOG_INFO, "comment: '%s'\n", cbuf);
 
         /* buggy avid, it puts EOI only at every 10th frame */
-        if (!strcmp(cbuf, "AVID")) {
-            s->buggy_avid = 1;
+        if (!strncmp(cbuf, "AVID", 4)) {
+            parse_avid(s, cbuf, len);
         } else if (!strcmp(cbuf, "CS=ITU601"))
             s->cs_itu601 = 1;
-        else if ((len > 20 && !strncmp(cbuf, "Intel(R) JPEG Library", 21)) ||
-                 (len > 19 && !strncmp(cbuf, "Metasoft MJPEG Codec", 20)))
+        else if ((!strncmp(cbuf, "Intel(R) JPEG Library, version 1", 32) && s->avctx->codec_tag) ||
+                 (!strncmp(cbuf, "Metasoft MJPEG Codec", 20)))
             s->flipped = 1;
+        else if (!strcmp(cbuf, "MULTISCOPE II")) {
+            s->avctx->sample_aspect_ratio = (AVRational) { 1, 2 };
+            s->multiscope = 2;
+        }
 
         av_free(cbuf);
     }
@@ -1374,22 +2107,19 @@ static int find_marker(const uint8_t **pbuf_ptr, const uint8_t *buf_end)
     const uint8_t *buf_ptr;
     unsigned int v, v2;
     int val;
-#ifdef DEBUG
     int skipped = 0;
-#endif
 
     buf_ptr = *pbuf_ptr;
-    while (buf_ptr < buf_end) {
+    while (buf_end - buf_ptr > 1) {
         v  = *buf_ptr++;
         v2 = *buf_ptr;
         if ((v == 0xff) && (v2 >= 0xc0) && (v2 <= 0xfe) && buf_ptr < buf_end) {
             val = *buf_ptr++;
             goto found;
         }
-#ifdef DEBUG
         skipped++;
-#endif
     }
+    buf_ptr = buf_end;
     val = -1;
 found:
     ff_dlog(NULL, "find_marker skipped %d bytes\n", skipped);
@@ -1412,30 +2142,60 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
     /* unescape buffer of SOS, use special treatment for JPEG-LS */
     if (start_code == SOS && !s->ls) {
         const uint8_t *src = *buf_ptr;
+        const uint8_t *ptr = src;
         uint8_t *dst = s->buffer;
 
-        while (src < buf_end) {
-            uint8_t x = *(src++);
+        #define copy_data_segment(skip) do {       \
+            ptrdiff_t length = (ptr - src) - (skip);  \
+            if (length > 0) {                         \
+                memcpy(dst, src, length);             \
+                dst += length;                        \
+                src = ptr;                            \
+            }                                         \
+        } while (0)
+
+        if (s->avctx->codec_id == AV_CODEC_ID_THP) {
+            ptr = buf_end;
+            copy_data_segment(0);
+        } else {
+            while (ptr < buf_end) {
+                uint8_t x = *(ptr++);
 
-            *(dst++) = x;
-            if (s->avctx->codec_id != AV_CODEC_ID_THP) {
                 if (x == 0xff) {
-                    while (src < buf_end && x == 0xff)
-                        x = *(src++);
+                    ptrdiff_t skip = 0;
+                    while (ptr < buf_end && x == 0xff) {
+                        x = *(ptr++);
+                        skip++;
+                    }
 
-                    if (x >= 0xd0 && x <= 0xd7)
-                        *(dst++) = x;
-                    else if (x)
-                        break;
+                    /* 0xFF, 0xFF, ... */
+                    if (skip > 1) {
+                        copy_data_segment(skip);
+
+                        /* decrement src as it is equal to ptr after the
+                         * copy_data_segment macro and we might want to
+                         * copy the current value of x later on */
+                        src--;
+                    }
+
+                    if (x < 0xd0 || x > 0xd7) {
+                        copy_data_segment(1);
+                        if (x)
+                            break;
+                    }
                 }
             }
+            if (src < ptr)
+                copy_data_segment(0);
         }
+        #undef copy_data_segment
+
         *unescaped_buf_ptr  = s->buffer;
         *unescaped_buf_size = dst - s->buffer;
         memset(s->buffer + *unescaped_buf_size, 0,
                AV_INPUT_BUFFER_PADDING_SIZE);
 
-        av_log(s->avctx, AV_LOG_DEBUG, "escaping removed %td bytes\n",
+        av_log(s->avctx, AV_LOG_DEBUG, "escaping removed %"PTRDIFF_SPECIFIER" bytes\n",
                (buf_end - *buf_ptr) - (dst - s->buffer));
     } else if (start_code == SOS && s->ls) {
         const uint8_t *src = *buf_ptr;
@@ -1444,8 +2204,6 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
         int t = 0, b = 0;
         PutBitContext pb;
 
-        s->cur_scan++;
-
         /* find marker */
         while (src + t < buf_end) {
             uint8_t x = src[t++];
@@ -1453,7 +2211,7 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
                 while ((src + t < buf_end) && x == 0xff)
                     x = src[t++];
                 if (x & 0x80) {
-                    t -= 2;
+                    t -= FFMIN(2, t);
                     break;
                 }
             }
@@ -1465,8 +2223,12 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
         while (b < t) {
             uint8_t x = src[b++];
             put_bits(&pb, 8, x);
-            if (x == 0xFF) {
+            if (x == 0xFF && b < t) {
                 x = src[b++];
+                if (x & 0x80) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid escape sequence\n");
+                    x &= 0x7f;
+                }
                 put_bits(&pb, 7, x);
                 bit_count--;
             }
@@ -1485,6 +2247,20 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
     return start_code;
 }
 
+static void reset_icc_profile(MJpegDecodeContext *s)
+{
+    int i;
+
+    if (s->iccdata)
+        for (i = 0; i < s->iccnum; i++)
+            av_freep(&s->iccdata[i]);
+    av_freep(&s->iccdata);
+    av_freep(&s->iccdatalens);
+
+    s->iccread = 0;
+    s->iccnum  = 0;
+}
+
 int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                           AVPacket *avpkt)
 {
@@ -1494,11 +2270,22 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     MJpegDecodeContext *s = avctx->priv_data;
     const uint8_t *buf_end, *buf_ptr;
     const uint8_t *unescaped_buf_ptr;
+    int hshift, vshift;
     int unescaped_buf_size;
     int start_code;
+    int i, index;
     int ret = 0;
+    int is16bit;
+
+    s->buf_size = buf_size;
+
+    av_dict_free(&s->exif_metadata);
+    av_freep(&s->stereo3d);
+    s->adobe_transform = -1;
+
+    if (s->iccnum != 0)
+        reset_icc_profile(s);
 
-    s->got_picture = 0; // picture from previous image can not be reused
     buf_ptr = buf;
     buf_end = buf + buf_size;
     while (buf_ptr < buf_end) {
@@ -1508,21 +2295,22 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                                           &unescaped_buf_size);
         /* EOF */
         if (start_code < 0) {
-            goto the_end;
+            break;
         } else if (unescaped_buf_size > INT_MAX / 8) {
             av_log(avctx, AV_LOG_ERROR,
                    "MJPEG packet 0x%x too big (%d/%d), corrupt data?\n",
                    start_code, unescaped_buf_size, buf_size);
             return AVERROR_INVALIDDATA;
         }
-
-        av_log(avctx, AV_LOG_DEBUG, "marker=%x avail_size_in_buf=%td\n",
+        av_log(avctx, AV_LOG_DEBUG, "marker=%x avail_size_in_buf=%"PTRDIFF_SPECIFIER"\n",
                start_code, buf_end - buf_ptr);
 
-        ret = init_get_bits(&s->gb, unescaped_buf_ptr,
-                            unescaped_buf_size * 8);
-        if (ret < 0)
-            return ret;
+        ret = init_get_bits8(&s->gb, unescaped_buf_ptr, unescaped_buf_size);
+
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "invalid buffer\n");
+            goto fail;
+        }
 
         s->start_code = start_code;
         if (s->avctx->debug & FF_DEBUG_STARTCODE)
@@ -1535,75 +2323,106 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             /* APP fields */
         } else if (start_code >= APP0 && start_code <= APP15) {
             if ((ret = mjpeg_decode_app(s)) < 0)
-                return ret;
+                av_log(avctx, AV_LOG_ERROR, "unable to decode APP fields: %s\n",
+                       av_err2str(ret));
             /* Comment */
         } else if (start_code == COM) {
             ret = mjpeg_decode_com(s);
             if (ret < 0)
                 return ret;
+        } else if (start_code == DQT) {
+            ret = ff_mjpeg_decode_dqt(s);
+            if (ret < 0)
+                return ret;
         }
 
+        ret = -1;
+
         if (!CONFIG_JPEGLS_DECODER &&
             (start_code == SOF48 || start_code == LSE)) {
             av_log(avctx, AV_LOG_ERROR, "JPEG-LS support not enabled.\n");
             return AVERROR(ENOSYS);
         }
 
+        if (avctx->skip_frame == AVDISCARD_ALL) {
+            switch(start_code) {
+            case SOF0:
+            case SOF1:
+            case SOF2:
+            case SOF3:
+            case SOF48:
+            case SOI:
+            case SOS:
+            case EOI:
+                break;
+            default:
+                goto skip;
+            }
+        }
+
         switch (start_code) {
         case SOI:
             s->restart_interval = 0;
             s->restart_count    = 0;
+            s->raw_image_buffer      = buf_ptr;
+            s->raw_image_buffer_size = buf_end - buf_ptr;
             /* nothing to do on SOI */
             break;
-        case DQT:
-            if ((ret = ff_mjpeg_decode_dqt(s)) < 0)
-                return ret;
-            break;
         case DHT:
             if ((ret = ff_mjpeg_decode_dht(s)) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "huffman table decode error\n");
-                return ret;
+                goto fail;
             }
             break;
         case SOF0:
         case SOF1:
+            if (start_code == SOF0)
+                s->avctx->profile = FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT;
+            else
+                s->avctx->profile = FF_PROFILE_MJPEG_HUFFMAN_EXTENDED_SEQUENTIAL_DCT;
             s->lossless    = 0;
             s->ls          = 0;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF2:
+            s->avctx->profile = FF_PROFILE_MJPEG_HUFFMAN_PROGRESSIVE_DCT;
             s->lossless    = 0;
             s->ls          = 0;
             s->progressive = 1;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF3:
+            s->avctx->profile     = FF_PROFILE_MJPEG_HUFFMAN_LOSSLESS;
+            s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             s->lossless    = 1;
             s->ls          = 0;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF48:
+            s->avctx->profile     = FF_PROFILE_MJPEG_JPEG_LS;
+            s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             s->lossless    = 1;
             s->ls          = 1;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case LSE:
             if (!CONFIG_JPEGLS_DECODER ||
                 (ret = ff_jpegls_decode_lse(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case EOI:
-            s->cur_scan = 0;
-            if ((s->buggy_avid && !s->interlaced) || s->restart_interval)
-                break;
 eoi_parser:
+            if (!avctx->hwaccel && avctx->skip_frame != AVDISCARD_ALL &&
+                s->progressive && s->cur_scan && s->got_picture)
+                mjpeg_idct_scan_progressive_ac(s);
+            s->cur_scan = 0;
             if (!s->got_picture) {
                 av_log(avctx, AV_LOG_WARNING,
                        "Found EOI before any SOF, ignoring\n");
@@ -1613,43 +2432,53 @@ eoi_parser:
                 s->bottom_field ^= 1;
                 /* if not bottom field, do not output image yet */
                 if (s->bottom_field == !s->interlace_polarity)
-                    goto not_the_end;
+                    break;
+            }
+            if (avctx->skip_frame == AVDISCARD_ALL) {
+                s->got_picture = 0;
+                goto the_end_no_picture;
+            }
+            if (s->avctx->hwaccel) {
+                ret = s->avctx->hwaccel->end_frame(s->avctx);
+                if (ret < 0)
+                    return ret;
+
+                av_freep(&s->hwaccel_picture_private);
             }
             if ((ret = av_frame_ref(frame, s->picture_ptr)) < 0)
                 return ret;
-            if (s->flipped) {
-                int i;
-                for (i = 0; frame->data[i]; i++) {
-                    int h = frame->height >> ((i == 1 || i == 2) ?
-                                              s->pix_desc->log2_chroma_h : 0);
-                    frame->data[i] += frame->linesize[i] * (h - 1);
-                    frame->linesize[i] *= -1;
-                }
-            }
             *got_frame = 1;
+            s->got_picture = 0;
+
+            if (!s->lossless) {
+                int qp = FFMAX3(s->qscale[0],
+                                s->qscale[1],
+                                s->qscale[2]);
+                int qpw = (s->width + 15) / 16;
+                AVBufferRef *qp_table_buf = av_buffer_alloc(qpw);
+                if (qp_table_buf) {
+                    memset(qp_table_buf->data, qp, qpw);
+                    av_frame_set_qp_table(data, qp_table_buf, 0, FF_QSCALE_TYPE_MPEG1);
+                }
 
-            if (!s->lossless &&
-                avctx->debug & FF_DEBUG_QP) {
-                av_log(avctx, AV_LOG_DEBUG,
-                       "QP: %d\n", FFMAX3(s->qscale[0],
-                                          s->qscale[1],
-                                          s->qscale[2]));
+                if(avctx->debug & FF_DEBUG_QP)
+                    av_log(avctx, AV_LOG_DEBUG, "QP: %d\n", qp);
             }
 
             goto the_end;
         case SOS:
-            if (!s->got_picture) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Can not process SOS before SOF, skipping\n");
+            s->raw_scan_buffer      = buf_ptr;
+            s->raw_scan_buffer_size = buf_end - buf_ptr;
+
+            s->cur_scan++;
+            if (avctx->skip_frame == AVDISCARD_ALL) {
+                skip_bits(&s->gb, get_bits_left(&s->gb));
                 break;
-                }
-            if ((ret = ff_mjpeg_decode_sos(s, NULL, NULL)) < 0 &&
+            }
+
+            if ((ret = ff_mjpeg_decode_sos(s, NULL, 0, NULL)) < 0 &&
                 (avctx->err_recognition & AV_EF_EXPLODE))
-                return ret;
-            /* buggy avid puts EOI every 10-20th frame */
-            /* if restart period is over process EOI */
-            if ((s->buggy_avid && !s->interlaced) || s->restart_interval)
-                goto eoi_parser;
+                goto fail;
             break;
         case DRI:
             if ((ret = mjpeg_decode_dri(s)) < 0)
@@ -1670,21 +2499,242 @@ eoi_parser:
             break;
         }
 
-not_the_end:
+skip:
         /* eof process start code */
         buf_ptr += (get_bits_count(&s->gb) + 7) / 8;
         av_log(avctx, AV_LOG_DEBUG,
                "marker parser used %d bytes (%d bits)\n",
                (get_bits_count(&s->gb) + 7) / 8, get_bits_count(&s->gb));
     }
-    if (s->got_picture) {
+    if (s->got_picture && s->cur_scan) {
         av_log(avctx, AV_LOG_WARNING, "EOI missing, emulating\n");
         goto eoi_parser;
     }
     av_log(avctx, AV_LOG_FATAL, "No JPEG data found in image\n");
     return AVERROR_INVALIDDATA;
+fail:
+    s->got_picture = 0;
+    return ret;
 the_end:
-    av_log(avctx, AV_LOG_DEBUG, "mjpeg decode frame unused %td bytes\n",
+
+    is16bit = av_pix_fmt_desc_get(s->avctx->pix_fmt)->comp[0].step > 1;
+
+    if (AV_RB32(s->upscale_h)) {
+        int p;
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ440P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV440P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRP     ||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRAP
+                  );
+        ret = av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        if (ret)
+            return ret;
+
+        av_assert0(s->nb_components == av_pix_fmt_count_planes(s->picture_ptr->format));
+        for (p = 0; p<s->nb_components; p++) {
+            uint8_t *line = s->picture_ptr->data[p];
+            int w = s->width;
+            int h = s->height;
+            if (!s->upscale_h[p])
+                continue;
+            if (p==1 || p==2) {
+                w = AV_CEIL_RSHIFT(w, hshift);
+                h = AV_CEIL_RSHIFT(h, vshift);
+            }
+            if (s->upscale_v[p] == 1)
+                h = (h+1)>>1;
+            av_assert0(w > 0);
+            for (i = 0; i < h; i++) {
+                if (s->upscale_h[p] == 1) {
+                    if (is16bit) ((uint16_t*)line)[w - 1] = ((uint16_t*)line)[(w - 1) / 2];
+                    else                      line[w - 1] = line[(w - 1) / 2];
+                    for (index = w - 2; index > 0; index--) {
+                        if (is16bit)
+                            ((uint16_t*)line)[index] = (((uint16_t*)line)[index / 2] + ((uint16_t*)line)[(index + 1) / 2]) >> 1;
+                        else
+                            line[index] = (line[index / 2] + line[(index + 1) / 2]) >> 1;
+                    }
+                } else if (s->upscale_h[p] == 2) {
+                    if (is16bit) {
+                        ((uint16_t*)line)[w - 1] = ((uint16_t*)line)[(w - 1) / 3];
+                        if (w > 1)
+                            ((uint16_t*)line)[w - 2] = ((uint16_t*)line)[w - 1];
+                    } else {
+                        line[w - 1] = line[(w - 1) / 3];
+                        if (w > 1)
+                            line[w - 2] = line[w - 1];
+                    }
+                    for (index = w - 3; index > 0; index--) {
+                        line[index] = (line[index / 3] + line[(index + 1) / 3] + line[(index + 2) / 3] + 1) / 3;
+                    }
+                }
+                line += s->linesize[p];
+            }
+        }
+    }
+    if (AV_RB32(s->upscale_v)) {
+        int p;
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ422P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV422P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV440P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ440P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRP     ||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRAP
+                   );
+        ret = av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        if (ret)
+            return ret;
+
+        av_assert0(s->nb_components == av_pix_fmt_count_planes(s->picture_ptr->format));
+        for (p = 0; p < s->nb_components; p++) {
+            uint8_t *dst;
+            int w = s->width;
+            int h = s->height;
+            if (!s->upscale_v[p])
+                continue;
+            if (p==1 || p==2) {
+                w = AV_CEIL_RSHIFT(w, hshift);
+                h = AV_CEIL_RSHIFT(h, vshift);
+            }
+            dst = &((uint8_t *)s->picture_ptr->data[p])[(h - 1) * s->linesize[p]];
+            for (i = h - 1; i; i--) {
+                uint8_t *src1 = &((uint8_t *)s->picture_ptr->data[p])[i * s->upscale_v[p] / (s->upscale_v[p] + 1) * s->linesize[p]];
+                uint8_t *src2 = &((uint8_t *)s->picture_ptr->data[p])[(i + 1) * s->upscale_v[p] / (s->upscale_v[p] + 1) * s->linesize[p]];
+                if (s->upscale_v[p] != 2 && (src1 == src2 || i == h - 1)) {
+                    memcpy(dst, src1, w);
+                } else {
+                    for (index = 0; index < w; index++)
+                        dst[index] = (src1[index] + src2[index]) >> 1;
+                }
+                dst -= s->linesize[p];
+            }
+        }
+    }
+    if (s->flipped && !s->rgb) {
+        int j;
+        ret = av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        if (ret)
+            return ret;
+
+        av_assert0(s->nb_components == av_pix_fmt_count_planes(s->picture_ptr->format));
+        for (index=0; index<s->nb_components; index++) {
+            uint8_t *dst = s->picture_ptr->data[index];
+            int w = s->picture_ptr->width;
+            int h = s->picture_ptr->height;
+            if(index && index<3){
+                w = AV_CEIL_RSHIFT(w, hshift);
+                h = AV_CEIL_RSHIFT(h, vshift);
+            }
+            if(dst){
+                uint8_t *dst2 = dst + s->picture_ptr->linesize[index]*(h-1);
+                for (i=0; i<h/2; i++) {
+                    for (j=0; j<w; j++)
+                        FFSWAP(int, dst[j], dst2[j]);
+                    dst  += s->picture_ptr->linesize[index];
+                    dst2 -= s->picture_ptr->linesize[index];
+                }
+            }
+        }
+    }
+    if (s->adobe_transform == 0 && s->avctx->pix_fmt == AV_PIX_FMT_GBRAP) {
+        int w = s->picture_ptr->width;
+        int h = s->picture_ptr->height;
+        av_assert0(s->nb_components == 4);
+        for (i=0; i<h; i++) {
+            int j;
+            uint8_t *dst[4];
+            for (index=0; index<4; index++) {
+                dst[index] =   s->picture_ptr->data[index]
+                             + s->picture_ptr->linesize[index]*i;
+            }
+            for (j=0; j<w; j++) {
+                int k = dst[3][j];
+                int r = dst[0][j] * k;
+                int g = dst[1][j] * k;
+                int b = dst[2][j] * k;
+                dst[0][j] = g*257 >> 16;
+                dst[1][j] = b*257 >> 16;
+                dst[2][j] = r*257 >> 16;
+                dst[3][j] = 255;
+            }
+        }
+    }
+    if (s->adobe_transform == 2 && s->avctx->pix_fmt == AV_PIX_FMT_YUVA444P) {
+        int w = s->picture_ptr->width;
+        int h = s->picture_ptr->height;
+        av_assert0(s->nb_components == 4);
+        for (i=0; i<h; i++) {
+            int j;
+            uint8_t *dst[4];
+            for (index=0; index<4; index++) {
+                dst[index] =   s->picture_ptr->data[index]
+                             + s->picture_ptr->linesize[index]*i;
+            }
+            for (j=0; j<w; j++) {
+                int k = dst[3][j];
+                int r = (255 - dst[0][j]) * k;
+                int g = (128 - dst[1][j]) * k;
+                int b = (128 - dst[2][j]) * k;
+                dst[0][j] = r*257 >> 16;
+                dst[1][j] = (g*257 >> 16) + 128;
+                dst[2][j] = (b*257 >> 16) + 128;
+                dst[3][j] = 255;
+            }
+        }
+    }
+
+    if (s->stereo3d) {
+        AVStereo3D *stereo = av_stereo3d_create_side_data(data);
+        if (stereo) {
+            stereo->type  = s->stereo3d->type;
+            stereo->flags = s->stereo3d->flags;
+        }
+        av_freep(&s->stereo3d);
+    }
+
+    if (s->iccnum != 0 && s->iccnum == s->iccread) {
+        AVFrameSideData *sd;
+        size_t offset = 0;
+        int total_size = 0;
+        int i;
+
+        /* Sum size of all parts. */
+        for (i = 0; i < s->iccnum; i++)
+            total_size += s->iccdatalens[i];
+
+        sd = av_frame_new_side_data(data, AV_FRAME_DATA_ICC_PROFILE, total_size);
+        if (!sd) {
+            av_log(s->avctx, AV_LOG_ERROR, "Could not allocate frame side data\n");
+            return AVERROR(ENOMEM);
+        }
+
+        /* Reassemble the parts, which are now in-order. */
+        for (i = 0; i < s->iccnum; i++) {
+            memcpy(sd->data + offset, s->iccdata[i], s->iccdatalens[i]);
+            offset += s->iccdatalens[i];
+        }
+    }
+
+    av_dict_copy(&((AVFrame *) data)->metadata, s->exif_metadata, 0);
+    av_dict_free(&s->exif_metadata);
+
+the_end_no_picture:
+    av_log(avctx, AV_LOG_DEBUG, "decode frame unused %"PTRDIFF_SPECIFIER" bytes\n",
            buf_end - buf_ptr);
 //  return buf_end - buf_ptr;
     return buf_ptr - buf;
@@ -1695,13 +2745,18 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
     MJpegDecodeContext *s = avctx->priv_data;
     int i, j;
 
+    if (s->interlaced && s->bottom_field == !s->interlace_polarity && s->got_picture && !avctx->frame_number) {
+        av_log(avctx, AV_LOG_INFO, "Single field\n");
+    }
+
     if (s->picture) {
         av_frame_free(&s->picture);
         s->picture_ptr = NULL;
     } else if (s->picture_ptr)
         av_frame_unref(s->picture_ptr);
 
-    av_free(s->buffer);
+    av_freep(&s->buffer);
+    av_freep(&s->stereo3d);
     av_freep(&s->ljpeg_buffer);
     s->ljpeg_buffer_size = 0;
 
@@ -1713,14 +2768,27 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
         av_freep(&s->blocks[i]);
         av_freep(&s->last_nnz[i]);
     }
+    av_dict_free(&s->exif_metadata);
+
+    reset_icc_profile(s);
+
+    av_freep(&s->hwaccel_picture_private);
+
     return 0;
 }
 
+static void decode_flush(AVCodecContext *avctx)
+{
+    MJpegDecodeContext *s = avctx->priv_data;
+    s->got_picture = 0;
+}
+
+#if CONFIG_MJPEG_DECODER
 #define OFFSET(x) offsetof(MJpegDecodeContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption options[] = {
     { "extern_huff", "Use external huffman table.",
-      OFFSET(extern_huff), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VD },
+      OFFSET(extern_huff), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VD },
     { NULL },
 };
 
@@ -1740,11 +2808,25 @@ AVCodec ff_mjpeg_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = ff_mjpeg_decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .priv_class     = &mjpegdec_class,
-    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_mjpeg_profiles),
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .hw_configs     = (const AVCodecHWConfigInternal*[]) {
+#if CONFIG_MJPEG_NVDEC_HWACCEL
+                        HWACCEL_NVDEC(mjpeg),
+#endif
+#if CONFIG_MJPEG_VAAPI_HWACCEL
+                        HWACCEL_VAAPI(mjpeg),
+#endif
+                        NULL
+                    },
 };
-
+#endif
+#if CONFIG_THP_DECODER
 AVCodec ff_thp_decoder = {
     .name           = "thp",
     .long_name      = NULL_IF_CONFIG_SMALL("Nintendo Gamecube THP video"),
@@ -1754,6 +2836,9 @@ AVCodec ff_thp_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = ff_mjpeg_decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index b80a47b..653fe7c 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Alex Beregszaszi
  * Copyright (c) 2003-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 
 #include "libavutil/log.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
 #include "blockdsp.h"
@@ -38,18 +39,21 @@
 #include "hpeldsp.h"
 #include "idctdsp.h"
 
+#undef near /* This file uses struct member 'near' which in windows.h is defined as empty. */
+
 #define MAX_COMPONENTS 4
 
 typedef struct MJpegDecodeContext {
     AVClass *class;
     AVCodecContext *avctx;
     GetBitContext gb;
+    int buf_size;
 
     int start_code; /* current start code */
     int buffer_size;
     uint8_t *buffer;
 
-    int16_t quant_matrixes[4][64];
+    uint16_t quant_matrixes[4][64];
     VLC vlcs[3][4];
     int qscale[4];      ///< quantizer scale calculated from quant_matrixes
 
@@ -61,9 +65,14 @@ typedef struct MJpegDecodeContext {
     int ls;
     int progressive;
     int rgb;
+    uint8_t upscale_h[4];
+    uint8_t upscale_v[4];
     int rct;            /* standard rct */
     int pegasus_rct;    /* pegasus reversible colorspace transform */
     int bits;           /* bits per component */
+    int colr;
+    int xfrm;
+    int adobe_transform;
 
     int maxval;
     int near;         ///< near lossless bound (si 0 for lossless)
@@ -83,6 +92,7 @@ typedef struct MJpegDecodeContext {
     int nb_blocks[MAX_COMPONENTS];
     int h_scount[MAX_COMPONENTS];
     int v_scount[MAX_COMPONENTS];
+    int quant_sindex[MAX_COMPONENTS];
     int h_max, v_max; /* maximum h and v counts */
     int quant_index[4];   /* quant table index for each component */
     int last_dc[MAX_COMPONENTS]; /* last DEQUANTIZED dc (XXX: am I right to do that ?) */
@@ -91,10 +101,11 @@ typedef struct MJpegDecodeContext {
     int got_picture;                                ///< we found a SOF and picture is valid, too.
     int linesize[MAX_COMPONENTS];                   ///< linesize << interlaced
     int8_t *qscale_table;
-    DECLARE_ALIGNED(16, int16_t, block)[64];
+    DECLARE_ALIGNED(32, int16_t, block)[64];
     int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
     uint8_t *last_nnz[MAX_COMPONENTS];
     uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
+    int palette_index;
     ScanTable scantable;
     BlockDSPContext bdsp;
     HpelDSPContext hdsp;
@@ -106,6 +117,7 @@ typedef struct MJpegDecodeContext {
     int buggy_avid;
     int cs_itu601;
     int interlace_polarity;
+    int multiscope;
 
     int mjpb_skiptosod;
 
@@ -116,8 +128,29 @@ typedef struct MJpegDecodeContext {
     unsigned int ljpeg_buffer_size;
 
     int extern_huff;
+    AVDictionary *exif_metadata;
+
+    AVStereo3D *stereo3d; ///!< stereoscopic information (cached, since it is read before frame allocation)
 
     const AVPixFmtDescriptor *pix_desc;
+
+    uint8_t **iccdata;
+    int *iccdatalens;
+    int iccnum;
+    int iccread;
+
+    // Raw stream data for hwaccel use.
+    const uint8_t *raw_image_buffer;
+    size_t         raw_image_buffer_size;
+    const uint8_t *raw_scan_buffer;
+    size_t         raw_scan_buffer_size;
+
+    uint8_t raw_huffman_lengths[2][4][16];
+    uint8_t raw_huffman_values[2][4][256];
+
+    enum AVPixelFormat hwaccel_sw_pix_fmt;
+    enum AVPixelFormat hwaccel_pix_fmt;
+    void *hwaccel_picture_private;
 } MJpegDecodeContext;
 
 int ff_mjpeg_decode_init(AVCodecContext *avctx);
@@ -129,7 +162,8 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s);
 int ff_mjpeg_decode_dht(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sos(MJpegDecodeContext *s,
-                        const uint8_t *mb_bitmask, const AVFrame *reference);
+                        const uint8_t *mb_bitmask,int mb_bitmask_size,
+                        const AVFrame *reference);
 int ff_mjpeg_find_marker(MJpegDecodeContext *s,
                          const uint8_t **buf_ptr, const uint8_t *buf_end,
                          const uint8_t **unescaped_buf_ptr, int *unescaped_buf_size);
diff --git a/libavcodec/mjpegenc.c b/libavcodec/mjpegenc.c
index 8291113..0ea7bd3 100644
--- a/libavcodec/mjpegenc.c
+++ b/libavcodec/mjpegenc.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,19 +38,57 @@
 #include "mpegvideo.h"
 #include "mjpeg.h"
 #include "mjpegenc.h"
+#include "profiles.h"
+
+static int alloc_huffman(MpegEncContext *s)
+{
+    MJpegContext *m = s->mjpeg_ctx;
+    size_t num_mbs, num_blocks, num_codes;
+    int blocks_per_mb;
+
+    // We need to init this here as the mjpeg init is called before the common init,
+    s->mb_width  = (s->width  + 15) / 16;
+    s->mb_height = (s->height + 15) / 16;
+
+    switch (s->chroma_format) {
+    case CHROMA_420: blocks_per_mb =  6; break;
+    case CHROMA_422: blocks_per_mb =  8; break;
+    case CHROMA_444: blocks_per_mb = 12; break;
+    default: av_assert0(0);
+    };
+
+    // Make sure we have enough space to hold this frame.
+    num_mbs = s->mb_width * s->mb_height;
+    num_blocks = num_mbs * blocks_per_mb;
+    num_codes = num_blocks * 64;
+
+    m->huff_buffer = av_malloc_array(num_codes, sizeof(MJpegHuffmanCode));
+    if (!m->huff_buffer)
+        return AVERROR(ENOMEM);
+    return 0;
+}
 
 av_cold int ff_mjpeg_encode_init(MpegEncContext *s)
 {
     MJpegContext *m;
 
-    m = av_malloc(sizeof(MJpegContext));
+    av_assert0(s->slice_context_count == 1);
+
+    if (s->width > 65500 || s->height > 65500) {
+        av_log(s, AV_LOG_ERROR, "JPEG does not support resolutions above 65500x65500\n");
+        return AVERROR(EINVAL);
+    }
+
+    m = av_mallocz(sizeof(MJpegContext));
     if (!m)
         return AVERROR(ENOMEM);
 
     s->min_qcoeff=-1023;
     s->max_qcoeff= 1023;
 
-    /* build all the huffman tables */
+    // Build default Huffman tables.
+    // These may be overwritten later with more optimal Huffman tables, but
+    // they are needed at least right now for some processes like trellis.
     ff_mjpeg_build_huffman_codes(m->huff_size_dc_luminance,
                                  m->huff_code_dc_luminance,
                                  avpriv_mjpeg_bits_dc_luminance,
@@ -68,13 +106,120 @@ av_cold int ff_mjpeg_encode_init(MpegEncContext *s)
                                  avpriv_mjpeg_bits_ac_chrominance,
                                  avpriv_mjpeg_val_ac_chrominance);
 
+    ff_init_uni_ac_vlc(m->huff_size_ac_luminance,   m->uni_ac_vlc_len);
+    ff_init_uni_ac_vlc(m->huff_size_ac_chrominance, m->uni_chroma_ac_vlc_len);
+    s->intra_ac_vlc_length      =
+    s->intra_ac_vlc_last_length = m->uni_ac_vlc_len;
+    s->intra_chroma_ac_vlc_length      =
+    s->intra_chroma_ac_vlc_last_length = m->uni_chroma_ac_vlc_len;
+
+    // Buffers start out empty.
+    m->huff_ncode = 0;
     s->mjpeg_ctx = m;
+
+    if(s->huffman == HUFFMAN_TABLE_OPTIMAL)
+        return alloc_huffman(s);
+
     return 0;
 }
 
-void ff_mjpeg_encode_close(MpegEncContext *s)
+av_cold void ff_mjpeg_encode_close(MpegEncContext *s)
+{
+    av_freep(&s->mjpeg_ctx->huff_buffer);
+    av_freep(&s->mjpeg_ctx);
+}
+
+/**
+ * Add code and table_id to the JPEG buffer.
+ *
+ * @param s The MJpegContext which contains the JPEG buffer.
+ * @param table_id Which Huffman table the code belongs to.
+ * @param code The encoded exponent of the coefficients and the run-bits.
+ */
+static inline void ff_mjpeg_encode_code(MJpegContext *s, uint8_t table_id, int code)
+{
+    MJpegHuffmanCode *c = &s->huff_buffer[s->huff_ncode++];
+    c->table_id = table_id;
+    c->code = code;
+}
+
+/**
+ * Add the coefficient's data to the JPEG buffer.
+ *
+ * @param s The MJpegContext which contains the JPEG buffer.
+ * @param table_id Which Huffman table the code belongs to.
+ * @param val The coefficient.
+ * @param run The run-bits.
+ */
+static void ff_mjpeg_encode_coef(MJpegContext *s, uint8_t table_id, int val, int run)
+{
+    int mant, code;
+
+    if (val == 0) {
+        av_assert0(run == 0);
+        ff_mjpeg_encode_code(s, table_id, 0);
+    } else {
+        mant = val;
+        if (val < 0) {
+            val = -val;
+            mant--;
+        }
+
+        code = (run << 4) | (av_log2_16bit(val) + 1);
+
+        s->huff_buffer[s->huff_ncode].mant = mant;
+        ff_mjpeg_encode_code(s, table_id, code);
+    }
+}
+
+/**
+ * Add the block's data into the JPEG buffer.
+ *
+ * @param s The MJpegEncContext that contains the JPEG buffer.
+ * @param block The block.
+ * @param n The block's index or number.
+ */
+static void record_block(MpegEncContext *s, int16_t *block, int n)
 {
-    av_free(s->mjpeg_ctx);
+    int i, j, table_id;
+    int component, dc, last_index, val, run;
+    MJpegContext *m = s->mjpeg_ctx;
+
+    /* DC coef */
+    component = (n <= 3 ? 0 : (n&1) + 1);
+    table_id = (n <= 3 ? 0 : 1);
+    dc = block[0]; /* overflow is impossible */
+    val = dc - s->last_dc[component];
+
+    ff_mjpeg_encode_coef(m, table_id, val, 0);
+
+    s->last_dc[component] = dc;
+
+    /* AC coefs */
+
+    run = 0;
+    last_index = s->block_last_index[n];
+    table_id |= 2;
+
+    for(i=1;i<=last_index;i++) {
+        j = s->intra_scantable.permutated[i];
+        val = block[j];
+
+        if (val == 0) {
+            run++;
+        } else {
+            while (run >= 16) {
+                ff_mjpeg_encode_code(m, table_id, 0xf0);
+                run -= 16;
+            }
+            ff_mjpeg_encode_coef(m, table_id, val, run);
+            run = 0;
+        }
+    }
+
+    /* output EOB only if not already 64 values */
+    if (last_index < 63 || run != 0)
+        ff_mjpeg_encode_code(m, table_id, 0);
 }
 
 static void encode_block(MpegEncContext *s, int16_t *block, int n)
@@ -120,7 +265,7 @@ static void encode_block(MpegEncContext *s, int16_t *block, int n)
                 mant--;
             }
 
-            nbits= av_log2(val) + 1;
+            nbits= av_log2_16bit(val) + 1;
             code = (run << 4) | nbits;
 
             put_bits(&s->pb, huff_size_ac[code], huff_code_ac[code]);
@@ -135,36 +280,126 @@ static void encode_block(MpegEncContext *s, int16_t *block, int n)
         put_bits(&s->pb, huff_size_ac[0], huff_code_ac[0]);
 }
 
-void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[8][64])
+void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[12][64])
 {
     int i;
-    for(i=0;i<5;i++) {
-        encode_block(s, block[i], i);
-    }
-    if (s->chroma_format == CHROMA_420) {
-        encode_block(s, block[5], 5);
+    if (s->huffman == HUFFMAN_TABLE_OPTIMAL) {
+        if (s->chroma_format == CHROMA_444) {
+            record_block(s, block[0], 0);
+            record_block(s, block[2], 2);
+            record_block(s, block[4], 4);
+            record_block(s, block[8], 8);
+            record_block(s, block[5], 5);
+            record_block(s, block[9], 9);
+
+            if (16*s->mb_x+8 < s->width) {
+                record_block(s, block[1], 1);
+                record_block(s, block[3], 3);
+                record_block(s, block[6], 6);
+                record_block(s, block[10], 10);
+                record_block(s, block[7], 7);
+                record_block(s, block[11], 11);
+            }
+        } else {
+            for(i=0;i<5;i++) {
+                record_block(s, block[i], i);
+            }
+            if (s->chroma_format == CHROMA_420) {
+                record_block(s, block[5], 5);
+            } else {
+                record_block(s, block[6], 6);
+                record_block(s, block[5], 5);
+                record_block(s, block[7], 7);
+            }
+        }
     } else {
-        encode_block(s, block[6], 6);
-        encode_block(s, block[5], 5);
-        encode_block(s, block[7], 7);
+        if (s->chroma_format == CHROMA_444) {
+            encode_block(s, block[0], 0);
+            encode_block(s, block[2], 2);
+            encode_block(s, block[4], 4);
+            encode_block(s, block[8], 8);
+            encode_block(s, block[5], 5);
+            encode_block(s, block[9], 9);
+
+            if (16*s->mb_x+8 < s->width) {
+                encode_block(s, block[1], 1);
+                encode_block(s, block[3], 3);
+                encode_block(s, block[6], 6);
+                encode_block(s, block[10], 10);
+                encode_block(s, block[7], 7);
+                encode_block(s, block[11], 11);
+            }
+        } else {
+            for(i=0;i<5;i++) {
+                encode_block(s, block[i], i);
+            }
+            if (s->chroma_format == CHROMA_420) {
+                encode_block(s, block[5], 5);
+            } else {
+                encode_block(s, block[6], 6);
+                encode_block(s, block[5], 5);
+                encode_block(s, block[7], 7);
+            }
+        }
+
+        s->i_tex_bits += get_bits_diff(s);
+    }
+}
+
+#if CONFIG_AMV_ENCODER
+// maximum over s->mjpeg_vsample[i]
+#define V_MAX 2
+static int amv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *pic_arg, int *got_packet)
+{
+    MpegEncContext *s = avctx->priv_data;
+    AVFrame *pic;
+    int i, ret;
+    int chroma_h_shift, chroma_v_shift;
+
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift, &chroma_v_shift);
+
+    if ((avctx->height & 15) && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Heights which are not a multiple of 16 might fail with some decoders, "
+               "use vstrict=-1 / -strict -1 to use %d anyway.\n", avctx->height);
+        av_log(avctx, AV_LOG_WARNING, "If you have a device that plays AMV videos, please test if videos "
+               "with such heights work with it and report your findings to ffmpeg-devel@ffmpeg.org\n");
+        return AVERROR_EXPERIMENTAL;
     }
 
-    s->i_tex_bits += get_bits_diff(s);
+    pic = av_frame_clone(pic_arg);
+    if (!pic)
+        return AVERROR(ENOMEM);
+    //picture should be flipped upside-down
+    for(i=0; i < 3; i++) {
+        int vsample = i ? 2 >> chroma_v_shift : 2;
+        pic->data[i] += pic->linesize[i] * (vsample * s->height / V_MAX - 1);
+        pic->linesize[i] *= -1;
+    }
+    ret = ff_mpv_encode_picture(avctx, pkt, pic, got_packet);
+    av_frame_free(&pic);
+    return ret;
 }
+#endif
 
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
+FF_MPV_COMMON_OPTS
 { "pred", "Prediction method", OFFSET(pred), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 3, VE, "pred" },
     { "left",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, VE, "pred" },
     { "plane",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, INT_MIN, INT_MAX, VE, "pred" },
     { "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 3 }, INT_MIN, INT_MAX, VE, "pred" },
-
-    { NULL},
+{ "huffman", "Huffman table strategy", OFFSET(huffman), AV_OPT_TYPE_INT, { .i64 = HUFFMAN_TABLE_OPTIMAL }, 0, NB_HUFFMAN_TABLE_OPTION - 1, VE, "huffman" },
+    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = HUFFMAN_TABLE_DEFAULT }, INT_MIN, INT_MAX, VE, "huffman" },
+    { "optimal", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = HUFFMAN_TABLE_OPTIMAL }, INT_MIN, INT_MAX, VE, "huffman" },
+{ NULL},
 };
 
+#if CONFIG_MJPEG_ENCODER
 static const AVClass mjpeg_class = {
-    .class_name = "mjpeg",
+    .class_name = "mjpeg encoder",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
@@ -176,11 +411,38 @@ AVCodec ff_mjpeg_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_MJPEG,
     .priv_data_size = sizeof(MpegEncContext),
-    .priv_class     = &mjpeg_class,
     .init           = ff_mpv_encode_init,
     .encode2        = ff_mpv_encode_picture,
     .close          = ff_mpv_encode_end,
-    .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_NONE
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_NONE
+    },
+    .priv_class     = &mjpeg_class,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_mjpeg_profiles),
+};
+#endif
+
+#if CONFIG_AMV_ENCODER
+static const AVClass amv_class = {
+    .class_name = "amv encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_amv_encoder = {
+    .name           = "amv",
+    .long_name      = NULL_IF_CONFIG_SMALL("AMV Video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AMV,
+    .priv_data_size = sizeof(MpegEncContext),
+    .init           = ff_mpv_encode_init,
+    .encode2        = amv_encode_picture,
+    .close          = ff_mpv_encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_NONE
     },
+    .priv_class     = &amv_class,
 };
+#endif
diff --git a/libavcodec/mjpegenc.h b/libavcodec/mjpegenc.h
index bbb0f0e..d7ddc35 100644
--- a/libavcodec/mjpegenc.h
+++ b/libavcodec/mjpegenc.h
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,18 +39,65 @@
 #include "mpegvideo.h"
 #include "put_bits.h"
 
+/**
+ * Buffer of JPEG frame data.
+ *
+ * Optimal Huffman table generation requires the frame data to be loaded into
+ * a buffer so that the tables can be computed.
+ * There are at most mb_width*mb_height*12*64 of these per frame.
+ */
+typedef struct MJpegHuffmanCode {
+    // 0=DC lum, 1=DC chrom, 2=AC lum, 3=AC chrom
+    uint8_t table_id; ///< The Huffman table id associated with the data.
+    uint8_t code;     ///< The exponent.
+    uint16_t mant;    ///< The mantissa.
+} MJpegHuffmanCode;
+
+/**
+ * Holds JPEG frame data and Huffman table data.
+ */
 typedef struct MJpegContext {
-    uint8_t huff_size_dc_luminance[12]; //FIXME use array [3] instead of lumi / chroma, for easier addressing
-    uint16_t huff_code_dc_luminance[12];
-    uint8_t huff_size_dc_chrominance[12];
-    uint16_t huff_code_dc_chrominance[12];
+    //FIXME use array [3] instead of lumi / chroma, for easier addressing
+    uint8_t huff_size_dc_luminance[12];     ///< DC luminance Huffman table size.
+    uint16_t huff_code_dc_luminance[12];    ///< DC luminance Huffman table codes.
+    uint8_t huff_size_dc_chrominance[12];   ///< DC chrominance Huffman table size.
+    uint16_t huff_code_dc_chrominance[12];  ///< DC chrominance Huffman table codes.
+
+    uint8_t huff_size_ac_luminance[256];    ///< AC luminance Huffman table size.
+    uint16_t huff_code_ac_luminance[256];   ///< AC luminance Huffman table codes.
+    uint8_t huff_size_ac_chrominance[256];  ///< AC chrominance Huffman table size.
+    uint16_t huff_code_ac_chrominance[256]; ///< AC chrominance Huffman table codes.
 
-    uint8_t huff_size_ac_luminance[256];
-    uint16_t huff_code_ac_luminance[256];
-    uint8_t huff_size_ac_chrominance[256];
-    uint16_t huff_code_ac_chrominance[256];
+    /** Storage for AC luminance VLC (in MpegEncContext) */
+    uint8_t uni_ac_vlc_len[64 * 64 * 2];
+    /** Storage for AC chrominance VLC (in MpegEncContext) */
+    uint8_t uni_chroma_ac_vlc_len[64 * 64 * 2];
+
+    // Default DC tables have exactly 12 values
+    uint8_t bits_dc_luminance[17];   ///< DC luminance Huffman bits.
+    uint8_t val_dc_luminance[12];    ///< DC luminance Huffman values.
+    uint8_t bits_dc_chrominance[17]; ///< DC chrominance Huffman bits.
+    uint8_t val_dc_chrominance[12];  ///< DC chrominance Huffman values.
+
+    // 8-bit JPEG has max 256 values
+    uint8_t bits_ac_luminance[17];   ///< AC luminance Huffman bits.
+    uint8_t val_ac_luminance[256];   ///< AC luminance Huffman values.
+    uint8_t bits_ac_chrominance[17]; ///< AC chrominance Huffman bits.
+    uint8_t val_ac_chrominance[256]; ///< AC chrominance Huffman values.
+
+    size_t huff_ncode;               ///< Number of current entries in the buffer.
+    MJpegHuffmanCode *huff_buffer;   ///< Buffer for Huffman code values.
 } MJpegContext;
 
+/**
+ * Enum for the Huffman encoding strategy.
+ */
+enum HuffmanTableOption {
+    HUFFMAN_TABLE_DEFAULT = 0, ///< Use the default Huffman tables.
+    HUFFMAN_TABLE_OPTIMAL = 1, ///< Compute and use optimal Huffman tables.
+    NB_HUFFMAN_TABLE_OPTION = 2
+};
+
 static inline void put_marker(PutBitContext *p, enum JpegMarker code)
 {
     put_bits(p, 8, 0xff);
@@ -59,6 +106,6 @@ static inline void put_marker(PutBitContext *p, enum JpegMarker code)
 
 int  ff_mjpeg_encode_init(MpegEncContext *s);
 void ff_mjpeg_encode_close(MpegEncContext *s);
-void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[8][64]);
+void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[12][64]);
 
 #endif /* AVCODEC_MJPEGENC_H */
diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c
index 2262de6..31868c9 100644
--- a/libavcodec/mjpegenc_common.c
+++ b/libavcodec/mjpegenc_common.c
@@ -1,20 +1,22 @@
 /*
  * lossless JPEG shared bits
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,8 +33,35 @@
 #include "put_bits.h"
 #include "mjpegenc.h"
 #include "mjpegenc_common.h"
+#include "mjpegenc_huffman.h"
 #include "mjpeg.h"
 
+av_cold void ff_init_uni_ac_vlc(const uint8_t huff_size_ac[256], uint8_t *uni_ac_vlc_len)
+{
+    int i;
+
+    for (i = 0; i < 128; i++) {
+        int level = i - 64;
+        int run;
+        if (!level)
+            continue;
+        for (run = 0; run < 64; run++) {
+            int len, code, nbits;
+            int alevel = FFABS(level);
+
+            len = (run >> 4) * huff_size_ac[0xf0];
+
+            nbits= av_log2_16bit(alevel) + 1;
+            code = ((15&run) << 4) | nbits;
+
+            len += huff_size_ac[code] + nbits;
+
+            uni_ac_vlc_len[UNI_AC_ENC_INDEX(run, i)] = len;
+            // We ignore EOB as its just a constant which does not change generally
+        }
+    }
+}
+
 /* table_class: 0 = DC coef, 1 = AC coefs */
 static int put_huffman_table(PutBitContext *p, int table_class, int table_id,
                              const uint8_t *bits_table, const uint8_t *value_table)
@@ -54,20 +83,50 @@ static int put_huffman_table(PutBitContext *p, int table_class, int table_id,
     return n + 17;
 }
 
-static void jpeg_table_header(PutBitContext *p, ScanTable *intra_scantable,
-                              uint16_t intra_matrix[64])
+static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p,
+                              ScanTable *intra_scantable,
+                              uint16_t luma_intra_matrix[64],
+                              uint16_t chroma_intra_matrix[64],
+                              int hsample[3])
 {
     int i, j, size;
     uint8_t *ptr;
-
+    MpegEncContext *s = NULL;
+
+    /* Since avctx->priv_data will point to LJpegEncContext in this case */
+    if (avctx->codec_id != AV_CODEC_ID_LJPEG)
+        s = avctx->priv_data;
+
+    if (avctx->codec_id != AV_CODEC_ID_LJPEG) {
+        int matrix_count = 1 + !!memcmp(luma_intra_matrix,
+                                        chroma_intra_matrix,
+                                        sizeof(luma_intra_matrix[0]) * 64);
+    if (s && s->force_duplicated_matrix)
+        matrix_count = 2;
     /* quant matrixes */
     put_marker(p, DQT);
-    put_bits(p, 16, 2 + 1 * (1 + 64));
+    put_bits(p, 16, 2 + matrix_count * (1 + 64));
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 0); /* table 0 */
     for(i=0;i<64;i++) {
         j = intra_scantable->permutated[i];
-        put_bits(p, 8, intra_matrix[j]);
+        put_bits(p, 8, luma_intra_matrix[j]);
+    }
+
+        if (matrix_count > 1) {
+            put_bits(p, 4, 0); /* 8 bit precision */
+            put_bits(p, 4, 1); /* table 1 */
+            for(i=0;i<64;i++) {
+                j = intra_scantable->permutated[i];
+                put_bits(p, 8, chroma_intra_matrix[j]);
+            }
+        }
+    }
+
+    if(avctx->active_thread_type & FF_THREAD_SLICE){
+        put_marker(p, DRI);
+        put_bits(p, 16, 4);
+        put_bits(p, 16, (avctx->width-1)/(8*hsample[0]) + 1);
     }
 
     /* huffman table */
@@ -76,15 +135,30 @@ static void jpeg_table_header(PutBitContext *p, ScanTable *intra_scantable,
     ptr = put_bits_ptr(p);
     put_bits(p, 16, 0); /* patched later */
     size = 2;
-    size += put_huffman_table(p, 0, 0, avpriv_mjpeg_bits_dc_luminance,
-                              avpriv_mjpeg_val_dc);
-    size += put_huffman_table(p, 0, 1, avpriv_mjpeg_bits_dc_chrominance,
-                              avpriv_mjpeg_val_dc);
-
-    size += put_huffman_table(p, 1, 0, avpriv_mjpeg_bits_ac_luminance,
-                              avpriv_mjpeg_val_ac_luminance);
-    size += put_huffman_table(p, 1, 1, avpriv_mjpeg_bits_ac_chrominance,
-                              avpriv_mjpeg_val_ac_chrominance);
+
+    // Only MJPEG can have a variable Huffman variable. All other
+    // formats use the default Huffman table.
+    if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL) {
+        size += put_huffman_table(p, 0, 0, s->mjpeg_ctx->bits_dc_luminance,
+                                  s->mjpeg_ctx->val_dc_luminance);
+        size += put_huffman_table(p, 0, 1, s->mjpeg_ctx->bits_dc_chrominance,
+                                  s->mjpeg_ctx->val_dc_chrominance);
+
+        size += put_huffman_table(p, 1, 0, s->mjpeg_ctx->bits_ac_luminance,
+                                  s->mjpeg_ctx->val_ac_luminance);
+        size += put_huffman_table(p, 1, 1, s->mjpeg_ctx->bits_ac_chrominance,
+                                  s->mjpeg_ctx->val_ac_chrominance);
+    } else {
+        size += put_huffman_table(p, 0, 0, avpriv_mjpeg_bits_dc_luminance,
+                                  avpriv_mjpeg_val_dc);
+        size += put_huffman_table(p, 0, 1, avpriv_mjpeg_bits_dc_chrominance,
+                                  avpriv_mjpeg_val_dc);
+
+        size += put_huffman_table(p, 1, 0, avpriv_mjpeg_bits_ac_luminance,
+                                  avpriv_mjpeg_val_ac_luminance);
+        size += put_huffman_table(p, 1, 1, avpriv_mjpeg_bits_ac_chrominance,
+                                  avpriv_mjpeg_val_ac_chrominance);
+    }
     AV_WB16(ptr, size);
 }
 
@@ -94,6 +168,16 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
     uint8_t *ptr;
 
     if (avctx->sample_aspect_ratio.num > 0 && avctx->sample_aspect_ratio.den > 0) {
+        AVRational sar = avctx->sample_aspect_ratio;
+
+        if (sar.num > 65535 || sar.den > 65535) {
+            if (!av_reduce(&sar.num, &sar.den, avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den, 65535))
+                av_log(avctx, AV_LOG_WARNING,
+                    "Cannot store exact aspect ratio %d:%d\n",
+                    avctx->sample_aspect_ratio.num,
+                    avctx->sample_aspect_ratio.den);
+        }
+
         /* JFIF header */
         put_marker(p, APP0);
         put_bits(p, 16, 16);
@@ -103,8 +187,8 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
          * released revision. */
         put_bits(p, 16, 0x0102);
         put_bits(p,  8, 0);              /* units type: 0 - aspect ratio */
-        put_bits(p, 16, avctx->sample_aspect_ratio.num);
-        put_bits(p, 16, avctx->sample_aspect_ratio.den);
+        put_bits(p, 16, sar.num);
+        put_bits(p, 16, sar.den);
         put_bits(p, 8, 0); /* thumbnail width */
         put_bits(p, 8, 0); /* thumbnail height */
     }
@@ -120,9 +204,10 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
         AV_WB16(ptr, size);
     }
 
-    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-        avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
-        avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+    if (((avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
+          avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+          avctx->pix_fmt == AV_PIX_FMT_YUV444P) && avctx->color_range != AVCOL_RANGE_JPEG)
+        || avctx->color_range == AVCOL_RANGE_MPEG) {
         put_marker(p, COM);
         flush_put_bits(p);
         ptr = put_bits_ptr(p);
@@ -133,22 +218,23 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
     }
 }
 
-void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
-                                    ScanTable *intra_scantable, int pred,
-                                    uint16_t intra_matrix[64])
+void ff_mjpeg_init_hvsample(AVCodecContext *avctx, int hsample[4], int vsample[4])
 {
     int chroma_h_shift, chroma_v_shift;
-    const int lossless = avctx->codec_id != AV_CODEC_ID_MJPEG;
-    int hsample[3], vsample[3];
 
     av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
                                      &chroma_v_shift);
-
     if (avctx->codec->id == AV_CODEC_ID_LJPEG &&
-        avctx->pix_fmt   == AV_PIX_FMT_BGR24) {
+        (   avctx->pix_fmt == AV_PIX_FMT_BGR0
+         || avctx->pix_fmt == AV_PIX_FMT_BGRA
+         || avctx->pix_fmt == AV_PIX_FMT_BGR24)) {
         vsample[0] = hsample[0] =
         vsample[1] = hsample[1] =
-        vsample[2] = hsample[2] = 1;
+        vsample[2] = hsample[2] =
+        vsample[3] = hsample[3] = 1;
+    } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P || avctx->pix_fmt == AV_PIX_FMT_YUVJ444P) {
+        vsample[0] = vsample[1] = vsample[2] = 2;
+        hsample[0] = hsample[1] = hsample[2] = 1;
     } else {
         vsample[0] = 2;
         vsample[1] = 2 >> chroma_v_shift;
@@ -157,27 +243,48 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
         hsample[1] = 2 >> chroma_h_shift;
         hsample[2] = 2 >> chroma_h_shift;
     }
+}
+
+void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
+                                    ScanTable *intra_scantable, int pred,
+                                    uint16_t luma_intra_matrix[64],
+                                    uint16_t chroma_intra_matrix[64])
+{
+    const int lossless = avctx->codec_id != AV_CODEC_ID_MJPEG && avctx->codec_id != AV_CODEC_ID_AMV;
+    int hsample[4], vsample[4];
+    int i;
+    int components = 3 + (avctx->pix_fmt == AV_PIX_FMT_BGRA);
+    int chroma_matrix = !!memcmp(luma_intra_matrix,
+                                 chroma_intra_matrix,
+                                 sizeof(luma_intra_matrix[0])*64);
+
+    ff_mjpeg_init_hvsample(avctx, hsample, vsample);
 
     put_marker(pb, SOI);
 
+    // hack for AMV mjpeg format
+    if(avctx->codec_id == AV_CODEC_ID_AMV) goto end;
+
     jpeg_put_comments(avctx, pb);
 
-    jpeg_table_header(pb, intra_scantable, intra_matrix);
+    jpeg_table_header(avctx, pb, intra_scantable, luma_intra_matrix, chroma_intra_matrix, hsample);
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_MJPEG:  put_marker(pb, SOF0 ); break;
     case AV_CODEC_ID_LJPEG:  put_marker(pb, SOF3 ); break;
-    default: assert(0);
+    default: av_assert0(0);
     }
 
     put_bits(pb, 16, 17);
-    if (lossless && avctx->pix_fmt == AV_PIX_FMT_BGR24)
+    if (lossless && (  avctx->pix_fmt == AV_PIX_FMT_BGR0
+                    || avctx->pix_fmt == AV_PIX_FMT_BGRA
+                    || avctx->pix_fmt == AV_PIX_FMT_BGR24))
         put_bits(pb, 8, 9); /* 9 bits/component RCT */
     else
         put_bits(pb, 8, 8); /* 8 bits/component */
     put_bits(pb, 16, avctx->height);
     put_bits(pb, 16, avctx->width);
-    put_bits(pb, 8, 3); /* 3 components */
+    put_bits(pb, 8, components); /* 3 or 4 components */
 
     /* Y component */
     put_bits(pb, 8, 1); /* component number */
@@ -189,18 +296,25 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
     put_bits(pb, 8, 2); /* component number */
     put_bits(pb, 4, hsample[1]); /* H factor */
     put_bits(pb, 4, vsample[1]); /* V factor */
-    put_bits(pb, 8, 0); /* select matrix */
+    put_bits(pb, 8, lossless ? 0 : chroma_matrix); /* select matrix */
 
     /* Cr component */
     put_bits(pb, 8, 3); /* component number */
     put_bits(pb, 4, hsample[2]); /* H factor */
     put_bits(pb, 4, vsample[2]); /* V factor */
-    put_bits(pb, 8, 0); /* select matrix */
+    put_bits(pb, 8, lossless ? 0 : chroma_matrix); /* select matrix */
+
+    if (components == 4) {
+        put_bits(pb, 8, 4); /* component number */
+        put_bits(pb, 4, hsample[3]); /* H factor */
+        put_bits(pb, 4, vsample[3]); /* V factor */
+        put_bits(pb, 8, 0); /* select matrix */
+    }
 
     /* scan header */
     put_marker(pb, SOS);
-    put_bits(pb, 16, 12); /* length */
-    put_bits(pb, 8, 3); /* 3 components */
+    put_bits(pb, 16, 6 + 2*components); /* length */
+    put_bits(pb, 8, components); /* 3 components */
 
     /* Y component */
     put_bits(pb, 8, 1); /* index */
@@ -217,25 +331,97 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
     put_bits(pb, 4, 1); /* DC huffman table index */
     put_bits(pb, 4, lossless ? 0 : 1); /* AC huffman table index */
 
+    if (components == 4) {
+        /* Alpha component */
+        put_bits(pb, 8, 4); /* index */
+        put_bits(pb, 4, 0); /* DC huffman table index */
+        put_bits(pb, 4, 0); /* AC huffman table index */
+    }
+
     put_bits(pb, 8, lossless ? pred : 0); /* Ss (not used) */
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_MJPEG:  put_bits(pb, 8, 63); break; /* Se (not used) */
     case AV_CODEC_ID_LJPEG:  put_bits(pb, 8,  0); break; /* not used */
-    default: assert(0);
+    default: av_assert0(0);
     }
 
     put_bits(pb, 8, 0); /* Ah/Al (not used) */
+
+end:
+    if (!lossless) {
+        MpegEncContext *s = avctx->priv_data;
+        av_assert0(avctx->codec->priv_data_size == sizeof(MpegEncContext));
+
+        s->esc_pos = put_bits_count(pb) >> 3;
+        for(i=1; i<s->slice_context_count; i++)
+            s->thread_context[i]->esc_pos = 0;
+    }
+}
+
+/**
+ * Encodes and outputs the entire frame in the JPEG format.
+ *
+ * @param s The MpegEncContext.
+ */
+void ff_mjpeg_encode_picture_frame(MpegEncContext *s)
+{
+    int i, nbits, code, table_id;
+    MJpegContext *m = s->mjpeg_ctx;
+    uint8_t *huff_size[4] = {m->huff_size_dc_luminance,
+                             m->huff_size_dc_chrominance,
+                             m->huff_size_ac_luminance,
+                             m->huff_size_ac_chrominance};
+    uint16_t *huff_code[4] = {m->huff_code_dc_luminance,
+                              m->huff_code_dc_chrominance,
+                              m->huff_code_ac_luminance,
+                              m->huff_code_ac_chrominance};
+    size_t total_bits = 0;
+    size_t bytes_needed;
+
+    s->header_bits = get_bits_diff(s);
+    // Estimate the total size first
+    for (i = 0; i < m->huff_ncode; i++) {
+        table_id = m->huff_buffer[i].table_id;
+        code = m->huff_buffer[i].code;
+        nbits = code & 0xf;
+
+        total_bits += huff_size[table_id][code] + nbits;
+    }
+
+    bytes_needed = (total_bits + 7) / 8;
+    ff_mpv_reallocate_putbitbuffer(s, bytes_needed, bytes_needed);
+
+    for (i = 0; i < m->huff_ncode; i++) {
+        table_id = m->huff_buffer[i].table_id;
+        code = m->huff_buffer[i].code;
+        nbits = code & 0xf;
+
+        put_bits(&s->pb, huff_size[table_id][code], huff_code[table_id][code]);
+        if (nbits != 0) {
+            put_sbits(&s->pb, nbits, m->huff_buffer[i].mant);
+        }
+    }
+
+    m->huff_ncode = 0;
+    s->i_tex_bits = get_bits_diff(s);
 }
 
-static void escape_FF(PutBitContext *pb, int start)
+void ff_mjpeg_escape_FF(PutBitContext *pb, int start)
 {
-    int size = put_bits_count(pb) - start * 8;
+    int size;
     int i, ff_count;
     uint8_t *buf = pb->buf + start;
     int align= (-(size_t)(buf))&3;
+    int pad = (-put_bits_count(pb))&7;
+
+    if (pad)
+        put_bits(pb, pad, (1<<pad)-1);
 
-    assert((size&7) == 0);
+    flush_put_bits(pb);
+    size = put_bits_count(pb) - start * 8;
+
+    av_assert1((size&7) == 0);
     size >>= 3;
 
     ff_count=0;
@@ -280,21 +466,127 @@ static void escape_FF(PutBitContext *pb, int start)
     }
 }
 
-void ff_mjpeg_encode_stuffing(PutBitContext * pbc)
+/**
+ * Builds all 4 optimal Huffman tables.
+ *
+ * Uses the data stored in the JPEG buffer to compute the tables.
+ * Stores the Huffman tables in the bits_* and val_* arrays in the MJpegContext.
+ *
+ * @param m MJpegContext containing the JPEG buffer.
+ */
+static void ff_mjpeg_build_optimal_huffman(MJpegContext *m)
 {
-    int length;
-    length= (-put_bits_count(pbc))&7;
-    if(length) put_bits(pbc, length, (1<<length)-1);
+    int i, table_id, code;
+
+    MJpegEncHuffmanContext dc_luminance_ctx;
+    MJpegEncHuffmanContext dc_chrominance_ctx;
+    MJpegEncHuffmanContext ac_luminance_ctx;
+    MJpegEncHuffmanContext ac_chrominance_ctx;
+    MJpegEncHuffmanContext *ctx[4] = {&dc_luminance_ctx,
+                                      &dc_chrominance_ctx,
+                                      &ac_luminance_ctx,
+                                      &ac_chrominance_ctx};
+    for (i = 0; i < 4; i++) {
+        ff_mjpeg_encode_huffman_init(ctx[i]);
+    }
+    for (i = 0; i < m->huff_ncode; i++) {
+        table_id = m->huff_buffer[i].table_id;
+        code = m->huff_buffer[i].code;
+
+        ff_mjpeg_encode_huffman_increment(ctx[table_id], code);
+    }
+
+    ff_mjpeg_encode_huffman_close(&dc_luminance_ctx,
+                                  m->bits_dc_luminance,
+                                  m->val_dc_luminance, 12);
+    ff_mjpeg_encode_huffman_close(&dc_chrominance_ctx,
+                                  m->bits_dc_chrominance,
+                                  m->val_dc_chrominance, 12);
+    ff_mjpeg_encode_huffman_close(&ac_luminance_ctx,
+                                  m->bits_ac_luminance,
+                                  m->val_ac_luminance, 256);
+    ff_mjpeg_encode_huffman_close(&ac_chrominance_ctx,
+                                  m->bits_ac_chrominance,
+                                  m->val_ac_chrominance, 256);
+
+    ff_mjpeg_build_huffman_codes(m->huff_size_dc_luminance,
+                                 m->huff_code_dc_luminance,
+                                 m->bits_dc_luminance,
+                                 m->val_dc_luminance);
+    ff_mjpeg_build_huffman_codes(m->huff_size_dc_chrominance,
+                                 m->huff_code_dc_chrominance,
+                                 m->bits_dc_chrominance,
+                                 m->val_dc_chrominance);
+    ff_mjpeg_build_huffman_codes(m->huff_size_ac_luminance,
+                                 m->huff_code_ac_luminance,
+                                 m->bits_ac_luminance,
+                                 m->val_ac_luminance);
+    ff_mjpeg_build_huffman_codes(m->huff_size_ac_chrominance,
+                                 m->huff_code_ac_chrominance,
+                                 m->bits_ac_chrominance,
+                                 m->val_ac_chrominance);
 }
 
-void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits)
+/**
+ * Writes the complete JPEG frame when optimal huffman tables are enabled,
+ * otherwise writes the stuffing.
+ *
+ * Header + values + stuffing.
+ *
+ * @param s The MpegEncContext.
+ * @return int Error code, 0 if successful.
+ */
+int ff_mjpeg_encode_stuffing(MpegEncContext *s)
 {
-    ff_mjpeg_encode_stuffing(pb);
-    flush_put_bits(pb);
+    int i;
+    PutBitContext *pbc = &s->pb;
+    int mb_y = s->mb_y - !s->mb_x;
+    int ret;
+    MJpegContext *m;
+
+    m = s->mjpeg_ctx;
+
+    if (s->huffman == HUFFMAN_TABLE_OPTIMAL) {
+        ff_mjpeg_build_optimal_huffman(m);
+
+        // Replace the VLCs with the optimal ones.
+        // The default ones may be used for trellis during quantization.
+        ff_init_uni_ac_vlc(m->huff_size_ac_luminance,   m->uni_ac_vlc_len);
+        ff_init_uni_ac_vlc(m->huff_size_ac_chrominance, m->uni_chroma_ac_vlc_len);
+        s->intra_ac_vlc_length      =
+        s->intra_ac_vlc_last_length = m->uni_ac_vlc_len;
+        s->intra_chroma_ac_vlc_length      =
+        s->intra_chroma_ac_vlc_last_length = m->uni_chroma_ac_vlc_len;
+
+        ff_mjpeg_encode_picture_header(s->avctx, &s->pb, &s->intra_scantable,
+                                       s->pred, s->intra_matrix, s->chroma_intra_matrix);
+        ff_mjpeg_encode_picture_frame(s);
+    }
 
-    assert((header_bits & 7) == 0);
+    ret = ff_mpv_reallocate_putbitbuffer(s, put_bits_count(&s->pb) / 8 + 100,
+                                            put_bits_count(&s->pb) / 4 + 1000);
 
-    escape_FF(pb, header_bits >> 3);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Buffer reallocation failed\n");
+        goto fail;
+    }
+
+    ff_mjpeg_escape_FF(pbc, s->esc_pos);
+
+    if((s->avctx->active_thread_type & FF_THREAD_SLICE) && mb_y < s->mb_height)
+        put_marker(pbc, RST0 + (mb_y&7));
+    s->esc_pos = put_bits_count(pbc) >> 3;
+fail:
+
+    for(i=0; i<3; i++)
+        s->last_dc[i] = 128 << s->intra_dc_precision;
+
+    return ret;
+}
+
+void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits)
+{
+    av_assert1((header_bits & 7) == 0);
 
     put_marker(pb, EOI);
 }
diff --git a/libavcodec/mjpegenc_common.h b/libavcodec/mjpegenc_common.h
index 9b5933e..e8698d1 100644
--- a/libavcodec/mjpegenc_common.h
+++ b/libavcodec/mjpegenc_common.h
@@ -1,20 +1,20 @@
 /*
  * lossless JPEG shared bits
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,14 +25,22 @@
 
 #include "avcodec.h"
 #include "idctdsp.h"
+#include "mpegvideo.h"
 #include "put_bits.h"
 
 void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
                                     ScanTable *intra_scantable, int pred,
-                                    uint16_t intra_matrix[64]);
+                                    uint16_t luma_intra_matrix[64],
+                                    uint16_t chroma_intra_matrix[64]);
+void ff_mjpeg_encode_picture_frame(MpegEncContext *s);
 void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits);
-void ff_mjpeg_encode_stuffing(PutBitContext *pbc);
+void ff_mjpeg_escape_FF(PutBitContext *pb, int start);
+int ff_mjpeg_encode_stuffing(MpegEncContext *s);
+void ff_mjpeg_init_hvsample(AVCodecContext *avctx, int hsample[4], int vsample[4]);
+
 void ff_mjpeg_encode_dc(PutBitContext *pb, int val,
                         uint8_t *huff_size, uint16_t *huff_code);
 
+av_cold void ff_init_uni_ac_vlc(const uint8_t huff_size_ac[256], uint8_t *uni_ac_vlc_len);
+
 #endif /* AVCODEC_MJPEGENC_COMMON_H */
diff --git a/libavcodec/mjpegenc_huffman.c b/libavcodec/mjpegenc_huffman.c
new file mode 100644
index 0000000..0e63f80
--- /dev/null
+++ b/libavcodec/mjpegenc_huffman.c
@@ -0,0 +1,192 @@
+/*
+ * MJPEG encoder
+ * Copyright (c) 2016 William Ma, Ted Ying, Jerry Jiang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/error.h"
+#include "libavutil/qsort.h"
+#include "mjpegenc_huffman.h"
+
+/**
+ * Comparison function for two PTables by prob
+ *
+ * @param a First PTable to compare
+ * @param b Second PTable to compare
+ * @return < 0 for less than, 0 for equals, > 0 for greater than
+ */
+static int compare_by_prob(const void *a, const void *b)
+{
+    PTable a_val = *(PTable *) a;
+    PTable b_val = *(PTable *) b;
+    return a_val.prob - b_val.prob;
+}
+
+/**
+ * Comparison function for two HuffTables by length
+ *
+ * @param a First HuffTable to compare
+ * @param b Second HuffTable to compare
+ * @return < 0 for less than, 0 for equals, > 0 for greater than
+ */
+static int compare_by_length(const void *a, const void *b)
+{
+    HuffTable a_val = *(HuffTable *) a;
+    HuffTable b_val = *(HuffTable *) b;
+    return a_val.length - b_val.length;
+}
+
+/**
+ * Computes the length of the Huffman encoding for each distinct input value.
+ * Uses package merge algorithm as follows:
+ * 1. start with an empty list, lets call it list(0), set i = 0
+ * 2. add 1 entry to list(i) for each symbol we have and give each a score equal to the probability of the respective symbol
+ * 3. merge the 2 symbols of least score and put them in list(i+1), and remove them from list(i). The new score will be the sum of the 2 scores
+ * 4. if there is more than 1 symbol left in the current list(i), then goto 3
+ * 5. i++
+ * 6. if i < 16 goto 2
+ * 7. select the n-1 elements in the last list with the lowest score (n = the number of symbols)
+ * 8. the length of the huffman code for symbol s will be equal to the number of times the symbol occurs in the select elements
+ * Go to guru.multimedia.cx/small-tasks-for-ffmpeg/ for more details
+ *
+ * All probabilities should be positive integers. The output is sorted by code,
+ * not by length.
+ *
+ * @param prob_table input array of a PTable for each distinct input value
+ * @param distincts  output array of a HuffTable that will be populated by this function
+ * @param size       size of the prob_table array
+ * @param max_length max length of an encoding
+ */
+void ff_mjpegenc_huffman_compute_bits(PTable *prob_table, HuffTable *distincts, int size, int max_length)
+{
+    PackageMergerList list_a, list_b, *to = &list_a, *from = &list_b, *temp;
+
+    int times, i, j, k;
+
+    int nbits[257] = {0};
+
+    int min;
+
+    av_assert0(max_length > 0);
+
+    to->nitems = 0;
+    from->nitems = 0;
+    to->item_idx[0] = 0;
+    from->item_idx[0] = 0;
+    AV_QSORT(prob_table, size, PTable, compare_by_prob);
+
+    for (times = 0; times <= max_length; times++) {
+        to->nitems = 0;
+        to->item_idx[0] = 0;
+
+        j = 0;
+        k = 0;
+
+        if (times < max_length) {
+            i = 0;
+        }
+        while (i < size || j + 1 < from->nitems) {
+            to->nitems++;
+            to->item_idx[to->nitems] = to->item_idx[to->nitems - 1];
+            if (i < size &&
+                (j + 1 >= from->nitems ||
+                 prob_table[i].prob <
+                     from->probability[j] + from->probability[j + 1])) {
+                to->items[to->item_idx[to->nitems]++] = prob_table[i].value;
+                to->probability[to->nitems - 1] = prob_table[i].prob;
+                i++;
+            } else {
+                for (k = from->item_idx[j]; k < from->item_idx[j + 2]; k++) {
+                    to->items[to->item_idx[to->nitems]++] = from->items[k];
+                }
+                to->probability[to->nitems - 1] =
+                    from->probability[j] + from->probability[j + 1];
+                j += 2;
+            }
+        }
+        temp = to;
+        to = from;
+        from = temp;
+    }
+
+    min = (size - 1 < from->nitems) ? size - 1 : from->nitems;
+    for (i = 0; i < from->item_idx[min]; i++) {
+        nbits[from->items[i]]++;
+    }
+    // we don't want to return the 256 bit count (it was just in here to prevent
+    // all 1s encoding)
+    j = 0;
+    for (i = 0; i < 256; i++) {
+        if (nbits[i] > 0) {
+            distincts[j].code = i;
+            distincts[j].length = nbits[i];
+            j++;
+        }
+    }
+}
+
+void ff_mjpeg_encode_huffman_init(MJpegEncHuffmanContext *s)
+{
+    memset(s->val_count, 0, sizeof(s->val_count));
+}
+
+/**
+ * Produces a Huffman encoding with a given input
+ *
+ * @param s         input to encode
+ * @param bits      output array where the ith character represents how many input values have i length encoding
+ * @param val       output array of input values sorted by their encoded length
+ * @param max_nval  maximum number of distinct input values
+ */
+void ff_mjpeg_encode_huffman_close(MJpegEncHuffmanContext *s, uint8_t bits[17],
+                                   uint8_t val[], int max_nval)
+{
+    int i, j;
+    int nval = 0;
+    PTable val_counts[257];
+    HuffTable distincts[256];
+
+    for (i = 0; i < 256; i++) {
+        if (s->val_count[i]) nval++;
+    }
+    av_assert0 (nval <= max_nval);
+
+    j = 0;
+    for (i = 0; i < 256; i++) {
+        if (s->val_count[i]) {
+            val_counts[j].value = i;
+            val_counts[j].prob = s->val_count[i];
+            j++;
+        }
+    }
+    val_counts[j].value = 256;
+    val_counts[j].prob = 0;
+    ff_mjpegenc_huffman_compute_bits(val_counts, distincts, nval + 1, 16);
+    AV_QSORT(distincts, nval, HuffTable, compare_by_length);
+
+    memset(bits, 0, sizeof(bits[0]) * 17);
+    for (i = 0; i < nval; i++) {
+        val[i] = distincts[i].code;
+        bits[distincts[i].length]++;
+    }
+}
diff --git a/libavcodec/mjpegenc_huffman.h b/libavcodec/mjpegenc_huffman.h
new file mode 100644
index 0000000..5fe6550
--- /dev/null
+++ b/libavcodec/mjpegenc_huffman.h
@@ -0,0 +1,76 @@
+/*
+ * MJPEG encoder
+ * Copyright (c) 2016 William Ma, Ted Ying, Jerry Jiang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Huffman table generation for MJPEG encoder.
+ */
+
+#ifndef AVCODEC_MJPEGENC_HUFFMAN_H
+#define AVCODEC_MJPEGENC_HUFFMAN_H
+
+#include <stdint.h>
+
+typedef struct MJpegEncHuffmanContext {
+    int val_count[256];
+} MJpegEncHuffmanContext;
+
+// Uses the package merge algorithm to compute the Huffman table.
+void ff_mjpeg_encode_huffman_init(MJpegEncHuffmanContext *s);
+static inline void ff_mjpeg_encode_huffman_increment(MJpegEncHuffmanContext *s,
+                                                     uint8_t val)
+{
+    s->val_count[val]++;
+}
+void ff_mjpeg_encode_huffman_close(MJpegEncHuffmanContext *s,
+                                   uint8_t bits[17], uint8_t val[],
+                                   int max_nval);
+
+
+/**
+ * Used to assign a occurrence count or "probability" to an input value
+ */
+typedef struct PTable {
+    int value;  ///< input value
+    int prob;   ///< number of occurences of this value in input
+} PTable;
+
+/**
+ * Used to store intermediate lists in the package merge algorithm
+ */
+typedef struct PackageMergerList {
+    int nitems;             ///< number of items in the list and probability      ex. 4
+    int item_idx[515];      ///< index range for each item in items                   0, 2, 5, 9, 13
+    int probability[514];   ///< probability of each item                             3, 8, 18, 46
+    int items[257 * 16];    ///< chain of all individual values that make up items    A, B, A, B, C, A, B, C, D, C, D, D, E
+} PackageMergerList;
+
+/**
+ * Used to store optimal huffman encoding results
+ */
+typedef struct HuffTable {
+    int code;       ///< code is the input value
+    int length;     ///< length of the encoding
+} HuffTable;
+
+void ff_mjpegenc_huffman_compute_bits(PTable *prob_table, HuffTable *distincts,
+                                      int size, int max_length);
+#endif /* AVCODEC_MJPEGENC_HUFFMAN_H */
diff --git a/libavcodec/mlp.c b/libavcodec/mlp.c
index 9615b66..ddbab60 100644
--- a/libavcodec/mlp.c
+++ b/libavcodec/mlp.c
@@ -2,20 +2,20 @@
  * MLP codec common code
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,27 @@ const uint8_t ff_mlp_huffman_tables[3][18][2] = {
     }
 };
 
+const ChannelInformation ff_mlp_ch_info[21] = {
+    { 0x01, 0x01, 0x00, 0x1f }, { 0x03, 0x02, 0x00, 0x1b },
+    { 0x07, 0x02, 0x01, 0x1f }, { 0x0F, 0x02, 0x02, 0x19 },
+    { 0x07, 0x02, 0x01, 0x03 }, { 0x0F, 0x02, 0x02, 0x1f },
+    { 0x1F, 0x02, 0x03, 0x01 }, { 0x07, 0x02, 0x01, 0x1a },
+    { 0x0F, 0x02, 0x02, 0x1f }, { 0x1F, 0x02, 0x03, 0x18 },
+    { 0x0F, 0x02, 0x02, 0x02 }, { 0x1F, 0x02, 0x03, 0x1f },
+    { 0x3F, 0x02, 0x04, 0x00 }, { 0x0F, 0x03, 0x01, 0x1f },
+    { 0x1F, 0x03, 0x02, 0x18 }, { 0x0F, 0x03, 0x01, 0x02 },
+    { 0x1F, 0x03, 0x02, 0x1f }, { 0x3F, 0x03, 0x03, 0x00 },
+    { 0x1F, 0x04, 0x01, 0x01 }, { 0x1F, 0x04, 0x01, 0x18 },
+    { 0x3F, 0x04, 0x02, 0x00 },
+};
+
+const uint64_t ff_mlp_channel_layouts[12] = {
+    AV_CH_LAYOUT_MONO, AV_CH_LAYOUT_STEREO, AV_CH_LAYOUT_2_1,
+    AV_CH_LAYOUT_QUAD, AV_CH_LAYOUT_2POINT1, AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_4POINT0, AV_CH_LAYOUT_5POINT0_BACK, AV_CH_LAYOUT_3POINT1,
+    AV_CH_LAYOUT_4POINT1, AV_CH_LAYOUT_5POINT1_BACK, 0,
+};
+
 static int crc_init = 0;
 #if CONFIG_SMALL
 #define CRC_TABLE_SIZE 257
diff --git a/libavcodec/mlp.h b/libavcodec/mlp.h
index 8a1584e..41a45a3 100644
--- a/libavcodec/mlp.h
+++ b/libavcodec/mlp.h
@@ -2,20 +2,20 @@
  * MLP codec common header file
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,6 +76,9 @@ typedef struct FilterParams {
     uint8_t     shift; ///< Right shift to apply to output of filter.
 
     int32_t     state[MAX_FIR_ORDER];
+
+    int coeff_bits;
+    int coeff_shift;
 } FilterParams;
 
 /** sample data coding information */
@@ -96,6 +99,43 @@ typedef struct ChannelParams {
  */
 extern const uint8_t ff_mlp_huffman_tables[3][18][2];
 
+typedef struct {
+    uint8_t channel_occupancy;
+    uint8_t group1_channels;
+    uint8_t group2_channels;
+    uint8_t summary_info;
+} ChannelInformation;
+
+/** Tables defining channel information.
+ *
+ *  Possible channel arrangements are:
+ *
+ *  (Group 1)   C
+ *  (Group 1)   L,  R
+ *  (Group 1)   Lf, Rf          /  (Group 2)   S
+ *  (Group 1)   Lf, Rf          /  (Group 2)   Ls, Rs
+ *  (Group 1)   Lf, Rf          /  (Group 2)   LFE
+ *  (Group 1)   Lf, Rf          /  (Group 2)   LFE, S
+ *  (Group 1)   Lf, Rf          /  (Group 2)   LFE, Ls, Rs
+ *  (Group 1)   Lf, Rf          /  (Group 2)   C
+ *  (Group 1)   Lf, Rf          /  (Group 2)   C, S
+ *  (Group 1)   Lf, Rf          /  (Group 2)   C, Ls, Rs
+ *  (Group 1)   Lf, Rf          /  (Group 2)   C, LFE
+ *  (Group 1)   Lf, Rf          /  (Group 2)   C, LFE, S
+ *  (Group 1)   Lf, Rf          /  (Group 2)   C, LFE, Ls,  Rs
+ *  (Group 1)   Lf, Rf  C       /  (Group 2)   S
+ *  (Group 1)   Lf, Rf  C       /  (Group 2)   Ls, Rs
+ *  (Group 1)   Lf, Rf  C       /  (Group 2)   LFE
+ *  (Group 1)   Lf, Rf  C       /  (Group 2)   LFE, S
+ *  (Group 1)   Lf, Rf  C       /  (Group 2)   LFE, Ls, Rs
+ *  (Group 1)   Lf, Rf  Ls  Rs  /  (Group 2)   LFE
+ *  (Group 1)   Lf, Rf  Ls  Rs  /  (Group 2)   C
+ *  (Group 1)   Lf, Rf, Ls, Rs  /  (Group 2)   C, LFE
+ */
+extern const ChannelInformation ff_mlp_ch_info[21];
+
+extern const uint64_t ff_mlp_channel_layouts[12];
+
 /** MLP uses checksums that seem to be based on the standard CRC algorithm, but
  *  are not (in implementation terms, the table lookup and XOR are reversed).
  *  We can implement this behavior using a standard av_crc on all but the
diff --git a/libavcodec/mlp_parser.c b/libavcodec/mlp_parser.c
index bff6258..eb3435b 100644
--- a/libavcodec/mlp_parser.c
+++ b/libavcodec/mlp_parser.c
@@ -2,20 +2,20 @@
  * MLP parser
  * Copyright (c) 2007 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,7 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/crc.h"
 #include "libavutil/internal.h"
-
-#include "bitstream.h"
+#include "get_bits.h"
 #include "parser.h"
 #include "mlp_parser.h"
 #include "mlp.h"
@@ -45,28 +44,28 @@ static const uint8_t mlp_channels[32] = {
     5, 6, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
-static const uint64_t mlp_layout[32] = {
+const uint64_t ff_mlp_layout[32] = {
     AV_CH_LAYOUT_MONO,
     AV_CH_LAYOUT_STEREO,
     AV_CH_LAYOUT_2_1,
-    AV_CH_LAYOUT_2_2,
+    AV_CH_LAYOUT_QUAD,
     AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_2_1|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_2_2|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_SURROUND,
     AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
     AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_5POINT1_BACK,
     AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
     AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT1,
-    AV_CH_LAYOUT_2_2|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT0,
-    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_5POINT0_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
@@ -109,7 +108,7 @@ static int truehd_channels(int chanmap)
     return channels;
 }
 
-static uint64_t truehd_layout(int chanmap)
+uint64_t ff_truehd_layout(int chanmap)
 {
     int i;
     uint64_t layout = 0;
@@ -140,84 +139,86 @@ static int mlp_get_major_sync_size(const uint8_t * buf, int bufsize)
 /** Read a major sync info header - contains high level information about
  *  the stream - sample rate, channel arrangement etc. Most of this
  *  information is not actually necessary for decoding, only for playback.
- *  bc must be a freshly-initialized BitstreamContext with no bits read.
+ *  gb must be a freshly initialized GetBitContext with no bits read.
  */
 
-int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, BitstreamContext *bc)
+int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
 {
     int ratebits, channel_arrangement, header_size;
     uint16_t checksum;
 
-    assert(bitstream_tell(bc) == 0);
+    av_assert1(get_bits_count(gb) == 0);
 
-    header_size = mlp_get_major_sync_size(bc->buffer, bc->size_in_bits >> 3);
-    if (header_size < 0 || bc->size_in_bits < header_size << 3) {
+    header_size = mlp_get_major_sync_size(gb->buffer, gb->size_in_bits >> 3);
+    if (header_size < 0 || gb->size_in_bits < header_size << 3) {
         av_log(log, AV_LOG_ERROR, "packet too short, unable to read major sync\n");
         return -1;
     }
 
-    checksum = ff_mlp_checksum16(bc->buffer, header_size - 2);
-    if (checksum != AV_RL16(bc->buffer + header_size - 2)) {
+    checksum = ff_mlp_checksum16(gb->buffer, header_size - 2);
+    if (checksum != AV_RL16(gb->buffer+header_size-2)) {
         av_log(log, AV_LOG_ERROR, "major sync info header checksum error\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (bitstream_read(bc, 24) != 0xf8726f) /* Sync words */
+    if (get_bits_long(gb, 24) != 0xf8726f) /* Sync words */
         return AVERROR_INVALIDDATA;
 
-    mh->stream_type = bitstream_read(bc, 8);
+    mh->stream_type = get_bits(gb, 8);
     mh->header_size = header_size;
 
     if (mh->stream_type == 0xbb) {
-        mh->group1_bits = mlp_quants[bitstream_read(bc, 4)];
-        mh->group2_bits = mlp_quants[bitstream_read(bc, 4)];
+        mh->group1_bits = mlp_quants[get_bits(gb, 4)];
+        mh->group2_bits = mlp_quants[get_bits(gb, 4)];
 
-        ratebits = bitstream_read(bc, 4);
+        ratebits = get_bits(gb, 4);
         mh->group1_samplerate = mlp_samplerate(ratebits);
-        mh->group2_samplerate = mlp_samplerate(bitstream_read(bc, 4));
+        mh->group2_samplerate = mlp_samplerate(get_bits(gb, 4));
 
-        bitstream_skip(bc, 11);
+        skip_bits(gb, 11);
 
-        channel_arrangement    = bitstream_read(bc, 5);
+        mh->channel_arrangement=
+        channel_arrangement    = get_bits(gb, 5);
         mh->channels_mlp       = mlp_channels[channel_arrangement];
-        mh->channel_layout_mlp = mlp_layout[channel_arrangement];
+        mh->channel_layout_mlp = ff_mlp_layout[channel_arrangement];
     } else if (mh->stream_type == 0xba) {
         mh->group1_bits = 24; // TODO: Is this information actually conveyed anywhere?
         mh->group2_bits = 0;
 
-        ratebits = bitstream_read(bc, 4);
+        ratebits = get_bits(gb, 4);
         mh->group1_samplerate = mlp_samplerate(ratebits);
         mh->group2_samplerate = 0;
 
-        bitstream_skip(bc, 4);
+        skip_bits(gb, 4);
 
-        mh->channel_modifier_thd_stream0 = bitstream_read(bc, 2);
-        mh->channel_modifier_thd_stream1 = bitstream_read(bc, 2);
+        mh->channel_modifier_thd_stream0 = get_bits(gb, 2);
+        mh->channel_modifier_thd_stream1 = get_bits(gb, 2);
 
-        channel_arrangement            = bitstream_read(bc, 5);
+        mh->channel_arrangement=
+        channel_arrangement            = get_bits(gb, 5);
         mh->channels_thd_stream1       = truehd_channels(channel_arrangement);
-        mh->channel_layout_thd_stream1 = truehd_layout(channel_arrangement);
+        mh->channel_layout_thd_stream1 = ff_truehd_layout(channel_arrangement);
 
-        mh->channel_modifier_thd_stream2 = bitstream_read(bc, 2);
+        mh->channel_modifier_thd_stream2 = get_bits(gb, 2);
 
-        channel_arrangement            = bitstream_read(bc, 13);
+        channel_arrangement            = get_bits(gb, 13);
         mh->channels_thd_stream2       = truehd_channels(channel_arrangement);
-        mh->channel_layout_thd_stream2 = truehd_layout(channel_arrangement);
+        mh->channel_layout_thd_stream2 = ff_truehd_layout(channel_arrangement);
     } else
         return AVERROR_INVALIDDATA;
 
     mh->access_unit_size = 40 << (ratebits & 7);
     mh->access_unit_size_pow2 = 64 << (ratebits & 7);
 
-    bitstream_skip(bc, 48);
+    skip_bits_long(gb, 48);
 
-    mh->is_vbr = bitstream_read_bit(bc);
+    mh->is_vbr = get_bits1(gb);
 
-    mh->peak_bitrate = (bitstream_read(bc, 15) * mh->group1_samplerate + 8) >> 4;
+    mh->peak_bitrate = (get_bits(gb, 15) * mh->group1_samplerate + 8) >> 4;
 
-    mh->num_substreams = bitstream_read(bc, 4);
+    mh->num_substreams = get_bits(gb, 4);
 
-    bitstream_skip(bc, 4 + (header_size - 17) * 8);
+    skip_bits_long(gb, 4 + (header_size - 17) * 8);
 
     return 0;
 }
@@ -248,65 +249,78 @@ static int mlp_parse(AVCodecParserContext *s,
     int sync_present;
     uint8_t parity_bits;
     int next;
+    int ret;
     int i, p = 0;
 
     *poutbuf_size = 0;
     if (buf_size == 0)
         return 0;
 
-    if (!mp->in_sync) {
-        // Not in sync - find a major sync header
-
-        for (i = 0; i < buf_size; i++) {
-            mp->pc.state = (mp->pc.state << 8) | buf[i];
-            if ((mp->pc.state & 0xfffffffe) == 0xf8726fba &&
-                // ignore if we do not have the data for the start of header
-                mp->pc.index + i >= 7) {
-                mp->in_sync = 1;
-                mp->bytes_left = 0;
-                break;
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        if (!mp->in_sync) {
+            // Not in sync - find a major sync header
+
+            for (i = 0; i < buf_size; i++) {
+                mp->pc.state = (mp->pc.state << 8) | buf[i];
+                if ((mp->pc.state & 0xfffffffe) == 0xf8726fba &&
+                    // ignore if we do not have the data for the start of header
+                    mp->pc.index + i >= 7) {
+                    mp->in_sync = 1;
+                    mp->bytes_left = 0;
+                    break;
+                }
             }
-        }
 
-        if (!mp->in_sync) {
-            ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size);
-            return buf_size;
+            if (!mp->in_sync) {
+                if (ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size) != -1)
+                    av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
+                return buf_size;
+            }
+
+            if ((ret = ff_combine_frame(&mp->pc, i - 7, &buf, &buf_size)) < 0) {
+                av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
+                return ret;
+            }
+
+            return i - 7;
         }
 
-        ff_combine_frame(&mp->pc, i - 7, &buf, &buf_size);
+        if (mp->bytes_left == 0) {
+            // Find length of this packet
 
-        return i - 7;
-    }
+            /* Copy overread bytes from last frame into buffer. */
+            for(; mp->pc.overread>0; mp->pc.overread--) {
+                mp->pc.buffer[mp->pc.index++]= mp->pc.buffer[mp->pc.overread_index++];
+            }
 
-    if (mp->bytes_left == 0) {
-        // Find length of this packet
+            if (mp->pc.index + buf_size < 2) {
+                if (ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size) != -1)
+                    av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
+                return buf_size;
+            }
 
-        /* Copy overread bytes from last frame into buffer. */
-        for(; mp->pc.overread>0; mp->pc.overread--) {
-            mp->pc.buffer[mp->pc.index++]= mp->pc.buffer[mp->pc.overread_index++];
+            mp->bytes_left = ((mp->pc.index > 0 ? mp->pc.buffer[0] : buf[0]) << 8)
+                           |  (mp->pc.index > 1 ? mp->pc.buffer[1] : buf[1-mp->pc.index]);
+            mp->bytes_left = (mp->bytes_left & 0xfff) * 2;
+            if (mp->bytes_left <= 0) { // prevent infinite loop
+                goto lost_sync;
+            }
+            mp->bytes_left -= mp->pc.index;
         }
 
-        if (mp->pc.index + buf_size < 2) {
-            ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size);
+        next = (mp->bytes_left > buf_size) ? END_NOT_FOUND : mp->bytes_left;
+
+        if (ff_combine_frame(&mp->pc, next, &buf, &buf_size) < 0) {
+            mp->bytes_left -= buf_size;
             return buf_size;
         }
 
-        mp->bytes_left = ((mp->pc.index > 0 ? mp->pc.buffer[0] : buf[0]) << 8)
-                       |  (mp->pc.index > 1 ? mp->pc.buffer[1] : buf[1-mp->pc.index]);
-        mp->bytes_left = (mp->bytes_left & 0xfff) * 2;
-        mp->bytes_left -= mp->pc.index;
+        mp->bytes_left = 0;
     }
 
-    next = (mp->bytes_left > buf_size) ? END_NOT_FOUND : mp->bytes_left;
-
-    if (ff_combine_frame(&mp->pc, next, &buf, &buf_size) < 0) {
-        mp->bytes_left -= buf_size;
-        return buf_size;
-    }
-
-    mp->bytes_left = 0;
-
-    sync_present = (AV_RB32(buf + 4) & 0xfffffffe) == 0xf8726fba;
+    sync_present = buf_size >= 8 && (AV_RB32(buf + 4) & 0xfffffffe) == 0xf8726fba;
 
     if (!sync_present) {
         /* The first nibble of a frame is a parity check of the 4-byte
@@ -329,11 +343,11 @@ static int mlp_parse(AVCodecParserContext *s,
             goto lost_sync;
         }
     } else {
-        BitstreamContext bc;
+        GetBitContext gb;
         MLPHeaderInfo mh;
 
-        bitstream_init8(&bc, buf + 4, buf_size - 4);
-        if (ff_mlp_read_major_sync(avctx, &mh, &bc) < 0)
+        init_get_bits(&gb, buf + 4, (buf_size - 4) << 3);
+        if (ff_mlp_read_major_sync(avctx, &mh, &gb) < 0)
             goto lost_sync;
 
         avctx->bits_per_raw_sample = mh.group1_bits;
@@ -344,6 +358,7 @@ static int mlp_parse(AVCodecParserContext *s,
         avctx->sample_rate = mh.group1_samplerate;
         s->duration = mh.access_unit_size;
 
+        if(!avctx->channels || !avctx->channel_layout) {
         if (mh.stream_type == 0xbb) {
             /* MLP stream */
             avctx->channels       = mh.channels_mlp;
@@ -358,6 +373,7 @@ static int mlp_parse(AVCodecParserContext *s,
                 avctx->channel_layout = mh.channel_layout_thd_stream2;
             }
         }
+        }
 
         if (!mh.is_vbr) /* Stream is CBR */
             avctx->bit_rate = mh.peak_bitrate;
diff --git a/libavcodec/mlp_parser.h b/libavcodec/mlp_parser.h
index 871b96d..c5a2883 100644
--- a/libavcodec/mlp_parser.h
+++ b/libavcodec/mlp_parser.h
@@ -2,20 +2,20 @@
  * MLP parser prototypes
  * Copyright (c) 2007 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #ifndef AVCODEC_MLP_PARSER_H
 #define AVCODEC_MLP_PARSER_H
 
-#include "bitstream.h"
+#include "get_bits.h"
 
 typedef struct MLPHeaderInfo
 {
@@ -40,6 +40,8 @@ typedef struct MLPHeaderInfo
     int group1_samplerate;                  ///< Sample rate of first substream
     int group2_samplerate;                  ///< Sample rate of second substream (MLP only)
 
+    int channel_arrangement;
+
     int channel_modifier_thd_stream0;       ///< Channel modifier for substream 0 of TrueHD streams ("2-channel presentation")
     int channel_modifier_thd_stream1;       ///< Channel modifier for substream 1 of TrueHD streams ("6-channel presentation")
     int channel_modifier_thd_stream2;       ///< Channel modifier for substream 2 of TrueHD streams ("8-channel presentation")
@@ -61,6 +63,9 @@ typedef struct MLPHeaderInfo
 } MLPHeaderInfo;
 
 
-int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, BitstreamContext *bc);
+int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb);
+uint64_t ff_truehd_layout(int chanmap);
+
+extern const uint64_t ff_mlp_layout[32];
 
 #endif /* AVCODEC_MLP_PARSER_H */
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index b377fd7..3139a01 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -2,20 +2,20 @@
  * MLP decoder
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,20 +26,18 @@
 
 #include <stdint.h>
 
+#include "avcodec.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/channel_layout.h"
-#include "libavutil/crc.h"
-
-#include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
+#include "libavutil/crc.h"
 #include "parser.h"
 #include "mlp_parser.h"
 #include "mlpdsp.h"
 #include "mlp.h"
 #include "config.h"
-#include "vlc.h"
 
 /** number of bits used for VLC lookup - longest Huffman code is 9 */
 #if ARCH_ARM
@@ -107,7 +105,7 @@ typedef struct SubStream {
     /// Whether the LSBs of the matrix output are encoded in the bitstream.
     uint8_t     lsb_bypass[MAX_MATRICES];
     /// Matrix coefficients, stored as 2.14 fixed point.
-    int32_t     matrix_coeff[MAX_MATRICES][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, matrix_coeff)[MAX_MATRICES][MAX_CHANNELS];
     /// Left shift to apply to noise values in 0x31eb substreams.
     uint8_t     matrix_noise_shift[MAX_MATRICES];
     //@}
@@ -146,6 +144,9 @@ typedef struct MLPDecodeContext {
     /// Index of the last substream to decode - further substreams are skipped.
     uint8_t     max_decoded_substream;
 
+    /// Stream needs channel reordering to comply with FFmpeg's channel order
+    uint8_t     needs_reordering;
+
     /// number of PCM samples contained in each frame
     int         access_unit_size;
     /// next power of two above the number of samples in each frame
@@ -158,7 +159,7 @@ typedef struct MLPDecodeContext {
 
     int8_t      noise_buffer[MAX_BLOCKSIZE_POW2];
     int8_t      bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS];
-    int32_t     sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, sample_buffer)[MAX_BLOCKSIZE][MAX_CHANNELS];
 
     MLPDSPContext dsp;
 } MLPDecodeContext;
@@ -240,7 +241,7 @@ static inline int32_t calculate_sign_huff(MLPDecodeContext *m,
 /** Read a sample, consisting of either, both or neither of entropy-coded MSBs
  *  and plain LSBs. */
 
-static inline int read_huff_channels(MLPDecodeContext *m, BitstreamContext *bc,
+static inline int read_huff_channels(MLPDecodeContext *m, GetBitContext *gbp,
                                      unsigned int substr, unsigned int pos)
 {
     SubStream *s = &m->substream[substr];
@@ -248,7 +249,7 @@ static inline int read_huff_channels(MLPDecodeContext *m, BitstreamContext *bc,
 
     for (mat = 0; mat < s->num_primitive_matrices; mat++)
         if (s->lsb_bypass[mat])
-            m->bypassed_lsbs[pos + s->blockpos][mat] = bitstream_read_bit(bc);
+            m->bypassed_lsbs[pos + s->blockpos][mat] = get_bits1(gbp);
 
     for (channel = s->min_channel; channel <= s->max_channel; channel++) {
         ChannelParams *cp = &s->channel_params[channel];
@@ -258,18 +259,17 @@ static inline int read_huff_channels(MLPDecodeContext *m, BitstreamContext *bc,
         int result = 0;
 
         if (codebook > 0)
-            result = bitstream_read_vlc(bc, huff_vlc[codebook-1].table,
-                                        VLC_BITS,
-                                        (9 + VLC_BITS - 1) / VLC_BITS);
+            result = get_vlc2(gbp, huff_vlc[codebook-1].table,
+                            VLC_BITS, (9 + VLC_BITS - 1) / VLC_BITS);
 
         if (result < 0)
             return AVERROR_INVALIDDATA;
 
         if (lsb_bits > 0)
-            result = (result << lsb_bits) + bitstream_read(bc, lsb_bits);
+            result = (result << lsb_bits) + get_bits(gbp, lsb_bits);
 
         result  += cp->sign_huff_offset;
-        result <<= quant_step_size;
+        result *= 1 << quant_step_size;
 
         m->sample_buffer[pos + s->blockpos][channel] = result;
     }
@@ -296,12 +296,12 @@ static av_cold int mlp_decode_init(AVCodecContext *avctx)
  *  information is not actually necessary for decoding, only for playback.
  */
 
-static int read_major_sync(MLPDecodeContext *m, BitstreamContext *bc)
+static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
 {
     MLPHeaderInfo mh;
     int substr, ret;
 
-    if ((ret = ff_mlp_read_major_sync(m->avctx, &mh, bc)) != 0)
+    if ((ret = ff_mlp_read_major_sync(m->avctx, &mh, gb)) != 0)
         return ret;
 
     if (mh.group1_bits == 0) {
@@ -388,10 +388,22 @@ static int read_major_sync(MLPDecodeContext *m, BitstreamContext *bc)
      * substream is Stereo. Subsequent substreams' layouts are indicated in the
      * major sync. */
     if (m->avctx->codec_id == AV_CODEC_ID_MLP) {
+        if (mh.stream_type != 0xbb) {
+            avpriv_request_sample(m->avctx,
+                        "unexpected stream_type %X in MLP",
+                        mh.stream_type);
+            return AVERROR_PATCHWELCOME;
+        }
         if ((substr = (mh.num_substreams > 1)))
             m->substream[0].mask = AV_CH_LAYOUT_STEREO;
         m->substream[substr].mask = mh.channel_layout_mlp;
     } else {
+        if (mh.stream_type != 0xba) {
+            avpriv_request_sample(m->avctx,
+                        "unexpected stream_type %X in !MLP",
+                        mh.stream_type);
+            return AVERROR_PATCHWELCOME;
+        }
         if ((substr = (mh.num_substreams > 1)))
             m->substream[0].mask = AV_CH_LAYOUT_STEREO;
         if (mh.num_substreams > 2)
@@ -400,8 +412,17 @@ static int read_major_sync(MLPDecodeContext *m, BitstreamContext *bc)
             else
                 m->substream[2].mask = mh.channel_layout_thd_stream1;
         m->substream[substr].mask = mh.channel_layout_thd_stream1;
+
+        if (m->avctx->channels<=2 && m->substream[substr].mask == AV_CH_LAYOUT_MONO && m->max_decoded_substream == 1) {
+            av_log(m->avctx, AV_LOG_DEBUG, "Mono stream with 2 substreams, ignoring 2nd\n");
+            m->max_decoded_substream = 0;
+            if (m->avctx->channels==2)
+                m->avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+        }
     }
 
+    m->needs_reordering = mh.channel_arrangement >= 18 && mh.channel_arrangement <= 20;
+
     /* Parse the TrueHD decoder channel modifiers and set each substream's
      * AVMatrixEncoding accordingly.
      *
@@ -449,7 +470,7 @@ static int read_major_sync(MLPDecodeContext *m, BitstreamContext *bc)
  *  required to decode the audio that do not change very often. Generally
  *  (always) present only in blocks following a major sync. */
 
-static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
+static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
                                const uint8_t *buf, unsigned int substr)
 {
     SubStream *s = &m->substream[substr];
@@ -457,13 +478,13 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
     int sync_word, tmp;
     uint8_t checksum;
     uint8_t lossless_check;
-    int start_count = bitstream_tell(bc);
-    int min_channel, max_channel, max_matrix_channel;
+    int start_count = get_bits_count(gbp);
+    int min_channel, max_channel, max_matrix_channel, noise_type;
     const int std_max_matrix_channel = m->avctx->codec_id == AV_CODEC_ID_MLP
                                      ? MAX_MATRIX_CHANNEL_MLP
                                      : MAX_MATRIX_CHANNEL_TRUEHD;
 
-    sync_word = bitstream_read(bc, 13);
+    sync_word = get_bits(gbp, 13);
 
     if (sync_word != 0x31ea >> 1) {
         av_log(m->avctx, AV_LOG_ERROR,
@@ -471,23 +492,23 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
         return AVERROR_INVALIDDATA;
     }
 
-    s->noise_type = bitstream_read_bit(bc);
+    noise_type = get_bits1(gbp);
 
-    if (m->avctx->codec_id == AV_CODEC_ID_MLP && s->noise_type) {
+    if (m->avctx->codec_id == AV_CODEC_ID_MLP && noise_type) {
         av_log(m->avctx, AV_LOG_ERROR, "MLP must have 0x31ea sync word.\n");
         return AVERROR_INVALIDDATA;
     }
 
-    bitstream_skip(bc, 16); /* Output timestamp */
+    skip_bits(gbp, 16); /* Output timestamp */
 
-    min_channel        = bitstream_read(bc, 4);
-    max_channel        = bitstream_read(bc, 4);
-    max_matrix_channel = bitstream_read(bc, 4);
+    min_channel        = get_bits(gbp, 4);
+    max_channel        = get_bits(gbp, 4);
+    max_matrix_channel = get_bits(gbp, 4);
 
     if (max_matrix_channel > std_max_matrix_channel) {
         av_log(m->avctx, AV_LOG_ERROR,
                "Max matrix channel cannot be greater than %d.\n",
-               max_matrix_channel);
+               std_max_matrix_channel);
         return AVERROR_INVALIDDATA;
     }
 
@@ -499,11 +520,11 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
 
     /* This should happen for TrueHD streams with >6 channels and MLP's noise
      * type. It is not yet known if this is allowed. */
-    if (s->max_channel > MAX_MATRIX_CHANNEL_MLP && !s->noise_type) {
+    if (max_channel > MAX_MATRIX_CHANNEL_MLP && !noise_type) {
         avpriv_request_sample(m->avctx,
                               "%d channels (more than the "
                               "maximum supported by the decoder)",
-                              s->max_channel + 2);
+                              max_channel + 2);
         return AVERROR_PATCHWELCOME;
     }
 
@@ -516,6 +537,7 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
     s->min_channel        = min_channel;
     s->max_channel        = max_channel;
     s->max_matrix_channel = max_matrix_channel;
+    s->noise_type         = noise_type;
 
     if (mlp_channel_layout_subset(m->avctx->request_channel_layout, s->mask) &&
         m->max_decoded_substream > substr) {
@@ -526,13 +548,13 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
         m->max_decoded_substream = substr;
     }
 
-    s->noise_shift   = bitstream_read(bc,  4);
-    s->noisegen_seed = bitstream_read(bc, 23);
+    s->noise_shift   = get_bits(gbp,  4);
+    s->noisegen_seed = get_bits(gbp, 23);
 
-    bitstream_skip(bc, 19);
+    skip_bits(gbp, 19);
 
-    s->data_check_present = bitstream_read_bit(bc);
-    lossless_check = bitstream_read(bc, 8);
+    s->data_check_present = get_bits1(gbp);
+    lossless_check = get_bits(gbp, 8);
     if (substr == m->max_decoded_substream
         && s->lossless_check_data != 0xffffffff) {
         tmp = xor_32_to_8(s->lossless_check_data);
@@ -542,12 +564,12 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
                    lossless_check, tmp);
     }
 
-    bitstream_skip(bc, 16);
+    skip_bits(gbp, 16);
 
     memset(s->ch_assign, 0, sizeof(s->ch_assign));
 
     for (ch = 0; ch <= s->max_matrix_channel; ch++) {
-        int ch_assign = bitstream_read(bc, 6);
+        int ch_assign = get_bits(gbp, 6);
         if (m->avctx->codec_id == AV_CODEC_ID_TRUEHD) {
             uint64_t channel = thd_channel_layout_extract_channel(s->mask,
                                                                   ch_assign);
@@ -563,9 +585,9 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
         s->ch_assign[ch_assign] = ch;
     }
 
-    checksum = ff_mlp_restart_checksum(buf, bitstream_tell(bc) - start_count);
+    checksum = ff_mlp_restart_checksum(buf, get_bits_count(gbp) - start_count);
 
-    if (checksum != bitstream_read(bc, 8))
+    if (checksum != get_bits(gbp, 8))
         av_log(m->avctx, AV_LOG_ERROR, "restart header checksum error\n");
 
     /* Set default decoding parameters. */
@@ -598,6 +620,20 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
                                                                s->output_shift,
                                                                s->max_matrix_channel,
                                                                m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+        if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) {
+            if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) ||
+                m->avctx->channel_layout == AV_CH_LAYOUT_5POINT0_BACK) {
+                int i = s->ch_assign[4];
+                s->ch_assign[4] = s->ch_assign[3];
+                s->ch_assign[3] = s->ch_assign[2];
+                s->ch_assign[2] = i;
+            } else if (m->avctx->channel_layout == AV_CH_LAYOUT_5POINT1_BACK) {
+                FFSWAP(int, s->ch_assign[2], s->ch_assign[4]);
+                FFSWAP(int, s->ch_assign[3], s->ch_assign[5]);
+            }
+        }
+
     }
 
     return 0;
@@ -605,7 +641,7 @@ static int read_restart_header(MLPDecodeContext *m, BitstreamContext *bc,
 
 /** Read parameters for one of the prediction filters. */
 
-static int read_filter_params(MLPDecodeContext *m, BitstreamContext *bc,
+static int read_filter_params(MLPDecodeContext *m, GetBitContext *gbp,
                               unsigned int substr, unsigned int channel,
                               unsigned int filter)
 {
@@ -616,14 +652,14 @@ static int read_filter_params(MLPDecodeContext *m, BitstreamContext *bc,
     int i, order;
 
     // Filter is 0 for FIR, 1 for IIR.
-    assert(filter < 2);
+    av_assert0(filter < 2);
 
     if (m->filter_changed[channel][filter]++ > 1) {
         av_log(m->avctx, AV_LOG_ERROR, "Filters may change only once per access unit.\n");
         return AVERROR_INVALIDDATA;
     }
 
-    order = bitstream_read(bc, 4);
+    order = get_bits(gbp, 4);
     if (order > max_order) {
         av_log(m->avctx, AV_LOG_ERROR,
                "%cIR filter order %d is greater than maximum %d.\n",
@@ -636,10 +672,10 @@ static int read_filter_params(MLPDecodeContext *m, BitstreamContext *bc,
         int32_t *fcoeff = s->channel_params[channel].coeff[filter];
         int coeff_bits, coeff_shift;
 
-        fp->shift = bitstream_read(bc, 4);
+        fp->shift = get_bits(gbp, 4);
 
-        coeff_bits  = bitstream_read(bc, 5);
-        coeff_shift = bitstream_read(bc, 3);
+        coeff_bits  = get_bits(gbp, 5);
+        coeff_shift = get_bits(gbp, 3);
         if (coeff_bits < 1 || coeff_bits > 16) {
             av_log(m->avctx, AV_LOG_ERROR,
                    "%cIR filter coeff_bits must be between 1 and 16.\n",
@@ -654,9 +690,9 @@ static int read_filter_params(MLPDecodeContext *m, BitstreamContext *bc,
         }
 
         for (i = 0; i < order; i++)
-            fcoeff[i] = bitstream_read_signed(bc, coeff_bits) << coeff_shift;
+            fcoeff[i] = get_sbits(gbp, coeff_bits) * (1 << coeff_shift);
 
-        if (bitstream_read_bit(bc)) {
+        if (get_bits1(gbp)) {
             int state_bits, state_shift;
 
             if (filter == FIR) {
@@ -665,13 +701,13 @@ static int read_filter_params(MLPDecodeContext *m, BitstreamContext *bc,
                 return AVERROR_INVALIDDATA;
             }
 
-            state_bits  = bitstream_read(bc, 4);
-            state_shift = bitstream_read(bc, 4);
+            state_bits  = get_bits(gbp, 4);
+            state_shift = get_bits(gbp, 4);
 
             /* TODO: Check validity of state data. */
 
             for (i = 0; i < order; i++)
-                fp->state[i] = bitstream_read_signed(bc, state_bits) << state_shift;
+                fp->state[i] = state_bits ? get_sbits(gbp, state_bits) * (1 << state_shift) : 0;
         }
     }
 
@@ -680,8 +716,7 @@ static int read_filter_params(MLPDecodeContext *m, BitstreamContext *bc,
 
 /** Read parameters for primitive matrices. */
 
-static int read_matrix_params(MLPDecodeContext *m, unsigned int substr,
-                              BitstreamContext *bc)
+static int read_matrix_params(MLPDecodeContext *m, unsigned int substr, GetBitContext *gbp)
 {
     SubStream *s = &m->substream[substr];
     unsigned int mat, ch;
@@ -694,31 +729,31 @@ static int read_matrix_params(MLPDecodeContext *m, unsigned int substr,
         return AVERROR_INVALIDDATA;
     }
 
-    s->num_primitive_matrices = bitstream_read(bc, 4);
+    s->num_primitive_matrices = get_bits(gbp, 4);
 
     if (s->num_primitive_matrices > max_primitive_matrices) {
         av_log(m->avctx, AV_LOG_ERROR,
                "Number of primitive matrices cannot be greater than %d.\n",
                max_primitive_matrices);
-        return AVERROR_INVALIDDATA;
+        goto error;
     }
 
     for (mat = 0; mat < s->num_primitive_matrices; mat++) {
         int frac_bits, max_chan;
-        s->matrix_out_ch[mat] = bitstream_read(bc, 4);
-        frac_bits             = bitstream_read(bc, 4);
-        s->lsb_bypass[mat]    = bitstream_read_bit(bc);
+        s->matrix_out_ch[mat] = get_bits(gbp, 4);
+        frac_bits             = get_bits(gbp, 4);
+        s->lsb_bypass   [mat] = get_bits1(gbp);
 
         if (s->matrix_out_ch[mat] > s->max_matrix_channel) {
             av_log(m->avctx, AV_LOG_ERROR,
                     "Invalid channel %d specified as output from matrix.\n",
                     s->matrix_out_ch[mat]);
-            return AVERROR_INVALIDDATA;
+            goto error;
         }
         if (frac_bits > 14) {
             av_log(m->avctx, AV_LOG_ERROR,
                     "Too many fractional bits specified.\n");
-            return AVERROR_INVALIDDATA;
+            goto error;
         }
 
         max_chan = s->max_matrix_channel;
@@ -727,25 +762,30 @@ static int read_matrix_params(MLPDecodeContext *m, unsigned int substr,
 
         for (ch = 0; ch <= max_chan; ch++) {
             int coeff_val = 0;
-            if (bitstream_read_bit(bc))
-                coeff_val = bitstream_read_signed(bc, frac_bits + 2);
+            if (get_bits1(gbp))
+                coeff_val = get_sbits(gbp, frac_bits + 2);
 
-            s->matrix_coeff[mat][ch] = coeff_val << (14 - frac_bits);
+            s->matrix_coeff[mat][ch] = coeff_val * (1 << (14 - frac_bits));
         }
 
         if (s->noise_type)
-            s->matrix_noise_shift[mat] = bitstream_read(bc, 4);
+            s->matrix_noise_shift[mat] = get_bits(gbp, 4);
         else
             s->matrix_noise_shift[mat] = 0;
     }
 
     return 0;
+error:
+    s->num_primitive_matrices = 0;
+    memset(s->matrix_out_ch, 0, sizeof(s->matrix_out_ch));
+
+    return AVERROR_INVALIDDATA;
 }
 
 /** Read channel parameters. */
 
 static int read_channel_params(MLPDecodeContext *m, unsigned int substr,
-                               BitstreamContext *bc, unsigned int ch)
+                               GetBitContext *gbp, unsigned int ch)
 {
     SubStream *s = &m->substream[substr];
     ChannelParams *cp = &s->channel_params[ch];
@@ -754,13 +794,13 @@ static int read_channel_params(MLPDecodeContext *m, unsigned int substr,
     int ret;
 
     if (s->param_presence_flags & PARAM_FIR)
-        if (bitstream_read_bit(bc))
-            if ((ret = read_filter_params(m, bc, substr, ch, FIR)) < 0)
+        if (get_bits1(gbp))
+            if ((ret = read_filter_params(m, gbp, substr, ch, FIR)) < 0)
                 return ret;
 
     if (s->param_presence_flags & PARAM_IIR)
-        if (bitstream_read_bit(bc))
-            if ((ret = read_filter_params(m, bc, substr, ch, IIR)) < 0)
+        if (get_bits1(gbp))
+            if ((ret = read_filter_params(m, gbp, substr, ch, IIR)) < 0)
                 return ret;
 
     if (fir->order + iir->order > 8) {
@@ -783,55 +823,60 @@ static int read_channel_params(MLPDecodeContext *m, unsigned int substr,
         fir->shift = iir->shift;
 
     if (s->param_presence_flags & PARAM_HUFFOFFSET)
-        if (bitstream_read_bit(bc))
-            cp->huff_offset = bitstream_read_signed(bc, 15);
+        if (get_bits1(gbp))
+            cp->huff_offset = get_sbits(gbp, 15);
 
-    cp->codebook  = bitstream_read(bc, 2);
-    cp->huff_lsbs = bitstream_read(bc, 5);
+    cp->codebook  = get_bits(gbp, 2);
+    cp->huff_lsbs = get_bits(gbp, 5);
 
     if (cp->huff_lsbs > 24) {
         av_log(m->avctx, AV_LOG_ERROR, "Invalid huff_lsbs.\n");
+        cp->huff_lsbs = 0;
         return AVERROR_INVALIDDATA;
     }
 
-    cp->sign_huff_offset = calculate_sign_huff(m, substr, ch);
-
     return 0;
 }
 
 /** Read decoding parameters that change more often than those in the restart
  *  header. */
 
-static int read_decoding_params(MLPDecodeContext *m, BitstreamContext *bc,
+static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
                                 unsigned int substr)
 {
     SubStream *s = &m->substream[substr];
     unsigned int ch;
-    int ret;
+    int ret = 0;
+    unsigned recompute_sho = 0;
 
     if (s->param_presence_flags & PARAM_PRESENCE)
-        if (bitstream_read_bit(bc))
-            s->param_presence_flags = bitstream_read(bc, 8);
+        if (get_bits1(gbp))
+            s->param_presence_flags = get_bits(gbp, 8);
 
     if (s->param_presence_flags & PARAM_BLOCKSIZE)
-        if (bitstream_read_bit(bc)) {
-            s->blocksize = bitstream_read(bc, 9);
+        if (get_bits1(gbp)) {
+            s->blocksize = get_bits(gbp, 9);
             if (s->blocksize < 8 || s->blocksize > m->access_unit_size) {
-                av_log(m->avctx, AV_LOG_ERROR, "Invalid blocksize.");
+                av_log(m->avctx, AV_LOG_ERROR, "Invalid blocksize.\n");
                 s->blocksize = 0;
                 return AVERROR_INVALIDDATA;
             }
         }
 
     if (s->param_presence_flags & PARAM_MATRIX)
-        if (bitstream_read_bit(bc))
-            if ((ret = read_matrix_params(m, substr, bc)) < 0)
+        if (get_bits1(gbp))
+            if ((ret = read_matrix_params(m, substr, gbp)) < 0)
                 return ret;
 
     if (s->param_presence_flags & PARAM_OUTSHIFT)
-        if (bitstream_read_bit(bc)) {
-            for (ch = 0; ch <= s->max_matrix_channel; ch++)
-                s->output_shift[ch] = bitstream_read_signed(bc, 4);
+        if (get_bits1(gbp)) {
+            for (ch = 0; ch <= s->max_matrix_channel; ch++) {
+                s->output_shift[ch] = get_sbits(gbp, 4);
+                if (s->output_shift[ch] < 0) {
+                    avpriv_request_sample(m->avctx, "Negative output_shift");
+                    s->output_shift[ch] = 0;
+                }
+            }
             if (substr == m->max_decoded_substream)
                 m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
                                                                        s->output_shift,
@@ -840,24 +885,41 @@ static int read_decoding_params(MLPDecodeContext *m, BitstreamContext *bc,
         }
 
     if (s->param_presence_flags & PARAM_QUANTSTEP)
-        if (bitstream_read_bit(bc))
+        if (get_bits1(gbp))
             for (ch = 0; ch <= s->max_channel; ch++) {
-                ChannelParams *cp = &s->channel_params[ch];
-
-                s->quant_step_size[ch] = bitstream_read(bc, 4);
+                s->quant_step_size[ch] = get_bits(gbp, 4);
 
-                cp->sign_huff_offset = calculate_sign_huff(m, substr, ch);
+                recompute_sho |= 1<<ch;
             }
 
     for (ch = s->min_channel; ch <= s->max_channel; ch++)
-        if (bitstream_read_bit(bc))
-            if ((ret = read_channel_params(m, substr, bc, ch)) < 0)
-                return ret;
+        if (get_bits1(gbp)) {
+            recompute_sho |= 1<<ch;
+            if ((ret = read_channel_params(m, substr, gbp, ch)) < 0)
+                goto fail;
+        }
 
-    return 0;
+
+fail:
+    for (ch = 0; ch <= s->max_channel; ch++) {
+        if (recompute_sho & (1<<ch)) {
+            ChannelParams *cp = &s->channel_params[ch];
+
+            if (cp->codebook > 0 && cp->huff_lsbs < s->quant_step_size[ch]) {
+                if (ret >= 0) {
+                    av_log(m->avctx, AV_LOG_ERROR, "quant_step_size larger than huff_lsbs\n");
+                    ret = AVERROR_INVALIDDATA;
+                }
+                s->quant_step_size[ch] = 0;
+            }
+
+            cp->sign_huff_offset = calculate_sign_huff(m, substr, ch);
+        }
+    }
+    return ret;
 }
 
-#define MSB_MASK(bits)  (-1u << bits)
+#define MSB_MASK(bits)  (-1u << (bits))
 
 /** Generate PCM samples using the prediction filters and residual values
  *  read from the data stream, and update the filter state. */
@@ -889,7 +951,7 @@ static void filter_channel(MLPDecodeContext *m, unsigned int substr,
 
 /** Read a block of PCM residual data (or actual if no filtering active). */
 
-static int read_block_data(MLPDecodeContext *m, BitstreamContext *bc,
+static int read_block_data(MLPDecodeContext *m, GetBitContext *gbp,
                            unsigned int substr)
 {
     SubStream *s = &m->substream[substr];
@@ -897,8 +959,8 @@ static int read_block_data(MLPDecodeContext *m, BitstreamContext *bc,
     int ret;
 
     if (s->data_check_present) {
-        expected_stream_pos  = bitstream_tell(bc);
-        expected_stream_pos += bitstream_read(bc, 16);
+        expected_stream_pos  = get_bits_count(gbp);
+        expected_stream_pos += get_bits(gbp, 16);
         avpriv_request_sample(m->avctx,
                               "Substreams with VLC block size check info");
     }
@@ -912,7 +974,7 @@ static int read_block_data(MLPDecodeContext *m, BitstreamContext *bc,
            s->blocksize * sizeof(m->bypassed_lsbs[0]));
 
     for (i = 0; i < s->blocksize; i++)
-        if ((ret = read_huff_channels(m, bc, substr, i)) < 0)
+        if ((ret = read_huff_channels(m, gbp, substr, i)) < 0)
             return ret;
 
     for (ch = s->min_channel; ch <= s->max_channel; ch++)
@@ -921,9 +983,9 @@ static int read_block_data(MLPDecodeContext *m, BitstreamContext *bc,
     s->blockpos += s->blocksize;
 
     if (s->data_check_present) {
-        if (bitstream_tell(bc) != expected_stream_pos)
+        if (get_bits_count(gbp) != expected_stream_pos)
             av_log(m->avctx, AV_LOG_ERROR, "block data length mismatch\n");
-        bitstream_skip(bc, 8);
+        skip_bits(gbp, 8);
     }
 
     return 0;
@@ -969,8 +1031,8 @@ static void generate_2_noise_channels(MLPDecodeContext *m, unsigned int substr)
 
     for (i = 0; i < s->blockpos; i++) {
         uint16_t seed_shr7 = seed >> 7;
-        m->sample_buffer[i][maxchan+1] = ((int8_t)(seed >> 15)) << s->noise_shift;
-        m->sample_buffer[i][maxchan+2] = ((int8_t) seed_shr7)   << s->noise_shift;
+        m->sample_buffer[i][maxchan+1] = ((int8_t)(seed >> 15)) * (1 << s->noise_shift);
+        m->sample_buffer[i][maxchan+2] = ((int8_t) seed_shr7)   * (1 << s->noise_shift);
 
         seed = (seed << 16) ^ seed_shr7 ^ (seed_shr7 << 5);
     }
@@ -995,15 +1057,27 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
     s->noisegen_seed = seed;
 }
 
+/** Write the audio data into the output buffer. */
 
-/** Apply the channel matrices in turn to reconstruct the original audio
- *  samples. */
-
-static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
+static int output_data(MLPDecodeContext *m, unsigned int substr,
+                       AVFrame *frame, int *got_frame_ptr)
 {
+    AVCodecContext *avctx = m->avctx;
     SubStream *s = &m->substream[substr];
     unsigned int mat;
     unsigned int maxchan;
+    int ret;
+    int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+    if (m->avctx->channels != s->max_matrix_channel + 1) {
+        av_log(m->avctx, AV_LOG_ERROR, "channel count mismatch\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!s->blockpos) {
+        av_log(avctx, AV_LOG_ERROR, "No samples to output.\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     maxchan = s->max_matrix_channel;
     if (!s->noise_type) {
@@ -1013,6 +1087,8 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
         fill_noise_buffer(m, substr);
     }
 
+    /* Apply the channel matrices in turn to reconstruct the original audio
+     * samples. */
     for (mat = 0; mat < s->num_primitive_matrices; mat++) {
         unsigned int dest_ch = s->matrix_out_ch[mat];
         m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
@@ -1027,34 +1103,11 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
                                     m->access_unit_size_pow2,
                                     MSB_MASK(s->quant_step_size[dest_ch]));
     }
-}
-
-/** Write the audio data into the output buffer. */
-
-static int output_data(MLPDecodeContext *m, unsigned int substr,
-                       AVFrame *frame, int *got_frame_ptr)
-{
-    AVCodecContext *avctx = m->avctx;
-    SubStream *s = &m->substream[substr];
-    int ret;
-    int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
-
-    if (m->avctx->channels != s->max_matrix_channel + 1) {
-        av_log(m->avctx, AV_LOG_ERROR, "channel count mismatch\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if (!s->blockpos) {
-        av_log(avctx, AV_LOG_ERROR, "No samples to output.\n");
-        return AVERROR_INVALIDDATA;
-    }
 
     /* get output buffer */
     frame->nb_samples = s->blockpos;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
                                                     s->blockpos,
                                                     m->sample_buffer,
@@ -1083,7 +1136,7 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     MLPDecodeContext *m = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
     unsigned int length, substr;
     unsigned int substream_start;
     unsigned int header_size = 4;
@@ -1094,18 +1147,18 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
     int ret;
 
     if (buf_size < 4)
-        return 0;
+        return AVERROR_INVALIDDATA;
 
     length = (AV_RB16(buf) & 0xfff) * 2;
 
     if (length < 4 || length > buf_size)
         return AVERROR_INVALIDDATA;
 
-    bitstream_init8(&bc, buf + 4, length - 4);
+    init_get_bits(&gb, (buf + 4), (length - 4) * 8);
 
     m->is_major_sync_unit = 0;
-    if (bitstream_peek(&bc, 31) == (0xf8726fba >> 1)) {
-        if (read_major_sync(m, &bc) < 0)
+    if (show_bits_long(&gb, 31) == (0xf8726fba >> 1)) {
+        if (read_major_sync(m, &gb) < 0)
             goto error;
         m->is_major_sync_unit = 1;
         header_size += m->major_sync_header_size;
@@ -1123,12 +1176,12 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
     for (substr = 0; substr < m->num_substreams; substr++) {
         int extraword_present, checkdata_present, end, nonrestart_substr;
 
-        extraword_present = bitstream_read_bit(&bc);
-        nonrestart_substr = bitstream_read_bit(&bc);
-        checkdata_present = bitstream_read_bit(&bc);
-        bitstream_skip(&bc, 1);
+        extraword_present = get_bits1(&gb);
+        nonrestart_substr = get_bits1(&gb);
+        checkdata_present = get_bits1(&gb);
+        skip_bits1(&gb);
 
-        end = bitstream_read(&bc, 12) * 2;
+        end = get_bits(&gb, 12) * 2;
 
         substr_header_size += 2;
 
@@ -1137,10 +1190,15 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
                 av_log(m->avctx, AV_LOG_ERROR, "There must be no extraword for MLP.\n");
                 goto error;
             }
-            bitstream_skip(&bc, 16);
+            skip_bits(&gb, 16);
             substr_header_size += 2;
         }
 
+        if (length < header_size + substr_header_size) {
+            av_log(m->avctx, AV_LOG_ERROR, "Insufficient data for headers\n");
+            goto error;
+        }
+
         if (!(nonrestart_substr ^ m->is_major_sync_unit)) {
             av_log(m->avctx, AV_LOG_ERROR, "Invalid nonrestart_substr.\n");
             goto error;
@@ -1181,47 +1239,47 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
 
     for (substr = 0; substr <= m->max_decoded_substream; substr++) {
         SubStream *s = &m->substream[substr];
-        bitstream_init8(&bc, buf, substream_data_len[substr]);
+        init_get_bits(&gb, buf, substream_data_len[substr] * 8);
 
         m->matrix_changed = 0;
         memset(m->filter_changed, 0, sizeof(m->filter_changed));
 
         s->blockpos = 0;
         do {
-            if (bitstream_read_bit(&bc)) {
-                if (bitstream_read_bit(&bc)) {
+            if (get_bits1(&gb)) {
+                if (get_bits1(&gb)) {
                     /* A restart header should be present. */
-                    if (read_restart_header(m, &bc, buf, substr) < 0)
+                    if (read_restart_header(m, &gb, buf, substr) < 0)
                         goto next_substr;
                     s->restart_seen = 1;
                 }
 
                 if (!s->restart_seen)
                     goto next_substr;
-                if (read_decoding_params(m, &bc, substr) < 0)
+                if (read_decoding_params(m, &gb, substr) < 0)
                     goto next_substr;
             }
 
             if (!s->restart_seen)
                 goto next_substr;
 
-            if ((ret = read_block_data(m, &bc, substr)) < 0)
+            if ((ret = read_block_data(m, &gb, substr)) < 0)
                 return ret;
 
-            if (bitstream_tell(&bc) >= substream_data_len[substr] * 8)
+            if (get_bits_count(&gb) >= substream_data_len[substr] * 8)
                 goto substream_length_mismatch;
 
-        } while (!bitstream_read_bit(&bc));
+        } while (!get_bits1(&gb));
 
-        bitstream_skip(&bc, (-bitstream_tell(&bc)) & 15);
+        skip_bits(&gb, (-get_bits_count(&gb)) & 15);
 
-        if (substream_data_len[substr] * 8 - bitstream_tell(&bc) >= 32) {
+        if (substream_data_len[substr] * 8 - get_bits_count(&gb) >= 32) {
             int shorten_by;
 
-            if (bitstream_read(&bc, 16) != 0xD234)
+            if (get_bits(&gb, 16) != 0xD234)
                 return AVERROR_INVALIDDATA;
 
-            shorten_by = bitstream_read(&bc, 16);
+            shorten_by = get_bits(&gb, 16);
             if      (m->avctx->codec_id == AV_CODEC_ID_TRUEHD && shorten_by  & 0x2000)
                 s->blockpos -= FFMIN(shorten_by & 0x1FFF, s->blockpos);
             else if (m->avctx->codec_id == AV_CODEC_ID_MLP    && shorten_by != 0xD234)
@@ -1234,19 +1292,19 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
         if (substream_parity_present[substr]) {
             uint8_t parity, checksum;
 
-            if (substream_data_len[substr] * 8 - bitstream_tell(&bc) != 16)
+            if (substream_data_len[substr] * 8 - get_bits_count(&gb) != 16)
                 goto substream_length_mismatch;
 
             parity   = ff_mlp_calculate_parity(buf, substream_data_len[substr] - 2);
             checksum = ff_mlp_checksum8       (buf, substream_data_len[substr] - 2);
 
-            if ((bitstream_read(&bc, 8) ^ parity) != 0xa9)
+            if ((get_bits(&gb, 8) ^ parity) != 0xa9    )
                 av_log(m->avctx, AV_LOG_ERROR, "Substream %d parity check failed.\n", substr);
-            if (bitstream_read(&bc, 8) != checksum)
+            if ( get_bits(&gb, 8)           != checksum)
                 av_log(m->avctx, AV_LOG_ERROR, "Substream %d checksum failed.\n"    , substr);
         }
 
-        if (substream_data_len[substr] * 8 != bitstream_tell(&bc))
+        if (substream_data_len[substr] * 8 != get_bits_count(&gb))
             goto substream_length_mismatch;
 
 next_substr:
@@ -1257,8 +1315,6 @@ next_substr:
         buf += substream_data_len[substr];
     }
 
-    rematrix_channels(m, m->max_decoded_substream);
-
     if ((ret = output_data(m, m->max_decoded_substream, data, got_frame_ptr)) < 0)
         return ret;
 
@@ -1273,6 +1329,7 @@ error:
     return AVERROR_INVALIDDATA;
 }
 
+#if CONFIG_MLP_DECODER
 AVCodec ff_mlp_decoder = {
     .name           = "mlp",
     .long_name      = NULL_IF_CONFIG_SMALL("MLP (Meridian Lossless Packing)"),
@@ -1283,7 +1340,7 @@ AVCodec ff_mlp_decoder = {
     .decode         = read_access_unit,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
 #if CONFIG_TRUEHD_DECODER
 AVCodec ff_truehd_decoder = {
     .name           = "truehd",
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index aded554..32a4503 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2007-2008 Ian Caulfield
  *               2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -113,11 +113,11 @@ int32_t ff_mlp_pack_output(int32_t lossless_check_data,
     for (i = 0; i < blockpos; i++) {
         for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
             int mat_ch = ch_assign[out_ch];
-            int32_t sample = sample_buffer[i][mat_ch]
-                          << output_shift[mat_ch];
+            int32_t sample = sample_buffer[i][mat_ch] *
+                          (1U << output_shift[mat_ch]);
             lossless_check_data ^= (sample & 0xffffff) << mat_ch;
             if (is32)
-                *data_32++ = sample << 8;
+                *data_32++ = sample * 256U;
             else
                 *data_16++ = sample >> 8;
         }
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index acd48fc..a0edeb7 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -2,20 +2,20 @@
  * MLP codec common header file
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlpenc.c b/libavcodec/mlpenc.c
new file mode 100644
index 0000000..7536d3b
--- /dev/null
+++ b/libavcodec/mlpenc.c
@@ -0,0 +1,2416 @@
+/**
+ * MLP encoder
+ * Copyright (c) 2008 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "audio_frame_queue.h"
+#include "libavutil/crc.h"
+#include "libavutil/avstring.h"
+#include "libavutil/samplefmt.h"
+#include "mlp.h"
+#include "lpc.h"
+
+#define MAJOR_HEADER_INTERVAL 16
+
+#define MLP_MIN_LPC_ORDER      1
+#define MLP_MAX_LPC_ORDER      8
+#define MLP_MIN_LPC_SHIFT      8
+#define MLP_MAX_LPC_SHIFT     15
+
+typedef struct {
+    uint8_t         min_channel;         ///< The index of the first channel coded in this substream.
+    uint8_t         max_channel;         ///< The index of the last channel coded in this substream.
+    uint8_t         max_matrix_channel;  ///< The number of channels input into the rematrix stage.
+
+    uint8_t         noise_shift;         ///< The left shift applied to random noise in 0x31ea substreams.
+    uint32_t        noisegen_seed;       ///< The current seed value for the pseudorandom noise generator(s).
+
+    int             data_check_present;  ///< Set if the substream contains extra info to check the size of VLC blocks.
+
+    int32_t         lossless_check_data; ///< XOR of all output samples
+
+    uint8_t         max_huff_lsbs;       ///< largest huff_lsbs
+    uint8_t         max_output_bits;     ///< largest output bit-depth
+} RestartHeader;
+
+typedef struct {
+    uint8_t         count;                  ///< number of matrices to apply
+
+    uint8_t         outch[MAX_MATRICES];    ///< output channel for each matrix
+    int32_t         forco[MAX_MATRICES][MAX_CHANNELS+2];    ///< forward coefficients
+    int32_t         coeff[MAX_MATRICES][MAX_CHANNELS+2];    ///< decoding coefficients
+    uint8_t         fbits[MAX_CHANNELS];    ///< fraction bits
+
+    int8_t          shift[MAX_CHANNELS];    ///< Left shift to apply to decoded PCM values to get final 24-bit output.
+} MatrixParams;
+
+enum ParamFlags {
+    PARAMS_DEFAULT       = 0xff,
+    PARAM_PRESENCE_FLAGS = 1 << 8,
+    PARAM_BLOCKSIZE      = 1 << 7,
+    PARAM_MATRIX         = 1 << 6,
+    PARAM_OUTSHIFT       = 1 << 5,
+    PARAM_QUANTSTEP      = 1 << 4,
+    PARAM_FIR            = 1 << 3,
+    PARAM_IIR            = 1 << 2,
+    PARAM_HUFFOFFSET     = 1 << 1,
+    PARAM_PRESENT        = 1 << 0,
+};
+
+typedef struct {
+    uint16_t        blocksize;                  ///< number of PCM samples in current audio block
+    uint8_t         quant_step_size[MAX_CHANNELS];  ///< left shift to apply to Huffman-decoded residuals
+
+    MatrixParams    matrix_params;
+
+    uint8_t         param_presence_flags;       ///< Bitmask of which parameter sets are conveyed in a decoding parameter block.
+} DecodingParams;
+
+typedef struct BestOffset {
+    int16_t offset;
+    int bitcount;
+    int lsb_bits;
+    int16_t min;
+    int16_t max;
+} BestOffset;
+
+#define HUFF_OFFSET_MIN    -16384
+#define HUFF_OFFSET_MAX     16383
+
+/** Number of possible codebooks (counting "no codebooks") */
+#define NUM_CODEBOOKS       4
+
+typedef struct {
+    AVCodecContext *avctx;
+
+    int             num_substreams;         ///< Number of substreams contained within this stream.
+
+    int             num_channels;   /**< Number of channels in major_scratch_buffer.
+                                     *   Normal channels + noise channels. */
+
+    int             coded_sample_fmt [2];   ///< sample format encoded for MLP
+    int             coded_sample_rate[2];   ///< sample rate encoded for MLP
+    int             coded_peak_bitrate;     ///< peak bitrate for this major sync header
+
+    int             flags;                  ///< major sync info flags
+
+    /* channel_meaning */
+    int             substream_info;
+    int             fs;
+    int             wordlength;
+    int             channel_occupancy;
+    int             summary_info;
+
+    int32_t        *inout_buffer;           ///< Pointer to data currently being read from lavc or written to bitstream.
+    int32_t        *major_inout_buffer;     ///< Buffer with all in/out data for one entire major frame interval.
+    int32_t        *write_buffer;           ///< Pointer to data currently being written to bitstream.
+    int32_t        *sample_buffer;          ///< Pointer to current access unit samples.
+    int32_t        *major_scratch_buffer;   ///< Scratch buffer big enough to fit all data for one entire major frame interval.
+    int32_t        *last_frame;             ///< Pointer to last frame with data to encode.
+
+    int32_t        *lpc_sample_buffer;
+
+    unsigned int    major_number_of_frames;
+    unsigned int    next_major_number_of_frames;
+
+    unsigned int    major_frame_size;       ///< Number of samples in current major frame being encoded.
+    unsigned int    next_major_frame_size;  ///< Counter of number of samples for next major frame.
+
+    int32_t        *lossless_check_data;    ///< Array with lossless_check_data for each access unit.
+
+    unsigned int   *max_output_bits;        ///< largest output bit-depth
+    unsigned int   *frame_size;             ///< Array with number of samples/channel in each access unit.
+    unsigned int    frame_index;            ///< Index of current frame being encoded.
+
+    unsigned int    one_sample_buffer_size; ///< Number of samples*channel for one access unit.
+
+    unsigned int    max_restart_interval;   ///< Max interval of access units in between two major frames.
+    unsigned int    min_restart_interval;   ///< Min interval of access units in between two major frames.
+    unsigned int    restart_intervals;      ///< Number of possible major frame sizes.
+
+    uint16_t        timestamp;              ///< Timestamp of current access unit.
+    uint16_t        dts;                    ///< Decoding timestamp of current access unit.
+
+    uint8_t         channel_arrangement;    ///< channel arrangement for MLP streams
+
+    uint8_t         ch_modifier_thd0;       ///< channel modifier for TrueHD stream 0
+    uint8_t         ch_modifier_thd1;       ///< channel modifier for TrueHD stream 1
+    uint8_t         ch_modifier_thd2;       ///< channel modifier for TrueHD stream 2
+
+    unsigned int    seq_size  [MAJOR_HEADER_INTERVAL];
+    unsigned int    seq_offset[MAJOR_HEADER_INTERVAL];
+    unsigned int    sequence_size;
+
+    ChannelParams  *channel_params;
+
+    BestOffset      best_offset[MAJOR_HEADER_INTERVAL+1][MAX_CHANNELS][NUM_CODEBOOKS];
+
+    DecodingParams *decoding_params;
+    RestartHeader   restart_header [MAX_SUBSTREAMS];
+
+    ChannelParams   major_channel_params[MAJOR_HEADER_INTERVAL+1][MAX_CHANNELS];       ///< ChannelParams to be written to bitstream.
+    DecodingParams  major_decoding_params[MAJOR_HEADER_INTERVAL+1][MAX_SUBSTREAMS];    ///< DecodingParams to be written to bitstream.
+    int             major_params_changed[MAJOR_HEADER_INTERVAL+1][MAX_SUBSTREAMS];     ///< params_changed to be written to bitstream.
+
+    unsigned int    major_cur_subblock_index;
+    unsigned int    major_filter_state_subblock;
+    unsigned int    major_number_of_subblocks;
+
+    BestOffset    (*cur_best_offset)[NUM_CODEBOOKS];
+    ChannelParams  *cur_channel_params;
+    DecodingParams *cur_decoding_params;
+    RestartHeader  *cur_restart_header;
+
+    AudioFrameQueue afq;
+
+    /* Analysis stage. */
+    unsigned int    starting_frame_index;
+    unsigned int    number_of_frames;
+    unsigned int    number_of_samples;
+    unsigned int    number_of_subblocks;
+    unsigned int    seq_index;              ///< Sequence index for high compression levels.
+
+    ChannelParams  *prev_channel_params;
+    DecodingParams *prev_decoding_params;
+
+    ChannelParams  *seq_channel_params;
+    DecodingParams *seq_decoding_params;
+
+    unsigned int    max_codebook_search;
+
+    LPCContext      lpc_ctx;
+} MLPEncodeContext;
+
+static ChannelParams   restart_channel_params[MAX_CHANNELS];
+static DecodingParams  restart_decoding_params[MAX_SUBSTREAMS];
+static BestOffset      restart_best_offset[NUM_CODEBOOKS] = {{0}};
+
+#define SYNC_MAJOR      0xf8726f
+#define MAJOR_SYNC_INFO_SIGNATURE   0xB752
+
+#define SYNC_MLP        0xbb
+#define SYNC_TRUEHD     0xba
+
+/* must be set for DVD-A */
+#define FLAGS_DVDA      0x4000
+/* FIFO delay must be constant */
+#define FLAGS_CONST     0x8000
+
+#define SUBSTREAM_INFO_MAX_2_CHAN   0x01
+#define SUBSTREAM_INFO_HIGH_RATE    0x02
+#define SUBSTREAM_INFO_ALWAYS_SET   0x04
+#define SUBSTREAM_INFO_2_SUBSTREAMS 0x08
+
+/****************************************************************************
+ ************ Functions that copy, clear, or compare parameters *************
+ ****************************************************************************/
+
+/** Compares two FilterParams structures and returns 1 if anything has
+ *  changed. Returns 0 if they are both equal.
+ */
+static int compare_filter_params(const ChannelParams *prev_cp, const ChannelParams *cp, int filter)
+{
+    const FilterParams *prev = &prev_cp->filter_params[filter];
+    const FilterParams *fp = &cp->filter_params[filter];
+    int i;
+
+    if (prev->order != fp->order)
+        return 1;
+
+    if (!prev->order)
+        return 0;
+
+    if (prev->shift != fp->shift)
+        return 1;
+
+    for (i = 0; i < fp->order; i++)
+        if (prev_cp->coeff[filter][i] != cp->coeff[filter][i])
+            return 1;
+
+    return 0;
+}
+
+/** Compare two primitive matrices and returns 1 if anything has changed.
+ *  Returns 0 if they are both equal.
+ */
+static int compare_matrix_params(MLPEncodeContext *ctx, const MatrixParams *prev, const MatrixParams *mp)
+{
+    RestartHeader *rh = ctx->cur_restart_header;
+    unsigned int channel, mat;
+
+    if (prev->count != mp->count)
+        return 1;
+
+    if (!prev->count)
+        return 0;
+
+    for (channel = rh->min_channel; channel <= rh->max_channel; channel++)
+        if (prev->fbits[channel] != mp->fbits[channel])
+            return 1;
+
+    for (mat = 0; mat < mp->count; mat++) {
+        if (prev->outch[mat] != mp->outch[mat])
+            return 1;
+
+        for (channel = 0; channel < ctx->num_channels; channel++)
+            if (prev->coeff[mat][channel] != mp->coeff[mat][channel])
+                return 1;
+    }
+
+    return 0;
+}
+
+/** Compares two DecodingParams and ChannelParams structures to decide if a
+ *  new decoding params header has to be written.
+ */
+static int compare_decoding_params(MLPEncodeContext *ctx)
+{
+    DecodingParams *prev = ctx->prev_decoding_params;
+    DecodingParams *dp = ctx->cur_decoding_params;
+    MatrixParams *prev_mp = &prev->matrix_params;
+    MatrixParams *mp = &dp->matrix_params;
+    RestartHeader  *rh = ctx->cur_restart_header;
+    unsigned int ch;
+    int retval = 0;
+
+    if (prev->param_presence_flags != dp->param_presence_flags)
+        retval |= PARAM_PRESENCE_FLAGS;
+
+    if (prev->blocksize != dp->blocksize)
+        retval |= PARAM_BLOCKSIZE;
+
+    if (compare_matrix_params(ctx, prev_mp, mp))
+        retval |= PARAM_MATRIX;
+
+    for (ch = 0; ch <= rh->max_matrix_channel; ch++)
+        if (prev_mp->shift[ch] != mp->shift[ch]) {
+            retval |= PARAM_OUTSHIFT;
+            break;
+        }
+
+    for (ch = 0; ch <= rh->max_channel; ch++)
+        if (prev->quant_step_size[ch] != dp->quant_step_size[ch]) {
+            retval |= PARAM_QUANTSTEP;
+            break;
+        }
+
+    for (ch = rh->min_channel; ch <= rh->max_channel; ch++) {
+        ChannelParams *prev_cp = &ctx->prev_channel_params[ch];
+        ChannelParams *cp = &ctx->cur_channel_params[ch];
+
+        if (!(retval & PARAM_FIR) &&
+            compare_filter_params(prev_cp, cp, FIR))
+            retval |= PARAM_FIR;
+
+        if (!(retval & PARAM_IIR) &&
+            compare_filter_params(prev_cp, cp, IIR))
+            retval |= PARAM_IIR;
+
+        if (prev_cp->huff_offset != cp->huff_offset)
+            retval |= PARAM_HUFFOFFSET;
+
+        if (prev_cp->codebook    != cp->codebook  ||
+            prev_cp->huff_lsbs   != cp->huff_lsbs  )
+            retval |= 0x1;
+    }
+
+    return retval;
+}
+
+static void copy_filter_params(ChannelParams *dst_cp, ChannelParams *src_cp, int filter)
+{
+    FilterParams *dst = &dst_cp->filter_params[filter];
+    FilterParams *src = &src_cp->filter_params[filter];
+    unsigned int order;
+
+    dst->order = src->order;
+
+    if (dst->order) {
+        dst->shift = src->shift;
+
+        dst->coeff_shift = src->coeff_shift;
+        dst->coeff_bits = src->coeff_bits;
+    }
+
+    for (order = 0; order < dst->order; order++)
+        dst_cp->coeff[filter][order] = src_cp->coeff[filter][order];
+}
+
+static void copy_matrix_params(MatrixParams *dst, MatrixParams *src)
+{
+    dst->count = src->count;
+
+    if (dst->count) {
+        unsigned int channel, count;
+
+        for (channel = 0; channel < MAX_CHANNELS; channel++) {
+
+            dst->fbits[channel] = src->fbits[channel];
+            dst->shift[channel] = src->shift[channel];
+
+            for (count = 0; count < MAX_MATRICES; count++)
+                dst->coeff[count][channel] = src->coeff[count][channel];
+        }
+
+        for (count = 0; count < MAX_MATRICES; count++)
+            dst->outch[count] = src->outch[count];
+    }
+}
+
+static void copy_restart_frame_params(MLPEncodeContext *ctx,
+                                      unsigned int substr)
+{
+    unsigned int index;
+
+    for (index = 0; index < ctx->number_of_subblocks; index++) {
+        DecodingParams *dp = ctx->seq_decoding_params + index*(ctx->num_substreams) + substr;
+        unsigned int channel;
+
+        copy_matrix_params(&dp->matrix_params, &ctx->cur_decoding_params->matrix_params);
+
+        for (channel = 0; channel < ctx->avctx->channels; channel++) {
+            ChannelParams *cp = ctx->seq_channel_params + index*(ctx->avctx->channels) + channel;
+            unsigned int filter;
+
+            dp->quant_step_size[channel] = ctx->cur_decoding_params->quant_step_size[channel];
+            dp->matrix_params.shift[channel] = ctx->cur_decoding_params->matrix_params.shift[channel];
+
+            if (index)
+                for (filter = 0; filter < NUM_FILTERS; filter++)
+                    copy_filter_params(cp, &ctx->cur_channel_params[channel], filter);
+        }
+    }
+}
+
+/** Clears a DecodingParams struct the way it should be after a restart header. */
+static void clear_decoding_params(MLPEncodeContext *ctx, DecodingParams decoding_params[MAX_SUBSTREAMS])
+{
+    unsigned int substr;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        DecodingParams *dp = &decoding_params[substr];
+
+        dp->param_presence_flags   = 0xff;
+        dp->blocksize              = 8;
+
+        memset(&dp->matrix_params , 0, sizeof(MatrixParams       ));
+        memset(dp->quant_step_size, 0, sizeof(dp->quant_step_size));
+    }
+}
+
+/** Clears a ChannelParams struct the way it should be after a restart header. */
+static void clear_channel_params(MLPEncodeContext *ctx, ChannelParams channel_params[MAX_CHANNELS])
+{
+    unsigned int channel;
+
+    for (channel = 0; channel < ctx->avctx->channels; channel++) {
+        ChannelParams *cp = &channel_params[channel];
+
+        memset(&cp->filter_params, 0, sizeof(cp->filter_params));
+
+        /* Default audio coding is 24-bit raw PCM. */
+        cp->huff_offset      =  0;
+        cp->codebook         =  0;
+        cp->huff_lsbs        = 24;
+    }
+}
+
+/** Sets default vales in our encoder for a DecodingParams struct. */
+static void default_decoding_params(MLPEncodeContext *ctx,
+     DecodingParams decoding_params[MAX_SUBSTREAMS])
+{
+    unsigned int substr;
+
+    clear_decoding_params(ctx, decoding_params);
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        DecodingParams *dp = &decoding_params[substr];
+        uint8_t param_presence_flags = 0;
+
+        param_presence_flags |= PARAM_BLOCKSIZE;
+        param_presence_flags |= PARAM_MATRIX;
+        param_presence_flags |= PARAM_OUTSHIFT;
+        param_presence_flags |= PARAM_QUANTSTEP;
+        param_presence_flags |= PARAM_FIR;
+/*      param_presence_flags |= PARAM_IIR; */
+        param_presence_flags |= PARAM_HUFFOFFSET;
+        param_presence_flags |= PARAM_PRESENT;
+
+        dp->param_presence_flags = param_presence_flags;
+    }
+}
+
+/****************************************************************************/
+
+/** Calculates the smallest number of bits it takes to encode a given signed
+ *  value in two's complement.
+ */
+static int inline number_sbits(int number)
+{
+    if (number < 0)
+        number++;
+
+    return av_log2(FFABS(number)) + 1 + !!number;
+}
+
+enum InputBitDepth {
+    BITS_16,
+    BITS_20,
+    BITS_24,
+};
+
+static int mlp_peak_bitrate(int peak_bitrate, int sample_rate)
+{
+    return ((peak_bitrate << 4) - 8) / sample_rate;
+}
+
+static av_cold int mlp_encode_init(AVCodecContext *avctx)
+{
+    MLPEncodeContext *ctx = avctx->priv_data;
+    unsigned int substr, index;
+    unsigned int sum = 0;
+    unsigned int size;
+    int ret;
+
+    ctx->avctx = avctx;
+
+    switch (avctx->sample_rate) {
+    case 44100 << 0:
+        avctx->frame_size         = 40  << 0;
+        ctx->coded_sample_rate[0] = 0x08 + 0;
+        ctx->fs                   = 0x08 + 1;
+        break;
+    case 44100 << 1:
+        avctx->frame_size         = 40  << 1;
+        ctx->coded_sample_rate[0] = 0x08 + 1;
+        ctx->fs                   = 0x0C + 1;
+        break;
+    case 44100 << 2:
+        ctx->substream_info      |= SUBSTREAM_INFO_HIGH_RATE;
+        avctx->frame_size         = 40  << 2;
+        ctx->coded_sample_rate[0] = 0x08 + 2;
+        ctx->fs                   = 0x10 + 1;
+        break;
+    case 48000 << 0:
+        avctx->frame_size         = 40  << 0;
+        ctx->coded_sample_rate[0] = 0x00 + 0;
+        ctx->fs                   = 0x08 + 2;
+        break;
+    case 48000 << 1:
+        avctx->frame_size         = 40  << 1;
+        ctx->coded_sample_rate[0] = 0x00 + 1;
+        ctx->fs                   = 0x0C + 2;
+        break;
+    case 48000 << 2:
+        ctx->substream_info      |= SUBSTREAM_INFO_HIGH_RATE;
+        avctx->frame_size         = 40  << 2;
+        ctx->coded_sample_rate[0] = 0x00 + 2;
+        ctx->fs                   = 0x10 + 2;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d. Supported "
+                            "sample rates are 44100, 88200, 176400, 48000, "
+                            "96000, and 192000.\n", avctx->sample_rate);
+        return -1;
+    }
+    ctx->coded_sample_rate[1] = -1 & 0xf;
+
+    /* TODO Keep count of bitrate and calculate real value. */
+    ctx->coded_peak_bitrate = mlp_peak_bitrate(9600000, avctx->sample_rate);
+
+    /* TODO support more channels. */
+    if (avctx->channels > 2) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Only mono and stereo are supported at the moment.\n");
+    }
+
+    ctx->substream_info |= SUBSTREAM_INFO_ALWAYS_SET;
+    if (avctx->channels <= 2) {
+        ctx->substream_info |= SUBSTREAM_INFO_MAX_2_CHAN;
+    }
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_S16:
+        ctx->coded_sample_fmt[0] = BITS_16;
+        ctx->wordlength = 16;
+        avctx->bits_per_raw_sample = 16;
+        break;
+    /* TODO 20 bits: */
+    case AV_SAMPLE_FMT_S32:
+        ctx->coded_sample_fmt[0] = BITS_24;
+        ctx->wordlength = 24;
+        avctx->bits_per_raw_sample = 24;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Sample format not supported. "
+               "Only 16- and 24-bit samples are supported.\n");
+        return -1;
+    }
+    ctx->coded_sample_fmt[1] = -1 & 0xf;
+
+    ctx->dts = -avctx->frame_size;
+
+    ctx->num_channels = avctx->channels + 2; /* +2 noise channels */
+    ctx->one_sample_buffer_size = avctx->frame_size
+                                * ctx->num_channels;
+    /* TODO Let user pass major header interval as parameter. */
+    ctx->max_restart_interval = MAJOR_HEADER_INTERVAL;
+
+    ctx->max_codebook_search = 3;
+    ctx->min_restart_interval = MAJOR_HEADER_INTERVAL;
+    ctx->restart_intervals = ctx->max_restart_interval / ctx->min_restart_interval;
+
+    /* TODO Let user pass parameters for LPC filter. */
+
+    size = avctx->frame_size * ctx->max_restart_interval;
+
+    ctx->lpc_sample_buffer = av_malloc_array(size, sizeof(int32_t));
+    if (!ctx->lpc_sample_buffer) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Not enough memory for buffering samples.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    size = ctx->one_sample_buffer_size * ctx->max_restart_interval;
+
+    ctx->major_scratch_buffer = av_malloc_array(size, sizeof(int32_t));
+    if (!ctx->major_scratch_buffer) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Not enough memory for buffering samples.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    ctx->major_inout_buffer = av_malloc_array(size, sizeof(int32_t));
+    if (!ctx->major_inout_buffer) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Not enough memory for buffering samples.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    ff_mlp_init_crc();
+
+    ctx->num_substreams = 1; // TODO: change this after adding multi-channel support for TrueHD
+
+    if (ctx->avctx->codec_id == AV_CODEC_ID_MLP) {
+        /* MLP */
+        switch(avctx->channel_layout) {
+        case AV_CH_LAYOUT_MONO:
+            ctx->channel_arrangement = 0; break;
+        case AV_CH_LAYOUT_STEREO:
+            ctx->channel_arrangement = 1; break;
+        case AV_CH_LAYOUT_2_1:
+            ctx->channel_arrangement = 2; break;
+        case AV_CH_LAYOUT_QUAD:
+            ctx->channel_arrangement = 3; break;
+        case AV_CH_LAYOUT_2POINT1:
+            ctx->channel_arrangement = 4; break;
+        case AV_CH_LAYOUT_SURROUND:
+            ctx->channel_arrangement = 7; break;
+        case AV_CH_LAYOUT_4POINT0:
+            ctx->channel_arrangement = 8; break;
+        case AV_CH_LAYOUT_5POINT0_BACK:
+            ctx->channel_arrangement = 9; break;
+        case AV_CH_LAYOUT_3POINT1:
+            ctx->channel_arrangement = 10; break;
+        case AV_CH_LAYOUT_4POINT1:
+            ctx->channel_arrangement = 11; break;
+        case AV_CH_LAYOUT_5POINT1_BACK:
+            ctx->channel_arrangement = 12; break;
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Unsupported channel arrangement\n");
+            return -1;
+        }
+        ctx->flags = FLAGS_DVDA;
+        ctx->channel_occupancy = ff_mlp_ch_info[ctx->channel_arrangement].channel_occupancy;
+        ctx->summary_info      = ff_mlp_ch_info[ctx->channel_arrangement].summary_info     ;
+    } else {
+        /* TrueHD */
+        switch(avctx->channel_layout) {
+        case AV_CH_LAYOUT_STEREO:
+            ctx->ch_modifier_thd0    = 0;
+            ctx->ch_modifier_thd1    = 0;
+            ctx->ch_modifier_thd2    = 0;
+            ctx->channel_arrangement = 1;
+            break;
+        case AV_CH_LAYOUT_5POINT0_BACK:
+            ctx->ch_modifier_thd0    = 1;
+            ctx->ch_modifier_thd1    = 1;
+            ctx->ch_modifier_thd2    = 1;
+            ctx->channel_arrangement = 11;
+            break;
+        case AV_CH_LAYOUT_5POINT1_BACK:
+            ctx->ch_modifier_thd0    = 2;
+            ctx->ch_modifier_thd1    = 1;
+            ctx->ch_modifier_thd2    = 2;
+            ctx->channel_arrangement = 15;
+            break;
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Unsupported channel arrangement\n");
+            return -1;
+        }
+        ctx->flags = 0;
+        ctx->channel_occupancy = 0;
+        ctx->summary_info = 0;
+    }
+
+    size = sizeof(unsigned int) * ctx->max_restart_interval;
+
+    ctx->frame_size = av_malloc(size);
+    if (!ctx->frame_size)
+        return AVERROR(ENOMEM);
+
+    ctx->max_output_bits = av_malloc(size);
+    if (!ctx->max_output_bits)
+        return AVERROR(ENOMEM);
+
+    size = sizeof(int32_t)
+         * ctx->num_substreams * ctx->max_restart_interval;
+
+    ctx->lossless_check_data = av_malloc(size);
+    if (!ctx->lossless_check_data)
+        return AVERROR(ENOMEM);
+
+    for (index = 0; index < ctx->restart_intervals; index++) {
+        ctx->seq_offset[index] = sum;
+        ctx->seq_size  [index] = ((index + 1) * ctx->min_restart_interval) + 1;
+        sum += ctx->seq_size[index];
+    }
+    ctx->sequence_size = sum;
+    size = sizeof(ChannelParams)
+         * ctx->restart_intervals * ctx->sequence_size * ctx->avctx->channels;
+    ctx->channel_params = av_malloc(size);
+    if (!ctx->channel_params) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Not enough memory for analysis context.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    size = sizeof(DecodingParams)
+         * ctx->restart_intervals * ctx->sequence_size * ctx->num_substreams;
+    ctx->decoding_params = av_malloc(size);
+    if (!ctx->decoding_params) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Not enough memory for analysis context.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        RestartHeader  *rh = &ctx->restart_header [substr];
+
+        /* TODO see if noisegen_seed is really worth it. */
+        rh->noisegen_seed      = 0;
+
+        rh->min_channel        = 0;
+        rh->max_channel        = avctx->channels - 1;
+        /* FIXME: this works for 1 and 2 channels, but check for more */
+        rh->max_matrix_channel = rh->max_channel;
+    }
+
+    clear_channel_params(ctx, restart_channel_params);
+    clear_decoding_params(ctx, restart_decoding_params);
+
+    if ((ret = ff_lpc_init(&ctx->lpc_ctx, ctx->number_of_samples,
+                    MLP_MAX_LPC_ORDER, FF_LPC_TYPE_LEVINSON)) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Not enough memory for LPC context.\n");
+        return ret;
+    }
+
+    ff_af_queue_init(avctx, &ctx->afq);
+
+    return 0;
+}
+
+/****************************************************************************
+ ****************** Functions that write to the bitstream *******************
+ ****************************************************************************/
+
+/** Writes a major sync header to the bitstream. */
+static void write_major_sync(MLPEncodeContext *ctx, uint8_t *buf, int buf_size)
+{
+    PutBitContext pb;
+
+    init_put_bits(&pb, buf, buf_size);
+
+    put_bits(&pb, 24, SYNC_MAJOR               );
+
+    if (ctx->avctx->codec_id == AV_CODEC_ID_MLP) {
+        put_bits(&pb,  8, SYNC_MLP                 );
+        put_bits(&pb,  4, ctx->coded_sample_fmt [0]);
+        put_bits(&pb,  4, ctx->coded_sample_fmt [1]);
+        put_bits(&pb,  4, ctx->coded_sample_rate[0]);
+        put_bits(&pb,  4, ctx->coded_sample_rate[1]);
+        put_bits(&pb,  4, 0                        ); /* ignored */
+        put_bits(&pb,  4, 0                        ); /* multi_channel_type */
+        put_bits(&pb,  3, 0                        ); /* ignored */
+        put_bits(&pb,  5, ctx->channel_arrangement );
+    } else if (ctx->avctx->codec_id == AV_CODEC_ID_TRUEHD) {
+        put_bits(&pb,  8, SYNC_TRUEHD              );
+        put_bits(&pb,  4, ctx->coded_sample_rate[0]);
+        put_bits(&pb,  4, 0                        ); /* ignored */
+        put_bits(&pb,  2, ctx->ch_modifier_thd0    );
+        put_bits(&pb,  2, ctx->ch_modifier_thd1    );
+        put_bits(&pb,  5, ctx->channel_arrangement );
+        put_bits(&pb,  2, ctx->ch_modifier_thd2    );
+        put_bits(&pb, 13, ctx->channel_arrangement );
+    }
+
+    put_bits(&pb, 16, MAJOR_SYNC_INFO_SIGNATURE);
+    put_bits(&pb, 16, ctx->flags               );
+    put_bits(&pb, 16, 0                        ); /* ignored */
+    put_bits(&pb,  1, 1                        ); /* is_vbr */
+    put_bits(&pb, 15, ctx->coded_peak_bitrate  );
+    put_bits(&pb,  4, 1                        ); /* num_substreams */
+    put_bits(&pb,  4, 0x1                      ); /* ignored */
+
+    /* channel_meaning */
+    put_bits(&pb,  8, ctx->substream_info      );
+    put_bits(&pb,  5, ctx->fs                  );
+    put_bits(&pb,  5, ctx->wordlength          );
+    put_bits(&pb,  6, ctx->channel_occupancy   );
+    put_bits(&pb,  3, 0                        ); /* ignored */
+    put_bits(&pb, 10, 0                        ); /* speaker_layout */
+    put_bits(&pb,  3, 0                        ); /* copy_protection */
+    put_bits(&pb, 16, 0x8080                   ); /* ignored */
+    put_bits(&pb,  7, 0                        ); /* ignored */
+    put_bits(&pb,  4, 0                        ); /* source_format */
+    put_bits(&pb,  5, ctx->summary_info        );
+
+    flush_put_bits(&pb);
+
+    AV_WL16(buf+26, ff_mlp_checksum16(buf, 26));
+}
+
+/** Writes a restart header to the bitstream. Damaged streams can start being
+ *  decoded losslessly again after such a header and the subsequent decoding
+ *  params header.
+ */
+static void write_restart_header(MLPEncodeContext *ctx, PutBitContext *pb)
+{
+    RestartHeader *rh = ctx->cur_restart_header;
+    int32_t lossless_check = xor_32_to_8(rh->lossless_check_data);
+    unsigned int start_count = put_bits_count(pb);
+    PutBitContext tmpb;
+    uint8_t checksum;
+    unsigned int ch;
+
+    put_bits(pb, 14, 0x31ea                ); /* TODO 0x31eb */
+    put_bits(pb, 16, ctx->timestamp        );
+    put_bits(pb,  4, rh->min_channel       );
+    put_bits(pb,  4, rh->max_channel       );
+    put_bits(pb,  4, rh->max_matrix_channel);
+    put_bits(pb,  4, rh->noise_shift       );
+    put_bits(pb, 23, rh->noisegen_seed     );
+    put_bits(pb,  4, 0                     ); /* TODO max_shift */
+    put_bits(pb,  5, rh->max_huff_lsbs     );
+    put_bits(pb,  5, rh->max_output_bits   );
+    put_bits(pb,  5, rh->max_output_bits   );
+    put_bits(pb,  1, rh->data_check_present);
+    put_bits(pb,  8, lossless_check        );
+    put_bits(pb, 16, 0                     ); /* ignored */
+
+    for (ch = 0; ch <= rh->max_matrix_channel; ch++)
+        put_bits(pb, 6, ch);
+
+    /* Data must be flushed for the checksum to be correct. */
+    tmpb = *pb;
+    flush_put_bits(&tmpb);
+
+    checksum = ff_mlp_restart_checksum(pb->buf, put_bits_count(pb) - start_count);
+
+    put_bits(pb,  8, checksum);
+}
+
+/** Writes matrix params for all primitive matrices to the bitstream. */
+static void write_matrix_params(MLPEncodeContext *ctx, PutBitContext *pb)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    MatrixParams *mp = &dp->matrix_params;
+    unsigned int mat;
+
+    put_bits(pb, 4, mp->count);
+
+    for (mat = 0; mat < mp->count; mat++) {
+        unsigned int channel;
+
+        put_bits(pb, 4, mp->outch[mat]); /* matrix_out_ch */
+        put_bits(pb, 4, mp->fbits[mat]);
+        put_bits(pb, 1, 0             ); /* lsb_bypass */
+
+        for (channel = 0; channel < ctx->num_channels; channel++) {
+            int32_t coeff = mp->coeff[mat][channel];
+
+            if (coeff) {
+                put_bits(pb, 1, 1);
+
+                coeff >>= 14 - mp->fbits[mat];
+
+                put_sbits(pb, mp->fbits[mat] + 2, coeff);
+            } else {
+                put_bits(pb, 1, 0);
+            }
+        }
+    }
+}
+
+/** Writes filter parameters for one filter to the bitstream. */
+static void write_filter_params(MLPEncodeContext *ctx, PutBitContext *pb,
+                                unsigned int channel, unsigned int filter)
+{
+    FilterParams *fp = &ctx->cur_channel_params[channel].filter_params[filter];
+
+    put_bits(pb, 4, fp->order);
+
+    if (fp->order > 0) {
+        int i;
+        int32_t *fcoeff = ctx->cur_channel_params[channel].coeff[filter];
+
+        put_bits(pb, 4, fp->shift      );
+        put_bits(pb, 5, fp->coeff_bits );
+        put_bits(pb, 3, fp->coeff_shift);
+
+        for (i = 0; i < fp->order; i++) {
+            put_sbits(pb, fp->coeff_bits, fcoeff[i] >> fp->coeff_shift);
+        }
+
+        /* TODO state data for IIR filter. */
+        put_bits(pb, 1, 0);
+    }
+}
+
+/** Writes decoding parameters to the bitstream. These change very often,
+ *  usually at almost every frame.
+ */
+static void write_decoding_params(MLPEncodeContext *ctx, PutBitContext *pb,
+                                  int params_changed)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    RestartHeader  *rh = ctx->cur_restart_header;
+    MatrixParams *mp = &dp->matrix_params;
+    unsigned int ch;
+
+    if (dp->param_presence_flags != PARAMS_DEFAULT &&
+        params_changed & PARAM_PRESENCE_FLAGS) {
+        put_bits(pb, 1, 1);
+        put_bits(pb, 8, dp->param_presence_flags);
+    } else {
+        put_bits(pb, 1, 0);
+    }
+
+    if (dp->param_presence_flags & PARAM_BLOCKSIZE) {
+        if (params_changed       & PARAM_BLOCKSIZE) {
+            put_bits(pb, 1, 1);
+            put_bits(pb, 9, dp->blocksize);
+        } else {
+            put_bits(pb, 1, 0);
+        }
+    }
+
+    if (dp->param_presence_flags & PARAM_MATRIX) {
+        if (params_changed       & PARAM_MATRIX) {
+            put_bits(pb, 1, 1);
+            write_matrix_params(ctx, pb);
+        } else {
+            put_bits(pb, 1, 0);
+        }
+    }
+
+    if (dp->param_presence_flags & PARAM_OUTSHIFT) {
+        if (params_changed       & PARAM_OUTSHIFT) {
+            put_bits(pb, 1, 1);
+            for (ch = 0; ch <= rh->max_matrix_channel; ch++)
+                put_sbits(pb, 4, mp->shift[ch]);
+        } else {
+            put_bits(pb, 1, 0);
+        }
+    }
+
+    if (dp->param_presence_flags & PARAM_QUANTSTEP) {
+        if (params_changed       & PARAM_QUANTSTEP) {
+            put_bits(pb, 1, 1);
+            for (ch = 0; ch <= rh->max_channel; ch++)
+                put_bits(pb, 4, dp->quant_step_size[ch]);
+        } else {
+            put_bits(pb, 1, 0);
+        }
+    }
+
+    for (ch = rh->min_channel; ch <= rh->max_channel; ch++) {
+        ChannelParams *cp = &ctx->cur_channel_params[ch];
+
+        if (dp->param_presence_flags & 0xF) {
+            put_bits(pb, 1, 1);
+
+            if (dp->param_presence_flags & PARAM_FIR) {
+                if (params_changed       & PARAM_FIR) {
+                    put_bits(pb, 1, 1);
+                    write_filter_params(ctx, pb, ch, FIR);
+                } else {
+                    put_bits(pb, 1, 0);
+                }
+            }
+
+            if (dp->param_presence_flags & PARAM_IIR) {
+                if (params_changed       & PARAM_IIR) {
+                    put_bits(pb, 1, 1);
+                    write_filter_params(ctx, pb, ch, IIR);
+                } else {
+                    put_bits(pb, 1, 0);
+                }
+            }
+
+            if (dp->param_presence_flags & PARAM_HUFFOFFSET) {
+                if (params_changed       & PARAM_HUFFOFFSET) {
+                    put_bits (pb,  1, 1);
+                    put_sbits(pb, 15, cp->huff_offset);
+                } else {
+                    put_bits(pb, 1, 0);
+                }
+            }
+
+            put_bits(pb, 2, cp->codebook );
+            put_bits(pb, 5, cp->huff_lsbs);
+        } else {
+            put_bits(pb, 1, 0);
+        }
+    }
+}
+
+/** Writes the residuals to the bitstream. That is, the VLC codes from the
+ *  codebooks (if any is used), and then the residual.
+ */
+static void write_block_data(MLPEncodeContext *ctx, PutBitContext *pb)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    RestartHeader  *rh = ctx->cur_restart_header;
+    int32_t *sample_buffer = ctx->write_buffer;
+    int32_t sign_huff_offset[MAX_CHANNELS];
+    int codebook_index      [MAX_CHANNELS];
+    int lsb_bits            [MAX_CHANNELS];
+    unsigned int i, ch;
+
+    for (ch = rh->min_channel; ch <= rh->max_channel; ch++) {
+        ChannelParams *cp = &ctx->cur_channel_params[ch];
+        int sign_shift;
+
+        lsb_bits        [ch] = cp->huff_lsbs - dp->quant_step_size[ch];
+        codebook_index  [ch] = cp->codebook  - 1;
+        sign_huff_offset[ch] = cp->huff_offset;
+
+        sign_shift = lsb_bits[ch] - 1;
+
+        if (cp->codebook > 0) {
+            sign_huff_offset[ch] -= 7 << lsb_bits[ch];
+            sign_shift += 3 - cp->codebook;
+        }
+
+        /* Unsign if needed. */
+        if (sign_shift >= 0)
+            sign_huff_offset[ch] -= 1 << sign_shift;
+    }
+
+    for (i = 0; i < dp->blocksize; i++) {
+        for (ch = rh->min_channel; ch <= rh->max_channel; ch++) {
+            int32_t sample = *sample_buffer++ >> dp->quant_step_size[ch];
+
+            sample -= sign_huff_offset[ch];
+
+            if (codebook_index[ch] >= 0) {
+                int vlc = sample >> lsb_bits[ch];
+                put_bits(pb, ff_mlp_huffman_tables[codebook_index[ch]][vlc][1],
+                             ff_mlp_huffman_tables[codebook_index[ch]][vlc][0]);
+            }
+
+            put_sbits(pb, lsb_bits[ch], sample);
+        }
+        sample_buffer += 2; /* noise channels */
+    }
+
+    ctx->write_buffer = sample_buffer;
+}
+
+/** Writes the substreams data to the bitstream. */
+static uint8_t *write_substrs(MLPEncodeContext *ctx, uint8_t *buf, int buf_size,
+                              int restart_frame,
+                              uint16_t substream_data_len[MAX_SUBSTREAMS])
+{
+    int32_t *lossless_check_data = ctx->lossless_check_data;
+    unsigned int substr;
+    int end = 0;
+
+    lossless_check_data += ctx->frame_index * ctx->num_substreams;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        unsigned int cur_subblock_index = ctx->major_cur_subblock_index;
+        unsigned int num_subblocks = ctx->major_filter_state_subblock;
+        unsigned int subblock;
+        RestartHeader  *rh = &ctx->restart_header [substr];
+        int substr_restart_frame = restart_frame;
+        uint8_t parity, checksum;
+        PutBitContext pb, tmpb;
+        int params_changed;
+
+        ctx->cur_restart_header = rh;
+
+        init_put_bits(&pb, buf, buf_size);
+
+        for (subblock = 0; subblock <= num_subblocks; subblock++) {
+            unsigned int subblock_index;
+
+            subblock_index = cur_subblock_index++;
+
+            ctx->cur_decoding_params = &ctx->major_decoding_params[subblock_index][substr];
+            ctx->cur_channel_params = ctx->major_channel_params[subblock_index];
+
+            params_changed = ctx->major_params_changed[subblock_index][substr];
+
+            if (substr_restart_frame || params_changed) {
+                put_bits(&pb, 1, 1);
+
+                if (substr_restart_frame) {
+                    put_bits(&pb, 1, 1);
+
+                    write_restart_header(ctx, &pb);
+                    rh->lossless_check_data = 0;
+                } else {
+                    put_bits(&pb, 1, 0);
+                }
+
+                write_decoding_params(ctx, &pb, params_changed);
+            } else {
+                put_bits(&pb, 1, 0);
+            }
+
+            write_block_data(ctx, &pb);
+
+            put_bits(&pb, 1, !substr_restart_frame);
+
+            substr_restart_frame = 0;
+        }
+
+        put_bits(&pb, (-put_bits_count(&pb)) & 15, 0);
+
+        rh->lossless_check_data ^= *lossless_check_data++;
+
+        if (ctx->last_frame == ctx->inout_buffer) {
+            /* TODO find a sample and implement shorten_by. */
+            put_bits(&pb, 32, END_OF_STREAM);
+        }
+
+        /* Data must be flushed for the checksum and parity to be correct. */
+        tmpb = pb;
+        flush_put_bits(&tmpb);
+
+        parity   = ff_mlp_calculate_parity(buf, put_bits_count(&pb) >> 3) ^ 0xa9;
+        checksum = ff_mlp_checksum8       (buf, put_bits_count(&pb) >> 3);
+
+        put_bits(&pb, 8, parity  );
+        put_bits(&pb, 8, checksum);
+
+        flush_put_bits(&pb);
+
+        end += put_bits_count(&pb) >> 3;
+        substream_data_len[substr] = end;
+
+        buf += put_bits_count(&pb) >> 3;
+    }
+
+    ctx->major_cur_subblock_index += ctx->major_filter_state_subblock + 1;
+    ctx->major_filter_state_subblock = 0;
+
+    return buf;
+}
+
+/** Writes the access unit and substream headers to the bitstream. */
+static void write_frame_headers(MLPEncodeContext *ctx, uint8_t *frame_header,
+                                uint8_t *substream_headers, unsigned int length,
+                                int restart_frame,
+                                uint16_t substream_data_len[MAX_SUBSTREAMS])
+{
+    uint16_t access_unit_header = 0;
+    uint16_t parity_nibble = 0;
+    unsigned int substr;
+
+    parity_nibble  = ctx->dts;
+    parity_nibble ^= length;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        uint16_t substr_hdr = 0;
+
+        substr_hdr |= (0 << 15); /* extraword */
+        substr_hdr |= (!restart_frame << 14); /* !restart_frame */
+        substr_hdr |= (1 << 13); /* checkdata */
+        substr_hdr |= (0 << 12); /* ??? */
+        substr_hdr |= (substream_data_len[substr] / 2) & 0x0FFF;
+
+        AV_WB16(substream_headers, substr_hdr);
+
+        parity_nibble ^= *substream_headers++;
+        parity_nibble ^= *substream_headers++;
+    }
+
+    parity_nibble ^= parity_nibble >> 8;
+    parity_nibble ^= parity_nibble >> 4;
+    parity_nibble &= 0xF;
+
+    access_unit_header |= (parity_nibble ^ 0xF) << 12;
+    access_unit_header |= length & 0xFFF;
+
+    AV_WB16(frame_header  , access_unit_header);
+    AV_WB16(frame_header+2, ctx->dts          );
+}
+
+/** Writes an entire access unit to the bitstream. */
+static unsigned int write_access_unit(MLPEncodeContext *ctx, uint8_t *buf,
+                                      int buf_size, int restart_frame)
+{
+    uint16_t substream_data_len[MAX_SUBSTREAMS];
+    uint8_t *buf1, *buf0 = buf;
+    unsigned int substr;
+    int total_length;
+
+    if (buf_size < 4)
+        return -1;
+
+    /* Frame header will be written at the end. */
+    buf      += 4;
+    buf_size -= 4;
+
+    if (restart_frame) {
+        if (buf_size < 28)
+            return -1;
+        write_major_sync(ctx, buf, buf_size);
+        buf      += 28;
+        buf_size -= 28;
+    }
+
+    buf1 = buf;
+
+    /* Substream headers will be written at the end. */
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        buf      += 2;
+        buf_size -= 2;
+    }
+
+    buf = write_substrs(ctx, buf, buf_size, restart_frame, substream_data_len);
+
+    total_length = buf - buf0;
+
+    write_frame_headers(ctx, buf0, buf1, total_length / 2, restart_frame, substream_data_len);
+
+    return total_length;
+}
+
+/****************************************************************************
+ ****************** Functions that input data to context ********************
+ ****************************************************************************/
+
+/** Inputs data from the samples passed by lavc into the context, shifts them
+ *  appropriately depending on the bit-depth, and calculates the
+ *  lossless_check_data that will be written to the restart header.
+ */
+static void input_data_internal(MLPEncodeContext *ctx, const uint8_t *samples,
+                                int is24)
+{
+    int32_t *lossless_check_data = ctx->lossless_check_data;
+    const int32_t *samples_32 = (const int32_t *) samples;
+    const int16_t *samples_16 = (const int16_t *) samples;
+    unsigned int substr;
+
+    lossless_check_data += ctx->frame_index * ctx->num_substreams;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        RestartHeader  *rh = &ctx->restart_header [substr];
+        int32_t *sample_buffer = ctx->inout_buffer;
+        int32_t temp_lossless_check_data = 0;
+        uint32_t greatest = 0;
+        unsigned int channel;
+        int i;
+
+        for (i = 0; i < ctx->frame_size[ctx->frame_index]; i++) {
+            for (channel = 0; channel <= rh->max_channel; channel++) {
+                uint32_t abs_sample;
+                int32_t sample;
+
+                sample = is24 ? *samples_32++ >> 8 : *samples_16++ << 8;
+
+                /* TODO Find out if number_sbits can be used for negative values. */
+                abs_sample = FFABS(sample);
+                if (greatest < abs_sample)
+                    greatest = abs_sample;
+
+                temp_lossless_check_data ^= (sample & 0x00ffffff) << channel;
+                *sample_buffer++ = sample;
+            }
+
+            sample_buffer += 2; /* noise channels */
+        }
+
+        ctx->max_output_bits[ctx->frame_index] = number_sbits(greatest);
+
+        *lossless_check_data++ = temp_lossless_check_data;
+    }
+}
+
+/** Wrapper function for inputting data in two different bit-depths. */
+static void input_data(MLPEncodeContext *ctx, void *samples)
+{
+    if (ctx->avctx->sample_fmt == AV_SAMPLE_FMT_S32)
+        input_data_internal(ctx, samples, 1);
+    else
+        input_data_internal(ctx, samples, 0);
+}
+
+static void input_to_sample_buffer(MLPEncodeContext *ctx)
+{
+    int32_t *sample_buffer = ctx->sample_buffer;
+    unsigned int index;
+
+    for (index = 0; index < ctx->number_of_frames; index++) {
+        unsigned int cur_index = (ctx->starting_frame_index + index) % ctx->max_restart_interval;
+        int32_t *input_buffer = ctx->inout_buffer + cur_index * ctx->one_sample_buffer_size;
+        unsigned int i, channel;
+
+        for (i = 0; i < ctx->frame_size[cur_index]; i++) {
+            for (channel = 0; channel < ctx->avctx->channels; channel++)
+                *sample_buffer++ = *input_buffer++;
+            sample_buffer += 2; /* noise_channels */
+            input_buffer += 2; /* noise_channels */
+        }
+    }
+}
+
+/****************************************************************************
+ ********* Functions that analyze the data and set the parameters ***********
+ ****************************************************************************/
+
+/** Counts the number of trailing zeroes in a value */
+static int number_trailing_zeroes(int32_t sample)
+{
+    int bits;
+
+    for (bits = 0; bits < 24 && !(sample & (1<<bits)); bits++);
+
+    /* All samples are 0. TODO Return previous quant_step_size to avoid
+     * writing a new header. */
+    if (bits == 24)
+        return 0;
+
+    return bits;
+}
+
+/** Determines how many bits are zero at the end of all samples so they can be
+ *  shifted out.
+ */
+static void determine_quant_step_size(MLPEncodeContext *ctx)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    RestartHeader  *rh = ctx->cur_restart_header;
+    MatrixParams *mp = &dp->matrix_params;
+    int32_t *sample_buffer = ctx->sample_buffer;
+    int32_t sample_mask[MAX_CHANNELS];
+    unsigned int channel;
+    int i;
+
+    memset(sample_mask, 0x00, sizeof(sample_mask));
+
+    for (i = 0; i < ctx->number_of_samples; i++) {
+        for (channel = 0; channel <= rh->max_channel; channel++)
+            sample_mask[channel] |= *sample_buffer++;
+
+        sample_buffer += 2; /* noise channels */
+    }
+
+    for (channel = 0; channel <= rh->max_channel; channel++)
+        dp->quant_step_size[channel] = number_trailing_zeroes(sample_mask[channel]) - mp->shift[channel];
+}
+
+/** Determines the smallest number of bits needed to encode the filter
+ *  coefficients, and if it's possible to right-shift their values without
+ *  losing any precision.
+ */
+static void code_filter_coeffs(MLPEncodeContext *ctx, FilterParams *fp, int32_t *fcoeff)
+{
+    int min = INT_MAX, max = INT_MIN;
+    int bits, shift;
+    int coeff_mask = 0;
+    int order;
+
+    for (order = 0; order < fp->order; order++) {
+        int coeff = fcoeff[order];
+
+        if (coeff < min)
+            min = coeff;
+        if (coeff > max)
+            max = coeff;
+
+        coeff_mask |= coeff;
+    }
+
+    bits = FFMAX(number_sbits(min), number_sbits(max));
+
+    for (shift = 0; shift < 7 && bits + shift < 16 && !(coeff_mask & (1<<shift)); shift++);
+
+    fp->coeff_bits  = bits;
+    fp->coeff_shift = shift;
+}
+
+/** Determines the best filter parameters for the given data and writes the
+ *  necessary information to the context.
+ *  TODO Add IIR filter predictor!
+ */
+static void set_filter_params(MLPEncodeContext *ctx,
+                              unsigned int channel, unsigned int filter,
+                              int clear_filter)
+{
+    ChannelParams *cp = &ctx->cur_channel_params[channel];
+    FilterParams *fp = &cp->filter_params[filter];
+
+    if ((filter == IIR && ctx->substream_info & SUBSTREAM_INFO_HIGH_RATE) ||
+        clear_filter) {
+        fp->order = 0;
+    } else if (filter == IIR) {
+        fp->order = 0;
+    } else if (filter == FIR) {
+        const int max_order = (ctx->substream_info & SUBSTREAM_INFO_HIGH_RATE)
+                              ? 4 : MLP_MAX_LPC_ORDER;
+        int32_t *sample_buffer = ctx->sample_buffer + channel;
+        int32_t coefs[MAX_LPC_ORDER][MAX_LPC_ORDER];
+        int32_t *lpc_samples = ctx->lpc_sample_buffer;
+        int32_t *fcoeff = ctx->cur_channel_params[channel].coeff[filter];
+        int shift[MLP_MAX_LPC_ORDER];
+        unsigned int i;
+        int order;
+
+        for (i = 0; i < ctx->number_of_samples; i++) {
+            *lpc_samples++ = *sample_buffer;
+            sample_buffer += ctx->num_channels;
+        }
+
+        order = ff_lpc_calc_coefs(&ctx->lpc_ctx, ctx->lpc_sample_buffer,
+                                  ctx->number_of_samples, MLP_MIN_LPC_ORDER,
+                                  max_order, 11, coefs, shift, FF_LPC_TYPE_LEVINSON, 0,
+                                  ORDER_METHOD_EST, MLP_MIN_LPC_SHIFT,
+                                  MLP_MAX_LPC_SHIFT, MLP_MIN_LPC_SHIFT);
+
+        fp->order = order;
+        fp->shift = shift[order-1];
+
+        for (i = 0; i < order; i++)
+            fcoeff[i] = coefs[order-1][i];
+
+        code_filter_coeffs(ctx, fp, fcoeff);
+    }
+}
+
+/** Tries to determine a good prediction filter, and applies it to the samples
+ *  buffer if the filter is good enough. Sets the filter data to be cleared if
+ *  no good filter was found.
+ */
+static void determine_filters(MLPEncodeContext *ctx)
+{
+    RestartHeader *rh = ctx->cur_restart_header;
+    int channel, filter;
+
+    for (channel = rh->min_channel; channel <= rh->max_channel; channel++) {
+        for (filter = 0; filter < NUM_FILTERS; filter++)
+            set_filter_params(ctx, channel, filter, 0);
+    }
+}
+
+enum MLPChMode {
+    MLP_CHMODE_LEFT_RIGHT,
+    MLP_CHMODE_LEFT_SIDE,
+    MLP_CHMODE_RIGHT_SIDE,
+    MLP_CHMODE_MID_SIDE,
+};
+
+static enum MLPChMode estimate_stereo_mode(MLPEncodeContext *ctx)
+{
+    uint64_t score[4], sum[4] = { 0, 0, 0, 0, };
+    int32_t *right_ch = ctx->sample_buffer + 1;
+    int32_t *left_ch  = ctx->sample_buffer;
+    int i;
+    enum MLPChMode best = 0;
+
+    for(i = 2; i < ctx->number_of_samples; i++) {
+        int32_t left  = left_ch [i * ctx->num_channels] - 2 * left_ch [(i - 1) * ctx->num_channels] + left_ch [(i - 2) * ctx->num_channels];
+        int32_t right = right_ch[i * ctx->num_channels] - 2 * right_ch[(i - 1) * ctx->num_channels] + right_ch[(i - 2) * ctx->num_channels];
+
+        sum[0] += FFABS( left        );
+        sum[1] += FFABS(        right);
+        sum[2] += FFABS((left + right) >> 1);
+        sum[3] += FFABS( left - right);
+    }
+
+    score[MLP_CHMODE_LEFT_RIGHT] = sum[0] + sum[1];
+    score[MLP_CHMODE_LEFT_SIDE]  = sum[0] + sum[3];
+    score[MLP_CHMODE_RIGHT_SIDE] = sum[1] + sum[3];
+    score[MLP_CHMODE_MID_SIDE]   = sum[2] + sum[3];
+
+    for(i = 1; i < 3; i++)
+        if(score[i] < score[best])
+            best = i;
+
+    return best;
+}
+
+/** Determines how many fractional bits are needed to encode matrix
+ *  coefficients. Also shifts the coefficients to fit within 2.14 bits.
+ */
+static void code_matrix_coeffs(MLPEncodeContext *ctx, unsigned int mat)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    MatrixParams *mp = &dp->matrix_params;
+    int32_t coeff_mask = 0;
+    unsigned int channel;
+    unsigned int bits;
+
+    for (channel = 0; channel < ctx->num_channels; channel++) {
+        int32_t coeff = mp->coeff[mat][channel];
+        coeff_mask |= coeff;
+    }
+
+    for (bits = 0; bits < 14 && !(coeff_mask & (1<<bits)); bits++);
+
+    mp->fbits   [mat] = 14 - bits;
+}
+
+/** Determines best coefficients to use for the lossless matrix. */
+static void lossless_matrix_coeffs(MLPEncodeContext *ctx)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    MatrixParams *mp = &dp->matrix_params;
+    unsigned int shift = 0;
+    unsigned int channel;
+    int mat;
+    enum MLPChMode mode;
+
+    /* No decorrelation for non-stereo. */
+    if (ctx->num_channels - 2 != 2) {
+        mp->count = 0;
+        return;
+    }
+
+    mode = estimate_stereo_mode(ctx);
+
+    switch(mode) {
+        /* TODO: add matrix for MID_SIDE */
+        case MLP_CHMODE_MID_SIDE:
+        case MLP_CHMODE_LEFT_RIGHT:
+            mp->count    = 0;
+            break;
+        case MLP_CHMODE_LEFT_SIDE:
+            mp->count    = 1;
+            mp->outch[0] = 1;
+            mp->coeff[0][0] =  1 << 14; mp->coeff[0][1] = -(1 << 14);
+            mp->coeff[0][2] =  0 << 14; mp->coeff[0][2] =   0 << 14;
+            mp->forco[0][0] =  1 << 14; mp->forco[0][1] = -(1 << 14);
+            mp->forco[0][2] =  0 << 14; mp->forco[0][2] =   0 << 14;
+            break;
+        case MLP_CHMODE_RIGHT_SIDE:
+            mp->count    = 1;
+            mp->outch[0] = 0;
+            mp->coeff[0][0] =  1 << 14; mp->coeff[0][1] =   1 << 14;
+            mp->coeff[0][2] =  0 << 14; mp->coeff[0][2] =   0 << 14;
+            mp->forco[0][0] =  1 << 14; mp->forco[0][1] = -(1 << 14);
+            mp->forco[0][2] =  0 << 14; mp->forco[0][2] =   0 << 14;
+            break;
+    }
+
+    for (mat = 0; mat < mp->count; mat++)
+        code_matrix_coeffs(ctx, mat);
+
+    for (channel = 0; channel < ctx->num_channels; channel++)
+        mp->shift[channel] = shift;
+}
+
+/** Min and max values that can be encoded with each codebook. The values for
+ *  the third codebook take into account the fact that the sign shift for this
+ *  codebook is outside the coded value, so it has one more bit of precision.
+ *  It should actually be -7 -> 7, shifted down by 0.5.
+ */
+static const int codebook_extremes[3][2] = {
+    {-9, 8}, {-8, 7}, {-15, 14},
+};
+
+/** Determines the amount of bits needed to encode the samples using no
+ *  codebooks and a specified offset.
+ */
+static void no_codebook_bits_offset(MLPEncodeContext *ctx,
+                                    unsigned int channel, int16_t offset,
+                                    int32_t min, int32_t max,
+                                    BestOffset *bo)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    int32_t unsign;
+    int lsb_bits;
+
+    min -= offset;
+    max -= offset;
+
+    lsb_bits = FFMAX(number_sbits(min), number_sbits(max)) - 1;
+
+    lsb_bits += !!lsb_bits;
+
+    unsign = 1 << (lsb_bits - 1);
+
+    bo->offset   = offset;
+    bo->lsb_bits = lsb_bits;
+    bo->bitcount = lsb_bits * dp->blocksize;
+    bo->min      = offset - unsign + 1;
+    bo->max      = offset + unsign;
+}
+
+/** Determines the least amount of bits needed to encode the samples using no
+ *  codebooks.
+ */
+static void no_codebook_bits(MLPEncodeContext *ctx,
+                             unsigned int channel,
+                             int32_t min, int32_t max,
+                             BestOffset *bo)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    int16_t offset;
+    int32_t unsign;
+    uint32_t diff;
+    int lsb_bits;
+
+    /* Set offset inside huffoffset's boundaries by adjusting extremes
+     * so that more bits are used, thus shifting the offset. */
+    if (min < HUFF_OFFSET_MIN)
+        max = FFMAX(max, 2 * HUFF_OFFSET_MIN - min + 1);
+    if (max > HUFF_OFFSET_MAX)
+        min = FFMIN(min, 2 * HUFF_OFFSET_MAX - max - 1);
+
+    /* Determine offset and minimum number of bits. */
+    diff = max - min;
+
+    lsb_bits = number_sbits(diff) - 1;
+
+    unsign = 1 << (lsb_bits - 1);
+
+    /* If all samples are the same (lsb_bits == 0), offset must be
+     * adjusted because of sign_shift. */
+    offset = min + diff / 2 + !!lsb_bits;
+
+    bo->offset   = offset;
+    bo->lsb_bits = lsb_bits;
+    bo->bitcount = lsb_bits * dp->blocksize;
+    bo->min      = max - unsign + 1;
+    bo->max      = min + unsign;
+}
+
+/** Determines the least amount of bits needed to encode the samples using a
+ *  given codebook and a given offset.
+ */
+static inline void codebook_bits_offset(MLPEncodeContext *ctx,
+                                        unsigned int channel, int codebook,
+                                        int32_t sample_min, int32_t sample_max,
+                                        int16_t offset, BestOffset *bo)
+{
+    int32_t codebook_min = codebook_extremes[codebook][0];
+    int32_t codebook_max = codebook_extremes[codebook][1];
+    int32_t *sample_buffer = ctx->sample_buffer + channel;
+    DecodingParams *dp = ctx->cur_decoding_params;
+    int codebook_offset  = 7 + (2 - codebook);
+    int32_t unsign_offset = offset;
+    int lsb_bits = 0, bitcount = 0;
+    int offset_min = INT_MAX, offset_max = INT_MAX;
+    int unsign, mask;
+    int i;
+
+    sample_min -= offset;
+    sample_max -= offset;
+
+    while (sample_min < codebook_min || sample_max > codebook_max) {
+        lsb_bits++;
+        sample_min >>= 1;
+        sample_max >>= 1;
+    }
+
+    unsign = 1 << lsb_bits;
+    mask   = unsign - 1;
+
+    if (codebook == 2) {
+        unsign_offset -= unsign;
+        lsb_bits++;
+    }
+
+    for (i = 0; i < dp->blocksize; i++) {
+        int32_t sample = *sample_buffer >> dp->quant_step_size[channel];
+        int temp_min, temp_max;
+
+        sample -= unsign_offset;
+
+        temp_min = sample & mask;
+        if (temp_min < offset_min)
+            offset_min = temp_min;
+
+        temp_max = unsign - temp_min - 1;
+        if (temp_max < offset_max)
+            offset_max = temp_max;
+
+        sample >>= lsb_bits;
+
+        bitcount += ff_mlp_huffman_tables[codebook][sample + codebook_offset][1];
+
+        sample_buffer += ctx->num_channels;
+    }
+
+    bo->offset   = offset;
+    bo->lsb_bits = lsb_bits;
+    bo->bitcount = lsb_bits * dp->blocksize + bitcount;
+    bo->min      = FFMAX(offset - offset_min, HUFF_OFFSET_MIN);
+    bo->max      = FFMIN(offset + offset_max, HUFF_OFFSET_MAX);
+}
+
+/** Determines the least amount of bits needed to encode the samples using a
+ *  given codebook. Searches for the best offset to minimize the bits.
+ */
+static inline void codebook_bits(MLPEncodeContext *ctx,
+                                 unsigned int channel, int codebook,
+                                 int offset, int32_t min, int32_t max,
+                                 BestOffset *bo, int direction)
+{
+    int previous_count = INT_MAX;
+    int offset_min, offset_max;
+    int is_greater = 0;
+
+    offset_min = FFMAX(min, HUFF_OFFSET_MIN);
+    offset_max = FFMIN(max, HUFF_OFFSET_MAX);
+
+    for (;;) {
+        BestOffset temp_bo;
+
+        codebook_bits_offset(ctx, channel, codebook,
+                             min, max, offset,
+                             &temp_bo);
+
+        if (temp_bo.bitcount < previous_count) {
+            if (temp_bo.bitcount < bo->bitcount)
+                *bo = temp_bo;
+
+            is_greater = 0;
+        } else if (++is_greater >= ctx->max_codebook_search)
+            break;
+
+        previous_count = temp_bo.bitcount;
+
+        if (direction) {
+            offset = temp_bo.max + 1;
+            if (offset > offset_max)
+                break;
+        } else {
+            offset = temp_bo.min - 1;
+            if (offset < offset_min)
+                break;
+        }
+    }
+}
+
+/** Determines the least amount of bits needed to encode the samples using
+ *  any or no codebook.
+ */
+static void determine_bits(MLPEncodeContext *ctx)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    RestartHeader  *rh = ctx->cur_restart_header;
+    unsigned int channel;
+
+    for (channel = 0; channel <= rh->max_channel; channel++) {
+        ChannelParams *cp = &ctx->cur_channel_params[channel];
+        int32_t *sample_buffer = ctx->sample_buffer + channel;
+        int32_t min = INT32_MAX, max = INT32_MIN;
+        int no_filters_used = !cp->filter_params[FIR].order;
+        int average = 0;
+        int offset = 0;
+        int i;
+
+        /* Determine extremes and average. */
+        for (i = 0; i < dp->blocksize; i++) {
+            int32_t sample = *sample_buffer >> dp->quant_step_size[channel];
+            if (sample < min)
+                min = sample;
+            if (sample > max)
+                max = sample;
+            average += sample;
+            sample_buffer += ctx->num_channels;
+        }
+        average /= dp->blocksize;
+
+        /* If filtering is used, we always set the offset to zero, otherwise
+         * we search for the offset that minimizes the bitcount. */
+        if (no_filters_used) {
+            no_codebook_bits(ctx, channel, min, max, &ctx->cur_best_offset[channel][0]);
+            offset = av_clip(average, HUFF_OFFSET_MIN, HUFF_OFFSET_MAX);
+        } else {
+            no_codebook_bits_offset(ctx, channel, offset, min, max, &ctx->cur_best_offset[channel][0]);
+        }
+
+        for (i = 1; i < NUM_CODEBOOKS; i++) {
+            BestOffset temp_bo = { 0, INT_MAX, 0, 0, 0, };
+            int16_t offset_max;
+
+            codebook_bits_offset(ctx, channel, i - 1,
+                                 min, max, offset,
+                                 &temp_bo);
+
+            if (no_filters_used) {
+                offset_max = temp_bo.max;
+
+                codebook_bits(ctx, channel, i - 1, temp_bo.min - 1,
+                            min, max, &temp_bo, 0);
+                codebook_bits(ctx, channel, i - 1, offset_max + 1,
+                            min, max, &temp_bo, 1);
+            }
+
+            ctx->cur_best_offset[channel][i] = temp_bo;
+        }
+    }
+}
+
+/****************************************************************************
+ *************** Functions that process the data in some way ****************
+ ****************************************************************************/
+
+#define SAMPLE_MAX(bitdepth) ((1 << (bitdepth - 1)) - 1)
+#define SAMPLE_MIN(bitdepth) (~SAMPLE_MAX(bitdepth))
+
+#define MSB_MASK(bits)  (-1u << bits)
+
+/** Applies the filter to the current samples, and saves the residual back
+ *  into the samples buffer. If the filter is too bad and overflows the
+ *  maximum amount of bits allowed (16 or 24), the samples buffer is left as is and
+ *  the function returns -1.
+ */
+static int apply_filter(MLPEncodeContext *ctx, unsigned int channel)
+{
+    FilterParams *fp[NUM_FILTERS] = { &ctx->cur_channel_params[channel].filter_params[FIR],
+                                      &ctx->cur_channel_params[channel].filter_params[IIR], };
+    int32_t *filter_state_buffer[NUM_FILTERS];
+    int32_t mask = MSB_MASK(ctx->cur_decoding_params->quant_step_size[channel]);
+    int32_t *sample_buffer = ctx->sample_buffer + channel;
+    unsigned int number_of_samples = ctx->number_of_samples;
+    unsigned int filter_shift = fp[FIR]->shift;
+    int filter;
+    int i;
+
+    for (i = 0; i < NUM_FILTERS; i++) {
+        unsigned int size = ctx->number_of_samples;
+        filter_state_buffer[i] = av_malloc(size*sizeof(int32_t));
+        if (!filter_state_buffer[i]) {
+            av_log(ctx->avctx, AV_LOG_ERROR,
+                   "Not enough memory for applying filters.\n");
+            return -1;
+        }
+    }
+
+    for (i = 0; i < 8; i++) {
+        filter_state_buffer[FIR][i] = *sample_buffer;
+        filter_state_buffer[IIR][i] = *sample_buffer;
+
+        sample_buffer += ctx->num_channels;
+    }
+
+    for (i = 8; i < number_of_samples; i++) {
+        int32_t sample = *sample_buffer;
+        unsigned int order;
+        int64_t accum = 0;
+        int32_t residual;
+
+        for (filter = 0; filter < NUM_FILTERS; filter++) {
+            int32_t *fcoeff = ctx->cur_channel_params[channel].coeff[filter];
+            for (order = 0; order < fp[filter]->order; order++)
+                accum += (int64_t)filter_state_buffer[filter][i - 1 - order] *
+                         fcoeff[order];
+        }
+
+        accum  >>= filter_shift;
+        residual = sample - (accum & mask);
+
+        if (residual < SAMPLE_MIN(ctx->wordlength) || residual > SAMPLE_MAX(ctx->wordlength))
+            return -1;
+
+        filter_state_buffer[FIR][i] = sample;
+        filter_state_buffer[IIR][i] = residual;
+
+        sample_buffer += ctx->num_channels;
+    }
+
+    sample_buffer = ctx->sample_buffer + channel;
+    for (i = 0; i < number_of_samples; i++) {
+        *sample_buffer = filter_state_buffer[IIR][i];
+
+        sample_buffer += ctx->num_channels;
+    }
+
+    for (i = 0; i < NUM_FILTERS; i++) {
+        av_freep(&filter_state_buffer[i]);
+    }
+
+    return 0;
+}
+
+static void apply_filters(MLPEncodeContext *ctx)
+{
+    RestartHeader *rh = ctx->cur_restart_header;
+    int channel;
+
+    for (channel = rh->min_channel; channel <= rh->max_channel; channel++) {
+        if (apply_filter(ctx, channel) < 0) {
+            /* Filter is horribly wrong.
+             * Clear filter params and update state. */
+            set_filter_params(ctx, channel, FIR, 1);
+            set_filter_params(ctx, channel, IIR, 1);
+            apply_filter(ctx, channel);
+        }
+    }
+}
+
+/** Generates two noise channels worth of data. */
+static void generate_2_noise_channels(MLPEncodeContext *ctx)
+{
+    int32_t *sample_buffer = ctx->sample_buffer + ctx->num_channels - 2;
+    RestartHeader *rh = ctx->cur_restart_header;
+    unsigned int i;
+    uint32_t seed = rh->noisegen_seed;
+
+    for (i = 0; i < ctx->number_of_samples; i++) {
+        uint16_t seed_shr7 = seed >> 7;
+        *sample_buffer++ = ((int8_t)(seed >> 15)) << rh->noise_shift;
+        *sample_buffer++ = ((int8_t) seed_shr7)   << rh->noise_shift;
+
+        seed = (seed << 16) ^ seed_shr7 ^ (seed_shr7 << 5);
+
+        sample_buffer += ctx->num_channels - 2;
+    }
+
+    rh->noisegen_seed = seed & ((1 << 24)-1);
+}
+
+/** Rematrixes all channels using chosen coefficients. */
+static void rematrix_channels(MLPEncodeContext *ctx)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    MatrixParams *mp = &dp->matrix_params;
+    int32_t *sample_buffer = ctx->sample_buffer;
+    unsigned int mat, i, maxchan;
+
+    maxchan = ctx->num_channels;
+
+    for (mat = 0; mat < mp->count; mat++) {
+        unsigned int msb_mask_bits = (ctx->avctx->sample_fmt == AV_SAMPLE_FMT_S16 ? 8 : 0) - mp->shift[mat];
+        int32_t mask = MSB_MASK(msb_mask_bits);
+        unsigned int outch = mp->outch[mat];
+
+        sample_buffer = ctx->sample_buffer;
+        for (i = 0; i < ctx->number_of_samples; i++) {
+            unsigned int src_ch;
+            int64_t accum = 0;
+
+            for (src_ch = 0; src_ch < maxchan; src_ch++) {
+                int32_t sample = *(sample_buffer + src_ch);
+                accum += (int64_t) sample * mp->forco[mat][src_ch];
+            }
+            sample_buffer[outch] = (accum >> 14) & mask;
+
+            sample_buffer += ctx->num_channels;
+        }
+    }
+}
+
+/****************************************************************************
+ **** Functions that deal with determining the best parameters and output ***
+ ****************************************************************************/
+
+typedef struct {
+    char    path[MAJOR_HEADER_INTERVAL + 3];
+    int     bitcount;
+} PathCounter;
+
+static const char *path_counter_codebook[] = { "0", "1", "2", "3", };
+
+#define ZERO_PATH               '0'
+#define CODEBOOK_CHANGE_BITS    21
+
+static void clear_path_counter(PathCounter *path_counter)
+{
+    unsigned int i;
+
+    for (i = 0; i < NUM_CODEBOOKS + 1; i++) {
+        path_counter[i].path[0]  = ZERO_PATH;
+        path_counter[i].path[1]  =      0x00;
+        path_counter[i].bitcount =         0;
+    }
+}
+
+static int compare_best_offset(BestOffset *prev, BestOffset *cur)
+{
+    if (prev->lsb_bits != cur->lsb_bits)
+        return 1;
+
+    return 0;
+}
+
+static int best_codebook_path_cost(MLPEncodeContext *ctx, unsigned int channel,
+                                   PathCounter *src, int cur_codebook)
+{
+    BestOffset *cur_bo, *prev_bo = restart_best_offset;
+    int bitcount = src->bitcount;
+    char *path = src->path + 1;
+    int prev_codebook;
+    int i;
+
+    for (i = 0; path[i]; i++)
+        prev_bo = ctx->best_offset[i][channel];
+
+    prev_codebook = path[i - 1] - ZERO_PATH;
+
+    cur_bo = ctx->best_offset[i][channel];
+
+    bitcount += cur_bo[cur_codebook].bitcount;
+
+    if (prev_codebook != cur_codebook ||
+        compare_best_offset(&prev_bo[prev_codebook], &cur_bo[cur_codebook]))
+        bitcount += CODEBOOK_CHANGE_BITS;
+
+    return bitcount;
+}
+
+static void set_best_codebook(MLPEncodeContext *ctx)
+{
+    DecodingParams *dp = ctx->cur_decoding_params;
+    RestartHeader *rh = ctx->cur_restart_header;
+    unsigned int channel;
+
+    for (channel = rh->min_channel; channel <= rh->max_channel; channel++) {
+        BestOffset *cur_bo, *prev_bo = restart_best_offset;
+        PathCounter path_counter[NUM_CODEBOOKS + 1];
+        unsigned int best_codebook;
+        unsigned int index;
+        char *best_path;
+
+        clear_path_counter(path_counter);
+
+        for (index = 0; index < ctx->number_of_subblocks; index++) {
+            unsigned int best_bitcount = INT_MAX;
+            unsigned int codebook;
+
+            cur_bo = ctx->best_offset[index][channel];
+
+            for (codebook = 0; codebook < NUM_CODEBOOKS; codebook++) {
+                int prev_best_bitcount = INT_MAX;
+                int last_best;
+
+                for (last_best = 0; last_best < 2; last_best++) {
+                    PathCounter *dst_path = &path_counter[codebook];
+                    PathCounter *src_path;
+                    int  temp_bitcount;
+
+                    /* First test last path with same headers,
+                     * then with last best. */
+                    if (last_best) {
+                        src_path = &path_counter[NUM_CODEBOOKS];
+                    } else {
+                        if (compare_best_offset(&prev_bo[codebook], &cur_bo[codebook]))
+                            continue;
+                        else
+                            src_path = &path_counter[codebook];
+                    }
+
+                    temp_bitcount = best_codebook_path_cost(ctx, channel, src_path, codebook);
+
+                    if (temp_bitcount < best_bitcount) {
+                        best_bitcount = temp_bitcount;
+                        best_codebook = codebook;
+                    }
+
+                    if (temp_bitcount < prev_best_bitcount) {
+                        prev_best_bitcount = temp_bitcount;
+                        if (src_path != dst_path)
+                            memcpy(dst_path, src_path, sizeof(PathCounter));
+                        av_strlcat(dst_path->path, path_counter_codebook[codebook], sizeof(dst_path->path));
+                        dst_path->bitcount = temp_bitcount;
+                    }
+                }
+            }
+
+            prev_bo = cur_bo;
+
+            memcpy(&path_counter[NUM_CODEBOOKS], &path_counter[best_codebook], sizeof(PathCounter));
+        }
+
+        best_path = path_counter[NUM_CODEBOOKS].path + 1;
+
+        /* Update context. */
+        for (index = 0; index < ctx->number_of_subblocks; index++) {
+            ChannelParams *cp = ctx->seq_channel_params + index*(ctx->avctx->channels) + channel;
+
+            best_codebook = *best_path++ - ZERO_PATH;
+            cur_bo = &ctx->best_offset[index][channel][best_codebook];
+
+            cp->huff_offset = cur_bo->offset;
+            cp->huff_lsbs   = cur_bo->lsb_bits + dp->quant_step_size[channel];
+            cp->codebook    = best_codebook;
+        }
+    }
+}
+
+/** Analyzes all collected bitcounts and selects the best parameters for each
+ *  individual access unit.
+ *  TODO This is just a stub!
+ */
+static void set_major_params(MLPEncodeContext *ctx)
+{
+    RestartHeader *rh = ctx->cur_restart_header;
+    unsigned int index;
+    unsigned int substr;
+    uint8_t max_huff_lsbs = 0;
+    uint8_t max_output_bits = 0;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        DecodingParams *seq_dp = (DecodingParams *) ctx->decoding_params+
+                                 (ctx->restart_intervals - 1)*(ctx->sequence_size)*(ctx->avctx->channels) +
+                                 (ctx->seq_offset[ctx->restart_intervals - 1])*(ctx->avctx->channels);
+
+        ChannelParams *seq_cp = (ChannelParams *) ctx->channel_params +
+                                (ctx->restart_intervals - 1)*(ctx->sequence_size)*(ctx->avctx->channels) +
+                                (ctx->seq_offset[ctx->restart_intervals - 1])*(ctx->avctx->channels);
+        unsigned int channel;
+        for (index = 0; index < ctx->seq_size[ctx->restart_intervals-1]; index++) {
+            memcpy(&ctx->major_decoding_params[index][substr], seq_dp + index*(ctx->num_substreams) + substr, sizeof(DecodingParams));
+            for (channel = 0; channel < ctx->avctx->channels; channel++) {
+                uint8_t huff_lsbs = (seq_cp + index*(ctx->avctx->channels) + channel)->huff_lsbs;
+                if (max_huff_lsbs < huff_lsbs)
+                    max_huff_lsbs = huff_lsbs;
+                memcpy(&ctx->major_channel_params[index][channel],
+                       (seq_cp + index*(ctx->avctx->channels) + channel),
+                       sizeof(ChannelParams));
+            }
+        }
+    }
+
+    rh->max_huff_lsbs = max_huff_lsbs;
+
+    for (index = 0; index < ctx->number_of_frames; index++)
+        if (max_output_bits < ctx->max_output_bits[index])
+            max_output_bits = ctx->max_output_bits[index];
+    rh->max_output_bits = max_output_bits;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+
+        ctx->cur_restart_header = &ctx->restart_header[substr];
+
+        ctx->prev_decoding_params = &restart_decoding_params[substr];
+        ctx->prev_channel_params = restart_channel_params;
+
+        for (index = 0; index < MAJOR_HEADER_INTERVAL + 1; index++) {
+                ctx->cur_decoding_params = &ctx->major_decoding_params[index][substr];
+                ctx->cur_channel_params = ctx->major_channel_params[index];
+
+                ctx->major_params_changed[index][substr] = compare_decoding_params(ctx);
+
+                ctx->prev_decoding_params = ctx->cur_decoding_params;
+                ctx->prev_channel_params = ctx->cur_channel_params;
+        }
+    }
+
+    ctx->major_number_of_subblocks = ctx->number_of_subblocks;
+    ctx->major_filter_state_subblock = 1;
+    ctx->major_cur_subblock_index = 0;
+}
+
+static void analyze_sample_buffer(MLPEncodeContext *ctx)
+{
+    ChannelParams *seq_cp = ctx->seq_channel_params;
+    DecodingParams *seq_dp = ctx->seq_decoding_params;
+    unsigned int index;
+    unsigned int substr;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+
+        ctx->cur_restart_header = &ctx->restart_header[substr];
+        ctx->cur_decoding_params = seq_dp + 1*(ctx->num_substreams) + substr;
+        ctx->cur_channel_params = seq_cp + 1*(ctx->avctx->channels);
+
+        determine_quant_step_size(ctx);
+        generate_2_noise_channels(ctx);
+        lossless_matrix_coeffs   (ctx);
+        rematrix_channels        (ctx);
+        determine_filters        (ctx);
+        apply_filters            (ctx);
+
+        copy_restart_frame_params(ctx, substr);
+
+        /* Copy frame_size from frames 0...max to decoding_params 1...max + 1
+         * decoding_params[0] is for the filter state subblock.
+         */
+        for (index = 0; index < ctx->number_of_frames; index++) {
+            DecodingParams *dp = seq_dp + (index + 1)*(ctx->num_substreams) + substr;
+            dp->blocksize = ctx->frame_size[index];
+        }
+        /* The official encoder seems to always encode a filter state subblock
+         * even if there are no filters. TODO check if it is possible to skip
+         * the filter state subblock for no filters.
+         */
+        (seq_dp + substr)->blocksize  = 8;
+        (seq_dp + 1*(ctx->num_substreams) + substr)->blocksize -= 8;
+
+        for (index = 0; index < ctx->number_of_subblocks; index++) {
+                ctx->cur_decoding_params = seq_dp + index*(ctx->num_substreams) + substr;
+                ctx->cur_channel_params = seq_cp + index*(ctx->avctx->channels);
+                ctx->cur_best_offset = ctx->best_offset[index];
+                determine_bits(ctx);
+                ctx->sample_buffer += ctx->cur_decoding_params->blocksize * ctx->num_channels;
+        }
+
+        set_best_codebook(ctx);
+    }
+}
+
+static void process_major_frame(MLPEncodeContext *ctx)
+{
+    unsigned int substr;
+
+    ctx->sample_buffer = ctx->major_inout_buffer;
+
+    ctx->starting_frame_index = 0;
+    ctx->number_of_frames = ctx->major_number_of_frames;
+    ctx->number_of_samples = ctx->major_frame_size;
+
+    for (substr = 0; substr < ctx->num_substreams; substr++) {
+        RestartHeader *rh = ctx->cur_restart_header;
+        unsigned int channel;
+
+        ctx->cur_restart_header = &ctx->restart_header[substr];
+
+        ctx->cur_decoding_params = &ctx->major_decoding_params[1][substr];
+        ctx->cur_channel_params = ctx->major_channel_params[1];
+
+        generate_2_noise_channels(ctx);
+        rematrix_channels        (ctx);
+
+        for (channel = rh->min_channel; channel <= rh->max_channel; channel++)
+            apply_filter(ctx, channel);
+    }
+}
+
+/****************************************************************************/
+
+static int mlp_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet)
+{
+    MLPEncodeContext *ctx = avctx->priv_data;
+    unsigned int bytes_written = 0;
+    int restart_frame, ret;
+    uint8_t *data;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 87500 * avctx->channels, 0)) < 0)
+        return ret;
+
+    if (!frame)
+        return 1;
+
+    /* add current frame to queue */
+    if (frame) {
+        if ((ret = ff_af_queue_add(&ctx->afq, frame)) < 0)
+            return ret;
+    }
+
+    data = frame->data[0];
+
+    ctx->frame_index = avctx->frame_number % ctx->max_restart_interval;
+
+    ctx->inout_buffer = ctx->major_inout_buffer
+                      + ctx->frame_index * ctx->one_sample_buffer_size;
+
+    if (ctx->last_frame == ctx->inout_buffer) {
+        return 0;
+    }
+
+    ctx->sample_buffer = ctx->major_scratch_buffer
+                       + ctx->frame_index * ctx->one_sample_buffer_size;
+
+    ctx->write_buffer = ctx->inout_buffer;
+
+    if (avctx->frame_number < ctx->max_restart_interval) {
+        if (data) {
+            goto input_and_return;
+        } else {
+            /* There are less frames than the requested major header interval.
+             * Update the context to reflect this.
+             */
+            ctx->max_restart_interval = avctx->frame_number;
+            ctx->frame_index = 0;
+
+            ctx->sample_buffer = ctx->major_scratch_buffer;
+            ctx->inout_buffer = ctx->major_inout_buffer;
+        }
+    }
+
+    if (ctx->frame_size[ctx->frame_index] > MAX_BLOCKSIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid frame size (%d > %d)\n",
+               ctx->frame_size[ctx->frame_index], MAX_BLOCKSIZE);
+        return -1;
+    }
+
+    restart_frame = !ctx->frame_index;
+
+    if (restart_frame) {
+        set_major_params(ctx);
+        if (ctx->min_restart_interval != ctx->max_restart_interval)
+        process_major_frame(ctx);
+    }
+
+    if (ctx->min_restart_interval == ctx->max_restart_interval)
+        ctx->write_buffer = ctx->sample_buffer;
+
+    bytes_written = write_access_unit(ctx, avpkt->data, avpkt->size, restart_frame);
+
+    ctx->timestamp += ctx->frame_size[ctx->frame_index];
+    ctx->dts       += ctx->frame_size[ctx->frame_index];
+
+input_and_return:
+
+    if (data) {
+        ctx->frame_size[ctx->frame_index] = avctx->frame_size;
+        ctx->next_major_frame_size += avctx->frame_size;
+        ctx->next_major_number_of_frames++;
+        input_data(ctx, data);
+    } else if (!ctx->last_frame) {
+        ctx->last_frame = ctx->inout_buffer;
+    }
+
+    restart_frame = (ctx->frame_index + 1) % ctx->min_restart_interval;
+
+    if (!restart_frame) {
+        int seq_index;
+
+        for (seq_index = 0;
+             seq_index < ctx->restart_intervals && (seq_index * ctx->min_restart_interval) <= ctx->avctx->frame_number;
+             seq_index++) {
+            unsigned int number_of_samples = 0;
+            unsigned int index;
+
+            ctx->sample_buffer = ctx->major_scratch_buffer;
+            ctx->inout_buffer = ctx->major_inout_buffer;
+            ctx->seq_index = seq_index;
+
+            ctx->starting_frame_index = (ctx->avctx->frame_number - (ctx->avctx->frame_number % ctx->min_restart_interval)
+                                      - (seq_index * ctx->min_restart_interval)) % ctx->max_restart_interval;
+            ctx->number_of_frames = ctx->next_major_number_of_frames;
+            ctx->number_of_subblocks = ctx->next_major_number_of_frames + 1;
+
+            ctx->seq_channel_params = (ChannelParams *) ctx->channel_params +
+                                      (ctx->frame_index / ctx->min_restart_interval)*(ctx->sequence_size)*(ctx->avctx->channels) +
+                                      (ctx->seq_offset[seq_index])*(ctx->avctx->channels);
+
+            ctx->seq_decoding_params = (DecodingParams *) ctx->decoding_params +
+                                       (ctx->frame_index / ctx->min_restart_interval)*(ctx->sequence_size)*(ctx->num_substreams) +
+                                       (ctx->seq_offset[seq_index])*(ctx->num_substreams);
+
+            for (index = 0; index < ctx->number_of_frames; index++) {
+                number_of_samples += ctx->frame_size[(ctx->starting_frame_index + index) % ctx->max_restart_interval];
+            }
+            ctx->number_of_samples = number_of_samples;
+
+            for (index = 0; index < ctx->seq_size[seq_index]; index++) {
+                clear_channel_params(ctx, ctx->seq_channel_params + index*(ctx->avctx->channels));
+                default_decoding_params(ctx, ctx->seq_decoding_params + index*(ctx->num_substreams));
+            }
+
+            input_to_sample_buffer(ctx);
+
+            analyze_sample_buffer(ctx);
+        }
+
+        if (ctx->frame_index == (ctx->max_restart_interval - 1)) {
+            ctx->major_frame_size = ctx->next_major_frame_size;
+            ctx->next_major_frame_size = 0;
+            ctx->major_number_of_frames = ctx->next_major_number_of_frames;
+            ctx->next_major_number_of_frames = 0;
+
+            if (!ctx->major_frame_size)
+                goto no_data_left;
+        }
+    }
+
+no_data_left:
+
+    ff_af_queue_remove(&ctx->afq, avctx->frame_size, &avpkt->pts,
+                       &avpkt->duration);
+    avpkt->size = bytes_written;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int mlp_encode_close(AVCodecContext *avctx)
+{
+    MLPEncodeContext *ctx = avctx->priv_data;
+
+    ff_lpc_end(&ctx->lpc_ctx);
+
+    av_freep(&ctx->lossless_check_data);
+    av_freep(&ctx->major_scratch_buffer);
+    av_freep(&ctx->major_inout_buffer);
+    av_freep(&ctx->lpc_sample_buffer);
+    av_freep(&ctx->decoding_params);
+    av_freep(&ctx->channel_params);
+    av_freep(&ctx->frame_size);
+    ff_af_queue_close(&ctx->afq);
+
+    return 0;
+}
+
+#if CONFIG_MLP_ENCODER
+AVCodec ff_mlp_encoder = {
+    .name                   ="mlp",
+    .long_name              = NULL_IF_CONFIG_SMALL("MLP (Meridian Lossless Packing)"),
+    .type                   = AVMEDIA_TYPE_AUDIO,
+    .id                     = AV_CODEC_ID_MLP,
+    .priv_data_size         = sizeof(MLPEncodeContext),
+    .init                   = mlp_encode_init,
+    .encode2                = mlp_encode_frame,
+    .close                  = mlp_encode_close,
+    .capabilities           = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_EXPERIMENTAL,
+    .sample_fmts            = (const enum AVSampleFormat[]) {AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE},
+    .supported_samplerates  = (const int[]) {44100, 48000, 88200, 96000, 176400, 192000, 0},
+    .channel_layouts        = ff_mlp_channel_layouts,
+};
+#endif
+#if CONFIG_TRUEHD_ENCODER
+AVCodec ff_truehd_encoder = {
+    .name                   ="truehd",
+    .long_name              = NULL_IF_CONFIG_SMALL("TrueHD"),
+    .type                   = AVMEDIA_TYPE_AUDIO,
+    .id                     = AV_CODEC_ID_TRUEHD,
+    .priv_data_size         = sizeof(MLPEncodeContext),
+    .init                   = mlp_encode_init,
+    .encode2                = mlp_encode_frame,
+    .close                  = mlp_encode_close,
+    .capabilities           = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_EXPERIMENTAL,
+    .sample_fmts            = (const enum AVSampleFormat[]) {AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE},
+    .supported_samplerates  = (const int[]) {44100, 48000, 88200, 96000, 176400, 192000, 0},
+    .channel_layouts        = (const uint64_t[]) {AV_CH_LAYOUT_STEREO, AV_CH_LAYOUT_5POINT0_BACK, AV_CH_LAYOUT_5POINT1_BACK, 0},
+};
+#endif
diff --git a/libavcodec/mlz.c b/libavcodec/mlz.c
new file mode 100644
index 0000000..ebce796
--- /dev/null
+++ b/libavcodec/mlz.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2016 Umair Khan <omerjerk@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mlz.h"
+
+av_cold void ff_mlz_init_dict(void* context, MLZ *mlz) {
+    mlz->dict = av_mallocz_array(TABLE_SIZE, sizeof(*mlz->dict));
+
+    mlz->flush_code            = FLUSH_CODE;
+    mlz->current_dic_index_max = DIC_INDEX_INIT;
+    mlz->dic_code_bit          = CODE_BIT_INIT;
+    mlz->bump_code             = (DIC_INDEX_INIT - 1);
+    mlz->next_code             = FIRST_CODE;
+    mlz->freeze_flag           = 0;
+    mlz->context               = context;
+}
+
+av_cold void ff_mlz_flush_dict(MLZ *mlz) {
+    MLZDict *dict = mlz->dict;
+    int i;
+    for ( i = 0; i < TABLE_SIZE; i++ ) {
+        dict[i].string_code = CODE_UNSET;
+        dict[i].parent_code = CODE_UNSET;
+        dict[i].match_len = 0;
+    }
+    mlz->current_dic_index_max = DIC_INDEX_INIT;
+    mlz->dic_code_bit          = CODE_BIT_INIT;  // DicCodeBitInit;
+    mlz->bump_code             = mlz->current_dic_index_max - 1;
+    mlz->next_code             = FIRST_CODE;
+    mlz->freeze_flag           = 0;
+}
+
+static void set_new_entry_dict(MLZDict* dict, int string_code, int parent_code, int char_code) {
+    dict[string_code].parent_code = parent_code;
+    dict[string_code].string_code = string_code;
+    dict[string_code].char_code   = char_code;
+    if (parent_code < FIRST_CODE) {
+        dict[string_code].match_len = 2;
+    } else {
+        dict[string_code].match_len = (dict[parent_code].match_len) + 1;
+    }
+}
+
+static int decode_string(MLZ* mlz, unsigned char *buff, int string_code, int *first_char_code, unsigned long bufsize) {
+    MLZDict* dict = mlz->dict;
+    unsigned long count, offset;
+    int current_code, parent_code, tmp_code;
+
+    count            = 0;
+    current_code     = string_code;
+    *first_char_code = CODE_UNSET;
+
+    while (count < bufsize) {
+        switch (current_code) {
+        case CODE_UNSET:
+            return count;
+            break;
+        default:
+            if (current_code < FIRST_CODE) {
+                *first_char_code = current_code;
+                buff[0] = current_code;
+                count++;
+                return count;
+            } else {
+                offset  = dict[current_code].match_len - 1;
+                tmp_code = dict[current_code].char_code;
+                if (offset >= bufsize) {
+                    av_log(mlz->context, AV_LOG_ERROR, "MLZ offset error.\n");
+                    return count;
+                }
+                buff[offset] = tmp_code;
+                count++;
+            }
+            current_code = dict[current_code].parent_code;
+            if ((current_code < 0) || (current_code > (DIC_INDEX_MAX - 1))) {
+                av_log(mlz->context, AV_LOG_ERROR, "MLZ dic index error.\n");
+                return count;
+            }
+            if (current_code > FIRST_CODE) {
+                parent_code = dict[current_code].parent_code;
+                offset = (dict[current_code].match_len) - 1;
+                if (parent_code < 0 || parent_code > DIC_INDEX_MAX-1) {
+                    av_log(mlz->context, AV_LOG_ERROR, "MLZ dic index error.\n");
+                    return count;
+                }
+                if (( offset > (DIC_INDEX_MAX - 1))) {
+                    av_log(mlz->context, AV_LOG_ERROR, "MLZ dic offset error.\n");
+                    return count;
+                }
+            }
+            break;
+        }
+    }
+    return count;
+}
+
+static int input_code(GetBitContext* gb, int len) {
+    int tmp_code = 0;
+    int i;
+    for (i = 0; i < len; ++i) {
+        tmp_code |= get_bits1(gb) << i;
+    }
+    return tmp_code;
+}
+
+int ff_mlz_decompression(MLZ* mlz, GetBitContext* gb, int size, unsigned char *buff) {
+    MLZDict *dict = mlz->dict;
+    unsigned long output_chars;
+    int string_code, last_string_code, char_code;
+
+    string_code = 0;
+    char_code   = -1;
+    last_string_code = -1;
+    output_chars = 0;
+
+    while (output_chars < size) {
+        string_code = input_code(gb, mlz->dic_code_bit);
+        switch (string_code) {
+            case FLUSH_CODE:
+            case MAX_CODE:
+                ff_mlz_flush_dict(mlz);
+                char_code = -1;
+                last_string_code = -1;
+                break;
+            case FREEZE_CODE:
+                mlz->freeze_flag = 1;
+                break;
+            default:
+                if (string_code > mlz->current_dic_index_max) {
+                    av_log(mlz->context, AV_LOG_ERROR, "String code %d exceeds maximum value of %d.\n", string_code, mlz->current_dic_index_max);
+                    return output_chars;
+                }
+                if (string_code == (int) mlz->bump_code) {
+                    ++mlz->dic_code_bit;
+                    mlz->current_dic_index_max *= 2;
+                    mlz->bump_code = mlz->current_dic_index_max - 1;
+                } else {
+                    if (string_code >= mlz->next_code) {
+                        int ret = decode_string(mlz, &buff[output_chars], last_string_code, &char_code, size - output_chars);
+                        if (ret < 0 || ret > size - output_chars) {
+                            av_log(mlz->context, AV_LOG_ERROR, "output chars overflow\n");
+                            return output_chars;
+                        }
+                        output_chars += ret;
+                        ret = decode_string(mlz, &buff[output_chars], char_code, &char_code, size - output_chars);
+                        if (ret < 0 || ret > size - output_chars) {
+                            av_log(mlz->context, AV_LOG_ERROR, "output chars overflow\n");
+                            return output_chars;
+                        }
+                        output_chars += ret;
+                        set_new_entry_dict(dict, mlz->next_code, last_string_code, char_code);
+                        if (mlz->next_code >= TABLE_SIZE - 1) {
+                            av_log(mlz->context, AV_LOG_ERROR, "Too many MLZ codes\n");
+                            return output_chars;
+                        }
+                        mlz->next_code++;
+                    } else {
+                        int ret = decode_string(mlz, &buff[output_chars], string_code, &char_code, size - output_chars);
+                        if (ret < 0 || ret > size - output_chars) {
+                            av_log(mlz->context, AV_LOG_ERROR, "output chars overflow\n");
+                            return output_chars;
+                        }
+                        output_chars += ret;
+                        if (output_chars <= size && !mlz->freeze_flag) {
+                            if (last_string_code != -1) {
+                                set_new_entry_dict(dict, mlz->next_code, last_string_code, char_code);
+                                if (mlz->next_code >= TABLE_SIZE - 1) {
+                                    av_log(mlz->context, AV_LOG_ERROR, "Too many MLZ codes\n");
+                                    return output_chars;
+                                }
+                                mlz->next_code++;
+                            }
+                        } else {
+                            break;
+                        }
+                    }
+                    last_string_code = string_code;
+                }
+                break;
+        }
+    }
+    return output_chars;
+}
diff --git a/libavcodec/mlz.h b/libavcodec/mlz.h
new file mode 100644
index 0000000..c3df52c
--- /dev/null
+++ b/libavcodec/mlz.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016 Umair Khan <omerjerk@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MLZ_H
+#define AVCODEC_MLZ_H
+
+#include "get_bits.h"
+
+#define CODE_UNSET          -1
+#define CODE_BIT_INIT       9
+#define DIC_INDEX_INIT      512     // 2^9
+#define DIC_INDEX_MAX       32768   // 2^15
+#define FLUSH_CODE          256
+#define FREEZE_CODE         257
+#define FIRST_CODE          258
+#define MAX_CODE            32767
+#define TABLE_SIZE          35023   // TABLE_SIZE must be a prime number
+
+/** Dictionary structure for mlz decompression
+ */
+typedef struct MLZDict {
+    int  string_code;
+    int  parent_code;
+    int  char_code;
+    int  match_len;
+} MLZDict;
+
+/** MLZ data strucure
+ */
+typedef struct MLZ {
+    int dic_code_bit;
+    int current_dic_index_max;
+    unsigned int bump_code;
+    unsigned int flush_code;
+    int next_code;
+    int freeze_flag;
+    MLZDict* dict;
+    void* context;
+} MLZ;
+
+/** Initialize the dictionary
+ */
+void ff_mlz_init_dict(void* context, MLZ *mlz);
+
+/** Flush the dictionary
+ */
+void ff_mlz_flush_dict(MLZ *dict);
+
+/** Run mlz decompression on the next size bits and the output will be stored in buff
+ */
+int ff_mlz_decompression(MLZ* mlz, GetBitContext* gb, int size, unsigned char *buff);
+
+#endif /*AVCODEC_MLZ_H*/
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index 504e765d..647a22e 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -2,20 +2,20 @@
  * MMAL Video Decoder
  * Copyright (c) 2015 Rodger Combs
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -89,6 +89,8 @@ typedef struct MMALDecodeContext {
     int eos_received;
     int eos_sent;
     int extradata_sent;
+    int interlaced_frame;
+    int top_field_first;
 } MMALDecodeContext;
 
 // Assume decoder is guaranteed to produce output after at least this many
@@ -227,9 +229,8 @@ static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
         status = *(uint32_t *)buffer->data;
         av_log(avctx, AV_LOG_ERROR, "MMAL error %d on control port\n", (int)status);
     } else {
-        char s[20];
-        av_get_codec_tag_string(s, sizeof(s), buffer->cmd);
-        av_log(avctx, AV_LOG_WARNING, "Unknown MMAL event %s on control port\n", s);
+        av_log(avctx, AV_LOG_WARNING, "Unknown MMAL event %s on control port\n",
+               av_fourcc2str(buffer->cmd));
     }
 
     mmal_buffer_header_release(buffer);
@@ -276,6 +277,7 @@ static int ffmal_update_format(AVCodecContext *avctx)
     int ret = 0;
     MMAL_COMPONENT_T *decoder = ctx->decoder;
     MMAL_ES_FORMAT_T *format_out = decoder->output[0]->format;
+    MMAL_PARAMETER_VIDEO_INTERLACE_TYPE_T interlace_type;
 
     ffmmal_poolref_unref(ctx->pool_out);
     if (!(ctx->pool_out = av_mallocz(sizeof(*ctx->pool_out)))) {
@@ -302,6 +304,16 @@ static int ffmal_update_format(AVCodecContext *avctx)
     if ((status = mmal_port_format_commit(decoder->output[0])))
         goto fail;
 
+    interlace_type.hdr.id = MMAL_PARAMETER_VIDEO_INTERLACE_TYPE;
+    interlace_type.hdr.size = sizeof(MMAL_PARAMETER_VIDEO_INTERLACE_TYPE_T);
+    status = mmal_port_parameter_get(decoder->output[0], &interlace_type.hdr);
+    if (status != MMAL_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Cannot read MMAL interlace information!\n");
+    } else {
+        ctx->interlaced_frame = (interlace_type.eMode != MMAL_InterlaceProgressive);
+        ctx->top_field_first = (interlace_type.eMode == MMAL_InterlaceFieldsInterleavedUpperFirst);
+    }
+
     if ((ret = ff_set_dimensions(avctx, format_out->es->video.crop.x + format_out->es->video.crop.width,
                                         format_out->es->video.crop.y + format_out->es->video.crop.height)) < 0)
         goto fail;
@@ -310,6 +322,10 @@ static int ffmal_update_format(AVCodecContext *avctx)
         avctx->sample_aspect_ratio.num = format_out->es->video.par.num;
         avctx->sample_aspect_ratio.den = format_out->es->video.par.den;
     }
+    if (format_out->es->video.frame_rate.num && format_out->es->video.frame_rate.den) {
+        avctx->framerate.num = format_out->es->video.frame_rate.num;
+        avctx->framerate.den = format_out->es->video.frame_rate.den;
+    }
 
     avctx->colorspace = ffmmal_csp_to_av_csp(format_out->es->video.color_space);
 
@@ -336,7 +352,6 @@ static av_cold int ffmmal_init_decoder(AVCodecContext *avctx)
     MMAL_STATUS_T status;
     MMAL_ES_FORMAT_T *format_in;
     MMAL_COMPONENT_T *decoder;
-    char tmp[32];
     int ret = 0;
 
     bcm_host_init();
@@ -362,6 +377,9 @@ static av_cold int ffmmal_init_decoder(AVCodecContext *avctx)
     case AV_CODEC_ID_MPEG2VIDEO:
         format_in->encoding = MMAL_ENCODING_MP2V;
         break;
+    case AV_CODEC_ID_MPEG4:
+        format_in->encoding = MMAL_ENCODING_MP4V;
+        break;
     case AV_CODEC_ID_VC1:
         format_in->encoding = MMAL_ENCODING_WVC1;
         break;
@@ -380,8 +398,8 @@ static av_cold int ffmmal_init_decoder(AVCodecContext *avctx)
     format_in->es->video.par.den = avctx->sample_aspect_ratio.den;
     format_in->flags = MMAL_ES_FORMAT_FLAG_FRAMED;
 
-    av_get_codec_tag_string(tmp, sizeof(tmp), format_in->encoding);
-    av_log(avctx, AV_LOG_DEBUG, "Using MMAL %s encoding.\n", tmp);
+    av_log(avctx, AV_LOG_DEBUG, "Using MMAL %s encoding.\n",
+           av_fourcc2str(format_in->encoding));
 
 #if HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS
     if (mmal_port_parameter_set_uint32(decoder->input[0], MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS,
@@ -608,6 +626,9 @@ static int ffmal_copy_frame(AVCodecContext *avctx,  AVFrame *frame,
     MMALDecodeContext *ctx = avctx->priv_data;
     int ret = 0;
 
+    frame->interlaced_frame = ctx->interlaced_frame;
+    frame->top_field_first = ctx->top_field_first;
+
     if (avctx->pix_fmt == AV_PIX_FMT_MMAL) {
         if (!ctx->pool_out)
             return AVERROR_UNKNOWN; // format change code failed with OOM previously
@@ -720,9 +741,8 @@ static int ffmmal_read_frame(AVCodecContext *avctx, AVFrame *frame, int *got_fra
             mmal_buffer_header_release(buffer);
             continue;
         } else if (buffer->cmd) {
-            char s[20];
-            av_get_codec_tag_string(s, sizeof(s), buffer->cmd);
-            av_log(avctx, AV_LOG_WARNING, "Unknown MMAL event %s on output port\n", s);
+            av_log(avctx, AV_LOG_WARNING, "Unknown MMAL event %s on output port\n",
+                   av_fourcc2str(buffer->cmd));
             goto done;
         } else if (buffer->length == 0) {
             // Unused output buffer that got drained after format change.
@@ -831,4 +851,5 @@ static const AVOption options[]={
 
 FFMMAL_DEC(h264, AV_CODEC_ID_H264)
 FFMMAL_DEC(mpeg2, AV_CODEC_ID_MPEG2VIDEO)
+FFMMAL_DEC(mpeg4, AV_CODEC_ID_MPEG4)
 FFMMAL_DEC(vc1, AV_CODEC_ID_VC1)
diff --git a/libavcodec/mmvideo.c b/libavcodec/mmvideo.c
index 0736630..04de6bb 100644
--- a/libavcodec/mmvideo.c
+++ b/libavcodec/mmvideo.c
@@ -2,20 +2,20 @@
  * American Laser Games MM Video Decoder
  * Copyright (c) 2006,2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,7 @@
 typedef struct MmContext {
     AVCodecContext *avctx;
     AVFrame *frame;
-    int palette[AVPALETTE_COUNT];
+    unsigned int palette[AVPALETTE_COUNT];
     GetByteContext gb;
 } MmContext;
 
@@ -75,17 +75,15 @@ static av_cold int mm_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int mm_decode_pal(MmContext *s)
+static void mm_decode_pal(MmContext *s)
 {
     int i;
 
     bytestream2_skip(&s->gb, 4);
     for (i = 0; i < 128; i++) {
-        s->palette[i] = bytestream2_get_be24(&s->gb);
+        s->palette[i] = 0xFFU << 24 | bytestream2_get_be24(&s->gb);
         s->palette[i+128] = s->palette[i]<<2;
     }
-
-    return 0;
 }
 
 /**
@@ -99,8 +97,7 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
     while (bytestream2_get_bytes_left(&s->gb) > 0) {
         int run_length, color;
 
-        // writes one more line when half_vert is true
-        if (y >= s->avctx->height + !!half_vert)
+        if (y >= s->avctx->height)
             return 0;
 
         color = bytestream2_get_byte(&s->gb);
@@ -114,12 +111,12 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
         if (half_horiz)
             run_length *=2;
 
-        if (s->avctx->width - x < run_length)
+        if (run_length > s->avctx->width - x)
             return AVERROR_INVALIDDATA;
 
         if (color) {
             memset(s->frame->data[0] + y*s->frame->linesize[0] + x, color, run_length);
-            if (half_vert)
+            if (half_vert && y + half_vert < s->avctx->height)
                 memset(s->frame->data[0] + (y+1)*s->frame->linesize[0] + x, color, run_length);
         }
         x+= run_length;
@@ -133,7 +130,7 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
     return 0;
 }
 
-/*
+/**
  * @param half_horiz Half horizontal resolution (0 or 1)
  * @param half_vert Half vertical resolution (0 or 1)
  */
@@ -204,13 +201,11 @@ static int mm_decode_frame(AVCodecContext *avctx,
     buf_size -= MM_PREAMBLE_SIZE;
     bytestream2_init(&s->gb, buf, buf_size);
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
         return res;
-    }
 
     switch(type) {
-    case MM_TYPE_PALETTE   : res = mm_decode_pal(s); return buf_size;
+    case MM_TYPE_PALETTE   : mm_decode_pal(s); return avpkt->size;
     case MM_TYPE_INTRA     : res = mm_decode_intra(s, 0, 0); break;
     case MM_TYPE_INTRA_HH  : res = mm_decode_intra(s, 1, 0); break;
     case MM_TYPE_INTRA_HHV : res = mm_decode_intra(s, 1, 1); break;
@@ -231,7 +226,7 @@ static int mm_decode_frame(AVCodecContext *avctx,
 
     *got_frame      = 1;
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int mm_decode_end(AVCodecContext *avctx)
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index c6dd3b8..759eea4 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -5,20 +5,20 @@
  *
  * new motion estimation (X1/EPZS) by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,9 +38,6 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 #define P_LEFT P[1]
 #define P_TOP P[2]
 #define P_TOPRIGHT P[3]
@@ -118,7 +115,7 @@ static av_always_inline int cmp_direct_inline(MpegEncContext *s, const int x, co
     uint8_t * const * const src= c->src[src_index];
     int d;
     //FIXME check chroma 4mv, (no crashes ...)
-        assert(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1));
+        av_assert2(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1));
         if(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1)){
             const int time_pp= s->pp_time;
             const int time_pb= s->pb_time;
@@ -160,14 +157,14 @@ static av_always_inline int cmp_direct_inline(MpegEncContext *s, const int x, co
                     c->qpel_avg[1][bxy](c->temp     + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride     + 8*stride, stride);
                     c->qpel_avg[1][bxy](c->temp + 8 + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8 + 8*stride, stride);
                 }else{
-                    assert((fx>>1) + 16*s->mb_x >= -16);
-                    assert((fy>>1) + 16*s->mb_y >= -16);
-                    assert((fx>>1) + 16*s->mb_x <= s->width);
-                    assert((fy>>1) + 16*s->mb_y <= s->height);
-                    assert((bx>>1) + 16*s->mb_x >= -16);
-                    assert((by>>1) + 16*s->mb_y >= -16);
-                    assert((bx>>1) + 16*s->mb_x <= s->width);
-                    assert((by>>1) + 16*s->mb_y <= s->height);
+                    av_assert2((fx>>1) + 16*s->mb_x >= -16);
+                    av_assert2((fy>>1) + 16*s->mb_y >= -16);
+                    av_assert2((fx>>1) + 16*s->mb_x <= s->width);
+                    av_assert2((fy>>1) + 16*s->mb_y <= s->height);
+                    av_assert2((bx>>1) + 16*s->mb_x >= -16);
+                    av_assert2((by>>1) + 16*s->mb_y >= -16);
+                    av_assert2((bx>>1) + 16*s->mb_x <= s->width);
+                    av_assert2((by>>1) + 16*s->mb_y <= s->height);
 
                     c->hpel_put[0][fxy](c->temp, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 16);
                     c->hpel_avg[0][bxy](c->temp, ref[8] + (bx>>1) + (by>>1)*stride, stride, 16);
@@ -186,8 +183,8 @@ static av_always_inline int cmp_inline(MpegEncContext *s, const int x, const int
     const int stride= c->stride;
     const int uvstride= c->uvstride;
     const int dxy= subx + (suby<<(1+qpel)); //FIXME log2_subpel?
-    const int hx= subx + (x<<(1+qpel));
-    const int hy= suby + (y<<(1+qpel));
+    const int hx= subx + x*(1<<(1+qpel));
+    const int hy= suby + y*(1<<(1+qpel));
     uint8_t * const * const ref= c->ref[ref_index];
     uint8_t * const * const src= c->src[src_index];
     int d;
@@ -195,7 +192,13 @@ static av_always_inline int cmp_inline(MpegEncContext *s, const int x, const int
         int uvdxy;              /* no, it might not be used uninitialized */
         if(dxy){
             if(qpel){
-                c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                if (h << size == 16) {
+                    c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                } else if (size == 0 && h == 8) {
+                    c->qpel_put[1][dxy](c->temp    , ref[0] + x + y*stride    , stride);
+                    c->qpel_put[1][dxy](c->temp + 8, ref[0] + x + y*stride + 8, stride);
+                } else
+                    av_assert2(0);
                 if(chroma){
                     int cx= hx/2;
                     int cy= hy/2;
@@ -305,13 +308,16 @@ int ff_init_me(MpegEncContext *s){
     int cache_size= FFMIN(ME_MAP_SIZE>>ME_MAP_SHIFT, 1<<ME_MAP_SHIFT);
     int dia_size= FFMAX(FFABS(s->avctx->dia_size)&255, FFABS(s->avctx->pre_dia_size)&255);
 
-    if(FFMIN(s->avctx->dia_size, s->avctx->pre_dia_size) < -ME_MAP_SIZE){
+    if(FFMIN(s->avctx->dia_size, s->avctx->pre_dia_size) < -FFMIN(ME_MAP_SIZE, MAX_SAB_SIZE)){
         av_log(s->avctx, AV_LOG_ERROR, "ME_MAP size is too small for SAB diamond\n");
         return -1;
     }
 
     c->avctx= s->avctx;
 
+    if(s->codec_id == AV_CODEC_ID_H261)
+        c->avctx->me_sub_cmp = c->avctx->me_cmp;
+
     if(cache_size < 2*dia_size && !c->stride){
         av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n");
     }
@@ -360,12 +366,14 @@ int ff_init_me(MpegEncContext *s){
     /* 8x8 fullpel search would need a 4x4 chroma compare, which we do
      * not have yet, and even if we had, the motion estimation code
      * does not expect it. */
-    if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
-        s->mecc.me_cmp[2] = zero_cmp;
-    if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
-        s->mecc.me_sub_cmp[2] = zero_cmp;
-    c->hpel_put[2][0]= c->hpel_put[2][1]=
-    c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    if (s->codec_id != AV_CODEC_ID_SNOW) {
+        if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
+            s->mecc.me_cmp[2] = zero_cmp;
+        if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
+            s->mecc.me_sub_cmp[2] = zero_cmp;
+        c->hpel_put[2][0]= c->hpel_put[2][1]=
+        c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    }
 
     if(s->codec_id == AV_CODEC_ID_H261){
         c->sub_motion_search= no_sub_motion_search;
@@ -391,10 +399,9 @@ static int sad_hpel_motion_search(MpegEncContext * s,
     int mx, my, dminh;
     uint8_t *pix, *ptr;
     int stride= c->stride;
-    const int flags= c->sub_flags;
     LOAD_COMMON
 
-    assert(flags == 0);
+    av_assert2(c->sub_flags == 0);
 
     if(c->skip){
         *mx_ptr = 0;
@@ -414,13 +421,13 @@ static int sad_hpel_motion_search(MpegEncContext * s,
         my > ymin && my < ymax) {
         int dx=0, dy=0;
         int d, pen_x, pen_y;
-        const int index= (my<<ME_MAP_SHIFT) + mx;
+        const int index= my*(1<<ME_MAP_SHIFT) + mx;
         const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
         const int l= score_map[(index- 1               )&(ME_MAP_SIZE-1)];
         const int r= score_map[(index+ 1               )&(ME_MAP_SIZE-1)];
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
-        mx<<=1;
-        my<<=1;
+        mx += mx;
+        my += my;
 
 
         pen_x= pred_x + mx;
@@ -478,8 +485,8 @@ static int sad_hpel_motion_search(MpegEncContext * s,
         my+=dy;
 
     }else{
-        mx<<=1;
-        my<<=1;
+        mx += mx;
+        my += my;
     }
 
     *mx_ptr = mx;
@@ -518,6 +525,7 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
 {
     MotionEstContext * const c= &s->me;
     int range= c->avctx->me_range >> (1 + !!(c->flags&FLAG_QPEL));
+    int max_range = MAX_MV >> (1 + !!(c->flags&FLAG_QPEL));
 /*
     if(c->avctx->me_range) c->range= c->avctx->me_range >> 1;
     else                   c->range= 16;
@@ -525,8 +533,8 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
     if (s->unrestricted_mv) {
         c->xmin = - x - 16;
         c->ymin = - y - 16;
-        c->xmax = - x + s->mb_width *16;
-        c->ymax = - y + s->mb_height*16;
+        c->xmax = - x + s->width;
+        c->ymax = - y + s->height;
     } else if (s->out_format == FMT_H261){
         // Search range of H.261 is different from other codec standards
         c->xmin = (x > 15) ? - 15 : 0;
@@ -539,6 +547,8 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
         c->xmax = - x + s->mb_width *16 - 16;
         c->ymax = - y + s->mb_height*16 - 16;
     }
+    if(!range || range > max_range)
+        range = max_range;
     if(range){
         c->xmin = FFMAX(c->xmin,-range);
         c->xmax = FFMIN(c->xmax, range);
@@ -565,10 +575,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
     const int h=8;
     int block;
     int P[10][2];
-    int dmin_sum=0, mx4_sum=0, my4_sum=0;
+    int dmin_sum=0, mx4_sum=0, my4_sum=0, i;
     int same=1;
     const int stride= c->stride;
     uint8_t *mv_penalty= c->current_mv_penalty;
+    int safety_clipping= s->unrestricted_mv && (s->width&15) && (s->height&15);
 
     init_mv4_ref(c);
 
@@ -580,6 +591,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         const int mot_stride = s->b8_stride;
         const int mot_xy = s->block_index[block];
 
+        if(safety_clipping){
+            c->xmax = - 16*s->mb_x + s->width  - 8*(block &1);
+            c->ymax = - 16*s->mb_y + s->height - 8*(block>>1);
+        }
+
         P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0];
         P_LEFT[1] = s->current_picture.motion_val[0][mot_xy - 1][1];
 
@@ -607,8 +623,17 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         }
         P_MV1[0]= mx;
         P_MV1[1]= my;
+        if(safety_clipping)
+            for(i=1; i<10; i++){
+                if (s->first_slice_line && block<2 && i>1 && i<9)
+                    continue;
+                if (i>4 && i<9)
+                    continue;
+                if(P[i][0] > (c->xmax<<shift)) P[i][0]= (c->xmax<<shift);
+                if(P[i][1] > (c->ymax<<shift)) P[i][1]= (c->ymax<<shift);
+            }
 
-        dmin4 = epzs_motion_search4(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift);
+        dmin4 = epzs_motion_search2(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift, 1);
 
         dmin4= c->sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h);
 
@@ -741,8 +766,8 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
             int16_t (*mv_table)[2]= mv_tables[block][field_select];
 
             if(user_field_select){
-                assert(field_select==0 || field_select==1);
-                assert(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
+                av_assert1(field_select==0 || field_select==1);
+                av_assert1(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
                 if(field_select_tables[block][xy] != field_select)
                     continue;
             }
@@ -770,7 +795,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
             P_MV1[0]= mx; //FIXME not correct if block != field_select
             P_MV1[1]= my / 2;
 
-            dmin = epzs_motion_search2(s, &mx_i, &my_i, P, block, field_select+ref_index, mv_table, (1<<16)>>1);
+            dmin = epzs_motion_search2(s, &mx_i, &my_i, P, block, field_select+ref_index, mv_table, (1<<16)>>1, 0);
 
             dmin= c->sub_motion_search(s, &mx_i, &my_i, dmin, block, field_select+ref_index, size, h);
 
@@ -839,6 +864,10 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
         return lambda>>FF_LAMBDA_SHIFT;
     case FF_CMP_DCT:
         return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
+    case FF_CMP_W53:
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_W97:
+        return (2*lambda)>>(FF_LAMBDA_SHIFT);
     case FF_CMP_SATD:
     case FF_CMP_DCT264:
         return (2*lambda)>>FF_LAMBDA_SHIFT;
@@ -848,6 +877,7 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
     case FF_CMP_NSSE:
         return lambda2>>FF_LAMBDA_SHIFT;
     case FF_CMP_BIT:
+    case FF_CMP_MEDIAN_SAD:
         return 1;
     }
 }
@@ -867,14 +897,14 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 
     init_ref(c, s->new_picture.f->data, s->last_picture.f->data, NULL, 16*mb_x, 16*mb_y, 0);
 
-    assert(s->quarter_sample==0 || s->quarter_sample==1);
-    assert(s->linesize == c->stride);
-    assert(s->uvlinesize == c->uvstride);
+    av_assert0(s->quarter_sample==0 || s->quarter_sample==1);
+    av_assert0(s->linesize == c->stride);
+    av_assert0(s->uvlinesize == c->uvstride);
 
     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
-    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
+    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_DMV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
     c->skip=0;
@@ -906,10 +936,10 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             P_TOPRIGHT[1] = s->current_picture.motion_val[0][mot_xy - mot_stride + 2][1];
             if (P_TOP[1] > (c->ymax << shift))
                 P_TOP[1] =  c->ymax << shift;
-            if (P_TOPRIGHT[0] < (c->xmin << shift))
-                P_TOPRIGHT[0] =  c->xmin << shift;
-            if (P_TOPRIGHT[1] > (c->ymax << shift))
-                P_TOPRIGHT[1] =  c->ymax << shift;
+            if (P_TOPRIGHT[0] < (c->xmin * (1 << shift)))
+                P_TOPRIGHT[0] =  c->xmin * (1 << shift);
+            if (P_TOPRIGHT[1] > (c->ymax * (1 << shift)))
+                P_TOPRIGHT[1] =  c->ymax * (1 << shift);
 
             P_MEDIAN[0] = mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
             P_MEDIAN[1] = mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
@@ -1038,10 +1068,10 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
     const int xy= mb_x + mb_y*s->mb_stride;
     init_ref(c, s->new_picture.f->data, s->last_picture.f->data, NULL, 16*mb_x, 16*mb_y, 0);
 
-    assert(s->quarter_sample==0 || s->quarter_sample==1);
+    av_assert0(s->quarter_sample==0 || s->quarter_sample==1);
 
     c->pre_penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_pre_cmp);
-    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
+    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_DMV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
     c->skip=0;
@@ -1090,7 +1120,7 @@ static int estimate_motion_b(MpegEncContext *s, int mb_x, int mb_y,
     const int shift= 1+s->quarter_sample;
     const int mot_stride = s->mb_stride;
     const int mot_xy = mb_y*mot_stride + mb_x;
-    uint8_t * const mv_penalty= c->mv_penalty[f_code] + MAX_MV;
+    uint8_t * const mv_penalty= c->mv_penalty[f_code] + MAX_DMV;
     int mv_scale;
 
     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
@@ -1154,8 +1184,8 @@ static inline int check_bidir_mv(MpegEncContext * s,
     //FIXME better f_code prediction (max mv & distance)
     //FIXME pointers
     MotionEstContext * const c= &s->me;
-    uint8_t * const mv_penalty_f= c->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    uint8_t * const mv_penalty_b= c->mv_penalty[s->b_code] + MAX_MV; // f_code of the prev frame
+    uint8_t * const mv_penalty_f= c->mv_penalty[s->f_code] + MAX_DMV; // f_code of the prev frame
+    uint8_t * const mv_penalty_b= c->mv_penalty[s->b_code] + MAX_DMV; // f_code of the prev frame
     int stride= c->stride;
     uint8_t *dest_y = c->scratchpad;
     uint8_t *ptr;
@@ -1368,7 +1398,7 @@ static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
     int mx, my, xmin, xmax, ymin, ymax;
     int16_t (*mv_table)[2]= s->b_direct_mv_table;
 
-    c->current_mv_penalty= c->mv_penalty[1] + MAX_MV;
+    c->current_mv_penalty= c->mv_penalty[1] + MAX_DMV;
     ymin= xmin=(-32)>>shift;
     ymax= xmax=   31>>shift;
 
@@ -1406,7 +1436,7 @@ static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
         if(s->mv_type == MV_TYPE_16X16) break;
     }
 
-    assert(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
+    av_assert2(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
 
     if(xmax < 0 || xmin >0 || ymax < 0 || ymin > 0){
         s->b_direct_mv_table[mot_xy][0]= 0;
@@ -1504,11 +1534,11 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
     if (s->avctx->flags & AV_CODEC_FLAG_INTERLACED_ME) {
 //FIXME mb type penalty
         c->skip=0;
-        c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
+        c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_DMV;
         fimin= interlaced_search(s, 0,
                                  s->b_field_mv_table[0], s->b_field_select_table[0],
                                  s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 0);
-        c->current_mv_penalty= c->mv_penalty[s->b_code] + MAX_MV;
+        c->current_mv_penalty= c->mv_penalty[s->b_code] + MAX_DMV;
         bimin= interlaced_search(s, 2,
                                  s->b_field_mv_table[1], s->b_field_select_table[1],
                                  s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 0);
@@ -1623,12 +1653,12 @@ void ff_fix_long_p_mvs(MpegEncContext * s)
     MotionEstContext * const c= &s->me;
     const int f_code= s->f_code;
     int y, range;
-    assert(s->pict_type==AV_PICTURE_TYPE_P);
+    av_assert0(s->pict_type==AV_PICTURE_TYPE_P);
 
     range = (((s->out_format == FMT_MPEG1 || s->msmpeg4_version) ? 8 : 16) << f_code);
 
-    assert(range <= 16 || !s->msmpeg4_version);
-    assert(range <=256 || !(s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL));
+    av_assert0(range <= 16 || !s->msmpeg4_version);
+    av_assert0(range <=256 || !(s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL));
 
     if(c->avctx->me_range && range > c->avctx->me_range) range= c->avctx->me_range;
 
diff --git a/libavcodec/motion_est.h b/libavcodec/motion_est.h
index 3b63972..3b3a8d7 100644
--- a/libavcodec/motion_est.h
+++ b/libavcodec/motion_est.h
@@ -1,25 +1,25 @@
 /*
  * Motion estimation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_MOTIONEST_H
-#define AVCODEC_MOTIONEST_H
+#ifndef AVCODEC_MOTION_EST_H
+#define AVCODEC_MOTION_EST_H
 
 #include <stdint.h>
 
@@ -29,7 +29,12 @@
 
 struct MpegEncContext;
 
-#define MAX_MV 2048
+#if ARCH_IA64 // Limit static arrays to avoid gcc failing "short data segment overflowed"
+#define MAX_MV 1024
+#else
+#define MAX_MV 4096
+#endif
+#define MAX_DMV (2*MAX_MV)
 #define ME_MAP_SIZE 64
 
 #define FF_ME_ZERO 0
@@ -77,15 +82,15 @@ typedef struct MotionEstContext {
     int stride;
     int uvstride;
     /* temp variables for picture complexity calculation */
-    int mc_mb_var_sum_temp;
-    int mb_var_sum_temp;
+    int64_t mc_mb_var_sum_temp;
+    int64_t mb_var_sum_temp;
     int scene_change_score;
 
     op_pixels_func(*hpel_put)[4];
     op_pixels_func(*hpel_avg)[4];
     qpel_mc_func(*qpel_put)[16];
     qpel_mc_func(*qpel_avg)[16];
-    uint8_t (*mv_penalty)[MAX_MV * 2 + 1]; ///< bit amount needed to encode a MV
+    uint8_t (*mv_penalty)[MAX_DMV * 2 + 1]; ///< bit amount needed to encode a MV
     uint8_t *current_mv_penalty;
     int (*sub_motion_search)(struct MpegEncContext *s,
                              int *mx_ptr, int *my_ptr, int dmin,
@@ -127,4 +132,4 @@ void ff_fix_long_mvs(struct MpegEncContext *s, uint8_t *field_select_table,
                      int field_select, int16_t (*mv_table)[2], int f_code,
                      int type, int truncate);
 
-#endif /* AVCODEC_MOTIONEST_H */
+#endif /* AVCODEC_MOTION_EST_H */
diff --git a/libavcodec/motion_est_template.c b/libavcodec/motion_est_template.c
index c655e19..014038e 100644
--- a/libavcodec/motion_est_template.c
+++ b/libavcodec/motion_est_template.c
@@ -2,20 +2,20 @@
  * Motion estimation
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
  * Motion estimation template.
  */
 
+#include "libavutil/qsort.h"
 #include "mpegvideo.h"
 
 //Let us hope gcc will remove the unused vars ...(gcc 3.2.2 seems to do it ...)
@@ -91,19 +92,18 @@ static int hpel_motion_search(MpegEncContext * s,
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]
                      + (mv_penalty[bx   - pred_x] + mv_penalty[by+2 - pred_y])*c->penalty_factor;
 
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
         unsigned key;
         unsigned map_generation= c->map_generation;
-#ifndef NDEBUG
-        uint32_t *map= c->map;
-#endif
         key= ((my-1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
-        assert(map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
         key= ((my+1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
-        assert(map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
         key= ((my)<<ME_MAP_MV_BITS) + (mx+1) + map_generation;
-        assert(map[(index+1)&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index+1)&(ME_MAP_SIZE-1)] == key);
         key= ((my)<<ME_MAP_MV_BITS) + (mx-1) + map_generation;
-        assert(map[(index-1)&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index-1)&(ME_MAP_SIZE-1)] == key);
+#endif
         if(t<=b){
             CHECK_HALF_MV(0, 1, mx  ,my-1)
             if(l<=r){
@@ -143,7 +143,7 @@ static int hpel_motion_search(MpegEncContext * s,
             }
             CHECK_HALF_MV(0, 1, mx  , my)
         }
-        assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
+        av_assert2(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
     }
 
     *mx_ptr = bx;
@@ -181,9 +181,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
     cmp_sub        = s->mecc.mb_cmp[size];
     chroma_cmp_sub = s->mecc.mb_cmp[size + 1];
 
-//    assert(!c->skip);
-//    assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp);
-
     d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
     //FIXME check cbp before adding penalty for (0,0) vector
     if(add_rate && (mx || my || size>0))
@@ -302,7 +299,7 @@ static int qpel_motion_search(MpegEncContext * s,
             const int cy2= b + t - 2*c;
             int cxy;
 
-            if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == (my<<ME_MAP_MV_BITS) + mx + map_generation && 0){ //FIXME
+            if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == ((my-1)<<ME_MAP_MV_BITS) + (mx-1) + map_generation){
                 tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
             }else{
                 tl= cmp(s, mx-1, my-1, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);//FIXME wrong if chroma me is different
@@ -310,11 +307,11 @@ static int qpel_motion_search(MpegEncContext * s,
 
             cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c;
 
-            assert(16*cx2 + 4*cx + 32*c == 32*r);
-            assert(16*cx2 - 4*cx + 32*c == 32*l);
-            assert(16*cy2 + 4*cy + 32*c == 32*b);
-            assert(16*cy2 - 4*cy + 32*c == 32*t);
-            assert(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
+            av_assert2(16*cx2 + 4*cx + 32*c == 32*r);
+            av_assert2(16*cx2 - 4*cx + 32*c == 32*l);
+            av_assert2(16*cy2 + 4*cy + 32*c == 32*b);
+            av_assert2(16*cy2 - 4*cy + 32*c == 32*t);
+            av_assert2(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
 
             for(ny= -3; ny <= 3; ny++){
                 for(nx= -3; nx <= 3; nx++){
@@ -347,7 +344,7 @@ static int qpel_motion_search(MpegEncContext * s,
             CHECK_QUARTER_MV(nx&3, ny&3, nx>>2, ny>>2)
         }
 
-        assert(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
+        av_assert2(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
 
         *mx_ptr = bx;
         *my_ptr = by;
@@ -362,17 +359,17 @@ static int qpel_motion_search(MpegEncContext * s,
 
 #define CHECK_MV(x,y)\
 {\
-    const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
-    assert((x) >= xmin);\
-    assert((x) <= xmax);\
-    assert((y) >= ymin);\
-    assert((y) <= ymax);\
+    const unsigned key = ((unsigned)(y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((unsigned)(y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    av_assert2((x) >= xmin);\
+    av_assert2((x) <= xmax);\
+    av_assert2((y) >= ymin);\
+    av_assert2((y) <= ymax);\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+        d += (mv_penalty[((x)*(1<<shift))-pred_x] + mv_penalty[((y)*(1<<shift))-pred_y])*penalty_factor;\
         COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
     }\
 }
@@ -388,13 +385,13 @@ static int qpel_motion_search(MpegEncContext * s,
 
 #define CHECK_MV_DIR(x,y,new_dir)\
 {\
-    const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    const unsigned key = ((unsigned)(y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((unsigned)(y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+        d += (mv_penalty[(int)((unsigned)(x)<<shift)-pred_x] + mv_penalty[(int)((unsigned)(y)<<shift)-pred_y])*penalty_factor;\
         if(d<dmin){\
             best[0]=x;\
             best[1]=y;\
@@ -405,10 +402,10 @@ static int qpel_motion_search(MpegEncContext * s,
 }
 
 #define check(x,y,S,v)\
-if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
-if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
-if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
-if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
+if( (x)<(xmin<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
 
 #define LOAD_COMMON2\
     uint32_t *map= c->map;\
@@ -430,8 +427,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best,
     chroma_cmpf = s->mecc.me_cmp[size + 1];
 
     { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
-        const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
-        const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
+        const unsigned key = ((unsigned)best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
+        const int index= (((unsigned)best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
         if (map[index] != key) { // this will be executed only very rarely
             score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
             map[index]= key;
@@ -693,6 +690,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
     LOAD_COMMON2
     unsigned map_generation = c->map_generation;
 
+    av_assert1(minima_count <= MAX_SAB_SIZE);
+
     cmpf        = s->mecc.me_cmp[size];
     chroma_cmpf = s->mecc.me_cmp[size + 1];
 
@@ -725,7 +724,7 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
         j++;
     }
 
-    qsort(minima, j, sizeof(Minima), minima_cmp);
+    AV_QSORT(minima, j, Minima, minima_cmp);
 
     for(; j<minima_count; j++){
         minima[j].height=256*256*256*64;
@@ -890,7 +889,7 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int
 
     map_generation= update_map_generation(c);
 
-    assert(cmpf);
+    av_assert2(cmpf);
     dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
     map[0]= map_generation;
     score_map[0]= dmin;
@@ -990,76 +989,16 @@ int ff_epzs_motion_search(MpegEncContext *s, int *mx_ptr, int *my_ptr,
     }
 }
 
-static int epzs_motion_search4(MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr, int P[10][2],
-                             int src_index, int ref_index, int16_t (*last_mv)[2],
-                             int ref_mv_scale)
-{
-    MotionEstContext * const c= &s->me;
-    int best[2]={0, 0};
-    int d, dmin;
-    unsigned map_generation;
-    const int penalty_factor= c->penalty_factor;
-    const int size=1;
-    const int h=8;
-    const int ref_mv_stride= s->mb_stride;
-    const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride;
-    me_cmp_func cmpf, chroma_cmpf;
-    LOAD_COMMON
-    int flags= c->flags;
-    LOAD_COMMON2
-
-    cmpf        = s->mecc.me_cmp[size];
-    chroma_cmpf = s->mecc.me_cmp[size + 1];
-
-    map_generation= update_map_generation(c);
-
-    dmin = 1000000;
-
-    /* first line */
-    if (s->first_slice_line) {
-        CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
-        CHECK_CLIPPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16,
-                        (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
-        CHECK_MV(P_MV1[0]>>shift, P_MV1[1]>>shift)
-    }else{
-        CHECK_MV(P_MV1[0]>>shift, P_MV1[1]>>shift)
-        //FIXME try some early stop
-        CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
-        CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
-        CHECK_MV(P_TOP[0]>>shift, P_TOP[1]>>shift)
-        CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
-        CHECK_CLIPPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16,
-                        (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
-    }
-    if(dmin>64*4){
-        CHECK_CLIPPED_MV((last_mv[ref_mv_xy+1][0]*ref_mv_scale + (1<<15))>>16,
-                        (last_mv[ref_mv_xy+1][1]*ref_mv_scale + (1<<15))>>16)
-        if(s->mb_y+1<s->end_mb_y)  //FIXME replace at least with last_slice_line
-            CHECK_CLIPPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16,
-                            (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
-    }
-
-    dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
-
-    *mx_ptr= best[0];
-    *my_ptr= best[1];
-
-    return dmin;
-}
-
-//try to merge with above FIXME (needs PSNR test)
 static int epzs_motion_search2(MpegEncContext * s,
                              int *mx_ptr, int *my_ptr, int P[10][2],
                              int src_index, int ref_index, int16_t (*last_mv)[2],
-                             int ref_mv_scale)
+                             int ref_mv_scale, const int size)
 {
     MotionEstContext * const c= &s->me;
     int best[2]={0, 0};
     int d, dmin;
     unsigned map_generation;
     const int penalty_factor= c->penalty_factor;
-    const int size=0; //FIXME pass as arg
     const int h=8;
     const int ref_mv_stride= s->mb_stride;
     const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride;
diff --git a/libavcodec/motionpixels.c b/libavcodec/motionpixels.c
index 66ec5f5..a88b837 100644
--- a/libavcodec/motionpixels.c
+++ b/libavcodec/motionpixels.c
@@ -2,28 +2,27 @@
  * Motion Pixels Video Decoder
  * Copyright (c) 2008 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "bswapdsp.h"
 #include "internal.h"
-#include "vlc.h"
 
 #define MAX_HUFF_CODES 16
 
@@ -70,13 +69,24 @@ static av_cold int mp_decode_init(AVCodecContext *avctx)
     int w4 = (avctx->width  + 3) & ~3;
     int h4 = (avctx->height + 3) & ~3;
 
+    if(avctx->extradata_size < 2){
+        av_log(avctx, AV_LOG_ERROR, "extradata too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     motionpixels_tableinit();
     mp->avctx = avctx;
     ff_bswapdsp_init(&mp->bdsp);
-    mp->changes_map = av_mallocz(avctx->width * h4);
+    mp->changes_map = av_mallocz_array(avctx->width, h4);
     mp->offset_bits_len = av_log2(avctx->width * avctx->height) + 1;
-    mp->vpt = av_mallocz(avctx->height * sizeof(YuvPixel));
-    mp->hpt = av_mallocz(h4 * w4 / 16 * sizeof(YuvPixel));
+    mp->vpt = av_mallocz_array(avctx->height, sizeof(YuvPixel));
+    mp->hpt = av_mallocz_array(h4 / 4, w4 / 4 * sizeof(YuvPixel));
+    if (!mp->changes_map || !mp->vpt || !mp->hpt) {
+        av_freep(&mp->changes_map);
+        av_freep(&mp->vpt);
+        av_freep(&mp->hpt);
+        return AVERROR(ENOMEM);
+    }
     avctx->pix_fmt = AV_PIX_FMT_RGB555;
 
     mp->frame = av_frame_alloc();
@@ -88,18 +98,17 @@ static av_cold int mp_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static void mp_read_changes_map(MotionPixelsContext *mp, BitstreamContext *bc,
-                                int count, int bits_len, int read_color)
+static void mp_read_changes_map(MotionPixelsContext *mp, GetBitContext *gb, int count, int bits_len, int read_color)
 {
     uint16_t *pixels;
     int offset, w, h, color = 0, x, y, i;
 
     while (count--) {
-        offset = bitstream_read(bc, mp->offset_bits_len);
-        w      = bitstream_read(bc, bits_len) + 1;
-        h      = bitstream_read(bc, bits_len) + 1;
+        offset = get_bits_long(gb, mp->offset_bits_len);
+        w      = get_bits(gb, bits_len) + 1;
+        h      = get_bits(gb, bits_len) + 1;
         if (read_color)
-            color = bitstream_read(bc, 15);
+            color = get_bits(gb, 15);
         x = offset % mp->avctx->width;
         y = offset / mp->avctx->width;
         if (y >= mp->avctx->height)
@@ -118,39 +127,48 @@ static void mp_read_changes_map(MotionPixelsContext *mp, BitstreamContext *bc,
     }
 }
 
-static void mp_get_code(MotionPixelsContext *mp, BitstreamContext *bc,
-                        int size, int code)
+static int mp_get_code(MotionPixelsContext *mp, GetBitContext *gb, int size, int code)
 {
-    while (bitstream_read_bit(bc)) {
+    while (get_bits1(gb)) {
         ++size;
         if (size > mp->max_codes_bits) {
             av_log(mp->avctx, AV_LOG_ERROR, "invalid code size %d/%d\n", size, mp->max_codes_bits);
-            return;
+            return AVERROR_INVALIDDATA;
         }
         code <<= 1;
-        mp_get_code(mp, bc, size, code + 1);
+        if (mp_get_code(mp, gb, size, code + 1) < 0)
+            return AVERROR_INVALIDDATA;
     }
     if (mp->current_codes_count >= MAX_HUFF_CODES) {
         av_log(mp->avctx, AV_LOG_ERROR, "too many codes\n");
-        return;
+        return AVERROR_INVALIDDATA;
     }
+
     mp->codes[mp->current_codes_count  ].code = code;
     mp->codes[mp->current_codes_count++].size = size;
+    return 0;
 }
 
-static void mp_read_codes_table(MotionPixelsContext *mp, BitstreamContext *bc)
+static int mp_read_codes_table(MotionPixelsContext *mp, GetBitContext *gb)
 {
     if (mp->codes_count == 1) {
-        mp->codes[0].delta = bitstream_read(bc, 4);
+        mp->codes[0].delta = get_bits(gb, 4);
     } else {
         int i;
+        int ret;
 
-        mp->max_codes_bits = bitstream_read(bc, 4);
+        mp->max_codes_bits = get_bits(gb, 4);
         for (i = 0; i < mp->codes_count; ++i)
-            mp->codes[i].delta = bitstream_read(bc, 4);
+            mp->codes[i].delta = get_bits(gb, 4);
         mp->current_codes_count = 0;
-        mp_get_code(mp, bc, 0, 0);
+        if ((ret = mp_get_code(mp, gb, 0, 0)) < 0)
+            return ret;
+        if (mp->current_codes_count < mp->codes_count) {
+            av_log(mp->avctx, AV_LOG_ERROR, "too few codes\n");
+            return AVERROR_INVALIDDATA;
+        }
    }
+   return 0;
 }
 
 static int mp_gradient(MotionPixelsContext *mp, int component, int v)
@@ -178,16 +196,15 @@ static void mp_set_rgb_from_yuv(MotionPixelsContext *mp, int x, int y, const Yuv
     *(uint16_t *)&mp->frame->data[0][y * mp->frame->linesize[0] + x * 2] = color;
 }
 
-static int mp_get_vlc(MotionPixelsContext *mp, BitstreamContext *bc)
+static int mp_get_vlc(MotionPixelsContext *mp, GetBitContext *gb)
 {
     int i;
 
-    i = (mp->codes_count == 1) ? 0 : bitstream_read_vlc(bc, mp->vlc.table, mp->max_codes_bits, 1);
-    i = FFMIN(i, FF_ARRAY_ELEMS(mp->codes) - 1);
+    i = (mp->codes_count == 1) ? 0 : get_vlc2(gb, mp->vlc.table, mp->max_codes_bits, 1);
     return mp->codes[i].delta;
 }
 
-static void mp_decode_line(MotionPixelsContext *mp, BitstreamContext *bc, int y)
+static void mp_decode_line(MotionPixelsContext *mp, GetBitContext *gb, int y)
 {
     YuvPixel p;
     const int y0 = y * mp->avctx->width;
@@ -214,13 +231,13 @@ static void mp_decode_line(MotionPixelsContext *mp, BitstreamContext *bc, int y)
             memset(mp->gradient_scale, 1, sizeof(mp->gradient_scale));
             p = mp_get_yuv_from_rgb(mp, x - 1, y);
         } else {
-            p.y += mp_gradient(mp, 0, mp_get_vlc(mp, bc));
+            p.y += mp_gradient(mp, 0, mp_get_vlc(mp, gb));
             p.y = av_clip_uintp2(p.y, 5);
             if ((x & 3) == 0) {
                 if ((y & 3) == 0) {
-                    p.v += mp_gradient(mp, 1, mp_get_vlc(mp, bc));
+                    p.v += mp_gradient(mp, 1, mp_get_vlc(mp, gb));
                     p.v = av_clip_intp2(p.v, 5);
-                    p.u += mp_gradient(mp, 2, mp_get_vlc(mp, bc));
+                    p.u += mp_gradient(mp, 2, mp_get_vlc(mp, gb));
                     p.u = av_clip_intp2(p.u, 5);
                     mp->hpt[((y / 4) * mp->avctx->width + x) / 4] = p;
                 } else {
@@ -234,23 +251,24 @@ static void mp_decode_line(MotionPixelsContext *mp, BitstreamContext *bc, int y)
     }
 }
 
-static void mp_decode_frame_helper(MotionPixelsContext *mp,
-                                   BitstreamContext *bc)
+static void mp_decode_frame_helper(MotionPixelsContext *mp, GetBitContext *gb)
 {
     YuvPixel p;
     int y, y0;
 
+    av_assert1(mp->changes_map[0]);
+
     for (y = 0; y < mp->avctx->height; ++y) {
         if (mp->changes_map[y * mp->avctx->width] != 0) {
             memset(mp->gradient_scale, 1, sizeof(mp->gradient_scale));
             p = mp_get_yuv_from_rgb(mp, 0, y);
         } else {
-            p.y += mp_gradient(mp, 0, mp_get_vlc(mp, bc));
+            p.y += mp_gradient(mp, 0, mp_get_vlc(mp, gb));
             p.y = av_clip_uintp2(p.y, 5);
             if ((y & 3) == 0) {
-                p.v += mp_gradient(mp, 1, mp_get_vlc(mp, bc));
+                p.v += mp_gradient(mp, 1, mp_get_vlc(mp, gb));
                 p.v = av_clip_intp2(p.v, 5);
-                p.u += mp_gradient(mp, 2, mp_get_vlc(mp, bc));
+                p.u += mp_gradient(mp, 2, mp_get_vlc(mp, gb));
                 p.u = av_clip_intp2(p.u, 5);
             }
             mp->vpt[y] = p;
@@ -259,7 +277,7 @@ static void mp_decode_frame_helper(MotionPixelsContext *mp,
     }
     for (y0 = 0; y0 < 2; ++y0)
         for (y = y0; y < mp->avctx->height; y += 2)
-            mp_decode_line(mp, bc, y);
+            mp_decode_line(mp, gb, y);
 }
 
 static int mp_decode_frame(AVCodecContext *avctx,
@@ -269,46 +287,44 @@ static int mp_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     MotionPixelsContext *mp = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
     int i, count1, count2, sz, ret;
 
-    if ((ret = ff_reget_buffer(avctx, mp->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, mp->frame)) < 0)
         return ret;
-    }
 
     /* le32 bitstream msb first */
-    av_fast_malloc(&mp->bswapbuf, &mp->bswapbuf_size, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&mp->bswapbuf, &mp->bswapbuf_size, buf_size);
     if (!mp->bswapbuf)
         return AVERROR(ENOMEM);
     mp->bdsp.bswap_buf((uint32_t *) mp->bswapbuf, (const uint32_t *) buf,
                        buf_size / 4);
     if (buf_size & 3)
         memcpy(mp->bswapbuf + (buf_size & ~3), buf + (buf_size & ~3), buf_size & 3);
-    memset(mp->bswapbuf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-    bitstream_init8(&bc, mp->bswapbuf, buf_size);
+    init_get_bits(&gb, mp->bswapbuf, buf_size * 8);
 
     memset(mp->changes_map, 0, avctx->width * avctx->height);
     for (i = !(avctx->extradata[1] & 2); i < 2; ++i) {
-        count1 = bitstream_read(&bc, 12);
-        count2 = bitstream_read(&bc, 12);
-        mp_read_changes_map(mp, &bc, count1, 8, i);
-        mp_read_changes_map(mp, &bc, count2, 4, i);
+        count1 = get_bits(&gb, 12);
+        count2 = get_bits(&gb, 12);
+        mp_read_changes_map(mp, &gb, count1, 8, i);
+        mp_read_changes_map(mp, &gb, count2, 4, i);
     }
 
-    mp->codes_count = bitstream_read(&bc, 4);
+    mp->codes_count = get_bits(&gb, 4);
     if (mp->codes_count == 0)
         goto end;
 
     if (mp->changes_map[0] == 0) {
-        *(uint16_t *)mp->frame->data[0] = bitstream_read(&bc, 15);
+        *(uint16_t *)mp->frame->data[0] = get_bits(&gb, 15);
         mp->changes_map[0] = 1;
     }
-    mp_read_codes_table(mp, &bc);
+    if (mp_read_codes_table(mp, &gb) < 0)
+        goto end;
 
-    sz = bitstream_read(&bc, 18);
+    sz = get_bits(&gb, 18);
     if (avctx->extradata[0] != 5)
-        sz += bitstream_read(&bc, 18);
+        sz += get_bits(&gb, 18);
     if (sz == 0)
         goto end;
 
@@ -316,7 +332,7 @@ static int mp_decode_frame(AVCodecContext *avctx,
         goto end;
     if (init_vlc(&mp->vlc, mp->max_codes_bits, mp->codes_count, &mp->codes[0].size, sizeof(HuffCode), 1, &mp->codes[0].code, sizeof(HuffCode), 4, 0))
         goto end;
-    mp_decode_frame_helper(mp, &bc);
+    mp_decode_frame_helper(mp, &gb);
     ff_free_vlc(&mp->vlc);
 
 end:
diff --git a/libavcodec/motionpixels_tablegen.c b/libavcodec/motionpixels_tablegen.c
index 2f0df3c..1bebaf1 100644
--- a/libavcodec/motionpixels_tablegen.c
+++ b/libavcodec/motionpixels_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/motionpixels_tablegen.h b/libavcodec/motionpixels_tablegen.h
index 2d0c0ff..9239b6a 100644
--- a/libavcodec/motionpixels_tablegen.h
+++ b/libavcodec/motionpixels_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,13 +24,14 @@
 #define AVCODEC_MOTIONPIXELS_TABLEGEN_H
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
 
 typedef struct YuvPixel {
     int8_t y, v, u;
 } YuvPixel;
 
 static int mp_yuv_to_rgb(int y, int v, int u, int clip_rgb) {
-    static const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
     int r, g, b;
 
     r = (1000 * y + 701 * v) / 1000;
@@ -49,7 +50,7 @@ static int mp_yuv_to_rgb(int y, int v, int u, int clip_rgb) {
 #else
 static YuvPixel mp_rgb_yuv_table[1 << 15];
 
-static void mp_set_zero_yuv(YuvPixel *p)
+static av_cold void mp_set_zero_yuv(YuvPixel *p)
 {
     int i, j;
 
@@ -63,7 +64,7 @@ static void mp_set_zero_yuv(YuvPixel *p)
     }
 }
 
-static void mp_build_rgb_yuv_table(YuvPixel *p)
+static av_cold void mp_build_rgb_yuv_table(YuvPixel *p)
 {
     int y, v, u, i;
 
@@ -81,7 +82,7 @@ static void mp_build_rgb_yuv_table(YuvPixel *p)
         mp_set_zero_yuv(p + i * 32);
 }
 
-static void motionpixels_tableinit(void)
+static av_cold void motionpixels_tableinit(void)
 {
     if (!mp_rgb_yuv_table[0].u)
         mp_build_rgb_yuv_table(mp_rgb_yuv_table);
diff --git a/libavcodec/movsub_bsf.c b/libavcodec/movsub_bsf.c
index fc6b236..5878607 100644
--- a/libavcodec/movsub_bsf.c
+++ b/libavcodec/movsub_bsf.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Reimar Döffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,35 +62,23 @@ const AVBitStreamFilter ff_text2movsub_bsf = {
     .filter = text2movsub,
 };
 
-static int mov2textsub(AVBSFContext *ctx, AVPacket *out)
+static int mov2textsub(AVBSFContext *ctx, AVPacket *pkt)
 {
-    AVPacket *in;
     int ret = 0;
 
-    ret = ff_bsf_get_packet(ctx, &in);
+    ret = ff_bsf_get_packet_ref(ctx, pkt);
     if (ret < 0)
         return ret;
 
-    if (in->size < 2) {
-       ret = AVERROR_INVALIDDATA;
-       goto fail;
+    if (pkt->size < 2) {
+       av_packet_unref(pkt);
+       return AVERROR_INVALIDDATA;
     }
 
-    ret = av_new_packet(out, FFMIN(in->size - 2, AV_RB16(in->data)));
-    if (ret < 0)
-        goto fail;
-
-    ret = av_packet_copy_props(out, in);
-    if (ret < 0)
-        goto fail;
-
-    memcpy(out->data, in->data + 2, out->size);
+    pkt->data += 2;
+    pkt->size  = FFMIN(pkt->size - 2, AV_RB16(pkt->data));
 
-fail:
-    if (ret < 0)
-        av_packet_unref(out);
-    av_packet_free(&in);
-    return ret;
+    return 0;
 }
 
 const AVBitStreamFilter ff_mov2textsub_bsf = {
diff --git a/libavcodec/movtextdec.c b/libavcodec/movtextdec.c
new file mode 100644
index 0000000..c38c5ed
--- /dev/null
+++ b/libavcodec/movtextdec.c
@@ -0,0 +1,580 @@
+/*
+ * 3GPP TS 26.245 Timed Text decoder
+ * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/bprint.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+
+#define STYLE_FLAG_BOLD         (1<<0)
+#define STYLE_FLAG_ITALIC       (1<<1)
+#define STYLE_FLAG_UNDERLINE    (1<<2)
+
+#define BOX_SIZE_INITIAL    40
+
+#define STYL_BOX   (1<<0)
+#define HLIT_BOX   (1<<1)
+#define HCLR_BOX   (1<<2)
+#define TWRP_BOX   (1<<3)
+
+#define BOTTOM_LEFT     1
+#define BOTTOM_CENTER   2
+#define BOTTOM_RIGHT    3
+#define MIDDLE_LEFT     4
+#define MIDDLE_CENTER   5
+#define MIDDLE_RIGHT    6
+#define TOP_LEFT        7
+#define TOP_CENTER      8
+#define TOP_RIGHT       9
+
+typedef struct {
+    char *font;
+    int fontsize;
+    int color;
+    int back_color;
+    int bold;
+    int italic;
+    int underline;
+    int alignment;
+} MovTextDefault;
+
+typedef struct {
+    uint16_t fontID;
+    char *font;
+} FontRecord;
+
+typedef struct {
+    uint16_t style_start;
+    uint16_t style_end;
+    uint8_t style_flag;
+    uint8_t fontsize;
+    uint16_t style_fontID;
+} StyleBox;
+
+typedef struct {
+    uint16_t hlit_start;
+    uint16_t hlit_end;
+} HighlightBox;
+
+typedef struct {
+   uint8_t hlit_color[4];
+} HilightcolorBox;
+
+typedef struct {
+    uint8_t wrap_flag;
+} TextWrapBox;
+
+typedef struct {
+    StyleBox **s;
+    StyleBox *s_temp;
+    HighlightBox h;
+    HilightcolorBox c;
+    FontRecord **ftab;
+    FontRecord *ftab_temp;
+    TextWrapBox w;
+    MovTextDefault d;
+    uint8_t box_flags;
+    uint16_t style_entries, ftab_entries;
+    uint64_t tracksize;
+    int size_var;
+    int count_s, count_f;
+    int readorder;
+} MovTextContext;
+
+typedef struct {
+    uint32_t type;
+    size_t base_size;
+    int (*decode)(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt);
+} Box;
+
+static void mov_text_cleanup(MovTextContext *m)
+{
+    int i;
+    if (m->box_flags & STYL_BOX) {
+        for(i = 0; i < m->count_s; i++) {
+            av_freep(&m->s[i]);
+        }
+        av_freep(&m->s);
+        m->count_s = 0;
+        m->style_entries = 0;
+    }
+}
+
+static void mov_text_cleanup_ftab(MovTextContext *m)
+{
+    int i;
+    if (m->ftab_temp)
+        av_freep(&m->ftab_temp->font);
+    av_freep(&m->ftab_temp);
+    if (m->ftab) {
+        for(i = 0; i < m->count_f; i++) {
+            av_freep(&m->ftab[i]->font);
+            av_freep(&m->ftab[i]);
+        }
+    }
+    av_freep(&m->ftab);
+}
+
+static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
+{
+    uint8_t *tx3g_ptr = avctx->extradata;
+    int i, box_size, font_length;
+    int8_t v_align, h_align;
+    int style_fontID;
+    StyleBox s_default;
+
+    m->count_f = 0;
+    m->ftab_entries = 0;
+    box_size = BOX_SIZE_INITIAL; /* Size till ftab_entries */
+    if (avctx->extradata_size < box_size)
+        return -1;
+
+    // Display Flags
+    tx3g_ptr += 4;
+    // Alignment
+    h_align = *tx3g_ptr++;
+    v_align = *tx3g_ptr++;
+    if (h_align == 0) {
+        if (v_align == 0)
+            m->d.alignment = TOP_LEFT;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_LEFT;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_LEFT;
+    }
+    if (h_align == 1) {
+        if (v_align == 0)
+            m->d.alignment = TOP_CENTER;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_CENTER;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_CENTER;
+    }
+    if (h_align == -1) {
+        if (v_align == 0)
+            m->d.alignment = TOP_RIGHT;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_RIGHT;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_RIGHT;
+    }
+    // Background Color
+    m->d.back_color = AV_RB24(tx3g_ptr);
+    tx3g_ptr += 4;
+    // BoxRecord
+    tx3g_ptr += 8;
+    // StyleRecord
+    tx3g_ptr += 4;
+    // fontID
+    style_fontID = AV_RB16(tx3g_ptr);
+    tx3g_ptr += 2;
+    // face-style-flags
+    s_default.style_flag = *tx3g_ptr++;
+    m->d.bold = s_default.style_flag & STYLE_FLAG_BOLD;
+    m->d.italic = s_default.style_flag & STYLE_FLAG_ITALIC;
+    m->d.underline = s_default.style_flag & STYLE_FLAG_UNDERLINE;
+    // fontsize
+    m->d.fontsize = *tx3g_ptr++;
+    // Primary color
+    m->d.color = AV_RB24(tx3g_ptr);
+    tx3g_ptr += 4;
+    // FontRecord
+    // FontRecord Size
+    tx3g_ptr += 4;
+    // ftab
+    tx3g_ptr += 4;
+
+    m->ftab_entries = AV_RB16(tx3g_ptr);
+    tx3g_ptr += 2;
+
+    for (i = 0; i < m->ftab_entries; i++) {
+
+        box_size += 3;
+        if (avctx->extradata_size < box_size) {
+            mov_text_cleanup_ftab(m);
+            m->ftab_entries = 0;
+            return -1;
+        }
+        m->ftab_temp = av_mallocz(sizeof(*m->ftab_temp));
+        if (!m->ftab_temp) {
+            mov_text_cleanup_ftab(m);
+            return AVERROR(ENOMEM);
+        }
+        m->ftab_temp->fontID = AV_RB16(tx3g_ptr);
+        tx3g_ptr += 2;
+        font_length = *tx3g_ptr++;
+
+        box_size = box_size + font_length;
+        if (avctx->extradata_size < box_size) {
+            mov_text_cleanup_ftab(m);
+            m->ftab_entries = 0;
+            return -1;
+        }
+        m->ftab_temp->font = av_malloc(font_length + 1);
+        if (!m->ftab_temp->font) {
+            mov_text_cleanup_ftab(m);
+            return AVERROR(ENOMEM);
+        }
+        memcpy(m->ftab_temp->font, tx3g_ptr, font_length);
+        m->ftab_temp->font[font_length] = '\0';
+        av_dynarray_add(&m->ftab, &m->count_f, m->ftab_temp);
+        if (!m->ftab) {
+            mov_text_cleanup_ftab(m);
+            return AVERROR(ENOMEM);
+        }
+        m->ftab_temp = NULL;
+        tx3g_ptr = tx3g_ptr + font_length;
+    }
+    for (i = 0; i < m->ftab_entries; i++) {
+        if (style_fontID == m->ftab[i]->fontID)
+            m->d.font = m->ftab[i]->font;
+    }
+    return 0;
+}
+
+static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->box_flags |= TWRP_BOX;
+    m->w.wrap_flag = *tsmb++;
+    return 0;
+}
+
+static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->box_flags |= HLIT_BOX;
+    m->h.hlit_start = AV_RB16(tsmb);
+    tsmb += 2;
+    m->h.hlit_end = AV_RB16(tsmb);
+    tsmb += 2;
+    return 0;
+}
+
+static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->box_flags |= HCLR_BOX;
+    memcpy(m->c.hlit_color, tsmb, 4);
+    tsmb += 4;
+    return 0;
+}
+
+static int decode_styl(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    int i;
+    int style_entries = AV_RB16(tsmb);
+    tsmb += 2;
+    // A single style record is of length 12 bytes.
+    if (m->tracksize + m->size_var + 2 + style_entries * 12 > avpkt->size)
+        return -1;
+
+    m->style_entries = style_entries;
+
+    m->box_flags |= STYL_BOX;
+    for(i = 0; i < m->style_entries; i++) {
+        m->s_temp = av_malloc(sizeof(*m->s_temp));
+        if (!m->s_temp) {
+            mov_text_cleanup(m);
+            return AVERROR(ENOMEM);
+        }
+        m->s_temp->style_start = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_end = AV_RB16(tsmb);
+
+        if (   m->s_temp->style_end < m->s_temp->style_start
+            || (m->count_s && m->s_temp->style_start < m->s[m->count_s - 1]->style_end)) {
+            av_freep(&m->s_temp);
+            mov_text_cleanup(m);
+            return AVERROR(ENOMEM);
+        }
+
+        tsmb += 2;
+        m->s_temp->style_fontID = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_flag = AV_RB8(tsmb);
+        tsmb++;
+        m->s_temp->fontsize = AV_RB8(tsmb);
+        av_dynarray_add(&m->s, &m->count_s, m->s_temp);
+        if(!m->s) {
+            mov_text_cleanup(m);
+            return AVERROR(ENOMEM);
+        }
+        tsmb++;
+        // text-color-rgba
+        tsmb += 4;
+    }
+    return 0;
+}
+
+static const Box box_types[] = {
+    { MKBETAG('s','t','y','l'), 2, decode_styl },
+    { MKBETAG('h','l','i','t'), 4, decode_hlit },
+    { MKBETAG('h','c','l','r'), 4, decode_hclr },
+    { MKBETAG('t','w','r','p'), 1, decode_twrp }
+};
+
+const static size_t box_count = FF_ARRAY_ELEMS(box_types);
+
+// Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
+static int get_utf8_length_at(const char *text, const char *text_end)
+{
+    const char *start = text;
+    int err = 0;
+    uint32_t c;
+    GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
+    if (err)
+        goto error;
+    return text - start;
+error:
+    return 0;
+}
+
+static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
+                       AVCodecContext *avctx)
+{
+    MovTextContext *m = avctx->priv_data;
+    int i = 0;
+    int j = 0;
+    int text_pos = 0;
+
+    if (text < text_end && m->box_flags & TWRP_BOX) {
+        if (m->w.wrap_flag == 1) {
+            av_bprintf(buf, "{\\q1}"); /* End of line wrap */
+        } else {
+            av_bprintf(buf, "{\\q2}"); /* No wrap */
+        }
+    }
+
+    while (text < text_end) {
+        int len;
+
+        if (m->box_flags & STYL_BOX) {
+            for (i = 0; i < m->style_entries; i++) {
+                if (m->s[i]->style_flag && text_pos == m->s[i]->style_end) {
+                    av_bprintf(buf, "{\\r}");
+                }
+            }
+            for (i = 0; i < m->style_entries; i++) {
+                if (m->s[i]->style_flag && text_pos == m->s[i]->style_start) {
+                    if (m->s[i]->style_flag & STYLE_FLAG_BOLD)
+                        av_bprintf(buf, "{\\b1}");
+                    if (m->s[i]->style_flag & STYLE_FLAG_ITALIC)
+                        av_bprintf(buf, "{\\i1}");
+                    if (m->s[i]->style_flag & STYLE_FLAG_UNDERLINE)
+                        av_bprintf(buf, "{\\u1}");
+                    av_bprintf(buf, "{\\fs%d}", m->s[i]->fontsize);
+                    for (j = 0; j < m->ftab_entries; j++) {
+                        if (m->s[i]->style_fontID == m->ftab[j]->fontID)
+                            av_bprintf(buf, "{\\fn%s}", m->ftab[j]->font);
+                    }
+                }
+            }
+        }
+        if (m->box_flags & HLIT_BOX) {
+            if (text_pos == m->h.hlit_start) {
+                /* If hclr box is present, set the secondary color to the color
+                 * specified. Otherwise, set primary color to white and secondary
+                 * color to black. These colors will come from TextSampleModifier
+                 * boxes in future and inverse video technique for highlight will
+                 * be implemented.
+                 */
+                if (m->box_flags & HCLR_BOX) {
+                    av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
+                                m->c.hlit_color[1], m->c.hlit_color[0]);
+                } else {
+                    av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
+                }
+            }
+            if (text_pos == m->h.hlit_end) {
+                if (m->box_flags & HCLR_BOX) {
+                    av_bprintf(buf, "{\\2c&H000000&}");
+                } else {
+                    av_bprintf(buf, "{\\1c&HFFFFFF&}{\\2c&H000000&}");
+                }
+            }
+        }
+
+        len = get_utf8_length_at(text, text_end);
+        if (len < 1) {
+            av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
+            len = 1;
+        }
+        for (i = 0; i < len; i++) {
+            switch (*text) {
+            case '\r':
+                break;
+            case '\n':
+                av_bprintf(buf, "\\N");
+                break;
+            default:
+                av_bprint_chars(buf, *text, 1);
+                break;
+            }
+            text++;
+        }
+        text_pos++;
+    }
+
+    return 0;
+}
+
+static int mov_text_init(AVCodecContext *avctx) {
+    /*
+     * TODO: Handle the default text style.
+     * NB: Most players ignore styles completely, with the result that
+     * it's very common to find files where the default style is broken
+     * and respecting it results in a worse experience than ignoring it.
+     */
+    int ret;
+    MovTextContext *m = avctx->priv_data;
+    ret = mov_text_tx3g(avctx, m);
+    if (ret == 0) {
+        return ff_ass_subtitle_header(avctx, m->d.font, m->d.fontsize, m->d.color,
+                                m->d.back_color, m->d.bold, m->d.italic,
+                                m->d.underline, ASS_DEFAULT_BORDERSTYLE,
+                                m->d.alignment);
+    } else
+        return ff_ass_subtitle_header_default(avctx);
+}
+
+static int mov_text_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    MovTextContext *m = avctx->priv_data;
+    int ret;
+    AVBPrint buf;
+    char *ptr = avpkt->data;
+    char *end;
+    int text_length, tsmb_type, ret_tsmb;
+    uint64_t tsmb_size;
+    const uint8_t *tsmb;
+    size_t i;
+
+    if (!ptr || avpkt->size < 2)
+        return AVERROR_INVALIDDATA;
+
+    /*
+     * A packet of size two with value zero is an empty subtitle
+     * used to mark the end of the previous non-empty subtitle.
+     * We can just drop them here as we have duration information
+     * already. If the value is non-zero, then it's technically a
+     * bad packet.
+     */
+    if (avpkt->size == 2)
+        return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
+
+    /*
+     * The first two bytes of the packet are the length of the text string
+     * In complex cases, there are style descriptors appended to the string
+     * so we can't just assume the packet size is the string size.
+     */
+    text_length = AV_RB16(ptr);
+    end = ptr + FFMIN(2 + text_length, avpkt->size);
+    ptr += 2;
+
+    mov_text_cleanup(m);
+
+    tsmb_size = 0;
+    m->tracksize = 2 + text_length;
+    m->style_entries = 0;
+    m->box_flags = 0;
+    m->count_s = 0;
+    // Note that the spec recommends lines be no longer than 2048 characters.
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (text_length + 2 != avpkt->size) {
+        while (m->tracksize + 8 <= avpkt->size) {
+            // A box is a minimum of 8 bytes.
+            tsmb = ptr + m->tracksize - 2;
+            tsmb_size = AV_RB32(tsmb);
+            tsmb += 4;
+            tsmb_type = AV_RB32(tsmb);
+            tsmb += 4;
+
+            if (tsmb_size == 1) {
+                if (m->tracksize + 16 > avpkt->size)
+                    break;
+                tsmb_size = AV_RB64(tsmb);
+                tsmb += 8;
+                m->size_var = 16;
+            } else
+                m->size_var = 8;
+            //size_var is equal to 8 or 16 depending on the size of box
+
+            if (tsmb_size == 0) {
+                av_log(avctx, AV_LOG_ERROR, "tsmb_size is 0\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (tsmb_size > avpkt->size - m->tracksize)
+                break;
+
+            for (i = 0; i < box_count; i++) {
+                if (tsmb_type == box_types[i].type) {
+                    if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
+                        break;
+                    ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
+                    if (ret_tsmb == -1)
+                        break;
+                }
+            }
+            m->tracksize = m->tracksize + tsmb_size;
+        }
+        text_to_ass(&buf, ptr, end, avctx);
+        mov_text_cleanup(m);
+    } else
+        text_to_ass(&buf, ptr, end, avctx);
+
+    ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static int mov_text_decode_close(AVCodecContext *avctx)
+{
+    MovTextContext *m = avctx->priv_data;
+    mov_text_cleanup_ftab(m);
+    mov_text_cleanup(m);
+    return 0;
+}
+
+static void mov_text_flush(AVCodecContext *avctx)
+{
+    MovTextContext *m = avctx->priv_data;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        m->readorder = 0;
+}
+
+AVCodec ff_movtext_decoder = {
+    .name         = "mov_text",
+    .long_name    = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_MOV_TEXT,
+    .priv_data_size = sizeof(MovTextContext),
+    .init         = mov_text_init,
+    .decode       = mov_text_decode_frame,
+    .close        = mov_text_decode_close,
+    .flush        = mov_text_flush,
+};
diff --git a/libavcodec/movtextenc.c b/libavcodec/movtextenc.c
new file mode 100644
index 0000000..c19ef38
--- /dev/null
+++ b/libavcodec/movtextenc.c
@@ -0,0 +1,441 @@
+/*
+ * 3GPP TS 26.245 Timed Text encoder
+ * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavutil/common.h"
+#include "ass_split.h"
+#include "ass.h"
+
+#define STYLE_FLAG_BOLD         (1<<0)
+#define STYLE_FLAG_ITALIC       (1<<1)
+#define STYLE_FLAG_UNDERLINE    (1<<2)
+#define STYLE_RECORD_SIZE       12
+#define SIZE_ADD                10
+
+#define STYL_BOX   (1<<0)
+#define HLIT_BOX   (1<<1)
+#define HCLR_BOX   (1<<2)
+
+#define av_bprint_append_any(buf, data, size)   av_bprint_append_data(buf, ((const char*)data), size)
+
+typedef struct {
+    uint16_t style_start;
+    uint16_t style_end;
+    uint8_t style_flag;
+} StyleBox;
+
+typedef struct {
+    uint16_t start;
+    uint16_t end;
+} HighlightBox;
+
+typedef struct {
+   uint32_t color;
+} HilightcolorBox;
+
+typedef struct {
+    AVCodecContext *avctx;
+
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;
+    StyleBox **style_attributes;
+    StyleBox *style_attributes_temp;
+    HighlightBox hlit;
+    HilightcolorBox hclr;
+    int count;
+    uint8_t box_flags;
+    uint16_t style_entries;
+    uint16_t style_fontID;
+    uint8_t style_fontsize;
+    uint32_t style_color;
+    uint16_t text_pos;
+    uint16_t byte_count;
+} MovTextContext;
+
+typedef struct {
+    uint32_t type;
+    void (*encode)(MovTextContext *s, uint32_t tsmb_type);
+} Box;
+
+static void mov_text_cleanup(MovTextContext *s)
+{
+    int j;
+    if (s->box_flags & STYL_BOX) {
+        for (j = 0; j < s->count; j++) {
+            av_freep(&s->style_attributes[j]);
+        }
+        av_freep(&s->style_attributes);
+    }
+}
+
+static void encode_styl(MovTextContext *s, uint32_t tsmb_type)
+{
+    int j;
+    uint32_t tsmb_size;
+    if (s->box_flags & STYL_BOX) {
+        tsmb_size = s->count * STYLE_RECORD_SIZE + SIZE_ADD;
+        tsmb_size = AV_RB32(&tsmb_size);
+        s->style_entries = AV_RB16(&s->count);
+        s->style_fontID = 0x00 | 0x01<<8;
+        s->style_fontsize = 0x12;
+        s->style_color = MKTAG(0xFF, 0xFF, 0xFF, 0xFF);
+        /*The above three attributes are hard coded for now
+        but will come from ASS style in the future*/
+        av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+        av_bprint_append_any(&s->buffer, &tsmb_type, 4);
+        av_bprint_append_any(&s->buffer, &s->style_entries, 2);
+        for (j = 0; j < s->count; j++) {
+            av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_start, 2);
+            av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_end, 2);
+            av_bprint_append_any(&s->buffer, &s->style_fontID, 2);
+            av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_flag, 1);
+            av_bprint_append_any(&s->buffer, &s->style_fontsize, 1);
+            av_bprint_append_any(&s->buffer, &s->style_color, 4);
+        }
+        mov_text_cleanup(s);
+    }
+}
+
+static void encode_hlit(MovTextContext *s, uint32_t tsmb_type)
+{
+    uint32_t tsmb_size;
+    if (s->box_flags & HLIT_BOX) {
+        tsmb_size = 12;
+        tsmb_size = AV_RB32(&tsmb_size);
+        av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+        av_bprint_append_any(&s->buffer, &tsmb_type, 4);
+        av_bprint_append_any(&s->buffer, &s->hlit.start, 2);
+        av_bprint_append_any(&s->buffer, &s->hlit.end, 2);
+    }
+}
+
+static void encode_hclr(MovTextContext *s, uint32_t tsmb_type)
+{
+    uint32_t tsmb_size;
+    if (s->box_flags & HCLR_BOX) {
+        tsmb_size = 12;
+        tsmb_size = AV_RB32(&tsmb_size);
+        av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+        av_bprint_append_any(&s->buffer, &tsmb_type, 4);
+        av_bprint_append_any(&s->buffer, &s->hclr.color, 4);
+    }
+}
+
+static const Box box_types[] = {
+    { MKTAG('s','t','y','l'), encode_styl },
+    { MKTAG('h','l','i','t'), encode_hlit },
+    { MKTAG('h','c','l','r'), encode_hclr },
+};
+
+const static size_t box_count = FF_ARRAY_ELEMS(box_types);
+
+static av_cold int mov_text_encode_init(AVCodecContext *avctx)
+{
+    /*
+     * For now, we'll use a fixed default style. When we add styling
+     * support, this will be generated from the ASS style.
+     */
+    static const uint8_t text_sample_entry[] = {
+        0x00, 0x00, 0x00, 0x00, // uint32_t displayFlags
+        0x01,                   // int8_t horizontal-justification
+        0xFF,                   // int8_t vertical-justification
+        0x00, 0x00, 0x00, 0x00, // uint8_t background-color-rgba[4]
+        // BoxRecord {
+        0x00, 0x00,             // int16_t top
+        0x00, 0x00,             // int16_t left
+        0x00, 0x00,             // int16_t bottom
+        0x00, 0x00,             // int16_t right
+        // };
+        // StyleRecord {
+        0x00, 0x00,             // uint16_t startChar
+        0x00, 0x00,             // uint16_t endChar
+        0x00, 0x01,             // uint16_t font-ID
+        0x00,                   // uint8_t face-style-flags
+        0x12,                   // uint8_t font-size
+        0xFF, 0xFF, 0xFF, 0xFF, // uint8_t text-color-rgba[4]
+        // };
+        // FontTableBox {
+        0x00, 0x00, 0x00, 0x12, // uint32_t size
+        'f', 't', 'a', 'b',     // uint8_t name[4]
+        0x00, 0x01,             // uint16_t entry-count
+        // FontRecord {
+        0x00, 0x01,             // uint16_t font-ID
+        0x05,                   // uint8_t font-name-length
+        'S', 'e', 'r', 'i', 'f',// uint8_t font[font-name-length]
+        // };
+        // };
+    };
+
+    MovTextContext *s = avctx->priv_data;
+    s->avctx = avctx;
+
+    avctx->extradata_size = sizeof text_sample_entry;
+    avctx->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    memcpy(avctx->extradata, text_sample_entry, avctx->extradata_size);
+
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+static void mov_text_style_cb(void *priv, const char style, int close)
+{
+    MovTextContext *s = priv;
+    if (!close) {
+        if (!(s->box_flags & STYL_BOX)) {   //first style entry
+
+            s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+
+            if (!s->style_attributes_temp) {
+                av_bprint_clear(&s->buffer);
+                s->box_flags &= ~STYL_BOX;
+                return;
+            }
+
+            s->style_attributes_temp->style_flag = 0;
+            s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+        } else {
+            if (s->style_attributes_temp->style_flag) { //break the style record here and start a new one
+                s->style_attributes_temp->style_end = AV_RB16(&s->text_pos);
+                av_dynarray_add(&s->style_attributes, &s->count, s->style_attributes_temp);
+                s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+                if (!s->style_attributes_temp) {
+                    mov_text_cleanup(s);
+                    av_bprint_clear(&s->buffer);
+                    s->box_flags &= ~STYL_BOX;
+                    return;
+                }
+
+                s->style_attributes_temp->style_flag = s->style_attributes[s->count - 1]->style_flag;
+                s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+            } else {
+                s->style_attributes_temp->style_flag = 0;
+                s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+            }
+        }
+        switch (style){
+        case 'b':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_BOLD;
+            break;
+        case 'i':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_ITALIC;
+            break;
+        case 'u':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_UNDERLINE;
+            break;
+        }
+    } else if (!s->style_attributes_temp) {
+        av_log(s->avctx, AV_LOG_WARNING, "Ignoring unmatched close tag\n");
+        return;
+    } else {
+        s->style_attributes_temp->style_end = AV_RB16(&s->text_pos);
+        av_dynarray_add(&s->style_attributes, &s->count, s->style_attributes_temp);
+
+        s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+
+        if (!s->style_attributes_temp) {
+            mov_text_cleanup(s);
+            av_bprint_clear(&s->buffer);
+            s->box_flags &= ~STYL_BOX;
+            return;
+        }
+
+        s->style_attributes_temp->style_flag = s->style_attributes[s->count - 1]->style_flag;
+        switch (style){
+        case 'b':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_BOLD;
+            break;
+        case 'i':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_ITALIC;
+            break;
+        case 'u':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_UNDERLINE;
+            break;
+        }
+        if (s->style_attributes_temp->style_flag) { //start of new style record
+            s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+        }
+    }
+    s->box_flags |= STYL_BOX;
+}
+
+static void mov_text_color_cb(void *priv, unsigned int color, unsigned int color_id)
+{
+    MovTextContext *s = priv;
+    if (color_id == 2) {    //secondary color changes
+        if (s->box_flags & HLIT_BOX) {  //close tag
+            s->hlit.end = AV_RB16(&s->text_pos);
+        } else {
+            s->box_flags |= HCLR_BOX;
+            s->box_flags |= HLIT_BOX;
+            s->hlit.start = AV_RB16(&s->text_pos);
+            s->hclr.color = color | (0xFF << 24);  //set alpha value to FF
+        }
+    }
+    /* If there are more than one secondary color changes in ASS, take start of
+       first section and end of last section. Movtext allows only one
+       highlight box per sample.
+     */
+}
+
+static uint16_t utf8_strlen(const char *text, int len)
+{
+    uint16_t i = 0, ret = 0;
+    while (i < len) {
+        char c = text[i];
+        if ((c & 0x80) == 0)
+            i += 1;
+        else if ((c & 0xE0) == 0xC0)
+            i += 2;
+        else if ((c & 0xF0) == 0xE0)
+            i += 3;
+        else if ((c & 0xF8) == 0xF0)
+            i += 4;
+        else
+            return 0;
+        ret++;
+    }
+    return ret;
+}
+
+static void mov_text_text_cb(void *priv, const char *text, int len)
+{
+    uint16_t utf8_len = utf8_strlen(text, len);
+    MovTextContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+    // If it's not utf-8, just use the byte length
+    s->text_pos += utf8_len ? utf8_len : len;
+    s->byte_count += len;
+}
+
+static void mov_text_new_line_cb(void *priv, int forced)
+{
+    MovTextContext *s = priv;
+    av_bprint_append_data(&s->buffer, "\n", 1);
+    s->text_pos += 1;
+    s->byte_count += 1;
+}
+
+static const ASSCodesCallbacks mov_text_callbacks = {
+    .text     = mov_text_text_cb,
+    .new_line = mov_text_new_line_cb,
+    .style    = mov_text_style_cb,
+    .color    = mov_text_color_cb,
+};
+
+static int mov_text_encode_frame(AVCodecContext *avctx, unsigned char *buf,
+                                 int bufsize, const AVSubtitle *sub)
+{
+    MovTextContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i, length;
+    size_t j;
+
+    s->byte_count = 0;
+    s->text_pos = 0;
+    s->count = 0;
+    s->box_flags = 0;
+    s->style_entries = 0;
+    for (i = 0; i < sub->num_rects; i++) {
+        const char *ass = sub->rects[i]->ass;
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            int num;
+            dialog = ff_ass_split_dialog(s->ass_ctx, ass, 0, &num);
+            for (; dialog && num--; dialog++) {
+                ff_ass_split_override_codes(&mov_text_callbacks, s, dialog->text);
+            }
+        } else {
+#endif
+            dialog = ff_ass_split_dialog2(s->ass_ctx, ass);
+            if (!dialog)
+                return AVERROR(ENOMEM);
+            ff_ass_split_override_codes(&mov_text_callbacks, s, dialog->text);
+            ff_ass_free_dialog(&dialog);
+#if FF_API_ASS_TIMING
+        }
+#endif
+
+        for (j = 0; j < box_count; j++) {
+            box_types[j].encode(s, box_types[j].type);
+        }
+    }
+
+    AV_WB16(buf, s->byte_count);
+    buf += 2;
+
+    if (!av_bprint_is_complete(&s->buffer)) {
+        length = AVERROR(ENOMEM);
+        goto exit;
+    }
+
+    if (!s->buffer.len) {
+        length = 0;
+        goto exit;
+    }
+
+    if (s->buffer.len > bufsize - 3) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        length = AVERROR(EINVAL);
+        goto exit;
+    }
+
+    memcpy(buf, s->buffer.str, s->buffer.len);
+    length = s->buffer.len + 2;
+
+exit:
+    av_bprint_clear(&s->buffer);
+    return length;
+}
+
+static int mov_text_encode_close(AVCodecContext *avctx)
+{
+    MovTextContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+AVCodec ff_movtext_encoder = {
+    .name           = "mov_text",
+    .long_name      = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_MOV_TEXT,
+    .priv_data_size = sizeof(MovTextContext),
+    .init           = mov_text_encode_init,
+    .encode_sub     = mov_text_encode_frame,
+    .close          = mov_text_encode_close,
+};
diff --git a/libavcodec/mp3_header_decompress_bsf.c b/libavcodec/mp3_header_decompress_bsf.c
new file mode 100644
index 0000000..2948589
--- /dev/null
+++ b/libavcodec/mp3_header_decompress_bsf.c
@@ -0,0 +1,124 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "bsf.h"
+#include "mpegaudiodecheader.h"
+#include "mpegaudiodata.h"
+
+
+static int mp3_header_decompress(AVBSFContext *ctx, AVPacket *out)
+{
+    AVPacket *in;
+    uint32_t header;
+    int sample_rate= ctx->par_in->sample_rate;
+    int sample_rate_index=0;
+    int lsf, mpeg25, bitrate_index, frame_size, ret;
+    uint8_t *buf;
+    int buf_size;
+
+    ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
+
+    buf      = in->data;
+    buf_size = in->size;
+
+    header = AV_RB32(buf);
+    if(ff_mpa_check_header(header) >= 0){
+        av_packet_move_ref(out, in);
+        av_packet_free(&in);
+
+        return 0;
+    }
+
+    if(ctx->par_in->extradata_size != 15 || strcmp(ctx->par_in->extradata, "FFCMP3 0.0")){
+        av_log(ctx, AV_LOG_ERROR, "Extradata invalid %d\n", ctx->par_in->extradata_size);
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    header= AV_RB32(ctx->par_in->extradata+11) & MP3_MASK;
+
+    lsf     = sample_rate < (24000+32000)/2;
+    mpeg25  = sample_rate < (12000+16000)/2;
+    sample_rate_index= (header>>10)&3;
+    sample_rate= avpriv_mpa_freq_tab[sample_rate_index] >> (lsf + mpeg25); //in case sample rate is a little off
+
+    for(bitrate_index=2; bitrate_index<30; bitrate_index++){
+        frame_size = avpriv_mpa_bitrate_tab[lsf][2][bitrate_index>>1];
+        frame_size = (frame_size * 144000) / (sample_rate << lsf) + (bitrate_index&1);
+        if(frame_size == buf_size + 4)
+            break;
+        if(frame_size == buf_size + 6)
+            break;
+    }
+    if(bitrate_index == 30){
+        av_log(ctx, AV_LOG_ERROR, "Could not find bitrate_index.\n");
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    header |= (bitrate_index&1)<<9;
+    header |= (bitrate_index>>1)<<12;
+    header |= (frame_size == buf_size + 4)<<16; //FIXME actually set a correct crc instead of 0
+
+    ret = av_new_packet(out, frame_size);
+    if (ret < 0)
+        goto fail;
+    ret = av_packet_copy_props(out, in);
+    if (ret < 0) {
+        av_packet_unref(out);
+        goto fail;
+    }
+    memcpy(out->data + frame_size - buf_size, buf, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if(ctx->par_in->channels==2){
+        uint8_t *p= out->data + frame_size - buf_size;
+        if(lsf){
+            FFSWAP(int, p[1], p[2]);
+            header |= (p[1] & 0xC0)>>2;
+            p[1] &= 0x3F;
+        }else{
+            header |= p[1] & 0x30;
+            p[1] &= 0xCF;
+        }
+    }
+
+    AV_WB32(out->data, header);
+
+    ret = 0;
+
+fail:
+    av_packet_free(&in);
+    return ret;
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_MP3, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_mp3_header_decompress_bsf = {
+    .name      = "mp3decomp",
+    .filter    = mp3_header_decompress,
+    .codec_ids = codec_ids,
+};
diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
index 88f7226..6cf9b9d 100644
--- a/libavcodec/mpc.c
+++ b/libavcodec/mpc.c
@@ -2,20 +2,20 @@
  * Musepack decoder core
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,13 +73,13 @@ void ff_mpc_dequantize_and_synth(MPCContext * c, int maxband, int16_t **out,
         for(ch = 0; ch < 2; ch++){
             if(bands[i].res[ch]){
                 j = 0;
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][0]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][0] & 0xFF];
                 for(; j < 12; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][1]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][1] & 0xFF];
                 for(; j < 24; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][2]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][2] & 0xFF];
                 for(; j < 36; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
             }
diff --git a/libavcodec/mpc.h b/libavcodec/mpc.h
index 39b8d63..df462af 100644
--- a/libavcodec/mpc.h
+++ b/libavcodec/mpc.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc7.c b/libavcodec/mpc7.c
index ebc3f52..e09f1b6 100644
--- a/libavcodec/mpc7.c
+++ b/libavcodec/mpc7.c
@@ -2,20 +2,20 @@
  * Musepack SV7 decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,20 +28,14 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/internal.h"
 #include "libavutil/lfg.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mpegaudiodsp.h"
-#include "vlc.h"
 
 #include "mpc.h"
 #include "mpc7data.h"
 
-#define BANDS            32
-#define SAMPLES_PER_BAND 36
-#define MPC_FRAME_SIZE   (BANDS * SAMPLES_PER_BAND)
-
 static VLC scfi_vlc, dscf_vlc, hdr_vlc, quant_vlc[MPC7_QUANT_VLC_TABLES][2];
 
 static const uint16_t quant_offsets[MPC7_QUANT_VLC_TABLES*2 + 1] =
@@ -53,9 +47,9 @@ static const uint16_t quant_offsets[MPC7_QUANT_VLC_TABLES*2 + 1] =
 
 static av_cold int mpc7_decode_init(AVCodecContext * avctx)
 {
-    int i, j;
+    int i, j, ret;
     MPCContext *c = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
     LOCAL_ALIGNED_16(uint8_t, buf, [16]);
     static int vlc_initialized = 0;
 
@@ -72,7 +66,7 @@ static av_cold int mpc7_decode_init(AVCodecContext * avctx)
 
     if(avctx->extradata_size < 16){
         av_log(avctx, AV_LOG_ERROR, "Too small extradata size (%i)!\n", avctx->extradata_size);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     memset(c->oldDSCF, 0, sizeof(c->oldDSCF));
     av_lfg_init(&c->rnd, 0xDEADBEEF);
@@ -80,18 +74,18 @@ static av_cold int mpc7_decode_init(AVCodecContext * avctx)
     ff_mpadsp_init(&c->mpadsp);
     c->bdsp.bswap_buf((uint32_t *) buf, (const uint32_t *) avctx->extradata, 4);
     ff_mpc_init();
-    bitstream_init(&bc, buf, 128);
+    init_get_bits(&gb, buf, 128);
 
-    c->IS       = bitstream_read_bit(&bc);
-    c->MSS      = bitstream_read_bit(&bc);
-    c->maxbands = bitstream_read(&bc, 6);
+    c->IS = get_bits1(&gb);
+    c->MSS = get_bits1(&gb);
+    c->maxbands = get_bits(&gb, 6);
     if(c->maxbands >= BANDS){
         av_log(avctx, AV_LOG_ERROR, "Too many bands: %i\n", c->maxbands);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
-    bitstream_skip(&bc, 88);
-    c->gapless      = bitstream_read_bit(&bc);
-    c->lastframelen = bitstream_read(&bc, 11);
+    skip_bits_long(&gb, 88);
+    c->gapless = get_bits1(&gb);
+    c->lastframelen = get_bits(&gb, 11);
     av_log(avctx, AV_LOG_DEBUG, "IS: %d, MSS: %d, TG: %d, LFL: %d, bands: %d\n",
             c->IS, c->MSS, c->gapless, c->lastframelen, c->maxbands);
     c->frames_to_skip = 0;
@@ -103,37 +97,37 @@ static av_cold int mpc7_decode_init(AVCodecContext * avctx)
     av_log(avctx, AV_LOG_DEBUG, "Initing VLC\n");
     scfi_vlc.table = scfi_table;
     scfi_vlc.table_allocated = 1 << MPC7_SCFI_BITS;
-    if(init_vlc(&scfi_vlc, MPC7_SCFI_BITS, MPC7_SCFI_SIZE,
+    if ((ret = init_vlc(&scfi_vlc, MPC7_SCFI_BITS, MPC7_SCFI_SIZE,
                 &mpc7_scfi[1], 2, 1,
-                &mpc7_scfi[0], 2, 1, INIT_VLC_USE_NEW_STATIC)){
+                &mpc7_scfi[0], 2, 1, INIT_VLC_USE_NEW_STATIC))) {
         av_log(avctx, AV_LOG_ERROR, "Cannot init SCFI VLC\n");
-        return -1;
+        return ret;
     }
     dscf_vlc.table = dscf_table;
     dscf_vlc.table_allocated = 1 << MPC7_DSCF_BITS;
-    if(init_vlc(&dscf_vlc, MPC7_DSCF_BITS, MPC7_DSCF_SIZE,
+    if ((ret = init_vlc(&dscf_vlc, MPC7_DSCF_BITS, MPC7_DSCF_SIZE,
                 &mpc7_dscf[1], 2, 1,
-                &mpc7_dscf[0], 2, 1, INIT_VLC_USE_NEW_STATIC)){
+                &mpc7_dscf[0], 2, 1, INIT_VLC_USE_NEW_STATIC))) {
         av_log(avctx, AV_LOG_ERROR, "Cannot init DSCF VLC\n");
-        return -1;
+        return ret;
     }
     hdr_vlc.table = hdr_table;
     hdr_vlc.table_allocated = 1 << MPC7_HDR_BITS;
-    if(init_vlc(&hdr_vlc, MPC7_HDR_BITS, MPC7_HDR_SIZE,
+    if ((ret = init_vlc(&hdr_vlc, MPC7_HDR_BITS, MPC7_HDR_SIZE,
                 &mpc7_hdr[1], 2, 1,
-                &mpc7_hdr[0], 2, 1, INIT_VLC_USE_NEW_STATIC)){
+                &mpc7_hdr[0], 2, 1, INIT_VLC_USE_NEW_STATIC))) {
         av_log(avctx, AV_LOG_ERROR, "Cannot init HDR VLC\n");
-        return -1;
+        return ret;
     }
     for(i = 0; i < MPC7_QUANT_VLC_TABLES; i++){
         for(j = 0; j < 2; j++){
             quant_vlc[i][j].table = &quant_tables[quant_offsets[i*2 + j]];
             quant_vlc[i][j].table_allocated = quant_offsets[i*2 + j + 1] - quant_offsets[i*2 + j];
-            if(init_vlc(&quant_vlc[i][j], 9, mpc7_quant_vlc_sizes[i],
+            if ((ret = init_vlc(&quant_vlc[i][j], 9, mpc7_quant_vlc_sizes[i],
                         &mpc7_quant_vlc[i][j][1], 4, 2,
-                        &mpc7_quant_vlc[i][j][0], 4, 2, INIT_VLC_USE_NEW_STATIC)){
+                        &mpc7_quant_vlc[i][j][0], 4, 2, INIT_VLC_USE_NEW_STATIC))) {
                 av_log(avctx, AV_LOG_ERROR, "Cannot init QUANT VLC %i,%i\n",i,j);
-                return -1;
+                return ret;
             }
         }
     }
@@ -145,7 +139,7 @@ static av_cold int mpc7_decode_init(AVCodecContext * avctx)
 /**
  * Fill samples for given subband
  */
-static inline void idx_to_quant(MPCContext *c, BitstreamContext *bc, int idx, int *dst)
+static inline void idx_to_quant(MPCContext *c, GetBitContext *gb, int idx, int *dst)
 {
     int i, i1, t;
     switch(idx){
@@ -155,44 +149,44 @@ static inline void idx_to_quant(MPCContext *c, BitstreamContext *bc, int idx, in
         }
         break;
     case 1:
-        i1 = bitstream_read_bit(bc);
+        i1 = get_bits1(gb);
         for(i = 0; i < SAMPLES_PER_BAND/3; i++){
-            t = bitstream_read_vlc(bc, quant_vlc[0][i1].table, 9, 2);
+            t = get_vlc2(gb, quant_vlc[0][i1].table, 9, 2);
             *dst++ = mpc7_idx30[t];
             *dst++ = mpc7_idx31[t];
             *dst++ = mpc7_idx32[t];
         }
         break;
     case 2:
-        i1 = bitstream_read_bit(bc);
+        i1 = get_bits1(gb);
         for(i = 0; i < SAMPLES_PER_BAND/2; i++){
-            t = bitstream_read_vlc(bc, quant_vlc[1][i1].table, 9, 2);
+            t = get_vlc2(gb, quant_vlc[1][i1].table, 9, 2);
             *dst++ = mpc7_idx50[t];
             *dst++ = mpc7_idx51[t];
         }
         break;
     case  3: case  4: case  5: case  6: case  7:
-        i1 = bitstream_read_bit(bc);
+        i1 = get_bits1(gb);
         for(i = 0; i < SAMPLES_PER_BAND; i++)
-            *dst++ = bitstream_read_vlc(bc, quant_vlc[idx - 1][i1].table, 9, 2) - mpc7_quant_vlc_off[idx - 1];
+            *dst++ = get_vlc2(gb, quant_vlc[idx-1][i1].table, 9, 2) - mpc7_quant_vlc_off[idx-1];
         break;
     case  8: case  9: case 10: case 11: case 12:
     case 13: case 14: case 15: case 16: case 17:
         t = (1 << (idx - 2)) - 1;
         for(i = 0; i < SAMPLES_PER_BAND; i++)
-            *dst++ = bitstream_read(bc, idx - 1) - t;
+            *dst++ = get_bits(gb, idx - 1) - t;
         break;
     default: // case 0 and -2..-17
         return;
     }
 }
 
-static int get_scale_idx(BitstreamContext *bc, int ref)
+static int get_scale_idx(GetBitContext *gb, int ref)
 {
-    int t = bitstream_read_vlc(bc, dscf_vlc.table, MPC7_DSCF_BITS, 1) - 7;
+    int t = get_vlc2(gb, dscf_vlc.table, MPC7_DSCF_BITS, 1) - 7;
     if (t == 8)
-        return bitstream_read(bc, 6);
-    return av_clip_uintp2(ref + t, 7);
+        return get_bits(gb, 6);
+    return ref + t;
 }
 
 static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
@@ -202,7 +196,7 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
     const uint8_t *buf = avpkt->data;
     int buf_size;
     MPCContext *c = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
     int i, ch;
     int mb = -1;
     Band *bands = c->bands;
@@ -228,60 +222,59 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
     buf_size  -= 4;
 
     /* get output buffer */
-    frame->nb_samples = last_frame ? c->lastframelen : MPC_FRAME_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = MPC_FRAME_SIZE;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     av_fast_padded_malloc(&c->bits, &c->buf_size, buf_size);
     if (!c->bits)
         return AVERROR(ENOMEM);
     c->bdsp.bswap_buf((uint32_t *) c->bits, (const uint32_t *) buf,
                       buf_size >> 2);
-    bitstream_init8(&bc, c->bits, buf_size);
-    bitstream_skip(&bc, skip);
+    if ((ret = init_get_bits8(&gb, c->bits, buf_size)) < 0)
+        return ret;
+    skip_bits_long(&gb, skip);
 
     /* read subband indexes */
     for(i = 0; i <= c->maxbands; i++){
         for(ch = 0; ch < 2; ch++){
             int t = 4;
-            if (i)
-                t = bitstream_read_vlc(&bc, hdr_vlc.table, MPC7_HDR_BITS, 1) - 5;
-            if (t == 4)
-                bands[i].res[ch] = bitstream_read(&bc, 4);
-            else bands[i].res[ch] = av_clip(bands[i-1].res[ch] + t, 0, 17);
+            if(i) t = get_vlc2(&gb, hdr_vlc.table, MPC7_HDR_BITS, 1) - 5;
+            if(t == 4) bands[i].res[ch] = get_bits(&gb, 4);
+            else bands[i].res[ch] = bands[i-1].res[ch] + t;
+            if (bands[i].res[ch] < -1 || bands[i].res[ch] > 17) {
+                av_log(avctx, AV_LOG_ERROR, "subband index invalid\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
 
         if(bands[i].res[0] || bands[i].res[1]){
             mb = i;
-            if (c->MSS)
-                bands[i].msf = bitstream_read_bit(&bc);
+            if(c->MSS) bands[i].msf = get_bits1(&gb);
         }
     }
     /* get scale indexes coding method */
     for(i = 0; i <= mb; i++)
         for(ch = 0; ch < 2; ch++)
-            if (bands[i].res[ch])
-                bands[i].scfi[ch] = bitstream_read_vlc(&bc, scfi_vlc.table, MPC7_SCFI_BITS, 1);
+            if(bands[i].res[ch]) bands[i].scfi[ch] = get_vlc2(&gb, scfi_vlc.table, MPC7_SCFI_BITS, 1);
     /* get scale indexes */
     for(i = 0; i <= mb; i++){
         for(ch = 0; ch < 2; ch++){
             if(bands[i].res[ch]){
                 bands[i].scf_idx[ch][2] = c->oldDSCF[ch][i];
-                bands[i].scf_idx[ch][0] = get_scale_idx(&bc, bands[i].scf_idx[ch][2]);
+                bands[i].scf_idx[ch][0] = get_scale_idx(&gb, bands[i].scf_idx[ch][2]);
                 switch(bands[i].scfi[ch]){
                 case 0:
-                    bands[i].scf_idx[ch][1] = get_scale_idx(&bc, bands[i].scf_idx[ch][0]);
-                    bands[i].scf_idx[ch][2] = get_scale_idx(&bc, bands[i].scf_idx[ch][1]);
+                    bands[i].scf_idx[ch][1] = get_scale_idx(&gb, bands[i].scf_idx[ch][0]);
+                    bands[i].scf_idx[ch][2] = get_scale_idx(&gb, bands[i].scf_idx[ch][1]);
                     break;
                 case 1:
-                    bands[i].scf_idx[ch][1] = get_scale_idx(&bc, bands[i].scf_idx[ch][0]);
+                    bands[i].scf_idx[ch][1] = get_scale_idx(&gb, bands[i].scf_idx[ch][0]);
                     bands[i].scf_idx[ch][2] = bands[i].scf_idx[ch][1];
                     break;
                 case 2:
                     bands[i].scf_idx[ch][1] = bands[i].scf_idx[ch][0];
-                    bands[i].scf_idx[ch][2] = get_scale_idx(&bc, bands[i].scf_idx[ch][1]);
+                    bands[i].scf_idx[ch][2] = get_scale_idx(&gb, bands[i].scf_idx[ch][1]);
                     break;
                 case 3:
                     bands[i].scf_idx[ch][2] = bands[i].scf_idx[ch][1] = bands[i].scf_idx[ch][0];
@@ -296,15 +289,17 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
     off = 0;
     for(i = 0; i < BANDS; i++, off += SAMPLES_PER_BAND)
         for(ch = 0; ch < 2; ch++)
-            idx_to_quant(c, &bc, bands[i].res[ch], c->Q[ch] + off);
+            idx_to_quant(c, &gb, bands[i].res[ch], c->Q[ch] + off);
 
     ff_mpc_dequantize_and_synth(c, mb, (int16_t **)frame->extended_data, 2);
+    if(last_frame)
+        frame->nb_samples = c->lastframelen;
 
-    bits_used = bitstream_tell(&bc);
+    bits_used = get_bits_count(&gb);
     bits_avail = buf_size * 8;
     if (!last_frame && ((bits_avail < bits_used) || (bits_used + 32 <= bits_avail))) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding frame: used %i of %i bits\n", bits_used, bits_avail);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     if(c->frames_to_skip){
         c->frames_to_skip--;
diff --git a/libavcodec/mpc7data.h b/libavcodec/mpc7data.h
index f205ffe..5609e8f 100644
--- a/libavcodec/mpc7data.h
+++ b/libavcodec/mpc7data.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc8.c b/libavcodec/mpc8.c
index 649eb02..3be2f79 100644
--- a/libavcodec/mpc8.c
+++ b/libavcodec/mpc8.c
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,10 @@
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/lfg.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mpegaudiodsp.h"
-#include "vlc.h"
 
 #include "mpc.h"
 #include "mpc8data.h"
@@ -44,22 +42,22 @@ static VLC q1_vlc, q2_vlc[2], q3_vlc[2], quant_vlc[4][2], q9up_vlc;
 static const int q3_offsets[2] = { MPC8_Q3_OFFSET, MPC8_Q4_OFFSET };
 static const int quant_offsets[6] = { MPC8_Q5_OFFSET, MPC8_Q6_OFFSET, MPC8_Q7_OFFSET, MPC8_Q8_OFFSET };
 
-static inline int mpc8_dec_base(BitstreamContext *bc, int k, int n)
+static inline int mpc8_dec_base(GetBitContext *gb, int k, int n)
 {
     int len = mpc8_cnk_len[k-1][n-1] - 1;
-    int code = len ? bitstream_read(bc, len) : 0;
+    int code = len ? get_bits_long(gb, len) : 0;
 
     if (code >= mpc8_cnk_lost[k-1][n-1])
-        code = ((code << 1) | bitstream_read_bit(bc)) - mpc8_cnk_lost[k - 1][n - 1];
+        code = ((code << 1) | get_bits1(gb)) - mpc8_cnk_lost[k-1][n-1];
 
     return code;
 }
 
-static inline int mpc8_dec_enum(BitstreamContext *bc, int k, int n)
+static inline int mpc8_dec_enum(GetBitContext *gb, int k, int n)
 {
     int bits = 0;
     const uint32_t * C = mpc8_cnk[k-1];
-    int code = mpc8_dec_base(bc, k, n);
+    int code = mpc8_dec_base(gb, k, n);
 
     do {
         n--;
@@ -74,18 +72,18 @@ static inline int mpc8_dec_enum(BitstreamContext *bc, int k, int n)
     return bits;
 }
 
-static inline int mpc8_get_mod_golomb(BitstreamContext *bc, int m)
+static inline int mpc8_get_mod_golomb(GetBitContext *gb, int m)
 {
     if(mpc8_cnk_len[0][m] < 1) return 0;
-    return mpc8_dec_base(bc, 1, m + 1);
+    return mpc8_dec_base(gb, 1, m+1);
 }
 
-static int mpc8_get_mask(BitstreamContext *bc, int size, int t)
+static int mpc8_get_mask(GetBitContext *gb, int size, int t)
 {
     int mask = 0;
 
     if(t && t != size)
-         mask = mpc8_dec_enum(bc, FFMIN(t, size - t), size);
+         mask = mpc8_dec_enum(gb, FFMIN(t, size - t), size);
     if((t << 1) > size) mask = ~mask;
 
     return mask;
@@ -99,7 +97,7 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
 {
     int i;
     MPCContext *c = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
     static int vlc_initialized = 0;
     int channels;
 
@@ -124,20 +122,25 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
 
     ff_mpc_init();
 
-    bitstream_init(&bc, avctx->extradata, 16);
+    init_get_bits(&gb, avctx->extradata, 16);
 
-    bitstream_skip(&bc, 3); // sample rate
-    c->maxbands = bitstream_read(&bc, 5) + 1;
-    channels    = bitstream_read(&bc, 4) + 1;
+    skip_bits(&gb, 3);//sample rate
+    c->maxbands = get_bits(&gb, 5) + 1;
+    if (c->maxbands >= BANDS) {
+        av_log(avctx,AV_LOG_ERROR, "maxbands %d too high\n", c->maxbands);
+        return AVERROR_INVALIDDATA;
+    }
+    channels = get_bits(&gb, 4) + 1;
     if (channels > 2) {
         avpriv_request_sample(avctx, "Multichannel MPC SV8");
         return AVERROR_PATCHWELCOME;
     }
-    c->MSS    = bitstream_read_bit(&bc);
-    c->frames = 1 << (bitstream_read(&bc, 3) * 2);
+    c->MSS = get_bits1(&gb);
+    c->frames = 1 << (get_bits(&gb, 3) * 2);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
-    avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
+    avctx->channel_layout = (channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
+    avctx->channels = channels;
 
     if(vlc_initialized) return 0;
     av_log(avctx, AV_LOG_DEBUG, "Initing VLC\n");
@@ -240,37 +243,40 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     MPCContext *c = avctx->priv_data;
-    BitstreamContext bc2, *bc = &bc2;
+    GetBitContext gb2, *gb = &gb2;
     int i, j, k, ch, cnt, res, t;
     Band *bands = c->bands;
     int off;
     int maxband, keyframe;
     int last[2];
 
-    /* get output buffer */
-    frame->nb_samples = MPC_FRAME_SIZE;
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return res;
-    }
-
     keyframe = c->cur_frame == 0;
 
     if(keyframe){
         memset(c->Q, 0, sizeof(c->Q));
         c->last_bits_used = 0;
     }
-    bitstream_init8(bc, buf, buf_size);
-    bitstream_skip(bc, c->last_bits_used & 7);
+    if ((res = init_get_bits8(gb, buf, buf_size)) < 0)
+        return res;
+
+    skip_bits(gb, c->last_bits_used & 7);
 
     if(keyframe)
-        maxband = mpc8_get_mod_golomb(bc, c->maxbands + 1);
+        maxband = mpc8_get_mod_golomb(gb, c->maxbands + 1);
     else{
-        maxband = c->last_max_band + bitstream_read_vlc(bc, band_vlc.table, MPC8_BANDS_BITS, 2);
+        maxband = c->last_max_band + get_vlc2(gb, band_vlc.table, MPC8_BANDS_BITS, 2);
         if(maxband > 32) maxband -= 33;
     }
-    if(maxband > c->maxbands + 1)
+
+    if (get_bits_left(gb) < 0) {
+        *got_frame_ptr = 0;
+        return buf_size;
+    }
+
+    if(maxband > c->maxbands + 1) {
+        av_log(avctx, AV_LOG_ERROR, "maxband %d too large\n",maxband);
         return AVERROR_INVALIDDATA;
+    }
     c->last_max_band = maxband;
 
     /* read subband indexes */
@@ -278,7 +284,7 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
         last[0] = last[1] = 0;
         for(i = maxband - 1; i >= 0; i--){
             for(ch = 0; ch < 2; ch++){
-                last[ch] = bitstream_read_vlc(bc, res_vlc[last[ch] > 2].table, MPC8_RES_BITS, 2) + last[ch];
+                last[ch] = get_vlc2(gb, res_vlc[last[ch] > 2].table, MPC8_RES_BITS, 2) + last[ch];
                 if(last[ch] > 15) last[ch] -= 17;
                 bands[i].res[ch] = last[ch];
             }
@@ -290,8 +296,8 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
             for(i = 0; i < maxband; i++)
                 if(bands[i].res[0] || bands[i].res[1])
                     cnt++;
-            t = mpc8_get_mod_golomb(bc, cnt);
-            mask = mpc8_get_mask(bc, cnt, t);
+            t = mpc8_get_mod_golomb(gb, cnt);
+            mask = mpc8_get_mask(gb, cnt, t);
             for(i = maxband - 1; i >= 0; i--)
                 if(bands[i].res[0] || bands[i].res[1]){
                     bands[i].msf = mask & 1;
@@ -311,7 +317,7 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
         if(bands[i].res[0] || bands[i].res[1]){
             cnt = !!bands[i].res[0] + !!bands[i].res[1] - 1;
             if(cnt >= 0){
-                t = bitstream_read_vlc(bc, scfi_vlc[cnt].table, scfi_vlc[cnt].bits, 1);
+                t = get_vlc2(gb, scfi_vlc[cnt].table, scfi_vlc[cnt].bits, 1);
                 if(bands[i].res[0]) bands[i].scfi[0] = t >> (2 * cnt);
                 if(bands[i].res[1]) bands[i].scfi[1] = t & 3;
             }
@@ -323,21 +329,21 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
             if(!bands[i].res[ch]) continue;
 
             if(c->oldDSCF[ch][i]){
-                bands[i].scf_idx[ch][0] = bitstream_read(bc, 7) - 6;
+                bands[i].scf_idx[ch][0] = get_bits(gb, 7) - 6;
                 c->oldDSCF[ch][i] = 0;
             }else{
-                t = bitstream_read_vlc(bc, dscf_vlc[1].table, MPC8_DSCF1_BITS, 2);
+                t = get_vlc2(gb, dscf_vlc[1].table, MPC8_DSCF1_BITS, 2);
                 if(t == 64)
-                    t += bitstream_read(bc, 6);
+                    t += get_bits(gb, 6);
                 bands[i].scf_idx[ch][0] = ((bands[i].scf_idx[ch][2] + t - 25) & 0x7F) - 6;
             }
             for(j = 0; j < 2; j++){
                 if((bands[i].scfi[ch] << j) & 2)
                     bands[i].scf_idx[ch][j + 1] = bands[i].scf_idx[ch][j];
                 else{
-                    t = bitstream_read_vlc(bc, dscf_vlc[0].table, MPC8_DSCF0_BITS, 2);
+                    t = get_vlc2(gb, dscf_vlc[0].table, MPC8_DSCF0_BITS, 2);
                     if(t == 31)
-                        t = 64 + bitstream_read(bc, 6);
+                        t = 64 + get_bits(gb, 6);
                     bands[i].scf_idx[ch][j + 1] = ((bands[i].scf_idx[ch][j] + t - 25) & 0x7F) - 6;
                 }
             }
@@ -356,16 +362,16 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
                 break;
             case 1:
                 for(j = 0; j < SAMPLES_PER_BAND; j += SAMPLES_PER_BAND / 2){
-                    cnt = bitstream_read_vlc(bc, q1_vlc.table, MPC8_Q1_BITS, 2);
-                    t = mpc8_get_mask(bc, 18, cnt);
+                    cnt = get_vlc2(gb, q1_vlc.table, MPC8_Q1_BITS, 2);
+                    t = mpc8_get_mask(gb, 18, cnt);
                     for(k = 0; k < SAMPLES_PER_BAND / 2; k++, t <<= 1)
-                        c->Q[ch][off + j + k] = (t & 0x20000) ? (bitstream_read_bit(bc) << 1) - 1 : 0;
+                        c->Q[ch][off + j + k] = (t & 0x20000) ? (get_bits1(gb) << 1) - 1 : 0;
                 }
                 break;
             case 2:
                 cnt = 6;//2*mpc8_thres[res]
                 for(j = 0; j < SAMPLES_PER_BAND; j += 3){
-                    t = bitstream_read_vlc(bc, q2_vlc[cnt > 3].table, MPC8_Q2_BITS, 2);
+                    t = get_vlc2(gb, q2_vlc[cnt > 3].table, MPC8_Q2_BITS, 2);
                     c->Q[ch][off + j + 0] = mpc8_idx50[t];
                     c->Q[ch][off + j + 1] = mpc8_idx51[t];
                     c->Q[ch][off + j + 2] = mpc8_idx52[t];
@@ -375,7 +381,7 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
             case 3:
             case 4:
                 for(j = 0; j < SAMPLES_PER_BAND; j += 2){
-                    t = bitstream_read_vlc(bc, q3_vlc[res - 3].table, MPC8_Q3_BITS, 2) + q3_offsets[res - 3];
+                    t = get_vlc2(gb, q3_vlc[res - 3].table, MPC8_Q3_BITS, 2) + q3_offsets[res - 3];
                     c->Q[ch][off + j + 1] = t >> 4;
                     c->Q[ch][off + j + 0] = (t & 8) ? (t & 0xF) - 16 : (t & 0xF);
                 }
@@ -386,17 +392,17 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
             case 8:
                 cnt = 2 * mpc8_thres[res];
                 for(j = 0; j < SAMPLES_PER_BAND; j++){
-                    t = bitstream_read_vlc(bc, quant_vlc[res - 5][cnt > mpc8_thres[res]].table, quant_vlc[res - 5][cnt > mpc8_thres[res]].bits, 2) + quant_offsets[res - 5];
+                    t = get_vlc2(gb, quant_vlc[res - 5][cnt > mpc8_thres[res]].table, quant_vlc[res - 5][cnt > mpc8_thres[res]].bits, 2) + quant_offsets[res - 5];
                     c->Q[ch][off + j] = t;
                     cnt = (cnt >> 1) + FFABS(c->Q[ch][off + j]);
                 }
                 break;
             default:
                 for(j = 0; j < SAMPLES_PER_BAND; j++){
-                    c->Q[ch][off + j] = bitstream_read_vlc(bc, q9up_vlc.table, MPC8_Q9UP_BITS, 2);
+                    c->Q[ch][off + j] = get_vlc2(gb, q9up_vlc.table, MPC8_Q9UP_BITS, 2);
                     if(res != 9){
                         c->Q[ch][off + j] <<= res - 9;
-                        c->Q[ch][off + j] |= bitstream_read(bc, res - 9);
+                        c->Q[ch][off + j] |= get_bits(gb, res - 9);
                     }
                     c->Q[ch][off + j] -= (1 << (res - 2)) - 1;
                 }
@@ -404,17 +410,25 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
         }
     }
 
+    frame->nb_samples = MPC_FRAME_SIZE;
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
+        return res;
+
     ff_mpc_dequantize_and_synth(c, maxband - 1,
                                 (int16_t **)frame->extended_data,
                                 avctx->channels);
 
     c->cur_frame++;
 
-    c->last_bits_used = bitstream_tell(bc);
-    if (bitstream_bits_left(bc) < 8) // we have only padding left
-        c->last_bits_used = buf_size << 3;
+    c->last_bits_used = get_bits_count(gb);
     if(c->cur_frame >= c->frames)
         c->cur_frame = 0;
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Overread %d\n", -get_bits_left(gb));
+        c->last_bits_used = buf_size << 3;
+    } else if (c->cur_frame == 0 && get_bits_left(gb) < 8) {// we have only padding left
+        c->last_bits_used = buf_size << 3;
+    }
 
     *got_frame_ptr = 1;
 
diff --git a/libavcodec/mpc8data.h b/libavcodec/mpc8data.h
index 2940b30..22c2be4 100644
--- a/libavcodec/mpc8data.h
+++ b/libavcodec/mpc8data.h
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc8huff.h b/libavcodec/mpc8huff.h
index 6005e21..8491037 100644
--- a/libavcodec/mpc8huff.h
+++ b/libavcodec/mpc8huff.h
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpcdata.h b/libavcodec/mpcdata.h
index 15724f3..64fb4ab 100644
--- a/libavcodec/mpcdata.h
+++ b/libavcodec/mpcdata.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,7 @@ static const float mpc_CC[18+1] = {
     4.0002, 2.0001, 1.0000
 };
 
-static const float mpc_SCF[128+6] = {
-    920.016296386718750000, 766.355773925781250000, 638.359558105468750000,
-    531.741149902343750000, 442.930114746093750000, 368.952209472656250000,
+static const float mpc_SCF[256] = {
     307.330047607421875000, 255.999984741210937500, 213.243041992187500000, 177.627334594726562500,
     147.960128784179687500, 123.247924804687500000, 102.663139343261718750, 85.516410827636718750,
     71.233520507812500000, 59.336143493652343750, 49.425861358642578125, 41.170787811279296875,
@@ -64,7 +62,39 @@ static const float mpc_SCF[128+6] = {
     0.000000396931966407, 0.000000330636652279, 0.000000275413924555, 0.000000229414467867,
     0.000000191097811353, 0.000000159180785886, 0.000000132594522029, 0.000000110448674207,
     0.000000092001613439, 0.000000076635565449, 0.000000063835940978, 0.000000053174105119,
-    0.000000044293003043, 0.000000036895215771, 0.000000030733001921, 0.000000025599996789
+    0.000000044293003043, 0.000000036895215771, 0.000000030733001921, 0.000000025599996789,
+    0.000000021324305018, 3689522167600.270019531250000000, 3073300627835.926757812500000000, 2560000000000.002929687500000000,
+    2132430501800.519042968750000000, 1776273376956.721923828125000000, 1479601378343.250244140625000000, 1232479339720.794189453125000000,
+    1026631459710.774291992187500000, 855164155779.391845703125000000, 712335206965.024780273437500000, 593361454233.829101562500000000,
+    494258618594.112609863281250000, 411707872682.763122558593750000, 342944697476.612365722656250000, 285666302081.983886718750000000,
+    237954506209.446411132812500000, 198211502766.368713378906250000, 165106349338.563323974609375000, 137530396629.095306396484375000,
+    114560161209.611633300781250000, 95426399240.062576293945312500, 79488345475.196502685546875000, 66212254855.064872741699218750,
+    55153528064.816276550292968750, 45941822471.611343383789062500, 38268649822.956413269042968750, 31877045369.216873168945312500,
+    26552962442.420688629150390625, 22118104306.789615631103515625, 18423953228.829509735107421875, 15346796808.164905548095703125,
+    12783585007.291271209716796875, 10648479137.463939666748046875, 8869977230.669750213623046875, 7388519530.061036109924316406,
+    6154493909.785535812377929688, 5126574428.270387649536132812, 4270337375.232155323028564453, 3557108465.595236301422119141,
+    2963002574.315670013427734375, 2468123854.056322574615478516, 2055899448.676229715347290039, 1712524489.450022459030151367,
+    1426499787.649837732315063477, 1188246741.404872417449951172, 989786560.561257958412170410, 824473067.192597866058349609,
+    686770123.591610312461853027, 572066234.090648531913757324, 476520111.962911486625671387, 396932039.637152194976806641,
+    330636714.243810534477233887, 275413990.026798009872436523, 229414528.498330980539321899, 191097866.455478429794311523,
+    159180827.835415601730346680, 132594551.788319095969200134, 110448697.892960876226425171, 92001629.793398514389991760,
+    76635578.744844585657119751, 63835955.327594503760337830, 53174116.504741288721561432, 44293010.914454914629459381,
+    36895221.676002673804759979, 30733006.278359245508909225, 25600000.000000011175870895, 21324305.018005173653364182,
+    17762733.769567202776670456, 14796013.783432489261031151, 12324793.397207930684089661, 10266314.597107734531164169,
+    8551641.557793911546468735, 7123352.069650243036448956, 5933614.542338287457823753, 4942586.185941123403608799,
+    4117078.726827629376202822, 3429446.974766122177243233, 2856663.020819837693125010, 2379545.062094463035464287,
+    1982115.027663686312735081, 1651063.493385632522404194, 1375303.966290952404960990, 1145601.612096115713939071,
+    954263.992400625254958868, 794883.454751964658498764, 662122.548550648498348892, 551535.280648162588477135,
+    459418.224716113239992410, 382686.498229563992936164, 318770.453692168579436839, 265529.624424206791445613,
+    221181.043067896069260314, 184239.532288295013131574, 153467.968081648985389620, 127835.850072912653558888,
+    106484.791374639346031472, 88699.772306697457679547, 73885.195300610314006917, 61544.939097855312866159,
+    51265.744282703839417081, 42703.373752321524079889, 35571.084655952341563534, 29630.025743156678800005,
+    24681.238540563208516687, 20558.994486762283486314, 17125.244894500214286381, 14264.997876498367986642,
+    11882.467414048716818797, 9897.865605612574654515, 8244.730671925974093028, 6867.701235916098994494,
+    5720.662340906482313585, 4765.201119629112326948, 3969.320396371519564127, 3306.367142438103201130,
+    2754.139900267978191550, 2294.145284983308101801, 1910.978664554782881169, 1591.808278354154936096,
+    1325.945517883190177599, 1104.486978929608085309, 920.016297933984674273, 766.355787448445425980,
+    638.359553275944676898, 531.741165047412550848, 442.930109144548907807, 368.952216760026544762,
 };
 
 #endif /* AVCODEC_MPCDATA_H */
diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c
index c0c680d..ab6c19c 100644
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,12 @@
  * MPEG-1/2 decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/timecode.h"
+
 #include "internal.h"
 #include "avcodec.h"
 #include "mpegvideo.h"
@@ -62,24 +67,17 @@ static const uint8_t table_mb_btype[11][2] = {
     { 2, 5 }, // 0x1E MB_QUANT|MB_FOR|MB_BACK|MB_PAT
 };
 
-#define INIT_2D_VLC_RL(rl, static_size)\
-{\
-    static RL_VLC_ELEM rl_vlc_table[static_size];\
-    INIT_VLC_STATIC(&rl.vlc, TEX_VLC_BITS, rl.n + 2,\
-                    &rl.table_vlc[0][1], 4, 2,\
-                    &rl.table_vlc[0][0], 4, 2, static_size);\
-\
-    rl.rl_vlc[0] = rl_vlc_table;\
-    init_2d_vlc_rl(&rl);\
-}
-
-static av_cold void init_2d_vlc_rl(RLTable *rl)
+av_cold void ff_init_2d_vlc_rl(RLTable *rl, unsigned static_size, int flags)
 {
     int i;
-
-    for (i = 0; i < rl->vlc.table_size; i++) {
-        int code = rl->vlc.table[i][0];
-        int len  = rl->vlc.table[i][1];
+    VLC_TYPE table[680][2] = {{0}};
+    VLC vlc = { .table = table, .table_allocated = static_size };
+    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    init_vlc(&vlc, TEX_VLC_BITS, rl->n + 2, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC | flags);
+
+    for (i = 0; i < vlc.table_size; i++) {
+        int code = vlc.table[i][0];
+        int len  = vlc.table[i][1];
         int level, run;
 
         if (len == 0) { // illegal code
@@ -168,8 +166,8 @@ av_cold void ff_mpeg12_init_vlcs(void)
         ff_rl_init(&ff_rl_mpeg1, ff_mpeg12_static_rl_table_store[0]);
         ff_rl_init(&ff_rl_mpeg2, ff_mpeg12_static_rl_table_store[1]);
 
-        INIT_2D_VLC_RL(ff_rl_mpeg1, 680);
-        INIT_2D_VLC_RL(ff_rl_mpeg2, 674);
+        INIT_2D_VLC_RL(ff_rl_mpeg1, 680, 0);
+        INIT_2D_VLC_RL(ff_rl_mpeg2, 674, 0);
     }
 }
 
@@ -195,7 +193,7 @@ int ff_mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size,
 */
 
     for (i = 0; i < buf_size; i++) {
-        assert(pc->frame_start_found >= 0 && pc->frame_start_found <= 4);
+        av_assert1(pc->frame_start_found >= 0 && pc->frame_start_found <= 4);
         if (pc->frame_start_found & 1) {
             if (state == EXT_START_CODE && (buf[i] & 0xF0) != 0x80)
                 pc->frame_start_found--;
@@ -229,7 +227,7 @@ int ff_mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size,
                 }
             }
             if (pc->frame_start_found == 0 && s && state == PICTURE_START_CODE) {
-                ff_fetch_timestamp(s, i - 3, 1);
+                ff_fetch_timestamp(s, i - 3, 1, i > 3);
             }
         }
     }
@@ -262,16 +260,18 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
 
     {
         OPEN_READER(re, gb);
+        UPDATE_CACHE(re, gb);
+        if (((int32_t)GET_CACHE(re, gb)) <= (int32_t)0xBFFFFFFF)
+            goto end;
+
         /* now quantify & encode AC coefficients */
         while (1) {
             int level, run, j;
 
-            UPDATE_CACHE(re, gb);
-            GET_RL_VLC(level, run, re, gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            GET_RL_VLC(level, run, re, gb, rl->rl_vlc[0],
+                       TEX_VLC_BITS, 2, 0);
 
-            if (level == 127) {
-                break;
-            } else if (level != 0) {
+            if (level != 0) {
                 i += run;
                 if (i > MAX_INDEX)
                     break;
@@ -281,7 +281,7 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
                 level = (level - 1) | 1;
                 level = (level ^ SHOW_SBITS(re, gb, 1)) -
                         SHOW_SBITS(re, gb, 1);
-                LAST_SKIP_BITS(re, gb, 1);
+                SKIP_BITS(re, gb, 1);
             } else {
                 /* escape */
                 run = SHOW_UBITS(re, gb, 6) + 1;
@@ -292,10 +292,10 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
 
                 if (level == -128) {
                     level = SHOW_UBITS(re, gb, 8) - 256;
-                    LAST_SKIP_BITS(re, gb, 8);
+                    SKIP_BITS(re, gb, 8);
                 } else if (level == 0) {
                     level = SHOW_UBITS(re, gb, 8);
-                    LAST_SKIP_BITS(re, gb, 8);
+                    SKIP_BITS(re, gb, 8);
                 }
 
                 i += run;
@@ -315,7 +315,13 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
             }
 
             block[j] = level;
+            if (((int32_t)GET_CACHE(re, gb)) <= (int32_t)0xBFFFFFFF)
+               break;
+
+            UPDATE_CACHE(re, gb);
         }
+end:
+        LAST_SKIP_BITS(re, gb, 2);
         CLOSE_READER(re, gb);
     }
 
diff --git a/libavcodec/mpeg12.h b/libavcodec/mpeg12.h
index 26de7a4..1ec99f1 100644
--- a/libavcodec/mpeg12.h
+++ b/libavcodec/mpeg12.h
@@ -2,20 +2,20 @@
  * MPEG-1/2 common code
  * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,15 @@ extern uint8_t ff_mpeg12_static_rl_table_store[2][2][2*MAX_RUN + MAX_LEVEL + 3];
 
 void ff_mpeg12_common_init(MpegEncContext *s);
 
+#define INIT_2D_VLC_RL(rl, static_size, flags)\
+{\
+    static RL_VLC_ELEM rl_vlc_table[static_size];\
+    rl.rl_vlc[0] = rl_vlc_table;\
+    ff_init_2d_vlc_rl(&rl, static_size, flags);\
+}
+
+void ff_init_2d_vlc_rl(RLTable *rl, unsigned static_size, int flags);
+
 static inline int decode_dc(GetBitContext *gb, int component)
 {
     int code, diff;
diff --git a/libavcodec/mpeg12data.c b/libavcodec/mpeg12data.c
index acb2bc3..4da96d7 100644
--- a/libavcodec/mpeg12data.c
+++ b/libavcodec/mpeg12data.c
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 #include "mpeg12data.h"
 
-const uint16_t ff_mpeg1_default_intra_matrix[64] = {
+const uint16_t ff_mpeg1_default_intra_matrix[256] = {
         8, 16, 19, 22, 26, 27, 29, 34,
         16, 16, 22, 24, 27, 29, 34, 37,
         19, 22, 26, 27, 29, 34, 34, 38,
@@ -305,6 +305,72 @@ const uint8_t ff_mpeg12_mbMotionVectorTable[17][2] = {
 { 0xc, 10 },
 };
 
+const AVRational ff_mpeg2_frame_rate_tab[] = {
+    {      1,     1},
+    {      2,     1},
+    {      3,     1},
+    {      4,     1},
+    {      5,     1},
+    {      6,     1},
+    {      8,     1},
+    {      9,     1},
+    {     10,     1},
+    {     12,     1},
+    {     15,     1},
+    {     16,     1},
+    {     18,     1},
+    {     20,     1},
+    {     24,     1},
+    {     25,     1},
+    {     30,     1},
+    {     32,     1},
+    {     36,     1},
+    {     40,     1},
+    {     45,     1},
+    {     48,     1},
+    {     50,     1},
+    {     60,     1},
+    {     72,     1},
+    {     75,     1},
+    {     80,     1},
+    {     90,     1},
+    {     96,     1},
+    {    100,     1},
+    {    120,     1},
+    {    150,     1},
+    {    180,     1},
+    {    200,     1},
+    {    240,     1},
+    {    750,  1001},
+    {    800,  1001},
+    {    960,  1001},
+    {   1000,  1001},
+    {   1200,  1001},
+    {   1250,  1001},
+    {   1500,  1001},
+    {   1600,  1001},
+    {   1875,  1001},
+    {   2000,  1001},
+    {   2400,  1001},
+    {   2500,  1001},
+    {   3000,  1001},
+    {   3750,  1001},
+    {   4000,  1001},
+    {   4800,  1001},
+    {   5000,  1001},
+    {   6000,  1001},
+    {   7500,  1001},
+    {   8000,  1001},
+    {  10000,  1001},
+    {  12000,  1001},
+    {  15000,  1001},
+    {  20000,  1001},
+    {  24000,  1001},
+    {  30000,  1001},
+    {  60000,  1001},
+    {      0,     0},
+};
+
 const float ff_mpeg1_aspect[16]={
     0.0000,
     1.0000,
diff --git a/libavcodec/mpeg12data.h b/libavcodec/mpeg12data.h
index c6750b8..f51faf4 100644
--- a/libavcodec/mpeg12data.h
+++ b/libavcodec/mpeg12data.h
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/rational.h"
 #include "rl.h"
 
-extern const uint16_t ff_mpeg1_default_intra_matrix[64];
+extern const uint16_t ff_mpeg1_default_intra_matrix[];
 extern const uint16_t ff_mpeg1_default_non_intra_matrix[64];
 
 extern const uint16_t ff_mpeg12_vlc_dc_lum_code[12];
@@ -49,6 +49,7 @@ extern const uint8_t ff_mpeg12_mbPatTable[64][2];
 extern const uint8_t ff_mpeg12_mbMotionVectorTable[17][2];
 
 extern const AVRational ff_mpeg12_frame_rate_tab[];
+extern const AVRational ff_mpeg2_frame_rate_tab[];
 
 extern const float ff_mpeg1_aspect[16];
 extern const AVRational ff_mpeg2_aspect[16];
diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
index 532934c..83e5378 100644
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -1,22 +1,22 @@
 /*
  * MPEG-1/2 decoder
  * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,11 @@
  * MPEG-1/2 decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/stereo3d.h"
 
@@ -46,6 +48,7 @@
 #include "profiles.h"
 #include "thread.h"
 #include "version.h"
+#include "xvmc_internal.h"
 
 typedef struct Mpeg1Context {
     MpegEncContext mpeg_enc_ctx;
@@ -59,11 +62,11 @@ typedef struct Mpeg1Context {
     uint8_t afd;
     int has_afd;
     int slice_count;
-    int save_aspect_info;
+    AVRational save_aspect;
     int save_width, save_height, save_progressive_seq;
     AVRational frame_rate_ext;  /* MPEG-2 specific framerate modificator */
     int sync;                   /* Did we reach a sync point like a GOP/SEQ/KEYFrame? */
-    int closed_gop;             /* GOP is closed */
+    int tmpgexs;
     int first_slice;
     int extradata_decoded;
 } Mpeg1Context;
@@ -94,13 +97,6 @@ static const uint32_t btype2mb_type[11] = {
     MB_TYPE_QUANT | MB_TYPE_L0L1 | MB_TYPE_CBP,
 };
 
-static const uint8_t non_linear_qscale[32] = {
-     0,  1,  2,  3,  4,  5,   6,   7,
-     8, 10, 12, 14, 16, 18,  20,  22,
-    24, 28, 32, 36, 40, 44,  48,  52,
-    56, 64, 72, 80, 88, 96, 104, 112,
-};
-
 /* as H.263, but only 17 codes */
 static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred)
 {
@@ -223,6 +219,11 @@ end:
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg1_fast_decode_block_inter(MpegEncContext *s,
                                                 int16_t *block, int n)
 {
@@ -394,6 +395,11 @@ end:
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
                                                     int16_t *block, int n)
 {
@@ -451,8 +457,9 @@ static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
         }
 
         block[j] = level;
-        if (((int32_t) GET_CACHE(re, &s->gb)) <= (int32_t) 0xBFFFFFFF)
+        if (((int32_t) GET_CACHE(re, &s->gb)) <= (int32_t) 0xBFFFFFFF || i > 63)
             break;
+
         UPDATE_CACHE(re, &s->gb);
     }
 end:
@@ -490,8 +497,8 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
     dc  = s->last_dc[component];
     dc += diff;
     s->last_dc[component] = dc;
-    block[0] = dc << (3 - s->intra_dc_precision);
-    ff_dlog(s->avctx, "dc=%d\n", block[0]);
+    block[0] = dc * (1 << (3 - s->intra_dc_precision));
+    ff_tlog(s->avctx, "dc=%d\n", block[0]);
     mismatch = block[0] ^ 1;
     i = 0;
     if (s->intra_vlc_format)
@@ -550,6 +557,11 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
                                                 int16_t *block, int n)
 {
@@ -589,12 +601,10 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0],
                        TEX_VLC_BITS, 2, 0);
 
-            if (level == 127) {
+            if (level >= 64 || i > 63) {
                 break;
             } else if (level != 0) {
                 i += run;
-                if (i > MAX_INDEX)
-                    break;
                 j = scantable[i];
                 level = (level * qscale * quant_matrix[j]) >> 4;
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) -
@@ -608,8 +618,6 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
                 level = SHOW_SBITS(re, &s->gb, 12);
                 SKIP_BITS(re, &s->gb, 12);
                 i += run;
-                if (i > MAX_INDEX)
-                    break;
                 j = scantable[i];
                 if (level < 0) {
                     level = (-level * qscale * quant_matrix[j]) >> 4;
@@ -641,15 +649,6 @@ static inline int get_dmv(MpegEncContext *s)
         return 0;
 }
 
-static inline int get_qscale(MpegEncContext *s)
-{
-    int qscale = get_bits(&s->gb, 5);
-    if (s->q_scale_type)
-        return non_linear_qscale[qscale];
-    else
-        return qscale << 1;
-}
-
 /* motion type (for MPEG-2) */
 #define MT_FIELD 1
 #define MT_FRAME 2
@@ -662,9 +661,9 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
     const int mb_block_count = 4 + (1 << s->chroma_format);
     int ret;
 
-    ff_dlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
+    ff_tlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
 
-    assert(s->mb_skipped == 0);
+    av_assert2(s->mb_skipped == 0);
 
     if (s->mb_skip_run-- != 0) {
         if (s->pict_type == AV_PICTURE_TYPE_P) {
@@ -679,11 +678,12 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             else
                 // FIXME not sure if this is allowed in MPEG at all
                 mb_type = s->current_picture.mb_type[s->mb_width + (s->mb_y - 1) * s->mb_stride - 1];
-            if (IS_INTRA(mb_type))
+            if (IS_INTRA(mb_type)) {
+                av_log(s->avctx, AV_LOG_ERROR, "skip with previntra\n");
                 return AVERROR_INVALIDDATA;
+            }
             s->current_picture.mb_type[s->mb_x + s->mb_y * s->mb_stride] =
                 mb_type | MB_TYPE_SKIP;
-//            assert(s->current_picture.mb_type[s->mb_x + s->mb_y * s->mb_stride - 1] & (MB_TYPE_16x16 | MB_TYPE_16x8));
 
             if ((s->mv[0][0][0] | s->mv[0][0][1] | s->mv[1][0][0] | s->mv[1][0][1]) == 0)
                 s->mb_skipped = 1;
@@ -726,7 +726,7 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
         mb_type = btype2mb_type[mb_type];
         break;
     }
-    ff_dlog(s->avctx, "mb_type=%x\n", mb_type);
+    ff_tlog(s->avctx, "mb_type=%x\n", mb_type);
 //    motion_type = 0; /* avoid warning */
     if (IS_INTRA(mb_type)) {
         s->bdsp.clear_blocks(s->block[0]);
@@ -741,7 +741,7 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             s->interlaced_dct = get_bits1(&s->gb);
 
         if (IS_QUANT(mb_type))
-            s->qscale = get_qscale(s);
+            s->qscale = mpeg_get_qscale(s);
 
         if (s->concealment_motion_vectors) {
             /* just parse them */
@@ -757,12 +757,15 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             s->last_mv[0][1][1] = mpeg_decode_motion(s, s->mpeg_f_code[0][1],
                                                      s->last_mv[0][0][1]);
 
-            skip_bits1(&s->gb); /* marker */
+            check_marker(s->avctx, &s->gb, "after concealment_motion_vectors");
         } else {
             /* reset mv prediction */
             memset(s->last_mv, 0, sizeof(s->last_mv));
         }
         s->mb_intra = 1;
+        // if 1, we memcpy blocks in xvmcvideo
+        if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
+            ff_xvmc_pack_pblocks(s, -1); // inter are always full blocks
 
         if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
             if (s->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
@@ -791,11 +794,12 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
         }
     } else {
         if (mb_type & MB_TYPE_ZERO_MV) {
-            assert(mb_type & MB_TYPE_CBP);
+            av_assert2(mb_type & MB_TYPE_CBP);
 
             s->mv_dir = MV_DIR_FORWARD;
             if (s->picture_structure == PICT_FRAME) {
-                if (!s->frame_pred_frame_dct)
+                if (s->picture_structure == PICT_FRAME
+                    && !s->frame_pred_frame_dct)
                     s->interlaced_dct = get_bits1(&s->gb);
                 s->mv_type = MV_TYPE_16X16;
             } else {
@@ -805,7 +809,7 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             }
 
             if (IS_QUANT(mb_type))
-                s->qscale = get_qscale(s);
+                s->qscale = mpeg_get_qscale(s);
 
             s->last_mv[0][0][0] = 0;
             s->last_mv[0][0][1] = 0;
@@ -814,10 +818,10 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             s->mv[0][0][0]      = 0;
             s->mv[0][0][1]      = 0;
         } else {
-            assert(mb_type & MB_TYPE_L0L1);
+            av_assert2(mb_type & MB_TYPE_L0L1);
             // FIXME decide if MBs in field pictures are MB_TYPE_INTERLACED
             /* get additional motion vector type */
-            if (s->frame_pred_frame_dct) {
+            if (s->picture_structure == PICT_FRAME && s->frame_pred_frame_dct) {
                 motion_type = MT_FRAME;
             } else {
                 motion_type = get_bits(&s->gb, 2);
@@ -826,11 +830,11 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             }
 
             if (IS_QUANT(mb_type))
-                s->qscale = get_qscale(s);
+                s->qscale = mpeg_get_qscale(s);
 
             /* motion vectors */
             s->mv_dir = (mb_type >> 13) & 3;
-            ff_dlog(s->avctx, "motion_type=%d\n", motion_type);
+            ff_tlog(s->avctx, "motion_type=%d\n", motion_type);
             switch (motion_type) {
             case MT_FRAME: /* or MT_16X8 */
                 if (s->picture_structure == PICT_FRAME) {
@@ -851,8 +855,8 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
                                                    s->last_mv[i][0][1]);
                             /* full_pel: only for MPEG-1 */
                             if (s->full_pel[i]) {
-                                s->mv[i][0][0] <<= 1;
-                                s->mv[i][0][1] <<= 1;
+                                s->mv[i][0][0] *= 2;
+                                s->mv[i][0][1] *= 2;
                             }
                         }
                     }
@@ -887,16 +891,17 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
                                                          s->last_mv[i][j][0]);
                                 s->last_mv[i][j][0] = val;
                                 s->mv[i][j][0]      = val;
-                                ff_dlog(s->avctx, "fmx=%d\n", val);
+                                ff_tlog(s->avctx, "fmx=%d\n", val);
                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
                                                          s->last_mv[i][j][1] >> 1);
-                                s->last_mv[i][j][1] = val << 1;
+                                s->last_mv[i][j][1] = 2 * val;
                                 s->mv[i][j][1]      = val;
-                                ff_dlog(s->avctx, "fmy=%d\n", val);
+                                ff_tlog(s->avctx, "fmy=%d\n", val);
                             }
                         }
                     }
                 } else {
+                    av_assert0(!s->progressive_sequence);
                     mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
                     for (i = 0; i < 2; i++) {
                         if (USES_LIST(mb_type, i)) {
@@ -913,6 +918,10 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
                 }
                 break;
             case MT_DMV:
+                if (s->progressive_sequence){
+                    av_log(s->avctx, AV_LOG_ERROR, "MT_DMV in progressive_sequence\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 s->mv_type = MV_TYPE_DMV;
                 for (i = 0; i < 2; i++) {
                     if (USES_LIST(mb_type, i)) {
@@ -929,8 +938,8 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
                         dmy = get_dmv(s);
 
 
-                        s->last_mv[i][0][1] = my << my_shift;
-                        s->last_mv[i][1][1] = my << my_shift;
+                        s->last_mv[i][0][1] = my * (1 << my_shift);
+                        s->last_mv[i][1][1] = my * (1 << my_shift);
 
                         s->mv[i][0][0] = mx;
                         s->mv[i][0][1] = my;
@@ -975,16 +984,20 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
 
             cbp = get_vlc2(&s->gb, ff_mb_pat_vlc.table, MB_PAT_VLC_BITS, 1);
             if (mb_block_count > 6) {
-                cbp <<= mb_block_count - 6;
+                cbp *= 1 << mb_block_count - 6;
                 cbp  |= get_bits(&s->gb, mb_block_count - 6);
                 s->bdsp.clear_blocks(s->block[6]);
             }
             if (cbp <= 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "invalid cbp at %d %d\n", s->mb_x, s->mb_y);
+                       "invalid cbp %d at %d %d\n", cbp, s->mb_x, s->mb_y);
                 return AVERROR_INVALIDDATA;
             }
 
+            // if 1, we memcpy blocks in xvmcvideo
+            if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
+                ff_xvmc_pack_pblocks(s, cbp);
+
             if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
                 if (s->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
                     for (i = 0; i < 6; i++) {
@@ -1046,6 +1059,11 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
 
     ff_mpv_decode_defaults(s2);
 
+    if (   avctx->codec_tag != AV_RL32("VCR2")
+        && avctx->codec_tag != AV_RL32("BW10"))
+        avctx->coded_width = avctx->coded_height = 0; // do not trust dimensions from input
+    ff_mpv_decode_init(s2, avctx);
+
     s->mpeg_enc_ctx.avctx  = avctx;
 
     /* we need some permutation to store matrices,
@@ -1054,18 +1072,16 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
     ff_mpeg12_common_init(&s->mpeg_enc_ctx);
     ff_mpeg12_init_vlcs();
 
+    s2->chroma_format              = 1;
     s->mpeg_enc_ctx_allocated      = 0;
     s->mpeg_enc_ctx.picture_number = 0;
     s->repeat_field                = 0;
     s->mpeg_enc_ctx.codec_id       = avctx->codec->id;
     avctx->color_range             = AVCOL_RANGE_MPEG;
-    if (avctx->codec->id == AV_CODEC_ID_MPEG1VIDEO)
-        avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
-    else
-        avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
     return 0;
 }
 
+#if HAVE_THREADS
 static int mpeg_decode_update_thread_context(AVCodecContext *avctx,
                                              const AVCodecContext *avctx_from)
 {
@@ -1082,17 +1098,15 @@ static int mpeg_decode_update_thread_context(AVCodecContext *avctx,
     if (err)
         return err;
 
-    if (!ctx->mpeg_enc_ctx_allocated) {
-        // copy the whole context after the initial MpegEncContext structure
-        memcpy(ctx, ctx_from, sizeof(*ctx));
-        memset(&ctx->mpeg_enc_ctx, 0, sizeof(ctx->mpeg_enc_ctx));
-    }
+    if (!ctx->mpeg_enc_ctx_allocated)
+        memcpy(s + 1, s1 + 1, sizeof(Mpeg1Context) - sizeof(MpegEncContext));
 
     if (!(s->pict_type == AV_PICTURE_TYPE_B || s->low_delay))
         s->picture_number++;
 
     return 0;
 }
+#endif
 
 static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
                                  const uint8_t *new_perm)
@@ -1106,7 +1120,30 @@ static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
         matrix[new_perm[i]] = temp_matrix[old_perm[i]];
 }
 
-static const enum AVPixelFormat mpeg12_hwaccel_pixfmt_list_420[] = {
+static const enum AVPixelFormat mpeg1_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_MPEG1_NVDEC_HWACCEL
+    AV_PIX_FMT_CUDA,
+#endif
+#if CONFIG_MPEG1_XVMC_HWACCEL
+    AV_PIX_FMT_XVMC,
+#endif
+#if CONFIG_MPEG1_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_MPEG2_NVDEC_HWACCEL
+    AV_PIX_FMT_CUDA,
+#endif
+#if CONFIG_MPEG2_XVMC_HWACCEL
+    AV_PIX_FMT_XVMC,
+#endif
+#if CONFIG_MPEG2_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
 #if CONFIG_MPEG2_DXVA2_HWACCEL
     AV_PIX_FMT_DXVA2_VLD,
 #endif
@@ -1117,8 +1154,8 @@ static const enum AVPixelFormat mpeg12_hwaccel_pixfmt_list_420[] = {
 #if CONFIG_MPEG2_VAAPI_HWACCEL
     AV_PIX_FMT_VAAPI,
 #endif
-#if CONFIG_MPEG1_VDPAU_HWACCEL | CONFIG_MPEG2_VDPAU_HWACCEL
-    AV_PIX_FMT_VDPAU,
+#if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL
+    AV_PIX_FMT_VIDEOTOOLBOX,
 #endif
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
@@ -1140,14 +1177,34 @@ static enum AVPixelFormat mpeg_get_pixelformat(AVCodecContext *avctx)
     MpegEncContext *s = &s1->mpeg_enc_ctx;
     const enum AVPixelFormat *pix_fmts;
 
+    if (CONFIG_GRAY && (avctx->flags & AV_CODEC_FLAG_GRAY))
+        return AV_PIX_FMT_GRAY8;
+
     if (s->chroma_format < 2)
-        pix_fmts = mpeg12_hwaccel_pixfmt_list_420;
+        pix_fmts = avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO ?
+                                mpeg1_hwaccel_pixfmt_list_420 :
+                                mpeg2_hwaccel_pixfmt_list_420;
     else if (s->chroma_format == 2)
         pix_fmts = mpeg12_pixfmt_list_422;
     else
         pix_fmts = mpeg12_pixfmt_list_444;
 
-    return ff_get_format(avctx, pix_fmts);
+    return ff_thread_get_format(avctx, pix_fmts);
+}
+
+static void setup_hwaccel_for_pixfmt(AVCodecContext *avctx)
+{
+    // until then pix_fmt may be changed right after codec init
+    if (avctx->hwaccel)
+        if (avctx->idct_algo == FF_IDCT_AUTO)
+            avctx->idct_algo = FF_IDCT_NONE;
+
+    if (avctx->hwaccel && avctx->pix_fmt == AV_PIX_FMT_XVMC) {
+        Mpeg1Context *s1 = avctx->priv_data;
+        MpegEncContext *s = &s1->mpeg_enc_ctx;
+
+        s->pack_pblocks = 1;
+    }
 }
 
 /* Call this function when we know all parameters.
@@ -1159,27 +1216,84 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
     uint8_t old_permutation[64];
     int ret;
 
+    if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
+        // MPEG-1 aspect
+        AVRational aspect_inv = av_d2q(ff_mpeg1_aspect[s->aspect_ratio_info], 255);
+        avctx->sample_aspect_ratio = (AVRational) { aspect_inv.den, aspect_inv.num };
+    } else { // MPEG-2
+        // MPEG-2 aspect
+        if (s->aspect_ratio_info > 1) {
+            AVRational dar =
+                av_mul_q(av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                                  (AVRational) { s1->pan_scan.width,
+                                                 s1->pan_scan.height }),
+                         (AVRational) { s->width, s->height });
+
+            /* We ignore the spec here and guess a bit as reality does not
+             * match the spec, see for example res_change_ffmpeg_aspect.ts
+             * and sequence-display-aspect.mpg.
+             * issue1613, 621, 562 */
+            if ((s1->pan_scan.width == 0) || (s1->pan_scan.height == 0) ||
+                (av_cmp_q(dar, (AVRational) { 4, 3 }) &&
+                 av_cmp_q(dar, (AVRational) { 16, 9 }))) {
+                s->avctx->sample_aspect_ratio =
+                    av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                             (AVRational) { s->width, s->height });
+            } else {
+                s->avctx->sample_aspect_ratio =
+                    av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                             (AVRational) { s1->pan_scan.width, s1->pan_scan.height });
+// issue1613 4/3 16/9 -> 16/9
+// res_change_ffmpeg_aspect.ts 4/3 225/44 ->4/3
+// widescreen-issue562.mpg 4/3 16/9 -> 16/9
+//                s->avctx->sample_aspect_ratio = av_mul_q(s->avctx->sample_aspect_ratio, (AVRational) {s->width, s->height});
+                ff_dlog(avctx, "aspect A %d/%d\n",
+                        ff_mpeg2_aspect[s->aspect_ratio_info].num,
+                        ff_mpeg2_aspect[s->aspect_ratio_info].den);
+                ff_dlog(avctx, "aspect B %d/%d\n", s->avctx->sample_aspect_ratio.num,
+                        s->avctx->sample_aspect_ratio.den);
+            }
+        } else {
+            s->avctx->sample_aspect_ratio =
+                ff_mpeg2_aspect[s->aspect_ratio_info];
+        }
+    } // MPEG-2
+
+    if (av_image_check_sar(s->width, s->height,
+                           avctx->sample_aspect_ratio) < 0) {
+        av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
+                avctx->sample_aspect_ratio.num,
+                avctx->sample_aspect_ratio.den);
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+    }
+
     if ((s1->mpeg_enc_ctx_allocated == 0)                   ||
         avctx->coded_width       != s->width                ||
         avctx->coded_height      != s->height               ||
         s1->save_width           != s->width                ||
         s1->save_height          != s->height               ||
-        s1->save_aspect_info     != s->aspect_ratio_info    ||
-        s1->save_progressive_seq != s->progressive_sequence ||
+        av_cmp_q(s1->save_aspect, s->avctx->sample_aspect_ratio) ||
+        (s1->save_progressive_seq != s->progressive_sequence && FFALIGN(s->height, 16) != FFALIGN(s->height, 32)) ||
         0) {
         if (s1->mpeg_enc_ctx_allocated) {
             ParseContext pc = s->parse_context;
             s->parse_context.buffer = 0;
             ff_mpv_common_end(s);
             s->parse_context = pc;
+            s1->mpeg_enc_ctx_allocated = 0;
         }
 
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
 
-        avctx->bit_rate          = s->bit_rate;
-        s1->save_aspect_info     = s->aspect_ratio_info;
+        if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->bit_rate) {
+            avctx->rc_max_rate = s->bit_rate;
+        } else if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && s->bit_rate &&
+                   (s->bit_rate != 0x3FFFF*400 || s->vbv_delay != 0xFFFF)) {
+            avctx->bit_rate = s->bit_rate;
+        }
+        s1->save_aspect          = s->avctx->sample_aspect_ratio;
         s1->save_width           = s->width;
         s1->save_height          = s->height;
         s1->save_progressive_seq = s->progressive_sequence;
@@ -1191,61 +1305,28 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
         if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
             // MPEG-1 fps
             avctx->framerate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
-            // MPEG-1 aspect
-            avctx->sample_aspect_ratio = av_d2q(1.0 / ff_mpeg1_aspect[s->aspect_ratio_info], 255);
             avctx->ticks_per_frame     = 1;
+
+            avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
         } else { // MPEG-2
             // MPEG-2 fps
             av_reduce(&s->avctx->framerate.num,
                       &s->avctx->framerate.den,
-                      ff_mpeg12_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num * 2,
+                      ff_mpeg12_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num,
                       ff_mpeg12_frame_rate_tab[s->frame_rate_index].den * s1->frame_rate_ext.den,
                       1 << 30);
             avctx->ticks_per_frame = 2;
-            // MPEG-2 aspect
-            if (s->aspect_ratio_info > 1) {
-                AVRational dar =
-                    av_mul_q(av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                      (AVRational) { s1->pan_scan.width,
-                                                     s1->pan_scan.height }),
-                             (AVRational) { s->width, s->height });
 
-                /* We ignore the spec here and guess a bit as reality does not
-                 * match the spec, see for example res_change_ffmpeg_aspect.ts
-                 * and sequence-display-aspect.mpg.
-                 * issue1613, 621, 562 */
-                if ((s1->pan_scan.width == 0) || (s1->pan_scan.height == 0) ||
-                    (av_cmp_q(dar, (AVRational) { 4, 3 }) &&
-                     av_cmp_q(dar, (AVRational) { 16, 9 }))) {
-                    s->avctx->sample_aspect_ratio =
-                        av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                 (AVRational) { s->width, s->height });
-                } else {
-                    s->avctx->sample_aspect_ratio =
-                        av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                 (AVRational) { s1->pan_scan.width, s1->pan_scan.height });
-// issue1613 4/3 16/9 -> 16/9
-// res_change_ffmpeg_aspect.ts 4/3 225/44 ->4/3
-// widescreen-issue562.mpg 4/3 16/9 -> 16/9
-//                    s->avctx->sample_aspect_ratio = av_mul_q(s->avctx->sample_aspect_ratio, (AVRational) {s->width, s->height});
-                    ff_dlog(avctx, "A %d/%d\n",
-                            ff_mpeg2_aspect[s->aspect_ratio_info].num,
-                            ff_mpeg2_aspect[s->aspect_ratio_info].den);
-                    ff_dlog(avctx, "B %d/%d\n", s->avctx->sample_aspect_ratio.num,
-                            s->avctx->sample_aspect_ratio.den);
-                }
-            } else {
-                s->avctx->sample_aspect_ratio =
-                    ff_mpeg2_aspect[s->aspect_ratio_info];
+            switch (s->chroma_format) {
+            case 1: avctx->chroma_sample_location = AVCHROMA_LOC_LEFT; break;
+            case 2:
+            case 3: avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; break;
+            default: av_assert0(0);
             }
         } // MPEG-2
 
-        ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
-
         avctx->pix_fmt = mpeg_get_pixelformat(avctx);
-        // until then pix_fmt may be changed right after codec init
-        if (avctx->hwaccel && avctx->idct_algo == FF_IDCT_AUTO)
-            avctx->idct_algo = FF_IDCT_SIMPLE;
+        setup_hwaccel_for_pixfmt(avctx);
 
         /* Quantization matrices may need reordering
          * if DCT permutation is changed. */
@@ -1280,20 +1361,23 @@ static int mpeg1_decode_picture(AVCodecContext *avctx, const uint8_t *buf,
         return AVERROR_INVALIDDATA;
 
     vbv_delay = get_bits(&s->gb, 16);
+    s->vbv_delay = vbv_delay;
     if (s->pict_type == AV_PICTURE_TYPE_P ||
         s->pict_type == AV_PICTURE_TYPE_B) {
         s->full_pel[0] = get_bits1(&s->gb);
         f_code = get_bits(&s->gb, 3);
-        if (f_code == 0 && (avctx->err_recognition & AV_EF_BITSTREAM))
+        if (f_code == 0 && (avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)))
             return AVERROR_INVALIDDATA;
+        f_code += !f_code;
         s->mpeg_f_code[0][0] = f_code;
         s->mpeg_f_code[0][1] = f_code;
     }
     if (s->pict_type == AV_PICTURE_TYPE_B) {
         s->full_pel[1] = get_bits1(&s->gb);
         f_code = get_bits(&s->gb, 3);
-        if (f_code == 0 && (avctx->err_recognition & AV_EF_BITSTREAM))
+        if (f_code == 0 && (avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)))
             return AVERROR_INVALIDDATA;
+        f_code += !f_code;
         s->mpeg_f_code[1][0] = f_code;
         s->mpeg_f_code[1][1] = f_code;
     }
@@ -1320,22 +1404,19 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
     s->avctx->level         = get_bits(&s->gb, 4);
     s->progressive_sequence = get_bits1(&s->gb);   /* progressive_sequence */
     s->chroma_format        = get_bits(&s->gb, 2); /* chroma_format 1=420, 2=422, 3=444 */
+
+    if (!s->chroma_format) {
+        s->chroma_format = 1;
+        av_log(s->avctx, AV_LOG_WARNING, "Chroma format invalid\n");
+    }
+
     horiz_size_ext          = get_bits(&s->gb, 2);
     vert_size_ext           = get_bits(&s->gb, 2);
     s->width  |= (horiz_size_ext << 12);
     s->height |= (vert_size_ext  << 12);
-
-    bit_rate_ext = get_bits(&s->gb, 12) << 18;
-    if (bit_rate_ext < INT_MAX / 400 &&
-        bit_rate_ext * 400 < INT_MAX - s->bit_rate) {
-        s->bit_rate += bit_rate_ext * 400;
-    } else {
-        av_log(s->avctx, AV_LOG_WARNING, "Invalid bit rate extension value: %d\n",
-               bit_rate_ext >> 18);
-        s->bit_rate = 0;
-    }
-
-    skip_bits1(&s->gb); /* marker */
+    bit_rate_ext = get_bits(&s->gb, 12);  /* XXX: handle it */
+    s->bit_rate += (bit_rate_ext << 18) * 400LL;
+    check_marker(s->avctx, &s->gb, "after bit rate extension");
     s->avctx->rc_buffer_size += get_bits(&s->gb, 8) * 1024 * 16 << 10;
 
     s->low_delay = get_bits1(&s->gb);
@@ -1350,8 +1431,8 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG,
-               "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n",
-               s->avctx->profile, s->avctx->level,
+               "profile: %d, level: %d ps: %d cf:%d vbv buffer: %d, bitrate:%"PRId64"\n",
+               s->avctx->profile, s->avctx->level, s->progressive_sequence, s->chroma_format,
                s->avctx->rc_buffer_size, s->bit_rate);
 }
 
@@ -1426,7 +1507,7 @@ static int load_matrix(MpegEncContext *s, uint16_t matrix0[64],
             return AVERROR_INVALIDDATA;
         }
         if (intra && i == 0 && v != 8) {
-            av_log(s->avctx, AV_LOG_ERROR, "intra matrix invalid, ignoring\n");
+            av_log(s->avctx, AV_LOG_DEBUG, "intra matrix specifies invalid DC quantizer %d, ignoring\n", v);
             v = 8; // needed by pink.mpg / issue1046
         }
         matrix0[j] = v;
@@ -1472,6 +1553,11 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
         s->current_picture.f->pict_type = s->pict_type;
         s->current_picture.f->key_frame = s->pict_type == AV_PICTURE_TYPE_I;
     }
+    s->mpeg_f_code[0][0] += !s->mpeg_f_code[0][0];
+    s->mpeg_f_code[0][1] += !s->mpeg_f_code[0][1];
+    s->mpeg_f_code[1][0] += !s->mpeg_f_code[1][0];
+    s->mpeg_f_code[1][1] += !s->mpeg_f_code[1][1];
+
     s->intra_dc_precision         = get_bits(&s->gb, 2);
     s->picture_structure          = get_bits(&s->gb, 2);
     s->top_field_first            = get_bits1(&s->gb);
@@ -1484,30 +1570,6 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
     s->chroma_420_type            = get_bits1(&s->gb);
     s->progressive_frame          = get_bits1(&s->gb);
 
-    if (s->progressive_sequence && !s->progressive_frame) {
-        s->progressive_frame = 1;
-        av_log(s->avctx, AV_LOG_ERROR,
-               "interlaced frame in progressive sequence, ignoring\n");
-    }
-
-    if (s->picture_structure == 0 ||
-        (s->progressive_frame && s->picture_structure != PICT_FRAME)) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "picture_structure %d invalid, ignoring\n",
-               s->picture_structure);
-        s->picture_structure = PICT_FRAME;
-    }
-
-    if (s->progressive_sequence && !s->frame_pred_frame_dct)
-        av_log(s->avctx, AV_LOG_WARNING, "invalid frame_pred_frame_dct\n");
-
-    if (s->picture_structure == PICT_FRAME) {
-        s->v_edge_pos  = 16 * s->mb_height;
-    } else {
-        s->v_edge_pos   = 8 * s->mb_height;
-        memset(s->mbskip_table, 0, s->mb_stride * s->mb_height);
-    }
-
     if (s->alternate_scan) {
         ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan);
         ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan);
@@ -1534,11 +1596,6 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
     Mpeg1Context *s1      = (Mpeg1Context *) s;
     int ret;
 
-    if (s->picture_structure == PICT_FRAME)
-        s->first_field = 0;
-    else
-        s->first_field ^= 1;
-
     /* start frame decoding */
     if (s->first_field || s->picture_structure == PICT_FRAME) {
         AVFrameSideData *pan_scan;
@@ -1609,9 +1666,11 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
 
         if (s->avctx->hwaccel &&
             (s->avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD)) {
-            if (s->avctx->hwaccel->end_frame(s->avctx) < 0)
+            if ((ret = s->avctx->hwaccel->end_frame(s->avctx)) < 0) {
                 av_log(avctx, AV_LOG_ERROR,
                        "hardware accelerator failed to decode first field\n");
+                return ret;
+            }
         }
 
         for (i = 0; i < 4; i++) {
@@ -1643,20 +1702,23 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
                              const uint8_t **buf, int buf_size)
 {
     AVCodecContext *avctx = s->avctx;
+    const int lowres      = s->avctx->lowres;
     const int field_pic   = s->picture_structure != PICT_FRAME;
     int ret;
 
     s->resync_mb_x =
     s->resync_mb_y = -1;
 
-    assert(mb_y < s->mb_height);
+    av_assert0(mb_y < s->mb_height);
 
     init_get_bits(&s->gb, *buf, buf_size * 8);
+    if (s->codec_id != AV_CODEC_ID_MPEG1VIDEO && s->mb_height > 2800/16)
+        skip_bits(&s->gb, 3);
 
     ff_mpeg1_clean_buffers(s);
     s->interlaced_dct = 0;
 
-    s->qscale = get_qscale(s);
+    s->qscale = mpeg_get_qscale(s);
 
     if (s->qscale == 0) {
         av_log(s->avctx, AV_LOG_ERROR, "qscale == 0\n");
@@ -1664,8 +1726,8 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
     }
 
     /* extra slice info */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     s->mb_x = 0;
 
@@ -1695,7 +1757,7 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
         return AVERROR_INVALIDDATA;
     }
 
-    if (avctx->hwaccel) {
+    if (avctx->hwaccel && avctx->hwaccel->decode_slice) {
         const uint8_t *buf_end, *buf_start = *buf - 4; /* include start_code */
         int start_code = -1;
         buf_end = avpriv_find_start_code(buf_start + 2, *buf + buf_size, &start_code);
@@ -1735,6 +1797,10 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
     }
 
     for (;;) {
+        // If 1, we memcpy blocks in xvmcvideo.
+        if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
+            ff_xvmc_init_block(s); // set s->block
+
         if ((ret = mpeg_decode_mb(s, s->block)) < 0)
             return ret;
 
@@ -1765,22 +1831,23 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
                     s->current_picture.motion_val[dir][xy + 1][1] = motion_y;
                     s->current_picture.ref_index [dir][b8_xy]     =
                     s->current_picture.ref_index [dir][b8_xy + 1] = s->field_select[dir][i];
-                    assert(s->field_select[dir][i] == 0 ||
-                           s->field_select[dir][i] == 1);
+                    av_assert2(s->field_select[dir][i] == 0 ||
+                               s->field_select[dir][i] == 1);
                 }
                 xy    += wrap;
                 b8_xy += 2;
             }
         }
 
-        s->dest[0] += 16;
-        s->dest[1] += 16 >> s->chroma_x_shift;
-        s->dest[2] += 16 >> s->chroma_x_shift;
+        s->dest[0] += 16 >> lowres;
+        s->dest[1] +=(16 >> lowres) >> s->chroma_x_shift;
+        s->dest[2] +=(16 >> lowres) >> s->chroma_x_shift;
 
-        ff_mpv_decode_mb(s, s->block);
+        ff_mpv_reconstruct_mb(s, s->block);
 
         if (++s->mb_x >= s->mb_width) {
-            const int mb_size = 16;
+            const int mb_size = 16 >> s->avctx->lowres;
+            int left;
 
             ff_mpeg_draw_horiz_band(s, mb_size * (s->mb_y >> field_pic), mb_size);
             ff_mpv_report_decode_progress(s);
@@ -1798,15 +1865,40 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
                              s->progressive_frame == 0
                              /* vbv_delay == 0xBBB || 0xE10 */;
 
+                if (left >= 32 && !is_d10) {
+                    GetBitContext gb = s->gb;
+                    align_get_bits(&gb);
+                    if (show_bits(&gb, 24) == 0x060E2B) {
+                        av_log(avctx, AV_LOG_DEBUG, "Invalid MXF data found in video stream\n");
+                        is_d10 = 1;
+                    }
+                    if (left > 32 && show_bits_long(&gb, 32) == 0x201) {
+                        av_log(avctx, AV_LOG_DEBUG, "skipping m704 alpha (unsupported)\n");
+                        goto eos;
+                    }
+                }
+
                 if (left < 0 ||
                     (left && show_bits(&s->gb, FFMIN(left, 23)) && !is_d10) ||
-                    ((avctx->err_recognition & AV_EF_BUFFER) && left > 8)) {
-                    av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d %0X\n",
-                           left, show_bits(&s->gb, FFMIN(left, 23)));
+                    ((avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_AGGRESSIVE)) && left > 8)) {
+                    av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d %0X at %d %d\n",
+                           left, left>0 ? show_bits(&s->gb, FFMIN(left, 23)) : 0, s->mb_x, s->mb_y);
                     return AVERROR_INVALIDDATA;
                 } else
                     goto eos;
             }
+            // There are some files out there which are missing the last slice
+            // in cases where the slice is completely outside the visible
+            // area, we detect this here instead of running into the end expecting
+            // more data
+            left = get_bits_left(&s->gb);
+            if (s->mb_y >= ((s->height + 15) >> 4) &&
+                !s->progressive_sequence &&
+                left <= 25 &&
+                left >= 0 &&
+                s->mb_skip_run == -1 &&
+                (!left || show_bits(&s->gb, left) == 0))
+                goto eos;
 
             ff_init_block_index(s);
         }
@@ -1867,13 +1959,19 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
                     s->mv[0][0][1] = s->last_mv[0][0][1];
                     s->mv[1][0][0] = s->last_mv[1][0][0];
                     s->mv[1][0][1] = s->last_mv[1][0][1];
+                    s->field_select[0][0] = (s->picture_structure - 1) & 1;
+                    s->field_select[1][0] = (s->picture_structure - 1) & 1;
                 }
             }
         }
     }
 eos: // end of slice
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s, AV_LOG_ERROR, "overread %d\n", -get_bits_left(&s->gb));
+        return AVERROR_INVALIDDATA;
+    }
     *buf += (get_bits_count(&s->gb) - 1) / 8;
-    ff_dlog(s, "y %d %d %d %d\n", s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y);
+    ff_dlog(s, "Slice start:%d %d  end:%d %d\n", s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y);
     return 0;
 }
 
@@ -1913,7 +2011,10 @@ static int slice_decode_thread(AVCodecContext *c, void *arg)
 
         start_code = -1;
         buf        = avpriv_find_start_code(buf, s->gb.buffer_end, &start_code);
-        mb_y       = (start_code - SLICE_MIN_START_CODE) << field_pic;
+        mb_y       = start_code - SLICE_MIN_START_CODE;
+        if (s->codec_id != AV_CODEC_ID_MPEG1VIDEO && s->mb_height > 2800/16)
+            mb_y += (*buf&0xE0)<<2;
+        mb_y <<= field_pic;
         if (s->picture_structure == PICT_BOTTOM_FIELD)
             mb_y++;
         if (mb_y < 0 || mb_y >= s->end_mb_y)
@@ -1934,13 +2035,16 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
         return 0;
 
     if (s->avctx->hwaccel) {
-        if (s->avctx->hwaccel->end_frame(s->avctx) < 0)
+        int ret = s->avctx->hwaccel->end_frame(s->avctx);
+        if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
+            return ret;
+        }
     }
 
     /* end of slice reached */
-    if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field) {
+    if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field && !s1->first_slice) {
         /* end of image */
 
         ff_er_frame_end(&s->er);
@@ -1951,7 +2055,8 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
             int ret = av_frame_ref(pict, s->current_picture_ptr->f);
             if (ret < 0)
                 return ret;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG2);
         } else {
             if (avctx->active_thread_type & FF_THREAD_FRAME)
                 s->picture_number++;
@@ -1961,7 +2066,8 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
                 int ret = av_frame_ref(pict, s->last_picture_ptr->f);
                 if (ret < 0)
                     return ret;
-                ff_print_debug_info(s, s->last_picture_ptr);
+                ff_print_debug_info(s, s->last_picture_ptr, pict);
+                ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG2);
             }
         }
 
@@ -1986,28 +2092,25 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     if (width == 0 || height == 0) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid horizontal or vertical size value.\n");
-        if (avctx->err_recognition & AV_EF_BITSTREAM)
+        if (avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return AVERROR_INVALIDDATA;
     }
     s->aspect_ratio_info = get_bits(&s->gb, 4);
     if (s->aspect_ratio_info == 0) {
         av_log(avctx, AV_LOG_ERROR, "aspect ratio has forbidden 0 value\n");
-        if (avctx->err_recognition & AV_EF_BITSTREAM)
+        if (avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return AVERROR_INVALIDDATA;
     }
     s->frame_rate_index = get_bits(&s->gb, 4);
     if (s->frame_rate_index == 0 || s->frame_rate_index > 13) {
         av_log(avctx, AV_LOG_WARNING,
                "frame_rate_index %d is invalid\n", s->frame_rate_index);
-        return AVERROR_INVALIDDATA;
+        s->frame_rate_index = 1;
     }
-    s->bit_rate = get_bits(&s->gb, 18) * 400;
-    if (get_bits1(&s->gb) == 0) { /* marker */
-        av_log(avctx, AV_LOG_ERROR, "Marker in sequence header missing\n");
+    s->bit_rate = get_bits(&s->gb, 18) * 400LL;
+    if (check_marker(s->avctx, &s->gb, "in sequence header") == 0) {
         return AVERROR_INVALIDDATA;
     }
-    s->width  = width;
-    s->height = height;
 
     s->avctx->rc_buffer_size = get_bits(&s->gb, 10) * 1024 * 16;
     skip_bits(&s->gb, 1);
@@ -2039,21 +2142,26 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    s->width  = width;
+    s->height = height;
+
     /* We set MPEG-2 parameters so that it emulates MPEG-1. */
     s->progressive_sequence = 1;
     s->progressive_frame    = 1;
     s->picture_structure    = PICT_FRAME;
+    s->first_field          = 0;
     s->frame_pred_frame_dct = 1;
     s->chroma_format        = 1;
     s->codec_id             =
     s->avctx->codec_id      = AV_CODEC_ID_MPEG1VIDEO;
     s->out_format           = FMT_MPEG1;
+    s->swap_uv              = 0; // AFAIK VCR2 does not have SEQ_HEADER
     if (s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)
         s->low_delay = 1;
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(s->avctx, AV_LOG_DEBUG, "vbv buffer: %d, bitrate:%d\n",
-               s->avctx->rc_buffer_size, s->bit_rate);
+        av_log(s->avctx, AV_LOG_DEBUG, "vbv buffer: %d, bitrate:%"PRId64", aspect_ratio_info: %d \n",
+               s->avctx->rc_buffer_size, s->bit_rate, s->aspect_ratio_info);
 
     return 0;
 }
@@ -2068,6 +2176,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->out_format = FMT_MPEG1;
     if (s1->mpeg_enc_ctx_allocated) {
         ff_mpv_common_end(s);
+        s1->mpeg_enc_ctx_allocated = 0;
     }
     s->width            = avctx->coded_width;
     s->height           = avctx->coded_height;
@@ -2075,9 +2184,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->low_delay        = 1;
 
     avctx->pix_fmt = mpeg_get_pixelformat(avctx);
-
-    if (avctx->hwaccel && avctx->idct_algo == FF_IDCT_AUTO)
-        avctx->idct_algo = FF_IDCT_SIMPLE;
+    setup_hwaccel_for_pixfmt(avctx);
 
     ff_mpv_idct_init(s);
     if ((ret = ff_mpv_common_init(s)) < 0)
@@ -2098,9 +2205,15 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->progressive_sequence  = 1;
     s->progressive_frame     = 1;
     s->picture_structure     = PICT_FRAME;
+    s->first_field           = 0;
     s->frame_pred_frame_dct  = 1;
     s->chroma_format         = 1;
-    s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+    if (s->codec_tag == AV_RL32("BW10")) {
+        s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG1VIDEO;
+    } else {
+        s->swap_uv = 1; // in case of xvmc we need to swap uv for each MB
+        s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+    }
     s1->save_width           = s->width;
     s1->save_height          = s->height;
     s1->save_progressive_seq = s->progressive_sequence;
@@ -2121,13 +2234,82 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
             av_freep(&s1->a53_caption);
             s1->a53_caption_size = cc_count * 3;
             s1->a53_caption      = av_malloc(s1->a53_caption_size);
-            if (s1->a53_caption)
+            if (!s1->a53_caption) {
+                s1->a53_caption_size = 0;
+            } else {
                 memcpy(s1->a53_caption, p + 7, s1->a53_caption_size);
+            }
+            avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
+        }
+        return 1;
+    } else if (buf_size >= 2 &&
+               p[0] == 0x03 && (p[1]&0x7f) == 0x01) {
+        /* extract SCTE-20 CC data */
+        GetBitContext gb;
+        int cc_count = 0;
+        int i;
+
+        init_get_bits(&gb, p + 2, buf_size - 2);
+        cc_count = get_bits(&gb, 5);
+        if (cc_count > 0) {
+            av_freep(&s1->a53_caption);
+            s1->a53_caption_size = cc_count * 3;
+            s1->a53_caption      = av_mallocz(s1->a53_caption_size);
+            if (!s1->a53_caption) {
+                s1->a53_caption_size = 0;
+            } else {
+                uint8_t field, cc1, cc2;
+                uint8_t *cap = s1->a53_caption;
+                for (i = 0; i < cc_count && get_bits_left(&gb) >= 26; i++) {
+                    skip_bits(&gb, 2); // priority
+                    field = get_bits(&gb, 2);
+                    skip_bits(&gb, 5); // line_offset
+                    cc1 = get_bits(&gb, 8);
+                    cc2 = get_bits(&gb, 8);
+                    skip_bits(&gb, 1); // marker
+
+                    if (!field) { // forbidden
+                        cap[0] = cap[1] = cap[2] = 0x00;
+                    } else {
+                        field = (field == 2 ? 1 : 0);
+                        if (!s1->mpeg_enc_ctx.top_field_first) field = !field;
+                        cap[0] = 0x04 | field;
+                        cap[1] = ff_reverse[cc1];
+                        cap[2] = ff_reverse[cc2];
+                    }
+                    cap += 3;
+                }
+            }
+            avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
         }
         return 1;
     } else if (buf_size >= 11 &&
                p[0] == 'C' && p[1] == 'C' && p[2] == 0x01 && p[3] == 0xf8) {
-        /* extract DVD CC data */
+        /* extract DVD CC data
+         *
+         * uint32_t   user_data_start_code        0x000001B2    (big endian)
+         * uint16_t   user_identifier             0x4343 "CC"
+         * uint8_t    user_data_type_code         0x01
+         * uint8_t    caption_block_size          0xF8
+         * uint8_t
+         *   bit 7    caption_odd_field_first     1=odd field (CC1/CC2) first  0=even field (CC3/CC4) first
+         *   bit 6    caption_filler              0
+         *   bit 5:1  caption_block_count         number of caption blocks (pairs of caption words = frames). Most DVDs use 15 per start of GOP.
+         *   bit 0    caption_extra_field_added   1=one additional caption word
+         *
+         * struct caption_field_block {
+         *   uint8_t
+         *     bit 7:1 caption_filler             0x7F (all 1s)
+         *     bit 0   caption_field_odd          1=odd field (this is CC1/CC2)  0=even field (this is CC3/CC4)
+         *   uint8_t   caption_first_byte
+         *   uint8_t   caption_second_byte
+         * } caption_block[(caption_block_count * 2) + caption_extra_field_added];
+         *
+         * Some DVDs encode caption data for both fields with caption_field_odd=1. The only way to decode the fields
+         * correctly is to start on the field indicated by caption_odd_field_first and count between odd/even fields.
+         * Don't assume that the first caption word is the odd field. There do exist MPEG files in the wild that start
+         * on the even field. There also exist DVDs in the wild that encode an odd field count and the
+         * caption_extra_field_added/caption_odd_field_first bits change per packet to allow that. */
         int cc_count = 0;
         int i;
         // There is a caption count field in the data, but it is often
@@ -2139,7 +2321,9 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
             av_freep(&s1->a53_caption);
             s1->a53_caption_size = cc_count * 6;
             s1->a53_caption      = av_malloc(s1->a53_caption_size);
-            if (s1->a53_caption) {
+            if (!s1->a53_caption) {
+                s1->a53_caption_size = 0;
+            } else {
                 uint8_t field1 = !!(p[4] & 0x80);
                 uint8_t *cap = s1->a53_caption;
                 p += 5;
@@ -2154,6 +2338,7 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
                     p += 6;
                 }
             }
+            avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
         }
         return 1;
     }
@@ -2163,9 +2348,25 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
 static void mpeg_decode_user_data(AVCodecContext *avctx,
                                   const uint8_t *p, int buf_size)
 {
+    Mpeg1Context *s = avctx->priv_data;
     const uint8_t *buf_end = p + buf_size;
     Mpeg1Context *s1 = avctx->priv_data;
 
+#if 0
+    int i;
+    for(i=0; !(!p[i-2] && !p[i-1] && p[i]==1) && i<buf_size; i++){
+        av_log(avctx, AV_LOG_ERROR, "%c", p[i]);
+    }
+    av_log(avctx, AV_LOG_ERROR, "\n");
+#endif
+
+    if (buf_size > 29){
+        int i;
+        for(i=0; i<20; i++)
+            if (!memcmp(p+i, "\0TMPGEXS\0", 9)){
+                s->tmpgexs= 1;
+            }
+    }
     /* we parse the DTG active format information */
     if (buf_end - p >= 5 &&
         p[0] == 'D' && p[1] == 'T' && p[2] == 'G' && p[3] == '1') {
@@ -2219,32 +2420,32 @@ static void mpeg_decode_gop(AVCodecContext *avctx,
 {
     Mpeg1Context *s1  = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
-
-    int time_code_hours, time_code_minutes;
-    int time_code_seconds, time_code_pictures;
     int broken_link;
+    int64_t tc;
 
     init_get_bits(&s->gb, buf, buf_size * 8);
 
-    skip_bits1(&s->gb); /* drop_frame_flag */
+    tc = s-> timecode_frame_start = get_bits(&s->gb, 25);
 
-    time_code_hours   = get_bits(&s->gb, 5);
-    time_code_minutes = get_bits(&s->gb, 6);
-    skip_bits1(&s->gb); // marker bit
-    time_code_seconds  = get_bits(&s->gb, 6);
-    time_code_pictures = get_bits(&s->gb, 6);
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->timecode_frame_start = tc;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
-    s1->closed_gop = get_bits1(&s->gb);
-    /* broken_link indicate that after editing the
+    s->closed_gop = get_bits1(&s->gb);
+    /* broken_link indicates that after editing the
      * reference frames of the first B-Frames after GOP I-Frame
      * are missing (open gop) */
     broken_link = get_bits1(&s->gb);
 
-    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+        char tcbuf[AV_TIMECODE_STR_SIZE];
+        av_timecode_make_mpeg_tc_string(tcbuf, tc);
         av_log(s->avctx, AV_LOG_DEBUG,
-               "GOP (%2d:%02d:%02d.[%02d]) closed_gop=%d broken_link=%d\n",
-               time_code_hours, time_code_minutes, time_code_seconds,
-               time_code_pictures, s1->closed_gop, broken_link);
+               "GOP (%s) closed_gop=%d broken_link=%d\n",
+               tcbuf, s->closed_gop, broken_link);
+    }
 }
 
 static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
@@ -2256,6 +2457,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
     const uint8_t *buf_end = buf + buf_size;
     int ret, input_size;
     int last_code = 0, skip_frame = 0;
+    int picture_start_code_seen = 0;
 
     for (;;) {
         /* find next start code */
@@ -2267,6 +2469,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     (avctx->active_thread_type & FF_THREAD_SLICE) &&
                     !avctx->hwaccel) {
                     int i;
+                    av_assert0(avctx->thread_count > 1);
 
                     avctx->execute(avctx, slice_decode_thread,
                                    &s2->thread_context[0], NULL,
@@ -2285,13 +2488,17 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                 }
             }
             s2->pict_type = 0;
+
+            if (avctx->err_recognition & AV_EF_EXPLODE && s2->er.error_count)
+                return AVERROR_INVALIDDATA;
+
             return FFMAX(0, buf_ptr - buf - s2->parse_context.last_index);
         }
 
         input_size = buf_end - buf_ptr;
 
         if (avctx->debug & FF_DEBUG_STARTCODE)
-            av_log(avctx, AV_LOG_DEBUG, "%3"PRIX32" at %td left %d\n",
+            av_log(avctx, AV_LOG_DEBUG, "%3"PRIX32" at %"PTRDIFF_SPECIFIER" left %d\n",
                    start_code, buf_ptr - buf, input_size);
 
         /* prepare data for next start code */
@@ -2299,7 +2506,8 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
         case SEQ_START_CODE:
             if (last_code == 0) {
                 mpeg1_decode_sequence(avctx, buf_ptr, input_size);
-                s->sync = 1;
+                if (buf != avctx->extradata)
+                    s->sync = 1;
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "ignoring SEQ_START_CODE after %X\n", last_code);
@@ -2309,12 +2517,24 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
             break;
 
         case PICTURE_START_CODE:
+            if (picture_start_code_seen && s2->picture_structure == PICT_FRAME) {
+               /* If it's a frame picture, there can't be more than one picture header.
+                  Yet, it does happen and we need to handle it. */
+               av_log(avctx, AV_LOG_WARNING, "ignoring extra picture following a frame-picture\n");
+               break;
+            }
+            picture_start_code_seen = 1;
+
             if (s2->width <= 0 || s2->height <= 0) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid frame dimensions %dx%d.\n",
                        s2->width, s2->height);
                 return AVERROR_INVALIDDATA;
             }
 
+            if (s->tmpgexs){
+                s2->intra_dc_precision= 3;
+                s2->intra_matrix[0]= 1;
+            }
             if (HAVE_THREADS && (avctx->active_thread_type & FF_THREAD_SLICE) &&
                 !avctx->hwaccel && s->slice_count) {
                 int i;
@@ -2398,14 +2618,50 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
             break;
         default:
             if (start_code >= SLICE_MIN_START_CODE &&
+                start_code <= SLICE_MAX_START_CODE && last_code == PICTURE_START_CODE) {
+                if (s2->progressive_sequence && !s2->progressive_frame) {
+                    s2->progressive_frame = 1;
+                    av_log(s2->avctx, AV_LOG_ERROR,
+                           "interlaced frame in progressive sequence, ignoring\n");
+                }
+
+                if (s2->picture_structure == 0 ||
+                    (s2->progressive_frame && s2->picture_structure != PICT_FRAME)) {
+                    av_log(s2->avctx, AV_LOG_ERROR,
+                           "picture_structure %d invalid, ignoring\n",
+                           s2->picture_structure);
+                    s2->picture_structure = PICT_FRAME;
+                }
+
+                if (s2->progressive_sequence && !s2->frame_pred_frame_dct)
+                    av_log(s2->avctx, AV_LOG_WARNING, "invalid frame_pred_frame_dct\n");
+
+                if (s2->picture_structure == PICT_FRAME) {
+                    s2->first_field = 0;
+                    s2->v_edge_pos  = 16 * s2->mb_height;
+                } else {
+                    s2->first_field ^= 1;
+                    s2->v_edge_pos   = 8 * s2->mb_height;
+                    memset(s2->mbskip_table, 0, s2->mb_stride * s2->mb_height);
+                }
+            }
+            if (start_code >= SLICE_MIN_START_CODE &&
                 start_code <= SLICE_MAX_START_CODE && last_code != 0) {
                 const int field_pic = s2->picture_structure != PICT_FRAME;
-                int mb_y = (start_code - SLICE_MIN_START_CODE) << field_pic;
+                int mb_y = start_code - SLICE_MIN_START_CODE;
                 last_code = SLICE_MIN_START_CODE;
+                if (s2->codec_id != AV_CODEC_ID_MPEG1VIDEO && s2->mb_height > 2800/16)
+                    mb_y += (*buf_ptr&0xE0)<<2;
 
+                mb_y <<= field_pic;
                 if (s2->picture_structure == PICT_BOTTOM_FIELD)
                     mb_y++;
 
+                if (buf_end - buf_ptr < 2) {
+                    av_log(s2->avctx, AV_LOG_ERROR, "slice too small\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
                 if (mb_y >= s2->mb_height) {
                     av_log(s2->avctx, AV_LOG_ERROR,
                            "slice below image (%d >= %d)\n", mb_y, s2->mb_height);
@@ -2416,19 +2672,23 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     /* Skip B-frames if we do not have reference frames and
                      * GOP is not closed. */
                     if (s2->pict_type == AV_PICTURE_TYPE_B) {
-                        if (!s->closed_gop) {
+                        if (!s2->closed_gop) {
                             skip_frame = 1;
+                            av_log(s2->avctx, AV_LOG_DEBUG,
+                                   "Skipping B slice due to open GOP\n");
                             break;
                         }
                     }
                 }
-                if (s2->pict_type == AV_PICTURE_TYPE_I)
+                if (s2->pict_type == AV_PICTURE_TYPE_I || (s2->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
                     s->sync = 1;
                 if (!s2->next_picture_ptr) {
                     /* Skip P-frames if we do not have a reference frame or
                      * we have an invalid header. */
                     if (s2->pict_type == AV_PICTURE_TYPE_P && !s->sync) {
                         skip_frame = 1;
+                        av_log(s2->avctx, AV_LOG_DEBUG,
+                               "Skipping P slice due to !sync\n");
                         break;
                     }
                 }
@@ -2475,6 +2735,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     int threshold = (s2->mb_height * s->slice_count +
                                      s2->slice_context_count / 2) /
                                     s2->slice_context_count;
+                    av_assert0(avctx->thread_count > 1);
                     if (threshold <= mb_y) {
                         MpegEncContext *thread_context = s2->thread_context[s->slice_count];
 
@@ -2517,11 +2778,11 @@ static int mpeg_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_output, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
+    int ret;
     int buf_size = avpkt->size;
     Mpeg1Context *s = avctx->priv_data;
     AVFrame *picture = data;
     MpegEncContext *s2 = &s->mpeg_enc_ctx;
-    ff_dlog(avctx, "fill_buffer\n");
 
     if (buf_size == 0 || (buf_size == 4 && AV_RB32(buf) == SEQ_END_CODE)) {
         /* special case for last picture */
@@ -2546,20 +2807,46 @@ static int mpeg_decode_frame(AVCodecContext *avctx, void *data,
             return buf_size;
     }
 
-    if (s->mpeg_enc_ctx_allocated == 0 && avctx->codec_tag == AV_RL32("VCR2"))
+    s2->codec_tag = avpriv_toupper4(avctx->codec_tag);
+    if (s->mpeg_enc_ctx_allocated == 0 && (   s2->codec_tag == AV_RL32("VCR2")
+                                           || s2->codec_tag == AV_RL32("BW10")
+                                          ))
         vcr2_init_sequence(avctx);
 
     s->slice_count = 0;
 
     if (avctx->extradata && !s->extradata_decoded) {
-        int ret = decode_chunks(avctx, picture, got_output,
-                                avctx->extradata, avctx->extradata_size);
+        ret = decode_chunks(avctx, picture, got_output,
+                            avctx->extradata, avctx->extradata_size);
+        if (*got_output) {
+            av_log(avctx, AV_LOG_ERROR, "picture in extradata\n");
+            av_frame_unref(picture);
+            *got_output = 0;
+        }
         s->extradata_decoded = 1;
-        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE)) {
+            s2->current_picture_ptr = NULL;
             return ret;
+        }
+    }
+
+    ret = decode_chunks(avctx, picture, got_output, buf, buf_size);
+    if (ret<0 || *got_output) {
+        s2->current_picture_ptr = NULL;
+
+        if (s2->timecode_frame_start != -1 && *got_output) {
+            AVFrameSideData *tcside = av_frame_new_side_data(picture,
+                                                             AV_FRAME_DATA_GOP_TIMECODE,
+                                                             sizeof(int64_t));
+            if (!tcside)
+                return AVERROR(ENOMEM);
+            memcpy(tcside->data, &s2->timecode_frame_start, sizeof(int64_t));
+
+            s2->timecode_frame_start = -1;
+        }
     }
 
-    return decode_chunks(avctx, picture, got_output, buf, buf_size);
+    return ret;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -2567,7 +2854,6 @@ static void flush(AVCodecContext *avctx)
     Mpeg1Context *s = avctx->priv_data;
 
     s->sync       = 0;
-    s->closed_gop = 0;
 
     ff_mpeg_flush(avctx);
 }
@@ -2594,12 +2880,23 @@ AVCodec ff_mpeg1video_decoder = {
     .capabilities          = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
                              AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                              AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal         = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
     .flush                 = flush,
+    .max_lowres            = 3,
     .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg_decode_update_thread_context),
     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
+#if CONFIG_MPEG1_NVDEC_HWACCEL
+                               HWACCEL_NVDEC(mpeg1),
+#endif
 #if CONFIG_MPEG1_VDPAU_HWACCEL
                                HWACCEL_VDPAU(mpeg1),
 #endif
+#if CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL
+                               HWACCEL_VIDEOTOOLBOX(mpeg1),
+#endif
+#if CONFIG_MPEG1_XVMC_HWACCEL
+                               HWACCEL_XVMC(mpeg1),
+#endif
                                NULL
                            },
 };
@@ -2616,7 +2913,9 @@ AVCodec ff_mpeg2video_decoder = {
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
                       AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                       AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
     .flush          = flush,
+    .max_lowres     = 3,
     .profiles       = NULL_IF_CONFIG_SMALL(ff_mpeg2_video_profiles),
     .hw_configs     = (const AVCodecHWConfigInternal*[]) {
 #if CONFIG_MPEG2_DXVA2_HWACCEL
@@ -2628,12 +2927,37 @@ AVCodec ff_mpeg2video_decoder = {
 #if CONFIG_MPEG2_D3D11VA2_HWACCEL
                         HWACCEL_D3D11VA2(mpeg2),
 #endif
+#if CONFIG_MPEG2_NVDEC_HWACCEL
+                        HWACCEL_NVDEC(mpeg2),
+#endif
 #if CONFIG_MPEG2_VAAPI_HWACCEL
                         HWACCEL_VAAPI(mpeg2),
 #endif
 #if CONFIG_MPEG2_VDPAU_HWACCEL
                         HWACCEL_VDPAU(mpeg2),
 #endif
+#if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL
+                        HWACCEL_VIDEOTOOLBOX(mpeg2),
+#endif
+#if CONFIG_MPEG2_XVMC_HWACCEL
+                        HWACCEL_XVMC(mpeg2),
+#endif
                         NULL
                     },
 };
+
+//legacy decoder
+AVCodec ff_mpegvideo_decoder = {
+    .name           = "mpegvideo",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1 video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .flush          = flush,
+    .max_lowres     = 3,
+};
diff --git a/libavcodec/mpeg12enc.c b/libavcodec/mpeg12enc.c
index 4069509..d0b458e 100644
--- a/libavcodec/mpeg12enc.c
+++ b/libavcodec/mpeg12enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/log.h"
 #include "libavutil/opt.h"
+#include "libavutil/timecode.h"
 #include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
@@ -40,17 +42,12 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 
-
-static const uint8_t inv_non_linear_qscale[] = {
-    0, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-};
-
 static const uint8_t svcd_scan_offset_placeholder[] = {
     0x10, 0x0E, 0x00, 0x80, 0x81, 0x00, 0x80,
     0x81, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 };
 
-static uint8_t mv_penalty[MAX_FCODE + 1][MAX_MV * 2 + 1];
+static uint8_t mv_penalty[MAX_FCODE + 1][MAX_DMV * 2 + 1];
 static uint8_t fcode_tab[MAX_MV * 2 + 1];
 
 static uint8_t uni_mpeg1_ac_vlc_len[64 * 64 * 2];
@@ -86,7 +83,7 @@ static av_cold void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len)
                 /* length of VLC and sign */
                 len = rl->table_vlc[code][1] + 1;
             } else {
-                len = rl->table_vlc[111][1] + 6;    /* rl->n */
+                len = rl->table_vlc[111 /* rl->n */][1] + 6;
 
                 if (alevel < 128)
                     len += 8;
@@ -102,26 +99,37 @@ static av_cold void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len)
 static int find_frame_rate_index(MpegEncContext *s)
 {
     int i;
-    int64_t dmin = INT64_MAX;
-    int64_t d;
+    AVRational bestq = (AVRational) {0, 0};
+    AVRational ext;
+    AVRational target = av_inv_q(s->avctx->time_base);
 
     for (i = 1; i < 14; i++) {
-        int64_t n0 = 1001LL / ff_mpeg12_frame_rate_tab[i].den *
-                     ff_mpeg12_frame_rate_tab[i].num * s->avctx->time_base.num;
-        int64_t n1 = 1001LL * s->avctx->time_base.den;
-
         if (s->avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL &&
             i >= 9)
             break;
 
-        d = FFABS(n0 - n1);
-        if (d < dmin) {
-            dmin                = d;
-            s->frame_rate_index = i;
+        for (ext.num=1; ext.num <= 4; ext.num++) {
+            for (ext.den=1; ext.den <= 32; ext.den++) {
+                AVRational q = av_mul_q(ext, ff_mpeg12_frame_rate_tab[i]);
+
+                if (s->codec_id != AV_CODEC_ID_MPEG2VIDEO && (ext.den!=1 || ext.num!=1))
+                    continue;
+                if (av_gcd(ext.den, ext.num) != 1)
+                    continue;
+
+                if (    bestq.num==0
+                    || av_nearer_q(target, bestq, q) < 0
+                    || ext.num==1 && ext.den==1 && av_nearer_q(target, bestq, q) == 0) {
+                    bestq               = q;
+                    s->frame_rate_index = i;
+                    s->mpeg2_frame_rate_ext.num = ext.num;
+                    s->mpeg2_frame_rate_ext.den = ext.den;
+                }
+            }
         }
     }
 
-    if (dmin)
+    if (av_cmp_q(target, bestq))
         return -1;
     else
         return 0;
@@ -176,6 +184,22 @@ static av_cold int encode_init(AVCodecContext *avctx)
         }
     }
 
+    if ((avctx->width & 0xFFF) == 0 && (avctx->height & 0xFFF) == 1) {
+        av_log(avctx, AV_LOG_ERROR, "Width / Height is invalid for MPEG2\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+        if ((avctx->width & 0xFFF) == 0 || (avctx->height & 0xFFF) == 0) {
+            av_log(avctx, AV_LOG_ERROR, "Width or Height are not allowed to be multiples of 4096\n"
+                                        "add '-strict %d' if you want to use them anyway.\n", FF_COMPLIANCE_UNOFFICIAL);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    s->drop_frame_timecode = s->drop_frame_timecode || !!(avctx->flags2 & AV_CODEC_FLAG2_DROP_FRAME_TIMECODE);
+    if (s->drop_frame_timecode)
+        s->tc.flags |= AV_TIMECODE_FLAG_DROPFRAME;
     if (s->drop_frame_timecode && s->frame_rate_index != 4) {
         av_log(avctx, AV_LOG_ERROR,
                "Drop frame time code only allowed with 1001/30000 fps\n");
@@ -189,6 +213,17 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    if (s->tc_opt_str) {
+        AVRational rate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
+        int ret = av_timecode_init_from_string(&s->tc, rate, s->tc_opt_str, s);
+        if (ret < 0)
+            return ret;
+        s->drop_frame_timecode = !!(s->tc.flags & AV_TIMECODE_FLAG_DROPFRAME);
+        s->timecode_frame_start = s->tc.start;
+    } else {
+        s->timecode_frame_start = 0; // default is -1
+    }
+
     return 0;
 }
 
@@ -205,11 +240,11 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
     unsigned int vbv_buffer_size, fps, v;
     int i, constraint_parameter_flag;
     uint64_t time_code;
-    float best_aspect_error = 1E10;
-    float aspect_ratio      = av_q2d(s->avctx->sample_aspect_ratio);
+    int64_t best_aspect_error = INT64_MAX;
+    AVRational aspect_ratio = s->avctx->sample_aspect_ratio;
 
-    if (aspect_ratio == 0.0)
-        aspect_ratio = 1.0;             // pixel aspect 1.1 (VGA)
+    if (aspect_ratio.num == 0 || aspect_ratio.den == 0)
+        aspect_ratio = (AVRational){1,1};             // pixel aspect 1.1 (VGA)
 
     if (s->current_picture.f->key_frame) {
         AVRational framerate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
@@ -217,19 +252,19 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         /* MPEG-1 header repeated every GOP */
         put_header(s, SEQ_START_CODE);
 
-        put_sbits(&s->pb, 12, s->width);
-        put_sbits(&s->pb, 12, s->height);
+        put_sbits(&s->pb, 12, s->width  & 0xFFF);
+        put_sbits(&s->pb, 12, s->height & 0xFFF);
 
         for (i = 1; i < 15; i++) {
-            float error = aspect_ratio;
+            int64_t error = aspect_ratio.num * (1LL<<32) / aspect_ratio.den;
             if (s->codec_id == AV_CODEC_ID_MPEG1VIDEO || i <= 1)
-                error -= 1.0 / ff_mpeg1_aspect[i];
+                error -= (1LL<<32) / ff_mpeg1_aspect[i];
             else
-                error -= av_q2d(ff_mpeg2_aspect[i]) * s->height / s->width;
+                error -= (1LL<<32)*ff_mpeg2_aspect[i].num * s->height / s->width / ff_mpeg2_aspect[i].den;
 
             error = FFABS(error);
 
-            if (error < best_aspect_error) {
+            if (error - 2 <= best_aspect_error) {
                 best_aspect_error    = error;
                 s->aspect_ratio_info = i;
             }
@@ -276,6 +311,11 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         ff_write_quant_matrix(&s->pb, s->avctx->inter_matrix);
 
         if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
+            AVFrameSideData *side_data;
+            int width = s->width;
+            int height = s->height;
+            int use_seq_disp_ext;
+
             put_header(s, EXT_START_CODE);
             put_bits(&s->pb, 4, 1);                 // seq ext
 
@@ -292,20 +332,38 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
             put_bits(&s->pb, 1, 1);                 // marker
             put_bits(&s->pb, 8, vbv_buffer_size >> 10); // vbv buffer ext
             put_bits(&s->pb, 1, s->low_delay);
-            put_bits(&s->pb, 2, 0);                 // frame_rate_ext_n
-            put_bits(&s->pb, 5, 0);                 // frame_rate_ext_d
+            put_bits(&s->pb, 2, s->mpeg2_frame_rate_ext.num-1); // frame_rate_ext_n
+            put_bits(&s->pb, 5, s->mpeg2_frame_rate_ext.den-1); // frame_rate_ext_d
+
+            side_data = av_frame_get_side_data(s->current_picture_ptr->f, AV_FRAME_DATA_PANSCAN);
+            if (side_data) {
+                AVPanScan *pan_scan = (AVPanScan *)side_data->data;
+                if (pan_scan->width && pan_scan->height) {
+                    width = pan_scan->width >> 4;
+                    height = pan_scan->height >> 4;
+                }
+            }
 
-            put_header(s, EXT_START_CODE);
-            put_bits(&s->pb, 4, 2);                         // sequence display extension
-            put_bits(&s->pb, 3, 5);                         // video_format: 5 is unspecified
-            put_bits(&s->pb, 1, 1);                         // colour_description
-            put_bits(&s->pb, 8, s->avctx->color_primaries); // colour_primaries
-            put_bits(&s->pb, 8, s->avctx->color_trc);       // transfer_characteristics
-            put_bits(&s->pb, 8, s->avctx->colorspace);      // matrix_coefficients
-            put_bits(&s->pb, 14, s->width);                 // display_horizontal_size
-            put_bits(&s->pb, 1, 1);                         // marker_bit
-            put_bits(&s->pb, 14, s->height);                // display_vertical_size
-            put_bits(&s->pb, 3, 0);                         // remaining 3 bits are zero padding
+            use_seq_disp_ext = (width != s->width ||
+                                height != s->height ||
+                                s->avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                                s->avctx->color_trc != AVCOL_TRC_UNSPECIFIED ||
+                                s->avctx->colorspace != AVCOL_SPC_UNSPECIFIED ||
+                                s->video_format != VIDEO_FORMAT_UNSPECIFIED);
+
+            if (s->seq_disp_ext == 1 || (s->seq_disp_ext == -1 && use_seq_disp_ext)) {
+                put_header(s, EXT_START_CODE);
+                put_bits(&s->pb, 4, 2);                         // sequence display extension
+                put_bits(&s->pb, 3, s->video_format);           // video_format
+                put_bits(&s->pb, 1, 1);                         // colour_description
+                put_bits(&s->pb, 8, s->avctx->color_primaries); // colour_primaries
+                put_bits(&s->pb, 8, s->avctx->color_trc);       // transfer_characteristics
+                put_bits(&s->pb, 8, s->avctx->colorspace);      // matrix_coefficients
+                put_bits(&s->pb, 14, width);                    // display_horizontal_size
+                put_bits(&s->pb, 1, 1);                         // marker_bit
+                put_bits(&s->pb, 14, height);                   // display_vertical_size
+                put_bits(&s->pb, 3, 0);                         // remaining 3 bits are zero padding
+            }
         }
 
         put_header(s, GOP_START_CODE);
@@ -317,21 +375,17 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
                     s->timecode_frame_start;
 
         s->gop_picture_number = s->current_picture_ptr->f->coded_picture_number;
-        if (s->drop_frame_timecode) {
-            /* only works for NTSC 29.97 */
-            int d = time_code / 17982;
-            int m = time_code % 17982;
-            /* not needed since -2,-1 / 1798 in C returns 0 */
-            // if (m < 2)
-            //     m += 2;
-            time_code += 18 * d + 2 * ((m - 2) / 1798);
-        }
+
+        av_assert0(s->drop_frame_timecode == !!(s->tc.flags & AV_TIMECODE_FLAG_DROPFRAME));
+        if (s->drop_frame_timecode)
+            time_code = av_timecode_adjust_ntsc_framenum2(time_code, fps);
+
         put_bits(&s->pb, 5, (uint32_t)((time_code / (fps * 3600)) % 24));
         put_bits(&s->pb, 6, (uint32_t)((time_code / (fps *   60)) % 60));
         put_bits(&s->pb, 1, 1);
         put_bits(&s->pb, 6, (uint32_t)((time_code / fps) % 60));
         put_bits(&s->pb, 6, (uint32_t)((time_code % fps)));
-        put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP) || s->intra_only);
+        put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP) || s->intra_only || !s->gop_picture_number);
         put_bits(&s->pb, 1, 0);                     // broken link
     }
 }
@@ -348,17 +402,12 @@ static inline void encode_mb_skip_run(MpegEncContext *s, int run)
 
 static av_always_inline void put_qscale(MpegEncContext *s)
 {
-    if (s->q_scale_type) {
-        assert(s->qscale >= 1 && s->qscale <= 12);
-        put_bits(&s->pb, 5, inv_non_linear_qscale[s->qscale]);
-    } else {
-        put_bits(&s->pb, 5, s->qscale);
-    }
+    put_bits(&s->pb, 5, s->qscale);
 }
 
 void ff_mpeg1_encode_slice_header(MpegEncContext *s)
 {
-    if (s->height > 2800) {
+    if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->height > 2800) {
         put_header(s, SLICE_MIN_START_CODE + (s->mb_y & 127));
         /* slice_vertical_position_extension */
         put_bits(&s->pb, 3, s->mb_y >> 7);
@@ -427,7 +476,7 @@ void ff_mpeg1_encode_picture_header(MpegEncContext *s, int picture_number)
         }
         put_bits(&s->pb, 2, s->intra_dc_precision);
 
-        assert(s->picture_structure == PICT_FRAME);
+        av_assert0(s->picture_structure == PICT_FRAME);
         put_bits(&s->pb, 2, s->picture_structure);
         if (s->progressive_sequence)
             put_bits(&s->pb, 1, 0);             /* no repeat */
@@ -539,7 +588,7 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code)
             sign = 1;
         }
 
-        assert(code > 0 && code <= 16);
+        av_assert2(code > 0 && code <= 16);
 
         put_bits(&s->pb,
                  ff_mpeg12_mbMotionVectorTable[code][1],
@@ -567,12 +616,12 @@ static inline void encode_dc(MpegEncContext *s, int diff, int component)
             put_bits(&s->pb,
                      ff_mpeg12_vlc_dc_lum_bits[index] + index,
                      (ff_mpeg12_vlc_dc_lum_code[index] << index) +
-                     (diff & ((1 << index) - 1)));
+                     av_mod_uintp2(diff, index));
         else
             put_bits(&s->pb,
                      ff_mpeg12_vlc_dc_chroma_bits[index] + index,
                      (ff_mpeg12_vlc_dc_chroma_code[index] << index) +
-                     (diff & ((1 << index) - 1)));
+                     av_mod_uintp2(diff, index));
     } else {
         if (component == 0)
             put_bits(&s->pb,
@@ -682,7 +731,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
 
     if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 &&
         (mb_x != s->mb_width - 1 ||
-         (mb_y != s->mb_height - 1 && s->codec_id == AV_CODEC_ID_MPEG1VIDEO)) &&
+         (mb_y != s->end_mb_y - 1 && s->codec_id == AV_CODEC_ID_MPEG1VIDEO)) &&
         ((s->pict_type == AV_PICTURE_TYPE_P && (motion_x | motion_y) == 0) ||
          (s->pict_type == AV_PICTURE_TYPE_B && s->mv_dir == s->last_mv_dir &&
           (((s->mv_dir & MV_DIR_FORWARD)
@@ -704,7 +753,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
         }
     } else {
         if (first_mb) {
-            assert(s->mb_skip_run == 0);
+            av_assert0(s->mb_skip_run == 0);
             encode_mb_skip_run(s, s->mb_x);
         } else {
             encode_mb_skip_run(s, s->mb_skip_run);
@@ -783,7 +832,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
                 s->last_mv[0][1][0] = s->last_mv[0][0][0] = motion_x;
                 s->last_mv[0][1][1] = s->last_mv[0][0][1] = motion_y;
             } else {
-                assert(!s->frame_pred_frame_dct && s->mv_type == MV_TYPE_FIELD);
+                av_assert2(!s->frame_pred_frame_dct && s->mv_type == MV_TYPE_FIELD);
 
                 if (cbp) {
                     if (s->dquant) {
@@ -870,8 +919,8 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
                     s->b_count++;
                 }
             } else {
-                assert(s->mv_type == MV_TYPE_FIELD);
-                assert(!s->frame_pred_frame_dct);
+                av_assert2(s->mv_type == MV_TYPE_FIELD);
+                av_assert2(!s->frame_pred_frame_dct);
                 if (cbp) {                      // With coded bloc pattern
                     if (s->dquant) {
                         if (s->mv_dir == MV_DIR_FORWARD)
@@ -988,17 +1037,17 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
 
             bits = ff_mpeg12_vlc_dc_lum_bits[index] + index;
             code = (ff_mpeg12_vlc_dc_lum_code[index] << index) +
-                   (diff & ((1 << index) - 1));
+                    av_mod_uintp2(diff, index);
             mpeg1_lum_dc_uni[i + 255] = bits + (code << 8);
 
             bits = ff_mpeg12_vlc_dc_chroma_bits[index] + index;
             code = (ff_mpeg12_vlc_dc_chroma_code[index] << index) +
-                   (diff & ((1 << index) - 1));
+                    av_mod_uintp2(diff, index);
             mpeg1_chr_dc_uni[i + 255] = bits + (code << 8);
         }
 
         for (f_code = 1; f_code <= MAX_FCODE; f_code++)
-            for (mv = -MAX_MV; mv <= MAX_MV; mv++) {
+            for (mv = -MAX_DMV; mv <= MAX_DMV; mv++) {
                 int len;
 
                 if (mv == 0) {
@@ -1021,7 +1070,7 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
                               2 + bit_size;
                 }
 
-                mv_penalty[f_code][mv + MAX_MV] = len;
+                mv_penalty[f_code][mv + MAX_DMV] = len;
             }
 
 
@@ -1052,14 +1101,16 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
 #define COMMON_OPTS                                                           \
+    { "gop_timecode",        "MPEG GOP Timecode in hh:mm:ss[:;.]ff format. Overrides timecode_frame_start.",   \
+      OFFSET(tc_opt_str), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, VE },\
     { "intra_vlc",           "Use MPEG-2 intra VLC table.",                   \
-      OFFSET(intra_vlc_format),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
+      OFFSET(intra_vlc_format),    AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
     { "drop_frame_timecode", "Timecode is in drop frame format.",             \
-      OFFSET(drop_frame_timecode), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
+      OFFSET(drop_frame_timecode), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
     { "scan_offset",         "Reserve space for SVCD scan offset user data.", \
-      OFFSET(scan_offset),         AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
+      OFFSET(scan_offset),         AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
     { "timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", \
-      OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, VE}, \
+      OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = -1 }, -1, INT64_MAX, VE}, \
 
 static const AVOption mpeg1_options[] = {
     COMMON_OPTS
@@ -1069,8 +1120,19 @@ static const AVOption mpeg1_options[] = {
 
 static const AVOption mpeg2_options[] = {
     COMMON_OPTS
-    { "non_linear_quant", "Use nonlinear quantizer.",    OFFSET(q_scale_type),   AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "alternate_scan",   "Enable alternate scantable.", OFFSET(alternate_scan), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "non_linear_quant", "Use nonlinear quantizer.",    OFFSET(q_scale_type),   AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "alternate_scan",   "Enable alternate scantable.", OFFSET(alternate_scan), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "seq_disp_ext",     "Write sequence_display_extension blocks.", OFFSET(seq_disp_ext), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE, "seq_disp_ext" },
+    {     "auto",   NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = -1},  0, 0, VE, "seq_disp_ext" },
+    {     "never",  NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = 0 },  0, 0, VE, "seq_disp_ext" },
+    {     "always", NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = 1 },  0, 0, VE, "seq_disp_ext" },
+    { "video_format",     "Video_format in the sequence_display_extension indicating the source of the video.", OFFSET(video_format), AV_OPT_TYPE_INT, { .i64 = VIDEO_FORMAT_UNSPECIFIED }, 0, 7, VE, "video_format" },
+    {     "component",    NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = VIDEO_FORMAT_COMPONENT  },  0, 0, VE, "video_format" },
+    {     "pal",          NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = VIDEO_FORMAT_PAL        },  0, 0, VE, "video_format" },
+    {     "ntsc",         NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = VIDEO_FORMAT_NTSC       },  0, 0, VE, "video_format" },
+    {     "secam",        NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = VIDEO_FORMAT_SECAM      },  0, 0, VE, "video_format" },
+    {     "mac",          NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = VIDEO_FORMAT_MAC        },  0, 0, VE, "video_format" },
+    {     "unspecified",  NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = VIDEO_FORMAT_UNSPECIFIED},  0, 0, VE, "video_format" },
     FF_MPV_COMMON_OPTS
     { NULL },
 };
@@ -1111,7 +1173,7 @@ AVCodec ff_mpeg2video_encoder = {
     .init                 = encode_init,
     .encode2              = ff_mpv_encode_picture,
     .close                = ff_mpv_encode_end,
-    .supported_framerates = ff_mpeg12_frame_rate_tab + 1,
+    .supported_framerates = ff_mpeg2_frame_rate_tab,
     .pix_fmts             = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
                                                            AV_PIX_FMT_YUV422P,
                                                            AV_PIX_FMT_NONE },
diff --git a/libavcodec/mpeg12framerate.c b/libavcodec/mpeg12framerate.c
index acfef09..ab3d351 100644
--- a/libavcodec/mpeg12framerate.c
+++ b/libavcodec/mpeg12framerate.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg12vlc.h b/libavcodec/mpeg12vlc.h
index 90f8135..c5abae9 100644
--- a/libavcodec/mpeg12vlc.h
+++ b/libavcodec/mpeg12vlc.h
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg2_metadata_bsf.c b/libavcodec/mpeg2_metadata_bsf.c
index 668d70e..ba3a74a 100644
--- a/libavcodec/mpeg2_metadata_bsf.c
+++ b/libavcodec/mpeg2_metadata_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -188,7 +188,7 @@ static int mpeg2_metadata_filter(AVBSFContext *bsf, AVPacket *out)
 
     err = ff_bsf_get_packet(bsf, &in);
     if (err < 0)
-        goto fail;
+        return err;
 
     err = ff_cbs_read_packet(ctx->cbc, frag, in);
     if (err < 0) {
@@ -209,15 +209,15 @@ static int mpeg2_metadata_filter(AVBSFContext *bsf, AVPacket *out)
     }
 
     err = av_packet_copy_props(out, in);
-    if (err < 0) {
-        av_packet_unref(out);
+    if (err < 0)
         goto fail;
-    }
 
     err = 0;
 fail:
-    ff_cbs_fragment_uninit(ctx->cbc, frag);
+    ff_cbs_fragment_reset(ctx->cbc, frag);
 
+    if (err < 0)
+        av_packet_unref(out);
     av_packet_free(&in);
 
     return err;
@@ -255,38 +255,41 @@ static int mpeg2_metadata_init(AVBSFContext *bsf)
 
     err = 0;
 fail:
-    ff_cbs_fragment_uninit(ctx->cbc, frag);
+    ff_cbs_fragment_reset(ctx->cbc, frag);
     return err;
 }
 
 static void mpeg2_metadata_close(AVBSFContext *bsf)
 {
     MPEG2MetadataContext *ctx = bsf->priv_data;
+
+    ff_cbs_fragment_free(ctx->cbc, &ctx->fragment);
     ff_cbs_close(&ctx->cbc);
 }
 
 #define OFFSET(x) offsetof(MPEG2MetadataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption mpeg2_metadata_options[] = {
     { "display_aspect_ratio", "Set display aspect ratio (table 6-3)",
         OFFSET(display_aspect_ratio), AV_OPT_TYPE_RATIONAL,
-        { .i64 = 0 }, 0, 65535 },
+        { .dbl = 0.0 }, 0, 65535, FLAGS },
 
     { "frame_rate", "Set frame rate",
         OFFSET(frame_rate), AV_OPT_TYPE_RATIONAL,
-        { .i64 = 0 }, 0, UINT_MAX },
+        { .dbl = 0.0 }, 0, UINT_MAX, FLAGS },
 
     { "video_format", "Set video format (table 6-6)",
         OFFSET(video_format), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 7 },
+        { .i64 = -1 }, -1, 7, FLAGS },
     { "colour_primaries", "Set colour primaries (table 6-7)",
         OFFSET(colour_primaries), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
     { "transfer_characteristics", "Set transfer characteristics (table 6-8)",
         OFFSET(transfer_characteristics), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
     { "matrix_coefficients", "Set matrix coefficients (table 6-9)",
         OFFSET(matrix_coefficients), AV_OPT_TYPE_INT,
-        { .i64 = -1 }, -1, 255 },
+        { .i64 = -1 }, -1, 255, FLAGS },
 
     { NULL }
 };
@@ -295,7 +298,7 @@ static const AVClass mpeg2_metadata_class = {
     .class_name = "mpeg2_metadata_bsf",
     .item_name  = av_default_item_name,
     .option     = mpeg2_metadata_options,
-    .version    = LIBAVCODEC_VERSION_MAJOR,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static const enum AVCodecID mpeg2_metadata_codec_ids[] = {
diff --git a/libavcodec/mpeg4_unpack_bframes_bsf.c b/libavcodec/mpeg4_unpack_bframes_bsf.c
new file mode 100644
index 0000000..1daf133
--- /dev/null
+++ b/libavcodec/mpeg4_unpack_bframes_bsf.c
@@ -0,0 +1,179 @@
+/*
+ * Bitstream filter for unpacking DivX-style packed B-frames in MPEG-4 (divx_packed)
+ * Copyright (c) 2015 Andreas Cadhalpun <Andreas.Cadhalpun@googlemail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "internal.h"
+#include "mpeg4video.h"
+
+typedef struct UnpackBFramesBSFContext {
+    AVPacket *b_frame;
+} UnpackBFramesBSFContext;
+
+/* determine the position of the packed marker in the userdata,
+ * the number of VOPs and the position of the second VOP */
+static void scan_buffer(const uint8_t *buf, int buf_size,
+                        int *pos_p, int *nb_vop, int *pos_vop2) {
+    uint32_t startcode;
+    const uint8_t *end = buf + buf_size, *pos = buf;
+
+    while (pos < end) {
+        startcode = -1;
+        pos = avpriv_find_start_code(pos, end, &startcode);
+
+        if (startcode == USER_DATA_STARTCODE && pos_p) {
+            /* check if the (DivX) userdata string ends with 'p' (packed) */
+            for (int i = 0; i < 255 && pos + i + 1 < end; i++) {
+                if (pos[i] == 'p' && pos[i + 1] == '\0') {
+                    *pos_p = pos + i - buf;
+                    break;
+                }
+            }
+        } else if (startcode == VOP_STARTCODE && nb_vop) {
+            *nb_vop += 1;
+            if (*nb_vop == 2 && pos_vop2) {
+                *pos_vop2 = pos - buf - 4; /* subtract 4 bytes startcode */
+            }
+        }
+    }
+}
+
+static int mpeg4_unpack_bframes_filter(AVBSFContext *ctx, AVPacket *out)
+{
+    UnpackBFramesBSFContext *s = ctx->priv_data;
+    int pos_p = -1, nb_vop = 0, pos_vop2 = -1, ret = 0;
+    AVPacket *in;
+
+    ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
+
+    scan_buffer(in->data, in->size, &pos_p, &nb_vop, &pos_vop2);
+    av_log(ctx, AV_LOG_DEBUG, "Found %d VOP startcode(s) in this packet.\n", nb_vop);
+
+    if (pos_vop2 >= 0) {
+        if (s->b_frame->data) {
+            av_log(ctx, AV_LOG_WARNING,
+                   "Missing one N-VOP packet, discarding one B-frame.\n");
+            av_packet_unref(s->b_frame);
+        }
+        /* store the packed B-frame in the BSFContext */
+        ret = av_packet_ref(s->b_frame, in);
+        if (ret < 0) {
+            goto fail;
+        }
+        s->b_frame->size -= pos_vop2;
+        s->b_frame->data += pos_vop2;
+    }
+
+    if (nb_vop > 2) {
+        av_log(ctx, AV_LOG_WARNING,
+       "Found %d VOP headers in one packet, only unpacking one.\n", nb_vop);
+    }
+
+    if (nb_vop == 1 && s->b_frame->data) {
+        /* use frame from BSFContext */
+        av_packet_move_ref(out, s->b_frame);
+
+        /* use properties from current input packet */
+        ret = av_packet_copy_props(out, in);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        if (in->size <= MAX_NVOP_SIZE) {
+            /* N-VOP */
+            av_log(ctx, AV_LOG_DEBUG, "Skipping N-VOP.\n");
+        } else {
+            /* copy packet into BSFContext */
+            av_packet_move_ref(s->b_frame, in);
+        }
+    } else if (nb_vop >= 2) {
+        /* use first frame of the packet */
+        av_packet_move_ref(out, in);
+        out->size = pos_vop2;
+    } else if (pos_p >= 0) {
+        ret = av_packet_make_writable(in);
+        if (ret < 0)
+            goto fail;
+        av_log(ctx, AV_LOG_DEBUG, "Updating DivX userdata (remove trailing 'p').\n");
+        av_packet_move_ref(out, in);
+        /* remove 'p' (packed) from the end of the (DivX) userdata string */
+        out->data[pos_p] = '\0';
+    } else {
+        /* copy packet */
+        av_packet_move_ref(out, in);
+    }
+
+fail:
+    if (ret < 0)
+        av_packet_unref(out);
+    av_packet_free(&in);
+
+    return ret;
+}
+
+static int mpeg4_unpack_bframes_init(AVBSFContext *ctx)
+{
+    UnpackBFramesBSFContext *s = ctx->priv_data;
+
+    s->b_frame = av_packet_alloc();
+    if (!s->b_frame)
+        return AVERROR(ENOMEM);
+
+    if (ctx->par_in->extradata) {
+        int pos_p_ext = -1;
+        scan_buffer(ctx->par_in->extradata, ctx->par_in->extradata_size, &pos_p_ext, NULL, NULL);
+        if (pos_p_ext >= 0) {
+            av_log(ctx, AV_LOG_DEBUG,
+                   "Updating DivX userdata (remove trailing 'p') in extradata.\n");
+            ctx->par_out->extradata[pos_p_ext] = '\0';
+        }
+    }
+
+    return 0;
+}
+
+static void mpeg4_unpack_bframes_flush(AVBSFContext *bsfc)
+{
+    UnpackBFramesBSFContext *ctx = bsfc->priv_data;
+    av_packet_unref(ctx->b_frame);
+}
+
+static void mpeg4_unpack_bframes_close(AVBSFContext *bsfc)
+{
+    UnpackBFramesBSFContext *ctx = bsfc->priv_data;
+    av_packet_free(&ctx->b_frame);
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_MPEG4, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf = {
+    .name           = "mpeg4_unpack_bframes",
+    .priv_data_size = sizeof(UnpackBFramesBSFContext),
+    .init           = mpeg4_unpack_bframes_init,
+    .filter         = mpeg4_unpack_bframes_filter,
+    .flush          = mpeg4_unpack_bframes_flush,
+    .close          = mpeg4_unpack_bframes_close,
+    .codec_ids      = codec_ids,
+};
diff --git a/libavcodec/mpeg4audio.c b/libavcodec/mpeg4audio.c
index e321bcc..2197147 100644
--- a/libavcodec/mpeg4audio.c
+++ b/libavcodec/mpeg4audio.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  * Copyright (c) 2009 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,6 +42,11 @@ static int parse_config_ALS(GetBitContext *gb, MPEG4AudioConfig *c)
     // which are buggy in old ALS conformance files
     c->sample_rate = get_bits_long(gb, 32);
 
+    if (c->sample_rate <= 0) {
+        av_log(NULL, AV_LOG_ERROR, "Invalid sample rate %d\n", c->sample_rate);
+        return AVERROR_INVALIDDATA;
+    }
+
     // skip number of samples
     skip_bits_long(gb, 32);
 
@@ -52,6 +57,8 @@ static int parse_config_ALS(GetBitContext *gb, MPEG4AudioConfig *c)
     return 0;
 }
 
+/* XXX: make sure to update the copies in the different encoders if you change
+ * this table */
 const int avpriv_mpeg4audio_sample_rates[16] = {
     96000, 88200, 64000, 48000, 44100, 32000,
     24000, 22050, 16000, 12000, 11025, 8000, 7350
@@ -76,63 +83,62 @@ static inline int get_sample_rate(GetBitContext *gb, int *index)
         avpriv_mpeg4audio_sample_rates[*index];
 }
 
-int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
-                                 int bit_size, int sync_extension)
+int ff_mpeg4audio_get_config_gb(MPEG4AudioConfig *c, GetBitContext *gb,
+                                int sync_extension)
 {
-    GetBitContext gb;
     int specific_config_bitindex, ret;
-
-    ret = init_get_bits(&gb, buf, bit_size);
-    if (ret < 0)
-        return ret;
-    c->object_type = get_object_type(&gb);
-    c->sample_rate = get_sample_rate(&gb, &c->sampling_index);
-    c->chan_config = get_bits(&gb, 4);
+    int start_bit_index = get_bits_count(gb);
+    c->object_type = get_object_type(gb);
+    c->sample_rate = get_sample_rate(gb, &c->sampling_index);
+    c->chan_config = get_bits(gb, 4);
     if (c->chan_config < FF_ARRAY_ELEMS(ff_mpeg4audio_channels))
         c->channels = ff_mpeg4audio_channels[c->chan_config];
     c->sbr = -1;
     c->ps  = -1;
     if (c->object_type == AOT_SBR || (c->object_type == AOT_PS &&
         // check for W6132 Annex YYYY draft MP3onMP4
-        !(show_bits(&gb, 3) & 0x03 && !(show_bits(&gb, 9) & 0x3F)))) {
+        !(show_bits(gb, 3) & 0x03 && !(show_bits(gb, 9) & 0x3F)))) {
         if (c->object_type == AOT_PS)
             c->ps = 1;
         c->ext_object_type = AOT_SBR;
         c->sbr = 1;
-        c->ext_sample_rate = get_sample_rate(&gb, &c->ext_sampling_index);
-        c->object_type = get_object_type(&gb);
+        c->ext_sample_rate = get_sample_rate(gb, &c->ext_sampling_index);
+        c->object_type = get_object_type(gb);
         if (c->object_type == AOT_ER_BSAC)
-            c->ext_chan_config = get_bits(&gb, 4);
+            c->ext_chan_config = get_bits(gb, 4);
     } else {
         c->ext_object_type = AOT_NULL;
         c->ext_sample_rate = 0;
     }
-    specific_config_bitindex = get_bits_count(&gb);
+    specific_config_bitindex = get_bits_count(gb);
 
     if (c->object_type == AOT_ALS) {
-        skip_bits(&gb, 5);
-        if (show_bits_long(&gb, 24) != MKBETAG('\0','A','L','S'))
-            skip_bits_long(&gb, 24);
+        skip_bits(gb, 5);
+        if (show_bits_long(gb, 24) != MKBETAG('\0','A','L','S'))
+            skip_bits_long(gb, 24);
 
-        specific_config_bitindex = get_bits_count(&gb);
+        specific_config_bitindex = get_bits_count(gb);
 
-        ret = parse_config_ALS(&gb, c);
+        ret = parse_config_ALS(gb, c);
         if (ret < 0)
             return ret;
     }
 
     if (c->ext_object_type != AOT_SBR && sync_extension) {
-        while (get_bits_left(&gb) > 15) {
-            if (show_bits(&gb, 11) == 0x2b7) { // sync extension
-                get_bits(&gb, 11);
-                c->ext_object_type = get_object_type(&gb);
-                if (c->ext_object_type == AOT_SBR && (c->sbr = get_bits1(&gb)) == 1)
-                    c->ext_sample_rate = get_sample_rate(&gb, &c->ext_sampling_index);
-                if (get_bits_left(&gb) > 11 && get_bits(&gb, 11) == 0x548)
-                    c->ps = get_bits1(&gb);
+        while (get_bits_left(gb) > 15) {
+            if (show_bits(gb, 11) == 0x2b7) { // sync extension
+                get_bits(gb, 11);
+                c->ext_object_type = get_object_type(gb);
+                if (c->ext_object_type == AOT_SBR && (c->sbr = get_bits1(gb)) == 1) {
+                    c->ext_sample_rate = get_sample_rate(gb, &c->ext_sampling_index);
+                    if (c->ext_sample_rate == c->sample_rate)
+                        c->sbr = -1;
+                }
+                if (get_bits_left(gb) > 11 && get_bits(gb, 11) == 0x548)
+                    c->ps = get_bits1(gb);
                 break;
             } else
-                get_bits1(&gb); // skip 1 bit
+                get_bits1(gb); // skip 1 bit
         }
     }
 
@@ -143,5 +149,21 @@ int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
     if ((c->ps == -1 && c->object_type != AOT_AAC_LC) || c->channels & ~0x01)
         c->ps = 0;
 
-    return specific_config_bitindex;
+    return specific_config_bitindex - start_bit_index;
+}
+
+int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
+                                 int bit_size, int sync_extension)
+{
+    GetBitContext gb;
+    int ret;
+
+    if (bit_size <= 0)
+        return AVERROR_INVALIDDATA;
+
+    ret = init_get_bits(&gb, buf, bit_size);
+    if (ret < 0)
+        return ret;
+
+    return ff_mpeg4audio_get_config_gb(c, &gb, sync_extension);
 }
diff --git a/libavcodec/mpeg4audio.h b/libavcodec/mpeg4audio.h
index 0a09058..b9cea8a 100644
--- a/libavcodec/mpeg4audio.h
+++ b/libavcodec/mpeg4audio.h
@@ -2,20 +2,20 @@
  * MPEG-4 Audio common header
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,17 @@ extern av_export_avcodec const int avpriv_mpeg4audio_sample_rates[16];
 extern const uint8_t ff_mpeg4audio_channels[8];
 
 /**
- * Parse MPEG-4 systems extradata to retrieve audio configuration.
+ * Parse MPEG-4 systems extradata from a potentially unaligned GetBitContext to retrieve audio configuration.
+ * @param[in] c        MPEG4AudioConfig structure to fill.
+ * @param[in] gb       Extradata from container.
+ * @param[in] sync_extension look for a sync extension after config if true.
+ * @return On error -1 is returned, on success AudioSpecificConfig bit index in extradata.
+ */
+int ff_mpeg4audio_get_config_gb(MPEG4AudioConfig *c, GetBitContext *gb,
+                                int sync_extension);
+
+/**
+ * Parse MPEG-4 systems extradata from a raw buffer to retrieve audio configuration.
  * @param[in] c        MPEG4AudioConfig structure to fill.
  * @param[in] buf      Extradata from container.
  * @param[in] bit_size Extradata size in bits.
@@ -106,7 +116,7 @@ enum AudioObjectType {
     AOT_USAC,                  ///< N                       Unified Speech and Audio Coding
 };
 
-#define MAX_PCE_SIZE 304 ///<Maximum size of a PCE including the 3-bit ID_PCE
+#define MAX_PCE_SIZE 320 ///<Maximum size of a PCE including the 3-bit ID_PCE
                          ///<marker and the comment
 
 static av_always_inline unsigned int ff_pce_copy_bits(PutBitContext *pb,
diff --git a/libavcodec/mpeg4data.h b/libavcodec/mpeg4data.h
index b428a5e..4756e9e 100644
--- a/libavcodec/mpeg4data.h
+++ b/libavcodec/mpeg4data.h
@@ -3,20 +3,20 @@
  * H.263+ support
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -373,4 +373,120 @@ const uint8_t ff_mpeg4_dc_threshold[8]={
     99, 13, 15, 17, 19, 21, 23, 0
 };
 
+/* Note these are different in studio mode */
+const uint16_t ff_mpeg4_studio_dc_luma[19][2]={
+    {0x0e,  6}, {0x06,  5}, {0x00,  4}, {0x02,  4},
+    {0x07,  3}, {0x05,  3}, {0x03,  3}, {0x02,  3},
+    {0x04,  3}, {0x06,  3}, {0x01,  4}, {0x1e,  7},
+    {0x3e,  8}, {0x7e,  9}, {0xfe, 10}, {0x1fe, 11},
+    {0x3fe, 12}, {0x7fe, 13}, {0x7ff, 13}
+};
+
+const uint16_t ff_mpeg4_studio_dc_chroma[19][2]={
+    {0x00,  4}, {0x02,  4}, {0x07,  3}, {0x05,  3},
+    {0x03,  3}, {0x02,  3}, {0x04,  3}, {0x06,  3},
+    {0x01,  4}, {0x06,  5}, {0x0e,  6}, {0x1e,  7},
+    {0x3e,  8}, {0x7e,  9}, {0xfe, 10}, {0x1fe, 11},
+    {0x3fe, 12}, {0x7fe, 13}, {0x7ff, 13}
+};
+
+const uint16_t ff_mpeg4_studio_intra[12][22][2]={
+    {
+        {0x05,  4}, {0x04,  4}, {0x05,  7}, {0x09,  9},
+        {0x21, 11}, {0x41, 12}, {0x81, 13}, {0x03,  4},
+        {0x03,  5}, {0x05,  6}, {0x04,  7}, {0x03,  7},
+        {0x05,  8}, {0x03,  2}, {0x05,  3}, {0x04,  3},
+        {0x03,  3}, {0x02,  4}, {0x04,  6}, {0x03,  6},
+        {0x11, 10}, {0x80, 13}
+    },
+    {
+        {0x00,  0}, {0x00,  0}, {0x00,  0}, {0x00,  0},
+        {0x00,  0}, {0x00,  0}, {0x00,  0}, {0x00,  0},
+        {0x00,  0}, {0x00,  0}, {0x00,  0}, {0x00,  0},
+        {0x00,  0}, {0x00,  0}, {0x01,  1}, {0x01,  2},
+        {0x01,  3}, {0x01,  4}, {0x01,  5}, {0x03,  7},
+        {0x05,  8}, {0x04,  8}
+    },
+    {
+        {0x05,  3},  {0x03,  5},  {0x02,  5},  {0x03,  7},
+        {0x09,  9},  {0x103, 14}, {0x102, 14}, {0x04,  3},
+        {0x03,  3},  {0x03,  4},  {0x02,  4},  {0x03,  6},
+        {0x11, 10},  {0x03,  2},  {0x02,  3},  {0x02,  6},
+        {0x05,  8},  {0x21, 11},  {0x83, 13},  {0x101, 14},
+        {0x201, 15}, {0x82, 13}
+    },
+    {
+        {0x05,  5}, {0x05,  4}, {0x04,  5}, {0x03,  6},
+        {0x09,  9}, {0x83, 13}, {0x82, 13}, {0x03,  3},
+        {0x04,  4}, {0x03,  4}, {0x03,  5}, {0x05,  8},
+        {0x81, 13}, {0x03,  2}, {0x02,  2}, {0x02,  5},
+        {0x02,  6}, {0x03,  7}, {0x11, 10}, {0x43, 12},
+        {0x80, 13}, {0x42, 12}
+    },
+    {
+        {0x05,  7},  {0x03,  4}, {0x03,  5},  {0x04,  7},
+        {0x09,  9},  {0x83, 13}, {0x101, 14}, {0x03,  3},
+        {0x02,  4},  {0x05,  6}, {0x03,  7},  {0x11, 10},
+        {0x201, 15}, {0x03,  2}, {0x02,  2},  {0x02,  3},
+        {0x04,  6},  {0x03,  6}, {0x05,  8},  {0x21, 11},
+        {0x82, 13},  {0x81, 13}
+    },
+    {
+        {0x13, 10},  {0x03,  5}, {0x05,  7}, {0x12, 10},
+        {0x43, 12},  {0x83, 13}, {0x82, 13}, {0x02,  5},
+        {0x04,  7},  {0x05,  8}, {0x23, 11}, {0x81, 13},
+        {0x101, 14}, {0x03,  2}, {0x02,  2}, {0x01,  2},
+        {0x01,  3},  {0x03,  6}, {0x03,  7}, {0x22, 11},
+        {0x201, 15}, {0x42, 12}
+    },
+    {
+        {0x23, 11},  {0x01,  4},  {0x07,  8},  {0x13, 10},
+        {0x22, 11},  {0x103, 14}, {0x102, 14}, {0x03,  6},
+        {0x06,  8},  {0x12, 10},  {0x43, 12},  {0x101, 14},
+        {0x201, 15}, {0x03,  3},  {0x02,  3},  {0x03,  2},
+        {0x02,  2},  {0x01,  3},  {0x02,  6},  {0x05,  8},
+        {0x42, 12},  {0x41, 12}
+    },
+    {
+        {0x0b,  9}, {0x03,  5}, {0x07,  8}, {0x07,  7},
+        {0x06,  7}, {0x23, 11}, {0x41, 12}, {0x05,  7},
+        {0x06,  8}, {0x0a,  9}, {0x13, 10}, {0x22, 11},
+        {0x40, 12}, {0x03,  4}, {0x02,  4}, {0x03,  2},
+        {0x02,  2}, {0x01,  2}, {0x02,  5}, {0x04,  7},
+        {0x12, 10}, {0x21, 11}
+    },
+    {
+        {0x15, 10}, {0x03,  6}, {0x14, 10}, {0x23, 11},
+        {0x07,  8}, {0x43, 12}, {0x81, 13}, {0x06,  8},
+        {0x0b,  9}, {0x13, 10}, {0x12, 10}, {0x42, 12},
+        {0x80, 13}, {0x01,  4}, {0x03,  3}, {0x02,  3},
+        {0x03,  2}, {0x02,  2}, {0x01,  3}, {0x02,  6},
+        {0x22, 11}, {0x41, 12}
+    },
+    {
+        {0x43, 12}, {0x05,  6}, {0x07,  8}, {0x04,  6},
+        {0x03,  6}, {0x13, 10}, {0x42, 12}, {0x05,  7},
+        {0x04,  7}, {0x06,  8}, {0x12, 10}, {0x41, 12},
+        {0x40, 12}, {0x03,  5}, {0x03,  4}, {0x03,  3},
+        {0x02,  3}, {0x03,  2}, {0x02,  2}, {0x02,  4},
+        {0x05,  8}, {0x11, 10}
+    },
+    {
+        {0x83, 13}, {0x05,  7}, {0x07,  8}, {0x03,  4},
+        {0x21, 11}, {0x82, 13}, {0x81, 13}, {0x04,  7},
+        {0x06,  8}, {0x0b,  9}, {0x0a,  9}, {0x11, 10},
+        {0x80, 13}, {0x03,  5}, {0x02,  5}, {0x02,  4},
+        {0x03,  3}, {0x02,  3}, {0x03,  2}, {0x02,  2},
+        {0x03,  6}, {0x09,  9}
+    },
+    {
+        {0x13, 10}, {0x03,  5}, {0x03,  6}, {0x0d,  9},
+        {0x0c,  9}, {0x21, 11}, {0x20, 11}, {0x02,  5},
+        {0x02,  6}, {0x07,  8}, {0x0b,  9}, {0x12, 10},
+        {0x11, 10}, {0x05,  3}, {0x04,  3}, {0x05,  4},
+        {0x04,  4}, {0x03,  4}, {0x02,  4}, {0x03,  3},
+        {0x03,  2}, {0x0a,  9}
+    }
+};
+
 #endif /* AVCODEC_MPEG4DATA_H */
diff --git a/libavcodec/mpeg4video.c b/libavcodec/mpeg4video.c
index b60cd4f..2aaa9f7 100644
--- a/libavcodec/mpeg4video.c
+++ b/libavcodec/mpeg4video.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4video.h b/libavcodec/mpeg4video.h
index 4a4995e..1a5da31 100644
--- a/libavcodec/mpeg4video.h
+++ b/libavcodec/mpeg4video.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,8 +41,13 @@
 #define NBIT_VO_TYPE             5
 #define ARTS_VO_TYPE            10
 #define ACE_VO_TYPE             12
+#define SIMPLE_STUDIO_VO_TYPE   14
+#define CORE_STUDIO_VO_TYPE     15
 #define ADV_SIMPLE_VO_TYPE      17
 
+#define VOT_VIDEO_ID 1
+#define VOT_STILL_TEXTURE_ID 2
+
 // aspect_ratio_info
 #define EXTENDED_PAR 15
 
@@ -58,6 +63,13 @@
 #define GOP_STARTCODE        0x1B3
 #define VISUAL_OBJ_STARTCODE 0x1B5
 #define VOP_STARTCODE        0x1B6
+#define SLICE_STARTCODE      0x1B7
+#define EXT_STARTCODE        0x1B8
+
+#define QUANT_MATRIX_EXT_ID  0x3
+
+/* smaller packets likely don't contain a real frame */
+#define MAX_NVOP_SIZE 19
 
 typedef struct Mpeg4DecContext {
     MpegEncContext m;
@@ -84,6 +96,7 @@ typedef struct Mpeg4DecContext {
     int enhancement_type;
     int scalability;
     int use_intra_dc_vlc;
+
     /// QP above which the ac VLC should be used for intra dc
     int intra_dc_threshold;
 
@@ -92,6 +105,7 @@ typedef struct Mpeg4DecContext {
     int divx_build;
     int xvid_build;
     int lavc_build;
+
     /// flag for having shown the warning about invalid Divx B-frames
     int showed_packed_warning;
     /** does the stream contain the low_delay flag,
@@ -100,8 +114,16 @@ typedef struct Mpeg4DecContext {
     int cplx_estimation_trash_i;
     int cplx_estimation_trash_p;
     int cplx_estimation_trash_b;
+
+    VLC studio_intra_tab[12];
+    VLC studio_luma_dc;
+    VLC studio_chroma_dc;
+
+    int rgb;
 } Mpeg4DecContext;
 
+static const uint8_t mpeg4_block_count[4] = {0, 6, 8, 12};
+
 /* dc encoding for MPEG-4 */
 extern const uint8_t ff_mpeg4_DCtab_lum[13][2];
 extern const uint8_t ff_mpeg4_DCtab_chrom[13][2];
@@ -129,15 +151,19 @@ extern const uint16_t ff_mpeg4_resync_prefix[8];
 
 extern const uint8_t ff_mpeg4_dc_threshold[8];
 
+extern const uint16_t ff_mpeg4_studio_dc_luma[19][2];
+extern const uint16_t ff_mpeg4_studio_dc_chroma[19][2];
+extern const uint16_t ff_mpeg4_studio_intra[12][22][2];
+
 void ff_mpeg4_encode_mb(MpegEncContext *s,
                         int16_t block[6][64],
                         int motion_x, int motion_y);
 void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n,
                       int dir);
 void ff_set_mpeg4_time(MpegEncContext *s);
-void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
+int ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
 
-int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb);
+int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb, int header);
 void ff_mpeg4_encode_video_packet_header(MpegEncContext *s);
 void ff_mpeg4_clean_buffers(MpegEncContext *s);
 void ff_mpeg4_stuffing(PutBitContext *pbc);
@@ -147,7 +173,10 @@ void ff_clean_mpeg4_qscales(MpegEncContext *s);
 int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx);
 int ff_mpeg4_get_video_packet_prefix_length(MpegEncContext *s);
 int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx);
+int ff_mpeg4_decode_studio_slice_header(Mpeg4DecContext *ctx);
 void ff_mpeg4_init_direct_mv(MpegEncContext *s);
+void ff_mpeg4videodec_static_init(void);
+int ff_mpeg4_workaround_bugs(AVCodecContext *avctx);
 int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size);
 
 /**
@@ -222,21 +251,21 @@ static inline int ff_mpeg4_pred_dc(MpegEncContext *s, int n, int level,
     } else {
         level += pred;
         ret    = level;
-        if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+    }
+    level *= scale;
+    if (level & (~2047)) {
+        if (!s->encoding && (s->avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_AGGRESSIVE))) {
             if (level < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "dc<0 at %dx%d\n", s->mb_x, s->mb_y);
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
-            if (level * scale > 2048 + scale) {
+            if (level > 2048 + scale) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "dc overflow at %dx%d\n", s->mb_x, s->mb_y);
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
         }
-    }
-    level *= scale;
-    if (level & (~2047)) {
         if (level < 0)
             level = 0;
         else if (!(s->workaround_bugs & FF_BUG_DC_CLIP))
diff --git a/libavcodec/mpeg4video_parser.c b/libavcodec/mpeg4video_parser.c
index e2203f9..9ca0f14 100644
--- a/libavcodec/mpeg4video_parser.c
+++ b/libavcodec/mpeg4video_parser.c
@@ -3,23 +3,25 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "internal.h"
 #include "parser.h"
 #include "mpegvideo.h"
@@ -44,7 +46,7 @@ int ff_mpeg4_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size)
     if (!vop_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if (state == 0x1B6) {
+            if (state == VOP_STARTCODE) {
                 i++;
                 vop_found = 1;
                 break;
@@ -59,6 +61,8 @@ int ff_mpeg4_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size)
         for (; i < buf_size; i++) {
             state = (state << 8) | buf[i];
             if ((state & 0xFFFFFF00) == 0x100) {
+                if (state == SLICE_STARTCODE || state == EXT_STARTCODE)
+                    continue;
                 pc->frame_start_found = 0;
                 pc->state             = -1;
                 return i - 3;
@@ -85,17 +89,26 @@ static int mpeg4_decode_header(AVCodecParserContext *s1, AVCodecContext *avctx,
 
     if (avctx->extradata_size && pc->first_picture) {
         init_get_bits(gb, avctx->extradata, avctx->extradata_size * 8);
-        ret = ff_mpeg4_decode_picture_header(dec_ctx, gb);
+        ret = ff_mpeg4_decode_picture_header(dec_ctx, gb, 1);
+        if (ret < 0)
+            av_log(avctx, AV_LOG_WARNING, "Failed to parse extradata\n");
     }
 
     init_get_bits(gb, buf, 8 * buf_size);
-    ret = ff_mpeg4_decode_picture_header(dec_ctx, gb);
+    ret = ff_mpeg4_decode_picture_header(dec_ctx, gb, 0);
     if (s->width && (!avctx->width || !avctx->height ||
                      !avctx->coded_width || !avctx->coded_height)) {
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
+    if((s1->flags & PARSER_FLAG_USE_CODEC_TS) && s->avctx->time_base.den>0 && ret>=0){
+        av_assert1(s1->pts == AV_NOPTS_VALUE);
+        av_assert1(s1->dts == AV_NOPTS_VALUE);
+
+        s1->pts = av_rescale_q(s->time, (AVRational){1, s->avctx->time_base.den}, (AVRational){1, 1200000});
+    }
+
     s1->pict_type     = s->pict_type;
     pc->first_picture = 0;
     return ret;
@@ -105,8 +118,12 @@ static av_cold int mpeg4video_parse_init(AVCodecParserContext *s)
 {
     struct Mp4vParseContext *pc = s->priv_data;
 
+    ff_mpeg4videodec_static_init();
+
     pc->first_picture           = 1;
+    pc->dec_ctx.m.quant_precision     = 5;
     pc->dec_ctx.m.slice_context_count = 1;
+    pc->dec_ctx.showed_packed_warning = 1;
     return 0;
 }
 
diff --git a/libavcodec/mpeg4video_parser.h b/libavcodec/mpeg4video_parser.h
index 030a276..8008e69 100644
--- a/libavcodec/mpeg4video_parser.h
+++ b/libavcodec/mpeg4video_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index 566fd3a..b6f2ae7 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -3,23 +3,28 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
 #include "error_resilience.h"
 #include "hwaccel.h"
 #include "idctdsp.h"
@@ -32,6 +37,7 @@
 #include "profiles.h"
 #include "thread.h"
 #include "xvididct.h"
+#include "unary.h"
 
 /* The defines below define the number of bits that are read at once for
  * reading vlc values. Changing these may improve speed and data cache needs
@@ -40,6 +46,9 @@
 #define SPRITE_TRAJ_VLC_BITS 6
 #define DC_VLC_BITS 9
 #define MB_TYPE_B_VLC_BITS 4
+#define STUDIO_INTRA_BITS 9
+
+static int decode_studio_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb);
 
 static VLC dc_lum, dc_chrom;
 static VLC sprite_trajectory;
@@ -52,15 +61,6 @@ static const int mb_type_b_map[4] = {
     MB_TYPE_L0      | MB_TYPE_16x16,
 };
 
-static inline int check_marker(AVCodecContext *avctx, GetBitContext *s, const char *msg)
-{
-    int bit = get_bits1(s);
-    if (!bit)
-        av_log(avctx, AV_LOG_INFO, "Marker bit missing %s\n", msg);
-
-    return bit;
-}
-
 /**
  * Predict the ac.
  * @param n block index (0-3 are luma, 4-5 are chroma)
@@ -73,7 +73,7 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
     int8_t *const qscale_table = s->current_picture.qscale_table;
 
     /* find prediction */
-    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val  = &s->ac_val[0][0][0] + s->block_index[n] * 16;
     ac_val1 = ac_val;
     if (s->ac_pred) {
         if (dir == 0) {
@@ -121,12 +121,13 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
  * check if the next stuff is a resync marker or the end.
  * @return 0 if not
  */
-static inline int mpeg4_is_resync(MpegEncContext *s)
+static inline int mpeg4_is_resync(Mpeg4DecContext *ctx)
 {
+    MpegEncContext *s = &ctx->m;
     int bits_count = get_bits_count(&s->gb);
     int v          = show_bits(&s->gb, 16);
 
-    if (s->workaround_bugs & FF_BUG_NO_PADDING)
+    if (s->workaround_bugs & FF_BUG_NO_PADDING && !ctx->resync_marker)
         return 0;
 
     while (v <= 0xFF) {
@@ -143,10 +144,11 @@ static inline int mpeg4_is_resync(MpegEncContext *s)
         v  |= 0x7F >> (7 - (bits_count & 7));
 
         if (v == 0x7F)
-            return 1;
+            return s->mb_num;
     } else {
         if (v == ff_mpeg4_resync_prefix[bits_count & 7]) {
-            int len;
+            int len, mb_num;
+            int mb_num_bits = av_log2(s->mb_num - 1) + 1;
             GetBitContext gb = s->gb;
 
             skip_bits(&s->gb, 1);
@@ -156,10 +158,14 @@ static inline int mpeg4_is_resync(MpegEncContext *s)
                 if (get_bits1(&s->gb))
                     break;
 
+            mb_num = get_bits(&s->gb, mb_num_bits);
+            if (!mb_num || mb_num > s->mb_num || get_bits_count(&s->gb)+6 > s->gb.size_in_bits)
+                mb_num= -1;
+
             s->gb = gb;
 
             if (len >= ff_mpeg4_get_video_packet_prefix_length(s))
-                return 1;
+                return mb_num;
         }
     }
     return 0;
@@ -171,13 +177,15 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
     int a     = 2 << s->sprite_warping_accuracy;
     int rho   = 3  - s->sprite_warping_accuracy;
     int r     = 16 / a;
-    int alpha = 0;
+    int alpha = 1;
     int beta  = 0;
     int w     = s->width;
     int h     = s->height;
     int min_ab, i, w2, h2, w3, h3;
     int sprite_ref[4][2];
     int virtual_ref[2][2];
+    int64_t sprite_offset[2][2];
+    int64_t sprite_delta[2][2];
 
     // only true for rectangle shapes
     const int vop_ref[4][2] = { { 0, 0 },         { s->width, 0 },
@@ -196,17 +204,17 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
         int x = 0, y = 0;
 
         length = get_vlc2(gb, sprite_trajectory.table, SPRITE_TRAJ_VLC_BITS, 3);
-        if (length)
+        if (length > 0)
             x = get_xbits(gb, length);
 
         if (!(ctx->divx_version == 500 && ctx->divx_build == 413))
-            skip_bits1(gb);     /* marker bit */
+            check_marker(s->avctx, gb, "before sprite_trajectory");
 
         length = get_vlc2(gb, sprite_trajectory.table, SPRITE_TRAJ_VLC_BITS, 3);
-        if (length)
+        if (length > 0)
             y = get_xbits(gb, length);
 
-        skip_bits1(gb);         /* marker bit */
+        check_marker(s->avctx, gb, "after sprite_trajectory");
         ctx->sprite_traj[i][0] = d[i][0] = x;
         ctx->sprite_traj[i][1] = d[i][1] = y;
     }
@@ -246,71 +254,71 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
      * from w&h based to w2&h2 based which are of the 2^x form. */
     virtual_ref[0][0] = 16 * (vop_ref[0][0] + w2) +
                          ROUNDED_DIV(((w - w2) *
-                                      (r * sprite_ref[0][0] - 16 * vop_ref[0][0]) +
-                                      w2 * (r * sprite_ref[1][0] - 16 * vop_ref[1][0])), w);
+                                           (r * sprite_ref[0][0] - 16LL * vop_ref[0][0]) +
+                                      w2 * (r * sprite_ref[1][0] - 16LL * vop_ref[1][0])), w);
     virtual_ref[0][1] = 16 * vop_ref[0][1] +
                         ROUNDED_DIV(((w - w2) *
-                                     (r * sprite_ref[0][1] - 16 * vop_ref[0][1]) +
-                                     w2 * (r * sprite_ref[1][1] - 16 * vop_ref[1][1])), w);
+                                          (r * sprite_ref[0][1] - 16LL * vop_ref[0][1]) +
+                                     w2 * (r * sprite_ref[1][1] - 16LL * vop_ref[1][1])), w);
     virtual_ref[1][0] = 16 * vop_ref[0][0] +
-                        ROUNDED_DIV(((h - h2) * (r * sprite_ref[0][0] - 16 * vop_ref[0][0]) +
-                                     h2 * (r * sprite_ref[2][0] - 16 * vop_ref[2][0])), h);
+                        ROUNDED_DIV(((h - h2) * (r * sprite_ref[0][0] - 16LL * vop_ref[0][0]) +
+                                           h2 * (r * sprite_ref[2][0] - 16LL * vop_ref[2][0])), h);
     virtual_ref[1][1] = 16 * (vop_ref[0][1] + h2) +
-                        ROUNDED_DIV(((h - h2) * (r * sprite_ref[0][1] - 16 * vop_ref[0][1]) +
-                                     h2 * (r * sprite_ref[2][1] - 16 * vop_ref[2][1])), h);
+                        ROUNDED_DIV(((h - h2) * (r * sprite_ref[0][1] - 16LL * vop_ref[0][1]) +
+                                           h2 * (r * sprite_ref[2][1] - 16LL * vop_ref[2][1])), h);
 
     switch (ctx->num_sprite_warping_points) {
     case 0:
-        s->sprite_offset[0][0] =
-        s->sprite_offset[0][1] =
-        s->sprite_offset[1][0] =
-        s->sprite_offset[1][1] = 0;
-        s->sprite_delta[0][0]  = a;
-        s->sprite_delta[0][1]  =
-        s->sprite_delta[1][0]  = 0;
-        s->sprite_delta[1][1]  = a;
+        sprite_offset[0][0]    =
+        sprite_offset[0][1]    =
+        sprite_offset[1][0]    =
+        sprite_offset[1][1]    = 0;
+        sprite_delta[0][0]     = a;
+        sprite_delta[0][1]     =
+        sprite_delta[1][0]     = 0;
+        sprite_delta[1][1]     = a;
         ctx->sprite_shift[0]   =
         ctx->sprite_shift[1]   = 0;
         break;
     case 1:     // GMC only
-        s->sprite_offset[0][0] = sprite_ref[0][0] - a * vop_ref[0][0];
-        s->sprite_offset[0][1] = sprite_ref[0][1] - a * vop_ref[0][1];
-        s->sprite_offset[1][0] = ((sprite_ref[0][0] >> 1) | (sprite_ref[0][0] & 1)) -
+        sprite_offset[0][0]    = sprite_ref[0][0] - a * vop_ref[0][0];
+        sprite_offset[0][1]    = sprite_ref[0][1] - a * vop_ref[0][1];
+        sprite_offset[1][0]    = ((sprite_ref[0][0] >> 1) | (sprite_ref[0][0] & 1)) -
                                  a * (vop_ref[0][0] / 2);
-        s->sprite_offset[1][1] = ((sprite_ref[0][1] >> 1) | (sprite_ref[0][1] & 1)) -
+        sprite_offset[1][1]    = ((sprite_ref[0][1] >> 1) | (sprite_ref[0][1] & 1)) -
                                  a * (vop_ref[0][1] / 2);
-        s->sprite_delta[0][0]  = a;
-        s->sprite_delta[0][1]  =
-        s->sprite_delta[1][0]  = 0;
-        s->sprite_delta[1][1]  = a;
+        sprite_delta[0][0]     = a;
+        sprite_delta[0][1]     =
+        sprite_delta[1][0]     = 0;
+        sprite_delta[1][1]     = a;
         ctx->sprite_shift[0]   =
         ctx->sprite_shift[1]   = 0;
         break;
     case 2:
-        s->sprite_offset[0][0] = (sprite_ref[0][0] << (alpha + rho)) +
-                                 (-r * sprite_ref[0][0] + virtual_ref[0][0]) *
-                                 (-vop_ref[0][0]) +
-                                 (r * sprite_ref[0][1] - virtual_ref[0][1]) *
-                                 (-vop_ref[0][1]) + (1 << (alpha + rho - 1));
-        s->sprite_offset[0][1] = (sprite_ref[0][1] << (alpha + rho)) +
-                                 (-r * sprite_ref[0][1] + virtual_ref[0][1]) *
-                                 (-vop_ref[0][0]) +
-                                 (-r * sprite_ref[0][0] + virtual_ref[0][0]) *
-                                 (-vop_ref[0][1]) + (1 << (alpha + rho - 1));
-        s->sprite_offset[1][0] = ((-r * sprite_ref[0][0] + virtual_ref[0][0]) *
-                                  (-2 * vop_ref[0][0] + 1) +
-                                  (r * sprite_ref[0][1] - virtual_ref[0][1]) *
-                                  (-2 * vop_ref[0][1] + 1) + 2 * w2 * r *
-                                  sprite_ref[0][0] - 16 * w2 + (1 << (alpha + rho + 1)));
-        s->sprite_offset[1][1] = ((-r * sprite_ref[0][1] + virtual_ref[0][1]) *
-                                  (-2 * vop_ref[0][0] + 1) +
-                                  (-r * sprite_ref[0][0] + virtual_ref[0][0]) *
-                                  (-2 * vop_ref[0][1] + 1) + 2 * w2 * r *
-                                  sprite_ref[0][1] - 16 * w2 + (1 << (alpha + rho + 1)));
-        s->sprite_delta[0][0] = (-r * sprite_ref[0][0] + virtual_ref[0][0]);
-        s->sprite_delta[0][1] = (+r * sprite_ref[0][1] - virtual_ref[0][1]);
-        s->sprite_delta[1][0] = (-r * sprite_ref[0][1] + virtual_ref[0][1]);
-        s->sprite_delta[1][1] = (-r * sprite_ref[0][0] + virtual_ref[0][0]);
+        sprite_offset[0][0]    = ((int64_t)      sprite_ref[0][0] * (1 << alpha + rho)) +
+                                 ((int64_t) -r * sprite_ref[0][0] + virtual_ref[0][0]) *
+                                 ((int64_t)        -vop_ref[0][0]) +
+                                 ((int64_t)  r * sprite_ref[0][1] - virtual_ref[0][1]) *
+                                 ((int64_t)        -vop_ref[0][1]) + (1 << (alpha + rho - 1));
+        sprite_offset[0][1]    = ((int64_t)      sprite_ref[0][1] * (1 << alpha + rho)) +
+                                 ((int64_t) -r * sprite_ref[0][1] + virtual_ref[0][1]) *
+                                 ((int64_t)        -vop_ref[0][0]) +
+                                 ((int64_t) -r * sprite_ref[0][0] + virtual_ref[0][0]) *
+                                 ((int64_t)        -vop_ref[0][1]) + (1 << (alpha + rho - 1));
+        sprite_offset[1][0]    = (((int64_t)-r * sprite_ref[0][0] + virtual_ref[0][0]) *
+                                  ((int64_t)-2 *    vop_ref[0][0] + 1) +
+                                  ((int64_t) r * sprite_ref[0][1] - virtual_ref[0][1]) *
+                                  ((int64_t)-2 *    vop_ref[0][1] + 1) + 2 * w2 * r *
+                                   (int64_t)     sprite_ref[0][0] - 16 * w2 + (1 << (alpha + rho + 1)));
+        sprite_offset[1][1]    = (((int64_t)-r * sprite_ref[0][1] + virtual_ref[0][1]) *
+                                  ((int64_t)-2 *    vop_ref[0][0] + 1) +
+                                  ((int64_t)-r * sprite_ref[0][0] + virtual_ref[0][0]) *
+                                  ((int64_t)-2 *    vop_ref[0][1] + 1) + 2 * w2 * r *
+                                  (int64_t)      sprite_ref[0][1] - 16 * w2 + (1 << (alpha + rho + 1)));
+        sprite_delta[0][0] = (-r * sprite_ref[0][0] + virtual_ref[0][0]);
+        sprite_delta[0][1] = (+r * sprite_ref[0][1] - virtual_ref[0][1]);
+        sprite_delta[1][0] = (-r * sprite_ref[0][1] + virtual_ref[0][1]);
+        sprite_delta[1][1] = (-r * sprite_ref[0][0] + virtual_ref[0][0]);
 
         ctx->sprite_shift[0]  = alpha + rho;
         ctx->sprite_shift[1]  = alpha + rho + 2;
@@ -319,68 +327,116 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
         min_ab = FFMIN(alpha, beta);
         w3     = w2 >> min_ab;
         h3     = h2 >> min_ab;
-        s->sprite_offset[0][0] = (sprite_ref[0][0] << (alpha + beta + rho - min_ab)) +
-                                 (-r * sprite_ref[0][0] + virtual_ref[0][0]) *
-                                 h3 * (-vop_ref[0][0]) +
-                                 (-r * sprite_ref[0][0] + virtual_ref[1][0]) *
-                                 w3 * (-vop_ref[0][1]) +
-                                 (1 << (alpha + beta + rho - min_ab - 1));
-        s->sprite_offset[0][1] = (sprite_ref[0][1] << (alpha + beta + rho - min_ab)) +
-                                 (-r * sprite_ref[0][1] + virtual_ref[0][1]) *
-                                 h3 * (-vop_ref[0][0]) +
-                                 (-r * sprite_ref[0][1] + virtual_ref[1][1]) *
-                                 w3 * (-vop_ref[0][1]) +
-                                 (1 << (alpha + beta + rho - min_ab - 1));
-        s->sprite_offset[1][0] = (-r * sprite_ref[0][0] + virtual_ref[0][0]) *
-                                 h3 * (-2 * vop_ref[0][0] + 1) +
-                                 (-r * sprite_ref[0][0] + virtual_ref[1][0]) *
-                                 w3 * (-2 * vop_ref[0][1] + 1) + 2 * w2 * h3 *
-                                 r * sprite_ref[0][0] - 16 * w2 * h3 +
-                                 (1 << (alpha + beta + rho - min_ab + 1));
-        s->sprite_offset[1][1] = (-r * sprite_ref[0][1] + virtual_ref[0][1]) *
-                                 h3 * (-2 * vop_ref[0][0] + 1) +
-                                 (-r * sprite_ref[0][1] + virtual_ref[1][1]) *
-                                 w3 * (-2 * vop_ref[0][1] + 1) + 2 * w2 * h3 *
-                                 r * sprite_ref[0][1] - 16 * w2 * h3 +
-                                 (1 << (alpha + beta + rho - min_ab + 1));
-        s->sprite_delta[0][0] = (-r * sprite_ref[0][0] + virtual_ref[0][0]) * h3;
-        s->sprite_delta[0][1] = (-r * sprite_ref[0][0] + virtual_ref[1][0]) * w3;
-        s->sprite_delta[1][0] = (-r * sprite_ref[0][1] + virtual_ref[0][1]) * h3;
-        s->sprite_delta[1][1] = (-r * sprite_ref[0][1] + virtual_ref[1][1]) * w3;
+        sprite_offset[0][0]    = ((int64_t)sprite_ref[0][0] * (1 << (alpha + beta + rho - min_ab))) +
+                                 ((int64_t)-r * sprite_ref[0][0] + virtual_ref[0][0]) * h3 * (-vop_ref[0][0]) +
+                                 ((int64_t)-r * sprite_ref[0][0] + virtual_ref[1][0]) * w3 * (-vop_ref[0][1]) +
+                                 ((int64_t)1 << (alpha + beta + rho - min_ab - 1));
+        sprite_offset[0][1]    = ((int64_t)sprite_ref[0][1] * (1 << (alpha + beta + rho - min_ab))) +
+                                 ((int64_t)-r * sprite_ref[0][1] + virtual_ref[0][1]) * h3 * (-vop_ref[0][0]) +
+                                 ((int64_t)-r * sprite_ref[0][1] + virtual_ref[1][1]) * w3 * (-vop_ref[0][1]) +
+                                 ((int64_t)1 << (alpha + beta + rho - min_ab - 1));
+        sprite_offset[1][0]    = ((int64_t)-r * sprite_ref[0][0] + virtual_ref[0][0]) * h3 * (-2 * vop_ref[0][0] + 1) +
+                                 ((int64_t)-r * sprite_ref[0][0] + virtual_ref[1][0]) * w3 * (-2 * vop_ref[0][1] + 1) +
+                                  (int64_t)2 * w2 * h3 * r * sprite_ref[0][0] - 16 * w2 * h3 +
+                                 ((int64_t)1 << (alpha + beta + rho - min_ab + 1));
+        sprite_offset[1][1]    = ((int64_t)-r * sprite_ref[0][1] + virtual_ref[0][1]) * h3 * (-2 * vop_ref[0][0] + 1) +
+                                 ((int64_t)-r * sprite_ref[0][1] + virtual_ref[1][1]) * w3 * (-2 * vop_ref[0][1] + 1) +
+                                  (int64_t)2 * w2 * h3 * r * sprite_ref[0][1] - 16 * w2 * h3 +
+                                 ((int64_t)1 << (alpha + beta + rho - min_ab + 1));
+        sprite_delta[0][0] = (-r * (int64_t)sprite_ref[0][0] + virtual_ref[0][0]) * h3;
+        sprite_delta[0][1] = (-r * (int64_t)sprite_ref[0][0] + virtual_ref[1][0]) * w3;
+        sprite_delta[1][0] = (-r * (int64_t)sprite_ref[0][1] + virtual_ref[0][1]) * h3;
+        sprite_delta[1][1] = (-r * (int64_t)sprite_ref[0][1] + virtual_ref[1][1]) * w3;
 
         ctx->sprite_shift[0]  = alpha + beta + rho - min_ab;
         ctx->sprite_shift[1]  = alpha + beta + rho - min_ab + 2;
         break;
     }
     /* try to simplify the situation */
-    if (s->sprite_delta[0][0] == a << ctx->sprite_shift[0] &&
-        s->sprite_delta[0][1] == 0 &&
-        s->sprite_delta[1][0] == 0 &&
-        s->sprite_delta[1][1] == a << ctx->sprite_shift[0]) {
-        s->sprite_offset[0][0] >>= ctx->sprite_shift[0];
-        s->sprite_offset[0][1] >>= ctx->sprite_shift[0];
-        s->sprite_offset[1][0] >>= ctx->sprite_shift[1];
-        s->sprite_offset[1][1] >>= ctx->sprite_shift[1];
-        s->sprite_delta[0][0] = a;
-        s->sprite_delta[0][1] = 0;
-        s->sprite_delta[1][0] = 0;
-        s->sprite_delta[1][1] = a;
+    if (sprite_delta[0][0] == a << ctx->sprite_shift[0] &&
+        sprite_delta[0][1] == 0 &&
+        sprite_delta[1][0] == 0 &&
+        sprite_delta[1][1] == a << ctx->sprite_shift[0]) {
+        sprite_offset[0][0] >>= ctx->sprite_shift[0];
+        sprite_offset[0][1] >>= ctx->sprite_shift[0];
+        sprite_offset[1][0] >>= ctx->sprite_shift[1];
+        sprite_offset[1][1] >>= ctx->sprite_shift[1];
+        sprite_delta[0][0] = a;
+        sprite_delta[0][1] = 0;
+        sprite_delta[1][0] = 0;
+        sprite_delta[1][1] = a;
         ctx->sprite_shift[0] = 0;
         ctx->sprite_shift[1] = 0;
         s->real_sprite_warping_points = 1;
     } else {
         int shift_y = 16 - ctx->sprite_shift[0];
         int shift_c = 16 - ctx->sprite_shift[1];
+
         for (i = 0; i < 2; i++) {
-            s->sprite_offset[0][i] <<= shift_y;
-            s->sprite_offset[1][i] <<= shift_c;
-            s->sprite_delta[0][i]  <<= shift_y;
-            s->sprite_delta[1][i]  <<= shift_y;
+            if (shift_c < 0 || shift_y < 0 ||
+                FFABS(  sprite_offset[0][i]) >= INT_MAX >> shift_y  ||
+                FFABS(  sprite_offset[1][i]) >= INT_MAX >> shift_c  ||
+                FFABS(   sprite_delta[0][i]) >= INT_MAX >> shift_y  ||
+                FFABS(   sprite_delta[1][i]) >= INT_MAX >> shift_y
+            ) {
+                avpriv_request_sample(s->avctx, "Too large sprite shift, delta or offset");
+                goto overflow;
+            }
+        }
+
+        for (i = 0; i < 2; i++) {
+            sprite_offset[0][i]    *= 1 << shift_y;
+            sprite_offset[1][i]    *= 1 << shift_c;
+            sprite_delta[0][i]     *= 1 << shift_y;
+            sprite_delta[1][i]     *= 1 << shift_y;
             ctx->sprite_shift[i]     = 16;
+
+        }
+        for (i = 0; i < 2; i++) {
+            int64_t sd[2] = {
+                sprite_delta[i][0] - a * (1LL<<16),
+                sprite_delta[i][1] - a * (1LL<<16)
+            };
+
+            if (llabs(sprite_offset[0][i] + sprite_delta[i][0] * (w+16LL)) >= INT_MAX ||
+                llabs(sprite_offset[0][i] + sprite_delta[i][1] * (h+16LL)) >= INT_MAX ||
+                llabs(sprite_offset[0][i] + sprite_delta[i][0] * (w+16LL) + sprite_delta[i][1] * (h+16LL)) >= INT_MAX ||
+                llabs(sprite_delta[i][0] * (w+16LL)) >= INT_MAX ||
+                llabs(sprite_delta[i][1] * (h+16LL)) >= INT_MAX ||
+                llabs(sd[0]) >= INT_MAX ||
+                llabs(sd[1]) >= INT_MAX ||
+                llabs(sprite_offset[0][i] + sd[0] * (w+16LL)) >= INT_MAX ||
+                llabs(sprite_offset[0][i] + sd[1] * (h+16LL)) >= INT_MAX ||
+                llabs(sprite_offset[0][i] + sd[0] * (w+16LL) + sd[1] * (h+16LL)) >= INT_MAX
+            ) {
+                avpriv_request_sample(s->avctx, "Overflow on sprite points");
+                goto overflow;
+            }
         }
         s->real_sprite_warping_points = ctx->num_sprite_warping_points;
     }
 
+    for (i = 0; i < 4; i++) {
+        s->sprite_offset[i&1][i>>1] = sprite_offset[i&1][i>>1];
+        s->sprite_delta [i&1][i>>1] = sprite_delta [i&1][i>>1];
+    }
+
+    return 0;
+overflow:
+    memset(s->sprite_offset, 0, sizeof(s->sprite_offset));
+    memset(s->sprite_delta, 0, sizeof(s->sprite_delta));
+    return AVERROR_PATCHWELCOME;
+}
+
+static int decode_new_pred(Mpeg4DecContext *ctx, GetBitContext *gb) {
+    MpegEncContext *s = &ctx->m;
+    int len = FFMIN(ctx->time_increment_bits + 3, 15);
+
+    get_bits(gb, len);
+    if (get_bits1(gb))
+        get_bits(gb, len);
+    check_marker(s->avctx, gb, "after new_pred");
+
     return 0;
 }
 
@@ -397,7 +453,7 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
 
     /* is there enough space left for a video packet + header */
     if (get_bits_count(&s->gb) > s->gb.size_in_bits - 20)
-        return -1;
+        return AVERROR_INVALIDDATA;
 
     for (len = 0; len < 32; len++)
         if (get_bits1(&s->gb))
@@ -405,7 +461,7 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
 
     if (len != ff_mpeg4_get_video_packet_prefix_length(s)) {
         av_log(s->avctx, AV_LOG_ERROR, "marker does not match f_code\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     if (ctx->shape != RECT_SHAPE) {
@@ -414,23 +470,10 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
     }
 
     mb_num = get_bits(&s->gb, mb_num_bits);
-    if (mb_num >= s->mb_num) {
+    if (mb_num >= s->mb_num || !mb_num) {
         av_log(s->avctx, AV_LOG_ERROR,
                "illegal mb_num in video packet (%d %d) \n", mb_num, s->mb_num);
-        return -1;
-    }
-    if (s->pict_type == AV_PICTURE_TYPE_B) {
-        int mb_x = 0, mb_y = 0;
-
-        while (s->next_picture.mbskip_table[s->mb_index2xy[mb_num]]) {
-            if (!mb_x)
-                ff_thread_await_progress(&s->next_picture_ptr->tf, mb_y++, 0);
-            mb_num++;
-            if (++mb_x == s->mb_width)
-                mb_x = 0;
-        }
-        if (mb_num >= s->mb_num)
-            return -1;  // slice contains just skipped MBs (already decoded)
+        return AVERROR_INVALIDDATA;
     }
 
     s->mb_x = mb_num % s->mb_width;
@@ -484,7 +527,57 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
             }
         }
     }
-    // FIXME new-pred stuff
+    if (ctx->new_pred)
+        decode_new_pred(ctx, &s->gb);
+
+    return 0;
+}
+
+static void reset_studio_dc_predictors(MpegEncContext *s)
+{
+    /* Reset DC Predictors */
+    s->last_dc[0] =
+    s->last_dc[1] =
+    s->last_dc[2] = 1 << (s->avctx->bits_per_raw_sample + s->dct_precision + s->intra_dc_precision - 1);
+}
+
+/**
+ * Decode the next video packet.
+ * @return <0 if something went wrong
+ */
+int ff_mpeg4_decode_studio_slice_header(Mpeg4DecContext *ctx)
+{
+    MpegEncContext *s = &ctx->m;
+    GetBitContext *gb = &s->gb;
+    unsigned vlc_len;
+    uint16_t mb_num;
+
+    if (get_bits_left(gb) >= 32 && get_bits_long(gb, 32) == SLICE_START_CODE) {
+        vlc_len = av_log2(s->mb_width * s->mb_height) + 1;
+        mb_num = get_bits(gb, vlc_len);
+
+        if (mb_num >= s->mb_num)
+            return AVERROR_INVALIDDATA;
+
+        s->mb_x = mb_num % s->mb_width;
+        s->mb_y = mb_num / s->mb_width;
+
+        if (ctx->shape != BIN_ONLY_SHAPE)
+            s->qscale = mpeg_get_qscale(s);
+
+        if (get_bits1(gb)) {  /* slice_extension_flag */
+            skip_bits1(gb);   /* intra_slice */
+            skip_bits1(gb);   /* slice_VOP_id_enable */
+            skip_bits(gb, 6); /* slice_VOP_id */
+            while (get_bits1(gb)) /* extra_bit_slice */
+                skip_bits(gb, 8); /* extra_information_slice */
+        }
+
+        reset_studio_dc_predictors(s);
+    }
+    else {
+        return AVERROR_INVALIDDATA;
+    }
 
     return 0;
 }
@@ -505,10 +598,10 @@ static inline int get_amv(Mpeg4DecContext *ctx, int n)
         len >>= s->quarter_sample;
 
     if (s->real_sprite_warping_points == 1) {
-        if (ctx->divx_version == 500 && ctx->divx_build == 413)
+        if (ctx->divx_version == 500 && ctx->divx_build == 413 && a >= s->quarter_sample)
             sum = s->sprite_offset[0][n] / (1 << (a - s->quarter_sample));
         else
-            sum = RSHIFT(s->sprite_offset[0][n] << s->quarter_sample, a);
+            sum = RSHIFT(s->sprite_offset[0][n] * (1 << s->quarter_sample), a);
     } else {
         dx    = s->sprite_delta[n][0];
         dy    = s->sprite_delta[n][1];
@@ -558,7 +651,7 @@ static inline int mpeg4_decode_dc(MpegEncContext *s, int n, int *dir_ptr)
 
     if (code < 0 || code > 9 /* && s->nbit < 9 */) {
         av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     if (code == 0) {
@@ -579,9 +672,9 @@ static inline int mpeg4_decode_dc(MpegEncContext *s, int n, int *dir_ptr)
 
         if (code > 8) {
             if (get_bits1(&s->gb) == 0) { /* marker */
-                if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)) {
                     av_log(s->avctx, AV_LOG_ERROR, "dc marker bit missing\n");
-                    return -1;
+                    return AVERROR_INVALIDDATA;
                 }
             }
         }
@@ -624,8 +717,8 @@ static int mpeg4_decode_partition_a(Mpeg4DecContext *ctx)
                     cbpc = get_vlc2(&s->gb, ff_h263_intra_MCBPC_vlc.table, INTRA_MCBPC_VLC_BITS, 2);
                     if (cbpc < 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
-                               "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
-                        return -1;
+                               "mcbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                        return AVERROR_INVALIDDATA;
                     }
                 } while (cbpc == 8);
 
@@ -645,7 +738,7 @@ static int mpeg4_decode_partition_a(Mpeg4DecContext *ctx)
                     if (dc < 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "DC corrupted at %d %d\n", s->mb_x, s->mb_y);
-                        return -1;
+                        return dc;
                     }
                     dir <<= 1;
                     if (dc_pred_dir)
@@ -696,8 +789,8 @@ try_again:
                 cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
                 if (cbpc < 0) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
-                    return -1;
+                           "mcbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                    return AVERROR_INVALIDDATA;
                 }
                 if (cbpc == 20)
                     goto try_again;
@@ -735,11 +828,11 @@ try_again:
                         if (!s->mcsel) {
                             mx = ff_h263_decode_motion(s, pred_x, s->f_code);
                             if (mx >= 0xffff)
-                                return -1;
+                                return AVERROR_INVALIDDATA;
 
                             my = ff_h263_decode_motion(s, pred_y, s->f_code);
                             if (my >= 0xffff)
-                                return -1;
+                                return AVERROR_INVALIDDATA;
                             s->current_picture.mb_type[xy] = MB_TYPE_16x16 |
                                                              MB_TYPE_L0;
                         } else {
@@ -766,11 +859,11 @@ try_again:
                             int16_t *mot_val = ff_h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                             mx = ff_h263_decode_motion(s, pred_x, s->f_code);
                             if (mx >= 0xffff)
-                                return -1;
+                                return AVERROR_INVALIDDATA;
 
                             my = ff_h263_decode_motion(s, pred_y, s->f_code);
                             if (my >= 0xffff)
-                                return -1;
+                                return AVERROR_INVALIDDATA;
                             mot_val[0] = mx;
                             mot_val[1] = my;
                         }
@@ -811,7 +904,7 @@ static int mpeg4_decode_partition_b(MpegEncContext *s, int mb_count)
                 if (cbpy < 0) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "cbpy corrupted at %d %d\n", s->mb_x, s->mb_y);
-                    return -1;
+                    return AVERROR_INVALIDDATA;
                 }
 
                 s->cbp_table[xy]               |= cbpy << 2;
@@ -826,7 +919,7 @@ static int mpeg4_decode_partition_b(MpegEncContext *s, int mb_count)
                     if (cbpy < 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "I cbpy corrupted at %d %d\n", s->mb_x, s->mb_y);
-                        return -1;
+                        return AVERROR_INVALIDDATA;
                     }
 
                     if (s->cbp_table[xy] & 8)
@@ -839,7 +932,7 @@ static int mpeg4_decode_partition_b(MpegEncContext *s, int mb_count)
                         if (dc < 0) {
                             av_log(s->avctx, AV_LOG_ERROR,
                                    "DC corrupted at %d %d\n", s->mb_x, s->mb_y);
-                            return -1;
+                            return dc;
                         }
                         dir <<= 1;
                         if (dc_pred_dir)
@@ -858,7 +951,7 @@ static int mpeg4_decode_partition_b(MpegEncContext *s, int mb_count)
                     if (cbpy < 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "P cbpy corrupted at %d %d\n", s->mb_x, s->mb_y);
-                        return -1;
+                        return AVERROR_INVALIDDATA;
                     }
 
                     if (s->cbp_table[xy] & 8)
@@ -885,21 +978,22 @@ int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx)
 {
     MpegEncContext *s = &ctx->m;
     int mb_num;
+    int ret;
     const int part_a_error = s->pict_type == AV_PICTURE_TYPE_I ? (ER_DC_ERROR | ER_MV_ERROR) : ER_MV_ERROR;
     const int part_a_end   = s->pict_type == AV_PICTURE_TYPE_I ? (ER_DC_END   | ER_MV_END)   : ER_MV_END;
 
     mb_num = mpeg4_decode_partition_a(ctx);
-    if (mb_num < 0) {
+    if (mb_num <= 0) {
         ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                         s->mb_x, s->mb_y, part_a_error);
-        return -1;
+        return mb_num ? mb_num : AVERROR_INVALIDDATA;
     }
 
     if (s->resync_mb_x + s->resync_mb_y * s->mb_width + mb_num > s->mb_num) {
         av_log(s->avctx, AV_LOG_ERROR, "slice below monitor ...\n");
         ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                         s->mb_x, s->mb_y, part_a_error);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     s->mb_num_left = mb_num;
@@ -911,7 +1005,7 @@ int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx)
             av_log(s->avctx, AV_LOG_ERROR,
                    "marker missing after first I partition at %d %d\n",
                    s->mb_x, s->mb_y);
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     } else {
         while (show_bits(&s->gb, 10) == 1)
@@ -920,17 +1014,18 @@ int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx)
             av_log(s->avctx, AV_LOG_ERROR,
                    "marker missing after first P partition at %d %d\n",
                    s->mb_x, s->mb_y);
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     }
     ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                     s->mb_x - 1, s->mb_y, part_a_end);
 
-    if (mpeg4_decode_partition_b(s, mb_num) < 0) {
+    ret = mpeg4_decode_partition_b(s, mb_num);
+    if (ret < 0) {
         if (s->pict_type == AV_PICTURE_TYPE_P)
             ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                             s->mb_x, s->mb_y, ER_DC_ERROR);
-        return -1;
+        return ret;
     } else {
         if (s->pict_type == AV_PICTURE_TYPE_P)
             ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
@@ -948,7 +1043,8 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                      int n, int coded, int intra, int rvlc)
 {
     MpegEncContext *s = &ctx->m;
-    int level, i, last, run, qmul, qadd, dc_pred_dir;
+    int level, i, last, run, qmul, qadd;
+    int av_uninit(dc_pred_dir);
     RLTable *rl;
     RL_VLC_ELEM *rl_vlc;
     const uint8_t *scan_table;
@@ -968,7 +1064,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
             } else {
                 level = mpeg4_decode_dc(s, n, &dc_pred_dir);
                 if (level < 0)
-                    return -1;
+                    return level;
             }
             block[0] = level;
             i        = 0;
@@ -1036,7 +1132,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                     if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "1. marker bit missing in rvlc esc\n");
-                        return -1;
+                        return AVERROR_INVALIDDATA;
                     }
                     SKIP_CACHE(re, &s->gb, 1);
 
@@ -1049,7 +1145,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                     if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "2. marker bit missing in rvlc esc\n");
-                        return -1;
+                        return AVERROR_INVALIDDATA;
                     }
                     SKIP_CACHE(re, &s->gb, 1);
 
@@ -1058,7 +1154,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
 
                     if (SHOW_UBITS(re, &s->gb, 5) != 0x10) {
                         av_log(s->avctx, AV_LOG_ERROR, "reverse esc missing\n");
-                        return -1;
+                        return AVERROR_INVALIDDATA;
                     }
                     SKIP_CACHE(re, &s->gb, 5);
 
@@ -1093,7 +1189,8 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                 if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                                     av_log(s->avctx, AV_LOG_ERROR,
                                            "1. marker bit missing in 3. esc\n");
-                                    return -1;
+                                    if (!(s->avctx->err_recognition & AV_EF_IGNORE_ERR))
+                                        return AVERROR_INVALIDDATA;
                                 }
                                 SKIP_CACHE(re, &s->gb, 1);
 
@@ -1103,24 +1200,47 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                 if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                                     av_log(s->avctx, AV_LOG_ERROR,
                                            "2. marker bit missing in 3. esc\n");
-                                    return -1;
+                                    if (!(s->avctx->err_recognition & AV_EF_IGNORE_ERR))
+                                        return AVERROR_INVALIDDATA;
                                 }
 
                                 SKIP_COUNTER(re, &s->gb, 1 + 12 + 1);
                             }
 
+#if 0
+                            if (s->error_recognition >= FF_ER_COMPLIANT) {
+                                const int abs_level= FFABS(level);
+                                if (abs_level<=MAX_LEVEL && run<=MAX_RUN) {
+                                    const int run1= run - rl->max_run[last][abs_level] - 1;
+                                    if (abs_level <= rl->max_level[last][run]) {
+                                        av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, vlc encoding possible\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
+                                    if (s->error_recognition > FF_ER_COMPLIANT) {
+                                        if (abs_level <= rl->max_level[last][run]*2) {
+                                            av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 1 encoding possible\n");
+                                            return AVERROR_INVALIDDATA;
+                                        }
+                                        if (run1 >= 0 && abs_level <= rl->max_level[last][run1]) {
+                                            av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 2 encoding possible\n");
+                                            return AVERROR_INVALIDDATA;
+                                        }
+                                    }
+                                }
+                            }
+#endif
                             if (level > 0)
                                 level = level * qmul + qadd;
                             else
                                 level = level * qmul - qadd;
 
                             if ((unsigned)(level + 2048) > 4095) {
-                                if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+                                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_AGGRESSIVE)) {
                                     if (level > 2560 || level < -2560) {
                                         av_log(s->avctx, AV_LOG_ERROR,
                                                "|level| overflow in 3. esc, qp=%d\n",
                                                s->qscale);
-                                        return -1;
+                                        return AVERROR_INVALIDDATA;
                                     }
                                 }
                                 level = level < 0 ? -2048 : 2047;
@@ -1152,12 +1272,13 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
                 LAST_SKIP_BITS(re, &s->gb, 1);
             }
+            ff_tlog(s->avctx, "dct[%d][%d] = %- 4d end?:%d\n", scan_table[i&63]&7, scan_table[i&63] >> 3, level, i>62);
             if (i > 62) {
                 i -= 192;
                 if (i & (~63)) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return -1;
+                    return AVERROR_INVALIDDATA;
                 }
 
                 block[scan_table[i]] = level;
@@ -1191,10 +1312,12 @@ not_coded:
  */
 static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])
 {
-    Mpeg4DecContext *ctx = (Mpeg4DecContext *)s;
+    Mpeg4DecContext *ctx = s->avctx->priv_data;
     int cbp, mb_type;
     const int xy = s->mb_x + s->mb_y * s->mb_stride;
 
+    av_assert2(s == (void*)ctx);
+
     mb_type = s->current_picture.mb_type[xy];
     cbp     = s->cbp_table[xy];
 
@@ -1252,7 +1375,7 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])
                 av_log(s->avctx, AV_LOG_ERROR,
                        "texture corrupted at %d %d %d\n",
                        s->mb_x, s->mb_y, s->mb_intra);
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
             cbp += cbp;
         }
@@ -1260,12 +1383,12 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])
 
     /* per-MB end of slice check */
     if (--s->mb_num_left <= 0) {
-        if (mpeg4_is_resync(s))
+        if (mpeg4_is_resync(ctx))
             return SLICE_END;
         else
             return SLICE_NOEND;
     } else {
-        if (mpeg4_is_resync(s)) {
+        if (mpeg4_is_resync(ctx)) {
             const int delta = s->mb_x + 1 == s->mb_width ? 2 : 1;
             if (s->cbp_table[xy + delta])
                 return SLICE_END;
@@ -1276,13 +1399,14 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])
 
 static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
 {
-    Mpeg4DecContext *ctx = (Mpeg4DecContext *)s;
+    Mpeg4DecContext *ctx = s->avctx->priv_data;
     int cbpc, cbpy, i, cbp, pred_x, pred_y, mx, my, dquant;
     int16_t *mot_val;
     static const int8_t quant_tab[4] = { -1, -2, 1, 2 };
     const int xy = s->mb_x + s->mb_y * s->mb_stride;
 
-    assert(s->h263_pred);
+    av_assert2(s ==  (void*)ctx);
+    av_assert2(s->h263_pred);
 
     if (s->pict_type == AV_PICTURE_TYPE_P ||
         s->pict_type == AV_PICTURE_TYPE_S) {
@@ -1318,8 +1442,8 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
             cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
             if (cbpc < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "cbpc damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
+                       "mcbpc damaged at %d %d\n", s->mb_x, s->mb_y);
+                return AVERROR_INVALIDDATA;
             }
         } while (cbpc == 20);
 
@@ -1335,6 +1459,11 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
         else
             s->mcsel = 0;
         cbpy = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1) ^ 0x0F;
+        if (cbpy < 0) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "P cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
+            return AVERROR_INVALIDDATA;
+        }
 
         cbp = (cbpc & 3) | (cbpy << 2);
         if (dquant)
@@ -1370,11 +1499,11 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
                 for (i = 0; i < 2; i++) {
                     mx = ff_h263_decode_motion(s, pred_x, s->f_code);
                     if (mx >= 0xffff)
-                        return -1;
+                        return AVERROR_INVALIDDATA;
 
                     my = ff_h263_decode_motion(s, pred_y / 2, s->f_code);
                     if (my >= 0xffff)
-                        return -1;
+                        return AVERROR_INVALIDDATA;
 
                     s->mv[0][i][0] = mx;
                     s->mv[0][i][1] = my;
@@ -1387,12 +1516,12 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
                 mx = ff_h263_decode_motion(s, pred_x, s->f_code);
 
                 if (mx >= 0xffff)
-                    return -1;
+                    return AVERROR_INVALIDDATA;
 
                 my = ff_h263_decode_motion(s, pred_y, s->f_code);
 
                 if (my >= 0xffff)
-                    return -1;
+                    return AVERROR_INVALIDDATA;
                 s->mv[0][0][0] = mx;
                 s->mv[0][0][1] = my;
             }
@@ -1403,11 +1532,11 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
                 mot_val = ff_h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                 mx      = ff_h263_decode_motion(s, pred_x, s->f_code);
                 if (mx >= 0xffff)
-                    return -1;
+                    return AVERROR_INVALIDDATA;
 
                 my = ff_h263_decode_motion(s, pred_y, s->f_code);
                 if (my >= 0xffff)
-                    return -1;
+                    return AVERROR_INVALIDDATA;
                 s->mv[0][i][0] = mx;
                 s->mv[0][i][1] = my;
                 mot_val[0]     = mx;
@@ -1463,7 +1592,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
             mb_type = get_vlc2(&s->gb, mb_type_b_vlc.table, MB_TYPE_B_VLC_BITS, 1);
             if (mb_type < 0) {
                 av_log(s->avctx, AV_LOG_ERROR, "illegal MB_type\n");
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
             mb_type = mb_type_b_map[mb_type];
             if (modb2) {
@@ -1574,7 +1703,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
             if (cbpc < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "I cbpc damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
         } while (cbpc == 8);
 
@@ -1592,7 +1721,7 @@ intra:
         if (cbpy < 0) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "I cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
         cbp = (cbpc & 3) | (cbpy << 2);
 
@@ -1608,7 +1737,7 @@ intra:
         /* decode each block */
         for (i = 0; i < 6; i++) {
             if (mpeg4_decode_block(ctx, block[i], i, cbp & 32, 1, 0) < 0)
-                return -1;
+                return AVERROR_INVALIDDATA;
             cbp += cbp;
         }
         goto end;
@@ -1617,27 +1746,30 @@ intra:
     /* decode each block */
     for (i = 0; i < 6; i++) {
         if (mpeg4_decode_block(ctx, block[i], i, cbp & 32, 0, 0) < 0)
-            return -1;
+            return AVERROR_INVALIDDATA;
         cbp += cbp;
     }
 
 end:
     /* per-MB end of slice check */
     if (s->codec_id == AV_CODEC_ID_MPEG4) {
-        if (mpeg4_is_resync(s)) {
-            const int delta = s->mb_x + 1 == s->mb_width ? 2 : 1;
+        int next = mpeg4_is_resync(ctx);
+        if (next) {
+            if        (s->mb_x + s->mb_y*s->mb_width + 1 >  next && (s->avctx->err_recognition & AV_EF_AGGRESSIVE)) {
+                return AVERROR_INVALIDDATA;
+            } else if (s->mb_x + s->mb_y*s->mb_width + 1 >= next)
+                return SLICE_END;
 
-            if (s->pict_type == AV_PICTURE_TYPE_B &&
-                s->next_picture.mbskip_table[xy + delta]) {
+            if (s->pict_type == AV_PICTURE_TYPE_B) {
+                const int delta= s->mb_x + 1 == s->mb_width ? 2 : 1;
                 ff_thread_await_progress(&s->next_picture_ptr->tf,
                                          (s->mb_x + delta >= s->mb_width)
                                          ? FFMIN(s->mb_y + 1, s->mb_height - 1)
                                          : s->mb_y, 0);
+                if (s->next_picture.mbskip_table[xy + delta])
+                    return SLICE_OK;
             }
 
-            if (s->pict_type == AV_PICTURE_TYPE_B &&
-                s->next_picture.mbskip_table[xy + delta])
-                return SLICE_OK;
             return SLICE_END;
         }
     }
@@ -1645,41 +1777,375 @@ end:
     return SLICE_OK;
 }
 
-static int mpeg4_decode_gop_header(MpegEncContext *s, GetBitContext *gb)
+/* As per spec, studio start code search isn't the same as the old type of start code */
+static void next_start_code_studio(GetBitContext *gb)
 {
-    int hours, minutes, seconds;
-    unsigned time_code = show_bits(gb, 18);
-
-    if (time_code & 0x40) {     /* marker_bit */
-        hours   = time_code >> 13;
-        minutes = time_code >> 7 & 0x3f;
-        seconds = time_code & 0x3f;
-        s->time_base = seconds + 60 * (minutes + 60 * hours);
-        skip_bits(gb, 20);      /* time_code, closed_gov, broken_link */
+    align_get_bits(gb);
+
+    while (get_bits_left(gb) >= 24 && show_bits_long(gb, 24) != 0x1) {
+        get_bits(gb, 8);
+    }
+}
+
+/* additional_code, vlc index */
+static const uint8_t ac_state_tab[22][2] =
+{
+    {0, 0},
+    {0, 1},
+    {1, 1},
+    {2, 1},
+    {3, 1},
+    {4, 1},
+    {5, 1},
+    {1, 2},
+    {2, 2},
+    {3, 2},
+    {4, 2},
+    {5, 2},
+    {6, 2},
+    {1, 3},
+    {2, 4},
+    {3, 5},
+    {4, 6},
+    {5, 7},
+    {6, 8},
+    {7, 9},
+    {8, 10},
+    {0, 11}
+};
+
+static int mpeg4_decode_studio_block(MpegEncContext *s, int32_t block[64], int n)
+{
+    Mpeg4DecContext *ctx = s->avctx->priv_data;
+
+    int cc, dct_dc_size, dct_diff, code, j, idx = 1, group = 0, run = 0,
+        additional_code_len, sign, mismatch;
+    VLC *cur_vlc = &ctx->studio_intra_tab[0];
+    uint8_t *const scantable = s->intra_scantable.permutated;
+    const uint16_t *quant_matrix;
+    uint32_t flc;
+    const int min = -1 *  (1 << (s->avctx->bits_per_raw_sample + 6));
+    const int max =      ((1 << (s->avctx->bits_per_raw_sample + 6)) - 1);
+
+    mismatch = 1;
+
+    memset(block, 0, 64 * sizeof(int32_t));
+
+    if (n < 4) {
+        cc = 0;
+        dct_dc_size = get_vlc2(&s->gb, ctx->studio_luma_dc.table, STUDIO_INTRA_BITS, 2);
+        quant_matrix = s->intra_matrix;
     } else {
-        av_log(s->avctx, AV_LOG_WARNING, "GOP header missing marker_bit\n");
+        cc = (n & 1) + 1;
+        if (ctx->rgb)
+            dct_dc_size = get_vlc2(&s->gb, ctx->studio_luma_dc.table, STUDIO_INTRA_BITS, 2);
+        else
+            dct_dc_size = get_vlc2(&s->gb, ctx->studio_chroma_dc.table, STUDIO_INTRA_BITS, 2);
+        quant_matrix = s->chroma_intra_matrix;
     }
 
+    if (dct_dc_size < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "illegal dct_dc_size vlc\n");
+        return AVERROR_INVALIDDATA;
+    } else if (dct_dc_size == 0) {
+        dct_diff = 0;
+    } else {
+        dct_diff = get_xbits(&s->gb, dct_dc_size);
+
+        if (dct_dc_size > 8) {
+            if(!check_marker(s->avctx, &s->gb, "dct_dc_size > 8"))
+                return AVERROR_INVALIDDATA;
+        }
+
+    }
+
+    s->last_dc[cc] += dct_diff;
+
+    if (s->mpeg_quant)
+        block[0] = s->last_dc[cc] * (8 >> s->intra_dc_precision);
+    else
+        block[0] = s->last_dc[cc] * (8 >> s->intra_dc_precision) * (8 >> s->dct_precision);
+    /* TODO: support mpeg_quant for AC coefficients */
+
+    block[0] = av_clip(block[0], min, max);
+    mismatch ^= block[0];
+
+    /* AC Coefficients */
+    while (1) {
+        group = get_vlc2(&s->gb, cur_vlc->table, STUDIO_INTRA_BITS, 2);
+
+        if (group < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "illegal ac coefficient group vlc\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        additional_code_len = ac_state_tab[group][0];
+        cur_vlc = &ctx->studio_intra_tab[ac_state_tab[group][1]];
+
+        if (group == 0) {
+            /* End of Block */
+            break;
+        } else if (group >= 1 && group <= 6) {
+            /* Zero run length (Table B.47) */
+            run = 1 << additional_code_len;
+            if (additional_code_len)
+                run += get_bits(&s->gb, additional_code_len);
+            idx += run;
+            continue;
+        } else if (group >= 7 && group <= 12) {
+            /* Zero run length and +/-1 level (Table B.48) */
+            code = get_bits(&s->gb, additional_code_len);
+            sign = code & 1;
+            code >>= 1;
+            run = (1 << (additional_code_len - 1)) + code;
+            idx += run;
+            if (idx > 63)
+                return AVERROR_INVALIDDATA;
+            j = scantable[idx++];
+            block[j] = sign ? 1 : -1;
+        } else if (group >= 13 && group <= 20) {
+            /* Level value (Table B.49) */
+            if (idx > 63)
+                return AVERROR_INVALIDDATA;
+            j = scantable[idx++];
+            block[j] = get_xbits(&s->gb, additional_code_len);
+        } else if (group == 21) {
+            /* Escape */
+            if (idx > 63)
+                return AVERROR_INVALIDDATA;
+            j = scantable[idx++];
+            additional_code_len = s->avctx->bits_per_raw_sample + s->dct_precision + 4;
+            flc = get_bits(&s->gb, additional_code_len);
+            if (flc >> (additional_code_len-1))
+                block[j] = -1 * (( flc ^ ((1 << additional_code_len) -1)) + 1);
+            else
+                block[j] = flc;
+        }
+        block[j] = ((8 * 2 * block[j] * quant_matrix[j] * s->qscale) >> s->dct_precision) / 32;
+        block[j] = av_clip(block[j], min, max);
+        mismatch ^= block[j];
+    }
+
+    block[63] ^= mismatch & 1;
+
     return 0;
 }
 
-static int mpeg4_decode_profile_level(MpegEncContext *s, GetBitContext *gb)
+static int mpeg4_decode_dpcm_macroblock(MpegEncContext *s, int16_t macroblock[256], int n)
 {
-    int profile_and_level_indication;
+    int i, j, w, h, idx = 0;
+    int block_mean, rice_parameter, rice_prefix_code, rice_suffix_code,
+        dpcm_residual, left, top, topleft, min_left_top, max_left_top, p, p2, output;
+    h = 16 >> (n ? s->chroma_y_shift : 0);
+    w = 16 >> (n ? s->chroma_x_shift : 0);
+
+    block_mean = get_bits(&s->gb, s->avctx->bits_per_raw_sample);
+    if (block_mean == 0){
+        av_log(s->avctx, AV_LOG_ERROR, "Forbidden block_mean\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->last_dc[n] = block_mean * (1 << (s->dct_precision + s->intra_dc_precision));
 
-    profile_and_level_indication = get_bits(gb, 8);
+    rice_parameter = get_bits(&s->gb, 4);
+    if (rice_parameter == 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Forbidden rice_parameter\n");
+        return AVERROR_INVALIDDATA;
+    }
 
-    s->avctx->profile = (profile_and_level_indication & 0xf0) >> 4;
-    s->avctx->level   = (profile_and_level_indication & 0x0f);
+    if (rice_parameter == 15)
+        rice_parameter = 0;
+
+    if (rice_parameter > 11) {
+        av_log(s->avctx, AV_LOG_ERROR, "Forbidden rice_parameter\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i < h; i++) {
+        output = 1 << (s->avctx->bits_per_raw_sample - 1);
+        top = 1 << (s->avctx->bits_per_raw_sample - 1);
+
+        for (j = 0; j < w; j++) {
+            left = output;
+            topleft = top;
+
+            rice_prefix_code = get_unary(&s->gb, 1, 12);
+
+            /* Escape */
+            if (rice_prefix_code == 11)
+                dpcm_residual = get_bits(&s->gb, s->avctx->bits_per_raw_sample);
+            else {
+                if (rice_prefix_code == 12) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Forbidden rice_prefix_code\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                rice_suffix_code = get_bitsz(&s->gb, rice_parameter);
+                dpcm_residual = (rice_prefix_code << rice_parameter) + rice_suffix_code;
+            }
+
+            /* Map to a signed residual */
+            if (dpcm_residual & 1)
+                dpcm_residual = (-1 * dpcm_residual) >> 1;
+            else
+                dpcm_residual = (dpcm_residual >> 1);
+
+            if (i != 0)
+                top = macroblock[idx-w];
+
+            p = left + top - topleft;
+            min_left_top = FFMIN(left, top);
+            if (p < min_left_top)
+                p = min_left_top;
+
+            max_left_top = FFMAX(left, top);
+            if (p > max_left_top)
+                p = max_left_top;
+
+            p2 = (FFMIN(min_left_top, topleft) + FFMAX(max_left_top, topleft)) >> 1;
+            if (p2 == p)
+                p2 = block_mean;
+
+            if (p2 > p)
+                dpcm_residual *= -1;
+
+            macroblock[idx++] = output = (dpcm_residual + p) & ((1 << s->avctx->bits_per_raw_sample) - 1);
+        }
+    }
+
+    return 0;
+}
+
+static int mpeg4_decode_studio_mb(MpegEncContext *s, int16_t block_[12][64])
+{
+    int i;
+
+    s->dpcm_direction = 0;
+
+    /* StudioMacroblock */
+    /* Assumes I-VOP */
+    s->mb_intra = 1;
+    if (get_bits1(&s->gb)) { /* compression_mode */
+        /* DCT */
+        /* macroblock_type, 1 or 2-bit VLC */
+        if (!get_bits1(&s->gb)) {
+            skip_bits1(&s->gb);
+            s->qscale = mpeg_get_qscale(s);
+        }
+
+        for (i = 0; i < mpeg4_block_count[s->chroma_format]; i++) {
+            if (mpeg4_decode_studio_block(s, (*s->block32)[i], i) < 0)
+                return AVERROR_INVALIDDATA;
+        }
+    } else {
+        /* DPCM */
+        check_marker(s->avctx, &s->gb, "DPCM block start");
+        s->dpcm_direction = get_bits1(&s->gb) ? -1 : 1;
+        for (i = 0; i < 3; i++) {
+            if (mpeg4_decode_dpcm_macroblock(s, (*s->dpcm_macroblock)[i], i) < 0)
+                return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (get_bits_left(&s->gb) >= 24 && show_bits(&s->gb, 23) == 0) {
+        next_start_code_studio(&s->gb);
+        return SLICE_END;
+    }
+
+    //vcon-stp9L1.bits (first frame)
+    if (get_bits_left(&s->gb) == 0)
+        return SLICE_END;
+
+    //vcon-stp2L1.bits, vcon-stp3L1.bits, vcon-stp6L1.bits, vcon-stp7L1.bits, vcon-stp8L1.bits, vcon-stp10L1.bits (first frame)
+    if (get_bits_left(&s->gb) < 8U && show_bits(&s->gb, get_bits_left(&s->gb)) == 0)
+        return SLICE_END;
+
+    return SLICE_OK;
+}
+
+static int mpeg4_decode_gop_header(MpegEncContext *s, GetBitContext *gb)
+{
+    int hours, minutes, seconds;
+
+    if (!show_bits(gb, 23)) {
+        av_log(s->avctx, AV_LOG_WARNING, "GOP header invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    hours   = get_bits(gb, 5);
+    minutes = get_bits(gb, 6);
+    check_marker(s->avctx, gb, "in gop_header");
+    seconds = get_bits(gb, 6);
+
+    s->time_base = seconds + 60*(minutes + 60*hours);
+
+    skip_bits1(gb);
+    skip_bits1(gb);
+
+    return 0;
+}
+
+static int mpeg4_decode_profile_level(MpegEncContext *s, GetBitContext *gb, int *profile, int *level)
+{
+
+    *profile = get_bits(gb, 4);
+    *level   = get_bits(gb, 4);
 
     // for Simple profile, level 0
-    if (s->avctx->profile == 0 && s->avctx->level == 8) {
-        s->avctx->level = 0;
+    if (*profile == 0 && *level == 8) {
+        *level = 0;
+    }
+
+    return 0;
+}
+
+static int mpeg4_decode_visual_object(MpegEncContext *s, GetBitContext *gb)
+{
+    int visual_object_type;
+    int is_visual_object_identifier = get_bits1(gb);
+
+    if (is_visual_object_identifier) {
+        skip_bits(gb, 4+3);
+    }
+    visual_object_type = get_bits(gb, 4);
+
+    if (visual_object_type == VOT_VIDEO_ID ||
+        visual_object_type == VOT_STILL_TEXTURE_ID) {
+        int video_signal_type = get_bits1(gb);
+        if (video_signal_type) {
+            int video_range, color_description;
+            skip_bits(gb, 3); // video_format
+            video_range = get_bits1(gb);
+            color_description = get_bits1(gb);
+
+            s->avctx->color_range = video_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
+
+            if (color_description) {
+                s->avctx->color_primaries = get_bits(gb, 8);
+                s->avctx->color_trc       = get_bits(gb, 8);
+                s->avctx->colorspace      = get_bits(gb, 8);
+            }
+        }
     }
 
     return 0;
 }
 
+static void mpeg4_load_default_matrices(MpegEncContext *s)
+{
+    int i, v;
+
+    /* load default matrices */
+    for (i = 0; i < 64; i++) {
+        int j = s->idsp.idct_permutation[i];
+        v = ff_mpeg4_default_intra_matrix[i];
+        s->intra_matrix[j]        = v;
+        s->chroma_intra_matrix[j] = v;
+
+        v = ff_mpeg4_default_non_intra_matrix[i];
+        s->inter_matrix[j]        = v;
+        s->chroma_inter_matrix[j] = v;
+    }
+}
+
 static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 {
     MpegEncContext *s = &ctx->m;
@@ -1688,6 +2154,23 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     /* vol header */
     skip_bits(gb, 1);                   /* random access */
     s->vo_type = get_bits(gb, 8);
+
+    /* If we are in studio profile (per vo_type), check if its all consistent
+     * and if so continue pass control to decode_studio_vol_header().
+     * elIf something is inconsistent, error out
+     * else continue with (non studio) vol header decpoding.
+     */
+    if (s->vo_type == CORE_STUDIO_VO_TYPE ||
+        s->vo_type == SIMPLE_STUDIO_VO_TYPE) {
+        if (s->avctx->profile != FF_PROFILE_UNKNOWN && s->avctx->profile != FF_PROFILE_MPEG4_SIMPLE_STUDIO)
+            return AVERROR_INVALIDDATA;
+        s->studio_profile = 1;
+        s->avctx->profile = FF_PROFILE_MPEG4_SIMPLE_STUDIO;
+        return decode_studio_vol_header(ctx, gb);
+    } else if (s->studio_profile) {
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (get_bits1(gb) != 0) {           /* is_ol_id */
         vo_ver_id = get_bits(gb, 4);    /* vo_ver_id */
         skip_bits(gb, 3);               /* vo_priority */
@@ -1710,22 +2193,30 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         s->low_delay = get_bits1(gb);
         if (get_bits1(gb)) {    /* vbv parameters */
             get_bits(gb, 15);   /* first_half_bitrate */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after first_half_bitrate");
             get_bits(gb, 15);   /* latter_half_bitrate */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after latter_half_bitrate");
             get_bits(gb, 15);   /* first_half_vbv_buffer_size */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after first_half_vbv_buffer_size");
             get_bits(gb, 3);    /* latter_half_vbv_buffer_size */
             get_bits(gb, 11);   /* first_half_vbv_occupancy */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after first_half_vbv_occupancy");
             get_bits(gb, 15);   /* latter_half_vbv_occupancy */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after latter_half_vbv_occupancy");
         }
     } else {
         /* is setting low delay flag only once the smartest thing to do?
          * low delay detection will not be overridden. */
-        if (s->picture_number == 0)
-            s->low_delay = 0;
+        if (s->picture_number == 0) {
+            switch(s->vo_type) {
+            case SIMPLE_VO_TYPE:
+            case ADV_SIMPLE_VO_TYPE:
+                s->low_delay = 1;
+                break;
+            default:
+                s->low_delay = 0;
+            }
+        }
     }
 
     ctx->shape = get_bits(gb, 2); /* vol shape */
@@ -1741,7 +2232,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     s->avctx->framerate.num = get_bits(gb, 16);
     if (!s->avctx->framerate.num) {
         av_log(s->avctx, AV_LOG_ERROR, "framerate==0\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     ctx->time_increment_bits = av_log2(s->avctx->framerate.num - 1) + 1;
@@ -1755,15 +2246,17 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     else
         s->avctx->framerate.den = 1;
 
+    s->avctx->time_base = av_inv_q(av_mul_q(s->avctx->framerate, (AVRational){s->avctx->ticks_per_frame, 1}));
+
     ctx->t_frame = 0;
 
     if (ctx->shape != BIN_ONLY_SHAPE) {
         if (ctx->shape == RECT_SHAPE) {
-            skip_bits1(gb);   /* marker */
+            check_marker(s->avctx, gb, "before width");
             width = get_bits(gb, 13);
-            skip_bits1(gb);   /* marker */
+            check_marker(s->avctx, gb, "before height");
             height = get_bits(gb, 13);
-            skip_bits1(gb);   /* marker */
+            check_marker(s->avctx, gb, "after height");
             if (width && height &&  /* they should be non zero but who knows */
                 !(s->width && s->codec_tag == AV_RL32("MP4S"))) {
                 if (s->width && s->height &&
@@ -1791,13 +2284,13 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             ctx->vol_sprite_usage == GMC_SPRITE) {
             if (ctx->vol_sprite_usage == STATIC_SPRITE) {
                 skip_bits(gb, 13); // sprite_width
-                skip_bits1(gb); /* marker */
+                check_marker(s->avctx, gb, "after sprite_width");
                 skip_bits(gb, 13); // sprite_height
-                skip_bits1(gb); /* marker */
+                check_marker(s->avctx, gb, "after sprite_height");
                 skip_bits(gb, 13); // sprite_left
-                skip_bits1(gb); /* marker */
+                check_marker(s->avctx, gb, "after sprite_left");
                 skip_bits(gb, 13); // sprite_top
-                skip_bits1(gb); /* marker */
+                check_marker(s->avctx, gb, "after sprite_top");
             }
             ctx->num_sprite_warping_points = get_bits(gb, 6);
             if (ctx->num_sprite_warping_points > 3) {
@@ -1805,7 +2298,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                        "%d sprite_warping_points\n",
                        ctx->num_sprite_warping_points);
                 ctx->num_sprite_warping_points = 0;
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
             s->sprite_warping_accuracy  = get_bits(gb, 2);
             ctx->sprite_brightness_change = get_bits1(gb);
@@ -1821,6 +2314,9 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             if (s->quant_precision != 5)
                 av_log(s->avctx, AV_LOG_ERROR,
                        "quant precision %d\n", s->quant_precision);
+            if (s->quant_precision<3 || s->quant_precision>9) {
+                s->quant_precision = 5;
+            }
         } else {
             s->quant_precision = 5;
         }
@@ -1830,23 +2326,17 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if ((s->mpeg_quant = get_bits1(gb))) { /* vol_quant_type */
             int i, v;
 
-            /* load default matrixes */
-            for (i = 0; i < 64; i++) {
-                int j = s->idsp.idct_permutation[i];
-                v = ff_mpeg4_default_intra_matrix[i];
-                s->intra_matrix[j]        = v;
-                s->chroma_intra_matrix[j] = v;
-
-                v = ff_mpeg4_default_non_intra_matrix[i];
-                s->inter_matrix[j]        = v;
-                s->chroma_inter_matrix[j] = v;
-            }
+            mpeg4_load_default_matrices(s);
 
             /* load custom intra matrix */
             if (get_bits1(gb)) {
                 int last = 0;
                 for (i = 0; i < 64; i++) {
                     int j;
+                    if (get_bits_left(gb) < 8) {
+                        av_log(s->avctx, AV_LOG_ERROR, "insufficient data for custom matrix\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     v = get_bits(gb, 8);
                     if (v == 0)
                         break;
@@ -1870,6 +2360,10 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                 int last = 0;
                 for (i = 0; i < 64; i++) {
                     int j;
+                    if (get_bits_left(gb) < 8) {
+                        av_log(s->avctx, AV_LOG_ERROR, "insufficient data for custom matrix\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     v = get_bits(gb, 8);
                     if (v == 0)
                         break;
@@ -1896,6 +2390,11 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         else
             s->quarter_sample = 0;
 
+        if (get_bits_left(gb) < 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "VOL Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (!get_bits1(gb)) {
             int pos               = get_bits_count(gb);
             int estimation_method = get_bits(gb, 2);
@@ -2003,6 +2502,18 @@ no_cplx_est:
         }
     }
 
+    if (s->avctx->debug&FF_DEBUG_PICT_INFO) {
+        av_log(s->avctx, AV_LOG_DEBUG, "tb %d/%d, tincrbits:%d, qp_prec:%d, ps:%d, low_delay:%d  %s%s%s%s\n",
+               s->avctx->framerate.den, s->avctx->framerate.num,
+               ctx->time_increment_bits,
+               s->quant_precision,
+               s->progressive_sequence,
+               s->low_delay,
+               ctx->scalability ? "scalability " :"" , s->quarter_sample ? "qpel " : "",
+               s->data_partitioning ? "partition " : "", ctx->rvlc ? "rvlc " : ""
+        );
+    }
+
     return 0;
 }
 
@@ -2034,11 +2545,6 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         ctx->divx_version = ver;
         ctx->divx_build   = build;
         s->divx_packed  = e == 3 && last == 'p';
-        if (s->divx_packed && !ctx->showed_packed_warning) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "Invalid and inefficient vfw-avi packed B-frames detected\n");
-            ctx->showed_packed_warning = 1;
-        }
     }
 
     /* libavcodec detection */
@@ -2047,8 +2553,15 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         e = sscanf(buf, "FFmpeg v%d.%d.%d / libavcodec build: %d", &ver, &ver2, &ver3, &build);
     if (e != 4) {
         e = sscanf(buf, "Lavc%d.%d.%d", &ver, &ver2, &ver3) + 1;
-        if (e > 1)
-            build = (ver << 16) + (ver2 << 8) + ver3;
+        if (e > 1) {
+            if (ver > 0xFFU || ver2 > 0xFFU || ver3 > 0xFFU) {
+                av_log(s->avctx, AV_LOG_WARNING,
+                     "Unknown Lavc version string encountered, %d.%d.%d; "
+                     "clamping sub-version values to 8-bits.\n",
+                     ver, ver2, ver3);
+            }
+            build = ((ver & 0xFF) << 16) + ((ver2 & 0xFF) << 8) + (ver3 & 0xFF);
+        }
     }
     if (e != 4) {
         if (strcmp(buf, "ffmpeg") == 0)
@@ -2062,6 +2575,14 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (e == 1)
         ctx->xvid_build = build;
 
+    return 0;
+}
+
+int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+{
+    Mpeg4DecContext *ctx = avctx->priv_data;
+    MpegEncContext *s = &ctx->m;
+
     if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) {
         if (s->codec_tag        == AV_RL32("XVID") ||
             s->codec_tag        == AV_RL32("XVIX") ||
@@ -2081,8 +2602,96 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         ctx->divx_build   = -1;
     }
 
-    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0)
-        ff_xvid_idct_init(&s->idsp, s->avctx);
+    if (s->workaround_bugs & FF_BUG_AUTODETECT) {
+        if (s->codec_tag == AV_RL32("XVIX"))
+            s->workaround_bugs |= FF_BUG_XVID_ILACE;
+
+        if (s->codec_tag == AV_RL32("UMP4"))
+            s->workaround_bugs |= FF_BUG_UMP4;
+
+        if (ctx->divx_version >= 500 && ctx->divx_build < 1814)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
+
+        if (ctx->divx_version > 502 && ctx->divx_build < 1814)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA2;
+
+        if (ctx->xvid_build <= 3U)
+            s->padding_bug_score = 256 * 256 * 256 * 64;
+
+        if (ctx->xvid_build <= 1U)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
+
+        if (ctx->xvid_build <= 12U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->xvid_build <= 32U)
+            s->workaround_bugs |= FF_BUG_DC_CLIP;
+
+#define SET_QPEL_FUNC(postfix1, postfix2)                           \
+    s->qdsp.put_        ## postfix1 = ff_put_        ## postfix2;   \
+    s->qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;   \
+    s->qdsp.avg_        ## postfix1 = ff_avg_        ## postfix2;
+
+        if (ctx->lavc_build < 4653U)
+            s->workaround_bugs |= FF_BUG_STD_QPEL;
+
+        if (ctx->lavc_build < 4655U)
+            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
+
+        if (ctx->lavc_build < 4670U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->lavc_build <= 4712U)
+            s->workaround_bugs |= FF_BUG_DC_CLIP;
+
+        if ((ctx->lavc_build&0xFF) >= 100) {
+            if (ctx->lavc_build > 3621476 && ctx->lavc_build < 3752552 &&
+               (ctx->lavc_build < 3752037 || ctx->lavc_build > 3752191) // 3.2.1+
+            )
+                s->workaround_bugs |= FF_BUG_IEDGE;
+        }
+
+        if (ctx->divx_version >= 0)
+            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
+        if (ctx->divx_version == 501 && ctx->divx_build == 20020416)
+            s->padding_bug_score = 256 * 256 * 256 * 64;
+
+        if (ctx->divx_version < 500U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->divx_version >= 0)
+            s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+    }
+
+    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
+        SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c)
+
+        SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c)
+    }
+
+    if (avctx->debug & FF_DEBUG_BUGS)
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n",
+               s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
+               ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
+
+    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
+        s->codec_id == AV_CODEC_ID_MPEG4 &&
+        avctx->idct_algo == FF_IDCT_AUTO) {
+        avctx->idct_algo = FF_IDCT_XVID;
+        ff_mpv_idct_init(s);
+        return 1;
+    }
 
     return 0;
 }
@@ -2091,7 +2700,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 {
     MpegEncContext *s = &ctx->m;
     int time_incr, time_increment;
+    int64_t pts;
 
+    s->mcsel       = 0;
     s->pict_type = get_bits(gb, 2) + AV_PICTURE_TYPE_I;        /* pict type: I = 0 , P = 1 */
     if (s->pict_type == AV_PICTURE_TYPE_B && s->low_delay &&
         ctx->vol_control_parameters == 0 && !(s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)) {
@@ -2113,7 +2724,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 
     if (ctx->time_increment_bits == 0 ||
         !(show_bits(gb, ctx->time_increment_bits + 1) & 1)) {
-        /* Headers seem incomplete; try to guess time_increment_bits. */
+        av_log(s->avctx, AV_LOG_WARNING,
+               "time_increment_bits %d is invalid in relation to the current bitstream, this is likely caused by a missing VOL header\n", ctx->time_increment_bits);
+
         for (ctx->time_increment_bits = 1;
              ctx->time_increment_bits < 16;
              ctx->time_increment_bits++) {
@@ -2125,6 +2738,13 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             } else if ((show_bits(gb, ctx->time_increment_bits + 5) & 0x1F) == 0x18)
                 break;
         }
+
+        av_log(s->avctx, AV_LOG_WARNING,
+               "time_increment_bits set to %d bits, based on bitstream analysis\n", ctx->time_increment_bits);
+        if (s->avctx->framerate.num && 4*s->avctx->framerate.num < 1<<ctx->time_increment_bits) {
+            s->avctx->framerate.num = 1<<ctx->time_increment_bits;
+            s->avctx->time_base = av_inv_q(av_mul_q(s->avctx->framerate, (AVRational){s->avctx->ticks_per_frame, 1}));
+        }
     }
 
     if (IS_3IV1)
@@ -2135,7 +2755,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (s->pict_type != AV_PICTURE_TYPE_B) {
         s->last_time_base = s->time_base;
         s->time_base     += time_incr;
-        s->time = s->time_base * s->avctx->framerate.num + time_increment;
+        s->time = s->time_base * (int64_t)s->avctx->framerate.num + time_increment;
         if (s->workaround_bugs & FF_BUG_UMP4) {
             if (s->time < s->last_non_b_time) {
                 /* header is not mpeg-4-compatible, broken encoder,
@@ -2147,7 +2767,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         s->pp_time         = s->time - s->last_non_b_time;
         s->last_non_b_time = s->time;
     } else {
-        s->time    = (s->last_time_base + time_incr) * s->avctx->framerate.num + time_increment;
+        s->time    = (s->last_time_base + time_incr) * (int64_t)s->avctx->framerate.num + time_increment;
         s->pb_time = s->pp_time - (s->last_non_b_time - s->time);
         if (s->pp_time <= s->pb_time ||
             s->pp_time <= s->pp_time - s->pb_time ||
@@ -2165,12 +2785,20 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                             ROUNDED_DIV(s->last_non_b_time - s->pp_time, ctx->t_frame)) * 2;
         s->pb_field_time = (ROUNDED_DIV(s->time, ctx->t_frame) -
                             ROUNDED_DIV(s->last_non_b_time - s->pp_time, ctx->t_frame)) * 2;
-        if (!s->progressive_sequence) {
-            if (s->pp_field_time <= s->pb_field_time || s->pb_field_time <= 1)
+        if (s->pp_field_time <= s->pb_field_time || s->pb_field_time <= 1) {
+            s->pb_field_time = 2;
+            s->pp_field_time = 4;
+            if (!s->progressive_sequence)
                 return FRAME_SKIPPED;
         }
     }
 
+    if (s->avctx->framerate.den)
+        pts = ROUNDED_DIV(s->time, s->avctx->framerate.den);
+    else
+        pts = AV_NOPTS_VALUE;
+    ff_dlog(s->avctx, "MPEG4 PTS: %"PRId64"\n", pts);
+
     check_marker(s->avctx, gb, "before vop_coded");
 
     /* vop coded */
@@ -2179,6 +2807,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             av_log(s->avctx, AV_LOG_ERROR, "vop not coded\n");
         return FRAME_SKIPPED;
     }
+    if (ctx->new_pred)
+        decode_new_pred(ctx, gb);
+
     if (ctx->shape != BIN_ONLY_SHAPE &&
                     (s->pict_type == AV_PICTURE_TYPE_P ||
                      (s->pict_type == AV_PICTURE_TYPE_S &&
@@ -2193,11 +2824,11 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (ctx->shape != RECT_SHAPE) {
         if (ctx->vol_sprite_usage != 1 || s->pict_type != AV_PICTURE_TYPE_I) {
             skip_bits(gb, 13);  /* width */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after width");
             skip_bits(gb, 13);  /* height */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after height");
             skip_bits(gb, 13);  /* hor_spat_ref */
-            skip_bits1(gb);     /* marker */
+            check_marker(s->avctx, gb, "after hor_spat_ref");
             skip_bits(gb, 13);  /* ver_spat_ref */
         }
         skip_bits1(gb);         /* change_CR_disable */
@@ -2215,6 +2846,10 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if (s->pict_type == AV_PICTURE_TYPE_B)
             skip_bits_long(gb, ctx->cplx_estimation_trash_b);
 
+        if (get_bits_left(gb) < 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
         ctx->intra_dc_threshold = ff_mpeg4_dc_threshold[get_bits(gb, 3)];
         if (!s->progressive_sequence) {
             s->top_field_first = get_bits1(gb);
@@ -2235,16 +2870,20 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
     }
 
-    if (s->pict_type == AV_PICTURE_TYPE_S &&
-        (ctx->vol_sprite_usage == STATIC_SPRITE ||
-         ctx->vol_sprite_usage == GMC_SPRITE)) {
-        if (mpeg4_decode_sprite_trajectory(ctx, gb) < 0)
-            return AVERROR_INVALIDDATA;
-        if (ctx->sprite_brightness_change)
-            av_log(s->avctx, AV_LOG_ERROR,
-                   "sprite_brightness_change not supported\n");
-        if (ctx->vol_sprite_usage == STATIC_SPRITE)
-            av_log(s->avctx, AV_LOG_ERROR, "static sprite not supported\n");
+    if (s->pict_type == AV_PICTURE_TYPE_S) {
+        if((ctx->vol_sprite_usage == STATIC_SPRITE ||
+            ctx->vol_sprite_usage == GMC_SPRITE)) {
+            if (mpeg4_decode_sprite_trajectory(ctx, gb) < 0)
+                return AVERROR_INVALIDDATA;
+            if (ctx->sprite_brightness_change)
+                av_log(s->avctx, AV_LOG_ERROR,
+                    "sprite_brightness_change not supported\n");
+            if (ctx->vol_sprite_usage == STATIC_SPRITE)
+                av_log(s->avctx, AV_LOG_ERROR, "static sprite not supported\n");
+        } else {
+            memset(s->sprite_offset, 0, sizeof(s->sprite_offset));
+            memset(s->sprite_delta, 0, sizeof(s->sprite_delta));
+        }
     }
 
     if (ctx->shape != BIN_ONLY_SHAPE) {
@@ -2252,7 +2891,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if (s->qscale == 0) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Error, header damaged or not MPEG-4 header (qscale=0)\n");
-            return -1;  // makes no sense to continue, as there is nothing left from the image then
+            return AVERROR_INVALIDDATA;  // makes no sense to continue, as there is nothing left from the image then
         }
 
         if (s->pict_type != AV_PICTURE_TYPE_I) {
@@ -2260,29 +2899,39 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             if (s->f_code == 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Error, header damaged or not MPEG-4 header (f_code=0)\n");
-                return -1;  // makes no sense to continue, as there is nothing left from the image then
+                s->f_code = 1;
+                return AVERROR_INVALIDDATA;  // makes no sense to continue, as there is nothing left from the image then
             }
         } else
             s->f_code = 1;
 
         if (s->pict_type == AV_PICTURE_TYPE_B) {
             s->b_code = get_bits(gb, 3);
+            if (s->b_code == 0) {
+                av_log(s->avctx, AV_LOG_ERROR,
+                       "Error, header damaged or not MPEG4 header (b_code=0)\n");
+                s->b_code=1;
+                return AVERROR_INVALIDDATA; // makes no sense to continue, as the MV decoding will break very quickly
+            }
         } else
             s->b_code = 1;
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
             av_log(s->avctx, AV_LOG_DEBUG,
-                   "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d\n",
+                   "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d time:%"PRId64" tincr:%d\n",
                    s->qscale, s->f_code, s->b_code,
                    s->pict_type == AV_PICTURE_TYPE_I ? "I" : (s->pict_type == AV_PICTURE_TYPE_P ? "P" : (s->pict_type == AV_PICTURE_TYPE_B ? "B" : "S")),
-                   gb->size_in_bits, s->progressive_sequence, s->alternate_scan,
+                   gb->size_in_bits,s->progressive_sequence, s->alternate_scan,
                    s->top_field_first, s->quarter_sample ? "q" : "h",
                    s->data_partitioning, ctx->resync_marker,
                    ctx->num_sprite_warping_points, s->sprite_warping_accuracy,
                    1 - s->no_rounding, s->vo_type,
                    ctx->vol_control_parameters ? " VOLC" : " ", ctx->intra_dc_threshold,
                    ctx->cplx_estimation_trash_i, ctx->cplx_estimation_trash_p,
-                   ctx->cplx_estimation_trash_b);
+                   ctx->cplx_estimation_trash_b,
+                   s->time,
+                   time_increment
+                  );
         }
 
         if (!ctx->scalability) {
@@ -2321,20 +2970,267 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     return 0;
 }
 
+static int read_quant_matrix_ext(MpegEncContext *s, GetBitContext *gb)
+{
+    int i, j, v;
+
+    if (get_bits1(gb)) {
+        if (get_bits_left(gb) < 64*8)
+            return AVERROR_INVALIDDATA;
+        /* intra_quantiser_matrix */
+        for (i = 0; i < 64; i++) {
+            v = get_bits(gb, 8);
+            j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
+            s->intra_matrix[j]        = v;
+            s->chroma_intra_matrix[j] = v;
+        }
+    }
+
+    if (get_bits1(gb)) {
+        if (get_bits_left(gb) < 64*8)
+            return AVERROR_INVALIDDATA;
+        /* non_intra_quantiser_matrix */
+        for (i = 0; i < 64; i++) {
+            get_bits(gb, 8);
+        }
+    }
+
+    if (get_bits1(gb)) {
+        if (get_bits_left(gb) < 64*8)
+            return AVERROR_INVALIDDATA;
+        /* chroma_intra_quantiser_matrix */
+        for (i = 0; i < 64; i++) {
+            v = get_bits(gb, 8);
+            j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
+            s->chroma_intra_matrix[j] = v;
+        }
+    }
+
+    if (get_bits1(gb)) {
+        if (get_bits_left(gb) < 64*8)
+            return AVERROR_INVALIDDATA;
+        /* chroma_non_intra_quantiser_matrix */
+        for (i = 0; i < 64; i++) {
+            get_bits(gb, 8);
+        }
+    }
+
+    next_start_code_studio(gb);
+    return 0;
+}
+
+static void extension_and_user_data(MpegEncContext *s, GetBitContext *gb, int id)
+{
+    uint32_t startcode;
+    uint8_t extension_type;
+
+    startcode = show_bits_long(gb, 32);
+    if (startcode == USER_DATA_STARTCODE || startcode == EXT_STARTCODE) {
+
+        if ((id == 2 || id == 4) && startcode == EXT_STARTCODE) {
+            skip_bits_long(gb, 32);
+            extension_type = get_bits(gb, 4);
+            if (extension_type == QUANT_MATRIX_EXT_ID)
+                read_quant_matrix_ext(s, gb);
+        }
+    }
+}
+
+static void decode_smpte_tc(Mpeg4DecContext *ctx, GetBitContext *gb)
+{
+    MpegEncContext *s = &ctx->m;
+
+    skip_bits(gb, 16); /* Time_code[63..48] */
+    check_marker(s->avctx, gb, "after Time_code[63..48]");
+    skip_bits(gb, 16); /* Time_code[47..32] */
+    check_marker(s->avctx, gb, "after Time_code[47..32]");
+    skip_bits(gb, 16); /* Time_code[31..16] */
+    check_marker(s->avctx, gb, "after Time_code[31..16]");
+    skip_bits(gb, 16); /* Time_code[15..0] */
+    check_marker(s->avctx, gb, "after Time_code[15..0]");
+    skip_bits(gb, 4); /* reserved_bits */
+}
+
+/**
+ * Decode the next studio vop header.
+ * @return <0 if something went wrong
+ */
+static int decode_studio_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
+{
+    MpegEncContext *s = &ctx->m;
+
+    if (get_bits_left(gb) <= 32)
+        return 0;
+
+    s->partitioned_frame = 0;
+    s->interlaced_dct = 0;
+    s->decode_mb = mpeg4_decode_studio_mb;
+
+    decode_smpte_tc(ctx, gb);
+
+    skip_bits(gb, 10); /* temporal_reference */
+    skip_bits(gb, 2); /* vop_structure */
+    s->pict_type = get_bits(gb, 2) + AV_PICTURE_TYPE_I; /* vop_coding_type */
+    if (get_bits1(gb)) { /* vop_coded */
+        skip_bits1(gb); /* top_field_first */
+        skip_bits1(gb); /* repeat_first_field */
+        s->progressive_frame = get_bits1(gb) ^ 1; /* progressive_frame */
+    }
+
+    if (s->pict_type == AV_PICTURE_TYPE_I) {
+        if (get_bits1(gb))
+            reset_studio_dc_predictors(s);
+    }
+
+    if (ctx->shape != BIN_ONLY_SHAPE) {
+        s->alternate_scan = get_bits1(gb);
+        s->frame_pred_frame_dct = get_bits1(gb);
+        s->dct_precision = get_bits(gb, 2);
+        s->intra_dc_precision = get_bits(gb, 2);
+        s->q_scale_type = get_bits1(gb);
+    }
+
+    if (s->alternate_scan) {
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable,   ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable,   ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
+    } else {
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable,   ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable,   ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
+    }
+
+    mpeg4_load_default_matrices(s);
+
+    next_start_code_studio(gb);
+    extension_and_user_data(s, gb, 4);
+
+    return 0;
+}
+
+static int decode_studiovisualobject(Mpeg4DecContext *ctx, GetBitContext *gb)
+{
+    MpegEncContext *s = &ctx->m;
+    int visual_object_type;
+
+        skip_bits(gb, 4); /* visual_object_verid */
+        visual_object_type = get_bits(gb, 4);
+        if (visual_object_type != VOT_VIDEO_ID) {
+            avpriv_request_sample(s->avctx, "VO type %u", visual_object_type);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        next_start_code_studio(gb);
+        extension_and_user_data(s, gb, 1);
+
+    return 0;
+}
+
+static int decode_studio_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
+{
+    MpegEncContext *s = &ctx->m;
+    int width, height;
+    int bits_per_raw_sample;
+
+            // random_accessible_vol and video_object_type_indication have already
+            // been read by the caller decode_vol_header()
+            skip_bits(gb, 4); /* video_object_layer_verid */
+            ctx->shape = get_bits(gb, 2); /* video_object_layer_shape */
+            skip_bits(gb, 4); /* video_object_layer_shape_extension */
+            skip_bits1(gb); /* progressive_sequence */
+            if (ctx->shape != BIN_ONLY_SHAPE) {
+                ctx->rgb = get_bits1(gb); /* rgb_components */
+                s->chroma_format = get_bits(gb, 2); /* chroma_format */
+                if (!s->chroma_format) {
+                    av_log(s->avctx, AV_LOG_ERROR, "illegal chroma format\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                bits_per_raw_sample = get_bits(gb, 4); /* bit_depth */
+                if (bits_per_raw_sample == 10) {
+                    if (ctx->rgb) {
+                        s->avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+                    }
+                    else {
+                        s->avctx->pix_fmt = s->chroma_format == CHROMA_422 ? AV_PIX_FMT_YUV422P10 : AV_PIX_FMT_YUV444P10;
+                    }
+                }
+                else {
+                    avpriv_request_sample(s->avctx, "MPEG-4 Studio profile bit-depth %u", bits_per_raw_sample);
+                    return AVERROR_PATCHWELCOME;
+                }
+                s->avctx->bits_per_raw_sample = bits_per_raw_sample;
+            }
+            if (ctx->shape == RECT_SHAPE) {
+                check_marker(s->avctx, gb, "before video_object_layer_width");
+                width = get_bits(gb, 14); /* video_object_layer_width */
+                check_marker(s->avctx, gb, "before video_object_layer_height");
+                height = get_bits(gb, 14); /* video_object_layer_height */
+                check_marker(s->avctx, gb, "after video_object_layer_height");
+
+                /* Do the same check as non-studio profile */
+                if (width && height) {
+                    if (s->width && s->height &&
+                        (s->width != width || s->height != height))
+                        s->context_reinit = 1;
+                    s->width  = width;
+                    s->height = height;
+                }
+            }
+            s->aspect_ratio_info = get_bits(gb, 4);
+            if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
+                s->avctx->sample_aspect_ratio.num = get_bits(gb, 8);  // par_width
+                s->avctx->sample_aspect_ratio.den = get_bits(gb, 8);  // par_height
+            } else {
+                s->avctx->sample_aspect_ratio = ff_h263_pixel_aspect[s->aspect_ratio_info];
+            }
+            skip_bits(gb, 4); /* frame_rate_code */
+            skip_bits(gb, 15); /* first_half_bit_rate */
+            check_marker(s->avctx, gb, "after first_half_bit_rate");
+            skip_bits(gb, 15); /* latter_half_bit_rate */
+            check_marker(s->avctx, gb, "after latter_half_bit_rate");
+            skip_bits(gb, 15); /* first_half_vbv_buffer_size */
+            check_marker(s->avctx, gb, "after first_half_vbv_buffer_size");
+            skip_bits(gb, 3); /* latter_half_vbv_buffer_size */
+            skip_bits(gb, 11); /* first_half_vbv_buffer_size */
+            check_marker(s->avctx, gb, "after first_half_vbv_buffer_size");
+            skip_bits(gb, 15); /* latter_half_vbv_occupancy */
+            check_marker(s->avctx, gb, "after latter_half_vbv_occupancy");
+            s->low_delay = get_bits1(gb);
+            s->mpeg_quant = get_bits1(gb); /* mpeg2_stream */
+
+            next_start_code_studio(gb);
+            extension_and_user_data(s, gb, 2);
+
+    return 0;
+}
+
 /**
  * Decode MPEG-4 headers.
- * @return <0 if no VOP found (or a damaged one)
+ *
+ * @param  header If set the absence of a VOP is not treated as error; otherwise, it is treated as such.
+ * @return <0 if an error occured
  *         FRAME_SKIPPED if a not coded VOP is found
- *         0 if a VOP is found
+ *         0 else
  */
-int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
+int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb, int header)
 {
     MpegEncContext *s = &ctx->m;
     unsigned startcode, v;
+    int ret;
+    int vol = 0;
 
     /* search next start code */
     align_get_bits(gb);
 
+    // If we have not switched to studio profile than we also did not switch bps
+    // that means something else (like a previous instance) outside set bps which
+    // would be inconsistant with the currect state, thus reset it
+    if (!s->studio_profile && s->avctx->bits_per_raw_sample != 8)
+        s->avctx->bits_per_raw_sample = 0;
+
     if (s->codec_tag == AV_RL32("WV1F") && show_bits(gb, 24) == 0x575630) {
         skip_bits(gb, 24);
         if (get_bits(gb, 8) == 0xF0)
@@ -2345,11 +3241,13 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     for (;;) {
         if (get_bits_count(gb) >= gb->size_in_bits) {
             if (gb->size_in_bits == 8 &&
-                (ctx->divx_version >= 0 || ctx->xvid_build >= 0)) {
-                av_log(s->avctx, AV_LOG_WARNING, "frame skip %d\n", gb->size_in_bits);
+                (ctx->divx_version >= 0 || ctx->xvid_build >= 0) || s->codec_tag == AV_RL32("QMP4")) {
+                av_log(s->avctx, AV_LOG_VERBOSE, "frame skip %d\n", gb->size_in_bits);
                 return FRAME_SKIPPED;  // divx bug
+            } else if (header && get_bits_count(gb) == gb->size_in_bits) {
+                return 0; // ordinary return value for parsing of extradata
             } else
-                return -1;  // end of stream
+                return AVERROR_INVALIDDATA;  // end of stream
         }
 
         /* use the bits after the test */
@@ -2419,14 +3317,37 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         }
 
         if (startcode >= 0x120 && startcode <= 0x12F) {
-            if (decode_vol_header(ctx, gb) < 0)
-                return -1;
+            if (vol) {
+                av_log(s->avctx, AV_LOG_WARNING, "Ignoring multiple VOL headers\n");
+                continue;
+            }
+            vol++;
+            if ((ret = decode_vol_header(ctx, gb)) < 0)
+                return ret;
         } else if (startcode == USER_DATA_STARTCODE) {
             decode_user_data(ctx, gb);
         } else if (startcode == GOP_STARTCODE) {
             mpeg4_decode_gop_header(s, gb);
         } else if (startcode == VOS_STARTCODE) {
-            mpeg4_decode_profile_level(s, gb);
+            int profile, level;
+            mpeg4_decode_profile_level(s, gb, &profile, &level);
+            if (profile == FF_PROFILE_MPEG4_SIMPLE_STUDIO &&
+                (level > 0 && level < 9)) {
+                s->studio_profile = 1;
+                next_start_code_studio(gb);
+                extension_and_user_data(s, gb, 0);
+            } else if (s->studio_profile) {
+                avpriv_request_sample(s->avctx, "Mixes studio and non studio profile\n");
+                return AVERROR_PATCHWELCOME;
+            }
+            s->avctx->profile = profile;
+            s->avctx->level   = level;
+        } else if (startcode == VISUAL_OBJ_STARTCODE) {
+            if (s->studio_profile) {
+                if ((ret = decode_studiovisualobject(ctx, gb)) < 0)
+                    return ret;
+            } else
+                mpeg4_decode_visual_object(s, gb);
         } else if (startcode == VOP_STARTCODE) {
             break;
         }
@@ -2440,64 +3361,40 @@ end:
         s->low_delay = 1;
     s->avctx->has_b_frames = !s->low_delay;
 
-    if (s->workaround_bugs & FF_BUG_AUTODETECT) {
-        if (s->codec_tag == AV_RL32("XVIX"))
-            s->workaround_bugs |= FF_BUG_XVID_ILACE;
-
-        if (s->codec_tag == AV_RL32("UMP4"))
-            s->workaround_bugs |= FF_BUG_UMP4;
-
-        if (ctx->divx_version >= 500 && ctx->divx_build < 1814)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
-
-        if (ctx->divx_version > 502 && ctx->divx_build < 1814)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA2;
-
-        if (ctx->xvid_build <= 3U)
-            s->padding_bug_score = 256 * 256 * 256 * 64;
-
-        if (ctx->xvid_build <= 1U)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
-
-        if (ctx->xvid_build <= 12U)
-            s->workaround_bugs |= FF_BUG_EDGE;
-
-        if (ctx->xvid_build <= 32U)
-            s->workaround_bugs |= FF_BUG_DC_CLIP;
-
-        if (ctx->lavc_build < 4653U)
-            s->workaround_bugs |= FF_BUG_STD_QPEL;
-
-        if (ctx->lavc_build < 4655U)
-            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
-
-        if (ctx->lavc_build < 4670U)
-            s->workaround_bugs |= FF_BUG_EDGE;
-
-        if (ctx->lavc_build <= 4712U)
-            s->workaround_bugs |= FF_BUG_DC_CLIP;
-
-        if (ctx->divx_version >= 0)
-            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
-
-        if (ctx->divx_version == 501 && ctx->divx_build == 20020416)
-            s->padding_bug_score = 256 * 256 * 256 * 64;
+    if (s->studio_profile) {
+        if (!s->avctx->bits_per_raw_sample) {
+            av_log(s->avctx, AV_LOG_ERROR, "Missing VOL header\n");
+            return AVERROR_INVALIDDATA;
+        }
+        return decode_studio_vop_header(ctx, gb);
+    } else
+        return decode_vop_header(ctx, gb);
+}
 
-        if (ctx->divx_version < 500U)
-            s->workaround_bugs |= FF_BUG_EDGE;
+av_cold void ff_mpeg4videodec_static_init(void) {
+    static int done = 0;
 
-        if (ctx->divx_version >= 0)
-            s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+    if (!done) {
+        ff_rl_init(&ff_mpeg4_rl_intra, ff_mpeg4_static_rl_table_store[0]);
+        ff_rl_init(&ff_rvlc_rl_inter, ff_mpeg4_static_rl_table_store[1]);
+        ff_rl_init(&ff_rvlc_rl_intra, ff_mpeg4_static_rl_table_store[2]);
+        INIT_VLC_RL(ff_mpeg4_rl_intra, 554);
+        INIT_VLC_RL(ff_rvlc_rl_inter, 1072);
+        INIT_VLC_RL(ff_rvlc_rl_intra, 1072);
+        INIT_VLC_STATIC(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
+                        &ff_mpeg4_DCtab_lum[0][1], 2, 1,
+                        &ff_mpeg4_DCtab_lum[0][0], 2, 1, 512);
+        INIT_VLC_STATIC(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
+                        &ff_mpeg4_DCtab_chrom[0][1], 2, 1,
+                        &ff_mpeg4_DCtab_chrom[0][0], 2, 1, 512);
+        INIT_VLC_STATIC(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
+                        &ff_sprite_trajectory_tab[0][1], 4, 2,
+                        &ff_sprite_trajectory_tab[0][0], 4, 2, 128);
+        INIT_VLC_STATIC(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
+                        &ff_mb_type_b_tab[0][1], 2, 1,
+                        &ff_mb_type_b_tab[0][0], 2, 1, 16);
+        done = 1;
     }
-
-
-    if (s->avctx->debug & FF_DEBUG_BUGS)
-        av_log(s->avctx, AV_LOG_DEBUG,
-               "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n",
-               s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-               ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
-
-    return decode_vop_header(ctx, gb);
 }
 
 int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
@@ -2506,34 +3403,40 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
     MpegEncContext    *s = &ctx->m;
 
     /* divx 5.01+ bitstream reorder stuff */
+    /* Since this clobbers the input buffer and hwaccel codecs still need the
+     * data during hwaccel->end_frame we should not do this any earlier */
     if (s->divx_packed) {
-        int current_pos     = get_bits_count(&s->gb) >> 3;
+        int current_pos     = s->gb.buffer == s->bitstream_buffer ? 0 : (get_bits_count(&s->gb) >> 3);
         int startcode_found = 0;
 
-        if (buf_size - current_pos > 5) {
+        if (buf_size - current_pos > 7) {
+
             int i;
-            for (i = current_pos; i < buf_size - 3; i++)
+            for (i = current_pos; i < buf_size - 4; i++)
+
                 if (buf[i]     == 0 &&
                     buf[i + 1] == 0 &&
                     buf[i + 2] == 1 &&
                     buf[i + 3] == 0xB6) {
-                    startcode_found = 1;
+                    startcode_found = !(buf[i + 4] & 0x40);
                     break;
                 }
         }
-        if (s->gb.buffer == s->bitstream_buffer && buf_size > 7 &&
-            ctx->xvid_build >= 0) {       // xvid style
-            startcode_found = 1;
-            current_pos     = 0;
-        }
 
         if (startcode_found) {
-            av_fast_malloc(&s->bitstream_buffer,
+            if (!ctx->showed_packed_warning) {
+                av_log(s->avctx, AV_LOG_INFO, "Video uses a non-standard and "
+                       "wasteful way to store B-frames ('packed B-frames'). "
+                       "Consider using the mpeg4_unpack_bframes bitstream filter without encoding but stream copy to fix it.\n");
+                ctx->showed_packed_warning = 1;
+            }
+            av_fast_padded_malloc(&s->bitstream_buffer,
                            &s->allocated_bitstream_buffer_size,
-                           buf_size - current_pos +
-                           AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!s->bitstream_buffer)
+                           buf_size - current_pos);
+            if (!s->bitstream_buffer) {
+                s->bitstream_buffer_size = 0;
                 return AVERROR(ENOMEM);
+            }
             memcpy(s->bitstream_buffer, buf + current_pos,
                    buf_size - current_pos);
             s->bitstream_buffer_size = buf_size - current_pos;
@@ -2543,6 +3446,7 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
     return 0;
 }
 
+#if HAVE_THREADS
 static int mpeg4_update_thread_context(AVCodecContext *dst,
                                        const AVCodecContext *src)
 {
@@ -2555,12 +3459,42 @@ static int mpeg4_update_thread_context(AVCodecContext *dst,
     if (ret < 0)
         return ret;
 
+    memcpy(((uint8_t*)s) + sizeof(MpegEncContext), ((uint8_t*)s1) + sizeof(MpegEncContext), sizeof(Mpeg4DecContext) - sizeof(MpegEncContext));
+
     if (CONFIG_MPEG4_DECODER && !init && s1->xvid_build >= 0)
         ff_xvid_idct_init(&s->m.idsp, dst);
 
-    s->shape               = s1->shape;
-    s->time_increment_bits = s1->time_increment_bits;
-    s->xvid_build          = s1->xvid_build;
+    return 0;
+}
+#endif
+
+static av_cold int init_studio_vlcs(Mpeg4DecContext *ctx)
+{
+    int i, ret;
+
+    for (i = 0; i < 12; i++) {
+        ret = init_vlc(&ctx->studio_intra_tab[i], STUDIO_INTRA_BITS, 22,
+                       &ff_mpeg4_studio_intra[i][0][1], 4, 2,
+                       &ff_mpeg4_studio_intra[i][0][0], 4, 2,
+                       0);
+
+        if (ret < 0)
+            return ret;
+    }
+
+    ret = init_vlc(&ctx->studio_luma_dc, STUDIO_INTRA_BITS, 19,
+                   &ff_mpeg4_studio_dc_luma[0][1], 4, 2,
+                   &ff_mpeg4_studio_dc_luma[0][0], 4, 2,
+                   0);
+    if (ret < 0)
+        return ret;
+
+    ret = init_vlc(&ctx->studio_chroma_dc, STUDIO_INTRA_BITS, 19,
+                   &ff_mpeg4_studio_dc_chroma[0][1], 4, 2,
+                   &ff_mpeg4_studio_dc_chroma[0][0], 4, 2,
+                   0);
+    if (ret < 0)
+        return ret;
 
     return 0;
 }
@@ -2570,7 +3504,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     Mpeg4DecContext *ctx = avctx->priv_data;
     MpegEncContext *s = &ctx->m;
     int ret;
-    static int done = 0;
 
     ctx->divx_version =
     ctx->divx_build   =
@@ -2580,28 +3513,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     if ((ret = ff_h263_decode_init(avctx)) < 0)
         return ret;
 
-    if (!done) {
-        done = 1;
-
-        ff_rl_init(&ff_mpeg4_rl_intra, ff_mpeg4_static_rl_table_store[0]);
-        ff_rl_init(&ff_rvlc_rl_inter, ff_mpeg4_static_rl_table_store[1]);
-        ff_rl_init(&ff_rvlc_rl_intra, ff_mpeg4_static_rl_table_store[2]);
-        INIT_VLC_RL(ff_mpeg4_rl_intra, 554);
-        INIT_VLC_RL(ff_rvlc_rl_inter, 1072);
-        INIT_VLC_RL(ff_rvlc_rl_intra, 1072);
-        INIT_VLC_STATIC(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
-                        &ff_mpeg4_DCtab_lum[0][1], 2, 1,
-                        &ff_mpeg4_DCtab_lum[0][0], 2, 1, 512);
-        INIT_VLC_STATIC(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
-                        &ff_mpeg4_DCtab_chrom[0][1], 2, 1,
-                        &ff_mpeg4_DCtab_chrom[0][0], 2, 1, 512);
-        INIT_VLC_STATIC(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
-                        &ff_sprite_trajectory_tab[0][1], 4, 2,
-                        &ff_sprite_trajectory_tab[0][0], 4, 2, 128);
-        INIT_VLC_STATIC(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
-                        &ff_mb_type_b_tab[0][1], 2, 1,
-                        &ff_mb_type_b_tab[0][0], 2, 1, 16);
-    }
+    ff_mpeg4videodec_static_init();
+    if ((ret = init_studio_vlcs(ctx)) < 0)
+        return ret;
 
     s->h263_pred = 1;
     s->low_delay = 0; /* default, might be overridden in the vol header during header parsing */
@@ -2614,6 +3528,35 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    Mpeg4DecContext *ctx = avctx->priv_data;
+    int i;
+
+    if (!avctx->internal->is_copy) {
+        for (i = 0; i < 12; i++)
+            ff_free_vlc(&ctx->studio_intra_tab[i]);
+
+        ff_free_vlc(&ctx->studio_luma_dc);
+        ff_free_vlc(&ctx->studio_chroma_dc);
+    }
+
+    return ff_h263_decode_end(avctx);
+}
+
+static const AVOption mpeg4_options[] = {
+    {"quarter_sample", "1/4 subpel MC", offsetof(MpegEncContext, quarter_sample), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0},
+    {"divx_packed", "divx style packed b frames", offsetof(MpegEncContext, divx_packed), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0},
+    {NULL}
+};
+
+static const AVClass mpeg4_class = {
+    .class_name = "MPEG4 Video Decoder",
+    .item_name  = av_default_item_name,
+    .option     = mpeg4_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_mpeg4_decoder = {
     .name                  = "mpeg4",
     .long_name             = NULL_IF_CONFIG_SMALL("MPEG-4 part 2"),
@@ -2621,22 +3564,31 @@ AVCodec ff_mpeg4_decoder = {
     .id                    = AV_CODEC_ID_MPEG4,
     .priv_data_size        = sizeof(Mpeg4DecContext),
     .init                  = decode_init,
-    .close                 = ff_h263_decode_end,
+    .close                 = decode_end,
     .decode                = ff_h263_decode_frame,
     .capabilities          = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
                              AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                              AV_CODEC_CAP_FRAME_THREADS,
+    .caps_internal         = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .flush                 = ff_mpeg_flush,
+    .max_lowres            = 3,
+    .pix_fmts              = ff_h263_hwaccel_pixfmt_list_420,
+    .profiles              = NULL_IF_CONFIG_SMALL(ff_mpeg4_video_profiles),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg4_update_thread_context),
+    .priv_class = &mpeg4_class,
     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
+#if CONFIG_MPEG4_NVDEC_HWACCEL
+                               HWACCEL_NVDEC(mpeg4),
+#endif
 #if CONFIG_MPEG4_VAAPI_HWACCEL
                                HWACCEL_VAAPI(mpeg4),
 #endif
 #if CONFIG_MPEG4_VDPAU_HWACCEL
                                HWACCEL_VDPAU(mpeg4),
 #endif
+#if CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL
+                               HWACCEL_VIDEOTOOLBOX(mpeg4),
+#endif
                                NULL
                            },
-    .flush                 = ff_mpeg_flush,
-    .pix_fmts              = ff_h263_hwaccel_pixfmt_list_420,
-    .profiles              = NULL_IF_CONFIG_SMALL(ff_mpeg4_video_profiles),
-    .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg4_update_thread_context),
 };
diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c
index 8815ba8..f6a5992 100644
--- a/libavcodec/mpeg4videoenc.c
+++ b/libavcodec/mpeg4videoenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -497,9 +497,9 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                     s->last_mv[i][1][1] = 0;
             }
 
-            assert(s->dquant >= -2 && s->dquant <= 2);
-            assert((s->dquant & 1) == 0);
-            assert(mb_type >= 0);
+            av_assert2(s->dquant >= -2 && s->dquant <= 2);
+            av_assert2((s->dquant & 1) == 0);
+            av_assert2(mb_type >= 0);
 
             /* nothing to do if this MB was skipped in the next P-frame */
             if (s->next_picture.mbskip_table[s->mb_y * s->mb_stride + s->mb_x]) {  // FIXME avoid DCT & ...
@@ -519,7 +519,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
 
             if ((cbp | motion_x | motion_y | mb_type) == 0) {
                 /* direct MB with MV={0,0} */
-                assert(s->dquant == 0);
+                av_assert2(s->dquant == 0);
 
                 put_bits(&s->pb, 1, 1); /* mb not coded modb1=1 */
 
@@ -556,12 +556,12 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                 s->misc_bits += get_bits_diff(s);
 
             if (!mb_type) {
-                assert(s->mv_dir & MV_DIRECT);
+                av_assert2(s->mv_dir & MV_DIRECT);
                 ff_h263_encode_motion_vector(s, motion_x, motion_y, 1);
                 s->b_count++;
                 s->f_count++;
             } else {
-                assert(mb_type > 0 && mb_type < 4);
+                av_assert2(mb_type > 0 && mb_type < 4);
                 if (s->mv_type != MV_TYPE_FIELD) {
                     if (s->mv_dir & MV_DIR_FORWARD) {
                         ff_h263_encode_motion_vector(s,
@@ -641,10 +641,6 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
 
                     x = s->mb_x * 16;
                     y = s->mb_y * 16;
-                    if (x + 16 > s->width)
-                        x = s->width - 16;
-                    if (y + 16 > s->height)
-                        y = s->height - 16;
 
                     offset = x + y * s->linesize;
                     p_pic  = s->new_picture.f->data[0] + offset;
@@ -661,7 +657,21 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                         b_pic = pic->f->data[0] + offset;
                         if (!pic->shared)
                             b_pic += INPLACE_OFFSET;
-                        diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
+
+                        if (x + 16 > s->width || y + 16 > s->height) {
+                            int x1, y1;
+                            int xe = FFMIN(16, s->width - x);
+                            int ye = FFMIN(16, s->height - y);
+                            diff = 0;
+                            for (y1 = 0; y1 < ye; y1++) {
+                                for (x1 = 0; x1 < xe; x1++) {
+                                    diff += FFABS(p_pic[x1 + y1 * s->linesize] - b_pic[x1 + y1 * s->linesize]);
+                                }
+                            }
+                            diff = diff * 256 / (xe * ye);
+                        } else {
+                            diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
+                        }
                         if (diff > s->qscale * 70) {  // FIXME check that 70 is optimal
                             s->mb_skipped = 0;
                             break;
@@ -726,7 +736,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                 if (s->dquant)
                     put_bits(pb2, 2, dquant_code[s->dquant + 2]);
 
-                assert(!s->progressive_sequence);
+                av_assert2(!s->progressive_sequence);
                 if (cbp)
                     put_bits(pb2, 1, s->interlaced_dct);
                 put_bits(pb2, 1, 1);
@@ -750,7 +760,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                                              s->mv[0][1][1] - pred_y,
                                              s->f_code);
             } else {
-                assert(s->mv_type == MV_TYPE_8X8);
+                av_assert2(s->mv_type == MV_TYPE_8X8);
                 put_bits(&s->pb,
                          ff_h263_inter_MCBPC_bits[cbpc + 16],
                          ff_h263_inter_MCBPC_code[cbpc + 16]);
@@ -866,13 +876,13 @@ void ff_set_mpeg4_time(MpegEncContext *s)
         ff_mpeg4_init_direct_mv(s);
     } else {
         s->last_time_base = s->time_base;
-        s->time_base      = s->time / s->avctx->time_base.den;
+        s->time_base      = FFUDIV(s->time, s->avctx->time_base.den);
     }
 }
 
 static void mpeg4_encode_gop_header(MpegEncContext *s)
 {
-    int hours, minutes, seconds;
+    int64_t hours, minutes, seconds;
     int64_t time;
 
     put_bits(&s->pb, 16, 0);
@@ -882,13 +892,12 @@ static void mpeg4_encode_gop_header(MpegEncContext *s)
     if (s->reordered_input_picture[1])
         time = FFMIN(time, s->reordered_input_picture[1]->f->pts);
     time = time * s->avctx->time_base.num;
+    s->last_time_base = FFUDIV(time, s->avctx->time_base.den);
 
-    seconds  = time / s->avctx->time_base.den;
-    minutes  = seconds / 60;
-    seconds %= 60;
-    hours    = minutes / 60;
-    minutes %= 60;
-    hours   %= 24;
+    seconds = FFUDIV(time, s->avctx->time_base.den);
+    minutes = FFUDIV(seconds, 60); seconds = FFUMOD(seconds, 60);
+    hours   = FFUDIV(minutes, 60); minutes = FFUMOD(minutes, 60);
+    hours   = FFUMOD(hours  , 24);
 
     put_bits(&s->pb, 5, hours);
     put_bits(&s->pb, 6, minutes);
@@ -898,8 +907,6 @@ static void mpeg4_encode_gop_header(MpegEncContext *s)
     put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP));
     put_bits(&s->pb, 1, 0);  // broken link == NO
 
-    s->last_time_base = time / s->avctx->time_base.den;
-
     ff_mpeg4_stuffing(&s->pb);
 }
 
@@ -983,6 +990,8 @@ static void mpeg4_encode_vol_header(MpegEncContext *s,
 
     put_bits(&s->pb, 4, s->aspect_ratio_info); /* aspect ratio info */
     if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
+        av_reduce(&s->avctx->sample_aspect_ratio.num, &s->avctx->sample_aspect_ratio.den,
+                   s->avctx->sample_aspect_ratio.num,  s->avctx->sample_aspect_ratio.den, 255);
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.num);
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.den);
     }
@@ -1049,10 +1058,10 @@ static void mpeg4_encode_vol_header(MpegEncContext *s,
 }
 
 /* write MPEG-4 VOP header */
-void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
+int ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
 {
-    int time_incr;
-    int time_div, time_mod;
+    uint64_t time_incr;
+    int64_t time_div, time_mod;
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
         if (!(s->avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)) {
@@ -1071,11 +1080,15 @@ void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
     put_bits(&s->pb, 16, VOP_STARTCODE);    /* vop header */
     put_bits(&s->pb, 2, s->pict_type - 1);  /* pict type: I = 0 , P = 1 */
 
-    assert(s->time >= 0);
-    time_div  = s->time / s->avctx->time_base.den;
-    time_mod  = s->time % s->avctx->time_base.den;
+    time_div  = FFUDIV(s->time, s->avctx->time_base.den);
+    time_mod  = FFUMOD(s->time, s->avctx->time_base.den);
     time_incr = time_div - s->last_time_base;
-    assert(time_incr >= 0);
+
+    // This limits the frame duration to max 1 hour
+    if (time_incr > 3600) {
+        av_log(s->avctx, AV_LOG_ERROR, "time_incr %"PRIu64" too large\n", time_incr);
+        return AVERROR(EINVAL);
+    }
     while (time_incr--)
         put_bits(&s->pb, 1, 1);
 
@@ -1101,6 +1114,8 @@ void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
         put_bits(&s->pb, 3, s->f_code);  /* fcode_for */
     if (s->pict_type == AV_PICTURE_TYPE_B)
         put_bits(&s->pb, 3, s->b_code);  /* fcode_back */
+
+    return 0;
 }
 
 static av_cold void init_uni_dc_tab(void)
@@ -1163,8 +1178,8 @@ static av_cold void init_uni_mpeg4_rl_tab(RLTable *rl, uint32_t *bits_tab,
 {
     int slevel, run, last;
 
-    assert(MAX_LEVEL >= 64);
-    assert(MAX_RUN >= 63);
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN >= 63);
 
     for (slevel = -64; slevel < 64; slevel++) {
         if (slevel == 0)
@@ -1259,6 +1274,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int ret;
     static int done = 0;
 
+    if (avctx->width >= (1<<13) || avctx->height >= (1<<13)) {
+        av_log(avctx, AV_LOG_ERROR, "dimensions too large for MPEG-4\n");
+        return AVERROR(EINVAL);
+    }
+
     if ((ret = ff_mpv_encode_init(avctx)) < 0)
         return ret;
 
@@ -1353,8 +1373,8 @@ void ff_mpeg4_encode_video_packet_header(MpegEncContext *s)
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "data_partitioning", "Use data partitioning.",      OFFSET(data_partitioning), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "alternate_scan",    "Enable alternate scantable.", OFFSET(alternate_scan),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "data_partitioning", "Use data partitioning.",      OFFSET(data_partitioning), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "alternate_scan",    "Enable alternate scantable.", OFFSET(alternate_scan),    AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     FF_MPV_COMMON_OPTS
     { NULL },
 };
diff --git a/libavcodec/mpeg_er.c b/libavcodec/mpeg_er.c
index 9410b27..f54cb85 100644
--- a/libavcodec/mpeg_er.c
+++ b/libavcodec/mpeg_er.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@ static void set_erpic(ERPicture *dst, Picture *src)
 {
     int i;
 
+    memset(dst, 0, sizeof(*dst));
     if (!src) {
         dst->f  = NULL;
         dst->tf = NULL;
@@ -70,12 +71,15 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
     s->mb_skipped = mb_skipped;
     s->mb_x       = mb_x;
     s->mb_y       = mb_y;
+    s->mcsel      = 0;
     memcpy(s->mv, mv, sizeof(*mv));
 
     ff_init_block_index(s);
     ff_update_block_index(s);
 
     s->bdsp.clear_blocks(s->block[0]);
+    if (!s->chroma_y_shift)
+        s->bdsp.clear_blocks(s->block[6]);
 
     s->dest[0] = s->current_picture.f->data[0] +
                  s->mb_y * 16 * s->linesize +
@@ -90,7 +94,7 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
     if (ref)
         av_log(s->avctx, AV_LOG_DEBUG,
                "Interlaced error concealment is not fully implemented\n");
-    ff_mpv_decode_mb(s, s->block);
+    ff_mpv_reconstruct_mb(s, s->block);
 }
 
 int ff_mpeg_er_init(MpegEncContext *s)
@@ -108,7 +112,7 @@ int ff_mpeg_er_init(MpegEncContext *s)
     er->mb_stride   = s->mb_stride;
     er->b8_stride   = s->b8_stride;
 
-    er->er_temp_buffer     = av_malloc(s->mb_height * s->mb_stride);
+    er->er_temp_buffer     = av_malloc(s->mb_height * s->mb_stride * (4*sizeof(int) + 1));
     er->error_status_table = av_mallocz(mb_array_size);
     if (!er->er_temp_buffer || !er->error_status_table)
         goto fail;
diff --git a/libavcodec/mpeg_er.h b/libavcodec/mpeg_er.h
index ca1ea90..bb627a4 100644
--- a/libavcodec/mpeg_er.h
+++ b/libavcodec/mpeg_er.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio.c b/libavcodec/mpegaudio.c
index 1a83635..cba5299 100644
--- a/libavcodec/mpegaudio.c
+++ b/libavcodec/mpegaudio.c
@@ -2,20 +2,20 @@
  * MPEG Audio common code
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index b556801..74590a8 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,11 +26,12 @@
 #ifndef AVCODEC_MPEGAUDIO_H
 #define AVCODEC_MPEGAUDIO_H
 
-#ifndef CONFIG_FLOAT
-#   define CONFIG_FLOAT 0
+#ifndef USE_FLOATS
+#   define USE_FLOATS 0
 #endif
 
 #include <stdint.h>
+#include "libavutil/internal.h"
 
 /* max frame size, in samples */
 #define MPA_FRAME_SIZE 1152
@@ -52,20 +53,25 @@
 #define WFRAC_BITS  16   /* fractional bits for window */
 #endif
 
+#define IMDCT_SCALAR 1.759
+
 #define FRAC_ONE    (1 << FRAC_BITS)
 
 #define FIX(a)   ((int)((a) * FRAC_ONE))
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #   define INTFLOAT float
+#   define SUINTFLOAT float
 typedef float MPA_INT;
 typedef float OUT_INT;
 #elif FRAC_BITS <= 15
 #   define INTFLOAT int
+#   define SUINTFLOAT SUINT
 typedef int16_t MPA_INT;
 typedef int16_t OUT_INT;
 #else
 #   define INTFLOAT int
+#   define SUINTFLOAT SUINT
 typedef int32_t MPA_INT;
 typedef int16_t OUT_INT;
 #endif
diff --git a/libavcodec/mpegaudio_parser.c b/libavcodec/mpegaudio_parser.c
index c44c024..1005e89 100644
--- a/libavcodec/mpegaudio_parser.c
+++ b/libavcodec/mpegaudio_parser.c
@@ -3,27 +3,28 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "parser.h"
 #include "mpegaudiodecheader.h"
 #include "libavutil/common.h"
-
+#include "libavformat/apetag.h" // for APE tag.
+#include "libavformat/id3v1.h" // for ID3v1_TAG_SIZE
 
 typedef struct MpegAudioParseContext {
     ParseContext pc;
@@ -35,7 +36,7 @@ typedef struct MpegAudioParseContext {
 
 #define MPA_HEADER_SIZE 4
 
-/* header + layer + bitrate + freq + lsf/mpeg25 */
+/* header + layer + freq + lsf/mpeg25 */
 #define SAME_HEADER_MASK \
    (0xffe00000 | (3 << 17) | (3 << 10) | (3 << 19))
 
@@ -49,12 +50,14 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
     uint32_t state= pc->state;
     int i;
     int next= END_NOT_FOUND;
+    int flush = !buf_size;
 
     for(i=0; i<buf_size; ){
         if(s->frame_size){
             int inc= FFMIN(buf_size - i, s->frame_size);
             i += inc;
             s->frame_size -= inc;
+            state = 0;
 
             if(!s->frame_size){
                 next= i;
@@ -67,24 +70,26 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
 
                 state= (state<<8) + buf[i++];
 
-                ret = ff_mpa_decode_header(avctx, state, &sr, &channels, &frame_size, &bit_rate);
+                ret = ff_mpa_decode_header(state, &sr, &channels, &frame_size, &bit_rate, &codec_id);
                 if (ret < 4) {
                     if (i > 4)
                         s->header_count = -2;
                 } else {
+                    int header_threshold = avctx->codec_id != AV_CODEC_ID_NONE && avctx->codec_id != codec_id;
                     if((state&SAME_HEADER_MASK) != (s->header&SAME_HEADER_MASK) && s->header)
                         s->header_count= -3;
                     s->header= state;
                     s->header_count++;
                     s->frame_size = ret-4;
 
-                    if (s->header_count > 0) {
+                    if (s->header_count > header_threshold) {
                         avctx->sample_rate= sr;
                         avctx->channels   = channels;
                         s1->duration      = frame_size;
+                        avctx->codec_id   = codec_id;
                         if (s->no_bitrate || !avctx->bit_rate) {
                             s->no_bitrate = 1;
-                            avctx->bit_rate += (bit_rate - avctx->bit_rate) / s->header_count;
+                            avctx->bit_rate += (bit_rate - avctx->bit_rate) / (s->header_count - header_threshold);
                         }
                     }
 
@@ -94,7 +99,9 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
                     } else if (codec_id == AV_CODEC_ID_MP3ADU) {
                         avpriv_report_missing_feature(avctx,
                             "MP3ADU full parser");
-                        return AVERROR_PATCHWELCOME;
+                        *poutbuf = NULL;
+                        *poutbuf_size = 0;
+                        return buf_size; /* parsers must not return error codes */
                     }
 
                     break;
@@ -110,6 +117,18 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
         return buf_size;
     }
 
+    if (flush && buf_size >= ID3v1_TAG_SIZE && memcmp(buf, "TAG", 3) == 0) {
+        *poutbuf = NULL;
+        *poutbuf_size = 0;
+        return next;
+    }
+
+    if (flush && buf_size >= APE_TAG_FOOTER_BYTES && memcmp(buf, APE_TAG_PREAMBLE, 8) == 0) {
+        *poutbuf = NULL;
+        *poutbuf_size = 0;
+        return next;
+    }
+
     *poutbuf = buf;
     *poutbuf_size = buf_size;
     return next;
diff --git a/libavcodec/mpegaudio_tablegen.c b/libavcodec/mpegaudio_tablegen.c
index b4c240b..ede7c8e 100644
--- a/libavcodec/mpegaudio_tablegen.c
+++ b/libavcodec/mpegaudio_tablegen.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
 #define CONFIG_HARDCODED_TABLES 0
+#include "libavutil/tablegen.h"
 #include "mpegaudio_tablegen.h"
 #include "tableprint.h"
 
diff --git a/libavcodec/mpegaudio_tablegen.h b/libavcodec/mpegaudio_tablegen.h
index 8a3e51a..0b0ea40 100644
--- a/libavcodec/mpegaudio_tablegen.h
+++ b/libavcodec/mpegaudio_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 #include <stdint.h>
 #include <math.h>
+#include "libavutil/attributes.h"
 
 #define TABLE_4_3_SIZE (8191 + 16)*4
 #if CONFIG_HARDCODED_TABLES
@@ -39,18 +40,33 @@ static float exp_table_float[512];
 static float expval_table_float[512][16];
 
 #define FRAC_BITS 23
+#define IMDCT_SCALAR 1.759
 
-static void mpegaudio_tableinit(void)
+static av_cold void mpegaudio_tableinit(void)
 {
     int i, value, exponent;
+    static const double exp2_lut[4] = {
+        1.00000000000000000000, /* 2 ^ (0 * 0.25) */
+        1.18920711500272106672, /* 2 ^ (1 * 0.25) */
+        M_SQRT2               , /* 2 ^ (2 * 0.25) */
+        1.68179283050742908606, /* 2 ^ (3 * 0.25) */
+    };
+    static double pow43_lut[16];
+    double exp2_base = 2.11758236813575084767080625169910490512847900390625e-22; // 2^(-72)
+    double exp2_val;
+    double pow43_val = 0;
+    for (i = 0; i < 16; ++i)
+        pow43_lut[i] = i * cbrt(i);
+
     for (i = 1; i < TABLE_4_3_SIZE; i++) {
-        double value = i / 4;
         double f, fm;
         int e, m;
-        /* cbrtf() isn't available on all systems, so we use powf(). */
-        f  = value * powf(value, 1.0 / 3.0) * pow(2, (i & 3) * 0.25);
+        double value = i / 4;
+        if ((i & 3) == 0)
+            pow43_val = value / IMDCT_SCALAR * cbrt(value);
+        f  = pow43_val * exp2_lut[i & 3];
         fm = frexp(f, &e);
-        m  = (uint32_t)(fm * (1LL << 31) + 0.5);
+        m  = llrint(fm * (1LL << 31));
         e += FRAC_BITS - 31 + 5 - 100;
 
         /* normalized to FRAC_BITS */
@@ -58,11 +74,12 @@ static void mpegaudio_tableinit(void)
         table_4_3_exp[i]   = -e;
     }
     for (exponent = 0; exponent < 512; exponent++) {
+        if (exponent && (exponent & 3) == 0)
+            exp2_base *= 2;
+        exp2_val = exp2_base * exp2_lut[exponent & 3] / IMDCT_SCALAR;
         for (value = 0; value < 16; value++) {
-            /* cbrtf() isn't available on all systems, so we use powf(). */
-            double f = (double)value * powf(value, 1.0 / 3.0) * pow(2, (exponent - 400) * 0.25 + FRAC_BITS + 5);
-            /* llrint() isn't always available, so round and cast manually. */
-            expval_table_fixed[exponent][value] = (long long int) (f >= 0 ? floor(f + 0.5) : ceil(f - 0.5));
+            double f = pow43_lut[value] * exp2_val;
+            expval_table_fixed[exponent][value] = (f < 0xFFFFFFFF ? llrint(f) : 0xFFFFFFFF);
             expval_table_float[exponent][value] = f;
         }
         exp_table_fixed[exponent] = expval_table_fixed[exponent][1];
diff --git a/libavcodec/mpegaudiodata.c b/libavcodec/mpegaudiodata.c
index 009a02a..0569281 100644
--- a/libavcodec/mpegaudiodata.c
+++ b/libavcodec/mpegaudiodata.c
@@ -2,20 +2,20 @@
  * MPEG Audio common tables
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,11 @@
 
 const uint16_t avpriv_mpa_bitrate_tab[2][3][15] = {
     { {0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448 },
-      {0, 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384 },
-      {0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 } },
-    { {0, 32, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 192, 224, 256},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160}
+      {0, 32, 48, 56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320, 384 },
+      {0, 32, 40, 48,  56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320 } },
+    { {0, 32, 48, 56,  64,  80,  96, 112, 128, 144, 160, 176, 192, 224, 256},
+      {0,  8, 16, 24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160},
+      {0,  8, 16, 24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160}
     }
 };
 
diff --git a/libavcodec/mpegaudiodata.h b/libavcodec/mpegaudiodata.h
index 1609f48..a188150 100644
--- a/libavcodec/mpegaudiodata.h
+++ b/libavcodec/mpegaudiodata.h
@@ -2,20 +2,20 @@
  * MPEG Audio common tables
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodec_fixed.c b/libavcodec/mpegaudiodec_fixed.c
index 2db1e18..ad7ceb2 100644
--- a/libavcodec/mpegaudiodec_fixed.c
+++ b/libavcodec/mpegaudiodec_fixed.c
@@ -1,37 +1,37 @@
 /*
  * Fixed-point MPEG audio decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/samplefmt.h"
 
-#define CONFIG_FLOAT 0
+#define USE_FLOATS 0
 
 #include "mpegaudio.h"
 
-#define SHR(a,b)       ((a)>>(b))
+#define SHR(a,b)       (((int)(a))>>(b))
 /* WARNING: only correct for positive numbers */
 #define FIXR_OLD(a)    ((int)((a) * FRAC_ONE + 0.5))
 #define FIXR(a)        ((int)((a) * FRAC_ONE + 0.5))
 #define FIXHR(a)       ((int)((a) * (1LL<<32) + 0.5))
 #define MULH3(x, y, s) MULH((s)*(x), y)
-#define MULLx(x, y, s) MULL(x,y,s)
+#define MULLx(x, y, s) MULL((int)(x),(y),s)
 #define RENAME(a)      a ## _fixed
 #define OUT_FMT   AV_SAMPLE_FMT_S16
 #define OUT_FMT_P AV_SAMPLE_FMT_S16P
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index 7bdfd90..ddfa5e0 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -2,27 +2,27 @@
  * Float MPEG Audio decoder
  * Copyright (c) 2010 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/samplefmt.h"
 
-#define CONFIG_FLOAT 1
+#define USE_FLOATS 1
 
 #include "mpegaudio.h"
 
@@ -46,6 +46,7 @@ AVCodec ff_mp1float_decoder = {
     .id             = AV_CODEC_ID_MP1,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
@@ -63,6 +64,7 @@ AVCodec ff_mp2float_decoder = {
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
     .decode         = decode_frame,
+    .close          = decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
@@ -78,6 +80,7 @@ AVCodec ff_mp3float_decoder = {
     .id             = AV_CODEC_ID_MP3,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
@@ -94,6 +97,7 @@ AVCodec ff_mp3adufloat_decoder = {
     .id             = AV_CODEC_ID_MP3ADU,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame_adu,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
diff --git a/libavcodec/mpegaudiodec_template.c b/libavcodec/mpegaudiodec_template.c
index e9ea65e..9cce88e 100644
--- a/libavcodec/mpegaudiodec_template.c
+++ b/libavcodec/mpegaudiodec_template.c
@@ -2,20 +2,20 @@
  * MPEG Audio decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
@@ -85,7 +86,7 @@ typedef struct MPADecodeContext {
     int err_recognition;
     AVCodecContext* avctx;
     MPADSPContext mpadsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     AVFrame *frame;
 } MPADecodeContext;
 
@@ -185,6 +186,8 @@ static void compute_band_indexes(MPADecodeContext *s, GranuleDef *g)
 {
     if (g->block_type == 2) {
         if (g->switch_point) {
+            if(s->sample_rate_index == 8)
+                avpriv_request_sample(s->avctx, "switch point in 8khz");
             /* if switched mode, we handle the 36 first samples as
                 long blocks.  For 8000Hz, we handle the 72 first
                 exponents as long blocks */
@@ -214,7 +217,7 @@ static inline int l1_unscale(int n, int mant, int scale_factor)
     shift   = scale_factor_modshift[scale_factor];
     mod     = shift & 3;
     shift >>= 2;
-    val     = MUL64(mant + (-1 << n) + 1, scale_factor_mult[n-1][mod]);
+    val     = MUL64((int)(mant + (-1U << n) + 1), scale_factor_mult[n-1][mod]);
     shift  += n;
     /* NOTE: at this point, 1 <= shift >= 21 + 15 */
     return (int)((val + (1LL << (shift - 1))) >> shift);
@@ -244,10 +247,13 @@ static inline int l3_unscale(int value, int exponent)
     e  = table_4_3_exp  [4 * value + (exponent & 3)];
     m  = table_4_3_value[4 * value + (exponent & 3)];
     e -= exponent >> 2;
-    assert(e >= 1);
-    if (e > 31)
+#ifdef DEBUG
+    if(e < 1)
+        av_log(NULL, AV_LOG_WARNING, "l3_unscale: e is %d\n", e);
+#endif
+    if (e > (SUINT)31)
         return 0;
-    m = (m + (1 << (e - 1))) >> e;
+    m = (m + ((1U << e)>>1)) >> e;
 
     return m;
 }
@@ -274,7 +280,8 @@ static av_cold void decode_init_static(void)
         scale_factor_mult[i][0] = MULLx(norm, FIXR(1.0          * 2.0), FRAC_BITS);
         scale_factor_mult[i][1] = MULLx(norm, FIXR(0.7937005259 * 2.0), FRAC_BITS);
         scale_factor_mult[i][2] = MULLx(norm, FIXR(0.6299605249 * 2.0), FRAC_BITS);
-        ff_dlog(NULL, "%d: norm=%x s=%"PRIx32" %"PRIx32" %"PRIx32"\n", i, norm,
+        ff_dlog(NULL, "%d: norm=%x s=%"PRIx32" %"PRIx32" %"PRIx32"\n", i,
+                (unsigned)norm,
                 scale_factor_mult[i][0],
                 scale_factor_mult[i][1],
                 scale_factor_mult[i][2]);
@@ -308,7 +315,7 @@ static av_cold void decode_init_static(void)
                  INIT_VLC_USE_NEW_STATIC);
         offset += huff_vlc_tables_sizes[i];
     }
-    assert(offset == FF_ARRAY_ELEMS(huff_vlc_tables));
+    av_assert0(offset == FF_ARRAY_ELEMS(huff_vlc_tables));
 
     offset = 0;
     for (i = 0; i < 2; i++) {
@@ -319,7 +326,7 @@ static av_cold void decode_init_static(void)
                  INIT_VLC_USE_NEW_STATIC);
         offset += huff_quad_vlc_tables_sizes[i];
     }
-    assert(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables));
+    av_assert0(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables));
 
     for (i = 0; i < 9; i++) {
         k = 0;
@@ -372,7 +379,7 @@ static av_cold void decode_init_static(void)
 
         for (j = 0; j < 2; j++) {
             e = -(j + 1) * ((i + 1) >> 1);
-            f = pow(2.0, e / 4.0);
+            f = exp2(e / 4.0);
             k = i & 1;
             is_table_lsf[j][k ^ 1][i] = FIXR(f);
             is_table_lsf[j][k    ][i] = FIXR(1.0);
@@ -383,11 +390,11 @@ static av_cold void decode_init_static(void)
     }
 
     for (i = 0; i < 8; i++) {
-        float ci, cs, ca;
+        double ci, cs, ca;
         ci = ci_table[i];
         cs = 1.0 / sqrt(1.0 + ci * ci);
         ca = cs * ci;
-#if !CONFIG_FLOAT
+#if !USE_FLOATS
         csa_table[i][0] = FIXHR(cs/4);
         csa_table[i][1] = FIXHR(ca/4);
         csa_table[i][2] = FIXHR(ca/4) + FIXHR(cs/4);
@@ -401,6 +408,16 @@ static av_cold void decode_init_static(void)
     }
 }
 
+#if USE_FLOATS
+static av_cold int decode_close(AVCodecContext * avctx)
+{
+    MPADecodeContext *s = avctx->priv_data;
+    av_freep(&s->fdsp);
+
+    return 0;
+}
+#endif
+
 static av_cold int decode_init(AVCodecContext * avctx)
 {
     static int initialized_tables = 0;
@@ -413,7 +430,12 @@ static av_cold int decode_init(AVCodecContext * avctx)
 
     s->avctx = avctx;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#if USE_FLOATS
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+#endif
+
     ff_mpadsp_init(&s->mpadsp);
 
     if (avctx->request_sample_fmt == OUT_FMT &&
@@ -436,9 +458,9 @@ static av_cold int decode_init(AVCodecContext * avctx)
 
 /* 12 points IMDCT. We compute it "by hand" by factorizing obvious
    cases. */
-static void imdct12(INTFLOAT *out, INTFLOAT *in)
+static void imdct12(INTFLOAT *out, SUINTFLOAT *in)
 {
-    INTFLOAT in0, in1, in2, in3, in4, in5, t1, t2;
+    SUINTFLOAT in0, in1, in2, in3, in4, in5, t1, t2;
 
     in0  = in[0*3];
     in1  = in[1*3] + in[0*3];
@@ -803,7 +825,7 @@ static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos,
         s->gb           = s->in_gb;
         s->in_gb.buffer = NULL;
         s->extrasize    = 0;
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert2((get_bits_count(&s->gb) & 7) == 0);
         skip_bits_long(&s->gb, *pos - *end_pos);
         *end_pos2 =
         *end_pos  = *end_pos2 + get_bits_count(&s->gb) - *pos;
@@ -811,13 +833,13 @@ static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos,
     }
 }
 
-/* Following is a optimized code for
+/* Following is an optimized code for
             INTFLOAT v = *src
             if(get_bits1(&s->gb))
                 v = -v;
             *dst = v;
 */
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #define READ_FLIP_SIGN(dst,src)                     \
     v = AV_RN32A(src) ^ (get_bits1(&s->gb) << 31);  \
     AV_WN32A(dst, v);
@@ -932,7 +954,7 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
                 s_index -= 4;
                 skip_bits_long(&s->gb, last_pos - pos);
                 av_log(s->avctx, AV_LOG_INFO, "overread, skip %d enddists: %d %d\n", last_pos - pos, end_pos-pos, end_pos2-pos);
-                if(s->err_recognition & AV_EF_BITSTREAM)
+                if(s->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))
                     s_index=0;
                 break;
             }
@@ -959,10 +981,10 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
     }
     /* skip extension bits */
     bits_left = end_pos2 - get_bits_count(&s->gb);
-    if (bits_left < 0 && (s->err_recognition & AV_EF_BUFFER)) {
+    if (bits_left < 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_COMPLIANT))) {
         av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left);
         s_index=0;
-    } else if (bits_left > 0 && (s->err_recognition & AV_EF_BUFFER)) {
+    } else if (bits_left > 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE))) {
         av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left);
         s_index = 0;
     }
@@ -1017,7 +1039,8 @@ static void compute_stereo(MPADecodeContext *s, GranuleDef *g0, GranuleDef *g1)
 {
     int i, j, k, l;
     int sf_max, sf, len, non_zero_found;
-    INTFLOAT (*is_tab)[16], *tab0, *tab1, tmp0, tmp1, v1, v2;
+    INTFLOAT (*is_tab)[16], *tab0, *tab1, v1, v2;
+    SUINTFLOAT tmp0, tmp1;
     int non_zero_found_short[3];
 
     /* intensity stereo */
@@ -1126,8 +1149,8 @@ found2:
         /* ms stereo ONLY */
         /* NOTE: the 1/sqrt(2) normalization factor is included in the
            global gain */
-#if CONFIG_FLOAT
-       s->fdsp.butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576);
+#if USE_FLOATS
+       s->fdsp->butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576);
 #else
         tab0 = g0->sb_hybrid;
         tab1 = g1->sb_hybrid;
@@ -1141,7 +1164,18 @@ found2:
     }
 }
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
+#if HAVE_MIPSFPU
+#   include "mips/compute_antialias_float.h"
+#endif /* HAVE_MIPSFPU */
+#else
+#if HAVE_MIPSDSP
+#   include "mips/compute_antialias_fixed.h"
+#endif /* HAVE_MIPSDSP */
+#endif /* USE_FLOATS */
+
+#ifndef compute_antialias
+#if USE_FLOATS
 #define AA(j) do {                                                      \
         float tmp0 = ptr[-1-j];                                         \
         float tmp1 = ptr[   j];                                         \
@@ -1150,9 +1184,9 @@ found2:
     } while (0)
 #else
 #define AA(j) do {                                              \
-        int tmp0 = ptr[-1-j];                                   \
-        int tmp1 = ptr[   j];                                   \
-        int tmp2 = MULH(tmp0 + tmp1, csa_table[j][0]);          \
+        SUINT tmp0 = ptr[-1-j];                                   \
+        SUINT tmp1 = ptr[   j];                                   \
+        SUINT tmp2 = MULH(tmp0 + tmp1, csa_table[j][0]);          \
         ptr[-1-j] = 4 * (tmp2 - MULH(tmp1, csa_table[j][2]));   \
         ptr[   j] = 4 * (tmp2 + MULH(tmp0, csa_table[j][3]));   \
     } while (0)
@@ -1187,6 +1221,7 @@ static void compute_antialias(MPADecodeContext *s, GranuleDef *g)
         ptr += 18;
     }
 }
+#endif /* compute_antialias */
 
 static void compute_imdct(MPADecodeContext *s, GranuleDef *g,
                           INTFLOAT *sb_samples, INTFLOAT *mdct_buf)
@@ -1358,7 +1393,7 @@ static int mp_decode_layer3(MPADecodeContext *s)
         const uint8_t *ptr = s->gb.buffer + (get_bits_count(&s->gb)>>3);
         s->extrasize = av_clip((get_bits_left(&s->gb) >> 3) - s->extrasize, 0,
                                FFMAX(0, LAST_BUF_SIZE - s->last_buf_size));
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert1((get_bits_count(&s->gb) & 7) == 0);
         /* now we get bits from the main_data_begin offset */
         ff_dlog(s->avctx, "seekback:%d, lastbuf:%d\n",
                 main_data_begin, s->last_buf_size);
@@ -1548,9 +1583,6 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
     default:
         nb_frames = mp_decode_layer3(s);
 
-        if (nb_frames < 0)
-            return nb_frames;
-
         s->last_buf_size=0;
         if (s->in_gb.buffer) {
             align_get_bits(&s->gb);
@@ -1566,26 +1598,27 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
         }
 
         align_get_bits(&s->gb);
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert1((get_bits_count(&s->gb) & 7) == 0);
         i = (get_bits_left(&s->gb) >> 3) - s->extrasize;
         if (i < 0 || i > BACKSTEP_SIZE || nb_frames < 0) {
             if (i < 0)
                 av_log(s->avctx, AV_LOG_ERROR, "invalid new backstep %d\n", i);
             i = FFMIN(BACKSTEP_SIZE, buf_size - HEADER_SIZE);
         }
-        assert(i <= buf_size - HEADER_SIZE && i >= 0);
+        av_assert1(i <= buf_size - HEADER_SIZE && i >= 0);
         memcpy(s->last_buf + s->last_buf_size, s->gb.buffer + buf_size - HEADER_SIZE - i, i);
         s->last_buf_size += i;
     }
 
+    if(nb_frames < 0)
+        return nb_frames;
+
     /* get output buffer */
     if (!samples) {
-        av_assert0(s->frame != NULL);
+        av_assert0(s->frame);
         s->frame->nb_samples = s->avctx->frame_size;
-        if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0)
             return ret;
-        }
         samples = (OUT_INT **)s->frame->extended_data;
     }
 
@@ -1621,11 +1654,21 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     uint32_t header;
     int ret;
 
+    int skipped = 0;
+    while(buf_size && !*buf){
+        buf++;
+        buf_size--;
+        skipped++;
+    }
+
     if (buf_size < HEADER_SIZE)
         return AVERROR_INVALIDDATA;
 
     header = AV_RB32(buf);
-
+    if (header>>8 == AV_RB32("TAG")>>8) {
+        av_log(avctx, AV_LOG_DEBUG, "discarding ID3 tag\n");
+        return buf_size + skipped;
+    }
     ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)s, header);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Header missing\n");
@@ -1641,6 +1684,14 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     if (!avctx->bit_rate)
         avctx->bit_rate = s->bit_rate;
 
+    if (s->frame_size <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
+        return AVERROR_INVALIDDATA;
+    } else if (s->frame_size < buf_size) {
+        av_log(avctx, AV_LOG_DEBUG, "incorrect frame size - multiple frames in buffer?\n");
+        buf_size= s->frame_size;
+    }
+
     s->frame = data;
 
     ret = mp_decode_frame(s, NULL, buf, buf_size);
@@ -1661,13 +1712,15 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
             return ret;
     }
     s->frame_size = 0;
-    return buf_size;
+    return buf_size + skipped;
 }
 
 static void mp_flush(MPADecodeContext *ctx)
 {
     memset(ctx->synth_buf, 0, sizeof(ctx->synth_buf));
+    memset(ctx->mdct_buf, 0, sizeof(ctx->mdct_buf));
     ctx->last_buf_size = 0;
+    ctx->dither_state = 0;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -1684,6 +1737,7 @@ static int decode_frame_adu(AVCodecContext *avctx, void *data,
     MPADecodeContext *s = avctx->priv_data;
     uint32_t header;
     int len, ret;
+    int av_unused out_size;
 
     len = buf_size;
 
@@ -1776,8 +1830,11 @@ static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
     MP3On4DecodeContext *s = avctx->priv_data;
     int i;
 
+    if (s->mp3decctx[0])
+        av_freep(&s->mp3decctx[0]->fdsp);
+
     for (i = 0; i < s->frames; i++)
-        av_free(s->mp3decctx[i]);
+        av_freep(&s->mp3decctx[i]);
 
     return 0;
 }
@@ -1836,6 +1893,7 @@ static av_cold int decode_init_mp3on4(AVCodecContext * avctx)
         s->mp3decctx[i]->adu_mode = 1;
         s->mp3decctx[i]->avctx = avctx;
         s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
+        s->mp3decctx[i]->fdsp = s->mp3decctx[0]->fdsp;
     }
 
     return 0;
@@ -1871,10 +1929,8 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = MPA_FRAME_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out_samples = (OUT_INT **)frame->extended_data;
 
     // Discard too short frames
@@ -1888,7 +1944,7 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         fsize = AV_RB16(buf) >> 4;
         fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
         m     = s->mp3decctx[fr];
-        assert(m != NULL);
+        av_assert1(m);
 
         if (fsize < HEADER_SIZE) {
             av_log(avctx, AV_LOG_ERROR, "Frame size smaller than header size\n");
@@ -1897,8 +1953,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         header = (AV_RB32(buf) & 0x000fffff) | s->syncword; // patch header
 
         ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
-        if (ret < 0) // Bad header, discard block
-            break;
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Bad header, discard block\n");
+            return AVERROR_INVALIDDATA;
+        }
 
         if (ch + m->nb_channels > avctx->channels ||
             s->coff[fr] + m->nb_channels > avctx->channels) {
@@ -1912,8 +1970,13 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         if (m->nb_channels > 1)
             outptr[1] = out_samples[s->coff[fr] + 1];
 
-        if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0)
-            return ret;
+        if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "failed to decode channel %d\n", ch);
+            memset(outptr[0], 0, MPA_FRAME_SIZE*sizeof(OUT_INT));
+            if (m->nb_channels > 1)
+                memset(outptr[1], 0, MPA_FRAME_SIZE*sizeof(OUT_INT));
+            ret = m->nb_channels * MPA_FRAME_SIZE*sizeof(OUT_INT);
+        }
 
         out_size += ret;
         buf      += fsize;
@@ -1921,6 +1984,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
 
         avctx->bit_rate += m->bit_rate;
     }
+    if (ch != avctx->channels) {
+        av_log(avctx, AV_LOG_ERROR, "failed to decode all channels\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     /* update codec info */
     avctx->sample_rate = s->mp3decctx[0]->sample_rate;
diff --git a/libavcodec/mpegaudiodecheader.c b/libavcodec/mpegaudiodecheader.c
index a315965..6cc79f1 100644
--- a/libavcodec/mpegaudiodecheader.c
+++ b/libavcodec/mpegaudiodecheader.c
@@ -2,20 +2,20 @@
  * MPEG Audio header decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -119,8 +119,7 @@ int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header)
     return 0;
 }
 
-int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
-                         int *channels, int *frame_size, int *bit_rate)
+int ff_mpa_decode_header(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate, enum AVCodecID *codec_id)
 {
     MPADecodeHeader s1, *s = &s1;
 
@@ -130,17 +129,17 @@ int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
 
     switch(s->layer) {
     case 1:
-        avctx->codec_id = AV_CODEC_ID_MP1;
+        *codec_id = AV_CODEC_ID_MP1;
         *frame_size = 384;
         break;
     case 2:
-        avctx->codec_id = AV_CODEC_ID_MP2;
+        *codec_id = AV_CODEC_ID_MP2;
         *frame_size = 1152;
         break;
     default:
     case 3:
-        if (avctx->codec_id != AV_CODEC_ID_MP3ADU)
-            avctx->codec_id = AV_CODEC_ID_MP3;
+        if (*codec_id != AV_CODEC_ID_MP3ADU)
+            *codec_id = AV_CODEC_ID_MP3;
         if (s->lsf)
             *frame_size = 576;
         else
@@ -153,10 +152,3 @@ int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
     *bit_rate = s->bit_rate;
     return s->frame_size;
 }
-
-#if LIBAVCODEC_VERSION_MAJOR < 57
-int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate)
-{
-    return ff_mpa_decode_header(avctx, head, sample_rate, channels, frame_size, bit_rate);
-}
-#endif
diff --git a/libavcodec/mpegaudiodecheader.h b/libavcodec/mpegaudiodecheader.h
index 089a508..1da2a4c 100644
--- a/libavcodec/mpegaudiodecheader.h
+++ b/libavcodec/mpegaudiodecheader.h
@@ -2,20 +2,20 @@
  * MPEG Audio header decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,18 +54,17 @@ int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header);
 
 /* useful helper to get MPEG audio stream info. Return -1 if error in
    header, otherwise the coded frame size in bytes */
-int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
-                         int *channels, int *frame_size, int *bitrate);
-
-#if LIBAVCODEC_VERSION_MAJOR < 57
-int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate);
-#endif
+int ff_mpa_decode_header(uint32_t head, int *sample_rate,
+                         int *channels, int *frame_size, int *bitrate, enum AVCodecID *codec_id);
 
 /* fast header check for resync */
 static inline int ff_mpa_check_header(uint32_t header){
     /* header */
     if ((header & 0xffe00000) != 0xffe00000)
         return -1;
+    /* version check */
+    if ((header & (3<<19)) == 1<<19)
+        return -1;
     /* layer check */
     if ((header & (3<<17)) == 0)
         return -1;
diff --git a/libavcodec/mpegaudiodectab.h b/libavcodec/mpegaudiodectab.h
index 1221657..accd12b 100644
--- a/libavcodec/mpegaudiodectab.h
+++ b/libavcodec/mpegaudiodectab.h
@@ -2,20 +2,20 @@
  * MPEG Audio decoder
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c
index 58ea1d1..3cafca2 100644
--- a/libavcodec/mpegaudiodsp.c
+++ b/libavcodec/mpegaudiodsp.c
@@ -1,36 +1,40 @@
 /*
  * Copyright (c) 2011 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/thread.h"
 #include "mpegaudiodsp.h"
 #include "dct.h"
 #include "dct32.h"
 
+static AVOnce mpadsp_float_table_init = AV_ONCE_INIT;
+static AVOnce mpadsp_fixed_table_init = AV_ONCE_INIT;
+
 av_cold void ff_mpadsp_init(MPADSPContext *s)
 {
     DCTContext dct;
 
     ff_dct_init(&dct, 5, DCT_II);
-    ff_init_mpadsp_tabs_float();
-    ff_init_mpadsp_tabs_fixed();
+    ff_thread_once(&mpadsp_float_table_init, &ff_init_mpadsp_tabs_float);
+    ff_thread_once(&mpadsp_fixed_table_init, &ff_init_mpadsp_tabs_fixed);
 
     s->apply_window_float = ff_mpadsp_apply_window_float;
     s->apply_window_fixed = ff_mpadsp_apply_window_fixed;
@@ -45,4 +49,6 @@ av_cold void ff_mpadsp_init(MPADSPContext *s)
     if (ARCH_ARM)     ff_mpadsp_init_arm(s);
     if (ARCH_PPC)     ff_mpadsp_init_ppc(s);
     if (ARCH_X86)     ff_mpadsp_init_x86(s);
+    if (HAVE_MIPSFPU)   ff_mpadsp_init_mipsfpu(s);
+    if (HAVE_MIPSDSP) ff_mpadsp_init_mipsdsp(s);
 }
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h
index e0e872f..0e4352d 100644
--- a/libavcodec/mpegaudiodsp.h
+++ b/libavcodec/mpegaudiodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@ typedef struct MPADSPContext {
                                ptrdiff_t incr);
     void (*dct32_float)(float *dst, const float *src);
     void (*dct32_fixed)(int *dst, const int *src);
+
     void (*imdct36_blocks_float)(float *out, float *buf, float *in,
                                  int count, int switch_point, int block_type);
     void (*imdct36_blocks_fixed)(int *out, int *buf, int *in,
@@ -62,6 +63,8 @@ void ff_mpadsp_init_aarch64(MPADSPContext *s);
 void ff_mpadsp_init_arm(MPADSPContext *s);
 void ff_mpadsp_init_ppc(MPADSPContext *s);
 void ff_mpadsp_init_x86(MPADSPContext *s);
+void ff_mpadsp_init_mipsfpu(MPADSPContext *s);
+void ff_mpadsp_init_mipsdsp(MPADSPContext *s);
 
 void ff_mpa_synth_init_float(float *window);
 void ff_mpa_synth_init_fixed(int32_t *window);
diff --git a/libavcodec/mpegaudiodsp_data.c b/libavcodec/mpegaudiodsp_data.c
index 5cf86b8..4550de9 100644
--- a/libavcodec/mpegaudiodsp_data.c
+++ b/libavcodec/mpegaudiodsp_data.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodsp_fixed.c b/libavcodec/mpegaudiodsp_fixed.c
index 3c49a56..83c9d66 100644
--- a/libavcodec/mpegaudiodsp_fixed.c
+++ b/libavcodec/mpegaudiodsp_fixed.c
@@ -1,20 +1,20 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#define CONFIG_FLOAT 0
+#define USE_FLOATS 0
 #include "mpegaudiodsp_template.c"
diff --git a/libavcodec/mpegaudiodsp_float.c b/libavcodec/mpegaudiodsp_float.c
index 2d8d53e..c45b136 100644
--- a/libavcodec/mpegaudiodsp_float.c
+++ b/libavcodec/mpegaudiodsp_float.c
@@ -1,20 +1,20 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#define CONFIG_FLOAT 1
+#define USE_FLOATS 1
 #include "mpegaudiodsp_template.c"
diff --git a/libavcodec/mpegaudiodsp_template.c b/libavcodec/mpegaudiodsp_template.c
index b8836c9..e531f8a 100644
--- a/libavcodec/mpegaudiodsp_template.c
+++ b/libavcodec/mpegaudiodsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "mpegaudiodsp.h"
 #include "mpegaudio.h"
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #define RENAME(n) n##_float
 
 static inline float round_sample(float *sum)
@@ -63,8 +63,8 @@ static inline int round_sample(int64_t *sum)
 #   define MACS(rt, ra, rb) MAC64(rt, ra, rb)
 #   define MLSS(rt, ra, rb) MLS64(rt, ra, rb)
 #   define MULH3(x, y, s) MULH((s)*(x), y)
-#   define MULLx(x, y, s) MULL(x,y,s)
-#   define SHR(a,b)       ((a)>>(b))
+#   define MULLx(x, y, s) MULL((int)(x),(y),s)
+#   define SHR(a,b)       (((int)(a))>>(b))
 #   define FIXR(a)        ((int)((a) * FRAC_ONE + 0.5))
 #   define FIXHR(a)       ((int)((a) * (1LL<<32) + 0.5))
 #endif
@@ -125,7 +125,7 @@ void RENAME(ff_mpadsp_apply_window)(MPA_INT *synth_buf, MPA_INT *window,
     register const MPA_INT *w, *w2, *p;
     int j;
     OUT_INT *samples2;
-#if CONFIG_FLOAT
+#if USE_FLOATS
     float sum, sum2;
 #else
     int64_t sum, sum2;
@@ -200,7 +200,7 @@ av_cold void RENAME(ff_mpa_synth_init)(MPA_INT *window)
     for(i=0;i<257;i++) {
         INTFLOAT v;
         v = ff_mpa_enwindow[i];
-#if CONFIG_FLOAT
+#if USE_FLOATS
         v *= 1.0 / (1LL<<(16 + FRAC_BITS));
 #endif
         window[i] = v;
@@ -243,7 +243,7 @@ av_cold void RENAME(ff_init_mpadsp_tabs)(void)
                 else if (i <  18) d = 1;
             }
             //merge last stage of imdct into the window coefficients
-            d *= 0.5 / cos(M_PI * (2 * i + 19) / 72);
+            d *= 0.5 * IMDCT_SCALAR / cos(M_PI * (2 * i + 19) / 72);
 
             if (j == 2)
                 RENAME(ff_mdct_win)[j][i/3] = FIXHR((d / (1<<5)));
@@ -300,11 +300,11 @@ static const INTFLOAT icos36h[9] = {
 };
 
 /* using Lee like decomposition followed by hand coded 9 points DCT */
-static void imdct36(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, INTFLOAT *win)
+static void imdct36(INTFLOAT *out, INTFLOAT *buf, SUINTFLOAT *in, INTFLOAT *win)
 {
     int i, j;
-    INTFLOAT t0, t1, t2, t3, s0, s1, s2, s3;
-    INTFLOAT tmp[18], *tmp1, *in1;
+    SUINTFLOAT t0, t1, t2, t3, s0, s1, s2, s3;
+    SUINTFLOAT tmp[18], *tmp1, *in1;
 
     for (i = 17; i >= 1; i--)
         in[i] += in[i-1];
@@ -398,3 +398,4 @@ void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in,
         out++;
     }
 }
+
diff --git a/libavcodec/mpegaudioenc_fixed.c b/libavcodec/mpegaudioenc_fixed.c
new file mode 100644
index 0000000..022b6fe
--- /dev/null
+++ b/libavcodec/mpegaudioenc_fixed.c
@@ -0,0 +1,41 @@
+/*
+ * The simplest mpeg audio layer 2 encoder
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mpegaudioenc_template.c"
+
+AVCodec ff_mp2fixed_encoder = {
+    .name                  = "mp2fixed",
+    .long_name             = NULL_IF_CONFIG_SMALL("MP2 fixed point (MPEG audio layer 2)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP2,
+    .priv_data_size        = sizeof(MpegAudioContext),
+    .init                  = MPA_encode_init,
+    .encode2               = MPA_encode_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){
+        44100, 48000,  32000, 22050, 24000, 16000, 0
+    },
+    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
+                                                 AV_CH_LAYOUT_STEREO,
+                                                 0 },
+    .defaults              = mp2_defaults,
+};
diff --git a/libavcodec/mpegaudioenc_float.c b/libavcodec/mpegaudioenc_float.c
new file mode 100644
index 0000000..4d4ab2d
--- /dev/null
+++ b/libavcodec/mpegaudioenc_float.c
@@ -0,0 +1,42 @@
+/*
+ * The simplest mpeg audio layer 2 encoder
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FLOATS 1
+#include "mpegaudioenc_template.c"
+
+AVCodec ff_mp2_encoder = {
+    .name                  = "mp2",
+    .long_name             = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP2,
+    .priv_data_size        = sizeof(MpegAudioContext),
+    .init                  = MPA_encode_init,
+    .encode2               = MPA_encode_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){
+        44100, 48000,  32000, 22050, 24000, 16000, 0
+    },
+    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
+                                                 AV_CH_LAYOUT_STEREO,
+                                                 0 },
+    .defaults              = mp2_defaults,
+};
diff --git a/libavcodec/mpegaudioenc.c b/libavcodec/mpegaudioenc_template.c
index 2be8b7f..93363fe 100644
--- a/libavcodec/mpegaudioenc.c
+++ b/libavcodec/mpegaudioenc_template.c
@@ -2,20 +2,20 @@
  * The simplest mpeg audio layer 2 encoder
  * Copyright (c) 2000, 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,12 @@ typedef struct MpegAudioContext {
     int16_t filter_bank[512];
     int scale_factor_table[64];
     unsigned char scale_diff_table[128];
+#if USE_FLOATS
     float scale_factor_inv_table[64];
+#else
+    int8_t scale_factor_shift[64];
+    unsigned short scale_factor_mult[64];
+#endif
     unsigned short total_quant_bits[17]; /* total number of bits per allocation group */
 } MpegAudioContext;
 
@@ -103,10 +108,15 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
     s->freq_index = i;
 
     /* encoding bitrate & frequency */
-    for(i=0;i<15;i++) {
+    for(i=1;i<15;i++) {
         if (avpriv_mpa_bitrate_tab[s->lsf][1][i] == bitrate)
             break;
     }
+    if (i == 15 && !avctx->bit_rate) {
+        i = 14;
+        bitrate = avpriv_mpa_bitrate_tab[s->lsf][1][i];
+        avctx->bit_rate = bitrate * 1000;
+    }
     if (i == 15){
         av_log(avctx, AV_LOG_ERROR, "bitrate %d is not allowed in mp2\n", bitrate);
         return AVERROR(EINVAL);
@@ -149,11 +159,17 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
     }
 
     for(i=0;i<64;i++) {
-        v = (int)(pow(2.0, (3 - i) / 3.0) * (1 << 20));
+        v = (int)(exp2((3 - i) / 3.0) * (1 << 20));
         if (v <= 0)
             v = 1;
         s->scale_factor_table[i] = v;
-        s->scale_factor_inv_table[i] = pow(2.0, -(3 - i) / 3.0) / (float)(1 << 20);
+#if USE_FLOATS
+        s->scale_factor_inv_table[i] = exp2(-(3 - i) / 3.0) / (float)(1 << 20);
+#else
+#define P 15
+        s->scale_factor_shift[i] = 21 - P - (i / 3);
+        s->scale_factor_mult[i] = (1 << P) * exp2((i % 3) / 3.0);
+#endif
     }
     for(i=0;i<128;i++) {
         v = i - 64;
@@ -228,11 +244,11 @@ static void idct32(int *out, int *tab)
     do {
         int x1, x2, x3, x4;
 
-        x3 = MUL(t[16], FIX(SQRT2*0.5));
+        x3 = MUL(t[16], FIX(M_SQRT2*0.5));
         x4 = t[0] - x3;
         x3 = t[0] + x3;
 
-        x2 = MUL(-(t[24] + t[8]), FIX(SQRT2*0.5));
+        x2 = MUL(-(t[24] + t[8]), FIX(M_SQRT2*0.5));
         x1 = MUL((t[8] - x2), xp[0]);
         x2 = MUL((t[8] + x2), xp[1]);
 
@@ -397,7 +413,7 @@ static void compute_scale_factors(MpegAudioContext *s,
             ff_dlog(NULL, "%2d:%d in=%x %x %d\n",
                     j, i, vmax, s->scale_factor_table[index], index);
             /* store the scale factor */
-            assert(index >=0 && index <= 63);
+            av_assert2(index >=0 && index <= 63);
             sf[i] = index;
         }
 
@@ -459,7 +475,7 @@ static void compute_scale_factors(MpegAudioContext *s,
             sf[1] = sf[2] = sf[0];
             break;
         default:
-            assert(0); //cannot happen
+            av_assert2(0); //cannot happen
             code = 0;           /* kill warning */
         }
 
@@ -579,7 +595,7 @@ static void compute_bit_allocation(MpegAudioContext *s,
         }
     }
     *padding = max_frame_size - current_frame_size;
-    assert(*padding >= 0);
+    av_assert0(*padding >= 0);
 }
 
 /*
@@ -668,14 +684,36 @@ static void encode_frame(MpegAudioContext *s,
                         qindex = s->alloc_table[j+b];
                         steps = ff_mpa_quant_steps[qindex];
                         for(m=0;m<3;m++) {
-                            float a;
                             sample = s->sb_samples[ch][k][l + m][i];
                             /* divide by scale factor */
-                            a = (float)sample * s->scale_factor_inv_table[s->scale_factors[ch][i][k]];
-                            q[m] = (int)((a + 1.0) * steps * 0.5);
+#if USE_FLOATS
+                            {
+                                float a;
+                                a = (float)sample * s->scale_factor_inv_table[s->scale_factors[ch][i][k]];
+                                q[m] = (int)((a + 1.0) * steps * 0.5);
+                            }
+#else
+                            {
+                                int q1, e, shift, mult;
+                                e = s->scale_factors[ch][i][k];
+                                shift = s->scale_factor_shift[e];
+                                mult = s->scale_factor_mult[e];
+
+                                /* normalize to P bits */
+                                if (shift < 0)
+                                    q1 = sample << (-shift);
+                                else
+                                    q1 = sample >> shift;
+                                q1 = (q1 * mult) >> P;
+                                q1 += 1 << P;
+                                if (q1 < 0)
+                                    q1 = 0;
+                                q[m] = (q1 * (unsigned)steps) >> (P + 1);
+                            }
+#endif
                             if (q[m] >= steps)
                                 q[m] = steps - 1;
-                            assert(q[m] >= 0 && q[m] < steps);
+                            av_assert2(q[m] >= 0 && q[m] < steps);
                         }
                         bits = ff_mpa_quant_bits[qindex];
                         if (bits < 0) {
@@ -725,10 +763,8 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
     compute_bit_allocation(s, smr, bit_alloc, &padding);
 
-    if ((ret = ff_alloc_packet(avpkt, MPA_MAX_CODED_FRAME_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MPA_MAX_CODED_FRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
@@ -743,25 +779,7 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 }
 
 static const AVCodecDefault mp2_defaults[] = {
-    { "b", "384000" },
+    { "b", "0" },
     { NULL },
 };
 
-AVCodec ff_mp2_encoder = {
-    .name                  = "mp2",
-    .long_name             = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
-    .type                  = AVMEDIA_TYPE_AUDIO,
-    .id                    = AV_CODEC_ID_MP2,
-    .priv_data_size        = sizeof(MpegAudioContext),
-    .init                  = MPA_encode_init,
-    .encode2               = MPA_encode_frame,
-    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                            AV_SAMPLE_FMT_NONE },
-    .supported_samplerates = (const int[]){
-        44100, 48000,  32000, 22050, 24000, 16000, 0
-    },
-    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
-                                                 AV_CH_LAYOUT_STEREO,
-                                                 0 },
-    .defaults              = mp2_defaults,
-};
diff --git a/libavcodec/mpegaudiotab.h b/libavcodec/mpegaudiotab.h
index d30ef1b..bb2e5de 100644
--- a/libavcodec/mpegaudiotab.h
+++ b/libavcodec/mpegaudiotab.h
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2000, 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,8 +33,6 @@
 #include <stdint.h>
 #include "mpegaudio.h"
 
-#define SQRT2 1.41421356237309514547
-
 static const int costab32[30] = {
     FIX(0.54119610014619701222),
     FIX(1.3065629648763763537),
diff --git a/libavcodec/mpegpicture.c b/libavcodec/mpegpicture.c
index 1d9544b..c0e0690 100644
--- a/libavcodec/mpegpicture.c
+++ b/libavcodec/mpegpicture.c
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related picture management functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 
 #include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
 #include "motion_est.h"
@@ -56,17 +57,26 @@ do {\
 int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me,
                             ScratchpadContext *sc, int linesize)
 {
-    int alloc_size = FFALIGN(FFABS(linesize) + 32, 32);
+    int alloc_size = FFALIGN(FFABS(linesize) + 64, 32);
+
+    if (avctx->hwaccel)
+        return 0;
+
+    if (linesize < 24) {
+        av_log(avctx, AV_LOG_ERROR, "Image too small, temporary buffers cannot function\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     // edge emu needs blocksize + filter length - 1
     // (= 17x17 for  halfpel / 21x21 for H.264)
     // VC-1 computes luma and chroma simultaneously and needs 19X19 + 9x9
     // at uvlinesize. It supports only YUV420 so 24x24 is enough
     // linesize * interlaced * MBsize
-    FF_ALLOCZ_OR_GOTO(avctx, sc->edge_emu_buffer, alloc_size * 2 * 24,
+    // we also use this buffer for encoding in encode_mb_internal() needig an additional 32 lines
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, sc->edge_emu_buffer, alloc_size, 4 * 70,
                       fail);
 
-    FF_ALLOCZ_OR_GOTO(avctx, me->scratchpad, alloc_size * 2 * 16 * 3,
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, me->scratchpad, alloc_size, 4 * 16 * 2,
                       fail)
     me->temp            = me->scratchpad;
     sc->rd_scratchpad   = me->scratchpad;
@@ -138,15 +148,18 @@ static int alloc_frame_buffer(AVCodecContext *avctx,  Picture *pic,
         }
     }
 
-    if (linesize && (linesize   != pic->f->linesize[0] ||
-                     uvlinesize != pic->f->linesize[1])) {
+    if ((linesize   &&   linesize != pic->f->linesize[0]) ||
+        (uvlinesize && uvlinesize != pic->f->linesize[1])) {
         av_log(avctx, AV_LOG_ERROR,
-               "get_buffer() failed (stride changed)\n");
+               "get_buffer() failed (stride changed: linesize=%d/%d uvlinesize=%d/%d)\n",
+               linesize,   pic->f->linesize[0],
+               uvlinesize, pic->f->linesize[1]);
         ff_mpeg_unref_picture(avctx, pic);
         return -1;
     }
 
-    if (pic->f->linesize[1] != pic->f->linesize[2]) {
+    if (av_pix_fmt_count_planes(pic->f->format) > 2 &&
+        pic->f->linesize[1] != pic->f->linesize[2]) {
         av_log(avctx, AV_LOG_ERROR,
                "get_buffer() failed (uv stride mismatch)\n");
         ff_mpeg_unref_picture(avctx, pic);
@@ -165,8 +178,8 @@ static int alloc_frame_buffer(AVCodecContext *avctx,  Picture *pic,
     return 0;
 }
 
-static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
-                                int mb_stride, int mb_height, int b8_stride)
+static int alloc_picture_tables(AVCodecContext *avctx, Picture *pic, int encoding, int out_format,
+                                int mb_stride, int mb_width, int mb_height, int b8_stride)
 {
     const int big_mb_num    = mb_stride * (mb_height + 1) + 1;
     const int mb_array_size = mb_stride * mb_height;
@@ -189,7 +202,11 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
             return AVERROR(ENOMEM);
     }
 
-    if (out_format == FMT_H263 || encoding) {
+    if (out_format == FMT_H263 || encoding ||
+#if FF_API_DEBUG_MV
+        avctx->debug_mv ||
+#endif
+        (avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS)) {
         int mv_size        = 2 * (b8_array_size + 4) * sizeof(int16_t);
         int ref_index_size = 4 * mb_array_size;
 
@@ -201,6 +218,9 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
         }
     }
 
+    pic->alloc_mb_width  = mb_width;
+    pic->alloc_mb_height = mb_height;
+
     return 0;
 }
 
@@ -211,16 +231,21 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
 int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
                      ScratchpadContext *sc, int shared, int encoding,
                      int chroma_x_shift, int chroma_y_shift, int out_format,
-                     int mb_stride, int mb_height, int b8_stride,
+                     int mb_stride, int mb_width, int mb_height, int b8_stride,
                      ptrdiff_t *linesize, ptrdiff_t *uvlinesize)
 {
     int i, ret;
 
+    if (pic->qscale_table_buf)
+        if (   pic->alloc_mb_width  != mb_width
+            || pic->alloc_mb_height != mb_height)
+            ff_free_picture_tables(pic);
+
     if (shared) {
-        assert(pic->f->data[0]);
+        av_assert0(pic->f->data[0]);
         pic->shared = 1;
     } else {
-        assert(!pic->f->buf[0]);
+        av_assert0(!pic->f->buf[0]);
         if (alloc_frame_buffer(avctx, pic, me, sc,
                                chroma_x_shift, chroma_y_shift,
                                *linesize, *uvlinesize) < 0)
@@ -231,8 +256,8 @@ int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
     }
 
     if (!pic->qscale_table_buf)
-        ret = alloc_picture_tables(pic, encoding, out_format,
-                                   mb_stride, mb_height, b8_stride);
+        ret = alloc_picture_tables(avctx, pic, encoding, out_format,
+                                   mb_stride, mb_width, mb_height, b8_stride);
     else
         ret = make_tables_writable(pic);
     if (ret < 0)
@@ -268,6 +293,8 @@ fail:
  */
 void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *pic)
 {
+    int off = offsetof(Picture, mb_mean) + sizeof(pic->mb_mean);
+
     pic->tf.f = pic->f;
     /* WM Image / Screen codecs allocate internal buffers with different
      * dimensions / colorspaces; ignore user-defined callbacks for these. */
@@ -282,6 +309,8 @@ void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *pic)
 
     if (pic->needs_realloc)
         ff_free_picture_tables(pic);
+
+    memset((uint8_t*)pic + off, 0, sizeof(*pic) - off);
 }
 
 int ff_update_picture_tables(Picture *dst, Picture *src)
@@ -323,6 +352,9 @@ do {                                                                          \
         dst->ref_index[i]  = src->ref_index[i];
     }
 
+    dst->alloc_mb_width  = src->alloc_mb_width;
+    dst->alloc_mb_height = src->alloc_mb_height;
+
     return 0;
 }
 
@@ -345,8 +377,10 @@ int ff_mpeg_ref_picture(AVCodecContext *avctx, Picture *dst, Picture *src)
 
     if (src->hwaccel_picture_private) {
         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
-        if (!dst->hwaccel_priv_buf)
+        if (!dst->hwaccel_priv_buf) {
+            ret = AVERROR(ENOMEM);
             goto fail;
+        }
         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
     }
 
@@ -376,7 +410,7 @@ static inline int pic_is_unused(Picture *pic)
     return 0;
 }
 
-static int find_unused_picture(Picture *picture, int shared)
+static int find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared)
 {
     int i;
 
@@ -392,12 +426,26 @@ static int find_unused_picture(Picture *picture, int shared)
         }
     }
 
-    return AVERROR_INVALIDDATA;
+    av_log(avctx, AV_LOG_FATAL,
+           "Internal error, picture buffer overflow\n");
+    /* We could return -1, but the codec would crash trying to draw into a
+     * non-existing frame anyway. This is safer than waiting for a random crash.
+     * Also the return of this is never useful, an encoder must only allocate
+     * as much as allowed in the specification. This has no relationship to how
+     * much libavcodec could allocate (and MAX_PICTURE_COUNT is always large
+     * enough for such valid streams).
+     * Plus, a decoder has to check stream validity and remove frames if too
+     * many reference frames are around. Waiting for "OOM" is not correct at
+     * all. Similarly, missing reference frames have to be replaced by
+     * interpolated/MC frames, anything else is a bug in the codec ...
+     */
+    abort();
+    return -1;
 }
 
 int ff_find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared)
 {
-    int ret = find_unused_picture(picture, shared);
+    int ret = find_unused_picture(avctx, picture, shared);
 
     if (ret >= 0 && ret < MAX_PICTURE_COUNT) {
         if (picture[ret].needs_realloc) {
@@ -413,6 +461,9 @@ void ff_free_picture_tables(Picture *pic)
 {
     int i;
 
+    pic->alloc_mb_width  =
+    pic->alloc_mb_height = 0;
+
     av_buffer_unref(&pic->mb_var_buf);
     av_buffer_unref(&pic->mc_mb_var_buf);
     av_buffer_unref(&pic->mb_mean_buf);
diff --git a/libavcodec/mpegpicture.h b/libavcodec/mpegpicture.h
index 115c288..2db3d67 100644
--- a/libavcodec/mpegpicture.h
+++ b/libavcodec/mpegpicture.h
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "motion_est.h"
 #include "thread.h"
 
-#define MAX_PICTURE_COUNT 32
+#define MAX_PICTURE_COUNT 36
 #define EDGE_WIDTH 16
 
 typedef struct ScratchpadContext {
@@ -67,6 +67,9 @@ typedef struct Picture {
     AVBufferRef *mc_mb_var_buf;
     uint16_t *mc_mb_var;        ///< Table for motion compensated MB variances
 
+    int alloc_mb_width;         ///< mb_width used to allocate tables
+    int alloc_mb_height;        ///< mb_height used to allocate tables
+
     AVBufferRef *mb_mean_buf;
     uint8_t *mb_mean;           ///< Table for MB luminance
 
@@ -75,16 +78,16 @@ typedef struct Picture {
 
     int field_picture;          ///< whether or not the picture was encoded in separate fields
 
-    int mb_var_sum;             ///< sum of MB variance for current frame
-    int mc_mb_var_sum;          ///< motion compensated MB variance for current frame
+    int64_t mb_var_sum;         ///< sum of MB variance for current frame
+    int64_t mc_mb_var_sum;      ///< motion compensated MB variance for current frame
 
-    int b_frame_score;          /* */
+    int b_frame_score;
     int needs_realloc;          ///< Picture needs to be reallocated (eg due to a frame size change)
 
     int reference;
     int shared;
 
-    uint64_t encoding_error[4];
+    uint64_t encoding_error[AV_NUM_DATA_POINTERS];
 } Picture;
 
 /**
@@ -94,7 +97,7 @@ typedef struct Picture {
 int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
                      ScratchpadContext *sc, int shared, int encoding,
                      int chroma_x_shift, int chroma_y_shift, int out_format,
-                     int mb_stride, int mb_height, int b8_stride,
+                     int mb_stride, int mb_width, int mb_height, int b8_stride,
                      ptrdiff_t *linesize, ptrdiff_t *uvlinesize);
 
 int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me,
diff --git a/libavcodec/mpegutils.c b/libavcodec/mpegutils.c
index bc430f0..3f94540 100644
--- a/libavcodec/mpegutils.c
+++ b/libavcodec/mpegutils.c
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,10 +23,31 @@
 #include "libavutil/common.h"
 #include "libavutil/frame.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/motion_vector.h"
+#include "libavutil/avassert.h"
 
 #include "avcodec.h"
 #include "mpegutils.h"
 
+static int add_mb(AVMotionVector *mb, uint32_t mb_type,
+                  int dst_x, int dst_y,
+                  int motion_x, int motion_y, int motion_scale,
+                  int direction)
+{
+    mb->w = IS_8X8(mb_type) || IS_8X16(mb_type) ? 8 : 16;
+    mb->h = IS_8X8(mb_type) || IS_16X8(mb_type) ? 8 : 16;
+    mb->motion_x = motion_x;
+    mb->motion_y = motion_y;
+    mb->motion_scale = motion_scale;
+    mb->dst_x = dst_x;
+    mb->dst_y = dst_y;
+    mb->src_x = dst_x + motion_x / motion_scale;
+    mb->src_y = dst_y + motion_y / motion_scale;
+    mb->source = direction ? 1 : -1;
+    mb->flags = 0; // XXX: does mb_type contain extra information that could be exported here?
+    return 1;
+}
+
 void ff_draw_horiz_band(AVCodecContext *avctx,
                         AVFrame *cur, AVFrame *last,
                         int y, int h, int picture_structure,
@@ -78,3 +99,295 @@ void ff_draw_horiz_band(AVCodecContext *avctx,
                                y, picture_structure, h);
     }
 }
+
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+                         uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+                         int *low_delay,
+                         int mb_width, int mb_height, int mb_stride, int quarter_sample)
+{
+    if ((avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) && mbtype_table && motion_val[0]) {
+        const int shift = 1 + quarter_sample;
+        const int scale = 1 << shift;
+        const int mv_sample_log2 = avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_SVQ3 ? 2 : 1;
+        const int mv_stride      = (mb_width << mv_sample_log2) +
+                                   (avctx->codec->id == AV_CODEC_ID_H264 ? 0 : 1);
+        int mb_x, mb_y, mbcount = 0;
+
+        /* size is width * height * 2 * 4 where 2 is for directions and 4 is
+         * for the maximum number of MB (4 MB in case of IS_8x8) */
+        AVMotionVector *mvs = av_malloc_array(mb_width * mb_height, 2 * 4 * sizeof(AVMotionVector));
+        if (!mvs)
+            return;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            for (mb_x = 0; mb_x < mb_width; mb_x++) {
+                int i, direction, mb_type = mbtype_table[mb_x + mb_y * mb_stride];
+                for (direction = 0; direction < 2; direction++) {
+                    if (!USES_LIST(mb_type, direction))
+                        continue;
+                    if (IS_8X8(mb_type)) {
+                        for (i = 0; i < 4; i++) {
+                            int sx = mb_x * 16 + 4 + 8 * (i & 1);
+                            int sy = mb_y * 16 + 4 + 8 * (i >> 1);
+                            int xy = (mb_x * 2 + (i & 1) +
+                                      (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0];
+                            int my = motion_val[direction][xy][1];
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                        }
+                    } else if (IS_16X8(mb_type)) {
+                        for (i = 0; i < 2; i++) {
+                            int sx = mb_x * 16 + 8;
+                            int sy = mb_y * 16 + 4 + 8 * i;
+                            int xy = (mb_x * 2 + (mb_y * 2 + i) * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0];
+                            int my = motion_val[direction][xy][1];
+
+                            if (IS_INTERLACED(mb_type))
+                                my *= 2;
+
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                        }
+                    } else if (IS_8X16(mb_type)) {
+                        for (i = 0; i < 2; i++) {
+                            int sx = mb_x * 16 + 4 + 8 * i;
+                            int sy = mb_y * 16 + 8;
+                            int xy = (mb_x * 2 + i + mb_y * 2 * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0];
+                            int my = motion_val[direction][xy][1];
+
+                            if (IS_INTERLACED(mb_type))
+                                my *= 2;
+
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                        }
+                    } else {
+                          int sx = mb_x * 16 + 8;
+                          int sy = mb_y * 16 + 8;
+                          int xy = (mb_x + mb_y * mv_stride) << mv_sample_log2;
+                          int mx = motion_val[direction][xy][0];
+                          int my = motion_val[direction][xy][1];
+                          mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                    }
+                }
+            }
+        }
+
+        if (mbcount) {
+            AVFrameSideData *sd;
+
+            av_log(avctx, AV_LOG_DEBUG, "Adding %d MVs info to frame %d\n", mbcount, avctx->frame_number);
+            sd = av_frame_new_side_data(pict, AV_FRAME_DATA_MOTION_VECTORS, mbcount * sizeof(AVMotionVector));
+            if (!sd) {
+                av_freep(&mvs);
+                return;
+            }
+            memcpy(sd->data, mvs, mbcount * sizeof(AVMotionVector));
+        }
+
+        av_freep(&mvs);
+    }
+
+    /* TODO: export all the following to make them accessible for users (and filters) */
+    if (avctx->hwaccel || !mbtype_table)
+        return;
+
+
+    if (avctx->debug & (FF_DEBUG_SKIP | FF_DEBUG_QP | FF_DEBUG_MB_TYPE)) {
+        int x,y;
+
+        av_log(avctx, AV_LOG_DEBUG, "New frame, type: %c\n",
+               av_get_picture_type_char(pict->pict_type));
+        for (y = 0; y < mb_height; y++) {
+            for (x = 0; x < mb_width; x++) {
+                if (avctx->debug & FF_DEBUG_SKIP) {
+                    int count = mbskip_table ? mbskip_table[x + y * mb_stride] : 0;
+                    if (count > 9)
+                        count = 9;
+                    av_log(avctx, AV_LOG_DEBUG, "%1d", count);
+                }
+                if (avctx->debug & FF_DEBUG_QP) {
+                    av_log(avctx, AV_LOG_DEBUG, "%2d",
+                           qscale_table[x + y * mb_stride]);
+                }
+                if (avctx->debug & FF_DEBUG_MB_TYPE) {
+                    int mb_type = mbtype_table[x + y * mb_stride];
+                    // Type & MV direction
+                    if (IS_PCM(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "P");
+                    else if (IS_INTRA(mb_type) && IS_ACPRED(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "A");
+                    else if (IS_INTRA4x4(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "i");
+                    else if (IS_INTRA16x16(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "I");
+                    else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "d");
+                    else if (IS_DIRECT(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "D");
+                    else if (IS_GMC(mb_type) && IS_SKIP(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "g");
+                    else if (IS_GMC(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "G");
+                    else if (IS_SKIP(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "S");
+                    else if (!USES_LIST(mb_type, 1))
+                        av_log(avctx, AV_LOG_DEBUG, ">");
+                    else if (!USES_LIST(mb_type, 0))
+                        av_log(avctx, AV_LOG_DEBUG, "<");
+                    else {
+                        av_assert2(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
+                        av_log(avctx, AV_LOG_DEBUG, "X");
+                    }
+
+                    // segmentation
+                    if (IS_8X8(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "+");
+                    else if (IS_16X8(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "-");
+                    else if (IS_8X16(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "|");
+                    else if (IS_INTRA(mb_type) || IS_16X16(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, " ");
+                    else
+                        av_log(avctx, AV_LOG_DEBUG, "?");
+
+
+                    if (IS_INTERLACED(mb_type))
+                        av_log(avctx, AV_LOG_DEBUG, "=");
+                    else
+                        av_log(avctx, AV_LOG_DEBUG, " ");
+                }
+            }
+            av_log(avctx, AV_LOG_DEBUG, "\n");
+        }
+    }
+
+#if FF_API_DEBUG_MV
+    if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) ||
+        (avctx->debug_mv)) {
+        int mb_y;
+        int i, ret;
+        int h_chroma_shift, v_chroma_shift, block_height;
+        const int mv_sample_log2 = avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_SVQ3 ? 2 : 1;
+        const int mv_stride      = (mb_width << mv_sample_log2) +
+                                   (avctx->codec->id == AV_CODEC_ID_H264 ? 0 : 1);
+
+        if (low_delay)
+            *low_delay = 0; // needed to see the vectors without trashing the buffers
+
+        ret = av_pix_fmt_get_chroma_sub_sample (avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
+        if (ret)
+            return ret;
+
+        av_frame_make_writable(pict);
+
+        pict->opaque = NULL;
+        block_height = 16 >> v_chroma_shift;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            int mb_x;
+            for (mb_x = 0; mb_x < mb_width; mb_x++) {
+                const int mb_index = mb_x + mb_y * mb_stride;
+                if ((avctx->debug & FF_DEBUG_VIS_QP)) {
+                    uint64_t c = (qscale_table[mb_index] * 128 / 31) *
+                                 0x0101010101010101ULL;
+                    int y;
+                    for (y = 0; y < block_height; y++) {
+                        *(uint64_t *)(pict->data[1] + 8 * mb_x +
+                                      (block_height * mb_y + y) *
+                                      pict->linesize[1]) = c;
+                        *(uint64_t *)(pict->data[2] + 8 * mb_x +
+                                      (block_height * mb_y + y) *
+                                      pict->linesize[2]) = c;
+                    }
+                }
+                if ((avctx->debug & FF_DEBUG_VIS_MB_TYPE) &&
+                    motion_val[0]) {
+                    int mb_type = mbtype_table[mb_index];
+                    uint64_t u,v;
+                    int y;
+#define COLOR(theta, r) \
+    u = (int)(128 + r * cos(theta * M_PI / 180)); \
+    v = (int)(128 + r * sin(theta * M_PI / 180));
+
+
+                    u = v = 128;
+                    if (IS_PCM(mb_type)) {
+                        COLOR(120, 48)
+                    } else if ((IS_INTRA(mb_type) && IS_ACPRED(mb_type)) ||
+                               IS_INTRA16x16(mb_type)) {
+                        COLOR(30, 48)
+                    } else if (IS_INTRA4x4(mb_type)) {
+                        COLOR(90, 48)
+                    } else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type)) {
+                        // COLOR(120, 48)
+                    } else if (IS_DIRECT(mb_type)) {
+                        COLOR(150, 48)
+                    } else if (IS_GMC(mb_type) && IS_SKIP(mb_type)) {
+                        COLOR(170, 48)
+                    } else if (IS_GMC(mb_type)) {
+                        COLOR(190, 48)
+                    } else if (IS_SKIP(mb_type)) {
+                        // COLOR(180, 48)
+                    } else if (!USES_LIST(mb_type, 1)) {
+                        COLOR(240, 48)
+                    } else if (!USES_LIST(mb_type, 0)) {
+                        COLOR(0, 48)
+                    } else {
+                        av_assert2(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
+                        COLOR(300,48)
+                    }
+
+                    u *= 0x0101010101010101ULL;
+                    v *= 0x0101010101010101ULL;
+                    for (y = 0; y < block_height; y++) {
+                        *(uint64_t *)(pict->data[1] + 8 * mb_x +
+                                      (block_height * mb_y + y) * pict->linesize[1]) = u;
+                        *(uint64_t *)(pict->data[2] + 8 * mb_x +
+                                      (block_height * mb_y + y) * pict->linesize[2]) = v;
+                    }
+
+                    // segmentation
+                    if (IS_8X8(mb_type) || IS_16X8(mb_type)) {
+                        *(uint64_t *)(pict->data[0] + 16 * mb_x + 0 +
+                                      (16 * mb_y + 8) * pict->linesize[0]) ^= 0x8080808080808080ULL;
+                        *(uint64_t *)(pict->data[0] + 16 * mb_x + 8 +
+                                      (16 * mb_y + 8) * pict->linesize[0]) ^= 0x8080808080808080ULL;
+                    }
+                    if (IS_8X8(mb_type) || IS_8X16(mb_type)) {
+                        for (y = 0; y < 16; y++)
+                            pict->data[0][16 * mb_x + 8 + (16 * mb_y + y) *
+                                          pict->linesize[0]] ^= 0x80;
+                    }
+                    if (IS_8X8(mb_type) && mv_sample_log2 >= 2) {
+                        int dm = 1 << (mv_sample_log2 - 2);
+                        for (i = 0; i < 4; i++) {
+                            int sx = mb_x * 16 + 8 * (i & 1);
+                            int sy = mb_y * 16 + 8 * (i >> 1);
+                            int xy = (mb_x * 2 + (i & 1) +
+                                     (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                            // FIXME bidir
+                            int32_t *mv = (int32_t *) &motion_val[0][xy];
+                            if (mv[0] != mv[dm] ||
+                                mv[dm * mv_stride] != mv[dm * (mv_stride + 1)])
+                                for (y = 0; y < 8; y++)
+                                    pict->data[0][sx + 4 + (sy + y) * pict->linesize[0]] ^= 0x80;
+                            if (mv[0] != mv[dm * mv_stride] || mv[dm] != mv[dm * (mv_stride + 1)])
+                                *(uint64_t *)(pict->data[0] + sx + (sy + 4) *
+                                              pict->linesize[0]) ^= 0x8080808080808080ULL;
+                        }
+                    }
+
+                    if (IS_INTERLACED(mb_type) &&
+                        avctx->codec->id == AV_CODEC_ID_H264) {
+                        // hmm
+                    }
+                }
+                if (mbskip_table)
+                    mbskip_table[mb_index] = 0;
+            }
+        }
+    }
+#endif
+}
diff --git a/libavcodec/mpegutils.h b/libavcodec/mpegutils.h
index 960999c..1ed21c1 100644
--- a/libavcodec/mpegutils.h
+++ b/libavcodec/mpegutils.h
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -137,4 +137,12 @@ void ff_draw_horiz_band(AVCodecContext *avctx, AVFrame *cur, AVFrame *last,
                         int y, int h, int picture_structure, int first_field,
                         int low_delay);
 
+/**
+ * Print debugging info for the given picture.
+ */
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+                         uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+                         int *low_delay,
+                         int mb_width, int mb_height, int mb_stride, int quarter_sample);
+
 #endif /* AVCODEC_MPEGUTILS_H */
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index ca3cd2e..dbb6ab9 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -5,20 +5,20 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,9 +31,11 @@
 #include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
+#include "libavutil/motion_vector.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "h264chroma.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -56,10 +58,7 @@ static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
 
     nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
     /* XXX: only MPEG-1 */
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
@@ -115,13 +114,13 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
         int j= s->intra_scantable.permutated[i];
@@ -129,10 +128,10 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
         if (level) {
             if (level < 0) {
                 level = -level;
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
                 level = -level;
             } else {
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
             }
             block[j] = level;
         }
@@ -146,13 +145,14 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int sum=-1;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
+    sum += block[0];
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
         int j= s->intra_scantable.permutated[i];
@@ -160,10 +160,10 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
         if (level) {
             if (level < 0) {
                 level = -level;
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
                 level = -level;
             } else {
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
             }
             block[j] = level;
             sum+=level;
@@ -179,6 +179,9 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int sum=-1;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
@@ -190,11 +193,11 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
             if (level < 0) {
                 level = -level;
                 level = (((level << 1) + 1) * qscale *
-                         ((int) (quant_matrix[j]))) >> 4;
+                         ((int) (quant_matrix[j]))) >> 5;
                 level = -level;
             } else {
                 level = (((level << 1) + 1) * qscale *
-                         ((int) (quant_matrix[j]))) >> 4;
+                         ((int) (quant_matrix[j]))) >> 5;
             }
             block[j] = level;
             sum+=level;
@@ -209,15 +212,12 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     qmul = qscale << 1;
 
     if (!s->h263_aic) {
-        if (n < 4)
-            block[0] = block[0] * s->y_dc_scale;
-        else
-            block[0] = block[0] * s->c_dc_scale;
+        block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
         qadd = (qscale - 1) | 1;
     }else{
         qadd = 0;
@@ -225,7 +225,7 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
     if(s->ac_pred)
         nCoeffs=63;
     else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
     for(i=1; i<=nCoeffs; i++) {
         level = block[i];
@@ -246,7 +246,7 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
@@ -266,14 +266,41 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     }
 }
 
+
+static void gray16(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
+{
+    while(h--)
+        memset(dst + h*linesize, 128, 16);
+}
+
+static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
+{
+    while(h--)
+        memset(dst + h*linesize, 128, 8);
+}
+
 /* init common dct for both encoder and decoder */
 static av_cold int dct_init(MpegEncContext *s)
 {
-    ff_blockdsp_init(&s->bdsp);
+    ff_blockdsp_init(&s->bdsp, s->avctx);
+    ff_h264chroma_init(&s->h264chroma, 8); //for lowres
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
     ff_mpegvideodsp_init(&s->mdsp);
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
 
+    if (s->avctx->debug & FF_DEBUG_NOMC) {
+        int i;
+        for (i=0; i<4; i++) {
+            s->hdsp.avg_pixels_tab[0][i] = gray16;
+            s->hdsp.put_pixels_tab[0][i] = gray16;
+            s->hdsp.put_no_rnd_pixels_tab[0][i] = gray16;
+
+            s->hdsp.avg_pixels_tab[1][i] = gray8;
+            s->hdsp.put_pixels_tab[1][i] = gray8;
+            s->hdsp.put_no_rnd_pixels_tab[1][i] = gray8;
+        }
+    }
+
     s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_c;
     s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_c;
     s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_c;
@@ -286,18 +313,24 @@ static av_cold int dct_init(MpegEncContext *s)
     if (HAVE_INTRINSICS_NEON)
         ff_mpv_common_init_neon(s);
 
+    if (ARCH_ALPHA)
+        ff_mpv_common_init_axp(s);
     if (ARCH_ARM)
         ff_mpv_common_init_arm(s);
     if (ARCH_PPC)
         ff_mpv_common_init_ppc(s);
     if (ARCH_X86)
         ff_mpv_common_init_x86(s);
+    if (ARCH_MIPS)
+        ff_mpv_common_init_mips(s);
 
     return 0;
 }
 
 av_cold void ff_mpv_idct_init(MpegEncContext *s)
 {
+    if (s->codec_id == AV_CODEC_ID_MPEG4)
+        s->idsp.mpeg4_studio_profile = s->studio_profile;
     ff_idctdsp_init(&s->idsp, s->avctx);
 
     /* load & permutate scantables
@@ -318,7 +351,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared)
 {
     return ff_alloc_picture(s->avctx, pic, &s->me, &s->sc, shared, 0,
                             s->chroma_x_shift, s->chroma_y_shift, s->out_format,
-                            s->mb_stride, s->mb_height, s->b8_stride,
+                            s->mb_stride, s->mb_width, s->mb_height, s->b8_stride,
                             &s->linesize, &s->uvlinesize);
 }
 
@@ -329,6 +362,9 @@ static int init_duplicate_context(MpegEncContext *s)
     int yc_size = y_size + 2 * c_size;
     int i;
 
+    if (s->mb_height & 1)
+        yc_size += 2*s->b8_stride + 2*s->mb_stride;
+
     s->sc.edge_emu_buffer =
     s->me.scratchpad   =
     s->me.temp         =
@@ -352,12 +388,14 @@ static int init_duplicate_context(MpegEncContext *s)
     for (i = 0; i < 12; i++) {
         s->pblocks[i] = &s->block[i];
     }
+
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->block32, sizeof(*s->block32), fail)
+    s->dpcm_direction = 0;
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->dpcm_macroblock, sizeof(*s->dpcm_macroblock), fail)
+
     if (s->avctx->codec_tag == AV_RL32("VCR2")) {
         // exchange uv
-        int16_t (*tmp)[64];
-        tmp           = s->pblocks[4];
-        s->pblocks[4] = s->pblocks[5];
-        s->pblocks[5] = tmp;
+        FFSWAP(void *, s->pblocks[4], s->pblocks[5]);
     }
 
     if (s->out_format == FMT_H263) {
@@ -390,6 +428,8 @@ static void free_duplicate_context(MpegEncContext *s)
     av_freep(&s->me.map);
     av_freep(&s->me.score_map);
     av_freep(&s->blocks);
+    av_freep(&s->block32);
+    av_freep(&s->dpcm_macroblock);
     av_freep(&s->ac_val_base);
     s->block = NULL;
 }
@@ -407,6 +447,9 @@ static void backup_duplicate_context(MpegEncContext *bak, MpegEncContext *src)
     COPY(me.score_map);
     COPY(blocks);
     COPY(block);
+    COPY(block32);
+    COPY(dpcm_macroblock);
+    COPY(dpcm_direction);
     COPY(start_mb_y);
     COPY(end_mb_y);
     COPY(me.map_generation);
@@ -435,10 +478,7 @@ int ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src)
     }
     if (dst->avctx->codec_tag == AV_RL32("VCR2")) {
         // exchange uv
-        int16_t (*tmp)[64];
-        tmp             = dst->pblocks[4];
-        dst->pblocks[4] = dst->pblocks[5];
-        dst->pblocks[5] = tmp;
+        FFSWAP(void *, dst->pblocks[4], dst->pblocks[5]);
     }
     if (!dst->sc.edge_emu_buffer &&
         (ret = ff_mpeg_framesize_alloc(dst->avctx, &dst->me,
@@ -458,9 +498,11 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
     int i, ret;
     MpegEncContext *s = dst->priv_data, *s1 = src->priv_data;
 
-    if (dst == src || !s1->context_initialized)
+    if (dst == src)
         return 0;
 
+    av_assert0(s != s1);
+
     // FIXME can parameters change on I-frames?
     // in that case dst may need a reinit
     if (!s->context_initialized) {
@@ -471,18 +513,24 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
         s->bitstream_buffer      = NULL;
         s->bitstream_buffer_size = s->allocated_bitstream_buffer_size = 0;
 
-        ff_mpv_idct_init(s);
-        if ((err = ff_mpv_common_init(s)) < 0)
-            return err;
+        if (s1->context_initialized){
+//             s->picture_range_start  += MAX_PICTURE_COUNT;
+//             s->picture_range_end    += MAX_PICTURE_COUNT;
+            ff_mpv_idct_init(s);
+            if((err = ff_mpv_common_init(s)) < 0){
+                memset(s, 0, sizeof(MpegEncContext));
+                s->avctx = dst;
+                return err;
+            }
+        }
     }
 
     if (s->height != s1->height || s->width != s1->width || s->context_reinit) {
-        int err;
         s->context_reinit = 0;
         s->height = s1->height;
         s->width  = s1->width;
-        if ((err = ff_mpv_common_frame_size_change(s)) < 0)
-            return err;
+        if ((ret = ff_mpv_common_frame_size_change(s)) < 0)
+            return ret;
     }
 
     s->avctx->coded_height  = s1->avctx->coded_height;
@@ -490,12 +538,16 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
     s->avctx->width         = s1->avctx->width;
     s->avctx->height        = s1->avctx->height;
 
+    s->quarter_sample       = s1->quarter_sample;
+
     s->coded_picture_number = s1->coded_picture_number;
     s->picture_number       = s1->picture_number;
 
+    av_assert0(!s->picture || s->picture != s1->picture);
+    if(s->picture)
     for (i = 0; i < MAX_PICTURE_COUNT; i++) {
         ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
-        if (s1->picture[i].f->buf[0] &&
+        if (s1->picture && s1->picture[i].f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->picture[i], &s1->picture[i])) < 0)
             return ret;
     }
@@ -503,7 +555,7 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
 #define UPDATE_PICTURE(pic)\
 do {\
     ff_mpeg_unref_picture(s->avctx, &s->pic);\
-    if (s1->pic.f->buf[0])\
+    if (s1->pic.f && s1->pic.f->buf[0])\
         ret = ff_mpeg_ref_picture(s->avctx, &s->pic, &s1->pic);\
     else\
         ret = ff_update_picture_tables(&s->pic, &s1->pic);\
@@ -527,6 +579,7 @@ do {\
     // Error/bug resilience
     s->next_p_frame_damaged = s1->next_p_frame_damaged;
     s->workaround_bugs      = s1->workaround_bugs;
+    s->padding_bug_score    = s1->padding_bug_score;
 
     // MPEG-4 timing info
     memcpy(&s->last_time_base, &s1->last_time_base,
@@ -543,10 +596,15 @@ do {\
 
     if (s1->bitstream_buffer) {
         if (s1->bitstream_buffer_size +
-            AV_INPUT_BUFFER_PADDING_SIZE > s->allocated_bitstream_buffer_size)
+            AV_INPUT_BUFFER_PADDING_SIZE > s->allocated_bitstream_buffer_size) {
             av_fast_malloc(&s->bitstream_buffer,
                            &s->allocated_bitstream_buffer_size,
                            s1->allocated_bitstream_buffer_size);
+            if (!s->bitstream_buffer) {
+                s->bitstream_buffer_size = 0;
+                return AVERROR(ENOMEM);
+            }
+        }
         s->bitstream_buffer_size = s1->bitstream_buffer_size;
         memcpy(s->bitstream_buffer, s1->bitstream_buffer,
                s1->bitstream_buffer_size);
@@ -566,7 +624,6 @@ do {\
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "Context scratch buffers could not "
                    "be allocated due to unknown size.\n");
-            return AVERROR_BUG;
         }
 
     // MPEG-2/interlacing info
@@ -616,6 +673,18 @@ void ff_mpv_decode_defaults(MpegEncContext *s)
     ff_mpv_common_defaults(s);
 }
 
+void ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx)
+{
+    s->avctx           = avctx;
+    s->width           = avctx->coded_width;
+    s->height          = avctx->coded_height;
+    s->codec_id        = avctx->codec->id;
+    s->workaround_bugs = avctx->workaround_bugs;
+
+    /* convert fourcc to upper case */
+    s->codec_tag          = avpriv_toupper4(avctx->codec_tag);
+}
+
 /**
  * Initialize and allocates MpegEncContext fields dependent on the resolution.
  */
@@ -647,44 +716,36 @@ static int init_context_frame(MpegEncContext *s)
     c_size  = s->mb_stride * (s->mb_height + 1);
     yc_size = y_size + 2   * c_size;
 
+    if (s->mb_height & 1)
+        yc_size += 2*s->b8_stride + 2*s->mb_stride;
+
     FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_index2xy, (s->mb_num + 1) * sizeof(int),
                       fail); // error resilience code looks cleaner with this
     for (y = 0; y < s->mb_height; y++)
         for (x = 0; x < s->mb_width; x++)
             s->mb_index2xy[x + y * s->mb_width] = x + y * s->mb_stride;
 
-    s->mb_index2xy[s->mb_height * s->mb_width] =
-        (s->mb_height - 1) * s->mb_stride + s->mb_width; // FIXME really needed?
+    s->mb_index2xy[s->mb_height * s->mb_width] = (s->mb_height - 1) * s->mb_stride + s->mb_width; // FIXME really needed?
 
     if (s->encoding) {
         /* Allocate MV tables */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->p_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_forw_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_back_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_forw_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_back_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_direct_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->p_mv_table_base,                 mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_forw_mv_table_base,            mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_back_mv_table_base,            mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_forw_mv_table_base,      mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_back_mv_table_base,      mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_direct_mv_table_base,          mv_table_size * 2 * sizeof(int16_t), fail)
         s->p_mv_table            = s->p_mv_table_base + s->mb_stride + 1;
         s->b_forw_mv_table       = s->b_forw_mv_table_base + s->mb_stride + 1;
         s->b_back_mv_table       = s->b_back_mv_table_base + s->mb_stride + 1;
-        s->b_bidir_forw_mv_table = s->b_bidir_forw_mv_table_base +
-                                   s->mb_stride + 1;
-        s->b_bidir_back_mv_table = s->b_bidir_back_mv_table_base +
-                                   s->mb_stride + 1;
+        s->b_bidir_forw_mv_table = s->b_bidir_forw_mv_table_base + s->mb_stride + 1;
+        s->b_bidir_back_mv_table = s->b_bidir_back_mv_table_base + s->mb_stride + 1;
         s->b_direct_mv_table     = s->b_direct_mv_table_base + s->mb_stride + 1;
 
         /* Allocate MB type table */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_type, mb_array_size *
-                          sizeof(uint16_t), fail); // needed for encoding
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_type, mb_array_size * sizeof(uint16_t), fail) // needed for encoding
 
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->lambda_table, mb_array_size *
-                          sizeof(int), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->lambda_table, mb_array_size * sizeof(int), fail)
 
         FF_ALLOC_OR_GOTO(s->avctx, s->cplx_tab,
                          mb_array_size * sizeof(float), fail);
@@ -707,34 +768,27 @@ static int init_context_frame(MpegEncContext *s)
                     s->b_field_mv_table[i][j][k] = s->b_field_mv_table_base[i][j][k] +
                                                    s->mb_stride + 1;
                 }
-                FF_ALLOCZ_OR_GOTO(s->avctx, s->b_field_select_table [i][j],
-                                  mb_array_size * 2 * sizeof(uint8_t), fail);
-                FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_mv_table_base[i][j],
-                                  mv_table_size * 2 * sizeof(int16_t), fail);
-                s->p_field_mv_table[i][j] = s->p_field_mv_table_base[i][j]
-                                            + s->mb_stride + 1;
+                FF_ALLOCZ_OR_GOTO(s->avctx, s->b_field_select_table [i][j], mb_array_size * 2 * sizeof(uint8_t), fail)
+                FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_mv_table_base[i][j], mv_table_size * 2 * sizeof(int16_t), fail)
+                s->p_field_mv_table[i][j] = s->p_field_mv_table_base[i][j] + s->mb_stride + 1;
             }
-            FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_select_table[i],
-                              mb_array_size * 2 * sizeof(uint8_t), fail);
+            FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_select_table[i], mb_array_size * 2 * sizeof(uint8_t), fail)
         }
     }
     if (s->out_format == FMT_H263) {
         /* cbp values */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->coded_block_base, y_size, fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->coded_block_base, y_size + (s->mb_height&1)*2*s->b8_stride, fail);
         s->coded_block = s->coded_block_base + s->b8_stride + 1;
 
         /* cbp, ac_pred, pred_dir */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->cbp_table,
-                          mb_array_size * sizeof(uint8_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->pred_dir_table,
-                          mb_array_size * sizeof(uint8_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->cbp_table     , mb_array_size * sizeof(uint8_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->pred_dir_table, mb_array_size * sizeof(uint8_t), fail);
     }
 
     if (s->h263_pred || s->h263_plus || !s->encoding) {
         /* dc values */
-        // MN: we need these for  error resilience of intra-frames
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->dc_val_base,
-                          yc_size * sizeof(int16_t), fail);
+        // MN: we need these for error resilience of intra-frames
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->dc_val_base, yc_size * sizeof(int16_t), fail);
         s->dc_val[0] = s->dc_val_base + s->b8_stride + 1;
         s->dc_val[1] = s->dc_val_base + y_size + s->mb_stride + 1;
         s->dc_val[2] = s->dc_val[1] + c_size;
@@ -742,30 +796,110 @@ static int init_context_frame(MpegEncContext *s)
             s->dc_val_base[i] = 1024;
     }
 
-    /* which mb is a intra block */
+    /* which mb is an intra block */
     FF_ALLOCZ_OR_GOTO(s->avctx, s->mbintra_table, mb_array_size, fail);
     memset(s->mbintra_table, 1, mb_array_size);
 
     /* init macroblock skip table */
     FF_ALLOCZ_OR_GOTO(s->avctx, s->mbskip_table, mb_array_size + 2, fail);
-    // Note the + 1 is for  a quicker MPEG-4 slice_end detection
+    // Note the + 1 is for a quicker MPEG-4 slice_end detection
 
     return ff_mpeg_er_init(s);
 fail:
     return AVERROR(ENOMEM);
 }
 
+static void clear_context(MpegEncContext *s)
+{
+    int i, j, k;
+
+    memset(&s->next_picture, 0, sizeof(s->next_picture));
+    memset(&s->last_picture, 0, sizeof(s->last_picture));
+    memset(&s->current_picture, 0, sizeof(s->current_picture));
+    memset(&s->new_picture, 0, sizeof(s->new_picture));
+
+    memset(s->thread_context, 0, sizeof(s->thread_context));
+
+    s->me.map = NULL;
+    s->me.score_map = NULL;
+    s->dct_error_sum = NULL;
+    s->block = NULL;
+    s->blocks = NULL;
+    s->block32 = NULL;
+    memset(s->pblocks, 0, sizeof(s->pblocks));
+    s->dpcm_direction = 0;
+    s->dpcm_macroblock = NULL;
+    s->ac_val_base = NULL;
+    s->ac_val[0] =
+    s->ac_val[1] =
+    s->ac_val[2] =NULL;
+    s->sc.edge_emu_buffer = NULL;
+    s->me.scratchpad = NULL;
+    s->me.temp =
+    s->sc.rd_scratchpad =
+    s->sc.b_scratchpad =
+    s->sc.obmc_scratchpad = NULL;
+
+
+    s->bitstream_buffer = NULL;
+    s->allocated_bitstream_buffer_size = 0;
+    s->picture          = NULL;
+    s->mb_type          = NULL;
+    s->p_mv_table_base  = NULL;
+    s->b_forw_mv_table_base = NULL;
+    s->b_back_mv_table_base = NULL;
+    s->b_bidir_forw_mv_table_base = NULL;
+    s->b_bidir_back_mv_table_base = NULL;
+    s->b_direct_mv_table_base = NULL;
+    s->p_mv_table            = NULL;
+    s->b_forw_mv_table       = NULL;
+    s->b_back_mv_table       = NULL;
+    s->b_bidir_forw_mv_table = NULL;
+    s->b_bidir_back_mv_table = NULL;
+    s->b_direct_mv_table     = NULL;
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 2; j++) {
+            for (k = 0; k < 2; k++) {
+                s->b_field_mv_table_base[i][j][k] = NULL;
+                s->b_field_mv_table[i][j][k] = NULL;
+            }
+            s->b_field_select_table[i][j] = NULL;
+            s->p_field_mv_table_base[i][j] = NULL;
+            s->p_field_mv_table[i][j] = NULL;
+        }
+        s->p_field_select_table[i] = NULL;
+    }
+
+    s->dc_val_base = NULL;
+    s->coded_block_base = NULL;
+    s->mbintra_table = NULL;
+    s->cbp_table = NULL;
+    s->pred_dir_table = NULL;
+
+    s->mbskip_table = NULL;
+
+    s->er.error_status_table = NULL;
+    s->er.er_temp_buffer = NULL;
+    s->mb_index2xy = NULL;
+    s->lambda_table = NULL;
+
+    s->cplx_tab = NULL;
+    s->bits_tab = NULL;
+}
+
 /**
  * init common structure for both encoder and decoder.
  * this assumes that some variables like width/height are already set
  */
 av_cold int ff_mpv_common_init(MpegEncContext *s)
 {
-    int i;
+    int i, ret;
     int nb_slices = (HAVE_THREADS &&
                      s->avctx->active_thread_type & FF_THREAD_SLICE) ?
                     s->avctx->thread_count : 1;
 
+    clear_context(s);
+
     if (s->encoding && s->avctx->slices)
         nb_slices = s->avctx->slices;
 
@@ -798,12 +932,11 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
     dct_init(s);
 
     /* set chroma shifts */
-    av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
-                                     &s->chroma_x_shift,
-                                     &s->chroma_y_shift);
-
-    /* convert fourcc to upper case */
-    s->codec_tag          = avpriv_toupper4(s->avctx->codec_tag);
+    ret = av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
+                                           &s->chroma_x_shift,
+                                           &s->chroma_y_shift);
+    if (ret)
+        return ret;
 
     FF_ALLOCZ_OR_GOTO(s->avctx, s->picture,
                       MAX_PICTURE_COUNT * sizeof(Picture), fail);
@@ -812,10 +945,6 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
         if (!s->picture[i].f)
             goto fail;
     }
-    memset(&s->next_picture, 0, sizeof(s->next_picture));
-    memset(&s->last_picture, 0, sizeof(s->last_picture));
-    memset(&s->current_picture, 0, sizeof(s->current_picture));
-    memset(&s->new_picture, 0, sizeof(s->new_picture));
     s->next_picture.f = av_frame_alloc();
     if (!s->next_picture.f)
         goto fail;
@@ -829,39 +958,38 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
     if (!s->new_picture.f)
         goto fail;
 
-    if (s->width && s->height) {
-        if (init_context_frame(s))
-            goto fail;
+    if (init_context_frame(s))
+        goto fail;
 
-        s->parse_context.state = -1;
-    }
+    s->parse_context.state = -1;
 
     s->context_initialized = 1;
+    memset(s->thread_context, 0, sizeof(s->thread_context));
     s->thread_context[0]   = s;
 
-    if (s->width && s->height) {
-        if (nb_slices > 1) {
-            for (i = 1; i < nb_slices; i++) {
-                s->thread_context[i] = av_malloc(sizeof(MpegEncContext));
-                memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
-            }
-
-            for (i = 0; i < nb_slices; i++) {
-                if (init_duplicate_context(s->thread_context[i]) < 0)
+//     if (s->width && s->height) {
+    if (nb_slices > 1) {
+        for (i = 0; i < nb_slices; i++) {
+            if (i) {
+                s->thread_context[i] = av_memdup(s, sizeof(MpegEncContext));
+                if (!s->thread_context[i])
                     goto fail;
-                s->thread_context[i]->start_mb_y =
-                    (s->mb_height * (i) + nb_slices / 2) / nb_slices;
-                s->thread_context[i]->end_mb_y   =
-                    (s->mb_height * (i + 1) + nb_slices / 2) / nb_slices;
             }
-        } else {
-            if (init_duplicate_context(s) < 0)
+            if (init_duplicate_context(s->thread_context[i]) < 0)
                 goto fail;
-            s->start_mb_y = 0;
-            s->end_mb_y   = s->mb_height;
+            s->thread_context[i]->start_mb_y =
+                (s->mb_height * (i) + nb_slices / 2) / nb_slices;
+            s->thread_context[i]->end_mb_y   =
+                (s->mb_height * (i + 1) + nb_slices / 2) / nb_slices;
         }
-        s->slice_context_count = nb_slices;
+    } else {
+        if (init_duplicate_context(s) < 0)
+            goto fail;
+        s->start_mb_y = 0;
+        s->end_mb_y   = s->mb_height;
     }
+    s->slice_context_count = nb_slices;
+//     }
 
     return 0;
  fail:
@@ -916,6 +1044,7 @@ static void free_context_frame(MpegEncContext *s)
     av_freep(&s->er.er_temp_buffer);
     av_freep(&s->mb_index2xy);
     av_freep(&s->lambda_table);
+
     av_freep(&s->cplx_tab);
     av_freep(&s->bits_tab);
 
@@ -926,6 +1055,9 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
 {
     int i, err = 0;
 
+    if (!s->context_initialized)
+        return AVERROR(EINVAL);
+
     if (s->slice_context_count > 1) {
         for (i = 0; i < s->slice_context_count; i++) {
             free_duplicate_context(s->thread_context[i]);
@@ -960,17 +1092,20 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
     if ((err = init_context_frame(s)))
         goto fail;
 
+    memset(s->thread_context, 0, sizeof(s->thread_context));
     s->thread_context[0]   = s;
 
     if (s->width && s->height) {
         int nb_slices = s->slice_context_count;
         if (nb_slices > 1) {
-            for (i = 1; i < nb_slices; i++) {
-                s->thread_context[i] = av_malloc(sizeof(MpegEncContext));
-                memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
-            }
-
             for (i = 0; i < nb_slices; i++) {
+                if (i) {
+                    s->thread_context[i] = av_memdup(s, sizeof(MpegEncContext));
+                    if (!s->thread_context[i]) {
+                        err = AVERROR(ENOMEM);
+                        goto fail;
+                    }
+                }
                 if ((err = init_duplicate_context(s->thread_context[i])) < 0)
                     goto fail;
                 s->thread_context[i]->start_mb_y =
@@ -979,7 +1114,8 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
                     (s->mb_height * (i + 1) + nb_slices / 2) / nb_slices;
             }
         } else {
-            if (init_duplicate_context(s) < 0)
+            err = init_duplicate_context(s);
+            if (err < 0)
                 goto fail;
             s->start_mb_y = 0;
             s->end_mb_y   = s->mb_height;
@@ -998,6 +1134,9 @@ void ff_mpv_common_end(MpegEncContext *s)
 {
     int i;
 
+    if (!s)
+        return ;
+
     if (s->slice_context_count > 1) {
         for (i = 0; i < s->slice_context_count; i++) {
             free_duplicate_context(s->thread_context[i]);
@@ -1044,6 +1183,23 @@ void ff_mpv_common_end(MpegEncContext *s)
     s->linesize = s->uvlinesize = 0;
 }
 
+
+static void gray_frame(AVFrame *frame)
+{
+    int i, h_chroma_shift, v_chroma_shift;
+
+    av_pix_fmt_get_chroma_sub_sample(frame->format, &h_chroma_shift, &v_chroma_shift);
+
+    for(i=0; i<frame->height; i++)
+        memset(frame->data[0] + frame->linesize[0]*i, 0x80, frame->width);
+    for(i=0; i<AV_CEIL_RSHIFT(frame->height, v_chroma_shift); i++) {
+        memset(frame->data[1] + frame->linesize[1]*i,
+               0x80, AV_CEIL_RSHIFT(frame->width, h_chroma_shift));
+        memset(frame->data[2] + frame->linesize[2]*i,
+               0x80, AV_CEIL_RSHIFT(frame->width, h_chroma_shift));
+    }
+}
+
 /**
  * generic function called after decoding
  * the header and before a frame is decoded.
@@ -1054,6 +1210,11 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     Picture *pic;
     s->mb_skipped = 0;
 
+    if (!ff_thread_can_start_frame(avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Attempt to start a frame outside SETUP state\n");
+        return -1;
+    }
+
     /* mark & release old frames */
     if (s->pict_type != AV_PICTURE_TYPE_B && s->last_picture_ptr &&
         s->last_picture_ptr != s->next_picture_ptr &&
@@ -1072,6 +1233,8 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     }
 
     ff_mpeg_unref_picture(s->avctx, &s->current_picture);
+    ff_mpeg_unref_picture(s->avctx, &s->last_picture);
+    ff_mpeg_unref_picture(s->avctx, &s->next_picture);
 
     /* release non reference frames */
     for (i = 0; i < MAX_PICTURE_COUNT; i++) {
@@ -1080,7 +1243,7 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     }
 
     if (s->current_picture_ptr && !s->current_picture_ptr->f->buf[0]) {
-        // we already have a unused image
+        // we already have an unused image
         // (maybe it was set before reading the header)
         pic = s->current_picture_ptr;
     } else {
@@ -1138,17 +1301,16 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
             s->pict_type, s->droppable);
 
     if ((!s->last_picture_ptr || !s->last_picture_ptr->f->buf[0]) &&
-        (s->pict_type != AV_PICTURE_TYPE_I ||
-         s->picture_structure != PICT_FRAME)) {
+        (s->pict_type != AV_PICTURE_TYPE_I)) {
         int h_chroma_shift, v_chroma_shift;
         av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
                                          &h_chroma_shift, &v_chroma_shift);
-        if (s->pict_type != AV_PICTURE_TYPE_I)
+        if (s->pict_type == AV_PICTURE_TYPE_B && s->next_picture_ptr && s->next_picture_ptr->f->buf[0])
+            av_log(avctx, AV_LOG_DEBUG,
+                   "allocating dummy last picture for B frame\n");
+        else if (s->pict_type != AV_PICTURE_TYPE_I)
             av_log(avctx, AV_LOG_ERROR,
                    "warning: first frame is no keyframe\n");
-        else if (s->picture_structure != PICT_FRAME)
-            av_log(avctx, AV_LOG_INFO,
-                   "allocate dummy last picture for field based first keyframe\n");
 
         /* Allocate a dummy frame */
         i = ff_find_unused_picture(s->avctx, s->picture, 0);
@@ -1159,21 +1321,32 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->last_picture_ptr = &s->picture[i];
 
         s->last_picture_ptr->reference   = 3;
-        s->last_picture_ptr->f->pict_type = AV_PICTURE_TYPE_I;
+        s->last_picture_ptr->f->key_frame = 0;
+        s->last_picture_ptr->f->pict_type = AV_PICTURE_TYPE_P;
 
         if (alloc_picture(s, s->last_picture_ptr, 0) < 0) {
             s->last_picture_ptr = NULL;
             return -1;
         }
 
-        memset(s->last_picture_ptr->f->data[0], 0,
-               avctx->height * s->last_picture_ptr->f->linesize[0]);
-        memset(s->last_picture_ptr->f->data[1], 0x80,
-               (avctx->height >> v_chroma_shift) *
-               s->last_picture_ptr->f->linesize[1]);
-        memset(s->last_picture_ptr->f->data[2], 0x80,
-               (avctx->height >> v_chroma_shift) *
-               s->last_picture_ptr->f->linesize[2]);
+        if (!avctx->hwaccel) {
+            for(i=0; i<avctx->height; i++)
+                memset(s->last_picture_ptr->f->data[0] + s->last_picture_ptr->f->linesize[0]*i,
+                       0x80, avctx->width);
+            if (s->last_picture_ptr->f->data[2]) {
+                for(i=0; i<AV_CEIL_RSHIFT(avctx->height, v_chroma_shift); i++) {
+                    memset(s->last_picture_ptr->f->data[1] + s->last_picture_ptr->f->linesize[1]*i,
+                        0x80, AV_CEIL_RSHIFT(avctx->width, h_chroma_shift));
+                    memset(s->last_picture_ptr->f->data[2] + s->last_picture_ptr->f->linesize[2]*i,
+                        0x80, AV_CEIL_RSHIFT(avctx->width, h_chroma_shift));
+                }
+            }
+
+            if(s->codec_id == AV_CODEC_ID_FLV1 || s->codec_id == AV_CODEC_ID_H263){
+                for(i=0; i<avctx->height; i++)
+                memset(s->last_picture_ptr->f->data[0] + s->last_picture_ptr->f->linesize[0]*i, 16, avctx->width);
+            }
+        }
 
         ff_thread_report_progress(&s->last_picture_ptr->tf, INT_MAX, 0);
         ff_thread_report_progress(&s->last_picture_ptr->tf, INT_MAX, 1);
@@ -1189,7 +1362,8 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->next_picture_ptr = &s->picture[i];
 
         s->next_picture_ptr->reference   = 3;
-        s->next_picture_ptr->f->pict_type = AV_PICTURE_TYPE_I;
+        s->next_picture_ptr->f->key_frame = 0;
+        s->next_picture_ptr->f->pict_type = AV_PICTURE_TYPE_P;
 
         if (alloc_picture(s, s->next_picture_ptr, 0) < 0) {
             s->next_picture_ptr = NULL;
@@ -1199,27 +1373,25 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         ff_thread_report_progress(&s->next_picture_ptr->tf, INT_MAX, 1);
     }
 
+#if 0 // BUFREF-FIXME
+    memset(s->last_picture.f->data, 0, sizeof(s->last_picture.f->data));
+    memset(s->next_picture.f->data, 0, sizeof(s->next_picture.f->data));
+#endif
     if (s->last_picture_ptr) {
-        ff_mpeg_unref_picture(s->avctx, &s->last_picture);
         if (s->last_picture_ptr->f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->last_picture,
                                        s->last_picture_ptr)) < 0)
             return ret;
     }
     if (s->next_picture_ptr) {
-        ff_mpeg_unref_picture(s->avctx, &s->next_picture);
         if (s->next_picture_ptr->f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->next_picture,
                                        s->next_picture_ptr)) < 0)
             return ret;
     }
 
-    if (s->pict_type != AV_PICTURE_TYPE_I &&
-        !(s->last_picture_ptr && s->last_picture_ptr->f->buf[0])) {
-        av_log(s, AV_LOG_ERROR,
-               "Non-reference picture received and no reference available\n");
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(s->pict_type == AV_PICTURE_TYPE_I || (s->last_picture_ptr &&
+                                                 s->last_picture_ptr->f->buf[0]));
 
     if (s->picture_structure!= PICT_FRAME) {
         int i;
@@ -1248,6 +1420,10 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->dct_unquantize_inter = s->dct_unquantize_mpeg1_inter;
     }
 
+    if (s->avctx->debug & FF_DEBUG_NOMC) {
+        gray_frame(s->current_picture_ptr->f);
+    }
+
     return 0;
 }
 
@@ -1260,103 +1436,407 @@ void ff_mpv_frame_end(MpegEncContext *s)
         ff_thread_report_progress(&s->current_picture_ptr->tf, INT_MAX, 0);
 }
 
+void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict)
+{
+    ff_print_debug_info2(s->avctx, pict, s->mbskip_table, p->mb_type,
+                         p->qscale_table, p->motion_val, &s->low_delay,
+                         s->mb_width, s->mb_height, s->mb_stride, s->quarter_sample);
+}
+
+int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type)
+{
+    AVBufferRef *ref = av_buffer_ref(p->qscale_table_buf);
+    int offset = 2*s->mb_stride + 1;
+    if(!ref)
+        return AVERROR(ENOMEM);
+    av_assert0(ref->size >= offset + s->mb_stride * ((f->height+15)/16));
+    ref->size -= offset;
+    ref->data += offset;
+    return av_frame_set_qp_table(f, ref, s->mb_stride, qp_type);
+}
+
+static inline int hpel_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest, uint8_t *src,
+                                     int field_based, int field_select,
+                                     int src_x, int src_y,
+                                     int width, int height, ptrdiff_t stride,
+                                     int h_edge_pos, int v_edge_pos,
+                                     int w, int h, h264_chroma_mc_func *pix_op,
+                                     int motion_x, int motion_y)
+{
+    const int lowres   = s->avctx->lowres;
+    const int op_index = FFMIN(lowres, 3);
+    const int s_mask   = (2 << lowres) - 1;
+    int emu = 0;
+    int sx, sy;
+
+    if (s->quarter_sample) {
+        motion_x /= 2;
+        motion_y /= 2;
+    }
+
+    sx = motion_x & s_mask;
+    sy = motion_y & s_mask;
+    src_x += motion_x >> lowres + 1;
+    src_y += motion_y >> lowres + 1;
+
+    src   += src_y * stride + src_x;
+
+    if ((unsigned)src_x > FFMAX( h_edge_pos - (!!sx) - w,                 0) ||
+        (unsigned)src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
+                                 s->linesize, s->linesize,
+                                 w + 1, (h + 1) << field_based,
+                                 src_x, src_y   << field_based,
+                                 h_edge_pos, v_edge_pos);
+        src = s->sc.edge_emu_buffer;
+        emu = 1;
+    }
+
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    if (field_select)
+        src += s->linesize;
+    pix_op[op_index](dest, src, stride, h, sx, sy);
+    return emu;
+}
+
+/* apply one mpeg motion vector to the three components */
+static av_always_inline void mpeg_motion_lowres(MpegEncContext *s,
+                                                uint8_t *dest_y,
+                                                uint8_t *dest_cb,
+                                                uint8_t *dest_cr,
+                                                int field_based,
+                                                int bottom_field,
+                                                int field_select,
+                                                uint8_t **ref_picture,
+                                                h264_chroma_mc_func *pix_op,
+                                                int motion_x, int motion_y,
+                                                int h, int mb_y)
+{
+    uint8_t *ptr_y, *ptr_cb, *ptr_cr;
+    int mx, my, src_x, src_y, uvsrc_x, uvsrc_y, sx, sy, uvsx, uvsy;
+    ptrdiff_t uvlinesize, linesize;
+    const int lowres     = s->avctx->lowres;
+    const int op_index   = FFMIN(lowres-1+s->chroma_x_shift, 3);
+    const int block_s    = 8>>lowres;
+    const int s_mask     = (2 << lowres) - 1;
+    const int h_edge_pos = s->h_edge_pos >> lowres;
+    const int v_edge_pos = s->v_edge_pos >> lowres;
+    linesize   = s->current_picture.f->linesize[0] << field_based;
+    uvlinesize = s->current_picture.f->linesize[1] << field_based;
+
+    // FIXME obviously not perfect but qpel will not work in lowres anyway
+    if (s->quarter_sample) {
+        motion_x /= 2;
+        motion_y /= 2;
+    }
+
+    if(field_based){
+        motion_y += (bottom_field - field_select)*((1 << lowres)-1);
+    }
+
+    sx = motion_x & s_mask;
+    sy = motion_y & s_mask;
+    src_x = s->mb_x * 2 * block_s + (motion_x >> lowres + 1);
+    src_y = (mb_y * 2 * block_s >> field_based) + (motion_y >> lowres + 1);
+
+    if (s->out_format == FMT_H263) {
+        uvsx    = ((motion_x >> 1) & s_mask) | (sx & 1);
+        uvsy    = ((motion_y >> 1) & s_mask) | (sy & 1);
+        uvsrc_x = src_x >> 1;
+        uvsrc_y = src_y >> 1;
+    } else if (s->out_format == FMT_H261) {
+        // even chroma mv's are full pel in H261
+        mx      = motion_x / 4;
+        my      = motion_y / 4;
+        uvsx    = (2 * mx) & s_mask;
+        uvsy    = (2 * my) & s_mask;
+        uvsrc_x = s->mb_x * block_s + (mx >> lowres);
+        uvsrc_y =    mb_y * block_s + (my >> lowres);
+    } else {
+        if(s->chroma_y_shift){
+            mx      = motion_x / 2;
+            my      = motion_y / 2;
+            uvsx    = mx & s_mask;
+            uvsy    = my & s_mask;
+            uvsrc_x = s->mb_x * block_s                 + (mx >> lowres + 1);
+            uvsrc_y =   (mb_y * block_s >> field_based) + (my >> lowres + 1);
+        } else {
+            if(s->chroma_x_shift){
+            //Chroma422
+                mx = motion_x / 2;
+                uvsx = mx & s_mask;
+                uvsy = motion_y & s_mask;
+                uvsrc_y = src_y;
+                uvsrc_x = s->mb_x*block_s               + (mx >> (lowres+1));
+            } else {
+            //Chroma444
+                uvsx = motion_x & s_mask;
+                uvsy = motion_y & s_mask;
+                uvsrc_x = src_x;
+                uvsrc_y = src_y;
+            }
+        }
+    }
+
+    ptr_y  = ref_picture[0] + src_y   * linesize   + src_x;
+    ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
+    ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
+
+    if ((unsigned) src_x > FFMAX( h_edge_pos - (!!sx) - 2 * block_s,       0) || uvsrc_y<0 ||
+        (unsigned) src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 linesize >> field_based, linesize >> field_based,
+                                 17, 17 + field_based,
+                                src_x, src_y << field_based, h_edge_pos,
+                                v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer;
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize;
+            uint8_t *vbuf =ubuf + 10 * s->uvlinesize;
+            if (s->workaround_bugs & FF_BUG_IEDGE)
+                vbuf -= s->uvlinesize;
+            s->vdsp.emulated_edge_mc(ubuf,  ptr_cb,
+                                     uvlinesize >> field_based, uvlinesize >> field_based,
+                                     9, 9 + field_based,
+                                    uvsrc_x, uvsrc_y << field_based,
+                                    h_edge_pos >> 1, v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf,  ptr_cr,
+                                     uvlinesize >> field_based,uvlinesize >> field_based,
+                                     9, 9 + field_based,
+                                    uvsrc_x, uvsrc_y << field_based,
+                                    h_edge_pos >> 1, v_edge_pos >> 1);
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
+        }
+    }
+
+    // FIXME use this for field pix too instead of the obnoxious hack which changes picture.f->data
+    if (bottom_field) {
+        dest_y  += s->linesize;
+        dest_cb += s->uvlinesize;
+        dest_cr += s->uvlinesize;
+    }
+
+    if (field_select) {
+        ptr_y   += s->linesize;
+        ptr_cb  += s->uvlinesize;
+        ptr_cr  += s->uvlinesize;
+    }
+
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    pix_op[lowres - 1](dest_y, ptr_y, linesize, h, sx, sy);
+
+    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        int hc = s->chroma_y_shift ? (h+1-bottom_field)>>1 : h;
+        uvsx = (uvsx << 2) >> lowres;
+        uvsy = (uvsy << 2) >> lowres;
+        if (hc) {
+            pix_op[op_index](dest_cb, ptr_cb, uvlinesize, hc, uvsx, uvsy);
+            pix_op[op_index](dest_cr, ptr_cr, uvlinesize, hc, uvsx, uvsy);
+        }
+    }
+    // FIXME h261 lowres loop filter
+}
+
+static inline void chroma_4mv_motion_lowres(MpegEncContext *s,
+                                            uint8_t *dest_cb, uint8_t *dest_cr,
+                                            uint8_t **ref_picture,
+                                            h264_chroma_mc_func * pix_op,
+                                            int mx, int my)
+{
+    const int lowres     = s->avctx->lowres;
+    const int op_index   = FFMIN(lowres, 3);
+    const int block_s    = 8 >> lowres;
+    const int s_mask     = (2 << lowres) - 1;
+    const int h_edge_pos = s->h_edge_pos >> lowres + 1;
+    const int v_edge_pos = s->v_edge_pos >> lowres + 1;
+    int emu = 0, src_x, src_y, sx, sy;
+    ptrdiff_t offset;
+    uint8_t *ptr;
+
+    if (s->quarter_sample) {
+        mx /= 2;
+        my /= 2;
+    }
+
+    /* In case of 8X8, we construct a single chroma motion vector
+       with a special rounding */
+    mx = ff_h263_round_chroma(mx);
+    my = ff_h263_round_chroma(my);
+
+    sx = mx & s_mask;
+    sy = my & s_mask;
+    src_x = s->mb_x * block_s + (mx >> lowres + 1);
+    src_y = s->mb_y * block_s + (my >> lowres + 1);
+
+    offset = src_y * s->uvlinesize + src_x;
+    ptr = ref_picture[1] + offset;
+    if ((unsigned) src_x > FFMAX(h_edge_pos - (!!sx) - block_s, 0) ||
+        (unsigned) src_y > FFMAX(v_edge_pos - (!!sy) - block_s, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
+                                 s->uvlinesize, s->uvlinesize,
+                                 9, 9,
+                                 src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr = s->sc.edge_emu_buffer;
+        emu = 1;
+    }
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    pix_op[op_index](dest_cb, ptr, s->uvlinesize, block_s, sx, sy);
+
+    ptr = ref_picture[2] + offset;
+    if (emu) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
+                                 s->uvlinesize, s->uvlinesize,
+                                 9, 9,
+                                 src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr = s->sc.edge_emu_buffer;
+    }
+    pix_op[op_index](dest_cr, ptr, s->uvlinesize, block_s, sx, sy);
+}
+
 /**
- * Print debugging info for the given picture.
+ * motion compensation of a single macroblock
+ * @param s context
+ * @param dest_y luma destination pointer
+ * @param dest_cb chroma cb/u destination pointer
+ * @param dest_cr chroma cr/v destination pointer
+ * @param dir direction (0->forward, 1->backward)
+ * @param ref_picture array[3] of pointers to the 3 planes of the reference picture
+ * @param pix_op halfpel motion compensation function (average or put normally)
+ * the motion vectors are taken from s->mv and the MV type from s->mv_type
  */
-void ff_print_debug_info(MpegEncContext *s, Picture *p)
+static inline void MPV_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest_y, uint8_t *dest_cb,
+                                     uint8_t *dest_cr,
+                                     int dir, uint8_t **ref_picture,
+                                     h264_chroma_mc_func *pix_op)
 {
-    AVFrame *pict;
-    if (s->avctx->hwaccel || !p || !p->mb_type)
-        return;
-    pict = p->f;
+    int mx, my;
+    int mb_x, mb_y, i;
+    const int lowres  = s->avctx->lowres;
+    const int block_s = 8 >>lowres;
 
-    if (s->avctx->debug & (FF_DEBUG_SKIP | FF_DEBUG_QP | FF_DEBUG_MB_TYPE)) {
-        int x,y;
+    mb_x = s->mb_x;
+    mb_y = s->mb_y;
 
-        av_log(s->avctx,AV_LOG_DEBUG,"New frame, type: ");
-        switch (pict->pict_type) {
-        case AV_PICTURE_TYPE_I:
-            av_log(s->avctx,AV_LOG_DEBUG,"I\n");
-            break;
-        case AV_PICTURE_TYPE_P:
-            av_log(s->avctx,AV_LOG_DEBUG,"P\n");
-            break;
-        case AV_PICTURE_TYPE_B:
-            av_log(s->avctx,AV_LOG_DEBUG,"B\n");
-            break;
-        case AV_PICTURE_TYPE_S:
-            av_log(s->avctx,AV_LOG_DEBUG,"S\n");
-            break;
-        case AV_PICTURE_TYPE_SI:
-            av_log(s->avctx,AV_LOG_DEBUG,"SI\n");
-            break;
-        case AV_PICTURE_TYPE_SP:
-            av_log(s->avctx,AV_LOG_DEBUG,"SP\n");
-            break;
+    switch (s->mv_type) {
+    case MV_TYPE_16X16:
+        mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                           0, 0, 0,
+                           ref_picture, pix_op,
+                           s->mv[dir][0][0], s->mv[dir][0][1],
+                           2 * block_s, mb_y);
+        break;
+    case MV_TYPE_8X8:
+        mx = 0;
+        my = 0;
+        for (i = 0; i < 4; i++) {
+            hpel_motion_lowres(s, dest_y + ((i & 1) + (i >> 1) *
+                               s->linesize) * block_s,
+                               ref_picture[0], 0, 0,
+                               (2 * mb_x + (i & 1)) * block_s,
+                               (2 * mb_y + (i >> 1)) * block_s,
+                               s->width, s->height, s->linesize,
+                               s->h_edge_pos >> lowres, s->v_edge_pos >> lowres,
+                               block_s, block_s, pix_op,
+                               s->mv[dir][i][0], s->mv[dir][i][1]);
+
+            mx += s->mv[dir][i][0];
+            my += s->mv[dir][i][1];
         }
-        for (y = 0; y < s->mb_height; y++) {
-            for (x = 0; x < s->mb_width; x++) {
-                if (s->avctx->debug & FF_DEBUG_SKIP) {
-                    int count = s->mbskip_table[x + y * s->mb_stride];
-                    if (count > 9)
-                        count = 9;
-                    av_log(s->avctx, AV_LOG_DEBUG, "%1d", count);
-                }
-                if (s->avctx->debug & FF_DEBUG_QP) {
-                    av_log(s->avctx, AV_LOG_DEBUG, "%2d",
-                           p->qscale_table[x + y * s->mb_stride]);
-                }
-                if (s->avctx->debug & FF_DEBUG_MB_TYPE) {
-                    int mb_type = p->mb_type[x + y * s->mb_stride];
-                    // Type & MV direction
-                    if (IS_PCM(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "P");
-                    else if (IS_INTRA(mb_type) && IS_ACPRED(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "A");
-                    else if (IS_INTRA4x4(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "i");
-                    else if (IS_INTRA16x16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "I");
-                    else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "d");
-                    else if (IS_DIRECT(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "D");
-                    else if (IS_GMC(mb_type) && IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "g");
-                    else if (IS_GMC(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "G");
-                    else if (IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "S");
-                    else if (!USES_LIST(mb_type, 1))
-                        av_log(s->avctx, AV_LOG_DEBUG, ">");
-                    else if (!USES_LIST(mb_type, 0))
-                        av_log(s->avctx, AV_LOG_DEBUG, "<");
-                    else {
-                        assert(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
-                        av_log(s->avctx, AV_LOG_DEBUG, "X");
-                    }
 
-                    // segmentation
-                    if (IS_8X8(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "+");
-                    else if (IS_16X8(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "-");
-                    else if (IS_8X16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "|");
-                    else if (IS_INTRA(mb_type) || IS_16X16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, " ");
-                    else
-                        av_log(s->avctx, AV_LOG_DEBUG, "?");
-
-
-                    if (IS_INTERLACED(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "=");
-                    else
-                        av_log(s->avctx, AV_LOG_DEBUG, " ");
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            chroma_4mv_motion_lowres(s, dest_cb, dest_cr, ref_picture,
+                                     pix_op, mx, my);
+        break;
+    case MV_TYPE_FIELD:
+        if (s->picture_structure == PICT_FRAME) {
+            /* top field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               1, 0, s->field_select[dir][0],
+                               ref_picture, pix_op,
+                               s->mv[dir][0][0], s->mv[dir][0][1],
+                               block_s, mb_y);
+            /* bottom field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               1, 1, s->field_select[dir][1],
+                               ref_picture, pix_op,
+                               s->mv[dir][1][0], s->mv[dir][1][1],
+                               block_s, mb_y);
+        } else {
+            if (s->picture_structure != s->field_select[dir][0] + 1 &&
+                s->pict_type != AV_PICTURE_TYPE_B && !s->first_field) {
+                ref_picture = s->current_picture_ptr->f->data;
+
+            }
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               0, 0, s->field_select[dir][0],
+                               ref_picture, pix_op,
+                               s->mv[dir][0][0],
+                               s->mv[dir][0][1], 2 * block_s, mb_y >> 1);
+            }
+        break;
+    case MV_TYPE_16X8:
+        for (i = 0; i < 2; i++) {
+            uint8_t **ref2picture;
+
+            if (s->picture_structure == s->field_select[dir][i] + 1 ||
+                s->pict_type == AV_PICTURE_TYPE_B || s->first_field) {
+                ref2picture = ref_picture;
+            } else {
+                ref2picture = s->current_picture_ptr->f->data;
+            }
+
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               0, 0, s->field_select[dir][i],
+                               ref2picture, pix_op,
+                               s->mv[dir][i][0], s->mv[dir][i][1] +
+                               2 * block_s * i, block_s, mb_y >> 1);
+
+            dest_y  +=  2 * block_s *  s->linesize;
+            dest_cb += (2 * block_s >> s->chroma_y_shift) * s->uvlinesize;
+            dest_cr += (2 * block_s >> s->chroma_y_shift) * s->uvlinesize;
+        }
+        break;
+    case MV_TYPE_DMV:
+        if (s->picture_structure == PICT_FRAME) {
+            for (i = 0; i < 2; i++) {
+                int j;
+                for (j = 0; j < 2; j++) {
+                    mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                       1, j, j ^ i,
+                                       ref_picture, pix_op,
+                                       s->mv[dir][2 * i + j][0],
+                                       s->mv[dir][2 * i + j][1],
+                                       block_s, mb_y);
+                }
+                pix_op = s->h264chroma.avg_h264_chroma_pixels_tab;
+            }
+        } else {
+            for (i = 0; i < 2; i++) {
+                mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                   0, 0, s->picture_structure != i + 1,
+                                   ref_picture, pix_op,
+                                   s->mv[dir][2 * i][0],s->mv[dir][2 * i][1],
+                                   2 * block_s, mb_y >> 1);
+
+                // after put we make avg of the same block
+                pix_op = s->h264chroma.avg_h264_chroma_pixels_tab;
+
+                // opposite parity is always in the same
+                // frame if this is second field
+                if (!s->first_field) {
+                    ref_picture = s->current_picture_ptr->f->data;
                 }
             }
-            av_log(s->avctx, AV_LOG_DEBUG, "\n");
         }
+        break;
+    default:
+        av_assert2(0);
     }
 }
 
@@ -1386,14 +1866,14 @@ static int lowest_referenced_row(MpegEncContext *s, int dir)
     }
 
     for (i = 0; i < mvs; i++) {
-        my = s->mv[dir][i][1]<<qpel_shift;
+        my = s->mv[dir][i][1];
         my_max = FFMAX(my_max, my);
         my_min = FFMIN(my_min, my);
     }
 
-    off = (FFMAX(-my_min, my_max) + 63) >> 6;
+    off = ((FFMAX(-my_min, my_max)<<qpel_shift) + 63) >> 6;
 
-    return FFMIN(FFMAX(s->mb_y + off, 0), s->mb_height-1);
+    return av_clip(s->mb_y + off, 0, s->mb_height - 1);
 unhandled:
     return s->mb_height-1;
 }
@@ -1469,11 +1949,17 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
    s->interlaced_dct : true if interlaced dct used (mpeg2)
  */
 static av_always_inline
-void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
-                            int is_mpeg12)
+void mpv_reconstruct_mb_internal(MpegEncContext *s, int16_t block[12][64],
+                            int lowres_flag, int is_mpeg12)
 {
     const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
 
+    if (CONFIG_XVMC &&
+        s->avctx->hwaccel && s->avctx->hwaccel->decode_mb) {
+        s->avctx->hwaccel->decode_mb(s);//xvmc uses pblocks
+        return;
+    }
+
     if(s->avctx->debug&FF_DEBUG_DCT_COEFF) {
        /* print DCT coefficients */
        int i,j;
@@ -1503,7 +1989,7 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
     else if (!is_mpeg12 && (s->h263_pred || s->h263_aic))
         s->mbintra_table[mb_xy]=1;
 
-    if ((s->avctx->flags & AV_CODEC_FLAG_PSNR) ||
+    if ((s->avctx->flags & AV_CODEC_FLAG_PSNR) || s->frame_skip_threshold || s->frame_skip_factor ||
         !(s->encoding && (s->intra_only || s->pict_type == AV_PICTURE_TYPE_B) &&
           s->avctx->mb_decision != FF_MB_DECISION_RD)) { // FIXME precalc
         uint8_t *dest_y, *dest_cb, *dest_cr;
@@ -1512,8 +1998,8 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
         qpel_mc_func (*op_qpix)[16];
         const int linesize   = s->current_picture.f->linesize[0]; //not s->linesize as this would be wrong for field pics
         const int uvlinesize = s->current_picture.f->linesize[1];
-        const int readable= s->pict_type != AV_PICTURE_TYPE_B || s->encoding || s->avctx->draw_horiz_band;
-        const int block_size = 8;
+        const int readable= s->pict_type != AV_PICTURE_TYPE_B || s->encoding || s->avctx->draw_horiz_band || lowres_flag;
+        const int block_size= lowres_flag ? 8>>s->avctx->lowres : 8;
 
         /* avoid copy if macroblock skipped in last frame too */
         /* skip only during decoding as we might trash the buffers during encoding a bit */
@@ -1522,7 +2008,7 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
 
             if (s->mb_skipped) {
                 s->mb_skipped= 0;
-                assert(s->pict_type!=AV_PICTURE_TYPE_I);
+                av_assert2(s->pict_type!=AV_PICTURE_TYPE_I);
                 *mbskip_ptr = 1;
             } else if(!s->current_picture.reference) {
                 *mbskip_ptr = 1;
@@ -1562,19 +2048,31 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
                     }
                 }
 
-                op_qpix= s->me.qpel_put;
-                if ((!s->no_rounding) || s->pict_type==AV_PICTURE_TYPE_B){
-                    op_pix = s->hdsp.put_pixels_tab;
+                if(lowres_flag){
+                    h264_chroma_mc_func *op_pix = s->h264chroma.put_h264_chroma_pixels_tab;
+
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix);
+                        op_pix = s->h264chroma.avg_h264_chroma_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix);
+                    }
                 }else{
-                    op_pix = s->hdsp.put_no_rnd_pixels_tab;
-                }
-                if (s->mv_dir & MV_DIR_FORWARD) {
-                    ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix, op_qpix);
-                    op_pix = s->hdsp.avg_pixels_tab;
-                    op_qpix= s->me.qpel_avg;
-                }
-                if (s->mv_dir & MV_DIR_BACKWARD) {
-                    ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix, op_qpix);
+                    op_qpix = s->me.qpel_put;
+                    if ((!s->no_rounding) || s->pict_type==AV_PICTURE_TYPE_B){
+                        op_pix = s->hdsp.put_pixels_tab;
+                    }else{
+                        op_pix = s->hdsp.put_no_rnd_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix, op_qpix);
+                        op_pix = s->hdsp.avg_pixels_tab;
+                        op_qpix= s->me.qpel_avg;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix, op_qpix);
+                    }
                 }
             }
 
@@ -1620,17 +2118,17 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
                     }else{
                         //chroma422
                         dct_linesize = uvlinesize << s->interlaced_dct;
-                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
+                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
 
                         add_dct(s, block[4], 4, dest_cb, dct_linesize);
                         add_dct(s, block[5], 5, dest_cr, dct_linesize);
                         add_dct(s, block[6], 6, dest_cb+dct_offset, dct_linesize);
                         add_dct(s, block[7], 7, dest_cr+dct_offset, dct_linesize);
                         if(!s->chroma_x_shift){//Chroma444
-                            add_dct(s, block[8], 8, dest_cb+8, dct_linesize);
-                            add_dct(s, block[9], 9, dest_cr+8, dct_linesize);
-                            add_dct(s, block[10], 10, dest_cb+8+dct_offset, dct_linesize);
-                            add_dct(s, block[11], 11, dest_cr+8+dct_offset, dct_linesize);
+                            add_dct(s, block[8], 8, dest_cb+block_size, dct_linesize);
+                            add_dct(s, block[9], 9, dest_cr+block_size, dct_linesize);
+                            add_dct(s, block[10], 10, dest_cb+block_size+dct_offset, dct_linesize);
+                            add_dct(s, block[11], 11, dest_cr+block_size+dct_offset, dct_linesize);
                         }
                     }
                 }//fi gray
@@ -1639,8 +2137,63 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
                 ff_wmv2_add_mb(s, block, dest_y, dest_cb, dest_cr);
             }
         } else {
+            /* Only MPEG-4 Simple Studio Profile is supported in > 8-bit mode.
+               TODO: Integrate 10-bit properly into mpegvideo.c so that ER works properly */
+            if (s->avctx->bits_per_raw_sample > 8){
+                const int act_block_size = block_size * 2;
+
+                if(s->dpcm_direction == 0) {
+                    s->idsp.idct_put(dest_y,                           dct_linesize, (int16_t*)(*s->block32)[0]);
+                    s->idsp.idct_put(dest_y              + act_block_size, dct_linesize, (int16_t*)(*s->block32)[1]);
+                    s->idsp.idct_put(dest_y + dct_offset,              dct_linesize, (int16_t*)(*s->block32)[2]);
+                    s->idsp.idct_put(dest_y + dct_offset + act_block_size, dct_linesize, (int16_t*)(*s->block32)[3]);
+
+                    dct_linesize = uvlinesize << s->interlaced_dct;
+                    dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
+
+                    s->idsp.idct_put(dest_cb,              dct_linesize, (int16_t*)(*s->block32)[4]);
+                    s->idsp.idct_put(dest_cr,              dct_linesize, (int16_t*)(*s->block32)[5]);
+                    s->idsp.idct_put(dest_cb + dct_offset, dct_linesize, (int16_t*)(*s->block32)[6]);
+                    s->idsp.idct_put(dest_cr + dct_offset, dct_linesize, (int16_t*)(*s->block32)[7]);
+                    if(!s->chroma_x_shift){//Chroma444
+                        s->idsp.idct_put(dest_cb + act_block_size,              dct_linesize, (int16_t*)(*s->block32)[8]);
+                        s->idsp.idct_put(dest_cr + act_block_size,              dct_linesize, (int16_t*)(*s->block32)[9]);
+                        s->idsp.idct_put(dest_cb + act_block_size + dct_offset, dct_linesize, (int16_t*)(*s->block32)[10]);
+                        s->idsp.idct_put(dest_cr + act_block_size + dct_offset, dct_linesize, (int16_t*)(*s->block32)[11]);
+                    }
+                } else if(s->dpcm_direction == 1) {
+                    int i, w, h;
+                    uint16_t *dest_pcm[3] = {(uint16_t*)dest_y, (uint16_t*)dest_cb, (uint16_t*)dest_cr};
+                    int linesize[3] = {dct_linesize, uvlinesize, uvlinesize};
+                    for(i = 0; i < 3; i++) {
+                        int idx = 0;
+                        int vsub = i ? s->chroma_y_shift : 0;
+                        int hsub = i ? s->chroma_x_shift : 0;
+                        for(h = 0; h < (16 >> vsub); h++){
+                            for(w = 0; w < (16 >> hsub); w++)
+                                dest_pcm[i][w] = (*s->dpcm_macroblock)[i][idx++];
+                            dest_pcm[i] += linesize[i] / 2;
+                        }
+                    }
+                } else if(s->dpcm_direction == -1) {
+                    int i, w, h;
+                    uint16_t *dest_pcm[3] = {(uint16_t*)dest_y, (uint16_t*)dest_cb, (uint16_t*)dest_cr};
+                    int linesize[3] = {dct_linesize, uvlinesize, uvlinesize};
+                    for(i = 0; i < 3; i++) {
+                        int idx = 0;
+                        int vsub = i ? s->chroma_y_shift : 0;
+                        int hsub = i ? s->chroma_x_shift : 0;
+                        dest_pcm[i] += (linesize[i] / 2) * ((16 >> vsub) - 1);
+                        for(h = (16 >> vsub)-1; h >= 1; h--){
+                            for(w = (16 >> hsub)-1; w >= 1; w--)
+                                dest_pcm[i][w] = (*s->dpcm_macroblock)[i][idx++];
+                            dest_pcm[i] -= linesize[i] / 2;
+                        }
+                    }
+                }
+            }
             /* dct only in intra block */
-            if(s->encoding || !(s->codec_id==AV_CODEC_ID_MPEG1VIDEO || s->codec_id==AV_CODEC_ID_MPEG2VIDEO)){
+            else if(s->encoding || !(s->codec_id==AV_CODEC_ID_MPEG1VIDEO || s->codec_id==AV_CODEC_ID_MPEG2VIDEO)){
                 put_dct(s, block[0], 0, dest_y                          , dct_linesize, s->qscale);
                 put_dct(s, block[1], 1, dest_y              + block_size, dct_linesize, s->qscale);
                 put_dct(s, block[2], 2, dest_y + dct_offset             , dct_linesize, s->qscale);
@@ -1672,17 +2225,17 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
                     }else{
 
                         dct_linesize = uvlinesize << s->interlaced_dct;
-                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
+                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
 
                         s->idsp.idct_put(dest_cb,              dct_linesize, block[4]);
                         s->idsp.idct_put(dest_cr,              dct_linesize, block[5]);
                         s->idsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]);
                         s->idsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]);
                         if(!s->chroma_x_shift){//Chroma444
-                            s->idsp.idct_put(dest_cb + 8,              dct_linesize, block[8]);
-                            s->idsp.idct_put(dest_cr + 8,              dct_linesize, block[9]);
-                            s->idsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]);
-                            s->idsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]);
+                            s->idsp.idct_put(dest_cb + block_size,              dct_linesize, block[8]);
+                            s->idsp.idct_put(dest_cr + block_size,              dct_linesize, block[9]);
+                            s->idsp.idct_put(dest_cb + block_size + dct_offset, dct_linesize, block[10]);
+                            s->idsp.idct_put(dest_cr + block_size + dct_offset, dct_linesize, block[11]);
                         }
                     }
                 }//gray
@@ -1691,33 +2244,38 @@ void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
 skip_idct:
         if(!readable){
             s->hdsp.put_pixels_tab[0][0](s->dest[0], dest_y ,   linesize,16);
-            s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
-            s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
+                s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
+            }
         }
     }
 }
 
-void ff_mpv_decode_mb(MpegEncContext *s, int16_t block[12][64])
+void ff_mpv_reconstruct_mb(MpegEncContext *s, int16_t block[12][64])
 {
 #if !CONFIG_SMALL
     if(s->out_format == FMT_MPEG1) {
-        mpv_decode_mb_internal(s, block, 1);
+        if(s->avctx->lowres) mpv_reconstruct_mb_internal(s, block, 1, 1);
+        else                 mpv_reconstruct_mb_internal(s, block, 0, 1);
     } else
 #endif
-        mpv_decode_mb_internal(s, block, 0);
+    if(s->avctx->lowres) mpv_reconstruct_mb_internal(s, block, 1, 0);
+    else                  mpv_reconstruct_mb_internal(s, block, 0, 0);
 }
 
 void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h)
 {
-    ff_draw_horiz_band(s->avctx, s->current_picture.f,
-                       s->last_picture.f, y, h, s->picture_structure,
+    ff_draw_horiz_band(s->avctx, s->current_picture_ptr->f,
+                       s->last_picture_ptr ? s->last_picture_ptr->f : NULL, y, h, s->picture_structure,
                        s->first_field, s->low_delay);
 }
 
 void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     const int linesize   = s->current_picture.f->linesize[0]; //not s->linesize as this would be wrong for field pics
     const int uvlinesize = s->current_picture.f->linesize[1];
-    const int mb_size= 4;
+    const int width_of_mb = (4 + (s->avctx->bits_per_raw_sample > 8)) - s->avctx->lowres;
+    const int height_of_mb = 4 - s->avctx->lowres;
 
     s->block_index[0]= s->b8_stride*(s->mb_y*2    ) - 2 + s->mb_x*2;
     s->block_index[1]= s->b8_stride*(s->mb_y*2    ) - 1 + s->mb_x*2;
@@ -1727,21 +2285,21 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
     //block_index is not used by mpeg2, so it is not affected by chroma_format
 
-    s->dest[0] = s->current_picture.f->data[0] + (s->mb_x - 1) * (1 << mb_size);
-    s->dest[1] = s->current_picture.f->data[1] + (s->mb_x - 1) * (1 << (mb_size - s->chroma_x_shift));
-    s->dest[2] = s->current_picture.f->data[2] + (s->mb_x - 1) * (1 << (mb_size - s->chroma_x_shift));
+    s->dest[0] = s->current_picture.f->data[0] + (int)((s->mb_x - 1U) <<  width_of_mb);
+    s->dest[1] = s->current_picture.f->data[1] + (int)((s->mb_x - 1U) << (width_of_mb - s->chroma_x_shift));
+    s->dest[2] = s->current_picture.f->data[2] + (int)((s->mb_x - 1U) << (width_of_mb - s->chroma_x_shift));
 
     if(!(s->pict_type==AV_PICTURE_TYPE_B && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME))
     {
         if(s->picture_structure==PICT_FRAME){
-        s->dest[0] += s->mb_y *   linesize << mb_size;
-        s->dest[1] += s->mb_y * uvlinesize << (mb_size - s->chroma_y_shift);
-        s->dest[2] += s->mb_y * uvlinesize << (mb_size - s->chroma_y_shift);
+        s->dest[0] += s->mb_y *   linesize << height_of_mb;
+        s->dest[1] += s->mb_y * uvlinesize << (height_of_mb - s->chroma_y_shift);
+        s->dest[2] += s->mb_y * uvlinesize << (height_of_mb - s->chroma_y_shift);
         }else{
-            s->dest[0] += (s->mb_y>>1) *   linesize << mb_size;
-            s->dest[1] += (s->mb_y>>1) * uvlinesize << (mb_size - s->chroma_y_shift);
-            s->dest[2] += (s->mb_y>>1) * uvlinesize << (mb_size - s->chroma_y_shift);
-            assert((s->mb_y&1) == (s->picture_structure == PICT_BOTTOM_FIELD));
+            s->dest[0] += (s->mb_y>>1) *   linesize << height_of_mb;
+            s->dest[1] += (s->mb_y>>1) * uvlinesize << (height_of_mb - s->chroma_y_shift);
+            s->dest[2] += (s->mb_y>>1) * uvlinesize << (height_of_mb - s->chroma_y_shift);
+            av_assert1((s->mb_y&1) == (s->picture_structure == PICT_BOTTOM_FIELD));
         }
     }
 }
@@ -1762,6 +2320,7 @@ void ff_mpeg_flush(AVCodecContext *avctx){
     ff_mpeg_unref_picture(s->avctx, &s->next_picture);
 
     s->mb_x= s->mb_y= 0;
+    s->closed_gop= 0;
 
     s->parse_context.state= -1;
     s->parse_context.frame_start_found= 0;
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index e7835e1..bbc6b56 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,11 +31,11 @@
 #include <float.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "blockdsp.h"
 #include "error_resilience.h"
 #include "fdctdsp.h"
 #include "get_bits.h"
+#include "h264chroma.h"
 #include "h263dsp.h"
 #include "hpeldsp.h"
 #include "idctdsp.h"
@@ -45,6 +45,7 @@
 #include "mpegpicture.h"
 #include "mpegvideodsp.h"
 #include "mpegvideoencdsp.h"
+#include "mpegvideodata.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
 #include "ratecontrol.h"
@@ -56,8 +57,9 @@
 #include "videodsp.h"
 
 #include "libavutil/opt.h"
+#include "libavutil/timecode.h"
 
-#define MAX_THREADS 16
+#define MAX_THREADS 32
 
 #define MAX_B_FRAMES 16
 
@@ -70,6 +72,8 @@
 #define SLICE_MAX_START_CODE    0x000001af
 #define EXT_START_CODE          0x000001b5
 #define USER_START_CODE         0x000001b2
+#define SLICE_START_CODE        0x000001b7
+
 
 /**
  * MpegEncContext.
@@ -96,7 +100,7 @@ typedef struct MpegEncContext {
     int width, height;///< picture size. must be a multiple of 16
     int gop_size;
     int intra_only;   ///< if true, only intra pictures are generated
-    int bit_rate;     ///< wanted bit rate
+    int64_t bit_rate; ///< wanted bit rate
     enum OutputFormat out_format; ///< output format
     int h263_pred;    ///< use MPEG-4/H.263 ac/dc predictions
     int pb_frame;     ///< PB-frame mode (0 = none, 1 = base, 2 = improved)
@@ -187,7 +191,7 @@ typedef struct MpegEncContext {
     uint8_t *coded_block_base;
     uint8_t *coded_block;          ///< used for coded block pattern prediction (msmpeg4v3, wmv1)
     int16_t (*ac_val_base)[16];
-    int16_t (*ac_val[3])[16];      ///< used for for MPEG-4 AC prediction, all 3 arrays must be continuous
+    int16_t (*ac_val[3])[16];      ///< used for MPEG-4 AC prediction, all 3 arrays must be continuous
     int mb_skipped;                ///< MUST BE SET only during DECODING
     uint8_t *mbskip_table;        /**< used to avoid copy if macroblock skipped (for black regions for example)
                                    and used for B-frame encoding & decoding (contains skip table of next P-frame) */
@@ -204,11 +208,14 @@ typedef struct MpegEncContext {
     int *lambda_table;
     int adaptive_quant;         ///< use adaptive quantization
     int dquant;                 ///< qscale difference to prev qscale
+    int closed_gop;             ///< MPEG1/2 GOP is closed
     int pict_type;              ///< AV_PICTURE_TYPE_I, AV_PICTURE_TYPE_P, AV_PICTURE_TYPE_B, ...
+    int vbv_delay;
     int last_pict_type; //FIXME removes
     int last_non_b_pict_type;   ///< used for MPEG-4 gmc B-frames & ratecontrol
     int droppable;
     int frame_rate_index;
+    AVRational mpeg2_frame_rate_ext;
     int last_lambda_for[5];     ///< last lambda for a specific pict type
     int skipdct;                ///< skip dct and code zero residual
 
@@ -218,6 +225,7 @@ typedef struct MpegEncContext {
 
     BlockDSPContext bdsp;
     FDCTDSPContext fdsp;
+    H264ChromaContext h264chroma;
     HpelDSPContext hdsp;
     IDCTDSPContext idsp;
     MECmpContext mecc;
@@ -293,6 +301,7 @@ typedef struct MpegEncContext {
     uint16_t chroma_intra_matrix[64];
     uint16_t inter_matrix[64];
     uint16_t chroma_inter_matrix[64];
+    int force_duplicated_matrix; ///< Force duplication of mjpeg matrices, useful for rtp streaming
 
     int intra_quant_bias;    ///< bias for the quantizer
     int inter_quant_bias;    ///< bias for the quantizer
@@ -301,18 +310,22 @@ typedef struct MpegEncContext {
     int ac_esc_length;       ///< num of bits needed to encode the longest esc
     uint8_t *intra_ac_vlc_length;
     uint8_t *intra_ac_vlc_last_length;
+    uint8_t *intra_chroma_ac_vlc_length;
+    uint8_t *intra_chroma_ac_vlc_last_length;
     uint8_t *inter_ac_vlc_length;
     uint8_t *inter_ac_vlc_last_length;
     uint8_t *luma_dc_vlc_length;
 #define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level))
 
-    int coded_score[8];
+    int coded_score[12];
 
     /** precomputed matrix (combine qscale and DCT renorm) */
     int (*q_intra_matrix)[64];
+    int (*q_chroma_intra_matrix)[64];
     int (*q_inter_matrix)[64];
     /** identical to the above but for MMX & these are not permutated, second 64 entries are bias*/
     uint16_t (*q_intra_matrix16)[2][64];
+    uint16_t (*q_chroma_intra_matrix16)[2][64];
     uint16_t (*q_inter_matrix16)[2][64];
 
     /* noise reduction */
@@ -323,6 +336,7 @@ typedef struct MpegEncContext {
     /* bit rate control */
     int64_t total_bits;
     int frame_bits;                ///< bits used for the current frame
+    int stuffing_bits;             ///< bits used for stuffing
     int next_lambda;               ///< next lambda used for retrying to encode a frame
     RateControlContext rc_context; ///< contains stuff only accessed in ratecontrol.c
 
@@ -342,7 +356,6 @@ typedef struct MpegEncContext {
     int resync_mb_x;                 ///< x position of last resync marker
     int resync_mb_y;                 ///< y position of last resync marker
     GetBitContext last_resync_gb;    ///< used to search for the next resync marker
-    BitstreamContext last_resync_bc; ///< used to search for the next resync marker
     int mb_num_left;                 ///< number of MBs left in this video packet (for partitioned Slices only)
     int next_p_frame_damaged;        ///< set if the next p frame is damaged, to avoid showing trashed B-frames
 
@@ -355,6 +368,8 @@ typedef struct MpegEncContext {
     int prev_mb_info, last_mb_info;
     uint8_t *mb_info_ptr;
     int mb_info_size;
+    int ehc_mode;
+    int rc_strategy;                ///< deprecated
 
     /* H.263+ specific */
     int umvplus;                    ///< == H.263+ && unrestricted_mv
@@ -366,6 +381,8 @@ typedef struct MpegEncContext {
     int custom_pcf;
 
     /* MPEG-4 specific */
+    int studio_profile;
+    int dct_precision;
     ///< number of bits to represent the fractional part of time (encoder only)
     int time_increment_bits;
     int last_time_base;
@@ -405,7 +422,9 @@ typedef struct MpegEncContext {
 
     /* MJPEG specific */
     struct MJpegContext *mjpeg_ctx;
+    int esc_pos;
     int pred;
+    int huffman;
 
     /* MSMPEG4 specific */
     int mv_table_index;
@@ -427,7 +446,6 @@ typedef struct MpegEncContext {
 
     /* decompression specific */
     GetBitContext gb;
-    BitstreamContext bc;
 
     /* MPEG-1 specific */
     int gop_picture_number;  ///< index of the first picture of a GOP based on fake_pic_num & MPEG-1 specific
@@ -450,11 +468,20 @@ typedef struct MpegEncContext {
     int brd_scale;
     int intra_vlc_format;
     int alternate_scan;
+    int seq_disp_ext;
+    int video_format;
+#define VIDEO_FORMAT_COMPONENT   0
+#define VIDEO_FORMAT_PAL         1
+#define VIDEO_FORMAT_NTSC        2
+#define VIDEO_FORMAT_SECAM       3
+#define VIDEO_FORMAT_MAC         4
+#define VIDEO_FORMAT_UNSPECIFIED 5
     int repeat_first_field;
     int chroma_420_type;
     int chroma_format;
 #define CHROMA_420 1
 #define CHROMA_422 2
+#define CHROMA_444 3
     int chroma_x_shift;//depend on pix_format, that depend on chroma_format
     int chroma_y_shift;
 
@@ -469,12 +496,22 @@ typedef struct MpegEncContext {
     int rtp_mode;
     int rtp_payload_size;
 
+    char *tc_opt_str;        ///< timecode option string
+    AVTimecode tc;           ///< timecode context
+
     uint8_t *ptr_lastgob;
+    int swap_uv;             //vcr2 codec is an MPEG-2 variant with U and V swapped
+    int pack_pblocks;        //xvmc needs to keep blocks without gaps.
     int16_t (*pblocks[12])[64];
 
     int16_t (*block)[64]; ///< points to one of the following blocks
     int16_t (*blocks)[12][64]; // for HQ mode we need to keep the best block
-    int (*decode_mb)(struct MpegEncContext *s, int16_t block[6][64]); // used by some codecs to avoid a switch()
+    int (*decode_mb)(struct MpegEncContext *s, int16_t block[12][64]); // used by some codecs to avoid a switch()
+
+    int32_t (*block32)[12][64];
+    int dpcm_direction;          // 0 = DCT, 1 = DPCM top to bottom scan, -1 = DPCM bottom to top scan
+    int16_t (*dpcm_macroblock)[3][256];
+
 #define SLICE_OK         0
 #define SLICE_ERROR     -1
 #define SLICE_END       -2 ///<end marker found
@@ -514,6 +551,7 @@ typedef struct MpegEncContext {
     float rc_buffer_aggressivity;
     float border_masking;
     int lmin, lmax;
+    int vbv_ignore_qmax;
 
     char *rc_eq;
 
@@ -565,9 +603,12 @@ typedef struct MpegEncContext {
 { "nsse",   "Noise preserving sum of squared differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_NSSE }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
 { "dct264", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCT264 }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
 { "dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
-{ "chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }
+{ "chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
+{ "msad",   "Sum of absolute differences, median predicted", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_MEDIAN_SAD }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }
 
+#ifndef FF_MPV_OFFSET
 #define FF_MPV_OFFSET(x) offsetof(MpegEncContext, x)
+#endif
 #define FF_MPV_OPT_FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 #define FF_MPV_COMMON_OPTS \
 FF_MPV_OPT_CMP_FUNC, \
@@ -601,10 +642,14 @@ FF_MPV_OPT_CMP_FUNC, \
 {"lmax", "maximum Lagrange factor (VBR)",                           FF_MPV_OFFSET(lmax), AV_OPT_TYPE_INT, {.i64 = 31*FF_QP2LAMBDA }, 0, INT_MAX, FF_MPV_OPT_FLAGS },            \
 {"ibias", "intra quant bias",                                       FF_MPV_OFFSET(intra_quant_bias), AV_OPT_TYPE_INT, {.i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS },   \
 {"pbias", "inter quant bias",                                       FF_MPV_OFFSET(inter_quant_bias), AV_OPT_TYPE_INT, {.i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS },   \
-{"motion_est", "motion estimation algorithm",                       FF_MPV_OFFSET(motion_est), AV_OPT_TYPE_INT, {.i64 = FF_ME_EPZS }, FF_ME_ZERO, FF_ME_XONE, FF_MPV_OPT_FLAGS, "motion_est" }, \
+{"rc_strategy", "ratecontrol method",                               FF_MPV_OFFSET(rc_strategy), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, FF_MPV_OPT_FLAGS | AV_OPT_FLAG_DEPRECATED, "rc_strategy" },   \
+    { "ffmpeg", "deprecated, does nothing", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FF_MPV_OPT_FLAGS | AV_OPT_FLAG_DEPRECATED, "rc_strategy" }, \
+    { "xvid",   "deprecated, does nothing", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FF_MPV_OPT_FLAGS | AV_OPT_FLAG_DEPRECATED, "rc_strategy" }, \
+{"motion_est", "motion estimation algorithm",                       FF_MPV_OFFSET(motion_est), AV_OPT_TYPE_INT, {.i64 = FF_ME_EPZS }, FF_ME_ZERO, FF_ME_XONE, FF_MPV_OPT_FLAGS, "motion_est" },   \
 { "zero", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_ZERO }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" }, \
 { "epzs", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_EPZS }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" }, \
 { "xone", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_XONE }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" }, \
+{ "force_duplicated_matrix", "Always write luma and chroma matrix for mjpeg, useful for rtp streaming.", FF_MPV_OFFSET(force_duplicated_matrix), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, FF_MPV_OPT_FLAGS },   \
 {"b_strategy", "Strategy to choose between I/P/B-frames",           FF_MPV_OFFSET(b_frame_strategy), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 2, FF_MPV_OPT_FLAGS }, \
 {"b_sensitivity", "Adjust sensitivity of b_frame_strategy 1",       FF_MPV_OFFSET(b_sensitivity), AV_OPT_TYPE_INT, {.i64 = 40 }, 1, INT_MAX, FF_MPV_OPT_FLAGS }, \
 {"brd_scale", "Downscale frames for dynamic B-frame decision",      FF_MPV_OFFSET(brd_scale), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 3, FF_MPV_OPT_FLAGS }, \
@@ -628,17 +673,22 @@ extern const AVOption ff_mpv_generic_options[];
  */
 void ff_mpv_common_defaults(MpegEncContext *s);
 
+void ff_dct_encode_init_x86(MpegEncContext *s);
+
 int ff_mpv_common_init(MpegEncContext *s);
 void ff_mpv_common_init_arm(MpegEncContext *s);
+void ff_mpv_common_init_axp(MpegEncContext *s);
 void ff_mpv_common_init_neon(MpegEncContext *s);
 void ff_mpv_common_init_ppc(MpegEncContext *s);
 void ff_mpv_common_init_x86(MpegEncContext *s);
+void ff_mpv_common_init_mips(MpegEncContext *s);
 
 int ff_mpv_common_frame_size_change(MpegEncContext *s);
 void ff_mpv_common_end(MpegEncContext *s);
 
 void ff_mpv_decode_defaults(MpegEncContext *s);
-void ff_mpv_decode_mb(MpegEncContext *s, int16_t block[12][64]);
+void ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx);
+void ff_mpv_reconstruct_mb(MpegEncContext *s, int16_t block[12][64]);
 void ff_mpv_report_decode_progress(MpegEncContext *s);
 
 int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx);
@@ -650,11 +700,16 @@ void ff_mpv_encode_init_x86(MpegEncContext *s);
 int ff_mpv_encode_end(AVCodecContext *avctx);
 int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
                           const AVFrame *frame, int *got_packet);
+int ff_mpv_reallocate_putbitbuffer(MpegEncContext *s, size_t threshold, size_t size_increase);
 
 void ff_clean_intra_table_entries(MpegEncContext *s);
 void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h);
 void ff_mpeg_flush(AVCodecContext *avctx);
-void ff_print_debug_info(MpegEncContext *s, Picture *p);
+
+void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict);
+
+int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type);
+
 void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
 
 int ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src);
@@ -662,10 +717,12 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst, const AVCodecContext *src
 void ff_set_qscale(MpegEncContext * s, int qscale);
 
 void ff_mpv_idct_init(MpegEncContext *s);
+int ff_dct_encode_init(MpegEncContext *s);
 void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
                        const uint16_t *quant_matrix, int bias, int qmin, int qmax, int intra);
 int ff_dct_quantize_c(MpegEncContext *s, int16_t *block, int n, int qscale, int *overflow);
-
+void ff_block_permute(int16_t *block, uint8_t *permutation,
+                      const uint8_t *scantable, int last);
 void ff_init_block_index(MpegEncContext *s);
 
 void ff_mpv_motion(MpegEncContext *s,
@@ -676,7 +733,8 @@ void ff_mpv_motion(MpegEncContext *s,
                    qpel_mc_func (*qpix_op)[16]);
 
 static inline void ff_update_block_index(MpegEncContext *s){
-    const int block_size = 8;
+    const int bytes_per_pixel = 1 + (s->avctx->bits_per_raw_sample > 8);
+    const int block_size= (8*bytes_per_pixel) >> s->avctx->lowres;
 
     s->block_index[0]+=2;
     s->block_index[1]+=2;
@@ -685,8 +743,8 @@ static inline void ff_update_block_index(MpegEncContext *s){
     s->block_index[4]++;
     s->block_index[5]++;
     s->dest[0]+= 2*block_size;
-    s->dest[1]+= block_size;
-    s->dest[2]+= block_size;
+    s->dest[1]+= (2 >> s->chroma_x_shift) * block_size;
+    s->dest[2]+= (2 >> s->chroma_x_shift) * block_size;
 }
 
 static inline int get_bits_diff(MpegEncContext *s){
@@ -698,4 +756,13 @@ static inline int get_bits_diff(MpegEncContext *s){
     return bits - last;
 }
 
+static inline int mpeg_get_qscale(MpegEncContext *s)
+{
+    int qscale = get_bits(&s->gb, 5);
+    if (s->q_scale_type)
+        return ff_mpeg2_non_linear_qscale[qscale];
+    else
+        return qscale << 1;
+}
+
 #endif /* AVCODEC_MPEGVIDEO_H */
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 84de157..ae3b131 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -5,23 +5,27 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/*
+ * non linear quantizers with large QPs and VBV with restrictive qmin fixes sponsored by NOA GmbH
+ */
+
 /**
  * @file
  * The simplest mpeg encoder (well, it was the simplest!).
@@ -60,12 +64,14 @@
 #include "bytestream.h"
 #include "wmv2.h"
 #include "rv10.h"
+#include "libxvid.h"
 #include <limits.h>
+#include "sp5x.h"
 
 #define QUANT_BIAS_SHIFT 8
 
 #define QMAT_SHIFT_MMX 16
-#define QMAT_SHIFT 22
+#define QMAT_SHIFT 21
 
 static int encode_picture(MpegEncContext *s, int picture_number);
 static int dct_quantize_refine(MpegEncContext *s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
@@ -73,7 +79,7 @@ static int sse_mb(MpegEncContext *s);
 static void denoise_dct_c(MpegEncContext *s, int16_t *block);
 static int dct_quantize_trellis_c(MpegEncContext *s, int16_t *block, int n, int qscale, int *overflow);
 
-static uint8_t default_mv_penalty[MAX_FCODE + 1][MAX_MV * 2 + 1];
+static uint8_t default_mv_penalty[MAX_FCODE + 1][MAX_DMV * 2 + 1];
 static uint8_t default_fcode_tab[MAX_MV * 2 + 1];
 
 const AVOption ff_mpv_generic_options[] = {
@@ -92,6 +98,11 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
 
     for (qscale = qmin; qscale <= qmax; qscale++) {
         int i;
+        int qscale2;
+
+        if (s->q_scale_type) qscale2 = ff_mpeg2_non_linear_qscale[qscale];
+        else                 qscale2 = qscale << 1;
+
         if (fdsp->fdct == ff_jpeg_fdct_islow_8  ||
 #if CONFIG_FAANDCT
             fdsp->fdct == ff_faandct            ||
@@ -99,46 +110,46 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
             fdsp->fdct == ff_jpeg_fdct_islow_10) {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = (int64_t) qscale * quant_matrix[j];
+                int64_t den = (int64_t) qscale2 * quant_matrix[j];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
                  * (1 << 36) / 19952 >= (1 << 36) / (x) >= (1 << 36) / 249205026
                  *           3444240 >= (1 << 36) / (x) >= 275 */
 
-                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << QMAT_SHIFT) / den);
             }
         } else if (fdsp->fdct == ff_fdct_ifast) {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = ff_aanscales[i] * (int64_t) qscale * quant_matrix[j];
+                int64_t den = ff_aanscales[i] * (int64_t) qscale2 * quant_matrix[j];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
                  * (1 << 36) / 19952 >= (1 << 36) / (x) >= (1 << 36) / 249205026
                  *           3444240 >= (1 << 36) / (x) >= 275 */
 
-                qmat[qscale][i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 14)) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << (QMAT_SHIFT + 14)) / den);
             }
         } else {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = (int64_t) qscale * quant_matrix[j];
+                int64_t den = (int64_t) qscale2 * quant_matrix[j];
                 /* We can safely suppose that 16 <= quant_matrix[i] <= 255
                  * Assume x = qscale * quant_matrix[i]
                  * So             16 <=              x  <= 7905
                  * so (1 << 19) / 16 >= (1 << 19) / (x) >= (1 << 19) / 7905
                  * so          32768 >= (1 << 19) / (x) >= 67 */
-                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << QMAT_SHIFT) / den);
                 //qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) /
                 //                    (qscale * quant_matrix[i]);
-                qmat16[qscale][0][i] = (1 << QMAT_SHIFT_MMX) / den;
+                qmat16[qscale][0][i] = (2 << QMAT_SHIFT_MMX) / den;
 
                 if (qmat16[qscale][0][i] == 0 ||
                     qmat16[qscale][0][i] == 128 * 256)
                     qmat16[qscale][0][i] = 128 * 256 - 1;
                 qmat16[qscale][1][i] =
-                    ROUNDED_DIV(bias << (16 - QUANT_BIAS_SHIFT),
+                    ROUNDED_DIV(bias * (1<<(16 - QUANT_BIAS_SHIFT)),
                                 qmat16[qscale][0][i]);
             }
         }
@@ -162,9 +173,27 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
 
 static inline void update_qscale(MpegEncContext *s)
 {
-    s->qscale = (s->lambda * 139 + FF_LAMBDA_SCALE * 64) >>
-                (FF_LAMBDA_SHIFT + 7);
-    s->qscale = av_clip(s->qscale, s->avctx->qmin, s->avctx->qmax);
+    if (s->q_scale_type == 1 && 0) {
+        int i;
+        int bestdiff=INT_MAX;
+        int best = 1;
+
+        for (i = 0 ; i<FF_ARRAY_ELEMS(ff_mpeg2_non_linear_qscale); i++) {
+            int diff = FFABS((ff_mpeg2_non_linear_qscale[i]<<(FF_LAMBDA_SHIFT + 6)) - (int)s->lambda * 139);
+            if (ff_mpeg2_non_linear_qscale[i] < s->avctx->qmin ||
+                (ff_mpeg2_non_linear_qscale[i] > s->avctx->qmax && !s->vbv_ignore_qmax))
+                continue;
+            if (diff < bestdiff) {
+                bestdiff = diff;
+                best = i;
+            }
+        }
+        s->qscale = best;
+    } else {
+        s->qscale = (s->lambda * 139 + FF_LAMBDA_SCALE * 64) >>
+                    (FF_LAMBDA_SHIFT + 7);
+        s->qscale = av_clip(s->qscale, s->avctx->qmin, s->vbv_ignore_qmax ? 31 : s->avctx->qmax);
+    }
 
     s->lambda2 = (s->lambda * s->lambda + FF_LAMBDA_SCALE / 2) >>
                  FF_LAMBDA_SHIFT;
@@ -237,6 +266,24 @@ static void mpv_encode_defaults(MpegEncContext *s)
     s->picture_in_gop_number = 0;
 }
 
+av_cold int ff_dct_encode_init(MpegEncContext *s)
+{
+    if (ARCH_X86)
+        ff_dct_encode_init_x86(s);
+
+    if (CONFIG_H263_ENCODER)
+        ff_h263dsp_init(&s->h263dsp);
+    if (!s->dct_quantize)
+        s->dct_quantize = ff_dct_quantize_c;
+    if (!s->denoise_dct)
+        s->denoise_dct  = denoise_dct_c;
+    s->fast_dct_quantize = s->dct_quantize;
+    if (s->avctx->trellis)
+        s->dct_quantize  = dct_quantize_trellis_c;
+
+    return 0;
+}
+
 /* init video encoder */
 av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
 {
@@ -256,18 +303,22 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
         }
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         format_supported = 0;
         /* JPEG color space */
         if (avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
             avctx->pix_fmt == AV_PIX_FMT_YUVJ422P ||
+            avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
             (avctx->color_range == AVCOL_RANGE_JPEG &&
              (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-              avctx->pix_fmt == AV_PIX_FMT_YUV422P)))
+              avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+              avctx->pix_fmt == AV_PIX_FMT_YUV444P)))
             format_supported = 1;
         /* MPEG color space */
         else if (avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL &&
                  (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-                  avctx->pix_fmt == AV_PIX_FMT_YUV422P))
+                  avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+                  avctx->pix_fmt == AV_PIX_FMT_YUV444P))
             format_supported = 1;
 
         if (!format_supported) {
@@ -283,6 +334,10 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
     }
 
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUVJ444P:
+    case AV_PIX_FMT_YUV444P:
+        s->chroma_format = CHROMA_444;
+        break;
     case AV_PIX_FMT_YUVJ422P:
     case AV_PIX_FMT_YUV422P:
         s->chroma_format = CHROMA_422;
@@ -294,6 +349,8 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
         break;
     }
 
+    avctx->bits_per_raw_sample = av_clip(avctx->bits_per_raw_sample, 0, 8);
+
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->rtp_payload_size)
@@ -310,8 +367,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->height   = avctx->height;
     if (avctx->gop_size > 600 &&
         avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Warning keyframe interval too large! reducing it ...\n");
+        av_log(avctx, AV_LOG_WARNING,
+               "keyframe interval too large!, reducing it from %d to %d\n",
+               avctx->gop_size, 600);
         avctx->gop_size = 600;
     }
     s->gop_size     = avctx->gop_size;
@@ -319,6 +377,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->max_b_frames > MAX_B_FRAMES) {
         av_log(avctx, AV_LOG_ERROR, "Too many B-frames requested, maximum "
                "is %d.\n", MAX_B_FRAMES);
+        avctx->max_b_frames = MAX_B_FRAMES;
     }
     s->max_b_frames = avctx->max_b_frames;
     s->codec_id     = avctx->codec->id;
@@ -326,6 +385,27 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->quarter_sample     = (avctx->flags & AV_CODEC_FLAG_QPEL) != 0;
     s->rtp_mode           = !!s->rtp_payload_size;
     s->intra_dc_precision = avctx->intra_dc_precision;
+
+    // workaround some differences between how applications specify dc precision
+    if (s->intra_dc_precision < 0) {
+        s->intra_dc_precision += 8;
+    } else if (s->intra_dc_precision >= 8)
+        s->intra_dc_precision -= 8;
+
+    if (s->intra_dc_precision < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+                "intra dc precision must be positive, note some applications use"
+                " 0 and some 8 as base meaning 8bit, the value must not be smaller than that\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->codec_id == AV_CODEC_ID_AMV || (avctx->active_thread_type & FF_THREAD_SLICE))
+        s->huffman = 0;
+
+    if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 3 : 0)) {
+        av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n");
+        return AVERROR(EINVAL);
+    }
     s->user_specified_pts = AV_NOPTS_VALUE;
 
     if (s->gop_size <= 1) {
@@ -350,9 +430,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->loop_filter = !!(s->avctx->flags & AV_CODEC_FLAG_LOOP_FILTER);
 
     if (avctx->rc_max_rate && !avctx->rc_buffer_size) {
-        av_log(avctx, AV_LOG_ERROR,
-               "a vbv buffer size is needed, "
-               "for encoding with a maximum bitrate\n");
+        switch(avctx->codec_id) {
+        case AV_CODEC_ID_MPEG1VIDEO:
+        case AV_CODEC_ID_MPEG2VIDEO:
+            avctx->rc_buffer_size = FFMAX(avctx->rc_max_rate, 15000000) * 112LL / 15000000 * 16384;
+            break;
+        case AV_CODEC_ID_MPEG4:
+        case AV_CODEC_ID_MSMPEG4V1:
+        case AV_CODEC_ID_MSMPEG4V2:
+        case AV_CODEC_ID_MSMPEG4V3:
+            if       (avctx->rc_max_rate >= 15000000) {
+                avctx->rc_buffer_size = 320 + (avctx->rc_max_rate - 15000000LL) * (760-320) / (38400000 - 15000000);
+            } else if(avctx->rc_max_rate >=  2000000) {
+                avctx->rc_buffer_size =  80 + (avctx->rc_max_rate -  2000000LL) * (320- 80) / (15000000 -  2000000);
+            } else if(avctx->rc_max_rate >=   384000) {
+                avctx->rc_buffer_size =  40 + (avctx->rc_max_rate -   384000LL) * ( 80- 40) / ( 2000000 -   384000);
+            } else
+                avctx->rc_buffer_size = 40;
+            avctx->rc_buffer_size *= 16384;
+            break;
+        }
+        if (avctx->rc_buffer_size) {
+            av_log(avctx, AV_LOG_INFO, "Automatically choosing VBV buffer size of %d kbyte\n", avctx->rc_buffer_size/8192);
+        }
+    }
+
+    if ((!avctx->rc_max_rate) != (!avctx->rc_buffer_size)) {
+        av_log(avctx, AV_LOG_ERROR, "Either both buffer size and max rate or neither must be specified\n");
         return -1;
     }
 
@@ -367,7 +471,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (avctx->rc_max_rate && avctx->rc_max_rate < avctx->bit_rate) {
-        av_log(avctx, AV_LOG_INFO, "bitrate above max bitrate\n");
+        av_log(avctx, AV_LOG_ERROR, "bitrate above max bitrate\n");
         return -1;
     }
 
@@ -388,9 +492,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (!s->fixed_qscale &&
         avctx->bit_rate * av_q2d(avctx->time_base) >
             avctx->bit_rate_tolerance) {
-        av_log(avctx, AV_LOG_ERROR,
-               "bitrate tolerance too small for bitrate\n");
-        return -1;
+        av_log(avctx, AV_LOG_WARNING,
+               "bitrate tolerance %d too small for bitrate %"PRId64", overriding\n", avctx->bit_rate_tolerance, avctx->bit_rate);
+        avctx->bit_rate_tolerance = 5 * avctx->bit_rate * av_q2d(avctx->time_base);
     }
 
     if (s->avctx->rc_max_rate &&
@@ -429,18 +533,74 @@ FF_ENABLE_DEPRECATION_WARNINGS
         av_log(avctx, AV_LOG_ERROR, "B-frames not supported by codec\n");
         return -1;
     }
+    if (s->max_b_frames < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "max b frames must be 0 or positive for mpegvideo based encoders\n");
+        return -1;
+    }
 
     if ((s->codec_id == AV_CODEC_ID_MPEG4 ||
          s->codec_id == AV_CODEC_ID_H263  ||
          s->codec_id == AV_CODEC_ID_H263P) &&
         (avctx->sample_aspect_ratio.num > 255 ||
          avctx->sample_aspect_ratio.den > 255)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid pixel aspect ratio %i/%i, limit is 255/255\n",
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid pixel aspect ratio %i/%i, limit is 255/255 reducing\n",
                avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den);
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den, 255);
+    }
+
+    if ((s->codec_id == AV_CODEC_ID_H263  ||
+         s->codec_id == AV_CODEC_ID_H263P) &&
+        (avctx->width  > 2048 ||
+         avctx->height > 1152 )) {
+        av_log(avctx, AV_LOG_ERROR, "H.263 does not support resolutions above 2048x1152\n");
+        return -1;
+    }
+    if ((s->codec_id == AV_CODEC_ID_H263  ||
+         s->codec_id == AV_CODEC_ID_H263P) &&
+        ((avctx->width &3) ||
+         (avctx->height&3) )) {
+        av_log(avctx, AV_LOG_ERROR, "w/h must be a multiple of 4\n");
+        return -1;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
+        (avctx->width  > 4095 ||
+         avctx->height > 4095 )) {
+        av_log(avctx, AV_LOG_ERROR, "MPEG-1 does not support resolutions above 4095x4095\n");
+        return -1;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO &&
+        (avctx->width  > 16383 ||
+         avctx->height > 16383 )) {
+        av_log(avctx, AV_LOG_ERROR, "MPEG-2 does not support resolutions above 16383x16383\n");
         return -1;
     }
 
+    if (s->codec_id == AV_CODEC_ID_RV10 &&
+        (avctx->width &15 ||
+         avctx->height&15 )) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be a multiple of 16\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->codec_id == AV_CODEC_ID_RV20 &&
+        (avctx->width &3 ||
+         avctx->height&3 )) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be a multiple of 4\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((s->codec_id == AV_CODEC_ID_WMV1 ||
+         s->codec_id == AV_CODEC_ID_WMV2) &&
+         avctx->width & 1) {
+         av_log(avctx, AV_LOG_ERROR, "width must be multiple of 2\n");
+         return -1;
+    }
+
     if ((s->avctx->flags & (AV_CODEC_FLAG_INTERLACED_DCT | AV_CODEC_FLAG_INTERLACED_ME)) &&
         s->codec_id != AV_CODEC_ID_MPEG4 && s->codec_id != AV_CODEC_ID_MPEG2VIDEO) {
         av_log(avctx, AV_LOG_ERROR, "interlacing not supported by codec\n");
@@ -455,7 +615,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     // FIXME mpeg2 uses that too
-    if (s->mpeg_quant && s->codec_id != AV_CODEC_ID_MPEG4) {
+    if (s->mpeg_quant && (   s->codec_id != AV_CODEC_ID_MPEG4
+                          && s->codec_id != AV_CODEC_ID_MPEG2VIDEO)) {
         av_log(avctx, AV_LOG_ERROR,
                "mpeg2 style quantization not supported by codec\n");
         return -1;
@@ -472,6 +633,15 @@ FF_ENABLE_DEPRECATION_WARNINGS
         return -1;
     }
 
+    if ((s->mpv_flags & FF_MPV_FLAG_QP_RD) &&
+            (s->codec_id == AV_CODEC_ID_AMV ||
+             s->codec_id == AV_CODEC_ID_MJPEG)) {
+        // Used to produce garbage with MJPEG.
+        av_log(avctx, AV_LOG_ERROR,
+               "QP RD is no longer compatible with MJPEG or AMV\n");
+        return -1;
+    }
+
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->scenechange_threshold)
@@ -488,9 +658,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY) {
-        if (s->codec_id != AV_CODEC_ID_MPEG2VIDEO) {
+        if (s->codec_id != AV_CODEC_ID_MPEG2VIDEO &&
+            s->strict_std_compliance >= FF_COMPLIANCE_NORMAL) {
             av_log(avctx, AV_LOG_ERROR,
-                  "low delay forcing is only available for mpeg2\n");
+                   "low delay forcing is only available for mpeg2, "
+                   "set strict_std_compliance to 'unofficial' or lower in order to allow it\n");
             return -1;
         }
         if (s->max_b_frames != 0) {
@@ -501,9 +673,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (s->q_scale_type == 1) {
-        if (avctx->qmax > 12) {
+        if (avctx->qmax > 28) {
             av_log(avctx, AV_LOG_ERROR,
-                   "non linear quant only supports qmax <= 12 currently\n");
+                   "non linear quant only supports qmax <= 28 currently\n");
             return -1;
         }
     }
@@ -518,6 +690,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->codec_id != AV_CODEC_ID_MPEG4      &&
         s->codec_id != AV_CODEC_ID_MPEG1VIDEO &&
         s->codec_id != AV_CODEC_ID_MPEG2VIDEO &&
+        s->codec_id != AV_CODEC_ID_MJPEG      &&
         (s->codec_id != AV_CODEC_ID_H263P)) {
         av_log(avctx, AV_LOG_ERROR,
                "multi threaded encoding not supported by codec\n");
@@ -526,7 +699,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (s->avctx->thread_count < 1) {
         av_log(avctx, AV_LOG_ERROR,
-               "automatic thread number detection not supported by codec,"
+               "automatic thread number detection not supported by codec, "
                "patch welcome\n");
         return -1;
     }
@@ -559,8 +732,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         //return -1;
     }
 
-    if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG1VIDEO ||
-        s->codec_id == AV_CODEC_ID_MPEG2VIDEO || s->codec_id == AV_CODEC_ID_MJPEG) {
+    if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG1VIDEO || s->codec_id == AV_CODEC_ID_MPEG2VIDEO || s->codec_id == AV_CODEC_ID_MJPEG || s->codec_id==AV_CODEC_ID_AMV) {
         // (a + x * 3 / 8) / x
         s->intra_quant_bias = 3 << (QUANT_BIAS_SHIFT - 3);
         s->inter_quant_bias = 0;
@@ -570,6 +742,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->inter_quant_bias = -(1 << (QUANT_BIAS_SHIFT - 2));
     }
 
+    if (avctx->qmin > avctx->qmax || avctx->qmin <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "qmin and or qmax are invalid, they must be 0 < min <= max\n");
+        return AVERROR(EINVAL);
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "intra_quant_bias = %d inter_quant_bias = %d\n",s->intra_quant_bias,s->inter_quant_bias);
+
     if (avctx->codec_id == AV_CODEC_ID_MPEG4 &&
         s->avctx->time_base.den > (1 << 16) - 1) {
         av_log(avctx, AV_LOG_ERROR,
@@ -594,6 +773,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->rtp_mode   = 1;
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         s->out_format = FMT_MJPEG;
         s->intra_only = 1; /* force intra only for jpeg */
         if (!CONFIG_MJPEG_ENCODER ||
@@ -619,13 +799,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     case AV_CODEC_ID_H263:
         if (!CONFIG_H263_ENCODER)
-        return -1;
+            return -1;
         if (ff_match_2uint16(ff_h263_format, FF_ARRAY_ELEMS(ff_h263_format),
                              s->width, s->height) == 8) {
-            av_log(avctx, AV_LOG_INFO,
+            av_log(avctx, AV_LOG_ERROR,
                    "The specified picture size of %dx%d is not valid for "
                    "the H.263 codec.\nValid sizes are 128x96, 176x144, "
-                   "352x288, 704x576, and 1408x1152."
+                   "352x288, 704x576, and 1408x1152. "
                    "Try H.263+.\n", s->width, s->height);
             return -1;
         }
@@ -737,9 +917,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (ff_mpv_common_init(s) < 0)
         return -1;
 
-    if (ARCH_X86)
-        ff_mpv_encode_init_x86(s);
-
     ff_fdctdsp_init(&s->fdsp, avctx);
     ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
@@ -754,8 +931,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     FF_ALLOCZ_OR_GOTO(s->avctx, s->avctx->stats_out, 256, fail);
 
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_intra_matrix,   64 * 32 * sizeof(int), fail);
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->q_chroma_intra_matrix, 64 * 32 * sizeof(int), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_inter_matrix,   64 * 32 * sizeof(int), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_intra_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->q_chroma_intra_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_inter_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->input_picture,
                       MAX_PICTURE_COUNT * sizeof(Picture *), fail);
@@ -768,15 +947,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                           2 * 64 * sizeof(uint16_t), fail);
     }
 
-    if (CONFIG_H263_ENCODER)
-        ff_h263dsp_init(&s->h263dsp);
-    if (!s->dct_quantize)
-        s->dct_quantize = ff_dct_quantize_c;
-    if (!s->denoise_dct)
-        s->denoise_dct  = denoise_dct_c;
-    s->fast_dct_quantize = s->dct_quantize;
-    if (avctx->trellis)
-        s->dct_quantize  = dct_quantize_trellis_c;
+    ff_dct_encode_init(s);
 
     if ((CONFIG_H263P_ENCODER || CONFIG_RV20_ENCODER) && s->modified_quant)
         s->chroma_qscale_table = ff_h263_chroma_qscale_table;
@@ -784,7 +955,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (s->slice_context_count > 1) {
         s->rtp_mode = 1;
 
-        if (avctx->codec_id == AV_CODEC_ID_H263 || avctx->codec_id == AV_CODEC_ID_H263P)
+        if (avctx->codec_id == AV_CODEC_ID_H263P)
             s->h263_slice_structured = 1;
     }
 
@@ -829,6 +1000,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         } else {
             /* MPEG-1/2 */
+            s->chroma_intra_matrix[j] =
             s->intra_matrix[j] = ff_mpeg1_default_intra_matrix[i];
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         }
@@ -898,6 +1070,7 @@ av_cold int ff_mpv_encode_end(AVCodecContext *avctx)
     int i;
 
     ff_rate_control_uninit(s);
+
     ff_mpv_common_end(s);
     if (CONFIG_MJPEG_ENCODER &&
         s->out_format == FMT_MJPEG)
@@ -914,6 +1087,10 @@ av_cold int ff_mpv_encode_end(AVCodecContext *avctx)
     av_freep(&s->avctx->stats_out);
     av_freep(&s->ac_stats);
 
+    if(s->q_chroma_intra_matrix   != s->q_intra_matrix  ) av_freep(&s->q_chroma_intra_matrix);
+    if(s->q_chroma_intra_matrix16 != s->q_intra_matrix16) av_freep(&s->q_chroma_intra_matrix16);
+    s->q_chroma_intra_matrix=   NULL;
+    s->q_chroma_intra_matrix16= NULL;
     av_freep(&s->q_intra_matrix);
     av_freep(&s->q_inter_matrix);
     av_freep(&s->q_intra_matrix16);
@@ -966,7 +1143,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared)
 {
     return ff_alloc_picture(s->avctx, pic, &s->me, &s->sc, shared, 1,
                             s->chroma_x_shift, s->chroma_y_shift, s->out_format,
-                            s->mb_stride, s->mb_height, s->b8_stride,
+                            s->mb_stride, s->mb_width, s->mb_height, s->b8_stride,
                             &s->linesize, &s->uvlinesize);
 }
 
@@ -986,18 +1163,17 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
 
         if (pts != AV_NOPTS_VALUE) {
             if (s->user_specified_pts != AV_NOPTS_VALUE) {
-                int64_t time = pts;
                 int64_t last = s->user_specified_pts;
 
-                if (time <= last) {
+                if (pts <= last) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "Error, Invalid timestamp=%"PRId64", "
-                           "last=%"PRId64"\n", pts, s->user_specified_pts);
-                    return -1;
+                           "Invalid pts (%"PRId64") <= last (%"PRId64")\n",
+                           pts, last);
+                    return AVERROR(EINVAL);
                 }
 
                 if (!s->low_delay && display_picture_number == 1)
-                    s->dts_delta = time - last;
+                    s->dts_delta = pts - last;
             }
             s->user_specified_pts = pts;
         } else {
@@ -1019,8 +1195,12 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
             direct = 0;
         if ((s->width & 15) || (s->height & 15))
             direct = 0;
+        if (((intptr_t)(pic_arg->data[0])) & (STRIDE_ALIGN-1))
+            direct = 0;
+        if (s->linesize & (STRIDE_ALIGN-1))
+            direct = 0;
 
-        ff_dlog(s->avctx, "%d %d %td %td\n", pic_arg->linesize[0],
+        ff_dlog(s->avctx, "%d %d %"PTRDIFF_SPECIFIER" %"PTRDIFF_SPECIFIER"\n", pic_arg->linesize[0],
                 pic_arg->linesize[1], s->linesize, s->uvlinesize);
 
         i = ff_find_unused_picture(s->avctx, s->picture, direct);
@@ -1058,6 +1238,12 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
                     int h = s->height >> v_shift;
                     uint8_t *src = pic_arg->data[i];
                     uint8_t *dst = pic->f->data[i];
+                    int vpad = 16;
+
+                    if (   s->codec_id == AV_CODEC_ID_MPEG2VIDEO
+                        && !s->progressive_sequence
+                        && FFALIGN(s->height, 32) - s->height > 16)
+                        vpad = 32;
 
                     if (!s->avctx->rc_buffer_size)
                         dst += INPLACE_OFFSET;
@@ -1073,14 +1259,15 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
                             src += src_stride;
                         }
                     }
-                    if ((s->width & 15) || (s->height & 15)) {
+                    if ((s->width & 15) || (s->height & (vpad-1))) {
                         s->mpvencdsp.draw_edges(dst, dst_stride,
                                                 w, h,
                                                 16 >> h_shift,
-                                                16 >> v_shift,
+                                                vpad >> v_shift,
                                                 EDGE_BOTTOM);
                     }
                 }
+                emms_c();
             }
         }
         ret = av_frame_copy_props(pic->f, pic_arg);
@@ -1127,19 +1314,23 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref)
                 uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride);
                 int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
 
-                switch (s->frame_skip_exp) {
+                switch (FFABS(s->frame_skip_exp)) {
                 case 0: score    =  FFMAX(score, v);          break;
                 case 1: score   += FFABS(v);                  break;
-                case 2: score   += v * v;                     break;
-                case 3: score64 += FFABS(v * v * (int64_t)v); break;
-                case 4: score64 += v * v * (int64_t)(v * v);  break;
+                case 2: score64 += v * (int64_t)v;                       break;
+                case 3: score64 += FFABS(v * (int64_t)v * v);            break;
+                case 4: score64 += (v * (int64_t)v) * (v * (int64_t)v);  break;
                 }
             }
         }
     }
+    emms_c();
 
     if (score)
         score64 = score;
+    if (s->frame_skip_exp < 0)
+        score64 = pow(score64 / (double)(s->mb_width * s->mb_height),
+                      -1.0/s->frame_skip_exp);
 
     if (score64 < s->frame_skip_threshold)
         return 1;
@@ -1183,7 +1374,7 @@ static int estimate_best_b_count(MpegEncContext *s)
     int best_b_count = -1;
     int ret = 0;
 
-    assert(scale >= 0 && scale <= 3);
+    av_assert0(scale >= 0 && scale <= 3);
 
     //emms_c();
     //s->next_picture_ptr->quality;
@@ -1198,29 +1389,31 @@ static int estimate_best_b_count(MpegEncContext *s)
     for (i = 0; i < s->max_b_frames + 2; i++) {
         Picture pre_input, *pre_input_ptr = i ? s->input_picture[i - 1] :
                                                 s->next_picture_ptr;
+        uint8_t *data[4];
 
         if (pre_input_ptr && (!i || s->input_picture[i - 1])) {
             pre_input = *pre_input_ptr;
+            memcpy(data, pre_input_ptr->f->data, sizeof(data));
 
             if (!pre_input.shared && i) {
-                pre_input.f->data[0] += INPLACE_OFFSET;
-                pre_input.f->data[1] += INPLACE_OFFSET;
-                pre_input.f->data[2] += INPLACE_OFFSET;
+                data[0] += INPLACE_OFFSET;
+                data[1] += INPLACE_OFFSET;
+                data[2] += INPLACE_OFFSET;
             }
 
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0],
                                        s->tmp_frames[i]->linesize[0],
-                                       pre_input.f->data[0],
+                                       data[0],
                                        pre_input.f->linesize[0],
                                        width, height);
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1],
                                        s->tmp_frames[i]->linesize[1],
-                                       pre_input.f->data[1],
+                                       data[1],
                                        pre_input.f->linesize[1],
                                        width >> 1, height >> 1);
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2],
                                        s->tmp_frames[i]->linesize[2],
-                                       pre_input.f->data[2],
+                                       data[2],
                                        pre_input.f->linesize[2],
                                        width >> 1, height >> 1);
         }
@@ -1314,6 +1507,19 @@ static int select_input_picture(MpegEncContext *s)
 
     /* set next picture type & ordering */
     if (!s->reordered_input_picture[0] && s->input_picture[0]) {
+        if (s->frame_skip_threshold || s->frame_skip_factor) {
+            if (s->picture_in_gop_number < s->gop_size &&
+                s->next_picture_ptr &&
+                skip_check(s, s->input_picture[0], s->next_picture_ptr)) {
+                // FIXME check that the gop check above is +-1 correct
+                av_frame_unref(s->input_picture[0]->f);
+
+                ff_vbv_update(s, 0);
+
+                goto no_output_pic;
+            }
+        }
+
         if (/*s->picture_in_gop_number >= s->gop_size ||*/
             !s->next_picture_ptr || s->intra_only) {
             s->reordered_input_picture[0] = s->input_picture[0];
@@ -1323,19 +1529,6 @@ static int select_input_picture(MpegEncContext *s)
         } else {
             int b_frames = 0;
 
-            if (s->frame_skip_threshold || s->frame_skip_factor) {
-                if (s->picture_in_gop_number < s->gop_size &&
-                    skip_check(s, s->input_picture[0], s->next_picture_ptr)) {
-                    // FIXME check that the gop check above is +-1 correct
-                    av_frame_unref(s->input_picture[0]->f);
-
-                    emms_c();
-                    ff_vbv_update(s, 0);
-
-                    goto no_output_pic;
-                }
-            }
-
             if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
                 for (i = 0; i < s->max_b_frames + 1; i++) {
                     int pict_num = s->input_picture[0]->f->display_picture_number + i;
@@ -1482,25 +1675,26 @@ no_output_pic:
 
 static void frame_end(MpegEncContext *s)
 {
-    int i;
-
     if (s->unrestricted_mv &&
         s->current_picture.reference &&
         !s->intra_only) {
         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
         int hshift = desc->log2_chroma_w;
         int vshift = desc->log2_chroma_h;
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[0], s->linesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[0],
+                                s->current_picture.f->linesize[0],
                                 s->h_edge_pos, s->v_edge_pos,
                                 EDGE_WIDTH, EDGE_WIDTH,
                                 EDGE_TOP | EDGE_BOTTOM);
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[1], s->uvlinesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[1],
+                                s->current_picture.f->linesize[1],
                                 s->h_edge_pos >> hshift,
                                 s->v_edge_pos >> vshift,
                                 EDGE_WIDTH >> hshift,
                                 EDGE_WIDTH >> vshift,
                                 EDGE_TOP | EDGE_BOTTOM);
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[2], s->uvlinesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[2],
+                                s->current_picture.f->linesize[2],
                                 s->h_edge_pos >> hshift,
                                 s->v_edge_pos >> vshift,
                                 EDGE_WIDTH >> hshift,
@@ -1515,16 +1709,9 @@ static void frame_end(MpegEncContext *s)
     if (s->pict_type!= AV_PICTURE_TYPE_B)
         s->last_non_b_pict_type = s->pict_type;
 
-    if (s->encoding) {
-        /* release non-reference frames */
-        for (i = 0; i < MAX_PICTURE_COUNT; i++) {
-            if (!s->picture[i].reference)
-                ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
-        }
-    }
-
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
+    av_frame_unref(s->avctx->coded_frame);
     av_frame_copy_props(s->avctx->coded_frame, s->current_picture.f);
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
@@ -1622,35 +1809,13 @@ static int frame_start(MpegEncContext *s)
     }
 
     if (s->dct_error_sum) {
-        assert(s->noise_reduction && s->encoding);
+        av_assert2(s->noise_reduction && s->encoding);
         update_noise_reduction(s);
     }
 
     return 0;
 }
 
-static void write_pass1_stats(MpegEncContext *s)
-{
-    snprintf(s->avctx->stats_out, 256,
-             "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d "
-             "fcode:%d bcode:%d mc-var:%d var:%d icount:%d skipcount:%d "
-             "hbits:%d;\n",
-             s->current_picture_ptr->f->display_picture_number,
-             s->current_picture_ptr->f->coded_picture_number,
-             s->pict_type,
-             s->current_picture.f->quality,
-             s->i_tex_bits,
-             s->p_tex_bits,
-             s->mv_bits,
-             s->misc_bits,
-             s->f_code,
-             s->b_code,
-             s->current_picture.mc_mb_var_sum,
-             s->current_picture.mb_var_sum,
-             s->i_count, s->skip_count,
-             s->header_bits);
-}
-
 int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
                           const AVFrame *pic_arg, int *got_packet)
 {
@@ -1658,6 +1823,8 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
     int i, stuffing_count, ret;
     int context_count = s->slice_context_count;
 
+    s->vbv_ignore_qmax = 0;
+
     s->picture_in_gop_number++;
 
     if (load_input_picture(s, pic_arg) < 0)
@@ -1669,9 +1836,11 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
 
     /* output? */
     if (s->new_picture.f->data[0]) {
-        uint8_t *sd;
-        if (!pkt->data &&
-            (ret = ff_alloc_packet(pkt, s->mb_width*s->mb_height*MAX_MB_BYTES)) < 0)
+        int growing_buffer = context_count == 1 && !pkt->data && !s->data_partitioning;
+        int pkt_size = growing_buffer ? FFMAX(s->mb_width*s->mb_height*64+10000, avctx->internal->byte_buffer_size) - AV_INPUT_BUFFER_PADDING_SIZE
+                                              :
+                                              s->mb_width*s->mb_height*(MAX_MB_BYTES+100)+10000;
+        if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
             return ret;
         if (s->mb_info) {
             s->mb_info_ptr = av_packet_new_side_data(pkt,
@@ -1696,7 +1865,13 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
         if (ret < 0)
             return ret;
 vbv_retry:
-        if (encode_picture(s, s->picture_number) < 0)
+        ret = encode_picture(s, s->picture_number);
+        if (growing_buffer) {
+            av_assert0(s->pb.buf == avctx->internal->byte_buffer);
+            pkt->data = s->pb.buf;
+            pkt->size = avctx->internal->byte_buffer_size;
+        }
+        if (ret < 0)
             return -1;
 
 #if FF_API_STAT_BITS
@@ -1715,28 +1890,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         frame_end(s);
 
-        sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                     sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = s->current_picture.f->quality;
-
         if (CONFIG_MJPEG_ENCODER && s->out_format == FMT_MJPEG)
             ff_mjpeg_encode_picture_trailer(&s->pb, s->header_bits);
 
         if (avctx->rc_buffer_size) {
             RateControlContext *rcc = &s->rc_context;
-            int max_size = rcc->buffer_index * avctx->rc_max_available_vbv_use;
+            int max_size = FFMAX(rcc->buffer_index * avctx->rc_max_available_vbv_use, rcc->buffer_index - 500);
+            int hq = (s->avctx->mb_decision == FF_MB_DECISION_RD || s->avctx->trellis);
+            int min_step = hq ? 1 : (1<<(FF_LAMBDA_SHIFT + 7))/139;
 
             if (put_bits_count(&s->pb) > max_size &&
                 s->lambda < s->lmax) {
-                s->next_lambda = FFMAX(s->lambda + 1, s->lambda *
+                s->next_lambda = FFMAX(s->lambda + min_step, s->lambda *
                                        (s->qscale + 1) / s->qscale);
                 if (s->adaptive_quant) {
                     int i;
                     for (i = 0; i < s->mb_height * s->mb_stride; i++)
                         s->lambda_table[i] =
-                            FFMAX(s->lambda_table[i] + 1,
+                            FFMAX(s->lambda_table[i] + min_step,
                                   s->lambda_table[i] * (s->qscale + 1) /
                                   s->qscale);
                 }
@@ -1756,19 +1927,25 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     PutBitContext *pb = &s->thread_context[i]->pb;
                     init_put_bits(pb, pb->buf, pb->buf_end - pb->buf);
                 }
+                s->vbv_ignore_qmax = 1;
+                av_log(s->avctx, AV_LOG_VERBOSE, "reencoding frame due to VBV\n");
                 goto vbv_retry;
             }
 
-            assert(s->avctx->rc_max_rate);
+            av_assert0(s->avctx->rc_max_rate);
         }
 
         if (s->avctx->flags & AV_CODEC_FLAG_PASS1)
-            write_pass1_stats(s);
+            ff_write_pass1_stats(s);
 
         for (i = 0; i < 4; i++) {
             s->current_picture_ptr->encoding_error[i] = s->current_picture.encoding_error[i];
             avctx->error[i] += s->current_picture_ptr->encoding_error[i];
         }
+        ff_side_data_set_encoder_stats(pkt, s->current_picture.f->quality,
+                                       s->current_picture_ptr->encoding_error,
+                                       (s->avctx->flags&AV_CODEC_FLAG_PSNR) ? 4 : 0,
+                                       s->pict_type);
 
         if (s->avctx->flags & AV_CODEC_FLAG_PASS1)
             assert(put_bits_count(&s->pb) == s->header_bits + s->mv_bits +
@@ -1778,6 +1955,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->frame_bits  = put_bits_count(&s->pb);
 
         stuffing_count = ff_vbv_update(s, s->frame_bits);
+        s->stuffing_bits = 8*stuffing_count;
         if (stuffing_count) {
             if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) <
                     stuffing_count + 50) {
@@ -1827,7 +2005,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Internal error, negative bits\n");
 
-            assert(s->repeat_first_field == 0);
+            av_assert1(s->repeat_first_field == 0);
 
             vbv_delay = bits * 90000 / s->avctx->rc_max_rate;
             min_delay = (minbits * 90000LL + s->avctx->rc_max_rate - 1) /
@@ -1835,7 +2013,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             vbv_delay = FFMAX(vbv_delay, min_delay);
 
-            assert(vbv_delay < 0xFFFF);
+            av_assert0(vbv_delay < 0xFFFF);
 
             s->vbv_delay_ptr[0] &= 0xF8;
             s->vbv_delay_ptr[0] |= vbv_delay >> 13;
@@ -1885,7 +2063,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
     } else {
         s->frame_bits = 0;
     }
-    assert((s->frame_bits & 7) == 0);
+
+    /* release non-reference frames */
+    for (i = 0; i < MAX_PICTURE_COUNT; i++) {
+        if (!s->picture[i].reference)
+            ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
+    }
+
+    av_assert1((s->frame_bits & 7) == 0);
 
     pkt->size = s->frame_bits / 8;
     *got_packet = !!pkt->size;
@@ -2009,15 +2194,17 @@ static void get_visual_weight(int16_t *weight, uint8_t *ptr, int stride)
 static av_always_inline void encode_mb_internal(MpegEncContext *s,
                                                 int motion_x, int motion_y,
                                                 int mb_block_height,
+                                                int mb_block_width,
                                                 int mb_block_count)
 {
-    int16_t weight[8][64];
-    int16_t orig[8][64];
+    int16_t weight[12][64];
+    int16_t orig[12][64];
     const int mb_x = s->mb_x;
     const int mb_y = s->mb_y;
     int i;
-    int skip_dct[8];
+    int skip_dct[12];
     int dct_offset = s->linesize * 8; // default for progressive frames
+    int uv_dct_offset = s->uvlinesize * 8;
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     ptrdiff_t wrap_y, wrap_c;
 
@@ -2059,27 +2246,31 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
     ptr_y  = s->new_picture.f->data[0] +
              (mb_y * 16 * wrap_y)              + mb_x * 16;
     ptr_cb = s->new_picture.f->data[1] +
-             (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+             (mb_y * mb_block_height * wrap_c) + mb_x * mb_block_width;
     ptr_cr = s->new_picture.f->data[2] +
-             (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+             (mb_y * mb_block_height * wrap_c) + mb_x * mb_block_width;
 
-    if (mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) {
-        uint8_t *ebuf = s->sc.edge_emu_buffer + 32;
+    if((mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) && s->codec_id != AV_CODEC_ID_AMV){
+        uint8_t *ebuf = s->sc.edge_emu_buffer + 38 * wrap_y;
+        int cw = (s->width  + s->chroma_x_shift) >> s->chroma_x_shift;
+        int ch = (s->height + s->chroma_y_shift) >> s->chroma_y_shift;
         s->vdsp.emulated_edge_mc(ebuf, ptr_y,
                                  wrap_y, wrap_y,
                                  16, 16, mb_x * 16, mb_y * 16,
                                  s->width, s->height);
         ptr_y = ebuf;
-        s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y, ptr_cb,
+        s->vdsp.emulated_edge_mc(ebuf + 16 * wrap_y, ptr_cb,
                                  wrap_c, wrap_c,
-                                 8, mb_block_height, mb_x * 8, mb_y * 8,
-                                 s->width >> 1, s->height >> 1);
-        ptr_cb = ebuf + 18 * wrap_y;
-        s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y + 8, ptr_cr,
+                                 mb_block_width, mb_block_height,
+                                 mb_x * mb_block_width, mb_y * mb_block_height,
+                                 cw, ch);
+        ptr_cb = ebuf + 16 * wrap_y;
+        s->vdsp.emulated_edge_mc(ebuf + 16 * wrap_y + 16, ptr_cr,
                                  wrap_c, wrap_c,
-                                 8, mb_block_height, mb_x * 8, mb_y * 8,
-                                 s->width >> 1, s->height >> 1);
-        ptr_cr = ebuf + 18 * wrap_y + 8;
+                                 mb_block_width, mb_block_height,
+                                 mb_x * mb_block_width, mb_y * mb_block_height,
+                                 cw, ch);
+        ptr_cr = ebuf + 16 * wrap_y + 16;
     }
 
     if (s->mb_intra) {
@@ -2100,8 +2291,10 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
                     s->interlaced_dct = 1;
 
                     dct_offset = wrap_y;
+                    uv_dct_offset = wrap_c;
                     wrap_y <<= 1;
-                    if (s->chroma_format == CHROMA_422)
+                    if (s->chroma_format == CHROMA_422 ||
+                        s->chroma_format == CHROMA_444)
                         wrap_c <<= 1;
                 }
             }
@@ -2118,11 +2311,16 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
         } else {
             s->pdsp.get_pixels(s->block[4], ptr_cb, wrap_c);
             s->pdsp.get_pixels(s->block[5], ptr_cr, wrap_c);
-            if (!s->chroma_y_shift) { /* 422 */
-                s->pdsp.get_pixels(s->block[6],
-                                   ptr_cb + (dct_offset >> 1), wrap_c);
-                s->pdsp.get_pixels(s->block[7],
-                                   ptr_cr + (dct_offset >> 1), wrap_c);
+            if (!s->chroma_y_shift && s->chroma_x_shift) { /* 422 */
+                s->pdsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
+            } else if (!s->chroma_y_shift && !s->chroma_x_shift) { /* 444 */
+                s->pdsp.get_pixels(s->block[ 6], ptr_cb + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[ 7], ptr_cr + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[ 8], ptr_cb + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[ 9], ptr_cr + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
             }
         }
     } else {
@@ -2178,6 +2376,7 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
                     s->interlaced_dct = 1;
 
                     dct_offset = wrap_y;
+                    uv_dct_offset = wrap_c;
                     wrap_y <<= 1;
                     if (s->chroma_format == CHROMA_422)
                         wrap_c <<= 1;
@@ -2199,10 +2398,10 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             s->pdsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
             s->pdsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
             if (!s->chroma_y_shift) { /* 422 */
-                s->pdsp.diff_pixels(s->block[6], ptr_cb + (dct_offset >> 1),
-                                    dest_cb + (dct_offset >> 1), wrap_c);
-                s->pdsp.diff_pixels(s->block[7], ptr_cr + (dct_offset >> 1),
-                                    dest_cr + (dct_offset >> 1), wrap_c);
+                s->pdsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
+                                    dest_cb + uv_dct_offset, wrap_c);
+                s->pdsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
+                                    dest_cr + uv_dct_offset, wrap_c);
             }
         }
         /* pre quantization */
@@ -2224,12 +2423,12 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale)
                 skip_dct[5] = 1;
             if (!s->chroma_y_shift) { /* 422 */
-                if (s->mecc.sad[1](NULL, ptr_cb + (dct_offset >> 1),
-                                   dest_cb + (dct_offset >> 1),
+                if (s->mecc.sad[1](NULL, ptr_cb + uv_dct_offset,
+                                   dest_cb + uv_dct_offset,
                                    wrap_c, 8) < 20 * s->qscale)
                     skip_dct[6] = 1;
-                if (s->mecc.sad[1](NULL, ptr_cr + (dct_offset >> 1),
-                                   dest_cr + (dct_offset >> 1),
+                if (s->mecc.sad[1](NULL, ptr_cr + uv_dct_offset,
+                                   dest_cr + uv_dct_offset,
                                    wrap_c, 8) < 20 * s->qscale)
                     skip_dct[7] = 1;
             }
@@ -2251,17 +2450,17 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             get_visual_weight(weight[5], ptr_cr                , wrap_c);
         if (!s->chroma_y_shift) { /* 422 */
             if (!skip_dct[6])
-                get_visual_weight(weight[6], ptr_cb + (dct_offset >> 1),
+                get_visual_weight(weight[6], ptr_cb + uv_dct_offset,
                                   wrap_c);
             if (!skip_dct[7])
-                get_visual_weight(weight[7], ptr_cr + (dct_offset >> 1),
+                get_visual_weight(weight[7], ptr_cr + uv_dct_offset,
                                   wrap_c);
         }
         memcpy(orig[0], s->block[0], sizeof(int16_t) * 64 * mb_block_count);
     }
 
     /* DCT & quantize */
-    assert(s->out_format != FMT_MJPEG || s->qscale == 8);
+    av_assert2(s->out_format != FMT_MJPEG || s->qscale == 8);
     {
         for (i = 0; i < mb_block_count; i++) {
             if (!skip_dct[i]) {
@@ -2307,6 +2506,12 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
         s->block_last_index[5] = 0;
         s->block[4][0] =
         s->block[5][0] = (1024 + s->c_dc_scale / 2) / s->c_dc_scale;
+        if (!s->chroma_y_shift) { /* 422 / 444 */
+            for (i=6; i<12; i++) {
+                s->block_last_index[i] = 0;
+                s->block[i][0] = s->block[4][0];
+            }
+        }
     }
 
     // non c quantize code returns incorrect block_last_index FIXME
@@ -2357,18 +2562,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             ff_h263_encode_mb(s, s->block, motion_x, motion_y);
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         if (CONFIG_MJPEG_ENCODER)
             ff_mjpeg_encode_mb(s, s->block);
         break;
     default:
-        assert(0);
+        av_assert1(0);
     }
 }
 
 static av_always_inline void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 {
-    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 6);
-    else                                encode_mb_internal(s, motion_x, motion_y, 16, 8);
+    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 8, 6);
+    else if (s->chroma_format == CHROMA_422) encode_mb_internal(s, motion_x, motion_y, 16, 8, 8);
+    else encode_mb_internal(s, motion_x, motion_y, 16, 16, 12);
 }
 
 static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int type){
@@ -2459,7 +2666,7 @@ static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegE
         s->dest[0] = s->sc.rd_scratchpad;
         s->dest[1] = s->sc.rd_scratchpad + 16*s->linesize;
         s->dest[2] = s->sc.rd_scratchpad + 16*s->linesize + 8;
-        assert(s->linesize >= 32); //FIXME
+        av_assert0(s->linesize >= 32); //FIXME
     }
 
     encode_mb(s, motion_x, motion_y);
@@ -2471,7 +2678,7 @@ static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegE
     }
 
     if(s->avctx->mb_decision == FF_MB_DECISION_RD){
-        ff_mpv_decode_mb(s, s->block);
+        ff_mpv_reconstruct_mb(s, s->block);
 
         score *= s->lambda2;
         score += sse_mb(s) << FF_LAMBDA_SHIFT;
@@ -2490,7 +2697,7 @@ static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegE
 }
 
 static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, int stride){
-    uint32_t *sq = ff_square_tab + 256;
+    const uint32_t *sq = ff_square_tab + 256;
     int acc=0;
     int x,y;
 
@@ -2505,7 +2712,7 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in
         }
     }
 
-    assert(acc>=0);
+    av_assert2(acc>=0);
 
     return acc;
 }
@@ -2555,6 +2762,8 @@ static int pre_estimate_motion_thread(AVCodecContext *c, void *arg){
 static int estimate_motion_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
 
+    ff_check_alignment();
+
     s->me.dia_size= s->avctx->dia_size;
     s->first_slice_line=1;
     for(s->mb_y= s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
@@ -2581,6 +2790,8 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
     int mb_x, mb_y;
 
+    ff_check_alignment();
+
     for(mb_y=s->start_mb_y; mb_y < s->end_mb_y; mb_y++) {
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
             int xx = mb_x * 16;
@@ -2608,7 +2819,7 @@ static void write_slice_end(MpegEncContext *s){
 
         ff_mpeg4_stuffing(&s->pb);
     }else if(CONFIG_MJPEG_ENCODER && s->out_format == FMT_MJPEG){
-        ff_mjpeg_encode_stuffing(&s->pb);
+        ff_mjpeg_encode_stuffing(s);
     }
 
     avpriv_align_put_bits(&s->pb);
@@ -2661,6 +2872,42 @@ static void update_mb_info(MpegEncContext *s, int startcode)
     write_mb_info(s);
 }
 
+int ff_mpv_reallocate_putbitbuffer(MpegEncContext *s, size_t threshold, size_t size_increase)
+{
+    if (   s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < threshold
+        && s->slice_context_count == 1
+        && s->pb.buf == s->avctx->internal->byte_buffer) {
+        int lastgob_pos = s->ptr_lastgob - s->pb.buf;
+        int vbv_pos     = s->vbv_delay_ptr - s->pb.buf;
+
+        uint8_t *new_buffer = NULL;
+        int new_buffer_size = 0;
+
+        if ((s->avctx->internal->byte_buffer_size + size_increase) >= INT_MAX/8) {
+            av_log(s->avctx, AV_LOG_ERROR, "Cannot reallocate putbit buffer\n");
+            return AVERROR(ENOMEM);
+        }
+
+        emms_c();
+
+        av_fast_padded_malloc(&new_buffer, &new_buffer_size,
+                              s->avctx->internal->byte_buffer_size + size_increase);
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
+
+        memcpy(new_buffer, s->avctx->internal->byte_buffer, s->avctx->internal->byte_buffer_size);
+        av_free(s->avctx->internal->byte_buffer);
+        s->avctx->internal->byte_buffer      = new_buffer;
+        s->avctx->internal->byte_buffer_size = new_buffer_size;
+        rebase_put_bits(&s->pb, new_buffer, new_buffer_size);
+        s->ptr_lastgob   = s->pb.buf + lastgob_pos;
+        s->vbv_delay_ptr = s->pb.buf + vbv_pos;
+    }
+    if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < threshold)
+        return AVERROR(EINVAL);
+    return 0;
+}
+
 static int encode_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
     int mb_x, mb_y;
@@ -2672,6 +2919,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
     uint8_t bit_buf_tex[2][MAX_MB_BYTES];
     PutBitContext pb[2], pb2[2], tex_pb[2];
 
+    ff_check_alignment();
+
     for(i=0; i<2; i++){
         init_put_bits(&pb    [i], bit_buf    [i], MAX_MB_BYTES);
         init_put_bits(&pb2   [i], bit_buf2   [i], MAX_MB_BYTES);
@@ -2695,6 +2944,11 @@ static int encode_thread(AVCodecContext *c, void *arg){
 
         s->current_picture.encoding_error[i] = 0;
     }
+    if(s->codec_id==AV_CODEC_ID_AMV){
+        s->last_dc[0] = 128*8/13;
+        s->last_dc[1] = 128*8/14;
+        s->last_dc[2] = 128*8/14;
+    }
     s->mb_skip_run = 0;
     memset(s->last_mv, 0, sizeof(s->last_mv));
 
@@ -2730,7 +2984,10 @@ static int encode_thread(AVCodecContext *c, void *arg){
 //            int d;
             int dmin= INT_MAX;
             int dir;
+            int size_increase =  s->avctx->internal->byte_buffer_size/4
+                               + s->mb_width*MAX_MB_BYTES;
 
+            ff_mpv_reallocate_putbitbuffer(s, MAX_MB_BYTES, size_increase);
             if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < MAX_MB_BYTES){
                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
                 return -1;
@@ -2738,7 +2995,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
             if(s->data_partitioning){
                 if(   s->pb2   .buf_end - s->pb2   .buf - (put_bits_count(&s->    pb2)>>3) < MAX_MB_BYTES
                    || s->tex_pb.buf_end - s->tex_pb.buf - (put_bits_count(&s->tex_pb )>>3) < MAX_MB_BYTES){
-                    av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                    av_log(s->avctx, AV_LOG_ERROR, "encoded partitioned frame too large\n");
                     return -1;
                 }
             }
@@ -2776,6 +3033,9 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 case AV_CODEC_ID_MPEG1VIDEO:
                     if(s->mb_skip_run) is_gob_start=0;
                     break;
+                case AV_CODEC_ID_MJPEG:
+                    if(s->mb_x==0 && s->mb_y!=0) is_gob_start=1;
+                    break;
                 }
 
                 if(is_gob_start){
@@ -2787,7 +3047,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         }
                     }
 
-                    assert((put_bits_count(&s->pb)&7) == 0);
+                    av_assert2((put_bits_count(&s->pb)&7) == 0);
                     current_packet_size= put_bits_ptr(&s->pb) - s->ptr_lastgob;
 
                     if (s->error_rate && s->resync_mb_x + s->resync_mb_y > 0) {
@@ -2796,7 +3056,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         if(r % d == 0){
                             current_packet_size=0;
                             s->pb.buf_ptr= s->ptr_lastgob;
-                            assert(put_bits_ptr(&s->pb) == s->ptr_lastgob);
+                            av_assert1(put_bits_ptr(&s->pb) == s->ptr_lastgob);
                         }
                     }
 
@@ -2998,8 +3258,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                         int16_t ac[6][16];
                         const int mvdir= (best_s.mv_dir&MV_DIR_BACKWARD) ? 1 : 0;
                         static const int dquant_tab[4]={-1,1,-2,2};
+                        int storecoefs = s->mb_intra && s->dc_val[0];
 
-                        assert(backup_s.dquant == 0);
+                        av_assert2(backup_s.dquant == 0);
 
                         //FIXME intra
                         s->mv_dir= best_s.mv_dir;
@@ -3017,7 +3278,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                             if(qp < s->avctx->qmin || qp > s->avctx->qmax)
                                 continue;
                             backup_s.dquant= dquant;
-                            if(s->mb_intra && s->dc_val[0]){
+                            if(storecoefs){
                                 for(i=0; i<6; i++){
                                     dc[i]= s->dc_val[0][ s->block_index[i] ];
                                     memcpy(ac[i], s->ac_val[0][s->block_index[i]], sizeof(int16_t)*16);
@@ -3027,7 +3288,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                             encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_INTER /* wrong but unused */, pb, pb2, tex_pb,
                                          &dmin, &next_block, s->mv[mvdir][0][0], s->mv[mvdir][0][1]);
                             if(best_s.qscale != qp){
-                                if(s->mb_intra && s->dc_val[0]){
+                                if(storecoefs){
                                     for(i=0; i<6; i++){
                                         s->dc_val[0][ s->block_index[i] ]= dc[i];
                                         memcpy(s->ac_val[0][s->block_index[i]], ac[i], sizeof(int16_t)*16);
@@ -3122,7 +3383,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
 
                 if(s->avctx->mb_decision == FF_MB_DECISION_BITS)
-                    ff_mpv_decode_mb(s, s->block);
+                    ff_mpv_reconstruct_mb(s, s->block);
             } else {
                 int motion_x = 0, motion_y = 0;
                 s->mv_type=MV_TYPE_16X16;
@@ -3241,7 +3502,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     s->out_format == FMT_H263 && s->pict_type!=AV_PICTURE_TYPE_B)
                     ff_h263_update_motion_val(s);
 
-                ff_mpv_decode_mb(s, s->block);
+                ff_mpv_reconstruct_mb(s, s->block);
             }
 
             /* clean the MV table in IPS frames for direct mode in B-frames */
@@ -3331,8 +3592,8 @@ static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src)
         }
     }
 
-    assert(put_bits_count(&src->pb) % 8 ==0);
-    assert(put_bits_count(&dst->pb) % 8 ==0);
+    av_assert1(put_bits_count(&src->pb) % 8 ==0);
+    av_assert1(put_bits_count(&dst->pb) % 8 ==0);
     avpriv_copy_bits(&dst->pb, src->pb.buf, put_bits_count(&src->pb));
     flush_put_bits(&dst->pb);
 }
@@ -3343,8 +3604,7 @@ static int estimate_qp(MpegEncContext *s, int dry_run){
         s->current_picture.f->quality = s->next_lambda;
         if(!dry_run) s->next_lambda= 0;
     } else if (!s->fixed_qscale) {
-        int quality;
-        quality = ff_rate_estimate_qscale(s, dry_run);
+        int quality = ff_rate_estimate_qscale(s, dry_run);
         s->current_picture_ptr->f->quality =
         s->current_picture.f->quality = quality;
         if (s->current_picture.f->quality < 0)
@@ -3377,16 +3637,16 @@ static int estimate_qp(MpegEncContext *s, int dry_run){
 
 /* must be called before writing the header */
 static void set_frame_distances(MpegEncContext * s){
-    assert(s->current_picture_ptr->f->pts != AV_NOPTS_VALUE);
+    av_assert1(s->current_picture_ptr->f->pts != AV_NOPTS_VALUE);
     s->time = s->current_picture_ptr->f->pts * s->avctx->time_base.num;
 
     if(s->pict_type==AV_PICTURE_TYPE_B){
         s->pb_time= s->pp_time - (s->last_non_b_time - s->time);
-        assert(s->pb_time > 0 && s->pb_time < s->pp_time);
+        av_assert1(s->pb_time > 0 && s->pb_time < s->pp_time);
     }else{
         s->pp_time= s->time - s->last_non_b_time;
         s->last_non_b_time= s->time;
-        assert(s->picture_number==0 || s->pp_time > 0);
+        av_assert1(s->picture_number==0 || s->pp_time > 0);
     }
 }
 
@@ -3433,6 +3693,13 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         update_qscale(s);
     }
 
+    if(s->codec_id != AV_CODEC_ID_AMV && s->codec_id != AV_CODEC_ID_MJPEG){
+        if(s->q_chroma_intra_matrix   != s->q_intra_matrix  ) av_freep(&s->q_chroma_intra_matrix);
+        if(s->q_chroma_intra_matrix16 != s->q_intra_matrix16) av_freep(&s->q_chroma_intra_matrix16);
+        s->q_chroma_intra_matrix   = s->q_intra_matrix;
+        s->q_chroma_intra_matrix16 = s->q_intra_matrix16;
+    }
+
     s->mb_intra=0; //for the rate distortion & bit compare functions
     for(i=1; i<context_count; i++){
         ret = ff_update_duplicate_context(s->thread_context[i], s);
@@ -3477,7 +3744,9 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         s->pict_type= AV_PICTURE_TYPE_I;
         for(i=0; i<s->mb_stride*s->mb_height; i++)
             s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
-        ff_dlog(s, "Scene change detected, encoding as I Frame %d %d\n",
+        if(s->msmpeg4_version >= 3)
+            s->no_rounding=1;
+        ff_dlog(s, "Scene change detected, encoding as I Frame %"PRId64" %"PRId64"\n",
                 s->current_picture.mb_var_sum, s->current_picture.mc_mb_var_sum);
     }
 
@@ -3544,17 +3813,50 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         s->qscale= 3; //reduce clipping problems
 
     if (s->out_format == FMT_MJPEG) {
+        const uint16_t *  luma_matrix = ff_mpeg1_default_intra_matrix;
+        const uint16_t *chroma_matrix = ff_mpeg1_default_intra_matrix;
+
+        if (s->avctx->intra_matrix) {
+            chroma_matrix =
+            luma_matrix = s->avctx->intra_matrix;
+        }
+        if (s->avctx->chroma_intra_matrix)
+            chroma_matrix = s->avctx->chroma_intra_matrix;
+
         /* for mjpeg, we do include qscale in the matrix */
         for(i=1;i<64;i++){
             int j = s->idsp.idct_permutation[i];
 
-            s->intra_matrix[j] = av_clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
+            s->chroma_intra_matrix[j] = av_clip_uint8((chroma_matrix[i] * s->qscale) >> 3);
+            s->       intra_matrix[j] = av_clip_uint8((  luma_matrix[i] * s->qscale) >> 3);
         }
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_mpeg2_dc_scale_table[s->intra_dc_precision];
+        s->chroma_intra_matrix[0] =
         s->intra_matrix[0] = ff_mpeg2_dc_scale_table[s->intra_dc_precision][8];
         ff_convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16,
                        s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        ff_convert_matrix(s, s->q_chroma_intra_matrix, s->q_chroma_intra_matrix16,
+                       s->chroma_intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        s->qscale= 8;
+    }
+    if(s->codec_id == AV_CODEC_ID_AMV){
+        static const uint8_t y[32]={13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13};
+        static const uint8_t c[32]={14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14};
+        for(i=1;i<64;i++){
+            int j= s->idsp.idct_permutation[ff_zigzag_direct[i]];
+
+            s->intra_matrix[j] = sp5x_quant_table[5*2+0][i];
+            s->chroma_intra_matrix[j] = sp5x_quant_table[5*2+1][i];
+        }
+        s->y_dc_scale_table= y;
+        s->c_dc_scale_table= c;
+        s->intra_matrix[0] = 13;
+        s->chroma_intra_matrix[0] = 14;
+        ff_convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16,
+                       s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        ff_convert_matrix(s, s->q_chroma_intra_matrix, s->q_chroma_intra_matrix16,
+                       s->chroma_intra_matrix, s->intra_quant_bias, 8, 8, 1);
         s->qscale= 8;
     }
 
@@ -3567,12 +3869,13 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     if (s->current_picture.f->key_frame)
         s->picture_in_gop_number=0;
 
+    s->mb_x = s->mb_y = 0;
     s->last_bits= put_bits_count(&s->pb);
     switch(s->out_format) {
     case FMT_MJPEG:
-        if (CONFIG_MJPEG_ENCODER)
+        if (CONFIG_MJPEG_ENCODER && s->huffman != HUFFMAN_TABLE_OPTIMAL)
             ff_mjpeg_encode_picture_header(s->avctx, &s->pb, &s->intra_scantable,
-                                           s->pred, s->intra_matrix);
+                                           s->pred, s->intra_matrix, s->chroma_intra_matrix);
         break;
     case FMT_H261:
         if (CONFIG_H261_ENCODER)
@@ -3583,9 +3886,11 @@ static int encode_picture(MpegEncContext *s, int picture_number)
             ff_wmv2_encode_picture_header(s, picture_number);
         else if (CONFIG_MSMPEG4_ENCODER && s->msmpeg4_version)
             ff_msmpeg4_encode_picture_header(s, picture_number);
-        else if (CONFIG_MPEG4_ENCODER && s->h263_pred)
-            ff_mpeg4_encode_picture_header(s, picture_number);
-        else if (CONFIG_RV10_ENCODER && s->codec_id == AV_CODEC_ID_RV10) {
+        else if (CONFIG_MPEG4_ENCODER && s->h263_pred) {
+            ret = ff_mpeg4_encode_picture_header(s, picture_number);
+            if (ret < 0)
+                return ret;
+        } else if (CONFIG_RV10_ENCODER && s->codec_id == AV_CODEC_ID_RV10) {
             ret = ff_rv10_encode_picture_header(s, picture_number);
             if (ret < 0)
                 return ret;
@@ -3602,7 +3907,7 @@ static int encode_picture(MpegEncContext *s, int picture_number)
             ff_mpeg1_encode_picture_header(s, picture_number);
         break;
     default:
-        assert(0);
+        av_assert0(0);
     }
     bits= put_bits_count(&s->pb);
     s->header_bits= bits - s->last_bits;
@@ -3612,6 +3917,8 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     }
     s->avctx->execute(s->avctx, encode_thread, &s->thread_context[0], NULL, context_count, sizeof(void*));
     for(i=1; i<context_count; i++){
+        if (s->pb.buf_end == s->thread_context[i]->pb.buf)
+            set_put_bits_buffer_size(&s->pb, FFMIN(s->thread_context[i]->pb.buf_end - s->pb.buf, INT_MAX/8-32));
         merge_context_after_encode(s, s->thread_context[i]);
     }
     emms_c();
@@ -3646,8 +3953,9 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                                   int16_t *block, int n,
                                   int qscale, int *overflow){
     const int *qmat;
-    const uint8_t *scantable= s->intra_scantable.scantable;
-    const uint8_t *perm_scantable= s->intra_scantable.permutated;
+    const uint16_t *matrix;
+    const uint8_t *scantable;
+    const uint8_t *perm_scantable;
     int max=0;
     unsigned int threshold1, threshold2;
     int bias=0;
@@ -3667,6 +3975,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     uint8_t * length;
     uint8_t * last_length;
     const int lambda= s->lambda2 >> (FF_LAMBDA_SHIFT - 6);
+    int mpeg2_qscale;
 
     s->fdsp.fdct(block);
 
@@ -3675,8 +3984,13 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     qmul= qscale*16;
     qadd= ((qscale-1)|1)*8;
 
+    if (s->q_scale_type) mpeg2_qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 mpeg2_qscale = qscale << 1;
+
     if (s->mb_intra) {
         int q;
+        scantable= s->intra_scantable.scantable;
+        perm_scantable= s->intra_scantable.permutated;
         if (!s->h263_aic) {
             if (n < 4)
                 q = s->y_dc_scale;
@@ -3693,15 +4007,25 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
         block[0] = (block[0] + (q >> 1)) / q;
         start_i = 1;
         last_non_zero = 0;
-        qmat = s->q_intra_matrix[qscale];
-        if(s->mpeg_quant || s->out_format == FMT_MPEG1)
+        qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale];
+        matrix = n < 4 ? s->intra_matrix : s->chroma_intra_matrix;
+        if(s->mpeg_quant || s->out_format == FMT_MPEG1 || s->out_format == FMT_MJPEG)
             bias= 1<<(QMAT_SHIFT-1);
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
+
+        if (n > 3 && s->intra_chroma_ac_vlc_length) {
+            length     = s->intra_chroma_ac_vlc_length;
+            last_length= s->intra_chroma_ac_vlc_last_length;
+        } else {
+            length     = s->intra_ac_vlc_length;
+            last_length= s->intra_ac_vlc_last_length;
+        }
     } else {
+        scantable= s->inter_scantable.scantable;
+        perm_scantable= s->inter_scantable.permutated;
         start_i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
+        matrix = s->inter_matrix;
         length     = s->inter_ac_vlc_length;
         last_length= s->inter_ac_vlc_last_length;
     }
@@ -3739,7 +4063,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
 //                coeff[2][k]= -level+2;
             }
             coeff_count[i]= FFMIN(level, 2);
-            assert(coeff_count[i]);
+            av_assert2(coeff_count[i]);
             max |=level;
         }else{
             coeff[0][i]= (level>>31)|1;
@@ -3773,17 +4097,20 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             const int alevel= FFABS(level);
             int unquant_coeff;
 
-            assert(level);
+            av_assert2(level);
 
-            if(s->out_format == FMT_H263){
+            if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                 unquant_coeff= alevel*qmul + qadd;
-            } else { // MPEG-1
+            } else if(s->out_format == FMT_MJPEG) {
+                j = s->idsp.idct_permutation[scantable[i]];
+                unquant_coeff = alevel * matrix[j] * 8;
+            }else{ // MPEG-1
                 j = s->idsp.idct_permutation[scantable[i]]; // FIXME: optimize
                 if(s->mb_intra){
-                        unquant_coeff = (int)(  alevel  * qscale * s->intra_matrix[j]) >> 3;
+                        unquant_coeff = (int)(  alevel  * mpeg2_qscale * matrix[j]) >> 4;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
                 }else{
-                        unquant_coeff = (((  alevel  << 1) + 1) * qscale * ((int) s->inter_matrix[j])) >> 4;
+                        unquant_coeff = (((  alevel  << 1) + 1) * mpeg2_qscale * ((int) matrix[j])) >> 5;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
                 }
                 unquant_coeff<<= 3;
@@ -3804,7 +4131,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                     }
                 }
 
-                if(s->out_format == FMT_H263){
+                if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                     for(j=survivor_count-1; j>=0; j--){
                         int run= i - survivor[j];
                         int score= distortion + last_length[UNI_AC_ENC_INDEX(run, level)]*lambda;
@@ -3830,7 +4157,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                     }
                 }
 
-                if(s->out_format == FMT_H263){
+                if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                   for(j=survivor_count-1; j>=0; j--){
                         int run= i - survivor[j];
                         int score= distortion + score_tab[i-run];
@@ -3863,7 +4190,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
         survivor[ survivor_count++ ]= i+1;
     }
 
-    if(s->out_format != FMT_H263){
+    if(s->out_format != FMT_H263 && s->out_format != FMT_H261){
         last_score= 256*256*256*120;
         for(i= survivor[0]; i<=last_non_zero + 1; i++){
             int score= score_tab[i];
@@ -3897,10 +4224,10 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             int alevel= FFABS(level);
             int unquant_coeff, score, distortion;
 
-            if(s->out_format == FMT_H263){
+            if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                     unquant_coeff= (alevel*qmul + qadd)>>3;
-            } else { // MPEG-1
-                    unquant_coeff = (((  alevel  << 1) + 1) * qscale * ((int) s->inter_matrix[0])) >> 4;
+            } else{ // MPEG-1
+                    unquant_coeff = (((  alevel  << 1) + 1) * mpeg2_qscale * ((int) matrix[0])) >> 5;
                     unquant_coeff =   (unquant_coeff - 1) | 1;
             }
             unquant_coeff = (unquant_coeff + 4) >> 3;
@@ -3923,7 +4250,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     }
 
     i= last_i;
-    assert(last_level);
+    av_assert2(last_level);
 
     block[ perm_scantable[last_non_zero] ]= last_level;
     i -= last_run + 1;
@@ -3962,8 +4289,8 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise?
                         int n, int qscale){
     int16_t rem[64];
     LOCAL_ALIGNED_16(int16_t, d1, [64]);
-    const uint8_t *scantable= s->intra_scantable.scantable;
-    const uint8_t *perm_scantable= s->intra_scantable.permutated;
+    const uint8_t *scantable;
+    const uint8_t *perm_scantable;
 //    unsigned int threshold1, threshold2;
 //    int bias=0;
     int run_tab[65];
@@ -3990,6 +4317,8 @@ static int messed_sign=0;
     qmul= qscale*2;
     qadd= (qscale-1)|1;
     if (s->mb_intra) {
+        scantable= s->intra_scantable.scantable;
+        perm_scantable= s->intra_scantable.permutated;
         if (!s->h263_aic) {
             if (n < 4)
                 q = s->y_dc_scale;
@@ -4007,9 +4336,16 @@ static int messed_sign=0;
         start_i = 1;
 //        if(s->mpeg_quant || s->out_format == FMT_MPEG1)
 //            bias= 1<<(QMAT_SHIFT-1);
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
+        if (n > 3 && s->intra_chroma_ac_vlc_length) {
+            length     = s->intra_chroma_ac_vlc_length;
+            last_length= s->intra_chroma_ac_vlc_last_length;
+        } else {
+            length     = s->intra_ac_vlc_length;
+            last_length= s->intra_ac_vlc_last_length;
+        }
     } else {
+        scantable= s->inter_scantable.scantable;
+        perm_scantable= s->inter_scantable.permutated;
         dc= 0;
         start_i = 0;
         length     = s->inter_ac_vlc_length;
@@ -4039,8 +4375,8 @@ STOP_TIMER("memset rem[]")}
         weight[i] = w;
 //        w=weight[i] = (63*qns + (w/2)) / w;
 
-        assert(w>0);
-        assert(w<(1<<6));
+        av_assert2(w>0);
+        av_assert2(w<(1<<6));
         sum += w*w;
     }
     lambda= sum*(uint64_t)s->lambda2 >> (FF_LAMBDA_SHIFT - 6 + 6 + 6 + 6);
@@ -4106,7 +4442,7 @@ STOP_TIMER("dct")}
             const int level= block[0];
             int change, old_coeff;
 
-            assert(s->mb_intra);
+            av_assert2(s->mb_intra);
 
             old_coeff= q*level;
 
@@ -4150,7 +4486,7 @@ STOP_TIMER("dct")}
             }else{
                 old_coeff=0;
                 run2--;
-                assert(run2>=0 || i >= last_non_zero );
+                av_assert2(run2>=0 || i >= last_non_zero );
             }
 
             for(change=-1; change<=1; change+=2){
@@ -4178,7 +4514,7 @@ STOP_TIMER("dct")}
                                          - last_length[UNI_AC_ENC_INDEX(run, level+64)];
                         }
                     }else{
-                        assert(FFABS(new_level)==1);
+                        av_assert2(FFABS(new_level)==1);
 
                         if(analyze_gradient){
                             int g= d1[ scantable[i] ];
@@ -4211,7 +4547,7 @@ STOP_TIMER("dct")}
                     }
                 }else{
                     new_coeff=0;
-                    assert(FFABS(level)==1);
+                    av_assert2(FFABS(level)==1);
 
                     if(i < last_non_zero){
                         int next_i= i + run2 + 1;
@@ -4240,7 +4576,7 @@ STOP_TIMER("dct")}
                 score *= lambda;
 
                 unquant_change= new_coeff - old_coeff;
-                assert((score < 100*lambda && score > -100*lambda) || lambda==0);
+                av_assert2((score < 100*lambda && score > -100*lambda) || lambda==0);
 
                 score += s->mpvencdsp.try_8x8basis(rem, weight, basis[j],
                                                    unquant_change);
@@ -4272,7 +4608,7 @@ STOP_TIMER("iterative step")}
 
             if(best_coeff > last_non_zero){
                 last_non_zero= best_coeff;
-                assert(block[j]);
+                av_assert2(block[j]);
 #ifdef REFINE_STATS
 after_last++;
 #endif
@@ -4300,7 +4636,7 @@ if(block[j]){
 #ifdef REFINE_STATS
 count++;
 if(256*256*256*64 % count == 0){
-    printf("after_last:%d to_zero:%d from_zero:%d raise:%d lower:%d sign:%d xyp:%d/%d/%d\n", after_last, to_zero, from_zero, raise, lower, messed_sign, s->mb_x, s->mb_y, s->picture_number);
+    av_log(s->avctx, AV_LOG_DEBUG, "after_last:%d to_zero:%d from_zero:%d raise:%d lower:%d sign:%d xyp:%d/%d/%d\n", after_last, to_zero, from_zero, raise, lower, messed_sign, s->mb_x, s->mb_y, s->picture_number);
 }
 #endif
             run=0;
@@ -4343,8 +4679,8 @@ STOP_TIMER("iterative search")
  *                  permutation up, the block is not (inverse) permutated
  *                  to scantable order!
  */
-static void block_permute(int16_t *block, uint8_t *permutation,
-                          const uint8_t *scantable, int last)
+void ff_block_permute(int16_t *block, uint8_t *permutation,
+                      const uint8_t *scantable, int last)
 {
     int i;
     int16_t temp[64];
@@ -4374,7 +4710,7 @@ int ff_dct_quantize_c(MpegEncContext *s,
 {
     int i, j, level, last_non_zero, q, start_i;
     const int *qmat;
-    const uint8_t *scantable= s->intra_scantable.scantable;
+    const uint8_t *scantable;
     int bias;
     int max=0;
     unsigned int threshold1, threshold2;
@@ -4385,6 +4721,7 @@ int ff_dct_quantize_c(MpegEncContext *s,
         s->denoise_dct(s, block);
 
     if (s->mb_intra) {
+        scantable= s->intra_scantable.scantable;
         if (!s->h263_aic) {
             if (n < 4)
                 q = s->y_dc_scale;
@@ -4399,13 +4736,14 @@ int ff_dct_quantize_c(MpegEncContext *s,
         block[0] = (block[0] + (q >> 1)) / q;
         start_i = 1;
         last_non_zero = 0;
-        qmat = s->q_intra_matrix[qscale];
-        bias= s->intra_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
+        qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale];
+        bias= s->intra_quant_bias*(1<<(QMAT_SHIFT - QUANT_BIAS_SHIFT));
     } else {
+        scantable= s->inter_scantable.scantable;
         start_i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
-        bias= s->inter_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
+        bias= s->inter_quant_bias*(1<<(QMAT_SHIFT - QUANT_BIAS_SHIFT));
     }
     threshold1= (1<<QMAT_SHIFT) - bias - 1;
     threshold2= (threshold1<<1);
@@ -4443,7 +4781,7 @@ int ff_dct_quantize_c(MpegEncContext *s,
 
     /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
     if (s->idsp.perm_type != FF_IDCT_PERM_NONE)
-        block_permute(block, s->idsp.idct_permutation,
+        ff_block_permute(block, s->idsp.idct_permutation,
                       scantable, last_non_zero);
 
     return last_non_zero;
@@ -4452,8 +4790,7 @@ int ff_dct_quantize_c(MpegEncContext *s,
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption h263_options[] = {
-    { "obmc",         "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "structured_slices","Write slice start position at every GOB header instead of just GOB number.", OFFSET(h263_slice_structured), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE},
+    { "obmc",         "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "mb_info",      "emit macroblock info for RFC 2190 packetization, the parameter value is the maximum payload size", OFFSET(mb_info), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     FF_MPV_COMMON_OPTS
     { NULL },
@@ -4480,10 +4817,10 @@ AVCodec ff_h263_encoder = {
 };
 
 static const AVOption h263p_options[] = {
-    { "umv",        "Use unlimited motion vectors.",    OFFSET(umvplus), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "aiv",        "Use alternative inter VLC.",       OFFSET(alt_inter_vlc), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "obmc",       "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "structured_slices", "Write slice start position at every GOB header instead of just GOB number.", OFFSET(h263_slice_structured), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE},
+    { "umv",        "Use unlimited motion vectors.",    OFFSET(umvplus),       AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "aiv",        "Use alternative inter VLC.",       OFFSET(alt_inter_vlc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "obmc",       "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "structured_slices", "Write slice start position at every GOB header instead of just GOB number.", OFFSET(h263_slice_structured), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE},
     FF_MPV_COMMON_OPTS
     { NULL },
 };
diff --git a/libavcodec/mpegvideo_motion.c b/libavcodec/mpegvideo_motion.c
index f6d9613..5624c10 100644
--- a/libavcodec/mpegvideo_motion.c
+++ b/libavcodec/mpegvideo_motion.c
@@ -4,25 +4,26 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "h261.h"
@@ -47,8 +48,8 @@ static void gmc1_motion(MpegEncContext *s,
     motion_y   = s->sprite_offset[0][1];
     src_x      = s->mb_x * 16 + (motion_x >> (s->sprite_warping_accuracy + 1));
     src_y      = s->mb_y * 16 + (motion_y >> (s->sprite_warping_accuracy + 1));
-    motion_x <<= (3 - s->sprite_warping_accuracy);
-    motion_y <<= (3 - s->sprite_warping_accuracy);
+    motion_x *= 1 << (3 - s->sprite_warping_accuracy);
+    motion_y *= 1 << (3 - s->sprite_warping_accuracy);
     src_x      = av_clip(src_x, -16, s->width);
     if (src_x == s->width)
         motion_x = 0;
@@ -94,8 +95,8 @@ static void gmc1_motion(MpegEncContext *s,
     motion_y   = s->sprite_offset[1][1];
     src_x      = s->mb_x * 8 + (motion_x >> (s->sprite_warping_accuracy + 1));
     src_y      = s->mb_y * 8 + (motion_y >> (s->sprite_warping_accuracy + 1));
-    motion_x <<= (3 - s->sprite_warping_accuracy);
-    motion_y <<= (3 - s->sprite_warping_accuracy);
+    motion_x  *= 1 << (3 - s->sprite_warping_accuracy);
+    motion_y  *= 1 << (3 - s->sprite_warping_accuracy);
     src_x      = av_clip(src_x, -8, s->width >> 1);
     if (src_x == s->width >> 1)
         motion_x = 0;
@@ -178,7 +179,7 @@ static void gmc_motion(MpegEncContext *s,
                 s->sprite_delta[0][0], s->sprite_delta[0][1],
                 s->sprite_delta[1][0], s->sprite_delta[1][1],
                 a + 1, (1 << (2 * a + 1)) - s->no_rounding,
-                s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+                (s->h_edge_pos + 1) >> 1, (s->v_edge_pos + 1) >> 1);
 
     ptr = ref_picture[2];
     s->mdsp.gmc(dest_cr, ptr, uvlinesize, 8,
@@ -186,7 +187,7 @@ static void gmc_motion(MpegEncContext *s,
                 s->sprite_delta[0][0], s->sprite_delta[0][1],
                 s->sprite_delta[1][0], s->sprite_delta[1][1],
                 a + 1, (1 << (2 * a + 1)) - s->no_rounding,
-                s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+                (s->h_edge_pos + 1) >> 1, (s->v_edge_pos + 1) >> 1);
 }
 
 static inline int hpel_motion(MpegEncContext *s,
@@ -210,52 +211,21 @@ static inline int hpel_motion(MpegEncContext *s,
         dxy |= (motion_y & 1) << 1;
     src += src_y * s->linesize + src_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 1) - 8, 0) ||
-        (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y & 1) - 8, 0)) {
-        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
-                                 s->linesize, s->linesize,
-                                 9, 9, src_x, src_y,
-                                 s->h_edge_pos, s->v_edge_pos);
-        src = s->sc.edge_emu_buffer;
-        emu = 1;
-    }
+        if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 1) - 7, 0) ||
+            (unsigned)src_y >= FFMAX(s->v_edge_pos - (motion_y & 1) - 7, 0)) {
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
+                                     s->linesize, s->linesize,
+                                     9, 9,
+                                     src_x, src_y,
+                                     s->h_edge_pos, s->v_edge_pos);
+            src = s->sc.edge_emu_buffer;
+            emu = 1;
+        }
     pix_op[dxy](dest, src, s->linesize, 8);
     return emu;
 }
 
 static av_always_inline
-void emulated_edge_mc(MpegEncContext *s,
-                      int src_x, int src_y,
-                      int uvsrc_x, int uvsrc_y,
-                      int field_based,
-                      uint8_t **ptr_y,
-                      uint8_t **ptr_cb,
-                      uint8_t **ptr_cr)
-{
-    s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, *ptr_y,
-                             s->linesize, s->linesize,
-                             17, 17 + field_based,
-                             src_x, src_y * (1 << field_based),
-                             s->h_edge_pos, s->v_edge_pos);
-    *ptr_y = s->sc.edge_emu_buffer;
-    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 18 * s->linesize;
-        s->vdsp.emulated_edge_mc(uvbuf, *ptr_cb,
-                                 s->uvlinesize, s->uvlinesize,
-                                 9, 9 + field_based,
-                                 uvsrc_x, uvsrc_y * (1 << field_based),
-                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, *ptr_cr,
-                                 s->uvlinesize, s->uvlinesize,
-                                 9, 9 + field_based,
-                                 uvsrc_x, uvsrc_y * (1 << field_based),
-                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        *ptr_cb = uvbuf;
-        *ptr_cr = uvbuf + 16;
-    }
-}
-
-static av_always_inline
 void mpeg_motion_internal(MpegEncContext *s,
                           uint8_t *dest_y,
                           uint8_t *dest_cb,
@@ -269,20 +239,22 @@ void mpeg_motion_internal(MpegEncContext *s,
                           int motion_y,
                           int h,
                           int is_mpeg12,
+                          int is_16x8,
                           int mb_y)
 {
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     int dxy, uvdxy, mx, my, src_x, src_y,
-        uvsrc_x, uvsrc_y, v_edge_pos;
+        uvsrc_x, uvsrc_y, v_edge_pos, block_y_half;
     ptrdiff_t uvlinesize, linesize;
 
     v_edge_pos = s->v_edge_pos >> field_based;
     linesize   = s->current_picture.f->linesize[0] << field_based;
     uvlinesize = s->current_picture.f->linesize[1] << field_based;
+    block_y_half = (field_based | is_16x8);
 
     dxy   = ((motion_y & 1) << 1) | (motion_x & 1);
     src_x = s->mb_x * 16 + (motion_x >> 1);
-    src_y = (mb_y << (4 - field_based)) + (motion_y >> 1);
+    src_y = (mb_y << (4 - block_y_half)) + (motion_y >> 1);
 
     if (!is_mpeg12 && s->out_format == FMT_H263) {
         if ((s->workaround_bugs & FF_BUG_HPEL_CHROMA) && field_based) {
@@ -290,7 +262,7 @@ void mpeg_motion_internal(MpegEncContext *s,
             my      = motion_y >> 1;
             uvdxy   = ((my & 1) << 1) | (mx & 1);
             uvsrc_x = s->mb_x * 8 + (mx >> 1);
-            uvsrc_y = (mb_y << (3 - field_based)) + (my >> 1);
+            uvsrc_y = (mb_y << (3 - block_y_half)) + (my >> 1);
         } else {
             uvdxy   = dxy | (motion_y & 2) | ((motion_x & 2) >> 1);
             uvsrc_x = src_x >> 1;
@@ -309,7 +281,7 @@ void mpeg_motion_internal(MpegEncContext *s,
             my      = motion_y / 2;
             uvdxy   = ((my & 1) << 1) | (mx & 1);
             uvsrc_x = s->mb_x * 8 + (mx >> 1);
-            uvsrc_y = (mb_y << (3 - field_based)) + (my >> 1);
+            uvsrc_y = (mb_y << (3 - block_y_half)) + (my >> 1);
         } else {
             if (s->chroma_x_shift) {
                 // Chroma422
@@ -330,8 +302,8 @@ void mpeg_motion_internal(MpegEncContext *s,
     ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
     ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 1) - 16, 0) ||
-        (unsigned)src_y > FFMAX(v_edge_pos - (motion_y & 1) - h, 0)) {
+    if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 1) - 15   , 0) ||
+        (unsigned)src_y >= FFMAX(   v_edge_pos - (motion_y & 1) - h + 1, 0)) {
         if (is_mpeg12 ||
             s->codec_id == AV_CODEC_ID_MPEG2VIDEO ||
             s->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
@@ -340,8 +312,32 @@ void mpeg_motion_internal(MpegEncContext *s,
                    src_y);
             return;
         }
-        emulated_edge_mc(s, src_x, src_y, uvsrc_x, uvsrc_y, field_based,
-                         &ptr_y, &ptr_cb, &ptr_cr);
+        src_y = (unsigned)src_y << field_based;
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 s->linesize, s->linesize,
+                                 17, 17 + field_based,
+                                 src_x, src_y,
+                                 s->h_edge_pos, s->v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer;
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize;
+            uint8_t *vbuf = ubuf + 10 * s->uvlinesize;
+            if (s->workaround_bugs & FF_BUG_IEDGE)
+                vbuf -= s->uvlinesize;
+            uvsrc_y = (unsigned)uvsrc_y << field_based;
+            s->vdsp.emulated_edge_mc(ubuf, ptr_cb,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y,
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf, ptr_cr,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y,
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
+        }
     }
 
     /* FIXME use this for field pix too instead of the obnoxious hack which
@@ -376,18 +372,18 @@ static void mpeg_motion(MpegEncContext *s,
                         uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                         int field_select, uint8_t **ref_picture,
                         op_pixels_func (*pix_op)[4],
-                        int motion_x, int motion_y, int h, int mb_y)
+                        int motion_x, int motion_y, int h, int is_16x8, int mb_y)
 {
 #if !CONFIG_SMALL
     if (s->out_format == FMT_MPEG1)
         mpeg_motion_internal(s, dest_y, dest_cb, dest_cr, 0, 0,
                              field_select, ref_picture, pix_op,
-                             motion_x, motion_y, h, 1, mb_y);
+                             motion_x, motion_y, h, 1, is_16x8, mb_y);
     else
 #endif
         mpeg_motion_internal(s, dest_y, dest_cb, dest_cr, 0, 0,
                              field_select, ref_picture, pix_op,
-                             motion_x, motion_y, h, 0, mb_y);
+                             motion_x, motion_y, h, 0, is_16x8, mb_y);
 }
 
 static void mpeg_motion_field(MpegEncContext *s, uint8_t *dest_y,
@@ -398,15 +394,15 @@ static void mpeg_motion_field(MpegEncContext *s, uint8_t *dest_y,
                               int motion_x, int motion_y, int h, int mb_y)
 {
 #if !CONFIG_SMALL
-    if(s->out_format == FMT_MPEG1)
+    if (s->out_format == FMT_MPEG1)
         mpeg_motion_internal(s, dest_y, dest_cb, dest_cr, 1,
                              bottom_field, field_select, ref_picture, pix_op,
-                             motion_x, motion_y, h, 1, mb_y);
+                             motion_x, motion_y, h, 1, 0, mb_y);
     else
 #endif
         mpeg_motion_internal(s, dest_y, dest_cb, dest_cr, 1,
                              bottom_field, field_select, ref_picture, pix_op,
-                             motion_x, motion_y, h, 0, mb_y);
+                             motion_x, motion_y, h, 0, 0, mb_y);
 }
 
 // FIXME: SIMDify, avg variant, 16x16 version
@@ -473,7 +469,7 @@ static inline void obmc_motion(MpegEncContext *s,
     int i;
     uint8_t *ptr[5];
 
-    assert(s->quarter_sample == 0);
+    av_assert2(s->quarter_sample == 0);
 
     for (i = 0; i < 5; i++) {
         if (i && mv[i][0] == mv[MID][0] && mv[i][1] == mv[MID][1]) {
@@ -540,10 +536,32 @@ static inline void qpel_motion(MpegEncContext *s,
     ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
     ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 3) - 16, 0) ||
-        (unsigned)src_y > FFMAX(v_edge_pos - (motion_y & 3) - h, 0)) {
-        emulated_edge_mc(s, src_x, src_y, uvsrc_x, uvsrc_y, field_based,
-                         &ptr_y, &ptr_cb, &ptr_cr);
+    if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 3) - 15   , 0) ||
+        (unsigned)src_y >= FFMAX(   v_edge_pos - (motion_y & 3) - h + 1, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 s->linesize, s->linesize,
+                                 17, 17 + field_based,
+                                 src_x, src_y * (1 << field_based),
+                                 s->h_edge_pos, s->v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer;
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize;
+            uint8_t *vbuf = ubuf + 10 * s->uvlinesize;
+            if (s->workaround_bugs & FF_BUG_IEDGE)
+                vbuf -= s->uvlinesize;
+            s->vdsp.emulated_edge_mc(ubuf, ptr_cb,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y * (1 << field_based),
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf, ptr_cr,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y * (1 << field_based),
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
+        }
     }
 
     if (!field_based)
@@ -604,8 +622,8 @@ static void chroma_4mv_motion(MpegEncContext *s,
 
     offset = src_y * s->uvlinesize + src_x;
     ptr    = ref_picture[1] + offset;
-    if ((unsigned)src_x > FFMAX((s->h_edge_pos >> 1) - (dxy & 1) - 8, 0) ||
-        (unsigned)src_y > FFMAX((s->v_edge_pos >> 1) - (dxy >> 1) - 8, 0)) {
+    if ((unsigned)src_x >= FFMAX((s->h_edge_pos >> 1) - (dxy  & 1) - 7, 0) ||
+        (unsigned)src_y >= FFMAX((s->v_edge_pos >> 1) - (dxy >> 1) - 7, 0)) {
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
                                  s->uvlinesize, s->uvlinesize,
                                  9, 9, src_x, src_y,
@@ -656,7 +674,7 @@ static inline void apply_obmc(MpegEncContext *s,
     const int mot_xy     = mb_x * 2 + mb_y * 2 * mot_stride;
     int mx, my, i;
 
-    assert(!s->mb_skipped);
+    av_assert2(!s->mb_skipped);
 
     AV_COPY32(mv_cache[1][1], cur_frame->motion_val[0][mot_xy]);
     AV_COPY32(mv_cache[1][2], cur_frame->motion_val[0][mot_xy + 1]);
@@ -762,8 +780,8 @@ static inline void apply_8x8(MpegEncContext *s,
                 dxy &= ~12;
 
             ptr = ref_picture[0] + (src_y * s->linesize) + (src_x);
-            if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 3) - 8, 0) ||
-                (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y & 3) - 8, 0)) {
+            if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 3) - 7, 0) ||
+                (unsigned)src_y >= FFMAX(s->v_edge_pos - (motion_y & 3) - 7, 0)) {
                 s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
                                          s->linesize, s->linesize,
                                          9, 9,
@@ -854,7 +872,7 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
         } else {
             mpeg_motion(s, dest_y, dest_cb, dest_cr, 0,
                         ref_picture, pix_op,
-                        s->mv[dir][0][0], s->mv[dir][0][1], 16, mb_y);
+                        s->mv[dir][0][0], s->mv[dir][0][1], 16, 0, mb_y);
         }
         break;
     case MV_TYPE_8X8:
@@ -883,23 +901,23 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
                                   s->mv[dir][1][0], s->mv[dir][1][1], 8, mb_y);
             }
         } else {
-            if (s->picture_structure != s->field_select[dir][0] + 1 &&
-                s->pict_type != AV_PICTURE_TYPE_B && !s->first_field) {
+            if (   s->picture_structure != s->field_select[dir][0] + 1 && s->pict_type != AV_PICTURE_TYPE_B && !s->first_field
+                || !ref_picture[0]) {
                 ref_picture = s->current_picture_ptr->f->data;
             }
 
             mpeg_motion(s, dest_y, dest_cb, dest_cr,
                         s->field_select[dir][0],
                         ref_picture, pix_op,
-                        s->mv[dir][0][0], s->mv[dir][0][1], 16, mb_y >> 1);
+                        s->mv[dir][0][0], s->mv[dir][0][1], 16, 0, mb_y >> 1);
         }
         break;
     case MV_TYPE_16X8:
         for (i = 0; i < 2; i++) {
             uint8_t **ref2picture;
 
-            if (s->picture_structure == s->field_select[dir][i] + 1
-                || s->pict_type == AV_PICTURE_TYPE_B || s->first_field) {
+            if ((s->picture_structure == s->field_select[dir][i] + 1
+                || s->pict_type == AV_PICTURE_TYPE_B || s->first_field) && ref_picture[0]) {
                 ref2picture = ref_picture;
             } else {
                 ref2picture = s->current_picture_ptr->f->data;
@@ -908,8 +926,8 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
             mpeg_motion(s, dest_y, dest_cb, dest_cr,
                         s->field_select[dir][i],
                         ref2picture, pix_op,
-                        s->mv[dir][i][0], s->mv[dir][i][1] + 16 * i,
-                        8, mb_y >> 1);
+                        s->mv[dir][i][0], s->mv[dir][i][1],
+                        8, 1, (mb_y & ~1) + i);
 
             dest_y  += 16 * s->linesize;
             dest_cb += (16 >> s->chroma_y_shift) * s->uvlinesize;
@@ -928,12 +946,15 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
                 pix_op = s->hdsp.avg_pixels_tab;
             }
         } else {
+            if (!ref_picture[0]) {
+                ref_picture = s->current_picture_ptr->f->data;
+            }
             for (i = 0; i < 2; i++) {
                 mpeg_motion(s, dest_y, dest_cb, dest_cr,
                             s->picture_structure != i + 1,
                             ref_picture, pix_op,
                             s->mv[dir][2 * i][0], s->mv[dir][2 * i][1],
-                            16, mb_y >> 1);
+                            16, 0, mb_y >> 1);
 
                 // after put we make avg of the same block
                 pix_op = s->hdsp.avg_pixels_tab;
@@ -946,7 +967,7 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
             }
         }
         break;
-    default: assert(0);
+    default: av_assert2(0);
     }
 }
 
diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
index f883b5f..7a3c7ab 100644
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,6 +44,9 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
     int top_field_first, repeat_first_field, progressive_frame;
     int horiz_size_ext, vert_size_ext, bit_rate_ext;
     int did_set_size=0;
+    int set_dim_ret = 0;
+    int bit_rate = 0;
+    int vbv_delay = 0;
     int chroma_format;
     enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
 //FIXME replace the crap with get_bits()
@@ -57,6 +60,8 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         case PICTURE_START_CODE:
             if (bytes_left >= 2) {
                 s->pict_type = (buf[1] >> 3) & 7;
+                if (bytes_left >= 4)
+                    vbv_delay = ((buf[1] & 0x07) << 13) | (buf[2] << 5) | (buf[3] >> 3);
             }
             break;
         case SEQ_START_CODE:
@@ -64,14 +69,15 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                 pc->width  = (buf[0] << 4) | (buf[1] >> 4);
                 pc->height = ((buf[1] & 0x0f) << 8) | buf[2];
                 if(!avctx->width || !avctx->height || !avctx->coded_width || !avctx->coded_height){
-                    ff_set_dimensions(avctx, pc->width, pc->height);
+                    set_dim_ret = ff_set_dimensions(avctx, pc->width, pc->height);
                     did_set_size=1;
                 }
                 pix_fmt = AV_PIX_FMT_YUV420P;
                 frame_rate_index = buf[3] & 0xf;
                 pc->frame_rate = avctx->framerate = ff_mpeg12_frame_rate_tab[frame_rate_index];
-                avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400;
+                bit_rate = (buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6);
                 avctx->codec_id = AV_CODEC_ID_MPEG1VIDEO;
+                avctx->ticks_per_frame = 1;
             }
             break;
         case EXT_START_CODE:
@@ -95,21 +101,15 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                         case 3: pix_fmt = AV_PIX_FMT_YUV444P; break;
                         }
 
-                        pc->width  |=(horiz_size_ext << 12);
-                        pc->height |=( vert_size_ext << 12);
-
-                        bit_rate_ext <<= 18;
-                        if (bit_rate_ext < INT_MAX / 400 &&
-                            bit_rate_ext * 400 < INT_MAX - avctx->bit_rate) {
-                            avctx->bit_rate += bit_rate_ext * 400;
-                        } else
-                            avctx->bit_rate = 0;
-
+                        pc->width  = (pc->width & 0xFFF) | (horiz_size_ext << 12);
+                        pc->height = (pc->height& 0xFFF) | ( vert_size_ext << 12);
+                        bit_rate = (bit_rate&0x3FFFF) | (bit_rate_ext << 18);
                         if(did_set_size)
-                            ff_set_dimensions(avctx, pc->width, pc->height);
-                        avctx->framerate.num = pc->frame_rate.num * (frame_rate_ext_n + 1) * 2;
+                            set_dim_ret = ff_set_dimensions(avctx, pc->width, pc->height);
+                        avctx->framerate.num = pc->frame_rate.num * (frame_rate_ext_n + 1);
                         avctx->framerate.den = pc->frame_rate.den * (frame_rate_ext_d + 1);
                         avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+                        avctx->ticks_per_frame = 2;
                     }
                     break;
                 case 0x8: /* picture coding extension */
@@ -131,7 +131,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                             }
                         }
 
-                        if (!pc->progressive_sequence) {
+                        if (!pc->progressive_sequence && !progressive_frame) {
                             if (top_field_first)
                                 s->field_order = AV_FIELD_TT;
                             else
@@ -155,6 +155,16 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         }
     }
  the_end: ;
+    if (set_dim_ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO && bit_rate) {
+        avctx->rc_max_rate = 400LL*bit_rate;
+    }
+    if (bit_rate &&
+        ((avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && bit_rate != 0x3FFFF) || vbv_delay != 0xFFFF)) {
+        avctx->bit_rate = 400LL*bit_rate;
+    }
 
     if (pix_fmt != AV_PIX_FMT_NONE) {
         s->format = pix_fmt;
@@ -163,6 +173,11 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         s->coded_width  = FFALIGN(pc->width,  16);
         s->coded_height = FFALIGN(pc->height, 16);
     }
+
+#if FF_API_AVCTX_TIMEBASE
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+#endif
 }
 
 static int mpegvideo_parse(AVCodecParserContext *s,
@@ -191,7 +206,7 @@ static int mpegvideo_parse(AVCodecParserContext *s,
        function should be negligible for uncorrupted streams */
     mpegvideo_extract_headers(s, avctx, buf, buf_size);
     ff_dlog(NULL, "pict_type=%d frame_rate=%0.3f repeat_pict=%d\n",
-            s->pict_type, (double)avctx->time_base.den / avctx->time_base.num, s->repeat_pict);
+            s->pict_type, av_q2d(avctx->framerate), s->repeat_pict);
 
     *poutbuf = buf;
     *poutbuf_size = buf_size;
@@ -203,18 +218,28 @@ static int mpegvideo_split(AVCodecContext *avctx,
 {
     int i;
     uint32_t state= -1;
+    int found=0;
 
     for(i=0; i<buf_size; i++){
         state= (state<<8) | buf[i];
-        if(state != 0x1B3 && state != 0x1B5 && state < 0x200 && state >= 0x100)
+        if(state == 0x1B3){
+            found=1;
+        }else if(found && state != 0x1B5 && state < 0x200 && state >= 0x100)
             return i-3;
     }
     return 0;
 }
 
+static int mpegvideo_parse_init(AVCodecParserContext *s)
+{
+    s->pict_type = AV_PICTURE_TYPE_NONE; // first frame might be partial
+    return 0;
+}
+
 AVCodecParser ff_mpegvideo_parser = {
     .codec_ids      = { AV_CODEC_ID_MPEG1VIDEO, AV_CODEC_ID_MPEG2VIDEO },
     .priv_data_size = sizeof(struct MpvParseContext),
+    .parser_init    = mpegvideo_parse_init,
     .parser_parse   = mpegvideo_parse,
     .parser_close   = ff_parse_close,
     .split          = mpegvideo_split,
diff --git a/libavcodec/mpegvideo_xvmc.c b/libavcodec/mpegvideo_xvmc.c
new file mode 100644
index 0000000..f065837
--- /dev/null
+++ b/libavcodec/mpegvideo_xvmc.c
@@ -0,0 +1,376 @@
+/*
+ * XVideo Motion Compensation
+ * Copyright (c) 2003 Ivan Kalvachev
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <limits.h>
+#include <X11/extensions/XvMC.h>
+
+#include "avcodec.h"
+#include "mpegutils.h"
+#include "mpegvideo.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+#include "xvmc.h"
+#include "xvmc_internal.h"
+#include "version.h"
+
+/**
+ * Initialize the block field of the MpegEncContext pointer passed as
+ * parameter after making sure that the data is not corrupted.
+ * In order to implement something like direct rendering instead of decoding
+ * coefficients in s->blocks and then copying them, copy them directly
+ * into the data_blocks array provided by xvmc.
+ */
+void ff_xvmc_init_block(MpegEncContext *s)
+{
+    struct xvmc_pix_fmt *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
+    assert(render && render->xvmc_id == AV_XVMC_ID);
+
+    s->block = (int16_t (*)[64])(render->data_blocks + render->next_free_data_block_num * 64);
+}
+
+static void exchange_uv(MpegEncContext *s)
+{
+    int16_t (*tmp)[64];
+
+    tmp           = s->pblocks[4];
+    s->pblocks[4] = s->pblocks[5];
+    s->pblocks[5] = tmp;
+}
+
+/**
+ * Fill individual block pointers, so there are no gaps in the data_block array
+ * in case not all blocks in the macroblock are coded.
+ */
+void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp)
+{
+    int i, j = 0;
+    const int mb_block_count = 4 + (1 << s->chroma_format);
+
+    cbp <<= 12-mb_block_count;
+    for (i = 0; i < mb_block_count; i++) {
+        if (cbp & (1 << 11))
+            s->pblocks[i] = &s->block[j++];
+        else
+            s->pblocks[i] = NULL;
+        cbp += cbp;
+    }
+    if (s->swap_uv) {
+        exchange_uv(s);
+    }
+}
+
+/**
+ * Find and store the surfaces that are used as reference frames.
+ * This function should be called for every new field and/or frame.
+ * It should be safe to call the function a few times for the same field.
+ */
+static int ff_xvmc_field_start(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size)
+{
+    struct MpegEncContext *s = avctx->priv_data;
+    struct xvmc_pix_fmt *last, *next, *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
+    const int mb_block_count = 4 + (1 << s->chroma_format);
+
+    assert(avctx);
+    if (!render || render->xvmc_id != AV_XVMC_ID ||
+        !render->data_blocks || !render->mv_blocks ||
+        (unsigned int)render->allocated_mv_blocks   > INT_MAX/(64*6) ||
+        (unsigned int)render->allocated_data_blocks > INT_MAX/64     ||
+        !render->p_surface) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Render token doesn't look as expected.\n");
+        return -1; // make sure that this is a render packet
+    }
+
+    if (render->filled_mv_blocks_num) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Rendering surface contains %i unprocessed blocks.\n",
+               render->filled_mv_blocks_num);
+        return -1;
+    }
+    if (render->allocated_mv_blocks   < 1 ||
+        render->allocated_data_blocks <  render->allocated_mv_blocks*mb_block_count ||
+        render->start_mv_blocks_num   >= render->allocated_mv_blocks                ||
+        render->next_free_data_block_num >
+                        render->allocated_data_blocks -
+                        mb_block_count*(render->allocated_mv_blocks-render->start_mv_blocks_num)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Rendering surface doesn't provide enough block structures to work with.\n");
+        return -1;
+    }
+
+    render->picture_structure = s->picture_structure;
+    render->flags             = s->first_field ? 0 : XVMC_SECOND_FIELD;
+    render->p_future_surface  = NULL;
+    render->p_past_surface    = NULL;
+
+    switch(s->pict_type) {
+        case  AV_PICTURE_TYPE_I:
+            return 0; // no prediction from other frames
+        case  AV_PICTURE_TYPE_B:
+            next = (struct xvmc_pix_fmt*)s->next_picture.f->data[2];
+            if (!next)
+                return -1;
+            if (next->xvmc_id != AV_XVMC_ID)
+                return -1;
+            render->p_future_surface = next->p_surface;
+            // no return here, going to set forward prediction
+        case  AV_PICTURE_TYPE_P:
+            last = (struct xvmc_pix_fmt*)s->last_picture.f->data[2];
+            if (!last)
+                last = render; // predict second field from the first
+            if (last->xvmc_id != AV_XVMC_ID)
+                return -1;
+            render->p_past_surface = last->p_surface;
+            return 0;
+    }
+
+return -1;
+}
+
+/**
+ * Complete frame/field rendering by passing any remaining blocks.
+ * Normally ff_draw_horiz_band() is called for each slice, however,
+ * some leftover blocks, for example from error_resilience(), may remain.
+ * It should be safe to call the function a few times for the same field.
+ */
+static int ff_xvmc_field_end(AVCodecContext *avctx)
+{
+    struct MpegEncContext *s = avctx->priv_data;
+    struct xvmc_pix_fmt *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
+    assert(render);
+
+    if (render->filled_mv_blocks_num > 0)
+        ff_mpeg_draw_horiz_band(s, 0, 0);
+    return 0;
+}
+
+/**
+ * Synthesize the data needed by XvMC to render one macroblock of data.
+ * Fill all relevant fields, if necessary do IDCT.
+ */
+static void ff_xvmc_decode_mb(struct MpegEncContext *s)
+{
+    XvMCMacroBlock *mv_block;
+    struct xvmc_pix_fmt *render;
+    int i, cbp, blocks_per_mb;
+
+    const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
+
+
+    if (s->encoding) {
+        av_log(s->avctx, AV_LOG_ERROR, "XVMC doesn't support encoding!!!\n");
+        return;
+    }
+
+    // from ff_mpv_reconstruct_mb(), update DC predictors for P macroblocks
+    if (!s->mb_intra) {
+        s->last_dc[0] =
+        s->last_dc[1] =
+        s->last_dc[2] =  128 << s->intra_dc_precision;
+    }
+
+    // MC doesn't skip blocks
+    s->mb_skipped = 0;
+
+
+    // Do I need to export quant when I could not perform postprocessing?
+    // Anyway, it doesn't hurt.
+    s->current_picture.qscale_table[mb_xy] = s->qscale;
+
+    // start of XVMC-specific code
+    render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
+    assert(render);
+    assert(render->xvmc_id == AV_XVMC_ID);
+    assert(render->mv_blocks);
+
+    // take the next free macroblock
+    mv_block = &render->mv_blocks[render->start_mv_blocks_num +
+                                  render->filled_mv_blocks_num];
+
+    mv_block->x        = s->mb_x;
+    mv_block->y        = s->mb_y;
+    mv_block->dct_type = s->interlaced_dct; // XVMC_DCT_TYPE_FRAME/FIELD;
+    if (s->mb_intra) {
+        mv_block->macroblock_type = XVMC_MB_TYPE_INTRA; // no MC, all done
+    } else {
+        mv_block->macroblock_type = XVMC_MB_TYPE_PATTERN;
+
+        if (s->mv_dir & MV_DIR_FORWARD) {
+            mv_block->macroblock_type |= XVMC_MB_TYPE_MOTION_FORWARD;
+            // PMV[n][dir][xy] = mv[dir][n][xy]
+            mv_block->PMV[0][0][0] = s->mv[0][0][0];
+            mv_block->PMV[0][0][1] = s->mv[0][0][1];
+            mv_block->PMV[1][0][0] = s->mv[0][1][0];
+            mv_block->PMV[1][0][1] = s->mv[0][1][1];
+        }
+        if (s->mv_dir & MV_DIR_BACKWARD) {
+            mv_block->macroblock_type |= XVMC_MB_TYPE_MOTION_BACKWARD;
+            mv_block->PMV[0][1][0] = s->mv[1][0][0];
+            mv_block->PMV[0][1][1] = s->mv[1][0][1];
+            mv_block->PMV[1][1][0] = s->mv[1][1][0];
+            mv_block->PMV[1][1][1] = s->mv[1][1][1];
+        }
+
+        switch(s->mv_type) {
+            case  MV_TYPE_16X16:
+                mv_block->motion_type = XVMC_PREDICTION_FRAME;
+                break;
+            case  MV_TYPE_16X8:
+                mv_block->motion_type = XVMC_PREDICTION_16x8;
+                break;
+            case  MV_TYPE_FIELD:
+                mv_block->motion_type = XVMC_PREDICTION_FIELD;
+                if (s->picture_structure == PICT_FRAME) {
+                    mv_block->PMV[0][0][1] <<= 1;
+                    mv_block->PMV[1][0][1] <<= 1;
+                    mv_block->PMV[0][1][1] <<= 1;
+                    mv_block->PMV[1][1][1] <<= 1;
+                }
+                break;
+            case  MV_TYPE_DMV:
+                mv_block->motion_type = XVMC_PREDICTION_DUAL_PRIME;
+                if (s->picture_structure == PICT_FRAME) {
+
+                    mv_block->PMV[0][0][0] = s->mv[0][0][0];      // top from top
+                    mv_block->PMV[0][0][1] = s->mv[0][0][1] << 1;
+
+                    mv_block->PMV[0][1][0] = s->mv[0][0][0];      // bottom from bottom
+                    mv_block->PMV[0][1][1] = s->mv[0][0][1] << 1;
+
+                    mv_block->PMV[1][0][0] = s->mv[0][2][0];      // dmv00, top from bottom
+                    mv_block->PMV[1][0][1] = s->mv[0][2][1] << 1; // dmv01
+
+                    mv_block->PMV[1][1][0] = s->mv[0][3][0];      // dmv10, bottom from top
+                    mv_block->PMV[1][1][1] = s->mv[0][3][1] << 1; // dmv11
+
+                } else {
+                    mv_block->PMV[0][1][0] = s->mv[0][2][0];      // dmv00
+                    mv_block->PMV[0][1][1] = s->mv[0][2][1];      // dmv01
+                }
+                break;
+            default:
+                assert(0);
+        }
+
+        mv_block->motion_vertical_field_select = 0;
+
+        // set correct field references
+        if (s->mv_type == MV_TYPE_FIELD || s->mv_type == MV_TYPE_16X8) {
+            mv_block->motion_vertical_field_select |= s->field_select[0][0];
+            mv_block->motion_vertical_field_select |= s->field_select[1][0] << 1;
+            mv_block->motion_vertical_field_select |= s->field_select[0][1] << 2;
+            mv_block->motion_vertical_field_select |= s->field_select[1][1] << 3;
+        }
+    } // !intra
+    // time to handle data blocks
+    mv_block->index = render->next_free_data_block_num;
+
+    blocks_per_mb = 6;
+    if (s->chroma_format >= 2) {
+        blocks_per_mb = 4 + (1 << s->chroma_format);
+    }
+
+    // calculate cbp
+    cbp = 0;
+    for (i = 0; i < blocks_per_mb; i++) {
+        cbp += cbp;
+        if (s->block_last_index[i] >= 0)
+            cbp++;
+    }
+
+    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+        if (s->mb_intra) {                                   // intra frames are always full chroma blocks
+            for (i = 4; i < blocks_per_mb; i++) {
+                memset(s->pblocks[i], 0, sizeof(*s->pblocks[i]));  // so we need to clear them
+                if (!render->unsigned_intra)
+                    *s->pblocks[i][0] = 1 << 10;
+            }
+        } else {
+            cbp &= 0xf << (blocks_per_mb - 4);
+            blocks_per_mb = 4;                               // luminance blocks only
+        }
+    }
+    mv_block->coded_block_pattern = cbp;
+    if (cbp == 0)
+        mv_block->macroblock_type &= ~XVMC_MB_TYPE_PATTERN;
+
+    for (i = 0; i < blocks_per_mb; i++) {
+        if (s->block_last_index[i] >= 0) {
+            // I do not have unsigned_intra MOCO to test, hope it is OK.
+            if (s->mb_intra && (render->idct || !render->unsigned_intra))
+                *s->pblocks[i][0] -= 1 << 10;
+            if (!render->idct) {
+                s->idsp.idct(*s->pblocks[i]);
+                /* It is unclear if MC hardware requires pixel diff values to be
+                 * in the range [-255;255]. TODO: Clipping if such hardware is
+                 * ever found. As of now it would only be an unnecessary
+                 * slowdown. */
+            }
+            // copy blocks only if the codec doesn't support pblocks reordering
+            if (!s->pack_pblocks) {
+                memcpy(&render->data_blocks[render->next_free_data_block_num*64],
+                       s->pblocks[i], sizeof(*s->pblocks[i]));
+            }
+            render->next_free_data_block_num++;
+        }
+    }
+    render->filled_mv_blocks_num++;
+
+    assert(render->filled_mv_blocks_num     <= render->allocated_mv_blocks);
+    assert(render->next_free_data_block_num <= render->allocated_data_blocks);
+    /* The above conditions should not be able to fail as long as this function
+     * is used and the following 'if ()' automatically calls a callback to free
+     * blocks. */
+
+
+    if (render->filled_mv_blocks_num == render->allocated_mv_blocks)
+        ff_mpeg_draw_horiz_band(s, 0, 0);
+}
+
+#if CONFIG_MPEG1_XVMC_HWACCEL
+const AVHWAccel ff_mpeg1_xvmc_hwaccel = {
+    .name           = "mpeg1_xvmc",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt        = AV_PIX_FMT_XVMC,
+    .start_frame    = ff_xvmc_field_start,
+    .end_frame      = ff_xvmc_field_end,
+    .decode_slice   = NULL,
+    .decode_mb      = ff_xvmc_decode_mb,
+    .priv_data_size = 0,
+};
+#endif
+
+#if CONFIG_MPEG2_XVMC_HWACCEL
+const AVHWAccel ff_mpeg2_xvmc_hwaccel = {
+    .name           = "mpeg2_xvmc",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt        = AV_PIX_FMT_XVMC,
+    .start_frame    = ff_xvmc_field_start,
+    .end_frame      = ff_xvmc_field_end,
+    .decode_slice   = NULL,
+    .decode_mb      = ff_xvmc_decode_mb,
+    .priv_data_size = 0,
+};
+#endif
diff --git a/libavcodec/mpegvideodata.c b/libavcodec/mpegvideodata.c
index f27dd90..5f1d8f7 100644
--- a/libavcodec/mpegvideodata.c
+++ b/libavcodec/mpegvideodata.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,13 @@ const uint8_t ff_default_chroma_qscale_table[32] = {
     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 };
 
+const uint8_t ff_mpeg2_non_linear_qscale[32] = {
+     0,  1,  2,  3,  4,  5,   6,   7,
+     8, 10, 12, 14, 16, 18,  20,  22,
+    24, 28, 32, 36, 40, 44,  48,  52,
+    56, 64, 72, 80, 88, 96, 104, 112,
+};
+
 const uint8_t ff_mpeg1_dc_scale_table[128] = {
 //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
diff --git a/libavcodec/mpegvideodata.h b/libavcodec/mpegvideodata.h
index d3ace23..14f4806 100644
--- a/libavcodec/mpegvideodata.h
+++ b/libavcodec/mpegvideodata.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@ extern const uint8_t ff_alternate_vertical_scan[64];
 extern const uint8_t ff_mpeg1_dc_scale_table[128];
 extern const uint8_t * const ff_mpeg2_dc_scale_table[4];
 
+extern const uint8_t ff_mpeg2_non_linear_qscale[32];
+
 extern const uint8_t ff_default_chroma_qscale_table[32];
 
 #endif /* AVCODEC_MPEGVIDEODATA_H */
diff --git a/libavcodec/mpegvideodsp.c b/libavcodec/mpegvideodsp.c
index 915a844..a58e45a 100644
--- a/libavcodec/mpegvideodsp.c
+++ b/libavcodec/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideodsp.h b/libavcodec/mpegvideodsp.h
index b0f45db..293e254 100644
--- a/libavcodec/mpegvideodsp.h
+++ b/libavcodec/mpegvideodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 279d233..a34ab35 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #include "config.h"
+#include "libavutil/avassert.h"
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
@@ -39,7 +40,7 @@ static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
                           (BASIS_SHIFT - RECON_SHIFT));
         int w = weight[i];
         b >>= RECON_SHIFT;
-        assert(-512 < b && b < 512);
+        av_assert2(-512 < b && b < 512);
 
         sum += (w * b) * (w * b) >> 4;
     }
@@ -80,7 +81,7 @@ static int pix_sum_c(uint8_t *pix, int line_size)
 static int pix_norm1_c(uint8_t *pix, int line_size)
 {
     int s = 0, i, j;
-    uint32_t *sq = ff_square_tab + 256;
+    const uint32_t *sq = ff_square_tab + 256;
 
     for (i = 0; i < 16; i++) {
         for (j = 0; j < 16; j += 8) {
@@ -250,4 +251,6 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
         ff_mpegvideoencdsp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_mpegvideoencdsp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_mpegvideoencdsp_init_mips(c, avctx);
 }
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 91a292a..33f0282 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,5 +52,7 @@ void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
+                                  AVCodecContext *avctx);
 
 #endif /* AVCODEC_MPEGVIDEOENCDSP_H */
diff --git a/libavcodec/mpl2dec.c b/libavcodec/mpl2dec.c
new file mode 100644
index 0000000..409e4b3
--- /dev/null
+++ b/libavcodec/mpl2dec.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * MPL2 subtitles decoder
+ *
+ * @see http://web.archive.org/web/20090328040233/http://napisy.ussbrowarek.org/mpl2-eng.html
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static int mpl2_event_to_ass(AVBPrint *buf, const char *p)
+{
+    if (*p == ' ')
+        p++;
+
+    while (*p) {
+        int got_style = 0;
+
+        while (*p && strchr("/\\_", *p)) {
+            if      (*p == '/')  av_bprintf(buf, "{\\i1}");
+            else if (*p == '\\') av_bprintf(buf, "{\\b1}");
+            else if (*p == '_')  av_bprintf(buf, "{\\u1}");
+            got_style = 1;
+            p++;
+        }
+
+        while (*p && *p != '|') {
+            if (*p != '\r' && *p != '\n')
+                av_bprint_chars(buf, *p, 1);
+            p++;
+        }
+
+        if (*p == '|') {
+            if (got_style)
+                av_bprintf(buf, "{\\r}");
+            av_bprintf(buf, "\\N");
+            p++;
+        }
+    }
+
+    return 0;
+}
+
+static int mpl2_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVBPrint buf;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    FFASSDecoderContext *s = avctx->priv_data;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && *ptr && !mpl2_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect(sub, buf.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_mpl2_decoder = {
+    .name           = "mpl2",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_MPL2,
+    .decode         = mpl2_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/mqc.c b/libavcodec/mqc.c
index 0144581..f2d1e3b 100644
--- a/libavcodec/mqc.c
+++ b/libavcodec/mqc.c
@@ -2,20 +2,20 @@
  * MQ-coder encoder and decoder common functions
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mqc.h b/libavcodec/mqc.h
index 6326192..8bf7223 100644
--- a/libavcodec/mqc.h
+++ b/libavcodec/mqc.h
@@ -2,20 +2,20 @@
  * MQ-coder: structures, common and decoder functions
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,16 +43,34 @@ typedef struct MqcState {
     unsigned int c;
     unsigned int ct;
     uint8_t cx_states[19];
+    int raw;
 } MqcState;
 
+/* encoder */
+
+/** initialize the encoder */
+void ff_mqc_initenc(MqcState *mqc, uint8_t *bp);
+
+/** code bit d with context cx */
+void ff_mqc_encode(MqcState *mqc, uint8_t *cxstate, int d);
+
+/** number of encoded bytes */
+int ff_mqc_length(MqcState *mqc);
+
+/** flush the encoder [returns number of bytes encoded] */
+int ff_mqc_flush(MqcState *mqc);
+int ff_mqc_flush_to(MqcState *mqc, uint8_t *dst, int *dst_len);
+
 /* decoder */
 
 /**
  * Initialize MQ-decoder.
  * @param mqc   MQ decoder state
  * @param bp    byte pointer
+ * @param raw   raw mode
+ * @param reset reset states
  */
-void ff_mqc_initdec(MqcState *mqc, uint8_t *bp);
+void ff_mqc_initdec(MqcState *mqc, uint8_t *bp, int raw, int reset);
 
 /**
  * MQ decoder.
diff --git a/libavcodec/mqcdec.c b/libavcodec/mqcdec.c
index 889763a..34aa519 100644
--- a/libavcodec/mqcdec.c
+++ b/libavcodec/mqcdec.c
@@ -2,20 +2,20 @@
  * MQ-coder decoder
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,9 +68,11 @@ static int exchange(MqcState *mqc, uint8_t *cxstate, int lps)
     return d;
 }
 
-void ff_mqc_initdec(MqcState *mqc, uint8_t *bp)
+void ff_mqc_initdec(MqcState *mqc, uint8_t *bp, int raw, int reset)
 {
-    ff_mqc_init_contexts(mqc);
+    mqc->raw = raw;
+    if (reset)
+        ff_mqc_init_contexts(mqc);
     mqc->bp = bp;
     mqc->c  = (*mqc->bp ^ 0xff) << 16;
     bytein(mqc);
@@ -78,8 +80,20 @@ void ff_mqc_initdec(MqcState *mqc, uint8_t *bp)
     mqc->a = 0x8000;
 }
 
+static int mqc_decode_bypass(MqcState *mqc) {
+    int bit = !(mqc->c & 0x40000000);
+    if (!(mqc->c & 0xff)) {
+        mqc->c -= 0x100;
+        bytein(mqc);
+    }
+    mqc->c += mqc->c;
+    return bit;
+}
+
 int ff_mqc_decode(MqcState *mqc, uint8_t *cxstate)
 {
+    if (mqc->raw)
+        return mqc_decode_bypass(mqc);
     mqc->a -= ff_mqc_qe[*cxstate];
     if ((mqc->c >> 16) < mqc->a) {
         if (mqc->a & 0x8000)
diff --git a/libavcodec/mqcenc.c b/libavcodec/mqcenc.c
new file mode 100644
index 0000000..7c9e1a0
--- /dev/null
+++ b/libavcodec/mqcenc.c
@@ -0,0 +1,139 @@
+/*
+ * MQ-coder encoder
+ * Copyright (c) 2007 Kamil Nowosad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MQ-coder encoder
+ * @file
+ * @author Kamil Nowosad
+ */
+
+#include "libavutil/avassert.h"
+#include "mqc.h"
+
+static void byteout(MqcState *mqc)
+{
+retry:
+    if (*mqc->bp == 0xff){
+        mqc->bp++;
+        *mqc->bp = mqc->c >> 20;
+        mqc->c &= 0xfffff;
+        mqc->ct = 7;
+    } else if ((mqc->c & 0x8000000)){
+        (*mqc->bp)++;
+        mqc->c &= 0x7ffffff;
+        goto retry;
+    } else{
+        mqc->bp++;
+        *mqc->bp = mqc->c >> 19;
+        mqc->c &= 0x7ffff;
+        mqc->ct = 8;
+    }
+}
+
+static void renorme(MqcState *mqc)
+{
+    do{
+        mqc->a += mqc->a;
+        mqc->c += mqc->c;
+        if (!--mqc->ct)
+            byteout(mqc);
+    } while (!(mqc->a & 0x8000));
+}
+
+static void setbits(MqcState *mqc)
+{
+    int tmp = mqc->c + mqc->a;
+    mqc->c |= 0xffff;
+    if (mqc->c >= tmp)
+        mqc->c -= 0x8000;
+}
+
+void ff_mqc_initenc(MqcState *mqc, uint8_t *bp)
+{
+    ff_mqc_init_contexts(mqc);
+    mqc->a = 0x8000;
+    mqc->c = 0;
+    mqc->bp = bp-1;
+    mqc->bpstart = bp;
+    mqc->ct = 12 + (*mqc->bp == 0xff);
+}
+
+void ff_mqc_encode(MqcState *mqc, uint8_t *cxstate, int d)
+{
+    int qe;
+
+    qe = ff_mqc_qe[*cxstate];
+    mqc->a -= qe;
+    if ((*cxstate & 1) == d){
+        if (!(mqc->a & 0x8000)){
+            if (mqc->a < qe)
+                mqc->a = qe;
+            else
+                mqc->c += qe;
+            *cxstate = ff_mqc_nmps[*cxstate];
+            renorme(mqc);
+        } else
+            mqc->c += qe;
+    } else{
+        if (mqc->a < qe)
+            mqc->c += qe;
+        else
+            mqc->a = qe;
+        *cxstate = ff_mqc_nlps[*cxstate];
+        renorme(mqc);
+    }
+}
+
+int ff_mqc_length(MqcState *mqc)
+{
+    return mqc->bp - mqc->bpstart;
+}
+
+int ff_mqc_flush(MqcState *mqc)
+{
+    setbits(mqc);
+    mqc->c = mqc->c << mqc->ct;
+    byteout(mqc);
+    mqc->c = mqc->c << mqc->ct;
+    byteout(mqc);
+    if (*mqc->bp != 0xff)
+        mqc->bp++;
+    return mqc->bp - mqc->bpstart;
+}
+
+int ff_mqc_flush_to(MqcState *mqc, uint8_t *dst, int *dst_len)
+{
+    MqcState mqc2 = *mqc;
+    mqc2.bpstart=
+    mqc2.bp = dst;
+    *mqc2.bp = *mqc->bp;
+    ff_mqc_flush(&mqc2);
+    *dst_len = mqc2.bp - dst;
+    if (mqc->bp < mqc->bpstart) {
+        av_assert1(mqc->bpstart - mqc->bp == 1);
+        av_assert1(*dst_len > 0);
+        av_assert1(mqc->bp[0] == 0 && dst[0] == 0);
+        (*dst_len) --;
+        memmove(dst, dst+1, *dst_len);
+        return mqc->bp - mqc->bpstart + 1 + *dst_len;
+    }
+    return mqc->bp - mqc->bpstart + *dst_len;
+}
diff --git a/libavcodec/mscc.c b/libavcodec/mscc.c
new file mode 100644
index 0000000..86e4e88
--- /dev/null
+++ b/libavcodec/mscc.c
@@ -0,0 +1,278 @@
+/*
+ * Mandsoft Screen Capture Codec decoder
+ *
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+#include <zlib.h>
+
+typedef struct MSCCContext {
+    unsigned          bpp;
+    unsigned int      decomp_size;
+    uint8_t          *decomp_buf;
+    unsigned int      uncomp_size;
+    uint8_t          *uncomp_buf;
+    z_stream          zstream;
+
+    uint32_t          pal[256];
+} MSCCContext;
+
+static int rle_uncompress(AVCodecContext *avctx, GetByteContext *gb, PutByteContext *pb)
+{
+    MSCCContext *s = avctx->priv_data;
+    unsigned x = 0, y = 0;
+
+    while (bytestream2_get_bytes_left(gb) > 0) {
+        uint32_t fill;
+        int j;
+        unsigned run = bytestream2_get_byte(gb);
+
+        if (run) {
+            switch (avctx->bits_per_coded_sample) {
+            case 8:
+                fill = bytestream2_get_byte(gb);
+                break;
+            case 16:
+                fill = bytestream2_get_le16(gb);
+                break;
+            case 24:
+                fill = bytestream2_get_le24(gb);
+                break;
+            case 32:
+                fill = bytestream2_get_le32(gb);
+                break;
+            }
+
+            for (j = 0; j < run; j++) {
+                switch (avctx->bits_per_coded_sample) {
+                case 8:
+                    bytestream2_put_byte(pb, fill);
+                    break;
+                case 16:
+                    bytestream2_put_le16(pb, fill);
+                    break;
+                case 24:
+                    bytestream2_put_le24(pb, fill);
+                    break;
+                case 32:
+                    bytestream2_put_le32(pb, fill);
+                    break;
+                }
+            }
+            x += run;
+        } else {
+            unsigned copy = bytestream2_get_byte(gb);
+
+            if (copy == 0) {
+                x = 0;
+                y++;
+                bytestream2_seek_p(pb, y * avctx->width * s->bpp, SEEK_SET);
+            } else if (copy == 1) {
+                return 0;
+            } else if (copy == 2) {
+
+                x += bytestream2_get_byte(gb);
+                y += bytestream2_get_byte(gb);
+
+                bytestream2_seek_p(pb, y * avctx->width * s->bpp + x * s->bpp, SEEK_SET);
+            } else {
+                for (j = 0; j < copy; j++) {
+                    switch (avctx->bits_per_coded_sample) {
+                    case 8:
+                        bytestream2_put_byte(pb, bytestream2_get_byte(gb));
+                        break;
+                    case 16:
+                        bytestream2_put_le16(pb, bytestream2_get_le16(gb));
+                        break;
+                    case 24:
+                        bytestream2_put_le24(pb, bytestream2_get_le24(gb));
+                        break;
+                    case 32:
+                        bytestream2_put_le32(pb, bytestream2_get_le32(gb));
+                        break;
+                    }
+                }
+
+                if (s->bpp == 1 && (copy & 1))
+                    bytestream2_skip(gb, 1);
+                x += copy;
+            }
+        }
+    }
+
+    return AVERROR_INVALIDDATA;
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    MSCCContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    GetByteContext gb;
+    PutByteContext pb;
+    int ret, j;
+
+    if (avpkt->size < 3)
+        return buf_size;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    if (avctx->codec_id == AV_CODEC_ID_MSCC) {
+        avpkt->data[2] ^= avpkt->data[0];
+        buf += 2;
+        buf_size -= 2;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
+
+        if (pal && size == AVPALETTE_SIZE) {
+            frame->palette_has_changed = 1;
+            for (j = 0; j < 256; j++)
+                s->pal[j] = 0xFF000000 | AV_RL32(pal + j * 4);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
+        }
+        memcpy(frame->data[1], s->pal, AVPALETTE_SIZE);
+    }
+
+    ret = inflateReset(&s->zstream);
+    if (ret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", ret);
+        return AVERROR_UNKNOWN;
+    }
+    s->zstream.next_in   = buf;
+    s->zstream.avail_in  = buf_size;
+    s->zstream.next_out  = s->decomp_buf;
+    s->zstream.avail_out = s->decomp_size;
+    ret = inflate(&s->zstream, Z_FINISH);
+    if (ret != Z_STREAM_END) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate error: %d\n", ret);
+        return AVERROR_UNKNOWN;
+    }
+
+    bytestream2_init(&gb, s->decomp_buf, s->zstream.total_out);
+    bytestream2_init_writer(&pb, s->uncomp_buf, s->uncomp_size);
+
+    ret = rle_uncompress(avctx, &gb, &pb);
+    if (ret)
+        return ret;
+
+    for (j = 0; j < avctx->height; j++) {
+        memcpy(frame->data[0] + (avctx->height - j - 1) * frame->linesize[0],
+               s->uncomp_buf + s->bpp * j * avctx->width, s->bpp * avctx->width);
+    }
+
+    frame->key_frame = 1;
+    frame->pict_type = AV_PICTURE_TYPE_I;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    MSCCContext *s = avctx->priv_data;
+    int stride, zret;
+
+    switch (avctx->bits_per_coded_sample) {
+    case  8: avctx->pix_fmt = AV_PIX_FMT_PAL8;   break;
+    case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
+    case 24: avctx->pix_fmt = AV_PIX_FMT_BGR24;  break;
+    case 32: avctx->pix_fmt = AV_PIX_FMT_BGRA;   break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bitdepth %i\n", avctx->bits_per_coded_sample);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->bpp = avctx->bits_per_coded_sample >> 3;
+    stride = 4 * ((avctx->width * avctx->bits_per_coded_sample + 31) / 32);
+
+    s->decomp_size = 2 * avctx->height * stride;
+    if (!(s->decomp_buf = av_malloc(s->decomp_size)))
+        return AVERROR(ENOMEM);
+
+    s->uncomp_size = avctx->height * stride;
+    if (!(s->uncomp_buf = av_malloc(s->uncomp_size)))
+        return AVERROR(ENOMEM);
+
+    s->zstream.zalloc = Z_NULL;
+    s->zstream.zfree = Z_NULL;
+    s->zstream.opaque = Z_NULL;
+    zret = inflateInit(&s->zstream);
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate init error: %d\n", zret);
+        return AVERROR_UNKNOWN;
+    }
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    MSCCContext *s = avctx->priv_data;
+
+    av_freep(&s->decomp_buf);
+    s->decomp_size = 0;
+    av_freep(&s->uncomp_buf);
+    s->uncomp_size = 0;
+    inflateEnd(&s->zstream);
+
+    return 0;
+}
+
+AVCodec ff_mscc_decoder = {
+    .name             = "mscc",
+    .long_name        = NULL_IF_CONFIG_SMALL("Mandsoft Screen Capture Codec"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_MSCC,
+    .priv_data_size   = sizeof(MSCCContext),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
+};
+
+AVCodec ff_srgc_decoder = {
+    .name             = "srgc",
+    .long_name        = NULL_IF_CONFIG_SMALL("Screen Recorder Gold Codec"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_SRGC,
+    .priv_data_size   = sizeof(MSCCContext),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/msgsmdec.c b/libavcodec/msgsmdec.c
index 01ea95f..003bc2c 100644
--- a/libavcodec/msgsmdec.c
+++ b/libavcodec/msgsmdec.c
@@ -2,26 +2,25 @@
  * gsm 06.10 decoder, Microsoft variant
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "gsm.h"
 #include "msgsmdec.h"
 
@@ -31,10 +30,10 @@ int ff_msgsm_decode_block(AVCodecContext *avctx, int16_t *samples,
                           const uint8_t *buf, int mode)
 {
     int res;
-    BitstreamContext bc;
-    bitstream_init8(&bc, buf, GSM_MS_BLOCK_SIZE);
-    res = gsm_decode_block(avctx, samples, &bc, mode);
+    GetBitContext gb;
+    init_get_bits(&gb, buf, GSM_MS_BLOCK_SIZE * 8);
+    res = gsm_decode_block(avctx, samples, &gb, mode);
     if (res < 0)
         return res;
-    return gsm_decode_block(avctx, samples + GSM_FRAME_SIZE, &bc, mode);
+    return gsm_decode_block(avctx, samples + GSM_FRAME_SIZE, &gb, mode);
 }
diff --git a/libavcodec/msgsmdec.h b/libavcodec/msgsmdec.h
index adbda9a..b2a1a62 100644
--- a/libavcodec/msgsmdec.h
+++ b/libavcodec/msgsmdec.h
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder, Microsoft variant
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c
index 3ab3bd5..920f50f 100644
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,7 @@
 #include "msmpeg4data.h"
 #include "mpegvideodata.h"
 #include "vc1data.h"
+#include "libavutil/imgutils.h"
 
 /*
  * You can also call this codec: MPEG-4 with a twist!
@@ -52,6 +53,9 @@ static av_cold void init_h263_dc_for_msmpeg4(void)
 {
         int level, uni_code, uni_len;
 
+        if(ff_v2_dc_chroma_table[255 + 256][1])
+            return;
+
         for(level=-256; level<256; level++){
             int size, v, l;
             /* find number of bits */
@@ -104,8 +108,6 @@ static av_cold void init_h263_dc_for_msmpeg4(void)
 
 av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
 {
-    static int initialized=0;
-
     switch(s->msmpeg4_version){
     case 1:
     case 2:
@@ -144,11 +146,7 @@ av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
     }
     //Note the default tables are set in common_init in mpegvideo.c
 
-    if(!initialized){
-        initialized=1;
-
-        init_h263_dc_for_msmpeg4();
-    }
+    init_h263_dc_for_msmpeg4();
 }
 
 /* predict coded block */
@@ -178,13 +176,13 @@ int ff_msmpeg4_coded_block_pred(MpegEncContext * s, int n, uint8_t **coded_block
     return pred;
 }
 
-static int get_dc(uint8_t *src, int stride, int scale)
+static int get_dc(uint8_t *src, int stride, int scale, int block_size)
 {
     int y;
     int sum=0;
-    for(y=0; y<8; y++){
+    for(y=0; y<block_size; y++){
         int x;
-        for(x=0; x<8; x++){
+        for(x=0; x<block_size; x++){
             sum+=src[x + y*stride];
         }
     }
@@ -230,13 +228,13 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
         "addl %%eax, %2         \n\t"
         "addl %%eax, %1         \n\t"
         "addl %0, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %0         \n\t"
         "movl %1, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %1         \n\t"
         "movl %2, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %2         \n\t"
         : "+b" (a), "+c" (b), "+D" (c)
         : "g" (scale), "S" (ff_inverse[scale])
@@ -276,17 +274,18 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
                     *dir_ptr = 0;
                 }
             }else{
+                int bs = 8 >> s->avctx->lowres;
                 if(n<4){
                     wrap= s->linesize;
-                    dest= s->current_picture.f->data[0] + (((n >> 1) + 2*s->mb_y) * 8*  wrap ) + ((n & 1) + 2*s->mb_x) * 8;
+                    dest= s->current_picture.f->data[0] + (((n >> 1) + 2*s->mb_y) * bs*  wrap ) + ((n & 1) + 2*s->mb_x) * bs;
                 }else{
                     wrap= s->uvlinesize;
-                    dest= s->current_picture.f->data[n - 3] + (s->mb_y * 8 * wrap) + s->mb_x * 8;
+                    dest= s->current_picture.f->data[n - 3] + (s->mb_y * bs * wrap) + s->mb_x * bs;
                 }
                 if(s->mb_x==0) a= (1024 + (scale>>1))/scale;
-                else           a= get_dc(dest-8, wrap, scale*8);
+                else           a= get_dc(dest-bs, wrap, scale*8>>(2*s->avctx->lowres), bs);
                 if(s->mb_y==0) c= (1024 + (scale>>1))/scale;
-                else           c= get_dc(dest-8*wrap, wrap, scale*8);
+                else           c= get_dc(dest-bs*wrap, wrap, scale*8>>(2*s->avctx->lowres), bs);
 
                 if (s->h263_aic_dir==0) {
                     pred= a;
diff --git a/libavcodec/msmpeg4.h b/libavcodec/msmpeg4.h
index e57ae66..bcdb967 100644
--- a/libavcodec/msmpeg4.h
+++ b/libavcodec/msmpeg4.h
@@ -2,20 +2,20 @@
  * MSMPEG4 backend for encoder and decoder
  * copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,10 +69,12 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
 #define CONFIG_MSMPEG4_DECODER (CONFIG_MSMPEG4V1_DECODER || \
                                 CONFIG_MSMPEG4V2_DECODER || \
                                 CONFIG_MSMPEG4V3_DECODER || \
+                                CONFIG_WMV1_DECODER      || \
                                 CONFIG_WMV2_DECODER      || \
                                 CONFIG_VC1_DECODER)
 #define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V2_ENCODER || \
                                 CONFIG_MSMPEG4V3_ENCODER || \
+                                CONFIG_WMV1_ENCODER      || \
                                 CONFIG_WMV2_ENCODER)
 
 #endif /* AVCODEC_MSMPEG4_H */
diff --git a/libavcodec/msmpeg4data.c b/libavcodec/msmpeg4data.c
index 6bc0520..b9c1d8e 100644
--- a/libavcodec/msmpeg4data.c
+++ b/libavcodec/msmpeg4data.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4data.h b/libavcodec/msmpeg4data.h
index 2c2dfab..02199d0 100644
--- a/libavcodec/msmpeg4data.h
+++ b/libavcodec/msmpeg4data.h
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4dec.c b/libavcodec/msmpeg4dec.c
index a2d0ad4..16b6719 100644
--- a/libavcodec/msmpeg4dec.c
+++ b/libavcodec/msmpeg4dec.c
@@ -1,24 +1,24 @@
 /*
  * MSMPEG4 backend for encoder and decoder
  * Copyright (c) 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2013 Michael Niedermayer <michaelni@gmx.at>
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "msmpeg4.h"
+#include "libavutil/imgutils.h"
 #include "h263.h"
 #include "mpeg4video.h"
 #include "msmpeg4data.h"
@@ -103,6 +104,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
 static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
 {
     int cbp, code, i;
+    uint32_t * const mb_type_ptr = &s->current_picture.mb_type[s->mb_x + s->mb_y*s->mb_stride];
 
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         if (s->use_skip_mb_code) {
@@ -116,6 +118,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
                 s->mv[0][0][0] = 0;
                 s->mv[0][0][1] = 0;
                 s->mb_skipped = 1;
+                *mb_type_ptr = MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
                 return 0;
             }
         }
@@ -137,7 +140,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
         if(s->msmpeg4_version==2)
             cbp= get_vlc2(&s->gb, v2_intra_cbpc_vlc.table, V2_INTRA_CBPC_VLC_BITS, 1);
         else
-            cbp= get_vlc2(&s->gb, ff_h263_intra_MCBPC_vlc.table, INTRA_MCBPC_VLC_BITS, 1);
+            cbp= get_vlc2(&s->gb, ff_h263_intra_MCBPC_vlc.table, INTRA_MCBPC_VLC_BITS, 2);
         if(cbp<0 || cbp>3){
             av_log(s->avctx, AV_LOG_ERROR, "cbpc %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y);
             return -1;
@@ -164,15 +167,28 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
         s->mv_type = MV_TYPE_16X16;
         s->mv[0][0][0] = mx;
         s->mv[0][0][1] = my;
+        *mb_type_ptr = MB_TYPE_L0 | MB_TYPE_16x16;
     } else {
+        int v;
         if(s->msmpeg4_version==2){
             s->ac_pred = get_bits1(&s->gb);
-            cbp|= get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
+            v = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1);
+            if (v < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "cbpy vlc invalid\n");
+                return -1;
+            }
+            cbp|= v<<2;
         } else{
             s->ac_pred = 0;
-            cbp|= get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
+            v = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1);
+            if (v < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "cbpy vlc invalid\n");
+                return -1;
+            }
+            cbp|= v<<2;
             if(s->pict_type==AV_PICTURE_TYPE_P) cbp^=0x3C;
         }
+        *mb_type_ptr = MB_TYPE_INTRA;
     }
 
     s->bdsp.clear_blocks(s->block[0]);
@@ -192,6 +208,9 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, int16_t block[6][64])
     uint8_t *coded_val;
     uint32_t * const mb_type_ptr = &s->current_picture.mb_type[s->mb_x + s->mb_y*s->mb_stride];
 
+    if (get_bits_left(&s->gb) <= 0)
+        return AVERROR_INVALIDDATA;
+
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         if (s->use_skip_mb_code) {
             if (get_bits1(&s->gb)) {
@@ -282,18 +301,19 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, int16_t block[6][64])
 av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
-    static int done = 0;
-    int i;
+    static volatile int done = 0;
+    int i, ret;
     MVTable *mv;
 
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+
     if (ff_h263_decode_init(avctx) < 0)
         return -1;
 
     ff_msmpeg4_common_init(s);
 
     if (!done) {
-        done = 1;
-
         for(i=0;i<NB_RL_TABLES;i++) {
             ff_rl_init(&ff_rl_table[i], ff_static_rl_table_store[i]);
         }
@@ -363,6 +383,7 @@ av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx)
         INIT_VLC_STATIC(&ff_inter_intra_vlc, INTER_INTRA_VLC_BITS, 4,
                  &ff_table_inter_intra[0][1], 2, 1,
                  &ff_table_inter_intra[0][0], 2, 1, 8);
+        done = 1;
     }
 
     switch(s->msmpeg4_version){
@@ -391,6 +412,14 @@ int ff_msmpeg4_decode_picture_header(MpegEncContext * s)
 {
     int code;
 
+    // at minimum one bit per macroblock is required at least in a valid frame,
+    // we discard frames much smaller than this. Frames smaller than 1/8 of the
+    // smallest "black/skip" frame generally contain not much recoverable content
+    // while at the same time they have the highest computational requirements
+    // per byte
+    if (get_bits_left(&s->gb) * 8LL < (s->width+15)/16 * ((s->height+15)/16))
+        return AVERROR_INVALIDDATA;
+
     if(s->msmpeg4_version==1){
         int start_code = get_bits_long(&s->gb, 32);
         if(start_code!=0x00000100){
@@ -526,7 +555,7 @@ int ff_msmpeg4_decode_picture_header(MpegEncContext * s)
             s->no_rounding = 0;
         }
     }
-    ff_dlog(s->avctx, "%d %d %d %d %d\n", s->pict_type, s->bit_rate,
+    ff_dlog(s->avctx, "%d %"PRId64" %d %d %d\n", s->pict_type, s->bit_rate,
             s->inter_intra_pred, s->width, s->height);
 
     s->esc3_level_length= 0;
@@ -573,8 +602,11 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         } else {
             level = get_vlc2(&s->gb, v2_dc_chroma_vlc.table, DC_VLC_BITS, 3);
         }
-        if (level < 0)
+        if (level < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+            *dir_ptr = 0;
             return -1;
+        }
         level-=256;
     }else{  //FIXME optimize use unified tables & index
         if (n < 4) {
@@ -584,6 +616,7 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         }
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+            *dir_ptr = 0;
             return -1;
         }
 
@@ -639,7 +672,6 @@ int ff_msmpeg4_decode_block(MpegEncContext * s, int16_t * block,
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "dc overflow- block: %d qscale: %d//\n", n, s->qscale);
             if(s->inter_intra_pred) level=0;
-            else                    return -1;
         }
         if (n < 4) {
             rl = &ff_rl_table[s->rl_table_index];
@@ -776,9 +808,10 @@ int ff_msmpeg4_decode_block(MpegEncContext * s, int16_t * block,
             if(i&(~63)){
                 const int left= get_bits_left(&s->gb);
                 if (((i + 192 == 64 && level / qmul == -1) ||
-                     !(s->avctx->err_recognition & AV_EF_BITSTREAM)) &&
+                     !(s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))) &&
                     left >= 0) {
                     av_log(s->avctx, AV_LOG_ERROR, "ignoring overflow at %d %d\n", s->mb_x, s->mb_y);
+                    i = 63;
                     break;
                 }else{
                     av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
@@ -855,6 +888,8 @@ AVCodec ff_msmpeg4v1_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -871,6 +906,8 @@ AVCodec ff_msmpeg4v2_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -887,6 +924,8 @@ AVCodec ff_msmpeg4v3_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -903,6 +942,8 @@ AVCodec ff_wmv1_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/msmpeg4enc.c b/libavcodec/msmpeg4enc.c
index 6d353e5..144468b 100644
--- a/libavcodec/msmpeg4enc.c
+++ b/libavcodec/msmpeg4enc.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,6 @@
 #include "libavutil/avutil.h"
 #include "libavutil/mem.h"
 #include "mpegvideo.h"
-#include "msmpeg4.h"
 #include "h263.h"
 #include "internal.h"
 #include "mpeg4video.h"
@@ -160,8 +159,8 @@ av_cold int ff_msmpeg4_encode_init(MpegEncContext *s)
 static void find_best_tables(MpegEncContext * s)
 {
     int i;
-    int best       =-1, best_size       =9999999;
-    int chroma_best=-1, best_chroma_size=9999999;
+    int best        = 0, best_size        = INT_MAX;
+    int chroma_best = 0, best_chroma_size = INT_MAX;
 
     for(i=0; i<3; i++){
         int level;
@@ -241,7 +240,7 @@ void ff_msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     s->per_mb_rl_table = 0;
     if(s->msmpeg4_version==4)
         s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE && s->pict_type==AV_PICTURE_TYPE_P);
-    ff_dlog(s, "%d %d %d %d %d\n", s->pict_type, s->bit_rate,
+    ff_dlog(s, "%d %"PRId64" %d %d %d\n", s->pict_type, s->bit_rate,
             s->inter_intra_pred, s->width, s->height);
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
@@ -284,14 +283,15 @@ void ff_msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
 void ff_msmpeg4_encode_ext_header(MpegEncContext * s)
 {
-        put_bits(&s->pb, 5, s->avctx->time_base.den / s->avctx->time_base.num); //yes 29.97 -> 29
+        unsigned fps = s->avctx->time_base.den / s->avctx->time_base.num / FFMAX(s->avctx->ticks_per_frame, 1);
+        put_bits(&s->pb, 5, FFMIN(fps, 31)); //yes 29.97 -> 29
 
         put_bits(&s->pb, 11, FFMIN(s->bit_rate/1024, 2047));
 
         if(s->msmpeg4_version>=3)
             put_bits(&s->pb, 1, s->flipflop_rounding);
         else
-            assert(s->flipflop_rounding==0);
+            av_assert0(s->flipflop_rounding==0);
 }
 
 void ff_msmpeg4_encode_motion(MpegEncContext * s,
@@ -499,7 +499,7 @@ void ff_msmpeg4_encode_mb(MpegEncContext * s,
 static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr)
 {
     int sign, code;
-    int pred, extquant;
+    int pred, av_uninit(extquant);
     int extrabits = 0;
 
     int16_t *dc_val;
diff --git a/libavcodec/msrle.c b/libavcodec/msrle.c
index a7838ab..1ab8a41 100644
--- a/libavcodec/msrle.c
+++ b/libavcodec/msrle.c
@@ -2,20 +2,20 @@
  * Microsoft RLE video decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "msrledec.h"
+#include "libavutil/imgutils.h"
 
 typedef struct MsrleContext {
     AVCodecContext *avctx;
@@ -50,10 +51,14 @@ typedef struct MsrleContext {
 static av_cold int msrle_decode_init(AVCodecContext *avctx)
 {
     MsrleContext *s = avctx->priv_data;
+    int i;
 
     s->avctx = avctx;
 
     switch (avctx->bits_per_coded_sample) {
+    case 1:
+        avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+        break;
     case 4:
     case 8:
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -70,6 +75,10 @@ static av_cold int msrle_decode_init(AVCodecContext *avctx)
     if (!s->frame)
         return AVERROR(ENOMEM);
 
+    if (avctx->extradata_size >= 4)
+        for (i = 0; i < FFMIN(avctx->extradata_size, AVPALETTE_SIZE)/4; i++)
+            s->pal[i] = 0xFFU<<24 | AV_RL32(avctx->extradata+4*i);
+
     return 0;
 }
 
@@ -86,30 +95,36 @@ static int msrle_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if (buf_size < 2) //Minimally a end of picture code should be there
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
-    if (avctx->bits_per_coded_sample <= 8) {
-        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+    if (avctx->bits_per_coded_sample > 1 && avctx->bits_per_coded_sample <= 8) {
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
 
-        if (pal) {
+        if (pal && size == AVPALETTE_SIZE) {
             s->frame->palette_has_changed = 1;
             memcpy(s->pal, pal, AVPALETTE_SIZE);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
-
         /* make the palette available */
         memcpy(s->frame->data[1], s->pal, AVPALETTE_SIZE);
     }
 
     /* FIXME how to correctly detect RLE ??? */
     if (avctx->height * istride == avpkt->size) { /* assume uncompressed */
-        int linesize = avctx->width * avctx->bits_per_coded_sample / 8;
+        int linesize = av_image_get_linesize(avctx->pix_fmt, avctx->width, 0);
         uint8_t *ptr = s->frame->data[0];
         uint8_t *buf = avpkt->data + (avctx->height-1)*istride;
         int i, j;
 
+        if (linesize < 0)
+            return linesize;
+
         for (i = 0; i < avctx->height; i++) {
             if (avctx->bits_per_coded_sample == 4) {
                 for (j = 0; j < avctx->width - 1; j += 2) {
diff --git a/libavcodec/msrledec.c b/libavcodec/msrledec.c
index f45179f..f0cbde6 100644
--- a/libavcodec/msrledec.c
+++ b/libavcodec/msrledec.c
@@ -2,20 +2,20 @@
  * Microsoft RLE decoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,17 +36,15 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
     unsigned char rle_code;
     unsigned char extra_byte, odd_pixel;
     unsigned char stream_byte;
-    unsigned int pixel_ptr = 0;
-    int row_dec = pic->linesize[0];
-    int row_ptr = (avctx->height - 1) * row_dec;
-    int frame_size = FFABS(row_dec) * avctx->height;
+    int pixel_ptr = 0;
+    int line = avctx->height - 1;
     int i;
 
-    while (row_ptr >= 0) {
+    while (line >= 0 && pixel_ptr <= avctx->width) {
         if (bytestream2_get_bytes_left(gb) <= 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "MS RLE: bytestream overrun, %d rows left\n",
-                   row_ptr);
+                   "MS RLE: bytestream overrun, %dx%d left\n",
+                   avctx->width - pixel_ptr, line);
             return AVERROR_INVALIDDATA;
         }
         rle_code = stream_byte = bytestream2_get_byteu(gb);
@@ -55,7 +53,7 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
             stream_byte = bytestream2_get_byte(gb);
             if (stream_byte == 0) {
                 /* line is done, goto the next one */
-                row_ptr -= row_dec;
+                line--;
                 pixel_ptr = 0;
             } else if (stream_byte == 1) {
                 /* decode is done */
@@ -65,13 +63,13 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
                 stream_byte = bytestream2_get_byte(gb);
                 pixel_ptr += stream_byte;
                 stream_byte = bytestream2_get_byte(gb);
-                row_ptr -= stream_byte * row_dec;
+                line -= stream_byte;
             } else {
                 // copy pixels from encoded stream
                 odd_pixel =  stream_byte & 1;
                 rle_code = (stream_byte + 1) / 2;
                 extra_byte = rle_code & 0x01;
-                if (row_ptr + pixel_ptr + stream_byte > frame_size ||
+                if (pixel_ptr + 2*rle_code - odd_pixel > avctx->width ||
                     bytestream2_get_bytes_left(gb) < rle_code) {
                     av_log(avctx, AV_LOG_ERROR,
                            "MS RLE: frame/stream ptr just went out of bounds (copy)\n");
@@ -82,13 +80,13 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
                     if (pixel_ptr >= avctx->width)
                         break;
                     stream_byte = bytestream2_get_byteu(gb);
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte >> 4;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte >> 4;
                     pixel_ptr++;
                     if (i + 1 == rle_code && odd_pixel)
                         break;
                     if (pixel_ptr >= avctx->width)
                         break;
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte & 0x0F;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte & 0x0F;
                     pixel_ptr++;
                 }
 
@@ -98,9 +96,9 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
             }
         } else {
             // decode a run of data
-            if (row_ptr + pixel_ptr + stream_byte > frame_size) {
+            if (pixel_ptr + rle_code > avctx->width + 1) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "MS RLE: frame ptr just went out of bounds (run)\n");
+                       "MS RLE: frame ptr just went out of bounds (run) %d %d %d\n", pixel_ptr, rle_code, avctx->width);
                 return AVERROR_INVALIDDATA;
             }
             stream_byte = bytestream2_get_byte(gb);
@@ -108,9 +106,9 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
                 if (pixel_ptr >= avctx->width)
                     break;
                 if ((i & 1) == 0)
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte >> 4;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte >> 4;
                 else
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte & 0x0F;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte & 0x0F;
                 pixel_ptr++;
             }
         }
@@ -138,7 +136,8 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
     unsigned int width= FFABS(pic->linesize[0]) / (depth >> 3);
 
     output     = pic->data[0] + (avctx->height - 1) * pic->linesize[0];
-    output_end = pic->data[0] +  avctx->height      * pic->linesize[0];
+    output_end = output + FFABS(pic->linesize[0]);
+
     while (bytestream2_get_bytes_left(gb) > 0) {
         p1 = bytestream2_get_byteu(gb);
         if(p1 == 0) { //Escape code
@@ -155,6 +154,7 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
                     }
                 }
                 output = pic->data[0] + line * pic->linesize[0];
+                output_end = output + FFABS(pic->linesize[0]);
                 pos = 0;
                 continue;
             } else if(p2 == 1) { //End-of-picture
@@ -169,11 +169,11 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
                     return -1;
                 }
                 output = pic->data[0] + line * pic->linesize[0] + pos * (depth >> 3);
+                output_end = pic->data[0] + line * pic->linesize[0] + FFABS(pic->linesize[0]);
                 continue;
             }
             // Copy data
-            if ((pic->linesize[0] > 0 && output + p2 * (depth >> 3) > output_end) ||
-                (pic->linesize[0] < 0 && output + p2 * (depth >> 3) < output_end)) {
+            if (output + p2 * (depth >> 3) > output_end) {
                 bytestream2_skip(gb, 2 * (depth >> 3));
                 continue;
             } else if (bytestream2_get_bytes_left(gb) < p2 * (depth >> 3)) {
@@ -182,9 +182,9 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
             }
 
             if ((depth == 8) || (depth == 24)) {
-                for(i = 0; i < p2 * (depth >> 3); i++) {
-                    *output++ = bytestream2_get_byteu(gb);
-                }
+                bytestream2_get_bufferu(gb, output, p2 * (depth >> 3));
+                output += p2 * (depth >> 3);
+
                 // RLE8 copy is actually padded - and runs are not!
                 if(depth == 8 && (p2 & 1)) {
                     bytestream2_skip(gb, 1);
@@ -203,36 +203,39 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
             pos += p2;
         } else { //run of pixels
             uint8_t pix[3]; //original pixel
-            switch(depth){
-            case  8: pix[0] = bytestream2_get_byte(gb);
-                     break;
-            case 16: pix16  = bytestream2_get_le16(gb);
-                     break;
-            case 24: pix[0] = bytestream2_get_byte(gb);
-                     pix[1] = bytestream2_get_byte(gb);
-                     pix[2] = bytestream2_get_byte(gb);
-                     break;
-            case 32: pix32  = bytestream2_get_le32(gb);
-                     break;
-            }
-            if ((pic->linesize[0] > 0 && output + p1 * (depth >> 3) > output_end) ||
-                (pic->linesize[0] < 0 && output + p1 * (depth >> 3) < output_end))
+            if (output + p1 * (depth >> 3) > output_end)
                 continue;
-            for(i = 0; i < p1; i++) {
-                switch(depth){
-                case  8: *output++ = pix[0];
-                         break;
-                case 16: *(uint16_t*)output = pix16;
-                         output += 2;
-                         break;
-                case 24: *output++ = pix[0];
-                         *output++ = pix[1];
-                         *output++ = pix[2];
-                         break;
-                case 32: *(uint32_t*)output = pix32;
-                         output += 4;
-                         break;
+
+            switch(depth){
+            case  8:
+                pix[0] = bytestream2_get_byte(gb);
+                memset(output, pix[0], p1);
+                output += p1;
+                break;
+            case 16:
+                pix16  = bytestream2_get_le16(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint16_t*)output = pix16;
+                        output += 2;
+                }
+                break;
+            case 24:
+                pix[0] = bytestream2_get_byte(gb);
+                pix[1] = bytestream2_get_byte(gb);
+                pix[2] = bytestream2_get_byte(gb);
+                for(i = 0; i < p1; i++) {
+                        *output++ = pix[0];
+                        *output++ = pix[1];
+                        *output++ = pix[2];
+                }
+                break;
+            case 32:
+                pix32  = bytestream2_get_le32(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint32_t*)output = pix32;
+                        output += 4;
                 }
+                break;
             }
             pos += p1;
         }
diff --git a/libavcodec/msrledec.h b/libavcodec/msrledec.h
index 0c5b8b1..7f7bbcf 100644
--- a/libavcodec/msrledec.h
+++ b/libavcodec/msrledec.h
@@ -2,20 +2,20 @@
  * Microsoft RLE decoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss1.c b/libavcodec/mss1.c
index 6679a1c..a579d9d 100644
--- a/libavcodec/mss1.c
+++ b/libavcodec/mss1.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 1 (aka Windows Media Video V7 Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,7 @@ typedef struct MSS1Context {
     SliceContext   sc;
 } MSS1Context;
 
-static void arith1_normalise(ArithCoder *c)
+static void arith_normalise(ArithCoder *c)
 {
     for (;;) {
         if (c->high >= 0x8000) {
@@ -60,7 +60,7 @@ static void arith1_normalise(ArithCoder *c)
     }
 }
 
-ARITH_GET_BIT(1)
+ARITH_GET_BIT(arith)
 
 static int arith_get_bits(ArithCoder *c, int bits)
 {
@@ -71,7 +71,7 @@ static int arith_get_bits(ArithCoder *c, int bits)
     c->high   = ((prob + range) >> bits) + c->low - 1;
     c->low   += prob >> bits;
 
-    arith1_normalise(c);
+    arith_normalise(c);
 
     return val;
 }
@@ -85,12 +85,12 @@ static int arith_get_number(ArithCoder *c, int mod_val)
     c->high   = (prob + range) / mod_val + c->low - 1;
     c->low   += prob / mod_val;
 
-    arith1_normalise(c);
+    arith_normalise(c);
 
     return val;
 }
 
-static int arith1_get_prob(ArithCoder *c, int16_t *probs)
+static int arith_get_prob(ArithCoder *c, int16_t *probs)
 {
     int range = c->high - c->low + 1;
     int val   = ((c->value - c->low + 1) * probs[0] - 1) / range;
@@ -105,7 +105,7 @@ static int arith1_get_prob(ArithCoder *c, int16_t *probs)
     return sym;
 }
 
-ARITH_GET_MODEL_SYM(1)
+ARITH_GET_MODEL_SYM(arith)
 
 static void arith_init(ArithCoder *c, GetBitContext *gb)
 {
@@ -113,7 +113,7 @@ static void arith_init(ArithCoder *c, GetBitContext *gb)
     c->high          = 0xFFFF;
     c->value         = get_bits(gb, 16);
     c->gbc.gb        = gb;
-    c->get_model_sym = arith1_get_model_sym;
+    c->get_model_sym = arith_get_model_sym;
     c->get_number    = arith_get_number;
 }
 
@@ -130,7 +130,7 @@ static int decode_pal(MSS12Context *ctx, ArithCoder *acoder)
         r = arith_get_bits(acoder, 8);
         g = arith_get_bits(acoder, 8);
         b = arith_get_bits(acoder, 8);
-        *pal++ = (r << 16) | (g << 8) | b;
+        *pal++ = (0xFFU << 24) | (r << 16) | (g << 8) | b;
     }
 
     return !!ncol;
@@ -139,8 +139,6 @@ static int decode_pal(MSS12Context *ctx, ArithCoder *acoder)
 static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                              AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     MSS1Context *ctx = avctx->priv_data;
     MSS12Context *c = &ctx->ctx;
     GetBitContext gb;
@@ -148,17 +146,17 @@ static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int pal_changed = 0;
     int ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+
     arith_init(&acoder, &gb);
 
-    if ((ret = ff_reget_buffer(avctx, ctx->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, ctx->pic)) < 0)
         return ret;
-    }
 
     c->pal_pic    =  ctx->pic->data[0] + ctx->pic->linesize[0] * (avctx->height - 1);
     c->pal_stride = -ctx->pic->linesize[0];
-    c->keyframe   = !arith1_get_bit(&acoder);
+    c->keyframe   = !arith_get_bit(&acoder);
     if (c->keyframe) {
         c->corrupted = 0;
         ff_mss12_slicecontext_reset(&ctx->sc);
@@ -184,7 +182,7 @@ static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     *got_frame      = 1;
 
     /* always report that the buffer was completely consumed */
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int mss1_decode_init(AVCodecContext *avctx)
@@ -199,6 +197,8 @@ static av_cold int mss1_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     ret = ff_mss12_decode_init(&c->ctx, 0, &c->sc, NULL);
+    if (ret < 0)
+        av_frame_free(&c->pic);
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
diff --git a/libavcodec/mss12.c b/libavcodec/mss12.c
index 8735b85..3b1a302 100644
--- a/libavcodec/mss12.c
+++ b/libavcodec/mss12.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -451,7 +451,7 @@ static int decode_pivot(SliceContext *sc, ArithCoder *acoder, int base)
         val = acoder->get_number(acoder, (base + 1) / 2 - 2) + 3;
     }
 
-    if (val >= base)
+    if ((unsigned)val >= base)
         return -1;
 
     return inv ? base - val : val;
@@ -582,22 +582,17 @@ av_cold int ff_mss12_decode_init(MSS12Context *c, int version,
         return AVERROR_INVALIDDATA;
     }
 
-    avctx->coded_width  = AV_RB32(avctx->extradata + 20);
-    avctx->coded_height = AV_RB32(avctx->extradata + 24);
+    avctx->coded_width  = FFMAX(AV_RB32(avctx->extradata + 20), avctx->width);
+    avctx->coded_height = FFMAX(AV_RB32(avctx->extradata + 24), avctx->height);
     if (avctx->coded_width > 4096 || avctx->coded_height > 4096) {
         av_log(avctx, AV_LOG_ERROR, "Frame dimensions %dx%d too large",
                avctx->coded_width, avctx->coded_height);
         return AVERROR_INVALIDDATA;
     }
-    if (avctx->width || avctx->height) {
-        if (avctx->width  <= 0 || avctx->width > avctx->coded_width ||
-            avctx->height <= 0 || avctx->height > avctx->coded_height) {
-            av_log(avctx, AV_LOG_ERROR, "Invalid display dimensions\n");
-            return AVERROR_INVALIDDATA;
-        }
-    } else {
-        avctx->width  = avctx->coded_width;
-        avctx->height = avctx->coded_height;
+    if (avctx->coded_width < 1 || avctx->coded_height < 1) {
+        av_log(avctx, AV_LOG_ERROR, "Frame dimensions %dx%d too small",
+               avctx->coded_width, avctx->coded_height);
+        return AVERROR_INVALIDDATA;
     }
 
     av_log(avctx, AV_LOG_DEBUG, "Encoder version %"PRIu32".%"PRIu32"\n",
@@ -658,11 +653,11 @@ av_cold int ff_mss12_decode_init(MSS12Context *c, int version,
     }
 
     for (i = 0; i < 256; i++)
-        c->pal[i] = AV_RB24(avctx->extradata + 52 +
+        c->pal[i] = 0xFFU << 24 | AV_RB24(avctx->extradata + 52 +
                             (version ? 8 : 0) + i * 3);
 
     c->mask_stride = FFALIGN(avctx->width, 16);
-    c->mask        = av_malloc(c->mask_stride * avctx->height);
+    c->mask        = av_malloc_array(c->mask_stride, avctx->height);
     if (!c->mask) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate mask plane\n");
         return AVERROR(ENOMEM);
diff --git a/libavcodec/mss12.h b/libavcodec/mss12.h
index 8cad5dc..45c4074 100644
--- a/libavcodec/mss12.h
+++ b/libavcodec/mss12.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,8 +99,8 @@ int ff_mss12_decode_init(MSS12Context *c, int version,
                          SliceContext *sc1, SliceContext *sc2);
 int ff_mss12_decode_end(MSS12Context *ctx);
 
-#define ARITH_GET_BIT(VERSION)                                          \
-static int arith ## VERSION ## _get_bit(ArithCoder *c)                  \
+#define ARITH_GET_BIT(prefix)                                           \
+static int prefix ## _get_bit(ArithCoder *c)                            \
 {                                                                       \
     int range = c->high - c->low + 1;                                   \
     int bit   = 2 * c->value - c->low >= c->high;                       \
@@ -110,22 +110,22 @@ static int arith ## VERSION ## _get_bit(ArithCoder *c)                  \
     else                                                                \
         c->high = c->low + (range >> 1) - 1;                            \
                                                                         \
-    arith ## VERSION ## _normalise(c);                                  \
+    prefix ## _normalise(c);                                            \
                                                                         \
     return bit;                                                         \
 }
 
-#define ARITH_GET_MODEL_SYM(VERSION)                                    \
-static int arith ## VERSION ## _get_model_sym(ArithCoder *c, Model *m)  \
+#define ARITH_GET_MODEL_SYM(prefix)                                     \
+static int prefix ## _get_model_sym(ArithCoder *c, Model *m)            \
 {                                                                       \
     int idx, val;                                                       \
                                                                         \
-    idx = arith ## VERSION ## _get_prob(c, m->cum_prob);                \
+    idx = prefix ## _get_prob(c, m->cum_prob);                          \
                                                                         \
     val = m->idx2sym[idx];                                              \
     ff_mss12_model_update(m, idx);                                      \
                                                                         \
-    arith ## VERSION ## _normalise(c);                                  \
+    prefix ## _normalise(c);                                            \
                                                                         \
     return val;                                                         \
 }
diff --git a/libavcodec/mss2.c b/libavcodec/mss2.c
index 6fcadb1..2eb366e 100644
--- a/libavcodec/mss2.c
+++ b/libavcodec/mss2.c
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,13 +52,13 @@ static void arith2_normalise(ArithCoder *c)
             c->value ^= 0x8000;
             c->low   ^= 0x8000;
         }
-        c->high  = c->high  << 8 & 0xFFFFFF | 0xFF;
-        c->value = c->value << 8 & 0xFFFFFF | bytestream2_get_byte(c->gbc.gB);
-        c->low   = c->low   << 8 & 0xFFFFFF;
+        c->high  = (uint16_t)c->high  << 8  | 0xFF;
+        c->value = (uint16_t)c->value << 8  | bytestream2_get_byte(c->gbc.gB);
+        c->low   = (uint16_t)c->low   << 8;
     }
 }
 
-ARITH_GET_BIT(2)
+ARITH_GET_BIT(arith2)
 
 /* L. Stuiver and A. Moffat: "Piecewise Integer Mapping for Arithmetic Coding."
  * In Proc. 8th Data Compression Conference (DCC '98), pp. 3-12, Mar. 1998 */
@@ -131,7 +131,7 @@ static int arith2_get_prob(ArithCoder *c, int16_t *probs)
     return i;
 }
 
-ARITH_GET_MODEL_SYM(2)
+ARITH_GET_MODEL_SYM(arith2)
 
 static int arith2_get_consumed_bytes(ArithCoder *c)
 {
@@ -174,7 +174,7 @@ static int decode_pal_v2(MSS12Context *ctx, const uint8_t *buf, int buf_size)
     return 1 + ncol * 3;
 }
 
-static int decode_555(GetByteContext *gB, uint16_t *dst, ptrdiff_t stride,
+static int decode_555(AVCodecContext *avctx, GetByteContext *gB, uint16_t *dst, ptrdiff_t stride,
                       int keyframe, int w, int h)
 {
     int last_symbol = 0, repeat = 0, prev_avail = 0;
@@ -210,8 +210,13 @@ static int decode_555(GetByteContext *gB, uint16_t *dst, ptrdiff_t stride,
                     last_symbol = b << 8 | bytestream2_get_byte(gB);
                 else if (b > 129) {
                     repeat = 0;
-                    while (b-- > 130)
+                    while (b-- > 130) {
+                        if (repeat >= (INT_MAX >> 8) - 1) {
+                            av_log(avctx, AV_LOG_ERROR, "repeat overflow\n");
+                            return AVERROR_INVALIDDATA;
+                        }
                         repeat = (repeat << 8) + bytestream2_get_byte(gB) + 1;
+                    }
                     if (last_symbol == -2) {
                         int skip = FFMIN((unsigned)repeat, dst + w - p);
                         repeat -= skip;
@@ -318,7 +323,7 @@ static int decode_rle(GetBitContext *gb, uint8_t *pal_dst, ptrdiff_t pal_stride,
     if (next_code != 1 << current_length)
         return AVERROR_INVALIDDATA;
 
-    if (i = init_vlc(&vlc, 9, alphabet_size, bits, 1, 1, codes, 4, 4, 0))
+    if ((i = init_vlc(&vlc, 9, alphabet_size, bits, 1, 1, codes, 4, 4, 0)) < 0)
         return i;
 
     /* frame decode */
@@ -381,7 +386,8 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
 
     ff_mpeg_flush(avctx);
 
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
 
     s->loop_filter = avctx->skip_loop_filter < AVDISCARD_ALL;
 
@@ -430,8 +436,8 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
 
     if (v->respic == 3) {
         ctx->dsp.upsample_plane(f->data[0], f->linesize[0], w,      h);
-        ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w >> 1, h >> 1);
-        ctx->dsp.upsample_plane(f->data[2], f->linesize[2], w >> 1, h >> 1);
+        ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w+1 >> 1, h+1 >> 1);
+        ctx->dsp.upsample_plane(f->data[2], f->linesize[2], w+1 >> 1, h+1 >> 1);
     } else if (v->respic)
         avpriv_request_sample(v->s.avctx,
                               "Asymmetric WMV9 rectangle subsampling");
@@ -458,9 +464,9 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
     return 0;
 }
 
-typedef struct Rectangle {
+struct Rectangle {
     int coded, x, y, w, h;
-} Rectangle;
+};
 
 #define MAX_WMV9_RECTANGLES 20
 #define ARITH2_PADDING 2
@@ -479,10 +485,11 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     int keyframe, has_wmv9, has_mv, is_rle, is_555, ret;
 
-    Rectangle wmv9rects[MAX_WMV9_RECTANGLES], *r;
+    struct Rectangle wmv9rects[MAX_WMV9_RECTANGLES], *r;
     int used_rects = 0, i, implicit_rect = 0, av_uninit(wmv9_mask);
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
     if (keyframe = get_bits1(&gb))
         skip_bits(&gb, 7);
@@ -598,10 +605,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (c->mvX < 0 || c->mvY < 0) {
         FFSWAP(uint8_t *, c->pal_pic, c->last_pal_pic);
 
-        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
             return ret;
-        }
 
         if (ctx->last_pic->data[0]) {
             av_assert0(frame->linesize[0] == ctx->last_pic->linesize[0]);
@@ -612,10 +617,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         }
     } else {
-        if ((ret = ff_reget_buffer(avctx, ctx->last_pic)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        if ((ret = ff_reget_buffer(avctx, ctx->last_pic)) < 0)
             return ret;
-        }
         if ((ret = av_frame_ref(frame, ctx->last_pic)) < 0)
             return ret;
 
@@ -631,7 +634,7 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (is_555) {
         bytestream2_init(&gB, buf, buf_size);
 
-        if (decode_555(&gB, (uint16_t *)c->rgb_pic, c->rgb_stride >> 1,
+        if (decode_555(avctx, &gB, (uint16_t *)c->rgb_pic, c->rgb_stride >> 1,
                        keyframe, avctx->width, avctx->height))
             return AVERROR_INVALIDDATA;
 
@@ -644,7 +647,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 ff_mss12_slicecontext_reset(&ctx->sc[1]);
         }
         if (is_rle) {
-            init_get_bits(&gb, buf, buf_size * 8);
+            if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+                return ret;
             if (ret = decode_rle(&gb, c->pal_pic, c->pal_stride,
                                  c->rgb_pic, c->rgb_stride, c->pal, keyframe,
                                  ctx->split_position, 0,
@@ -822,10 +826,11 @@ static av_cold int mss2_decode_init(AVCodecContext *avctx)
     c->avctx = avctx;
     if (ret = ff_mss12_decode_init(c, 1, &ctx->sc[0], &ctx->sc[1]))
         return ret;
+    ctx->last_pic   = av_frame_alloc();
     c->pal_stride   = c->mask_stride;
     c->pal_pic      = av_mallocz(c->pal_stride * avctx->height);
     c->last_pal_pic = av_mallocz(c->pal_stride * avctx->height);
-    if (!c->pal_pic || !c->last_pal_pic) {
+    if (!c->pal_pic || !c->last_pal_pic || !ctx->last_pic) {
         mss2_decode_end(avctx);
         return AVERROR(ENOMEM);
     }
@@ -839,11 +844,6 @@ static av_cold int mss2_decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = c->free_colours == 127 ? AV_PIX_FMT_RGB555
                                             : AV_PIX_FMT_RGB24;
 
-    ctx->last_pic = av_frame_alloc();
-    if (!ctx->last_pic) {
-        mss2_decode_end(avctx);
-        return AVERROR(ENOMEM);
-    }
 
     return 0;
 }
diff --git a/libavcodec/mss2dsp.c b/libavcodec/mss2dsp.c
index 4de4dba..cc39dd6 100644
--- a/libavcodec/mss2dsp.c
+++ b/libavcodec/mss2dsp.c
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -106,6 +106,9 @@ static void upsample_plane_c(uint8_t *plane, ptrdiff_t plane_stride, int w, int
     uint8_t *src1, *src2, *dst1, *dst2, *p, a, b;
     int i, j;
 
+    if(!w || !h)
+        return;
+
     w += (w & 1);
     h += (h & 1);
 
diff --git a/libavcodec/mss2dsp.h b/libavcodec/mss2dsp.h
index 352b851..e44c9ab 100644
--- a/libavcodec/mss2dsp.h
+++ b/libavcodec/mss2dsp.h
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss3.c b/libavcodec/mss3.c
index 6a9731b..21226f9 100644
--- a/libavcodec/mss3.c
+++ b/libavcodec/mss3.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 3 (aka Microsoft ATC Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -296,7 +296,7 @@ static void rac_normalise(RangeCoder *c)
             c->low |= *c->src++;
         } else if (!c->low) {
             c->got_error = 1;
-            return;
+            c->low = 1;
         }
         if (c->range >= RAC_BOTTOM)
             return;
@@ -356,8 +356,9 @@ static int rac_get_model2_sym(RangeCoder *c, Model2 *m)
 
 static int rac_get_model_sym(RangeCoder *c, Model *m)
 {
-    int prob, prob2, helper, val;
+    int val;
     int end, end2;
+    unsigned prob, prob2, helper;
 
     prob       = 0;
     prob2      = c->range;
@@ -388,9 +389,10 @@ static int rac_get_model_sym(RangeCoder *c, Model *m)
 
 static int rac_get_model256_sym(RangeCoder *c, Model256 *m)
 {
-    int prob, prob2, helper, val;
+    int val;
     int start, end;
     int ssym;
+    unsigned prob, prob2, helper;
 
     prob2      = c->range;
     c->range >>= MODEL_SCALE;
@@ -731,10 +733,8 @@ static int mss3_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return buf_size;
     c->got_error = 0;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
     c->pic->key_frame = keyframe;
     c->pic->pict_type = keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     if (!bytestream2_get_bytes_left(&gb)) {
@@ -840,6 +840,7 @@ static av_cold int mss3_decode_init(AVCodecContext *avctx)
                                             b_width * b_height);
         if (!c->dct_coder[i].prev_dc) {
             av_log(avctx, AV_LOG_ERROR, "Cannot allocate buffer\n");
+            av_frame_free(&c->pic);
             while (i >= 0) {
                 av_freep(&c->dct_coder[i].prev_dc);
                 i--;
diff --git a/libavcodec/mss34dsp.c b/libavcodec/mss34dsp.c
index dc79676..f340565 100644
--- a/libavcodec/mss34dsp.c
+++ b/libavcodec/mss34dsp.c
@@ -2,20 +2,20 @@
  * Common stuff for some Microsoft Screen codecs
  * Copyright (C) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,30 +62,30 @@ void ff_mss34_gen_quant_mat(uint16_t *qmat, int quality, int luma)
 }
 
 #define DCT_TEMPLATE(blk, step, SOP, shift)                         \
-    const int t0 = -39409 * blk[7 * step] -  58980 * blk[1 * step]; \
-    const int t1 =  39410 * blk[1 * step] -  58980 * blk[7 * step]; \
-    const int t2 = -33410 * blk[5 * step] - 167963 * blk[3 * step]; \
-    const int t3 =  33410 * blk[3 * step] - 167963 * blk[5 * step]; \
-    const int t4 =          blk[3 * step] +          blk[7 * step]; \
-    const int t5 =          blk[1 * step] +          blk[5 * step]; \
-    const int t6 =  77062 * t4            +  51491 * t5;            \
-    const int t7 =  77062 * t5            -  51491 * t4;            \
-    const int t8 =  35470 * blk[2 * step] -  85623 * blk[6 * step]; \
-    const int t9 =  35470 * blk[6 * step] +  85623 * blk[2 * step]; \
-    const int tA = SOP(blk[0 * step] - blk[4 * step]);              \
-    const int tB = SOP(blk[0 * step] + blk[4 * step]);              \
+    const unsigned t0 =-39409U * blk[7 * step] - 58980U * blk[1 * step]; \
+    const unsigned t1 = 39410U * blk[1 * step] - 58980U * blk[7 * step]; \
+    const unsigned t2 =-33410U * blk[5 * step] -167963U * blk[3 * step]; \
+    const unsigned t3 = 33410U * blk[3 * step] -167963U * blk[5 * step]; \
+    const unsigned t4 =          blk[3 * step] +          blk[7 * step]; \
+    const unsigned t5 =          blk[1 * step] +          blk[5 * step]; \
+    const unsigned t6 = 77062U * t4            + 51491U * t5;            \
+    const unsigned t7 = 77062U * t5            - 51491U * t4;            \
+    const unsigned t8 = 35470U * blk[2 * step] - 85623U * blk[6 * step]; \
+    const unsigned t9 = 35470U * blk[6 * step] + 85623U * blk[2 * step]; \
+    const unsigned tA = SOP(blk[0 * step] - blk[4 * step]);              \
+    const unsigned tB = SOP(blk[0 * step] + blk[4 * step]);              \
                                                                     \
-    blk[0 * step] = (  t1 + t6  + t9 + tB) >> shift;                \
-    blk[1 * step] = (  t3 + t7  + t8 + tA) >> shift;                \
-    blk[2 * step] = (  t2 + t6  - t8 + tA) >> shift;                \
-    blk[3 * step] = (  t0 + t7  - t9 + tB) >> shift;                \
-    blk[4 * step] = (-(t0 + t7) - t9 + tB) >> shift;                \
-    blk[5 * step] = (-(t2 + t6) - t8 + tA) >> shift;                \
-    blk[6 * step] = (-(t3 + t7) + t8 + tA) >> shift;                \
-    blk[7 * step] = (-(t1 + t6) + t9 + tB) >> shift;                \
+    blk[0 * step] = (int)(  t1 + t6  + t9 + tB) >> shift;                \
+    blk[1 * step] = (int)(  t3 + t7  + t8 + tA) >> shift;                \
+    blk[2 * step] = (int)(  t2 + t6  - t8 + tA) >> shift;                \
+    blk[3 * step] = (int)(  t0 + t7  - t9 + tB) >> shift;                \
+    blk[4 * step] = (int)(-(t0 + t7) - t9 + tB) >> shift;                \
+    blk[5 * step] = (int)(-(t2 + t6) - t8 + tA) >> shift;                \
+    blk[6 * step] = (int)(-(t3 + t7) + t8 + tA) >> shift;                \
+    blk[7 * step] = (int)(-(t1 + t6) + t9 + tB) >> shift;                \
 
-#define SOP_ROW(a) ((a) << 16) + 0x2000
-#define SOP_COL(a) ((a + 32) << 16)
+#define SOP_ROW(a) (((a) * (1U << 16)) + 0x2000)
+#define SOP_COL(a) (((a) + 32) * (1U << 16))
 
 void ff_mss34_dct_put(uint8_t *dst, ptrdiff_t stride, int *block)
 {
diff --git a/libavcodec/mss34dsp.h b/libavcodec/mss34dsp.h
index cec8247..29c61e3 100644
--- a/libavcodec/mss34dsp.h
+++ b/libavcodec/mss34dsp.h
@@ -2,20 +2,20 @@
  * Common stuff for some Microsoft Screen codecs
  * Copyright (C) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss4.c b/libavcodec/mss4.c
index 0acdb99..b58c21b 100644
--- a/libavcodec/mss4.c
+++ b/libavcodec/mss4.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 4 (aka Microsoft Expression Encoder Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "get_bits.h"
 #include "internal.h"
 #include "mss34dsp.h"
-#include "unary_legacy.h"
+#include "unary.h"
 
 #define HEADER_SIZE 8
 
@@ -125,7 +125,7 @@ static const uint8_t mss4_vec_entry_vlc_syms[2][9] = {
 #define MAX_ENTRIES  162
 
 typedef struct MSS4Context {
-    AVFrame   *pic;
+    AVFrame    *pic;
 
     VLC        dc_vlc[2], ac_vlc[2];
     VLC        vec_entry_vlc[2];
@@ -363,7 +363,7 @@ static int get_value_cached(GetBitContext *gb, int vec_pos, uint8_t *vec,
     return prev[component];
 }
 
-#define MKVAL(vals)  (vals[0] | (vals[1] << 3) | (vals[2] << 6))
+#define MKVAL(vals)  ((vals)[0] | ((vals)[1] << 3) | ((vals)[2] << 6))
 
 /* Image mode - the hardest to comprehend MSS4 coding mode.
  *
@@ -553,10 +553,8 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
     c->pic->key_frame = (frame_type == INTRA_FRAME);
     c->pic->pict_type = (frame_type == INTRA_FRAME) ? AV_PICTURE_TYPE_I
                                                    : AV_PICTURE_TYPE_P;
@@ -574,7 +572,8 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_mss34_gen_quant_mat(c->quant_mat[i], quality, !i);
     }
 
-    init_get_bits(&gb, buf + HEADER_SIZE, (buf_size - HEADER_SIZE) * 8);
+    if ((ret = init_get_bits8(&gb, buf + HEADER_SIZE, buf_size - HEADER_SIZE)) < 0)
+        return ret;
 
     mb_width  = FFALIGN(width,  16) >> 4;
     mb_height = FFALIGN(height, 16) >> 4;
@@ -652,7 +651,7 @@ static av_cold int mss4_decode_init(AVCodecContext *avctx)
     }
     for (i = 0; i < 3; i++) {
         c->dc_stride[i] = FFALIGN(avctx->width, 16) >> (2 + !!i);
-        c->prev_dc[i]   = av_malloc(sizeof(**c->prev_dc) * c->dc_stride[i]);
+        c->prev_dc[i]   = av_malloc_array(c->dc_stride[i], sizeof(**c->prev_dc));
         if (!c->prev_dc[i]) {
             av_log(avctx, AV_LOG_ERROR, "Cannot allocate buffer\n");
             mss4_free_vlcs(c);
diff --git a/libavcodec/msvideo1.c b/libavcodec/msvideo1.c
index 37ea32d..de048d8 100644
--- a/libavcodec/msvideo1.c
+++ b/libavcodec/msvideo1.c
@@ -2,20 +2,20 @@
  * Microsoft Video-1 Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,10 +62,15 @@ static av_cold int msvideo1_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
+    if (avctx->width < 4 || avctx->height < 4)
+        return AVERROR_INVALIDDATA;
+
     /* figure out the colorspace based on the presence of a palette */
     if (s->avctx->bits_per_coded_sample == 8) {
         s->mode_8bit = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        if (avctx->extradata_size >= AVPALETTE_SIZE)
+            memcpy(s->pal, avctx->extradata, AVPALETTE_SIZE);
     } else {
         s->mode_8bit = 0;
         avctx->pix_fmt = AV_PIX_FMT_RGB555;
@@ -299,17 +304,24 @@ static int msvideo1_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-        return ret;
+    // Discard frame if its smaller than the minimum frame size
+    if (buf_size < (avctx->width/4) * (avctx->height/4) / 512) {
+        av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
+        return AVERROR_INVALIDDATA;
     }
 
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+        return ret;
+
     if (s->mode_8bit) {
-        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
 
-        if (pal) {
+        if (pal && size == AVPALETTE_SIZE) {
             memcpy(s->pal, pal, AVPALETTE_SIZE);
             s->frame->palette_has_changed = 1;
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
     }
 
diff --git a/libavcodec/msvideo1enc.c b/libavcodec/msvideo1enc.c
new file mode 100644
index 0000000..b6ae92b
--- /dev/null
+++ b/libavcodec/msvideo1enc.c
@@ -0,0 +1,305 @@
+/*
+ * Microsoft Video-1 Encoder
+ * Copyright (c) 2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Microsoft Video-1 encoder
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "libavutil/lfg.h"
+#include "elbg.h"
+#include "libavutil/imgutils.h"
+/**
+ * Encoder context
+ */
+typedef struct Msvideo1EncContext {
+    AVCodecContext *avctx;
+    AVLFG rnd;
+    uint8_t *prev;
+
+    int block[16*3];
+    int block2[16*3];
+    int codebook[8*3];
+    int codebook2[8*3];
+    int output[16*3];
+    int output2[16*3];
+    int avg[3];
+    int bestpos;
+    int keyint;
+} Msvideo1EncContext;
+
+enum MSV1Mode{
+    MODE_SKIP = 0,
+    MODE_FILL,
+    MODE_2COL,
+    MODE_8COL,
+};
+
+#define SKIP_PREFIX 0x8400
+#define SKIPS_MAX 0x03FF
+#define MKRGB555(in, off) (((in)[off] << 10) | ((in)[(off) + 1] << 5) | ((in)[(off) + 2]))
+
+static const int remap[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                               const AVFrame *pict, int *got_packet)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+    const AVFrame *p = pict;
+    uint16_t *src;
+    uint8_t *prevptr;
+    uint8_t *dst, *buf;
+    int keyframe = 0;
+    int no_skips = 1;
+    int i, j, k, x, y, ret;
+    int skips = 0;
+    int quality = 24;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*9 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+    dst= buf= pkt->data;
+
+    if(!c->prev)
+        c->prev = av_malloc(avctx->width * 3 * (avctx->height + 3));
+    prevptr = c->prev + avctx->width * 3 * (FFALIGN(avctx->height, 4) - 1);
+    src = (uint16_t*)(p->data[0] + p->linesize[0]*(FFALIGN(avctx->height, 4) - 1));
+    if(c->keyint >= avctx->keyint_min)
+        keyframe = 1;
+
+
+    for(y = 0; y < avctx->height; y += 4){
+        for(x = 0; x < avctx->width; x += 4){
+            int bestmode = MODE_SKIP;
+            int bestscore = INT_MAX;
+            int flags = 0;
+            int score;
+
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    uint16_t val = src[x + i - j*p->linesize[0]/2];
+                    for(k = 0; k < 3; k++){
+                        c->block[(i + j*4)*3 + k] =
+                        c->block2[remap[i + j*4]*3 + k] = (val >> (10-k*5)) & 0x1F;
+                    }
+                }
+            }
+            if(!keyframe){
+                bestscore = 0;
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4*3; i++){
+                        int t = prevptr[x*3 + i - j*3*avctx->width] - c->block[i + j*4*3];
+                        bestscore += t*t;
+                    }
+                }
+                bestscore /= quality;
+            }
+            // try to find optimal value to fill whole 4x4 block
+            score = 0;
+            avpriv_init_elbg(c->block, 3, 16, c->avg, 1, 1, c->output, &c->rnd);
+            avpriv_do_elbg  (c->block, 3, 16, c->avg, 1, 1, c->output, &c->rnd);
+            if(c->avg[0] == 1) // red component = 1 will be written as skip code
+                c->avg[0] = 0;
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->avg[k] - c->block[(i+j*4)*3+k];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 2;
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_FILL;
+            }
+            // search for optimal filling of 2-color block
+            score = 0;
+            avpriv_init_elbg(c->block, 3, 16, c->codebook, 2, 1, c->output, &c->rnd);
+            avpriv_do_elbg  (c->block, 3, 16, c->codebook, 2, 1, c->output, &c->rnd);
+            // last output value should be always 1, swap codebooks if needed
+            if(!c->output[15]){
+                for(i = 0; i < 3; i++)
+                    FFSWAP(uint8_t, c->codebook[i], c->codebook[i+3]);
+                for(i = 0; i < 16; i++)
+                    c->output[i] ^= 1;
+            }
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->codebook[c->output[i+j*4]*3 + k] - c->block[i*3+k+j*4*3];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 6;
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_2COL;
+            }
+            // search for optimal filling of 2-color 2x2 subblocks
+            score = 0;
+            for(i = 0; i < 4; i++){
+                avpriv_init_elbg(c->block2 + i*4*3, 3, 4, c->codebook2 + i*2*3, 2, 1, c->output2 + i*4, &c->rnd);
+                avpriv_do_elbg  (c->block2 + i*4*3, 3, 4, c->codebook2 + i*2*3, 2, 1, c->output2 + i*4, &c->rnd);
+            }
+            // last value should be always 1, swap codebooks if needed
+            if(!c->output2[15]){
+                for(i = 0; i < 3; i++)
+                    FFSWAP(uint8_t, c->codebook2[i+18], c->codebook2[i+21]);
+                for(i = 12; i < 16; i++)
+                    c->output2[i] ^= 1;
+            }
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->codebook2[(c->output2[remap[i+j*4]] + (i&2) + (j&2)*2)*3+k] - c->block[i*3+k + j*4*3];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 18;
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_8COL;
+            }
+
+            if(bestmode == MODE_SKIP){
+                skips++;
+                no_skips = 0;
+            }
+            if((bestmode != MODE_SKIP && skips) || skips == SKIPS_MAX){
+                bytestream_put_le16(&dst, skips | SKIP_PREFIX);
+                skips = 0;
+            }
+
+            switch(bestmode){
+            case MODE_FILL:
+                bytestream_put_le16(&dst, MKRGB555(c->avg,0) | 0x8000);
+                for(j = 0; j < 4; j++)
+                    for(i = 0; i < 4; i++)
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->avg[k];
+                break;
+            case MODE_2COL:
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4; i++){
+                        flags |= (c->output[i + j*4]^1) << (i + j*4);
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->codebook[c->output[i + j*4]*3 + k];
+                    }
+                }
+                bytestream_put_le16(&dst, flags);
+                bytestream_put_le16(&dst, MKRGB555(c->codebook, 0));
+                bytestream_put_le16(&dst, MKRGB555(c->codebook, 3));
+                break;
+            case MODE_8COL:
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4; i++){
+                        flags |= (c->output2[remap[i + j*4]]^1) << (i + j*4);
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->codebook2[(c->output2[remap[i+j*4]] + (i&2) + (j&2)*2)*3 + k];
+                    }
+                }
+                bytestream_put_le16(&dst, flags);
+                bytestream_put_le16(&dst, MKRGB555(c->codebook2, 0) | 0x8000);
+                for(i = 3; i < 24; i += 3)
+                    bytestream_put_le16(&dst, MKRGB555(c->codebook2, i));
+                break;
+            }
+        }
+        src     -= p->linesize[0] << 1;
+        prevptr -= avctx->width * 3 * 4;
+    }
+    if(skips)
+        bytestream_put_le16(&dst, skips | SKIP_PREFIX);
+    //EOF
+    bytestream_put_byte(&dst, 0);
+    bytestream_put_byte(&dst, 0);
+
+    if(no_skips)
+        keyframe = 1;
+    if(keyframe)
+        c->keyint = 0;
+    else
+        c->keyint++;
+    if (keyframe) pkt->flags |= AV_PKT_FLAG_KEY;
+    pkt->size = dst - buf;
+    *got_packet = 1;
+
+    return 0;
+}
+
+
+/**
+ * init encoder
+ */
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+
+    c->avctx = avctx;
+    if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0) {
+        return -1;
+    }
+    if((avctx->width&3) || (avctx->height&3)){
+        av_log(avctx, AV_LOG_ERROR, "width and height must be multiples of 4\n");
+        return -1;
+    }
+
+    avctx->bits_per_coded_sample = 16;
+
+    c->keyint = avctx->keyint_min;
+    av_lfg_init(&c->rnd, 1);
+
+    return 0;
+}
+
+
+
+/**
+ * Uninit encoder
+ */
+static av_cold int encode_end(AVCodecContext *avctx)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+
+    av_freep(&c->prev);
+
+    return 0;
+}
+
+AVCodec ff_msvideo1_encoder = {
+    .name           = "msvideo1",
+    .long_name = NULL_IF_CONFIG_SMALL("Microsoft Video-1"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MSVIDEO1,
+    .priv_data_size = sizeof(Msvideo1EncContext),
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_end,
+    .pix_fmts = (const enum AVPixelFormat[]){AV_PIX_FMT_RGB555, AV_PIX_FMT_NONE},
+};
diff --git a/libavcodec/mvcdec.c b/libavcodec/mvcdec.c
index 1546bcc..e507674 100644
--- a/libavcodec/mvcdec.c
+++ b/libavcodec/mvcdec.c
@@ -2,20 +2,20 @@
  * Silicon Graphics Motion Video Compressor 1 & 2 decoder
  * Copyright (c) 2012 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,6 @@
 #include "internal.h"
 
 typedef struct MvcContext {
-    AVFrame *frame;
     int vflip;
 } MvcContext;
 
@@ -53,10 +52,6 @@ static av_cold int mvc_decode_init(AVCodecContext *avctx)
 
     avctx->pix_fmt = (avctx->codec_id == AV_CODEC_ID_MVC1) ? AV_PIX_FMT_RGB555
                                                            : AV_PIX_FMT_RGB32;
-    s->frame       = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
-
     s->vflip = avctx->extradata_size >= 9 &&
                !memcmp(avctx->extradata + avctx->extradata_size - 9, "BottomUp", 9);
     return 0;
@@ -231,39 +226,32 @@ static int mvc_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
     MvcContext *s = avctx->priv_data;
+    AVFrame *frame = data;
     GetByteContext gb;
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
     if (avctx->codec_id == AV_CODEC_ID_MVC1)
-        ret = decode_mvc1(avctx, &gb, s->frame->data[0],
-                          avctx->width, avctx->height, s->frame->linesize[0]);
+        ret = decode_mvc1(avctx, &gb, frame->data[0],
+                          avctx->width, avctx->height, frame->linesize[0]);
     else
-        ret = decode_mvc2(avctx, &gb, s->frame->data[0],
-                          avctx->width, avctx->height, s->frame->linesize[0],
+        ret = decode_mvc2(avctx, &gb, frame->data[0],
+                          avctx->width, avctx->height, frame->linesize[0],
                           s->vflip);
     if (ret < 0)
         return ret;
 
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
     *got_frame = 1;
-    if ((ret = av_frame_ref(data, s->frame)) < 0)
-        return ret;
 
     return avpkt->size;
 }
 
-static av_cold int mvc_decode_end(AVCodecContext *avctx)
-{
-    MvcContext *s = avctx->priv_data;
-
-    av_frame_free(&s->frame);
-
-    return 0;
-}
-
 #if CONFIG_MVC1_DECODER
 AVCodec ff_mvc1_decoder = {
     .name           = "mvc1",
@@ -272,7 +260,6 @@ AVCodec ff_mvc1_decoder = {
     .id             = AV_CODEC_ID_MVC1,
     .priv_data_size = sizeof(MvcContext),
     .init           = mvc_decode_init,
-    .close          = mvc_decode_end,
     .decode         = mvc_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
@@ -286,7 +273,6 @@ AVCodec ff_mvc2_decoder = {
     .id             = AV_CODEC_ID_MVC2,
     .priv_data_size = sizeof(MvcContext),
     .init           = mvc_decode_init,
-    .close          = mvc_decode_end,
     .decode         = mvc_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/mwsc.c b/libavcodec/mwsc.c
new file mode 100644
index 0000000..4db7642
--- /dev/null
+++ b/libavcodec/mwsc.c
@@ -0,0 +1,192 @@
+/*
+ * MatchWare Screen Capture Codec decoder
+ *
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+#include <zlib.h>
+
+typedef struct MWSCContext {
+    unsigned int      decomp_size;
+    uint8_t          *decomp_buf;
+    z_stream          zstream;
+    AVFrame          *prev_frame;
+} MWSCContext;
+
+static int rle_uncompress(GetByteContext *gb, PutByteContext *pb, GetByteContext *gbp,
+                          int width, int height, int stride, int pb_linesize, int gbp_linesize)
+{
+    int intra = 1, w = 0;
+
+    bytestream2_seek_p(pb, (height - 1) * pb_linesize, SEEK_SET);
+
+    while (bytestream2_get_bytes_left(gb) > 0) {
+        uint32_t fill = bytestream2_get_le24(gb);
+        unsigned run = bytestream2_get_byte(gb);
+
+        if (run == 0) {
+            run = bytestream2_get_le32(gb);
+            for (int j = 0; j < run; j++, w++) {
+                if (w == width) {
+                    w = 0;
+                    bytestream2_seek_p(pb, -(pb_linesize + stride), SEEK_CUR);
+                }
+                bytestream2_put_le24(pb, fill);
+            }
+        } else if (run == 255) {
+            int pos = bytestream2_tell_p(pb);
+
+            bytestream2_seek(gbp, pos, SEEK_SET);
+            for (int j = 0; j < fill; j++, w++) {
+                if (w == width) {
+                    w = 0;
+                    bytestream2_seek_p(pb, -(pb_linesize + stride), SEEK_CUR);
+                    bytestream2_seek(gbp, -(gbp_linesize + stride), SEEK_CUR);
+                }
+                bytestream2_put_le24(pb, bytestream2_get_le24(gbp));
+            }
+
+            intra = 0;
+        } else {
+            for (int j = 0; j < run; j++, w++) {
+                if (w == width) {
+                    w = 0;
+                    bytestream2_seek_p(pb, -(pb_linesize + stride), SEEK_CUR);
+                }
+                bytestream2_put_le24(pb, fill);
+            }
+        }
+    }
+
+    return intra;
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    MWSCContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    GetByteContext gb;
+    GetByteContext gbp;
+    PutByteContext pb;
+    int ret;
+
+    ret = inflateReset(&s->zstream);
+    if (ret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", ret);
+        return AVERROR_EXTERNAL;
+    }
+    s->zstream.next_in   = buf;
+    s->zstream.avail_in  = buf_size;
+    s->zstream.next_out  = s->decomp_buf;
+    s->zstream.avail_out = s->decomp_size;
+    ret = inflate(&s->zstream, Z_FINISH);
+    if (ret != Z_STREAM_END) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate error: %d\n", ret);
+        return AVERROR_EXTERNAL;
+    }
+
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+
+    bytestream2_init(&gb, s->decomp_buf, s->zstream.total_out);
+    bytestream2_init(&gbp, s->prev_frame->data[0], avctx->height * s->prev_frame->linesize[0]);
+    bytestream2_init_writer(&pb, frame->data[0], avctx->height * frame->linesize[0]);
+
+    frame->key_frame = rle_uncompress(&gb, &pb, &gbp, avctx->width, avctx->height, avctx->width * 3,
+                                      frame->linesize[0], s->prev_frame->linesize[0]);
+
+    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    av_frame_unref(s->prev_frame);
+    if ((ret = av_frame_ref(s->prev_frame, frame)) < 0)
+        return ret;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    MWSCContext *s = avctx->priv_data;
+    int64_t size;
+    int zret;
+
+    avctx->pix_fmt = AV_PIX_FMT_BGR24;
+
+    size = 32LL * avctx->height * avctx->width;
+    if (size >= INT32_MAX)
+        return AVERROR_INVALIDDATA;
+    s->decomp_size = size;
+    if (!(s->decomp_buf = av_malloc(s->decomp_size)))
+        return AVERROR(ENOMEM);
+
+    s->zstream.zalloc = Z_NULL;
+    s->zstream.zfree = Z_NULL;
+    s->zstream.opaque = Z_NULL;
+    zret = inflateInit(&s->zstream);
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate init error: %d\n", zret);
+        return AVERROR_EXTERNAL;
+    }
+
+    s->prev_frame = av_frame_alloc();
+    if (!s->prev_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    MWSCContext *s = avctx->priv_data;
+
+    av_frame_free(&s->prev_frame);
+    av_freep(&s->decomp_buf);
+    s->decomp_size = 0;
+    inflateEnd(&s->zstream);
+
+    return 0;
+}
+
+AVCodec ff_mwsc_decoder = {
+    .name             = "mwsc",
+    .long_name        = NULL_IF_CONFIG_SMALL("MatchWare Screen Capture Codec"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_MWSC,
+    .priv_data_size   = sizeof(MWSCContext),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/mxpegdec.c b/libavcodec/mxpegdec.c
index a8ef6d0..2e3ebe6 100644
--- a/libavcodec/mxpegdec.c
+++ b/libavcodec/mxpegdec.c
@@ -2,20 +2,20 @@
  * MxPEG decoder
  * Copyright (c) 2011 Anatoly Nenashev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,7 @@ static av_cold int mxpeg_decode_end(AVCodecContext *avctx)
     for (i = 0; i < 2; ++i)
         av_frame_free(&s->picture[i]);
 
+    s->bitmask_size = 0;
     av_freep(&s->mxm_bitmask);
     av_freep(&s->completion_bitmask);
 
@@ -105,6 +106,7 @@ static int mxpeg_decode_mxm(MXpegDecodeContext *s,
     }
 
     if (s->bitmask_size != bitmask_size) {
+        s->bitmask_size = 0;
         av_freep(&s->mxm_bitmask);
         s->mxm_bitmask = av_malloc(bitmask_size);
         if (!s->mxm_bitmask) {
@@ -272,11 +274,9 @@ static int mxpeg_decode_frame(AVCodecContext *avctx,
                     }
                     /* use stored SOF data to allocate current picture */
                     av_frame_unref(jpg->picture_ptr);
-                    if (ff_get_buffer(avctx, jpg->picture_ptr,
-                                      AV_GET_BUFFER_FLAG_REF) < 0) {
-                        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                        return AVERROR(ENOMEM);
-                    }
+                    if ((ret = ff_get_buffer(avctx, jpg->picture_ptr,
+                                             AV_GET_BUFFER_FLAG_REF)) < 0)
+                        return ret;
                     jpg->picture_ptr->pict_type = AV_PICTURE_TYPE_P;
                     jpg->picture_ptr->key_frame = 0;
                     jpg->got_picture = 1;
@@ -292,17 +292,15 @@ static int mxpeg_decode_frame(AVCodecContext *avctx,
 
                     /* allocate dummy reference picture if needed */
                     if (!reference_ptr->data[0] &&
-                        ff_get_buffer(avctx, reference_ptr,
-                                      AV_GET_BUFFER_FLAG_REF) < 0) {
-                        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                        return AVERROR(ENOMEM);
-                    }
+                        (ret = ff_get_buffer(avctx, reference_ptr,
+                                             AV_GET_BUFFER_FLAG_REF)) < 0)
+                        return ret;
 
-                    ret = ff_mjpeg_decode_sos(jpg, s->mxm_bitmask, reference_ptr);
+                    ret = ff_mjpeg_decode_sos(jpg, s->mxm_bitmask, s->bitmask_size, reference_ptr);
                     if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
                         return ret;
                 } else {
-                    ret = ff_mjpeg_decode_sos(jpg, NULL, NULL);
+                    ret = ff_mjpeg_decode_sos(jpg, NULL, 0, NULL);
                     if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
                         return ret;
                 }
@@ -346,5 +344,6 @@ AVCodec ff_mxpeg_decoder = {
     .close          = mxpeg_decode_end,
     .decode         = mxpeg_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/nellymoser.c b/libavcodec/nellymoser.c
index 027726e..5ff6583 100644
--- a/libavcodec/nellymoser.c
+++ b/libavcodec/nellymoser.c
@@ -84,7 +84,7 @@ const int16_t ff_nelly_delta_table[32] = {
 
 static inline int signed_shift(int i, int shift) {
     if (shift > 0)
-        return i << shift;
+        return (unsigned)i << shift;
     return i >> -shift;
 }
 
@@ -108,7 +108,7 @@ static int headroom(int *la)
         return 31;
     }
     l = 30 - av_log2(FFABS(*la));
-    *la <<= l;
+    *la *= 1<<l;
     return l;
 }
 
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index 8d9af5f..b0deb79 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -38,8 +38,8 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "fft.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "nellymoser.h"
 #include "sinewin.h"
@@ -48,9 +48,9 @@
 typedef struct NellyMoserDecodeContext {
     AVCodecContext* avctx;
     AVLFG           random_state;
-    BitstreamContext bc;
+    GetBitContext   gb;
     float           scale_bias;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext      imdct_ctx;
     DECLARE_ALIGNED(32, float, imdct_buf)[2][NELLY_BUF_LEN];
     float          *imdct_out;
@@ -67,15 +67,15 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
     int bits[NELLY_BUF_LEN];
     unsigned char v;
 
-    bitstream_init8(&s->bc, block, NELLY_BLOCK_LEN);
+    init_get_bits(&s->gb, block, NELLY_BLOCK_LEN * 8);
 
     bptr = buf;
     pptr = pows;
-    val = ff_nelly_init_table[bitstream_read(&s->bc, 6)];
+    val = ff_nelly_init_table[get_bits(&s->gb, 6)];
     for (i=0 ; i<NELLY_BANDS ; i++) {
         if (i > 0)
-            val += ff_nelly_delta_table[bitstream_read(&s->bc, 5)];
-        pval = -pow(2, val/2048) * s->scale_bias;
+            val += ff_nelly_delta_table[get_bits(&s->gb, 5)];
+        pval = -exp2(val/2048) * s->scale_bias;
         for (j = 0; j < ff_nelly_band_sizes_table[i]; j++) {
             *bptr++ = val;
             *pptr++ = pval;
@@ -88,8 +88,8 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
     for (i = 0; i < 2; i++) {
         aptr = audio + i * NELLY_BUF_LEN;
 
-        bitstream_init8(&s->bc, block, NELLY_BLOCK_LEN);
-        bitstream_skip(&s->bc, NELLY_HEADER_BITS + i * NELLY_DETAIL_BITS);
+        init_get_bits(&s->gb, block, NELLY_BLOCK_LEN * 8);
+        skip_bits_long(&s->gb, NELLY_HEADER_BITS + i*NELLY_DETAIL_BITS);
 
         for (j = 0; j < NELLY_FILL_LEN; j++) {
             if (bits[j] <= 0) {
@@ -97,7 +97,7 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
                 if (av_lfg_get(&s->random_state) & 1)
                     aptr[j] *= -1.0;
             } else {
-                v = bitstream_read(&s->bc, bits[j]);
+                v = get_bits(&s->gb, bits[j]);
                 aptr[j] = ff_nelly_dequantization_table[(1<<bits[j])-1+v]*pows[j];
             }
         }
@@ -105,7 +105,7 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
                (NELLY_BUF_LEN - NELLY_FILL_LEN) * sizeof(float));
 
         s->imdct_ctx.imdct_half(&s->imdct_ctx, s->imdct_out, aptr);
-        s->fdsp.vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
+        s->fdsp->vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
                                    s->imdct_out, ff_sine_128,
                                    NELLY_BUF_LEN / 2);
         FFSWAP(float *, s->imdct_out, s->imdct_prev);
@@ -121,7 +121,9 @@ static av_cold int decode_init(AVCodecContext * avctx) {
     av_lfg_init(&s->random_state, 0);
     ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     s->scale_bias = 1.0/(32768*8);
     avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
@@ -141,16 +143,19 @@ static int decode_tag(AVCodecContext *avctx, void *data,
 {
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
+    const uint8_t *side=av_packet_get_side_data(avpkt, 'F', NULL);
     int buf_size = avpkt->size;
     NellyMoserDecodeContext *s = avctx->priv_data;
     int blocks, i, ret;
     float   *samples_flt;
 
     blocks     = buf_size / NELLY_BLOCK_LEN;
+
     if (blocks <= 0) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
         return AVERROR_INVALIDDATA;
     }
+
     if (buf_size % NELLY_BLOCK_LEN) {
         av_log(avctx, AV_LOG_WARNING, "Leftover bytes: %d.\n",
                buf_size % NELLY_BLOCK_LEN);
@@ -162,13 +167,13 @@ static int decode_tag(AVCodecContext *avctx, void *data,
      * 22050 Hz - 4
      * 44100 Hz - 8
      */
+    if(side && blocks>1 && avctx->sample_rate%11025==0 && (1<<((side[0]>>2)&3)) == blocks)
+        avctx->sample_rate= 11025*(blocks/2);
 
     /* get output buffer */
     frame->nb_samples = NELLY_SAMPLES * blocks;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples_flt = (float *)frame->data[0];
 
     for (i=0 ; i<blocks ; i++) {
@@ -186,6 +191,7 @@ static av_cold int decode_end(AVCodecContext * avctx) {
     NellyMoserDecodeContext *s = avctx->priv_data;
 
     ff_mdct_end(&s->imdct_ctx);
+    av_freep(&s->fdsp);
 
     return 0;
 }
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 9d12081..9d22ac8 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2008 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
  *
  * Generic codec information: libavcodec/nellymoserdec.c
  *
- * Some information also from: http://samples.libav.org/A-codecs/Nelly_Moser/ASAO/ASAO.zip
+ * Some information also from: http://samples.mplayerhq.hu/A-codecs/Nelly_Moser/ASAO/ASAO.zip
  *                             (Copyright Joseph Artsimovich and UAB "DKD")
  *
  * for more information about nellymoser format, visit:
@@ -56,7 +56,7 @@
 typedef struct NellyMoserEncodeContext {
     AVCodecContext  *avctx;
     int             last_frame;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext      mdct_ctx;
     AudioFrameQueue afq;
     DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
@@ -66,7 +66,7 @@ typedef struct NellyMoserEncodeContext {
     uint8_t         (*path)[OPT_SIZE];
 } NellyMoserEncodeContext;
 
-static float pow_table[POW_TABLE_SIZE];     ///< -pow(2, -i / 2048.0 - 3.0);
+static float pow_table[POW_TABLE_SIZE];     ///< pow(2, -i / 2048.0 - 3.0);
 
 static const uint8_t sf_lut[96] = {
      0,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  4,  4,
@@ -122,12 +122,12 @@ static void apply_mdct(NellyMoserEncodeContext *s)
     float *in1 = s->buf + NELLY_BUF_LEN;
     float *in2 = s->buf + 2 * NELLY_BUF_LEN;
 
-    s->fdsp.vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
-    s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
 
-    s->fdsp.vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
-    s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff);
 }
 
@@ -138,10 +138,11 @@ static av_cold int encode_end(AVCodecContext *avctx)
     ff_mdct_end(&s->mdct_ctx);
 
     if (s->avctx->trellis) {
-        av_free(s->opt);
-        av_free(s->path);
+        av_freep(&s->opt);
+        av_freep(&s->path);
     }
     ff_af_queue_close(&s->afq);
+    av_freep(&s->fdsp);
 
     return 0;
 }
@@ -170,12 +171,26 @@ static av_cold int encode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0)
         goto error;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
 
     /* Generate overlap window */
-    ff_sine_window_init(ff_sine_128, 128);
+    ff_init_ff_sine_windows(7);
+    /* faster way of doing
     for (i = 0; i < POW_TABLE_SIZE; i++)
-        pow_table[i] = -pow(2, -i / 2048.0 - 3.0 + POW_TABLE_OFFSET);
+       pow_table[i] = 2^(-i / 2048.0 - 3.0 + POW_TABLE_OFFSET); */
+    pow_table[0] = 1;
+    pow_table[1024] = M_SQRT1_2;
+    for (i = 1; i < 513; i++) {
+        double tmp = exp2(-i / 2048.0);
+        pow_table[i] = tmp;
+        pow_table[1024-i] = M_SQRT1_2 / tmp;
+        pow_table[1024+i] = tmp * M_SQRT1_2;
+        pow_table[2048-i] = 0.5 / tmp;
+    }
 
     if (s->avctx->trellis) {
         s->opt  = av_malloc(NELLY_BANDS * OPT_SIZE * sizeof(float  ));
@@ -231,7 +246,7 @@ static void get_exponent_dynamic(NellyMoserEncodeContext *s, float *cand, int *i
     float  (*opt )[OPT_SIZE] = s->opt ;
     uint8_t(*path)[OPT_SIZE] = s->path;
 
-    for (i = 0; i < OPT_SIZE; i++) {
+    for (i = 0; i < NELLY_BANDS * OPT_SIZE; i++) {
         opt[0][i] = INFINITY;
     }
 
@@ -266,7 +281,7 @@ static void get_exponent_dynamic(NellyMoserEncodeContext *s, float *cand, int *i
                 }
             }
         }
-        assert(c); //FIXME
+        av_assert1(c); //FIXME
     }
 
     best_val = INFINITY;
@@ -303,7 +318,7 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
 
     apply_mdct(s);
 
-    init_put_bits(&pb, output, output_size * 8);
+    init_put_bits(&pb, output, output_size);
 
     i = 0;
     for (band = 0; band < NELLY_BANDS; band++) {
@@ -313,7 +328,7 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
                        + s->mdct_out[i + NELLY_BUF_LEN] * s->mdct_out[i + NELLY_BUF_LEN];
         }
         cand[band] =
-            log(FFMAX(1.0, coeff_sum / (ff_nelly_band_sizes_table[band] << 7))) * 1024.0 / M_LN2;
+            log2(FFMAX(1.0, coeff_sum / (ff_nelly_band_sizes_table[band] << 7))) * 1024.0;
     }
 
     if (s->avctx->trellis) {
@@ -392,10 +407,8 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         s->last_frame = 1;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, NELLY_BLOCK_LEN))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, NELLY_BLOCK_LEN, 0)) < 0)
         return ret;
-    }
     encode_block(s, avpkt->data, avpkt->size);
 
     /* Get the next frame pts/duration */
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index fe952ae..a96ae51 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2010 Mans Rullgard
  * Copyright (c) 2014 James Yu <james.yu@linaro.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/noise_bsf.c b/libavcodec/noise_bsf.c
index 3b41dbf..d79f63b 100644
--- a/libavcodec/noise_bsf.c
+++ b/libavcodec/noise_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,45 +31,50 @@
 typedef struct NoiseContext {
     const AVClass *class;
     int amount;
+    int dropamount;
     unsigned int state;
 } NoiseContext;
 
-static int noise(AVBSFContext *ctx, AVPacket *out)
+static int noise(AVBSFContext *ctx, AVPacket *pkt)
 {
     NoiseContext *s = ctx->priv_data;
-    AVPacket *in;
     int amount = s->amount > 0 ? s->amount : (s->state % 10001 + 1);
     int i, ret = 0;
 
-    ret = ff_bsf_get_packet(ctx, &in);
+    if (amount <= 0)
+        return AVERROR(EINVAL);
+
+    ret = ff_bsf_get_packet_ref(ctx, pkt);
     if (ret < 0)
         return ret;
 
-    ret = av_new_packet(out, in->size);
-    if (ret < 0)
-        goto fail;
+    if (s->dropamount > 0 && s->state % s->dropamount == 0) {
+        s->state++;
+        av_packet_unref(pkt);
+        return AVERROR(EAGAIN);
+    }
 
-    ret = av_packet_copy_props(out, in);
+    ret = av_packet_make_writable(pkt);
     if (ret < 0)
         goto fail;
 
-    memcpy(out->data, in->data, in->size);
-
-    for (i = 0; i < out->size; i++) {
-        s->state += out->data[i] + 1;
+    for (i = 0; i < pkt->size; i++) {
+        s->state += pkt->data[i] + 1;
         if (s->state % amount == 0)
-            out->data[i] = s->state;
+            pkt->data[i] = s->state;
     }
 fail:
     if (ret < 0)
-        av_packet_unref(out);
-    av_packet_free(&in);
+        av_packet_unref(pkt);
+
     return ret;
 }
 
 #define OFFSET(x) offsetof(NoiseContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption options[] = {
-    { "amount", NULL, OFFSET(amount), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX },
+    { "amount", NULL, OFFSET(amount), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "dropamount", NULL, OFFSET(dropamount), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
     { NULL },
 };
 
@@ -82,7 +87,7 @@ static const AVClass noise_class = {
 
 const AVBitStreamFilter ff_noise_bsf = {
     .name           = "noise",
-    .priv_data_size = sizeof(int),
+    .priv_data_size = sizeof(NoiseContext),
     .priv_class     = &noise_class,
     .filter         = noise,
 };
diff --git a/libavcodec/null_bsf.c b/libavcodec/null_bsf.c
index 0fe4f35..24d26df 100644
--- a/libavcodec/null_bsf.c
+++ b/libavcodec/null_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,17 +24,9 @@
 #include "avcodec.h"
 #include "bsf.h"
 
-static int null_filter(AVBSFContext *ctx, AVPacket *out)
+static int null_filter(AVBSFContext *ctx, AVPacket *pkt)
 {
-    AVPacket *in;
-    int ret;
-
-    ret = ff_bsf_get_packet(ctx, &in);
-    if (ret < 0)
-        return ret;
-    av_packet_move_ref(out, in);
-    av_packet_free(&in);
-    return 0;
+    return ff_bsf_get_packet_ref(ctx, pkt);
 }
 
 const AVBitStreamFilter ff_null_bsf = {
diff --git a/libavcodec/nuv.c b/libavcodec/nuv.c
index 92c1fda..75b14bc 100644
--- a/libavcodec/nuv.c
+++ b/libavcodec/nuv.c
@@ -2,25 +2,26 @@
  * NuppelVideo decoder
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <limits.h>
 
 #include "libavutil/bswap.h"
 #include "libavutil/common.h"
@@ -78,7 +79,7 @@ static void copy_frame(AVFrame *f, const uint8_t *src, int width, int height)
     int src_linesize[4];
     av_image_fill_arrays(src_data, src_linesize, src,
                          f->format, width, height, 1);
-    av_image_copy(f->data, f->linesize, src_data, src_linesize,
+    av_image_copy(f->data, f->linesize, (const uint8_t **)src_data, src_linesize,
                   f->format, width, height);
 }
 
@@ -124,23 +125,26 @@ static int codec_reinit(AVCodecContext *avctx, int width, int height,
     if (quality >= 0)
         get_quant_quality(c, quality);
     if (width != c->width || height != c->height) {
-        void *ptr;
+        // also reserve space for a possible additional header
+        int buf_size = height * width * 3 / 2
+                     + FFMAX(AV_LZO_OUTPUT_PADDING, AV_INPUT_BUFFER_PADDING_SIZE)
+                     + RTJPEG_HEADER_SIZE;
+        if (buf_size > INT_MAX/8)
+            return -1;
         if ((ret = av_image_check_size(height, width, 0, avctx)) < 0)
             return ret;
         avctx->width  = c->width  = width;
         avctx->height = c->height = height;
-        ptr = av_fast_realloc(c->decomp_buf, &c->decomp_size,
-                              c->height * c->width * 3 / 2 +
-                              AV_INPUT_BUFFER_PADDING_SIZE +
-                              RTJPEG_HEADER_SIZE);
-        if (!ptr) {
+        av_fast_malloc(&c->decomp_buf, &c->decomp_size,
+                       buf_size);
+        if (!c->decomp_buf) {
             av_log(avctx, AV_LOG_ERROR,
                    "Can't allocate decompression buffer.\n");
             return AVERROR(ENOMEM);
-        } else
-            c->decomp_buf = ptr;
+        }
         ff_rtjpeg_decode_init(&c->rtj, c->width, c->height, c->lq, c->cq);
         av_frame_unref(c->pic);
+        return 1;
     } else if (quality != c->quality)
         ff_rtjpeg_decode_init(&c->rtj, c->width, c->height, c->lq, c->cq);
 
@@ -156,6 +160,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *picture   = data;
     int orig_size      = buf_size;
     int keyframe, ret;
+    int size_change = 0;
+    int minsize = 0;
     int result, init_frame = !avctx->frame_number;
     enum {
         NUV_UNCOMPRESSED  = '0',
@@ -184,7 +190,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return orig_size;
     }
 
-    if (buf[0] != 'V' || buf_size < 12) {
+    if (buf_size < 12 || buf[0] != 'V') {
         av_log(avctx, AV_LOG_ERROR, "not a nuv video frame\n");
         return AVERROR_INVALIDDATA;
     }
@@ -193,6 +199,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     case NUV_RTJPEG_IN_LZO:
     case NUV_RTJPEG:
         keyframe = !buf[2];
+        if (c->width < 16 || c->height < 16) {
+            return AVERROR_INVALIDDATA;
+        }
         break;
     case NUV_COPY_LAST:
         keyframe = 0;
@@ -201,24 +210,42 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         keyframe = 1;
         break;
     }
+    switch (comptype) {
+    case NUV_UNCOMPRESSED:
+        minsize = c->width * c->height * 3 / 2;
+        break;
+    case NUV_RTJPEG:
+        minsize = c->width/16 * (c->height/16) * 6;
+        break;
+    }
+    if (buf_size < minsize / 4)
+        return AVERROR_INVALIDDATA;
+retry:
     // Skip the rest of the frame header.
     buf       = &buf[12];
     buf_size -= 12;
     if (comptype == NUV_RTJPEG_IN_LZO || comptype == NUV_LZO) {
-        int outlen = c->decomp_size - AV_INPUT_BUFFER_PADDING_SIZE;
+        int outlen = c->decomp_size - FFMAX(AV_INPUT_BUFFER_PADDING_SIZE, AV_LZO_OUTPUT_PADDING);
         int inlen  = buf_size;
         if (av_lzo1x_decode(c->decomp_buf, &outlen, buf, &inlen)) {
             av_log(avctx, AV_LOG_ERROR, "error during lzo decompression\n");
             return AVERROR_INVALIDDATA;
         }
         buf      = c->decomp_buf;
-        buf_size = outlen;
+        buf_size = c->decomp_size - FFMAX(AV_INPUT_BUFFER_PADDING_SIZE, AV_LZO_OUTPUT_PADDING) - outlen;
+        memset(c->decomp_buf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     }
     if (c->codec_frameheader) {
         int w, h, q;
-        if (buf_size < RTJPEG_HEADER_SIZE || buf[4] != RTJPEG_HEADER_SIZE ||
-            buf[5] != RTJPEG_FILE_VERSION) {
-            av_log(avctx, AV_LOG_ERROR, "invalid nuv video frame\n");
+        if (buf_size < RTJPEG_HEADER_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "Too small NUV video frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+        // There seem to exist two variants of this header: one starts with 'V'
+        // and 5 bytes unknown, the other matches current MythTV and is 4 bytes size,
+        // 1 byte header size (== 12), 1 byte version (== 0)
+        if (buf[0] != 'V' && AV_RL16(&buf[4]) != 0x000c) {
+            av_log(avctx, AV_LOG_ERROR, "Unknown secondary frame header (wrong codec_tag?)\n");
             return AVERROR_INVALIDDATA;
         }
         w = AV_RL16(&buf[6]);
@@ -226,22 +253,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         q = buf[10];
         if ((result = codec_reinit(avctx, w, h, q)) < 0)
             return result;
-        if (comptype == NUV_RTJPEG_IN_LZO || comptype == NUV_LZO)
-            buf = c->decomp_buf;
+        if (result) {
+            buf = avpkt->data;
+            buf_size = avpkt->size;
+            size_change = 1;
+            goto retry;
+        }
         buf       = &buf[RTJPEG_HEADER_SIZE];
         buf_size -= RTJPEG_HEADER_SIZE;
     }
 
-    if (keyframe) {
+    if (size_change || keyframe) {
         av_frame_unref(c->pic);
         init_frame = 1;
     }
 
-    result = ff_reget_buffer(avctx, c->pic);
-    if (result < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((result = ff_reget_buffer(avctx, c->pic)) < 0)
         return result;
-    }
     if (init_frame) {
         memset(c->pic->data[0], 0,    avctx->height * c->pic->linesize[0]);
         memset(c->pic->data[1], 0x80, avctx->height * c->pic->linesize[1] / 2);
@@ -259,7 +287,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             av_log(avctx, AV_LOG_ERROR, "uncompressed frame too short\n");
             height = buf_size / c->width / 3 * 2;
         }
-        copy_frame(c->pic, buf, c->width, height);
+        if(height > 0)
+            copy_frame(c->pic, buf, c->width, height);
         break;
     }
     case NUV_RTJPEG_IN_LZO:
@@ -336,4 +365,5 @@ AVCodec ff_nuv_decoder = {
     .close          = decode_end,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
diff --git a/libavcodec/nvdec.c b/libavcodec/nvdec.c
new file mode 100644
index 0000000..b60da24
--- /dev/null
+++ b/libavcodec/nvdec.c
@@ -0,0 +1,660 @@
+/*
+ * HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2016 Anton Khirnov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/common.h"
+#include "libavutil/error.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/pixfmt.h"
+
+#include "avcodec.h"
+#include "decode.h"
+#include "nvdec.h"
+#include "internal.h"
+
+#if !NVDECAPI_CHECK_VERSION(9, 0)
+#define cudaVideoSurfaceFormat_YUV444 2
+#define cudaVideoSurfaceFormat_YUV444_16Bit 3
+#endif
+
+typedef struct NVDECDecoder {
+    CUvideodecoder decoder;
+
+    AVBufferRef *hw_device_ref;
+    CUcontext    cuda_ctx;
+    CUstream     stream;
+
+    CudaFunctions *cudl;
+    CuvidFunctions *cvdl;
+} NVDECDecoder;
+
+typedef struct NVDECFramePool {
+    unsigned int dpb_size;
+    unsigned int nb_allocated;
+} NVDECFramePool;
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(logctx, decoder->cudl, x)
+
+static int map_avcodec_id(enum AVCodecID id)
+{
+    switch (id) {
+    case AV_CODEC_ID_H264:       return cudaVideoCodec_H264;
+    case AV_CODEC_ID_HEVC:       return cudaVideoCodec_HEVC;
+    case AV_CODEC_ID_MJPEG:      return cudaVideoCodec_JPEG;
+    case AV_CODEC_ID_MPEG1VIDEO: return cudaVideoCodec_MPEG1;
+    case AV_CODEC_ID_MPEG2VIDEO: return cudaVideoCodec_MPEG2;
+    case AV_CODEC_ID_MPEG4:      return cudaVideoCodec_MPEG4;
+    case AV_CODEC_ID_VC1:        return cudaVideoCodec_VC1;
+    case AV_CODEC_ID_VP8:        return cudaVideoCodec_VP8;
+    case AV_CODEC_ID_VP9:        return cudaVideoCodec_VP9;
+    case AV_CODEC_ID_WMV3:       return cudaVideoCodec_VC1;
+    }
+    return -1;
+}
+
+static int map_chroma_format(enum AVPixelFormat pix_fmt)
+{
+    int shift_h = 0, shift_v = 0;
+
+    av_pix_fmt_get_chroma_sub_sample(pix_fmt, &shift_h, &shift_v);
+
+    if (shift_h == 1 && shift_v == 1)
+        return cudaVideoChromaFormat_420;
+    else if (shift_h == 1 && shift_v == 0)
+        return cudaVideoChromaFormat_422;
+    else if (shift_h == 0 && shift_v == 0)
+        return cudaVideoChromaFormat_444;
+
+    return -1;
+}
+
+static int nvdec_test_capabilities(NVDECDecoder *decoder,
+                                   CUVIDDECODECREATEINFO *params, void *logctx)
+{
+    int ret;
+    CUVIDDECODECAPS caps = { 0 };
+
+    caps.eCodecType      = params->CodecType;
+    caps.eChromaFormat   = params->ChromaFormat;
+    caps.nBitDepthMinus8 = params->bitDepthMinus8;
+
+    if (!decoder->cvdl->cuvidGetDecoderCaps) {
+        av_log(logctx, AV_LOG_WARNING, "Used Nvidia driver is too old to perform a capability check.\n");
+        av_log(logctx, AV_LOG_WARNING, "The minimum required version is "
+#if defined(_WIN32) || defined(__CYGWIN__)
+            "378.66"
+#else
+            "378.13"
+#endif
+            ". Continuing blind.\n");
+        return 0;
+    }
+
+    ret = CHECK_CU(decoder->cvdl->cuvidGetDecoderCaps(&caps));
+    if (ret < 0)
+        return ret;
+
+    av_log(logctx, AV_LOG_VERBOSE, "NVDEC capabilities:\n");
+    av_log(logctx, AV_LOG_VERBOSE, "format supported: %s, max_mb_count: %d\n",
+           caps.bIsSupported ? "yes" : "no", caps.nMaxMBCount);
+    av_log(logctx, AV_LOG_VERBOSE, "min_width: %d, max_width: %d\n",
+           caps.nMinWidth, caps.nMaxWidth);
+    av_log(logctx, AV_LOG_VERBOSE, "min_height: %d, max_height: %d\n",
+           caps.nMinHeight, caps.nMaxHeight);
+
+    if (!caps.bIsSupported) {
+        av_log(logctx, AV_LOG_ERROR, "Hardware is lacking required capabilities\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (params->ulWidth > caps.nMaxWidth || params->ulWidth < caps.nMinWidth) {
+        av_log(logctx, AV_LOG_ERROR, "Video width %d not within range from %d to %d\n",
+               (int)params->ulWidth, caps.nMinWidth, caps.nMaxWidth);
+        return AVERROR(EINVAL);
+    }
+
+    if (params->ulHeight > caps.nMaxHeight || params->ulHeight < caps.nMinHeight) {
+        av_log(logctx, AV_LOG_ERROR, "Video height %d not within range from %d to %d\n",
+               (int)params->ulHeight, caps.nMinHeight, caps.nMaxHeight);
+        return AVERROR(EINVAL);
+    }
+
+    if ((params->ulWidth * params->ulHeight) / 256 > caps.nMaxMBCount) {
+        av_log(logctx, AV_LOG_ERROR, "Video macroblock count %d exceeds maximum of %d\n",
+               (int)(params->ulWidth * params->ulHeight) / 256, caps.nMaxMBCount);
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static void nvdec_decoder_free(void *opaque, uint8_t *data)
+{
+    NVDECDecoder *decoder = (NVDECDecoder*)data;
+
+    if (decoder->decoder) {
+        void *logctx = decoder->hw_device_ref->data;
+        CUcontext dummy;
+        CHECK_CU(decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx));
+        CHECK_CU(decoder->cvdl->cuvidDestroyDecoder(decoder->decoder));
+        CHECK_CU(decoder->cudl->cuCtxPopCurrent(&dummy));
+    }
+
+    av_buffer_unref(&decoder->hw_device_ref);
+
+    cuvid_free_functions(&decoder->cvdl);
+
+    av_freep(&decoder);
+}
+
+static int nvdec_decoder_create(AVBufferRef **out, AVBufferRef *hw_device_ref,
+                                CUVIDDECODECREATEINFO *params, void *logctx)
+{
+    AVHWDeviceContext  *hw_device_ctx = (AVHWDeviceContext*)hw_device_ref->data;
+    AVCUDADeviceContext *device_hwctx = hw_device_ctx->hwctx;
+
+    AVBufferRef *decoder_ref;
+    NVDECDecoder *decoder;
+
+    CUcontext dummy;
+    int ret;
+
+    decoder = av_mallocz(sizeof(*decoder));
+    if (!decoder)
+        return AVERROR(ENOMEM);
+
+    decoder_ref = av_buffer_create((uint8_t*)decoder, sizeof(*decoder),
+                                   nvdec_decoder_free, NULL, AV_BUFFER_FLAG_READONLY);
+    if (!decoder_ref) {
+        av_freep(&decoder);
+        return AVERROR(ENOMEM);
+    }
+
+    decoder->hw_device_ref = av_buffer_ref(hw_device_ref);
+    if (!decoder->hw_device_ref) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    decoder->cuda_ctx = device_hwctx->cuda_ctx;
+    decoder->cudl = device_hwctx->internal->cuda_dl;
+    decoder->stream = device_hwctx->stream;
+
+    ret = cuvid_load_functions(&decoder->cvdl, logctx);
+    if (ret < 0) {
+        av_log(logctx, AV_LOG_ERROR, "Failed loading nvcuvid.\n");
+        goto fail;
+    }
+
+    ret = CHECK_CU(decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = nvdec_test_capabilities(decoder, params, logctx);
+    if (ret < 0) {
+        CHECK_CU(decoder->cudl->cuCtxPopCurrent(&dummy));
+        goto fail;
+    }
+
+    ret = CHECK_CU(decoder->cvdl->cuvidCreateDecoder(&decoder->decoder, params));
+
+    CHECK_CU(decoder->cudl->cuCtxPopCurrent(&dummy));
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    *out = decoder_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&decoder_ref);
+    return ret;
+}
+
+static AVBufferRef *nvdec_decoder_frame_alloc(void *opaque, int size)
+{
+    NVDECFramePool *pool = opaque;
+    AVBufferRef *ret;
+
+    if (pool->nb_allocated >= pool->dpb_size)
+        return NULL;
+
+    ret = av_buffer_alloc(sizeof(unsigned int));
+    if (!ret)
+        return NULL;
+
+    *(unsigned int*)ret->data = pool->nb_allocated++;
+
+    return ret;
+}
+
+int ff_nvdec_decode_uninit(AVCodecContext *avctx)
+{
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+
+    av_freep(&ctx->bitstream);
+    ctx->bitstream_len       = 0;
+    ctx->bitstream_allocated = 0;
+
+    av_freep(&ctx->slice_offsets);
+    ctx->nb_slices               = 0;
+    ctx->slice_offsets_allocated = 0;
+
+    av_buffer_unref(&ctx->decoder_ref);
+    av_buffer_pool_uninit(&ctx->decoder_pool);
+
+    return 0;
+}
+
+int ff_nvdec_decode_init(AVCodecContext *avctx)
+{
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+
+    NVDECFramePool      *pool;
+    AVHWFramesContext   *frames_ctx;
+    const AVPixFmtDescriptor *sw_desc;
+
+    CUVIDDECODECREATEINFO params = { 0 };
+
+    cudaVideoSurfaceFormat output_format;
+    int cuvid_codec_type, cuvid_chroma_format, chroma_444;
+    int ret = 0;
+
+    sw_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+    if (!sw_desc)
+        return AVERROR_BUG;
+
+    cuvid_codec_type = map_avcodec_id(avctx->codec_id);
+    if (cuvid_codec_type < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported codec ID\n");
+        return AVERROR_BUG;
+    }
+
+    cuvid_chroma_format = map_chroma_format(avctx->sw_pix_fmt);
+    if (cuvid_chroma_format < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported chroma format\n");
+        return AVERROR(ENOSYS);
+    }
+    chroma_444 = ctx->supports_444 && cuvid_chroma_format == cudaVideoChromaFormat_444;
+
+    if (!avctx->hw_frames_ctx) {
+        ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_CUDA);
+        if (ret < 0)
+            return ret;
+    }
+
+    switch (sw_desc->comp[0].depth) {
+    case 8:
+        output_format = chroma_444 ? cudaVideoSurfaceFormat_YUV444 :
+                                     cudaVideoSurfaceFormat_NV12;
+        break;
+    case 10:
+    case 12:
+        output_format = chroma_444 ? cudaVideoSurfaceFormat_YUV444_16Bit :
+                                     cudaVideoSurfaceFormat_P016;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth\n");
+        return AVERROR(ENOSYS);
+    }
+
+    frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+
+    params.ulWidth             = avctx->coded_width;
+    params.ulHeight            = avctx->coded_height;
+    params.ulTargetWidth       = avctx->coded_width;
+    params.ulTargetHeight      = avctx->coded_height;
+    params.bitDepthMinus8      = sw_desc->comp[0].depth - 8;
+    params.OutputFormat        = output_format;
+    params.CodecType           = cuvid_codec_type;
+    params.ChromaFormat        = cuvid_chroma_format;
+    params.ulNumDecodeSurfaces = frames_ctx->initial_pool_size;
+    params.ulNumOutputSurfaces = frames_ctx->initial_pool_size;
+
+    ret = nvdec_decoder_create(&ctx->decoder_ref, frames_ctx->device_ref, &params, avctx);
+    if (ret < 0) {
+        if (params.ulNumDecodeSurfaces > 32) {
+            av_log(avctx, AV_LOG_WARNING, "Using more than 32 (%d) decode surfaces might cause nvdec to fail.\n",
+                   (int)params.ulNumDecodeSurfaces);
+            av_log(avctx, AV_LOG_WARNING, "Try lowering the amount of threads. Using %d right now.\n",
+                   avctx->thread_count);
+        }
+        return ret;
+    }
+
+    pool = av_mallocz(sizeof(*pool));
+    if (!pool) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    pool->dpb_size = frames_ctx->initial_pool_size;
+
+    ctx->decoder_pool = av_buffer_pool_init2(sizeof(int), pool,
+                                             nvdec_decoder_frame_alloc, av_free);
+    if (!ctx->decoder_pool) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    return 0;
+fail:
+    ff_nvdec_decode_uninit(avctx);
+    return ret;
+}
+
+static void nvdec_fdd_priv_free(void *priv)
+{
+    NVDECFrame *cf = priv;
+
+    if (!cf)
+        return;
+
+    av_buffer_unref(&cf->idx_ref);
+    av_buffer_unref(&cf->decoder_ref);
+
+    av_freep(&priv);
+}
+
+static void nvdec_unmap_mapped_frame(void *opaque, uint8_t *data)
+{
+    NVDECFrame *unmap_data = (NVDECFrame*)data;
+    NVDECDecoder *decoder = (NVDECDecoder*)unmap_data->decoder_ref->data;
+    void *logctx = decoder->hw_device_ref->data;
+    CUdeviceptr devptr = (CUdeviceptr)opaque;
+    int ret;
+    CUcontext dummy;
+
+    ret = CHECK_CU(decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx));
+    if (ret < 0)
+        goto finish;
+
+    CHECK_CU(decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr));
+
+    CHECK_CU(decoder->cudl->cuCtxPopCurrent(&dummy));
+
+finish:
+    av_buffer_unref(&unmap_data->idx_ref);
+    av_buffer_unref(&unmap_data->decoder_ref);
+    av_free(unmap_data);
+}
+
+static int nvdec_retrieve_data(void *logctx, AVFrame *frame)
+{
+    FrameDecodeData  *fdd = (FrameDecodeData*)frame->private_ref->data;
+    NVDECFrame        *cf = (NVDECFrame*)fdd->hwaccel_priv;
+    NVDECDecoder *decoder = (NVDECDecoder*)cf->decoder_ref->data;
+
+    AVHWFramesContext *hwctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
+
+    CUVIDPROCPARAMS vpp = { 0 };
+    NVDECFrame *unmap_data = NULL;
+
+    CUcontext dummy;
+    CUdeviceptr devptr;
+
+    unsigned int pitch, i;
+    unsigned int offset = 0;
+    int shift_h = 0, shift_v = 0;
+    int ret = 0;
+
+    vpp.progressive_frame = 1;
+    vpp.output_stream = decoder->stream;
+
+    ret = CHECK_CU(decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(decoder->cvdl->cuvidMapVideoFrame(decoder->decoder,
+                                                     cf->idx, &devptr,
+                                                     &pitch, &vpp));
+    if (ret < 0)
+        goto finish;
+
+    unmap_data = av_mallocz(sizeof(*unmap_data));
+    if (!unmap_data) {
+        ret = AVERROR(ENOMEM);
+        goto copy_fail;
+    }
+
+    frame->buf[1] = av_buffer_create((uint8_t *)unmap_data, sizeof(*unmap_data),
+                                     nvdec_unmap_mapped_frame, (void*)devptr,
+                                     AV_BUFFER_FLAG_READONLY);
+    if (!frame->buf[1]) {
+        ret = AVERROR(ENOMEM);
+        goto copy_fail;
+    }
+
+    unmap_data->idx = cf->idx;
+    unmap_data->idx_ref = av_buffer_ref(cf->idx_ref);
+    unmap_data->decoder_ref = av_buffer_ref(cf->decoder_ref);
+
+    av_pix_fmt_get_chroma_sub_sample(hwctx->sw_format, &shift_h, &shift_v);
+    for (i = 0; frame->linesize[i]; i++) {
+        frame->data[i] = (uint8_t*)(devptr + offset);
+        frame->linesize[i] = pitch;
+        offset += pitch * (frame->height >> (i ? shift_v : 0));
+    }
+
+    goto finish;
+
+copy_fail:
+    if (!frame->buf[1]) {
+        CHECK_CU(decoder->cvdl->cuvidUnmapVideoFrame(decoder->decoder, devptr));
+        av_freep(&unmap_data);
+    } else {
+        av_buffer_unref(&frame->buf[1]);
+    }
+
+finish:
+    CHECK_CU(decoder->cudl->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
+int ff_nvdec_start_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+    FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
+    NVDECFrame *cf = NULL;
+    int ret;
+
+    ctx->bitstream_len = 0;
+    ctx->nb_slices     = 0;
+
+    if (fdd->hwaccel_priv)
+        return 0;
+
+    cf = av_mallocz(sizeof(*cf));
+    if (!cf)
+        return AVERROR(ENOMEM);
+
+    cf->decoder_ref = av_buffer_ref(ctx->decoder_ref);
+    if (!cf->decoder_ref) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    cf->idx_ref = av_buffer_pool_get(ctx->decoder_pool);
+    if (!cf->idx_ref) {
+        av_log(avctx, AV_LOG_ERROR, "No decoder surfaces left\n");
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    cf->idx = *(unsigned int*)cf->idx_ref->data;
+
+    fdd->hwaccel_priv      = cf;
+    fdd->hwaccel_priv_free = nvdec_fdd_priv_free;
+    fdd->post_process      = nvdec_retrieve_data;
+
+    return 0;
+fail:
+    nvdec_fdd_priv_free(cf);
+    return ret;
+
+}
+
+int ff_nvdec_end_frame(AVCodecContext *avctx)
+{
+    NVDECContext     *ctx = avctx->internal->hwaccel_priv_data;
+    NVDECDecoder *decoder = (NVDECDecoder*)ctx->decoder_ref->data;
+    void *logctx          = avctx;
+    CUVIDPICPARAMS    *pp = &ctx->pic_params;
+
+    CUcontext dummy;
+
+    int ret = 0;
+
+    pp->nBitstreamDataLen = ctx->bitstream_len;
+    pp->pBitstreamData    = ctx->bitstream;
+    pp->nNumSlices        = ctx->nb_slices;
+    pp->pSliceDataOffsets = ctx->slice_offsets;
+
+    ret = CHECK_CU(decoder->cudl->cuCtxPushCurrent(decoder->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = CHECK_CU(decoder->cvdl->cuvidDecodePicture(decoder->decoder, &ctx->pic_params));
+    if (ret < 0)
+        goto finish;
+
+finish:
+    CHECK_CU(decoder->cudl->cuCtxPopCurrent(&dummy));
+
+    return ret;
+}
+
+int ff_nvdec_simple_end_frame(AVCodecContext *avctx)
+{
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+    int ret = ff_nvdec_end_frame(avctx);
+    ctx->bitstream = NULL;
+    return ret;
+}
+
+int ff_nvdec_simple_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
+                                 uint32_t size)
+{
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+    void *tmp;
+
+    tmp = av_fast_realloc(ctx->slice_offsets, &ctx->slice_offsets_allocated,
+                          (ctx->nb_slices + 1) * sizeof(*ctx->slice_offsets));
+    if (!tmp)
+        return AVERROR(ENOMEM);
+    ctx->slice_offsets = tmp;
+
+    if (!ctx->bitstream)
+        ctx->bitstream = (uint8_t*)buffer;
+
+    ctx->slice_offsets[ctx->nb_slices] = buffer - ctx->bitstream;
+    ctx->bitstream_len += size;
+    ctx->nb_slices++;
+
+    return 0;
+}
+
+static void nvdec_free_dummy(struct AVHWFramesContext *ctx)
+{
+    av_buffer_pool_uninit(&ctx->pool);
+}
+
+static AVBufferRef *nvdec_alloc_dummy(int size)
+{
+    return av_buffer_create(NULL, 0, NULL, NULL, 0);
+}
+
+int ff_nvdec_frame_params(AVCodecContext *avctx,
+                          AVBufferRef *hw_frames_ctx,
+                          int dpb_size,
+                          int supports_444)
+{
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
+    const AVPixFmtDescriptor *sw_desc;
+    int cuvid_codec_type, cuvid_chroma_format, chroma_444;
+
+    sw_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+    if (!sw_desc)
+        return AVERROR_BUG;
+
+    cuvid_codec_type = map_avcodec_id(avctx->codec_id);
+    if (cuvid_codec_type < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported codec ID\n");
+        return AVERROR_BUG;
+    }
+
+    cuvid_chroma_format = map_chroma_format(avctx->sw_pix_fmt);
+    if (cuvid_chroma_format < 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "Unsupported chroma format\n");
+        return AVERROR(EINVAL);
+    }
+    chroma_444 = supports_444 && cuvid_chroma_format == cudaVideoChromaFormat_444;
+
+    frames_ctx->format            = AV_PIX_FMT_CUDA;
+    frames_ctx->width             = (avctx->coded_width + 1) & ~1;
+    frames_ctx->height            = (avctx->coded_height + 1) & ~1;
+    /*
+     * We add two extra frames to the pool to account for deinterlacing filters
+     * holding onto their frames.
+     */
+    frames_ctx->initial_pool_size = dpb_size + 2;
+
+    frames_ctx->free = nvdec_free_dummy;
+    frames_ctx->pool = av_buffer_pool_init(0, nvdec_alloc_dummy);
+
+    if (!frames_ctx->pool)
+        return AVERROR(ENOMEM);
+
+    switch (sw_desc->comp[0].depth) {
+    case 8:
+        frames_ctx->sw_format = chroma_444 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_NV12;
+        break;
+    case 10:
+        frames_ctx->sw_format = chroma_444 ? AV_PIX_FMT_YUV444P16 : AV_PIX_FMT_P010;
+        break;
+    case 12:
+        frames_ctx->sw_format = chroma_444 ? AV_PIX_FMT_YUV444P16 : AV_PIX_FMT_P016;
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+int ff_nvdec_get_ref_idx(AVFrame *frame)
+{
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+
+    if (!frame || !frame->private_ref)
+        return -1;
+
+    fdd = (FrameDecodeData*)frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+    if (!cf)
+        return -1;
+
+    return cf->idx;
+}
diff --git a/libavcodec/nvdec.h b/libavcodec/nvdec.h
new file mode 100644
index 0000000..09ae8c3
--- /dev/null
+++ b/libavcodec/nvdec.h
@@ -0,0 +1,81 @@
+/*
+ * HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2016 Anton Khirnov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_NVDEC_H
+#define AVCODEC_NVDEC_H
+
+#include "compat/cuda/dynlink_loader.h"
+
+#include <stdint.h>
+
+#include "libavutil/buffer.h"
+#include "libavutil/frame.h"
+
+#include "avcodec.h"
+
+#if defined(NVDECAPI_MAJOR_VERSION) && defined(NVDECAPI_MINOR_VERSION)
+# define NVDECAPI_CHECK_VERSION(major, minor) \
+    ((major) < NVDECAPI_MAJOR_VERSION || ((major) == NVDECAPI_MAJOR_VERSION && (minor) <= NVDECAPI_MINOR_VERSION))
+#else
+/* version macros were added in SDK 8.1 ffnvcodec */
+# define NVDECAPI_CHECK_VERSION(major, minor) \
+    ((major) < 8 || ((major) == 8 && (minor) <= 0))
+#endif
+
+typedef struct NVDECFrame {
+    unsigned int idx;
+    AVBufferRef *idx_ref;
+    AVBufferRef *decoder_ref;
+} NVDECFrame;
+
+typedef struct NVDECContext {
+    CUVIDPICPARAMS pic_params;
+
+    AVBufferPool *decoder_pool;
+
+    AVBufferRef  *decoder_ref;
+
+    uint8_t      *bitstream;
+    int           bitstream_len;
+    unsigned int  bitstream_allocated;
+
+    unsigned     *slice_offsets;
+    int           nb_slices;
+    unsigned int  slice_offsets_allocated;
+
+    int           supports_444;
+} NVDECContext;
+
+int ff_nvdec_decode_init(AVCodecContext *avctx);
+int ff_nvdec_decode_uninit(AVCodecContext *avctx);
+int ff_nvdec_start_frame(AVCodecContext *avctx, AVFrame *frame);
+int ff_nvdec_end_frame(AVCodecContext *avctx);
+int ff_nvdec_simple_end_frame(AVCodecContext *avctx);
+int ff_nvdec_simple_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
+                                 uint32_t size);
+int ff_nvdec_frame_params(AVCodecContext *avctx,
+                          AVBufferRef *hw_frames_ctx,
+                          int dpb_size,
+                          int supports_444);
+int ff_nvdec_get_ref_idx(AVFrame *frame);
+
+#endif /* AVCODEC_NVDEC_H */
diff --git a/libavcodec/cuvid_h264.c b/libavcodec/nvdec_h264.c
index 4f36e92..116bd4f 100644
--- a/libavcodec/cuvid_h264.c
+++ b/libavcodec/nvdec_h264.c
@@ -1,31 +1,30 @@
 /*
- * MPEG-4 Part 10 / AVC / H.264 HW decode acceleration through CUVID
+ * MPEG-4 Part 10 / AVC / H.264 HW decode acceleration through NVDEC
  *
  * Copyright (c) 2016 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <cuviddec.h>
 #include <stdint.h>
 #include <string.h>
 
 #include "avcodec.h"
-#include "cuvid.h"
+#include "nvdec.h"
 #include "decode.h"
 #include "internal.h"
 #include "h264dec.h"
@@ -33,8 +32,8 @@
 static void dpb_add(const H264Context *h, CUVIDH264DPBENTRY *dst, const H264Picture *src,
                     int frame_idx)
 {
-    FrameDecodeData *fdd = (FrameDecodeData*)src->f->opaque_ref->data;
-    const CUVIDFrame *cf = fdd->hwaccel_priv;
+    FrameDecodeData *fdd = (FrameDecodeData*)src->f->private_ref->data;
+    const NVDECFrame *cf = fdd->hwaccel_priv;
 
     dst->PicIdx             = cf ? cf->idx : -1;
     dst->FrameIdx           = frame_idx;
@@ -45,27 +44,27 @@ static void dpb_add(const H264Context *h, CUVIDH264DPBENTRY *dst, const H264Pict
     dst->FieldOrderCnt[1]   = src->field_poc[1];
 }
 
-static int cuvid_h264_start_frame(AVCodecContext *avctx,
+static int nvdec_h264_start_frame(AVCodecContext *avctx,
                                   const uint8_t *buffer, uint32_t size)
 {
     const H264Context *h = avctx->priv_data;
     const PPS *pps = h->ps.pps;
     const SPS *sps = h->ps.sps;
 
-    CUVIDContext       *ctx = avctx->internal->hwaccel_priv_data;
+    NVDECContext       *ctx = avctx->internal->hwaccel_priv_data;
     CUVIDPICPARAMS      *pp = &ctx->pic_params;
     CUVIDH264PICPARAMS *ppc = &pp->CodecSpecific.h264;
     FrameDecodeData *fdd;
-    CUVIDFrame *cf;
+    NVDECFrame *cf;
 
     int i, dpb_size, ret;
 
-    ret = ff_cuvid_start_frame(avctx, h->cur_pic_ptr->f);
+    ret = ff_nvdec_start_frame(avctx, h->cur_pic_ptr->f);
     if (ret < 0)
         return ret;
 
-    fdd = (FrameDecodeData*)h->cur_pic_ptr->f->opaque_ref->data;
-    cf  = (CUVIDFrame*)fdd->hwaccel_priv;
+    fdd = (FrameDecodeData*)h->cur_pic_ptr->f->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
 
     *pp = (CUVIDPICPARAMS) {
         .PicWidthInMbs     = h->mb_width,
@@ -75,7 +74,7 @@ static int cuvid_h264_start_frame(AVCodecContext *avctx,
         .bottom_field_flag = h->picture_structure == PICT_BOTTOM_FIELD,
         .second_field      = FIELD_PICTURE(h) && !h->first_field,
         .ref_pic_flag      = h->nal_ref_idc != 0,
-        .intra_pic_flag    = 0,
+        .intra_pic_flag    = 1,
 
         .CodecSpecific.h264 = {
             .log2_max_frame_num_minus4            = sps->log2_max_frame_num - 4,
@@ -129,10 +128,13 @@ static int cuvid_h264_start_frame(AVCodecContext *avctx,
     return 0;
 }
 
-static int cuvid_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
+static int nvdec_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
                                    uint32_t size)
 {
-    CUVIDContext *ctx = avctx->internal->hwaccel_priv_data;
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS *pp = &ctx->pic_params;
+    const H264Context *h = avctx->priv_data;
+    const H264SliceContext *sl = &h->slice_ctx[0];
     void *tmp;
 
     tmp = av_fast_realloc(ctx->bitstream, &ctx->bitstream_allocated,
@@ -153,25 +155,30 @@ static int cuvid_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
     ctx->bitstream_len += size + 3;
     ctx->nb_slices++;
 
+    if (sl->slice_type != AV_PICTURE_TYPE_I && sl->slice_type != AV_PICTURE_TYPE_SI)
+        pp->intra_pic_flag = 0;
+
     return 0;
 }
 
-static int cuvid_h264_decode_init(AVCodecContext *avctx)
+static int nvdec_h264_frame_params(AVCodecContext *avctx,
+                                   AVBufferRef *hw_frames_ctx)
 {
     const H264Context *h = avctx->priv_data;
     const SPS       *sps = h->ps.sps;
-    return ff_cuvid_decode_init(avctx, sps->ref_frame_count + sps->num_reorder_frames);
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, sps->ref_frame_count + sps->num_reorder_frames, 0);
 }
 
-const AVHWAccel ff_h264_cuvid_hwaccel = {
-    .name                 = "h264_cuvid",
+const AVHWAccel ff_h264_nvdec_hwaccel = {
+    .name                 = "h264_nvdec",
     .type                 = AVMEDIA_TYPE_VIDEO,
     .id                   = AV_CODEC_ID_H264,
     .pix_fmt              = AV_PIX_FMT_CUDA,
-    .start_frame          = cuvid_h264_start_frame,
-    .end_frame            = ff_cuvid_end_frame,
-    .decode_slice         = cuvid_h264_decode_slice,
-    .init                 = cuvid_h264_decode_init,
-    .uninit               = ff_cuvid_decode_uninit,
-    .priv_data_size       = sizeof(CUVIDContext),
+    .start_frame          = nvdec_h264_start_frame,
+    .end_frame            = ff_nvdec_end_frame,
+    .decode_slice         = nvdec_h264_decode_slice,
+    .frame_params         = nvdec_h264_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
 };
diff --git a/libavcodec/cuvid_hevc.c b/libavcodec/nvdec_hevc.c
index fcf20bb..590278b 100644
--- a/libavcodec/cuvid_hevc.c
+++ b/libavcodec/nvdec_hevc.c
@@ -1,31 +1,30 @@
 /*
- * HEVC HW decode acceleration through CUVID
+ * HEVC HW decode acceleration through NVDEC
  *
  * Copyright (c) 2017 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <cuviddec.h>
 #include <stdint.h>
 #include <string.h>
 
 #include "avcodec.h"
-#include "cuvid.h"
+#include "nvdec.h"
 #include "decode.h"
 #include "internal.h"
 #include "hevcdec.h"
@@ -33,8 +32,8 @@
 
 static void dpb_add(CUVIDHEVCPICPARAMS *pp, int idx, const HEVCFrame *src)
 {
-    FrameDecodeData *fdd = (FrameDecodeData*)src->frame->opaque_ref->data;
-    const CUVIDFrame *cf = fdd->hwaccel_priv;
+    FrameDecodeData *fdd = (FrameDecodeData*)src->frame->private_ref->data;
+    const NVDECFrame *cf = fdd->hwaccel_priv;
 
     pp->RefPicIdx[idx]      = cf ? cf->idx : -1;
     pp->PicOrderCntVal[idx] = src->poc;
@@ -59,42 +58,43 @@ static void fill_scaling_lists(CUVIDHEVCPICPARAMS *ppc, const HEVCContext *s)
             ppc->ScalingList16x16[i][j] = sl->sl[2][i][pos];
 
             if (i < 2)
-                ppc->ScalingList32x32[i][j] = sl->sl[3][i][pos];
+                ppc->ScalingList32x32[i][j] = sl->sl[3][i * 3][pos];
         }
-    }
 
-    memcpy(ppc->ScalingListDCCoeff16x16, sl->sl_dc[0], sizeof(ppc->ScalingListDCCoeff16x16));
-    memcpy(ppc->ScalingListDCCoeff32x32, sl->sl_dc[1], sizeof(ppc->ScalingListDCCoeff32x32));
+        ppc->ScalingListDCCoeff16x16[i] = sl->sl_dc[0][i];
+        if (i < 2)
+            ppc->ScalingListDCCoeff32x32[i] = sl->sl_dc[1][i * 3];
+    }
 }
 
-static int cuvid_hevc_start_frame(AVCodecContext *avctx,
+static int nvdec_hevc_start_frame(AVCodecContext *avctx,
                                   const uint8_t *buffer, uint32_t size)
 {
     const HEVCContext *s = avctx->priv_data;
     const HEVCPPS *pps = s->ps.pps;
     const HEVCSPS *sps = s->ps.sps;
 
-    CUVIDContext       *ctx = avctx->internal->hwaccel_priv_data;
+    NVDECContext       *ctx = avctx->internal->hwaccel_priv_data;
     CUVIDPICPARAMS      *pp = &ctx->pic_params;
     CUVIDHEVCPICPARAMS *ppc = &pp->CodecSpecific.hevc;
     FrameDecodeData *fdd;
-    CUVIDFrame *cf;
+    NVDECFrame *cf;
 
     int i, j, dpb_size, ret;
 
-    ret = ff_cuvid_start_frame(avctx, s->ref->frame);
+    ret = ff_nvdec_start_frame(avctx, s->ref->frame);
     if (ret < 0)
         return ret;
 
-    fdd = (FrameDecodeData*)s->ref->frame->opaque_ref->data;
-    cf  = (CUVIDFrame*)fdd->hwaccel_priv;
+    fdd = (FrameDecodeData*)s->ref->frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
 
     *pp = (CUVIDPICPARAMS) {
         .PicWidthInMbs     = sps->width  / 16,
         .FrameHeightInMbs  = sps->height / 16,
         .CurrPicIdx        = cf->idx,
         .ref_pic_flag      = 1,
-        .intra_pic_flag    = 0,
+        .intra_pic_flag    = IS_IRAP(s),
 
         .CodecSpecific.hevc = {
             .pic_width_in_luma_samples                    = sps->width,
@@ -108,6 +108,12 @@ static int cuvid_hevc_start_frame(AVCodecContext *avctx,
             .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
             .pcm_sample_bit_depth_luma_minus1             = sps->pcm_enabled_flag ? sps->pcm.bit_depth - 1 : 0,
             .pcm_sample_bit_depth_chroma_minus1           = sps->pcm_enabled_flag ? sps->pcm.bit_depth_chroma - 1 : 0,
+#if NVDECAPI_CHECK_VERSION(8, 1)
+            .log2_max_transform_skip_block_size_minus2    = pps->log2_max_transform_skip_block_size - 2,
+            .log2_sao_offset_scale_luma                   = pps->log2_sao_offset_scale_luma,
+            .log2_sao_offset_scale_chroma                 = pps->log2_sao_offset_scale_chroma,
+            .high_precision_offsets_enabled_flag          = sps->high_precision_offsets_enabled_flag,
+#endif
             .pcm_loop_filter_disabled_flag                = sps->pcm.loop_filter_disable_flag,
             .strong_intra_smoothing_enabled_flag          = sps->sps_strong_intra_smoothing_enable_flag,
             .max_transform_hierarchy_depth_intra          = sps->max_transform_hierarchy_depth_intra,
@@ -125,6 +131,17 @@ static int cuvid_hevc_start_frame(AVCodecContext *avctx,
             .IdrPicFlag                                   = IS_IDR(s),
             .bit_depth_luma_minus8                        = sps->bit_depth - 8,
             .bit_depth_chroma_minus8                      = sps->bit_depth - 8,
+#if NVDECAPI_CHECK_VERSION(9, 0)
+            .sps_range_extension_flag                     = sps->sps_range_extension_flag,
+            .transform_skip_rotation_enabled_flag         = sps->transform_skip_rotation_enabled_flag,
+            .transform_skip_context_enabled_flag          = sps->transform_skip_context_enabled_flag,
+            .implicit_rdpcm_enabled_flag                  = sps->implicit_rdpcm_enabled_flag,
+            .explicit_rdpcm_enabled_flag                  = sps->explicit_rdpcm_enabled_flag,
+            .extended_precision_processing_flag           = sps->extended_precision_processing_flag,
+            .intra_smoothing_disabled_flag                = sps->intra_smoothing_disabled_flag,
+            .persistent_rice_adaptation_enabled_flag      = sps->persistent_rice_adaptation_enabled_flag,
+            .cabac_bypass_alignment_enabled_flag          = sps->cabac_bypass_alignment_enabled_flag,
+#endif
 
             .dependent_slice_segments_enabled_flag        = pps->dependent_slice_segments_enabled_flag,
             .slice_segment_header_extension_present_flag  = pps->slice_header_extension_present_flag,
@@ -158,11 +175,17 @@ static int cuvid_hevc_start_frame(AVCodecContext *avctx,
             .uniform_spacing_flag                         = pps->uniform_spacing_flag,
             .num_tile_columns_minus1                      = pps->num_tile_columns - 1,
             .num_tile_rows_minus1                         = pps->num_tile_rows - 1,
+#if NVDECAPI_CHECK_VERSION(9, 0)
+            .pps_range_extension_flag                     = pps->pps_range_extensions_flag,
+            .cross_component_prediction_enabled_flag      = pps->cross_component_prediction_enabled_flag,
+            .chroma_qp_offset_list_enabled_flag           = pps->chroma_qp_offset_list_enabled_flag,
+            .diff_cu_chroma_qp_offset_depth               = pps->diff_cu_chroma_qp_offset_depth,
+            .chroma_qp_offset_list_len_minus1             = pps->chroma_qp_offset_list_len_minus1,
+#endif
 
             .NumBitsForShortTermRPSInSlice                = s->sh.short_term_rps ? s->sh.short_term_ref_pic_set_size : 0,
             .NumDeltaPocsOfRefRpsIdx                      = s->sh.short_term_rps ? s->sh.short_term_rps->rps_idx_num_delta_pocs : 0,
-            .NumPocTotalCurr                              = s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
-                                                            s->rps[LT_CURR].nb_refs,
+            .NumPocTotalCurr                              = ff_hevc_frame_nb_refs(s),
             .NumPocStCurrBefore                           = s->rps[ST_CURR_BEF].nb_refs,
             .NumPocStCurrAfter                            = s->rps[ST_CURR_AFT].nb_refs,
             .NumPocLtCurr                                 = s->rps[LT_CURR].nb_refs,
@@ -180,6 +203,18 @@ static int cuvid_hevc_start_frame(AVCodecContext *avctx,
     for (i = 0; i < pps->num_tile_rows; i++)
         ppc->row_height_minus1[i] = pps->row_height[i] - 1;
 
+#if NVDECAPI_CHECK_VERSION(9, 0)
+    if (pps->chroma_qp_offset_list_len_minus1 > FF_ARRAY_ELEMS(ppc->cb_qp_offset_list) ||
+        pps->chroma_qp_offset_list_len_minus1 > FF_ARRAY_ELEMS(ppc->cr_qp_offset_list)) {
+        av_log(avctx, AV_LOG_ERROR, "Too many chroma_qp_offsets\n");
+        return AVERROR(ENOSYS);
+    }
+    for (i = 0; i <= pps->chroma_qp_offset_list_len_minus1; i++) {
+        ppc->cb_qp_offset_list[i] = pps->cb_qp_offset_list[i];
+        ppc->cr_qp_offset_list[i] = pps->cr_qp_offset_list[i];
+    }
+#endif
+
     if (s->rps[LT_CURR].nb_refs     > FF_ARRAY_ELEMS(ppc->RefPicSetLtCurr)       ||
         s->rps[ST_CURR_BEF].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetStCurrBefore) ||
         s->rps[ST_CURR_AFT].nb_refs > FF_ARRAY_ELEMS(ppc->RefPicSetStCurrAfter)) {
@@ -232,10 +267,10 @@ static int cuvid_hevc_start_frame(AVCodecContext *avctx,
     return 0;
 }
 
-static int cuvid_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
+static int nvdec_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
                                    uint32_t size)
 {
-    CUVIDContext *ctx = avctx->internal->hwaccel_priv_data;
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
     void *tmp;
 
     tmp = av_fast_realloc(ctx->bitstream, &ctx->bitstream_allocated,
@@ -259,22 +294,30 @@ static int cuvid_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
     return 0;
 }
 
-static int cuvid_hevc_decode_init(AVCodecContext *avctx)
+static int nvdec_hevc_frame_params(AVCodecContext *avctx,
+                                   AVBufferRef *hw_frames_ctx)
 {
     const HEVCContext *s = avctx->priv_data;
     const HEVCSPS *sps = s->ps.sps;
-    return ff_cuvid_decode_init(avctx, sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + 1);
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + 1, 1);
+}
+
+static int nvdec_hevc_decode_init(AVCodecContext *avctx) {
+    NVDECContext *ctx = avctx->internal->hwaccel_priv_data;
+    ctx->supports_444 = 1;
+    return ff_nvdec_decode_init(avctx);
 }
 
-const AVHWAccel ff_hevc_cuvid_hwaccel = {
-    .name                 = "hevc_cuvid",
+const AVHWAccel ff_hevc_nvdec_hwaccel = {
+    .name                 = "hevc_nvdec",
     .type                 = AVMEDIA_TYPE_VIDEO,
     .id                   = AV_CODEC_ID_HEVC,
     .pix_fmt              = AV_PIX_FMT_CUDA,
-    .start_frame          = cuvid_hevc_start_frame,
-    .end_frame            = ff_cuvid_end_frame,
-    .decode_slice         = cuvid_hevc_decode_slice,
-    .init                 = cuvid_hevc_decode_init,
-    .uninit               = ff_cuvid_decode_uninit,
-    .priv_data_size       = sizeof(CUVIDContext),
+    .start_frame          = nvdec_hevc_start_frame,
+    .end_frame            = ff_nvdec_end_frame,
+    .decode_slice         = nvdec_hevc_decode_slice,
+    .frame_params         = nvdec_hevc_frame_params,
+    .init                 = nvdec_hevc_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
 };
diff --git a/libavcodec/nvdec_mjpeg.c b/libavcodec/nvdec_mjpeg.c
new file mode 100644
index 0000000..be39d23
--- /dev/null
+++ b/libavcodec/nvdec_mjpeg.c
@@ -0,0 +1,86 @@
+/*
+ * MJPEG HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2017 Philip Langdale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mjpegdec.h"
+#include "nvdec.h"
+#include "decode.h"
+
+static int nvdec_mjpeg_start_frame(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    MJpegDecodeContext *s = avctx->priv_data;
+
+    NVDECContext      *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS     *pp = &ctx->pic_params;
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+    AVFrame *cur_frame = s->picture;
+
+    int ret;
+
+    ret = ff_nvdec_start_frame(avctx, cur_frame);
+    if (ret < 0)
+        return ret;
+
+    fdd = (FrameDecodeData*)cur_frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+
+    *pp = (CUVIDPICPARAMS) {
+        .PicWidthInMbs     = (cur_frame->width  + 15) / 16,
+        .FrameHeightInMbs  = (cur_frame->height + 15) / 16,
+        .CurrPicIdx        = cf->idx,
+
+        .intra_pic_flag    = 1,
+        .ref_pic_flag      = 0,
+    };
+
+    return ff_nvdec_simple_decode_slice(avctx, buffer, size);
+}
+
+static int nvdec_mjpeg_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    return 0;
+}
+
+static int nvdec_mjpeg_frame_params(AVCodecContext *avctx,
+                                  AVBufferRef *hw_frames_ctx)
+{
+    // Only need storage for the current frame
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, 1, 0);
+}
+
+#if CONFIG_MJPEG_NVDEC_HWACCEL
+AVHWAccel ff_mjpeg_nvdec_hwaccel = {
+    .name                 = "mjpeg_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_MJPEG,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_mjpeg_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = nvdec_mjpeg_decode_slice,
+    .frame_params         = nvdec_mjpeg_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
+#endif
diff --git a/libavcodec/nvdec_mpeg12.c b/libavcodec/nvdec_mpeg12.c
new file mode 100644
index 0000000..300e1d3
--- /dev/null
+++ b/libavcodec/nvdec_mpeg12.c
@@ -0,0 +1,123 @@
+/*
+ * MPEG-1/2 HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2017 Philip Langdale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "mpegvideo.h"
+#include "nvdec.h"
+#include "decode.h"
+
+static int nvdec_mpeg12_start_frame(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    MpegEncContext *s = avctx->priv_data;
+
+    NVDECContext      *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS     *pp = &ctx->pic_params;
+    CUVIDMPEG2PICPARAMS *ppc = &pp->CodecSpecific.mpeg2;
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+    AVFrame *cur_frame = s->current_picture.f;
+
+    int ret, i;
+
+    ret = ff_nvdec_start_frame(avctx, cur_frame);
+    if (ret < 0)
+        return ret;
+
+    fdd = (FrameDecodeData*)cur_frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+
+    *pp = (CUVIDPICPARAMS) {
+        .PicWidthInMbs     = (cur_frame->width  + 15) / 16,
+        .FrameHeightInMbs  = (cur_frame->height + 15) / 16,
+        .CurrPicIdx        = cf->idx,
+
+        .intra_pic_flag    = s->pict_type == AV_PICTURE_TYPE_I,
+        .ref_pic_flag      = s->pict_type == AV_PICTURE_TYPE_I ||
+                             s->pict_type == AV_PICTURE_TYPE_P,
+
+        .CodecSpecific.mpeg2 = {
+            .ForwardRefIdx     = ff_nvdec_get_ref_idx(s->last_picture.f),
+            .BackwardRefIdx    = ff_nvdec_get_ref_idx(s->next_picture.f),
+
+            .picture_coding_type        = s->pict_type,
+            .full_pel_forward_vector    = s->full_pel[0],
+            .full_pel_backward_vector   = s->full_pel[1],
+            .f_code                     = { { s->mpeg_f_code[0][0],
+                                              s->mpeg_f_code[0][1] },
+                                            { s->mpeg_f_code[1][0],
+                                              s->mpeg_f_code[1][1] } },
+            .intra_dc_precision         = s->intra_dc_precision,
+            .frame_pred_frame_dct       = s->frame_pred_frame_dct,
+            .concealment_motion_vectors = s->concealment_motion_vectors,
+            .q_scale_type               = s->q_scale_type,
+            .intra_vlc_format           = s->intra_vlc_format,
+            .alternate_scan             = s->alternate_scan,
+            .top_field_first            = s->top_field_first,
+        }
+    };
+
+    for (i = 0; i < 64; ++i) {
+        ppc->QuantMatrixIntra[i] = s->intra_matrix[i];
+        ppc->QuantMatrixInter[i] = s->inter_matrix[i];
+    }
+
+    return 0;
+}
+
+static int nvdec_mpeg12_frame_params(AVCodecContext *avctx,
+                                  AVBufferRef *hw_frames_ctx)
+{
+    // Each frame can at most have one P and one B reference
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, 2, 0);
+}
+
+#if CONFIG_MPEG2_NVDEC_HWACCEL
+const AVHWAccel ff_mpeg2_nvdec_hwaccel = {
+    .name                 = "mpeg2_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_mpeg12_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = ff_nvdec_simple_decode_slice,
+    .frame_params         = nvdec_mpeg12_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
+#endif
+
+#if CONFIG_MPEG1_NVDEC_HWACCEL
+const AVHWAccel ff_mpeg1_nvdec_hwaccel = {
+    .name                 = "mpeg1_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_mpeg12_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = ff_nvdec_simple_decode_slice,
+    .frame_params         = nvdec_mpeg12_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
+#endif
diff --git a/libavcodec/nvdec_mpeg4.c b/libavcodec/nvdec_mpeg4.c
new file mode 100644
index 0000000..739b049
--- /dev/null
+++ b/libavcodec/nvdec_mpeg4.c
@@ -0,0 +1,121 @@
+/*
+ * MPEG-4 Part 2 HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2017 Philip Langdale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "mpeg4video.h"
+#include "nvdec.h"
+#include "decode.h"
+
+static int nvdec_mpeg4_start_frame(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    Mpeg4DecContext *m = avctx->priv_data;
+    MpegEncContext *s = &m->m;
+
+    NVDECContext      *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS     *pp = &ctx->pic_params;
+    CUVIDMPEG4PICPARAMS *ppc = &pp->CodecSpecific.mpeg4;
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+    AVFrame *cur_frame = s->current_picture.f;
+
+    int ret, i;
+
+    ret = ff_nvdec_start_frame(avctx, cur_frame);
+    if (ret < 0)
+        return ret;
+
+    fdd = (FrameDecodeData*)cur_frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+
+    *pp = (CUVIDPICPARAMS) {
+        .PicWidthInMbs     = (cur_frame->width  + 15) / 16,
+        .FrameHeightInMbs  = (cur_frame->height + 15) / 16,
+        .CurrPicIdx        = cf->idx,
+
+        .intra_pic_flag    = s->pict_type == AV_PICTURE_TYPE_I,
+        .ref_pic_flag      = s->pict_type == AV_PICTURE_TYPE_I ||
+                             s->pict_type == AV_PICTURE_TYPE_P ||
+                             s->pict_type == AV_PICTURE_TYPE_S,
+
+        .CodecSpecific.mpeg4 = {
+            .ForwardRefIdx                = ff_nvdec_get_ref_idx(s->last_picture.f),
+            .BackwardRefIdx               = ff_nvdec_get_ref_idx(s->next_picture.f),
+
+            .video_object_layer_width     = s->width,
+            .video_object_layer_height    = s->height,
+            .vop_time_increment_bitcount  = m->time_increment_bits,
+            .top_field_first              = s->top_field_first,
+            .resync_marker_disable        = !m->resync_marker,
+            .quant_type                   = s->mpeg_quant,
+            .quarter_sample               = s->quarter_sample,
+            .short_video_header           = avctx->codec->id == AV_CODEC_ID_H263,
+            .divx_flags                   = s->divx_packed ? 5 : 0,
+
+            .vop_coding_type              = s->pict_type - AV_PICTURE_TYPE_I,
+            .vop_coded                    = 1,
+            .vop_rounding_type            = s->no_rounding,
+            .alternate_vertical_scan_flag = s->alternate_scan,
+            .interlaced                   = !s->progressive_sequence,
+            .vop_fcode_forward            = s->f_code,
+            .vop_fcode_backward           = s->b_code,
+            .trd                          = { s->pp_time, s->pp_field_time >> 1 },
+            .trb                          = { s->pb_time, s->pb_field_time >> 1 },
+
+            .gmc_enabled                  = s->pict_type == AV_PICTURE_TYPE_S &&
+                                            m->vol_sprite_usage == GMC_SPRITE,
+        }
+    };
+
+    for (i = 0; i < 64; ++i) {
+        ppc->QuantMatrixIntra[i] = s->intra_matrix[i];
+        ppc->QuantMatrixInter[i] = s->inter_matrix[i];
+    }
+
+    // We need to pass the full frame buffer and not just the slice
+    return ff_nvdec_simple_decode_slice(avctx, buffer, size);
+}
+
+static int nvdec_mpeg4_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    return 0;
+}
+
+static int nvdec_mpeg4_frame_params(AVCodecContext *avctx,
+                                  AVBufferRef *hw_frames_ctx)
+{
+    // Each frame can at most have one P and one B reference
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, 2, 0);
+}
+
+const AVHWAccel ff_mpeg4_nvdec_hwaccel = {
+    .name                 = "mpeg4_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_MPEG4,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_mpeg4_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = nvdec_mpeg4_decode_slice,
+    .frame_params         = nvdec_mpeg4_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
diff --git a/libavcodec/nvdec_vc1.c b/libavcodec/nvdec_vc1.c
new file mode 100644
index 0000000..10e7b5a
--- /dev/null
+++ b/libavcodec/nvdec_vc1.c
@@ -0,0 +1,141 @@
+/*
+ * VC1 HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2017 Philip Langdale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "nvdec.h"
+#include "decode.h"
+#include "vc1.h"
+
+static int nvdec_vc1_start_frame(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    VC1Context *v = avctx->priv_data;
+    MpegEncContext *s = &v->s;
+
+    NVDECContext      *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS     *pp = &ctx->pic_params;
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+    AVFrame *cur_frame = s->current_picture.f;
+
+    int ret;
+
+    ret = ff_nvdec_start_frame(avctx, cur_frame);
+    if (ret < 0)
+        return ret;
+
+    fdd = (FrameDecodeData*)cur_frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+
+    *pp = (CUVIDPICPARAMS) {
+        .PicWidthInMbs     = (cur_frame->width  + 15) / 16,
+        .FrameHeightInMbs  = (cur_frame->height + 15) / 16,
+        .CurrPicIdx        = cf->idx,
+        .field_pic_flag    = v->field_mode,
+        .bottom_field_flag = v->cur_field_type,
+        .second_field      = v->second_field,
+
+        .intra_pic_flag    = s->pict_type == AV_PICTURE_TYPE_I ||
+                             s->pict_type == AV_PICTURE_TYPE_BI,
+        .ref_pic_flag      = s->pict_type == AV_PICTURE_TYPE_I ||
+                             s->pict_type == AV_PICTURE_TYPE_P,
+
+        .CodecSpecific.vc1 = {
+            .ForwardRefIdx     = ff_nvdec_get_ref_idx(s->last_picture.f),
+            .BackwardRefIdx    = ff_nvdec_get_ref_idx(s->next_picture.f),
+            .FrameWidth        = cur_frame->width,
+            .FrameHeight       = cur_frame->height,
+
+            .intra_pic_flag    = s->pict_type == AV_PICTURE_TYPE_I ||
+                                 s->pict_type == AV_PICTURE_TYPE_BI,
+            .ref_pic_flag      = s->pict_type == AV_PICTURE_TYPE_I ||
+                                 s->pict_type == AV_PICTURE_TYPE_P,
+            .progressive_fcm   = v->fcm == 0,
+
+            .profile           = v->profile,
+            .postprocflag      = v->postprocflag,
+            .pulldown          = v->broadcast,
+            .interlace         = v->interlace,
+            .tfcntrflag        = v->tfcntrflag,
+            .finterpflag       = v->finterpflag,
+            .psf               = v->psf,
+            .multires          = v->multires,
+            .syncmarker        = v->resync_marker,
+            .rangered          = v->rangered,
+            .maxbframes        = s->max_b_frames,
+
+            .panscan_flag      = v->panscanflag,
+            .refdist_flag      = v->refdist_flag,
+            .extended_mv       = v->extended_mv,
+            .dquant            = v->dquant,
+            .vstransform       = v->vstransform,
+            .loopfilter        = v->s.loop_filter,
+            .fastuvmc          = v->fastuvmc,
+            .overlap           = v->overlap,
+            .quantizer         = v->quantizer_mode,
+            .extended_dmv      = v->extended_dmv,
+            .range_mapy_flag   = v->range_mapy_flag,
+            .range_mapy        = v->range_mapy,
+            .range_mapuv_flag  = v->range_mapuv_flag,
+            .range_mapuv       = v->range_mapuv,
+            .rangeredfrm       = v->rangeredfrm,
+        }
+    };
+
+    return 0;
+}
+
+static int nvdec_vc1_frame_params(AVCodecContext *avctx,
+                                  AVBufferRef *hw_frames_ctx)
+{
+    // Each frame can at most have one P and one B reference
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, 2, 0);
+}
+
+const AVHWAccel ff_vc1_nvdec_hwaccel = {
+    .name                 = "vc1_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_VC1,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_vc1_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = ff_nvdec_simple_decode_slice,
+    .frame_params         = nvdec_vc1_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
+
+#if CONFIG_WMV3_NVDEC_HWACCEL
+const AVHWAccel ff_wmv3_nvdec_hwaccel = {
+    .name                 = "wmv3_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_WMV3,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_vc1_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = ff_nvdec_simple_decode_slice,
+    .frame_params         = nvdec_vc1_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
+#endif
diff --git a/libavcodec/nvdec_vp8.c b/libavcodec/nvdec_vp8.c
new file mode 100644
index 0000000..9c4608d
--- /dev/null
+++ b/libavcodec/nvdec_vp8.c
@@ -0,0 +1,105 @@
+/*
+ * VP8 HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2017 Philip Langdale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "nvdec.h"
+#include "decode.h"
+#include "internal.h"
+#include "vp8.h"
+
+static unsigned char safe_get_ref_idx(VP8Frame *frame)
+{
+    return frame ? ff_nvdec_get_ref_idx(frame->tf.f) : 255;
+}
+
+static int nvdec_vp8_start_frame(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    VP8Context *h = avctx->priv_data;
+
+    NVDECContext      *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS     *pp = &ctx->pic_params;
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+    AVFrame *cur_frame = h->framep[VP56_FRAME_CURRENT]->tf.f;
+
+    int ret;
+
+    ret = ff_nvdec_start_frame(avctx, cur_frame);
+    if (ret < 0)
+        return ret;
+
+    fdd = (FrameDecodeData*)cur_frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+
+    *pp = (CUVIDPICPARAMS) {
+        .PicWidthInMbs     = (cur_frame->width  + 15) / 16,
+        .FrameHeightInMbs  = (cur_frame->height + 15) / 16,
+        .CurrPicIdx        = cf->idx,
+
+        .CodecSpecific.vp8 = {
+            .width                       = cur_frame->width,
+            .height                      = cur_frame->height,
+
+            .first_partition_size        = h->header_partition_size,
+
+            .LastRefIdx                  = safe_get_ref_idx(h->framep[VP56_FRAME_PREVIOUS]),
+            .GoldenRefIdx                = safe_get_ref_idx(h->framep[VP56_FRAME_GOLDEN]),
+            .AltRefIdx                   = safe_get_ref_idx(h->framep[VP56_FRAME_GOLDEN2]),
+            /*
+             * Explicit braces for anonymous inners and unnamed fields
+             * to work around limitations in ancient versions of gcc.
+             */
+            { // union
+                { // struct
+                    !h->keyframe,             // frame_type
+                    h->profile,               // version
+                    !h->invisible,            // show_frame
+                    h->segmentation.enabled ? // update_mb_segmentation_data
+                        h->segmentation.update_feature_data : 0,
+                }
+            }
+        }
+    };
+
+    return 0;
+}
+
+static int nvdec_vp8_frame_params(AVCodecContext *avctx,
+                                  AVBufferRef *hw_frames_ctx)
+{
+    // VP8 uses a fixed size pool of 3 possible reference frames
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, 3, 0);
+}
+
+AVHWAccel ff_vp8_nvdec_hwaccel = {
+    .name                 = "vp8_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_VP8,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_vp8_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = ff_nvdec_simple_decode_slice,
+    .frame_params         = nvdec_vp8_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
diff --git a/libavcodec/nvdec_vp9.c b/libavcodec/nvdec_vp9.c
new file mode 100644
index 0000000..a76bcf9
--- /dev/null
+++ b/libavcodec/nvdec_vp9.c
@@ -0,0 +1,184 @@
+/*
+ * VP9 HW decode acceleration through NVDEC
+ *
+ * Copyright (c) 2016 Timo Rothenpieler
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+
+#include "avcodec.h"
+#include "nvdec.h"
+#include "decode.h"
+#include "internal.h"
+#include "vp9shared.h"
+
+static int nvdec_vp9_start_frame(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    VP9SharedContext *h = avctx->priv_data;
+    const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+
+    NVDECContext      *ctx = avctx->internal->hwaccel_priv_data;
+    CUVIDPICPARAMS     *pp = &ctx->pic_params;
+    CUVIDVP9PICPARAMS *ppc = &pp->CodecSpecific.vp9;
+    FrameDecodeData *fdd;
+    NVDECFrame *cf;
+    AVFrame *cur_frame = h->frames[CUR_FRAME].tf.f;
+
+    int ret, i;
+
+    ret = ff_nvdec_start_frame(avctx, cur_frame);
+    if (ret < 0)
+        return ret;
+
+    fdd = (FrameDecodeData*)cur_frame->private_ref->data;
+    cf  = (NVDECFrame*)fdd->hwaccel_priv;
+
+    *pp = (CUVIDPICPARAMS) {
+        .PicWidthInMbs     = (cur_frame->width  + 15) / 16,
+        .FrameHeightInMbs  = (cur_frame->height + 15) / 16,
+        .CurrPicIdx        = cf->idx,
+
+        .CodecSpecific.vp9 = {
+            .width                    = cur_frame->width,
+            .height                   = cur_frame->height,
+
+            .LastRefIdx               = ff_nvdec_get_ref_idx(h->refs[h->h.refidx[0]].f),
+            .GoldenRefIdx             = ff_nvdec_get_ref_idx(h->refs[h->h.refidx[1]].f),
+            .AltRefIdx                = ff_nvdec_get_ref_idx(h->refs[h->h.refidx[2]].f),
+
+            .profile                  = h->h.profile,
+            .frameContextIdx          = h->h.framectxid,
+            .frameType                = !h->h.keyframe,
+            .showFrame                = !h->h.invisible,
+            .errorResilient           = h->h.errorres,
+            .frameParallelDecoding    = h->h.parallelmode,
+            .subSamplingX             = pixdesc->log2_chroma_w,
+            .subSamplingY             = pixdesc->log2_chroma_h,
+            .intraOnly                = h->h.intraonly,
+            .allow_high_precision_mv  = h->h.keyframe ? 0 : h->h.highprecisionmvs,
+            .refreshEntropyProbs      = h->h.refreshctx,
+
+            .bitDepthMinus8Luma       = pixdesc->comp[0].depth - 8,
+            .bitDepthMinus8Chroma     = pixdesc->comp[1].depth - 8,
+
+            .loopFilterLevel          = h->h.filter.level,
+            .loopFilterSharpness      = h->h.filter.sharpness,
+            .modeRefLfEnabled         = h->h.lf_delta.enabled,
+
+            .log2_tile_columns        = h->h.tiling.log2_tile_cols,
+            .log2_tile_rows           = h->h.tiling.log2_tile_rows,
+
+            .segmentEnabled           = h->h.segmentation.enabled,
+            .segmentMapUpdate         = h->h.segmentation.update_map,
+            .segmentMapTemporalUpdate = h->h.segmentation.temporal,
+            .segmentFeatureMode       = h->h.segmentation.absolute_vals,
+
+            .qpYAc                    = h->h.yac_qi,
+            .qpYDc                    = h->h.ydc_qdelta,
+            .qpChDc                   = h->h.uvdc_qdelta,
+            .qpChAc                   = h->h.uvac_qdelta,
+
+            .resetFrameContext        = h->h.resetctx,
+            .mcomp_filter_type        = h->h.filtermode ^ (h->h.filtermode <= 1),
+
+            .frameTagSize             = h->h.uncompressed_header_size,
+            .offsetToDctParts         = h->h.compressed_header_size,
+
+            .refFrameSignBias[0]      = 0,
+        }
+    };
+
+    for (i = 0; i < 2; i++)
+        ppc->mbModeLfDelta[i] = h->h.lf_delta.mode[i];
+
+    for (i = 0; i < 4; i++)
+        ppc->mbRefLfDelta[i] = h->h.lf_delta.ref[i];
+
+    for (i = 0; i < 7; i++)
+        ppc->mb_segment_tree_probs[i] = h->h.segmentation.prob[i];
+
+    for (i = 0; i < 3; i++) {
+        ppc->activeRefIdx[i] = h->h.refidx[i];
+        ppc->segment_pred_probs[i] = h->h.segmentation.pred_prob[i];
+        ppc->refFrameSignBias[i + 1] = h->h.signbias[i];
+    }
+
+    for (i = 0; i < 8; i++) {
+        ppc->segmentFeatureEnable[i][0] = h->h.segmentation.feat[i].q_enabled;
+        ppc->segmentFeatureEnable[i][1] = h->h.segmentation.feat[i].lf_enabled;
+        ppc->segmentFeatureEnable[i][2] = h->h.segmentation.feat[i].ref_enabled;
+        ppc->segmentFeatureEnable[i][3] = h->h.segmentation.feat[i].skip_enabled;
+
+        ppc->segmentFeatureData[i][0] = h->h.segmentation.feat[i].q_val;
+        ppc->segmentFeatureData[i][1] = h->h.segmentation.feat[i].lf_val;
+        ppc->segmentFeatureData[i][2] = h->h.segmentation.feat[i].ref_val;
+        ppc->segmentFeatureData[i][3] = 0;
+    }
+
+    switch (avctx->colorspace) {
+    default:
+    case AVCOL_SPC_UNSPECIFIED:
+        ppc->colorSpace = 0;
+        break;
+    case AVCOL_SPC_BT470BG:
+        ppc->colorSpace = 1;
+        break;
+    case AVCOL_SPC_BT709:
+        ppc->colorSpace = 2;
+        break;
+    case AVCOL_SPC_SMPTE170M:
+        ppc->colorSpace = 3;
+        break;
+    case AVCOL_SPC_SMPTE240M:
+        ppc->colorSpace = 4;
+        break;
+    case AVCOL_SPC_BT2020_NCL:
+        ppc->colorSpace = 5;
+        break;
+    case AVCOL_SPC_RESERVED:
+        ppc->colorSpace = 6;
+        break;
+    case AVCOL_SPC_RGB:
+        ppc->colorSpace = 7;
+        break;
+    }
+
+    return 0;
+}
+
+static int nvdec_vp9_frame_params(AVCodecContext *avctx,
+                                  AVBufferRef *hw_frames_ctx)
+{
+    // VP9 uses a fixed size pool of 8 possible reference frames
+    return ff_nvdec_frame_params(avctx, hw_frames_ctx, 8, 0);
+}
+
+const AVHWAccel ff_vp9_nvdec_hwaccel = {
+    .name                 = "vp9_nvdec",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_VP9,
+    .pix_fmt              = AV_PIX_FMT_CUDA,
+    .start_frame          = nvdec_vp9_start_frame,
+    .end_frame            = ff_nvdec_simple_end_frame,
+    .decode_slice         = ff_nvdec_simple_decode_slice,
+    .frame_params         = nvdec_vp9_frame_params,
+    .init                 = ff_nvdec_decode_init,
+    .uninit               = ff_nvdec_decode_uninit,
+    .priv_data_size       = sizeof(NVDECContext),
+};
diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index a7171e4..d3413b3 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -1,103 +1,62 @@
 /*
- * NVIDIA NVENC Support
- * Copyright (C) 2015 Luca Barbato
- * Copyright (C) 2015 Philip Langdale <philipl@overt.org>
- * Copyright (C) 2014 Timo Rothenpieler <timo@rothenpieler.org>
+ * H.264/HEVC hardware encoding using nvidia nvenc
+ * Copyright (c) 2016 Timo Rothenpieler <timo@rothenpieler.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
-#include <nvEncodeAPI.h>
-#include <string.h>
-
-#define CUDA_LIBNAME "libcuda.so"
-
-#if HAVE_WINDOWS_H
-#include <windows.h>
-
-#if ARCH_X86_64
-#define NVENC_LIBNAME "nvEncodeAPI64.dll"
-#else
-#define NVENC_LIBNAME "nvEncodeAPI.dll"
-#endif
-
-#define dlopen(filename, flags) LoadLibrary((filename))
-#define dlsym(handle, symbol)   GetProcAddress(handle, symbol)
-#define dlclose(handle)         FreeLibrary(handle)
-#else
-#include <dlfcn.h>
-#define NVENC_LIBNAME "libnvidia-encode.so"
-#endif
+#include "nvenc.h"
 
-#include "libavutil/common.h"
+#include "libavutil/hwcontext_cuda.h"
 #include "libavutil/hwcontext.h"
+#include "libavutil/cuda_check.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "libavutil/mem.h"
-#include "avcodec.h"
+#include "libavutil/pixdesc.h"
 #include "internal.h"
-#include "nvenc.h"
 
-#if CONFIG_CUDA
-#include "libavutil/hwcontext_cuda.h"
-#endif
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(avctx, dl_fn->cuda_dl, x)
 
 #define NVENC_CAP 0x30
-#define BITSTREAM_BUFFER_SIZE 1024 * 1024
-#define IS_CBR(rc) (rc == NV_ENC_PARAMS_RC_CBR ||               \
-                    rc == NV_ENC_PARAMS_RC_2_PASS_QUALITY ||    \
-                    rc == NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP)
-
-#define LOAD_LIBRARY(l, path)                   \
-    do {                                        \
-        if (!((l) = dlopen(path, RTLD_LAZY))) { \
-            av_log(avctx, AV_LOG_ERROR,         \
-                   "Cannot load %s\n",          \
-                   path);                       \
-            return AVERROR_UNKNOWN;             \
-        }                                       \
-    } while (0)
-
-#define LOAD_SYMBOL(fun, lib, symbol)        \
-    do {                                     \
-        if (!((fun) = dlsym(lib, symbol))) { \
-            av_log(avctx, AV_LOG_ERROR,      \
-                   "Cannot load %s\n",       \
-                   symbol);                  \
-            return AVERROR_UNKNOWN;          \
-        }                                    \
-    } while (0)
+#define IS_CBR(rc) (rc == NV_ENC_PARAMS_RC_CBR ||             \
+                    rc == NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ || \
+                    rc == NV_ENC_PARAMS_RC_CBR_HQ)
 
 const enum AVPixelFormat ff_nvenc_pix_fmts[] = {
-    AV_PIX_FMT_NV12,
     AV_PIX_FMT_YUV420P,
-    AV_PIX_FMT_YUV444P,
-#if NVENCAPI_MAJOR_VERSION >= 7
+    AV_PIX_FMT_NV12,
     AV_PIX_FMT_P010,
-    AV_PIX_FMT_YUV444P16,
-#endif
-#if CONFIG_CUDA
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_P016,      // Truncated to 10bits
+    AV_PIX_FMT_YUV444P16, // Truncated to 10bits
+    AV_PIX_FMT_0RGB32,
+    AV_PIX_FMT_0BGR32,
     AV_PIX_FMT_CUDA,
+#if CONFIG_D3D11VA
+    AV_PIX_FMT_D3D11,
 #endif
     AV_PIX_FMT_NONE
 };
 
 #define IS_10BIT(pix_fmt)  (pix_fmt == AV_PIX_FMT_P010    || \
+                            pix_fmt == AV_PIX_FMT_P016    || \
                             pix_fmt == AV_PIX_FMT_YUV444P16)
 
 #define IS_YUV444(pix_fmt) (pix_fmt == AV_PIX_FMT_YUV444P || \
@@ -121,14 +80,12 @@ static const struct {
     { NV_ENC_ERR_OUT_OF_MEMORY,            AVERROR(ENOMEM),  "out of memory"            },
     { NV_ENC_ERR_ENCODER_NOT_INITIALIZED,  AVERROR(EINVAL),  "encoder not initialized"  },
     { NV_ENC_ERR_UNSUPPORTED_PARAM,        AVERROR(ENOSYS),  "unsupported param"        },
-    { NV_ENC_ERR_LOCK_BUSY,                AVERROR(EBUSY),   "lock busy"                },
-    { NV_ENC_ERR_NOT_ENOUGH_BUFFER,        AVERROR(ENOBUFS), "not enough buffer"        },
+    { NV_ENC_ERR_LOCK_BUSY,                AVERROR(EAGAIN),  "lock busy"                },
+    { NV_ENC_ERR_NOT_ENOUGH_BUFFER,        AVERROR_BUFFER_TOO_SMALL, "not enough buffer"},
     { NV_ENC_ERR_INVALID_VERSION,          AVERROR(EINVAL),  "invalid version"          },
     { NV_ENC_ERR_MAP_FAILED,               AVERROR(EIO),     "map failed"               },
-    /* this is error should always be treated specially, so this "mapping"
-     * is for completeness only */
-    { NV_ENC_ERR_NEED_MORE_INPUT,          AVERROR_UNKNOWN,  "need more input"          },
-    { NV_ENC_ERR_ENCODER_BUSY,             AVERROR(EBUSY),   "encoder busy"             },
+    { NV_ENC_ERR_NEED_MORE_INPUT,          AVERROR(EAGAIN),  "need more input"          },
+    { NV_ENC_ERR_ENCODER_BUSY,             AVERROR(EAGAIN),  "encoder busy"             },
     { NV_ENC_ERR_EVENT_NOT_REGISTERD,      AVERROR(EBADF),   "event not registered"     },
     { NV_ENC_ERR_GENERIC,                  AVERROR_UNKNOWN,  "generic error"            },
     { NV_ENC_ERR_INCOMPATIBLE_CLIENT_KEY,  AVERROR(EINVAL),  "incompatible client key"  },
@@ -163,68 +120,124 @@ static int nvenc_print_error(void *log_ctx, NVENCSTATUS err,
     return ret;
 }
 
+static void nvenc_print_driver_requirement(AVCodecContext *avctx, int level)
+{
+#if NVENCAPI_CHECK_VERSION(9, 0)
+# if defined(_WIN32) || defined(__CYGWIN__)
+    const char *minver = "418.81";
+# else
+    const char *minver = "418.30";
+# endif
+#elif NVENCAPI_CHECK_VERSION(8, 2)
+# if defined(_WIN32) || defined(__CYGWIN__)
+    const char *minver = "397.93";
+# else
+    const char *minver = "396.24";
+#endif
+#elif NVENCAPI_CHECK_VERSION(8, 1)
+# if defined(_WIN32) || defined(__CYGWIN__)
+    const char *minver = "390.77";
+# else
+    const char *minver = "390.25";
+# endif
+#else
+# if defined(_WIN32) || defined(__CYGWIN__)
+    const char *minver = "378.66";
+# else
+    const char *minver = "378.13";
+# endif
+#endif
+    av_log(avctx, level, "The minimum required Nvidia driver for nvenc is %s or newer\n", minver);
+}
+
 static av_cold int nvenc_load_libraries(AVCodecContext *avctx)
 {
-    NVENCContext *ctx         = avctx->priv_data;
-    NVENCLibraryContext *nvel = &ctx->nvel;
-    PNVENCODEAPICREATEINSTANCE nvenc_create_instance;
+    NvencContext *ctx            = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
     NVENCSTATUS err;
+    uint32_t nvenc_max_ver;
+    int ret;
 
-#if CONFIG_CUDA
-    nvel->cu_init                      = cuInit;
-    nvel->cu_device_get_count          = cuDeviceGetCount;
-    nvel->cu_device_get                = cuDeviceGet;
-    nvel->cu_device_get_name           = cuDeviceGetName;
-    nvel->cu_device_compute_capability = cuDeviceComputeCapability;
-    nvel->cu_ctx_create                = cuCtxCreate_v2;
-    nvel->cu_ctx_pop_current           = cuCtxPopCurrent_v2;
-    nvel->cu_ctx_push_current          = cuCtxPushCurrent_v2;
-    nvel->cu_ctx_destroy               = cuCtxDestroy_v2;
-#else
-    LOAD_LIBRARY(nvel->cuda, CUDA_LIBNAME);
-
-    LOAD_SYMBOL(nvel->cu_init, nvel->cuda, "cuInit");
-    LOAD_SYMBOL(nvel->cu_device_get_count, nvel->cuda, "cuDeviceGetCount");
-    LOAD_SYMBOL(nvel->cu_device_get, nvel->cuda, "cuDeviceGet");
-    LOAD_SYMBOL(nvel->cu_device_get_name, nvel->cuda, "cuDeviceGetName");
-    LOAD_SYMBOL(nvel->cu_device_compute_capability, nvel->cuda,
-                "cuDeviceComputeCapability");
-    LOAD_SYMBOL(nvel->cu_ctx_create, nvel->cuda, "cuCtxCreate_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_pop_current, nvel->cuda, "cuCtxPopCurrent_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_push_current, nvel->cuda, "cuCtxPushCurrent_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_destroy, nvel->cuda, "cuCtxDestroy_v2");
-#endif
+    ret = cuda_load_functions(&dl_fn->cuda_dl, avctx);
+    if (ret < 0)
+        return ret;
+
+    ret = nvenc_load_functions(&dl_fn->nvenc_dl, avctx);
+    if (ret < 0) {
+        nvenc_print_driver_requirement(avctx, AV_LOG_ERROR);
+        return ret;
+    }
+
+    err = dl_fn->nvenc_dl->NvEncodeAPIGetMaxSupportedVersion(&nvenc_max_ver);
+    if (err != NV_ENC_SUCCESS)
+        return nvenc_print_error(avctx, err, "Failed to query nvenc max version");
 
-    LOAD_LIBRARY(nvel->nvenc, NVENC_LIBNAME);
+    av_log(avctx, AV_LOG_VERBOSE, "Loaded Nvenc version %d.%d\n", nvenc_max_ver >> 4, nvenc_max_ver & 0xf);
 
-    LOAD_SYMBOL(nvenc_create_instance, nvel->nvenc,
-                "NvEncodeAPICreateInstance");
+    if ((NVENCAPI_MAJOR_VERSION << 4 | NVENCAPI_MINOR_VERSION) > nvenc_max_ver) {
+        av_log(avctx, AV_LOG_ERROR, "Driver does not support the required nvenc API version. "
+               "Required: %d.%d Found: %d.%d\n",
+               NVENCAPI_MAJOR_VERSION, NVENCAPI_MINOR_VERSION,
+               nvenc_max_ver >> 4, nvenc_max_ver & 0xf);
+        nvenc_print_driver_requirement(avctx, AV_LOG_ERROR);
+        return AVERROR(ENOSYS);
+    }
 
-    nvel->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
+    dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
 
-    err = nvenc_create_instance(&nvel->nvenc_funcs);
+    err = dl_fn->nvenc_dl->NvEncodeAPICreateInstance(&dl_fn->nvenc_funcs);
     if (err != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, err, "Cannot create the NVENC instance");
+        return nvenc_print_error(avctx, err, "Failed to create nvenc instance");
+
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
 
     return 0;
 }
 
-static int nvenc_open_session(AVCodecContext *avctx)
+static int nvenc_push_context(AVCodecContext *avctx)
+{
+    NvencContext *ctx            = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+
+    if (ctx->d3d11_device)
+        return 0;
+
+    return CHECK_CU(dl_fn->cuda_dl->cuCtxPushCurrent(ctx->cu_context));
+}
+
+static int nvenc_pop_context(AVCodecContext *avctx)
+{
+    NvencContext *ctx            = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    CUcontext dummy;
+
+    if (ctx->d3d11_device)
+        return 0;
+
+    return CHECK_CU(dl_fn->cuda_dl->cuCtxPopCurrent(&dummy));
+}
+
+static av_cold int nvenc_open_session(AVCodecContext *avctx)
 {
     NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params = { 0 };
-    NVENCContext *ctx                           = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv             = &ctx->nvel.nvenc_funcs;
-    int ret;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
+    NVENCSTATUS ret;
 
     params.version    = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
     params.apiVersion = NVENCAPI_VERSION;
-    params.device     = ctx->cu_context;
-    params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
+    if (ctx->d3d11_device) {
+        params.device     = ctx->d3d11_device;
+        params.deviceType = NV_ENC_DEVICE_TYPE_DIRECTX;
+    } else {
+        params.device     = ctx->cu_context;
+        params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
+    }
 
-    ret = nv->nvEncOpenEncodeSessionEx(&params, &ctx->nvenc_ctx);
+    ret = p_nvenc->nvEncOpenEncodeSessionEx(&params, &ctx->nvencoder);
     if (ret != NV_ENC_SUCCESS) {
-        ctx->nvenc_ctx = NULL;
-        return nvenc_print_error(avctx, ret, "Cannot open the NVENC Session");
+        ctx->nvencoder = NULL;
+        return nvenc_print_error(avctx, ret, "OpenEncodeSessionEx failed");
     }
 
     return 0;
@@ -232,12 +245,12 @@ static int nvenc_open_session(AVCodecContext *avctx)
 
 static int nvenc_check_codec_support(AVCodecContext *avctx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx                    = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
     int i, ret, count = 0;
     GUID *guids = NULL;
 
-    ret = nv->nvEncGetEncodeGUIDCount(ctx->nvenc_ctx, &count);
+    ret = p_nvenc->nvEncGetEncodeGUIDCount(ctx->nvencoder, &count);
 
     if (ret != NV_ENC_SUCCESS || !count)
         return AVERROR(ENOSYS);
@@ -246,7 +259,7 @@ static int nvenc_check_codec_support(AVCodecContext *avctx)
     if (!guids)
         return AVERROR(ENOMEM);
 
-    ret = nv->nvEncGetEncodeGUIDs(ctx->nvenc_ctx, guids, count, &count);
+    ret = p_nvenc->nvEncGetEncodeGUIDs(ctx->nvencoder, guids, count, &count);
     if (ret != NV_ENC_SUCCESS) {
         ret = AVERROR(ENOSYS);
         goto fail;
@@ -254,7 +267,7 @@ static int nvenc_check_codec_support(AVCodecContext *avctx)
 
     ret = AVERROR(ENOSYS);
     for (i = 0; i < count; i++) {
-        if (!memcmp(&guids[i], &ctx->params.encodeGUID, sizeof(*guids))) {
+        if (!memcmp(&guids[i], &ctx->init_encode_params.encodeGUID, sizeof(*guids))) {
             ret = 0;
             break;
         }
@@ -268,15 +281,15 @@ fail:
 
 static int nvenc_check_cap(AVCodecContext *avctx, NV_ENC_CAPS cap)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
     NV_ENC_CAPS_PARAM params        = { 0 };
     int ret, val = 0;
 
     params.version     = NV_ENC_CAPS_PARAM_VER;
     params.capsToQuery = cap;
 
-    ret = nv->nvEncGetEncodeCaps(ctx->nvenc_ctx, ctx->params.encodeGUID, &params, &val);
+    ret = p_nvenc->nvEncGetEncodeCaps(ctx->nvencoder, ctx->init_encode_params.encodeGUID, &params, &val);
 
     if (ret == NV_ENC_SUCCESS)
         return val;
@@ -285,7 +298,7 @@ static int nvenc_check_cap(AVCodecContext *avctx, NV_ENC_CAPS cap)
 
 static int nvenc_check_capabilities(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
     int ret;
 
     ret = nvenc_check_codec_support(avctx);
@@ -295,11 +308,17 @@ static int nvenc_check_capabilities(AVCodecContext *avctx)
     }
 
     ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_YUV444_ENCODE);
-    if (ctx->data_pix_fmt == AV_PIX_FMT_YUV444P && ret <= 0) {
+    if (IS_YUV444(ctx->data_pix_fmt) && ret <= 0) {
         av_log(avctx, AV_LOG_VERBOSE, "YUV444P not supported\n");
         return AVERROR(ENOSYS);
     }
 
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_LOSSLESS_ENCODE);
+    if (ctx->preset >= PRESET_LOSSLESS_DEFAULT && ret <= 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "Lossless encoding not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
     ret = nvenc_check_cap(avctx, NV_ENC_CAPS_WIDTH_MAX);
     if (ret < avctx->width) {
         av_log(avctx, AV_LOG_VERBOSE, "Width %d exceeds %d\n",
@@ -322,54 +341,106 @@ static int nvenc_check_capabilities(AVCodecContext *avctx)
         return AVERROR(ENOSYS);
     }
 
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_FIELD_ENCODING);
+    if (ret < 1 && avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "Interlaced encoding is not supported. Supported level: %d\n",
+               ret);
+        return AVERROR(ENOSYS);
+    }
+
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_10BIT_ENCODE);
+    if (IS_10BIT(ctx->data_pix_fmt) && ret <= 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "10 bit encode not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_LOOKAHEAD);
+    if (ctx->rc_lookahead > 0 && ret <= 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "RC lookahead not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_TEMPORAL_AQ);
+    if (ctx->temporal_aq > 0 && ret <= 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "Temporal AQ not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_WEIGHTED_PREDICTION);
+    if (ctx->weighted_pred > 0 && ret <= 0) {
+        av_log (avctx, AV_LOG_VERBOSE, "Weighted Prediction not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_CABAC);
+    if (ctx->coder == NV_ENC_H264_ENTROPY_CODING_MODE_CABAC && ret <= 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "CABAC entropy coding not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
+#ifdef NVENC_HAVE_BFRAME_REF_MODE
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_BFRAME_REF_MODE);
+    if (ctx->b_ref_mode == NV_ENC_BFRAME_REF_MODE_EACH && ret != 1) {
+        av_log(avctx, AV_LOG_VERBOSE, "Each B frame as reference is not supported\n");
+        return AVERROR(ENOSYS);
+    } else if (ctx->b_ref_mode != NV_ENC_BFRAME_REF_MODE_DISABLED && ret == 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "B frames as references are not supported\n");
+        return AVERROR(ENOSYS);
+    }
+#else
+    if (ctx->b_ref_mode != 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "B frames as references need SDK 8.1 at build time\n");
+        return AVERROR(ENOSYS);
+    }
+#endif
+
+    ctx->support_dyn_bitrate = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_DYN_BITRATE_CHANGE);
+
     return 0;
 }
 
-static int nvenc_check_device(AVCodecContext *avctx, int idx)
+static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NVENCLibraryContext *nvel       = &ctx->nvel;
-    char name[128]                  = { 0 };
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    char name[128] = { 0};
     int major, minor, ret;
     CUdevice cu_device;
-    CUcontext dummy;
     int loglevel = AV_LOG_VERBOSE;
 
     if (ctx->device == LIST_DEVICES)
         loglevel = AV_LOG_INFO;
 
-    ret = nvel->cu_device_get(&cu_device, idx);
-    if (ret != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Cannot access the CUDA device %d\n",
-               idx);
-        return -1;
-    }
-
-    ret = nvel->cu_device_get_name(name, sizeof(name), cu_device);
-    if (ret != CUDA_SUCCESS)
-        return -1;
+    ret = CHECK_CU(dl_fn->cuda_dl->cuDeviceGet(&cu_device, idx));
+    if (ret < 0)
+        return ret;
 
-    ret = nvel->cu_device_compute_capability(&major, &minor, cu_device);
-    if (ret != CUDA_SUCCESS)
-        return -1;
+    ret = CHECK_CU(dl_fn->cuda_dl->cuDeviceGetName(name, sizeof(name), cu_device));
+    if (ret < 0)
+        return ret;
 
-    av_log(avctx, loglevel, "Device %d [%s] ", cu_device, name);
+    ret = CHECK_CU(dl_fn->cuda_dl->cuDeviceComputeCapability(&major, &minor, cu_device));
+    if (ret < 0)
+        return ret;
 
-    if (((major << 4) | minor) < NVENC_CAP)
+    av_log(avctx, loglevel, "[ GPU #%d - < %s > has Compute SM %d.%d ]\n", idx, name, major, minor);
+    if (((major << 4) | minor) < NVENC_CAP) {
+        av_log(avctx, loglevel, "does not support NVENC\n");
         goto fail;
+    }
 
     if (ctx->device != idx && ctx->device != ANY_DEVICE)
         return -1;
 
-    ret = nvel->cu_ctx_create(&ctx->cu_context_internal, 0, cu_device);
-    if (ret != CUDA_SUCCESS)
+    ret = CHECK_CU(dl_fn->cuda_dl->cuCtxCreate(&ctx->cu_context_internal, 0, cu_device));
+    if (ret < 0)
         goto fail;
 
     ctx->cu_context = ctx->cu_context_internal;
 
-    ret = nvel->cu_ctx_pop_current(&dummy);
-    if (ret != CUDA_SUCCESS)
+    if ((ret = nvenc_pop_context(avctx)) < 0)
         goto fail2;
 
     if ((ret = nvenc_open_session(avctx)) < 0)
@@ -380,81 +451,114 @@ static int nvenc_check_device(AVCodecContext *avctx, int idx)
 
     av_log(avctx, loglevel, "supports NVENC\n");
 
+    dl_fn->nvenc_device_count++;
+
     if (ctx->device == idx || ctx->device == ANY_DEVICE)
         return 0;
 
 fail3:
-    nvel->nvenc_funcs.nvEncDestroyEncoder(ctx->nvenc_ctx);
-    ctx->nvenc_ctx = NULL;
+    if ((ret = nvenc_push_context(avctx)) < 0)
+        return ret;
+
+    p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    ctx->nvencoder = NULL;
+
+    if ((ret = nvenc_pop_context(avctx)) < 0)
+        return ret;
 
 fail2:
-    nvel->cu_ctx_destroy(ctx->cu_context_internal);
+    CHECK_CU(dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal));
     ctx->cu_context_internal = NULL;
 
 fail:
-    if (ret != 0)
-        av_log(avctx, loglevel, "does not support NVENC (major %d minor %d)\n",
-               major, minor);
-
     return AVERROR(ENOSYS);
 }
 
-static int nvenc_setup_device(AVCodecContext *avctx)
+static av_cold int nvenc_setup_device(AVCodecContext *avctx)
 {
-    NVENCContext *ctx         = avctx->priv_data;
-    NVENCLibraryContext *nvel = &ctx->nvel;
+    NvencContext *ctx            = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
 
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
-        ctx->params.encodeGUID = NV_ENC_CODEC_H264_GUID;
+        ctx->init_encode_params.encodeGUID = NV_ENC_CODEC_H264_GUID;
         break;
     case AV_CODEC_ID_HEVC:
-        ctx->params.encodeGUID = NV_ENC_CODEC_HEVC_GUID;
+        ctx->init_encode_params.encodeGUID = NV_ENC_CODEC_HEVC_GUID;
         break;
     default:
         return AVERROR_BUG;
     }
 
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-#if CONFIG_CUDA
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA || avctx->pix_fmt == AV_PIX_FMT_D3D11 || avctx->hw_frames_ctx || avctx->hw_device_ctx) {
         AVHWFramesContext   *frames_ctx;
-        AVCUDADeviceContext *device_hwctx;
+        AVHWDeviceContext   *hwdev_ctx;
+        AVCUDADeviceContext *cuda_device_hwctx = NULL;
+#if CONFIG_D3D11VA
+        AVD3D11VADeviceContext *d3d11_device_hwctx = NULL;
+#endif
         int ret;
 
-        if (!avctx->hw_frames_ctx)
+        if (avctx->hw_frames_ctx) {
+            frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+            if (frames_ctx->format == AV_PIX_FMT_CUDA)
+                cuda_device_hwctx = frames_ctx->device_ctx->hwctx;
+#if CONFIG_D3D11VA
+            else if (frames_ctx->format == AV_PIX_FMT_D3D11)
+                d3d11_device_hwctx = frames_ctx->device_ctx->hwctx;
+#endif
+            else
+                return AVERROR(EINVAL);
+        } else if (avctx->hw_device_ctx) {
+            hwdev_ctx = (AVHWDeviceContext*)avctx->hw_device_ctx->data;
+            if (hwdev_ctx->type == AV_HWDEVICE_TYPE_CUDA)
+                cuda_device_hwctx = hwdev_ctx->hwctx;
+#if CONFIG_D3D11VA
+            else if (hwdev_ctx->type == AV_HWDEVICE_TYPE_D3D11VA)
+                d3d11_device_hwctx = hwdev_ctx->hwctx;
+#endif
+            else
+                return AVERROR(EINVAL);
+        } else {
             return AVERROR(EINVAL);
+        }
 
-        frames_ctx   = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-        device_hwctx = frames_ctx->device_ctx->hwctx;
-
-        ctx->cu_context = device_hwctx->cuda_ctx;
+        if (cuda_device_hwctx) {
+            ctx->cu_context = cuda_device_hwctx->cuda_ctx;
+        }
+#if CONFIG_D3D11VA
+        else if (d3d11_device_hwctx) {
+            ctx->d3d11_device = d3d11_device_hwctx->device;
+            ID3D11Device_AddRef(ctx->d3d11_device);
+        }
+#endif
 
         ret = nvenc_open_session(avctx);
         if (ret < 0)
             return ret;
 
         ret = nvenc_check_capabilities(avctx);
-        if (ret < 0)
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_FATAL, "Provided device doesn't support required NVENC features\n");
             return ret;
-#else
-        return AVERROR_BUG;
-#endif
+        }
     } else {
         int i, nb_devices = 0;
 
-        if ((nvel->cu_init(0)) != CUDA_SUCCESS) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Cannot init CUDA\n");
+        if (CHECK_CU(dl_fn->cuda_dl->cuInit(0)) < 0)
             return AVERROR_UNKNOWN;
-        }
 
-        if ((nvel->cu_device_get_count(&nb_devices)) != CUDA_SUCCESS) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Cannot enumerate the CUDA devices\n");
+        if (CHECK_CU(dl_fn->cuda_dl->cuDeviceGetCount(&nb_devices)) < 0)
             return AVERROR_UNKNOWN;
+
+        if (!nb_devices) {
+            av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+                return AVERROR_EXTERNAL;
         }
 
+        av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", nb_devices);
 
+        dl_fn->nvenc_device_count = 0;
         for (i = 0; i < nb_devices; ++i) {
             if ((nvenc_check_device(avctx, i)) >= 0 && ctx->device != LIST_DEVICES)
                 return 0;
@@ -463,7 +567,13 @@ static int nvenc_setup_device(AVCodecContext *avctx)
         if (ctx->device == LIST_DEVICES)
             return AVERROR_EXIT;
 
-        return AVERROR(ENOSYS);
+        if (!dl_fn->nvenc_device_count) {
+            av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->device, nb_devices);
+        return AVERROR(EINVAL);
     }
 
     return 0;
@@ -479,337 +589,469 @@ typedef struct GUIDTuple {
 
 #define PRESET(name, ...) PRESET_ALIAS(name, name, __VA_ARGS__)
 
-static int nvenc_map_preset(NVENCContext *ctx)
+static void nvenc_map_preset(NvencContext *ctx)
 {
     GUIDTuple presets[] = {
         PRESET(DEFAULT),
         PRESET(HP),
         PRESET(HQ),
         PRESET(BD),
+        PRESET_ALIAS(SLOW,   HQ,    NVENC_TWO_PASSES),
+        PRESET_ALIAS(MEDIUM, HQ,    NVENC_ONE_PASS),
+        PRESET_ALIAS(FAST,   HP,    NVENC_ONE_PASS),
         PRESET(LOW_LATENCY_DEFAULT, NVENC_LOWLATENCY),
         PRESET(LOW_LATENCY_HP,      NVENC_LOWLATENCY),
         PRESET(LOW_LATENCY_HQ,      NVENC_LOWLATENCY),
         PRESET(LOSSLESS_DEFAULT,    NVENC_LOSSLESS),
         PRESET(LOSSLESS_HP,         NVENC_LOSSLESS),
-        PRESET_ALIAS(SLOW, HQ,      NVENC_TWO_PASSES),
-        PRESET_ALIAS(MEDIUM, HQ,    NVENC_ONE_PASS),
-        PRESET_ALIAS(FAST, HP,      NVENC_ONE_PASS)
     };
 
     GUIDTuple *t = &presets[ctx->preset];
 
-    ctx->params.presetGUID = t->guid;
-    ctx->flags             = t->flags;
-
-    return AVERROR(EINVAL);
+    ctx->init_encode_params.presetGUID = t->guid;
+    ctx->flags = t->flags;
 }
 
 #undef PRESET
 #undef PRESET_ALIAS
 
-static void set_constqp(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
+static av_cold void set_constqp(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
+
     rc->rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
 
     if (ctx->init_qp_p >= 0) {
         rc->constQP.qpInterP = ctx->init_qp_p;
         if (ctx->init_qp_i >= 0 && ctx->init_qp_b >= 0) {
-            rc->constQP.qpIntra  = ctx->init_qp_i;
+            rc->constQP.qpIntra = ctx->init_qp_i;
             rc->constQP.qpInterB = ctx->init_qp_b;
         } else if (avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
-            rc->constQP.qpIntra  = av_clip(rc->constQP.qpInterP * fabs(avctx->i_quant_factor) + avctx->i_quant_offset + 0.5, 0, 51);
-            rc->constQP.qpInterB = av_clip(rc->constQP.qpInterP * fabs(avctx->b_quant_factor) + avctx->b_quant_offset + 0.5, 0, 51);
+            rc->constQP.qpIntra = av_clip(
+                rc->constQP.qpInterP * fabs(avctx->i_quant_factor) + avctx->i_quant_offset + 0.5, 0, 51);
+            rc->constQP.qpInterB = av_clip(
+                rc->constQP.qpInterP * fabs(avctx->b_quant_factor) + avctx->b_quant_offset + 0.5, 0, 51);
         } else {
-            rc->constQP.qpIntra  = rc->constQP.qpInterP;
+            rc->constQP.qpIntra = rc->constQP.qpInterP;
             rc->constQP.qpInterB = rc->constQP.qpInterP;
         }
-    } else if (avctx->global_quality >= 0) {
-        rc->constQP.qpInterP = avctx->global_quality;
-        rc->constQP.qpInterB = avctx->global_quality;
-        rc->constQP.qpIntra  = avctx->global_quality;
+    } else if (ctx->cqp >= 0) {
+        rc->constQP.qpInterP = rc->constQP.qpInterB = rc->constQP.qpIntra = ctx->cqp;
+        if (avctx->b_quant_factor != 0.0)
+            rc->constQP.qpInterB = av_clip(ctx->cqp * fabs(avctx->b_quant_factor) + avctx->b_quant_offset + 0.5, 0, 51);
+        if (avctx->i_quant_factor != 0.0)
+            rc->constQP.qpIntra = av_clip(ctx->cqp * fabs(avctx->i_quant_factor) + avctx->i_quant_offset + 0.5, 0, 51);
     }
+
+    avctx->qmin = -1;
+    avctx->qmax = -1;
 }
 
-static void set_vbr(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
+static av_cold void set_vbr(AVCodecContext *avctx)
 {
-    NVENCContext *ctx    = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
+    int qp_inter_p;
+
+    if (avctx->qmin >= 0 && avctx->qmax >= 0) {
+        rc->enableMinQP = 1;
+        rc->enableMaxQP = 1;
 
-    if (avctx->qmin >= 0) {
-        rc->enableMinQP    = 1;
         rc->minQP.qpInterB = avctx->qmin;
         rc->minQP.qpInterP = avctx->qmin;
         rc->minQP.qpIntra  = avctx->qmin;
-    }
 
-    if (avctx->qmax >= 0) {
-        rc->enableMaxQP = 1;
         rc->maxQP.qpInterB = avctx->qmax;
         rc->maxQP.qpInterP = avctx->qmax;
-        rc->maxQP.qpIntra  = avctx->qmax;
+        rc->maxQP.qpIntra = avctx->qmax;
+
+        qp_inter_p = (avctx->qmax + 3 * avctx->qmin) / 4; // biased towards Qmin
+    } else if (avctx->qmin >= 0) {
+        rc->enableMinQP = 1;
+
+        rc->minQP.qpInterB = avctx->qmin;
+        rc->minQP.qpInterP = avctx->qmin;
+        rc->minQP.qpIntra = avctx->qmin;
+
+        qp_inter_p = avctx->qmin;
+    } else {
+        qp_inter_p = 26; // default to 26
     }
 
-    if (ctx->init_qp_p >= 0) {
-        rc->enableInitialRCQP = 1;
+    rc->enableInitialRCQP = 1;
+
+    if (ctx->init_qp_p < 0) {
+        rc->initialRCQP.qpInterP  = qp_inter_p;
+    } else {
         rc->initialRCQP.qpInterP = ctx->init_qp_p;
-        if (ctx->init_qp_i < 0) {
-            if (avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
-                rc->initialRCQP.qpIntra = av_clip(rc->initialRCQP.qpInterP * fabs(avctx->i_quant_factor) + avctx->i_quant_offset + 0.5, 0, 51);
-            } else {
-                rc->initialRCQP.qpIntra = rc->initialRCQP.qpInterP;
-            }
+    }
+
+    if (ctx->init_qp_i < 0) {
+        if (avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
+            rc->initialRCQP.qpIntra = av_clip(
+                rc->initialRCQP.qpInterP * fabs(avctx->i_quant_factor) + avctx->i_quant_offset + 0.5, 0, 51);
         } else {
-            rc->initialRCQP.qpIntra = ctx->init_qp_i;
+            rc->initialRCQP.qpIntra = rc->initialRCQP.qpInterP;
         }
+    } else {
+        rc->initialRCQP.qpIntra = ctx->init_qp_i;
+    }
 
-        if (ctx->init_qp_b < 0) {
-            if (avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
-                rc->initialRCQP.qpInterB = av_clip(rc->initialRCQP.qpInterP * fabs(avctx->b_quant_factor) + avctx->b_quant_offset + 0.5, 0, 51);
-            } else {
-                rc->initialRCQP.qpInterB = rc->initialRCQP.qpInterP;
-            }
+    if (ctx->init_qp_b < 0) {
+        if (avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
+            rc->initialRCQP.qpInterB = av_clip(
+                rc->initialRCQP.qpInterP * fabs(avctx->b_quant_factor) + avctx->b_quant_offset + 0.5, 0, 51);
         } else {
-            rc->initialRCQP.qpInterB = ctx->init_qp_b;
+            rc->initialRCQP.qpInterB = rc->initialRCQP.qpInterP;
         }
+    } else {
+        rc->initialRCQP.qpInterB = ctx->init_qp_b;
     }
 }
 
-static void set_lossless(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
+static av_cold void set_lossless(AVCodecContext *avctx)
 {
-    rc->rateControlMode  = NV_ENC_PARAMS_RC_CONSTQP;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
+
+    rc->rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
     rc->constQP.qpInterB = 0;
     rc->constQP.qpInterP = 0;
     rc->constQP.qpIntra  = 0;
+
+    avctx->qmin = -1;
+    avctx->qmax = -1;
 }
 
-static void nvenc_override_rate_control(AVCodecContext *avctx,
-                                        NV_ENC_RC_PARAMS *rc)
+static void nvenc_override_rate_control(AVCodecContext *avctx)
 {
-    NVENCContext *ctx    = avctx->priv_data;
+    NvencContext *ctx    = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
 
     switch (ctx->rc) {
     case NV_ENC_PARAMS_RC_CONSTQP:
-        set_constqp(avctx, rc);
+        set_constqp(avctx);
         return;
-    case NV_ENC_PARAMS_RC_2_PASS_VBR:
-    case NV_ENC_PARAMS_RC_VBR:
-        set_vbr(avctx, rc);
-        break;
     case NV_ENC_PARAMS_RC_VBR_MINQP:
         if (avctx->qmin < 0) {
             av_log(avctx, AV_LOG_WARNING,
                    "The variable bitrate rate-control requires "
                    "the 'qmin' option set.\n");
+            set_vbr(avctx);
             return;
         }
-        set_vbr(avctx, rc);
+        /* fall through */
+    case NV_ENC_PARAMS_RC_VBR_HQ:
+    case NV_ENC_PARAMS_RC_VBR:
+        set_vbr(avctx);
         break;
     case NV_ENC_PARAMS_RC_CBR:
+    case NV_ENC_PARAMS_RC_CBR_HQ:
+    case NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ:
         break;
-    case NV_ENC_PARAMS_RC_2_PASS_QUALITY:
-    case NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP:
-        if (!(ctx->flags & NVENC_LOWLATENCY)) {
+    }
+
+    rc->rateControlMode = ctx->rc;
+}
+
+static av_cold int nvenc_recalc_surfaces(AVCodecContext *avctx)
+{
+    NvencContext *ctx = avctx->priv_data;
+    // default minimum of 4 surfaces
+    // multiply by 2 for number of NVENCs on gpu (hardcode to 2)
+    // another multiply by 2 to avoid blocking next PBB group
+    int nb_surfaces = FFMAX(4, ctx->encode_config.frameIntervalP * 2 * 2);
+
+    // lookahead enabled
+    if (ctx->rc_lookahead > 0) {
+        // +1 is to account for lkd_bound calculation later
+        // +4 is to allow sufficient pipelining with lookahead
+        nb_surfaces = FFMAX(1, FFMAX(nb_surfaces, ctx->rc_lookahead + ctx->encode_config.frameIntervalP + 1 + 4));
+        if (nb_surfaces > ctx->nb_surfaces && ctx->nb_surfaces > 0)
+        {
             av_log(avctx, AV_LOG_WARNING,
-                   "The multipass rate-control requires "
-                   "a low-latency preset.\n");
-            return;
+                   "Defined rc_lookahead requires more surfaces, "
+                   "increasing used surfaces %d -> %d\n", ctx->nb_surfaces, nb_surfaces);
         }
+        ctx->nb_surfaces = FFMAX(nb_surfaces, ctx->nb_surfaces);
+    } else {
+        if (ctx->encode_config.frameIntervalP > 1 && ctx->nb_surfaces < nb_surfaces && ctx->nb_surfaces > 0)
+        {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Defined b-frame requires more surfaces, "
+                   "increasing used surfaces %d -> %d\n", ctx->nb_surfaces, nb_surfaces);
+            ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, nb_surfaces);
+        }
+        else if (ctx->nb_surfaces <= 0)
+            ctx->nb_surfaces = nb_surfaces;
+        // otherwise use user specified value
     }
 
-    rc->rateControlMode = ctx->rc;
+    ctx->nb_surfaces = FFMAX(1, FFMIN(MAX_REGISTERED_FRAMES, ctx->nb_surfaces));
+    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1);
+
+    return 0;
 }
 
-static void nvenc_setup_rate_control(AVCodecContext *avctx)
+static av_cold void nvenc_setup_rate_control(AVCodecContext *avctx)
 {
-    NVENCContext *ctx    = avctx->priv_data;
-    NV_ENC_RC_PARAMS *rc = &ctx->config.rcParams;
+    NvencContext *ctx = avctx->priv_data;
 
-    if (avctx->bit_rate > 0)
-        rc->averageBitRate = avctx->bit_rate;
+    if (avctx->global_quality > 0)
+        av_log(avctx, AV_LOG_WARNING, "Using global_quality with nvenc is deprecated. Use qp instead.\n");
+
+    if (ctx->cqp < 0 && avctx->global_quality > 0)
+        ctx->cqp = avctx->global_quality;
+
+    if (avctx->bit_rate > 0) {
+        ctx->encode_config.rcParams.averageBitRate = avctx->bit_rate;
+    } else if (ctx->encode_config.rcParams.averageBitRate > 0) {
+        ctx->encode_config.rcParams.maxBitRate = ctx->encode_config.rcParams.averageBitRate;
+    }
 
     if (avctx->rc_max_rate > 0)
-        rc->maxBitRate = avctx->rc_max_rate;
-
-    if (ctx->rc > 0) {
-        nvenc_override_rate_control(avctx, rc);
-    } else if (ctx->flags & NVENC_LOSSLESS) {
-        set_lossless(avctx, rc);
-    } else if (avctx->global_quality > 0) {
-        set_constqp(avctx, rc);
-    } else {
+        ctx->encode_config.rcParams.maxBitRate = avctx->rc_max_rate;
+
+    if (ctx->rc < 0) {
+        if (ctx->flags & NVENC_ONE_PASS)
+            ctx->twopass = 0;
         if (ctx->flags & NVENC_TWO_PASSES)
-            rc->rateControlMode = NV_ENC_PARAMS_RC_2_PASS_VBR;
-        else
-            rc->rateControlMode = NV_ENC_PARAMS_RC_VBR;
-        set_vbr(avctx, rc);
+            ctx->twopass = 1;
+
+        if (ctx->twopass < 0)
+            ctx->twopass = (ctx->flags & NVENC_LOWLATENCY) != 0;
+
+        if (ctx->cbr) {
+            if (ctx->twopass) {
+                ctx->rc = NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ;
+            } else {
+                ctx->rc = NV_ENC_PARAMS_RC_CBR;
+            }
+        } else if (ctx->cqp >= 0) {
+            ctx->rc = NV_ENC_PARAMS_RC_CONSTQP;
+        } else if (ctx->twopass) {
+            ctx->rc = NV_ENC_PARAMS_RC_VBR_HQ;
+        } else if (avctx->qmin >= 0 && avctx->qmax >= 0) {
+            ctx->rc = NV_ENC_PARAMS_RC_VBR_MINQP;
+        }
     }
 
-    if (avctx->rc_buffer_size > 0)
-        rc->vbvBufferSize = avctx->rc_buffer_size;
+    if (ctx->rc >= 0 && ctx->rc & RC_MODE_DEPRECATED) {
+        av_log(avctx, AV_LOG_WARNING, "Specified rc mode is deprecated.\n");
+        av_log(avctx, AV_LOG_WARNING, "\tll_2pass_quality -> cbr_ld_hq\n");
+        av_log(avctx, AV_LOG_WARNING, "\tll_2pass_size -> cbr_hq\n");
+        av_log(avctx, AV_LOG_WARNING, "\tvbr_2pass -> vbr_hq\n");
+        av_log(avctx, AV_LOG_WARNING, "\tvbr_minqp -> (no replacement)\n");
 
-    if (rc->averageBitRate > 0)
-        avctx->bit_rate = rc->averageBitRate;
+        ctx->rc &= ~RC_MODE_DEPRECATED;
+    }
+
+    if (ctx->flags & NVENC_LOSSLESS) {
+        set_lossless(avctx);
+    } else if (ctx->rc >= 0) {
+        nvenc_override_rate_control(avctx);
+    } else {
+        ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR;
+        set_vbr(avctx);
+    }
+
+    if (avctx->rc_buffer_size > 0) {
+        ctx->encode_config.rcParams.vbvBufferSize = avctx->rc_buffer_size;
+    } else if (ctx->encode_config.rcParams.averageBitRate > 0) {
+        avctx->rc_buffer_size = ctx->encode_config.rcParams.vbvBufferSize = 2 * ctx->encode_config.rcParams.averageBitRate;
+    }
 
-#if NVENCAPI_MAJOR_VERSION >= 7
     if (ctx->aq) {
-        ctx->config.rcParams.enableAQ   = 1;
-        ctx->config.rcParams.aqStrength = ctx->aq_strength;
+        ctx->encode_config.rcParams.enableAQ   = 1;
+        ctx->encode_config.rcParams.aqStrength = ctx->aq_strength;
         av_log(avctx, AV_LOG_VERBOSE, "AQ enabled.\n");
     }
 
     if (ctx->temporal_aq) {
-        ctx->config.rcParams.enableTemporalAQ = 1;
+        ctx->encode_config.rcParams.enableTemporalAQ = 1;
         av_log(avctx, AV_LOG_VERBOSE, "Temporal AQ enabled.\n");
     }
 
     if (ctx->rc_lookahead > 0) {
         int lkd_bound = FFMIN(ctx->nb_surfaces, ctx->async_depth) -
-                        ctx->config.frameIntervalP - 4;
+                        ctx->encode_config.frameIntervalP - 4;
 
         if (lkd_bound < 0) {
             av_log(avctx, AV_LOG_WARNING,
                    "Lookahead not enabled. Increase buffer delay (-delay).\n");
         } else {
-            ctx->config.rcParams.enableLookahead = 1;
-            ctx->config.rcParams.lookaheadDepth  = av_clip(ctx->rc_lookahead, 0, lkd_bound);
-            ctx->config.rcParams.disableIadapt   = ctx->no_scenecut;
-            ctx->config.rcParams.disableBadapt   = !ctx->b_adapt;
+            ctx->encode_config.rcParams.enableLookahead = 1;
+            ctx->encode_config.rcParams.lookaheadDepth  = av_clip(ctx->rc_lookahead, 0, lkd_bound);
+            ctx->encode_config.rcParams.disableIadapt   = ctx->no_scenecut;
+            ctx->encode_config.rcParams.disableBadapt   = !ctx->b_adapt;
             av_log(avctx, AV_LOG_VERBOSE,
                    "Lookahead enabled: depth %d, scenecut %s, B-adapt %s.\n",
-                   ctx->config.rcParams.lookaheadDepth,
-                   ctx->config.rcParams.disableIadapt ? "disabled" : "enabled",
-                   ctx->config.rcParams.disableBadapt ? "disabled" : "enabled");
+                   ctx->encode_config.rcParams.lookaheadDepth,
+                   ctx->encode_config.rcParams.disableIadapt ? "disabled" : "enabled",
+                   ctx->encode_config.rcParams.disableBadapt ? "disabled" : "enabled");
         }
     }
 
     if (ctx->strict_gop) {
-        ctx->config.rcParams.strictGOPTarget = 1;
+        ctx->encode_config.rcParams.strictGOPTarget = 1;
         av_log(avctx, AV_LOG_VERBOSE, "Strict GOP target enabled.\n");
     }
 
     if (ctx->nonref_p)
-        ctx->config.rcParams.enableNonRefP = 1;
+        ctx->encode_config.rcParams.enableNonRefP = 1;
 
     if (ctx->zerolatency)
-        ctx->config.rcParams.zeroReorderDelay = 1;
+        ctx->encode_config.rcParams.zeroReorderDelay = 1;
 
     if (ctx->quality)
-        ctx->config.rcParams.targetQuality = ctx->quality;
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
+    {
+        //convert from float to fixed point 8.8
+        int tmp_quality = (int)(ctx->quality * 256.0f);
+        ctx->encode_config.rcParams.targetQuality = (uint8_t)(tmp_quality >> 8);
+        ctx->encode_config.rcParams.targetQualityLSB = (uint8_t)(tmp_quality & 0xff);
+    }
 }
 
-static int nvenc_setup_h264_config(AVCodecContext *avctx)
+static av_cold int nvenc_setup_h264_config(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                      = avctx->priv_data;
-    NV_ENC_CONFIG *cc                      = &ctx->config;
+    NvencContext *ctx                      = avctx->priv_data;
+    NV_ENC_CONFIG *cc                      = &ctx->encode_config;
     NV_ENC_CONFIG_H264 *h264               = &cc->encodeCodecConfig.h264Config;
     NV_ENC_CONFIG_H264_VUI_PARAMETERS *vui = &h264->h264VUIParameters;
 
-    vui->colourDescriptionPresentFlag = avctx->colorspace      != AVCOL_SPC_UNSPECIFIED ||
-                                        avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
-                                        avctx->color_trc       != AVCOL_TRC_UNSPECIFIED;
-
-    vui->colourMatrix            = avctx->colorspace;
-    vui->colourPrimaries         = avctx->color_primaries;
+    vui->colourMatrix = avctx->colorspace;
+    vui->colourPrimaries = avctx->color_primaries;
     vui->transferCharacteristics = avctx->color_trc;
+    vui->videoFullRangeFlag = (avctx->color_range == AVCOL_RANGE_JPEG
+        || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ420P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ422P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ444P);
+
+    vui->colourDescriptionPresentFlag =
+        (avctx->colorspace != 2 || avctx->color_primaries != 2 || avctx->color_trc != 2);
 
-    vui->videoFullRangeFlag = avctx->color_range == AVCOL_RANGE_JPEG;
+    vui->videoSignalTypePresentFlag =
+        (vui->colourDescriptionPresentFlag
+        || vui->videoFormat != 5
+        || vui->videoFullRangeFlag != 0);
 
-    vui->videoSignalTypePresentFlag = vui->colourDescriptionPresentFlag ||
-                                      vui->videoFullRangeFlag;
+    h264->sliceMode = 3;
+    h264->sliceModeData = 1;
 
     h264->disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
     h264->repeatSPSPPS  = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
-    h264->outputAUD     = 1;
+    h264->outputAUD     = ctx->aud;
 
-    h264->maxNumRefFrames = avctx->refs;
-    h264->idrPeriod       = cc->gopLength;
-
-    h264->sliceMode     = 3;
-    h264->sliceModeData = FFMAX(avctx->slices, 1);
-
-    if (ctx->flags & NVENC_LOSSLESS)
-        h264->qpPrimeYZeroTransformBypassFlag = 1;
+    if (avctx->refs >= 0) {
+        /* 0 means "let the hardware decide" */
+        h264->maxNumRefFrames = avctx->refs;
+    }
+    if (avctx->gop_size >= 0) {
+        h264->idrPeriod = cc->gopLength;
+    }
 
     if (IS_CBR(cc->rcParams.rateControlMode)) {
         h264->outputBufferingPeriodSEI = 1;
-        h264->outputPictureTimingSEI   = 1;
     }
 
-    if (ctx->profile)
-        avctx->profile = ctx->profile;
+    h264->outputPictureTimingSEI = 1;
 
-    if (ctx->data_pix_fmt == AV_PIX_FMT_YUV444P)
-        h264->chromaFormatIDC = 3;
-    else
-        h264->chromaFormatIDC = 1;
+    if (cc->rcParams.rateControlMode == NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ ||
+        cc->rcParams.rateControlMode == NV_ENC_PARAMS_RC_CBR_HQ ||
+        cc->rcParams.rateControlMode == NV_ENC_PARAMS_RC_VBR_HQ) {
+        h264->adaptiveTransformMode = NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
+        h264->fmoMode = NV_ENC_H264_FMO_DISABLE;
+    }
 
-    switch (ctx->profile) {
-    case NV_ENC_H264_PROFILE_BASELINE:
-        cc->profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_MAIN:
-        cc->profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_HIGH:
-        cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_HIGH_444:
-        cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_CONSTRAINED_HIGH:
-        cc->profileGUID = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH_GUID;
-        break;
+    if (ctx->flags & NVENC_LOSSLESS) {
+        h264->qpPrimeYZeroTransformBypassFlag = 1;
+    } else {
+        switch(ctx->profile) {
+        case NV_ENC_H264_PROFILE_BASELINE:
+            cc->profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
+            avctx->profile = FF_PROFILE_H264_BASELINE;
+            break;
+        case NV_ENC_H264_PROFILE_MAIN:
+            cc->profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
+            avctx->profile = FF_PROFILE_H264_MAIN;
+            break;
+        case NV_ENC_H264_PROFILE_HIGH:
+            cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
+            avctx->profile = FF_PROFILE_H264_HIGH;
+            break;
+        case NV_ENC_H264_PROFILE_HIGH_444P:
+            cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
+            avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
+            break;
+        }
     }
 
+    // force setting profile as high444p if input is AV_PIX_FMT_YUV444P
     if (ctx->data_pix_fmt == AV_PIX_FMT_YUV444P) {
         cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
         avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
     }
 
+    h264->chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
+
     h264->level = ctx->level;
 
+    if (ctx->coder >= 0)
+        h264->entropyCodingMode = ctx->coder;
+
+#ifdef NVENC_HAVE_BFRAME_REF_MODE
+    h264->useBFramesAsRef = ctx->b_ref_mode;
+#endif
+
     return 0;
 }
 
-static int nvenc_setup_hevc_config(AVCodecContext *avctx)
+static av_cold int nvenc_setup_hevc_config(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                      = avctx->priv_data;
-    NV_ENC_CONFIG *cc                      = &ctx->config;
+    NvencContext *ctx                      = avctx->priv_data;
+    NV_ENC_CONFIG *cc                      = &ctx->encode_config;
     NV_ENC_CONFIG_HEVC *hevc               = &cc->encodeCodecConfig.hevcConfig;
     NV_ENC_CONFIG_HEVC_VUI_PARAMETERS *vui = &hevc->hevcVUIParameters;
 
-    vui->colourDescriptionPresentFlag = avctx->colorspace      != AVCOL_SPC_UNSPECIFIED ||
-                                        avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
-                                        avctx->color_trc       != AVCOL_TRC_UNSPECIFIED;
-
-    vui->colourMatrix            = avctx->colorspace;
-    vui->colourPrimaries         = avctx->color_primaries;
+    vui->colourMatrix = avctx->colorspace;
+    vui->colourPrimaries = avctx->color_primaries;
     vui->transferCharacteristics = avctx->color_trc;
+    vui->videoFullRangeFlag = (avctx->color_range == AVCOL_RANGE_JPEG
+        || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ420P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ422P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ444P);
 
-    vui->videoFullRangeFlag = avctx->color_range == AVCOL_RANGE_JPEG;
+    vui->colourDescriptionPresentFlag =
+        (avctx->colorspace != 2 || avctx->color_primaries != 2 || avctx->color_trc != 2);
 
-    vui->videoSignalTypePresentFlag = vui->colourDescriptionPresentFlag ||
-                                      vui->videoFullRangeFlag;
+    vui->videoSignalTypePresentFlag =
+        (vui->colourDescriptionPresentFlag
+        || vui->videoFormat != 5
+        || vui->videoFullRangeFlag != 0);
+
+    hevc->sliceMode = 3;
+    hevc->sliceModeData = 1;
 
     hevc->disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
     hevc->repeatSPSPPS  = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
-    hevc->outputAUD     = 1;
+    hevc->outputAUD     = ctx->aud;
 
-    hevc->maxNumRefFramesInDPB = avctx->refs;
-    hevc->idrPeriod            = cc->gopLength;
+    if (avctx->refs >= 0) {
+        /* 0 means "let the hardware decide" */
+        hevc->maxNumRefFramesInDPB = avctx->refs;
+    }
+    if (avctx->gop_size >= 0) {
+        hevc->idrPeriod = cc->gopLength;
+    }
 
     if (IS_CBR(cc->rcParams.rateControlMode)) {
         hevc->outputBufferingPeriodSEI = 1;
-        hevc->outputPictureTimingSEI   = 1;
     }
 
+    hevc->outputPictureTimingSEI = 1;
+
     switch (ctx->profile) {
     case NV_ENC_HEVC_PROFILE_MAIN:
         cc->profileGUID = NV_ENC_HEVC_PROFILE_MAIN_GUID;
         avctx->profile  = FF_PROFILE_HEVC_MAIN;
         break;
-#if NVENCAPI_MAJOR_VERSION >= 7
     case NV_ENC_HEVC_PROFILE_MAIN_10:
         cc->profileGUID = NV_ENC_HEVC_PROFILE_MAIN10_GUID;
         avctx->profile  = FF_PROFILE_HEVC_MAIN_10;
@@ -818,177 +1060,138 @@ static int nvenc_setup_hevc_config(AVCodecContext *avctx)
         cc->profileGUID = NV_ENC_HEVC_PROFILE_FREXT_GUID;
         avctx->profile  = FF_PROFILE_HEVC_REXT;
         break;
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
     }
 
-    // force setting profile for various input formats
-    switch (ctx->data_pix_fmt) {
-    case AV_PIX_FMT_YUV420P:
-    case AV_PIX_FMT_NV12:
-        cc->profileGUID = NV_ENC_HEVC_PROFILE_MAIN_GUID;
-        avctx->profile  = FF_PROFILE_HEVC_MAIN;
-        break;
-#if NVENCAPI_MAJOR_VERSION >= 7
-    case AV_PIX_FMT_P010:
+    // force setting profile as main10 if input is 10 bit
+    if (IS_10BIT(ctx->data_pix_fmt)) {
         cc->profileGUID = NV_ENC_HEVC_PROFILE_MAIN10_GUID;
-        avctx->profile  = FF_PROFILE_HEVC_MAIN_10;
-        break;
-    case AV_PIX_FMT_YUV444P:
-    case AV_PIX_FMT_YUV444P16:
+        avctx->profile = FF_PROFILE_HEVC_MAIN_10;
+    }
+
+    // force setting profile as rext if input is yuv444
+    if (IS_YUV444(ctx->data_pix_fmt)) {
         cc->profileGUID = NV_ENC_HEVC_PROFILE_FREXT_GUID;
-        avctx->profile  = FF_PROFILE_HEVC_REXT;
-        break;
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
+        avctx->profile = FF_PROFILE_HEVC_REXT;
     }
 
-#if NVENCAPI_MAJOR_VERSION >= 7
-    hevc->chromaFormatIDC     = IS_YUV444(ctx->data_pix_fmt) ? 3 : 1;
-    hevc->pixelBitDepthMinus8 = IS_10BIT(ctx->data_pix_fmt)  ? 2 : 0;
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
+    hevc->chromaFormatIDC = IS_YUV444(ctx->data_pix_fmt) ? 3 : 1;
 
-    hevc->sliceMode     = 3;
-    hevc->sliceModeData = FFMAX(avctx->slices, 1);
+    hevc->pixelBitDepthMinus8 = IS_10BIT(ctx->data_pix_fmt) ? 2 : 0;
 
-    if (ctx->level) {
-        hevc->level = ctx->level;
-    } else {
-        hevc->level = NV_ENC_LEVEL_AUTOSELECT;
-    }
+    hevc->level = ctx->level;
 
-    if (ctx->tier) {
-        hevc->tier = ctx->tier;
-    }
+    hevc->tier = ctx->tier;
+
+#ifdef NVENC_HAVE_HEVC_BFRAME_REF_MODE
+    hevc->useBFramesAsRef = ctx->b_ref_mode;
+#endif
 
     return 0;
 }
-static int nvenc_setup_codec_config(AVCodecContext *avctx)
+
+static av_cold int nvenc_setup_codec_config(AVCodecContext *avctx)
 {
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
         return nvenc_setup_h264_config(avctx);
     case AV_CODEC_ID_HEVC:
         return nvenc_setup_hevc_config(avctx);
+    /* Earlier switch/case will return if unknown codec is passed. */
     }
+
     return 0;
 }
 
-static int nvenc_recalc_surfaces(AVCodecContext *avctx)
-{
-    NVENCContext *ctx = avctx->priv_data;
-    // default minimum of 4 surfaces
-    // multiply by 2 for number of NVENCs on gpu (hardcode to 2)
-    // another multiply by 2 to avoid blocking next PBB group
-    int nb_surfaces = FFMAX(4, ctx->config.frameIntervalP * 2 * 2);
+static void compute_dar(AVCodecContext *avctx, int *dw, int *dh) {
+    int sw, sh;
 
-    // lookahead enabled
-    if (ctx->rc_lookahead > 0) {
-        // +1 is to account for lkd_bound calculation later
-        // +4 is to allow sufficient pipelining with lookahead
-        nb_surfaces = FFMAX(1, FFMAX(nb_surfaces, ctx->rc_lookahead + ctx->config.frameIntervalP + 1 + 4));
-        if (nb_surfaces > ctx->nb_surfaces && ctx->nb_surfaces > 0) {
-            av_log(avctx, AV_LOG_WARNING,
-                "Defined rc_lookahead requires more surfaces, "
-                "increasing used surfaces %d -> %d\n",
-                ctx->nb_surfaces, nb_surfaces);
-        }
-        ctx->nb_surfaces = FFMAX(nb_surfaces, ctx->nb_surfaces);
-    } else {
-        if (ctx->config.frameIntervalP > 1 &&
-            ctx->nb_surfaces < nb_surfaces && ctx->nb_surfaces > 0) {
-            av_log(avctx, AV_LOG_WARNING,
-                "Defined b-frame requires more surfaces, "
-                "increasing used surfaces %d -> %d\n",
-                ctx->nb_surfaces, nb_surfaces);
-            ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, nb_surfaces);
-        } else if (ctx->nb_surfaces <= 0)
-            ctx->nb_surfaces = nb_surfaces;
-        // otherwise use user specified value
+    sw = avctx->width;
+    sh = avctx->height;
+
+    if (avctx->sample_aspect_ratio.num > 0 && avctx->sample_aspect_ratio.den > 0) {
+        sw *= avctx->sample_aspect_ratio.num;
+        sh *= avctx->sample_aspect_ratio.den;
     }
 
-    ctx->nb_surfaces = FFMAX(1, FFMIN(MAX_REGISTERED_FRAMES, ctx->nb_surfaces));
-    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1);
-    return 0;
+    av_reduce(dw, dh, sw, sh, 1024 * 1024);
 }
 
-static int nvenc_setup_encoder(AVCodecContext *avctx)
+static av_cold int nvenc_setup_encoder(AVCodecContext *avctx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_PRESET_CONFIG preset_cfg = { 0 };
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    NV_ENC_PRESET_CONFIG preset_config = { 0 };
+    NVENCSTATUS nv_status = NV_ENC_SUCCESS;
     AVCPBProperties *cpb_props;
-    int ret;
+    int res = 0;
+    int dw, dh;
 
-    ctx->params.version = NV_ENC_INITIALIZE_PARAMS_VER;
+    ctx->encode_config.version = NV_ENC_CONFIG_VER;
+    ctx->init_encode_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
 
-    ctx->params.encodeHeight = avctx->height;
-    ctx->params.encodeWidth  = avctx->width;
+    ctx->init_encode_params.encodeHeight = avctx->height;
+    ctx->init_encode_params.encodeWidth = avctx->width;
 
-    if (avctx->sample_aspect_ratio.num &&
-        avctx->sample_aspect_ratio.den &&
-        (avctx->sample_aspect_ratio.num != 1 ||
-         avctx->sample_aspect_ratio.den != 1)) {
-        av_reduce(&ctx->params.darWidth,
-                  &ctx->params.darHeight,
-                  avctx->width * avctx->sample_aspect_ratio.num,
-                  avctx->height * avctx->sample_aspect_ratio.den,
-                  INT_MAX / 8);
-    } else {
-        ctx->params.darHeight = avctx->height;
-        ctx->params.darWidth  = avctx->width;
-    }
+    ctx->init_encode_params.encodeConfig = &ctx->encode_config;
 
-    // De-compensate for hardware, dubiously, trying to compensate for
-    // playback at 704 pixel width.
-    if (avctx->width == 720 && (avctx->height == 480 || avctx->height == 576)) {
-        av_reduce(&ctx->params.darWidth, &ctx->params.darHeight,
-                  ctx->params.darWidth * 44,
-                  ctx->params.darHeight * 45,
-                  1024 * 1024);
-    }
+    nvenc_map_preset(ctx);
 
-    ctx->params.frameRateNum = avctx->time_base.den;
-    ctx->params.frameRateDen = avctx->time_base.num * avctx->ticks_per_frame;
+    preset_config.version = NV_ENC_PRESET_CONFIG_VER;
+    preset_config.presetCfg.version = NV_ENC_CONFIG_VER;
 
-    ctx->params.enableEncodeAsync = 0;
-    ctx->params.enablePTD         = 1;
+    nv_status = p_nvenc->nvEncGetEncodePresetConfig(ctx->nvencoder,
+                                                    ctx->init_encode_params.encodeGUID,
+                                                    ctx->init_encode_params.presetGUID,
+                                                    &preset_config);
+    if (nv_status != NV_ENC_SUCCESS)
+        return nvenc_print_error(avctx, nv_status, "Cannot get the preset configuration");
 
-    ctx->params.encodeConfig = &ctx->config;
+    memcpy(&ctx->encode_config, &preset_config.presetCfg, sizeof(ctx->encode_config));
 
-    nvenc_map_preset(ctx);
+    ctx->encode_config.version = NV_ENC_CONFIG_VER;
 
-    preset_cfg.version           = NV_ENC_PRESET_CONFIG_VER;
-    preset_cfg.presetCfg.version = NV_ENC_CONFIG_VER;
+    compute_dar(avctx, &dw, &dh);
+    ctx->init_encode_params.darHeight = dh;
+    ctx->init_encode_params.darWidth = dw;
 
-    ret = nv->nvEncGetEncodePresetConfig(ctx->nvenc_ctx,
-                                         ctx->params.encodeGUID,
-                                         ctx->params.presetGUID,
-                                         &preset_cfg);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "Cannot get the preset configuration");
+    ctx->init_encode_params.frameRateNum = avctx->time_base.den;
+    ctx->init_encode_params.frameRateDen = avctx->time_base.num * avctx->ticks_per_frame;
 
-    memcpy(&ctx->config, &preset_cfg.presetCfg, sizeof(ctx->config));
+    ctx->init_encode_params.enableEncodeAsync = 0;
+    ctx->init_encode_params.enablePTD = 1;
 
-    ctx->config.version = NV_ENC_CONFIG_VER;
+    if (ctx->weighted_pred == 1)
+        ctx->init_encode_params.enableWeightedPrediction = 1;
+
+    if (ctx->bluray_compat) {
+        ctx->aud = 1;
+        avctx->refs = FFMIN(FFMAX(avctx->refs, 0), 6);
+        avctx->max_b_frames = FFMIN(avctx->max_b_frames, 3);
+        switch (avctx->codec->id) {
+        case AV_CODEC_ID_H264:
+            /* maximum level depends on used resolution */
+            break;
+        case AV_CODEC_ID_HEVC:
+            ctx->level = NV_ENC_LEVEL_HEVC_51;
+            ctx->tier = NV_ENC_TIER_HEVC_HIGH;
+            break;
+        }
+    }
 
     if (avctx->gop_size > 0) {
-        if (avctx->max_b_frames > 0) {
-            /* 0 is intra-only,
-             * 1 is I/P only,
-             * 2 is one B-Frame,
-             * 3 two B-frames, and so on. */
-            ctx->config.frameIntervalP = avctx->max_b_frames + 1;
-        } else if (avctx->max_b_frames == 0) {
-            ctx->config.frameIntervalP = 1;
+        if (avctx->max_b_frames >= 0) {
+            /* 0 is intra-only, 1 is I/P only, 2 is one B-Frame, 3 two B-frames, and so on. */
+            ctx->encode_config.frameIntervalP = avctx->max_b_frames + 1;
         }
-        ctx->config.gopLength = avctx->gop_size;
+
+        ctx->encode_config.gopLength = avctx->gop_size;
     } else if (avctx->gop_size == 0) {
-        ctx->config.frameIntervalP = 0;
-        ctx->config.gopLength      = 1;
+        ctx->encode_config.frameIntervalP = 0;
+        ctx->encode_config.gopLength = 1;
     }
 
-    if (ctx->config.frameIntervalP > 1)
-        avctx->max_b_frames = ctx->config.frameIntervalP - 1;
-
     ctx->initial_pts[0] = AV_NOPTS_VALUE;
     ctx->initial_pts[1] = AV_NOPTS_VALUE;
 
@@ -997,216 +1200,280 @@ static int nvenc_setup_encoder(AVCodecContext *avctx)
     nvenc_setup_rate_control(avctx);
 
     if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
-        ctx->config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
+        ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
     } else {
-        ctx->config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
+        ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
     }
 
-    if ((ret = nvenc_setup_codec_config(avctx)) < 0)
-        return ret;
+    res = nvenc_setup_codec_config(avctx);
+    if (res)
+        return res;
+
+    res = nvenc_push_context(avctx);
+    if (res < 0)
+        return res;
 
-    ret = nv->nvEncInitializeEncoder(ctx->nvenc_ctx, &ctx->params);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "InitializeEncoder failed");
+    nv_status = p_nvenc->nvEncInitializeEncoder(ctx->nvencoder, &ctx->init_encode_params);
+
+    res = nvenc_pop_context(avctx);
+    if (res < 0)
+        return res;
+
+    if (nv_status != NV_ENC_SUCCESS) {
+        return nvenc_print_error(avctx, nv_status, "InitializeEncoder failed");
+    }
+
+    if (ctx->encode_config.frameIntervalP > 1)
+        avctx->has_b_frames = 2;
+
+    if (ctx->encode_config.rcParams.averageBitRate > 0)
+        avctx->bit_rate = ctx->encode_config.rcParams.averageBitRate;
 
     cpb_props = ff_add_cpb_side_data(avctx);
     if (!cpb_props)
         return AVERROR(ENOMEM);
-    cpb_props->max_bitrate = avctx->rc_max_rate;
-    cpb_props->min_bitrate = avctx->rc_min_rate;
+    cpb_props->max_bitrate = ctx->encode_config.rcParams.maxBitRate;
     cpb_props->avg_bitrate = avctx->bit_rate;
-    cpb_props->buffer_size = avctx->rc_buffer_size;
+    cpb_props->buffer_size = ctx->encode_config.rcParams.vbvBufferSize;
 
     return 0;
 }
 
-static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
+static NV_ENC_BUFFER_FORMAT nvenc_map_buffer_format(enum AVPixelFormat pix_fmt)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NVENCFrame *tmp_surface         = &ctx->frames[idx];
-    int ret;
-    NV_ENC_CREATE_BITSTREAM_BUFFER out_buffer = { 0 };
-
-    switch (ctx->data_pix_fmt) {
+    switch (pix_fmt) {
     case AV_PIX_FMT_YUV420P:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_YV12_PL;
-        break;
+        return NV_ENC_BUFFER_FORMAT_YV12_PL;
     case AV_PIX_FMT_NV12:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
-        break;
-    case AV_PIX_FMT_YUV444P:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_YUV444_PL;
-        break;
-#if NVENCAPI_MAJOR_VERSION >= 7
+        return NV_ENC_BUFFER_FORMAT_NV12_PL;
     case AV_PIX_FMT_P010:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_YUV420_10BIT;
-        break;
+    case AV_PIX_FMT_P016:
+        return NV_ENC_BUFFER_FORMAT_YUV420_10BIT;
+    case AV_PIX_FMT_YUV444P:
+        return NV_ENC_BUFFER_FORMAT_YUV444_PL;
     case AV_PIX_FMT_YUV444P16:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_YUV444_10BIT;
-        break;
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
+        return NV_ENC_BUFFER_FORMAT_YUV444_10BIT;
+    case AV_PIX_FMT_0RGB32:
+        return NV_ENC_BUFFER_FORMAT_ARGB;
+    case AV_PIX_FMT_0BGR32:
+        return NV_ENC_BUFFER_FORMAT_ABGR;
     default:
-        return AVERROR_BUG;
+        return NV_ENC_BUFFER_FORMAT_UNDEFINED;
     }
+}
 
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-        ctx->frames[idx].in_ref = av_frame_alloc();
-        if (!ctx->frames[idx].in_ref)
+static av_cold int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
+{
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    NvencSurface* tmp_surface = &ctx->surfaces[idx];
+
+    NVENCSTATUS nv_status;
+    NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
+    allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA || avctx->pix_fmt == AV_PIX_FMT_D3D11) {
+        ctx->surfaces[idx].in_ref = av_frame_alloc();
+        if (!ctx->surfaces[idx].in_ref)
             return AVERROR(ENOMEM);
     } else {
-        NV_ENC_CREATE_INPUT_BUFFER in_buffer      = { 0 };
-
-        in_buffer.version  = NV_ENC_CREATE_INPUT_BUFFER_VER;
+        NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
 
-        in_buffer.width  = avctx->width;
-        in_buffer.height = avctx->height;
+        ctx->surfaces[idx].format = nvenc_map_buffer_format(ctx->data_pix_fmt);
+        if (ctx->surfaces[idx].format == NV_ENC_BUFFER_FORMAT_UNDEFINED) {
+            av_log(avctx, AV_LOG_FATAL, "Invalid input pixel format: %s\n",
+                   av_get_pix_fmt_name(ctx->data_pix_fmt));
+            return AVERROR(EINVAL);
+        }
 
-        in_buffer.bufferFmt  = ctx->frames[idx].format;
-        in_buffer.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_UNCACHED;
+        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
+        allocSurf.width = avctx->width;
+        allocSurf.height = avctx->height;
+        allocSurf.bufferFmt = ctx->surfaces[idx].format;
 
-        ret = nv->nvEncCreateInputBuffer(ctx->nvenc_ctx, &in_buffer);
-        if (ret != NV_ENC_SUCCESS)
-            return nvenc_print_error(avctx, ret, "CreateInputBuffer failed");
+        nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+        if (nv_status != NV_ENC_SUCCESS) {
+            return nvenc_print_error(avctx, nv_status, "CreateInputBuffer failed");
+        }
 
-        ctx->frames[idx].in     = in_buffer.inputBuffer;
+        ctx->surfaces[idx].input_surface = allocSurf.inputBuffer;
+        ctx->surfaces[idx].width = allocSurf.width;
+        ctx->surfaces[idx].height = allocSurf.height;
     }
 
-    out_buffer.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
-    /* 1MB is large enough to hold most output frames.
-     * NVENC increases this automatically if it is not enough. */
-    out_buffer.size = BITSTREAM_BUFFER_SIZE;
-
-    out_buffer.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_UNCACHED;
-
-    ret = nv->nvEncCreateBitstreamBuffer(ctx->nvenc_ctx, &out_buffer);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "CreateBitstreamBuffer failed");
+    nv_status = p_nvenc->nvEncCreateBitstreamBuffer(ctx->nvencoder, &allocOut);
+    if (nv_status != NV_ENC_SUCCESS) {
+        int err = nvenc_print_error(avctx, nv_status, "CreateBitstreamBuffer failed");
+        if (avctx->pix_fmt != AV_PIX_FMT_CUDA && avctx->pix_fmt != AV_PIX_FMT_D3D11)
+            p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->surfaces[idx].input_surface);
+        av_frame_free(&ctx->surfaces[idx].in_ref);
+        return err;
+    }
 
-    ctx->frames[idx].out  = out_buffer.bitstreamBuffer;
+    ctx->surfaces[idx].output_surface = allocOut.bitstreamBuffer;
+    ctx->surfaces[idx].size = allocOut.size;
 
     av_fifo_generic_write(ctx->unused_surface_queue, &tmp_surface, sizeof(tmp_surface), NULL);
 
     return 0;
 }
 
-static int nvenc_setup_surfaces(AVCodecContext *avctx)
+static av_cold int nvenc_setup_surfaces(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
-    int i, ret;
+    NvencContext *ctx = avctx->priv_data;
+    int i, res = 0, res2;
 
-    ctx->frames = av_mallocz_array(ctx->nb_surfaces, sizeof(*ctx->frames));
-    if (!ctx->frames)
+    ctx->surfaces = av_mallocz_array(ctx->nb_surfaces, sizeof(*ctx->surfaces));
+    if (!ctx->surfaces)
         return AVERROR(ENOMEM);
 
-    ctx->timestamps = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
-    if (!ctx->timestamps)
+    ctx->timestamp_list = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
+    if (!ctx->timestamp_list)
         return AVERROR(ENOMEM);
-    ctx->unused_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NVENCFrame*));
+
+    ctx->unused_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
     if (!ctx->unused_surface_queue)
         return AVERROR(ENOMEM);
-    ctx->pending = av_fifo_alloc(ctx->nb_surfaces * sizeof(*ctx->frames));
-    if (!ctx->pending)
+
+    ctx->output_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
+    if (!ctx->output_surface_queue)
         return AVERROR(ENOMEM);
-    ctx->ready = av_fifo_alloc(ctx->nb_surfaces * sizeof(*ctx->frames));
-    if (!ctx->ready)
+    ctx->output_surface_ready_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
+    if (!ctx->output_surface_ready_queue)
         return AVERROR(ENOMEM);
 
+    res = nvenc_push_context(avctx);
+    if (res < 0)
+        return res;
+
     for (i = 0; i < ctx->nb_surfaces; i++) {
-        if ((ret = nvenc_alloc_surface(avctx, i)) < 0)
-            return ret;
+        if ((res = nvenc_alloc_surface(avctx, i)) < 0)
+            goto fail;
     }
 
-    return 0;
-}
+fail:
+    res2 = nvenc_pop_context(avctx);
+    if (res2 < 0)
+        return res2;
 
-#define EXTRADATA_SIZE 512
+    return res;
+}
 
-static int nvenc_setup_extradata(AVCodecContext *avctx)
+static av_cold int nvenc_setup_extradata(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                     = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv       = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    NVENCSTATUS nv_status;
+    uint32_t outSize = 0;
+    char tmpHeader[256];
     NV_ENC_SEQUENCE_PARAM_PAYLOAD payload = { 0 };
-    int ret;
+    payload.version = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
 
-    avctx->extradata = av_mallocz(EXTRADATA_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!avctx->extradata)
-        return AVERROR(ENOMEM);
+    payload.spsppsBuffer = tmpHeader;
+    payload.inBufferSize = sizeof(tmpHeader);
+    payload.outSPSPPSPayloadSize = &outSize;
 
-    payload.version              = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
-    payload.spsppsBuffer         = avctx->extradata;
-    payload.inBufferSize         = EXTRADATA_SIZE;
-    payload.outSPSPPSPayloadSize = &avctx->extradata_size;
+    nv_status = p_nvenc->nvEncGetSequenceParams(ctx->nvencoder, &payload);
+    if (nv_status != NV_ENC_SUCCESS) {
+        return nvenc_print_error(avctx, nv_status, "GetSequenceParams failed");
+    }
+
+    avctx->extradata_size = outSize;
+    avctx->extradata = av_mallocz(outSize + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if (!avctx->extradata) {
+        return AVERROR(ENOMEM);
+    }
 
-    ret = nv->nvEncGetSequenceParams(ctx->nvenc_ctx, &payload);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "Cannot get the extradata");
+    memcpy(avctx->extradata, tmpHeader, outSize);
 
     return 0;
 }
 
 av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int i;
+    NvencContext *ctx               = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    int i, res;
 
     /* the encoder has to be flushed before it can be closed */
-    if (ctx->nvenc_ctx) {
+    if (ctx->nvencoder) {
         NV_ENC_PIC_PARAMS params        = { .version        = NV_ENC_PIC_PARAMS_VER,
                                             .encodePicFlags = NV_ENC_PIC_FLAG_EOS };
 
-        nv->nvEncEncodePicture(ctx->nvenc_ctx, &params);
+        res = nvenc_push_context(avctx);
+        if (res < 0)
+            return res;
+
+        p_nvenc->nvEncEncodePicture(ctx->nvencoder, &params);
     }
 
-    av_fifo_free(ctx->timestamps);
-    av_fifo_free(ctx->pending);
-    av_fifo_free(ctx->ready);
-    av_fifo_free(ctx->unused_surface_queue);
+    av_fifo_freep(&ctx->timestamp_list);
+    av_fifo_freep(&ctx->output_surface_ready_queue);
+    av_fifo_freep(&ctx->output_surface_queue);
+    av_fifo_freep(&ctx->unused_surface_queue);
 
-    if (ctx->frames) {
-        for (i = 0; i < ctx->nb_surfaces; ++i) {
-            if (avctx->pix_fmt != AV_PIX_FMT_CUDA) {
-                nv->nvEncDestroyInputBuffer(ctx->nvenc_ctx, ctx->frames[i].in);
-            } else if (ctx->frames[i].in) {
-                nv->nvEncUnmapInputResource(ctx->nvenc_ctx, ctx->frames[i].in_map.mappedResource);
-            }
-
-            av_frame_free(&ctx->frames[i].in_ref);
-            nv->nvEncDestroyBitstreamBuffer(ctx->nvenc_ctx, ctx->frames[i].out);
+    if (ctx->surfaces && (avctx->pix_fmt == AV_PIX_FMT_CUDA || avctx->pix_fmt == AV_PIX_FMT_D3D11)) {
+        for (i = 0; i < ctx->nb_registered_frames; i++) {
+            if (ctx->registered_frames[i].mapped)
+                p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, ctx->registered_frames[i].in_map.mappedResource);
+            if (ctx->registered_frames[i].regptr)
+                p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->registered_frames[i].regptr);
         }
+        ctx->nb_registered_frames = 0;
     }
-    for (i = 0; i < ctx->nb_registered_frames; i++) {
-        if (ctx->registered_frames[i].regptr)
-            nv->nvEncUnregisterResource(ctx->nvenc_ctx, ctx->registered_frames[i].regptr);
+
+    if (ctx->surfaces) {
+        for (i = 0; i < ctx->nb_surfaces; ++i) {
+            if (avctx->pix_fmt != AV_PIX_FMT_CUDA && avctx->pix_fmt != AV_PIX_FMT_D3D11)
+                p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->surfaces[i].input_surface);
+            av_frame_free(&ctx->surfaces[i].in_ref);
+            p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->surfaces[i].output_surface);
+        }
     }
-    ctx->nb_registered_frames = 0;
+    av_freep(&ctx->surfaces);
+    ctx->nb_surfaces = 0;
 
-    av_freep(&ctx->frames);
+    if (ctx->nvencoder) {
+        p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
 
-    if (ctx->nvenc_ctx)
-        nv->nvEncDestroyEncoder(ctx->nvenc_ctx);
+        res = nvenc_pop_context(avctx);
+        if (res < 0)
+            return res;
+    }
+    ctx->nvencoder = NULL;
 
     if (ctx->cu_context_internal)
-        ctx->nvel.cu_ctx_destroy(ctx->cu_context_internal);
-
-    if (ctx->nvel.nvenc)
-        dlclose(ctx->nvel.nvenc);
+        CHECK_CU(dl_fn->cuda_dl->cuCtxDestroy(ctx->cu_context_internal));
+    ctx->cu_context = ctx->cu_context_internal = NULL;
 
-#if !CONFIG_CUDA
-    if (ctx->nvel.cuda)
-        dlclose(ctx->nvel.cuda);
+#if CONFIG_D3D11VA
+    if (ctx->d3d11_device) {
+        ID3D11Device_Release(ctx->d3d11_device);
+        ctx->d3d11_device = NULL;
+    }
 #endif
 
+    nvenc_free_functions(&dl_fn->nvenc_dl);
+    cuda_free_functions(&dl_fn->cuda_dl);
+
+    dl_fn->nvenc_device_count = 0;
+
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
+
     return 0;
 }
 
 av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
     int ret;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA || avctx->pix_fmt == AV_PIX_FMT_D3D11) {
         AVHWFramesContext *frames_ctx;
         if (!avctx->hw_frames_ctx) {
             av_log(avctx, AV_LOG_ERROR,
@@ -1214,6 +1481,11 @@ av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
             return AVERROR(EINVAL);
         }
         frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+        if (frames_ctx->format != avctx->pix_fmt) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "hw_frames_ctx must match the GPU frame type\n");
+            return AVERROR(EINVAL);
+        }
         ctx->data_pix_fmt = frames_ctx->sw_format;
     } else {
         ctx->data_pix_fmt = avctx->pix_fmt;
@@ -1239,9 +1511,9 @@ av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static NVENCFrame *get_free_frame(NVENCContext *ctx)
+static NvencSurface *get_free_frame(NvencContext *ctx)
 {
-    NVENCFrame *tmp_surf;
+    NvencSurface *tmp_surf;
 
     if (!(av_fifo_size(ctx->unused_surface_queue) > 0))
         // queue empty
@@ -1251,97 +1523,53 @@ static NVENCFrame *get_free_frame(NVENCContext *ctx)
     return tmp_surf;
 }
 
-static int nvenc_copy_frame(NV_ENC_LOCK_INPUT_BUFFER *in, const AVFrame *frame)
+static int nvenc_copy_frame(AVCodecContext *avctx, NvencSurface *nv_surface,
+            NV_ENC_LOCK_INPUT_BUFFER *lock_buffer_params, const AVFrame *frame)
 {
-    uint8_t *buf = in->bufferDataPtr;
-    int off      = frame->height * in->pitch;
+    int dst_linesize[4] = {
+        lock_buffer_params->pitch,
+        lock_buffer_params->pitch,
+        lock_buffer_params->pitch,
+        lock_buffer_params->pitch
+    };
+    uint8_t *dst_data[4];
+    int ret;
 
-    switch (frame->format) {
-    case AV_PIX_FMT_YUV420P:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
-        buf += off;
+    if (frame->format == AV_PIX_FMT_YUV420P)
+        dst_linesize[1] = dst_linesize[2] >>= 1;
 
-        av_image_copy_plane(buf, in->pitch >> 1,
-                            frame->data[2], frame->linesize[2],
-                            frame->width >> 1, frame->height >> 1);
+    ret = av_image_fill_pointers(dst_data, frame->format, nv_surface->height,
+                                 lock_buffer_params->bufferDataPtr, dst_linesize);
+    if (ret < 0)
+        return ret;
 
-        buf += off >> 2;
+    if (frame->format == AV_PIX_FMT_YUV420P)
+        FFSWAP(uint8_t*, dst_data[1], dst_data[2]);
 
-        av_image_copy_plane(buf, in->pitch >> 1,
-                            frame->data[1], frame->linesize[1],
-                            frame->width >> 1, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_NV12:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_P010:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width << 1, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width << 1, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_YUV444P:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[2], frame->linesize[2],
-                            frame->width, frame->height);
-        break;
-    case AV_PIX_FMT_YUV444P16:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width << 1, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width << 1, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[2], frame->linesize[2],
-                            frame->width << 1, frame->height);
-        break;
-    default:
-        return AVERROR_BUG;
-    }
+    av_image_copy(dst_data, dst_linesize,
+                  (const uint8_t**)frame->data, frame->linesize, frame->format,
+                  avctx->width, avctx->height);
 
     return 0;
 }
 
 static int nvenc_find_free_reg_resource(AVCodecContext *avctx)
 {
-    NVENCContext               *ctx = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    NVENCSTATUS nv_status;
+
     int i;
 
     if (ctx->nb_registered_frames == FF_ARRAY_ELEMS(ctx->registered_frames)) {
         for (i = 0; i < ctx->nb_registered_frames; i++) {
             if (!ctx->registered_frames[i].mapped) {
                 if (ctx->registered_frames[i].regptr) {
-                    nv->nvEncUnregisterResource(ctx->nvenc_ctx,
-                                                ctx->registered_frames[i].regptr);
+                    nv_status = p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->registered_frames[i].regptr);
+                    if (nv_status != NV_ENC_SUCCESS)
+                        return nvenc_print_error(avctx, nv_status, "Failed unregistering unused input resource");
+                    ctx->registered_frames[i].ptr = NULL;
                     ctx->registered_frames[i].regptr = NULL;
                 }
                 return i;
@@ -1357,14 +1585,18 @@ static int nvenc_find_free_reg_resource(AVCodecContext *avctx)
 
 static int nvenc_register_frame(AVCodecContext *avctx, const AVFrame *frame)
 {
-    NVENCContext               *ctx = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    AVHWFramesContext   *frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)frame->hw_frames_ctx->data;
     NV_ENC_REGISTER_RESOURCE reg;
     int i, idx, ret;
 
     for (i = 0; i < ctx->nb_registered_frames; i++) {
-        if (ctx->registered_frames[i].ptr == (CUdeviceptr)frame->data[0])
+        if (avctx->pix_fmt == AV_PIX_FMT_CUDA && ctx->registered_frames[i].ptr == frame->data[0])
+            return i;
+        else if (avctx->pix_fmt == AV_PIX_FMT_D3D11 && ctx->registered_frames[i].ptr == frame->data[0] && ctx->registered_frames[i].ptr_index == (intptr_t)frame->data[1])
             return i;
     }
 
@@ -1373,120 +1605,153 @@ static int nvenc_register_frame(AVCodecContext *avctx, const AVFrame *frame)
         return idx;
 
     reg.version            = NV_ENC_REGISTER_RESOURCE_VER;
-    reg.resourceType       = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
     reg.width              = frames_ctx->width;
     reg.height             = frames_ctx->height;
-    reg.bufferFormat       = ctx->frames[0].format;
     reg.pitch              = frame->linesize[0];
     reg.resourceToRegister = frame->data[0];
 
-    ret = nv->nvEncRegisterResource(ctx->nvenc_ctx, &reg);
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+        reg.resourceType   = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
+    }
+    else if (avctx->pix_fmt == AV_PIX_FMT_D3D11) {
+        reg.resourceType     = NV_ENC_INPUT_RESOURCE_TYPE_DIRECTX;
+        reg.subResourceIndex = (intptr_t)frame->data[1];
+    }
+
+    reg.bufferFormat       = nvenc_map_buffer_format(frames_ctx->sw_format);
+    if (reg.bufferFormat == NV_ENC_BUFFER_FORMAT_UNDEFINED) {
+        av_log(avctx, AV_LOG_FATAL, "Invalid input pixel format: %s\n",
+               av_get_pix_fmt_name(frames_ctx->sw_format));
+        return AVERROR(EINVAL);
+    }
+
+    ret = p_nvenc->nvEncRegisterResource(ctx->nvencoder, &reg);
     if (ret != NV_ENC_SUCCESS) {
         nvenc_print_error(avctx, ret, "Error registering an input resource");
         return AVERROR_UNKNOWN;
     }
 
-    ctx->registered_frames[idx].ptr    = (CUdeviceptr)frame->data[0];
-    ctx->registered_frames[idx].regptr = reg.registeredResource;
+    ctx->registered_frames[idx].ptr       = frame->data[0];
+    ctx->registered_frames[idx].ptr_index = reg.subResourceIndex;
+    ctx->registered_frames[idx].regptr    = reg.registeredResource;
     return idx;
 }
 
 static int nvenc_upload_frame(AVCodecContext *avctx, const AVFrame *frame,
-                              NVENCFrame *nvenc_frame)
+                                      NvencSurface *nvenc_frame)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int ret;
-
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-        int reg_idx;
-
-        ret = nvenc_register_frame(avctx, frame);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not register an input CUDA frame\n");
-            return ret;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    int res;
+    NVENCSTATUS nv_status;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA || avctx->pix_fmt == AV_PIX_FMT_D3D11) {
+        int reg_idx = nvenc_register_frame(avctx, frame);
+        if (reg_idx < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Could not register an input HW frame\n");
+            return reg_idx;
         }
-        reg_idx = ret;
 
-        ret = av_frame_ref(nvenc_frame->in_ref, frame);
-        if (ret < 0)
-            return ret;
-
-        nvenc_frame->in_map.version            = NV_ENC_MAP_INPUT_RESOURCE_VER;
-        nvenc_frame->in_map.registeredResource = ctx->registered_frames[reg_idx].regptr;
-
-        ret = nv->nvEncMapInputResource(ctx->nvenc_ctx, &nvenc_frame->in_map);
-        if (ret != NV_ENC_SUCCESS) {
-            av_frame_unref(nvenc_frame->in_ref);
-            return nvenc_print_error(avctx, ret, "Error mapping an input resource");
+        res = av_frame_ref(nvenc_frame->in_ref, frame);
+        if (res < 0)
+            return res;
+
+        if (!ctx->registered_frames[reg_idx].mapped) {
+            ctx->registered_frames[reg_idx].in_map.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
+            ctx->registered_frames[reg_idx].in_map.registeredResource = ctx->registered_frames[reg_idx].regptr;
+            nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &ctx->registered_frames[reg_idx].in_map);
+            if (nv_status != NV_ENC_SUCCESS) {
+                av_frame_unref(nvenc_frame->in_ref);
+                return nvenc_print_error(avctx, nv_status, "Error mapping an input resource");
+            }
         }
 
-        ctx->registered_frames[reg_idx].mapped = 1;
+        ctx->registered_frames[reg_idx].mapped += 1;
+
         nvenc_frame->reg_idx                   = reg_idx;
-        nvenc_frame->in                        = nvenc_frame->in_map.mappedResource;
+        nvenc_frame->input_surface             = ctx->registered_frames[reg_idx].in_map.mappedResource;
+        nvenc_frame->format                    = ctx->registered_frames[reg_idx].in_map.mappedBufferFmt;
+        nvenc_frame->pitch                     = frame->linesize[0];
+
+        return 0;
     } else {
-        NV_ENC_LOCK_INPUT_BUFFER params = { 0 };
+        NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
+
+        lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
+        lockBufferParams.inputBuffer = nvenc_frame->input_surface;
 
-        params.version     = NV_ENC_LOCK_INPUT_BUFFER_VER;
-        params.inputBuffer = nvenc_frame->in;
+        nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+        if (nv_status != NV_ENC_SUCCESS) {
+            return nvenc_print_error(avctx, nv_status, "Failed locking nvenc input buffer");
+        }
 
-        ret = nv->nvEncLockInputBuffer(ctx->nvenc_ctx, &params);
-        if (ret != NV_ENC_SUCCESS)
-            return nvenc_print_error(avctx, ret, "Cannot lock the buffer");
+        nvenc_frame->pitch = lockBufferParams.pitch;
+        res = nvenc_copy_frame(avctx, nvenc_frame, &lockBufferParams, frame);
 
-        ret = nvenc_copy_frame(&params, frame);
-        if (ret < 0) {
-            nv->nvEncUnlockInputBuffer(ctx->nvenc_ctx, nvenc_frame->in);
-            return ret;
+        nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, nvenc_frame->input_surface);
+        if (nv_status != NV_ENC_SUCCESS) {
+            return nvenc_print_error(avctx, nv_status, "Failed unlocking input buffer!");
         }
 
-        ret = nv->nvEncUnlockInputBuffer(ctx->nvenc_ctx, nvenc_frame->in);
-        if (ret != NV_ENC_SUCCESS)
-            return nvenc_print_error(avctx, ret, "Cannot unlock the buffer");
+        return res;
     }
-
-    return 0;
 }
 
 static void nvenc_codec_specific_pic_params(AVCodecContext *avctx,
-                                            NV_ENC_PIC_PARAMS *params)
+                                            NV_ENC_PIC_PARAMS *params,
+                                            NV_ENC_SEI_PAYLOAD *sei_data)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
 
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
         params->codecPicParams.h264PicParams.sliceMode =
-            ctx->config.encodeCodecConfig.h264Config.sliceMode;
+            ctx->encode_config.encodeCodecConfig.h264Config.sliceMode;
         params->codecPicParams.h264PicParams.sliceModeData =
-            ctx->config.encodeCodecConfig.h264Config.sliceModeData;
-        break;
+            ctx->encode_config.encodeCodecConfig.h264Config.sliceModeData;
+        if (sei_data) {
+            params->codecPicParams.h264PicParams.seiPayloadArray = sei_data;
+            params->codecPicParams.h264PicParams.seiPayloadArrayCnt = 1;
+        }
+
+      break;
     case AV_CODEC_ID_HEVC:
         params->codecPicParams.hevcPicParams.sliceMode =
-            ctx->config.encodeCodecConfig.hevcConfig.sliceMode;
+            ctx->encode_config.encodeCodecConfig.hevcConfig.sliceMode;
         params->codecPicParams.hevcPicParams.sliceModeData =
-            ctx->config.encodeCodecConfig.hevcConfig.sliceModeData;
+            ctx->encode_config.encodeCodecConfig.hevcConfig.sliceModeData;
+        if (sei_data) {
+            params->codecPicParams.hevcPicParams.seiPayloadArray = sei_data;
+            params->codecPicParams.hevcPicParams.seiPayloadArrayCnt = 1;
+        }
+
         break;
     }
 }
 
-static inline int nvenc_enqueue_timestamp(AVFifoBuffer *f, int64_t pts)
+static inline void timestamp_queue_enqueue(AVFifoBuffer* queue, int64_t timestamp)
 {
-    return av_fifo_generic_write(f, &pts, sizeof(pts), NULL);
+    av_fifo_generic_write(queue, &timestamp, sizeof(timestamp), NULL);
 }
 
-static inline int nvenc_dequeue_timestamp(AVFifoBuffer *f, int64_t *pts)
+static inline int64_t timestamp_queue_dequeue(AVFifoBuffer* queue)
 {
-    return av_fifo_generic_read(f, pts, sizeof(*pts), NULL);
+    int64_t timestamp = AV_NOPTS_VALUE;
+    if (av_fifo_size(queue) > 0)
+        av_fifo_generic_read(queue, &timestamp, sizeof(timestamp), NULL);
+
+    return timestamp;
 }
 
 static int nvenc_set_timestamp(AVCodecContext *avctx,
                                NV_ENC_LOCK_BITSTREAM *params,
                                AVPacket *pkt)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
 
-    pkt->pts      = params->outputTimeStamp;
-    pkt->duration = params->outputDuration;
+    pkt->pts = params->outputTimeStamp;
 
     /* generate the first dts by linearly extrapolating the
      * first two pts values to the past */
@@ -1508,80 +1773,147 @@ static int nvenc_set_timestamp(AVCodecContext *avctx,
         ctx->first_packet_output = 1;
         return 0;
     }
-    return nvenc_dequeue_timestamp(ctx->timestamps, &pkt->dts);
+
+    pkt->dts = timestamp_queue_dequeue(ctx->timestamp_list);
+
+    return 0;
 }
 
-static int nvenc_get_output(AVCodecContext *avctx, AVPacket *pkt)
+static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencSurface *tmpoutsurf)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_LOCK_BITSTREAM params    = { 0 };
-    NVENCFrame *frame;
-    int ret;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
 
-    ret = av_fifo_generic_read(ctx->ready, &frame, sizeof(frame), NULL);
-    if (ret)
-        return ret;
+    uint32_t slice_mode_data;
+    uint32_t *slice_offsets = NULL;
+    NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
+    NVENCSTATUS nv_status;
+    int res = 0;
 
-    params.version         = NV_ENC_LOCK_BITSTREAM_VER;
-    params.outputBitstream = frame->out;
+    enum AVPictureType pict_type;
 
-    ret = nv->nvEncLockBitstream(ctx->nvenc_ctx, &params);
-    if (ret < 0)
-        return nvenc_print_error(avctx, ret, "Cannot lock the bitstream");
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_H264:
+      slice_mode_data = ctx->encode_config.encodeCodecConfig.h264Config.sliceModeData;
+      break;
+    case AV_CODEC_ID_H265:
+      slice_mode_data = ctx->encode_config.encodeCodecConfig.hevcConfig.sliceModeData;
+      break;
+    default:
+      av_log(avctx, AV_LOG_ERROR, "Unknown codec name\n");
+      res = AVERROR(EINVAL);
+      goto error;
+    }
+    slice_offsets = av_mallocz(slice_mode_data * sizeof(*slice_offsets));
 
-    ret = ff_alloc_packet(pkt, params.bitstreamSizeInBytes);
-    if (ret < 0)
-        return ret;
+    if (!slice_offsets) {
+        res = AVERROR(ENOMEM);
+        goto error;
+    }
 
-    memcpy(pkt->data, params.bitstreamBufferPtr, pkt->size);
+    lock_params.version = NV_ENC_LOCK_BITSTREAM_VER;
 
-    ret = nv->nvEncUnlockBitstream(ctx->nvenc_ctx, frame->out);
-    if (ret < 0)
-        return nvenc_print_error(avctx, ret, "Cannot unlock the bitstream");
+    lock_params.doNotWait = 0;
+    lock_params.outputBitstream = tmpoutsurf->output_surface;
+    lock_params.sliceOffsets = slice_offsets;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-        nv->nvEncUnmapInputResource(ctx->nvenc_ctx, frame->in_map.mappedResource);
-        av_frame_unref(frame->in_ref);
-        ctx->registered_frames[frame->reg_idx].mapped = 0;
+    nv_status = p_nvenc->nvEncLockBitstream(ctx->nvencoder, &lock_params);
+    if (nv_status != NV_ENC_SUCCESS) {
+        res = nvenc_print_error(avctx, nv_status, "Failed locking bitstream buffer");
+        goto error;
+    }
 
-        frame->in = NULL;
+    if (res = ff_alloc_packet2(avctx, pkt, lock_params.bitstreamSizeInBytes,0)) {
+        p_nvenc->nvEncUnlockBitstream(ctx->nvencoder, tmpoutsurf->output_surface);
+        goto error;
     }
 
-    av_fifo_generic_write(ctx->unused_surface_queue, &frame, sizeof(frame), NULL);
+    memcpy(pkt->data, lock_params.bitstreamBufferPtr, lock_params.bitstreamSizeInBytes);
 
-    ret = nvenc_set_timestamp(avctx, &params, pkt);
-    if (ret < 0)
-        return ret;
+    nv_status = p_nvenc->nvEncUnlockBitstream(ctx->nvencoder, tmpoutsurf->output_surface);
+    if (nv_status != NV_ENC_SUCCESS) {
+        res = nvenc_print_error(avctx, nv_status, "Failed unlocking bitstream buffer, expect the gates of mordor to open");
+        goto error;
+    }
 
-    switch (params.pictureType) {
+
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA || avctx->pix_fmt == AV_PIX_FMT_D3D11) {
+        ctx->registered_frames[tmpoutsurf->reg_idx].mapped -= 1;
+        if (ctx->registered_frames[tmpoutsurf->reg_idx].mapped == 0) {
+            nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, ctx->registered_frames[tmpoutsurf->reg_idx].in_map.mappedResource);
+            if (nv_status != NV_ENC_SUCCESS) {
+                res = nvenc_print_error(avctx, nv_status, "Failed unmapping input resource");
+                goto error;
+            }
+            nv_status = p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->registered_frames[tmpoutsurf->reg_idx].regptr);
+            if (nv_status != NV_ENC_SUCCESS) {
+                res = nvenc_print_error(avctx, nv_status, "Failed unregistering input resource");
+                goto error;
+            }
+            ctx->registered_frames[tmpoutsurf->reg_idx].ptr = NULL;
+            ctx->registered_frames[tmpoutsurf->reg_idx].regptr = NULL;
+        } else if (ctx->registered_frames[tmpoutsurf->reg_idx].mapped < 0) {
+            res = AVERROR_BUG;
+            goto error;
+        }
+
+        av_frame_unref(tmpoutsurf->in_ref);
+
+        tmpoutsurf->input_surface = NULL;
+    }
+
+    switch (lock_params.pictureType) {
     case NV_ENC_PIC_TYPE_IDR:
         pkt->flags |= AV_PKT_FLAG_KEY;
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    case NV_ENC_PIC_TYPE_INTRA_REFRESH:
     case NV_ENC_PIC_TYPE_I:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case NV_ENC_PIC_TYPE_P:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case NV_ENC_PIC_TYPE_B:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
     case NV_ENC_PIC_TYPE_BI:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
+        pict_type = AV_PICTURE_TYPE_BI;
         break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown picture type encountered, expect the output to be broken.\n");
+        av_log(avctx, AV_LOG_ERROR, "Please report this error and include as much information on how to reproduce it as possible.\n");
+        res = AVERROR_EXTERNAL;
+        goto error;
+    }
+
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-    }
+
+    ff_side_data_set_encoder_stats(pkt,
+        (lock_params.frameAvgQP - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
+
+    res = nvenc_set_timestamp(avctx, &lock_params, pkt);
+    if (res < 0)
+        goto error2;
+
+    av_free(slice_offsets);
 
     return 0;
+
+error:
+    timestamp_queue_dequeue(ctx->timestamp_list);
+
+error2:
+    av_free(slice_offsets);
+
+    return res;
 }
 
 static int output_ready(AVCodecContext *avctx, int flush)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
     int nb_ready, nb_pending;
 
     /* when B-frames are enabled, we wait for two initial timestamps to
@@ -1590,96 +1922,288 @@ static int output_ready(AVCodecContext *avctx, int flush)
         (ctx->initial_pts[0] == AV_NOPTS_VALUE || ctx->initial_pts[1] == AV_NOPTS_VALUE))
         return 0;
 
-    nb_ready   = av_fifo_size(ctx->ready)   / sizeof(NVENCFrame*);
-    nb_pending = av_fifo_size(ctx->pending) / sizeof(NVENCFrame*);
+    nb_ready   = av_fifo_size(ctx->output_surface_ready_queue)   / sizeof(NvencSurface*);
+    nb_pending = av_fifo_size(ctx->output_surface_queue)         / sizeof(NvencSurface*);
     if (flush)
         return nb_ready > 0;
     return (nb_ready > 0) && (nb_ready + nb_pending >= ctx->async_depth);
 }
 
-int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                          const AVFrame *frame, int *got_packet)
+static void reconfig_encoder(AVCodecContext *avctx, const AVFrame *frame)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NVENCLibraryContext *nvel       = &ctx->nvel;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_PIC_PARAMS params        = { 0 };
-    NVENCFrame         *nvenc_frame = NULL;
-    CUcontext dummy;
-    int enc_ret, ret;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
+    NVENCSTATUS ret;
+
+    NV_ENC_RECONFIGURE_PARAMS params = { 0 };
+    int needs_reconfig = 0;
+    int needs_encode_config = 0;
+    int reconfig_bitrate = 0, reconfig_dar = 0;
+    int dw, dh;
+
+    params.version = NV_ENC_RECONFIGURE_PARAMS_VER;
+    params.reInitEncodeParams = ctx->init_encode_params;
+
+    compute_dar(avctx, &dw, &dh);
+    if (dw != ctx->init_encode_params.darWidth || dh != ctx->init_encode_params.darHeight) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "aspect ratio change (DAR): %d:%d -> %d:%d\n",
+               ctx->init_encode_params.darWidth,
+               ctx->init_encode_params.darHeight, dw, dh);
+
+        params.reInitEncodeParams.darHeight = dh;
+        params.reInitEncodeParams.darWidth = dw;
+
+        needs_reconfig = 1;
+        reconfig_dar = 1;
+    }
 
-    params.version = NV_ENC_PIC_PARAMS_VER;
+    if (ctx->rc != NV_ENC_PARAMS_RC_CONSTQP && ctx->support_dyn_bitrate) {
+        if (avctx->bit_rate > 0 && params.reInitEncodeParams.encodeConfig->rcParams.averageBitRate != avctx->bit_rate) {
+            av_log(avctx, AV_LOG_VERBOSE,
+                   "avg bitrate change: %d -> %d\n",
+                   params.reInitEncodeParams.encodeConfig->rcParams.averageBitRate,
+                   (uint32_t)avctx->bit_rate);
 
-    if (frame) {
-        nvenc_frame = get_free_frame(ctx);
-        if (!nvenc_frame) {
-            av_log(avctx, AV_LOG_ERROR, "No free surfaces\n");
-            return AVERROR_BUG;
+            params.reInitEncodeParams.encodeConfig->rcParams.averageBitRate = avctx->bit_rate;
+            reconfig_bitrate = 1;
         }
 
-        ret = nvenc_upload_frame(avctx, frame, nvenc_frame);
-        if (ret < 0)
-            return ret;
+        if (avctx->rc_max_rate > 0 && ctx->encode_config.rcParams.maxBitRate != avctx->rc_max_rate) {
+            av_log(avctx, AV_LOG_VERBOSE,
+                   "max bitrate change: %d -> %d\n",
+                   params.reInitEncodeParams.encodeConfig->rcParams.maxBitRate,
+                   (uint32_t)avctx->rc_max_rate);
 
-        params.inputBuffer     = nvenc_frame->in;
-        params.bufferFmt       = nvenc_frame->format;
-        params.inputWidth      = frame->width;
-        params.inputHeight     = frame->height;
-        params.outputBitstream = nvenc_frame->out;
-        params.inputTimeStamp  = frame->pts;
+            params.reInitEncodeParams.encodeConfig->rcParams.maxBitRate = avctx->rc_max_rate;
+            reconfig_bitrate = 1;
+        }
+
+        if (avctx->rc_buffer_size > 0 && ctx->encode_config.rcParams.vbvBufferSize != avctx->rc_buffer_size) {
+            av_log(avctx, AV_LOG_VERBOSE,
+                   "vbv buffer size change: %d -> %d\n",
+                   params.reInitEncodeParams.encodeConfig->rcParams.vbvBufferSize,
+                   avctx->rc_buffer_size);
+
+            params.reInitEncodeParams.encodeConfig->rcParams.vbvBufferSize = avctx->rc_buffer_size;
+            reconfig_bitrate = 1;
+        }
+
+        if (reconfig_bitrate) {
+            params.resetEncoder = 1;
+            params.forceIDR = 1;
+
+            needs_encode_config = 1;
+            needs_reconfig = 1;
+        }
+    }
+
+    if (!needs_encode_config)
+        params.reInitEncodeParams.encodeConfig = NULL;
+
+    if (needs_reconfig) {
+        ret = p_nvenc->nvEncReconfigureEncoder(ctx->nvencoder, &params);
+        if (ret != NV_ENC_SUCCESS) {
+            nvenc_print_error(avctx, ret, "failed to reconfigure nvenc");
+        } else {
+            if (reconfig_dar) {
+                ctx->init_encode_params.darHeight = dh;
+                ctx->init_encode_params.darWidth = dw;
+            }
+
+            if (reconfig_bitrate) {
+                ctx->encode_config.rcParams.averageBitRate = params.reInitEncodeParams.encodeConfig->rcParams.averageBitRate;
+                ctx->encode_config.rcParams.maxBitRate = params.reInitEncodeParams.encodeConfig->rcParams.maxBitRate;
+                ctx->encode_config.rcParams.vbvBufferSize = params.reInitEncodeParams.encodeConfig->rcParams.vbvBufferSize;
+            }
+
+        }
+    }
+}
+
+int ff_nvenc_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+{
+    NVENCSTATUS nv_status;
+    NvencSurface *tmp_out_surf, *in_surf;
+    int res, res2;
+    NV_ENC_SEI_PAYLOAD *sei_data = NULL;
+    size_t sei_size;
+
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    NV_ENC_PIC_PARAMS pic_params = { 0 };
+    pic_params.version = NV_ENC_PIC_PARAMS_VER;
+
+    if ((!ctx->cu_context && !ctx->d3d11_device) || !ctx->nvencoder)
+        return AVERROR(EINVAL);
+
+    if (ctx->encoder_flushing) {
+        if (avctx->internal->draining)
+            return AVERROR_EOF;
+
+        ctx->encoder_flushing = 0;
+        ctx->first_packet_output = 0;
+        ctx->initial_pts[0] = AV_NOPTS_VALUE;
+        ctx->initial_pts[1] = AV_NOPTS_VALUE;
+        av_fifo_reset(ctx->timestamp_list);
+    }
+
+    if (frame) {
+        in_surf = get_free_frame(ctx);
+        if (!in_surf)
+            return AVERROR(EAGAIN);
+
+        res = nvenc_push_context(avctx);
+        if (res < 0)
+            return res;
+
+        reconfig_encoder(avctx, frame);
+
+        res = nvenc_upload_frame(avctx, frame, in_surf);
+
+        res2 = nvenc_pop_context(avctx);
+        if (res2 < 0)
+            return res2;
+
+        if (res)
+            return res;
+
+        pic_params.inputBuffer = in_surf->input_surface;
+        pic_params.bufferFmt = in_surf->format;
+        pic_params.inputWidth = in_surf->width;
+        pic_params.inputHeight = in_surf->height;
+        pic_params.inputPitch = in_surf->pitch;
+        pic_params.outputBitstream = in_surf->output_surface;
 
         if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
             if (frame->top_field_first)
-                params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
+                pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
             else
-                params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
+                pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
+        } else {
+            pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
+        }
+
+        if (ctx->forced_idr >= 0 && frame->pict_type == AV_PICTURE_TYPE_I) {
+            pic_params.encodePicFlags =
+                ctx->forced_idr ? NV_ENC_PIC_FLAG_FORCEIDR : NV_ENC_PIC_FLAG_FORCEINTRA;
         } else {
-            params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
+            pic_params.encodePicFlags = 0;
         }
 
-        nvenc_codec_specific_pic_params(avctx, &params);
+        pic_params.inputTimeStamp = frame->pts;
 
-        ret = nvenc_enqueue_timestamp(ctx->timestamps, frame->pts);
-        if (ret < 0)
-            return ret;
+        if (ctx->a53_cc && av_frame_get_side_data(frame, AV_FRAME_DATA_A53_CC)) {
+            if (ff_alloc_a53_sei(frame, sizeof(NV_ENC_SEI_PAYLOAD), (void**)&sei_data, &sei_size) < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+            }
 
-        if (ctx->initial_pts[0] == AV_NOPTS_VALUE)
-            ctx->initial_pts[0] = frame->pts;
-        else if (ctx->initial_pts[1] == AV_NOPTS_VALUE)
-            ctx->initial_pts[1] = frame->pts;
+            if (sei_data) {
+                sei_data->payloadSize = (uint32_t)sei_size;
+                sei_data->payloadType = 4;
+                sei_data->payload = (uint8_t*)(sei_data + 1);
+            }
+        }
+
+        nvenc_codec_specific_pic_params(avctx, &pic_params, sei_data);
     } else {
-        params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
+        pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
+        ctx->encoder_flushing = 1;
     }
 
-    nvel->cu_ctx_push_current(ctx->cu_context);
-    enc_ret = nv->nvEncEncodePicture(ctx->nvenc_ctx, &params);
-    nvel->cu_ctx_pop_current(&dummy);
+    res = nvenc_push_context(avctx);
+    if (res < 0)
+        return res;
 
-    if (enc_ret != NV_ENC_SUCCESS &&
-        enc_ret != NV_ENC_ERR_NEED_MORE_INPUT)
-        return nvenc_print_error(avctx, enc_ret, "Error encoding the frame");
+    nv_status = p_nvenc->nvEncEncodePicture(ctx->nvencoder, &pic_params);
+    av_free(sei_data);
 
-    if (nvenc_frame) {
-        ret = av_fifo_generic_write(ctx->pending, &nvenc_frame, sizeof(nvenc_frame), NULL);
-        if (ret < 0)
-            return ret;
+    res = nvenc_pop_context(avctx);
+    if (res < 0)
+        return res;
+
+    if (nv_status != NV_ENC_SUCCESS &&
+        nv_status != NV_ENC_ERR_NEED_MORE_INPUT)
+        return nvenc_print_error(avctx, nv_status, "EncodePicture failed!");
+
+    if (frame) {
+        av_fifo_generic_write(ctx->output_surface_queue, &in_surf, sizeof(in_surf), NULL);
+        timestamp_queue_enqueue(ctx->timestamp_list, frame->pts);
+
+        if (ctx->initial_pts[0] == AV_NOPTS_VALUE)
+            ctx->initial_pts[0] = frame->pts;
+        else if (ctx->initial_pts[1] == AV_NOPTS_VALUE)
+            ctx->initial_pts[1] = frame->pts;
     }
 
     /* all the pending buffers are now ready for output */
-    if (enc_ret == NV_ENC_SUCCESS) {
-        while (av_fifo_size(ctx->pending) > 0) {
-            av_fifo_generic_read(ctx->pending, &nvenc_frame, sizeof(nvenc_frame), NULL);
-            av_fifo_generic_write(ctx->ready,  &nvenc_frame, sizeof(nvenc_frame), NULL);
+    if (nv_status == NV_ENC_SUCCESS) {
+        while (av_fifo_size(ctx->output_surface_queue) > 0) {
+            av_fifo_generic_read(ctx->output_surface_queue, &tmp_out_surf, sizeof(tmp_out_surf), NULL);
+            av_fifo_generic_write(ctx->output_surface_ready_queue, &tmp_out_surf, sizeof(tmp_out_surf), NULL);
         }
     }
 
-    if (output_ready(avctx, !frame)) {
-        ret = nvenc_get_output(avctx, pkt);
-        if (ret < 0)
-            return ret;
-        *got_packet = 1;
+    return 0;
+}
+
+int ff_nvenc_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
+{
+    NvencSurface *tmp_out_surf;
+    int res, res2;
+
+    NvencContext *ctx = avctx->priv_data;
+
+    if ((!ctx->cu_context && !ctx->d3d11_device) || !ctx->nvencoder)
+        return AVERROR(EINVAL);
+
+    if (output_ready(avctx, ctx->encoder_flushing)) {
+        av_fifo_generic_read(ctx->output_surface_ready_queue, &tmp_out_surf, sizeof(tmp_out_surf), NULL);
+
+        res = nvenc_push_context(avctx);
+        if (res < 0)
+            return res;
+
+        res = process_output_surface(avctx, pkt, tmp_out_surf);
+
+        res2 = nvenc_pop_context(avctx);
+        if (res2 < 0)
+            return res2;
+
+        if (res)
+            return res;
+
+        av_fifo_generic_write(ctx->unused_surface_queue, &tmp_out_surf, sizeof(tmp_out_surf), NULL);
+    } else if (ctx->encoder_flushing) {
+        return AVERROR_EOF;
     } else {
+        return AVERROR(EAGAIN);
+    }
+
+    return 0;
+}
+
+int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                          const AVFrame *frame, int *got_packet)
+{
+    NvencContext *ctx = avctx->priv_data;
+    int res;
+
+    if (!ctx->encoder_flushing) {
+        res = ff_nvenc_send_frame(avctx, frame);
+        if (res < 0)
+            return res;
+    }
+
+    res = ff_nvenc_receive_packet(avctx, pkt);
+    if (res == AVERROR(EAGAIN) || res == AVERROR_EOF) {
         *got_packet = 0;
+    } else if (res < 0) {
+        return res;
+    } else {
+        *got_packet = 1;
     }
 
     return 0;
diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
index b42b930..ddd6168 100644
--- a/libavcodec/nvenc.h
+++ b/libavcodec/nvenc.h
@@ -1,97 +1,84 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_NVENC_H
 #define AVCODEC_NVENC_H
 
-#include <nvEncodeAPI.h>
-
 #include "config.h"
 
+#if CONFIG_D3D11VA
+#define COBJMACROS
+#include "libavutil/hwcontext_d3d11va.h"
+#else
+typedef void ID3D11Device;
+#endif
+
+#include <ffnvcodec/nvEncodeAPI.h>
+
+#include "compat/cuda/dynlink_loader.h"
 #include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
 
-#if CONFIG_CUDA
-#include <cuda.h>
-#else
+#define MAX_REGISTERED_FRAMES 64
+#define RC_MODE_DEPRECATED 0x800000
+#define RCD(rc_mode) ((rc_mode) | RC_MODE_DEPRECATED)
 
-#if defined(_WIN32)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
+#define NVENCAPI_CHECK_VERSION(major, minor) \
+    ((major) < NVENCAPI_MAJOR_VERSION || ((major) == NVENCAPI_MAJOR_VERSION && (minor) <= NVENCAPI_MINOR_VERSION))
 
-typedef enum cudaError_enum {
-    CUDA_SUCCESS = 0
-} CUresult;
-typedef int CUdevice;
-typedef void* CUcontext;
-typedef void* CUdeviceptr;
+// SDK 8.1 compile time feature checks
+#if NVENCAPI_CHECK_VERSION(8, 1)
+#define NVENC_HAVE_BFRAME_REF_MODE
+#define NVENC_HAVE_QP_MAP_MODE
 #endif
 
-#define MAX_REGISTERED_FRAMES 64
+// SDK 9.0 compile time feature checks
+#if NVENCAPI_CHECK_VERSION(9, 0)
+#define NVENC_HAVE_HEVC_BFRAME_REF_MODE
+#endif
 
-typedef struct NVENCFrame {
-    NV_ENC_INPUT_PTR  in;
-    AVFrame          *in_ref;
-    NV_ENC_MAP_INPUT_RESOURCE in_map;
+typedef struct NvencSurface
+{
+    NV_ENC_INPUT_PTR input_surface;
+    AVFrame *in_ref;
     int reg_idx;
+    int width;
+    int height;
+    int pitch;
 
-    NV_ENC_OUTPUT_PTR out;
+    NV_ENC_OUTPUT_PTR output_surface;
     NV_ENC_BUFFER_FORMAT format;
-} NVENCFrame;
-
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
-typedef CUresult(CUDAAPI *PCUCTXPUSHCURRENT)(CUcontext ctx);
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
-
-typedef NVENCSTATUS (NVENCAPI *PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
+    int size;
+} NvencSurface;
 
-typedef struct NVENCLibraryContext
+typedef struct NvencDynLoadFunctions
 {
-#if !CONFIG_CUDA
-    void *cuda;
-#endif
-    void *nvenc;
-
-    PCUINIT cu_init;
-    PCUDEVICEGETCOUNT cu_device_get_count;
-    PCUDEVICEGET cu_device_get;
-    PCUDEVICEGETNAME cu_device_get_name;
-    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
-    PCUCTXCREATE cu_ctx_create;
-    PCUCTXPOPCURRENT cu_ctx_pop_current;
-    PCUCTXPUSHCURRENT cu_ctx_push_current;
-    PCUCTXDESTROY cu_ctx_destroy;
+    CudaFunctions *cuda_dl;
+    NvencFunctions *nvenc_dl;
 
     NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
-} NVENCLibraryContext;
+    int nvenc_device_count;
+} NvencDynLoadFunctions;
 
 enum {
-    PRESET_DEFAULT,
+    PRESET_DEFAULT = 0,
     PRESET_SLOW,
     PRESET_MEDIUM,
     PRESET_FAST,
@@ -101,7 +88,7 @@ enum {
     PRESET_LOW_LATENCY_DEFAULT ,
     PRESET_LOW_LATENCY_HQ ,
     PRESET_LOW_LATENCY_HP,
-    PRESET_LOSSLESS_DEFAULT,
+    PRESET_LOSSLESS_DEFAULT, // lossless presets must be the last ones
     PRESET_LOSSLESS_HP,
 };
 
@@ -109,8 +96,7 @@ enum {
     NV_ENC_H264_PROFILE_BASELINE,
     NV_ENC_H264_PROFILE_MAIN,
     NV_ENC_H264_PROFILE_HIGH,
-    NV_ENC_H264_PROFILE_HIGH_444,
-    NV_ENC_H264_PROFILE_CONSTRAINED_HIGH,
+    NV_ENC_H264_PROFILE_HIGH_444P,
 };
 
 enum {
@@ -131,25 +117,34 @@ enum {
     ANY_DEVICE,
 };
 
-typedef struct NVENCContext {
-    AVClass *class;
-    NVENCLibraryContext nvel;
+typedef struct NvencContext
+{
+    AVClass *avclass;
 
-    NV_ENC_INITIALIZE_PARAMS params;
-    NV_ENC_CONFIG config;
+    NvencDynLoadFunctions nvenc_dload_funcs;
 
+    NV_ENC_INITIALIZE_PARAMS init_encode_params;
+    NV_ENC_CONFIG encode_config;
     CUcontext cu_context;
     CUcontext cu_context_internal;
+    ID3D11Device *d3d11_device;
 
     int nb_surfaces;
-    NVENCFrame *frames;
-    AVFifoBuffer *timestamps;
-    AVFifoBuffer *pending, *ready, *unused_surface_queue;
+    NvencSurface *surfaces;
+
+    AVFifoBuffer *unused_surface_queue;
+    AVFifoBuffer *output_surface_queue;
+    AVFifoBuffer *output_surface_ready_queue;
+    AVFifoBuffer *timestamp_list;
+
+    int encoder_flushing;
 
     struct {
-        CUdeviceptr ptr;
+        void *ptr;
+        int ptr_index;
         NV_ENC_REGISTERED_PTR regptr;
         int mapped;
+        NV_ENC_MAP_INPUT_RESOURCE in_map;
     } registered_frames[MAX_REGISTERED_FRAMES];
     int nb_registered_frames;
 
@@ -162,35 +157,51 @@ typedef struct NVENCContext {
     int64_t initial_pts[2];
     int first_packet_output;
 
-    void *nvenc_ctx;
+    int support_dyn_bitrate;
+
+    void *nvencoder;
 
     int preset;
     int profile;
     int level;
     int tier;
     int rc;
+    int cbr;
+    int twopass;
     int device;
     int flags;
     int async_depth;
     int rc_lookahead;
     int aq;
     int no_scenecut;
+    int forced_idr;
     int b_adapt;
     int temporal_aq;
     int zerolatency;
     int nonref_p;
     int strict_gop;
     int aq_strength;
-    int quality;
+    float quality;
+    int aud;
+    int bluray_compat;
     int init_qp_p;
     int init_qp_b;
     int init_qp_i;
-} NVENCContext;
+    int cqp;
+    int weighted_pred;
+    int coder;
+    int b_ref_mode;
+    int a53_cc;
+} NvencContext;
 
 int ff_nvenc_encode_init(AVCodecContext *avctx);
 
 int ff_nvenc_encode_close(AVCodecContext *avctx);
 
+int ff_nvenc_send_frame(AVCodecContext *avctx, const AVFrame *frame);
+
+int ff_nvenc_receive_packet(AVCodecContext *avctx, AVPacket *pkt);
+
 int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                           const AVFrame *frame, int *got_packet);
 
diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
index bf98326..a6623f5 100644
--- a/libavcodec/nvenc_h264.c
+++ b/libavcodec/nvenc_h264.c
@@ -1,144 +1,198 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/internal.h"
-#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 
 #include "nvenc.h"
 
-#define OFFSET(x) offsetof(NVENCContext, x)
+#define OFFSET(x) offsetof(NvencContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_MEDIUM }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
-    { "default",    "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "slow",       "hq 2 passes",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_SLOW }, 0, 0, VE, "preset" },
-    { "medium",     "hq 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_MEDIUM }, 0, 0, VE, "preset" },
-    { "fast",       "hp 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_FAST }, 0, 0, VE, "preset" },
-    { "hp",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HP }, 0, 0, VE, "preset" },
-    { "hq",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HQ }, 0, 0, VE, "preset" },
-    { "bd",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_BD }, 0, 0, VE, "preset" },
-    { "ll",         "low latency",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
-    { "llhq",       "low latency hq",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HQ }, 0, 0, VE, "preset" },
-    { "llhp",       "low latency hp",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HP }, 0, 0, VE, "preset" },
-    { "lossless",   NULL,                                 0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_DEFAULT }, 0, 0, VE, "preset" },
-    { "losslesshp", NULL,                                 0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_HP }, 0, 0, VE, "preset" },
-    { "profile",  "Set the encoding profile",             OFFSET(profile),     AV_OPT_TYPE_INT,    { .i64 = NV_ENC_H264_PROFILE_HIGH }, NV_ENC_H264_PROFILE_BASELINE, NV_ENC_H264_PROFILE_CONSTRAINED_HIGH, VE, "profile" },
-    { "baseline", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_BASELINE },            0, 0, VE, "profile" },
-    { "main",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_MAIN },                0, 0, VE, "profile" },
-    { "high",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH },                0, 0, VE, "profile" },
-    { "high_444", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH_444 },            0, 0, VE, "profile" },
-    { "constrained_high", "",                             0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH },    0, 0, VE, "profile" },
-    { "level",    "Set the encoding level restriction",   OFFSET(level),       AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
-    { "auto",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT }, 0, 0, VE, "level" },
-    { "1.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1 },  0, 0, VE,  "level" },
-    { "1.b",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
-    { "1.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_11 }, 0, 0, VE,  "level" },
-    { "1.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_12 }, 0, 0, VE,  "level" },
-    { "1.3",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_13 }, 0, 0, VE,  "level" },
-    { "2.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_2 },  0, 0, VE,  "level" },
-    { "2.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_21 }, 0, 0, VE,  "level" },
-    { "2.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_22 }, 0, 0, VE,  "level" },
-    { "3.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_3 },  0, 0, VE,  "level" },
-    { "3.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_31 }, 0, 0, VE,  "level" },
-    { "3.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_32 }, 0, 0, VE,  "level" },
-    { "4.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_4 },  0, 0, VE,  "level" },
-    { "4.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_41 }, 0, 0, VE,  "level" },
-    { "4.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_42 }, 0, 0, VE,  "level" },
-    { "5.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_5 },  0, 0, VE,  "level" },
-    { "5.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_51 }, 0, 0, VE,  "level" },
-    { "rc",       "Override the preset rate-control",     OFFSET(rc),          AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, INT_MAX, VE, "rc" },
-    { "constqp",          "Constant QP mode",                                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CONSTQP },              0, 0, VE, "rc" },
-    { "vbr",              "Variable bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR },                  0, 0, VE, "rc" },
-    { "cbr",              "Constant bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CBR },                  0, 0, VE, "rc" },
-    { "vbr_minqp",        "Variable bitrate mode with MinQP",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR_MINQP },            0, 0, VE, "rc" },
-    { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
-    { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
-    { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 0 },                    0, MAX_REGISTERED_FRAMES, VE },
-    { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
-    { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
-    { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
-    { "async_depth", "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
-    { "delay",       "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
-#if NVENCAPI_MAJOR_VERSION >= 7
-    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE },
-    { "no-scenecut", "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts", OFFSET(no_scenecut), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "b_adapt", "When lookahead is enabled, set this to 0 to disable adaptive B-frame decision", OFFSET(b_adapt), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
-    { "spatial-aq", "set to 1 to enable Spatial AQ", OFFSET(aq), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "temporal-aq", "set to 1 to enable Temporal AQ",     OFFSET(temporal_aq),  AV_OPT_TYPE_INT,   { .i64 = 0                       }, 0, 1, VE        },
-    { "zerolatency", "Set 1 to indicate zero latency operation (no reordering delay)", OFFSET(zerolatency), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "nonref_p", "Set this to 1 to enable automatic insertion of non-reference P-frames", OFFSET(nonref_p), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "strict_gop", "Set 1 to minimize GOP-to-GOP rate fluctuations", OFFSET(strict_gop), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "aq-strength", "When Spatial AQ is enabled, this field is used to specify AQ strength. AQ strength scale is from 1 (low) - 15 (aggressive)", OFFSET(aq_strength), AV_OPT_TYPE_INT, { .i64 = 8 }, 1, 15, VE },
-    { "cq", "Set target quality level (0 to 51, 0 means automatic) for constant quality mode in VBR rate control", OFFSET(quality), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 51, VE },
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
-    { "init_qpP", "Initial QP value for P-frames",        OFFSET(init_qp_p),   AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 51, VE },
-    { "init_qpB", "Initial QP value for B-frames",        OFFSET(init_qp_b),   AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 51, VE },
-    { "init_qpI", "Initial QP value for I-frames",        OFFSET(init_qp_i),   AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 51, VE },
+    { "preset",       "Set the encoding preset",            OFFSET(preset),       AV_OPT_TYPE_INT,   { .i64 = PRESET_MEDIUM }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
+    { "default",      "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_DEFAULT },             0, 0, VE, "preset" },
+    { "slow",         "hq 2 passes",                        0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_SLOW },                0, 0, VE, "preset" },
+    { "medium",       "hq 1 pass",                          0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_MEDIUM },              0, 0, VE, "preset" },
+    { "fast",         "hp 1 pass",                          0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_FAST },                0, 0, VE, "preset" },
+    { "hp",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_HP },                  0, 0, VE, "preset" },
+    { "hq",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_HQ },                  0, 0, VE, "preset" },
+    { "bd",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_BD },                  0, 0, VE, "preset" },
+    { "ll",           "low latency",                        0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
+    { "llhq",         "low latency hq",                     0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOW_LATENCY_HQ },      0, 0, VE, "preset" },
+    { "llhp",         "low latency hp",                     0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOW_LATENCY_HP },      0, 0, VE, "preset" },
+    { "lossless",     "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOSSLESS_DEFAULT },    0, 0, VE, "preset" },
+    { "losslesshp",   "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOSSLESS_HP },         0, 0, VE, "preset" },
+    { "profile",      "Set the encoding profile",           OFFSET(profile),      AV_OPT_TYPE_INT,   { .i64 = NV_ENC_H264_PROFILE_MAIN }, NV_ENC_H264_PROFILE_BASELINE, NV_ENC_H264_PROFILE_HIGH_444P, VE, "profile" },
+    { "baseline",     "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_PROFILE_BASELINE },  0, 0, VE, "profile" },
+    { "main",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_PROFILE_MAIN },      0, 0, VE, "profile" },
+    { "high",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_PROFILE_HIGH },      0, 0, VE, "profile" },
+    { "high444p",     "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_PROFILE_HIGH_444P }, 0, 0, VE, "profile" },
+    { "level",        "Set the encoding level restriction", OFFSET(level),        AV_OPT_TYPE_INT,   { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
+    { "auto",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_AUTOSELECT },    0, 0, VE, "level" },
+    { "1",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_1 },        0, 0, VE, "level" },
+    { "1.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_1 },        0, 0, VE, "level" },
+    { "1b",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_1b },       0, 0, VE, "level" },
+    { "1.0b",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_1b },       0, 0, VE, "level" },
+    { "1.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_11 },       0, 0, VE, "level" },
+    { "1.2",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_12 },       0, 0, VE, "level" },
+    { "1.3",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_13 },       0, 0, VE, "level" },
+    { "2",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_2 },        0, 0, VE, "level" },
+    { "2.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_2 },        0, 0, VE, "level" },
+    { "2.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_21 },       0, 0, VE, "level" },
+    { "2.2",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_22 },       0, 0, VE, "level" },
+    { "3",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_3 },        0, 0, VE, "level" },
+    { "3.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_3 },        0, 0, VE, "level" },
+    { "3.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_31 },       0, 0, VE, "level" },
+    { "3.2",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_32 },       0, 0, VE, "level" },
+    { "4",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_4 },        0, 0, VE, "level" },
+    { "4.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_4 },        0, 0, VE, "level" },
+    { "4.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_41 },       0, 0, VE, "level" },
+    { "4.2",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_42 },       0, 0, VE, "level" },
+    { "5",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_5 },        0, 0, VE, "level" },
+    { "5.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_5 },        0, 0, VE, "level" },
+    { "5.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_H264_51 },       0, 0, VE, "level" },
+    { "rc",           "Override the preset rate-control",   OFFSET(rc),           AV_OPT_TYPE_INT,   { .i64 = -1 },                                  -1, INT_MAX, VE, "rc" },
+    { "constqp",      "Constant QP mode",                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CONSTQP },                   0, 0, VE, "rc" },
+    { "vbr",          "Variable bitrate mode",              0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_VBR },                       0, 0, VE, "rc" },
+    { "cbr",          "Constant bitrate mode",              0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CBR },                       0, 0, VE, "rc" },
+    { "vbr_minqp",    "Variable bitrate mode with MinQP (deprecated)", 0,         AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_VBR_MINQP) },            0, 0, VE, "rc" },
+    { "ll_2pass_quality", "Multi-pass optimized for image quality (deprecated)",
+                                                            0,                    AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_2_PASS_QUALITY) },       0, 0, VE, "rc" },
+    { "ll_2pass_size", "Multi-pass optimized for constant frame size (deprecated)",
+                                                            0,                    AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP) }, 0, 0, VE, "rc" },
+    { "vbr_2pass",    "Multi-pass variable bitrate mode (deprecated)", 0,         AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_2_PASS_VBR) },           0, 0, VE, "rc" },
+    { "cbr_ld_hq",    "Constant bitrate low delay high quality mode", 0,          AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ },           0, 0, VE, "rc" },
+    { "cbr_hq",       "Constant bitrate high quality mode", 0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CBR_HQ },                    0, 0, VE, "rc" },
+    { "vbr_hq",       "Variable bitrate high quality mode", 0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_VBR_HQ },                    0, 0, VE, "rc" },
+    { "rc-lookahead", "Number of frames to look ahead for rate-control",
+                                                            OFFSET(rc_lookahead), AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, INT_MAX, VE },
+    { "surfaces",     "Number of concurrent surfaces",      OFFSET(nb_surfaces),  AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, MAX_REGISTERED_FRAMES, VE },
+    { "cbr",          "Use cbr encoding mode",              OFFSET(cbr),          AV_OPT_TYPE_BOOL,  { .i64 = 0 },   0, 1, VE },
+    { "2pass",        "Use 2pass encoding mode",            OFFSET(twopass),      AV_OPT_TYPE_BOOL,  { .i64 = -1 }, -1, 1, VE },
+    { "gpu",          "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.",
+                                                            OFFSET(device),       AV_OPT_TYPE_INT,   { .i64 = ANY_DEVICE },   -2, INT_MAX, VE, "gpu" },
+    { "any",          "Pick the first device available",    0,                    AV_OPT_TYPE_CONST, { .i64 = ANY_DEVICE },          0, 0, VE, "gpu" },
+    { "list",         "List the available devices",         0,                    AV_OPT_TYPE_CONST, { .i64 = LIST_DEVICES },        0, 0, VE, "gpu" },
+    { "delay",        "Delay frame output by the given amount of frames",
+                                                            OFFSET(async_depth),  AV_OPT_TYPE_INT,   { .i64 = INT_MAX }, 0, INT_MAX, VE },
+    { "no-scenecut",  "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts",
+                                                            OFFSET(no_scenecut),  AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0,  1, VE },
+    { "forced-idr",   "If forcing keyframes, force them as IDR frames.",
+                                                            OFFSET(forced_idr),   AV_OPT_TYPE_BOOL,  { .i64 = 0 }, -1, 1, VE },
+    { "b_adapt",      "When lookahead is enabled, set this to 0 to disable adaptive B-frame decision",
+                                                            OFFSET(b_adapt),      AV_OPT_TYPE_BOOL,  { .i64 = 1 }, 0,  1, VE },
+    { "spatial-aq",   "set to 1 to enable Spatial AQ",      OFFSET(aq),           AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0,  1, VE },
+    { "temporal-aq",  "set to 1 to enable Temporal AQ",     OFFSET(temporal_aq),  AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0,  1, VE },
+    { "zerolatency",  "Set 1 to indicate zero latency operation (no reordering delay)",
+                                                            OFFSET(zerolatency),  AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0,  1, VE },
+    { "nonref_p",     "Set this to 1 to enable automatic insertion of non-reference P-frames",
+                                                            OFFSET(nonref_p),     AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0,  1, VE },
+    { "strict_gop",   "Set 1 to minimize GOP-to-GOP rate fluctuations",
+                                                            OFFSET(strict_gop),   AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0,  1, VE },
+    { "aq-strength",  "When Spatial AQ is enabled, this field is used to specify AQ strength. AQ strength scale is from 1 (low) - 15 (aggressive)",
+                                                            OFFSET(aq_strength),  AV_OPT_TYPE_INT,   { .i64 = 8 }, 1, 15, VE },
+    { "cq",           "Set target quality level (0 to 51, 0 means automatic) for constant quality mode in VBR rate control",
+                                                            OFFSET(quality),      AV_OPT_TYPE_FLOAT, { .dbl = 0.}, 0., 51., VE },
+    { "aud",          "Use access unit delimiters",         OFFSET(aud),          AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "bluray-compat", "Bluray compatibility workarounds",  OFFSET(bluray_compat),AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "init_qpP",     "Initial QP value for P frame",       OFFSET(init_qp_p),    AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "init_qpB",     "Initial QP value for B frame",       OFFSET(init_qp_b),    AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "init_qpI",     "Initial QP value for I frame",       OFFSET(init_qp_i),    AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "qp",           "Constant quantization parameter rate control method",
+                                                            OFFSET(cqp),          AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "weighted_pred","Set 1 to enable weighted prediction",
+                                                            OFFSET(weighted_pred),AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, 1, VE },
+    { "coder",        "Coder type",                         OFFSET(coder),        AV_OPT_TYPE_INT,   { .i64 = -1                                         },-1, 2, VE, "coder" },
+    { "default",      "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = -1                                         }, 0, 0, VE, "coder" },
+    { "auto",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_ENTROPY_CODING_MODE_AUTOSELECT }, 0, 0, VE, "coder" },
+    { "cabac",        "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_ENTROPY_CODING_MODE_CABAC      }, 0, 0, VE, "coder" },
+    { "cavlc",        "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_ENTROPY_CODING_MODE_CAVLC      }, 0, 0, VE, "coder" },
+    { "ac",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_ENTROPY_CODING_MODE_CABAC      }, 0, 0, VE, "coder" },
+    { "vlc",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_H264_ENTROPY_CODING_MODE_CAVLC      }, 0, 0, VE, "coder" },
+#ifdef NVENC_HAVE_BFRAME_REF_MODE
+    { "b_ref_mode",   "Use B frames as references",         OFFSET(b_ref_mode),   AV_OPT_TYPE_INT,   { .i64 = NV_ENC_BFRAME_REF_MODE_DISABLED }, NV_ENC_BFRAME_REF_MODE_DISABLED, NV_ENC_BFRAME_REF_MODE_MIDDLE, VE, "b_ref_mode" },
+    { "disabled",     "B frames will not be used for reference", 0,               AV_OPT_TYPE_CONST, { .i64 = NV_ENC_BFRAME_REF_MODE_DISABLED }, 0, 0, VE, "b_ref_mode" },
+    { "each",         "Each B frame will be used for reference", 0,               AV_OPT_TYPE_CONST, { .i64 = NV_ENC_BFRAME_REF_MODE_EACH }, 0, 0, VE, "b_ref_mode" },
+    { "middle",       "Only (number of B frames)/2 will be used for reference", 0,AV_OPT_TYPE_CONST, { .i64 = NV_ENC_BFRAME_REF_MODE_MIDDLE }, 0, 0, VE, "b_ref_mode" },
+#else
+    { "b_ref_mode",   "(not supported)",                    OFFSET(b_ref_mode),   AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, INT_MAX, VE, "b_ref_mode" },
+    { "disabled",     "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0,       VE, "b_ref_mode" },
+    { "each",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0,       VE, "b_ref_mode" },
+    { "middle",       "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0,       VE, "b_ref_mode" },
+#endif
+    { "a53cc",        "Use A53 Closed Captions (if available)", OFFSET(a53_cc),   AV_OPT_TYPE_BOOL,  { .i64 = 1 }, 0, 1, VE },
     { NULL }
 };
 
-static const AVClass nvenc_h264_class = {
-    .class_name = "nvenc_h264",
-    .item_name = av_default_item_name,
-    .option = options,
-    .version = LIBAVUTIL_VERSION_INT,
-};
-
 static const AVCodecDefault defaults[] = {
-    { "b", "0" },
+    { "b", "2M" },
     { "qmin", "-1" },
     { "qmax", "-1" },
     { "qdiff", "-1" },
     { "qblur", "-1" },
     { "qcomp", "-1" },
+    { "g", "250" },
+    { "bf", "0" },
     { "refs", "0" },
     { NULL },
 };
 
-AVCodec ff_h264_nvenc_encoder = {
-    .name           = "h264_nvenc",
+#if FF_API_NVENC_OLD_NAME
+
+static av_cold int nvenc_old_init(AVCodecContext *avctx)
+{
+    av_log(avctx, AV_LOG_WARNING, "This encoder is deprecated, use 'h264_nvenc' instead\n");
+    return ff_nvenc_encode_init(avctx);
+}
+
+#if CONFIG_NVENC_ENCODER
+static const AVClass nvenc_class = {
+    .class_name = "nvenc",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_nvenc_encoder = {
+    .name           = "nvenc",
     .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC H.264 encoder"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
-    .init           = ff_nvenc_encode_init,
+    .init           = nvenc_old_init,
+    .send_frame     = ff_nvenc_send_frame,
+    .receive_packet = ff_nvenc_receive_packet,
     .encode2        = ff_nvenc_encode_frame,
     .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_h264_class,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &nvenc_class,
     .defaults       = defaults,
-    .capabilities   = AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .pix_fmts       = ff_nvenc_pix_fmts,
+    .wrapper_name   = "nvenc",
 };
+#endif
 
-#if FF_API_NVENC_OLD_NAME
-
-static int nvenc_old_init(AVCodecContext *avctx)
-{
-    av_log(avctx, AV_LOG_WARNING, "This encoder is deprecated, use 'h264_nvenc' instead\n");
-    return ff_nvenc_encode_init(avctx);
-}
-
-static const AVClass nvenc_h264_old_class = {
+/* Add an alias for nvenc_h264 */
+#if CONFIG_NVENC_H264_ENCODER
+static const AVClass nvenc_h264_class = {
     .class_name = "nvenc_h264",
     .item_name = av_default_item_name,
     .option = options,
@@ -151,13 +205,44 @@ AVCodec ff_nvenc_h264_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
     .init           = nvenc_old_init,
+    .send_frame     = ff_nvenc_send_frame,
+    .receive_packet = ff_nvenc_receive_packet,
     .encode2        = ff_nvenc_encode_frame,
     .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_h264_old_class,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &nvenc_h264_class,
     .defaults       = defaults,
-    .capabilities   = AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .pix_fmts       = ff_nvenc_pix_fmts,
+    .wrapper_name   = "nvenc",
 };
 #endif
+
+#endif
+
+static const AVClass h264_nvenc_class = {
+    .class_name = "h264_nvenc",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_nvenc_encoder = {
+    .name           = "h264_nvenc",
+    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC H.264 encoder"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .init           = ff_nvenc_encode_init,
+    .send_frame     = ff_nvenc_send_frame,
+    .receive_packet = ff_nvenc_receive_packet,
+    .encode2        = ff_nvenc_encode_frame,
+    .close          = ff_nvenc_encode_close,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &h264_nvenc_class,
+    .defaults       = defaults,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
+    .pix_fmts       = ff_nvenc_pix_fmts,
+    .wrapper_name   = "nvenc",
+};
diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
index caf7c4a..d567d96 100644
--- a/libavcodec/nvenc_hevc.c
+++ b/libavcodec/nvenc_hevc.c
@@ -1,142 +1,157 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/internal.h"
-#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 
 #include "nvenc.h"
 
-#define OFFSET(x) offsetof(NVENCContext, x)
+#define OFFSET(x) offsetof(NvencContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_MEDIUM }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
-    { "default",    "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "slow",       "hq 2 passes",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_SLOW }, 0, 0, VE, "preset" },
-    { "medium",     "hq 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_MEDIUM }, 0, 0, VE, "preset" },
-    { "fast",       "hp 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_FAST }, 0, 0, VE, "preset" },
-    { "hp",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HP }, 0, 0, VE, "preset" },
-    { "hq",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HQ }, 0, 0, VE, "preset" },
-    { "bd",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_BD }, 0, 0, VE, "preset" },
-    { "ll",         "low latency",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
-    { "llhq",       "low latency hq",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HQ }, 0, 0, VE, "preset" },
-    { "llhp",       "low latency hp",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HP }, 0, 0, VE, "preset" },
-    { "lossless",   "lossless",                           0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_DEFAULT }, 0, 0, VE, "preset" },
-    { "losslesshp", "lossless hp",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_HP }, 0, 0, VE, "preset" },
-    { "profile", "Set the encoding profile",             OFFSET(profile),      AV_OPT_TYPE_INT,    { .i64 = NV_ENC_HEVC_PROFILE_MAIN }, NV_ENC_HEVC_PROFILE_MAIN, FF_PROFILE_HEVC_REXT, VE, "profile" },
-    { "main",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_HEVC_PROFILE_MAIN }, 0, 0, VE, "profile" },
-#if NVENCAPI_MAJOR_VERSION >= 7
-    { "main10",  "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_HEVC_PROFILE_MAIN_10 }, 0, 0, VE, "profile" },
-    { "rext",   "",                                      0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_HEVC_PROFILE_REXT }, 0, 0, VE, "profile" },
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
-    { "level",   "Set the encoding level restriction",   OFFSET(level),        AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_HEVC_62, VE, "level" },
-    { "auto",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT }, 0, 0, VE, "level" },
-    { "1.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_1 },  0, 0, VE,  "level" },
-    { "2.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_2 },  0, 0, VE,  "level" },
-    { "2.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_21 }, 0, 0, VE,  "level" },
-    { "3.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_3 },  0, 0, VE,  "level" },
-    { "3.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_31 }, 0, 0, VE,  "level" },
-    { "4.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_4 },  0, 0, VE,  "level" },
-    { "4.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_41 }, 0, 0, VE,  "level" },
-    { "5.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_5 },  0, 0, VE,  "level" },
-    { "5.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_51 }, 0, 0, VE,  "level" },
-    { "5.2",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_52 }, 0, 0, VE,  "level" },
-    { "6.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_6 },  0, 0, VE,  "level" },
-    { "6.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_61 }, 0, 0, VE,  "level" },
-    { "6.2",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_62 }, 0, 0, VE,  "level" },
-    { "tier",    "Set the encoding tier",                OFFSET(tier),         AV_OPT_TYPE_INT,    { .i64 = NV_ENC_TIER_HEVC_MAIN }, NV_ENC_TIER_HEVC_MAIN, NV_ENC_TIER_HEVC_HIGH, VE, "tier"},
-    { "main",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_TIER_HEVC_MAIN }, 0, 0, VE, "tier" },
-    { "high",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_TIER_HEVC_HIGH }, 0, 0, VE, "tier" },
-    { "rc",      "Override the preset rate-control",     OFFSET(rc),           AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, INT_MAX, VE, "rc" },
-    { "constqp",          "Constant QP mode",                                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CONSTQP },              0, 0, VE, "rc" },
-    { "vbr",              "Variable bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR },                  0, 0, VE, "rc" },
-    { "cbr",              "Constant bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CBR },                  0, 0, VE, "rc" },
-    { "vbr_minqp",        "Variable bitrate mode with MinQP",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR_MINQP },            0, 0, VE, "rc" },
-    { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
-    { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
-    { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 0 },                    0, MAX_REGISTERED_FRAMES, VE },
-    { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
-    { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
-    { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
-    { "async_depth", "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
-    { "delay",       "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
-#if NVENCAPI_MAJOR_VERSION >= 7
-    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE },
-    { "no-scenecut", "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts", OFFSET(no_scenecut), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "spatial_aq", "set to 1 to enable Spatial AQ", OFFSET(aq), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "zerolatency", "Set 1 to indicate zero latency operation (no reordering delay)", OFFSET(zerolatency), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "nonref_p", "Set this to 1 to enable automatic insertion of non-reference P-frames", OFFSET(nonref_p), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "strict_gop", "Set 1 to minimize GOP-to-GOP rate fluctuations", OFFSET(strict_gop), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "aq-strength", "When Spatial AQ is enabled, this field is used to specify AQ strength. AQ strength scale is from 1 (low) - 15 (aggressive)", OFFSET(aq_strength), AV_OPT_TYPE_INT, { .i64 = 8 }, 1, 15, VE },
-    { "cq", "Set target quality level (0 to 51, 0 means automatic) for constant quality mode in VBR rate control", OFFSET(quality), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 51, VE },
-#endif /* NVENCAPI_MAJOR_VERSION >= 7 */
-    { "init_qpP", "Initial QP value for P-frames",        OFFSET(init_qp_p),   AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 51, VE },
-    { "init_qpB", "Initial QP value for B-frames",        OFFSET(init_qp_b),   AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 51, VE },
-    { "init_qpI", "Initial QP value for I-frames",        OFFSET(init_qp_i),   AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 51, VE },
+    { "preset",       "Set the encoding preset",            OFFSET(preset),       AV_OPT_TYPE_INT,   { .i64 = PRESET_MEDIUM }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
+    { "default",      "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_DEFAULT },             0, 0, VE, "preset" },
+    { "slow",         "hq 2 passes",                        0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_SLOW },                0, 0, VE, "preset" },
+    { "medium",       "hq 1 pass",                          0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_MEDIUM },              0, 0, VE, "preset" },
+    { "fast",         "hp 1 pass",                          0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_FAST },                0, 0, VE, "preset" },
+    { "hp",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_HP },                  0, 0, VE, "preset" },
+    { "hq",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_HQ },                  0, 0, VE, "preset" },
+    { "bd",           "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_BD },                  0, 0, VE, "preset" },
+    { "ll",           "low latency",                        0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
+    { "llhq",         "low latency hq",                     0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOW_LATENCY_HQ },      0, 0, VE, "preset" },
+    { "llhp",         "low latency hp",                     0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOW_LATENCY_HP },      0, 0, VE, "preset" },
+    { "lossless",     "lossless",                           0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOSSLESS_DEFAULT },    0, 0, VE, "preset" },
+    { "losslesshp",   "lossless hp",                        0,                    AV_OPT_TYPE_CONST, { .i64 = PRESET_LOSSLESS_HP },         0, 0, VE, "preset" },
+    { "profile",      "Set the encoding profile",           OFFSET(profile),      AV_OPT_TYPE_INT,   { .i64 = NV_ENC_HEVC_PROFILE_MAIN }, NV_ENC_HEVC_PROFILE_MAIN, FF_PROFILE_HEVC_REXT, VE, "profile" },
+    { "main",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_HEVC_PROFILE_MAIN },    0, 0, VE, "profile" },
+    { "main10",       "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_HEVC_PROFILE_MAIN_10 }, 0, 0, VE, "profile" },
+    { "rext",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_HEVC_PROFILE_REXT },    0, 0, VE, "profile" },
+    { "level",        "Set the encoding level restriction", OFFSET(level),        AV_OPT_TYPE_INT,   { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_HEVC_62, VE, "level" },
+    { "auto",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_AUTOSELECT },  0, 0, VE,  "level" },
+    { "1",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_1 },      0, 0, VE,  "level" },
+    { "1.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_1 },      0, 0, VE,  "level" },
+    { "2",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_2 },      0, 0, VE,  "level" },
+    { "2.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_2 },      0, 0, VE,  "level" },
+    { "2.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_21 },     0, 0, VE,  "level" },
+    { "3",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_3 },      0, 0, VE,  "level" },
+    { "3.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_3 },      0, 0, VE,  "level" },
+    { "3.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_31 },     0, 0, VE,  "level" },
+    { "4",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_4 },      0, 0, VE,  "level" },
+    { "4.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_4 },      0, 0, VE,  "level" },
+    { "4.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_41 },     0, 0, VE,  "level" },
+    { "5",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_5 },      0, 0, VE,  "level" },
+    { "5.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_5 },      0, 0, VE,  "level" },
+    { "5.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_51 },     0, 0, VE,  "level" },
+    { "5.2",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_52 },     0, 0, VE,  "level" },
+    { "6",            "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_6 },      0, 0, VE,  "level" },
+    { "6.0",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_6 },      0, 0, VE,  "level" },
+    { "6.1",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_61 },     0, 0, VE,  "level" },
+    { "6.2",          "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_LEVEL_HEVC_62 },     0, 0, VE,  "level" },
+    { "tier",         "Set the encoding tier",              OFFSET(tier),         AV_OPT_TYPE_INT,   { .i64 = NV_ENC_TIER_HEVC_MAIN }, NV_ENC_TIER_HEVC_MAIN, NV_ENC_TIER_HEVC_HIGH, VE, "tier"},
+    { "main",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_TIER_HEVC_MAIN },    0, 0, VE,   "tier" },
+    { "high",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_TIER_HEVC_HIGH },    0, 0, VE,   "tier" },
+    { "rc",           "Override the preset rate-control",   OFFSET(rc),           AV_OPT_TYPE_INT,   { .i64 = -1 },                                  -1, INT_MAX, VE, "rc" },
+    { "constqp",      "Constant QP mode",                   0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CONSTQP },                   0, 0, VE, "rc" },
+    { "vbr",          "Variable bitrate mode",              0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_VBR },                       0, 0, VE, "rc" },
+    { "cbr",          "Constant bitrate mode",              0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CBR },                       0, 0, VE, "rc" },
+    { "vbr_minqp",    "Variable bitrate mode with MinQP (deprecated)", 0,         AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_VBR_MINQP) },            0, 0, VE, "rc" },
+    { "ll_2pass_quality", "Multi-pass optimized for image quality (deprecated)",
+                                                            0,                    AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_2_PASS_QUALITY) },       0, 0, VE, "rc" },
+    { "ll_2pass_size", "Multi-pass optimized for constant frame size (deprecated)",
+                                                            0,                    AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP) }, 0, 0, VE, "rc" },
+    { "vbr_2pass",    "Multi-pass variable bitrate mode (deprecated)", 0,         AV_OPT_TYPE_CONST, { .i64 = RCD(NV_ENC_PARAMS_RC_2_PASS_VBR) },           0, 0, VE, "rc" },
+    { "cbr_ld_hq",    "Constant bitrate low delay high quality mode", 0,          AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ },           0, 0, VE, "rc" },
+    { "cbr_hq",       "Constant bitrate high quality mode", 0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_CBR_HQ },                    0, 0, VE, "rc" },
+    { "vbr_hq",       "Variable bitrate high quality mode", 0,                    AV_OPT_TYPE_CONST, { .i64 = NV_ENC_PARAMS_RC_VBR_HQ },                    0, 0, VE, "rc" },
+    { "rc-lookahead", "Number of frames to look ahead for rate-control",
+                                                            OFFSET(rc_lookahead), AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, INT_MAX, VE },
+    { "surfaces",     "Number of concurrent surfaces",      OFFSET(nb_surfaces),  AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, MAX_REGISTERED_FRAMES, VE },
+    { "cbr",          "Use cbr encoding mode",              OFFSET(cbr),          AV_OPT_TYPE_BOOL,  { .i64 = 0 },   0, 1, VE },
+    { "2pass",        "Use 2pass encoding mode",            OFFSET(twopass),      AV_OPT_TYPE_BOOL,  { .i64 = -1 }, -1, 1, VE },
+    { "gpu",          "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.",
+                                                            OFFSET(device),       AV_OPT_TYPE_INT,   { .i64 = ANY_DEVICE }, -2, INT_MAX, VE, "gpu" },
+    { "any",          "Pick the first device available",    0,                    AV_OPT_TYPE_CONST, { .i64 = ANY_DEVICE },        0, 0, VE, "gpu" },
+    { "list",         "List the available devices",         0,                    AV_OPT_TYPE_CONST, { .i64 = LIST_DEVICES },      0, 0, VE, "gpu" },
+    { "delay",        "Delay frame output by the given amount of frames",
+                                                            OFFSET(async_depth),  AV_OPT_TYPE_INT,   { .i64 = INT_MAX }, 0, INT_MAX, VE },
+    { "no-scenecut",  "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts",
+                                                            OFFSET(no_scenecut),  AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "forced-idr",   "If forcing keyframes, force them as IDR frames.",
+                                                            OFFSET(forced_idr),   AV_OPT_TYPE_BOOL,  { .i64 = 0 }, -1, 1, VE },
+    { "spatial_aq",   "set to 1 to enable Spatial AQ",      OFFSET(aq),           AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "temporal_aq",  "set to 1 to enable Temporal AQ",     OFFSET(temporal_aq),  AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "zerolatency",  "Set 1 to indicate zero latency operation (no reordering delay)",
+                                                            OFFSET(zerolatency),  AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "nonref_p",     "Set this to 1 to enable automatic insertion of non-reference P-frames",
+                                                            OFFSET(nonref_p),     AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "strict_gop",   "Set 1 to minimize GOP-to-GOP rate fluctuations",
+                                                            OFFSET(strict_gop),   AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "aq-strength",  "When Spatial AQ is enabled, this field is used to specify AQ strength. AQ strength scale is from 1 (low) - 15 (aggressive)",
+                                                            OFFSET(aq_strength),  AV_OPT_TYPE_INT,   { .i64 = 8 }, 1, 15, VE },
+    { "cq",           "Set target quality level (0 to 51, 0 means automatic) for constant quality mode in VBR rate control",
+                                                            OFFSET(quality),      AV_OPT_TYPE_FLOAT, { .dbl = 0.}, 0., 51., VE },
+    { "aud",          "Use access unit delimiters",         OFFSET(aud),          AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "bluray-compat", "Bluray compatibility workarounds",  OFFSET(bluray_compat),AV_OPT_TYPE_BOOL,  { .i64 = 0 }, 0, 1, VE },
+    { "init_qpP",     "Initial QP value for P frame",       OFFSET(init_qp_p),    AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "init_qpB",     "Initial QP value for B frame",       OFFSET(init_qp_b),    AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "init_qpI",     "Initial QP value for I frame",       OFFSET(init_qp_i),    AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "qp",           "Constant quantization parameter rate control method",
+                                                            OFFSET(cqp),          AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, 51, VE },
+    { "weighted_pred","Set 1 to enable weighted prediction",
+                                                            OFFSET(weighted_pred),AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, 1, VE },
+#ifdef NVENC_HAVE_HEVC_BFRAME_REF_MODE
+    { "b_ref_mode",   "Use B frames as references",         OFFSET(b_ref_mode),   AV_OPT_TYPE_INT,   { .i64 = NV_ENC_BFRAME_REF_MODE_DISABLED }, NV_ENC_BFRAME_REF_MODE_DISABLED, NV_ENC_BFRAME_REF_MODE_MIDDLE, VE, "b_ref_mode" },
+    { "disabled",     "B frames will not be used for reference", 0,               AV_OPT_TYPE_CONST, { .i64 = NV_ENC_BFRAME_REF_MODE_DISABLED }, 0, 0, VE, "b_ref_mode" },
+    { "each",         "Each B frame will be used for reference", 0,               AV_OPT_TYPE_CONST, { .i64 = NV_ENC_BFRAME_REF_MODE_EACH }, 0, 0, VE, "b_ref_mode" },
+    { "middle",       "Only (number of B frames)/2 will be used for reference", 0,AV_OPT_TYPE_CONST, { .i64 = NV_ENC_BFRAME_REF_MODE_MIDDLE }, 0, 0, VE, "b_ref_mode" },
+#else
+    { "b_ref_mode",   "(not supported)",                    OFFSET(b_ref_mode),   AV_OPT_TYPE_INT,   { .i64 = 0 }, 0, INT_MAX, VE, "b_ref_mode" },
+    { "disabled",     "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0,       VE, "b_ref_mode" },
+    { "each",         "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0,       VE, "b_ref_mode" },
+    { "middle",       "",                                   0,                    AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0,       VE, "b_ref_mode" },
+#endif
     { NULL }
 };
 
-static const AVClass nvenc_hevc_class = {
-    .class_name = "nvenc_hevc",
-    .item_name = av_default_item_name,
-    .option = options,
-    .version = LIBAVUTIL_VERSION_INT,
-};
-
 static const AVCodecDefault defaults[] = {
-    { "b", "0" },
+    { "b", "2M" },
     { "qmin", "-1" },
     { "qmax", "-1" },
     { "qdiff", "-1" },
     { "qblur", "-1" },
     { "qcomp", "-1" },
+    { "g", "250" },
+    { "bf", "0" },
     { "refs", "0" },
     { NULL },
 };
 
-AVCodec ff_hevc_nvenc_encoder = {
-    .name           = "hevc_nvenc",
-    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC HEVC encoder"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_HEVC,
-    .init           = ff_nvenc_encode_init,
-    .encode2        = ff_nvenc_encode_frame,
-    .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_hevc_class,
-    .defaults       = defaults,
-    .pix_fmts       = ff_nvenc_pix_fmts,
-    .capabilities   = AV_CODEC_CAP_DELAY,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
-};
-
 #if FF_API_NVENC_OLD_NAME
 
-static int nvenc_old_init(AVCodecContext *avctx)
+static av_cold int nvenc_old_init(AVCodecContext *avctx)
 {
     av_log(avctx, AV_LOG_WARNING, "This encoder is deprecated, use 'hevc_nvenc' instead\n");
     return ff_nvenc_encode_init(avctx);
 }
 
-static const AVClass nvenc_hevc_old_class = {
+static const AVClass nvenc_hevc_class = {
     .class_name = "nvenc_hevc",
     .item_name = av_default_item_name,
     .option = options,
@@ -145,17 +160,47 @@ static const AVClass nvenc_hevc_old_class = {
 
 AVCodec ff_nvenc_hevc_encoder = {
     .name           = "nvenc_hevc",
-    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC HEVC encoder"),
+    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC hevc encoder"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HEVC,
     .init           = nvenc_old_init,
+    .send_frame     = ff_nvenc_send_frame,
+    .receive_packet = ff_nvenc_receive_packet,
     .encode2        = ff_nvenc_encode_frame,
     .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_hevc_old_class,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &nvenc_hevc_class,
     .defaults       = defaults,
     .pix_fmts       = ff_nvenc_pix_fmts,
-    .capabilities   = AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
+    .wrapper_name   = "nvenc",
 };
+
 #endif
+
+static const AVClass hevc_nvenc_class = {
+    .class_name = "hevc_nvenc",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_hevc_nvenc_encoder = {
+    .name           = "hevc_nvenc",
+    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC hevc encoder"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_HEVC,
+    .init           = ff_nvenc_encode_init,
+    .send_frame     = ff_nvenc_send_frame,
+    .receive_packet = ff_nvenc_receive_packet,
+    .encode2        = ff_nvenc_encode_frame,
+    .close          = ff_nvenc_encode_close,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &hevc_nvenc_class,
+    .defaults       = defaults,
+    .pix_fmts       = ff_nvenc_pix_fmts,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
+    .wrapper_name   = "nvenc",
+};
diff --git a/libavcodec/omx.c b/libavcodec/omx.c
index f43e92a..466e0be 100644
--- a/libavcodec/omx.c
+++ b/libavcodec/omx.c
@@ -2,20 +2,20 @@
  * OMX Video encoder
  * Copyright (C) 2011 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -226,6 +226,7 @@ typedef struct OMXCodecContext {
     int output_buf_size;
 
     int input_zerocopy;
+    int profile;
 } OMXCodecContext;
 
 static void append_buffer(pthread_mutex_t *mutex, pthread_cond_t *cond,
@@ -523,6 +524,19 @@ static av_cold int omx_component_init(AVCodecContext *avctx, const char *role)
         CHECK(err);
         avc.nBFrames = 0;
         avc.nPFrames = avctx->gop_size - 1;
+        switch (s->profile == FF_PROFILE_UNKNOWN ? avctx->profile : s->profile) {
+        case FF_PROFILE_H264_BASELINE:
+            avc.eProfile = OMX_VIDEO_AVCProfileBaseline;
+            break;
+        case FF_PROFILE_H264_MAIN:
+            avc.eProfile = OMX_VIDEO_AVCProfileMain;
+            break;
+        case FF_PROFILE_H264_HIGH:
+            avc.eProfile = OMX_VIDEO_AVCProfileHigh;
+            break;
+        default:
+            break;
+        }
         err = OMX_SetParameter(s->handle, OMX_IndexParamVideoAvc, &avc);
         CHECK(err);
     }
@@ -761,7 +775,10 @@ static int omx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             } else {
                 // If not, we need to allocate a new buffer with the right
                 // size and copy the input frame into it.
-                uint8_t *buf = av_malloc(av_image_get_buffer_size(avctx->pix_fmt, s->stride, s->plane_size, 1));
+                uint8_t *buf = NULL;
+                int image_buffer_size = av_image_get_buffer_size(avctx->pix_fmt, s->stride, s->plane_size, 1);
+                if (image_buffer_size >= 0)
+                    buf = av_malloc(image_buffer_size);
                 if (!buf) {
                     // Return the buffer to the queue so it's not lost
                     append_buffer(&s->input_mutex, &s->input_cond, &s->num_free_in_buffers, s->free_in_buffers, buffer);
@@ -850,7 +867,7 @@ static int omx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                 }
             } else {
                 // End of frame, and the caller provided a preallocated frame
-                if ((ret = ff_alloc_packet(pkt, s->output_buf_size + buffer->nFilledLen)) < 0) {
+                if ((ret = ff_alloc_packet2(avctx, pkt, s->output_buf_size + buffer->nFilledLen, 0)) < 0) {
                     av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n",
                            (int)(s->output_buf_size + buffer->nFilledLen));
                     goto end;
@@ -897,6 +914,10 @@ static const AVOption options[] = {
     { "omx_libname", "OpenMAX library name", OFFSET(libname), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VDE },
     { "omx_libprefix", "OpenMAX library prefix", OFFSET(libprefix), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VDE },
     { "zerocopy", "Try to avoid copying input frames if possible", OFFSET(input_zerocopy), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "profile",  "Set the encoding profile", OFFSET(profile), AV_OPT_TYPE_INT,   { .i64 = FF_PROFILE_UNKNOWN },       FF_PROFILE_UNKNOWN, FF_PROFILE_H264_HIGH, VE, "profile" },
+    { "baseline", "",                         0,               AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_H264_BASELINE }, 0, 0, VE, "profile" },
+    { "main",     "",                         0,               AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_H264_MAIN },     0, 0, VE, "profile" },
+    { "high",     "",                         0,               AV_OPT_TYPE_CONST, { .i64 = FF_PROFILE_H264_HIGH },     0, 0, VE, "profile" },
     { NULL }
 };
 
diff --git a/libavcodec/on2avc.c b/libavcodec/on2avc.c
index 39ae117..00e5bf5 100644
--- a/libavcodec/on2avc.c
+++ b/libavcodec/on2avc.c
@@ -3,32 +3,31 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
 #include "fft.h"
+#include "get_bits.h"
 #include "internal.h"
-#include "vlc.h"
 
 #include "on2avcdata.h"
 
@@ -47,7 +46,7 @@ enum WindowTypes {
 
 typedef struct On2AVCContext {
     AVCodecContext *avctx;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext mdct, mdct_half, mdct_small;
     FFTContext fft128, fft256, fft512, fft1024;
     void (*wtf)(struct On2AVCContext *ctx, float *out, float *in, int size);
@@ -86,11 +85,11 @@ typedef struct On2AVCContext {
     DECLARE_ALIGNED(32, float, short_win)[ON2AVC_SUBFRAME_SIZE / 8];
 } On2AVCContext;
 
-static void on2avc_read_ms_info(On2AVCContext *c, BitstreamContext *bc)
+static void on2avc_read_ms_info(On2AVCContext *c, GetBitContext *gb)
 {
     int w, b, band_off = 0;
 
-    c->ms_present = bitstream_read_bit(bc);
+    c->ms_present = get_bits1(gb);
     if (!c->ms_present)
         return;
     for (w = 0; w < c->num_windows; w++) {
@@ -102,12 +101,12 @@ static void on2avc_read_ms_info(On2AVCContext *c, BitstreamContext *bc)
             continue;
         }
         for (b = 0; b < c->num_bands; b++)
-            c->ms_info[band_off++] = bitstream_read_bit(bc);
+            c->ms_info[band_off++] = get_bits1(gb);
     }
 }
 
 // do not see Table 17 in ISO/IEC 13818-7
-static int on2avc_decode_band_types(On2AVCContext *c, BitstreamContext *bc)
+static int on2avc_decode_band_types(On2AVCContext *c, GetBitContext *gb)
 {
     int bits_per_sect = c->is_long ? 5 : 3;
     int esc_val = (1 << bits_per_sect) - 1;
@@ -115,16 +114,16 @@ static int on2avc_decode_band_types(On2AVCContext *c, BitstreamContext *bc)
     int band = 0, i, band_type, run_len, run;
 
     while (band < num_bands) {
-        band_type = bitstream_read(bc, 4);
+        band_type = get_bits(gb, 4);
         run_len   = 1;
         do {
-            run = bitstream_read(bc, bits_per_sect);
+            run = get_bits(gb, bits_per_sect);
+            if (run > num_bands - band - run_len) {
+                av_log(c->avctx, AV_LOG_ERROR, "Invalid band type run\n");
+                return AVERROR_INVALIDDATA;
+            }
             run_len += run;
         } while (run == esc_val);
-        if (band + run_len > num_bands) {
-            av_log(c->avctx, AV_LOG_ERROR, "Invalid band type run\n");
-            return AVERROR_INVALIDDATA;
-        }
         for (i = band; i < band + run_len; i++) {
             c->band_type[i]    = band_type;
             c->band_run_end[i] = band + run_len;
@@ -137,7 +136,7 @@ static int on2avc_decode_band_types(On2AVCContext *c, BitstreamContext *bc)
 
 // completely not like Table 18 in ISO/IEC 13818-7
 // (no intensity stereo, different coding for the first coefficient)
-static int on2avc_decode_band_scales(On2AVCContext *c, BitstreamContext *bc)
+static int on2avc_decode_band_scales(On2AVCContext *c, GetBitContext *gb)
 {
     int w, w2, b, scale, first = 1;
     int band_off = 0;
@@ -167,10 +166,10 @@ static int on2avc_decode_band_scales(On2AVCContext *c, BitstreamContext *bc)
                 }
             }
             if (first) {
-                scale = bitstream_read(bc, 7);
+                scale = get_bits(gb, 7);
                 first = 0;
             } else {
-                scale += bitstream_read_vlc(bc, c->scale_diff.table, 9, 3) - 60;
+                scale += get_vlc2(gb, c->scale_diff.table, 9, 3) - 60;
             }
             if (scale < 0 || scale > 127) {
                 av_log(c->avctx, AV_LOG_ERROR, "Invalid scale value %d\n",
@@ -190,13 +189,13 @@ static inline float on2avc_scale(int v, float scale)
 }
 
 // spectral data is coded completely differently - there are no unsigned codebooks
-static int on2avc_decode_quads(On2AVCContext *c, BitstreamContext *bc, float *dst,
+static int on2avc_decode_quads(On2AVCContext *c, GetBitContext *gb, float *dst,
                                int dst_size, int type, float band_scale)
 {
     int i, j, val, val1;
 
     for (i = 0; i < dst_size; i += 4) {
-        val = bitstream_read_vlc(bc, c->cb_vlc[type].table, 9, 3);
+        val = get_vlc2(gb, c->cb_vlc[type].table, 9, 3);
 
         for (j = 0; j < 4; j++) {
             val1 = sign_extend((val >> (12 - j * 4)) & 0xF, 4);
@@ -207,11 +206,11 @@ static int on2avc_decode_quads(On2AVCContext *c, BitstreamContext *bc, float *ds
     return 0;
 }
 
-static inline int get_egolomb(BitstreamContext *bc)
+static inline int get_egolomb(GetBitContext *gb)
 {
     int v = 4;
 
-    while (bitstream_read_bit(bc)) {
+    while (get_bits1(gb)) {
         v++;
         if (v > 30) {
             av_log(NULL, AV_LOG_WARNING, "Too large golomb code in get_egolomb.\n");
@@ -220,27 +219,27 @@ static inline int get_egolomb(BitstreamContext *bc)
         }
     }
 
-    return (1 << v) + bitstream_read(bc, v);
+    return (1 << v) + get_bits_long(gb, v);
 }
 
-static int on2avc_decode_pairs(On2AVCContext *c, BitstreamContext *bc, float *dst,
+static int on2avc_decode_pairs(On2AVCContext *c, GetBitContext *gb, float *dst,
                                int dst_size, int type, float band_scale)
 {
     int i, val, val1, val2, sign;
 
     for (i = 0; i < dst_size; i += 2) {
-        val = bitstream_read_vlc(bc, c->cb_vlc[type].table, 9, 3);
+        val = get_vlc2(gb, c->cb_vlc[type].table, 9, 3);
 
         val1 = sign_extend(val >> 8,   8);
         val2 = sign_extend(val & 0xFF, 8);
         if (type == ON2AVC_ESC_CB) {
             if (val1 <= -16 || val1 >= 16) {
                 sign = 1 - (val1 < 0) * 2;
-                val1 = sign * get_egolomb(bc);
+                val1 = sign * get_egolomb(gb);
             }
             if (val2 <= -16 || val2 >= 16) {
                 sign = 1 - (val2 < 0) * 2;
-                val2 = sign * get_egolomb(bc);
+                val2 = sign * get_egolomb(gb);
             }
         }
 
@@ -251,15 +250,15 @@ static int on2avc_decode_pairs(On2AVCContext *c, BitstreamContext *bc, float *ds
     return 0;
 }
 
-static int on2avc_read_channel_data(On2AVCContext *c, BitstreamContext *bc, int ch)
+static int on2avc_read_channel_data(On2AVCContext *c, GetBitContext *gb, int ch)
 {
     int ret;
     int w, b, band_idx;
     float *coeff_ptr;
 
-    if ((ret = on2avc_decode_band_types(c, bc)) < 0)
+    if ((ret = on2avc_decode_band_types(c, gb)) < 0)
         return ret;
-    if ((ret = on2avc_decode_band_scales(c, bc)) < 0)
+    if ((ret = on2avc_decode_band_scales(c, gb)) < 0)
         return ret;
 
     coeff_ptr = c->coeffs[ch];
@@ -275,10 +274,10 @@ static int on2avc_read_channel_data(On2AVCContext *c, BitstreamContext *bc, int
                 continue;
             }
             if (band_type < 9)
-                on2avc_decode_quads(c, bc, coeff_ptr, band_size, band_type,
+                on2avc_decode_quads(c, gb, coeff_ptr, band_size, band_type,
                                     c->band_scales[band_idx + b]);
             else
-                on2avc_decode_pairs(c, bc, coeff_ptr, band_size, band_type,
+                on2avc_decode_pairs(c, gb, coeff_ptr, band_size, band_type,
                                     c->band_scales[band_idx + b]);
             coeff_ptr += band_size;
         }
@@ -686,11 +685,11 @@ static void wtf_44(On2AVCContext *c, float *out, float *src, int size)
     }
 }
 
-static int on2avc_reconstruct_stereo(On2AVCContext *c, AVFrame *dst, int offset)
+static int on2avc_reconstruct_channel_ext(On2AVCContext *c, AVFrame *dst, int offset)
 {
     int ch, i;
 
-    for (ch = 0; ch < 2; ch++) {
+    for (ch = 0; ch < c->avctx->channels; ch++) {
         float *out   = (float*)dst->extended_data[ch] + offset;
         float *in    = c->coeffs[ch];
         float *saved = c->delay[ch];
@@ -721,7 +720,7 @@ static int on2avc_reconstruct_stereo(On2AVCContext *c, AVFrame *dst, int offset)
         }
 
         memcpy(out, saved, 448 * sizeof(float));
-        c->fdsp.vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
+        c->fdsp->vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
         memcpy(wout + 128,  buf + 64,         448 * sizeof(float));
         memcpy(saved,       buf + 512,        448 * sizeof(float));
         memcpy(saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
@@ -757,20 +756,20 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
          c->prev_window_type == WINDOW_TYPE_LONG_STOP) &&
         (c->window_type == WINDOW_TYPE_LONG ||
          c->window_type == WINDOW_TYPE_LONG_START)) {
-        c->fdsp.vector_fmul_window(out, saved, buf, c->long_win, 512);
+        c->fdsp->vector_fmul_window(out, saved, buf, c->long_win, 512);
     } else {
         float *wout = out + 448;
         memcpy(out, saved, 448 * sizeof(float));
 
         if (c->window_type == WINDOW_TYPE_8SHORT) {
-            c->fdsp.vector_fmul_window(wout + 0*128, saved + 448,      buf + 0*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 1*128, buf + 0*128 + 64, buf + 1*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 2*128, buf + 1*128 + 64, buf + 2*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 3*128, buf + 2*128 + 64, buf + 3*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(temp,         buf + 3*128 + 64, buf + 4*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 0*128, saved + 448,      buf + 0*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 1*128, buf + 0*128 + 64, buf + 1*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 2*128, buf + 1*128 + 64, buf + 2*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 3*128, buf + 2*128 + 64, buf + 3*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(temp,         buf + 3*128 + 64, buf + 4*128, c->short_win, 64);
             memcpy(wout + 4*128, temp, 64 * sizeof(float));
         } else {
-            c->fdsp.vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
             memcpy(wout + 128, buf + 64, 448 * sizeof(float));
         }
     }
@@ -779,9 +778,9 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
     switch (c->window_type) {
     case WINDOW_TYPE_8SHORT:
         memcpy(saved,       temp + 64,         64 * sizeof(float));
-        c->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, c->short_win, 64);
-        c->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, c->short_win, 64);
-        c->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, c->short_win, 64);
         memcpy(saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
         break;
     case WINDOW_TYPE_LONG_START:
@@ -799,20 +798,18 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
 static int on2avc_decode_subframe(On2AVCContext *c, const uint8_t *buf,
                                   int buf_size, AVFrame *dst, int offset)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int i, ret;
 
-    bitstream_init8(&bc, buf, buf_size);
-    if (bitstream_read_bit(&bc)) {
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
+
+    if (get_bits1(&gb)) {
         av_log(c->avctx, AV_LOG_ERROR, "enh bit set\n");
         return AVERROR_INVALIDDATA;
     }
     c->prev_window_type = c->window_type;
-    c->window_type      = bitstream_read(&bc, 3);
-    if (c->window_type >= WINDOW_TYPE_EXT4 && c->avctx->channels == 1) {
-        av_log(c->avctx, AV_LOG_ERROR, "stereo mode window for mono audio\n");
-        return AVERROR_INVALIDDATA;
-    }
+    c->window_type      = get_bits(&gb, 3);
 
     c->band_start  = c->modes[c->window_type].band_start;
     c->num_windows = c->modes[c->window_type].num_windows;
@@ -821,11 +818,11 @@ static int on2avc_decode_subframe(On2AVCContext *c, const uint8_t *buf,
 
     c->grouping[0] = 1;
     for (i = 1; i < c->num_windows; i++)
-        c->grouping[i] = !bitstream_read_bit(&bc);
+        c->grouping[i] = !get_bits1(&gb);
 
-    on2avc_read_ms_info(c, &bc);
+    on2avc_read_ms_info(c, &gb);
     for (i = 0; i < c->avctx->channels; i++)
-        if ((ret = on2avc_read_channel_data(c, &bc, i)) < 0)
+        if ((ret = on2avc_read_channel_data(c, &gb, i)) < 0)
             return AVERROR_INVALIDDATA;
     if (c->avctx->channels == 2 && c->ms_present)
         on2avc_apply_ms(c);
@@ -833,7 +830,7 @@ static int on2avc_decode_subframe(On2AVCContext *c, const uint8_t *buf,
         for (i = 0; i < c->avctx->channels; i++)
             on2avc_reconstruct_channel(c, i, dst, offset);
     } else {
-        on2avc_reconstruct_stereo(c, dst, offset);
+        on2avc_reconstruct_channel_ext(c, dst, offset);
     }
 
     return 0;
@@ -853,10 +850,8 @@ static int on2avc_decode_frame(AVCodecContext * avctx, void *data,
     if (c->is_av500) {
         /* get output buffer */
         frame->nb_samples = ON2AVC_SUBFRAME_SIZE;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
 
         if ((ret = on2avc_decode_subframe(c, buf, buf_size, frame, 0)) < 0)
             return ret;
@@ -879,10 +874,8 @@ static int on2avc_decode_frame(AVCodecContext * avctx, void *data,
 
         /* get output buffer */
         frame->nb_samples = ON2AVC_SUBFRAME_SIZE * num_frames;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
 
         audio_off = 0;
         bytestream2_init(&gb, buf, buf_size);
@@ -915,28 +908,29 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
     On2AVCContext *c = avctx->priv_data;
     int i;
 
+    if (avctx->channels > 2U) {
+        avpriv_request_sample(avctx, "Decoding more than 2 channels");
+        return AVERROR_PATCHWELCOME;
+    }
+
     c->avctx = avctx;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
     avctx->channel_layout = (avctx->channels == 2) ? AV_CH_LAYOUT_STEREO
                                                    : AV_CH_LAYOUT_MONO;
 
     c->is_av500 = (avctx->codec_tag == 0x500);
-    if (c->is_av500 && avctx->channels == 2) {
-        av_log(avctx, AV_LOG_ERROR, "0x500 version should be mono\n");
-        return AVERROR_INVALIDDATA;
-    }
-    if (avctx->channels > 2) {
-        av_log(avctx, AV_LOG_ERROR, "Only 1 or 2 channels are supported.\n");
-        return AVERROR(EINVAL);
-    }
+
     if (avctx->channels == 2)
         av_log(avctx, AV_LOG_WARNING,
                "Stereo mode support is not good, patch is welcome\n");
 
+    // We add -0.01 before ceil() to avoid any values to fall at exactly the
+    // midpoint between different ceil values. The results are identical to
+    // using pow(10, i / 10.0) without such bias
     for (i = 0; i < 20; i++)
-        c->scale_tab[i] = ceil(pow(10.0, i * 0.1) * 16) / 32;
+        c->scale_tab[i] = ceil(ff_exp10(i * 0.1) * 16 - 0.01) / 32;
     for (; i < 128; i++)
-        c->scale_tab[i] = ceil(pow(10.0, i * 0.1) * 0.5);
+        c->scale_tab[i] = ceil(ff_exp10(i * 0.1) * 0.5 - 0.01);
 
     if (avctx->sample_rate < 32000 || avctx->channels == 1)
         memcpy(c->long_win, ff_on2avc_window_long_24000,
@@ -958,13 +952,14 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
     ff_fft_init(&c->fft256,  7, 0);
     ff_fft_init(&c->fft512,  8, 1);
     ff_fft_init(&c->fft1024, 9, 1);
-    avpriv_float_dsp_init(&c->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    c->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!c->fdsp)
+        return AVERROR(ENOMEM);
 
     if (init_vlc(&c->scale_diff, 9, ON2AVC_SCALE_DIFFS,
                  ff_on2avc_scale_diff_bits,  1, 1,
                  ff_on2avc_scale_diff_codes, 4, 4, 0)) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-        return AVERROR(ENOMEM);
+        goto vlc_fail;
     }
     for (i = 1; i < 9; i++) {
         int idx = i - 1;
@@ -972,9 +967,7 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
                                ff_on2avc_quad_cb_bits[idx],  1, 1,
                                ff_on2avc_quad_cb_codes[idx], 4, 4,
                                ff_on2avc_quad_cb_syms[idx],  2, 2, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-            on2avc_free_vlcs(c);
-            return AVERROR(ENOMEM);
+            goto vlc_fail;
         }
     }
     for (i = 9; i < 16; i++) {
@@ -983,13 +976,16 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
                                ff_on2avc_pair_cb_bits[idx],  1, 1,
                                ff_on2avc_pair_cb_codes[idx], 2, 2,
                                ff_on2avc_pair_cb_syms[idx],  2, 2, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-            on2avc_free_vlcs(c);
-            return AVERROR(ENOMEM);
+            goto vlc_fail;
         }
     }
 
     return 0;
+vlc_fail:
+    av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
+    on2avc_free_vlcs(c);
+    av_freep(&c->fdsp);
+    return AVERROR(ENOMEM);
 }
 
 static av_cold int on2avc_decode_close(AVCodecContext *avctx)
@@ -1004,6 +1000,8 @@ static av_cold int on2avc_decode_close(AVCodecContext *avctx)
     ff_fft_end(&c->fft512);
     ff_fft_end(&c->fft1024);
 
+    av_freep(&c->fdsp);
+
     on2avc_free_vlcs(c);
 
     return 0;
@@ -1020,6 +1018,7 @@ AVCodec ff_on2avc_decoder = {
     .decode         = on2avc_decode_frame,
     .close          = on2avc_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
diff --git a/libavcodec/on2avcdata.c b/libavcodec/on2avcdata.c
index 93543ad..ec98357 100644
--- a/libavcodec/on2avcdata.c
+++ b/libavcodec/on2avcdata.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/on2avcdata.h b/libavcodec/on2avcdata.h
index ff66cca..dc7833f 100644
--- a/libavcodec/on2avcdata.h
+++ b/libavcodec/on2avcdata.h
@@ -3,25 +3,25 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_ON2AVC_DATA_H
-#define AVCODEC_ON2AVC_DATA_H
+#ifndef AVCODEC_ON2AVCDATA_H
+#define AVCODEC_ON2AVCDATA_H
 
 #include <stdint.h>
 
@@ -79,4 +79,4 @@ extern const float ff_on2avc_ctab_2[2048];
 extern const float ff_on2avc_ctab_3[2048];
 extern const float ff_on2avc_ctab_4[2048];
 
-#endif /* AVCODEC_ON2AVC_DATA_H */
+#endif /* AVCODEC_ON2AVCDATA_H */
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 765bcb7..41b6052 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,6 @@
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
-#include <float.h>              /* FLT_MIN, FLT_MAX */
 #include <string.h>
 
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -70,6 +69,13 @@ static const AVClass *codec_child_class_next(const AVClass *prev)
     return NULL;
 }
 
+static AVClassCategory get_category(void *ptr)
+{
+    AVCodecContext* avctx = ptr;
+    if(avctx->codec && avctx->codec->decode) return AV_CLASS_CATEGORY_DECODER;
+    else                                     return AV_CLASS_CATEGORY_ENCODER;
+}
+
 static const AVClass av_codec_context_class = {
     .class_name              = "AVCodecContext",
     .item_name               = context_to_name,
@@ -78,20 +84,34 @@ static const AVClass av_codec_context_class = {
     .log_level_offset_offset = offsetof(AVCodecContext, log_level_offset),
     .child_next              = codec_child_next,
     .child_class_next        = codec_child_class_next,
+    .category                = AV_CLASS_CATEGORY_ENCODER,
+    .get_category            = get_category,
 };
 
 static int init_context_defaults(AVCodecContext *s, const AVCodec *codec)
 {
+    int flags=0;
     memset(s, 0, sizeof(AVCodecContext));
 
     s->av_class = &av_codec_context_class;
 
     s->codec_type = codec ? codec->type : AVMEDIA_TYPE_UNKNOWN;
-    s->codec      = codec;
-    av_opt_set_defaults(s);
+    if (codec) {
+        s->codec = codec;
+        s->codec_id = codec->id;
+    }
+
+    if(s->codec_type == AVMEDIA_TYPE_AUDIO)
+        flags= AV_OPT_FLAG_AUDIO_PARAM;
+    else if(s->codec_type == AVMEDIA_TYPE_VIDEO)
+        flags= AV_OPT_FLAG_VIDEO_PARAM;
+    else if(s->codec_type == AVMEDIA_TYPE_SUBTITLE)
+        flags= AV_OPT_FLAG_SUBTITLE_PARAM;
+    av_opt_set_defaults2(s, flags, flags);
 
     s->time_base           = (AVRational){0,1};
     s->framerate           = (AVRational){ 0, 1 };
+    s->pkt_timebase        = (AVRational){ 0, 1 };
     s->get_buffer2         = avcodec_default_get_buffer2;
     s->get_format          = avcodec_default_get_format;
     s->execute             = avcodec_default_execute;
@@ -159,11 +179,39 @@ void avcodec_free_context(AVCodecContext **pavctx)
 
     av_freep(&avctx->extradata);
     av_freep(&avctx->subtitle_header);
+    av_freep(&avctx->intra_matrix);
+    av_freep(&avctx->inter_matrix);
+    av_freep(&avctx->rc_override);
 
     av_freep(pavctx);
 }
 
 #if FF_API_COPY_CONTEXT
+static void copy_context_reset(AVCodecContext *avctx)
+{
+    int i;
+
+    av_opt_free(avctx);
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    av_frame_free(&avctx->coded_frame);
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    av_freep(&avctx->rc_override);
+    av_freep(&avctx->intra_matrix);
+    av_freep(&avctx->inter_matrix);
+    av_freep(&avctx->extradata);
+    av_freep(&avctx->subtitle_header);
+    av_buffer_unref(&avctx->hw_frames_ctx);
+    av_buffer_unref(&avctx->hw_device_ctx);
+    for (i = 0; i < avctx->nb_coded_side_data; i++)
+        av_freep(&avctx->coded_side_data[i].data);
+    av_freep(&avctx->coded_side_data);
+    avctx->subtitle_header_size = 0;
+    avctx->nb_coded_side_data = 0;
+    avctx->extradata_size = 0;
+}
+
 int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
 {
     const AVCodec *orig_codec = dest->codec;
@@ -175,23 +223,40 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
                src, dest);
         return AVERROR(EINVAL);
     }
+
+    copy_context_reset(dest);
+
     memcpy(dest, src, sizeof(*dest));
+    av_opt_copy(dest, src);
 
     dest->priv_data       = orig_priv_data;
     dest->codec           = orig_codec;
 
+    if (orig_priv_data && src->codec && src->codec->priv_class &&
+        dest->codec && dest->codec->priv_class)
+        av_opt_copy(orig_priv_data, src->priv_data);
+
+
     /* set values specific to opened codecs back to their default state */
     dest->slice_offset    = NULL;
     dest->hwaccel         = NULL;
     dest->internal        = NULL;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    dest->coded_frame     = NULL;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
     /* reallocate values that should be allocated separately */
     dest->extradata       = NULL;
+    dest->coded_side_data = NULL;
     dest->intra_matrix    = NULL;
     dest->inter_matrix    = NULL;
     dest->rc_override     = NULL;
     dest->subtitle_header = NULL;
     dest->hw_frames_ctx   = NULL;
+    dest->hw_device_ctx   = NULL;
+    dest->nb_coded_side_data = 0;
 
 #define alloc_and_copy_or_fail(obj, size, pad) \
     if (src->obj && size > 0) { \
@@ -204,11 +269,12 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
     }
     alloc_and_copy_or_fail(extradata,    src->extradata_size,
                            AV_INPUT_BUFFER_PADDING_SIZE);
+    dest->extradata_size  = src->extradata_size;
     alloc_and_copy_or_fail(intra_matrix, 64 * sizeof(int16_t), 0);
     alloc_and_copy_or_fail(inter_matrix, 64 * sizeof(int16_t), 0);
     alloc_and_copy_or_fail(rc_override,  src->rc_override_count * sizeof(*src->rc_override), 0);
-    alloc_and_copy_or_fail(subtitle_header, src->subtitle_header_size, 0);
-    dest->subtitle_header_size = src->subtitle_header_size;
+    alloc_and_copy_or_fail(subtitle_header, src->subtitle_header_size, 1);
+    av_assert0(dest->subtitle_header_size == src->subtitle_header_size);
 #undef alloc_and_copy_or_fail
 
     if (src->hw_frames_ctx) {
@@ -220,12 +286,7 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
     return 0;
 
 fail:
-    av_freep(&dest->subtitle_header);
-    av_freep(&dest->rc_override);
-    av_freep(&dest->intra_matrix);
-    av_freep(&dest->inter_matrix);
-    av_freep(&dest->extradata);
-    av_buffer_unref(&dest->hw_frames_ctx);
+    copy_context_reset(dest);
     return AVERROR(ENOMEM);
 }
 #endif
@@ -234,3 +295,224 @@ const AVClass *avcodec_get_class(void)
 {
     return &av_codec_context_class;
 }
+
+#define FOFFSET(x) offsetof(AVFrame,x)
+
+static const AVOption frame_options[]={
+{"best_effort_timestamp", "", FOFFSET(best_effort_timestamp), AV_OPT_TYPE_INT64, {.i64 = AV_NOPTS_VALUE }, INT64_MIN, INT64_MAX, 0},
+{"pkt_pos", "", FOFFSET(pkt_pos), AV_OPT_TYPE_INT64, {.i64 = -1 }, INT64_MIN, INT64_MAX, 0},
+{"pkt_size", "", FOFFSET(pkt_size), AV_OPT_TYPE_INT64, {.i64 = -1 }, INT64_MIN, INT64_MAX, 0},
+{"sample_aspect_ratio", "", FOFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"width", "", FOFFSET(width), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"height", "", FOFFSET(height), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"format", "", FOFFSET(format), AV_OPT_TYPE_INT, {.i64 = -1 }, 0, INT_MAX, 0},
+{"channel_layout", "", FOFFSET(channel_layout), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, 0},
+{"sample_rate", "", FOFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{NULL},
+};
+
+static const AVClass av_frame_class = {
+    .class_name              = "AVFrame",
+    .item_name               = NULL,
+    .option                  = frame_options,
+    .version                 = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_get_frame_class(void)
+{
+    return &av_frame_class;
+}
+
+#define SROFFSET(x) offsetof(AVSubtitleRect,x)
+
+static const AVOption subtitle_rect_options[]={
+{"x", "", SROFFSET(x), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"y", "", SROFFSET(y), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"w", "", SROFFSET(w), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"h", "", SROFFSET(h), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"type", "", SROFFSET(type), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"flags", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0, "flags"},
+{"forced", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0},
+{NULL},
+};
+
+static const AVClass av_subtitle_rect_class = {
+    .class_name             = "AVSubtitleRect",
+    .item_name              = NULL,
+    .option                 = subtitle_rect_options,
+    .version                = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_get_subtitle_rect_class(void)
+{
+    return &av_subtitle_rect_class;
+}
+
+#ifdef TEST
+static int dummy_init(AVCodecContext *ctx)
+{
+    //TODO: this code should set every possible pointer that could be set by codec and is not an option;
+    ctx->extradata_size = 8;
+    ctx->extradata = av_malloc(ctx->extradata_size);
+    return 0;
+}
+
+static int dummy_close(AVCodecContext *ctx)
+{
+    av_freep(&ctx->extradata);
+    ctx->extradata_size = 0;
+    return 0;
+}
+
+static int dummy_encode(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame, int *got_packet)
+{
+    return AVERROR(ENOSYS);
+}
+
+typedef struct Dummy12Context {
+    AVClass  *av_class;
+    int      num;
+    char*    str;
+} Dummy12Context;
+
+typedef struct Dummy3Context {
+    void     *fake_av_class;
+    int      num;
+    char*    str;
+} Dummy3Context;
+
+#define OFFSET(x) offsetof(Dummy12Context, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption dummy_options[] = {
+    { "str", "set str", OFFSET(str), AV_OPT_TYPE_STRING, { .str = "i'm src default value" }, 0, 0, VE},
+    { "num", "set num", OFFSET(num), AV_OPT_TYPE_INT,    { .i64 = 1500100900 },    0, INT_MAX, VE},
+    { NULL },
+};
+
+static const AVClass dummy_v1_class = {
+    .class_name = "dummy_v1_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass dummy_v2_class = {
+    .class_name = "dummy_v2_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* codec with options */
+static AVCodec dummy_v1_encoder = {
+    .name             = "dummy_v1_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 1,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v1_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with options, different class */
+static AVCodec dummy_v2_encoder = {
+    .name             = "dummy_v2_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 2,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v2_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with priv data, but no class */
+static AVCodec dummy_v3_encoder = {
+    .name             = "dummy_v3_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 3,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_data_size   = sizeof(Dummy3Context),
+};
+
+/* codec without priv data */
+static AVCodec dummy_v4_encoder = {
+    .name             = "dummy_v4_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 4,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+};
+
+static void test_copy_print_codec(const AVCodecContext *ctx)
+{
+    printf("%-14s: %dx%d prv: %s",
+           ctx->codec ? ctx->codec->name : "NULL",
+           ctx->width, ctx->height,
+           ctx->priv_data ? "set" : "null");
+    if (ctx->codec && ctx->codec->priv_class && ctx->codec->priv_data_size) {
+        int64_t i64;
+        char *str = NULL;
+        av_opt_get_int(ctx->priv_data, "num", 0, &i64);
+        av_opt_get(ctx->priv_data, "str", 0, (uint8_t**)&str);
+        printf(" opts: %"PRId64" %s", i64, str);
+        av_free(str);
+    }
+    printf("\n");
+}
+
+static void test_copy(const AVCodec *c1, const AVCodec *c2)
+{
+    AVCodecContext *ctx1, *ctx2;
+    printf("%s -> %s\nclosed:\n", c1 ? c1->name : "NULL", c2 ? c2->name : "NULL");
+    ctx1 = avcodec_alloc_context3(c1);
+    ctx2 = avcodec_alloc_context3(c2);
+    ctx1->width = ctx1->height = 128;
+    if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+        av_opt_set(ctx2->priv_data, "num", "667", 0);
+        av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+    }
+    avcodec_copy_context(ctx2, ctx1);
+    test_copy_print_codec(ctx1);
+    test_copy_print_codec(ctx2);
+    if (ctx1->codec) {
+        printf("opened:\n");
+        avcodec_open2(ctx1, ctx1->codec, NULL);
+        if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+            av_opt_set(ctx2->priv_data, "num", "667", 0);
+            av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+        }
+        avcodec_copy_context(ctx2, ctx1);
+        test_copy_print_codec(ctx1);
+        test_copy_print_codec(ctx2);
+        avcodec_close(ctx1);
+    }
+    avcodec_free_context(&ctx1);
+    avcodec_free_context(&ctx2);
+}
+
+int main(void)
+{
+    AVCodec *dummy_codec[] = {
+        &dummy_v1_encoder,
+        &dummy_v2_encoder,
+        &dummy_v3_encoder,
+        &dummy_v4_encoder,
+        NULL,
+    };
+    int i, j;
+
+    for (i = 0; dummy_codec[i]; i++)
+        avcodec_register(dummy_codec[i]);
+
+    printf("testing avcodec_copy_context()\n");
+    for (i = 0; i < FF_ARRAY_ELEMS(dummy_codec); i++)
+        for (j = 0; j < FF_ARRAY_ELEMS(dummy_codec); j++)
+            test_copy(dummy_codec[i], dummy_codec[j]);
+    return 0;
+}
+#endif
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index 4b0a834..a3235bc 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,12 +42,13 @@
 #define AV_CODEC_DEFAULT_BITRATE 200*1000
 
 static const AVOption avcodec_options[] = {
-{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE }, INT_MIN, INT_MAX, V|A|E},
+{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = AV_CODEC_DEFAULT_BITRATE }, 0, INT64_MAX, A|V|E},
+{"ab", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = 128*1000 }, 0, INT_MAX, A|E},
 {"bt", "Set video bitrate tolerance (in bits/s). In 1-pass mode, bitrate tolerance specifies how far "
        "ratecontrol is willing to deviate from the target average bitrate value. This is not related "
        "to minimum/maximum bitrate. Lowering tolerance too much has an adverse effect on quality.",
        OFFSET(bit_rate_tolerance), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE*20 }, 1, INT_MAX, V|E},
-{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|E|D, "flags"},
+{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|S|E|D, "flags"},
 {"unaligned", "allow decoders to produce unaligned output", 0, AV_OPT_TYPE_CONST, { .i64 = AV_CODEC_FLAG_UNALIGNED }, INT_MIN, INT_MAX, V | D, "flags" },
 {"mv4", "use four motion vectors per macroblock (MPEG-4)", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_4MV }, INT_MIN, INT_MAX, V|E, "flags"},
 {"qpel", "use 1/4-pel motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QPEL }, INT_MIN, INT_MAX, V|E, "flags"},
@@ -54,7 +58,7 @@ static const AVOption avcodec_options[] = {
 {"pass2", "use internal 2-pass ratecontrol in second pass mode", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PASS2 }, INT_MIN, INT_MAX, 0, "flags"},
 {"gray", "only decode/encode grayscale", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_GRAY }, INT_MIN, INT_MAX, V|E|D, "flags"},
 {"psnr", "error[?] variables will be set during encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PSNR }, INT_MIN, INT_MAX, V|E, "flags"},
-{"truncated", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, 0, "flags"},
+{"truncated", "Input bitstream might be randomly truncated", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, V|D, "flags"},
 {"ildct", "use interlaced DCT", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_INTERLACED_DCT }, INT_MIN, INT_MAX, V|E, "flags"},
 {"low_delay", "force low delay", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_LOW_DELAY }, INT_MIN, INT_MAX, V|D|E, "flags"},
 {"global_header", "place global headers in extradata instead of every keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_GLOBAL_HEADER }, INT_MIN, INT_MAX, V|A|E, "flags"},
@@ -63,17 +67,22 @@ static const AVOption avcodec_options[] = {
 {"ilme", "interlaced motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_INTERLACED_ME }, INT_MIN, INT_MAX, V|E, "flags"},
 {"cgop", "closed GOP", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_CLOSED_GOP }, INT_MIN, INT_MAX, V|E, "flags"},
 {"output_corrupt", "Output even potentially corrupted frames", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_OUTPUT_CORRUPT }, INT_MIN, INT_MAX, V|D, "flags"},
+{"flags2", NULL, OFFSET(flags2), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT}, 0, UINT_MAX, V|A|E|D, "flags2"},
 {"fast", "allow non-spec-compliant speedup tricks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_FAST }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"},
-{"ignorecrop", "ignore cropping information from sps", 1, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"ignorecrop", "ignore cropping information from sps", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"},
 {"local_header", "place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"},
-{"extradata_size", NULL, OFFSET(extradata_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-{"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, INT_MIN, INT_MAX},
+{"chunks", "Frame data might be split into multiple chunks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_CHUNKS }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"showall", "Show all frames before the first keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SHOW_ALL }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"export_mvs", "export motion vectors through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_EXPORT_MVS}, INT_MIN, INT_MAX, V|D, "flags2"},
+{"skip_manual", "do not skip samples and export skip information as frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SKIP_MANUAL}, INT_MIN, INT_MAX, V|D, "flags2"},
+{"ass_ro_flush_noop", "do not reset ASS ReadOrder field on flush", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_RO_FLUSH_NOOP}, INT_MIN, INT_MAX, S|D, "flags2"},
+{"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, INT_MAX},
 {"g", "set the group of picture (GOP) size", OFFSET(gop_size), AV_OPT_TYPE_INT, {.i64 = 12 }, INT_MIN, INT_MAX, V|E},
-{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|D|E},
-{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|D|E},
+{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
+{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
 {"cutoff", "set cutoff bandwidth", OFFSET(cutoff), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E},
-{"frame_size", NULL, OFFSET(frame_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E},
+{"frame_size", NULL, OFFSET(frame_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|E},
 {"frame_number", NULL, OFFSET(frame_number), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"delay", NULL, OFFSET(delay), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"qcomp", "video quantizer scale compression (VBR). Constant of ratecontrol equation. "
@@ -81,9 +90,9 @@ static const AVOption avcodec_options[] = {
           OFFSET(qcompress), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -FLT_MAX, FLT_MAX, V|E},
 {"qblur", "video quantizer scale blur (VBR)", OFFSET(qblur), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -1, FLT_MAX, V|E},
 {"qmin", "minimum video quantizer scale (VBR)", OFFSET(qmin), AV_OPT_TYPE_INT, {.i64 = 2 }, -1, 69, V|E},
-{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 69, V|E},
+{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 1024, V|E},
 {"qdiff", "maximum difference between the quantizer scales (VBR)", OFFSET(max_qdiff), AV_OPT_TYPE_INT, {.i64 = 3 }, INT_MIN, INT_MAX, V|E},
-{"bf", "use 'frames' B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
+{"bf", "set maximum number of B-frames between non-B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
 {"b_qfactor", "QP factor between P- and B-frames", OFFSET(b_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
 #if FF_API_PRIVATE_OPT
 {"b_strategy", "strategy to choose between I/P/B-frames", OFFSET(b_frame_strategy), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, V|E},
@@ -116,32 +125,37 @@ static const AVOption avcodec_options[] = {
 {"dc_clip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_DC_CLIP }, INT_MIN, INT_MAX, V|D, "bug"},
 {"ms", "work around various bugs in Microsoft's broken decoders", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_MS }, INT_MIN, INT_MAX, V|D, "bug"},
 {"trunc", "truncated frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_TRUNCATED}, INT_MIN, INT_MAX, V|D, "bug"},
+{"iedge", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_IEDGE }, INT_MIN, INT_MAX, V|D, "bug"},
 {"strict", "how strictly to follow the standards", OFFSET(strict_std_compliance), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
-{"very", "strictly conform to a older more strict version of the spec or reference software", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_VERY_STRICT }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"strict", "strictly conform to all the things in the spec no matter what the consequences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_STRICT }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"normal", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_NORMAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"unofficial", "allow unofficial extensions", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_UNOFFICIAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"experimental", "allow non-standardized experimental things", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_EXPERIMENTAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
+{"very", "strictly conform to a older more strict version of the spec or reference software", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_VERY_STRICT }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"strict", "strictly conform to all the things in the spec no matter what the consequences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_STRICT }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"normal", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_NORMAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"unofficial", "allow unofficial extensions", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_UNOFFICIAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"experimental", "allow non-standardized experimental things", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_EXPERIMENTAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
 {"b_qoffset", "QP offset between P- and B-frames", OFFSET(b_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
-{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, A|V|D, "err_detect"},
-{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"has_b_frames", NULL, OFFSET(has_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-{"block_align", NULL, OFFSET(block_align), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
+{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0 }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"ignore_err", "ignore errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_IGNORE_ERR }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"careful",    "consider things that violate the spec, are fast to check and have not been seen in the wild as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CAREFUL }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"compliant",  "consider all spec non compliancies as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_COMPLIANT }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"aggressive", "consider things that a sane encoder should not do as an error", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_AGGRESSIVE }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"has_b_frames", NULL, OFFSET(has_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX},
+{"block_align", NULL, OFFSET(block_align), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX},
 #if FF_API_PRIVATE_OPT
 {"mpeg_quant", "use MPEG quantizers instead of H.263", OFFSET(mpeg_quant), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
 {"rc_override_count", NULL, OFFSET(rc_override_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-{"maxrate", "Set maximum bitrate tolerance (in bits/s). Requires bufsize to be set.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
-{"minrate", "Set minimum bitrate tolerance (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
-            OFFSET(rc_min_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
+{"maxrate", "maximum bitrate (in bits/s). Used for VBV together with bufsize.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT_MAX, V|A|E},
+{"minrate", "minimum bitrate (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
+            OFFSET(rc_min_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
 {"bufsize", "set ratecontrol buffer size (in bits)", OFFSET(rc_buffer_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|E},
 {"i_qfactor", "QP factor between P- and I-frames", OFFSET(i_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = -0.8 }, -FLT_MAX, FLT_MAX, V|E},
 {"i_qoffset", "QP offset between P- and I-frames", OFFSET(i_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 0.0 }, -FLT_MAX, FLT_MAX, V|E},
 {"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"},
-{"auto", "autoselect a good one (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
+{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
 {"fastint", "fast integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"},
 {"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"},
 {"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"},
@@ -164,12 +178,15 @@ static const AVOption avcodec_options[] = {
 {"simplearmv6", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"simpleneon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"xvid", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"xvidmmx", "deprecated, for compatibility only", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"faani", "floating point AAN IDCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
+{"simpleauto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"slice_count", NULL, OFFSET(slice_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"ec", "set error concealment strategy", OFFSET(error_concealment), AV_OPT_TYPE_FLAGS, {.i64 = 3 }, INT_MIN, INT_MAX, V|D, "ec"},
 {"guess_mvs", "iterative motion vector (MV) search (slow)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_GUESS_MVS }, INT_MIN, INT_MAX, V|D, "ec"},
 {"deblock", "use strong deblock filter for damaged MBs", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_DEBLOCK }, INT_MIN, INT_MAX, V|D, "ec"},
-{"bits_per_coded_sample", NULL, OFFSET(bits_per_coded_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
+{"favor_inter", "favor predicting from the previous frame", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_FAVOR_INTER }, INT_MIN, INT_MAX, V|D, "ec"},
+{"bits_per_coded_sample", NULL, OFFSET(bits_per_coded_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX},
 #if FF_API_PRIVATE_OPT
 {"pred", "prediction method", OFFSET(prediction_method), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "pred"},
 {"left", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PRED_LEFT }, INT_MIN, INT_MAX, V|E, "pred"},
@@ -177,43 +194,35 @@ static const AVOption avcodec_options[] = {
 {"median", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PRED_MEDIAN }, INT_MIN, INT_MAX, V|E, "pred"},
 #endif
 {"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E},
+{"sar",    "sample aspect ratio", OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E},
 {"debug", "print specific debug info", OFFSET(debug), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, INT_MAX, V|A|S|E|D, "debug"},
 {"pict", "picture info", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_PICT_INFO }, INT_MIN, INT_MAX, V|D, "debug"},
 {"rc", "rate control", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_RC }, INT_MIN, INT_MAX, V|E, "debug"},
 {"bitstream", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BITSTREAM }, INT_MIN, INT_MAX, V|D, "debug"},
 {"mb_type", "macroblock (MB) type", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MB_TYPE }, INT_MIN, INT_MAX, V|D, "debug"},
 {"qp", "per-block quantization parameter (QP)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_QP }, INT_MIN, INT_MAX, V|D, "debug"},
+#if FF_API_DEBUG_MV
+{"mv", "motion vector", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MV }, INT_MIN, INT_MAX, V|D, "debug"},
+#endif
 {"dct_coeff", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_DCT_COEFF }, INT_MIN, INT_MAX, V|D, "debug"},
+{"green_metadata", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_GREEN_MD }, INT_MIN, INT_MAX, V|D, "debug"},
 {"skip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_SKIP }, INT_MIN, INT_MAX, V|D, "debug"},
 {"startcode", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_STARTCODE }, INT_MIN, INT_MAX, V|D, "debug"},
 {"er", "error recognition", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_ER }, INT_MIN, INT_MAX, V|D, "debug"},
 {"mmco", "memory management control operations (H.264)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MMCO }, INT_MIN, INT_MAX, V|D, "debug"},
 {"bugs", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUGS }, INT_MIN, INT_MAX, V|D, "debug"},
+#if FF_API_DEBUG_MV
+{"vis_qp", "visualize quantization parameter (QP), lower QP are tinted greener", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_QP }, INT_MIN, INT_MAX, V|D, "debug"},
+{"vis_mb_type", "visualize block types", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MB_TYPE }, INT_MIN, INT_MAX, V|D, "debug"},
+#endif
 {"buffers", "picture buffer allocations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUFFERS }, INT_MIN, INT_MAX, V|D, "debug"},
-{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|D, "debug"},
-{"cmp", "full-pel ME compare function", OFFSET(me_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"subcmp", "sub-pel ME compare function", OFFSET(me_sub_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"mbcmp", "macroblock compare function", OFFSET(mb_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"ildctcmp", "interlaced DCT compare function", OFFSET(ildct_cmp), AV_OPT_TYPE_INT, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|A|D, "debug"},
+{"nomc", "skip motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_NOMC }, INT_MIN, INT_MAX, V|A|D, "debug"},
 {"dia_size", "diamond type & size for motion estimation", OFFSET(dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"last_pred", "amount of motion predictors from the previous frame", OFFSET(last_predictor_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #if FF_API_PRIVATE_OPT
 {"preme", "pre motion estimation", OFFSET(pre_me), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
-{"precmp", "pre motion estimation compare function", OFFSET(me_pre_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"sad", "sum of absolute differences, fast (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"sse", "sum of squared errors", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"satd", "sum of absolute Hadamard transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SATD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"dct", "sum of absolute DCT transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"psnr", "sum of squared quantization errors (avoid, low quality)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_PSNR }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"bit", "number of bits needed for the block", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_BIT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"rd", "rate distortion optimal, slow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_RD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"zero", "0", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_ZERO }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"vsad", "sum of absolute vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"vsse", "sum of squared vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"nsse", "noise preserving sum of squared differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_NSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"pre_dia_size", "diamond type & size for motion estimation pre-pass", OFFSET(pre_dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"subq", "sub-pel motion estimation quality", OFFSET(me_subpel_quality), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E},
 {"me_range", "limit motion vectors range (1023 for DivX player)", OFFSET(me_range), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -229,8 +238,8 @@ static const AVOption avcodec_options[] = {
 {"context", "context model", OFFSET(context_model), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
 {"slice_flags", NULL, OFFSET(slice_flags), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "mbd"},
-{"simple", "use mbcmp (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"},
+{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 2, V|E, "mbd"},
+{"simple", "use mbcmp", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"},
 {"bits", "use fewest bits", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_BITS }, INT_MIN, INT_MAX, V|E, "mbd"},
 {"rd", "use best rate distortion", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_RD }, INT_MIN, INT_MAX, V|E, "mbd"},
 #if FF_API_PRIVATE_OPT
@@ -240,10 +249,9 @@ static const AVOption avcodec_options[] = {
 {"nr", "noise reduction", OFFSET(noise_reduction), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
 {"rc_init_occupancy", "number of bits which should be loaded into the rc buffer before decoding starts", OFFSET(rc_initial_buffer_occupancy), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-{"flags2", NULL, OFFSET(flags2), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT}, 0, UINT_MAX, V|A|E|D, "flags2"},
-{"threads", NULL, OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|E|D, "threads"},
+{"threads", "set the number of threads", OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|A|E|D, "threads"},
 {"auto", "autodetect a suitable number of threads to use", 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, INT_MIN, INT_MAX, V|E|D, "threads"},
-{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, V|E},
+{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, -8, 16, V|E},
 {"nssew", "nsse weight", OFFSET(nsse_weight), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E},
 {"skip_top", "number of macroblock rows at the top which are skipped", OFFSET(skip_top), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
 {"skip_bottom", "number of macroblock rows at the bottom which are skipped", OFFSET(skip_bottom), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
@@ -264,34 +272,64 @@ static const AVOption avcodec_options[] = {
 {"dts_96_24", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_96_24 }, INT_MIN, INT_MAX, A|E, "profile"},
 {"dts_hd_hra", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_HD_HRA }, INT_MIN, INT_MAX, A|E, "profile"},
 {"dts_hd_ma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_HD_MA }, INT_MIN, INT_MAX, A|E, "profile"},
-{"main10", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_HEVC_MAIN_10 }, INT_MIN, INT_MAX, A|E, "profile"},
+{"mpeg4_sp",   NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_core", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_CORE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_main", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_MAIN }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_asp",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_ADVANCED_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"main10",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_HEVC_MAIN_10 }, INT_MIN, INT_MAX, V|E, "profile"},
+{"msbc",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_SBC_MSBC }, INT_MIN, INT_MAX, A|E, "profile"},
 {"level", NULL, OFFSET(level), AV_OPT_TYPE_INT, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
 {"unknown", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
+{"lowres", "decode at 1= 1/2, 2=1/4, 3=1/8 resolutions", OFFSET(lowres), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|A|D},
 #if FF_API_PRIVATE_OPT
 {"skip_threshold", "frame skip threshold", OFFSET(frame_skip_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"skip_factor", "frame skip factor", OFFSET(frame_skip_factor), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"skip_exp", "frame skip exponent", OFFSET(frame_skip_exp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"skipcmp", "frame skip compare function", OFFSET(frame_skip_cmp), AV_OPT_TYPE_INT, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 #endif
+{"cmp", "full-pel ME compare function", OFFSET(me_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"subcmp", "sub-pel ME compare function", OFFSET(me_sub_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"mbcmp", "macroblock compare function", OFFSET(mb_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"ildctcmp", "interlaced DCT compare function", OFFSET(ildct_cmp), AV_OPT_TYPE_INT, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"precmp", "pre motion estimation compare function", OFFSET(me_pre_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"sad", "sum of absolute differences, fast", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"sse", "sum of squared errors", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"satd", "sum of absolute Hadamard transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SATD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"dct", "sum of absolute DCT transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"psnr", "sum of squared quantization errors (avoid, low quality)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_PSNR }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"bit", "number of bits needed for the block", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_BIT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"rd", "rate distortion optimal, slow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_RD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"zero", "0", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_ZERO }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"vsad", "sum of absolute vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"vsse", "sum of squared vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"nsse", "noise preserving sum of squared differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_NSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#if CONFIG_SNOW_ENCODER
+{"w53", "5/3 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W53 }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"w97", "9/7 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W97 }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#endif
+{"dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"msad", "sum of absolute differences, median predicted", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_MEDIAN_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"mblmin", "minimum macroblock Lagrange factor (VBR)", OFFSET(mb_lmin), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 2 }, 1, FF_LAMBDA_MAX, V|E},
 {"mblmax", "maximum macroblock Lagrange factor (VBR)", OFFSET(mb_lmax), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 31 }, 1, FF_LAMBDA_MAX, V|E},
 #if FF_API_PRIVATE_OPT
 {"mepc", "motion estimation bitrate penalty compensation (1.0 = 256)", OFFSET(me_penalty_compensation), AV_OPT_TYPE_INT, {.i64 = 256 }, INT_MIN, INT_MAX, V|E},
 #endif
-{"skip_loop_filter", NULL, OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"skip_idct"       , NULL, OFFSET(skip_idct)       , AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"skip_frame"      , NULL, OFFSET(skip_frame)      , AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"none"            , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE    }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"default"         , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"noref"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"bidir"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR   }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"nokey"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"all"             , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL     }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_loop_filter", "skip loop filtering process for the selected frames", OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_idct"       , "skip IDCT/dequantization for the selected frames",    OFFSET(skip_idct),        AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_frame"      , "skip decoding for the selected frames",               OFFSET(skip_frame),       AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"none"            , "discard no frame",                    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE    }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"default"         , "discard useless frames",              0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"noref"           , "discard all non-reference frames",    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"bidir"           , "discard all bidirectional frames",    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR   }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"nokey"           , "discard all frames except keyframes", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"nointra"         , "discard all frames except I frames",  0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONINTRA}, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"all"             , "discard all frames",                  0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL     }, INT_MIN, INT_MAX, V|D, "avdiscard"},
 {"bidir_refine", "refine the two motion vectors used in bidirectional macroblocks", OFFSET(bidir_refine), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, 4, V|E},
 #if FF_API_PRIVATE_OPT
 {"brd_scale", "downscale frames for dynamic B-frame decision", OFFSET(brd_scale), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 10, V|E},
 #endif
-{"keyint_min", "minimum interval between IDR-frames (x264)", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E},
+{"keyint_min", "minimum interval between IDR-frames", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E},
 {"refs", "reference frames to consider for motion compensation", OFFSET(refs), AV_OPT_TYPE_INT, {.i64 = 1 }, INT_MIN, INT_MAX, V|E},
 #if FF_API_PRIVATE_OPT
 {"chromaoffset", "chroma QP offset from luma", OFFSET(chromaoffset), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -305,29 +343,29 @@ static const AVOption avcodec_options[] = {
 #if FF_API_PRIVATE_OPT
 {"min_prediction_order", NULL, OFFSET(min_prediction_order), AV_OPT_TYPE_INT, {.i64 = -1 }, INT_MIN, INT_MAX, A|E},
 {"max_prediction_order", NULL, OFFSET(max_prediction_order), AV_OPT_TYPE_INT, {.i64 = -1 }, INT_MIN, INT_MAX, A|E},
-{"timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, V|E},
+{"timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = -1 }, -1, INT64_MAX, V|E},
 #endif
-{"bits_per_raw_sample", NULL, OFFSET(bits_per_raw_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-{"channel_layout", NULL, OFFSET(channel_layout), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT64_MAX, A|E|D, "channel_layout"},
-{"request_channel_layout", NULL, OFFSET(request_channel_layout), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT64_MAX, A|D, "request_channel_layout"},
-{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 1.0/3 }, 0.0, FLT_MAX, V|E},
+{"bits_per_raw_sample", NULL, OFFSET(bits_per_raw_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX},
+{"channel_layout", NULL, OFFSET(channel_layout), AV_OPT_TYPE_UINT64, {.i64 = DEFAULT }, 0, UINT64_MAX, A|E|D, "channel_layout"},
+{"request_channel_layout", NULL, OFFSET(request_channel_layout), AV_OPT_TYPE_UINT64, {.i64 = DEFAULT }, 0, UINT64_MAX, A|D, "request_channel_layout"},
+{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, 0.0, FLT_MAX, V|E},
 {"rc_min_vbv_use", NULL, OFFSET(rc_min_vbv_overflow_use),  AV_OPT_TYPE_FLOAT, {.dbl = 3 },     0.0, FLT_MAX, V|E},
 {"ticks_per_frame", NULL, OFFSET(ticks_per_frame), AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, A|V|E|D},
 {"color_primaries", "color primaries", OFFSET(color_primaries), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_UNSPECIFIED }, 1, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt709",       "BT.709",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"unknown",     "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt470m",      "BT.470 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt470bg",     "BT.470 BG",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte170m",   "SMPTE 170 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte240m",   "SMPTE 240 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"film",        "Film",        0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM },        INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt2020",      "BT.2020",     0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte428",    "SMPTE 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE428 },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte431",    "SMPTE 431-2", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE431 },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte432",    "SMPTE 422-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE432 },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"jedec-p22",    "JEDEC P22",  0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_JEDEC_P22 },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smptest428_1", "SMPTE 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE428 },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt709",       "BT.709",         0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 },        INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"unknown",     "Unspecified",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt470m",      "BT.470 M",       0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt470bg",     "BT.470 BG",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte170m",   "SMPTE 170 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte240m",   "SMPTE 240 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"film",        "Film",           0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM },         INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt2020",      "BT.2020",        0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte428",    "SMPTE 428-1",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE428 },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte428_1",  "SMPTE 428-1",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE428 },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte431",    "SMPTE 431-2",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE431 },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte432",    "SMPTE 422-1",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE432 },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"jedec-p22",   "JEDEC P22",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_JEDEC_P22 },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"unspecified", "Unspecified",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
 {"color_trc", "color transfer characteristics", OFFSET(color_trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, INT_MAX, V|E|D, "color_trc_type"},
 {"bt709",        "BT.709",           0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 },        INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"unknown",      "Unspecified",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
@@ -354,8 +392,7 @@ static const AVOption avcodec_options[] = {
 {"iec61966_2_1", "IEC 61966-2-1",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"bt2020_10bit", "BT.2020 - 10 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"bt2020_12bit", "BT.2020 - 12 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
-{"smptest2084",  "SMPTE 2084",       0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE2084 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
-{"smptest428_1", "SMPTE 428-1",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE428 },     INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
+{"smpte428_1",   "SMPTE 428-1",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE428 },     INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"colorspace", "color space", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_UNSPECIFIED }, 0, INT_MAX, V|E|D, "colorspace_type"},
 {"rgb",         "RGB",         0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_RGB },         INT_MIN, INT_MAX, V|E|D, "colorspace_type"},
 {"bt709",       "BT.709",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709 },       INT_MIN, INT_MAX, V|E|D, "colorspace_type"},
@@ -380,7 +417,7 @@ static const AVOption avcodec_options[] = {
 {"mpeg", "MPEG (219*2^(n-8))", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG },        INT_MIN, INT_MAX, V|E|D, "color_range_type"},
 {"jpeg", "JPEG (2^n-1)",       0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG },        INT_MIN, INT_MAX, V|E|D, "color_range_type"},
 {"chroma_sample_location", "chroma sample location", OFFSET(chroma_sample_location), AV_OPT_TYPE_INT, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, 0, INT_MAX, V|E|D, "chroma_sample_location_type"},
-{"unknown", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
+{"unknown",     "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"left",        "Left",        0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_LEFT },        INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"center",      "Center",      0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_CENTER },      INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"topleft",     "Top-left",    0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_TOPLEFT },     INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
@@ -389,8 +426,8 @@ static const AVOption avcodec_options[] = {
 {"bottom",      "Bottom",      0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOM },      INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"log_level_offset", "set the log level offset", OFFSET(log_level_offset), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX },
-{"slices", "number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
-{"thread_type", "select multithreading type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|E|D, "thread_type"},
+{"slices", "set the number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
+{"thread_type", "select multithreading type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|A|E|D, "thread_type"},
 {"slice", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_SLICE }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
 {"frame", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_FRAME }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
 {"audio_service_type", "audio service type", OFFSET(audio_service_type), AV_OPT_TYPE_INT, {.i64 = AV_AUDIO_SERVICE_TYPE_MAIN }, 0, AV_AUDIO_SERVICE_TYPE_NB-1, A|E, "audio_service_type"},
@@ -403,23 +440,46 @@ static const AVOption avcodec_options[] = {
 {"em", "Emergency",          0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_EMERGENCY },         INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"vo", "Voice Over",         0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_VOICE_OVER },        INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
-{"request_sample_fmt", NULL, OFFSET(request_sample_fmt), AV_OPT_TYPE_INT, {.i64 = AV_SAMPLE_FMT_NONE }, AV_SAMPLE_FMT_NONE, INT_MAX, A|D, "request_sample_fmt"},
-{"u8" , "8-bit unsigned integer", 0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_U8  }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s16", "16-bit signed integer",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S16 }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s32", "32-bit signed integer",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S32 }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"flt", "32-bit float",           0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_FLT }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"dbl", "64-bit double",          0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_DBL }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"u8p" , "8-bit unsigned integer planar", 0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_U8P  }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s16p", "16-bit signed integer planar",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S16P }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s32p", "32-bit signed integer planar",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S32P }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"fltp", "32-bit float planar",           0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_FLTP }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"dblp", "64-bit double planar",          0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_DBLP }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"refcounted_frames", NULL, OFFSET(refcounted_frames), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, A|V|D },
+{"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, INT_MAX, A|D, "request_sample_fmt"},
+{"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
+{"sub_charenc_mode", "set input text subtitles character encoding mode", OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"},
+{"do_nothing",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING},  INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"auto",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC},   INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"pre_decoder", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_PRE_DECODER}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"ignore",      NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_IGNORE},      INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+#if FF_API_ASS_TIMING
+{"sub_text_format", "set decoded text subtitle format", OFFSET(sub_text_format), AV_OPT_TYPE_INT, {.i64 = FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS}, 0, 1, S|D, "sub_text_format"},
+#else
+{"sub_text_format", "set decoded text subtitle format", OFFSET(sub_text_format), AV_OPT_TYPE_INT, {.i64 = FF_SUB_TEXT_FMT_ASS}, 0, 1, S|D, "sub_text_format"},
+#endif
+{"ass",              NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_TEXT_FMT_ASS},              INT_MIN, INT_MAX, S|D, "sub_text_format"},
+#if FF_API_ASS_TIMING
+{"ass_with_timings", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS}, INT_MIN, INT_MAX, S|D, "sub_text_format"},
+#endif
+{"refcounted_frames", NULL, OFFSET(refcounted_frames), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, A|V|D },
 #if FF_API_SIDEDATA_ONLY_PKT
-{"side_data_only_packets", NULL, OFFSET(side_data_only_packets), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, A|V|E },
+{"side_data_only_packets", NULL, OFFSET(side_data_only_packets), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, A|V|E },
 #endif
-{"apply_cropping", NULL, OFFSET(apply_cropping), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, V | D },
+{"apply_cropping", NULL, OFFSET(apply_cropping), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, V | D },
+{"skip_alpha", "Skip processing alpha", OFFSET(skip_alpha), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, V|D },
+{"field_order", "Field order", OFFSET(field_order), AV_OPT_TYPE_INT, {.i64 = AV_FIELD_UNKNOWN }, 0, 5, V|D|E, "field_order" },
+{"progressive", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_PROGRESSIVE }, 0, 0, V|D|E, "field_order" },
+{"tt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TT }, 0, 0, V|D|E, "field_order" },
+{"bb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BB }, 0, 0, V|D|E, "field_order" },
+{"tb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TB }, 0, 0, V|D|E, "field_order" },
+{"bt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BT }, 0, 0, V|D|E, "field_order" },
+{"dump_separator", "set information dump field separator", OFFSET(dump_separator), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, A|V|S|D|E},
+{"codec_whitelist", "List of decoders that are allowed to be used", OFFSET(codec_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, CHAR_MAX, A|V|S|D },
+{"pixel_format", "set pixel format", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, 0 },
+{"video_size", "set video size", OFFSET(width), AV_OPT_TYPE_IMAGE_SIZE, {.str=NULL}, 0, INT_MAX, 0 },
+{"max_pixels", "Maximum number of pixels", OFFSET(max_pixels), AV_OPT_TYPE_INT64, {.i64 = INT_MAX }, 0, INT_MAX, A|V|S|D|E },
+{"hwaccel_flags", NULL, OFFSET(hwaccel_flags), AV_OPT_TYPE_FLAGS, {.i64 = AV_HWACCEL_FLAG_IGNORE_LEVEL }, 0, UINT_MAX, V|D, "hwaccel_flags"},
+{"ignore_level", "ignore level even if the codec level used is unknown or higher than the maximum supported level reported by the hardware driver", 0, AV_OPT_TYPE_CONST, { .i64 = AV_HWACCEL_FLAG_IGNORE_LEVEL }, INT_MIN, INT_MAX, V | D, "hwaccel_flags" },
+{"allow_high_depth", "allow to output YUV pixel formats with a different chroma sampling than 4:2:0 and/or other than 8 bits per component", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_ALLOW_HIGH_DEPTH }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"},
+{"allow_profile_mismatch", "attempt to decode anyway if HW accelerated decoder's supported profiles do not exactly match the stream", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_ALLOW_PROFILE_MISMATCH }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"},
 {"extra_hw_frames", "Number of extra hardware frames to allocate for the user", OFFSET(extra_hw_frames), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, V|D },
+{"discard_damaged_percentage", "Percentage of damaged samples to discard a frame", OFFSET(discard_damaged_percentage), AV_OPT_TYPE_INT, {.i64 = 95 }, 0, 100, V|D },
 {NULL},
 };
 
diff --git a/libavcodec/opus.c b/libavcodec/opus.c
index 8e896dd..f74278a 100644
--- a/libavcodec/opus.c
+++ b/libavcodec/opus.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,11 @@
 #include <stdint.h>
 
 #include "libavutil/error.h"
+#include "libavutil/ffmath.h"
 
-#include "opus.h"
+#include "opus_celt.h"
+#include "opustab.h"
+#include "internal.h"
 #include "vorbis.h"
 
 static const uint16_t opus_frame_duration[32] = {
@@ -324,16 +327,18 @@ av_cold int ff_opus_parse_extradata(AVCodecContext *avctx,
     }
 
     avctx->delay = AV_RL16(extradata + 10);
+    if (avctx->internal)
+        avctx->internal->skip_samples = avctx->delay;
 
     channels = avctx->extradata ? extradata[9] : (avctx->channels == 1) ? 1 : 2;
     if (!channels) {
-        av_log(avctx, AV_LOG_ERROR, "Zero channel count specified in the extadata\n");
+        av_log(avctx, AV_LOG_ERROR, "Zero channel count specified in the extradata\n");
         return AVERROR_INVALIDDATA;
     }
 
     s->gain_i = AV_RL16(extradata + 16);
     if (s->gain_i)
-        s->gain = pow(10, s->gain_i / (20.0 * 256));
+        s->gain = ff_exp10(s->gain_i / (20.0 * 256));
 
     map_type = extradata[18];
     if (!map_type) {
@@ -346,7 +351,7 @@ av_cold int ff_opus_parse_extradata(AVCodecContext *avctx,
         streams        = 1;
         stereo_streams = channels - 1;
         channel_map    = default_channel_map;
-    } else if (map_type == 1 || map_type == 255) {
+    } else if (map_type == 1 || map_type == 2 || map_type == 255) {
         if (extradata_size < 21 + channels) {
             av_log(avctx, AV_LOG_ERROR, "Invalid extradata size: %d\n",
                    extradata_size);
@@ -370,6 +375,21 @@ av_cold int ff_opus_parse_extradata(AVCodecContext *avctx,
             }
             layout = ff_vorbis_channel_layouts[channels - 1];
             channel_reorder = channel_reorder_vorbis;
+        } else if (map_type == 2) {
+            int ambisonic_order = ff_sqrt(channels) - 1;
+            if (channels != ((ambisonic_order + 1) * (ambisonic_order + 1)) &&
+                channels != ((ambisonic_order + 1) * (ambisonic_order + 1) + 2)) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Channel mapping 2 is only specified for channel counts"
+                       " which can be written as (n + 1)^2 or (n + 1)^2 + 2"
+                       " for nonnegative integer n\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (channels > 227) {
+                av_log(avctx, AV_LOG_ERROR, "Too many channels\n");
+                return AVERROR_INVALIDDATA;
+            }
+            layout = 0;
         } else
             layout = 0;
 
@@ -393,6 +413,7 @@ av_cold int ff_opus_parse_extradata(AVCodecContext *avctx,
         } else if (idx >= streams + stereo_streams) {
             av_log(avctx, AV_LOG_ERROR,
                    "Invalid channel map for output channel %d: %d\n", i, idx);
+            av_freep(&s->channel_maps);
             return AVERROR_INVALIDDATA;
         }
 
@@ -421,3 +442,459 @@ av_cold int ff_opus_parse_extradata(AVCodecContext *avctx,
 
     return 0;
 }
+
+void ff_celt_quant_bands(CeltFrame *f, OpusRangeCoder *rc)
+{
+    float lowband_scratch[8 * 22];
+    float norm1[2 * 8 * 100];
+    float *norm2 = norm1 + 8 * 100;
+
+    int totalbits = (f->framebits << 3) - f->anticollapse_needed;
+
+    int update_lowband = 1;
+    int lowband_offset = 0;
+
+    int i, j;
+
+    for (i = f->start_band; i < f->end_band; i++) {
+        uint32_t cm[2] = { (1 << f->blocks) - 1, (1 << f->blocks) - 1 };
+        int band_offset = ff_celt_freq_bands[i] << f->size;
+        int band_size   = ff_celt_freq_range[i] << f->size;
+        float *X = f->block[0].coeffs + band_offset;
+        float *Y = (f->channels == 2) ? f->block[1].coeffs + band_offset : NULL;
+        float *norm_loc1, *norm_loc2;
+
+        int consumed = opus_rc_tell_frac(rc);
+        int effective_lowband = -1;
+        int b = 0;
+
+        /* Compute how many bits we want to allocate to this band */
+        if (i != f->start_band)
+            f->remaining -= consumed;
+        f->remaining2 = totalbits - consumed - 1;
+        if (i <= f->coded_bands - 1) {
+            int curr_balance = f->remaining / FFMIN(3, f->coded_bands-i);
+            b = av_clip_uintp2(FFMIN(f->remaining2 + 1, f->pulses[i] + curr_balance), 14);
+        }
+
+        if ((ff_celt_freq_bands[i] - ff_celt_freq_range[i] >= ff_celt_freq_bands[f->start_band] ||
+            i == f->start_band + 1) && (update_lowband || lowband_offset == 0))
+            lowband_offset = i;
+
+        if (i == f->start_band + 1) {
+            /* Special Hybrid Folding (RFC 8251 section 9). Copy the first band into
+            the second to ensure the second band never has to use the LCG. */
+            int count = (ff_celt_freq_range[i] - ff_celt_freq_range[i-1]) << f->size;
+
+            memcpy(&norm1[band_offset], &norm1[band_offset - count], count * sizeof(float));
+
+            if (f->channels == 2)
+                memcpy(&norm2[band_offset], &norm2[band_offset - count], count * sizeof(float));
+        }
+
+        /* Get a conservative estimate of the collapse_mask's for the bands we're
+           going to be folding from. */
+        if (lowband_offset != 0 && (f->spread != CELT_SPREAD_AGGRESSIVE ||
+                                    f->blocks > 1 || f->tf_change[i] < 0)) {
+            int foldstart, foldend;
+
+            /* This ensures we never repeat spectral content within one band */
+            effective_lowband = FFMAX(ff_celt_freq_bands[f->start_band],
+                                      ff_celt_freq_bands[lowband_offset] - ff_celt_freq_range[i]);
+            foldstart = lowband_offset;
+            while (ff_celt_freq_bands[--foldstart] > effective_lowband);
+            foldend = lowband_offset - 1;
+            while (++foldend < i && ff_celt_freq_bands[foldend] < effective_lowband + ff_celt_freq_range[i]);
+
+            cm[0] = cm[1] = 0;
+            for (j = foldstart; j < foldend; j++) {
+                cm[0] |= f->block[0].collapse_masks[j];
+                cm[1] |= f->block[f->channels - 1].collapse_masks[j];
+            }
+        }
+
+        if (f->dual_stereo && i == f->intensity_stereo) {
+            /* Switch off dual stereo to do intensity */
+            f->dual_stereo = 0;
+            for (j = ff_celt_freq_bands[f->start_band] << f->size; j < band_offset; j++)
+                norm1[j] = (norm1[j] + norm2[j]) / 2;
+        }
+
+        norm_loc1 = effective_lowband != -1 ? norm1 + (effective_lowband << f->size) : NULL;
+        norm_loc2 = effective_lowband != -1 ? norm2 + (effective_lowband << f->size) : NULL;
+
+        if (f->dual_stereo) {
+            cm[0] = f->pvq->quant_band(f->pvq, f, rc, i, X, NULL, band_size, b >> 1,
+                                       f->blocks, norm_loc1, f->size,
+                                       norm1 + band_offset, 0, 1.0f,
+                                       lowband_scratch, cm[0]);
+
+            cm[1] = f->pvq->quant_band(f->pvq, f, rc, i, Y, NULL, band_size, b >> 1,
+                                       f->blocks, norm_loc2, f->size,
+                                       norm2 + band_offset, 0, 1.0f,
+                                       lowband_scratch, cm[1]);
+        } else {
+            cm[0] = f->pvq->quant_band(f->pvq, f, rc, i, X,    Y, band_size, b >> 0,
+                                       f->blocks, norm_loc1, f->size,
+                                       norm1 + band_offset, 0, 1.0f,
+                                       lowband_scratch, cm[0] | cm[1]);
+            cm[1] = cm[0];
+        }
+
+        f->block[0].collapse_masks[i]               = (uint8_t)cm[0];
+        f->block[f->channels - 1].collapse_masks[i] = (uint8_t)cm[1];
+        f->remaining += f->pulses[i] + consumed;
+
+        /* Update the folding position only as long as we have 1 bit/sample depth */
+        update_lowband = (b > band_size << 3);
+    }
+}
+
+#define NORMC(bits) ((bits) << (f->channels - 1) << f->size >> 2)
+
+void ff_celt_bitalloc(CeltFrame *f, OpusRangeCoder *rc, int encode)
+{
+    int i, j, low, high, total, done, bandbits, remaining, tbits_8ths;
+    int skip_startband      = f->start_band;
+    int skip_bit            = 0;
+    int intensitystereo_bit = 0;
+    int dualstereo_bit      = 0;
+    int dynalloc            = 6;
+    int extrabits           = 0;
+
+    int boost[CELT_MAX_BANDS] = { 0 };
+    int trim_offset[CELT_MAX_BANDS];
+    int threshold[CELT_MAX_BANDS];
+    int bits1[CELT_MAX_BANDS];
+    int bits2[CELT_MAX_BANDS];
+
+    /* Spread */
+    if (opus_rc_tell(rc) + 4 <= f->framebits) {
+        if (encode)
+            ff_opus_rc_enc_cdf(rc, f->spread, ff_celt_model_spread);
+        else
+            f->spread = ff_opus_rc_dec_cdf(rc, ff_celt_model_spread);
+    } else {
+        f->spread = CELT_SPREAD_NORMAL;
+    }
+
+    /* Initialize static allocation caps */
+    for (i = 0; i < CELT_MAX_BANDS; i++)
+        f->caps[i] = NORMC((ff_celt_static_caps[f->size][f->channels - 1][i] + 64) * ff_celt_freq_range[i]);
+
+    /* Band boosts */
+    tbits_8ths = f->framebits << 3;
+    for (i = f->start_band; i < f->end_band; i++) {
+        int quanta = ff_celt_freq_range[i] << (f->channels - 1) << f->size;
+        int b_dynalloc = dynalloc;
+        int boost_amount = f->alloc_boost[i];
+        quanta = FFMIN(quanta << 3, FFMAX(6 << 3, quanta));
+
+        while (opus_rc_tell_frac(rc) + (b_dynalloc << 3) < tbits_8ths && boost[i] < f->caps[i]) {
+            int is_boost;
+            if (encode) {
+                is_boost = boost_amount--;
+                ff_opus_rc_enc_log(rc, is_boost, b_dynalloc);
+            } else {
+                is_boost = ff_opus_rc_dec_log(rc, b_dynalloc);
+            }
+
+            if (!is_boost)
+                break;
+
+            boost[i]   += quanta;
+            tbits_8ths -= quanta;
+
+            b_dynalloc = 1;
+        }
+
+        if (boost[i])
+            dynalloc = FFMAX(dynalloc - 1, 2);
+    }
+
+    /* Allocation trim */
+    if (opus_rc_tell_frac(rc) + (6 << 3) <= tbits_8ths)
+        if (encode)
+            ff_opus_rc_enc_cdf(rc, f->alloc_trim, ff_celt_model_alloc_trim);
+        else
+            f->alloc_trim = ff_opus_rc_dec_cdf(rc, ff_celt_model_alloc_trim);
+
+    /* Anti-collapse bit reservation */
+    tbits_8ths = (f->framebits << 3) - opus_rc_tell_frac(rc) - 1;
+    f->anticollapse_needed = 0;
+    if (f->transient && f->size >= 2 && tbits_8ths >= ((f->size + 2) << 3))
+        f->anticollapse_needed = 1 << 3;
+    tbits_8ths -= f->anticollapse_needed;
+
+    /* Band skip bit reservation */
+    if (tbits_8ths >= 1 << 3)
+        skip_bit = 1 << 3;
+    tbits_8ths -= skip_bit;
+
+    /* Intensity/dual stereo bit reservation */
+    if (f->channels == 2) {
+        intensitystereo_bit = ff_celt_log2_frac[f->end_band - f->start_band];
+        if (intensitystereo_bit <= tbits_8ths) {
+            tbits_8ths -= intensitystereo_bit;
+            if (tbits_8ths >= 1 << 3) {
+                dualstereo_bit = 1 << 3;
+                tbits_8ths -= 1 << 3;
+            }
+        } else {
+            intensitystereo_bit = 0;
+        }
+    }
+
+    /* Trim offsets */
+    for (i = f->start_band; i < f->end_band; i++) {
+        int trim     = f->alloc_trim - 5 - f->size;
+        int band     = ff_celt_freq_range[i] * (f->end_band - i - 1);
+        int duration = f->size + 3;
+        int scale    = duration + f->channels - 1;
+
+        /* PVQ minimum allocation threshold, below this value the band is
+         * skipped */
+        threshold[i] = FFMAX(3 * ff_celt_freq_range[i] << duration >> 4,
+                             f->channels << 3);
+
+        trim_offset[i] = trim * (band << scale) >> 6;
+
+        if (ff_celt_freq_range[i] << f->size == 1)
+            trim_offset[i] -= f->channels << 3;
+    }
+
+    /* Bisection */
+    low  = 1;
+    high = CELT_VECTORS - 1;
+    while (low <= high) {
+        int center = (low + high) >> 1;
+        done = total = 0;
+
+        for (i = f->end_band - 1; i >= f->start_band; i--) {
+            bandbits = NORMC(ff_celt_freq_range[i] * ff_celt_static_alloc[center][i]);
+
+            if (bandbits)
+                bandbits = FFMAX(bandbits + trim_offset[i], 0);
+            bandbits += boost[i];
+
+            if (bandbits >= threshold[i] || done) {
+                done = 1;
+                total += FFMIN(bandbits, f->caps[i]);
+            } else if (bandbits >= f->channels << 3) {
+                total += f->channels << 3;
+            }
+        }
+
+        if (total > tbits_8ths)
+            high = center - 1;
+        else
+            low = center + 1;
+    }
+    high = low--;
+
+    /* Bisection */
+    for (i = f->start_band; i < f->end_band; i++) {
+        bits1[i] = NORMC(ff_celt_freq_range[i] * ff_celt_static_alloc[low][i]);
+        bits2[i] = high >= CELT_VECTORS ? f->caps[i] :
+                   NORMC(ff_celt_freq_range[i] * ff_celt_static_alloc[high][i]);
+
+        if (bits1[i])
+            bits1[i] = FFMAX(bits1[i] + trim_offset[i], 0);
+        if (bits2[i])
+            bits2[i] = FFMAX(bits2[i] + trim_offset[i], 0);
+
+        if (low)
+            bits1[i] += boost[i];
+        bits2[i] += boost[i];
+
+        if (boost[i])
+            skip_startband = i;
+        bits2[i] = FFMAX(bits2[i] - bits1[i], 0);
+    }
+
+    /* Bisection */
+    low  = 0;
+    high = 1 << CELT_ALLOC_STEPS;
+    for (i = 0; i < CELT_ALLOC_STEPS; i++) {
+        int center = (low + high) >> 1;
+        done = total = 0;
+
+        for (j = f->end_band - 1; j >= f->start_band; j--) {
+            bandbits = bits1[j] + (center * bits2[j] >> CELT_ALLOC_STEPS);
+
+            if (bandbits >= threshold[j] || done) {
+                done = 1;
+                total += FFMIN(bandbits, f->caps[j]);
+            } else if (bandbits >= f->channels << 3)
+                total += f->channels << 3;
+        }
+        if (total > tbits_8ths)
+            high = center;
+        else
+            low = center;
+    }
+
+    /* Bisection */
+    done = total = 0;
+    for (i = f->end_band - 1; i >= f->start_band; i--) {
+        bandbits = bits1[i] + (low * bits2[i] >> CELT_ALLOC_STEPS);
+
+        if (bandbits >= threshold[i] || done)
+            done = 1;
+        else
+            bandbits = (bandbits >= f->channels << 3) ?
+            f->channels << 3 : 0;
+
+        bandbits     = FFMIN(bandbits, f->caps[i]);
+        f->pulses[i] = bandbits;
+        total      += bandbits;
+    }
+
+    /* Band skipping */
+    for (f->coded_bands = f->end_band; ; f->coded_bands--) {
+        int allocation;
+        j = f->coded_bands - 1;
+
+        if (j == skip_startband) {
+            /* all remaining bands are not skipped */
+            tbits_8ths += skip_bit;
+            break;
+        }
+
+        /* determine the number of bits available for coding "do not skip" markers */
+        remaining   = tbits_8ths - total;
+        bandbits    = remaining / (ff_celt_freq_bands[j+1] - ff_celt_freq_bands[f->start_band]);
+        remaining  -= bandbits  * (ff_celt_freq_bands[j+1] - ff_celt_freq_bands[f->start_band]);
+        allocation  = f->pulses[j] + bandbits * ff_celt_freq_range[j];
+        allocation += FFMAX(remaining - (ff_celt_freq_bands[j] - ff_celt_freq_bands[f->start_band]), 0);
+
+        /* a "do not skip" marker is only coded if the allocation is
+         * above the chosen threshold */
+        if (allocation >= FFMAX(threshold[j], (f->channels + 1) << 3)) {
+            int do_not_skip;
+            if (encode) {
+                do_not_skip = f->coded_bands <= f->skip_band_floor;
+                ff_opus_rc_enc_log(rc, do_not_skip, 1);
+            } else {
+                do_not_skip = ff_opus_rc_dec_log(rc, 1);
+            }
+
+            if (do_not_skip)
+                break;
+
+            total      += 1 << 3;
+            allocation -= 1 << 3;
+        }
+
+        /* the band is skipped, so reclaim its bits */
+        total -= f->pulses[j];
+        if (intensitystereo_bit) {
+            total -= intensitystereo_bit;
+            intensitystereo_bit = ff_celt_log2_frac[j - f->start_band];
+            total += intensitystereo_bit;
+        }
+
+        total += f->pulses[j] = (allocation >= f->channels << 3) ? f->channels << 3 : 0;
+    }
+
+    /* IS start band */
+    if (encode) {
+        if (intensitystereo_bit) {
+            f->intensity_stereo = FFMIN(f->intensity_stereo, f->coded_bands);
+            ff_opus_rc_enc_uint(rc, f->intensity_stereo, f->coded_bands + 1 - f->start_band);
+        }
+    } else {
+        f->intensity_stereo = f->dual_stereo = 0;
+        if (intensitystereo_bit)
+            f->intensity_stereo = f->start_band + ff_opus_rc_dec_uint(rc, f->coded_bands + 1 - f->start_band);
+    }
+
+    /* DS flag */
+    if (f->intensity_stereo <= f->start_band)
+        tbits_8ths += dualstereo_bit; /* no intensity stereo means no dual stereo */
+    else if (dualstereo_bit)
+        if (encode)
+            ff_opus_rc_enc_log(rc, f->dual_stereo, 1);
+        else
+            f->dual_stereo = ff_opus_rc_dec_log(rc, 1);
+
+    /* Supply the remaining bits in this frame to lower bands */
+    remaining = tbits_8ths - total;
+    bandbits  = remaining / (ff_celt_freq_bands[f->coded_bands] - ff_celt_freq_bands[f->start_band]);
+    remaining -= bandbits * (ff_celt_freq_bands[f->coded_bands] - ff_celt_freq_bands[f->start_band]);
+    for (i = f->start_band; i < f->coded_bands; i++) {
+        const int bits = FFMIN(remaining, ff_celt_freq_range[i]);
+        f->pulses[i] += bits + bandbits * ff_celt_freq_range[i];
+        remaining    -= bits;
+    }
+
+    /* Finally determine the allocation */
+    for (i = f->start_band; i < f->coded_bands; i++) {
+        int N = ff_celt_freq_range[i] << f->size;
+        int prev_extra = extrabits;
+        f->pulses[i] += extrabits;
+
+        if (N > 1) {
+            int dof;        /* degrees of freedom */
+            int temp;       /* dof * channels * log(dof) */
+            int fine_bits;
+            int max_bits;
+            int offset;     /* fine energy quantization offset, i.e.
+                             * extra bits assigned over the standard
+                             * totalbits/dof */
+
+            extrabits = FFMAX(f->pulses[i] - f->caps[i], 0);
+            f->pulses[i] -= extrabits;
+
+            /* intensity stereo makes use of an extra degree of freedom */
+            dof = N * f->channels + (f->channels == 2 && N > 2 && !f->dual_stereo && i < f->intensity_stereo);
+            temp = dof * (ff_celt_log_freq_range[i] + (f->size << 3));
+            offset = (temp >> 1) - dof * CELT_FINE_OFFSET;
+            if (N == 2) /* dof=2 is the only case that doesn't fit the model */
+                offset += dof << 1;
+
+            /* grant an additional bias for the first and second pulses */
+            if (f->pulses[i] + offset < 2 * (dof << 3))
+                offset += temp >> 2;
+            else if (f->pulses[i] + offset < 3 * (dof << 3))
+                offset += temp >> 3;
+
+            fine_bits = (f->pulses[i] + offset + (dof << 2)) / (dof << 3);
+            max_bits  = FFMIN((f->pulses[i] >> 3) >> (f->channels - 1), CELT_MAX_FINE_BITS);
+            max_bits  = FFMAX(max_bits, 0);
+            f->fine_bits[i] = av_clip(fine_bits, 0, max_bits);
+
+            /* If fine_bits was rounded down or capped,
+             * give priority for the final fine energy pass */
+            f->fine_priority[i] = (f->fine_bits[i] * (dof << 3) >= f->pulses[i] + offset);
+
+            /* the remaining bits are assigned to PVQ */
+            f->pulses[i] -= f->fine_bits[i] << (f->channels - 1) << 3;
+        } else {
+            /* all bits go to fine energy except for the sign bit */
+            extrabits = FFMAX(f->pulses[i] - (f->channels << 3), 0);
+            f->pulses[i] -= extrabits;
+            f->fine_bits[i] = 0;
+            f->fine_priority[i] = 1;
+        }
+
+        /* hand back a limited number of extra fine energy bits to this band */
+        if (extrabits > 0) {
+            int fineextra = FFMIN(extrabits >> (f->channels + 2),
+                                  CELT_MAX_FINE_BITS - f->fine_bits[i]);
+            f->fine_bits[i] += fineextra;
+
+            fineextra <<= f->channels + 2;
+            f->fine_priority[i] = (fineextra >= extrabits - prev_extra);
+            extrabits -= fineextra;
+        }
+    }
+    f->remaining = extrabits;
+
+    /* skipped bands dedicate all of their bits for fine energy */
+    for (; i < f->end_band; i++) {
+        f->fine_bits[i]     = f->pulses[i] >> (f->channels - 1) >> 3;
+        f->pulses[i]        = 0;
+        f->fine_priority[i] = f->fine_bits[i] < 1;
+    }
+}
diff --git a/libavcodec/opus.h b/libavcodec/opus.h
index fbf67c9..edbaab5 100644
--- a/libavcodec/opus.h
+++ b/libavcodec/opus.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,10 +29,10 @@
 #include "libavutil/float_dsp.h"
 #include "libavutil/frame.h"
 
-#include "libavresample/avresample.h"
+#include "libswresample/swresample.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "opus_rc.h"
 
 #define MAX_FRAME_SIZE               1275
 #define MAX_FRAMES                   48
@@ -43,23 +43,12 @@
 #define CELT_MAX_LOG_BLOCKS          3
 #define CELT_MAX_FRAME_SIZE          (CELT_SHORT_BLOCKSIZE * (1 << CELT_MAX_LOG_BLOCKS))
 #define CELT_MAX_BANDS               21
-#define CELT_VECTORS                 11
-#define CELT_ALLOC_STEPS             6
-#define CELT_FINE_OFFSET             21
-#define CELT_MAX_FINE_BITS           8
-#define CELT_NORM_SCALE              16384
-#define CELT_QTHETA_OFFSET           4
-#define CELT_QTHETA_OFFSET_TWOPHASE  16
-#define CELT_DEEMPH_COEFF            0.85000610f
-#define CELT_POSTFILTER_MINPERIOD    15
-#define CELT_ENERGY_SILENCE          (-28.0f)
 
 #define SILK_HISTORY                 322
 #define SILK_MAX_LPC                 16
 
-#define ROUND_MULL(a,b,s) (((MUL64(a, b) >> (s - 1)) + 1) >> 1)
+#define ROUND_MULL(a,b,s) (((MUL64(a, b) >> ((s) - 1)) + 1) >> 1)
 #define ROUND_MUL16(a,b)  ((MUL16(a, b) + 16384) >> 15)
-#define opus_ilog(i) (av_log2(i) + !!(i))
 
 #define OPUS_TS_HEADER     0x7FE0        // 0x3ff (11 bits)
 #define OPUS_TS_MASK       0xFFE0        // top 11 bits
@@ -73,7 +62,9 @@ static const uint8_t opus_default_extradata[30] = {
 enum OpusMode {
     OPUS_MODE_SILK,
     OPUS_MODE_HYBRID,
-    OPUS_MODE_CELT
+    OPUS_MODE_CELT,
+
+    OPUS_MODE_NB
 };
 
 enum OpusBandwidth {
@@ -81,42 +72,29 @@ enum OpusBandwidth {
     OPUS_BANDWIDTH_MEDIUMBAND,
     OPUS_BANDWIDTH_WIDEBAND,
     OPUS_BANDWIDTH_SUPERWIDEBAND,
-    OPUS_BANDWIDTH_FULLBAND
-};
+    OPUS_BANDWIDTH_FULLBAND,
 
-typedef struct RawBitsContext {
-    const uint8_t *position;
-    unsigned int bytes;
-    unsigned int cachelen;
-    unsigned int cacheval;
-} RawBitsContext;
-
-typedef struct OpusRangeCoder {
-    BitstreamContext bc;
-    RawBitsContext rb;
-    unsigned int range;
-    unsigned int value;
-    unsigned int total_read_bits;
-} OpusRangeCoder;
+    OPUS_BANDWITH_NB
+};
 
 typedef struct SilkContext SilkContext;
 
-typedef struct CeltContext CeltContext;
+typedef struct CeltFrame CeltFrame;
 
 typedef struct OpusPacket {
-    int packet_size;                /** packet size */
-    int data_size;                  /** size of the useful data -- packet size - padding */
-    int code;                       /** packet code: specifies the frame layout */
-    int stereo;                     /** whether this packet is mono or stereo */
-    int vbr;                        /** vbr flag */
-    int config;                     /** configuration: tells the audio mode,
+    int packet_size;                /**< packet size */
+    int data_size;                  /**< size of the useful data -- packet size - padding */
+    int code;                       /**< packet code: specifies the frame layout */
+    int stereo;                     /**< whether this packet is mono or stereo */
+    int vbr;                        /**< vbr flag */
+    int config;                     /**< configuration: tells the audio mode,
                                      **                bandwidth, and frame duration */
-    int frame_count;                /** frame count */
-    int frame_offset[MAX_FRAMES];   /** frame offsets */
-    int frame_size[MAX_FRAMES];     /** frame sizes */
-    int frame_duration;             /** frame duration, in samples @ 48kHz */
-    enum OpusMode mode;             /** mode */
-    enum OpusBandwidth bandwidth;   /** bandwidth */
+    int frame_count;                /**< frame count */
+    int frame_offset[MAX_FRAMES];   /**< frame offsets */
+    int frame_size[MAX_FRAMES];     /**< frame sizes */
+    int frame_duration;             /**< frame duration, in samples @ 48kHz */
+    enum OpusMode mode;             /**< mode */
+    enum OpusBandwidth bandwidth;   /**< bandwidth */
 } OpusPacket;
 
 typedef struct OpusStreamContext {
@@ -126,7 +104,7 @@ typedef struct OpusStreamContext {
     OpusRangeCoder rc;
     OpusRangeCoder redundancy_rc;
     SilkContext *silk;
-    CeltContext *celt;
+    CeltFrame *celt;
     AVFloatDSPContext *fdsp;
 
     float silk_buf[2][960];
@@ -144,7 +122,7 @@ typedef struct OpusStreamContext {
     float *out_dummy;
     int    out_dummy_allocated_size;
 
-    AVAudioResampleContext *avr;
+    SwrContext *swr;
     AVAudioFifo *celt_delay;
     int silk_samplerate;
     /* number of samples we still want to get from the resampler */
@@ -172,7 +150,9 @@ typedef struct ChannelMap {
 } ChannelMap;
 
 typedef struct OpusContext {
+    AVClass *av_class;
     OpusStreamContext *streams;
+    int apply_phase_inv;
 
     /* current output buffers for each streams */
     float **out;
@@ -186,217 +166,13 @@ typedef struct OpusContext {
     int             nb_streams;
     int      nb_stereo_streams;
 
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     int16_t gain_i;
     float   gain;
 
     ChannelMap *channel_maps;
 } OpusContext;
 
-static av_always_inline void opus_rc_normalize(OpusRangeCoder *rc)
-{
-    while (rc->range <= 1<<23) {
-        rc->value = ((rc->value << 8) | (bitstream_read(&rc->bc, 8) ^ 0xFF)) & ((1u << 31) - 1);
-        rc->range          <<= 8;
-        rc->total_read_bits += 8;
-    }
-}
-
-static av_always_inline void opus_rc_update(OpusRangeCoder *rc, unsigned int scale,
-                                          unsigned int low, unsigned int high,
-                                          unsigned int total)
-{
-    rc->value -= scale * (total - high);
-    rc->range  = low ? scale * (high - low)
-                      : rc->range - scale * (total - high);
-    opus_rc_normalize(rc);
-}
-
-static av_always_inline unsigned int opus_rc_getsymbol(OpusRangeCoder *rc, const uint16_t *cdf)
-{
-    unsigned int k, scale, total, symbol, low, high;
-
-    total = *cdf++;
-
-    scale   = rc->range / total;
-    symbol = rc->value / scale + 1;
-    symbol = total - FFMIN(symbol, total);
-
-    for (k = 0; cdf[k] <= symbol; k++);
-    high = cdf[k];
-    low  = k ? cdf[k-1] : 0;
-
-    opus_rc_update(rc, scale, low, high, total);
-
-    return k;
-}
-
-static av_always_inline unsigned int opus_rc_p2model(OpusRangeCoder *rc, unsigned int bits)
-{
-    unsigned int k, scale;
-    scale = rc->range >> bits; // in this case, scale = symbol
-
-    if (rc->value >= scale) {
-        rc->value -= scale;
-        rc->range -= scale;
-        k = 0;
-    } else {
-        rc->range = scale;
-        k = 1;
-    }
-    opus_rc_normalize(rc);
-    return k;
-}
-
-/**
- * CELT: estimate bits of entropy that have thus far been consumed for the
- *       current CELT frame, to integer and fractional (1/8th bit) precision
- */
-static av_always_inline unsigned int opus_rc_tell(const OpusRangeCoder *rc)
-{
-    return rc->total_read_bits - av_log2(rc->range) - 1;
-}
-
-static av_always_inline unsigned int opus_rc_tell_frac(const OpusRangeCoder *rc)
-{
-    unsigned int i, total_bits, rcbuffer, range;
-
-    total_bits = rc->total_read_bits << 3;
-    rcbuffer   = av_log2(rc->range) + 1;
-    range      = rc->range >> (rcbuffer-16);
-
-    for (i = 0; i < 3; i++) {
-        int bit;
-        range = range * range >> 15;
-        bit = range >> 16;
-        rcbuffer = rcbuffer << 1 | bit;
-        range >>= bit;
-    }
-
-    return total_bits - rcbuffer;
-}
-
-/**
- * CELT: read 1-25 raw bits at the end of the frame, backwards byte-wise
- */
-static av_always_inline unsigned int opus_getrawbits(OpusRangeCoder *rc, unsigned int count)
-{
-    unsigned int value = 0;
-
-    while (rc->rb.bytes && rc->rb.cachelen < count) {
-        rc->rb.cacheval |= *--rc->rb.position << rc->rb.cachelen;
-        rc->rb.cachelen += 8;
-        rc->rb.bytes--;
-    }
-
-    value = rc->rb.cacheval & ((1<<count)-1);
-    rc->rb.cacheval    >>= count;
-    rc->rb.cachelen     -= count;
-    rc->total_read_bits += count;
-
-    return value;
-}
-
-/**
- * CELT: read a uniform distribution
- */
-static av_always_inline unsigned int opus_rc_unimodel(OpusRangeCoder *rc, unsigned int size)
-{
-    unsigned int bits, k, scale, total;
-
-    bits  = opus_ilog(size - 1);
-    total = (bits > 8) ? ((size - 1) >> (bits - 8)) + 1 : size;
-
-    scale  = rc->range / total;
-    k      = rc->value / scale + 1;
-    k      = total - FFMIN(k, total);
-    opus_rc_update(rc, scale, k, k + 1, total);
-
-    if (bits > 8) {
-        k = k << (bits - 8) | opus_getrawbits(rc, bits - 8);
-        return FFMIN(k, size - 1);
-    } else
-        return k;
-}
-
-static av_always_inline int opus_rc_laplace(OpusRangeCoder *rc, unsigned int symbol, int decay)
-{
-    /* extends the range coder to model a Laplace distribution */
-    int value = 0;
-    unsigned int scale, low = 0, center;
-
-    scale  = rc->range >> 15;
-    center = rc->value / scale + 1;
-    center = (1 << 15) - FFMIN(center, 1 << 15);
-
-    if (center >= symbol) {
-        value++;
-        low = symbol;
-        symbol = 1 + ((32768 - 32 - symbol) * (16384-decay) >> 15);
-
-        while (symbol > 1 && center >= low + 2 * symbol) {
-            value++;
-            symbol *= 2;
-            low    += symbol;
-            symbol  = (((symbol - 2) * decay) >> 15) + 1;
-        }
-
-        if (symbol <= 1) {
-            int distance = (center - low) >> 1;
-            value += distance;
-            low   += 2 * distance;
-        }
-
-        if (center < low + symbol)
-            value *= -1;
-        else
-            low += symbol;
-    }
-
-    opus_rc_update(rc, scale, low, FFMIN(low + symbol, 32768), 32768);
-
-    return value;
-}
-
-static av_always_inline unsigned int opus_rc_stepmodel(OpusRangeCoder *rc, int k0)
-{
-    /* Use a probability of 3 up to itheta=8192 and then use 1 after */
-    unsigned int k, scale, symbol, total = (k0+1)*3 + k0;
-    scale  = rc->range / total;
-    symbol = rc->value / scale + 1;
-    symbol = total - FFMIN(symbol, total);
-
-    k = (symbol < (k0+1)*3) ? symbol/3 : symbol - (k0+1)*2;
-
-    opus_rc_update(rc, scale, (k <= k0) ? 3*(k+0) : (k-1-k0) + 3*(k0+1),
-                   (k <= k0) ? 3*(k+1) : (k-0-k0) + 3*(k0+1), total);
-    return k;
-}
-
-static av_always_inline unsigned int opus_rc_trimodel(OpusRangeCoder *rc, int qn)
-{
-    unsigned int k, scale, symbol, total, low, center;
-
-    total = ((qn>>1) + 1) * ((qn>>1) + 1);
-    scale   = rc->range / total;
-    center = rc->value / scale + 1;
-    center = total - FFMIN(center, total);
-
-    if (center < total >> 1) {
-        k      = (ff_sqrt(8 * center + 1) - 1) >> 1;
-        low    = k * (k + 1) >> 1;
-        symbol = k + 1;
-    } else {
-        k      = (2*(qn + 1) - ff_sqrt(8*(total - center - 1) + 1)) >> 1;
-        low    = total - ((qn + 1 - k) * (qn + 2 - k) >> 1);
-        symbol = qn + 1 - k;
-    }
-
-    opus_rc_update(rc, scale, low, low + symbol, total);
-
-    return k;
-}
-
 int ff_opus_parse_packet(OpusPacket *pkt, const uint8_t *buf, int buf_size,
                          int self_delimited);
 
@@ -415,16 +191,10 @@ int ff_silk_decode_superframe(SilkContext *s, OpusRangeCoder *rc,
                               enum OpusBandwidth bandwidth, int coded_channels,
                               int duration_ms);
 
-int ff_celt_init(AVCodecContext *avctx, CeltContext **s, int output_channels);
-
-void ff_celt_free(CeltContext **s);
-
-void ff_celt_flush(CeltContext *s);
-
-int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
-                         float **output, int coded_channels, int frame_size,
-                         int startband,  int endband);
+/* Encode or decode CELT bands */
+void ff_celt_quant_bands(CeltFrame *f, OpusRangeCoder *rc);
 
-extern const float ff_celt_window2[120];
+/* Encode or decode CELT bitallocation */
+void ff_celt_bitalloc(CeltFrame *f, OpusRangeCoder *rc, int encode);
 
 #endif /* AVCODEC_OPUS_H */
diff --git a/libavcodec/opus_celt.c b/libavcodec/opus_celt.c
index 07a4f77..115dd8c 100644
--- a/libavcodec/opus_celt.c
+++ b/libavcodec/opus_celt.c
@@ -1,21 +1,22 @@
 /*
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,1670 +25,142 @@
  * Opus CELT decoder
  */
 
-#include <stdint.h>
+#include "opus_celt.h"
+#include "opustab.h"
+#include "opus_pvq.h"
 
-#include "libavutil/float_dsp.h"
-
-#include "imdct15.h"
-#include "opus.h"
-
-enum CeltSpread {
-    CELT_SPREAD_NONE,
-    CELT_SPREAD_LIGHT,
-    CELT_SPREAD_NORMAL,
-    CELT_SPREAD_AGGRESSIVE
-};
-
-typedef struct CeltFrame {
-    float energy[CELT_MAX_BANDS];
-    float prev_energy[2][CELT_MAX_BANDS];
-
-    uint8_t collapse_masks[CELT_MAX_BANDS];
-
-    /* buffer for mdct output + postfilter */
-    DECLARE_ALIGNED(32, float, buf)[2048];
-
-    /* postfilter parameters */
-    int pf_period_new;
-    float pf_gains_new[3];
-    int pf_period;
-    float pf_gains[3];
-    int pf_period_old;
-    float pf_gains_old[3];
-
-    float deemph_coeff;
-} CeltFrame;
-
-struct CeltContext {
-    // constant values that do not change during context lifetime
-    AVCodecContext    *avctx;
-    IMDCT15Context    *imdct[4];
-    AVFloatDSPContext  dsp;
-    int output_channels;
-
-    // values that have inter-frame effect and must be reset on flush
-    CeltFrame frame[2];
-    uint32_t seed;
-    int flushed;
-
-    // values that only affect a single frame
-    int coded_channels;
-    int framebits;
-    int duration;
-
-    /* number of iMDCT blocks in the frame */
-    int blocks;
-    /* size of each block */
-    int blocksize;
-
-    int startband;
-    int endband;
-    int codedbands;
-
-    int anticollapse_bit;
-
-    int intensitystereo;
-    int dualstereo;
-    enum CeltSpread spread;
-
-    int remaining;
-    int remaining2;
-    int fine_bits    [CELT_MAX_BANDS];
-    int fine_priority[CELT_MAX_BANDS];
-    int pulses       [CELT_MAX_BANDS];
-    int tf_change    [CELT_MAX_BANDS];
-
-    DECLARE_ALIGNED(32, float, coeffs)[2][CELT_MAX_FRAME_SIZE];
-    DECLARE_ALIGNED(32, float, scratch)[22 * 8]; // MAX(celt_freq_range) * 1<<CELT_MAX_LOG_BLOCKS
-};
-
-static const uint16_t celt_model_tapset[] = { 4, 2, 3, 4 };
-
-static const uint16_t celt_model_spread[] = { 32, 7, 9, 30, 32 };
-
-static const uint16_t celt_model_alloc_trim[] = {
-    128,   2,   4,   9,  19,  41,  87, 109, 119, 124, 126, 128
-};
-
-static const uint16_t celt_model_energy_small[] = { 4, 2, 3, 4 };
-
-static const uint8_t celt_freq_bands[] = { /* in steps of 200Hz */
-    0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
-};
-
-static const uint8_t celt_freq_range[] = {
-    1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  4,  4,  4,  6,  6,  8, 12, 18, 22
-};
-
-static const uint8_t celt_log_freq_range[] = {
-    0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8,  8, 16, 16, 16, 21, 21, 24, 29, 34, 36
-};
-
-static const int8_t celt_tf_select[4][2][2][2] = {
-    { { { 0, -1 }, { 0, -1 } }, { { 0, -1 }, { 0, -1 } } },
-    { { { 0, -1 }, { 0, -2 } }, { { 1,  0 }, { 1, -1 } } },
-    { { { 0, -2 }, { 0, -3 } }, { { 2,  0 }, { 1, -1 } } },
-    { { { 0, -2 }, { 0, -3 } }, { { 3,  0 }, { 1, -1 } } }
-};
-
-static const float celt_mean_energy[] = {
-    6.437500f, 6.250000f, 5.750000f, 5.312500f, 5.062500f,
-    4.812500f, 4.500000f, 4.375000f, 4.875000f, 4.687500f,
-    4.562500f, 4.437500f, 4.875000f, 4.625000f, 4.312500f,
-    4.500000f, 4.375000f, 4.625000f, 4.750000f, 4.437500f,
-    3.750000f, 3.750000f, 3.750000f, 3.750000f, 3.750000f
-};
-
-static const float celt_alpha_coef[] = {
-    29440.0f/32768.0f,    26112.0f/32768.0f,    21248.0f/32768.0f,    16384.0f/32768.0f
-};
-
-static const float celt_beta_coef[] = { /* TODO: precompute 1 minus this if the code ends up neater */
-    30147.0f/32768.0f,    22282.0f/32768.0f,    12124.0f/32768.0f,     6554.0f/32768.0f
-};
-
-static const uint8_t celt_coarse_energy_dist[4][2][42] = {
-    {
-        {       // 120-sample inter
-             72, 127,  65, 129,  66, 128,  65, 128,  64, 128,  62, 128,  64, 128,
-             64, 128,  92,  78,  92,  79,  92,  78,  90,  79, 116,  41, 115,  40,
-            114,  40, 132,  26, 132,  26, 145,  17, 161,  12, 176,  10, 177,  11
-        }, {    // 120-sample intra
-             24, 179,  48, 138,  54, 135,  54, 132,  53, 134,  56, 133,  55, 132,
-             55, 132,  61, 114,  70,  96,  74,  88,  75,  88,  87,  74,  89,  66,
-             91,  67, 100,  59, 108,  50, 120,  40, 122,  37,  97,  43,  78,  50
-        }
-    }, {
-        {       // 240-sample inter
-             83,  78,  84,  81,  88,  75,  86,  74,  87,  71,  90,  73,  93,  74,
-             93,  74, 109,  40, 114,  36, 117,  34, 117,  34, 143,  17, 145,  18,
-            146,  19, 162,  12, 165,  10, 178,   7, 189,   6, 190,   8, 177,   9
-        }, {    // 240-sample intra
-             23, 178,  54, 115,  63, 102,  66,  98,  69,  99,  74,  89,  71,  91,
-             73,  91,  78,  89,  86,  80,  92,  66,  93,  64, 102,  59, 103,  60,
-            104,  60, 117,  52, 123,  44, 138,  35, 133,  31,  97,  38,  77,  45
-        }
-    }, {
-        {       // 480-sample inter
-             61,  90,  93,  60, 105,  42, 107,  41, 110,  45, 116,  38, 113,  38,
-            112,  38, 124,  26, 132,  27, 136,  19, 140,  20, 155,  14, 159,  16,
-            158,  18, 170,  13, 177,  10, 187,   8, 192,   6, 175,   9, 159,  10
-        }, {    // 480-sample intra
-             21, 178,  59, 110,  71,  86,  75,  85,  84,  83,  91,  66,  88,  73,
-             87,  72,  92,  75,  98,  72, 105,  58, 107,  54, 115,  52, 114,  55,
-            112,  56, 129,  51, 132,  40, 150,  33, 140,  29,  98,  35,  77,  42
-        }
-    }, {
-        {       // 960-sample inter
-             42, 121,  96,  66, 108,  43, 111,  40, 117,  44, 123,  32, 120,  36,
-            119,  33, 127,  33, 134,  34, 139,  21, 147,  23, 152,  20, 158,  25,
-            154,  26, 166,  21, 173,  16, 184,  13, 184,  10, 150,  13, 139,  15
-        }, {    // 960-sample intra
-             22, 178,  63, 114,  74,  82,  84,  83,  92,  82, 103,  62,  96,  72,
-             96,  67, 101,  73, 107,  72, 113,  55, 118,  52, 125,  52, 118,  52,
-            117,  55, 135,  49, 137,  39, 157,  32, 145,  29,  97,  33,  77,  40
-        }
-    }
-};
-
-static const uint8_t celt_static_alloc[11][21] = {  /* 1/32 bit/sample */
-    {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 },
-    {  90,  80,  75,  69,  63,  56,  49,  40,  34,  29,  20,  18,  10,   0,   0,   0,   0,   0,   0,   0,   0 },
-    { 110, 100,  90,  84,  78,  71,  65,  58,  51,  45,  39,  32,  26,  20,  12,   0,   0,   0,   0,   0,   0 },
-    { 118, 110, 103,  93,  86,  80,  75,  70,  65,  59,  53,  47,  40,  31,  23,  15,   4,   0,   0,   0,   0 },
-    { 126, 119, 112, 104,  95,  89,  83,  78,  72,  66,  60,  54,  47,  39,  32,  25,  17,  12,   1,   0,   0 },
-    { 134, 127, 120, 114, 103,  97,  91,  85,  78,  72,  66,  60,  54,  47,  41,  35,  29,  23,  16,  10,   1 },
-    { 144, 137, 130, 124, 113, 107, 101,  95,  88,  82,  76,  70,  64,  57,  51,  45,  39,  33,  26,  15,   1 },
-    { 152, 145, 138, 132, 123, 117, 111, 105,  98,  92,  86,  80,  74,  67,  61,  55,  49,  43,  36,  20,   1 },
-    { 162, 155, 148, 142, 133, 127, 121, 115, 108, 102,  96,  90,  84,  77,  71,  65,  59,  53,  46,  30,   1 },
-    { 172, 165, 158, 152, 143, 137, 131, 125, 118, 112, 106, 100,  94,  87,  81,  75,  69,  63,  56,  45,  20 },
-    { 200, 200, 200, 200, 200, 200, 200, 200, 198, 193, 188, 183, 178, 173, 168, 163, 158, 153, 148, 129, 104 }
-};
-
-static const uint8_t celt_static_caps[4][2][21] = {
-    {       // 120-sample
-        {224, 224, 224, 224, 224, 224, 224, 224, 160, 160,
-         160, 160, 185, 185, 185, 178, 178, 168, 134,  61,  37},
-        {224, 224, 224, 224, 224, 224, 224, 224, 240, 240,
-         240, 240, 207, 207, 207, 198, 198, 183, 144,  66,  40},
-    }, {    // 240-sample
-        {160, 160, 160, 160, 160, 160, 160, 160, 185, 185,
-         185, 185, 193, 193, 193, 183, 183, 172, 138,  64,  38},
-        {240, 240, 240, 240, 240, 240, 240, 240, 207, 207,
-         207, 207, 204, 204, 204, 193, 193, 180, 143,  66,  40},
-    }, {    // 480-sample
-        {185, 185, 185, 185, 185, 185, 185, 185, 193, 193,
-         193, 193, 193, 193, 193, 183, 183, 172, 138,  65,  39},
-        {207, 207, 207, 207, 207, 207, 207, 207, 204, 204,
-         204, 204, 201, 201, 201, 188, 188, 176, 141,  66,  40},
-    }, {    // 960-sample
-        {193, 193, 193, 193, 193, 193, 193, 193, 193, 193,
-         193, 193, 194, 194, 194, 184, 184, 173, 139,  65,  39},
-        {204, 204, 204, 204, 204, 204, 204, 204, 201, 201,
-         201, 201, 198, 198, 198, 187, 187, 175, 140,  66,  40}
-    }
-};
-
-static const uint8_t celt_cache_bits[392] = {
-    40, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 40, 15, 23, 28,
-    31, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50,
-    51, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 63, 63, 65,
-    66, 67, 68, 69, 70, 71, 71, 40, 20, 33, 41, 48, 53, 57, 61,
-    64, 66, 69, 71, 73, 75, 76, 78, 80, 82, 85, 87, 89, 91, 92,
-    94, 96, 98, 101, 103, 105, 107, 108, 110, 112, 114, 117, 119, 121, 123,
-    124, 126, 128, 40, 23, 39, 51, 60, 67, 73, 79, 83, 87, 91, 94,
-    97, 100, 102, 105, 107, 111, 115, 118, 121, 124, 126, 129, 131, 135, 139,
-    142, 145, 148, 150, 153, 155, 159, 163, 166, 169, 172, 174, 177, 179, 35,
-    28, 49, 65, 78, 89, 99, 107, 114, 120, 126, 132, 136, 141, 145, 149,
-    153, 159, 165, 171, 176, 180, 185, 189, 192, 199, 205, 211, 216, 220, 225,
-    229, 232, 239, 245, 251, 21, 33, 58, 79, 97, 112, 125, 137, 148, 157,
-    166, 174, 182, 189, 195, 201, 207, 217, 227, 235, 243, 251, 17, 35, 63,
-    86, 106, 123, 139, 152, 165, 177, 187, 197, 206, 214, 222, 230, 237, 250,
-    25, 31, 55, 75, 91, 105, 117, 128, 138, 146, 154, 161, 168, 174, 180,
-    185, 190, 200, 208, 215, 222, 229, 235, 240, 245, 255, 16, 36, 65, 89,
-    110, 128, 144, 159, 173, 185, 196, 207, 217, 226, 234, 242, 250, 11, 41,
-    74, 103, 128, 151, 172, 191, 209, 225, 241, 255, 9, 43, 79, 110, 138,
-    163, 186, 207, 227, 246, 12, 39, 71, 99, 123, 144, 164, 182, 198, 214,
-    228, 241, 253, 9, 44, 81, 113, 142, 168, 192, 214, 235, 255, 7, 49,
-    90, 127, 160, 191, 220, 247, 6, 51, 95, 134, 170, 203, 234, 7, 47,
-    87, 123, 155, 184, 212, 237, 6, 52, 97, 137, 174, 208, 240, 5, 57,
-    106, 151, 192, 231, 5, 59, 111, 158, 202, 243, 5, 55, 103, 147, 187,
-    224, 5, 60, 113, 161, 206, 248, 4, 65, 122, 175, 224, 4, 67, 127,
-    182, 234
-};
-
-static const int16_t celt_cache_index[105] = {
-    -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 41, 41, 41,
-    82, 82, 123, 164, 200, 222, 0, 0, 0, 0, 0, 0, 0, 0, 41,
-    41, 41, 41, 123, 123, 123, 164, 164, 240, 266, 283, 295, 41, 41, 41,
-    41, 41, 41, 41, 41, 123, 123, 123, 123, 240, 240, 240, 266, 266, 305,
-    318, 328, 336, 123, 123, 123, 123, 123, 123, 123, 123, 240, 240, 240, 240,
-    305, 305, 305, 318, 318, 343, 351, 358, 364, 240, 240, 240, 240, 240, 240,
-    240, 240, 305, 305, 305, 305, 343, 343, 343, 351, 351, 370, 376, 382, 387,
-};
-
-static const uint8_t celt_log2_frac[] = {
-    0, 8, 13, 16, 19, 21, 23, 24, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 34, 35, 36, 36, 37, 37
-};
-
-static const uint8_t celt_bit_interleave[] = {
-    0, 1, 1, 1, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3
-};
-
-static const uint8_t celt_bit_deinterleave[] = {
-    0x00, 0x03, 0x0C, 0x0F, 0x30, 0x33, 0x3C, 0x3F,
-    0xC0, 0xC3, 0xCC, 0xCF, 0xF0, 0xF3, 0xFC, 0xFF
-};
-
-static const uint8_t celt_hadamard_ordery[] = {
-    1,   0,
-    3,   0,  2,  1,
-    7,   0,  4,  3,  6,  1,  5,  2,
-    15,  0,  8,  7, 12,  3, 11,  4, 14,  1,  9,  6, 13,  2, 10,  5
-};
-
-static const uint16_t celt_qn_exp2[] = {
-    16384, 17866, 19483, 21247, 23170, 25267, 27554, 30048
-};
-
-static const uint32_t celt_pvq_u[1272] = {
-    /* N = 0, K = 0...176 */
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* N = 1, K = 1...176 */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    /* N = 2, K = 2...176 */
-    3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
-    43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
-    81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
-    115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 141, 143,
-    145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173,
-    175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203,
-    205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233,
-    235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259, 261, 263,
-    265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293,
-    295, 297, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323,
-    325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351,
-    /* N = 3, K = 3...176 */
-    13, 25, 41, 61, 85, 113, 145, 181, 221, 265, 313, 365, 421, 481, 545, 613,
-    685, 761, 841, 925, 1013, 1105, 1201, 1301, 1405, 1513, 1625, 1741, 1861,
-    1985, 2113, 2245, 2381, 2521, 2665, 2813, 2965, 3121, 3281, 3445, 3613, 3785,
-    3961, 4141, 4325, 4513, 4705, 4901, 5101, 5305, 5513, 5725, 5941, 6161, 6385,
-    6613, 6845, 7081, 7321, 7565, 7813, 8065, 8321, 8581, 8845, 9113, 9385, 9661,
-    9941, 10225, 10513, 10805, 11101, 11401, 11705, 12013, 12325, 12641, 12961,
-    13285, 13613, 13945, 14281, 14621, 14965, 15313, 15665, 16021, 16381, 16745,
-    17113, 17485, 17861, 18241, 18625, 19013, 19405, 19801, 20201, 20605, 21013,
-    21425, 21841, 22261, 22685, 23113, 23545, 23981, 24421, 24865, 25313, 25765,
-    26221, 26681, 27145, 27613, 28085, 28561, 29041, 29525, 30013, 30505, 31001,
-    31501, 32005, 32513, 33025, 33541, 34061, 34585, 35113, 35645, 36181, 36721,
-    37265, 37813, 38365, 38921, 39481, 40045, 40613, 41185, 41761, 42341, 42925,
-    43513, 44105, 44701, 45301, 45905, 46513, 47125, 47741, 48361, 48985, 49613,
-    50245, 50881, 51521, 52165, 52813, 53465, 54121, 54781, 55445, 56113, 56785,
-    57461, 58141, 58825, 59513, 60205, 60901, 61601,
-    /* N = 4, K = 4...176 */
-    63, 129, 231, 377, 575, 833, 1159, 1561, 2047, 2625, 3303, 4089, 4991, 6017,
-    7175, 8473, 9919, 11521, 13287, 15225, 17343, 19649, 22151, 24857, 27775,
-    30913, 34279, 37881, 41727, 45825, 50183, 54809, 59711, 64897, 70375, 76153,
-    82239, 88641, 95367, 102425, 109823, 117569, 125671, 134137, 142975, 152193,
-    161799, 171801, 182207, 193025, 204263, 215929, 228031, 240577, 253575,
-    267033, 280959, 295361, 310247, 325625, 341503, 357889, 374791, 392217,
-    410175, 428673, 447719, 467321, 487487, 508225, 529543, 551449, 573951,
-    597057, 620775, 645113, 670079, 695681, 721927, 748825, 776383, 804609,
-    833511, 863097, 893375, 924353, 956039, 988441, 1021567, 1055425, 1090023,
-    1125369, 1161471, 1198337, 1235975, 1274393, 1313599, 1353601, 1394407,
-    1436025, 1478463, 1521729, 1565831, 1610777, 1656575, 1703233, 1750759,
-    1799161, 1848447, 1898625, 1949703, 2001689, 2054591, 2108417, 2163175,
-    2218873, 2275519, 2333121, 2391687, 2451225, 2511743, 2573249, 2635751,
-    2699257, 2763775, 2829313, 2895879, 2963481, 3032127, 3101825, 3172583,
-    3244409, 3317311, 3391297, 3466375, 3542553, 3619839, 3698241, 3777767,
-    3858425, 3940223, 4023169, 4107271, 4192537, 4278975, 4366593, 4455399,
-    4545401, 4636607, 4729025, 4822663, 4917529, 5013631, 5110977, 5209575,
-    5309433, 5410559, 5512961, 5616647, 5721625, 5827903, 5935489, 6044391,
-    6154617, 6266175, 6379073, 6493319, 6608921, 6725887, 6844225, 6963943,
-    7085049, 7207551,
-    /* N = 5, K = 5...176 */
-    321, 681, 1289, 2241, 3649, 5641, 8361, 11969, 16641, 22569, 29961, 39041,
-    50049, 63241, 78889, 97281, 118721, 143529, 172041, 204609, 241601, 283401,
-    330409, 383041, 441729, 506921, 579081, 658689, 746241, 842249, 947241,
-    1061761, 1186369, 1321641, 1468169, 1626561, 1797441, 1981449, 2179241,
-    2391489, 2618881, 2862121, 3121929, 3399041, 3694209, 4008201, 4341801,
-    4695809, 5071041, 5468329, 5888521, 6332481, 6801089, 7295241, 7815849,
-    8363841, 8940161, 9545769, 10181641, 10848769, 11548161, 12280841, 13047849,
-    13850241, 14689089, 15565481, 16480521, 17435329, 18431041, 19468809,
-    20549801, 21675201, 22846209, 24064041, 25329929, 26645121, 28010881,
-    29428489, 30899241, 32424449, 34005441, 35643561, 37340169, 39096641,
-    40914369, 42794761, 44739241, 46749249, 48826241, 50971689, 53187081,
-    55473921, 57833729, 60268041, 62778409, 65366401, 68033601, 70781609,
-    73612041, 76526529, 79526721, 82614281, 85790889, 89058241, 92418049,
-    95872041, 99421961, 103069569, 106816641, 110664969, 114616361, 118672641,
-    122835649, 127107241, 131489289, 135983681, 140592321, 145317129, 150160041,
-    155123009, 160208001, 165417001, 170752009, 176215041, 181808129, 187533321,
-    193392681, 199388289, 205522241, 211796649, 218213641, 224775361, 231483969,
-    238341641, 245350569, 252512961, 259831041, 267307049, 274943241, 282741889,
-    290705281, 298835721, 307135529, 315607041, 324252609, 333074601, 342075401,
-    351257409, 360623041, 370174729, 379914921, 389846081, 399970689, 410291241,
-    420810249, 431530241, 442453761, 453583369, 464921641, 476471169, 488234561,
-    500214441, 512413449, 524834241, 537479489, 550351881, 563454121, 576788929,
-    590359041, 604167209, 618216201, 632508801,
-    /* N = 6, K = 6...96 (technically V(109,5) fits in 32 bits, but that can't be
-     achieved by splitting an Opus band) */
-    1683, 3653, 7183, 13073, 22363, 36365, 56695, 85305, 124515, 177045, 246047,
-    335137, 448427, 590557, 766727, 982729, 1244979, 1560549, 1937199, 2383409,
-    2908411, 3522221, 4235671, 5060441, 6009091, 7095093, 8332863, 9737793,
-    11326283, 13115773, 15124775, 17372905, 19880915, 22670725, 25765455,
-    29189457, 32968347, 37129037, 41699767, 46710137, 52191139, 58175189,
-    64696159, 71789409, 79491819, 87841821, 96879431, 106646281, 117185651,
-    128542501, 140763503, 153897073, 167993403, 183104493, 199284183, 216588185,
-    235074115, 254801525, 275831935, 298228865, 322057867, 347386557, 374284647,
-    402823977, 433078547, 465124549, 499040399, 534906769, 572806619, 612825229,
-    655050231, 699571641, 746481891, 795875861, 847850911, 902506913, 959946283,
-    1020274013, 1083597703, 1150027593, 1219676595, 1292660325, 1369097135,
-    1449108145, 1532817275, 1620351277, 1711839767, 1807415257, 1907213187,
-    2011371957, 2120032959,
-    /* N = 7, K = 7...54 (technically V(60,6) fits in 32 bits, but that can't be
-     achieved by splitting an Opus band) */
-    8989, 19825, 40081, 75517, 134245, 227305, 369305, 579125, 880685, 1303777,
-    1884961, 2668525, 3707509, 5064793, 6814249, 9041957, 11847485, 15345233,
-    19665841, 24957661, 31388293, 39146185, 48442297, 59511829, 72616013,
-    88043969, 106114625, 127178701, 151620757, 179861305, 212358985, 249612805,
-    292164445, 340600625, 395555537, 457713341, 527810725, 606639529, 695049433,
-    793950709, 904317037, 1027188385, 1163673953, 1314955181, 1482288821,
-    1667010073, 1870535785, 2094367717,
-    /* N = 8, K = 8...37 (technically V(40,7) fits in 32 bits, but that can't be
-     achieved by splitting an Opus band) */
-    48639, 108545, 224143, 433905, 795455, 1392065, 2340495, 3800305, 5984767,
-    9173505, 13726991, 20103025, 28875327, 40754369, 56610575, 77500017,
-    104692735, 139703809, 184327311, 240673265, 311207743, 398796225, 506750351,
-    638878193, 799538175, 993696769, 1226990095, 1505789553, 1837271615,
-    2229491905,
-    /* N = 9, K = 9...28 (technically V(29,8) fits in 32 bits, but that can't be
-     achieved by splitting an Opus band) */
-    265729, 598417, 1256465, 2485825, 4673345, 8405905, 14546705, 24331777,
-    39490049, 62390545, 96220561, 145198913, 214828609, 312193553, 446304145,
-    628496897, 872893441, 1196924561, 1621925137, 2173806145,
-    /* N = 10, K = 10...24 */
-    1462563, 3317445, 7059735, 14218905, 27298155, 50250765, 89129247, 152951073,
-    254831667, 413442773, 654862247, 1014889769, 1541911931, 2300409629,
-    3375210671,
-    /* N = 11, K = 11...19 (technically V(20,10) fits in 32 bits, but that can't be
-     achieved by splitting an Opus band) */
-    8097453, 18474633, 39753273, 81270333, 158819253, 298199265, 540279585,
-    948062325, 1616336765,
-    /* N = 12, K = 12...18 */
-    45046719, 103274625, 224298231, 464387817, 921406335, 1759885185,
-    3248227095,
-    /* N = 13, K = 13...16 */
-    251595969, 579168825, 1267854873, 2653649025,
-    /* N = 14, K = 14 */
-    1409933619
-};
-
-DECLARE_ALIGNED(32, static const float, celt_window)[120] = {
-    6.7286966e-05f, 0.00060551348f, 0.0016815970f, 0.0032947962f, 0.0054439943f,
-    0.0081276923f, 0.011344001f, 0.015090633f, 0.019364886f, 0.024163635f,
-    0.029483315f, 0.035319905f, 0.041668911f, 0.048525347f, 0.055883718f,
-    0.063737999f, 0.072081616f, 0.080907428f, 0.090207705f, 0.099974111f,
-    0.11019769f, 0.12086883f, 0.13197729f, 0.14351214f, 0.15546177f,
-    0.16781389f, 0.18055550f, 0.19367290f, 0.20715171f, 0.22097682f,
-    0.23513243f, 0.24960208f, 0.26436860f, 0.27941419f, 0.29472040f,
-    0.31026818f, 0.32603788f, 0.34200931f, 0.35816177f, 0.37447407f,
-    0.39092462f, 0.40749142f, 0.42415215f, 0.44088423f, 0.45766484f,
-    0.47447104f, 0.49127978f, 0.50806798f, 0.52481261f, 0.54149077f,
-    0.55807973f, 0.57455701f, 0.59090049f, 0.60708841f, 0.62309951f,
-    0.63891306f, 0.65450896f, 0.66986776f, 0.68497077f, 0.69980010f,
-    0.71433873f, 0.72857055f, 0.74248043f, 0.75605424f, 0.76927895f,
-    0.78214257f, 0.79463430f, 0.80674445f, 0.81846456f, 0.82978733f,
-    0.84070669f, 0.85121779f, 0.86131698f, 0.87100183f, 0.88027111f,
-    0.88912479f, 0.89756398f, 0.90559094f, 0.91320904f, 0.92042270f,
-    0.92723738f, 0.93365955f, 0.93969656f, 0.94535671f, 0.95064907f,
-    0.95558353f, 0.96017067f, 0.96442171f, 0.96834849f, 0.97196334f,
-    0.97527906f, 0.97830883f, 0.98106616f, 0.98356480f, 0.98581869f,
-    0.98784191f, 0.98964856f, 0.99125274f, 0.99266849f, 0.99390969f,
-    0.99499004f, 0.99592297f, 0.99672162f, 0.99739874f, 0.99796667f,
-    0.99843728f, 0.99882195f, 0.99913147f, 0.99937606f, 0.99956527f,
-    0.99970802f, 0.99981248f, 0.99988613f, 0.99993565f, 0.99996697f,
-    0.99998518f, 0.99999457f, 0.99999859f, 0.99999982f, 1.0000000f,
-};
-
-/* square of the window, used for the postfilter */
-const float ff_celt_window2[120] = {
-    4.5275357e-09f, 3.66647e-07f, 2.82777e-06f, 1.08557e-05f, 2.96371e-05f, 6.60594e-05f,
-    0.000128686f, 0.000227727f, 0.000374999f, 0.000583881f, 0.000869266f, 0.0012475f,
-    0.0017363f, 0.00235471f, 0.00312299f, 0.00406253f, 0.00519576f, 0.00654601f,
-    0.00813743f, 0.00999482f, 0.0121435f, 0.0146093f, 0.017418f, 0.0205957f, 0.0241684f,
-    0.0281615f, 0.0326003f, 0.0375092f, 0.0429118f, 0.0488308f, 0.0552873f, 0.0623012f,
-    0.0698908f, 0.0780723f, 0.0868601f, 0.0962664f, 0.106301f, 0.11697f, 0.12828f,
-    0.140231f, 0.152822f, 0.166049f, 0.179905f, 0.194379f, 0.209457f, 0.225123f, 0.241356f,
-    0.258133f, 0.275428f, 0.293212f, 0.311453f, 0.330116f, 0.349163f, 0.368556f, 0.388253f,
-    0.40821f, 0.428382f, 0.448723f, 0.469185f, 0.48972f, 0.51028f, 0.530815f, 0.551277f,
-    0.571618f, 0.59179f, 0.611747f, 0.631444f, 0.650837f, 0.669884f, 0.688547f, 0.706788f,
-    0.724572f, 0.741867f, 0.758644f, 0.774877f, 0.790543f, 0.805621f, 0.820095f, 0.833951f,
-    0.847178f, 0.859769f, 0.87172f, 0.88303f, 0.893699f, 0.903734f, 0.91314f, 0.921928f,
-    0.930109f, 0.937699f, 0.944713f, 0.951169f, 0.957088f, 0.962491f, 0.9674f, 0.971838f,
-    0.975832f, 0.979404f, 0.982582f, 0.985391f, 0.987857f, 0.990005f, 0.991863f, 0.993454f,
-    0.994804f, 0.995937f, 0.996877f, 0.997645f, 0.998264f, 0.998753f, 0.999131f, 0.999416f,
-    0.999625f, 0.999772f, 0.999871f, 0.999934f, 0.99997f, 0.999989f, 0.999997f, 0.99999964f, 1.0f,
-};
-
-static const uint32_t * const celt_pvq_u_row[15] = {
-    celt_pvq_u +    0, celt_pvq_u +  176, celt_pvq_u +  351,
-    celt_pvq_u +  525, celt_pvq_u +  698, celt_pvq_u +  870,
-    celt_pvq_u + 1041, celt_pvq_u + 1131, celt_pvq_u + 1178,
-    celt_pvq_u + 1207, celt_pvq_u + 1226, celt_pvq_u + 1240,
-    celt_pvq_u + 1248, celt_pvq_u + 1254, celt_pvq_u + 1257
-};
-
-static inline int16_t celt_cos(int16_t x)
-{
-    x = (MUL16(x, x) + 4096) >> 13;
-    x = (32767-x) + ROUND_MUL16(x, (-7651 + ROUND_MUL16(x, (8277 + ROUND_MUL16(-626, x)))));
-    return 1+x;
-}
-
-static inline int celt_log2tan(int isin, int icos)
-{
-    int lc, ls;
-    lc = opus_ilog(icos);
-    ls = opus_ilog(isin);
-    icos <<= 15 - lc;
-    isin <<= 15 - ls;
-    return (ls << 11) - (lc << 11) +
-           ROUND_MUL16(isin, ROUND_MUL16(isin, -2597) + 7932) -
-           ROUND_MUL16(icos, ROUND_MUL16(icos, -2597) + 7932);
-}
-
-static inline uint32_t celt_rng(CeltContext *s)
-{
-    s->seed = 1664525 * s->seed + 1013904223;
-    return s->seed;
-}
-
-static void celt_decode_coarse_energy(CeltContext *s, OpusRangeCoder *rc)
+/* Use the 2D z-transform to apply prediction in both the time domain (alpha)
+ * and the frequency domain (beta) */
+static void celt_decode_coarse_energy(CeltFrame *f, OpusRangeCoder *rc)
 {
     int i, j;
-    float prev[2] = {0};
-    float alpha, beta;
-    const uint8_t *model;
-
-    /* use the 2D z-transform to apply prediction in both */
-    /* the time domain (alpha) and the frequency domain (beta) */
-
-    if (opus_rc_tell(rc)+3 <= s->framebits && opus_rc_p2model(rc, 3)) {
-        /* intra frame */
-        alpha = 0;
-        beta  = 1.0f - 4915.0f/32768.0f;
-        model = celt_coarse_energy_dist[s->duration][1];
-    } else {
-        alpha = celt_alpha_coef[s->duration];
-        beta  = 1.0f - celt_beta_coef[s->duration];
-        model = celt_coarse_energy_dist[s->duration][0];
+    float prev[2] = { 0 };
+    float alpha = ff_celt_alpha_coef[f->size];
+    float beta  = ff_celt_beta_coef[f->size];
+    const uint8_t *model = ff_celt_coarse_energy_dist[f->size][0];
+
+    /* intra frame */
+    if (opus_rc_tell(rc) + 3 <= f->framebits && ff_opus_rc_dec_log(rc, 3)) {
+        alpha = 0.0f;
+        beta  = 1.0f - (4915.0f/32768.0f);
+        model = ff_celt_coarse_energy_dist[f->size][1];
     }
 
     for (i = 0; i < CELT_MAX_BANDS; i++) {
-        for (j = 0; j < s->coded_channels; j++) {
-            CeltFrame *frame = &s->frame[j];
+        for (j = 0; j < f->channels; j++) {
+            CeltBlock *block = &f->block[j];
             float value;
             int available;
 
-            if (i < s->startband || i >= s->endband) {
-                frame->energy[i] = 0.0;
+            if (i < f->start_band || i >= f->end_band) {
+                block->energy[i] = 0.0;
                 continue;
             }
 
-            available = s->framebits - opus_rc_tell(rc);
+            available = f->framebits - opus_rc_tell(rc);
             if (available >= 15) {
                 /* decode using a Laplace distribution */
                 int k = FFMIN(i, 20) << 1;
-                value = opus_rc_laplace(rc, model[k] << 7, model[k+1] << 6);
+                value = ff_opus_rc_dec_laplace(rc, model[k] << 7, model[k+1] << 6);
             } else if (available >= 2) {
-                int x = opus_rc_getsymbol(rc, celt_model_energy_small);
+                int x = ff_opus_rc_dec_cdf(rc, ff_celt_model_energy_small);
                 value = (x>>1) ^ -(x&1);
             } else if (available >= 1) {
-                value = -(float)opus_rc_p2model(rc, 1);
+                value = -(float)ff_opus_rc_dec_log(rc, 1);
             } else value = -1;
 
-            frame->energy[i] = FFMAX(-9.0f, frame->energy[i]) * alpha + prev[j] + value;
+            block->energy[i] = FFMAX(-9.0f, block->energy[i]) * alpha + prev[j] + value;
             prev[j] += beta * value;
         }
     }
 }
 
-static void celt_decode_fine_energy(CeltContext *s, OpusRangeCoder *rc)
+static void celt_decode_fine_energy(CeltFrame *f, OpusRangeCoder *rc)
 {
     int i;
-    for (i = s->startband; i < s->endband; i++) {
+    for (i = f->start_band; i < f->end_band; i++) {
         int j;
-        if (!s->fine_bits[i])
+        if (!f->fine_bits[i])
             continue;
 
-        for (j = 0; j < s->coded_channels; j++) {
-            CeltFrame *frame = &s->frame[j];
+        for (j = 0; j < f->channels; j++) {
+            CeltBlock *block = &f->block[j];
             int q2;
             float offset;
-            q2 = opus_getrawbits(rc, s->fine_bits[i]);
-            offset = (q2 + 0.5f) * (1 << (14 - s->fine_bits[i])) / 16384.0f - 0.5f;
-            frame->energy[i] += offset;
+            q2 = ff_opus_rc_get_raw(rc, f->fine_bits[i]);
+            offset = (q2 + 0.5f) * (1 << (14 - f->fine_bits[i])) / 16384.0f - 0.5f;
+            block->energy[i] += offset;
         }
     }
 }
 
-static void celt_decode_final_energy(CeltContext *s, OpusRangeCoder *rc,
-                                     int bits_left)
+static void celt_decode_final_energy(CeltFrame *f, OpusRangeCoder *rc)
 {
     int priority, i, j;
+    int bits_left = f->framebits - opus_rc_tell(rc);
 
     for (priority = 0; priority < 2; priority++) {
-        for (i = s->startband; i < s->endband && bits_left >= s->coded_channels; i++) {
-            if (s->fine_priority[i] != priority || s->fine_bits[i] >= CELT_MAX_FINE_BITS)
+        for (i = f->start_band; i < f->end_band && bits_left >= f->channels; i++) {
+            if (f->fine_priority[i] != priority || f->fine_bits[i] >= CELT_MAX_FINE_BITS)
                 continue;
 
-            for (j = 0; j < s->coded_channels; j++) {
+            for (j = 0; j < f->channels; j++) {
                 int q2;
                 float offset;
-                q2 = opus_getrawbits(rc, 1);
-                offset = (q2 - 0.5f) * (1 << (14 - s->fine_bits[i] - 1)) / 16384.0f;
-                s->frame[j].energy[i] += offset;
+                q2 = ff_opus_rc_get_raw(rc, 1);
+                offset = (q2 - 0.5f) * (1 << (14 - f->fine_bits[i] - 1)) / 16384.0f;
+                f->block[j].energy[i] += offset;
                 bits_left--;
             }
         }
     }
 }
 
-static void celt_decode_tf_changes(CeltContext *s, OpusRangeCoder *rc,
-                                   int transient)
+static void celt_decode_tf_changes(CeltFrame *f, OpusRangeCoder *rc)
 {
     int i, diff = 0, tf_select = 0, tf_changed = 0, tf_select_bit;
-    int consumed, bits = transient ? 2 : 4;
+    int consumed, bits = f->transient ? 2 : 4;
 
     consumed = opus_rc_tell(rc);
-    tf_select_bit = (s->duration != 0 && consumed+bits+1 <= s->framebits);
+    tf_select_bit = (f->size != 0 && consumed+bits+1 <= f->framebits);
 
-    for (i = s->startband; i < s->endband; i++) {
-        if (consumed+bits+tf_select_bit <= s->framebits) {
-            diff ^= opus_rc_p2model(rc, bits);
+    for (i = f->start_band; i < f->end_band; i++) {
+        if (consumed+bits+tf_select_bit <= f->framebits) {
+            diff ^= ff_opus_rc_dec_log(rc, bits);
             consumed = opus_rc_tell(rc);
             tf_changed |= diff;
         }
-        s->tf_change[i] = diff;
-        bits = transient ? 4 : 5;
-    }
-
-    if (tf_select_bit && celt_tf_select[s->duration][transient][0][tf_changed] !=
-                         celt_tf_select[s->duration][transient][1][tf_changed])
-        tf_select = opus_rc_p2model(rc, 1);
-
-    for (i = s->startband; i < s->endband; i++) {
-        s->tf_change[i] = celt_tf_select[s->duration][transient][tf_select][s->tf_change[i]];
-    }
-}
-
-static void celt_decode_allocation(CeltContext *s, OpusRangeCoder *rc)
-{
-    // approx. maximum bit allocation for each band before boost/trim
-    int cap[CELT_MAX_BANDS];
-    int boost[CELT_MAX_BANDS];
-    int threshold[CELT_MAX_BANDS];
-    int bits1[CELT_MAX_BANDS];
-    int bits2[CELT_MAX_BANDS];
-    int trim_offset[CELT_MAX_BANDS];
-
-    int skip_startband = s->startband;
-    int dynalloc       = 6;
-    int alloctrim      = 5;
-    int extrabits      = 0;
-
-    int skip_bit            = 0;
-    int intensitystereo_bit = 0;
-    int dualstereo_bit      = 0;
-
-    int remaining, bandbits;
-    int low, high, total, done;
-    int totalbits;
-    int consumed;
-    int i, j;
-
-    consumed = opus_rc_tell(rc);
-
-    /* obtain spread flag */
-    s->spread = CELT_SPREAD_NORMAL;
-    if (consumed + 4 <= s->framebits)
-        s->spread = opus_rc_getsymbol(rc, celt_model_spread);
-
-    /* generate static allocation caps */
-    for (i = 0; i < CELT_MAX_BANDS; i++) {
-        cap[i] = (celt_static_caps[s->duration][s->coded_channels - 1][i] + 64)
-                 * celt_freq_range[i] << (s->coded_channels - 1) << s->duration >> 2;
-    }
-
-    /* obtain band boost */
-    totalbits = s->framebits << 3; // convert to 1/8 bits
-    consumed = opus_rc_tell_frac(rc);
-    for (i = s->startband; i < s->endband; i++) {
-        int quanta, band_dynalloc;
-
-        boost[i] = 0;
-
-        quanta = celt_freq_range[i] << (s->coded_channels - 1) << s->duration;
-        quanta = FFMIN(quanta << 3, FFMAX(6 << 3, quanta));
-        band_dynalloc = dynalloc;
-        while (consumed + (band_dynalloc<<3) < totalbits && boost[i] < cap[i]) {
-            int add = opus_rc_p2model(rc, band_dynalloc);
-            consumed = opus_rc_tell_frac(rc);
-            if (!add)
-                break;
-
-            boost[i]     += quanta;
-            totalbits    -= quanta;
-            band_dynalloc = 1;
-        }
-        /* dynalloc is more likely to occur if it's already been used for earlier bands */
-        if (boost[i])
-            dynalloc = FFMAX(2, dynalloc - 1);
-    }
-
-    /* obtain allocation trim */
-    if (consumed + (6 << 3) <= totalbits)
-        alloctrim = opus_rc_getsymbol(rc, celt_model_alloc_trim);
-
-    /* anti-collapse bit reservation */
-    totalbits = (s->framebits << 3) - opus_rc_tell_frac(rc) - 1;
-    s->anticollapse_bit = 0;
-    if (s->blocks > 1 && s->duration >= 2 &&
-        totalbits >= ((s->duration + 2) << 3))
-        s->anticollapse_bit = 1 << 3;
-    totalbits -= s->anticollapse_bit;
-
-    /* band skip bit reservation */
-    if (totalbits >= 1 << 3)
-        skip_bit = 1 << 3;
-    totalbits -= skip_bit;
-
-    /* intensity/dual stereo bit reservation */
-    if (s->coded_channels == 2) {
-        intensitystereo_bit = celt_log2_frac[s->endband - s->startband];
-        if (intensitystereo_bit <= totalbits) {
-            totalbits -= intensitystereo_bit;
-            if (totalbits >= 1 << 3) {
-                dualstereo_bit = 1 << 3;
-                totalbits -= 1 << 3;
-            }
-        } else
-            intensitystereo_bit = 0;
-    }
-
-    for (i = s->startband; i < s->endband; i++) {
-        int trim     = alloctrim - 5 - s->duration;
-        int band     = celt_freq_range[i] * (s->endband - i - 1);
-        int duration = s->duration + 3;
-        int scale    = duration + s->coded_channels - 1;
-
-        /* PVQ minimum allocation threshold, below this value the band is
-         * skipped */
-        threshold[i] = FFMAX(3 * celt_freq_range[i] << duration >> 4,
-                             s->coded_channels << 3);
-
-        trim_offset[i] = trim * (band << scale) >> 6;
-
-        if (celt_freq_range[i] << s->duration == 1)
-            trim_offset[i] -= s->coded_channels << 3;
-    }
-
-    /* bisection */
-    low  = 1;
-    high = CELT_VECTORS - 1;
-    while (low <= high) {
-        int center = (low + high) >> 1;
-        done = total = 0;
-
-        for (i = s->endband - 1; i >= s->startband; i--) {
-            bandbits = celt_freq_range[i] * celt_static_alloc[center][i]
-                       << (s->coded_channels - 1) << s->duration >> 2;
-
-            if (bandbits)
-                bandbits = FFMAX(0, bandbits + trim_offset[i]);
-            bandbits += boost[i];
-
-            if (bandbits >= threshold[i] || done) {
-                done = 1;
-                total += FFMIN(bandbits, cap[i]);
-            } else if (bandbits >= s->coded_channels << 3)
-                total += s->coded_channels << 3;
-        }
-
-        if (total > totalbits)
-            high = center - 1;
-        else
-            low = center + 1;
-    }
-    high = low--;
-
-    for (i = s->startband; i < s->endband; i++) {
-        bits1[i] = celt_freq_range[i] * celt_static_alloc[low][i]
-                   << (s->coded_channels - 1) << s->duration >> 2;
-        bits2[i] = high >= CELT_VECTORS ? cap[i] :
-                   celt_freq_range[i] * celt_static_alloc[high][i]
-                   << (s->coded_channels - 1) << s->duration >> 2;
-
-        if (bits1[i])
-            bits1[i] = FFMAX(0, bits1[i] + trim_offset[i]);
-        if (bits2[i])
-            bits2[i] = FFMAX(0, bits2[i] + trim_offset[i]);
-        if (low)
-            bits1[i] += boost[i];
-        bits2[i] += boost[i];
-
-        if (boost[i])
-            skip_startband = i;
-        bits2[i] = FFMAX(0, bits2[i] - bits1[i]);
-    }
-
-    /* bisection */
-    low  = 0;
-    high = 1 << CELT_ALLOC_STEPS;
-    for (i = 0; i < CELT_ALLOC_STEPS; i++) {
-        int center = (low + high) >> 1;
-        done = total = 0;
-
-        for (j = s->endband - 1; j >= s->startband; j--) {
-            bandbits = bits1[j] + (center * bits2[j] >> CELT_ALLOC_STEPS);
-
-            if (bandbits >= threshold[j] || done) {
-                done = 1;
-                total += FFMIN(bandbits, cap[j]);
-            } else if (bandbits >= s->coded_channels << 3)
-                total += s->coded_channels << 3;
-        }
-        if (total > totalbits)
-            high = center;
-        else
-            low = center;
-    }
-
-    done = total = 0;
-    for (i = s->endband - 1; i >= s->startband; i--) {
-        bandbits = bits1[i] + (low * bits2[i] >> CELT_ALLOC_STEPS);
-
-        if (bandbits >= threshold[i] || done)
-            done = 1;
-        else
-            bandbits = (bandbits >= s->coded_channels << 3) ?
-                       s->coded_channels << 3 : 0;
-
-        bandbits     = FFMIN(bandbits, cap[i]);
-        s->pulses[i] = bandbits;
-        total      += bandbits;
-    }
-
-    /* band skipping */
-    for (s->codedbands = s->endband; ; s->codedbands--) {
-        int allocation;
-        j = s->codedbands - 1;
-
-        if (j == skip_startband) {
-            /* all remaining bands are not skipped */
-            totalbits += skip_bit;
-            break;
-        }
-
-        /* determine the number of bits available for coding "do not skip" markers */
-        remaining   = totalbits - total;
-        bandbits    = remaining / (celt_freq_bands[j+1] - celt_freq_bands[s->startband]);
-        remaining  -= bandbits  * (celt_freq_bands[j+1] - celt_freq_bands[s->startband]);
-        allocation  = s->pulses[j] + bandbits * celt_freq_range[j]
-                      + FFMAX(0, remaining - (celt_freq_bands[j] - celt_freq_bands[s->startband]));
-
-        /* a "do not skip" marker is only coded if the allocation is
-           above the chosen threshold */
-        if (allocation >= FFMAX(threshold[j], (s->coded_channels + 1) <<3 )) {
-            if (opus_rc_p2model(rc, 1))
-                break;
-
-            total      += 1 << 3;
-            allocation -= 1 << 3;
-        }
-
-        /* the band is skipped, so reclaim its bits */
-        total -= s->pulses[j];
-        if (intensitystereo_bit) {
-            total -= intensitystereo_bit;
-            intensitystereo_bit = celt_log2_frac[j - s->startband];
-            total += intensitystereo_bit;
-        }
-
-        total += s->pulses[j] = (allocation >= s->coded_channels << 3) ?
-                              s->coded_channels << 3 : 0;
-    }
-
-    /* obtain stereo flags */
-    s->intensitystereo = 0;
-    s->dualstereo      = 0;
-    if (intensitystereo_bit)
-        s->intensitystereo = s->startband +
-                          opus_rc_unimodel(rc, s->codedbands + 1 - s->startband);
-    if (s->intensitystereo <= s->startband)
-        totalbits += dualstereo_bit; /* no intensity stereo means no dual stereo */
-    else if (dualstereo_bit)
-        s->dualstereo = opus_rc_p2model(rc, 1);
-
-    /* supply the remaining bits in this frame to lower bands */
-    remaining = totalbits - total;
-    bandbits  = remaining / (celt_freq_bands[s->codedbands] - celt_freq_bands[s->startband]);
-    remaining -= bandbits * (celt_freq_bands[s->codedbands] - celt_freq_bands[s->startband]);
-    for (i = s->startband; i < s->codedbands; i++) {
-        int bits = FFMIN(remaining, celt_freq_range[i]);
-
-        s->pulses[i] += bits + bandbits * celt_freq_range[i];
-        remaining    -= bits;
-    }
-
-    for (i = s->startband; i < s->codedbands; i++) {
-        int N = celt_freq_range[i] << s->duration;
-        int prev_extra = extrabits;
-        s->pulses[i] += extrabits;
-
-        if (N > 1) {
-            int dof;        // degrees of freedom
-            int temp;       // dof * channels * log(dof)
-            int offset;     // fine energy quantization offset, i.e.
-                            // extra bits assigned over the standard
-                            // totalbits/dof
-            int fine_bits, max_bits;
-
-            extrabits = FFMAX(0, s->pulses[i] - cap[i]);
-            s->pulses[i] -= extrabits;
-
-            /* intensity stereo makes use of an extra degree of freedom */
-            dof = N * s->coded_channels
-                  + (s->coded_channels == 2 && N > 2 && !s->dualstereo && i < s->intensitystereo);
-            temp = dof * (celt_log_freq_range[i] + (s->duration<<3));
-            offset = (temp >> 1) - dof * CELT_FINE_OFFSET;
-            if (N == 2) /* dof=2 is the only case that doesn't fit the model */
-                offset += dof<<1;
-
-            /* grant an additional bias for the first and second pulses */
-            if (s->pulses[i] + offset < 2 * (dof << 3))
-                offset += temp >> 2;
-            else if (s->pulses[i] + offset < 3 * (dof << 3))
-                offset += temp >> 3;
-
-            fine_bits = (s->pulses[i] + offset + (dof << 2)) / (dof << 3);
-            max_bits  = FFMIN((s->pulses[i]>>3) >> (s->coded_channels - 1),
-                              CELT_MAX_FINE_BITS);
-
-            max_bits  = FFMAX(max_bits, 0);
-
-            s->fine_bits[i] = av_clip(fine_bits, 0, max_bits);
-
-            /* if fine_bits was rounded down or capped,
-               give priority for the final fine energy pass */
-            s->fine_priority[i] = (s->fine_bits[i] * (dof<<3) >= s->pulses[i] + offset);
-
-            /* the remaining bits are assigned to PVQ */
-            s->pulses[i] -= s->fine_bits[i] << (s->coded_channels - 1) << 3;
-        } else {
-            /* all bits go to fine energy except for the sign bit */
-            extrabits = FFMAX(0, s->pulses[i] - (s->coded_channels << 3));
-            s->pulses[i] -= extrabits;
-            s->fine_bits[i] = 0;
-            s->fine_priority[i] = 1;
-        }
-
-        /* hand back a limited number of extra fine energy bits to this band */
-        if (extrabits > 0) {
-            int fineextra = FFMIN(extrabits >> (s->coded_channels + 2),
-                                  CELT_MAX_FINE_BITS - s->fine_bits[i]);
-            s->fine_bits[i] += fineextra;
-
-            fineextra <<= s->coded_channels + 2;
-            s->fine_priority[i] = (fineextra >= extrabits - prev_extra);
-            extrabits -= fineextra;
-        }
-    }
-    s->remaining = extrabits;
-
-    /* skipped bands dedicate all of their bits for fine energy */
-    for (; i < s->endband; i++) {
-        s->fine_bits[i]     = s->pulses[i] >> (s->coded_channels - 1) >> 3;
-        s->pulses[i]        = 0;
-        s->fine_priority[i] = s->fine_bits[i] < 1;
-    }
-}
-
-static inline int celt_bits2pulses(const uint8_t *cache, int bits)
-{
-    // TODO: Find the size of cache and make it into an array in the parameters list
-    int i, low = 0, high;
-
-    high = cache[0];
-    bits--;
-
-    for (i = 0; i < 6; i++) {
-        int center = (low + high + 1) >> 1;
-        if (cache[center] >= bits)
-            high = center;
-        else
-            low = center;
-    }
-
-    return (bits - (low == 0 ? -1 : cache[low]) <= cache[high] - bits) ? low : high;
-}
-
-static inline int celt_pulses2bits(const uint8_t *cache, int pulses)
-{
-    // TODO: Find the size of cache and make it into an array in the parameters list
-   return (pulses == 0) ? 0 : cache[pulses] + 1;
-}
-
-static inline void celt_normalize_residual(const int * restrict iy, float * restrict X,
-                                           int N, float g)
-{
-    int i;
-    for (i = 0; i < N; i++)
-        X[i] = g * iy[i];
-}
-
-static void celt_exp_rotation1(float *X, unsigned int len, unsigned int stride,
-                               float c, float s)
-{
-    float *Xptr;
-    int i;
-
-    Xptr = X;
-    for (i = 0; i < len - stride; i++) {
-        float x1, x2;
-        x1           = Xptr[0];
-        x2           = Xptr[stride];
-        Xptr[stride] = c * x2 + s * x1;
-        *Xptr++      = c * x1 - s * x2;
-    }
-
-    Xptr = &X[len - 2 * stride - 1];
-    for (i = len - 2 * stride - 1; i >= 0; i--) {
-        float x1, x2;
-        x1           = Xptr[0];
-        x2           = Xptr[stride];
-        Xptr[stride] = c * x2 + s * x1;
-        *Xptr--      = c * x1 - s * x2;
-    }
-}
-
-static inline void celt_exp_rotation(float *X, unsigned int len,
-                                     unsigned int stride, unsigned int K,
-                                     enum CeltSpread spread)
-{
-    unsigned int stride2 = 0;
-    float c, s;
-    float gain, theta;
-    int i;
-
-    if (2*K >= len || spread == CELT_SPREAD_NONE)
-        return;
-
-    gain = (float)len / (len + (20 - 5*spread) * K);
-    theta = M_PI * gain * gain / 4;
-
-    c = cos(theta);
-    s = sin(theta);
-
-    if (len >= stride << 3) {
-        stride2 = 1;
-        /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding.
-        It's basically incrementing long as (stride2+0.5)^2 < len/stride. */
-        while ((stride2 * stride2 + stride2) * stride + (stride >> 2) < len)
-            stride2++;
-    }
-
-    /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
-    extract_collapse_mask().*/
-    len /= stride;
-    for (i = 0; i < stride; i++) {
-        if (stride2)
-            celt_exp_rotation1(X + i * len, len, stride2, s, c);
-        celt_exp_rotation1(X + i * len, len, 1, c, s);
-    }
-}
-
-static inline unsigned int celt_extract_collapse_mask(const int *iy,
-                                                      unsigned int N,
-                                                      unsigned int B)
-{
-    unsigned int collapse_mask;
-    int N0;
-    int i, j;
-
-    if (B <= 1)
-        return 1;
-
-    /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
-    exp_rotation().*/
-    N0 = N/B;
-    collapse_mask = 0;
-    for (i = 0; i < B; i++)
-        for (j = 0; j < N0; j++)
-            collapse_mask |= (iy[i*N0+j]!=0)<<i;
-    return collapse_mask;
-}
-
-static inline void celt_renormalize_vector(float *X, int N, float gain)
-{
-    int i;
-    float g = 1e-15f;
-    for (i = 0; i < N; i++)
-        g += X[i] * X[i];
-    g = gain / sqrtf(g);
-
-    for (i = 0; i < N; i++)
-        X[i] *= g;
-}
-
-static inline void celt_stereo_merge(float *X, float *Y, float mid, int N)
-{
-    int i;
-    float xp = 0, side = 0;
-    float E[2];
-    float mid2;
-    float t, gain[2];
-
-    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-    for (i = 0; i < N; i++) {
-        xp   += X[i] * Y[i];
-        side += Y[i] * Y[i];
-    }
-
-    /* Compensating for the mid normalization */
-    xp *= mid;
-    mid2 = mid;
-    E[0] = mid2 * mid2 + side - 2 * xp;
-    E[1] = mid2 * mid2 + side + 2 * xp;
-    if (E[0] < 6e-4f || E[1] < 6e-4f) {
-        for (i = 0; i < N; i++)
-            Y[i] = X[i];
-        return;
-    }
-
-    t = E[0];
-    gain[0] = 1.0f / sqrtf(t);
-    t = E[1];
-    gain[1] = 1.0f / sqrtf(t);
-
-    for (i = 0; i < N; i++) {
-        float value[2];
-        /* Apply mid scaling (side is already scaled) */
-        value[0] = mid * X[i];
-        value[1] = Y[i];
-        X[i] = gain[0] * (value[0] - value[1]);
-        Y[i] = gain[1] * (value[0] + value[1]);
-    }
-}
-
-static void celt_interleave_hadamard(float *tmp, float *X, int N0,
-                                     int stride, int hadamard)
-{
-    int i, j;
-    int N = N0*stride;
-
-    if (hadamard) {
-        const uint8_t *ordery = celt_hadamard_ordery + stride - 2;
-        for (i = 0; i < stride; i++)
-            for (j = 0; j < N0; j++)
-                tmp[j*stride+i] = X[ordery[i]*N0+j];
-    } else {
-        for (i = 0; i < stride; i++)
-            for (j = 0; j < N0; j++)
-                tmp[j*stride+i] = X[i*N0+j];
-    }
-
-    for (i = 0; i < N; i++)
-        X[i] = tmp[i];
-}
-
-static void celt_deinterleave_hadamard(float *tmp, float *X, int N0,
-                                       int stride, int hadamard)
-{
-    int i, j;
-    int N = N0*stride;
-
-    if (hadamard) {
-        const uint8_t *ordery = celt_hadamard_ordery + stride - 2;
-        for (i = 0; i < stride; i++)
-            for (j = 0; j < N0; j++)
-                tmp[ordery[i]*N0+j] = X[j*stride+i];
-    } else {
-        for (i = 0; i < stride; i++)
-            for (j = 0; j < N0; j++)
-                tmp[i*N0+j] = X[j*stride+i];
-    }
-
-    for (i = 0; i < N; i++)
-        X[i] = tmp[i];
-}
-
-static void celt_haar1(float *X, int N0, int stride)
-{
-    int i, j;
-    N0 >>= 1;
-    for (i = 0; i < stride; i++) {
-        for (j = 0; j < N0; j++) {
-            float x0 = X[stride * (2 * j + 0) + i];
-            float x1 = X[stride * (2 * j + 1) + i];
-            X[stride * (2 * j + 0) + i] = (x0 + x1) * M_SQRT1_2;
-            X[stride * (2 * j + 1) + i] = (x0 - x1) * M_SQRT1_2;
-        }
+        f->tf_change[i] = diff;
+        bits = f->transient ? 4 : 5;
     }
-}
 
-static inline int celt_compute_qn(int N, int b, int offset, int pulse_cap,
-                                  int dualstereo)
-{
-    int qn, qb;
-    int N2 = 2 * N - 1;
-    if (dualstereo && N == 2)
-        N2--;
-
-    /* The upper limit ensures that in a stereo split with itheta==16384, we'll
-     * always have enough bits left over to code at least one pulse in the
-     * side; otherwise it would collapse, since it doesn't get folded. */
-    qb = FFMIN3(b - pulse_cap - (4 << 3), (b + N2 * offset) / N2, 8 << 3);
-    qn = (qb < (1 << 3 >> 1)) ? 1 : ((celt_qn_exp2[qb & 0x7] >> (14 - (qb >> 3))) + 1) >> 1 << 1;
-    return qn;
-}
+    if (tf_select_bit && ff_celt_tf_select[f->size][f->transient][0][tf_changed] !=
+                         ff_celt_tf_select[f->size][f->transient][1][tf_changed])
+        tf_select = ff_opus_rc_dec_log(rc, 1);
 
-// this code was adapted from libopus
-static inline uint64_t celt_cwrsi(unsigned int N, unsigned int K, unsigned int i, int *y)
-{
-    uint64_t norm = 0;
-    uint32_t p;
-    int s, val;
-    int k0;
-
-    while (N > 2) {
-        uint32_t q;
-
-        /*Lots of pulses case:*/
-        if (K >= N) {
-            const uint32_t *row = celt_pvq_u_row[N];
-
-            /* Are the pulses in this dimension negative? */
-            p  = row[K + 1];
-            s  = -(i >= p);
-            i -= p & s;
-
-            /*Count how many pulses were placed in this dimension.*/
-            k0 = K;
-            q = row[N];
-            if (q > i) {
-                K = N;
-                do {
-                    p = celt_pvq_u_row[--K][N];
-                } while (p > i);
-            } else
-                for (p = row[K]; p > i; p = row[K])
-                    K--;
-
-            i    -= p;
-            val   = (k0 - K + s) ^ s;
-            norm += val * val;
-            *y++  = val;
-        } else { /*Lots of dimensions case:*/
-            /*Are there any pulses in this dimension at all?*/
-            p = celt_pvq_u_row[K    ][N];
-            q = celt_pvq_u_row[K + 1][N];
-
-            if (p <= i && i < q) {
-                i -= p;
-                *y++ = 0;
-            } else {
-                /*Are the pulses in this dimension negative?*/
-                s  = -(i >= q);
-                i -= q & s;
-
-                /*Count how many pulses were placed in this dimension.*/
-                k0 = K;
-                do p = celt_pvq_u_row[--K][N];
-                while (p > i);
-
-                i    -= p;
-                val   = (k0 - K + s) ^ s;
-                norm += val * val;
-                *y++  = val;
-            }
-        }
-        N--;
+    for (i = f->start_band; i < f->end_band; i++) {
+        f->tf_change[i] = ff_celt_tf_select[f->size][f->transient][tf_select][f->tf_change[i]];
     }
-
-    /* N == 2 */
-    p  = 2 * K + 1;
-    s  = -(i >= p);
-    i -= p & s;
-    k0 = K;
-    K  = (i + 1) / 2;
-
-    if (K)
-        i -= 2 * K - 1;
-
-    val   = (k0 - K + s) ^ s;
-    norm += val * val;
-    *y++  = val;
-
-    /* N==1 */
-    s     = -i;
-    val   = (K + s) ^ s;
-    norm += val * val;
-    *y    = val;
-
-    return norm;
 }
 
-static inline float celt_decode_pulses(OpusRangeCoder *rc, int *y, unsigned int N, unsigned int K)
-{
-    unsigned int idx;
-#define CELT_PVQ_U(n, k) (celt_pvq_u_row[FFMIN(n, k)][FFMAX(n, k)])
-#define CELT_PVQ_V(n, k) (CELT_PVQ_U(n, k) + CELT_PVQ_U(n, k + 1))
-    idx = opus_rc_unimodel(rc, CELT_PVQ_V(N, K));
-    return celt_cwrsi(N, K, idx, y);
-}
-
-/** Decode pulse vector and combine the result with the pitch vector to produce
-    the final normalised signal in the current band. */
-static inline unsigned int celt_alg_unquant(OpusRangeCoder *rc, float *X,
-                                            unsigned int N, unsigned int K,
-                                            enum CeltSpread spread,
-                                            unsigned int blocks, float gain)
-{
-    int y[176];
-
-    gain /= sqrtf(celt_decode_pulses(rc, y, N, K));
-    celt_normalize_residual(y, X, N, gain);
-    celt_exp_rotation(X, N, blocks, K, spread);
-    return celt_extract_collapse_mask(y, N, blocks);
-}
-
-static unsigned int celt_decode_band(CeltContext *s, OpusRangeCoder *rc,
-                                     const int band, float *X, float *Y,
-                                     int N, int b, unsigned int blocks,
-                                     float *lowband, int duration,
-                                     float *lowband_out, int level,
-                                     float gain, float *lowband_scratch,
-                                     int fill)
-{
-    const uint8_t *cache;
-    int dualstereo, split;
-    int imid = 0, iside = 0;
-    unsigned int N0 = N;
-    int N_B;
-    int N_B0;
-    int B0 = blocks;
-    int time_divide = 0;
-    int recombine = 0;
-    int inv = 0;
-    float mid = 0, side = 0;
-    int longblocks = (B0 == 1);
-    unsigned int cm = 0;
-
-    N_B0 = N_B = N / blocks;
-    split = dualstereo = (Y != NULL);
-
-    if (N == 1) {
-        /* special case for one sample */
-        int i;
-        float *x = X;
-        for (i = 0; i <= dualstereo; i++) {
-            int sign = 0;
-            if (s->remaining2 >= 1<<3) {
-                sign           = opus_getrawbits(rc, 1);
-                s->remaining2 -= 1 << 3;
-                b             -= 1 << 3;
-            }
-            x[0] = sign ? -1.0f : 1.0f;
-            x = Y;
-        }
-        if (lowband_out)
-            lowband_out[0] = X[0];
-        return 1;
-    }
-
-    if (!dualstereo && level == 0) {
-        int tf_change = s->tf_change[band];
-        int k;
-        if (tf_change > 0)
-            recombine = tf_change;
-        /* Band recombining to increase frequency resolution */
-
-        if (lowband &&
-            (recombine || ((N_B & 1) == 0 && tf_change < 0) || B0 > 1)) {
-            int j;
-            for (j = 0; j < N; j++)
-                lowband_scratch[j] = lowband[j];
-            lowband = lowband_scratch;
-        }
-
-        for (k = 0; k < recombine; k++) {
-            if (lowband)
-                celt_haar1(lowband, N >> k, 1 << k);
-            fill = celt_bit_interleave[fill & 0xF] | celt_bit_interleave[fill >> 4] << 2;
-        }
-        blocks >>= recombine;
-        N_B <<= recombine;
-
-        /* Increasing the time resolution */
-        while ((N_B & 1) == 0 && tf_change < 0) {
-            if (lowband)
-                celt_haar1(lowband, N_B, blocks);
-            fill |= fill << blocks;
-            blocks <<= 1;
-            N_B >>= 1;
-            time_divide++;
-            tf_change++;
-        }
-        B0 = blocks;
-        N_B0 = N_B;
-
-        /* Reorganize the samples in time order instead of frequency order */
-        if (B0 > 1 && lowband)
-            celt_deinterleave_hadamard(s->scratch, lowband, N_B >> recombine,
-                                       B0 << recombine, longblocks);
-    }
-
-    /* If we need 1.5 more bit than we can produce, split the band in two. */
-    cache = celt_cache_bits +
-            celt_cache_index[(duration + 1) * CELT_MAX_BANDS + band];
-    if (!dualstereo && duration >= 0 && b > cache[cache[0]] + 12 && N > 2) {
-        N >>= 1;
-        Y = X + N;
-        split = 1;
-        duration -= 1;
-        if (blocks == 1)
-            fill = (fill & 1) | (fill << 1);
-        blocks = (blocks + 1) >> 1;
-    }
-
-    if (split) {
-        int qn;
-        int itheta = 0;
-        int mbits, sbits, delta;
-        int qalloc;
-        int pulse_cap;
-        int offset;
-        int orig_fill;
-        int tell;
-
-        /* Decide on the resolution to give to the split parameter theta */
-        pulse_cap = celt_log_freq_range[band] + duration * 8;
-        offset = (pulse_cap >> 1) - (dualstereo && N == 2 ? CELT_QTHETA_OFFSET_TWOPHASE :
-                                                          CELT_QTHETA_OFFSET);
-        qn = (dualstereo && band >= s->intensitystereo) ? 1 :
-             celt_compute_qn(N, b, offset, pulse_cap, dualstereo);
-        tell = opus_rc_tell_frac(rc);
-        if (qn != 1) {
-            /* Entropy coding of the angle. We use a uniform pdf for the
-            time split, a step for stereo, and a triangular one for the rest. */
-            if (dualstereo && N > 2)
-                itheta = opus_rc_stepmodel(rc, qn/2);
-            else if (dualstereo || B0 > 1)
-                itheta = opus_rc_unimodel(rc, qn+1);
-            else
-                itheta = opus_rc_trimodel(rc, qn);
-            itheta = itheta * 16384 / qn;
-            /* NOTE: Renormalising X and Y *may* help fixed-point a bit at very high rate.
-            Let's do that at higher complexity */
-        } else if (dualstereo) {
-            inv = (b > 2 << 3 && s->remaining2 > 2 << 3) ? opus_rc_p2model(rc, 2) : 0;
-            itheta = 0;
-        }
-        qalloc = opus_rc_tell_frac(rc) - tell;
-        b -= qalloc;
-
-        orig_fill = fill;
-        if (itheta == 0) {
-            imid = 32767;
-            iside = 0;
-            fill &= (1 << blocks) - 1;
-            delta = -16384;
-        } else if (itheta == 16384) {
-            imid = 0;
-            iside = 32767;
-            fill &= ((1 << blocks) - 1) << blocks;
-            delta = 16384;
-        } else {
-            imid = celt_cos(itheta);
-            iside = celt_cos(16384-itheta);
-            /* This is the mid vs side allocation that minimizes squared error
-            in that band. */
-            delta = ROUND_MUL16((N - 1) << 7, celt_log2tan(iside, imid));
-        }
-
-        mid  = imid  / 32768.0f;
-        side = iside / 32768.0f;
-
-        /* This is a special case for N=2 that only works for stereo and takes
-        advantage of the fact that mid and side are orthogonal to encode
-        the side with just one bit. */
-        if (N == 2 && dualstereo) {
-            int c;
-            int sign = 0;
-            float tmp;
-            float *x2, *y2;
-            mbits = b;
-            /* Only need one bit for the side */
-            sbits = (itheta != 0 && itheta != 16384) ? 1 << 3 : 0;
-            mbits -= sbits;
-            c = (itheta > 8192);
-            s->remaining2 -= qalloc+sbits;
-
-            x2 = c ? Y : X;
-            y2 = c ? X : Y;
-            if (sbits)
-                sign = opus_getrawbits(rc, 1);
-            sign = 1 - 2 * sign;
-            /* We use orig_fill here because we want to fold the side, but if
-            itheta==16384, we'll have cleared the low bits of fill. */
-            cm = celt_decode_band(s, rc, band, x2, NULL, N, mbits, blocks,
-                                  lowband, duration, lowband_out, level, gain,
-                                  lowband_scratch, orig_fill);
-            /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
-            and there's no need to worry about mixing with the other channel. */
-            y2[0] = -sign * x2[1];
-            y2[1] =  sign * x2[0];
-            X[0] *= mid;
-            X[1] *= mid;
-            Y[0] *= side;
-            Y[1] *= side;
-            tmp = X[0];
-            X[0] = tmp - Y[0];
-            Y[0] = tmp + Y[0];
-            tmp = X[1];
-            X[1] = tmp - Y[1];
-            Y[1] = tmp + Y[1];
-        } else {
-            /* "Normal" split code */
-            float *next_lowband2     = NULL;
-            float *next_lowband_out1 = NULL;
-            int next_level = 0;
-            int rebalance;
-
-            /* Give more bits to low-energy MDCTs than they would
-             * otherwise deserve */
-            if (B0 > 1 && !dualstereo && (itheta & 0x3fff)) {
-                if (itheta > 8192)
-                    /* Rough approximation for pre-echo masking */
-                    delta -= delta >> (4 - duration);
-                else
-                    /* Corresponds to a forward-masking slope of
-                     * 1.5 dB per 10 ms */
-                    delta = FFMIN(0, delta + (N << 3 >> (5 - duration)));
-            }
-            mbits = av_clip((b - delta) / 2, 0, b);
-            sbits = b - mbits;
-            s->remaining2 -= qalloc;
-
-            if (lowband && !dualstereo)
-                next_lowband2 = lowband + N; /* >32-bit split case */
-
-            /* Only stereo needs to pass on lowband_out.
-             * Otherwise, it's handled at the end */
-            if (dualstereo)
-                next_lowband_out1 = lowband_out;
-            else
-                next_level = level + 1;
-
-            rebalance = s->remaining2;
-            if (mbits >= sbits) {
-                /* In stereo mode, we do not apply a scaling to the mid
-                 * because we need the normalized mid for folding later */
-                cm = celt_decode_band(s, rc, band, X, NULL, N, mbits, blocks,
-                                      lowband, duration, next_lowband_out1,
-                                      next_level, dualstereo ? 1.0f : (gain * mid),
-                                      lowband_scratch, fill);
-
-                rebalance = mbits - (rebalance - s->remaining2);
-                if (rebalance > 3 << 3 && itheta != 0)
-                    sbits += rebalance - (3 << 3);
-
-                /* For a stereo split, the high bits of fill are always zero,
-                 * so no folding will be done to the side. */
-                cm |= celt_decode_band(s, rc, band, Y, NULL, N, sbits, blocks,
-                                       next_lowband2, duration, NULL,
-                                       next_level, gain * side, NULL,
-                                       fill >> blocks) << ((B0 >> 1) & (dualstereo - 1));
-            } else {
-                /* For a stereo split, the high bits of fill are always zero,
-                 * so no folding will be done to the side. */
-                cm = celt_decode_band(s, rc, band, Y, NULL, N, sbits, blocks,
-                                      next_lowband2, duration, NULL,
-                                      next_level, gain * side, NULL,
-                                      fill >> blocks) << ((B0 >> 1) & (dualstereo - 1));
-
-                rebalance = sbits - (rebalance - s->remaining2);
-                if (rebalance > 3 << 3 && itheta != 16384)
-                    mbits += rebalance - (3 << 3);
-
-                /* In stereo mode, we do not apply a scaling to the mid because
-                 * we need the normalized mid for folding later */
-                cm |= celt_decode_band(s, rc, band, X, NULL, N, mbits, blocks,
-                                       lowband, duration, next_lowband_out1,
-                                       next_level, dualstereo ? 1.0f : (gain * mid),
-                                       lowband_scratch, fill);
-            }
-        }
-    } else {
-        /* This is the basic no-split case */
-        unsigned int q         = celt_bits2pulses(cache, b);
-        unsigned int curr_bits = celt_pulses2bits(cache, q);
-        s->remaining2 -= curr_bits;
-
-        /* Ensures we can never bust the budget */
-        while (s->remaining2 < 0 && q > 0) {
-            s->remaining2 += curr_bits;
-            curr_bits      = celt_pulses2bits(cache, --q);
-            s->remaining2 -= curr_bits;
-        }
-
-        if (q != 0) {
-            /* Finally do the actual quantization */
-            cm = celt_alg_unquant(rc, X, N, (q < 8) ? q : (8 + (q & 7)) << ((q >> 3) - 1),
-                                  s->spread, blocks, gain);
-        } else {
-            /* If there's no pulse, fill the band anyway */
-            int j;
-            unsigned int cm_mask = (1 << blocks) - 1;
-            fill &= cm_mask;
-            if (!fill) {
-                for (j = 0; j < N; j++)
-                    X[j] = 0.0f;
-            } else {
-                if (!lowband) {
-                    /* Noise */
-                    for (j = 0; j < N; j++)
-                        X[j] = (((int32_t)celt_rng(s)) >> 20);
-                    cm = cm_mask;
-                } else {
-                    /* Folded spectrum */
-                    for (j = 0; j < N; j++) {
-                        /* About 48 dB below the "normal" folding level */
-                        X[j] = lowband[j] + (((celt_rng(s)) & 0x8000) ? 1.0f / 256 : -1.0f / 256);
-                    }
-                    cm = fill;
-                }
-                celt_renormalize_vector(X, N, gain);
-            }
-        }
-    }
-
-    /* This code is used by the decoder and by the resynthesis-enabled encoder */
-    if (dualstereo) {
-        int j;
-        if (N != 2)
-            celt_stereo_merge(X, Y, mid, N);
-        if (inv) {
-            for (j = 0; j < N; j++)
-                Y[j] *= -1;
-        }
-    } else if (level == 0) {
-        int k;
-
-        /* Undo the sample reorganization going from time order to frequency order */
-        if (B0 > 1)
-            celt_interleave_hadamard(s->scratch, X, N_B>>recombine,
-                                     B0<<recombine, longblocks);
-
-        /* Undo time-freq changes that we did earlier */
-        N_B = N_B0;
-        blocks = B0;
-        for (k = 0; k < time_divide; k++) {
-            blocks >>= 1;
-            N_B <<= 1;
-            cm |= cm >> blocks;
-            celt_haar1(X, N_B, blocks);
-        }
-
-        for (k = 0; k < recombine; k++) {
-            cm = celt_bit_deinterleave[cm];
-            celt_haar1(X, N0>>k, 1<<k);
-        }
-        blocks <<= recombine;
-
-        /* Scale output for later folding */
-        if (lowband_out) {
-            int j;
-            float n = sqrtf(N0);
-            for (j = 0; j < N0; j++)
-                lowband_out[j] = n * X[j];
-        }
-        cm &= (1 << blocks) - 1;
-    }
-    return cm;
-}
-
-static void celt_denormalize(CeltContext *s, CeltFrame *frame, float *data)
+static void celt_denormalize(CeltFrame *f, CeltBlock *block, float *data)
 {
     int i, j;
 
-    for (i = s->startband; i < s->endband; i++) {
-        float *dst = data + (celt_freq_bands[i] << s->duration);
-        float norm = pow(2, frame->energy[i] + celt_mean_energy[i]);
+    for (i = f->start_band; i < f->end_band; i++) {
+        float *dst = data + (ff_celt_freq_bands[i] << f->size);
+        float log_norm = block->energy[i] + ff_celt_mean_energy[i];
+        float norm = exp2f(FFMIN(log_norm, 32.0f));
 
-        for (j = 0; j < celt_freq_range[i] << s->duration; j++)
+        for (j = 0; j < ff_celt_freq_range[i] << f->size; j++)
             dst[j] *= norm;
     }
 }
 
-static void celt_postfilter_apply_transition(CeltFrame *frame, float *data)
+static void celt_postfilter_apply_transition(CeltBlock *block, float *data)
 {
-    const int T0 = frame->pf_period_old;
-    const int T1 = frame->pf_period;
+    const int T0 = block->pf_period_old;
+    const int T1 = block->pf_period;
 
     float g00, g01, g02;
     float g10, g11, g12;
@@ -1696,16 +169,16 @@ static void celt_postfilter_apply_transition(CeltFrame *frame, float *data)
 
     int i;
 
-    if (frame->pf_gains[0]     == 0.0 &&
-        frame->pf_gains_old[0] == 0.0)
+    if (block->pf_gains[0]     == 0.0 &&
+        block->pf_gains_old[0] == 0.0)
         return;
 
-    g00 = frame->pf_gains_old[0];
-    g01 = frame->pf_gains_old[1];
-    g02 = frame->pf_gains_old[2];
-    g10 = frame->pf_gains[0];
-    g11 = frame->pf_gains[1];
-    g12 = frame->pf_gains[2];
+    g00 = block->pf_gains_old[0];
+    g01 = block->pf_gains_old[1];
+    g02 = block->pf_gains_old[2];
+    g10 = block->pf_gains[0];
+    g11 = block->pf_gains[1];
+    g12 = block->pf_gains[2];
 
     x1 = data[-T1 + 1];
     x2 = data[-T1];
@@ -1729,20 +202,19 @@ static void celt_postfilter_apply_transition(CeltFrame *frame, float *data)
     }
 }
 
-static void celt_postfilter_apply(CeltFrame *frame,
-                                  float *data, int len)
+static void celt_postfilter_apply(CeltBlock *block, float *data, int len)
 {
-    const int T = frame->pf_period;
+    const int T = block->pf_period;
     float g0, g1, g2;
     float x0, x1, x2, x3, x4;
     int i;
 
-    if (frame->pf_gains[0] == 0.0 || len <= 0)
+    if (block->pf_gains[0] == 0.0 || len <= 0)
         return;
 
-    g0 = frame->pf_gains[0];
-    g1 = frame->pf_gains[1];
-    g2 = frame->pf_gains[2];
+    g0 = block->pf_gains[0];
+    g1 = block->pf_gains[1];
+    g2 = block->pf_gains[2];
 
     x4 = data[-T - 2];
     x3 = data[-T - 1];
@@ -1761,61 +233,56 @@ static void celt_postfilter_apply(CeltFrame *frame,
     }
 }
 
-static void celt_postfilter(CeltContext *s, CeltFrame *frame)
+static void celt_postfilter(CeltFrame *f, CeltBlock *block)
 {
-    int len = s->blocksize * s->blocks;
+    int len = f->blocksize * f->blocks;
 
-    celt_postfilter_apply_transition(frame, frame->buf + 1024);
+    celt_postfilter_apply_transition(block, block->buf + 1024);
 
-    frame->pf_period_old = frame->pf_period;
-    memcpy(frame->pf_gains_old, frame->pf_gains, sizeof(frame->pf_gains));
+    block->pf_period_old = block->pf_period;
+    memcpy(block->pf_gains_old, block->pf_gains, sizeof(block->pf_gains));
 
-    frame->pf_period = frame->pf_period_new;
-    memcpy(frame->pf_gains, frame->pf_gains_new, sizeof(frame->pf_gains));
+    block->pf_period = block->pf_period_new;
+    memcpy(block->pf_gains, block->pf_gains_new, sizeof(block->pf_gains));
 
     if (len > CELT_OVERLAP) {
-        celt_postfilter_apply_transition(frame, frame->buf + 1024 + CELT_OVERLAP);
-        celt_postfilter_apply(frame, frame->buf + 1024 + 2 * CELT_OVERLAP,
+        celt_postfilter_apply_transition(block, block->buf + 1024 + CELT_OVERLAP);
+        celt_postfilter_apply(block, block->buf + 1024 + 2 * CELT_OVERLAP,
                               len - 2 * CELT_OVERLAP);
 
-        frame->pf_period_old = frame->pf_period;
-        memcpy(frame->pf_gains_old, frame->pf_gains, sizeof(frame->pf_gains));
+        block->pf_period_old = block->pf_period;
+        memcpy(block->pf_gains_old, block->pf_gains, sizeof(block->pf_gains));
     }
 
-    memmove(frame->buf, frame->buf + len, (1024 + CELT_OVERLAP / 2) * sizeof(float));
+    memmove(block->buf, block->buf + len, (1024 + CELT_OVERLAP / 2) * sizeof(float));
 }
 
-static int parse_postfilter(CeltContext *s, OpusRangeCoder *rc, int consumed)
+static int parse_postfilter(CeltFrame *f, OpusRangeCoder *rc, int consumed)
 {
-    static const float postfilter_taps[3][3] = {
-        { 0.3066406250f, 0.2170410156f, 0.1296386719f },
-        { 0.4638671875f, 0.2680664062f, 0.0           },
-        { 0.7998046875f, 0.1000976562f, 0.0           }
-    };
     int i;
 
-    memset(s->frame[0].pf_gains_new, 0, sizeof(s->frame[0].pf_gains_new));
-    memset(s->frame[1].pf_gains_new, 0, sizeof(s->frame[1].pf_gains_new));
+    memset(f->block[0].pf_gains_new, 0, sizeof(f->block[0].pf_gains_new));
+    memset(f->block[1].pf_gains_new, 0, sizeof(f->block[1].pf_gains_new));
 
-    if (s->startband == 0 && consumed + 16 <= s->framebits) {
-        int has_postfilter = opus_rc_p2model(rc, 1);
+    if (f->start_band == 0 && consumed + 16 <= f->framebits) {
+        int has_postfilter = ff_opus_rc_dec_log(rc, 1);
         if (has_postfilter) {
             float gain;
             int tapset, octave, period;
 
-            octave = opus_rc_unimodel(rc, 6);
-            period = (16 << octave) + opus_getrawbits(rc, 4 + octave) - 1;
-            gain   = 0.09375f * (opus_getrawbits(rc, 3) + 1);
-            tapset = (opus_rc_tell(rc) + 2 <= s->framebits) ?
-                     opus_rc_getsymbol(rc, celt_model_tapset) : 0;
+            octave = ff_opus_rc_dec_uint(rc, 6);
+            period = (16 << octave) + ff_opus_rc_get_raw(rc, 4 + octave) - 1;
+            gain   = 0.09375f * (ff_opus_rc_get_raw(rc, 3) + 1);
+            tapset = (opus_rc_tell(rc) + 2 <= f->framebits) ?
+                     ff_opus_rc_dec_cdf(rc, ff_celt_model_tapset) : 0;
 
             for (i = 0; i < 2; i++) {
-                CeltFrame *frame = &s->frame[i];
+                CeltBlock *block = &f->block[i];
 
-                frame->pf_period_new = FFMAX(period, CELT_POSTFILTER_MINPERIOD);
-                frame->pf_gains_new[0] = gain * postfilter_taps[tapset][0];
-                frame->pf_gains_new[1] = gain * postfilter_taps[tapset][1];
-                frame->pf_gains_new[2] = gain * postfilter_taps[tapset][2];
+                block->pf_period_new = FFMAX(period, CELT_POSTFILTER_MINPERIOD);
+                block->pf_gains_new[0] = gain * ff_celt_postfilter_taps[tapset][0];
+                block->pf_gains_new[1] = gain * ff_celt_postfilter_taps[tapset][1];
+                block->pf_gains_new[2] = gain * ff_celt_postfilter_taps[tapset][2];
             }
         }
 
@@ -1825,11 +292,11 @@ static int parse_postfilter(CeltContext *s, OpusRangeCoder *rc, int consumed)
     return consumed;
 }
 
-static void process_anticollapse(CeltContext *s, CeltFrame *frame, float *X)
+static void process_anticollapse(CeltFrame *f, CeltBlock *block, float *X)
 {
     int i, j, k;
 
-    for (i = s->startband; i < s->endband; i++) {
+    for (i = f->start_band; i < f->end_band; i++) {
         int renormalize = 0;
         float *xptr;
         float prev[2];
@@ -1838,355 +305,268 @@ static void process_anticollapse(CeltContext *s, CeltFrame *frame, float *X)
         int depth;
 
         /* depth in 1/8 bits */
-        depth = (1 + s->pulses[i]) / (celt_freq_range[i] << s->duration);
-        thresh = pow(2, -1.0 - 0.125f * depth);
-        sqrt_1 = 1.0f / sqrtf(celt_freq_range[i] << s->duration);
+        depth = (1 + f->pulses[i]) / (ff_celt_freq_range[i] << f->size);
+        thresh = exp2f(-1.0 - 0.125f * depth);
+        sqrt_1 = 1.0f / sqrtf(ff_celt_freq_range[i] << f->size);
 
-        xptr = X + (celt_freq_bands[i] << s->duration);
+        xptr = X + (ff_celt_freq_bands[i] << f->size);
 
-        prev[0] = frame->prev_energy[0][i];
-        prev[1] = frame->prev_energy[1][i];
-        if (s->coded_channels == 1) {
-            CeltFrame *frame1 = &s->frame[1];
+        prev[0] = block->prev_energy[0][i];
+        prev[1] = block->prev_energy[1][i];
+        if (f->channels == 1) {
+            CeltBlock *block1 = &f->block[1];
 
-            prev[0] = FFMAX(prev[0], frame1->prev_energy[0][i]);
-            prev[1] = FFMAX(prev[1], frame1->prev_energy[1][i]);
+            prev[0] = FFMAX(prev[0], block1->prev_energy[0][i]);
+            prev[1] = FFMAX(prev[1], block1->prev_energy[1][i]);
         }
-        Ediff = frame->energy[i] - FFMIN(prev[0], prev[1]);
+        Ediff = block->energy[i] - FFMIN(prev[0], prev[1]);
         Ediff = FFMAX(0, Ediff);
 
         /* r needs to be multiplied by 2 or 2*sqrt(2) depending on LM because
         short blocks don't have the same energy as long */
-        r = pow(2, 1 - Ediff);
-        if (s->duration == 3)
+        r = exp2f(1 - Ediff);
+        if (f->size == 3)
             r *= M_SQRT2;
         r = FFMIN(thresh, r) * sqrt_1;
-        for (k = 0; k < 1 << s->duration; k++) {
+        for (k = 0; k < 1 << f->size; k++) {
             /* Detect collapse */
-            if (!(frame->collapse_masks[i] & 1 << k)) {
+            if (!(block->collapse_masks[i] & 1 << k)) {
                 /* Fill with noise */
-                for (j = 0; j < celt_freq_range[i]; j++)
-                    xptr[(j << s->duration) + k] = (celt_rng(s) & 0x8000) ? r : -r;
+                for (j = 0; j < ff_celt_freq_range[i]; j++)
+                    xptr[(j << f->size) + k] = (celt_rng(f) & 0x8000) ? r : -r;
                 renormalize = 1;
             }
         }
 
         /* We just added some energy, so we need to renormalize */
         if (renormalize)
-            celt_renormalize_vector(xptr, celt_freq_range[i] << s->duration, 1.0f);
+            celt_renormalize_vector(xptr, ff_celt_freq_range[i] << f->size, 1.0f);
     }
 }
 
-static void celt_decode_bands(CeltContext *s, OpusRangeCoder *rc)
+int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
+                         float **output, int channels, int frame_size,
+                         int start_band,  int end_band)
 {
-    float lowband_scratch[8 * 22];
-    float norm[2 * 8 * 100];
-
-    int totalbits = (s->framebits << 3) - s->anticollapse_bit;
-
-    int update_lowband = 1;
-    int lowband_offset = 0;
-
-    int i, j;
-
-    memset(s->coeffs, 0, sizeof(s->coeffs));
-
-    for (i = s->startband; i < s->endband; i++) {
-        int band_offset = celt_freq_bands[i] << s->duration;
-        int band_size   = celt_freq_range[i] << s->duration;
-        float *X = s->coeffs[0] + band_offset;
-        float *Y = (s->coded_channels == 2) ? s->coeffs[1] + band_offset : NULL;
-
-        int consumed = opus_rc_tell_frac(rc);
-        float *norm2 = norm + 8 * 100;
-        int effective_lowband = -1;
-        unsigned int cm[2];
-        int b;
-
-        /* Compute how many bits we want to allocate to this band */
-        if (i != s->startband)
-            s->remaining -= consumed;
-        s->remaining2 = totalbits - consumed - 1;
-        if (i <= s->codedbands - 1) {
-            int curr_balance = s->remaining / FFMIN(3, s->codedbands-i);
-            b = av_clip_uintp2(FFMIN(s->remaining2 + 1, s->pulses[i] + curr_balance), 14);
-        } else
-            b = 0;
-
-        if (celt_freq_bands[i] - celt_freq_range[i] >= celt_freq_bands[s->startband] &&
-            (update_lowband || lowband_offset == 0))
-            lowband_offset = i;
-
-        /* Get a conservative estimate of the collapse_mask's for the bands we're
-        going to be folding from. */
-        if (lowband_offset != 0 && (s->spread != CELT_SPREAD_AGGRESSIVE ||
-                                    s->blocks > 1 || s->tf_change[i] < 0)) {
-            int foldstart, foldend;
-
-            /* This ensures we never repeat spectral content within one band */
-            effective_lowband = FFMAX(celt_freq_bands[s->startband],
-                                      celt_freq_bands[lowband_offset] - celt_freq_range[i]);
-            foldstart = lowband_offset;
-            while (celt_freq_bands[--foldstart] > effective_lowband);
-            foldend = lowband_offset - 1;
-            while (celt_freq_bands[++foldend] < effective_lowband + celt_freq_range[i]);
-
-            cm[0] = cm[1] = 0;
-            for (j = foldstart; j < foldend; j++) {
-                cm[0] |= s->frame[0].collapse_masks[j];
-                cm[1] |= s->frame[s->coded_channels - 1].collapse_masks[j];
-            }
-        } else
-            /* Otherwise, we'll be using the LCG to fold, so all blocks will (almost
-            always) be non-zero.*/
-            cm[0] = cm[1] = (1 << s->blocks) - 1;
-
-        if (s->dualstereo && i == s->intensitystereo) {
-            /* Switch off dual stereo to do intensity */
-            s->dualstereo = 0;
-            for (j = celt_freq_bands[s->startband] << s->duration; j < band_offset; j++)
-                norm[j] = (norm[j] + norm2[j]) / 2;
-        }
-
-        if (s->dualstereo) {
-            cm[0] = celt_decode_band(s, rc, i, X, NULL, band_size, b / 2, s->blocks,
-                                     effective_lowband != -1 ? norm + (effective_lowband << s->duration) : NULL, s->duration,
-            norm + band_offset, 0, 1.0f, lowband_scratch, cm[0]);
-
-            cm[1] = celt_decode_band(s, rc, i, Y, NULL, band_size, b/2, s->blocks,
-                                     effective_lowband != -1 ? norm2 + (effective_lowband << s->duration) : NULL, s->duration,
-            norm2 + band_offset, 0, 1.0f, lowband_scratch, cm[1]);
-        } else {
-            cm[0] = celt_decode_band(s, rc, i, X, Y, band_size, b, s->blocks,
-            effective_lowband != -1 ? norm + (effective_lowband << s->duration) : NULL, s->duration,
-            norm + band_offset, 0, 1.0f, lowband_scratch, cm[0]|cm[1]);
-
-            cm[1] = cm[0];
-        }
-
-        s->frame[0].collapse_masks[i]                     = (uint8_t)cm[0];
-        s->frame[s->coded_channels - 1].collapse_masks[i] = (uint8_t)cm[1];
-        s->remaining += s->pulses[i] + consumed;
-
-        /* Update the folding position only as long as we have 1 bit/sample depth */
-        update_lowband = (b > band_size << 3);
-    }
-}
-
-int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
-                         float **output, int coded_channels, int frame_size,
-                         int startband,  int endband)
-{
-    int i, j;
-
+    int i, j, downmix = 0;
     int consumed;           // bits of entropy consumed thus far for this frame
-    int silence = 0;
-    int transient = 0;
-    int anticollapse = 0;
-    IMDCT15Context *imdct;
-    float imdct_scale = 1.0;
-
-    if (coded_channels != 1 && coded_channels != 2) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid number of coded channels: %d\n",
-               coded_channels);
+    MDCT15Context *imdct;
+
+    if (channels != 1 && channels != 2) {
+        av_log(f->avctx, AV_LOG_ERROR, "Invalid number of coded channels: %d\n",
+               channels);
         return AVERROR_INVALIDDATA;
     }
-    if (startband < 0 || startband > endband || endband > CELT_MAX_BANDS) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid start/end band: %d %d\n",
-               startband, endband);
+    if (start_band < 0 || start_band > end_band || end_band > CELT_MAX_BANDS) {
+        av_log(f->avctx, AV_LOG_ERROR, "Invalid start/end band: %d %d\n",
+               start_band, end_band);
         return AVERROR_INVALIDDATA;
     }
 
-    s->flushed        = 0;
-    s->coded_channels = coded_channels;
-    s->startband      = startband;
-    s->endband        = endband;
-    s->framebits      = rc->rb.bytes * 8;
+    f->silence        = 0;
+    f->transient      = 0;
+    f->anticollapse   = 0;
+    f->flushed        = 0;
+    f->channels       = channels;
+    f->start_band     = start_band;
+    f->end_band       = end_band;
+    f->framebits      = rc->rb.bytes * 8;
 
-    s->duration = av_log2(frame_size / CELT_SHORT_BLOCKSIZE);
-    if (s->duration > CELT_MAX_LOG_BLOCKS ||
-        frame_size != CELT_SHORT_BLOCKSIZE * (1 << s->duration)) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid CELT frame size: %d\n",
+    f->size = av_log2(frame_size / CELT_SHORT_BLOCKSIZE);
+    if (f->size > CELT_MAX_LOG_BLOCKS ||
+        frame_size != CELT_SHORT_BLOCKSIZE * (1 << f->size)) {
+        av_log(f->avctx, AV_LOG_ERROR, "Invalid CELT frame size: %d\n",
                frame_size);
         return AVERROR_INVALIDDATA;
     }
 
-    if (!s->output_channels)
-        s->output_channels = coded_channels;
+    if (!f->output_channels)
+        f->output_channels = channels;
 
-    memset(s->frame[0].collapse_masks, 0, sizeof(s->frame[0].collapse_masks));
-    memset(s->frame[1].collapse_masks, 0, sizeof(s->frame[1].collapse_masks));
+    for (i = 0; i < f->channels; i++) {
+        memset(f->block[i].coeffs,         0, sizeof(f->block[i].coeffs));
+        memset(f->block[i].collapse_masks, 0, sizeof(f->block[i].collapse_masks));
+    }
 
     consumed = opus_rc_tell(rc);
 
     /* obtain silence flag */
-    if (consumed >= s->framebits)
-        silence = 1;
+    if (consumed >= f->framebits)
+        f->silence = 1;
     else if (consumed == 1)
-        silence = opus_rc_p2model(rc, 15);
+        f->silence = ff_opus_rc_dec_log(rc, 15);
 
 
-    if (silence) {
-        consumed = s->framebits;
-        rc->total_read_bits += s->framebits - opus_rc_tell(rc);
+    if (f->silence) {
+        consumed = f->framebits;
+        rc->total_bits += f->framebits - opus_rc_tell(rc);
     }
 
     /* obtain post-filter options */
-    consumed = parse_postfilter(s, rc, consumed);
+    consumed = parse_postfilter(f, rc, consumed);
 
     /* obtain transient flag */
-    if (s->duration != 0 && consumed+3 <= s->framebits)
-        transient = opus_rc_p2model(rc, 3);
+    if (f->size != 0 && consumed+3 <= f->framebits)
+        f->transient = ff_opus_rc_dec_log(rc, 3);
 
-    s->blocks    = transient ? 1 << s->duration : 1;
-    s->blocksize = frame_size / s->blocks;
+    f->blocks    = f->transient ? 1 << f->size : 1;
+    f->blocksize = frame_size / f->blocks;
 
-    imdct = s->imdct[transient ? 0 : s->duration];
+    imdct = f->imdct[f->transient ? 0 : f->size];
 
-    if (coded_channels == 1) {
+    if (channels == 1) {
         for (i = 0; i < CELT_MAX_BANDS; i++)
-            s->frame[0].energy[i] = FFMAX(s->frame[0].energy[i], s->frame[1].energy[i]);
+            f->block[0].energy[i] = FFMAX(f->block[0].energy[i], f->block[1].energy[i]);
     }
 
-    celt_decode_coarse_energy(s, rc);
-    celt_decode_tf_changes   (s, rc, transient);
-    celt_decode_allocation   (s, rc);
-    celt_decode_fine_energy  (s, rc);
-    celt_decode_bands        (s, rc);
+    celt_decode_coarse_energy(f, rc);
+    celt_decode_tf_changes   (f, rc);
+    ff_celt_bitalloc         (f, rc, 0);
+    celt_decode_fine_energy  (f, rc);
+    ff_celt_quant_bands      (f, rc);
 
-    if (s->anticollapse_bit)
-        anticollapse = opus_getrawbits(rc, 1);
+    if (f->anticollapse_needed)
+        f->anticollapse = ff_opus_rc_get_raw(rc, 1);
 
-    celt_decode_final_energy(s, rc, s->framebits - opus_rc_tell(rc));
+    celt_decode_final_energy(f, rc);
 
     /* apply anti-collapse processing and denormalization to
      * each coded channel */
-    for (i = 0; i < s->coded_channels; i++) {
-        CeltFrame *frame = &s->frame[i];
+    for (i = 0; i < f->channels; i++) {
+        CeltBlock *block = &f->block[i];
 
-        if (anticollapse)
-            process_anticollapse(s, frame, s->coeffs[i]);
+        if (f->anticollapse)
+            process_anticollapse(f, block, f->block[i].coeffs);
 
-        celt_denormalize(s, frame, s->coeffs[i]);
+        celt_denormalize(f, block, f->block[i].coeffs);
     }
 
     /* stereo -> mono downmix */
-    if (s->output_channels < s->coded_channels) {
-        s->dsp.vector_fmac_scalar(s->coeffs[0], s->coeffs[1], 1.0, FFALIGN(frame_size, 16));
-        imdct_scale = 0.5;
-    } else if (s->output_channels > s->coded_channels)
-        memcpy(s->coeffs[1], s->coeffs[0], frame_size * sizeof(float));
+    if (f->output_channels < f->channels) {
+        f->dsp->vector_fmac_scalar(f->block[0].coeffs, f->block[1].coeffs, 1.0, FFALIGN(frame_size, 16));
+        downmix = 1;
+    } else if (f->output_channels > f->channels)
+        memcpy(f->block[1].coeffs, f->block[0].coeffs, frame_size * sizeof(float));
 
-    if (silence) {
+    if (f->silence) {
         for (i = 0; i < 2; i++) {
-            CeltFrame *frame = &s->frame[i];
+            CeltBlock *block = &f->block[i];
 
-            for (j = 0; j < FF_ARRAY_ELEMS(frame->energy); j++)
-                frame->energy[j] = CELT_ENERGY_SILENCE;
+            for (j = 0; j < FF_ARRAY_ELEMS(block->energy); j++)
+                block->energy[j] = CELT_ENERGY_SILENCE;
         }
-        memset(s->coeffs, 0, sizeof(s->coeffs));
+        memset(f->block[0].coeffs, 0, sizeof(f->block[0].coeffs));
+        memset(f->block[1].coeffs, 0, sizeof(f->block[1].coeffs));
     }
 
     /* transform and output for each output channel */
-    for (i = 0; i < s->output_channels; i++) {
-        CeltFrame *frame = &s->frame[i];
-        float m = frame->deemph_coeff;
+    for (i = 0; i < f->output_channels; i++) {
+        CeltBlock *block = &f->block[i];
+        float m = block->emph_coeff;
 
         /* iMDCT and overlap-add */
-        for (j = 0; j < s->blocks; j++) {
-            float *dst  = frame->buf + 1024 + j * s->blocksize;
+        for (j = 0; j < f->blocks; j++) {
+            float *dst  = block->buf + 1024 + j * f->blocksize;
 
-            imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, s->coeffs[i] + j,
-                              s->blocks, imdct_scale);
-            s->dsp.vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
-                                      celt_window, CELT_OVERLAP / 2);
+            imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, f->block[i].coeffs + j,
+                              f->blocks);
+            f->dsp->vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
+                                       ff_celt_window, CELT_OVERLAP / 2);
         }
 
+        if (downmix)
+            f->dsp->vector_fmul_scalar(&block->buf[1024], &block->buf[1024], 0.5f, frame_size);
+
         /* postfilter */
-        celt_postfilter(s, frame);
+        celt_postfilter(f, block);
 
         /* deemphasis and output scaling */
         for (j = 0; j < frame_size; j++) {
-            float tmp = frame->buf[1024 - frame_size + j] + m;
-            m = tmp * CELT_DEEMPH_COEFF;
-            output[i][j] = tmp / 32768.;
+            const float tmp = block->buf[1024 - frame_size + j] + m;
+            m = tmp * CELT_EMPH_COEFF;
+            output[i][j] = tmp;
         }
-        frame->deemph_coeff = m;
+
+        block->emph_coeff = m;
     }
 
-    if (coded_channels == 1)
-        memcpy(s->frame[1].energy, s->frame[0].energy, sizeof(s->frame[0].energy));
+    if (channels == 1)
+        memcpy(f->block[1].energy, f->block[0].energy, sizeof(f->block[0].energy));
 
     for (i = 0; i < 2; i++ ) {
-        CeltFrame *frame = &s->frame[i];
+        CeltBlock *block = &f->block[i];
 
-        if (!transient) {
-            memcpy(frame->prev_energy[1], frame->prev_energy[0], sizeof(frame->prev_energy[0]));
-            memcpy(frame->prev_energy[0], frame->energy,         sizeof(frame->prev_energy[0]));
+        if (!f->transient) {
+            memcpy(block->prev_energy[1], block->prev_energy[0], sizeof(block->prev_energy[0]));
+            memcpy(block->prev_energy[0], block->energy,         sizeof(block->prev_energy[0]));
         } else {
             for (j = 0; j < CELT_MAX_BANDS; j++)
-                frame->prev_energy[0][j] = FFMIN(frame->prev_energy[0][j], frame->energy[j]);
+                block->prev_energy[0][j] = FFMIN(block->prev_energy[0][j], block->energy[j]);
         }
 
-        for (j = 0; j < s->startband; j++) {
-            frame->prev_energy[0][j] = CELT_ENERGY_SILENCE;
-            frame->energy[j]         = 0.0;
+        for (j = 0; j < f->start_band; j++) {
+            block->prev_energy[0][j] = CELT_ENERGY_SILENCE;
+            block->energy[j]         = 0.0;
         }
-        for (j = s->endband; j < CELT_MAX_BANDS; j++) {
-            frame->prev_energy[0][j] = CELT_ENERGY_SILENCE;
-            frame->energy[j]         = 0.0;
+        for (j = f->end_band; j < CELT_MAX_BANDS; j++) {
+            block->prev_energy[0][j] = CELT_ENERGY_SILENCE;
+            block->energy[j]         = 0.0;
         }
     }
 
-    s->seed = rc->range;
+    f->seed = rc->range;
 
     return 0;
 }
 
-void ff_celt_flush(CeltContext *s)
+void ff_celt_flush(CeltFrame *f)
 {
     int i, j;
 
-    if (s->flushed)
+    if (f->flushed)
         return;
 
     for (i = 0; i < 2; i++) {
-        CeltFrame *frame = &s->frame[i];
+        CeltBlock *block = &f->block[i];
 
         for (j = 0; j < CELT_MAX_BANDS; j++)
-            frame->prev_energy[0][j] = frame->prev_energy[1][j] = CELT_ENERGY_SILENCE;
+            block->prev_energy[0][j] = block->prev_energy[1][j] = CELT_ENERGY_SILENCE;
 
-        memset(frame->energy, 0, sizeof(frame->energy));
-        memset(frame->buf,    0, sizeof(frame->buf));
+        memset(block->energy, 0, sizeof(block->energy));
+        memset(block->buf,    0, sizeof(block->buf));
 
-        memset(frame->pf_gains,     0, sizeof(frame->pf_gains));
-        memset(frame->pf_gains_old, 0, sizeof(frame->pf_gains_old));
-        memset(frame->pf_gains_new, 0, sizeof(frame->pf_gains_new));
+        memset(block->pf_gains,     0, sizeof(block->pf_gains));
+        memset(block->pf_gains_old, 0, sizeof(block->pf_gains_old));
+        memset(block->pf_gains_new, 0, sizeof(block->pf_gains_new));
 
-        frame->deemph_coeff = 0.0;
+        block->emph_coeff = 0.0;
     }
-    s->seed = 0;
+    f->seed = 0;
 
-    s->flushed = 1;
+    f->flushed = 1;
 }
 
-void ff_celt_free(CeltContext **ps)
+void ff_celt_free(CeltFrame **f)
 {
-    CeltContext *s = *ps;
+    CeltFrame *frm = *f;
     int i;
 
-    if (!s)
+    if (!frm)
         return;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->imdct); i++)
-        ff_imdct15_uninit(&s->imdct[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
+        ff_mdct15_uninit(&frm->imdct[i]);
+
+    ff_celt_pvq_uninit(&frm->pvq);
 
-    av_freep(ps);
+    av_freep(&frm->dsp);
+    av_freep(f);
 }
 
-int ff_celt_init(AVCodecContext *avctx, CeltContext **ps, int output_channels)
+int ff_celt_init(AVCodecContext *avctx, CeltFrame **f, int output_channels,
+                 int apply_phase_inv)
 {
-    CeltContext *s;
+    CeltFrame *frm;
     int i, ret;
 
     if (output_channels != 1 && output_channels != 2) {
@@ -2195,27 +575,33 @@ int ff_celt_init(AVCodecContext *avctx, CeltContext **ps, int output_channels)
         return AVERROR(EINVAL);
     }
 
-    s = av_mallocz(sizeof(*s));
-    if (!s)
+    frm = av_mallocz(sizeof(*frm));
+    if (!frm)
         return AVERROR(ENOMEM);
 
-    s->avctx           = avctx;
-    s->output_channels = output_channels;
+    frm->avctx           = avctx;
+    frm->output_channels = output_channels;
+    frm->apply_phase_inv = apply_phase_inv;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->imdct); i++) {
-        ret = ff_imdct15_init(&s->imdct[i], i + 3);
-        if (ret < 0)
+    for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
+        if ((ret = ff_mdct15_init(&frm->imdct[i], 1, i + 3, -1.0f/32768)) < 0)
             goto fail;
-    }
 
-    avpriv_float_dsp_init(&s->dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if ((ret = ff_celt_pvq_init(&frm->pvq, 0)) < 0)
+        goto fail;
+
+    frm->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!frm->dsp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
 
-    ff_celt_flush(s);
+    ff_celt_flush(frm);
 
-    *ps = s;
+    *f = frm;
 
     return 0;
 fail:
-    ff_celt_free(&s);
+    ff_celt_free(&frm);
     return ret;
 }
diff --git a/libavcodec/opus_celt.h b/libavcodec/opus_celt.h
new file mode 100644
index 0000000..9289a18
--- /dev/null
+++ b/libavcodec/opus_celt.h
@@ -0,0 +1,170 @@
+/*
+ * Opus decoder/demuxer common functions
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUS_CELT_H
+#define AVCODEC_OPUS_CELT_H
+
+#include <float.h>
+
+#include "opus.h"
+#include "opus_pvq.h"
+
+#include "mdct15.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
+
+#define CELT_VECTORS                 11
+#define CELT_ALLOC_STEPS             6
+#define CELT_FINE_OFFSET             21
+#define CELT_MAX_FINE_BITS           8
+#define CELT_NORM_SCALE              16384
+#define CELT_QTHETA_OFFSET           4
+#define CELT_QTHETA_OFFSET_TWOPHASE  16
+#define CELT_EMPH_COEFF              0.85000610f
+#define CELT_POSTFILTER_MINPERIOD    15
+#define CELT_ENERGY_SILENCE          (-28.0f)
+
+typedef struct CeltPVQ CeltPVQ;
+
+enum CeltSpread {
+    CELT_SPREAD_NONE,
+    CELT_SPREAD_LIGHT,
+    CELT_SPREAD_NORMAL,
+    CELT_SPREAD_AGGRESSIVE
+};
+
+enum CeltBlockSize {
+    CELT_BLOCK_120,
+    CELT_BLOCK_240,
+    CELT_BLOCK_480,
+    CELT_BLOCK_960,
+
+    CELT_BLOCK_NB
+};
+
+typedef struct CeltBlock {
+    float energy[CELT_MAX_BANDS];
+    float lin_energy[CELT_MAX_BANDS];
+    float error_energy[CELT_MAX_BANDS];
+    float prev_energy[2][CELT_MAX_BANDS];
+
+    uint8_t collapse_masks[CELT_MAX_BANDS];
+
+    /* buffer for mdct output + postfilter */
+    DECLARE_ALIGNED(32, float, buf)[2048];
+    DECLARE_ALIGNED(32, float, coeffs)[CELT_MAX_FRAME_SIZE];
+
+    /* Used by the encoder */
+    DECLARE_ALIGNED(32, float, overlap)[FFALIGN(CELT_OVERLAP, 16)];
+    DECLARE_ALIGNED(32, float, samples)[FFALIGN(CELT_MAX_FRAME_SIZE, 16)];
+
+    /* postfilter parameters */
+    int   pf_period_new;
+    float pf_gains_new[3];
+    int   pf_period;
+    float pf_gains[3];
+    int   pf_period_old;
+    float pf_gains_old[3];
+
+    float emph_coeff;
+} CeltBlock;
+
+struct CeltFrame {
+    // constant values that do not change during context lifetime
+    AVCodecContext      *avctx;
+    MDCT15Context       *imdct[4];
+    AVFloatDSPContext   *dsp;
+    CeltBlock           block[2];
+    CeltPVQ             *pvq;
+    int channels;
+    int output_channels;
+    int apply_phase_inv;
+
+    enum CeltBlockSize size;
+    int start_band;
+    int end_band;
+    int coded_bands;
+    int transient;
+    int pfilter;
+    int skip_band_floor;
+    int tf_select;
+    int alloc_trim;
+    int alloc_boost[CELT_MAX_BANDS];
+    int blocks;        /* number of iMDCT blocks in the frame, depends on transient */
+    int blocksize;     /* size of each block */
+    int silence;       /* Frame is filled with silence */
+    int anticollapse_needed; /* Whether to expect an anticollapse bit */
+    int anticollapse;  /* Encoded anticollapse bit */
+    int intensity_stereo;
+    int dual_stereo;
+    int flushed;
+    uint32_t seed;
+    enum CeltSpread spread;
+
+    /* Encoder PF coeffs */
+    int pf_octave;
+    int pf_period;
+    int pf_tapset;
+    float pf_gain;
+
+    /* Bit allocation */
+    int framebits;
+    int remaining;
+    int remaining2;
+    int caps         [CELT_MAX_BANDS];
+    int fine_bits    [CELT_MAX_BANDS];
+    int fine_priority[CELT_MAX_BANDS];
+    int pulses       [CELT_MAX_BANDS];
+    int tf_change    [CELT_MAX_BANDS];
+};
+
+/* LCG for noise generation */
+static av_always_inline uint32_t celt_rng(CeltFrame *f)
+{
+    f->seed = 1664525 * f->seed + 1013904223;
+    return f->seed;
+}
+
+static av_always_inline void celt_renormalize_vector(float *X, int N, float gain)
+{
+    int i;
+    float g = 1e-15f;
+    for (i = 0; i < N; i++)
+        g += X[i] * X[i];
+    g = gain / sqrtf(g);
+
+    for (i = 0; i < N; i++)
+        X[i] *= g;
+}
+
+int ff_celt_init(AVCodecContext *avctx, CeltFrame **f, int output_channels,
+                 int apply_phase_inv);
+
+void ff_celt_free(CeltFrame **f);
+
+void ff_celt_flush(CeltFrame *f);
+
+int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc, float **output,
+                         int coded_channels, int frame_size, int startband, int endband);
+
+#endif /* AVCODEC_OPUS_CELT_H */
diff --git a/libavcodec/opus_parser.c b/libavcodec/opus_parser.c
index d256fbb..28b0933 100644
--- a/libavcodec/opus_parser.c
+++ b/libavcodec/opus_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,10 +31,10 @@
 #include "parser.h"
 
 typedef struct OpusParseContext {
+    ParseContext pc;
     OpusContext ctx;
     OpusPacket pkt;
     int extradata_parsed;
-    ParseContext pc;
     int ts_framing;
 } OpusParseContext;
 
@@ -43,6 +43,7 @@ static const uint8_t *parse_opus_ts_header(const uint8_t *start, int *payload_le
     const uint8_t *buf = start + 1;
     int start_trim_flag, end_trim_flag, control_extension_flag, control_extension_length;
     uint8_t flags;
+    uint64_t payload_len_tmp;
 
     GetByteContext gb;
     bytestream2_init(&gb, buf, buf_len);
@@ -52,11 +53,11 @@ static const uint8_t *parse_opus_ts_header(const uint8_t *start, int *payload_le
     end_trim_flag          = (flags >> 3) & 1;
     control_extension_flag = (flags >> 2) & 1;
 
-    *payload_len = 0;
+    payload_len_tmp = *payload_len = 0;
     while (bytestream2_peek_byte(&gb) == 0xff)
-        *payload_len += bytestream2_get_byte(&gb);
+        payload_len_tmp += bytestream2_get_byte(&gb);
 
-    *payload_len += bytestream2_get_byte(&gb);
+    payload_len_tmp += bytestream2_get_byte(&gb);
 
     if (start_trim_flag)
         bytestream2_skip(&gb, 2);
@@ -67,6 +68,11 @@ static const uint8_t *parse_opus_ts_header(const uint8_t *start, int *payload_le
         bytestream2_skip(&gb, control_extension_length);
     }
 
+    if (bytestream2_tell(&gb) + payload_len_tmp > buf_len)
+        return NULL;
+
+    *payload_len = payload_len_tmp;
+
     return buf + bytestream2_tell(&gb);
 }
 
@@ -104,6 +110,10 @@ static int opus_find_frame_end(AVCodecParserContext *ctx, AVCodecContext *avctx,
             state = (state << 8) | payload[i];
             if ((state & OPUS_TS_MASK) == OPUS_TS_HEADER) {
                 payload = parse_opus_ts_header(payload, &payload_len, buf_size - i);
+                if (!payload) {
+                    av_log(avctx, AV_LOG_ERROR, "Error parsing Ogg TS header.\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 *header_len = payload - buf;
                 start_found = 1;
                 break;
diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c
new file mode 100644
index 0000000..0dbf141
--- /dev/null
+++ b/libavcodec/opus_pvq.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2007-2008 CSIRO
+ * Copyright (c) 2007-2009 Xiph.Org Foundation
+ * Copyright (c) 2008-2009 Gregory Maxwell
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "opustab.h"
+#include "opus_pvq.h"
+
+#define CELT_PVQ_U(n, k) (ff_celt_pvq_u_row[FFMIN(n, k)][FFMAX(n, k)])
+#define CELT_PVQ_V(n, k) (CELT_PVQ_U(n, k) + CELT_PVQ_U(n, (k) + 1))
+
+static inline int16_t celt_cos(int16_t x)
+{
+    x = (MUL16(x, x) + 4096) >> 13;
+    x = (32767-x) + ROUND_MUL16(x, (-7651 + ROUND_MUL16(x, (8277 + ROUND_MUL16(-626, x)))));
+    return x + 1;
+}
+
+static inline int celt_log2tan(int isin, int icos)
+{
+    int lc, ls;
+    lc = opus_ilog(icos);
+    ls = opus_ilog(isin);
+    icos <<= 15 - lc;
+    isin <<= 15 - ls;
+    return (ls << 11) - (lc << 11) +
+           ROUND_MUL16(isin, ROUND_MUL16(isin, -2597) + 7932) -
+           ROUND_MUL16(icos, ROUND_MUL16(icos, -2597) + 7932);
+}
+
+static inline int celt_bits2pulses(const uint8_t *cache, int bits)
+{
+    // TODO: Find the size of cache and make it into an array in the parameters list
+    int i, low = 0, high;
+
+    high = cache[0];
+    bits--;
+
+    for (i = 0; i < 6; i++) {
+        int center = (low + high + 1) >> 1;
+        if (cache[center] >= bits)
+            high = center;
+        else
+            low = center;
+    }
+
+    return (bits - (low == 0 ? -1 : cache[low]) <= cache[high] - bits) ? low : high;
+}
+
+static inline int celt_pulses2bits(const uint8_t *cache, int pulses)
+{
+    // TODO: Find the size of cache and make it into an array in the parameters list
+   return (pulses == 0) ? 0 : cache[pulses] + 1;
+}
+
+static inline void celt_normalize_residual(const int * av_restrict iy, float * av_restrict X,
+                                           int N, float g)
+{
+    int i;
+    for (i = 0; i < N; i++)
+        X[i] = g * iy[i];
+}
+
+static void celt_exp_rotation_impl(float *X, uint32_t len, uint32_t stride,
+                                   float c, float s)
+{
+    float *Xptr;
+    int i;
+
+    Xptr = X;
+    for (i = 0; i < len - stride; i++) {
+        float x1     = Xptr[0];
+        float x2     = Xptr[stride];
+        Xptr[stride] = c * x2 + s * x1;
+        *Xptr++      = c * x1 - s * x2;
+    }
+
+    Xptr = &X[len - 2 * stride - 1];
+    for (i = len - 2 * stride - 1; i >= 0; i--) {
+        float x1     = Xptr[0];
+        float x2     = Xptr[stride];
+        Xptr[stride] = c * x2 + s * x1;
+        *Xptr--      = c * x1 - s * x2;
+    }
+}
+
+static inline void celt_exp_rotation(float *X, uint32_t len,
+                                     uint32_t stride, uint32_t K,
+                                     enum CeltSpread spread, const int encode)
+{
+    uint32_t stride2 = 0;
+    float c, s;
+    float gain, theta;
+    int i;
+
+    if (2*K >= len || spread == CELT_SPREAD_NONE)
+        return;
+
+    gain = (float)len / (len + (20 - 5*spread) * K);
+    theta = M_PI * gain * gain / 4;
+
+    c = cosf(theta);
+    s = sinf(theta);
+
+    if (len >= stride << 3) {
+        stride2 = 1;
+        /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding.
+        It's basically incrementing long as (stride2+0.5)^2 < len/stride. */
+        while ((stride2 * stride2 + stride2) * stride + (stride >> 2) < len)
+            stride2++;
+    }
+
+    len /= stride;
+    for (i = 0; i < stride; i++) {
+        if (encode) {
+            celt_exp_rotation_impl(X + i * len, len, 1, c, -s);
+            if (stride2)
+                celt_exp_rotation_impl(X + i * len, len, stride2, s, -c);
+        } else {
+            if (stride2)
+                celt_exp_rotation_impl(X + i * len, len, stride2, s, c);
+            celt_exp_rotation_impl(X + i * len, len, 1, c, s);
+        }
+    }
+}
+
+static inline uint32_t celt_extract_collapse_mask(const int *iy, uint32_t N, uint32_t B)
+{
+    int i, j, N0 = N / B;
+    uint32_t collapse_mask = 0;
+
+    if (B <= 1)
+        return 1;
+
+    for (i = 0; i < B; i++)
+        for (j = 0; j < N0; j++)
+            collapse_mask |= (!!iy[i*N0+j]) << i;
+    return collapse_mask;
+}
+
+static inline void celt_stereo_merge(float *X, float *Y, float mid, int N)
+{
+    int i;
+    float xp = 0, side = 0;
+    float E[2];
+    float mid2;
+    float gain[2];
+
+    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
+    for (i = 0; i < N; i++) {
+        xp   += X[i] * Y[i];
+        side += Y[i] * Y[i];
+    }
+
+    /* Compensating for the mid normalization */
+    xp *= mid;
+    mid2 = mid;
+    E[0] = mid2 * mid2 + side - 2 * xp;
+    E[1] = mid2 * mid2 + side + 2 * xp;
+    if (E[0] < 6e-4f || E[1] < 6e-4f) {
+        for (i = 0; i < N; i++)
+            Y[i] = X[i];
+        return;
+    }
+
+    gain[0] = 1.0f / sqrtf(E[0]);
+    gain[1] = 1.0f / sqrtf(E[1]);
+
+    for (i = 0; i < N; i++) {
+        float value[2];
+        /* Apply mid scaling (side is already scaled) */
+        value[0] = mid * X[i];
+        value[1] = Y[i];
+        X[i] = gain[0] * (value[0] - value[1]);
+        Y[i] = gain[1] * (value[0] + value[1]);
+    }
+}
+
+static void celt_interleave_hadamard(float *tmp, float *X, int N0,
+                                     int stride, int hadamard)
+{
+    int i, j, N = N0*stride;
+    const uint8_t *order = &ff_celt_hadamard_order[hadamard ? stride - 2 : 30];
+
+    for (i = 0; i < stride; i++)
+        for (j = 0; j < N0; j++)
+            tmp[j*stride+i] = X[order[i]*N0+j];
+
+    memcpy(X, tmp, N*sizeof(float));
+}
+
+static void celt_deinterleave_hadamard(float *tmp, float *X, int N0,
+                                       int stride, int hadamard)
+{
+    int i, j, N = N0*stride;
+    const uint8_t *order = &ff_celt_hadamard_order[hadamard ? stride - 2 : 30];
+
+    for (i = 0; i < stride; i++)
+        for (j = 0; j < N0; j++)
+            tmp[order[i]*N0+j] = X[j*stride+i];
+
+    memcpy(X, tmp, N*sizeof(float));
+}
+
+static void celt_haar1(float *X, int N0, int stride)
+{
+    int i, j;
+    N0 >>= 1;
+    for (i = 0; i < stride; i++) {
+        for (j = 0; j < N0; j++) {
+            float x0 = X[stride * (2 * j + 0) + i];
+            float x1 = X[stride * (2 * j + 1) + i];
+            X[stride * (2 * j + 0) + i] = (x0 + x1) * M_SQRT1_2;
+            X[stride * (2 * j + 1) + i] = (x0 - x1) * M_SQRT1_2;
+        }
+    }
+}
+
+static inline int celt_compute_qn(int N, int b, int offset, int pulse_cap,
+                                  int stereo)
+{
+    int qn, qb;
+    int N2 = 2 * N - 1;
+    if (stereo && N == 2)
+        N2--;
+
+    /* The upper limit ensures that in a stereo split with itheta==16384, we'll
+     * always have enough bits left over to code at least one pulse in the
+     * side; otherwise it would collapse, since it doesn't get folded. */
+    qb = FFMIN3(b - pulse_cap - (4 << 3), (b + N2 * offset) / N2, 8 << 3);
+    qn = (qb < (1 << 3 >> 1)) ? 1 : ((ff_celt_qn_exp2[qb & 0x7] >> (14 - (qb >> 3))) + 1) >> 1 << 1;
+    return qn;
+}
+
+/* Convert the quantized vector to an index */
+static inline uint32_t celt_icwrsi(uint32_t N, uint32_t K, const int *y)
+{
+    int i, idx = 0, sum = 0;
+    for (i = N - 1; i >= 0; i--) {
+        const uint32_t i_s = CELT_PVQ_U(N - i, sum + FFABS(y[i]) + 1);
+        idx += CELT_PVQ_U(N - i, sum) + (y[i] < 0)*i_s;
+        sum += FFABS(y[i]);
+    }
+    return idx;
+}
+
+// this code was adapted from libopus
+static inline uint64_t celt_cwrsi(uint32_t N, uint32_t K, uint32_t i, int *y)
+{
+    uint64_t norm = 0;
+    uint32_t q, p;
+    int s, val;
+    int k0;
+
+    while (N > 2) {
+        /*Lots of pulses case:*/
+        if (K >= N) {
+            const uint32_t *row = ff_celt_pvq_u_row[N];
+
+            /* Are the pulses in this dimension negative? */
+            p  = row[K + 1];
+            s  = -(i >= p);
+            i -= p & s;
+
+            /*Count how many pulses were placed in this dimension.*/
+            k0 = K;
+            q = row[N];
+            if (q > i) {
+                K = N;
+                do {
+                    p = ff_celt_pvq_u_row[--K][N];
+                } while (p > i);
+            } else
+                for (p = row[K]; p > i; p = row[K])
+                    K--;
+
+            i    -= p;
+            val   = (k0 - K + s) ^ s;
+            norm += val * val;
+            *y++  = val;
+        } else { /*Lots of dimensions case:*/
+            /*Are there any pulses in this dimension at all?*/
+            p = ff_celt_pvq_u_row[K    ][N];
+            q = ff_celt_pvq_u_row[K + 1][N];
+
+            if (p <= i && i < q) {
+                i -= p;
+                *y++ = 0;
+            } else {
+                /*Are the pulses in this dimension negative?*/
+                s  = -(i >= q);
+                i -= q & s;
+
+                /*Count how many pulses were placed in this dimension.*/
+                k0 = K;
+                do p = ff_celt_pvq_u_row[--K][N];
+                while (p > i);
+
+                i    -= p;
+                val   = (k0 - K + s) ^ s;
+                norm += val * val;
+                *y++  = val;
+            }
+        }
+        N--;
+    }
+
+    /* N == 2 */
+    p  = 2 * K + 1;
+    s  = -(i >= p);
+    i -= p & s;
+    k0 = K;
+    K  = (i + 1) / 2;
+
+    if (K)
+        i -= 2 * K - 1;
+
+    val   = (k0 - K + s) ^ s;
+    norm += val * val;
+    *y++  = val;
+
+    /* N==1 */
+    s     = -i;
+    val   = (K + s) ^ s;
+    norm += val * val;
+    *y    = val;
+
+    return norm;
+}
+
+static inline void celt_encode_pulses(OpusRangeCoder *rc, int *y, uint32_t N, uint32_t K)
+{
+    ff_opus_rc_enc_uint(rc, celt_icwrsi(N, K, y), CELT_PVQ_V(N, K));
+}
+
+static inline float celt_decode_pulses(OpusRangeCoder *rc, int *y, uint32_t N, uint32_t K)
+{
+    const uint32_t idx = ff_opus_rc_dec_uint(rc, CELT_PVQ_V(N, K));
+    return celt_cwrsi(N, K, idx, y);
+}
+
+/*
+ * Faster than libopus's search, operates entirely in the signed domain.
+ * Slightly worse/better depending on N, K and the input vector.
+ */
+static float ppp_pvq_search_c(float *X, int *y, int K, int N)
+{
+    int i, y_norm = 0;
+    float res = 0.0f, xy_norm = 0.0f;
+
+    for (i = 0; i < N; i++)
+        res += FFABS(X[i]);
+
+    res = K/(res + FLT_EPSILON);
+
+    for (i = 0; i < N; i++) {
+        y[i] = lrintf(res*X[i]);
+        y_norm  += y[i]*y[i];
+        xy_norm += y[i]*X[i];
+        K -= FFABS(y[i]);
+    }
+
+    while (K) {
+        int max_idx = 0, phase = FFSIGN(K);
+        float max_num = 0.0f;
+        float max_den = 1.0f;
+        y_norm += 1.0f;
+
+        for (i = 0; i < N; i++) {
+            /* If the sum has been overshot and the best place has 0 pulses allocated
+             * to it, attempting to decrease it further will actually increase the
+             * sum. Prevent this by disregarding any 0 positions when decrementing. */
+            const int ca = 1 ^ ((y[i] == 0) & (phase < 0));
+            const int y_new = y_norm  + 2*phase*FFABS(y[i]);
+            float xy_new = xy_norm + 1*phase*FFABS(X[i]);
+            xy_new = xy_new * xy_new;
+            if (ca && (max_den*xy_new) > (y_new*max_num)) {
+                max_den = y_new;
+                max_num = xy_new;
+                max_idx = i;
+            }
+        }
+
+        K -= phase;
+
+        phase *= FFSIGN(X[max_idx]);
+        xy_norm += 1*phase*X[max_idx];
+        y_norm  += 2*phase*y[max_idx];
+        y[max_idx] += phase;
+    }
+
+    return (float)y_norm;
+}
+
+static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_t K,
+                               enum CeltSpread spread, uint32_t blocks, float gain,
+                               CeltPVQ *pvq)
+{
+    int *y = pvq->qcoeff;
+
+    celt_exp_rotation(X, N, blocks, K, spread, 1);
+    gain /= sqrtf(pvq->pvq_search(X, y, K, N));
+    celt_encode_pulses(rc, y,  N, K);
+    celt_normalize_residual(y, X, N, gain);
+    celt_exp_rotation(X, N, blocks, K, spread, 0);
+    return celt_extract_collapse_mask(y, N, blocks);
+}
+
+/** Decode pulse vector and combine the result with the pitch vector to produce
+    the final normalised signal in the current band. */
+static uint32_t celt_alg_unquant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_t K,
+                                 enum CeltSpread spread, uint32_t blocks, float gain,
+                                 CeltPVQ *pvq)
+{
+    int *y = pvq->qcoeff;
+
+    gain /= sqrtf(celt_decode_pulses(rc, y, N, K));
+    celt_normalize_residual(y, X, N, gain);
+    celt_exp_rotation(X, N, blocks, K, spread, 0);
+    return celt_extract_collapse_mask(y, N, blocks);
+}
+
+static int celt_calc_theta(const float *X, const float *Y, int coupling, int N)
+{
+    int i;
+    float e[2] = { 0.0f, 0.0f };
+    if (coupling) { /* Coupling case */
+        for (i = 0; i < N; i++) {
+            e[0] += (X[i] + Y[i])*(X[i] + Y[i]);
+            e[1] += (X[i] - Y[i])*(X[i] - Y[i]);
+        }
+    } else {
+        for (i = 0; i < N; i++) {
+            e[0] += X[i]*X[i];
+            e[1] += Y[i]*Y[i];
+        }
+    }
+    return lrintf(32768.0f*atan2f(sqrtf(e[1]), sqrtf(e[0]))/M_PI);
+}
+
+static void celt_stereo_is_decouple(float *X, float *Y, float e_l, float e_r, int N)
+{
+    int i;
+    const float energy_n = 1.0f/(sqrtf(e_l*e_l + e_r*e_r) + FLT_EPSILON);
+    e_l *= energy_n;
+    e_r *= energy_n;
+    for (i = 0; i < N; i++)
+        X[i] = e_l*X[i] + e_r*Y[i];
+}
+
+static void celt_stereo_ms_decouple(float *X, float *Y, int N)
+{
+    int i;
+    for (i = 0; i < N; i++) {
+        const float Xret = X[i];
+        X[i] = (X[i] + Y[i])*M_SQRT1_2;
+        Y[i] = (Y[i] - Xret)*M_SQRT1_2;
+    }
+}
+
+static av_always_inline uint32_t quant_band_template(CeltPVQ *pvq, CeltFrame *f,
+                                                     OpusRangeCoder *rc,
+                                                     const int band, float *X,
+                                                     float *Y, int N, int b,
+                                                     uint32_t blocks, float *lowband,
+                                                     int duration, float *lowband_out,
+                                                     int level, float gain,
+                                                     float *lowband_scratch,
+                                                     int fill, int quant)
+{
+    int i;
+    const uint8_t *cache;
+    int stereo = !!Y, split = stereo;
+    int imid = 0, iside = 0;
+    uint32_t N0 = N;
+    int N_B = N / blocks;
+    int N_B0 = N_B;
+    int B0 = blocks;
+    int time_divide = 0;
+    int recombine = 0;
+    int inv = 0;
+    float mid = 0, side = 0;
+    int longblocks = (B0 == 1);
+    uint32_t cm = 0;
+
+    if (N == 1) {
+        float *x = X;
+        for (i = 0; i <= stereo; i++) {
+            int sign = 0;
+            if (f->remaining2 >= 1 << 3) {
+                if (quant) {
+                    sign = x[0] < 0;
+                    ff_opus_rc_put_raw(rc, sign, 1);
+                } else {
+                    sign = ff_opus_rc_get_raw(rc, 1);
+                }
+                f->remaining2 -= 1 << 3;
+            }
+            x[0] = 1.0f - 2.0f*sign;
+            x = Y;
+        }
+        if (lowband_out)
+            lowband_out[0] = X[0];
+        return 1;
+    }
+
+    if (!stereo && level == 0) {
+        int tf_change = f->tf_change[band];
+        int k;
+        if (tf_change > 0)
+            recombine = tf_change;
+        /* Band recombining to increase frequency resolution */
+
+        if (lowband &&
+            (recombine || ((N_B & 1) == 0 && tf_change < 0) || B0 > 1)) {
+            for (i = 0; i < N; i++)
+                lowband_scratch[i] = lowband[i];
+            lowband = lowband_scratch;
+        }
+
+        for (k = 0; k < recombine; k++) {
+            if (quant || lowband)
+                celt_haar1(quant ? X : lowband, N >> k, 1 << k);
+            fill = ff_celt_bit_interleave[fill & 0xF] | ff_celt_bit_interleave[fill >> 4] << 2;
+        }
+        blocks >>= recombine;
+        N_B <<= recombine;
+
+        /* Increasing the time resolution */
+        while ((N_B & 1) == 0 && tf_change < 0) {
+            if (quant || lowband)
+                celt_haar1(quant ? X : lowband, N_B, blocks);
+            fill |= fill << blocks;
+            blocks <<= 1;
+            N_B >>= 1;
+            time_divide++;
+            tf_change++;
+        }
+        B0 = blocks;
+        N_B0 = N_B;
+
+        /* Reorganize the samples in time order instead of frequency order */
+        if (B0 > 1 && (quant || lowband))
+            celt_deinterleave_hadamard(pvq->hadamard_tmp, quant ? X : lowband,
+                                       N_B >> recombine, B0 << recombine,
+                                       longblocks);
+    }
+
+    /* If we need 1.5 more bit than we can produce, split the band in two. */
+    cache = ff_celt_cache_bits +
+            ff_celt_cache_index[(duration + 1) * CELT_MAX_BANDS + band];
+    if (!stereo && duration >= 0 && b > cache[cache[0]] + 12 && N > 2) {
+        N >>= 1;
+        Y = X + N;
+        split = 1;
+        duration -= 1;
+        if (blocks == 1)
+            fill = (fill & 1) | (fill << 1);
+        blocks = (blocks + 1) >> 1;
+    }
+
+    if (split) {
+        int qn;
+        int itheta = quant ? celt_calc_theta(X, Y, stereo, N) : 0;
+        int mbits, sbits, delta;
+        int qalloc;
+        int pulse_cap;
+        int offset;
+        int orig_fill;
+        int tell;
+
+        /* Decide on the resolution to give to the split parameter theta */
+        pulse_cap = ff_celt_log_freq_range[band] + duration * 8;
+        offset = (pulse_cap >> 1) - (stereo && N == 2 ? CELT_QTHETA_OFFSET_TWOPHASE :
+                                                          CELT_QTHETA_OFFSET);
+        qn = (stereo && band >= f->intensity_stereo) ? 1 :
+             celt_compute_qn(N, b, offset, pulse_cap, stereo);
+        tell = opus_rc_tell_frac(rc);
+        if (qn != 1) {
+            if (quant)
+                itheta = (itheta*qn + 8192) >> 14;
+            /* Entropy coding of the angle. We use a uniform pdf for the
+             * time split, a step for stereo, and a triangular one for the rest. */
+            if (quant) {
+                if (stereo && N > 2)
+                    ff_opus_rc_enc_uint_step(rc, itheta, qn / 2);
+                else if (stereo || B0 > 1)
+                    ff_opus_rc_enc_uint(rc, itheta, qn + 1);
+                else
+                    ff_opus_rc_enc_uint_tri(rc, itheta, qn);
+                itheta = itheta * 16384 / qn;
+                if (stereo) {
+                    if (itheta == 0)
+                        celt_stereo_is_decouple(X, Y, f->block[0].lin_energy[band],
+                                                f->block[1].lin_energy[band], N);
+                    else
+                        celt_stereo_ms_decouple(X, Y, N);
+                }
+            } else {
+                if (stereo && N > 2)
+                    itheta = ff_opus_rc_dec_uint_step(rc, qn / 2);
+                else if (stereo || B0 > 1)
+                    itheta = ff_opus_rc_dec_uint(rc, qn+1);
+                else
+                    itheta = ff_opus_rc_dec_uint_tri(rc, qn);
+                itheta = itheta * 16384 / qn;
+            }
+        } else if (stereo) {
+            if (quant) {
+                inv = itheta > 8192;
+                 if (inv) {
+                    for (i = 0; i < N; i++)
+                       Y[i] *= -1;
+                 }
+                 celt_stereo_is_decouple(X, Y, f->block[0].lin_energy[band],
+                                         f->block[1].lin_energy[band], N);
+
+                if (b > 2 << 3 && f->remaining2 > 2 << 3) {
+                    ff_opus_rc_enc_log(rc, inv, 2);
+                } else {
+                    inv = 0;
+                }
+            } else {
+                inv = (b > 2 << 3 && f->remaining2 > 2 << 3) ? ff_opus_rc_dec_log(rc, 2) : 0;
+                inv = f->apply_phase_inv ? inv : 0;
+            }
+            itheta = 0;
+        }
+        qalloc = opus_rc_tell_frac(rc) - tell;
+        b -= qalloc;
+
+        orig_fill = fill;
+        if (itheta == 0) {
+            imid = 32767;
+            iside = 0;
+            fill = av_mod_uintp2(fill, blocks);
+            delta = -16384;
+        } else if (itheta == 16384) {
+            imid = 0;
+            iside = 32767;
+            fill &= ((1 << blocks) - 1) << blocks;
+            delta = 16384;
+        } else {
+            imid = celt_cos(itheta);
+            iside = celt_cos(16384-itheta);
+            /* This is the mid vs side allocation that minimizes squared error
+            in that band. */
+            delta = ROUND_MUL16((N - 1) << 7, celt_log2tan(iside, imid));
+        }
+
+        mid  = imid  / 32768.0f;
+        side = iside / 32768.0f;
+
+        /* This is a special case for N=2 that only works for stereo and takes
+        advantage of the fact that mid and side are orthogonal to encode
+        the side with just one bit. */
+        if (N == 2 && stereo) {
+            int c;
+            int sign = 0;
+            float tmp;
+            float *x2, *y2;
+            mbits = b;
+            /* Only need one bit for the side */
+            sbits = (itheta != 0 && itheta != 16384) ? 1 << 3 : 0;
+            mbits -= sbits;
+            c = (itheta > 8192);
+            f->remaining2 -= qalloc+sbits;
+
+            x2 = c ? Y : X;
+            y2 = c ? X : Y;
+            if (sbits) {
+                if (quant) {
+                    sign = x2[0]*y2[1] - x2[1]*y2[0] < 0;
+                    ff_opus_rc_put_raw(rc, sign, 1);
+                } else {
+                    sign = ff_opus_rc_get_raw(rc, 1);
+                }
+            }
+            sign = 1 - 2 * sign;
+            /* We use orig_fill here because we want to fold the side, but if
+            itheta==16384, we'll have cleared the low bits of fill. */
+            cm = pvq->quant_band(pvq, f, rc, band, x2, NULL, N, mbits, blocks, lowband, duration,
+                                 lowband_out, level, gain, lowband_scratch, orig_fill);
+            /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
+            and there's no need to worry about mixing with the other channel. */
+            y2[0] = -sign * x2[1];
+            y2[1] =  sign * x2[0];
+            X[0] *= mid;
+            X[1] *= mid;
+            Y[0] *= side;
+            Y[1] *= side;
+            tmp = X[0];
+            X[0] = tmp - Y[0];
+            Y[0] = tmp + Y[0];
+            tmp = X[1];
+            X[1] = tmp - Y[1];
+            Y[1] = tmp + Y[1];
+        } else {
+            /* "Normal" split code */
+            float *next_lowband2     = NULL;
+            float *next_lowband_out1 = NULL;
+            int next_level = 0;
+            int rebalance;
+            uint32_t cmt;
+
+            /* Give more bits to low-energy MDCTs than they would
+             * otherwise deserve */
+            if (B0 > 1 && !stereo && (itheta & 0x3fff)) {
+                if (itheta > 8192)
+                    /* Rough approximation for pre-echo masking */
+                    delta -= delta >> (4 - duration);
+                else
+                    /* Corresponds to a forward-masking slope of
+                     * 1.5 dB per 10 ms */
+                    delta = FFMIN(0, delta + (N << 3 >> (5 - duration)));
+            }
+            mbits = av_clip((b - delta) / 2, 0, b);
+            sbits = b - mbits;
+            f->remaining2 -= qalloc;
+
+            if (lowband && !stereo)
+                next_lowband2 = lowband + N; /* >32-bit split case */
+
+            /* Only stereo needs to pass on lowband_out.
+             * Otherwise, it's handled at the end */
+            if (stereo)
+                next_lowband_out1 = lowband_out;
+            else
+                next_level = level + 1;
+
+            rebalance = f->remaining2;
+            if (mbits >= sbits) {
+                /* In stereo mode, we do not apply a scaling to the mid
+                 * because we need the normalized mid for folding later */
+                cm = pvq->quant_band(pvq, f, rc, band, X, NULL, N, mbits, blocks,
+                                     lowband, duration, next_lowband_out1, next_level,
+                                     stereo ? 1.0f : (gain * mid), lowband_scratch, fill);
+                rebalance = mbits - (rebalance - f->remaining2);
+                if (rebalance > 3 << 3 && itheta != 0)
+                    sbits += rebalance - (3 << 3);
+
+                /* For a stereo split, the high bits of fill are always zero,
+                 * so no folding will be done to the side. */
+                cmt = pvq->quant_band(pvq, f, rc, band, Y, NULL, N, sbits, blocks,
+                                      next_lowband2, duration, NULL, next_level,
+                                      gain * side, NULL, fill >> blocks);
+                cm |= cmt << ((B0 >> 1) & (stereo - 1));
+            } else {
+                /* For a stereo split, the high bits of fill are always zero,
+                 * so no folding will be done to the side. */
+                cm = pvq->quant_band(pvq, f, rc, band, Y, NULL, N, sbits, blocks,
+                                     next_lowband2, duration, NULL, next_level,
+                                     gain * side, NULL, fill >> blocks);
+                cm <<= ((B0 >> 1) & (stereo - 1));
+                rebalance = sbits - (rebalance - f->remaining2);
+                if (rebalance > 3 << 3 && itheta != 16384)
+                    mbits += rebalance - (3 << 3);
+
+                /* In stereo mode, we do not apply a scaling to the mid because
+                 * we need the normalized mid for folding later */
+                cm |= pvq->quant_band(pvq, f, rc, band, X, NULL, N, mbits, blocks,
+                                      lowband, duration, next_lowband_out1, next_level,
+                                      stereo ? 1.0f : (gain * mid), lowband_scratch, fill);
+            }
+        }
+    } else {
+        /* This is the basic no-split case */
+        uint32_t q         = celt_bits2pulses(cache, b);
+        uint32_t curr_bits = celt_pulses2bits(cache, q);
+        f->remaining2 -= curr_bits;
+
+        /* Ensures we can never bust the budget */
+        while (f->remaining2 < 0 && q > 0) {
+            f->remaining2 += curr_bits;
+            curr_bits      = celt_pulses2bits(cache, --q);
+            f->remaining2 -= curr_bits;
+        }
+
+        if (q != 0) {
+            /* Finally do the actual (de)quantization */
+            if (quant) {
+                cm = celt_alg_quant(rc, X, N, (q < 8) ? q : (8 + (q & 7)) << ((q >> 3) - 1),
+                                    f->spread, blocks, gain, pvq);
+            } else {
+                cm = celt_alg_unquant(rc, X, N, (q < 8) ? q : (8 + (q & 7)) << ((q >> 3) - 1),
+                                      f->spread, blocks, gain, pvq);
+            }
+        } else {
+            /* If there's no pulse, fill the band anyway */
+            uint32_t cm_mask = (1 << blocks) - 1;
+            fill &= cm_mask;
+            if (fill) {
+                if (!lowband) {
+                    /* Noise */
+                    for (i = 0; i < N; i++)
+                        X[i] = (((int32_t)celt_rng(f)) >> 20);
+                    cm = cm_mask;
+                } else {
+                    /* Folded spectrum */
+                    for (i = 0; i < N; i++) {
+                        /* About 48 dB below the "normal" folding level */
+                        X[i] = lowband[i] + (((celt_rng(f)) & 0x8000) ? 1.0f / 256 : -1.0f / 256);
+                    }
+                    cm = fill;
+                }
+                celt_renormalize_vector(X, N, gain);
+            } else {
+                memset(X, 0, N*sizeof(float));
+            }
+        }
+    }
+
+    /* This code is used by the decoder and by the resynthesis-enabled encoder */
+    if (stereo) {
+        if (N > 2)
+            celt_stereo_merge(X, Y, mid, N);
+        if (inv) {
+            for (i = 0; i < N; i++)
+                Y[i] *= -1;
+        }
+    } else if (level == 0) {
+        int k;
+
+        /* Undo the sample reorganization going from time order to frequency order */
+        if (B0 > 1)
+            celt_interleave_hadamard(pvq->hadamard_tmp, X, N_B >> recombine,
+                                     B0 << recombine, longblocks);
+
+        /* Undo time-freq changes that we did earlier */
+        N_B = N_B0;
+        blocks = B0;
+        for (k = 0; k < time_divide; k++) {
+            blocks >>= 1;
+            N_B <<= 1;
+            cm |= cm >> blocks;
+            celt_haar1(X, N_B, blocks);
+        }
+
+        for (k = 0; k < recombine; k++) {
+            cm = ff_celt_bit_deinterleave[cm];
+            celt_haar1(X, N0>>k, 1<<k);
+        }
+        blocks <<= recombine;
+
+        /* Scale output for later folding */
+        if (lowband_out) {
+            float n = sqrtf(N0);
+            for (i = 0; i < N0; i++)
+                lowband_out[i] = n * X[i];
+        }
+        cm = av_mod_uintp2(cm, blocks);
+    }
+
+    return cm;
+}
+
+static QUANT_FN(pvq_decode_band)
+{
+#if CONFIG_OPUS_DECODER
+    return quant_band_template(pvq, f, rc, band, X, Y, N, b, blocks, lowband, duration,
+                               lowband_out, level, gain, lowband_scratch, fill, 0);
+#else
+    return 0;
+#endif
+}
+
+static QUANT_FN(pvq_encode_band)
+{
+#if CONFIG_OPUS_ENCODER
+    return quant_band_template(pvq, f, rc, band, X, Y, N, b, blocks, lowband, duration,
+                               lowband_out, level, gain, lowband_scratch, fill, 1);
+#else
+    return 0;
+#endif
+}
+
+int av_cold ff_celt_pvq_init(CeltPVQ **pvq, int encode)
+{
+    CeltPVQ *s = av_malloc(sizeof(CeltPVQ));
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    s->pvq_search = ppp_pvq_search_c;
+    s->quant_band = encode ? pvq_encode_band : pvq_decode_band;
+
+    if (ARCH_X86)
+        ff_opus_dsp_init_x86(s);
+
+    *pvq = s;
+
+    return 0;
+}
+
+void av_cold ff_celt_pvq_uninit(CeltPVQ **pvq)
+{
+    av_freep(pvq);
+}
diff --git a/libavcodec/opus_pvq.h b/libavcodec/opus_pvq.h
new file mode 100644
index 0000000..e2f01a0
--- /dev/null
+++ b/libavcodec/opus_pvq.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUS_PVQ_H
+#define AVCODEC_OPUS_PVQ_H
+
+#include "opus_celt.h"
+
+#define QUANT_FN(name) uint32_t (name)(struct CeltPVQ *pvq, CeltFrame *f,            \
+                                       OpusRangeCoder *rc, const int band, float *X, \
+                                       float *Y, int N, int b, uint32_t blocks,      \
+                                       float *lowband, int duration,                 \
+                                       float *lowband_out, int level, float gain,    \
+                                       float *lowband_scratch, int fill)
+
+struct CeltPVQ {
+    DECLARE_ALIGNED(32, int,   qcoeff      )[256];
+    DECLARE_ALIGNED(32, float, hadamard_tmp)[256];
+
+    float (*pvq_search)(float *X, int *y, int K, int N);
+    QUANT_FN(*quant_band);
+};
+
+void ff_opus_dsp_init_x86(struct CeltPVQ *s);
+
+int  ff_celt_pvq_init(struct CeltPVQ **pvq, int encode);
+void ff_celt_pvq_uninit(struct CeltPVQ **pvq);
+
+#endif /* AVCODEC_OPUS_PVQ_H */
diff --git a/libavcodec/opus_rc.c b/libavcodec/opus_rc.c
new file mode 100644
index 0000000..c432eb9
--- /dev/null
+++ b/libavcodec/opus_rc.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "opus_rc.h"
+
+#define OPUS_RC_BITS 32
+#define OPUS_RC_SYM  8
+#define OPUS_RC_CEIL ((1 << OPUS_RC_SYM) - 1)
+#define OPUS_RC_TOP (1u << 31)
+#define OPUS_RC_BOT (OPUS_RC_TOP >> OPUS_RC_SYM)
+#define OPUS_RC_SHIFT (OPUS_RC_BITS - OPUS_RC_SYM - 1)
+
+static av_always_inline void opus_rc_enc_carryout(OpusRangeCoder *rc, int cbuf)
+{
+    const int cb = cbuf >> OPUS_RC_SYM, mb = (OPUS_RC_CEIL + cb) & OPUS_RC_CEIL;
+    if (cbuf == OPUS_RC_CEIL) {
+        rc->ext++;
+        return;
+    }
+    rc->rng_cur[0] = rc->rem + cb;
+    rc->rng_cur += (rc->rem >= 0);
+    for (; rc->ext > 0; rc->ext--)
+        *rc->rng_cur++ = mb;
+    av_assert0(rc->rng_cur < rc->rb.position);
+    rc->rem = cbuf & OPUS_RC_CEIL; /* Propagate */
+}
+
+static av_always_inline void opus_rc_dec_normalize(OpusRangeCoder *rc)
+{
+    while (rc->range <= OPUS_RC_BOT) {
+        rc->value = ((rc->value << OPUS_RC_SYM) | (get_bits(&rc->gb, OPUS_RC_SYM) ^ OPUS_RC_CEIL)) & (OPUS_RC_TOP - 1);
+        rc->range     <<= OPUS_RC_SYM;
+        rc->total_bits += OPUS_RC_SYM;
+    }
+}
+
+static av_always_inline void opus_rc_enc_normalize(OpusRangeCoder *rc)
+{
+    while (rc->range <= OPUS_RC_BOT) {
+        opus_rc_enc_carryout(rc, rc->value >> OPUS_RC_SHIFT);
+        rc->value = (rc->value << OPUS_RC_SYM) & (OPUS_RC_TOP - 1);
+        rc->range     <<= OPUS_RC_SYM;
+        rc->total_bits += OPUS_RC_SYM;
+    }
+}
+
+static av_always_inline void opus_rc_dec_update(OpusRangeCoder *rc, uint32_t scale,
+                                                uint32_t low, uint32_t high,
+                                                uint32_t total)
+{
+    rc->value -= scale * (total - high);
+    rc->range  = low ? scale * (high - low)
+                      : rc->range - scale * (total - high);
+    opus_rc_dec_normalize(rc);
+}
+
+/* Main encoding function, this needs to go fast */
+static av_always_inline void opus_rc_enc_update(OpusRangeCoder *rc, uint32_t b, uint32_t p,
+                                                uint32_t p_tot, const int ptwo)
+{
+    uint32_t rscaled, cnd = !!b;
+    if (ptwo) /* Whole function is inlined so hopefully branch is optimized out */
+        rscaled = rc->range >> ff_log2(p_tot);
+    else
+        rscaled = rc->range/p_tot;
+    rc->value +=    cnd*(rc->range - rscaled*(p_tot - b));
+    rc->range  = (!cnd)*(rc->range - rscaled*(p_tot - p)) + cnd*rscaled*(p - b);
+    opus_rc_enc_normalize(rc);
+}
+
+uint32_t ff_opus_rc_dec_cdf(OpusRangeCoder *rc, const uint16_t *cdf)
+{
+    unsigned int k, scale, total, symbol, low, high;
+
+    total = *cdf++;
+
+    scale   = rc->range / total;
+    symbol = rc->value / scale + 1;
+    symbol = total - FFMIN(symbol, total);
+
+    for (k = 0; cdf[k] <= symbol; k++);
+    high = cdf[k];
+    low  = k ? cdf[k-1] : 0;
+
+    opus_rc_dec_update(rc, scale, low, high, total);
+
+    return k;
+}
+
+void ff_opus_rc_enc_cdf(OpusRangeCoder *rc, int val, const uint16_t *cdf)
+{
+    opus_rc_enc_update(rc, (!!val)*cdf[val], cdf[val + 1], cdf[0], 1);
+}
+
+uint32_t ff_opus_rc_dec_log(OpusRangeCoder *rc, uint32_t bits)
+{
+    uint32_t k, scale;
+    scale = rc->range >> bits; // in this case, scale = symbol
+
+    if (rc->value >= scale) {
+        rc->value -= scale;
+        rc->range -= scale;
+        k = 0;
+    } else {
+        rc->range = scale;
+        k = 1;
+    }
+    opus_rc_dec_normalize(rc);
+    return k;
+}
+
+void ff_opus_rc_enc_log(OpusRangeCoder *rc, int val, uint32_t bits)
+{
+    bits = (1 << bits) - 1;
+    opus_rc_enc_update(rc, (!!val)*bits, bits + !!val, bits + 1, 1);
+}
+
+/**
+ * CELT: read 1-25 raw bits at the end of the frame, backwards byte-wise
+ */
+uint32_t ff_opus_rc_get_raw(OpusRangeCoder *rc, uint32_t count)
+{
+    uint32_t value = 0;
+
+    while (rc->rb.bytes && rc->rb.cachelen < count) {
+        rc->rb.cacheval |= *--rc->rb.position << rc->rb.cachelen;
+        rc->rb.cachelen += 8;
+        rc->rb.bytes--;
+    }
+
+    value = av_mod_uintp2(rc->rb.cacheval, count);
+    rc->rb.cacheval    >>= count;
+    rc->rb.cachelen     -= count;
+    rc->total_bits      += count;
+
+    return value;
+}
+
+/**
+ * CELT: write 0 - 31 bits to the rawbits buffer
+ */
+void ff_opus_rc_put_raw(OpusRangeCoder *rc, uint32_t val, uint32_t count)
+{
+    const int to_write = FFMIN(32 - rc->rb.cachelen, count);
+
+    rc->total_bits += count;
+    rc->rb.cacheval |= av_mod_uintp2(val, to_write) << rc->rb.cachelen;
+    rc->rb.cachelen = (rc->rb.cachelen + to_write) % 32;
+
+    if (!rc->rb.cachelen && count) {
+        AV_WB32((uint8_t *)rc->rb.position, rc->rb.cacheval);
+        rc->rb.bytes    += 4;
+        rc->rb.position -= 4;
+        rc->rb.cachelen = count - to_write;
+        rc->rb.cacheval = av_mod_uintp2(val >> to_write, rc->rb.cachelen);
+        av_assert0(rc->rng_cur < rc->rb.position);
+    }
+}
+
+/**
+ * CELT: read a uniform distribution
+ */
+uint32_t ff_opus_rc_dec_uint(OpusRangeCoder *rc, uint32_t size)
+{
+    uint32_t bits, k, scale, total;
+
+    bits  = opus_ilog(size - 1);
+    total = (bits > 8) ? ((size - 1) >> (bits - 8)) + 1 : size;
+
+    scale  = rc->range / total;
+    k      = rc->value / scale + 1;
+    k      = total - FFMIN(k, total);
+    opus_rc_dec_update(rc, scale, k, k + 1, total);
+
+    if (bits > 8) {
+        k = k << (bits - 8) | ff_opus_rc_get_raw(rc, bits - 8);
+        return FFMIN(k, size - 1);
+    } else
+        return k;
+}
+
+/**
+ * CELT: write a uniformly distributed integer
+ */
+void ff_opus_rc_enc_uint(OpusRangeCoder *rc, uint32_t val, uint32_t size)
+{
+    const int ps = FFMAX(opus_ilog(size - 1) - 8, 0);
+    opus_rc_enc_update(rc, val >> ps, (val >> ps) + 1, ((size - 1) >> ps) + 1, 0);
+    ff_opus_rc_put_raw(rc, val, ps);
+}
+
+uint32_t ff_opus_rc_dec_uint_step(OpusRangeCoder *rc, int k0)
+{
+    /* Use a probability of 3 up to itheta=8192 and then use 1 after */
+    uint32_t k, scale, symbol, total = (k0+1)*3 + k0;
+    scale  = rc->range / total;
+    symbol = rc->value / scale + 1;
+    symbol = total - FFMIN(symbol, total);
+
+    k = (symbol < (k0+1)*3) ? symbol/3 : symbol - (k0+1)*2;
+
+    opus_rc_dec_update(rc, scale, (k <= k0) ? 3*(k+0) : (k-1-k0) + 3*(k0+1),
+                       (k <= k0) ? 3*(k+1) : (k-0-k0) + 3*(k0+1), total);
+    return k;
+}
+
+void ff_opus_rc_enc_uint_step(OpusRangeCoder *rc, uint32_t val, int k0)
+{
+    const uint32_t a = val <= k0, b = 2*a + 1;
+    k0 = (k0 + 1) << 1;
+    val = b*(val + k0) - 3*a*k0;
+    opus_rc_enc_update(rc, val, val + b, (k0 << 1) - 1, 0);
+}
+
+uint32_t ff_opus_rc_dec_uint_tri(OpusRangeCoder *rc, int qn)
+{
+    uint32_t k, scale, symbol, total, low, center;
+
+    total = ((qn>>1) + 1) * ((qn>>1) + 1);
+    scale   = rc->range / total;
+    center = rc->value / scale + 1;
+    center = total - FFMIN(center, total);
+
+    if (center < total >> 1) {
+        k      = (ff_sqrt(8 * center + 1) - 1) >> 1;
+        low    = k * (k + 1) >> 1;
+        symbol = k + 1;
+    } else {
+        k      = (2*(qn + 1) - ff_sqrt(8*(total - center - 1) + 1)) >> 1;
+        low    = total - ((qn + 1 - k) * (qn + 2 - k) >> 1);
+        symbol = qn + 1 - k;
+    }
+
+    opus_rc_dec_update(rc, scale, low, low + symbol, total);
+
+    return k;
+}
+
+void ff_opus_rc_enc_uint_tri(OpusRangeCoder *rc, uint32_t k, int qn)
+{
+    uint32_t symbol, low, total;
+
+    total = ((qn>>1) + 1) * ((qn>>1) + 1);
+
+    if (k <= qn >> 1) {
+        low    = k * (k + 1) >> 1;
+        symbol = k + 1;
+    } else {
+        low    = total - ((qn + 1 - k) * (qn + 2 - k) >> 1);
+        symbol = qn + 1 - k;
+    }
+
+    opus_rc_enc_update(rc, low, low + symbol, total, 0);
+}
+
+int ff_opus_rc_dec_laplace(OpusRangeCoder *rc, uint32_t symbol, int decay)
+{
+    /* extends the range coder to model a Laplace distribution */
+    int value = 0;
+    uint32_t scale, low = 0, center;
+
+    scale  = rc->range >> 15;
+    center = rc->value / scale + 1;
+    center = (1 << 15) - FFMIN(center, 1 << 15);
+
+    if (center >= symbol) {
+        value++;
+        low = symbol;
+        symbol = 1 + ((32768 - 32 - symbol) * (16384-decay) >> 15);
+
+        while (symbol > 1 && center >= low + 2 * symbol) {
+            value++;
+            symbol *= 2;
+            low    += symbol;
+            symbol  = (((symbol - 2) * decay) >> 15) + 1;
+        }
+
+        if (symbol <= 1) {
+            int distance = (center - low) >> 1;
+            value += distance;
+            low   += 2 * distance;
+        }
+
+        if (center < low + symbol)
+            value *= -1;
+        else
+            low += symbol;
+    }
+
+    opus_rc_dec_update(rc, scale, low, FFMIN(low + symbol, 32768), 32768);
+
+    return value;
+}
+
+void ff_opus_rc_enc_laplace(OpusRangeCoder *rc, int *value, uint32_t symbol, int decay)
+{
+    uint32_t low = symbol;
+    int i = 1, val = FFABS(*value), pos = *value > 0;
+    if (!val) {
+        opus_rc_enc_update(rc, 0, symbol, 1 << 15, 1);
+        return;
+    }
+    symbol = ((32768 - 32 - symbol)*(16384 - decay)) >> 15;
+    for (; i < val && symbol; i++) {
+        low   += (symbol << 1) + 2;
+        symbol = (symbol*decay) >> 14;
+    }
+    if (symbol) {
+        low += (++symbol)*pos;
+    } else {
+        const int distance = FFMIN(val - i, (((32768 - low) - !pos) >> 1) - 1);
+        low   += pos + (distance << 1);
+        symbol = FFMIN(1, 32768 - low);
+        *value = FFSIGN(*value)*(distance + i);
+    }
+    opus_rc_enc_update(rc, low, low + symbol, 1 << 15, 1);
+}
+
+int ff_opus_rc_dec_init(OpusRangeCoder *rc, const uint8_t *data, int size)
+{
+    int ret = init_get_bits8(&rc->gb, data, size);
+    if (ret < 0)
+        return ret;
+
+    rc->range = 128;
+    rc->value = 127 - get_bits(&rc->gb, 7);
+    rc->total_bits = 9;
+    opus_rc_dec_normalize(rc);
+
+    return 0;
+}
+
+void ff_opus_rc_dec_raw_init(OpusRangeCoder *rc, const uint8_t *rightend, uint32_t bytes)
+{
+    rc->rb.position = rightend;
+    rc->rb.bytes    = bytes;
+    rc->rb.cachelen = 0;
+    rc->rb.cacheval = 0;
+}
+
+void ff_opus_rc_enc_end(OpusRangeCoder *rc, uint8_t *dst, int size)
+{
+    int rng_bytes, bits = OPUS_RC_BITS - opus_ilog(rc->range);
+    uint32_t mask = (OPUS_RC_TOP - 1) >> bits;
+    uint32_t end = (rc->value + mask) & ~mask;
+
+    if ((end | mask) >= rc->value + rc->range) {
+        bits++;
+        mask >>= 1;
+        end = (rc->value + mask) & ~mask;
+    }
+
+    /* Finish what's left */
+    while (bits > 0) {
+        opus_rc_enc_carryout(rc, end >> OPUS_RC_SHIFT);
+        end = (end << OPUS_RC_SYM) & (OPUS_RC_TOP - 1);
+        bits -= OPUS_RC_SYM;
+    }
+
+    /* Flush out anything left or marked */
+    if (rc->rem >= 0 || rc->ext > 0)
+        opus_rc_enc_carryout(rc, 0);
+
+    rng_bytes = rc->rng_cur - rc->buf;
+    memcpy(dst, rc->buf, rng_bytes);
+
+    rc->waste = size*8 - (rc->rb.bytes*8 + rc->rb.cachelen) - rng_bytes*8;
+
+    /* Put the rawbits part, if any */
+    if (rc->rb.bytes || rc->rb.cachelen) {
+        int i, lap;
+        uint8_t *rb_src, *rb_dst;
+        ff_opus_rc_put_raw(rc, 0, 32 - rc->rb.cachelen);
+        rb_src = rc->buf + OPUS_MAX_PACKET_SIZE + 12 - rc->rb.bytes;
+        rb_dst = dst + FFMAX(size - rc->rb.bytes, 0);
+        lap = &dst[rng_bytes] - rb_dst;
+        for (i = 0; i < lap; i++)
+            rb_dst[i] |= rb_src[i];
+        memcpy(&rb_dst[lap], &rb_src[lap], FFMAX(rc->rb.bytes - lap, 0));
+    }
+}
+
+void ff_opus_rc_enc_init(OpusRangeCoder *rc)
+{
+    rc->value = 0;
+    rc->range = OPUS_RC_TOP;
+    rc->total_bits = OPUS_RC_BITS + 1;
+    rc->rem = -1;
+    rc->ext =  0;
+    rc->rng_cur = rc->buf;
+    ff_opus_rc_dec_raw_init(rc, rc->buf + OPUS_MAX_PACKET_SIZE + 8, 0);
+}
diff --git a/libavcodec/opus_rc.h b/libavcodec/opus_rc.h
new file mode 100644
index 0000000..627f832
--- /dev/null
+++ b/libavcodec/opus_rc.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUS_RC_H
+#define AVCODEC_OPUS_RC_H
+
+#include <stdint.h>
+#include "get_bits.h"
+
+#define OPUS_MAX_PACKET_SIZE 1275
+
+#define opus_ilog(i) (av_log2(i) + !!(i))
+
+typedef struct RawBitsContext {
+    const uint8_t *position;
+    uint32_t bytes;
+    uint32_t cachelen;
+    uint32_t cacheval;
+} RawBitsContext;
+
+typedef struct OpusRangeCoder {
+    GetBitContext gb;
+    RawBitsContext rb;
+    uint32_t range;
+    uint32_t value;
+    uint32_t total_bits;
+
+    /* Encoder */
+    uint8_t buf[OPUS_MAX_PACKET_SIZE + 12]; /* memcpy vs (memmove + overreading) */
+    uint8_t *rng_cur;                      /* Current range coded byte */
+    int ext;                               /* Awaiting propagation */
+    int rem;                               /* Carryout flag */
+
+    /* Encoding stats */
+    int waste;
+} OpusRangeCoder;
+
+/**
+ * CELT: estimate bits of entropy that have thus far been consumed for the
+ *       current CELT frame, to integer and fractional (1/8th bit) precision
+ */
+static av_always_inline uint32_t opus_rc_tell(const OpusRangeCoder *rc)
+{
+    return rc->total_bits - av_log2(rc->range) - 1;
+}
+
+static av_always_inline uint32_t opus_rc_tell_frac(const OpusRangeCoder *rc)
+{
+    uint32_t i, total_bits, rcbuffer, range;
+
+    total_bits = rc->total_bits << 3;
+    rcbuffer   = av_log2(rc->range) + 1;
+    range      = rc->range >> (rcbuffer-16);
+
+    for (i = 0; i < 3; i++) {
+        int bit;
+        range = range * range >> 15;
+        bit = range >> 16;
+        rcbuffer = rcbuffer << 1 | bit;
+        range >>= bit;
+    }
+
+    return total_bits - rcbuffer;
+}
+
+uint32_t ff_opus_rc_dec_cdf(OpusRangeCoder *rc, const uint16_t *cdf);
+void     ff_opus_rc_enc_cdf(OpusRangeCoder *rc, int val, const uint16_t *cdf);
+
+uint32_t ff_opus_rc_dec_log(OpusRangeCoder *rc, uint32_t bits);
+void     ff_opus_rc_enc_log(OpusRangeCoder *rc, int val, uint32_t bits);
+
+uint32_t ff_opus_rc_dec_uint_step(OpusRangeCoder *rc, int k0);
+void     ff_opus_rc_enc_uint_step(OpusRangeCoder *rc, uint32_t val, int k0);
+
+uint32_t ff_opus_rc_dec_uint_tri(OpusRangeCoder *rc, int qn);
+void     ff_opus_rc_enc_uint_tri(OpusRangeCoder *rc, uint32_t k, int qn);
+
+uint32_t ff_opus_rc_dec_uint(OpusRangeCoder *rc, uint32_t size);
+void     ff_opus_rc_enc_uint(OpusRangeCoder *rc, uint32_t val, uint32_t size);
+
+uint32_t ff_opus_rc_get_raw(OpusRangeCoder *rc, uint32_t count);
+void     ff_opus_rc_put_raw(OpusRangeCoder *rc, uint32_t val, uint32_t count);
+
+int      ff_opus_rc_dec_laplace(OpusRangeCoder *rc, uint32_t symbol, int decay);
+void     ff_opus_rc_enc_laplace(OpusRangeCoder *rc, int *value, uint32_t symbol, int decay);
+
+int      ff_opus_rc_dec_init(OpusRangeCoder *rc, const uint8_t *data, int size);
+void     ff_opus_rc_dec_raw_init(OpusRangeCoder *rc, const uint8_t *rightend, uint32_t bytes);
+
+void     ff_opus_rc_enc_end(OpusRangeCoder *rc, uint8_t *dst, int size);
+void     ff_opus_rc_enc_init(OpusRangeCoder *rc);
+
+#define OPUS_RC_CHECKPOINT_UPDATE(rc) \
+    rc_rollback_bits = opus_rc_tell_frac(rc); \
+    rc_rollback_ctx  = *rc
+
+#define OPUS_RC_CHECKPOINT_SPAWN(rc) \
+    uint32_t rc_rollback_bits = opus_rc_tell_frac(rc); \
+    OpusRangeCoder rc_rollback_ctx = *rc \
+
+#define OPUS_RC_CHECKPOINT_BITS(rc) \
+    (opus_rc_tell_frac(rc) - rc_rollback_bits)
+
+#define OPUS_RC_CHECKPOINT_ROLLBACK(rc) \
+    memcpy(rc, &rc_rollback_ctx, sizeof(OpusRangeCoder)); \
+
+#endif /* AVCODEC_OPUS_RC_H */
diff --git a/libavcodec/opus_silk.c b/libavcodec/opus_silk.c
index e5d1a99..2fcbf3b 100644
--- a/libavcodec/opus_silk.c
+++ b/libavcodec/opus_silk.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <stdint.h>
 
 #include "opus.h"
+#include "opustab.h"
 
 typedef struct SilkFrame {
     int coded;
@@ -61,730 +62,6 @@ struct SilkContext {
     int prev_coded_channels;
 };
 
-static const uint16_t silk_model_stereo_s1[] = {
-    256,   7,   9,  10,  11,  12,  22,  46,  54,  55,  56,  59,  82, 174, 197, 200,
-    201, 202, 210, 234, 244, 245, 246, 247, 249, 256
-};
-
-static const uint16_t silk_model_stereo_s2[] = {256, 85, 171, 256};
-
-static const uint16_t silk_model_stereo_s3[] = {256, 51, 102, 154, 205, 256};
-
-static const uint16_t silk_model_mid_only[] = {256, 192, 256};
-
-static const uint16_t silk_model_frame_type_inactive[] = {256, 26, 256};
-
-static const uint16_t silk_model_frame_type_active[] = {256, 24, 98, 246, 256};
-
-static const uint16_t silk_model_gain_highbits[3][9] = {
-    {256,  32, 144, 212, 241, 253, 254, 255, 256},
-    {256,   2,  19,  64, 124, 186, 233, 252, 256},
-    {256,   1,   4,  30, 101, 195, 245, 254, 256}
-};
-
-static const uint16_t silk_model_gain_lowbits[] = {256, 32, 64, 96, 128, 160, 192, 224, 256};
-
-static const uint16_t silk_model_gain_delta[] = {
-    256,   6,  11,  22,  53, 185, 206, 214, 218, 221, 223, 225, 227, 228, 229, 230,
-    231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
-    247, 248, 249, 250, 251, 252, 253, 254, 255, 256
-};
-static const uint16_t silk_model_lsf_s1[2][2][33] = {
-    {
-        {    // NB or MB, unvoiced
-            256,  44,  78, 108, 127, 148, 160, 171, 174, 177, 179, 195, 197, 199, 200, 205,
-            207, 208, 211, 214, 215, 216, 218, 220, 222, 225, 226, 235, 244, 246, 253, 255, 256
-        }, { // NB or MB, voiced
-            256,   1,  11,  12,  20,  23,  31,  39,  53,  66,  80,  81,  95, 107, 120, 131,
-            142, 154, 165, 175, 185, 196, 204, 213, 221, 228, 236, 237, 238, 244, 245, 251, 256
-        }
-    }, {
-        {    // WB, unvoiced
-            256,  31,  52,  55,  72,  73,  81,  98, 102, 103, 121, 137, 141, 143, 146, 147,
-            157, 158, 161, 177, 188, 204, 206, 208, 211, 213, 224, 225, 229, 238, 246, 253, 256
-        }, { // WB, voiced
-            256,   1,   5,  21,  26,  44,  55,  60,  74,  89,  90,  93, 105, 118, 132, 146,
-            152, 166, 178, 180, 186, 187, 199, 211, 222, 232, 235, 245, 250, 251, 252, 253, 256
-        }
-    }
-};
-
-static const uint16_t silk_model_lsf_s2[32][10] = {
-    // NB, MB
-    { 256,   1,   2,   3,  18, 242, 253, 254, 255, 256 },
-    { 256,   1,   2,   4,  38, 221, 253, 254, 255, 256 },
-    { 256,   1,   2,   6,  48, 197, 252, 254, 255, 256 },
-    { 256,   1,   2,  10,  62, 185, 246, 254, 255, 256 },
-    { 256,   1,   4,  20,  73, 174, 248, 254, 255, 256 },
-    { 256,   1,   4,  21,  76, 166, 239, 254, 255, 256 },
-    { 256,   1,   8,  32,  85, 159, 226, 252, 255, 256 },
-    { 256,   1,   2,  20,  83, 161, 219, 249, 255, 256 },
-
-    // WB
-    { 256,   1,   2,   3,  12, 244, 253, 254, 255, 256 },
-    { 256,   1,   2,   4,  32, 218, 253, 254, 255, 256 },
-    { 256,   1,   2,   5,  47, 199, 252, 254, 255, 256 },
-    { 256,   1,   2,  12,  61, 187, 252, 254, 255, 256 },
-    { 256,   1,   5,  24,  72, 172, 249, 254, 255, 256 },
-    { 256,   1,   2,  16,  70, 170, 242, 254, 255, 256 },
-    { 256,   1,   2,  17,  78, 165, 226, 251, 255, 256 },
-    { 256,   1,   8,  29,  79, 156, 237, 254, 255, 256 }
-};
-
-static const uint16_t silk_model_lsf_s2_ext[] = { 256, 156, 216, 240, 249, 253, 255, 256 };
-
-static const uint16_t silk_model_lsf_interpolation_offset[] = { 256, 13, 35, 64, 75, 256 };
-
-static const uint16_t silk_model_pitch_highbits[] = {
-    256,   3,   6,  12,  23,  44,  74, 106, 125, 136, 146, 158, 171, 184, 196, 207,
-    216, 224, 231, 237, 241, 243, 245, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256
-};
-
-static const uint16_t silk_model_pitch_lowbits_nb[]= { 256, 64, 128, 192, 256 };
-
-static const uint16_t silk_model_pitch_lowbits_mb[]= { 256, 43, 85, 128, 171, 213, 256 };
-
-static const uint16_t silk_model_pitch_lowbits_wb[]= { 256, 32, 64, 96, 128, 160, 192, 224, 256 };
-
-static const uint16_t silk_model_pitch_delta[] = {
-    256,  46,  48,  50,  53,  57,  63,  73,  88, 114, 152, 182, 204, 219, 229, 236,
-    242, 246, 250, 252, 254, 256
-};
-
-static const uint16_t silk_model_pitch_contour_nb10ms[] = { 256, 143, 193, 256 };
-
-static const uint16_t silk_model_pitch_contour_nb20ms[] = {
-    256,  68,  80, 101, 118, 137, 159, 189, 213, 230, 246, 256
-};
-
-static const uint16_t silk_model_pitch_contour_mbwb10ms[] = {
-    256,  91, 137, 176, 195, 209, 221, 229, 236, 242, 247, 252, 256
-};
-
-static const uint16_t silk_model_pitch_contour_mbwb20ms[] = {
-    256,  33,  55,  73,  89, 104, 118, 132, 145, 158, 168, 177, 186, 194, 200, 206,
-    212, 217, 221, 225, 229, 232, 235, 238, 240, 242, 244, 246, 248, 250, 252, 253,
-    254, 255, 256
-};
-
-static const uint16_t silk_model_ltp_filter[] = { 256, 77, 157, 256 };
-
-static const uint16_t silk_model_ltp_filter0_sel[] = {
-    256, 185, 200, 213, 226, 235, 244, 250, 256
-};
-
-static const uint16_t silk_model_ltp_filter1_sel[] = {
-    256,  57,  91, 112, 132, 147, 160, 172, 185, 195, 205, 214, 224, 233, 241, 248, 256
-};
-
-static const uint16_t silk_model_ltp_filter2_sel[] = {
-    256,  15,  31,  45,  57,  69,  81,  92, 103, 114, 124, 133, 142, 151, 160, 168,
-    176, 184, 192, 199, 206, 212, 218, 223, 227, 232, 236, 240, 244, 247, 251, 254, 256
-};
-
-static const uint16_t silk_model_ltp_scale_index[] = { 256, 128, 192, 256 };
-
-static const uint16_t silk_model_lcg_seed[] = { 256, 64, 128, 192, 256 };
-
-static const uint16_t silk_model_exc_rate[2][10] = {
-    { 256,  15,  66,  78, 124, 169, 182, 215, 242, 256 }, // unvoiced
-    { 256,  33,  63,  99, 116, 150, 199, 217, 238, 256 }  // voiced
-};
-
-static const uint16_t silk_model_pulse_count[11][19] = {
-    { 256, 131, 205, 230, 238, 241, 244, 245, 246,
-      247, 248, 249, 250, 251, 252, 253, 254, 255, 256 },
-    { 256,  58, 151, 211, 234, 241, 244, 245, 246,
-      247, 248, 249, 250, 251, 252, 253, 254, 255, 256 },
-    { 256,  43,  94, 140, 173, 197, 213, 224, 232,
-      238, 241, 244, 247, 249, 250, 251, 253, 254, 256 },
-    { 256,  17,  69, 140, 197, 228, 240, 245, 246,
-      247, 248, 249, 250, 251, 252, 253, 254, 255, 256 },
-    { 256,   6,  27,  68, 121, 170, 205, 226, 237,
-      243, 246, 248, 250, 251, 252, 253, 254, 255, 256 },
-    { 256,   7,  21,  43,  71, 100, 128, 153, 173,
-      190, 203, 214, 223, 230, 235, 239, 243, 246, 256 },
-    { 256,   2,   7,  21,  50,  92, 138, 179, 210,
-      229, 240, 246, 249, 251, 252, 253, 254, 255, 256 },
-    { 256,   1,   3,   7,  17,  36,  65, 100, 137,
-      171, 199, 219, 233, 241, 246, 250, 252, 254, 256 },
-    { 256,   1,   3,   5,  10,  19,  33,  53,  77,
-      104, 132, 158, 181, 201, 216, 227, 235, 241, 256 },
-    { 256,   1,   2,   3,   9,  36,  94, 150, 189,
-      214, 228, 238, 244, 247, 250, 252, 253, 254, 256 },
-    { 256,   2,   3,   9,  36,  94, 150, 189, 214,
-      228, 238, 244, 247, 250, 252, 253, 254, 256, 256 }
-};
-
-static const uint16_t silk_model_pulse_location[4][168] = {
-    {
-        256, 126, 256,
-        256, 56, 198, 256,
-        256, 25, 126, 230, 256,
-        256, 12, 72, 180, 244, 256,
-        256, 7, 42, 126, 213, 250, 256,
-        256, 4, 24, 83, 169, 232, 253, 256,
-        256, 3, 15, 53, 125, 200, 242, 254, 256,
-        256, 2, 10, 35, 89, 162, 221, 248, 255, 256,
-        256, 2, 7, 24, 63, 126, 191, 233, 251, 255, 256,
-        256, 1, 5, 17, 45, 94, 157, 211, 241, 252, 255, 256,
-        256, 1, 5, 13, 33, 70, 125, 182, 223, 245, 253, 255, 256,
-        256, 1, 4, 11, 26, 54, 98, 151, 199, 232, 248, 254, 255, 256,
-        256, 1, 3, 9, 21, 42, 77, 124, 172, 212, 237, 249, 254, 255, 256,
-        256, 1, 2, 6, 16, 33, 60, 97, 144, 187, 220, 241, 250, 254, 255, 256,
-        256, 1, 2, 3, 11, 25, 47, 80, 120, 163, 201, 229, 245, 253, 254, 255, 256,
-        256, 1, 2, 3, 4, 17, 35, 62, 98, 139, 180, 214, 238, 252, 253, 254, 255, 256
-    },{
-        256, 127, 256,
-        256, 53, 202, 256,
-        256, 22, 127, 233, 256,
-        256, 11, 72, 183, 246, 256,
-        256, 6, 41, 127, 215, 251, 256,
-        256, 4, 24, 83, 170, 232, 253, 256,
-        256, 3, 16, 56, 127, 200, 241, 254, 256,
-        256, 3, 12, 39, 92, 162, 218, 246, 255, 256,
-        256, 3, 11, 30, 67, 124, 185, 229, 249, 255, 256,
-        256, 3, 10, 25, 53, 97, 151, 200, 233, 250, 255, 256,
-        256, 1, 8, 21, 43, 77, 123, 171, 209, 237, 251, 255, 256,
-        256, 1, 2, 13, 35, 62, 97, 139, 186, 219, 244, 254, 255, 256,
-        256, 1, 2, 8, 22, 48, 85, 128, 171, 208, 234, 248, 254, 255, 256,
-        256, 1, 2, 6, 16, 36, 67, 107, 149, 189, 220, 240, 250, 254, 255, 256,
-        256, 1, 2, 5, 13, 29, 55, 90, 128, 166, 201, 227, 243, 251, 254, 255, 256,
-        256, 1, 2, 4, 10, 22, 43, 73, 109, 147, 183, 213, 234, 246, 252, 254, 255, 256
-    },{
-        256, 127, 256,
-        256, 49, 206, 256,
-        256, 20, 127, 236, 256,
-        256, 11, 71, 184, 246, 256,
-        256, 7, 43, 127, 214, 250, 256,
-        256, 6, 30, 87, 169, 229, 252, 256,
-        256, 5, 23, 62, 126, 194, 236, 252, 256,
-        256, 6, 20, 49, 96, 157, 209, 239, 253, 256,
-        256, 1, 16, 39, 74, 125, 175, 215, 245, 255, 256,
-        256, 1, 2, 23, 55, 97, 149, 195, 236, 254, 255, 256,
-        256, 1, 7, 23, 50, 86, 128, 170, 206, 233, 249, 255, 256,
-        256, 1, 6, 18, 39, 70, 108, 148, 186, 217, 238, 250, 255, 256,
-        256, 1, 4, 13, 30, 56, 90, 128, 166, 200, 226, 243, 252, 255, 256,
-        256, 1, 4, 11, 25, 47, 76, 110, 146, 180, 209, 231, 245, 252, 255, 256,
-        256, 1, 3, 8, 19, 37, 62, 93, 128, 163, 194, 219, 237, 248, 253, 255, 256,
-        256, 1, 2, 6, 15, 30, 51, 79, 111, 145, 177, 205, 226, 241, 250, 254, 255, 256
-    },{
-        256, 128, 256,
-        256, 42, 214, 256,
-        256, 21, 128, 235, 256,
-        256, 12, 72, 184, 245, 256,
-        256, 8, 42, 128, 214, 249, 256,
-        256, 8, 31, 86, 176, 231, 251, 256,
-        256, 5, 20, 58, 130, 202, 238, 253, 256,
-        256, 6, 18, 45, 97, 174, 221, 241, 251, 256,
-        256, 6, 25, 53, 88, 128, 168, 203, 231, 250, 256,
-        256, 4, 18, 40, 71, 108, 148, 185, 216, 238, 252, 256,
-        256, 3, 13, 31, 57, 90, 128, 166, 199, 225, 243, 253, 256,
-        256, 2, 10, 23, 44, 73, 109, 147, 183, 212, 233, 246, 254, 256,
-        256, 1, 6, 16, 33, 58, 90, 128, 166, 198, 223, 240, 250, 255, 256,
-        256, 1, 5, 12, 25, 46, 75, 110, 146, 181, 210, 231, 244, 251, 255, 256,
-        256, 1, 3, 8, 18, 35, 60, 92, 128, 164, 196, 221, 238, 248, 253, 255, 256,
-        256, 1, 3, 7, 14, 27, 48, 76, 110, 146, 180, 208, 229, 242, 249, 253, 255, 256
-    }
-};
-
-static const uint16_t silk_model_excitation_lsb[] = {256, 136, 256};
-
-static const uint16_t silk_model_excitation_sign[3][2][7][3] = {
-    {    // Inactive
-        {    // Low offset
-            {256,   2, 256},
-            {256, 207, 256},
-            {256, 189, 256},
-            {256, 179, 256},
-            {256, 174, 256},
-            {256, 163, 256},
-            {256, 157, 256}
-        }, { // High offset
-            {256,  58, 256},
-            {256, 245, 256},
-            {256, 238, 256},
-            {256, 232, 256},
-            {256, 225, 256},
-            {256, 220, 256},
-            {256, 211, 256}
-        }
-    }, { // Unvoiced
-        {    // Low offset
-            {256,   1, 256},
-            {256, 210, 256},
-            {256, 190, 256},
-            {256, 178, 256},
-            {256, 169, 256},
-            {256, 162, 256},
-            {256, 152, 256}
-        }, { // High offset
-            {256,  48, 256},
-            {256, 242, 256},
-            {256, 235, 256},
-            {256, 224, 256},
-            {256, 214, 256},
-            {256, 205, 256},
-            {256, 190, 256}
-        }
-    }, { // Voiced
-        {    // Low offset
-            {256,   1, 256},
-            {256, 162, 256},
-            {256, 152, 256},
-            {256, 147, 256},
-            {256, 144, 256},
-            {256, 141, 256},
-            {256, 138, 256}
-        }, { // High offset
-            {256,   8, 256},
-            {256, 203, 256},
-            {256, 187, 256},
-            {256, 176, 256},
-            {256, 168, 256},
-            {256, 161, 256},
-            {256, 154, 256}
-        }
-    }
-};
-
-static const int16_t silk_stereo_weights[] = {
-    -13732, -10050,  -8266,  -7526,  -6500,  -5000,  -2950,   -820,
-       820,   2950,   5000,   6500,   7526,   8266,  10050,  13732
-};
-
-static const uint8_t silk_lsf_s2_model_sel_nbmb[32][10] = {
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1, 3, 1, 2, 2, 1, 2, 1, 1, 1 },
-    { 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-    { 1, 2, 2, 2, 2, 1, 2, 1, 1, 1 },
-    { 2, 3, 3, 3, 3, 2, 2, 2, 2, 2 },
-    { 0, 5, 3, 3, 2, 2, 2, 2, 1, 1 },
-    { 0, 2, 2, 2, 2, 2, 2, 2, 2, 1 },
-    { 2, 3, 6, 4, 4, 4, 5, 4, 5, 5 },
-    { 2, 4, 5, 5, 4, 5, 4, 6, 4, 4 },
-    { 2, 4, 4, 7, 4, 5, 4, 5, 5, 4 },
-    { 4, 3, 3, 3, 2, 3, 2, 2, 2, 2 },
-    { 1, 5, 5, 6, 4, 5, 4, 5, 5, 5 },
-    { 2, 7, 4, 6, 5, 5, 5, 5, 5, 5 },
-    { 2, 7, 5, 5, 5, 5, 5, 6, 5, 4 },
-    { 3, 3, 5, 4, 4, 5, 4, 5, 4, 4 },
-    { 2, 3, 3, 5, 5, 4, 4, 4, 4, 4 },
-    { 2, 4, 4, 6, 4, 5, 4, 5, 5, 5 },
-    { 2, 5, 4, 6, 5, 5, 5, 4, 5, 4 },
-    { 2, 7, 4, 5, 4, 5, 4, 5, 5, 5 },
-    { 2, 5, 4, 6, 7, 6, 5, 6, 5, 4 },
-    { 3, 6, 7, 4, 6, 5, 5, 6, 4, 5 },
-    { 2, 7, 6, 4, 4, 4, 5, 4, 5, 5 },
-    { 4, 5, 5, 4, 6, 6, 5, 6, 5, 4 },
-    { 2, 5, 5, 6, 5, 6, 4, 6, 4, 4 },
-    { 4, 5, 5, 5, 3, 7, 4, 5, 5, 4 },
-    { 2, 3, 4, 5, 5, 6, 4, 5, 5, 4 },
-    { 2, 3, 2, 3, 3, 4, 2, 3, 3, 3 },
-    { 1, 1, 2, 2, 2, 2, 2, 3, 2, 2 },
-    { 4, 5, 5, 6, 6, 6, 5, 6, 4, 5 },
-    { 3, 5, 5, 4, 4, 4, 4, 3, 3, 2 },
-    { 2, 5, 3, 7, 5, 5, 4, 4, 5, 4 },
-    { 4, 4, 5, 4, 5, 6, 5, 6, 5, 4 }
-};
-
-static const uint8_t silk_lsf_s2_model_sel_wb[32][16] = {
-    {  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
-    { 10, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  8, 11 },
-    { 10, 13, 13, 11, 15, 12, 12, 13, 10, 13, 12, 13, 13, 12, 11, 11 },
-    {  8, 10,  9, 10, 10,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  9 },
-    {  8, 14, 13, 12, 14, 12, 15, 13, 12, 12, 12, 13, 13, 12, 12, 11 },
-    {  8, 11, 13, 13, 12, 11, 11, 13, 11, 11, 11, 11, 11, 11, 10, 12 },
-    {  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
-    {  8, 10, 14, 11, 15, 10, 13, 11, 12, 13, 13, 12, 11, 11, 10, 11 },
-    {  8, 14, 10, 14, 14, 12, 13, 12, 14, 13, 12, 12, 13, 11, 11, 11 },
-    { 10,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
-    {  8,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9 },
-    { 10, 10, 11, 12, 13, 11, 11, 11, 11, 11, 11, 11, 10, 10,  9, 11 },
-    { 10, 10, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10,  9, 11 },
-    { 11, 12, 12, 12, 14, 12, 12, 13, 11, 13, 12, 12, 13, 12, 11, 12 },
-    {  8, 14, 12, 13, 12, 15, 13, 10, 14, 13, 15, 12, 12, 11, 13, 11 },
-    {  8,  9,  8,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  9,  8 },
-    {  9, 14, 13, 15, 13, 12, 13, 11, 12, 13, 12, 12, 12, 11, 11, 12 },
-    {  9, 11, 11, 12, 12, 11, 11, 13, 10, 11, 11, 13, 13, 13, 11, 12 },
-    { 10, 11, 11, 10, 10, 10, 11, 10,  9, 10,  9, 10,  9,  9,  9, 12 },
-    {  8, 10, 11, 13, 11, 11, 10, 10, 10,  9,  9,  8,  8,  8,  8,  8 },
-    { 11, 12, 11, 13, 11, 11, 10, 10,  9,  9,  9,  9,  9, 10, 10, 12 },
-    { 10, 14, 11, 15, 15, 12, 13, 12, 13, 11, 13, 11, 11, 10, 11, 11 },
-    { 10, 11, 13, 14, 14, 11, 13, 11, 12, 12, 11, 11, 11, 11, 10, 12 },
-    {  9, 11, 11, 12, 12, 12, 12, 11, 13, 13, 13, 11,  9,  9,  9,  9 },
-    { 10, 13, 11, 14, 14, 12, 15, 12, 12, 13, 11, 12, 12, 11, 11, 11 },
-    {  8, 14,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
-    {  8, 14, 14, 11, 13, 10, 13, 13, 11, 12, 12, 15, 15, 12, 12, 12 },
-    { 11, 11, 15, 11, 13, 12, 11, 11, 11, 10, 10, 11, 11, 11, 10, 11 },
-    {  8,  8,  9,  8,  8,  8, 10,  9, 10,  9,  9, 10, 10, 10,  9,  9 },
-    {  8, 11, 10, 13, 11, 11, 10, 11, 10,  9,  8,  8,  9,  8,  8,  9 },
-    { 11, 13, 13, 12, 15, 13, 11, 11, 10, 11, 10, 10,  9,  8,  9,  8 },
-    { 10, 11, 13, 11, 12, 11, 11, 11, 10,  9, 10, 14, 12,  8,  8,  8 }
-};
-
-static const uint8_t silk_lsf_pred_weights_nbmb[2][9] = {
-    {179, 138, 140, 148, 151, 149, 153, 151, 163},
-    {116,  67,  82,  59,  92,  72, 100,  89,  92}
-};
-
-static const uint8_t silk_lsf_pred_weights_wb[2][15] = {
-    {175, 148, 160, 176, 178, 173, 174, 164, 177, 174, 196, 182, 198, 192, 182},
-    { 68,  62,  66,  60,  72, 117,  85,  90, 118, 136, 151, 142, 160, 142, 155}
-};
-
-static const uint8_t silk_lsf_weight_sel_nbmb[32][9] = {
-    { 0, 1, 0, 0, 0, 0, 0, 0, 0 },
-    { 1, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1, 1, 1, 0, 0, 0, 0, 1, 0 },
-    { 0, 1, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 1, 0, 0, 0, 0, 0, 0, 0 },
-    { 1, 0, 1, 1, 0, 0, 0, 1, 0 },
-    { 0, 1, 1, 0, 0, 1, 1, 0, 0 },
-    { 0, 0, 1, 1, 0, 1, 0, 1, 1 },
-    { 0, 0, 1, 1, 0, 0, 1, 1, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 1, 0, 1, 1, 1, 1, 1, 0 },
-    { 0, 1, 0, 1, 1, 1, 1, 1, 0 },
-    { 0, 1, 1, 1, 1, 1, 1, 1, 0 },
-    { 1, 0, 1, 1, 0, 1, 1, 1, 1 },
-    { 0, 1, 1, 1, 1, 1, 0, 1, 0 },
-    { 0, 0, 1, 1, 0, 1, 0, 1, 0 },
-    { 0, 0, 1, 1, 1, 0, 1, 1, 1 },
-    { 0, 1, 1, 0, 0, 1, 1, 1, 0 },
-    { 0, 0, 0, 1, 1, 1, 0, 1, 0 },
-    { 0, 1, 1, 0, 0, 1, 0, 1, 0 },
-    { 0, 1, 1, 0, 0, 0, 1, 1, 0 },
-    { 0, 0, 0, 0, 0, 1, 1, 1, 1 },
-    { 0, 0, 1, 1, 0, 0, 0, 1, 1 },
-    { 0, 0, 0, 1, 0, 1, 1, 1, 1 },
-    { 0, 1, 1, 1, 1, 1, 1, 1, 0 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 0, 1, 0, 1, 1, 0, 1, 0 },
-    { 1, 0, 0, 1, 0, 0, 0, 0, 0 },
-    { 0, 0, 0, 1, 1, 0, 1, 0, 1 },
-    { 1, 0, 1, 1, 0, 1, 1, 1, 1 }
-};
-
-static const uint8_t silk_lsf_weight_sel_wb[32][15] = {
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },
-    { 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0 },
-    { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0 },
-    { 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1 },
-    { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-    { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0 },
-    { 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0 },
-    { 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0 },
-    { 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 },
-    { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0 },
-    { 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0 },
-    { 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0 },
-    { 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0 },
-    { 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0 },
-    { 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-    { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1 },
-    { 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 },
-    { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0 }
-};
-
-static const uint8_t silk_lsf_codebook_nbmb[32][10] = {
-    { 12,  35,  60,  83, 108, 132, 157, 180, 206, 228 },
-    { 15,  32,  55,  77, 101, 125, 151, 175, 201, 225 },
-    { 19,  42,  66,  89, 114, 137, 162, 184, 209, 230 },
-    { 12,  25,  50,  72,  97, 120, 147, 172, 200, 223 },
-    { 26,  44,  69,  90, 114, 135, 159, 180, 205, 225 },
-    { 13,  22,  53,  80, 106, 130, 156, 180, 205, 228 },
-    { 15,  25,  44,  64,  90, 115, 142, 168, 196, 222 },
-    { 19,  24,  62,  82, 100, 120, 145, 168, 190, 214 },
-    { 22,  31,  50,  79, 103, 120, 151, 170, 203, 227 },
-    { 21,  29,  45,  65, 106, 124, 150, 171, 196, 224 },
-    { 30,  49,  75,  97, 121, 142, 165, 186, 209, 229 },
-    { 19,  25,  52,  70,  93, 116, 143, 166, 192, 219 },
-    { 26,  34,  62,  75,  97, 118, 145, 167, 194, 217 },
-    { 25,  33,  56,  70,  91, 113, 143, 165, 196, 223 },
-    { 21,  34,  51,  72,  97, 117, 145, 171, 196, 222 },
-    { 20,  29,  50,  67,  90, 117, 144, 168, 197, 221 },
-    { 22,  31,  48,  66,  95, 117, 146, 168, 196, 222 },
-    { 24,  33,  51,  77, 116, 134, 158, 180, 200, 224 },
-    { 21,  28,  70,  87, 106, 124, 149, 170, 194, 217 },
-    { 26,  33,  53,  64,  83, 117, 152, 173, 204, 225 },
-    { 27,  34,  65,  95, 108, 129, 155, 174, 210, 225 },
-    { 20,  26,  72,  99, 113, 131, 154, 176, 200, 219 },
-    { 34,  43,  61,  78,  93, 114, 155, 177, 205, 229 },
-    { 23,  29,  54,  97, 124, 138, 163, 179, 209, 229 },
-    { 30,  38,  56,  89, 118, 129, 158, 178, 200, 231 },
-    { 21,  29,  49,  63,  85, 111, 142, 163, 193, 222 },
-    { 27,  48,  77, 103, 133, 158, 179, 196, 215, 232 },
-    { 29,  47,  74,  99, 124, 151, 176, 198, 220, 237 },
-    { 33,  42,  61,  76,  93, 121, 155, 174, 207, 225 },
-    { 29,  53,  87, 112, 136, 154, 170, 188, 208, 227 },
-    { 24,  30,  52,  84, 131, 150, 166, 186, 203, 229 },
-    { 37,  48,  64,  84, 104, 118, 156, 177, 201, 230 }
-};
-
-static const uint8_t silk_lsf_codebook_wb[32][16] = {
-    {  7,  23,  38,  54,  69,  85, 100, 116, 131, 147, 162, 178, 193, 208, 223, 239 },
-    { 13,  25,  41,  55,  69,  83,  98, 112, 127, 142, 157, 171, 187, 203, 220, 236 },
-    { 15,  21,  34,  51,  61,  78,  92, 106, 126, 136, 152, 167, 185, 205, 225, 240 },
-    { 10,  21,  36,  50,  63,  79,  95, 110, 126, 141, 157, 173, 189, 205, 221, 237 },
-    { 17,  20,  37,  51,  59,  78,  89, 107, 123, 134, 150, 164, 184, 205, 224, 240 },
-    { 10,  15,  32,  51,  67,  81,  96, 112, 129, 142, 158, 173, 189, 204, 220, 236 },
-    {  8,  21,  37,  51,  65,  79,  98, 113, 126, 138, 155, 168, 179, 192, 209, 218 },
-    { 12,  15,  34,  55,  63,  78,  87, 108, 118, 131, 148, 167, 185, 203, 219, 236 },
-    { 16,  19,  32,  36,  56,  79,  91, 108, 118, 136, 154, 171, 186, 204, 220, 237 },
-    { 11,  28,  43,  58,  74,  89, 105, 120, 135, 150, 165, 180, 196, 211, 226, 241 },
-    {  6,  16,  33,  46,  60,  75,  92, 107, 123, 137, 156, 169, 185, 199, 214, 225 },
-    { 11,  19,  30,  44,  57,  74,  89, 105, 121, 135, 152, 169, 186, 202, 218, 234 },
-    { 12,  19,  29,  46,  57,  71,  88, 100, 120, 132, 148, 165, 182, 199, 216, 233 },
-    { 17,  23,  35,  46,  56,  77,  92, 106, 123, 134, 152, 167, 185, 204, 222, 237 },
-    { 14,  17,  45,  53,  63,  75,  89, 107, 115, 132, 151, 171, 188, 206, 221, 240 },
-    {  9,  16,  29,  40,  56,  71,  88, 103, 119, 137, 154, 171, 189, 205, 222, 237 },
-    { 16,  19,  36,  48,  57,  76,  87, 105, 118, 132, 150, 167, 185, 202, 218, 236 },
-    { 12,  17,  29,  54,  71,  81,  94, 104, 126, 136, 149, 164, 182, 201, 221, 237 },
-    { 15,  28,  47,  62,  79,  97, 115, 129, 142, 155, 168, 180, 194, 208, 223, 238 },
-    {  8,  14,  30,  45,  62,  78,  94, 111, 127, 143, 159, 175, 192, 207, 223, 239 },
-    { 17,  30,  49,  62,  79,  92, 107, 119, 132, 145, 160, 174, 190, 204, 220, 235 },
-    { 14,  19,  36,  45,  61,  76,  91, 108, 121, 138, 154, 172, 189, 205, 222, 238 },
-    { 12,  18,  31,  45,  60,  76,  91, 107, 123, 138, 154, 171, 187, 204, 221, 236 },
-    { 13,  17,  31,  43,  53,  70,  83, 103, 114, 131, 149, 167, 185, 203, 220, 237 },
-    { 17,  22,  35,  42,  58,  78,  93, 110, 125, 139, 155, 170, 188, 206, 224, 240 },
-    {  8,  15,  34,  50,  67,  83,  99, 115, 131, 146, 162, 178, 193, 209, 224, 239 },
-    { 13,  16,  41,  66,  73,  86,  95, 111, 128, 137, 150, 163, 183, 206, 225, 241 },
-    { 17,  25,  37,  52,  63,  75,  92, 102, 119, 132, 144, 160, 175, 191, 212, 231 },
-    { 19,  31,  49,  65,  83, 100, 117, 133, 147, 161, 174, 187, 200, 213, 227, 242 },
-    { 18,  31,  52,  68,  88, 103, 117, 126, 138, 149, 163, 177, 192, 207, 223, 239 },
-    { 16,  29,  47,  61,  76,  90, 106, 119, 133, 147, 161, 176, 193, 209, 224, 240 },
-    { 15,  21,  35,  50,  61,  73,  86,  97, 110, 119, 129, 141, 175, 198, 218, 237 }
-};
-
-static const uint16_t silk_lsf_min_spacing_nbmb[] = {
-    250, 3, 6, 3, 3, 3, 4, 3, 3, 3, 461
-};
-
-static const uint16_t silk_lsf_min_spacing_wb[] = {
-    100, 3, 40, 3, 3, 3, 5, 14, 14, 10, 11, 3, 8, 9, 7, 3, 347
-};
-
-static const uint8_t silk_lsf_ordering_nbmb[] = {
-    0, 9, 6, 3, 4, 5, 8, 1, 2, 7
-};
-
-static const uint8_t silk_lsf_ordering_wb[] = {
-    0, 15, 8, 7, 4, 11, 12, 3, 2, 13, 10, 5, 6, 9, 14, 1
-};
-
-static const int16_t silk_cosine[] = { /* (0.12) */
-     4096,  4095,  4091,  4085,
-     4076,  4065,  4052,  4036,
-     4017,  3997,  3973,  3948,
-     3920,  3889,  3857,  3822,
-     3784,  3745,  3703,  3659,
-     3613,  3564,  3513,  3461,
-     3406,  3349,  3290,  3229,
-     3166,  3102,  3035,  2967,
-     2896,  2824,  2751,  2676,
-     2599,  2520,  2440,  2359,
-     2276,  2191,  2106,  2019,
-     1931,  1842,  1751,  1660,
-     1568,  1474,  1380,  1285,
-     1189,  1093,   995,   897,
-      799,   700,   601,   501,
-      401,   301,   201,   101,
-        0,  -101,  -201,  -301,
-     -401,  -501,  -601,  -700,
-     -799,  -897,  -995, -1093,
-    -1189, -1285, -1380, -1474,
-    -1568, -1660, -1751, -1842,
-    -1931, -2019, -2106, -2191,
-    -2276, -2359, -2440, -2520,
-    -2599, -2676, -2751, -2824,
-    -2896, -2967, -3035, -3102,
-    -3166, -3229, -3290, -3349,
-    -3406, -3461, -3513, -3564,
-    -3613, -3659, -3703, -3745,
-    -3784, -3822, -3857, -3889,
-    -3920, -3948, -3973, -3997,
-    -4017, -4036, -4052, -4065,
-    -4076, -4085, -4091, -4095,
-    -4096
-};
-
-static const uint16_t silk_pitch_scale[]   = {  4,   6,   8};
-
-static const uint16_t silk_pitch_min_lag[] = { 16,  24,  32};
-
-static const uint16_t silk_pitch_max_lag[] = {144, 216, 288};
-
-static const int8_t silk_pitch_offset_nb10ms[3][2] = {
-    { 0,  0},
-    { 1,  0},
-    { 0,  1}
-};
-
-static const int8_t silk_pitch_offset_nb20ms[11][4] = {
-    { 0,  0,  0,  0},
-    { 2,  1,  0, -1},
-    {-1,  0,  1,  2},
-    {-1,  0,  0,  1},
-    {-1,  0,  0,  0},
-    { 0,  0,  0,  1},
-    { 0,  0,  1,  1},
-    { 1,  1,  0,  0},
-    { 1,  0,  0,  0},
-    { 0,  0,  0, -1},
-    { 1,  0,  0, -1}
-};
-
-static const int8_t silk_pitch_offset_mbwb10ms[12][2] = {
-    { 0,  0},
-    { 0,  1},
-    { 1,  0},
-    {-1,  1},
-    { 1, -1},
-    {-1,  2},
-    { 2, -1},
-    {-2,  2},
-    { 2, -2},
-    {-2,  3},
-    { 3, -2},
-    {-3,  3}
-};
-
-static const int8_t silk_pitch_offset_mbwb20ms[34][4] = {
-    { 0,  0,  0,  0},
-    { 0,  0,  1,  1},
-    { 1,  1,  0,  0},
-    {-1,  0,  0,  0},
-    { 0,  0,  0,  1},
-    { 1,  0,  0,  0},
-    {-1,  0,  0,  1},
-    { 0,  0,  0, -1},
-    {-1,  0,  1,  2},
-    { 1,  0,  0, -1},
-    {-2, -1,  1,  2},
-    { 2,  1,  0, -1},
-    {-2,  0,  0,  2},
-    {-2,  0,  1,  3},
-    { 2,  1, -1, -2},
-    {-3, -1,  1,  3},
-    { 2,  0,  0, -2},
-    { 3,  1,  0, -2},
-    {-3, -1,  2,  4},
-    {-4, -1,  1,  4},
-    { 3,  1, -1, -3},
-    {-4, -1,  2,  5},
-    { 4,  2, -1, -3},
-    { 4,  1, -1, -4},
-    {-5, -1,  2,  6},
-    { 5,  2, -1, -4},
-    {-6, -2,  2,  6},
-    {-5, -2,  2,  5},
-    { 6,  2, -1, -5},
-    {-7, -2,  3,  8},
-    { 6,  2, -2, -6},
-    { 5,  2, -2, -5},
-    { 8,  3, -2, -7},
-    {-9, -3,  3,  9}
-};
-
-static const int8_t silk_ltp_filter0_taps[8][5] = {
-    {  4,   6,  24,   7,   5},
-    {  0,   0,   2,   0,   0},
-    { 12,  28,  41,  13,  -4},
-    { -9,  15,  42,  25,  14},
-    {  1,  -2,  62,  41,  -9},
-    {-10,  37,  65,  -4,   3},
-    { -6,   4,  66,   7,  -8},
-    { 16,  14,  38,  -3,  33}
-};
-
-static const int8_t silk_ltp_filter1_taps[16][5] = {
-    { 13,  22,  39,  23,  12},
-    { -1,  36,  64,  27,  -6},
-    { -7,  10,  55,  43,  17},
-    {  1,   1,   8,   1,   1},
-    {  6, -11,  74,  53,  -9},
-    {-12,  55,  76, -12,   8},
-    { -3,   3,  93,  27,  -4},
-    { 26,  39,  59,   3,  -8},
-    {  2,   0,  77,  11,   9},
-    { -8,  22,  44,  -6,   7},
-    { 40,   9,  26,   3,   9},
-    { -7,  20, 101,  -7,   4},
-    {  3,  -8,  42,  26,   0},
-    {-15,  33,  68,   2,  23},
-    { -2,  55,  46,  -2,  15},
-    {  3,  -1,  21,  16,  41}
-};
-
-static const int8_t silk_ltp_filter2_taps[32][5] = {
-    { -6,  27,  61,  39,   5},
-    {-11,  42,  88,   4,   1},
-    { -2,  60,  65,   6,  -4},
-    { -1,  -5,  73,  56,   1},
-    { -9,  19,  94,  29,  -9},
-    {  0,  12,  99,   6,   4},
-    {  8, -19, 102,  46, -13},
-    {  3,   2,  13,   3,   2},
-    {  9, -21,  84,  72, -18},
-    {-11,  46, 104, -22,   8},
-    { 18,  38,  48,  23,   0},
-    {-16,  70,  83, -21,  11},
-    {  5, -11, 117,  22,  -8},
-    { -6,  23, 117, -12,   3},
-    {  3,  -8,  95,  28,   4},
-    {-10,  15,  77,  60, -15},
-    { -1,   4, 124,   2,  -4},
-    {  3,  38,  84,  24, -25},
-    {  2,  13,  42,  13,  31},
-    { 21,  -4,  56,  46,  -1},
-    { -1,  35,  79, -13,  19},
-    { -7,  65,  88,  -9, -14},
-    { 20,   4,  81,  49, -29},
-    { 20,   0,  75,   3, -17},
-    {  5,  -9,  44,  92,  -8},
-    {  1,  -3,  22,  69,  31},
-    { -6,  95,  41, -12,   5},
-    { 39,  67,  16,  -4,   1},
-    {  0,  -6, 120,  55, -36},
-    {-13,  44, 122,   4, -24},
-    { 81,   5,  11,   3,   7},
-    {  2,   0,   9,  10,  88}
-};
-
-static const uint16_t silk_ltp_scale_factor[] = {15565, 12288, 8192};
-
-static const uint8_t silk_shell_blocks[3][2] = {
-    { 5, 10}, // NB
-    { 8, 15}, // MB
-    {10, 20}  // WB
-};
-
-static const uint8_t silk_quant_offset[2][2] = { /* (0.23) */
-    {25, 60}, // Inactive or Unvoiced
-    { 8, 25}  // Voiced
-};
-
-static const int silk_stereo_interp_len[3] = {
-    64, 96, 128
-};
-
 static inline void silk_stabilize_lsf(int16_t nlsf[16], int order, const uint16_t min_delta[17])
 {
     int pass, i;
@@ -851,8 +128,7 @@ static inline void silk_stabilize_lsf(int16_t nlsf[16], int order, const uint16_
     if (nlsf[0] < min_delta[0])
         nlsf[0] = min_delta[0];
     for (i = 1; i < order; i++)
-        if (nlsf[i] < nlsf[i - 1] + min_delta[i])
-            nlsf[i] = nlsf[i - 1] + min_delta[i];
+        nlsf[i] = FFMAX(nlsf[i], FFMIN(nlsf[i - 1] + min_delta[i], 32767));
 
     /* push backwards to increase distance */
     if (nlsf[order-1] > 32768 - min_delta[order])
@@ -909,8 +185,15 @@ static inline int silk_is_lpc_stable(const int16_t lpc[16], int order)
         row = lpc32[k & 1];
 
         for (j = 0; j < k; j++) {
-            int x = prevrow[j] - ROUND_MULL(prevrow[k - j - 1], rc, 31);
-            row[j] = ROUND_MULL(x, gain, fbits);
+            int x = av_sat_sub32(prevrow[j], ROUND_MULL(prevrow[k - j - 1], rc, 31));
+            int64_t tmp = ROUND_MULL(x, gain, fbits);
+
+            /* per RFC 8251 section 6, if this calculation overflows, the filter
+               is considered unstable. */
+            if (tmp < INT32_MIN || tmp > INT32_MAX)
+                return 0;
+
+            row[j] = (int32_t)tmp;
         }
     }
 }
@@ -943,11 +226,11 @@ static void silk_lsf2lpc(const int16_t nlsf[16], float lpcf[16], int order)
     for (k = 0; k < order; k++) {
         int index = nlsf[k] >> 8;
         int offset = nlsf[k] & 255;
-        int k2 = (order == 10) ? silk_lsf_ordering_nbmb[k] : silk_lsf_ordering_wb[k];
+        int k2 = (order == 10) ? ff_silk_lsf_ordering_nbmb[k] : ff_silk_lsf_ordering_wb[k];
 
         /* interpolate and round */
-        lsp[k2]  = silk_cosine[index] * 256;
-        lsp[k2] += (silk_cosine[index + 1] - silk_cosine[index]) * offset;
+        lsp[k2]  = ff_silk_cosine[index] * 256;
+        lsp[k2] += (ff_silk_cosine[index + 1] - ff_silk_cosine[index]) * offset;
         lsp[k2]  = (lsp[k2] + 4) >> 3;
     }
 
@@ -956,8 +239,10 @@ static void silk_lsf2lpc(const int16_t nlsf[16], float lpcf[16], int order)
 
     /* reconstruct A(z) */
     for (k = 0; k < order>>1; k++) {
-        lpc32[k]         = -p[k + 1] - p[k] - q[k + 1] + q[k];
-        lpc32[order-k-1] = -p[k + 1] - p[k] + q[k + 1] - q[k];
+        int32_t p_tmp = p[k + 1] + p[k];
+        int32_t q_tmp = q[k + 1] - q[k];
+        lpc32[k]         = -q_tmp - p_tmp;
+        lpc32[order-k-1] =  q_tmp - p_tmp;
     }
 
     /* limit the range of the LPC coefficients to each fit within an int16_t */
@@ -1030,15 +315,15 @@ static inline void silk_decode_lpc(SilkContext *s, SilkFrame *frame,
     *lpc_order = order = s->wb ? 16 : 10;
 
     /* obtain LSF stage-1 and stage-2 indices */
-    lsf_i1 = opus_rc_getsymbol(rc, silk_model_lsf_s1[s->wb][voiced]);
+    lsf_i1 = ff_opus_rc_dec_cdf(rc, ff_silk_model_lsf_s1[s->wb][voiced]);
     for (i = 0; i < order; i++) {
-        int index = s->wb ? silk_lsf_s2_model_sel_wb  [lsf_i1][i] :
-                            silk_lsf_s2_model_sel_nbmb[lsf_i1][i];
-        lsf_i2[i] = opus_rc_getsymbol(rc, silk_model_lsf_s2[index]) - 4;
+        int index = s->wb ? ff_silk_lsf_s2_model_sel_wb  [lsf_i1][i] :
+                            ff_silk_lsf_s2_model_sel_nbmb[lsf_i1][i];
+        lsf_i2[i] = ff_opus_rc_dec_cdf(rc, ff_silk_model_lsf_s2[index]) - 4;
         if (lsf_i2[i] == -4)
-            lsf_i2[i] -= opus_rc_getsymbol(rc, silk_model_lsf_s2_ext);
+            lsf_i2[i] -= ff_opus_rc_dec_cdf(rc, ff_silk_model_lsf_s2_ext);
         else if (lsf_i2[i] == 4)
-            lsf_i2[i] += opus_rc_getsymbol(rc, silk_model_lsf_s2_ext);
+            lsf_i2[i] += ff_opus_rc_dec_cdf(rc, ff_silk_model_lsf_s2_ext);
     }
 
     /* reverse the backwards-prediction step */
@@ -1051,16 +336,16 @@ static inline void silk_decode_lpc(SilkContext *s, SilkFrame *frame,
         lsf_res[i] = (lsf_res[i] * qstep) >> 16;
 
         if (i + 1 < order) {
-            int weight = s->wb ? silk_lsf_pred_weights_wb  [silk_lsf_weight_sel_wb  [lsf_i1][i]][i] :
-                                 silk_lsf_pred_weights_nbmb[silk_lsf_weight_sel_nbmb[lsf_i1][i]][i];
+            int weight = s->wb ? ff_silk_lsf_pred_weights_wb  [ff_silk_lsf_weight_sel_wb  [lsf_i1][i]][i] :
+                                 ff_silk_lsf_pred_weights_nbmb[ff_silk_lsf_weight_sel_nbmb[lsf_i1][i]][i];
             lsf_res[i] += (lsf_res[i+1] * weight) >> 8;
         }
     }
 
     /* reconstruct the NLSF coefficients from the supplied indices */
     for (i = 0; i < order; i++) {
-        const uint8_t * codebook = s->wb ? silk_lsf_codebook_wb  [lsf_i1] :
-                                           silk_lsf_codebook_nbmb[lsf_i1];
+        const uint8_t * codebook = s->wb ? ff_silk_lsf_codebook_wb  [lsf_i1] :
+                                           ff_silk_lsf_codebook_nbmb[lsf_i1];
         int cur, prev, next, weight_sq, weight, ipart, fpart, y, value;
 
         /* find the weight of the residual */
@@ -1081,14 +366,14 @@ static inline void silk_decode_lpc(SilkContext *s, SilkFrame *frame,
     }
 
     /* stabilize the NLSF coefficients */
-    silk_stabilize_lsf(nlsf, order, s->wb ? silk_lsf_min_spacing_wb :
-                                            silk_lsf_min_spacing_nbmb);
+    silk_stabilize_lsf(nlsf, order, s->wb ? ff_silk_lsf_min_spacing_wb :
+                                            ff_silk_lsf_min_spacing_nbmb);
 
     /* produce an interpolation for the first 2 subframes, */
     /* and then convert both sets of NLSFs to LPC coefficients */
     *has_lpc_leadin = 0;
     if (s->subframes == 4) {
-        int offset = opus_rc_getsymbol(rc, silk_model_lsf_interpolation_offset);
+        int offset = ff_opus_rc_dec_cdf(rc, ff_silk_model_lsf_interpolation_offset);
         if (offset != 4 && frame->coded) {
             *has_lpc_leadin = 1;
             if (offset != 0) {
@@ -1117,8 +402,8 @@ static inline void silk_count_children(OpusRangeCoder *rc, int model, int32_t to
                                        int32_t child[2])
 {
     if (total != 0) {
-        child[0] = opus_rc_getsymbol(rc,
-                       silk_model_pulse_location[model] + (((total - 1 + 5) * (total - 1)) >> 1));
+        child[0] = ff_opus_rc_dec_cdf(rc,
+                       ff_silk_model_pulse_location[model] + (((total - 1 + 5) * (total - 1)) >> 1));
         child[1] = total - child[0];
     } else {
         child[0] = 0;
@@ -1139,17 +424,17 @@ static inline void silk_decode_excitation(SilkContext *s, OpusRangeCoder *rc,
     int32_t excitation[320];    // Q23
 
     /* excitation parameters */
-    seed = opus_rc_getsymbol(rc, silk_model_lcg_seed);
-    shellblocks = silk_shell_blocks[s->bandwidth][s->subframes >> 2];
-    ratelevel = opus_rc_getsymbol(rc, silk_model_exc_rate[voiced]);
+    seed = ff_opus_rc_dec_cdf(rc, ff_silk_model_lcg_seed);
+    shellblocks = ff_silk_shell_blocks[s->bandwidth][s->subframes >> 2];
+    ratelevel = ff_opus_rc_dec_cdf(rc, ff_silk_model_exc_rate[voiced]);
 
     for (i = 0; i < shellblocks; i++) {
-        pulsecount[i] = opus_rc_getsymbol(rc, silk_model_pulse_count[ratelevel]);
+        pulsecount[i] = ff_opus_rc_dec_cdf(rc, ff_silk_model_pulse_count[ratelevel]);
         if (pulsecount[i] == 17) {
             while (pulsecount[i] == 17 && ++lsbcount[i] != 10)
-                pulsecount[i] = opus_rc_getsymbol(rc, silk_model_pulse_count[9]);
+                pulsecount[i] = ff_opus_rc_dec_cdf(rc, ff_silk_model_pulse_count[9]);
             if (lsbcount[i] == 10)
-                pulsecount[i] = opus_rc_getsymbol(rc, silk_model_pulse_count[10]);
+                pulsecount[i] = ff_opus_rc_dec_cdf(rc, ff_silk_model_pulse_count[10]);
         }
     }
 
@@ -1184,13 +469,13 @@ static inline void silk_decode_excitation(SilkContext *s, OpusRangeCoder *rc,
         int bit;
         for (bit = 0; bit < lsbcount[i >> 4]; bit++)
             excitation[i] = (excitation[i] << 1) |
-                            opus_rc_getsymbol(rc, silk_model_excitation_lsb);
+                            ff_opus_rc_dec_cdf(rc, ff_silk_model_excitation_lsb);
     }
 
     /* decode signs */
     for (i = 0; i < shellblocks << 4; i++) {
         if (excitation[i] != 0) {
-            int sign = opus_rc_getsymbol(rc, silk_model_excitation_sign[active +
+            int sign = ff_opus_rc_dec_cdf(rc, ff_silk_model_excitation_sign[active +
                                          voiced][qoffset_high][FFMIN(pulsecount[i >> 4], 6)]);
             if (sign == 0)
                 excitation[i] *= -1;
@@ -1200,7 +485,7 @@ static inline void silk_decode_excitation(SilkContext *s, OpusRangeCoder *rc,
     /* assemble the excitation */
     for (i = 0; i < shellblocks << 4; i++) {
         int value = excitation[i];
-        excitation[i] = value * 256 | silk_quant_offset[voiced][qoffset_high];
+        excitation[i] = value * 256 | ff_silk_quant_offset[voiced][qoffset_high];
         if (value < 0)      excitation[i] += 20;
         else if (value > 0) excitation[i] -= 20;
 
@@ -1245,30 +530,30 @@ static void silk_decode_frame(SilkContext *s, OpusRangeCoder *rc,
     /* obtain stereo weights */
     if (coded_channels == 2 && channel == 0) {
         int n, wi[2], ws[2], w[2];
-        n     = opus_rc_getsymbol(rc, silk_model_stereo_s1);
-        wi[0] = opus_rc_getsymbol(rc, silk_model_stereo_s2) + 3 * (n / 5);
-        ws[0] = opus_rc_getsymbol(rc, silk_model_stereo_s3);
-        wi[1] = opus_rc_getsymbol(rc, silk_model_stereo_s2) + 3 * (n % 5);
-        ws[1] = opus_rc_getsymbol(rc, silk_model_stereo_s3);
+        n     = ff_opus_rc_dec_cdf(rc, ff_silk_model_stereo_s1);
+        wi[0] = ff_opus_rc_dec_cdf(rc, ff_silk_model_stereo_s2) + 3 * (n / 5);
+        ws[0] = ff_opus_rc_dec_cdf(rc, ff_silk_model_stereo_s3);
+        wi[1] = ff_opus_rc_dec_cdf(rc, ff_silk_model_stereo_s2) + 3 * (n % 5);
+        ws[1] = ff_opus_rc_dec_cdf(rc, ff_silk_model_stereo_s3);
 
         for (i = 0; i < 2; i++)
-            w[i] = silk_stereo_weights[wi[i]] +
-                   (((silk_stereo_weights[wi[i] + 1] - silk_stereo_weights[wi[i]]) * 6554) >> 16)
+            w[i] = ff_silk_stereo_weights[wi[i]] +
+                   (((ff_silk_stereo_weights[wi[i] + 1] - ff_silk_stereo_weights[wi[i]]) * 6554) >> 16)
                     * (ws[i]*2 + 1);
 
         s->stereo_weights[0] = (w[0] - w[1]) / 8192.0;
         s->stereo_weights[1] = w[1]          / 8192.0;
 
         /* and read the mid-only flag */
-        s->midonly = active1 ? 0 : opus_rc_getsymbol(rc, silk_model_mid_only);
+        s->midonly = active1 ? 0 : ff_opus_rc_dec_cdf(rc, ff_silk_model_mid_only);
     }
 
     /* obtain frame type */
     if (!active) {
-        qoffset_high = opus_rc_getsymbol(rc, silk_model_frame_type_inactive);
+        qoffset_high = ff_opus_rc_dec_cdf(rc, ff_silk_model_frame_type_inactive);
         voiced = 0;
     } else {
-        int type = opus_rc_getsymbol(rc, silk_model_frame_type_active);
+        int type = ff_opus_rc_dec_cdf(rc, ff_silk_model_frame_type_active);
         qoffset_high = type & 1;
         voiced = type >> 1;
     }
@@ -1280,14 +565,14 @@ static void silk_decode_frame(SilkContext *s, OpusRangeCoder *rc,
 
         if (i == 0 && (frame_num == 0 || !frame->coded)) {
             /* gain is coded absolute */
-            int x = opus_rc_getsymbol(rc, silk_model_gain_highbits[active + voiced]);
-            log_gain = (x<<3) | opus_rc_getsymbol(rc, silk_model_gain_lowbits);
+            int x = ff_opus_rc_dec_cdf(rc, ff_silk_model_gain_highbits[active + voiced]);
+            log_gain = (x<<3) | ff_opus_rc_dec_cdf(rc, ff_silk_model_gain_lowbits);
 
             if (frame->coded)
                 log_gain = FFMAX(log_gain, frame->log_gain - 16);
         } else {
             /* gain is coded relative */
-            int delta_gain = opus_rc_getsymbol(rc, silk_model_gain_delta);
+            int delta_gain = ff_opus_rc_dec_cdf(rc, ff_silk_model_gain_delta);
             log_gain = av_clip_uintp2(FFMAX((delta_gain<<1) - 16,
                                      frame->log_gain + delta_gain - 4), 6);
         }
@@ -1313,7 +598,7 @@ static void silk_decode_frame(SilkContext *s, OpusRangeCoder *rc,
         const int8_t * offsets;
 
         if (!lag_absolute) {
-            int delta = opus_rc_getsymbol(rc, silk_model_pitch_delta);
+            int delta = ff_opus_rc_dec_cdf(rc, ff_silk_model_pitch_delta);
             if (delta)
                 primarylag = frame->primarylag + delta - 9;
             else
@@ -1324,47 +609,47 @@ static void silk_decode_frame(SilkContext *s, OpusRangeCoder *rc,
             /* primary lag is coded absolute */
             int highbits, lowbits;
             static const uint16_t * const model[] = {
-                silk_model_pitch_lowbits_nb, silk_model_pitch_lowbits_mb,
-                silk_model_pitch_lowbits_wb
+                ff_silk_model_pitch_lowbits_nb, ff_silk_model_pitch_lowbits_mb,
+                ff_silk_model_pitch_lowbits_wb
             };
-            highbits = opus_rc_getsymbol(rc, silk_model_pitch_highbits);
-            lowbits  = opus_rc_getsymbol(rc, model[s->bandwidth]);
+            highbits = ff_opus_rc_dec_cdf(rc, ff_silk_model_pitch_highbits);
+            lowbits  = ff_opus_rc_dec_cdf(rc, model[s->bandwidth]);
 
-            primarylag = silk_pitch_min_lag[s->bandwidth] +
-                         highbits*silk_pitch_scale[s->bandwidth] + lowbits;
+            primarylag = ff_silk_pitch_min_lag[s->bandwidth] +
+                         highbits*ff_silk_pitch_scale[s->bandwidth] + lowbits;
         }
         frame->primarylag = primarylag;
 
         if (s->subframes == 2)
             offsets = (s->bandwidth == OPUS_BANDWIDTH_NARROWBAND)
-                     ? silk_pitch_offset_nb10ms[opus_rc_getsymbol(rc,
-                                                silk_model_pitch_contour_nb10ms)]
-                     : silk_pitch_offset_mbwb10ms[opus_rc_getsymbol(rc,
-                                                silk_model_pitch_contour_mbwb10ms)];
+                     ? ff_silk_pitch_offset_nb10ms[ff_opus_rc_dec_cdf(rc,
+                                                ff_silk_model_pitch_contour_nb10ms)]
+                     : ff_silk_pitch_offset_mbwb10ms[ff_opus_rc_dec_cdf(rc,
+                                                ff_silk_model_pitch_contour_mbwb10ms)];
         else
             offsets = (s->bandwidth == OPUS_BANDWIDTH_NARROWBAND)
-                     ? silk_pitch_offset_nb20ms[opus_rc_getsymbol(rc,
-                                                silk_model_pitch_contour_nb20ms)]
-                     : silk_pitch_offset_mbwb20ms[opus_rc_getsymbol(rc,
-                                                silk_model_pitch_contour_mbwb20ms)];
+                     ? ff_silk_pitch_offset_nb20ms[ff_opus_rc_dec_cdf(rc,
+                                                ff_silk_model_pitch_contour_nb20ms)]
+                     : ff_silk_pitch_offset_mbwb20ms[ff_opus_rc_dec_cdf(rc,
+                                                ff_silk_model_pitch_contour_mbwb20ms)];
 
         for (i = 0; i < s->subframes; i++)
             sf[i].pitchlag = av_clip(primarylag + offsets[i],
-                                     silk_pitch_min_lag[s->bandwidth],
-                                     silk_pitch_max_lag[s->bandwidth]);
+                                     ff_silk_pitch_min_lag[s->bandwidth],
+                                     ff_silk_pitch_max_lag[s->bandwidth]);
 
         /* obtain LTP filter coefficients */
-        ltpfilter = opus_rc_getsymbol(rc, silk_model_ltp_filter);
+        ltpfilter = ff_opus_rc_dec_cdf(rc, ff_silk_model_ltp_filter);
         for (i = 0; i < s->subframes; i++) {
             int index, j;
             static const uint16_t * const filter_sel[] = {
-                silk_model_ltp_filter0_sel, silk_model_ltp_filter1_sel,
-                silk_model_ltp_filter2_sel
+                ff_silk_model_ltp_filter0_sel, ff_silk_model_ltp_filter1_sel,
+                ff_silk_model_ltp_filter2_sel
             };
             static const int8_t (* const filter_taps[])[5] = {
-                silk_ltp_filter0_taps, silk_ltp_filter1_taps, silk_ltp_filter2_taps
+                ff_silk_ltp_filter0_taps, ff_silk_ltp_filter1_taps, ff_silk_ltp_filter2_taps
             };
-            index = opus_rc_getsymbol(rc, filter_sel[ltpfilter]);
+            index = ff_opus_rc_dec_cdf(rc, filter_sel[ltpfilter]);
             for (j = 0; j < 5; j++)
                 sf[i].ltptaps[j] = filter_taps[ltpfilter][index][j] / 128.0f;
         }
@@ -1372,8 +657,8 @@ static void silk_decode_frame(SilkContext *s, OpusRangeCoder *rc,
 
     /* obtain LTP scale factor */
     if (voiced && frame_num == 0)
-        ltpscale = silk_ltp_scale_factor[opus_rc_getsymbol(rc,
-                                         silk_model_ltp_scale_index)] / 16384.0f;
+        ltpscale = ff_silk_ltp_scale_factor[ff_opus_rc_dec_cdf(rc,
+                                         ff_silk_model_ltp_scale_index)] / 16384.0f;
     else ltpscale = 15565.0f/16384.0f;
 
     /* generate the excitation signal for the entire frame */
@@ -1455,7 +740,7 @@ static void silk_unmix_ms(SilkContext *s, float *l, float *r)
     float w1_prev = s->prev_stereo_weights[1];
     float w0      = s->stereo_weights[0];
     float w1      = s->stereo_weights[1];
-    int n1        = silk_stereo_interp_len[s->bandwidth];
+    int n1        = ff_silk_stereo_interp_len[s->bandwidth];
     int i;
 
     for (i = 0; i < n1; i++) {
@@ -1526,9 +811,9 @@ int ff_silk_decode_superframe(SilkContext *s, OpusRangeCoder *rc,
     /* read the LP-layer header bits */
     for (i = 0; i < coded_channels; i++) {
         for (j = 0; j < nb_frames; j++)
-            active[i][j] = opus_rc_p2model(rc, 1);
+            active[i][j] = ff_opus_rc_dec_log(rc, 1);
 
-        redundancy[i] = opus_rc_p2model(rc, 1);
+        redundancy[i] = ff_opus_rc_dec_log(rc, 1);
         if (redundancy[i]) {
             avpriv_report_missing_feature(s->avctx, "LBRR frames");
             return AVERROR_PATCHWELCOME;
diff --git a/libavcodec/opusdec.c b/libavcodec/opusdec.c
index 163f0d5..03086de 100644
--- a/libavcodec/opusdec.c
+++ b/libavcodec/opusdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,15 +40,15 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
 
-#include "libavresample/avresample.h"
+#include "libswresample/swresample.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
-#include "celp_filters.h"
-#include "fft.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
 #include "opus.h"
+#include "opustab.h"
+#include "opus_celt.h"
 
 static const uint16_t silk_frame_duration_ms[16] = {
     10, 20, 40, 60,
@@ -64,8 +64,6 @@ static const int silk_resample_delay[] = {
     4, 8, 11, 11, 11
 };
 
-static const uint8_t celt_band_end[] = { 13, 17, 17, 19, 21 };
-
 static int get_silk_samplerate(int config)
 {
     if (config < 4)
@@ -75,32 +73,6 @@ static int get_silk_samplerate(int config)
     return 16000;
 }
 
-/**
- * Range decoder
- */
-static int opus_rc_init(OpusRangeCoder *rc, const uint8_t *data, int size)
-{
-    int ret = bitstream_init8(&rc->bc, data, size);
-    if (ret < 0)
-        return ret;
-
-    rc->range = 128;
-    rc->value = 127 - bitstream_read(&rc->bc, 7);
-    rc->total_read_bits = 9;
-    opus_rc_normalize(rc);
-
-    return 0;
-}
-
-static void opus_raw_init(OpusRangeCoder *rc, const uint8_t *rightend,
-                          unsigned int bytes)
-{
-    rc->rb.position = rightend;
-    rc->rb.bytes    = bytes;
-    rc->rb.cachelen = 0;
-    rc->rb.cacheval = 0;
-}
-
 static void opus_fade(float *out,
                       const float *in1, const float *in2,
                       const float *window, int len)
@@ -114,9 +86,9 @@ static int opus_flush_resample(OpusStreamContext *s, int nb_samples)
 {
     int celt_size = av_audio_fifo_size(s->celt_delay);
     int ret, i;
-
-    ret = avresample_convert(s->avr, (uint8_t**)s->out, s->out_size, nb_samples,
-                             NULL, 0, 0);
+    ret = swr_convert(s->swr,
+                      (uint8_t**)s->out, nb_samples,
+                      NULL, 0);
     if (ret < 0)
         return ret;
     else if (ret != nb_samples) {
@@ -155,19 +127,20 @@ static int opus_flush_resample(OpusStreamContext *s, int nb_samples)
 
 static int opus_init_resample(OpusStreamContext *s)
 {
-    float delay[16] = { 0.0 };
-    uint8_t *delayptr[2] = { (uint8_t*)delay, (uint8_t*)delay };
+    static const float delay[16] = { 0.0 };
+    const uint8_t *delayptr[2] = { (uint8_t*)delay, (uint8_t*)delay };
     int ret;
 
-    av_opt_set_int(s->avr, "in_sample_rate", s->silk_samplerate, 0);
-    ret = avresample_open(s->avr);
+    av_opt_set_int(s->swr, "in_sample_rate", s->silk_samplerate, 0);
+    ret = swr_init(s->swr);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Error opening the resampler.\n");
         return ret;
     }
 
-    ret = avresample_convert(s->avr, NULL, 0, 0, delayptr, sizeof(delay),
-                             silk_resample_delay[s->packet.bandwidth]);
+    ret = swr_convert(s->swr,
+                      NULL, 0,
+                      delayptr, silk_resample_delay[s->packet.bandwidth]);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Error feeding initial silence to the resampler.\n");
@@ -179,22 +152,15 @@ static int opus_init_resample(OpusStreamContext *s)
 
 static int opus_decode_redundancy(OpusStreamContext *s, const uint8_t *data, int size)
 {
-    int ret;
-    enum OpusBandwidth bw = s->packet.bandwidth;
-
-    if (s->packet.mode == OPUS_MODE_SILK &&
-        bw == OPUS_BANDWIDTH_MEDIUMBAND)
-        bw = OPUS_BANDWIDTH_WIDEBAND;
-
-    ret = opus_rc_init(&s->redundancy_rc, data, size);
+    int ret = ff_opus_rc_dec_init(&s->redundancy_rc, data, size);
     if (ret < 0)
         goto fail;
-    opus_raw_init(&s->redundancy_rc, data + size, size);
+    ff_opus_rc_dec_raw_init(&s->redundancy_rc, data + size, size);
 
     ret = ff_celt_decode_frame(s->celt, &s->redundancy_rc,
                                s->redundancy_output,
                                s->packet.stereo + 1, 240,
-                               0, celt_band_end[s->packet.bandwidth]);
+                               0, ff_celt_band_end[s->packet.bandwidth]);
     if (ret < 0)
         goto fail;
 
@@ -212,13 +178,13 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
     int ret, i, consumed;
     int delayed_samples = s->delayed_samples;
 
-    ret = opus_rc_init(&s->rc, data, size);
+    ret = ff_opus_rc_dec_init(&s->rc, data, size);
     if (ret < 0)
         return ret;
 
     /* decode the silk frame */
     if (s->packet.mode == OPUS_MODE_SILK || s->packet.mode == OPUS_MODE_HYBRID) {
-        if (!avresample_is_open(s->avr)) {
+        if (!swr_is_initialized(s->swr)) {
             ret = opus_init_resample(s);
             if (ret < 0)
                 return ret;
@@ -232,16 +198,14 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
             av_log(s->avctx, AV_LOG_ERROR, "Error decoding a SILK frame.\n");
             return samples;
         }
-
-        samples = avresample_convert(s->avr, (uint8_t**)s->out, s->out_size,
-                                     s->packet.frame_duration,
-                                     (uint8_t**)s->silk_output,
-                                     sizeof(s->silk_buf[0]),
-                                     samples);
+        samples = swr_convert(s->swr,
+                              (uint8_t**)s->out, s->packet.frame_duration,
+                              (const uint8_t**)s->silk_output, samples);
         if (samples < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Error resampling SILK data.\n");
             return samples;
         }
+        av_assert2((samples & 7) == 0);
         s->delayed_samples += s->packet.frame_duration - samples;
     } else
         ff_silk_flush(s->silk);
@@ -249,15 +213,15 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
     // decode redundancy information
     consumed = opus_rc_tell(&s->rc);
     if (s->packet.mode == OPUS_MODE_HYBRID && consumed + 37 <= size * 8)
-        redundancy = opus_rc_p2model(&s->rc, 12);
+        redundancy = ff_opus_rc_dec_log(&s->rc, 12);
     else if (s->packet.mode == OPUS_MODE_SILK && consumed + 17 <= size * 8)
         redundancy = 1;
 
     if (redundancy) {
-        redundancy_pos = opus_rc_p2model(&s->rc, 1);
+        redundancy_pos = ff_opus_rc_dec_log(&s->rc, 1);
 
         if (s->packet.mode == OPUS_MODE_HYBRID)
-            redundancy_size = opus_rc_unimodel(&s->rc, 256) + 2;
+            redundancy_size = ff_opus_rc_dec_uint(&s->rc, 256) + 2;
         else
             redundancy_size = size - (consumed + 7) / 8;
         size -= redundancy_size;
@@ -301,13 +265,13 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
             }
         }
 
-        opus_raw_init(&s->rc, data + size, size);
+        ff_opus_rc_dec_raw_init(&s->rc, data + size, size);
 
         ret = ff_celt_decode_frame(s->celt, &s->rc, dst,
                                    s->packet.stereo + 1,
                                    s->packet.frame_duration,
                                    (s->packet.mode == OPUS_MODE_HYBRID) ? 17 : 0,
-                                   celt_band_end[s->packet.bandwidth]);
+                                   ff_celt_band_end[s->packet.bandwidth]);
         if (ret < 0)
             return ret;
 
@@ -379,10 +343,10 @@ static int opus_decode_subpacket(OpusStreamContext *s,
     s->out_size = out_size;
 
     /* check if we need to flush the resampler */
-    if (avresample_is_open(s->avr)) {
+    if (swr_is_initialized(s->swr)) {
         if (buf) {
             int64_t cur_samplerate;
-            av_opt_get_int(s->avr, "in_sample_rate", 0, &cur_samplerate);
+            av_opt_get_int(s->swr, "in_sample_rate", 0, &cur_samplerate);
             flush_needed = (s->packet.mode == OPUS_MODE_CELT) || (cur_samplerate != s->silk_samplerate);
         } else {
             flush_needed = !!s->delayed_samples;
@@ -411,7 +375,7 @@ static int opus_decode_subpacket(OpusStreamContext *s,
             av_log(s->avctx, AV_LOG_ERROR, "Error flushing the resampler.\n");
             return ret;
         }
-        avresample_close(s->avr);
+        swr_close(s->swr);
         output_samples += s->delayed_samples;
         s->delayed_samples = 0;
 
@@ -461,8 +425,11 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
 
     /* calculate the number of delayed samples */
     for (i = 0; i < c->nb_streams; i++) {
+        OpusStreamContext *s = &c->streams[i];
+        s->out[0] =
+        s->out[1] = NULL;
         delayed_samples = FFMAX(delayed_samples,
-                                c->streams[i].delayed_samples + av_audio_fifo_size(c->sync_buffers[i]));
+                                s->delayed_samples + av_audio_fifo_size(c->sync_buffers[i]));
     }
 
     /* decode the header of the first sub-packet to find out the sample count */
@@ -487,10 +454,8 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
 
     /* setup the data buffers */
     ret = ff_get_buffer(avctx, frame, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (ret < 0)
         return ret;
-    }
     frame->nb_samples = 0;
 
     memset(c->out, 0, c->nb_streams * 2 * sizeof(*c->out));
@@ -588,7 +553,7 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
         }
 
         if (c->gain_i && decoded_samples > 0) {
-            c->fdsp.vector_fmul_scalar((float*)frame->extended_data[i],
+            c->fdsp->vector_fmul_scalar((float*)frame->extended_data[i],
                                        (float*)frame->extended_data[i],
                                        c->gain, FFALIGN(decoded_samples, 8));
         }
@@ -613,7 +578,7 @@ static av_cold void opus_decode_flush(AVCodecContext *ctx)
 
         if (s->celt_delay)
             av_audio_fifo_drain(s->celt_delay, av_audio_fifo_size(s->celt_delay));
-        avresample_close(s->avr);
+        swr_close(s->swr);
 
         av_audio_fifo_drain(c->sync_buffers[i], av_audio_fifo_size(c->sync_buffers[i]));
 
@@ -637,7 +602,7 @@ static av_cold int opus_decode_close(AVCodecContext *avctx)
         s->out_dummy_allocated_size = 0;
 
         av_audio_fifo_free(s->celt_delay);
-        avresample_free(&s->avr);
+        swr_free(&s->swr);
     }
 
     av_freep(&c->streams);
@@ -654,6 +619,7 @@ static av_cold int opus_decode_close(AVCodecContext *avctx)
     c->nb_streams = 0;
 
     av_freep(&c->channel_maps);
+    av_freep(&c->fdsp);
 
     return 0;
 }
@@ -666,12 +632,16 @@ static av_cold int opus_decode_init(AVCodecContext *avctx)
     avctx->sample_fmt  = AV_SAMPLE_FMT_FLTP;
     avctx->sample_rate = 48000;
 
-    avpriv_float_dsp_init(&c->fdsp, 0);
+    c->fdsp = avpriv_float_dsp_alloc(0);
+    if (!c->fdsp)
+        return AVERROR(ENOMEM);
 
     /* find out the channel configuration */
     ret = ff_opus_parse_extradata(avctx, c);
-    if (ret < 0)
+    if (ret < 0) {
+        av_freep(&c->fdsp);
         return ret;
+    }
 
     /* allocate and init each independent decoder */
     c->streams = av_mallocz_array(c->nb_streams, sizeof(*c->streams));
@@ -699,24 +669,25 @@ static av_cold int opus_decode_init(AVCodecContext *avctx)
             s->redundancy_output[j] = s->redundancy_buf[j];
         }
 
-        s->fdsp = &c->fdsp;
+        s->fdsp = c->fdsp;
 
-        s->avr = avresample_alloc_context();
-        if (!s->avr)
+        s->swr =swr_alloc();
+        if (!s->swr)
             goto fail;
 
         layout = (s->output_channels == 1) ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO;
-        av_opt_set_int(s->avr, "in_sample_fmt",      avctx->sample_fmt,  0);
-        av_opt_set_int(s->avr, "out_sample_fmt",     avctx->sample_fmt,  0);
-        av_opt_set_int(s->avr, "in_channel_layout",  layout,             0);
-        av_opt_set_int(s->avr, "out_channel_layout", layout,             0);
-        av_opt_set_int(s->avr, "out_sample_rate",    avctx->sample_rate, 0);
+        av_opt_set_int(s->swr, "in_sample_fmt",      avctx->sample_fmt,  0);
+        av_opt_set_int(s->swr, "out_sample_fmt",     avctx->sample_fmt,  0);
+        av_opt_set_int(s->swr, "in_channel_layout",  layout,             0);
+        av_opt_set_int(s->swr, "out_channel_layout", layout,             0);
+        av_opt_set_int(s->swr, "out_sample_rate",    avctx->sample_rate, 0);
+        av_opt_set_int(s->swr, "filter_size",        16,                 0);
 
         ret = ff_silk_init(avctx, &s->silk, s->output_channels);
         if (ret < 0)
             goto fail;
 
-        ret = ff_celt_init(avctx, &s->celt, s->output_channels);
+        ret = ff_celt_init(avctx, &s->celt, s->output_channels, c->apply_phase_inv);
         if (ret < 0)
             goto fail;
 
@@ -741,9 +712,24 @@ fail:
     return ret;
 }
 
+#define OFFSET(x) offsetof(OpusContext, x)
+#define AD AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption opus_options[] = {
+    { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AD },
+    { NULL },
+};
+
+static const AVClass opus_class = {
+    .class_name = "Opus Decoder",
+    .item_name  = av_default_item_name,
+    .option     = opus_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_opus_decoder = {
     .name            = "opus",
     .long_name       = NULL_IF_CONFIG_SMALL("Opus"),
+    .priv_class      = &opus_class,
     .type            = AVMEDIA_TYPE_AUDIO,
     .id              = AV_CODEC_ID_OPUS,
     .priv_data_size  = sizeof(OpusContext),
diff --git a/libavcodec/opusenc.c b/libavcodec/opusenc.c
new file mode 100644
index 0000000..3c08ebc
--- /dev/null
+++ b/libavcodec/opusenc.c
@@ -0,0 +1,738 @@
+/*
+ * Opus encoder
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "opusenc.h"
+#include "opus_pvq.h"
+#include "opusenc_psy.h"
+#include "opustab.h"
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/opt.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "audio_frame_queue.h"
+
+typedef struct OpusEncContext {
+    AVClass *av_class;
+    OpusEncOptions options;
+    OpusPsyContext psyctx;
+    AVCodecContext *avctx;
+    AudioFrameQueue afq;
+    AVFloatDSPContext *dsp;
+    MDCT15Context *mdct[CELT_BLOCK_NB];
+    CeltPVQ *pvq;
+    struct FFBufQueue bufqueue;
+
+    uint8_t enc_id[64];
+    int enc_id_bits;
+
+    OpusPacketInfo packet;
+
+    int channels;
+
+    CeltFrame *frame;
+    OpusRangeCoder *rc;
+
+    /* Actual energy the decoder will have */
+    float last_quantized_energy[OPUS_MAX_CHANNELS][CELT_MAX_BANDS];
+
+    DECLARE_ALIGNED(32, float, scratch)[2048];
+} OpusEncContext;
+
+static void opus_write_extradata(AVCodecContext *avctx)
+{
+    uint8_t *bs = avctx->extradata;
+
+    bytestream_put_buffer(&bs, "OpusHead", 8);
+    bytestream_put_byte  (&bs, 0x1);
+    bytestream_put_byte  (&bs, avctx->channels);
+    bytestream_put_le16  (&bs, avctx->initial_padding);
+    bytestream_put_le32  (&bs, avctx->sample_rate);
+    bytestream_put_le16  (&bs, 0x0);
+    bytestream_put_byte  (&bs, 0x0); /* Default layout */
+}
+
+static int opus_gen_toc(OpusEncContext *s, uint8_t *toc, int *size, int *fsize_needed)
+{
+    int tmp = 0x0, extended_toc = 0;
+    static const int toc_cfg[][OPUS_MODE_NB][OPUS_BANDWITH_NB] = {
+        /*  Silk                    Hybrid                  Celt                    Layer     */
+        /*  NB  MB  WB SWB  FB      NB  MB  WB SWB  FB      NB  MB  WB SWB  FB      Bandwidth */
+        { {  0,  0,  0,  0,  0 }, {  0,  0,  0,  0,  0 }, { 17,  0, 21, 25, 29 } }, /* 2.5 ms */
+        { {  0,  0,  0,  0,  0 }, {  0,  0,  0,  0,  0 }, { 18,  0, 22, 26, 30 } }, /*   5 ms */
+        { {  1,  5,  9,  0,  0 }, {  0,  0,  0, 13, 15 }, { 19,  0, 23, 27, 31 } }, /*  10 ms */
+        { {  2,  6, 10,  0,  0 }, {  0,  0,  0, 14, 16 }, { 20,  0, 24, 28, 32 } }, /*  20 ms */
+        { {  3,  7, 11,  0,  0 }, {  0,  0,  0,  0,  0 }, {  0,  0,  0,  0,  0 } }, /*  40 ms */
+        { {  4,  8, 12,  0,  0 }, {  0,  0,  0,  0,  0 }, {  0,  0,  0,  0,  0 } }, /*  60 ms */
+    };
+    int cfg = toc_cfg[s->packet.framesize][s->packet.mode][s->packet.bandwidth];
+    *fsize_needed = 0;
+    if (!cfg)
+        return 1;
+    if (s->packet.frames == 2) {                                       /* 2 packets */
+        if (s->frame[0].framebits == s->frame[1].framebits) {          /* same size */
+            tmp = 0x1;
+        } else {                                                  /* different size */
+            tmp = 0x2;
+            *fsize_needed = 1;                     /* put frame sizes in the packet */
+        }
+    } else if (s->packet.frames > 2) {
+        tmp = 0x3;
+        extended_toc = 1;
+    }
+    tmp |= (s->channels > 1) << 2;                                /* Stereo or mono */
+    tmp |= (cfg - 1)         << 3;                           /* codec configuration */
+    *toc++ = tmp;
+    if (extended_toc) {
+        for (int i = 0; i < (s->packet.frames - 1); i++)
+            *fsize_needed |= (s->frame[i].framebits != s->frame[i + 1].framebits);
+        tmp = (*fsize_needed) << 7;                                /* vbr flag */
+        tmp |= (0) << 6;                                       /* padding flag */
+        tmp |= s->packet.frames;
+        *toc++ = tmp;
+    }
+    *size = 1 + extended_toc;
+    return 0;
+}
+
+static void celt_frame_setup_input(OpusEncContext *s, CeltFrame *f)
+{
+    AVFrame *cur = NULL;
+    const int subframesize = s->avctx->frame_size;
+    int subframes = OPUS_BLOCK_SIZE(s->packet.framesize) / subframesize;
+
+    cur = ff_bufqueue_get(&s->bufqueue);
+
+    for (int ch = 0; ch < f->channels; ch++) {
+        CeltBlock *b = &f->block[ch];
+        const void *input = cur->extended_data[ch];
+        size_t bps = av_get_bytes_per_sample(cur->format);
+        memcpy(b->overlap, input, bps*cur->nb_samples);
+    }
+
+    av_frame_free(&cur);
+
+    for (int sf = 0; sf < subframes; sf++) {
+        if (sf != (subframes - 1))
+            cur = ff_bufqueue_get(&s->bufqueue);
+        else
+            cur = ff_bufqueue_peek(&s->bufqueue, 0);
+
+        for (int ch = 0; ch < f->channels; ch++) {
+            CeltBlock *b = &f->block[ch];
+            const void *input = cur->extended_data[ch];
+            const size_t bps  = av_get_bytes_per_sample(cur->format);
+            const size_t left = (subframesize - cur->nb_samples)*bps;
+            const size_t len  = FFMIN(subframesize, cur->nb_samples)*bps;
+            memcpy(&b->samples[sf*subframesize], input, len);
+            memset(&b->samples[cur->nb_samples], 0, left);
+        }
+
+        /* Last frame isn't popped off and freed yet - we need it for overlap */
+        if (sf != (subframes - 1))
+            av_frame_free(&cur);
+    }
+}
+
+/* Apply the pre emphasis filter */
+static void celt_apply_preemph_filter(OpusEncContext *s, CeltFrame *f)
+{
+    const int subframesize = s->avctx->frame_size;
+    const int subframes = OPUS_BLOCK_SIZE(s->packet.framesize) / subframesize;
+
+    /* Filter overlap */
+    for (int ch = 0; ch < f->channels; ch++) {
+        CeltBlock *b = &f->block[ch];
+        float m = b->emph_coeff;
+        for (int i = 0; i < CELT_OVERLAP; i++) {
+            float sample = b->overlap[i];
+            b->overlap[i] = sample - m;
+            m = sample * CELT_EMPH_COEFF;
+        }
+        b->emph_coeff = m;
+    }
+
+    /* Filter the samples but do not update the last subframe's coeff - overlap ^^^ */
+    for (int sf = 0; sf < subframes; sf++) {
+        for (int ch = 0; ch < f->channels; ch++) {
+            CeltBlock *b = &f->block[ch];
+            float m = b->emph_coeff;
+            for (int i = 0; i < subframesize; i++) {
+                float sample = b->samples[sf*subframesize + i];
+                b->samples[sf*subframesize + i] = sample - m;
+                m = sample * CELT_EMPH_COEFF;
+            }
+            if (sf != (subframes - 1))
+                b->emph_coeff = m;
+        }
+    }
+}
+
+/* Create the window and do the mdct */
+static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f)
+{
+    float *win = s->scratch, *temp = s->scratch + 1920;
+
+    if (f->transient) {
+        for (int ch = 0; ch < f->channels; ch++) {
+            CeltBlock *b = &f->block[ch];
+            float *src1 = b->overlap;
+            for (int t = 0; t < f->blocks; t++) {
+                float *src2 = &b->samples[CELT_OVERLAP*t];
+                s->dsp->vector_fmul(win, src1, ff_celt_window, 128);
+                s->dsp->vector_fmul_reverse(&win[CELT_OVERLAP], src2,
+                                            ff_celt_window - 8, 128);
+                src1 = src2;
+                s->mdct[0]->mdct(s->mdct[0], b->coeffs + t, win, f->blocks);
+            }
+        }
+    } else {
+        int blk_len = OPUS_BLOCK_SIZE(f->size), wlen = OPUS_BLOCK_SIZE(f->size + 1);
+        int rwin = blk_len - CELT_OVERLAP, lap_dst = (wlen - blk_len - CELT_OVERLAP) >> 1;
+        memset(win, 0, wlen*sizeof(float));
+        for (int ch = 0; ch < f->channels; ch++) {
+            CeltBlock *b = &f->block[ch];
+
+            /* Overlap */
+            s->dsp->vector_fmul(temp, b->overlap, ff_celt_window, 128);
+            memcpy(win + lap_dst, temp, CELT_OVERLAP*sizeof(float));
+
+            /* Samples, flat top window */
+            memcpy(&win[lap_dst + CELT_OVERLAP], b->samples, rwin*sizeof(float));
+
+            /* Samples, windowed */
+            s->dsp->vector_fmul_reverse(temp, b->samples + rwin,
+                                        ff_celt_window - 8, 128);
+            memcpy(win + lap_dst + blk_len, temp, CELT_OVERLAP*sizeof(float));
+
+            s->mdct[f->size]->mdct(s->mdct[f->size], b->coeffs, win, 1);
+        }
+    }
+
+    for (int ch = 0; ch < f->channels; ch++) {
+        CeltBlock *block = &f->block[ch];
+        for (int i = 0; i < CELT_MAX_BANDS; i++) {
+            float ener = 0.0f;
+            int band_offset = ff_celt_freq_bands[i] << f->size;
+            int band_size   = ff_celt_freq_range[i] << f->size;
+            float *coeffs   = &block->coeffs[band_offset];
+
+            for (int j = 0; j < band_size; j++)
+                ener += coeffs[j]*coeffs[j];
+
+            block->lin_energy[i] = sqrtf(ener) + FLT_EPSILON;
+            ener = 1.0f/block->lin_energy[i];
+
+            for (int j = 0; j < band_size; j++)
+                coeffs[j] *= ener;
+
+            block->energy[i] = log2f(block->lin_energy[i]) - ff_celt_mean_energy[i];
+
+            /* CELT_ENERGY_SILENCE is what the decoder uses and its not -infinity */
+            block->energy[i] = FFMAX(block->energy[i], CELT_ENERGY_SILENCE);
+        }
+    }
+}
+
+static void celt_enc_tf(CeltFrame *f, OpusRangeCoder *rc)
+{
+    int tf_select = 0, diff = 0, tf_changed = 0, tf_select_needed;
+    int bits = f->transient ? 2 : 4;
+
+    tf_select_needed = ((f->size && (opus_rc_tell(rc) + bits + 1) <= f->framebits));
+
+    for (int i = f->start_band; i < f->end_band; i++) {
+        if ((opus_rc_tell(rc) + bits + tf_select_needed) <= f->framebits) {
+            const int tbit = (diff ^ 1) == f->tf_change[i];
+            ff_opus_rc_enc_log(rc, tbit, bits);
+            diff ^= tbit;
+            tf_changed |= diff;
+        }
+        bits = f->transient ? 4 : 5;
+    }
+
+    if (tf_select_needed && ff_celt_tf_select[f->size][f->transient][0][tf_changed] !=
+                            ff_celt_tf_select[f->size][f->transient][1][tf_changed]) {
+        ff_opus_rc_enc_log(rc, f->tf_select, 1);
+        tf_select = f->tf_select;
+    }
+
+    for (int i = f->start_band; i < f->end_band; i++)
+        f->tf_change[i] = ff_celt_tf_select[f->size][f->transient][tf_select][f->tf_change[i]];
+}
+
+static void celt_enc_quant_pfilter(OpusRangeCoder *rc, CeltFrame *f)
+{
+    float gain = f->pf_gain;
+    int txval, octave = f->pf_octave, period = f->pf_period, tapset = f->pf_tapset;
+
+    ff_opus_rc_enc_log(rc, f->pfilter, 1);
+    if (!f->pfilter)
+        return;
+
+    /* Octave */
+    txval = FFMIN(octave, 6);
+    ff_opus_rc_enc_uint(rc, txval, 6);
+    octave = txval;
+    /* Period */
+    txval = av_clip(period - (16 << octave) + 1, 0, (1 << (4 + octave)) - 1);
+    ff_opus_rc_put_raw(rc, period, 4 + octave);
+    period = txval + (16 << octave) - 1;
+    /* Gain */
+    txval = FFMIN(((int)(gain / 0.09375f)) - 1, 7);
+    ff_opus_rc_put_raw(rc, txval, 3);
+    gain   = 0.09375f * (txval + 1);
+    /* Tapset */
+    if ((opus_rc_tell(rc) + 2) <= f->framebits)
+        ff_opus_rc_enc_cdf(rc, tapset, ff_celt_model_tapset);
+    else
+        tapset = 0;
+    /* Finally create the coeffs */
+    for (int i = 0; i < 2; i++) {
+        CeltBlock *block = &f->block[i];
+
+        block->pf_period_new = FFMAX(period, CELT_POSTFILTER_MINPERIOD);
+        block->pf_gains_new[0] = gain * ff_celt_postfilter_taps[tapset][0];
+        block->pf_gains_new[1] = gain * ff_celt_postfilter_taps[tapset][1];
+        block->pf_gains_new[2] = gain * ff_celt_postfilter_taps[tapset][2];
+    }
+}
+
+static void exp_quant_coarse(OpusRangeCoder *rc, CeltFrame *f,
+                             float last_energy[][CELT_MAX_BANDS], int intra)
+{
+    float alpha, beta, prev[2] = { 0, 0 };
+    const uint8_t *pmod = ff_celt_coarse_energy_dist[f->size][intra];
+
+    /* Inter is really just differential coding */
+    if (opus_rc_tell(rc) + 3 <= f->framebits)
+        ff_opus_rc_enc_log(rc, intra, 3);
+    else
+        intra = 0;
+
+    if (intra) {
+        alpha = 0.0f;
+        beta  = 1.0f - (4915.0f/32768.0f);
+    } else {
+        alpha = ff_celt_alpha_coef[f->size];
+        beta  = ff_celt_beta_coef[f->size];
+    }
+
+    for (int i = f->start_band; i < f->end_band; i++) {
+        for (int ch = 0; ch < f->channels; ch++) {
+            CeltBlock *block = &f->block[ch];
+            const int left = f->framebits - opus_rc_tell(rc);
+            const float last = FFMAX(-9.0f, last_energy[ch][i]);
+            float diff = block->energy[i] - prev[ch] - last*alpha;
+            int q_en = lrintf(diff);
+            if (left >= 15) {
+                ff_opus_rc_enc_laplace(rc, &q_en, pmod[i << 1] << 7, pmod[(i << 1) + 1] << 6);
+            } else if (left >= 2) {
+                q_en = av_clip(q_en, -1, 1);
+                ff_opus_rc_enc_cdf(rc, 2*q_en + 3*(q_en < 0), ff_celt_model_energy_small);
+            } else if (left >= 1) {
+                q_en = av_clip(q_en, -1, 0);
+                ff_opus_rc_enc_log(rc, (q_en & 1), 1);
+            } else q_en = -1;
+
+            block->error_energy[i] = q_en - diff;
+            prev[ch] += beta * q_en;
+        }
+    }
+}
+
+static void celt_quant_coarse(CeltFrame *f, OpusRangeCoder *rc,
+                              float last_energy[][CELT_MAX_BANDS])
+{
+    uint32_t inter, intra;
+    OPUS_RC_CHECKPOINT_SPAWN(rc);
+
+    exp_quant_coarse(rc, f, last_energy, 1);
+    intra = OPUS_RC_CHECKPOINT_BITS(rc);
+
+    OPUS_RC_CHECKPOINT_ROLLBACK(rc);
+
+    exp_quant_coarse(rc, f, last_energy, 0);
+    inter = OPUS_RC_CHECKPOINT_BITS(rc);
+
+    if (inter > intra) { /* Unlikely */
+        OPUS_RC_CHECKPOINT_ROLLBACK(rc);
+        exp_quant_coarse(rc, f, last_energy, 1);
+    }
+}
+
+static void celt_quant_fine(CeltFrame *f, OpusRangeCoder *rc)
+{
+    for (int i = f->start_band; i < f->end_band; i++) {
+        if (!f->fine_bits[i])
+            continue;
+        for (int ch = 0; ch < f->channels; ch++) {
+            CeltBlock *block = &f->block[ch];
+            int quant, lim = (1 << f->fine_bits[i]);
+            float offset, diff = 0.5f - block->error_energy[i];
+            quant = av_clip(floor(diff*lim), 0, lim - 1);
+            ff_opus_rc_put_raw(rc, quant, f->fine_bits[i]);
+            offset = 0.5f - ((quant + 0.5f) * (1 << (14 - f->fine_bits[i])) / 16384.0f);
+            block->error_energy[i] -= offset;
+        }
+    }
+}
+
+static void celt_quant_final(OpusEncContext *s, OpusRangeCoder *rc, CeltFrame *f)
+{
+    for (int priority = 0; priority < 2; priority++) {
+        for (int i = f->start_band; i < f->end_band && (f->framebits - opus_rc_tell(rc)) >= f->channels; i++) {
+            if (f->fine_priority[i] != priority || f->fine_bits[i] >= CELT_MAX_FINE_BITS)
+                continue;
+            for (int ch = 0; ch < f->channels; ch++) {
+                CeltBlock *block = &f->block[ch];
+                const float err = block->error_energy[i];
+                const float offset = 0.5f * (1 << (14 - f->fine_bits[i] - 1)) / 16384.0f;
+                const int sign = FFABS(err + offset) < FFABS(err - offset);
+                ff_opus_rc_put_raw(rc, sign, 1);
+                block->error_energy[i] -= offset*(1 - 2*sign);
+            }
+        }
+    }
+}
+
+static void celt_encode_frame(OpusEncContext *s, OpusRangeCoder *rc,
+                              CeltFrame *f, int index)
+{
+    ff_opus_rc_enc_init(rc);
+
+    ff_opus_psy_celt_frame_init(&s->psyctx, f, index);
+
+    celt_frame_setup_input(s, f);
+
+    if (f->silence) {
+        if (f->framebits >= 16)
+            ff_opus_rc_enc_log(rc, 1, 15); /* Silence (if using explicit singalling) */
+        for (int ch = 0; ch < s->channels; ch++)
+            memset(s->last_quantized_energy[ch], 0.0f, sizeof(float)*CELT_MAX_BANDS);
+        return;
+    }
+
+    /* Filters */
+    celt_apply_preemph_filter(s, f);
+    if (f->pfilter) {
+        ff_opus_rc_enc_log(rc, 0, 15);
+        celt_enc_quant_pfilter(rc, f);
+    }
+
+    /* Transform */
+    celt_frame_mdct(s, f);
+
+    /* Need to handle transient/non-transient switches at any point during analysis */
+    while (ff_opus_psy_celt_frame_process(&s->psyctx, f, index))
+        celt_frame_mdct(s, f);
+
+    ff_opus_rc_enc_init(rc);
+
+    /* Silence */
+    ff_opus_rc_enc_log(rc, 0, 15);
+
+    /* Pitch filter */
+    if (!f->start_band && opus_rc_tell(rc) + 16 <= f->framebits)
+        celt_enc_quant_pfilter(rc, f);
+
+    /* Transient flag */
+    if (f->size && opus_rc_tell(rc) + 3 <= f->framebits)
+        ff_opus_rc_enc_log(rc, f->transient, 3);
+
+    /* Main encoding */
+    celt_quant_coarse  (f, rc, s->last_quantized_energy);
+    celt_enc_tf        (f, rc);
+    ff_celt_bitalloc   (f, rc, 1);
+    celt_quant_fine    (f, rc);
+    ff_celt_quant_bands(f, rc);
+
+    /* Anticollapse bit */
+    if (f->anticollapse_needed)
+        ff_opus_rc_put_raw(rc, f->anticollapse, 1);
+
+    /* Final per-band energy adjustments from leftover bits */
+    celt_quant_final(s, rc, f);
+
+    for (int ch = 0; ch < f->channels; ch++) {
+        CeltBlock *block = &f->block[ch];
+        for (int i = 0; i < CELT_MAX_BANDS; i++)
+            s->last_quantized_energy[ch][i] = block->energy[i] + block->error_energy[i];
+    }
+}
+
+static inline int write_opuslacing(uint8_t *dst, int v)
+{
+    dst[0] = FFMIN(v - FFALIGN(v - 255, 4), v);
+    dst[1] = v - dst[0] >> 2;
+    return 1 + (v >= 252);
+}
+
+static void opus_packet_assembler(OpusEncContext *s, AVPacket *avpkt)
+{
+    int offset, fsize_needed;
+
+    /* Write toc */
+    opus_gen_toc(s, avpkt->data, &offset, &fsize_needed);
+
+    /* Frame sizes if needed */
+    if (fsize_needed) {
+        for (int i = 0; i < s->packet.frames - 1; i++) {
+            offset += write_opuslacing(avpkt->data + offset,
+                                       s->frame[i].framebits >> 3);
+        }
+    }
+
+    /* Packets */
+    for (int i = 0; i < s->packet.frames; i++) {
+        ff_opus_rc_enc_end(&s->rc[i], avpkt->data + offset,
+                           s->frame[i].framebits >> 3);
+        offset += s->frame[i].framebits >> 3;
+    }
+
+    avpkt->size = offset;
+}
+
+/* Used as overlap for the first frame and padding for the last encoded packet */
+static AVFrame *spawn_empty_frame(OpusEncContext *s)
+{
+    AVFrame *f = av_frame_alloc();
+    if (!f)
+        return NULL;
+    f->format         = s->avctx->sample_fmt;
+    f->nb_samples     = s->avctx->frame_size;
+    f->channel_layout = s->avctx->channel_layout;
+    if (av_frame_get_buffer(f, 4)) {
+        av_frame_free(&f);
+        return NULL;
+    }
+    for (int i = 0; i < s->channels; i++) {
+        size_t bps = av_get_bytes_per_sample(f->format);
+        memset(f->extended_data[i], 0, bps*f->nb_samples);
+    }
+    return f;
+}
+
+static int opus_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                             const AVFrame *frame, int *got_packet_ptr)
+{
+    OpusEncContext *s = avctx->priv_data;
+    int ret, frame_size, alloc_size = 0;
+
+    if (frame) { /* Add new frame to queue */
+        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
+            return ret;
+        ff_bufqueue_add(avctx, &s->bufqueue, av_frame_clone(frame));
+    } else {
+        ff_opus_psy_signal_eof(&s->psyctx);
+        if (!s->afq.remaining_samples || !avctx->frame_number)
+            return 0; /* We've been flushed and there's nothing left to encode */
+    }
+
+    /* Run the psychoacoustic system */
+    if (ff_opus_psy_process(&s->psyctx, &s->packet))
+        return 0;
+
+    frame_size = OPUS_BLOCK_SIZE(s->packet.framesize);
+
+    if (!frame) {
+        /* This can go negative, that's not a problem, we only pad if positive */
+        int pad_empty = s->packet.frames*(frame_size/s->avctx->frame_size) - s->bufqueue.available + 1;
+        /* Pad with empty 2.5 ms frames to whatever framesize was decided,
+         * this should only happen at the very last flush frame. The frames
+         * allocated here will be freed (because they have no other references)
+         * after they get used by celt_frame_setup_input() */
+        for (int i = 0; i < pad_empty; i++) {
+            AVFrame *empty = spawn_empty_frame(s);
+            if (!empty)
+                return AVERROR(ENOMEM);
+            ff_bufqueue_add(avctx, &s->bufqueue, empty);
+        }
+    }
+
+    for (int i = 0; i < s->packet.frames; i++) {
+        celt_encode_frame(s, &s->rc[i], &s->frame[i], i);
+        alloc_size += s->frame[i].framebits >> 3;
+    }
+
+    /* Worst case toc + the frame lengths if needed */
+    alloc_size += 2 + s->packet.frames*2;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, alloc_size, 0)) < 0)
+        return ret;
+
+    /* Assemble packet */
+    opus_packet_assembler(s, avpkt);
+
+    /* Update the psychoacoustic system */
+    ff_opus_psy_postencode_update(&s->psyctx, s->frame, s->rc);
+
+    /* Remove samples from queue and skip if needed */
+    ff_af_queue_remove(&s->afq, s->packet.frames*frame_size, &avpkt->pts, &avpkt->duration);
+    if (s->packet.frames*frame_size > avpkt->duration) {
+        uint8_t *side = av_packet_new_side_data(avpkt, AV_PKT_DATA_SKIP_SAMPLES, 10);
+        if (!side)
+            return AVERROR(ENOMEM);
+        AV_WL32(&side[4], s->packet.frames*frame_size - avpkt->duration + 120);
+    }
+
+    *got_packet_ptr = 1;
+
+    return 0;
+}
+
+static av_cold int opus_encode_end(AVCodecContext *avctx)
+{
+    OpusEncContext *s = avctx->priv_data;
+
+    for (int i = 0; i < CELT_BLOCK_NB; i++)
+        ff_mdct15_uninit(&s->mdct[i]);
+
+    ff_celt_pvq_uninit(&s->pvq);
+    av_freep(&s->dsp);
+    av_freep(&s->frame);
+    av_freep(&s->rc);
+    ff_af_queue_close(&s->afq);
+    ff_opus_psy_end(&s->psyctx);
+    ff_bufqueue_discard_all(&s->bufqueue);
+    av_freep(&avctx->extradata);
+
+    return 0;
+}
+
+static av_cold int opus_encode_init(AVCodecContext *avctx)
+{
+    int ret, max_frames;
+    OpusEncContext *s = avctx->priv_data;
+
+    s->avctx = avctx;
+    s->channels = avctx->channels;
+
+    /* Opus allows us to change the framesize on each packet (and each packet may
+     * have multiple frames in it) but we can't change the codec's frame size on
+     * runtime, so fix it to the lowest possible number of samples and use a queue
+     * to accumulate AVFrames until we have enough to encode whatever the encoder
+     * decides is the best */
+    avctx->frame_size = 120;
+    /* Initial padding will change if SILK is ever supported */
+    avctx->initial_padding = 120;
+
+    if (!avctx->bit_rate) {
+        int coupled = ff_opus_default_coupled_streams[s->channels - 1];
+        avctx->bit_rate = coupled*(96000) + (s->channels - coupled*2)*(48000);
+    } else if (avctx->bit_rate < 6000 || avctx->bit_rate > 255000 * s->channels) {
+        int64_t clipped_rate = av_clip(avctx->bit_rate, 6000, 255000 * s->channels);
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bitrate %"PRId64" kbps, clipping to %"PRId64" kbps\n",
+               avctx->bit_rate/1000, clipped_rate/1000);
+        avctx->bit_rate = clipped_rate;
+    }
+
+    /* Extradata */
+    avctx->extradata_size = 19;
+    avctx->extradata = av_malloc(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+    opus_write_extradata(avctx);
+
+    ff_af_queue_init(avctx, &s->afq);
+
+    if ((ret = ff_celt_pvq_init(&s->pvq, 1)) < 0)
+        return ret;
+
+    if (!(s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT)))
+        return AVERROR(ENOMEM);
+
+    /* I have no idea why a base scaling factor of 68 works, could be the twiddles */
+    for (int i = 0; i < CELT_BLOCK_NB; i++)
+        if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
+            return AVERROR(ENOMEM);
+
+    /* Zero out previous energy (matters for inter first frame) */
+    for (int ch = 0; ch < s->channels; ch++)
+        memset(s->last_quantized_energy[ch], 0.0f, sizeof(float)*CELT_MAX_BANDS);
+
+    /* Allocate an empty frame to use as overlap for the first frame of audio */
+    ff_bufqueue_add(avctx, &s->bufqueue, spawn_empty_frame(s));
+    if (!ff_bufqueue_peek(&s->bufqueue, 0))
+        return AVERROR(ENOMEM);
+
+    if ((ret = ff_opus_psy_init(&s->psyctx, s->avctx, &s->bufqueue, &s->options)))
+        return ret;
+
+    /* Frame structs and range coder buffers */
+    max_frames = ceilf(FFMIN(s->options.max_delay_ms, 120.0f)/2.5f);
+    s->frame = av_malloc(max_frames*sizeof(CeltFrame));
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+    s->rc = av_malloc(max_frames*sizeof(OpusRangeCoder));
+    if (!s->rc)
+        return AVERROR(ENOMEM);
+
+    for (int i = 0; i < max_frames; i++) {
+        s->frame[i].dsp = s->dsp;
+        s->frame[i].avctx = s->avctx;
+        s->frame[i].seed = 0;
+        s->frame[i].pvq = s->pvq;
+        s->frame[i].apply_phase_inv = 1;
+        s->frame[i].block[0].emph_coeff = s->frame[i].block[1].emph_coeff = 0.0f;
+    }
+
+    return 0;
+}
+
+#define OPUSENC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+static const AVOption opusenc_options[] = {
+    { "opus_delay", "Maximum delay in milliseconds", offsetof(OpusEncContext, options.max_delay_ms), AV_OPT_TYPE_FLOAT, { .dbl = OPUS_MAX_LOOKAHEAD }, 2.5f, OPUS_MAX_LOOKAHEAD, OPUSENC_FLAGS, "max_delay_ms" },
+    { NULL },
+};
+
+static const AVClass opusenc_class = {
+    .class_name = "Opus encoder",
+    .item_name  = av_default_item_name,
+    .option     = opusenc_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVCodecDefault opusenc_defaults[] = {
+    { "b", "0" },
+    { "compression_level", "10" },
+    { NULL },
+};
+
+AVCodec ff_opus_encoder = {
+    .name           = "opus",
+    .long_name      = NULL_IF_CONFIG_SMALL("Opus"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_OPUS,
+    .defaults       = opusenc_defaults,
+    .priv_class     = &opusenc_class,
+    .priv_data_size = sizeof(OpusEncContext),
+    .init           = opus_encode_init,
+    .encode2        = opus_encode_frame,
+    .close          = opus_encode_end,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL | AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
+    .supported_samplerates = (const int []){ 48000, 0 },
+    .channel_layouts = (const uint64_t []){ AV_CH_LAYOUT_MONO,
+                                            AV_CH_LAYOUT_STEREO, 0 },
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/opusenc.h b/libavcodec/opusenc.h
new file mode 100644
index 0000000..b9162eb
--- /dev/null
+++ b/libavcodec/opusenc.h
@@ -0,0 +1,54 @@
+/*
+ * Opus encoder
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUSENC_H
+#define AVCODEC_OPUSENC_H
+
+#include "internal.h"
+#include "opus_celt.h"
+
+/* Determines the maximum delay the psychoacoustic system will use for lookahead */
+#define FF_BUFQUEUE_SIZE 145
+#include "libavfilter/bufferqueue.h"
+
+#define OPUS_MAX_LOOKAHEAD ((FF_BUFQUEUE_SIZE - 1)*2.5f)
+
+#define OPUS_MAX_CHANNELS 2
+
+/* 120 ms / 2.5 ms = 48 frames (extremely improbable, but the encoder'll work) */
+#define OPUS_MAX_FRAMES_PER_PACKET 48
+
+#define OPUS_BLOCK_SIZE(x) (2 * 15 * (1 << ((x) + 2)))
+
+#define OPUS_SAMPLES_TO_BLOCK_SIZE(x) (ff_log2((x) / (2 * 15)) - 2)
+
+typedef struct OpusEncOptions {
+    float max_delay_ms;
+} OpusEncOptions;
+
+typedef struct OpusPacketInfo {
+    enum OpusMode mode;
+    enum OpusBandwidth bandwidth;
+    int framesize;
+    int frames;
+} OpusPacketInfo;
+
+#endif /* AVCODEC_OPUSENC_H */
diff --git a/libavcodec/opusenc_psy.c b/libavcodec/opusenc_psy.c
new file mode 100644
index 0000000..5a50db9
--- /dev/null
+++ b/libavcodec/opusenc_psy.c
@@ -0,0 +1,612 @@
+/*
+ * Opus encoder
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "opusenc_psy.h"
+#include "opus_pvq.h"
+#include "opustab.h"
+#include "mdct15.h"
+#include "libavutil/qsort.h"
+
+static float pvq_band_cost(CeltPVQ *pvq, CeltFrame *f, OpusRangeCoder *rc, int band,
+                           float *bits, float lambda)
+{
+    int i, b = 0;
+    uint32_t cm[2] = { (1 << f->blocks) - 1, (1 << f->blocks) - 1 };
+    const int band_size = ff_celt_freq_range[band] << f->size;
+    float buf[176 * 2], lowband_scratch[176], norm1[176], norm2[176];
+    float dist, cost, err_x = 0.0f, err_y = 0.0f;
+    float *X = buf;
+    float *X_orig = f->block[0].coeffs + (ff_celt_freq_bands[band] << f->size);
+    float *Y = (f->channels == 2) ? &buf[176] : NULL;
+    float *Y_orig = f->block[1].coeffs + (ff_celt_freq_bands[band] << f->size);
+    OPUS_RC_CHECKPOINT_SPAWN(rc);
+
+    memcpy(X, X_orig, band_size*sizeof(float));
+    if (Y)
+        memcpy(Y, Y_orig, band_size*sizeof(float));
+
+    f->remaining2 = ((f->framebits << 3) - f->anticollapse_needed) - opus_rc_tell_frac(rc) - 1;
+    if (band <= f->coded_bands - 1) {
+        int curr_balance = f->remaining / FFMIN(3, f->coded_bands - band);
+        b = av_clip_uintp2(FFMIN(f->remaining2 + 1, f->pulses[band] + curr_balance), 14);
+    }
+
+    if (f->dual_stereo) {
+        pvq->quant_band(pvq, f, rc, band, X, NULL, band_size, b / 2, f->blocks, NULL,
+                        f->size, norm1, 0, 1.0f, lowband_scratch, cm[0]);
+
+        pvq->quant_band(pvq, f, rc, band, Y, NULL, band_size, b / 2, f->blocks, NULL,
+                        f->size, norm2, 0, 1.0f, lowband_scratch, cm[1]);
+    } else {
+        pvq->quant_band(pvq, f, rc, band, X, Y, band_size, b, f->blocks, NULL, f->size,
+                        norm1, 0, 1.0f, lowband_scratch, cm[0] | cm[1]);
+    }
+
+    for (i = 0; i < band_size; i++) {
+        err_x += (X[i] - X_orig[i])*(X[i] - X_orig[i]);
+        if (Y)
+            err_y += (Y[i] - Y_orig[i])*(Y[i] - Y_orig[i]);
+    }
+
+    dist = sqrtf(err_x) + sqrtf(err_y);
+    cost = OPUS_RC_CHECKPOINT_BITS(rc)/8.0f;
+    *bits += cost;
+
+    OPUS_RC_CHECKPOINT_ROLLBACK(rc);
+
+    return lambda*dist*cost;
+}
+
+/* Populate metrics without taking into consideration neighbouring steps */
+static void step_collect_psy_metrics(OpusPsyContext *s, int index)
+{
+    int silence = 0, ch, i, j;
+    OpusPsyStep *st = s->steps[index];
+
+    st->index = index;
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        const int lap_size = (1 << s->bsize_analysis);
+        for (i = 1; i <= FFMIN(lap_size, index); i++) {
+            const int offset = i*120;
+            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index - i);
+            memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
+        }
+        for (i = 0; i < lap_size; i++) {
+            const int offset = i*120 + lap_size;
+            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + i);
+            memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
+        }
+
+        s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis],
+                            (OPUS_BLOCK_SIZE(s->bsize_analysis) << 1));
+
+        s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1);
+
+        for (i = 0; i < CELT_MAX_BANDS; i++)
+            st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis];
+    }
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            float avg_c_s, energy = 0.0f, dist_dev = 0.0f;
+            const int range = ff_celt_freq_range[i] << s->bsize_analysis;
+            const float *coeffs = st->bands[ch][i];
+            for (j = 0; j < range; j++)
+                energy += coeffs[j]*coeffs[j];
+
+            st->energy[ch][i] += sqrtf(energy);
+            silence |= !!st->energy[ch][i];
+            avg_c_s = energy / range;
+
+            for (j = 0; j < range; j++) {
+                const float c_s = coeffs[j]*coeffs[j];
+                dist_dev += (avg_c_s - c_s)*(avg_c_s - c_s);
+            }
+
+            st->tone[ch][i] += sqrtf(dist_dev);
+        }
+    }
+
+    st->silence = !silence;
+
+    if (s->avctx->channels > 1) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            float incompat = 0.0f;
+            const float *coeffs1 = st->bands[0][i];
+            const float *coeffs2 = st->bands[1][i];
+            const int range = ff_celt_freq_range[i] << s->bsize_analysis;
+            for (j = 0; j < range; j++)
+                incompat += (coeffs1[j] - coeffs2[j])*(coeffs1[j] - coeffs2[j]);
+            st->stereo[i] = sqrtf(incompat);
+        }
+    }
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            OpusBandExcitation *ex = &s->ex[ch][i];
+            float bp_e = bessel_filter(&s->bfilter_lo[ch][i], st->energy[ch][i]);
+            bp_e = bessel_filter(&s->bfilter_hi[ch][i], bp_e);
+            bp_e *= bp_e;
+            if (bp_e > ex->excitation) {
+                st->change_amp[ch][i] = bp_e - ex->excitation;
+                st->total_change += st->change_amp[ch][i];
+                ex->excitation = ex->excitation_init = bp_e;
+                ex->excitation_dist = 0.0f;
+            }
+            if (ex->excitation > 0.0f) {
+                ex->excitation -= av_clipf((1/expf(ex->excitation_dist)), ex->excitation_init/20, ex->excitation_init/1.09);
+                ex->excitation = FFMAX(ex->excitation, 0.0f);
+                ex->excitation_dist += 1.0f;
+            }
+        }
+    }
+}
+
+static void search_for_change_points(OpusPsyContext *s, float tgt_change,
+                                     int offset_s, int offset_e, int resolution,
+                                     int level)
+{
+    int i;
+    float c_change = 0.0f;
+    if ((offset_e - offset_s) <= resolution)
+        return;
+    for (i = offset_s; i < offset_e; i++) {
+        c_change += s->steps[i]->total_change;
+        if (c_change > tgt_change)
+            break;
+    }
+    if (i == offset_e)
+        return;
+    search_for_change_points(s, tgt_change / 2.0f, offset_s, i + 0, resolution, level + 1);
+    s->inflection_points[s->inflection_points_count++] = i;
+    search_for_change_points(s, tgt_change / 2.0f, i + 1, offset_e, resolution, level + 1);
+}
+
+static int flush_silent_frames(OpusPsyContext *s)
+{
+    int fsize, silent_frames;
+
+    for (silent_frames = 0; silent_frames < s->buffered_steps; silent_frames++)
+        if (!s->steps[silent_frames]->silence)
+            break;
+    if (--silent_frames < 0)
+        return 0;
+
+    for (fsize = CELT_BLOCK_960; fsize > CELT_BLOCK_120; fsize--) {
+        if ((1 << fsize) > silent_frames)
+            continue;
+        s->p.frames = FFMIN(silent_frames / (1 << fsize), 48 >> fsize);
+        s->p.framesize = fsize;
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Main function which decides frame size and frames per current packet */
+static void psy_output_groups(OpusPsyContext *s)
+{
+    int max_delay_samples = (s->options->max_delay_ms*s->avctx->sample_rate)/1000;
+    int max_bsize = FFMIN(OPUS_SAMPLES_TO_BLOCK_SIZE(max_delay_samples), CELT_BLOCK_960);
+
+    /* These don't change for now */
+    s->p.mode      = OPUS_MODE_CELT;
+    s->p.bandwidth = OPUS_BANDWIDTH_FULLBAND;
+
+    /* Flush silent frames ASAP */
+    if (s->steps[0]->silence && flush_silent_frames(s))
+        return;
+
+    s->p.framesize = FFMIN(max_bsize, CELT_BLOCK_960);
+    s->p.frames    = 1;
+}
+
+int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p)
+{
+    int i;
+    float total_energy_change = 0.0f;
+
+    if (s->buffered_steps < s->max_steps && !s->eof) {
+        const int awin = (1 << s->bsize_analysis);
+        if (++s->steps_to_process >= awin) {
+            step_collect_psy_metrics(s, s->buffered_steps - awin + 1);
+            s->steps_to_process = 0;
+        }
+        if ((++s->buffered_steps) < s->max_steps)
+            return 1;
+    }
+
+    for (i = 0; i < s->buffered_steps; i++)
+        total_energy_change += s->steps[i]->total_change;
+
+    search_for_change_points(s, total_energy_change / 2.0f, 0,
+                             s->buffered_steps, 1, 0);
+
+    psy_output_groups(s);
+
+    p->frames    = s->p.frames;
+    p->framesize = s->p.framesize;
+    p->mode      = s->p.mode;
+    p->bandwidth = s->p.bandwidth;
+
+    return 0;
+}
+
+void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
+{
+    int i, neighbouring_points = 0, start_offset = 0;
+    int radius = (1 << s->p.framesize), step_offset = radius*index;
+    int silence = 1;
+
+    f->start_band = (s->p.mode == OPUS_MODE_HYBRID) ? 17 : 0;
+    f->end_band   = ff_celt_band_end[s->p.bandwidth];
+    f->channels   = s->avctx->channels;
+    f->size       = s->p.framesize;
+
+    for (i = 0; i < (1 << f->size); i++)
+        silence &= s->steps[index*(1 << f->size) + i]->silence;
+
+    f->silence = silence;
+    if (f->silence) {
+        f->framebits = 0; /* Otherwise the silence flag eats up 16(!) bits */
+        return;
+    }
+
+    for (i = 0; i < s->inflection_points_count; i++) {
+        if (s->inflection_points[i] >= step_offset) {
+            start_offset = i;
+            break;
+        }
+    }
+
+    for (i = start_offset; i < FFMIN(radius, s->inflection_points_count - start_offset); i++) {
+        if (s->inflection_points[i] < (step_offset + radius)) {
+            neighbouring_points++;
+        }
+    }
+
+    /* Transient flagging */
+    f->transient = neighbouring_points > 0;
+    f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
+
+    /* Some sane defaults */
+    f->pfilter   = 0;
+    f->pf_gain   = 0.5f;
+    f->pf_octave = 2;
+    f->pf_period = 1;
+    f->pf_tapset = 2;
+
+    /* More sane defaults */
+    f->tf_select = 0;
+    f->anticollapse = 1;
+    f->alloc_trim = 5;
+    f->skip_band_floor = f->end_band;
+    f->intensity_stereo = f->end_band;
+    f->dual_stereo = 0;
+    f->spread = CELT_SPREAD_NORMAL;
+    memset(f->tf_change, 0, sizeof(int)*CELT_MAX_BANDS);
+    memset(f->alloc_boost, 0, sizeof(int)*CELT_MAX_BANDS);
+}
+
+static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
+                                  CeltFrame *f_out)
+{
+    int i, f, ch;
+    int frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
+    float rate, frame_bits = 0;
+
+    /* Used for the global ROTATE flag */
+    float tonal = 0.0f;
+
+    /* Pseudo-weights */
+    float band_score[CELT_MAX_BANDS] = { 0 };
+    float max_score = 1.0f;
+
+    /* Pass one - one loop around each band, computing unquant stuff */
+    for (i = 0; i < CELT_MAX_BANDS; i++) {
+        float weight = 0.0f;
+        float tonal_contrib = 0.0f;
+        for (f = 0; f < (1 << s->p.framesize); f++) {
+            weight = start[f]->stereo[i];
+            for (ch = 0; ch < s->avctx->channels; ch++) {
+                weight += start[f]->change_amp[ch][i] + start[f]->tone[ch][i] + start[f]->energy[ch][i];
+                tonal_contrib += start[f]->tone[ch][i];
+            }
+        }
+        tonal += tonal_contrib;
+        band_score[i] = weight;
+    }
+
+    tonal /= (float)CELT_MAX_BANDS;
+
+    for (i = 0; i < CELT_MAX_BANDS; i++) {
+        if (band_score[i] > max_score)
+            max_score = band_score[i];
+    }
+
+    for (i = 0; i < CELT_MAX_BANDS; i++) {
+        f_out->alloc_boost[i] = (int)((band_score[i]/max_score)*3.0f);
+        frame_bits += band_score[i]*8.0f;
+    }
+
+    tonal /= 1333136.0f;
+    f_out->spread = av_clip_uintp2(lrintf(tonal), 2);
+
+    rate = ((float)s->avctx->bit_rate) + frame_bits*frame_size*16;
+    rate *= s->lambda;
+    rate /= s->avctx->sample_rate/frame_size;
+
+    f_out->framebits = lrintf(rate);
+    f_out->framebits = FFMIN(f_out->framebits, OPUS_MAX_PACKET_SIZE*8);
+    f_out->framebits = FFALIGN(f_out->framebits, 8);
+}
+
+static int bands_dist(OpusPsyContext *s, CeltFrame *f, float *total_dist)
+{
+    int i, tdist = 0.0f;
+    OpusRangeCoder dump;
+
+    ff_opus_rc_enc_init(&dump);
+    ff_celt_bitalloc(f, &dump, 1);
+
+    for (i = 0; i < CELT_MAX_BANDS; i++) {
+        float bits = 0.0f;
+        float dist = pvq_band_cost(f->pvq, f, &dump, i, &bits, s->lambda);
+        tdist += dist;
+    }
+
+    *total_dist = tdist;
+
+    return 0;
+}
+
+static void celt_search_for_dual_stereo(OpusPsyContext *s, CeltFrame *f)
+{
+    float td1, td2;
+    f->dual_stereo = 0;
+
+    if (s->avctx->channels < 2)
+        return;
+
+    bands_dist(s, f, &td1);
+    f->dual_stereo = 1;
+    bands_dist(s, f, &td2);
+
+    f->dual_stereo = td2 < td1;
+    s->dual_stereo_used += td2 < td1;
+}
+
+static void celt_search_for_intensity(OpusPsyContext *s, CeltFrame *f)
+{
+    int i, best_band = CELT_MAX_BANDS - 1;
+    float dist, best_dist = FLT_MAX;
+    /* TODO: fix, make some heuristic up here using the lambda value */
+    float end_band = 0;
+
+    if (s->avctx->channels < 2)
+        return;
+
+    for (i = f->end_band; i >= end_band; i--) {
+        f->intensity_stereo = i;
+        bands_dist(s, f, &dist);
+        if (best_dist > dist) {
+            best_dist = dist;
+            best_band = i;
+        }
+    }
+
+    f->intensity_stereo = best_band;
+    s->avg_is_band = (s->avg_is_band + f->intensity_stereo)/2.0f;
+}
+
+static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame *f)
+{
+    int i, j, k, cway, config[2][CELT_MAX_BANDS] = { { 0 } };
+    float score[2] = { 0 };
+
+    for (cway = 0; cway < 2; cway++) {
+        int mag[2];
+        int base = f->transient ? 120 : 960;
+
+        for (i = 0; i < 2; i++) {
+            int c = ff_celt_tf_select[f->size][f->transient][cway][i];
+            mag[i] = c < 0 ? base >> FFABS(c) : base << FFABS(c);
+        }
+
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            float iscore0 = 0.0f;
+            float iscore1 = 0.0f;
+            for (j = 0; j < (1 << f->size); j++) {
+                for (k = 0; k < s->avctx->channels; k++) {
+                    iscore0 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[0];
+                    iscore1 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[1];
+                }
+            }
+            config[cway][i] = FFABS(iscore0 - 1.0f) < FFABS(iscore1 - 1.0f);
+            score[cway] += config[cway][i] ? iscore1 : iscore0;
+        }
+    }
+
+    f->tf_select = score[0] < score[1];
+    memcpy(f->tf_change, config[f->tf_select], sizeof(int)*CELT_MAX_BANDS);
+
+    return 0;
+}
+
+int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index)
+{
+    int start_transient_flag = f->transient;
+    OpusPsyStep **start = &s->steps[index * (1 << s->p.framesize)];
+
+    if (f->silence)
+        return 0;
+
+    celt_gauge_psy_weight(s, start, f);
+    celt_search_for_intensity(s, f);
+    celt_search_for_dual_stereo(s, f);
+    celt_search_for_tf(s, start, f);
+
+    if (f->transient != start_transient_flag) {
+        f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
+        s->redo_analysis = 1;
+        return 1;
+    }
+
+    s->redo_analysis = 0;
+
+    return 0;
+}
+
+void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc)
+{
+    int i, frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
+    int steps_out = s->p.frames*(frame_size/120);
+    void *tmp[FF_BUFQUEUE_SIZE];
+    float ideal_fbits;
+
+    for (i = 0; i < steps_out; i++)
+        memset(s->steps[i], 0, sizeof(OpusPsyStep));
+
+    for (i = 0; i < s->max_steps; i++)
+        tmp[i] = s->steps[i];
+
+    for (i = 0; i < s->max_steps; i++) {
+        const int i_new = i - steps_out;
+        s->steps[i_new < 0 ? s->max_steps + i_new : i_new] = tmp[i];
+    }
+
+    for (i = steps_out; i < s->buffered_steps; i++)
+        s->steps[i]->index -= steps_out;
+
+    ideal_fbits = s->avctx->bit_rate/(s->avctx->sample_rate/frame_size);
+
+    for (i = 0; i < s->p.frames; i++) {
+        s->avg_is_band += f[i].intensity_stereo;
+        s->lambda *= ideal_fbits / f[i].framebits;
+    }
+
+    s->avg_is_band /= (s->p.frames + 1);
+
+    s->cs_num = 0;
+    s->steps_to_process = 0;
+    s->buffered_steps -= steps_out;
+    s->total_packets_out += s->p.frames;
+    s->inflection_points_count = 0;
+}
+
+av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
+                             struct FFBufQueue *bufqueue, OpusEncOptions *options)
+{
+    int i, ch, ret;
+
+    s->redo_analysis = 0;
+    s->lambda = 1.0f;
+    s->options = options;
+    s->avctx = avctx;
+    s->bufqueue = bufqueue;
+    s->max_steps = ceilf(s->options->max_delay_ms/2.5f);
+    s->bsize_analysis = CELT_BLOCK_960;
+    s->avg_is_band = CELT_MAX_BANDS - 1;
+    s->inflection_points_count = 0;
+
+    s->inflection_points = av_mallocz(sizeof(*s->inflection_points)*s->max_steps);
+    if (!s->inflection_points) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->dsp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            bessel_init(&s->bfilter_hi[ch][i], 1.0f, 19.0f, 100.0f, 1);
+            bessel_init(&s->bfilter_lo[ch][i], 1.0f, 20.0f, 100.0f, 0);
+        }
+    }
+
+    for (i = 0; i < s->max_steps; i++) {
+        s->steps[i] = av_mallocz(sizeof(OpusPsyStep));
+        if (!s->steps[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+    }
+
+    for (i = 0; i < CELT_BLOCK_NB; i++) {
+        float tmp;
+        const int len = OPUS_BLOCK_SIZE(i);
+        s->window[i] = av_malloc(2*len*sizeof(float));
+        if (!s->window[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        generate_window_func(s->window[i], 2*len, WFUNC_SINE, &tmp);
+        if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
+            goto fail;
+    }
+
+    return 0;
+
+fail:
+    av_freep(&s->inflection_points);
+    av_freep(&s->dsp);
+
+    for (i = 0; i < CELT_BLOCK_NB; i++) {
+        ff_mdct15_uninit(&s->mdct[i]);
+        av_freep(&s->window[i]);
+    }
+
+    for (i = 0; i < s->max_steps; i++)
+        av_freep(&s->steps[i]);
+
+    return ret;
+}
+
+void ff_opus_psy_signal_eof(OpusPsyContext *s)
+{
+    s->eof = 1;
+}
+
+av_cold int ff_opus_psy_end(OpusPsyContext *s)
+{
+    int i;
+
+    av_freep(&s->inflection_points);
+    av_freep(&s->dsp);
+
+    for (i = 0; i < CELT_BLOCK_NB; i++) {
+        ff_mdct15_uninit(&s->mdct[i]);
+        av_freep(&s->window[i]);
+    }
+
+    for (i = 0; i < s->max_steps; i++)
+        av_freep(&s->steps[i]);
+
+    av_log(s->avctx, AV_LOG_INFO, "Average Intensity Stereo band: %0.1f\n", s->avg_is_band);
+    av_log(s->avctx, AV_LOG_INFO, "Dual Stereo used: %0.2f%%\n", ((float)s->dual_stereo_used/s->total_packets_out)*100.0f);
+
+    return 0;
+}
diff --git a/libavcodec/opusenc_psy.h b/libavcodec/opusenc_psy.h
new file mode 100644
index 0000000..b91e4f1
--- /dev/null
+++ b/libavcodec/opusenc_psy.h
@@ -0,0 +1,104 @@
+/*
+ * Opus encoder
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUSENC_PSY_H
+#define AVCODEC_OPUSENC_PSY_H
+
+#include "opusenc.h"
+#include "opusenc_utils.h"
+#include "libavfilter/window_func.h"
+
+/* Each step is 2.5ms */
+typedef struct OpusPsyStep {
+    int   index; /* Current index */
+    int   silence;
+    float energy[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; /* Masking effects included */
+    float tone[OPUS_MAX_CHANNELS][CELT_MAX_BANDS];   /* Tonality */
+    float stereo[CELT_MAX_BANDS];                    /* IS/MS compatibility */
+    float change_amp[OPUS_MAX_CHANNELS][CELT_MAX_BANDS]; /* Jump over last frame */
+    float total_change; /* Total change */
+
+    float *bands[OPUS_MAX_CHANNELS][CELT_MAX_BANDS];
+    float coeffs[OPUS_MAX_CHANNELS][OPUS_BLOCK_SIZE(CELT_BLOCK_960)];
+} OpusPsyStep;
+
+typedef struct OpusBandExcitation {
+    float excitation;
+    float excitation_dist;
+    float excitation_init;
+} OpusBandExcitation;
+
+typedef struct PsyChain {
+    int start;
+    int end;
+} PsyChain;
+
+typedef struct OpusPsyContext {
+    AVCodecContext *avctx;
+    AVFloatDSPContext *dsp;
+    struct FFBufQueue *bufqueue;
+    OpusEncOptions *options;
+
+    PsyChain cs[128];
+    int cs_num;
+
+    OpusBandExcitation ex[OPUS_MAX_CHANNELS][CELT_MAX_BANDS];
+    FFBesselFilter bfilter_lo[OPUS_MAX_CHANNELS][CELT_MAX_BANDS];
+    FFBesselFilter bfilter_hi[OPUS_MAX_CHANNELS][CELT_MAX_BANDS];
+
+    OpusPsyStep *steps[FF_BUFQUEUE_SIZE + 1];
+    int max_steps;
+
+    float *window[CELT_BLOCK_NB];
+    MDCT15Context *mdct[CELT_BLOCK_NB];
+    int bsize_analysis;
+
+    DECLARE_ALIGNED(32, float, scratch)[2048];
+
+    /* Stats */
+    float rc_waste;
+    float avg_is_band;
+    int64_t dual_stereo_used;
+    int64_t total_packets_out;
+
+    /* State */
+    FFBesselFilter lambda_lp;
+    OpusPacketInfo p;
+    int redo_analysis;
+    int buffered_steps;
+    int steps_to_process;
+    int eof;
+    float lambda;
+    int *inflection_points;
+    int inflection_points_count;
+} OpusPsyContext;
+
+int  ff_opus_psy_process           (OpusPsyContext *s, OpusPacketInfo *p);
+void ff_opus_psy_celt_frame_init   (OpusPsyContext *s, CeltFrame *f, int index);
+int  ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index);
+void ff_opus_psy_postencode_update (OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc);
+
+int  ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
+                      struct FFBufQueue *bufqueue, OpusEncOptions *options);
+void ff_opus_psy_signal_eof(OpusPsyContext *s);
+int  ff_opus_psy_end(OpusPsyContext *s);
+
+#endif /* AVCODEC_OPUSENC_PSY_H */
diff --git a/libavcodec/opusenc_utils.h b/libavcodec/opusenc_utils.h
new file mode 100644
index 0000000..be82e13
--- /dev/null
+++ b/libavcodec/opusenc_utils.h
@@ -0,0 +1,87 @@
+/*
+ * Opus encoder
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUSENC_UTILS_H
+#define AVCODEC_OPUSENC_UTILS_H
+
+#include "opus.h"
+
+typedef struct FFBesselFilter {
+    float a[3];
+    float b[2];
+    float x[3];
+    float y[3];
+} FFBesselFilter;
+
+/* Fills the coefficients, returns 1 if filter will be unstable */
+static inline int bessel_reinit(FFBesselFilter *s, float n, float f0, float fs,
+                                int highpass)
+{
+    int unstable;
+    float c, cfreq, w0, k1, k2;
+
+    if (!highpass) {
+        c = (1.0f/sqrtf(sqrtf(pow(2.0f, 1.0f/n) - 3.0f/4.0f) - 0.5f))/sqrtf(3.0f);
+        cfreq = c*f0/fs;
+        unstable = (cfreq <= 0.0f || cfreq >= 1.0f/4.0f);
+    } else {
+        c = sqrtf(3.0f)*sqrtf(sqrtf(pow(2.0f, 1.0f/n) - 3.0f/4.0f) - 0.5f);
+        cfreq = 0.5f - c*f0/fs;
+        unstable = (cfreq <= 3.0f/8.0f || cfreq >= 1.0f/2.0f);
+    }
+
+    w0 = tanf(M_PI*cfreq);
+    k1 = 3.0f * w0;
+    k2 = 3.0f * w0;
+
+    s->a[0] = k2/(1.0f + k1 + k2);
+    s->a[1] = 2.0f * s->a[0];
+    s->a[2] = s->a[0];
+    s->b[0] = 2.0f * s->a[0] * (1.0f/k2 - 1.0f);
+    s->b[1] = 1.0f - (s->a[0] + s->a[1] + s->a[2] + s->b[0]);
+
+    if (highpass) {
+        s->a[1] *= -1;
+        s->b[0] *= -1;
+    }
+
+    return unstable;
+}
+
+static inline int bessel_init(FFBesselFilter *s, float n, float f0, float fs,
+                              int highpass)
+{
+    memset(s, 0, sizeof(FFBesselFilter));
+    return bessel_reinit(s, n, f0, fs, highpass);
+}
+
+static inline float bessel_filter(FFBesselFilter *s, float x)
+{
+    s->x[2] = s->x[1];
+    s->x[1] = s->x[0];
+    s->x[0] = x;
+    s->y[2] = s->y[1];
+    s->y[1] = s->y[0];
+    s->y[0] = s->a[0]*s->x[0] + s->a[1]*s->x[1] + s->a[2]*s->x[2] + s->b[0]*s->y[1] + s->b[1]*s->y[2];
+    return s->y[0];
+}
+
+#endif /* AVCODEC_OPUSENC_UTILS_H */
diff --git a/libavcodec/opustab.c b/libavcodec/opustab.c
new file mode 100644
index 0000000..fb340e0
--- /dev/null
+++ b/libavcodec/opustab.c
@@ -0,0 +1,1158 @@
+/*
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "opustab.h"
+
+const uint8_t ff_opus_default_coupled_streams[] = { 0, 1, 1, 2, 2, 2, 2, 3 };
+
+const uint8_t ff_celt_band_end[] = { 13, 17, 17, 19, 21 };
+
+const uint16_t ff_silk_model_stereo_s1[] = {
+    256,   7,   9,  10,  11,  12,  22,  46,  54,  55,  56,  59,  82, 174, 197, 200,
+    201, 202, 210, 234, 244, 245, 246, 247, 249, 256
+};
+
+const uint16_t ff_silk_model_stereo_s2[] = {256, 85, 171, 256};
+
+const uint16_t ff_silk_model_stereo_s3[] = {256, 51, 102, 154, 205, 256};
+
+const uint16_t ff_silk_model_mid_only[] = {256, 192, 256};
+
+const uint16_t ff_silk_model_frame_type_inactive[] = {256, 26, 256};
+
+const uint16_t ff_silk_model_frame_type_active[] = {256, 24, 98, 246, 256};
+
+const uint16_t ff_silk_model_gain_highbits[3][9] = {
+    {256,  32, 144, 212, 241, 253, 254, 255, 256},
+    {256,   2,  19,  64, 124, 186, 233, 252, 256},
+    {256,   1,   4,  30, 101, 195, 245, 254, 256}
+};
+
+const uint16_t ff_silk_model_gain_lowbits[] = {256, 32, 64, 96, 128, 160, 192, 224, 256};
+
+const uint16_t ff_silk_model_gain_delta[] = {
+    256,   6,  11,  22,  53, 185, 206, 214, 218, 221, 223, 225, 227, 228, 229, 230,
+    231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+    247, 248, 249, 250, 251, 252, 253, 254, 255, 256
+};
+const uint16_t ff_silk_model_lsf_s1[2][2][33] = {
+    {
+        {    // NB or MB, unvoiced
+            256,  44,  78, 108, 127, 148, 160, 171, 174, 177, 179, 195, 197, 199, 200, 205,
+            207, 208, 211, 214, 215, 216, 218, 220, 222, 225, 226, 235, 244, 246, 253, 255, 256
+        }, { // NB or MB, voiced
+            256,   1,  11,  12,  20,  23,  31,  39,  53,  66,  80,  81,  95, 107, 120, 131,
+            142, 154, 165, 175, 185, 196, 204, 213, 221, 228, 236, 237, 238, 244, 245, 251, 256
+        }
+    }, {
+        {    // WB, unvoiced
+            256,  31,  52,  55,  72,  73,  81,  98, 102, 103, 121, 137, 141, 143, 146, 147,
+            157, 158, 161, 177, 188, 204, 206, 208, 211, 213, 224, 225, 229, 238, 246, 253, 256
+        }, { // WB, voiced
+            256,   1,   5,  21,  26,  44,  55,  60,  74,  89,  90,  93, 105, 118, 132, 146,
+            152, 166, 178, 180, 186, 187, 199, 211, 222, 232, 235, 245, 250, 251, 252, 253, 256
+        }
+    }
+};
+
+const uint16_t ff_silk_model_lsf_s2[32][10] = {
+    // NB, MB
+    { 256,   1,   2,   3,  18, 242, 253, 254, 255, 256 },
+    { 256,   1,   2,   4,  38, 221, 253, 254, 255, 256 },
+    { 256,   1,   2,   6,  48, 197, 252, 254, 255, 256 },
+    { 256,   1,   2,  10,  62, 185, 246, 254, 255, 256 },
+    { 256,   1,   4,  20,  73, 174, 248, 254, 255, 256 },
+    { 256,   1,   4,  21,  76, 166, 239, 254, 255, 256 },
+    { 256,   1,   8,  32,  85, 159, 226, 252, 255, 256 },
+    { 256,   1,   2,  20,  83, 161, 219, 249, 255, 256 },
+
+    // WB
+    { 256,   1,   2,   3,  12, 244, 253, 254, 255, 256 },
+    { 256,   1,   2,   4,  32, 218, 253, 254, 255, 256 },
+    { 256,   1,   2,   5,  47, 199, 252, 254, 255, 256 },
+    { 256,   1,   2,  12,  61, 187, 252, 254, 255, 256 },
+    { 256,   1,   5,  24,  72, 172, 249, 254, 255, 256 },
+    { 256,   1,   2,  16,  70, 170, 242, 254, 255, 256 },
+    { 256,   1,   2,  17,  78, 165, 226, 251, 255, 256 },
+    { 256,   1,   8,  29,  79, 156, 237, 254, 255, 256 }
+};
+
+const uint16_t ff_silk_model_lsf_s2_ext[] = { 256, 156, 216, 240, 249, 253, 255, 256 };
+
+const uint16_t ff_silk_model_lsf_interpolation_offset[] = { 256, 13, 35, 64, 75, 256 };
+
+const uint16_t ff_silk_model_pitch_highbits[] = {
+    256,   3,   6,  12,  23,  44,  74, 106, 125, 136, 146, 158, 171, 184, 196, 207,
+    216, 224, 231, 237, 241, 243, 245, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256
+};
+
+const uint16_t ff_silk_model_pitch_lowbits_nb[] = { 256, 64, 128, 192, 256 };
+
+const uint16_t ff_silk_model_pitch_lowbits_mb[] = { 256, 43, 85, 128, 171, 213, 256 };
+
+const uint16_t ff_silk_model_pitch_lowbits_wb[] = { 256, 32, 64, 96, 128, 160, 192, 224, 256 };
+
+const uint16_t ff_silk_model_pitch_delta[] = {
+    256,  46,  48,  50,  53,  57,  63,  73,  88, 114, 152, 182, 204, 219, 229, 236,
+    242, 246, 250, 252, 254, 256
+};
+
+const uint16_t ff_silk_model_pitch_contour_nb10ms[] = { 256, 143, 193, 256 };
+
+const uint16_t ff_silk_model_pitch_contour_nb20ms[] = {
+    256,  68,  80, 101, 118, 137, 159, 189, 213, 230, 246, 256
+};
+
+const uint16_t ff_silk_model_pitch_contour_mbwb10ms[] = {
+    256,  91, 137, 176, 195, 209, 221, 229, 236, 242, 247, 252, 256
+};
+
+const uint16_t ff_silk_model_pitch_contour_mbwb20ms[] = {
+    256,  33,  55,  73,  89, 104, 118, 132, 145, 158, 168, 177, 186, 194, 200, 206,
+    212, 217, 221, 225, 229, 232, 235, 238, 240, 242, 244, 246, 248, 250, 252, 253,
+    254, 255, 256
+};
+
+const uint16_t ff_silk_model_ltp_filter[] = { 256, 77, 157, 256 };
+
+const uint16_t ff_silk_model_ltp_filter0_sel[] = {
+    256, 185, 200, 213, 226, 235, 244, 250, 256
+};
+
+const uint16_t ff_silk_model_ltp_filter1_sel[] = {
+    256,  57,  91, 112, 132, 147, 160, 172, 185, 195, 205, 214, 224, 233, 241, 248, 256
+};
+
+const uint16_t ff_silk_model_ltp_filter2_sel[] = {
+    256,  15,  31,  45,  57,  69,  81,  92, 103, 114, 124, 133, 142, 151, 160, 168,
+    176, 184, 192, 199, 206, 212, 218, 223, 227, 232, 236, 240, 244, 247, 251, 254, 256
+};
+
+const uint16_t ff_silk_model_ltp_scale_index[] = { 256, 128, 192, 256 };
+
+const uint16_t ff_silk_model_lcg_seed[] = { 256, 64, 128, 192, 256 };
+
+const uint16_t ff_silk_model_exc_rate[2][10] = {
+    { 256,  15,  66,  78, 124, 169, 182, 215, 242, 256 }, // unvoiced
+    { 256,  33,  63,  99, 116, 150, 199, 217, 238, 256 }  // voiced
+};
+
+const uint16_t ff_silk_model_pulse_count[11][19] = {
+    { 256, 131, 205, 230, 238, 241, 244, 245, 246,
+      247, 248, 249, 250, 251, 252, 253, 254, 255, 256 },
+    { 256,  58, 151, 211, 234, 241, 244, 245, 246,
+      247, 248, 249, 250, 251, 252, 253, 254, 255, 256 },
+    { 256,  43,  94, 140, 173, 197, 213, 224, 232,
+      238, 241, 244, 247, 249, 250, 251, 253, 254, 256 },
+    { 256,  17,  69, 140, 197, 228, 240, 245, 246,
+      247, 248, 249, 250, 251, 252, 253, 254, 255, 256 },
+    { 256,   6,  27,  68, 121, 170, 205, 226, 237,
+      243, 246, 248, 250, 251, 252, 253, 254, 255, 256 },
+    { 256,   7,  21,  43,  71, 100, 128, 153, 173,
+      190, 203, 214, 223, 230, 235, 239, 243, 246, 256 },
+    { 256,   2,   7,  21,  50,  92, 138, 179, 210,
+      229, 240, 246, 249, 251, 252, 253, 254, 255, 256 },
+    { 256,   1,   3,   7,  17,  36,  65, 100, 137,
+      171, 199, 219, 233, 241, 246, 250, 252, 254, 256 },
+    { 256,   1,   3,   5,  10,  19,  33,  53,  77,
+      104, 132, 158, 181, 201, 216, 227, 235, 241, 256 },
+    { 256,   1,   2,   3,   9,  36,  94, 150, 189,
+      214, 228, 238, 244, 247, 250, 252, 253, 254, 256 },
+    { 256,   2,   3,   9,  36,  94, 150, 189, 214,
+      228, 238, 244, 247, 250, 252, 253, 254, 256, 256 }
+};
+
+const uint16_t ff_silk_model_pulse_location[4][168] = {
+    {
+        256, 126, 256,
+        256, 56, 198, 256,
+        256, 25, 126, 230, 256,
+        256, 12, 72, 180, 244, 256,
+        256, 7, 42, 126, 213, 250, 256,
+        256, 4, 24, 83, 169, 232, 253, 256,
+        256, 3, 15, 53, 125, 200, 242, 254, 256,
+        256, 2, 10, 35, 89, 162, 221, 248, 255, 256,
+        256, 2, 7, 24, 63, 126, 191, 233, 251, 255, 256,
+        256, 1, 5, 17, 45, 94, 157, 211, 241, 252, 255, 256,
+        256, 1, 5, 13, 33, 70, 125, 182, 223, 245, 253, 255, 256,
+        256, 1, 4, 11, 26, 54, 98, 151, 199, 232, 248, 254, 255, 256,
+        256, 1, 3, 9, 21, 42, 77, 124, 172, 212, 237, 249, 254, 255, 256,
+        256, 1, 2, 6, 16, 33, 60, 97, 144, 187, 220, 241, 250, 254, 255, 256,
+        256, 1, 2, 3, 11, 25, 47, 80, 120, 163, 201, 229, 245, 253, 254, 255, 256,
+        256, 1, 2, 3, 4, 17, 35, 62, 98, 139, 180, 214, 238, 252, 253, 254, 255, 256
+    },{
+        256, 127, 256,
+        256, 53, 202, 256,
+        256, 22, 127, 233, 256,
+        256, 11, 72, 183, 246, 256,
+        256, 6, 41, 127, 215, 251, 256,
+        256, 4, 24, 83, 170, 232, 253, 256,
+        256, 3, 16, 56, 127, 200, 241, 254, 256,
+        256, 3, 12, 39, 92, 162, 218, 246, 255, 256,
+        256, 3, 11, 30, 67, 124, 185, 229, 249, 255, 256,
+        256, 3, 10, 25, 53, 97, 151, 200, 233, 250, 255, 256,
+        256, 1, 8, 21, 43, 77, 123, 171, 209, 237, 251, 255, 256,
+        256, 1, 2, 13, 35, 62, 97, 139, 186, 219, 244, 254, 255, 256,
+        256, 1, 2, 8, 22, 48, 85, 128, 171, 208, 234, 248, 254, 255, 256,
+        256, 1, 2, 6, 16, 36, 67, 107, 149, 189, 220, 240, 250, 254, 255, 256,
+        256, 1, 2, 5, 13, 29, 55, 90, 128, 166, 201, 227, 243, 251, 254, 255, 256,
+        256, 1, 2, 4, 10, 22, 43, 73, 109, 147, 183, 213, 234, 246, 252, 254, 255, 256
+    },{
+        256, 127, 256,
+        256, 49, 206, 256,
+        256, 20, 127, 236, 256,
+        256, 11, 71, 184, 246, 256,
+        256, 7, 43, 127, 214, 250, 256,
+        256, 6, 30, 87, 169, 229, 252, 256,
+        256, 5, 23, 62, 126, 194, 236, 252, 256,
+        256, 6, 20, 49, 96, 157, 209, 239, 253, 256,
+        256, 1, 16, 39, 74, 125, 175, 215, 245, 255, 256,
+        256, 1, 2, 23, 55, 97, 149, 195, 236, 254, 255, 256,
+        256, 1, 7, 23, 50, 86, 128, 170, 206, 233, 249, 255, 256,
+        256, 1, 6, 18, 39, 70, 108, 148, 186, 217, 238, 250, 255, 256,
+        256, 1, 4, 13, 30, 56, 90, 128, 166, 200, 226, 243, 252, 255, 256,
+        256, 1, 4, 11, 25, 47, 76, 110, 146, 180, 209, 231, 245, 252, 255, 256,
+        256, 1, 3, 8, 19, 37, 62, 93, 128, 163, 194, 219, 237, 248, 253, 255, 256,
+        256, 1, 2, 6, 15, 30, 51, 79, 111, 145, 177, 205, 226, 241, 250, 254, 255, 256
+    },{
+        256, 128, 256,
+        256, 42, 214, 256,
+        256, 21, 128, 235, 256,
+        256, 12, 72, 184, 245, 256,
+        256, 8, 42, 128, 214, 249, 256,
+        256, 8, 31, 86, 176, 231, 251, 256,
+        256, 5, 20, 58, 130, 202, 238, 253, 256,
+        256, 6, 18, 45, 97, 174, 221, 241, 251, 256,
+        256, 6, 25, 53, 88, 128, 168, 203, 231, 250, 256,
+        256, 4, 18, 40, 71, 108, 148, 185, 216, 238, 252, 256,
+        256, 3, 13, 31, 57, 90, 128, 166, 199, 225, 243, 253, 256,
+        256, 2, 10, 23, 44, 73, 109, 147, 183, 212, 233, 246, 254, 256,
+        256, 1, 6, 16, 33, 58, 90, 128, 166, 198, 223, 240, 250, 255, 256,
+        256, 1, 5, 12, 25, 46, 75, 110, 146, 181, 210, 231, 244, 251, 255, 256,
+        256, 1, 3, 8, 18, 35, 60, 92, 128, 164, 196, 221, 238, 248, 253, 255, 256,
+        256, 1, 3, 7, 14, 27, 48, 76, 110, 146, 180, 208, 229, 242, 249, 253, 255, 256
+    }
+};
+
+const uint16_t ff_silk_model_excitation_lsb[] = {256, 136, 256};
+
+const uint16_t ff_silk_model_excitation_sign[3][2][7][3] = {
+    {    // Inactive
+        {    // Low offset
+            {256,   2, 256},
+            {256, 207, 256},
+            {256, 189, 256},
+            {256, 179, 256},
+            {256, 174, 256},
+            {256, 163, 256},
+            {256, 157, 256}
+        }, { // High offset
+            {256,  58, 256},
+            {256, 245, 256},
+            {256, 238, 256},
+            {256, 232, 256},
+            {256, 225, 256},
+            {256, 220, 256},
+            {256, 211, 256}
+        }
+    }, { // Unvoiced
+        {    // Low offset
+            {256,   1, 256},
+            {256, 210, 256},
+            {256, 190, 256},
+            {256, 178, 256},
+            {256, 169, 256},
+            {256, 162, 256},
+            {256, 152, 256}
+        }, { // High offset
+            {256,  48, 256},
+            {256, 242, 256},
+            {256, 235, 256},
+            {256, 224, 256},
+            {256, 214, 256},
+            {256, 205, 256},
+            {256, 190, 256}
+        }
+    }, { // Voiced
+        {    // Low offset
+            {256,   1, 256},
+            {256, 162, 256},
+            {256, 152, 256},
+            {256, 147, 256},
+            {256, 144, 256},
+            {256, 141, 256},
+            {256, 138, 256}
+        }, { // High offset
+            {256,   8, 256},
+            {256, 203, 256},
+            {256, 187, 256},
+            {256, 176, 256},
+            {256, 168, 256},
+            {256, 161, 256},
+            {256, 154, 256}
+        }
+    }
+};
+
+const int16_t ff_silk_stereo_weights[] = {
+    -13732, -10050,  -8266,  -7526,  -6500,  -5000,  -2950,   -820,
+       820,   2950,   5000,   6500,   7526,   8266,  10050,  13732
+};
+
+const uint8_t ff_silk_lsf_s2_model_sel_nbmb[32][10] = {
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1, 3, 1, 2, 2, 1, 2, 1, 1, 1 },
+    { 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+    { 1, 2, 2, 2, 2, 1, 2, 1, 1, 1 },
+    { 2, 3, 3, 3, 3, 2, 2, 2, 2, 2 },
+    { 0, 5, 3, 3, 2, 2, 2, 2, 1, 1 },
+    { 0, 2, 2, 2, 2, 2, 2, 2, 2, 1 },
+    { 2, 3, 6, 4, 4, 4, 5, 4, 5, 5 },
+    { 2, 4, 5, 5, 4, 5, 4, 6, 4, 4 },
+    { 2, 4, 4, 7, 4, 5, 4, 5, 5, 4 },
+    { 4, 3, 3, 3, 2, 3, 2, 2, 2, 2 },
+    { 1, 5, 5, 6, 4, 5, 4, 5, 5, 5 },
+    { 2, 7, 4, 6, 5, 5, 5, 5, 5, 5 },
+    { 2, 7, 5, 5, 5, 5, 5, 6, 5, 4 },
+    { 3, 3, 5, 4, 4, 5, 4, 5, 4, 4 },
+    { 2, 3, 3, 5, 5, 4, 4, 4, 4, 4 },
+    { 2, 4, 4, 6, 4, 5, 4, 5, 5, 5 },
+    { 2, 5, 4, 6, 5, 5, 5, 4, 5, 4 },
+    { 2, 7, 4, 5, 4, 5, 4, 5, 5, 5 },
+    { 2, 5, 4, 6, 7, 6, 5, 6, 5, 4 },
+    { 3, 6, 7, 4, 6, 5, 5, 6, 4, 5 },
+    { 2, 7, 6, 4, 4, 4, 5, 4, 5, 5 },
+    { 4, 5, 5, 4, 6, 6, 5, 6, 5, 4 },
+    { 2, 5, 5, 6, 5, 6, 4, 6, 4, 4 },
+    { 4, 5, 5, 5, 3, 7, 4, 5, 5, 4 },
+    { 2, 3, 4, 5, 5, 6, 4, 5, 5, 4 },
+    { 2, 3, 2, 3, 3, 4, 2, 3, 3, 3 },
+    { 1, 1, 2, 2, 2, 2, 2, 3, 2, 2 },
+    { 4, 5, 5, 6, 6, 6, 5, 6, 4, 5 },
+    { 3, 5, 5, 4, 4, 4, 4, 3, 3, 2 },
+    { 2, 5, 3, 7, 5, 5, 4, 4, 5, 4 },
+    { 4, 4, 5, 4, 5, 6, 5, 6, 5, 4 }
+};
+
+const uint8_t ff_silk_lsf_s2_model_sel_wb[32][16] = {
+    {  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
+    { 10, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  8, 11 },
+    { 10, 13, 13, 11, 15, 12, 12, 13, 10, 13, 12, 13, 13, 12, 11, 11 },
+    {  8, 10,  9, 10, 10,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  9 },
+    {  8, 14, 13, 12, 14, 12, 15, 13, 12, 12, 12, 13, 13, 12, 12, 11 },
+    {  8, 11, 13, 13, 12, 11, 11, 13, 11, 11, 11, 11, 11, 11, 10, 12 },
+    {  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
+    {  8, 10, 14, 11, 15, 10, 13, 11, 12, 13, 13, 12, 11, 11, 10, 11 },
+    {  8, 14, 10, 14, 14, 12, 13, 12, 14, 13, 12, 12, 13, 11, 11, 11 },
+    { 10,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
+    {  8,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9 },
+    { 10, 10, 11, 12, 13, 11, 11, 11, 11, 11, 11, 11, 10, 10,  9, 11 },
+    { 10, 10, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10,  9, 11 },
+    { 11, 12, 12, 12, 14, 12, 12, 13, 11, 13, 12, 12, 13, 12, 11, 12 },
+    {  8, 14, 12, 13, 12, 15, 13, 10, 14, 13, 15, 12, 12, 11, 13, 11 },
+    {  8,  9,  8,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  9,  8 },
+    {  9, 14, 13, 15, 13, 12, 13, 11, 12, 13, 12, 12, 12, 11, 11, 12 },
+    {  9, 11, 11, 12, 12, 11, 11, 13, 10, 11, 11, 13, 13, 13, 11, 12 },
+    { 10, 11, 11, 10, 10, 10, 11, 10,  9, 10,  9, 10,  9,  9,  9, 12 },
+    {  8, 10, 11, 13, 11, 11, 10, 10, 10,  9,  9,  8,  8,  8,  8,  8 },
+    { 11, 12, 11, 13, 11, 11, 10, 10,  9,  9,  9,  9,  9, 10, 10, 12 },
+    { 10, 14, 11, 15, 15, 12, 13, 12, 13, 11, 13, 11, 11, 10, 11, 11 },
+    { 10, 11, 13, 14, 14, 11, 13, 11, 12, 12, 11, 11, 11, 11, 10, 12 },
+    {  9, 11, 11, 12, 12, 12, 12, 11, 13, 13, 13, 11,  9,  9,  9,  9 },
+    { 10, 13, 11, 14, 14, 12, 15, 12, 12, 13, 11, 12, 12, 11, 11, 11 },
+    {  8, 14,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8 },
+    {  8, 14, 14, 11, 13, 10, 13, 13, 11, 12, 12, 15, 15, 12, 12, 12 },
+    { 11, 11, 15, 11, 13, 12, 11, 11, 11, 10, 10, 11, 11, 11, 10, 11 },
+    {  8,  8,  9,  8,  8,  8, 10,  9, 10,  9,  9, 10, 10, 10,  9,  9 },
+    {  8, 11, 10, 13, 11, 11, 10, 11, 10,  9,  8,  8,  9,  8,  8,  9 },
+    { 11, 13, 13, 12, 15, 13, 11, 11, 10, 11, 10, 10,  9,  8,  9,  8 },
+    { 10, 11, 13, 11, 12, 11, 11, 11, 10,  9, 10, 14, 12,  8,  8,  8 }
+};
+
+const uint8_t ff_silk_lsf_pred_weights_nbmb[2][9] = {
+    {179, 138, 140, 148, 151, 149, 153, 151, 163},
+    {116,  67,  82,  59,  92,  72, 100,  89,  92}
+};
+
+const uint8_t ff_silk_lsf_pred_weights_wb[2][15] = {
+    {175, 148, 160, 176, 178, 173, 174, 164, 177, 174, 196, 182, 198, 192, 182},
+    { 68,  62,  66,  60,  72, 117,  85,  90, 118, 136, 151, 142, 160, 142, 155}
+};
+
+const uint8_t ff_silk_lsf_weight_sel_nbmb[32][9] = {
+    { 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+    { 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1, 1, 1, 0, 0, 0, 0, 1, 0 },
+    { 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+    { 1, 0, 1, 1, 0, 0, 0, 1, 0 },
+    { 0, 1, 1, 0, 0, 1, 1, 0, 0 },
+    { 0, 0, 1, 1, 0, 1, 0, 1, 1 },
+    { 0, 0, 1, 1, 0, 0, 1, 1, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 1, 0, 1, 1, 1, 1, 1, 0 },
+    { 0, 1, 0, 1, 1, 1, 1, 1, 0 },
+    { 0, 1, 1, 1, 1, 1, 1, 1, 0 },
+    { 1, 0, 1, 1, 0, 1, 1, 1, 1 },
+    { 0, 1, 1, 1, 1, 1, 0, 1, 0 },
+    { 0, 0, 1, 1, 0, 1, 0, 1, 0 },
+    { 0, 0, 1, 1, 1, 0, 1, 1, 1 },
+    { 0, 1, 1, 0, 0, 1, 1, 1, 0 },
+    { 0, 0, 0, 1, 1, 1, 0, 1, 0 },
+    { 0, 1, 1, 0, 0, 1, 0, 1, 0 },
+    { 0, 1, 1, 0, 0, 0, 1, 1, 0 },
+    { 0, 0, 0, 0, 0, 1, 1, 1, 1 },
+    { 0, 0, 1, 1, 0, 0, 0, 1, 1 },
+    { 0, 0, 0, 1, 0, 1, 1, 1, 1 },
+    { 0, 1, 1, 1, 1, 1, 1, 1, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 1, 0, 1, 1, 0, 1, 0 },
+    { 1, 0, 0, 1, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 1, 1, 0, 1, 0, 1 },
+    { 1, 0, 1, 1, 0, 1, 1, 1, 1 }
+};
+
+const uint8_t ff_silk_lsf_weight_sel_wb[32][15] = {
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },
+    { 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0 },
+    { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0 },
+    { 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1 },
+    { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0 },
+    { 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0 },
+    { 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0 },
+    { 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 },
+    { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0 },
+    { 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0 },
+    { 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0 },
+    { 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0 },
+    { 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0 },
+    { 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1 },
+    { 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 },
+    { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0 }
+};
+
+const uint8_t ff_silk_lsf_codebook_nbmb[32][10] = {
+    { 12,  35,  60,  83, 108, 132, 157, 180, 206, 228 },
+    { 15,  32,  55,  77, 101, 125, 151, 175, 201, 225 },
+    { 19,  42,  66,  89, 114, 137, 162, 184, 209, 230 },
+    { 12,  25,  50,  72,  97, 120, 147, 172, 200, 223 },
+    { 26,  44,  69,  90, 114, 135, 159, 180, 205, 225 },
+    { 13,  22,  53,  80, 106, 130, 156, 180, 205, 228 },
+    { 15,  25,  44,  64,  90, 115, 142, 168, 196, 222 },
+    { 19,  24,  62,  82, 100, 120, 145, 168, 190, 214 },
+    { 22,  31,  50,  79, 103, 120, 151, 170, 203, 227 },
+    { 21,  29,  45,  65, 106, 124, 150, 171, 196, 224 },
+    { 30,  49,  75,  97, 121, 142, 165, 186, 209, 229 },
+    { 19,  25,  52,  70,  93, 116, 143, 166, 192, 219 },
+    { 26,  34,  62,  75,  97, 118, 145, 167, 194, 217 },
+    { 25,  33,  56,  70,  91, 113, 143, 165, 196, 223 },
+    { 21,  34,  51,  72,  97, 117, 145, 171, 196, 222 },
+    { 20,  29,  50,  67,  90, 117, 144, 168, 197, 221 },
+    { 22,  31,  48,  66,  95, 117, 146, 168, 196, 222 },
+    { 24,  33,  51,  77, 116, 134, 158, 180, 200, 224 },
+    { 21,  28,  70,  87, 106, 124, 149, 170, 194, 217 },
+    { 26,  33,  53,  64,  83, 117, 152, 173, 204, 225 },
+    { 27,  34,  65,  95, 108, 129, 155, 174, 210, 225 },
+    { 20,  26,  72,  99, 113, 131, 154, 176, 200, 219 },
+    { 34,  43,  61,  78,  93, 114, 155, 177, 205, 229 },
+    { 23,  29,  54,  97, 124, 138, 163, 179, 209, 229 },
+    { 30,  38,  56,  89, 118, 129, 158, 178, 200, 231 },
+    { 21,  29,  49,  63,  85, 111, 142, 163, 193, 222 },
+    { 27,  48,  77, 103, 133, 158, 179, 196, 215, 232 },
+    { 29,  47,  74,  99, 124, 151, 176, 198, 220, 237 },
+    { 33,  42,  61,  76,  93, 121, 155, 174, 207, 225 },
+    { 29,  53,  87, 112, 136, 154, 170, 188, 208, 227 },
+    { 24,  30,  52,  84, 131, 150, 166, 186, 203, 229 },
+    { 37,  48,  64,  84, 104, 118, 156, 177, 201, 230 }
+};
+
+const uint8_t ff_silk_lsf_codebook_wb[32][16] = {
+    {  7,  23,  38,  54,  69,  85, 100, 116, 131, 147, 162, 178, 193, 208, 223, 239 },
+    { 13,  25,  41,  55,  69,  83,  98, 112, 127, 142, 157, 171, 187, 203, 220, 236 },
+    { 15,  21,  34,  51,  61,  78,  92, 106, 126, 136, 152, 167, 185, 205, 225, 240 },
+    { 10,  21,  36,  50,  63,  79,  95, 110, 126, 141, 157, 173, 189, 205, 221, 237 },
+    { 17,  20,  37,  51,  59,  78,  89, 107, 123, 134, 150, 164, 184, 205, 224, 240 },
+    { 10,  15,  32,  51,  67,  81,  96, 112, 129, 142, 158, 173, 189, 204, 220, 236 },
+    {  8,  21,  37,  51,  65,  79,  98, 113, 126, 138, 155, 168, 179, 192, 209, 218 },
+    { 12,  15,  34,  55,  63,  78,  87, 108, 118, 131, 148, 167, 185, 203, 219, 236 },
+    { 16,  19,  32,  36,  56,  79,  91, 108, 118, 136, 154, 171, 186, 204, 220, 237 },
+    { 11,  28,  43,  58,  74,  89, 105, 120, 135, 150, 165, 180, 196, 211, 226, 241 },
+    {  6,  16,  33,  46,  60,  75,  92, 107, 123, 137, 156, 169, 185, 199, 214, 225 },
+    { 11,  19,  30,  44,  57,  74,  89, 105, 121, 135, 152, 169, 186, 202, 218, 234 },
+    { 12,  19,  29,  46,  57,  71,  88, 100, 120, 132, 148, 165, 182, 199, 216, 233 },
+    { 17,  23,  35,  46,  56,  77,  92, 106, 123, 134, 152, 167, 185, 204, 222, 237 },
+    { 14,  17,  45,  53,  63,  75,  89, 107, 115, 132, 151, 171, 188, 206, 221, 240 },
+    {  9,  16,  29,  40,  56,  71,  88, 103, 119, 137, 154, 171, 189, 205, 222, 237 },
+    { 16,  19,  36,  48,  57,  76,  87, 105, 118, 132, 150, 167, 185, 202, 218, 236 },
+    { 12,  17,  29,  54,  71,  81,  94, 104, 126, 136, 149, 164, 182, 201, 221, 237 },
+    { 15,  28,  47,  62,  79,  97, 115, 129, 142, 155, 168, 180, 194, 208, 223, 238 },
+    {  8,  14,  30,  45,  62,  78,  94, 111, 127, 143, 159, 175, 192, 207, 223, 239 },
+    { 17,  30,  49,  62,  79,  92, 107, 119, 132, 145, 160, 174, 190, 204, 220, 235 },
+    { 14,  19,  36,  45,  61,  76,  91, 108, 121, 138, 154, 172, 189, 205, 222, 238 },
+    { 12,  18,  31,  45,  60,  76,  91, 107, 123, 138, 154, 171, 187, 204, 221, 236 },
+    { 13,  17,  31,  43,  53,  70,  83, 103, 114, 131, 149, 167, 185, 203, 220, 237 },
+    { 17,  22,  35,  42,  58,  78,  93, 110, 125, 139, 155, 170, 188, 206, 224, 240 },
+    {  8,  15,  34,  50,  67,  83,  99, 115, 131, 146, 162, 178, 193, 209, 224, 239 },
+    { 13,  16,  41,  66,  73,  86,  95, 111, 128, 137, 150, 163, 183, 206, 225, 241 },
+    { 17,  25,  37,  52,  63,  75,  92, 102, 119, 132, 144, 160, 175, 191, 212, 231 },
+    { 19,  31,  49,  65,  83, 100, 117, 133, 147, 161, 174, 187, 200, 213, 227, 242 },
+    { 18,  31,  52,  68,  88, 103, 117, 126, 138, 149, 163, 177, 192, 207, 223, 239 },
+    { 16,  29,  47,  61,  76,  90, 106, 119, 133, 147, 161, 176, 193, 209, 224, 240 },
+    { 15,  21,  35,  50,  61,  73,  86,  97, 110, 119, 129, 141, 175, 198, 218, 237 }
+};
+
+const uint16_t ff_silk_lsf_min_spacing_nbmb[] = {
+    250, 3, 6, 3, 3, 3, 4, 3, 3, 3, 461
+};
+
+const uint16_t ff_silk_lsf_min_spacing_wb[] = {
+    100, 3, 40, 3, 3, 3, 5, 14, 14, 10, 11, 3, 8, 9, 7, 3, 347
+};
+
+const uint8_t ff_silk_lsf_ordering_nbmb[] = {
+    0, 9, 6, 3, 4, 5, 8, 1, 2, 7
+};
+
+const uint8_t ff_silk_lsf_ordering_wb[] = {
+    0, 15, 8, 7, 4, 11, 12, 3, 2, 13, 10, 5, 6, 9, 14, 1
+};
+
+const int16_t ff_silk_cosine[] = { /* (0.12) */
+     4096,  4095,  4091,  4085,
+     4076,  4065,  4052,  4036,
+     4017,  3997,  3973,  3948,
+     3920,  3889,  3857,  3822,
+     3784,  3745,  3703,  3659,
+     3613,  3564,  3513,  3461,
+     3406,  3349,  3290,  3229,
+     3166,  3102,  3035,  2967,
+     2896,  2824,  2751,  2676,
+     2599,  2520,  2440,  2359,
+     2276,  2191,  2106,  2019,
+     1931,  1842,  1751,  1660,
+     1568,  1474,  1380,  1285,
+     1189,  1093,   995,   897,
+      799,   700,   601,   501,
+      401,   301,   201,   101,
+        0,  -101,  -201,  -301,
+     -401,  -501,  -601,  -700,
+     -799,  -897,  -995, -1093,
+    -1189, -1285, -1380, -1474,
+    -1568, -1660, -1751, -1842,
+    -1931, -2019, -2106, -2191,
+    -2276, -2359, -2440, -2520,
+    -2599, -2676, -2751, -2824,
+    -2896, -2967, -3035, -3102,
+    -3166, -3229, -3290, -3349,
+    -3406, -3461, -3513, -3564,
+    -3613, -3659, -3703, -3745,
+    -3784, -3822, -3857, -3889,
+    -3920, -3948, -3973, -3997,
+    -4017, -4036, -4052, -4065,
+    -4076, -4085, -4091, -4095,
+    -4096
+};
+
+const uint16_t ff_silk_pitch_scale[]   = {  4,   6,   8};
+
+const uint16_t ff_silk_pitch_min_lag[] = { 16,  24,  32};
+
+const uint16_t ff_silk_pitch_max_lag[] = {144, 216, 288};
+
+const int8_t ff_silk_pitch_offset_nb10ms[3][2] = {
+    { 0,  0},
+    { 1,  0},
+    { 0,  1}
+};
+
+const int8_t ff_silk_pitch_offset_nb20ms[11][4] = {
+    { 0,  0,  0,  0},
+    { 2,  1,  0, -1},
+    {-1,  0,  1,  2},
+    {-1,  0,  0,  1},
+    {-1,  0,  0,  0},
+    { 0,  0,  0,  1},
+    { 0,  0,  1,  1},
+    { 1,  1,  0,  0},
+    { 1,  0,  0,  0},
+    { 0,  0,  0, -1},
+    { 1,  0,  0, -1}
+};
+
+const int8_t ff_silk_pitch_offset_mbwb10ms[12][2] = {
+    { 0,  0},
+    { 0,  1},
+    { 1,  0},
+    {-1,  1},
+    { 1, -1},
+    {-1,  2},
+    { 2, -1},
+    {-2,  2},
+    { 2, -2},
+    {-2,  3},
+    { 3, -2},
+    {-3,  3}
+};
+
+const int8_t ff_silk_pitch_offset_mbwb20ms[34][4] = {
+    { 0,  0,  0,  0},
+    { 0,  0,  1,  1},
+    { 1,  1,  0,  0},
+    {-1,  0,  0,  0},
+    { 0,  0,  0,  1},
+    { 1,  0,  0,  0},
+    {-1,  0,  0,  1},
+    { 0,  0,  0, -1},
+    {-1,  0,  1,  2},
+    { 1,  0,  0, -1},
+    {-2, -1,  1,  2},
+    { 2,  1,  0, -1},
+    {-2,  0,  0,  2},
+    {-2,  0,  1,  3},
+    { 2,  1, -1, -2},
+    {-3, -1,  1,  3},
+    { 2,  0,  0, -2},
+    { 3,  1,  0, -2},
+    {-3, -1,  2,  4},
+    {-4, -1,  1,  4},
+    { 3,  1, -1, -3},
+    {-4, -1,  2,  5},
+    { 4,  2, -1, -3},
+    { 4,  1, -1, -4},
+    {-5, -1,  2,  6},
+    { 5,  2, -1, -4},
+    {-6, -2,  2,  6},
+    {-5, -2,  2,  5},
+    { 6,  2, -1, -5},
+    {-7, -2,  3,  8},
+    { 6,  2, -2, -6},
+    { 5,  2, -2, -5},
+    { 8,  3, -2, -7},
+    {-9, -3,  3,  9}
+};
+
+const int8_t ff_silk_ltp_filter0_taps[8][5] = {
+    {  4,   6,  24,   7,   5},
+    {  0,   0,   2,   0,   0},
+    { 12,  28,  41,  13,  -4},
+    { -9,  15,  42,  25,  14},
+    {  1,  -2,  62,  41,  -9},
+    {-10,  37,  65,  -4,   3},
+    { -6,   4,  66,   7,  -8},
+    { 16,  14,  38,  -3,  33}
+};
+
+const int8_t ff_silk_ltp_filter1_taps[16][5] = {
+    { 13,  22,  39,  23,  12},
+    { -1,  36,  64,  27,  -6},
+    { -7,  10,  55,  43,  17},
+    {  1,   1,   8,   1,   1},
+    {  6, -11,  74,  53,  -9},
+    {-12,  55,  76, -12,   8},
+    { -3,   3,  93,  27,  -4},
+    { 26,  39,  59,   3,  -8},
+    {  2,   0,  77,  11,   9},
+    { -8,  22,  44,  -6,   7},
+    { 40,   9,  26,   3,   9},
+    { -7,  20, 101,  -7,   4},
+    {  3,  -8,  42,  26,   0},
+    {-15,  33,  68,   2,  23},
+    { -2,  55,  46,  -2,  15},
+    {  3,  -1,  21,  16,  41}
+};
+
+const int8_t ff_silk_ltp_filter2_taps[32][5] = {
+    { -6,  27,  61,  39,   5},
+    {-11,  42,  88,   4,   1},
+    { -2,  60,  65,   6,  -4},
+    { -1,  -5,  73,  56,   1},
+    { -9,  19,  94,  29,  -9},
+    {  0,  12,  99,   6,   4},
+    {  8, -19, 102,  46, -13},
+    {  3,   2,  13,   3,   2},
+    {  9, -21,  84,  72, -18},
+    {-11,  46, 104, -22,   8},
+    { 18,  38,  48,  23,   0},
+    {-16,  70,  83, -21,  11},
+    {  5, -11, 117,  22,  -8},
+    { -6,  23, 117, -12,   3},
+    {  3,  -8,  95,  28,   4},
+    {-10,  15,  77,  60, -15},
+    { -1,   4, 124,   2,  -4},
+    {  3,  38,  84,  24, -25},
+    {  2,  13,  42,  13,  31},
+    { 21,  -4,  56,  46,  -1},
+    { -1,  35,  79, -13,  19},
+    { -7,  65,  88,  -9, -14},
+    { 20,   4,  81,  49, -29},
+    { 20,   0,  75,   3, -17},
+    {  5,  -9,  44,  92,  -8},
+    {  1,  -3,  22,  69,  31},
+    { -6,  95,  41, -12,   5},
+    { 39,  67,  16,  -4,   1},
+    {  0,  -6, 120,  55, -36},
+    {-13,  44, 122,   4, -24},
+    { 81,   5,  11,   3,   7},
+    {  2,   0,   9,  10,  88}
+};
+
+const uint16_t ff_silk_ltp_scale_factor[] = {15565, 12288, 8192};
+
+const uint8_t ff_silk_shell_blocks[3][2] = {
+    { 5, 10}, // NB
+    { 8, 15}, // MB
+    {10, 20}  // WB
+};
+
+const uint8_t ff_silk_quant_offset[2][2] = { /* (0.23) */
+    {25, 60}, // Inactive or Unvoiced
+    { 8, 25}  // Voiced
+};
+
+const int ff_silk_stereo_interp_len[3] = {
+    64, 96, 128
+};
+
+const uint16_t ff_celt_model_tapset[] = { 4, 2, 3, 4 };
+
+const uint16_t ff_celt_model_spread[] = { 32, 7, 9, 30, 32 };
+
+const uint16_t ff_celt_model_alloc_trim[] = {
+    128,   2,   4,   9,  19,  41,  87, 109, 119, 124, 126, 128
+};
+
+const uint16_t ff_celt_model_energy_small[] = { 4, 2, 3, 4 };
+
+const uint8_t ff_celt_freq_bands[] = { /* in steps of 200Hz */
+    0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
+};
+
+const uint8_t ff_celt_freq_range[] = {
+    1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  4,  4,  4,  6,  6,  8, 12, 18, 22
+};
+
+const uint8_t ff_celt_log_freq_range[] = {
+    0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8,  8, 16, 16, 16, 21, 21, 24, 29, 34, 36
+};
+
+/* Positive - increased freqeuency resolution (only possible on transients)
+ * Negative - increased time resolution */
+const int8_t ff_celt_tf_select[4][2][2][2] = {
+    /*          OFF                        ON                Transient frame */
+    /*     OFF        ON             OFF        ON           TF select flag  */
+    /*   OFF  ON    OFF  ON        OFF  ON    OFF  ON        TF change flag  */
+    { { { 0, -1 }, { 0, -1 } }, { { 0, -1 }, { 0, -1 } } }, /* 120 */
+    { { { 0, -1 }, { 0, -2 } }, { { 1,  0 }, { 1, -1 } } }, /* 240 */
+    { { { 0, -2 }, { 0, -3 } }, { { 2,  0 }, { 1, -1 } } }, /* 480 */
+    { { { 0, -2 }, { 0, -3 } }, { { 3,  0 }, { 1, -1 } } }  /* 960 */
+};
+
+const float ff_celt_mean_energy[] = {
+    6.437500f, 6.250000f, 5.750000f, 5.312500f, 5.062500f,
+    4.812500f, 4.500000f, 4.375000f, 4.875000f, 4.687500f,
+    4.562500f, 4.437500f, 4.875000f, 4.625000f, 4.312500f,
+    4.500000f, 4.375000f, 4.625000f, 4.750000f, 4.437500f,
+    3.750000f, 3.750000f, 3.750000f, 3.750000f, 3.750000f
+};
+
+const float ff_celt_alpha_coef[] = {
+    29440.0f/32768.0f,    26112.0f/32768.0f,    21248.0f/32768.0f,    16384.0f/32768.0f
+};
+
+const float ff_celt_beta_coef[] = {
+    1.0f - (30147.0f/32768.0f), 1.0f - (22282.0f/32768.0f), 1.0f - (12124.0f/32768.0f), 1.0f - (6554.0f/32768.0f),
+};
+
+const uint8_t ff_celt_coarse_energy_dist[4][2][42] = {
+    {
+        {       // 120-sample inter
+             72, 127,  65, 129,  66, 128,  65, 128,  64, 128,  62, 128,  64, 128,
+             64, 128,  92,  78,  92,  79,  92,  78,  90,  79, 116,  41, 115,  40,
+            114,  40, 132,  26, 132,  26, 145,  17, 161,  12, 176,  10, 177,  11
+        }, {    // 120-sample intra
+             24, 179,  48, 138,  54, 135,  54, 132,  53, 134,  56, 133,  55, 132,
+             55, 132,  61, 114,  70,  96,  74,  88,  75,  88,  87,  74,  89,  66,
+             91,  67, 100,  59, 108,  50, 120,  40, 122,  37,  97,  43,  78,  50
+        }
+    }, {
+        {       // 240-sample inter
+             83,  78,  84,  81,  88,  75,  86,  74,  87,  71,  90,  73,  93,  74,
+             93,  74, 109,  40, 114,  36, 117,  34, 117,  34, 143,  17, 145,  18,
+            146,  19, 162,  12, 165,  10, 178,   7, 189,   6, 190,   8, 177,   9
+        }, {    // 240-sample intra
+             23, 178,  54, 115,  63, 102,  66,  98,  69,  99,  74,  89,  71,  91,
+             73,  91,  78,  89,  86,  80,  92,  66,  93,  64, 102,  59, 103,  60,
+            104,  60, 117,  52, 123,  44, 138,  35, 133,  31,  97,  38,  77,  45
+        }
+    }, {
+        {       // 480-sample inter
+             61,  90,  93,  60, 105,  42, 107,  41, 110,  45, 116,  38, 113,  38,
+            112,  38, 124,  26, 132,  27, 136,  19, 140,  20, 155,  14, 159,  16,
+            158,  18, 170,  13, 177,  10, 187,   8, 192,   6, 175,   9, 159,  10
+        }, {    // 480-sample intra
+             21, 178,  59, 110,  71,  86,  75,  85,  84,  83,  91,  66,  88,  73,
+             87,  72,  92,  75,  98,  72, 105,  58, 107,  54, 115,  52, 114,  55,
+            112,  56, 129,  51, 132,  40, 150,  33, 140,  29,  98,  35,  77,  42
+        }
+    }, {
+        {       // 960-sample inter
+             42, 121,  96,  66, 108,  43, 111,  40, 117,  44, 123,  32, 120,  36,
+            119,  33, 127,  33, 134,  34, 139,  21, 147,  23, 152,  20, 158,  25,
+            154,  26, 166,  21, 173,  16, 184,  13, 184,  10, 150,  13, 139,  15
+        }, {    // 960-sample intra
+             22, 178,  63, 114,  74,  82,  84,  83,  92,  82, 103,  62,  96,  72,
+             96,  67, 101,  73, 107,  72, 113,  55, 118,  52, 125,  52, 118,  52,
+            117,  55, 135,  49, 137,  39, 157,  32, 145,  29,  97,  33,  77,  40
+        }
+    }
+};
+
+const uint8_t ff_celt_static_alloc[11][21] = {  /* 1/32 bit/sample */
+    {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 },
+    {  90,  80,  75,  69,  63,  56,  49,  40,  34,  29,  20,  18,  10,   0,   0,   0,   0,   0,   0,   0,   0 },
+    { 110, 100,  90,  84,  78,  71,  65,  58,  51,  45,  39,  32,  26,  20,  12,   0,   0,   0,   0,   0,   0 },
+    { 118, 110, 103,  93,  86,  80,  75,  70,  65,  59,  53,  47,  40,  31,  23,  15,   4,   0,   0,   0,   0 },
+    { 126, 119, 112, 104,  95,  89,  83,  78,  72,  66,  60,  54,  47,  39,  32,  25,  17,  12,   1,   0,   0 },
+    { 134, 127, 120, 114, 103,  97,  91,  85,  78,  72,  66,  60,  54,  47,  41,  35,  29,  23,  16,  10,   1 },
+    { 144, 137, 130, 124, 113, 107, 101,  95,  88,  82,  76,  70,  64,  57,  51,  45,  39,  33,  26,  15,   1 },
+    { 152, 145, 138, 132, 123, 117, 111, 105,  98,  92,  86,  80,  74,  67,  61,  55,  49,  43,  36,  20,   1 },
+    { 162, 155, 148, 142, 133, 127, 121, 115, 108, 102,  96,  90,  84,  77,  71,  65,  59,  53,  46,  30,   1 },
+    { 172, 165, 158, 152, 143, 137, 131, 125, 118, 112, 106, 100,  94,  87,  81,  75,  69,  63,  56,  45,  20 },
+    { 200, 200, 200, 200, 200, 200, 200, 200, 198, 193, 188, 183, 178, 173, 168, 163, 158, 153, 148, 129, 104 }
+};
+
+const uint8_t ff_celt_static_caps[4][2][21] = {
+    {       // 120-sample
+        {224, 224, 224, 224, 224, 224, 224, 224, 160, 160,
+         160, 160, 185, 185, 185, 178, 178, 168, 134,  61,  37},
+        {224, 224, 224, 224, 224, 224, 224, 224, 240, 240,
+         240, 240, 207, 207, 207, 198, 198, 183, 144,  66,  40},
+    }, {    // 240-sample
+        {160, 160, 160, 160, 160, 160, 160, 160, 185, 185,
+         185, 185, 193, 193, 193, 183, 183, 172, 138,  64,  38},
+        {240, 240, 240, 240, 240, 240, 240, 240, 207, 207,
+         207, 207, 204, 204, 204, 193, 193, 180, 143,  66,  40},
+    }, {    // 480-sample
+        {185, 185, 185, 185, 185, 185, 185, 185, 193, 193,
+         193, 193, 193, 193, 193, 183, 183, 172, 138,  65,  39},
+        {207, 207, 207, 207, 207, 207, 207, 207, 204, 204,
+         204, 204, 201, 201, 201, 188, 188, 176, 141,  66,  40},
+    }, {    // 960-sample
+        {193, 193, 193, 193, 193, 193, 193, 193, 193, 193,
+         193, 193, 194, 194, 194, 184, 184, 173, 139,  65,  39},
+        {204, 204, 204, 204, 204, 204, 204, 204, 201, 201,
+         201, 201, 198, 198, 198, 187, 187, 175, 140,  66,  40}
+    }
+};
+
+const uint8_t ff_celt_cache_bits[392] = {
+    40, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 40, 15, 23, 28,
+    31, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50,
+    51, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 63, 63, 65,
+    66, 67, 68, 69, 70, 71, 71, 40, 20, 33, 41, 48, 53, 57, 61,
+    64, 66, 69, 71, 73, 75, 76, 78, 80, 82, 85, 87, 89, 91, 92,
+    94, 96, 98, 101, 103, 105, 107, 108, 110, 112, 114, 117, 119, 121, 123,
+    124, 126, 128, 40, 23, 39, 51, 60, 67, 73, 79, 83, 87, 91, 94,
+    97, 100, 102, 105, 107, 111, 115, 118, 121, 124, 126, 129, 131, 135, 139,
+    142, 145, 148, 150, 153, 155, 159, 163, 166, 169, 172, 174, 177, 179, 35,
+    28, 49, 65, 78, 89, 99, 107, 114, 120, 126, 132, 136, 141, 145, 149,
+    153, 159, 165, 171, 176, 180, 185, 189, 192, 199, 205, 211, 216, 220, 225,
+    229, 232, 239, 245, 251, 21, 33, 58, 79, 97, 112, 125, 137, 148, 157,
+    166, 174, 182, 189, 195, 201, 207, 217, 227, 235, 243, 251, 17, 35, 63,
+    86, 106, 123, 139, 152, 165, 177, 187, 197, 206, 214, 222, 230, 237, 250,
+    25, 31, 55, 75, 91, 105, 117, 128, 138, 146, 154, 161, 168, 174, 180,
+    185, 190, 200, 208, 215, 222, 229, 235, 240, 245, 255, 16, 36, 65, 89,
+    110, 128, 144, 159, 173, 185, 196, 207, 217, 226, 234, 242, 250, 11, 41,
+    74, 103, 128, 151, 172, 191, 209, 225, 241, 255, 9, 43, 79, 110, 138,
+    163, 186, 207, 227, 246, 12, 39, 71, 99, 123, 144, 164, 182, 198, 214,
+    228, 241, 253, 9, 44, 81, 113, 142, 168, 192, 214, 235, 255, 7, 49,
+    90, 127, 160, 191, 220, 247, 6, 51, 95, 134, 170, 203, 234, 7, 47,
+    87, 123, 155, 184, 212, 237, 6, 52, 97, 137, 174, 208, 240, 5, 57,
+    106, 151, 192, 231, 5, 59, 111, 158, 202, 243, 5, 55, 103, 147, 187,
+    224, 5, 60, 113, 161, 206, 248, 4, 65, 122, 175, 224, 4, 67, 127,
+    182, 234
+};
+
+const int16_t ff_celt_cache_index[105] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 41, 41, 41,
+    82, 82, 123, 164, 200, 222, 0, 0, 0, 0, 0, 0, 0, 0, 41,
+    41, 41, 41, 123, 123, 123, 164, 164, 240, 266, 283, 295, 41, 41, 41,
+    41, 41, 41, 41, 41, 123, 123, 123, 123, 240, 240, 240, 266, 266, 305,
+    318, 328, 336, 123, 123, 123, 123, 123, 123, 123, 123, 240, 240, 240, 240,
+    305, 305, 305, 318, 318, 343, 351, 358, 364, 240, 240, 240, 240, 240, 240,
+    240, 240, 305, 305, 305, 305, 343, 343, 343, 351, 351, 370, 376, 382, 387,
+};
+
+const uint8_t ff_celt_log2_frac[] = {
+    0, 8, 13, 16, 19, 21, 23, 24, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 34, 35, 36, 36, 37, 37
+};
+
+const uint8_t ff_celt_bit_interleave[] = {
+    0, 1, 1, 1, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3
+};
+
+const uint8_t ff_celt_bit_deinterleave[] = {
+    0x00, 0x03, 0x0C, 0x0F, 0x30, 0x33, 0x3C, 0x3F,
+    0xC0, 0xC3, 0xCC, 0xCF, 0xF0, 0xF3, 0xFC, 0xFF
+};
+
+const uint8_t ff_celt_hadamard_order[] = {
+    1,   0,
+    3,   0,  2,  1,
+    7,   0,  4,  3,  6,  1,  5,  2,
+    15,  0,  8,  7, 12,  3, 11,  4, 14,  1,  9,  6, 13,  2, 10,  5,
+    0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+};
+
+const uint16_t ff_celt_qn_exp2[] = {
+    16384, 17866, 19483, 21247, 23170, 25267, 27554, 30048
+};
+
+const uint32_t ff_celt_pvq_u[1272] = {
+    /* N = 0, K = 0...176 */
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* N = 1, K = 1...176 */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* N = 2, K = 2...176 */
+    3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+    43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+    81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+    115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 141, 143,
+    145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173,
+    175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203,
+    205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233,
+    235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259, 261, 263,
+    265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293,
+    295, 297, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323,
+    325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351,
+    /* N = 3, K = 3...176 */
+    13, 25, 41, 61, 85, 113, 145, 181, 221, 265, 313, 365, 421, 481, 545, 613,
+    685, 761, 841, 925, 1013, 1105, 1201, 1301, 1405, 1513, 1625, 1741, 1861,
+    1985, 2113, 2245, 2381, 2521, 2665, 2813, 2965, 3121, 3281, 3445, 3613, 3785,
+    3961, 4141, 4325, 4513, 4705, 4901, 5101, 5305, 5513, 5725, 5941, 6161, 6385,
+    6613, 6845, 7081, 7321, 7565, 7813, 8065, 8321, 8581, 8845, 9113, 9385, 9661,
+    9941, 10225, 10513, 10805, 11101, 11401, 11705, 12013, 12325, 12641, 12961,
+    13285, 13613, 13945, 14281, 14621, 14965, 15313, 15665, 16021, 16381, 16745,
+    17113, 17485, 17861, 18241, 18625, 19013, 19405, 19801, 20201, 20605, 21013,
+    21425, 21841, 22261, 22685, 23113, 23545, 23981, 24421, 24865, 25313, 25765,
+    26221, 26681, 27145, 27613, 28085, 28561, 29041, 29525, 30013, 30505, 31001,
+    31501, 32005, 32513, 33025, 33541, 34061, 34585, 35113, 35645, 36181, 36721,
+    37265, 37813, 38365, 38921, 39481, 40045, 40613, 41185, 41761, 42341, 42925,
+    43513, 44105, 44701, 45301, 45905, 46513, 47125, 47741, 48361, 48985, 49613,
+    50245, 50881, 51521, 52165, 52813, 53465, 54121, 54781, 55445, 56113, 56785,
+    57461, 58141, 58825, 59513, 60205, 60901, 61601,
+    /* N = 4, K = 4...176 */
+    63, 129, 231, 377, 575, 833, 1159, 1561, 2047, 2625, 3303, 4089, 4991, 6017,
+    7175, 8473, 9919, 11521, 13287, 15225, 17343, 19649, 22151, 24857, 27775,
+    30913, 34279, 37881, 41727, 45825, 50183, 54809, 59711, 64897, 70375, 76153,
+    82239, 88641, 95367, 102425, 109823, 117569, 125671, 134137, 142975, 152193,
+    161799, 171801, 182207, 193025, 204263, 215929, 228031, 240577, 253575,
+    267033, 280959, 295361, 310247, 325625, 341503, 357889, 374791, 392217,
+    410175, 428673, 447719, 467321, 487487, 508225, 529543, 551449, 573951,
+    597057, 620775, 645113, 670079, 695681, 721927, 748825, 776383, 804609,
+    833511, 863097, 893375, 924353, 956039, 988441, 1021567, 1055425, 1090023,
+    1125369, 1161471, 1198337, 1235975, 1274393, 1313599, 1353601, 1394407,
+    1436025, 1478463, 1521729, 1565831, 1610777, 1656575, 1703233, 1750759,
+    1799161, 1848447, 1898625, 1949703, 2001689, 2054591, 2108417, 2163175,
+    2218873, 2275519, 2333121, 2391687, 2451225, 2511743, 2573249, 2635751,
+    2699257, 2763775, 2829313, 2895879, 2963481, 3032127, 3101825, 3172583,
+    3244409, 3317311, 3391297, 3466375, 3542553, 3619839, 3698241, 3777767,
+    3858425, 3940223, 4023169, 4107271, 4192537, 4278975, 4366593, 4455399,
+    4545401, 4636607, 4729025, 4822663, 4917529, 5013631, 5110977, 5209575,
+    5309433, 5410559, 5512961, 5616647, 5721625, 5827903, 5935489, 6044391,
+    6154617, 6266175, 6379073, 6493319, 6608921, 6725887, 6844225, 6963943,
+    7085049, 7207551,
+    /* N = 5, K = 5...176 */
+    321, 681, 1289, 2241, 3649, 5641, 8361, 11969, 16641, 22569, 29961, 39041,
+    50049, 63241, 78889, 97281, 118721, 143529, 172041, 204609, 241601, 283401,
+    330409, 383041, 441729, 506921, 579081, 658689, 746241, 842249, 947241,
+    1061761, 1186369, 1321641, 1468169, 1626561, 1797441, 1981449, 2179241,
+    2391489, 2618881, 2862121, 3121929, 3399041, 3694209, 4008201, 4341801,
+    4695809, 5071041, 5468329, 5888521, 6332481, 6801089, 7295241, 7815849,
+    8363841, 8940161, 9545769, 10181641, 10848769, 11548161, 12280841, 13047849,
+    13850241, 14689089, 15565481, 16480521, 17435329, 18431041, 19468809,
+    20549801, 21675201, 22846209, 24064041, 25329929, 26645121, 28010881,
+    29428489, 30899241, 32424449, 34005441, 35643561, 37340169, 39096641,
+    40914369, 42794761, 44739241, 46749249, 48826241, 50971689, 53187081,
+    55473921, 57833729, 60268041, 62778409, 65366401, 68033601, 70781609,
+    73612041, 76526529, 79526721, 82614281, 85790889, 89058241, 92418049,
+    95872041, 99421961, 103069569, 106816641, 110664969, 114616361, 118672641,
+    122835649, 127107241, 131489289, 135983681, 140592321, 145317129, 150160041,
+    155123009, 160208001, 165417001, 170752009, 176215041, 181808129, 187533321,
+    193392681, 199388289, 205522241, 211796649, 218213641, 224775361, 231483969,
+    238341641, 245350569, 252512961, 259831041, 267307049, 274943241, 282741889,
+    290705281, 298835721, 307135529, 315607041, 324252609, 333074601, 342075401,
+    351257409, 360623041, 370174729, 379914921, 389846081, 399970689, 410291241,
+    420810249, 431530241, 442453761, 453583369, 464921641, 476471169, 488234561,
+    500214441, 512413449, 524834241, 537479489, 550351881, 563454121, 576788929,
+    590359041, 604167209, 618216201, 632508801,
+    /* N = 6, K = 6...96 (technically V(109,5) fits in 32 bits, but that can't be
+     achieved by splitting an Opus band) */
+    1683, 3653, 7183, 13073, 22363, 36365, 56695, 85305, 124515, 177045, 246047,
+    335137, 448427, 590557, 766727, 982729, 1244979, 1560549, 1937199, 2383409,
+    2908411, 3522221, 4235671, 5060441, 6009091, 7095093, 8332863, 9737793,
+    11326283, 13115773, 15124775, 17372905, 19880915, 22670725, 25765455,
+    29189457, 32968347, 37129037, 41699767, 46710137, 52191139, 58175189,
+    64696159, 71789409, 79491819, 87841821, 96879431, 106646281, 117185651,
+    128542501, 140763503, 153897073, 167993403, 183104493, 199284183, 216588185,
+    235074115, 254801525, 275831935, 298228865, 322057867, 347386557, 374284647,
+    402823977, 433078547, 465124549, 499040399, 534906769, 572806619, 612825229,
+    655050231, 699571641, 746481891, 795875861, 847850911, 902506913, 959946283,
+    1020274013, 1083597703, 1150027593, 1219676595, 1292660325, 1369097135,
+    1449108145, 1532817275, 1620351277, 1711839767, 1807415257, 1907213187,
+    2011371957, 2120032959,
+    /* N = 7, K = 7...54 (technically V(60,6) fits in 32 bits, but that can't be
+     achieved by splitting an Opus band) */
+    8989, 19825, 40081, 75517, 134245, 227305, 369305, 579125, 880685, 1303777,
+    1884961, 2668525, 3707509, 5064793, 6814249, 9041957, 11847485, 15345233,
+    19665841, 24957661, 31388293, 39146185, 48442297, 59511829, 72616013,
+    88043969, 106114625, 127178701, 151620757, 179861305, 212358985, 249612805,
+    292164445, 340600625, 395555537, 457713341, 527810725, 606639529, 695049433,
+    793950709, 904317037, 1027188385, 1163673953, 1314955181, 1482288821,
+    1667010073, 1870535785, 2094367717,
+    /* N = 8, K = 8...37 (technically V(40,7) fits in 32 bits, but that can't be
+     achieved by splitting an Opus band) */
+    48639, 108545, 224143, 433905, 795455, 1392065, 2340495, 3800305, 5984767,
+    9173505, 13726991, 20103025, 28875327, 40754369, 56610575, 77500017,
+    104692735, 139703809, 184327311, 240673265, 311207743, 398796225, 506750351,
+    638878193, 799538175, 993696769, 1226990095, 1505789553, 1837271615,
+    2229491905,
+    /* N = 9, K = 9...28 (technically V(29,8) fits in 32 bits, but that can't be
+     achieved by splitting an Opus band) */
+    265729, 598417, 1256465, 2485825, 4673345, 8405905, 14546705, 24331777,
+    39490049, 62390545, 96220561, 145198913, 214828609, 312193553, 446304145,
+    628496897, 872893441, 1196924561, 1621925137, 2173806145,
+    /* N = 10, K = 10...24 */
+    1462563, 3317445, 7059735, 14218905, 27298155, 50250765, 89129247, 152951073,
+    254831667, 413442773, 654862247, 1014889769, 1541911931, 2300409629,
+    3375210671,
+    /* N = 11, K = 11...19 (technically V(20,10) fits in 32 bits, but that can't be
+     achieved by splitting an Opus band) */
+    8097453, 18474633, 39753273, 81270333, 158819253, 298199265, 540279585,
+    948062325, 1616336765,
+    /* N = 12, K = 12...18 */
+    45046719, 103274625, 224298231, 464387817, 921406335, 1759885185,
+    3248227095,
+    /* N = 13, K = 13...16 */
+    251595969, 579168825, 1267854873, 2653649025,
+    /* N = 14, K = 14 */
+    1409933619
+};
+
+const float ff_celt_postfilter_taps[3][3] = {
+    { 0.3066406250f, 0.2170410156f, 0.1296386719f },
+    { 0.4638671875f, 0.2680664062f, 0.0           },
+    { 0.7998046875f, 0.1000976562f, 0.0           }
+};
+
+DECLARE_ALIGNED(32, static const float, ff_celt_window_padded)[136] = {
+    0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+    0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+    6.7286966e-05f, 0.00060551348f, 0.0016815970f, 0.0032947962f, 0.0054439943f,
+    0.0081276923f, 0.011344001f, 0.015090633f, 0.019364886f, 0.024163635f,
+    0.029483315f, 0.035319905f, 0.041668911f, 0.048525347f, 0.055883718f,
+    0.063737999f, 0.072081616f, 0.080907428f, 0.090207705f, 0.099974111f,
+    0.11019769f, 0.12086883f, 0.13197729f, 0.14351214f, 0.15546177f,
+    0.16781389f, 0.18055550f, 0.19367290f, 0.20715171f, 0.22097682f,
+    0.23513243f, 0.24960208f, 0.26436860f, 0.27941419f, 0.29472040f,
+    0.31026818f, 0.32603788f, 0.34200931f, 0.35816177f, 0.37447407f,
+    0.39092462f, 0.40749142f, 0.42415215f, 0.44088423f, 0.45766484f,
+    0.47447104f, 0.49127978f, 0.50806798f, 0.52481261f, 0.54149077f,
+    0.55807973f, 0.57455701f, 0.59090049f, 0.60708841f, 0.62309951f,
+    0.63891306f, 0.65450896f, 0.66986776f, 0.68497077f, 0.69980010f,
+    0.71433873f, 0.72857055f, 0.74248043f, 0.75605424f, 0.76927895f,
+    0.78214257f, 0.79463430f, 0.80674445f, 0.81846456f, 0.82978733f,
+    0.84070669f, 0.85121779f, 0.86131698f, 0.87100183f, 0.88027111f,
+    0.88912479f, 0.89756398f, 0.90559094f, 0.91320904f, 0.92042270f,
+    0.92723738f, 0.93365955f, 0.93969656f, 0.94535671f, 0.95064907f,
+    0.95558353f, 0.96017067f, 0.96442171f, 0.96834849f, 0.97196334f,
+    0.97527906f, 0.97830883f, 0.98106616f, 0.98356480f, 0.98581869f,
+    0.98784191f, 0.98964856f, 0.99125274f, 0.99266849f, 0.99390969f,
+    0.99499004f, 0.99592297f, 0.99672162f, 0.99739874f, 0.99796667f,
+    0.99843728f, 0.99882195f, 0.99913147f, 0.99937606f, 0.99956527f,
+    0.99970802f, 0.99981248f, 0.99988613f, 0.99993565f, 0.99996697f,
+    0.99998518f, 0.99999457f, 0.99999859f, 0.99999982f, 1.00000000f,
+    1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+    1.00000000f, 1.00000000f, 1.00000000f,
+};
+
+const float *ff_celt_window = &ff_celt_window_padded[8];
+
+/* square of the window, used for the postfilter */
+const float ff_celt_window2[120] = {
+    4.5275357e-09f, 3.66647e-07f, 2.82777e-06f, 1.08557e-05f, 2.96371e-05f, 6.60594e-05f,
+    0.000128686f, 0.000227727f, 0.000374999f, 0.000583881f, 0.000869266f, 0.0012475f,
+    0.0017363f, 0.00235471f, 0.00312299f, 0.00406253f, 0.00519576f, 0.00654601f,
+    0.00813743f, 0.00999482f, 0.0121435f, 0.0146093f, 0.017418f, 0.0205957f, 0.0241684f,
+    0.0281615f, 0.0326003f, 0.0375092f, 0.0429118f, 0.0488308f, 0.0552873f, 0.0623012f,
+    0.0698908f, 0.0780723f, 0.0868601f, 0.0962664f, 0.106301f, 0.11697f, 0.12828f,
+    0.140231f, 0.152822f, 0.166049f, 0.179905f, 0.194379f, 0.209457f, 0.225123f, 0.241356f,
+    0.258133f, 0.275428f, 0.293212f, 0.311453f, 0.330116f, 0.349163f, 0.368556f, 0.388253f,
+    0.40821f, 0.428382f, 0.448723f, 0.469185f, 0.48972f, 0.51028f, 0.530815f, 0.551277f,
+    0.571618f, 0.59179f, 0.611747f, 0.631444f, 0.650837f, 0.669884f, 0.688547f, 0.706788f,
+    0.724572f, 0.741867f, 0.758644f, 0.774877f, 0.790543f, 0.805621f, 0.820095f, 0.833951f,
+    0.847178f, 0.859769f, 0.87172f, 0.88303f, 0.893699f, 0.903734f, 0.91314f, 0.921928f,
+    0.930109f, 0.937699f, 0.944713f, 0.951169f, 0.957088f, 0.962491f, 0.9674f, 0.971838f,
+    0.975832f, 0.979404f, 0.982582f, 0.985391f, 0.987857f, 0.990005f, 0.991863f, 0.993454f,
+    0.994804f, 0.995937f, 0.996877f, 0.997645f, 0.998264f, 0.998753f, 0.999131f, 0.999416f,
+    0.999625f, 0.999772f, 0.999871f, 0.999934f, 0.99997f, 0.999989f, 0.999997f, 0.99999964f, 1.0f,
+};
+
+const uint32_t * const ff_celt_pvq_u_row[15] = {
+    ff_celt_pvq_u +    0, ff_celt_pvq_u +  176, ff_celt_pvq_u +  351,
+    ff_celt_pvq_u +  525, ff_celt_pvq_u +  698, ff_celt_pvq_u +  870,
+    ff_celt_pvq_u + 1041, ff_celt_pvq_u + 1131, ff_celt_pvq_u + 1178,
+    ff_celt_pvq_u + 1207, ff_celt_pvq_u + 1226, ff_celt_pvq_u + 1240,
+    ff_celt_pvq_u + 1248, ff_celt_pvq_u + 1254, ff_celt_pvq_u + 1257
+};
diff --git a/libavcodec/opustab.h b/libavcodec/opustab.h
new file mode 100644
index 0000000..bce5a42
--- /dev/null
+++ b/libavcodec/opustab.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2012 Andrew D'Addesio
+ * Copyright (c) 2013-2014 Mozilla Corporation
+ * Copyright (c) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_OPUSTAB_H
+#define AVCODEC_OPUSTAB_H
+
+#include "libavutil/mem.h"
+
+#include <stdint.h>
+
+extern const uint8_t  ff_celt_band_end[];
+
+extern const uint8_t  ff_opus_default_coupled_streams[];
+
+extern const uint16_t ff_silk_model_stereo_s1[];
+extern const uint16_t ff_silk_model_stereo_s2[];
+extern const uint16_t ff_silk_model_stereo_s3[];
+extern const uint16_t ff_silk_model_mid_only[];
+
+extern const uint16_t ff_silk_model_frame_type_inactive[];
+extern const uint16_t ff_silk_model_frame_type_active[];
+
+extern const uint16_t ff_silk_model_gain_highbits[3][9];
+extern const uint16_t ff_silk_model_gain_lowbits[];
+extern const uint16_t ff_silk_model_gain_delta[];
+
+extern const uint16_t ff_silk_model_lsf_s1[2][2][33];
+extern const uint16_t ff_silk_model_lsf_s2[32][10];
+extern const uint16_t ff_silk_model_lsf_s2_ext[];
+extern const uint16_t ff_silk_model_lsf_interpolation_offset[];
+
+extern const uint16_t ff_silk_model_pitch_highbits[];
+extern const uint16_t ff_silk_model_pitch_lowbits_nb[];
+extern const uint16_t ff_silk_model_pitch_lowbits_mb[];
+extern const uint16_t ff_silk_model_pitch_lowbits_wb[];
+extern const uint16_t ff_silk_model_pitch_delta[];
+extern const uint16_t ff_silk_model_pitch_contour_nb10ms[];
+extern const uint16_t ff_silk_model_pitch_contour_nb20ms[];
+extern const uint16_t ff_silk_model_pitch_contour_mbwb10ms[];
+extern const uint16_t ff_silk_model_pitch_contour_mbwb20ms[];
+
+extern const uint16_t ff_silk_model_ltp_filter[];
+extern const uint16_t ff_silk_model_ltp_filter0_sel[];
+extern const uint16_t ff_silk_model_ltp_filter1_sel[];
+extern const uint16_t ff_silk_model_ltp_filter2_sel[];
+extern const uint16_t ff_silk_model_ltp_scale_index[];
+
+extern const uint16_t ff_silk_model_lcg_seed[];
+
+extern const uint16_t ff_silk_model_exc_rate[2][10];
+
+extern const uint16_t ff_silk_model_pulse_count[11][19];
+extern const uint16_t ff_silk_model_pulse_location[4][168];
+
+extern const uint16_t ff_silk_model_excitation_lsb[];
+extern const uint16_t ff_silk_model_excitation_sign[3][2][7][3];
+
+extern const int16_t  ff_silk_stereo_weights[];
+
+extern const uint8_t  ff_silk_lsf_s2_model_sel_nbmb[32][10];
+extern const uint8_t  ff_silk_lsf_s2_model_sel_wb[32][16];
+
+extern const uint8_t  ff_silk_lsf_pred_weights_nbmb[2][9];
+extern const uint8_t  ff_silk_lsf_pred_weights_wb[2][15];
+
+extern const uint8_t  ff_silk_lsf_weight_sel_nbmb[32][9];
+extern const uint8_t  ff_silk_lsf_weight_sel_wb[32][15];
+
+extern const uint8_t  ff_silk_lsf_codebook_nbmb[32][10];
+extern const uint8_t  ff_silk_lsf_codebook_wb[32][16];
+
+extern const uint16_t ff_silk_lsf_min_spacing_nbmb[];
+extern const uint16_t ff_silk_lsf_min_spacing_wb[];
+
+extern const uint8_t  ff_silk_lsf_ordering_nbmb[];
+extern const uint8_t  ff_silk_lsf_ordering_wb[];
+
+extern const int16_t  ff_silk_cosine[];
+
+extern const uint16_t ff_silk_pitch_scale[];
+extern const uint16_t ff_silk_pitch_min_lag[];
+extern const uint16_t ff_silk_pitch_max_lag[];
+
+extern const int8_t   ff_silk_pitch_offset_nb10ms[3][2];
+extern const int8_t   ff_silk_pitch_offset_nb20ms[11][4];
+extern const int8_t   ff_silk_pitch_offset_mbwb10ms[12][2];
+extern const int8_t   ff_silk_pitch_offset_mbwb20ms[34][4];
+
+extern const int8_t   ff_silk_ltp_filter0_taps[8][5];
+extern const int8_t   ff_silk_ltp_filter1_taps[16][5];
+extern const int8_t   ff_silk_ltp_filter2_taps[32][5];
+
+extern const uint16_t ff_silk_ltp_scale_factor[];
+
+extern const uint8_t  ff_silk_shell_blocks[3][2];
+
+extern const uint8_t  ff_silk_quant_offset[2][2];
+
+extern const int      ff_silk_stereo_interp_len[3];
+
+extern const uint16_t ff_celt_model_tapset[];
+extern const uint16_t ff_celt_model_spread[];
+extern const uint16_t ff_celt_model_alloc_trim[];
+extern const uint16_t ff_celt_model_energy_small[];
+
+extern const uint8_t  ff_celt_freq_bands[];
+extern const uint8_t  ff_celt_freq_range[];
+extern const uint8_t  ff_celt_log_freq_range[];
+
+extern const int8_t   ff_celt_tf_select[4][2][2][2];
+
+extern const float    ff_celt_mean_energy[];
+
+extern const float    ff_celt_alpha_coef[];
+extern const float    ff_celt_beta_coef[];
+
+extern const uint8_t  ff_celt_coarse_energy_dist[4][2][42];
+
+extern const uint8_t  ff_celt_static_alloc[11][21];
+extern const uint8_t  ff_celt_static_caps[4][2][21];
+
+extern const uint8_t  ff_celt_cache_bits[392];
+extern const int16_t  ff_celt_cache_index[105];
+
+extern const uint8_t  ff_celt_log2_frac[];
+
+extern const uint8_t  ff_celt_bit_interleave[];
+extern const uint8_t  ff_celt_bit_deinterleave[];
+
+extern const uint8_t  ff_celt_hadamard_order[];
+
+extern const uint16_t ff_celt_qn_exp2[];
+extern const uint32_t ff_celt_pvq_u[1272];
+
+extern const float    ff_celt_postfilter_taps[3][3];
+
+extern const float    ff_celt_window2[120];
+extern const float   *ff_celt_window;
+
+extern const uint32_t * const ff_celt_pvq_u_row[15];
+
+#endif /* AVCODEC_OPUSTAB_H */
diff --git a/libavcodec/paf.h b/libavcodec/paf.h
new file mode 100644
index 0000000..ce8245f
--- /dev/null
+++ b/libavcodec/paf.h
@@ -0,0 +1,28 @@
+/*
+ * Packed Animation File decoder/demuxer common code
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PAF_H
+#define AVCODEC_PAF_H
+
+#define PAF_SOUND_SAMPLES     2205
+#define PAF_SOUND_FRAME_SIZE  ((256 + PAF_SOUND_SAMPLES) * 2)
+
+#endif /* AVCODEC_PAF_H */
diff --git a/libavcodec/pafaudio.c b/libavcodec/pafaudio.c
index c83e7f5..12f473a 100644
--- a/libavcodec/pafaudio.c
+++ b/libavcodec/pafaudio.c
@@ -2,20 +2,20 @@
  * Packed Animation File audio decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
-
-#define PAF_SOUND_SAMPLES     2205
-#define PAF_SOUND_FRAME_SIZE  ((256 + PAF_SOUND_SAMPLES) * 2)
+#include "paf.h"
 
 static av_cold int paf_audio_init(AVCodecContext *avctx)
 {
diff --git a/libavcodec/pafvideo.c b/libavcodec/pafvideo.c
index b77f47e..7c5861d 100644
--- a/libavcodec/pafvideo.c
+++ b/libavcodec/pafvideo.c
@@ -2,20 +2,20 @@
  * Packed Animation File video decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,15 +26,24 @@
 #include "copy_block.h"
 #include "internal.h"
 
+
 static const uint8_t block_sequences[16][8] = {
-    { 0, 0, 0, 0, 0, 0, 0, 0 }, { 2, 0, 0, 0, 0, 0, 0, 0 },
-    { 5, 7, 0, 0, 0, 0, 0, 0 }, { 5, 0, 0, 0, 0, 0, 0, 0 },
-    { 6, 0, 0, 0, 0, 0, 0, 0 }, { 5, 7, 5, 7, 0, 0, 0, 0 },
-    { 5, 7, 5, 0, 0, 0, 0, 0 }, { 5, 7, 6, 0, 0, 0, 0, 0 },
-    { 5, 5, 0, 0, 0, 0, 0, 0 }, { 3, 0, 0, 0, 0, 0, 0, 0 },
-    { 6, 6, 0, 0, 0, 0, 0, 0 }, { 2, 4, 0, 0, 0, 0, 0, 0 },
-    { 2, 4, 5, 7, 0, 0, 0, 0 }, { 2, 4, 5, 0, 0, 0, 0, 0 },
-    { 2, 4, 6, 0, 0, 0, 0, 0 }, { 2, 4, 5, 7, 5, 7, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 2, 0, 0, 0, 0, 0, 0, 0 },
+    { 5, 7, 0, 0, 0, 0, 0, 0 },
+    { 5, 0, 0, 0, 0, 0, 0, 0 },
+    { 6, 0, 0, 0, 0, 0, 0, 0 },
+    { 5, 7, 5, 7, 0, 0, 0, 0 },
+    { 5, 7, 5, 0, 0, 0, 0, 0 },
+    { 5, 7, 6, 0, 0, 0, 0, 0 },
+    { 5, 5, 0, 0, 0, 0, 0, 0 },
+    { 3, 0, 0, 0, 0, 0, 0, 0 },
+    { 6, 6, 0, 0, 0, 0, 0, 0 },
+    { 2, 4, 0, 0, 0, 0, 0, 0 },
+    { 2, 4, 5, 7, 0, 0, 0, 0 },
+    { 2, 4, 5, 0, 0, 0, 0, 0 },
+    { 2, 4, 6, 0, 0, 0, 0, 0 },
+    { 2, 4, 5, 7, 5, 7, 0, 0 },
 };
 
 typedef struct PAFVideoDecContext {
@@ -69,6 +78,7 @@ static av_cold int paf_video_init(AVCodecContext *avctx)
 {
     PAFVideoDecContext *c = avctx->priv_data;
     int i;
+    int ret;
 
     c->width  = avctx->width;
     c->height = avctx->height;
@@ -81,6 +91,9 @@ static av_cold int paf_video_init(AVCodecContext *avctx)
     }
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    ret = av_image_check_size2(avctx->width, FFALIGN(avctx->height, 256), avctx->max_pixels, avctx->pix_fmt, 0, avctx);
+    if (ret < 0)
+        return ret;
 
     c->pic = av_frame_alloc();
     if (!c->pic)
@@ -156,9 +169,11 @@ static int decode_0(PAFVideoDecContext *c, uint8_t *pkt, uint8_t code)
     i = bytestream2_get_byte(&c->gb);
     if (i) {
         if (code & 0x10) {
-            int pos = bytestream2_tell(&c->gb) & 3;
-            if (pos)
-                bytestream2_skip(&c->gb, 4 - pos);
+            int align;
+
+            align = bytestream2_tell(&c->gb) & 3;
+            if (align)
+                bytestream2_skip(&c->gb, 4 - align);
         }
         do {
             int page, val, x, y;
@@ -170,6 +185,8 @@ static int decode_0(PAFVideoDecContext *c, uint8_t *pkt, uint8_t code)
             dend   = c->frame[page] + c->frame_size;
             offset = (x & 0x7F) * 2;
             j      = bytestream2_get_le16(&c->gb) + offset;
+            if (bytestream2_get_bytes_left(&c->gb) < (j - offset) * 16)
+                return AVERROR_INVALIDDATA;
             do {
                 offset++;
                 if (dst + 3 * c->width + 4 > dend)
@@ -187,7 +204,8 @@ static int decode_0(PAFVideoDecContext *c, uint8_t *pkt, uint8_t code)
     do {
         set_src_position(c, &src, &send);
         if ((src + 3 * c->width + 4 > send) ||
-            (dst + 3 * c->width + 4 > dend))
+            (dst + 3 * c->width + 4 > dend) ||
+            bytestream2_get_bytes_left(&c->gb) < 4)
             return AVERROR_INVALIDDATA;
         copy_block4(dst, src, c->width, c->width, 4);
         i++;
@@ -256,12 +274,20 @@ static int paf_video_decode(AVCodecContext *avctx, void *data,
     uint8_t code, *dst, *end;
     int i, frame, ret;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
-        return ret;
+    if (pkt->size < 2)
+        return AVERROR_INVALIDDATA;
 
     bytestream2_init(&c->gb, pkt->data, pkt->size);
 
     code = bytestream2_get_byte(&c->gb);
+    if ((code & 0xF) > 4 || (code & 0xF) == 3) {
+        avpriv_request_sample(avctx, "unknown/invalid code");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
+        return ret;
+
     if (code & 0x20) {  // frame is keyframe
         for (i = 0; i < 4; i++)
             memset(c->frame[i], 0, c->frame_size);
@@ -356,8 +382,7 @@ static int paf_video_decode(AVCodecContext *avctx, void *data,
         }
         break;
     default:
-        avpriv_request_sample(avctx, "unknown/invalid code");
-        return AVERROR_INVALIDDATA;
+        av_assert0(0);
     }
 
     av_image_copy_plane(c->pic->data[0], c->pic->linesize[0],
diff --git a/libavcodec/pamenc.c b/libavcodec/pamenc.c
index 2b63af9..50c9fcb 100644
--- a/libavcodec/pamenc.c
+++ b/libavcodec/pamenc.c
@@ -2,54 +2,39 @@
  * PAM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/imgutils.h"
-
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 
 static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *pict, int *got_packet)
+                            const AVFrame *p, int *got_packet)
 {
     uint8_t *bytestream_start, *bytestream, *bytestream_end;
-    const AVFrame * const p = pict;
     int i, h, w, n, linesize, depth, maxval, ret;
     const char *tuple_type;
     uint8_t *ptr;
-    int size = av_image_get_buffer_size(avctx->pix_fmt,
-                                        avctx->width, avctx->height, 1);
-
-    if ((ret = ff_alloc_packet(pkt, size + 200)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
-        return ret;
-    }
-
-    bytestream_start =
-    bytestream       = pkt->data;
-    bytestream_end   = pkt->data + pkt->size;
 
     h = avctx->height;
     w = avctx->width;
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_MONOWHITE:
-        n          = (w + 7) >> 3;
+    case AV_PIX_FMT_MONOBLACK:
+        n          = w;
         depth      = 1;
         maxval     = 1;
         tuple_type = "BLACKANDWHITE";
@@ -60,21 +45,59 @@ static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         maxval     = 255;
         tuple_type = "GRAYSCALE";
         break;
+    case AV_PIX_FMT_GRAY16BE:
+        n          = w * 2;
+        depth      = 1;
+        maxval     = 0xFFFF;
+        tuple_type = "GRAYSCALE";
+        break;
+    case AV_PIX_FMT_GRAY8A:
+        n          = w * 2;
+        depth      = 2;
+        maxval     = 255;
+        tuple_type = "GRAYSCALE_ALPHA";
+        break;
+    case AV_PIX_FMT_YA16BE:
+        n          = w * 4;
+        depth      = 2;
+        maxval     = 0xFFFF;
+        tuple_type = "GRAYSCALE_ALPHA";
+        break;
     case AV_PIX_FMT_RGB24:
         n          = w * 3;
         depth      = 3;
         maxval     = 255;
         tuple_type = "RGB";
         break;
-    case AV_PIX_FMT_RGB32:
+    case AV_PIX_FMT_RGBA:
         n          = w * 4;
         depth      = 4;
         maxval     = 255;
         tuple_type = "RGB_ALPHA";
         break;
+    case AV_PIX_FMT_RGB48BE:
+        n          = w * 6;
+        depth      = 3;
+        maxval     = 0xFFFF;
+        tuple_type = "RGB";
+        break;
+    case AV_PIX_FMT_RGBA64BE:
+        n          = w * 8;
+        depth      = 4;
+        maxval     = 0xFFFF;
+        tuple_type = "RGB_ALPHA";
+        break;
     default:
         return -1;
     }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, n*h + 200, 0)) < 0)
+        return ret;
+
+    bytestream_start =
+    bytestream       = pkt->data;
+    bytestream_end   = pkt->data + pkt->size;
+
     snprintf(bytestream, bytestream_end - bytestream,
              "P7\nWIDTH %d\nHEIGHT %d\nDEPTH %d\nMAXVAL %d\nTUPLTYPE %s\nENDHDR\n",
              w, h, depth, maxval, tuple_type);
@@ -83,16 +106,11 @@ static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     ptr      = p->data[0];
     linesize = p->linesize[0];
 
-    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+    if (avctx->pix_fmt == AV_PIX_FMT_MONOBLACK){
         int j;
-        unsigned int v;
-
         for (i = 0; i < h; i++) {
-            for (j = 0; j < w; j++) {
-                v = ((uint32_t *)ptr)[j];
-                bytestream_put_be24(&bytestream, v);
-                *bytestream++ = v >> 24;
-            }
+            for (j = 0; j < w; j++)
+                *bytestream++ = ptr[j >> 3] >> (7 - j & 7) & 1;
             ptr += linesize;
         }
     } else {
@@ -129,7 +147,10 @@ AVCodec ff_pam_encoder = {
     .init           = pam_encode_init,
     .encode2        = pam_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB32, AV_PIX_FMT_GRAY8, AV_PIX_FMT_MONOWHITE,
-        AV_PIX_FMT_NONE
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/parser.c b/libavcodec/parser.c
index b74c22b..0a994a3 100644
--- a/libavcodec/parser.c
+++ b/libavcodec/parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,38 +24,24 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
 #include "internal.h"
 #include "parser.h"
 
-static AVCodecParser *av_first_parser = NULL;
-
-AVCodecParser *av_parser_next(const AVCodecParser *p)
-{
-    if (p)
-        return p->next;
-    else
-        return av_first_parser;
-}
-
-void av_register_codec_parser(AVCodecParser *parser)
-{
-    parser->next = av_first_parser;
-    av_first_parser = parser;
-}
-
 AVCodecParserContext *av_parser_init(int codec_id)
 {
-    AVCodecParserContext *s;
-    AVCodecParser *parser;
+    AVCodecParserContext *s = NULL;
+    const AVCodecParser *parser;
+    void *i = 0;
     int ret;
 
     if (codec_id == AV_CODEC_ID_NONE)
         return NULL;
 
-    for (parser = av_first_parser; parser != NULL; parser = parser->next) {
+    while ((parser = av_parser_iterate(&i))) {
         if (parser->codec_ids[0] == codec_id ||
             parser->codec_ids[1] == codec_id ||
             parser->codec_ids[2] == codec_id ||
@@ -68,25 +54,18 @@ AVCodecParserContext *av_parser_init(int codec_id)
 found:
     s = av_mallocz(sizeof(AVCodecParserContext));
     if (!s)
-        return NULL;
-    s->parser = parser;
-    if (parser->priv_data_size) {
-        s->priv_data = av_mallocz(parser->priv_data_size);
-        if (!s->priv_data) {
-            av_free(s);
-            return NULL;
-        }
-    }
+        goto err_out;
+    s->parser = (AVCodecParser*)parser;
+    s->priv_data = av_mallocz(parser->priv_data_size);
+    if (!s->priv_data)
+        goto err_out;
+    s->fetch_timestamp=1;
+    s->pict_type = AV_PICTURE_TYPE_I;
     if (parser->parser_init) {
         ret = parser->parser_init(s);
-        if (ret != 0) {
-            av_free(s->priv_data);
-            av_free(s);
-            return NULL;
-        }
+        if (ret != 0)
+            goto err_out;
     }
-    s->fetch_timestamp      = 1;
-    s->pict_type            = AV_PICTURE_TYPE_I;
     s->key_frame            = -1;
 #if FF_API_CONVERGENCE_DURATION
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -99,25 +78,37 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->format               = -1;
 
     return s;
+
+err_out:
+    if (s)
+        av_freep(&s->priv_data);
+    av_free(s);
+    return NULL;
 }
 
-void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove)
+void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy)
 {
     int i;
 
-    s->dts    =
-    s->pts    = AV_NOPTS_VALUE;
-    s->pos    = -1;
-    s->offset = 0;
+    if (!fuzzy) {
+        s->dts    =
+        s->pts    = AV_NOPTS_VALUE;
+        s->pos    = -1;
+        s->offset = 0;
+    }
     for (i = 0; i < AV_PARSER_PTS_NB; i++) {
         if (s->cur_offset + off >= s->cur_frame_offset[i] &&
             (s->frame_offset < s->cur_frame_offset[i] ||
-             (!s->frame_offset && !s->next_frame_offset)) &&
-            s->cur_frame_end[i]) {
-            s->dts    = s->cur_frame_dts[i];
-            s->pts    = s->cur_frame_pts[i];
-            s->pos    = s->cur_frame_pos[i];
-            s->offset = s->next_frame_offset - s->cur_frame_offset[i];
+             (!s->frame_offset && !s->next_frame_offset)) && // first field/frame
+            // check disabled since MPEG-TS does not send complete PES packets
+            /*s->next_frame_offset + off <*/  s->cur_frame_end[i]){
+
+            if (!fuzzy || s->cur_frame_dts[i] != AV_NOPTS_VALUE) {
+                s->dts    = s->cur_frame_dts[i];
+                s->pts    = s->cur_frame_pts[i];
+                s->pos    = s->cur_frame_pos[i];
+                s->offset = s->next_frame_offset - s->cur_frame_offset[i];
+            }
             if (remove)
                 s->cur_frame_offset[i] = INT64_MAX;
             if (s->cur_offset + off < s->cur_frame_end[i])
@@ -134,6 +125,15 @@ int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
     int index, i;
     uint8_t dummy_buf[AV_INPUT_BUFFER_PADDING_SIZE];
 
+    av_assert1(avctx->codec_id != AV_CODEC_ID_NONE);
+
+    /* Parsers only work for the specified codec ids. */
+    av_assert1(avctx->codec_id == s->parser->codec_ids[0] ||
+               avctx->codec_id == s->parser->codec_ids[1] ||
+               avctx->codec_id == s->parser->codec_ids[2] ||
+               avctx->codec_id == s->parser->codec_ids[3] ||
+               avctx->codec_id == s->parser->codec_ids[4]);
+
     if (!(s->flags & PARSER_FLAG_FETCHED_OFFSET)) {
         s->next_frame_offset =
         s->cur_offset        = pos;
@@ -160,11 +160,17 @@ int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
         s->last_pts        = s->pts;
         s->last_dts        = s->dts;
         s->last_pos        = s->pos;
-        ff_fetch_timestamp(s, 0, 0);
+        ff_fetch_timestamp(s, 0, 0, 0);
     }
     /* WARNING: the returned index can be negative */
     index = s->parser->parser_parse(s, avctx, (const uint8_t **) poutbuf,
                                     poutbuf_size, buf, buf_size);
+    av_assert0(index > -0x20000000); // The API does not allow returning AVERROR codes
+#define FILL(name) if(s->name > 0 && avctx->name <= 0) avctx->name = s->name
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+        FILL(field_order);
+    }
+
     /* update the file pointer */
     if (*poutbuf_size) {
         /* fill the data for the current frame */
@@ -220,7 +226,7 @@ void av_parser_close(AVCodecParserContext *s)
     if (s) {
         if (s->parser->parser_close)
             s->parser->parser_close(s);
-        av_free(s->priv_data);
+        av_freep(&s->priv_data);
         av_free(s);
     }
 }
@@ -251,14 +257,19 @@ int ff_combine_frame(ParseContext *pc, int next,
                                            *buf_size + pc->index +
                                            AV_INPUT_BUFFER_PADDING_SIZE);
 
-        if (!new_buffer)
+        if (!new_buffer) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", *buf_size + pc->index + AV_INPUT_BUFFER_PADDING_SIZE);
+            pc->index = 0;
             return AVERROR(ENOMEM);
+        }
         pc->buffer = new_buffer;
         memcpy(&pc->buffer[pc->index], *buf, *buf_size);
         pc->index += *buf_size;
         return -1;
     }
 
+    av_assert0(next >= 0 || pc->buffer);
+
     *buf_size          =
     pc->overread_index = pc->index + next;
 
@@ -267,9 +278,12 @@ int ff_combine_frame(ParseContext *pc, int next,
         void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
                                            next + pc->index +
                                            AV_INPUT_BUFFER_PADDING_SIZE);
-
-        if (!new_buffer)
+        if (!new_buffer) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", next + pc->index + AV_INPUT_BUFFER_PADDING_SIZE);
+            pc->overread_index =
+            pc->index = 0;
             return AVERROR(ENOMEM);
+        }
         pc->buffer = new_buffer;
         if (next > -AV_INPUT_BUFFER_PADDING_SIZE)
             memcpy(&pc->buffer[pc->index], *buf,
@@ -304,13 +318,14 @@ void ff_parse_close(AVCodecParserContext *s)
 
 int ff_mpeg4video_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 {
-    int i;
     uint32_t state = -1;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
 
-    for (i = 0; i < buf_size; i++) {
-        state = state << 8 | buf[i];
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
         if (state == 0x1B3 || state == 0x1B6)
-            return i - 3;
+            return ptr - 4 - buf;
     }
+
     return 0;
 }
diff --git a/libavcodec/parser.h b/libavcodec/parser.h
index ea1cae2..ef35547 100644
--- a/libavcodec/parser.h
+++ b/libavcodec/parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,7 +53,8 @@ void ff_parse_close(AVCodecParserContext *s);
  * Fetch timestamps for a specific byte within the current access unit.
  * @param off byte position within the access unit
  * @param remove Found timestamps will be removed if set to 1, kept if set to 0.
+ * @param fuzzy Only use found value if it is more informative than what we already have
  */
-void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove);
+void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy);
 
 #endif /* AVCODEC_PARSER_H */
diff --git a/libavcodec/parsers.c b/libavcodec/parsers.c
new file mode 100644
index 0000000..33a71de
--- /dev/null
+++ b/libavcodec/parsers.c
@@ -0,0 +1,110 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/thread.h"
+
+#include "avcodec.h"
+
+extern AVCodecParser ff_aac_parser;
+extern AVCodecParser ff_aac_latm_parser;
+extern AVCodecParser ff_ac3_parser;
+extern AVCodecParser ff_adx_parser;
+extern AVCodecParser ff_av1_parser;
+extern AVCodecParser ff_avs2_parser;
+extern AVCodecParser ff_bmp_parser;
+extern AVCodecParser ff_cavsvideo_parser;
+extern AVCodecParser ff_cook_parser;
+extern AVCodecParser ff_dca_parser;
+extern AVCodecParser ff_dirac_parser;
+extern AVCodecParser ff_dnxhd_parser;
+extern AVCodecParser ff_dpx_parser;
+extern AVCodecParser ff_dvaudio_parser;
+extern AVCodecParser ff_dvbsub_parser;
+extern AVCodecParser ff_dvdsub_parser;
+extern AVCodecParser ff_dvd_nav_parser;
+extern AVCodecParser ff_flac_parser;
+extern AVCodecParser ff_g723_1_parser;
+extern AVCodecParser ff_g729_parser;
+extern AVCodecParser ff_gif_parser;
+extern AVCodecParser ff_gsm_parser;
+extern AVCodecParser ff_h261_parser;
+extern AVCodecParser ff_h263_parser;
+extern AVCodecParser ff_h264_parser;
+extern AVCodecParser ff_hevc_parser;
+extern AVCodecParser ff_mjpeg_parser;
+extern AVCodecParser ff_mlp_parser;
+extern AVCodecParser ff_mpeg4video_parser;
+extern AVCodecParser ff_mpegaudio_parser;
+extern AVCodecParser ff_mpegvideo_parser;
+extern AVCodecParser ff_opus_parser;
+extern AVCodecParser ff_png_parser;
+extern AVCodecParser ff_pnm_parser;
+extern AVCodecParser ff_rv30_parser;
+extern AVCodecParser ff_rv40_parser;
+extern AVCodecParser ff_sbc_parser;
+extern AVCodecParser ff_sipr_parser;
+extern AVCodecParser ff_tak_parser;
+extern AVCodecParser ff_vc1_parser;
+extern AVCodecParser ff_vorbis_parser;
+extern AVCodecParser ff_vp3_parser;
+extern AVCodecParser ff_vp8_parser;
+extern AVCodecParser ff_vp9_parser;
+extern AVCodecParser ff_xma_parser;
+
+#include "libavcodec/parser_list.c"
+
+static AVOnce av_parser_next_init = AV_ONCE_INIT;
+
+static void av_parser_init_next(void)
+{
+    AVCodecParser *prev = NULL, *p;
+    int i = 0;
+    while ((p = (AVCodecParser*)parser_list[i++])) {
+        if (prev)
+            prev->next = p;
+        prev = p;
+    }
+}
+
+AVCodecParser *av_parser_next(const AVCodecParser *p)
+{
+    ff_thread_once(&av_parser_next_init, av_parser_init_next);
+
+    if (p)
+        return p->next;
+    else
+        return (AVCodecParser*)parser_list[0];
+}
+
+const AVCodecParser *av_parser_iterate(void **opaque)
+{
+    uintptr_t i = (uintptr_t)*opaque;
+    const AVCodecParser *p = parser_list[i];
+
+    if (p)
+        *opaque = (void*)(i + 1);
+
+    return p;
+}
+
+void av_register_codec_parser(AVCodecParser *parser)
+{
+    ff_thread_once(&av_parser_next_init, av_parser_init_next);
+}
diff --git a/libavcodec/pcm-bluray.c b/libavcodec/pcm-bluray.c
index 51fcd2d..517d7b5 100644
--- a/libavcodec/pcm-bluray.c
+++ b/libavcodec/pcm-bluray.c
@@ -2,20 +2,20 @@
  * LPCM codecs for PCM format found in Blu-ray PCM streams
  * Copyright (c) 2009, 2013 Christian Schmidt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,13 +71,14 @@ static int pcm_bluray_parse_header(AVCodecContext *avctx,
 
     /* get the sample depth and derive the sample format from it */
     avctx->bits_per_coded_sample = bits_per_samples[header[3] >> 6];
-    if (!avctx->bits_per_coded_sample) {
-        av_log(avctx, AV_LOG_ERROR, "reserved sample depth (0)\n");
+    if (!(avctx->bits_per_coded_sample == 16 || avctx->bits_per_coded_sample == 24)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported sample depth (%d)\n", avctx->bits_per_coded_sample);
         return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = avctx->bits_per_coded_sample == 16 ? AV_SAMPLE_FMT_S16
                                                            : AV_SAMPLE_FMT_S32;
-    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S32)
+        avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     /* get the sample rate. Not all values are used. */
     switch (header[2] & 0x0f) {
@@ -116,7 +117,7 @@ static int pcm_bluray_parse_header(AVCodecContext *avctx,
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
         ff_dlog(avctx,
-                "pcm_bluray_parse_header: %d channels, %d bits per sample, %d Hz, %d bit/s\n",
+                "pcm_bluray_parse_header: %d channels, %d bits per sample, %d Hz, %"PRId64" bit/s\n",
                 avctx->channels, avctx->bits_per_coded_sample,
                 avctx->sample_rate, avctx->bit_rate);
     return 0;
@@ -154,10 +155,8 @@ static int pcm_bluray_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = samples;
-    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0)
         return retval;
-    }
     dst16 = (int16_t *)frame->data[0];
     dst32 = (int32_t *)frame->data[0];
 
diff --git a/libavcodec/pcm-dvd.c b/libavcodec/pcm-dvd.c
index 62aacf8..0a751a8 100644
--- a/libavcodec/pcm-dvd.c
+++ b/libavcodec/pcm-dvd.c
@@ -2,20 +2,20 @@
  * LPCM codecs for PCM formats found in Video DVD streams
  * Copyright (c) 2013 Christian Schmidt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 typedef struct PCMDVDContext {
     uint32_t last_header;    // Cached header to see if parsing is needed
     int block_size;          // Size of a block of samples in bytes
+    int last_block_size;     // Size of the last block of samples in bytes
     int samples_per_block;   // Number of samples per channel per block
     int groups_per_block;    // Number of 20/24-bit sample groups per block
     uint8_t *extra_samples;  // Pointer to leftover samples from a frame
@@ -69,9 +70,10 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
     /* early exit if the header didn't change apart from the frame number */
     if (s->last_header == header_int)
         return 0;
+    s->last_header = -1;
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
-        ff_dlog(avctx, "pcm_dvd_parse_header: header = %02x%02x%02x\n",
+        av_log(avctx, AV_LOG_DEBUG, "pcm_dvd_parse_header: header = %02x%02x%02x\n",
                 header[0], header[1], header[2]);
     /*
      * header[0] emphasis (1), muse(1), reserved(1), frame number(5)
@@ -85,7 +87,9 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
     /* get the sample depth and derive the sample format from it */
     avctx->bits_per_coded_sample = 16 + (header[1] >> 6 & 3) * 4;
     if (avctx->bits_per_coded_sample == 28) {
-        av_log(avctx, AV_LOG_ERROR, "PCM DVD unsupported sample depth\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "PCM DVD unsupported sample depth %i\n",
+               avctx->bits_per_coded_sample);
         return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = avctx->bits_per_coded_sample == 16 ? AV_SAMPLE_FMT_S16
@@ -136,7 +140,7 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
         ff_dlog(avctx,
-                "pcm_dvd_parse_header: %d channels, %d bits per sample, %d Hz, %d bit/s\n",
+                "pcm_dvd_parse_header: %d channels, %d bits per sample, %d Hz, %"PRId64" bit/s\n",
                 avctx->channels, avctx->bits_per_coded_sample,
                 avctx->sample_rate, avctx->bit_rate);
 
@@ -170,6 +174,17 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
         return dst16;
     }
     case 20:
+        if (avctx->channels == 1) {
+            do {
+                for (i = 2; i; i--) {
+                    dst32[0] = bytestream2_get_be16u(&gb) << 16;
+                    dst32[1] = bytestream2_get_be16u(&gb) << 16;
+                    t = bytestream2_get_byteu(&gb);
+                    *dst32++ += (t & 0xf0) << 8;
+                    *dst32++ += (t & 0x0f) << 12;
+                }
+            } while (--blocks);
+        } else {
         do {
             for (i = s->groups_per_block; i; i--) {
                 dst32[0] = bytestream2_get_be16u(&gb) << 16;
@@ -184,8 +199,19 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
                 *dst32++ += (t & 0x0f) << 12;
             }
         } while (--blocks);
+        }
         return dst32;
     case 24:
+        if (avctx->channels == 1) {
+            do {
+                for (i = 2; i; i--) {
+                    dst32[0] = bytestream2_get_be16u(&gb) << 16;
+                    dst32[1] = bytestream2_get_be16u(&gb) << 16;
+                    *dst32++ += bytestream2_get_byteu(&gb) << 8;
+                    *dst32++ += bytestream2_get_byteu(&gb) << 8;
+                }
+            } while (--blocks);
+        } else {
         do {
             for (i = s->groups_per_block; i; i--) {
                 dst32[0] = bytestream2_get_be16u(&gb) << 16;
@@ -198,6 +224,7 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
                 *dst32++ += bytestream2_get_byteu(&gb) << 8;
             }
         } while (--blocks);
+        }
         return dst32;
     default:
         return NULL;
@@ -222,6 +249,11 @@ static int pcm_dvd_decode_frame(AVCodecContext *avctx, void *data,
 
     if ((retval = pcm_dvd_parse_header(avctx, src)))
         return retval;
+    if (s->last_block_size && s->last_block_size != s->block_size) {
+        av_log(avctx, AV_LOG_WARNING, "block_size has changed %d != %d\n", s->last_block_size, s->block_size);
+        s->extra_sample_count = 0;
+    }
+    s->last_block_size = s->block_size;
     src      += 3;
     buf_size -= 3;
 
@@ -229,10 +261,8 @@ static int pcm_dvd_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = blocks * s->samples_per_block;
-    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0)
         return retval;
-    }
     dst = frame->data[0];
 
     /* consume leftover samples from last packet */
diff --git a/libavcodec/pcm-dvdenc.c b/libavcodec/pcm-dvdenc.c
new file mode 100644
index 0000000..d26eaf0
--- /dev/null
+++ b/libavcodec/pcm-dvdenc.c
@@ -0,0 +1,197 @@
+/*
+ * LPCM codecs for PCM formats found in Video DVD streams
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct PCMDVDContext {
+    uint8_t header[3];       // Header added to every frame
+    int block_size;          // Size of a block of samples in bytes
+    int samples_per_block;   // Number of samples per channel per block
+    int groups_per_block;    // Number of 20/24-bit sample groups per block
+    uint8_t *extra_samples;  // Pointer to leftover samples from a frame
+    int extra_sample_count;  // Number of leftover samples in the buffer
+} PCMDVDContext;
+
+static av_cold int pcm_dvd_encode_init(AVCodecContext *avctx)
+{
+    PCMDVDContext *s = avctx->priv_data;
+    int quant, freq, frame_size;
+
+    switch (avctx->sample_rate) {
+    case 48000:
+        freq = 0;
+        break;
+    case 96000:
+        freq = 1;
+        break;
+    }
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_S16:
+        avctx->bits_per_coded_sample = 16;
+        quant = 0;
+        break;
+    case AV_SAMPLE_FMT_S32:
+        avctx->bits_per_coded_sample = 24;
+        quant = 2;
+        break;
+    }
+
+    avctx->bits_per_coded_sample = 16 + quant * 4;
+    avctx->block_align           = avctx->channels * avctx->bits_per_coded_sample / 8;
+    avctx->bit_rate              = avctx->block_align * 8LL * avctx->sample_rate;
+    if (avctx->bit_rate > 9800000) {
+        av_log(avctx, AV_LOG_ERROR, "Too big bitrate: reduce sample rate, bitdepth or channels.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S16) {
+        s->samples_per_block = 1;
+        s->block_size        = avctx->channels * 2;
+        frame_size           = 2008 / s->block_size;
+    } else {
+        switch (avctx->channels) {
+        case 1:
+        case 2:
+        case 4:
+            /* one group has all the samples needed */
+            s->block_size        = 4 * avctx->bits_per_coded_sample / 8;
+            s->samples_per_block = 4 / avctx->channels;
+            s->groups_per_block  = 1;
+            break;
+        case 8:
+            /* two groups have all the samples needed */
+            s->block_size        = 8 * avctx->bits_per_coded_sample / 8;
+            s->samples_per_block = 1;
+            s->groups_per_block  = 2;
+            break;
+        default:
+            /* need avctx->channels groups */
+            s->block_size        = 4 * avctx->channels *
+                                   avctx->bits_per_coded_sample / 8;
+            s->samples_per_block = 4;
+            s->groups_per_block  = avctx->channels;
+            break;
+        }
+
+        frame_size = FFALIGN(2008 / s->block_size, s->samples_per_block);
+    }
+
+    s->header[0] = 0x0c;
+    s->header[1] = (quant << 6) | (freq << 4) | (avctx->channels - 1);
+    s->header[2] = 0x80;
+
+    if (!avctx->frame_size)
+        avctx->frame_size = frame_size;
+
+    return 0;
+}
+
+static int pcm_dvd_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                const AVFrame *frame, int *got_packet_ptr)
+{
+    PCMDVDContext *s = avctx->priv_data;
+    int samples = frame->nb_samples * avctx->channels;
+    int64_t pkt_size = (frame->nb_samples / s->samples_per_block) * s->block_size + 3;
+    int blocks = (pkt_size - 3) / s->block_size;
+    const int16_t *src16;
+    const int32_t *src32;
+    PutByteContext pb;
+    int ret;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, pkt_size, 0)) < 0)
+        return ret;
+
+    memcpy(avpkt->data, s->header, 3);
+
+    src16 = (const int16_t *)frame->data[0];
+    src32 = (const int32_t *)frame->data[0];
+
+    bytestream2_init_writer(&pb, avpkt->data + 3, avpkt->size - 3);
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_S16:
+        do {
+            bytestream2_put_be16(&pb, *src16++);
+        } while (--samples);
+        break;
+    case AV_SAMPLE_FMT_S32:
+        if (avctx->channels == 1) {
+            do {
+                for (int i = 2; i; i--) {
+                    bytestream2_put_be16(&pb, src32[0] >> 16);
+                    bytestream2_put_be16(&pb, src32[1] >> 16);
+                    bytestream2_put_byte(&pb, (*src32++) >> 24);
+                    bytestream2_put_byte(&pb, (*src32++) >> 24);
+                }
+            } while (--blocks);
+        } else {
+            do {
+                for (int i = s->groups_per_block; i; i--) {
+                    bytestream2_put_be16(&pb, src32[0] >> 16);
+                    bytestream2_put_be16(&pb, src32[1] >> 16);
+                    bytestream2_put_be16(&pb, src32[2] >> 16);
+                    bytestream2_put_be16(&pb, src32[3] >> 16);
+                    bytestream2_put_byte(&pb, (*src32++) >> 24);
+                    bytestream2_put_byte(&pb, (*src32++) >> 24);
+                    bytestream2_put_byte(&pb, (*src32++) >> 24);
+                    bytestream2_put_byte(&pb, (*src32++) >> 24);
+                }
+            } while (--blocks);
+        }
+        break;
+    }
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = pkt_size;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+
+    return 0;
+}
+
+static av_cold int pcm_dvd_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_pcm_dvd_encoder = {
+    .name           = "pcm_dvd",
+    .long_name      = NULL_IF_CONFIG_SMALL("PCM signed 16|20|24-bit big-endian for DVD media"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_PCM_DVD,
+    .priv_data_size = sizeof(PCMDVDContext),
+    .init           = pcm_dvd_encode_init,
+    .close          = pcm_dvd_encode_close,
+    .encode2        = pcm_dvd_encode_frame,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .supported_samplerates = (const int[]) { 48000, 96000, 0},
+    .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                            AV_CH_LAYOUT_STEREO,
+                                            AV_CH_LAYOUT_5POINT1,
+                                            AV_CH_LAYOUT_7POINT1,
+                                            0 },
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                     AV_SAMPLE_FMT_S32,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c
index 959c50b..ffcbccc 100644
--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -2,20 +2,20 @@
  * PCM codecs
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -41,13 +42,16 @@ static av_cold int pcm_encode_init(AVCodecContext *avctx)
     case AV_CODEC_ID_PCM_MULAW:
         pcm_ulaw_tableinit();
         break;
+    case AV_CODEC_ID_PCM_VIDC:
+        pcm_vidc_tableinit();
+        break;
     default:
         break;
     }
 
     avctx->bits_per_coded_sample = av_get_bits_per_sample(avctx->codec->id);
     avctx->block_align           = avctx->channels * avctx->bits_per_coded_sample / 8;
-    avctx->bit_rate              = avctx->block_align * avctx->sample_rate * 8;
+    avctx->bit_rate              = avctx->block_align * 8LL * avctx->sample_rate;
 
     return 0;
 }
@@ -69,13 +73,24 @@ static av_cold int pcm_encode_init(AVCodecContext *avctx)
         bytestream_put_ ## endian(&dst, v);                             \
     }
 
+#define ENCODE_PLANAR(type, endian, dst, n, shift, offset)              \
+    n /= avctx->channels;                                               \
+    for (c = 0; c < avctx->channels; c++) {                             \
+        int i;                                                          \
+        samples_ ## type = (const type *) frame->extended_data[c];      \
+        for (i = n; i > 0; i--) {                                       \
+            register type v = (*samples_ ## type++ >> shift) + offset;  \
+            bytestream_put_ ## endian(&dst, v);                         \
+        }                                                               \
+    }
+
 static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                             const AVFrame *frame, int *got_packet_ptr)
 {
-    int n, sample_size, v, ret;
+    int n, c, sample_size, v, ret;
     const short *samples;
     unsigned char *dst;
-    const uint8_t *srcu8;
+    const uint8_t *samples_uint8_t;
     const int16_t *samples_int16_t;
     const int32_t *samples_int32_t;
     const int64_t *samples_int64_t;
@@ -86,10 +101,8 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     n           = frame->nb_samples * avctx->channels;
     samples     = (const short *)frame->data[0];
 
-    if ((ret = ff_alloc_packet(avpkt, n * sample_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, n * sample_size, n * sample_size)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     switch (avctx->codec->id) {
@@ -102,6 +115,9 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_S24LE:
         ENCODE(int32_t, le24, samples, dst, n, 8, 0)
         break;
+    case AV_CODEC_ID_PCM_S24LE_PLANAR:
+        ENCODE_PLANAR(int32_t, le24, dst, n, 8, 0)
+        break;
     case AV_CODEC_ID_PCM_S24BE:
         ENCODE(int32_t, be24, samples, dst, n, 8, 0)
         break;
@@ -127,13 +143,13 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         ENCODE(uint16_t, be16, samples, dst, n, 0, 0x8000)
         break;
     case AV_CODEC_ID_PCM_S8:
-        srcu8 = frame->data[0];
-        for (; n > 0; n--) {
-            v      = *srcu8++;
-            *dst++ = v - 128;
-        }
+        ENCODE(uint8_t, byte, samples, dst, n, 0, -128)
+        break;
+    case AV_CODEC_ID_PCM_S8_PLANAR:
+        ENCODE_PLANAR(uint8_t, byte, dst, n, 0, -128)
         break;
 #if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S64LE:
     case AV_CODEC_ID_PCM_F64LE:
         ENCODE(int64_t, le64, samples, dst, n, 0, 0)
         break;
@@ -141,14 +157,22 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_F32LE:
         ENCODE(int32_t, le32, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+        ENCODE_PLANAR(int32_t, le32, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_S16LE:
         ENCODE(int16_t, le16, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+        ENCODE_PLANAR(int16_t, le16, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F32BE:
+    case AV_CODEC_ID_PCM_S64BE:
     case AV_CODEC_ID_PCM_S32BE:
     case AV_CODEC_ID_PCM_S16BE:
 #else
+    case AV_CODEC_ID_PCM_S64BE:
     case AV_CODEC_ID_PCM_F64BE:
         ENCODE(int64_t, be64, samples, dst, n, 0, 0)
         break;
@@ -159,14 +183,29 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_S16BE:
         ENCODE(int16_t, be16, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+        ENCODE_PLANAR(int16_t, be16, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_F64LE:
     case AV_CODEC_ID_PCM_F32LE:
+    case AV_CODEC_ID_PCM_S64LE:
     case AV_CODEC_ID_PCM_S32LE:
     case AV_CODEC_ID_PCM_S16LE:
 #endif /* HAVE_BIGENDIAN */
     case AV_CODEC_ID_PCM_U8:
         memcpy(dst, samples, n * sample_size);
-        dst += n * sample_size;
+        break;
+#if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+#else
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+#endif /* HAVE_BIGENDIAN */
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            const uint8_t *src = frame->extended_data[c];
+            bytestream_put_buffer(&dst, src, n * sample_size);
+        }
         break;
     case AV_CODEC_ID_PCM_ALAW:
         for (; n > 0; n--) {
@@ -180,6 +219,12 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             *dst++ = linear_to_ulaw[(v + 32768) >> 2];
         }
         break;
+    case AV_CODEC_ID_PCM_VIDC:
+        for (; n > 0; n--) {
+            v      = *samples++;
+            *dst++ = linear_to_vidc[(v + 32768) >> 2];
+        }
+        break;
     default:
         return -1;
     }
@@ -190,6 +235,8 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
 typedef struct PCMDecode {
     short   table[256];
+    AVFloatDSPContext *fdsp;
+    float   scale;
 } PCMDecode;
 
 static av_cold int pcm_decode_init(AVCodecContext *avctx)
@@ -202,7 +249,7 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    switch (avctx->codec->id) {
+    switch (avctx->codec_id) {
     case AV_CODEC_ID_PCM_ALAW:
         for (i = 0; i < 256; i++)
             s->table[i] = alaw2linear(i);
@@ -211,6 +258,17 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         for (i = 0; i < 256; i++)
             s->table[i] = ulaw2linear(i);
         break;
+    case AV_CODEC_ID_PCM_VIDC:
+        for (i = 0; i < 256; i++)
+            s->table[i] = vidc2linear(i);
+        break;
+    case AV_CODEC_ID_PCM_F16LE:
+    case AV_CODEC_ID_PCM_F24LE:
+        s->scale = 1. / (1 << (avctx->bits_per_coded_sample - 1));
+        s->fdsp = avpriv_float_dsp_alloc(0);
+        if (!s->fdsp)
+            return AVERROR(ENOMEM);
+        break;
     default:
         break;
     }
@@ -218,7 +276,16 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
     avctx->sample_fmt = avctx->codec->sample_fmts[0];
 
     if (avctx->sample_fmt == AV_SAMPLE_FMT_S32)
-        avctx->bits_per_raw_sample = av_get_bits_per_sample(avctx->codec->id);
+        avctx->bits_per_raw_sample = av_get_bits_per_sample(avctx->codec_id);
+
+    return 0;
+}
+
+static av_cold int pcm_decode_close(AVCodecContext *avctx)
+{
+    PCMDecode *s = avctx->priv_data;
+
+    av_freep(&s->fdsp);
 
     return 0;
 }
@@ -240,28 +307,17 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         dst += size / 8;                                                \
     }
 
-#if HAVE_BIGENDIAN
 #define DECODE_PLANAR(size, endian, src, dst, n, shift, offset)         \
-    {                                                                   \
-        int n2;                                                         \
-        n /= avctx->channels;                                           \
-        for (c = 0; c < avctx->channels; c++) {                         \
-            samples = frame->extended_data[c];                          \
-            n2 = n;                                                     \
-            DECODE(size, endian, src, samples, n2, 0, 0)                \
+    n /= avctx->channels;                                               \
+    for (c = 0; c < avctx->channels; c++) {                             \
+        int i;                                                          \
+        dst = frame->extended_data[c];                                \
+        for (i = n; i > 0; i--) {                                       \
+            uint ## size ## _t v = bytestream_get_ ## endian(&src);     \
+            AV_WN ## size ## A(dst, (v - offset) << shift);             \
+            dst += size / 8;                                            \
         }                                                               \
     }
-#else
-#define DECODE_PLANAR(size, endian, src, dst, n, shift, offset)         \
-    {                                                                   \
-        n /= avctx->channels;                                           \
-        for (c = 0; c < avctx->channels; c++) {                         \
-            samples = frame->extended_data[c];                          \
-            memcpy(samples, src, n * size / 8);                         \
-            src += n * size / 8;                                        \
-        }                                                               \
-    }
-#endif /* HAVE_BIGENDIAN */
 
 static int pcm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
@@ -289,12 +345,24 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR(EINVAL);
     }
 
+    if (avctx->channels == 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->codec_id != avctx->codec->id) {
+        av_log(avctx, AV_LOG_ERROR, "codec ids mismatch\n");
+        return AVERROR(EINVAL);
+    }
+
     n = avctx->channels * sample_size;
 
     if (n && buf_size % n) {
         if (buf_size < n) {
-            av_log(avctx, AV_LOG_ERROR, "invalid PCM packet\n");
-            return -1;
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid PCM packet, data has size %d but at least a size of %d was expected\n",
+                   buf_size, n);
+            return AVERROR_INVALIDDATA;
         } else
             buf_size -= buf_size % n;
     }
@@ -303,13 +371,11 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = n * samples_per_block / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = frame->data[0];
 
-    switch (avctx->codec->id) {
+    switch (avctx->codec_id) {
     case AV_CODEC_ID_PCM_U32LE:
         DECODE(32, le32, src, samples, n, 0, 0x80000000)
         break;
@@ -319,6 +385,9 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_S24LE:
         DECODE(32, le24, src, samples, n, 8, 0)
         break;
+    case AV_CODEC_ID_PCM_S24LE_PLANAR:
+        DECODE_PLANAR(32, le24, src, samples, n, 8, 0);
+        break;
     case AV_CODEC_ID_PCM_S24BE:
         DECODE(32, be24, src, samples, n, 8, 0)
         break;
@@ -337,18 +406,6 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
             samples += 2;
         }
         break;
-    case AV_CODEC_ID_PCM_S16BE_PLANAR:
-        DECODE_PLANAR(16, be16, src, samples, n, 0, 0);
-        break;
-    case AV_CODEC_ID_PCM_S16LE_PLANAR:
-        DECODE_PLANAR(16, le16, src, samples, n, 0, 0);
-        break;
-    case AV_CODEC_ID_PCM_S24LE_PLANAR:
-        DECODE_PLANAR(32, le24, src, samples, n, 8, 0);
-        break;
-    case AV_CODEC_ID_PCM_S32LE_PLANAR:
-        DECODE_PLANAR(32, le32, src, samples, n, 0, 0);
-        break;
     case AV_CODEC_ID_PCM_U16LE:
         DECODE(16, le16, src, samples, n, 0, 0x8000)
         break;
@@ -359,22 +416,42 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         for (; n > 0; n--)
             *samples++ = *src++ + 128;
         break;
+    case AV_CODEC_ID_PCM_S8_PLANAR:
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            int i;
+            samples = frame->extended_data[c];
+            for (i = n; i > 0; i--)
+                *samples++ = *src++ + 128;
+        }
+        break;
 #if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S64LE:
     case AV_CODEC_ID_PCM_F64LE:
         DECODE(64, le64, src, samples, n, 0, 0)
         break;
     case AV_CODEC_ID_PCM_S32LE:
     case AV_CODEC_ID_PCM_F32LE:
+    case AV_CODEC_ID_PCM_F24LE:
+    case AV_CODEC_ID_PCM_F16LE:
         DECODE(32, le32, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+        DECODE_PLANAR(32, le32, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_S16LE:
         DECODE(16, le16, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+        DECODE_PLANAR(16, le16, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F32BE:
+    case AV_CODEC_ID_PCM_S64BE:
     case AV_CODEC_ID_PCM_S32BE:
     case AV_CODEC_ID_PCM_S16BE:
 #else
+    case AV_CODEC_ID_PCM_S64BE:
     case AV_CODEC_ID_PCM_F64BE:
         DECODE(64, be64, src, samples, n, 0, 0)
         break;
@@ -385,14 +462,32 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_S16BE:
         DECODE(16, be16, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+        DECODE_PLANAR(16, be16, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_F64LE:
     case AV_CODEC_ID_PCM_F32LE:
+    case AV_CODEC_ID_PCM_F24LE:
+    case AV_CODEC_ID_PCM_F16LE:
+    case AV_CODEC_ID_PCM_S64LE:
     case AV_CODEC_ID_PCM_S32LE:
     case AV_CODEC_ID_PCM_S16LE:
 #endif /* HAVE_BIGENDIAN */
     case AV_CODEC_ID_PCM_U8:
         memcpy(samples, src, n * sample_size);
         break;
+#if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+#else
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+#endif /* HAVE_BIGENDIAN */
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            samples = frame->extended_data[c];
+            bytestream_get_buffer(&src, samples, n * sample_size);
+        }
+        break;
     case AV_CODEC_ID_PCM_ZORK:
         for (; n > 0; n--) {
             int v = *src++;
@@ -403,6 +498,7 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         break;
     case AV_CODEC_ID_PCM_ALAW:
     case AV_CODEC_ID_PCM_MULAW:
+    case AV_CODEC_ID_PCM_VIDC:
         for (; n > 0; n--) {
             AV_WN16A(samples, s->table[*src++]);
             samples += 2;
@@ -436,6 +532,14 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         return -1;
     }
 
+    if (avctx->codec_id == AV_CODEC_ID_PCM_F16LE ||
+        avctx->codec_id == AV_CODEC_ID_PCM_F24LE) {
+        s->fdsp->vector_fmul_scalar((float *)frame->extended_data[0],
+                                    (const float *)frame->extended_data[0],
+                                    s->scale, FFALIGN(frame->nb_samples * avctx->channels, 4));
+        emms_c();
+    }
+
     *got_frame_ptr = 1;
 
     return buf_size;
@@ -471,6 +575,7 @@ AVCodec ff_ ## name_ ## _decoder = {                                        \
     .id             = AV_CODEC_ID_ ## id_,                                  \
     .priv_data_size = sizeof(PCMDecode),                                    \
     .init           = pcm_decode_init,                                      \
+    .close          = pcm_decode_close,                                     \
     .decode         = pcm_decode_frame,                                     \
     .capabilities   = AV_CODEC_CAP_DR1,                                     \
     .sample_fmts    = (const enum AVSampleFormat[]){ sample_fmt_,           \
@@ -489,25 +594,28 @@ AVCodec ff_ ## name_ ## _decoder = {                                        \
     PCM_DECODER(id, sample_fmt_, name, long_name_)
 
 /* Note: Do not forget to add new entries to the Makefile as well. */
-PCM_CODEC  (PCM_ALAW,         AV_SAMPLE_FMT_S16, pcm_alaw,         "PCM A-law");
+PCM_CODEC  (PCM_ALAW,         AV_SAMPLE_FMT_S16, pcm_alaw,         "PCM A-law / G.711 A-law");
+PCM_DECODER(PCM_F16LE,        AV_SAMPLE_FMT_FLT, pcm_f16le,        "PCM 16.8 floating point little-endian");
+PCM_DECODER(PCM_F24LE,        AV_SAMPLE_FMT_FLT, pcm_f24le,        "PCM 24.0 floating point little-endian");
 PCM_CODEC  (PCM_F32BE,        AV_SAMPLE_FMT_FLT, pcm_f32be,        "PCM 32-bit floating point big-endian");
 PCM_CODEC  (PCM_F32LE,        AV_SAMPLE_FMT_FLT, pcm_f32le,        "PCM 32-bit floating point little-endian");
 PCM_CODEC  (PCM_F64BE,        AV_SAMPLE_FMT_DBL, pcm_f64be,        "PCM 64-bit floating point big-endian");
 PCM_CODEC  (PCM_F64LE,        AV_SAMPLE_FMT_DBL, pcm_f64le,        "PCM 64-bit floating point little-endian");
-PCM_DECODER(PCM_LXF,          AV_SAMPLE_FMT_S32P, pcm_lxf,          "PCM signed 20-bit little-endian planar");
-PCM_CODEC  (PCM_MULAW,        AV_SAMPLE_FMT_S16, pcm_mulaw,        "PCM mu-law");
+PCM_DECODER(PCM_LXF,          AV_SAMPLE_FMT_S32P,pcm_lxf,          "PCM signed 20-bit little-endian planar");
+PCM_CODEC  (PCM_MULAW,        AV_SAMPLE_FMT_S16, pcm_mulaw,        "PCM mu-law / G.711 mu-law");
 PCM_CODEC  (PCM_S8,           AV_SAMPLE_FMT_U8,  pcm_s8,           "PCM signed 8-bit");
+PCM_CODEC  (PCM_S8_PLANAR,    AV_SAMPLE_FMT_U8P, pcm_s8_planar,    "PCM signed 8-bit planar");
 PCM_CODEC  (PCM_S16BE,        AV_SAMPLE_FMT_S16, pcm_s16be,        "PCM signed 16-bit big-endian");
-PCM_DECODER(PCM_S16BE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16be_planar, "PCM signed 16-bit big-endian planar");
+PCM_CODEC  (PCM_S16BE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16be_planar, "PCM signed 16-bit big-endian planar");
 PCM_CODEC  (PCM_S16LE,        AV_SAMPLE_FMT_S16, pcm_s16le,        "PCM signed 16-bit little-endian");
-PCM_DECODER(PCM_S16LE_PLANAR, AV_SAMPLE_FMT_S16P, pcm_s16le_planar, "PCM 16-bit little-endian planar");
+PCM_CODEC  (PCM_S16LE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16le_planar, "PCM signed 16-bit little-endian planar");
 PCM_CODEC  (PCM_S24BE,        AV_SAMPLE_FMT_S32, pcm_s24be,        "PCM signed 24-bit big-endian");
 PCM_CODEC  (PCM_S24DAUD,      AV_SAMPLE_FMT_S16, pcm_s24daud,      "PCM D-Cinema audio signed 24-bit");
 PCM_CODEC  (PCM_S24LE,        AV_SAMPLE_FMT_S32, pcm_s24le,        "PCM signed 24-bit little-endian");
-PCM_DECODER(PCM_S24LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s24le_planar, "PCM signed 24-bit little-endian planar");
+PCM_CODEC  (PCM_S24LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s24le_planar, "PCM signed 24-bit little-endian planar");
 PCM_CODEC  (PCM_S32BE,        AV_SAMPLE_FMT_S32, pcm_s32be,        "PCM signed 32-bit big-endian");
 PCM_CODEC  (PCM_S32LE,        AV_SAMPLE_FMT_S32, pcm_s32le,        "PCM signed 32-bit little-endian");
-PCM_DECODER(PCM_S32LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s32le_planar, "PCM signed 32-bit little-endian planar");
+PCM_CODEC  (PCM_S32LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s32le_planar, "PCM signed 32-bit little-endian planar");
 PCM_CODEC  (PCM_U8,           AV_SAMPLE_FMT_U8,  pcm_u8,           "PCM unsigned 8-bit");
 PCM_CODEC  (PCM_U16BE,        AV_SAMPLE_FMT_S16, pcm_u16be,        "PCM unsigned 16-bit big-endian");
 PCM_CODEC  (PCM_U16LE,        AV_SAMPLE_FMT_S16, pcm_u16le,        "PCM unsigned 16-bit little-endian");
@@ -516,3 +624,6 @@ PCM_CODEC  (PCM_U24LE,        AV_SAMPLE_FMT_S32, pcm_u24le,        "PCM unsigned
 PCM_CODEC  (PCM_U32BE,        AV_SAMPLE_FMT_S32, pcm_u32be,        "PCM unsigned 32-bit big-endian");
 PCM_CODEC  (PCM_U32LE,        AV_SAMPLE_FMT_S32, pcm_u32le,        "PCM unsigned 32-bit little-endian");
 PCM_DECODER(PCM_ZORK,         AV_SAMPLE_FMT_U8,  pcm_zork,         "PCM Zork");
+PCM_CODEC  (PCM_S64BE,        AV_SAMPLE_FMT_S64, pcm_s64be,        "PCM signed 64-bit big-endian");
+PCM_CODEC  (PCM_S64LE,        AV_SAMPLE_FMT_S64, pcm_s64le,        "PCM signed 64-bit little-endian");
+PCM_CODEC  (PCM_VIDC,         AV_SAMPLE_FMT_S16, pcm_vidc,         "PCM Archimedes VIDC");
diff --git a/libavcodec/pcm_tablegen.c b/libavcodec/pcm_tablegen.c
index 7b4bc8c..473a47f 100644
--- a/libavcodec/pcm_tablegen.c
+++ b/libavcodec/pcm_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,13 @@ int main(void)
 {
     pcm_alaw_tableinit();
     pcm_ulaw_tableinit();
+    pcm_vidc_tableinit();
 
     write_fileheader();
 
     WRITE_ARRAY("static const", uint8_t, linear_to_alaw);
     WRITE_ARRAY("static const", uint8_t, linear_to_ulaw);
+    WRITE_ARRAY("static const", uint8_t, linear_to_vidc);
 
     return 0;
 }
diff --git a/libavcodec/pcm_tablegen.h b/libavcodec/pcm_tablegen.h
index 438c2b9..d8763ab 100644
--- a/libavcodec/pcm_tablegen.h
+++ b/libavcodec/pcm_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,12 @@
 
 #define         BIAS            (0x84)      /* Bias for linear code. */
 
+#define         VIDC_SIGN_BIT    (1)
+#define         VIDC_QUANT_MASK  (0x1E)
+#define         VIDC_QUANT_SHIFT (1)
+#define         VIDC_SEG_SHIFT   (5)
+#define         VIDC_SEG_MASK    (0xE0)
+
 /* alaw2linear() - Convert an A-law value to 16-bit linear PCM */
 static av_cold int alaw2linear(unsigned char a_val)
 {
@@ -69,14 +75,30 @@ static av_cold int ulaw2linear(unsigned char u_val)
         return (u_val & SIGN_BIT) ? (BIAS - t) : (t - BIAS);
 }
 
+static av_cold int vidc2linear(unsigned char u_val)
+{
+        int t;
+
+        /*
+         * Extract and bias the quantization bits. Then
+         * shift up by the segment number and subtract out the bias.
+         */
+        t = (((u_val & VIDC_QUANT_MASK) >> VIDC_QUANT_SHIFT) << 3) + BIAS;
+        t <<= ((unsigned)u_val & VIDC_SEG_MASK) >> VIDC_SEG_SHIFT;
+
+        return (u_val & VIDC_SIGN_BIT) ? (BIAS - t) : (t - BIAS);
+}
+
 #if CONFIG_HARDCODED_TABLES
 #define pcm_alaw_tableinit()
 #define pcm_ulaw_tableinit()
+#define pcm_vidc_tableinit()
 #include "libavcodec/pcm_tables.h"
 #else
 /* 16384 entries per table */
 static uint8_t linear_to_alaw[16384];
 static uint8_t linear_to_ulaw[16384];
+static uint8_t linear_to_vidc[16384];
 
 static av_cold void build_xlaw_table(uint8_t *linear_to_xlaw,
                              int (*xlaw2linear)(unsigned char),
@@ -84,21 +106,21 @@ static av_cold void build_xlaw_table(uint8_t *linear_to_xlaw,
 {
     int i, j, v, v1, v2;
 
-    j = 0;
-    for(i=0;i<128;i++) {
-        if (i != 127) {
-            v1 = xlaw2linear(i ^ mask);
-            v2 = xlaw2linear((i + 1) ^ mask);
-            v = (v1 + v2 + 4) >> 3;
-        } else {
-            v = 8192;
-        }
-        for(;j<v;j++) {
+    j = 1;
+    linear_to_xlaw[8192] = mask;
+    for(i=0;i<127;i++) {
+        v1 = xlaw2linear(i ^ mask);
+        v2 = xlaw2linear((i + 1) ^ mask);
+        v = (v1 + v2 + 4) >> 3;
+        for(;j<v;j+=1) {
+            linear_to_xlaw[8192 - j] = (i ^ (mask ^ 0x80));
             linear_to_xlaw[8192 + j] = (i ^ mask);
-            if (j > 0)
-                linear_to_xlaw[8192 - j] = (i ^ (mask ^ 0x80));
         }
     }
+    for(;j<8192;j++) {
+        linear_to_xlaw[8192 - j] = (127 ^ (mask ^ 0x80));
+        linear_to_xlaw[8192 + j] = (127 ^ mask);
+    }
     linear_to_xlaw[0] = linear_to_xlaw[1];
 }
 
@@ -111,6 +133,11 @@ static void pcm_ulaw_tableinit(void)
 {
     build_xlaw_table(linear_to_ulaw, ulaw2linear, 0xff);
 }
+
+static void pcm_vidc_tableinit(void)
+{
+    build_xlaw_table(linear_to_vidc, vidc2linear, 0xff);
+}
 #endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_PCM_TABLEGEN_H */
diff --git a/libavcodec/pcx.c b/libavcodec/pcx.c
index f4a6a91..4505db7 100644
--- a/libavcodec/pcx.c
+++ b/libavcodec/pcx.c
@@ -5,36 +5,32 @@
  * This decoder does not support CGA palettes. I am unable to find samples
  * and Netpbm cannot generate them.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #define PCX_HEADER_SIZE 128
 
-/**
- * @return advanced src pointer
- */
-static void pcx_rle_decode(GetByteContext *gb,
+static int pcx_rle_decode(GetByteContext *gb,
                            uint8_t *dst,
                            unsigned int bytes_per_scanline,
                            int compressed)
@@ -42,11 +38,14 @@ static void pcx_rle_decode(GetByteContext *gb,
     unsigned int i = 0;
     unsigned char run, value;
 
+    if (bytestream2_get_bytes_left(gb) < 1)
+        return AVERROR_INVALIDDATA;
+
     if (compressed) {
-        while (i < bytes_per_scanline && bytestream2_get_bytes_left(gb)) {
+        while (i < bytes_per_scanline && bytestream2_get_bytes_left(gb)>0) {
             run   = 1;
             value = bytestream2_get_byte(gb);
-            if (value >= 0xc0 && bytestream2_get_bytes_left(gb)) {
+            if (value >= 0xc0 && bytestream2_get_bytes_left(gb)>0) {
                 run   = value & 0x3f;
                 value = bytestream2_get_byte(gb);
             }
@@ -56,15 +55,16 @@ static void pcx_rle_decode(GetByteContext *gb,
     } else {
         bytestream2_get_buffer(gb, dst, bytes_per_scanline);
     }
+    return 0;
 }
 
-static void pcx_palette(GetByteContext *gb, uint32_t *dst,
-                        unsigned int pallen)
+static void pcx_palette(GetByteContext *gb, uint32_t *dst, int pallen)
 {
-    unsigned int i;
+    int i;
 
+    pallen = FFMIN(pallen, bytestream2_get_bytes_left(gb) / 3);
     for (i = 0; i < pallen; i++)
-        *dst++ = bytestream2_get_be24(gb);
+        *dst++ = 0xFF000000 | bytestream2_get_be24u(gb);
     if (pallen < 256)
         memset(dst, 0, (256 - pallen) * sizeof(*dst));
 }
@@ -72,32 +72,34 @@ static void pcx_palette(GetByteContext *gb, uint32_t *dst,
 static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-    AVFrame *const p   = data;
     GetByteContext gb;
+    AVFrame * const p  = data;
     int compressed, xmin, ymin, xmax, ymax;
+    int ret;
     unsigned int w, h, bits_per_pixel, bytes_per_line, nplanes, stride, y, x,
                  bytes_per_scanline;
-    uint8_t *ptr;
-    uint8_t *scanline;
-    int ret = -1;
+    uint8_t *ptr, *scanline;
 
-    if (buf_size < PCX_HEADER_SIZE) {
+    if (avpkt->size < PCX_HEADER_SIZE) {
         av_log(avctx, AV_LOG_ERROR, "Packet too small\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (buf[0] != 0x0a || buf[1] > 5) {
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
+
+    if (bytestream2_get_byteu(&gb) != 0x0a || bytestream2_get_byteu(&gb) > 5) {
         av_log(avctx, AV_LOG_ERROR, "this is not PCX encoded data\n");
         return AVERROR_INVALIDDATA;
     }
 
-    compressed = buf[2];
-    xmin       = AV_RL16(buf + 4);
-    ymin       = AV_RL16(buf + 6);
-    xmax       = AV_RL16(buf + 8);
-    ymax       = AV_RL16(buf + 10);
+    compressed                     = bytestream2_get_byteu(&gb);
+    bits_per_pixel                 = bytestream2_get_byteu(&gb);
+    xmin                           = bytestream2_get_le16u(&gb);
+    ymin                           = bytestream2_get_le16u(&gb);
+    xmax                           = bytestream2_get_le16u(&gb);
+    ymax                           = bytestream2_get_le16u(&gb);
+    avctx->sample_aspect_ratio.num = bytestream2_get_le16u(&gb);
+    avctx->sample_aspect_ratio.den = bytestream2_get_le16u(&gb);
 
     if (xmax < xmin || ymax < ymin) {
         av_log(avctx, AV_LOG_ERROR, "invalid image dimensions\n");
@@ -107,13 +109,13 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     w = xmax - xmin + 1;
     h = ymax - ymin + 1;
 
-    bits_per_pixel     = buf[3];
-    bytes_per_line     = AV_RL16(buf + 66);
-    nplanes            = buf[65];
+    bytestream2_skipu(&gb, 49);
+    nplanes            = bytestream2_get_byteu(&gb);
+    bytes_per_line     = bytestream2_get_le16u(&gb);
     bytes_per_scanline = nplanes * bytes_per_line;
 
     if (bytes_per_scanline < (w * bits_per_pixel * nplanes + 7) / 8 ||
-        (!compressed && bytes_per_scanline > buf_size / h)) {
+        (!compressed && bytes_per_scanline > bytestream2_get_bytes_left(&gb) / h)) {
         av_log(avctx, AV_LOG_ERROR, "PCX data is corrupted\n");
         return AVERROR_INVALIDDATA;
     }
@@ -136,15 +138,13 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    bytestream2_init(&gb, buf + PCX_HEADER_SIZE, buf_size - PCX_HEADER_SIZE);
+    bytestream2_skipu(&gb, 60);
 
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -157,7 +157,9 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     if (nplanes == 3 && bits_per_pixel == 8) {
         for (y = 0; y < h; y++) {
-            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            ret = pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            if (ret < 0)
+                goto end;
 
             for (x = 0; x < w; x++) {
                 ptr[3 * x]     = scanline[x];
@@ -168,34 +170,53 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ptr += stride;
         }
     } else if (nplanes == 1 && bits_per_pixel == 8) {
+        int palstart = avpkt->size - 769;
+
+        if (avpkt->size < 769) {
+            av_log(avctx, AV_LOG_ERROR, "File is too short\n");
+            ret = avctx->err_recognition & AV_EF_EXPLODE ?
+                  AVERROR_INVALIDDATA : avpkt->size;
+            goto end;
+        }
+
         for (y = 0; y < h; y++, ptr += stride) {
-            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            ret = pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            if (ret < 0)
+                goto end;
             memcpy(ptr, scanline, w);
         }
 
+        if (bytestream2_tell(&gb) != palstart) {
+            av_log(avctx, AV_LOG_WARNING, "image data possibly corrupted\n");
+            bytestream2_seek(&gb, palstart, SEEK_SET);
+        }
         if (bytestream2_get_byte(&gb) != 12) {
             av_log(avctx, AV_LOG_ERROR, "expected palette after image data\n");
             ret = avctx->err_recognition & AV_EF_EXPLODE ?
-                  AVERROR_INVALIDDATA : buf_size;
+                  AVERROR_INVALIDDATA : avpkt->size;
             goto end;
         }
     } else if (nplanes == 1) {   /* all packed formats, max. 16 colors */
-        BitstreamContext s;
+        GetBitContext s;
 
         for (y = 0; y < h; y++) {
-            bitstream_init8(&s, scanline, bytes_per_scanline);
+            init_get_bits8(&s, scanline, bytes_per_scanline);
 
-            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            ret = pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            if (ret < 0)
+                goto end;
 
             for (x = 0; x < w; x++)
-                ptr[x] = bitstream_read(&s, bits_per_pixel);
+                ptr[x] = get_bits(&s, bits_per_pixel);
             ptr += stride;
         }
     } else {    /* planar, 4, 8 or 16 colors */
         int i;
 
         for (y = 0; y < h; y++) {
-            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            ret = pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
+            if (ret < 0)
+                goto end;
 
             for (x = 0; x < w; x++) {
                 int m = 0x80 >> (x & 7), v = 0;
@@ -209,23 +230,20 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
+    ret = bytestream2_tell(&gb);
     if (nplanes == 1 && bits_per_pixel == 8) {
-        if (bytestream2_get_bytes_left(&gb) < 768) {
-            av_log(avctx, AV_LOG_ERROR, "Palette truncated\n");
-            ret = AVERROR_INVALIDDATA;
-            goto end;
-        }
-
         pcx_palette(&gb, (uint32_t *)p->data[1], 256);
+        ret += 256 * 3;
+    } else if (bits_per_pixel * nplanes == 1) {
+        AV_WN32A(p->data[1]  , 0xFF000000);
+        AV_WN32A(p->data[1]+4, 0xFFFFFFFF);
     } else if (bits_per_pixel < 8) {
-        GetByteContext gb1;
-        bytestream2_init(&gb1, avpkt->data + 16, 48);
-        pcx_palette(&gb1, (uint32_t *)p->data[1], 16);
+        bytestream2_seek(&gb, 16, SEEK_SET);
+        pcx_palette(&gb, (uint32_t *)p->data[1], 16);
     }
 
     *got_frame = 1;
 
-    ret = bytestream2_tell(&gb);
 end:
     av_free(scanline);
     return ret;
diff --git a/libavcodec/pcxenc.c b/libavcodec/pcxenc.c
index 7fc0d9c..6135944 100644
--- a/libavcodec/pcxenc.c
+++ b/libavcodec/pcxenc.c
@@ -2,20 +2,20 @@
  * PC Paintbrush PCX (.pcx) image encoder
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,11 +23,12 @@
  * @file
  * PCX image encoder
  * @author Daniel Verkamp
- * @see http://www.qzx.com/pc-gpe/pcx.txt
+ * @see http://bespin.org/~qz/pc-gpe/pcx.txt
  */
 
 #include "avcodec.h"
 #include "bytestream.h"
+#include "libavutil/imgutils.h"
 #include "internal.h"
 
 static const uint32_t monoblack_pal[16] = { 0x000000, 0xFFFFFF };
@@ -100,8 +101,9 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const uint8_t *buf_end;
     uint8_t *buf;
 
-    int bpp, nplanes, i, y, line_bytes, written, ret, max_pkt_size;
+    int bpp, nplanes, i, y, line_bytes, written, ret, max_pkt_size, sw, sh;
     const uint32_t *pal = NULL;
+    uint32_t palette256[256];
     const uint8_t *src;
 
     if (avctx->width > 65535 || avctx->height > 65535) {
@@ -119,6 +121,11 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_RGB4_BYTE:
     case AV_PIX_FMT_BGR4_BYTE:
     case AV_PIX_FMT_GRAY8:
+        bpp = 8;
+        nplanes = 1;
+        avpriv_set_systematic_pal2(palette256, avctx->pix_fmt);
+        pal = palette256;
+        break;
     case AV_PIX_FMT_PAL8:
         bpp = 8;
         nplanes = 1;
@@ -138,13 +145,16 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     line_bytes = (line_bytes + 1) & ~1;
 
     max_pkt_size = 128 + avctx->height * 2 * line_bytes * nplanes + (pal ? 256*3 + 1 : 0);
-    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_pkt_size, 0)) < 0)
         return ret;
-    }
     buf     = pkt->data;
     buf_end = pkt->data + pkt->size;
 
+    sw = avctx->sample_aspect_ratio.num;
+    sh = avctx->sample_aspect_ratio.den;
+    if (sw > 0xFFFFu || sh > 0xFFFFu)
+        av_reduce(&sw, &sh, sw, sh, 0xFFFFu);
+
     bytestream_put_byte(&buf, 10);                  // manufacturer
     bytestream_put_byte(&buf, 5);                   // version
     bytestream_put_byte(&buf, 1);                   // encoding
@@ -153,8 +163,8 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     bytestream_put_le16(&buf, 0);                   // y min
     bytestream_put_le16(&buf, avctx->width - 1);    // x max
     bytestream_put_le16(&buf, avctx->height - 1);   // y max
-    bytestream_put_le16(&buf, 0);                   // horizontal DPI
-    bytestream_put_le16(&buf, 0);                   // vertical DPI
+    bytestream_put_le16(&buf, sw);                  // horizontal DPI
+    bytestream_put_le16(&buf, sh);                  // vertical DPI
     for (i = 0; i < 16; i++)
         bytestream_put_be24(&buf, pal ? pal[i] : 0);// palette (<= 16 color only)
     bytestream_put_byte(&buf, 0);                   // reserved
diff --git a/libavcodec/pel_template.c b/libavcodec/pel_template.c
index b832ae7..6da7a56 100644
--- a/libavcodec/pel_template.c
+++ b/libavcodec/pel_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pgssubdec.c b/libavcodec/pgssubdec.c
index a6a43ae..8c10f6d 100644
--- a/libavcodec/pgssubdec.c
+++ b/libavcodec/pgssubdec.c
@@ -2,20 +2,20 @@
  * PGS subtitle decoder
  * Copyright (c) 2009 Stephen Backway
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,8 +31,9 @@
 
 #include "libavutil/colorspace.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
-#define RGBA(r,g,b,a) (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
+#define RGBA(r,g,b,a) (((unsigned)(a) << 24) | ((r) << 16) | ((g) << 8) | (b))
 #define MAX_EPOCH_PALETTES 8   // Max 8 allowed per PGS epoch
 #define MAX_EPOCH_OBJECTS  64  // Max 64 allowed per PGS epoch
 #define MAX_OBJECT_REFS    2   // Max objects per display set
@@ -90,9 +91,11 @@ typedef struct PGSSubPalettes {
 } PGSSubPalettes;
 
 typedef struct PGSSubContext {
+    AVClass *class;
     PGSSubPresentation presentation;
     PGSSubPalettes     palettes;
     PGSSubObjects      objects;
+    int forced_subs_only;
 } PGSSubContext;
 
 static void flush_cache(AVCodecContext *avctx)
@@ -133,7 +136,7 @@ static PGSSubPalette * find_palette(int id, PGSSubPalettes *palettes)
 
 static av_cold int init_decoder(AVCodecContext *avctx)
 {
-    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    avctx->pix_fmt     = AV_PIX_FMT_PAL8;
 
     return 0;
 }
@@ -148,7 +151,7 @@ static av_cold int close_decoder(AVCodecContext *avctx)
 /**
  * Decode the RLE data.
  *
- * The subtitle is stored as an Run Length Encoded image.
+ * The subtitle is stored as a Run Length Encoded image.
  *
  * @param avctx contains the current codec context
  * @param sub pointer to the processed subtitle data
@@ -163,7 +166,7 @@ static int decode_rle(AVCodecContext *avctx, AVSubtitleRect *rect,
 
     rle_bitmap_end = buf + buf_size;
 
-    rect->data[0] = av_malloc(rect->w * rect->h);
+    rect->data[0] = av_malloc_array(rect->w, rect->h);
 
     if (!rect->data[0])
         return AVERROR(ENOMEM);
@@ -287,18 +290,18 @@ static int parse_object_segment(AVCodecContext *avctx,
     height = bytestream_get_be16(&buf);
 
     /* Make sure the bitmap is not too large */
-    if (avctx->width < width || avctx->height < height) {
-        av_log(avctx, AV_LOG_ERROR, "Bitmap dimensions larger than video.\n");
+    if (avctx->width < width || avctx->height < height || !width || !height) {
+        av_log(avctx, AV_LOG_ERROR, "Bitmap dimensions (%dx%d) invalid.\n", width, height);
         return AVERROR_INVALIDDATA;
     }
 
     object->w = width;
     object->h = height;
 
-    av_fast_malloc(&object->rle, &object->rle_buffer_size, rle_bitmap_len);
+    av_fast_padded_malloc(&object->rle, &object->rle_buffer_size, rle_bitmap_len);
 
     if (!object->rle) {
-        object->rle_data_len      = 0;
+        object->rle_data_len = 0;
         object->rle_remaining_len = 0;
         return AVERROR(ENOMEM);
     }
@@ -387,8 +390,8 @@ static int parse_presentation_segment(AVCodecContext *avctx,
                                       int64_t pts)
 {
     PGSSubContext *ctx = avctx->priv_data;
-
     int i, state, ret;
+    const uint8_t *buf_end = buf + buf_size;
 
     // Video descriptor
     int w = bytestream_get_be16(&buf);
@@ -437,8 +440,16 @@ static int parse_presentation_segment(AVCodecContext *avctx,
         }
     }
 
+
     for (i = 0; i < ctx->presentation.object_count; i++)
     {
+
+        if (buf_end - buf < 8) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficent space for object\n");
+            ctx->presentation.object_count = i;
+            return AVERROR_INVALIDDATA;
+        }
+
         ctx->presentation.objects[i].id = bytestream_get_be16(&buf);
         ctx->presentation.objects[i].window_id = bytestream_get_byte(&buf);
         ctx->presentation.objects[i].composition_flag = bytestream_get_byte(&buf);
@@ -489,11 +500,14 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
 {
     AVSubtitle    *sub = data;
     PGSSubContext *ctx = avctx->priv_data;
+    int64_t pts;
     PGSSubPalette *palette;
     int i, ret;
 
+    pts = ctx->presentation.pts != AV_NOPTS_VALUE ? ctx->presentation.pts : sub->pts;
     memset(sub, 0, sizeof(*sub));
-    sub->pts = ctx->presentation.pts;
+    sub->pts = pts;
+    ctx->presentation.pts = AV_NOPTS_VALUE;
     sub->start_display_time = 0;
     // There is no explicit end time for PGS subtitles.  The end time
     // is defined by the start of the next sub which may contain no
@@ -504,7 +518,7 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
     // Blank if last object_count was 0.
     if (!ctx->presentation.object_count)
         return 1;
-    sub->rects = av_mallocz(sizeof(*sub->rects) * ctx->presentation.object_count);
+    sub->rects = av_mallocz_array(ctx->presentation.object_count, sizeof(*sub->rects));
     if (!sub->rects) {
         return AVERROR(ENOMEM);
     }
@@ -545,12 +559,13 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
 
         sub->rects[i]->x    = ctx->presentation.objects[i].x;
         sub->rects[i]->y    = ctx->presentation.objects[i].y;
-        sub->rects[i]->w    = object->w;
-        sub->rects[i]->h    = object->h;
-
-        sub->rects[i]->linesize[0] = object->w;
 
         if (object->rle) {
+            sub->rects[i]->w    = object->w;
+            sub->rects[i]->h    = object->h;
+
+            sub->rects[i]->linesize[0] = object->w;
+
             if (object->rle_remaining_len) {
                 av_log(avctx, AV_LOG_ERROR, "RLE data length %u is %u bytes shorter than expected\n",
                        object->rle_data_len, object->rle_remaining_len);
@@ -579,6 +594,9 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
             return AVERROR(ENOMEM);
         }
 
+        if (!ctx->forced_subs_only || ctx->presentation.objects[i].composition_flag & 0x40)
+        memcpy(sub->rects[i]->data[1], palette->clut, sub->rects[i]->nb_colors * sizeof(uint32_t));
+
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
 {
@@ -592,9 +610,6 @@ FF_DISABLE_DEPRECATION_WARNINGS
 }
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-
-        memcpy(sub->rects[i]->data[1], palette->clut, sub->rects[i]->nb_colors * sizeof(uint32_t));
-
     }
     return 1;
 }
@@ -648,7 +663,7 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
             ret = parse_object_segment(avctx, buf, segment_length);
             break;
         case PRESENTATION_SEGMENT:
-            ret = parse_presentation_segment(avctx, buf, segment_length, avpkt->pts);
+            ret = parse_presentation_segment(avctx, buf, segment_length, ((AVSubtitle*)(data))->pts);
             break;
         case WINDOW_SEGMENT:
             /*
@@ -661,6 +676,11 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
              */
             break;
         case DISPLAY_SEGMENT:
+            if (*data_size) {
+                av_log(avctx, AV_LOG_ERROR, "Duplicate display segment\n");
+                ret = AVERROR_INVALIDDATA;
+                break;
+            }
             ret = display_end_segment(avctx, data, buf, segment_length);
             if (ret >= 0)
                 *data_size = ret;
@@ -680,6 +700,20 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
     return buf_size;
 }
 
+#define OFFSET(x) offsetof(PGSSubContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    {"forced_subs_only", "Only show forced subtitles", OFFSET(forced_subs_only), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SD},
+    { NULL },
+};
+
+static const AVClass pgsdec_class = {
+    .class_name = "PGS subtitle decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_pgssub_decoder = {
     .name           = "pgssub",
     .long_name      = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"),
@@ -689,4 +723,5 @@ AVCodec ff_pgssub_decoder = {
     .init           = init_decoder,
     .close          = close_decoder,
     .decode         = decode,
+    .priv_class     = &pgsdec_class,
 };
diff --git a/libavcodec/pictordec.c b/libavcodec/pictordec.c
index 49547cf..65d2d49 100644
--- a/libavcodec/pictordec.c
+++ b/libavcodec/pictordec.c
@@ -2,20 +2,20 @@
  * Pictor/PC Paint decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -57,12 +57,12 @@ static void picmemset_8bpp(PicContext *s, AVFrame *frame, int value, int run,
     }
 }
 
-static void picmemset(PicContext *s, AVFrame *frame, int value, int run,
+static void picmemset(PicContext *s, AVFrame *frame, unsigned value, int run,
                       int *x, int *y, int *plane, int bits_per_plane)
 {
     uint8_t *d;
     int shift = *plane * bits_per_plane;
-    int mask  = ((1 << bits_per_plane) - 1) << shift;
+    unsigned mask  = ((1U << bits_per_plane) - 1) << shift;
     value   <<= shift;
 
     while (run > 0) {
@@ -77,10 +77,10 @@ static void picmemset(PicContext *s, AVFrame *frame, int value, int run,
                 if (*y < 0) {
                    *y = s->height - 1;
                    *plane += 1;
+                   if (*plane >= s->nb_planes)
+                       return;
                    value <<= bits_per_plane;
                    mask  <<= bits_per_plane;
-                   if (*plane >= s->nb_planes)
-                       break;
                 }
             }
         }
@@ -105,7 +105,7 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     uint32_t *palette;
     int bits_per_plane, bpp, etype, esize, npal, pos_after_pal;
-    int i, x, y, plane, tmp, ret;
+    int i, x, y, plane, tmp, ret, val;
 
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
 
@@ -127,7 +127,7 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_PATCHWELCOME;
     }
 
-    if (bytestream2_peek_byte(&s->g) == 0xFF) {
+    if (bytestream2_peek_byte(&s->g) == 0xFF || bpp == 1 || bpp == 4 || bpp == 8) {
         bytestream2_skip(&s->g, 2);
         etype = bytestream2_get_le16(&s->g);
         esize = bytestream2_get_le16(&s->g);
@@ -140,16 +140,16 @@ static int decode_frame(AVCodecContext *avctx,
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    if (av_image_check_size(s->width, s->height, 0, avctx) < 0)
+        return -1;
     if (s->width != avctx->width || s->height != avctx->height) {
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     memset(frame->data[0], 0, s->height * frame->linesize[0]);
     frame->pict_type           = AV_PICTURE_TYPE_I;
     frame->palette_has_changed = 1;
@@ -165,7 +165,7 @@ static int decode_frame(AVCodecContext *avctx,
         npal = FFMIN(esize, 16);
         for (i = 0; i < npal; i++) {
             int pal_idx = bytestream2_get_byte(&s->g);
-            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 16)];
+            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 15)];
         }
     } else if (etype == 3) {
         npal = FFMIN(esize, 16);
@@ -175,13 +175,15 @@ static int decode_frame(AVCodecContext *avctx,
         }
     } else if (etype == 4 || etype == 5) {
         npal = FFMIN(esize / 3, 256);
-        for (i = 0; i < npal; i++)
+        for (i = 0; i < npal; i++) {
             palette[i] = bytestream2_get_be24(&s->g) << 2;
+            palette[i] |= 0xFFU << 24 | palette[i] >> 6 & 0x30303;
+        }
     } else {
         if (bpp == 1) {
             npal = 2;
-            palette[0] = 0x000000;
-            palette[1] = 0xFFFFFF;
+            palette[0] = 0xFF000000;
+            palette[1] = 0xFFFFFFFF;
         } else if (bpp == 2) {
             npal = 4;
             for (i = 0; i < npal; i++)
@@ -196,10 +198,11 @@ static int decode_frame(AVCodecContext *avctx,
     // skip remaining palette bytes
     bytestream2_seek(&s->g, pos_after_pal, SEEK_SET);
 
-    x = 0;
+    val = 0;
     y = s->height - 1;
-    plane = 0;
     if (bytestream2_get_le16(&s->g)) {
+        x = 0;
+        plane = 0;
         while (bytestream2_get_bytes_left(&s->g) >= 6) {
             int stop_size, marker, t1, t2;
 
@@ -213,7 +216,7 @@ static int decode_frame(AVCodecContext *avctx,
             while (plane < s->nb_planes &&
                    bytestream2_get_bytes_left(&s->g) > stop_size) {
                 int run = 1;
-                int val = bytestream2_get_byte(&s->g);
+                val = bytestream2_get_byte(&s->g);
                 if (val == marker) {
                     run = bytestream2_get_byte(&s->g);
                     if (run == 0)
@@ -232,9 +235,23 @@ static int decode_frame(AVCodecContext *avctx,
                 }
             }
         }
+
+        if (s->nb_planes - plane > 1)
+            return AVERROR_INVALIDDATA;
+
+        if (plane < s->nb_planes && x < avctx->width) {
+            int run = (y + 1) * avctx->width - x;
+            if (bits_per_plane == 8)
+                picmemset_8bpp(s, frame, val, run, &x, &y);
+            else
+                picmemset(s, frame, val, run / (8 / bits_per_plane), &x, &y, &plane, bits_per_plane);
+        }
     } else {
-        avpriv_request_sample(avctx, "Uncompressed image");
-        return avpkt->size;
+        while (y >= 0 && bytestream2_get_bytes_left(&s->g) > 0) {
+            memcpy(frame->data[0] + y * frame->linesize[0], s->g.buffer, FFMIN(avctx->width, bytestream2_get_bytes_left(&s->g)));
+            bytestream2_skip(&s->g, avctx->width);
+            y--;
+        }
     }
 finish:
 
diff --git a/libavcodec/pixblockdsp.c b/libavcodec/pixblockdsp.c
index 9d68d26..50e1d1d 100644
--- a/libavcodec/pixblockdsp.c
+++ b/libavcodec/pixblockdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,17 +20,44 @@
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "pixblockdsp.h"
 
-#define BIT_DEPTH 16
-#include "pixblockdsp_template.c"
-#undef BIT_DEPTH
+static void get_pixels_16_c(int16_t *av_restrict block, const uint8_t *pixels,
+                            ptrdiff_t stride)
+{
+    AV_COPY128U(block + 0 * 8, pixels + 0 * stride);
+    AV_COPY128U(block + 1 * 8, pixels + 1 * stride);
+    AV_COPY128U(block + 2 * 8, pixels + 2 * stride);
+    AV_COPY128U(block + 3 * 8, pixels + 3 * stride);
+    AV_COPY128U(block + 4 * 8, pixels + 4 * stride);
+    AV_COPY128U(block + 5 * 8, pixels + 5 * stride);
+    AV_COPY128U(block + 6 * 8, pixels + 6 * stride);
+    AV_COPY128U(block + 7 * 8, pixels + 7 * stride);
+}
+
+static void get_pixels_8_c(int16_t *av_restrict block, const uint8_t *pixels,
+                           ptrdiff_t stride)
+{
+    int i;
 
-#define BIT_DEPTH 8
-#include "pixblockdsp_template.c"
+    /* read the pixels */
+    for (i = 0; i < 8; i++) {
+        block[0] = pixels[0];
+        block[1] = pixels[1];
+        block[2] = pixels[2];
+        block[3] = pixels[3];
+        block[4] = pixels[4];
+        block[5] = pixels[5];
+        block[6] = pixels[6];
+        block[7] = pixels[7];
+        pixels  += stride;
+        block   += 8;
+    }
+}
 
-static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
+static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                           const uint8_t *s2, ptrdiff_t stride)
 {
     int i;
@@ -55,22 +82,31 @@ av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
+    c->diff_pixels_unaligned =
     c->diff_pixels = diff_pixels_c;
 
     switch (avctx->bits_per_raw_sample) {
     case 9:
     case 10:
+    case 12:
+    case 14:
         c->get_pixels = get_pixels_16_c;
         break;
     default:
-        c->get_pixels = get_pixels_8_c;
+        if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
+            c->get_pixels = get_pixels_8_c;
+        }
         break;
     }
 
+    if (ARCH_ALPHA)
+        ff_pixblockdsp_init_alpha(c, avctx, high_bit_depth);
     if (ARCH_ARM)
         ff_pixblockdsp_init_arm(c, avctx, high_bit_depth);
     if (ARCH_PPC)
         ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_pixblockdsp_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_pixblockdsp_init_mips(c, avctx, high_bit_depth);
 }
diff --git a/libavcodec/pixblockdsp.h b/libavcodec/pixblockdsp.h
index c7587cb..e036700 100644
--- a/libavcodec/pixblockdsp.h
+++ b/libavcodec/pixblockdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,21 +26,30 @@
 #include "avcodec.h"
 
 typedef struct PixblockDSPContext {
-    void (*get_pixels)(int16_t *restrict block /* align 16 */,
+    void (*get_pixels)(int16_t *av_restrict block /* align 16 */,
                        const uint8_t *pixels /* align 8 */,
                        ptrdiff_t stride);
-    void (*diff_pixels)(int16_t *restrict block /* align 16 */,
+    void (*diff_pixels)(int16_t *av_restrict block /* align 16 */,
                         const uint8_t *s1 /* align 8 */,
                         const uint8_t *s2 /* align 8 */,
                         ptrdiff_t stride);
+    void (*diff_pixels_unaligned)(int16_t *av_restrict block /* align 16 */,
+                        const uint8_t *s1,
+                        const uint8_t *s2,
+                        ptrdiff_t stride);
+
 } PixblockDSPContext;
 
 void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx);
+void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                               unsigned high_bit_depth);
 void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
+void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth);
 
 #endif /* AVCODEC_PIXBLOCKDSP_H */
diff --git a/libavcodec/pixblockdsp_template.c b/libavcodec/pixblockdsp_template.c
deleted file mode 100644
index 3d86e2c..0000000
--- a/libavcodec/pixblockdsp_template.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "bit_depth_template.c"
-
-static void FUNCC(get_pixels)(int16_t *restrict block, const uint8_t *_pixels,
-                              ptrdiff_t stride)
-{
-    const pixel *pixels = (const pixel *) _pixels;
-    int i;
-
-    /* read the pixels */
-    for (i = 0; i < 8; i++) {
-        block[0] = pixels[0];
-        block[1] = pixels[1];
-        block[2] = pixels[2];
-        block[3] = pixels[3];
-        block[4] = pixels[4];
-        block[5] = pixels[5];
-        block[6] = pixels[6];
-        block[7] = pixels[7];
-        pixels  += stride / sizeof(pixel);
-        block   += 8;
-    }
-}
diff --git a/libavcodec/pixels.h b/libavcodec/pixels.h
index d9d2fde..98eacd4 100644
--- a/libavcodec/pixels.h
+++ b/libavcodec/pixels.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pixlet.c b/libavcodec/pixlet.c
index 9f7d082..03a2cda 100644
--- a/libavcodec/pixlet.c
+++ b/libavcodec/pixlet.c
@@ -2,20 +2,20 @@
  * Apple Pixlet decoder
  * Copyright (c) 2016 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,8 @@
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "thread.h"
 #include "unary.h"
@@ -40,20 +40,20 @@
 #define V 1
 
 typedef struct SubBand {
-    size_t width, height;
-    size_t size;
-    size_t x, y;
+    unsigned width, height;
+    unsigned size;
+    unsigned x, y;
 } SubBand;
 
 typedef struct PixletContext {
     AVClass *class;
 
     GetByteContext gb;
-    BitstreamContext bc;
+    GetBitContext bc;
 
     int levels;
     int depth;
-    size_t w, h;
+    int w, h;
 
     int16_t *filter[2];
     int16_t *prediction;
@@ -68,14 +68,21 @@ static av_cold int pixlet_init(AVCodecContext *avctx)
     return 0;
 }
 
-static av_cold int pixlet_close(AVCodecContext *avctx)
+static void free_buffers(AVCodecContext *avctx)
 {
     PixletContext *ctx = avctx->priv_data;
 
     av_freep(&ctx->filter[0]);
     av_freep(&ctx->filter[1]);
     av_freep(&ctx->prediction);
+}
 
+static av_cold int pixlet_close(AVCodecContext *avctx)
+{
+    PixletContext *ctx = avctx->priv_data;
+    free_buffers(avctx);
+    ctx->w = 0;
+    ctx->h = 0;
     return 0;
 }
 
@@ -92,8 +99,8 @@ static int init_decoder(AVCodecContext *avctx)
 
     for (plane = 0; plane < 3; plane++) {
         unsigned shift = plane > 0;
-        size_t w       = ctx->w >> shift;
-        size_t h       = ctx->h >> shift;
+        unsigned w     = ctx->w >> shift;
+        unsigned h     = ctx->h >> shift;
 
         ctx->band[plane][0].width  =  w >> NB_LEVELS;
         ctx->band[plane][0].height =  h >> NB_LEVELS;
@@ -114,11 +121,11 @@ static int init_decoder(AVCodecContext *avctx)
     return 0;
 }
 
-static int read_low_coeffs(AVCodecContext *avctx, int16_t *dst, size_t size,
-                           size_t width, ptrdiff_t stride)
+static int read_low_coeffs(AVCodecContext *avctx, int16_t *dst, int size,
+                           int width, ptrdiff_t stride)
 {
     PixletContext *ctx = avctx->priv_data;
-    BitstreamContext *bc = &ctx->bc;
+    GetBitContext *bc = &ctx->bc;
     unsigned cnt1, nbits, k, j = 0, i = 0;
     int64_t value, state = 3;
     int rlen, escape, flag = 0;
@@ -128,14 +135,16 @@ static int read_low_coeffs(AVCodecContext *avctx, int16_t *dst, size_t size,
 
         cnt1 = get_unary(bc, 0, 8);
         if (cnt1 < 8) {
-            value = bitstream_read(bc, nbits);
+            value = show_bits(bc, nbits);
             if (value <= 1) {
-                bitstream_unget(bc, value & 1, 1);
-                value = 1;
+                skip_bits(bc, nbits - 1);
+                escape = ((1 << nbits) - 1) * cnt1;
+            } else {
+                skip_bits(bc, nbits);
+                escape = value + ((1 << nbits) - 1) * cnt1 - 1;
             }
-            escape = value + ((1 << nbits) - 1) * cnt1 - 1;
         } else {
-            escape = bitstream_read(bc, 16);
+            escape = get_bits(bc, 16);
         }
 
         value    = -((escape + flag) & 1) | 1;
@@ -148,24 +157,26 @@ static int read_low_coeffs(AVCodecContext *avctx, int16_t *dst, size_t size,
         state = 120 * (escape + flag) + state - (120 * state >> 8);
         flag  = 0;
 
-        if (state * 4 > 0xFF || i >= size)
+        if (state * 4ULL > 0xFF || i >= size)
             continue;
 
         nbits  = ((state + 8) >> 5) + (state ? ff_clz(state) : 32) - 24;
         escape = av_mod_uintp2(16383, nbits);
         cnt1   = get_unary(bc, 0, 8);
         if (cnt1 > 7) {
-            rlen = bitstream_read(bc, 16);
+            rlen = get_bits(bc, 16);
         } else {
-            value = bitstream_read(bc, nbits);
-            if (value <= 1) {
-                bitstream_unget(bc, value & 1, 1);
-                value = 1;
+            value = show_bits(bc, nbits);
+            if (value > 1) {
+                skip_bits(bc, nbits);
+                rlen = value + escape * cnt1 - 1;
+            } else {
+                skip_bits(bc, nbits - 1);
+                rlen = escape * cnt1;
             }
-            rlen = value + escape * cnt1 - 1;
         }
 
-        if (i + rlen > size)
+        if (rlen > size - i)
             return AVERROR_INVALIDDATA;
         i += rlen;
 
@@ -181,27 +192,26 @@ static int read_low_coeffs(AVCodecContext *avctx, int16_t *dst, size_t size,
         flag  = rlen < 0xFFFF ? 1 : 0;
     }
 
-    bitstream_align(bc);
-    return bitstream_tell(bc) >> 3;
+    align_get_bits(bc);
+    return get_bits_count(bc) >> 3;
 }
 
 static int read_high_coeffs(AVCodecContext *avctx, uint8_t *src, int16_t *dst,
-                            int size, int64_t c, int a, int64_t d,
+                            int size, int c, int a, int d,
                             int width, ptrdiff_t stride)
 {
     PixletContext *ctx = avctx->priv_data;
-    BitstreamContext *bc = &ctx->bc;
+    GetBitContext *bc = &ctx->bc;
     unsigned cnt1, shbits, rlen, nbits, length, i = 0, j = 0, k;
-    int ret, escape, pfx, cthulu, yflag, xflag, flag = 0;
-    int64_t state = 3, value, tmp;
+    int ret, escape, pfx, value, yflag, xflag, flag = 0;
+    int64_t state = 3, tmp;
 
-    ret = bitstream_init8(bc, src, bytestream2_get_bytes_left(&ctx->gb));
+    ret = init_get_bits8(bc, src, bytestream2_get_bytes_left(&ctx->gb));
     if (ret < 0)
         return ret;
 
-    cthulu = (a >= 0) + (a ^ (a >> 31)) - (a >> 31);
-    if (cthulu != 1) {
-        nbits = 33 - ff_clz(cthulu - 1);
+    if (a ^ (a >> 31)) {
+        nbits = 33 - ff_clz(a ^ (a >> 31));
         if (nbits > 16)
             return AVERROR_INVALIDDATA;
     } else {
@@ -218,17 +228,19 @@ static int read_high_coeffs(AVCodecContext *avctx, uint8_t *src, int16_t *dst,
 
         cnt1 = get_unary(bc, 0, length);
         if (cnt1 >= length) {
-            cnt1 = bitstream_read(bc, nbits);
+            cnt1 = get_bits(bc, nbits);
         } else {
-            pfx    = 14 + (((value - 14) >> 32) & (value - 14));
-            cnt1  *= (1 << pfx) - 1;
-
-            shbits = bitstream_read(bc, pfx);
+            pfx = 14 + ((((uint64_t)(value - 14)) >> 32) & (value - 14));
+            if (pfx < 1 || pfx > 25)
+                return AVERROR_INVALIDDATA;
+            cnt1 *= (1 << pfx) - 1;
+            shbits = show_bits(bc, pfx);
             if (shbits <= 1) {
-                bitstream_unget(bc, shbits & 1, 1);
-                shbits = 1;
+                skip_bits(bc, pfx - 1);
+            } else {
+                skip_bits(bc, pfx);
+                cnt1 += shbits - 1;
             }
-            cnt1 += shbits - 1;
         }
 
         xflag = flag + cnt1;
@@ -238,7 +250,7 @@ static int read_high_coeffs(AVCodecContext *avctx, uint8_t *src, int16_t *dst,
             value = 0;
         } else {
             xflag &= 1u;
-            tmp    = c * ((yflag + 1) >> 1) + (c >> 1);
+            tmp    = (int64_t)c * ((yflag + 1) >> 1) + (c >> 1);
             value  = xflag + (tmp ^ -xflag);
         }
 
@@ -248,11 +260,11 @@ static int read_high_coeffs(AVCodecContext *avctx, uint8_t *src, int16_t *dst,
             j    = 0;
             dst += stride;
         }
-        state += d * yflag - (d * state >> 8);
+        state += (int64_t)d * (uint64_t)yflag - ((int64_t)(d * (uint64_t)state) >> 8);
 
         flag = 0;
 
-        if (state * 4 > 0xFF || i >= size)
+        if ((uint64_t)state > 0xFF / 4 || i >= size)
             continue;
 
         pfx    = ((state + 8) >> 5) + (state ? ff_clz(state) : 32) - 24;
@@ -262,17 +274,19 @@ static int read_high_coeffs(AVCodecContext *avctx, uint8_t *src, int16_t *dst,
             if (pfx < 1 || pfx > 25)
                 return AVERROR_INVALIDDATA;
 
-            value = bitstream_read(bc, pfx);
-            if (value <= 1) {
-                bitstream_unget(bc, value & 1, 1);
-                value = 1;
+            value = show_bits(bc, pfx);
+            if (value > 1) {
+                skip_bits(bc, pfx);
+                rlen = value + escape * cnt1 - 1;
+            } else {
+                skip_bits(bc, pfx - 1);
+                rlen = escape * cnt1;
             }
-            rlen = value + escape * cnt1 - 1;
         } else {
-            if (bitstream_read_bit(bc))
-                value = bitstream_read(bc, 16);
+            if (get_bits1(bc))
+                value = get_bits(bc, 16);
             else
-                value = bitstream_read(bc, 8);
+                value = get_bits(bc, 8);
 
             rlen = value + 8 * escape;
         }
@@ -293,8 +307,8 @@ static int read_high_coeffs(AVCodecContext *avctx, uint8_t *src, int16_t *dst,
         flag  = rlen < 0xFFFF ? 1 : 0;
     }
 
-    bitstream_align(bc);
-    return bitstream_tell(bc) >> 3;
+    align_get_bits(bc);
+    return get_bits_count(bc) >> 3;
 }
 
 static int read_highpass(AVCodecContext *avctx, uint8_t *ptr,
@@ -312,18 +326,21 @@ static int read_highpass(AVCodecContext *avctx, uint8_t *ptr,
         int16_t *dest = (int16_t *)frame->data[plane] +
                         ctx->band[plane][i + 1].x +
                         ctx->band[plane][i + 1].y * stride;
-        size_t size = ctx->band[plane][i + 1].size;
+        unsigned size = ctx->band[plane][i + 1].size;
         uint32_t magic = bytestream2_get_be32(&ctx->gb);
 
         if (magic != PIXLET_MAGIC) {
             av_log(avctx, AV_LOG_ERROR,
-                   "wrong magic number: 0x%"PRIX32" for plane %d, band %d\n",
+                   "wrong magic number: 0x%08"PRIX32" for plane %d, band %d\n",
                    magic, plane, i);
             return AVERROR_INVALIDDATA;
         }
 
-        ret = read_high_coeffs(avctx, ptr + bytestream2_tell(&ctx->gb), dest,
-                               size, c, (b >= FFABS(a)) ? b : a, d,
+        if (a == INT32_MIN)
+            return AVERROR_INVALIDDATA;
+
+        ret = read_high_coeffs(avctx, ptr + bytestream2_tell(&ctx->gb), dest, size,
+                               c, (b >= FFABS(a)) ? b : a, d,
                                ctx->band[plane][i + 1].width, stride);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
@@ -337,32 +354,27 @@ static int read_highpass(AVCodecContext *avctx, uint8_t *ptr,
     return 0;
 }
 
-static void line_add_sat_s16(int16_t *dst, const int16_t *src, size_t len)
-{
-    int i;
-    for (i = 0; i < len; i++) {
-        int val = dst[i] + src[i];
-        dst[i] = av_clip_int16(val);
-    }
-}
-
 static void lowpass_prediction(int16_t *dst, int16_t *pred,
-                               size_t width, size_t height, ptrdiff_t stride)
+                               int width, int height, ptrdiff_t stride)
 {
+    int16_t val;
     int i, j;
 
     memset(pred, 0, width * sizeof(*pred));
 
     for (i = 0; i < height; i++) {
-        line_add_sat_s16(pred, dst, width);
-        dst[0] = pred[0];
-        for (j = 1; j < width; j++)
-            dst[j] = pred[j] + dst[j - 1];
+        val    = pred[0] + dst[0];
+        dst[0] = pred[0] = val;
+        for (j = 1; j < width; j++) {
+            val     = pred[j] + dst[j];
+            dst[j]  = pred[j] = val;
+            dst[j] += dst[j-1];
+        }
         dst += stride;
     }
 }
 
-static void filterfn(int16_t *dest, int16_t *tmp, size_t size, int64_t scale)
+static void filterfn(int16_t *dest, int16_t *tmp, unsigned size, int64_t scale)
 {
     int16_t *low, *high, *ll, *lh, *hl, *hh;
     int hsize, i, j;
@@ -408,7 +420,7 @@ static void filterfn(int16_t *dest, int16_t *tmp, size_t size, int64_t scale)
 }
 
 static void reconstruction(AVCodecContext *avctx, int16_t *dest,
-                           size_t width, size_t height, ptrdiff_t stride,
+                           unsigned width, unsigned height, ptrdiff_t stride,
                            int64_t *scaling_h, int64_t *scaling_v)
 {
     PixletContext *ctx = avctx->priv_data;
@@ -450,7 +462,7 @@ static void reconstruction(AVCodecContext *avctx, int16_t *dest,
     }
 }
 
-static void postprocess_luma(AVFrame *frame, size_t w, size_t h, int depth)
+static void postprocess_luma(AVFrame *frame, int w, int h, int depth)
 {
     uint16_t *dsty = (uint16_t *)frame->data[0];
     int16_t *srcy  = (int16_t *)frame->data[0];
@@ -521,8 +533,8 @@ static int decode_plane(AVCodecContext *avctx, int plane,
     dst    = (int16_t *)frame->data[plane];
     dst[0] = sign_extend(bytestream2_get_be16(&ctx->gb), 16);
 
-    ret = bitstream_init8(&ctx->bc, avpkt->data + bytestream2_tell(&ctx->gb),
-                          bytestream2_get_bytes_left(&ctx->gb));
+    ret = init_get_bits8(&ctx->bc, avpkt->data + bytestream2_tell(&ctx->gb),
+                         bytestream2_get_bytes_left(&ctx->gb));
     if (ret < 0)
         return ret;
 
@@ -585,7 +597,7 @@ static int pixlet_decode_frame(AVCodecContext *avctx, void *data,
 
     pktsize = bytestream2_get_be32(&ctx->gb);
     if (pktsize <= 44 || pktsize - 4 > bytestream2_get_bytes_left(&ctx->gb)) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid packet size %"PRIu32".\n", pktsize);
+        av_log(avctx, AV_LOG_ERROR, "Invalid packet size %"PRIu32"\n", pktsize);
         return AVERROR_INVALIDDATA;
     }
 
@@ -601,6 +613,10 @@ static int pixlet_decode_frame(AVCodecContext *avctx, void *data,
     width  = bytestream2_get_be32(&ctx->gb);
     height = bytestream2_get_be32(&ctx->gb);
 
+    if (    width > INT_MAX - (1U << (NB_LEVELS + 1))
+        || height > INT_MAX - (1U << (NB_LEVELS + 1)))
+        return AVERROR_INVALIDDATA;
+
     w = FFALIGN(width,  1 << (NB_LEVELS + 1));
     h = FFALIGN(height, 1 << (NB_LEVELS + 1));
 
@@ -619,15 +635,14 @@ static int pixlet_decode_frame(AVCodecContext *avctx, void *data,
     avctx->width  = width;
     avctx->height = height;
 
-    /* reinit should dimensions change */
     if (ctx->w != w || ctx->h != h) {
-        pixlet_close(avctx);
+        free_buffers(avctx);
         ctx->w = w;
         ctx->h = h;
 
         ret = init_decoder(avctx);
         if (ret < 0) {
-            pixlet_close(avctx);
+            free_buffers(avctx);
             ctx->w = 0;
             ctx->h = 0;
             return ret;
@@ -636,6 +651,10 @@ static int pixlet_decode_frame(AVCodecContext *avctx, void *data,
 
     bytestream2_skip(&ctx->gb, 8);
 
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
+    p->color_range = AVCOL_RANGE_JPEG;
+
     ret = ff_thread_get_buffer(avctx, &frame, 0);
     if (ret < 0)
         return ret;
@@ -651,10 +670,6 @@ static int pixlet_decode_frame(AVCodecContext *avctx, void *data,
     postprocess_luma(frame.f, ctx->w, ctx->h, ctx->depth);
     postprocess_chroma(frame.f, ctx->w >> 1, ctx->h >> 1, ctx->depth);
 
-    p->pict_type   = AV_PICTURE_TYPE_I;
-    p->color_range = AVCOL_RANGE_JPEG;
-    p->key_frame   = 1;
-
     *got_frame = 1;
 
     return pktsize;
diff --git a/libavcodec/png.c b/libavcodec/png.c
index cd75dc1..ef52b51 100644
--- a/libavcodec/png.c
+++ b/libavcodec/png.c
@@ -2,29 +2,25 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
-#include "bytestream.h"
 #include "png.h"
 
-const uint8_t ff_pngsig[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
-const uint8_t ff_mngsig[8] = { 138, 77, 78, 71, 13, 10, 26, 10 };
-
 /* Mask to determine which y pixels are valid in a pass */
 const uint8_t ff_png_pass_ymask[NB_PASSES] = {
     0x80, 0x80, 0x08, 0x88, 0x22, 0xaa, 0x55,
@@ -40,11 +36,6 @@ static const uint8_t ff_png_pass_xshift[NB_PASSES] = {
     3, 3, 2, 2, 1, 1, 0
 };
 
-/* Mask to determine which pixels are valid in a pass */
-const uint8_t ff_png_pass_mask[NB_PASSES] = {
-    0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff
-};
-
 void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size)
 {
     return av_mallocz_array(items, size);
diff --git a/libavcodec/png.h b/libavcodec/png.h
index b8c72ee..e967fcf 100644
--- a/libavcodec/png.h
+++ b/libavcodec/png.h
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,22 +42,14 @@
 #define PNG_FILTER_VALUE_PAETH 4
 #define PNG_FILTER_VALUE_MIXED 5
 
-#define PNG_IHDR      0x0001
-#define PNG_IDAT      0x0002
-#define PNG_ALLIMAGE  0x0004
-#define PNG_PLTE      0x0008
-
 #define NB_PASSES 7
 
-extern const uint8_t ff_pngsig[8];
-extern const uint8_t ff_mngsig[8];
+#define PNGSIG 0x89504e470d0a1a0a
+#define MNGSIG 0x8a4d4e470d0a1a0a
 
 /* Mask to determine which y pixels are valid in a pass */
 extern const uint8_t ff_png_pass_ymask[NB_PASSES];
 
-/* Mask to determine which pixels are valid in a pass */
-extern const uint8_t ff_png_pass_mask[NB_PASSES];
-
 void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size);
 
 void ff_png_zfree(void *opaque, void *ptr);
diff --git a/libavcodec/png_parser.c b/libavcodec/png_parser.c
index c66caf3..74f2964 100644
--- a/libavcodec/png_parser.c
+++ b/libavcodec/png_parser.c
@@ -2,20 +2,20 @@
  * PNG parser
  * Copyright (c) 2009 Peter Holik
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,20 +24,14 @@
  * PNG parser
  */
 
-#include "libavutil/intreadwrite.h"
-#include "libavutil/common.h"
-
 #include "parser.h"
-
-#define PNG_SIGNATURE UINT64_C(0x89504e470d0a1a0a)
-#define MNG_SIGNATURE UINT64_C(0x8a4d4e470d0a1a0a)
+#include "png.h"
 
 typedef struct PNGParseContext {
     ParseContext pc;
-
-    int chunk_pos;          ///< position inside current chunk
-    uint32_t chunk_length;  ///< length of the current chunk
-    int remaining_size;     ///< remaining size of the current chunk
+    uint32_t chunk_pos;           ///< position inside current chunk
+    uint32_t chunk_length;        ///< length of the current chunk
+    uint32_t remaining_size;      ///< remaining size of the current chunk
 } PNGParseContext;
 
 static int png_parse(AVCodecParserContext *s, AVCodecContext *avctx,
@@ -48,16 +42,15 @@ static int png_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     int next = END_NOT_FOUND;
     int i = 0;
 
+    s->pict_type = AV_PICTURE_TYPE_NONE;
+
     *poutbuf_size = 0;
-    if (buf_size == 0)
-        return 0;
 
     if (!ppc->pc.frame_start_found) {
         uint64_t state64 = ppc->pc.state64;
         for (; i < buf_size; i++) {
             state64 = (state64 << 8) | buf[i];
-            if (state64 == PNG_SIGNATURE ||
-                state64 == MNG_SIGNATURE) {
+            if (state64 == PNGSIG || state64 == MNGSIG) {
                 i++;
                 ppc->pc.frame_start_found = 1;
                 break;
diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c
index bc8c952..189bb9a 100644
--- a/libavcodec/pngdec.c
+++ b/libavcodec/pngdec.c
@@ -2,47 +2,69 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/avstring.h"
+//#define DEBUG
+
+#include "libavutil/avassert.h"
+#include "libavutil/bprint.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/stereo3d.h"
+#include "libavutil/mastering_display_metadata.h"
 
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "apng.h"
 #include "png.h"
 #include "pngdsp.h"
-
-/* TODO:
- * - add 2, 4 and 16 bit depth support
- */
+#include "thread.h"
 
 #include <zlib.h>
 
+enum PNGHeaderState {
+    PNG_IHDR = 1 << 0,
+    PNG_PLTE = 1 << 1,
+};
+
+enum PNGImageState {
+    PNG_IDAT     = 1 << 0,
+    PNG_ALLIMAGE = 1 << 1,
+};
+
 typedef struct PNGDecContext {
     PNGDSPContext dsp;
+    AVCodecContext *avctx;
 
     GetByteContext gb;
-    AVFrame *prev;
+    ThreadFrame previous_picture;
+    ThreadFrame last_picture;
+    ThreadFrame picture;
 
-    int state;
+    enum PNGHeaderState hdr_state;
+    enum PNGImageState pic_state;
     int width, height;
+    int cur_w, cur_h;
+    int last_w, last_h;
+    int x_offset, y_offset;
+    int last_x_offset, last_y_offset;
+    uint8_t dispose_op, blend_op;
+    uint8_t last_dispose_op;
     int bit_depth;
     int color_type;
     int compression_type;
@@ -51,13 +73,19 @@ typedef struct PNGDecContext {
     int channels;
     int bits_per_pixel;
     int bpp;
+    int has_trns;
+    uint8_t transparent_color_be[6];
 
     uint8_t *image_buf;
     int image_linesize;
     uint32_t palette[256];
     uint8_t *crow_buf;
     uint8_t *last_row;
+    unsigned int last_row_size;
     uint8_t *tmp_row;
+    unsigned int tmp_row_size;
+    uint8_t *buffer;
+    int buffer_size;
     int pass;
     int crow_size; /* compressed row size (include filter type) */
     int row_size; /* decompressed row size */
@@ -66,9 +94,14 @@ typedef struct PNGDecContext {
     z_stream zstream;
 } PNGDecContext;
 
+/* Mask to determine which pixels are valid in a pass */
+static const uint8_t png_pass_mask[NB_PASSES] = {
+    0x01, 0x01, 0x11, 0x11, 0x55, 0x55, 0xff,
+};
+
 /* Mask to determine which y pixels can be written in a pass */
 static const uint8_t png_pass_dsp_ymask[NB_PASSES] = {
-    0xff, 0xff, 0x0f, 0xcc, 0x33, 0xff, 0x55,
+    0xff, 0xff, 0x0f, 0xff, 0x33, 0xff, 0x55,
 };
 
 /* Mask to determine which pixels to overwrite while displaying */
@@ -87,40 +120,55 @@ static void png_put_interlaced_row(uint8_t *dst, int width,
     uint8_t *d;
     const uint8_t *s;
 
-    mask     = ff_png_pass_mask[pass];
+    mask     = png_pass_mask[pass];
     dsp_mask = png_pass_dsp_mask[pass];
 
     switch (bits_per_pixel) {
     case 1:
-        /* we must initialize the line to zero before writing to it */
-        if (pass == 0)
-            memset(dst, 0, (width + 7) >> 3);
         src_x = 0;
         for (x = 0; x < width; x++) {
             j = (x & 7);
             if ((dsp_mask << j) & 0x80) {
                 b = (src[src_x >> 3] >> (7 - (src_x & 7))) & 1;
+                dst[x >> 3] &= 0xFF7F>>j;
                 dst[x >> 3] |= b << (7 - j);
             }
             if ((mask << j) & 0x80)
                 src_x++;
         }
         break;
+    case 2:
+        src_x = 0;
+        for (x = 0; x < width; x++) {
+            int j2 = 2 * (x & 3);
+            j = (x & 7);
+            if ((dsp_mask << j) & 0x80) {
+                b = (src[src_x >> 2] >> (6 - 2*(src_x & 3))) & 3;
+                dst[x >> 2] &= 0xFF3F>>j2;
+                dst[x >> 2] |= b << (6 - j2);
+            }
+            if ((mask << j) & 0x80)
+                src_x++;
+        }
+        break;
+    case 4:
+        src_x = 0;
+        for (x = 0; x < width; x++) {
+            int j2 = 4*(x&1);
+            j = (x & 7);
+            if ((dsp_mask << j) & 0x80) {
+                b = (src[src_x >> 1] >> (4 - 4*(src_x & 1))) & 15;
+                dst[x >> 1] &= 0xFF0F>>j2;
+                dst[x >> 1] |= b << (4 - j2);
+            }
+            if ((mask << j) & 0x80)
+                src_x++;
+        }
+        break;
     default:
         bpp = bits_per_pixel >> 3;
         d   = dst;
         s   = src;
-        if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            for (x = 0; x < width; x++) {
-                j = x & 7;
-                if ((dsp_mask << j) & 0x80) {
-                    *(uint32_t *)d = (s[3] << 24) | (s[0] << 16) | (s[1] << 8) | s[2];
-                }
-                d += bpp;
-                if ((mask << j) & 0x80)
-                    s += bpp;
-            }
-        } else {
             for (x = 0; x < width; x++) {
                 j = x & 7;
                 if ((dsp_mask << j) & 0x80) {
@@ -130,7 +178,6 @@ static void png_put_interlaced_row(uint8_t *dst, int width,
                 if ((mask << j) & 0x80)
                     s += bpp;
             }
-        }
         break;
     }
 }
@@ -172,7 +219,7 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
             b = dst[2];                                                       \
         if (bpp >= 4)                                                         \
             a = dst[3];                                                       \
-        for (; i < size; i += bpp) {                                          \
+        for (; i <= size - bpp; i += bpp) {                                   \
             dst[i + 0] = r = op(r, src[i + 0], last[i + 0]);                  \
             if (bpp == 1)                                                     \
                 continue;                                                     \
@@ -195,12 +242,9 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
         UNROLL1(3, op)                                                        \
     } else if (bpp == 4) {                                                    \
         UNROLL1(4, op)                                                        \
-    } else {                                                                  \
-        for (; i < size; i += bpp) {                                          \
-            int j;                                                            \
-            for (j = 0; j < bpp; j++)                                         \
-                dst[i + j] = op(dst[i + j - bpp], src[i + j], last[i + j]);   \
-        }                                                                     \
+    }                                                                         \
+    for (; i < size; i++) {                                                   \
+        dst[i] = op(dst[i - bpp], src[i], last[i]);                           \
     }
 
 /* NOTE: 'dst' can be equal to 'last' */
@@ -219,12 +263,12 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
         if (bpp == 4) {
             p = *(int *)dst;
             for (; i < size; i += bpp) {
-                int s = *(int *)(src + i);
+                unsigned s = *(int *)(src + i);
                 p = ((s & 0x7f7f7f7f) + (p & 0x7f7f7f7f)) ^ ((s ^ p) & 0x80808080);
                 *(int *)(dst + i) = p;
             }
         } else {
-#define OP_SUB(x, s, l) x + s
+#define OP_SUB(x, s, l) ((x) + (s))
             UNROLL_FILTER(OP_SUB);
         }
         break;
@@ -236,7 +280,7 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
             p      = (last[i] >> 1);
             dst[i] = p + src[i];
         }
-#define OP_AVG(x, s, l) (((x + l) >> 1) + s) & 0xff
+#define OP_AVG(x, s, l) (((((x) + (l)) >> 1) + (s)) & 0xff)
         UNROLL_FILTER(OP_AVG);
         break;
     case PNG_FILTER_VALUE_PAETH:
@@ -247,55 +291,33 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
         if (bpp > 2 && size > 4) {
             /* would write off the end of the array if we let it process
              * the last pixel with bpp=3 */
-            int w = bpp == 4 ? size : size - 3;
-            dsp->add_paeth_prediction(dst + i, src + i, last + i, w - i, bpp);
-            i = w;
+            int w = (bpp & 3) ? size - 3 : size;
+
+            if (w > i) {
+                dsp->add_paeth_prediction(dst + i, src + i, last + i, size - i, bpp);
+                i = w;
+            }
         }
         ff_add_png_paeth_prediction(dst + i, src + i, last + i, size - i, bpp);
         break;
     }
 }
 
-static av_always_inline void convert_to_rgb32_loco(uint8_t *dst,
-                                                   const uint8_t *src,
-                                                   int width, int loco)
-{
-    int j;
-    unsigned int r, g, b, a;
-
-    for (j = 0; j < width; j++) {
-        r = src[0];
-        g = src[1];
-        b = src[2];
-        a = src[3];
-        if (loco) {
-            r = (r + g) & 0xff;
-            b = (b + g) & 0xff;
-        }
-        *(uint32_t *) dst = (a << 24) | (r << 16) | (g << 8) | b;
-        dst += 4;
-        src += 4;
-    }
+/* This used to be called "deloco" in FFmpeg
+ * and is actually an inverse reversible colorspace transformation */
+#define YUV2RGB(NAME, TYPE) \
+static void deloco_ ## NAME(TYPE *dst, int size, int alpha) \
+{ \
+    int i; \
+    for (i = 0; i < size; i += 3 + alpha) { \
+        int g = dst [i + 1]; \
+        dst[i + 0] += g; \
+        dst[i + 2] += g; \
+    } \
 }
 
-static void convert_to_rgb32(uint8_t *dst, const uint8_t *src,
-                             int width, int loco)
-{
-    if (loco)
-        convert_to_rgb32_loco(dst, src, width, 1);
-    else
-        convert_to_rgb32_loco(dst, src, width, 0);
-}
-
-static void deloco_rgb24(uint8_t *dst, int size)
-{
-    int i;
-    for (i = 0; i < size; i += 3) {
-        int g = dst[i + 1];
-        dst[i + 0] += g;
-        dst[i + 2] += g;
-    }
-}
+YUV2RGB(rgb8, uint8_t)
+YUV2RGB(rgb16, uint16_t)
 
 /* process exactly one decompressed row */
 static void png_handle_row(PNGDecContext *s)
@@ -304,39 +326,41 @@ static void png_handle_row(PNGDecContext *s)
     int got_line;
 
     if (!s->interlace_type) {
-        ptr = s->image_buf + s->image_linesize * s->y;
-        /* need to swap bytes correctly for RGB_ALPHA */
-        if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
-                           s->last_row, s->row_size, s->bpp);
-            convert_to_rgb32(ptr, s->tmp_row, s->width,
-                             s->filter_type == PNG_FILTER_TYPE_LOCO);
-            FFSWAP(uint8_t *, s->last_row, s->tmp_row);
-        } else {
-            /* in normal case, we avoid one copy */
-            if (s->y == 0)
-                last_row = s->last_row;
-            else
-                last_row = ptr - s->image_linesize;
+        ptr = s->image_buf + s->image_linesize * (s->y + s->y_offset) + s->x_offset * s->bpp;
+        if (s->y == 0)
+            last_row = s->last_row;
+        else
+            last_row = ptr - s->image_linesize;
 
-            png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
-                           last_row, s->row_size, s->bpp);
-        }
+        png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
+                       last_row, s->row_size, s->bpp);
         /* loco lags by 1 row so that it doesn't interfere with top prediction */
-        if (s->filter_type == PNG_FILTER_TYPE_LOCO &&
-            s->color_type == PNG_COLOR_TYPE_RGB && s->y > 0)
-            deloco_rgb24(ptr - s->image_linesize, s->row_size);
+        if (s->filter_type == PNG_FILTER_TYPE_LOCO && s->y > 0) {
+            if (s->bit_depth == 16) {
+                deloco_rgb16((uint16_t *)(ptr - s->image_linesize), s->row_size / 2,
+                             s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+            } else {
+                deloco_rgb8(ptr - s->image_linesize, s->row_size,
+                            s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+            }
+        }
         s->y++;
-        if (s->y == s->height) {
-            s->state |= PNG_ALLIMAGE;
-            if (s->filter_type == PNG_FILTER_TYPE_LOCO &&
-                s->color_type == PNG_COLOR_TYPE_RGB)
-                deloco_rgb24(ptr, s->row_size);
+        if (s->y == s->cur_h) {
+            s->pic_state |= PNG_ALLIMAGE;
+            if (s->filter_type == PNG_FILTER_TYPE_LOCO) {
+                if (s->bit_depth == 16) {
+                    deloco_rgb16((uint16_t *)ptr, s->row_size / 2,
+                                 s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+                } else {
+                    deloco_rgb8(ptr, s->row_size,
+                                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+                }
+            }
         }
     } else {
         got_line = 0;
         for (;;) {
-            ptr = s->image_buf + s->image_linesize * s->y;
+            ptr = s->image_buf + s->image_linesize * (s->y + s->y_offset) + s->x_offset * s->bpp;
             if ((ff_png_pass_ymask[s->pass] << (s->y & 7)) & 0x80) {
                 /* if we already read one row, it is time to stop to
                  * wait for the next one */
@@ -345,25 +369,26 @@ static void png_handle_row(PNGDecContext *s)
                 png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
                                s->last_row, s->pass_row_size, s->bpp);
                 FFSWAP(uint8_t *, s->last_row, s->tmp_row);
+                FFSWAP(unsigned int, s->last_row_size, s->tmp_row_size);
                 got_line = 1;
             }
             if ((png_pass_dsp_ymask[s->pass] << (s->y & 7)) & 0x80) {
-                /* NOTE: RGB32 is handled directly in png_put_interlaced_row */
-                png_put_interlaced_row(ptr, s->width, s->bits_per_pixel, s->pass,
+                png_put_interlaced_row(ptr, s->cur_w, s->bits_per_pixel, s->pass,
                                        s->color_type, s->last_row);
             }
             s->y++;
-            if (s->y == s->height) {
+            if (s->y == s->cur_h) {
+                memset(s->last_row, 0, s->row_size);
                 for (;;) {
                     if (s->pass == NB_PASSES - 1) {
-                        s->state |= PNG_ALLIMAGE;
+                        s->pic_state |= PNG_ALLIMAGE;
                         goto the_end;
                     } else {
                         s->pass++;
                         s->y = 0;
                         s->pass_row_size = ff_png_pass_row_size(s->pass,
                                                                 s->bits_per_pixel,
-                                                                s->width);
+                                                                s->cur_w);
                         s->crow_size = s->pass_row_size + 1;
                         if (s->pass_row_size != 0)
                             break;
@@ -380,17 +405,18 @@ static int png_decode_idat(PNGDecContext *s, int length)
 {
     int ret;
     s->zstream.avail_in = FFMIN(length, bytestream2_get_bytes_left(&s->gb));
-    s->zstream.next_in  = s->gb.buffer;
+    s->zstream.next_in  = (unsigned char *)s->gb.buffer;
     bytestream2_skip(&s->gb, length);
 
     /* decode one line if possible */
     while (s->zstream.avail_in > 0) {
         ret = inflate(&s->zstream, Z_PARTIAL_FLUSH);
         if (ret != Z_OK && ret != Z_STREAM_END) {
-            return -1;
+            av_log(s->avctx, AV_LOG_ERROR, "inflate returned error %d\n", ret);
+            return AVERROR_EXTERNAL;
         }
         if (s->zstream.avail_out == 0) {
-            if (!(s->state & PNG_ALLIMAGE)) {
+            if (!(s->pic_state & PNG_ALLIMAGE)) {
                 png_handle_row(s);
             }
             s->zstream.avail_out = s->crow_size;
@@ -405,222 +431,853 @@ static int png_decode_idat(PNGDecContext *s, int length)
     return 0;
 }
 
-static int decode_frame(AVCodecContext *avctx,
-                        void *data, int *got_frame,
-                        AVPacket *avpkt)
+static int decode_zbuf(AVBPrint *bp, const uint8_t *data,
+                       const uint8_t *data_end)
 {
-    PNGDecContext *const s = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
-    AVFrame *p             = data;
-    uint8_t *crow_buf_base = NULL;
-    uint32_t tag, length;
+    z_stream zstream;
+    unsigned char *buf;
+    unsigned buf_size;
     int ret;
 
-    /* check signature */
-    if (buf_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough data %d\n",
-               buf_size);
+    zstream.zalloc = ff_png_zalloc;
+    zstream.zfree  = ff_png_zfree;
+    zstream.opaque = NULL;
+    if (inflateInit(&zstream) != Z_OK)
+        return AVERROR_EXTERNAL;
+    zstream.next_in  = (unsigned char *)data;
+    zstream.avail_in = data_end - data;
+    av_bprint_init(bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    while (zstream.avail_in > 0) {
+        av_bprint_get_buffer(bp, 2, &buf, &buf_size);
+        if (buf_size < 2) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        zstream.next_out  = buf;
+        zstream.avail_out = buf_size - 1;
+        ret = inflate(&zstream, Z_PARTIAL_FLUSH);
+        if (ret != Z_OK && ret != Z_STREAM_END) {
+            ret = AVERROR_EXTERNAL;
+            goto fail;
+        }
+        bp->len += zstream.next_out - buf;
+        if (ret == Z_STREAM_END)
+            break;
+    }
+    inflateEnd(&zstream);
+    bp->str[bp->len] = 0;
+    return 0;
+
+fail:
+    inflateEnd(&zstream);
+    av_bprint_finalize(bp, NULL);
+    return ret;
+}
+
+static uint8_t *iso88591_to_utf8(const uint8_t *in, size_t size_in)
+{
+    size_t extra = 0, i;
+    uint8_t *out, *q;
+
+    for (i = 0; i < size_in; i++)
+        extra += in[i] >= 0x80;
+    if (size_in == SIZE_MAX || extra > SIZE_MAX - size_in - 1)
+        return NULL;
+    q = out = av_malloc(size_in + extra + 1);
+    if (!out)
+        return NULL;
+    for (i = 0; i < size_in; i++) {
+        if (in[i] >= 0x80) {
+            *(q++) = 0xC0 | (in[i] >> 6);
+            *(q++) = 0x80 | (in[i] & 0x3F);
+        } else {
+            *(q++) = in[i];
+        }
+    }
+    *(q++) = 0;
+    return out;
+}
+
+static int decode_text_chunk(PNGDecContext *s, uint32_t length, int compressed,
+                             AVDictionary **dict)
+{
+    int ret, method;
+    const uint8_t *data        = s->gb.buffer;
+    const uint8_t *data_end    = data + length;
+    const uint8_t *keyword     = data;
+    const uint8_t *keyword_end = memchr(keyword, 0, data_end - keyword);
+    uint8_t *kw_utf8 = NULL, *text, *txt_utf8 = NULL;
+    unsigned text_len;
+    AVBPrint bp;
+
+    if (!keyword_end)
+        return AVERROR_INVALIDDATA;
+    data = keyword_end + 1;
+
+    if (compressed) {
+        if (data == data_end)
+            return AVERROR_INVALIDDATA;
+        method = *(data++);
+        if (method)
+            return AVERROR_INVALIDDATA;
+        if ((ret = decode_zbuf(&bp, data, data_end)) < 0)
+            return ret;
+        text_len = bp.len;
+        ret = av_bprint_finalize(&bp, (char **)&text);
+        if (ret < 0)
+            return ret;
+    } else {
+        text = (uint8_t *)data;
+        text_len = data_end - text;
+    }
+
+    kw_utf8  = iso88591_to_utf8(keyword, keyword_end - keyword);
+    txt_utf8 = iso88591_to_utf8(text, text_len);
+    if (text != data)
+        av_free(text);
+    if (!(kw_utf8 && txt_utf8)) {
+        av_free(kw_utf8);
+        av_free(txt_utf8);
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(dict, kw_utf8, txt_utf8,
+                AV_DICT_DONT_STRDUP_KEY | AV_DICT_DONT_STRDUP_VAL);
+    return 0;
+}
+
+static int decode_ihdr_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    if (length != 13)
+        return AVERROR_INVALIDDATA;
+
+    if (s->pic_state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "IHDR after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->hdr_state & PNG_IHDR) {
+        av_log(avctx, AV_LOG_ERROR, "Multiple IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->width  = s->cur_w = bytestream2_get_be32(&s->gb);
+    s->height = s->cur_h = bytestream2_get_be32(&s->gb);
+    if (av_image_check_size(s->width, s->height, 0, avctx)) {
+        s->cur_w = s->cur_h = s->width = s->height = 0;
+        av_log(avctx, AV_LOG_ERROR, "Invalid image size\n");
         return AVERROR_INVALIDDATA;
     }
-    if (memcmp(buf, ff_pngsig, 8) != 0 &&
-        memcmp(buf, ff_mngsig, 8) != 0) {
-        char signature[5 * 8 + 1] = { 0 };
-        int i;
-        for (i = 0; i < 8; i++) {
-            av_strlcatf(signature + i * 5, sizeof(signature) - i * 5,
-                        " 0x%02x", buf[i]);
+    s->bit_depth        = bytestream2_get_byte(&s->gb);
+    if (s->bit_depth != 1 && s->bit_depth != 2 && s->bit_depth != 4 &&
+        s->bit_depth != 8 && s->bit_depth != 16) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid bit depth\n");
+        goto error;
+    }
+    s->color_type       = bytestream2_get_byte(&s->gb);
+    s->compression_type = bytestream2_get_byte(&s->gb);
+    if (s->compression_type) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid compression method %d\n", s->compression_type);
+        goto error;
+    }
+    s->filter_type      = bytestream2_get_byte(&s->gb);
+    s->interlace_type   = bytestream2_get_byte(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* crc */
+    s->hdr_state |= PNG_IHDR;
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "width=%d height=%d depth=%d color_type=%d "
+                "compression_type=%d filter_type=%d interlace_type=%d\n",
+                s->width, s->height, s->bit_depth, s->color_type,
+                s->compression_type, s->filter_type, s->interlace_type);
+
+    return 0;
+error:
+    s->cur_w = s->cur_h = s->width = s->height = 0;
+    s->bit_depth = 8;
+    return AVERROR_INVALIDDATA;
+}
+
+static int decode_phys_chunk(AVCodecContext *avctx, PNGDecContext *s)
+{
+    if (s->pic_state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "pHYs after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->sample_aspect_ratio.num = bytestream2_get_be32(&s->gb);
+    avctx->sample_aspect_ratio.den = bytestream2_get_be32(&s->gb);
+    if (avctx->sample_aspect_ratio.num < 0 || avctx->sample_aspect_ratio.den < 0)
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+    bytestream2_skip(&s->gb, 1); /* unit specifier */
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    return 0;
+}
+
+static int decode_idat_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length, AVFrame *p)
+{
+    int ret;
+    size_t byte_depth = s->bit_depth > 8 ? 2 : 1;
+
+    if (!(s->hdr_state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "IDAT without IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (!(s->pic_state & PNG_IDAT)) {
+        /* init image info */
+        ret = ff_set_dimensions(avctx, s->width, s->height);
+        if (ret < 0)
+            return ret;
+
+        s->channels       = ff_png_get_nb_channels(s->color_type);
+        s->bits_per_pixel = s->bit_depth * s->channels;
+        s->bpp            = (s->bits_per_pixel + 7) >> 3;
+        s->row_size       = (s->cur_w * s->bits_per_pixel + 7) >> 3;
+
+        if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_RGB) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        } else if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_GRAY) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_RGB) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+        } else if ((s->bits_per_pixel == 1 || s->bits_per_pixel == 2 || s->bits_per_pixel == 4 || s->bits_per_pixel == 8) &&
+                s->color_type == PNG_COLOR_TYPE_PALETTE) {
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        } else if (s->bit_depth == 1 && s->bits_per_pixel == 1 && avctx->codec_id != AV_CODEC_ID_APNG) {
+            avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+        } else if (s->bit_depth == 8 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_YA8;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+        } else {
+            avpriv_report_missing_feature(avctx,
+                                          "Bit depth %d color type %d",
+                                          s->bit_depth, s->color_type);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE) {
+            switch (avctx->pix_fmt) {
+            case AV_PIX_FMT_RGB24:
+                avctx->pix_fmt = AV_PIX_FMT_RGBA;
+                break;
+
+            case AV_PIX_FMT_RGB48BE:
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+                break;
+
+            case AV_PIX_FMT_GRAY8:
+                avctx->pix_fmt = AV_PIX_FMT_YA8;
+                break;
+
+            case AV_PIX_FMT_GRAY16BE:
+                avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+                break;
+
+            default:
+                avpriv_request_sample(avctx, "bit depth %d "
+                        "and color type %d with TRNS",
+                        s->bit_depth, s->color_type);
+                return AVERROR_INVALIDDATA;
+            }
+
+            s->bpp += byte_depth;
+        }
+
+        if ((ret = ff_thread_get_buffer(avctx, &s->picture, AV_GET_BUFFER_FLAG_REF)) < 0)
+            return ret;
+        if (avctx->codec_id == AV_CODEC_ID_APNG && s->last_dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+            ff_thread_release_buffer(avctx, &s->previous_picture);
+            if ((ret = ff_thread_get_buffer(avctx, &s->previous_picture, AV_GET_BUFFER_FLAG_REF)) < 0)
+                return ret;
         }
-        av_log(avctx, AV_LOG_ERROR, "Invalid PNG signature %s\n",
-               signature);
+        p->pict_type        = AV_PICTURE_TYPE_I;
+        p->key_frame        = 1;
+        p->interlaced_frame = !!s->interlace_type;
+
+        ff_thread_finish_setup(avctx);
+
+        /* compute the compressed row size */
+        if (!s->interlace_type) {
+            s->crow_size = s->row_size + 1;
+        } else {
+            s->pass          = 0;
+            s->pass_row_size = ff_png_pass_row_size(s->pass,
+                    s->bits_per_pixel,
+                    s->cur_w);
+            s->crow_size = s->pass_row_size + 1;
+        }
+        ff_dlog(avctx, "row_size=%d crow_size =%d\n",
+                s->row_size, s->crow_size);
+        s->image_buf      = p->data[0];
+        s->image_linesize = p->linesize[0];
+        /* copy the palette if needed */
+        if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+            memcpy(p->data[1], s->palette, 256 * sizeof(uint32_t));
+        /* empty row is used if differencing to the first row */
+        av_fast_padded_mallocz(&s->last_row, &s->last_row_size, s->row_size);
+        if (!s->last_row)
+            return AVERROR_INVALIDDATA;
+        if (s->interlace_type ||
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            av_fast_padded_malloc(&s->tmp_row, &s->tmp_row_size, s->row_size);
+            if (!s->tmp_row)
+                return AVERROR_INVALIDDATA;
+        }
+        /* compressed row */
+        av_fast_padded_malloc(&s->buffer, &s->buffer_size, s->row_size + 16);
+        if (!s->buffer)
+            return AVERROR(ENOMEM);
+
+        /* we want crow_buf+1 to be 16-byte aligned */
+        s->crow_buf          = s->buffer + 15;
+        s->zstream.avail_out = s->crow_size;
+        s->zstream.next_out  = s->crow_buf;
+    }
+
+    s->pic_state |= PNG_IDAT;
+
+    /* set image to non-transparent bpp while decompressing */
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE)
+        s->bpp -= byte_depth;
+
+    ret = png_decode_idat(s, length);
+
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE)
+        s->bpp += byte_depth;
+
+    if (ret < 0)
+        return ret;
+
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    return 0;
+}
+
+static int decode_plte_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    int n, i, r, g, b;
+
+    if ((length % 3) != 0 || length > 256 * 3)
         return AVERROR_INVALIDDATA;
+    /* read the palette */
+    n = length / 3;
+    for (i = 0; i < n; i++) {
+        r = bytestream2_get_byte(&s->gb);
+        g = bytestream2_get_byte(&s->gb);
+        b = bytestream2_get_byte(&s->gb);
+        s->palette[i] = (0xFFU << 24) | (r << 16) | (g << 8) | b;
     }
+    for (; i < 256; i++)
+        s->palette[i] = (0xFFU << 24);
+    s->hdr_state |= PNG_PLTE;
+    bytestream2_skip(&s->gb, 4);     /* crc */
 
-    bytestream2_init(&s->gb, buf + 8, buf_size - 8);
-    s->y = s->state = 0;
+    return 0;
+}
+
+static int decode_trns_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    int v, i;
+
+    if (!(s->hdr_state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "trns before IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->pic_state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "trns after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        if (length > 256 || !(s->hdr_state & PNG_PLTE))
+            return AVERROR_INVALIDDATA;
+
+        for (i = 0; i < length; i++) {
+            unsigned v = bytestream2_get_byte(&s->gb);
+            s->palette[i] = (s->palette[i] & 0x00ffffff) | (v << 24);
+        }
+    } else if (s->color_type == PNG_COLOR_TYPE_GRAY || s->color_type == PNG_COLOR_TYPE_RGB) {
+        if ((s->color_type == PNG_COLOR_TYPE_GRAY && length != 2) ||
+            (s->color_type == PNG_COLOR_TYPE_RGB && length != 6) ||
+            s->bit_depth == 1)
+            return AVERROR_INVALIDDATA;
+
+        for (i = 0; i < length / 2; i++) {
+            /* only use the least significant bits */
+            v = av_mod_uintp2(bytestream2_get_be16(&s->gb), s->bit_depth);
+
+            if (s->bit_depth > 8)
+                AV_WB16(&s->transparent_color_be[2 * i], v);
+            else
+                s->transparent_color_be[i] = v;
+        }
+    } else {
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_skip(&s->gb, 4); /* crc */
+    s->has_trns = 1;
+
+    return 0;
+}
+
+static int decode_iccp_chunk(PNGDecContext *s, int length, AVFrame *f)
+{
+    int ret, cnt = 0;
+    uint8_t *data, profile_name[82];
+    AVBPrint bp;
+    AVFrameSideData *sd;
+
+    while ((profile_name[cnt++] = bytestream2_get_byte(&s->gb)) && cnt < 81);
+    if (cnt > 80) {
+        av_log(s->avctx, AV_LOG_ERROR, "iCCP with invalid name!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    length = FFMAX(length - cnt, 0);
+
+    if (bytestream2_get_byte(&s->gb) != 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "iCCP with invalid compression!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    length = FFMAX(length - 1, 0);
+
+    if ((ret = decode_zbuf(&bp, s->gb.buffer, s->gb.buffer + length)) < 0)
+        return ret;
+
+    ret = av_bprint_finalize(&bp, (char **)&data);
+    if (ret < 0)
+        return ret;
+
+    sd = av_frame_new_side_data(f, AV_FRAME_DATA_ICC_PROFILE, bp.len);
+    if (!sd) {
+        av_free(data);
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(&sd->metadata, "name", profile_name, 0);
+    memcpy(sd->data, data, bp.len);
+    av_free(data);
+
+    /* ICC compressed data and CRC */
+    bytestream2_skip(&s->gb, length + 4);
+
+    return 0;
+}
+
+static void handle_small_bpp(PNGDecContext *s, AVFrame *p)
+{
+    if (s->bits_per_pixel == 1 && s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        int i, j, k;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width / 8;
+            for (k = 7; k >= 1; k--)
+                if ((s->width&7) >= k)
+                    pd[8*i + k - 1] = (pd[i]>>8-k) & 1;
+            for (i--; i >= 0; i--) {
+                pd[8*i + 7]=  pd[i]     & 1;
+                pd[8*i + 6]= (pd[i]>>1) & 1;
+                pd[8*i + 5]= (pd[i]>>2) & 1;
+                pd[8*i + 4]= (pd[i]>>3) & 1;
+                pd[8*i + 3]= (pd[i]>>4) & 1;
+                pd[8*i + 2]= (pd[i]>>5) & 1;
+                pd[8*i + 1]= (pd[i]>>6) & 1;
+                pd[8*i + 0]=  pd[i]>>7;
+            }
+            pd += s->image_linesize;
+        }
+    } else if (s->bits_per_pixel == 2) {
+        int i, j;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width / 4;
+            if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+                if ((s->width&3) >= 3) pd[4*i + 2]= (pd[i] >> 2) & 3;
+                if ((s->width&3) >= 2) pd[4*i + 1]= (pd[i] >> 4) & 3;
+                if ((s->width&3) >= 1) pd[4*i + 0]=  pd[i] >> 6;
+                for (i--; i >= 0; i--) {
+                    pd[4*i + 3]=  pd[i]     & 3;
+                    pd[4*i + 2]= (pd[i]>>2) & 3;
+                    pd[4*i + 1]= (pd[i]>>4) & 3;
+                    pd[4*i + 0]=  pd[i]>>6;
+                }
+            } else {
+                if ((s->width&3) >= 3) pd[4*i + 2]= ((pd[i]>>2) & 3)*0x55;
+                if ((s->width&3) >= 2) pd[4*i + 1]= ((pd[i]>>4) & 3)*0x55;
+                if ((s->width&3) >= 1) pd[4*i + 0]= ( pd[i]>>6     )*0x55;
+                for (i--; i >= 0; i--) {
+                    pd[4*i + 3]= ( pd[i]     & 3)*0x55;
+                    pd[4*i + 2]= ((pd[i]>>2) & 3)*0x55;
+                    pd[4*i + 1]= ((pd[i]>>4) & 3)*0x55;
+                    pd[4*i + 0]= ( pd[i]>>6     )*0x55;
+                }
+            }
+            pd += s->image_linesize;
+        }
+    } else if (s->bits_per_pixel == 4) {
+        int i, j;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width/2;
+            if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+                if (s->width&1) pd[2*i+0]= pd[i]>>4;
+                for (i--; i >= 0; i--) {
+                    pd[2*i + 1] = pd[i] & 15;
+                    pd[2*i + 0] = pd[i] >> 4;
+                }
+            } else {
+                if (s->width & 1) pd[2*i + 0]= (pd[i] >> 4) * 0x11;
+                for (i--; i >= 0; i--) {
+                    pd[2*i + 1] = (pd[i] & 15) * 0x11;
+                    pd[2*i + 0] = (pd[i] >> 4) * 0x11;
+                }
+            }
+            pd += s->image_linesize;
+        }
+    }
+}
+
+static int decode_fctl_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    uint32_t sequence_number;
+    int cur_w, cur_h, x_offset, y_offset, dispose_op, blend_op;
+
+    if (length != 26)
+        return AVERROR_INVALIDDATA;
+
+    if (!(s->hdr_state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "fctl before IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->last_w = s->cur_w;
+    s->last_h = s->cur_h;
+    s->last_x_offset = s->x_offset;
+    s->last_y_offset = s->y_offset;
+    s->last_dispose_op = s->dispose_op;
+
+    sequence_number = bytestream2_get_be32(&s->gb);
+    cur_w           = bytestream2_get_be32(&s->gb);
+    cur_h           = bytestream2_get_be32(&s->gb);
+    x_offset        = bytestream2_get_be32(&s->gb);
+    y_offset        = bytestream2_get_be32(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* delay_num (2), delay_den (2) */
+    dispose_op      = bytestream2_get_byte(&s->gb);
+    blend_op        = bytestream2_get_byte(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    if (sequence_number == 0 &&
+        (cur_w != s->width ||
+         cur_h != s->height ||
+         x_offset != 0 ||
+         y_offset != 0) ||
+        cur_w <= 0 || cur_h <= 0 ||
+        x_offset < 0 || y_offset < 0 ||
+        cur_w > s->width - x_offset|| cur_h > s->height - y_offset)
+            return AVERROR_INVALIDDATA;
+
+    if (blend_op != APNG_BLEND_OP_OVER && blend_op != APNG_BLEND_OP_SOURCE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid blend_op %d\n", blend_op);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((sequence_number == 0 || !s->previous_picture.f->data[0]) &&
+        dispose_op == APNG_DISPOSE_OP_PREVIOUS) {
+        // No previous frame to revert to for the first frame
+        // Spec says to just treat it as a APNG_DISPOSE_OP_BACKGROUND
+        dispose_op = APNG_DISPOSE_OP_BACKGROUND;
+    }
+
+    if (blend_op == APNG_BLEND_OP_OVER && !s->has_trns && (
+            avctx->pix_fmt == AV_PIX_FMT_RGB24 ||
+            avctx->pix_fmt == AV_PIX_FMT_RGB48BE ||
+            avctx->pix_fmt == AV_PIX_FMT_PAL8 ||
+            avctx->pix_fmt == AV_PIX_FMT_GRAY8 ||
+            avctx->pix_fmt == AV_PIX_FMT_GRAY16BE ||
+            avctx->pix_fmt == AV_PIX_FMT_MONOBLACK
+        )) {
+        // APNG_BLEND_OP_OVER is the same as APNG_BLEND_OP_SOURCE when there is no alpha channel
+        blend_op = APNG_BLEND_OP_SOURCE;
+    }
+
+    s->cur_w      = cur_w;
+    s->cur_h      = cur_h;
+    s->x_offset   = x_offset;
+    s->y_offset   = y_offset;
+    s->dispose_op = dispose_op;
+    s->blend_op   = blend_op;
+
+    return 0;
+}
+
+static void handle_p_frame_png(PNGDecContext *s, AVFrame *p)
+{
+    int i, j;
+    uint8_t *pd      = p->data[0];
+    uint8_t *pd_last = s->last_picture.f->data[0];
+    int ls = FFMIN(av_image_get_linesize(p->format, s->width, 0), s->width * s->bpp);
+
+    ff_thread_await_progress(&s->last_picture, INT_MAX, 0);
+    for (j = 0; j < s->height; j++) {
+        for (i = 0; i < ls; i++)
+            pd[i] += pd_last[i];
+        pd      += s->image_linesize;
+        pd_last += s->image_linesize;
+    }
+}
+
+// divide by 255 and round to nearest
+// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
+#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16)
+
+static int handle_p_frame_apng(AVCodecContext *avctx, PNGDecContext *s,
+                               AVFrame *p)
+{
+    size_t x, y;
+    uint8_t *buffer;
+
+    if (s->blend_op == APNG_BLEND_OP_OVER &&
+        avctx->pix_fmt != AV_PIX_FMT_RGBA &&
+        avctx->pix_fmt != AV_PIX_FMT_GRAY8A &&
+        avctx->pix_fmt != AV_PIX_FMT_PAL8) {
+        avpriv_request_sample(avctx, "Blending with pixel format %s",
+                              av_get_pix_fmt_name(avctx->pix_fmt));
+        return AVERROR_PATCHWELCOME;
+    }
+
+    buffer = av_malloc_array(s->image_linesize, s->height);
+    if (!buffer)
+        return AVERROR(ENOMEM);
+
+
+    // Do the disposal operation specified by the last frame on the frame
+    if (s->last_dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+        ff_thread_await_progress(&s->last_picture, INT_MAX, 0);
+        memcpy(buffer, s->last_picture.f->data[0], s->image_linesize * s->height);
+
+        if (s->last_dispose_op == APNG_DISPOSE_OP_BACKGROUND)
+            for (y = s->last_y_offset; y < s->last_y_offset + s->last_h; ++y)
+                memset(buffer + s->image_linesize * y + s->bpp * s->last_x_offset, 0, s->bpp * s->last_w);
+
+        memcpy(s->previous_picture.f->data[0], buffer, s->image_linesize * s->height);
+        ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+    } else {
+        ff_thread_await_progress(&s->previous_picture, INT_MAX, 0);
+        memcpy(buffer, s->previous_picture.f->data[0], s->image_linesize * s->height);
+    }
+
+    // Perform blending
+    if (s->blend_op == APNG_BLEND_OP_SOURCE) {
+        for (y = s->y_offset; y < s->y_offset + s->cur_h; ++y) {
+            size_t row_start = s->image_linesize * y + s->bpp * s->x_offset;
+            memcpy(buffer + row_start, p->data[0] + row_start, s->bpp * s->cur_w);
+        }
+    } else { // APNG_BLEND_OP_OVER
+        for (y = s->y_offset; y < s->y_offset + s->cur_h; ++y) {
+            uint8_t *foreground = p->data[0] + s->image_linesize * y + s->bpp * s->x_offset;
+            uint8_t *background = buffer + s->image_linesize * y + s->bpp * s->x_offset;
+            for (x = s->x_offset; x < s->x_offset + s->cur_w; ++x, foreground += s->bpp, background += s->bpp) {
+                size_t b;
+                uint8_t foreground_alpha, background_alpha, output_alpha;
+                uint8_t output[10];
+
+                // Since we might be blending alpha onto alpha, we use the following equations:
+                // output_alpha = foreground_alpha + (1 - foreground_alpha) * background_alpha
+                // output = (foreground_alpha * foreground + (1 - foreground_alpha) * background_alpha * background) / output_alpha
+
+                switch (avctx->pix_fmt) {
+                case AV_PIX_FMT_RGBA:
+                    foreground_alpha = foreground[3];
+                    background_alpha = background[3];
+                    break;
+
+                case AV_PIX_FMT_GRAY8A:
+                    foreground_alpha = foreground[1];
+                    background_alpha = background[1];
+                    break;
+
+                case AV_PIX_FMT_PAL8:
+                    foreground_alpha = s->palette[foreground[0]] >> 24;
+                    background_alpha = s->palette[background[0]] >> 24;
+                    break;
+                }
+
+                if (foreground_alpha == 0)
+                    continue;
+
+                if (foreground_alpha == 255) {
+                    memcpy(background, foreground, s->bpp);
+                    continue;
+                }
+
+                if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+                    // TODO: Alpha blending with PAL8 will likely need the entire image converted over to RGBA first
+                    avpriv_request_sample(avctx, "Alpha blending palette samples");
+                    background[0] = foreground[0];
+                    continue;
+                }
+
+                output_alpha = foreground_alpha + FAST_DIV255((255 - foreground_alpha) * background_alpha);
+
+                av_assert0(s->bpp <= 10);
+
+                for (b = 0; b < s->bpp - 1; ++b) {
+                    if (output_alpha == 0) {
+                        output[b] = 0;
+                    } else if (background_alpha == 255) {
+                        output[b] = FAST_DIV255(foreground_alpha * foreground[b] + (255 - foreground_alpha) * background[b]);
+                    } else {
+                        output[b] = (255 * foreground_alpha * foreground[b] + (255 - foreground_alpha) * background_alpha * background[b]) / (255 * output_alpha);
+                    }
+                }
+                output[b] = output_alpha;
+                memcpy(background, output, s->bpp);
+            }
+        }
+    }
+
+    // Copy blended buffer into the frame and free
+    memcpy(p->data[0], buffer, s->image_linesize * s->height);
+    av_free(buffer);
+
+    return 0;
+}
+
+static int decode_frame_common(AVCodecContext *avctx, PNGDecContext *s,
+                               AVFrame *p, AVPacket *avpkt)
+{
+    AVDictionary **metadatap = NULL;
+    uint32_t tag, length;
+    int decode_next_dat = 0;
+    int i, ret;
 
-    /* init the zlib */
-    s->zstream.zalloc = ff_png_zalloc;
-    s->zstream.zfree  = ff_png_zfree;
-    s->zstream.opaque = NULL;
-    ret = inflateInit(&s->zstream);
-    if (ret != Z_OK)
-        return -1;
     for (;;) {
-        if (bytestream2_get_bytes_left(&s->gb) <= 0)
+        length = bytestream2_get_bytes_left(&s->gb);
+        if (length <= 0) {
+
+            if (avctx->codec_id == AV_CODEC_ID_PNG &&
+                avctx->skip_frame == AVDISCARD_ALL) {
+                return 0;
+            }
+
+            if (CONFIG_APNG_DECODER && avctx->codec_id == AV_CODEC_ID_APNG && length == 0) {
+                if (!(s->pic_state & PNG_IDAT))
+                    return 0;
+                else
+                    goto exit_loop;
+            }
+            av_log(avctx, AV_LOG_ERROR, "%d bytes left\n", length);
+            if (   s->pic_state & PNG_ALLIMAGE
+                && avctx->strict_std_compliance <= FF_COMPLIANCE_NORMAL)
+                goto exit_loop;
+            ret = AVERROR_INVALIDDATA;
             goto fail;
+        }
+
         length = bytestream2_get_be32(&s->gb);
-        if (length > 0x7fffffff)
+        if (length > 0x7fffffff || length > bytestream2_get_bytes_left(&s->gb)) {
+            av_log(avctx, AV_LOG_ERROR, "chunk too big\n");
+            ret = AVERROR_INVALIDDATA;
             goto fail;
+        }
         tag = bytestream2_get_le32(&s->gb);
-        ff_dlog(avctx, "png: tag=%c%c%c%c length=%"PRIu32"\n",
-                (tag & 0xff),
-                ((tag >> 8) & 0xff),
-                ((tag >> 16) & 0xff),
-                ((tag >> 24) & 0xff), length);
+        if (avctx->debug & FF_DEBUG_STARTCODE)
+            av_log(avctx, AV_LOG_DEBUG, "png: tag=%s length=%u\n",
+                   av_fourcc2str(tag), length);
+
+        if (avctx->codec_id == AV_CODEC_ID_PNG &&
+            avctx->skip_frame == AVDISCARD_ALL) {
+            switch(tag) {
+            case MKTAG('I', 'H', 'D', 'R'):
+            case MKTAG('p', 'H', 'Y', 's'):
+            case MKTAG('t', 'E', 'X', 't'):
+            case MKTAG('I', 'D', 'A', 'T'):
+            case MKTAG('t', 'R', 'N', 'S'):
+                break;
+            default:
+                goto skip_tag;
+            }
+        }
+
+        metadatap = &p->metadata;
         switch (tag) {
         case MKTAG('I', 'H', 'D', 'R'):
-            if (length != 13)
+            if ((ret = decode_ihdr_chunk(avctx, s, length)) < 0)
                 goto fail;
-            s->width  = bytestream2_get_be32(&s->gb);
-            s->height = bytestream2_get_be32(&s->gb);
-            if (av_image_check_size(s->width, s->height, 0, avctx)) {
-                s->width = s->height = 0;
+            break;
+        case MKTAG('p', 'H', 'Y', 's'):
+            if ((ret = decode_phys_chunk(avctx, s)) < 0)
                 goto fail;
-            }
-            s->bit_depth        = bytestream2_get_byte(&s->gb);
-            s->color_type       = bytestream2_get_byte(&s->gb);
-            s->compression_type = bytestream2_get_byte(&s->gb);
-            s->filter_type      = bytestream2_get_byte(&s->gb);
-            s->interlace_type   = bytestream2_get_byte(&s->gb);
-            bytestream2_skip(&s->gb, 4); /* crc */
-            s->state |= PNG_IHDR;
-            ff_dlog(avctx, "width=%d height=%d depth=%d color_type=%d "
-                           "compression_type=%d filter_type=%d interlace_type=%d\n",
-                    s->width, s->height, s->bit_depth, s->color_type,
-                    s->compression_type, s->filter_type, s->interlace_type);
             break;
-        case MKTAG('I', 'D', 'A', 'T'):
-            if (!(s->state & PNG_IHDR))
+        case MKTAG('f', 'c', 'T', 'L'):
+            if (!CONFIG_APNG_DECODER || avctx->codec_id != AV_CODEC_ID_APNG)
+                goto skip_tag;
+            if ((ret = decode_fctl_chunk(avctx, s, length)) < 0)
+                goto fail;
+            decode_next_dat = 1;
+            break;
+        case MKTAG('f', 'd', 'A', 'T'):
+            if (!CONFIG_APNG_DECODER || avctx->codec_id != AV_CODEC_ID_APNG)
+                goto skip_tag;
+            if (!decode_next_dat) {
+                ret = AVERROR_INVALIDDATA;
                 goto fail;
-            if (!(s->state & PNG_IDAT)) {
-                /* init image info */
-                avctx->width  = s->width;
-                avctx->height = s->height;
-
-                s->channels       = ff_png_get_nb_channels(s->color_type);
-                s->bits_per_pixel = s->bit_depth * s->channels;
-                s->bpp            = (s->bits_per_pixel + 7) >> 3;
-                s->row_size       = (avctx->width * s->bits_per_pixel + 7) >> 3;
-
-                if (s->bit_depth == 8 &&
-                    s->color_type == PNG_COLOR_TYPE_RGB) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB24;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB32;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
-                } else if (s->bit_depth == 1 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_PALETTE) {
-                    avctx->pix_fmt = AV_PIX_FMT_PAL8;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_YA8;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_YA16BE;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
-                } else {
-                    avpriv_report_missing_feature(avctx,
-                                                  "Bit depth %d color type %d",
-                                                  s->bit_depth, s->color_type);
-                    goto fail;
-                }
-
-                if (ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF) < 0) {
-                    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                    goto fail;
-                }
-                p->pict_type        = AV_PICTURE_TYPE_I;
-                p->key_frame        = 1;
-                p->interlaced_frame = !!s->interlace_type;
-
-                /* compute the compressed row size */
-                if (!s->interlace_type) {
-                    s->crow_size = s->row_size + 1;
-                } else {
-                    s->pass          = 0;
-                    s->pass_row_size = ff_png_pass_row_size(s->pass,
-                                                            s->bits_per_pixel,
-                                                            s->width);
-                    s->crow_size = s->pass_row_size + 1;
-                }
-                ff_dlog(avctx, "row_size=%d crow_size =%d\n",
-                        s->row_size, s->crow_size);
-                s->image_buf      = p->data[0];
-                s->image_linesize = p->linesize[0];
-                /* copy the palette if needed */
-                if (s->color_type == PNG_COLOR_TYPE_PALETTE)
-                    memcpy(p->data[1], s->palette, 256 * sizeof(uint32_t));
-                /* empty row is used if differencing to the first row */
-                s->last_row = av_mallocz(s->row_size);
-                if (!s->last_row)
-                    goto fail;
-                if (s->interlace_type ||
-                    s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    s->tmp_row = av_malloc(s->row_size);
-                    if (!s->tmp_row)
-                        goto fail;
-                }
-                /* compressed row */
-                crow_buf_base = av_malloc(s->row_size + 16);
-                if (!crow_buf_base)
-                    goto fail;
-
-                /* we want crow_buf+1 to be 16-byte aligned */
-                s->crow_buf          = crow_buf_base + 15;
-                s->zstream.avail_out = s->crow_size;
-                s->zstream.next_out  = s->crow_buf;
             }
-            s->state |= PNG_IDAT;
-            if (png_decode_idat(s, length) < 0)
+            bytestream2_get_be32(&s->gb);
+            length -= 4;
+            /* fallthrough */
+        case MKTAG('I', 'D', 'A', 'T'):
+            if (CONFIG_APNG_DECODER && avctx->codec_id == AV_CODEC_ID_APNG && !decode_next_dat)
+                goto skip_tag;
+            if ((ret = decode_idat_chunk(avctx, s, length, p)) < 0)
                 goto fail;
-            bytestream2_skip(&s->gb, 4); /* crc */
             break;
         case MKTAG('P', 'L', 'T', 'E'):
-        {
-            int n, i, r, g, b;
-
-            if ((length % 3) != 0 || length > 256 * 3)
+            if (decode_plte_chunk(avctx, s, length) < 0)
                 goto skip_tag;
-            /* read the palette */
-            n = length / 3;
-            for (i = 0; i < n; i++) {
-                r = bytestream2_get_byte(&s->gb);
-                g = bytestream2_get_byte(&s->gb);
-                b = bytestream2_get_byte(&s->gb);
-                s->palette[i] = (0xff << 24) | (r << 16) | (g << 8) | b;
-            }
-            for (; i < 256; i++)
-                s->palette[i] = (0xff << 24);
-            s->state |= PNG_PLTE;
-            bytestream2_skip(&s->gb, 4);     /* crc */
-        }
-        break;
+            break;
         case MKTAG('t', 'R', 'N', 'S'):
-        {
-            int v, i;
-
-            /* read the transparency. XXX: Only palette mode supported */
-            if (s->color_type != PNG_COLOR_TYPE_PALETTE ||
-                length > 256 ||
-                !(s->state & PNG_PLTE))
+            if (decode_trns_chunk(avctx, s, length) < 0)
                 goto skip_tag;
-            for (i = 0; i < length; i++) {
-                v = bytestream2_get_byte(&s->gb);
-                s->palette[i] = (s->palette[i] & 0x00ffffff) | (v << 24);
-            }
-            bytestream2_skip(&s->gb, 4);     /* crc */
-        }
-        break;
+            break;
+        case MKTAG('t', 'E', 'X', 't'):
+            if (decode_text_chunk(s, length, 0, metadatap) < 0)
+                av_log(avctx, AV_LOG_WARNING, "Broken tEXt chunk\n");
+            bytestream2_skip(&s->gb, length + 4);
+            break;
+        case MKTAG('z', 'T', 'X', 't'):
+            if (decode_text_chunk(s, length, 1, metadatap) < 0)
+                av_log(avctx, AV_LOG_WARNING, "Broken zTXt chunk\n");
+            bytestream2_skip(&s->gb, length + 4);
+            break;
         case MKTAG('s', 'T', 'E', 'R'): {
             int mode = bytestream2_get_byte(&s->gb);
             AVStereo3D *stereo3d = av_stereo3d_create_side_data(p);
             if (!stereo3d)
-                goto the_end;
+                goto fail;
 
             if (mode == 0 || mode == 1) {
                 stereo3d->type  = AV_STEREO3D_SIDEBYSIDE;
@@ -632,9 +1289,54 @@ static int decode_frame(AVCodecContext *avctx,
             bytestream2_skip(&s->gb, 4); /* crc */
             break;
         }
+        case MKTAG('i', 'C', 'C', 'P'): {
+            if (decode_iccp_chunk(s, length, p) < 0)
+                goto fail;
+            break;
+        }
+        case MKTAG('c', 'H', 'R', 'M'): {
+            AVMasteringDisplayMetadata *mdm = av_mastering_display_metadata_create_side_data(p);
+            if (!mdm) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            mdm->white_point[0] = av_make_q(bytestream2_get_be32(&s->gb), 100000);
+            mdm->white_point[1] = av_make_q(bytestream2_get_be32(&s->gb), 100000);
+
+            /* RGB Primaries */
+            for (i = 0; i < 3; i++) {
+                mdm->display_primaries[i][0] = av_make_q(bytestream2_get_be32(&s->gb), 100000);
+                mdm->display_primaries[i][1] = av_make_q(bytestream2_get_be32(&s->gb), 100000);
+            }
+
+            mdm->has_primaries = 1;
+            bytestream2_skip(&s->gb, 4); /* crc */
+            break;
+        }
+        case MKTAG('g', 'A', 'M', 'A'): {
+            AVBPrint bp;
+            char *gamma_str;
+            int num = bytestream2_get_be32(&s->gb);
+
+            av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
+            av_bprintf(&bp, "%i/%i", num, 100000);
+            ret = av_bprint_finalize(&bp, &gamma_str);
+            if (ret < 0)
+                return ret;
+
+            av_dict_set(&p->metadata, "gamma", gamma_str, AV_DICT_DONT_STRDUP_VAL);
+
+            bytestream2_skip(&s->gb, 4); /* crc */
+            break;
+        }
         case MKTAG('I', 'E', 'N', 'D'):
-            if (!(s->state & PNG_ALLIMAGE))
+            if (!(s->pic_state & PNG_ALLIMAGE))
+                av_log(avctx, AV_LOG_ERROR, "IEND without all image\n");
+            if (!(s->pic_state & (PNG_ALLIMAGE|PNG_IDAT))) {
+                ret = AVERROR_INVALIDDATA;
                 goto fail;
+            }
             bytestream2_skip(&s->gb, 4); /* crc */
             goto exit_loop;
         default:
@@ -645,40 +1347,233 @@ skip_tag:
         }
     }
 exit_loop:
-    /* handle P-frames only if a predecessor frame is available */
-    if (s->prev->data[0]) {
-        if (!(avpkt->flags & AV_PKT_FLAG_KEY)) {
-            int i, j;
-            uint8_t *pd      = p->data[0];
-            uint8_t *pd_last = s->prev->data[0];
-
-            for (j = 0; j < s->height; j++) {
-                for (i = 0; i < s->width * s->bpp; i++)
-                    pd[i] += pd_last[i];
-                pd      += s->image_linesize;
-                pd_last += s->image_linesize;
+
+    if (avctx->codec_id == AV_CODEC_ID_PNG &&
+        avctx->skip_frame == AVDISCARD_ALL) {
+        return 0;
+    }
+
+    if (s->bits_per_pixel <= 4)
+        handle_small_bpp(s, p);
+
+    /* apply transparency if needed */
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE) {
+        size_t byte_depth = s->bit_depth > 8 ? 2 : 1;
+        size_t raw_bpp = s->bpp - byte_depth;
+        unsigned x, y;
+
+        av_assert0(s->bit_depth > 1);
+
+        for (y = 0; y < s->height; ++y) {
+            uint8_t *row = &s->image_buf[s->image_linesize * y];
+
+            /* since we're updating in-place, we have to go from right to left */
+            for (x = s->width; x > 0; --x) {
+                uint8_t *pixel = &row[s->bpp * (x - 1)];
+                memmove(pixel, &row[raw_bpp * (x - 1)], raw_bpp);
+
+                if (!memcmp(pixel, s->transparent_color_be, raw_bpp)) {
+                    memset(&pixel[raw_bpp], 0, byte_depth);
+                } else {
+                    memset(&pixel[raw_bpp], 0xff, byte_depth);
+                }
             }
         }
     }
 
-    av_frame_unref(s->prev);
-    if ((ret = av_frame_ref(s->prev, p)) < 0)
-        goto fail;
+    /* handle P-frames only if a predecessor frame is available */
+    if (s->last_picture.f->data[0]) {
+        if (   !(avpkt->flags & AV_PKT_FLAG_KEY) && avctx->codec_tag != AV_RL32("MPNG")
+            && s->last_picture.f->width == p->width
+            && s->last_picture.f->height== p->height
+            && s->last_picture.f->format== p->format
+         ) {
+            if (CONFIG_PNG_DECODER && avctx->codec_id != AV_CODEC_ID_APNG)
+                handle_p_frame_png(s, p);
+            else if (CONFIG_APNG_DECODER &&
+                     avctx->codec_id == AV_CODEC_ID_APNG &&
+                     (ret = handle_p_frame_apng(avctx, s, p)) < 0)
+                goto fail;
+        }
+    }
+    ff_thread_report_progress(&s->picture, INT_MAX, 0);
+    ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+
+    return 0;
+
+fail:
+    ff_thread_report_progress(&s->picture, INT_MAX, 0);
+    ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+    return ret;
+}
+
+#if CONFIG_PNG_DECODER
+static int decode_frame_png(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    PNGDecContext *const s = avctx->priv_data;
+    const uint8_t *buf     = avpkt->data;
+    int buf_size           = avpkt->size;
+    AVFrame *p;
+    int64_t sig;
+    int ret;
+
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    FFSWAP(ThreadFrame, s->picture, s->last_picture);
+    p = s->picture.f;
+
+    bytestream2_init(&s->gb, buf, buf_size);
+
+    /* check signature */
+    sig = bytestream2_get_be64(&s->gb);
+    if (sig != PNGSIG &&
+        sig != MNGSIG) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid PNG signature 0x%08"PRIX64".\n", sig);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->y = s->has_trns = 0;
+    s->hdr_state = 0;
+    s->pic_state = 0;
+
+    /* init the zlib */
+    s->zstream.zalloc = ff_png_zalloc;
+    s->zstream.zfree  = ff_png_zfree;
+    s->zstream.opaque = NULL;
+    ret = inflateInit(&s->zstream);
+    if (ret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "inflateInit returned error %d\n", ret);
+        return AVERROR_EXTERNAL;
+    }
+
+    if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+        goto the_end;
+
+    if (avctx->skip_frame == AVDISCARD_ALL) {
+        *got_frame = 0;
+        ret = bytestream2_tell(&s->gb);
+        goto the_end;
+    }
+
+    if ((ret = av_frame_ref(data, s->picture.f)) < 0)
+        goto the_end;
 
     *got_frame = 1;
 
     ret = bytestream2_tell(&s->gb);
 the_end:
     inflateEnd(&s->zstream);
-    av_free(crow_buf_base);
     s->crow_buf = NULL;
-    av_freep(&s->last_row);
-    av_freep(&s->tmp_row);
     return ret;
-fail:
-    ret = -1;
-    goto the_end;
 }
+#endif
+
+#if CONFIG_APNG_DECODER
+static int decode_frame_apng(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    PNGDecContext *const s = avctx->priv_data;
+    int ret;
+    AVFrame *p;
+
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    FFSWAP(ThreadFrame, s->picture, s->last_picture);
+    p = s->picture.f;
+
+    if (!(s->hdr_state & PNG_IHDR)) {
+        if (!avctx->extradata_size)
+            return AVERROR_INVALIDDATA;
+
+        /* only init fields, there is no zlib use in extradata */
+        s->zstream.zalloc = ff_png_zalloc;
+        s->zstream.zfree  = ff_png_zfree;
+
+        bytestream2_init(&s->gb, avctx->extradata, avctx->extradata_size);
+        if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+            goto end;
+    }
+
+    /* reset state for a new frame */
+    if ((ret = inflateInit(&s->zstream)) != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "inflateInit returned error %d\n", ret);
+        ret = AVERROR_EXTERNAL;
+        goto end;
+    }
+    s->y = 0;
+    s->pic_state = 0;
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
+    if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+        goto end;
+
+    if (!(s->pic_state & PNG_ALLIMAGE))
+        av_log(avctx, AV_LOG_WARNING, "Frame did not contain a complete image\n");
+    if (!(s->pic_state & (PNG_ALLIMAGE|PNG_IDAT))) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    if ((ret = av_frame_ref(data, s->picture.f)) < 0)
+        goto end;
+
+    *got_frame = 1;
+    ret = bytestream2_tell(&s->gb);
+
+end:
+    inflateEnd(&s->zstream);
+    return ret;
+}
+#endif
+
+#if HAVE_THREADS
+static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    PNGDecContext *psrc = src->priv_data;
+    PNGDecContext *pdst = dst->priv_data;
+    int ret;
+
+    if (dst == src)
+        return 0;
+
+    ff_thread_release_buffer(dst, &pdst->picture);
+    if (psrc->picture.f->data[0] &&
+        (ret = ff_thread_ref_frame(&pdst->picture, &psrc->picture)) < 0)
+        return ret;
+    if (CONFIG_APNG_DECODER && dst->codec_id == AV_CODEC_ID_APNG) {
+        pdst->width             = psrc->width;
+        pdst->height            = psrc->height;
+        pdst->bit_depth         = psrc->bit_depth;
+        pdst->color_type        = psrc->color_type;
+        pdst->compression_type  = psrc->compression_type;
+        pdst->interlace_type    = psrc->interlace_type;
+        pdst->filter_type       = psrc->filter_type;
+        pdst->cur_w = psrc->cur_w;
+        pdst->cur_h = psrc->cur_h;
+        pdst->x_offset = psrc->x_offset;
+        pdst->y_offset = psrc->y_offset;
+        pdst->has_trns = psrc->has_trns;
+        memcpy(pdst->transparent_color_be, psrc->transparent_color_be, sizeof(pdst->transparent_color_be));
+
+        pdst->dispose_op = psrc->dispose_op;
+
+        memcpy(pdst->palette, psrc->palette, sizeof(pdst->palette));
+
+        pdst->hdr_state |= psrc->hdr_state;
+
+        ff_thread_release_buffer(dst, &pdst->last_picture);
+        if (psrc->last_picture.f->data[0] &&
+            (ret = ff_thread_ref_frame(&pdst->last_picture, &psrc->last_picture)) < 0)
+            return ret;
+
+        ff_thread_release_buffer(dst, &pdst->previous_picture);
+        if (psrc->previous_picture.f->data[0] &&
+            (ret = ff_thread_ref_frame(&pdst->previous_picture, &psrc->previous_picture)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+#endif
 
 static av_cold int png_dec_init(AVCodecContext *avctx)
 {
@@ -686,11 +1581,21 @@ static av_cold int png_dec_init(AVCodecContext *avctx)
 
     avctx->color_range = AVCOL_RANGE_JPEG;
 
-    s->prev = av_frame_alloc();
-    if (!s->prev)
+    s->avctx = avctx;
+    s->previous_picture.f = av_frame_alloc();
+    s->last_picture.f = av_frame_alloc();
+    s->picture.f = av_frame_alloc();
+    if (!s->previous_picture.f || !s->last_picture.f || !s->picture.f) {
+        av_frame_free(&s->previous_picture.f);
+        av_frame_free(&s->last_picture.f);
+        av_frame_free(&s->picture.f);
         return AVERROR(ENOMEM);
+    }
 
-    ff_pngdsp_init(&s->dsp);
+    if (!avctx->internal->is_copy) {
+        avctx->internal->allocate_progress = 1;
+        ff_pngdsp_init(&s->dsp);
+    }
 
     return 0;
 }
@@ -699,11 +1604,40 @@ static av_cold int png_dec_end(AVCodecContext *avctx)
 {
     PNGDecContext *s = avctx->priv_data;
 
-    av_frame_free(&s->prev);
+    ff_thread_release_buffer(avctx, &s->previous_picture);
+    av_frame_free(&s->previous_picture.f);
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    av_frame_free(&s->last_picture.f);
+    ff_thread_release_buffer(avctx, &s->picture);
+    av_frame_free(&s->picture.f);
+    av_freep(&s->buffer);
+    s->buffer_size = 0;
+    av_freep(&s->last_row);
+    s->last_row_size = 0;
+    av_freep(&s->tmp_row);
+    s->tmp_row_size = 0;
 
     return 0;
 }
 
+#if CONFIG_APNG_DECODER
+AVCodec ff_apng_decoder = {
+    .name           = "apng",
+    .long_name      = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_APNG,
+    .priv_data_size = sizeof(PNGDecContext),
+    .init           = png_dec_init,
+    .close          = png_dec_end,
+    .decode         = decode_frame_apng,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(png_dec_init),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
+#endif
+
+#if CONFIG_PNG_DECODER
 AVCodec ff_png_decoder = {
     .name           = "png",
     .long_name      = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
@@ -712,7 +1646,10 @@ AVCodec ff_png_decoder = {
     .priv_data_size = sizeof(PNGDecContext),
     .init           = png_dec_init,
     .close          = png_dec_end,
-    .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
-    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .decode         = decode_frame_png,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(png_dec_init),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM | FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/pngdsp.c b/libavcodec/pngdsp.c
index c0e9402..d275316 100644
--- a/libavcodec/pngdsp.c
+++ b/libavcodec/pngdsp.c
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index 607fe64..5475d0d 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,9 @@
 #include <stdint.h>
 
 typedef struct PNGDSPContext {
-    void (*add_bytes_l2)(uint8_t *dst  /* align 16 */,
+    void (*add_bytes_l2)(uint8_t *dst,
                          uint8_t *src1 /* align 16 */,
-                         uint8_t *src2 /* align 16 */, int w);
+                         uint8_t *src2, int w);
 
     /* this might write to dst[w] */
     void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
index f91c54c..69b4495 100644
--- a/libavcodec/pngenc.c
+++ b/libavcodec/pngenc.c
@@ -2,42 +2,52 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/opt.h"
-#include "libavutil/stereo3d.h"
-
 #include "avcodec.h"
+#include "internal.h"
 #include "bytestream.h"
-#include "huffyuvencdsp.h"
+#include "lossless_videoencdsp.h"
 #include "png.h"
+#include "apng.h"
 
-/* TODO:
- * - add 2, 4 and 16 bit depth support
- */
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/libm.h"
+#include "libavutil/opt.h"
+#include "libavutil/color_utils.h"
+#include "libavutil/stereo3d.h"
 
 #include <zlib.h>
 
 #define IOBUF_SIZE 4096
 
+typedef struct APNGFctlChunk {
+    uint32_t sequence_number;
+    uint32_t width, height;
+    uint32_t x_offset, y_offset;
+    uint16_t delay_num, delay_den;
+    uint8_t dispose_op, blend_op;
+} APNGFctlChunk;
+
 typedef struct PNGEncContext {
     AVClass *class;
-    HuffYUVEncDSPContext hdsp;
+    LLVidEncDSPContext llvidencdsp;
 
     uint8_t *bytestream;
     uint8_t *bytestream_start;
@@ -47,6 +57,26 @@ typedef struct PNGEncContext {
 
     z_stream zstream;
     uint8_t buf[IOBUF_SIZE];
+    int dpi;                     ///< Physical pixel density, in dots per inch, if set
+    int dpm;                     ///< Physical pixel density, in dots per meter, if set
+
+    int is_progressive;
+    int bit_depth;
+    int color_type;
+    int bits_per_pixel;
+
+    // APNG
+    uint32_t palette_checksum;   // Used to ensure a single unique palette
+    uint32_t sequence_number;
+    int extra_data_updated;
+    uint8_t *extra_data;
+    int extra_data_size;
+
+    AVFrame *prev_frame;
+    AVFrame *last_frame;
+    APNGFctlChunk last_frame_fctl;
+    uint8_t *last_frame_packet;
+    size_t last_frame_packet_size;
 } PNGEncContext;
 
 static void png_get_interlaced_row(uint8_t *dst, int row_size,
@@ -56,8 +86,9 @@ static void png_get_interlaced_row(uint8_t *dst, int row_size,
     int x, mask, dst_x, j, b, bpp;
     uint8_t *d;
     const uint8_t *s;
+    static const int masks[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 
-    mask = ff_png_pass_mask[pass];
+    mask = masks[pass];
     switch (bits_per_pixel) {
     case 1:
         memset(dst, 0, row_size);
@@ -115,6 +146,22 @@ static void sub_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
     }
 }
 
+static void sub_left_prediction(PNGEncContext *c, uint8_t *dst, const uint8_t *src, int bpp, int size)
+{
+    const uint8_t *src1 = src + bpp;
+    const uint8_t *src2 = src;
+    int x, unaligned_w;
+
+    memcpy(dst, src, bpp);
+    dst += bpp;
+    size -= bpp;
+    unaligned_w = FFMIN(32 - bpp, size);
+    for (x = 0; x < unaligned_w; x++)
+        *dst++ = *src1++ - *src2++;
+    size -= unaligned_w;
+    c->llvidencdsp.diff_bytes(dst, src1, src2, size);
+}
+
 static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
                            uint8_t *src, uint8_t *top, int size, int bpp)
 {
@@ -125,11 +172,10 @@ static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
         memcpy(dst, src, size);
         break;
     case PNG_FILTER_VALUE_SUB:
-        c->hdsp.diff_bytes(dst, src, src - bpp, size);
-        memcpy(dst, src, bpp);
+        sub_left_prediction(c, dst, src, bpp, size);
         break;
     case PNG_FILTER_VALUE_UP:
-        c->hdsp.diff_bytes(dst, src, top, size);
+        c->llvidencdsp.diff_bytes(dst, src, top, size);
         break;
     case PNG_FILTER_VALUE_AVG:
         for (i = 0; i < bpp; i++)
@@ -149,7 +195,7 @@ static uint8_t *png_choose_filter(PNGEncContext *s, uint8_t *dst,
                                   uint8_t *src, uint8_t *top, int size, int bpp)
 {
     int pred = s->filter_type;
-    assert(bpp || !pred);
+    av_assert0(bpp || !pred);
     if (!top && pred)
         pred = PNG_FILTER_VALUE_SUB;
     if (pred == PNG_FILTER_VALUE_MIXED) {
@@ -175,45 +221,56 @@ static uint8_t *png_choose_filter(PNGEncContext *s, uint8_t *dst,
     }
 }
 
-static void convert_from_rgb32(uint8_t *dst, const uint8_t *src, int width)
-{
-    uint8_t *d;
-    int j;
-    unsigned int v;
-
-    d = dst;
-    for (j = 0; j < width; j++) {
-        v    = ((const uint32_t *) src)[j];
-        d[0] = v >> 16;
-        d[1] = v >> 8;
-        d[2] = v;
-        d[3] = v >> 24;
-        d   += 4;
-    }
-}
-
 static void png_write_chunk(uint8_t **f, uint32_t tag,
                             const uint8_t *buf, int length)
 {
-    uint32_t crc;
+    const AVCRC *crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    uint32_t crc = ~0U;
     uint8_t tagbuf[4];
 
     bytestream_put_be32(f, length);
-    crc = crc32(0, Z_NULL, 0);
     AV_WL32(tagbuf, tag);
-    crc = crc32(crc, tagbuf, 4);
+    crc = av_crc(crc_table, crc, tagbuf, 4);
     bytestream_put_be32(f, av_bswap32(tag));
     if (length > 0) {
-        crc = crc32(crc, buf, length);
+        crc = av_crc(crc_table, crc, buf, length);
         memcpy(*f, buf, length);
         *f += length;
     }
-    bytestream_put_be32(f, crc);
+    bytestream_put_be32(f, ~crc);
+}
+
+static void png_write_image_data(AVCodecContext *avctx,
+                                 const uint8_t *buf, int length)
+{
+    PNGEncContext *s = avctx->priv_data;
+    const AVCRC *crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    uint32_t crc = ~0U;
+
+    if (avctx->codec_id == AV_CODEC_ID_PNG || avctx->frame_number == 0) {
+        png_write_chunk(&s->bytestream, MKTAG('I', 'D', 'A', 'T'), buf, length);
+        return;
+    }
+
+    bytestream_put_be32(&s->bytestream, length + 4);
+
+    bytestream_put_be32(&s->bytestream, MKBETAG('f', 'd', 'A', 'T'));
+    bytestream_put_be32(&s->bytestream, s->sequence_number);
+    crc = av_crc(crc_table, crc, s->bytestream - 8, 8);
+
+    crc = av_crc(crc_table, crc, buf, length);
+    memcpy(s->bytestream, buf, length);
+    s->bytestream += length;
+
+    bytestream_put_be32(&s->bytestream, ~crc);
+
+    ++s->sequence_number;
 }
 
 /* XXX: do filtering */
-static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
+static int png_write_row(AVCodecContext *avctx, const uint8_t *data, int size)
 {
+    PNGEncContext *s = avctx->priv_data;
     int ret;
 
     s->zstream.avail_in = size;
@@ -224,8 +281,7 @@ static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
             return -1;
         if (s->zstream.avail_out == 0) {
             if (s->bytestream_end - s->bytestream > IOBUF_SIZE + 100)
-                png_write_chunk(&s->bytestream,
-                                MKTAG('I', 'D', 'A', 'T'), s->buf, IOBUF_SIZE);
+                png_write_image_data(avctx, s->buf, IOBUF_SIZE);
             s->zstream.avail_out = IOBUF_SIZE;
             s->zstream.next_out  = s->buf;
         }
@@ -233,137 +289,130 @@ static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
     return 0;
 }
 
-static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+#define AV_WB32_PNG(buf, n) AV_WB32(buf, lrint((n) * 100000))
+static int png_get_chrm(enum AVColorPrimaries prim,  uint8_t *buf)
 {
-    PNGEncContext *s       = avctx->priv_data;
-    AVFrameSideData *side_data;
-    const AVFrame *const p = pict;
-    int bit_depth, color_type, y, len, row_size, ret, is_progressive;
-    int bits_per_pixel, pass_row_size, enc_row_size, max_packet_size;
-    int compression_level;
-    uint8_t *ptr, *top, *crow_buf, *crow;
-    uint8_t *crow_base       = NULL;
-    uint8_t *progressive_buf = NULL;
-    uint8_t *rgba_buf        = NULL;
-    uint8_t *top_buf         = NULL;
-
-    is_progressive = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
-    switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGBA64BE:
-        bit_depth = 16;
-        color_type = PNG_COLOR_TYPE_RGB_ALPHA;
-        break;
-    case AV_PIX_FMT_RGB48BE:
-        bit_depth = 16;
-        color_type = PNG_COLOR_TYPE_RGB;
-        break;
-    case AV_PIX_FMT_RGB32:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_RGB_ALPHA;
-        break;
-    case AV_PIX_FMT_RGB24:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_RGB;
-        break;
-    case AV_PIX_FMT_GRAY16BE:
-        bit_depth  = 16;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_GRAY8:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_MONOBLACK:
-        bit_depth  = 1;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_PAL8:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_PALETTE;
-        break;
-    default:
-        return -1;
+    double rx, ry, gx, gy, bx, by, wx = 0.3127, wy = 0.3290;
+    switch (prim) {
+        case AVCOL_PRI_BT709:
+            rx = 0.640; ry = 0.330;
+            gx = 0.300; gy = 0.600;
+            bx = 0.150; by = 0.060;
+            break;
+        case AVCOL_PRI_BT470M:
+            rx = 0.670; ry = 0.330;
+            gx = 0.210; gy = 0.710;
+            bx = 0.140; by = 0.080;
+            wx = 0.310; wy = 0.316;
+            break;
+        case AVCOL_PRI_BT470BG:
+            rx = 0.640; ry = 0.330;
+            gx = 0.290; gy = 0.600;
+            bx = 0.150; by = 0.060;
+            break;
+        case AVCOL_PRI_SMPTE170M:
+        case AVCOL_PRI_SMPTE240M:
+            rx = 0.630; ry = 0.340;
+            gx = 0.310; gy = 0.595;
+            bx = 0.155; by = 0.070;
+            break;
+        case AVCOL_PRI_BT2020:
+            rx = 0.708; ry = 0.292;
+            gx = 0.170; gy = 0.797;
+            bx = 0.131; by = 0.046;
+            break;
+        default:
+            return 0;
     }
-    bits_per_pixel = ff_png_get_nb_channels(color_type) * bit_depth;
-    row_size       = (avctx->width * bits_per_pixel + 7) >> 3;
 
-    s->zstream.zalloc = ff_png_zalloc;
-    s->zstream.zfree  = ff_png_zfree;
-    s->zstream.opaque = NULL;
-    compression_level = avctx->compression_level == FF_COMPRESSION_DEFAULT
-                      ? Z_DEFAULT_COMPRESSION
-                      : av_clip(avctx->compression_level, 0, 9);
-    ret = deflateInit2(&s->zstream, compression_level,
-                       Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY);
-    if (ret != Z_OK)
-        return -1;
+    AV_WB32_PNG(buf     , wx); AV_WB32_PNG(buf + 4 , wy);
+    AV_WB32_PNG(buf + 8 , rx); AV_WB32_PNG(buf + 12, ry);
+    AV_WB32_PNG(buf + 16, gx); AV_WB32_PNG(buf + 20, gy);
+    AV_WB32_PNG(buf + 24, bx); AV_WB32_PNG(buf + 28, by);
+    return 1;
+}
 
-    enc_row_size    = deflateBound(&s->zstream, row_size);
-    max_packet_size = avctx->height * (enc_row_size +
-                                       ((enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) * 12)
-                      + AV_INPUT_BUFFER_MIN_SIZE;
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, max_packet_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate output packet of size %d.\n",
-               max_packet_size);
-        return ret;
-    }
+static int png_get_gama(enum AVColorTransferCharacteristic trc, uint8_t *buf)
+{
+    double gamma = avpriv_get_gamma_from_trc(trc);
+    if (gamma <= 1e-6)
+        return 0;
 
-    s->bytestream_start =
-    s->bytestream       = pkt->data;
-    s->bytestream_end   = pkt->data + pkt->size;
+    AV_WB32_PNG(buf, 1.0 / gamma);
+    return 1;
+}
 
-    crow_base = av_malloc((row_size + 32) << (s->filter_type == PNG_FILTER_VALUE_MIXED));
-    if (!crow_base)
-        goto fail;
-    // pixel data should be aligned, but there's a control byte before it
-    crow_buf = crow_base + 15;
-    if (is_progressive) {
-        progressive_buf = av_malloc(row_size + 1);
-        if (!progressive_buf)
-            goto fail;
-    }
-    if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-        rgba_buf = av_malloc(row_size + 1);
-        if (!rgba_buf)
-            goto fail;
-    }
-    if (is_progressive || color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-        top_buf = av_malloc(row_size + 1);
-        if (!top_buf)
-            goto fail;
-    }
+static int encode_headers(AVCodecContext *avctx, const AVFrame *pict)
+{
+    AVFrameSideData *side_data;
+    PNGEncContext *s = avctx->priv_data;
 
     /* write png header */
-    memcpy(s->bytestream, ff_pngsig, 8);
-    s->bytestream += 8;
-
     AV_WB32(s->buf, avctx->width);
     AV_WB32(s->buf + 4, avctx->height);
-    s->buf[8]  = bit_depth;
-    s->buf[9]  = color_type;
+    s->buf[8]  = s->bit_depth;
+    s->buf[9]  = s->color_type;
     s->buf[10] = 0; /* compression type */
     s->buf[11] = 0; /* filter type */
-    s->buf[12] = is_progressive; /* interlace type */
-
+    s->buf[12] = s->is_progressive; /* interlace type */
     png_write_chunk(&s->bytestream, MKTAG('I', 'H', 'D', 'R'), s->buf, 13);
 
+    /* write physical information */
+    if (s->dpm) {
+      AV_WB32(s->buf, s->dpm);
+      AV_WB32(s->buf + 4, s->dpm);
+      s->buf[8] = 1; /* unit specifier is meter */
+    } else {
+      AV_WB32(s->buf, avctx->sample_aspect_ratio.num);
+      AV_WB32(s->buf + 4, avctx->sample_aspect_ratio.den);
+      s->buf[8] = 0; /* unit specifier is unknown */
+    }
+    png_write_chunk(&s->bytestream, MKTAG('p', 'H', 'Y', 's'), s->buf, 9);
+
+    /* write stereoscopic information */
+    side_data = av_frame_get_side_data(pict, AV_FRAME_DATA_STEREO3D);
+    if (side_data) {
+        AVStereo3D *stereo3d = (AVStereo3D *)side_data->data;
+        switch (stereo3d->type) {
+            case AV_STEREO3D_SIDEBYSIDE:
+                s->buf[0] = ((stereo3d->flags & AV_STEREO3D_FLAG_INVERT) == 0) ? 1 : 0;
+                png_write_chunk(&s->bytestream, MKTAG('s', 'T', 'E', 'R'), s->buf, 1);
+                break;
+            case AV_STEREO3D_2D:
+                break;
+            default:
+                av_log(avctx, AV_LOG_WARNING, "Only side-by-side stereo3d flag can be defined within sTER chunk\n");
+                break;
+        }
+    }
+
+    /* write colorspace information */
+    if (pict->color_primaries == AVCOL_PRI_BT709 &&
+        pict->color_trc == AVCOL_TRC_IEC61966_2_1) {
+        s->buf[0] = 1; /* rendering intent, relative colorimetric by default */
+        png_write_chunk(&s->bytestream, MKTAG('s', 'R', 'G', 'B'), s->buf, 1);
+    }
+
+    if (png_get_chrm(pict->color_primaries, s->buf))
+        png_write_chunk(&s->bytestream, MKTAG('c', 'H', 'R', 'M'), s->buf, 32);
+    if (png_get_gama(pict->color_trc, s->buf))
+        png_write_chunk(&s->bytestream, MKTAG('g', 'A', 'M', 'A'), s->buf, 4);
+
     /* put the palette if needed */
-    if (color_type == PNG_COLOR_TYPE_PALETTE) {
+    if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
         int has_alpha, alpha, i;
         unsigned int v;
         uint32_t *palette;
-        uint8_t *alpha_ptr;
+        uint8_t *ptr, *alpha_ptr;
 
-        palette   = (uint32_t *)p->data[1];
+        palette   = (uint32_t *)pict->data[1];
         ptr       = s->buf;
         alpha_ptr = s->buf + 256 * 3;
         has_alpha = 0;
         for (i = 0; i < 256; i++) {
             v     = palette[i];
             alpha = v >> 24;
-            if (alpha && alpha != 0xff)
+            if (alpha != 0xff)
                 has_alpha = 1;
             *alpha_ptr++ = alpha;
             bytestream_put_be24(&ptr, v);
@@ -376,67 +425,71 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
-    /* write stereoscopic information */
-    side_data = av_frame_get_side_data(pict, AV_FRAME_DATA_STEREO3D);
-    if (side_data) {
-        AVStereo3D *stereo3d = (AVStereo3D *)side_data->data;
-        uint8_t sm;
-        switch (stereo3d->type) {
-        case AV_STEREO3D_SIDEBYSIDE:
-            sm = !(stereo3d->flags & AV_STEREO3D_FLAG_INVERT);
-            png_write_chunk(&s->bytestream, MKTAG('s', 'T', 'E', 'R'), &sm, 1);
-            break;
-        case AV_STEREO3D_2D:
-            break;
-        default:
-            av_log(avctx, AV_LOG_WARNING,
-                   "Only side-by-side stereo3d flag can be defined within sTER chunk\n");
-            break;
+    return 0;
+}
+
+static int encode_frame(AVCodecContext *avctx, const AVFrame *pict)
+{
+    PNGEncContext *s       = avctx->priv_data;
+    const AVFrame *const p = pict;
+    int y, len, ret;
+    int row_size, pass_row_size;
+    uint8_t *ptr, *top, *crow_buf, *crow;
+    uint8_t *crow_base       = NULL;
+    uint8_t *progressive_buf = NULL;
+    uint8_t *top_buf         = NULL;
+
+    row_size = (pict->width * s->bits_per_pixel + 7) >> 3;
+
+    crow_base = av_malloc((row_size + 32) << (s->filter_type == PNG_FILTER_VALUE_MIXED));
+    if (!crow_base) {
+        ret = AVERROR(ENOMEM);
+        goto the_end;
+    }
+    // pixel data should be aligned, but there's a control byte before it
+    crow_buf = crow_base + 15;
+    if (s->is_progressive) {
+        progressive_buf = av_malloc(row_size + 1);
+        top_buf = av_malloc(row_size + 1);
+        if (!progressive_buf || !top_buf) {
+            ret = AVERROR(ENOMEM);
+            goto the_end;
         }
     }
 
-    /* now put each row */
+    /* put each row */
     s->zstream.avail_out = IOBUF_SIZE;
     s->zstream.next_out  = s->buf;
-    if (is_progressive) {
+    if (s->is_progressive) {
         int pass;
 
         for (pass = 0; pass < NB_PASSES; pass++) {
             /* NOTE: a pass is completely omitted if no pixels would be
              * output */
-            pass_row_size = ff_png_pass_row_size(pass, bits_per_pixel, avctx->width);
+            pass_row_size = ff_png_pass_row_size(pass, s->bits_per_pixel, pict->width);
             if (pass_row_size > 0) {
                 top = NULL;
-                for (y = 0; y < avctx->height; y++)
+                for (y = 0; y < pict->height; y++)
                     if ((ff_png_pass_ymask[pass] << (y & 7)) & 0x80) {
                         ptr = p->data[0] + y * p->linesize[0];
                         FFSWAP(uint8_t *, progressive_buf, top_buf);
-                        if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                            convert_from_rgb32(rgba_buf, ptr, avctx->width);
-                            ptr = rgba_buf;
-                        }
                         png_get_interlaced_row(progressive_buf, pass_row_size,
-                                               bits_per_pixel, pass,
-                                               ptr, avctx->width);
+                                               s->bits_per_pixel, pass,
+                                               ptr, pict->width);
                         crow = png_choose_filter(s, crow_buf, progressive_buf,
-                                                 top, pass_row_size, bits_per_pixel >> 3);
-                        png_write_row(s, crow, pass_row_size + 1);
+                                                 top, pass_row_size, s->bits_per_pixel >> 3);
+                        png_write_row(avctx, crow, pass_row_size + 1);
                         top = progressive_buf;
                     }
             }
         }
     } else {
         top = NULL;
-        for (y = 0; y < avctx->height; y++) {
+        for (y = 0; y < pict->height; y++) {
             ptr = p->data[0] + y * p->linesize[0];
-            if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                FFSWAP(uint8_t *, rgba_buf, top_buf);
-                convert_from_rgb32(rgba_buf, ptr, avctx->width);
-                ptr = rgba_buf;
-            }
             crow = png_choose_filter(s, crow_buf, ptr, top,
-                                     row_size, bits_per_pixel >> 3);
-            png_write_row(s, crow, row_size + 1);
+                                     row_size, s->bits_per_pixel >> 3);
+            png_write_row(avctx, crow, row_size + 1);
             top = ptr;
         }
     }
@@ -446,38 +499,514 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         if (ret == Z_OK || ret == Z_STREAM_END) {
             len = IOBUF_SIZE - s->zstream.avail_out;
             if (len > 0 && s->bytestream_end - s->bytestream > len + 100) {
-                png_write_chunk(&s->bytestream, MKTAG('I', 'D', 'A', 'T'), s->buf, len);
+                png_write_image_data(avctx, s->buf, len);
             }
             s->zstream.avail_out = IOBUF_SIZE;
             s->zstream.next_out  = s->buf;
             if (ret == Z_STREAM_END)
                 break;
         } else {
-            goto fail;
+            ret = -1;
+            goto the_end;
         }
     }
+
+    ret = 0;
+
+the_end:
+    av_freep(&crow_base);
+    av_freep(&progressive_buf);
+    av_freep(&top_buf);
+    deflateReset(&s->zstream);
+    return ret;
+}
+
+static int encode_png(AVCodecContext *avctx, AVPacket *pkt,
+                      const AVFrame *pict, int *got_packet)
+{
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    int enc_row_size;
+    size_t max_packet_size;
+
+    enc_row_size    = deflateBound(&s->zstream, (avctx->width * s->bits_per_pixel + 7) >> 3);
+    max_packet_size =
+        AV_INPUT_BUFFER_MIN_SIZE + // headers
+        avctx->height * (
+            enc_row_size +
+            12 * (((int64_t)enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) // IDAT * ceil(enc_row_size / IOBUF_SIZE)
+        );
+    if (max_packet_size > INT_MAX)
+        return AVERROR(ENOMEM);
+    ret = ff_alloc_packet2(avctx, pkt, max_packet_size, 0);
+    if (ret < 0)
+        return ret;
+
+    s->bytestream_start =
+    s->bytestream       = pkt->data;
+    s->bytestream_end   = pkt->data + pkt->size;
+
+    AV_WB64(s->bytestream, PNGSIG);
+    s->bytestream += 8;
+
+    ret = encode_headers(avctx, pict);
+    if (ret < 0)
+        return ret;
+
+    ret = encode_frame(avctx, pict);
+    if (ret < 0)
+        return ret;
+
     png_write_chunk(&s->bytestream, MKTAG('I', 'E', 'N', 'D'), NULL, 0);
 
-    pkt->size   = s->bytestream - s->bytestream_start;
+    pkt->size = s->bytestream - s->bytestream_start;
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
-    ret         = 0;
 
-the_end:
-    av_free(crow_base);
-    av_free(progressive_buf);
-    av_free(rgba_buf);
-    av_free(top_buf);
-    deflateEnd(&s->zstream);
-    return ret;
+    return 0;
+}
+
+static int apng_do_inverse_blend(AVFrame *output, const AVFrame *input,
+                                  APNGFctlChunk *fctl_chunk, uint8_t bpp)
+{
+    // output: background, input: foreground
+    // output the image such that when blended with the background, will produce the foreground
+
+    unsigned int x, y;
+    unsigned int leftmost_x = input->width;
+    unsigned int rightmost_x = 0;
+    unsigned int topmost_y = input->height;
+    unsigned int bottommost_y = 0;
+    const uint8_t *input_data = input->data[0];
+    uint8_t *output_data = output->data[0];
+    ptrdiff_t input_linesize = input->linesize[0];
+    ptrdiff_t output_linesize = output->linesize[0];
+
+    // Find bounding box of changes
+    for (y = 0; y < input->height; ++y) {
+        for (x = 0; x < input->width; ++x) {
+            if (!memcmp(input_data + bpp * x, output_data + bpp * x, bpp))
+                continue;
+
+            if (x < leftmost_x)
+                leftmost_x = x;
+            if (x >= rightmost_x)
+                rightmost_x = x + 1;
+            if (y < topmost_y)
+                topmost_y = y;
+            if (y >= bottommost_y)
+                bottommost_y = y + 1;
+        }
+
+        input_data += input_linesize;
+        output_data += output_linesize;
+    }
+
+    if (leftmost_x == input->width && rightmost_x == 0) {
+        // Empty frame
+        // APNG does not support empty frames, so we make it a 1x1 frame
+        leftmost_x = topmost_y = 0;
+        rightmost_x = bottommost_y = 1;
+    }
+
+    // Do actual inverse blending
+    if (fctl_chunk->blend_op == APNG_BLEND_OP_SOURCE) {
+        output_data = output->data[0];
+        for (y = topmost_y; y < bottommost_y; ++y) {
+            memcpy(output_data,
+                   input->data[0] + input_linesize * y + bpp * leftmost_x,
+                   bpp * (rightmost_x - leftmost_x));
+            output_data += output_linesize;
+        }
+    } else { // APNG_BLEND_OP_OVER
+        size_t transparent_palette_index;
+        uint32_t *palette;
+
+        switch (input->format) {
+        case AV_PIX_FMT_RGBA64BE:
+        case AV_PIX_FMT_YA16BE:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_GRAY8A:
+            break;
+
+        case AV_PIX_FMT_PAL8:
+            palette = (uint32_t*)input->data[1];
+            for (transparent_palette_index = 0; transparent_palette_index < 256; ++transparent_palette_index)
+                if (palette[transparent_palette_index] >> 24 == 0)
+                    break;
+            break;
+
+        default:
+            // No alpha, so blending not possible
+            return -1;
+        }
+
+        for (y = topmost_y; y < bottommost_y; ++y) {
+            uint8_t *foreground = input->data[0] + input_linesize * y + bpp * leftmost_x;
+            uint8_t *background = output->data[0] + output_linesize * y + bpp * leftmost_x;
+            output_data = output->data[0] + output_linesize * (y - topmost_y);
+            for (x = leftmost_x; x < rightmost_x; ++x, foreground += bpp, background += bpp, output_data += bpp) {
+                if (!memcmp(foreground, background, bpp)) {
+                    if (input->format == AV_PIX_FMT_PAL8) {
+                        if (transparent_palette_index == 256) {
+                            // Need fully transparent colour, but none exists
+                            return -1;
+                        }
+
+                        *output_data = transparent_palette_index;
+                    } else {
+                        memset(output_data, 0, bpp);
+                    }
+                    continue;
+                }
+
+                // Check for special alpha values, since full inverse
+                // alpha-on-alpha blending is rarely possible, and when
+                // possible, doesn't compress much better than
+                // APNG_BLEND_OP_SOURCE blending
+                switch (input->format) {
+                case AV_PIX_FMT_RGBA64BE:
+                    if (((uint16_t*)foreground)[3] == 0xffff ||
+                        ((uint16_t*)background)[3] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_YA16BE:
+                    if (((uint16_t*)foreground)[1] == 0xffff ||
+                        ((uint16_t*)background)[1] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_RGBA:
+                    if (foreground[3] == 0xff || background[3] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_GRAY8A:
+                    if (foreground[1] == 0xff || background[1] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_PAL8:
+                    if (palette[*foreground] >> 24 == 0xff ||
+                        palette[*background] >> 24 == 0)
+                        break;
+                    return -1;
+                }
+
+                memmove(output_data, foreground, bpp);
+            }
+        }
+    }
+
+    output->width = rightmost_x - leftmost_x;
+    output->height = bottommost_y - topmost_y;
+    fctl_chunk->width = output->width;
+    fctl_chunk->height = output->height;
+    fctl_chunk->x_offset = leftmost_x;
+    fctl_chunk->y_offset = topmost_y;
+
+    return 0;
+}
+
+static int apng_encode_frame(AVCodecContext *avctx, const AVFrame *pict,
+                             APNGFctlChunk *best_fctl_chunk, APNGFctlChunk *best_last_fctl_chunk)
+{
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    unsigned int y;
+    AVFrame* diffFrame;
+    uint8_t bpp = (s->bits_per_pixel + 7) >> 3;
+    uint8_t *original_bytestream, *original_bytestream_end;
+    uint8_t *temp_bytestream = 0, *temp_bytestream_end;
+    uint32_t best_sequence_number;
+    uint8_t *best_bytestream;
+    size_t best_bytestream_size = SIZE_MAX;
+    APNGFctlChunk last_fctl_chunk = *best_last_fctl_chunk;
+    APNGFctlChunk fctl_chunk = *best_fctl_chunk;
+
+    if (avctx->frame_number == 0) {
+        best_fctl_chunk->width = pict->width;
+        best_fctl_chunk->height = pict->height;
+        best_fctl_chunk->x_offset = 0;
+        best_fctl_chunk->y_offset = 0;
+        best_fctl_chunk->blend_op = APNG_BLEND_OP_SOURCE;
+        return encode_frame(avctx, pict);
+    }
+
+    diffFrame = av_frame_alloc();
+    if (!diffFrame)
+        return AVERROR(ENOMEM);
+
+    diffFrame->format = pict->format;
+    diffFrame->width = pict->width;
+    diffFrame->height = pict->height;
+    if ((ret = av_frame_get_buffer(diffFrame, 32)) < 0)
+        goto fail;
+
+    original_bytestream = s->bytestream;
+    original_bytestream_end = s->bytestream_end;
+
+    temp_bytestream = av_malloc(original_bytestream_end - original_bytestream);
+    temp_bytestream_end = temp_bytestream + (original_bytestream_end - original_bytestream);
+    if (!temp_bytestream) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (last_fctl_chunk.dispose_op = 0; last_fctl_chunk.dispose_op < 3; ++last_fctl_chunk.dispose_op) {
+        // 0: APNG_DISPOSE_OP_NONE
+        // 1: APNG_DISPOSE_OP_BACKGROUND
+        // 2: APNG_DISPOSE_OP_PREVIOUS
+
+        for (fctl_chunk.blend_op = 0; fctl_chunk.blend_op < 2; ++fctl_chunk.blend_op) {
+            // 0: APNG_BLEND_OP_SOURCE
+            // 1: APNG_BLEND_OP_OVER
+
+            uint32_t original_sequence_number = s->sequence_number, sequence_number;
+            uint8_t *bytestream_start = s->bytestream;
+            size_t bytestream_size;
+
+            // Do disposal
+            if (last_fctl_chunk.dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+                diffFrame->width = pict->width;
+                diffFrame->height = pict->height;
+                ret = av_frame_copy(diffFrame, s->last_frame);
+                if (ret < 0)
+                    goto fail;
+
+                if (last_fctl_chunk.dispose_op == APNG_DISPOSE_OP_BACKGROUND) {
+                    for (y = last_fctl_chunk.y_offset; y < last_fctl_chunk.y_offset + last_fctl_chunk.height; ++y) {
+                        size_t row_start = diffFrame->linesize[0] * y + bpp * last_fctl_chunk.x_offset;
+                        memset(diffFrame->data[0] + row_start, 0, bpp * last_fctl_chunk.width);
+                    }
+                }
+            } else {
+                if (!s->prev_frame)
+                    continue;
+
+                diffFrame->width = pict->width;
+                diffFrame->height = pict->height;
+                ret = av_frame_copy(diffFrame, s->prev_frame);
+                if (ret < 0)
+                    goto fail;
+            }
+
+            // Do inverse blending
+            if (apng_do_inverse_blend(diffFrame, pict, &fctl_chunk, bpp) < 0)
+                continue;
+
+            // Do encoding
+            ret = encode_frame(avctx, diffFrame);
+            sequence_number = s->sequence_number;
+            s->sequence_number = original_sequence_number;
+            bytestream_size = s->bytestream - bytestream_start;
+            s->bytestream = bytestream_start;
+            if (ret < 0)
+                goto fail;
+
+            if (bytestream_size < best_bytestream_size) {
+                *best_fctl_chunk = fctl_chunk;
+                *best_last_fctl_chunk = last_fctl_chunk;
+
+                best_sequence_number = sequence_number;
+                best_bytestream = s->bytestream;
+                best_bytestream_size = bytestream_size;
+
+                if (best_bytestream == original_bytestream) {
+                    s->bytestream = temp_bytestream;
+                    s->bytestream_end = temp_bytestream_end;
+                } else {
+                    s->bytestream = original_bytestream;
+                    s->bytestream_end = original_bytestream_end;
+                }
+            }
+        }
+    }
+
+    s->sequence_number = best_sequence_number;
+    s->bytestream = original_bytestream + best_bytestream_size;
+    s->bytestream_end = original_bytestream_end;
+    if (best_bytestream != original_bytestream)
+        memcpy(original_bytestream, best_bytestream, best_bytestream_size);
+
+    ret = 0;
+
 fail:
-    ret = -1;
-    goto the_end;
+    av_freep(&temp_bytestream);
+    av_frame_free(&diffFrame);
+    return ret;
+}
+
+static int encode_apng(AVCodecContext *avctx, AVPacket *pkt,
+                       const AVFrame *pict, int *got_packet)
+{
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    int enc_row_size;
+    size_t max_packet_size;
+    APNGFctlChunk fctl_chunk = {0};
+
+    if (pict && avctx->codec_id == AV_CODEC_ID_APNG && s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        uint32_t checksum = ~av_crc(av_crc_get_table(AV_CRC_32_IEEE_LE), ~0U, pict->data[1], 256 * sizeof(uint32_t));
+
+        if (avctx->frame_number == 0) {
+            s->palette_checksum = checksum;
+        } else if (checksum != s->palette_checksum) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Input contains more than one unique palette. APNG does not support multiple palettes.\n");
+            return -1;
+        }
+    }
+
+    enc_row_size    = deflateBound(&s->zstream, (avctx->width * s->bits_per_pixel + 7) >> 3);
+    max_packet_size =
+        AV_INPUT_BUFFER_MIN_SIZE + // headers
+        avctx->height * (
+            enc_row_size +
+            (4 + 12) * (((int64_t)enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) // fdAT * ceil(enc_row_size / IOBUF_SIZE)
+        );
+    if (max_packet_size > INT_MAX)
+        return AVERROR(ENOMEM);
+
+    if (avctx->frame_number == 0) {
+        if (!pict)
+            return AVERROR(EINVAL);
+
+        s->bytestream = s->extra_data = av_malloc(AV_INPUT_BUFFER_MIN_SIZE);
+        if (!s->extra_data)
+            return AVERROR(ENOMEM);
+
+        ret = encode_headers(avctx, pict);
+        if (ret < 0)
+            return ret;
+
+        s->extra_data_size = s->bytestream - s->extra_data;
+
+        s->last_frame_packet = av_malloc(max_packet_size);
+        if (!s->last_frame_packet)
+            return AVERROR(ENOMEM);
+    } else if (s->last_frame) {
+        ret = ff_alloc_packet2(avctx, pkt, max_packet_size, 0);
+        if (ret < 0)
+            return ret;
+
+        memcpy(pkt->data, s->last_frame_packet, s->last_frame_packet_size);
+        pkt->size = s->last_frame_packet_size;
+        pkt->pts = pkt->dts = s->last_frame->pts;
+    }
+
+    if (pict) {
+        s->bytestream_start =
+        s->bytestream       = s->last_frame_packet;
+        s->bytestream_end   = s->bytestream + max_packet_size;
+
+        // We're encoding the frame first, so we have to do a bit of shuffling around
+        // to have the image data write to the correct place in the buffer
+        fctl_chunk.sequence_number = s->sequence_number;
+        ++s->sequence_number;
+        s->bytestream += 26 + 12;
+
+        ret = apng_encode_frame(avctx, pict, &fctl_chunk, &s->last_frame_fctl);
+        if (ret < 0)
+            return ret;
+
+        fctl_chunk.delay_num = 0; // delay filled in during muxing
+        fctl_chunk.delay_den = 0;
+    } else {
+        s->last_frame_fctl.dispose_op = APNG_DISPOSE_OP_NONE;
+    }
+
+    if (s->last_frame) {
+        uint8_t* last_fctl_chunk_start = pkt->data;
+        uint8_t buf[26];
+        if (!s->extra_data_updated) {
+            uint8_t *side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, s->extra_data_size);
+            if (!side_data)
+                return AVERROR(ENOMEM);
+            memcpy(side_data, s->extra_data, s->extra_data_size);
+            s->extra_data_updated = 1;
+        }
+
+        AV_WB32(buf + 0, s->last_frame_fctl.sequence_number);
+        AV_WB32(buf + 4, s->last_frame_fctl.width);
+        AV_WB32(buf + 8, s->last_frame_fctl.height);
+        AV_WB32(buf + 12, s->last_frame_fctl.x_offset);
+        AV_WB32(buf + 16, s->last_frame_fctl.y_offset);
+        AV_WB16(buf + 20, s->last_frame_fctl.delay_num);
+        AV_WB16(buf + 22, s->last_frame_fctl.delay_den);
+        buf[24] = s->last_frame_fctl.dispose_op;
+        buf[25] = s->last_frame_fctl.blend_op;
+        png_write_chunk(&last_fctl_chunk_start, MKTAG('f', 'c', 'T', 'L'), buf, 26);
+
+        *got_packet = 1;
+    }
+
+    if (pict) {
+        if (!s->last_frame) {
+            s->last_frame = av_frame_alloc();
+            if (!s->last_frame)
+                return AVERROR(ENOMEM);
+        } else if (s->last_frame_fctl.dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+            if (!s->prev_frame) {
+                s->prev_frame = av_frame_alloc();
+                if (!s->prev_frame)
+                    return AVERROR(ENOMEM);
+
+                s->prev_frame->format = pict->format;
+                s->prev_frame->width = pict->width;
+                s->prev_frame->height = pict->height;
+                if ((ret = av_frame_get_buffer(s->prev_frame, 32)) < 0)
+                    return ret;
+            }
+
+            // Do disposal, but not blending
+            av_frame_copy(s->prev_frame, s->last_frame);
+            if (s->last_frame_fctl.dispose_op == APNG_DISPOSE_OP_BACKGROUND) {
+                uint32_t y;
+                uint8_t bpp = (s->bits_per_pixel + 7) >> 3;
+                for (y = s->last_frame_fctl.y_offset; y < s->last_frame_fctl.y_offset + s->last_frame_fctl.height; ++y) {
+                    size_t row_start = s->prev_frame->linesize[0] * y + bpp * s->last_frame_fctl.x_offset;
+                    memset(s->prev_frame->data[0] + row_start, 0, bpp * s->last_frame_fctl.width);
+                }
+            }
+        }
+
+        av_frame_unref(s->last_frame);
+        ret = av_frame_ref(s->last_frame, (AVFrame*)pict);
+        if (ret < 0)
+            return ret;
+
+        s->last_frame_fctl = fctl_chunk;
+        s->last_frame_packet_size = s->bytestream - s->bytestream_start;
+    } else {
+        av_frame_free(&s->last_frame);
+    }
+
+    return 0;
 }
 
 static av_cold int png_enc_init(AVCodecContext *avctx)
 {
     PNGEncContext *s = avctx->priv_data;
+    int compression_level;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGBA:
+        avctx->bits_per_coded_sample = 32;
+        break;
+    case AV_PIX_FMT_RGB24:
+        avctx->bits_per_coded_sample = 24;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->bits_per_coded_sample = 0x28;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        avctx->bits_per_coded_sample = 1;
+        break;
+    case AV_PIX_FMT_PAL8:
+        avctx->bits_per_coded_sample = 8;
+    }
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -486,7 +1015,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    ff_huffyuvencdsp_init(&s->hdsp);
+    ff_llvidencdsp_init(&s->llvidencdsp);
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -500,41 +1029,152 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->pix_fmt == AV_PIX_FMT_MONOBLACK)
         s->filter_type = PNG_FILTER_VALUE_NONE;
 
+    if (s->dpi && s->dpm) {
+      av_log(avctx, AV_LOG_ERROR, "Only one of 'dpi' or 'dpm' options should be set\n");
+      return AVERROR(EINVAL);
+    } else if (s->dpi) {
+      s->dpm = s->dpi * 10000 / 254;
+    }
+
+    s->is_progressive = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGBA64BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_RGB_ALPHA;
+        break;
+    case AV_PIX_FMT_RGB48BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_RGB;
+        break;
+    case AV_PIX_FMT_RGBA:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_RGB_ALPHA;
+        break;
+    case AV_PIX_FMT_RGB24:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_RGB;
+        break;
+    case AV_PIX_FMT_GRAY16BE:
+        s->bit_depth  = 16;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_GRAY8A:
+        s->bit_depth = 8;
+        s->color_type = PNG_COLOR_TYPE_GRAY_ALPHA;
+        break;
+    case AV_PIX_FMT_YA16BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_GRAY_ALPHA;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        s->bit_depth  = 1;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_PAL8:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_PALETTE;
+        break;
+    default:
+        return -1;
+    }
+    s->bits_per_pixel = ff_png_get_nb_channels(s->color_type) * s->bit_depth;
+
+    s->zstream.zalloc = ff_png_zalloc;
+    s->zstream.zfree  = ff_png_zfree;
+    s->zstream.opaque = NULL;
+    compression_level = avctx->compression_level == FF_COMPRESSION_DEFAULT
+                      ? Z_DEFAULT_COMPRESSION
+                      : av_clip(avctx->compression_level, 0, 9);
+    if (deflateInit2(&s->zstream, compression_level, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
+        return -1;
+
+    return 0;
+}
+
+static av_cold int png_enc_close(AVCodecContext *avctx)
+{
+    PNGEncContext *s = avctx->priv_data;
+
+    deflateEnd(&s->zstream);
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->prev_frame);
+    av_freep(&s->last_frame_packet);
+    av_freep(&s->extra_data);
+    s->extra_data_size = 0;
     return 0;
 }
 
 #define OFFSET(x) offsetof(PNGEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-{ "pred", "Prediction method", OFFSET(filter_type), AV_OPT_TYPE_INT, { .i64 = PNG_FILTER_VALUE_NONE }, PNG_FILTER_VALUE_NONE, PNG_FILTER_VALUE_MIXED, VE, "pred" },
-    { "none",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_NONE },  INT_MIN, INT_MAX, VE, "pred" },
-    { "sub",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_SUB },   INT_MIN, INT_MAX, VE, "pred" },
-    { "up",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_UP },    INT_MIN, INT_MAX, VE, "pred" },
-    { "avg",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_AVG },   INT_MIN, INT_MAX, VE, "pred" },
-    { "paeth", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_PAETH }, INT_MIN, INT_MAX, VE, "pred" },
-    { "mixed", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_MIXED }, INT_MIN, INT_MAX, VE, "pred" },
-
+    {"dpi", "Set image resolution (in dots per inch)",  OFFSET(dpi), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 0x10000, VE},
+    {"dpm", "Set image resolution (in dots per meter)", OFFSET(dpm), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 0x10000, VE},
+    { "pred", "Prediction method", OFFSET(filter_type), AV_OPT_TYPE_INT, { .i64 = PNG_FILTER_VALUE_NONE }, PNG_FILTER_VALUE_NONE, PNG_FILTER_VALUE_MIXED, VE, "pred" },
+        { "none",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_NONE },  INT_MIN, INT_MAX, VE, "pred" },
+        { "sub",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_SUB },   INT_MIN, INT_MAX, VE, "pred" },
+        { "up",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_UP },    INT_MIN, INT_MAX, VE, "pred" },
+        { "avg",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_AVG },   INT_MIN, INT_MAX, VE, "pred" },
+        { "paeth", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_PAETH }, INT_MIN, INT_MAX, VE, "pred" },
+        { "mixed", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_MIXED }, INT_MIN, INT_MAX, VE, "pred" },
     { NULL},
 };
 
-static const AVClass png_class = {
-    .class_name = "png",
+static const AVClass pngenc_class = {
+    .class_name = "PNG encoder",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
+
+static const AVClass apngenc_class = {
+    .class_name = "APNG encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_png_encoder = {
     .name           = "png",
     .long_name      = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PNG,
     .priv_data_size = sizeof(PNGEncContext),
-    .priv_class     = &png_class,
     .init           = png_enc_init,
-    .encode2        = encode_frame,
+    .close          = png_enc_close,
+    .encode2        = encode_png,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
+    },
+    .priv_class     = &pngenc_class,
+};
+
+AVCodec ff_apng_encoder = {
+    .name           = "apng",
+    .long_name      = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_APNG,
+    .priv_data_size = sizeof(PNGEncContext),
+    .init           = png_enc_init,
+    .close          = png_enc_close,
+    .encode2        = encode_apng,
+    .capabilities   = AV_CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB32, AV_PIX_FMT_PAL8, AV_PIX_FMT_GRAY8,
-        AV_PIX_FMT_RGBA64BE, AV_PIX_FMT_RGB48BE, AV_PIX_FMT_GRAY16BE,
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
         AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
     },
+    .priv_class     = &apngenc_class,
 };
diff --git a/libavcodec/pnm.c b/libavcodec/pnm.c
index 1c380b0..17926f2 100644
--- a/libavcodec/pnm.c
+++ b/libavcodec/pnm.c
@@ -2,20 +2,20 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
+#include "internal.h"
 #include "pnm.h"
 
 static inline int pnm_space(int c)
@@ -35,37 +36,41 @@ static void pnm_get(PNMContext *sc, char *str, int buf_size)
 {
     char *s;
     int c;
+    uint8_t *bs  = sc->bytestream;
+    const uint8_t *end = sc->bytestream_end;
 
     /* skip spaces and comments */
-    for (;;) {
-        c = *sc->bytestream++;
+    while (bs < end) {
+        c = *bs++;
         if (c == '#')  {
-            do {
-                c = *sc->bytestream++;
-            } while (c != '\n' && sc->bytestream < sc->bytestream_end);
+            while (c != '\n' && bs < end) {
+                c = *bs++;
+            }
         } else if (!pnm_space(c)) {
             break;
         }
     }
 
     s = str;
-    while (sc->bytestream < sc->bytestream_end && !pnm_space(c)) {
+    while (bs < end && !pnm_space(c)) {
         if ((s - str)  < buf_size - 1)
             *s++ = c;
-        c = *sc->bytestream++;
+        c = *bs++;
     }
     *s = '\0';
+    sc->bytestream = bs;
 }
 
 int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
 {
     char buf1[32], tuple_type[32];
     int h, w, depth, maxval;
+    int ret;
 
     pnm_get(s, buf1, sizeof(buf1));
-    s->type= buf1[1]-'0';
     if(buf1[0] != 'P')
         return AVERROR_INVALIDDATA;
+    s->type= buf1[1]-'0';
 
     if (s->type==1 || s->type==4) {
         avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
@@ -107,26 +112,40 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
             }
         }
         /* check that all tags are present */
-        if (w <= 0 || h <= 0 || maxval <= 0 || depth <= 0 || tuple_type[0] == '\0' || av_image_check_size(w, h, 0, avctx))
+        if (w <= 0 || h <= 0 || maxval <= 0 || maxval > UINT16_MAX || depth <= 0 || tuple_type[0] == '\0' ||
+            av_image_check_size(w, h, 0, avctx) || s->bytestream >= s->bytestream_end)
             return AVERROR_INVALIDDATA;
 
-        avctx->width  = w;
-        avctx->height = h;
+        ret = ff_set_dimensions(avctx, w, h);
+        if (ret < 0)
+            return ret;
+        s->maxval     = maxval;
         if (depth == 1) {
-            if (maxval == 1)
-                avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-            else
+            if (maxval == 1) {
+                avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+            } else if (maxval < 256) {
                 avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            }
+        } else if (depth == 2) {
+            if (maxval < 256) {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY8A;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_YA16;
+            }
         } else if (depth == 3) {
             if (maxval < 256) {
-            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+                avctx->pix_fmt = AV_PIX_FMT_RGB24;
             } else {
-                av_log(avctx, AV_LOG_ERROR, "16-bit components are only supported for grayscale\n");
-                avctx->pix_fmt = AV_PIX_FMT_NONE;
-                return AVERROR_INVALIDDATA;
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
             }
         } else if (depth == 4) {
-            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+            if (maxval < 256) {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64;
+            }
         } else {
             return AVERROR_INVALIDDATA;
         }
@@ -135,33 +154,33 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
         return AVERROR_INVALIDDATA;
     }
     pnm_get(s, buf1, sizeof(buf1));
-    avctx->width = atoi(buf1);
-    if (avctx->width <= 0)
-        return AVERROR_INVALIDDATA;
+    w = atoi(buf1);
     pnm_get(s, buf1, sizeof(buf1));
-    avctx->height = atoi(buf1);
-    if(av_image_check_size(avctx->width, avctx->height, 0, avctx))
+    h = atoi(buf1);
+    if(w <= 0 || h <= 0 || av_image_check_size(w, h, 0, avctx) || s->bytestream >= s->bytestream_end)
         return AVERROR_INVALIDDATA;
-    if (avctx->pix_fmt != AV_PIX_FMT_MONOWHITE) {
+
+    ret = ff_set_dimensions(avctx, w, h);
+    if (ret < 0)
+        return ret;
+
+    if (avctx->pix_fmt != AV_PIX_FMT_MONOWHITE && avctx->pix_fmt != AV_PIX_FMT_MONOBLACK) {
         pnm_get(s, buf1, sizeof(buf1));
         s->maxval = atoi(buf1);
-        if (s->maxval <= 0) {
+        if (s->maxval <= 0 || s->maxval > UINT16_MAX) {
             av_log(avctx, AV_LOG_ERROR, "Invalid maxval: %d\n", s->maxval);
             s->maxval = 255;
         }
         if (s->maxval >= 256) {
             if (avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-                avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
-                if (s->maxval != 65535)
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
             } else if (avctx->pix_fmt == AV_PIX_FMT_RGB24) {
-                if (s->maxval > 255)
-                    avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
             } else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P && s->maxval < 65536) {
                 if (s->maxval < 512)
-                    avctx->pix_fmt = AV_PIX_FMT_YUV420P9BE;
+                    avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
                 else if (s->maxval < 1024)
-                    avctx->pix_fmt = AV_PIX_FMT_YUV420P10BE;
+                    avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
                 else
                     avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
             } else {
diff --git a/libavcodec/pnm.h b/libavcodec/pnm.h
index 5fc6513..5bc0aad 100644
--- a/libavcodec/pnm.h
+++ b/libavcodec/pnm.h
@@ -2,20 +2,20 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pnm_parser.c b/libavcodec/pnm_parser.c
index 03d2da9..9bf1fdc 100644
--- a/libavcodec/pnm_parser.c
+++ b/libavcodec/pnm_parser.c
@@ -2,20 +2,20 @@
  * PNM image parser
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@ static int pnm_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     ParseContext *pc = s->priv_data;
     PNMContext pnmctx;
     int next;
+    int skip = 0;
 
     for (; pc->overread > 0; pc->overread--) {
         pc->buffer[pc->index++]= pc->buffer[pc->overread_index++];
@@ -43,24 +44,27 @@ retry:
         pnmctx.bytestream_end   = pc->buffer + pc->index;
     } else {
         pnmctx.bytestream_start =
-        pnmctx.bytestream       = (uint8_t *) buf; /* casts avoid warnings */
-        pnmctx.bytestream_end   = (uint8_t *) buf + buf_size;
+        pnmctx.bytestream       = (uint8_t *) buf + skip; /* casts avoid warnings */
+        pnmctx.bytestream_end   = (uint8_t *) buf + buf_size - skip;
     }
     if (ff_pnm_decode_header(avctx, &pnmctx) < 0) {
         if (pnmctx.bytestream < pnmctx.bytestream_end) {
             if (pc->index) {
                 pc->index = 0;
             } else {
-                buf++;
-                buf_size--;
+                unsigned step = FFMAX(1, pnmctx.bytestream - pnmctx.bytestream_start);
+
+                skip += step;
             }
             goto retry;
         }
         next = END_NOT_FOUND;
+    } else if (pnmctx.type < 4) {
+        next = END_NOT_FOUND;
     } else {
-        next = pnmctx.bytestream - pnmctx.bytestream_start
+        next = pnmctx.bytestream - pnmctx.bytestream_start + skip
                + av_image_get_buffer_size(avctx->pix_fmt, avctx->width, avctx->height, 1);
-        if (pnmctx.bytestream_start != buf)
+        if (pnmctx.bytestream_start != buf + skip)
             next -= pc->index;
         if (next > buf_size)
             next = END_NOT_FOUND;
diff --git a/libavcodec/pnmdec.c b/libavcodec/pnmdec.c
index d23e2c0..958c5e4 100644
--- a/libavcodec/pnmdec.c
+++ b/libavcodec/pnmdec.c
@@ -2,29 +2,39 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 #include "put_bits.h"
 #include "pnm.h"
 
+static void samplecpy(uint8_t *dst, const uint8_t *src, int n, int maxval)
+{
+    if (maxval <= 255) {
+        memcpy(dst, src, n);
+    } else {
+        int i;
+        for (i=0; i<n/2; i++) {
+            ((uint16_t *)dst)[i] = AV_RB16(src+2*i);
+        }
+    }
+}
 
 static int pnm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
@@ -33,36 +43,51 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size         = avpkt->size;
     PNMContext * const s = avctx->priv_data;
     AVFrame * const p    = data;
-    int i, j, n, linesize, h, upgrade = 0;
+    int i, j, k, n, linesize, h, upgrade = 0, is_mono = 0;
     unsigned char *ptr;
     int components, sample_len, ret;
 
     s->bytestream_start =
-    s->bytestream       = buf;
-    s->bytestream_end   = buf + buf_size;
+    s->bytestream       = (uint8_t *)buf;
+    s->bytestream_end   = (uint8_t *)buf + buf_size;
 
     if ((ret = ff_pnm_decode_header(avctx, s)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
+    avctx->bits_per_raw_sample = av_log2(s->maxval) + 1;
 
     switch (avctx->pix_fmt) {
     default:
         return AVERROR(EINVAL);
-    case AV_PIX_FMT_RGB48BE:
+    case AV_PIX_FMT_RGBA64:
+        n = avctx->width * 8;
+        components=4;
+        sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
+    case AV_PIX_FMT_RGB48:
         n = avctx->width * 6;
         components=3;
         sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
+    case AV_PIX_FMT_RGBA:
+        n = avctx->width * 4;
+        components=4;
+        sample_len=8;
         goto do_read;
     case AV_PIX_FMT_RGB24:
         n = avctx->width * 3;
         components=3;
         sample_len=8;
+        if (s->maxval < 255)
+            upgrade = 1;
         goto do_read;
     case AV_PIX_FMT_GRAY8:
         n = avctx->width;
@@ -71,48 +96,75 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
         if (s->maxval < 255)
             upgrade = 1;
         goto do_read;
-    case AV_PIX_FMT_GRAY16BE:
-    case AV_PIX_FMT_GRAY16LE:
+    case AV_PIX_FMT_GRAY8A:
+        n = avctx->width * 2;
+        components=2;
+        sample_len=8;
+        goto do_read;
+    case AV_PIX_FMT_GRAY16:
         n = avctx->width * 2;
         components=1;
         sample_len=16;
         if (s->maxval < 65535)
             upgrade = 2;
         goto do_read;
+    case AV_PIX_FMT_YA16:
+        n =  avctx->width * 4;
+        components=2;
+        sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
     case AV_PIX_FMT_MONOWHITE:
     case AV_PIX_FMT_MONOBLACK:
         n = (avctx->width + 7) >> 3;
         components=1;
         sample_len=1;
+        is_mono = 1;
     do_read:
         ptr      = p->data[0];
         linesize = p->linesize[0];
-        if (s->bytestream + n * avctx->height > s->bytestream_end)
+        if (n * avctx->height > s->bytestream_end - s->bytestream)
             return AVERROR_INVALIDDATA;
-        if(s->type < 4){
+        if(s->type < 4 || (is_mono && s->type==7)){
             for (i=0; i<avctx->height; i++) {
                 PutBitContext pb;
                 init_put_bits(&pb, ptr, linesize);
                 for(j=0; j<avctx->width * components; j++){
                     unsigned int c=0;
                     int v=0;
+                    if(s->type < 4)
                     while(s->bytestream < s->bytestream_end && (*s->bytestream < '0' || *s->bytestream > '9' ))
                         s->bytestream++;
                     if(s->bytestream >= s->bytestream_end)
                         return AVERROR_INVALIDDATA;
-                    do{
-                        v= 10*v + c;
-                        c= (*s->bytestream++) - '0';
-                    }while(c <= 9);
-                    put_bits(&pb, sample_len, (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval);
+                    if (is_mono) {
+                        /* read a single digit */
+                        v = (*s->bytestream++)&1;
+                    } else {
+                        /* read a sequence of digits */
+                        for (k = 0; k < 5 && c <= 9; k += 1) {
+                            v = 10*v + c;
+                            c = (*s->bytestream++) - '0';
+                        }
+                        if (v > s->maxval) {
+                            av_log(avctx, AV_LOG_ERROR, "value %d larger than maxval %d\n", v, s->maxval);
+                            return AVERROR_INVALIDDATA;
+                        }
+                    }
+                    if (sample_len == 16) {
+                        ((uint16_t*)ptr)[j] = (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval;
+                    } else
+                        put_bits(&pb, sample_len, (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval);
                 }
-                flush_put_bits(&pb);
+                if (sample_len != 16)
+                    flush_put_bits(&pb);
                 ptr+= linesize;
             }
         }else{
         for (i = 0; i < avctx->height; i++) {
             if (!upgrade)
-                memcpy(ptr, s->bytestream, n);
+                samplecpy(ptr, s->bytestream, n, s->maxval);
             else if (upgrade == 1) {
                 unsigned int j, f = (255 * 128 + s->maxval / 2) / s->maxval;
                 for (j = 0; j < n; j++)
@@ -130,8 +182,8 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     case AV_PIX_FMT_YUV420P:
-    case AV_PIX_FMT_YUV420P9BE:
-    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
         {
             unsigned char *ptr1, *ptr2;
 
@@ -140,10 +192,10 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             linesize = p->linesize[0];
             if (s->maxval >= 256)
                 n *= 2;
-            if (s->bytestream + n * avctx->height * 3 / 2 > s->bytestream_end)
+            if (n * avctx->height * 3 / 2 > s->bytestream_end - s->bytestream)
                 return AVERROR_INVALIDDATA;
             for (i = 0; i < avctx->height; i++) {
-                memcpy(ptr, s->bytestream, n);
+                samplecpy(ptr, s->bytestream, n, s->maxval);
                 s->bytestream += n;
                 ptr           += linesize;
             }
@@ -152,9 +204,9 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             n >>= 1;
             h = avctx->height >> 1;
             for (i = 0; i < h; i++) {
-                memcpy(ptr1, s->bytestream, n);
+                samplecpy(ptr1, s->bytestream, n, s->maxval);
                 s->bytestream += n;
-                memcpy(ptr2, s->bytestream, n);
+                samplecpy(ptr2, s->bytestream, n, s->maxval);
                 s->bytestream += n;
                 ptr1 += p->linesize[1];
                 ptr2 += p->linesize[2];
@@ -170,7 +222,7 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             n        = avctx->width * 2;
             ptr      = p->data[0];
             linesize = p->linesize[0];
-            if (s->bytestream + n * avctx->height * 3 / 2 > s->bytestream_end)
+            if (n * avctx->height * 3 / 2 > s->bytestream_end - s->bytestream)
                 return AVERROR_INVALIDDATA;
             for (i = 0; i < avctx->height; i++) {
                 for (j = 0; j < n / 2; j++) {
@@ -202,24 +254,6 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
         break;
-    case AV_PIX_FMT_RGB32:
-        ptr      = p->data[0];
-        linesize = p->linesize[0];
-        if (s->bytestream + avctx->width * avctx->height * 4 > s->bytestream_end)
-            return AVERROR_INVALIDDATA;
-        for (i = 0; i < avctx->height; i++) {
-            int j, r, g, b, a;
-
-            for (j = 0; j < avctx->width; j++) {
-                r = *s->bytestream++;
-                g = *s->bytestream++;
-                b = *s->bytestream++;
-                a = *s->bytestream++;
-                ((uint32_t *)ptr)[j] = (a << 24) | (r << 16) | (g << 8) | b;
-            }
-            ptr += linesize;
-        }
-        break;
     }
     *got_frame = 1;
 
diff --git a/libavcodec/pnmenc.c b/libavcodec/pnmenc.c
index f8c600f..ba9478d 100644
--- a/libavcodec/pnmenc.c
+++ b/libavcodec/pnmenc.c
@@ -2,43 +2,39 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 
 static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *pict, int *got_packet)
+                            const AVFrame *p, int *got_packet)
 {
     uint8_t *bytestream, *bytestream_start, *bytestream_end;
-    const AVFrame * const p = pict;
     int i, h, h1, c, n, linesize, ret;
     uint8_t *ptr, *ptr1, *ptr2;
     int size = av_image_get_buffer_size(avctx->pix_fmt,
                                         avctx->width, avctx->height, 1);
 
-    if ((ret = ff_alloc_packet(pkt, size + 200)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, size + 200, 0)) < 0)
         return ret;
-    }
 
     bytestream_start =
     bytestream       = pkt->data;
@@ -68,6 +64,10 @@ static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         n  = avctx->width * 6;
         break;
     case AV_PIX_FMT_YUV420P:
+        if (avctx->width & 1 || avctx->height & 1) {
+            av_log(avctx, AV_LOG_ERROR, "pgmyuv needs even width and height\n");
+            return AVERROR(EINVAL);
+        }
         c  = '5';
         n  = avctx->width;
         h1 = (h * 3) / 2;
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index 4b92add..03e5b42 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -2,16 +2,16 @@
 OBJS-$(CONFIG_AUDIODSP)                += ppc/audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
 OBJS-$(CONFIG_FFT)                     += ppc/fft_init.o                \
-                                          ppc/fft_altivec.o
+                                          ppc/fft_altivec.o             \
+                                          ppc/fft_vsx.o
 OBJS-$(CONFIG_FDCTDSP)                 += ppc/fdctdsp.o
 OBJS-$(CONFIG_FMTCONVERT)              += ppc/fmtconvert_altivec.o
 OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
-OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o
+OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o ppc/hpeldsp_altivec.o
 OBJS-$(CONFIG_H264QPEL)                += ppc/h264qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
-OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
 OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
-OBJS-$(CONFIG_MDCT)                    += ppc/mdct_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += ppc/lossless_videodsp_altivec.o
 OBJS-$(CONFIG_ME_CMP)                  += ppc/me_cmp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
@@ -24,8 +24,8 @@ OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 OBJS-$(CONFIG_VP8DSP)                  += ppc/vp8dsp_altivec.o
 
 # decoders/encoders
-OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
 OBJS-$(CONFIG_HEVC_DECODER)            += ppc/hevcdsp.o
+OBJS-$(CONFIG_LLAUDDSP)                += ppc/lossless_audiodsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
 OBJS-$(CONFIG_VP7_DECODER)             += ppc/vp8dsp_altivec.o
diff --git a/libavcodec/ppc/asm.S b/libavcodec/ppc/asm.S
index 141dee9..6222b8b 100644
--- a/libavcodec/ppc/asm.S
+++ b/libavcodec/ppc/asm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,12 @@
 #define JOIN(a, b) GLUE(a, b)
 #define X(s) JOIN(EXTERN_ASM, s)
 
+#if __APPLE__
+#define R(n) r ## n
+#else
+#define R(n) n
+#endif
+
 #if ARCH_PPC64
 
 #define PTR  .quad
@@ -53,7 +59,7 @@ L(\name):
 .endm
 
 .macro movrel rd, sym, gp
-    ld      \rd, \sym@got(r2)
+    ld      \rd, \sym@got(R(2))
 .endm
 
 .macro get_got rd
diff --git a/libavcodec/ppc/audiodsp.c b/libavcodec/ppc/audiodsp.c
index 371e0d1..2e37473 100644
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 
 #include "libavcodec/audiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                            int order)
@@ -56,7 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
     return ires;
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 #if HAVE_VSX
 
@@ -85,7 +85,7 @@ static int32_t scalarproduct_int16_vsx(const int16_t *v1, const int16_t *v2, int
 
 av_cold void ff_audiodsp_init_ppc(AudioDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c
index ee5139a..d89b77e 100644
--- a/libavcodec/ppc/blockdsp.c
+++ b/libavcodec/ppc/blockdsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/fdct.h b/libavcodec/ppc/fdct.h
index 7471035..437f815 100644
--- a/libavcodec/ppc/fdct.h
+++ b/libavcodec/ppc/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/fdctdsp.c b/libavcodec/ppc/fdctdsp.c
index 36d4b4e..4ab516c 100644
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003  James Klicman <james@klicman.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 
 #include "fdct.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define vs16(v)   ((vector signed short) (v))
 #define vs32(v)     ((vector signed int) (v))
@@ -37,29 +37,28 @@
 #define vu16(v) ((vector unsigned short) (v))
 #define vu32(v)   ((vector unsigned int) (v))
 
-#define C1     0.98078525066375732421875000 /* cos(1 * PI / 16) */
-#define C2     0.92387950420379638671875000 /* cos(2 * PI / 16) */
-#define C3     0.83146959543228149414062500 /* cos(3 * PI / 16) */
-#define C4     0.70710676908493041992187500 /* cos(4 * PI / 16) */
-#define C5     0.55557024478912353515625000 /* cos(5 * PI / 16) */
-#define C6     0.38268342614173889160156250 /* cos(6 * PI / 16) */
-#define C7     0.19509032368659973144531250 /* cos(7 * PI / 16) */
-#define SQRT_2 1.41421353816986083984375000 /* sqrt(2)          */
+#define C1     0.98078528040323044912618224 /* cos(1 * PI / 16) */
+#define C2     0.92387953251128675612818319 /* cos(2 * PI / 16) */
+#define C3     0.83146961230254523707878838 /* cos(3 * PI / 16) */
+#define C4     0.70710678118654752440084436 /* cos(4 * PI / 16) */
+#define C5     0.55557023301960222474283081 /* cos(5 * PI / 16) */
+#define C6     0.38268343236508977172845998 /* cos(6 * PI / 16) */
+#define C7     0.19509032201612826784828487 /* cos(7 * PI / 16) */
 
 #define W0 -(2 * C2)
 #define W1  (2 * C6)
-#define W2 (SQRT_2 * C6)
-#define W3 (SQRT_2 * C3)
-#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
-#define W5 (SQRT_2 *  (C1 + C3 - C5 + C7))
-#define W6 (SQRT_2 *  (C1 + C3 + C5 - C7))
-#define W7 (SQRT_2 *  (C1 + C3 - C5 - C7))
-#define W8 (SQRT_2 *  (C7 - C3))
-#define W9 (SQRT_2 * (-C1 - C3))
-#define WA (SQRT_2 * (-C3 - C5))
-#define WB (SQRT_2 *  (C5 - C3))
-
-static vector float fdctconsts[3] = {
+#define W2 (M_SQRT2 * C6)
+#define W3 (M_SQRT2 * C3)
+#define W4 (M_SQRT2 * (-C1 + C3 + C5 - C7))
+#define W5 (M_SQRT2 *  (C1 + C3 - C5 + C7))
+#define W6 (M_SQRT2 *  (C1 + C3 + C5 - C7))
+#define W7 (M_SQRT2 *  (C1 + C3 - C5 - C7))
+#define W8 (M_SQRT2 *  (C7 - C3))
+#define W9 (M_SQRT2 * (-C1 - C3))
+#define WA (M_SQRT2 * (-C3 - C5))
+#define WB (M_SQRT2 *  (C5 - C3))
+
+static const vector float fdctconsts[3] = {
     { W0, W1, W2, W3 },
     { W4, W5, W6, W7 },
     { W8, W9, WA, WB }
@@ -196,7 +195,7 @@ static vector float fdctconsts[3] = {
 void ff_fdct_altivec(int16_t *block)
 {
     vector signed short *bp;
-    vector float *cp = fdctconsts;
+    const vector float *cp = fdctconsts;
     vector float b00, b10, b20, b30, b40, b50, b60, b70;
     vector float b01, b11, b21, b31, b41, b51, b61, b71;
     vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
@@ -460,12 +459,12 @@ void ff_fdct_altivec(int16_t *block)
     /* }}} */
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -475,5 +474,5 @@ av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
             c->fdct = ff_fdct_altivec;
         }
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/fft_altivec.S b/libavcodec/ppc/fft_altivec.S
index cb7c871..8cd68d6 100644
--- a/libavcodec/ppc/fft_altivec.S
+++ b/libavcodec/ppc/fft_altivec.S
@@ -5,20 +5,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -354,14 +354,18 @@ fft_data:
 .macro fft_calc interleave
 extfunc ff_fft_calc\interleave\()_altivec
     mflr    r0
-    stp     r0, 2*PS(r1)
-    stpu    r1, -(160+16*PS)(r1)
+    stp     r0, 2*PS(R(1))
+    stpu    r1, -(160+16*PS)(R(1))
     get_got r11
     addi    r6, r1, 16*PS
     stvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
     mfvrsave r0
-    stw     r0, 15*PS(r1)
+    stw     r0, 15*PS(R(1))
+#if __APPLE__
     li      r6, 0xfffffffc
+#else
+    li      r6, -4
+#endif
     mtvrsave r6
 
     movrel  r6, fft_data, r11
@@ -372,7 +376,7 @@ extfunc ff_fft_calc\interleave\()_altivec
     movrel  r12, X(ff_cos_tabs), r11
 
     movrel  r6, fft_dispatch_tab\interleave\()_altivec, r11
-    lwz     r3, 0(r3)
+    lwz     r3, 0(R(3))
     subi    r3, r3, 2
     slwi    r3, r3, 2+ARCH_PPC64
     lpx     r3, r3, r6
@@ -382,10 +386,10 @@ extfunc ff_fft_calc\interleave\()_altivec
 
     addi    r6, r1, 16*PS
     lvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
-    lwz     r6, 15*PS(r1)
+    lwz     r6, 15*PS(R(1))
     mtvrsave r6
-    lp      r1, 0(r1)
-    lp      r0, 2*PS(r1)
+    lp      r1, 0(R(1))
+    lp      r0, 2*PS(R(1))
     mtlr    r0
     blr
 .endm
@@ -393,15 +397,15 @@ extfunc ff_fft_calc\interleave\()_altivec
 .macro DECL_FFT suffix, bits, n, n2, n4
 fft\n\suffix\()_altivec:
     mflr  r0
-    stp   r0,PS*(\bits-3)(r1)
+    stp   r0,PS*(\bits-3)(R(1))
     bl    fft\n2\()_altivec
     addi2 r3,\n*4
     bl    fft\n4\()_altivec
     addi2 r3,\n*2
     bl    fft\n4\()_altivec
     addi2 r3,\n*-6
-    lp    r0,PS*(\bits-3)(r1)
-    lp    r4,\bits*PS(r12)
+    lp    r0,PS*(\bits-3)(R(1))
+    lp    r4,\bits*PS(R(12))
     mtlr  r0
     li    r5,\n/16
     b     fft_pass\suffix\()_altivec
diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c
index 56eafb9..733e58b 100644
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@ -1,36 +1,167 @@
 /*
- * This file is part of Libav.
+ * FFT/IFFT transforms
+ * AltiVec-enabled
+ * Copyright (c) 2009 Loren Merritt
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
-
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-
+#include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fft.h"
 
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init().
+ * The input data must be permuted before with s->revtab table.
+ * No 1.0 / sqrt(n) normalization is done.
+ * AltiVec-enabled:
+ * This code assumes that the 'z' pointer is 16 bytes-aligned.
+ * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
+ */
+
+#if HAVE_VSX
+#include "fft_vsx.h"
+#else
+void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
+#endif
+
+#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
+static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int j, k;
+    int n = 1 << s->mdct_bits;
+    int n4 = n >> 2;
+    int n8 = n >> 3;
+    int n32 = n >> 5;
+    const uint16_t *revtabj = s->revtab;
+    const uint16_t *revtabk = s->revtab+n4;
+    const vec_f *tcos = (const vec_f*)(s->tcos+n8);
+    const vec_f *tsin = (const vec_f*)(s->tsin+n8);
+    const vec_f *pin = (const vec_f*)(input+n4);
+    vec_f *pout = (vec_f*)(output+n4);
+
+    /* pre rotation */
+    k = n32-1;
+    do {
+        vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
+#define CMULA(p,o0,o1,o2,o3)\
+        a = pin[ k*2+p];                       /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */\
+        b = pin[-k*2-p-1];                     /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
+        re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */\
+        im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */\
+        cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
+        sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
+        r##p = im*cos - re*sin;\
+        i##p = re*cos + im*sin;
+#define STORE2(v,dst)\
+        j = dst;\
+        vec_ste(v, 0, output+j*2);\
+        vec_ste(v, 4, output+j*2);
+#define STORE8(p)\
+        a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
+        b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
+        c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
+        d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
+        STORE2(a, revtabk[ p*2-4]);\
+        STORE2(b, revtabk[ p*2-3]);\
+        STORE2(c, revtabj[-p*2+2]);\
+        STORE2(d, revtabj[-p*2+3]);
+
+        cos0 = tcos[k];
+        sin0 = tsin[k];
+        cos1 = tcos[-k-1];
+        sin1 = tsin[-k-1];
+        CMULA(0, 0,1,2,3);
+        CMULA(1, 2,3,0,1);
+        STORE8(0);
+        STORE8(1);
+        revtabj += 4;
+        revtabk -= 4;
+        k--;
+    } while(k >= 0);
+
+#if HAVE_VSX
+    ff_fft_calc_vsx(s, (FFTComplex*)output);
+#else
+    ff_fft_calc_altivec(s, (FFTComplex*)output);
+#endif
+
+    /* post rotation + reordering */
+    j = -n32;
+    k = n32-1;
+    do {
+        vec_f cos,sin,re,im,a,b,c,d;
+#define CMULB(d0,d1,o)\
+        re = pout[o*2];\
+        im = pout[o*2+1];\
+        cos = tcos[o];\
+        sin = tsin[o];\
+        d0 = im*sin - re*cos;\
+        d1 = re*sin + im*cos;
+
+        CMULB(a,b,j);
+        CMULB(c,d,k);
+        pout[2*j]   = vec_perm(a, d, vcprm(0,s3,1,s2));
+        pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
+        pout[2*k]   = vec_perm(c, b, vcprm(0,s3,1,s2));
+        pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
+        j++;
+        k--;
+    } while(k >= 0);
+}
+
+static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k;
+    int n = 1 << s->mdct_bits;
+    int n4 = n >> 2;
+    int n16 = n >> 4;
+    vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
+    vec_u32 *p0 = (vec_u32*)(output+n4);
+    vec_u32 *p1 = (vec_u32*)(output+n4*3);
+
+    imdct_half_altivec(s, output + n4, input);
+
+    for (k = 0; k < n16; k++) {
+        vec_u32 a = p0[k] ^ sign;
+        vec_u32 b = p1[-k-1];
+        p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
+        p1[k]    = vec_perm(b, b, vcprm(3,2,1,0));
+    }
+}
+#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */
 
 av_cold void ff_fft_init_ppc(FFTContext *s)
 {
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
+#if HAVE_VSX
+    s->fft_calc = ff_fft_calc_interleave_vsx;
+#else
     s->fft_calc   = ff_fft_calc_interleave_altivec;
+#endif
+    if (s->mdct_bits >= 5) {
+        s->imdct_calc = imdct_calc_altivec;
+        s->imdct_half = imdct_half_altivec;
+    }
 #endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
 }
diff --git a/libavcodec/ppc/fft_vsx.c b/libavcodec/ppc/fft_vsx.c
new file mode 100644
index 0000000..c365fa1
--- /dev/null
+++ b/libavcodec/ppc/fft_vsx.c
@@ -0,0 +1,226 @@
+/*
+ * FFT  transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+#include "fft_vsx.h"
+
+#if HAVE_VSX
+
+static void fft32_vsx_interleave(FFTComplex *z)
+{
+    fft16_vsx_interleave(z);
+    fft8_vsx_interleave(z+16);
+    fft8_vsx_interleave(z+24);
+    pass_vsx_interleave(z,ff_cos_32,4);
+}
+
+static void fft64_vsx_interleave(FFTComplex *z)
+{
+    fft32_vsx_interleave(z);
+    fft16_vsx_interleave(z+32);
+    fft16_vsx_interleave(z+48);
+    pass_vsx_interleave(z,ff_cos_64, 8);
+}
+static void fft128_vsx_interleave(FFTComplex *z)
+{
+    fft64_vsx_interleave(z);
+    fft32_vsx_interleave(z+64);
+    fft32_vsx_interleave(z+96);
+    pass_vsx_interleave(z,ff_cos_128,16);
+}
+static void fft256_vsx_interleave(FFTComplex *z)
+{
+    fft128_vsx_interleave(z);
+    fft64_vsx_interleave(z+128);
+    fft64_vsx_interleave(z+192);
+    pass_vsx_interleave(z,ff_cos_256,32);
+}
+static void fft512_vsx_interleave(FFTComplex *z)
+{
+    fft256_vsx_interleave(z);
+    fft128_vsx_interleave(z+256);
+    fft128_vsx_interleave(z+384);
+    pass_vsx_interleave(z,ff_cos_512,64);
+}
+static void fft1024_vsx_interleave(FFTComplex *z)
+{
+    fft512_vsx_interleave(z);
+    fft256_vsx_interleave(z+512);
+    fft256_vsx_interleave(z+768);
+    pass_vsx_interleave(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx_interleave(FFTComplex *z)
+{
+    fft1024_vsx_interleave(z);
+    fft512_vsx_interleave(z+1024);
+    fft512_vsx_interleave(z+1536);
+    pass_vsx_interleave(z,ff_cos_2048,256);
+}
+static void fft4096_vsx_interleave(FFTComplex *z)
+{
+    fft2048_vsx_interleave(z);
+    fft1024_vsx_interleave(z+2048);
+    fft1024_vsx_interleave(z+3072);
+    pass_vsx_interleave(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx_interleave(FFTComplex *z)
+{
+    fft4096_vsx_interleave(z);
+    fft2048_vsx_interleave(z+4096);
+    fft2048_vsx_interleave(z+6144);
+    pass_vsx_interleave(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx_interleave(FFTComplex *z)
+{
+    fft8192_vsx_interleave(z);
+    fft4096_vsx_interleave(z+8192);
+    fft4096_vsx_interleave(z+12288);
+    pass_vsx_interleave(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx_interleave(FFTComplex *z)
+{
+    fft16384_vsx_interleave(z);
+    fft8192_vsx_interleave(z+16384);
+    fft8192_vsx_interleave(z+24576);
+    pass_vsx_interleave(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx_interleave(FFTComplex *z)
+{
+    fft32768_vsx_interleave(z);
+    fft16384_vsx_interleave(z+32768);
+    fft16384_vsx_interleave(z+49152);
+    pass_vsx_interleave(z,ff_cos_65536,8192);
+}
+
+static void fft32_vsx(FFTComplex *z)
+{
+    fft16_vsx(z);
+    fft8_vsx(z+16);
+    fft8_vsx(z+24);
+    pass_vsx(z,ff_cos_32,4);
+}
+
+static void fft64_vsx(FFTComplex *z)
+{
+    fft32_vsx(z);
+    fft16_vsx(z+32);
+    fft16_vsx(z+48);
+    pass_vsx(z,ff_cos_64, 8);
+}
+static void fft128_vsx(FFTComplex *z)
+{
+    fft64_vsx(z);
+    fft32_vsx(z+64);
+    fft32_vsx(z+96);
+    pass_vsx(z,ff_cos_128,16);
+}
+static void fft256_vsx(FFTComplex *z)
+{
+    fft128_vsx(z);
+    fft64_vsx(z+128);
+    fft64_vsx(z+192);
+    pass_vsx(z,ff_cos_256,32);
+}
+static void fft512_vsx(FFTComplex *z)
+{
+    fft256_vsx(z);
+    fft128_vsx(z+256);
+    fft128_vsx(z+384);
+    pass_vsx(z,ff_cos_512,64);
+}
+static void fft1024_vsx(FFTComplex *z)
+{
+    fft512_vsx(z);
+    fft256_vsx(z+512);
+    fft256_vsx(z+768);
+    pass_vsx(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx(FFTComplex *z)
+{
+    fft1024_vsx(z);
+    fft512_vsx(z+1024);
+    fft512_vsx(z+1536);
+    pass_vsx(z,ff_cos_2048,256);
+}
+static void fft4096_vsx(FFTComplex *z)
+{
+    fft2048_vsx(z);
+    fft1024_vsx(z+2048);
+    fft1024_vsx(z+3072);
+    pass_vsx(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx(FFTComplex *z)
+{
+    fft4096_vsx(z);
+    fft2048_vsx(z+4096);
+    fft2048_vsx(z+6144);
+    pass_vsx(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx(FFTComplex *z)
+{
+    fft8192_vsx(z);
+    fft4096_vsx(z+8192);
+    fft4096_vsx(z+12288);
+    pass_vsx(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx(FFTComplex *z)
+{
+    fft16384_vsx(z);
+    fft8192_vsx(z+16384);
+    fft8192_vsx(z+24576);
+    pass_vsx(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx(FFTComplex *z)
+{
+    fft32768_vsx(z);
+    fft16384_vsx(z+32768);
+    fft16384_vsx(z+49152);
+    pass_vsx(z,ff_cos_65536,8192);
+}
+
+static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
+    fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
+    fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
+};
+static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
+    fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
+    fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
+    fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
+};
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
+{
+     fft_dispatch_vsx_interleave[s->nbits-2](z);
+}
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
+{
+     fft_dispatch_vsx[s->nbits-2](z);
+}
+#endif /* HAVE_VSX */
diff --git a/libavcodec/ppc/fft_vsx.h b/libavcodec/ppc/fft_vsx.h
new file mode 100644
index 0000000..1e44031
--- /dev/null
+++ b/libavcodec/ppc/fft_vsx.h
@@ -0,0 +1,829 @@
+#ifndef AVCODEC_PPC_FFT_VSX_H
+#define AVCODEC_PPC_FFT_VSX_H
+/*
+ * FFT  transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+
+#if HAVE_VSX
+
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
+
+
+#define byte_2complex (2*sizeof(FFTComplex))
+#define byte_4complex (4*sizeof(FFTComplex))
+#define byte_6complex (6*sizeof(FFTComplex))
+#define byte_8complex (8*sizeof(FFTComplex))
+#define byte_10complex (10*sizeof(FFTComplex))
+#define byte_12complex (12*sizeof(FFTComplex))
+#define byte_14complex (14*sizeof(FFTComplex))
+
+inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
+{
+    int o1 = n<<1;
+    int o2 = n<<2;
+    int o3 = o1+o2;
+    int i1, i2, i3;
+    FFTSample* out = (FFTSample*)z;
+    const FFTSample *wim = wre+o1;
+    vec_f vz0, vzo1, vzo2, vzo3;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
+    vec_f y0, y1, y2, y3;
+    vec_f y4, y5, y8, y9;
+    vec_f y10, y13, y14, y15;
+    vec_f y16, y17, y18, y19;
+    vec_f y20, y21, y22, y23;
+    vec_f wr1, wi1, wr0, wi0;
+    vec_f wr2, wi2, wr3, wi3;
+    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
+
+    n = n-2;
+    i1 = o1*sizeof(FFTComplex);
+    i2 = o2*sizeof(FFTComplex);
+    i3 = o3*sizeof(FFTComplex);
+    vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
+    vzo2plus1 = vec_ld(i2+16, &(out[0]));
+    vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
+    vzo3plus1 = vec_ld(i3+16, &(out[0]));
+    vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
+    vz0plus1 = vec_ld(16, &(out[0]));
+    vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
+    vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+    x0 = vec_add(vzo2, vzo3);
+    x1 = vec_sub(vzo2, vzo3);
+    y0 = vec_add(vzo2plus1, vzo3plus1);
+    y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+    wr1 = vec_splats(wre[1]);
+    wi1 = vec_splats(wim[-1]);
+    wi2 = vec_splats(wim[-2]);
+    wi3 = vec_splats(wim[-3]);
+    wr2 = vec_splats(wre[2]);
+    wr3 = vec_splats(wre[3]);
+
+    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+
+    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+
+    ymulwi2 = vec_mul(y4, wi2);
+    ymulwi3 = vec_mul(y5, wi3);
+    x4 = vec_mul(x2, wr1);
+    x5 = vec_mul(x3, wi1);
+    y8 = vec_madd(y2, wr2, ymulwi2);
+    y9 = vec_msub(y2, wr2, ymulwi2);
+    x6 = vec_add(x4, x5);
+    x7 = vec_sub(x4, x5);
+    y13 = vec_madd(y3, wr3, ymulwi3);
+    y14 = vec_msub(y3, wr3, ymulwi3);
+
+    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
+    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
+    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
+
+    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+    x11 = vec_add(vz0, x9);
+    x12 = vec_sub(vz0, x9);
+    x13 = vec_add(vzo1, x10);
+    x14 = vec_sub(vzo1, x10);
+
+    y18 = vec_add(vz0plus1, y16);
+    y19 = vec_sub(vz0plus1, y16);
+    y20 = vec_add(vzo1plus1, y17);
+    y21 = vec_sub(vzo1plus1, y17);
+
+    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
+    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
+    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+
+    vec_st(x11, 0, &(out[0]));
+    vec_st(y18, 16, &(out[0]));
+    vec_st(x15, i1, &(out[0]));
+    vec_st(y22, i1+16, &(out[0]));
+    vec_st(x12, i2, &(out[0]));
+    vec_st(y19, i2+16, &(out[0]));
+    vec_st(x16, i3, &(out[0]));
+    vec_st(y23, i3+16, &(out[0]));
+
+    do {
+        out += 8;
+        wre += 4;
+        wim -= 4;
+        wr0 = vec_splats(wre[0]);
+        wr1 = vec_splats(wre[1]);
+        wi0 = vec_splats(wim[0]);
+        wi1 = vec_splats(wim[-1]);
+
+        wr2 = vec_splats(wre[2]);
+        wr3 = vec_splats(wre[3]);
+        wi2 = vec_splats(wim[-2]);
+        wi3 = vec_splats(wim[-3]);
+
+        vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
+        vzo2plus1 = vec_ld(i2+16, &(out[0]));
+        vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
+        vzo3plus1 = vec_ld(i3+16, &(out[0]));
+        vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
+        vz0plus1 = vec_ld(16, &(out[0]));
+        vzo1 = vec_ld(i1, &(out[0])); // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
+        vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+        x0 = vec_add(vzo2, vzo3);
+        x1 = vec_sub(vzo2, vzo3);
+
+        y0 = vec_add(vzo2plus1, vzo3plus1);
+        y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
+        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
+        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+
+        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+        xmulwi0 = vec_mul(x4, wi0);
+        xmulwi1 = vec_mul(x5, wi1);
+
+        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+
+        x8 = vec_madd(x2, wr0, xmulwi0);
+        x9 = vec_msub(x2, wr0, xmulwi0);
+        ymulwi2 = vec_mul(y4, wi2);
+        ymulwi3 = vec_mul(y5, wi3);
+
+        x13 = vec_madd(x3, wr1, xmulwi1);
+        x14 = vec_msub(x3, wr1, xmulwi1);
+
+        y8 = vec_madd(y2, wr2, ymulwi2);
+        y9 = vec_msub(y2, wr2, ymulwi2);
+        y13 = vec_madd(y3, wr3, ymulwi3);
+        y14 = vec_msub(y3, wr3, ymulwi3);
+
+        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
+        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
+
+        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
+        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
+
+        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+        x18 = vec_add(vz0, x16);
+        x19 = vec_sub(vz0, x16);
+        x20 = vec_add(vzo1, x17);
+        x21 = vec_sub(vzo1, x17);
+
+        y18 = vec_add(vz0plus1, y16);
+        y19 = vec_sub(vz0plus1, y16);
+        y20 = vec_add(vzo1plus1, y17);
+        y21 = vec_sub(vzo1plus1, y17);
+
+        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
+        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
+
+        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+        vec_st(x18, 0, &(out[0]));
+        vec_st(y18, 16, &(out[0]));
+        vec_st(x22, i1, &(out[0]));
+        vec_st(y22, i1+16, &(out[0]));
+        vec_st(x19, i2, &(out[0]));
+        vec_st(y19, i2+16, &(out[0]));
+        vec_st(x23, i3, &(out[0]));
+        vec_st(y23, i3+16, &(out[0]));
+    } while (n-=2);
+}
+
+inline static void fft2_vsx_interleave(FFTComplex *z)
+{
+    FFTSample r1, i1;
+
+    r1 = z[0].re - z[1].re;
+    z[0].re += z[1].re;
+    z[1].re = r1;
+
+    i1 = z[0].im - z[1].im;
+    z[0].im += z[1].im;
+    z[1].im = i1;
+ }
+
+inline static void fft4_vsx_interleave(FFTComplex *z)
+{
+    vec_f a, b, c, d;
+    float* out=  (float*)z;
+    a = vec_ld(0, &(out[0]));
+    b = vec_ld(byte_2complex, &(out[0]));
+
+    c = vec_perm(a, b, vcprm(0,1,s2,s1));
+    d = vec_perm(a, b, vcprm(2,3,s0,s3));
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a, b, vcprm(0,1,s0,s1));
+    d = vec_perm(a, b, vcprm(2,3,s3,s2));
+
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+    vec_st(a, 0, &(out[0]));
+    vec_st(b, byte_2complex, &(out[0]));
+}
+
+inline static void fft8_vsx_interleave(FFTComplex *z)
+{
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f x24, x25, x26, x27;
+    vec_f x28, x29, x30, x31;
+    vec_f x32, x33, x34;
+
+    float* out=  (float*)z;
+    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+
+    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
+    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
+
+    x4 = vec_add(x0, x1);
+    x5 = vec_sub(x0, x1);
+    x6 = vec_add(x2, x3);
+    x7 = vec_sub(x2, x3);
+
+    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
+    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
+    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
+    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
+
+    x12 = vec_add(x8, x9);
+    x13 = vec_sub(x8, x9);
+    x14 = vec_add(x10, x11);
+    x15 = vec_sub(x10, x11);
+    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
+    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
+    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
+    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
+    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i
+
+    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
+    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
+    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
+    x24 = vec_add(x22, x23);
+    x25 = vec_sub(x22, x23);
+    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
+
+    x27 = vec_add(x21, x26); // z1.r  z7.r z1.i z3.i
+    x28 = vec_sub(x21, x26); //z5.r  z3.r z5.i z7.i
+
+    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
+    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
+    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r  z4.i  z5.r  z5.i
+    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r  z6.i  z3.r  z7.i
+    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r  z2.i  z3.r  z3.i
+    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i
+
+    vec_st(x29, 0, &(out[0]));
+    vec_st(x33, byte_2complex, &(out[0]));
+    vec_st(x31, byte_4complex, &(out[0]));
+    vec_st(x34, byte_6complex, &(out[0]));
+}
+
+inline static void fft16_vsx_interleave(FFTComplex *z)
+{
+    float* out=  (float*)z;
+    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
+    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f x24, x25, x26, x27;
+    vec_f x28, x29, x30, x31;
+    vec_f x32, x33, x34, x35;
+    vec_f x36, x37, x38, x39;
+    vec_f x40, x41, x42, x43;
+    vec_f x44, x45, x46, x47;
+    vec_f x48, x49, x50, x51;
+    vec_f x52, x53, x54, x55;
+    vec_f x56, x57, x58, x59;
+    vec_f x60, x61, x62, x63;
+    vec_f x64, x65, x66, x67;
+    vec_f x68, x69, x70, x71;
+    vec_f x72, x73, x74, x75;
+    vec_f x76, x77, x78, x79;
+    vec_f x80, x81, x82, x83;
+    vec_f x84, x85, x86;
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+    vz4 = vec_ld(byte_8complex, &(out[0]));
+    vz5 = vec_ld(byte_10complex, &(out[0]));
+    vz6 = vec_ld(byte_12complex, &(out[0]));
+    vz7 = vec_ld(byte_14complex, &(out[0]));
+
+    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
+    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
+    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
+    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
+
+    x8 = vec_add(x0, x1);
+    x9 = vec_sub(x0, x1);
+    x10 = vec_add(x2, x3);
+    x11 = vec_sub(x2, x3);
+
+    x12 = vec_add(x4, x5);
+    x13 = vec_sub(x4, x5);
+    x14 = vec_add(x6, x7);
+    x15 = vec_sub(x6, x7);
+
+    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
+    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
+    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
+    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
+    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
+    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
+    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
+    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
+
+    x24 = vec_add(x16, x17);
+    x25 = vec_sub(x16, x17);
+    x26 = vec_add(x18, x19);
+    x27 = vec_sub(x18, x19);
+    x28 = vec_add(x20, x21);
+    x29 = vec_sub(x20, x21);
+    x30 = vec_add(x22, x23);
+    x31 = vec_sub(x22, x23);
+
+    x32 = vec_add(x24, x26);
+    x33 = vec_sub(x24, x26);
+    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
+
+    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
+    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
+    x37 = vec_add(x35, x36);
+    x38 = vec_sub(x35, x36);
+    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
+
+    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
+    x41 = vec_perm(x26,  x37, vcprm(2,3,s3,s2));
+    x42 = vec_add(x40, x41);
+    x43 = vec_sub(x40, x41);
+    x44 = vec_mul(x42, vc0);
+    x45 = vec_mul(x43, vc0);
+
+    x46 = vec_add(x34, x39);  // z0.r  z0.i  z4.r  z4.i
+    x47 = vec_sub(x34, x39);  // z8.r  z8.i  z12.r  z12.i
+
+    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
+    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
+    x50 = vec_add(x48, x49);
+    x51 = vec_sub(x48, x49);
+    x52 = vec_mul(x50, vc1);
+    x53 = vec_mul(x50, vc2);
+    x54 = vec_mul(x51, vc1);
+    x55 = vec_mul(x51, vc2);
+
+    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
+    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
+    x58 = vec_add(x56, x57);
+    x59 = vec_sub(x56, x57);
+
+    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
+    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
+    x62 = vec_add(x52, x61);
+    x63 = vec_sub(x52, x61);
+    x64 = vec_add(x60, x53);
+    x65 = vec_sub(x60, x53);
+    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
+    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
+
+    x68 = vec_add(x58, x66); // z1.r    z1.i  z3.r    z3.i
+    x69 = vec_sub(x58, x66); // z9.r    z9.i  z11.r  z11.i
+    x70 = vec_add(x59, x67); // z5.r    z5.i  z15.r  z15.i
+    x71 = vec_sub(x59, x67); // z13.r  z13.i z7.r   z7.i
+
+    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
+    x73 = vec_add(x25, x72);
+    x74 = vec_sub(x25, x72);
+    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
+    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
+    x77 = vec_add(x75, x76); // z2.r   z2.i    z6.r    z6.i
+    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i
+
+    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
+    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
+    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
+    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
+    vec_st(x79, 0, &(out[0]));
+    vec_st(x80, byte_2complex, &(out[0]));
+    vec_st(x81, byte_4complex, &(out[0]));
+    vec_st(x82, byte_6complex, &(out[0]));
+    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
+    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
+    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
+    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
+    vec_st(x83, byte_8complex, &(out[0]));
+    vec_st(x84, byte_10complex, &(out[0]));
+    vec_st(x85, byte_12complex, &(out[0]));
+    vec_st(x86, byte_14complex, &(out[0]));
+}
+
+inline static void fft4_vsx(FFTComplex *z)
+{
+    vec_f a, b, c, d;
+    float* out=  (float*)z;
+    a = vec_ld(0, &(out[0]));
+    b = vec_ld(byte_2complex, &(out[0]));
+
+    c = vec_perm(a, b, vcprm(0,1,s2,s1));
+    d = vec_perm(a, b, vcprm(2,3,s0,s3));
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a,b, vcprm(0,s0,1,s1));
+    d = vec_perm(a, b, vcprm(2,s3,3,s2));
+
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a, b, vcprm(0,1,s0,s1));
+    d = vec_perm(a, b, vcprm(2,3,s2,s3));
+
+    vec_st(c, 0, &(out[0]));
+    vec_st(d, byte_2complex, &(out[0]));
+    return;
+}
+
+inline static void fft8_vsx(FFTComplex *z)
+{
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7, vz8;
+
+    float* out=  (float*)z;
+    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+
+    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+
+    vz3 = vec_madd(vz3, vc1, vc0);
+    vz3 = vec_madd(vz8, vc2, vz3);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+
+    vz2 = vec_sub(vz4, vz6);
+    vz3 = vec_sub(vz5, vz7);
+
+    vz0 = vec_add(vz4, vz6);
+    vz1 = vec_add(vz5, vz7);
+
+    vec_st(vz0, 0, &(out[0]));
+    vec_st(vz1, byte_2complex, &(out[0]));
+    vec_st(vz2, byte_4complex, &(out[0]));
+    vec_st(vz3, byte_6complex, &(out[0]));
+    return;
+}
+
+inline static void fft16_vsx(FFTComplex *z)
+{
+    float* out=  (float*)z;
+    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
+    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
+    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
+
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7;
+    vec_f vz8, vz9, vz10, vz11;
+    vec_f vz12, vz13;
+
+    vz0 = vec_ld(byte_8complex, &(out[0]));
+    vz1 = vec_ld(byte_10complex, &(out[0]));
+    vz2 = vec_ld(byte_12complex, &(out[0]));
+    vz3 = vec_ld(byte_14complex, &(out[0]));
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1= vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+
+    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+    vz2 = vec_add(vz10, vz11);
+    vz3 = vec_sub(vz10, vz11);
+    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+    vz0 = vec_add(vz8, vz9);
+    vz1 = vec_sub(vz8, vz9);
+
+    vz3 = vec_madd(vz3, vc1, vc0);
+    vz3 = vec_madd(vz12, vc2, vz3);
+    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+    vz0 = vec_add(vz8, vz9);
+    vz1 = vec_sub(vz8, vz9);
+    vz2 = vec_add(vz10, vz11);
+    vz3 = vec_sub(vz10, vz11);
+
+    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+    vz2 = vec_sub(vz8, vz10);
+    vz3 = vec_sub(vz9, vz11);
+    vz0 = vec_add(vz8, vz10);
+    vz1 = vec_add(vz9, vz11);
+
+    vz8 = vec_madd(vz4, vc3, vc0);
+    vz9 = vec_madd(vz5, vc3, vc0);
+    vz10 = vec_madd(vz6, vc3, vc0);
+    vz11 = vec_madd(vz7, vc3, vc0);
+
+    vz8 = vec_madd(vz5, vc4, vz8);
+    vz9 = vec_madd(vz4, vc5, vz9);
+    vz10 = vec_madd(vz7, vc5, vz10);
+    vz11 = vec_madd(vz6, vc4, vz11);
+
+    vz12 = vec_sub(vz10, vz8);
+    vz10 = vec_add(vz10, vz8);
+
+    vz13 = vec_sub(vz9, vz11);
+    vz11 = vec_add(vz9, vz11);
+
+    vz4 = vec_sub(vz0, vz10);
+    vz0 = vec_add(vz0, vz10);
+
+    vz7= vec_sub(vz3, vz12);
+    vz3= vec_add(vz3, vz12);
+
+    vz5 = vec_sub(vz1, vz11);
+    vz1 = vec_add(vz1, vz11);
+
+    vz6 = vec_sub(vz2, vz13);
+    vz2 = vec_add(vz2, vz13);
+
+    vec_st(vz0, 0, &(out[0]));
+    vec_st(vz1, byte_2complex, &(out[0]));
+    vec_st(vz2, byte_4complex, &(out[0]));
+    vec_st(vz3, byte_6complex, &(out[0]));
+    vec_st(vz4, byte_8complex, &(out[0]));
+    vec_st(vz5, byte_10complex, &(out[0]));
+    vec_st(vz6, byte_12complex, &(out[0]));
+    vec_st(vz7, byte_14complex, &(out[0]));
+    return;
+
+}
+inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
+{
+    int o1 = n<<1;
+    int o2 = n<<2;
+    int o3 = o1+o2;
+    int i1, i2, i3;
+    FFTSample* out = (FFTSample*)z;
+    const FFTSample *wim = wre+o1;
+    vec_f v0, v1, v2, v3;
+    vec_f v4, v5, v6, v7;
+    vec_f v8, v9, v10, v11;
+    vec_f v12, v13;
+
+    n = n-2;
+    i1 = o1*sizeof(FFTComplex);
+    i2 = o2*sizeof(FFTComplex);
+    i3 = o3*sizeof(FFTComplex);
+
+    v8 = vec_ld(0, &(wre[0]));
+    v10 = vec_ld(0, &(wim[0]));
+    v9 = vec_ld(0, &(wim[-4]));
+    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+    v4 = vec_ld(i2, &(out[0]));
+    v5 = vec_ld(i2+16, &(out[0]));
+    v6 = vec_ld(i3, &(out[0]));
+    v7 = vec_ld(i3+16, &(out[0]));
+    v10 = vec_mul(v4, v8); // r2*wre
+    v11 = vec_mul(v5, v8); // i2*wre
+    v12 = vec_mul(v6, v8); // r3*wre
+    v13 = vec_mul(v7, v8); // i3*wre
+
+    v0 = vec_ld(0, &(out[0])); // r0
+    v3 = vec_ld(i1+16, &(out[0])); // i1
+    v10 = vec_madd(v5, v9, v10); // r2*wim
+    v11 = vec_nmsub(v4, v9, v11); // i2*wim
+    v12 = vec_nmsub(v7, v9, v12); // r3*wim
+    v13 = vec_madd(v6, v9, v13); // i3*wim
+
+    v1 = vec_ld(16, &(out[0])); // i0
+    v2 = vec_ld(i1, &(out[0])); // r1
+    v8 = vec_sub(v12, v10);
+    v12 = vec_add(v12, v10);
+    v9 = vec_sub(v11, v13);
+    v13 = vec_add(v11, v13);
+    v4 = vec_sub(v0, v12);
+    v0 = vec_add(v0, v12);
+    v7 = vec_sub(v3, v8);
+    v3 = vec_add(v3, v8);
+
+    vec_st(v0, 0, &(out[0])); // r0
+    vec_st(v3, i1+16, &(out[0])); // i1
+    vec_st(v4, i2, &(out[0])); // r2
+    vec_st(v7, i3+16, &(out[0]));// i3
+
+    v5 = vec_sub(v1, v13);
+    v1 = vec_add(v1, v13);
+    v6 = vec_sub(v2, v9);
+    v2 = vec_add(v2, v9);
+
+    vec_st(v1, 16, &(out[0])); // i0
+    vec_st(v2, i1, &(out[0])); // r1
+    vec_st(v5, i2+16, &(out[0])); // i2
+    vec_st(v6, i3, &(out[0])); // r3
+
+    do {
+        out += 8;
+        wre += 4;
+        wim -= 4;
+
+        v8 = vec_ld(0, &(wre[0]));
+        v10 = vec_ld(0, &(wim[0]));
+        v9 = vec_ld(0, &(wim[-4]));
+        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+        v4 = vec_ld(i2, &(out[0])); // r2
+        v5 = vec_ld(i2+16, &(out[0])); // i2
+        v6 = vec_ld(i3, &(out[0])); // r3
+        v7 = vec_ld(i3+16, &(out[0]));// i3
+        v10 = vec_mul(v4, v8); // r2*wre
+        v11 = vec_mul(v5, v8); // i2*wre
+        v12 = vec_mul(v6, v8); // r3*wre
+        v13 = vec_mul(v7, v8); // i3*wre
+
+        v0 = vec_ld(0, &(out[0])); // r0
+        v3 = vec_ld(i1+16, &(out[0])); // i1
+        v10 = vec_madd(v5, v9, v10); // r2*wim
+        v11 = vec_nmsub(v4, v9, v11); // i2*wim
+        v12 = vec_nmsub(v7, v9, v12); // r3*wim
+        v13 = vec_madd(v6, v9, v13); // i3*wim
+
+        v1 = vec_ld(16, &(out[0])); // i0
+        v2 = vec_ld(i1, &(out[0])); // r1
+        v8 = vec_sub(v12, v10);
+        v12 = vec_add(v12, v10);
+        v9 = vec_sub(v11, v13);
+        v13 = vec_add(v11, v13);
+        v4 = vec_sub(v0, v12);
+        v0 = vec_add(v0, v12);
+        v7 = vec_sub(v3, v8);
+        v3 = vec_add(v3, v8);
+
+        vec_st(v0, 0, &(out[0])); // r0
+        vec_st(v3, i1+16, &(out[0])); // i1
+        vec_st(v4, i2, &(out[0])); // r2
+        vec_st(v7, i3+16, &(out[0])); // i3
+
+        v5 = vec_sub(v1, v13);
+        v1 = vec_add(v1, v13);
+        v6 = vec_sub(v2, v9);
+        v2 = vec_add(v2, v9);
+
+        vec_st(v1, 16, &(out[0])); // i0
+        vec_st(v2, i1, &(out[0])); // r1
+        vec_st(v5, i2+16, &(out[0])); // i2
+        vec_st(v6, i3, &(out[0])); // r3
+    } while (n-=2);
+}
+
+#endif
+
+#endif /* AVCODEC_PPC_FFT_VSX_H */
diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c
index 153f44a..7323eff 100644
--- a/libavcodec/ppc/fmtconvert_altivec.c
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fmtconvert.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
                                                float mul, int len)
@@ -52,15 +52,15 @@ static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
     }
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c,
                                      AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/h264chroma_init.c b/libavcodec/ppc/h264chroma_init.c
index f8392c2..bd0d213 100644
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 
 #include "libavcodec/h264chroma.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
 
@@ -47,11 +47,11 @@
 #undef OP_U8_ALTIVEC
 #undef PREFIX_h264_chroma_mc8_altivec
 #undef PREFIX_h264_chroma_mc8_num
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     const int high_bit_depth = bit_depth > 8;
 
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
@@ -61,5 +61,5 @@ av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth)
         c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
         c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index daa7652..8f43e5d 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -1,30 +1,31 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/mem.h"
+#include "libavutil/ppc/util_altivec.h"
 
 /* this code assume that stride % 16 == 0 */
 
 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
-        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
-        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
+        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
+        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
         psum = vec_mladd(vB, vsrc1ssH, psum);\
@@ -49,8 +50,8 @@
 
 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
 \
-        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
-        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
+        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
+        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
         psum = vec_mladd(vE, vsrc1ssH, psum);\
@@ -70,6 +71,43 @@
 #define noop(a) a
 #define add28(a) vec_add(v28ss, a)
 
+#if HAVE_BIGENDIAN
+#define GET_VSRC1(vs0, off, b, perm0, s){    \
+    vec_u8 vsrcCuc, vsrcDuc;                 \
+    vsrcCuc = vec_ld(off, s);                \
+    if (loadSecond){                         \
+        vsrcDuc = vec_ld(off + b, s);        \
+    } else                                   \
+        vsrcDuc = vsrcCuc;                   \
+                                             \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vec_u8 vsrcCuc, vsrcDuc;                         \
+    vsrcCuc = vec_ld(off, s);                        \
+    if (loadSecond){                                 \
+        vsrcDuc = vec_ld(off + b, s);                \
+    } else                                           \
+        vsrcDuc = vsrcCuc;                           \
+                                                     \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);         \
+    if (reallyBadAlign){                             \
+        vs1 = vsrcDuc;                               \
+    } else                                           \
+        vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1);     \
+ }
+
+#else
+
+#define GET_VSRC1(vs0, off, b, perm0, s){            \
+    vs0 = vec_vsx_ld(off, s);                        \
+ }
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vs0 = vec_vsx_ld(off, s);                        \
+    vs1 = vec_vsx_ld(off + 1, s);                    \
+ }
+#endif /* HAVE_BIGENDIAN */
+
 #ifdef PREFIX_h264_chroma_mc8_altivec
 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                            ptrdiff_t stride, int h,
@@ -82,23 +120,27 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
     const vec_u16 v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -112,89 +154,28 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);
 
     if (ABCD[3]) {
-        if (!loadSecond) {// -> !reallyBadAlign
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
-        } else {
-            vec_u8 vsrcDuc;
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrcDuc = vec_ld(stride + 16, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                if (reallyBadAlign)
-                    vsrc3uc = vsrcDuc;
-                else
-                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
+        for (i = 0 ; i < h ; i++) {
+            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
         }
     } else {
         const vec_s16 vE = vec_add(vB, vC);
         if (ABCD[2]) { // x == 0 B == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrcDuc = vec_ld(stride + 15, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
+            for (i = 0 ; i < h ; i++) {
+                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
+                vsrc0uc = vsrc1uc;
             }
         } else { // y == 0 C == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrcDuc = vec_ld(15, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    if (reallyBadAlign)
-                        vsrc1uc = vsrcDuc;
-                    else
-                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
+            for (i = 0 ; i < h ; i++) {
+               GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
+               CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
             }
         }
     }
@@ -214,23 +195,27 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
     const vec_u16 v6us  = vec_splat_u16(6);
-    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -244,47 +229,14 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
-
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
-
-    if (!loadSecond) {// -> !reallyBadAlign
-        for (i = 0 ; i < h ; i++) {
-
-
-            vsrcCuc = vec_ld(stride + 0, src);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
 
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
-    } else {
-        vec_u8 vsrcDuc;
-        for (i = 0 ; i < h ; i++) {
-            vsrcCuc = vec_ld(stride + 0, src);
-            vsrcDuc = vec_ld(stride + 16, src);
-
-            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-            if (reallyBadAlign)
-                vsrc3uc = vsrcDuc;
-            else
-                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
+    for (i = 0 ; i < h ; i++) {
+        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
     }
 }
 #endif
diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index 9247cdf..d8a3baa 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 #include "libavcodec/h264dec.h"
 #include "libavcodec/h264dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 /****************************************************************************
  * IDCT transform:
@@ -67,10 +67,17 @@
     b2 = vec_mergeh( a1, a3 ); \
     b3 = vec_mergel( a1, a3 )
 
+#if HAVE_BIGENDIAN
+#define vdst_load(d)              \
+    vdst_orig = vec_ld(0, dst);   \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);
+#else
+#define vdst_load(d) vdst = vec_vsx_ld(0, dst)
+#endif
+
 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
-    vdst_orig = vec_ld(0, dst);                               \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
-    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);         \
+    vdst_load();                                              \
+    vdst_ss = (vec_s16) VEC_MERGEH(zero_u8v, vdst);           \
     va = vec_add(va, vdst_ss);                                \
     va_u8 = vec_packsu(va, zero_s16v);                        \
     va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
@@ -170,26 +177,43 @@ static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
     d7 = vec_sub(b0v, b7v); \
 }
 
+#if HAVE_BIGENDIAN
+#define GET_2PERM(ldv, stv, d)  \
+    ldv = vec_lvsl(0, d);       \
+    stv = vec_lvsr(8, d);
+#define dstv_load(d)            \
+    vec_u8 hv = vec_ld( 0, d ); \
+    vec_u8 lv = vec_ld( 7, d);  \
+    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );
+#define dest_unligned_store(d)                                 \
+    vec_u8 edgehv;                                             \
+    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );  \
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );       \
+    lv    = vec_sel( lv, bodyv, edgelv );                      \
+    vec_st( lv, 7, d );                                        \
+    hv    = vec_ld( 0, d );                                    \
+    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
+    hv    = vec_sel( hv, bodyv, edgehv );                      \
+    vec_st( hv, 0, d );
+#else
+
+#define GET_2PERM(ldv, stv, d) {}
+#define dstv_load(d) vec_u8 dstv = vec_vsx_ld(0, d)
+#define dest_unligned_store(d)\
+    vec_u8 dst8 = vec_perm((vec_u8)idstsum8, dstv, vcprm(2,3,s2,s3));\
+    vec_vsx_st(dst8, 0, d)
+#endif /* HAVE_BIGENDIAN */
+
 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
     /* unaligned load */                                       \
-    vec_u8 hv = vec_ld( 0, dest );                           \
-    vec_u8 lv = vec_ld( 7, dest );                           \
-    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
+    dstv_load(dest);                                           \
     vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
+    vec_u16 dst16 = (vec_u16)VEC_MERGEH(zero_u8v, dstv);   \
     vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
     vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
-    vec_u8 edgehv;                                           \
     /* unaligned store */                                      \
-    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
-    lv    = vec_sel( lv, bodyv, edgelv );                      \
-    vec_st( lv, 7, dest );                                     \
-    hv    = vec_ld( 0, dest );                                 \
-    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
-    hv    = vec_sel( hv, bodyv, edgehv );                      \
-    vec_st( hv, 0, dest );                                     \
- }
+    dest_unligned_store(dest);\
+}
 
 static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
 {
@@ -197,8 +221,8 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
     vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 
-    vec_u8 perm_ldv = vec_lvsl(0, dst);
-    vec_u8 perm_stv = vec_lvsr(8, dst);
+    vec_u8 perm_ldv, perm_stv;
+    GET_2PERM(perm_ldv, perm_stv, dst);
 
     const vec_u16 onev = vec_splat_u16(1);
     const vec_u16 twov = vec_splat_u16(2);
@@ -237,32 +261,41 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
 }
 
+#if HAVE_BIGENDIAN
+#define DST_LD vec_ld
+#else
+#define DST_LD vec_vsx_ld
+#endif
 static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
 {
     vec_s16 dc16;
     vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    vec_s32 v_dc32;
     LOAD_ZERO;
     DECLARE_ALIGNED(16, int, dc);
     int i;
 
     dc = (block[0] + 32) >> 6;
     block[0] = 0;
-    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+    v_dc32 = vec_lde(0, &dc);
+    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);
 
     if (size == 4)
-        dc16 = vec_sld(dc16, zero_s16v, 8);
+        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
     dcplus = vec_packsu(dc16, zero_s16v);
     dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
 
+#if HAVE_BIGENDIAN
     aligner = vec_lvsr(0, dst);
     dcplus = vec_perm(dcplus, dcplus, aligner);
     dcminus = vec_perm(dcminus, dcminus, aligner);
+#endif
 
     for (i = 0; i < size; i += 4) {
-        v0 = vec_ld(0, dst+0*stride);
-        v1 = vec_ld(0, dst+1*stride);
-        v2 = vec_ld(0, dst+2*stride);
-        v3 = vec_ld(0, dst+3*stride);
+        v0 = DST_LD(0, dst+0*stride);
+        v1 = DST_LD(0, dst+1*stride);
+        v2 = DST_LD(0, dst+2*stride);
+        v3 = DST_LD(0, dst+3*stride);
 
         v0 = vec_adds(v0, dcplus);
         v1 = vec_adds(v1, dcplus);
@@ -274,10 +307,10 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl
         v2 = vec_subs(v2, dcminus);
         v3 = vec_subs(v3, dcminus);
 
-        vec_st(v0, 0, dst+0*stride);
-        vec_st(v1, 0, dst+1*stride);
-        vec_st(v2, 0, dst+2*stride);
-        vec_st(v3, 0, dst+3*stride);
+        VEC_ST(v0, 0, dst+0*stride);
+        VEC_ST(v1, 0, dst+1*stride);
+        VEC_ST(v2, 0, dst+2*stride);
+        VEC_ST(v3, 0, dst+3*stride);
 
         dst += 4*stride;
     }
@@ -496,7 +529,7 @@ static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
 
     register vec_u8 average = vec_avg(p0, q0);
     register vec_u8 temp;
-    register vec_u8 uncliped;
+    register vec_u8 unclipped;
     register vec_u8 ones;
     register vec_u8 max;
     register vec_u8 min;
@@ -506,10 +539,10 @@ static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
     average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
     ones = vec_splat_u8(1);
     temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
-    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
+    unclipped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
     max = vec_adds(p1, tc0);
     min = vec_subs(p1, tc0);
-    newp1 = vec_max(min, uncliped);
+    newp1 = vec_max(min, unclipped);
     newp1 = vec_min(max, newp1);
     return newp1;
 }
@@ -592,7 +625,7 @@ static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
     q1 = newq1;                                                                              \
 }
 
-static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
+static void h264_v_loop_filter_luma_altivec(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
 
     if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
         register vec_u8 p2 = vec_ld(-3*stride, pix);
@@ -609,7 +642,7 @@ static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
     }
 }
 
-static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
+static void h264_h_loop_filter_luma_altivec(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
 
     register vec_u8 line0, line1, line2, line3, line4, line5;
     if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
@@ -638,6 +671,9 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     temp[2] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweight = vec_splat(vtemp, 3);
     voffset = vec_splat(vtemp, 5);
@@ -646,8 +682,8 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vblock);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vblock);
 
         if (w == 16 || aligned) {
             v0 = vec_mladd(v0, vweight, zero_s16v);
@@ -684,6 +720,9 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
     temp[3] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweights = vec_splat(vtemp, 3);
     vweightd = vec_splat(vtemp, 5);
@@ -695,10 +734,10 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
-        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
-        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vdst);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vdst);
+        v2 = (vec_s16)VEC_MERGEH(zero_u8v, vsrc);
+        v3 = (vec_s16)VEC_MERGEL(zero_u8v, vsrc);
 
         if (w == 8) {
             if (src_aligned)
@@ -732,12 +771,12 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
 }
 
 #define H264_WEIGHT(W) \
-static void weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
+static void weight_h264_pixels ## W ## _altivec(uint8_t *block, ptrdiff_t stride, int height, \
                                                 int log2_denom, int weight, int offset) \
 { \
     weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
-static void biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
+static void biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, \
                                                   int log2_denom, int weightd, int weights, int offset) \
 { \
     biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
@@ -745,12 +784,12 @@ static void biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, in
 
 H264_WEIGHT(16)
 H264_WEIGHT( 8)
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -772,5 +811,5 @@ av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
         c->biweight_h264_pixels_tab[0] = biweight_h264_pixels16_altivec;
         c->biweight_h264_pixels_tab[1] = biweight_h264_pixels8_altivec;
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/h264qpel.c b/libavcodec/ppc/h264qpel.c
index 5da09bf..bef421f 100644
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
@@ -193,86 +193,79 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, cons
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 }\
 
+#if HAVE_BIGENDIAN
+#define put_unligned_store(s, dest) {    \
+    tmp1 = vec_ld(0, dest);              \
+    mask = vec_lvsl(0, dest);            \
+    tmp2 = vec_ld(15, dest);             \
+    edges = vec_perm(tmp2, tmp1, mask);  \
+    align = vec_lvsr(0, dest);           \
+    tmp2 = vec_perm(s, edges, align);    \
+    tmp1 = vec_perm(edges, s, align);    \
+    vec_st(tmp2, 15, dest);              \
+    vec_st(tmp1, 0 , dest);              \
+ }
+#else
+#define put_unligned_store(s, dest) vec_vsx_st(s, 0, dest);
+#endif /* HAVE_BIGENDIAN */
+
 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                     const uint8_t * src2, int dst_stride,
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
-
+    vec_u8 a, b, d, mask_;
+#if HAVE_BIGENDIAN
+    vec_u8 tmp1, tmp2, mask, edges, align;
     mask_ = vec_lvsl(0, src2);
+#endif
 
     for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
+        a = unaligned_load(i * src_stride1, src1);
+        b = load_with_perm_vec(i * 16, src2, mask_);
         d = vec_avg(a, b);
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
+        put_unligned_store(d, dst);
         dst += dst_stride;
     }
 }
 
+#if HAVE_BIGENDIAN
+#define avg_unligned_store(s, dest){            \
+    tmp1 = vec_ld(0, dest);                     \
+    mask = vec_lvsl(0, dest);                   \
+    tmp2 = vec_ld(15, dest);                    \
+    a = vec_avg(vec_perm(tmp1, tmp2, mask), s); \
+    edges = vec_perm(tmp2, tmp1, mask);         \
+    align = vec_lvsr(0, dest);                  \
+    tmp2 = vec_perm(a, edges, align);           \
+    tmp1 = vec_perm(edges, a, align);           \
+    vec_st(tmp2, 15, dest);                     \
+    vec_st(tmp1, 0 , dest);                     \
+ }
+#else
+#define avg_unligned_store(s, dest){            \
+    a = vec_avg(vec_vsx_ld(0, dst), s);         \
+    vec_vsx_st(a, 0, dst);                      \
+ }
+#endif /* HAVE_BIGENDIAN */
+
 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                     const uint8_t * src2, int dst_stride,
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8 a, b, d, mask_;
 
+#if HAVE_BIGENDIAN
+    vec_u8 tmp1, tmp2, mask, edges, align;
     mask_ = vec_lvsl(0, src2);
+#endif
 
     for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
-        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
+        a = unaligned_load(i * src_stride1, src1);
+        b = load_with_perm_vec(i * 16, src2, mask_);
+        d = vec_avg(a, b);
+        avg_unligned_store(d, dst);
         dst += dst_stride;
     }
 }
@@ -284,11 +277,11 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
 
 H264_MC(put_, 16, altivec)
 H264_MC(avg_, 16, altivec)
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     const int high_bit_depth = bit_depth > 8;
 
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
@@ -317,5 +310,5 @@ av_cold void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth)
         dspfunc(avg_h264_qpel, 0, 16);
 #undef dspfunc
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/h264qpel_template.c b/libavcodec/ppc/h264qpel_template.c
index 6de063a..304604c 100644
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@ -1,30 +1,103 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/mem.h"
+#include "config.h"
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 
-#ifdef DEBUG
-#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
+
+#if HAVE_BIGENDIAN
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
+    vec_u8 srcR1 = vec_ld(-2, s);\
+    vec_u8 srcR2 = vec_ld(14, s);\
+    switch (ali) {\
+    default: {\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = vec_perm(srcR1, srcR2, pp2);\
+        srcP3 = vec_perm(srcR1, srcR2, pp3);\
+    } break;\
+    case 11: {\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = vec_perm(srcR1, srcR2, pp2);\
+        srcP3 = srcR2;\
+    } break;\
+    case 12: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = srcR2;\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 13: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = srcR2;\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 14: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = srcR2;\
+        srcP1 = vec_perm(srcR2, srcR3, pp1);\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 15: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = srcR2;\
+        srcP0 = vec_perm(srcR2, srcR3, pp0);\
+        srcP1 = vec_perm(srcR2, srcR3, pp1);\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    }\
+ }
 #else
-#define ASSERT_ALIGNED(ptr) ;
-#endif
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
+    srcM2 =  vec_vsx_ld(-2, s);\
+    srcM1 = vec_vsx_ld(-1, s);\
+    srcP0 = vec_vsx_ld(0, s);\
+    srcP1 = vec_vsx_ld(1, s);\
+    srcP2 = vec_vsx_ld(2, s);\
+    srcP3 = vec_vsx_ld(3, s);\
+ }
+#endif /* HAVE_BIGENDIAN */
 
 /* this code assume stride % 16 == 0 */
 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
@@ -35,12 +108,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
     register int i;
 
     LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
+    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
     const vec_s16 v5ss = vec_splat_s16(5);
     const vec_u16 v5us = vec_splat_u16(5);
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
@@ -59,79 +127,32 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
 
     vec_u8 sum, fsum;
 
+#if HAVE_BIGENDIAN
+    permM2 = vec_lvsl(-2, src);
+    permM1 = vec_lvsl(-1, src);
+    permP0 = vec_lvsl(+0, src);
+    permP1 = vec_lvsl(+1, src);
+    permP2 = vec_lvsl(+2, src);
+    permP3 = vec_lvsl(+3, src);
+#endif /* HAVE_BIGENDIAN */
+
     for (i = 0 ; i < 16 ; i ++) {
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
+
+        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
+        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
 
         sum1A = vec_adds(srcP0A, srcP1A);
         sum1B = vec_adds(srcP0B, srcP1B);
@@ -178,7 +199,10 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
     register int i;
 
     LOAD_ZERO;
-    const vec_u8 perm = vec_lvsl(0, src);
+    vec_u8 perm;
+#if HAVE_BIGENDIAN
+    perm = vec_lvsl(0, src);
+#endif
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
     const vec_u16 v5us = vec_splat_u16(5);
     const vec_s16 v5ss = vec_splat_s16(5);
@@ -186,52 +210,41 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
 
     const uint8_t *srcbis = src - (srcStride * 2);
 
-    const vec_u8 srcM2a = vec_ld(0, srcbis);
-    const vec_u8 srcM2b = vec_ld(16, srcbis);
-    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcM1b = vec_ld(16, srcbis);
-    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP0b = vec_ld(16, srcbis);
-    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP1b = vec_ld(16, srcbis);
-    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP2b = vec_ld(16, srcbis);
-    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
-    //srcbis += srcStride;
-
-    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
-    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
-    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
-    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
-    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
+    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+
+    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
+    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
 
     vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
               psumA, psumB, sumA, sumB,
               srcP3ssA, srcP3ssB,
               sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
 
-    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
+    vec_u8 sum, fsum, srcP3;
 
     for (i = 0 ; i < 16 ; i++) {
-        srcP3a = vec_ld(0, srcbis += srcStride);
-        srcP3b = vec_ld(16, srcbis);
-        srcP3 = vec_perm(srcP3a, srcP3b, perm);
-        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
-        //srcbis += srcStride;
+        srcP3 = load_with_perm_vec(0, srcbis, perm);
+        srcbis += srcStride;
+
+        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
 
         sum1A = vec_adds(srcP0ssA, srcP1ssA);
         sum1B = vec_adds(srcP0ssB, srcP1ssB);
@@ -288,12 +301,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
 {
     register int i;
     LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
+    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
     const vec_u32 v10ui = vec_splat_u32(10);
     const vec_s16 v5ss = vec_splat_s16(5);
@@ -325,81 +333,35 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
     vec_u8 fsum, sumv, sum;
     vec_s16 ssume, ssumo;
 
+#if HAVE_BIGENDIAN
+    permM2 = vec_lvsl(-2, src);
+    permM1 = vec_lvsl(-1, src);
+    permP0 = vec_lvsl(+0, src);
+    permP1 = vec_lvsl(+1, src);
+    permP2 = vec_lvsl(+2, src);
+    permP3 = vec_lvsl(+3, src);
+#endif /* HAVE_BIGENDIAN */
+
     src -= (2 * srcStride);
     for (i = 0 ; i < 21 ; i ++) {
         vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+
+        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
+
+        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
+        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
 
         sum1A = vec_adds(srcP0A, srcP1A);
         sum1B = vec_adds(srcP0B, srcP1B);
@@ -448,8 +410,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
         const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
         const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
         const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
-        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
-        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
 
         tmpbis += tmpStride;
 
@@ -474,10 +436,14 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
         pp2Be = vec_mule(sum2B, v5ss);
         pp2Bo = vec_mulo(sum2B, v5ss);
 
-        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
         pp3Ao = vec_mulo(sum3A, v1ss);
-        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
         pp3Bo = vec_mulo(sum3B, v1ss);
+#if !HAVE_BIGENDIAN
+        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
+        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
+#endif
+        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
+        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
 
         pp1cAe = vec_add(pp1Ae, v512si);
         pp1cAo = vec_add(pp1Ao, v512si);
diff --git a/libavcodec/ppc/hevcdsp.c b/libavcodec/ppc/hevcdsp.c
index f77943b..c1d562a 100644
--- a/libavcodec/ppc/hevcdsp.c
+++ b/libavcodec/ppc/hevcdsp.c
@@ -1,20 +1,21 @@
-/* SIMD-optimized IDCT functions for HEVC decoding
+/*
+ * SIMD-optimized IDCT functions for HEVC decoding
  * Copyright (c) Alexandra Hajkova
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,8 +41,9 @@ static const vec_u8 mask[2] = {
     { 0x04, 0x05, 0x0C, 0x0D, 0x14, 0x15, 0x1C, 0x1D, 0x06, 0x07, 0x0E, 0x0F, 0x16, 0x17, 0x1E, 0x1F },
 };
 
-static void transform4x4(vec_s16 src_01, vec_s16 src_23, vec_s32 res[4],
-                         const int shift, int16_t *coeffs)
+static av_always_inline void transform4x4(vec_s16 src_01, vec_s16 src_23,
+                                          vec_s32 res[4], const int shift,
+                                          int16_t *coeffs)
 {
     vec_s16 src_02, src_13;
     vec_s32 zero = vec_splat_s32(0);
@@ -56,7 +58,13 @@ static void transform4x4(vec_s16 src_01, vec_s16 src_23, vec_s32 res[4],
     e1 = vec_msums(src_02, trans4[2], zero);
     o1 = vec_msums(src_13, trans4[3], zero);
 
-    add = vec_sl(vec_splat_s32(1), vec_splat_u32(shift - 1));
+    switch(shift) {
+    case  7: add = vec_sl(vec_splat_s32(1), vec_splat_u32( 7 - 1)); break;
+    case 10: add = vec_sl(vec_splat_s32(1), vec_splat_u32(10 - 1)); break;
+    case 12: add = vec_sl(vec_splat_s32(1), vec_splat_u32(12 - 1)); break;
+    default: abort();
+    }
+
     e0 = vec_add(e0, add);
     e1 = vec_add(e1, add);
 
@@ -66,10 +74,18 @@ static void transform4x4(vec_s16 src_01, vec_s16 src_23, vec_s32 res[4],
     res[3] = vec_sub(e0, o0);
 }
 
-static void scale(vec_s32 res[4], vec_s16 res_packed[2], int shift)
+static av_always_inline void scale(vec_s32 res[4], vec_s16 res_packed[2],
+                                   const int shift)
 {
     int i;
-    vec_u32 v_shift = vec_splat_u32(shift);
+    vec_u32 v_shift;
+
+    switch(shift) {
+    case  7: v_shift = vec_splat_u32(7) ; break;
+    case 10: v_shift = vec_splat_u32(10); break;
+    case 12: v_shift = vec_splat_u32(12); break;
+    default: abort();
+    }
 
     for (i = 0; i < 4; i++)
         res[i] = vec_sra(res[i], v_shift);
diff --git a/libavcodec/ppc/hevcdsp_template.c b/libavcodec/ppc/hevcdsp_template.c
index 368ae91..2b6411b 100644
--- a/libavcodec/ppc/hevcdsp_template.c
+++ b/libavcodec/ppc/hevcdsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) Alexandra Hajkova
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 405b918..a531b6b 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,16 +31,15 @@
 
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 /* next one assumes that ((line_size % 16) == 0) */
 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
+    register vector unsigned char pixelsv1;
+    register vector unsigned char pixelsv1B;
+    register vector unsigned char pixelsv1C;
+    register vector unsigned char pixelsv1D;
 
-    register vector unsigned char perm = vec_lvsl(0, pixels);
     int i;
     register ptrdiff_t line_size_2 = line_size << 1;
     register ptrdiff_t line_size_3 = line_size + line_size_2;
@@ -52,22 +51,14 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, pixels);
-        pixelsv2  = vec_ld(15, pixels);
-        pixelsv1B = vec_ld(line_size, pixels);
-        pixelsv2B = vec_ld(15 + line_size, pixels);
-        pixelsv1C = vec_ld(line_size_2, pixels);
-        pixelsv2C = vec_ld(15 + line_size_2, pixels);
-        pixelsv1D = vec_ld(line_size_3, pixels);
-        pixelsv2D = vec_ld(15 + line_size_3, pixels);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               line_size, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               line_size_2, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               line_size_3, (unsigned char*)block);
+        pixelsv1  = unaligned_load( 0, pixels);
+        pixelsv1B = unaligned_load(line_size, pixels);
+        pixelsv1C = unaligned_load(line_size_2, pixels);
+        pixelsv1D = unaligned_load(line_size_3, pixels);
+        VEC_ST(pixelsv1, 0, (unsigned char*)block);
+        VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
+        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
+        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
         pixels+=line_size_4;
         block +=line_size_4;
     }
@@ -77,15 +68,12 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
-    register vector unsigned char perm = vec_lvsl(0, pixels);
-    int i;
+    register vector unsigned char pixelsv, blockv;
 
+    int i;
     for (i = 0; i < h; i++) {
-        pixelsv1 = vec_ld( 0, pixels);
-        pixelsv2 = vec_ld(16,pixels);
         blockv = vec_ld(0, block);
-        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+        pixelsv = VEC_LD( 0, pixels);
         blockv = vec_avg(blockv,pixelsv);
         vec_st(blockv, 0, (unsigned char*)block);
         pixels+=line_size;
@@ -96,7 +84,7 @@ void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 /* next one assumes that ((line_size % 8) == 0) */
 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    register vector unsigned char pixelsv, blockv;
     int i;
 
    for (i = 0; i < h; i++) {
@@ -105,9 +93,7 @@ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff
        int rightside = ((unsigned long)block & 0x0000000F);
 
        blockv = vec_ld(0, block);
-       pixelsv1 = vec_ld( 0, pixels);
-       pixelsv2 = vec_ld(16, pixels);
-       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+       pixelsv = VEC_LD( 0, pixels);
 
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
@@ -129,21 +115,16 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -152,17 +133,10 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -188,22 +162,16 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vcone);
@@ -212,17 +180,10 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -248,24 +209,18 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short temp3, temp4,
         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vctwo);
@@ -276,20 +231,13 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
+        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -316,25 +264,19 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short temp3, temp4,
         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vcone);
@@ -343,22 +285,13 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     pixelssum1 = vec_add(pixelssum1, vcone);
 
     for (i = 0; i < h ; i++) {
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
+        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -373,7 +306,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 
         blockv = vec_packsu(temp3, temp4);
 
-        vec_st(blockv, 0, block);
+        VEC_ST(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
@@ -385,7 +318,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2, blocktemp;
+    register vector unsigned char blockv, blocktemp;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
 
     register const vector unsigned char vczero = (const vector unsigned char)
@@ -393,16 +326,10 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned short vctwo = (const vector unsigned short)
                                         vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -411,17 +338,11 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -442,11 +363,11 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         pixels += line_size;
     }
 }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -461,5 +382,5 @@ av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
     c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/hpeldsp_altivec.h b/libavcodec/ppc/hpeldsp_altivec.h
index 98dd80e..590809f 100644
--- a/libavcodec/ppc/hpeldsp_altivec.h
+++ b/libavcodec/ppc/hpeldsp_altivec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
index dc22e15..29f625a 100644
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@ -1,28 +1,28 @@
 /*
  * Copyright (c) 2001 Michel Lespinasse
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /* NOTE: This code is based on GPL code from the libmpeg2 project.  The
  * author, Michel Lespinasses, has given explicit permission to release
- * under LGPL as part of Libav.
+ * under LGPL as part of FFmpeg.
  *
- * Libav integration by Dieter Shirley
+ * FFmpeg integration by Dieter Shirley
  *
  * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
  * project.  I've deleted all of the libmpeg2-specific code, renamed the
@@ -42,7 +42,7 @@
 
 #include "libavcodec/idctdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define IDCT_HALF                                       \
     /* 1st stage */                                     \
@@ -152,6 +152,22 @@ static const vec_s16 constants[5] = {
     { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
 };
 
+static void idct_altivec(int16_t *blk)
+{
+    vec_s16 *block = (vec_s16 *) blk;
+
+    IDCT;
+
+    block[0] = vx0;
+    block[1] = vx1;
+    block[2] = vx2;
+    block[3] = vx3;
+    block[4] = vx4;
+    block[5] = vx5;
+    block[6] = vx6;
+    block[7] = vx7;
+}
+
 static void idct_put_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
 {
     vec_s16 *block = (vec_s16 *) blk;
@@ -192,16 +208,26 @@ static void idct_add_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
 
     IDCT;
 
+#if HAVE_BIGENDIAN
     p0    = vec_lvsl(0, dest);
     p1    = vec_lvsl(stride, dest);
     p     = vec_splat_u8(-1);
     perm0 = vec_mergeh(p, p0);
     perm1 = vec_mergeh(p, p1);
+#endif
 
-#define ADD(dest, src, perm)                                \
-    /* *(uint64_t *) &tmp = *(uint64_t *) dest; */          \
+#if HAVE_BIGENDIAN
+#define GET_TMP2(dest, prm)                                 \
     tmp  = vec_ld(0, dest);                                 \
-    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm);    \
+    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, prm);
+#else
+#define GET_TMP2(dest, prm)                                 \
+    tmp  = vec_vsx_ld(0, dest);                             \
+    tmp2 = (vec_s16) vec_mergeh(tmp, (vec_u8) zero)
+#endif
+
+#define ADD(dest, src, perm)                                \
+    GET_TMP2(dest, perm);                                   \
     tmp3 = vec_adds(tmp2, src);                             \
     tmp  = vec_packsu(tmp3, tmp3);                          \
     vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
@@ -224,22 +250,23 @@ static void idct_add_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
     ADD(dest, vx7, perm1);
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    if (!high_bit_depth) {
-        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
+    if (!high_bit_depth && avctx->lowres == 0) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
+            c->idct      = idct_altivec;
             c->idct_add  = idct_add_altivec;
             c->idct_put  = idct_put_altivec;
             c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/apedsp_altivec.c b/libavcodec/ppc/lossless_audiodsp_altivec.c
index 7c9d02a..298e6c3 100644
--- a/libavcodec/ppc/apedsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,23 @@
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/util_altivec.h"
 
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_BIGENDIAN
+#define GET_T(tt0,tt1,src,a,b){       \
+        a = vec_ld(16, src);          \
+        tt0 = vec_perm(b, a, align);  \
+        b = vec_ld(32, src);          \
+        tt1 = vec_perm(a, b, align);  \
+ }
+#else
+#define GET_T(tt0,tt1,src,a,b){       \
+        tt0 = vec_vsx_ld(0, src);     \
+        tt1 = vec_vsx_ld(16, src);    \
+ }
+#endif
+
+#if HAVE_ALTIVEC
 static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                     const int16_t *v2,
                                                     const int16_t *v3,
@@ -36,26 +50,23 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
     LOAD_ZERO;
     vec_s16 *pv1 = (vec_s16 *) v1;
     register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
     register vec_s32 res = zero_s32v;
+#if HAVE_BIGENDIAN
     register vec_u8 align = vec_lvsl(0, v2);
+    i2 = vec_ld(0, v2);
+    i3 = vec_ld(0, v3);
+#endif
     int32_t ires;
 
     order >>= 4;
     do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
+        GET_T(t0,t1,v2,i1,i2);
         i0     = pv1[0];
         i1     = pv1[1];
         res    = vec_msum(t0, i0, res);
         res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
+        GET_T(t0,t1,v3,i4,i3);
         pv1[0] = vec_mladd(t0, muls, i0);
         pv1[1] = vec_mladd(t1, muls, i1);
         pv1   += 2;
@@ -67,14 +78,14 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
 
     return ires;
 }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
-av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+av_cold void ff_llauddsp_init_ppc(LLAudDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/lossless_videodsp_altivec.c
index dff2902..980f85b 100644
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/lossless_videodsp_altivec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,10 +27,10 @@
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/util_altivec.h"
 
-#include "libavcodec/huffyuvdsp.h"
+#include "libavcodec/lossless_videodsp.h"
 
 #if HAVE_ALTIVEC
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, ptrdiff_t w)
 {
     register int i;
     register vector unsigned char vdst, vsrc;
@@ -48,7 +48,7 @@ static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
 }
 #endif /* HAVE_ALTIVEC */
 
-av_cold void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c)
+av_cold void ff_llviddsp_init_ppc(LLVidDSPContext *c)
 {
 #if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/ppc/mathops.h b/libavcodec/ppc/mathops.h
index 34ddb11..dbd714f 100644
--- a/libavcodec/ppc/mathops.h
+++ b/libavcodec/ppc/mathops.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2001, 2002 Fabrice Bellard
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/mdct_init.c b/libavcodec/ppc/mdct_init.c
deleted file mode 100644
index 73200a0..0000000
--- a/libavcodec/ppc/mdct_init.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * FFT/IFFT transforms
- * AltiVec-enabled
- * Copyright (c) 2009 Loren Merritt
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/cpu.h"
-#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/util_altivec.h"
-
-#include "libavcodec/fft.h"
-
-/**
- * Do a complex FFT with the parameters defined in ff_fft_init().
- * The input data must be permuted before with s->revtab table.
- * No 1.0 / sqrt(n) normalization is done.
- * AltiVec-enabled:
- * This code assumes that the 'z' pointer is 16 bytes-aligned.
- * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
- */
-
-void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
-
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
-static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
-{
-    int j, k;
-    int n = 1 << s->mdct_bits;
-    int n4 = n >> 2;
-    int n8 = n >> 3;
-    int n32 = n >> 5;
-    const uint16_t *revtabj = s->revtab;
-    const uint16_t *revtabk = s->revtab+n4;
-    const vec_f *tcos = (const vec_f*)(s->tcos+n8);
-    const vec_f *tsin = (const vec_f*)(s->tsin+n8);
-    const vec_f *pin = (const vec_f*)(input+n4);
-    vec_f *pout = (vec_f*)(output+n4);
-
-    /* pre rotation */
-    k = n32-1;
-    do {
-        vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
-#define CMULA(p,o0,o1,o2,o3)\
-        a = pin[ k*2+p];                       /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */\
-        b = pin[-k*2-p-1];                     /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
-        re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */\
-        im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */\
-        cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
-        sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
-        r##p = im*cos - re*sin;\
-        i##p = re*cos + im*sin;
-#define STORE2(v,dst)\
-        j = dst;\
-        vec_ste(v, 0, output+j*2);\
-        vec_ste(v, 4, output+j*2);
-#define STORE8(p)\
-        a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
-        b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
-        c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
-        d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
-        STORE2(a, revtabk[ p*2-4]);\
-        STORE2(b, revtabk[ p*2-3]);\
-        STORE2(c, revtabj[-p*2+2]);\
-        STORE2(d, revtabj[-p*2+3]);
-
-        cos0 = tcos[k];
-        sin0 = tsin[k];
-        cos1 = tcos[-k-1];
-        sin1 = tsin[-k-1];
-        CMULA(0, 0,1,2,3);
-        CMULA(1, 2,3,0,1);
-        STORE8(0);
-        STORE8(1);
-        revtabj += 4;
-        revtabk -= 4;
-        k--;
-    } while(k >= 0);
-
-    ff_fft_calc_altivec(s, (FFTComplex*)output);
-
-    /* post rotation + reordering */
-    j = -n32;
-    k = n32-1;
-    do {
-        vec_f cos,sin,re,im,a,b,c,d;
-#define CMULB(d0,d1,o)\
-        re = pout[o*2];\
-        im = pout[o*2+1];\
-        cos = tcos[o];\
-        sin = tsin[o];\
-        d0 = im*sin - re*cos;\
-        d1 = re*sin + im*cos;
-
-        CMULB(a,b,j);
-        CMULB(c,d,k);
-        pout[2*j]   = vec_perm(a, d, vcprm(0,s3,1,s2));
-        pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
-        pout[2*k]   = vec_perm(c, b, vcprm(0,s3,1,s2));
-        pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
-        j++;
-        k--;
-    } while(k >= 0);
-}
-
-static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
-{
-    int k;
-    int n = 1 << s->mdct_bits;
-    int n4 = n >> 2;
-    int n16 = n >> 4;
-    vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
-    vec_u32 *p0 = (vec_u32*)(output+n4);
-    vec_u32 *p1 = (vec_u32*)(output+n4*3);
-
-    imdct_half_altivec(s, output + n4, input);
-
-    for (k = 0; k < n16; k++) {
-        vec_u32 a = p0[k] ^ sign;
-        vec_u32 b = p1[-k-1];
-        p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
-        p1[k]    = vec_perm(b, b, vcprm(3,2,1,0));
-    }
-}
-#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
-
-av_cold void ff_mdct_init_ppc(FFTContext *s)
-{
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
-    if (!PPC_ALTIVEC(av_get_cpu_flags()))
-        return;
-
-    if (s->mdct_bits >= 5) {
-        s->imdct_calc = imdct_calc_altivec;
-        s->imdct_half = imdct_half_altivec;
-    }
-#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
-}
diff --git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c
index 4e4d8da..17f9a4f 100644
--- a/libavcodec/ppc/me_cmp.c
+++ b/libavcodec/ppc/me_cmp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,27 +31,44 @@
 #include "libavcodec/mpegvideo.h"
 #include "libavcodec/me_cmp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
+
+#if HAVE_BIGENDIAN
+#define GET_PERM(per1, per2, pix) {\
+    per1 = vec_lvsl(0, pix);\
+    per2 = vec_add(per1, vec_splat_u8(1));\
+}
+#define LOAD_PIX(v, iv, pix, per1, per2) {\
+    vector unsigned char pix2l  = vec_ld(0,  pix);\
+    vector unsigned char pix2r  = vec_ld(16, pix);\
+    v  = vec_perm(pix2l, pix2r, per1);\
+    iv = vec_perm(pix2l, pix2r, per2);\
+}
+#else
+#define GET_PERM(per1, per2, pix) {}
+#define LOAD_PIX(v, iv, pix, per1, per2) {\
+    v  = vec_vsx_ld(0,  pix);\
+    iv = vec_vsx_ld(1,  pix);\
+}
+#endif
 static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int __attribute__((aligned(16))) s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+    vector unsigned char perm1, perm2, pix2v, pix2iv;
 
+    GET_PERM(perm1, perm2, pix2);
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
         vector unsigned char pix1v  = vec_ld(0,  pix1);
-        vector unsigned char pix2l  = vec_ld(0,  pix2);
-        vector unsigned char pix2r  = vec_ld(16, pix2);
-        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
-        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
+        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
 
         /* Calculate the average vector. */
         vector unsigned char avgv = vec_avg(pix2v, pix2iv);
@@ -77,13 +94,14 @@ static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int  __attribute__((aligned(16))) s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned char pix1v, pix3v, avgv, t5;
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+
     uint8_t *pix3 = pix2 + stride;
 
     /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
@@ -93,19 +111,14 @@ static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]
      * Split the pixel vectors into shorts. */
-    vector unsigned char pix2l = vec_ld(0,  pix2);
-    vector unsigned char pix2r = vec_ld(15, pix2);
-    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
+    vector unsigned char pix2v = VEC_LD(0, pix2);
 
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix3v: pix3[0] - pix3[15] */
         pix1v = vec_ld(0,  pix1);
-
-        pix2l = vec_ld(0,  pix3);
-        pix2r = vec_ld(15, pix3);
-        pix3v = vec_perm(pix2l, pix2r, perm);
+        pix3v = VEC_LD(0,  pix3);
 
         /* Calculate the average vector. */
         avgv = vec_avg(pix2v, pix3v);
@@ -131,20 +144,21 @@ static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int  __attribute__((aligned(16))) s = 0;
     uint8_t *pix3 = pix2 + stride;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
     const vector unsigned short two =
         (const vector unsigned short) vec_splat_u16(2);
     vector unsigned char avgv, t5;
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
     vector unsigned char pix1v, pix3v, pix3iv;
     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
     vector unsigned short avghv, avglv;
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+    vector unsigned char perm1, perm2, pix2v, pix2iv;
+    GET_PERM(perm1, perm2, pix2);
 
     /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
      * iteration becomes pix2 in the next iteration. We can use this
@@ -153,19 +167,16 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
      * Split the pixel vectors into shorts. */
-    vector unsigned char pix2l  = vec_ld(0,  pix2);
-    vector unsigned char pix2r  = vec_ld(16, pix2);
-    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
-    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
-
+    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
     vector unsigned short pix2hv  =
-        (vector unsigned short) vec_mergeh(zero, pix2v);
+        (vector unsigned short) VEC_MERGEH(zero, pix2v);
     vector unsigned short pix2lv  =
-        (vector unsigned short) vec_mergel(zero, pix2v);
+        (vector unsigned short) VEC_MERGEL(zero, pix2v);
     vector unsigned short pix2ihv =
-        (vector unsigned short) vec_mergeh(zero, pix2iv);
+        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
     vector unsigned short pix2ilv =
-        (vector unsigned short) vec_mergel(zero, pix2iv);
+        (vector unsigned short) VEC_MERGEL(zero, pix2iv);
+
     vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
     vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
     vector unsigned short t3, t4;
@@ -175,11 +186,7 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
          * pix1v: pix1[0] - pix1[15]
          * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
         pix1v  = vec_ld(0, pix1);
-
-        pix2l  = vec_ld(0, pix3);
-        pix2r  = vec_ld(16, pix3);
-        pix3v  = vec_perm(pix2l, pix2r, perm1);
-        pix3iv = vec_perm(pix2l, pix2r, perm2);
+        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);
 
         /* Note that AltiVec does have vec_avg, but this works on vector pairs
          * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
@@ -188,10 +195,10 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
          * vectors of shorts and do the averaging by hand. */
 
         /* Split the pixel vectors into shorts. */
-        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
-        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
-        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
-        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
+        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
+        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
+        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
+        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);
 
         /* Do the averaging on them. */
         t3 = vec_add(pix3hv, pix3ihv);
@@ -226,19 +233,17 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
 
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
-        vector unsigned char pix2l = vec_ld(0,  pix2);
-        vector unsigned char pix2r = vec_ld(15, pix2);
-        vector unsigned char t1 = vec_ld(0, pix1);
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t1 =vec_ld(0, pix1);
+        vector unsigned char t2 = VEC_LD(0, pix2);
 
         /* Calculate a sum of abs differences vector. */
         vector unsigned char t3 = vec_max(t1, t2);
@@ -263,14 +268,13 @@ static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
         (vector unsigned char)
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
 
@@ -278,14 +282,10 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
          * mask out the last 8 pixels. The 0s don't change the sum. */
-        vector unsigned char pix1l = vec_ld(0, pix1);
-        vector unsigned char pix1r = vec_ld(7, pix1);
-        vector unsigned char pix2l = vec_ld(0, pix2);
-        vector unsigned char pix2r = vec_ld(7, pix2);
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
-                                          permclear);
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
-                                          permclear);
+        vector unsigned char pix1l = VEC_LD(0, pix1);
+        vector unsigned char pix2l = VEC_LD(0, pix2);
+        vector unsigned char t1 = vec_and(pix1l, permclear);
+        vector unsigned char t2 = vec_and(pix2l, permclear);
 
         /* Calculate a sum of abs differences vector. */
         vector unsigned char t3 = vec_max(t1, t2);
@@ -312,14 +312,13 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
         (vector unsigned char)
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
     vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
 
@@ -327,14 +326,8 @@ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
          * mask out the last 8 pixels. The 0s don't change the sum. */
-        vector unsigned char pix1l = vec_ld(0, pix1);
-        vector unsigned char pix1r = vec_ld(7, pix1);
-        vector unsigned char pix2l = vec_ld(0, pix2);
-        vector unsigned char pix2r = vec_ld(7, pix2);
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
-                                          permclear);
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
-                                          permclear);
+        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
+        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);
 
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
@@ -364,19 +357,17 @@ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
 
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
-        vector unsigned char pix2l = vec_ld(0,  pix2);
-        vector unsigned char pix2r = vec_ld(15, pix2);
         vector unsigned char t1 = vec_ld(0, pix1);
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t2 = VEC_LD(0, pix2);
 
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
@@ -396,15 +387,15 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     /* Sum up the four partial sums, and put the result into s. */
     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
     sumsqr = vec_splat(sumsqr, 3);
-    vec_ste(sumsqr, 0, &s);
 
+    vec_ste(sumsqr, 0, &s);
     return s;
 }
 
 static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
 {
-    int sum;
+    int __attribute__((aligned(16))) sum;
     register const vector unsigned char vzero =
         (const vector unsigned char) vec_splat_u8(0);
     register vector signed short temp0, temp1, temp2, temp3, temp4,
@@ -429,24 +420,19 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
             { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
 
+
 #define ONEITERBUTTERFLY(i, res)                                            \
     {                                                                       \
-        register vector unsigned char src1 = vec_ld(stride * i, src);       \
-        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
-        register vector unsigned char srcO =                                \
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
-        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
-        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
-        register vector unsigned char dstO =                                \
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
+        register vector unsigned char srcO =  unaligned_load(stride * i, src);  \
+        register vector unsigned char dstO = unaligned_load(stride * i, dst);\
                                                                             \
         /* Promote the unsigned chars to signed shorts. */                  \
         /* We're in the 8x8 function, we only care for the first 8. */      \
         register vector signed short srcV =                                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstV =                                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
                                                                             \
         /* subtractions inside the first butterfly */                       \
@@ -458,6 +444,7 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
         register vector signed short op3  = vec_perm(but2, but2, perm3);    \
         res  = vec_mladd(but2, vprod3, op3);                                \
     }
+
         ONEITERBUTTERFLY(0, temp0);
         ONEITERBUTTERFLY(1, temp1);
         ONEITERBUTTERFLY(2, temp2);
@@ -507,13 +494,14 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
         vsum = vec_sum4s(vec_abs(line7C), vsum);
         vsum = vec_sums(vsum, (vector signed int) vzero);
         vsum = vec_splat(vsum, 3);
+
         vec_ste(vsum, 0, &sum);
     }
     return sum;
 }
 
 /*
- * 16x8 works with 16 elements; it allows to avoid replicating loads, and
+ * 16x8 works with 16 elements; it can avoid replicating loads, and
  * gives the compiler more room for scheduling. It's only used from
  * inside hadamard8_diff16_altivec.
  *
@@ -533,7 +521,7 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
 static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                       uint8_t *src, ptrdiff_t stride, int h)
 {
-    int sum;
+    int __attribute__((aligned(16))) sum;
     register vector signed short
         temp0 __asm__ ("v0"),
         temp1 __asm__ ("v1"),
@@ -581,31 +569,23 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
 
 #define ONEITERBUTTERFLY(i, res1, res2)                                     \
     {                                                                       \
-        register vector unsigned char src1 __asm__ ("v22") =                \
-            vec_ld(stride * i, src);                                        \
-        register vector unsigned char src2 __asm__ ("v23") =                \
-            vec_ld(stride * i + 16, src);                                   \
         register vector unsigned char srcO __asm__ ("v22") =                \
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
-        register vector unsigned char dst1 __asm__ ("v24") =                \
-            vec_ld(stride * i, dst);                                        \
-        register vector unsigned char dst2 __asm__ ("v25") =                \
-            vec_ld(stride * i + 16, dst);                                   \
+            unaligned_load(stride * i, src);                                    \
         register vector unsigned char dstO __asm__ ("v23") =                \
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
+            unaligned_load(stride * i, dst);\
                                                                             \
         /* Promote the unsigned chars to signed shorts. */                  \
         register vector signed short srcV __asm__ ("v24") =                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstV __asm__ ("v25") =                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
         register vector signed short srcW __asm__ ("v26") =                 \
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstW __asm__ ("v27") =                 \
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
                                                                             \
         /* subtractions inside the first butterfly */                       \
@@ -636,6 +616,7 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
         res1 = vec_mladd(but2, vprod3, op3);                                \
         res2 = vec_mladd(but2S, vprod3, op3S);                              \
     }
+
         ONEITERBUTTERFLY(0, temp0, temp0S);
         ONEITERBUTTERFLY(1, temp1, temp1S);
         ONEITERBUTTERFLY(2, temp2, temp2S);
@@ -722,6 +703,7 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
         vsum = vec_sum4s(vec_abs(line7CS), vsum);
         vsum = vec_sums(vsum, (vector signed int) vzero);
         vsum = vec_splat(vsum, 3);
+
         vec_ste(vsum, 0, &sum);
     }
     return sum;
@@ -739,11 +721,11 @@ static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
     }
     return score;
 }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -760,5 +742,5 @@ av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
 
     c->hadamard8_diff[0] = hadamard8_diff16_altivec;
     c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/mpegaudiodsp_altivec.c b/libavcodec/ppc/mpegaudiodsp_altivec.c
index 4c07131..efa9fd5 100644
--- a/libavcodec/ppc/mpegaudiodsp_altivec.c
+++ b/libavcodec/ppc/mpegaudiodsp_altivec.c
@@ -2,20 +2,20 @@
  * Altivec optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -128,14 +128,14 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     *out = sum;
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpadsp_init_ppc(MPADSPContext *s)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     s->apply_window_float = apply_window_mp3;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 89e15a4..2c6ff91 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -4,20 +4,20 @@
  * dct_unquantize_h263_altivec:
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
@@ -43,8 +43,6 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
-
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
 
@@ -60,6 +58,7 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
         nCoeffs= 63; //does not always use zigzag table
     } else {
         i = 0;
+        av_assert2(s->block_last_index[n]>=0);
         nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
     }
 
@@ -114,11 +113,11 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
     }
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpv_common_init_ppc(MpegEncContext *s)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -127,5 +126,5 @@ av_cold void ff_mpv_common_init_ppc(MpegEncContext *s)
         s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
         s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/mpegvideodsp.c b/libavcodec/ppc/mpegvideodsp.c
index 44ae126..990a974 100644
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 #include "libavcodec/mpegvideodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 /* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
  * to preserve proper dst alignment. */
 static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
@@ -68,7 +68,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
                                                    vec_lvsl(0, src));
 
     if (src_really_odd != 0x0000000F)
-        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+        /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
          * on the second vector. */
         srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
     else
@@ -90,7 +90,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
         srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
 
         if (src_really_odd != 0x0000000F)
-            /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+            /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
              * on the second vector. */
             srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
         else
@@ -125,14 +125,14 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
         src += stride;
     }
 }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->gmc1 = gmc1_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/mpegvideoencdsp.c b/libavcodec/ppc/mpegvideoencdsp.c
index d11f05b..b96487b 100644
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,36 @@
 
 #include "libavcodec/mpegvideoencdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
+#if HAVE_VSX
+static int pix_norm1_altivec(uint8_t *pix, int line_size)
+{
+    int i, s = 0;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sum;
+
+    for (i = 0; i < 16; i++) {
+        /* Read the potentially unaligned pixels. */
+        //vector unsigned char pixl = vec_ld(0,  pix);
+        //vector unsigned char pixr = vec_ld(15, pix);
+        //vector unsigned char pixv = vec_perm(pixl, pixr, perm);
+        vector unsigned char pixv = vec_vsx_ld(0,  pix);
+
+        /* Square the values, and add them to our sum. */
+        sv = vec_msum(pixv, pixv, sv);
+
+        pix += line_size;
+    }
+    /* Sum up the four partial sums, and put the result into s. */
+    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
+    sum = vec_splat(sum, 3);
+    vec_ste(sum, 0, &s);
+    return s;
+}
+#else
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
     int i, s = 0;
@@ -56,7 +84,37 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
 
     return s;
 }
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+static int pix_sum_altivec(uint8_t *pix, int line_size)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+
+    for (i = 0; i < 16; i++) {
+        /* Read the potentially unaligned 16 pixels into t1. */
+        //vector unsigned char pixl = vec_ld(0,  pix);
+        //vector unsigned char pixr = vec_ld(15, pix);
+        //vector unsigned char t1   = vec_perm(pixl, pixr, perm);
+        vector unsigned char t1   = vec_vsx_ld(0,  pix);
 
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t1, sad);
+
+        pix += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+    return s;
+}
+#else
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
     int i, s;
@@ -86,16 +144,18 @@ static int pix_sum_altivec(uint8_t *pix, int line_size)
     return s;
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_VSX */
+
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                          AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->pix_norm1 = pix_norm1_altivec;
     c->pix_sum   = pix_sum_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/pixblockdsp.c b/libavcodec/ppc/pixblockdsp.c
index c9e598b..01d14b4 100644
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,16 +30,44 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
+#if HAVE_VSX
+static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
+                               ptrdiff_t stride)
+{
+    int i;
+    vector unsigned char perm =
+        (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
+            0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+
+    for (i = 0; i < 8; i++) {
+        /* Read potentially unaligned pixels.
+         * We're reading 16 pixels, and actually only want 8,
+         * but we simply ignore the extras. */
+        vector unsigned char bytes = vec_vsx_ld(0, pixels);
+
+        // Convert the bytes into shorts.
+        //vector signed short shorts = (vector signed short) vec_perm(zero, bytes, perm);
+        vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
+
+        // Save the data to the block, we assume the block is 16-byte aligned.
+        vec_vsx_st(shorts, i * 16, (vector signed short *) block);
+
+        pixels += stride;
+    }
+}
+#else
 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                                ptrdiff_t stride)
 {
     int i;
-    vec_u8 perm = vec_lvsl(0, pixels);
     const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
 
     for (i = 0; i < 8; i++) {
+        vec_u8 perm = vec_lvsl(0, pixels);
         /* Read potentially unaligned pixels.
          * We're reading 16 pixels, and actually only want 8,
          * but we simply ignore the extras. */
@@ -57,12 +85,76 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
     }
 }
 
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
+                                const uint8_t *s2, ptrdiff_t stride)
+{
+  int i;
+  const vector unsigned char zero =
+    (const vector unsigned char) vec_splat_u8(0);
+  vector signed short shorts1, shorts2;
+
+  for (i = 0; i < 4; i++) {
+    /* Read potentially unaligned pixels.
+     * We're reading 16 pixels, and actually only want 8,
+     * but we simply ignore the extras. */
+    vector unsigned char bytes = vec_vsx_ld(0,  s1);
+
+    // Convert the bytes into shorts.
+    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the same for the second block of pixels.
+    bytes =vec_vsx_ld(0,  s2);
+
+    // Convert the bytes into shorts.
+    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the subtraction.
+    shorts1 = vec_sub(shorts1, shorts2);
+
+    // Save the data to the block, we assume the block is 16-byte aligned.
+    vec_vsx_st(shorts1, 0, (vector signed short *) block);
+
+    s1    += stride;
+    s2    += stride;
+    block += 8;
+
+    /* The code below is a copy of the code above...
+     * This is a manual unroll. */
+
+    /* Read potentially unaligned pixels.
+     * We're reading 16 pixels, and actually only want 8,
+     * but we simply ignore the extras. */
+    bytes = vec_vsx_ld(0,  s1);
+
+    // Convert the bytes into shorts.
+    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the same for the second block of pixels.
+    bytes = vec_vsx_ld(0,  s2);
+
+    // Convert the bytes into shorts.
+    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the subtraction.
+    shorts1 = vec_sub(shorts1, shorts2);
+
+    // Save the data to the block, we assume the block is 16-byte aligned.
+    vec_vsx_st(shorts1, 0, (vector signed short *) block);
+
+    s1    += stride;
+    s2    += stride;
+    block += 8;
+  }
+}
+#else
 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                 const uint8_t *s2, ptrdiff_t stride)
 {
     int i;
-    vec_u8 perm1 = vec_lvsl(0, s1);
-    vec_u8 perm2 = vec_lvsl(0, s2);
+    vec_u8 perm;
     const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
     vec_s16 shorts1, shorts2;
 
@@ -70,17 +162,19 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
         /* Read potentially unaligned pixels.
          * We're reading 16 pixels, and actually only want 8,
          * but we simply ignore the extras. */
+        perm = vec_lvsl(0, s1);
         vec_u8 pixl  = vec_ld(0,  s1);
         vec_u8 pixr  = vec_ld(15, s1);
-        vec_u8 bytes = vec_perm(pixl, pixr, perm1);
+        vec_u8 bytes = vec_perm(pixl, pixr, perm);
 
         // Convert the bytes into shorts.
         shorts1 = (vec_s16)vec_mergeh(zero, bytes);
 
         // Do the same for the second block of pixels.
+        perm = vec_lvsl(0, s2);
         pixl  = vec_ld(0,  s2);
         pixr  = vec_ld(15, s2);
-        bytes = vec_perm(pixl, pixr, perm2);
+        bytes = vec_perm(pixl, pixr, perm);
 
         // Convert the bytes into shorts.
         shorts2 = (vec_s16)vec_mergeh(zero, bytes);
@@ -101,17 +195,19 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
         /* Read potentially unaligned pixels.
          * We're reading 16 pixels, and actually only want 8,
          * but we simply ignore the extras. */
+        perm = vec_lvsl(0, s1);
         pixl  = vec_ld(0,  s1);
         pixr  = vec_ld(15, s1);
-        bytes = vec_perm(pixl, pixr, perm1);
+        bytes = vec_perm(pixl, pixr, perm);
 
         // Convert the bytes into shorts.
         shorts1 = (vec_s16)vec_mergeh(zero, bytes);
 
         // Do the same for the second block of pixels.
+        perm = vec_lvsl(0, s2);
         pixl  = vec_ld(0,  s2);
         pixr  = vec_ld(15, s2);
-        bytes = vec_perm(pixl, pixr, perm2);
+        bytes = vec_perm(pixl, pixr, perm);
 
         // Convert the bytes into shorts.
         shorts2 = (vec_s16)vec_mergeh(zero, bytes);
@@ -128,7 +224,9 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
     }
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_VSX */
+
+#endif /* HAVE_ALTIVEC */
 
 #if HAVE_VSX
 static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
@@ -168,7 +266,7 @@ av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -177,7 +275,7 @@ av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
     if (!high_bit_depth) {
         c->get_pixels = get_pixels_altivec;
     }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 #if HAVE_VSX
     if (!PPC_VSX(av_get_cpu_flags()))
diff --git a/libavcodec/ppc/svq1enc_altivec.c b/libavcodec/ppc/svq1enc_altivec.c
index e155f88..aa66b40 100644
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,9 +29,9 @@
 
 #include "libavcodec/svq1enc.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
-                                     int size)
+                                     intptr_t size)
 {
     int i, size16 = size >> 4;
     vector signed char vpix1;
@@ -69,14 +69,14 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
 
     return u.score[3];
 }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index fc82502..bbadb2a 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 
 #include "libavcodec/vc1dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
@@ -306,16 +306,23 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, ptrdiff_t stride,
     src2 = vec_pack(s2, sA);
     src3 = vec_pack(s3, sB);
 
+#if HAVE_BIGENDIAN
     p0 = vec_lvsl (0, dest);
     p1 = vec_lvsl (stride, dest);
     p = vec_splat_u8 (-1);
     perm0 = vec_mergeh (p, p0);
     perm1 = vec_mergeh (p, p1);
+#define GET_TMP2(dst, p)        \
+    tmp = vec_ld (0, dest);     \
+    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), p);
+#else
+#define GET_TMP2(dst,p)         \
+    tmp = vec_vsx_ld (0, dst);  \
+    tmp2 = (vector signed short)vec_mergeh (tmp, vec_splat_u8(0));
+#endif
 
 #define ADD(dest,src,perm)                                              \
-    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
-    tmp = vec_ld (0, dest);                                             \
-    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm);  \
+    GET_TMP2(dest, perm);                                               \
     tmp3 = vec_adds (tmp2, src);                                        \
     tmp = vec_packsu (tmp3, tmp3);                                      \
     vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest);        \
@@ -342,11 +349,11 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, ptrdiff_t stride,
 #undef OP_U8_ALTIVEC
 #undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_vc1dsp_init_ppc(VC1DSPContext *dsp)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -354,5 +361,5 @@ av_cold void ff_vc1dsp_init_ppc(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/videodsp.c b/libavcodec/ppc/videodsp.c
index b9e003b..9157022 100644
--- a/libavcodec/ppc/videodsp.c
+++ b/libavcodec/ppc/videodsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2004 Romain Dolbeau
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/vorbisdsp_altivec.c b/libavcodec/ppc/vorbisdsp_altivec.c
index 52c2952..4dabf2d 100644
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 #include "libavcodec/vorbisdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                             intptr_t blocksize)
 {
@@ -50,14 +50,14 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
         vec_stl(m, 0, mag+i);
     }
 }
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index 2b7cc9d..a9a48d1 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,12 +29,17 @@
 
 #include "libavcodec/vp3dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static const vec_s16 constants =
     {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
+#if HAVE_BIGENDIAN
 static const vec_u8 interleave_high =
     {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+#else
+static const vec_u8 interleave_high =
+    {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+#endif
 
 #define IDCT_START \
     vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
@@ -157,9 +162,18 @@ static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[6
     TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
     IDCT_1D(ADD8, SHIFT4)
 
-#define ADD(a)\
+#if HAVE_BIGENDIAN
+#define GET_VDST16\
     vdst = vec_ld(0, dst);\
-    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
+    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);
+#else
+#define GET_VDST16\
+    vdst = vec_vsx_ld(0,dst);\
+    vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v);
+#endif
+
+#define ADD(a)\
+    GET_VDST16;\
     vdst_16 = vec_adds(a, vdst_16);\
     t = vec_packsu(vdst_16, vdst_16);\
     vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
@@ -176,15 +190,15 @@ static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[6
     memset(block, 0, sizeof(*block) * 64);
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 
 av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
     c->idct_put = vp3_idct_put_altivec;
     c->idct_add = vp3_idct_add_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif
 }
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c
index 6857e6b..31201ed 100644
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2010 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 #define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
 
 // h subpel filter uses msum to multiply+add 4 pixel taps at once
@@ -61,17 +61,30 @@ static const vec_s8 h_subpel_filters_outer[3] =
     vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
     vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
 
+#if HAVE_BIGENDIAN
+#define GET_PIXHL(offset)                   \
+    a = vec_ld((offset)-is6tap-1, src);     \
+    b = vec_ld((offset)-is6tap-1+15, src);  \
+    pixh  = vec_perm(a, b, permh##offset);  \
+    pixl  = vec_perm(a, b, perml##offset)
+
+#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
+#else
+#define GET_PIXHL(offset)                   \
+    a = vec_vsx_ld((offset)-is6tap-1, src); \
+    pixh  = vec_perm(a, a, perm_inner);     \
+    pixl  = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))
+
+#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
+#endif
+
 #define FILTER_H(dstv, off) \
-    a = vec_ld((off)-is6tap-1,    src); \
-    b = vec_ld((off)-is6tap-1+15, src); \
-\
-    pixh  = vec_perm(a, b, permh##off); \
-    pixl  = vec_perm(a, b, perml##off); \
+    GET_PIXHL(off);                            \
     filth = vec_msum(filter_inner, pixh, c64); \
     filtl = vec_msum(filter_inner, pixl, c64); \
 \
     if (is6tap) { \
-        outer = vec_perm(a, b, perm_6tap##off); \
+        GET_OUTER(off);                                \
         filth = vec_msum(filter_outerh, outer, filth); \
         filtl = vec_msum(filter_outerl, outer, filtl); \
     } \
@@ -86,9 +99,12 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                  int h, int mx, int w, int is6tap)
 {
     LOAD_H_SUBPEL_FILTER(mx-1);
-    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
+#if HAVE_BIGENDIAN
+    vec_u8 align_vec0, align_vec8, permh0, permh8;
     vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
-    vec_u8 a, b, pixh, pixl, outer;
+    vec_u8 b;
+#endif
+    vec_u8 filt, a, pixh, pixl, outer;
     vec_s16 f16h, f16l;
     vec_s32 filth, filtl;
 
@@ -99,6 +115,7 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
     vec_u16 c7  = vec_splat_u16(7);
 
+#if HAVE_BIGENDIAN
     align_vec0 = vec_lvsl( -is6tap-1, src);
     align_vec8 = vec_lvsl(8-is6tap-1, src);
 
@@ -109,6 +126,7 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
     perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
     perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
+#endif
 
     while (h --> 0) {
         FILTER_H(f16h, 0);
@@ -166,6 +184,12 @@ static const vec_u8 v_subpel_filters[7] =
     dstv = vec_adds(dstv, c64); \
     dstv = vec_sra(dstv, c7)
 
+#if HAVE_BIGENDIAN
+#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
+#else
+#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
+#endif
+
 static av_always_inline
 void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                  uint8_t *src, ptrdiff_t src_stride,
@@ -177,6 +201,7 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
     vec_u16 c7  = vec_splat_u16(7);
 
+#if HAVE_BIGENDIAN
     // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
     // so combine this permute with the alignment permute vector
     align_vech = vec_lvsl(0, src);
@@ -185,22 +210,23 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
         perm_vec = vec_mergeh(align_vech, align_vecl);
     else
         perm_vec = vec_mergeh(align_vech, align_vech);
+#endif
 
     if (is6tap)
-        s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
-    s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
-    s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
-    s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
+        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
+    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
+    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
+    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
     if (is6tap)
-        s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);
+        s4 = LOAD_HL( 2*src_stride, src, perm_vec);
 
     src += (2+is6tap)*src_stride;
 
     while (h --> 0) {
         if (is6tap)
-            s5 = load_with_perm_vec(0, src, perm_vec);
+            s5 = LOAD_HL(0, src, perm_vec);
         else
-            s4 = load_with_perm_vec(0, src, perm_vec);
+            s4 = LOAD_HL(0, src, perm_vec);
 
         FILTER_V(f16h, vec_mule);
 
@@ -274,49 +300,36 @@ EPEL_HV(4,  4,4)
 
 static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
 {
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    register vector unsigned char perm = vec_lvsl(0, src);
+    register vector unsigned char perm;
     int i;
     register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
     register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
     register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
 
+#if HAVE_BIGENDIAN
+    perm = vec_lvsl(0, src);
+#endif
 // hand-unrolling the loop by 4 gains about 15%
 // mininum execution time goes from 74 to 60 cycles
 // it's faster than -funroll-loops, but using
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, src);
-        pixelsv2  = vec_ld(15, src);
-        pixelsv1B = vec_ld(sstride, src);
-        pixelsv2B = vec_ld(15 + sstride, src);
-        pixelsv1C = vec_ld(sstride2, src);
-        pixelsv2C = vec_ld(15 + sstride2, src);
-        pixelsv1D = vec_ld(sstride3, src);
-        pixelsv2D = vec_ld(15 + sstride3, src);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               dstride, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               dstride2, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               dstride3, (unsigned char*)dst);
+        vec_st(load_with_perm_vec(0, src, perm), 0, dst);
+        vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
+        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
+        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
         src += sstride4;
         dst += dstride4;
     }
 }
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
+
 
 av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
@@ -344,5 +357,5 @@ av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
     c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
     c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
     c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
index d5c5df2..eaf0d68 100644
--- a/libavcodec/profiles.c
+++ b/libavcodec/profiles.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,21 +30,31 @@ const AVProfile ff_aac_profiles[] = {
     { FF_PROFILE_AAC_LD,    "LD"       },
     { FF_PROFILE_AAC_ELD,   "ELD"      },
     { FF_PROFILE_AAC_MAIN,  "Main" },
-    { FF_PROFILE_AAC_LOW,   "LC"   },
     { FF_PROFILE_AAC_SSR,   "SSR"  },
     { FF_PROFILE_AAC_LTP,   "LTP"  },
     { FF_PROFILE_UNKNOWN },
 };
 
 const AVProfile ff_dca_profiles[] = {
-    { FF_PROFILE_DTS,        "DTS"        },
-    { FF_PROFILE_DTS_ES,     "DTS-ES"     },
-    { FF_PROFILE_DTS_96_24,  "DTS 96/24"  },
-    { FF_PROFILE_DTS_HD_HRA, "DTS-HD HRA" },
-    { FF_PROFILE_DTS_HD_MA,  "DTS-HD MA"  },
+    { FF_PROFILE_DTS,         "DTS"         },
+    { FF_PROFILE_DTS_ES,      "DTS-ES"      },
+    { FF_PROFILE_DTS_96_24,   "DTS 96/24"   },
+    { FF_PROFILE_DTS_HD_HRA,  "DTS-HD HRA"  },
+    { FF_PROFILE_DTS_HD_MA,   "DTS-HD MA"   },
+    { FF_PROFILE_DTS_EXPRESS, "DTS Express" },
     { FF_PROFILE_UNKNOWN },
 };
 
+const AVProfile ff_dnxhd_profiles[] = {
+  { FF_PROFILE_DNXHD,      "DNXHD"},
+  { FF_PROFILE_DNXHR_LB,   "DNXHR LB"},
+  { FF_PROFILE_DNXHR_SQ,   "DNXHR SQ"},
+  { FF_PROFILE_DNXHR_HQ,   "DNXHR HQ" },
+  { FF_PROFILE_DNXHR_HQX,  "DNXHR HQX"},
+  { FF_PROFILE_DNXHR_444,  "DNXHR 444"},
+  { FF_PROFILE_UNKNOWN },
+};
+
 const AVProfile ff_h264_profiles[] = {
     { FF_PROFILE_H264_BASELINE,             "Baseline"              },
     { FF_PROFILE_H264_CONSTRAINED_BASELINE, "Constrained Baseline"  },
@@ -68,6 +78,7 @@ const AVProfile ff_hevc_profiles[] = {
     { FF_PROFILE_HEVC_MAIN,                 "Main"                },
     { FF_PROFILE_HEVC_MAIN_10,              "Main 10"             },
     { FF_PROFILE_HEVC_MAIN_STILL_PICTURE,   "Main Still Picture"  },
+    { FF_PROFILE_HEVC_REXT,                 "Rext"                },
     { FF_PROFILE_UNKNOWN },
 };
 
@@ -120,4 +131,49 @@ const AVProfile ff_vc1_profiles[] = {
     { FF_PROFILE_UNKNOWN },
 };
 
+const AVProfile ff_vp9_profiles[] = {
+    { FF_PROFILE_VP9_0, "Profile 0" },
+    { FF_PROFILE_VP9_1, "Profile 1" },
+    { FF_PROFILE_VP9_2, "Profile 2" },
+    { FF_PROFILE_VP9_3, "Profile 3" },
+    { FF_PROFILE_UNKNOWN },
+};
+
+const AVProfile ff_av1_profiles[] = {
+    { FF_PROFILE_AV1_MAIN,         "Main" },
+    { FF_PROFILE_AV1_HIGH,         "High" },
+    { FF_PROFILE_AV1_PROFESSIONAL, "Professional" },
+    { FF_PROFILE_UNKNOWN },
+};
+
+const AVProfile ff_sbc_profiles[] = {
+    { FF_PROFILE_SBC_MSBC, "mSBC" },
+    { FF_PROFILE_UNKNOWN },
+};
+
+const AVProfile ff_prores_profiles[] = {
+    { FF_PROFILE_PRORES_PROXY,    "Proxy"    },
+    { FF_PROFILE_PRORES_LT,       "LT"       },
+    { FF_PROFILE_PRORES_STANDARD, "Standard" },
+    { FF_PROFILE_PRORES_HQ,       "HQ"       },
+    { FF_PROFILE_PRORES_4444,     "4444"     },
+    { FF_PROFILE_PRORES_XQ,       "XQ"       },
+    { FF_PROFILE_UNKNOWN }
+};
+
+const AVProfile ff_mjpeg_profiles[] = {
+    { FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT,            "Baseline"    },
+    { FF_PROFILE_MJPEG_HUFFMAN_EXTENDED_SEQUENTIAL_DCT, "Sequential"  },
+    { FF_PROFILE_MJPEG_HUFFMAN_PROGRESSIVE_DCT,         "Progressive" },
+    { FF_PROFILE_MJPEG_HUFFMAN_LOSSLESS,                "Lossless"    },
+    { FF_PROFILE_MJPEG_JPEG_LS,                         "JPEG LS"     },
+    { FF_PROFILE_UNKNOWN }
+};
+
+const AVProfile ff_arib_caption_profiles[] = {
+    { FF_PROFILE_ARIB_PROFILE_A, "Profile A" },
+    { FF_PROFILE_ARIB_PROFILE_C, "Profile C" },
+    { FF_PROFILE_UNKNOWN }
+};
+
 #endif /* !CONFIG_SMALL */
diff --git a/libavcodec/profiles.h b/libavcodec/profiles.h
index 0276b17..a53b67e 100644
--- a/libavcodec/profiles.h
+++ b/libavcodec/profiles.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,11 +23,18 @@
 
 extern const AVProfile ff_aac_profiles[];
 extern const AVProfile ff_dca_profiles[];
+extern const AVProfile ff_dnxhd_profiles[];
 extern const AVProfile ff_h264_profiles[];
 extern const AVProfile ff_hevc_profiles[];
 extern const AVProfile ff_jpeg2000_profiles[];
 extern const AVProfile ff_mpeg2_video_profiles[];
 extern const AVProfile ff_mpeg4_video_profiles[];
 extern const AVProfile ff_vc1_profiles[];
+extern const AVProfile ff_vp9_profiles[];
+extern const AVProfile ff_av1_profiles[];
+extern const AVProfile ff_sbc_profiles[];
+extern const AVProfile ff_prores_profiles[];
+extern const AVProfile ff_mjpeg_profiles[];
+extern const AVProfile ff_arib_caption_profiles[];
 
 #endif /* AVCODEC_PROFILES_H */
diff --git a/libavcodec/prores_metadata_bsf.c b/libavcodec/prores_metadata_bsf.c
new file mode 100644
index 0000000..0510d35
--- /dev/null
+++ b/libavcodec/prores_metadata_bsf.c
@@ -0,0 +1,172 @@
+/*
+ * Prores Metadata bitstream filter
+ * Copyright (c) 2018 Jokyo Images
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Prores Metadata bitstream filter
+ * set frame colorspace property
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+#include "bsf.h"
+
+typedef struct ProresMetadataContext {
+    const AVClass *class;
+
+    int color_primaries;
+    int transfer_characteristics;
+    int matrix_coefficients;
+} ProresMetadataContext;
+
+static int prores_metadata(AVBSFContext *bsf, AVPacket *pkt)
+{
+    ProresMetadataContext *ctx = bsf->priv_data;
+    int ret = 0;
+    int buf_size;
+    uint8_t *buf;
+
+    ret = ff_bsf_get_packet_ref(bsf, pkt);
+    if (ret < 0)
+        return ret;
+
+    ret = av_packet_make_writable(pkt);
+    if (ret < 0)
+        goto fail;
+
+    buf = pkt->data;
+    buf_size = pkt->size;
+
+    /* check start of the prores frame */
+    if (buf_size < 28) {
+        av_log(bsf, AV_LOG_ERROR, "not enough data in prores frame\n");
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (AV_RL32(buf + 4) != AV_RL32("icpf")) {
+        av_log(bsf, AV_LOG_ERROR, "invalid frame header\n");
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (AV_RB16(buf + 8) < 28) {
+        av_log(bsf, AV_LOG_ERROR, "invalid frame header size\n");
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    /* set the new values */
+    if (ctx->color_primaries != -1)
+        buf[8+14] = ctx->color_primaries;
+    if (ctx->transfer_characteristics != -1)
+        buf[8+15] = ctx->transfer_characteristics;
+    if (ctx->matrix_coefficients != -1)
+        buf[8+16] = ctx->matrix_coefficients;
+
+fail:
+    if (ret < 0)
+        av_packet_unref(pkt);
+    return ret;
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_PRORES, AV_CODEC_ID_NONE,
+};
+
+static int prores_metadata_init(AVBSFContext *bsf)
+{
+    ProresMetadataContext *ctx = bsf->priv_data;
+    /*! check options */
+    switch (ctx->color_primaries) {
+    case -1:
+    case 0:
+    case AVCOL_PRI_BT709:
+    case AVCOL_PRI_BT470BG:
+    case AVCOL_PRI_SMPTE170M:
+    case AVCOL_PRI_BT2020:
+    case AVCOL_PRI_SMPTE431:
+    case AVCOL_PRI_SMPTE432:
+        break;
+    default:
+        av_log(bsf, AV_LOG_ERROR, "Color primaries %d is not a valid value\n", ctx->color_primaries);
+        return AVERROR(EINVAL);
+    }
+
+    switch (ctx->matrix_coefficients) {
+    case -1:
+    case 0:
+    case AVCOL_SPC_BT709:
+    case AVCOL_SPC_SMPTE170M:
+    case AVCOL_SPC_BT2020_NCL:
+        break;
+    default:
+        av_log(bsf, AV_LOG_ERROR, "Colorspace %d is not a valid value\n", ctx->matrix_coefficients);
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(ProresMetadataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_BSF_PARAM)
+static const AVOption options[] = {
+    {"color_primaries", "select color primaries", OFFSET(color_primaries), AV_OPT_TYPE_INT, {.i64=-1}, -1, AVCOL_PRI_SMPTE432, FLAGS, "color_primaries"},
+    {"auto", "keep the same color primaries",  0, AV_OPT_TYPE_CONST, {.i64=-1},                     INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"unknown",                         NULL,  0, AV_OPT_TYPE_CONST, {.i64=0},                      INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"bt709",                           NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_PRI_BT709},        INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"bt470bg",                         NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_PRI_BT470BG},      INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"smpte170m",                       NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_PRI_SMPTE170M},    INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"bt2020",                          NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_PRI_BT2020},       INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"smpte431",                        NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_PRI_SMPTE431},     INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+    {"smpte432",                        NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_PRI_SMPTE432},     INT_MIN, INT_MAX, FLAGS, "color_primaries"},
+
+    {"color_trc", "select color transfer", OFFSET(transfer_characteristics), AV_OPT_TYPE_INT, {.i64=-1}, -1, AVCOL_TRC_BT709, FLAGS, "color_trc"},
+    {"auto", "keep the same color transfer",  0, AV_OPT_TYPE_CONST, {.i64=-1},                               INT_MIN, INT_MAX, FLAGS, "color_trc"},
+    {"unknown",                        NULL,  0, AV_OPT_TYPE_CONST, {.i64=0},                                INT_MIN, INT_MAX, FLAGS, "color_trc"},
+    {"bt709",                          NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_TRC_BT709},                  INT_MIN, INT_MAX, FLAGS, "color_trc"},
+
+    {"colorspace", "select colorspace", OFFSET(matrix_coefficients), AV_OPT_TYPE_INT, {.i64=-1}, -1,  AVCOL_SPC_BT2020_NCL, FLAGS, "colorspace"},
+    {"auto", "keep the same colorspace",  0, AV_OPT_TYPE_CONST, {.i64=-1},                            INT_MIN, INT_MAX, FLAGS, "colorspace"},
+    {"unknown",                    NULL,  0, AV_OPT_TYPE_CONST, {.i64=0},                             INT_MIN, INT_MAX, FLAGS, "colorspace"},
+    {"bt709",                      NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_SPC_BT709},               INT_MIN, INT_MAX, FLAGS, "colorspace"},
+    {"smpte170m",                  NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_SPC_SMPTE170M},           INT_MIN, INT_MAX, FLAGS, "colorspace"},
+    {"bt2020nc",                   NULL,  0, AV_OPT_TYPE_CONST, {.i64=AVCOL_SPC_BT2020_NCL},          INT_MIN, INT_MAX, FLAGS, "colorspace"},
+
+    { NULL },
+};
+
+static const AVClass prores_metadata_class = {
+    .class_name = "prores_metadata_bsf",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+const AVBitStreamFilter ff_prores_metadata_bsf = {
+    .name       = "prores_metadata",
+    .init       = prores_metadata_init,
+    .filter     = prores_metadata,
+    .priv_data_size = sizeof(ProresMetadataContext),
+    .priv_class = &prores_metadata_class,
+    .codec_ids  = codec_ids,
+};
diff --git a/libavcodec/proresdata.c b/libavcodec/proresdata.c
index fcaf32a..9849b5c 100644
--- a/libavcodec/proresdata.c
+++ b/libavcodec/proresdata.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/proresdata.h b/libavcodec/proresdata.h
index 1e5d05e..ee8278d 100644
--- a/libavcodec/proresdata.h
+++ b/libavcodec/proresdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec.c
deleted file mode 100644
index 1659927..0000000
--- a/libavcodec/proresdec.c
+++ /dev/null
@@ -1,783 +0,0 @@
-/*
- * Apple ProRes compatible decoder
- *
- * Copyright (c) 2010-2011 Maxim Poliakovski
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * This is a decoder for Apple ProRes 422 SD/HQ/LT/Proxy and ProRes 4444.
- * It is used for storing and editing high definition video data in Apple's Final Cut Pro.
- *
- * @see http://wiki.multimedia.cx/index.php?title=Apple_ProRes
- */
-
-#define LONG_BITSTREAM_READER // some ProRes vlc codes require up to 28 bits to be read at once
-
-#include <stdint.h>
-
-#include "libavutil/intmath.h"
-#include "avcodec.h"
-#include "idctdsp.h"
-#include "internal.h"
-#include "proresdata.h"
-#include "proresdsp.h"
-#include "get_bits.h"
-
-typedef struct ProresThreadData {
-    const uint8_t *index;            ///< pointers to the data of this slice
-    int slice_num;
-    int x_pos, y_pos;
-    int slice_width;
-    int prev_slice_sf;               ///< scalefactor of the previous decoded slice
-    DECLARE_ALIGNED(16, int16_t, blocks)[8 * 4 * 64];
-    DECLARE_ALIGNED(16, int16_t, qmat_luma_scaled)[64];
-    DECLARE_ALIGNED(16, int16_t, qmat_chroma_scaled)[64];
-} ProresThreadData;
-
-typedef struct ProresContext {
-    ProresDSPContext dsp;
-    AVFrame    *frame;
-    ScanTable  scantable;
-    int        scantable_type;           ///< -1 = uninitialized, 0 = progressive, 1/2 = interlaced
-
-    int        frame_type;               ///< 0 = progressive, 1 = top-field first, 2 = bottom-field first
-    int        pic_format;               ///< 2 = 422, 3 = 444
-    uint8_t    qmat_luma[64];            ///< dequantization matrix for luma
-    uint8_t    qmat_chroma[64];          ///< dequantization matrix for chroma
-    int        qmat_changed;             ///< 1 - global quantization matrices changed
-    int        total_slices;            ///< total number of slices in a picture
-    ProresThreadData *slice_data;
-    int        pic_num;
-    int        chroma_factor;
-    int        mb_chroma_factor;
-    int        num_chroma_blocks;       ///< number of chrominance blocks in a macroblock
-    int        num_x_slices;
-    int        num_y_slices;
-    int        slice_width_factor;
-    int        slice_height_factor;
-    int        num_x_mbs;
-    int        num_y_mbs;
-    int        alpha_info;
-} ProresContext;
-
-
-static av_cold int decode_init(AVCodecContext *avctx)
-{
-    ProresContext *ctx = avctx->priv_data;
-
-    ctx->total_slices     = 0;
-    ctx->slice_data       = NULL;
-
-    avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
-    ff_proresdsp_init(&ctx->dsp);
-
-    ctx->scantable_type = -1;   // set scantable type to uninitialized
-    memset(ctx->qmat_luma, 4, 64);
-    memset(ctx->qmat_chroma, 4, 64);
-
-    return 0;
-}
-
-
-static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
-                               const int data_size, AVCodecContext *avctx)
-{
-    int hdr_size, version, width, height, flags;
-    const uint8_t *ptr;
-
-    hdr_size = AV_RB16(buf);
-    if (hdr_size > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "frame data too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    version = AV_RB16(buf + 2);
-    if (version >= 2) {
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported header version: %d\n", version);
-        return AVERROR_INVALIDDATA;
-    }
-
-    width  = AV_RB16(buf + 8);
-    height = AV_RB16(buf + 10);
-    if (width != avctx->width || height != avctx->height) {
-        av_log(avctx, AV_LOG_ERROR,
-               "picture dimension changed: old: %d x %d, new: %d x %d\n",
-               avctx->width, avctx->height, width, height);
-        return AVERROR_INVALIDDATA;
-    }
-
-    ctx->frame_type = (buf[12] >> 2) & 3;
-    if (ctx->frame_type > 2) {
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported frame type: %d\n", ctx->frame_type);
-        return AVERROR_INVALIDDATA;
-    }
-
-    ctx->chroma_factor     = (buf[12] >> 6) & 3;
-    ctx->mb_chroma_factor  = ctx->chroma_factor + 2;
-    ctx->num_chroma_blocks = (1 << ctx->chroma_factor) >> 1;
-    ctx->alpha_info        = buf[17] & 0xf;
-
-    if (ctx->alpha_info > 2) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid alpha mode %d\n", ctx->alpha_info);
-        return AVERROR_INVALIDDATA;
-    }
-
-    switch (ctx->chroma_factor) {
-    case 2:
-        avctx->pix_fmt = ctx->alpha_info ? AV_PIX_FMT_YUVA422P10
-                                         : AV_PIX_FMT_YUV422P10;
-        break;
-    case 3:
-        avctx->pix_fmt = ctx->alpha_info ? AV_PIX_FMT_YUVA444P10
-                                         : AV_PIX_FMT_YUV444P10;
-        break;
-    default:
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported picture format: %d\n", ctx->pic_format);
-        return AVERROR_INVALIDDATA;
-    }
-
-    if (ctx->scantable_type != ctx->frame_type) {
-        if (!ctx->frame_type)
-            ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable,
-                              ff_prores_progressive_scan);
-        else
-            ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable,
-                              ff_prores_interlaced_scan);
-        ctx->scantable_type = ctx->frame_type;
-    }
-
-    if (ctx->frame_type) {      /* if interlaced */
-        ctx->frame->interlaced_frame = 1;
-        ctx->frame->top_field_first  = ctx->frame_type & 1;
-    } else {
-        ctx->frame->interlaced_frame = 0;
-    }
-
-    avctx->color_primaries = buf[14];
-    avctx->color_trc       = buf[15];
-    avctx->colorspace      = buf[16];
-    avctx->color_range     = AVCOL_RANGE_MPEG;
-
-    ctx->qmat_changed = 0;
-    ptr   = buf + 20;
-    flags = buf[19];
-    if (flags & 2) {
-        if (ptr - buf > hdr_size - 64) {
-            av_log(avctx, AV_LOG_ERROR, "header data too small\n");
-            return AVERROR_INVALIDDATA;
-        }
-        if (memcmp(ctx->qmat_luma, ptr, 64)) {
-            memcpy(ctx->qmat_luma, ptr, 64);
-            ctx->qmat_changed = 1;
-        }
-        ptr += 64;
-    } else {
-        memset(ctx->qmat_luma, 4, 64);
-        ctx->qmat_changed = 1;
-    }
-
-    if (flags & 1) {
-        if (ptr - buf > hdr_size - 64) {
-            av_log(avctx, AV_LOG_ERROR, "header data too small\n");
-            return -1;
-        }
-        if (memcmp(ctx->qmat_chroma, ptr, 64)) {
-            memcpy(ctx->qmat_chroma, ptr, 64);
-            ctx->qmat_changed = 1;
-        }
-    } else {
-        memset(ctx->qmat_chroma, 4, 64);
-        ctx->qmat_changed = 1;
-    }
-
-    return hdr_size;
-}
-
-
-static int decode_picture_header(ProresContext *ctx, const uint8_t *buf,
-                                 const int data_size, AVCodecContext *avctx)
-{
-    int   i, hdr_size, pic_data_size, num_slices;
-    int   slice_width_factor, slice_height_factor;
-    int   remainder, num_x_slices;
-    const uint8_t *data_ptr, *index_ptr;
-
-    hdr_size = data_size > 0 ? buf[0] >> 3 : 0;
-    if (hdr_size < 8 || hdr_size > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "picture header too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    pic_data_size = AV_RB32(buf + 1);
-    if (pic_data_size > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "picture data too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    slice_width_factor  = buf[7] >> 4;
-    slice_height_factor = buf[7] & 0xF;
-    if (slice_width_factor > 3 || slice_height_factor) {
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported slice dimension: %d x %d\n",
-               1 << slice_width_factor, 1 << slice_height_factor);
-        return AVERROR_INVALIDDATA;
-    }
-
-    ctx->slice_width_factor  = slice_width_factor;
-    ctx->slice_height_factor = slice_height_factor;
-
-    ctx->num_x_mbs = (avctx->width + 15) >> 4;
-    ctx->num_y_mbs = (avctx->height +
-                      (1 << (4 + ctx->frame->interlaced_frame)) - 1) >>
-                     (4 + ctx->frame->interlaced_frame);
-
-    remainder    = ctx->num_x_mbs & ((1 << slice_width_factor) - 1);
-    num_x_slices = (ctx->num_x_mbs >> slice_width_factor) + (remainder & 1) +
-                   ((remainder >> 1) & 1) + ((remainder >> 2) & 1);
-
-    num_slices = num_x_slices * ctx->num_y_mbs;
-    if (num_slices != AV_RB16(buf + 5)) {
-        av_log(avctx, AV_LOG_ERROR, "invalid number of slices\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if (ctx->total_slices != num_slices) {
-        av_freep(&ctx->slice_data);
-        ctx->slice_data = av_malloc((num_slices + 1) * sizeof(ctx->slice_data[0]));
-        if (!ctx->slice_data)
-            return AVERROR(ENOMEM);
-        ctx->total_slices = num_slices;
-    }
-
-    if (hdr_size + num_slices * 2 > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "slice table too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    /* parse slice table allowing quick access to the slice data */
-    index_ptr = buf + hdr_size;
-    data_ptr = index_ptr + num_slices * 2;
-
-    for (i = 0; i < num_slices; i++) {
-        ctx->slice_data[i].index = data_ptr;
-        ctx->slice_data[i].prev_slice_sf = 0;
-        data_ptr += AV_RB16(index_ptr + i * 2);
-    }
-    ctx->slice_data[i].index = data_ptr;
-    ctx->slice_data[i].prev_slice_sf = 0;
-
-    if (data_ptr > buf + data_size) {
-        av_log(avctx, AV_LOG_ERROR, "out of slice data\n");
-        return -1;
-    }
-
-    return pic_data_size;
-}
-
-
-/**
- * Read an unsigned rice/exp golomb codeword.
- */
-static inline int decode_vlc_codeword(GetBitContext *gb, unsigned codebook)
-{
-    unsigned int rice_order, exp_order, switch_bits;
-    unsigned int buf, code;
-    int log, prefix_len, len;
-
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    /* number of prefix bits to switch between Rice and expGolomb */
-    switch_bits = (codebook & 3) + 1;
-    rice_order  = codebook >> 5;        /* rice code order */
-    exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
-
-    log = 31 - av_log2(buf); /* count prefix bits (zeroes) */
-
-    if (log < switch_bits) { /* ok, we got a rice code */
-        if (!rice_order) {
-            /* shortcut for faster decoding of rice codes without remainder */
-            code = log;
-            LAST_SKIP_BITS(re, gb, log + 1);
-        } else {
-            prefix_len = log + 1;
-            code = (log << rice_order) + NEG_USR32(buf << prefix_len, rice_order);
-            LAST_SKIP_BITS(re, gb, prefix_len + rice_order);
-        }
-    } else { /* otherwise we got a exp golomb code */
-        len  = (log << 1) - switch_bits + exp_order + 1;
-        code = NEG_USR32(buf, len) - (1 << exp_order) + (switch_bits << rice_order);
-        LAST_SKIP_BITS(re, gb, len);
-    }
-
-    CLOSE_READER(re, gb);
-
-    return code;
-}
-
-#define LSB2SIGN(x) (-((x) & 1))
-#define TOSIGNED(x) (((x) >> 1) ^ LSB2SIGN(x))
-
-/**
- * Decode DC coefficients for all blocks in a slice.
- */
-static inline void decode_dc_coeffs(GetBitContext *gb, int16_t *out,
-                                    int nblocks)
-{
-    int16_t prev_dc;
-    int     i, sign;
-    int16_t delta;
-    unsigned int code;
-
-    code   = decode_vlc_codeword(gb, FIRST_DC_CB);
-    out[0] = prev_dc = TOSIGNED(code);
-
-    out   += 64; /* move to the DC coeff of the next block */
-    delta  = 3;
-
-    for (i = 1; i < nblocks; i++, out += 64) {
-        code = decode_vlc_codeword(gb, ff_prores_dc_codebook[FFMIN(FFABS(delta), 3)]);
-
-        sign     = -(((delta >> 15) & 1) ^ (code & 1));
-        delta    = (((code + 1) >> 1) ^ sign) - sign;
-        prev_dc += delta;
-        out[0]   = prev_dc;
-    }
-}
-
-#define MAX_PADDING 16
-
-/**
- * Decode AC coefficients for all blocks in a slice.
- */
-static inline int decode_ac_coeffs(GetBitContext *gb, int16_t *out,
-                                   int blocks_per_slice,
-                                   int plane_size_factor,
-                                   const uint8_t *scan)
-{
-    int pos, block_mask, run, level, sign, run_cb_index, lev_cb_index;
-    int max_coeffs, bits_left;
-
-    /* set initial prediction values */
-    run   = 4;
-    level = 2;
-
-    max_coeffs = blocks_per_slice << 6;
-    block_mask = blocks_per_slice - 1;
-
-    for (pos = blocks_per_slice - 1; pos < max_coeffs;) {
-        run_cb_index = ff_prores_run_to_cb_index[FFMIN(run, 15)];
-        lev_cb_index = ff_prores_lev_to_cb_index[FFMIN(level, 9)];
-
-        bits_left = get_bits_left(gb);
-        if (bits_left <= 0 || (bits_left <= MAX_PADDING && !show_bits(gb, bits_left)))
-            return 0;
-
-        run = decode_vlc_codeword(gb, ff_prores_ac_codebook[run_cb_index]);
-        if (run < 0)
-            return AVERROR_INVALIDDATA;
-
-        bits_left = get_bits_left(gb);
-        if (bits_left <= 0 || (bits_left <= MAX_PADDING && !show_bits(gb, bits_left)))
-            return AVERROR_INVALIDDATA;
-
-        level = decode_vlc_codeword(gb, ff_prores_ac_codebook[lev_cb_index]) + 1;
-        if (level < 0)
-            return AVERROR_INVALIDDATA;
-
-        pos += run + 1;
-        if (pos >= max_coeffs)
-            break;
-
-        sign = get_sbits(gb, 1);
-        out[((pos & block_mask) << 6) + scan[pos >> plane_size_factor]] =
-            (level ^ sign) - sign;
-    }
-
-    return 0;
-}
-
-
-/**
- * Decode a slice plane (luma or chroma).
- */
-static int decode_slice_plane(ProresContext *ctx, ProresThreadData *td,
-                              const uint8_t *buf,
-                              int data_size, uint16_t *out_ptr,
-                              int linesize, int mbs_per_slice,
-                              int blocks_per_mb, int plane_size_factor,
-                              const int16_t *qmat, int is_chroma)
-{
-    GetBitContext gb;
-    int16_t *block_ptr;
-    int mb_num, blocks_per_slice, ret;
-
-    blocks_per_slice = mbs_per_slice * blocks_per_mb;
-
-    memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks));
-
-    init_get_bits(&gb, buf, data_size << 3);
-
-    decode_dc_coeffs(&gb, td->blocks, blocks_per_slice);
-
-    ret = decode_ac_coeffs(&gb, td->blocks, blocks_per_slice,
-                           plane_size_factor, ctx->scantable.permutated);
-    if (ret < 0)
-        return ret;
-
-    /* inverse quantization, inverse transform and output */
-    block_ptr = td->blocks;
-
-    if (!is_chroma) {
-        for (mb_num = 0; mb_num < mbs_per_slice; mb_num++, out_ptr += blocks_per_mb * 4) {
-            ctx->dsp.idct_put(out_ptr,                    linesize, block_ptr, qmat);
-            block_ptr += 64;
-            if (blocks_per_mb > 2) {
-                ctx->dsp.idct_put(out_ptr + 8,            linesize, block_ptr, qmat);
-                block_ptr += 64;
-            }
-            ctx->dsp.idct_put(out_ptr + linesize * 4,     linesize, block_ptr, qmat);
-            block_ptr += 64;
-            if (blocks_per_mb > 2) {
-                ctx->dsp.idct_put(out_ptr + linesize * 4 + 8, linesize, block_ptr, qmat);
-                block_ptr += 64;
-            }
-        }
-    } else {
-        for (mb_num = 0; mb_num < mbs_per_slice; mb_num++, out_ptr += blocks_per_mb * 4) {
-            ctx->dsp.idct_put(out_ptr,                    linesize, block_ptr, qmat);
-            block_ptr += 64;
-            ctx->dsp.idct_put(out_ptr + linesize * 4,     linesize, block_ptr, qmat);
-            block_ptr += 64;
-            if (blocks_per_mb > 2) {
-                ctx->dsp.idct_put(out_ptr + 8,            linesize, block_ptr, qmat);
-                block_ptr += 64;
-                ctx->dsp.idct_put(out_ptr + linesize * 4 + 8, linesize, block_ptr, qmat);
-                block_ptr += 64;
-            }
-        }
-    }
-    return 0;
-}
-
-
-static void unpack_alpha(GetBitContext *gb, uint16_t *dst, int num_coeffs,
-                         const int num_bits)
-{
-    const int mask = (1 << num_bits) - 1;
-    int i, idx, val, alpha_val;
-
-    idx       = 0;
-    alpha_val = mask;
-    do {
-        do {
-            if (get_bits1(gb))
-                val = get_bits(gb, num_bits);
-            else {
-                int sign;
-                val  = get_bits(gb, num_bits == 16 ? 7 : 4);
-                sign = val & 1;
-                val  = (val + 2) >> 1;
-                if (sign)
-                    val = -val;
-            }
-            alpha_val = (alpha_val + val) & mask;
-            if (num_bits == 16)
-                dst[idx++] = alpha_val >> 6;
-            else
-                dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
-            if (idx >= num_coeffs - 1)
-                break;
-        } while (get_bits1(gb));
-        val = get_bits(gb, 4);
-        if (!val)
-            val = get_bits(gb, 11);
-        if (idx + val > num_coeffs)
-            val = num_coeffs - idx;
-        if (num_bits == 16)
-            for (i = 0; i < val; i++)
-                dst[idx++] = alpha_val >> 6;
-        else
-            for (i = 0; i < val; i++)
-                dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
-    } while (idx < num_coeffs);
-}
-
-/**
- * Decode alpha slice plane.
- */
-static void decode_alpha_plane(ProresContext *ctx, ProresThreadData *td,
-                               const uint8_t *buf, int data_size,
-                               uint16_t *out_ptr, int linesize,
-                               int mbs_per_slice)
-{
-    GetBitContext gb;
-    int i;
-    uint16_t *block_ptr;
-
-    memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks));
-
-    init_get_bits(&gb, buf, data_size << 3);
-
-    if (ctx->alpha_info == 2)
-        unpack_alpha(&gb, td->blocks, mbs_per_slice * 4 * 64, 16);
-    else
-        unpack_alpha(&gb, td->blocks, mbs_per_slice * 4 * 64, 8);
-
-    block_ptr = td->blocks;
-
-    for (i = 0; i < 16; i++) {
-        memcpy(out_ptr, block_ptr, 16 * mbs_per_slice * sizeof(*out_ptr));
-        out_ptr   += linesize >> 1;
-        block_ptr += 16 * mbs_per_slice;
-    }
-}
-
-static int decode_slice(AVCodecContext *avctx, void *tdata)
-{
-    ProresThreadData *td = tdata;
-    ProresContext *ctx = avctx->priv_data;
-    int mb_x_pos  = td->x_pos;
-    int mb_y_pos  = td->y_pos;
-    int pic_num   = ctx->pic_num;
-    int slice_num = td->slice_num;
-    int mbs_per_slice = td->slice_width;
-    const uint8_t *buf;
-    uint8_t *y_data, *u_data, *v_data, *a_data;
-    AVFrame *pic = ctx->frame;
-    int i, sf, slice_width_factor;
-    int slice_data_size, hdr_size;
-    int y_data_size, u_data_size, v_data_size, a_data_size;
-    int y_linesize, u_linesize, v_linesize, a_linesize;
-    int coff[4];
-    int ret;
-
-    buf             = ctx->slice_data[slice_num].index;
-    slice_data_size = ctx->slice_data[slice_num + 1].index - buf;
-
-    slice_width_factor = av_log2(mbs_per_slice);
-
-    y_data     = pic->data[0];
-    u_data     = pic->data[1];
-    v_data     = pic->data[2];
-    a_data     = pic->data[3];
-    y_linesize = pic->linesize[0];
-    u_linesize = pic->linesize[1];
-    v_linesize = pic->linesize[2];
-    a_linesize = pic->linesize[3];
-
-    if (pic->interlaced_frame) {
-        if (!(pic_num ^ pic->top_field_first)) {
-            y_data += y_linesize;
-            u_data += u_linesize;
-            v_data += v_linesize;
-            if (a_data)
-                a_data += a_linesize;
-        }
-        y_linesize <<= 1;
-        u_linesize <<= 1;
-        v_linesize <<= 1;
-        a_linesize <<= 1;
-    }
-    y_data += (mb_y_pos << 4) * y_linesize + (mb_x_pos << 5);
-    u_data += (mb_y_pos << 4) * u_linesize + (mb_x_pos << ctx->mb_chroma_factor);
-    v_data += (mb_y_pos << 4) * v_linesize + (mb_x_pos << ctx->mb_chroma_factor);
-    if (a_data)
-        a_data += (mb_y_pos << 4) * a_linesize + (mb_x_pos << 5);
-
-    if (slice_data_size < 6) {
-        av_log(avctx, AV_LOG_ERROR, "slice data too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    /* parse slice header */
-    hdr_size    = buf[0] >> 3;
-    coff[0]     = hdr_size;
-    y_data_size = AV_RB16(buf + 2);
-    coff[1]     = coff[0] + y_data_size;
-    u_data_size = AV_RB16(buf + 4);
-    coff[2]     = coff[1] + u_data_size;
-    v_data_size = hdr_size > 7 ? AV_RB16(buf + 6) : slice_data_size - coff[2];
-    coff[3]     = coff[2] + v_data_size;
-    a_data_size = slice_data_size - coff[3];
-
-    /* if V or alpha component size is negative that means that previous
-       component sizes are too large */
-    if (v_data_size < 0 || a_data_size < 0 || hdr_size < 6) {
-        av_log(avctx, AV_LOG_ERROR, "invalid data size\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    sf = av_clip(buf[1], 1, 224);
-    sf = sf > 128 ? (sf - 96) << 2 : sf;
-
-    /* scale quantization matrixes according with slice's scale factor */
-    /* TODO: this can be SIMD-optimized a lot */
-    if (ctx->qmat_changed || sf != td->prev_slice_sf) {
-        td->prev_slice_sf = sf;
-        for (i = 0; i < 64; i++) {
-            td->qmat_luma_scaled[ctx->dsp.idct_permutation[i]]   = ctx->qmat_luma[i]   * sf;
-            td->qmat_chroma_scaled[ctx->dsp.idct_permutation[i]] = ctx->qmat_chroma[i] * sf;
-        }
-    }
-
-    /* decode luma plane */
-    ret = decode_slice_plane(ctx, td, buf + coff[0], y_data_size,
-                             (uint16_t*) y_data, y_linesize,
-                             mbs_per_slice, 4, slice_width_factor + 2,
-                             td->qmat_luma_scaled, 0);
-
-    if (ret < 0)
-        return ret;
-
-    /* decode U chroma plane */
-    ret = decode_slice_plane(ctx, td, buf + coff[1], u_data_size,
-                             (uint16_t*) u_data, u_linesize,
-                             mbs_per_slice, ctx->num_chroma_blocks,
-                             slice_width_factor + ctx->chroma_factor - 1,
-                             td->qmat_chroma_scaled, 1);
-    if (ret < 0)
-        return ret;
-
-    /* decode V chroma plane */
-    ret = decode_slice_plane(ctx, td, buf + coff[2], v_data_size,
-                             (uint16_t*) v_data, v_linesize,
-                             mbs_per_slice, ctx->num_chroma_blocks,
-                             slice_width_factor + ctx->chroma_factor - 1,
-                             td->qmat_chroma_scaled, 1);
-    if (ret < 0)
-        return ret;
-
-    /* decode alpha plane if available */
-    if (a_data && a_data_size)
-        decode_alpha_plane(ctx, td, buf + coff[3], a_data_size,
-                           (uint16_t*) a_data, a_linesize,
-                           mbs_per_slice);
-
-    return 0;
-}
-
-
-static int decode_picture(ProresContext *ctx, int pic_num,
-                          AVCodecContext *avctx)
-{
-    int slice_num, slice_width, x_pos, y_pos;
-
-    slice_num = 0;
-
-    ctx->pic_num = pic_num;
-    for (y_pos = 0; y_pos < ctx->num_y_mbs; y_pos++) {
-        slice_width = 1 << ctx->slice_width_factor;
-
-        for (x_pos = 0; x_pos < ctx->num_x_mbs && slice_width;
-             x_pos += slice_width) {
-            while (ctx->num_x_mbs - x_pos < slice_width)
-                slice_width >>= 1;
-
-            ctx->slice_data[slice_num].slice_num   = slice_num;
-            ctx->slice_data[slice_num].x_pos       = x_pos;
-            ctx->slice_data[slice_num].y_pos       = y_pos;
-            ctx->slice_data[slice_num].slice_width = slice_width;
-
-            slice_num++;
-        }
-    }
-
-    return avctx->execute(avctx, decode_slice,
-                          ctx->slice_data, NULL, slice_num,
-                          sizeof(ctx->slice_data[0]));
-}
-
-
-#define MOVE_DATA_PTR(nbytes) buf += (nbytes); buf_size -= (nbytes)
-
-static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                        AVPacket *avpkt)
-{
-    ProresContext *ctx = avctx->priv_data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-    int frame_hdr_size, pic_num, pic_data_size;
-
-    ctx->frame            = data;
-    ctx->frame->pict_type = AV_PICTURE_TYPE_I;
-    ctx->frame->key_frame = 1;
-
-    /* check frame atom container */
-    if (buf_size < 28 || buf_size < AV_RB32(buf) ||
-        AV_RB32(buf + 4) != FRAME_ID) {
-        av_log(avctx, AV_LOG_ERROR, "invalid frame\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    MOVE_DATA_PTR(8);
-
-    frame_hdr_size = decode_frame_header(ctx, buf, buf_size, avctx);
-    if (frame_hdr_size < 0)
-        return AVERROR_INVALIDDATA;
-
-    MOVE_DATA_PTR(frame_hdr_size);
-
-    if (ff_get_buffer(avctx, ctx->frame, 0) < 0)
-        return -1;
-
-    for (pic_num = 0; ctx->frame->interlaced_frame - pic_num + 1; pic_num++) {
-        pic_data_size = decode_picture_header(ctx, buf, buf_size, avctx);
-        if (pic_data_size < 0)
-            return AVERROR_INVALIDDATA;
-
-        if (decode_picture(ctx, pic_num, avctx))
-            return -1;
-
-        MOVE_DATA_PTR(pic_data_size);
-    }
-
-    ctx->frame = NULL;
-    *got_frame = 1;
-
-    return avpkt->size;
-}
-
-
-static av_cold int decode_close(AVCodecContext *avctx)
-{
-    ProresContext *ctx = avctx->priv_data;
-
-    av_freep(&ctx->slice_data);
-
-    return 0;
-}
-
-
-AVCodec ff_prores_decoder = {
-    .name           = "prores",
-    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_PRORES,
-    .priv_data_size = sizeof(ProresContext),
-    .init           = decode_init,
-    .close          = decode_close,
-    .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
-};
diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
new file mode 100644
index 0000000..06e41dd
--- /dev/null
+++ b/libavcodec/proresdec.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PRORESDEC_H
+#define AVCODEC_PRORESDEC_H
+
+#include "get_bits.h"
+#include "blockdsp.h"
+#include "proresdsp.h"
+
+typedef struct {
+    const uint8_t *data;
+    unsigned mb_x;
+    unsigned mb_y;
+    unsigned mb_count;
+    unsigned data_size;
+    int ret;
+} SliceContext;
+
+typedef struct {
+    BlockDSPContext bdsp;
+    ProresDSPContext prodsp;
+    AVFrame *frame;
+    int frame_type;              ///< 0 = progressive, 1 = tff, 2 = bff
+    uint8_t qmat_luma[64];
+    uint8_t qmat_chroma[64];
+    SliceContext *slices;
+    int slice_count;             ///< number of slices in the current picture
+    unsigned mb_width;           ///< width of the current picture in mb
+    unsigned mb_height;          ///< height of the current picture in mb
+    uint8_t progressive_scan[64];
+    uint8_t interlaced_scan[64];
+    const uint8_t *scan;
+    int first_field;
+    int alpha_info;
+    void (*unpack_alpha)(GetBitContext *gb, uint16_t *dst, int num_coeffs, const int num_bits);
+} ProresContext;
+
+#endif /* AVCODEC_PRORESDEC_H */
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
new file mode 100644
index 0000000..6209c22
--- /dev/null
+++ b/libavcodec/proresdec2.c
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'acpo' (Proxy), 'ap4h' (4444)
+ */
+
+//#define DEBUG
+
+#define LONG_BITSTREAM_READER
+
+#include "libavutil/internal.h"
+#include "avcodec.h"
+#include "get_bits.h"
+#include "idctdsp.h"
+#include "internal.h"
+#include "profiles.h"
+#include "simple_idct.h"
+#include "proresdec.h"
+#include "proresdata.h"
+#include "thread.h"
+
+static void permute(uint8_t *dst, const uint8_t *src, const uint8_t permutation[64])
+{
+    int i;
+    for (i = 0; i < 64; i++)
+        dst[i] = permutation[src[i]];
+}
+
+#define ALPHA_SHIFT_16_TO_10(alpha_val) (alpha_val >> 6)
+#define ALPHA_SHIFT_8_TO_10(alpha_val)  ((alpha_val << 2) | (alpha_val >> 6))
+#define ALPHA_SHIFT_16_TO_12(alpha_val) (alpha_val >> 4)
+#define ALPHA_SHIFT_8_TO_12(alpha_val)  ((alpha_val << 4) | (alpha_val >> 4))
+
+static void inline unpack_alpha(GetBitContext *gb, uint16_t *dst, int num_coeffs,
+                                const int num_bits, const int decode_precision) {
+    const int mask = (1 << num_bits) - 1;
+    int i, idx, val, alpha_val;
+
+    idx       = 0;
+    alpha_val = mask;
+    do {
+        do {
+            if (get_bits1(gb)) {
+                val = get_bits(gb, num_bits);
+            } else {
+                int sign;
+                val  = get_bits(gb, num_bits == 16 ? 7 : 4);
+                sign = val & 1;
+                val  = (val + 2) >> 1;
+                if (sign)
+                    val = -val;
+            }
+            alpha_val = (alpha_val + val) & mask;
+            if (num_bits == 16) {
+                if (decode_precision == 10) {
+                    dst[idx++] = ALPHA_SHIFT_16_TO_10(alpha_val);
+                } else { /* 12b */
+                    dst[idx++] = ALPHA_SHIFT_16_TO_12(alpha_val);
+                }
+            } else {
+                if (decode_precision == 10) {
+                    dst[idx++] = ALPHA_SHIFT_8_TO_10(alpha_val);
+                } else { /* 12b */
+                    dst[idx++] = ALPHA_SHIFT_8_TO_12(alpha_val);
+                }
+            }
+            if (idx >= num_coeffs)
+                break;
+        } while (get_bits_left(gb)>0 && get_bits1(gb));
+        val = get_bits(gb, 4);
+        if (!val)
+            val = get_bits(gb, 11);
+        if (idx + val > num_coeffs)
+            val = num_coeffs - idx;
+        if (num_bits == 16) {
+            for (i = 0; i < val; i++) {
+                if (decode_precision == 10) {
+                    dst[idx++] = ALPHA_SHIFT_16_TO_10(alpha_val);
+                } else { /* 12b */
+                    dst[idx++] = ALPHA_SHIFT_16_TO_12(alpha_val);
+                }
+            }
+        } else {
+            for (i = 0; i < val; i++) {
+                if (decode_precision == 10) {
+                    dst[idx++] = ALPHA_SHIFT_8_TO_10(alpha_val);
+                } else { /* 12b */
+                    dst[idx++] = ALPHA_SHIFT_8_TO_12(alpha_val);
+                }
+            }
+        }
+    } while (idx < num_coeffs);
+}
+
+static void unpack_alpha_10(GetBitContext *gb, uint16_t *dst, int num_coeffs,
+                            const int num_bits)
+{
+    if (num_bits == 16) {
+        unpack_alpha(gb, dst, num_coeffs, 16, 10);
+    } else { /* 8 bits alpha */
+        unpack_alpha(gb, dst, num_coeffs, 8, 10);
+    }
+}
+
+static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
+                            const int num_bits)
+{
+    if (num_bits == 16) {
+        unpack_alpha(gb, dst, num_coeffs, 16, 12);
+    } else { /* 8 bits alpha */
+        unpack_alpha(gb, dst, num_coeffs, 8, 12);
+    }
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    int ret = 0;
+    ProresContext *ctx = avctx->priv_data;
+    uint8_t idct_permutation[64];
+
+    avctx->bits_per_raw_sample = 10;
+
+    switch (avctx->codec_tag) {
+    case MKTAG('a','p','c','o'):
+        avctx->profile = FF_PROFILE_PRORES_PROXY;
+        break;
+    case MKTAG('a','p','c','s'):
+        avctx->profile = FF_PROFILE_PRORES_LT;
+        break;
+    case MKTAG('a','p','c','n'):
+        avctx->profile = FF_PROFILE_PRORES_STANDARD;
+        break;
+    case MKTAG('a','p','c','h'):
+        avctx->profile = FF_PROFILE_PRORES_HQ;
+        break;
+    case MKTAG('a','p','4','h'):
+        avctx->profile = FF_PROFILE_PRORES_4444;
+        avctx->bits_per_raw_sample = 12;
+        break;
+    case MKTAG('a','p','4','x'):
+        avctx->profile = FF_PROFILE_PRORES_XQ;
+        avctx->bits_per_raw_sample = 12;
+        break;
+    default:
+        avctx->profile = FF_PROFILE_UNKNOWN;
+        av_log(avctx, AV_LOG_WARNING, "Unknown prores profile %d\n", avctx->codec_tag);
+    }
+
+    if (avctx->bits_per_raw_sample == 10) {
+        av_log(avctx, AV_LOG_DEBUG, "Auto bitdepth precision. Use 10b decoding based on codec tag.\n");
+    } else { /* 12b */
+        av_log(avctx, AV_LOG_DEBUG, "Auto bitdepth precision. Use 12b decoding based on codec tag.\n");
+    }
+
+    ff_blockdsp_init(&ctx->bdsp, avctx);
+    ret = ff_proresdsp_init(&ctx->prodsp, avctx);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Fail to init proresdsp for bits per raw sample %d\n", avctx->bits_per_raw_sample);
+        return ret;
+    }
+
+    ff_init_scantable_permutation(idct_permutation,
+                                  ctx->prodsp.idct_permutation_type);
+
+    permute(ctx->progressive_scan, ff_prores_progressive_scan, idct_permutation);
+    permute(ctx->interlaced_scan, ff_prores_interlaced_scan, idct_permutation);
+
+    if (avctx->bits_per_raw_sample == 10){
+        ctx->unpack_alpha = unpack_alpha_10;
+    } else if (avctx->bits_per_raw_sample == 12){
+        ctx->unpack_alpha = unpack_alpha_12;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Fail to set unpack_alpha for bits per raw sample %d\n", avctx->bits_per_raw_sample);
+        return AVERROR_BUG;
+    }
+    return ret;
+}
+
+static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
+                               const int data_size, AVCodecContext *avctx)
+{
+    int hdr_size, width, height, flags;
+    int version;
+    const uint8_t *ptr;
+
+    hdr_size = AV_RB16(buf);
+    ff_dlog(avctx, "header size %d\n", hdr_size);
+    if (hdr_size > data_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    version = AV_RB16(buf + 2);
+    ff_dlog(avctx, "%.4s version %d\n", buf+4, version);
+    if (version > 1) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported version: %d\n", version);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    width  = AV_RB16(buf + 8);
+    height = AV_RB16(buf + 10);
+
+    if (width != avctx->width || height != avctx->height) {
+        int ret;
+
+        av_log(avctx, AV_LOG_WARNING, "picture resolution change: %dx%d -> %dx%d\n",
+               avctx->width, avctx->height, width, height);
+        if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
+            return ret;
+    }
+
+    ctx->frame_type = (buf[12] >> 2) & 3;
+    ctx->alpha_info = buf[17] & 0xf;
+
+    if (ctx->alpha_info > 2) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid alpha mode %d\n", ctx->alpha_info);
+        return AVERROR_INVALIDDATA;
+    }
+    if (avctx->skip_alpha) ctx->alpha_info = 0;
+
+    ff_dlog(avctx, "frame type %d\n", ctx->frame_type);
+
+    if (ctx->frame_type == 0) {
+        ctx->scan = ctx->progressive_scan; // permuted
+    } else {
+        ctx->scan = ctx->interlaced_scan; // permuted
+        ctx->frame->interlaced_frame = 1;
+        ctx->frame->top_field_first = ctx->frame_type == 1;
+    }
+
+    if (ctx->alpha_info) {
+        if (avctx->bits_per_raw_sample == 10) {
+            avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUVA444P10 : AV_PIX_FMT_YUVA422P10;
+        } else { /* 12b */
+            avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUVA444P12 : AV_PIX_FMT_YUVA422P12;
+        }
+    } else {
+        if (avctx->bits_per_raw_sample == 10) {
+            avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUV444P10 : AV_PIX_FMT_YUV422P10;
+        } else { /* 12b */
+            avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUV444P12 : AV_PIX_FMT_YUV422P12;
+        }
+    }
+
+    avctx->color_primaries = buf[14];
+    avctx->color_trc       = buf[15];
+    avctx->colorspace      = buf[16];
+    avctx->color_range     = AVCOL_RANGE_MPEG;
+
+    ptr   = buf + 20;
+    flags = buf[19];
+    ff_dlog(avctx, "flags %x\n", flags);
+
+    if (flags & 2) {
+        if(buf + data_size - ptr < 64) {
+            av_log(avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+        permute(ctx->qmat_luma, ctx->prodsp.idct_permutation, ptr);
+        ptr += 64;
+    } else {
+        memset(ctx->qmat_luma, 4, 64);
+    }
+
+    if (flags & 1) {
+        if(buf + data_size - ptr < 64) {
+            av_log(avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+        permute(ctx->qmat_chroma, ctx->prodsp.idct_permutation, ptr);
+    } else {
+        memset(ctx->qmat_chroma, 4, 64);
+    }
+
+    return hdr_size;
+}
+
+static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, const int buf_size)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int i, hdr_size, slice_count;
+    unsigned pic_data_size;
+    int log2_slice_mb_width, log2_slice_mb_height;
+    int slice_mb_count, mb_x, mb_y;
+    const uint8_t *data_ptr, *index_ptr;
+
+    hdr_size = buf[0] >> 3;
+    if (hdr_size < 8 || hdr_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    pic_data_size = AV_RB32(buf + 1);
+    if (pic_data_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture data size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    log2_slice_mb_width  = buf[7] >> 4;
+    log2_slice_mb_height = buf[7] & 0xF;
+    if (log2_slice_mb_width > 3 || log2_slice_mb_height) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported slice resolution: %dx%d\n",
+               1 << log2_slice_mb_width, 1 << log2_slice_mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->mb_width  = (avctx->width  + 15) >> 4;
+    if (ctx->frame_type)
+        ctx->mb_height = (avctx->height + 31) >> 5;
+    else
+        ctx->mb_height = (avctx->height + 15) >> 4;
+
+    // QT ignores the written value
+    // slice_count = AV_RB16(buf + 5);
+    slice_count = ctx->mb_height * ((ctx->mb_width >> log2_slice_mb_width) +
+                                    av_popcount(ctx->mb_width & (1 << log2_slice_mb_width) - 1));
+
+    if (ctx->slice_count != slice_count || !ctx->slices) {
+        av_freep(&ctx->slices);
+        ctx->slice_count = 0;
+        ctx->slices = av_mallocz_array(slice_count, sizeof(*ctx->slices));
+        if (!ctx->slices)
+            return AVERROR(ENOMEM);
+        ctx->slice_count = slice_count;
+    }
+
+    if (!slice_count)
+        return AVERROR(EINVAL);
+
+    if (hdr_size + slice_count*2 > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong slice count\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // parse slice information
+    index_ptr = buf + hdr_size;
+    data_ptr  = index_ptr + slice_count*2;
+
+    slice_mb_count = 1 << log2_slice_mb_width;
+    mb_x = 0;
+    mb_y = 0;
+
+    for (i = 0; i < slice_count; i++) {
+        SliceContext *slice = &ctx->slices[i];
+
+        slice->data = data_ptr;
+        data_ptr += AV_RB16(index_ptr + i*2);
+
+        while (ctx->mb_width - mb_x < slice_mb_count)
+            slice_mb_count >>= 1;
+
+        slice->mb_x = mb_x;
+        slice->mb_y = mb_y;
+        slice->mb_count = slice_mb_count;
+        slice->data_size = data_ptr - slice->data;
+
+        if (slice->data_size < 6) {
+            av_log(avctx, AV_LOG_ERROR, "error, wrong slice data size\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        mb_x += slice_mb_count;
+        if (mb_x == ctx->mb_width) {
+            slice_mb_count = 1 << log2_slice_mb_width;
+            mb_x = 0;
+            mb_y++;
+        }
+        if (data_ptr > buf + buf_size) {
+            av_log(avctx, AV_LOG_ERROR, "error, slice out of bounds\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (mb_x || mb_y != ctx->mb_height) {
+        av_log(avctx, AV_LOG_ERROR, "error wrong mb count y %d h %d\n",
+               mb_y, ctx->mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return pic_data_size;
+}
+
+#define DECODE_CODEWORD(val, codebook, SKIP)                            \
+    do {                                                                \
+        unsigned int rice_order, exp_order, switch_bits;                \
+        unsigned int q, buf, bits;                                      \
+                                                                        \
+        UPDATE_CACHE(re, gb);                                           \
+        buf = GET_CACHE(re, gb);                                        \
+                                                                        \
+        /* number of bits to switch between rice and exp golomb */      \
+        switch_bits =  codebook & 3;                                    \
+        rice_order  =  codebook >> 5;                                   \
+        exp_order   = (codebook >> 2) & 7;                              \
+                                                                        \
+        q = 31 - av_log2(buf);                                          \
+                                                                        \
+        if (q > switch_bits) { /* exp golomb */                         \
+            bits = exp_order - switch_bits + (q<<1);                    \
+            if (bits > FFMIN(MIN_CACHE_BITS, 31))                       \
+                return AVERROR_INVALIDDATA;                             \
+            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
+                ((switch_bits + 1) << rice_order);                      \
+            SKIP(re, gb, bits);                                         \
+        } else if (rice_order) {                                        \
+            SKIP_BITS(re, gb, q+1);                                     \
+            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
+            SKIP(re, gb, rice_order);                                   \
+        } else {                                                        \
+            val = q;                                                    \
+            SKIP(re, gb, q+1);                                          \
+        }                                                               \
+    } while (0)
+
+#define TOSIGNED(x) (((x) >> 1) ^ (-((x) & 1)))
+
+#define FIRST_DC_CB 0xB8
+
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
+
+static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
+                                              int blocks_per_slice)
+{
+    int16_t prev_dc;
+    int code, i, sign;
+
+    OPEN_READER(re, gb);
+
+    DECODE_CODEWORD(code, FIRST_DC_CB, LAST_SKIP_BITS);
+    prev_dc = TOSIGNED(code);
+    out[0] = prev_dc;
+
+    out += 64; // dc coeff for the next block
+
+    code = 5;
+    sign = 0;
+    for (i = 1; i < blocks_per_slice; i++, out += 64) {
+        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)], LAST_SKIP_BITS);
+        if(code) sign ^= -(code & 1);
+        else     sign  = 0;
+        prev_dc += (((code + 1) >> 1) ^ sign) - sign;
+        out[0] = prev_dc;
+    }
+    CLOSE_READER(re, gb);
+    return 0;
+}
+
+// adaptive codebook switching lut according to previous run/level values
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29, 0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28, 0x28, 0x28, 0x28, 0x4C };
+
+static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
+                                             int16_t *out, int blocks_per_slice)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int block_mask, sign;
+    unsigned pos, run, level;
+    int max_coeffs, i, bits_left;
+    int log2_block_count = av_log2(blocks_per_slice);
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);                                           \
+    run   = 4;
+    level = 2;
+
+    max_coeffs = 64 << log2_block_count;
+    block_mask = blocks_per_slice - 1;
+
+    for (pos = block_mask;;) {
+        bits_left = gb->size_in_bits - re_index;
+        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left)))
+            break;
+
+        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)], LAST_SKIP_BITS);
+        pos += run + 1;
+        if (pos >= max_coeffs) {
+            av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
+            return AVERROR_INVALIDDATA;
+        }
+
+        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)], SKIP_BITS);
+        level += 1;
+
+        i = pos >> log2_block_count;
+
+        sign = SHOW_SBITS(re, gb, 1);
+        SKIP_BITS(re, gb, 1);
+        out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
+    }
+
+    CLOSE_READER(re, gb);
+    return 0;
+}
+
+static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
+                             uint16_t *dst, int dst_stride,
+                             const uint8_t *buf, unsigned buf_size,
+                             const int16_t *qmat)
+{
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+    GetBitContext gb;
+    int i, blocks_per_slice = slice->mb_count<<2;
+    int ret;
+
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    if ((ret = decode_dc_coeffs(&gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+    if ((ret = decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        ctx->prodsp.idct_put(dst, dst_stride, block+(0<<6), qmat);
+        ctx->prodsp.idct_put(dst             +8, dst_stride, block+(1<<6), qmat);
+        ctx->prodsp.idct_put(dst+4*dst_stride  , dst_stride, block+(2<<6), qmat);
+        ctx->prodsp.idct_put(dst+4*dst_stride+8, dst_stride, block+(3<<6), qmat);
+        block += 4*64;
+        dst += 16;
+    }
+    return 0;
+}
+
+static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
+                               uint16_t *dst, int dst_stride,
+                               const uint8_t *buf, unsigned buf_size,
+                               const int16_t *qmat, int log2_blocks_per_mb)
+{
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+    GetBitContext gb;
+    int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
+    int ret;
+
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    if ((ret = decode_dc_coeffs(&gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+    if ((ret = decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        for (j = 0; j < log2_blocks_per_mb; j++) {
+            ctx->prodsp.idct_put(dst,              dst_stride, block+(0<<6), qmat);
+            ctx->prodsp.idct_put(dst+4*dst_stride, dst_stride, block+(1<<6), qmat);
+            block += 2*64;
+            dst += 8;
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode alpha slice plane.
+ */
+static void decode_slice_alpha(ProresContext *ctx,
+                               uint16_t *dst, int dst_stride,
+                               const uint8_t *buf, int buf_size,
+                               int blocks_per_slice)
+{
+    GetBitContext gb;
+    int i;
+    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+
+    for (i = 0; i < blocks_per_slice<<2; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    if (ctx->alpha_info == 2) {
+        ctx->unpack_alpha(&gb, blocks, blocks_per_slice * 4 * 64, 16);
+    } else {
+        ctx->unpack_alpha(&gb, blocks, blocks_per_slice * 4 * 64, 8);
+    }
+
+    block = blocks;
+
+    for (i = 0; i < 16; i++) {
+        memcpy(dst, block, 16 * blocks_per_slice * sizeof(*dst));
+        dst   += dst_stride >> 1;
+        block += 16 * blocks_per_slice;
+    }
+}
+
+static int decode_slice_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    ProresContext *ctx = avctx->priv_data;
+    SliceContext *slice = &ctx->slices[jobnr];
+    const uint8_t *buf = slice->data;
+    AVFrame *pic = ctx->frame;
+    int i, hdr_size, qscale, log2_chroma_blocks_per_mb;
+    int luma_stride, chroma_stride;
+    int y_data_size, u_data_size, v_data_size, a_data_size;
+    uint8_t *dest_y, *dest_u, *dest_v, *dest_a;
+    LOCAL_ALIGNED_16(int16_t, qmat_luma_scaled,  [64]);
+    LOCAL_ALIGNED_16(int16_t, qmat_chroma_scaled,[64]);
+    int mb_x_shift;
+    int ret;
+    uint16_t val_no_chroma;
+
+    slice->ret = -1;
+    //av_log(avctx, AV_LOG_INFO, "slice %d mb width %d mb x %d y %d\n",
+    //       jobnr, slice->mb_count, slice->mb_x, slice->mb_y);
+
+    // slice header
+    hdr_size = buf[0] >> 3;
+    qscale = av_clip(buf[1], 1, 224);
+    qscale = qscale > 128 ? qscale - 96 << 2: qscale;
+    y_data_size = AV_RB16(buf + 2);
+    u_data_size = AV_RB16(buf + 4);
+    v_data_size = slice->data_size - y_data_size - u_data_size - hdr_size;
+    if (hdr_size > 7) v_data_size = AV_RB16(buf + 6);
+    a_data_size = slice->data_size - y_data_size - u_data_size -
+                  v_data_size - hdr_size;
+
+    if (y_data_size < 0 || u_data_size < 0 || v_data_size < 0
+        || hdr_size+y_data_size+u_data_size+v_data_size > slice->data_size){
+        av_log(avctx, AV_LOG_ERROR, "invalid plane data size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    buf += hdr_size;
+
+    for (i = 0; i < 64; i++) {
+        qmat_luma_scaled  [i] = ctx->qmat_luma  [i] * qscale;
+        qmat_chroma_scaled[i] = ctx->qmat_chroma[i] * qscale;
+    }
+
+    if (ctx->frame_type == 0) {
+        luma_stride   = pic->linesize[0];
+        chroma_stride = pic->linesize[1];
+    } else {
+        luma_stride   = pic->linesize[0] << 1;
+        chroma_stride = pic->linesize[1] << 1;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV444P10 || avctx->pix_fmt == AV_PIX_FMT_YUVA444P10 ||
+        avctx->pix_fmt == AV_PIX_FMT_YUV444P12 || avctx->pix_fmt == AV_PIX_FMT_YUVA444P12) {
+        mb_x_shift = 5;
+        log2_chroma_blocks_per_mb = 2;
+    } else {
+        mb_x_shift = 4;
+        log2_chroma_blocks_per_mb = 1;
+    }
+
+    dest_y = pic->data[0] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5);
+    dest_u = pic->data[1] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_v = pic->data[2] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_a = pic->data[3] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5);
+
+    if (ctx->frame_type && ctx->first_field ^ ctx->frame->top_field_first) {
+        dest_y += pic->linesize[0];
+        dest_u += pic->linesize[1];
+        dest_v += pic->linesize[2];
+        dest_a += pic->linesize[3];
+    }
+
+    ret = decode_slice_luma(avctx, slice, (uint16_t*)dest_y, luma_stride,
+                            buf, y_data_size, qmat_luma_scaled);
+    if (ret < 0)
+        return ret;
+
+    if (!(avctx->flags & AV_CODEC_FLAG_GRAY) && (u_data_size + v_data_size) > 0) {
+        ret = decode_slice_chroma(avctx, slice, (uint16_t*)dest_u, chroma_stride,
+                                  buf + y_data_size, u_data_size,
+                                  qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        if (ret < 0)
+            return ret;
+
+        ret = decode_slice_chroma(avctx, slice, (uint16_t*)dest_v, chroma_stride,
+                                  buf + y_data_size + u_data_size, v_data_size,
+                                  qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        if (ret < 0)
+            return ret;
+    }
+    else {
+        size_t mb_max_x = slice->mb_count << (mb_x_shift - 1);
+        size_t i, j;
+        if (avctx->bits_per_raw_sample == 10) {
+            val_no_chroma = 511;
+        } else { /* 12b */
+            val_no_chroma = 511 * 4;
+        }
+        for (i = 0; i < 16; ++i)
+            for (j = 0; j < mb_max_x; ++j) {
+                *(uint16_t*)(dest_u + (i * chroma_stride) + (j << 1)) = val_no_chroma;
+                *(uint16_t*)(dest_v + (i * chroma_stride) + (j << 1)) = val_no_chroma;
+            }
+    }
+
+    /* decode alpha plane if available */
+    if (ctx->alpha_info && pic->data[3] && a_data_size)
+        decode_slice_alpha(ctx, (uint16_t*)dest_a, luma_stride,
+                           buf + y_data_size + u_data_size + v_data_size,
+                           a_data_size, slice->mb_count);
+
+    slice->ret = 0;
+    return 0;
+}
+
+static int decode_picture(AVCodecContext *avctx)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int i;
+    int error = 0;
+
+    avctx->execute2(avctx, decode_slice_thread, NULL, NULL, ctx->slice_count);
+
+    for (i = 0; i < ctx->slice_count; i++)
+        error += ctx->slices[i].ret < 0;
+
+    if (error)
+        ctx->frame->decode_error_flags = FF_DECODE_ERROR_INVALID_BITSTREAM;
+    if (error < ctx->slice_count)
+        return 0;
+
+    return ctx->slices[0].ret;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    ProresContext *ctx = avctx->priv_data;
+    ThreadFrame tframe = { .f = data };
+    AVFrame *frame = data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    int frame_hdr_size, pic_size, ret;
+
+    if (buf_size < 28 || AV_RL32(buf + 4) != AV_RL32("icpf")) {
+        av_log(avctx, AV_LOG_ERROR, "invalid frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->frame = frame;
+    ctx->frame->pict_type = AV_PICTURE_TYPE_I;
+    ctx->frame->key_frame = 1;
+    ctx->first_field = 1;
+
+    buf += 8;
+    buf_size -= 8;
+
+    frame_hdr_size = decode_frame_header(ctx, buf, buf_size, avctx);
+    if (frame_hdr_size < 0)
+        return frame_hdr_size;
+
+    buf += frame_hdr_size;
+    buf_size -= frame_hdr_size;
+
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
+        return ret;
+
+ decode_picture:
+    pic_size = decode_picture_header(avctx, buf, buf_size);
+    if (pic_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture header\n");
+        return pic_size;
+    }
+
+    if ((ret = decode_picture(avctx)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture\n");
+        return ret;
+    }
+
+    buf += pic_size;
+    buf_size -= pic_size;
+
+    if (ctx->frame_type && buf_size > 0 && ctx->first_field) {
+        ctx->first_field = 0;
+        goto decode_picture;
+    }
+
+    *got_frame      = 1;
+
+    return avpkt->size;
+}
+
+#if HAVE_THREADS
+static int decode_init_thread_copy(AVCodecContext *avctx)
+{
+    ProresContext *ctx = avctx->priv_data;
+
+    ctx->slices = NULL;
+
+    return 0;
+}
+#endif
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    ProresContext *ctx = avctx->priv_data;
+
+    av_freep(&ctx->slices);
+
+    return 0;
+}
+
+AVCodec ff_prores_decoder = {
+    .name           = "prores",
+    .long_name      = NULL_IF_CONFIG_SMALL("ProRes (iCodec Pro)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = decode_init,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(decode_init_thread_copy),
+    .close          = decode_close,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+};
diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c
index f782c90..a3c618c 100644
--- a/libavcodec/proresdsp.c
+++ b/libavcodec/proresdsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,42 +27,71 @@
 #include "proresdsp.h"
 #include "simple_idct.h"
 
-#define BIAS     (1 << (PRORES_BITS_PER_SAMPLE - 1))           ///< bias value for converting signed pixels into unsigned ones
-#define CLIP_MIN (1 << (PRORES_BITS_PER_SAMPLE - 8))           ///< minimum value for clipping resulting pixels
-#define CLIP_MAX (1 << PRORES_BITS_PER_SAMPLE) - CLIP_MIN - 1  ///< maximum value for clipping resulting pixels
+#define CLIP_MIN (1 << 2)                     ///< minimum value for clipping resulting pixels
+#define CLIP_MAX_10 (1 << 10) - CLIP_MIN - 1  ///< maximum value for clipping resulting pixels
+#define CLIP_MAX_12 (1 << 12) - CLIP_MIN - 1  ///< maximum value for clipping resulting pixels
 
-#define CLIP_AND_BIAS(x) (av_clip((x) + BIAS, CLIP_MIN, CLIP_MAX))
+#define CLIP_10(x) (av_clip((x), CLIP_MIN, CLIP_MAX_10))
+#define CLIP_12(x) (av_clip((x), CLIP_MIN, CLIP_MAX_12))
 
 /**
  * Add bias value, clamp and output pixels of a slice
  */
-static void put_pixels(uint16_t *dst, ptrdiff_t linesize, const int16_t *in)
-{
+
+static inline void put_pixel(uint16_t *dst, ptrdiff_t linesize, const int16_t *in, int bits_per_raw_sample) {
     int x, y, src_offset, dst_offset;
 
     for (y = 0, dst_offset = 0; y < 8; y++, dst_offset += linesize) {
         for (x = 0; x < 8; x++) {
             src_offset = (y << 3) + x;
 
-            dst[dst_offset + x] = CLIP_AND_BIAS(in[src_offset]);
+            if (bits_per_raw_sample == 10) {
+                dst[dst_offset + x] = CLIP_10(in[src_offset]);
+            } else {//12b
+                dst[dst_offset + x] = CLIP_12(in[src_offset]);
+            }
         }
     }
 }
 
-static void prores_idct_put_c(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat)
+static void put_pixels_10(uint16_t *dst, ptrdiff_t linesize, const int16_t *in)
+{
+    put_pixel(dst, linesize, in, 10);
+}
+
+static void put_pixels_12(uint16_t *dst, ptrdiff_t linesize, const int16_t *in)
+{
+    put_pixel(dst, linesize, in, 12);
+}
+
+static void prores_idct_put_10_c(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat)
+{
+    ff_prores_idct_10(block, qmat);
+    put_pixels_10(out, linesize >> 1, block);
+}
+
+static void prores_idct_put_12_c(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat)
 {
-    ff_prores_idct(block, qmat);
-    put_pixels(out, linesize >> 1, block);
+    ff_prores_idct_12(block, qmat);
+    put_pixels_12(out, linesize >> 1, block);
 }
 
-av_cold void ff_proresdsp_init(ProresDSPContext *dsp)
+av_cold int ff_proresdsp_init(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
-    dsp->idct_put = prores_idct_put_c;
-    dsp->idct_permutation_type = FF_IDCT_PERM_NONE;
+    if (avctx->bits_per_raw_sample == 10) {
+        dsp->idct_put = prores_idct_put_10_c;
+        dsp->idct_permutation_type = FF_IDCT_PERM_NONE;
+    } else if (avctx->bits_per_raw_sample == 12) {
+        dsp->idct_put = prores_idct_put_12_c;
+        dsp->idct_permutation_type = FF_IDCT_PERM_NONE;
+    } else {
+        return AVERROR_BUG;
+    }
 
     if (ARCH_X86)
-        ff_proresdsp_init_x86(dsp);
+        ff_proresdsp_init_x86(dsp, avctx);
 
     ff_init_scantable_permutation(dsp->idct_permutation,
                                   dsp->idct_permutation_type);
+    return 0;
 }
diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h
index 7f06494..37ba76b 100644
--- a/libavcodec/proresdsp.h
+++ b/libavcodec/proresdsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
-
-#define PRORES_BITS_PER_SAMPLE 10 ///< output precision of prores decoder
+#include "avcodec.h"
 
 typedef struct ProresDSPContext {
     int idct_permutation_type;
@@ -34,8 +33,8 @@ typedef struct ProresDSPContext {
     void (*idct_put)(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat);
 } ProresDSPContext;
 
-void ff_proresdsp_init(ProresDSPContext *dsp);
+int ff_proresdsp_init(ProresDSPContext *dsp, AVCodecContext *avctx);
 
-void ff_proresdsp_init_x86(ProresDSPContext *dsp);
+void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx);
 
 #endif /* AVCODEC_PRORESDSP_H */
diff --git a/libavcodec/proresenc_anatoliy.c b/libavcodec/proresenc_anatoliy.c
new file mode 100644
index 0000000..0fc79fc
--- /dev/null
+++ b/libavcodec/proresenc_anatoliy.c
@@ -0,0 +1,973 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ * Copyright (c) 2012 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Apple ProRes encoder (Anatoliy Wasserman version)
+ * Known FOURCCs: 'ap4h' (444), 'apch' (HQ), 'apcn' (422), 'apcs' (LT), 'acpo' (Proxy)
+ */
+
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "dct.h"
+#include "internal.h"
+#include "profiles.h"
+#include "proresdata.h"
+#include "put_bits.h"
+#include "bytestream.h"
+#include "fdctdsp.h"
+
+#define DEFAULT_SLICE_MB_WIDTH 8
+
+static const AVProfile profiles[] = {
+    { FF_PROFILE_PRORES_PROXY,    "apco"},
+    { FF_PROFILE_PRORES_LT,       "apcs"},
+    { FF_PROFILE_PRORES_STANDARD, "apcn"},
+    { FF_PROFILE_PRORES_HQ,       "apch"},
+    { FF_PROFILE_PRORES_4444,     "ap4h"},
+    { FF_PROFILE_PRORES_XQ,       "ap4x"},
+    { FF_PROFILE_UNKNOWN }
+};
+
+static const int qp_start_table[6] = {  8, 3, 2, 1, 1, 1};
+static const int qp_end_table[6]   = { 13, 9, 6, 6, 5, 4};
+static const int bitrate_table[6]  = { 1000, 2100, 3500, 5400, 7000, 10000};
+
+static const int valid_primaries[9]  = { AVCOL_PRI_RESERVED0, AVCOL_PRI_BT709, AVCOL_PRI_UNSPECIFIED, AVCOL_PRI_BT470BG,
+                                         AVCOL_PRI_SMPTE170M, AVCOL_PRI_BT2020, AVCOL_PRI_SMPTE431, AVCOL_PRI_SMPTE432,INT_MAX };
+static const int valid_trc[4]        = { AVCOL_TRC_RESERVED0, AVCOL_TRC_BT709, AVCOL_TRC_UNSPECIFIED, INT_MAX };
+static const int valid_colorspace[5] = { AVCOL_SPC_BT709, AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_SMPTE170M,
+                                         AVCOL_SPC_BT2020_NCL, INT_MAX };
+
+static const uint8_t QMAT_LUMA[6][64] = {
+    {
+         4,  7,  9, 11, 13, 14, 15, 63,
+         7,  7, 11, 12, 14, 15, 63, 63,
+         9, 11, 13, 14, 15, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    }, {
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41
+    }, {
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21
+    }, {
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7
+    }, { /* 444 */
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  5,
+        4,  4,  4,  4,  4,  4,  5,  5,
+        4,  4,  4,  4,  4,  5,  5,  6,
+        4,  4,  4,  4,  5,  5,  6,  7,
+        4,  4,  4,  4,  5,  6,  7,  7
+    }, { /* 444 XQ */
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  3,
+        2,  2,  2,  2,  2,  2,  3,  3,
+        2,  2,  2,  2,  2,  3,  3,  3,
+        2,  2,  2,  2,  3,  3,  3,  4,
+        2,  2,  2,  2,  3,  3,  4,  4,
+    }
+};
+
+static const uint8_t QMAT_CHROMA[6][64] = {
+    {
+         4,  7,  9, 11, 13, 14, 63, 63,
+         7,  7, 11, 12, 14, 63, 63, 63,
+         9, 11, 13, 14, 63, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    }, {
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41
+    }, {
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21
+    }, {
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7
+    }, { /* 444 */
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  5,
+        4,  4,  4,  4,  4,  4,  5,  5,
+        4,  4,  4,  4,  4,  5,  5,  6,
+        4,  4,  4,  4,  5,  5,  6,  7,
+        4,  4,  4,  4,  5,  6,  7,  7
+    }, { /* 444 xq */
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  4,
+        4,  4,  4,  4,  4,  4,  4,  5,
+        4,  4,  4,  4,  4,  4,  5,  5,
+        4,  4,  4,  4,  4,  5,  5,  6,
+        4,  4,  4,  4,  5,  5,  6,  7,
+        4,  4,  4,  4,  5,  6,  7,  7
+    }
+};
+
+
+typedef struct {
+    AVClass *class;
+    FDCTDSPContext fdsp;
+    uint8_t* fill_y;
+    uint8_t* fill_u;
+    uint8_t* fill_v;
+    uint8_t* fill_a;
+
+    int qmat_luma[16][64];
+    int qmat_chroma[16][64];
+    const uint8_t *scantable;
+
+    int is_422;
+    int need_alpha;
+    int is_interlaced;
+
+    char *vendor;
+} ProresContext;
+
+static void encode_codeword(PutBitContext *pb, int val, int codebook)
+{
+    unsigned int rice_order, exp_order, switch_bits, first_exp, exp, zeros;
+
+    /* number of bits to switch between rice and exp golomb */
+    switch_bits = codebook & 3;
+    rice_order  = codebook >> 5;
+    exp_order   = (codebook >> 2) & 7;
+
+    first_exp = ((switch_bits + 1) << rice_order);
+
+    if (val >= first_exp) { /* exp golomb */
+        val -= first_exp;
+        val += (1 << exp_order);
+        exp = av_log2(val);
+        zeros = exp - exp_order + switch_bits + 1;
+        put_bits(pb, zeros, 0);
+        put_bits(pb, exp + 1, val);
+    } else if (rice_order) {
+        put_bits(pb, (val >> rice_order), 0);
+        put_bits(pb, 1, 1);
+        put_sbits(pb, rice_order, val);
+    } else {
+        put_bits(pb, val, 0);
+        put_bits(pb, 1, 1);
+    }
+}
+
+#define QSCALE(qmat,ind,val) ((val) / ((qmat)[ind]))
+#define TO_GOLOMB(val) (((val) << 1) ^ ((val) >> 31))
+#define DIFF_SIGN(val, sign) (((val) >> 31) ^ (sign))
+#define IS_NEGATIVE(val) ((((val) >> 31) ^ -1) + 1)
+#define TO_GOLOMB2(val,sign) ((val)==0 ? 0 : ((val) << 1) + (sign))
+
+static av_always_inline int get_level(int val)
+{
+    int sign = (val >> 31);
+    return (val ^ sign) - sign;
+}
+
+#define FIRST_DC_CB 0xB8
+
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
+
+static void encode_dc_coeffs(PutBitContext *pb, int16_t *in,
+        int blocks_per_slice, int *qmat)
+{
+    int prev_dc, code;
+    int i, sign, idx;
+    int new_dc, delta, diff_sign, new_code;
+
+    prev_dc = QSCALE(qmat, 0, in[0] - 16384);
+    code = TO_GOLOMB(prev_dc);
+    encode_codeword(pb, code, FIRST_DC_CB);
+
+    code = 5; sign = 0; idx = 64;
+    for (i = 1; i < blocks_per_slice; i++, idx += 64) {
+        new_dc    = QSCALE(qmat, 0, in[idx] - 16384);
+        delta     = new_dc - prev_dc;
+        diff_sign = DIFF_SIGN(delta, sign);
+        new_code  = TO_GOLOMB2(get_level(delta), diff_sign);
+
+        encode_codeword(pb, new_code, dc_codebook[FFMIN(code, 6)]);
+
+        code      = new_code;
+        sign      = delta >> 31;
+        prev_dc   = new_dc;
+    }
+}
+
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29,
+        0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28,
+        0x28, 0x28, 0x28, 0x4C };
+
+static void encode_ac_coeffs(PutBitContext *pb,
+        int16_t *in, int blocks_per_slice, int *qmat, const uint8_t ff_prores_scan[64])
+{
+    int prev_run = 4;
+    int prev_level = 2;
+
+    int run = 0, level, code, i, j;
+    for (i = 1; i < 64; i++) {
+        int indp = ff_prores_scan[i];
+        for (j = 0; j < blocks_per_slice; j++) {
+            int val = QSCALE(qmat, indp, in[(j << 6) + indp]);
+            if (val) {
+                encode_codeword(pb, run, run_to_cb[FFMIN(prev_run, 15)]);
+
+                prev_run   = run;
+                run        = 0;
+                level      = get_level(val);
+                code       = level - 1;
+
+                encode_codeword(pb, code, lev_to_cb[FFMIN(prev_level, 9)]);
+
+                prev_level = level;
+
+                put_bits(pb, 1, IS_NEGATIVE(val));
+            } else {
+                ++run;
+            }
+        }
+    }
+}
+
+static void get(uint8_t *pixels, int stride, int16_t* block)
+{
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        AV_WN64(block, AV_RN64(pixels));
+        AV_WN64(block+4, AV_RN64(pixels+8));
+        pixels += stride;
+        block += 8;
+    }
+}
+
+static void fdct_get(FDCTDSPContext *fdsp, uint8_t *pixels, int stride, int16_t* block)
+{
+    get(pixels, stride, block);
+    fdsp->fdct(block);
+}
+
+static void calc_plane_dct(FDCTDSPContext *fdsp, uint8_t *src, int16_t * blocks, int src_stride, int mb_count, int chroma, int is_422)
+{
+    int16_t *block;
+    int i;
+
+    block = blocks;
+
+    if (!chroma) { /* Luma plane */
+        for (i = 0; i < mb_count; i++) {
+            fdct_get(fdsp, src,                       src_stride, block + (0 << 6));
+            fdct_get(fdsp, src + 16,                  src_stride, block + (1 << 6));
+            fdct_get(fdsp, src +      8 * src_stride, src_stride, block + (2 << 6));
+            fdct_get(fdsp, src + 16 + 8 * src_stride, src_stride, block + (3 << 6));
+
+            block += 256;
+            src   += 32;
+        }
+    } else if (chroma && is_422){ /* chroma plane 422 */
+        for (i = 0; i < mb_count; i++) {
+            fdct_get(fdsp, src,                  src_stride, block + (0 << 6));
+            fdct_get(fdsp, src + 8 * src_stride, src_stride, block + (1 << 6));
+            block += (256 >> 1);
+            src   += (32  >> 1);
+        }
+    } else { /* chroma plane 444 */
+        for (i = 0; i < mb_count; i++) {
+            fdct_get(fdsp, src,                       src_stride, block + (0 << 6));
+            fdct_get(fdsp, src +      8 * src_stride, src_stride, block + (1 << 6));
+            fdct_get(fdsp, src + 16,                  src_stride, block + (2 << 6));
+            fdct_get(fdsp, src + 16 + 8 * src_stride, src_stride, block + (3 << 6));
+
+            block += 256;
+            src   += 32;
+        }
+    }
+}
+
+static int encode_slice_plane(int16_t *blocks, int mb_count, uint8_t *buf, unsigned buf_size, int *qmat, int sub_sample_chroma,
+                              const uint8_t ff_prores_scan[64])
+{
+    int blocks_per_slice;
+    PutBitContext pb;
+
+    blocks_per_slice = mb_count << (2 - sub_sample_chroma);
+    init_put_bits(&pb, buf, buf_size);
+
+    encode_dc_coeffs(&pb, blocks, blocks_per_slice, qmat);
+    encode_ac_coeffs(&pb, blocks, blocks_per_slice, qmat, ff_prores_scan);
+
+    flush_put_bits(&pb);
+    return put_bits_ptr(&pb) - pb.buf;
+}
+
+static av_always_inline unsigned encode_slice_data(AVCodecContext *avctx,
+                                                   int16_t * blocks_y, int16_t * blocks_u, int16_t * blocks_v,
+                                                   unsigned mb_count, uint8_t *buf, unsigned data_size,
+                                                   unsigned* y_data_size, unsigned* u_data_size, unsigned* v_data_size,
+                                                   int qp)
+{
+    ProresContext* ctx = avctx->priv_data;
+
+    *y_data_size = encode_slice_plane(blocks_y, mb_count,
+                                      buf, data_size, ctx->qmat_luma[qp - 1], 0, ctx->scantable);
+
+    if (!(avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        *u_data_size = encode_slice_plane(blocks_u, mb_count, buf + *y_data_size, data_size - *y_data_size,
+                                          ctx->qmat_chroma[qp - 1], ctx->is_422, ctx->scantable);
+
+        *v_data_size = encode_slice_plane(blocks_v, mb_count, buf + *y_data_size + *u_data_size,
+                                          data_size - *y_data_size - *u_data_size,
+                                          ctx->qmat_chroma[qp - 1], ctx->is_422, ctx->scantable);
+    }
+
+    return *y_data_size + *u_data_size + *v_data_size;
+}
+
+static void put_alpha_diff(PutBitContext *pb, int cur, int prev)
+{
+    const int abits = 16;
+    const int dbits = 7;
+    const int dsize = 1 << dbits - 1;
+    int diff = cur - prev;
+
+    diff = av_mod_uintp2(diff, abits);
+    if (diff >= (1 << abits) - dsize)
+        diff -= 1 << abits;
+    if (diff < -dsize || diff > dsize || !diff) {
+        put_bits(pb, 1, 1);
+        put_bits(pb, abits, diff);
+    } else {
+        put_bits(pb, 1, 0);
+        put_bits(pb, dbits - 1, FFABS(diff) - 1);
+        put_bits(pb, 1, diff < 0);
+    }
+}
+
+static inline void put_alpha_run(PutBitContext *pb, int run)
+{
+    if (run) {
+        put_bits(pb, 1, 0);
+        if (run < 0x10)
+            put_bits(pb, 4, run);
+        else
+            put_bits(pb, 15, run);
+    } else {
+        put_bits(pb, 1, 1);
+    }
+}
+
+static av_always_inline int encode_alpha_slice_data(AVCodecContext *avctx, int8_t * src_a,
+                                                   unsigned mb_count, uint8_t *buf, unsigned data_size, unsigned* a_data_size)
+{
+    const int abits = 16;
+    const int mask  = (1 << abits) - 1;
+    const int num_coeffs = mb_count * 256;
+    int prev = mask, cur;
+    int idx = 0;
+    int run = 0;
+    int16_t * blocks = (int16_t *)src_a;
+    PutBitContext pb;
+    init_put_bits(&pb, buf, data_size);
+
+    cur = blocks[idx++];
+    put_alpha_diff(&pb, cur, prev);
+    prev = cur;
+    do {
+        cur = blocks[idx++];
+        if (cur != prev) {
+            put_alpha_run (&pb, run);
+            put_alpha_diff(&pb, cur, prev);
+            prev = cur;
+            run  = 0;
+        } else {
+            run++;
+        }
+    } while (idx < num_coeffs);
+    if (run)
+        put_alpha_run(&pb, run);
+    flush_put_bits(&pb);
+    *a_data_size = put_bits_count(&pb) >> 3;
+
+    if (put_bits_left(&pb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Underestimated required buffer size.\n");
+        return AVERROR_BUG;
+    } else {
+        return 0;
+    }
+}
+
+static inline void subimage_with_fill_template(uint16_t *src, unsigned x, unsigned y,
+                                               unsigned stride, unsigned width, unsigned height, uint16_t *dst,
+                                               unsigned dst_width, unsigned dst_height, int is_alpha_plane,
+                                               int is_interlaced, int is_top_field)
+{
+    int box_width = FFMIN(width - x, dst_width);
+    int i, j, src_stride, box_height;
+    uint16_t last_pix, *last_line;
+
+    if (!is_interlaced) {
+        src_stride = stride >> 1;
+        src += y * src_stride + x;
+        box_height = FFMIN(height - y, dst_height);
+    } else {
+        src_stride = stride; /* 2 lines stride */
+        src += y * src_stride + x;
+        box_height = FFMIN(height/2 - y, dst_height);
+        if (!is_top_field)
+            src += stride >> 1;
+    }
+
+    for (i = 0; i < box_height; ++i) {
+        for (j = 0; j < box_width; ++j) {
+            if (!is_alpha_plane) {
+                dst[j] = src[j];
+            } else {
+                dst[j] = src[j] << 6; /* alpha 10b to 16b */
+            }
+        }
+        if (!is_alpha_plane) {
+            last_pix = dst[j - 1];
+        } else {
+            last_pix = dst[j - 1] << 6; /* alpha 10b to 16b */
+        }
+        for (; j < dst_width; j++)
+            dst[j] = last_pix;
+        src += src_stride;
+        dst += dst_width;
+    }
+    last_line = dst - dst_width;
+    for (; i < dst_height; i++) {
+        for (j = 0; j < dst_width; ++j) {
+            dst[j] = last_line[j];
+        }
+        dst += dst_width;
+    }
+}
+
+static void subimage_with_fill(uint16_t *src, unsigned x, unsigned y,
+        unsigned stride, unsigned width, unsigned height, uint16_t *dst,
+        unsigned dst_width, unsigned dst_height, int is_interlaced, int is_top_field)
+{
+    subimage_with_fill_template(src, x, y, stride, width, height, dst, dst_width, dst_height, 0, is_interlaced, is_top_field);
+}
+
+/* reorganize alpha data and convert 10b -> 16b */
+static void subimage_alpha_with_fill(uint16_t *src, unsigned x, unsigned y,
+                               unsigned stride, unsigned width, unsigned height, uint16_t *dst,
+                               unsigned dst_width, unsigned dst_height, int is_interlaced, int is_top_field)
+{
+    subimage_with_fill_template(src, x, y, stride, width, height, dst, dst_width, dst_height, 1, is_interlaced, is_top_field);
+}
+
+static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, int mb_x,
+        int mb_y, unsigned mb_count, uint8_t *buf, unsigned data_size,
+        int unsafe, int *qp, int is_interlaced, int is_top_field)
+{
+    int luma_stride, chroma_stride, alpha_stride = 0;
+    ProresContext* ctx = avctx->priv_data;
+    int hdr_size = 6 + (ctx->need_alpha * 2); /* v data size is write when there is alpha */
+    int ret = 0, slice_size;
+    uint8_t *dest_y, *dest_u, *dest_v;
+    unsigned y_data_size = 0, u_data_size = 0, v_data_size = 0, a_data_size = 0;
+    FDCTDSPContext *fdsp = &ctx->fdsp;
+    int tgt_bits   = (mb_count * bitrate_table[avctx->profile]) >> 2;
+    int low_bytes  = (tgt_bits - (tgt_bits >> 3)) >> 3; // 12% bitrate fluctuation
+    int high_bytes = (tgt_bits + (tgt_bits >> 3)) >> 3;
+
+    LOCAL_ALIGNED(16, int16_t, blocks_y, [DEFAULT_SLICE_MB_WIDTH << 8]);
+    LOCAL_ALIGNED(16, int16_t, blocks_u, [DEFAULT_SLICE_MB_WIDTH << 8]);
+    LOCAL_ALIGNED(16, int16_t, blocks_v, [DEFAULT_SLICE_MB_WIDTH << 8]);
+
+    luma_stride   = pic->linesize[0];
+    chroma_stride = pic->linesize[1];
+
+    if (ctx->need_alpha)
+        alpha_stride = pic->linesize[3];
+
+    if (!is_interlaced) {
+        dest_y = pic->data[0] + (mb_y << 4) * luma_stride   + (mb_x << 5);
+        dest_u = pic->data[1] + (mb_y << 4) * chroma_stride + (mb_x << (5 - ctx->is_422));
+        dest_v = pic->data[2] + (mb_y << 4) * chroma_stride + (mb_x << (5 - ctx->is_422));
+    } else {
+        dest_y = pic->data[0] + (mb_y << 4) * luma_stride * 2   + (mb_x << 5);
+        dest_u = pic->data[1] + (mb_y << 4) * chroma_stride * 2 + (mb_x << (5 - ctx->is_422));
+        dest_v = pic->data[2] + (mb_y << 4) * chroma_stride * 2 + (mb_x << (5 - ctx->is_422));
+        if (!is_top_field){ /* bottom field, offset dest */
+            dest_y += luma_stride;
+            dest_u += chroma_stride;
+            dest_v += chroma_stride;
+        }
+    }
+
+    if (unsafe) {
+        subimage_with_fill((uint16_t *) pic->data[0], mb_x << 4, mb_y << 4,
+                luma_stride, avctx->width, avctx->height,
+                (uint16_t *) ctx->fill_y, mb_count << 4, 16, is_interlaced, is_top_field);
+        subimage_with_fill((uint16_t *) pic->data[1], mb_x << (4 - ctx->is_422), mb_y << 4,
+                           chroma_stride, avctx->width >> ctx->is_422, avctx->height,
+                           (uint16_t *) ctx->fill_u, mb_count << (4 - ctx->is_422), 16, is_interlaced, is_top_field);
+        subimage_with_fill((uint16_t *) pic->data[2], mb_x << (4 - ctx->is_422), mb_y << 4,
+                           chroma_stride, avctx->width >> ctx->is_422, avctx->height,
+                           (uint16_t *) ctx->fill_v, mb_count << (4 - ctx->is_422), 16, is_interlaced, is_top_field);
+
+        /* no need for interlaced special case, data already reorganized in subimage_with_fill */
+        calc_plane_dct(fdsp, ctx->fill_y, blocks_y, mb_count <<  5,                mb_count, 0, 0);
+        calc_plane_dct(fdsp, ctx->fill_u, blocks_u, mb_count << (5 - ctx->is_422), mb_count, 1, ctx->is_422);
+        calc_plane_dct(fdsp, ctx->fill_v, blocks_v, mb_count << (5 - ctx->is_422), mb_count, 1, ctx->is_422);
+
+        slice_size = encode_slice_data(avctx, blocks_y, blocks_u, blocks_v,
+                          mb_count, buf + hdr_size, data_size - hdr_size,
+                          &y_data_size, &u_data_size, &v_data_size,
+                          *qp);
+    } else {
+        if (!is_interlaced) {
+            calc_plane_dct(fdsp, dest_y, blocks_y, luma_stride, mb_count, 0, 0);
+            calc_plane_dct(fdsp, dest_u, blocks_u, chroma_stride, mb_count, 1, ctx->is_422);
+            calc_plane_dct(fdsp, dest_v, blocks_v, chroma_stride, mb_count, 1, ctx->is_422);
+        } else {
+            calc_plane_dct(fdsp, dest_y, blocks_y, luma_stride   * 2, mb_count, 0, 0);
+            calc_plane_dct(fdsp, dest_u, blocks_u, chroma_stride * 2, mb_count, 1, ctx->is_422);
+            calc_plane_dct(fdsp, dest_v, blocks_v, chroma_stride * 2, mb_count, 1, ctx->is_422);
+        }
+
+        slice_size = encode_slice_data(avctx, blocks_y, blocks_u, blocks_v,
+                          mb_count, buf + hdr_size, data_size - hdr_size,
+                          &y_data_size, &u_data_size, &v_data_size,
+                          *qp);
+
+        if (slice_size > high_bytes && *qp < qp_end_table[avctx->profile]) {
+            do {
+                *qp += 1;
+                slice_size = encode_slice_data(avctx, blocks_y, blocks_u, blocks_v,
+                                               mb_count, buf + hdr_size, data_size - hdr_size,
+                                               &y_data_size, &u_data_size, &v_data_size,
+                                               *qp);
+            } while (slice_size > high_bytes && *qp < qp_end_table[avctx->profile]);
+        } else if (slice_size < low_bytes && *qp
+                > qp_start_table[avctx->profile]) {
+            do {
+                *qp -= 1;
+                slice_size = encode_slice_data(avctx, blocks_y, blocks_u, blocks_v,
+                                               mb_count, buf + hdr_size, data_size - hdr_size,
+                                               &y_data_size, &u_data_size, &v_data_size,
+                                               *qp);
+            } while (slice_size < low_bytes && *qp > qp_start_table[avctx->profile]);
+        }
+    }
+
+    buf[0] = hdr_size << 3;
+    buf[1] = *qp;
+    AV_WB16(buf + 2, y_data_size);
+    AV_WB16(buf + 4, u_data_size);
+
+    if (ctx->need_alpha) {
+        AV_WB16(buf + 6, v_data_size); /* write v data size only if there is alpha */
+
+        subimage_alpha_with_fill((uint16_t *) pic->data[3], mb_x << 4, mb_y << 4,
+                           alpha_stride, avctx->width, avctx->height,
+                           (uint16_t *) ctx->fill_a, mb_count << 4, 16, is_interlaced, is_top_field);
+        ret = encode_alpha_slice_data(avctx, ctx->fill_a, mb_count,
+                                      buf + hdr_size + slice_size,
+                                      data_size - hdr_size - slice_size, &a_data_size);
+    }
+
+    if (ret != 0) {
+        return ret;
+    }
+    return hdr_size + y_data_size + u_data_size + v_data_size + a_data_size;
+}
+
+static int prores_encode_picture(AVCodecContext *avctx, const AVFrame *pic,
+        uint8_t *buf, const int buf_size, const int picture_index, const int is_top_field)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int mb_width = (avctx->width + 15) >> 4;
+    int hdr_size, sl_size, i;
+    int mb_y, sl_data_size, qp, mb_height, picture_height, unsafe_mb_height_limit;
+    int unsafe_bot, unsafe_right;
+    uint8_t *sl_data, *sl_data_sizes;
+    int slice_per_line = 0, rem = mb_width;
+
+    if (!ctx->is_interlaced) { /* progressive encoding */
+        mb_height = (avctx->height + 15) >> 4;
+        unsafe_mb_height_limit = mb_height;
+    } else {
+        if (is_top_field) {
+            picture_height = (avctx->height + 1) / 2;
+        } else {
+            picture_height = avctx->height / 2;
+        }
+        mb_height = (picture_height + 15) >> 4;
+        unsafe_mb_height_limit = mb_height;
+    }
+
+    for (i = av_log2(DEFAULT_SLICE_MB_WIDTH); i >= 0; --i) {
+        slice_per_line += rem >> i;
+        rem &= (1 << i) - 1;
+    }
+
+    qp = qp_start_table[avctx->profile];
+    hdr_size = 8; sl_data_size = buf_size - hdr_size;
+    sl_data_sizes = buf + hdr_size;
+    sl_data = sl_data_sizes + (slice_per_line * mb_height * 2);
+    for (mb_y = 0; mb_y < mb_height; mb_y++) {
+        int mb_x = 0;
+        int slice_mb_count = DEFAULT_SLICE_MB_WIDTH;
+        while (mb_x < mb_width) {
+            while (mb_width - mb_x < slice_mb_count)
+                slice_mb_count >>= 1;
+
+            unsafe_bot = (avctx->height & 0xf) && (mb_y == unsafe_mb_height_limit - 1);
+            unsafe_right = (avctx->width & 0xf) && (mb_x + slice_mb_count == mb_width);
+
+            sl_size = encode_slice(avctx, pic, mb_x, mb_y, slice_mb_count,
+                    sl_data, sl_data_size, unsafe_bot || unsafe_right, &qp, ctx->is_interlaced, is_top_field);
+            if (sl_size < 0){
+                return sl_size;
+            }
+
+            bytestream_put_be16(&sl_data_sizes, sl_size);
+            sl_data           += sl_size;
+            sl_data_size      -= sl_size;
+            mb_x              += slice_mb_count;
+        }
+    }
+
+    buf[0] = hdr_size << 3;
+    AV_WB32(buf + 1, sl_data - buf);
+    AV_WB16(buf + 5, slice_per_line * mb_height); /* picture size */
+    buf[7] = av_log2(DEFAULT_SLICE_MB_WIDTH) << 4; /* number of slices */
+
+    return sl_data - buf;
+}
+
+static int prores_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                               const AVFrame *pict, int *got_packet)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int header_size = 148;
+    uint8_t *buf;
+    int compress_frame_size, pic_size, ret, is_top_field_first = 0;
+    uint8_t frame_flags;
+    int frame_size = FFALIGN(avctx->width, 16) * FFALIGN(avctx->height, 16)*16 + 500 + AV_INPUT_BUFFER_MIN_SIZE; //FIXME choose tighter limit
+
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, frame_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    buf = pkt->data;
+    compress_frame_size = 8 + header_size;
+
+    bytestream_put_be32(&buf, compress_frame_size);/* frame size will be update after picture(s) encoding */
+    bytestream_put_buffer(&buf, "icpf", 4);
+
+    bytestream_put_be16(&buf, header_size);
+    bytestream_put_be16(&buf, 0); /* version */
+    bytestream_put_buffer(&buf, ctx->vendor, 4);
+    bytestream_put_be16(&buf, avctx->width);
+    bytestream_put_be16(&buf, avctx->height);
+    frame_flags = 0x82; /* 422 not interlaced */
+    if (avctx->profile >= FF_PROFILE_PRORES_4444) /* 4444 or 4444 Xq */
+        frame_flags |= 0x40; /* 444 chroma */
+    if (ctx->is_interlaced) {
+        if (pict->top_field_first || !pict->interlaced_frame) { /* tff frame or progressive frame interpret as tff */
+            av_log(avctx, AV_LOG_DEBUG, "use interlaced encoding, top field first\n");
+            frame_flags |= 0x04; /* interlaced tff */
+            is_top_field_first = 1;
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "use interlaced encoding, bottom field first\n");
+            frame_flags |= 0x08; /* interlaced bff */
+        }
+    } else {
+        av_log(avctx, AV_LOG_DEBUG, "use progressive encoding\n");
+    }
+    *buf++ = frame_flags;
+    *buf++ = 0; /* reserved */
+    /* only write color properties, if valid value. set to unspecified otherwise */
+    *buf++ = ff_int_from_list_or_default(avctx, "frame color primaries", pict->color_primaries, valid_primaries, 0);
+    *buf++ = ff_int_from_list_or_default(avctx, "frame color trc", pict->color_trc, valid_trc, 0);
+    *buf++ = ff_int_from_list_or_default(avctx, "frame colorspace", pict->colorspace, valid_colorspace, 0);
+    if (avctx->profile >= FF_PROFILE_PRORES_4444) {
+        if (avctx->pix_fmt == AV_PIX_FMT_YUV444P10) {
+            *buf++ = 0xA0;/* src b64a and no alpha */
+        } else {
+            *buf++ = 0xA2;/* src b64a and 16b alpha */
+        }
+    } else {
+        *buf++ = 32;/* src v210 and no alpha */
+    }
+    *buf++ = 0; /* reserved */
+    *buf++ = 3; /* luma and chroma matrix present */
+
+    bytestream_put_buffer(&buf, QMAT_LUMA[avctx->profile],   64);
+    bytestream_put_buffer(&buf, QMAT_CHROMA[avctx->profile], 64);
+
+    pic_size = prores_encode_picture(avctx, pict, buf,
+                                     pkt->size - compress_frame_size, 0, is_top_field_first);/* encode progressive or first field */
+    if (pic_size < 0) {
+        return pic_size;
+    }
+    compress_frame_size += pic_size;
+
+    if (ctx->is_interlaced) { /* encode second field */
+        pic_size = prores_encode_picture(avctx, pict, pkt->data + compress_frame_size,
+                                         pkt->size - compress_frame_size, 1, !is_top_field_first);
+        if (pic_size < 0) {
+            return pic_size;
+        }
+        compress_frame_size += pic_size;
+    }
+
+    AV_WB32(pkt->data, compress_frame_size);/* update frame size */
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    pkt->size = compress_frame_size;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static void scale_mat(const uint8_t* src, int* dst, int scale)
+{
+    int i;
+    for (i = 0; i < 64; i++)
+        dst[i] = src[i] * scale;
+}
+
+static av_cold int prores_encode_init(AVCodecContext *avctx)
+{
+    int i;
+    ProresContext* ctx = avctx->priv_data;
+
+    avctx->bits_per_raw_sample = 10;
+    ctx->need_alpha = 0;
+    ctx->is_interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+    if (ctx->is_interlaced) {
+        ctx->scantable = ff_prores_interlaced_scan;
+    } else {
+        ctx->scantable = ff_prores_progressive_scan;
+    }
+
+    if (avctx->width & 0x1) {
+        av_log(avctx, AV_LOG_ERROR,
+                "frame width needs to be multiple of 2\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->width > 65534 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR,
+                "The maximum dimensions are 65534x65535\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (strlen(ctx->vendor) != 4) {
+        av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->profile == FF_PROFILE_UNKNOWN) {
+        if (avctx->pix_fmt == AV_PIX_FMT_YUV422P10) {
+            avctx->profile = FF_PROFILE_PRORES_STANDARD;
+            av_log(avctx, AV_LOG_INFO,
+                "encoding with ProRes standard (apcn) profile\n");
+        } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P10) {
+            avctx->profile = FF_PROFILE_PRORES_4444;
+            av_log(avctx, AV_LOG_INFO,
+                   "encoding with ProRes 4444 (ap4h) profile\n");
+        } else if (avctx->pix_fmt == AV_PIX_FMT_YUVA444P10) {
+            avctx->profile = FF_PROFILE_PRORES_4444;
+            av_log(avctx, AV_LOG_INFO,
+                   "encoding with ProRes 4444+ (ap4h) profile\n");
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Unknown pixel format\n");
+            return AVERROR(EINVAL);
+        }
+    } else if (avctx->profile < FF_PROFILE_PRORES_PROXY
+            || avctx->profile > FF_PROFILE_PRORES_XQ) {
+        av_log(
+                avctx,
+                AV_LOG_ERROR,
+                "unknown profile %d, use [0 - apco, 1 - apcs, 2 - apcn (default), 3 - apch, 4 - ap4h, 5 - ap4x]\n",
+                avctx->profile);
+        return AVERROR(EINVAL);
+    } else if ((avctx->pix_fmt == AV_PIX_FMT_YUV422P10) && (avctx->profile > FF_PROFILE_PRORES_HQ)){
+        av_log(avctx, AV_LOG_ERROR,
+               "encoding with ProRes 444/Xq (ap4h/ap4x) profile, need YUV444P10 input\n");
+        return AVERROR(EINVAL);
+    }  else if ((avctx->pix_fmt == AV_PIX_FMT_YUV444P10 || avctx->pix_fmt == AV_PIX_FMT_YUVA444P10)
+                && (avctx->profile < FF_PROFILE_PRORES_4444)){
+        av_log(avctx, AV_LOG_ERROR,
+               "encoding with ProRes Proxy/LT/422/422 HQ (apco, apcs, apcn, ap4h) profile, need YUV422P10 input\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->profile < FF_PROFILE_PRORES_4444) { /* 422 versions */
+        ctx->is_422 = 1;
+        if ((avctx->height & 0xf) || (avctx->width & 0xf)) {
+            ctx->fill_y = av_malloc(4 * (DEFAULT_SLICE_MB_WIDTH << 8));
+            if (!ctx->fill_y)
+                return AVERROR(ENOMEM);
+            ctx->fill_u = ctx->fill_y + (DEFAULT_SLICE_MB_WIDTH << 9);
+            ctx->fill_v = ctx->fill_u + (DEFAULT_SLICE_MB_WIDTH << 8);
+        }
+    } else { /* 444 */
+        ctx->is_422 = 0;
+        if ((avctx->height & 0xf) || (avctx->width & 0xf)) {
+            ctx->fill_y = av_malloc(3 * (DEFAULT_SLICE_MB_WIDTH << 9));
+            if (!ctx->fill_y)
+                return AVERROR(ENOMEM);
+            ctx->fill_u = ctx->fill_y + (DEFAULT_SLICE_MB_WIDTH << 9);
+            ctx->fill_v = ctx->fill_u + (DEFAULT_SLICE_MB_WIDTH << 9);
+        }
+        if (avctx->pix_fmt == AV_PIX_FMT_YUVA444P10) {
+            ctx->need_alpha = 1;
+            ctx->fill_a = av_malloc(DEFAULT_SLICE_MB_WIDTH << 9); /* 8 blocks x 16px x 16px x sizeof (uint16) */
+            if (!ctx->fill_a)
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    ff_fdctdsp_init(&ctx->fdsp, avctx);
+
+    avctx->codec_tag = AV_RL32((const uint8_t*)profiles[avctx->profile].name);
+
+    for (i = 1; i <= 16; i++) {
+        scale_mat(QMAT_LUMA[avctx->profile]  , ctx->qmat_luma[i - 1]  , i);
+        scale_mat(QMAT_CHROMA[avctx->profile], ctx->qmat_chroma[i - 1], i);
+    }
+
+    return 0;
+}
+
+static av_cold int prores_encode_close(AVCodecContext *avctx)
+{
+    ProresContext* ctx = avctx->priv_data;
+    av_freep(&ctx->fill_y);
+    av_freep(&ctx->fill_a);
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(ProresContext, x)
+#define VE     AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+static const AVOption options[] = {
+    { "vendor", "vendor ID", OFFSET(vendor), AV_OPT_TYPE_STRING, { .str = "fmpg" }, CHAR_MIN, CHAR_MAX, VE },
+    { NULL }
+};
+
+static const AVClass proresaw_enc_class = {
+    .class_name = "ProResAw encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass prores_enc_class = {
+    .class_name = "ProRes encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_prores_aw_encoder = {
+    .name           = "prores_aw",
+    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = prores_encode_init,
+    .close          = prores_encode_close,
+    .encode2        = prores_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_NONE},
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &proresaw_enc_class,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+};
+
+AVCodec ff_prores_encoder = {
+    .name           = "prores",
+    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = prores_encode_init,
+    .close          = prores_encode_close,
+    .encode2        = prores_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_NONE},
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &prores_enc_class,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+};
diff --git a/libavcodec/proresenc.c b/libavcodec/proresenc_kostya.c
index e4842d2..e045a97 100644
--- a/libavcodec/proresenc.c
+++ b/libavcodec/proresenc_kostya.c
@@ -3,20 +3,23 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This encoder appears to be based on Anatoliy Wassermans considering
+ * similarities in the bugs.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +28,7 @@
 #include "avcodec.h"
 #include "fdctdsp.h"
 #include "put_bits.h"
+#include "profiles.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "proresdata.h"
@@ -37,18 +41,22 @@
 #define MAX_PLANES 4
 
 enum {
+    PRORES_PROFILE_AUTO  = -1,
     PRORES_PROFILE_PROXY = 0,
     PRORES_PROFILE_LT,
     PRORES_PROFILE_STANDARD,
     PRORES_PROFILE_HQ,
     PRORES_PROFILE_4444,
+    PRORES_PROFILE_4444XQ,
 };
 
 enum {
     QUANT_MAT_PROXY = 0,
+    QUANT_MAT_PROXY_CHROMA,
     QUANT_MAT_LT,
     QUANT_MAT_STANDARD,
     QUANT_MAT_HQ,
+    QUANT_MAT_XQ_LUMA,
     QUANT_MAT_DEFAULT,
 };
 
@@ -63,6 +71,16 @@ static const uint8_t prores_quant_matrices[][64] = {
         13, 63, 63, 63, 63, 63, 63, 63,
         63, 63, 63, 63, 63, 63, 63, 63,
     },
+    { // proxy chromas
+        4,  7,  9, 11, 13, 14, 63, 63,
+        7,  7, 11, 12, 14, 63, 63, 63,
+        9, 11, 13, 14, 63, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    },
     { // LT
          4,  5,  6,  7,  9, 11, 13, 15,
          5,  5,  7,  8, 11, 13, 15, 17,
@@ -93,6 +111,16 @@ static const uint8_t prores_quant_matrices[][64] = {
          4,  4,  4,  4,  5,  5,  6,  7,
          4,  4,  4,  4,  5,  6,  7,  7,
     },
+    { // XQ luma
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  2,
+        2,  2,  2,  2,  2,  2,  2,  3,
+        2,  2,  2,  2,  2,  2,  3,  3,
+        2,  2,  2,  2,  2,  3,  3,  3,
+        2,  2,  2,  2,  3,  3,  3,  4,
+        2,  2,  2,  2,  3,  3,  4,  4,
+    },
     { // codec default
          4,  4,  4,  4,  4,  4,  4,  4,
          4,  4,  4,  4,  4,  4,  4,  4,
@@ -120,7 +148,8 @@ static const struct prores_profile {
     int         max_quant;
     int         br_tab[NUM_MB_LIMITS];
     int         quant;
-} prores_profile_info[5] = {
+    int         quant_chroma;
+} prores_profile_info[6] = {
     {
         .full_name = "proxy",
         .tag       = MKTAG('a', 'p', 'c', 'o'),
@@ -128,6 +157,7 @@ static const struct prores_profile {
         .max_quant = 8,
         .br_tab    = { 300, 242, 220, 194 },
         .quant     = QUANT_MAT_PROXY,
+        .quant_chroma = QUANT_MAT_PROXY_CHROMA,
     },
     {
         .full_name = "LT",
@@ -136,6 +166,7 @@ static const struct prores_profile {
         .max_quant = 9,
         .br_tab    = { 720, 560, 490, 440 },
         .quant     = QUANT_MAT_LT,
+        .quant_chroma = QUANT_MAT_LT,
     },
     {
         .full_name = "standard",
@@ -144,6 +175,7 @@ static const struct prores_profile {
         .max_quant = 6,
         .br_tab    = { 1050, 808, 710, 632 },
         .quant     = QUANT_MAT_STANDARD,
+        .quant_chroma = QUANT_MAT_STANDARD,
     },
     {
         .full_name = "high quality",
@@ -152,6 +184,7 @@ static const struct prores_profile {
         .max_quant = 6,
         .br_tab    = { 1566, 1216, 1070, 950 },
         .quant     = QUANT_MAT_HQ,
+        .quant_chroma = QUANT_MAT_HQ,
     },
     {
         .full_name = "4444",
@@ -160,6 +193,16 @@ static const struct prores_profile {
         .max_quant = 6,
         .br_tab    = { 2350, 1828, 1600, 1425 },
         .quant     = QUANT_MAT_HQ,
+        .quant_chroma = QUANT_MAT_HQ,
+    },
+    {
+        .full_name = "4444XQ",
+        .tag       = MKTAG('a', 'p', '4', 'x'),
+        .min_quant = 1,
+        .max_quant = 6,
+        .br_tab    = { 3525, 2742, 2400, 2137 },
+        .quant     = QUANT_MAT_HQ, /* Fix me : use QUANT_MAT_XQ_LUMA */
+        .quant_chroma = QUANT_MAT_HQ,
     }
 };
 
@@ -179,6 +222,7 @@ typedef struct ProresThreadData {
     DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16];
     int16_t custom_q[64];
+    int16_t custom_chroma_q[64];
     struct TrellisNode *nodes;
 } ProresThreadData;
 
@@ -187,8 +231,11 @@ typedef struct ProresContext {
     DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
     DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
     int16_t quants[MAX_STORED_Q][64];
+    int16_t quants_chroma[MAX_STORED_Q][64];
     int16_t custom_q[64];
+    int16_t custom_chroma_q[64];
     const uint8_t *quant_mat;
+    const uint8_t *quant_chroma_mat;
     const uint8_t *scantable;
 
     void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
@@ -354,7 +401,7 @@ static inline void encode_vlc_codeword(PutBitContext *pb, unsigned codebook, int
 }
 
 #define GET_SIGN(x)  ((x) >> 31)
-#define MAKE_CODE(x) (((x) << 1) ^ GET_SIGN(x))
+#define MAKE_CODE(x) ((((x)) * 2) ^ GET_SIGN(x))
 
 static void encode_dcs(PutBitContext *pb, int16_t *blocks,
                        int blocks_per_slice, int scale)
@@ -437,12 +484,11 @@ static int encode_slice_plane(ProresContext *ctx, PutBitContext *pb,
 
 static void put_alpha_diff(PutBitContext *pb, int cur, int prev, int abits)
 {
-    const int mask  = (1 << abits) - 1;
     const int dbits = (abits == 8) ? 4 : 7;
     const int dsize = 1 << dbits - 1;
     int diff = cur - prev;
 
-    diff &= mask;
+    diff = av_mod_uintp2(diff, abits);
     if (diff >= (1 << abits) - dsize)
         diff -= 1 << abits;
     if (diff < -dsize || diff > dsize || !diff) {
@@ -515,6 +561,7 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
     ptrdiff_t linesize;
     int plane_factor, is_chroma;
     uint16_t *qmat;
+    uint16_t *qmat_chroma;
 
     if (ctx->pictures_per_frame == 1)
         line_add = 0;
@@ -523,12 +570,17 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
 
     if (ctx->force_quant) {
         qmat = ctx->quants[0];
+        qmat_chroma = ctx->quants_chroma[0];
     } else if (quant < MAX_STORED_Q) {
         qmat = ctx->quants[quant];
+        qmat_chroma = ctx->quants_chroma[quant];
     } else {
         qmat = ctx->custom_q;
-        for (i = 0; i < 64; i++)
+        qmat_chroma = ctx->custom_chroma_q;
+        for (i = 0; i < 64; i++) {
             qmat[i] = ctx->quant_mat[i] * quant;
+            qmat_chroma[i] = ctx->quant_chroma_mat[i] * quant;
+        }
     }
 
     for (i = 0; i < ctx->num_planes; i++) {
@@ -557,10 +609,17 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
                            pwidth, avctx->height / ctx->pictures_per_frame,
                            ctx->blocks[0], ctx->emu_buf,
                            mbs_per_slice, num_cblocks, is_chroma);
-            sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
-                                          mbs_per_slice, ctx->blocks[0],
-                                          num_cblocks, plane_factor,
-                                          qmat);
+            if (!is_chroma) {/* luma quant */
+                sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
+                                              mbs_per_slice, ctx->blocks[0],
+                                              num_cblocks, plane_factor,
+                                              qmat);
+            } else { /* chroma plane */
+                sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
+                                              mbs_per_slice, ctx->blocks[0],
+                                              num_cblocks, plane_factor,
+                                              qmat_chroma);
+            }
         } else {
             get_alpha_data(ctx, src, linesize, xp, yp,
                            pwidth, avctx->height / ctx->pictures_per_frame,
@@ -687,12 +746,11 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane,
 
 static int est_alpha_diff(int cur, int prev, int abits)
 {
-    const int mask  = (1 << abits) - 1;
     const int dbits = (abits == 8) ? 4 : 7;
     const int dsize = 1 << dbits - 1;
     int diff = cur - prev;
 
-    diff &= mask;
+    diff = av_mod_uintp2(diff, abits);
     if (diff >= (1 << abits) - dsize)
         diff -= 1 << abits;
     if (diff < -dsize || diff > dsize || !diff)
@@ -701,10 +759,9 @@ static int est_alpha_diff(int cur, int prev, int abits)
         return dbits + 1;
 }
 
-static int estimate_alpha_plane(ProresContext *ctx, int *error,
+static int estimate_alpha_plane(ProresContext *ctx,
                                 const uint16_t *src, ptrdiff_t linesize,
-                                int mbs_per_slice, int quant,
-                                int16_t *blocks)
+                                int mbs_per_slice, int16_t *blocks)
 {
     const int abits = ctx->alpha_bits;
     const int mask  = (1 << abits) - 1;
@@ -714,7 +771,6 @@ static int estimate_alpha_plane(ProresContext *ctx, int *error,
     int run = 0;
     int bits;
 
-    *error = 0;
     cur = blocks[idx++];
     bits = est_alpha_diff(cur, prev, abits);
     prev = cur;
@@ -762,7 +818,9 @@ static int find_slice_quant(AVCodecContext *avctx,
     int slice_bits[TRELLIS_WIDTH], slice_score[TRELLIS_WIDTH];
     int overquant;
     uint16_t *qmat;
+    uint16_t *qmat_chroma;
     int linesize[4], line_add;
+    int alpha_bits = 0;
 
     if (ctx->pictures_per_frame == 1)
         line_add = 0;
@@ -808,20 +866,25 @@ static int find_slice_quant(AVCodecContext *avctx,
         td->nodes[trellis_node + q].quant     = q;
     }
 
+    if (ctx->alpha_bits)
+        alpha_bits = estimate_alpha_plane(ctx, src, linesize[3],
+                                          mbs_per_slice, td->blocks[3]);
     // todo: maybe perform coarser quantising to fit into frame size when needed
     for (q = min_quant; q <= max_quant; q++) {
-        bits  = 0;
+        bits  = alpha_bits;
         error = 0;
-        for (i = 0; i < ctx->num_planes - !!ctx->alpha_bits; i++) {
+        bits += estimate_slice_plane(ctx, &error, 0,
+                                     src, linesize[0],
+                                     mbs_per_slice,
+                                     num_cblocks[0], plane_factor[0],
+                                     ctx->quants[q], td); /* estimate luma plane */
+        for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */
             bits += estimate_slice_plane(ctx, &error, i,
                                          src, linesize[i],
                                          mbs_per_slice,
                                          num_cblocks[i], plane_factor[i],
-                                         ctx->quants[q], td);
+                                         ctx->quants_chroma[q], td);
         }
-        if (ctx->alpha_bits)
-            bits += estimate_alpha_plane(ctx, &error, src, linesize[3],
-                                         mbs_per_slice, q, td->blocks[3]);
         if (bits > 65000 * 8)
             error = SCORE_LIMIT;
 
@@ -834,25 +897,31 @@ static int find_slice_quant(AVCodecContext *avctx,
         overquant = max_quant;
     } else {
         for (q = max_quant + 1; q < 128; q++) {
-            bits  = 0;
+            bits  = alpha_bits;
             error = 0;
             if (q < MAX_STORED_Q) {
                 qmat = ctx->quants[q];
+                qmat_chroma = ctx->quants_chroma[q];
             } else {
                 qmat = td->custom_q;
-                for (i = 0; i < 64; i++)
+                qmat_chroma = td->custom_chroma_q;
+                for (i = 0; i < 64; i++) {
                     qmat[i] = ctx->quant_mat[i] * q;
+                    qmat_chroma[i] = ctx->quant_chroma_mat[i] * q;
+                }
             }
-            for (i = 0; i < ctx->num_planes - !!ctx->alpha_bits; i++) {
+            bits += estimate_slice_plane(ctx, &error, 0,
+                                         src, linesize[0],
+                                         mbs_per_slice,
+                                         num_cblocks[0], plane_factor[0],
+                                         qmat, td);/* estimate luma plane */
+            for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */
                 bits += estimate_slice_plane(ctx, &error, i,
                                              src, linesize[i],
                                              mbs_per_slice,
                                              num_cblocks[i], plane_factor[i],
-                                             qmat, td);
+                                             qmat_chroma, td);
             }
-            if (ctx->alpha_bits)
-                bits += estimate_alpha_plane(ctx, &error, src, linesize[3],
-                                             mbs_per_slice, q, td->blocks[3]);
             if (bits <= ctx->bits_per_mb * mbs_per_slice)
                 break;
         }
@@ -936,23 +1005,15 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int sizes[4] = { 0 };
     int slice_hdr_size = 2 + 2 * (ctx->num_planes - 1);
     int frame_size, picture_size, slice_size;
-    int pkt_size, ret, max_slice_size = 0;
+    int pkt_size, ret;
+    int max_slice_size = (ctx->frame_size_upper_bound - 200) / (ctx->pictures_per_frame * ctx->slices_per_picture + 1);
     uint8_t frame_flags;
 
     ctx->pic = pic;
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
     pkt_size = ctx->frame_size_upper_bound;
 
-    if ((ret = ff_alloc_packet(pkt, pkt_size + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     orig_buf = pkt->data;
 
@@ -975,9 +1036,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     bytestream_put_byte  (&buf, frame_flags);
 
     bytestream_put_byte  (&buf, 0);             // reserved
-    bytestream_put_byte  (&buf, avctx->color_primaries);
-    bytestream_put_byte  (&buf, avctx->color_trc);
-    bytestream_put_byte  (&buf, avctx->colorspace);
+    bytestream_put_byte  (&buf, pic->color_primaries);
+    bytestream_put_byte  (&buf, pic->color_trc);
+    bytestream_put_byte  (&buf, pic->colorspace);
     bytestream_put_byte  (&buf, 0x40 | (ctx->alpha_bits >> 3));
     bytestream_put_byte  (&buf, 0);             // reserved
     if (ctx->quant_sel != QUANT_MAT_DEFAULT) {
@@ -1009,7 +1070,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         // slices
         if (!ctx->force_quant) {
-            ret = avctx->execute2(avctx, find_quant_thread, NULL, NULL,
+            ret = avctx->execute2(avctx, find_quant_thread, (void*)pic, NULL,
                                   ctx->mb_height);
             if (ret)
                 return ret;
@@ -1031,9 +1092,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     uint8_t *start = pkt->data;
                     // Recompute new size according to max_slice_size
                     // and deduce delta
-                    int delta = 200 + ctx->pictures_per_frame *
-                                ctx->slices_per_picture * max_slice_size -
-                                pkt_size;
+                    int delta = 200 + (ctx->pictures_per_frame *
+                                ctx->slices_per_picture + 1) *
+                                max_slice_size - pkt_size;
 
                     delta = FFMAX(delta, 2 * max_slice_size);
                     ctx->frame_size_upper_bound += delta;
@@ -1060,7 +1121,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     slice_hdr        = pkt->data + (slice_hdr        - start);
                     tmp              = pkt->data + (tmp              - start);
                 }
-                init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)) * 8);
+                init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)));
                 ret = encode_slice(avctx, pic, &pb, sizes, x, y, q,
                                    mbs_per_slice);
                 if (ret < 0)
@@ -1079,10 +1140,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
         }
 
-        if (ctx->pictures_per_frame == 1)
-            picture_size = buf - picture_size_pos - 6;
-        else
-            picture_size = buf - picture_size_pos + 1;
+        picture_size = buf - (picture_size_pos - 1);
         bytestream_put_be32(&picture_size_pos, picture_size);
     }
 
@@ -1104,7 +1162,7 @@ static av_cold int encode_close(AVCodecContext *avctx)
 
     if (ctx->tdata) {
         for (i = 0; i < avctx->thread_count; i++)
-            av_free(ctx->tdata[i].nodes);
+            av_freep(&ctx->tdata[i].nodes);
     }
     av_freep(&ctx->tdata);
     av_freep(&ctx->slice_q);
@@ -1135,6 +1193,12 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
 
     avctx->bits_per_raw_sample = 10;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+    avctx->coded_frame->key_frame = 1;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
     ctx->fdct      = prores_fdct;
     ctx->scantable = interlaced ? ff_prores_interlaced_scan
@@ -1147,7 +1211,24 @@ static av_cold int encode_init(AVCodecContext *avctx)
                "there should be an integer power of two MBs per slice\n");
         return AVERROR(EINVAL);
     }
+    if (ctx->profile == PRORES_PROFILE_AUTO) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
+                        !(desc->log2_chroma_w + desc->log2_chroma_h))
+                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
+        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
+               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
+               ? "4:4:4:4 profile because of the used input colorspace"
+               : "HQ profile to keep best quality");
+    }
     if (av_pix_fmt_desc_get(avctx->pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        if (ctx->profile != PRORES_PROFILE_4444 &&
+            ctx->profile != PRORES_PROFILE_4444XQ) {
+            // force alpha and warn
+            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
+                   "encode alpha. Override with -profile if needed.\n");
+            ctx->alpha_bits = 0;
+        }
         if (ctx->alpha_bits & 7) {
             av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
             return AVERROR(EINVAL);
@@ -1175,10 +1256,13 @@ static av_cold int encode_init(AVCodecContext *avctx)
     ctx->slices_per_picture = ctx->mb_height * ctx->slices_width;
     ctx->pictures_per_frame = 1 + interlaced;
 
-    if (ctx->quant_sel == -1)
+    if (ctx->quant_sel == -1) {
         ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
-    else
+        ctx->quant_chroma_mat = prores_quant_matrices[ctx->profile_info->quant_chroma];
+    } else {
         ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
+        ctx->quant_chroma_mat = prores_quant_matrices[ctx->quant_sel];
+    }
 
     if (strlen(ctx->vendor) != 4) {
         av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
@@ -1193,6 +1277,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
                                            ctx->pictures_per_frame)
                     break;
             ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
+            if (ctx->alpha_bits)
+                ctx->bits_per_mb *= 20;
         } else if (ctx->bits_per_mb < 128) {
             av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
             return AVERROR_INVALIDDATA;
@@ -1201,8 +1287,10 @@ static av_cold int encode_init(AVCodecContext *avctx)
         min_quant = ctx->profile_info->min_quant;
         max_quant = ctx->profile_info->max_quant;
         for (i = min_quant; i < MAX_STORED_Q; i++) {
-            for (j = 0; j < 64; j++)
+            for (j = 0; j < 64; j++) {
                 ctx->quants[i][j] = ctx->quant_mat[j] * i;
+                ctx->quants_chroma[i][j] = ctx->quant_chroma_mat[j] * i;
+            }
         }
 
         ctx->slice_q = av_malloc(ctx->slices_per_picture * sizeof(*ctx->slice_q));
@@ -1233,6 +1321,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
         }
     } else {
         int ls = 0;
+        int ls_chroma = 0;
 
         if (ctx->force_quant > 64) {
             av_log(avctx, AV_LOG_ERROR, "too large quantiser, maximum is 64\n");
@@ -1241,24 +1330,26 @@ static av_cold int encode_init(AVCodecContext *avctx)
 
         for (j = 0; j < 64; j++) {
             ctx->quants[0][j] = ctx->quant_mat[j] * ctx->force_quant;
+            ctx->quants_chroma[0][j] = ctx->quant_chroma_mat[j] * ctx->force_quant;
             ls += av_log2((1 << 11)  / ctx->quants[0][j]) * 2 + 1;
+            ls_chroma += av_log2((1 << 11)  / ctx->quants_chroma[0][j]) * 2 + 1;
         }
 
-        ctx->bits_per_mb = ls * 8;
+        ctx->bits_per_mb = ls * 4 + ls_chroma * 4;
         if (ctx->chroma_factor == CFACTOR_Y444)
-            ctx->bits_per_mb += ls * 4;
+            ctx->bits_per_mb += ls_chroma * 4;
     }
 
-    ctx->frame_size_upper_bound = ctx->pictures_per_frame *
-                                  ctx->slices_per_picture *
+    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
+                                   ctx->slices_per_picture + 1) *
                                   (2 + 2 * ctx->num_planes +
                                    (mps * ctx->bits_per_mb) / 8)
                                   + 200;
 
     if (ctx->alpha_bits) {
          // The alpha plane is run-coded and might exceed the bit budget.
-         ctx->frame_size_upper_bound += ctx->pictures_per_frame *
-                                        ctx->slices_per_picture *
+         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
+                                         ctx->slices_per_picture + 1) *
          /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
          /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
     }
@@ -1282,8 +1373,10 @@ static const AVOption options[] = {
     { "mbs_per_slice", "macroblocks per slice", OFFSET(mbs_per_slice),
         AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE },
     { "profile",       NULL, OFFSET(profile), AV_OPT_TYPE_INT,
-        { .i64 = PRORES_PROFILE_STANDARD },
-        PRORES_PROFILE_PROXY, PRORES_PROFILE_4444, VE, "profile" },
+        { .i64 = PRORES_PROFILE_AUTO },
+        PRORES_PROFILE_AUTO, PRORES_PROFILE_4444XQ, VE, "profile" },
+    { "auto",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO },
+        0, 0, VE, "profile" },
     { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY },
         0, 0, VE, "profile" },
     { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT },
@@ -1294,6 +1387,8 @@ static const AVOption options[] = {
         0, 0, VE, "profile" },
     { "4444",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444 },
         0, 0, VE, "profile" },
+    { "4444xq",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444XQ },
+        0, 0, VE, "profile" },
     { "vendor", "vendor ID", OFFSET(vendor),
         AV_OPT_TYPE_STRING, { .str = "Lavc" }, CHAR_MIN, CHAR_MAX, VE },
     { "bits_per_mb", "desired bits per macroblock", OFFSET(bits_per_mb),
@@ -1324,8 +1419,8 @@ static const AVClass proresenc_class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-AVCodec ff_prores_encoder = {
-    .name           = "prores",
+AVCodec ff_prores_ks_encoder = {
+    .name           = "prores_ks",
     .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PRORES,
@@ -1333,10 +1428,11 @@ AVCodec ff_prores_encoder = {
     .init           = encode_init,
     .close          = encode_close,
     .encode2        = encode_frame,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
                           AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
                           AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_NONE
                       },
     .priv_class     = &proresenc_class,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
 };
diff --git a/libavcodec/prosumer.c b/libavcodec/prosumer.c
new file mode 100644
index 0000000..ce3cbdb
--- /dev/null
+++ b/libavcodec/prosumer.c
@@ -0,0 +1,375 @@
+/*
+ * Brooktree ProSumer Video decoder
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/imgutils.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct ProSumerContext {
+    GetByteContext gb;
+    PutByteContext pb;
+
+    unsigned stride;
+    unsigned size;
+    uint32_t lut[0x2000];
+    uint8_t *initial_line;
+    uint8_t *decbuffer;
+} ProSumerContext;
+
+#define PAIR(high, low) (((uint64_t)(high) << 32) | low)
+
+static int decompress(GetByteContext *gb, int size, PutByteContext *pb, const uint32_t *lut)
+{
+    int pos, idx, cnt, fill;
+    uint32_t a, b, c;
+
+    bytestream2_skip(gb, 32);
+    cnt = 4;
+    a = bytestream2_get_le32(gb);
+    idx = a >> 20;
+    b = lut[2 * idx];
+
+    while (1) {
+        if (bytestream2_get_bytes_left_p(pb) <= 0 || bytestream2_get_eof(pb))
+            return 0;
+        if ((b & 0xFF00u) != 0x8000u || (b & 0xFFu)) {
+            if ((b & 0xFF00u) != 0x8000u) {
+                bytestream2_put_le16(pb, b);
+            } else {
+                idx = 0;
+                for (int i = 0; i < (b & 0xFFu); i++)
+                    bytestream2_put_le32(pb, 0);
+            }
+            c = b >> 16;
+            if (c & 0xFF00u) {
+                fill = lut[2 * idx + 1];
+                if ((c & 0xF000u) == 0x1000) {
+                    bytestream2_put_le16(pb, fill);
+                } else {
+                    bytestream2_put_le32(pb, fill);
+                }
+                c = (c >> 8) & 0x0Fu;
+            }
+            while (c) {
+                a <<= 4;
+                cnt--;
+                if (!cnt) {
+                    if (bytestream2_get_bytes_left(gb) <= 0) {
+                        if (!a)
+                            return 0;
+                    } else {
+                        pos = bytestream2_tell(gb);
+                        bytestream2_seek(gb, pos ^ 2, SEEK_SET);
+                        AV_WN16(&a, bytestream2_peek_le16(gb));
+                        bytestream2_seek(gb, pos + 2, SEEK_SET);
+                    }
+                    cnt = 4;
+                }
+                c--;
+            }
+            idx = a >> 20;
+            b = lut[2 * idx];
+            if (!b)
+                return AVERROR_INVALIDDATA;
+            continue;
+        }
+        idx = 2;
+        while (idx) {
+            a <<= 4;
+            cnt--;
+            if (cnt) {
+                idx--;
+                continue;
+            }
+            if (bytestream2_get_bytes_left(gb) <= 0) {
+                if (a) {
+                    cnt = 4;
+                    idx--;
+                    continue;
+                }
+                return 0;
+            }
+            pos = bytestream2_tell(gb);
+            bytestream2_seek(gb, pos ^ 2, SEEK_SET);
+            AV_WN16(&a, bytestream2_peek_le16(gb));
+            bytestream2_seek(gb, pos + 2, SEEK_SET);
+            cnt = 4;
+            idx--;
+        }
+        b = PAIR(4, a) >> 16;
+    }
+
+    return 0;
+}
+
+static void vertical_predict(uint32_t *dst, int offset, const uint32_t *src, int stride, int height)
+{
+    dst += offset >> 2;
+
+    for (int i = 0; i < height; i++) {
+        for (int j = 0; j < stride >> 2; j++) {
+            dst[j] = (((src[j] >> 3) + (0x3F3F3F3F & dst[j])) << 3) & 0xFCFCFCFC;
+        }
+
+        dst += stride >> 2;
+        src += stride >> 2;
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    ProSumerContext *s = avctx->priv_data;
+    AVFrame * const frame = data;
+    int ret;
+
+    if (avpkt->size <= 32)
+        return AVERROR_INVALIDDATA;
+
+    memset(s->decbuffer, 0, s->size);
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
+    bytestream2_init_writer(&s->pb, s->decbuffer, s->size);
+    ret = decompress(&s->gb, AV_RL32(avpkt->data + 28) >> 1, &s->pb, s->lut);
+    if (ret < 0)
+        return ret;
+    vertical_predict((uint32_t *)s->decbuffer, 0, (uint32_t *)s->initial_line, s->stride, 1);
+    vertical_predict((uint32_t *)s->decbuffer, s->stride, (uint32_t *)s->decbuffer, s->stride, avctx->height - 1);
+
+    ret = ff_get_buffer(avctx, frame, 0);
+    if (ret < 0)
+        return ret;
+
+    for (int i = avctx->height - 1; i >= 0 ; i--) {
+        uint8_t *y = &frame->data[0][i * frame->linesize[0]];
+        uint8_t *u = &frame->data[1][i * frame->linesize[1]];
+        uint8_t *v = &frame->data[2][i * frame->linesize[2]];
+        const uint8_t *src = s->decbuffer + (avctx->height - 1 - i) * s->stride;
+
+        for (int j = 0; j < avctx->width; j += 8) {
+            *(u++) = *src++;
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(u++) = *src++;
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+        }
+    }
+
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+#define TB(i) (1 + ((i) > 10) + ((i) > 49))
+static const uint16_t table[] = {
+    0x0000, 0x100, 0x0101, 0x200, 0x0202, 0x300, 0xFFFF, 0x400, 0xFEFE, 0x500,
+    0x0001, 0x700, 0x0100, 0x800, 0x00FF, 0x900, 0xFF00, 0xA00, 0x8001, 0x600,
+    0x8002, 0xB00, 0xFCFC, 0x010, 0x0404, 0x030, 0x0002, 0xD30, 0xFEFC, 0x020,
+    0xFCFE, 0x040, 0xFEFF, 0xD20, 0x0808, 0x060, 0xFFFE, 0x050, 0x0402, 0xC00,
+    0x0204, 0xC10, 0xF8F8, 0xC30, 0x0201, 0xC40, 0x0102, 0xC60, 0x0804, 0xF30,
+    0x0408, 0xE00, 0xF8FC, 0xE10, 0xFCF8, 0xC70, 0x00FE, 0xD00, 0xFE00, 0xD40,
+    0xFF01, 0xD50, 0x01FF, 0xD60, 0x0200, 0xD70, 0xFCFF, 0xE20, 0x0104, 0xE30,
+    0xF0F0, 0xE50, 0x0401, 0xE70, 0x02FE, 0xF00, 0xFE02, 0xF10, 0xFE01, 0xF20,
+    0x01FE, 0xF40, 0xFF02, 0xF50, 0x02FF, 0xF60, 0x8003, 0xC20, 0x8004, 0x070,
+    0x8005, 0xD10, 0x8006, 0xC50, 0x8007, 0xE60, 0x8008, 0xE40, 0x8009, 0xF70,
+    0xFC02, 0x080, 0xFE04, 0x081, 0xFC00, 0x082, 0x02FC, 0x083, 0x1010, 0x084,
+    0x00FC, 0x085, 0x0004, 0x086, 0x0400, 0x087, 0xFFFC, 0x088, 0x1008, 0x089,
+    0x0810, 0x08A, 0x0802, 0x08B, 0x0208, 0x08C, 0xFEF8, 0x08D, 0xFC01, 0x08E,
+    0x04FF, 0x08F, 0xF8FE, 0x090, 0xFC04, 0x091, 0x04FC, 0x092, 0xFF04, 0x093,
+    0x01FC, 0x094, 0xF0F8, 0x095, 0xF8F0, 0x096, 0x04FE, 0x097, 0xF0FC, 0x098,
+    0x0008, 0x099, 0x08FE, 0x09A, 0x01F8, 0x09B, 0x0800, 0x09C, 0x08FC, 0x09D,
+    0xFE08, 0x09E, 0xFC08, 0x09F, 0xF800, 0x0A0, 0x0108, 0x0A1, 0xF802, 0x0A2,
+    0x0801, 0x0A3, 0x00F8, 0x0A4, 0xF804, 0x0A5, 0xF8FF, 0x0A6, 0xFFF8, 0x0A7,
+    0x04F8, 0x0A8, 0x02F8, 0x0A9, 0x1004, 0x0AA, 0x08F8, 0x0AB, 0xF808, 0x0AC,
+    0x0410, 0x0AD, 0xFF08, 0x0AE, 0x08FF, 0x0AF, 0xFCF0, 0x0B0, 0xF801, 0x0B1,
+    0xE0F0, 0x0B2, 0xF3F3, 0x0B3, 0xF0E0, 0x0B4, 0xFAFA, 0x0B5, 0xF7F7, 0x0B6,
+    0xFEF0, 0x0B7, 0xF0FE, 0x0B8, 0xE9E9, 0x0B9, 0xF9F9, 0x0BA, 0x2020, 0x0BB,
+    0xE0E0, 0x0BC, 0x02F0, 0x0BD, 0x04F0, 0x0BE, 0x2010, 0x0BF, 0xECEC, 0x0C0,
+    0xEFEF, 0x0C1, 0x1020, 0x0C2, 0xF5F5, 0x0C3, 0xF4F4, 0x0C4, 0xEDED, 0x0C5,
+    0xEAEA, 0x0C6, 0xFBFB, 0x0C7, 0x1002, 0x0C8, 0xF2F2, 0x0C9, 0xF6F6, 0x0CA,
+    0xF1F1, 0x0CB, 0xFDFD, 0x0CC, 0x0210, 0x0CD, 0x10FF, 0x0CE, 0xFDFE, 0x0CF,
+    0x10F8, 0x0D0, 0x1000, 0x0D1, 0xF001, 0x0D2, 0x1001, 0x0D3, 0x0010, 0x0D4,
+    0x10FE, 0x0D5, 0xEBEB, 0x0D6, 0xFE10, 0x0D7, 0x0110, 0x0D8, 0xF000, 0x0D9,
+    0x08F0, 0x0DA, 0x01F0, 0x0DB, 0x0303, 0x0DC, 0x00F0, 0x0DD, 0xF002, 0x0DE,
+    0x10FC, 0x0DF, 0xFC10, 0x0E0, 0xF0FF, 0x0E1, 0xEEEE, 0x0E2, 0xF004, 0x0E3,
+    0xFFF0, 0x0E4, 0xF7F8, 0x0E5, 0xF3F2, 0x0E6, 0xF9FA, 0x0E7, 0x0820, 0x0E8,
+    0x0302, 0x0E9, 0xE0F8, 0x0EA, 0x0505, 0x0EB, 0x2008, 0x0EC, 0xE8E8, 0x0ED,
+    0x0403, 0x0EE, 0xFBFC, 0x0EF, 0xFCFD, 0x0F0, 0xFBFA, 0x0F1, 0x0203, 0x0F2,
+    0xFCFB, 0x0F3, 0x0304, 0x0F4, 0xF810, 0x0F5, 0xFF10, 0x0F6, 0xF008, 0x0F7,
+    0xFEFD, 0x0F8, 0xF7F6, 0x0F9, 0xF2F1, 0x0FA, 0xF3F4, 0x0FB, 0xEDEC, 0x0FC,
+    0xF4F1, 0x0FD, 0xF5F6, 0x0FE, 0xF0F1, 0x0FF, 0xF9F8, 0xC80, 0x10F0, 0xC81,
+    0xF2F3, 0xC82, 0xF7F9, 0xC83, 0xF6F5, 0xC84, 0xF0EF, 0xC85, 0xF4F5, 0xC86,
+    0xF6F7, 0xC87, 0xFAF9, 0xC88, 0x0405, 0xC89, 0xF8F9, 0xC8A, 0xFAFB, 0xC8B,
+    0xF1F0, 0xC8C, 0xF4F3, 0xC8D, 0xF1F2, 0xC8E, 0xF8E0, 0xC8F, 0xF8F7, 0xC90,
+    0xFDFC, 0xC91, 0xF8FA, 0xC92, 0xFAF6, 0xC93, 0xEEEF, 0xC94, 0xF5F7, 0xC95,
+    0xFDFB, 0xC96, 0xF4F6, 0xC97, 0xFCFA, 0xC98, 0xECED, 0xC99, 0xF0F3, 0xC9A,
+    0xF3F1, 0xC9B, 0xECEB, 0xC9C, 0xEDEE, 0xC9D, 0xF9F7, 0xC9E, 0x0420, 0xC9F,
+    0xEBEA, 0xCA0, 0xF0F4, 0xCA1, 0xF3F5, 0xCA2, 0xFAF7, 0xCA3, 0x0301, 0xCA4,
+    0xF3F7, 0xCA5, 0xF7F3, 0xCA6, 0xEFF0, 0xCA7, 0xF9F6, 0xCA8, 0xEFEE, 0xCA9,
+    0xF4F7, 0xCAA, 0x0504, 0xCAB, 0xF5F4, 0xCAC, 0xF1F3, 0xCAD, 0xEBEE, 0xCAE,
+    0xF2F5, 0xCAF, 0xF3EF, 0xCB0, 0xF5F1, 0xCB1, 0xF9F3, 0xCB2, 0xEDF0, 0xCB3,
+    0xEEF1, 0xCB4, 0xF6F9, 0xCB5, 0xF8FB, 0xCB6, 0xF010, 0xCB7, 0xF2F6, 0xCB8,
+    0xF4ED, 0xCB9, 0xF7FB, 0xCBA, 0xF8F3, 0xCBB, 0xEDEB, 0xCBC, 0xF0F2, 0xCBD,
+    0xF2F9, 0xCBE, 0xF8F1, 0xCBF, 0xFAFC, 0xCC0, 0xFBF8, 0xCC1, 0xF6F0, 0xCC2,
+    0xFAF8, 0xCC3, 0x0103, 0xCC4, 0xF3F6, 0xCC5, 0xF4F9, 0xCC6, 0xF7F2, 0xCC7,
+    0x2004, 0xCC8, 0xF2F0, 0xCC9, 0xF4F2, 0xCCA, 0xEEED, 0xCCB, 0xFCE0, 0xCCC,
+    0xEAE9, 0xCCD, 0xEAEB, 0xCCE, 0xF6F4, 0xCCF, 0xFFFD, 0xCD0, 0xE9EA, 0xCD1,
+    0xF1F4, 0xCD2, 0xF6EF, 0xCD3, 0xF6F8, 0xCD4, 0xF8F6, 0xCD5, 0xEFF2, 0xCD6,
+    0xEFF1, 0xCD7, 0xF7F1, 0xCD8, 0xFBFD, 0xCD9, 0xFEF6, 0xCDA, 0xFFF7, 0xCDB,
+    0x0605, 0xCDC, 0xF0F5, 0xCDD, 0xF0FA, 0xCDE, 0xF1F9, 0xCDF, 0xF2FC, 0xCE0,
+    0xF7EE, 0xCE1, 0xF7F5, 0xCE2, 0xF9FC, 0xCE3, 0xFAF5, 0xCE4, 0xFBF1, 0xCE5,
+    0xF1EF, 0xCE6, 0xF1FA, 0xCE7, 0xF4F8, 0xCE8, 0xF7F0, 0xCE9, 0xF7F4, 0xCEA,
+    0xF7FC, 0xCEB, 0xF9FB, 0xCEC, 0xFAF1, 0xCED, 0xFBF9, 0xCEE, 0xFDFF, 0xCEF,
+    0xE0FC, 0xCF0, 0xEBEC, 0xCF1, 0xEDEF, 0xCF2, 0xEFED, 0xCF3, 0xF1F6, 0xCF4,
+    0xF2F7, 0xCF5, 0xF3EE, 0xCF6, 0xF3F8, 0xCF7, 0xF5F2, 0xCF8, 0xF8F2, 0xCF9,
+    0xF9F1, 0xCFA, 0xF9F2, 0xCFB, 0xFBEF, 0xCFC, 0x00FD, 0xCFD, 0xECEE, 0xCFE,
+    0xF2EF, 0xCFF, 0xF2F8, 0xD80, 0xF5F0, 0xD81, 0xF6F2, 0xD82, 0xFCF7, 0xD83,
+    0xFCF9, 0xD84, 0x0506, 0xD85, 0xEEEC, 0xD86, 0xF0F6, 0xD87, 0xF2F4, 0xD88,
+    0xF6F1, 0xD89, 0xF8F5, 0xD8A, 0xF9F4, 0xD8B, 0xFBF7, 0xD8C, 0x0503, 0xD8D,
+    0xEFEC, 0xD8E, 0xF3F0, 0xD8F, 0xF4F0, 0xD90, 0xF5F3, 0xD91, 0xF6F3, 0xD92,
+    0xF7FA, 0xD93, 0x800A, 0xD94, 0x800B, 0xD95, 0x800C, 0xD96, 0x800D, 0xD97,
+    0x800E, 0xD98, 0x800F, 0xD99, 0x8010, 0xD9A, 0x8011, 0xD9B, 0x8012, 0xD9C,
+    0x8013, 0xD9D, 0x8014, 0xD9E, 0x8015, 0xD9F, 0x8016, 0xDA0, 0x8017, 0xDA1,
+    0x8018, 0xDA2, 0x8019, 0xDA3, 0x801A, 0xDA4, 0x801B, 0xDA5, 0x801C, 0xDA6,
+    0x801D, 0xDA7, 0x801E, 0xDA8, 0x801F, 0xDA9, 0x8020, 0xDAA, 0x8021, 0xDAB,
+    0x8022, 0xDAC, 0x8023, 0xDAD, 0x8024, 0xDAE, 0x8025, 0xDAF, 0x8026, 0xDB0,
+    0x8027, 0xDB1, 0x8028, 0xDB2, 0x8029, 0xDB3, 0x802A, 0xDB4, 0x802B, 0xDB5,
+    0x802C, 0xDB6, 0x802D, 0xDB7, 0x802E, 0xDB8, 0x802F, 0xDB9, 0x80FF, 0xDBA,
+};
+
+static void fill_elements(uint32_t idx, uint32_t shift, uint32_t *e0, uint32_t *e1)
+{
+    uint32_t b, h = idx << (32 - shift);
+
+    for (int j = 0; j < 2; j++) {
+        for (int i = 0; i < 43; i++) {
+            b = 4 * TB(i);
+            if (shift >= b && ((h & (0xFFF00000u << (12 - b))) >> 20) == table[2 * i + 1]) {
+                if (table[2 * i] >> 8 == 0x80u) {
+                    return;
+                } else {
+                    *e0 = (*e0 & 0xFFFFFFu) | (((12 + b - shift)  | (0x40u<<j)) << 22);
+                    if (j == 0) {
+                        *e1 = table[2 * i];
+                        shift -= b;
+                        h <<= b;
+                    } else {
+                        *e1 |= (unsigned)table[2 * i] << 16;
+                    }
+                    break;
+                }
+            }
+        }
+    }
+}
+
+static void fill_lut(uint32_t *lut)
+{
+    for (int i = 1; i < FF_ARRAY_ELEMS(table); i += 2) {
+        uint32_t a = table[i];
+        uint32_t b = TB(i>>1);
+        uint32_t c, d;
+
+        c = (b << 16) | table[i-1];
+        d = 4 * (3 - b);
+        if (d <= 0) {
+            lut[2 * a] = c;
+            lut[2 * a + 1] = 0;
+        } else {
+            for (int j = 0; j < 1 << d; j++) {
+                uint32_t f = 0xFFFFFFFFu;
+                c &= 0xFFFFFFu;
+                if ((c & 0xFF00u) != 0x8000u)
+                    fill_elements(j, d, &c, &f);
+                lut[2 * a + 2 * j] = c;
+                lut[2 * a + 2 * j + 1] = f;
+            }
+        }
+    }
+
+    for (int i = 0; i < 32; i += 2) {
+        lut[i  ] = 0x68000;
+        lut[i+1] = 0;
+    }
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    ProSumerContext *s = avctx->priv_data;
+
+    s->stride = 3LL * FFALIGN(avctx->width, 8) >> 1;
+    s->size = avctx->height * s->stride;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+
+    s->initial_line = av_malloc(s->stride);
+    s->decbuffer = av_malloc(s->size);
+    if (!s->initial_line || !s->decbuffer)
+        return AVERROR(ENOMEM);
+    memset(s->initial_line, 0x80u, s->stride);
+
+    fill_lut(s->lut);
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    ProSumerContext *s = avctx->priv_data;
+
+    av_freep(&s->initial_line);
+    av_freep(&s->decbuffer);
+
+    return 0;
+}
+
+AVCodec ff_prosumer_decoder = {
+    .name           = "prosumer",
+    .long_name      = NULL_IF_CONFIG_SMALL("Brooktree ProSumer Video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PROSUMER,
+    .priv_data_size = sizeof(ProSumerContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .close          = decode_close,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/psd.c b/libavcodec/psd.c
new file mode 100644
index 0000000..a31f738
--- /dev/null
+++ b/libavcodec/psd.c
@@ -0,0 +1,558 @@
+/*
+ * Photoshop (PSD) image decoder
+ * Copyright (c) 2016 Jokyo Images
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "bytestream.h"
+#include "internal.h"
+
+enum PsdCompr {
+    PSD_RAW,
+    PSD_RLE,
+    PSD_ZIP_WITHOUT_P,
+    PSD_ZIP_WITH_P,
+};
+
+enum PsdColorMode {
+    PSD_BITMAP,
+    PSD_GRAYSCALE,
+    PSD_INDEXED,
+    PSD_RGB,
+    PSD_CMYK,
+    PSD_MULTICHANNEL,
+    PSD_DUOTONE,
+    PSD_LAB,
+};
+
+typedef struct PSDContext {
+    AVClass *class;
+    AVFrame *picture;
+    AVCodecContext *avctx;
+    GetByteContext gb;
+
+    uint8_t * tmp;
+
+    uint16_t channel_count;
+    uint16_t channel_depth;
+
+    uint64_t uncompressed_size;
+    unsigned int pixel_size;/* 1 for 8 bits, 2 for 16 bits */
+    uint64_t line_size;/* length of src data (even width) */
+
+    int width;
+    int height;
+
+    enum PsdCompr compression;
+    enum PsdColorMode color_mode;
+
+    uint8_t palette[AVPALETTE_SIZE];
+} PSDContext;
+
+static int decode_header(PSDContext * s)
+{
+    int signature, version, color_mode;
+    int64_t len_section;
+    int ret = 0;
+
+    if (bytestream2_get_bytes_left(&s->gb) < 30) {/* File header section + color map data section length */
+        av_log(s->avctx, AV_LOG_ERROR, "Header too short to parse.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    signature = bytestream2_get_le32(&s->gb);
+    if (signature != MKTAG('8','B','P','S')) {
+        av_log(s->avctx, AV_LOG_ERROR, "Wrong signature %d.\n", signature);
+        return AVERROR_INVALIDDATA;
+    }
+
+    version = bytestream2_get_be16(&s->gb);
+    if (version != 1) {
+        av_log(s->avctx, AV_LOG_ERROR, "Wrong version %d.\n", version);
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_skip(&s->gb, 6);/* reserved */
+
+    s->channel_count = bytestream2_get_be16(&s->gb);
+    if ((s->channel_count < 1) || (s->channel_count > 56)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid channel count %d.\n", s->channel_count);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->height = bytestream2_get_be32(&s->gb);
+
+    if ((s->height > 30000) && (s->avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL)) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Height > 30000 is experimental, add "
+               "'-strict %d' if you want to try to decode the picture.\n",
+               FF_COMPLIANCE_EXPERIMENTAL);
+        return AVERROR_EXPERIMENTAL;
+    }
+
+    s->width = bytestream2_get_be32(&s->gb);
+    if ((s->width > 30000) && (s->avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL)) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Width > 30000 is experimental, add "
+               "'-strict %d' if you want to try to decode the picture.\n",
+               FF_COMPLIANCE_EXPERIMENTAL);
+        return AVERROR_EXPERIMENTAL;
+    }
+
+    if ((ret = ff_set_dimensions(s->avctx, s->width, s->height)) < 0)
+        return ret;
+
+    s->channel_depth = bytestream2_get_be16(&s->gb);
+
+    color_mode = bytestream2_get_be16(&s->gb);
+    switch (color_mode) {
+    case 0:
+        s->color_mode = PSD_BITMAP;
+        break;
+    case 1:
+        s->color_mode = PSD_GRAYSCALE;
+        break;
+    case 2:
+        s->color_mode = PSD_INDEXED;
+        break;
+    case 3:
+        s->color_mode = PSD_RGB;
+        break;
+    case 4:
+        s->color_mode = PSD_CMYK;
+        break;
+    case 7:
+        s->color_mode = PSD_MULTICHANNEL;
+        break;
+    case 8:
+        s->color_mode = PSD_DUOTONE;
+        break;
+    case 9:
+        s->color_mode = PSD_LAB;
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR, "Unknown color mode %d.\n", color_mode);
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* color map data */
+    len_section = bytestream2_get_be32(&s->gb);
+    if (len_section < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Negative size for color map data section.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (bytestream2_get_bytes_left(&s->gb) < (len_section + 4)) { /* section and len next section */
+        av_log(s->avctx, AV_LOG_ERROR, "Incomplete file.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (len_section) {
+        int i,j;
+        memset(s->palette, 0xff, AVPALETTE_SIZE);
+        for (j = HAVE_BIGENDIAN; j < 3 + HAVE_BIGENDIAN; j++)
+            for (i = 0; i < FFMIN(256, len_section / 3); i++)
+                s->palette[i * 4 + (HAVE_BIGENDIAN ? j : 2 - j)] = bytestream2_get_byteu(&s->gb);
+        len_section -= i * 3;
+    }
+    bytestream2_skip(&s->gb, len_section);
+
+    /* image ressources */
+    len_section = bytestream2_get_be32(&s->gb);
+    if (len_section < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Negative size for image ressources section.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (bytestream2_get_bytes_left(&s->gb) < (len_section + 4)) { /* section and len next section */
+        av_log(s->avctx, AV_LOG_ERROR, "Incomplete file.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    bytestream2_skip(&s->gb, len_section);
+
+    /* layers and masks */
+    len_section = bytestream2_get_be32(&s->gb);
+    if (len_section < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Negative size for layers and masks data section.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (bytestream2_get_bytes_left(&s->gb) < len_section) {
+        av_log(s->avctx, AV_LOG_ERROR, "Incomplete file.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    bytestream2_skip(&s->gb, len_section);
+
+    /* image section */
+    if (bytestream2_get_bytes_left(&s->gb) < 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "File without image data section.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->compression = bytestream2_get_be16(&s->gb);
+    switch (s->compression) {
+    case 0:
+    case 1:
+        break;
+    case 2:
+        avpriv_request_sample(s->avctx, "ZIP without predictor compression");
+        return AVERROR_PATCHWELCOME;
+        break;
+    case 3:
+        avpriv_request_sample(s->avctx, "ZIP with predictor compression");
+        return AVERROR_PATCHWELCOME;
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR, "Unknown compression %d.\n", s->compression);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return ret;
+}
+
+static int decode_rle(PSDContext * s){
+    unsigned int scanline_count;
+    unsigned int sl, count;
+    unsigned long target_index = 0;
+    unsigned int p;
+    int8_t rle_char;
+    unsigned int repeat_count;
+    uint8_t v;
+
+    scanline_count = s->height * s->channel_count;
+
+    /* scanline table */
+    if (bytestream2_get_bytes_left(&s->gb) < scanline_count * 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Not enough data for rle scanline table.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    bytestream2_skip(&s->gb, scanline_count * 2);/* size of each scanline */
+
+    /* decode rle data scanline by scanline */
+    for (sl = 0; sl < scanline_count; sl++) {
+        count = 0;
+
+        while (count < s->line_size) {
+            rle_char = bytestream2_get_byte(&s->gb);
+
+            if (rle_char <= 0) {/* byte repeat */
+                repeat_count = rle_char * -1;
+
+                if (bytestream2_get_bytes_left(&s->gb) < 1) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Not enough data for rle scanline.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                if (target_index + repeat_count >= s->uncompressed_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid rle char.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                v = bytestream2_get_byte(&s->gb);
+                for (p = 0; p <= repeat_count; p++) {
+                    s->tmp[target_index++] = v;
+                }
+                count += repeat_count + 1;
+            } else {
+                if (bytestream2_get_bytes_left(&s->gb) < rle_char) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Not enough data for rle scanline.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                if (target_index + rle_char >= s->uncompressed_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid rle char.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                for (p = 0; p <= rle_char; p++) {
+                    v = bytestream2_get_byte(&s->gb);
+                    s->tmp[target_index++] = v;
+                }
+                count += rle_char + 1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    int ret;
+    uint8_t *ptr;
+    const uint8_t *ptr_data;
+    int index_out, c, y, x, p;
+    uint8_t eq_channel[4] = {2,0,1,3};/* RGBA -> GBRA channel order */
+    uint8_t plane_number;
+
+    AVFrame *picture = data;
+
+    PSDContext *s = avctx->priv_data;
+    s->avctx     = avctx;
+    s->channel_count = 0;
+    s->channel_depth = 0;
+    s->tmp           = NULL;
+    s->line_size     = 0;
+
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
+
+    if ((ret = decode_header(s)) < 0)
+        return ret;
+
+    s->pixel_size = s->channel_depth >> 3;/* in byte */
+    s->line_size = s->width * s->pixel_size;
+
+    switch (s->color_mode) {
+    case PSD_BITMAP:
+        if (s->channel_depth != 1 || s->channel_count != 1) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                    "Invalid bitmap file (channel_depth %d, channel_count %d)\n",
+                    s->channel_depth, s->channel_count);
+            return AVERROR_INVALIDDATA;
+        }
+        s->line_size = s->width + 7 >> 3;
+        avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+        break;
+    case PSD_INDEXED:
+        if (s->channel_depth != 8 || s->channel_count != 1) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "Invalid indexed file (channel_depth %d, channel_count %d)\n",
+                   s->channel_depth, s->channel_count);
+            return AVERROR_INVALIDDATA;
+        }
+        avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        break;
+    case PSD_CMYK:
+        if (s->channel_count == 4) {
+            if (s->channel_depth == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            } else if (s->channel_depth == 16) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP16BE;
+            } else {
+                avpriv_report_missing_feature(avctx, "channel depth %d for cmyk", s->channel_depth);
+                return AVERROR_PATCHWELCOME;
+            }
+        } else if (s->channel_count == 5) {
+            if (s->channel_depth == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            } else if (s->channel_depth == 16) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP16BE;
+            } else {
+                avpriv_report_missing_feature(avctx, "channel depth %d for cmyk", s->channel_depth);
+                return AVERROR_PATCHWELCOME;
+            }
+        } else {
+            avpriv_report_missing_feature(avctx, "channel count %d for cmyk", s->channel_count);
+            return AVERROR_PATCHWELCOME;
+        }
+        break;
+    case PSD_RGB:
+        if (s->channel_count == 3) {
+            if (s->channel_depth == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            } else if (s->channel_depth == 16) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRP16BE;
+            } else {
+                avpriv_report_missing_feature(avctx, "channel depth %d for rgb", s->channel_depth);
+                return AVERROR_PATCHWELCOME;
+            }
+        } else if (s->channel_count == 4) {
+            if (s->channel_depth == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            } else if (s->channel_depth == 16) {
+                avctx->pix_fmt = AV_PIX_FMT_GBRAP16BE;
+            } else {
+                avpriv_report_missing_feature(avctx, "channel depth %d for rgb", s->channel_depth);
+                return AVERROR_PATCHWELCOME;
+            }
+        } else {
+            avpriv_report_missing_feature(avctx, "channel count %d for rgb", s->channel_count);
+            return AVERROR_PATCHWELCOME;
+        }
+        break;
+    case PSD_DUOTONE:
+        av_log(avctx, AV_LOG_WARNING, "ignoring unknown duotone specification.\n");
+    case PSD_GRAYSCALE:
+        if (s->channel_count == 1) {
+            if (s->channel_depth == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            } else if (s->channel_depth == 16) {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+            } else if (s->channel_depth == 32) {
+                avctx->pix_fmt = AV_PIX_FMT_GRAYF32BE;
+            } else {
+                avpriv_report_missing_feature(avctx, "channel depth %d for grayscale", s->channel_depth);
+                return AVERROR_PATCHWELCOME;
+            }
+        } else if (s->channel_count == 2) {
+            if (s->channel_depth == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_YA8;
+            } else if (s->channel_depth == 16) {
+                avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+            } else {
+                avpriv_report_missing_feature(avctx, "channel depth %d for grayscale", s->channel_depth);
+                return AVERROR_PATCHWELCOME;
+            }
+        } else {
+            avpriv_report_missing_feature(avctx, "channel count %d for grayscale", s->channel_count);
+            return AVERROR_PATCHWELCOME;
+        }
+        break;
+    default:
+        avpriv_report_missing_feature(avctx, "color mode %d", s->color_mode);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    s->uncompressed_size = s->line_size * s->height * s->channel_count;
+
+    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
+        return ret;
+
+    /* decode picture if need */
+    if (s->compression == PSD_RLE) {
+        s->tmp = av_malloc(s->uncompressed_size);
+        if (!s->tmp)
+            return AVERROR(ENOMEM);
+
+        ret = decode_rle(s);
+
+        if (ret < 0) {
+            av_freep(&s->tmp);
+            return ret;
+        }
+
+        ptr_data = s->tmp;
+    } else {
+        if (bytestream2_get_bytes_left(&s->gb) < s->uncompressed_size) {
+            av_log(s->avctx, AV_LOG_ERROR, "Not enough data for raw image data section.\n");
+            return AVERROR_INVALIDDATA;
+        }
+        ptr_data = s->gb.buffer;
+    }
+
+    /* Store data */
+    if ((avctx->pix_fmt == AV_PIX_FMT_YA8)||(avctx->pix_fmt == AV_PIX_FMT_YA16BE)){/* Interleaved */
+        ptr = picture->data[0];
+        for (c = 0; c < s->channel_count; c++) {
+            for (y = 0; y < s->height; y++) {
+                for (x = 0; x < s->width; x++) {
+                    index_out = y * picture->linesize[0] + x * s->channel_count * s->pixel_size + c * s->pixel_size;
+                    for (p = 0; p < s->pixel_size; p++) {
+                        ptr[index_out + p] = *ptr_data;
+                        ptr_data ++;
+                    }
+                }
+            }
+        }
+    } else if (s->color_mode == PSD_CMYK) {
+        uint8_t *dst[4] = { picture->data[0], picture->data[1], picture->data[2], picture->data[3] };
+        const uint8_t *src[5] = { ptr_data };
+        src[1] = src[0] + s->line_size * s->height;
+        src[2] = src[1] + s->line_size * s->height;
+        src[3] = src[2] + s->line_size * s->height;
+        src[4] = src[3] + s->line_size * s->height;
+        if (s->channel_depth == 8) {
+            for (y = 0; y < s->height; y++) {
+                for (x = 0; x < s->width; x++) {
+                    int k = src[3][x];
+                    int r = src[0][x] * k;
+                    int g = src[1][x] * k;
+                    int b = src[2][x] * k;
+                    dst[0][x] = g * 257 >> 16;
+                    dst[1][x] = b * 257 >> 16;
+                    dst[2][x] = r * 257 >> 16;
+                }
+                dst[0] += picture->linesize[0];
+                dst[1] += picture->linesize[1];
+                dst[2] += picture->linesize[2];
+                src[0] += s->line_size;
+                src[1] += s->line_size;
+                src[2] += s->line_size;
+                src[3] += s->line_size;
+            }
+            if (avctx->pix_fmt == AV_PIX_FMT_GBRAP) {
+                for (y = 0; y < s->height; y++) {
+                    memcpy(dst[3], src[4], s->line_size);
+                    src[4] += s->line_size;
+                    dst[3] += picture->linesize[3];
+                }
+            }
+        } else {
+            for (y = 0; y < s->height; y++) {
+                for (x = 0; x < s->width; x++) {
+                    int64_t k = AV_RB16(&src[3][x * 2]);
+                    int64_t r = AV_RB16(&src[0][x * 2]) * k;
+                    int64_t g = AV_RB16(&src[1][x * 2]) * k;
+                    int64_t b = AV_RB16(&src[2][x * 2]) * k;
+                    AV_WB16(&dst[0][x * 2], g * 65537 >> 32);
+                    AV_WB16(&dst[1][x * 2], b * 65537 >> 32);
+                    AV_WB16(&dst[2][x * 2], r * 65537 >> 32);
+                }
+                dst[0] += picture->linesize[0];
+                dst[1] += picture->linesize[1];
+                dst[2] += picture->linesize[2];
+                src[0] += s->line_size;
+                src[1] += s->line_size;
+                src[2] += s->line_size;
+                src[3] += s->line_size;
+            }
+            if (avctx->pix_fmt == AV_PIX_FMT_GBRAP16BE) {
+                for (y = 0; y < s->height; y++) {
+                    memcpy(dst[3], src[4], s->line_size);
+                    src[4] += s->line_size;
+                    dst[3] += picture->linesize[3];
+                }
+            }
+        }
+    } else {/* Planar */
+        if (s->channel_count == 1)/* gray 8 or gray 16be */
+            eq_channel[0] = 0;/* assign first channel, to first plane */
+
+        for (c = 0; c < s->channel_count; c++) {
+            plane_number = eq_channel[c];
+            ptr = picture->data[plane_number];/* get the right plane */
+            for (y = 0; y < s->height; y++) {
+                memcpy(ptr, ptr_data, s->line_size);
+                ptr += picture->linesize[plane_number];
+                ptr_data += s->line_size;
+            }
+        }
+    }
+
+    if (s->color_mode == PSD_INDEXED) {
+        picture->palette_has_changed = 1;
+        memcpy(picture->data[1], s->palette, AVPALETTE_SIZE);
+    }
+
+    av_freep(&s->tmp);
+
+    picture->pict_type = AV_PICTURE_TYPE_I;
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_psd_decoder = {
+    .name             = "psd",
+    .long_name        = NULL_IF_CONFIG_SMALL("Photoshop PSD file"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_PSD,
+    .priv_data_size   = sizeof(PSDContext),
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+};
diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c
index 5179ede..2b5f111 100644
--- a/libavcodec/psymodel.c
+++ b/libavcodec/psymodel.c
@@ -2,20 +2,20 @@
  * audio encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,10 +35,11 @@ av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens,
     int i, j, k = 0;
 
     ctx->avctx = avctx;
-    ctx->ch        = av_mallocz(sizeof(ctx->ch[0]) * avctx->channels * 2);
-    ctx->group     = av_mallocz(sizeof(ctx->group[0]) * num_groups);
-    ctx->bands     = av_malloc (sizeof(ctx->bands[0])     * num_lens);
-    ctx->num_bands = av_malloc (sizeof(ctx->num_bands[0]) * num_lens);
+    ctx->ch        = av_mallocz_array(sizeof(ctx->ch[0]), avctx->channels * 2);
+    ctx->group     = av_mallocz_array(sizeof(ctx->group[0]), num_groups);
+    ctx->bands     = av_malloc_array (sizeof(ctx->bands[0]),      num_lens);
+    ctx->num_bands = av_malloc_array (sizeof(ctx->num_bands[0]),  num_lens);
+    ctx->cutoff    = avctx->cutoff;
 
     if (!ctx->ch || !ctx->group || !ctx->bands || !ctx->num_bands) {
         ff_psy_end(ctx);
@@ -81,7 +82,7 @@ FFPsyChannelGroup *ff_psy_find_group(FFPsyContext *ctx, int channel)
 
 av_cold void ff_psy_end(FFPsyContext *ctx)
 {
-    if (ctx->model->end)
+    if (ctx->model && ctx->model->end)
         ctx->model->end(ctx);
     av_freep(&ctx->bands);
     av_freep(&ctx->num_bands);
@@ -94,6 +95,7 @@ typedef struct FFPsyPreprocessContext{
     float stereo_att;
     struct FFIIRFilterCoeffs *fcoeffs;
     struct FFIIRFilterState **fstate;
+    struct FFIIRFilterContext fiir;
 }FFPsyPreprocessContext;
 
 #define FILT_ORDER 4
@@ -108,22 +110,29 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
         return NULL;
     ctx->avctx = avctx;
 
-    if (avctx->cutoff > 0)
-        cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
-
-    if (cutoff_coeff)
-    ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
-                                             FF_FILTER_MODE_LOWPASS, FILT_ORDER,
-                                             cutoff_coeff, 0.0, 0.0);
-    if (ctx->fcoeffs) {
-        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
-        if (!ctx->fstate) {
-            av_free(ctx);
-            return NULL;
+    /* AAC has its own LP method */
+    if (avctx->codec_id != AV_CODEC_ID_AAC) {
+        if (avctx->cutoff > 0)
+            cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
+
+        if (cutoff_coeff && cutoff_coeff < 0.98)
+        ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
+                                                 FF_FILTER_MODE_LOWPASS, FILT_ORDER,
+                                                 cutoff_coeff, 0.0, 0.0);
+        if (ctx->fcoeffs) {
+            ctx->fstate = av_mallocz_array(sizeof(ctx->fstate[0]), avctx->channels);
+            if (!ctx->fstate) {
+                av_free(ctx->fcoeffs);
+                av_free(ctx);
+                return NULL;
+            }
+            for (i = 0; i < avctx->channels; i++)
+                ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
         }
-        for (i = 0; i < avctx->channels; i++)
-            ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
     }
+
+    ff_iir_filter_init(&ctx->fiir);
+
     return ctx;
 }
 
@@ -131,21 +140,22 @@ void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx, float **audio, int ch
 {
     int ch;
     int frame_size = ctx->avctx->frame_size;
+    FFIIRFilterContext *iir = &ctx->fiir;
 
     if (ctx->fstate) {
         for (ch = 0; ch < channels; ch++)
-            ff_iir_filter_flt(ctx->fcoeffs, ctx->fstate[ch], frame_size,
-                              &audio[ch][frame_size], 1, &audio[ch][frame_size], 1);
+            iir->filter_flt(ctx->fcoeffs, ctx->fstate[ch], frame_size,
+                            &audio[ch][frame_size], 1, &audio[ch][frame_size], 1);
     }
 }
 
 av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx)
 {
     int i;
-    ff_iir_filter_free_coeffs(ctx->fcoeffs);
+    ff_iir_filter_free_coeffsp(&ctx->fcoeffs);
     if (ctx->fstate)
         for (i = 0; i < ctx->avctx->channels; i++)
-            ff_iir_filter_free_state(ctx->fstate[i]);
+            ff_iir_filter_free_statep(&ctx->fstate[i]);
     av_freep(&ctx->fstate);
     av_free(ctx);
 }
diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h
index 1cc3066..e5f917d 100644
--- a/libavcodec/psymodel.h
+++ b/libavcodec/psymodel.h
@@ -2,20 +2,20 @@
  * audio encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,21 @@
 /** maximum number of channels */
 #define PSY_MAX_CHANS 20
 
+/* cutoff for VBR is purposely increased, since LP filtering actually
+ * hinders VBR performance rather than the opposite
+ */
+#define AAC_CUTOFF_FROM_BITRATE(bit_rate,channels,sample_rate) (bit_rate ? FFMIN3(FFMIN3( \
+    FFMAX(bit_rate/channels/5, bit_rate/channels*15/32 - 5500), \
+    3000 + bit_rate/channels/4, \
+    12000 + bit_rate/channels/16), \
+    22000, \
+    sample_rate / 2): (sample_rate / 2))
+#define AAC_CUTOFF(s) ( \
+    (s->flags & AV_CODEC_FLAG_QSCALE) \
+    ? s->sample_rate / 2 \
+    : AAC_CUTOFF_FROM_BITRATE(s->bit_rate, s->channels, s->sample_rate) \
+)
+
 /**
  * single band psychoacoustic information
  */
@@ -36,8 +51,7 @@ typedef struct FFPsyBand {
     int   bits;
     float energy;
     float threshold;
-    float distortion;
-    float perceptual_weight;
+    float spread;    /* Energy spread over the band */
 } FFPsyBand;
 
 /**
@@ -65,6 +79,7 @@ typedef struct FFPsyWindowInfo {
     int window_shape;                 ///< window shape (sine/KBD/whatever)
     int num_windows;                  ///< number of windows in a frame
     int grouping[8];                  ///< window grouping (for e.g. AAC)
+    float clipping[8];                ///< maximum absolute normalized intensity in the given window for clip avoidance
     int *window_sizes;                ///< sequence of window sizes inside one frame (for eg. WMA)
 } FFPsyWindowInfo;
 
@@ -78,6 +93,7 @@ typedef struct FFPsyContext {
     FFPsyChannel      *ch;            ///< single channel information
     FFPsyChannelGroup *group;         ///< channel group information
     int num_groups;                   ///< number of channel groups
+    int cutoff;                       ///< lowpass frequency cutoff for analysis
 
     uint8_t **bands;                  ///< scalefactor band sizes for possible frame sizes
     int     *num_bands;               ///< number of scalefactor bands for possible frame sizes
@@ -86,6 +102,7 @@ typedef struct FFPsyContext {
     struct {
         int size;                     ///< size of the bitresevoir in bits
         int bits;                     ///< number of bits used in the bitresevoir
+        int alloc;                    ///< number of bits allocated by the psy, or -1 if no allocation was done
     } bitres;
 
     void* model_priv_data;            ///< psychoacoustic model implementation private data
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index 3b3f3ad..5724715 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -6,20 +6,20 @@
  * to Michael Niedermayer <michaelni@gmx.at> for writing initial
  * implementation.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
index 338b008..36ac0ac 100644
--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,6 @@
 #include <stdatomic.h>
 #include <stdint.h>
 
-#if HAVE_PTHREADS
-#include <pthread.h>
-#elif HAVE_W32THREADS
-#include "compat/w32pthreads.h"
-#endif
-
 #include "avcodec.h"
 #include "hwaccel.h"
 #include "internal.h"
@@ -48,6 +42,8 @@
 #include "libavutil/internal.h"
 #include "libavutil/log.h"
 #include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/thread.h"
 
 enum {
     ///< Set when the thread is awaiting a packet.
@@ -59,6 +55,11 @@ enum {
      * State is returned to STATE_SETTING_UP afterwards.
      */
     STATE_GET_BUFFER,
+     /**
+      * Set when the codec calls get_format().
+      * State is returned to STATE_SETTING_UP afterwards.
+      */
+    STATE_GET_FORMAT,
     ///< Set after the codec has called ff_thread_finish_setup().
     STATE_SETUP_FINISHED,
 };
@@ -99,10 +100,15 @@ typedef struct PerThreadContext {
     AVFrame *requested_frame;       ///< AVFrame the codec passed to get_buffer()
     int      requested_flags;       ///< flags passed to get_buffer() for requested_frame
 
-    int die;                       ///< Set when the thread should exit.
+    const enum AVPixelFormat *available_formats; ///< Format array for get_format()
+    enum AVPixelFormat result_format;            ///< get_format() result
+
+    int die;                        ///< Set when the thread should exit.
 
     int hwaccel_serializing;
     int async_serializing;
+
+    atomic_int debug_threads;       ///< Set if the FF_DEBUG_THREADS option is set.
 } PerThreadContext;
 
 /**
@@ -119,6 +125,8 @@ typedef struct FrameThreadContext {
      */
     pthread_mutex_t hwaccel_mutex;
     pthread_mutex_t async_mutex;
+    pthread_cond_t async_cond;
+    int async_lock;
 
     int next_decoding;             ///< The next context to submit a packet to.
     int next_finished;             ///< The next context to return output from.
@@ -129,6 +137,27 @@ typedef struct FrameThreadContext {
                                     */
 } FrameThreadContext;
 
+#define THREAD_SAFE_CALLBACKS(avctx) \
+((avctx)->thread_safe_callbacks || (avctx)->get_buffer2 == avcodec_default_get_buffer2)
+
+static void async_lock(FrameThreadContext *fctx)
+{
+    pthread_mutex_lock(&fctx->async_mutex);
+    while (fctx->async_lock)
+        pthread_cond_wait(&fctx->async_cond, &fctx->async_mutex);
+    fctx->async_lock = 1;
+    pthread_mutex_unlock(&fctx->async_mutex);
+}
+
+static void async_unlock(FrameThreadContext *fctx)
+{
+    pthread_mutex_lock(&fctx->async_mutex);
+    av_assert0(fctx->async_lock);
+    fctx->async_lock = 0;
+    pthread_cond_broadcast(&fctx->async_cond);
+    pthread_mutex_unlock(&fctx->async_mutex);
+}
+
 /**
  * Codec worker thread.
  *
@@ -142,23 +171,15 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
     AVCodecContext *avctx = p->avctx;
     const AVCodec *codec = avctx->codec;
 
+    pthread_mutex_lock(&p->mutex);
     while (1) {
-        if (atomic_load(&p->state) == STATE_INPUT_READY) {
-            pthread_mutex_lock(&p->mutex);
-            while (atomic_load(&p->state) == STATE_INPUT_READY) {
-                if (p->die) {
-                    pthread_mutex_unlock(&p->mutex);
-                    goto die;
-                }
-                pthread_cond_wait(&p->input_cond, &p->mutex);
-            }
-            pthread_mutex_unlock(&p->mutex);
-        }
+        while (atomic_load(&p->state) == STATE_INPUT_READY && !p->die)
+            pthread_cond_wait(&p->input_cond, &p->mutex);
 
-        if (!codec->update_thread_context && avctx->thread_safe_callbacks)
-            ff_thread_finish_setup(avctx);
+        if (p->die) break;
 
-        pthread_mutex_lock(&p->mutex);
+        if (!codec->update_thread_context && THREAD_SAFE_CALLBACKS(avctx))
+            ff_thread_finish_setup(avctx);
 
         /* If a decoder supports hwaccel, then it must call ff_get_format().
          * Since that call must happen before ff_thread_finish_setup(), the
@@ -196,18 +217,19 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
 
         if (p->async_serializing) {
             p->async_serializing = 0;
-            pthread_mutex_unlock(&p->parent->async_mutex);
+
+            async_unlock(p->parent);
         }
 
+        pthread_mutex_lock(&p->progress_mutex);
+
         atomic_store(&p->state, STATE_INPUT_READY);
 
-        pthread_mutex_lock(&p->progress_mutex);
+        pthread_cond_broadcast(&p->progress_cond);
         pthread_cond_signal(&p->output_cond);
         pthread_mutex_unlock(&p->progress_mutex);
-
-        pthread_mutex_unlock(&p->mutex);
     }
-die:
+    pthread_mutex_unlock(&p->mutex);
 
     return NULL;
 }
@@ -218,12 +240,13 @@ die:
  * @param dst The destination context.
  * @param src The source context.
  * @param for_user 0 if the destination is a codec thread, 1 if the destination is the user's thread
+ * @return 0 on success, negative error code on failure
  */
 static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src, int for_user)
 {
     int err = 0;
 
-    if (dst != src) {
+    if (dst != src && (for_user || !(src->codec_descriptor->props & AV_CODEC_PROP_INTRA_ONLY))) {
         dst->time_base = src->time_base;
         dst->framerate = src->framerate;
         dst->width     = src->width;
@@ -254,6 +277,11 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
 
         dst->hwaccel = src->hwaccel;
         dst->hwaccel_context = src->hwaccel_context;
+
+        dst->channels       = src->channels;
+        dst->sample_rate    = src->sample_rate;
+        dst->sample_fmt     = src->sample_fmt;
+        dst->channel_layout = src->channel_layout;
         dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
 
         if (!!dst->hw_frames_ctx != !!src->hw_frames_ctx ||
@@ -271,6 +299,7 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
     }
 
     if (for_user) {
+        dst->delay       = src->thread_count - 1;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
         dst->coded_frame = src->coded_frame;
@@ -301,6 +330,7 @@ static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
 
     dst->opaque   = src->opaque;
     dst->debug    = src->debug;
+    dst->debug_mv = src->debug_mv;
 
     dst->slice_flags = src->slice_flags;
     dst->flags2      = src->flags2;
@@ -309,16 +339,14 @@ static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
 
     dst->frame_number     = src->frame_number;
     dst->reordered_opaque = src->reordered_opaque;
+    dst->thread_safe_callbacks = src->thread_safe_callbacks;
 
     if (src->slice_count && src->slice_offset) {
         if (dst->slice_count < src->slice_count) {
-            int *tmp = av_realloc(dst->slice_offset, src->slice_count *
-                                  sizeof(*dst->slice_offset));
-            if (!tmp) {
-                av_free(dst->slice_offset);
-                return AVERROR(ENOMEM);
-            }
-            dst->slice_offset = tmp;
+            int err = av_reallocp_array(&dst->slice_offset, src->slice_count,
+                                        sizeof(*dst->slice_offset));
+            if (err < 0)
+                return err;
         }
         memcpy(dst->slice_offset, src->slice_offset,
                src->slice_count * sizeof(*dst->slice_offset));
@@ -339,7 +367,8 @@ static void release_delayed_buffers(PerThreadContext *p)
         pthread_mutex_lock(&fctx->buffer_mutex);
 
         // fix extended data in case the caller screwed it up
-        av_assert0(p->avctx->codec_type == AVMEDIA_TYPE_VIDEO);
+        av_assert0(p->avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+                   p->avctx->codec_type == AVMEDIA_TYPE_AUDIO);
         f = &p->released_buffers[--p->num_released_buffers];
         f->extended_data = f->data;
         av_frame_unref(f);
@@ -348,17 +377,28 @@ static void release_delayed_buffers(PerThreadContext *p)
     }
 }
 
-static int submit_packet(PerThreadContext *p, AVPacket *avpkt)
+static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx,
+                         AVPacket *avpkt)
 {
     FrameThreadContext *fctx = p->parent;
     PerThreadContext *prev_thread = fctx->prev_thread;
     const AVCodec *codec = p->avctx->codec;
+    int ret;
 
     if (!avpkt->size && !(codec->capabilities & AV_CODEC_CAP_DELAY))
         return 0;
 
     pthread_mutex_lock(&p->mutex);
 
+    ret = update_context_from_user(p->avctx, user_avctx);
+    if (ret) {
+        pthread_mutex_unlock(&p->mutex);
+        return ret;
+    }
+    atomic_store_explicit(&p->debug_threads,
+                          (p->avctx->debug & FF_DEBUG_THREADS) != 0,
+                          memory_order_relaxed);
+
     release_delayed_buffers(p);
 
     if (prev_thread) {
@@ -378,7 +418,12 @@ static int submit_packet(PerThreadContext *p, AVPacket *avpkt)
     }
 
     av_packet_unref(&p->avpkt);
-    av_packet_ref(&p->avpkt, avpkt);
+    ret = av_packet_ref(&p->avpkt, avpkt);
+    if (ret < 0) {
+        pthread_mutex_unlock(&p->mutex);
+        av_log(p->avctx, AV_LOG_ERROR, "av_packet_ref() failed in submit_packet()\n");
+        return ret;
+    }
 
     atomic_store(&p->state, STATE_SETTING_UP);
     pthread_cond_signal(&p->input_cond);
@@ -390,16 +435,27 @@ static int submit_packet(PerThreadContext *p, AVPacket *avpkt)
      * and it calls back to the client here.
      */
 
-    if (!p->avctx->thread_safe_callbacks &&
-        p->avctx->get_buffer2 != avcodec_default_get_buffer2) {
-        while (atomic_load(&p->state) != STATE_SETUP_FINISHED &&
-               atomic_load(&p->state) != STATE_INPUT_READY) {
+    if (!p->avctx->thread_safe_callbacks && (
+         p->avctx->get_format != avcodec_default_get_format ||
+         p->avctx->get_buffer2 != avcodec_default_get_buffer2)) {
+        while (atomic_load(&p->state) != STATE_SETUP_FINISHED && atomic_load(&p->state) != STATE_INPUT_READY) {
+            int call_done = 1;
             pthread_mutex_lock(&p->progress_mutex);
             while (atomic_load(&p->state) == STATE_SETTING_UP)
                 pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
 
-            if (atomic_load_explicit(&p->state, memory_order_acquire) == STATE_GET_BUFFER) {
+            switch (atomic_load_explicit(&p->state, memory_order_acquire)) {
+            case STATE_GET_BUFFER:
                 p->result = ff_get_buffer(p->avctx, p->requested_frame, p->requested_flags);
+                break;
+            case STATE_GET_FORMAT:
+                p->result_format = ff_get_format(p->avctx, p->available_formats);
+                break;
+            default:
+                call_done = 0;
+                break;
+            }
+            if (call_done) {
                 atomic_store(&p->state, STATE_SETTING_UP);
                 pthread_cond_signal(&p->progress_cond);
             }
@@ -420,21 +476,18 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
     FrameThreadContext *fctx = avctx->internal->thread_ctx;
     int finished = fctx->next_finished;
     PerThreadContext *p;
-    int err, ret;
+    int err;
 
     /* release the async lock, permitting blocked hwaccel threads to
      * go forward while we are in this function */
-    pthread_mutex_unlock(&fctx->async_mutex);
+    async_unlock(fctx);
 
     /*
      * Submit a packet to the next decoding thread.
      */
 
     p = &fctx->threads[fctx->next_decoding];
-    err = update_context_from_user(p->avctx, avctx);
-    if (err)
-        goto finish;
-    err = submit_packet(p, avpkt);
+    err = submit_packet(p, avctx, avpkt);
     if (err)
         goto finish;
 
@@ -442,12 +495,13 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
      * If we're still receiving the initial packets, don't return a frame.
      */
 
-    if (fctx->delaying) {
-        if (fctx->next_decoding >= (avctx->thread_count-1)) fctx->delaying = 0;
+    if (fctx->next_decoding > (avctx->thread_count-1-(avctx->codec_id == AV_CODEC_ID_FFV1)))
+        fctx->delaying = 0;
 
+    if (fctx->delaying) {
         *got_picture_ptr=0;
         if (avpkt->size) {
-            ret = avpkt->size;
+            err = avpkt->size;
             goto finish;
         }
     }
@@ -455,8 +509,8 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
     /*
      * Return the next available frame from the oldest thread.
      * If we're at the end of the stream, then we have to skip threads that
-     * didn't output a frame, because we don't want to accidentally signal
-     * EOF (avpkt->size == 0 && *got_picture_ptr == 0).
+     * didn't output a frame/error, because we don't want to accidentally signal
+     * EOF (avpkt->size == 0 && *got_picture_ptr == 0 && err >= 0).
      */
 
     do {
@@ -472,17 +526,19 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
         av_frame_move_ref(picture, p->frame);
         *got_picture_ptr = p->got_frame;
         picture->pkt_dts = p->avpkt.dts;
+        err = p->result;
 
         /*
          * A later call with avkpt->size == 0 may loop over all threads,
-         * including this one, searching for a frame to return before being
+         * including this one, searching for a frame/error to return before being
          * stopped by the "finished != fctx->next_finished" condition.
-         * Make sure we don't mistakenly return the same frame again.
+         * Make sure we don't mistakenly return the same frame/error again.
          */
         p->got_frame = 0;
+        p->result = 0;
 
         if (finished >= avctx->thread_count) finished = 0;
-    } while (!avpkt->size && !*got_picture_ptr && finished != fctx->next_finished);
+    } while (!avpkt->size && !*got_picture_ptr && err >= 0 && finished != fctx->next_finished);
 
     update_context_from_thread(avctx, p->avctx, 1);
 
@@ -491,12 +547,11 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
     fctx->next_finished = finished;
 
     /* return the size of the consumed packet if no error occurred */
-    ret = (p->result >= 0) ? avpkt->size : p->result;
+    if (err >= 0)
+        err = avpkt->size;
 finish:
-    pthread_mutex_lock(&fctx->async_mutex);
-    if (err < 0)
-        return err;
-    return ret;
+    async_lock(fctx);
+    return err;
 }
 
 void ff_thread_report_progress(ThreadFrame *f, int n, int field)
@@ -508,10 +563,11 @@ void ff_thread_report_progress(ThreadFrame *f, int n, int field)
         atomic_load_explicit(&progress[field], memory_order_relaxed) >= n)
         return;
 
-    p = f->owner->internal->thread_ctx;
+    p = f->owner[field]->internal->thread_ctx;
 
-    if (f->owner->debug&FF_DEBUG_THREADS)
-        av_log(f->owner, AV_LOG_DEBUG, "%p finished %d field %d\n", progress, n, field);
+    if (atomic_load_explicit(&p->debug_threads, memory_order_relaxed))
+        av_log(f->owner[field], AV_LOG_DEBUG,
+               "%p finished %d field %d\n", progress, n, field);
 
     pthread_mutex_lock(&p->progress_mutex);
 
@@ -530,10 +586,11 @@ void ff_thread_await_progress(ThreadFrame *f, int n, int field)
         atomic_load_explicit(&progress[field], memory_order_acquire) >= n)
         return;
 
-    p = f->owner->internal->thread_ctx;
+    p = f->owner[field]->internal->thread_ctx;
 
-    if (f->owner->debug&FF_DEBUG_THREADS)
-        av_log(f->owner, AV_LOG_DEBUG, "thread awaiting %d field %d from %p\n", n, field, progress);
+    if (atomic_load_explicit(&p->debug_threads, memory_order_relaxed))
+        av_log(f->owner[field], AV_LOG_DEBUG,
+               "thread awaiting %d field %d from %p\n", n, field, progress);
 
     pthread_mutex_lock(&p->progress_mutex);
     while (atomic_load_explicit(&progress[field], memory_order_relaxed) < n)
@@ -555,10 +612,14 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
     if (avctx->hwaccel &&
         !(avctx->hwaccel->caps_internal & HWACCEL_CAP_ASYNC_SAFE)) {
         p->async_serializing = 1;
-        pthread_mutex_lock(&p->parent->async_mutex);
+
+        async_lock(p->parent);
     }
 
     pthread_mutex_lock(&p->progress_mutex);
+    if(atomic_load(&p->state) == STATE_SETUP_FINISHED){
+        av_log(avctx, AV_LOG_WARNING, "Multiple ff_thread_finish_setup() calls\n");
+    }
 
     atomic_store(&p->state, STATE_SETUP_FINISHED);
 
@@ -571,7 +632,7 @@ static void park_frame_worker_threads(FrameThreadContext *fctx, int thread_count
 {
     int i;
 
-    pthread_mutex_unlock(&fctx->async_mutex);
+    async_unlock(fctx);
 
     for (i = 0; i < thread_count; i++) {
         PerThreadContext *p = &fctx->threads[i];
@@ -582,9 +643,10 @@ static void park_frame_worker_threads(FrameThreadContext *fctx, int thread_count
                 pthread_cond_wait(&p->output_cond, &p->progress_mutex);
             pthread_mutex_unlock(&p->progress_mutex);
         }
+        p->got_frame = 0;
     }
 
-    pthread_mutex_lock(&fctx->async_mutex);
+    async_lock(fctx);
 }
 
 void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
@@ -596,7 +658,11 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
     park_frame_worker_threads(fctx, thread_count);
 
     if (fctx->prev_thread && fctx->prev_thread != fctx->threads)
-        update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0);
+        if (update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Final thread update failed\n");
+            fctx->prev_thread->avctx->internal->is_copy = fctx->threads->avctx->internal->is_copy;
+            fctx->threads->avctx->internal->is_copy = 1;
+        }
 
     for (i = 0; i < thread_count; i++) {
         PerThreadContext *p = &fctx->threads[i];
@@ -608,12 +674,11 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
 
         if (p->thread_init)
             pthread_join(p->thread, NULL);
+        p->thread_init=0;
 
-        if (codec->close)
+        if (codec->close && p->avctx)
             codec->close(p->avctx);
 
-        avctx->codec = NULL;
-
         release_delayed_buffers(p);
         av_frame_free(&p->frame);
     }
@@ -629,25 +694,30 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
         av_packet_unref(&p->avpkt);
         av_freep(&p->released_buffers);
 
-        if (i) {
+        if (i && p->avctx) {
             av_freep(&p->avctx->priv_data);
             av_freep(&p->avctx->slice_offset);
         }
 
-        av_buffer_unref(&p->avctx->hw_frames_ctx);
+        if (p->avctx) {
+            av_freep(&p->avctx->internal);
+            av_buffer_unref(&p->avctx->hw_frames_ctx);
+        }
 
-        av_freep(&p->avctx->internal);
         av_freep(&p->avctx);
     }
 
     av_freep(&fctx->threads);
     pthread_mutex_destroy(&fctx->buffer_mutex);
     pthread_mutex_destroy(&fctx->hwaccel_mutex);
-
-    pthread_mutex_unlock(&fctx->async_mutex);
     pthread_mutex_destroy(&fctx->async_mutex);
+    pthread_cond_destroy(&fctx->async_cond);
 
     av_freep(&avctx->internal->thread_ctx);
+
+    if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
+        av_opt_free(avctx->priv_data);
+    avctx->codec = NULL;
 }
 
 int ff_frame_thread_init(AVCodecContext *avctx)
@@ -660,7 +730,10 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
     if (!thread_count) {
         int nb_cpus = av_cpu_count();
-        av_log(avctx, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
+#if FF_API_DEBUG_MV
+        if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) || avctx->debug_mv)
+            nb_cpus = 1;
+#endif
         // use number of cores + 1 as thread count if there is more than one
         if (nb_cpus > 1)
             thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
@@ -677,7 +750,7 @@ int ff_frame_thread_init(AVCodecContext *avctx)
     if (!fctx)
         return AVERROR(ENOMEM);
 
-    fctx->threads = av_mallocz(sizeof(PerThreadContext) * thread_count);
+    fctx->threads = av_mallocz_array(thread_count, sizeof(PerThreadContext));
     if (!fctx->threads) {
         av_freep(&avctx->internal->thread_ctx);
         return AVERROR(ENOMEM);
@@ -685,10 +758,10 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
     pthread_mutex_init(&fctx->buffer_mutex, NULL);
     pthread_mutex_init(&fctx->hwaccel_mutex, NULL);
-
     pthread_mutex_init(&fctx->async_mutex, NULL);
-    pthread_mutex_lock(&fctx->async_mutex);
+    pthread_cond_init(&fctx->async_cond, NULL);
 
+    fctx->async_lock = 1;
     fctx->delaying = 1;
 
     for (i = 0; i < thread_count; i++) {
@@ -720,6 +793,7 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
         copy->internal = av_malloc(sizeof(AVCodecInternal));
         if (!copy->internal) {
+            copy->priv_data = NULL;
             err = AVERROR(ENOMEM);
             goto error;
         }
@@ -749,8 +823,12 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
         if (err) goto error;
 
-        if (!pthread_create(&p->thread, NULL, frame_worker_thread, p))
-            p->thread_init = 1;
+        atomic_init(&p->debug_threads, (copy->debug & FF_DEBUG_THREADS) != 0);
+
+        err = AVERROR(pthread_create(&p->thread, NULL, frame_worker_thread, p));
+        p->thread_init= !err;
+        if(!p->thread_init)
+            goto error;
     }
 
     return 0;
@@ -782,6 +860,7 @@ void ff_thread_flush(AVCodecContext *avctx)
         // Make sure decode flush calls with size=0 won't return old frames
         p->got_frame = 0;
         av_frame_unref(p->frame);
+        p->result = 0;
 
         release_delayed_buffers(p);
 
@@ -790,18 +869,28 @@ void ff_thread_flush(AVCodecContext *avctx)
     }
 }
 
-int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
+int ff_thread_can_start_frame(AVCodecContext *avctx)
+{
+    PerThreadContext *p = avctx->internal->thread_ctx;
+    if ((avctx->active_thread_type&FF_THREAD_FRAME) && atomic_load(&p->state) != STATE_SETTING_UP &&
+        (avctx->codec->update_thread_context || !THREAD_SAFE_CALLBACKS(avctx))) {
+        return 0;
+    }
+    return 1;
+}
+
+static int thread_get_buffer_internal(AVCodecContext *avctx, ThreadFrame *f, int flags)
 {
     PerThreadContext *p = avctx->internal->thread_ctx;
     int err;
 
-    f->owner = avctx;
+    f->owner[0] = f->owner[1] = avctx;
 
     if (!(avctx->active_thread_type & FF_THREAD_FRAME))
         return ff_get_buffer(avctx, f->f, flags);
 
     if (atomic_load(&p->state) != STATE_SETTING_UP &&
-        (avctx->codec->update_thread_context || !avctx->thread_safe_callbacks)) {
+        (avctx->codec->update_thread_context || !THREAD_SAFE_CALLBACKS(avctx))) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() cannot be called after ff_thread_finish_setup()\n");
         return -1;
     }
@@ -819,15 +908,14 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
     }
 
     pthread_mutex_lock(&p->parent->buffer_mutex);
-    if (avctx->thread_safe_callbacks ||
-        avctx->get_buffer2 == avcodec_default_get_buffer2) {
+    if (THREAD_SAFE_CALLBACKS(avctx)) {
         err = ff_get_buffer(avctx, f->f, flags);
     } else {
+        pthread_mutex_lock(&p->progress_mutex);
         p->requested_frame = f->f;
         p->requested_flags = flags;
         atomic_store_explicit(&p->state, STATE_GET_BUFFER, memory_order_release);
-        pthread_mutex_lock(&p->progress_mutex);
-        pthread_cond_signal(&p->progress_cond);
+        pthread_cond_broadcast(&p->progress_cond);
 
         while (atomic_load(&p->state) != STATE_SETTING_UP)
             pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
@@ -837,9 +925,8 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
         pthread_mutex_unlock(&p->progress_mutex);
 
     }
-    if (!avctx->thread_safe_callbacks && !avctx->codec->update_thread_context)
+    if (!THREAD_SAFE_CALLBACKS(avctx) && !avctx->codec->update_thread_context)
         ff_thread_finish_setup(avctx);
-
     if (err)
         av_buffer_unref(&f->progress);
 
@@ -848,14 +935,47 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
     return err;
 }
 
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
+{
+    enum AVPixelFormat res;
+    PerThreadContext *p = avctx->internal->thread_ctx;
+    if (!(avctx->active_thread_type & FF_THREAD_FRAME) || avctx->thread_safe_callbacks ||
+        avctx->get_format == avcodec_default_get_format)
+        return ff_get_format(avctx, fmt);
+    if (atomic_load(&p->state) != STATE_SETTING_UP) {
+        av_log(avctx, AV_LOG_ERROR, "get_format() cannot be called after ff_thread_finish_setup()\n");
+        return -1;
+    }
+    pthread_mutex_lock(&p->progress_mutex);
+    p->available_formats = fmt;
+    atomic_store(&p->state, STATE_GET_FORMAT);
+    pthread_cond_broadcast(&p->progress_cond);
+
+    while (atomic_load(&p->state) != STATE_SETTING_UP)
+        pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
+
+    res = p->result_format;
+
+    pthread_mutex_unlock(&p->progress_mutex);
+
+    return res;
+}
+
+int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
+{
+    int ret = thread_get_buffer_internal(avctx, f, flags);
+    if (ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "thread_get_buffer() failed\n");
+    return ret;
+}
+
 void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f)
 {
     PerThreadContext *p = avctx->internal->thread_ctx;
     FrameThreadContext *fctx;
     AVFrame *dst, *tmp;
     int can_direct_free = !(avctx->active_thread_type & FF_THREAD_FRAME) ||
-                          avctx->thread_safe_callbacks                   ||
-                          avctx->get_buffer2 == avcodec_default_get_buffer2;
+                          THREAD_SAFE_CALLBACKS(avctx);
 
     if (!f->f || !f->f->buf[0])
         return;
@@ -864,7 +984,7 @@ void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f)
         av_log(avctx, AV_LOG_DEBUG, "thread_release_buffer called on pic %p\n", f);
 
     av_buffer_unref(&f->progress);
-    f->owner    = NULL;
+    f->owner[0] = f->owner[1] = NULL;
 
     if (can_direct_free) {
         av_frame_unref(f->f);
diff --git a/libavcodec/pthread_internal.h b/libavcodec/pthread_internal.h
index fca9b10..d2115cb 100644
--- a/libavcodec/pthread_internal.h
+++ b/libavcodec/pthread_internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index d4af37b..77cfe3c 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,76 +24,54 @@
 
 #include "config.h"
 
-#if HAVE_PTHREADS
-#include <pthread.h>
-#elif HAVE_W32THREADS
-#include "compat/w32pthreads.h"
-#endif
-
 #include "avcodec.h"
 #include "internal.h"
 #include "pthread_internal.h"
 #include "thread.h"
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
+#include "libavutil/thread.h"
+#include "libavutil/slicethread.h"
 
 typedef int (action_func)(AVCodecContext *c, void *arg);
 typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr);
+typedef int (main_func)(AVCodecContext *c);
 
 typedef struct SliceThreadContext {
-    pthread_t *workers;
+    AVSliceThread *thread;
     action_func *func;
     action_func2 *func2;
+    main_func *mainfunc;
     void *args;
     int *rets;
-    int rets_count;
-    int job_count;
     int job_size;
 
-    pthread_cond_t last_job_cond;
-    pthread_cond_t current_job_cond;
-    pthread_mutex_t current_job_lock;
-    unsigned current_execute;
-    int current_job;
-    int done;
+    int *entries;
+    int entries_count;
+    int thread_count;
+    pthread_cond_t *progress_cond;
+    pthread_mutex_t *progress_mutex;
 } SliceThreadContext;
 
-static void* attribute_align_arg worker(void *v)
-{
-    AVCodecContext *avctx = v;
+static void main_function(void *priv) {
+    AVCodecContext *avctx = priv;
     SliceThreadContext *c = avctx->internal->thread_ctx;
-    unsigned last_execute = 0;
-    int our_job = c->job_count;
-    int thread_count = avctx->thread_count;
-    int self_id;
-
-    pthread_mutex_lock(&c->current_job_lock);
-    self_id = c->current_job++;
-    for (;;){
-        while (our_job >= c->job_count) {
-            if (c->current_job == thread_count + c->job_count)
-                pthread_cond_signal(&c->last_job_cond);
-
-            while (last_execute == c->current_execute && !c->done)
-                pthread_cond_wait(&c->current_job_cond, &c->current_job_lock);
-            last_execute = c->current_execute;
-            our_job = self_id;
-
-            if (c->done) {
-                pthread_mutex_unlock(&c->current_job_lock);
-                return NULL;
-            }
-        }
-        pthread_mutex_unlock(&c->current_job_lock);
+    c->mainfunc(avctx);
+}
 
-        c->rets[our_job%c->rets_count] = c->func ? c->func(avctx, (char*)c->args + our_job*c->job_size):
-                                                   c->func2(avctx, c->args, our_job, self_id);
+static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int nb_threads)
+{
+    AVCodecContext *avctx = priv;
+    SliceThreadContext *c = avctx->internal->thread_ctx;
+    int ret;
 
-        pthread_mutex_lock(&c->current_job_lock);
-        our_job = c->current_job++;
-    }
+    ret = c->func ? c->func(avctx, (char *)c->args + c->job_size * jobnr)
+                  : c->func2(avctx, c->args, jobnr, threadnr);
+    if (c->rets)
+        c->rets[jobnr] = ret;
 }
 
 void ff_slice_thread_free(AVCodecContext *avctx)
@@ -101,32 +79,22 @@ void ff_slice_thread_free(AVCodecContext *avctx)
     SliceThreadContext *c = avctx->internal->thread_ctx;
     int i;
 
-    pthread_mutex_lock(&c->current_job_lock);
-    c->done = 1;
-    pthread_cond_broadcast(&c->current_job_cond);
-    pthread_mutex_unlock(&c->current_job_lock);
+    avpriv_slicethread_free(&c->thread);
 
-    for (i=0; i<avctx->thread_count; i++)
-         pthread_join(c->workers[i], NULL);
+    for (i = 0; i < c->thread_count; i++) {
+        pthread_mutex_destroy(&c->progress_mutex[i]);
+        pthread_cond_destroy(&c->progress_cond[i]);
+    }
 
-    pthread_mutex_destroy(&c->current_job_lock);
-    pthread_cond_destroy(&c->current_job_cond);
-    pthread_cond_destroy(&c->last_job_cond);
-    av_free(c->workers);
+    av_freep(&c->entries);
+    av_freep(&c->progress_mutex);
+    av_freep(&c->progress_cond);
     av_freep(&avctx->internal->thread_ctx);
 }
 
-static av_always_inline void thread_park_workers(SliceThreadContext *c, int thread_count)
-{
-    while (c->current_job != thread_count + c->job_count)
-        pthread_cond_wait(&c->last_job_cond, &c->current_job_lock);
-    pthread_mutex_unlock(&c->current_job_lock);
-}
-
 static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size)
 {
     SliceThreadContext *c = avctx->internal->thread_ctx;
-    int dummy_ret;
 
     if (!(avctx->active_thread_type&FF_THREAD_SLICE) || avctx->thread_count <= 1)
         return avcodec_default_execute(avctx, func, arg, ret, job_count, job_size);
@@ -134,25 +102,12 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i
     if (job_count <= 0)
         return 0;
 
-    pthread_mutex_lock(&c->current_job_lock);
-
-    c->current_job = avctx->thread_count;
-    c->job_count = job_count;
     c->job_size = job_size;
     c->args = arg;
     c->func = func;
-    if (ret) {
-        c->rets = ret;
-        c->rets_count = job_count;
-    } else {
-        c->rets = &dummy_ret;
-        c->rets_count = 1;
-    }
-    c->current_execute++;
-    pthread_cond_broadcast(&c->current_job_cond);
-
-    thread_park_workers(c, avctx->thread_count);
+    c->rets = ret;
 
+    avpriv_slicethread_execute(c->thread, job_count, !!c->mainfunc  );
     return 0;
 }
 
@@ -163,15 +118,30 @@ static int thread_execute2(AVCodecContext *avctx, action_func2* func2, void *arg
     return thread_execute(avctx, NULL, arg, ret, job_count, 0);
 }
 
+int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx, action_func2* func2, main_func *mainfunc, void *arg, int *ret, int job_count)
+{
+    SliceThreadContext *c = avctx->internal->thread_ctx;
+    c->func2 = func2;
+    c->mainfunc = mainfunc;
+    return thread_execute(avctx, NULL, arg, ret, job_count, 0);
+}
+
 int ff_slice_thread_init(AVCodecContext *avctx)
 {
-    int i;
     SliceThreadContext *c;
     int thread_count = avctx->thread_count;
+    static void (*mainfunc)(void *);
+
+    // We cannot do this in the encoder init as the threads are created before
+    if (av_codec_is_encoder(avctx->codec) &&
+        avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
+        avctx->height > 2800)
+        thread_count = avctx->thread_count = 1;
 
     if (!thread_count) {
         int nb_cpus = av_cpu_count();
-        av_log(avctx, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
+        if  (avctx->height)
+            nb_cpus = FFMIN(nb_cpus, (avctx->height+15)/16);
         // use number of cores + 1 as thread count if there is more than one
         if (nb_cpus > 1)
             thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
@@ -184,37 +154,89 @@ int ff_slice_thread_init(AVCodecContext *avctx)
         return 0;
     }
 
-    c = av_mallocz(sizeof(SliceThreadContext));
-    if (!c)
-        return -1;
+    avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c));
+    mainfunc = avctx->codec->caps_internal & FF_CODEC_CAP_SLICE_THREAD_HAS_MF ? &main_function : NULL;
+    if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, mainfunc, thread_count)) <= 1) {
+        if (c)
+            avpriv_slicethread_free(&c->thread);
+        av_freep(&avctx->internal->thread_ctx);
+        avctx->thread_count = 1;
+        avctx->active_thread_type = 0;
+        return 0;
+    }
+    avctx->thread_count = thread_count;
 
-    c->workers = av_mallocz(sizeof(pthread_t)*thread_count);
-    if (!c->workers) {
-        av_free(c);
-        return -1;
+    avctx->execute = thread_execute;
+    avctx->execute2 = thread_execute2;
+    return 0;
+}
+
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n)
+{
+    SliceThreadContext *p = avctx->internal->thread_ctx;
+    int *entries = p->entries;
+
+    pthread_mutex_lock(&p->progress_mutex[thread]);
+    entries[field] +=n;
+    pthread_cond_signal(&p->progress_cond[thread]);
+    pthread_mutex_unlock(&p->progress_mutex[thread]);
+}
+
+void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift)
+{
+    SliceThreadContext *p  = avctx->internal->thread_ctx;
+    int *entries      = p->entries;
+
+    if (!entries || !field) return;
+
+    thread = thread ? thread - 1 : p->thread_count - 1;
+
+    pthread_mutex_lock(&p->progress_mutex[thread]);
+    while ((entries[field - 1] - entries[field]) < shift){
+        pthread_cond_wait(&p->progress_cond[thread], &p->progress_mutex[thread]);
     }
+    pthread_mutex_unlock(&p->progress_mutex[thread]);
+}
+
+int ff_alloc_entries(AVCodecContext *avctx, int count)
+{
+    int i;
 
-    avctx->internal->thread_ctx = c;
-    c->current_job = 0;
-    c->job_count = 0;
-    c->job_size = 0;
-    c->done = 0;
-    pthread_cond_init(&c->current_job_cond, NULL);
-    pthread_cond_init(&c->last_job_cond, NULL);
-    pthread_mutex_init(&c->current_job_lock, NULL);
-    pthread_mutex_lock(&c->current_job_lock);
-    for (i=0; i<thread_count; i++) {
-        if(pthread_create(&c->workers[i], NULL, worker, avctx)) {
-           avctx->thread_count = i;
-           pthread_mutex_unlock(&c->current_job_lock);
-           ff_thread_free(avctx);
-           return -1;
+    if (avctx->active_thread_type & FF_THREAD_SLICE)  {
+        SliceThreadContext *p = avctx->internal->thread_ctx;
+
+        if (p->entries) {
+            av_assert0(p->thread_count == avctx->thread_count);
+            av_freep(&p->entries);
         }
-    }
 
-    thread_park_workers(c, thread_count);
+        p->thread_count  = avctx->thread_count;
+        p->entries       = av_mallocz_array(count, sizeof(int));
+
+        if (!p->progress_mutex) {
+            p->progress_mutex = av_malloc_array(p->thread_count, sizeof(pthread_mutex_t));
+            p->progress_cond  = av_malloc_array(p->thread_count, sizeof(pthread_cond_t));
+        }
+
+        if (!p->entries || !p->progress_mutex || !p->progress_cond) {
+            av_freep(&p->entries);
+            av_freep(&p->progress_mutex);
+            av_freep(&p->progress_cond);
+            return AVERROR(ENOMEM);
+        }
+        p->entries_count  = count;
+
+        for (i = 0; i < p->thread_count; i++) {
+            pthread_mutex_init(&p->progress_mutex[i], NULL);
+            pthread_cond_init(&p->progress_cond[i], NULL);
+        }
+    }
 
-    avctx->execute = thread_execute;
-    avctx->execute2 = thread_execute2;
     return 0;
 }
+
+void ff_reset_entries(AVCodecContext *avctx)
+{
+    SliceThreadContext *p = avctx->internal->thread_ctx;
+    memset(p->entries, 0, p->entries_count * sizeof(int));
+}
diff --git a/libavcodec/ptx.c b/libavcodec/ptx.c
index 312850c..42147f4 100644
--- a/libavcodec/ptx.c
+++ b/libavcodec/ptx.c
@@ -2,20 +2,20 @@
  * V.Flash PTX (.ptx) image decoder
  * Copyright (c) 2007 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_PATCHWELCOME;
     }
 
-    avctx->pix_fmt = AV_PIX_FMT_RGB555;
+    avctx->pix_fmt = AV_PIX_FMT_BGR555LE;
 
     if (buf_end - buf < offset)
         return AVERROR_INVALIDDATA;
@@ -58,10 +58,8 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -69,13 +67,7 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     stride = p->linesize[0];
 
     for (y = 0; y < h && buf_end - buf >= w * bytes_per_pixel; y++) {
-#if HAVE_BIGENDIAN
-        unsigned int x;
-        for (x=0; x<w*bytes_per_pixel; x+=bytes_per_pixel)
-            AV_WN16(ptr+x, AV_RL16(buf+x));
-#else
         memcpy(ptr, buf, w*bytes_per_pixel);
-#endif
         ptr += stride;
         buf += w*bytes_per_pixel;
     }
diff --git a/libavcodec/put_bits.h b/libavcodec/put_bits.h
index 17666fa..1ceb1cc 100644
--- a/libavcodec/put_bits.h
+++ b/libavcodec/put_bits.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 
 #include <stdint.h>
 #include <stddef.h>
-#include <assert.h>
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/avassert.h"
 
 typedef struct PutBitContext {
     uint32_t bit_buf;
@@ -62,6 +62,24 @@ static inline void init_put_bits(PutBitContext *s, uint8_t *buffer,
 }
 
 /**
+ * Rebase the bit writer onto a reallocated buffer.
+ *
+ * @param buffer the buffer where to put bits
+ * @param buffer_size the size in bytes of buffer,
+ *                    must be larger than the previous size
+ */
+static inline void rebase_put_bits(PutBitContext *s, uint8_t *buffer,
+                                   int buffer_size)
+{
+    av_assert0(8*buffer_size > s->size_in_bits);
+
+    s->buf_end = buffer + buffer_size;
+    s->buf_ptr = buffer + (s->buf_ptr - s->buf);
+    s->buf     = buffer;
+    s->size_in_bits = 8 * buffer_size;
+}
+
+/**
  * @return the total number of bits written to the bitstream.
  */
 static inline int put_bits_count(PutBitContext *s)
@@ -87,7 +105,7 @@ static inline void flush_put_bits(PutBitContext *s)
         s->bit_buf <<= s->bit_left;
 #endif
     while (s->bit_left < 32) {
-        /* XXX: should test end of buffer */
+        av_assert0(s->buf_ptr < s->buf_end);
 #ifdef BITSTREAM_WRITER_LE
         *s->buf_ptr++ = s->bit_buf;
         s->bit_buf  >>= 8;
@@ -101,6 +119,18 @@ static inline void flush_put_bits(PutBitContext *s)
     s->bit_buf  = 0;
 }
 
+static inline void flush_put_bits_le(PutBitContext *s)
+{
+    while (s->bit_left < 32) {
+        av_assert0(s->buf_ptr < s->buf_end);
+        *s->buf_ptr++ = s->bit_buf;
+        s->bit_buf  >>= 8;
+        s->bit_left  += 8;
+    }
+    s->bit_left = 32;
+    s->bit_buf  = 0;
+}
+
 #ifdef BITSTREAM_WRITER_LE
 #define avpriv_align_put_bits align_put_bits_unsupported_here
 #define avpriv_put_string ff_put_string_unsupported_here
@@ -136,7 +166,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     unsigned int bit_buf;
     int bit_left;
 
-    assert(n <= 31 && value < (1U << n));
+    av_assert2(n <= 31 && value < (1U << n));
 
     bit_buf  = s->bit_buf;
     bit_left = s->bit_left;
@@ -145,9 +175,14 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 #ifdef BITSTREAM_WRITER_LE
     bit_buf |= value << (32 - bit_left);
     if (n >= bit_left) {
-        AV_WL32(s->buf_ptr, bit_buf);
-        s->buf_ptr += 4;
-        bit_buf     = (bit_left == 32) ? 0 : value >> bit_left;
+        if (3 < s->buf_end - s->buf_ptr) {
+            AV_WL32(s->buf_ptr, bit_buf);
+            s->buf_ptr += 4;
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
+        bit_buf     = value >> bit_left;
         bit_left   += 32;
     }
     bit_left -= n;
@@ -158,8 +193,13 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     } else {
         bit_buf   <<= bit_left;
         bit_buf    |= value >> (n - bit_left);
-        AV_WB32(s->buf_ptr, bit_buf);
-        s->buf_ptr += 4;
+        if (3 < s->buf_end - s->buf_ptr) {
+            AV_WB32(s->buf_ptr, bit_buf);
+            s->buf_ptr += 4;
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
         bit_left   += 32 - n;
         bit_buf     = value;
     }
@@ -169,11 +209,39 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     s->bit_left = bit_left;
 }
 
+static inline void put_bits_le(PutBitContext *s, int n, unsigned int value)
+{
+    unsigned int bit_buf;
+    int bit_left;
+
+    av_assert2(n <= 31 && value < (1U << n));
+
+    bit_buf  = s->bit_buf;
+    bit_left = s->bit_left;
+
+    bit_buf |= value << (32 - bit_left);
+    if (n >= bit_left) {
+        if (3 < s->buf_end - s->buf_ptr) {
+            AV_WL32(s->buf_ptr, bit_buf);
+            s->buf_ptr += 4;
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
+        bit_buf     = value >> bit_left;
+        bit_left   += 32;
+    }
+    bit_left -= n;
+
+    s->bit_buf  = bit_buf;
+    s->bit_left = bit_left;
+}
+
 static inline void put_sbits(PutBitContext *pb, int n, int32_t value)
 {
-    assert(n >= 0 && n <= 31);
+    av_assert2(n >= 0 && n <= 31);
 
-    put_bits(pb, n, value & ((1 << n) - 1));
+    put_bits(pb, n, av_mod_uintp2(value, n));
 }
 
 /**
@@ -181,15 +249,72 @@ static inline void put_sbits(PutBitContext *pb, int n, int32_t value)
  */
 static void av_unused put_bits32(PutBitContext *s, uint32_t value)
 {
-    int lo = value & 0xffff;
-    int hi = value >> 16;
+    unsigned int bit_buf;
+    int bit_left;
+
+    bit_buf  = s->bit_buf;
+    bit_left = s->bit_left;
+
 #ifdef BITSTREAM_WRITER_LE
-    put_bits(s, 16, lo);
-    put_bits(s, 16, hi);
+    bit_buf |= value << (32 - bit_left);
+    if (3 < s->buf_end - s->buf_ptr) {
+        AV_WL32(s->buf_ptr, bit_buf);
+        s->buf_ptr += 4;
+    } else {
+        av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+        av_assert2(0);
+    }
+    bit_buf     = (uint64_t)value >> bit_left;
 #else
-    put_bits(s, 16, hi);
-    put_bits(s, 16, lo);
+    bit_buf     = (uint64_t)bit_buf << bit_left;
+    bit_buf    |= value >> (32 - bit_left);
+    if (3 < s->buf_end - s->buf_ptr) {
+        AV_WB32(s->buf_ptr, bit_buf);
+        s->buf_ptr += 4;
+    } else {
+        av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+        av_assert2(0);
+    }
+    bit_buf     = value;
 #endif
+
+    s->bit_buf  = bit_buf;
+    s->bit_left = bit_left;
+}
+
+/**
+ * Write up to 64 bits into a bitstream.
+ */
+static inline void put_bits64(PutBitContext *s, int n, uint64_t value)
+{
+    av_assert2((n == 64) || (n < 64 && value < (UINT64_C(1) << n)));
+
+    if (n < 32)
+        put_bits(s, n, value);
+    else if (n == 32)
+        put_bits32(s, value);
+    else if (n < 64) {
+        uint32_t lo = value & 0xffffffff;
+        uint32_t hi = value >> 32;
+#ifdef BITSTREAM_WRITER_LE
+        put_bits32(s, lo);
+        put_bits(s, n - 32, hi);
+#else
+        put_bits(s, n - 32, hi);
+        put_bits32(s, lo);
+#endif
+    } else {
+        uint32_t lo = value & 0xffffffff;
+        uint32_t hi = value >> 32;
+#ifdef BITSTREAM_WRITER_LE
+        put_bits32(s, lo);
+        put_bits32(s, hi);
+#else
+        put_bits32(s, hi);
+        put_bits32(s, lo);
+#endif
+
+    }
 }
 
 /**
@@ -207,8 +332,9 @@ static inline uint8_t *put_bits_ptr(PutBitContext *s)
  */
 static inline void skip_put_bytes(PutBitContext *s, int n)
 {
-    assert((put_bits_count(s) & 7) == 0);
-    assert(s->bit_left == 32);
+    av_assert2((put_bits_count(s) & 7) == 0);
+    av_assert2(s->bit_left == 32);
+    av_assert0(n <= s->buf_end - s->buf_ptr);
     s->buf_ptr += n;
 }
 
@@ -231,7 +357,9 @@ static inline void skip_put_bits(PutBitContext *s, int n)
  */
 static inline void set_put_bits_buffer_size(PutBitContext *s, int size)
 {
+    av_assert0(size <= INT_MAX/8 - 32);
     s->buf_end = s->buf + size;
+    s->size_in_bits = 8*size;
 }
 
 #endif /* AVCODEC_PUT_BITS_H */
diff --git a/libavcodec/qcelpdata.h b/libavcodec/qcelpdata.h
index 319833e..931c990 100644
--- a/libavcodec/qcelpdata.h
+++ b/libavcodec/qcelpdata.h
@@ -2,20 +2,20 @@
  * QCELP decoder
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
  * @file
  * Data tables for the QCELP decoder
  * @author Reynaldo H. Verdejo Pinochet
- * @remark Libav merging spearheaded by Kenan Gillet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
  * @remark Development mentored by Benjamin Larson
  */
 
@@ -66,7 +66,7 @@ typedef struct QCELPFrame {
 } QCELPFrame;
 
 /**
- * pre-calculated table for hammsinc function
+ * Pre-calculated table for hammsinc function.
  * Only half of the table is needed because of symmetry.
  *
  * TIA/EIA/IS-733 2.4.5.2-2/3
@@ -82,7 +82,7 @@ typedef struct QCELPBitmap {
 #define QCELP_OF(variable, bit, len) {offsetof(QCELPFrame, variable), bit, len}
 
 /**
- * bitmap unpacking tables for RATE_FULL
+ * Bitmap unpacking tables for RATE_FULL
  *
  * TIA/EIA/IS-733 Table 2.4.7.1-1
  */
@@ -169,7 +169,7 @@ static const QCELPBitmap qcelp_rate_full_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_HALF
+ * Bitmap unpacking tables for RATE_HALF
  *
  * TIA/EIA/IS-733 Table 2.4.7.2-1
  */
@@ -211,7 +211,7 @@ static const QCELPBitmap qcelp_rate_half_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_QUARTER
+ * Bitmap unpacking tables for RATE_QUARTER
  *
  * TIA/EIA/IS-733 Table 2.4.7.3-1
  */
@@ -232,7 +232,7 @@ static const QCELPBitmap qcelp_rate_quarter_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_OCTAVE
+ * Bitmap unpacking tables for RATE_OCTAVE
  *
  * trick: CBSEED is written into QCELPContext.cbsign[15],
  * which is not used for RATE_OCTAVE.
@@ -257,12 +257,12 @@ static const QCELPBitmap qcelp_rate_octave_bitmap[] = {
     QCELP_OF(lspv   [8], 0, 1), //  8
     QCELP_OF(cbsign[15], 0, 1), //  7
     QCELP_OF(lspv   [9], 0, 1), //  6
-    QCELP_OF(cbgain [0], 0, 2), //  7
+    QCELP_OF(cbgain [0], 0, 2), //  5
     QCELP_OF(reserved,   0, 4)  //  3
 };
 
 /**
- * position of the bitmapping data for each packet type in
+ * Bitmapping data position for each packet type in
  * the QCELPContext
  */
 static const QCELPBitmap * const qcelp_unpacking_bitmaps_per_rate[5] = {
@@ -420,12 +420,12 @@ static const qcelp_vector * const qcelp_lspvq[5] = {
 };
 
 /**
- * the final gain scalefactor before clipping into a usable output float
+ * The final gain scalefactor before clipping into a usable output float
  */
 #define QCELP_SCALE 8192.
 
 /**
- * table for computing Ga (decoded linear codebook gain magnitude)
+ * Table for computing Ga (decoded linear codebook gain magnitude)
  *
  * @note The table could fit in int16_t in x*8 form, but it seems
  *       to be slower on x86
@@ -452,7 +452,7 @@ static const float qcelp_g12ga[61] = {
  1000.000/QCELP_SCALE};
 
 /**
- * circular codebook for rate 1 frames in x*100 form
+ * Circular codebook for rate 1 frames in x*100 form
  *
  * TIA/EIA/IS-733 2.4.6.1-2
  */
@@ -477,7 +477,7 @@ static const int16_t qcelp_rate_full_codebook[128] = {
 #define QCELP_RATE_FULL_CODEBOOK_RATIO .01
 
 /**
- * circular codebook for rate 1/2 frames in x*2 form
+ * Circular codebook for rate 1/2 frames in x*2 form
  *
  * TIA/EIA/IS-733 2.4.6.1-1
  */
@@ -511,7 +511,7 @@ static const int8_t qcelp_rate_half_codebook[128] = {
 #define QCELP_SQRT1887 1.373681186
 
 /**
- * table for impulse response of BPF used to filter
+ * Table for impulse response of BPF used to filter
  * the white excitation for bitrate 1/4 synthesis
  *
  * Only half the tables are needed because of symmetry.
@@ -526,14 +526,14 @@ static const double qcelp_rnd_fir_coefs[11] = {
 
 /**
  * This spread factor is used, for bitrate 1/8 and I_F_Q,
- * to force the LSP frequencies to be at least 80 Hz apart.
+ * to force LSP frequencies to be at least 80 Hz apart.
  *
  * TIA/EIA/IS-733 2.4.3.3.2
  */
 #define QCELP_LSP_SPREAD_FACTOR 0.02
 
 /**
- * predictor coefficient for the conversion of LSP codes
+ * Predictor coefficient for the conversion of LSP codes
  * to LSP frequencies for 1/8 and I_F_Q
  *
  * TIA/EIA/IS-733 2.4.3.2.7-2
@@ -541,7 +541,7 @@ static const double qcelp_rnd_fir_coefs[11] = {
 #define QCELP_LSP_OCTAVE_PREDICTOR 29.0/32
 
 /**
- * initial coefficient to perform bandwidth expansion on LPC
+ * Initial coefficient to perform bandwidth expansion on LPC
  *
  * @note: 0.9883 looks like an approximation of 253/256.
  *
diff --git a/libavcodec/qcelpdec.c b/libavcodec/qcelpdec.c
index 83be57a..b4afda2 100644
--- a/libavcodec/qcelpdec.c
+++ b/libavcodec/qcelpdec.c
@@ -2,20 +2,20 @@
  * QCELP decoder
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,27 +23,24 @@
  * @file
  * QCELP decoder
  * @author Reynaldo H. Verdejo Pinochet
- * @remark Libav merging spearheaded by Kenan Gillet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
  * @remark Development mentored by Benjamin Larson
  */
 
 #include <stddef.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "qcelpdata.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
 #include "lsp.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 typedef enum {
     I_F_Q = -1,    /**< insufficient frame quality */
     SILENCE,
@@ -54,7 +51,7 @@ typedef enum {
 } qcelp_packet_rate;
 
 typedef struct QCELPContext {
-    BitstreamContext  bc;
+    GetBitContext     gb;
     qcelp_packet_rate bitrate;
     QCELPFrame        frame;    /**< unpacked data frame */
 
@@ -136,7 +133,7 @@ static int decode_lspf(QCELPContext *q, float *lspf)
         } else {
             erasure_coeff = QCELP_LSP_OCTAVE_PREDICTOR;
 
-            assert(q->bitrate == I_F_Q);
+            av_assert2(q->bitrate == I_F_Q);
 
             if (q->erasure_count > 1)
                 erasure_coeff *= q->erasure_count < 4 ? 0.9 : 0.7;
@@ -240,7 +237,7 @@ static void decode_gain_and_index(QCELPContext *q, float *gain)
                     av_clip((q->prev_g1[0] + q->prev_g1[1]) / 2 - 5, 0, 54);
             subframes_count = 8;
         } else {
-            assert(q->bitrate == I_F_Q);
+            av_assert2(q->bitrate == I_F_Q);
 
             g1[0] = q->prev_g1[1];
             switch (q->erasure_count) {
@@ -322,7 +319,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
             tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
             cindex   = -q->frame.cindex[i];
             for (j = 0; j < 10; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cindex++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_full_codebook[cindex++ & 127];
         }
         break;
     case RATE_HALF:
@@ -330,7 +328,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
             tmp_gain = gain[i] * QCELP_RATE_HALF_CODEBOOK_RATIO;
             cindex   = -q->frame.cindex[i];
             for (j = 0; j < 40; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_half_codebook[cindex++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_half_codebook[cindex++ & 127];
         }
         break;
     case RATE_QUARTER:
@@ -375,7 +374,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
         for (i = 0; i < 4; i++) {
             tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
             for (j = 0; j < 40; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cbseed++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_full_codebook[cbseed++ & 127];
         }
         break;
     case SILENCE:
@@ -436,7 +436,8 @@ static const float *do_pitchfilter(float memory[303], const float v_in[160],
             for (v_len = v_in + 40; v_in < v_len; v_in++) {
                 if (pfrac[i]) { // If it is a fractional lag...
                     for (j = 0, *v_out = 0.0; j < 4; j++)
-                        *v_out += qcelp_hammsinc_table[j] * (v_lag[j - 4] + v_lag[3 - j]);
+                        *v_out += qcelp_hammsinc_table[j] *
+                                  (v_lag[j - 4] + v_lag[3 - j]);
                 } else
                     *v_out = *v_lag;
 
@@ -487,7 +488,7 @@ static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
                   else
                       max_pitch_gain = 0.0;
             } else {
-                assert(q->bitrate == SILENCE);
+                av_assert2(q->bitrate == SILENCE);
                 max_pitch_gain = 1.0;
             }
             for (i = 0; i < 4; i++)
@@ -512,7 +513,8 @@ static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
 
         apply_gain_ctrl(cdn_vector, v_synthesis_filtered, v_pre_filtered);
     } else {
-        memcpy(q->pitch_synthesis_filter_mem, cdn_vector + 17, 143 * sizeof(float));
+        memcpy(q->pitch_synthesis_filter_mem,
+               cdn_vector + 17, 143 * sizeof(float));
         memcpy(q->pitch_pre_filter_mem, cdn_vector + 17, 143 * sizeof(float));
         memset(q->pitch_gain, 0, sizeof(q->pitch_gain));
         memset(q->pitch_lag,  0, sizeof(q->pitch_lag));
@@ -631,7 +633,7 @@ static qcelp_packet_rate determine_bitrate(AVCodecContext *avctx,
         (*buf)++;
     } else if ((bitrate = buf_size2bitrate(buf_size + 1)) >= 0) {
         av_log(avctx, AV_LOG_WARNING,
-               "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+               "Bitrate byte missing, guessing bitrate from packet size.\n");
     } else
         return I_F_Q;
 
@@ -696,14 +698,12 @@ static int qcelp_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 160;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     outbuffer = (float *)frame->data[0];
 
     if ((q->bitrate = determine_bitrate(avctx, buf_size, &buf)) == I_F_Q) {
-        warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+        warn_insufficient_frame_quality(avctx, "Bitrate cannot be determined.");
         goto erasure;
     }
 
@@ -719,12 +719,13 @@ static int qcelp_decode_frame(AVCodecContext *avctx, void *data,
                                          qcelp_unpacking_bitmaps_lengths[q->bitrate];
         uint8_t *unpacked_data         = (uint8_t *)&q->frame;
 
-        bitstream_init8(&q->bc, buf, buf_size);
+        if ((ret = init_get_bits8(&q->gb, buf, buf_size)) < 0)
+            return ret;
 
         memset(&q->frame, 0, sizeof(QCELPFrame));
 
         for (; bitmaps < bitmaps_end; bitmaps++)
-            unpacked_data[bitmaps->index] |= bitstream_read(&q->bc, bitmaps->bitlen) << bitmaps->bitpos;
+            unpacked_data[bitmaps->index] |= get_bits(&q->gb, bitmaps->bitlen) << bitmaps->bitpos;
 
         // Check for erasures/blanks on rates 1, 1/4 and 1/8.
         if (q->frame.reserved) {
@@ -771,7 +772,8 @@ erasure:
     formant_mem = q->formant_mem + 10;
     for (i = 0; i < 4; i++) {
         interpolate_lpc(q, quantized_lspf, lpc, i);
-        ff_celp_lp_synthesis_filterf(formant_mem, lpc, outbuffer + i * 40, 40, 10);
+        ff_celp_lp_synthesis_filterf(formant_mem, lpc,
+                                     outbuffer + i * 40, 40, 10);
         formant_mem += 40;
     }
 
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index 668e513..88b6b19 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2005 Alex Beregszaszi
  * Copyright (c) 2005 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,17 +39,15 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
+#include "bytestream.h"
 #include "internal.h"
 #include "mpegaudio.h"
 #include "mpegaudiodsp.h"
 #include "rdft.h"
-#include "vlc.h"
 
-#include "qdm2data.h"
 #include "qdm2_tablegen.h"
 
-
 #define QDM2_LIST_ADD(list, size, packet) \
 do { \
       if (size > 0) { \
@@ -167,7 +165,7 @@ typedef struct QDM2Context {
     /// I/O data
     const uint8_t *compressed_data;
     int compressed_size;
-    float output_buffer[QDM2_MAX_FRAME_SIZE * 2];
+    float output_buffer[QDM2_MAX_FRAME_SIZE * MPA_MAX_CHANNELS * 2];
 
     /// Synthesis filter
     MPADSPContext mpadsp;
@@ -196,197 +194,42 @@ typedef struct QDM2Context {
     int noise_idx; ///< index for dithering noise table
 } QDM2Context;
 
-
-static VLC vlc_tab_level;
-static VLC vlc_tab_diff;
-static VLC vlc_tab_run;
-static VLC fft_level_exp_alt_vlc;
-static VLC fft_level_exp_vlc;
-static VLC fft_stereo_exp_vlc;
-static VLC fft_stereo_phase_vlc;
-static VLC vlc_tab_tone_level_idx_hi1;
-static VLC vlc_tab_tone_level_idx_mid;
-static VLC vlc_tab_tone_level_idx_hi2;
-static VLC vlc_tab_type30;
-static VLC vlc_tab_type34;
-static VLC vlc_tab_fft_tone_offset[5];
-
-static const uint16_t qdm2_vlc_offs[] = {
-    0,260,566,598,894,1166,1230,1294,1678,1950,2214,2278,2310,2570,2834,3124,3448,3838,
-};
-
 static const int switchtable[23] = {
     0, 5, 1, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 4
 };
 
-static av_cold void qdm2_init_vlc(void)
-{
-    static VLC_TYPE qdm2_table[3838][2];
-
-    vlc_tab_level.table           = &qdm2_table[qdm2_vlc_offs[0]];
-    vlc_tab_level.table_allocated = qdm2_vlc_offs[1] - qdm2_vlc_offs[0];
-    init_vlc(&vlc_tab_level, 8, 24,
-             vlc_tab_level_huffbits, 1, 1,
-             vlc_tab_level_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_diff.table           = &qdm2_table[qdm2_vlc_offs[1]];
-    vlc_tab_diff.table_allocated = qdm2_vlc_offs[2] - qdm2_vlc_offs[1];
-    init_vlc(&vlc_tab_diff, 8, 37,
-             vlc_tab_diff_huffbits, 1, 1,
-             vlc_tab_diff_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_run.table           = &qdm2_table[qdm2_vlc_offs[2]];
-    vlc_tab_run.table_allocated = qdm2_vlc_offs[3] - qdm2_vlc_offs[2];
-    init_vlc(&vlc_tab_run, 5, 6,
-             vlc_tab_run_huffbits, 1, 1,
-             vlc_tab_run_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_level_exp_alt_vlc.table           = &qdm2_table[qdm2_vlc_offs[3]];
-    fft_level_exp_alt_vlc.table_allocated = qdm2_vlc_offs[4] -
-                                            qdm2_vlc_offs[3];
-    init_vlc(&fft_level_exp_alt_vlc, 8, 28,
-             fft_level_exp_alt_huffbits, 1, 1,
-             fft_level_exp_alt_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_level_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[4]];
-    fft_level_exp_vlc.table_allocated = qdm2_vlc_offs[5] - qdm2_vlc_offs[4];
-    init_vlc(&fft_level_exp_vlc, 8, 20,
-             fft_level_exp_huffbits, 1, 1,
-             fft_level_exp_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_stereo_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[5]];
-    fft_stereo_exp_vlc.table_allocated = qdm2_vlc_offs[6] -
-                                         qdm2_vlc_offs[5];
-    init_vlc(&fft_stereo_exp_vlc, 6, 7,
-             fft_stereo_exp_huffbits, 1, 1,
-             fft_stereo_exp_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_stereo_phase_vlc.table           = &qdm2_table[qdm2_vlc_offs[6]];
-    fft_stereo_phase_vlc.table_allocated = qdm2_vlc_offs[7] -
-                                           qdm2_vlc_offs[6];
-    init_vlc(&fft_stereo_phase_vlc, 6, 9,
-             fft_stereo_phase_huffbits, 1, 1,
-             fft_stereo_phase_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_hi1.table =
-        &qdm2_table[qdm2_vlc_offs[7]];
-    vlc_tab_tone_level_idx_hi1.table_allocated = qdm2_vlc_offs[8] -
-                                                 qdm2_vlc_offs[7];
-    init_vlc(&vlc_tab_tone_level_idx_hi1, 8, 20,
-             vlc_tab_tone_level_idx_hi1_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_hi1_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_mid.table =
-        &qdm2_table[qdm2_vlc_offs[8]];
-    vlc_tab_tone_level_idx_mid.table_allocated = qdm2_vlc_offs[9] -
-                                                 qdm2_vlc_offs[8];
-    init_vlc(&vlc_tab_tone_level_idx_mid, 8, 24,
-             vlc_tab_tone_level_idx_mid_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_mid_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_hi2.table =
-        &qdm2_table[qdm2_vlc_offs[9]];
-    vlc_tab_tone_level_idx_hi2.table_allocated = qdm2_vlc_offs[10] -
-                                                 qdm2_vlc_offs[9];
-    init_vlc(&vlc_tab_tone_level_idx_hi2, 8, 24,
-             vlc_tab_tone_level_idx_hi2_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_hi2_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_type30.table           = &qdm2_table[qdm2_vlc_offs[10]];
-    vlc_tab_type30.table_allocated = qdm2_vlc_offs[11] - qdm2_vlc_offs[10];
-    init_vlc(&vlc_tab_type30, 6, 9,
-             vlc_tab_type30_huffbits, 1, 1,
-             vlc_tab_type30_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_type34.table           = &qdm2_table[qdm2_vlc_offs[11]];
-    vlc_tab_type34.table_allocated = qdm2_vlc_offs[12] - qdm2_vlc_offs[11];
-    init_vlc(&vlc_tab_type34, 5, 10,
-             vlc_tab_type34_huffbits, 1, 1,
-             vlc_tab_type34_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[0].table =
-        &qdm2_table[qdm2_vlc_offs[12]];
-    vlc_tab_fft_tone_offset[0].table_allocated = qdm2_vlc_offs[13] -
-                                                 qdm2_vlc_offs[12];
-    init_vlc(&vlc_tab_fft_tone_offset[0], 8, 23,
-             vlc_tab_fft_tone_offset_0_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_0_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[1].table =
-        &qdm2_table[qdm2_vlc_offs[13]];
-    vlc_tab_fft_tone_offset[1].table_allocated = qdm2_vlc_offs[14] -
-                                                 qdm2_vlc_offs[13];
-    init_vlc(&vlc_tab_fft_tone_offset[1], 8, 28,
-             vlc_tab_fft_tone_offset_1_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_1_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[2].table =
-        &qdm2_table[qdm2_vlc_offs[14]];
-    vlc_tab_fft_tone_offset[2].table_allocated = qdm2_vlc_offs[15] -
-                                                 qdm2_vlc_offs[14];
-    init_vlc(&vlc_tab_fft_tone_offset[2], 8, 32,
-             vlc_tab_fft_tone_offset_2_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_2_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[3].table =
-        &qdm2_table[qdm2_vlc_offs[15]];
-    vlc_tab_fft_tone_offset[3].table_allocated = qdm2_vlc_offs[16] -
-                                                 qdm2_vlc_offs[15];
-    init_vlc(&vlc_tab_fft_tone_offset[3], 8, 35,
-             vlc_tab_fft_tone_offset_3_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_3_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[4].table =
-        &qdm2_table[qdm2_vlc_offs[16]];
-    vlc_tab_fft_tone_offset[4].table_allocated = qdm2_vlc_offs[17] -
-                                                 qdm2_vlc_offs[16];
-    init_vlc(&vlc_tab_fft_tone_offset[4], 8, 38,
-             vlc_tab_fft_tone_offset_4_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_4_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-}
-
-static int qdm2_get_vlc(BitstreamContext *bc, VLC *vlc, int flag, int depth)
+static int qdm2_get_vlc(GetBitContext *gb, const VLC *vlc, int flag, int depth)
 {
     int value;
 
-    value = bitstream_read_vlc(bc, vlc->table, vlc->bits, depth);
+    value = get_vlc2(gb, vlc->table, vlc->bits, depth);
 
     /* stage-2, 3 bits exponent escape sequence */
     if (value-- == 0)
-        value = bitstream_read(bc, bitstream_read(bc, 3) + 1);
+        value = get_bits(gb, get_bits(gb, 3) + 1);
 
     /* stage-3, optional */
     if (flag) {
-        int tmp = vlc_stage3_values[value];
+        int tmp;
+
+        if (value >= 60) {
+            av_log(NULL, AV_LOG_ERROR, "value %d in qdm2_get_vlc too large\n", value);
+            return 0;
+        }
+
+        tmp= vlc_stage3_values[value];
 
         if ((value & ~3) > 0)
-            tmp += bitstream_read(bc, value >> 2);
+            tmp += get_bits(gb, (value >> 2));
         value = tmp;
     }
 
     return value;
 }
 
-static int qdm2_get_se_vlc(VLC *vlc, BitstreamContext *bc, int depth)
+static int qdm2_get_se_vlc(const VLC *vlc, GetBitContext *gb, int depth)
 {
-    int value = qdm2_get_vlc(bc, vlc, 0, depth);
+    int value = qdm2_get_vlc(gb, vlc, 0, depth);
 
     return (value & 1) ? ((value + 1) >> 1) : -(value >> 1);
 }
@@ -413,35 +256,35 @@ static uint16_t qdm2_packet_checksum(const uint8_t *data, int length, int value)
 /**
  * Fill a QDM2SubPacket structure with packet type, size, and data pointer.
  *
- * @param bc            bitreader context
+ * @param gb            bitreader context
  * @param sub_packet    packet under analysis
  */
-static void qdm2_decode_sub_packet_header(BitstreamContext *bc,
+static void qdm2_decode_sub_packet_header(GetBitContext *gb,
                                           QDM2SubPacket *sub_packet)
 {
-    sub_packet->type = bitstream_read(bc, 8);
+    sub_packet->type = get_bits(gb, 8);
 
     if (sub_packet->type == 0) {
         sub_packet->size = 0;
         sub_packet->data = NULL;
     } else {
-        sub_packet->size = bitstream_read(bc, 8);
+        sub_packet->size = get_bits(gb, 8);
 
         if (sub_packet->type & 0x80) {
             sub_packet->size <<= 8;
-            sub_packet->size  |= bitstream_read(bc, 8);
+            sub_packet->size  |= get_bits(gb, 8);
             sub_packet->type  &= 0x7f;
         }
 
         if (sub_packet->type == 0x7f)
-            sub_packet->type |= bitstream_read(bc, 8) << 8;
+            sub_packet->type |= (get_bits(gb, 8) << 8);
 
         // FIXME: this depends on bitreader-internal data
-        sub_packet->data = &bc->buffer[bitstream_tell(bc) / 8];
+        sub_packet->data = &gb->buffer[get_bits_count(gb) / 8];
     }
 
     av_log(NULL, AV_LOG_DEBUG, "Subpacket: type=%d size=%d start_offs=%x\n",
-           sub_packet->type, sub_packet->size, bitstream_tell(bc) / 8);
+           sub_packet->type, sub_packet->size, get_bits_count(gb) / 8);
 }
 
 /**
@@ -693,8 +536,9 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
 
     if (!superblocktype_2_3) {
         /* This case is untested, no samples available */
-        SAMPLES_NEEDED
-        for (ch = 0; ch < nb_channels; ch++)
+        avpriv_request_sample(NULL, "!superblocktype_2_3");
+        return;
+        for (ch = 0; ch < nb_channels; ch++) {
             for (sb = 0; sb < 30; sb++) {
                 for (j = 1; j < 63; j++) {  // The loop only iterates to 63 so the code doesn't overflow the buffer
                     add1 = tone_level_idx[ch][sb][j] - 10;
@@ -723,7 +567,7 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
                 }
                 tone_level_idx_temp[ch][sb][0] = tone_level_idx_temp[ch][sb][1];
             }
-
+        }
         acc = 0;
         for (ch = 0; ch < nb_channels; ch++)
             for (sb = 0; sb < 30; sb++)
@@ -800,12 +644,12 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
  * sb 8-sb_used.
  *
  * @param q         context
- * @param bc        bitreader context
+ * @param gb        bitreader context
  * @param length    packet length in bits
  * @param sb_min    lower subband processed (sb_min included)
  * @param sb_max    higher subband processed (sb_max excluded)
  */
-static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
+static int synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                        int length, int sb_min, int sb_max)
 {
     int sb, j, k, n, ch, run, channels;
@@ -813,14 +657,15 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
     int type34_first;
     float type34_div = 0;
     float type34_predictor;
-    float samples[10], sign_bits[16];
+    float samples[10];
+    int sign_bits[16] = {0};
 
     if (length == 0) {
         // If no data use noise
         for (sb=sb_min; sb < sb_max; sb++)
             build_sb_samples_from_noise(q, sb);
 
-        return;
+        return 0;
     }
 
     for (sb = sb_min; sb < sb_max; sb++) {
@@ -831,12 +676,12 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
         else if (sb >= 24)
             joined_stereo = 1;
         else
-            joined_stereo = (bitstream_bits_left(bc) >= 1) ? bitstream_read_bit(bc) : 0;
+            joined_stereo = (get_bits_left(gb) >= 1) ? get_bits1(gb) : 0;
 
         if (joined_stereo) {
-            if (bitstream_bits_left(bc) >= 16)
+            if (get_bits_left(gb) >= 16)
                 for (j = 0; j < 16; j++)
-                    sign_bits[j] = bitstream_read_bit(bc);
+                    sign_bits[j] = get_bits1(gb);
 
             for (j = 0; j < 64; j++)
                 if (q->coding_method[1][sb][j] > q->coding_method[0][sb][j])
@@ -844,6 +689,7 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
 
             if (fix_coding_method_array(sb, q->nb_channels,
                                             q->coding_method)) {
+                av_log(NULL, AV_LOG_ERROR, "coding method invalid\n");
                 build_sb_samples_from_noise(q, sb);
                 continue;
             }
@@ -852,22 +698,27 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
 
         for (ch = 0; ch < channels; ch++) {
             FIX_NOISE_IDX(q->noise_idx);
-            zero_encoding = (bitstream_bits_left(bc) >= 1) ? bitstream_read_bit(bc) : 0;
+            zero_encoding = (get_bits_left(gb) >= 1) ? get_bits1(gb) : 0;
             type34_predictor = 0.0;
             type34_first = 1;
 
             for (j = 0; j < 128; ) {
                 switch (q->coding_method[ch][sb][j / 2]) {
                     case 8:
-                        if (bitstream_bits_left(bc) >= 10) {
+                        if (get_bits_left(gb) >= 10) {
                             if (zero_encoding) {
                                 for (k = 0; k < 5; k++) {
                                     if ((j + 2 * k) >= 128)
                                         break;
-                                    samples[2 * k] = bitstream_read_bit(bc) ? dequant_1bit[joined_stereo][2 * bitstream_read_bit(bc)] : 0;
+                                    samples[2 * k] = get_bits1(gb) ? dequant_1bit[joined_stereo][2 * get_bits1(gb)] : 0;
                                 }
                             } else {
-                                n = bitstream_read(bc, 8);
+                                n = get_bits(gb, 8);
+                                if (n >= 243) {
+                                    av_log(NULL, AV_LOG_ERROR, "Invalid 8bit codeword\n");
+                                    return AVERROR_INVALIDDATA;
+                                }
+
                                 for (k = 0; k < 5; k++)
                                     samples[2 * k] = dequant_1bit[joined_stereo][random_dequant_index[n][k]];
                             }
@@ -881,10 +732,10 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
                         break;
 
                     case 10:
-                        if (bitstream_bits_left(bc) >= 1) {
+                        if (get_bits_left(gb) >= 1) {
                             float f = 0.81;
 
-                            if (bitstream_read_bit(bc))
+                            if (get_bits1(gb))
                                 f = -f;
                             f -= noise_samples[((sb + 1) * (j +5 * ch + 1)) & 127] * 9.0 / 40.0;
                             samples[0] = f;
@@ -895,15 +746,20 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
                         break;
 
                     case 16:
-                        if (bitstream_bits_left(bc) >= 10) {
+                        if (get_bits_left(gb) >= 10) {
                             if (zero_encoding) {
                                 for (k = 0; k < 5; k++) {
                                     if ((j + k) >= 128)
                                         break;
-                                    samples[k] = (bitstream_read_bit(bc) == 0) ? 0 : dequant_1bit[joined_stereo][2 * bitstream_read_bit(bc)];
+                                    samples[k] = (get_bits1(gb) == 0) ? 0 : dequant_1bit[joined_stereo][2 * get_bits1(gb)];
                                 }
                             } else {
-                                n = bitstream_read (bc, 8);
+                                n = get_bits (gb, 8);
+                                if (n >= 243) {
+                                    av_log(NULL, AV_LOG_ERROR, "Invalid 8bit codeword\n");
+                                    return AVERROR_INVALIDDATA;
+                                }
+
                                 for (k = 0; k < 5; k++)
                                     samples[k] = dequant_1bit[joined_stereo][random_dequant_index[n][k]];
                             }
@@ -915,8 +771,13 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
                         break;
 
                     case 24:
-                        if (bitstream_bits_left(bc) >= 7) {
-                            n = bitstream_read(bc, 7);
+                        if (get_bits_left(gb) >= 7) {
+                            n = get_bits(gb, 7);
+                            if (n >= 125) {
+                                av_log(NULL, AV_LOG_ERROR, "Invalid 7bit codeword\n");
+                                return AVERROR_INVALIDDATA;
+                            }
+
                             for (k = 0; k < 3; k++)
                                 samples[k] = (random_dequant_type24[n][k] - 2.0) * 0.5;
                         } else {
@@ -927,12 +788,13 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
                         break;
 
                     case 30:
-                        if (bitstream_bits_left(bc) >= 4) {
-                            unsigned index = qdm2_get_vlc(bc, &vlc_tab_type30, 0, 1);
-                            if (index < FF_ARRAY_ELEMS(type30_dequant)) {
-                                samples[0] = type30_dequant[index];
-                            } else
-                                samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
+                        if (get_bits_left(gb) >= 4) {
+                            unsigned index = qdm2_get_vlc(gb, &vlc_tab_type30, 0, 1);
+                            if (index >= FF_ARRAY_ELEMS(type30_dequant)) {
+                                av_log(NULL, AV_LOG_ERROR, "index %d out of type30_dequant array\n", index);
+                                return AVERROR_INVALIDDATA;
+                            }
+                            samples[0] = type30_dequant[index];
                         } else
                             samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
 
@@ -940,19 +802,20 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
                         break;
 
                     case 34:
-                        if (bitstream_bits_left(bc) >= 7) {
+                        if (get_bits_left(gb) >= 7) {
                             if (type34_first) {
-                                type34_div = (float)(1 << bitstream_read(bc, 2));
-                                samples[0] = ((float)bitstream_read(bc, 5) - 16.0) / 15.0;
+                                type34_div = (float)(1 << get_bits(gb, 2));
+                                samples[0] = ((float)get_bits(gb, 5) - 16.0) / 15.0;
                                 type34_predictor = samples[0];
                                 type34_first = 0;
                             } else {
-                                unsigned index = qdm2_get_vlc(bc, &vlc_tab_type34, 0, 1);
-                                if (index < FF_ARRAY_ELEMS(type34_delta)) {
-                                    samples[0] = type34_delta[index] / type34_div + type34_predictor;
-                                    type34_predictor = samples[0];
-                                } else
-                                    samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
+                                unsigned index = qdm2_get_vlc(gb, &vlc_tab_type34, 0, 1);
+                                if (index >= FF_ARRAY_ELEMS(type34_delta)) {
+                                    av_log(NULL, AV_LOG_ERROR, "index %d out of type34_delta array\n", index);
+                                    return AVERROR_INVALIDDATA;
+                                }
+                                samples[0] = type34_delta[index] / type34_div + type34_predictor;
+                                type34_predictor = samples[0];
                             }
                         } else {
                             samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
@@ -989,6 +852,7 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
             } // j loop
         } // channel loop
     } // subband loop
+    return 0;
 }
 
 /**
@@ -999,27 +863,30 @@ static void synthfilt_build_sb_samples(QDM2Context *q, BitstreamContext *bc,
  * same VLC tables as process_subpacket_9 are used.
  *
  * @param quantized_coeffs    pointer to quantized_coeffs[ch][0]
- * @param bc        bitreader context
+ * @param gb        bitreader context
  */
-static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
-                                        BitstreamContext *bc)
+static int init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
+                                        GetBitContext *gb)
 {
     int i, k, run, level, diff;
 
-    if (bitstream_bits_left(bc) < 16)
-        return;
-    level = qdm2_get_vlc(bc, &vlc_tab_level, 0, 2);
+    if (get_bits_left(gb) < 16)
+        return -1;
+    level = qdm2_get_vlc(gb, &vlc_tab_level, 0, 2);
 
     quantized_coeffs[0] = level;
 
     for (i = 0; i < 7; ) {
-        if (bitstream_bits_left(bc) < 16)
-            break;
-        run = qdm2_get_vlc(bc, &vlc_tab_run, 0, 1) + 1;
+        if (get_bits_left(gb) < 16)
+            return -1;
+        run = qdm2_get_vlc(gb, &vlc_tab_run, 0, 1) + 1;
 
-        if (bitstream_bits_left(bc) < 16)
-            break;
-        diff = qdm2_get_se_vlc(&vlc_tab_diff, bc, 2);
+        if (i + run >= 8)
+            return -1;
+
+        if (get_bits_left(gb) < 16)
+            return -1;
+        diff = qdm2_get_se_vlc(&vlc_tab_diff, gb, 2);
 
         for (k = 1; k <= run; k++)
             quantized_coeffs[i + k] = (level + ((k * diff) / run));
@@ -1027,6 +894,7 @@ static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
         level += diff;
         i += run;
     }
+    return 0;
 }
 
 /**
@@ -1036,16 +904,16 @@ static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
  * data from packet 10
  *
  * @param q         context
- * @param bc        bitreader context
+ * @param gb        bitreader context
  */
-static void init_tone_level_dequantization(QDM2Context *q, BitstreamContext *bc)
+static void init_tone_level_dequantization(QDM2Context *q, GetBitContext *gb)
 {
     int sb, j, k, n, ch;
 
     for (ch = 0; ch < q->nb_channels; ch++) {
-        init_quantized_coeffs_elem0(q->quantized_coeffs[ch][0], bc);
+        init_quantized_coeffs_elem0(q->quantized_coeffs[ch][0], gb);
 
-        if (bitstream_bits_left(bc) < 16) {
+        if (get_bits_left(gb) < 16) {
             memset(q->quantized_coeffs[ch][0], 0, 8);
             break;
         }
@@ -1056,13 +924,13 @@ static void init_tone_level_dequantization(QDM2Context *q, BitstreamContext *bc)
     for (sb = 0; sb < n; sb++)
         for (ch = 0; ch < q->nb_channels; ch++)
             for (j = 0; j < 8; j++) {
-                if (bitstream_bits_left(bc) < 1)
+                if (get_bits_left(gb) < 1)
                     break;
-                if (bitstream_read_bit(bc)) {
+                if (get_bits1(gb)) {
                     for (k=0; k < 8; k++) {
-                        if (bitstream_bits_left(bc) < 16)
+                        if (get_bits_left(gb) < 16)
                             break;
-                        q->tone_level_idx_hi1[ch][sb][j][k] = qdm2_get_vlc(bc, &vlc_tab_tone_level_idx_hi1, 0, 2);
+                        q->tone_level_idx_hi1[ch][sb][j][k] = qdm2_get_vlc(gb, &vlc_tab_tone_level_idx_hi1, 0, 2);
                     }
                 } else {
                     for (k=0; k < 8; k++)
@@ -1074,9 +942,9 @@ static void init_tone_level_dequantization(QDM2Context *q, BitstreamContext *bc)
 
     for (sb = 0; sb < n; sb++)
         for (ch = 0; ch < q->nb_channels; ch++) {
-            if (bitstream_bits_left(bc) < 16)
+            if (get_bits_left(gb) < 16)
                 break;
-            q->tone_level_idx_hi2[ch][sb] = qdm2_get_vlc(bc, &vlc_tab_tone_level_idx_hi2, 0, 2);
+            q->tone_level_idx_hi2[ch][sb] = qdm2_get_vlc(gb, &vlc_tab_tone_level_idx_hi2, 0, 2);
             if (sb > 19)
                 q->tone_level_idx_hi2[ch][sb] -= 16;
             else
@@ -1089,9 +957,9 @@ static void init_tone_level_dequantization(QDM2Context *q, BitstreamContext *bc)
     for (sb = 0; sb < n; sb++)
         for (ch = 0; ch < q->nb_channels; ch++)
             for (j = 0; j < 8; j++) {
-                if (bitstream_bits_left(bc) < 16)
+                if (get_bits_left(gb) < 16)
                     break;
-                q->tone_level_idx_mid[ch][sb][j] = qdm2_get_vlc(bc, &vlc_tab_tone_level_idx_mid, 0, 2) - 32;
+                q->tone_level_idx_mid[ch][sb][j] = qdm2_get_vlc(gb, &vlc_tab_tone_level_idx_mid, 0, 2) - 32;
             }
 }
 
@@ -1101,23 +969,26 @@ static void init_tone_level_dequantization(QDM2Context *q, BitstreamContext *bc)
  * @param q       context
  * @param node    pointer to node with packet
  */
-static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
+static int process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int i, j, k, n, ch, run, level, diff;
 
-    bitstream_init8(&bc, node->packet->data, node->packet->size);
+    init_get_bits(&gb, node->packet->data, node->packet->size * 8);
 
     n = coeff_per_sb_for_avg[q->coeff_per_sb_select][QDM2_SB_USED(q->sub_sampling) - 1] + 1;
 
     for (i = 1; i < n; i++)
         for (ch = 0; ch < q->nb_channels; ch++) {
-            level = qdm2_get_vlc(&bc, &vlc_tab_level, 0, 2);
+            level = qdm2_get_vlc(&gb, &vlc_tab_level, 0, 2);
             q->quantized_coeffs[ch][i][0] = level;
 
             for (j = 0; j < (8 - 1); ) {
-                run  = qdm2_get_vlc(&bc, &vlc_tab_run, 0, 1) + 1;
-                diff = qdm2_get_se_vlc(&vlc_tab_diff, &bc, 2);
+                run  = qdm2_get_vlc(&gb, &vlc_tab_run, 0, 1) + 1;
+                diff = qdm2_get_se_vlc(&vlc_tab_diff, &gb, 2);
+
+                if (j + run >= 8)
+                    return -1;
 
                 for (k = 1; k <= run; k++)
                     q->quantized_coeffs[ch][i][j + k] = (level + ((k * diff) / run));
@@ -1130,6 +1001,8 @@ static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
     for (ch = 0; ch < q->nb_channels; ch++)
         for (i = 0; i < 8; i++)
             q->quantized_coeffs[ch][0][i] = 0;
+
+    return 0;
 }
 
 /**
@@ -1140,11 +1013,11 @@ static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
  */
 static void process_subpacket_10(QDM2Context *q, QDM2SubPNode *node)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
 
     if (node) {
-        bitstream_init8(&bc, node->packet->data, node->packet->size);
-        init_tone_level_dequantization(q, &bc);
+        init_get_bits(&gb, node->packet->data, node->packet->size * 8);
+        init_tone_level_dequantization(q, &gb);
         fill_tone_level_array(q, 1);
     } else {
         fill_tone_level_array(q, 0);
@@ -1159,16 +1032,16 @@ static void process_subpacket_10(QDM2Context *q, QDM2SubPNode *node)
  */
 static void process_subpacket_11(QDM2Context *q, QDM2SubPNode *node)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int length = 0;
 
     if (node) {
         length = node->packet->size * 8;
-        bitstream_init(&bc, node->packet->data, length);
+        init_get_bits(&gb, node->packet->data, length);
     }
 
     if (length >= 32) {
-        int c = bitstream_read(&bc, 13);
+        int c = get_bits(&gb, 13);
 
         if (c > 3)
             fill_coding_method_array(q->tone_level_idx,
@@ -1177,7 +1050,7 @@ static void process_subpacket_11(QDM2Context *q, QDM2SubPNode *node)
                                      q->superblocktype_2_3, q->cm_table_select);
     }
 
-    synthfilt_build_sb_samples(q, &bc, length, 0, 8);
+    synthfilt_build_sb_samples(q, &gb, length, 0, 8);
 }
 
 /**
@@ -1188,18 +1061,18 @@ static void process_subpacket_11(QDM2Context *q, QDM2SubPNode *node)
  */
 static void process_subpacket_12(QDM2Context *q, QDM2SubPNode *node)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     int length = 0;
 
     if (node) {
         length = node->packet->size * 8;
-        bitstream_init(&bc, node->packet->data, length);
+        init_get_bits(&gb, node->packet->data, length);
     }
 
-    synthfilt_build_sb_samples(q, &bc, length, 8, QDM2_SB_USED(q->sub_sampling));
+    synthfilt_build_sb_samples(q, &gb, length, 8, QDM2_SB_USED(q->sub_sampling));
 }
 
-/*
+/**
  * Process new subpackets for synthesis filter
  *
  * @param q       context
@@ -1232,14 +1105,14 @@ static void process_synthesis_subpackets(QDM2Context *q, QDM2SubPNode *list)
         process_subpacket_12(q, NULL);
 }
 
-/*
+/**
  * Decode superblock, fill packet lists.
  *
  * @param q    context
  */
 static void qdm2_decode_super_block(QDM2Context *q)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     QDM2SubPacket header, *packet;
     int i, packet_bytes, sub_packet_size, sub_packets_D;
     unsigned int next_index = 0;
@@ -1253,8 +1126,8 @@ static void qdm2_decode_super_block(QDM2Context *q)
 
     average_quantized_coeffs(q); // average elements in quantized_coeffs[max_ch][10][8]
 
-    bitstream_init8(&bc, q->compressed_data, q->compressed_size);
-    qdm2_decode_sub_packet_header(&bc, &header);
+    init_get_bits(&gb, q->compressed_data, q->compressed_size * 8);
+    qdm2_decode_sub_packet_header(&gb, &header);
 
     if (header.type < 2 || header.type >= 8) {
         q->has_errors = 1;
@@ -1263,13 +1136,13 @@ static void qdm2_decode_super_block(QDM2Context *q)
     }
 
     q->superblocktype_2_3 = (header.type == 2 || header.type == 3);
-    packet_bytes          = (q->compressed_size - bitstream_tell(&bc) / 8);
+    packet_bytes          = (q->compressed_size - get_bits_count(&gb) / 8);
 
-    bitstream_init8(&bc, header.data, header.size);
+    init_get_bits(&gb, header.data, header.size * 8);
 
     if (header.type == 2 || header.type == 4 || header.type == 5) {
-        int csum = 257 * bitstream_read(&bc, 8);
-        csum    +=   2 * bitstream_read(&bc, 8);
+        int csum = 257 * get_bits(&gb, 8);
+        csum += 2 * get_bits(&gb, 8);
 
         csum = qdm2_packet_checksum(q->compressed_data, q->checksum_size, csum);
 
@@ -1301,8 +1174,8 @@ static void qdm2_decode_super_block(QDM2Context *q)
             q->sub_packet_list_A[i - 1].next = &q->sub_packet_list_A[i];
 
             /* seek to next block */
-            bitstream_init8(&bc, header.data, header.size);
-            bitstream_skip(&bc, next_index * 8);
+            init_get_bits(&gb, header.data, header.size * 8);
+            skip_bits(&gb, next_index * 8);
 
             if (next_index >= header.size)
                 break;
@@ -1310,8 +1183,8 @@ static void qdm2_decode_super_block(QDM2Context *q)
 
         /* decode subpacket */
         packet = &q->sub_packets[i];
-        qdm2_decode_sub_packet_header(&bc, packet);
-        next_index      = packet->size + bitstream_tell(&bc) / 8;
+        qdm2_decode_sub_packet_header(&gb, packet);
+        next_index      = packet->size + get_bits_count(&gb) / 8;
         sub_packet_size = ((packet->size > 0xff) ? 1 : 0) + packet->size + 2;
 
         if (packet->type == 0)
@@ -1337,10 +1210,10 @@ static void qdm2_decode_super_block(QDM2Context *q)
             QDM2_LIST_ADD(q->sub_packet_list_D, sub_packets_D, packet);
         } else if (packet->type == 13) {
             for (j = 0; j < 6; j++)
-                q->fft_level_exp[j] = bitstream_read(&bc, 6);
+                q->fft_level_exp[j] = get_bits(&gb, 6);
         } else if (packet->type == 14) {
             for (j = 0; j < 6; j++)
-                q->fft_level_exp[j] = qdm2_get_vlc(&bc, &fft_level_exp_vlc, 0, 2);
+                q->fft_level_exp[j] = qdm2_get_vlc(&gb, &fft_level_exp_vlc, 0, 2);
         } else if (packet->type == 15) {
             SAMPLES_NEEDED_2("packet type 15")
             return;
@@ -1378,7 +1251,7 @@ static void qdm2_fft_init_coefficient(QDM2Context *q, int sub_packet,
 }
 
 static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
-                                  BitstreamContext *bc, int b)
+                                  GetBitContext *gb, int b)
 {
     int channel, stereo, phase, exp;
     int local_int_4, local_int_8, stereo_phase, local_int_10;
@@ -1392,9 +1265,14 @@ static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
     local_int_10 = 1 << (q->group_order - duration - 1);
     offset       = 1;
 
-    while (1) {
+    while (get_bits_left(gb)>0) {
         if (q->superblocktype_2_3) {
-            while ((n = qdm2_get_vlc(bc, &vlc_tab_fft_tone_offset[local_int_8], 1, 2)) < 2) {
+            while ((n = qdm2_get_vlc(gb, &vlc_tab_fft_tone_offset[local_int_8], 1, 2)) < 2) {
+                if (get_bits_left(gb)<0) {
+                    if(local_int_4 < q->group_size)
+                        av_log(NULL, AV_LOG_ERROR, "overread in qdm2_fft_decode_tones()\n");
+                    return;
+                }
                 offset = 1;
                 if (n == 0) {
                     local_int_4  += local_int_10;
@@ -1406,7 +1284,7 @@ static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
             }
             offset += (n - 2);
         } else {
-            offset += qdm2_get_vlc(bc, &vlc_tab_fft_tone_offset[local_int_8], 1, 2);
+            offset += qdm2_get_vlc(gb, &vlc_tab_fft_tone_offset[local_int_8], 1, 2);
             while (offset >= (local_int_10 - 1)) {
                 offset       += (1 - (local_int_10 - 1));
                 local_int_4  += local_int_10;
@@ -1422,24 +1300,24 @@ static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
             return;
 
         if (q->nb_channels > 1) {
-            channel = bitstream_read_bit(bc);
-            stereo  = bitstream_read_bit(bc);
+            channel = get_bits1(gb);
+            stereo  = get_bits1(gb);
         } else {
             channel = 0;
             stereo  = 0;
         }
 
-        exp  = qdm2_get_vlc(bc, (b ? &fft_level_exp_vlc : &fft_level_exp_alt_vlc), 0, 2);
+        exp  = qdm2_get_vlc(gb, (b ? &fft_level_exp_vlc : &fft_level_exp_alt_vlc), 0, 2);
         exp += q->fft_level_exp[fft_level_index_table[local_int_14]];
         exp  = (exp < 0) ? 0 : exp;
 
-        phase        = bitstream_read(bc, 3);
+        phase        = get_bits(gb, 3);
         stereo_exp   = 0;
         stereo_phase = 0;
 
         if (stereo) {
-            stereo_exp   = (exp   - qdm2_get_vlc(bc, &fft_stereo_exp_vlc,   0, 1));
-            stereo_phase = (phase - qdm2_get_vlc(bc, &fft_stereo_phase_vlc, 0, 1));
+            stereo_exp   = (exp - qdm2_get_vlc(gb, &fft_stereo_exp_vlc, 0, 1));
+            stereo_phase = (phase - qdm2_get_vlc(gb, &fft_stereo_phase_vlc, 0, 1));
             if (stereo_phase < 0)
                 stereo_phase += 8;
         }
@@ -1461,7 +1339,7 @@ static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
 static void qdm2_decode_fft_packets(QDM2Context *q)
 {
     int i, j, min, max, value, type, unknown_flag;
-    BitstreamContext bc;
+    GetBitContext gb;
 
     if (!q->sub_packet_list_B[0].packet)
         return;
@@ -1496,7 +1374,7 @@ static void qdm2_decode_fft_packets(QDM2Context *q)
             return;
 
         /* decode FFT tones */
-        bitstream_init8(&bc, packet->data, packet->size);
+        init_get_bits(&gb, packet->data, packet->size * 8);
 
         if (packet->type >= 32 && packet->type < 48 && !fft_subpackets[packet->type - 16])
             unknown_flag = 1;
@@ -1509,15 +1387,15 @@ static void qdm2_decode_fft_packets(QDM2Context *q)
             int duration = q->sub_sampling + 5 - (type & 15);
 
             if (duration >= 0 && duration < 4)
-                qdm2_fft_decode_tones(q, duration, &bc, unknown_flag);
+                qdm2_fft_decode_tones(q, duration, &gb, unknown_flag);
         } else if (type == 31) {
             for (j = 0; j < 4; j++)
-                qdm2_fft_decode_tones(q, j, &bc, unknown_flag);
+                qdm2_fft_decode_tones(q, j, &gb, unknown_flag);
         } else if (type == 46) {
             for (j = 0; j < 6; j++)
-                q->fft_level_exp[j] = bitstream_read(&bc, 6);
+                q->fft_level_exp[j] = get_bits(&gb, 6);
             for (j = 0; j < 4; j++)
-                qdm2_fft_decode_tones(q, j, &bc, unknown_flag);
+                qdm2_fft_decode_tones(q, j, &gb, unknown_flag);
         }
     } // Loop on B packets
 
@@ -1707,12 +1585,19 @@ static void qdm2_synthesis_filter(QDM2Context *q, int index)
  *
  * @param q    context
  */
-static av_cold void qdm2_init_static_data(AVCodec *codec) {
+static av_cold void qdm2_init_static_data(void) {
+    static int done;
+
+    if(done)
+        return;
+
     qdm2_init_vlc();
     ff_mpa_synth_init_float(ff_mpa_synth_window_float);
     softclip_table_init();
     rnd_table_init();
     init_noise_samples();
+
+    done = 1;
 }
 
 /**
@@ -1721,9 +1606,10 @@ static av_cold void qdm2_init_static_data(AVCodec *codec) {
 static av_cold int qdm2_decode_init(AVCodecContext *avctx)
 {
     QDM2Context *s = avctx->priv_data;
-    uint8_t *extradata;
-    int extradata_size;
     int tmp_val, tmp, size;
+    GetByteContext gb;
+
+    qdm2_init_static_data();
 
     /* extradata parsing
 
@@ -1764,73 +1650,51 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    extradata      = avctx->extradata;
-    extradata_size = avctx->extradata_size;
+    bytestream2_init(&gb, avctx->extradata, avctx->extradata_size);
 
-    while (extradata_size > 7) {
-        if (!memcmp(extradata, "frmaQDM", 7))
+    while (bytestream2_get_bytes_left(&gb) > 8) {
+        if (bytestream2_peek_be64(&gb) == (((uint64_t)MKBETAG('f','r','m','a') << 32) |
+                                            (uint64_t)MKBETAG('Q','D','M','2')))
             break;
-        extradata++;
-        extradata_size--;
+        bytestream2_skip(&gb, 1);
     }
 
-    if (extradata_size < 12) {
+    if (bytestream2_get_bytes_left(&gb) < 12) {
         av_log(avctx, AV_LOG_ERROR, "not enough extradata (%i)\n",
-               extradata_size);
+               bytestream2_get_bytes_left(&gb));
         return AVERROR_INVALIDDATA;
     }
 
-    if (memcmp(extradata, "frmaQDM", 7)) {
-        av_log(avctx, AV_LOG_ERROR, "invalid headers, QDM? not found\n");
-        return AVERROR_INVALIDDATA;
-    }
+    bytestream2_skip(&gb, 8);
+    size = bytestream2_get_be32(&gb);
 
-    if (extradata[7] == 'C') {
-//        s->is_qdmc = 1;
-        avpriv_report_missing_feature(avctx, "QDMC version 1");
-        return AVERROR_PATCHWELCOME;
-    }
-
-    extradata += 8;
-    extradata_size -= 8;
-
-    size = AV_RB32(extradata);
-
-    if(size > extradata_size){
+    if (size > bytestream2_get_bytes_left(&gb)) {
         av_log(avctx, AV_LOG_ERROR, "extradata size too small, %i < %i\n",
-               extradata_size, size);
+               bytestream2_get_bytes_left(&gb), size);
         return AVERROR_INVALIDDATA;
     }
 
-    extradata += 4;
     av_log(avctx, AV_LOG_DEBUG, "size: %d\n", size);
-    if (AV_RB32(extradata) != MKBETAG('Q','D','C','A')) {
+    if (bytestream2_get_be32(&gb) != MKBETAG('Q','D','C','A')) {
         av_log(avctx, AV_LOG_ERROR, "invalid extradata, expecting QDCA\n");
         return AVERROR_INVALIDDATA;
     }
 
-    extradata += 8;
+    bytestream2_skip(&gb, 4);
 
-    avctx->channels = s->nb_channels = s->channels = AV_RB32(extradata);
-    extradata += 4;
-    if (s->channels <= 0 || s->channels > MPA_MAX_CHANNELS)
+    avctx->channels = s->nb_channels = s->channels = bytestream2_get_be32(&gb);
+    if (s->channels <= 0 || s->channels > MPA_MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
         return AVERROR_INVALIDDATA;
+    }
     avctx->channel_layout = avctx->channels == 2 ? AV_CH_LAYOUT_STEREO :
                                                    AV_CH_LAYOUT_MONO;
 
-    avctx->sample_rate = AV_RB32(extradata);
-    extradata += 4;
-
-    avctx->bit_rate = AV_RB32(extradata);
-    extradata += 4;
-
-    s->group_size = AV_RB32(extradata);
-    extradata += 4;
-
-    s->fft_size = AV_RB32(extradata);
-    extradata += 4;
-
-    s->checksum_size = AV_RB32(extradata);
+    avctx->sample_rate = bytestream2_get_be32(&gb);
+    avctx->bit_rate = bytestream2_get_be32(&gb);
+    s->group_size = bytestream2_get_be32(&gb);
+    s->fft_size = bytestream2_get_be32(&gb);
+    s->checksum_size = bytestream2_get_be32(&gb);
     if (s->checksum_size >= 1U << 28) {
         av_log(avctx, AV_LOG_ERROR, "data block size too large (%u)\n", s->checksum_size);
         return AVERROR_INVALIDDATA;
@@ -1841,6 +1705,7 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     // something like max decodable tones
     s->group_order = av_log2(s->group_size) + 1;
     s->frame_size = s->group_size / 16; // 16 iterations per super block
+
     if (s->frame_size > QDM2_MAX_FRAME_SIZE)
         return AVERROR_INVALIDDATA;
 
@@ -1863,18 +1728,9 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     if ((tmp * 2240) < avctx->bit_rate)  tmp_val = 4;
     s->cm_table_select = tmp_val;
 
-    if (s->sub_sampling == 0)
-        tmp = 7999;
-    else
-        tmp = ((-(s->sub_sampling -1)) & 8000) + 20000;
-    /*
-    0: 7999 -> 0
-    1: 20000 -> 2
-    2: 28000 -> 2
-    */
-    if (tmp < 8000)
+    if (avctx->bit_rate <= 8000)
         s->coeff_per_sb_select = 0;
-    else if (tmp <= 16000)
+    else if (avctx->bit_rate < 16000)
         s->coeff_per_sb_select = 1;
     else
         s->coeff_per_sb_select = 2;
@@ -1911,6 +1767,9 @@ static int qdm2_decode(QDM2Context *q, const uint8_t *in, int16_t *out)
     int ch, i;
     const int frame_size = (q->frame_size * q->channels);
 
+    if((unsigned)frame_size > FF_ARRAY_ELEMS(q->output_buffer)/2)
+        return -1;
+
     /* select input buffer */
     q->compressed_data = in;
     q->compressed_size = q->checksum_size;
@@ -1982,10 +1841,8 @@ static int qdm2_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 16 * s->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out = (int16_t *)frame->data[0];
 
     for (i = 0; i < 16; i++) {
@@ -2006,7 +1863,6 @@ AVCodec ff_qdm2_decoder = {
     .id               = AV_CODEC_ID_QDM2,
     .priv_data_size   = sizeof(QDM2Context),
     .init             = qdm2_decode_init,
-    .init_static_data = qdm2_init_static_data,
     .close            = qdm2_decode_close,
     .decode           = qdm2_decode_frame,
     .capabilities     = AV_CODEC_CAP_DR1,
diff --git a/libavcodec/qdm2_tablegen.c b/libavcodec/qdm2_tablegen.c
index 59d82df..e19b49b 100644
--- a/libavcodec/qdm2_tablegen.c
+++ b/libavcodec/qdm2_tablegen.c
@@ -3,27 +3,27 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
+#include "tableprint_vlc.h"
 #define CONFIG_HARDCODED_TABLES 0
 #include "qdm2_tablegen.h"
-#include "tableprint.h"
 
 int main(void)
 {
@@ -40,5 +40,22 @@ int main(void)
     WRITE_2D_ARRAY("static const", uint8_t, random_dequant_index);
     WRITE_2D_ARRAY("static const", uint8_t, random_dequant_type24);
 
+    qdm2_init_vlc();
+
+    WRITE_2D_ARRAY("static const", VLC_TYPE, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_level, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_diff, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_run, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_level_exp_alt_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_level_exp_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_stereo_exp_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_stereo_phase_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_hi1, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_mid, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_hi2, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_type30, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_type34, qdm2_table);
+    WRITE_VLC_ARRAY("static const", vlc_tab_fft_tone_offset, qdm2_table);
+
     return 0;
 }
diff --git a/libavcodec/qdm2_tablegen.h b/libavcodec/qdm2_tablegen.h
index bb73d92..2331ebf 100644
--- a/libavcodec/qdm2_tablegen.h
+++ b/libavcodec/qdm2_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 #include <math.h>
 #include "libavutil/attributes.h"
+#include "qdm2data.h"
 
 #define SOFTCLIP_THRESHOLD 27600
 #define HARDCLIP_THRESHOLD 35716
@@ -34,10 +35,11 @@
 #define softclip_table_init()
 #define rnd_table_init()
 #define init_noise_samples()
+#define qdm2_init_vlc()
 #include "libavcodec/qdm2_tables.h"
 #else
 static uint16_t softclip_table[HARDCLIP_THRESHOLD - SOFTCLIP_THRESHOLD + 1];
-static float noise_table[4096];
+static float noise_table[4096 + 20];
 static uint8_t random_dequant_index[256][5];
 static uint8_t random_dequant_type24[128][3];
 static float noise_samples[128];
@@ -54,8 +56,7 @@ static av_cold void softclip_table_init(void) {
 // random generated table
 static av_cold void rnd_table_init(void) {
     int i,j;
-    uint32_t ldw,hdw;
-    uint64_t tmp64_1;
+    uint32_t ldw;
     uint64_t random_seed = 0;
     float delta = 1.0 / 16384.0;
     for(i = 0; i < 4096 ;i++) {
@@ -67,22 +68,18 @@ static av_cold void rnd_table_init(void) {
         random_seed = 81;
         ldw = i;
         for (j = 0; j < 5 ;j++) {
-            random_dequant_index[i][j] = (uint8_t)((ldw / random_seed) & 0xFF);
-            ldw = (uint32_t)ldw % (uint32_t)random_seed;
-            tmp64_1 = (random_seed * 0x55555556);
-            hdw = (uint32_t)(tmp64_1 >> 32);
-            random_seed = (uint64_t)(hdw + (ldw >> 31));
+            random_dequant_index[i][j] = ldw / random_seed;
+            ldw %= random_seed;
+            random_seed /= 3;
         }
     }
     for (i = 0; i < 128 ;i++) {
         random_seed = 25;
         ldw = i;
         for (j = 0; j < 3 ;j++) {
-            random_dequant_type24[i][j] = (uint8_t)((ldw / random_seed) & 0xFF);
-            ldw = (uint32_t)ldw % (uint32_t)random_seed;
-            tmp64_1 = (random_seed * 0x66666667);
-            hdw = (uint32_t)(tmp64_1 >> 33);
-            random_seed = hdw + (ldw >> 31);
+            random_dequant_type24[i][j] = ldw / random_seed;
+            ldw %= random_seed;
+            random_seed /= 5;
         }
     }
 }
@@ -97,6 +94,168 @@ static av_cold void init_noise_samples(void) {
         noise_samples[i] = (delta * (float)((random_seed >> 16) & 0x00007fff) - 1.0);
     }
 }
+
+static VLC vlc_tab_level;
+static VLC vlc_tab_diff;
+static VLC vlc_tab_run;
+static VLC fft_level_exp_alt_vlc;
+static VLC fft_level_exp_vlc;
+static VLC fft_stereo_exp_vlc;
+static VLC fft_stereo_phase_vlc;
+static VLC vlc_tab_tone_level_idx_hi1;
+static VLC vlc_tab_tone_level_idx_mid;
+static VLC vlc_tab_tone_level_idx_hi2;
+static VLC vlc_tab_type30;
+static VLC vlc_tab_type34;
+static VLC vlc_tab_fft_tone_offset[5];
+
+static const uint16_t qdm2_vlc_offs[] = {
+    0,260,566,598,894,1166,1230,1294,1678,1950,2214,2278,2310,2570,2834,3124,3448,3838,
+};
+
+static VLC_TYPE qdm2_table[3838][2];
+
+static av_cold void qdm2_init_vlc(void)
+{
+    vlc_tab_level.table           = &qdm2_table[qdm2_vlc_offs[0]];
+    vlc_tab_level.table_allocated = qdm2_vlc_offs[1] - qdm2_vlc_offs[0];
+    init_vlc(&vlc_tab_level, 8, 24,
+             vlc_tab_level_huffbits, 1, 1,
+             vlc_tab_level_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_diff.table           = &qdm2_table[qdm2_vlc_offs[1]];
+    vlc_tab_diff.table_allocated = qdm2_vlc_offs[2] - qdm2_vlc_offs[1];
+    init_vlc(&vlc_tab_diff, 8, 37,
+             vlc_tab_diff_huffbits, 1, 1,
+             vlc_tab_diff_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_run.table           = &qdm2_table[qdm2_vlc_offs[2]];
+    vlc_tab_run.table_allocated = qdm2_vlc_offs[3] - qdm2_vlc_offs[2];
+    init_vlc(&vlc_tab_run, 5, 6,
+             vlc_tab_run_huffbits, 1, 1,
+             vlc_tab_run_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_level_exp_alt_vlc.table           = &qdm2_table[qdm2_vlc_offs[3]];
+    fft_level_exp_alt_vlc.table_allocated = qdm2_vlc_offs[4] -
+                                            qdm2_vlc_offs[3];
+    init_vlc(&fft_level_exp_alt_vlc, 8, 28,
+             fft_level_exp_alt_huffbits, 1, 1,
+             fft_level_exp_alt_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_level_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[4]];
+    fft_level_exp_vlc.table_allocated = qdm2_vlc_offs[5] - qdm2_vlc_offs[4];
+    init_vlc(&fft_level_exp_vlc, 8, 20,
+             fft_level_exp_huffbits, 1, 1,
+             fft_level_exp_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_stereo_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[5]];
+    fft_stereo_exp_vlc.table_allocated = qdm2_vlc_offs[6] -
+                                         qdm2_vlc_offs[5];
+    init_vlc(&fft_stereo_exp_vlc, 6, 7,
+             fft_stereo_exp_huffbits, 1, 1,
+             fft_stereo_exp_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_stereo_phase_vlc.table           = &qdm2_table[qdm2_vlc_offs[6]];
+    fft_stereo_phase_vlc.table_allocated = qdm2_vlc_offs[7] -
+                                           qdm2_vlc_offs[6];
+    init_vlc(&fft_stereo_phase_vlc, 6, 9,
+             fft_stereo_phase_huffbits, 1, 1,
+             fft_stereo_phase_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_hi1.table =
+        &qdm2_table[qdm2_vlc_offs[7]];
+    vlc_tab_tone_level_idx_hi1.table_allocated = qdm2_vlc_offs[8] -
+                                                 qdm2_vlc_offs[7];
+    init_vlc(&vlc_tab_tone_level_idx_hi1, 8, 20,
+             vlc_tab_tone_level_idx_hi1_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_hi1_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_mid.table =
+        &qdm2_table[qdm2_vlc_offs[8]];
+    vlc_tab_tone_level_idx_mid.table_allocated = qdm2_vlc_offs[9] -
+                                                 qdm2_vlc_offs[8];
+    init_vlc(&vlc_tab_tone_level_idx_mid, 8, 24,
+             vlc_tab_tone_level_idx_mid_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_mid_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_hi2.table =
+        &qdm2_table[qdm2_vlc_offs[9]];
+    vlc_tab_tone_level_idx_hi2.table_allocated = qdm2_vlc_offs[10] -
+                                                 qdm2_vlc_offs[9];
+    init_vlc(&vlc_tab_tone_level_idx_hi2, 8, 24,
+             vlc_tab_tone_level_idx_hi2_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_hi2_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_type30.table           = &qdm2_table[qdm2_vlc_offs[10]];
+    vlc_tab_type30.table_allocated = qdm2_vlc_offs[11] - qdm2_vlc_offs[10];
+    init_vlc(&vlc_tab_type30, 6, 9,
+             vlc_tab_type30_huffbits, 1, 1,
+             vlc_tab_type30_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_type34.table           = &qdm2_table[qdm2_vlc_offs[11]];
+    vlc_tab_type34.table_allocated = qdm2_vlc_offs[12] - qdm2_vlc_offs[11];
+    init_vlc(&vlc_tab_type34, 5, 10,
+             vlc_tab_type34_huffbits, 1, 1,
+             vlc_tab_type34_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[0].table =
+        &qdm2_table[qdm2_vlc_offs[12]];
+    vlc_tab_fft_tone_offset[0].table_allocated = qdm2_vlc_offs[13] -
+                                                 qdm2_vlc_offs[12];
+    init_vlc(&vlc_tab_fft_tone_offset[0], 8, 23,
+             vlc_tab_fft_tone_offset_0_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_0_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[1].table =
+        &qdm2_table[qdm2_vlc_offs[13]];
+    vlc_tab_fft_tone_offset[1].table_allocated = qdm2_vlc_offs[14] -
+                                                 qdm2_vlc_offs[13];
+    init_vlc(&vlc_tab_fft_tone_offset[1], 8, 28,
+             vlc_tab_fft_tone_offset_1_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_1_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[2].table =
+        &qdm2_table[qdm2_vlc_offs[14]];
+    vlc_tab_fft_tone_offset[2].table_allocated = qdm2_vlc_offs[15] -
+                                                 qdm2_vlc_offs[14];
+    init_vlc(&vlc_tab_fft_tone_offset[2], 8, 32,
+             vlc_tab_fft_tone_offset_2_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_2_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[3].table =
+        &qdm2_table[qdm2_vlc_offs[15]];
+    vlc_tab_fft_tone_offset[3].table_allocated = qdm2_vlc_offs[16] -
+                                                 qdm2_vlc_offs[15];
+    init_vlc(&vlc_tab_fft_tone_offset[3], 8, 35,
+             vlc_tab_fft_tone_offset_3_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_3_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[4].table =
+        &qdm2_table[qdm2_vlc_offs[16]];
+    vlc_tab_fft_tone_offset[4].table_allocated = qdm2_vlc_offs[17] -
+                                                 qdm2_vlc_offs[16];
+    init_vlc(&vlc_tab_fft_tone_offset[4], 8, 38,
+             vlc_tab_fft_tone_offset_4_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_4_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+}
+
 #endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_QDM2_TABLEGEN_H */
diff --git a/libavcodec/qdm2data.h b/libavcodec/qdm2data.h
index ad6ea88..355d613 100644
--- a/libavcodec/qdm2data.h
+++ b/libavcodec/qdm2data.h
@@ -5,20 +5,20 @@
  * Copyright (c) 2005 Alex Beregszaszi
  * Copyright (c) 2005 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qdmc.c b/libavcodec/qdmc.c
new file mode 100644
index 0000000..8f5b7b9
--- /dev/null
+++ b/libavcodec/qdmc.c
@@ -0,0 +1,788 @@
+/*
+ * QDMC compatible decoder
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#define BITSTREAM_READER_LE
+
+#include "libavutil/channel_layout.h"
+#include "libavutil/thread.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "fft.h"
+
+typedef struct QDMCTone {
+    uint8_t mode;
+    uint8_t phase;
+    uint8_t offset;
+    int16_t freq;
+    int16_t amplitude;
+} QDMCTone;
+
+typedef struct QDMCContext {
+    AVCodecContext *avctx;
+
+    uint8_t frame_bits;
+    int band_index;
+    int frame_size;
+    int subframe_size;
+    int fft_offset;
+    int buffer_offset;
+    int nb_channels;
+    int checksum_size;
+
+    uint8_t noise[2][19][17];
+    QDMCTone tones[5][8192];
+    int nb_tones[5];
+    int cur_tone[5];
+    float alt_sin[5][31];
+    float fft_buffer[4][8192 * 2];
+    float noise2_buffer[4096 * 2];
+    float noise_buffer[4096 * 2];
+    float buffer[2 * 32768];
+    float *buffer_ptr;
+    int rndval;
+
+    DECLARE_ALIGNED(32, FFTComplex, cmplx)[2][512];
+    FFTContext fft_ctx;
+} QDMCContext;
+
+static float sin_table[512];
+static VLC vtable[6];
+
+static const unsigned code_prefix[] = {
+    0x0, 0x1, 0x2, 0x3, 0x4, 0x6, 0x8, 0xA,
+    0xC, 0x10, 0x14, 0x18, 0x1C, 0x24, 0x2C, 0x34,
+    0x3C, 0x4C, 0x5C, 0x6C, 0x7C, 0x9C, 0xBC, 0xDC,
+    0xFC, 0x13C, 0x17C, 0x1BC, 0x1FC, 0x27C, 0x2FC, 0x37C,
+    0x3FC, 0x4FC, 0x5FC, 0x6FC, 0x7FC, 0x9FC, 0xBFC, 0xDFC,
+    0xFFC, 0x13FC, 0x17FC, 0x1BFC, 0x1FFC, 0x27FC, 0x2FFC, 0x37FC,
+    0x3FFC, 0x4FFC, 0x5FFC, 0x6FFC, 0x7FFC, 0x9FFC, 0xBFFC, 0xDFFC,
+    0xFFFC, 0x13FFC, 0x17FFC, 0x1BFFC, 0x1FFFC, 0x27FFC, 0x2FFFC, 0x37FFC,
+    0x3FFFC
+};
+
+static const float amplitude_tab[64] = {
+    1.18750000f, 1.68359380f, 2.37500000f, 3.36718750f, 4.75000000f,
+    6.73437500f, 9.50000000f, 13.4687500f, 19.0000000f, 26.9375000f,
+    38.0000000f, 53.8750000f, 76.0000000f, 107.750000f, 152.000000f,
+    215.500000f, 304.000000f, 431.000000f, 608.000000f, 862.000000f,
+    1216.00000f, 1724.00000f, 2432.00000f, 3448.00000f, 4864.00000f,
+    6896.00000f, 9728.00000f, 13792.0000f, 19456.0000f, 27584.0000f,
+    38912.0000f, 55168.0000f, 77824.0000f, 110336.000f, 155648.000f,
+    220672.000f, 311296.000f, 441344.000f, 622592.000f, 882688.000f,
+    1245184.00f, 1765376.00f, 2490368.00f, 3530752.00f, 4980736.00f,
+    7061504.00f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const uint16_t qdmc_nodes[112] = {
+    0, 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 56, 64,
+    80, 96, 120, 144, 176, 208, 240, 256,
+    0, 2, 4, 8, 16, 24, 32, 48, 56, 64, 80, 104,
+    128, 160, 208, 256, 0, 0, 0, 0, 0,
+    0, 2, 4, 8, 16, 32, 48, 64, 80, 112, 160, 208,
+    256, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 4, 8, 16, 32, 48, 64, 96, 144, 208, 256,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 4, 16, 32, 64, 256, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static const uint8_t noise_bands_size[] = {
+    19, 14, 11, 9, 4, 2, 0
+};
+
+static const uint8_t noise_bands_selector[] = {
+    4, 3, 2, 1, 0, 0, 0,
+};
+
+static const uint8_t noise_value_bits[] = {
+    12, 7, 9, 7, 10, 9, 11, 9, 9, 2, 9, 9, 9, 9,
+    9, 3, 9, 10, 10, 12, 2, 3, 3, 5, 5, 6, 7,
+};
+
+static const uint8_t noise_value_symbols[] = {
+    0, 10, 11, 12, 13, 14, 15, 16, 18, 1, 20, 22, 24,
+    26, 28, 2, 30, 32, 34, 36, 3, 4, 5, 6, 7, 8, 9,
+};
+
+static const uint16_t noise_value_codes[] = {
+    0xC7A, 0x002, 0x0FA, 0x03A, 0x35A, 0x1C2, 0x07A, 0x1FA,
+    0x17A, 0x000, 0x0DA, 0x142, 0x0C2, 0x042, 0x1DA, 0x001,
+    0x05A, 0x15A, 0x27A, 0x47A, 0x003, 0x005, 0x006, 0x012,
+    0x00A, 0x022, 0x01A,
+};
+
+static const uint8_t noise_segment_length_bits[] = {
+    10, 8, 5, 1, 2, 4, 4, 4, 6, 7, 9, 10,
+};
+
+static const uint8_t noise_segment_length_symbols[] = {
+    0, 13, 17, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+};
+
+static const uint16_t noise_segment_length_codes[] = {
+    0x30B, 0x8B, 0x1B, 0x0, 0x1, 0x3, 0x7, 0xF, 0x2b, 0x4B, 0xB, 0x10B,
+};
+
+static const uint8_t freq_diff_bits[] = {
+    18, 2, 4, 4, 5, 4, 4, 5, 5, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 6,
+    7, 6, 6, 6, 7, 7, 7, 7, 7, 8, 9, 9, 8, 9, 11, 11, 12, 12, 13, 12,
+    14, 15, 18, 16, 17,
+};
+
+static const uint32_t freq_diff_codes[] = {
+    0x2AD46, 0x1, 0x0, 0x3, 0xC, 0xA, 0x7, 0x18, 0x12, 0xE, 0x4, 0x16,
+    0xF, 0x1C, 0x8, 0x22, 0x26, 0x2, 0x3B, 0x34, 0x74, 0x1F, 0x14, 0x2B,
+    0x1B, 0x3F, 0x28, 0x54, 0x6, 0x4B, 0xB, 0x68, 0xE8, 0x46, 0xC6, 0x1E8,
+    0x146, 0x346, 0x546, 0x746, 0x1D46, 0xF46, 0xD46, 0x6D46, 0xAD46, 0x2D46,
+    0x1AD46,
+};
+
+static const uint8_t amplitude_bits[] = {
+    13, 7, 8, 9, 10, 10, 10, 10, 10, 9, 8, 7, 6,
+    5, 4, 3, 3, 2, 3, 3, 4, 5, 7, 8, 9, 11, 12, 13,
+};
+
+static const uint16_t amplitude_codes[] = {
+    0x1EC6, 0x6, 0xC2, 0x142, 0x242, 0x246, 0xC6, 0x46, 0x42, 0x146, 0xA2,
+    0x62, 0x26, 0x16, 0xE, 0x5, 0x4, 0x3, 0x0, 0x1, 0xA, 0x12, 0x2, 0x22,
+    0x1C6, 0x2C6, 0x6C6, 0xEC6,
+};
+
+static const uint8_t amplitude_diff_bits[] = {
+    8, 2, 1, 3, 4, 5, 6, 7, 8,
+};
+
+static const uint8_t amplitude_diff_codes[] = {
+    0xFE, 0x0, 0x1, 0x2, 0x6, 0xE, 0x1E, 0x3E, 0x7E,
+};
+
+static const uint8_t phase_diff_bits[] = {
+    6, 2, 2, 4, 4, 6, 5, 4, 2,
+};
+
+static const uint8_t phase_diff_codes[] = {
+    0x35, 0x2, 0x0, 0x1, 0xD, 0x15, 0x5, 0x9, 0x3,
+};
+
+#define INIT_VLC_STATIC_LE(vlc, nb_bits, nb_codes,                 \
+                           bits, bits_wrap, bits_size,             \
+                           codes, codes_wrap, codes_size,          \
+                           symbols, symbols_wrap, symbols_size,    \
+                           static_size)                            \
+    do {                                                           \
+        static VLC_TYPE table[static_size][2];                     \
+        (vlc)->table           = table;                            \
+        (vlc)->table_allocated = static_size;                      \
+        ff_init_vlc_sparse(vlc, nb_bits, nb_codes,                 \
+                           bits, bits_wrap, bits_size,             \
+                           codes, codes_wrap, codes_size,          \
+                           symbols, symbols_wrap, symbols_size,    \
+                           INIT_VLC_LE | INIT_VLC_USE_NEW_STATIC); \
+    } while (0)
+
+static av_cold void qdmc_init_static_data(void)
+{
+    int i;
+
+    INIT_VLC_STATIC_LE(&vtable[0], 12, FF_ARRAY_ELEMS(noise_value_bits),
+                       noise_value_bits, 1, 1, noise_value_codes, 2, 2, noise_value_symbols, 1, 1, 4096);
+    INIT_VLC_STATIC_LE(&vtable[1], 10, FF_ARRAY_ELEMS(noise_segment_length_bits),
+                       noise_segment_length_bits, 1, 1, noise_segment_length_codes, 2, 2,
+                       noise_segment_length_symbols, 1, 1, 1024);
+    INIT_VLC_STATIC_LE(&vtable[2], 13, FF_ARRAY_ELEMS(amplitude_bits),
+                       amplitude_bits, 1, 1, amplitude_codes, 2, 2, NULL, 0, 0, 8192);
+    INIT_VLC_STATIC_LE(&vtable[3], 18, FF_ARRAY_ELEMS(freq_diff_bits),
+                       freq_diff_bits, 1, 1, freq_diff_codes, 4, 4, NULL, 0, 0, 262144);
+    INIT_VLC_STATIC_LE(&vtable[4], 8, FF_ARRAY_ELEMS(amplitude_diff_bits),
+                       amplitude_diff_bits, 1, 1, amplitude_diff_codes, 1, 1, NULL, 0, 0, 256);
+    INIT_VLC_STATIC_LE(&vtable[5], 6, FF_ARRAY_ELEMS(phase_diff_bits),
+                       phase_diff_bits, 1, 1, phase_diff_codes, 1, 1, NULL, 0, 0, 64);
+
+    for (i = 0; i < 512; i++)
+        sin_table[i] = sin(2.0f * i * M_PI * 0.001953125f);
+}
+
+static void make_noises(QDMCContext *s)
+{
+    int i, j, n0, n1, n2, diff;
+    float *nptr;
+
+    for (j = 0; j < noise_bands_size[s->band_index]; j++) {
+        n0 = qdmc_nodes[j + 21 * s->band_index    ];
+        n1 = qdmc_nodes[j + 21 * s->band_index + 1];
+        n2 = qdmc_nodes[j + 21 * s->band_index + 2];
+        nptr = s->noise_buffer + 256 * j;
+
+        for (i = 0; i + n0 < n1; i++, nptr++)
+            nptr[0] = i / (float)(n1 - n0);
+
+        diff = n2 - n1;
+        nptr = s->noise_buffer + (j << 8) + n1 - n0;
+
+        for (i = n1; i < n2; i++, nptr++, diff--)
+            nptr[0] = diff / (float)(n2 - n1);
+    }
+}
+
+static av_cold int qdmc_decode_init(AVCodecContext *avctx)
+{
+    static AVOnce init_static_once = AV_ONCE_INIT;
+    QDMCContext *s = avctx->priv_data;
+    int ret, fft_size, fft_order, size, g, j, x;
+    GetByteContext b;
+
+    ff_thread_once(&init_static_once, qdmc_init_static_data);
+
+    if (!avctx->extradata || (avctx->extradata_size < 48)) {
+        av_log(avctx, AV_LOG_ERROR, "extradata missing or truncated\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_init(&b, avctx->extradata, avctx->extradata_size);
+
+    while (bytestream2_get_bytes_left(&b) > 8) {
+        if (bytestream2_peek_be64(&b) == (((uint64_t)MKBETAG('f','r','m','a') << 32) |
+                                           (uint64_t)MKBETAG('Q','D','M','C')))
+            break;
+        bytestream2_skipu(&b, 1);
+    }
+    bytestream2_skipu(&b, 8);
+
+    if (bytestream2_get_bytes_left(&b) < 36) {
+        av_log(avctx, AV_LOG_ERROR, "not enough extradata (%i)\n",
+               bytestream2_get_bytes_left(&b));
+        return AVERROR_INVALIDDATA;
+    }
+
+    size = bytestream2_get_be32u(&b);
+    if (size > bytestream2_get_bytes_left(&b)) {
+        av_log(avctx, AV_LOG_ERROR, "extradata size too small, %i < %i\n",
+               bytestream2_get_bytes_left(&b), size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (bytestream2_get_be32u(&b) != MKBETAG('Q','D','C','A')) {
+        av_log(avctx, AV_LOG_ERROR, "invalid extradata, expecting QDCA\n");
+        return AVERROR_INVALIDDATA;
+    }
+    bytestream2_skipu(&b, 4);
+
+    avctx->channels = s->nb_channels = bytestream2_get_be32u(&b);
+    if (s->nb_channels <= 0 || s->nb_channels > 2) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->channel_layout = avctx->channels == 2 ? AV_CH_LAYOUT_STEREO :
+                                                   AV_CH_LAYOUT_MONO;
+
+    avctx->sample_rate = bytestream2_get_be32u(&b);
+    avctx->bit_rate = bytestream2_get_be32u(&b);
+    bytestream2_skipu(&b, 4);
+    fft_size = bytestream2_get_be32u(&b);
+    fft_order = av_log2(fft_size) + 1;
+    s->checksum_size = bytestream2_get_be32u(&b);
+    if (s->checksum_size >= 1U << 28) {
+        av_log(avctx, AV_LOG_ERROR, "data block size too large (%u)\n", s->checksum_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (avctx->sample_rate >= 32000) {
+        x = 28000;
+        s->frame_bits = 13;
+    } else if (avctx->sample_rate >= 16000) {
+        x = 20000;
+        s->frame_bits = 12;
+    } else {
+        x = 16000;
+        s->frame_bits = 11;
+    }
+    s->frame_size = 1 << s->frame_bits;
+    s->subframe_size = s->frame_size >> 5;
+
+    if (avctx->channels == 2)
+        x = 3 * x / 2;
+    s->band_index = noise_bands_selector[FFMIN(6, llrint(floor(avctx->bit_rate * 3.0 / (double)x + 0.5)))];
+
+    if ((fft_order < 7) || (fft_order > 9)) {
+        avpriv_request_sample(avctx, "Unknown FFT order %d", fft_order);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (fft_size != (1 << (fft_order - 1))) {
+        av_log(avctx, AV_LOG_ERROR, "FFT size %d not power of 2.\n", fft_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    ret = ff_fft_init(&s->fft_ctx, fft_order, 1);
+    if (ret < 0)
+        return ret;
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+
+    for (g = 5; g > 0; g--) {
+        for (j = 0; j < (1 << g) - 1; j++)
+            s->alt_sin[5-g][j] = sin_table[(((j+1) << (8 - g)) & 0x1FF)];
+    }
+
+    make_noises(s);
+
+    return 0;
+}
+
+static av_cold int qdmc_decode_close(AVCodecContext *avctx)
+{
+    QDMCContext *s = avctx->priv_data;
+
+    ff_fft_end(&s->fft_ctx);
+
+    return 0;
+}
+
+static int qdmc_get_vlc(GetBitContext *gb, VLC *table, int flag)
+{
+    int v;
+
+    v = get_vlc2(gb, table->table, table->bits, 1);
+    if (v < 0)
+        return AVERROR_INVALIDDATA;
+    if (v)
+        v = v - 1;
+    else
+        v = get_bits(gb, get_bits(gb, 3) + 1);
+
+    if (flag) {
+        if (v >= FF_ARRAY_ELEMS(code_prefix))
+            return AVERROR_INVALIDDATA;
+
+        v = code_prefix[v] + get_bitsz(gb, v >> 2);
+    }
+
+    return v;
+}
+
+static int skip_label(QDMCContext *s, GetBitContext *gb)
+{
+    uint32_t label = get_bits_long(gb, 32);
+    uint16_t sum = 226, checksum = get_bits(gb, 16);
+    const uint8_t *ptr = gb->buffer + 6;
+    int i;
+
+    if (label != MKTAG('Q', 'M', 'C', 1))
+        return AVERROR_INVALIDDATA;
+
+    for (i = 0; i < s->checksum_size - 6; i++)
+        sum += ptr[i];
+
+    return sum != checksum;
+}
+
+static int read_noise_data(QDMCContext *s, GetBitContext *gb)
+{
+    int ch, j, k, v, idx, band, lastval, newval, len;
+
+    for (ch = 0; ch < s->nb_channels; ch++) {
+        for (band = 0; band < noise_bands_size[s->band_index]; band++) {
+            v = qdmc_get_vlc(gb, &vtable[0], 0);
+            if (v < 0)
+                return AVERROR_INVALIDDATA;
+
+            if (v & 1)
+                v = v + 1;
+            else
+                v = -v;
+
+            lastval = v / 2;
+            s->noise[ch][band][0] = lastval - 1;
+            for (j = 0; j < 15;) {
+                len = qdmc_get_vlc(gb, &vtable[1], 1);
+                if (len < 0)
+                    return AVERROR_INVALIDDATA;
+                len += 1;
+
+                v = qdmc_get_vlc(gb, &vtable[0], 0);
+                if (v < 0)
+                    return AVERROR_INVALIDDATA;
+
+                if (v & 1)
+                    newval = lastval + (v + 1) / 2;
+                else
+                    newval = lastval - v / 2;
+
+                idx = j + 1;
+                if (len + idx > 16)
+                    return AVERROR_INVALIDDATA;
+
+                for (k = 1; idx <= j + len; k++, idx++)
+                    s->noise[ch][band][idx] = lastval + k * (newval - lastval) / len - 1;
+
+                lastval = newval;
+                j += len;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void add_tone(QDMCContext *s, int group, int offset, int freq, int stereo_mode, int amplitude, int phase)
+{
+    const int index = s->nb_tones[group];
+
+    if (index >= FF_ARRAY_ELEMS(s->tones[group])) {
+        av_log(s->avctx, AV_LOG_WARNING, "Too many tones already in buffer, ignoring tone!\n");
+        return;
+    }
+
+    s->tones[group][index].offset    = offset;
+    s->tones[group][index].freq      = freq;
+    s->tones[group][index].mode      = stereo_mode;
+    s->tones[group][index].amplitude = amplitude;
+    s->tones[group][index].phase     = phase;
+    s->nb_tones[group]++;
+}
+
+static int read_wave_data(QDMCContext *s, GetBitContext *gb)
+{
+    int amp, phase, stereo_mode = 0, i, group, freq, group_size, group_bits;
+    int amp2, phase2, pos2, off;
+
+    for (group = 0; group < 5; group++) {
+        group_size = 1 << (s->frame_bits - group - 1);
+        group_bits = 4 - group;
+        pos2 = 0;
+        off  = 0;
+
+        for (i = 1; ; i = freq + 1) {
+            int v;
+
+            v = qdmc_get_vlc(gb, &vtable[3], 1);
+            if (v < 0)
+                return AVERROR_INVALIDDATA;
+
+            freq = i + v;
+            while (freq >= group_size - 1) {
+                freq += 2 - group_size;
+                pos2 += group_size;
+                off  += 1 << group_bits;
+            }
+
+            if (pos2 >= s->frame_size)
+                break;
+
+            if (s->nb_channels > 1)
+                stereo_mode = get_bits(gb, 2);
+
+            amp   = qdmc_get_vlc(gb, &vtable[2], 0);
+            if (amp < 0)
+                return AVERROR_INVALIDDATA;
+            phase = get_bits(gb, 3);
+
+            if (stereo_mode > 1) {
+                amp2   = qdmc_get_vlc(gb, &vtable[4], 0);
+                if (amp2 < 0)
+                    return AVERROR_INVALIDDATA;
+                amp2   = amp - amp2;
+
+                phase2 = qdmc_get_vlc(gb, &vtable[5], 0);
+                if (phase2 < 0)
+                    return AVERROR_INVALIDDATA;
+                phase2 = phase - phase2;
+
+                if (phase2 < 0)
+                    phase2 += 8;
+            }
+
+            if ((freq >> group_bits) + 1 < s->subframe_size) {
+                add_tone(s, group, off, freq, stereo_mode & 1, amp, phase);
+                if (stereo_mode > 1)
+                    add_tone(s, group, off, freq, ~stereo_mode & 1, amp2, phase2);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void lin_calc(QDMCContext *s, float amplitude, int node1, int node2, int index)
+{
+    int subframe_size, i, j, k, length;
+    float scale, *noise_ptr;
+
+    scale = 0.5 * amplitude;
+    subframe_size = s->subframe_size;
+    if (subframe_size >= node2)
+        subframe_size = node2;
+    length = (subframe_size - node1) & 0xFFFC;
+    j = node1;
+    noise_ptr = &s->noise_buffer[256 * index];
+
+    for (i = 0; i < length; i += 4, j+= 4, noise_ptr += 4) {
+        s->noise2_buffer[j    ] += scale * noise_ptr[0];
+        s->noise2_buffer[j + 1] += scale * noise_ptr[1];
+        s->noise2_buffer[j + 2] += scale * noise_ptr[2];
+        s->noise2_buffer[j + 3] += scale * noise_ptr[3];
+    }
+
+    k = length + node1;
+    noise_ptr = s->noise_buffer + length + (index << 8);
+    for (i = length; i < subframe_size - node1; i++, k++, noise_ptr++)
+        s->noise2_buffer[k] += scale * noise_ptr[0];
+}
+
+static void add_noise(QDMCContext *s, int ch, int current_subframe)
+{
+    int i, j, aindex;
+    float amplitude;
+    float *im = &s->fft_buffer[0 + ch][s->fft_offset + s->subframe_size * current_subframe];
+    float *re = &s->fft_buffer[2 + ch][s->fft_offset + s->subframe_size * current_subframe];
+
+    memset(s->noise2_buffer, 0, 4 * s->subframe_size);
+
+    for (i = 0; i < noise_bands_size[s->band_index]; i++) {
+        if (qdmc_nodes[i + 21 * s->band_index] > s->subframe_size - 1)
+            break;
+
+        aindex = s->noise[ch][i][current_subframe / 2];
+        amplitude = aindex > 0 ? amplitude_tab[aindex & 0x3F] : 0.0f;
+
+        lin_calc(s, amplitude, qdmc_nodes[21 * s->band_index + i],
+                 qdmc_nodes[21 * s->band_index + i + 2], i);
+    }
+
+    for (j = 2; j < s->subframe_size - 1; j++) {
+        float rnd_re, rnd_im;
+
+        s->rndval = 214013 * s->rndval + 2531011;
+        rnd_im = ((s->rndval & 0x7FFF) - 16384.0f) * 0.000030517578f * s->noise2_buffer[j];
+        s->rndval = 214013 * s->rndval + 2531011;
+        rnd_re = ((s->rndval & 0x7FFF) - 16384.0f) * 0.000030517578f * s->noise2_buffer[j];
+        im[j  ] += rnd_im;
+        re[j  ] += rnd_re;
+        im[j+1] -= rnd_im;
+        re[j+1] -= rnd_re;
+    }
+}
+
+static void add_wave(QDMCContext *s, int offset, int freqs, int group, int stereo_mode, int amp, int phase)
+{
+    int j, group_bits, pos, pindex;
+    float im, re, amplitude, level, *imptr, *reptr;
+
+    if (s->nb_channels == 1)
+        stereo_mode = 0;
+
+    group_bits = 4 - group;
+    pos = freqs >> (4 - group);
+    amplitude = amplitude_tab[amp & 0x3F];
+    imptr = &s->fft_buffer[    stereo_mode][s->fft_offset + s->subframe_size * offset + pos];
+    reptr = &s->fft_buffer[2 + stereo_mode][s->fft_offset + s->subframe_size * offset + pos];
+    pindex = (phase << 6) - ((2 * (freqs >> (4 - group)) + 1) << 7);
+    for (j = 0; j < (1 << (group_bits + 1)) - 1; j++) {
+        pindex += (2 * freqs + 1) << (7 - group_bits);
+        level = amplitude * s->alt_sin[group][j];
+        im = level * sin_table[ pindex        & 0x1FF];
+        re = level * sin_table[(pindex + 128) & 0x1FF];
+        imptr[0] += im;
+        imptr[1] -= im;
+        reptr[0] += re;
+        reptr[1] -= re;
+        imptr += s->subframe_size;
+        reptr += s->subframe_size;
+        if (imptr >= &s->fft_buffer[stereo_mode][2 * s->frame_size]) {
+            imptr = &s->fft_buffer[0 + stereo_mode][pos];
+            reptr = &s->fft_buffer[2 + stereo_mode][pos];
+        }
+    }
+}
+
+static void add_wave0(QDMCContext *s, int offset, int freqs, int stereo_mode, int amp, int phase)
+{
+    float level, im, re;
+    int pos;
+
+    if (s->nb_channels == 1)
+        stereo_mode = 0;
+
+    level = amplitude_tab[amp & 0x3F];
+    im = level * sin_table[ (phase << 6)        & 0x1FF];
+    re = level * sin_table[((phase << 6) + 128) & 0x1FF];
+    pos = s->fft_offset + freqs + s->subframe_size * offset;
+    s->fft_buffer[    stereo_mode][pos    ] += im;
+    s->fft_buffer[2 + stereo_mode][pos    ] += re;
+    s->fft_buffer[    stereo_mode][pos + 1] -= im;
+    s->fft_buffer[2 + stereo_mode][pos + 1] -= re;
+}
+
+static void add_waves(QDMCContext *s, int current_subframe)
+{
+    int w, g;
+
+    for (g = 0; g < 4; g++) {
+        for (w = s->cur_tone[g]; w < s->nb_tones[g]; w++) {
+            QDMCTone *t = &s->tones[g][w];
+
+            if (current_subframe < t->offset)
+                break;
+            add_wave(s, t->offset, t->freq, g, t->mode, t->amplitude, t->phase);
+        }
+        s->cur_tone[g] = w;
+    }
+    for (w = s->cur_tone[4]; w < s->nb_tones[4]; w++) {
+        QDMCTone *t = &s->tones[4][w];
+
+        if (current_subframe < t->offset)
+            break;
+        add_wave0(s, t->offset, t->freq, t->mode, t->amplitude, t->phase);
+    }
+    s->cur_tone[4] = w;
+}
+
+static int decode_frame(QDMCContext *s, GetBitContext *gb, int16_t *out)
+{
+    int ret, ch, i, n;
+
+    if (skip_label(s, gb))
+        return AVERROR_INVALIDDATA;
+
+    s->fft_offset = s->frame_size - s->fft_offset;
+    s->buffer_ptr = &s->buffer[s->nb_channels * s->buffer_offset];
+
+    ret = read_noise_data(s, gb);
+    if (ret < 0)
+        return ret;
+
+    ret = read_wave_data(s, gb);
+    if (ret < 0)
+        return ret;
+
+    for (n = 0; n < 32; n++) {
+        float *r;
+
+        for (ch = 0; ch < s->nb_channels; ch++)
+            add_noise(s, ch, n);
+
+        add_waves(s, n);
+
+        for (ch = 0; ch < s->nb_channels; ch++) {
+            for (i = 0; i < s->subframe_size; i++) {
+                s->cmplx[ch][i].re = s->fft_buffer[ch + 2][s->fft_offset + n * s->subframe_size + i];
+                s->cmplx[ch][i].im = s->fft_buffer[ch + 0][s->fft_offset + n * s->subframe_size + i];
+                s->cmplx[ch][s->subframe_size + i].re = 0;
+                s->cmplx[ch][s->subframe_size + i].im = 0;
+            }
+        }
+
+        for (ch = 0; ch < s->nb_channels; ch++) {
+            s->fft_ctx.fft_permute(&s->fft_ctx, s->cmplx[ch]);
+            s->fft_ctx.fft_calc(&s->fft_ctx, s->cmplx[ch]);
+        }
+
+        r = &s->buffer_ptr[s->nb_channels * n * s->subframe_size];
+        for (i = 0; i < 2 * s->subframe_size; i++) {
+            for (ch = 0; ch < s->nb_channels; ch++) {
+                *r++ += s->cmplx[ch][i].re;
+            }
+        }
+
+        r = &s->buffer_ptr[n * s->subframe_size * s->nb_channels];
+        for (i = 0; i < s->nb_channels * s->subframe_size; i++) {
+            out[i] = av_clipf(r[i], INT16_MIN, INT16_MAX);
+        }
+        out += s->subframe_size * s->nb_channels;
+
+        for (ch = 0; ch < s->nb_channels; ch++) {
+            memset(s->fft_buffer[ch+0] + s->fft_offset + n * s->subframe_size, 0, 4 * s->subframe_size);
+            memset(s->fft_buffer[ch+2] + s->fft_offset + n * s->subframe_size, 0, 4 * s->subframe_size);
+        }
+        memset(s->buffer + s->nb_channels * (n * s->subframe_size + s->frame_size + s->buffer_offset), 0, 4 * s->subframe_size * s->nb_channels);
+    }
+
+    s->buffer_offset += s->frame_size;
+    if (s->buffer_offset >= 32768 - s->frame_size) {
+        memcpy(s->buffer, &s->buffer[s->nb_channels * s->buffer_offset], 4 * s->frame_size * s->nb_channels);
+        s->buffer_offset = 0;
+    }
+
+    return 0;
+}
+
+static av_cold void qdmc_flush(AVCodecContext *avctx)
+{
+    QDMCContext *s = avctx->priv_data;
+
+    memset(s->buffer, 0, sizeof(s->buffer));
+    memset(s->fft_buffer, 0, sizeof(s->fft_buffer));
+    s->fft_offset = 0;
+    s->buffer_offset = 0;
+}
+
+static int qdmc_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    QDMCContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    GetBitContext gb;
+    int ret;
+
+    if (!avpkt->data)
+        return 0;
+    if (avpkt->size < s->checksum_size)
+        return AVERROR_INVALIDDATA;
+
+    s->avctx = avctx;
+    frame->nb_samples = s->frame_size;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    if ((ret = init_get_bits8(&gb, avpkt->data, s->checksum_size)) < 0)
+        return ret;
+
+    memset(s->nb_tones, 0, sizeof(s->nb_tones));
+    memset(s->cur_tone, 0, sizeof(s->cur_tone));
+
+    ret = decode_frame(s, &gb, (int16_t *)frame->data[0]);
+    if (ret >= 0) {
+        *got_frame_ptr = 1;
+        return s->checksum_size;
+    }
+    qdmc_flush(avctx);
+    return ret;
+}
+
+AVCodec ff_qdmc_decoder = {
+    .name             = "qdmc",
+    .long_name        = NULL_IF_CONFIG_SMALL("QDesign Music Codec 1"),
+    .type             = AVMEDIA_TYPE_AUDIO,
+    .id               = AV_CODEC_ID_QDMC,
+    .priv_data_size   = sizeof(QDMCContext),
+    .init             = qdmc_decode_init,
+    .close            = qdmc_decode_close,
+    .decode           = qdmc_decode_frame,
+    .flush            = qdmc_flush,
+    .capabilities     = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/qdrw.c b/libavcodec/qdrw.c
index b7493e4..32ba410 100644
--- a/libavcodec/qdrw.c
+++ b/libavcodec/qdrw.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2004 Konstantin Shishkov
  * Copyright (c) 2015 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,35 +33,182 @@
 #include "internal.h"
 
 enum QuickdrawOpcodes {
+    CLIP = 0x0001,
     PACKBITSRECT = 0x0098,
     PACKBITSRGN,
     DIRECTBITSRECT,
     DIRECTBITSRGN,
+    SHORTCOMMENT = 0x00A0,
+    LONGCOMMENT,
 
     EOP = 0x00FF,
 };
 
 static int parse_palette(AVCodecContext *avctx, GetByteContext *gbc,
-                         uint32_t *pal, int colors)
+                         uint32_t *pal, int colors, int pixmap)
 {
     int i;
 
     for (i = 0; i <= colors; i++) {
         uint8_t r, g, b;
         unsigned int idx = bytestream2_get_be16(gbc); /* color index */
-        if (idx > 255) {
+        if (idx > 255 && !pixmap) {
             av_log(avctx, AV_LOG_WARNING,
                    "Palette index out of range: %u\n", idx);
             bytestream2_skip(gbc, 6);
             continue;
         }
+        if (avctx->pix_fmt != AV_PIX_FMT_PAL8)
+            return AVERROR_INVALIDDATA;
         r = bytestream2_get_byte(gbc);
         bytestream2_skip(gbc, 1);
         g = bytestream2_get_byte(gbc);
         bytestream2_skip(gbc, 1);
         b = bytestream2_get_byte(gbc);
         bytestream2_skip(gbc, 1);
-        pal[idx] = (0xFFU << 24) | (r << 16) | (g << 8) | b;
+        pal[pixmap ? i : idx] = (0xFFU << 24) | (r << 16) | (g << 8) | b;
+    }
+    return 0;
+}
+
+static int decode_rle_bpp2(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc)
+{
+    int offset = avctx->width;
+    uint8_t *outdata = p->data[0];
+    int i, j;
+
+    for (i = 0; i < avctx->height; i++) {
+        int size, left, code, pix;
+        uint8_t *out = outdata;
+        int pos = 0;
+
+        /* size of packed line */
+        if (offset / 4 > 200)
+            size = left = bytestream2_get_be16(gbc);
+        else
+            size = left = bytestream2_get_byte(gbc);
+        if (bytestream2_get_bytes_left(gbc) < size)
+            return AVERROR_INVALIDDATA;
+
+        /* decode line */
+        while (left > 0) {
+            code = bytestream2_get_byte(gbc);
+            if (code & 0x80 ) { /* run */
+                pix = bytestream2_get_byte(gbc);
+                for (j = 0; j < 257 - code; j++) {
+                    if (pos < offset)
+                        out[pos++] = (pix & 0xC0) >> 6;
+                    if (pos < offset)
+                        out[pos++] = (pix & 0x30) >> 4;
+                    if (pos < offset)
+                        out[pos++] = (pix & 0x0C) >> 2;
+                    if (pos < offset)
+                        out[pos++] = (pix & 0x03);
+                }
+                left  -= 2;
+            } else { /* copy */
+                for (j = 0; j < code + 1; j++) {
+                    pix = bytestream2_get_byte(gbc);
+                    if (pos < offset)
+                        out[pos++] = (pix & 0xC0) >> 6;
+                    if (pos < offset)
+                        out[pos++] = (pix & 0x30) >> 4;
+                    if (pos < offset)
+                        out[pos++] = (pix & 0x0C) >> 2;
+                    if (pos < offset)
+                        out[pos++] = (pix & 0x03);
+                }
+                left  -= 1 + (code + 1);
+            }
+        }
+        outdata += p->linesize[0];
+    }
+    return 0;
+}
+
+static int decode_rle_bpp4(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc)
+{
+    int offset = avctx->width;
+    uint8_t *outdata = p->data[0];
+    int i, j;
+
+    for (i = 0; i < avctx->height; i++) {
+        int size, left, code, pix;
+        uint8_t *out = outdata;
+        int pos = 0;
+
+        /* size of packed line */
+        size = left = bytestream2_get_be16(gbc);
+        if (bytestream2_get_bytes_left(gbc) < size)
+            return AVERROR_INVALIDDATA;
+
+        /* decode line */
+        while (left > 0) {
+            code = bytestream2_get_byte(gbc);
+            if (code & 0x80 ) { /* run */
+                pix = bytestream2_get_byte(gbc);
+                for (j = 0; j < 257 - code; j++) {
+                    if (pos < offset)
+                        out[pos++] = (pix & 0xF0) >> 4;
+                    if (pos < offset)
+                        out[pos++] = pix & 0xF;
+                }
+                left  -= 2;
+            } else { /* copy */
+                for (j = 0; j < code + 1; j++) {
+                    pix = bytestream2_get_byte(gbc);
+                    if (pos < offset)
+                        out[pos++] = (pix & 0xF0) >> 4;
+                    if (pos < offset)
+                        out[pos++] = pix & 0xF;
+                }
+                left  -= 1 + (code + 1);
+            }
+        }
+        outdata += p->linesize[0];
+    }
+    return 0;
+}
+
+static int decode_rle16(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc)
+{
+    int offset = avctx->width;
+    uint8_t *outdata = p->data[0];
+    int i, j;
+
+    for (i = 0; i < avctx->height; i++) {
+        int size, left, code, pix;
+        uint16_t *out = (uint16_t *)outdata;
+        int pos = 0;
+
+        /* size of packed line */
+        size = left = bytestream2_get_be16(gbc);
+        if (bytestream2_get_bytes_left(gbc) < size)
+            return AVERROR_INVALIDDATA;
+
+        /* decode line */
+        while (left > 0) {
+            code = bytestream2_get_byte(gbc);
+            if (code & 0x80 ) { /* run */
+                pix = bytestream2_get_be16(gbc);
+                for (j = 0; j < 257 - code; j++) {
+                    if (pos < offset) {
+                        out[pos++] = pix;
+                    }
+                }
+                left  -= 3;
+            } else { /* copy */
+                for (j = 0; j < code + 1; j++) {
+                    if (pos < offset) {
+                        out[pos++] = bytestream2_get_be16(gbc);
+                    } else {
+                        bytestream2_skip(gbc, 2);
+                    }
+                }
+                left  -= 1 + (code + 1) * 2;
+            }
+        }
+        outdata += p->linesize[0];
     }
     return 0;
 }
@@ -89,9 +236,10 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
             if (code & 0x80 ) { /* run */
                 pix = bytestream2_get_byte(gbc);
                 for (j = 0; j < 257 - code; j++) {
-                    out[pos] = pix;
+                    if (pos < offset)
+                        out[pos] = pix;
                     pos += step;
-                    if (pos >= offset) {
+                    if (pos >= offset && step > 1) {
                         pos -= offset;
                         pos++;
                     }
@@ -99,9 +247,11 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
                 left  -= 2;
             } else { /* copy */
                 for (j = 0; j < code + 1; j++) {
-                    out[pos] = bytestream2_get_byte(gbc);
+                    pix = bytestream2_get_byte(gbc);
+                    if (pos < offset)
+                        out[pos] = pix;
                     pos += step;
-                    if (pos >= offset) {
+                    if (pos >= offset && step > 1) {
                         pos -= offset;
                         pos++;
                     }
@@ -114,6 +264,29 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
     return 0;
 }
 
+static int check_header(const char *buf, int buf_size)
+{
+    unsigned w, h, v0, v1;
+
+    if (buf_size < 40)
+        return 0;
+
+    w = AV_RB16(buf+6);
+    h = AV_RB16(buf+8);
+    v0 = AV_RB16(buf+10);
+    v1 = AV_RB16(buf+12);
+
+    if (!w || !h)
+        return 0;
+
+    if (v0 == 0x1101)
+        return 1;
+    if (v0 == 0x0011 && v1 == 0x02FF)
+        return 2;
+    return 0;
+}
+
+
 static int decode_frame(AVCodecContext *avctx,
                         void *data, int *got_frame,
                         AVPacket *avpkt)
@@ -122,13 +295,16 @@ static int decode_frame(AVCodecContext *avctx,
     GetByteContext gbc;
     int colors;
     int w, h, ret;
+    int ver;
 
     bytestream2_init(&gbc, avpkt->data, avpkt->size);
-
-    /* PICT images start with a 512 bytes empty header */
-    if (bytestream2_peek_be32(&gbc) == 0)
+    if (   bytestream2_get_bytes_left(&gbc) >= 552
+           &&  check_header(gbc.buffer + 512, bytestream2_get_bytes_left(&gbc) - 512)
+       )
         bytestream2_skip(&gbc, 512);
 
+    ver = check_header(gbc.buffer, bytestream2_get_bytes_left(&gbc));
+
     /* smallest PICT header */
     if (bytestream2_get_bytes_left(&gbc) < 40) {
         av_log(avctx, AV_LOG_ERROR, "Frame is too small %d\n",
@@ -146,30 +322,42 @@ static int decode_frame(AVCodecContext *avctx,
 
     /* version 1 is identified by 0x1101
      * it uses byte-aligned opcodes rather than word-aligned */
-    if (bytestream2_get_be32(&gbc) != 0x001102FF) {
+    if (ver == 1) {
         avpriv_request_sample(avctx, "QuickDraw version 1");
         return AVERROR_PATCHWELCOME;
+    } else if (ver != 2) {
+        avpriv_request_sample(avctx, "QuickDraw version unknown (%X)", bytestream2_get_be32(&gbc));
+        return AVERROR_PATCHWELCOME;
     }
 
-    bytestream2_skip(&gbc, 26);
+    bytestream2_skip(&gbc, 4+26);
 
     while (bytestream2_get_bytes_left(&gbc) >= 4) {
         int bppcnt, bpp;
         int rowbytes, pack_type;
+        int flags;
         int opcode = bytestream2_get_be16(&gbc);
 
         switch(opcode) {
+        case CLIP:
+            bytestream2_skip(&gbc, 10);
+            break;
         case PACKBITSRECT:
         case PACKBITSRGN:
             av_log(avctx, AV_LOG_DEBUG, "Parsing Packbit opcode\n");
 
-            bytestream2_skip(&gbc, 30);
+            flags = bytestream2_get_be16(&gbc) & 0xC000;
+            bytestream2_skip(&gbc, 28);
             bppcnt = bytestream2_get_be16(&gbc); /* cmpCount */
             bpp    = bytestream2_get_be16(&gbc); /* cmpSize */
 
             av_log(avctx, AV_LOG_DEBUG, "bppcount %d bpp %d\n", bppcnt, bpp);
             if (bppcnt == 1 && bpp == 8) {
                 avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            } else if (bppcnt == 1 && (bpp == 4 || bpp == 2)) {
+                avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            } else if (bppcnt == 3 && bpp == 5) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB555;
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "Invalid pixel format (bppcnt %d bpp %d) in Packbit\n",
@@ -191,12 +379,12 @@ static int decode_frame(AVCodecContext *avctx,
                        bytestream2_get_bytes_left(&gbc));
                 return AVERROR_INVALIDDATA;
             }
-            if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
                 return ret;
-            }
 
-            parse_palette(avctx, &gbc, (uint32_t *)p->data[1], colors);
+            ret = parse_palette(avctx, &gbc, (uint32_t *)p->data[1], colors, flags & 0x8000);
+            if (ret < 0)
+                return ret;
             p->palette_has_changed = 1;
 
             /* jump to image data */
@@ -207,7 +395,14 @@ static int decode_frame(AVCodecContext *avctx,
                 avpriv_report_missing_feature(avctx, "Packbit mask region");
             }
 
-            ret = decode_rle(avctx, p, &gbc, bppcnt);
+            if (avctx->pix_fmt == AV_PIX_FMT_RGB555)
+                ret = decode_rle16(avctx, p, &gbc);
+            else if (bpp == 2)
+                ret = decode_rle_bpp2(avctx, p, &gbc);
+            else if (bpp == 4)
+                ret = decode_rle_bpp4(avctx, p, &gbc);
+            else
+                ret = decode_rle(avctx, p, &gbc, bppcnt);
             if (ret < 0)
                 return ret;
             *got_frame = 1;
@@ -223,7 +418,15 @@ static int decode_frame(AVCodecContext *avctx,
                 return AVERROR_PATCHWELCOME;
             }
 
-            bytestream2_skip(&gbc, 10);
+            bytestream2_skip(&gbc, 4);
+            h = bytestream2_get_be16(&gbc);
+            w = bytestream2_get_be16(&gbc);
+            bytestream2_skip(&gbc, 2);
+
+            ret = ff_set_dimensions(avctx, w, h);
+            if (ret < 0)
+                return ret;
+
             pack_type = bytestream2_get_be16(&gbc);
 
             bytestream2_skip(&gbc, 16);
@@ -233,6 +436,8 @@ static int decode_frame(AVCodecContext *avctx,
             av_log(avctx, AV_LOG_DEBUG, "bppcount %d bpp %d\n", bppcnt, bpp);
             if (bppcnt == 3 && bpp == 8) {
                 avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            } else if (bppcnt == 3 && bpp == 5 || bppcnt == 2 && bpp == 8) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB555;
             } else if (bppcnt == 4 && bpp == 8) {
                 avctx->pix_fmt = AV_PIX_FMT_ARGB;
             } else {
@@ -250,10 +455,8 @@ static int decode_frame(AVCodecContext *avctx,
                 avpriv_request_sample(avctx, "Pack type %d", pack_type);
                 return AVERROR_PATCHWELCOME;
             }
-            if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
                 return ret;
-            }
 
             /* jump to data */
             bytestream2_skip(&gbc, 30);
@@ -263,11 +466,18 @@ static int decode_frame(AVCodecContext *avctx,
                 avpriv_report_missing_feature(avctx, "DirectBit mask region");
             }
 
-            ret = decode_rle(avctx, p, &gbc, bppcnt);
+            if (avctx->pix_fmt == AV_PIX_FMT_RGB555)
+                ret = decode_rle16(avctx, p, &gbc);
+            else
+                ret = decode_rle(avctx, p, &gbc, bppcnt);
             if (ret < 0)
                 return ret;
             *got_frame = 1;
             break;
+        case LONGCOMMENT:
+            bytestream2_get_be16(&gbc);
+            bytestream2_skip(&gbc, bytestream2_get_be16(&gbc));
+            break;
         default:
             av_log(avctx, AV_LOG_TRACE, "Unknown 0x%04X opcode\n", opcode);
             break;
diff --git a/libavcodec/qpeg.c b/libavcodec/qpeg.c
index 3a2e56c..d4195c5 100644
--- a/libavcodec/qpeg.c
+++ b/libavcodec/qpeg.c
@@ -2,20 +2,20 @@
  * QPEG codec
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,7 @@
 
 typedef struct QpegContext{
     AVCodecContext *avctx;
-    AVFrame *pic;
-    uint8_t *refdata;
+    AVFrame *pic, *ref;
     uint32_t pal[256];
     GetByteContext buffer;
 } QpegContext;
@@ -81,16 +80,27 @@ static void qpeg_decode_intra(QpegContext *qctx, uint8_t *dst,
 
             p = bytestream2_get_byte(&qctx->buffer);
             for(i = 0; i < run; i++) {
-                dst[filled++] = p;
+                int step = FFMIN(run - i, width - filled);
+                memset(dst+filled, p, step);
+                filled += step;
+                i      += step - 1;
                 if (filled >= width) {
                     filled = 0;
                     dst -= stride;
                     rows_to_go--;
+                    while (run - i > width && rows_to_go > 0) {
+                        memset(dst, p, width);
+                        dst -= stride;
+                        rows_to_go--;
+                        i += width;
+                    }
                     if(rows_to_go <= 0)
                         break;
                 }
             }
         } else {
+            if (bytestream2_get_bytes_left(&qctx->buffer) < copy)
+                copy = bytestream2_get_bytes_left(&qctx->buffer);
             for(i = 0; i < copy; i++) {
                 dst[filled++] = bytestream2_get_byte(&qctx->buffer);
                 if (filled >= width) {
@@ -111,7 +121,7 @@ static const int qpeg_table_w[16] =
  { 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04};
 
 /* Decodes delta frames */
-static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
+static void av_noinline qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
                               int stride, int width, int height,
                               int delta, const uint8_t *ctable,
                               uint8_t *refdata)
@@ -121,9 +131,13 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
     int filled = 0;
     int orig_height;
 
-    /* copy prev frame */
-    for(i = 0; i < height; i++)
-        memcpy(refdata + (i * width), dst + (i * stride), width);
+    if (refdata) {
+        /* copy prev frame */
+        for (i = 0; i < height; i++)
+            memcpy(dst + (i * stride), refdata + (i * stride), width);
+    } else {
+        refdata = dst;
+    }
 
     orig_height = height;
     height--;
@@ -134,7 +148,7 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
 
         if(delta) {
             /* motion compensation */
-            while((code & 0xF0) == 0xF0) {
+            while(bytestream2_get_bytes_left(&qctx->buffer) > 0 && (code & 0xF0) == 0xF0) {
                 if(delta == 1) {
                     int me_idx;
                     int me_w, me_h, me_x, me_y;
@@ -167,10 +181,10 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
                                me_x, me_y, me_w, me_h, filled, height);
                     else {
                         /* do motion compensation */
-                        me_plane = refdata + (filled + me_x) + (height - me_y) * width;
+                        me_plane = refdata + (filled + me_x) + (height - me_y) * stride;
                         for(j = 0; j < me_h; j++) {
                             for(i = 0; i < me_w; i++)
-                                dst[filled + i - (j * stride)] = me_plane[i - (j * width)];
+                                dst[filled + i - (j * stride)] = me_plane[i - (j * stride)];
                         }
                     }
                 }
@@ -198,6 +212,9 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
         } else if(code >= 0xC0) { /* copy code: 0xC0..0xDF */
             code &= 0x1F;
 
+            if(code + 1 > bytestream2_get_bytes_left(&qctx->buffer))
+                break;
+
             for(i = 0; i <= code; i++) {
                 dst[filled++] = bytestream2_get_byte(&qctx->buffer);
                 if(filled >= width) {
@@ -251,9 +268,11 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t ctable[128];
     QpegContext * const a = avctx->priv_data;
     AVFrame * const p = a->pic;
+    AVFrame * const ref = a->ref;
     uint8_t* outdata;
     int delta, ret;
-    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+    int pal_size;
+    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &pal_size);
 
     if (avpkt->size < 0x86) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
@@ -261,10 +280,12 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     bytestream2_init(&a->buffer, avpkt->data, avpkt->size);
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+
+    av_frame_unref(ref);
+    av_frame_move_ref(ref, p);
+
+    if ((ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
     outdata = p->data[0];
     bytestream2_skip(&a->buffer, 4);
     bytestream2_get_buffer(&a->buffer, ctable, 128);
@@ -274,13 +295,15 @@ static int decode_frame(AVCodecContext *avctx,
     if(delta == 0x10) {
         qpeg_decode_intra(a, outdata, p->linesize[0], avctx->width, avctx->height);
     } else {
-        qpeg_decode_inter(a, outdata, p->linesize[0], avctx->width, avctx->height, delta, ctable, a->refdata);
+        qpeg_decode_inter(a, outdata, p->linesize[0], avctx->width, avctx->height, delta, ctable, ref->data[0]);
     }
 
     /* make the palette available on the way out */
-    if (pal) {
+    if (pal && pal_size == AVPALETTE_SIZE) {
         p->palette_has_changed = 1;
         memcpy(a->pal, pal, AVPALETTE_SIZE);
+    } else if (pal) {
+        av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", pal_size);
     }
     memcpy(p->data[1], a->pal, AVPALETTE_SIZE);
 
@@ -292,13 +315,25 @@ static int decode_frame(AVCodecContext *avctx,
     return avpkt->size;
 }
 
+static void decode_flush(AVCodecContext *avctx){
+    QpegContext * const a = avctx->priv_data;
+    int i, pal_size;
+    const uint8_t *pal_src;
+
+    pal_size = FFMIN(1024U, avctx->extradata_size);
+    pal_src = avctx->extradata + avctx->extradata_size - pal_size;
+
+    for (i=0; i<pal_size/4; i++)
+        a->pal[i] = 0xFFU<<24 | AV_RL32(pal_src+4*i);
+}
+
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     QpegContext * const a = avctx->priv_data;
 
     av_frame_free(&a->pic);
+    av_frame_free(&a->ref);
 
-    av_free(a->refdata);
     return 0;
 }
 
@@ -307,10 +342,12 @@ static av_cold int decode_init(AVCodecContext *avctx){
 
     a->avctx = avctx;
     avctx->pix_fmt= AV_PIX_FMT_PAL8;
-    a->refdata = av_malloc(avctx->width * avctx->height);
+
+    decode_flush(avctx);
 
     a->pic = av_frame_alloc();
-    if (!a->pic) {
+    a->ref = av_frame_alloc();
+    if (!a->pic || !a->ref) {
         decode_end(avctx);
         return AVERROR(ENOMEM);
     }
@@ -327,5 +364,6 @@ AVCodec ff_qpeg_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/qpel_template.c b/libavcodec/qpel_template.c
index 2106160..e52a78c 100644
--- a/libavcodec/qpel_template.c
+++ b/libavcodec/qpel_template.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP function templates
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qpeldsp.c b/libavcodec/qpeldsp.c
index 1d0422a..6e52b33 100644
--- a/libavcodec/qpeldsp.c
+++ b/libavcodec/qpeldsp.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +32,7 @@
 #include "libavutil/attributes.h"
 #include "copy_block.h"
 #include "qpeldsp.h"
+#include "diracdsp.h"
 
 #define BIT_DEPTH 8
 #include "hpel_template.c"
@@ -732,6 +735,51 @@ void ff_put_pixels8_l2_8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 
 }
 
+#if CONFIG_DIRAC_DECODER
+#define DIRAC_MC(OPNAME)\
+void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+     OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
+    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
+    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
+    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
+}
+DIRAC_MC(put)
+DIRAC_MC(avg)
+#endif
+
 av_cold void ff_qpeldsp_init(QpelDSPContext *c)
 {
 #define dspfunc(PFX, IDX, NUM)                              \
@@ -763,4 +811,6 @@ av_cold void ff_qpeldsp_init(QpelDSPContext *c)
 
     if (ARCH_X86)
         ff_qpeldsp_init_x86(c);
+    if (ARCH_MIPS)
+        ff_qpeldsp_init_mips(c);
 }
diff --git a/libavcodec/qpeldsp.h b/libavcodec/qpeldsp.h
index 4ad141d..91019ed 100644
--- a/libavcodec/qpeldsp.h
+++ b/libavcodec/qpeldsp.h
@@ -1,20 +1,20 @@
 /*
  * quarterpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -78,5 +78,6 @@ typedef struct QpelDSPContext {
 void ff_qpeldsp_init(QpelDSPContext *c);
 
 void ff_qpeldsp_init_x86(QpelDSPContext *c);
+void ff_qpeldsp_init_mips(QpelDSPContext *c);
 
 #endif /* AVCODEC_QPELDSP_H */
diff --git a/libavcodec/qsv.c b/libavcodec/qsv.c
index a9b3c59..bb0d795 100644
--- a/libavcodec/qsv.c
+++ b/libavcodec/qsv.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV encoder/decoder shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_qsv.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 
 #include "avcodec.h"
 #include "qsv_internal.h"
@@ -214,6 +215,11 @@ enum AVPictureType ff_qsv_map_pictype(int mfx_pic_type)
         else
             type = AV_PICTURE_TYPE_P;
         break;
+    case MFX_FRAMETYPE_UNKNOWN:
+        type = AV_PICTURE_TYPE_NONE;
+        break;
+    default:
+        av_assert0(0);
     }
 
     return type;
diff --git a/libavcodec/qsv.h b/libavcodec/qsv.h
index 1d1f8b4..b77158e 100644
--- a/libavcodec/qsv.h
+++ b/libavcodec/qsv.h
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV public API
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsv_api.c b/libavcodec/qsv_api.c
index 234b596..327ff7d 100644
--- a/libavcodec/qsv_api.c
+++ b/libavcodec/qsv_api.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV public API functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsv_internal.h b/libavcodec/qsv_internal.h
index 3cd8f18..394c558 100644
--- a/libavcodec/qsv_internal.h
+++ b/libavcodec/qsv_internal.h
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV encoder/decoder shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,8 @@
 
 #define ASYNC_DEPTH_DEFAULT 4       // internal parallelism
 
+#define QSV_MAX_ENC_PAYLOAD 2       // # of mfxEncodeCtrl payloads supported
+
 #define QSV_VERSION_ATLEAST(MAJOR, MINOR)   \
     (MFX_VERSION_MAJOR > (MAJOR) ||         \
      MFX_VERSION_MAJOR == (MAJOR) && MFX_VERSION_MINOR >= (MINOR))
@@ -52,6 +54,7 @@ typedef struct QSVMid {
 typedef struct QSVFrame {
     AVFrame *frame;
     mfxFrameSurface1 surface;
+    mfxEncodeCtrl enc_ctrl;
     mfxExtDecodedFrameInfo dec_info;
     mfxExtBuffer *ext_param;
 
@@ -75,7 +78,7 @@ typedef struct QSVFramesContext {
 } QSVFramesContext;
 
 /**
- * Convert a libmfx error code into a libav error code.
+ * Convert a libmfx error code into an ffmpeg error code.
  */
 int ff_qsv_map_error(mfxStatus mfx_err, const char **desc);
 
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index 48ac6eb..4a0be81 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Luca Barbato
  * copyright (c) 2015 Anton Khirnov <anton@khirnov.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -352,7 +352,7 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
         ret = MFXVideoDECODE_DecodeFrameAsync(q->session, avpkt->size ? &bs : NULL,
                                               insurf, &outsurf, sync);
         if (ret == MFX_WRN_DEVICE_BUSY)
-            av_usleep(1);
+            av_usleep(500);
 
     } while (ret == MFX_WRN_DEVICE_BUSY || ret == MFX_ERR_MORE_SURFACE);
 
@@ -372,6 +372,8 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
         ++q->zero_consume_run;
         if (q->zero_consume_run > 1)
             ff_qsv_print_warning(avctx, ret, "A decode call did not consume any data");
+    } else if (!*sync && bs.DataOffset) {
+        ++q->buffered_count;
     } else {
         q->zero_consume_run = 0;
     }
@@ -499,15 +501,7 @@ int ff_qsv_process_data(AVCodecContext *avctx, QSVContext *q,
         if (!q->avctx_internal)
             return AVERROR(ENOMEM);
 
-        if (avctx->extradata) {
-            q->avctx_internal->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!q->avctx_internal->extradata)
-                return AVERROR(ENOMEM);
-
-            memcpy(q->avctx_internal->extradata, avctx->extradata,
-                   avctx->extradata_size);
-            q->avctx_internal->extradata_size = avctx->extradata_size;
-        }
+        q->avctx_internal->codec_id = avctx->codec_id;
 
         q->parser = av_parser_init(avctx->codec_id);
         if (!q->parser)
@@ -536,6 +530,16 @@ int ff_qsv_process_data(AVCodecContext *avctx, QSVContext *q,
                                            AV_PIX_FMT_NONE,
                                            AV_PIX_FMT_NONE };
         enum AVPixelFormat qsv_format;
+        AVPacket zero_pkt = {0};
+
+        if (q->buffered_count) {
+            q->reinit_flag = 1;
+            /* decode zero-size pkt to flush the buffered pkt before reinit */
+            q->buffered_count--;
+            return qsv_decode(avctx, q, frame, got_frame, &zero_pkt);
+        }
+
+        q->reinit_flag = 0;
 
         qsv_format = ff_qsv_map_pixfmt(q->parser->format, &q->fourcc);
         if (qsv_format < 0) {
diff --git a/libavcodec/qsvdec.h b/libavcodec/qsvdec.h
index e25c4d6..111536c 100644
--- a/libavcodec/qsvdec.h
+++ b/libavcodec/qsvdec.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,6 +53,8 @@ typedef struct QSVContext {
 
     AVFifoBuffer *async_fifo;
     int zero_consume_run;
+    int buffered_count;
+    int reinit_flag;
 
     // the internal parser and codec context for parsing the data
     AVCodecParserContext *parser;
diff --git a/libavcodec/qsvdec_h2645.c b/libavcodec/qsvdec_h2645.c
index 0150372..9b49f55 100644
--- a/libavcodec/qsvdec_h2645.c
+++ b/libavcodec/qsvdec_h2645.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Luca Barbato
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -146,10 +146,11 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
             /* no more data */
             if (av_fifo_size(s->packet_fifo) < sizeof(AVPacket))
                 return avpkt->size ? avpkt->size : ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, avpkt);
-
-            av_packet_unref(&s->buffer_pkt);
-
-            av_fifo_generic_read(s->packet_fifo, &s->buffer_pkt, sizeof(s->buffer_pkt), NULL);
+            /* in progress of reinit, no read from fifo and keep the buffer_pkt */
+            if (!s->qsv.reinit_flag) {
+                av_packet_unref(&s->buffer_pkt);
+                av_fifo_generic_read(s->packet_fifo, &s->buffer_pkt, sizeof(s->buffer_pkt), NULL);
+            }
         }
 
         ret = ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, &s->buffer_pkt);
@@ -159,6 +160,8 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
             av_packet_unref(&s->buffer_pkt);
             return ret;
         }
+        if (s->qsv.reinit_flag)
+            continue;
 
         s->buffer_pkt.size -= ret;
         s->buffer_pkt.data += ret;
@@ -175,12 +178,6 @@ static void qsv_decode_flush(AVCodecContext *avctx)
     ff_qsv_decode_flush(avctx, &s->qsv);
 }
 
-#if defined(_WIN32)
-#define LOAD_PLUGIN_DEFAULT LOAD_PLUGIN_HEVC_SW
-#else
-#define LOAD_PLUGIN_DEFAULT LOAD_PLUGIN_HEVC_HW
-#endif
-
 #define OFFSET(x) offsetof(QSVH2645Context, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
@@ -188,7 +185,7 @@ static void qsv_decode_flush(AVCodecContext *avctx)
 static const AVOption hevc_options[] = {
     { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VD },
 
-    { "load_plugin", "A user plugin to load in an internal session", OFFSET(load_plugin), AV_OPT_TYPE_INT, { .i64 = LOAD_PLUGIN_DEFAULT }, LOAD_PLUGIN_NONE, LOAD_PLUGIN_HEVC_HW, VD, "load_plugin" },
+    { "load_plugin", "A user plugin to load in an internal session", OFFSET(load_plugin), AV_OPT_TYPE_INT, { .i64 = LOAD_PLUGIN_HEVC_HW }, LOAD_PLUGIN_NONE, LOAD_PLUGIN_HEVC_HW, VD, "load_plugin" },
     { "none",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LOAD_PLUGIN_NONE },    0, 0, VD, "load_plugin" },
     { "hevc_sw",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LOAD_PLUGIN_HEVC_SW }, 0, 0, VD, "load_plugin" },
     { "hevc_hw",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LOAD_PLUGIN_HEVC_HW }, 0, 0, VD, "load_plugin" },
@@ -215,7 +212,7 @@ AVCodec ff_hevc_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HYBRID,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HYBRID,
     .priv_class     = &hevc_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_P010,
@@ -250,7 +247,7 @@ AVCodec ff_h264_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HYBRID,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HYBRID,
     .priv_class     = &class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_P010,
diff --git a/libavcodec/qsvdec_other.c b/libavcodec/qsvdec_other.c
index 47934e9..03251d2 100644
--- a/libavcodec/qsvdec_other.c
+++ b/libavcodec/qsvdec_other.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -132,9 +132,11 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
             /* no more data */
             if (av_fifo_size(s->packet_fifo) < sizeof(AVPacket))
                 return avpkt->size ? avpkt->size : ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, avpkt);
-
-            av_packet_unref(&s->input_ref);
-            av_fifo_generic_read(s->packet_fifo, &s->input_ref, sizeof(s->input_ref), NULL);
+            /* in progress of reinit, no read from fifo and keep the buffer_pkt */
+            if (!s->qsv.reinit_flag) {
+                av_packet_unref(&s->input_ref);
+                av_fifo_generic_read(s->packet_fifo, &s->input_ref, sizeof(s->input_ref), NULL);
+            }
         }
 
         ret = ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, &s->input_ref);
@@ -145,6 +147,8 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
 
             return ret;
         }
+        if (s->qsv.reinit_flag)
+            continue;
 
         s->input_ref.size -= ret;
         s->input_ref.data += ret;
@@ -186,7 +190,7 @@ AVCodec ff_mpeg2_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HYBRID,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HYBRID,
     .priv_class     = &mpeg2_qsv_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
@@ -214,7 +218,7 @@ AVCodec ff_vc1_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HYBRID,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HYBRID,
     .priv_class     = &vc1_qsv_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
@@ -242,7 +246,7 @@ AVCodec ff_vp8_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HYBRID,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HYBRID,
     .priv_class     = &vp8_qsv_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index 16d6e79..5aa020d 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Yukinori Yamazoe
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "libavutil/log.h"
 #include "libavutil/time.h"
 #include "libavutil/imgutils.h"
+#include "libavcodec/bytestream.h"
 
 #include "avcodec.h"
 #include "internal.h"
@@ -135,7 +136,7 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
 #if QSV_HAVE_CO2
     mfxExtCodingOption2 *co2 = (mfxExtCodingOption2*)coding_opts[1];
 #endif
-#if QSV_HAVE_CO3 && QSV_HAVE_QVBR
+#if QSV_HAVE_CO3
     mfxExtCodingOption3 *co3 = (mfxExtCodingOption3*)coding_opts[2];
 #endif
 
@@ -160,8 +161,8 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
 #endif
         ) {
         av_log(avctx, AV_LOG_VERBOSE,
-               "BufferSizeInKB: %"PRIu16"; InitialDelayInKB: %"PRIu16"; TargetKbps: %"PRIu16"; MaxKbps: %"PRIu16"\n",
-               info->BufferSizeInKB, info->InitialDelayInKB, info->TargetKbps, info->MaxKbps);
+               "BufferSizeInKB: %"PRIu16"; InitialDelayInKB: %"PRIu16"; TargetKbps: %"PRIu16"; MaxKbps: %"PRIu16"; BRCParamMultiplier: %"PRIu16"\n",
+               info->BufferSizeInKB, info->InitialDelayInKB, info->TargetKbps, info->MaxKbps, info->BRCParamMultiplier);
     } else if (info->RateControlMethod == MFX_RATECONTROL_CQP) {
         av_log(avctx, AV_LOG_VERBOSE, "QPI: %"PRIu16"; QPP: %"PRIu16"; QPB: %"PRIu16"\n",
                info->QPI, info->QPP, info->QPB);
@@ -169,8 +170,8 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
 #if QSV_HAVE_AVBR
     else if (info->RateControlMethod == MFX_RATECONTROL_AVBR) {
         av_log(avctx, AV_LOG_VERBOSE,
-               "TargetKbps: %"PRIu16"; Accuracy: %"PRIu16"; Convergence: %"PRIu16"\n",
-               info->TargetKbps, info->Accuracy, info->Convergence);
+               "TargetKbps: %"PRIu16"; Accuracy: %"PRIu16"; Convergence: %"PRIu16"; BRCParamMultiplier: %"PRIu16"\n",
+               info->TargetKbps, info->Accuracy, info->Convergence, info->BRCParamMultiplier);
     }
 #endif
 #if QSV_HAVE_LA
@@ -180,8 +181,8 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
 #endif
              ) {
         av_log(avctx, AV_LOG_VERBOSE,
-               "TargetKbps: %"PRIu16"; LookAheadDepth: %"PRIu16"\n",
-               info->TargetKbps, co2->LookAheadDepth);
+               "TargetKbps: %"PRIu16"; LookAheadDepth: %"PRIu16"; BRCParamMultiplier: %"PRIu16"\n",
+               info->TargetKbps, co2->LookAheadDepth, info->BRCParamMultiplier);
     }
 #endif
 #if QSV_HAVE_ICQ
@@ -198,7 +199,6 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
                co3->QVBRQuality);
     }
 #endif
-
     av_log(avctx, AV_LOG_VERBOSE, "NumSlice: %"PRIu16"; NumRefFrame: %"PRIu16"\n",
            info->NumSlice, info->NumRefFrame);
     av_log(avctx, AV_LOG_VERBOSE, "RateDistortionOpt: %s\n",
@@ -234,6 +234,10 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
     av_log(avctx, AV_LOG_VERBOSE, "\n");
 #endif
 
+#if QSV_HAVE_VDENC
+    av_log(avctx, AV_LOG_VERBOSE, "VDENC: %s\n", print_threestate(info->LowPower));
+#endif
+
 #if QSV_VERSION_ATLEAST(1, 8)
     av_log(avctx, AV_LOG_VERBOSE,
            "RepeatPPS: %s; NumMbPerSlice: %"PRIu16"; LookAheadDS: ",
@@ -271,6 +275,10 @@ static void dump_video_param(AVCodecContext *avctx, QSVEncContext *q,
                print_threestate(co->NalHrdConformance), print_threestate(co->SingleSeiNalUnit),
                print_threestate(co->VuiVclHrdParameters), print_threestate(co->VuiNalHrdParameters));
     }
+
+    av_log(avctx, AV_LOG_VERBOSE, "FrameRateExtD: %"PRIu32"; FrameRateExtN: %"PRIu32" \n",
+           info->FrameInfo.FrameRateExtD, info->FrameInfo.FrameRateExtN);
+
 }
 
 static int select_rc_mode(AVCodecContext *avctx, QSVEncContext *q)
@@ -278,7 +286,7 @@ static int select_rc_mode(AVCodecContext *avctx, QSVEncContext *q)
     const char *rc_desc;
     mfxU16      rc_mode;
 
-    int want_la     = q->la_depth >= 10;
+    int want_la     = q->look_ahead;
     int want_qscale = !!(avctx->flags & AV_CODEC_FLAG_QSCALE);
     int want_vcm    = q->vcm;
 
@@ -330,7 +338,7 @@ static int select_rc_mode(AVCodecContext *avctx, QSVEncContext *q)
     }
 #endif
 #if QSV_HAVE_ICQ
-    else if (avctx->global_quality > 0) {
+    else if (avctx->global_quality > 0 && !avctx->rc_max_rate) {
         rc_mode = MFX_RATECONTROL_ICQ;
         rc_desc = "intelligent constant quality (ICQ)";
     }
@@ -345,6 +353,12 @@ static int select_rc_mode(AVCodecContext *avctx, QSVEncContext *q)
         rc_desc = "average variable bitrate (AVBR)";
     }
 #endif
+#if QSV_HAVE_QVBR
+    else if (avctx->global_quality > 0) {
+        rc_mode = MFX_RATECONTROL_QVBR;
+        rc_desc = "constant quality with VBR algorithm (QVBR)";
+    }
+#endif
     else {
         rc_mode = MFX_RATECONTROL_VBR;
         rc_desc = "variable bitrate (VBR)";
@@ -451,7 +465,16 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
                                    avctx->sw_pix_fmt : avctx->pix_fmt;
     const AVPixFmtDescriptor *desc;
     float quant;
+    int target_bitrate_kbps, max_bitrate_kbps, brc_param_multiplier;
+    int buffer_size_in_kilobytes, initial_delay_in_kilobytes;
     int ret;
+    mfxVersion ver;
+
+    ret = MFXQueryVersion(q->session,&ver);
+    if (ret != MFX_ERR_NONE) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting the session handle\n");
+        return AVERROR_UNKNOWN;
+    }
 
     ret = ff_qsv_codec_id_to_mfx(avctx->codec_id);
     if (ret < 0)
@@ -473,7 +496,7 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
     }
 
 #if QSV_HAVE_VDENC
-    q->param.mfx.LowPower           = q->low_power ? MFX_CODINGOPTION_ON:MFX_CODINGOPTION_OFF;
+    q->param.mfx.LowPower           = q->low_power ? MFX_CODINGOPTION_ON : MFX_CODINGOPTION_OFF;
 #endif
     q->param.mfx.CodecProfile       = q->profile;
     q->param.mfx.TargetUsage        = avctx->compression_level;
@@ -504,10 +527,10 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
     q->param.mfx.FrameInfo.BitDepthChroma = desc->comp[0].depth;
     q->param.mfx.FrameInfo.Shift          = desc->comp[0].depth > 8;
 
-    // TODO:  detect version of MFX--if the minor version is greater than
-    // or equal to 19, then can use the same alignment settings as H.264
-    // for HEVC
-    q->width_align = avctx->codec_id == AV_CODEC_ID_HEVC ? 32 : 16;
+    // If the minor version is greater than or equal to 19,
+    // then can use the same alignment settings as H.264 for HEVC
+    q->width_align = (avctx->codec_id != AV_CODEC_ID_HEVC ||
+                      QSV_RUNTIME_VERSION_ATLEAST(ver, 1, 19)) ? 16 : 32;
     q->param.mfx.FrameInfo.Width = FFALIGN(avctx->width, q->width_align);
 
     if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
@@ -545,16 +568,32 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
     if (ret < 0)
         return ret;
 
+    //libmfx BRC parameters are 16 bits thus maybe overflow, then BRCParamMultiplier is needed
+    buffer_size_in_kilobytes   = avctx->rc_buffer_size / 8000;
+    initial_delay_in_kilobytes = avctx->rc_initial_buffer_occupancy / 1000;
+    target_bitrate_kbps        = avctx->bit_rate / 1000;
+    max_bitrate_kbps           = avctx->rc_max_rate / 1000;
+    brc_param_multiplier       = (FFMAX(FFMAX3(target_bitrate_kbps, max_bitrate_kbps, buffer_size_in_kilobytes),
+                                  initial_delay_in_kilobytes) + 0x10000) / 0x10000;
+
     switch (q->param.mfx.RateControlMethod) {
     case MFX_RATECONTROL_CBR:
     case MFX_RATECONTROL_VBR:
 #if QSV_HAVE_VCM
     case MFX_RATECONTROL_VCM:
 #endif
-        q->param.mfx.BufferSizeInKB   = avctx->rc_buffer_size / 8000;
-        q->param.mfx.InitialDelayInKB = avctx->rc_initial_buffer_occupancy / 1000;
-        q->param.mfx.TargetKbps       = avctx->bit_rate / 1000;
-        q->param.mfx.MaxKbps          = avctx->rc_max_rate / 1000;
+#if QSV_HAVE_QVBR
+    case MFX_RATECONTROL_QVBR:
+#endif
+        q->param.mfx.BufferSizeInKB   = buffer_size_in_kilobytes / brc_param_multiplier;
+        q->param.mfx.InitialDelayInKB = initial_delay_in_kilobytes / brc_param_multiplier;
+        q->param.mfx.TargetKbps       = target_bitrate_kbps / brc_param_multiplier;
+        q->param.mfx.MaxKbps          = max_bitrate_kbps / brc_param_multiplier;
+        q->param.mfx.BRCParamMultiplier = brc_param_multiplier;
+#if QSV_HAVE_QVBR
+        if (q->param.mfx.RateControlMethod == MFX_RATECONTROL_QVBR)
+            q->extco3.QVBRQuality = av_clip(avctx->global_quality, 0, 51);
+#endif
         break;
     case MFX_RATECONTROL_CQP:
         quant = avctx->global_quality / FF_QP2LAMBDA;
@@ -566,19 +605,21 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
         break;
 #if QSV_HAVE_AVBR
     case MFX_RATECONTROL_AVBR:
-        q->param.mfx.TargetKbps  = avctx->bit_rate / 1000;
+        q->param.mfx.TargetKbps  = target_bitrate_kbps / brc_param_multiplier;
         q->param.mfx.Convergence = q->avbr_convergence;
         q->param.mfx.Accuracy    = q->avbr_accuracy;
+        q->param.mfx.BRCParamMultiplier = brc_param_multiplier;
         break;
 #endif
 #if QSV_HAVE_LA
     case MFX_RATECONTROL_LA:
-        q->param.mfx.TargetKbps  = avctx->bit_rate / 1000;
-        q->extco2.LookAheadDepth = q->la_depth;
+        q->param.mfx.TargetKbps  = target_bitrate_kbps / brc_param_multiplier;
+        q->extco2.LookAheadDepth = q->look_ahead_depth;
+        q->param.mfx.BRCParamMultiplier = brc_param_multiplier;
         break;
 #if QSV_HAVE_ICQ
     case MFX_RATECONTROL_LA_ICQ:
-        q->extco2.LookAheadDepth = q->la_depth;
+        q->extco2.LookAheadDepth = q->look_ahead_depth;
     case MFX_RATECONTROL_ICQ:
         q->param.mfx.ICQQuality  = avctx->global_quality;
         break;
@@ -592,6 +633,9 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
         q->extco.Header.BufferId      = MFX_EXTBUFF_CODING_OPTION;
         q->extco.Header.BufferSz      = sizeof(q->extco);
 
+        q->extco.PicTimingSEI         = q->pic_timing_sei ?
+                                        MFX_CODINGOPTION_ON : MFX_CODINGOPTION_UNKNOWN;
+
         if (q->rdo >= 0)
             q->extco.RateDistortionOpt = q->rdo > 0 ? MFX_CODINGOPTION_ON : MFX_CODINGOPTION_OFF;
 
@@ -619,8 +663,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         q->extparam_internal[q->nb_extparam_internal++] = (mfxExtBuffer *)&q->extco;
 
-#if QSV_HAVE_CO2
         if (avctx->codec_id == AV_CODEC_ID_H264) {
+#if QSV_HAVE_CO2
             q->extco2.Header.BufferId     = MFX_EXTBUFF_CODING_OPTION2;
             q->extco2.Header.BufferSz     = sizeof(q->extco2);
 
@@ -649,11 +693,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
             q->extco2.Trellis = q->trellis;
 #endif
 
-#if QSV_HAVE_LA_DS
-            q->extco2.LookAheadDS = q->la_ds;
-#endif
+#if QSV_VERSION_ATLEAST(1, 8)
+            q->extco2.LookAheadDS = q->look_ahead_downsampling;
+            q->extco2.RepeatPPS   = q->repeat_pps ? MFX_CODINGOPTION_ON : MFX_CODINGOPTION_OFF;
 
-#if QSV_HAVE_BREF_TYPE
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
             if (avctx->b_frame_strategy >= 0)
@@ -683,13 +726,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
 #endif
             q->extparam_internal[q->nb_extparam_internal++] = (mfxExtBuffer *)&q->extco2;
-        }
 #endif
+
 #if QSV_HAVE_MF
-        if (avctx->codec_id == AV_CODEC_ID_H264) {
-            mfxVersion    ver;
-            ret = MFXQueryVersion(q->session,&ver);
-            if (ret >= MFX_ERR_NONE && QSV_RUNTIME_VERSION_ATLEAST(ver, 1, 25)) {
+            if (QSV_RUNTIME_VERSION_ATLEAST(ver, 1, 25)) {
                 q->extmfp.Header.BufferId     = MFX_EXTBUFF_MULTI_FRAME_PARAM;
                 q->extmfp.Header.BufferSz     = sizeof(q->extmfp);
 
@@ -697,7 +737,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 av_log(avctx,AV_LOG_VERBOSE,"MFMode:%d\n", q->extmfp.MFMode);
                 q->extparam_internal[q->nb_extparam_internal++] = (mfxExtBuffer *)&q->extmfp;
             }
+#endif
         }
+#if QSV_HAVE_CO3
+        q->extco3.Header.BufferId      = MFX_EXTBUFF_CODING_OPTION3;
+        q->extco3.Header.BufferSz      = sizeof(q->extco3);
+        q->extparam_internal[q->nb_extparam_internal++] = (mfxExtBuffer *)&q->extco3;
 #endif
     }
 
@@ -720,7 +765,7 @@ static int qsv_retrieve_enc_jpeg_params(AVCodecContext *avctx, QSVEncContext *q)
         return ff_qsv_print_error(avctx, ret,
                                   "Error calling GetVideoParam");
 
-    q->packet_size = q->param.mfx.BufferSizeInKB * 1000;
+    q->packet_size = q->param.mfx.BufferSizeInKB * q->param.mfx.BRCParamMultiplier * 1000;
 
     // for qsv mjpeg the return value maybe 0 so alloc the buffer
     if (q->packet_size == 0)
@@ -782,7 +827,7 @@ static int qsv_retrieve_enc_params(AVCodecContext *avctx, QSVEncContext *q)
         return ff_qsv_print_error(avctx, ret,
                                   "Error calling GetVideoParam");
 
-    q->packet_size = q->param.mfx.BufferSizeInKB * 1000;
+    q->packet_size = q->param.mfx.BufferSizeInKB * q->param.mfx.BRCParamMultiplier * 1000;
 
     if (!extradata.SPSBufSize || (need_pps && !extradata.PPSBufSize)) {
         av_log(avctx, AV_LOG_ERROR, "No extradata returned from libmfx.\n");
@@ -1026,11 +1071,23 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
     return 0;
 }
 
+static void free_encoder_ctrl_payloads(mfxEncodeCtrl* enc_ctrl)
+{
+    if (enc_ctrl) {
+        int i;
+        for (i = 0; i < enc_ctrl->NumPayload && i < QSV_MAX_ENC_PAYLOAD; i++) {
+            av_free(enc_ctrl->Payload[i]);
+        }
+        enc_ctrl->NumPayload = 0;
+    }
+}
+
 static void clear_unused_frames(QSVEncContext *q)
 {
     QSVFrame *cur = q->work_frames;
     while (cur) {
         if (cur->used && !cur->surface.Data.Locked) {
+            free_encoder_ctrl_payloads(&cur->enc_ctrl);
             if (cur->frame->format == AV_PIX_FMT_QSV) {
                 av_frame_unref(cur->frame);
             }
@@ -1067,6 +1124,11 @@ static int get_free_frame(QSVEncContext *q, QSVFrame **f)
         av_freep(&frame);
         return AVERROR(ENOMEM);
     }
+    frame->enc_ctrl.Payload = av_mallocz(sizeof(mfxPayload*) * QSV_MAX_ENC_PAYLOAD);
+    if (!frame->enc_ctrl.Payload) {
+        av_freep(&frame);
+        return AVERROR(ENOMEM);
+    }
     *last = frame;
 
     *f = frame;
@@ -1076,7 +1138,7 @@ static int get_free_frame(QSVEncContext *q, QSVFrame **f)
 }
 
 static int submit_frame(QSVEncContext *q, const AVFrame *frame,
-                        mfxFrameSurface1 **surface)
+                        QSVFrame **new_frame)
 {
     QSVFrame *qf;
     int ret;
@@ -1149,7 +1211,7 @@ static int submit_frame(QSVEncContext *q, const AVFrame *frame,
 
     qf->surface.Data.TimeStamp = av_rescale_q(frame->pts, q->avctx->time_base, (AVRational){1, 90000});
 
-    *surface = &qf->surface;
+    *new_frame = qf;
 
     return 0;
 }
@@ -1171,18 +1233,35 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
 {
     AVPacket new_pkt = { 0 };
     mfxBitstream *bs;
+#if QSV_VERSION_ATLEAST(1, 26)
+    mfxExtAVCEncodedFrameInfo *enc_info;
+    mfxExtBuffer **enc_buf;
+#endif
 
     mfxFrameSurface1 *surf = NULL;
     mfxSyncPoint *sync     = NULL;
+    QSVFrame *qsv_frame = NULL;
+    mfxEncodeCtrl* enc_ctrl = NULL;
     int ret;
 
     if (frame) {
-        ret = submit_frame(q, frame, &surf);
+        ret = submit_frame(q, frame, &qsv_frame);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error submitting the frame for encoding.\n");
             return ret;
         }
     }
+    if (qsv_frame) {
+        surf = &qsv_frame->surface;
+        enc_ctrl = &qsv_frame->enc_ctrl;
+        memset(enc_ctrl, 0, sizeof(mfxEncodeCtrl));
+
+        if (frame->pict_type == AV_PICTURE_TYPE_I) {
+            enc_ctrl->FrameType = MFX_FRAMETYPE_I | MFX_FRAMETYPE_REF;
+            if (q->forced_idr)
+                enc_ctrl->FrameType |= MFX_FRAMETYPE_IDR;
+        }
+    }
 
     ret = av_new_packet(&new_pkt, q->packet_size);
     if (ret < 0) {
@@ -1198,17 +1277,45 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
     bs->Data      = new_pkt.data;
     bs->MaxLength = new_pkt.size;
 
+#if QSV_VERSION_ATLEAST(1, 26)
+    if (avctx->codec_id == AV_CODEC_ID_H264) {
+        enc_info = av_mallocz(sizeof(*enc_info));
+        if (!enc_info)
+            return AVERROR(ENOMEM);
+
+        enc_info->Header.BufferId = MFX_EXTBUFF_ENCODED_FRAME_INFO;
+        enc_info->Header.BufferSz = sizeof (*enc_info);
+        bs->NumExtParam = 1;
+        enc_buf = av_mallocz(sizeof(mfxExtBuffer *));
+        if (!enc_buf)
+            return AVERROR(ENOMEM);
+        enc_buf[0] = (mfxExtBuffer *)enc_info;
+
+        bs->ExtParam = enc_buf;
+    }
+#endif
+
+    if (q->set_encode_ctrl_cb) {
+        q->set_encode_ctrl_cb(avctx, frame, &qsv_frame->enc_ctrl);
+    }
+
     sync = av_mallocz(sizeof(*sync));
     if (!sync) {
         av_freep(&bs);
+ #if QSV_VERSION_ATLEAST(1, 26)
+        if (avctx->codec_id == AV_CODEC_ID_H264) {
+            av_freep(&enc_info);
+            av_freep(&enc_buf);
+        }
+ #endif
         av_packet_unref(&new_pkt);
         return AVERROR(ENOMEM);
     }
 
     do {
-        ret = MFXVideoENCODE_EncodeFrameAsync(q->session, NULL, surf, bs, sync);
+        ret = MFXVideoENCODE_EncodeFrameAsync(q->session, enc_ctrl, surf, bs, sync);
         if (ret == MFX_WRN_DEVICE_BUSY)
-            av_usleep(1);
+            av_usleep(500);
     } while (ret == MFX_WRN_DEVICE_BUSY || ret == MFX_WRN_IN_EXECUTION);
 
     if (ret > 0)
@@ -1217,6 +1324,12 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
     if (ret < 0) {
         av_packet_unref(&new_pkt);
         av_freep(&bs);
+#if QSV_VERSION_ATLEAST(1, 26)
+        if (avctx->codec_id == AV_CODEC_ID_H264) {
+            av_freep(&enc_info);
+            av_freep(&enc_buf);
+        }
+#endif
         av_freep(&sync);
         return (ret == MFX_ERR_MORE_DATA) ?
                0 : ff_qsv_print_error(avctx, ret, "Error during encoding");
@@ -1233,6 +1346,12 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
         av_freep(&sync);
         av_packet_unref(&new_pkt);
         av_freep(&bs);
+#if QSV_VERSION_ATLEAST(1, 26)
+        if (avctx->codec_id == AV_CODEC_ID_H264) {
+            av_freep(&enc_info);
+            av_freep(&enc_buf);
+        }
+#endif
     }
 
     return 0;
@@ -1252,6 +1371,11 @@ int ff_qsv_encode(AVCodecContext *avctx, QSVEncContext *q,
         AVPacket new_pkt;
         mfxBitstream *bs;
         mfxSyncPoint *sync;
+#if QSV_VERSION_ATLEAST(1, 26)
+        mfxExtAVCEncodedFrameInfo *enc_info;
+        mfxExtBuffer **enc_buf;
+#endif
+        enum AVPictureType pict_type;
 
         av_fifo_generic_read(q->async_fifo, &new_pkt, sizeof(new_pkt), NULL);
         av_fifo_generic_read(q->async_fifo, &sync,    sizeof(sync),    NULL);
@@ -1265,21 +1389,39 @@ int ff_qsv_encode(AVCodecContext *avctx, QSVEncContext *q,
         new_pkt.pts  = av_rescale_q(bs->TimeStamp,       (AVRational){1, 90000}, avctx->time_base);
         new_pkt.size = bs->DataLength;
 
-        if (bs->FrameType & MFX_FRAMETYPE_IDR ||
-            bs->FrameType & MFX_FRAMETYPE_xIDR)
+        if (bs->FrameType & MFX_FRAMETYPE_IDR || bs->FrameType & MFX_FRAMETYPE_xIDR) {
             new_pkt.flags |= AV_PKT_FLAG_KEY;
+            pict_type = AV_PICTURE_TYPE_I;
+        } else if (bs->FrameType & MFX_FRAMETYPE_I || bs->FrameType & MFX_FRAMETYPE_xI)
+            pict_type = AV_PICTURE_TYPE_I;
+        else if (bs->FrameType & MFX_FRAMETYPE_P || bs->FrameType & MFX_FRAMETYPE_xP)
+            pict_type = AV_PICTURE_TYPE_P;
+        else if (bs->FrameType & MFX_FRAMETYPE_B || bs->FrameType & MFX_FRAMETYPE_xB)
+            pict_type = AV_PICTURE_TYPE_B;
+        else if (bs->FrameType == MFX_FRAMETYPE_UNKNOWN) {
+            pict_type = AV_PICTURE_TYPE_NONE;
+            av_log(avctx, AV_LOG_WARNING, "Unknown FrameType, set pict_type to AV_PICTURE_TYPE_NONE.\n");
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Invalid FrameType:%d.\n", bs->FrameType);
+            return AVERROR_INVALIDDATA;
+        }
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-        if (bs->FrameType & MFX_FRAMETYPE_I || bs->FrameType & MFX_FRAMETYPE_xI)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-        else if (bs->FrameType & MFX_FRAMETYPE_P || bs->FrameType & MFX_FRAMETYPE_xP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
-        else if (bs->FrameType & MFX_FRAMETYPE_B || bs->FrameType & MFX_FRAMETYPE_xB)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+#if QSV_VERSION_ATLEAST(1, 26)
+        if (avctx->codec_id == AV_CODEC_ID_H264) {
+            enc_buf = bs->ExtParam;
+            enc_info = (mfxExtAVCEncodedFrameInfo *)(*bs->ExtParam);
+            ff_side_data_set_encoder_stats(&new_pkt,
+                enc_info->QP * FF_QP2LAMBDA, NULL, 0, pict_type);
+            av_freep(&enc_info);
+            av_freep(&enc_buf);
+        }
+#endif
         av_freep(&bs);
         av_freep(&sync);
 
@@ -1325,6 +1467,7 @@ int ff_qsv_enc_close(AVCodecContext *avctx, QSVEncContext *q)
     while (cur) {
         q->work_frames = cur->next;
         av_frame_free(&cur->frame);
+        av_free(cur->enc_ctrl.Payload);
         av_freep(&cur);
         cur = q->work_frames;
     }
diff --git a/libavcodec/qsvenc.h b/libavcodec/qsvenc.h
index b74b977..00afbd8 100644
--- a/libavcodec/qsvenc.h
+++ b/libavcodec/qsvenc.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Yukinori Yamazoe
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@
 #define QSV_HAVE_LA_HRD QSV_VERSION_ATLEAST(1, 11)
 #define QSV_HAVE_VDENC  QSV_VERSION_ATLEAST(1, 15)
 
-#if defined(_WIN32)
+#if defined(_WIN32) || defined(__CYGWIN__)
 #define QSV_HAVE_AVBR   QSV_VERSION_ATLEAST(1, 3)
 #define QSV_HAVE_ICQ    QSV_VERSION_ATLEAST(1, 8)
 #define QSV_HAVE_VCM    QSV_VERSION_ATLEAST(1, 8)
@@ -54,44 +54,44 @@
 #define QSV_HAVE_MF     0
 #else
 #define QSV_HAVE_AVBR   0
-#define QSV_HAVE_ICQ    0
+#define QSV_HAVE_ICQ    QSV_VERSION_ATLEAST(1, 28)
 #define QSV_HAVE_VCM    0
-#define QSV_HAVE_QVBR   0
+#define QSV_HAVE_QVBR   QSV_VERSION_ATLEAST(1, 28)
 #define QSV_HAVE_MF     QSV_VERSION_ATLEAST(1, 25)
 #endif
 
 #if !QSV_HAVE_LA_DS
+#define MFX_LOOKAHEAD_DS_UNKNOWN 0
 #define MFX_LOOKAHEAD_DS_OFF 0
 #define MFX_LOOKAHEAD_DS_2x 0
 #define MFX_LOOKAHEAD_DS_4x 0
 #endif
 
 #define QSV_COMMON_OPTS \
-{ "async_depth", "Maximum processing parallelism", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VE },  \
-{ "avbr_accuracy",    "Accuracy of the AVBR ratecontrol",    OFFSET(qsv.avbr_accuracy),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },     \
-{ "avbr_convergence", "Convergence of the AVBR ratecontrol", OFFSET(qsv.avbr_convergence), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },     \
-{ "preset", NULL, OFFSET(qsv.preset), AV_OPT_TYPE_INT, { .i64 = MFX_TARGETUSAGE_BALANCED }, 0, 7,   VE, "preset" },                             \
-{ "fast",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_SPEED  },   INT_MIN, INT_MAX, VE, "preset" },                             \
-{ "medium", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BALANCED  },     INT_MIN, INT_MAX, VE, "preset" },                             \
-{ "slow",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_QUALITY  }, INT_MIN, INT_MAX, VE, "preset" },                             \
-{ "la_depth", "Number of frames to analyze before encoding.", OFFSET(qsv.la_depth), AV_OPT_TYPE_INT, { .i64 = 9 },   9, 100, VE, "la_depth" },  \
-{ "unset", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 9 }, INT_MIN, INT_MAX,                                                       VE, "la_depth" },  \
-{ "la_ds", "Downscaling factor for the frames saved for the lookahead analysis", OFFSET(qsv.la_ds), AV_OPT_TYPE_INT,                            \
-                    { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, MFX_LOOKAHEAD_DS_UNKNOWN, MFX_LOOKAHEAD_DS_4x, VE, "la_ds" },                          \
-{ "auto", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, INT_MIN, INT_MAX,                                     VE, "la_ds" }, \
-{ "off", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_OFF }, INT_MIN, INT_MAX,                                          VE, "la_ds" }, \
-{ "2x", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_2x }, INT_MIN, INT_MAX,                                            VE, "la_ds" }, \
-{ "4x", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_4x }, INT_MIN, INT_MAX,                                            VE, "la_ds" }, \
-{ "rdo",            "Enable rate distortion optimization",    OFFSET(qsv.rdo),            AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "max_frame_size", "Maximum encoded frame size in bytes",    OFFSET(qsv.max_frame_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE }, \
-{ "max_slice_size", "Maximum encoded slice size in bytes",    OFFSET(qsv.max_slice_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE }, \
-{ "bitrate_limit",  "Toggle bitrate limitations",             OFFSET(qsv.bitrate_limit),  AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "mbbrc",          "MB level bitrate control",               OFFSET(qsv.mbbrc),          AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "extbrc",         "Extended bitrate control",               OFFSET(qsv.extbrc),         AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "adaptive_i",     "Adaptive I-frame placement",             OFFSET(qsv.adaptive_i),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "adaptive_b",     "Adaptive B-frame placement",             OFFSET(qsv.adaptive_b),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "b_strategy",     "Strategy to choose between I/P/B-frames", OFFSET(qsv.b_strategy),    AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },
-
+{ "async_depth", "Maximum processing parallelism", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 1, INT_MAX, VE },                          \
+{ "avbr_accuracy",    "Accuracy of the AVBR ratecontrol",    OFFSET(qsv.avbr_accuracy),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },                             \
+{ "avbr_convergence", "Convergence of the AVBR ratecontrol", OFFSET(qsv.avbr_convergence), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },                             \
+{ "preset", NULL, OFFSET(qsv.preset), AV_OPT_TYPE_INT, { .i64 = MFX_TARGETUSAGE_BALANCED }, MFX_TARGETUSAGE_BEST_QUALITY, MFX_TARGETUSAGE_BEST_SPEED,   VE, "preset" }, \
+{ "veryfast",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_SPEED  },   INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "faster",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_6  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "fast",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_5  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "medium",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BALANCED  },     INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "slow",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_3  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "slower",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_2  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "veryslow",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_QUALITY  }, INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "rdo",            "Enable rate distortion optimization",    OFFSET(qsv.rdo),            AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "max_frame_size", "Maximum encoded frame size in bytes",    OFFSET(qsv.max_frame_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE },                         \
+{ "max_slice_size", "Maximum encoded slice size in bytes",    OFFSET(qsv.max_slice_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE },                         \
+{ "bitrate_limit",  "Toggle bitrate limitations",             OFFSET(qsv.bitrate_limit),  AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "mbbrc",          "MB level bitrate control",               OFFSET(qsv.mbbrc),          AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "extbrc",         "Extended bitrate control",               OFFSET(qsv.extbrc),         AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "adaptive_i",     "Adaptive I-frame placement",             OFFSET(qsv.adaptive_i),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "adaptive_b",     "Adaptive B-frame placement",             OFFSET(qsv.adaptive_b),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "b_strategy",     "Strategy to choose between I/P/B-frames", OFFSET(qsv.b_strategy),    AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "forced_idr",     "Forcing I frames as IDR frames",         OFFSET(qsv.forced_idr),     AV_OPT_TYPE_BOOL,{ .i64 = 0  },  0,          1, VE },                         \
+
+typedef int SetEncodeCtrlCB (AVCodecContext *avctx,
+                             const AVFrame *frame, mfxEncodeCtrl* enc_ctrl);
 typedef struct QSVEncContext {
     AVCodecContext *avctx;
 
@@ -111,6 +111,9 @@ typedef struct QSVEncContext {
 #if QSV_HAVE_CO2
     mfxExtCodingOption2 extco2;
 #endif
+#if QSV_HAVE_CO3
+    mfxExtCodingOption3 extco3;
+#endif
 #if QSV_HAVE_MF
     mfxExtMultiFrameParam   extmfp;
     mfxExtMultiFrameControl extmfc;
@@ -119,7 +122,7 @@ typedef struct QSVEncContext {
     mfxFrameSurface1       **opaque_surfaces;
     AVBufferRef             *opaque_alloc_buf;
 
-    mfxExtBuffer  *extparam_internal[2 + QSV_HAVE_CO2 + (QSV_HAVE_MF * 2)];
+    mfxExtBuffer  *extparam_internal[2 + QSV_HAVE_CO2 + QSV_HAVE_CO3 + (QSV_HAVE_MF * 2)];
     int         nb_extparam_internal;
 
     mfxExtBuffer **extparam;
@@ -135,8 +138,10 @@ typedef struct QSVEncContext {
     int preset;
     int avbr_accuracy;
     int avbr_convergence;
-    int la_depth;
-    int la_ds;
+    int pic_timing_sei;
+    int look_ahead;
+    int look_ahead_depth;
+    int look_ahead_downsampling;
     int vcm;
     int rdo;
     int max_frame_size;
@@ -160,12 +165,18 @@ typedef struct QSVEncContext {
     int int_ref_cycle_size;
     int int_ref_qp_delta;
     int recovery_point_sei;
+
+    int repeat_pps;
     int low_power;
 
+    int a53_cc;
+
 #if QSV_HAVE_MF
     int mfmode;
 #endif
     char *load_plugins;
+    SetEncodeCtrlCB *set_encode_ctrl_cb;
+    int forced_idr;
 } QSVEncContext;
 
 int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q);
diff --git a/libavcodec/qsvenc_h264.c b/libavcodec/qsvenc_h264.c
index 795e27c..f458137 100644
--- a/libavcodec/qsvenc_h264.c
+++ b/libavcodec/qsvenc_h264.c
@@ -1,22 +1,22 @@
 /*
- * Intel MediaSDK QSV based H.264 enccoder
+ * Intel MediaSDK QSV based H.264 encoder
  *
  * copyright (c) 2013 Yukinori Yamazoe
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,10 +40,45 @@ typedef struct QSVH264EncContext {
     QSVEncContext qsv;
 } QSVH264EncContext;
 
+static int qsv_h264_set_encode_ctrl(AVCodecContext *avctx,
+                                    const AVFrame *frame, mfxEncodeCtrl* enc_ctrl)
+{
+    QSVH264EncContext *qh264 = avctx->priv_data;
+    QSVEncContext *q = &qh264->qsv;
+
+    if (q->a53_cc && frame) {
+        mfxPayload* payload;
+        mfxU8* sei_data;
+        size_t sei_size;
+        int res;
+
+        res = ff_alloc_a53_sei(frame, sizeof(mfxPayload) + 2, (void**)&payload, &sei_size);
+        if (res < 0 || !payload)
+            return res;
+
+        sei_data = (mfxU8*)(payload + 1);
+        // SEI header
+        sei_data[0] = 4;
+        sei_data[1] = (mfxU8)sei_size; // size of SEI data
+        // SEI data filled in by ff_alloc_a53_sei
+
+        payload->BufSize = sei_size + 2;
+        payload->NumBit = payload->BufSize * 8;
+        payload->Type = 4;
+        payload->Data = sei_data;
+
+        enc_ctrl->NumExtParam = 0;
+        enc_ctrl->NumPayload = 1;
+        enc_ctrl->Payload[0] = payload;
+    }
+    return 0;
+}
+
 static av_cold int qsv_enc_init(AVCodecContext *avctx)
 {
     QSVH264EncContext *q = avctx->priv_data;
 
+    q->qsv.set_encode_ctrl_cb = qsv_h264_set_encode_ctrl;
     return ff_qsv_enc_init(avctx, &q->qsv);
 }
 
@@ -72,9 +107,24 @@ static const AVOption options[] = {
     { "vcm",      "Use the video conferencing mode ratecontrol",  OFFSET(qsv.vcm),      AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1,         VE },
 #endif
     { "idr_interval", "Distance (in I-frames) between IDR frames", OFFSET(qsv.idr_interval), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "pic_timing_sei",    "Insert picture timing SEI with pic_struct_syntax element", OFFSET(qsv.pic_timing_sei), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
     { "single_sei_nal_unit",    "Put all the SEI messages into one NALU",        OFFSET(qsv.single_sei_nal_unit),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },
     { "max_dec_frame_buffering", "Maximum number of frames buffered in the DPB", OFFSET(qsv.max_dec_frame_buffering), AV_OPT_TYPE_INT, { .i64 = 0 },   0, UINT16_MAX, VE },
 
+#if QSV_HAVE_LA
+    { "look_ahead",       "Use VBR algorithm with look ahead",    OFFSET(qsv.look_ahead),       AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "look_ahead_depth", "Depth of look ahead in number frames", OFFSET(qsv.look_ahead_depth), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, VE },
+#endif
+#if QSV_HAVE_LA_DS
+    { "look_ahead_downsampling", "Downscaling factor for the frames saved for the lookahead analysis", OFFSET(qsv.look_ahead_downsampling),
+                                          AV_OPT_TYPE_INT,   { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, MFX_LOOKAHEAD_DS_UNKNOWN, MFX_LOOKAHEAD_DS_4x, VE, "look_ahead_downsampling" },
+    { "unknown"                , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "auto"                   , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "off"                    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_OFF     }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "2x"                     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_2x      }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "4x"                     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_4x      }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+#endif
+
     { "int_ref_type", "Intra refresh type",                                      OFFSET(qsv.int_ref_type),            AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE, "int_ref_type" },
         { "none",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, .flags = VE, "int_ref_type" },
         { "vertical", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, .flags = VE, "int_ref_type" },
@@ -94,6 +144,8 @@ static const AVOption options[] = {
     { "main"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_AVC_MAIN     }, INT_MIN, INT_MAX,     VE, "profile" },
     { "high"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_AVC_HIGH     }, INT_MIN, INT_MAX,     VE, "profile" },
 
+    { "a53cc" , "Use A53 Closed Captions (if available)", OFFSET(qsv.a53_cc), AV_OPT_TYPE_INT, {.i64 = 1}, 0, 1, VE},
+
     { "aud", "Insert the Access Unit Delimiter NAL", OFFSET(qsv.aud), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE},
 
 #if QSV_HAVE_MF
@@ -101,10 +153,13 @@ static const AVOption options[] = {
     { "off"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_MF_DISABLED }, INT_MIN, INT_MAX,     VE, "mfmode" },
     { "auto"   , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_MF_AUTO     }, INT_MIN, INT_MAX,     VE, "mfmode" },
 #endif
+
 #if QSV_HAVE_VDENC
-    { "low_power", "enable low power mode (experimental, many limitations by mfx version, HW platform, BRC modes, etc.", OFFSET(qsv.low_power), AV_OPT_TYPE_INT, { .i64 =  0 }, 0, 1, VE},
+    { "low_power", "enable low power mode(experimental: many limitations by mfx version, BRC modes, etc.)", OFFSET(qsv.low_power), AV_OPT_TYPE_BOOL, { .i64 =  0 }, 0, 1, VE},
 #endif
 
+    { "repeat_pps", "repeat pps for every frame", OFFSET(qsv.repeat_pps), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+
     { NULL },
 };
 
diff --git a/libavcodec/qsvenc_hevc.c b/libavcodec/qsvenc_hevc.c
index cba671b..1c615b4 100644
--- a/libavcodec/qsvenc_hevc.c
+++ b/libavcodec/qsvenc_hevc.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based HEVC encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,6 +56,7 @@ static int generate_fake_vps(QSVEncContext *q, AVCodecContext *avctx)
     PutByteContext pbc;
 
     GetBitContext gb;
+    H2645RBSP sps_rbsp = { NULL };
     H2645NAL sps_nal = { NULL };
     HEVCSPS sps = { 0 };
     HEVCVPS vps = { 0 };
@@ -69,8 +70,12 @@ static int generate_fake_vps(QSVEncContext *q, AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
+    av_fast_padded_malloc(&sps_rbsp.rbsp_buffer, &sps_rbsp.rbsp_buffer_alloc_size, avctx->extradata_size);
+    if (!sps_rbsp.rbsp_buffer)
+        return AVERROR(ENOMEM);
+
     /* parse the SPS */
-    ret = ff_h2645_extract_rbsp(avctx->extradata + 4, avctx->extradata_size - 4, &sps_nal);
+    ret = ff_h2645_extract_rbsp(avctx->extradata + 4, avctx->extradata_size - 4, &sps_rbsp, &sps_nal, 1);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error unescaping the SPS buffer\n");
         return ret;
@@ -78,7 +83,7 @@ static int generate_fake_vps(QSVEncContext *q, AVCodecContext *avctx)
 
     ret = init_get_bits8(&gb, sps_nal.data, sps_nal.size);
     if (ret < 0) {
-        av_freep(&sps_nal.rbsp_buffer);
+        av_freep(&sps_rbsp.rbsp_buffer);
         return ret;
     }
 
@@ -87,13 +92,13 @@ static int generate_fake_vps(QSVEncContext *q, AVCodecContext *avctx)
     if (type != HEVC_NAL_SPS) {
         av_log(avctx, AV_LOG_ERROR, "Unexpected NAL type in the extradata: %d\n",
                type);
-        av_freep(&sps_nal.rbsp_buffer);
+        av_freep(&sps_rbsp.rbsp_buffer);
         return AVERROR_INVALIDDATA;
     }
     get_bits(&gb, 9);
 
     ret = ff_hevc_parse_sps(&sps, &gb, &sps_id, 0, NULL, avctx);
-    av_freep(&sps_nal.rbsp_buffer);
+    av_freep(&sps_rbsp.rbsp_buffer);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error parsing the SPS\n");
         return ret;
@@ -212,12 +217,6 @@ static av_cold int qsv_enc_close(AVCodecContext *avctx)
     return ff_qsv_enc_close(avctx, &q->qsv);
 }
 
-#if defined(_WIN32)
-#define LOAD_PLUGIN_DEFAULT LOAD_PLUGIN_HEVC_SW
-#else
-#define LOAD_PLUGIN_DEFAULT LOAD_PLUGIN_HEVC_HW
-#endif
-
 #define OFFSET(x) offsetof(QSVHEVCEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
@@ -225,7 +224,7 @@ static const AVOption options[] = {
 
     { "idr_interval", "Distance (in I-frames) between IDR frames", OFFSET(qsv.idr_interval), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE, "idr_interval" },
     { "begin_only", "Output an IDR-frame only at the beginning of the stream", 0, AV_OPT_TYPE_CONST, { .i64 = -1 }, 0, 0, VE, "idr_interval" },
-    { "load_plugin", "A user plugin to load in an internal session", OFFSET(load_plugin), AV_OPT_TYPE_INT, { .i64 = LOAD_PLUGIN_DEFAULT }, LOAD_PLUGIN_NONE, LOAD_PLUGIN_HEVC_HW, VE, "load_plugin" },
+    { "load_plugin", "A user plugin to load in an internal session", OFFSET(load_plugin), AV_OPT_TYPE_INT, { .i64 = LOAD_PLUGIN_HEVC_HW }, LOAD_PLUGIN_NONE, LOAD_PLUGIN_HEVC_HW, VE, "load_plugin" },
     { "none",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LOAD_PLUGIN_NONE },    0, 0, VE, "load_plugin" },
     { "hevc_sw",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LOAD_PLUGIN_HEVC_SW }, 0, 0, VE, "load_plugin" },
     { "hevc_hw",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LOAD_PLUGIN_HEVC_HW }, 0, 0, VE, "load_plugin" },
@@ -238,9 +237,6 @@ static const AVOption options[] = {
     { "main",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_HEVC_MAIN    }, INT_MIN, INT_MAX,     VE, "profile" },
     { "main10",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_HEVC_MAIN10  }, INT_MIN, INT_MAX,     VE, "profile" },
     { "mainsp",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_HEVC_MAINSP  }, INT_MIN, INT_MAX,     VE, "profile" },
-#if QSV_HAVE_VDENC
-    { "low_power", "enable low power mode (experimental, many limitations by mfx version, HW platform, BRC modes, etc.", OFFSET(qsv.low_power), AV_OPT_TYPE_INT, { .i64 =  0 },  0,  1, VE },
-#endif
 
     { NULL },
 };
diff --git a/libavcodec/qsvenc_jpeg.c b/libavcodec/qsvenc_jpeg.c
index bbfd009..1e7785a 100644
--- a/libavcodec/qsvenc_jpeg.c
+++ b/libavcodec/qsvenc_jpeg.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based MJPEG encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsvenc_mpeg2.c b/libavcodec/qsvenc_mpeg2.c
index 9986d8b..a7427d8 100644
--- a/libavcodec/qsvenc_mpeg2.c
+++ b/libavcodec/qsvenc_mpeg2.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based MPEG-2 encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qtrle.c b/libavcodec/qtrle.c
index 70440d3..a744d7b 100644
--- a/libavcodec/qtrle.c
+++ b/libavcodec/qtrle.c
@@ -2,20 +2,20 @@
  * Quicktime Animation (RLE) Video Decoder
  * Copyright (C) 2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,27 +59,38 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi0, pi1;  /* 2 8-pixel values */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi0, pi1;  /* 2 8-pixel values */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
     int skip;
+    /* skip & 0x80 appears to mean 'start a new line', which can be interpreted
+     * as 'go to next line' during the decoding of a frame but is 'go to first
+     * line' at the beginning. Since we always interpret it as 'go to next line'
+     * in the decoding loop (which makes code simpler/faster), the first line
+     * would not be counted, so we count one more.
+     * See: https://trac.ffmpeg.org/ticket/226
+     * In the following decoding loop, row_ptr will be the position of the
+     * current row. */
 
     row_ptr  -= row_inc;
     pixel_ptr = row_ptr;
     lines_to_change++;
     while (lines_to_change) {
         skip     =              bytestream2_get_byte(&s->g);
-        rle_code = (signed char)bytestream2_get_byte(&s->g);
+        rle_code = (int8_t)bytestream2_get_byte(&s->g);
         if (rle_code == 0)
             break;
         if(skip & 0x80) {
             lines_to_change--;
             row_ptr += row_inc;
-            pixel_ptr = row_ptr + 2 * (skip & 0x7f);
+            pixel_ptr = row_ptr + 2 * 8 * (skip & 0x7f);
         } else
-            pixel_ptr += 2 * skip;
+            pixel_ptr += 2 * 8 * skip;
         CHECK_PIXEL_PTR(0);  /* make sure pixel_ptr is positive */
 
+        if(rle_code == -1)
+            continue;
+
         if (rle_code < 0) {
             /* decode the run length code */
             rle_code = -rle_code;
@@ -88,19 +99,42 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
 
             pi0 = bytestream2_get_byte(&s->g);
             pi1 = bytestream2_get_byte(&s->g);
-            CHECK_PIXEL_PTR(rle_code * 2);
+            CHECK_PIXEL_PTR(rle_code * 2 * 8);
 
             while (rle_code--) {
-                rgb[pixel_ptr++] = pi0;
-                rgb[pixel_ptr++] = pi1;
+                rgb[pixel_ptr++] = (pi0 >> 7) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 6) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 5) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 4) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 3) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 2) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 1) & 0x01;
+                rgb[pixel_ptr++] =  pi0       & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 7) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 6) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 5) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 4) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 3) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 2) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 1) & 0x01;
+                rgb[pixel_ptr++] =  pi1       & 0x01;
             }
         } else {
             /* copy the same pixel directly to output 2 times */
             rle_code *= 2;
-            CHECK_PIXEL_PTR(rle_code);
+            CHECK_PIXEL_PTR(rle_code * 8);
 
-            while (rle_code--)
-                rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
+            while (rle_code--) {
+                int x = bytestream2_get_byte(&s->g);
+                rgb[pixel_ptr++] = (x >> 7) & 0x01;
+                rgb[pixel_ptr++] = (x >> 6) & 0x01;
+                rgb[pixel_ptr++] = (x >> 5) & 0x01;
+                rgb[pixel_ptr++] = (x >> 4) & 0x01;
+                rgb[pixel_ptr++] = (x >> 3) & 0x01;
+                rgb[pixel_ptr++] = (x >> 2) & 0x01;
+                rgb[pixel_ptr++] = (x >> 1) & 0x01;
+                rgb[pixel_ptr++] =  x       & 0x01;
+            }
         }
     }
 }
@@ -111,8 +145,8 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
     int rle_code, i;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi[16];  /* 16 palette indices */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi[16];  /* 16 palette indices */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
     int num_pixels = (bpp == 4) ? 8 : 16;
 
@@ -120,7 +154,9 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
         pixel_ptr = row_ptr + (num_pixels * (bytestream2_get_byte(&s->g) - 1));
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
+            if (bytestream2_get_bytes_left(&s->g) < 1)
+                return;
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (num_pixels * (bytestream2_get_byte(&s->g) - 1));
@@ -136,8 +172,8 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
                 }
                 CHECK_PIXEL_PTR(rle_code * num_pixels);
                 while (rle_code--) {
-                    for (i = 0; i < num_pixels; i++)
-                        rgb[pixel_ptr++] = pi[i];
+                    memcpy(&rgb[pixel_ptr], &pi, num_pixels);
+                    pixel_ptr += num_pixels;
                 }
             } else {
                 /* copy the same pixel directly to output 4 times */
@@ -167,15 +203,17 @@ static void qtrle_decode_8bpp(QtrleContext *s, int row_ptr, int lines_to_change)
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi1, pi2, pi3, pi4;  /* 4 palette indexes */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi1, pi2, pi3, pi4;  /* 4 palette indexes */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (4 * (bytestream2_get_byte(&s->g) - 1));
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
+            if (bytestream2_get_bytes_left(&s->g) < 1)
+                return;
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (4 * (bytestream2_get_byte(&s->g) - 1));
@@ -203,9 +241,8 @@ static void qtrle_decode_8bpp(QtrleContext *s, int row_ptr, int lines_to_change)
                 rle_code *= 4;
                 CHECK_PIXEL_PTR(rle_code);
 
-                while (rle_code--) {
-                    rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
-                }
+                bytestream2_get_buffer(&s->g, &rgb[pixel_ptr], rle_code);
+                pixel_ptr += rle_code;
             }
         }
         row_ptr += row_inc;
@@ -217,15 +254,17 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned short rgb16;
-    unsigned char *rgb = s->frame->data[0];
+    uint16_t rgb16;
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 2;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
+            if (bytestream2_get_bytes_left(&s->g) < 1)
+                return;
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 2;
@@ -238,7 +277,7 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 CHECK_PIXEL_PTR(rle_code * 2);
 
                 while (rle_code--) {
-                    *(unsigned short *)(&rgb[pixel_ptr]) = rgb16;
+                    *(uint16_t *)(&rgb[pixel_ptr]) = rgb16;
                     pixel_ptr += 2;
                 }
             } else {
@@ -247,7 +286,7 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 /* copy pixels directly to output */
                 while (rle_code--) {
                     rgb16 = bytestream2_get_be16(&s->g);
-                    *(unsigned short *)(&rgb[pixel_ptr]) = rgb16;
+                    *(uint16_t *)(&rgb[pixel_ptr]) = rgb16;
                     pixel_ptr += 2;
                 }
             }
@@ -258,18 +297,21 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
 
 static void qtrle_decode_24bpp(QtrleContext *s, int row_ptr, int lines_to_change)
 {
-    int rle_code;
+    int rle_code, rle_code_half;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char r, g, b;
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t b;
+    uint16_t rg;
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 3;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
+            if (bytestream2_get_bytes_left(&s->g) < 1)
+                return;
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 3;
@@ -277,25 +319,31 @@ static void qtrle_decode_24bpp(QtrleContext *s, int row_ptr, int lines_to_change
             } else if (rle_code < 0) {
                 /* decode the run length code */
                 rle_code = -rle_code;
-                r = bytestream2_get_byte(&s->g);
-                g = bytestream2_get_byte(&s->g);
+                rg = bytestream2_get_ne16(&s->g);
                 b = bytestream2_get_byte(&s->g);
 
                 CHECK_PIXEL_PTR(rle_code * 3);
 
                 while (rle_code--) {
-                    rgb[pixel_ptr++] = r;
-                    rgb[pixel_ptr++] = g;
-                    rgb[pixel_ptr++] = b;
+                    AV_WN16A(rgb + pixel_ptr, rg);
+                    rgb[pixel_ptr + 2] = b;
+                    pixel_ptr += 3;
                 }
             } else {
                 CHECK_PIXEL_PTR(rle_code * 3);
 
-                /* copy pixels directly to output */
-                while (rle_code--) {
-                    rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
-                    rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
-                    rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
+                rle_code_half = rle_code / 2;
+
+                while (rle_code_half--) { /* copy 2 raw rgb value at the same time */
+                    AV_WN32A(rgb + pixel_ptr, bytestream2_get_ne32(&s->g)); /* rgbr */
+                    AV_WN16A(rgb + pixel_ptr + 4, bytestream2_get_ne16(&s->g)); /* rgbr */
+                    pixel_ptr += 6;
+                }
+
+                if (rle_code % 2 != 0){ /* not even raw value */
+                    AV_WN16A(rgb + pixel_ptr, bytestream2_get_ne16(&s->g));
+                    rgb[pixel_ptr + 2] = bytestream2_get_byte(&s->g);
+                    pixel_ptr += 3;
                 }
             }
         }
@@ -305,18 +353,20 @@ static void qtrle_decode_24bpp(QtrleContext *s, int row_ptr, int lines_to_change
 
 static void qtrle_decode_32bpp(QtrleContext *s, int row_ptr, int lines_to_change)
 {
-    int rle_code;
+    int rle_code, rle_code_half;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
     unsigned int argb;
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 4;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
+            if (bytestream2_get_bytes_left(&s->g) < 1)
+                return;
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 4;
@@ -324,7 +374,7 @@ static void qtrle_decode_32bpp(QtrleContext *s, int row_ptr, int lines_to_change
             } else if (rle_code < 0) {
                 /* decode the run length code */
                 rle_code = -rle_code;
-                argb = bytestream2_get_be32(&s->g);
+                argb = bytestream2_get_ne32(&s->g);
 
                 CHECK_PIXEL_PTR(rle_code * 4);
 
@@ -336,10 +386,15 @@ static void qtrle_decode_32bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 CHECK_PIXEL_PTR(rle_code * 4);
 
                 /* copy pixels directly to output */
-                while (rle_code--) {
-                    argb = bytestream2_get_be32(&s->g);
-                    AV_WN32A(rgb + pixel_ptr, argb);
-                    pixel_ptr  += 4;
+                rle_code_half = rle_code / 2;
+                while (rle_code_half--) { /* copy 2 argb raw value at the same time */
+                    AV_WN64A(rgb + pixel_ptr, bytestream2_get_ne64(&s->g));
+                    pixel_ptr += 8;
+                }
+
+                if (rle_code % 2 != 0){ /* not even raw value */
+                    AV_WN32A(rgb + pixel_ptr, bytestream2_get_ne32(&s->g));
+                    pixel_ptr += 4;
                 }
             }
         }
@@ -354,13 +409,10 @@ static av_cold int qtrle_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     switch (avctx->bits_per_coded_sample) {
     case 1:
-    case 33:
-        avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-        break;
-
     case 2:
     case 4:
     case 8:
+    case 33:
     case 34:
     case 36:
     case 40:
@@ -376,7 +428,7 @@ static av_cold int qtrle_decode_init(AVCodecContext *avctx)
         break;
 
     case 32:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_ARGB;
         break;
 
     default:
@@ -403,14 +455,10 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     int ret;
 
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log (s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-        return ret;
-    }
 
     /* check if this frame is even supposed to change */
     if (avpkt->size < 8)
-        goto done;
+        return avpkt->size;
 
     /* start after the chunk size */
     bytestream2_seek(&s->g, 4, SEEK_SET);
@@ -421,21 +469,27 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     /* if a header is present, fetch additional decoding parameters */
     if (header & 0x0008) {
         if (avpkt->size < 14)
-            goto done;
+            return avpkt->size;
         start_line = bytestream2_get_be16(&s->g);
         bytestream2_skip(&s->g, 2);
         height     = bytestream2_get_be16(&s->g);
         bytestream2_skip(&s->g, 2);
+        if (height > s->avctx->height - start_line)
+            return avpkt->size;
     } else {
         start_line = 0;
         height     = s->avctx->height;
     }
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+        return ret;
+
     row_ptr = s->frame->linesize[0] * start_line;
 
     switch (avctx->bits_per_coded_sample) {
     case 1:
     case 33:
         qtrle_decode_1bpp(s, row_ptr, height);
+        has_palette = 1;
         break;
 
     case 2:
@@ -475,18 +529,20 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     }
 
     if(has_palette) {
-        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
 
-        if (pal) {
+        if (pal && size == AVPALETTE_SIZE) {
             s->frame->palette_has_changed = 1;
             memcpy(s->pal, pal, AVPALETTE_SIZE);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
 
         /* make the palette available on the way out */
         memcpy(s->frame->data[1], s->pal, AVPALETTE_SIZE);
     }
 
-done:
     if ((ret = av_frame_ref(data, s->frame)) < 0)
         return ret;
     *got_frame      = 1;
diff --git a/libavcodec/qtrleenc.c b/libavcodec/qtrleenc.c
index e7de209..cdd864b 100644
--- a/libavcodec/qtrleenc.c
+++ b/libavcodec/qtrleenc.c
@@ -5,20 +5,20 @@
  *
  * This file is based on flashsvenc.c.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,7 @@ typedef struct QtrleEncContext {
     int pixel_size;
     AVFrame *previous_frame;
     unsigned int max_buf_size;
+    int logical_width;
     /**
      * This array will contain at ith position the value of the best RLE code
      * if the line started at pixel i
@@ -78,11 +79,20 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
     QtrleEncContext *s = avctx->priv_data;
 
     if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0) {
-        return -1;
+        return AVERROR(EINVAL);
     }
     s->avctx=avctx;
+    s->logical_width=avctx->width;
 
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_GRAY8:
+        if (avctx->width % 4) {
+            av_log(avctx, AV_LOG_ERROR, "Width not being a multiple of 4 is not supported\n");
+            return AVERROR(EINVAL);
+        }
+        s->logical_width = avctx->width / 4;
+        s->pixel_size = 4;
+        break;
     case AV_PIX_FMT_RGB555BE:
         s->pixel_size = 2;
         break;
@@ -96,25 +106,25 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Unsupported colorspace.\n");
         break;
     }
-    avctx->bits_per_coded_sample = s->pixel_size*8;
+    avctx->bits_per_coded_sample = avctx->pix_fmt == AV_PIX_FMT_GRAY8 ? 40 : s->pixel_size*8;
 
-    s->rlecode_table = av_mallocz(s->avctx->width);
-    s->skip_table    = av_mallocz(s->avctx->width);
-    s->length_table  = av_mallocz((s->avctx->width + 1)*sizeof(int));
+    s->rlecode_table = av_mallocz(s->logical_width);
+    s->skip_table    = av_mallocz(s->logical_width);
+    s->length_table  = av_mallocz_array(s->logical_width + 1, sizeof(int));
     if (!s->skip_table || !s->length_table || !s->rlecode_table) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating memory.\n");
-        return -1;
+        return AVERROR(ENOMEM);
     }
     s->previous_frame = av_frame_alloc();
     if (!s->previous_frame) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating picture\n");
-        return -1;
+        return AVERROR(ENOMEM);
     }
 
-    s->max_buf_size = s->avctx->width*s->avctx->height*s->pixel_size*2 /* image base material */
-                      + 15                                           /* header + footer */
-                      + s->avctx->height*2                           /* skip code+rle end */
-                      + s->avctx->width/MAX_RLE_BULK + 1             /* rle codes */;
+    s->max_buf_size = s->logical_width*s->avctx->height*s->pixel_size*2 /* image base material */
+                      + 15                                            /* header + footer */
+                      + s->avctx->height*2                            /* skip code+rle end */
+                      + s->logical_width/MAX_RLE_BULK + 1             /* rle codes */;
 
     return 0;
 }
@@ -124,26 +134,26 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
  */
 static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, uint8_t **buf)
 {
-    int width=s->avctx->width;
+    int width=s->logical_width;
     int i;
     signed char rlecode;
 
-    /* We will use it to compute the best bulk copy sequence */
-    unsigned int bulkcount;
     /* This will be the number of pixels equal to the previous frame one's
      * starting from the ith pixel */
     unsigned int skipcount;
     /* This will be the number of consecutive equal pixels in the current
      * frame, starting from the ith one also */
-    unsigned int repeatcount;
+    unsigned int av_uninit(repeatcount);
 
     /* The cost of the three different possibilities */
-    int total_bulk_cost;
     int total_skip_cost;
     int total_repeat_cost;
 
-    int temp_cost;
-    int j;
+    int base_bulk_cost;
+    int lowest_bulk_cost;
+    int lowest_bulk_cost_index;
+    int sec_lowest_bulk_cost;
+    int sec_lowest_bulk_cost_index;
 
     uint8_t *this_line = p->               data[0] + line*p->               linesize[0] +
         (width - 1)*s->pixel_size;
@@ -153,8 +163,57 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
     s->length_table[width] = 0;
     skipcount = 0;
 
+    /* Initial values */
+    lowest_bulk_cost = INT_MAX / 2;
+    lowest_bulk_cost_index = width;
+    sec_lowest_bulk_cost = INT_MAX / 2;
+    sec_lowest_bulk_cost_index = width;
+
+    base_bulk_cost = 1 + s->pixel_size;
+
     for (i = width - 1; i >= 0; i--) {
 
+        int prev_bulk_cost;
+
+        /* If our lowest bulk cost index is too far away, replace it
+         * with the next lowest bulk cost */
+        if (FFMIN(width, i + MAX_RLE_BULK) < lowest_bulk_cost_index) {
+            lowest_bulk_cost = sec_lowest_bulk_cost;
+            lowest_bulk_cost_index = sec_lowest_bulk_cost_index;
+
+            sec_lowest_bulk_cost = INT_MAX / 2;
+            sec_lowest_bulk_cost_index = width;
+        }
+
+        /* Deal with the first pixel's bulk cost */
+        if (!i) {
+            base_bulk_cost++;
+            lowest_bulk_cost++;
+            sec_lowest_bulk_cost++;
+        }
+
+        /* Look at the bulk cost of the previous loop and see if it is
+         * a new lower bulk cost */
+        prev_bulk_cost = s->length_table[i + 1] + base_bulk_cost;
+        if (prev_bulk_cost <= sec_lowest_bulk_cost) {
+            /* If it's lower than the 2nd lowest, then it may be lower
+             * than the lowest */
+            if (prev_bulk_cost <= lowest_bulk_cost) {
+
+                /* If we have found a new lowest bulk cost,
+                 * then the 2nd lowest bulk cost is now farther than the
+                 * lowest bulk cost, and will never be used */
+                sec_lowest_bulk_cost = INT_MAX / 2;
+
+                lowest_bulk_cost = prev_bulk_cost;
+                lowest_bulk_cost_index = i + 1;
+            } else {
+                /* Then it must be the 2nd lowest bulk cost */
+                sec_lowest_bulk_cost = prev_bulk_cost;
+                sec_lowest_bulk_cost_index = i + 1;
+            }
+        }
+
         if (!s->key_frame && !memcmp(this_line, prev_line, s->pixel_size))
             skipcount = FFMIN(skipcount + 1, MAX_RLE_SKIP);
         else
@@ -190,26 +249,17 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
         }
         else {
             /* We cannot do neither skip nor repeat
-             * thus we search for the best bulk copy to do */
+             * thus we use the best bulk copy  */
 
-            int limit = FFMIN(width - i, MAX_RLE_BULK);
+            s->length_table[i]  = lowest_bulk_cost;
+            s->rlecode_table[i] = lowest_bulk_cost_index - i;
 
-            temp_cost = 1 + s->pixel_size + !i;
-            total_bulk_cost = INT_MAX;
-
-            for (j = 1; j <= limit; j++) {
-                if (s->length_table[i + j] + temp_cost < total_bulk_cost) {
-                    /* We have found a better bulk copy ... */
-                    total_bulk_cost = s->length_table[i + j] + temp_cost;
-                    bulkcount = j;
-                }
-                temp_cost += s->pixel_size;
-            }
-
-            s->length_table[i]  = total_bulk_cost;
-            s->rlecode_table[i] = bulkcount;
         }
 
+        /* These bulk costs increase every iteration */
+        lowest_bulk_cost += s->pixel_size;
+        sec_lowest_bulk_cost += s->pixel_size;
+
         this_line -= s->pixel_size;
         prev_line -= s->pixel_size;
     }
@@ -239,12 +289,28 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
         }
         else if (rlecode > 0) {
             /* bulk copy */
-            bytestream_put_buffer(buf, this_line + i*s->pixel_size, rlecode*s->pixel_size);
+            if (s->avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                int j;
+                // QT grayscale colorspace has 0=white and 255=black, we will
+                // ignore the palette that is included in the AVFrame because
+                // AV_PIX_FMT_GRAY8 has defined color mapping
+                for (j = 0; j < rlecode*s->pixel_size; ++j)
+                    bytestream_put_byte(buf, *(this_line + i*s->pixel_size + j) ^ 0xff);
+            } else {
+                bytestream_put_buffer(buf, this_line + i*s->pixel_size, rlecode*s->pixel_size);
+            }
             i += rlecode;
         }
         else {
             /* repeat the bits */
-            bytestream_put_buffer(buf, this_line + i*s->pixel_size, s->pixel_size);
+            if (s->avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                int j;
+                // QT grayscale colorspace has 0=white and 255=black, ...
+                for (j = 0; j < s->pixel_size; ++j)
+                    bytestream_put_byte(buf, *(this_line + i*s->pixel_size + j) ^ 0xff);
+            } else {
+                bytestream_put_buffer(buf, this_line + i*s->pixel_size, s->pixel_size);
+            }
             i -= rlecode;
         }
     }
@@ -260,7 +326,7 @@ static int encode_frame(QtrleEncContext *s, const AVFrame *p, uint8_t *buf)
     uint8_t *orig_buf = buf;
 
     if (!s->key_frame) {
-        unsigned line_size = s->avctx->width * s->pixel_size;
+        unsigned line_size = s->logical_width * s->pixel_size;
         for (start_line = 0; start_line < s->avctx->height; start_line++)
             if (memcmp(p->data[0] + start_line*p->linesize[0],
                        s->previous_frame->data[0] + start_line * s->previous_frame->linesize[0],
@@ -297,28 +363,16 @@ static int qtrle_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                               const AVFrame *pict, int *got_packet)
 {
     QtrleEncContext * const s = avctx->priv_data;
-#if FF_API_CODED_FRAME
-    enum AVPictureType pict_type;
-#endif
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, s->max_buf_size)) < 0) {
-        /* Upper bound check for compressed data */
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", s->max_buf_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->max_buf_size, 0)) < 0)
         return ret;
-    }
 
     if (avctx->gop_size == 0 || (s->avctx->frame_number % avctx->gop_size) == 0) {
         /* I-Frame */
-#if FF_API_CODED_FRAME
-        pict_type = AV_PICTURE_TYPE_I;
-#endif
         s->key_frame = 1;
     } else {
         /* P-Frame */
-#if FF_API_CODED_FRAME
-        pict_type = AV_PICTURE_TYPE_P;
-#endif
         s->key_frame = 0;
     }
 
@@ -335,7 +389,7 @@ static int qtrle_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->key_frame = s->key_frame;
-    avctx->coded_frame->pict_type = pict_type;
+    avctx->coded_frame->pict_type = s->key_frame ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
@@ -356,6 +410,6 @@ AVCodec ff_qtrle_encoder = {
     .encode2        = qtrle_encode_frame,
     .close          = qtrle_encode_end,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB555BE, AV_PIX_FMT_ARGB, AV_PIX_FMT_NONE
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB555BE, AV_PIX_FMT_ARGB, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/r210dec.c b/libavcodec/r210dec.c
index cfe5484..407684c 100644
--- a/libavcodec/r210dec.c
+++ b/libavcodec/r210dec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Doeffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    avctx->pix_fmt             = AV_PIX_FMT_RGB48;
+    avctx->pix_fmt = AV_PIX_FMT_GBRP10;
     avctx->bits_per_raw_sample = 10;
 
     return 0;
@@ -39,8 +39,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int h, w, ret;
     AVFrame *pic = data;
     const uint32_t *src = (const uint32_t *)avpkt->data;
-    int aligned_width = FFALIGN(avctx->width, 64);
-    uint8_t *dst_line;
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64);
+    uint8_t *g_line, *b_line, *r_line;
+    int r10 = (avctx->codec_tag & 0xFFFFFF) == MKTAG('r', '1', '0', 0);
+    int le = avctx->codec_tag == MKTAG('R', '1', '0', 'k') &&
+             avctx->extradata_size >= 12 && !memcmp(&avctx->extradata[4], "DpxE", 4) &&
+             !avctx->extradata[11];
 
     if (avpkt->size < 4 * aligned_width * avctx->height) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
@@ -52,28 +57,43 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
-    dst_line = pic->data[0];
+    g_line = pic->data[0];
+    b_line = pic->data[1];
+    r_line = pic->data[2];
 
     for (h = 0; h < avctx->height; h++) {
-        uint16_t *dst = (uint16_t *)dst_line;
+        uint16_t *dstg = (uint16_t *)g_line;
+        uint16_t *dstb = (uint16_t *)b_line;
+        uint16_t *dstr = (uint16_t *)r_line;
         for (w = 0; w < avctx->width; w++) {
-            uint32_t pixel = av_be2ne32(*src++);
+            uint32_t pixel;
             uint16_t r, g, b;
-            if (avctx->codec_id==AV_CODEC_ID_R210) {
-                b =  pixel <<  6;
-                g = (pixel >>  4) & 0xffc0;
-                r = (pixel >> 14) & 0xffc0;
+            if (avctx->codec_id == AV_CODEC_ID_AVRP || r10 || le) {
+                pixel = av_le2ne32(*src++);
             } else {
-                b =  pixel <<  4;
-                g = (pixel >>  6) & 0xffc0;
-                r = (pixel >> 16) & 0xffc0;
+                pixel = av_be2ne32(*src++);
             }
-            *dst++ = r | (r >> 10);
-            *dst++ = g | (g >> 10);
-            *dst++ = b | (b >> 10);
+            if (avctx->codec_id == AV_CODEC_ID_R210) {
+                b =  pixel & 0x3ff;
+                g = (pixel >> 10) & 0x3ff;
+                r = (pixel >> 20) & 0x3ff;
+            } else if (r10) {
+                r =  pixel & 0x3ff;
+                g = (pixel >> 10) & 0x3ff;
+                b = (pixel >> 20) & 0x3ff;
+            } else {
+                b = (pixel >>  2) & 0x3ff;
+                g = (pixel >> 12) & 0x3ff;
+                r = (pixel >> 22) & 0x3ff;
+            }
+            *dstr++ = r;
+            *dstg++ = g;
+            *dstb++ = b;
         }
         src += aligned_width - avctx->width;
-        dst_line += pic->linesize[0];
+        g_line += pic->linesize[0];
+        b_line += pic->linesize[1];
+        r_line += pic->linesize[2];
     }
 
     *got_frame      = 1;
@@ -105,3 +125,15 @@ AVCodec ff_r10k_decoder = {
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
 #endif
+#if CONFIG_AVRP_DECODER
+AVCodec ff_avrp_decoder = {
+    .name           = "avrp",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRP,
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
+#endif
diff --git a/libavcodec/r210enc.c b/libavcodec/r210enc.c
new file mode 100644
index 0000000..02412f3
--- /dev/null
+++ b/libavcodec/r210enc.c
@@ -0,0 +1,123 @@
+/*
+ * R210 encoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64);
+
+    avctx->bits_per_coded_sample = 32;
+    if (avctx->width > 0)
+        avctx->bit_rate = ff_guess_coded_bitrate(avctx) * aligned_width / avctx->width;
+
+    return 0;
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pic, int *got_packet)
+{
+    int i, j, ret;
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64);
+    int pad = (aligned_width - avctx->width) * 4;
+    uint8_t *srcr_line, *srcg_line, *srcb_line;
+    uint8_t *dst;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 4 * aligned_width * avctx->height, 0)) < 0)
+        return ret;
+
+    srcg_line = pic->data[0];
+    srcb_line = pic->data[1];
+    srcr_line = pic->data[2];
+    dst = pkt->data;
+
+    for (i = 0; i < avctx->height; i++) {
+        uint16_t *srcr = (uint16_t *)srcr_line;
+        uint16_t *srcg = (uint16_t *)srcg_line;
+        uint16_t *srcb = (uint16_t *)srcb_line;
+        for (j = 0; j < avctx->width; j++) {
+            uint32_t pixel;
+            uint16_t r = *srcr++;
+            uint16_t g = *srcg++;
+            uint16_t b = *srcb++;
+            if (avctx->codec_id == AV_CODEC_ID_R210)
+                pixel = (r << 20) | (g << 10) | b;
+            else
+                pixel = (r << 22) | (g << 12) | (b << 2);
+            if (avctx->codec_id == AV_CODEC_ID_AVRP)
+                bytestream_put_le32(&dst, pixel);
+            else
+                bytestream_put_be32(&dst, pixel);
+        }
+        memset(dst, 0, pad);
+        dst += pad;
+        srcr_line += pic->linesize[2];
+        srcg_line += pic->linesize[0];
+        srcb_line += pic->linesize[1];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+
+#if CONFIG_R210_ENCODER
+AVCodec ff_r210_encoder = {
+    .name           = "r210",
+    .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed RGB 10-bit"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_R210,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_GBRP10, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
+#if CONFIG_R10K_ENCODER
+AVCodec ff_r10k_encoder = {
+    .name           = "r10k",
+    .long_name      = NULL_IF_CONFIG_SMALL("AJA Kona 10-bit RGB Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_R10K,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_GBRP10, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
+#if CONFIG_AVRP_ENCODER
+AVCodec ff_avrp_encoder = {
+    .name           = "avrp",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRP,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_GBRP10, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
diff --git a/libavcodec/ra144.c b/libavcodec/ra144.c
index ccaa149..65a7448 100644
--- a/libavcodec/ra144.c
+++ b/libavcodec/ra144.c
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1512,11 +1512,11 @@ static void add_wav(int16_t *dest, int n, int skip_first, int *m,
 
     v[0] = 0;
     for (i=!skip_first; i<3; i++)
-        v[i] = (ff_gain_val_tab[n][i] * m[i]) >> ff_gain_exp_tab[n];
+        v[i] = (ff_gain_val_tab[n][i] * (unsigned)m[i]) >> ff_gain_exp_tab[n];
 
     if (v[0]) {
         for (i=0; i < BLOCKSIZE; i++)
-            dest[i] = (s1[i]*v[0] + s2[i]*v[1] + s3[i]*v[2]) >> 12;
+            dest[i] = (int)((s1[i]*(unsigned)v[0]) + s2[i]*v[1] + s3[i]*v[2]) >> 12;
     } else {
         for (i=0; i < BLOCKSIZE; i++)
             dest[i] = (             s2[i]*v[1] + s3[i]*v[2]) >> 12;
@@ -1566,8 +1566,15 @@ int ff_eval_refl(int *refl, const int16_t *coefs, AVCodecContext *avctx)
         if (!b)
             b = -2;
 
-        for (j=0; j <= i; j++)
-            bp1[j] = ((bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12)) * (0x1000000 / b)) >> 12;
+        b = 0x1000000 / b;
+        for (j=0; j <= i; j++) {
+#if CONFIG_FTRAPV
+            int a = bp2[j] - ((int)(refl[i+1] * (unsigned)bp2[i-j]) >> 12);
+            if((int)(a*(unsigned)b) != a*(int64_t)b)
+                return 1;
+#endif
+            bp1[j] = (int)((bp2[j] - ((int)(refl[i+1] * (unsigned)bp2[i-j]) >> 12)) * (unsigned)b) >> 12;
+        }
 
         if ((unsigned) bp1[i] + 0x1000 > 0x1fff)
             return 1;
@@ -1591,10 +1598,10 @@ void ff_eval_coefs(int *coefs, const int *refl)
     int i, j;
 
     for (i=0; i < LPC_ORDER; i++) {
-        b1[i] = refl[i] << 4;
+        b1[i] = refl[i] * 16;
 
         for (j=0; j < i; j++)
-            b1[j] = ((refl[i] * b2[i-j-1]) >> 12) + b2[j];
+            b1[j] = ((int)(refl[i] * (unsigned)b2[i-j-1]) >> 12) + b2[j];
 
         FFSWAP(int *, b1, b2);
     }
@@ -1674,12 +1681,9 @@ unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy)
 }
 
 /** inverse root mean square */
-int ff_irms(const int16_t *data)
+int ff_irms(AudioDSPContext *adsp, const int16_t *data)
 {
-    unsigned int i, sum = 0;
-
-    for (i=0; i < BLOCKSIZE; i++)
-        sum += data[i] * data[i];
+    unsigned int sum = adsp->scalarproduct_int16(data, data, BLOCKSIZE);
 
     if (sum == 0)
         return 0; /* OOPS - division by zero */
@@ -1687,18 +1691,17 @@ int ff_irms(const int16_t *data)
     return 0x20000000 / (ff_t_sqrt(sum) >> 8);
 }
 
-void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
+void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs,
                            int cba_idx, int cb1_idx, int cb2_idx,
                            int gval, int gain)
 {
-    uint16_t buffer_a[BLOCKSIZE];
-    uint16_t *block;
+    int16_t *block;
     int m[3];
 
     if (cba_idx) {
         cba_idx += BLOCKSIZE/2 - 1;
-        ff_copy_and_dup(buffer_a, ractx->adapt_cb, cba_idx);
-        m[0] = (ff_irms(buffer_a) * gval) >> 12;
+        ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx);
+        m[0] = (ff_irms(&ractx->adsp, ractx->buffer_a) * (unsigned)gval) >> 12;
     } else {
         m[0] = 0;
     }
@@ -1709,7 +1712,7 @@ void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
 
     block = ractx->adapt_cb + BUFFERSIZE - BLOCKSIZE;
 
-    add_wav(block, gain, cba_idx, m, cba_idx? buffer_a: NULL,
+    add_wav(block, gain, cba_idx, m, cba_idx? ractx->buffer_a: NULL,
             ff_cb1_vects[cb1_idx], ff_cb2_vects[cb2_idx]);
 
     memcpy(ractx->curr_sblock, ractx->curr_sblock + BLOCKSIZE,
diff --git a/libavcodec/ra144.h b/libavcodec/ra144.h
index 89d4fb5..19a4ce0 100644
--- a/libavcodec/ra144.h
+++ b/libavcodec/ra144.h
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,16 +25,18 @@
 #include <stdint.h>
 #include "lpc.h"
 #include "audio_frame_queue.h"
+#include "audiodsp.h"
 
 #define NBLOCKS         4       ///< number of subblocks within a block
 #define BLOCKSIZE       40      ///< subblock size in 16-bit words
 #define BUFFERSIZE      146     ///< the size of the adaptive codebook
 #define FIXED_CB_SIZE   128     ///< size of fixed codebooks
-#define FRAMESIZE       20      ///< size of encoded frame
+#define FRAME_SIZE      20      ///< size of encoded frame
 #define LPC_ORDER       10      ///< order of LPC filter
 
 typedef struct RA144Context {
     AVCodecContext *avctx;
+    AudioDSPContext adsp;
     LPCContext lpc_ctx;
     AudioFrameQueue afq;
     int last_frame;
@@ -56,7 +58,9 @@ typedef struct RA144Context {
 
     /** Adaptive codebook, its size is two units bigger to avoid a
      *  buffer overflow. */
-    uint16_t adapt_cb[146+2];
+    int16_t adapt_cb[146+2];
+
+    DECLARE_ALIGNED(16, int16_t, buffer_a)[FFALIGN(BLOCKSIZE,16)];
 } RA144Context;
 
 void ff_copy_and_dup(int16_t *target, const int16_t *source, int offset);
@@ -68,8 +72,8 @@ unsigned int ff_rms(const int *data);
 int ff_interp(RA144Context *ractx, int16_t *out, int a, int copyold,
               int energy);
 unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy);
-int ff_irms(const int16_t *data);
-void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
+int ff_irms(AudioDSPContext *adsp, const int16_t *data/*align 16*/);
+void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs,
                            int cba_idx, int cb1_idx, int cb2_idx,
                            int gval, int gain);
 
diff --git a/libavcodec/ra144dec.c b/libavcodec/ra144dec.c
index 05165bf..c716c32 100644
--- a/libavcodec/ra144dec.c
+++ b/libavcodec/ra144dec.c
@@ -5,27 +5,26 @@
  * Copyright (c) 2003 Nick Kurshev
  *     Based on public domain decoder at http://www.honeypot.net/audio
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/channel_layout.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "ra144.h"
 
@@ -35,6 +34,7 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx)
     RA144Context *ractx = avctx->priv_data;
 
     ractx->avctx = avctx;
+    ff_audiodsp_init(&ractx->adsp);
 
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
     ractx->lpc_coef[1] = ractx->lpc_tables[1];
@@ -46,13 +46,13 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
-static void do_output_subblock(RA144Context *ractx, const uint16_t  *lpc_coefs,
-                               int gval, BitstreamContext *bc)
+static void do_output_subblock(RA144Context *ractx, const int16_t  *lpc_coefs,
+                               int gval, GetBitContext *gb)
 {
-    int cba_idx = bitstream_read(bc, 7); // index of the adaptive CB, 0 if none
-    int gain    = bitstream_read(bc, 8);
-    int cb1_idx = bitstream_read(bc, 7);
-    int cb2_idx = bitstream_read(bc, 7);
+    int cba_idx = get_bits(gb, 7); // index of the adaptive CB, 0 if none
+    int gain    = get_bits(gb, 8);
+    int cb1_idx = get_bits(gb, 7);
+    int cb2_idx = get_bits(gb, 7);
 
     ff_subblock_synthesis(ractx, lpc_coefs, cba_idx, cb1_idx, cb2_idx, gval,
                           gain);
@@ -67,7 +67,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     int buf_size = avpkt->size;
     static const uint8_t sizes[LPC_ORDER] = {6, 5, 5, 4, 4, 3, 3, 3, 3, 2};
     unsigned int refl_rms[NBLOCKS];           // RMS of the reflection coefficients
-    uint16_t block_coefs[NBLOCKS][LPC_ORDER]; // LPC coefficients of each sub-block
+    int16_t block_coefs[NBLOCKS][LPC_ORDER];  // LPC coefficients of each sub-block
     unsigned int lpc_refl[LPC_ORDER];         // LPC reflection coefficients of the frame
     int i, j;
     int ret;
@@ -75,9 +75,9 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     unsigned int energy;
 
     RA144Context *ractx = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
 
-    if (buf_size < FRAMESIZE) {
+    if (buf_size < FRAME_SIZE) {
         av_log(avctx, AV_LOG_ERROR,
                "Frame too small (%d bytes). Truncated file?\n", buf_size);
         *got_frame_ptr = 0;
@@ -86,21 +86,19 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = NBLOCKS * BLOCKSIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
-    bitstream_init8(&bc, buf, FRAMESIZE);
+    init_get_bits8(&gb, buf, FRAME_SIZE);
 
     for (i = 0; i < LPC_ORDER; i++)
-        lpc_refl[i] = ff_lpc_refl_cb[i][bitstream_read(&bc, sizes[i])];
+        lpc_refl[i] = ff_lpc_refl_cb[i][get_bits(&gb, sizes[i])];
 
     ff_eval_coefs(ractx->lpc_coef[0], lpc_refl);
     ractx->lpc_refl_rms[0] = ff_rms(lpc_refl);
 
-    energy = ff_energy_tab[bitstream_read(&bc, 5)];
+    energy = ff_energy_tab[get_bits(&gb, 5)];
 
     refl_rms[0] = ff_interp(ractx, block_coefs[0], 1, 1, ractx->old_energy);
     refl_rms[1] = ff_interp(ractx, block_coefs[1], 2,
@@ -112,10 +110,10 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     ff_int_to_int16(block_coefs[3], ractx->lpc_coef[0]);
 
     for (i=0; i < NBLOCKS; i++) {
-        do_output_subblock(ractx, block_coefs[i], refl_rms[i], &bc);
+        do_output_subblock(ractx, block_coefs[i], refl_rms[i], &gb);
 
         for (j=0; j < BLOCKSIZE; j++)
-            *samples++ = av_clip_int16(ractx->curr_sblock[j + 10] << 2);
+            *samples++ = av_clip_int16(ractx->curr_sblock[j + 10] * (1 << 2));
     }
 
     ractx->old_energy = energy;
@@ -125,7 +123,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return FRAMESIZE;
+    return FRAME_SIZE;
 }
 
 AVCodec ff_ra_144_decoder = {
diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c
index 678e668..cc4f381 100644
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K) encoder
  * Copyright (c) 2010 Francesco Lavra <francescolavra@interfree.it>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,6 @@
 #include "put_bits.h"
 #include "ra144.h"
 
-
 static av_cold int ra144_encode_close(AVCodecContext *avctx)
 {
     RA144Context *ractx = avctx->priv_data;
@@ -62,6 +61,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
     ractx->lpc_coef[1] = ractx->lpc_tables[1];
     ractx->avctx = avctx;
+    ff_audiodsp_init(&ractx->adsp);
     ret = ff_lpc_init(&ractx->lpc_ctx, avctx->frame_size, LPC_ORDER,
                       FF_LPC_TYPE_LEVINSON);
     if (ret < 0)
@@ -198,8 +198,8 @@ static void create_adapt_vect(float *vect, const int16_t *cb, int lag)
 static int adaptive_cb_search(const int16_t *adapt_cb, float *work,
                               const float *coefs, float *data)
 {
-    int i, best_vect;
-    float score, gain, best_score, best_gain;
+    int i, av_uninit(best_vect);
+    float score, gain, best_score, av_uninit(best_gain);
     float exc[BLOCKSIZE];
 
     gain = best_score = 0;
@@ -335,9 +335,9 @@ static void ra144_encode_subblock(RA144Context *ractx,
     float data[BLOCKSIZE] = { 0 }, work[LPC_ORDER + BLOCKSIZE];
     float coefs[LPC_ORDER];
     float zero[BLOCKSIZE], cba[BLOCKSIZE], cb1[BLOCKSIZE], cb2[BLOCKSIZE];
-    int16_t cba_vect[BLOCKSIZE];
     int cba_idx, cb1_idx, cb2_idx, gain;
-    int i, n, m[3];
+    int i, n;
+    unsigned m[3];
     float g[3];
     float error, best_error;
 
@@ -373,8 +373,8 @@ static void ra144_encode_subblock(RA144Context *ractx,
          */
         memcpy(cba, work + LPC_ORDER, sizeof(cba));
 
-        ff_copy_and_dup(cba_vect, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1);
-        m[0] = (ff_irms(cba_vect) * rms) >> 12;
+        ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1);
+        m[0] = (ff_irms(&ractx->adsp, ractx->buffer_a) * rms) >> 12;
     }
     fixed_cb_search(work + LPC_ORDER, coefs, data, cba_idx, &cb1_idx, &cb2_idx);
     for (i = 0; i < BLOCKSIZE; i++) {
@@ -447,10 +447,8 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     if (ractx->last_frame)
         return 0;
 
-    if ((ret = ff_alloc_packet(avpkt, FRAMESIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     /**
      * Since the LPC coefficients are calculated on a frame centered over the
@@ -477,7 +475,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     ff_lpc_calc_coefs(&ractx->lpc_ctx, lpc_data, NBLOCKS * BLOCKSIZE, LPC_ORDER,
                       LPC_ORDER, 16, lpc_coefs, shift, FF_LPC_TYPE_LEVINSON,
-                      0, ORDER_METHOD_EST, 12, 0);
+                      0, ORDER_METHOD_EST, 0, 12, 0);
     for (i = 0; i < LPC_ORDER; i++)
         block_coefs[NBLOCKS - 1][i] = -(lpc_coefs[LPC_ORDER - 1][i] <<
                                         (12 - shift[LPC_ORDER - 1]));
@@ -538,7 +536,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ff_af_queue_remove(&ractx->afq, avctx->frame_size, &avpkt->pts,
                        &avpkt->duration);
 
-    avpkt->size = FRAMESIZE;
+    avpkt->size = FRAME_SIZE;
     *got_packet_ptr = 1;
     return 0;
 }
@@ -556,4 +554,6 @@ AVCodec ff_ra_144_encoder = {
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){ 8000, 0 },
+    .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
 };
diff --git a/libavcodec/ra288.c b/libavcodec/ra288.c
index c457d0c..f1b3c8e 100644
--- a/libavcodec/ra288.c
+++ b/libavcodec/ra288.c
@@ -2,20 +2,20 @@
  * RealAudio 2.0 (28.8K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,8 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "celp_filters.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "lpc.h"
 #include "ra288.h"
@@ -39,7 +39,7 @@
 #define RA288_BLOCKS_PER_FRAME 32
 
 typedef struct RA288Context {
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     DECLARE_ALIGNED(32, float,   sp_lpc)[FFALIGN(36, 16)];   ///< LPC coefficients for speech data (spec: A)
     DECLARE_ALIGNED(32, float, gain_lpc)[FFALIGN(10, 16)];   ///< LPC coefficients for gain        (spec: GB)
 
@@ -60,6 +60,15 @@ typedef struct RA288Context {
     float gain_rec[11];
 } RA288Context;
 
+static av_cold int ra288_decode_close(AVCodecContext *avctx)
+{
+    RA288Context *ractx = avctx->priv_data;
+
+    av_freep(&ractx->fdsp);
+
+    return 0;
+}
+
 static av_cold int ra288_decode_init(AVCodecContext *avctx)
 {
     RA288Context *ractx = avctx->priv_data;
@@ -68,7 +77,14 @@ static av_cold int ra288_decode_init(AVCodecContext *avctx)
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
-    avpriv_float_dsp_init(&ractx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (avctx->block_align <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported block align\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ractx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!ractx->fdsp)
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -105,14 +121,14 @@ static void decode(RA288Context *ractx, float gain, int cb_coef)
     for (i=0; i < 5; i++)
         buffer[i] = codetable[cb_coef][i] * sumsum;
 
-    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5) * ((1 << 24) / 5.0);
+    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5);
 
-    sum = FFMAX(sum, 1);
+    sum = FFMAX(sum, 5.0 / (1<<24));
 
     /* shift and store */
     memmove(gain_block, gain_block + 1, 9 * sizeof(*gain_block));
 
-    gain_block[9] = 10 * log10(sum) - 32;
+    gain_block[9] = 10 * log10(sum) + (10*log10(((1<<24)/5.)) - 32);
 
     ff_celp_lp_synthesis_filterf(block, ractx->sp_lpc, buffer, 5, 36);
 }
@@ -140,7 +156,9 @@ static void do_hybrid_window(RA288Context *ractx,
                                             MAX_BACKWARD_FILTER_LEN   +
                                             MAX_BACKWARD_FILTER_NONREC, 16)]);
 
-    ractx->fdsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
+    av_assert2(order>=0);
+
+    ractx->fdsp->vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
 
     convolve(buffer1, work + order    , n      , order);
     convolve(buffer2, work + order + n, non_rec, order);
@@ -167,7 +185,7 @@ static void backward_filter(RA288Context *ractx,
     do_hybrid_window(ractx, order, n, non_rec, temp, hist, rec, window);
 
     if (!compute_lpc_coefs(temp, order, lpc, 0, 1, 1))
-        ractx->fdsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
+        ractx->fdsp->vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
 
     memmove(hist, hist + n, move_size*sizeof(*hist));
 }
@@ -181,7 +199,7 @@ static int ra288_decode_frame(AVCodecContext * avctx, void *data,
     float *out;
     int i, ret;
     RA288Context *ractx = avctx->priv_data;
-    BitstreamContext bc;
+    GetBitContext gb;
 
     if (buf_size < avctx->block_align) {
         av_log(avctx, AV_LOG_ERROR,
@@ -190,19 +208,19 @@ static int ra288_decode_frame(AVCodecContext * avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    ret = init_get_bits8(&gb, buf, avctx->block_align);
+    if (ret < 0)
+        return ret;
+
     /* get output buffer */
     frame->nb_samples = RA288_BLOCK_SIZE * RA288_BLOCKS_PER_FRAME;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out = (float *)frame->data[0];
 
-    bitstream_init8(&bc, buf, avctx->block_align);
-
     for (i=0; i < RA288_BLOCKS_PER_FRAME; i++) {
-        float gain = amptable[bitstream_read(&bc, 3)];
-        int cb_coef = bitstream_read(&bc, 6 + (i & 1));
+        float gain = amptable[get_bits(&gb, 3)];
+        int cb_coef = get_bits(&gb, 6 + (i&1));
 
         decode(ractx, gain, cb_coef);
 
@@ -231,5 +249,6 @@ AVCodec ff_ra_288_decoder = {
     .priv_data_size = sizeof(RA288Context),
     .init           = ra288_decode_init,
     .decode         = ra288_decode_frame,
+    .close          = ra288_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/ra288.h b/libavcodec/ra288.h
index 3d6ed8d..fa0b528 100644
--- a/libavcodec/ra288.h
+++ b/libavcodec/ra288.h
@@ -2,20 +2,20 @@
  * RealAudio 2.0 (28.8K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ralf.c b/libavcodec/ralf.c
index 1003b10..3f7953c 100644
--- a/libavcodec/ralf.c
+++ b/libavcodec/ralf.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,13 +28,11 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/channel_layout.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "unary.h"
-#include "vlc.h"
 #include "ralfdata.h"
 
 #define FILTER_NONE 0
@@ -212,21 +210,21 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static inline int extend_code(BitstreamContext *bc, int val, int range, int bits)
+static inline int extend_code(GetBitContext *gb, int val, int range, int bits)
 {
     if (val == 0) {
-        val = -range - get_ue_golomb(bc);
+        val = -range - get_ue_golomb(gb);
     } else if (val == range * 2) {
-        val =  range + get_ue_golomb(bc);
+        val =  range + get_ue_golomb(gb);
     } else {
         val -= range;
     }
     if (bits)
-        val = (val << bits) | bitstream_read(bc, bits);
+        val = (val << bits) | get_bits(gb, bits);
     return val;
 }
 
-static int decode_channel(RALFContext *ctx, BitstreamContext *bc, int ch,
+static int decode_channel(RALFContext *ctx, GetBitContext *gb, int ch,
                           int length, int mode, int bits)
 {
     int i, t;
@@ -235,19 +233,19 @@ static int decode_channel(RALFContext *ctx, BitstreamContext *bc, int ch,
     VLC *code_vlc; int range, range2, add_bits;
     int *dst = ctx->channel_data[ch];
 
-    ctx->filter_params = bitstream_read_vlc(bc, set->filter_params.table, 9, 2);
+    ctx->filter_params = get_vlc2(gb, set->filter_params.table, 9, 2);
     ctx->filter_bits   = (ctx->filter_params - 2) >> 6;
     ctx->filter_length = ctx->filter_params - (ctx->filter_bits << 6) - 1;
 
     if (ctx->filter_params == FILTER_RAW) {
         for (i = 0; i < length; i++)
-            dst[i] = bitstream_read(bc, bits);
+            dst[i] = get_bits(gb, bits);
         ctx->bias[ch] = 0;
         return 0;
     }
 
-    ctx->bias[ch] = bitstream_read_vlc(bc, set->bias.table, 9, 2);
-    ctx->bias[ch] = extend_code(bc, ctx->bias[ch], 127, 4);
+    ctx->bias[ch] = get_vlc2(gb, set->bias.table, 9, 2);
+    ctx->bias[ch] = extend_code(gb, ctx->bias[ch], 127, 4);
 
     if (ctx->filter_params == FILTER_NONE) {
         memset(dst, 0, sizeof(*dst) * length);
@@ -261,8 +259,8 @@ static int decode_channel(RALFContext *ctx, BitstreamContext *bc, int ch,
         add_bits = ctx->filter_bits;
 
         for (i = 0; i < ctx->filter_length; i++) {
-            t = bitstream_read_vlc(bc, vlc[cmode].table, vlc[cmode].bits, 2);
-            t = extend_code(bc, t, 21, add_bits);
+            t = get_vlc2(gb, vlc[cmode].table, vlc[cmode].bits, 2);
+            t = extend_code(gb, t, 21, add_bits);
             if (!cmode)
                 coeff -= 12 << add_bits;
             coeff = t - coeff;
@@ -281,7 +279,7 @@ static int decode_channel(RALFContext *ctx, BitstreamContext *bc, int ch,
         }
     }
 
-    code_params = bitstream_read_vlc(bc, set->coding_mode.table, set->coding_mode.bits, 2);
+    code_params = get_vlc2(gb, set->coding_mode.table, set->coding_mode.bits, 2);
     if (code_params >= 15) {
         add_bits = av_clip((code_params / 5 - 3) / 2, 0, 10);
         if (add_bits > 9 && (code_params % 5) != 2)
@@ -299,14 +297,14 @@ static int decode_channel(RALFContext *ctx, BitstreamContext *bc, int ch,
     for (i = 0; i < length; i += 2) {
         int code1, code2;
 
-        t = bitstream_read_vlc(bc, code_vlc->table, code_vlc->bits, 2);
+        t = get_vlc2(gb, code_vlc->table, code_vlc->bits, 2);
         code1 = t / range2;
         code2 = t % range2;
-        dst[i]     = extend_code(bc, code1, range, 0) << add_bits;
-        dst[i + 1] = extend_code(bc, code2, range, 0) << add_bits;
+        dst[i]     = extend_code(gb, code1, range, 0) << add_bits;
+        dst[i + 1] = extend_code(gb, code2, range, 0) << add_bits;
         if (add_bits) {
-            dst[i]     |= bitstream_read(bc, add_bits);
-            dst[i + 1] |= bitstream_read(bc, add_bits);
+            dst[i]     |= get_bits(gb, add_bits);
+            dst[i + 1] |= get_bits(gb, add_bits);
         }
     }
 
@@ -337,7 +335,7 @@ static void apply_lpc(RALFContext *ctx, int ch, int length, int bits)
     }
 }
 
-static int decode_block(AVCodecContext *avctx, BitstreamContext *bc,
+static int decode_block(AVCodecContext *avctx, GetBitContext *gb,
                         int16_t *dst0, int16_t *dst1)
 {
     RALFContext *ctx = avctx->priv_data;
@@ -346,7 +344,7 @@ static int decode_block(AVCodecContext *avctx, BitstreamContext *bc,
     int *ch0, *ch1;
     int i, t, t2;
 
-    len = 12 - get_unary(bc, 0, 6);
+    len = 12 - get_unary(gb, 0, 6);
 
     if (len <= 7) len ^= 1; // codes for length = 6 and 7 are swapped
     len = 1 << len;
@@ -358,7 +356,7 @@ static int decode_block(AVCodecContext *avctx, BitstreamContext *bc,
     }
 
     if (avctx->channels > 1)
-        dmode = bitstream_read(bc, 2) + 1;
+        dmode = get_bits(gb, 2) + 1;
     else
         dmode = 0;
 
@@ -368,13 +366,13 @@ static int decode_block(AVCodecContext *avctx, BitstreamContext *bc,
     bits[1] = (mode[1] == 2) ? 17 : 16;
 
     for (ch = 0; ch < avctx->channels; ch++) {
-        if ((ret = decode_channel(ctx, bc, ch, len, mode[ch], bits[ch])) < 0)
+        if ((ret = decode_channel(ctx, gb, ch, len, mode[ch], bits[ch])) < 0)
             return ret;
         if (ctx->filter_params > 1 && ctx->filter_params != FILTER_RAW) {
             ctx->filter_bits += 3;
             apply_lpc(ctx, ch, len, bits[ch]);
         }
-        if (bitstream_bits_left(bc) < 0)
+        if (get_bits_left(gb) < 0)
             return AVERROR_INVALIDDATA;
     }
     ch0 = ctx->channel_data[0];
@@ -428,7 +426,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     int16_t *samples0;
     int16_t *samples1;
     int ret;
-    BitstreamContext bc;
+    GetBitContext gb;
     int table_size, table_bytes, i;
     const uint8_t *src, *block_pointer;
     int src_size;
@@ -463,10 +461,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     }
 
     frame->nb_samples = ctx->max_frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Me fail get_buffer()? That's unpossible!\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples0 = (int16_t *)frame->data[0];
     samples1 = (int16_t *)frame->data[1];
 
@@ -480,12 +476,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
         av_log(avctx, AV_LOG_ERROR, "short packets are short!\n");
         return AVERROR_INVALIDDATA;
     }
-    bitstream_init(&bc, src + 2, table_size);
+    init_get_bits(&gb, src + 2, table_size);
     ctx->num_blocks = 0;
-    while (bitstream_bits_left(&bc) > 0) {
-        ctx->block_size[ctx->num_blocks] = bitstream_read(&bc, 15);
-        if (bitstream_read_bit(&bc)) {
-            ctx->block_pts[ctx->num_blocks] = bitstream_read(&bc, 9);
+    while (get_bits_left(&gb) > 0) {
+        ctx->block_size[ctx->num_blocks] = get_bits(&gb, 13 + avctx->channels);
+        if (get_bits1(&gb)) {
+            ctx->block_pts[ctx->num_blocks] = get_bits(&gb, 9);
         } else {
             ctx->block_pts[ctx->num_blocks] = 0;
         }
@@ -500,8 +496,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
             av_log(avctx, AV_LOG_ERROR, "I'm pedaling backwards\n");
             break;
         }
-        bitstream_init8(&bc, block_pointer, ctx->block_size[i]);
-        if (decode_block(avctx, &bc, samples0 + ctx->sample_offset,
+        init_get_bits(&gb, block_pointer, ctx->block_size[i] * 8);
+        if (decode_block(avctx, &gb, samples0 + ctx->sample_offset,
                                      samples1 + ctx->sample_offset) < 0) {
             av_log(avctx, AV_LOG_ERROR, "Sir, I got carsick in your office. Not decoding the rest of packet.\n");
             break;
diff --git a/libavcodec/ralfdata.h b/libavcodec/ralfdata.h
index 83eb970..9a84e45 100644
--- a/libavcodec/ralfdata.h
+++ b/libavcodec/ralfdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rangecoder.c b/libavcodec/rangecoder.c
index 4c4731d..a6a3f08 100644
--- a/libavcodec/rangecoder.c
+++ b/libavcodec/rangecoder.c
@@ -2,20 +2,20 @@
  * Range coder
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
@@ -57,6 +58,11 @@ av_cold void ff_init_range_decoder(RangeCoder *c, const uint8_t *buf,
 
     c->low         = AV_RB16(c->bytestream);
     c->bytestream += 2;
+    c->overread    = 0;
+    if (c->low >= 0xFF00) {
+        c->low = 0xFF00;
+        c->bytestream_end = c->bytestream;
+    }
 }
 
 void ff_build_rac_states(RangeCoder *c, int factor, int max_p)
@@ -100,16 +106,37 @@ void ff_build_rac_states(RangeCoder *c, int factor, int max_p)
 }
 
 /* Return the number of bytes written. */
-int ff_rac_terminate(RangeCoder *c)
+int ff_rac_terminate(RangeCoder *c, int version)
 {
+    if (version == 1)
+        put_rac(c, (uint8_t[]) { 129 }, 0);
     c->range = 0xFF;
     c->low  += 0xFF;
     renorm_encoder(c);
     c->range = 0xFF;
     renorm_encoder(c);
 
-    assert(c->low == 0);
-    assert(c->range >= 0x100);
+    av_assert1(c->low   == 0);
+    av_assert1(c->range >= 0x100);
 
     return c->bytestream - c->bytestream_start;
 }
+
+int ff_rac_check_termination(RangeCoder *c, int version)
+{
+    if (version == 1) {
+        RangeCoder tmp = *c;
+        get_rac(c, (uint8_t[]) { 129 });
+
+        if (c->bytestream == tmp.bytestream && c->bytestream > c->bytestream_start)
+            tmp.low -= *--tmp.bytestream;
+        tmp.bytestream_end = tmp.bytestream;
+
+        if (get_rac(&tmp, (uint8_t[]) { 129 }))
+            return AVERROR_INVALIDDATA;
+    } else {
+        if (c->bytestream_end != c->bytestream)
+            return AVERROR_INVALIDDATA;
+    }
+    return 0;
+}
diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
index 2ead446..4d4ca4d 100644
--- a/libavcodec/rangecoder.h
+++ b/libavcodec/rangecoder.h
@@ -2,20 +2,20 @@
  * Range coder
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #define AVCODEC_RANGECODER_H
 
 #include <stdint.h>
-#include <assert.h>
 
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
 
 typedef struct RangeCoder {
     int low;
@@ -42,11 +42,30 @@ typedef struct RangeCoder {
     uint8_t *bytestream_start;
     uint8_t *bytestream;
     uint8_t *bytestream_end;
+    int overread;
+#define MAX_OVERREAD 2
 } RangeCoder;
 
 void ff_init_range_encoder(RangeCoder *c, uint8_t *buf, int buf_size);
 void ff_init_range_decoder(RangeCoder *c, const uint8_t *buf, int buf_size);
-int ff_rac_terminate(RangeCoder *c);
+
+/**
+ * Terminates the range coder
+ * @param version version 0 requires the decoder to know the data size in bytes
+ *                version 1 needs about 1 bit more space but does not need to
+ *                          carry the size from encoder to decoder
+ */
+int ff_rac_terminate(RangeCoder *c, int version);
+
+/**
+ * Check if at the current position there is a valid looking termination
+ * @param version version 0 requires the decoder to know the data size in bytes
+ *                version 1 needs about 1 bit more space but does not need to
+ *                          carry the size from encoder to decoder
+ * @returns negative AVERROR code on error or non negative.
+ */
+int ff_rac_check_termination(RangeCoder *c, int version);
+
 void ff_build_rac_states(RangeCoder *c, int factor, int max_p);
 
 static inline void renorm_encoder(RangeCoder *c)
@@ -86,9 +105,9 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
 {
     int range1 = (c->range * (*state)) >> 8;
 
-    assert(*state);
-    assert(range1 < c->range);
-    assert(range1 > 0);
+    av_assert2(*state);
+    av_assert2(range1 < c->range);
+    av_assert2(range1 > 0);
     if (!bit) {
         c->range -= range1;
         *state    = c->zero_state[*state];
@@ -106,9 +125,11 @@ static inline void refill(RangeCoder *c)
     if (c->range < 0x100) {
         c->range <<= 8;
         c->low   <<= 8;
-        if (c->bytestream < c->bytestream_end)
+        if (c->bytestream < c->bytestream_end) {
             c->low += c->bytestream[0];
-        c->bytestream++;
+            c->bytestream++;
+        } else
+            c->overread ++;
     }
 }
 
diff --git a/libavcodec/rasc.c b/libavcodec/rasc.c
new file mode 100644
index 0000000..21fc43f
--- /dev/null
+++ b/libavcodec/rasc.c
@@ -0,0 +1,817 @@
+/*
+ * RemotelyAnywhere Screen Capture decoder
+ *
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+#include <zlib.h>
+
+#define KBND MKTAG('K', 'B', 'N', 'D')
+#define FINT MKTAG('F', 'I', 'N', 'T')
+#define INIT MKTAG('I', 'N', 'I', 'T')
+#define BNDL MKTAG('B', 'N', 'D', 'L')
+#define KFRM MKTAG('K', 'F', 'R', 'M')
+#define DLTA MKTAG('D', 'L', 'T', 'A')
+#define MOUS MKTAG('M', 'O', 'U', 'S')
+#define MPOS MKTAG('M', 'P', 'O', 'S')
+#define MOVE MKTAG('M', 'O', 'V', 'E')
+#define EMPT MKTAG('E', 'M', 'P', 'T')
+
+typedef struct RASCContext {
+    AVClass        *class;
+    int             skip_cursor;
+    GetByteContext  gb;
+    uint8_t        *delta;
+    int             delta_size;
+    uint8_t        *cursor;
+    int             cursor_size;
+    unsigned        cursor_w;
+    unsigned        cursor_h;
+    unsigned        cursor_x;
+    unsigned        cursor_y;
+    int             stride;
+    int             bpp;
+    z_stream        zstream;
+    AVFrame        *frame;
+    AVFrame        *frame1;
+    AVFrame        *frame2;
+} RASCContext;
+
+static void clear_plane(AVCodecContext *avctx, AVFrame *frame)
+{
+    RASCContext *s = avctx->priv_data;
+    uint8_t *dst = frame->data[0];
+
+    for (int y = 0; y < avctx->height; y++) {
+        memset(dst, 0, avctx->width * s->bpp);
+        dst += frame->linesize[0];
+    }
+}
+
+static void copy_plane(AVCodecContext *avctx, AVFrame *src, AVFrame *dst)
+{
+    RASCContext *s = avctx->priv_data;
+    uint8_t *srcp = src->data[0];
+    uint8_t *dstp = dst->data[0];
+
+    for (int y = 0; y < avctx->height; y++) {
+        memcpy(dstp, srcp, s->stride);
+        srcp += src->linesize[0];
+        dstp += dst->linesize[0];
+    }
+}
+
+static int init_frames(AVCodecContext *avctx)
+{
+    RASCContext *s = avctx->priv_data;
+    int ret;
+
+    av_frame_unref(s->frame1);
+    av_frame_unref(s->frame2);
+    if ((ret = ff_get_buffer(avctx, s->frame1, 0)) < 0)
+        return ret;
+
+    if ((ret = ff_get_buffer(avctx, s->frame2, 0)) < 0)
+        return ret;
+
+    clear_plane(avctx, s->frame2);
+    clear_plane(avctx, s->frame1);
+
+    return 0;
+}
+
+static int decode_fint(AVCodecContext *avctx,
+                       AVPacket *avpkt, unsigned size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    unsigned w, h, fmt;
+    int ret;
+
+    if (bytestream2_peek_le32(gb) != 0x65) {
+        if (!s->frame2->data[0] || !s->frame1->data[0])
+            return AVERROR_INVALIDDATA;
+
+        clear_plane(avctx, s->frame2);
+        clear_plane(avctx, s->frame1);
+        return 0;
+    }
+
+    bytestream2_skip(gb, 8);
+    w = bytestream2_get_le32(gb);
+    h = bytestream2_get_le32(gb);
+    bytestream2_skip(gb, 30);
+    fmt = bytestream2_get_le16(gb);
+    bytestream2_skip(gb, 24);
+
+    switch (fmt) {
+    case 8:  s->stride = FFALIGN(w, 4);
+             s->bpp    = 1;
+             fmt = AV_PIX_FMT_PAL8; break;
+    case 16: s->stride = w * 2;
+             s->bpp    = 2;
+             fmt = AV_PIX_FMT_RGB555LE; break;
+    case 32: s->stride = w * 4;
+             s->bpp    = 4;
+             fmt = AV_PIX_FMT_BGR0; break;
+    default: return AVERROR_INVALIDDATA;
+    }
+
+    ret = ff_set_dimensions(avctx, w, h);
+    if (ret < 0)
+        return ret;
+    avctx->width  = w;
+    avctx->height = h;
+    avctx->pix_fmt = fmt;
+
+    ret = init_frames(avctx);
+    if (ret < 0)
+        return ret;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        uint32_t *pal = (uint32_t *)s->frame2->data[1];
+
+        for (int i = 0; i < 256; i++)
+            pal[i] = bytestream2_get_le32(gb) | 0xFF000000u;
+    }
+
+    return 0;
+}
+
+static int decode_zlib(AVCodecContext *avctx, AVPacket *avpkt,
+                       unsigned size, unsigned uncompressed_size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    int zret;
+
+    zret = inflateReset(&s->zstream);
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", zret);
+        return AVERROR_EXTERNAL;
+    }
+
+    av_fast_padded_malloc(&s->delta, &s->delta_size, uncompressed_size);
+    if (!s->delta)
+        return AVERROR(ENOMEM);
+
+    s->zstream.next_in  = avpkt->data + bytestream2_tell(gb);
+    s->zstream.avail_in = FFMIN(size, bytestream2_get_bytes_left(gb));
+
+    s->zstream.next_out  = s->delta;
+    s->zstream.avail_out = s->delta_size;
+
+    zret = inflate(&s->zstream, Z_FINISH);
+    if (zret != Z_STREAM_END) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Inflate failed with return code: %d.\n", zret);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int decode_move(AVCodecContext *avctx,
+                       AVPacket *avpkt, unsigned size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    GetByteContext mc;
+    unsigned pos, compression, nb_moves;
+    unsigned uncompressed_size;
+    int ret;
+
+    pos = bytestream2_tell(gb);
+    bytestream2_skip(gb, 8);
+    nb_moves = bytestream2_get_le32(gb);
+    bytestream2_skip(gb, 8);
+    compression = bytestream2_get_le32(gb);
+
+    if (nb_moves > INT32_MAX / 16 || nb_moves > avctx->width * avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    uncompressed_size = 16 * nb_moves;
+
+    if (compression == 1) {
+        ret = decode_zlib(avctx, avpkt,
+                          size - (bytestream2_tell(gb) - pos),
+                          uncompressed_size);
+        if (ret < 0)
+            return ret;
+        bytestream2_init(&mc, s->delta, uncompressed_size);
+    } else if (compression == 0) {
+        bytestream2_init(&mc, avpkt->data + bytestream2_tell(gb),
+                         bytestream2_get_bytes_left(gb));
+    } else if (compression == 2) {
+        avpriv_request_sample(avctx, "compression %d", compression);
+        return AVERROR_PATCHWELCOME;
+    } else {
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (bytestream2_get_bytes_left(&mc) < uncompressed_size)
+        return AVERROR_INVALIDDATA;
+
+    for (int i = 0; i < nb_moves; i++) {
+        int type, start_x, start_y, end_x, end_y, mov_x, mov_y;
+        uint8_t *e2, *b1, *b2;
+        int w, h;
+
+        type = bytestream2_get_le16(&mc);
+        start_x = bytestream2_get_le16(&mc);
+        start_y = bytestream2_get_le16(&mc);
+        end_x = bytestream2_get_le16(&mc);
+        end_y = bytestream2_get_le16(&mc);
+        mov_x = bytestream2_get_le16(&mc);
+        mov_y = bytestream2_get_le16(&mc);
+        bytestream2_skip(&mc, 2);
+
+        if (start_x >= avctx->width || start_y >= avctx->height ||
+            end_x >= avctx->width || end_y >= avctx->height ||
+            mov_x >= avctx->width || mov_y >= avctx->height) {
+            continue;
+        }
+
+        if (start_x >= end_x || start_y >= end_y)
+            continue;
+
+        w = end_x - start_x;
+        h = end_y - start_y;
+
+        if (mov_x + w > avctx->width || mov_y + h > avctx->height)
+            continue;
+
+        if (!s->frame2->data[0] || !s->frame1->data[0])
+            return AVERROR_INVALIDDATA;
+
+        b1 = s->frame1->data[0] + s->frame1->linesize[0] * (start_y + h - 1) + start_x * s->bpp;
+        b2 = s->frame2->data[0] + s->frame2->linesize[0] * (start_y + h - 1) + start_x * s->bpp;
+        e2 = s->frame2->data[0] + s->frame2->linesize[0] * (mov_y + h - 1) + mov_x * s->bpp;
+
+        if (type == 2) {
+            for (int j = 0; j < h; j++) {
+                memcpy(b1, b2, w * s->bpp);
+                b1 -= s->frame1->linesize[0];
+                b2 -= s->frame2->linesize[0];
+            }
+        } else if (type == 1) {
+            for (int j = 0; j < h; j++) {
+                memset(b2, 0, w * s->bpp);
+                b2 -= s->frame2->linesize[0];
+            }
+        } else if (type == 0) {
+            uint8_t *buffer;
+
+            av_fast_padded_malloc(&s->delta, &s->delta_size, w * h * s->bpp);
+            buffer = s->delta;
+            if (!buffer)
+                return AVERROR(ENOMEM);
+
+            for (int j = 0; j < h; j++) {
+                memcpy(buffer + j * w * s->bpp, e2, w * s->bpp);
+                e2 -= s->frame2->linesize[0];
+            }
+
+            for (int j = 0; j < h; j++) {
+                memcpy(b2, buffer + j * w * s->bpp, w * s->bpp);
+                b2 -= s->frame2->linesize[0];
+            }
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    bytestream2_skip(gb, size - (bytestream2_tell(gb) - pos));
+
+    return 0;
+}
+
+#define NEXT_LINE                        \
+    if (cx >= w * s->bpp) {              \
+        cx = 0;                          \
+        cy--;                            \
+        b1 -= s->frame1->linesize[0];    \
+        b2 -= s->frame2->linesize[0];    \
+    }                                    \
+    len--;
+
+static int decode_dlta(AVCodecContext *avctx,
+                       AVPacket *avpkt, unsigned size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    GetByteContext dc;
+    unsigned uncompressed_size, pos;
+    unsigned x, y, w, h;
+    int ret, cx, cy, compression;
+    uint8_t *b1, *b2;
+
+    pos = bytestream2_tell(gb);
+    bytestream2_skip(gb, 12);
+    uncompressed_size = bytestream2_get_le32(gb);
+    x = bytestream2_get_le32(gb);
+    y = bytestream2_get_le32(gb);
+    w = bytestream2_get_le32(gb);
+    h = bytestream2_get_le32(gb);
+
+    if (x >= avctx->width || y >= avctx->height ||
+        w > avctx->width || h > avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    if (x + w > avctx->width || y + h > avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    bytestream2_skip(gb, 4);
+    compression = bytestream2_get_le32(gb);
+
+    if (compression == 1) {
+        if (w * h * s->bpp * 3 < uncompressed_size)
+            return AVERROR_INVALIDDATA;
+        ret = decode_zlib(avctx, avpkt, size, uncompressed_size);
+        if (ret < 0)
+            return ret;
+        bytestream2_init(&dc, s->delta, uncompressed_size);
+    } else if (compression == 0) {
+        if (bytestream2_get_bytes_left(gb) < uncompressed_size)
+            return AVERROR_INVALIDDATA;
+        bytestream2_init(&dc, avpkt->data + bytestream2_tell(gb),
+                         uncompressed_size);
+    } else if (compression == 2) {
+        avpriv_request_sample(avctx, "compression %d", compression);
+        return AVERROR_PATCHWELCOME;
+    } else {
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!s->frame2->data[0] || !s->frame1->data[0])
+        return AVERROR_INVALIDDATA;
+
+    b1  = s->frame1->data[0] + s->frame1->linesize[0] * (y + h - 1) + x * s->bpp;
+    b2  = s->frame2->data[0] + s->frame2->linesize[0] * (y + h - 1) + x * s->bpp;
+    cx = 0, cy = h;
+    while (bytestream2_get_bytes_left(&dc) > 0) {
+        int type = bytestream2_get_byte(&dc);
+        int len = bytestream2_get_byte(&dc);
+        unsigned fill;
+
+        switch (type) {
+        case 1:
+            while (len > 0 && cy > 0) {
+                cx++;
+                NEXT_LINE
+            }
+            break;
+        case 2:
+            while (len > 0 && cy > 0) {
+                int v0 = b1[cx];
+                int v1 = b2[cx];
+
+                b2[cx] = v0;
+                b1[cx] = v1;
+                cx++;
+                NEXT_LINE
+            }
+            break;
+        case 3:
+            while (len > 0 && cy > 0) {
+                fill = bytestream2_get_byte(&dc);
+                b1[cx] = b2[cx];
+                b2[cx] = fill;
+                cx++;
+                NEXT_LINE
+            }
+            break;
+        case 4:
+            fill = bytestream2_get_byte(&dc);
+            while (len > 0 && cy > 0) {
+                AV_WL32(b1 + cx, AV_RL32(b2 + cx));
+                AV_WL32(b2 + cx, fill);
+                cx++;
+                NEXT_LINE
+            }
+            break;
+        case 7:
+            fill = bytestream2_get_le32(&dc);
+            while (len > 0 && cy > 0) {
+                AV_WL32(b1 + cx, AV_RL32(b2 + cx));
+                AV_WL32(b2 + cx, fill);
+                cx += 4;
+                NEXT_LINE
+            }
+            break;
+        case 10:
+            while (len > 0 && cy > 0) {
+                cx += 4;
+                NEXT_LINE
+            }
+            break;
+        case 12:
+            while (len > 0 && cy > 0) {
+                unsigned v0, v1;
+
+                v0 = AV_RL32(b2 + cx);
+                v1 = AV_RL32(b1 + cx);
+                AV_WL32(b2 + cx, v1);
+                AV_WL32(b1 + cx, v0);
+                cx += 4;
+                NEXT_LINE
+            }
+            break;
+        case 13:
+            while (len > 0 && cy > 0) {
+                fill = bytestream2_get_le32(&dc);
+                AV_WL32(b1 + cx, AV_RL32(b2 + cx));
+                AV_WL32(b2 + cx, fill);
+                cx += 4;
+                NEXT_LINE
+            }
+            break;
+        default:
+            avpriv_request_sample(avctx, "runlen %d", type);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    bytestream2_skip(gb, size - (bytestream2_tell(gb) - pos));
+
+    return 0;
+}
+
+static int decode_kfrm(AVCodecContext *avctx,
+                       AVPacket *avpkt, unsigned size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    uint8_t *dst;
+    unsigned pos;
+    int zret, ret;
+
+    pos = bytestream2_tell(gb);
+    if (bytestream2_peek_le32(gb) == 0x65) {
+        ret = decode_fint(avctx, avpkt, size);
+        if (ret < 0)
+            return ret;
+    }
+
+    if (!s->frame2->data[0])
+        return AVERROR_INVALIDDATA;
+
+    zret = inflateReset(&s->zstream);
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", zret);
+        return AVERROR_EXTERNAL;
+    }
+
+    s->zstream.next_in  = avpkt->data + bytestream2_tell(gb);
+    s->zstream.avail_in = bytestream2_get_bytes_left(gb);
+
+    dst = s->frame2->data[0] + (avctx->height - 1) * s->frame2->linesize[0];
+    for (int i = 0; i < avctx->height; i++) {
+        s->zstream.next_out  = dst;
+        s->zstream.avail_out = s->stride;
+
+        zret = inflate(&s->zstream, Z_SYNC_FLUSH);
+        if (zret != Z_OK && zret != Z_STREAM_END) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Inflate failed with return code: %d.\n", zret);
+            return AVERROR_INVALIDDATA;
+        }
+
+        dst -= s->frame2->linesize[0];
+    }
+
+    dst = s->frame1->data[0] + (avctx->height - 1) * s->frame1->linesize[0];
+    for (int i = 0; i < avctx->height; i++) {
+        s->zstream.next_out  = dst;
+        s->zstream.avail_out = s->stride;
+
+        zret = inflate(&s->zstream, Z_SYNC_FLUSH);
+        if (zret != Z_OK && zret != Z_STREAM_END) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Inflate failed with return code: %d.\n", zret);
+            return AVERROR_INVALIDDATA;
+        }
+
+        dst -= s->frame1->linesize[0];
+    }
+
+    bytestream2_skip(gb, size - (bytestream2_tell(gb) - pos));
+
+    return 0;
+}
+
+static int decode_mous(AVCodecContext *avctx,
+                       AVPacket *avpkt, unsigned size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    unsigned w, h, pos, uncompressed_size;
+    int ret;
+
+    pos = bytestream2_tell(gb);
+    bytestream2_skip(gb, 8);
+    w = bytestream2_get_le32(gb);
+    h = bytestream2_get_le32(gb);
+    bytestream2_skip(gb, 12);
+    uncompressed_size = bytestream2_get_le32(gb);
+
+    if (w > avctx->width || h > avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    if (uncompressed_size != 3 * w * h)
+        return AVERROR_INVALIDDATA;
+
+    av_fast_padded_malloc(&s->cursor, &s->cursor_size, uncompressed_size);
+    if (!s->cursor)
+        return AVERROR(ENOMEM);
+
+    ret = decode_zlib(avctx, avpkt,
+                      size - (bytestream2_tell(gb) - pos),
+                      uncompressed_size);
+    if (ret < 0)
+        return ret;
+    memcpy(s->cursor, s->delta, uncompressed_size);
+
+    bytestream2_skip(gb, size - (bytestream2_tell(gb) - pos));
+
+    s->cursor_w = w;
+    s->cursor_h = h;
+
+    return 0;
+}
+
+static int decode_mpos(AVCodecContext *avctx,
+                       AVPacket *avpkt, unsigned size)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    unsigned pos;
+
+    pos = bytestream2_tell(gb);
+    bytestream2_skip(gb, 8);
+    s->cursor_x = bytestream2_get_le32(gb);
+    s->cursor_y = bytestream2_get_le32(gb);
+
+    bytestream2_skip(gb, size - (bytestream2_tell(gb) - pos));
+
+    return 0;
+}
+
+static void draw_cursor(AVCodecContext *avctx)
+{
+    RASCContext *s = avctx->priv_data;
+    uint8_t *dst, *pal;
+
+    if (!s->cursor)
+        return;
+
+    if (s->cursor_x >= avctx->width || s->cursor_y >= avctx->height)
+        return;
+
+    if (s->cursor_x + s->cursor_w > avctx->width ||
+        s->cursor_y + s->cursor_h > avctx->height)
+        return;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        pal = s->frame->data[1];
+        for (int i = 0; i < s->cursor_h; i++) {
+            for (int j = 0; j < s->cursor_w; j++) {
+                int cr = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 0];
+                int cg = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 1];
+                int cb = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 2];
+                int best = INT_MAX;
+                int index = 0;
+                int dist;
+
+                if (cr == s->cursor[0] && cg == s->cursor[1] && cb == s->cursor[2])
+                    continue;
+
+                dst = s->frame->data[0] + s->frame->linesize[0] * (s->cursor_y + i) + (s->cursor_x + j);
+                for (int k = 0; k < 256; k++) {
+                    int pr = pal[k * 4 + 0];
+                    int pg = pal[k * 4 + 1];
+                    int pb = pal[k * 4 + 2];
+
+                    dist = FFABS(cr - pr) + FFABS(cg - pg) + FFABS(cb - pb);
+                    if (dist < best) {
+                        best = dist;
+                        index = k;
+                    }
+                }
+                dst[0] = index;
+            }
+        }
+    } else if (avctx->pix_fmt == AV_PIX_FMT_RGB555LE) {
+        for (int i = 0; i < s->cursor_h; i++) {
+            for (int j = 0; j < s->cursor_w; j++) {
+                int cr = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 0];
+                int cg = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 1];
+                int cb = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 2];
+
+                if (cr == s->cursor[0] && cg == s->cursor[1] && cb == s->cursor[2])
+                    continue;
+
+                cr >>= 3; cg >>=3; cb >>= 3;
+                dst = s->frame->data[0] + s->frame->linesize[0] * (s->cursor_y + i) + 2 * (s->cursor_x + j);
+                AV_WL16(dst, cr | cg << 5 | cb << 10);
+            }
+        }
+    } else if (avctx->pix_fmt == AV_PIX_FMT_BGR0) {
+        for (int i = 0; i < s->cursor_h; i++) {
+            for (int j = 0; j < s->cursor_w; j++) {
+                int cr = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 0];
+                int cg = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 1];
+                int cb = s->cursor[3 * s->cursor_w * (s->cursor_h - i - 1) + 3 * j + 2];
+
+                if (cr == s->cursor[0] && cg == s->cursor[1] && cb == s->cursor[2])
+                    continue;
+
+                dst = s->frame->data[0] + s->frame->linesize[0] * (s->cursor_y + i) + 4 * (s->cursor_x + j);
+                dst[0] = cb;
+                dst[1] = cg;
+                dst[2] = cr;
+            }
+        }
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    RASCContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    int ret, intra = 0;
+    AVFrame *frame = data;
+
+    bytestream2_init(gb, avpkt->data, avpkt->size);
+
+    if (bytestream2_peek_le32(gb) == EMPT)
+        return avpkt->size;
+
+    s->frame = frame;
+
+    while (bytestream2_get_bytes_left(gb) > 0) {
+        unsigned type, size = 0;
+
+        if (bytestream2_get_bytes_left(gb) < 8)
+            return AVERROR_INVALIDDATA;
+
+        type = bytestream2_get_le32(gb);
+        if (type == KBND || type == BNDL) {
+            intra = type == KBND;
+            type = bytestream2_get_le32(gb);
+        }
+
+        size = bytestream2_get_le32(gb);
+        if (bytestream2_get_bytes_left(gb) < size)
+            return AVERROR_INVALIDDATA;
+
+        switch (type) {
+        case FINT:
+        case INIT:
+            ret = decode_fint(avctx, avpkt, size);
+            break;
+        case KFRM:
+            ret = decode_kfrm(avctx, avpkt, size);
+            break;
+        case DLTA:
+            ret = decode_dlta(avctx, avpkt, size);
+            break;
+        case MOVE:
+            ret = decode_move(avctx, avpkt, size);
+            break;
+        case MOUS:
+            ret = decode_mous(avctx, avpkt, size);
+            break;
+        case MPOS:
+            ret = decode_mpos(avctx, avpkt, size);
+            break;
+        default:
+            bytestream2_skip(gb, size);
+        }
+
+        if (ret < 0)
+            return ret;
+    }
+
+    if (!s->frame2->data[0] || !s->frame1->data[0])
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_get_buffer(avctx, s->frame, 0)) < 0)
+        return ret;
+
+    copy_plane(avctx, s->frame2, s->frame);
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(s->frame->data[1], s->frame2->data[1], 1024);
+    if (!s->skip_cursor)
+        draw_cursor(avctx);
+
+    s->frame->key_frame = intra;
+    s->frame->pict_type = intra ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    RASCContext *s = avctx->priv_data;
+    int zret;
+
+    s->zstream.zalloc = Z_NULL;
+    s->zstream.zfree = Z_NULL;
+    s->zstream.opaque = Z_NULL;
+    zret = inflateInit(&s->zstream);
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate init error: %d\n", zret);
+        return AVERROR_EXTERNAL;
+    }
+
+    s->frame1 = av_frame_alloc();
+    s->frame2 = av_frame_alloc();
+    if (!s->frame1 || !s->frame2)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    RASCContext *s = avctx->priv_data;
+
+    av_freep(&s->cursor);
+    s->cursor_size = 0;
+    av_freep(&s->delta);
+    s->delta_size = 0;
+    av_frame_free(&s->frame1);
+    av_frame_free(&s->frame2);
+    inflateEnd(&s->zstream);
+
+    return 0;
+}
+
+static void decode_flush(AVCodecContext *avctx)
+{
+    RASCContext *s = avctx->priv_data;
+
+    clear_plane(avctx, s->frame1);
+    clear_plane(avctx, s->frame2);
+}
+
+static const AVOption options[] = {
+{ "skip_cursor", "skip the cursor", offsetof(RASCContext, skip_cursor), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM },
+{ NULL },
+};
+
+static const AVClass rasc_decoder_class = {
+    .class_name = "rasc decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_rasc_decoder = {
+    .name             = "rasc",
+    .long_name        = NULL_IF_CONFIG_SMALL("RemotelyAnywhere Screen Capture"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_RASC,
+    .priv_data_size   = sizeof(RASCContext),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .flush            = decode_flush,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+    .priv_class       = &rasc_decoder_class,
+};
diff --git a/libavcodec/ratecontrol.c b/libavcodec/ratecontrol.c
index 097da2e..49d169b 100644
--- a/libavcodec/ratecontrol.c
+++ b/libavcodec/ratecontrol.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,12 +35,31 @@
 #include "mpegvideo.h"
 #include "libavutil/eval.h"
 
-#undef NDEBUG // Always check asserts, the speed effect is far too small to disable them.
-#include <assert.h>
+void ff_write_pass1_stats(MpegEncContext *s)
+{
+    snprintf(s->avctx->stats_out, 256,
+             "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d "
+             "fcode:%d bcode:%d mc-var:%"PRId64" var:%"PRId64" icount:%d skipcount:%d hbits:%d;\n",
+             s->current_picture_ptr->f->display_picture_number,
+             s->current_picture_ptr->f->coded_picture_number,
+             s->pict_type,
+             s->current_picture.f->quality,
+             s->i_tex_bits,
+             s->p_tex_bits,
+             s->mv_bits,
+             s->misc_bits,
+             s->f_code,
+             s->b_code,
+             s->current_picture.mc_mb_var_sum,
+             s->current_picture.mb_var_sum,
+             s->i_count, s->skip_count,
+             s->header_bits);
+}
 
-#ifndef M_E
-#define M_E 2.718281828
-#endif
+static double get_fps(AVCodecContext *avctx)
+{
+    return 1.0 / av_q2d(avctx->time_base) / FFMAX(avctx->ticks_per_frame, 1);
+}
 
 static inline double qp2bits(RateControlEntry *rce, double qp)
 {
@@ -102,7 +121,7 @@ static void get_qminmax(int *qmin_ret, int *qmax_ret, MpegEncContext *s, int pic
     int qmin = s->lmin;
     int qmax = s->lmax;
 
-    assert(qmin <= qmax);
+    av_assert0(qmin <= qmax);
 
     switch (pict_type) {
     case AV_PICTURE_TYPE_B:
@@ -130,7 +149,7 @@ static double modify_qscale(MpegEncContext *s, RateControlEntry *rce,
 {
     RateControlContext *rcc  = &s->rc_context;
     const double buffer_size = s->avctx->rc_buffer_size;
-    const double fps         = 1 / av_q2d(s->avctx->time_base);
+    const double fps         = get_fps(s->avctx);
     const double min_rate    = s->avctx->rc_min_rate / fps;
     const double max_rate    = s->avctx->rc_max_rate / fps;
     const int pict_type      = rce->new_pict_type;
@@ -294,7 +313,7 @@ static int init_pass2(MpegEncContext *s)
     RateControlContext *rcc = &s->rc_context;
     AVCodecContext *a       = s->avctx;
     int i, toobig;
-    double fps             = 1 / av_q2d(s->avctx->time_base);
+    double fps             = get_fps(s->avctx);
     double complexity[5]   = { 0 }; // approximate bits at quant=1
     uint64_t const_bits[5] = { 0 }; // quantizer independent bits
     uint64_t all_const_bits;
@@ -303,7 +322,7 @@ static int init_pass2(MpegEncContext *s)
     double rate_factor          = 0;
     double step;
     const int filter_size = (int)(a->qblur * 4) | 1;
-    double expected_bits;
+    double expected_bits = 0; // init to silence gcc warning
     double *qscale, *blurred_qscale, qscale_sum;
 
     /* find complexity & const_bits & decide the pict_types */
@@ -330,8 +349,8 @@ static int init_pass2(MpegEncContext *s)
         return -1;
     }
 
-    qscale         = av_malloc(sizeof(double) * rcc->num_entries);
-    blurred_qscale = av_malloc(sizeof(double) * rcc->num_entries);
+    qscale         = av_malloc_array(rcc->num_entries, sizeof(double));
+    blurred_qscale = av_malloc_array(rcc->num_entries, sizeof(double));
     if (!qscale || !blurred_qscale) {
         av_free(qscale);
         av_free(blurred_qscale);
@@ -352,9 +371,15 @@ static int init_pass2(MpegEncContext *s)
             qscale[i] = get_qscale(s, &rcc->entry[i], rate_factor, i);
             rcc->last_qscale_for[rce->pict_type] = qscale[i];
         }
-        assert(filter_size % 2 == 1);
+        av_assert0(filter_size % 2 == 1);
 
         /* fixed I/B QP relative to P mode */
+        for (i = FFMAX(0, rcc->num_entries - 300); i < rcc->num_entries; i++) {
+            RateControlEntry *rce = &rcc->entry[i];
+
+            qscale[i] = get_diff_limited_q(s, rce, qscale[i]);
+        }
+
         for (i = rcc->num_entries - 1; i >= 0; i--) {
             RateControlEntry *rce = &rcc->entry[i];
 
@@ -418,11 +443,11 @@ static int init_pass2(MpegEncContext *s)
         qscale_sum += av_clip(rcc->entry[i].new_qscale / FF_QP2LAMBDA,
                               s->avctx->qmin, s->avctx->qmax);
     }
-    assert(toobig <= 40);
+    av_assert0(toobig <= 40);
     av_log(s->avctx, AV_LOG_DEBUG,
-           "[lavc rc] requested bitrate: %d bps  expected bitrate: %d bps\n",
+           "[lavc rc] requested bitrate: %"PRId64" bps  expected bitrate: %"PRId64" bps\n",
            s->bit_rate,
-           (int)(expected_bits / ((double)all_available_bits / s->bit_rate)));
+           (int64_t)(expected_bits / ((double)all_available_bits / s->bit_rate)));
     av_log(s->avctx, AV_LOG_DEBUG,
            "[lavc rc] estimated target average qp: %.3f\n",
            (float)qscale_sum / rcc->num_entries);
@@ -483,6 +508,13 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
     };
     emms_c();
 
+    if (!s->avctx->rc_max_available_vbv_use && s->avctx->rc_buffer_size) {
+        if (s->avctx->rc_max_rate) {
+            s->avctx->rc_max_available_vbv_use = av_clipf(s->avctx->rc_max_rate/(s->avctx->rc_buffer_size*get_fps(s->avctx)), 1.0/3, 1.0);
+        } else
+            s->avctx->rc_max_available_vbv_use = 1.0;
+    }
+
     res = av_expr_parse(&rcc->rc_eq_eval,
                         s->rc_eq ? s->rc_eq : "tex^qComp",
                         const_names, func1_names, func1,
@@ -506,6 +538,8 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
         rcc->last_qscale_for[i] = FF_QP2LAMBDA * 5;
     }
     rcc->buffer_index = s->avctx->rc_initial_buffer_occupancy;
+    if (!rcc->buffer_index)
+        rcc->buffer_index = s->avctx->rc_buffer_size * 3 / 4;
 
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
         int i;
@@ -519,9 +553,9 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
         if (i <= 0 || i >= INT_MAX / sizeof(RateControlEntry))
             return -1;
         rcc->entry       = av_mallocz(i * sizeof(RateControlEntry));
-        rcc->num_entries = i;
         if (!rcc->entry)
             return AVERROR(ENOMEM);
+        rcc->num_entries = i;
 
         /* init all to skipped P-frames
          * (with B-frames we might have a not encoded frame at the end FIXME) */
@@ -549,11 +583,11 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
             }
             e = sscanf(p, " in:%d ", &picture_number);
 
-            assert(picture_number >= 0);
-            assert(picture_number < rcc->num_entries);
+            av_assert0(picture_number >= 0);
+            av_assert0(picture_number < rcc->num_entries);
             rce = &rcc->entry[picture_number];
 
-            e += sscanf(p, " in:%*d out:%*d type:%d q:%f itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d skipcount:%d hbits:%d",
+            e += sscanf(p, " in:%*d out:%*d type:%d q:%f itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%"SCNd64" var:%"SCNd64" icount:%d skipcount:%d hbits:%d",
                         &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits,
                         &rce->mv_bits, &rce->misc_bits,
                         &rce->f_code, &rce->b_code,
@@ -627,7 +661,7 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
                 get_qscale(s, &rce, rcc->pass1_wanted_bits / rcc->pass1_rc_eq_output_sum, i);
 
                 // FIXME misbehaves a little for variable fps
-                rcc->pass1_wanted_bits += s->bit_rate / (1 / av_q2d(s->avctx->time_base));
+                rcc->pass1_wanted_bits += s->bit_rate / get_fps(s->avctx);
             }
         }
     }
@@ -647,7 +681,7 @@ av_cold void ff_rate_control_uninit(MpegEncContext *s)
 int ff_vbv_update(MpegEncContext *s, int frame_size)
 {
     RateControlContext *rcc = &s->rc_context;
-    const double fps        = 1 / av_q2d(s->avctx->time_base);
+    const double fps        = get_fps(s->avctx);
     const int buffer_size   = s->avctx->rc_buffer_size;
     const double min_rate   = s->avctx->rc_min_rate / fps;
     const double max_rate   = s->avctx->rc_max_rate / fps;
@@ -661,6 +695,9 @@ int ff_vbv_update(MpegEncContext *s, int frame_size)
         rcc->buffer_index -= frame_size;
         if (rcc->buffer_index < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "rc buffer underflow\n");
+            if (frame_size > max_rate && s->qscale == s->avctx->qmax) {
+                av_log(s->avctx, AV_LOG_ERROR, "max bitrate possibly too small or try trellis with large lmax or increase qmax\n");
+            }
             rcc->buffer_index = 0;
         }
 
@@ -843,26 +880,32 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
     RateControlEntry local_rce, *rce;
     double bits;
     double rate_factor;
-    int var;
+    int64_t var;
     const int pict_type = s->pict_type;
     Picture * const pic = &s->current_picture;
     emms_c();
 
     get_qminmax(&qmin, &qmax, s, pict_type);
 
-    fps = 1 / av_q2d(s->avctx->time_base);
+    fps = get_fps(s->avctx);
     /* update predictors */
     if (picture_number > 2 && !dry_run) {
-        const int last_var = s->last_pict_type == AV_PICTURE_TYPE_I ? rcc->last_mb_var_sum
-                                                                    : rcc->last_mc_mb_var_sum;
+        const int64_t last_var =
+            s->last_pict_type == AV_PICTURE_TYPE_I ? rcc->last_mb_var_sum
+                                                   : rcc->last_mc_mb_var_sum;
+        av_assert1(s->frame_bits >= s->stuffing_bits);
         update_predictor(&rcc->pred[s->last_pict_type],
                          rcc->last_qscale,
-                         sqrt(last_var), s->frame_bits);
+                         sqrt(last_var),
+                         s->frame_bits - s->stuffing_bits);
     }
 
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
-        assert(picture_number >= 0);
-        assert(picture_number < rcc->num_entries);
+        av_assert0(picture_number >= 0);
+        if (picture_number >= rcc->num_entries) {
+            av_log(s, AV_LOG_ERROR, "Input is longer than 2-pass log file\n");
+            return -1;
+        }
         rce         = &rcc->entry[picture_number];
         wanted_bits = rce->expected_bits;
     } else {
@@ -893,10 +936,10 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
     short_term_q = 0; /* avoid warning */
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
         if (pict_type != AV_PICTURE_TYPE_I)
-            assert(pict_type == rce->new_pict_type);
+            av_assert0(pict_type == rce->new_pict_type);
 
         q = rce->new_qscale / br_compensation;
-        ff_dlog(s, "%f %f %f last:%d var:%d type:%d//\n", q, rce->new_qscale,
+        ff_dlog(s, "%f %f %f last:%d var:%"PRId64" type:%d//\n", q, rce->new_qscale,
                 br_compensation, s->frame_bits, var, pict_type);
     } else {
         rce->pict_type     =
@@ -925,7 +968,6 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
         rcc->mv_bits_sum[pict_type] += rce->mv_bits;
         rcc->frame_count[pict_type]++;
 
-        bits        = rce->i_tex_bits + rce->p_tex_bits;
         rate_factor = rcc->pass1_wanted_bits /
                       rcc->pass1_rc_eq_output_sum * br_compensation;
 
@@ -933,9 +975,9 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
         if (q < 0)
             return -1;
 
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
         q = get_diff_limited_q(s, rce, q);
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
 
         // FIXME type dependent blur like in 2-pass
         if (pict_type == AV_PICTURE_TYPE_P || s->intra_only) {
@@ -946,19 +988,19 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
             rcc->short_term_qcount++;
             q = short_term_q = rcc->short_term_qsum / rcc->short_term_qcount;
         }
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
 
         q = modify_qscale(s, rce, q, picture_number);
 
         rcc->pass1_wanted_bits += s->bit_rate / fps;
 
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
     }
 
     if (s->avctx->debug & FF_DEBUG_RC) {
         av_log(s->avctx, AV_LOG_DEBUG,
                "%c qp:%d<%2.1f<%d %d want:%d total:%d comp:%f st_q:%2.2f "
-               "size:%d var:%d/%d br:%d fps:%d\n",
+               "size:%d var:%"PRId64"/%"PRId64" br:%"PRId64" fps:%d\n",
                av_get_picture_type_char(pict_type),
                qmin, q, qmax, picture_number,
                (int)wanted_bits / 1000, (int)s->total_bits / 1000,
diff --git a/libavcodec/ratecontrol.h b/libavcodec/ratecontrol.h
index 7c289c6..2a7aaec 100644
--- a/libavcodec/ratecontrol.h
+++ b/libavcodec/ratecontrol.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,8 +49,8 @@ typedef struct RateControlEntry{
     uint64_t expected_bits;
     int new_pict_type;
     float new_qscale;
-    int mc_mb_var_sum;
-    int mb_var_sum;
+    int64_t mc_mb_var_sum;
+    int64_t mb_var_sum;
     int i_count;
     int skip_count;
     int f_code;
@@ -71,14 +71,18 @@ typedef struct RateControlContext{
     double pass1_wanted_bits;     ///< bits which should have been output by the pass1 code (including complexity init)
     double last_qscale;
     double last_qscale_for[5];    ///< last qscale for a specific pict type, used for max_diff & ipb factor stuff
-    int last_mc_mb_var_sum;
-    int last_mb_var_sum;
+    int64_t last_mc_mb_var_sum;
+    int64_t last_mb_var_sum;
     uint64_t i_cplx_sum[5];
     uint64_t p_cplx_sum[5];
     uint64_t mv_bits_sum[5];
     uint64_t qscale_sum[5];
     int frame_count[5];
     int last_non_b_pict_type;
+
+    void *non_lavc_opaque;        ///< context for non lavc rc code (for example xvid)
+    float dry_run_qscale;         ///< for xvid rc
+    int last_picture_number;      ///< for xvid rc
     AVExpr * rc_eq_eval;
 }RateControlContext;
 
@@ -87,6 +91,7 @@ struct MpegEncContext;
 /* rate control */
 int ff_rate_control_init(struct MpegEncContext *s);
 float ff_rate_estimate_qscale(struct MpegEncContext *s, int dry_run);
+void ff_write_pass1_stats(struct MpegEncContext *s);
 void ff_rate_control_uninit(struct MpegEncContext *s);
 int ff_vbv_update(struct MpegEncContext *s, int frame_size);
 void ff_get_2pass_fcode(struct MpegEncContext *s);
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index 67dff9b..b6fb91c 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -2,20 +2,20 @@
  * Raw Video Codec
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUV420P, MKTAG('I', '4', '2', '0') }, /* Planar formats */
     { AV_PIX_FMT_YUV420P, MKTAG('I', 'Y', 'U', 'V') },
+    { AV_PIX_FMT_YUV420P, MKTAG('y', 'v', '1', '2') },
     { AV_PIX_FMT_YUV420P, MKTAG('Y', 'V', '1', '2') },
     { AV_PIX_FMT_YUV410P, MKTAG('Y', 'U', 'V', '9') },
     { AV_PIX_FMT_YUV410P, MKTAG('Y', 'V', 'U', '9') },
@@ -66,6 +67,7 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'u', 'p') },
     { AV_PIX_FMT_UYVY422, MKTAG('V', 'D', 'T', 'Z') }, /* SoftLab-NSK VideoTizer */
     { AV_PIX_FMT_UYVY422, MKTAG('a', 'u', 'v', '2') },
+    { AV_PIX_FMT_UYVY422, MKTAG('c', 'y', 'u', 'v') }, /* CYUV is also Creative YUV */
     { AV_PIX_FMT_UYYVYY411, MKTAG('Y', '4', '1', '1') },
     { AV_PIX_FMT_GRAY8,   MKTAG('G', 'R', 'E', 'Y') },
     { AV_PIX_FMT_NV12,    MKTAG('N', 'V', '1', '2') },
@@ -84,14 +86,18 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_BGR444LE, MKTAG('B', 'G', 'R', 12) },
     { AV_PIX_FMT_RGB444BE, MKTAG(12 , 'B', 'G', 'R') },
     { AV_PIX_FMT_BGR444BE, MKTAG(12 , 'R', 'G', 'B') },
-    { AV_PIX_FMT_RGBA,     MKTAG('R', 'G', 'B', 'A') },
-    { AV_PIX_FMT_BGRA,     MKTAG('B', 'G', 'R', 'A') },
     { AV_PIX_FMT_RGBA64LE, MKTAG('R', 'B', 'A', 64 ) },
     { AV_PIX_FMT_BGRA64LE, MKTAG('B', 'R', 'A', 64 ) },
     { AV_PIX_FMT_RGBA64BE, MKTAG(64 , 'R', 'B', 'A') },
     { AV_PIX_FMT_BGRA64BE, MKTAG(64 , 'B', 'R', 'A') },
+    { AV_PIX_FMT_RGBA,     MKTAG('R', 'G', 'B', 'A') },
+    { AV_PIX_FMT_RGB0,     MKTAG('R', 'G', 'B',  0 ) },
+    { AV_PIX_FMT_BGRA,     MKTAG('B', 'G', 'R', 'A') },
+    { AV_PIX_FMT_BGR0,     MKTAG('B', 'G', 'R',  0 ) },
     { AV_PIX_FMT_ABGR,     MKTAG('A', 'B', 'G', 'R') },
+    { AV_PIX_FMT_0BGR,     MKTAG( 0 , 'B', 'G', 'R') },
     { AV_PIX_FMT_ARGB,     MKTAG('A', 'R', 'G', 'B') },
+    { AV_PIX_FMT_0RGB,     MKTAG( 0 , 'R', 'G', 'B') },
     { AV_PIX_FMT_RGB24,    MKTAG('R', 'G', 'B', 24 ) },
     { AV_PIX_FMT_BGR24,    MKTAG('B', 'G', 'R', 24 ) },
     { AV_PIX_FMT_YUV411P,  MKTAG('4', '1', '1', 'P') },
@@ -113,14 +119,40 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_RGB48BE,  MKTAG( 48, 'R', 'G', 'B') },
     { AV_PIX_FMT_BGR48LE,  MKTAG('B', 'G', 'R', 48 ) },
     { AV_PIX_FMT_BGR48BE,  MKTAG( 48, 'B', 'G', 'R') },
+    { AV_PIX_FMT_GRAY9LE,     MKTAG('Y', '1',  0 ,  9 ) },
+    { AV_PIX_FMT_GRAY9BE,     MKTAG( 9 ,  0 , '1', 'Y') },
+    { AV_PIX_FMT_GRAY10LE,    MKTAG('Y', '1',  0 , 10 ) },
+    { AV_PIX_FMT_GRAY10BE,    MKTAG(10 ,  0 , '1', 'Y') },
+    { AV_PIX_FMT_GRAY12LE,    MKTAG('Y', '1',  0 , 12 ) },
+    { AV_PIX_FMT_GRAY12BE,    MKTAG(12 ,  0 , '1', 'Y') },
+    { AV_PIX_FMT_GRAY14LE,    MKTAG('Y', '1',  0 , 14 ) },
+    { AV_PIX_FMT_GRAY14BE,    MKTAG(14 ,  0 , '1', 'Y') },
     { AV_PIX_FMT_GRAY16LE,    MKTAG('Y', '1',  0 , 16 ) },
     { AV_PIX_FMT_GRAY16BE,    MKTAG(16 ,  0 , '1', 'Y') },
+    { AV_PIX_FMT_YUV420P9LE,  MKTAG('Y', '3', 11 ,  9 ) },
+    { AV_PIX_FMT_YUV420P9BE,  MKTAG( 9 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P9LE,  MKTAG('Y', '3', 10 ,  9 ) },
+    { AV_PIX_FMT_YUV422P9BE,  MKTAG( 9 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P9LE,  MKTAG('Y', '3',  0 ,  9 ) },
+    { AV_PIX_FMT_YUV444P9BE,  MKTAG( 9 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUV420P10LE, MKTAG('Y', '3', 11 , 10 ) },
     { AV_PIX_FMT_YUV420P10BE, MKTAG(10 , 11 , '3', 'Y') },
     { AV_PIX_FMT_YUV422P10LE, MKTAG('Y', '3', 10 , 10 ) },
     { AV_PIX_FMT_YUV422P10BE, MKTAG(10 , 10 , '3', 'Y') },
     { AV_PIX_FMT_YUV444P10LE, MKTAG('Y', '3',  0 , 10 ) },
     { AV_PIX_FMT_YUV444P10BE, MKTAG(10 ,  0 , '3', 'Y') },
+    { AV_PIX_FMT_YUV420P12LE, MKTAG('Y', '3', 11 , 12 ) },
+    { AV_PIX_FMT_YUV420P12BE, MKTAG(12 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P12LE, MKTAG('Y', '3', 10 , 12 ) },
+    { AV_PIX_FMT_YUV422P12BE, MKTAG(12 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P12LE, MKTAG('Y', '3',  0 , 12 ) },
+    { AV_PIX_FMT_YUV444P12BE, MKTAG(12 ,  0 , '3', 'Y') },
+    { AV_PIX_FMT_YUV420P14LE, MKTAG('Y', '3', 11 , 14 ) },
+    { AV_PIX_FMT_YUV420P14BE, MKTAG(14 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P14LE, MKTAG('Y', '3', 10 , 14 ) },
+    { AV_PIX_FMT_YUV422P14BE, MKTAG(14 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P14LE, MKTAG('Y', '3',  0 , 14 ) },
+    { AV_PIX_FMT_YUV444P14BE, MKTAG(14 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUV420P16LE, MKTAG('Y', '3', 11 , 16 ) },
     { AV_PIX_FMT_YUV420P16BE, MKTAG(16 , 11 , '3', 'Y') },
     { AV_PIX_FMT_YUV422P16LE, MKTAG('Y', '3', 10 , 16 ) },
@@ -128,7 +160,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUV444P16LE, MKTAG('Y', '3',  0 , 16 ) },
     { AV_PIX_FMT_YUV444P16BE, MKTAG(16 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUVA420P,    MKTAG('Y', '4', 11 ,  8 ) },
+    { AV_PIX_FMT_YUVA422P,    MKTAG('Y', '4', 10 ,  8 ) },
+    { AV_PIX_FMT_YUVA444P,    MKTAG('Y', '4',  0 ,  8 ) },
     { AV_PIX_FMT_YA8,         MKTAG('Y', '2',  0 ,  8 ) },
+    { AV_PIX_FMT_PAL8,        MKTAG('P', 'A', 'L',  8 ) },
 
     { AV_PIX_FMT_YUVA420P9LE,  MKTAG('Y', '4', 11 ,  9 ) },
     { AV_PIX_FMT_YUVA420P9BE,  MKTAG( 9 , 11 , '4', 'Y') },
@@ -142,6 +177,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUVA422P10BE, MKTAG(10 , 10 , '4', 'Y') },
     { AV_PIX_FMT_YUVA444P10LE, MKTAG('Y', '4',  0 , 10 ) },
     { AV_PIX_FMT_YUVA444P10BE, MKTAG(10 ,  0 , '4', 'Y') },
+    { AV_PIX_FMT_YUVA422P12LE, MKTAG('Y', '4', 10 , 12 ) },
+    { AV_PIX_FMT_YUVA422P12BE, MKTAG(12 , 10 , '4', 'Y') },
+    { AV_PIX_FMT_YUVA444P12LE, MKTAG('Y', '4',  0 , 12 ) },
+    { AV_PIX_FMT_YUVA444P12BE, MKTAG(12 ,  0 , '4', 'Y') },
     { AV_PIX_FMT_YUVA420P16LE, MKTAG('Y', '4', 11 , 16 ) },
     { AV_PIX_FMT_YUVA420P16BE, MKTAG(16 , 11 , '4', 'Y') },
     { AV_PIX_FMT_YUVA422P16LE, MKTAG('Y', '4', 10 , 16 ) },
@@ -149,10 +188,49 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUVA444P16LE, MKTAG('Y', '4',  0 , 16 ) },
     { AV_PIX_FMT_YUVA444P16BE, MKTAG(16 ,  0 , '4', 'Y') },
 
+    { AV_PIX_FMT_GBRP,         MKTAG('G', '3', 00 ,  8 ) },
+    { AV_PIX_FMT_GBRP9LE,      MKTAG('G', '3', 00 ,  9 ) },
+    { AV_PIX_FMT_GBRP9BE,      MKTAG( 9 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP10LE,     MKTAG('G', '3', 00 , 10 ) },
+    { AV_PIX_FMT_GBRP10BE,     MKTAG(10 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP12LE,     MKTAG('G', '3', 00 , 12 ) },
+    { AV_PIX_FMT_GBRP12BE,     MKTAG(12 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP14LE,     MKTAG('G', '3', 00 , 14 ) },
+    { AV_PIX_FMT_GBRP14BE,     MKTAG(14 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP16LE,     MKTAG('G', '3', 00 , 16 ) },
+    { AV_PIX_FMT_GBRP16BE,     MKTAG(16 , 00 , '3', 'G') },
+
+    { AV_PIX_FMT_GBRAP,        MKTAG('G', '4', 00 ,  8 ) },
+    { AV_PIX_FMT_GBRAP10LE,    MKTAG('G', '4', 00 , 10 ) },
+    { AV_PIX_FMT_GBRAP10BE,    MKTAG(10 , 00 , '4', 'G') },
+    { AV_PIX_FMT_GBRAP12LE,    MKTAG('G', '4', 00 , 12 ) },
+    { AV_PIX_FMT_GBRAP12BE,    MKTAG(12 , 00 , '4', 'G') },
+    { AV_PIX_FMT_GBRAP16LE,    MKTAG('G', '4', 00 , 16 ) },
+    { AV_PIX_FMT_GBRAP16BE,    MKTAG(16 , 00 , '4', 'G') },
+
+    { AV_PIX_FMT_XYZ12LE,      MKTAG('X', 'Y', 'Z' , 36 ) },
+    { AV_PIX_FMT_XYZ12BE,      MKTAG(36 , 'Z' , 'Y', 'X') },
+
+    { AV_PIX_FMT_BAYER_BGGR8,    MKTAG(0xBA, 'B', 'G', 8   ) },
+    { AV_PIX_FMT_BAYER_BGGR16LE, MKTAG(0xBA, 'B', 'G', 16  ) },
+    { AV_PIX_FMT_BAYER_BGGR16BE, MKTAG(16  , 'G', 'B', 0xBA) },
+    { AV_PIX_FMT_BAYER_RGGB8,    MKTAG(0xBA, 'R', 'G', 8   ) },
+    { AV_PIX_FMT_BAYER_RGGB16LE, MKTAG(0xBA, 'R', 'G', 16  ) },
+    { AV_PIX_FMT_BAYER_RGGB16BE, MKTAG(16  , 'G', 'R', 0xBA) },
+    { AV_PIX_FMT_BAYER_GBRG8,    MKTAG(0xBA, 'G', 'B', 8   ) },
+    { AV_PIX_FMT_BAYER_GBRG16LE, MKTAG(0xBA, 'G', 'B', 16  ) },
+    { AV_PIX_FMT_BAYER_GBRG16BE, MKTAG(16,   'B', 'G', 0xBA) },
+    { AV_PIX_FMT_BAYER_GRBG8,    MKTAG(0xBA, 'G', 'R', 8   ) },
+    { AV_PIX_FMT_BAYER_GRBG16LE, MKTAG(0xBA, 'G', 'R', 16  ) },
+    { AV_PIX_FMT_BAYER_GRBG16BE, MKTAG(16,   'R', 'G', 0xBA) },
+
     /* quicktime */
+    { AV_PIX_FMT_YUV420P, MKTAG('R', '4', '2', '0') }, /* Radius DV YUV PAL */
+    { AV_PIX_FMT_YUV411P, MKTAG('R', '4', '1', '1') }, /* Radius DV YUV NTSC */
     { AV_PIX_FMT_UYVY422, MKTAG('2', 'v', 'u', 'y') },
     { AV_PIX_FMT_UYVY422, MKTAG('2', 'V', 'u', 'y') },
     { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'U', 'I') }, /* FIXME merge both fields */
+    { AV_PIX_FMT_UYVY422, MKTAG('b', 'x', 'y', 'v') },
     { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', '2') },
     { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', 's') },
     { AV_PIX_FMT_YUYV422, MKTAG('D', 'V', 'O', 'O') }, /* Digital Voodoo SD 8 Bit */
@@ -160,11 +238,56 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_RGB565LE,MKTAG('L', '5', '6', '5') },
     { AV_PIX_FMT_RGB565BE,MKTAG('B', '5', '6', '5') },
     { AV_PIX_FMT_BGR24,   MKTAG('2', '4', 'B', 'G') },
+    { AV_PIX_FMT_BGR24,   MKTAG('b', 'x', 'b', 'g') },
     { AV_PIX_FMT_BGRA,    MKTAG('B', 'G', 'R', 'A') },
     { AV_PIX_FMT_RGBA,    MKTAG('R', 'G', 'B', 'A') },
+    { AV_PIX_FMT_RGB24,   MKTAG('b', 'x', 'r', 'g') },
     { AV_PIX_FMT_ABGR,    MKTAG('A', 'B', 'G', 'R') },
     { AV_PIX_FMT_GRAY16BE,MKTAG('b', '1', '6', 'g') },
     { AV_PIX_FMT_RGB48BE, MKTAG('b', '4', '8', 'r') },
+    { AV_PIX_FMT_RGBA64BE,MKTAG('b', '6', '4', 'a') },
+
+    /* vlc */
+    { AV_PIX_FMT_YUV410P,     MKTAG('I', '4', '1', '0') },
+    { AV_PIX_FMT_YUV411P,     MKTAG('I', '4', '1', '1') },
+    { AV_PIX_FMT_YUV422P,     MKTAG('I', '4', '2', '2') },
+    { AV_PIX_FMT_YUV440P,     MKTAG('I', '4', '4', '0') },
+    { AV_PIX_FMT_YUV444P,     MKTAG('I', '4', '4', '4') },
+    { AV_PIX_FMT_YUVJ420P,    MKTAG('J', '4', '2', '0') },
+    { AV_PIX_FMT_YUVJ422P,    MKTAG('J', '4', '2', '2') },
+    { AV_PIX_FMT_YUVJ440P,    MKTAG('J', '4', '4', '0') },
+    { AV_PIX_FMT_YUVJ444P,    MKTAG('J', '4', '4', '4') },
+    { AV_PIX_FMT_YUVA444P,    MKTAG('Y', 'U', 'V', 'A') },
+    { AV_PIX_FMT_YUVA420P,    MKTAG('I', '4', '0', 'A') },
+    { AV_PIX_FMT_YUVA422P,    MKTAG('I', '4', '2', 'A') },
+    { AV_PIX_FMT_RGB8,        MKTAG('R', 'G', 'B', '2') },
+    { AV_PIX_FMT_RGB555LE,    MKTAG('R', 'V', '1', '5') },
+    { AV_PIX_FMT_RGB565LE,    MKTAG('R', 'V', '1', '6') },
+    { AV_PIX_FMT_BGR24,       MKTAG('R', 'V', '2', '4') },
+    { AV_PIX_FMT_BGR0,        MKTAG('R', 'V', '3', '2') },
+    { AV_PIX_FMT_RGBA,        MKTAG('A', 'V', '3', '2') },
+    { AV_PIX_FMT_YUV420P9LE,  MKTAG('I', '0', '9', 'L') },
+    { AV_PIX_FMT_YUV420P9BE,  MKTAG('I', '0', '9', 'B') },
+    { AV_PIX_FMT_YUV422P9LE,  MKTAG('I', '2', '9', 'L') },
+    { AV_PIX_FMT_YUV422P9BE,  MKTAG('I', '2', '9', 'B') },
+    { AV_PIX_FMT_YUV444P9LE,  MKTAG('I', '4', '9', 'L') },
+    { AV_PIX_FMT_YUV444P9BE,  MKTAG('I', '4', '9', 'B') },
+    { AV_PIX_FMT_YUV420P10LE, MKTAG('I', '0', 'A', 'L') },
+    { AV_PIX_FMT_YUV420P10BE, MKTAG('I', '0', 'A', 'B') },
+    { AV_PIX_FMT_YUV422P10LE, MKTAG('I', '2', 'A', 'L') },
+    { AV_PIX_FMT_YUV422P10BE, MKTAG('I', '2', 'A', 'B') },
+    { AV_PIX_FMT_YUV444P10LE, MKTAG('I', '4', 'A', 'L') },
+    { AV_PIX_FMT_YUV444P10BE, MKTAG('I', '4', 'A', 'B') },
+    { AV_PIX_FMT_YUV420P12LE, MKTAG('I', '0', 'C', 'L') },
+    { AV_PIX_FMT_YUV420P12BE, MKTAG('I', '0', 'C', 'B') },
+    { AV_PIX_FMT_YUV422P12LE, MKTAG('I', '2', 'C', 'L') },
+    { AV_PIX_FMT_YUV422P12BE, MKTAG('I', '2', 'C', 'B') },
+    { AV_PIX_FMT_YUV444P12LE, MKTAG('I', '4', 'C', 'L') },
+    { AV_PIX_FMT_YUV444P12BE, MKTAG('I', '4', 'C', 'B') },
+    { AV_PIX_FMT_YUV420P16LE, MKTAG('I', '0', 'F', 'L') },
+    { AV_PIX_FMT_YUV420P16BE, MKTAG('I', '0', 'F', 'B') },
+    { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
+    { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
 
     /* special */
     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
@@ -173,6 +296,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_NONE, 0 },
 };
 
+const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void)
+{
+    return ff_raw_pix_fmt_tags;
+}
+
 unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt)
 {
     const PixelFormatTag *tags = ff_raw_pix_fmt_tags;
@@ -183,3 +311,28 @@ unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt)
     }
     return 0;
 }
+
+const PixelFormatTag avpriv_pix_fmt_bps_avi[] = {
+    { AV_PIX_FMT_PAL8,    1 },
+    { AV_PIX_FMT_PAL8,    2 },
+    { AV_PIX_FMT_PAL8,    4 },
+    { AV_PIX_FMT_PAL8,    8 },
+    { AV_PIX_FMT_RGB444LE, 12 },
+    { AV_PIX_FMT_RGB555LE, 15 },
+    { AV_PIX_FMT_RGB555LE, 16 },
+    { AV_PIX_FMT_BGR24,  24 },
+    { AV_PIX_FMT_BGRA,   32 },
+    { AV_PIX_FMT_NONE,    0 },
+};
+
+const PixelFormatTag avpriv_pix_fmt_bps_mov[] = {
+    { AV_PIX_FMT_PAL8,      1 },
+    { AV_PIX_FMT_PAL8,      2 },
+    { AV_PIX_FMT_PAL8,      4 },
+    { AV_PIX_FMT_PAL8,      8 },
+    { AV_PIX_FMT_RGB555BE, 16 },
+    { AV_PIX_FMT_RGB24,    24 },
+    { AV_PIX_FMT_ARGB,     32 },
+    { AV_PIX_FMT_PAL8,     33 },
+    { AV_PIX_FMT_NONE,      0 },
+};
diff --git a/libavcodec/raw.h b/libavcodec/raw.h
index bf66671..28a27b1 100644
--- a/libavcodec/raw.h
+++ b/libavcodec/raw.h
@@ -2,20 +2,20 @@
  * Raw Video Codec
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,21 @@
 #define AVCODEC_RAW_H
 
 #include "avcodec.h"
+#include "internal.h"
+#include "libavutil/internal.h"
 
 typedef struct PixelFormatTag {
     enum AVPixelFormat pix_fmt;
     unsigned int fourcc;
 } PixelFormatTag;
 
-extern const PixelFormatTag ff_raw_pix_fmt_tags[];
+extern const PixelFormatTag ff_raw_pix_fmt_tags[]; // exposed through avpriv_get_raw_pix_fmt_tags()
+
+const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void);
+
+enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags, unsigned int fourcc);
+
+extern av_export_avcodec const PixelFormatTag avpriv_pix_fmt_bps_avi[];
+extern av_export_avcodec const PixelFormatTag avpriv_pix_fmt_bps_mov[];
 
 #endif /* AVCODEC_RAW_H */
diff --git a/libavcodec/rawdec.c b/libavcodec/rawdec.c
index 284c345..53f5b76 100644
--- a/libavcodec/rawdec.c
+++ b/libavcodec/rawdec.c
@@ -2,20 +2,20 @@
  * Raw Video Decoder
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,72 +25,65 @@
  */
 
 #include "avcodec.h"
+#include "bswapdsp.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "raw.h"
+#include "libavutil/avassert.h"
 #include "libavutil/buffer.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
 typedef struct RawVideoContext {
+    AVClass *av_class;
     AVBufferRef *palette;
     int frame_size;  /* size of the frame in bytes */
     int flip;
-    int is_2_4_bpp; // 2 or 4 bpp raw in avi/mov
+    int is_1_2_4_8_bpp; // 1, 2, 4 and 8 bpp in avi/mov, 1 and 8 bpp in nut
+    int is_mono;
+    int is_pal8;
+    int is_nut_mono;
+    int is_nut_pal8;
     int is_yuv2;
+    int is_lt_16bpp; // 16bpp pixfmt and bits_per_coded_sample < 16
+    int tff;
+
+    BswapDSPContext bbdsp;
+    void *bitstream_buf;
+    unsigned int bitstream_buf_size;
 } RawVideoContext;
 
-static const PixelFormatTag pix_fmt_bps_avi[] = {
-    { AV_PIX_FMT_PAL8,    4 },
-    { AV_PIX_FMT_PAL8,    8 },
-    { AV_PIX_FMT_RGB444, 12 },
-    { AV_PIX_FMT_RGB555, 15 },
-    { AV_PIX_FMT_RGB555, 16 },
-    { AV_PIX_FMT_BGR24,  24 },
-    { AV_PIX_FMT_RGB32,  32 },
-    { AV_PIX_FMT_NONE,    0 },
+static const AVOption options[]={
+{"top", "top field first", offsetof(RawVideoContext, tff), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, AV_OPT_FLAG_DECODING_PARAM|AV_OPT_FLAG_VIDEO_PARAM},
+{NULL}
 };
 
-static const PixelFormatTag pix_fmt_bps_mov[] = {
-    { AV_PIX_FMT_MONOWHITE, 1 },
-    { AV_PIX_FMT_PAL8,      2 },
-    { AV_PIX_FMT_PAL8,      4 },
-    { AV_PIX_FMT_PAL8,      8 },
-    // FIXME swscale does not support 16 bit in .mov, sample 16bit.mov
-    // http://developer.apple.com/documentation/QuickTime/QTFF/QTFFChap3/qtff3.html
-    { AV_PIX_FMT_RGB555BE, 16 },
-    { AV_PIX_FMT_RGB24,    24 },
-    { AV_PIX_FMT_ARGB,     32 },
-    { AV_PIX_FMT_MONOWHITE,33 },
-    { AV_PIX_FMT_NONE,      0 },
+static const AVClass rawdec_class = {
+    .class_name = "rawdec",
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static enum AVPixelFormat find_pix_fmt(const PixelFormatTag *tags,
-                                       unsigned int fourcc)
-{
-    while (tags->pix_fmt >= 0) {
-        if (tags->fourcc == fourcc)
-            return tags->pix_fmt;
-        tags++;
-    }
-    return AV_PIX_FMT_YUV420P;
-}
-
 static av_cold int raw_init_decoder(AVCodecContext *avctx)
 {
     RawVideoContext *context = avctx->priv_data;
     const AVPixFmtDescriptor *desc;
 
-    if (avctx->codec_tag == MKTAG('r', 'a', 'w', ' '))
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_mov,
+    ff_bswapdsp_init(&context->bbdsp);
+
+    if (   avctx->codec_tag == MKTAG('r','a','w',' ')
+        || avctx->codec_tag == MKTAG('N','O','1','6'))
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_mov,
                                       avctx->bits_per_coded_sample);
     else if (avctx->codec_tag == MKTAG('W', 'R', 'A', 'W'))
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_avi,
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_avi,
                                       avctx->bits_per_coded_sample);
-    else if (avctx->codec_tag)
-        avctx->pix_fmt = find_pix_fmt(ff_raw_pix_fmt_tags, avctx->codec_tag);
+    else if (avctx->codec_tag && (avctx->codec_tag & 0xFFFFFF) != MKTAG('B','I','T', 0))
+        avctx->pix_fmt = avpriv_find_pix_fmt(ff_raw_pix_fmt_tags, avctx->codec_tag);
     else if (avctx->pix_fmt == AV_PIX_FMT_NONE && avctx->bits_per_coded_sample)
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_avi,
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_avi,
                                       avctx->bits_per_coded_sample);
 
     desc = av_pix_fmt_desc_get(avctx->pix_fmt);
@@ -99,31 +92,40 @@ static av_cold int raw_init_decoder(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    if (desc->flags & (AV_PIX_FMT_FLAG_PAL | AV_PIX_FMT_FLAG_PSEUDOPAL)) {
+    if (desc->flags & (AV_PIX_FMT_FLAG_PAL | FF_PSEUDOPAL)) {
         context->palette = av_buffer_alloc(AVPALETTE_SIZE);
         if (!context->palette)
             return AVERROR(ENOMEM);
+#if FF_API_PSEUDOPAL
         if (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)
             avpriv_set_systematic_pal2((uint32_t*)context->palette->data, avctx->pix_fmt);
-        else
+#endif
+        else {
             memset(context->palette->data, 0, AVPALETTE_SIZE);
+            if (avctx->bits_per_coded_sample == 1)
+                memset(context->palette->data, 0xff, 4);
+        }
     }
 
-    context->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
-                                                   avctx->width,
-                                                   avctx->height, 1);
-
-    if ((avctx->bits_per_coded_sample == 4 || avctx->bits_per_coded_sample == 2) &&
-        avctx->pix_fmt == AV_PIX_FMT_PAL8 &&
-       (!avctx->codec_tag || avctx->codec_tag == MKTAG('r','a','w',' ')))
-        context->is_2_4_bpp = 1;
-
     if ((avctx->extradata_size >= 9 &&
          !memcmp(avctx->extradata + avctx->extradata_size - 9, "BottomUp", 9)) ||
+        avctx->codec_tag == MKTAG('c','y','u','v') ||
         avctx->codec_tag == MKTAG(3, 0, 0, 0) ||
         avctx->codec_tag == MKTAG('W','R','A','W'))
         context->flip = 1;
 
+    if (avctx->pix_fmt == AV_PIX_FMT_MONOWHITE ||
+        avctx->pix_fmt == AV_PIX_FMT_MONOBLACK)
+        context->is_mono = 1;
+    else if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        context->is_pal8 = 1;
+
+    if (avctx->codec_tag == MKTAG('B','1','W','0') ||
+        avctx->codec_tag == MKTAG('B','0','W','1'))
+        context->is_nut_mono = 1;
+    else if (avctx->codec_tag == MKTAG('P','A','L',8))
+        context->is_nut_pal8 = 1;
+
     if (avctx->codec_tag == AV_RL32("yuv2") &&
         avctx->pix_fmt   == AV_PIX_FMT_YUYV422)
         context->is_yuv2 = 1;
@@ -137,18 +139,99 @@ static void flip(AVCodecContext *avctx, AVFrame *frame)
     frame->linesize[0] *= -1;
 }
 
+/*
+ * Scale sample to 16-bit resolution
+ */
+#define SCALE16(x, bits) (((x) << (16 - (bits))) | ((x) >> (2 * (bits) - 16)))
+
+/**
+ * Scale buffer to 16 bits per coded sample resolution
+ */
+#define MKSCALE16(name, r16, w16) \
+static void name(AVCodecContext *avctx, uint8_t * dst, const uint8_t *buf, int buf_size, int packed) \
+{ \
+    int i; \
+    if (!packed) { \
+        for (i = 0; i + 1 < buf_size; i += 2) \
+            w16(dst + i, SCALE16(r16(buf + i), avctx->bits_per_coded_sample)); \
+    } else { \
+        GetBitContext gb; \
+        init_get_bits(&gb, buf, buf_size * 8); \
+        for (i = 0; i < avctx->width * avctx->height; i++) { \
+            int sample = get_bits(&gb, avctx->bits_per_coded_sample); \
+            w16(dst + i*2, SCALE16(sample, avctx->bits_per_coded_sample)); \
+        } \
+   } \
+}
+
+MKSCALE16(scale16be, AV_RB16, AV_WB16)
+MKSCALE16(scale16le, AV_RL16, AV_WL16)
+
 static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
                       AVPacket *avpkt)
 {
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+    const AVPixFmtDescriptor *desc;
     RawVideoContext *context       = avctx->priv_data;
     const uint8_t *buf             = avpkt->data;
     int buf_size                   = avpkt->size;
-    int need_copy                  = !avpkt->buf || context->is_2_4_bpp || context->is_yuv2;
-    int res;
+    int linesize_align             = 4;
+    int stride;
+    int res, len;
+    int need_copy;
 
     AVFrame   *frame   = data;
 
+    if (avctx->width <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "width is not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (avctx->height <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "height is not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (context->is_nut_mono)
+        stride = avctx->width / 8 + (avctx->width & 7 ? 1 : 0);
+    else if (context->is_nut_pal8)
+        stride = avctx->width;
+    else
+        stride = avpkt->size / avctx->height;
+
+    av_log(avctx, AV_LOG_DEBUG, "PACKET SIZE: %d, STRIDE: %d\n", avpkt->size, stride);
+
+    if (stride == 0 || avpkt->size < stride * avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "Packet too small (%d)\n", avpkt->size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if ((avctx->bits_per_coded_sample == 8 || avctx->bits_per_coded_sample == 4 ||
+         avctx->bits_per_coded_sample == 2 || avctx->bits_per_coded_sample == 1 ||
+         (avctx->bits_per_coded_sample == 0 && (context->is_nut_pal8 || context->is_mono)) ) &&
+        (context->is_mono || context->is_pal8) &&
+        (!avctx->codec_tag || avctx->codec_tag == MKTAG('r','a','w',' ') ||
+                context->is_nut_mono || context->is_nut_pal8)) {
+        context->is_1_2_4_8_bpp = 1;
+        if (context->is_mono) {
+            int row_bytes = avctx->width / 8 + (avctx->width & 7 ? 1 : 0);
+            context->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
+                                                           FFALIGN(row_bytes, 16) * 8,
+                                                           avctx->height, 1);
+        } else
+            context->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
+                                                           FFALIGN(avctx->width, 16),
+                                                           avctx->height, 1);
+    } else {
+        context->is_lt_16bpp = av_get_bits_per_pixel(desc) == 16 && avctx->bits_per_coded_sample && avctx->bits_per_coded_sample < 16;
+        context->frame_size = av_image_get_buffer_size(avctx->pix_fmt, avctx->width,
+                                                       avctx->height, 1);
+    }
+    if (context->frame_size < 0)
+        return context->frame_size;
+
+    need_copy = !avpkt->buf || context->is_1_2_4_8_bpp || context->is_yuv2 || context->is_lt_16bpp;
+
     frame->pict_type        = AV_PICTURE_TYPE_I;
     frame->key_frame        = 1;
 
@@ -156,38 +239,111 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
     if (res < 0)
         return res;
 
-    if (buf_size < context->frame_size - (avctx->pix_fmt == AV_PIX_FMT_PAL8 ?
-                                          AVPALETTE_SIZE : 0))
-        return -1;
+    frame->pkt_pos      = avctx->internal->last_pkt_props->pos;
+    frame->pkt_duration = avctx->internal->last_pkt_props->duration;
+
+    if (context->tff >= 0) {
+        frame->interlaced_frame = 1;
+        frame->top_field_first  = context->tff;
+    }
+
+    if ((res = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return res;
 
     if (need_copy)
-        frame->buf[0] = av_buffer_alloc(context->frame_size);
+        frame->buf[0] = av_buffer_alloc(FFMAX(context->frame_size, buf_size));
     else
         frame->buf[0] = av_buffer_ref(avpkt->buf);
     if (!frame->buf[0])
         return AVERROR(ENOMEM);
 
-    //2bpp and 4bpp raw in avi and mov (yes this is ugly ...)
-    if (context->is_2_4_bpp) {
-        int i;
+    // 1, 2, 4 and 8 bpp in avi/mov, 1 and 8 bpp in nut
+    if (context->is_1_2_4_8_bpp) {
+        int i, j, row_pix = 0;
         uint8_t *dst = frame->buf[0]->data;
-        buf_size = context->frame_size - AVPALETTE_SIZE;
-        if (avctx->bits_per_coded_sample == 4) {
-            for (i = 0; 2 * i + 1 < buf_size; i++) {
-                dst[2 * i + 0] = buf[i] >> 4;
-                dst[2 * i + 1] = buf[i] & 15;
+        buf_size = context->frame_size - (context->is_pal8 ? AVPALETTE_SIZE : 0);
+        if (avctx->bits_per_coded_sample == 8 || context->is_nut_pal8 || context->is_mono) {
+            int pix_per_byte = context->is_mono ? 8 : 1;
+            for (i = 0, j = 0; j < buf_size && i<avpkt->size; i++, j++) {
+                dst[j] = buf[i];
+                row_pix += pix_per_byte;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 16 - (j % 16) - 1;
+                    row_pix = 0;
+                }
+            }
+        } else if (avctx->bits_per_coded_sample == 4) {
+            for (i = 0, j = 0; 2 * j + 1 < buf_size && i<avpkt->size; i++, j++) {
+                dst[2 * j + 0] = buf[i] >> 4;
+                dst[2 * j + 1] = buf[i] & 15;
+                row_pix += 2;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 8 - (j % 8) - 1;
+                    row_pix = 0;
+                }
+            }
+        } else if (avctx->bits_per_coded_sample == 2) {
+            for (i = 0, j = 0; 4 * j + 3 < buf_size && i<avpkt->size; i++, j++) {
+                dst[4 * j + 0] = buf[i] >> 6;
+                dst[4 * j + 1] = buf[i] >> 4 & 3;
+                dst[4 * j + 2] = buf[i] >> 2 & 3;
+                dst[4 * j + 3] = buf[i]      & 3;
+                row_pix += 4;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 4 - (j % 4) - 1;
+                    row_pix = 0;
+                }
             }
         } else {
-            for (i = 0; 4 * i + 3 < buf_size; i++) {
-                dst[4 * i + 0] = buf[i] >> 6;
-                dst[4 * i + 1] = buf[i] >> 4 & 3;
-                dst[4 * i + 2] = buf[i] >> 2 & 3;
-                dst[4 * i + 3] = buf[i]      & 3;
+            av_assert0(avctx->bits_per_coded_sample == 1);
+            for (i = 0, j = 0; 8 * j + 7 < buf_size && i<avpkt->size; i++, j++) {
+                dst[8 * j + 0] = buf[i] >> 7;
+                dst[8 * j + 1] = buf[i] >> 6 & 1;
+                dst[8 * j + 2] = buf[i] >> 5 & 1;
+                dst[8 * j + 3] = buf[i] >> 4 & 1;
+                dst[8 * j + 4] = buf[i] >> 3 & 1;
+                dst[8 * j + 5] = buf[i] >> 2 & 1;
+                dst[8 * j + 6] = buf[i] >> 1 & 1;
+                dst[8 * j + 7] = buf[i]      & 1;
+                row_pix += 8;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 2 - (j % 2) - 1;
+                    row_pix = 0;
+                }
             }
         }
+        linesize_align = 16;
+        buf = dst;
+    } else if (context->is_lt_16bpp) {
+        uint8_t *dst = frame->buf[0]->data;
+        int packed = (avctx->codec_tag & 0xFFFFFF) == MKTAG('B','I','T', 0);
+        int swap   =  avctx->codec_tag >> 24;
+
+        if (packed && swap) {
+            av_fast_padded_malloc(&context->bitstream_buf, &context->bitstream_buf_size, buf_size);
+            if (!context->bitstream_buf)
+                return AVERROR(ENOMEM);
+            if (swap == 16)
+                context->bbdsp.bswap16_buf(context->bitstream_buf, (const uint16_t*)buf, buf_size / 2);
+            else if (swap == 32)
+                context->bbdsp.bswap_buf(context->bitstream_buf, (const uint32_t*)buf, buf_size / 4);
+            else
+                return AVERROR_INVALIDDATA;
+            buf = context->bitstream_buf;
+        }
+
+        if (desc->flags & AV_PIX_FMT_FLAG_BE)
+            scale16be(avctx, dst, buf, buf_size, packed);
+        else
+            scale16le(avctx, dst, buf, buf_size, packed);
+
         buf = dst;
     } else if (need_copy) {
-        memcpy(frame->buf[0]->data, buf, FFMIN(buf_size, context->frame_size));
+        memcpy(frame->buf[0]->data, buf, buf_size);
         buf = frame->buf[0]->data;
     }
 
@@ -195,32 +351,89 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->codec_tag == MKTAG('A', 'V', 'u', 'p'))
         buf += buf_size - context->frame_size;
 
+    len = context->frame_size - (avctx->pix_fmt==AV_PIX_FMT_PAL8 ? AVPALETTE_SIZE : 0);
+    if (buf_size < len && ((avctx->codec_tag & 0xFFFFFF) != MKTAG('B','I','T', 0) || !need_copy)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid buffer size, packet size %d < expected frame_size %d\n", buf_size, len);
+        av_buffer_unref(&frame->buf[0]);
+        return AVERROR(EINVAL);
+    }
+
     if ((res = av_image_fill_arrays(frame->data, frame->linesize,
                                     buf, avctx->pix_fmt,
-                                    avctx->width, avctx->height, 1)) < 0)
+                                    avctx->width, avctx->height, 1)) < 0) {
+        av_buffer_unref(&frame->buf[0]);
         return res;
+    }
 
     if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        int pal_size;
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE,
-                                                     NULL);
+                                                     &pal_size);
+        int ret;
 
-        if (pal) {
-            av_buffer_unref(&context->palette);
+        if (pal && pal_size != AVPALETTE_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", pal_size);
+            pal = NULL;
+        }
+
+        if (!context->palette)
             context->palette = av_buffer_alloc(AVPALETTE_SIZE);
-            if (!context->palette)
-                return AVERROR(ENOMEM);
+        if (!context->palette) {
+            av_buffer_unref(&frame->buf[0]);
+            return AVERROR(ENOMEM);
+        }
+        ret = av_buffer_make_writable(&context->palette);
+        if (ret < 0) {
+            av_buffer_unref(&frame->buf[0]);
+            return ret;
+        }
+
+        if (pal) {
             memcpy(context->palette->data, pal, AVPALETTE_SIZE);
             frame->palette_has_changed = 1;
+        } else if (context->is_nut_pal8) {
+            int vid_size = avctx->width * avctx->height;
+            int pal_size = avpkt->size - vid_size;
+
+            if (avpkt->size > vid_size && pal_size <= AVPALETTE_SIZE) {
+                pal = avpkt->data + vid_size;
+                memcpy(context->palette->data, pal, pal_size);
+                frame->palette_has_changed = 1;
+            }
         }
     }
 
+    if ((avctx->pix_fmt==AV_PIX_FMT_RGB24    ||
+        avctx->pix_fmt==AV_PIX_FMT_BGR24     ||
+        avctx->pix_fmt==AV_PIX_FMT_GRAY8     ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB555LE  ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB555BE  ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB565LE  ||
+        avctx->pix_fmt==AV_PIX_FMT_MONOWHITE ||
+        avctx->pix_fmt==AV_PIX_FMT_MONOBLACK ||
+        avctx->pix_fmt==AV_PIX_FMT_PAL8) &&
+        FFALIGN(frame->linesize[0], linesize_align) * avctx->height <= buf_size)
+        frame->linesize[0] = FFALIGN(frame->linesize[0], linesize_align);
+
+    if (avctx->pix_fmt == AV_PIX_FMT_NV12 && avctx->codec_tag == MKTAG('N', 'V', '1', '2') &&
+        FFALIGN(frame->linesize[0], linesize_align) * avctx->height +
+        FFALIGN(frame->linesize[1], linesize_align) * ((avctx->height + 1) / 2) <= buf_size) {
+        int la0 = FFALIGN(frame->linesize[0], linesize_align);
+        frame->data[1] += (la0 - frame->linesize[0]) * avctx->height;
+        frame->linesize[0] = la0;
+        frame->linesize[1] = FFALIGN(frame->linesize[1], linesize_align);
+    }
+
     if ((avctx->pix_fmt == AV_PIX_FMT_PAL8 && buf_size < context->frame_size) ||
-        (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)) {
+        (desc->flags & FF_PSEUDOPAL)) {
         frame->buf[1]  = av_buffer_ref(context->palette);
-        if (!frame->buf[1])
+        if (!frame->buf[1]) {
+            av_buffer_unref(&frame->buf[0]);
             return AVERROR(ENOMEM);
+        }
         frame->data[1] = frame->buf[1]->data;
     }
+
     if (avctx->pix_fmt == AV_PIX_FMT_BGR24 &&
         ((frame->linesize[0] + 3) & ~3) * avctx->height <= buf_size)
         frame->linesize[0] = (frame->linesize[0] + 3) & ~3;
@@ -234,6 +447,11 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->codec_tag == MKTAG('Y', 'V', 'U', '9'))
         FFSWAP(uint8_t *, frame->data[1], frame->data[2]);
 
+    if (avctx->codec_tag == AV_RL32("I420") && (avctx->width+1)*(avctx->height+1) * 3/2 == buf_size) {
+        frame->data[1] = frame->data[1] +  (avctx->width+1)*(avctx->height+1) -avctx->width*avctx->height;
+        frame->data[2] = frame->data[2] + ((avctx->width+1)*(avctx->height+1) -avctx->width*avctx->height)*5/4;
+    }
+
     if (avctx->codec_tag == AV_RL32("yuv2") &&
         avctx->pix_fmt   == AV_PIX_FMT_YUYV422) {
         int x, y;
@@ -245,6 +463,23 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
+    if (avctx->codec_tag == AV_RL32("b64a") &&
+        avctx->pix_fmt   == AV_PIX_FMT_RGBA64BE) {
+        uint8_t *dst = frame->data[0];
+        uint64_t v;
+        int x;
+        for (x = 0; x >> 3 < avctx->width * avctx->height; x += 8) {
+            v = AV_RB64(&dst[x]);
+            AV_WB64(&dst[x], v << 16 | v >> 48);
+        }
+    }
+
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) { /* we have interlaced material flagged in container */
+        frame->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            frame->top_field_first = 1;
+    }
+
     *got_frame = 1;
     return buf_size;
 }
@@ -266,4 +501,6 @@ AVCodec ff_rawvideo_decoder = {
     .init           = raw_init_decoder,
     .close          = raw_close_decoder,
     .decode         = raw_decode,
+    .priv_class     = &rawdec_class,
+    .capabilities   = AV_CODEC_CAP_PARAM_CHANGE,
 };
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index 60bd0c7..d181b74 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -2,20 +2,20 @@
  * Raw Video Encoder
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,37 +39,46 @@ static av_cold int raw_encode_init(AVCodecContext *avctx)
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     avctx->bits_per_coded_sample = av_get_bits_per_pixel(desc);
     if(!avctx->codec_tag)
         avctx->codec_tag = avcodec_pix_fmt_to_codec_tag(avctx->pix_fmt);
+    avctx->bit_rate = ff_guess_coded_bitrate(avctx);
+
     return 0;
 }
 
 static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
                       const AVFrame *frame, int *got_packet)
 {
-    int ret = av_image_get_buffer_size(avctx->pix_fmt,
-                                       avctx->width, avctx->height, 1);
+    int ret = av_image_get_buffer_size(frame->format,
+                                       frame->width, frame->height, 1);
 
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_alloc_packet(pkt, ret)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
         return ret;
     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
-                                       frame->data, frame->linesize,
+                                       (const uint8_t **)frame->data, frame->linesize,
                                        frame->format,
                                        frame->width, frame->height, 1)) < 0)
         return ret;
 
     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
-       avctx->pix_fmt   == AV_PIX_FMT_YUYV422) {
+       frame->format   == AV_PIX_FMT_YUYV422) {
         int x;
-        for(x = 1; x < avctx->height*avctx->width*2; x += 2)
+        for(x = 1; x < frame->height*frame->width*2; x += 2)
             pkt->data[x] ^= 0x80;
+    } else if (avctx->codec_tag == AV_RL32("b64a") && ret > 0 &&
+        frame->format == AV_PIX_FMT_RGBA64BE) {
+        uint64_t v;
+        int x;
+        for (x = 0; x < frame->height * frame->width; x++) {
+            v = AV_RB64(&pkt->data[8 * x]);
+            AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16);
+        }
     }
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
diff --git a/libavcodec/rdft.c b/libavcodec/rdft.c
index 1965253..6ba7484 100644
--- a/libavcodec/rdft.c
+++ b/libavcodec/rdft.c
@@ -2,20 +2,20 @@
  * (I)RDFT transforms
  * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <stdlib.h>
@@ -28,28 +28,6 @@
  * (Inverse) Real Discrete Fourier Transforms.
  */
 
-/* sin(2*pi*x/n) for 0<=x<n/4, followed by n/2<=x<3n/4 */
-#if !CONFIG_HARDCODED_TABLES
-SINTABLE(16);
-SINTABLE(32);
-SINTABLE(64);
-SINTABLE(128);
-SINTABLE(256);
-SINTABLE(512);
-SINTABLE(1024);
-SINTABLE(2048);
-SINTABLE(4096);
-SINTABLE(8192);
-SINTABLE(16384);
-SINTABLE(32768);
-SINTABLE(65536);
-#endif
-static SINTABLE_CONST FFTSample * const ff_sin_tabs[] = {
-    NULL, NULL, NULL, NULL,
-    ff_sin_16, ff_sin_32, ff_sin_64, ff_sin_128, ff_sin_256, ff_sin_512, ff_sin_1024,
-    ff_sin_2048, ff_sin_4096, ff_sin_8192, ff_sin_16384, ff_sin_32768, ff_sin_65536,
-};
-
 /** Map one real FFT into two parallel real even and odd FFTs. Then interleave
  * the two real FFTs into one complex FFT. Unmangle the results.
  * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
@@ -57,7 +35,7 @@ static SINTABLE_CONST FFTSample * const ff_sin_tabs[] = {
 static void rdft_calc_c(RDFTContext *s, FFTSample *data)
 {
     int i, i1, i2;
-    FFTComplex ev, od;
+    FFTComplex ev, od, odsum;
     const int n = 1 << s->nbits;
     const float k1 = 0.5;
     const float k2 = 0.5 - s->inverse;
@@ -73,20 +51,31 @@ static void rdft_calc_c(RDFTContext *s, FFTSample *data)
     ev.re = data[0];
     data[0] = ev.re+data[1];
     data[1] = ev.re-data[1];
-    for (i = 1; i < (n>>2); i++) {
-        i1 = 2*i;
-        i2 = n-i1;
-        /* Separate even and odd FFTs */
-        ev.re =  k1*(data[i1  ]+data[i2  ]);
-        od.im = -k2*(data[i1  ]-data[i2  ]);
-        ev.im =  k1*(data[i1+1]-data[i2+1]);
-        od.re =  k2*(data[i1+1]+data[i2+1]);
-        /* Apply twiddle factors to the odd FFT and add to the even FFT */
-        data[i1  ] =  ev.re + od.re*tcos[i] - od.im*tsin[i];
-        data[i1+1] =  ev.im + od.im*tcos[i] + od.re*tsin[i];
-        data[i2  ] =  ev.re - od.re*tcos[i] + od.im*tsin[i];
-        data[i2+1] = -ev.im + od.im*tcos[i] + od.re*tsin[i];
+
+#define RDFT_UNMANGLE(sign0, sign1)                                         \
+    for (i = 1; i < (n>>2); i++) {                                          \
+        i1 = 2*i;                                                           \
+        i2 = n-i1;                                                          \
+        /* Separate even and odd FFTs */                                    \
+        ev.re =  k1*(data[i1  ]+data[i2  ]);                                \
+        od.im =  k2*(data[i2  ]-data[i1  ]);                                \
+        ev.im =  k1*(data[i1+1]-data[i2+1]);                                \
+        od.re =  k2*(data[i1+1]+data[i2+1]);                                \
+        /* Apply twiddle factors to the odd FFT and add to the even FFT */  \
+        odsum.re = od.re*tcos[i] sign0 od.im*tsin[i];                       \
+        odsum.im = od.im*tcos[i] sign1 od.re*tsin[i];                       \
+        data[i1  ] =  ev.re + odsum.re;                                     \
+        data[i1+1] =  ev.im + odsum.im;                                     \
+        data[i2  ] =  ev.re - odsum.re;                                     \
+        data[i2+1] =  odsum.im - ev.im;                                     \
+    }
+
+    if (s->negative_sin) {
+        RDFT_UNMANGLE(+,-)
+    } else {
+        RDFT_UNMANGLE(-,+)
     }
+
     data[2*i+1]=s->sign_convention*data[2*i+1];
     if (s->inverse) {
         data[0] *= k1;
@@ -99,28 +88,22 @@ static void rdft_calc_c(RDFTContext *s, FFTSample *data)
 av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
 {
     int n = 1 << nbits;
+    int ret;
 
     s->nbits           = nbits;
     s->inverse         = trans == IDFT_C2R || trans == DFT_C2R;
     s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1;
+    s->negative_sin    = trans == DFT_C2R || trans == DFT_R2C;
 
     if (nbits < 4 || nbits > 16)
-        return -1;
+        return AVERROR(EINVAL);
 
-    if (ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C) < 0)
-        return -1;
+    if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0)
+        return ret;
 
     ff_init_ff_cos_tabs(nbits);
     s->tcos = ff_cos_tabs[nbits];
-    s->tsin = ff_sin_tabs[nbits]+(trans == DFT_R2C || trans == DFT_C2R)*(n>>2);
-#if !CONFIG_HARDCODED_TABLES
-    {
-        int i;
-        const double theta = (trans == DFT_R2C || trans == DFT_C2R ? -1 : 1) * 2 * M_PI / n;
-        for (i = 0; i < (n >> 2); i++)
-            s->tsin[i] = sin(i * theta);
-    }
-#endif
+    s->tsin = ff_cos_tabs[nbits] + (n >> 2);
     s->rdft_calc   = rdft_calc_c;
 
     if (ARCH_ARM) ff_rdft_init_arm(s);
diff --git a/libavcodec/rdft.h b/libavcodec/rdft.h
index 8ff620f..ffafca7 100644
--- a/libavcodec/rdft.h
+++ b/libavcodec/rdft.h
@@ -2,52 +2,29 @@
  * (I)RDFT transforms
  * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_RDFT_H
+#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_RDFT_H
 
 #include "config.h"
 #include "fft.h"
 
-#if CONFIG_HARDCODED_TABLES
-#   define SINTABLE_CONST const
-#else
-#   define SINTABLE_CONST
-#endif
-
-#define SINTABLE(size) \
-    SINTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_sin_##size)[size/2]
-
-extern SINTABLE(16);
-extern SINTABLE(32);
-extern SINTABLE(64);
-extern SINTABLE(128);
-extern SINTABLE(256);
-extern SINTABLE(512);
-extern SINTABLE(1024);
-extern SINTABLE(2048);
-extern SINTABLE(4096);
-extern SINTABLE(8192);
-extern SINTABLE(16384);
-extern SINTABLE(32768);
-extern SINTABLE(65536);
-
 struct RDFTContext {
     int nbits;
     int inverse;
@@ -55,7 +32,8 @@ struct RDFTContext {
 
     /* pre/post rotation tables */
     const FFTSample *tcos;
-    SINTABLE_CONST FFTSample *tsin;
+    const FFTSample *tsin;
+    int negative_sin;
     FFTContext fft;
     void (*rdft_calc)(struct RDFTContext *s, FFTSample *z);
 };
diff --git a/libavcodec/realtextdec.c b/libavcodec/realtextdec.c
new file mode 100644
index 0000000..5084781
--- /dev/null
+++ b/libavcodec/realtextdec.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * RealText subtitle decoder
+ * @see http://service.real.com/help/library/guides/ProductionGuide/prodguide/htmfiles/realtext.htm
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+
+static int rt_event_to_ass(AVBPrint *buf, const char *p)
+{
+    int prev_chr_is_space = 1;
+
+    while (*p) {
+        if (*p != '<') {
+            if (!av_isspace(*p))
+                av_bprint_chars(buf, *p, 1);
+            else if (!prev_chr_is_space)
+                av_bprint_chars(buf, ' ', 1);
+            prev_chr_is_space = av_isspace(*p);
+        } else {
+            const char *end = strchr(p, '>');
+            if (!end)
+                break;
+            if (!av_strncasecmp(p, "<br/>", 5) ||
+                !av_strncasecmp(p, "<br>",  4)) {
+                av_bprintf(buf, "\\N");
+            }
+            p = end;
+        }
+        p++;
+    }
+    return 0;
+}
+
+static int realtext_decode_frame(AVCodecContext *avctx,
+                                 void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    FFASSDecoderContext *s = avctx->priv_data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, 4096);
+    if (ptr && avpkt->size > 0 && !rt_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect(sub, buf.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_realtext_decoder = {
+    .name           = "realtext",
+    .long_name      = NULL_IF_CONFIG_SMALL("RealText subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_REALTEXT,
+    .decode         = realtext_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/rectangle.h b/libavcodec/rectangle.h
index 616a637..df7c18a 100644
--- a/libavcodec/rectangle.h
+++ b/libavcodec/rectangle.h
@@ -2,20 +2,20 @@
  * rectangle filling function
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #ifndef AVCODEC_RECTANGLE_H
 #define AVCODEC_RECTANGLE_H
 
-#include <assert.h>
 #include "config.h"
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
 
 /**
  * fill a rectangle.
@@ -40,13 +40,14 @@
  */
 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
     uint8_t *p= (uint8_t*)vp;
-    assert(size==1 || size==2 || size==4);
-    assert(w<=4);
+    av_assert2(size==1 || size==2 || size==4);
+    av_assert2(w<=4);
 
     w      *= size;
     stride *= size;
 
-    assert((stride&(w-1))==0);
+    av_assert2((((long)vp)&(FFMIN(w, 8<<(HAVE_NEON|ARCH_PPC|HAVE_MMX))-1)) == 0);
+    av_assert2((stride&(w-1))==0);
     if(w==2){
         const uint16_t v= size==4 ? val : val*0x0101;
         *(uint16_t*)(p + 0*stride)= v;
@@ -116,8 +117,8 @@ static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride,
         *(uint32_t*)(p +12+3*stride)= val;
 #endif
     }else
-        assert(0);
-    assert(h==4);
+        av_assert2(0);
+    av_assert2(h==4);
 }
 
 #endif /* AVCODEC_RECTANGLE_H */
diff --git a/libavcodec/remove_extradata_bsf.c b/libavcodec/remove_extradata_bsf.c
index a89fa06..b762079 100644
--- a/libavcodec/remove_extradata_bsf.c
+++ b/libavcodec/remove_extradata_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 enum RemoveFreq {
     REMOVE_FREQ_KEYFRAME,
     REMOVE_FREQ_ALL,
+    REMOVE_FREQ_NONKEYFRAME,
 };
 
 typedef struct RemoveExtradataContext {
@@ -37,29 +38,26 @@ typedef struct RemoveExtradataContext {
     AVCodecContext *avctx;
 } RemoveExtradataContext;
 
-static int remove_extradata(AVBSFContext *ctx, AVPacket *out)
+static int remove_extradata(AVBSFContext *ctx, AVPacket *pkt)
 {
     RemoveExtradataContext *s = ctx->priv_data;
 
-    AVPacket *in;
     int ret;
 
-    ret = ff_bsf_get_packet(ctx, &in);
+    ret = ff_bsf_get_packet_ref(ctx, pkt);
     if (ret < 0)
         return ret;
 
     if (s->parser && s->parser->parser->split) {
         if (s->freq == REMOVE_FREQ_ALL ||
-            (s->freq == REMOVE_FREQ_KEYFRAME && in->flags & AV_PKT_FLAG_KEY)) {
-            int i = s->parser->parser->split(s->avctx, in->data, in->size);
-            in->data += i;
-            in->size -= i;
+            (s->freq == REMOVE_FREQ_NONKEYFRAME && !(pkt->flags & AV_PKT_FLAG_KEY)) ||
+            (s->freq == REMOVE_FREQ_KEYFRAME && pkt->flags & AV_PKT_FLAG_KEY)) {
+            int i = s->parser->parser->split(s->avctx, pkt->data, pkt->size);
+            pkt->data += i;
+            pkt->size -= i;
         }
     }
 
-    av_packet_move_ref(out, in);
-    av_packet_free(&in);
-
     return 0;
 }
 
@@ -92,10 +90,13 @@ static void remove_extradata_close(AVBSFContext *ctx)
 }
 
 #define OFFSET(x) offsetof(RemoveExtradataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
 static const AVOption options[] = {
-    { "freq", NULL, OFFSET(freq), AV_OPT_TYPE_INT, { .i64 = REMOVE_FREQ_KEYFRAME }, REMOVE_FREQ_KEYFRAME, REMOVE_FREQ_ALL, 0, "freq" },
-        { "keyframe", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_KEYFRAME }, .unit = "freq" },
-        { "all",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_ALL      }, .unit = "freq" },
+    { "freq", NULL, OFFSET(freq), AV_OPT_TYPE_INT, { .i64 = REMOVE_FREQ_KEYFRAME }, REMOVE_FREQ_KEYFRAME, REMOVE_FREQ_NONKEYFRAME, FLAGS, "freq" },
+        { "k",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_NONKEYFRAME }, .flags = FLAGS, .unit = "freq" },
+        { "keyframe", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_KEYFRAME }, .flags = FLAGS, .unit = "freq" },
+        { "e",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_ALL      }, .flags = FLAGS, .unit = "freq" },
+        { "all",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_ALL      }, .flags = FLAGS, .unit = "freq" },
     { NULL },
 };
 
diff --git a/libavcodec/reverse.c b/libavcodec/reverse.c
new file mode 100644
index 0000000..440bada
--- /dev/null
+++ b/libavcodec/reverse.c
@@ -0,0 +1 @@
+#include "libavutil/reverse.c"
diff --git a/libavcodec/rkmppdec.c b/libavcodec/rkmppdec.c
new file mode 100644
index 0000000..143d05b
--- /dev/null
+++ b/libavcodec/rkmppdec.c
@@ -0,0 +1,586 @@
+/*
+ * RockChip MPP Video Decoder
+ * Copyright (c) 2017 Lionel CHAZALLON
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <drm_fourcc.h>
+#include <pthread.h>
+#include <rockchip/mpp_buffer.h>
+#include <rockchip/rk_mpi.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "avcodec.h"
+#include "decode.h"
+#include "hwaccel.h"
+#include "internal.h"
+#include "libavutil/buffer.h"
+#include "libavutil/common.h"
+#include "libavutil/frame.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/log.h"
+
+#define RECEIVE_FRAME_TIMEOUT   100
+#define FRAMEGROUP_MAX_FRAMES   16
+#define INPUT_MAX_PACKETS       4
+
+typedef struct {
+    MppCtx ctx;
+    MppApi *mpi;
+    MppBufferGroup frame_group;
+
+    char first_packet;
+    char eos_reached;
+
+    AVBufferRef *frames_ref;
+    AVBufferRef *device_ref;
+} RKMPPDecoder;
+
+typedef struct {
+    AVClass *av_class;
+    AVBufferRef *decoder_ref;
+} RKMPPDecodeContext;
+
+typedef struct {
+    MppFrame frame;
+    AVBufferRef *decoder_ref;
+} RKMPPFrameContext;
+
+static MppCodingType rkmpp_get_codingtype(AVCodecContext *avctx)
+{
+    switch (avctx->codec_id) {
+    case AV_CODEC_ID_H264:          return MPP_VIDEO_CodingAVC;
+    case AV_CODEC_ID_HEVC:          return MPP_VIDEO_CodingHEVC;
+    case AV_CODEC_ID_VP8:           return MPP_VIDEO_CodingVP8;
+    case AV_CODEC_ID_VP9:           return MPP_VIDEO_CodingVP9;
+    default:                        return MPP_VIDEO_CodingUnused;
+    }
+}
+
+static uint32_t rkmpp_get_frameformat(MppFrameFormat mppformat)
+{
+    switch (mppformat) {
+    case MPP_FMT_YUV420SP:          return DRM_FORMAT_NV12;
+#ifdef DRM_FORMAT_NV12_10
+    case MPP_FMT_YUV420SP_10BIT:    return DRM_FORMAT_NV12_10;
+#endif
+    default:                        return 0;
+    }
+}
+
+static int rkmpp_write_data(AVCodecContext *avctx, uint8_t *buffer, int size, int64_t pts)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    RKMPPDecoder *decoder = (RKMPPDecoder *)rk_context->decoder_ref->data;
+    int ret;
+    MppPacket packet;
+
+    // create the MPP packet
+    ret = mpp_packet_init(&packet, buffer, size);
+    if (ret != MPP_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to init MPP packet (code = %d)\n", ret);
+        return AVERROR_UNKNOWN;
+    }
+
+    mpp_packet_set_pts(packet, pts);
+
+    if (!buffer)
+        mpp_packet_set_eos(packet);
+
+    ret = decoder->mpi->decode_put_packet(decoder->ctx, packet);
+    if (ret != MPP_OK) {
+        if (ret == MPP_ERR_BUFFER_FULL) {
+            av_log(avctx, AV_LOG_DEBUG, "Buffer full writing %d bytes to decoder\n", size);
+            ret = AVERROR(EAGAIN);
+        } else
+            ret = AVERROR_UNKNOWN;
+    }
+    else
+        av_log(avctx, AV_LOG_DEBUG, "Wrote %d bytes to decoder\n", size);
+
+    mpp_packet_deinit(&packet);
+
+    return ret;
+}
+
+static int rkmpp_close_decoder(AVCodecContext *avctx)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    av_buffer_unref(&rk_context->decoder_ref);
+    return 0;
+}
+
+static void rkmpp_release_decoder(void *opaque, uint8_t *data)
+{
+    RKMPPDecoder *decoder = (RKMPPDecoder *)data;
+
+    if (decoder->mpi) {
+        decoder->mpi->reset(decoder->ctx);
+        mpp_destroy(decoder->ctx);
+        decoder->ctx = NULL;
+    }
+
+    if (decoder->frame_group) {
+        mpp_buffer_group_put(decoder->frame_group);
+        decoder->frame_group = NULL;
+    }
+
+    av_buffer_unref(&decoder->frames_ref);
+    av_buffer_unref(&decoder->device_ref);
+
+    av_free(decoder);
+}
+
+static int rkmpp_init_decoder(AVCodecContext *avctx)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    RKMPPDecoder *decoder = NULL;
+    MppCodingType codectype = MPP_VIDEO_CodingUnused;
+    int ret;
+    RK_S64 paramS64;
+    RK_S32 paramS32;
+
+    avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
+
+    // create a decoder and a ref to it
+    decoder = av_mallocz(sizeof(RKMPPDecoder));
+    if (!decoder) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    rk_context->decoder_ref = av_buffer_create((uint8_t *)decoder, sizeof(*decoder), rkmpp_release_decoder,
+                                               NULL, AV_BUFFER_FLAG_READONLY);
+    if (!rk_context->decoder_ref) {
+        av_free(decoder);
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "Initializing RKMPP decoder.\n");
+
+    codectype = rkmpp_get_codingtype(avctx);
+    if (codectype == MPP_VIDEO_CodingUnused) {
+        av_log(avctx, AV_LOG_ERROR, "Unknown codec type (%d).\n", avctx->codec_id);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    ret = mpp_check_support_format(MPP_CTX_DEC, codectype);
+    if (ret != MPP_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Codec type (%d) unsupported by MPP\n", avctx->codec_id);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    // Create the MPP context
+    ret = mpp_create(&decoder->ctx, &decoder->mpi);
+    if (ret != MPP_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create MPP context (code = %d).\n", ret);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    // initialize mpp
+    ret = mpp_init(decoder->ctx, MPP_CTX_DEC, codectype);
+    if (ret != MPP_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to initialize MPP context (code = %d).\n", ret);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    // make decode calls blocking with a timeout
+    paramS32 = MPP_POLL_BLOCK;
+    ret = decoder->mpi->control(decoder->ctx, MPP_SET_OUTPUT_BLOCK, &paramS32);
+    if (ret != MPP_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set blocking mode on MPI (code = %d).\n", ret);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    paramS64 = RECEIVE_FRAME_TIMEOUT;
+    ret = decoder->mpi->control(decoder->ctx, MPP_SET_OUTPUT_BLOCK_TIMEOUT, &paramS64);
+    if (ret != MPP_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set block timeout on MPI (code = %d).\n", ret);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    ret = mpp_buffer_group_get_internal(&decoder->frame_group, MPP_BUFFER_TYPE_ION);
+    if (ret) {
+       av_log(avctx, AV_LOG_ERROR, "Failed to retrieve buffer group (code = %d)\n", ret);
+       ret = AVERROR_UNKNOWN;
+       goto fail;
+    }
+
+    ret = decoder->mpi->control(decoder->ctx, MPP_DEC_SET_EXT_BUF_GROUP, decoder->frame_group);
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to assign buffer group (code = %d)\n", ret);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    ret = mpp_buffer_group_limit_config(decoder->frame_group, 0, FRAMEGROUP_MAX_FRAMES);
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set buffer group limit (code = %d)\n", ret);
+        ret = AVERROR_UNKNOWN;
+        goto fail;
+    }
+
+    decoder->first_packet = 1;
+
+    av_log(avctx, AV_LOG_DEBUG, "RKMPP decoder initialized successfully.\n");
+
+    decoder->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
+    if (!decoder->device_ref) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    ret = av_hwdevice_ctx_init(decoder->device_ref);
+    if (ret < 0)
+        goto fail;
+
+    return 0;
+
+fail:
+    av_log(avctx, AV_LOG_ERROR, "Failed to initialize RKMPP decoder.\n");
+    rkmpp_close_decoder(avctx);
+    return ret;
+}
+
+static int rkmpp_send_packet(AVCodecContext *avctx, const AVPacket *avpkt)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    RKMPPDecoder *decoder = (RKMPPDecoder *)rk_context->decoder_ref->data;
+    int ret;
+
+    // handle EOF
+    if (!avpkt->size) {
+        av_log(avctx, AV_LOG_DEBUG, "End of stream.\n");
+        decoder->eos_reached = 1;
+        ret = rkmpp_write_data(avctx, NULL, 0, 0);
+        if (ret)
+            av_log(avctx, AV_LOG_ERROR, "Failed to send EOS to decoder (code = %d)\n", ret);
+        return ret;
+    }
+
+    // on first packet, send extradata
+    if (decoder->first_packet) {
+        if (avctx->extradata_size) {
+            ret = rkmpp_write_data(avctx, avctx->extradata,
+                                            avctx->extradata_size,
+                                            avpkt->pts);
+            if (ret) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to write extradata to decoder (code = %d)\n", ret);
+                return ret;
+            }
+        }
+        decoder->first_packet = 0;
+    }
+
+    // now send packet
+    ret = rkmpp_write_data(avctx, avpkt->data, avpkt->size, avpkt->pts);
+    if (ret && ret!=AVERROR(EAGAIN))
+        av_log(avctx, AV_LOG_ERROR, "Failed to write data to decoder (code = %d)\n", ret);
+
+    return ret;
+}
+
+static void rkmpp_release_frame(void *opaque, uint8_t *data)
+{
+    AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)data;
+    AVBufferRef *framecontextref = (AVBufferRef *)opaque;
+    RKMPPFrameContext *framecontext = (RKMPPFrameContext *)framecontextref->data;
+
+    mpp_frame_deinit(&framecontext->frame);
+    av_buffer_unref(&framecontext->decoder_ref);
+    av_buffer_unref(&framecontextref);
+
+    av_free(desc);
+}
+
+static int rkmpp_retrieve_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    RKMPPDecoder *decoder = (RKMPPDecoder *)rk_context->decoder_ref->data;
+    RKMPPFrameContext *framecontext = NULL;
+    AVBufferRef *framecontextref = NULL;
+    int ret;
+    MppFrame mppframe = NULL;
+    MppBuffer buffer = NULL;
+    AVDRMFrameDescriptor *desc = NULL;
+    AVDRMLayerDescriptor *layer = NULL;
+    int mode;
+    MppFrameFormat mppformat;
+    uint32_t drmformat;
+
+    ret = decoder->mpi->decode_get_frame(decoder->ctx, &mppframe);
+    if (ret != MPP_OK && ret != MPP_ERR_TIMEOUT) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to get a frame from MPP (code = %d)\n", ret);
+        goto fail;
+    }
+
+    if (mppframe) {
+        // Check whether we have a special frame or not
+        if (mpp_frame_get_info_change(mppframe)) {
+            AVHWFramesContext *hwframes;
+
+            av_log(avctx, AV_LOG_INFO, "Decoder noticed an info change (%dx%d), format=%d\n",
+                                        (int)mpp_frame_get_width(mppframe), (int)mpp_frame_get_height(mppframe),
+                                        (int)mpp_frame_get_fmt(mppframe));
+
+            avctx->width = mpp_frame_get_width(mppframe);
+            avctx->height = mpp_frame_get_height(mppframe);
+
+            decoder->mpi->control(decoder->ctx, MPP_DEC_SET_INFO_CHANGE_READY, NULL);
+
+            av_buffer_unref(&decoder->frames_ref);
+
+            decoder->frames_ref = av_hwframe_ctx_alloc(decoder->device_ref);
+            if (!decoder->frames_ref) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            mppformat = mpp_frame_get_fmt(mppframe);
+            drmformat = rkmpp_get_frameformat(mppformat);
+
+            hwframes = (AVHWFramesContext*)decoder->frames_ref->data;
+            hwframes->format    = AV_PIX_FMT_DRM_PRIME;
+            hwframes->sw_format = drmformat == DRM_FORMAT_NV12 ? AV_PIX_FMT_NV12 : AV_PIX_FMT_NONE;
+            hwframes->width     = avctx->width;
+            hwframes->height    = avctx->height;
+            ret = av_hwframe_ctx_init(decoder->frames_ref);
+            if (ret < 0)
+                goto fail;
+
+            // here decoder is fully initialized, we need to feed it again with data
+            ret = AVERROR(EAGAIN);
+            goto fail;
+        } else if (mpp_frame_get_eos(mppframe)) {
+            av_log(avctx, AV_LOG_DEBUG, "Received a EOS frame.\n");
+            decoder->eos_reached = 1;
+            ret = AVERROR_EOF;
+            goto fail;
+        } else if (mpp_frame_get_discard(mppframe)) {
+            av_log(avctx, AV_LOG_DEBUG, "Received a discard frame.\n");
+            ret = AVERROR(EAGAIN);
+            goto fail;
+        } else if (mpp_frame_get_errinfo(mppframe)) {
+            av_log(avctx, AV_LOG_ERROR, "Received a errinfo frame.\n");
+            ret = AVERROR_UNKNOWN;
+            goto fail;
+        }
+
+        // here we should have a valid frame
+        av_log(avctx, AV_LOG_DEBUG, "Received a frame.\n");
+
+        // setup general frame fields
+        frame->format           = AV_PIX_FMT_DRM_PRIME;
+        frame->width            = mpp_frame_get_width(mppframe);
+        frame->height           = mpp_frame_get_height(mppframe);
+        frame->pts              = mpp_frame_get_pts(mppframe);
+        frame->color_range      = mpp_frame_get_color_range(mppframe);
+        frame->color_primaries  = mpp_frame_get_color_primaries(mppframe);
+        frame->color_trc        = mpp_frame_get_color_trc(mppframe);
+        frame->colorspace       = mpp_frame_get_colorspace(mppframe);
+
+        mode = mpp_frame_get_mode(mppframe);
+        frame->interlaced_frame = ((mode & MPP_FRAME_FLAG_FIELD_ORDER_MASK) == MPP_FRAME_FLAG_DEINTERLACED);
+        frame->top_field_first  = ((mode & MPP_FRAME_FLAG_FIELD_ORDER_MASK) == MPP_FRAME_FLAG_TOP_FIRST);
+
+        mppformat = mpp_frame_get_fmt(mppframe);
+        drmformat = rkmpp_get_frameformat(mppformat);
+
+        // now setup the frame buffer info
+        buffer = mpp_frame_get_buffer(mppframe);
+        if (buffer) {
+            desc = av_mallocz(sizeof(AVDRMFrameDescriptor));
+            if (!desc) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            desc->nb_objects = 1;
+            desc->objects[0].fd = mpp_buffer_get_fd(buffer);
+            desc->objects[0].size = mpp_buffer_get_size(buffer);
+
+            desc->nb_layers = 1;
+            layer = &desc->layers[0];
+            layer->format = drmformat;
+            layer->nb_planes = 2;
+
+            layer->planes[0].object_index = 0;
+            layer->planes[0].offset = 0;
+            layer->planes[0].pitch = mpp_frame_get_hor_stride(mppframe);
+
+            layer->planes[1].object_index = 0;
+            layer->planes[1].offset = layer->planes[0].pitch * mpp_frame_get_ver_stride(mppframe);
+            layer->planes[1].pitch = layer->planes[0].pitch;
+
+            // we also allocate a struct in buf[0] that will allow to hold additionnal information
+            // for releasing properly MPP frames and decoder
+            framecontextref = av_buffer_allocz(sizeof(*framecontext));
+            if (!framecontextref) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            // MPP decoder needs to be closed only when all frames have been released.
+            framecontext = (RKMPPFrameContext *)framecontextref->data;
+            framecontext->decoder_ref = av_buffer_ref(rk_context->decoder_ref);
+            framecontext->frame = mppframe;
+
+            frame->data[0]  = (uint8_t *)desc;
+            frame->buf[0]   = av_buffer_create((uint8_t *)desc, sizeof(*desc), rkmpp_release_frame,
+                                               framecontextref, AV_BUFFER_FLAG_READONLY);
+
+            if (!frame->buf[0]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            frame->hw_frames_ctx = av_buffer_ref(decoder->frames_ref);
+            if (!frame->hw_frames_ctx) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            return 0;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Failed to retrieve the frame buffer, frame is dropped (code = %d)\n", ret);
+            mpp_frame_deinit(&mppframe);
+        }
+    } else if (decoder->eos_reached) {
+        return AVERROR_EOF;
+    } else if (ret == MPP_ERR_TIMEOUT) {
+        av_log(avctx, AV_LOG_DEBUG, "Timeout when trying to get a frame from MPP\n");
+    }
+
+    return AVERROR(EAGAIN);
+
+fail:
+    if (mppframe)
+        mpp_frame_deinit(&mppframe);
+
+    if (framecontext)
+        av_buffer_unref(&framecontext->decoder_ref);
+
+    if (framecontextref)
+        av_buffer_unref(&framecontextref);
+
+    if (desc)
+        av_free(desc);
+
+    return ret;
+}
+
+static int rkmpp_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    RKMPPDecoder *decoder = (RKMPPDecoder *)rk_context->decoder_ref->data;
+    int ret = MPP_NOK;
+    AVPacket pkt = {0};
+    RK_S32 usedslots, freeslots;
+
+    if (!decoder->eos_reached) {
+        // we get the available slots in decoder
+        ret = decoder->mpi->control(decoder->ctx, MPP_DEC_GET_STREAM_COUNT, &usedslots);
+        if (ret != MPP_OK) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to get decoder used slots (code = %d).\n", ret);
+            return ret;
+        }
+
+        freeslots = INPUT_MAX_PACKETS - usedslots;
+        if (freeslots > 0) {
+            ret = ff_decode_get_packet(avctx, &pkt);
+            if (ret < 0 && ret != AVERROR_EOF) {
+                return ret;
+            }
+
+            ret = rkmpp_send_packet(avctx, &pkt);
+            av_packet_unref(&pkt);
+
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to send packet to decoder (code = %d)\n", ret);
+                return ret;
+            }
+        }
+
+        // make sure we keep decoder full
+        if (freeslots > 1)
+            return AVERROR(EAGAIN);
+    }
+
+    return rkmpp_retrieve_frame(avctx, frame);
+}
+
+static void rkmpp_flush(AVCodecContext *avctx)
+{
+    RKMPPDecodeContext *rk_context = avctx->priv_data;
+    RKMPPDecoder *decoder = (RKMPPDecoder *)rk_context->decoder_ref->data;
+    int ret = MPP_NOK;
+
+    av_log(avctx, AV_LOG_DEBUG, "Flush.\n");
+
+    ret = decoder->mpi->reset(decoder->ctx);
+    if (ret == MPP_OK) {
+        decoder->first_packet = 1;
+    } else
+        av_log(avctx, AV_LOG_ERROR, "Failed to reset MPI (code = %d)\n", ret);
+}
+
+static const AVCodecHWConfigInternal *rkmpp_hw_configs[] = {
+    HW_CONFIG_INTERNAL(DRM_PRIME),
+    NULL
+};
+
+#define RKMPP_DEC_CLASS(NAME) \
+    static const AVClass rkmpp_##NAME##_dec_class = { \
+        .class_name = "rkmpp_" #NAME "_dec", \
+        .version    = LIBAVUTIL_VERSION_INT, \
+    };
+
+#define RKMPP_DEC(NAME, ID, BSFS) \
+    RKMPP_DEC_CLASS(NAME) \
+    AVCodec ff_##NAME##_rkmpp_decoder = { \
+        .name           = #NAME "_rkmpp", \
+        .long_name      = NULL_IF_CONFIG_SMALL(#NAME " (rkmpp)"), \
+        .type           = AVMEDIA_TYPE_VIDEO, \
+        .id             = ID, \
+        .priv_data_size = sizeof(RKMPPDecodeContext), \
+        .init           = rkmpp_init_decoder, \
+        .close          = rkmpp_close_decoder, \
+        .receive_frame  = rkmpp_receive_frame, \
+        .flush          = rkmpp_flush, \
+        .priv_class     = &rkmpp_##NAME##_dec_class, \
+        .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING | AV_CODEC_CAP_HARDWARE, \
+        .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
+                                                         AV_PIX_FMT_NONE}, \
+        .hw_configs     = rkmpp_hw_configs, \
+        .bsfs           = BSFS, \
+        .wrapper_name   = "rkmpp", \
+    };
+
+RKMPP_DEC(h264,  AV_CODEC_ID_H264,          "h264_mp4toannexb")
+RKMPP_DEC(hevc,  AV_CODEC_ID_HEVC,          "hevc_mp4toannexb")
+RKMPP_DEC(vp8,   AV_CODEC_ID_VP8,           NULL)
+RKMPP_DEC(vp9,   AV_CODEC_ID_VP9,           NULL)
diff --git a/libavcodec/rl.c b/libavcodec/rl.c
index 9a9cbd9..6eac306 100644
--- a/libavcodec/rl.c
+++ b/libavcodec/rl.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,6 +20,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/mem.h"
 
 #include "rl.h"
@@ -101,9 +102,13 @@ fail:
     return AVERROR(ENOMEM);
 }
 
-av_cold void ff_rl_init_vlc(RLTable *rl)
+av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
 {
     int i, q;
+    VLC_TYPE table[1500][2] = {{0}};
+    VLC vlc = { .table = table, .table_allocated = static_size };
+    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    init_vlc(&vlc, 9, rl->n + 1, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
 
     for (q = 0; q < 32; q++) {
         int qmul = q * 2;
@@ -113,9 +118,9 @@ av_cold void ff_rl_init_vlc(RLTable *rl)
             qmul = 1;
             qadd = 0;
         }
-        for (i = 0; i < rl->vlc.table_size; i++) {
-            int code = rl->vlc.table[i][0];
-            int len  = rl->vlc.table[i][1];
+        for (i = 0; i < vlc.table_size; i++) {
+            int code = vlc.table[i][0];
+            int len  = vlc.table[i][1];
             int level, run;
 
             if (len == 0) { // illegal code
diff --git a/libavcodec/rl.h b/libavcodec/rl.h
index a5725ce..9a767bc 100644
--- a/libavcodec/rl.h
+++ b/libavcodec/rl.h
@@ -2,20 +2,20 @@
  * Copyright (c) 2000-2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,6 @@ typedef struct RLTable {
     uint8_t *index_run[2];         ///< encoding only
     int8_t *max_level[2];          ///< encoding & decoding
     int8_t *max_run[2];            ///< encoding & decoding
-    VLC vlc;                       ///< decoding only deprecated FIXME remove
     RL_VLC_ELEM *rl_vlc[32];       ///< decoding only
 } RLTable;
 
@@ -54,7 +53,7 @@ typedef struct RLTable {
  *                     the level and run tables, if this is NULL av_malloc() will be used
  */
 int ff_rl_init(RLTable *rl, uint8_t static_store[2][2*MAX_RUN + MAX_LEVEL + 3]);
-void ff_rl_init_vlc(RLTable *rl);
+void ff_rl_init_vlc(RLTable *rl, unsigned static_size);
 
 /**
  * Free the contents of a dynamically allocated table.
@@ -65,15 +64,12 @@ void ff_rl_free(RLTable *rl);
 {\
     int q;\
     static RL_VLC_ELEM rl_vlc_table[32][static_size];\
-    INIT_VLC_STATIC(&rl.vlc, 9, rl.n + 1,\
-             &rl.table_vlc[0][1], 4, 2,\
-             &rl.table_vlc[0][0], 4, 2, static_size);\
 \
     if(!rl.rl_vlc[0]){\
         for(q=0; q<32; q++)\
             rl.rl_vlc[q]= rl_vlc_table[q];\
 \
-        ff_rl_init_vlc(&rl);\
+        ff_rl_init_vlc(&rl, static_size);\
     }\
 }
 
diff --git a/libavcodec/rl2.c b/libavcodec/rl2.c
index c42a1cd..6662979 100644
--- a/libavcodec/rl2.c
+++ b/libavcodec/rl2.c
@@ -2,20 +2,20 @@
  * RL2 Video Decoder
  * Copyright (C) 2008 Sascha Sommer (saschasommer@freenet.de)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -155,7 +155,7 @@ static av_cold int rl2_decode_init(AVCodecContext *avctx)
 
     /** initialize palette */
     for (i = 0; i < AVPALETTE_COUNT; i++)
-        s->palette[i] = AV_RB24(&avctx->extradata[6 + i * 3]);
+        s->palette[i] = 0xFFU << 24 | AV_RB24(&avctx->extradata[6 + i * 3]);
 
     /** decode background frame if present */
     back_size = avctx->extradata_size - EXTRADATA1_SIZE;
@@ -181,10 +181,8 @@ static int rl2_decode_frame(AVCodecContext *avctx,
     int ret, buf_size  = avpkt->size;
     Rl2Context *s = avctx->priv_data;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /** run length decode */
     rl2_rle_decode(s, buf, buf_size, frame->data[0], frame->linesize[0],
@@ -209,7 +207,7 @@ static av_cold int rl2_decode_end(AVCodecContext *avctx)
 {
     Rl2Context *s = avctx->priv_data;
 
-    av_free(s->back_frame);
+    av_freep(&s->back_frame);
 
     return 0;
 }
diff --git a/libavcodec/rle.c b/libavcodec/rle.c
index 6c8bf27..792bc05 100644
--- a/libavcodec/rle.c
+++ b/libavcodec/rle.c
@@ -2,20 +2,20 @@
  * RLE encoder
  * Copyright (c) 2007 Bobby Bingham
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rle.h b/libavcodec/rle.h
index f1b0c78..a92edf7 100644
--- a/libavcodec/rle.h
+++ b/libavcodec/rle.h
@@ -1,20 +1,20 @@
 /*
  * RLE encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rnd_avg.h b/libavcodec/rnd_avg.h
index 412cda5..344775e 100644
--- a/libavcodec/rnd_avg.h
+++ b/libavcodec/rnd_avg.h
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
+ * Copyright (c) 2011 Oskar Arvidsson
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/roqaudioenc.c b/libavcodec/roqaudioenc.c
index f687f5c..5154604 100644
--- a/libavcodec/roqaudioenc.c
+++ b/libavcodec/roqaudioenc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2005 Eric Lasota
  *    Based on RoQ specs (c)2001 Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -160,10 +160,8 @@ static int roq_dpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     else
         data_size = avctx->channels * avctx->frame_size;
 
-    if ((ret = ff_alloc_packet(avpkt, ROQ_HEADER_SIZE + data_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, ROQ_HEADER_SIZE + data_size, 0)) < 0)
         return ret;
-    }
     out = avpkt->data;
 
     bytestream_put_byte(&out, stereo ? 0x21 : 0x20);
diff --git a/libavcodec/roqvideo.c b/libavcodec/roqvideo.c
index b0fd6ba..8eda93c 100644
--- a/libavcodec/roqvideo.c
+++ b/libavcodec/roqvideo.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2003 Mike Melanson
  * Copyright (C) 2003 Dr. Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/roqvideo.h b/libavcodec/roqvideo.h
index 3f00022..3da6eaa 100644
--- a/libavcodec/roqvideo.h
+++ b/libavcodec/roqvideo.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2003 Mike Melanson
  * Copyright (C) 2003 Dr. Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,7 @@ struct RoqTempData;
 
 typedef struct RoqContext {
 
+    const AVClass *class;
     AVCodecContext *avctx;
     AVFrame *last_frame;
     AVFrame *current_frame;
@@ -69,6 +70,9 @@ typedef struct RoqContext {
     const AVFrame *frame_to_enc;
     uint8_t *out_buf;
     struct RoqTempData *tmpData;
+
+    int quake3_compat; // Quake 3 compatibility option
+
 } RoqContext;
 
 #define RoQ_INFO              0x1001
diff --git a/libavcodec/roqvideodec.c b/libavcodec/roqvideodec.c
index d141064..0ab7d39 100644
--- a/libavcodec/roqvideodec.c
+++ b/libavcodec/roqvideodec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,10 +25,7 @@
  *   http://www.csse.monash.edu.au/~timf/
  */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 
 #include "avcodec.h"
@@ -74,9 +71,19 @@ static void roqvideo_decode_frame(RoqContext *ri)
 
     chunk_start = bytestream2_tell(&ri->gb);
     xpos = ypos = 0;
+
+    if (chunk_size > bytestream2_get_bytes_left(&ri->gb)) {
+        av_log(ri->avctx, AV_LOG_ERROR, "Chunk does not fit in input buffer\n");
+        chunk_size = bytestream2_get_bytes_left(&ri->gb);
+    }
+
     while (bytestream2_tell(&ri->gb) < chunk_start + chunk_size) {
         for (yp = ypos; yp < ypos + 16; yp += 8)
             for (xp = xpos; xp < xpos + 16; xp += 8) {
+                if (bytestream2_tell(&ri->gb) >= chunk_start + chunk_size) {
+                    av_log(ri->avctx, AV_LOG_VERBOSE, "Chunk is too short\n");
+                    return;
+                }
                 if (vqflg_pos < 0) {
                     vqflg = bytestream2_get_le16(&ri->gb);
                     vqflg_pos = 7;
@@ -108,6 +115,10 @@ static void roqvideo_decode_frame(RoqContext *ri)
                         if(k & 0x01) x += 4;
                         if(k & 0x02) y += 4;
 
+                        if (bytestream2_tell(&ri->gb) >= chunk_start + chunk_size) {
+                            av_log(ri->avctx, AV_LOG_VERBOSE, "Chunk is too short\n");
+                            return;
+                        }
                         if (vqflg_pos < 0) {
                             vqflg = bytestream2_get_le16(&ri->gb);
                             vqflg_pos = 7;
@@ -142,7 +153,7 @@ static void roqvideo_decode_frame(RoqContext *ri)
                     }
                     break;
                 default:
-                    av_log(ri->avctx, AV_LOG_ERROR, "Unknown vq code: %d\n", vqid);
+                    av_assert2(0);
             }
         }
 
@@ -179,7 +190,8 @@ static av_cold int roq_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
     }
 
-    avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+    avctx->pix_fmt = AV_PIX_FMT_YUVJ444P;
+    avctx->color_range = AVCOL_RANGE_JPEG;
 
     return 0;
 }
@@ -194,10 +206,8 @@ static int roq_decode_frame(AVCodecContext *avctx,
     int copy = !s->current_frame->data[0] && s->last_frame->data[0];
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  RoQ: get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0)
         return ret;
-    }
 
     if (copy) {
         ret = av_frame_copy(s->current_frame, s->last_frame);
diff --git a/libavcodec/roqvideoenc.c b/libavcodec/roqvideoenc.c
index 6421ccc..ac05123 100644
--- a/libavcodec/roqvideoenc.c
+++ b/libavcodec/roqvideoenc.c
@@ -5,27 +5,27 @@
  * Copyright (C) 2004-2007 Eric Lasota
  *    Based on RoQ specs (C) 2001 Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * id RoQ encoder by Vitor. Based on the Switchblade3 library and the
- * Switchblade3 Libav glue by Eric Lasota.
+ * Switchblade3 FFmpeg glue by Eric Lasota.
  */
 
 /*
@@ -57,6 +57,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/opt.h"
 #include "roqvideo.h"
 #include "bytestream.h"
 #include "elbg.h"
@@ -69,7 +70,7 @@
  * Maximum number of generated 4x4 codebooks. Can't be 256 to workaround a
  * Quake 3 bug.
  */
-#define MAX_CBS_4x4 255
+#define MAX_CBS_4x4 256
 
 #define MAX_CBS_2x2 256 ///< Maximum number of 2x2 codebooks.
 
@@ -245,7 +246,7 @@ static int create_cel_evals(RoqContext *enc, RoqTempdata *tempData)
 {
     int n=0, x, y, i;
 
-    tempData->cel_evals = av_malloc(enc->width*enc->height/64 * sizeof(CelEvaluation));
+    tempData->cel_evals = av_malloc_array(enc->width*enc->height/64, sizeof(CelEvaluation));
     if (!tempData->cel_evals)
         return AVERROR(ENOMEM);
 
@@ -541,7 +542,7 @@ static void remap_codebooks(RoqContext *enc, RoqTempdata *tempData)
     int i, j, idx=0;
 
     /* Make remaps for the final codebook usage */
-    for (i=0; i<MAX_CBS_4x4; i++) {
+    for (i=0; i<(enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4); i++) {
         if (tempData->codebooks.usedCB4[i]) {
             tempData->i2f4[i] = idx;
             tempData->f2i4[idx] = i;
@@ -783,14 +784,14 @@ static int generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
     int i, j, k, ret = 0;
     int c_size = size*size/4;
     int *buf;
-    int *codebook = av_malloc(6*c_size*cbsize*sizeof(int));
+    int *codebook = av_malloc_array(6*c_size, cbsize*sizeof(int));
     int *closest_cb;
 
     if (!codebook)
         return AVERROR(ENOMEM);
 
     if (size == 4) {
-        closest_cb = av_malloc(6*c_size*inputCount*sizeof(int));
+        closest_cb = av_malloc_array(6*c_size, inputCount*sizeof(int));
         if (!closest_cb) {
             ret = AVERROR(ENOMEM);
             goto out;
@@ -798,11 +799,11 @@ static int generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
     } else
         closest_cb = tempdata->closest_cb2;
 
-    ret = ff_init_elbg(points, 6 * c_size, inputCount, codebook,
+    ret = avpriv_init_elbg(points, 6 * c_size, inputCount, codebook,
                        cbsize, 1, closest_cb, &enc->randctx);
     if (ret < 0)
         goto out;
-    ret = ff_do_elbg(points, 6 * c_size, inputCount, codebook,
+    ret = avpriv_do_elbg(points, 6 * c_size, inputCount, codebook,
                      cbsize, 1, closest_cb, &enc->randctx);
     if (ret < 0)
         goto out;
@@ -831,8 +832,8 @@ static int generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
     int max = enc->width*enc->height/16;
     uint8_t mb2[3*4];
     roq_cell *results4 = av_malloc(sizeof(roq_cell)*MAX_CBS_4x4*4);
-    uint8_t *yuvClusters=av_malloc(sizeof(int)*max*6*4);
-    int *points = av_malloc(max*6*4*sizeof(int));
+    uint8_t *yuvClusters=av_malloc_array(max, sizeof(int)*6*4);
+    int *points = av_malloc_array(max, 6*4*sizeof(int));
     int bias;
 
     if (!results4 || !yuvClusters || !points) {
@@ -851,12 +852,12 @@ static int generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
 
     /* Create 4x4 codebooks */
     if ((ret = generate_codebook(enc, tempData, points, max,
-                                 results4, 4, MAX_CBS_4x4)) < 0)
+                                 results4, 4, (enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4))) < 0)
         goto out;
 
-    codebooks->numCB4 = MAX_CBS_4x4;
+    codebooks->numCB4 = (enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4);
 
-    tempData->closest_cb2 = av_malloc(max*4*sizeof(int));
+    tempData->closest_cb2 = av_malloc_array(max, 4*sizeof(int));
     if (!tempData->closest_cb2) {
         ret = AVERROR(ENOMEM);
         goto out;
@@ -917,10 +918,14 @@ static int roq_encode_video(RoqContext *enc)
         gather_data_for_cel(tempData->cel_evals + i, enc, tempData);
 
     /* Quake 3 can't handle chunks bigger than 65535 bytes */
-    if (tempData->mainChunkSize/8 > 65535) {
+    if (tempData->mainChunkSize/8 > 65535 && enc->quake3_compat) {
+        if (enc->lambda > 100000) {
+            av_log(enc->avctx, AV_LOG_ERROR, "Cannot encode video in Quake compatible form\n");
+            return AVERROR(EINVAL);
+        }
         av_log(enc->avctx, AV_LOG_ERROR,
-               "Warning, generated a frame too big (%d > 65535), "
-               "try using a smaller qscale value.\n",
+               "Warning, generated a frame too big for Quake (%d > 65535), "
+               "now switching to a bigger qscale value.\n",
                tempData->mainChunkSize/8);
         enc->lambda *= 1.5;
         tempData->mainChunkSize = 0;
@@ -945,8 +950,8 @@ static int roq_encode_video(RoqContext *enc)
     FFSWAP(motion_vect *, enc->last_motion4, enc->this_motion4);
     FFSWAP(motion_vect *, enc->last_motion8, enc->this_motion8);
 
-    av_free(tempData->cel_evals);
-    av_free(tempData->closest_cb2);
+    av_freep(&tempData->cel_evals);
+    av_freep(&tempData->closest_cb2);
 
     enc->framesSinceKeyframe++;
 
@@ -960,11 +965,11 @@ static av_cold int roq_encode_end(AVCodecContext *avctx)
     av_frame_free(&enc->current_frame);
     av_frame_free(&enc->last_frame);
 
-    av_free(enc->tmpData);
-    av_free(enc->this_motion4);
-    av_free(enc->last_motion4);
-    av_free(enc->this_motion8);
-    av_free(enc->last_motion8);
+    av_freep(&enc->tmpData);
+    av_freep(&enc->this_motion4);
+    av_freep(&enc->last_motion4);
+    av_freep(&enc->this_motion8);
+    av_freep(&enc->last_motion8);
 
     return 0;
 }
@@ -980,11 +985,16 @@ static av_cold int roq_encode_init(AVCodecContext *avctx)
     enc->framesSinceKeyframe = 0;
     if ((avctx->width & 0xf) || (avctx->height & 0xf)) {
         av_log(avctx, AV_LOG_ERROR, "Dimensions must be divisible by 16\n");
-        return -1;
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->width > 65535 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions are max %d\n", enc->quake3_compat ? 32768 : 65535);
+        return AVERROR(EINVAL);
     }
 
     if (((avctx->width)&(avctx->width-1))||((avctx->height)&(avctx->height-1)))
-        av_log(avctx, AV_LOG_ERROR, "Warning: dimensions not power of two\n");
+        av_log(avctx, AV_LOG_ERROR, "Warning: dimensions not power of two, this is not supported by quake\n");
 
     enc->width = avctx->width;
     enc->height = avctx->height;
@@ -1002,16 +1012,22 @@ static av_cold int roq_encode_init(AVCodecContext *avctx)
     enc->tmpData      = av_malloc(sizeof(RoqTempdata));
 
     enc->this_motion4 =
-        av_mallocz((enc->width*enc->height/16)*sizeof(motion_vect));
+        av_mallocz_array((enc->width*enc->height/16), sizeof(motion_vect));
 
     enc->last_motion4 =
-        av_malloc ((enc->width*enc->height/16)*sizeof(motion_vect));
+        av_malloc_array ((enc->width*enc->height/16), sizeof(motion_vect));
 
     enc->this_motion8 =
-        av_mallocz((enc->width*enc->height/64)*sizeof(motion_vect));
+        av_mallocz_array((enc->width*enc->height/64), sizeof(motion_vect));
 
     enc->last_motion8 =
-        av_malloc ((enc->width*enc->height/64)*sizeof(motion_vect));
+        av_malloc_array ((enc->width*enc->height/64), sizeof(motion_vect));
+
+    if (!enc->tmpData || !enc->this_motion4 || !enc->last_motion4 ||
+        !enc->this_motion8 || !enc->last_motion8) {
+        roq_encode_end(avctx);
+        return AVERROR(ENOMEM);
+    }
 
     return 0;
 }
@@ -1059,10 +1075,8 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* 138 bits max per 8x8 block +
      *     256 codebooks*(6 bytes 2x2 + 4 bytes 4x4) + 8 bytes frame header */
     size = ((enc->width * enc->height / 64) * 138 + 7) / 8 + 256 * (6 + 4) + 8;
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet with size %d.\n", size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0)
         return ret;
-    }
     enc->out_buf = pkt->data;
 
     /* Check for I-frame */
@@ -1072,11 +1086,9 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (enc->first_frame) {
         /* Alloc memory for the reconstruction data (we must know the stride
          for that) */
-        if (ff_get_buffer(avctx, enc->current_frame, 0) ||
-            ff_get_buffer(avctx, enc->last_frame, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "  RoQ: get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_get_buffer(avctx, enc->current_frame, 0)) < 0 ||
+            (ret = ff_get_buffer(avctx, enc->last_frame,    0)) < 0)
+            return ret;
 
         /* Before the first video frame, write a "video info" chunk */
         roq_write_video_info_chunk(enc);
@@ -1097,6 +1109,20 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     return 0;
 }
 
+#define OFFSET(x) offsetof(RoqContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "quake3_compat", "Whether to respect known limitations in Quake 3 decoder", OFFSET(quake3_compat), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE },
+    { NULL },
+};
+
+static const AVClass roq_class = {
+    .class_name = "RoQ",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_roq_encoder = {
     .name                 = "roqvideo",
     .long_name            = NULL_IF_CONFIG_SMALL("id RoQ video"),
@@ -1106,7 +1132,7 @@ AVCodec ff_roq_encoder = {
     .init                 = roq_encode_init,
     .encode2              = roq_encode_frame,
     .close                = roq_encode_end,
-    .supported_framerates = (const AVRational[]){ {30,1}, {0,0} },
-    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV444P,
+    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVJ444P,
                                                         AV_PIX_FMT_NONE },
+    .priv_class     = &roq_class,
 };
diff --git a/libavcodec/rpza.c b/libavcodec/rpza.c
index f3f3fbc..8e1efa2 100644
--- a/libavcodec/rpza.c
+++ b/libavcodec/rpza.c
@@ -2,20 +2,20 @@
  * Quicktime Video (RPZA) Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,13 +73,12 @@ typedef struct RpzaContext {
 static int rpza_decode_stream(RpzaContext *s)
 {
     int width = s->avctx->width;
-    int stride = s->frame->linesize[0] / 2;
-    int row_inc = stride - 4;
+    int stride, row_inc, ret;
     int chunk_size;
     uint16_t colorA = 0, colorB;
     uint16_t color4[4];
     uint16_t ta, tb;
-    uint16_t *pixels = (uint16_t *)s->frame->data[0];
+    uint16_t *pixels;
 
     int row_ptr = 0;
     int pixel_ptr = 0;
@@ -96,12 +95,25 @@ static int rpza_decode_stream(RpzaContext *s)
     chunk_size = bytestream2_get_be32(&s->gb) & 0x00FFFFFF;
 
     /* If length mismatch use size from MOV file and try to decode anyway */
-    if (chunk_size != bytestream2_get_bytes_left(&s->gb) - 4)
-        av_log(s->avctx, AV_LOG_WARNING, "MOV chunk size != encoded chunk size\n");
+    if (chunk_size != bytestream2_get_bytes_left(&s->gb) + 4)
+        av_log(s->avctx, AV_LOG_WARNING,
+               "MOV chunk size %d != encoded chunk size %d\n",
+               chunk_size,
+               bytestream2_get_bytes_left(&s->gb) + 4
+              );
 
     /* Number of 4x4 blocks in frame. */
     total_blocks = ((s->avctx->width + 3) / 4) * ((s->avctx->height + 3) / 4);
 
+    if (total_blocks / 32 > bytestream2_get_bytes_left(&s->gb))
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_reget_buffer(s->avctx, s->frame)) < 0)
+        return ret;
+    pixels = (uint16_t *)s->frame->data[0];
+    stride = s->frame->linesize[0] / 2;
+    row_inc = stride - 4;
+
     /* Process chunk data */
     while (bytestream2_get_bytes_left(&s->gb)) {
         uint8_t opcode = bytestream2_get_byte(&s->gb); /* Get opcode */
@@ -252,11 +264,6 @@ static int rpza_decode_frame(AVCodecContext *avctx,
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-        return ret;
-    }
-
     ret = rpza_decode_stream(s);
     if (ret < 0)
         return ret;
diff --git a/libavcodec/rscc.c b/libavcodec/rscc.c
index dfc2338..7d4e842 100644
--- a/libavcodec/rscc.c
+++ b/libavcodec/rscc.c
@@ -2,20 +2,20 @@
  * innoHeim/Rsupport Screen Capture Codec
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
  * and it can be deflated or not. Similarly, pixel data comes after the header
  * and a variable size value, and it can be deflated or just raw.
  *
- * Supports: PAL8, BGRA, BGR24, RGB555, RGB8
+ * Supports: PAL8, BGRA, BGR24, RGB555
  */
 
 #include <stdint.h>
@@ -64,6 +64,7 @@ typedef struct RsccContext {
     /* zlib interaction */
     uint8_t *inflated_buf;
     uLongf inflated_size;
+    int valid_pixels;
 } RsccContext;
 
 static av_cold int rscc_init(AVCodecContext *avctx)
@@ -85,8 +86,18 @@ static av_cold int rscc_init(AVCodecContext *avctx)
 
     /* Get pixel format and the size of the pixel */
     if (avctx->codec_tag == MKTAG('I', 'S', 'C', 'C')) {
-        avctx->pix_fmt = AV_PIX_FMT_BGRA;
-        ctx->component_size = 4;
+        if (avctx->extradata && avctx->extradata_size == 4) {
+            if ((avctx->extradata[0] >> 1) & 1) {
+                avctx->pix_fmt = AV_PIX_FMT_BGRA;
+                ctx->component_size = 4;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_BGR24;
+                ctx->component_size = 3;
+            }
+        } else {
+            avctx->pix_fmt = AV_PIX_FMT_BGRA;
+            ctx->component_size = 4;
+        }
     } else if (avctx->codec_tag == MKTAG('R', 'S', 'C', 'C')) {
         ctx->component_size = avctx->bits_per_coded_sample / 8;
         switch (avctx->bits_per_coded_sample) {
@@ -100,7 +111,7 @@ static av_cold int rscc_init(AVCodecContext *avctx)
             avctx->pix_fmt = AV_PIX_FMT_BGR24;
             break;
         case 32:
-            avctx->pix_fmt = AV_PIX_FMT_BGRA;
+            avctx->pix_fmt = AV_PIX_FMT_BGR0;
             break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Invalid bits per pixel value (%d)\n",
@@ -108,8 +119,9 @@ static av_cold int rscc_init(AVCodecContext *avctx)
             return AVERROR_INVALIDDATA;
         }
     } else {
-        av_log(avctx, AV_LOG_ERROR, "Invalid codec tag\n");
-        return AVERROR_INVALIDDATA;
+        avctx->pix_fmt = AV_PIX_FMT_BGR0;
+        ctx->component_size = 4;
+        av_log(avctx, AV_LOG_WARNING, "Invalid codec tag\n");
     }
 
     /* Store the value to check for keyframes */
@@ -156,6 +168,12 @@ static int rscc_decode_frame(AVCodecContext *avctx, void *data,
 
     /* Read number of tiles, and allocate the array */
     tiles_nb = bytestream2_get_le16(gbc);
+
+    if (tiles_nb == 0) {
+        av_log(avctx, AV_LOG_DEBUG, "no tiles\n");
+        return avpkt->size;
+    }
+
     av_fast_malloc(&ctx->tiles, &ctx->tiles_size,
                    tiles_nb * sizeof(*ctx->tiles));
     if (!ctx->tiles) {
@@ -210,6 +228,12 @@ static int rscc_decode_frame(AVCodecContext *avctx, void *data,
         ctx->tiles[i].y = bytestream2_get_le16(gbc);
         ctx->tiles[i].h = bytestream2_get_le16(gbc);
 
+        if (pixel_size + ctx->tiles[i].w * (int64_t)ctx->tiles[i].h * ctx->component_size > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid tile dimensions\n");
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
+
         pixel_size += ctx->tiles[i].w * ctx->tiles[i].h * ctx->component_size;
 
         ff_dlog(avctx, "tile %d orig(%d,%d) %dx%d.\n", i,
@@ -249,11 +273,27 @@ static int rscc_decode_frame(AVCodecContext *avctx, void *data,
 
     ff_dlog(avctx, "pixel_size %d packed_size %d.\n", pixel_size, packed_size);
 
+    if (packed_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid tile size %d\n", packed_size);
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
     /* Get pixels buffer, it may be deflated or just raw */
     if (pixel_size == packed_size) {
+        if (bytestream2_get_bytes_left(gbc) < pixel_size) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficient input for %d\n", pixel_size);
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
         pixels = gbc->buffer;
     } else {
         uLongf len = ctx->inflated_size;
+        if (bytestream2_get_bytes_left(gbc) < packed_size) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficient input for %d\n", packed_size);
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
         ret = uncompress(ctx->inflated_buf, &len, gbc->buffer, packed_size);
         if (ret) {
             av_log(avctx, AV_LOG_ERROR, "Pixel deflate error %d.\n", ret);
@@ -296,18 +336,25 @@ static int rscc_decode_frame(AVCodecContext *avctx, void *data,
 
     /* Palette handling */
     if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        int size;
         const uint8_t *palette = av_packet_get_side_data(avpkt,
                                                          AV_PKT_DATA_PALETTE,
-                                                         NULL);
-        if (palette) {
+                                                         &size);
+        if (palette && size == AVPALETTE_SIZE) {
             frame->palette_has_changed = 1;
             memcpy(ctx->palette, palette, AVPALETTE_SIZE);
+        } else if (palette) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
         }
-        memcpy(frame->data[1], ctx->palette, AVPALETTE_SIZE);
+        memcpy (frame->data[1], ctx->palette, AVPALETTE_SIZE);
     }
+    // We only return a picture when enough of it is undamaged, this avoids copying nearly broken frames around
+    if (ctx->valid_pixels < ctx->inflated_size)
+        ctx->valid_pixels += pixel_size;
+    if (ctx->valid_pixels >= ctx->inflated_size * (100 - avctx->discard_damaged_percentage) / 100)
+        *got_frame = 1;
 
-    *got_frame = 1;
-
+    ret = avpkt->size;
 end:
     av_free(inflated_tiles);
     return ret;
diff --git a/libavcodec/rtjpeg.c b/libavcodec/rtjpeg.c
index baa1f78..8e02bce 100644
--- a/libavcodec/rtjpeg.c
+++ b/libavcodec/rtjpeg.c
@@ -2,26 +2,24 @@
  * RTJpeg decoding functions
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
-
 #include "libavutil/common.h"
-
-#include "bitstream.h"
+#include "get_bits.h"
 #include "rtjpeg.h"
 
 #define PUT_COEFF(c) \
@@ -30,36 +28,34 @@
 
 /// aligns the bitstream to the given power of two
 #define ALIGN(a) \
-    n = (-bitstream_tell(bc)) & (a - 1); \
-    if (n)                               \
-        bitstream_skip(bc, n);
+    n = (-get_bits_count(gb)) & (a - 1); \
+    if (n) {skip_bits(gb, n);}
 
 /**
  * @brief read one block from stream
- * @param bc contains stream data
+ * @param gb contains stream data
  * @param block where data is written to
  * @param scan array containing the mapping stream address -> block position
  * @param quant quantization factors
  * @return 0 means the block is not coded, < 0 means an error occurred.
  *
- * Note: BitstreamContext is used to make the code simpler, since all data is
+ * Note: GetBitContext is used to make the code simpler, since all data is
  * aligned this could be done faster in a different way, e.g. as it is done
  * in MPlayer libmpcodecs/native/rtjpegn.c.
  */
-static inline int get_block(BitstreamContext *bc, int16_t *block,
-                            const uint8_t *scan, const uint32_t *quant)
-{
+static inline int get_block(GetBitContext *gb, int16_t *block, const uint8_t *scan,
+                            const uint32_t *quant) {
     int coeff, i, n;
     int8_t ac;
-    uint8_t dc = bitstream_read(bc, 8);
+    uint8_t dc = get_bits(gb, 8);
 
     // block not coded
     if (dc == 255)
        return 0;
 
     // number of non-zero coefficients
-    coeff = bitstream_read(bc, 6);
-    if (bitstream_bits_left(bc) < (coeff << 1))
+    coeff = get_bits(gb, 6);
+    if (get_bits_left(gb) < (coeff << 1))
         return AVERROR_INVALIDDATA;
 
     // normally we would only need to clear the (63 - coeff) last values,
@@ -68,7 +64,7 @@ static inline int get_block(BitstreamContext *bc, int16_t *block,
 
     // 2 bits per coefficient
     while (coeff) {
-        ac = bitstream_read_signed(bc, 2);
+        ac = get_sbits(gb, 2);
         if (ac == -2)
             break; // continue with more bits
         PUT_COEFF(ac);
@@ -76,10 +72,10 @@ static inline int get_block(BitstreamContext *bc, int16_t *block,
 
     // 4 bits per coefficient
     ALIGN(4);
-    if (bitstream_bits_left(bc) < (coeff << 2))
+    if (get_bits_left(gb) < (coeff << 2))
         return AVERROR_INVALIDDATA;
     while (coeff) {
-        ac = bitstream_read_signed(bc, 4);
+        ac = get_sbits(gb, 4);
         if (ac == -8)
             break; // continue with more bits
         PUT_COEFF(ac);
@@ -87,10 +83,10 @@ static inline int get_block(BitstreamContext *bc, int16_t *block,
 
     // 8 bits per coefficient
     ALIGN(8);
-    if (bitstream_bits_left(bc) < (coeff << 3))
+    if (get_bits_left(gb) < (coeff << 3))
         return AVERROR_INVALIDDATA;
     while (coeff) {
-        ac = bitstream_read_signed(bc, 8);
+        ac = get_sbits(gb, 8);
         PUT_COEFF(ac);
     }
 
@@ -109,19 +105,19 @@ static inline int get_block(BitstreamContext *bc, int16_t *block,
  */
 int ff_rtjpeg_decode_frame_yuv420(RTJpegContext *c, AVFrame *f,
                                   const uint8_t *buf, int buf_size) {
-    BitstreamContext bc;
+    GetBitContext gb;
     int w = c->w / 16, h = c->h / 16;
     int x, y, ret;
     uint8_t *y1 = f->data[0], *y2 = f->data[0] + 8 * f->linesize[0];
     uint8_t *u = f->data[1], *v = f->data[2];
 
-    if ((ret = bitstream_init8(&bc, buf, buf_size)) < 0)
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
         return ret;
 
     for (y = 0; y < h; y++) {
         for (x = 0; x < w; x++) {
 #define BLOCK(quant, dst, stride) do { \
-    int res = get_block(&bc, block, c->scan, quant); \
+    int res = get_block(&gb, block, c->scan, quant); \
     if (res < 0) \
         return res; \
     if (res > 0) \
@@ -146,7 +142,7 @@ int ff_rtjpeg_decode_frame_yuv420(RTJpegContext *c, AVFrame *f,
         u += 8 * (f->linesize[1] - w);
         v += 8 * (f->linesize[2] - w);
     }
-    return bitstream_tell(&bc) / 8;
+    return get_bits_count(&gb) / 8;
 }
 
 /**
diff --git a/libavcodec/rtjpeg.h b/libavcodec/rtjpeg.h
index cd30079..d22ff40 100644
--- a/libavcodec/rtjpeg.h
+++ b/libavcodec/rtjpeg.h
@@ -2,20 +2,20 @@
  * RTJpeg decoding functions
  * copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index 3b5f4df..595e217 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -309,7 +309,7 @@ static int rv20_decode_picture_header(RVDecContext *rv)
 {
     MpegEncContext *s = &rv->m;
     int seq, mb_pos, i, ret;
-    int rpr_bits;
+    int rpr_max;
 
     i = get_bits(&s->gb, 2);
     switch (i) {
@@ -330,6 +330,10 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->low_delay && s->pict_type == AV_PICTURE_TYPE_B) {
+        av_log(s->avctx, AV_LOG_ERROR, "low delay B\n");
+        return -1;
+    }
     if (!s->last_picture_ptr && s->pict_type == AV_PICTURE_TYPE_B) {
         av_log(s->avctx, AV_LOG_ERROR, "early B-frame\n");
         return AVERROR_INVALIDDATA;
@@ -347,17 +351,17 @@ static int rv20_decode_picture_header(RVDecContext *rv)
     }
 
     if (RV_GET_MINOR_VER(rv->sub_id) >= 2)
-        s->loop_filter = get_bits1(&s->gb);
+        s->loop_filter = get_bits1(&s->gb) && !s->avctx->lowres;
 
     if (RV_GET_MINOR_VER(rv->sub_id) <= 1)
         seq = get_bits(&s->gb, 8) << 7;
     else
         seq = get_bits(&s->gb, 13) << 2;
 
-    rpr_bits = s->avctx->extradata[1] & 7;
-    if (rpr_bits) {
+    rpr_max = s->avctx->extradata[1] & 7;
+    if (rpr_max) {
         int f, new_w, new_h;
-        rpr_bits = FFMIN((rpr_bits >> 1) + 1, 3);
+        int rpr_bits = av_log2(rpr_max) + 1;
 
         f = get_bits(&s->gb, rpr_bits);
 
@@ -374,10 +378,21 @@ static int rv20_decode_picture_header(RVDecContext *rv)
             new_h = rv->orig_height;
         }
         if (new_w != s->width || new_h != s->height) {
+            AVRational old_aspect = s->avctx->sample_aspect_ratio;
             av_log(s->avctx, AV_LOG_DEBUG,
                    "attempting to change resolution to %dx%d\n", new_w, new_h);
+            if (av_image_check_size(new_w, new_h, 0, s->avctx) < 0)
+                return AVERROR_INVALIDDATA;
             ff_mpv_common_end(s);
 
+            // attempt to keep aspect during typical resolution switches
+            if (!old_aspect.num)
+                old_aspect = (AVRational){1, 1};
+            if (2 * new_w * s->height == new_h * s->width)
+                s->avctx->sample_aspect_ratio = av_mul_q(old_aspect, (AVRational){2, 1});
+            if (new_w * s->height == 2 * new_h * s->width)
+                s->avctx->sample_aspect_ratio = av_mul_q(old_aspect, (AVRational){1, 2});
+
             ret = ff_set_dimensions(s->avctx, new_w, new_h);
             if (ret < 0)
                 return ret;
@@ -389,9 +404,10 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         }
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
-            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d\n", f, rpr_bits);
+            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d/%d\n", f, rpr_bits, rpr_max);
         }
-    } else if (av_image_check_size(s->width, s->height, 0, s->avctx) < 0)
+    }
+    if (av_image_check_size(s->width, s->height, 0, s->avctx) < 0)
         return AVERROR_INVALIDDATA;
 
     mb_pos = ff_h263_decode_mba(s);
@@ -410,15 +426,17 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         } else {
             s->time    = seq;
             s->pb_time = s->pp_time - (s->last_non_b_time - s->time);
-            if (s->pp_time <= s->pb_time ||
-                s->pp_time <= s->pp_time - s->pb_time || s->pp_time <= 0) {
-                av_log(s->avctx, AV_LOG_DEBUG, "messed up order, possible "
-                       "from seeking? skipping current B-frame\n");
-                return FRAME_SKIPPED;
-            }
-            ff_mpeg4_init_direct_mv(s);
         }
     }
+    if (s->pict_type == AV_PICTURE_TYPE_B) {
+        if (s->pp_time <=s->pb_time || s->pp_time <= s->pp_time - s->pb_time || s->pp_time<=0) {
+            av_log(s->avctx, AV_LOG_DEBUG,
+                   "messed up order, possible from seeking? skipping current B-frame\n");
+#define ERROR_SKIP_FRAME -123
+            return ERROR_SKIP_FRAME;
+        }
+        ff_mpeg4_init_direct_mv(s);
+    }
 
     s->no_rounding = get_bits1(&s->gb);
 
@@ -430,7 +448,8 @@ static int rv20_decode_picture_header(RVDecContext *rv)
     s->unrestricted_mv = 1;
     s->h263_aic        = s->pict_type == AV_PICTURE_TYPE_I;
     s->modified_quant  = 1;
-    s->loop_filter     = 1;
+    if (!s->avctx->lowres)
+        s->loop_filter = 1;
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(s->avctx, AV_LOG_INFO,
@@ -439,7 +458,7 @@ static int rv20_decode_picture_header(RVDecContext *rv)
                s->no_rounding);
     }
 
-    assert(s->pict_type != AV_PICTURE_TYPE_B || !s->low_delay);
+    av_assert0(s->pict_type != AV_PICTURE_TYPE_B || !s->low_delay);
 
     return s->mb_width * s->mb_height - mb_pos;
 }
@@ -460,10 +479,9 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
         return ret;
 
     ff_mpv_decode_defaults(s);
+    ff_mpv_decode_init(s, avctx);
 
-    s->avctx       = avctx;
     s->out_format  = FMT_H263;
-    s->codec_id    = avctx->codec_id;
 
     rv->orig_width  =
     s->width        = avctx->coded_width;
@@ -496,8 +514,8 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
     }
 
     if (avctx->debug & FF_DEBUG_PICT_INFO) {
-        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", rv->sub_id,
-               avctx->extradata_size >= 4 ? ((int *) avctx->extradata)[0] : -1);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%"PRIX32"\n", rv->sub_id,
+               ((uint32_t *) avctx->extradata)[0]);
     }
 
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
@@ -545,7 +563,8 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
     else
         mb_count = rv20_decode_picture_header(rv);
     if (mb_count < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
+        if (mb_count != ERROR_SKIP_FRAME)
+            av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -578,6 +597,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
         }
     }
 
+
     ff_dlog(avctx, "qscale=%d\n", s->qscale);
 
     /* default quantization values */
@@ -618,7 +638,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
     for (s->mb_num_left = mb_count; s->mb_num_left > 0; s->mb_num_left--) {
         int ret;
         ff_update_block_index(s);
-        ff_dlog(avctx, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
+        ff_tlog(avctx, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 
         s->mv_dir  = MV_DIR_FORWARD;
         s->mv_type = MV_TYPE_16X16;
@@ -650,7 +670,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
         }
         if (s->pict_type != AV_PICTURE_TYPE_B)
             ff_h263_update_motion_val(s);
-        ff_mpv_decode_mb(s, s->block);
+        ff_mpv_reconstruct_mb(s, s->block);
         if (s->loop_filter)
             ff_h263_loop_filter(s);
 
@@ -748,11 +768,13 @@ static int rv10_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
             if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
                 return ret;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         } else if (s->last_picture_ptr) {
             if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
                 return ret;
-            ff_print_debug_info(s, s->last_picture_ptr);
+            ff_print_debug_info(s, s->last_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict,s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         }
 
         if (s->last_picture_ptr || s->low_delay) {
@@ -776,6 +798,7 @@ AVCodec ff_rv10_decoder = {
     .close          = rv10_decode_end,
     .decode         = rv10_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -793,6 +816,7 @@ AVCodec ff_rv20_decoder = {
     .decode         = rv10_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/rv10.h b/libavcodec/rv10.h
index b44bc1f..364270e 100644
--- a/libavcodec/rv10.h
+++ b/libavcodec/rv10.h
@@ -1,20 +1,20 @@
 /*
  * RV10/RV20 decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv10enc.c b/libavcodec/rv10enc.c
index 765c57d..8691d18 100644
--- a/libavcodec/rv10enc.c
+++ b/libavcodec/rv10enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv20enc.c b/libavcodec/rv20enc.c
index 20090b1..81fb4fc 100644
--- a/libavcodec/rv20enc.c
+++ b/libavcodec/rv20enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,12 +43,12 @@ void ff_rv20_encode_picture_header(MpegEncContext *s, int picture_number){
 
     put_bits(&s->pb, 1, s->no_rounding);
 
-    assert(s->f_code == 1);
-    assert(s->unrestricted_mv == 0);
-    assert(s->alt_inter_vlc == 0);
-    assert(s->umvplus == 0);
-    assert(s->modified_quant==1);
-    assert(s->loop_filter==1);
+    av_assert0(s->f_code == 1);
+    av_assert0(s->unrestricted_mv == 0);
+    av_assert0(s->alt_inter_vlc == 0);
+    av_assert0(s->umvplus == 0);
+    av_assert0(s->modified_quant==1);
+    av_assert0(s->loop_filter==1);
 
     s->h263_aic= s->pict_type == AV_PICTURE_TYPE_I;
     if(s->h263_aic){
diff --git a/libavcodec/rv30.c b/libavcodec/rv30.c
index 77e875b..ddaaac6 100644
--- a/libavcodec/rv30.c
+++ b/libavcodec/rv30.c
@@ -2,20 +2,20 @@
  * RV30 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,9 @@
  */
 
 #include "avcodec.h"
-#include "golomb_legacy.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
+#include "golomb.h"
 
 #include "rv34.h"
 #include "rv30data.h"
@@ -51,8 +51,13 @@ static int rv30_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
     si->quant = get_bits(gb, 5);
     skip_bits1(gb);
     si->pts = get_bits(gb, 13);
-    rpr = get_bits(gb, r->rpr);
+    rpr = get_bits(gb, av_log2(r->max_rpr) + 1);
     if(rpr){
+        if (rpr > r->max_rpr) {
+            av_log(avctx, AV_LOG_ERROR, "rpr too large\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (avctx->extradata_size < rpr * 2 + 8) {
             av_log(avctx, AV_LOG_ERROR,
                    "Insufficient extradata - need at least %d bytes, got %d\n",
@@ -62,6 +67,9 @@ static int rv30_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
 
         w = r->s.avctx->extradata[6 + rpr*2] << 2;
         h = r->s.avctx->extradata[7 + rpr*2] << 2;
+    } else {
+        w = r->orig_width;
+        h = r->orig_height;
     }
     si->width  = w;
     si->height = h;
@@ -82,7 +90,7 @@ static int rv30_decode_intra_types(RV34DecContext *r, GetBitContext *gb, int8_t
     for(i = 0; i < 4; i++, dst += r->intra_types_stride - 4){
         for(j = 0; j < 4; j+= 2){
             unsigned code = get_interleaved_ue_golomb(gb) << 1;
-            if(code >= 81*2){
+            if (code > 80U*2U) {
                 av_log(r->s.avctx, AV_LOG_ERROR, "Incorrect intra prediction code\n");
                 return -1;
             }
@@ -254,15 +262,22 @@ static av_cold int rv30_decode_init(AVCodecContext *avctx)
     RV34DecContext *r = avctx->priv_data;
     int ret;
 
+    r->orig_width  = avctx->coded_width;
+    r->orig_height = avctx->coded_height;
+
+    if (avctx->extradata_size < 2) {
+        av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
+        return AVERROR(EINVAL);
+    }
     r->rv30 = 1;
     if ((ret = ff_rv34_decode_init(avctx)) < 0)
         return ret;
-    if(avctx->extradata_size < 2){
-        av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
-        return -1;
+
+    r->max_rpr = avctx->extradata[1] & 7;
+    if(avctx->extradata_size < 2*r->max_rpr + 8){
+        av_log(avctx, AV_LOG_WARNING, "Insufficient extradata - need at least %d bytes, got %d\n",
+               2*r->max_rpr + 8, avctx->extradata_size);
     }
-    r->rpr = (avctx->extradata[1] & 7) >> 1;
-    r->rpr = FFMIN(r->rpr + 1, 3);
 
     r->parse_slice_header = rv30_parse_slice_header;
     r->decode_intra_types = rv30_decode_intra_types;
diff --git a/libavcodec/rv30data.h b/libavcodec/rv30data.h
index 079204d..5c4cb97 100644
--- a/libavcodec/rv30data.h
+++ b/libavcodec/rv30data.h
@@ -2,20 +2,20 @@
  * RealVideo 3 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv30dsp.c b/libavcodec/rv30dsp.c
index 50f4186..8b205e0 100644
--- a/libavcodec/rv30dsp.c
+++ b/libavcodec/rv30dsp.c
@@ -2,20 +2,20 @@
  * RV30 decoder motion compensation functions
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 9fb0760..d171e6e 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -2,20 +2,20 @@
  * RV30/40 decoder common data
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,13 +24,14 @@
  * RV30/40 decoder common data
  */
 
+#include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 
 #include "avcodec.h"
 #include "error_resilience.h"
-#include "golomb_legacy.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
+#include "golomb.h"
 #include "internal.h"
 #include "mathops.h"
 #include "mpeg_er.h"
@@ -403,7 +404,8 @@ static int rv34_decode_inter_mb_header(RV34DecContext *r, int8_t *intra_types)
             r->mb_type[mb_pos] = RV34_MB_B_DIRECT;
     }
     r->is16 = !!IS_INTRA16x16(s->current_picture_ptr->mb_type[mb_pos]);
-    rv34_decode_mv(r, r->block_type);
+    if (rv34_decode_mv(r, r->block_type) < 0)
+        return -1;
     if(r->block_type == RV34_MB_SKIP){
         fill_rectangle(intra_types, 4, 4, r->intra_types_stride, 0, sizeof(intra_types[0]));
         return 0;
@@ -510,7 +512,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int
     }
 }
 
-#define GET_PTS_DIFF(a, b) ((a - b + 8192) & 0x1FFF)
+#define GET_PTS_DIFF(a, b) (((a) - (b) + 8192) & 0x1FFF)
 
 /**
  * Calculate motion vector component that should be added for direct blocks.
@@ -519,7 +521,7 @@ static int calc_add_mv(RV34DecContext *r, int dir, int val)
 {
     int mul = dir ? -r->mv_weight2 : r->mv_weight1;
 
-    return (val * mul + 0x2000) >> 14;
+    return (int)(val * (SUINT)mul + 0x2000) >> 14;
 }
 
 /**
@@ -672,6 +674,7 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     int dxy, mx, my, umx, umy, lx, ly, uvmx, uvmy, src_x, src_y, uvsrc_x, uvsrc_y;
     int mv_pos = s->mb_x * 2 + s->mb_y * 2 * s->b8_stride + mv_off;
     int is16x16 = 1;
+    int emu = 0;
 
     if(thirdpel){
         int chroma_mx, chroma_my;
@@ -723,24 +726,14 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     if(s->h_edge_pos - (width << 3) < 6 || s->v_edge_pos - (height << 3) < 6 ||
        (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 4 ||
        (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 4) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 22 * s->linesize;
-
         srcY -= 2 + 2*s->linesize;
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
                                  (width << 3) + 6, (height << 3) + 6,
-                            src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos);
+                                 src_x - 2, src_y - 2,
+                                 s->h_edge_pos, s->v_edge_pos);
         srcY = s->sc.edge_emu_buffer + 2 + 2*s->linesize;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
-                                 s->uvlinesize,s->uvlinesize,
-                                 (width << 2) + 1, (height << 2) + 1,
-                            uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
-                                 s->uvlinesize, s->uvlinesize,
-                                 (width << 2) + 1, (height << 2) + 1,
-                            uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+        emu = 1;
     }
     if(!weighted){
         Y = s->dest[0] + xoff      + yoff     *s->linesize;
@@ -763,6 +756,24 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     }
     is16x16 = (block_type != RV34_MB_P_8x8) && (block_type != RV34_MB_P_16x8) && (block_type != RV34_MB_P_8x16);
     qpel_mc[!is16x16][dxy](Y, srcY, s->linesize);
+    if (emu) {
+        uint8_t *uvbuf = s->sc.edge_emu_buffer;
+
+        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+                                 s->uvlinesize, s->uvlinesize,
+                                 (width << 2) + 1, (height << 2) + 1,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+        srcU = uvbuf;
+        uvbuf += 9*s->uvlinesize;
+
+        s->vdsp.emulated_edge_mc(uvbuf, srcV,
+                                 s->uvlinesize, s->uvlinesize,
+                                 (width << 2) + 1, (height << 2) + 1,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+        srcV = uvbuf;
+    }
     chroma_mc[2-width]   (U, srcU, s->uvlinesize, height*4, uvmx, uvmy);
     chroma_mc[2-width]   (V, srcV, s->uvlinesize, height*4, uvmx, uvmy);
 }
@@ -856,6 +867,11 @@ static int rv34_decode_mv(RV34DecContext *r, int block_type)
     for(i = 0; i < num_mvs[block_type]; i++){
         r->dmv[i][0] = get_interleaved_se_golomb(gb);
         r->dmv[i][1] = get_interleaved_se_golomb(gb);
+        if (r->dmv[i][0] == INVALID_VLC ||
+            r->dmv[i][1] == INVALID_VLC) {
+            r->dmv[i][0] = r->dmv[i][1] = 0;
+            return AVERROR_INVALIDDATA;
+        }
     }
     switch(block_type){
     case RV34_MB_TYPE_INTRA:
@@ -1339,7 +1355,7 @@ static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
     if(r->s.mb_skip_run > 1)
         return 0;
     bits = get_bits_left(&s->gb);
-    if(bits < 0 || (bits < 8 && !show_bits(&s->gb, bits)))
+    if(bits <= 0 || (bits < 8 && !show_bits(&s->gb, bits)))
         return 1;
     return 0;
 }
@@ -1361,11 +1377,11 @@ static int rv34_decoder_alloc(RV34DecContext *r)
 {
     r->intra_types_stride = r->s.mb_width * 4 + 4;
 
-    r->cbp_chroma       = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->cbp_chroma       = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->cbp_chroma));
-    r->cbp_luma         = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->cbp_luma         = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->cbp_luma));
-    r->deblock_coefs    = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->deblock_coefs    = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->deblock_coefs));
     r->intra_types_hist = av_malloc(r->intra_types_stride * 4 * 2 *
                                     sizeof(*r->intra_types_hist));
@@ -1410,6 +1426,10 @@ static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int
         av_log(s->avctx, AV_LOG_ERROR, "Slice type mismatch\n");
         return AVERROR_INVALIDDATA;
     }
+    if (s->width != r->si.width || s->height != r->si.height) {
+        av_log(s->avctx, AV_LOG_ERROR, "Size mismatch\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     r->si.end = end;
     s->qscale = r->si.quant;
@@ -1476,14 +1496,9 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
     int ret;
 
     ff_mpv_decode_defaults(s);
-    s->avctx      = avctx;
+    ff_mpv_decode_init(s, avctx);
     s->out_format = FMT_H263;
-    s->codec_id   = avctx->codec_id;
-
-    s->width  = avctx->width;
-    s->height = avctx->height;
 
-    r->s.avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->has_b_frames = 1;
     s->low_delay = 0;
@@ -1525,7 +1540,14 @@ int ff_rv34_decode_init_thread_copy(AVCodecContext *avctx)
 
     if (avctx->internal->is_copy) {
         r->tmp_b_block_base = NULL;
+        r->cbp_chroma       = NULL;
+        r->cbp_luma         = NULL;
+        r->deblock_coefs    = NULL;
+        r->intra_types_hist = NULL;
+        r->mb_type          = NULL;
+
         ff_mpv_idct_init(&r->s);
+
         if ((err = ff_mpv_common_init(&r->s)) < 0)
             return err;
         if ((err = rv34_decoder_alloc(r)) < 0) {
@@ -1563,16 +1585,19 @@ int ff_rv34_decode_update_thread_context(AVCodecContext *dst, const AVCodecConte
 
     // Do no call ff_mpeg_update_thread_context on a partially initialized
     // decoder context.
-    if (!s1->linesize)
+    if (!s1->context_initialized)
         return 0;
 
     return ff_mpeg_update_thread_context(dst, src);
 }
 
-static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n)
+static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n, int slice_count, int buf_size)
 {
-    if(avctx->slice_count) return avctx->slice_offset[n];
-    else                   return AV_RL32(buf + n*8 - 4) == 1 ? AV_RL32(buf + n*8) :  AV_RB32(buf + n*8);
+    if (n < slice_count) {
+        if(avctx->slice_count) return avctx->slice_offset[n];
+        else                   return AV_RL32(buf + n*8 - 4) == 1 ? AV_RL32(buf + n*8) :  AV_RB32(buf + n*8);
+    } else
+        return buf_size;
 }
 
 static int finish_frame(AVCodecContext *avctx, AVFrame *pict)
@@ -1591,18 +1616,30 @@ static int finish_frame(AVCodecContext *avctx, AVFrame *pict)
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
         if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->current_picture_ptr);
+        ff_print_debug_info(s, s->current_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         got_picture = 1;
     } else if (s->last_picture_ptr) {
         if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->last_picture_ptr);
+        ff_print_debug_info(s, s->last_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         got_picture = 1;
     }
 
     return got_picture;
 }
 
+static AVRational update_sar(int old_w, int old_h, AVRational sar, int new_w, int new_h)
+{
+    // attempt to keep aspect during typical resolution switches
+    if (!sar.num)
+        sar = (AVRational){1, 1};
+
+    sar = av_mul_q(sar, av_mul_q((AVRational){new_h, new_w}, (AVRational){old_w, old_h}));
+    return sar;
+}
+
 int ff_rv34_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_picture_ptr,
                             AVPacket *avpkt)
@@ -1617,6 +1654,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
     int slice_count;
     const uint8_t *slices_hdr = NULL;
     int last = 0;
+    int faulty_b = 0;
+    int offset;
 
     /* no supplementary picture */
     if (buf_size == 0) {
@@ -1639,13 +1678,13 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
     }else
         slice_count = avctx->slice_count;
 
+    offset = get_slice_offset(avctx, slices_hdr, 0, slice_count, buf_size);
     //parse first slice header to check whether this frame can be decoded
-    if(get_slice_offset(avctx, slices_hdr, 0) < 0 ||
-       get_slice_offset(avctx, slices_hdr, 0) > buf_size){
+    if(offset < 0 || offset > buf_size){
         av_log(avctx, AV_LOG_ERROR, "Slice offset is invalid\n");
         return AVERROR_INVALIDDATA;
     }
-    init_get_bits(&s->gb, buf+get_slice_offset(avctx, slices_hdr, 0), (buf_size-get_slice_offset(avctx, slices_hdr, 0))*8);
+    init_get_bits(&s->gb, buf+offset, (buf_size-offset)*8);
     if(r->parse_slice_header(r, &r->s.gb, &si) < 0 || si.start){
         av_log(avctx, AV_LOG_ERROR, "First slice header is incorrect\n");
         return AVERROR_INVALIDDATA;
@@ -1654,7 +1693,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
         si.type == AV_PICTURE_TYPE_B) {
         av_log(avctx, AV_LOG_ERROR, "Invalid decoder state: B-frame without "
                "reference data.\n");
-        return AVERROR_INVALIDDATA;
+        faulty_b = 1;
     }
     if(   (avctx->skip_frame >= AVDISCARD_NONREF && si.type==AV_PICTURE_TYPE_B)
        || (avctx->skip_frame >= AVDISCARD_NONKEY && si.type!=AV_PICTURE_TYPE_I)
@@ -1663,8 +1702,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
 
     /* first slice */
     if (si.start == 0) {
-        if (s->mb_num_left > 0) {
-            av_log(avctx, AV_LOG_ERROR, "New frame but still %d MB left.",
+        if (s->mb_num_left > 0 && s->current_picture_ptr) {
+            av_log(avctx, AV_LOG_ERROR, "New frame but still %d MB left.\n",
                    s->mb_num_left);
             ff_er_frame_end(&s->er);
             ff_mpv_frame_end(s);
@@ -1676,6 +1715,12 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
             av_log(s->avctx, AV_LOG_WARNING, "Changing dimensions to %dx%d\n",
                    si.width, si.height);
 
+            if (av_image_check_size(si.width, si.height, 0, s->avctx))
+                return AVERROR_INVALIDDATA;
+
+            s->avctx->sample_aspect_ratio = update_sar(
+                s->width, s->height, s->avctx->sample_aspect_ratio,
+                si.width, si.height);
             s->width  = si.width;
             s->height = si.height;
 
@@ -1688,6 +1733,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
             if ((err = rv34_decoder_realloc(r)) < 0)
                 return err;
         }
+        if (faulty_b)
+            return AVERROR_INVALIDDATA;
         s->pict_type = si.type ? si.type : AV_PICTURE_TYPE_I;
         if (ff_mpv_frame_start(s, s->avctx) < 0)
             return -1;
@@ -1717,6 +1764,9 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
                 r->mv_weight1 = r->mv_weight2 = r->weight1 = r->weight2 = 8192;
                 r->scaled_weight = 0;
             }else{
+                if (FFMAX(dist0, dist1) > refdist)
+                    av_log(avctx, AV_LOG_TRACE, "distance overflow\n");
+
                 r->mv_weight1 = (dist0 << 14) / refdist;
                 r->mv_weight2 = (dist1 << 14) / refdist;
                 if((r->mv_weight1|r->mv_weight2) & 511){
@@ -1740,40 +1790,32 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
     }
 
     for(i = 0; i < slice_count; i++){
-        int offset = get_slice_offset(avctx, slices_hdr, i);
+        int offset  = get_slice_offset(avctx, slices_hdr, i  , slice_count, buf_size);
+        int offset1 = get_slice_offset(avctx, slices_hdr, i+1, slice_count, buf_size);
         int size;
-        if(i+1 == slice_count)
-            size = buf_size - offset;
-        else
-            size = get_slice_offset(avctx, slices_hdr, i+1) - offset;
 
-        if(offset < 0 || offset > buf_size){
+        if(offset < 0 || offset > offset1 || offset1 > buf_size){
             av_log(avctx, AV_LOG_ERROR, "Slice offset is invalid\n");
             break;
         }
+        size = offset1 - offset;
 
         r->si.end = s->mb_width * s->mb_height;
         s->mb_num_left = r->s.mb_x + r->s.mb_y*r->s.mb_width - r->si.start;
 
         if(i+1 < slice_count){
-            if (get_slice_offset(avctx, slices_hdr, i+1) < 0 ||
-                get_slice_offset(avctx, slices_hdr, i+1) > buf_size) {
+            int offset2 = get_slice_offset(avctx, slices_hdr, i+2, slice_count, buf_size);
+            if (offset2 < offset1 || offset2 > buf_size) {
                 av_log(avctx, AV_LOG_ERROR, "Slice offset is invalid\n");
                 break;
             }
-            init_get_bits(&s->gb, buf+get_slice_offset(avctx, slices_hdr, i+1), (buf_size-get_slice_offset(avctx, slices_hdr, i+1))*8);
+            init_get_bits(&s->gb, buf+offset1, (buf_size-offset1)*8);
             if(r->parse_slice_header(r, &r->s.gb, &si) < 0){
-                if(i+2 < slice_count)
-                    size = get_slice_offset(avctx, slices_hdr, i+2) - offset;
-                else
-                    size = buf_size - offset;
+                size = offset2 - offset;
             }else
                 r->si.end = si.start;
         }
-        if (size < 0 || size > buf_size - offset) {
-            av_log(avctx, AV_LOG_ERROR, "Slice size is invalid\n");
-            break;
-        }
+        av_assert0 (size >= 0 && size <= buf_size - offset);
         last = rv34_decode_slice(r, r->si.end, buf + offset, size);
         if(last)
             break;
diff --git a/libavcodec/rv34.h b/libavcodec/rv34.h
index 0ac24bf..efff94a 100644
--- a/libavcodec/rv34.h
+++ b/libavcodec/rv34.h
@@ -2,20 +2,20 @@
  * RV30/40 decoder common data declarations
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -102,13 +102,15 @@ typedef struct RV34DecContext{
     int dmv[4][2];           ///< differential motion vectors for the current macroblock
 
     int rv30;                ///< indicates which RV variant is currently decoded
-    int rpr;                 ///< one field size in RV30 slice header
+    int max_rpr;
 
     int cur_pts, last_pts, next_pts;
     int scaled_weight;
     int weight1, weight2;    ///< B-frame distance fractions (0.14) used in motion compensation
     int mv_weight1, mv_weight2;
 
+    int orig_width, orig_height;
+
     uint16_t *cbp_luma;      ///< CBP values for luma subblocks
     uint8_t  *cbp_chroma;    ///< CBP values for chroma subblocks
     uint16_t *deblock_coefs; ///< deblock coefficients for each macroblock
diff --git a/libavcodec/rv34_parser.c b/libavcodec/rv34_parser.c
index ec6d3a5..765d390 100644
--- a/libavcodec/rv34_parser.c
+++ b/libavcodec/rv34_parser.c
@@ -2,20 +2,20 @@
  * RV30/40 parser
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34data.h b/libavcodec/rv34data.h
index 3064124..4b2701f 100644
--- a/libavcodec/rv34data.h
+++ b/libavcodec/rv34data.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c
index 7234ee8..c3f245e 100644
--- a/libavcodec/rv34dsp.c
+++ b/libavcodec/rv34dsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  * Copyright (c) 2011 Janne Grunau
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 1aa80cf..2e9ec4e 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -2,20 +2,20 @@
  * RV30/40 decoder motion compensation functions
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34vlc.h b/libavcodec/rv34vlc.h
index f4670c1..aa29357 100644
--- a/libavcodec/rv34vlc.h
+++ b/libavcodec/rv34vlc.h
@@ -2,20 +2,20 @@
  * RealVideo 3/4 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv40.c b/libavcodec/rv40.c
index d46a44a..dfeebda 100644
--- a/libavcodec/rv40.c
+++ b/libavcodec/rv40.c
@@ -2,20 +2,20 @@
  * RV40 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,9 @@
 #include "libavutil/imgutils.h"
 
 #include "avcodec.h"
-#include "golomb_legacy.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
+#include "golomb.h"
 
 #include "rv34.h"
 #include "rv40vlc2.h"
@@ -109,6 +109,8 @@ static int get_dimension(GetBitContext *gb, const int *dim)
         val = dim[get_bits1(gb) - val];
     if(!val){
         do{
+            if (get_bits_left(gb) < 8)
+                return AVERROR_INVALIDDATA;
             t = get_bits(gb, 8);
             val += t << 2;
         }while(t == 0xFF);
@@ -187,7 +189,7 @@ static int rv40_decode_intra_types(RV34DecContext *r, GetBitContext *gb, int8_t
             A = ptr[-r->intra_types_stride + 1]; // it won't be used for the last coefficient in a row
             B = ptr[-r->intra_types_stride];
             C = ptr[-1];
-            pattern = A + (B << 4) + (C << 8);
+            pattern = A + B * (1 << 4) + C * (1 << 8);
             for(k = 0; k < MODE2_PATTERNS_NUM; k++)
                 if(pattern == rv40_aic_table_index[k])
                     break;
@@ -230,8 +232,11 @@ static int rv40_decode_mb_info(RV34DecContext *r)
     int prev_type = 0;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
-    if(!r->s.mb_skip_run)
+    if(!r->s.mb_skip_run) {
         r->s.mb_skip_run = get_interleaved_ue_golomb(gb) + 1;
+        if(r->s.mb_skip_run > (unsigned)s->mb_num)
+            return -1;
+    }
 
     if(--r->s.mb_skip_run)
          return RV34_MB_SKIP;
@@ -358,7 +363,7 @@ static void rv40_loop_filter(RV34DecContext *r, int row)
     int uvcbp[4][2];
     /**
      * This mask represents the pattern of luma subblocks that should be filtered
-     * in addition to the coded ones because because they lie at the edge of
+     * in addition to the coded ones because they lie at the edge of
      * 8x8 block with different enough motion vectors
      */
     unsigned mvmasks[4];
diff --git a/libavcodec/rv40data.h b/libavcodec/rv40data.h
index 42328af..36f9f91 100644
--- a/libavcodec/rv40data.h
+++ b/libavcodec/rv40data.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index 4ca5cc7..5579bd9 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "pixels.h"
 #include "rnd_avg.h"
 #include "rv34dsp.h"
+#include "libavutil/avassert.h"
 
 #define RV40_LOWPASS(OPNAME, OP) \
 static void OPNAME ## rv40_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
@@ -302,7 +303,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\
     int i;\
     int bias = rv40_bias[y>>1][x>>1];\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i = 0; i < h; i++){\
@@ -338,7 +339,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\
     int i;\
     int bias = rv40_bias[y>>1][x>>1];\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i = 0; i < h; i++){\
@@ -454,7 +455,7 @@ static av_always_inline void rv40_weak_loop_filter(uint8_t *src,
         if (u > 3 - (filter_p1 && filter_q1))
             continue;
 
-        t <<= 2;
+        t *= 1 << 2;
         if (filter_p1 && filter_q1)
             t += src[-2*step] - src[1*step];
 
diff --git a/libavcodec/rv40vlc2.h b/libavcodec/rv40vlc2.h
index 2f63fc2..15119a1 100644
--- a/libavcodec/rv40vlc2.h
+++ b/libavcodec/rv40vlc2.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c
index 635f697..584b58e 100644
--- a/libavcodec/s302m.c
+++ b/libavcodec/s302m.c
@@ -3,24 +3,25 @@
  * Copyright (c) 2008 Laurent Aimar <fenrir@videolan.org>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste.coudurier@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
 #include "libavutil/log.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -28,6 +29,11 @@
 
 #define AES3_HEADER_LEN 4
 
+typedef struct S302Context {
+    AVClass *class;
+    int non_pcm_mode;
+} S302Context;
+
 static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
                                     int buf_size)
 {
@@ -59,18 +65,26 @@ static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
     }
 
     /* Set output properties */
-    avctx->bits_per_coded_sample = bits;
+    avctx->bits_per_raw_sample = bits;
     if (bits > 16)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32;
     else
         avctx->sample_fmt = AV_SAMPLE_FMT_S16;
 
     avctx->channels    = channels;
-    avctx->sample_rate = 48000;
-    avctx->bit_rate    = 48000 * avctx->channels * (avctx->bits_per_coded_sample + 4) +
-                         32 * (48000 / (buf_size * 8 /
-                                        (avctx->channels *
-                                         (avctx->bits_per_coded_sample + 4))));
+    switch(channels) {
+        case 2:
+            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+            break;
+        case 4:
+            avctx->channel_layout = AV_CH_LAYOUT_QUAD;
+            break;
+        case 6:
+            avctx->channel_layout = AV_CH_LAYOUT_5POINT1_BACK;
+            break;
+        case 8:
+            avctx->channel_layout = AV_CH_LAYOUT_5POINT1_BACK | AV_CH_LAYOUT_STEREO_DOWNMIX;
+    }
 
     return frame_size;
 }
@@ -78,10 +92,13 @@ static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
 static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                               int *got_frame_ptr, AVPacket *avpkt)
 {
+    S302Context *s = avctx->priv_data;
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
     int block_size, ret;
+    int i;
+    int non_pcm_data_type = -1;
 
     int frame_size = s302m_parse_frame_header(avctx, buf, buf_size);
     if (frame_size < 0)
@@ -91,38 +108,58 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
     buf      += AES3_HEADER_LEN;
 
     /* get output buffer */
-    block_size = (avctx->bits_per_coded_sample + 4) / 4;
+    block_size = (avctx->bits_per_raw_sample + 4) / 4;
     frame->nb_samples = 2 * (buf_size / block_size) / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
+    avctx->bit_rate = 48000 * avctx->channels * (avctx->bits_per_raw_sample + 4) +
+                      32 * 48000 / frame->nb_samples;
     buf_size = (frame->nb_samples * avctx->channels / 2) * block_size;
 
-    if (avctx->bits_per_coded_sample == 24) {
+    if (avctx->bits_per_raw_sample == 24) {
         uint32_t *o = (uint32_t *)frame->data[0];
         for (; buf_size > 6; buf_size -= 7) {
-            *o++ = (ff_reverse[buf[2]]        << 24) |
+            *o++ = ((unsigned)ff_reverse[buf[2]]        << 24) |
                    (ff_reverse[buf[1]]        << 16) |
                    (ff_reverse[buf[0]]        <<  8);
-            *o++ = (ff_reverse[buf[6] & 0xf0] << 28) |
+            *o++ = ((unsigned)ff_reverse[buf[6] & 0xf0] << 28) |
                    (ff_reverse[buf[5]]        << 20) |
                    (ff_reverse[buf[4]]        << 12) |
                    (ff_reverse[buf[3] & 0x0f] <<  4);
             buf += 7;
         }
-    } else if (avctx->bits_per_coded_sample == 20) {
+        o = (uint32_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0x96F87200U && o[i+5] == 0xA54E1F00) {
+                    non_pcm_data_type = (o[i+6] >> 16) & 0x1F;
+                    break;
+                }
+            }
+    } else if (avctx->bits_per_raw_sample == 20) {
         uint32_t *o = (uint32_t *)frame->data[0];
         for (; buf_size > 5; buf_size -= 6) {
-            *o++ = (ff_reverse[buf[2] & 0xf0] << 28) |
+            *o++ = ((unsigned)ff_reverse[buf[2] & 0xf0] << 28) |
                    (ff_reverse[buf[1]]        << 20) |
                    (ff_reverse[buf[0]]        << 12);
-            *o++ = (ff_reverse[buf[5] & 0xf0] << 28) |
+            *o++ = ((unsigned)ff_reverse[buf[5] & 0xf0] << 28) |
                    (ff_reverse[buf[4]]        << 20) |
                    (ff_reverse[buf[3]]        << 12);
             buf += 6;
         }
+        o = (uint32_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0x6F872000U && o[i+5] == 0x54E1F000) {
+                    non_pcm_data_type = (o[i+6] >> 16) & 0x1F;
+                    break;
+                }
+            }
     } else {
         uint16_t *o = (uint16_t *)frame->data[0];
         for (; buf_size > 4; buf_size -= 5) {
@@ -133,18 +170,61 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[2]]        >>  4);
             buf += 5;
         }
+        o = (uint16_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0xF872U && o[i+5] == 0x4E1F) {
+                    non_pcm_data_type = (o[i+6] & 0x1F);
+                    break;
+                }
+            }
+    }
+
+    if (non_pcm_data_type != -1) {
+        if (s->non_pcm_mode == 3) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "S302 non PCM mode with data type %d not supported\n",
+                   non_pcm_data_type);
+            return AVERROR_PATCHWELCOME;
+        }
+        if (s->non_pcm_mode & 1) {
+            return avpkt->size;
+        }
     }
 
+    avctx->sample_rate = 48000;
+
     *got_frame_ptr = 1;
 
     return avpkt->size;
 }
 
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_DECODING_PARAM
+static const AVOption s302m_options[] = {
+    {"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"copy"        , "Pass NON-PCM through unchanged"     , 0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"drop"        , "Drop NON-PCM"                       , 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"decode_copy" , "Decode if possible else passthrough", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"decode_drop" , "Decode if possible else drop"       , 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 3, FLAGS, "non_pcm_mode"},
+    {NULL}
+};
+
+static const AVClass s302m_class = {
+    .class_name = "SMPTE 302M Decoder",
+    .item_name  = av_default_item_name,
+    .option     = s302m_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_s302m_decoder = {
     .name           = "s302m",
     .long_name      = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_S302M,
+    .priv_data_size = sizeof(S302Context),
     .decode         = s302m_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &s302m_class,
 };
diff --git a/libavcodec/s302menc.c b/libavcodec/s302menc.c
new file mode 100644
index 0000000..b04a54e
--- /dev/null
+++ b/libavcodec/s302menc.c
@@ -0,0 +1,188 @@
+/*
+ * SMPTE 302M encoder
+ * Copyright (c) 2010 Google, Inc.
+ * Copyright (c) 2013 Darryl Wallace <wallacdj@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mathops.h"
+#include "put_bits.h"
+
+#define AES3_HEADER_LEN 4
+
+typedef struct S302MEncContext {
+    uint8_t framing_index; /* Set for even channels on multiple of 192 samples */
+} S302MEncContext;
+
+static av_cold int s302m_encode_init(AVCodecContext *avctx)
+{
+    S302MEncContext *s = avctx->priv_data;
+
+    if (avctx->channels & 1 || avctx->channels > 8) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Encoding %d channel(s) is not allowed. Only 2, 4, 6 and 8 channels are supported.\n",
+               avctx->channels);
+        return AVERROR(EINVAL);
+    }
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_S16:
+        avctx->bits_per_raw_sample = 16;
+        break;
+    case AV_SAMPLE_FMT_S32:
+        if (avctx->bits_per_raw_sample > 20) {
+            if (avctx->bits_per_raw_sample > 24)
+                av_log(avctx, AV_LOG_WARNING, "encoding as 24 bits-per-sample\n");
+            avctx->bits_per_raw_sample = 24;
+        } else if (!avctx->bits_per_raw_sample) {
+            avctx->bits_per_raw_sample = 24;
+        } else if (avctx->bits_per_raw_sample <= 20) {
+            avctx->bits_per_raw_sample = 20;
+        }
+    }
+
+    avctx->frame_size = 0;
+    avctx->bit_rate   = 48000 * avctx->channels *
+                       (avctx->bits_per_raw_sample + 4);
+    s->framing_index  = 0;
+
+    return 0;
+}
+
+static int s302m_encode2_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                               const AVFrame *frame, int *got_packet_ptr)
+{
+    S302MEncContext *s = avctx->priv_data;
+    const int buf_size = AES3_HEADER_LEN +
+                        (frame->nb_samples *
+                         avctx->channels *
+                        (avctx->bits_per_raw_sample + 4)) / 8;
+    int ret, c, channels;
+    uint8_t *o;
+    PutBitContext pb;
+
+    if (buf_size - AES3_HEADER_LEN > UINT16_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "number of samples in frame too big\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, buf_size, 0)) < 0)
+        return ret;
+
+    o = avpkt->data;
+    init_put_bits(&pb, o, buf_size);
+    put_bits(&pb, 16, buf_size - AES3_HEADER_LEN);
+    put_bits(&pb, 2, (avctx->channels - 2) >> 1);   // number of channels
+    put_bits(&pb, 8, 0);                            // channel ID
+    put_bits(&pb, 2, (avctx->bits_per_raw_sample - 16) / 4); // bits per samples (0 = 16bit, 1 = 20bit, 2 = 24bit)
+    put_bits(&pb, 4, 0);                            // alignments
+    flush_put_bits(&pb);
+    o += AES3_HEADER_LEN;
+
+    if (avctx->bits_per_raw_sample == 24) {
+        const uint32_t *samples = (uint32_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x10: 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) {
+                o[0] = ff_reverse[(samples[0] & 0x0000FF00) >> 8];
+                o[1] = ff_reverse[(samples[0] & 0x00FF0000) >> 16];
+                o[2] = ff_reverse[(samples[0] & 0xFF000000) >> 24];
+                o[3] = ff_reverse[(samples[1] & 0x00000F00) >> 4] | vucf;
+                o[4] = ff_reverse[(samples[1] & 0x000FF000) >> 12];
+                o[5] = ff_reverse[(samples[1] & 0x0FF00000) >> 20];
+                o[6] = ff_reverse[(samples[1] & 0xF0000000) >> 28];
+                o += 7;
+                samples += 2;
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    } else if (avctx->bits_per_raw_sample == 20) {
+        const uint32_t *samples = (uint32_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x80: 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) {
+                o[0] = ff_reverse[ (samples[0] & 0x000FF000) >> 12];
+                o[1] = ff_reverse[ (samples[0] & 0x0FF00000) >> 20];
+                o[2] = ff_reverse[((samples[0] & 0xF0000000) >> 28) | vucf];
+                o[3] = ff_reverse[ (samples[1] & 0x000FF000) >> 12];
+                o[4] = ff_reverse[ (samples[1] & 0x0FF00000) >> 20];
+                o[5] = ff_reverse[ (samples[1] & 0xF0000000) >> 28];
+                o += 6;
+                samples += 2;
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    } else if (avctx->bits_per_raw_sample == 16) {
+        const uint16_t *samples = (uint16_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x10 : 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) {
+                o[0] = ff_reverse[ samples[0] & 0xFF];
+                o[1] = ff_reverse[(samples[0] & 0xFF00) >>  8];
+                o[2] = ff_reverse[(samples[1] & 0x0F)   <<  4] | vucf;
+                o[3] = ff_reverse[(samples[1] & 0x0FF0) >>  4];
+                o[4] = ff_reverse[(samples[1] & 0xF000) >> 12];
+                o += 5;
+                samples += 2;
+
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    }
+
+    *got_packet_ptr = 1;
+
+    return 0;
+}
+
+AVCodec ff_s302m_encoder = {
+    .name                  = "s302m",
+    .long_name             = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_S302M,
+    .priv_data_size        = sizeof(S302MEncContext),
+    .init                  = s302m_encode_init,
+    .encode2               = s302m_encode2_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S32,
+                                                            AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .capabilities          = AV_CODEC_CAP_VARIABLE_FRAME_SIZE | AV_CODEC_CAP_EXPERIMENTAL,
+    .supported_samplerates = (const int[]) { 48000, 0 },
+ /* .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_STEREO,
+                                                  AV_CH_LAYOUT_QUAD,
+                                                  AV_CH_LAYOUT_5POINT1_BACK,
+                                                  AV_CH_LAYOUT_5POINT1_BACK | AV_CH_LAYOUT_STEREO_DOWNMIX,
+                                                  0 }, */
+};
diff --git a/libavcodec/samidec.c b/libavcodec/samidec.c
new file mode 100644
index 0000000..e32f238
--- /dev/null
+++ b/libavcodec/samidec.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SAMI subtitle decoder
+ * @see http://msdn.microsoft.com/en-us/library/ms971327.aspx
+ */
+
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "htmlsubtitles.h"
+
+typedef struct {
+    AVBPrint source;
+    AVBPrint content;
+    AVBPrint encoded_source;
+    AVBPrint encoded_content;
+    AVBPrint full;
+    int readorder;
+} SAMIContext;
+
+static int sami_paragraph_to_ass(AVCodecContext *avctx, const char *src)
+{
+    SAMIContext *sami = avctx->priv_data;
+    int ret = 0;
+    char *tag = NULL;
+    char *dupsrc = av_strdup(src);
+    char *p = dupsrc;
+    AVBPrint *dst_content = &sami->encoded_content;
+    AVBPrint *dst_source = &sami->encoded_source;
+
+    if (!dupsrc)
+        return AVERROR(ENOMEM);
+
+    av_bprint_clear(&sami->encoded_content);
+    av_bprint_clear(&sami->content);
+    av_bprint_clear(&sami->encoded_source);
+    for (;;) {
+        char *saveptr = NULL;
+        int prev_chr_is_space = 0;
+        AVBPrint *dst = &sami->content;
+
+        /* parse & extract paragraph tag */
+        p = av_stristr(p, "<P");
+        if (!p)
+            break;
+        if (p[2] != '>' && !av_isspace(p[2])) { // avoid confusion with tags such as <PRE>
+            p++;
+            continue;
+        }
+        if (dst->len) // add a separator with the previous paragraph if there was one
+            av_bprintf(dst, "\\N");
+        tag = av_strtok(p, ">", &saveptr);
+        if (!tag || !saveptr)
+            break;
+        p = saveptr;
+
+        /* check if the current paragraph is the "source" (speaker name) */
+        if (av_stristr(tag, "ID=Source") || av_stristr(tag, "ID=\"Source\"")) {
+            dst = &sami->source;
+            av_bprint_clear(dst);
+        }
+
+        /* if empty event -> skip subtitle */
+        while (av_isspace(*p))
+            p++;
+        if (!strncmp(p, "&nbsp;", 6)) {
+            ret = -1;
+            goto end;
+        }
+
+        /* extract the text, stripping most of the tags */
+        while (*p) {
+            if (*p == '<') {
+                if (!av_strncasecmp(p, "<P", 2) && (p[2] == '>' || av_isspace(p[2])))
+                    break;
+            }
+            if (!av_strncasecmp(p, "<BR", 3)) {
+                av_bprintf(dst, "\\N");
+                p++;
+                while (*p && *p != '>')
+                    p++;
+                if (!*p)
+                    break;
+                if (*p == '>')
+                    p++;
+                continue;
+            }
+            if (!av_isspace(*p))
+                av_bprint_chars(dst, *p, 1);
+            else if (!prev_chr_is_space)
+                av_bprint_chars(dst, ' ', 1);
+            prev_chr_is_space = av_isspace(*p);
+            p++;
+        }
+    }
+
+    av_bprint_clear(&sami->full);
+    if (sami->source.len) {
+        ret = ff_htmlmarkup_to_ass(avctx, dst_source, sami->source.str);
+        if (ret < 0)
+            goto end;
+        av_bprintf(&sami->full, "{\\i1}%s{\\i0}\\N", sami->encoded_source.str);
+    }
+    ret = ff_htmlmarkup_to_ass(avctx, dst_content, sami->content.str);
+    if (ret < 0)
+        goto end;
+    av_bprintf(&sami->full, "%s", sami->encoded_content.str);
+
+end:
+    av_free(dupsrc);
+    return ret;
+}
+
+static int sami_decode_frame(AVCodecContext *avctx,
+                             void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    SAMIContext *sami = avctx->priv_data;
+
+    if (ptr && avpkt->size > 0) {
+        int ret = sami_paragraph_to_ass(avctx, ptr);
+        if (ret < 0)
+            return ret;
+        // TODO: pass escaped sami->encoded_source.str as source
+        ret = ff_ass_add_rect(sub, sami->full.str, sami->readorder++, 0, NULL, NULL);
+        if (ret < 0)
+            return ret;
+    }
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static av_cold int sami_init(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    av_bprint_init(&sami->source,  0, 2048);
+    av_bprint_init(&sami->content, 0, 2048);
+    av_bprint_init(&sami->encoded_source,  0, 2048);
+    av_bprint_init(&sami->encoded_content, 0, 2048);
+    av_bprint_init(&sami->full,    0, 2048);
+    return ff_ass_subtitle_header_default(avctx);
+}
+
+static av_cold int sami_close(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    av_bprint_finalize(&sami->source,  NULL);
+    av_bprint_finalize(&sami->content, NULL);
+    av_bprint_finalize(&sami->encoded_source,  NULL);
+    av_bprint_finalize(&sami->encoded_content, NULL);
+    av_bprint_finalize(&sami->full,    NULL);
+    return 0;
+}
+
+static void sami_flush(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        sami->readorder = 0;
+}
+
+AVCodec ff_sami_decoder = {
+    .name           = "sami",
+    .long_name      = NULL_IF_CONFIG_SMALL("SAMI subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SAMI,
+    .priv_data_size = sizeof(SAMIContext),
+    .init           = sami_init,
+    .close          = sami_close,
+    .decode         = sami_decode_frame,
+    .flush          = sami_flush,
+};
diff --git a/libavcodec/sanm.c b/libavcodec/sanm.c
index 7b00049..811fd21 100644
--- a/libavcodec/sanm.c
+++ b/libavcodec/sanm.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Cyril Zorin
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,108 +105,159 @@ static const int8_t motion_vectors[256][2] = {
 };
 
 static const int8_t c37_mv[] = {
-    0,   0,   1,   0,   2,   0,   3,   0,   5,   0,   8,   0,  13,   0,  21,
-    0,  -1,   0,  -2,   0,  -3,   0,  -5,   0,  -8,   0, -13,   0, -17,   0,
-  -21,   0,   0,   1,   1,   1,   2,   1,   3,   1,   5,   1,   8,   1,  13,
-    1,  21,   1,  -1,   1,  -2,   1,  -3,   1,  -5,   1,  -8,   1, -13,   1,
-  -17,   1, -21,   1,   0,   2,   1,   2,   2,   2,   3,   2,   5,   2,   8,
-    2,  13,   2,  21,   2,  -1,   2,  -2,   2,  -3,   2,  -5,   2,  -8,   2,
-  -13,   2, -17,   2, -21,   2,   0,   3,   1,   3,   2,   3,   3,   3,   5,
-    3,   8,   3,  13,   3,  21,   3,  -1,   3,  -2,   3,  -3,   3,  -5,   3,
-   -8,   3, -13,   3, -17,   3, -21,   3,   0,   5,   1,   5,   2,   5,   3,
-    5,   5,   5,   8,   5,  13,   5,  21,   5,  -1,   5,  -2,   5,  -3,   5,
-   -5,   5,  -8,   5, -13,   5, -17,   5, -21,   5,   0,   8,   1,   8,   2,
-    8,   3,   8,   5,   8,   8,   8,  13,   8,  21,   8,  -1,   8,  -2,   8,
-   -3,   8,  -5,   8,  -8,   8, -13,   8, -17,   8, -21,   8,   0,  13,   1,
-   13,   2,  13,   3,  13,   5,  13,   8,  13,  13,  13,  21,  13,  -1,  13,
-   -2,  13,  -3,  13,  -5,  13,  -8,  13, -13,  13, -17,  13, -21,  13,   0,
-   21,   1,  21,   2,  21,   3,  21,   5,  21,   8,  21,  13,  21,  21,  21,
-   -1,  21,  -2,  21,  -3,  21,  -5,  21,  -8,  21, -13,  21, -17,  21, -21,
-   21,   0,  -1,   1,  -1,   2,  -1,   3,  -1,   5,  -1,   8,  -1,  13,  -1,
-   21,  -1,  -1,  -1,  -2,  -1,  -3,  -1,  -5,  -1,  -8,  -1, -13,  -1, -17,
-   -1, -21,  -1,   0,  -2,   1,  -2,   2,  -2,   3,  -2,   5,  -2,   8,  -2,
-   13,  -2,  21,  -2,  -1,  -2,  -2,  -2,  -3,  -2,  -5,  -2,  -8,  -2, -13,
-   -2, -17,  -2, -21,  -2,   0,  -3,   1,  -3,   2,  -3,   3,  -3,   5,  -3,
-    8,  -3,  13,  -3,  21,  -3,  -1,  -3,  -2,  -3,  -3,  -3,  -5,  -3,  -8,
-   -3, -13,  -3, -17,  -3, -21,  -3,   0,  -5,   1,  -5,   2,  -5,   3,  -5,
-    5,  -5,   8,  -5,  13,  -5,  21,  -5,  -1,  -5,  -2,  -5,  -3,  -5,  -5,
-   -5,  -8,  -5, -13,  -5, -17,  -5, -21,  -5,   0,  -8,   1,  -8,   2,  -8,
-    3,  -8,   5,  -8,   8,  -8,  13,  -8,  21,  -8,  -1,  -8,  -2,  -8,  -3,
-   -8,  -5,  -8,  -8,  -8, -13,  -8, -17,  -8, -21,  -8,   0, -13,   1, -13,
-    2, -13,   3, -13,   5, -13,   8, -13,  13, -13,  21, -13,  -1, -13,  -2,
-  -13,  -3, -13,  -5, -13,  -8, -13, -13, -13, -17, -13, -21, -13,   0, -17,
-    1, -17,   2, -17,   3, -17,   5, -17,   8, -17,  13, -17,  21, -17,  -1,
-  -17,  -2, -17,  -3, -17,  -5, -17,  -8, -17, -13, -17, -17, -17, -21, -17,
-    0, -21,   1, -21,   2, -21,   3, -21,   5, -21,   8, -21,  13, -21,  21,
-  -21,  -1, -21,  -2, -21,  -3, -21,  -5, -21,  -8, -21, -13, -21, -17, -21,
-    0,   0,  -8, -29,   8, -29, -18, -25,  17, -25,   0, -23,  -6, -22,   6,
-  -22, -13, -19,  12, -19,   0, -18,  25, -18, -25, -17,  -5, -17,   5, -17,
-  -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,  -8,
-  -11,  -2, -11,   0, -11,   2, -11,   8, -11, -15, -10,  -4, -10,   4, -10,
-   15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9, -29,  -8, -11,  -8,  -8,
-   -8,  -3,  -8,   3,  -8,   8,  -8,  11,  -8,  29,  -8,  -5,  -7,  -2,  -7,
-    0,  -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,
-   -6,   1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,
-   -4,  -5,  -2,  -5,   0,  -5,   2,  -5,   4,  -5,   7,  -5,  17,  -5, -13,
-   -4, -10,  -4,  -5,  -4,  -3,  -4,  -1,  -4,   0,  -4,   1,  -4,   3,  -4,
-    5,  -4,  10,  -4,  13,  -4,  -8,  -3,  -6,  -3,  -4,  -3,  -3,  -3,  -2,
-   -3,  -1,  -3,   0,  -3,   1,  -3,   2,  -3,   4,  -3,   6,  -3,   8,  -3,
-  -11,  -2,  -7,  -2,  -5,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,   1,
-   -2,   2,  -2,   3,  -2,   5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,
-   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,   1,  -1,   2,  -1,   3,
-   -1,   4,  -1,   6,  -1,   9,  -1, -31,   0, -23,   0, -18,   0, -14,   0,
-  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,  -2,   0,  -1,   0,   0,
-  -31,   1,   0,   2,   0,   3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
-   14,   0,  18,   0,  23,   0,  31,   0,  -9,   1,  -6,   1,  -4,   1,  -3,
-    1,  -2,   1,  -1,   1,   0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
-    6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,  -3,   2,  -2,   2,  -1,
-    2,   0,   2,   1,   2,   2,   2,   3,   2,   5,   2,   7,   2,  11,   2,
-   -8,   3,  -6,   3,  -4,   3,  -2,   3,  -1,   3,   0,   3,   1,   3,   2,
-    3,   3,   3,   4,   3,   6,   3,   8,   3, -13,   4, -10,   4,  -5,   4,
-   -3,   4,  -1,   4,   0,   4,   1,   4,   3,   4,   5,   4,  10,   4,  13,
-    4, -17,   5,  -7,   5,  -4,   5,  -2,   5,   0,   5,   2,   5,   4,   5,
-    7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,   1,
-    6,   3,   6,   6,   6,   9,   6,  22,   6,  -5,   7,  -2,   7,   0,   7,
-    2,   7,   5,   7, -29,   8, -11,   8,  -8,   8,  -3,   8,   3,   8,   8,
-    8,  11,   8,  29,   8,  -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,
-   -4,  10,   4,  10,  15,  10,  -8,  11,  -2,  11,   0,  11,   2,  11,   8,
-   11,  19,  12, -19,  13,  -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,
-   -5,  17,   5,  17,  25,  17, -25,  18,   0,  18, -12,  19,  13,  19,  -6,
-   22,   6,  22,   0,  23, -17,  25,  18,  25,  -8,  29,   8,  29,   0,  31,
-    0,   0,  -6, -22,   6, -22, -13, -19,  12, -19,   0, -18,  -5, -17,   5,
-  -17, -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,
-   -8, -11,  -2, -11,   0, -11,   2, -11,   8, -11, -15, -10,  -4, -10,   4,
-  -10,  15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9, -11,  -8,  -8,  -8,
-   -3,  -8,   0,  -8,   3,  -8,   8,  -8,  11,  -8,  -5,  -7,  -2,  -7,   0,
-   -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,  -6,
-    1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,  -4,
-   -5,  -2,  -5,  -1,  -5,   0,  -5,   1,  -5,   2,  -5,   4,  -5,   7,  -5,
-   17,  -5, -13,  -4, -10,  -4,  -5,  -4,  -3,  -4,  -2,  -4,  -1,  -4,   0,
-   -4,   1,  -4,   2,  -4,   3,  -4,   5,  -4,  10,  -4,  13,  -4,  -8,  -3,
-   -6,  -3,  -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,   0,  -3,   1,  -3,   2,
-   -3,   3,  -3,   4,  -3,   6,  -3,   8,  -3, -11,  -2,  -7,  -2,  -5,  -2,
-   -4,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,   1,  -2,   2,  -2,   3,
-   -2,   4,  -2,   5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,  -5,  -1,
-   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,   1,  -1,   2,  -1,   3,
-   -1,   4,  -1,   5,  -1,   6,  -1,   9,  -1, -23,   0, -18,   0, -14,   0,
-  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,  -2,   0,  -1,   0,   0,
-  -23,   1,   0,   2,   0,   3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
-   14,   0,  18,   0,  23,   0,  -9,   1,  -6,   1,  -5,   1,  -4,   1,  -3,
-    1,  -2,   1,  -1,   1,   0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
-    5,   1,   6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,  -4,   2,  -3,
-    2,  -2,   2,  -1,   2,   0,   2,   1,   2,   2,   2,   3,   2,   4,   2,
-    5,   2,   7,   2,  11,   2,  -8,   3,  -6,   3,  -4,   3,  -3,   3,  -2,
-    3,  -1,   3,   0,   3,   1,   3,   2,   3,   3,   3,   4,   3,   6,   3,
-    8,   3, -13,   4, -10,   4,  -5,   4,  -3,   4,  -2,   4,  -1,   4,   0,
-    4,   1,   4,   2,   4,   3,   4,   5,   4,  10,   4,  13,   4, -17,   5,
-   -7,   5,  -4,   5,  -2,   5,  -1,   5,   0,   5,   1,   5,   2,   5,   4,
-    5,   7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,
-    1,   6,   3,   6,   6,   6,   9,   6,  22,   6,  -5,   7,  -2,   7,   0,
-    7,   2,   7,   5,   7, -11,   8,  -8,   8,  -3,   8,   0,   8,   3,   8,
-    8,   8,  11,   8,  -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,  -4,
-   10,   4,  10,  15,  10,  -8,  11,  -2,  11,   0,  11,   2,  11,   8,  11,
-   19,  12, -19,  13,  -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,  -5,
-   17,   5,  17,   0,  18, -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
+    0,   0,   1,   0,   2,   0,   3,   0,   5,   0,
+    8,   0,  13,   0,  21,   0,  -1,   0,  -2,   0,
+   -3,   0,  -5,   0,  -8,   0, -13,   0, -17,   0,
+  -21,   0,   0,   1,   1,   1,   2,   1,   3,   1,
+    5,   1,   8,   1,  13,   1,  21,   1,  -1,   1,
+   -2,   1,  -3,   1,  -5,   1,  -8,   1, -13,   1,
+  -17,   1, -21,   1,   0,   2,   1,   2,   2,   2,
+    3,   2,   5,   2,   8,   2,  13,   2,  21,   2,
+   -1,   2,  -2,   2,  -3,   2,  -5,   2,  -8,   2,
+  -13,   2, -17,   2, -21,   2,   0,   3,   1,   3,
+    2,   3,   3,   3,   5,   3,   8,   3,  13,   3,
+   21,   3,  -1,   3,  -2,   3,  -3,   3,  -5,   3,
+   -8,   3, -13,   3, -17,   3, -21,   3,   0,   5,
+    1,   5,   2,   5,   3,   5,   5,   5,   8,   5,
+   13,   5,  21,   5,  -1,   5,  -2,   5,  -3,   5,
+   -5,   5,  -8,   5, -13,   5, -17,   5, -21,   5,
+    0,   8,   1,   8,   2,   8,   3,   8,   5,   8,
+    8,   8,  13,   8,  21,   8,  -1,   8,  -2,   8,
+   -3,   8,  -5,   8,  -8,   8, -13,   8, -17,   8,
+  -21,   8,   0,  13,   1,  13,   2,  13,   3,  13,
+    5,  13,   8,  13,  13,  13,  21,  13,  -1,  13,
+   -2,  13,  -3,  13,  -5,  13,  -8,  13, -13,  13,
+  -17,  13, -21,  13,   0,  21,   1,  21,   2,  21,
+    3,  21,   5,  21,   8,  21,  13,  21,  21,  21,
+   -1,  21,  -2,  21,  -3,  21,  -5,  21,  -8,  21,
+  -13,  21, -17,  21, -21,  21,   0,  -1,   1,  -1,
+    2,  -1,   3,  -1,   5,  -1,   8,  -1,  13,  -1,
+   21,  -1,  -1,  -1,  -2,  -1,  -3,  -1,  -5,  -1,
+   -8,  -1, -13,  -1, -17,  -1, -21,  -1,   0,  -2,
+    1,  -2,   2,  -2,   3,  -2,   5,  -2,   8,  -2,
+   13,  -2,  21,  -2,  -1,  -2,  -2,  -2,  -3,  -2,
+   -5,  -2,  -8,  -2, -13,  -2, -17,  -2, -21,  -2,
+    0,  -3,   1,  -3,   2,  -3,   3,  -3,   5,  -3,
+    8,  -3,  13,  -3,  21,  -3,  -1,  -3,  -2,  -3,
+   -3,  -3,  -5,  -3,  -8,  -3, -13,  -3, -17,  -3,
+  -21,  -3,   0,  -5,   1,  -5,   2,  -5,   3,  -5,
+    5,  -5,   8,  -5,  13,  -5,  21,  -5,  -1,  -5,
+   -2,  -5,  -3,  -5,  -5,  -5,  -8,  -5, -13,  -5,
+  -17,  -5, -21,  -5,   0,  -8,   1,  -8,   2,  -8,
+    3,  -8,   5,  -8,   8,  -8,  13,  -8,  21,  -8,
+   -1,  -8,  -2,  -8,  -3,  -8,  -5,  -8,  -8,  -8,
+  -13,  -8, -17,  -8, -21,  -8,   0, -13,   1, -13,
+    2, -13,   3, -13,   5, -13,   8, -13,  13, -13,
+   21, -13,  -1, -13,  -2, -13,  -3, -13,  -5, -13,
+   -8, -13, -13, -13, -17, -13, -21, -13,   0, -17,
+    1, -17,   2, -17,   3, -17,   5, -17,   8, -17,
+   13, -17,  21, -17,  -1, -17,  -2, -17,  -3, -17,
+   -5, -17,  -8, -17, -13, -17, -17, -17, -21, -17,
+    0, -21,   1, -21,   2, -21,   3, -21,   5, -21,
+    8, -21,  13, -21,  21, -21,  -1, -21,  -2, -21,
+   -3, -21,  -5, -21,  -8, -21, -13, -21, -17, -21,
+    0,   0,  -8, -29,   8, -29, -18, -25,  17, -25,
+    0, -23,  -6, -22,   6, -22, -13, -19,  12, -19,
+    0, -18,  25, -18, -25, -17,  -5, -17,   5, -17,
+  -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,
+   19, -13, -19, -12,  -8, -11,  -2, -11,   0, -11,
+    2, -11,   8, -11, -15, -10,  -4, -10,   4, -10,
+   15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9,
+  -29,  -8, -11,  -8,  -8,  -8,  -3,  -8,   3,  -8,
+    8,  -8,  11,  -8,  29,  -8,  -5,  -7,  -2,  -7,
+    0,  -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,
+   -6,  -6,  -3,  -6,  -1,  -6,   1,  -6,   3,  -6,
+    6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,
+   -4,  -5,  -2,  -5,   0,  -5,   2,  -5,   4,  -5,
+    7,  -5,  17,  -5, -13,  -4, -10,  -4,  -5,  -4,
+   -3,  -4,  -1,  -4,   0,  -4,   1,  -4,   3,  -4,
+    5,  -4,  10,  -4,  13,  -4,  -8,  -3,  -6,  -3,
+   -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,   0,  -3,
+    1,  -3,   2,  -3,   4,  -3,   6,  -3,   8,  -3,
+  -11,  -2,  -7,  -2,  -5,  -2,  -3,  -2,  -2,  -2,
+   -1,  -2,   0,  -2,   1,  -2,   2,  -2,   3,  -2,
+    5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,
+   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,
+    1,  -1,   2,  -1,   3,  -1,   4,  -1,   6,  -1,
+    9,  -1, -31,   0, -23,   0, -18,   0, -14,   0,
+  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,
+   -2,   0,  -1,   0,   0, -31,   1,   0,   2,   0,
+    3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
+   14,   0,  18,   0,  23,   0,  31,   0,  -9,   1,
+   -6,   1,  -4,   1,  -3,   1,  -2,   1,  -1,   1,
+    0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
+    6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,
+   -3,   2,  -2,   2,  -1,   2,   0,   2,   1,   2,
+    2,   2,   3,   2,   5,   2,   7,   2,  11,   2,
+   -8,   3,  -6,   3,  -4,   3,  -2,   3,  -1,   3,
+    0,   3,   1,   3,   2,   3,   3,   3,   4,   3,
+    6,   3,   8,   3, -13,   4, -10,   4,  -5,   4,
+   -3,   4,  -1,   4,   0,   4,   1,   4,   3,   4,
+    5,   4,  10,   4,  13,   4, -17,   5,  -7,   5,
+   -4,   5,  -2,   5,   0,   5,   2,   5,   4,   5,
+    7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,
+   -3,   6,  -1,   6,   1,   6,   3,   6,   6,   6,
+    9,   6,  22,   6,  -5,   7,  -2,   7,   0,   7,
+    2,   7,   5,   7, -29,   8, -11,   8,  -8,   8,
+   -3,   8,   3,   8,   8,   8,  11,   8,  29,   8,
+   -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,
+   -4,  10,   4,  10,  15,  10,  -8,  11,  -2,  11,
+    0,  11,   2,  11,   8,  11,  19,  12, -19,  13,
+   -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,
+   -5,  17,   5,  17,  25,  17, -25,  18,   0,  18,
+  -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
+  -17,  25,  18,  25,  -8,  29,   8,  29,   0,  31,
+    0,   0,  -6, -22,   6, -22, -13, -19,  12, -19,
+    0, -18,  -5, -17,   5, -17, -10, -15,  10, -15,
+    0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,
+   -8, -11,  -2, -11,   0, -11,   2, -11,   8, -11,
+  -15, -10,  -4, -10,   4, -10,  15, -10,  -6,  -9,
+   -1,  -9,   1,  -9,   6,  -9, -11,  -8,  -8,  -8,
+   -3,  -8,   0,  -8,   3,  -8,   8,  -8,  11,  -8,
+   -5,  -7,  -2,  -7,   0,  -7,   2,  -7,   5,  -7,
+  -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,  -6,
+    1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6,
+  -17,  -5,  -7,  -5,  -4,  -5,  -2,  -5,  -1,  -5,
+    0,  -5,   1,  -5,   2,  -5,   4,  -5,   7,  -5,
+   17,  -5, -13,  -4, -10,  -4,  -5,  -4,  -3,  -4,
+   -2,  -4,  -1,  -4,   0,  -4,   1,  -4,   2,  -4,
+    3,  -4,   5,  -4,  10,  -4,  13,  -4,  -8,  -3,
+   -6,  -3,  -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,
+    0,  -3,   1,  -3,   2,  -3,   3,  -3,   4,  -3,
+    6,  -3,   8,  -3, -11,  -2,  -7,  -2,  -5,  -2,
+   -4,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,
+    1,  -2,   2,  -2,   3,  -2,   4,  -2,   5,  -2,
+    7,  -2,  11,  -2,  -9,  -1,  -6,  -1,  -5,  -1,
+   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,
+    1,  -1,   2,  -1,   3,  -1,   4,  -1,   5,  -1,
+    6,  -1,   9,  -1, -23,   0, -18,   0, -14,   0,
+  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,
+   -2,   0,  -1,   0,   0, -23,   1,   0,   2,   0,
+    3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
+   14,   0,  18,   0,  23,   0,  -9,   1,  -6,   1,
+   -5,   1,  -4,   1,  -3,   1,  -2,   1,  -1,   1,
+    0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
+    5,   1,   6,   1,   9,   1, -11,   2,  -7,   2,
+   -5,   2,  -4,   2,  -3,   2,  -2,   2,  -1,   2,
+    0,   2,   1,   2,   2,   2,   3,   2,   4,   2,
+    5,   2,   7,   2,  11,   2,  -8,   3,  -6,   3,
+   -4,   3,  -3,   3,  -2,   3,  -1,   3,   0,   3,
+    1,   3,   2,   3,   3,   3,   4,   3,   6,   3,
+    8,   3, -13,   4, -10,   4,  -5,   4,  -3,   4,
+   -2,   4,  -1,   4,   0,   4,   1,   4,   2,   4,
+    3,   4,   5,   4,  10,   4,  13,   4, -17,   5,
+   -7,   5,  -4,   5,  -2,   5,  -1,   5,   0,   5,
+    1,   5,   2,   5,   4,   5,   7,   5,  17,   5,
+  -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,
+    1,   6,   3,   6,   6,   6,   9,   6,  22,   6,
+   -5,   7,  -2,   7,   0,   7,   2,   7,   5,   7,
+  -11,   8,  -8,   8,  -3,   8,   0,   8,   3,   8,
+    8,   8,  11,   8,  -6,   9,  -1,   9,   1,   9,
+    6,   9, -15,  10,  -4,  10,   4,  10,  15,  10,
+   -8,  11,  -2,  11,   0,  11,   2,  11,   8,  11,
+   19,  12, -19,  13,  -4,  13,   4,  13,   0,  14,
+  -10,  15,  10,  15,  -5,  17,   5,  17,   0,  18,
+  -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
 };
 
 typedef struct SANMVideoContext {
@@ -406,15 +457,16 @@ static void destroy_buffers(SANMVideoContext *ctx)
     ctx->frm0_size =
     ctx->frm1_size =
     ctx->frm2_size = 0;
+    init_sizes(ctx, 0, 0);
 }
 
 static av_cold int init_buffers(SANMVideoContext *ctx)
 {
-    av_fast_padded_malloc(&ctx->frm0, &ctx->frm0_size, ctx->buf_size);
-    av_fast_padded_malloc(&ctx->frm1, &ctx->frm1_size, ctx->buf_size);
-    av_fast_padded_malloc(&ctx->frm2, &ctx->frm2_size, ctx->buf_size);
+    av_fast_padded_mallocz(&ctx->frm0, &ctx->frm0_size, ctx->buf_size);
+    av_fast_padded_mallocz(&ctx->frm1, &ctx->frm1_size, ctx->buf_size);
+    av_fast_padded_mallocz(&ctx->frm2, &ctx->frm2_size, ctx->buf_size);
     if (!ctx->version)
-        av_fast_padded_malloc(&ctx->stored_frame,
+        av_fast_padded_mallocz(&ctx->stored_frame,
                               &ctx->stored_frame_size, ctx->buf_size);
 
     if (!ctx->frm0 || !ctx->frm1 || !ctx->frm2 ||
@@ -460,7 +512,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         }
 
         ctx->subversion = AV_RL16(avctx->extradata);
-        for (i = 0; i < 256; i++)
+        for (i = 0; i < PALETTE_SIZE; i++)
             ctx->pal[i] = 0xFFU << 24 | AV_RL32(avctx->extradata + 2 + i * 4);
     }
 
@@ -1463,7 +1515,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
 AVCodec ff_sanm_decoder = {
     .name           = "sanm",
-    .long_name      = NULL_IF_CONFIG_SMALL("LucasArts SANM video"),
+    .long_name      = NULL_IF_CONFIG_SMALL("LucasArts SANM/Smush video"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_SANM,
     .priv_data_size = sizeof(SANMVideoContext),
diff --git a/libavcodec/sbc.c b/libavcodec/sbc.c
new file mode 100644
index 0000000..b43b66e
--- /dev/null
+++ b/libavcodec/sbc.c
@@ -0,0 +1,271 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2012-2013  Intel Corporation
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2008  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC common functions for the encoder and decoder
+ */
+
+#include "avcodec.h"
+#include "sbc.h"
+
+/* A2DP specification: Appendix B, page 69 */
+static const int sbc_offset4[4][4] = {
+    { -1, 0, 0, 0 },
+    { -2, 0, 0, 1 },
+    { -2, 0, 0, 1 },
+    { -2, 0, 0, 1 }
+};
+
+/* A2DP specification: Appendix B, page 69 */
+static const int sbc_offset8[4][8] = {
+    { -2, 0, 0, 0, 0, 0, 0, 1 },
+    { -3, 0, 0, 0, 0, 0, 1, 2 },
+    { -4, 0, 0, 0, 0, 0, 1, 2 },
+    { -4, 0, 0, 0, 0, 0, 1, 2 }
+};
+
+/*
+ * Calculates the CRC-8 of the first len bits in data
+ */
+uint8_t ff_sbc_crc8(const AVCRC *ctx, const uint8_t *data, size_t len)
+{
+    size_t byte_length = len >> 3;
+    int bit_length = len & 7;
+    uint8_t crc;
+
+    crc = av_crc(ctx, 0x0F, data, byte_length);
+
+    if (bit_length) {
+        uint8_t bits = data[byte_length];
+        while (bit_length--) {
+            int8_t mask = bits ^ crc;
+            crc = (crc << 1) ^ ((mask >> 7) & 0x1D);
+            bits <<= 1;
+        }
+    }
+
+    return crc;
+}
+
+/*
+ * Code straight from the spec to calculate the bits array
+ * Takes a pointer to the frame in question and a pointer to the bits array
+ */
+void ff_sbc_calculate_bits(const struct sbc_frame *frame, int (*bits)[8])
+{
+    int subbands = frame->subbands;
+    uint8_t sf = frame->frequency;
+
+    if (frame->mode == MONO || frame->mode == DUAL_CHANNEL) {
+        int bitneed[2][8], loudness, max_bitneed, bitcount, slicecount, bitslice;
+        int ch, sb;
+
+        for (ch = 0; ch < frame->channels; ch++) {
+            max_bitneed = 0;
+            if (frame->allocation == SNR) {
+                for (sb = 0; sb < subbands; sb++) {
+                    bitneed[ch][sb] = frame->scale_factor[ch][sb];
+                    if (bitneed[ch][sb] > max_bitneed)
+                        max_bitneed = bitneed[ch][sb];
+                }
+            } else {
+                for (sb = 0; sb < subbands; sb++) {
+                    if (frame->scale_factor[ch][sb] == 0)
+                        bitneed[ch][sb] = -5;
+                    else {
+                        if (subbands == 4)
+                            loudness = frame->scale_factor[ch][sb] - sbc_offset4[sf][sb];
+                        else
+                            loudness = frame->scale_factor[ch][sb] - sbc_offset8[sf][sb];
+                        if (loudness > 0)
+                            bitneed[ch][sb] = loudness / 2;
+                        else
+                            bitneed[ch][sb] = loudness;
+                    }
+                    if (bitneed[ch][sb] > max_bitneed)
+                        max_bitneed = bitneed[ch][sb];
+                }
+            }
+
+            bitcount = 0;
+            slicecount = 0;
+            bitslice = max_bitneed + 1;
+            do {
+                bitslice--;
+                bitcount += slicecount;
+                slicecount = 0;
+                for (sb = 0; sb < subbands; sb++) {
+                    if ((bitneed[ch][sb] > bitslice + 1) && (bitneed[ch][sb] < bitslice + 16))
+                        slicecount++;
+                    else if (bitneed[ch][sb] == bitslice + 1)
+                        slicecount += 2;
+                }
+            } while (bitcount + slicecount < frame->bitpool);
+
+            if (bitcount + slicecount == frame->bitpool) {
+                bitcount += slicecount;
+                bitslice--;
+            }
+
+            for (sb = 0; sb < subbands; sb++) {
+                if (bitneed[ch][sb] < bitslice + 2)
+                    bits[ch][sb] = 0;
+                else {
+                    bits[ch][sb] = bitneed[ch][sb] - bitslice;
+                    if (bits[ch][sb] > 16)
+                        bits[ch][sb] = 16;
+                }
+            }
+
+            for (sb = 0; bitcount < frame->bitpool &&
+                            sb < subbands; sb++) {
+                if ((bits[ch][sb] >= 2) && (bits[ch][sb] < 16)) {
+                    bits[ch][sb]++;
+                    bitcount++;
+                } else if ((bitneed[ch][sb] == bitslice + 1) && (frame->bitpool > bitcount + 1)) {
+                    bits[ch][sb] = 2;
+                    bitcount += 2;
+                }
+            }
+
+            for (sb = 0; bitcount < frame->bitpool &&
+                            sb < subbands; sb++) {
+                if (bits[ch][sb] < 16) {
+                    bits[ch][sb]++;
+                    bitcount++;
+                }
+            }
+
+        }
+
+    } else if (frame->mode == STEREO || frame->mode == JOINT_STEREO) {
+        int bitneed[2][8], loudness, max_bitneed, bitcount, slicecount, bitslice;
+        int ch, sb;
+
+        max_bitneed = 0;
+        if (frame->allocation == SNR) {
+            for (ch = 0; ch < 2; ch++) {
+                for (sb = 0; sb < subbands; sb++) {
+                    bitneed[ch][sb] = frame->scale_factor[ch][sb];
+                    if (bitneed[ch][sb] > max_bitneed)
+                        max_bitneed = bitneed[ch][sb];
+                }
+            }
+        } else {
+            for (ch = 0; ch < 2; ch++) {
+                for (sb = 0; sb < subbands; sb++) {
+                    if (frame->scale_factor[ch][sb] == 0)
+                        bitneed[ch][sb] = -5;
+                    else {
+                        if (subbands == 4)
+                            loudness = frame->scale_factor[ch][sb] - sbc_offset4[sf][sb];
+                        else
+                            loudness = frame->scale_factor[ch][sb] - sbc_offset8[sf][sb];
+                        if (loudness > 0)
+                            bitneed[ch][sb] = loudness / 2;
+                        else
+                            bitneed[ch][sb] = loudness;
+                    }
+                    if (bitneed[ch][sb] > max_bitneed)
+                        max_bitneed = bitneed[ch][sb];
+                }
+            }
+        }
+
+        bitcount = 0;
+        slicecount = 0;
+        bitslice = max_bitneed + 1;
+        do {
+            bitslice--;
+            bitcount += slicecount;
+            slicecount = 0;
+            for (ch = 0; ch < 2; ch++) {
+                for (sb = 0; sb < subbands; sb++) {
+                    if ((bitneed[ch][sb] > bitslice + 1) && (bitneed[ch][sb] < bitslice + 16))
+                        slicecount++;
+                    else if (bitneed[ch][sb] == bitslice + 1)
+                        slicecount += 2;
+                }
+            }
+        } while (bitcount + slicecount < frame->bitpool);
+
+        if (bitcount + slicecount == frame->bitpool) {
+            bitcount += slicecount;
+            bitslice--;
+        }
+
+        for (ch = 0; ch < 2; ch++) {
+            for (sb = 0; sb < subbands; sb++) {
+                if (bitneed[ch][sb] < bitslice + 2) {
+                    bits[ch][sb] = 0;
+                } else {
+                    bits[ch][sb] = bitneed[ch][sb] - bitslice;
+                    if (bits[ch][sb] > 16)
+                        bits[ch][sb] = 16;
+                }
+            }
+        }
+
+        ch = 0;
+        sb = 0;
+        while (bitcount < frame->bitpool) {
+            if ((bits[ch][sb] >= 2) && (bits[ch][sb] < 16)) {
+                bits[ch][sb]++;
+                bitcount++;
+            } else if ((bitneed[ch][sb] == bitslice + 1) && (frame->bitpool > bitcount + 1)) {
+                bits[ch][sb] = 2;
+                bitcount += 2;
+            }
+            if (ch == 1) {
+                ch = 0;
+                sb++;
+                if (sb >= subbands)
+                    break;
+            } else
+                ch = 1;
+        }
+
+        ch = 0;
+        sb = 0;
+        while (bitcount < frame->bitpool) {
+            if (bits[ch][sb] < 16) {
+                bits[ch][sb]++;
+                bitcount++;
+            }
+            if (ch == 1) {
+                ch = 0;
+                sb++;
+                if (sb >= subbands)
+                    break;
+            } else
+                ch = 1;
+        }
+
+    }
+
+}
diff --git a/libavcodec/sbc.h b/libavcodec/sbc.h
new file mode 100644
index 0000000..de9c8d9
--- /dev/null
+++ b/libavcodec/sbc.h
@@ -0,0 +1,118 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2012-2014  Intel Corporation
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC common definitions for the encoder and decoder
+ */
+
+#ifndef AVCODEC_SBC_H
+#define AVCODEC_SBC_H
+
+#include "avcodec.h"
+#include "libavutil/crc.h"
+
+#define MSBC_BLOCKS 15
+
+/* sampling frequency */
+#define SBC_FREQ_16000  0x00
+#define SBC_FREQ_32000  0x01
+#define SBC_FREQ_44100  0x02
+#define SBC_FREQ_48000  0x03
+
+/* blocks */
+#define SBC_BLK_4       0x00
+#define SBC_BLK_8       0x01
+#define SBC_BLK_12      0x02
+#define SBC_BLK_16      0x03
+
+/* channel mode */
+#define SBC_MODE_MONO         0x00
+#define SBC_MODE_DUAL_CHANNEL 0x01
+#define SBC_MODE_STEREO       0x02
+#define SBC_MODE_JOINT_STEREO 0x03
+
+/* allocation method */
+#define SBC_AM_LOUDNESS 0x00
+#define SBC_AM_SNR      0x01
+
+/* subbands */
+#define SBC_SB_4        0x00
+#define SBC_SB_8        0x01
+
+/* synchronisation words */
+#define SBC_SYNCWORD   0x9C
+#define MSBC_SYNCWORD  0xAD
+
+/* extra bits of precision for the synthesis filter input data */
+#define SBCDEC_FIXED_EXTRA_BITS 2
+
+/*
+ * Enforce 16 byte alignment for the data, which is supposed to be used
+ * with SIMD optimized code.
+ */
+#define SBC_ALIGN 16
+
+/* This structure contains an unpacked SBC frame.
+   Yes, there is probably quite some unused space herein */
+struct sbc_frame {
+    uint8_t frequency;
+    uint8_t blocks;
+    enum {
+        MONO         = SBC_MODE_MONO,
+        DUAL_CHANNEL = SBC_MODE_DUAL_CHANNEL,
+        STEREO       = SBC_MODE_STEREO,
+        JOINT_STEREO = SBC_MODE_JOINT_STEREO
+    } mode;
+    uint8_t channels;
+    enum {
+        LOUDNESS = SBC_AM_LOUDNESS,
+        SNR      = SBC_AM_SNR
+    } allocation;
+    uint8_t subbands;
+    uint8_t bitpool;
+    uint16_t codesize;
+
+    /* bit number x set means joint stereo has been used in subband x */
+    uint8_t joint;
+
+    /* only the lower 4 bits of every element are to be used */
+    DECLARE_ALIGNED(SBC_ALIGN, uint32_t, scale_factor)[2][8];
+
+    /* raw integer subband samples in the frame */
+    DECLARE_ALIGNED(SBC_ALIGN, int32_t, sb_sample_f)[16][2][8];
+
+    /* modified subband samples */
+    DECLARE_ALIGNED(SBC_ALIGN, int32_t, sb_sample)[16][2][8];
+
+    const AVCRC *crc_ctx;
+};
+
+uint8_t ff_sbc_crc8(const AVCRC *crc_ctx, const uint8_t *data, size_t len);
+void ff_sbc_calculate_bits(const struct sbc_frame *frame, int (*bits)[8]);
+
+#endif /* AVCODEC_SBC_H */
diff --git a/libavcodec/sbc_parser.c b/libavcodec/sbc_parser.c
new file mode 100644
index 0000000..f565641
--- /dev/null
+++ b/libavcodec/sbc_parser.c
@@ -0,0 +1,122 @@
+/*
+ * SBC parser
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "sbc.h"
+#include "parser.h"
+
+typedef struct SBCParseContext {
+    ParseContext pc;
+    uint8_t header[3];
+    int header_size;
+    int buffered_size;
+} SBCParseContext;
+
+static int sbc_parse_header(AVCodecParserContext *s, AVCodecContext *avctx,
+                            const uint8_t *data, size_t len)
+{
+    static const int sample_rates[4] = { 16000, 32000, 44100, 48000 };
+    int sr, blocks, mode, subbands, bitpool, channels, joint;
+    int length;
+
+    if (len < 3)
+        return -1;
+
+    if (data[0] == MSBC_SYNCWORD && data[1] == 0 && data[2] == 0) {
+        avctx->channels = 1;
+        avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+        avctx->sample_rate = 16000;
+        avctx->frame_size = 120;
+        s->duration = avctx->frame_size;
+        return 57;
+    }
+
+    if (data[0] != SBC_SYNCWORD)
+        return -2;
+
+    sr       =   (data[1] >> 6) & 0x03;
+    blocks   = (((data[1] >> 4) & 0x03) + 1) << 2;
+    mode     =   (data[1] >> 2) & 0x03;
+    subbands = (((data[1] >> 0) & 0x01) + 1) << 2;
+    bitpool  = data[2];
+
+    channels = mode == SBC_MODE_MONO ? 1 : 2;
+    joint    = mode == SBC_MODE_JOINT_STEREO;
+
+    length = 4 + (subbands * channels) / 2
+             + ((((mode == SBC_MODE_DUAL_CHANNEL) + 1) * blocks * bitpool
+                 + (joint * subbands)) + 7) / 8;
+
+    avctx->channels = channels;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    avctx->sample_rate = sample_rates[sr];
+    avctx->frame_size = subbands * blocks;
+    s->duration = avctx->frame_size;
+    return length;
+}
+
+static int sbc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
+                     const uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size)
+{
+    SBCParseContext *pc = s->priv_data;
+    int next;
+
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        if (pc->header_size) {
+            memcpy(pc->header + pc->header_size, buf,
+                   sizeof(pc->header) - pc->header_size);
+            next = sbc_parse_header(s, avctx, pc->header, sizeof(pc->header))
+                 - pc->buffered_size;
+            pc->header_size = 0;
+        } else {
+            next = sbc_parse_header(s, avctx, buf, buf_size);
+            if (next >= buf_size)
+                next = -1;
+        }
+
+        if (next < 0) {
+            pc->header_size = FFMIN(sizeof(pc->header), buf_size);
+            memcpy(pc->header, buf, pc->header_size);
+            pc->buffered_size = buf_size;
+            next = END_NOT_FOUND;
+        }
+
+        if (ff_combine_frame(&pc->pc, next, &buf, &buf_size) < 0) {
+            *poutbuf      = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
+    }
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_sbc_parser = {
+    .codec_ids      = { AV_CODEC_ID_SBC },
+    .priv_data_size = sizeof(SBCParseContext),
+    .parser_parse   = sbc_parse,
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/sbcdec.c b/libavcodec/sbcdec.c
new file mode 100644
index 0000000..546b38c
--- /dev/null
+++ b/libavcodec/sbcdec.c
@@ -0,0 +1,379 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2012-2013  Intel Corporation
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2008  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC decoder implementation
+ */
+
+#include <stdbool.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+#include "sbc.h"
+#include "sbcdec_data.h"
+
+struct sbc_decoder_state {
+    int32_t V[2][170];
+    int offset[2][16];
+};
+
+typedef struct SBCDecContext {
+    AVClass *class;
+    DECLARE_ALIGNED(SBC_ALIGN, struct sbc_frame, frame);
+    DECLARE_ALIGNED(SBC_ALIGN, struct sbc_decoder_state, dsp);
+} SBCDecContext;
+
+/*
+ * Unpacks a SBC frame at the beginning of the stream in data,
+ * which has at most len bytes into frame.
+ * Returns the length in bytes of the packed frame, or a negative
+ * value on error. The error codes are:
+ *
+ *  -1   Data stream too short
+ *  -2   Sync byte incorrect
+ *  -3   CRC8 incorrect
+ *  -4   Bitpool value out of bounds
+ */
+static int sbc_unpack_frame(const uint8_t *data, struct sbc_frame *frame,
+                            size_t len)
+{
+    unsigned int consumed;
+    /* Will copy the parts of the header that are relevant to crc
+     * calculation here */
+    uint8_t crc_header[11] = { 0 };
+    int crc_pos;
+    int32_t temp;
+
+    uint32_t audio_sample;
+    int ch, sb, blk, bit;   /* channel, subband, block and bit standard
+                               counters */
+    int bits[2][8];         /* bits distribution */
+    uint32_t levels[2][8];  /* levels derived from that */
+
+    if (len < 4)
+        return -1;
+
+    if (data[0] == MSBC_SYNCWORD) {
+        if (data[1] != 0)
+            return -2;
+        if (data[2] != 0)
+            return -2;
+
+        frame->frequency = SBC_FREQ_16000;
+        frame->blocks = MSBC_BLOCKS;
+        frame->allocation = LOUDNESS;
+        frame->mode = MONO;
+        frame->channels = 1;
+        frame->subbands = 8;
+        frame->bitpool = 26;
+    } else if (data[0] == SBC_SYNCWORD) {
+        frame->frequency  = (data[1] >> 6) & 0x03;
+        frame->blocks = 4 * ((data[1] >> 4) & 0x03) + 4;
+        frame->mode = (data[1] >> 2) & 0x03;
+        frame->channels = frame->mode == MONO ? 1 : 2;
+        frame->allocation = (data[1] >> 1) & 0x01;
+        frame->subbands = data[1] & 0x01 ? 8 : 4;
+        frame->bitpool = data[2];
+
+        if ((frame->mode == MONO || frame->mode == DUAL_CHANNEL) &&
+            frame->bitpool > 16 * frame->subbands)
+            return -4;
+
+        if ((frame->mode == STEREO || frame->mode == JOINT_STEREO) &&
+            frame->bitpool > 32 * frame->subbands)
+            return -4;
+    } else
+        return -2;
+
+    consumed = 32;
+    crc_header[0] = data[1];
+    crc_header[1] = data[2];
+    crc_pos = 16;
+
+    if (frame->mode == JOINT_STEREO) {
+        if (len * 8 < consumed + frame->subbands)
+            return -1;
+
+        frame->joint = 0x00;
+        for (sb = 0; sb < frame->subbands - 1; sb++)
+            frame->joint |= ((data[4] >> (7 - sb)) & 0x01) << sb;
+        if (frame->subbands == 4)
+            crc_header[crc_pos / 8] = data[4] & 0xf0;
+        else
+            crc_header[crc_pos / 8] = data[4];
+
+        consumed += frame->subbands;
+        crc_pos += frame->subbands;
+    }
+
+    if (len * 8 < consumed + (4 * frame->subbands * frame->channels))
+        return -1;
+
+    for (ch = 0; ch < frame->channels; ch++) {
+        for (sb = 0; sb < frame->subbands; sb++) {
+            /* FIXME assert(consumed % 4 == 0); */
+            frame->scale_factor[ch][sb] =
+                (data[consumed >> 3] >> (4 - (consumed & 0x7))) & 0x0F;
+            crc_header[crc_pos >> 3] |=
+                frame->scale_factor[ch][sb] << (4 - (crc_pos & 0x7));
+
+            consumed += 4;
+            crc_pos += 4;
+        }
+    }
+
+    if (data[3] != ff_sbc_crc8(frame->crc_ctx, crc_header, crc_pos))
+        return -3;
+
+    ff_sbc_calculate_bits(frame, bits);
+
+    for (ch = 0; ch < frame->channels; ch++) {
+        for (sb = 0; sb < frame->subbands; sb++)
+            levels[ch][sb] = (1 << bits[ch][sb]) - 1;
+    }
+
+    for (blk = 0; blk < frame->blocks; blk++) {
+        for (ch = 0; ch < frame->channels; ch++) {
+            for (sb = 0; sb < frame->subbands; sb++) {
+                uint32_t shift;
+
+                if (levels[ch][sb] == 0) {
+                    frame->sb_sample[blk][ch][sb] = 0;
+                    continue;
+                }
+
+                shift = frame->scale_factor[ch][sb] +
+                        1 + SBCDEC_FIXED_EXTRA_BITS;
+
+                audio_sample = 0;
+                for (bit = 0; bit < bits[ch][sb]; bit++) {
+                    if (consumed > len * 8)
+                        return -1;
+
+                    if ((data[consumed >> 3] >> (7 - (consumed & 0x7))) & 0x01)
+                        audio_sample |= 1 << (bits[ch][sb] - bit - 1);
+
+                    consumed++;
+                }
+
+                frame->sb_sample[blk][ch][sb] = (int32_t)
+                    (((((uint64_t) audio_sample << 1) | 1) << shift) /
+                    levels[ch][sb]) - (1 << shift);
+            }
+        }
+    }
+
+    if (frame->mode == JOINT_STEREO) {
+        for (blk = 0; blk < frame->blocks; blk++) {
+            for (sb = 0; sb < frame->subbands; sb++) {
+                if (frame->joint & (0x01 << sb)) {
+                    temp = frame->sb_sample[blk][0][sb] +
+                           frame->sb_sample[blk][1][sb];
+                    frame->sb_sample[blk][1][sb] =
+                        frame->sb_sample[blk][0][sb] -
+                        frame->sb_sample[blk][1][sb];
+                    frame->sb_sample[blk][0][sb] = temp;
+                }
+            }
+        }
+    }
+
+    if ((consumed & 0x7) != 0)
+        consumed += 8 - (consumed & 0x7);
+
+    return consumed >> 3;
+}
+
+static inline void sbc_synthesize_four(struct sbc_decoder_state *state,
+                                       struct sbc_frame *frame,
+                                       int ch, int blk, AVFrame *output_frame)
+{
+    int i, k, idx;
+    int32_t *v = state->V[ch];
+    int *offset = state->offset[ch];
+
+    for (i = 0; i < 8; i++) {
+        /* Shifting */
+        offset[i]--;
+        if (offset[i] < 0) {
+            offset[i] = 79;
+            memcpy(v + 80, v, 9 * sizeof(*v));
+        }
+
+        /* Distribute the new matrix value to the shifted position */
+        v[offset[i]] =
+            ( ff_synmatrix4[i][0] * frame->sb_sample[blk][ch][0] +
+              ff_synmatrix4[i][1] * frame->sb_sample[blk][ch][1] +
+              ff_synmatrix4[i][2] * frame->sb_sample[blk][ch][2] +
+              ff_synmatrix4[i][3] * frame->sb_sample[blk][ch][3] ) >> 15;
+    }
+
+    /* Compute the samples */
+    for (idx = 0, i = 0; i < 4; i++, idx += 5) {
+        k = (i + 4) & 0xf;
+
+        /* Store in output, Q0 */
+        AV_WN16A(&output_frame->data[ch][blk * 8 + i * 2], av_clip_int16(
+            ( v[offset[i] + 0] * ff_sbc_proto_4_40m0[idx + 0] +
+              v[offset[k] + 1] * ff_sbc_proto_4_40m1[idx + 0] +
+              v[offset[i] + 2] * ff_sbc_proto_4_40m0[idx + 1] +
+              v[offset[k] + 3] * ff_sbc_proto_4_40m1[idx + 1] +
+              v[offset[i] + 4] * ff_sbc_proto_4_40m0[idx + 2] +
+              v[offset[k] + 5] * ff_sbc_proto_4_40m1[idx + 2] +
+              v[offset[i] + 6] * ff_sbc_proto_4_40m0[idx + 3] +
+              v[offset[k] + 7] * ff_sbc_proto_4_40m1[idx + 3] +
+              v[offset[i] + 8] * ff_sbc_proto_4_40m0[idx + 4] +
+              v[offset[k] + 9] * ff_sbc_proto_4_40m1[idx + 4] ) >> 15));
+    }
+}
+
+static inline void sbc_synthesize_eight(struct sbc_decoder_state *state,
+                                        struct sbc_frame *frame,
+                                        int ch, int blk, AVFrame *output_frame)
+{
+    int i, k, idx;
+    int32_t *v = state->V[ch];
+    int *offset = state->offset[ch];
+
+    for (i = 0; i < 16; i++) {
+        /* Shifting */
+        offset[i]--;
+        if (offset[i] < 0) {
+            offset[i] = 159;
+            memcpy(v + 160, v, 9 * sizeof(*v));
+        }
+
+        /* Distribute the new matrix value to the shifted position */
+        v[offset[i]] =
+            ( ff_synmatrix8[i][0] * frame->sb_sample[blk][ch][0] +
+              ff_synmatrix8[i][1] * frame->sb_sample[blk][ch][1] +
+              ff_synmatrix8[i][2] * frame->sb_sample[blk][ch][2] +
+              ff_synmatrix8[i][3] * frame->sb_sample[blk][ch][3] +
+              ff_synmatrix8[i][4] * frame->sb_sample[blk][ch][4] +
+              ff_synmatrix8[i][5] * frame->sb_sample[blk][ch][5] +
+              ff_synmatrix8[i][6] * frame->sb_sample[blk][ch][6] +
+              ff_synmatrix8[i][7] * frame->sb_sample[blk][ch][7] ) >> 15;
+    }
+
+    /* Compute the samples */
+    for (idx = 0, i = 0; i < 8; i++, idx += 5) {
+        k = (i + 8) & 0xf;
+
+        /* Store in output, Q0 */
+        AV_WN16A(&output_frame->data[ch][blk * 16 + i * 2], av_clip_int16(
+            ( v[offset[i] + 0] * ff_sbc_proto_8_80m0[idx + 0] +
+              v[offset[k] + 1] * ff_sbc_proto_8_80m1[idx + 0] +
+              v[offset[i] + 2] * ff_sbc_proto_8_80m0[idx + 1] +
+              v[offset[k] + 3] * ff_sbc_proto_8_80m1[idx + 1] +
+              v[offset[i] + 4] * ff_sbc_proto_8_80m0[idx + 2] +
+              v[offset[k] + 5] * ff_sbc_proto_8_80m1[idx + 2] +
+              v[offset[i] + 6] * ff_sbc_proto_8_80m0[idx + 3] +
+              v[offset[k] + 7] * ff_sbc_proto_8_80m1[idx + 3] +
+              v[offset[i] + 8] * ff_sbc_proto_8_80m0[idx + 4] +
+              v[offset[k] + 9] * ff_sbc_proto_8_80m1[idx + 4] ) >> 15));
+    }
+}
+
+static void sbc_synthesize_audio(struct sbc_decoder_state *state,
+                                 struct sbc_frame *frame, AVFrame *output_frame)
+{
+    int ch, blk;
+
+    switch (frame->subbands) {
+    case 4:
+        for (ch = 0; ch < frame->channels; ch++)
+            for (blk = 0; blk < frame->blocks; blk++)
+                sbc_synthesize_four(state, frame, ch, blk, output_frame);
+        break;
+
+    case 8:
+        for (ch = 0; ch < frame->channels; ch++)
+            for (blk = 0; blk < frame->blocks; blk++)
+                sbc_synthesize_eight(state, frame, ch, blk, output_frame);
+        break;
+    }
+}
+
+static int sbc_decode_init(AVCodecContext *avctx)
+{
+    SBCDecContext *sbc = avctx->priv_data;
+    int i, ch;
+
+    sbc->frame.crc_ctx = av_crc_get_table(AV_CRC_8_EBU);
+
+    memset(sbc->dsp.V, 0, sizeof(sbc->dsp.V));
+    for (ch = 0; ch < 2; ch++)
+        for (i = 0; i < FF_ARRAY_ELEMS(sbc->dsp.offset[0]); i++)
+            sbc->dsp.offset[ch][i] = (10 * i + 10);
+    return 0;
+}
+
+static int sbc_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_frame_ptr,
+                            AVPacket *avpkt)
+{
+    SBCDecContext *sbc = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret, frame_length;
+
+    if (!sbc)
+        return AVERROR(EIO);
+
+    frame_length = sbc_unpack_frame(avpkt->data, &sbc->frame, avpkt->size);
+    if (frame_length <= 0)
+        return frame_length;
+
+    frame->channels = sbc->frame.channels;
+    frame->format = AV_SAMPLE_FMT_S16P;
+    frame->nb_samples = sbc->frame.blocks * sbc->frame.subbands;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    sbc_synthesize_audio(&sbc->dsp, &sbc->frame, frame);
+
+    *got_frame_ptr = 1;
+
+    return frame_length;
+}
+
+AVCodec ff_sbc_decoder = {
+    .name                  = "sbc",
+    .long_name             = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband codec)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_SBC,
+    .priv_data_size        = sizeof(SBCDecContext),
+    .init                  = sbc_decode_init,
+    .decode                = sbc_decode_frame,
+    .capabilities          = AV_CODEC_CAP_DR1,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO, 0},
+    .sample_fmts           = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                             AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]) { 16000, 32000, 44100, 48000, 0 },
+};
diff --git a/libavcodec/sbcdec_data.c b/libavcodec/sbcdec_data.c
new file mode 100644
index 0000000..2152162
--- /dev/null
+++ b/libavcodec/sbcdec_data.c
@@ -0,0 +1,127 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC decoder tables
+ */
+
+#include <stdint.h>
+#include "sbcdec_data.h"
+#include "sbc.h"
+
+#define SS4(val)  ((int32_t)val >> 12)
+#define SS8(val)  ((int32_t)val >> 14)
+#define SN4(val)  ((int32_t)val >> 11 + 1 + SBCDEC_FIXED_EXTRA_BITS)
+#define SN8(val)  ((int32_t)val >> 11 + 1 + SBCDEC_FIXED_EXTRA_BITS)
+
+const int32_t ff_sbc_proto_4_40m0[] = {
+    SS4(0x00000000), SS4(0xffa6982f), SS4(0xfba93848), SS4(0x0456c7b8),
+    SS4(0x005967d1), SS4(0xfffb9ac7), SS4(0xff589157), SS4(0xf9c2a8d8),
+    SS4(0x027c1434), SS4(0x0019118b), SS4(0xfff3c74c), SS4(0xff137330),
+    SS4(0xf81b8d70), SS4(0x00ec1b8b), SS4(0xfff0b71a), SS4(0xffe99b00),
+    SS4(0xfef84470), SS4(0xf6fb4370), SS4(0xffcdc351), SS4(0xffe01dc7)
+};
+
+const int32_t ff_sbc_proto_4_40m1[] = {
+    SS4(0xffe090ce), SS4(0xff2c0475), SS4(0xf694f800), SS4(0xff2c0475),
+    SS4(0xffe090ce), SS4(0xffe01dc7), SS4(0xffcdc351), SS4(0xf6fb4370),
+    SS4(0xfef84470), SS4(0xffe99b00), SS4(0xfff0b71a), SS4(0x00ec1b8b),
+    SS4(0xf81b8d70), SS4(0xff137330), SS4(0xfff3c74c), SS4(0x0019118b),
+    SS4(0x027c1434), SS4(0xf9c2a8d8), SS4(0xff589157), SS4(0xfffb9ac7)
+};
+
+const int32_t ff_sbc_proto_8_80m0[] = {
+    SS8(0x00000000), SS8(0xfe8d1970), SS8(0xee979f00), SS8(0x11686100),
+    SS8(0x0172e690), SS8(0xfff5bd1a), SS8(0xfdf1c8d4), SS8(0xeac182c0),
+    SS8(0x0d9daee0), SS8(0x00e530da), SS8(0xffe9811d), SS8(0xfd52986c),
+    SS8(0xe7054ca0), SS8(0x0a00d410), SS8(0x006c1de4), SS8(0xffdba705),
+    SS8(0xfcbc98e8), SS8(0xe3889d20), SS8(0x06af2308), SS8(0x000bb7db),
+    SS8(0xffca00ed), SS8(0xfc3fbb68), SS8(0xe071bc00), SS8(0x03bf7948),
+    SS8(0xffc4e05c), SS8(0xffb54b3b), SS8(0xfbedadc0), SS8(0xdde26200),
+    SS8(0x0142291c), SS8(0xff960e94), SS8(0xff9f3e17), SS8(0xfbd8f358),
+    SS8(0xdbf79400), SS8(0xff405e01), SS8(0xff7d4914), SS8(0xff8b1a31),
+    SS8(0xfc1417b8), SS8(0xdac7bb40), SS8(0xfdbb828c), SS8(0xff762170)
+};
+
+const int32_t ff_sbc_proto_8_80m1[] = {
+    SS8(0xff7c272c), SS8(0xfcb02620), SS8(0xda612700), SS8(0xfcb02620),
+    SS8(0xff7c272c), SS8(0xff762170), SS8(0xfdbb828c), SS8(0xdac7bb40),
+    SS8(0xfc1417b8), SS8(0xff8b1a31), SS8(0xff7d4914), SS8(0xff405e01),
+    SS8(0xdbf79400), SS8(0xfbd8f358), SS8(0xff9f3e17), SS8(0xff960e94),
+    SS8(0x0142291c), SS8(0xdde26200), SS8(0xfbedadc0), SS8(0xffb54b3b),
+    SS8(0xffc4e05c), SS8(0x03bf7948), SS8(0xe071bc00), SS8(0xfc3fbb68),
+    SS8(0xffca00ed), SS8(0x000bb7db), SS8(0x06af2308), SS8(0xe3889d20),
+    SS8(0xfcbc98e8), SS8(0xffdba705), SS8(0x006c1de4), SS8(0x0a00d410),
+    SS8(0xe7054ca0), SS8(0xfd52986c), SS8(0xffe9811d), SS8(0x00e530da),
+    SS8(0x0d9daee0), SS8(0xeac182c0), SS8(0xfdf1c8d4), SS8(0xfff5bd1a)
+};
+
+const int32_t ff_synmatrix4[8][4] = {
+    { SN4(0x05a82798), SN4(0xfa57d868), SN4(0xfa57d868), SN4(0x05a82798) },
+    { SN4(0x030fbc54), SN4(0xf89be510), SN4(0x07641af0), SN4(0xfcf043ac) },
+    { SN4(0x00000000), SN4(0x00000000), SN4(0x00000000), SN4(0x00000000) },
+    { SN4(0xfcf043ac), SN4(0x07641af0), SN4(0xf89be510), SN4(0x030fbc54) },
+    { SN4(0xfa57d868), SN4(0x05a82798), SN4(0x05a82798), SN4(0xfa57d868) },
+    { SN4(0xf89be510), SN4(0xfcf043ac), SN4(0x030fbc54), SN4(0x07641af0) },
+    { SN4(0xf8000000), SN4(0xf8000000), SN4(0xf8000000), SN4(0xf8000000) },
+    { SN4(0xf89be510), SN4(0xfcf043ac), SN4(0x030fbc54), SN4(0x07641af0) }
+};
+
+const int32_t ff_synmatrix8[16][8] = {
+    { SN8(0x05a82798), SN8(0xfa57d868), SN8(0xfa57d868), SN8(0x05a82798),
+      SN8(0x05a82798), SN8(0xfa57d868), SN8(0xfa57d868), SN8(0x05a82798) },
+    { SN8(0x0471ced0), SN8(0xf8275a10), SN8(0x018f8b84), SN8(0x06a6d988),
+      SN8(0xf9592678), SN8(0xfe70747c), SN8(0x07d8a5f0), SN8(0xfb8e3130) },
+    { SN8(0x030fbc54), SN8(0xf89be510), SN8(0x07641af0), SN8(0xfcf043ac),
+      SN8(0xfcf043ac), SN8(0x07641af0), SN8(0xf89be510), SN8(0x030fbc54) },
+    { SN8(0x018f8b84), SN8(0xfb8e3130), SN8(0x06a6d988), SN8(0xf8275a10),
+      SN8(0x07d8a5f0), SN8(0xf9592678), SN8(0x0471ced0), SN8(0xfe70747c) },
+    { SN8(0x00000000), SN8(0x00000000), SN8(0x00000000), SN8(0x00000000),
+      SN8(0x00000000), SN8(0x00000000), SN8(0x00000000), SN8(0x00000000) },
+    { SN8(0xfe70747c), SN8(0x0471ced0), SN8(0xf9592678), SN8(0x07d8a5f0),
+      SN8(0xf8275a10), SN8(0x06a6d988), SN8(0xfb8e3130), SN8(0x018f8b84) },
+    { SN8(0xfcf043ac), SN8(0x07641af0), SN8(0xf89be510), SN8(0x030fbc54),
+      SN8(0x030fbc54), SN8(0xf89be510), SN8(0x07641af0), SN8(0xfcf043ac) },
+    { SN8(0xfb8e3130), SN8(0x07d8a5f0), SN8(0xfe70747c), SN8(0xf9592678),
+      SN8(0x06a6d988), SN8(0x018f8b84), SN8(0xf8275a10), SN8(0x0471ced0) },
+    { SN8(0xfa57d868), SN8(0x05a82798), SN8(0x05a82798), SN8(0xfa57d868),
+      SN8(0xfa57d868), SN8(0x05a82798), SN8(0x05a82798), SN8(0xfa57d868) },
+    { SN8(0xf9592678), SN8(0x018f8b84), SN8(0x07d8a5f0), SN8(0x0471ced0),
+      SN8(0xfb8e3130), SN8(0xf8275a10), SN8(0xfe70747c), SN8(0x06a6d988) },
+    { SN8(0xf89be510), SN8(0xfcf043ac), SN8(0x030fbc54), SN8(0x07641af0),
+      SN8(0x07641af0), SN8(0x030fbc54), SN8(0xfcf043ac), SN8(0xf89be510) },
+    { SN8(0xf8275a10), SN8(0xf9592678), SN8(0xfb8e3130), SN8(0xfe70747c),
+      SN8(0x018f8b84), SN8(0x0471ced0), SN8(0x06a6d988), SN8(0x07d8a5f0) },
+    { SN8(0xf8000000), SN8(0xf8000000), SN8(0xf8000000), SN8(0xf8000000),
+      SN8(0xf8000000), SN8(0xf8000000), SN8(0xf8000000), SN8(0xf8000000) },
+    { SN8(0xf8275a10), SN8(0xf9592678), SN8(0xfb8e3130), SN8(0xfe70747c),
+      SN8(0x018f8b84), SN8(0x0471ced0), SN8(0x06a6d988), SN8(0x07d8a5f0) },
+    { SN8(0xf89be510), SN8(0xfcf043ac), SN8(0x030fbc54), SN8(0x07641af0),
+      SN8(0x07641af0), SN8(0x030fbc54), SN8(0xfcf043ac), SN8(0xf89be510) },
+    { SN8(0xf9592678), SN8(0x018f8b84), SN8(0x07d8a5f0), SN8(0x0471ced0),
+      SN8(0xfb8e3130), SN8(0xf8275a10), SN8(0xfe70747c), SN8(0x06a6d988) }
+};
diff --git a/libavcodec/sbcdec_data.h b/libavcodec/sbcdec_data.h
new file mode 100644
index 0000000..1b79d1d
--- /dev/null
+++ b/libavcodec/sbcdec_data.h
@@ -0,0 +1,44 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC decoder tables
+ */
+
+#ifndef AVCODEC_SBCDEC_DATA_H
+#define AVCODEC_SBCDEC_DATA_H
+
+#include <stdint.h>
+
+extern const int32_t ff_sbc_proto_4_40m0[];
+extern const int32_t ff_sbc_proto_4_40m1[];
+extern const int32_t ff_sbc_proto_8_80m0[];
+extern const int32_t ff_sbc_proto_8_80m1[];
+extern const int32_t ff_synmatrix4[8][4];
+extern const int32_t ff_synmatrix8[16][8];
+
+#endif /* AVCODEC_SBCDEC_DATA_H */
diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
new file mode 100644
index 0000000..e745595
--- /dev/null
+++ b/libavcodec/sbcdsp.c
@@ -0,0 +1,387 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2012-2013  Intel Corporation
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC basic "building bricks"
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include "libavutil/common.h"
+#include "libavutil/intmath.h"
+#include "libavutil/intreadwrite.h"
+#include "sbc.h"
+#include "sbcdsp.h"
+#include "sbcdsp_data.h"
+
+/*
+ * A reference C code of analysis filter with SIMD-friendly tables
+ * reordering and code layout. This code can be used to develop platform
+ * specific SIMD optimizations. Also it may be used as some kind of test
+ * for compiler autovectorization capabilities (who knows, if the compiler
+ * is very good at this stuff, hand optimized assembly may be not strictly
+ * needed for some platform).
+ *
+ * Note: It is also possible to make a simple variant of analysis filter,
+ * which needs only a single constants table without taking care about
+ * even/odd cases. This simple variant of filter can be implemented without
+ * input data permutation. The only thing that would be lost is the
+ * possibility to use pairwise SIMD multiplications. But for some simple
+ * CPU cores without SIMD extensions it can be useful. If anybody is
+ * interested in implementing such variant of a filter, sourcecode from
+ * bluez versions 4.26/4.27 can be used as a reference and the history of
+ * the changes in git repository done around that time may be worth checking.
+ */
+
+static av_always_inline void sbc_analyze_simd(const int16_t *in, int32_t *out,
+                                              const int16_t *consts,
+                                              unsigned subbands)
+{
+    int32_t t1[8];
+    int16_t t2[8];
+    int i, j, hop = 0;
+
+    /* rounding coefficient */
+    for (i = 0; i < subbands; i++)
+        t1[i] = 1 << (SBC_PROTO_FIXED_SCALE - 1);
+
+    /* low pass polyphase filter */
+    for (hop = 0; hop < 10*subbands; hop += 2*subbands)
+        for (i = 0; i < 2*subbands; i++)
+            t1[i >> 1] += in[hop + i] * consts[hop + i];
+
+    /* scaling */
+    for (i = 0; i < subbands; i++)
+        t2[i] = t1[i] >> SBC_PROTO_FIXED_SCALE;
+
+    memset(t1, 0, sizeof(t1));
+
+    /* do the cos transform */
+    for (i = 0; i < subbands/2; i++)
+        for (j = 0; j < 2*subbands; j++)
+            t1[j>>1] += t2[i * 2 + (j&1)] * consts[10*subbands + i*2*subbands + j];
+
+    for (i = 0; i < subbands; i++)
+        out[i] = t1[i] >> (SBC_COS_TABLE_FIXED_SCALE - SCALE_OUT_BITS);
+}
+
+static void sbc_analyze_4_simd(const int16_t *in, int32_t *out,
+                               const int16_t *consts)
+{
+    sbc_analyze_simd(in, out, consts, 4);
+}
+
+static void sbc_analyze_8_simd(const int16_t *in, int32_t *out,
+                               const int16_t *consts)
+{
+    sbc_analyze_simd(in, out, consts, 8);
+}
+
+static inline void sbc_analyze_4b_4s_simd(SBCDSPContext *s,
+                                          int16_t *x, int32_t *out, int out_stride)
+{
+    /* Analyze blocks */
+    s->sbc_analyze_4(x + 12, out, ff_sbcdsp_analysis_consts_fixed4_simd_odd);
+    out += out_stride;
+    s->sbc_analyze_4(x + 8, out, ff_sbcdsp_analysis_consts_fixed4_simd_even);
+    out += out_stride;
+    s->sbc_analyze_4(x + 4, out, ff_sbcdsp_analysis_consts_fixed4_simd_odd);
+    out += out_stride;
+    s->sbc_analyze_4(x + 0, out, ff_sbcdsp_analysis_consts_fixed4_simd_even);
+}
+
+static inline void sbc_analyze_4b_8s_simd(SBCDSPContext *s,
+                                          int16_t *x, int32_t *out, int out_stride)
+{
+    /* Analyze blocks */
+    s->sbc_analyze_8(x + 24, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
+    out += out_stride;
+    s->sbc_analyze_8(x + 16, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
+    out += out_stride;
+    s->sbc_analyze_8(x + 8, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
+    out += out_stride;
+    s->sbc_analyze_8(x + 0, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
+}
+
+static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
+                                               int16_t *x, int32_t *out,
+                                               int out_stride);
+
+static inline void sbc_analyze_1b_8s_simd_odd(SBCDSPContext *s,
+                                              int16_t *x, int32_t *out,
+                                              int out_stride)
+{
+    s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
+    s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_even;
+}
+
+static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
+                                               int16_t *x, int32_t *out,
+                                               int out_stride)
+{
+    s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
+    s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd;
+}
+
+/*
+ * Input data processing functions. The data is endian converted if needed,
+ * channels are deintrleaved and audio samples are reordered for use in
+ * SIMD-friendly analysis filter function. The results are put into "X"
+ * array, getting appended to the previous data (or it is better to say
+ * prepended, as the buffer is filled from top to bottom). Old data is
+ * discarded when neededed, but availability of (10 * nrof_subbands)
+ * contiguous samples is always guaranteed for the input to the analysis
+ * filter. This is achieved by copying a sufficient part of old data
+ * to the top of the buffer on buffer wraparound.
+ */
+
+static int sbc_enc_process_input_4s(int position, const uint8_t *pcm,
+                                    int16_t X[2][SBC_X_BUFFER_SIZE],
+                                    int nsamples, int nchannels)
+{
+    int c;
+
+    /* handle X buffer wraparound */
+    if (position < nsamples) {
+        for (c = 0; c < nchannels; c++)
+            memcpy(&X[c][SBC_X_BUFFER_SIZE - 40], &X[c][position],
+                            36 * sizeof(int16_t));
+        position = SBC_X_BUFFER_SIZE - 40;
+    }
+
+    /* copy/permutate audio samples */
+    for (; nsamples >= 8; nsamples -= 8, pcm += 16 * nchannels) {
+        position -= 8;
+        for (c = 0; c < nchannels; c++) {
+            int16_t *x = &X[c][position];
+            x[0] = AV_RN16(pcm + 14*nchannels + 2*c);
+            x[1] = AV_RN16(pcm +  6*nchannels + 2*c);
+            x[2] = AV_RN16(pcm + 12*nchannels + 2*c);
+            x[3] = AV_RN16(pcm +  8*nchannels + 2*c);
+            x[4] = AV_RN16(pcm +  0*nchannels + 2*c);
+            x[5] = AV_RN16(pcm +  4*nchannels + 2*c);
+            x[6] = AV_RN16(pcm +  2*nchannels + 2*c);
+            x[7] = AV_RN16(pcm + 10*nchannels + 2*c);
+        }
+    }
+
+    return position;
+}
+
+static int sbc_enc_process_input_8s(int position, const uint8_t *pcm,
+                                    int16_t X[2][SBC_X_BUFFER_SIZE],
+                                    int nsamples, int nchannels)
+{
+    int c;
+
+    /* handle X buffer wraparound */
+    if (position < nsamples) {
+        for (c = 0; c < nchannels; c++)
+            memcpy(&X[c][SBC_X_BUFFER_SIZE - 72], &X[c][position],
+                            72 * sizeof(int16_t));
+        position = SBC_X_BUFFER_SIZE - 72;
+    }
+
+    if (position % 16 == 8) {
+        position -= 8;
+        nsamples -= 8;
+        for (c = 0; c < nchannels; c++) {
+            int16_t *x = &X[c][position];
+            x[0] = AV_RN16(pcm + 14*nchannels + 2*c);
+            x[2] = AV_RN16(pcm + 12*nchannels + 2*c);
+            x[3] = AV_RN16(pcm +  0*nchannels + 2*c);
+            x[4] = AV_RN16(pcm + 10*nchannels + 2*c);
+            x[5] = AV_RN16(pcm +  2*nchannels + 2*c);
+            x[6] = AV_RN16(pcm +  8*nchannels + 2*c);
+            x[7] = AV_RN16(pcm +  4*nchannels + 2*c);
+            x[8] = AV_RN16(pcm +  6*nchannels + 2*c);
+        }
+        pcm += 16 * nchannels;
+    }
+
+    /* copy/permutate audio samples */
+    for (; nsamples >= 16; nsamples -= 16, pcm += 32 * nchannels) {
+        position -= 16;
+        for (c = 0; c < nchannels; c++) {
+            int16_t *x = &X[c][position];
+            x[0]  = AV_RN16(pcm + 30*nchannels + 2*c);
+            x[1]  = AV_RN16(pcm + 14*nchannels + 2*c);
+            x[2]  = AV_RN16(pcm + 28*nchannels + 2*c);
+            x[3]  = AV_RN16(pcm + 16*nchannels + 2*c);
+            x[4]  = AV_RN16(pcm + 26*nchannels + 2*c);
+            x[5]  = AV_RN16(pcm + 18*nchannels + 2*c);
+            x[6]  = AV_RN16(pcm + 24*nchannels + 2*c);
+            x[7]  = AV_RN16(pcm + 20*nchannels + 2*c);
+            x[8]  = AV_RN16(pcm + 22*nchannels + 2*c);
+            x[9]  = AV_RN16(pcm +  6*nchannels + 2*c);
+            x[10] = AV_RN16(pcm + 12*nchannels + 2*c);
+            x[11] = AV_RN16(pcm +  0*nchannels + 2*c);
+            x[12] = AV_RN16(pcm + 10*nchannels + 2*c);
+            x[13] = AV_RN16(pcm +  2*nchannels + 2*c);
+            x[14] = AV_RN16(pcm +  8*nchannels + 2*c);
+            x[15] = AV_RN16(pcm +  4*nchannels + 2*c);
+        }
+    }
+
+    if (nsamples == 8) {
+        position -= 8;
+        for (c = 0; c < nchannels; c++) {
+            int16_t *x = &X[c][position];
+            x[-7] = AV_RN16(pcm + 14*nchannels + 2*c);
+            x[1]  = AV_RN16(pcm +  6*nchannels + 2*c);
+            x[2]  = AV_RN16(pcm + 12*nchannels + 2*c);
+            x[3]  = AV_RN16(pcm +  0*nchannels + 2*c);
+            x[4]  = AV_RN16(pcm + 10*nchannels + 2*c);
+            x[5]  = AV_RN16(pcm +  2*nchannels + 2*c);
+            x[6]  = AV_RN16(pcm +  8*nchannels + 2*c);
+            x[7]  = AV_RN16(pcm +  4*nchannels + 2*c);
+        }
+    }
+
+    return position;
+}
+
+static void sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
+                                  uint32_t scale_factor[2][8],
+                                  int blocks, int channels, int subbands)
+{
+    int ch, sb, blk;
+    for (ch = 0; ch < channels; ch++) {
+        for (sb = 0; sb < subbands; sb++) {
+            uint32_t x = 1 << SCALE_OUT_BITS;
+            for (blk = 0; blk < blocks; blk++) {
+                int32_t tmp = FFABS(sb_sample_f[blk][ch][sb]);
+                if (tmp != 0)
+                    x |= tmp - 1;
+            }
+            scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - ff_clz(x);
+        }
+    }
+}
+
+static int sbc_calc_scalefactors_j(int32_t sb_sample_f[16][2][8],
+                                   uint32_t scale_factor[2][8],
+                                   int blocks, int subbands)
+{
+    int blk, joint = 0;
+    int32_t tmp0, tmp1;
+    uint32_t x, y;
+
+    /* last subband does not use joint stereo */
+    int sb = subbands - 1;
+    x = 1 << SCALE_OUT_BITS;
+    y = 1 << SCALE_OUT_BITS;
+    for (blk = 0; blk < blocks; blk++) {
+        tmp0 = FFABS(sb_sample_f[blk][0][sb]);
+        tmp1 = FFABS(sb_sample_f[blk][1][sb]);
+        if (tmp0 != 0)
+            x |= tmp0 - 1;
+        if (tmp1 != 0)
+            y |= tmp1 - 1;
+    }
+    scale_factor[0][sb] = (31 - SCALE_OUT_BITS) - ff_clz(x);
+    scale_factor[1][sb] = (31 - SCALE_OUT_BITS) - ff_clz(y);
+
+    /* the rest of subbands can use joint stereo */
+    while (--sb >= 0) {
+        int32_t sb_sample_j[16][2];
+        x = 1 << SCALE_OUT_BITS;
+        y = 1 << SCALE_OUT_BITS;
+        for (blk = 0; blk < blocks; blk++) {
+            tmp0 = sb_sample_f[blk][0][sb];
+            tmp1 = sb_sample_f[blk][1][sb];
+            sb_sample_j[blk][0] = (tmp0 >> 1) + (tmp1 >> 1);
+            sb_sample_j[blk][1] = (tmp0 >> 1) - (tmp1 >> 1);
+            tmp0 = FFABS(tmp0);
+            tmp1 = FFABS(tmp1);
+            if (tmp0 != 0)
+                x |= tmp0 - 1;
+            if (tmp1 != 0)
+                y |= tmp1 - 1;
+        }
+        scale_factor[0][sb] = (31 - SCALE_OUT_BITS) -
+            ff_clz(x);
+        scale_factor[1][sb] = (31 - SCALE_OUT_BITS) -
+            ff_clz(y);
+        x = 1 << SCALE_OUT_BITS;
+        y = 1 << SCALE_OUT_BITS;
+        for (blk = 0; blk < blocks; blk++) {
+            tmp0 = FFABS(sb_sample_j[blk][0]);
+            tmp1 = FFABS(sb_sample_j[blk][1]);
+            if (tmp0 != 0)
+                x |= tmp0 - 1;
+            if (tmp1 != 0)
+                y |= tmp1 - 1;
+        }
+        x = (31 - SCALE_OUT_BITS) - ff_clz(x);
+        y = (31 - SCALE_OUT_BITS) - ff_clz(y);
+
+        /* decide whether to use joint stereo for this subband */
+        if ((scale_factor[0][sb] + scale_factor[1][sb]) > x + y) {
+            joint |= 1 << (subbands - 1 - sb);
+            scale_factor[0][sb] = x;
+            scale_factor[1][sb] = y;
+            for (blk = 0; blk < blocks; blk++) {
+                sb_sample_f[blk][0][sb] = sb_sample_j[blk][0];
+                sb_sample_f[blk][1][sb] = sb_sample_j[blk][1];
+            }
+        }
+    }
+
+    /* bitmask with the information about subbands using joint stereo */
+    return joint;
+}
+
+/*
+ * Detect CPU features and setup function pointers
+ */
+av_cold void ff_sbcdsp_init(SBCDSPContext *s)
+{
+    /* Default implementation for analyze functions */
+    s->sbc_analyze_4 = sbc_analyze_4_simd;
+    s->sbc_analyze_8 = sbc_analyze_8_simd;
+    s->sbc_analyze_4s = sbc_analyze_4b_4s_simd;
+    if (s->increment == 1)
+        s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd;
+    else
+        s->sbc_analyze_8s = sbc_analyze_4b_8s_simd;
+
+    /* Default implementation for input reordering / deinterleaving */
+    s->sbc_enc_process_input_4s = sbc_enc_process_input_4s;
+    s->sbc_enc_process_input_8s = sbc_enc_process_input_8s;
+
+    /* Default implementation for scale factors calculation */
+    s->sbc_calc_scalefactors = sbc_calc_scalefactors;
+    s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
+
+    if (ARCH_ARM)
+        ff_sbcdsp_init_arm(s);
+    if (ARCH_X86)
+        ff_sbcdsp_init_x86(s);
+}
diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
new file mode 100644
index 0000000..334c058
--- /dev/null
+++ b/libavcodec/sbcdsp.h
@@ -0,0 +1,86 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC basic "building bricks"
+ */
+
+#ifndef AVCODEC_SBCDSP_H
+#define AVCODEC_SBCDSP_H
+
+#include "sbc.h"
+#include "sbcdsp_data.h"
+
+#define SCALE_OUT_BITS 15
+#define SBC_X_BUFFER_SIZE 328
+
+typedef struct sbc_dsp_context SBCDSPContext;
+
+struct sbc_dsp_context {
+    int position;
+    /* Number of consecutive blocks handled by the encoder */
+    uint8_t increment;
+    DECLARE_ALIGNED(SBC_ALIGN, int16_t, X)[2][SBC_X_BUFFER_SIZE];
+    void (*sbc_analyze_4)(const int16_t *in, int32_t *out, const int16_t *consts);
+    void (*sbc_analyze_8)(const int16_t *in, int32_t *out, const int16_t *consts);
+    /* Polyphase analysis filter for 4 subbands configuration,
+     * it handles "increment" blocks at once */
+    void (*sbc_analyze_4s)(SBCDSPContext *s,
+                           int16_t *x, int32_t *out, int out_stride);
+    /* Polyphase analysis filter for 8 subbands configuration,
+     * it handles "increment" blocks at once */
+    void (*sbc_analyze_8s)(SBCDSPContext *s,
+                           int16_t *x, int32_t *out, int out_stride);
+    /* Process input data (deinterleave, endian conversion, reordering),
+     * depending on the number of subbands and input data byte order */
+    int (*sbc_enc_process_input_4s)(int position, const uint8_t *pcm,
+                                    int16_t X[2][SBC_X_BUFFER_SIZE],
+                                    int nsamples, int nchannels);
+    int (*sbc_enc_process_input_8s)(int position, const uint8_t *pcm,
+                                    int16_t X[2][SBC_X_BUFFER_SIZE],
+                                    int nsamples, int nchannels);
+    /* Scale factors calculation */
+    void (*sbc_calc_scalefactors)(int32_t sb_sample_f[16][2][8],
+                                  uint32_t scale_factor[2][8],
+                                  int blocks, int channels, int subbands);
+    /* Scale factors calculation with joint stereo support */
+    int (*sbc_calc_scalefactors_j)(int32_t sb_sample_f[16][2][8],
+                                   uint32_t scale_factor[2][8],
+                                   int blocks, int subbands);
+};
+
+/*
+ * Initialize pointers to the functions which are the basic "building bricks"
+ * of SBC codec. Best implementation is selected based on target CPU
+ * capabilities.
+ */
+void ff_sbcdsp_init(SBCDSPContext *s);
+
+void ff_sbcdsp_init_arm(SBCDSPContext *s);
+void ff_sbcdsp_init_x86(SBCDSPContext *s);
+
+#endif /* AVCODEC_SBCDSP_H */
diff --git a/libavcodec/sbcdsp_data.c b/libavcodec/sbcdsp_data.c
new file mode 100644
index 0000000..78c07c0
--- /dev/null
+++ b/libavcodec/sbcdsp_data.c
@@ -0,0 +1,329 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * miscellaneous SBC tables
+ */
+
+#include "sbcdsp_data.h"
+
+#define F_PROTO(x) ((int32_t) (((x) * 2) * ((int32_t) 1 << 15) + 0.5))
+#define F_COS(x)   ((int32_t) (((x)    ) * ((int32_t) 1 << 15) + 0.5))
+
+/*
+ * Constant tables for the use in SIMD optimized analysis filters
+ * Each table consists of two parts:
+ * 1. reordered "proto" table
+ * 2. reordered "cos" table
+ *
+ * Due to non-symmetrical reordering, separate tables for "even"
+ * and "odd" cases are needed
+ */
+
+DECLARE_ALIGNED(SBC_ALIGN, const int16_t, ff_sbcdsp_analysis_consts_fixed4_simd_even)[40 + 16] = {
+#define C0 1.0932568993
+#define C1 1.3056875580
+#define C2 1.3056875580
+#define C3 1.6772280856
+
+#define F(x) F_PROTO(x)
+     F(0.00000000E+00 * C0),  F(3.83720193E-03 * C0),
+     F(5.36548976E-04 * C1),  F(2.73370904E-03 * C1),
+     F(3.06012286E-03 * C2),  F(3.89205149E-03 * C2),
+     F(0.00000000E+00 * C3), -F(1.49188357E-03 * C3),
+     F(1.09137620E-02 * C0),  F(2.58767811E-02 * C0),
+     F(2.04385087E-02 * C1),  F(3.21939290E-02 * C1),
+     F(7.76463494E-02 * C2),  F(6.13245186E-03 * C2),
+     F(0.00000000E+00 * C3), -F(2.88757392E-02 * C3),
+     F(1.35593274E-01 * C0),  F(2.94315332E-01 * C0),
+     F(1.94987841E-01 * C1),  F(2.81828203E-01 * C1),
+    -F(1.94987841E-01 * C2),  F(2.81828203E-01 * C2),
+     F(0.00000000E+00 * C3), -F(2.46636662E-01 * C3),
+    -F(1.35593274E-01 * C0),  F(2.58767811E-02 * C0),
+    -F(7.76463494E-02 * C1),  F(6.13245186E-03 * C1),
+    -F(2.04385087E-02 * C2),  F(3.21939290E-02 * C2),
+     F(0.00000000E+00 * C3),  F(2.88217274E-02 * C3),
+    -F(1.09137620E-02 * C0),  F(3.83720193E-03 * C0),
+    -F(3.06012286E-03 * C1),  F(3.89205149E-03 * C1),
+    -F(5.36548976E-04 * C2),  F(2.73370904E-03 * C2),
+     F(0.00000000E+00 * C3), -F(1.86581691E-03 * C3),
+#undef F
+#define F(x) F_COS(x)
+     F(0.7071067812 / C0),  F(0.9238795325 / C1),
+    -F(0.7071067812 / C0),  F(0.3826834324 / C1),
+    -F(0.7071067812 / C0), -F(0.3826834324 / C1),
+     F(0.7071067812 / C0), -F(0.9238795325 / C1),
+     F(0.3826834324 / C2), -F(1.0000000000 / C3),
+    -F(0.9238795325 / C2), -F(1.0000000000 / C3),
+     F(0.9238795325 / C2), -F(1.0000000000 / C3),
+    -F(0.3826834324 / C2), -F(1.0000000000 / C3),
+#undef F
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+};
+
+DECLARE_ALIGNED(SBC_ALIGN, const int16_t, ff_sbcdsp_analysis_consts_fixed4_simd_odd)[40 + 16] = {
+#define C0 1.3056875580
+#define C1 1.6772280856
+#define C2 1.0932568993
+#define C3 1.3056875580
+
+#define F(x) F_PROTO(x)
+     F(2.73370904E-03 * C0),  F(5.36548976E-04 * C0),
+    -F(1.49188357E-03 * C1),  F(0.00000000E+00 * C1),
+     F(3.83720193E-03 * C2),  F(1.09137620E-02 * C2),
+     F(3.89205149E-03 * C3),  F(3.06012286E-03 * C3),
+     F(3.21939290E-02 * C0),  F(2.04385087E-02 * C0),
+    -F(2.88757392E-02 * C1),  F(0.00000000E+00 * C1),
+     F(2.58767811E-02 * C2),  F(1.35593274E-01 * C2),
+     F(6.13245186E-03 * C3),  F(7.76463494E-02 * C3),
+     F(2.81828203E-01 * C0),  F(1.94987841E-01 * C0),
+    -F(2.46636662E-01 * C1),  F(0.00000000E+00 * C1),
+     F(2.94315332E-01 * C2), -F(1.35593274E-01 * C2),
+     F(2.81828203E-01 * C3), -F(1.94987841E-01 * C3),
+     F(6.13245186E-03 * C0), -F(7.76463494E-02 * C0),
+     F(2.88217274E-02 * C1),  F(0.00000000E+00 * C1),
+     F(2.58767811E-02 * C2), -F(1.09137620E-02 * C2),
+     F(3.21939290E-02 * C3), -F(2.04385087E-02 * C3),
+     F(3.89205149E-03 * C0), -F(3.06012286E-03 * C0),
+    -F(1.86581691E-03 * C1),  F(0.00000000E+00 * C1),
+     F(3.83720193E-03 * C2),  F(0.00000000E+00 * C2),
+     F(2.73370904E-03 * C3), -F(5.36548976E-04 * C3),
+#undef F
+#define F(x) F_COS(x)
+     F(0.9238795325 / C0), -F(1.0000000000 / C1),
+     F(0.3826834324 / C0), -F(1.0000000000 / C1),
+    -F(0.3826834324 / C0), -F(1.0000000000 / C1),
+    -F(0.9238795325 / C0), -F(1.0000000000 / C1),
+     F(0.7071067812 / C2),  F(0.3826834324 / C3),
+    -F(0.7071067812 / C2), -F(0.9238795325 / C3),
+    -F(0.7071067812 / C2),  F(0.9238795325 / C3),
+     F(0.7071067812 / C2), -F(0.3826834324 / C3),
+#undef F
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+};
+
+DECLARE_ALIGNED(SBC_ALIGN, const int16_t, ff_sbcdsp_analysis_consts_fixed8_simd_even)[80 + 64] = {
+#define C0 2.7906148894
+#define C1 2.4270044280
+#define C2 2.8015616024
+#define C3 3.1710363741
+#define C4 2.5377944043
+#define C5 2.4270044280
+#define C6 2.8015616024
+#define C7 3.1710363741
+
+#define F(x) F_PROTO(x)
+     F(0.00000000E+00 * C0),  F(2.01182542E-03 * C0),
+     F(1.56575398E-04 * C1),  F(1.78371725E-03 * C1),
+     F(3.43256425E-04 * C2),  F(1.47640169E-03 * C2),
+     F(5.54620202E-04 * C3),  F(1.13992507E-03 * C3),
+    -F(8.23919506E-04 * C4),  F(0.00000000E+00 * C4),
+     F(2.10371989E-03 * C5),  F(3.49717454E-03 * C5),
+     F(1.99454554E-03 * C6),  F(1.64973098E-03 * C6),
+     F(1.61656283E-03 * C7),  F(1.78805361E-04 * C7),
+     F(5.65949473E-03 * C0),  F(1.29371806E-02 * C0),
+     F(8.02941163E-03 * C1),  F(1.53184106E-02 * C1),
+     F(1.04584443E-02 * C2),  F(1.62208471E-02 * C2),
+     F(1.27472335E-02 * C3),  F(1.59045603E-02 * C3),
+    -F(1.46525263E-02 * C4),  F(0.00000000E+00 * C4),
+     F(8.85757540E-03 * C5),  F(5.31873032E-02 * C5),
+     F(2.92408442E-03 * C6),  F(3.90751381E-02 * C6),
+    -F(4.91578024E-03 * C7),  F(2.61098752E-02 * C7),
+     F(6.79989431E-02 * C0),  F(1.46955068E-01 * C0),
+     F(8.29847578E-02 * C1),  F(1.45389847E-01 * C1),
+     F(9.75753918E-02 * C2),  F(1.40753505E-01 * C2),
+     F(1.11196689E-01 * C3),  F(1.33264415E-01 * C3),
+    -F(1.23264548E-01 * C4),  F(0.00000000E+00 * C4),
+     F(1.45389847E-01 * C5), -F(8.29847578E-02 * C5),
+     F(1.40753505E-01 * C6), -F(9.75753918E-02 * C6),
+     F(1.33264415E-01 * C7), -F(1.11196689E-01 * C7),
+    -F(6.79989431E-02 * C0),  F(1.29371806E-02 * C0),
+    -F(5.31873032E-02 * C1),  F(8.85757540E-03 * C1),
+    -F(3.90751381E-02 * C2),  F(2.92408442E-03 * C2),
+    -F(2.61098752E-02 * C3), -F(4.91578024E-03 * C3),
+     F(1.46404076E-02 * C4),  F(0.00000000E+00 * C4),
+     F(1.53184106E-02 * C5), -F(8.02941163E-03 * C5),
+     F(1.62208471E-02 * C6), -F(1.04584443E-02 * C6),
+     F(1.59045603E-02 * C7), -F(1.27472335E-02 * C7),
+    -F(5.65949473E-03 * C0),  F(2.01182542E-03 * C0),
+    -F(3.49717454E-03 * C1),  F(2.10371989E-03 * C1),
+    -F(1.64973098E-03 * C2),  F(1.99454554E-03 * C2),
+    -F(1.78805361E-04 * C3),  F(1.61656283E-03 * C3),
+    -F(9.02154502E-04 * C4),  F(0.00000000E+00 * C4),
+     F(1.78371725E-03 * C5), -F(1.56575398E-04 * C5),
+     F(1.47640169E-03 * C6), -F(3.43256425E-04 * C6),
+     F(1.13992507E-03 * C7), -F(5.54620202E-04 * C7),
+#undef F
+#define F(x) F_COS(x)
+     F(0.7071067812 / C0),  F(0.8314696123 / C1),
+    -F(0.7071067812 / C0), -F(0.1950903220 / C1),
+    -F(0.7071067812 / C0), -F(0.9807852804 / C1),
+     F(0.7071067812 / C0), -F(0.5555702330 / C1),
+     F(0.7071067812 / C0),  F(0.5555702330 / C1),
+    -F(0.7071067812 / C0),  F(0.9807852804 / C1),
+    -F(0.7071067812 / C0),  F(0.1950903220 / C1),
+     F(0.7071067812 / C0), -F(0.8314696123 / C1),
+     F(0.9238795325 / C2),  F(0.9807852804 / C3),
+     F(0.3826834324 / C2),  F(0.8314696123 / C3),
+    -F(0.3826834324 / C2),  F(0.5555702330 / C3),
+    -F(0.9238795325 / C2),  F(0.1950903220 / C3),
+    -F(0.9238795325 / C2), -F(0.1950903220 / C3),
+    -F(0.3826834324 / C2), -F(0.5555702330 / C3),
+     F(0.3826834324 / C2), -F(0.8314696123 / C3),
+     F(0.9238795325 / C2), -F(0.9807852804 / C3),
+    -F(1.0000000000 / C4),  F(0.5555702330 / C5),
+    -F(1.0000000000 / C4), -F(0.9807852804 / C5),
+    -F(1.0000000000 / C4),  F(0.1950903220 / C5),
+    -F(1.0000000000 / C4),  F(0.8314696123 / C5),
+    -F(1.0000000000 / C4), -F(0.8314696123 / C5),
+    -F(1.0000000000 / C4), -F(0.1950903220 / C5),
+    -F(1.0000000000 / C4),  F(0.9807852804 / C5),
+    -F(1.0000000000 / C4), -F(0.5555702330 / C5),
+     F(0.3826834324 / C6),  F(0.1950903220 / C7),
+    -F(0.9238795325 / C6), -F(0.5555702330 / C7),
+     F(0.9238795325 / C6),  F(0.8314696123 / C7),
+    -F(0.3826834324 / C6), -F(0.9807852804 / C7),
+    -F(0.3826834324 / C6),  F(0.9807852804 / C7),
+     F(0.9238795325 / C6), -F(0.8314696123 / C7),
+    -F(0.9238795325 / C6),  F(0.5555702330 / C7),
+     F(0.3826834324 / C6), -F(0.1950903220 / C7),
+#undef F
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+};
+
+DECLARE_ALIGNED(SBC_ALIGN, const int16_t, ff_sbcdsp_analysis_consts_fixed8_simd_odd)[80 + 64] = {
+#define C0 2.5377944043
+#define C1 2.4270044280
+#define C2 2.8015616024
+#define C3 3.1710363741
+#define C4 2.7906148894
+#define C5 2.4270044280
+#define C6 2.8015616024
+#define C7 3.1710363741
+
+#define F(x) F_PROTO(x)
+     F(0.00000000E+00 * C0), -F(8.23919506E-04 * C0),
+     F(1.56575398E-04 * C1),  F(1.78371725E-03 * C1),
+     F(3.43256425E-04 * C2),  F(1.47640169E-03 * C2),
+     F(5.54620202E-04 * C3),  F(1.13992507E-03 * C3),
+     F(2.01182542E-03 * C4),  F(5.65949473E-03 * C4),
+     F(2.10371989E-03 * C5),  F(3.49717454E-03 * C5),
+     F(1.99454554E-03 * C6),  F(1.64973098E-03 * C6),
+     F(1.61656283E-03 * C7),  F(1.78805361E-04 * C7),
+     F(0.00000000E+00 * C0), -F(1.46525263E-02 * C0),
+     F(8.02941163E-03 * C1),  F(1.53184106E-02 * C1),
+     F(1.04584443E-02 * C2),  F(1.62208471E-02 * C2),
+     F(1.27472335E-02 * C3),  F(1.59045603E-02 * C3),
+     F(1.29371806E-02 * C4),  F(6.79989431E-02 * C4),
+     F(8.85757540E-03 * C5),  F(5.31873032E-02 * C5),
+     F(2.92408442E-03 * C6),  F(3.90751381E-02 * C6),
+    -F(4.91578024E-03 * C7),  F(2.61098752E-02 * C7),
+     F(0.00000000E+00 * C0), -F(1.23264548E-01 * C0),
+     F(8.29847578E-02 * C1),  F(1.45389847E-01 * C1),
+     F(9.75753918E-02 * C2),  F(1.40753505E-01 * C2),
+     F(1.11196689E-01 * C3),  F(1.33264415E-01 * C3),
+     F(1.46955068E-01 * C4), -F(6.79989431E-02 * C4),
+     F(1.45389847E-01 * C5), -F(8.29847578E-02 * C5),
+     F(1.40753505E-01 * C6), -F(9.75753918E-02 * C6),
+     F(1.33264415E-01 * C7), -F(1.11196689E-01 * C7),
+     F(0.00000000E+00 * C0),  F(1.46404076E-02 * C0),
+    -F(5.31873032E-02 * C1),  F(8.85757540E-03 * C1),
+    -F(3.90751381E-02 * C2),  F(2.92408442E-03 * C2),
+    -F(2.61098752E-02 * C3), -F(4.91578024E-03 * C3),
+     F(1.29371806E-02 * C4), -F(5.65949473E-03 * C4),
+     F(1.53184106E-02 * C5), -F(8.02941163E-03 * C5),
+     F(1.62208471E-02 * C6), -F(1.04584443E-02 * C6),
+     F(1.59045603E-02 * C7), -F(1.27472335E-02 * C7),
+     F(0.00000000E+00 * C0), -F(9.02154502E-04 * C0),
+    -F(3.49717454E-03 * C1),  F(2.10371989E-03 * C1),
+    -F(1.64973098E-03 * C2),  F(1.99454554E-03 * C2),
+    -F(1.78805361E-04 * C3),  F(1.61656283E-03 * C3),
+     F(2.01182542E-03 * C4),  F(0.00000000E+00 * C4),
+     F(1.78371725E-03 * C5), -F(1.56575398E-04 * C5),
+     F(1.47640169E-03 * C6), -F(3.43256425E-04 * C6),
+     F(1.13992507E-03 * C7), -F(5.54620202E-04 * C7),
+#undef F
+#define F(x) F_COS(x)
+    -F(1.0000000000 / C0),  F(0.8314696123 / C1),
+    -F(1.0000000000 / C0), -F(0.1950903220 / C1),
+    -F(1.0000000000 / C0), -F(0.9807852804 / C1),
+    -F(1.0000000000 / C0), -F(0.5555702330 / C1),
+    -F(1.0000000000 / C0),  F(0.5555702330 / C1),
+    -F(1.0000000000 / C0),  F(0.9807852804 / C1),
+    -F(1.0000000000 / C0),  F(0.1950903220 / C1),
+    -F(1.0000000000 / C0), -F(0.8314696123 / C1),
+     F(0.9238795325 / C2),  F(0.9807852804 / C3),
+     F(0.3826834324 / C2),  F(0.8314696123 / C3),
+    -F(0.3826834324 / C2),  F(0.5555702330 / C3),
+    -F(0.9238795325 / C2),  F(0.1950903220 / C3),
+    -F(0.9238795325 / C2), -F(0.1950903220 / C3),
+    -F(0.3826834324 / C2), -F(0.5555702330 / C3),
+     F(0.3826834324 / C2), -F(0.8314696123 / C3),
+     F(0.9238795325 / C2), -F(0.9807852804 / C3),
+     F(0.7071067812 / C4),  F(0.5555702330 / C5),
+    -F(0.7071067812 / C4), -F(0.9807852804 / C5),
+    -F(0.7071067812 / C4),  F(0.1950903220 / C5),
+     F(0.7071067812 / C4),  F(0.8314696123 / C5),
+     F(0.7071067812 / C4), -F(0.8314696123 / C5),
+    -F(0.7071067812 / C4), -F(0.1950903220 / C5),
+    -F(0.7071067812 / C4),  F(0.9807852804 / C5),
+     F(0.7071067812 / C4), -F(0.5555702330 / C5),
+     F(0.3826834324 / C6),  F(0.1950903220 / C7),
+    -F(0.9238795325 / C6), -F(0.5555702330 / C7),
+     F(0.9238795325 / C6),  F(0.8314696123 / C7),
+    -F(0.3826834324 / C6), -F(0.9807852804 / C7),
+    -F(0.3826834324 / C6),  F(0.9807852804 / C7),
+     F(0.9238795325 / C6), -F(0.8314696123 / C7),
+    -F(0.9238795325 / C6),  F(0.5555702330 / C7),
+     F(0.3826834324 / C6), -F(0.1950903220 / C7),
+#undef F
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+};
diff --git a/libavcodec/sbcdsp_data.h b/libavcodec/sbcdsp_data.h
new file mode 100644
index 0000000..10fad5c
--- /dev/null
+++ b/libavcodec/sbcdsp_data.h
@@ -0,0 +1,55 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * miscellaneous SBC tables
+ */
+
+#ifndef AVCODEC_SBCDSP_DATA_H
+#define AVCODEC_SBCDSP_DATA_H
+
+#include "sbc.h"
+
+#define SBC_PROTO_FIXED_SCALE      16
+#define SBC_COS_TABLE_FIXED_SCALE  15
+
+/*
+ * Constant tables for the use in SIMD optimized analysis filters
+ * Each table consists of two parts:
+ * 1. reordered "proto" table
+ * 2. reordered "cos" table
+ *
+ * Due to non-symmetrical reordering, separate tables for "even"
+ * and "odd" cases are needed
+ */
+
+extern const int16_t ff_sbcdsp_analysis_consts_fixed4_simd_even[];
+extern const int16_t ff_sbcdsp_analysis_consts_fixed4_simd_odd[];
+extern const int16_t ff_sbcdsp_analysis_consts_fixed8_simd_even[];
+extern const int16_t ff_sbcdsp_analysis_consts_fixed8_simd_odd[];
+
+#endif /* AVCODEC_SBCDSP_DATA_H */
diff --git a/libavcodec/sbcenc.c b/libavcodec/sbcenc.c
new file mode 100644
index 0000000..e2929e2
--- /dev/null
+++ b/libavcodec/sbcenc.c
@@ -0,0 +1,361 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2012-2013  Intel Corporation
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2008  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC encoder implementation
+ */
+
+#include <stdbool.h>
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "profiles.h"
+#include "put_bits.h"
+#include "sbc.h"
+#include "sbcdsp.h"
+
+typedef struct SBCEncContext {
+    AVClass *class;
+    int64_t max_delay;
+    int msbc;
+    DECLARE_ALIGNED(SBC_ALIGN, struct sbc_frame, frame);
+    DECLARE_ALIGNED(SBC_ALIGN, SBCDSPContext, dsp);
+} SBCEncContext;
+
+static int sbc_analyze_audio(SBCDSPContext *s, struct sbc_frame *frame)
+{
+    int ch, blk;
+    int16_t *x;
+
+    switch (frame->subbands) {
+    case 4:
+        for (ch = 0; ch < frame->channels; ch++) {
+            x = &s->X[ch][s->position - 4 *
+                    s->increment + frame->blocks * 4];
+            for (blk = 0; blk < frame->blocks;
+                        blk += s->increment) {
+                s->sbc_analyze_4s(
+                    s, x,
+                    frame->sb_sample_f[blk][ch],
+                    frame->sb_sample_f[blk + 1][ch] -
+                    frame->sb_sample_f[blk][ch]);
+                x -= 4 * s->increment;
+            }
+        }
+        return frame->blocks * 4;
+
+    case 8:
+        for (ch = 0; ch < frame->channels; ch++) {
+            x = &s->X[ch][s->position - 8 *
+                    s->increment + frame->blocks * 8];
+            for (blk = 0; blk < frame->blocks;
+                        blk += s->increment) {
+                s->sbc_analyze_8s(
+                    s, x,
+                    frame->sb_sample_f[blk][ch],
+                    frame->sb_sample_f[blk + 1][ch] -
+                    frame->sb_sample_f[blk][ch]);
+                x -= 8 * s->increment;
+            }
+        }
+        return frame->blocks * 8;
+
+    default:
+        return AVERROR(EIO);
+    }
+}
+
+/*
+ * Packs the SBC frame from frame into the memory in avpkt.
+ * Returns the length of the packed frame.
+ */
+static size_t sbc_pack_frame(AVPacket *avpkt, struct sbc_frame *frame,
+                             int joint, bool msbc)
+{
+    PutBitContext pb;
+
+    /* Will copy the header parts for CRC-8 calculation here */
+    uint8_t crc_header[11] = { 0 };
+    int crc_pos;
+
+    uint32_t audio_sample;
+
+    int ch, sb, blk;        /* channel, subband, block and bit counters */
+    int bits[2][8];         /* bits distribution */
+    uint32_t levels[2][8];  /* levels are derived from that */
+    uint32_t sb_sample_delta[2][8];
+
+    if (msbc) {
+        avpkt->data[0] = MSBC_SYNCWORD;
+        avpkt->data[1] = 0;
+        avpkt->data[2] = 0;
+    } else {
+        avpkt->data[0] = SBC_SYNCWORD;
+
+        avpkt->data[1]  = (frame->frequency           & 0x03) << 6;
+        avpkt->data[1] |= (((frame->blocks >> 2) - 1) & 0x03) << 4;
+        avpkt->data[1] |= (frame->mode                & 0x03) << 2;
+        avpkt->data[1] |= (frame->allocation          & 0x01) << 1;
+        avpkt->data[1] |= ((frame->subbands == 8)     & 0x01) << 0;
+
+        avpkt->data[2] = frame->bitpool;
+
+        if (frame->bitpool > frame->subbands << (4 + (frame->mode == STEREO
+                                                   || frame->mode == JOINT_STEREO)))
+            return -5;
+    }
+
+    /* Can't fill in crc yet */
+    crc_header[0] = avpkt->data[1];
+    crc_header[1] = avpkt->data[2];
+    crc_pos = 16;
+
+    init_put_bits(&pb, avpkt->data + 4, avpkt->size);
+
+    if (frame->mode == JOINT_STEREO) {
+        put_bits(&pb, frame->subbands, joint);
+        crc_header[crc_pos >> 3] = joint;
+        crc_pos += frame->subbands;
+    }
+
+    for (ch = 0; ch < frame->channels; ch++) {
+        for (sb = 0; sb < frame->subbands; sb++) {
+            put_bits(&pb, 4, frame->scale_factor[ch][sb] & 0x0F);
+            crc_header[crc_pos >> 3] <<= 4;
+            crc_header[crc_pos >> 3] |= frame->scale_factor[ch][sb] & 0x0F;
+            crc_pos += 4;
+        }
+    }
+
+    /* align the last crc byte */
+    if (crc_pos % 8)
+        crc_header[crc_pos >> 3] <<= 8 - (crc_pos % 8);
+
+    avpkt->data[3] = ff_sbc_crc8(frame->crc_ctx, crc_header, crc_pos);
+
+    ff_sbc_calculate_bits(frame, bits);
+
+    for (ch = 0; ch < frame->channels; ch++) {
+        for (sb = 0; sb < frame->subbands; sb++) {
+            levels[ch][sb] = ((1 << bits[ch][sb]) - 1) <<
+                (32 - (frame->scale_factor[ch][sb] +
+                    SCALE_OUT_BITS + 2));
+            sb_sample_delta[ch][sb] = (uint32_t) 1 <<
+                (frame->scale_factor[ch][sb] +
+                    SCALE_OUT_BITS + 1);
+        }
+    }
+
+    for (blk = 0; blk < frame->blocks; blk++) {
+        for (ch = 0; ch < frame->channels; ch++) {
+            for (sb = 0; sb < frame->subbands; sb++) {
+
+                if (bits[ch][sb] == 0)
+                    continue;
+
+                audio_sample = ((uint64_t) levels[ch][sb] *
+                    (sb_sample_delta[ch][sb] +
+                    frame->sb_sample_f[blk][ch][sb])) >> 32;
+
+                put_bits(&pb, bits[ch][sb], audio_sample);
+            }
+        }
+    }
+
+    flush_put_bits(&pb);
+
+    return (put_bits_count(&pb) + 7) / 8;
+}
+
+static int sbc_encode_init(AVCodecContext *avctx)
+{
+    SBCEncContext *sbc = avctx->priv_data;
+    struct sbc_frame *frame = &sbc->frame;
+
+    if (avctx->profile == FF_PROFILE_SBC_MSBC)
+        sbc->msbc = 1;
+
+    if (sbc->msbc) {
+        if (avctx->channels != 1) {
+            av_log(avctx, AV_LOG_ERROR, "mSBC require mono channel.\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (avctx->sample_rate != 16000) {
+            av_log(avctx, AV_LOG_ERROR, "mSBC require 16 kHz samplerate.\n");
+            return AVERROR(EINVAL);
+        }
+
+        frame->mode = SBC_MODE_MONO;
+        frame->subbands = 8;
+        frame->blocks = MSBC_BLOCKS;
+        frame->allocation = SBC_AM_LOUDNESS;
+        frame->bitpool = 26;
+
+        avctx->frame_size = 8 * MSBC_BLOCKS;
+    } else {
+        int d;
+
+        if (avctx->global_quality > 255*FF_QP2LAMBDA) {
+            av_log(avctx, AV_LOG_ERROR, "bitpool > 255 is not allowed.\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (avctx->channels == 1) {
+            frame->mode = SBC_MODE_MONO;
+            if (sbc->max_delay <= 3000 || avctx->bit_rate > 270000)
+                frame->subbands = 4;
+            else
+                frame->subbands = 8;
+        } else {
+            if (avctx->bit_rate < 180000 || avctx->bit_rate > 420000)
+                frame->mode = SBC_MODE_JOINT_STEREO;
+            else
+                frame->mode = SBC_MODE_STEREO;
+            if (sbc->max_delay <= 4000 || avctx->bit_rate > 420000)
+                frame->subbands = 4;
+            else
+                frame->subbands = 8;
+        }
+        /* sbc algorithmic delay is ((blocks + 10) * subbands - 2) / sample_rate */
+        frame->blocks = av_clip(((sbc->max_delay * avctx->sample_rate + 2)
+                               / (1000000 * frame->subbands)) - 10, 4, 16) & ~3;
+
+        frame->allocation = SBC_AM_LOUDNESS;
+
+        d = frame->blocks * ((frame->mode == SBC_MODE_DUAL_CHANNEL) + 1);
+        frame->bitpool = (((avctx->bit_rate * frame->subbands * frame->blocks) / avctx->sample_rate)
+                          - 4 * frame->subbands * avctx->channels
+                          - (frame->mode == SBC_MODE_JOINT_STEREO)*frame->subbands - 32 + d/2) / d;
+        if (avctx->global_quality > 0)
+            frame->bitpool = avctx->global_quality / FF_QP2LAMBDA;
+
+        avctx->frame_size = 4*((frame->subbands >> 3) + 1) * 4*(frame->blocks >> 2);
+    }
+
+    for (int i = 0; avctx->codec->supported_samplerates[i]; i++)
+        if (avctx->sample_rate == avctx->codec->supported_samplerates[i])
+            frame->frequency = i;
+
+    frame->channels = avctx->channels;
+    frame->codesize = frame->subbands * frame->blocks * avctx->channels * 2;
+    frame->crc_ctx = av_crc_get_table(AV_CRC_8_EBU);
+
+    memset(&sbc->dsp.X, 0, sizeof(sbc->dsp.X));
+    sbc->dsp.position = (SBC_X_BUFFER_SIZE - frame->subbands * 9) & ~7;
+    sbc->dsp.increment = sbc->msbc ? 1 : 4;
+    ff_sbcdsp_init(&sbc->dsp);
+
+    return 0;
+}
+
+static int sbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *av_frame, int *got_packet_ptr)
+{
+    SBCEncContext *sbc = avctx->priv_data;
+    struct sbc_frame *frame = &sbc->frame;
+    uint8_t joint = frame->mode == SBC_MODE_JOINT_STEREO;
+    uint8_t dual  = frame->mode == SBC_MODE_DUAL_CHANNEL;
+    int ret, j = 0;
+
+    int frame_length = 4 + (4 * frame->subbands * frame->channels) / 8
+                     + ((frame->blocks * frame->bitpool * (1 + dual)
+                     + joint * frame->subbands) + 7) / 8;
+
+    /* input must be large enough to encode a complete frame */
+    if (av_frame->nb_samples * frame->channels * 2 < frame->codesize)
+        return 0;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, frame_length, 0)) < 0)
+        return ret;
+
+    /* Select the needed input data processing function and call it */
+    if (frame->subbands == 8)
+        sbc->dsp.position = sbc->dsp.sbc_enc_process_input_8s(
+                sbc->dsp.position, av_frame->data[0], sbc->dsp.X,
+                frame->subbands * frame->blocks, frame->channels);
+    else
+        sbc->dsp.position = sbc->dsp.sbc_enc_process_input_4s(
+                sbc->dsp.position, av_frame->data[0], sbc->dsp.X,
+                frame->subbands * frame->blocks, frame->channels);
+
+    sbc_analyze_audio(&sbc->dsp, &sbc->frame);
+
+    if (frame->mode == JOINT_STEREO)
+        j = sbc->dsp.sbc_calc_scalefactors_j(frame->sb_sample_f,
+                                             frame->scale_factor,
+                                             frame->blocks,
+                                             frame->subbands);
+    else
+        sbc->dsp.sbc_calc_scalefactors(frame->sb_sample_f,
+                                       frame->scale_factor,
+                                       frame->blocks,
+                                       frame->channels,
+                                       frame->subbands);
+    emms_c();
+    sbc_pack_frame(avpkt, frame, j, sbc->msbc);
+
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+#define OFFSET(x) offsetof(SBCEncContext, x)
+#define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "sbc_delay", "set maximum algorithmic latency",
+      OFFSET(max_delay), AV_OPT_TYPE_DURATION, {.i64 = 13000}, 1000,13000, AE },
+    { "msbc",      "use mSBC mode (wideband speech mono SBC)",
+      OFFSET(msbc),      AV_OPT_TYPE_BOOL,     {.i64 = 0},        0,    1, AE },
+    { NULL },
+};
+
+static const AVClass sbc_class = {
+    .class_name = "sbc encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_sbc_encoder = {
+    .name                  = "sbc",
+    .long_name             = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband codec)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_SBC,
+    .priv_data_size        = sizeof(SBCEncContext),
+    .init                  = sbc_encode_init,
+    .encode2               = sbc_encode_frame,
+    .capabilities          = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO, 0},
+    .sample_fmts           = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16,
+                                                             AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]) { 16000, 32000, 44100, 48000, 0 },
+    .priv_class            = &sbc_class,
+    .profiles              = NULL_IF_CONFIG_SMALL(ff_sbc_profiles),
+};
diff --git a/libavcodec/sbr.h b/libavcodec/sbr.h
index a47ad6e..eb7d1ae 100644
--- a/libavcodec/sbr.h
+++ b/libavcodec/sbr.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2010      Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,8 @@
 #include "aacps.h"
 #include "sbrdsp.h"
 
+typedef struct AACContext AACContext;
+
 /**
  * Spectral Band Replication header - spectrum parameters that invoke a reset if they differ from the previous header.
  */
@@ -64,9 +66,9 @@ typedef struct SBRData {
      */
     unsigned           bs_frame_class;
     unsigned           bs_add_harmonic_flag;
-    unsigned           bs_num_env;
+    AAC_SIGNE          bs_num_env;
     uint8_t            bs_freq_res[7];
-    unsigned           bs_num_noise;
+    AAC_SIGNE          bs_num_noise;
     uint8_t            bs_df_env[5];
     uint8_t            bs_df_noise[2];
     uint8_t            bs_invf_mode[2][5];
@@ -78,25 +80,27 @@ typedef struct SBRData {
      * @name State variables
      * @{
      */
-    DECLARE_ALIGNED(32, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
-    DECLARE_ALIGNED(32, float, analysis_filterbank_samples) [1312];
+    DECLARE_ALIGNED(32, INTFLOAT, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
+    DECLARE_ALIGNED(32, INTFLOAT, analysis_filterbank_samples) [1312];
     int                synthesis_filterbank_samples_offset;
     ///l_APrev and l_A
     int                e_a[2];
     ///Chirp factors
-    float              bw_array[5];
+    INTFLOAT              bw_array[5];
     ///QMF values of the original signal
-    float              W[2][32][32][2];
+    INTFLOAT              W[2][32][32][2];
     ///QMF output of the HF adjustor
     int                Ypos;
-    DECLARE_ALIGNED(16, float, Y)[2][38][64][2];
-    DECLARE_ALIGNED(16, float, g_temp)[42][48];
-    float              q_temp[42][48];
+    DECLARE_ALIGNED(16, INTFLOAT, Y)[2][38][64][2];
+    DECLARE_ALIGNED(16, AAC_FLOAT, g_temp)[42][48];
+    AAC_FLOAT          q_temp[42][48];
     uint8_t            s_indexmapped[8][48];
     ///Envelope scalefactors
-    float              env_facs[6][48];
+    uint8_t            env_facs_q[6][48];
+    AAC_FLOAT          env_facs[6][48];
     ///Noise scalefactors
-    float              noise_facs[3][5];
+    uint8_t            noise_facs_q[3][5];
+    AAC_FLOAT          noise_facs[3][5];
     ///Envelope time borders
     uint8_t            t_env[8];
     ///Envelope time border of the last envelope of the previous frame
@@ -108,12 +112,35 @@ typedef struct SBRData {
     /** @} */
 } SBRData;
 
+typedef struct SpectralBandReplication SpectralBandReplication;
+
+/**
+ * aacsbr functions pointers
+ */
+typedef struct AACSBRContext {
+    int (*sbr_lf_gen)(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
+                      int buf_idx);
+    void (*sbr_hf_assemble)(INTFLOAT Y1[38][64][2],
+                            const INTFLOAT X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2]);
+    int (*sbr_x_gen)(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
+                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
+                     const INTFLOAT X_low[32][40][2], int ch);
+    void (*sbr_hf_inverse_filter)(SBRDSPContext *dsp,
+                                  INTFLOAT (*alpha0)[2], INTFLOAT (*alpha1)[2],
+                                  const INTFLOAT X_low[32][40][2], int k0);
+} AACSBRContext;
+
 /**
  * Spectral Band Replication
  */
-typedef struct SpectralBandReplication {
+struct SpectralBandReplication {
     int                sample_rate;
     int                start;
+    int                ready_for_dequant;
+    int                id_aac;
     int                reset;
     SpectrumParameters spectrum_params;
     int                bs_amp_res_header;
@@ -127,23 +154,23 @@ typedef struct SpectralBandReplication {
     unsigned           bs_smoothing_mode;
     /** @} */
     unsigned           bs_coupling;
-    unsigned           k[5]; ///< k0, k1, k2
+    AAC_SIGNE          k[5]; ///< k0, k1, k2
     ///kx', and kx respectively, kx is the first QMF subband where SBR is used.
     ///kx' is its value from the previous frame
-    unsigned           kx[2];
+    AAC_SIGNE          kx[2];
     ///M' and M respectively, M is the number of QMF subbands that use SBR.
-    unsigned           m[2];
+    AAC_SIGNE          m[2];
     unsigned           kx_and_m_pushed;
     ///The number of frequency bands in f_master
-    unsigned           n_master;
+    AAC_SIGNE          n_master;
     SBRData            data[2];
     PSContext          ps;
     ///N_Low and N_High respectively, the number of frequency bands for low and high resolution
-    unsigned           n[2];
+    AAC_SIGNE          n[2];
     ///Number of noise floor bands
-    unsigned           n_q;
+    AAC_SIGNE          n_q;
     ///Number of limiter bands
-    unsigned           n_lim;
+    AAC_SIGNE          n_lim;
     ///The master QMF frequency grouping
     uint16_t           f_master[49];
     ///Frequency borders for low resolution SBR
@@ -153,37 +180,38 @@ typedef struct SpectralBandReplication {
     ///Frequency borders for noise floors
     uint16_t           f_tablenoise[6];
     ///Frequency borders for the limiter
-    uint16_t           f_tablelim[29];
-    unsigned           num_patches;
+    uint16_t           f_tablelim[30];
+    AAC_SIGNE          num_patches;
     uint8_t            patch_num_subbands[6];
     uint8_t            patch_start_subband[6];
     ///QMF low frequency input to the HF generator
-    DECLARE_ALIGNED(16, float, X_low)[32][40][2];
+    DECLARE_ALIGNED(16, INTFLOAT, X_low)[32][40][2];
     ///QMF output of the HF generator
-    DECLARE_ALIGNED(16, float, X_high)[64][40][2];
+    DECLARE_ALIGNED(16, INTFLOAT, X_high)[64][40][2];
     ///QMF values of the reconstructed signal
-    DECLARE_ALIGNED(16, float, X)[2][2][38][64];
+    DECLARE_ALIGNED(16, INTFLOAT, X)[2][2][38][64];
     ///Zeroth coefficient used to filter the subband signals
-    DECLARE_ALIGNED(16, float, alpha0)[64][2];
+    DECLARE_ALIGNED(16, INTFLOAT, alpha0)[64][2];
     ///First coefficient used to filter the subband signals
-    DECLARE_ALIGNED(16, float, alpha1)[64][2];
+    DECLARE_ALIGNED(16, INTFLOAT, alpha1)[64][2];
     ///Dequantized envelope scalefactors, remapped
-    float              e_origmapped[7][48];
+    AAC_FLOAT          e_origmapped[7][48];
     ///Dequantized noise scalefactors, remapped
-    float              q_mapped[7][48];
+    AAC_FLOAT          q_mapped[7][48];
     ///Sinusoidal presence, remapped
     uint8_t            s_mapped[7][48];
     ///Estimated envelope
-    float              e_curr[7][48];
+    AAC_FLOAT          e_curr[7][48];
     ///Amplitude adjusted noise scalefactors
-    float              q_m[7][48];
+    AAC_FLOAT          q_m[7][48];
     ///Sinusoidal levels
-    float              s_m[7][48];
-    float              gain[7][48];
-    DECLARE_ALIGNED(32, float, qmf_filter_scratch)[5][64];
+    AAC_FLOAT          s_m[7][48];
+    AAC_FLOAT          gain[7][48];
+    DECLARE_ALIGNED(32, INTFLOAT, qmf_filter_scratch)[5][64];
     FFTContext         mdct_ana;
     FFTContext         mdct;
     SBRDSPContext      dsp;
-} SpectralBandReplication;
+    AACSBRContext      c;
+};
 
 #endif /* AVCODEC_SBR_H */
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index 0294332..a93b5f9 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -3,37 +3,31 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define USE_FIXED 0
+
+#include "aac.h"
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/intfloat.h"
 #include "sbrdsp.h"
 
-static void sbr_sum64x5_c(float *z)
-{
-    int k;
-    for (k = 0; k < 64; k++) {
-        float f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
-        z[k] = f;
-    }
-}
-
 static float sbr_sum_square_c(float (*x)[2], int n)
 {
     float sum0 = 0.0f, sum1 = 0.0f;
@@ -72,6 +66,7 @@ static void sbr_qmf_pre_shuffle_c(float *z)
         zi[64 + 2 * k + 2].i = zi[63 - k].i ^ (1U << 31);
         zi[64 + 2 * k + 3].i = zi[ k + 2].i;
     }
+
     zi[64 + 2 * 31 + 0].i = zi[64 - 31].i ^ (1U << 31);
     zi[64 + 2 * 31 + 1].i = zi[31 +  1].i;
 }
@@ -100,16 +95,6 @@ static void sbr_qmf_deint_neg_c(float *v, const float *src)
     }
 }
 
-static void sbr_qmf_deint_bfly_c(float *v, const float *src0, const float *src1)
-{
-    int i;
-    for (i = 0; i < 64; i++) {
-        v[      i] = src0[i] - src1[63 - i];
-        v[127 - i] = src0[i] + src1[63 - i];
-    }
-}
-
-
 #if 0
     /* This code is slower because it multiplies memory accesses.
      * It is left for educational purposes and because it may offer
@@ -237,56 +222,4 @@ static av_always_inline void sbr_hf_apply_noise(float (*Y)[2],
     }
 }
 
-static void sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 1.0, 0.0, m_max);
-}
-
-static void sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    float phi_sign = 1 - 2 * (kx & 1);
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 0.0, phi_sign, m_max);
-}
-
-static void sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, -1.0, 0.0, m_max);
-}
-
-static void sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    float phi_sign = 1 - 2 * (kx & 1);
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 0.0, -phi_sign, m_max);
-}
-
-av_cold void ff_sbrdsp_init(SBRDSPContext *s)
-{
-    s->sum64x5 = sbr_sum64x5_c;
-    s->sum_square = sbr_sum_square_c;
-    s->neg_odd_64 = sbr_neg_odd_64_c;
-    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_c;
-    s->qmf_post_shuffle = sbr_qmf_post_shuffle_c;
-    s->qmf_deint_neg = sbr_qmf_deint_neg_c;
-    s->qmf_deint_bfly = sbr_qmf_deint_bfly_c;
-    s->autocorrelate = sbr_autocorrelate_c;
-    s->hf_gen = sbr_hf_gen_c;
-    s->hf_g_filt = sbr_hf_g_filt_c;
-
-    s->hf_apply_noise[0] = sbr_hf_apply_noise_0;
-    s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
-    s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
-    s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
-
-    if (ARCH_ARM)
-        ff_sbrdsp_init_arm(s);
-    if (ARCH_X86)
-        ff_sbrdsp_init_x86(s);
-}
+#include "sbrdsp_template.c"
diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h
index 07235c6..e6fd76d 100644
--- a/libavcodec/sbrdsp.h
+++ b/libavcodec/sbrdsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,30 +22,34 @@
 #define AVCODEC_SBRDSP_H
 
 #include <stdint.h>
+#include "aac_defines.h"
+#include "libavutil/softfloat.h"
 
 typedef struct SBRDSPContext {
-    void (*sum64x5)(float *z);
-    float (*sum_square)(float (*x)[2], int n);
-    void (*neg_odd_64)(float *x);
-    void (*qmf_pre_shuffle)(float *z);
-    void (*qmf_post_shuffle)(float W[32][2], const float *z);
-    void (*qmf_deint_neg)(float *v, const float *src);
-    void (*qmf_deint_bfly)(float *v, const float *src0, const float *src1);
-    void (*autocorrelate)(const float x[40][2], float phi[3][2][2]);
-    void (*hf_gen)(float (*X_high)[2], const float (*X_low)[2],
-                   const float alpha0[2], const float alpha1[2],
-                   float bw, int start, int end);
-    void (*hf_g_filt)(float (*Y)[2], const float (*X_high)[40][2],
-                      const float *g_filt, int m_max, intptr_t ixh);
-    void (*hf_apply_noise[4])(float (*Y)[2], const float *s_m,
-                              const float *q_filt, int noise,
+    void (*sum64x5)(INTFLOAT *z);
+    AAC_FLOAT (*sum_square)(INTFLOAT (*x)[2], int n);
+    void (*neg_odd_64)(INTFLOAT *x);
+    void (*qmf_pre_shuffle)(INTFLOAT *z);
+    void (*qmf_post_shuffle)(INTFLOAT W[32][2], const INTFLOAT *z);
+    void (*qmf_deint_neg)(INTFLOAT *v, const INTFLOAT *src);
+    void (*qmf_deint_bfly)(INTFLOAT *v, const INTFLOAT *src0, const INTFLOAT *src1);
+    void (*autocorrelate)(const INTFLOAT x[40][2], AAC_FLOAT phi[3][2][2]);
+    void (*hf_gen)(INTFLOAT (*X_high)[2], const INTFLOAT (*X_low)[2],
+                   const INTFLOAT alpha0[2], const INTFLOAT alpha1[2],
+                   INTFLOAT bw, int start, int end);
+    void (*hf_g_filt)(INTFLOAT (*Y)[2], const INTFLOAT (*X_high)[40][2],
+                      const AAC_FLOAT *g_filt, int m_max, intptr_t ixh);
+    void (*hf_apply_noise[4])(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                              const AAC_FLOAT *q_filt, int noise,
                               int kx, int m_max);
 } SBRDSPContext;
 
-extern const float ff_sbr_noise_table[][2];
+extern const INTFLOAT AAC_RENAME(ff_sbr_noise_table)[][2];
 
-void ff_sbrdsp_init(SBRDSPContext *s);
+void AAC_RENAME(ff_sbrdsp_init)(SBRDSPContext *s);
 void ff_sbrdsp_init_arm(SBRDSPContext *s);
+void ff_sbrdsp_init_aarch64(SBRDSPContext *s);
 void ff_sbrdsp_init_x86(SBRDSPContext *s);
+void ff_sbrdsp_init_mips(SBRDSPContext *s);
 
 #endif /* AVCODEC_SBRDSP_H */
diff --git a/libavcodec/sbrdsp_fixed.c b/libavcodec/sbrdsp_fixed.c
new file mode 100644
index 0000000..91fa664
--- /dev/null
+++ b/libavcodec/sbrdsp_fixed.c
@@ -0,0 +1,315 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+
+#define USE_FIXED 1
+
+#include "aac.h"
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/intfloat.h"
+#include "sbrdsp.h"
+
+static SoftFloat sbr_sum_square_c(int (*x)[2], int n)
+{
+    SoftFloat ret;
+    uint64_t accu = 0, round;
+    uint64_t accu0 = 0, accu1 = 0, accu2 = 0, accu3 = 0;
+    int i, nz, nz0;
+    unsigned u;
+
+    nz = 0;
+    for (i = 0; i < n; i += 2) {
+        accu0 += (int64_t)x[i + 0][0] * x[i + 0][0];
+        accu1 += (int64_t)x[i + 0][1] * x[i + 0][1];
+        accu2 += (int64_t)x[i + 1][0] * x[i + 1][0];
+        accu3 += (int64_t)x[i + 1][1] * x[i + 1][1];
+        if ((accu0|accu1|accu2|accu3) > UINT64_MAX - INT32_MIN*(int64_t)INT32_MIN || i+2>=n) {
+            accu0 >>= nz;
+            accu1 >>= nz;
+            accu2 >>= nz;
+            accu3 >>= nz;
+            while ((accu0|accu1|accu2|accu3) > (UINT64_MAX - accu) >> 2) {
+                accu0 >>= 1;
+                accu1 >>= 1;
+                accu2 >>= 1;
+                accu3 >>= 1;
+                accu  >>= 1;
+                nz ++;
+            }
+            accu += accu0 + accu1 + accu2 + accu3;
+            accu0 = accu1 = accu2 = accu3 = 0;
+        }
+    }
+
+    nz0 = 15 - nz;
+
+    u = accu >> 32;
+    if (u) {
+        nz = 33;
+        while (u < 0x80000000U) {
+            u <<= 1;
+            nz--;
+        }
+    } else
+        nz = 1;
+
+    round = 1ULL << (nz-1);
+    u = ((accu + round) >> nz);
+    u >>= 1;
+    ret = av_int2sf(u, nz0 - nz);
+
+    return ret;
+}
+
+static void sbr_neg_odd_64_c(int *x)
+{
+    int i;
+    for (i = 1; i < 64; i += 2)
+        x[i] = -x[i];
+}
+
+static void sbr_qmf_pre_shuffle_c(int *z)
+{
+    int k;
+    z[64] = z[0];
+    z[65] = z[1];
+    for (k = 1; k < 32; k++) {
+        z[64+2*k  ] = -z[64 - k];
+        z[64+2*k+1] =  z[ k + 1];
+    }
+}
+
+static void sbr_qmf_post_shuffle_c(int W[32][2], const int *z)
+{
+    int k;
+    for (k = 0; k < 32; k++) {
+        W[k][0] = -z[63-k];
+        W[k][1] = z[k];
+    }
+}
+
+static void sbr_qmf_deint_neg_c(int *v, const int *src)
+{
+    int i;
+    for (i = 0; i < 32; i++) {
+        v[     i] = ( src[63 - 2*i    ] + 0x10) >> 5;
+        v[63 - i] = (-src[63 - 2*i - 1] + 0x10) >> 5;
+    }
+}
+
+static av_always_inline SoftFloat autocorr_calc(int64_t accu)
+{
+        int nz, mant, expo;
+        unsigned round;
+        int i = (int)(accu >> 32);
+        if (i == 0) {
+            nz = 1;
+        } else {
+            nz = 0;
+            while (FFABS(i) < 0x40000000) {
+                i *= 2;
+                nz++;
+            }
+            nz = 32-nz;
+        }
+
+        round = 1U << (nz-1);
+        mant = (int)((accu + round) >> nz);
+        mant = (mant + 0x40LL)>>7;
+        mant *= 64;
+        expo = nz + 15;
+        return av_int2sf(mant, 30 - expo);
+}
+
+static av_always_inline void autocorrelate(const int x[40][2], SoftFloat phi[3][2][2], int lag)
+{
+    int i;
+    int64_t real_sum, imag_sum;
+    int64_t accu_re = 0, accu_im = 0;
+
+    if (lag) {
+        for (i = 1; i < 38; i++) {
+            accu_re += (uint64_t)x[i][0] * x[i+lag][0];
+            accu_re += (uint64_t)x[i][1] * x[i+lag][1];
+            accu_im += (uint64_t)x[i][0] * x[i+lag][1];
+            accu_im -= (uint64_t)x[i][1] * x[i+lag][0];
+        }
+
+        real_sum = accu_re;
+        imag_sum = accu_im;
+
+        accu_re += (uint64_t)x[ 0][0] * x[lag][0];
+        accu_re += (uint64_t)x[ 0][1] * x[lag][1];
+        accu_im += (uint64_t)x[ 0][0] * x[lag][1];
+        accu_im -= (uint64_t)x[ 0][1] * x[lag][0];
+
+        phi[2-lag][1][0] = autocorr_calc(accu_re);
+        phi[2-lag][1][1] = autocorr_calc(accu_im);
+
+        if (lag == 1) {
+            accu_re = real_sum;
+            accu_im = imag_sum;
+            accu_re += (uint64_t)x[38][0] * x[39][0];
+            accu_re += (uint64_t)x[38][1] * x[39][1];
+            accu_im += (uint64_t)x[38][0] * x[39][1];
+            accu_im -= (uint64_t)x[38][1] * x[39][0];
+
+            phi[0][0][0] = autocorr_calc(accu_re);
+            phi[0][0][1] = autocorr_calc(accu_im);
+        }
+    } else {
+        for (i = 1; i < 38; i++) {
+            accu_re += (uint64_t)x[i][0] * x[i][0];
+            accu_re += (uint64_t)x[i][1] * x[i][1];
+        }
+        real_sum = accu_re;
+        accu_re += (uint64_t)x[ 0][0] * x[ 0][0];
+        accu_re += (uint64_t)x[ 0][1] * x[ 0][1];
+
+        phi[2][1][0] = autocorr_calc(accu_re);
+
+        accu_re = real_sum;
+        accu_re += (uint64_t)x[38][0] * x[38][0];
+        accu_re += (uint64_t)x[38][1] * x[38][1];
+
+        phi[1][0][0] = autocorr_calc(accu_re);
+    }
+}
+
+static void sbr_autocorrelate_c(const int x[40][2], SoftFloat phi[3][2][2])
+{
+    autocorrelate(x, phi, 0);
+    autocorrelate(x, phi, 1);
+    autocorrelate(x, phi, 2);
+}
+
+static void sbr_hf_gen_c(int (*X_high)[2], const int (*X_low)[2],
+                       const int alpha0[2], const int alpha1[2],
+                       int bw, int start, int end)
+{
+    int alpha[4];
+    int i;
+    int64_t accu;
+
+    accu = (int64_t)alpha0[0] * bw;
+    alpha[2] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha0[1] * bw;
+    alpha[3] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)bw * bw;
+    bw = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha1[0] * bw;
+    alpha[0] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha1[1] * bw;
+    alpha[1] = (int)((accu + 0x40000000) >> 31);
+
+    for (i = start; i < end; i++) {
+        accu  = (int64_t)X_low[i][0] * 0x20000000;
+        accu += (int64_t)X_low[i - 2][0] * alpha[0];
+        accu -= (int64_t)X_low[i - 2][1] * alpha[1];
+        accu += (int64_t)X_low[i - 1][0] * alpha[2];
+        accu -= (int64_t)X_low[i - 1][1] * alpha[3];
+        X_high[i][0] = (int)((accu + 0x10000000) >> 29);
+
+        accu  = (int64_t)X_low[i][1] * 0x20000000;
+        accu += (int64_t)X_low[i - 2][1] * alpha[0];
+        accu += (int64_t)X_low[i - 2][0] * alpha[1];
+        accu += (int64_t)X_low[i - 1][1] * alpha[2];
+        accu += (int64_t)X_low[i - 1][0] * alpha[3];
+        X_high[i][1] = (int)((accu + 0x10000000) >> 29);
+    }
+}
+
+static void sbr_hf_g_filt_c(int (*Y)[2], const int (*X_high)[40][2],
+                          const SoftFloat *g_filt, int m_max, intptr_t ixh)
+{
+    int m;
+    int64_t accu;
+
+    for (m = 0; m < m_max; m++) {
+        if (22 - g_filt[m].exp < 61) {
+            int64_t r = 1LL << (22-g_filt[m].exp);
+            accu = (int64_t)X_high[m][ixh][0] * ((g_filt[m].mant + 0x40)>>7);
+            Y[m][0] = (int)((accu + r) >> (23-g_filt[m].exp));
+
+            accu = (int64_t)X_high[m][ixh][1] * ((g_filt[m].mant + 0x40)>>7);
+            Y[m][1] = (int)((accu + r) >> (23-g_filt[m].exp));
+        }
+    }
+}
+
+static av_always_inline int sbr_hf_apply_noise(int (*Y)[2],
+                                                const SoftFloat *s_m,
+                                                const SoftFloat *q_filt,
+                                                int noise,
+                                                int phi_sign0,
+                                                int phi_sign1,
+                                                int m_max)
+{
+    int m;
+
+    for (m = 0; m < m_max; m++) {
+        unsigned y0 = Y[m][0];
+        unsigned y1 = Y[m][1];
+        noise = (noise + 1) & 0x1ff;
+        if (s_m[m].mant) {
+            int shift, round;
+
+            shift = 22 - s_m[m].exp;
+            if (shift < 1) {
+                av_log(NULL, AV_LOG_ERROR, "Overflow in sbr_hf_apply_noise, shift=%d\n", shift);
+                return AVERROR(ERANGE);
+            } else if (shift < 30) {
+                round = 1 << (shift-1);
+                y0 += (s_m[m].mant * phi_sign0 + round) >> shift;
+                y1 += (s_m[m].mant * phi_sign1 + round) >> shift;
+            }
+        } else {
+            int shift, round, tmp;
+            int64_t accu;
+
+            shift = 22 - q_filt[m].exp;
+            if (shift < 1) {
+                av_log(NULL, AV_LOG_ERROR, "Overflow in sbr_hf_apply_noise, shift=%d\n", shift);
+                return AVERROR(ERANGE);
+            } else if (shift < 30) {
+                round = 1 << (shift-1);
+
+                accu = (int64_t)q_filt[m].mant * ff_sbr_noise_table_fixed[noise][0];
+                tmp = (int)((accu + 0x40000000) >> 31);
+                y0 += (tmp + round) >> shift;
+
+                accu = (int64_t)q_filt[m].mant * ff_sbr_noise_table_fixed[noise][1];
+                tmp = (int)((accu + 0x40000000) >> 31);
+                y1 += (tmp + round) >> shift;
+            }
+        }
+        Y[m][0] = y0;
+        Y[m][1] = y1;
+        phi_sign1 = -phi_sign1;
+    }
+    return 0;
+}
+
+#include "sbrdsp_template.c"
diff --git a/libavcodec/sbrdsp_template.c b/libavcodec/sbrdsp_template.c
new file mode 100644
index 0000000..37a3365
--- /dev/null
+++ b/libavcodec/sbrdsp_template.c
@@ -0,0 +1,104 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static void sbr_sum64x5_c(INTFLOAT *z)
+{
+    int k;
+    for (k = 0; k < 64; k++) {
+        INTFLOAT f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
+        z[k] = f;
+    }
+}
+
+static void sbr_qmf_deint_bfly_c(INTFLOAT *v, const INTFLOAT *src0, const INTFLOAT *src1)
+{
+    int i;
+    for (i = 0; i < 64; i++) {
+#if USE_FIXED
+        v[      i] = (int)(0x10U + src0[i] - src1[63 - i]) >> 5;
+        v[127 - i] = (int)(0x10U + src0[i] + src1[63 - i]) >> 5;
+#else
+        v[      i] = src0[i] - src1[63 - i];
+        v[127 - i] = src0[i] + src1[63 - i];
+#endif
+    }
+}
+
+static void sbr_hf_apply_noise_0(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)1.0, (INTFLOAT)0.0, m_max);
+}
+
+static void sbr_hf_apply_noise_1(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    INTFLOAT phi_sign = 1 - 2 * (kx & 1);
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)0.0, phi_sign, m_max);
+}
+
+static void sbr_hf_apply_noise_2(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)-1.0, (INTFLOAT)0.0, m_max);
+}
+
+static void sbr_hf_apply_noise_3(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    INTFLOAT phi_sign = 1 - 2 * (kx & 1);
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)0.0, -phi_sign, m_max);
+}
+
+av_cold void AAC_RENAME(ff_sbrdsp_init)(SBRDSPContext *s)
+{
+    s->sum64x5 = sbr_sum64x5_c;
+    s->sum_square = sbr_sum_square_c;
+    s->neg_odd_64 = sbr_neg_odd_64_c;
+    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_c;
+    s->qmf_post_shuffle = sbr_qmf_post_shuffle_c;
+    s->qmf_deint_neg = sbr_qmf_deint_neg_c;
+    s->qmf_deint_bfly = sbr_qmf_deint_bfly_c;
+    s->autocorrelate = sbr_autocorrelate_c;
+    s->hf_gen = sbr_hf_gen_c;
+    s->hf_g_filt = sbr_hf_g_filt_c;
+
+    s->hf_apply_noise[0] = sbr_hf_apply_noise_0;
+    s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
+    s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
+    s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
+
+#if !USE_FIXED
+    if (ARCH_ARM)
+        ff_sbrdsp_init_arm(s);
+    if (ARCH_AARCH64)
+        ff_sbrdsp_init_aarch64(s);
+    if (ARCH_X86)
+        ff_sbrdsp_init_x86(s);
+    if (ARCH_MIPS)
+        ff_sbrdsp_init_mips(s);
+#endif /* !USE_FIXED */
+}
diff --git a/libavcodec/scpr.c b/libavcodec/scpr.c
new file mode 100644
index 0000000..750cf59
--- /dev/null
+++ b/libavcodec/scpr.c
@@ -0,0 +1,674 @@
+/*
+ * ScreenPressor decoder
+ *
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "scpr.h"
+#include "scpr3.h"
+
+#define TOP  0x01000000
+#define BOT    0x010000
+
+#include "scpr3.c"
+
+static void init_rangecoder(RangeCoder *rc, GetByteContext *gb)
+{
+    rc->code1 = 0;
+    rc->range = 0xFFFFFFFFU;
+    rc->code  = bytestream2_get_be32(gb);
+}
+
+static void reinit_tables(SCPRContext *s)
+{
+    int comp, i, j;
+
+    for (comp = 0; comp < 3; comp++) {
+        for (j = 0; j < 4096; j++) {
+            if (s->pixel_model[comp][j].total_freq != 256) {
+                for (i = 0; i < 256; i++)
+                    s->pixel_model[comp][j].freq[i] = 1;
+                for (i = 0; i < 16; i++)
+                    s->pixel_model[comp][j].lookup[i] = 16;
+                s->pixel_model[comp][j].total_freq = 256;
+            }
+        }
+    }
+
+    for (j = 0; j < 6; j++) {
+        uint32_t *p = s->run_model[j];
+        for (i = 0; i < 256; i++)
+            p[i] = 1;
+        p[256] = 256;
+    }
+
+    for (j = 0; j < 6; j++) {
+        uint32_t *op = s->op_model[j];
+        for (i = 0; i < 6; i++)
+            op[i] = 1;
+        op[6] = 6;
+    }
+
+    for (i = 0; i < 256; i++) {
+        s->range_model[i] = 1;
+        s->count_model[i] = 1;
+    }
+    s->range_model[256] = 256;
+    s->count_model[256] = 256;
+
+    for (i = 0; i < 5; i++) {
+        s->fill_model[i] = 1;
+    }
+    s->fill_model[5] = 5;
+
+    for (j = 0; j < 4; j++) {
+        for (i = 0; i < 16; i++) {
+            s->sxy_model[j][i] = 1;
+        }
+        s->sxy_model[j][16] = 16;
+    }
+
+    for (i = 0; i < 512; i++) {
+        s->mv_model[0][i] = 1;
+        s->mv_model[1][i] = 1;
+    }
+    s->mv_model[0][512] = 512;
+    s->mv_model[1][512] = 512;
+}
+
+static int decode(GetByteContext *gb, RangeCoder *rc, uint32_t cumFreq, uint32_t freq, uint32_t total_freq)
+{
+    rc->code -= cumFreq * rc->range;
+    rc->range *= freq;
+
+    while (rc->range < TOP && bytestream2_get_bytes_left(gb) > 0) {
+        uint32_t byte = bytestream2_get_byteu(gb);
+        rc->code = (rc->code << 8) | byte;
+        rc->range <<= 8;
+    }
+
+    return 0;
+}
+
+static int get_freq(RangeCoder *rc, uint32_t total_freq, uint32_t *freq)
+{
+    if (total_freq == 0)
+        return AVERROR_INVALIDDATA;
+
+    rc->range = rc->range / total_freq;
+
+    if (rc->range == 0)
+        return AVERROR_INVALIDDATA;
+
+    *freq = rc->code / rc->range;
+
+    return 0;
+}
+
+static int decode0(GetByteContext *gb, RangeCoder *rc, uint32_t cumFreq, uint32_t freq, uint32_t total_freq)
+{
+    uint32_t t;
+
+    if (total_freq == 0)
+        return AVERROR_INVALIDDATA;
+
+    t = rc->range * (uint64_t)cumFreq / total_freq;
+
+    rc->code1 += t + 1;
+    rc->range = rc->range * (uint64_t)(freq + cumFreq) / total_freq - (t + 1);
+
+    while (rc->range < TOP && bytestream2_get_bytes_left(gb) > 0) {
+        uint32_t byte = bytestream2_get_byteu(gb);
+        rc->code = (rc->code << 8) | byte;
+        rc->code1 <<= 8;
+        rc->range <<= 8;
+    }
+
+    return 0;
+}
+
+static int get_freq0(RangeCoder *rc, uint32_t total_freq, uint32_t *freq)
+{
+    if (rc->range == 0)
+        return AVERROR_INVALIDDATA;
+
+    *freq = total_freq * (uint64_t)(rc->code - rc->code1) / rc->range;
+
+    return 0;
+}
+
+static int decode_value(SCPRContext *s, uint32_t *cnt, uint32_t maxc, uint32_t step, uint32_t *rval)
+{
+    GetByteContext *gb = &s->gb;
+    RangeCoder *rc = &s->rc;
+    uint32_t totfr = cnt[maxc];
+    uint32_t value;
+    uint32_t c = 0, cumfr = 0, cnt_c = 0;
+    int i, ret;
+
+    if ((ret = s->get_freq(rc, totfr, &value)) < 0)
+        return ret;
+
+    while (c < maxc) {
+        cnt_c = cnt[c];
+        if (value >= cumfr + cnt_c)
+            cumfr += cnt_c;
+        else
+            break;
+        c++;
+    }
+
+    if (c >= maxc)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = s->decode(gb, rc, cumfr, cnt_c, totfr)) < 0)
+        return ret;
+
+    cnt[c] = cnt_c + step;
+    totfr += step;
+    if (totfr > BOT) {
+        totfr = 0;
+        for (i = 0; i < maxc; i++) {
+            uint32_t nc = (cnt[i] >> 1) + 1;
+            cnt[i] = nc;
+            totfr += nc;
+        }
+    }
+
+    cnt[maxc] = totfr;
+    *rval = c;
+
+    return 0;
+}
+
+static int decode_unit(SCPRContext *s, PixelModel *pixel, uint32_t step, uint32_t *rval)
+{
+    GetByteContext *gb = &s->gb;
+    RangeCoder *rc = &s->rc;
+    uint32_t totfr = pixel->total_freq;
+    uint32_t value, x = 0, cumfr = 0, cnt_x = 0;
+    int i, j, ret, c, cnt_c;
+
+    if ((ret = s->get_freq(rc, totfr, &value)) < 0)
+        return ret;
+
+    while (x < 16) {
+        cnt_x = pixel->lookup[x];
+        if (value >= cumfr + cnt_x)
+            cumfr += cnt_x;
+        else
+            break;
+        x++;
+    }
+
+    c = x * 16;
+    cnt_c = 0;
+    while (c < 256) {
+        cnt_c = pixel->freq[c];
+        if (value >= cumfr + cnt_c)
+            cumfr += cnt_c;
+        else
+            break;
+        c++;
+    }
+    if (x >= 16 || c >= 256) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = s->decode(gb, rc, cumfr, cnt_c, totfr)) < 0)
+        return ret;
+
+    pixel->freq[c] = cnt_c + step;
+    pixel->lookup[x] = cnt_x + step;
+    totfr += step;
+    if (totfr > BOT) {
+        totfr = 0;
+        for (i = 0; i < 256; i++) {
+            uint32_t nc = (pixel->freq[i] >> 1) + 1;
+            pixel->freq[i] = nc;
+            totfr += nc;
+        }
+        for (i = 0; i < 16; i++) {
+            uint32_t sum = 0;
+            uint32_t i16_17 = i << 4;
+            for (j = 0; j < 16; j++)
+                sum += pixel->freq[i16_17 + j];
+            pixel->lookup[i] = sum;
+        }
+    }
+    pixel->total_freq = totfr;
+
+    *rval = c & s->cbits;
+
+    return 0;
+}
+
+static int decode_units(SCPRContext *s, uint32_t *r, uint32_t *g, uint32_t *b,
+                        int *cx, int *cx1)
+{
+    const int cxshift = s->cxshift;
+    int ret;
+
+    ret = decode_unit(s, &s->pixel_model[0][*cx + *cx1], 400, r);
+    if (ret < 0)
+        return ret;
+
+    *cx1 = (*cx << 6) & 0xFC0;
+    *cx = *r >> cxshift;
+    ret = decode_unit(s, &s->pixel_model[1][*cx + *cx1], 400, g);
+    if (ret < 0)
+        return ret;
+
+    *cx1 = (*cx << 6) & 0xFC0;
+    *cx = *g >> cxshift;
+    ret = decode_unit(s, &s->pixel_model[2][*cx + *cx1], 400, b);
+    if (ret < 0)
+        return ret;
+
+    *cx1 = (*cx << 6) & 0xFC0;
+    *cx = *b >> cxshift;
+
+    return 0;
+}
+
+static int decompress_i(AVCodecContext *avctx, uint32_t *dst, int linesize)
+{
+    SCPRContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    int cx = 0, cx1 = 0, k = 0;
+    int run, off, y = 0, x = 0, ret;
+    uint32_t clr = 0, r, g, b, backstep = linesize - avctx->width;
+    uint32_t lx, ly, ptype;
+
+    reinit_tables(s);
+    bytestream2_skip(gb, 2);
+    init_rangecoder(&s->rc, gb);
+
+    while (k < avctx->width + 1) {
+        ret = decode_units(s, &r, &g, &b, &cx, &cx1);
+        if (ret < 0)
+            return ret;
+
+        ret = decode_value(s, s->run_model[0], 256, 400, &run);
+        if (ret < 0)
+            return ret;
+        if (run <= 0)
+            return AVERROR_INVALIDDATA;
+
+        clr = (b << 16) + (g << 8) + r;
+        k += run;
+        while (run-- > 0) {
+            if (y >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            dst[y * linesize + x] = clr;
+            lx = x;
+            ly = y;
+            x++;
+            if (x >= avctx->width) {
+                x = 0;
+                y++;
+            }
+        }
+    }
+    off = -linesize - 1;
+    ptype = 0;
+
+    while (x < avctx->width && y < avctx->height) {
+        ret = decode_value(s, s->op_model[ptype], 6, 1000, &ptype);
+        if (ret < 0)
+            return ret;
+        if (ptype == 0) {
+            ret = decode_units(s, &r, &g, &b, &cx, &cx1);
+            if (ret < 0)
+                return ret;
+
+            clr = (b << 16) + (g << 8) + r;
+        }
+        if (ptype > 5)
+            return AVERROR_INVALIDDATA;
+        ret = decode_value(s, s->run_model[ptype], 256, 400, &run);
+        if (ret < 0)
+            return ret;
+        if (run <= 0)
+            return AVERROR_INVALIDDATA;
+
+        ret = decode_run_i(avctx, ptype, run, &x, &y, clr,
+                           dst, linesize, &lx, &ly,
+                           backstep, off, &cx, &cx1);
+        if (run < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+static int decompress_p(AVCodecContext *avctx,
+                        uint32_t *dst, int linesize,
+                        uint32_t *prev, int plinesize)
+{
+    SCPRContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    int ret, temp = 0, min, max, x, y, cx = 0, cx1 = 0;
+    int backstep = linesize - avctx->width;
+
+    if (bytestream2_get_byte(gb) == 0)
+        return 1;
+    bytestream2_skip(gb, 1);
+    init_rangecoder(&s->rc, gb);
+
+    ret  = decode_value(s, s->range_model, 256, 1, &min);
+    ret |= decode_value(s, s->range_model, 256, 1, &temp);
+    min += temp << 8;
+    ret |= decode_value(s, s->range_model, 256, 1, &max);
+    ret |= decode_value(s, s->range_model, 256, 1, &temp);
+    if (ret < 0)
+        return ret;
+
+    max += temp << 8;
+    if (min > max || min >= s->nbcount)
+        return AVERROR_INVALIDDATA;
+
+    memset(s->blocks, 0, sizeof(*s->blocks) * s->nbcount);
+
+    while (min <= max) {
+        int fill, count;
+
+        ret  = decode_value(s, s->fill_model,  5,   10, &fill);
+        ret |= decode_value(s, s->count_model, 256, 20, &count);
+        if (ret < 0)
+            return ret;
+        if (count <= 0)
+            return AVERROR_INVALIDDATA;
+
+        while (min < s->nbcount && count-- > 0) {
+            s->blocks[min++] = fill;
+        }
+    }
+
+    for (y = 0; y < s->nby; y++) {
+        for (x = 0; x < s->nbx; x++) {
+            int sy1 = 0, sy2 = 16, sx1 = 0, sx2 = 16;
+
+            if (s->blocks[y * s->nbx + x] == 0)
+                continue;
+
+            if (((s->blocks[y * s->nbx + x] - 1) & 1) > 0) {
+                ret  = decode_value(s, s->sxy_model[0], 16, 100, &sx1);
+                ret |= decode_value(s, s->sxy_model[1], 16, 100, &sy1);
+                ret |= decode_value(s, s->sxy_model[2], 16, 100, &sx2);
+                ret |= decode_value(s, s->sxy_model[3], 16, 100, &sy2);
+                if (ret < 0)
+                    return ret;
+
+                sx2++;
+                sy2++;
+            }
+            if (((s->blocks[y * s->nbx + x] - 1) & 2) > 0) {
+                int i, j, by = y * 16, bx = x * 16;
+                int mvx, mvy;
+
+                ret  = decode_value(s, s->mv_model[0], 512, 100, &mvx);
+                ret |= decode_value(s, s->mv_model[1], 512, 100, &mvy);
+                if (ret < 0)
+                    return ret;
+
+                mvx -= 256;
+                mvy -= 256;
+
+                if (by + mvy + sy1 < 0 || bx + mvx + sx1 < 0 ||
+                    by + mvy + sy1 >= avctx->height || bx + mvx + sx1 >= avctx->width)
+                    return AVERROR_INVALIDDATA;
+
+                for (i = 0; i < sy2 - sy1 && (by + sy1 + i) < avctx->height && (by + mvy + sy1 + i) < avctx->height; i++) {
+                    for (j = 0; j < sx2 - sx1 && (bx + sx1 + j) < avctx->width && (bx + mvx + sx1 + j) < avctx->width; j++) {
+                        dst[(by + i + sy1) * linesize + bx + sx1 + j] = prev[(by + mvy + sy1 + i) * plinesize + bx + sx1 + mvx + j];
+                    }
+                }
+            } else {
+                int run, bx = x * 16 + sx1, by = y * 16 + sy1;
+                uint32_t r, g, b, clr, ptype = 0;
+
+                for (; by < y * 16 + sy2 && by < avctx->height;) {
+                    ret = decode_value(s, s->op_model[ptype], 6, 1000, &ptype);
+                    if (ret < 0)
+                        return ret;
+                    if (ptype == 0) {
+                        ret = decode_units(s, &r, &g, &b, &cx, &cx1);
+                        if (ret < 0)
+                            return ret;
+
+                        clr = (b << 16) + (g << 8) + r;
+                    }
+                    if (ptype > 5)
+                        return AVERROR_INVALIDDATA;
+                    ret = decode_value(s, s->run_model[ptype], 256, 400, &run);
+                    if (ret < 0)
+                        return ret;
+                    if (run <= 0)
+                        return AVERROR_INVALIDDATA;
+
+                    ret = decode_run_p(avctx, ptype, run, x, y, clr,
+                                       dst, prev, linesize, plinesize, &bx, &by,
+                                       backstep, sx1, sx2, &cx, &cx1);
+                    if (ret < 0)
+                        return ret;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    SCPRContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    AVFrame *frame = data;
+    int ret, type;
+
+    if (avctx->bits_per_coded_sample == 16) {
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+    }
+
+    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0)
+        return ret;
+
+    bytestream2_init(gb, avpkt->data, avpkt->size);
+
+    type = bytestream2_peek_byte(gb);
+
+    if (type == 2) {
+        s->version = 1;
+        s->get_freq = get_freq0;
+        s->decode = decode0;
+        frame->key_frame = 1;
+        ret = decompress_i(avctx, (uint32_t *)s->current_frame->data[0],
+                           s->current_frame->linesize[0] / 4);
+    } else if (type == 18) {
+        s->version = 2;
+        s->get_freq = get_freq;
+        s->decode = decode;
+        frame->key_frame = 1;
+        ret = decompress_i(avctx, (uint32_t *)s->current_frame->data[0],
+                           s->current_frame->linesize[0] / 4);
+    } else if (type == 34) {
+        frame->key_frame = 1;
+        s->version = 3;
+        ret = decompress_i3(avctx, (uint32_t *)s->current_frame->data[0],
+                            s->current_frame->linesize[0] / 4);
+    } else if (type == 17 || type == 33) {
+        uint32_t clr, *dst = (uint32_t *)s->current_frame->data[0];
+        int x, y;
+
+        frame->key_frame = 1;
+        bytestream2_skip(gb, 1);
+        if (avctx->bits_per_coded_sample == 16) {
+            uint16_t value = bytestream2_get_le16(gb);
+            int r, g, b;
+
+            r = (value      ) & 31;
+            g = (value >>  5) & 31;
+            b = (value >> 10) & 31;
+            clr = (r << 16) + (g << 8) + b;
+        } else {
+            clr = bytestream2_get_le24(gb);
+        }
+        for (y = 0; y < avctx->height; y++) {
+            for (x = 0; x < avctx->width; x++) {
+                dst[x] = clr;
+            }
+            dst += s->current_frame->linesize[0] / 4;
+        }
+    } else if (type == 0 || type == 1) {
+        frame->key_frame = 0;
+
+        ret = av_frame_copy(s->current_frame, s->last_frame);
+        if (ret < 0)
+            return ret;
+
+        if (s->version == 1 || s->version == 2)
+            ret = decompress_p(avctx, (uint32_t *)s->current_frame->data[0],
+                               s->current_frame->linesize[0] / 4,
+                               (uint32_t *)s->last_frame->data[0],
+                               s->last_frame->linesize[0] / 4);
+        else
+            ret = decompress_p3(avctx, (uint32_t *)s->current_frame->data[0],
+                                s->current_frame->linesize[0] / 4,
+                                (uint32_t *)s->last_frame->data[0],
+                                s->last_frame->linesize[0] / 4);
+        if (ret == 1)
+            return avpkt->size;
+    } else {
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (ret < 0)
+        return ret;
+
+    if (avctx->bits_per_coded_sample != 16) {
+        ret = av_frame_ref(data, s->current_frame);
+        if (ret < 0)
+            return ret;
+    } else {
+        uint8_t *dst = frame->data[0];
+        int x, y;
+
+        ret = av_frame_copy(frame, s->current_frame);
+        if (ret < 0)
+            return ret;
+
+        // scale up each sample by 8
+        for (y = 0; y < avctx->height; y++) {
+            // If the image is sufficiently aligned, compute 8 samples at once
+            if (!(((uintptr_t)dst) & 7)) {
+                uint64_t *dst64 = (uint64_t *)dst;
+                int w = avctx->width>>1;
+                for (x = 0; x < w; x++) {
+                    dst64[x] = (dst64[x] << 3) & 0xFCFCFCFCFCFCFCFCULL;
+                }
+                x *= 8;
+            } else
+                x = 0;
+            for (; x < avctx->width * 4; x++) {
+                dst[x] = dst[x] << 3;
+            }
+            dst += frame->linesize[0];
+        }
+    }
+
+    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    FFSWAP(AVFrame *, s->current_frame, s->last_frame);
+
+    frame->data[0]     += frame->linesize[0] * (avctx->height - 1);
+    frame->linesize[0] *= -1;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    SCPRContext *s = avctx->priv_data;
+
+    switch (avctx->bits_per_coded_sample) {
+    case 16: avctx->pix_fmt = AV_PIX_FMT_RGB0; break;
+    case 24:
+    case 32: avctx->pix_fmt = AV_PIX_FMT_BGR0; break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bitdepth %i\n", avctx->bits_per_coded_sample);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->get_freq = get_freq0;
+    s->decode = decode0;
+
+    s->cxshift = avctx->bits_per_coded_sample == 16 ? 0 : 2;
+    s->cbits = avctx->bits_per_coded_sample == 16 ? 0x1F : 0xFF;
+    s->nbx = (avctx->width + 15) / 16;
+    s->nby = (avctx->height + 15) / 16;
+    s->nbcount = s->nbx * s->nby;
+    s->blocks = av_malloc_array(s->nbcount, sizeof(*s->blocks));
+    if (!s->blocks)
+        return AVERROR(ENOMEM);
+
+    s->last_frame = av_frame_alloc();
+    s->current_frame = av_frame_alloc();
+    if (!s->last_frame || !s->current_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    SCPRContext *s = avctx->priv_data;
+
+    av_freep(&s->blocks);
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->current_frame);
+
+    return 0;
+}
+
+AVCodec ff_scpr_decoder = {
+    .name             = "scpr",
+    .long_name        = NULL_IF_CONFIG_SMALL("ScreenPressor"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_SCPR,
+    .priv_data_size   = sizeof(SCPRContext),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/scpr.h b/libavcodec/scpr.h
new file mode 100644
index 0000000..15cb87c
--- /dev/null
+++ b/libavcodec/scpr.h
@@ -0,0 +1,365 @@
+/*
+ * ScreenPressor decoder
+ *
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SCPR_H
+#define AVCODEC_SCPR_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "scpr3.h"
+
+typedef struct RangeCoder {
+    uint32_t   code;
+    uint32_t   range;
+    uint32_t   code1;
+} RangeCoder;
+
+typedef struct PixelModel {
+    uint32_t    freq[256];
+    uint32_t    lookup[16];
+    uint32_t    total_freq;
+} PixelModel;
+
+typedef struct SCPRContext {
+    int             version;
+    AVFrame        *last_frame;
+    AVFrame        *current_frame;
+    GetByteContext  gb;
+    RangeCoder      rc;
+    PixelModel      pixel_model[3][4096];
+    uint32_t        op_model[6][7];
+    uint32_t        run_model[6][257];
+    uint32_t        range_model[257];
+    uint32_t        count_model[257];
+    uint32_t        fill_model[6];
+    uint32_t        sxy_model[4][17];
+    uint32_t        mv_model[2][513];
+    uint32_t        nbx, nby;
+    uint32_t        nbcount;
+    uint32_t       *blocks;
+    uint32_t        cbits;
+    int             cxshift;
+
+    PixelModel3     pixel_model3[3][4096];
+    RunModel3       run_model3[6];
+    RunModel3       range_model3;
+    RunModel3       count_model3;
+    FillModel3      fill_model3;
+    SxyModel3       sxy_model3[4];
+    MVModel3        mv_model3[2];
+    OpModel3        op_model3[6];
+
+    int           (*get_freq)(RangeCoder *rc, uint32_t total_freq, uint32_t *freq);
+    int           (*decode)(GetByteContext *gb, RangeCoder *rc, uint32_t cumFreq, uint32_t freq, uint32_t total_freq);
+} SCPRContext;
+
+static int decode_run_i(AVCodecContext *avctx, uint32_t ptype, int run,
+                        int *px, int *py, uint32_t clr, uint32_t *dst,
+                        int linesize, uint32_t *plx, uint32_t *ply,
+                        uint32_t backstep, int off, int *cx, int *cx1)
+{
+    uint32_t r, g, b;
+    int z;
+    int x = *px,
+        y = *py;
+    uint32_t lx = *plx,
+             ly = *ply;
+
+    if (y >= avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    switch (ptype) {
+    case 0:
+        while (run-- > 0) {
+            dst[y * linesize + x] = clr;
+            lx = x;
+            ly = y;
+            (x)++;
+            if (x >= avctx->width) {
+                x = 0;
+                (y)++;
+                if (y >= avctx->height && run)
+                    return AVERROR_INVALIDDATA;
+            }
+        }
+        break;
+    case 1:
+        while (run-- > 0) {
+            dst[y * linesize + x] = dst[ly * linesize + lx];
+            lx = x;
+            ly = y;
+            (x)++;
+            if (x >= avctx->width) {
+                x = 0;
+                (y)++;
+                if (y >= avctx->height && run)
+                    return AVERROR_INVALIDDATA;
+            }
+        }
+        clr = dst[ly * linesize + lx];
+        break;
+    case 2:
+        if (y < 1)
+            return AVERROR_INVALIDDATA;
+
+        while (run-- > 0) {
+            clr = dst[y * linesize + x + off + 1];
+            dst[y * linesize + x] = clr;
+            lx = x;
+            ly = y;
+            (x)++;
+            if (x >= avctx->width) {
+                x = 0;
+                (y)++;
+                if (y >= avctx->height && run)
+                    return AVERROR_INVALIDDATA;
+            }
+        }
+        break;
+    case 4:
+        if (y < 1 || (y == 1 && x == 0))
+            return AVERROR_INVALIDDATA;
+
+        while (run-- > 0) {
+            uint8_t *odst = (uint8_t *)dst;
+            int off1 = (ly * linesize + lx) * 4;
+            int off2 = ((y * linesize + x) + off) * 4;
+
+            if (x == 0) {
+                z = backstep * 4;
+            } else {
+                z = 0;
+            }
+
+            r = odst[off1] +
+                odst[off2 + 4] -
+                odst[off2 - z ];
+            g = odst[off1 + 1] +
+                odst[off2 + 5] -
+                odst[off2 - z  + 1];
+            b = odst[off1 + 2] +
+                odst[off2 + 6] -
+                odst[off2 - z  + 2];
+            clr = ((b & 0xFF) << 16) + ((g & 0xFF) << 8) + (r & 0xFF);
+            dst[y * linesize + x] = clr;
+            lx = x;
+            ly = y;
+            (x)++;
+            if (x >= avctx->width) {
+                x = 0;
+                (y)++;
+                if (y >= avctx->height && run)
+                    return AVERROR_INVALIDDATA;
+            }
+        }
+        break;
+    case 5:
+        if (y < 1 || (y == 1 && x == 0))
+            return AVERROR_INVALIDDATA;
+
+        while (run-- > 0) {
+            if (x == 0) {
+                z = backstep;
+            } else {
+                z = 0;
+            }
+
+            clr = dst[y * linesize + x + off - z];
+            dst[y * linesize + x] = clr;
+            lx = x;
+            ly = y;
+            (x)++;
+            if (x >= avctx->width) {
+                x = 0;
+                (y)++;
+                if (y >= avctx->height && run)
+                    return AVERROR_INVALIDDATA;
+            }
+        }
+        break;
+    }
+
+    *px = x;
+    *py = y;
+    *plx= lx;
+    *ply= ly;
+
+    if (avctx->bits_per_coded_sample == 16) {
+        *cx1 = (clr & 0x3F00) >> 2;
+        *cx = (clr & 0x3FFFFF) >> 16;
+    } else {
+        *cx1 = (clr & 0xFC00) >> 4;
+        *cx = (clr & 0xFFFFFF) >> 18;
+    }
+
+    return 0;
+}
+
+static int decode_run_p(AVCodecContext *avctx, uint32_t ptype, int run,
+                        int x, int y, uint32_t clr,
+                        uint32_t *dst, uint32_t *prev,
+                        int linesize, int plinesize,
+                        uint32_t *bx, uint32_t *by,
+                        uint32_t backstep, int sx1, int sx2,
+                        int *cx, int *cx1)
+{
+    uint32_t r, g, b;
+    int z;
+
+    switch (ptype) {
+    case 0:
+        while (run-- > 0) {
+            if (*by >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            dst[*by * linesize + *bx] = clr;
+            (*bx)++;
+            if (*bx >= x * 16 + sx2 || *bx >= avctx->width) {
+                *bx = x * 16 + sx1;
+                (*by)++;
+            }
+        }
+        break;
+    case 1:
+        while (run-- > 0) {
+            if (*bx == 0) {
+                if (*by < 1)
+                    return AVERROR_INVALIDDATA;
+                z = backstep;
+            } else {
+                z = 0;
+            }
+
+            if (*by >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            clr = dst[*by * linesize + *bx - 1 - z];
+            dst[*by * linesize + *bx] = clr;
+            (*bx)++;
+            if (*bx >= x * 16 + sx2 || *bx >= avctx->width) {
+                *bx = x * 16 + sx1;
+                (*by)++;
+            }
+        }
+        break;
+    case 2:
+        while (run-- > 0) {
+            if (*by < 1 || *by >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            clr = dst[(*by - 1) * linesize + *bx];
+            dst[*by * linesize + *bx] = clr;
+            (*bx)++;
+            if (*bx >= x * 16 + sx2 || *bx >= avctx->width) {
+                *bx = x * 16 + sx1;
+                (*by)++;
+            }
+        }
+        break;
+    case 3:
+        while (run-- > 0) {
+            if (*by >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            clr = prev[*by * plinesize + *bx];
+            dst[*by * linesize + *bx] = clr;
+            (*bx)++;
+            if (*bx >= x * 16 + sx2 || *bx >= avctx->width) {
+                *bx = x * 16 + sx1;
+                (*by)++;
+            }
+        }
+        break;
+    case 4:
+        while (run-- > 0) {
+            uint8_t *odst = (uint8_t *)dst;
+
+            if (*by < 1 || *by >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            if (*bx == 0) {
+                if (*by < 2)
+                    return AVERROR_INVALIDDATA;
+                z = backstep;
+            } else {
+                z = 0;
+            }
+
+            r = odst[((*by - 1) * linesize + *bx) * 4] +
+                odst[(*by * linesize + *bx - 1 - z) * 4] -
+                odst[((*by - 1) * linesize + *bx - 1 - z) * 4];
+            g = odst[((*by - 1) * linesize + *bx) * 4 + 1] +
+                odst[(*by * linesize + *bx - 1 - z) * 4 + 1] -
+                odst[((*by - 1) * linesize + *bx - 1 - z) * 4 + 1];
+            b = odst[((*by - 1) * linesize + *bx) * 4 + 2] +
+                odst[(*by * linesize + *bx - 1 - z) * 4 + 2] -
+                odst[((*by - 1) * linesize + *bx - 1 - z) * 4 + 2];
+            clr = ((b & 0xFF) << 16) + ((g & 0xFF) << 8) + (r & 0xFF);
+            dst[*by * linesize + *bx] = clr;
+            (*bx)++;
+            if (*bx >= x * 16 + sx2 || *bx >= avctx->width) {
+                *bx = x * 16 + sx1;
+                (*by)++;
+            }
+        }
+        break;
+    case 5:
+        while (run-- > 0) {
+            if (*by < 1 || *by >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            if (*bx == 0) {
+                if (*by < 2)
+                    return AVERROR_INVALIDDATA;
+                z = backstep;
+            } else {
+                z = 0;
+            }
+
+            clr = dst[(*by - 1) * linesize + *bx - 1 - z];
+            dst[*by * linesize + *bx] = clr;
+            (*bx)++;
+            if (*bx >= x * 16 + sx2 || *bx >= avctx->width) {
+                *bx = x * 16 + sx1;
+                (*by)++;
+            }
+        }
+        break;
+    }
+
+    if (avctx->bits_per_coded_sample == 16) {
+        *cx1 = (clr & 0x3F00) >> 2;
+        *cx = (clr & 0x3FFFFF) >> 16;
+    } else {
+        *cx1 = (clr & 0xFC00) >> 4;
+        *cx = (clr & 0xFFFFFF) >> 18;
+    }
+
+    return 0;
+}
+
+#endif /* AVCODEC_SCPR_H */
diff --git a/libavcodec/scpr3.c b/libavcodec/scpr3.c
new file mode 100644
index 0000000..b59a8cc
--- /dev/null
+++ b/libavcodec/scpr3.c
@@ -0,0 +1,1207 @@
+/*
+ * ScreenPressor version 3 decoder
+ *
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/qsort.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "scpr.h"
+
+static void renew_table3(uint32_t nsym, uint32_t *cntsum,
+                         uint16_t *freqs, uint16_t *freqs1,
+                         uint16_t *cnts, uint8_t *dectab)
+{
+    uint32_t a = 0, b = 4096 / nsym, c = b - (b >> 1);
+
+    *cntsum = c * nsym;
+
+    for (int d = 0; d < nsym; d++) {
+        freqs[d] = b;
+        freqs1[d] = a;
+        cnts[d] = c;
+        for (int q = a + 128 - 1 >> 7, f = (a + b - 1 >> 7) + 1; q < f; q++)
+            dectab[q] = d;
+
+        a += b;
+    }
+}
+
+static void reinit_tables3(SCPRContext * s)
+{
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 4096; j++) {
+            PixelModel3 *m = &s->pixel_model3[i][j];
+            m->type = 0;
+        }
+    }
+
+    for (int i = 0; i < 6; i++) {
+        renew_table3(256, &s->run_model3[i].cntsum,
+                     s->run_model3[i].freqs[0], s->run_model3[i].freqs[1],
+                     s->run_model3[i].cnts, s->run_model3[i].dectab);
+    }
+
+    renew_table3(256, &s->range_model3.cntsum,
+                 s->range_model3.freqs[0], s->range_model3.freqs[1],
+                 s->range_model3.cnts, s->range_model3.dectab);
+
+    renew_table3(5, &s->fill_model3.cntsum,
+                 s->fill_model3.freqs[0], s->fill_model3.freqs[1],
+                 s->fill_model3.cnts, s->fill_model3.dectab);
+
+    renew_table3(256, &s->count_model3.cntsum,
+                 s->count_model3.freqs[0], s->count_model3.freqs[1],
+                 s->count_model3.cnts, s->count_model3.dectab);
+
+    for (int i = 0; i < 4; i++) {
+        renew_table3(16, &s->sxy_model3[i].cntsum,
+                     s->sxy_model3[i].freqs[0], s->sxy_model3[i].freqs[1],
+                     s->sxy_model3[i].cnts, s->sxy_model3[i].dectab);
+    }
+
+    for (int i = 0; i < 2; i++) {
+        renew_table3(512, &s->mv_model3[i].cntsum,
+                     s->mv_model3[i].freqs[0], s->mv_model3[i].freqs[1],
+                     s->mv_model3[i].cnts, s->mv_model3[i].dectab);
+    }
+
+    for (int i = 0; i < 6; i++) {
+        renew_table3(6, &s->op_model3[i].cntsum,
+                     s->op_model3[i].freqs[0], s->op_model3[i].freqs[1],
+                     s->op_model3[i].cnts, s->op_model3[i].dectab);
+    }
+}
+
+static int decode3(GetByteContext *gb, RangeCoder *rc, uint32_t a, uint32_t b)
+{
+    uint32_t code = a * (rc->code >> 12) + (rc->code & 0xFFF) - b;
+
+    while (code < 0x800000 && bytestream2_get_bytes_left(gb) > 0)
+        code = bytestream2_get_byteu(gb) | (code << 8);
+    rc->code = code;
+
+    return 0;
+}
+
+static void rescale(PixelModel3 *m, int *totfr)
+{
+    uint32_t a;
+
+    a = 256 - m->size;
+    for (int b = 0; b < m->size; b++) {
+        m->freqs[b] -= m->freqs[b] >> 1;
+        a += m->freqs[b];
+    }
+
+    *totfr = a;
+}
+
+static int add_symbol(PixelModel3 *m, int index, uint32_t symbol, int *totfr, int max)
+{
+    if (m->size == max)
+        return 0;
+
+    for (int c = m->size - 1; c >= index; c--) {
+        m->symbols[c + 1] = m->symbols[c];
+        m->freqs[c + 1] = m->freqs[c];
+    }
+
+    m->symbols[index] = symbol;
+    m->freqs[index] = 50;
+    m->size++;
+
+    if (m->maxpos >= index)
+        m->maxpos++;
+
+    *totfr += 50;
+    if (*totfr + 50 > 4096)
+        rescale(m, totfr);
+
+    return 1;
+}
+
+static int decode_adaptive45(PixelModel3 *m, int rccode, uint32_t *value,
+                             uint16_t *a, uint16_t *b, uint32_t *c, int max)
+{
+    uint32_t q, g, maxpos, d, e = *c, totfr = *c;
+    int ret;
+
+    for (d = 0; e <= 2048; d++)
+        e <<= 1;
+    maxpos = m->maxpos;
+    rccode >>= d;
+    *c = m->freqs[maxpos];
+    m->freqs[maxpos] += 4096 - e >> d;
+
+    for (q = 0, g = 0, e = 0; q < m->size; q++) {
+        uint32_t f = m->symbols[q];
+        uint32_t p = e + f - g;
+        uint32_t k = m->freqs[q];
+
+        if (rccode < p) {
+            *value = rccode - e + g;
+            *b = rccode << d;
+            *a = 1 << d;
+            m->freqs[maxpos] = *c;
+            ret = add_symbol(m, q, *value, &totfr, max);
+            *c = totfr;
+            return ret;
+        }
+
+        if (p + k > rccode) {
+            *value = f;
+            e += *value - g;
+            *b = e << d;
+            *a = k << d;
+            m->freqs[maxpos] = *c;
+            m->freqs[q] += 50;
+            totfr += 50;
+            if ((q != maxpos) && (m->freqs[q] > m->freqs[maxpos]))
+                m->maxpos = q;
+            if (totfr + 50 > 4096)
+                rescale(m, &totfr);
+            *c = totfr;
+            return 1;
+        }
+
+        e += f - g + k;
+        g = f + 1;
+    }
+
+    m->freqs[maxpos] = *c;
+    *value = g + rccode - e;
+    *b = rccode << d;
+    *a = 1 << d;
+    ret = add_symbol(m, q, *value, &totfr, max);
+    *c = totfr;
+    return ret;
+}
+
+static int update_model6_to_7(PixelModel3 *m)
+{
+    PixelModel3 n = {0};
+    int c, d, e, f, k, p, length, i, j, index;
+    uint16_t *freqs, *freqs1, *cnts;
+
+    n.type = 7;
+
+    length = m->length;
+    freqs = n.freqs;
+    freqs1 = n.freqs1;
+    cnts = n.cnts;
+    n.cntsum = m->cnts[length];
+    for (i = 0; i < length; i++) {
+        if (!m->cnts[i])
+            continue;
+        index = m->symbols[i];
+        freqs[index] = m->freqs[2 * i];
+        freqs1[index] = m->freqs[2 * i + 1];
+        cnts[index] = m->cnts[i];
+    }
+    c = 1 << m->fshift;
+    d = c - (c >> 1);
+    for (j = 0, e = 0; j < 256; j++) {
+        f = freqs[j];
+        if (!f) {
+            f = c;
+            freqs[j] = c;
+            freqs1[j] = e;
+            cnts[j] = d;
+        }
+        p = (e + 127) >> 7;
+        k = ((f + e - 1) >> 7) + 1;
+        for (i = 0; i < k - p; i++)
+            n.dectab[p + i] = j;
+        e += f;
+    }
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static void calc_sum(PixelModel3 *m)
+{
+    uint32_t a;
+    int len;
+
+    len = m->length;
+    a = 256 - m->size << (m->fshift > 0 ? m->fshift - 1 : 0);
+    for (int c = 0; c < len; c++)
+        a += m->cnts[c];
+    m->cnts[len] = a;
+}
+
+static void rescale_dec(PixelModel3 *m)
+{
+    uint16_t cnts[256] = {0};
+    uint16_t freqs[512] = {0};
+    int b, c, e, g;
+    uint32_t a;
+
+    for (a = 1 << (0 < m->fshift ? m->fshift - 1 : 0), b = 0; b < 256; b++)
+        cnts[b] = a;
+
+    for (a = 0, b = m->size; a < b; a++)
+        cnts[m->symbols[a]] = m->cnts[a];
+
+    for (b = a = 0; b < 256; b++) {
+        freqs[2 * b] = cnts[b];
+        freqs[2 * b + 1] = a;
+        a += cnts[b];
+    }
+
+    if (m->fshift > 0)
+        m->fshift--;
+
+    a = 256 - m->size << (0 < m->fshift ? m->fshift - 1 : 0);
+    for (b = 0, c = m->size; b < c; b++) {
+        m->cnts[b] -= m->cnts[b] >> 1;
+        a = a + m->cnts[b];
+        e = m->symbols[b];
+        g = freqs[2 * e + 1];
+        m->freqs[2 * b] = freqs[2 * e];
+        m->freqs[2 * b + 1] = g;
+    }
+    m->cnts[m->length] = a;
+}
+
+static int update_model5_to_6(PixelModel3 *m, uint8_t value)
+{
+    PixelModel3 n = {0};
+    int c, d, e, f, g, k, q, p;
+
+    n.type = 6;
+    n.length = 32;
+
+    for (c = m->size, d = 256 - c, e = 0; e < c; e++)
+        d = d + m->freqs[e];
+
+    for (e = 0; d <= 2048; e++)
+        d <<= 1;
+
+    for (q = d = 0, g = q = 0; g < c; g++) {
+        p = m->symbols[g];
+        d = d + (p - q);
+        q = m->freqs[g];
+        k = q << e;
+        n.freqs[2 * g] = k;
+        n.freqs[2 * g + 1] = d << e;
+        n.cnts[g] = k - (k >> 1);
+        n.symbols[g] = p;
+        d += q;
+        q = p + 1;
+    }
+
+    n.fshift = e;
+    e = 1 << n.fshift;
+    d = 0;
+    if (value > 0) {
+        d = -1;
+        for (p = f = g = 0; p < c; p++) {
+            k = n.symbols[p];
+            if (k > d && k < value) {
+                d = k;
+                g = n.freqs[2 * p];
+                f = n.freqs[2 * p + 1];
+            }
+        }
+        d = 0 < g ? f + g + (value - d - 1 << n.fshift) : value << n.fshift;
+    }
+    n.freqs[2 * c] = e;
+    n.freqs[2 * c + 1] = d;
+    n.cnts[c] = e - (e >> 1);
+    n.symbols[c] = value;
+    n.size = c + 1;
+    e = 25 << n.fshift;
+    n.cnts[c] += e;
+    n.cnts[32] += e;
+    if (n.cnts[32] + e > 4096)
+        rescale_dec(&n);
+
+    calc_sum(&n);
+    for (c = 0, e = n.size - 1; c < e; c++) {
+        for (g = c + 1, f = n.size; g < f; g++) {
+            if (q = n.freqs[2 * g], k = n.freqs[2 * c], q > k) {
+                int l = n.freqs[2 * c + 1];
+                int h = n.freqs[2 * g + 1];
+                n.freqs[2 * c] = q;
+                n.freqs[2 * c + 1] = h;
+                n.freqs[2 * g] = k;
+                n.freqs[2 * g + 1] = l;
+                FFSWAP(uint16_t, n.cnts[c], n.cnts[g]);
+                FFSWAP(uint8_t, n.symbols[c], n.symbols[g]);
+            }
+        }
+    }
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static void grow_dec(PixelModel3 *m)
+{
+    int a;
+
+    a = 2 * m->length;
+    m->cnts[2 * m->length] = m->cnts[m->length];
+    m->length = a;
+}
+
+static int add_dec(PixelModel3 *m, int sym, int f1, int f2)
+{
+    int size;
+
+    if (m->size >= 40 || m->size >= m->length)
+        return -1;
+
+    size = m->size;
+    m->symbols[size] = sym;
+    m->freqs[2 * size] = f1;
+    m->freqs[2 * size + 1] = f2;
+    m->cnts[size] = f1 - (f1 >> 1);
+    m->size++;
+
+    return size;
+}
+
+static void incr_cntdec(PixelModel3 *m, int a)
+{
+    int b, len, d, e, g;
+
+    b = 25 << m->fshift;
+    len = m->length;
+    m->cnts[a] += b;
+    m->cnts[len] += b;
+    if (a > 0 && m->cnts[a] > m->cnts[a - 1]) {
+        FFSWAP(uint16_t, m->cnts[a], m->cnts[a - 1]);
+        d = m->freqs[2 * a];
+        e = m->freqs[2 * a + 1];
+        g = m->freqs[2 * (a - 1) + 1];
+        m->freqs[2 * a] = m->freqs[2 * (a - 1)];
+        m->freqs[2 * a + 1] = g;
+        g = a - 1;
+        m->freqs[2 * g] = d;
+        m->freqs[2 * g + 1] = e;
+        FFSWAP(uint8_t, m->symbols[a], m->symbols[a - 1]);
+    }
+
+    if (m->cnts[len] + b > 4096)
+        rescale_dec(m);
+}
+
+static int decode_adaptive6(PixelModel3 *m, uint32_t code, uint32_t *value,
+                            uint16_t *a, uint16_t *b)
+{
+    int c, d, e, f, g, q;
+
+    for (c = 0, d = 0, e = 0, f = 0, g = 0, q = m->size; g < q; g++) {
+        uint32_t p = m->freqs[2 * g + 1];
+
+        if (p <= code) {
+            uint32_t k = m->freqs[2 * g];
+
+            if (p + k > code) {
+                *value = m->symbols[g];
+                *a = k;
+                *b = p;
+                incr_cntdec(m, g);
+                return 1;
+            }
+
+            if (p >= d) {
+                c = k;
+                d = p;
+                e = m->symbols[g];
+            }
+        }
+    }
+
+    g = 1 << m->fshift;
+    q = f = 0;
+
+    if (c > 0) {
+        f = code - (d + c) >> m->fshift;
+        q = f + e + 1;
+        f = d + c + (f << m->fshift);
+    } else {
+        q = code >> m->fshift;
+        f = q << m->fshift;
+    }
+
+    *a = g;
+    *b = f;
+    *value = q;
+
+    c = add_dec(m, q, g, f);
+    if (c < 0) {
+        if (m->length == 64)
+            return 0;
+        grow_dec(m);
+        c = add_dec(m, q, g, f);
+    }
+
+    incr_cntdec(m, c);
+    return 1;
+}
+
+static int cmpbytes(const void *p1, const void *p2)
+{
+    int left  = *(const uint8_t *)p1;
+    int right = *(const uint8_t *)p2;
+    return FFDIFFSIGN(left, right);
+}
+
+static int update_model1_to_2(PixelModel3 *m, uint32_t val)
+{
+    PixelModel3 n = {0};
+    int i, b;
+
+    n.type = 2;
+    n.size = m->size + 1;
+    b = m->size;
+    for (i = 0; i < b; i++)
+        n.symbols[i] = m->symbols[i];
+    n.symbols[b] = val;
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int update_model1_to_4(PixelModel3 *m, uint32_t val)
+{
+    PixelModel3 n = {0};
+    int size, i;
+
+    size = m->size;
+    n.type = 4;
+    n.size = size;
+    for (i = 0; i < n.size; i++) {
+        n.symbols[i] = m->symbols[i];
+    }
+    AV_QSORT(n.symbols, size, uint8_t, cmpbytes);
+    for (i = 0; i < n.size; i++) {
+        if (val == n.symbols[i]) {
+            n.freqs[i] = 100;
+            n.maxpos = i;
+        } else {
+            n.freqs[i] = 50;
+        }
+    }
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int update_model1_to_5(PixelModel3 *m, uint32_t val)
+{
+    PixelModel3 n = {0};
+    int i, size, freqs;
+    uint32_t a;
+
+    size = m->size;
+    n.size = size;
+    for (i = 0; i < size; i++) {
+        n.symbols[i] = m->symbols[i];
+    }
+    AV_QSORT(n.symbols, size, uint8_t, cmpbytes);
+    size = n.size;
+    for (i = 0; i < size; i++) {
+        if (val == n.symbols[i]) {
+            n.freqs[i] = 100;
+            n.maxpos = i;
+        } else {
+            n.freqs[i] = 50;
+        }
+    }
+    a = 256 - size;
+    for (i = 0; i < size; i++, a += freqs)
+        freqs = n.freqs[i];
+    n.type = 5;
+    n.cntsum = a;
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int decode_static1(PixelModel3 *m, uint32_t val)
+{
+    uint32_t size;
+
+    size = m->size;
+    for (int i = 0; i < size; i++) {
+        if (val == m->symbols[i]) {
+            if (size <= 4)
+                return update_model1_to_4(m, val);
+            else
+                return update_model1_to_5(m, val);
+        }
+    }
+
+    if (size >= 14)
+        return update_model1_to_2(m, val);
+
+    m->symbols[size] = val;
+    m->size++;
+    return 0;
+}
+
+static int update_model2_to_6(PixelModel3 *m, uint8_t value, int a4)
+{
+    PixelModel3 n = {0};
+    int c, d, e, f, g, q;
+
+    n.type = 6;
+    n.length = a4;
+
+    memset(n.symbols, 1u, a4);
+
+    c = m->size;
+    d = 256 - c + (64 * c + 64);
+    for (e = 0; d <= 2048; e++) {
+        d <<= 1;
+    }
+
+    g = q = 0;
+    AV_QSORT(m->symbols, c, uint8_t, cmpbytes);
+    for (f = d = 0; f < c; f++) {
+        int p = f;
+        int k = m->symbols[p];
+        int l;
+        g = g + (k - q);
+
+        if (k == value) {
+            d = p;
+            q = 128;
+        } else {
+            q = 64;
+        }
+        l = q << e;
+        n.freqs[2 * p] = l;
+        n.freqs[2 * p + 1] = g << e;
+        n.symbols[p] = k;
+        n.cnts[p] = l - (l >> 1);
+        g += q;
+        q = k + 1;
+    }
+    n.size = c;
+    n.fshift = e;
+    calc_sum(&n);
+
+    if (d > 0) {
+        c = n.freqs[0];
+        e = n.freqs[1];
+        g = n.freqs[2 * d + 1];
+        n.freqs[0] = n.freqs[2 * d];
+        n.freqs[1] = g;
+        n.freqs[2 * d] = c;
+        n.freqs[2 * d + 1] = e;
+        FFSWAP(uint16_t, n.cnts[0], n.cnts[d]);
+        FFSWAP(uint8_t, n.symbols[0], n.symbols[d]);
+    }
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int update_model2_to_3(PixelModel3 *m, uint32_t val)
+{
+    PixelModel3 n = {0};
+    uint32_t size;
+
+    n.type = 3;
+    n.size = m->size + 1;
+
+    size = m->size;
+    for (int i = 0; i < size; i++)
+        n.symbols[i] = m->symbols[i];
+    n.symbols[size] = val;
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int decode_static2(PixelModel3 *m, uint32_t val)
+{
+    uint32_t size;
+
+    size = m->size;
+    for (int i = 0; i < size; i++) {
+        if (val == m->symbols[i]) {
+            int a;
+
+            if (m->size <= 32)
+                a = 32;
+            else
+                a = 64;
+            return update_model2_to_6(m, val, a);
+        }
+    }
+
+    if (size >= 64)
+        return update_model2_to_3(m, val);
+
+    m->symbols[size] = val;
+    m->size++;
+
+    return 0;
+}
+
+static int update_model3_to_7(PixelModel3 *m, uint8_t value)
+{
+    PixelModel3 n = {0};
+    int c, d, e, f, g, q;
+
+    n.type = 7;
+
+    for (c = 0; c < 256; c++) {
+        d = c;
+        n.freqs[d] = 1;
+        n.cnts[d] = 1;
+    }
+
+    for (c = m->size, d = (4096 - (256 - c)) / (c + 1) | 0, e = d - (d >> 1), g = 0; g < c;) {
+        q = g++;
+        q = m->symbols[q];
+        n.freqs[q] = d;
+        n.cnts[q] = e;
+    }
+    n.freqs[value] += d;
+    n.cnts[value] += 16;
+    for (d = c = n.cntsum = 0; 256 > d; d++) {
+        e = d;
+        n.cntsum += n.cnts[e];
+        n.freqs1[e] = c;
+        for (g = n.freqs[e], q = c + 128 - 1 >> 7, f = (c + g - 1 >> 7) + 1; q < f; q++) {
+            n.dectab[q] = e;
+        }
+        c += g;
+    }
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int decode_static3(PixelModel3 *m, uint32_t val)
+{
+    uint32_t size = m->size;
+
+    for (int i = 0; i < size; i++) {
+        if (val == m->symbols[i])
+            return update_model3_to_7(m, val);
+    }
+
+    if (size >= 256)
+        return 0;
+
+    m->symbols[size] = val;
+    m->size++;
+    return 0;
+}
+
+static void sync_code3(GetByteContext *gb, RangeCoder *rc)
+{
+    rc->code1++;
+    if (rc->code1 == 0x20000) {
+        rc->code = bytestream2_get_le32(gb);
+        rc->code1 = 0;
+    }
+}
+
+static int decode_value3(SCPRContext *s, uint32_t max, uint32_t *cntsum,
+                         uint16_t *freqs1, uint16_t *freqs2,
+                         uint16_t *cnts, uint8_t *dectable,
+                         uint32_t *value)
+{
+    GetByteContext *gb = &s->gb;
+    RangeCoder *rc = &s->rc;
+    uint32_t r, y, a, b, e, g, q;
+
+    r = dectable[(rc->code & 0xFFFu) >> 7];
+    if (r < max) {
+        while (freqs2[r + 1] <= (rc->code & 0xFFF)) {
+            if (++r >= max)
+                break;
+        }
+    }
+
+    if (r > max)
+        return AVERROR_INVALIDDATA;
+
+    cnts[r] += 16;
+    a = freqs1[r];
+    b = freqs2[r];
+    *cntsum += 16;
+    if (*cntsum + 16 > 4096) {
+        *cntsum = 0;
+        for (int c = 0, i = 0; i < max + 1; i++) {
+            e = cnts[i];
+            freqs2[i] = c;
+            freqs1[i] = e;
+            g = (c + 127) >> 7;
+            c += e;
+            q = ((c - 1) >> 7) + 1;
+            if (q > g) {
+                for (int j = 0; j < q - g; j++)
+                    dectable[j + g] = i;
+            }
+            y = e - (e >> 1);
+            cnts[i] = y;
+            *cntsum += y;
+        }
+    }
+
+    decode3(gb, rc, a, b);
+    sync_code3(gb, rc);
+
+    *value = r;
+
+    return 0;
+}
+
+static void calc_sum5(PixelModel3 *m)
+{
+    uint32_t a;
+
+    a = 256 - m->size;
+    for (int b = 0; b < m->size; b++)
+        a += m->freqs[b];
+    m->cntsum = a;
+}
+
+static int update_model4_to_5(PixelModel3 *m, uint32_t value)
+{
+    PixelModel3 n = {0};
+    int c, e, g, totfr;
+
+    n.type = 5;
+
+    for (c = 0, e = 0; c < m->size && m->symbols[c] < value; c++) {
+        n.symbols[c] = m->symbols[c];
+        e += n.freqs[c] = m->freqs[c];
+    }
+
+    g = c;
+    n.symbols[g] = value;
+    e += n.freqs[g++] = 50;
+    for (; c < m->size; g++, c++) {
+        n.symbols[g] = m->symbols[c];
+        e += n.freqs[g] = m->freqs[c];
+    }
+    n.size = m->size + 1;
+    if (e > 4096)
+        rescale(&n, &totfr);
+
+    calc_sum5(&n);
+
+    memcpy(m, &n, sizeof(n));
+
+    return 0;
+}
+
+static int decode_unit3(SCPRContext *s, PixelModel3 *m, uint32_t code, uint32_t *value)
+{
+    GetByteContext *gb = &s->gb;
+    RangeCoder *rc = &s->rc;
+    uint16_t a = 0, b = 0;
+    uint32_t param;
+    int type;
+
+    type = m->type;
+    switch (type) {
+    case 0:
+        *value = bytestream2_get_byte(&s->gb);
+        m->type = 1;
+        m->size = 1;
+        m->symbols[0] = *value;
+        sync_code3(gb, rc);
+        break;
+    case 1:
+        *value = bytestream2_get_byte(&s->gb);
+        decode_static1(m, *value);
+        sync_code3(gb, rc);
+        break;
+    case 2:
+        *value = bytestream2_get_byte(&s->gb);
+        decode_static2(m, *value);
+        sync_code3(gb, rc);
+        break;
+    case 3:
+        *value = bytestream2_get_byte(&s->gb);
+        decode_static3(m, *value);
+        sync_code3(gb, rc);
+        break;
+    case 4:
+        param = m->freqs[0] + m->freqs[1] + m->freqs[2] + m->freqs[3] + 256 - m->size;
+        if (!decode_adaptive45(m, code, value, &a, &b, &param, 4))
+            update_model4_to_5(m, *value);
+        decode3(gb, rc, a, b);
+        sync_code3(gb, rc);
+        break;
+    case 5:
+        if (!decode_adaptive45(m, code, value, &a, &b, &m->cntsum, 16))
+            update_model5_to_6(m, *value);
+        decode3(gb, rc, a, b);
+        sync_code3(gb, rc);
+        break;
+    case 6:
+        if (!decode_adaptive6(m, code, value, &a, &b)) {
+            update_model6_to_7(m);
+        }
+        decode3(gb, rc, a, b);
+        sync_code3(gb, rc);
+        break;
+    case 7:
+        return decode_value3(s, 255, &m->cntsum,
+                             m->freqs, m->freqs1,
+                             m->cnts, m->dectab, value);
+    }
+
+    if (*value > 255)
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+static int decode_units3(SCPRContext * s, uint32_t *red,
+                         uint32_t *green, uint32_t *blue,
+                         int *cx, int *cx1)
+{
+    RangeCoder *rc = &s->rc;
+    int ret;
+
+    ret = decode_unit3(s, &s->pixel_model3[0][*cx + *cx1], rc->code & 0xFFF, red);
+    if (ret < 0)
+        return ret;
+
+    *cx1 = (*cx << 6) & 0xFC0;
+    *cx = *red >> 2;
+
+    ret = decode_unit3(s, &s->pixel_model3[1][*cx + *cx1], rc->code & 0xFFF, green);
+    if (ret < 0)
+        return ret;
+
+    *cx1 = (*cx << 6) & 0xFC0;
+    *cx = *green >> 2;
+
+    ret = decode_unit3(s, &s->pixel_model3[2][*cx + *cx1], rc->code & 0xFFF, blue);
+    if (ret < 0)
+        return ret;
+
+    *cx1 = (*cx << 6) & 0xFC0;
+    *cx = *blue >> 2;
+
+    return 0;
+}
+
+static void init_rangecoder3(RangeCoder *rc, GetByteContext *gb)
+{
+    rc->code  = bytestream2_get_le32(gb);
+    rc->code1 = 0;
+}
+
+static int decompress_i3(AVCodecContext *avctx, uint32_t *dst, int linesize)
+{
+    SCPRContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    RangeCoder *rc = &s->rc;
+    int cx = 0, cx1 = 0, k = 0;
+    int run, off, y = 0, x = 0, ret;
+    uint32_t backstep = linesize - avctx->width;
+    uint32_t clr = 0, lx, ly, ptype, r, g, b;
+
+    bytestream2_skip(gb, 1);
+    init_rangecoder3(rc, gb);
+    reinit_tables3(s);
+
+    while (k < avctx->width + 1) {
+        ret = decode_units3(s, &r, &g, &b, &cx, &cx1);
+        if (ret < 0)
+            return ret;
+        ret = decode_value3(s, 255, &s->run_model3[0].cntsum,
+                            s->run_model3[0].freqs[0],
+                            s->run_model3[0].freqs[1],
+                            s->run_model3[0].cnts,
+                            s->run_model3[0].dectab, &run);
+        if (ret < 0)
+            return ret;
+        if (run <= 0)
+            return AVERROR_INVALIDDATA;
+
+        clr = (b << 16) + (g << 8) + r;
+        k += run;
+        while (run-- > 0) {
+            if (y >= avctx->height)
+                return AVERROR_INVALIDDATA;
+
+            dst[y * linesize + x] = clr;
+            lx = x;
+            ly = y;
+            x++;
+            if (x >= avctx->width) {
+                x = 0;
+                y++;
+            }
+        }
+    }
+    off = -linesize - 1;
+    ptype = 0;
+
+    while (x < avctx->width && y < avctx->height) {
+        ret = decode_value3(s, 5, &s->op_model3[ptype].cntsum,
+                            s->op_model3[ptype].freqs[0],
+                            s->op_model3[ptype].freqs[1],
+                            s->op_model3[ptype].cnts,
+                            s->op_model3[ptype].dectab, &ptype);
+        if (ret < 0)
+            return ret;
+        if (ptype == 0) {
+            ret = decode_units3(s, &r, &g, &b, &cx, &cx1);
+            if (ret < 0)
+                return ret;
+            clr = (b << 16) + (g << 8) + r;
+        }
+        if (ptype > 5)
+            return AVERROR_INVALIDDATA;
+        ret = decode_value3(s, 255, &s->run_model3[ptype].cntsum,
+                            s->run_model3[ptype].freqs[0],
+                            s->run_model3[ptype].freqs[1],
+                            s->run_model3[ptype].cnts,
+                            s->run_model3[ptype].dectab, &run);
+        if (ret < 0)
+            return ret;
+        if (run <= 0)
+            return AVERROR_INVALIDDATA;
+
+        ret = decode_run_i(avctx, ptype, run, &x, &y, clr,
+                           dst, linesize, &lx, &ly,
+                           backstep, off, &cx, &cx1);
+        if (run < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+static int decompress_p3(AVCodecContext *avctx,
+                         uint32_t *dst, int linesize,
+                         uint32_t *prev, int plinesize)
+{
+    SCPRContext *s = avctx->priv_data;
+    GetByteContext *gb = &s->gb;
+    int ret, temp, min, max, x, y, cx = 0, cx1 = 0;
+    int backstep = linesize - avctx->width;
+    int mvx = 0, mvy = 0;
+
+    if (bytestream2_get_byte(gb) == 0)
+        return 1;
+    init_rangecoder3(&s->rc, gb);
+
+    ret  = decode_value3(s, 255, &s->range_model3.cntsum,
+                         s->range_model3.freqs[0],
+                         s->range_model3.freqs[1],
+                         s->range_model3.cnts,
+                         s->range_model3.dectab, &min);
+    ret |= decode_value3(s, 255, &s->range_model3.cntsum,
+                         s->range_model3.freqs[0],
+                         s->range_model3.freqs[1],
+                         s->range_model3.cnts,
+                         s->range_model3.dectab, &temp);
+    min += temp << 8;
+    ret |= decode_value3(s, 255, &s->range_model3.cntsum,
+                         s->range_model3.freqs[0],
+                         s->range_model3.freqs[1],
+                         s->range_model3.cnts,
+                         s->range_model3.dectab, &max);
+    ret |= decode_value3(s, 255, &s->range_model3.cntsum,
+                         s->range_model3.freqs[0],
+                         s->range_model3.freqs[1],
+                         s->range_model3.cnts,
+                         s->range_model3.dectab, &temp);
+    if (ret < 0)
+        return ret;
+
+    max += temp << 8;
+    if (min > max || min >= s->nbcount)
+        return AVERROR_INVALIDDATA;
+
+    memset(s->blocks, 0, sizeof(*s->blocks) * s->nbcount);
+
+    while (min <= max) {
+        int fill, count;
+
+        ret  = decode_value3(s, 4, &s->fill_model3.cntsum,
+                             s->fill_model3.freqs[0],
+                             s->fill_model3.freqs[1],
+                             s->fill_model3.cnts,
+                             s->fill_model3.dectab, &fill);
+        ret |= decode_value3(s, 255, &s->count_model3.cntsum,
+                             s->count_model3.freqs[0],
+                             s->count_model3.freqs[1],
+                             s->count_model3.cnts,
+                             s->count_model3.dectab, &count);
+        if (ret < 0)
+            return ret;
+        if (count <= 0)
+            return AVERROR_INVALIDDATA;
+
+        while (min < s->nbcount && count-- > 0) {
+            s->blocks[min++] = fill;
+        }
+    }
+
+    for (y = 0; y < s->nby; y++) {
+        for (x = 0; x < s->nbx; x++) {
+            int sy1 = 0, sy2 = 16, sx1 = 0, sx2 = 16;
+
+            if (s->blocks[y * s->nbx + x] == 0)
+                continue;
+
+            if (((s->blocks[y * s->nbx + x] + 1) & 1) > 0) {
+                ret  = decode_value3(s, 15, &s->sxy_model3[0].cntsum,
+                                     s->sxy_model3[0].freqs[0],
+                                     s->sxy_model3[0].freqs[1],
+                                     s->sxy_model3[0].cnts,
+                                     s->sxy_model3[0].dectab, &sx1);
+                ret |= decode_value3(s, 15, &s->sxy_model3[1].cntsum,
+                                     s->sxy_model3[1].freqs[0],
+                                     s->sxy_model3[1].freqs[1],
+                                     s->sxy_model3[1].cnts,
+                                     s->sxy_model3[1].dectab, &sy1);
+                ret |= decode_value3(s, 15, &s->sxy_model3[2].cntsum,
+                                     s->sxy_model3[2].freqs[0],
+                                     s->sxy_model3[2].freqs[1],
+                                     s->sxy_model3[2].cnts,
+                                     s->sxy_model3[2].dectab, &sx2);
+                ret |= decode_value3(s, 15, &s->sxy_model3[3].cntsum,
+                                     s->sxy_model3[3].freqs[0],
+                                     s->sxy_model3[3].freqs[1],
+                                     s->sxy_model3[3].cnts,
+                                     s->sxy_model3[3].dectab, &sy2);
+                if (ret < 0)
+                    return ret;
+
+                sx2++;
+                sy2++;
+            }
+            if (((s->blocks[y * s->nbx + x] + 3) & 2) > 0) {
+                int i, a, b, c, j, by = y * 16, bx = x * 16;
+                uint32_t code;
+
+                a = s->rc.code & 0xFFF;
+                c = 1;
+
+                if (a < 0x800)
+                    c = 0;
+                b = 2048;
+                if (!c)
+                    b = 0;
+
+                code = a + ((s->rc.code >> 1) & 0xFFFFF800) - b;
+                while (code < 0x800000 && bytestream2_get_bytes_left(gb) > 0)
+                    code = bytestream2_get_byteu(gb) | (code << 8);
+                s->rc.code = code;
+
+                sync_code3(gb, &s->rc);
+
+                if (!c) {
+                    ret  = decode_value3(s, 511, &s->mv_model3[0].cntsum,
+                                         s->mv_model3[0].freqs[0],
+                                         s->mv_model3[0].freqs[1],
+                                         s->mv_model3[0].cnts,
+                                         s->mv_model3[0].dectab, &mvx);
+                    ret |= decode_value3(s, 511, &s->mv_model3[1].cntsum,
+                                         s->mv_model3[1].freqs[0],
+                                         s->mv_model3[1].freqs[1],
+                                         s->mv_model3[1].cnts,
+                                         s->mv_model3[1].dectab, &mvy);
+                    if (ret < 0)
+                        return ret;
+
+                    mvx -= 256;
+                    mvy -= 256;
+                }
+
+                if (by + mvy + sy1 < 0 || bx + mvx + sx1 < 0 ||
+                    by + mvy + sy1 >= avctx->height || bx + mvx + sx1 >= avctx->width)
+                    return AVERROR_INVALIDDATA;
+
+                for (i = 0; i < sy2 - sy1 && (by + sy1 + i) < avctx->height && (by + mvy + sy1 + i) < avctx->height; i++) {
+                    for (j = 0; j < sx2 - sx1 && (bx + sx1 + j) < avctx->width && (bx + mvx + sx1 + j) < avctx->width; j++) {
+                        dst[(by + i + sy1) * linesize + bx + sx1 + j] = prev[(by + mvy + sy1 + i) * plinesize + bx + sx1 + mvx + j];
+                    }
+                }
+            } else {
+                int run, bx = x * 16 + sx1, by = y * 16 + sy1;
+                uint32_t clr, ptype = 0, r, g, b;
+
+                for (; by < y * 16 + sy2 && by < avctx->height;) {
+                    ret = decode_value3(s, 5, &s->op_model3[ptype].cntsum,
+                                        s->op_model3[ptype].freqs[0],
+                                        s->op_model3[ptype].freqs[1],
+                                        s->op_model3[ptype].cnts,
+                                        s->op_model3[ptype].dectab, &ptype);
+                    if (ret < 0)
+                        return ret;
+                    if (ptype == 0) {
+                        ret = decode_units3(s, &r, &g, &b, &cx, &cx1);
+                        if (ret < 0)
+                            return ret;
+
+                        clr = (b << 16) + (g << 8) + r;
+                    }
+                    if (ptype > 5)
+                        return AVERROR_INVALIDDATA;
+                    ret = decode_value3(s, 255, &s->run_model3[ptype].cntsum,
+                                        s->run_model3[ptype].freqs[0],
+                                        s->run_model3[ptype].freqs[1],
+                                        s->run_model3[ptype].cnts,
+                                        s->run_model3[ptype].dectab, &run);
+                    if (ret < 0)
+                        return ret;
+                    if (run <= 0)
+                        return AVERROR_INVALIDDATA;
+
+                    ret = decode_run_p(avctx, ptype, run, x, y, clr,
+                                       dst, prev, linesize, plinesize, &bx, &by,
+                                       backstep, sx1, sx2, &cx, &cx1);
+                    if (ret < 0)
+                        return ret;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/libavcodec/scpr3.h b/libavcodec/scpr3.h
new file mode 100644
index 0000000..92ad968
--- /dev/null
+++ b/libavcodec/scpr3.h
@@ -0,0 +1,82 @@
+/*
+ * ScreenPressor version 3 decoder
+ *
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SCPR3_H
+#define AVCODEC_SCPR3_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "internal.h"
+
+typedef struct PixelModel3 {
+    uint8_t    type;
+    uint8_t    length;
+    uint8_t    maxpos;
+    uint8_t    fshift;
+    uint16_t   size;
+    uint32_t   cntsum;
+    uint8_t    symbols[256];
+    uint16_t   freqs[256];
+    uint16_t   freqs1[256];
+    uint16_t   cnts[256];
+    uint8_t    dectab[32];
+} PixelModel3;
+
+typedef struct FillModel3 {
+    uint32_t   cntsum;
+    uint16_t   freqs[2][5];
+    uint16_t   cnts[5];
+    uint8_t    dectab[32];
+} FillModel3;
+
+typedef struct OpModel3 {
+    uint32_t   cntsum;
+    uint16_t   freqs[2][6];
+    uint16_t   cnts[6];
+    uint8_t    dectab[32];
+} OpModel3;
+
+typedef struct RunModel3 {
+    uint32_t   cntsum;
+    uint16_t   freqs[2][256];
+    uint16_t   cnts[256];
+    uint8_t    dectab[32];
+} RunModel3;
+
+typedef struct SxyModel3 {
+    uint32_t   cntsum;
+    uint16_t   freqs[2][16];
+    uint16_t   cnts[16];
+    uint8_t    dectab[32];
+} SxyModel3;
+
+typedef struct MVModel3 {
+    uint32_t   cntsum;
+    uint16_t   freqs[2][512];
+    uint16_t   cnts[512];
+    uint8_t    dectab[32];
+} MVModel3;
+
+#endif /* AVCODEC_SCPR3_H */
diff --git a/libavcodec/screenpresso.c b/libavcodec/screenpresso.c
index eae0ae7..fb8bfd4 100644
--- a/libavcodec/screenpresso.c
+++ b/libavcodec/screenpresso.c
@@ -2,20 +2,20 @@
  * Screenpresso decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
  * rebuilt frame (not the reference), and since there is no coordinate system
  * they contain exactly as many pixel as the keyframe.
  *
- * Supports: BGRA, BGR24, RGB555
+ * Supports: BGR0, BGR24, RGB555
  */
 
 #include <stdint.h>
@@ -129,7 +129,7 @@ static int screenpresso_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
         break;
     case 4:
-        avctx->pix_fmt = AV_PIX_FMT_BGRA;
+        avctx->pix_fmt = AV_PIX_FMT_BGR0;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Invalid bits per pixel value (%d)\n",
@@ -179,7 +179,7 @@ static int screenpresso_decode_frame(AVCodecContext *avctx, void *data,
     }
     *got_frame = 1;
 
-    return 0;
+    return avpkt->size;
 }
 
 AVCodec ff_screenpresso_decoder = {
diff --git a/libavcodec/sgi.h b/libavcodec/sgi.h
index 3c47d3a..5ec891e 100644
--- a/libavcodec/sgi.h
+++ b/libavcodec/sgi.h
@@ -2,20 +2,20 @@
  * SGI image encoder
  * Xiaohui Sun <tjnksxh@hotmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sgidec.c b/libavcodec/sgidec.c
index 6f93a30..02ad1e1 100644
--- a/libavcodec/sgidec.c
+++ b/libavcodec/sgidec.c
@@ -2,24 +2,25 @@
  * SGI image decoder
  * Todd Kirby <doubleshot@pacbell.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -41,7 +42,7 @@ typedef struct SgiState {
  * @param out_buf Points to one line after the output buffer.
  * @param len length of out_buf in bytes
  * @param pixelstride pixel stride of input buffer
- * @return size of output in bytes, -1 if buffer overflows
+ * @return size of output in bytes, else return error code.
  */
 static int expand_rle_row8(SgiState *s, uint8_t *out_buf,
                            int len, int pixelstride)
@@ -59,7 +60,7 @@ static int expand_rle_row8(SgiState *s, uint8_t *out_buf,
         }
 
         /* Check for buffer overflow. */
-        if (pixelstride * (count - 1) >= len) {
+        if (out_end - out_buf <= pixelstride * (count - 1)) {
             av_log(s->avctx, AV_LOG_ERROR, "Invalid pixel count.\n");
             return AVERROR_INVALIDDATA;
         }
@@ -97,7 +98,7 @@ static int expand_rle_row16(SgiState *s, uint16_t *out_buf,
             break;
 
         /* Check for buffer overflow. */
-        if (pixelstride * (count - 1) >= len) {
+        if (out_end - out_buf <= pixelstride * (count - 1)) {
             av_log(s->avctx, AV_LOG_ERROR, "Invalid pixel count.\n");
             return AVERROR_INVALIDDATA;
         }
@@ -125,7 +126,7 @@ static int expand_rle_row16(SgiState *s, uint16_t *out_buf,
  * Read a run length encoded SGI image.
  * @param out_buf output buffer
  * @param s the current image state
- * @return 0 if no error, else return error number.
+ * @return 0 if no error, else return error code.
  */
 static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
 {
@@ -144,7 +145,7 @@ static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
     for (z = 0; z < s->depth; z++) {
         dest_row = out_buf;
         for (y = 0; y < s->height; y++) {
-            linesize = s->width * s->depth * s->bytes_per_channel;
+            linesize = s->width * s->depth;
             dest_row -= s->linesize;
             start_offset = bytestream2_get_be32(&g_table);
             bytestream2_seek(&s->g, start_offset, SEEK_SET);
@@ -163,7 +164,7 @@ static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
  * Read an uncompressed SGI image.
  * @param out_buf output buffer
  * @param s the current image state
- * @return 0 if read success, otherwise return -1.
+ * @return 0 if read success, else return error code.
  */
 static int read_uncompressed_sgi(unsigned char *out_buf, SgiState *s)
 {
@@ -215,27 +216,27 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     /* Test for SGI magic. */
-    if (bytestream2_get_be16(&s->g) != SGI_MAGIC) {
+    if (bytestream2_get_be16u(&s->g) != SGI_MAGIC) {
         av_log(avctx, AV_LOG_ERROR, "bad magic number\n");
         return AVERROR_INVALIDDATA;
     }
 
-    rle                  = bytestream2_get_byte(&s->g);
-    s->bytes_per_channel = bytestream2_get_byte(&s->g);
-    dimension            = bytestream2_get_be16(&s->g);
-    s->width             = bytestream2_get_be16(&s->g);
-    s->height            = bytestream2_get_be16(&s->g);
-    s->depth             = bytestream2_get_be16(&s->g);
+    rle                  = bytestream2_get_byteu(&s->g);
+    s->bytes_per_channel = bytestream2_get_byteu(&s->g);
+    dimension            = bytestream2_get_be16u(&s->g);
+    s->width             = bytestream2_get_be16u(&s->g);
+    s->height            = bytestream2_get_be16u(&s->g);
+    s->depth             = bytestream2_get_be16u(&s->g);
 
     if (s->bytes_per_channel != 1 && s->bytes_per_channel != 2) {
         av_log(avctx, AV_LOG_ERROR, "wrong channel number\n");
-        return AVERROR(EINVAL);
+        return AVERROR_INVALIDDATA;
     }
 
     /* Check for supported image dimensions. */
     if (dimension != 2 && dimension != 3) {
         av_log(avctx, AV_LOG_ERROR, "wrong dimension number\n");
-        return AVERROR(EINVAL);
+        return AVERROR_INVALIDDATA;
     }
 
     if (s->depth == SGI_GRAYSCALE) {
@@ -246,18 +247,15 @@ static int decode_frame(AVCodecContext *avctx,
         avctx->pix_fmt = s->bytes_per_channel == 2 ? AV_PIX_FMT_RGBA64BE : AV_PIX_FMT_RGBA;
     } else {
         av_log(avctx, AV_LOG_ERROR, "wrong picture format\n");
-        return AVERROR(EINVAL);
+        return AVERROR_INVALIDDATA;
     }
 
     ret = ff_set_dimensions(avctx, s->width, s->height);
     if (ret < 0)
         return ret;
 
-    ret = ff_get_buffer(avctx, p, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed.\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
@@ -274,13 +272,11 @@ static int decode_frame(AVCodecContext *avctx,
     } else {
         ret = read_uncompressed_sgi(out_buf, s);
     }
-
-    if (ret == 0) {
-        *got_frame = 1;
-        return avpkt->size;
-    } else {
+    if (ret)
         return ret;
-    }
+
+    *got_frame = 1;
+    return avpkt->size;
 }
 
 static av_cold int sgi_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/sgienc.c b/libavcodec/sgienc.c
index 07e224c..13756f1 100644
--- a/libavcodec/sgienc.c
+++ b/libavcodec/sgienc.c
@@ -2,20 +2,20 @@
  * SGI image encoder
  * Todd Kirby <doubleshot@pacbell.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     if (avctx->width > 65535 || avctx->height > 65535) {
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported resolution %dx%d.\n", avctx->width, avctx->height);
+        av_log(avctx, AV_LOG_ERROR, "SGI does not support resolutions above 65535x65535\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -167,10 +168,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else // assume sgi_rle_encode() produces at most 2x size of input
         length += tablesize * 2 + depth * height * (2 * width + 1);
 
-    if ((ret = ff_alloc_packet(pkt, bytes_per_channel * length)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", length);
+    if ((ret = ff_alloc_packet2(avctx, pkt, bytes_per_channel * length, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init_writer(&pbc, pkt->data, pkt->size);
 
diff --git a/libavcodec/sgirledec.c b/libavcodec/sgirledec.c
index f636bbc..aa4f0e7 100644
--- a/libavcodec/sgirledec.c
+++ b/libavcodec/sgirledec.c
@@ -2,20 +2,20 @@
  * Silicon Graphics RLE 8-bit video decoder
  * Copyright (c) 2012 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,9 +41,9 @@ static av_cold int sgirle_decode_init(AVCodecContext *avctx)
  * Convert SGI RBG323 pixel into AV_PIX_FMT_BGR8
  * SGI RGB data is packed as 8bpp, (msb)3R 2B 3G(lsb)
  */
-#define RBG323_TO_BGR8(x) (((x << 3) & 0xC0) |                                \
-                           ((x << 3) & 0x38) |                                \
-                           ((x >> 5) & 7))
+#define RBG323_TO_BGR8(x) ((((x) << 3) & 0xC0) |                                \
+                           (((x) << 3) & 0x38) |                                \
+                           (((x) >> 5) & 7))
 static av_always_inline
 void rbg323_to_bgr8(uint8_t *dst, const uint8_t *src, int size)
 {
@@ -102,8 +102,8 @@ static int decode_sgirle8(AVCodecContext *avctx, uint8_t *dst,
                 v   -= length;
             } while (v > 0);
         } else {
-            av_log(avctx, AV_LOG_ERROR, "Invalid opcode %d.\n", v);
-            return AVERROR_INVALIDDATA;
+            avpriv_request_sample(avctx, "opcode %d", v);
+            return AVERROR_PATCHWELCOME;
         }
     }
     return 0;
diff --git a/libavcodec/sh4/README b/libavcodec/sh4/README
new file mode 100644
index 0000000..8dd61fe
--- /dev/null
+++ b/libavcodec/sh4/README
@@ -0,0 +1,6 @@
+SH4 optimizations have been removed in
+commit d6096a67422534918405abb46dafbbac4608cbc3
+The last revission with the optimizations is cbfc9046e1c7e295b74f252902ae6f255eef4e78
+
+If you want to maintain these (or other) SH4 optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/sheervideo.c b/libavcodec/sheervideo.c
new file mode 100644
index 0000000..50c3ebc
--- /dev/null
+++ b/libavcodec/sheervideo.c
@@ -0,0 +1,2099 @@
+/*
+ * BitJazz SheerVideo decoder
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "thread.h"
+#include "sheervideodata.h"
+
+typedef struct SheerVideoContext {
+    unsigned format;
+    int alt;
+    VLC vlc[2];
+    void (*decode_frame)(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb);
+} SheerVideoContext;
+
+static void decode_ca4i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_a = (uint16_t *)p->data[3];
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_a[x] = get_bits(gb, 10);
+                dst_y[x] = get_bits(gb, 10);
+                dst_u[x] = get_bits(gb, 10);
+                dst_v[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred[4] = { 502, 512, 512, 502 };
+
+            for (x = 0; x < avctx->width; x++) {
+                int y, u, v, a;
+
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_a[x] = pred[3] = (a + pred[3]) & 0x3ff;
+                dst_y[x] = pred[0] = (y + pred[0]) & 0x3ff;
+                dst_u[x] = pred[1] = (u + pred[1]) & 0x3ff;
+                dst_v[x] = pred[2] = (v + pred[2]) & 0x3ff;
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_ca4p(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_a = (uint16_t *)p->data[3];
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_a[x] = get_bits(gb, 10);
+            dst_y[x] = get_bits(gb, 10);
+            dst_u[x] = get_bits(gb, 10);
+            dst_v[x] = get_bits(gb, 10);
+        }
+    } else {
+        int pred[4] = { 502, 512, 512, 502 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int y, u, v, a;
+
+            a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_a[x] = pred[3] = (a + pred[3]) & 0x3ff;
+            dst_y[x] = pred[0] = (y + pred[0]) & 0x3ff;
+            dst_u[x] = pred[1] = (u + pred[1]) & 0x3ff;
+            dst_v[x] = pred[2] = (v + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2;
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+    dst_a += p->linesize[3] / 2;
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_a[x] = get_bits(gb, 10);
+                dst_y[x] = get_bits(gb, 10);
+                dst_u[x] = get_bits(gb, 10);
+                dst_v[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int y, u, v, a;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+            pred_TL[3] = pred_L[3] = dst_a[-p->linesize[3] / 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x];
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x];
+                pred_T[3] = dst_a[-p->linesize[3] / 2 + x];
+
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_a[x] = pred_L[3] = (a + ((3 * (pred_T[3] + pred_L[3]) - 2 * pred_TL[3]) >> 2)) & 0x3ff;
+                dst_y[x] = pred_L[0] = (y + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff;
+                dst_u[x] = pred_L[1] = (u + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0x3ff;
+                dst_v[x] = pred_L[2] = (v + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[3] = pred_T[3];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_ybr10i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_y[x] = get_bits(gb, 10);
+                dst_u[x] = get_bits(gb, 10);
+                dst_v[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred[4] = { 502, 512, 512, 512 };
+
+            for (x = 0; x < avctx->width; x++) {
+                int y, u, v;
+
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x] = pred[0] = (y + pred[0]) & 0x3ff;
+                dst_u[x] = pred[1] = (u + pred[1]) & 0x3ff;
+                dst_v[x] = pred[2] = (v + pred[2]) & 0x3ff;
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_ybr10(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_y[x] = get_bits(gb, 10);
+            dst_u[x] = get_bits(gb, 10);
+            dst_v[x] = get_bits(gb, 10);
+        }
+    } else {
+        int pred[4] = { 502, 512, 512, 512 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int y, u, v;
+
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x] = pred[0] = (y + pred[0]) & 0x3ff;
+            dst_u[x] = pred[1] = (u + pred[1]) & 0x3ff;
+            dst_v[x] = pred[2] = (v + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2;
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_y[x] = get_bits(gb, 10);
+                dst_u[x] = get_bits(gb, 10);
+                dst_v[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int y, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x];
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x];
+
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x] = pred_L[0] = (y + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff;
+                dst_u[x] = pred_L[1] = (u + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0x3ff;
+                dst_v[x] = pred_L[2] = (v + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_yry10i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else {
+            int pred[4] = { 502, 512, 512, 0 };
+
+            for (x = 0; x < avctx->width; x += 2) {
+                int y1, y2, u, v;
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff;
+                dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+                dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+                dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_yry10(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x += 2) {
+            dst_y[x    ] = get_bits(gb, 10);
+            dst_u[x / 2] = get_bits(gb, 10);
+            dst_y[x + 1] = get_bits(gb, 10);
+            dst_v[x / 2] = get_bits(gb, 10);
+        }
+    } else {
+        int pred[4] = { 502, 512, 512, 0 };
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff;
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2;
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[6], pred_L[6], pred_T[6];
+            int y1, y2, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x];
+                pred_T[3] = dst_y[-p->linesize[0] / 2 + x + 1];
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x / 2];
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff;
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0x3ff;
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0x3ff;
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0x3ff;
+
+                pred_TL[0] = pred_T[3];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_ca2i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+    dst_a = (uint16_t *)p->data[3];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_a[x    ] = get_bits(gb, 10);
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_a[x + 1] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else {
+            int pred[4] = { 502, 512, 512, 502 };
+
+            for (x = 0; x < avctx->width; x += 2) {
+                int y1, y2, u, v, a1, a2;
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff;
+                dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+                dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+                dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0x3ff;
+                dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+                dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0x3ff;
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_ca2p(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+    dst_a = (uint16_t *)p->data[3];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x += 2) {
+            dst_a[x    ] = get_bits(gb, 10);
+            dst_y[x    ] = get_bits(gb, 10);
+            dst_u[x / 2] = get_bits(gb, 10);
+            dst_a[x + 1] = get_bits(gb, 10);
+            dst_y[x + 1] = get_bits(gb, 10);
+            dst_v[x / 2] = get_bits(gb, 10);
+        }
+    } else {
+        int pred[4] = { 502, 512, 512, 502 };
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v, a1, a2;
+
+            a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff;
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+            dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0x3ff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+            dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2;
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+    dst_a += p->linesize[3] / 2;
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_a[x    ] = get_bits(gb, 10);
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_a[x + 1] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[6], pred_L[6], pred_T[6];
+            int y1, y2, u, v, a1, a2;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+            pred_TL[4] = pred_L[4] = dst_a[-p->linesize[3] / 2];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x];
+                pred_T[3] = dst_y[-p->linesize[0] / 2 + x + 1];
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x / 2];
+                pred_T[4] = dst_a[-p->linesize[3] / 2 + x];
+                pred_T[5] = dst_a[-p->linesize[3] / 2 + x + 1];
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff;
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0x3ff;
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0x3ff;
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0x3ff;
+                dst_a[x    ] = pred_L[4] = (a1 + ((3 * (pred_T[4] + pred_L[4]) - 2 * pred_TL[4]) >> 2)) & 0x3ff;
+                dst_a[x + 1] = pred_L[4] = (a2 + ((3 * (pred_T[5] + pred_L[4]) - 2 * pred_T[4]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[3];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[4] = pred_T[5];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_c82i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+    dst_a = p->data[3];
+
+    for (y = 0; y < avctx->height; y += 1) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_a[x    ] = get_bits(gb, 8);
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_a[x + 1] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred[4] = { 125, -128, -128, 125 };
+
+            for (x = 0; x < avctx->width; x += 2) {
+                int y1, y2, u, v, a1, a2;
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff;
+                dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+                dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+                dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+                dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0xff;
+                dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0xff;
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+        dst_a += p->linesize[3];
+    }
+}
+
+static void decode_c82p(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+    dst_a = p->data[3];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x += 2) {
+            dst_a[x    ] = get_bits(gb, 8);
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_a[x + 1] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { 125, -128, -128, 125 };
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v, a1, a2;
+
+            a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff;
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+            dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+    dst_a += p->linesize[3];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_a[x    ] = get_bits(gb, 8);
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_a[x + 1] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[6], pred_L[6], pred_T[6];
+            int y1, y2, u, v, a1, a2;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+            pred_TL[4] = pred_L[4] = dst_a[-p->linesize[3]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] + x];
+                pred_T[3] = dst_y[-p->linesize[0] + x + 1];
+                pred_T[1] = dst_u[-p->linesize[1] + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] + x / 2];
+                pred_T[4] = dst_a[-p->linesize[3] + x];
+                pred_T[5] = dst_a[-p->linesize[3] + x + 1];
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0xff;
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0xff;
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0xff;
+                dst_a[x    ] = pred_L[4] = (a1 + ((3 * (pred_T[4] + pred_L[4]) - 2 * pred_TL[4]) >> 2)) & 0xff;
+                dst_a[x + 1] = pred_L[4] = (a2 + ((3 * (pred_T[5] + pred_L[4]) - 2 * pred_T[4]) >> 2)) & 0xff;
+
+                pred_TL[0] = pred_T[3];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[4] = pred_T[5];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+        dst_a += p->linesize[3];
+    }
+}
+
+static void decode_ybyr(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x += 2) {
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8) + 128;
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8) + 128;
+        }
+    } else {
+        int pred[4] = { -128, 128, 128, 0 };
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff;
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8) + 128;
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8) + 128;
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int y1, y2, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] + x];
+                pred_T[3] = dst_y[-p->linesize[0] + x + 1];
+                pred_T[1] = dst_u[-p->linesize[1] + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] + x / 2];
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0xff;
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0xff;
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0xff;
+
+                pred_TL[0] = pred_T[3];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_byryi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x += 2) {
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { 125, -128, -128, 0 };
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff;
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4];
+            int y1, y2, u, v;
+
+            pred_L[0] = dst_y[-p->linesize[0]];
+            pred_L[1] = dst_u[-p->linesize[1]];
+            pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + pred_L[0]) & 0xff;
+                dst_u[x / 2] = pred_L[1] = (u  + pred_L[1]) & 0xff;
+                dst_y[x + 1] = pred_L[0] = (y2 + pred_L[0]) & 0xff;
+                dst_v[x / 2] = pred_L[2] = (v +  pred_L[2]) & 0xff;
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_byry(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x += 2) {
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { 125, -128, -128, 0 };
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff;
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int y1, y2, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] + x];
+                pred_T[3] = dst_y[-p->linesize[0] + x + 1];
+                pred_T[1] = dst_u[-p->linesize[1] + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] + x / 2];
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0xff;
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0xff;
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0xff;
+
+                pred_TL[0] = pred_T[3];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_ybri(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_y[x] = get_bits(gb, 8);
+            dst_u[x] = get_bits(gb, 8);
+            dst_v[x] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { s->alt ? 125 : -146, -128, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int y, u, v;
+
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x] = pred[0] = (y + pred[0]) & 0xff;
+            dst_u[x] = pred[1] = (u + pred[1]) & 0xff;
+            dst_v[x] = pred[2] = (v + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_y[x] = get_bits(gb, 8);
+                dst_u[x] = get_bits(gb, 8);
+                dst_v[x] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4];
+            int y, u, v;
+
+            pred_L[0] = dst_y[-p->linesize[0]];
+            pred_L[1] = dst_u[-p->linesize[1]];
+            pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x++) {
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x] = pred_L[0] = (y + pred_L[0]) & 0xff;
+                dst_u[x] = pred_L[1] = (u + pred_L[1]) & 0xff;
+                dst_v[x] = pred_L[2] = (v + pred_L[2]) & 0xff;
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_ybr(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_y[x] = get_bits(gb, 8);
+            dst_u[x] = get_bits(gb, 8);
+            dst_v[x] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { s->alt ? 125 : -146, -128, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int y, u, v;
+
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x] = pred[0] = (y + pred[0]) & 0xff;
+            dst_u[x] = pred[1] = (u + pred[1]) & 0xff;
+            dst_v[x] = pred[2] = (v + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_y[x] = get_bits(gb, 8);
+                dst_u[x] = get_bits(gb, 8);
+                dst_v[x] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int y, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]];
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_y[-p->linesize[0] + x];
+                pred_T[1] = dst_u[-p->linesize[1] + x];
+                pred_T[2] = dst_v[-p->linesize[2] + x];
+
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x] = pred_L[0] = (y + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst_u[x] = pred_L[1] = (u + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0xff;
+                dst_v[x] = pred_L[2] = (v + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0xff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_aybri(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_a, *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_a = p->data[3];
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_a[x] = get_bits(gb, 8);
+            dst_y[x] = get_bits(gb, 8);
+            dst_u[x] = get_bits(gb, 8);
+            dst_v[x] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { 125, s->alt ? 125 : -146, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int a, y, u, v;
+
+            a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_a[x] = pred[0] = (a + pred[0]) & 0xff;
+            dst_y[x] = pred[1] = (y + pred[1]) & 0xff;
+            dst_u[x] = pred[2] = (u + pred[2]) & 0xff;
+            dst_v[x] = pred[3] = (v + pred[3]) & 0xff;
+        }
+    }
+
+    dst_a += p->linesize[3];
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_a[x] = get_bits(gb, 8);
+                dst_y[x] = get_bits(gb, 8);
+                dst_u[x] = get_bits(gb, 8);
+                dst_v[x] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4];
+            int a, y, u, v;
+
+            pred_L[0] = dst_a[-p->linesize[3]];
+            pred_L[1] = dst_y[-p->linesize[0]];
+            pred_L[2] = dst_u[-p->linesize[1]];
+            pred_L[3] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x++) {
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_a[x] = pred_L[0] = (a + pred_L[0]) & 0xff;
+                dst_y[x] = pred_L[1] = (y + pred_L[1]) & 0xff;
+                dst_u[x] = pred_L[2] = (u + pred_L[2]) & 0xff;
+                dst_v[x] = pred_L[3] = (v + pred_L[3]) & 0xff;
+            }
+        }
+
+        dst_a += p->linesize[3];
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_aybr(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_a, *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_a = p->data[3];
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_a[x] = get_bits(gb, 8);
+            dst_y[x] = get_bits(gb, 8);
+            dst_u[x] = get_bits(gb, 8);
+            dst_v[x] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { 125, s->alt ? 125 : -146, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int a, y, u, v;
+
+            a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_a[x] = pred[0] = (a + pred[0]) & 0xff;
+            dst_y[x] = pred[1] = (y + pred[1]) & 0xff;
+            dst_u[x] = pred[2] = (u + pred[2]) & 0xff;
+            dst_v[x] = pred[3] = (v + pred[3]) & 0xff;
+        }
+    }
+
+    dst_a += p->linesize[3];
+    dst_y += p->linesize[0];
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_a[x] = get_bits(gb, 8);
+                dst_y[x] = get_bits(gb, 8);
+                dst_u[x] = get_bits(gb, 8);
+                dst_v[x] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int a, y, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_a[-p->linesize[3]];
+            pred_TL[1] = pred_L[1] = dst_y[-p->linesize[0]];
+            pred_TL[2] = pred_L[2] = dst_u[-p->linesize[1]];
+            pred_TL[3] = pred_L[3] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_a[-p->linesize[3] + x];
+                pred_T[1] = dst_y[-p->linesize[0] + x];
+                pred_T[2] = dst_u[-p->linesize[1] + x];
+                pred_T[3] = dst_v[-p->linesize[2] + x];
+
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_a[x] = pred_L[0] = (a + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst_y[x] = pred_L[1] = (y + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0xff;
+                dst_u[x] = pred_L[2] = (u + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0xff;
+                dst_v[x] = pred_L[3] = (v + ((3 * (pred_T[3] + pred_L[3]) - 2 * pred_TL[3]) >> 2)) & 0xff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[3] = pred_T[3];
+            }
+        }
+
+        dst_a += p->linesize[3];
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_argxi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_r, *dst_g, *dst_b, *dst_a;
+    int x, y;
+
+    dst_r = (uint16_t *)p->data[2];
+    dst_g = (uint16_t *)p->data[0];
+    dst_b = (uint16_t *)p->data[1];
+    dst_a = (uint16_t *)p->data[3];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_a[x] = get_bits(gb, 10);
+                dst_r[x] = get_bits(gb, 10);
+                dst_g[x] = get_bits(gb, 10);
+                dst_b[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred[4] = { 512, 512, 512, 512 };
+
+            for (x = 0; x < avctx->width; x++) {
+                int r, g, b, a;
+
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_a[x] = pred[3] = (a + pred[3]) & 0x3ff;
+                dst_r[x] = pred[0] = (r + pred[0]) & 0x3ff;
+                dst_g[x] = pred[1] = (r + g + pred[1]) & 0x3ff;
+                dst_b[x] = pred[2] = (r + g + b + pred[2]) & 0x3ff;
+            }
+        }
+
+        dst_r += p->linesize[2] / 2;
+        dst_g += p->linesize[0] / 2;
+        dst_b += p->linesize[1] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_argx(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_r, *dst_g, *dst_b, *dst_a;
+    int x, y;
+
+    dst_r = (uint16_t *)p->data[2];
+    dst_g = (uint16_t *)p->data[0];
+    dst_b = (uint16_t *)p->data[1];
+    dst_a = (uint16_t *)p->data[3];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_a[x] = get_bits(gb, 10);
+            dst_r[x] = get_bits(gb, 10);
+            dst_g[x] = get_bits(gb, 10);
+            dst_b[x] = get_bits(gb, 10);
+        }
+    } else {
+        int pred[4] = { 512, 512, 512, 512 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int r, g, b, a;
+
+            a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_a[x] = pred[3] = (a + pred[3]) & 0x3ff;
+            dst_r[x] = pred[0] = (r + pred[0]) & 0x3ff;
+            dst_g[x] = pred[1] = (r + g + pred[1]) & 0x3ff;
+            dst_b[x] = pred[2] = (r + g + b + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_r += p->linesize[2] / 2;
+    dst_g += p->linesize[0] / 2;
+    dst_b += p->linesize[1] / 2;
+    dst_a += p->linesize[3] / 2;
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_a[x] = get_bits(gb, 10);
+                dst_r[x] = get_bits(gb, 10);
+                dst_g[x] = get_bits(gb, 10);
+                dst_b[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int r, g, b, a;
+
+            pred_TL[0] = pred_L[0] = dst_r[-p->linesize[2] / 2];
+            pred_TL[1] = pred_L[1] = dst_g[-p->linesize[0] / 2];
+            pred_TL[2] = pred_L[2] = dst_b[-p->linesize[1] / 2];
+            pred_TL[3] = pred_L[3] = dst_a[-p->linesize[3] / 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_r[-p->linesize[2] / 2 + x];
+                pred_T[1] = dst_g[-p->linesize[0] / 2 + x];
+                pred_T[2] = dst_b[-p->linesize[1] / 2 + x];
+                pred_T[3] = dst_a[-p->linesize[3] / 2 + x];
+
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_a[x] = pred_L[3] = (a + ((3 * (pred_T[3] + pred_L[3]) - 2 * pred_TL[3]) >> 2)) & 0x3ff;
+                dst_r[x] = pred_L[0] = (r + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff;
+                dst_g[x] = pred_L[1] = (r + g + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0x3ff;
+                dst_b[x] = pred_L[2] = (r + g + b + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[3] = pred_T[3];
+            }
+        }
+
+        dst_r += p->linesize[2] / 2;
+        dst_g += p->linesize[0] / 2;
+        dst_b += p->linesize[1] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_rgbxi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_r, *dst_g, *dst_b;
+    int x, y;
+
+    dst_r = (uint16_t *)p->data[2];
+    dst_g = (uint16_t *)p->data[0];
+    dst_b = (uint16_t *)p->data[1];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_r[x] = get_bits(gb, 10);
+                dst_g[x] = get_bits(gb, 10);
+                dst_b[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred[4] = { 512, 512, 512, 0 };
+
+            for (x = 0; x < avctx->width; x++) {
+                int r, g, b;
+
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_r[x] = pred[0] = (r + pred[0]) & 0x3ff;
+                dst_g[x] = pred[1] = (r + g + pred[1]) & 0x3ff;
+                dst_b[x] = pred[2] = (r + g + b + pred[2]) & 0x3ff;
+            }
+        }
+
+        dst_r += p->linesize[2] / 2;
+        dst_g += p->linesize[0] / 2;
+        dst_b += p->linesize[1] / 2;
+    }
+}
+
+static void decode_rgbx(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_r, *dst_g, *dst_b;
+    int x, y;
+
+    dst_r = (uint16_t *)p->data[2];
+    dst_g = (uint16_t *)p->data[0];
+    dst_b = (uint16_t *)p->data[1];
+
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst_r[x] = get_bits(gb, 10);
+            dst_g[x] = get_bits(gb, 10);
+            dst_b[x] = get_bits(gb, 10);
+        }
+    } else {
+        int pred[4] = { 512, 512, 512, 0 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int r, g, b;
+
+            r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_r[x] = pred[0] = (r + pred[0]) & 0x3ff;
+            dst_g[x] = pred[1] = (r + g + pred[1]) & 0x3ff;
+            dst_b[x] = pred[2] = (r + g + b + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_r += p->linesize[2] / 2;
+    dst_g += p->linesize[0] / 2;
+    dst_b += p->linesize[1] / 2;
+
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_r[x] = get_bits(gb, 10);
+                dst_g[x] = get_bits(gb, 10);
+                dst_b[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int r, g, b;
+
+            pred_TL[0] = pred_L[0] = dst_r[-p->linesize[2] / 2];
+            pred_TL[1] = pred_L[1] = dst_g[-p->linesize[0] / 2];
+            pred_TL[2] = pred_L[2] = dst_b[-p->linesize[1] / 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_r[-p->linesize[2] / 2 + x];
+                pred_T[1] = dst_g[-p->linesize[0] / 2 + x];
+                pred_T[2] = dst_b[-p->linesize[1] / 2 + x];
+
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_r[x] = pred_L[0] = (r + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff;
+                dst_g[x] = pred_L[1] = (r + g + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0x3ff;
+                dst_b[x] = pred_L[2] = (r + g + b + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_r += p->linesize[2] / 2;
+        dst_g += p->linesize[0] / 2;
+        dst_b += p->linesize[1] / 2;
+    }
+}
+
+static void decode_argbi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst;
+    int x, y;
+
+    dst = p->data[0];
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst[x * 4 + 0] = get_bits(gb, 8);
+            dst[x * 4 + 1] = get_bits(gb, 8);
+            dst[x * 4 + 2] = get_bits(gb, 8);
+            dst[x * 4 + 3] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { -128, -128, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int a, r, g, b;
+
+            a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst[4 * x + 0] = pred[0] = (a + pred[0]) & 0xff;
+            dst[4 * x + 1] = pred[1] = (r + pred[1]) & 0xff;
+            dst[4 * x + 2] = pred[2] = (r + g + pred[2]) & 0xff;
+            dst[4 * x + 3] = pred[3] = (r + g + b + pred[3]) & 0xff;
+        }
+    }
+
+    dst += p->linesize[0];
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst[x * 4 + 0] = get_bits(gb, 8);
+                dst[x * 4 + 1] = get_bits(gb, 8);
+                dst[x * 4 + 2] = get_bits(gb, 8);
+                dst[x * 4 + 3] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4];
+            int a, r, g, b;
+
+            pred_L[0] = dst[-p->linesize[0] + 0];
+            pred_L[1] = dst[-p->linesize[0] + 1];
+            pred_L[2] = dst[-p->linesize[0] + 2];
+            pred_L[3] = dst[-p->linesize[0] + 3];
+
+            for (x = 0; x < avctx->width; x++) {
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst[4 * x + 0] = pred_L[0] = (a + pred_L[0]) & 0xff;
+                dst[4 * x + 1] = pred_L[1] = (r + pred_L[1]) & 0xff;
+                dst[4 * x + 2] = pred_L[2] = (r + g + pred_L[2]) & 0xff;
+                dst[4 * x + 3] = pred_L[3] = (r + g + b + pred_L[3]) & 0xff;
+            }
+        }
+        dst += p->linesize[0];
+    }
+}
+
+static void decode_argb(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst;
+    int x, y;
+
+    dst = p->data[0];
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst[x * 4 + 0] = get_bits(gb, 8);
+            dst[x * 4 + 1] = get_bits(gb, 8);
+            dst[x * 4 + 2] = get_bits(gb, 8);
+            dst[x * 4 + 3] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { -128, -128, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int a, r, g, b;
+
+            a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst[4 * x + 0] = pred[0] = (a + pred[0]) & 0xff;
+            dst[4 * x + 1] = pred[1] = (r + pred[1]) & 0xff;
+            dst[4 * x + 2] = pred[2] = (r + g + pred[2]) & 0xff;
+            dst[4 * x + 3] = pred[3] = (r + g + b + pred[3]) & 0xff;
+        }
+    }
+
+    dst += p->linesize[0];
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst[x * 4 + 0] = get_bits(gb, 8);
+                dst[x * 4 + 1] = get_bits(gb, 8);
+                dst[x * 4 + 2] = get_bits(gb, 8);
+                dst[x * 4 + 3] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int a, r, g, b;
+
+            pred_TL[0] = pred_L[0] = dst[-p->linesize[0] + 0];
+            pred_TL[1] = pred_L[1] = dst[-p->linesize[0] + 1];
+            pred_TL[2] = pred_L[2] = dst[-p->linesize[0] + 2];
+            pred_TL[3] = pred_L[3] = dst[-p->linesize[0] + 3];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst[-p->linesize[0] + 4 * x + 0];
+                pred_T[1] = dst[-p->linesize[0] + 4 * x + 1];
+                pred_T[2] = dst[-p->linesize[0] + 4 * x + 2];
+                pred_T[3] = dst[-p->linesize[0] + 4 * x + 3];
+
+                a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst[4 * x + 0] = pred_L[0] = (a + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst[4 * x + 1] = pred_L[1] = (r + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0xff;
+                dst[4 * x + 2] = pred_L[2] = (r + g + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0xff;
+                dst[4 * x + 3] = pred_L[3] = (r + g + b + ((3 * (pred_T[3] + pred_L[3]) - 2 * pred_TL[3]) >> 2)) & 0xff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[3] = pred_T[3];
+            }
+        }
+        dst += p->linesize[0];
+    }
+}
+
+static void decode_rgbi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst;
+    int x, y;
+
+    dst = p->data[0];
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst[x * 4 + 0] = get_bits(gb, 8);
+            dst[x * 4 + 1] = get_bits(gb, 8);
+            dst[x * 4 + 2] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { -128, -128, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int r, g, b;
+
+            r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst[4 * x + 0] = pred[0] = (r + pred[0]) & 0xff;
+            dst[4 * x + 1] = pred[1] = (r + g + pred[1]) & 0xff;
+            dst[4 * x + 2] = pred[2] = (r + g + b + pred[2]) & 0xff;
+        }
+    }
+
+    dst += p->linesize[0];
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst[x * 4 + 0] = get_bits(gb, 8);
+                dst[x * 4 + 1] = get_bits(gb, 8);
+                dst[x * 4 + 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4];
+            int r, g, b;
+
+            pred_L[0] = dst[-p->linesize[0] + 0];
+            pred_L[1] = dst[-p->linesize[0] + 1];
+            pred_L[2] = dst[-p->linesize[0] + 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst[4 * x + 0] = pred_L[0] = (r + pred_L[0]) & 0xff;
+                dst[4 * x + 1] = pred_L[1] = (r + g + pred_L[1]) & 0xff;
+                dst[4 * x + 2] = pred_L[2] = (r + g + b + pred_L[2]) & 0xff;
+            }
+        }
+        dst += p->linesize[0];
+    }
+}
+
+static void decode_rgb(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst;
+    int x, y;
+
+    dst = p->data[0];
+    if (get_bits1(gb)) {
+        for (x = 0; x < avctx->width; x++) {
+            dst[x * 4 + 0] = get_bits(gb, 8);
+            dst[x * 4 + 1] = get_bits(gb, 8);
+            dst[x * 4 + 2] = get_bits(gb, 8);
+        }
+    } else {
+        int pred[4] = { -128, -128, -128, -128 };
+
+        for (x = 0; x < avctx->width; x++) {
+            int r, g, b;
+
+            r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst[4 * x + 0] = pred[0] = (r + pred[0]) & 0xff;
+            dst[4 * x + 1] = pred[1] = (r + g + pred[1]) & 0xff;
+            dst[4 * x + 2] = pred[2] = (r + g + b + pred[2]) & 0xff;
+        }
+    }
+
+    dst += p->linesize[0];
+    for (y = 1; y < avctx->height; y++) {
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst[x * 4 + 0] = get_bits(gb, 8);
+                dst[x * 4 + 1] = get_bits(gb, 8);
+                dst[x * 4 + 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4];
+            int r, g, b;
+
+            pred_TL[0] = pred_L[0] = dst[-p->linesize[0] + 0];
+            pred_TL[1] = pred_L[1] = dst[-p->linesize[0] + 1];
+            pred_TL[2] = pred_L[2] = dst[-p->linesize[0] + 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst[-p->linesize[0] + 4 * x + 0];
+                pred_T[1] = dst[-p->linesize[0] + 4 * x + 1];
+                pred_T[2] = dst[-p->linesize[0] + 4 * x + 2];
+
+                r = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                g = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                b = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst[4 * x + 0] = pred_L[0] = (r + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff;
+                dst[4 * x + 1] = pred_L[1] = (r + g + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0xff;
+                dst[4 * x + 2] = pred_L[2] = (r + g + b + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0xff;
+
+                pred_TL[0] = pred_T[0];
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+        dst += p->linesize[0];
+    }
+}
+
+static int build_vlc(VLC *vlc, const uint8_t *len, int count)
+{
+    uint32_t codes[1024];
+    uint8_t bits[1024];
+    uint16_t syms[1024];
+    uint64_t index;
+    int i;
+
+    index = 0;
+    for (i = 0; i < count; i++) {
+        codes[i]  = index >> (32 - len[i]);
+        bits[i] = len[i];
+        syms[i]  = i;
+        index += 1ULL << (32 - len[i]);
+    }
+
+    ff_free_vlc(vlc);
+    return ff_init_vlc_sparse(vlc, 16, count,
+                              bits,  sizeof(*bits),  sizeof(*bits),
+                              codes, sizeof(*codes), sizeof(*codes),
+                              syms,  sizeof(*syms),  sizeof(*syms), 0);
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    ThreadFrame frame = { .f = data };
+    AVFrame *p = data;
+    GetBitContext gb;
+    unsigned format;
+    int ret;
+
+    if (avpkt->size <= 20)
+        return AVERROR_INVALIDDATA;
+
+    if (AV_RL32(avpkt->data) != MKTAG('S','h','i','r') &&
+        AV_RL32(avpkt->data) != MKTAG('Z','w','a','k'))
+        return AVERROR_INVALIDDATA;
+
+    s->alt = 0;
+    format = AV_RL32(avpkt->data + 16);
+    av_log(avctx, AV_LOG_DEBUG, "format: %s\n", av_fourcc2str(format));
+    switch (format) {
+    case MKTAG(' ', 'R', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
+        s->decode_frame = decode_rgb;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgb, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgb, 256);
+        }
+        break;
+    case MKTAG(' ', 'r', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
+        s->decode_frame = decode_rgbi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbi, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbi, 256);
+        }
+        break;
+    case MKTAG('A', 'R', 'G', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        s->decode_frame = decode_argx;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbx, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbx, 1024);
+        }
+        break;
+    case MKTAG('A', 'r', 'G', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        s->decode_frame = decode_argxi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbxi, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbxi, 1024);
+        }
+        break;
+    case MKTAG('R', 'G', 'B', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        s->decode_frame = decode_rgbx;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbx, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbx, 1024);
+        }
+        break;
+    case MKTAG('r', 'G', 'B', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        s->decode_frame = decode_rgbxi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbxi, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbxi, 1024);
+        }
+        break;
+    case MKTAG('A', 'R', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_ARGB;
+        s->decode_frame = decode_argb;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgb, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgb, 256);
+        }
+        break;
+    case MKTAG('A', 'r', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_ARGB;
+        s->decode_frame = decode_argbi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbi, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbi, 256);
+        }
+        break;
+    case MKTAG('A', 'Y', 'B', 'R'):
+        s->alt = 1;
+    case MKTAG('A', 'Y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        s->decode_frame = decode_aybr;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr, 256);
+        }
+        break;
+    case MKTAG('A', 'y', 'B', 'R'):
+        s->alt = 1;
+    case MKTAG('A', 'y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        s->decode_frame = decode_aybri;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybri, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybri, 256);
+        }
+        break;
+    case MKTAG(' ', 'Y', 'B', 'R'):
+        s->alt = 1;
+    case MKTAG(' ', 'Y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        s->decode_frame = decode_ybr;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr, 256);
+        }
+        break;
+    case MKTAG(' ', 'y', 'B', 'R'):
+        s->alt = 1;
+    case MKTAG(' ', 'y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        s->decode_frame = decode_ybri;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybri, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybri, 256);
+        }
+        break;
+    case MKTAG('Y', 'B', 'R', 0x0a):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+        s->decode_frame = decode_ybr10;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10, 1024);
+        }
+        break;
+    case MKTAG('y', 'B', 'R', 0x0a):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+        s->decode_frame = decode_ybr10i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10i, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '4', 'p'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+        s->decode_frame = decode_ca4p;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '4', 'i'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+        s->decode_frame = decode_ca4i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10i, 1024);
+        }
+        break;
+    case MKTAG('B', 'Y', 'R', 'Y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->decode_frame = decode_byry;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byry, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byry, 256);
+        }
+        break;
+    case MKTAG('B', 'Y', 'R', 'y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->decode_frame = decode_byryi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byryi, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byryi, 256);
+        }
+        break;
+    case MKTAG('Y', 'b', 'Y', 'r'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->decode_frame = decode_ybyr;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybyr, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybyr, 256);
+        }
+        break;
+    case MKTAG('C', '8', '2', 'p'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+        s->decode_frame = decode_c82p;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byry, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byry, 256);
+        }
+        break;
+    case MKTAG('C', '8', '2', 'i'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+        s->decode_frame = decode_c82i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byryi, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byryi, 256);
+        }
+        break;
+    case MKTAG(0xa2, 'Y', 'R', 'Y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        s->decode_frame = decode_yry10;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10, 1024);
+        }
+        break;
+    case MKTAG(0xa2, 'Y', 'R', 'y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        s->decode_frame = decode_yry10i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10i, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '2', 'p'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+        s->decode_frame = decode_ca2p;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '2', 'i'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+        s->decode_frame = decode_ca2i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10i, 1024);
+        }
+        break;
+    default:
+        avpriv_request_sample(avctx, "unsupported format: 0x%X", format);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (avpkt->size < 20 + avctx->width * avctx->height / 16) {
+        av_log(avctx, AV_LOG_ERROR, "Input packet too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->format != format) {
+        if (ret < 0)
+            return ret;
+        s->format = format;
+    }
+
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    if ((ret = init_get_bits8(&gb, avpkt->data + 20, avpkt->size - 20)) < 0)
+        return ret;
+
+    s->decode_frame(avctx, p, &gb);
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+#if HAVE_THREADS
+static int decode_init_thread_copy(AVCodecContext *avctx)
+{
+    SheerVideoContext *s = avctx->priv_data;
+
+    s->format = 0;
+    memset(&s->vlc[0], 0, sizeof(s->vlc[0]));
+    memset(&s->vlc[1], 0, sizeof(s->vlc[1]));
+
+    return 0;
+}
+#endif
+
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    SheerVideoContext *s = avctx->priv_data;
+
+    ff_free_vlc(&s->vlc[0]);
+    ff_free_vlc(&s->vlc[1]);
+
+    return 0;
+}
+
+AVCodec ff_sheervideo_decoder = {
+    .name             = "sheervideo",
+    .long_name        = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_SHEERVIDEO,
+    .priv_data_size   = sizeof(SheerVideoContext),
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(decode_init_thread_copy),
+    .close            = decode_end,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+};
diff --git a/libavcodec/sheervideodata.h b/libavcodec/sheervideodata.h
new file mode 100644
index 0000000..3b6e2f6
--- /dev/null
+++ b/libavcodec/sheervideodata.h
@@ -0,0 +1,1097 @@
+/*
+ * BitJazz SheerVideo decoder
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SHEERVIDEODATA_H
+#define AVCODEC_SHEERVIDEODATA_H
+
+#include "libavutil/common.h"
+
+static const uint8_t l_r_rgb[256] = {
+     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,
+     8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_r_rgbi[256] = {
+     3,  4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  7,
+     8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,
+     8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_g_rgbi[256] = {
+     1,  3,  4,  5,  6,  7,  7,  8,  9,  9, 10, 10, 10, 10, 11, 11,
+    11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12,
+    11, 11, 11, 10, 10, 10,  9,  9,  9,  8,  8,  7,  6,  5,  5,  3,
+};
+
+static const uint8_t l_g_rgb[256] = {
+     2,  2,  4,  4,  6,  7,  9,  9, 10, 11, 11, 11, 12, 12, 12, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 12, 12, 12, 11, 11, 11, 10,  9,  9,  8,  6,  4,  3,  3,
+};
+
+static const uint8_t l_y_ybr[256] = {
+     3,  3,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_ybr[256] = {
+     1,  2,  4,  6,  9, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,  8,  5,  3,
+};
+
+static const uint8_t l_y_ybyr[256] = {
+     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,
+     8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_u_ybyr[256] = {
+     1,  2,  4,  6,  8,  9, 10, 10, 11, 11, 12, 12, 12, 13, 13, 14,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14,
+    14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10,  9,  8,  7,  6,  3,
+};
+
+static const uint8_t l_y_byry[256] = {
+     3,  3,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_byry[256] = {
+     1,  2,  4,  6,  8,  9,  9, 10, 11, 11, 12, 12, 13, 13, 13, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10,  9,  8,  7,  6,  3,
+};
+
+static const uint8_t l_y_ybr10i[1024] = {
+     3,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,
+};
+
+static const uint8_t l_y_ybr10[1024] = {
+     4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
+};
+
+static const uint8_t l_u_ybr10i[1024] = {
+     2,  3,  4,  4,  5,  5,  6,  7,  7,  8,  8,  9,  9,  9,  9, 10,
+    10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10,  9,  9,  9,  8,  8,  8,  7,  6,  5,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_ybr10[1024] = {
+     2,  3,  3,  4,  5,  5,  6,  7,  8,  9,  9, 10, 10, 10, 11, 11,
+    12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 12, 12, 12,
+    12, 11, 11, 11, 10, 10,  9,  9,  8,  8,  7,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_r_rgbx[1024] = {
+     4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,
+};
+
+static const uint8_t l_g_rgbx[1024] = {
+     3,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,
+     8,  8,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+    12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10,  9,  9,  9,  9,
+     8,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_y_yry10[1024] = {
+     4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
+};
+
+static const uint8_t l_y_yry10i[1024] = {
+     3,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,
+};
+
+static const uint8_t l_u_yry10[1024] = {
+     2,  3,  3,  4,  5,  6,  7,  7,  8,  8,  8,  9,  9, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
+    10, 10, 10, 10,  9,  9,  9,  8,  8,  7,  7,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_yry10i[1024] = {
+     2,  4,  4,  4,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+    11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11,
+    11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,  9,  9,  9,
+     9,  9,  8,  8,  8,  8,  7,  7,  7,  6,  6,  5,  5,  4,  4,  3,
+};
+
+static const uint8_t l_y_ybri[256] = {
+     3,  3,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  5,  5,  5,  4,  3,
+};
+
+static const uint8_t l_u_ybri[256] = {
+     1,  3,  5,  6,  8,  8,  9, 10, 10, 11, 11, 12, 12, 13, 13, 13,
+    14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14,
+    14, 13, 13, 13, 12, 12, 11, 11, 10, 10,  9,  8,  8,  6,  5,  2,
+};
+
+static const uint8_t l_y_byryi[256] = {
+     3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10,
+    10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10,
+    10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_byryi[256] = {
+     1,  3,  4,  6,  6,  7,  8,  8,  9,  9, 10, 10, 10, 11, 11, 11,
+    12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15,
+    15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+    12, 11, 11, 11, 10, 10, 10,  9,  9,  8,  8,  7,  7,  5,  4,  3,
+};
+
+static const uint8_t l_r_rgbxi[1024] = {
+     3,  4,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_g_rgbxi[1024] = {
+     2,  3,  4,  4,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11,
+    11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11,
+    11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,
+     9,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  4,  4,  3,
+};
+
+#endif /* AVCODEC_SHEERVIDEODATA_H */
diff --git a/libavcodec/shorten.c b/libavcodec/shorten.c
index e040b9c..4134af7 100644
--- a/libavcodec/shorten.c
+++ b/libavcodec/shorten.c
@@ -2,20 +2,20 @@
  * Shorten decoder
  * Copyright (c) 2005 Jeff Muizelaar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,10 +26,10 @@
  */
 
 #include <limits.h>
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "bswapdsp.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "golomb.h"
 #include "internal.h"
 
@@ -50,8 +50,12 @@
 #define ENERGYSIZE 3
 #define BITSHIFTSIZE 2
 
+#define TYPE_S8    1
+#define TYPE_U8    2
 #define TYPE_S16HL 3
+#define TYPE_U16HL 4
 #define TYPE_S16LH 5
+#define TYPE_U16LH 6
 
 #define NWRAP 3
 #define NSKIPSIZE 1
@@ -80,7 +84,7 @@ static const uint8_t is_audio_command[10] = { 1, 1, 1, 1, 0, 0, 0, 1, 1, 0 };
 
 typedef struct ShortenContext {
     AVCodecContext *avctx;
-    BitstreamContext bc;
+    GetBitContext gb;
 
     int min_framesize, max_framesize;
     unsigned channels;
@@ -106,13 +110,16 @@ typedef struct ShortenContext {
     int32_t lpcqoffset;
     int got_header;
     int got_quit_command;
+    int swap;
+    BswapDSPContext bdsp;
 } ShortenContext;
 
 static av_cold int shorten_decode_init(AVCodecContext *avctx)
 {
     ShortenContext *s = avctx->priv_data;
     s->avctx          = avctx;
-    avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+
+    ff_bswapdsp_init(&s->bdsp);
 
     return 0;
 }
@@ -126,19 +133,18 @@ static int allocate_buffers(ShortenContext *s)
             av_log(s->avctx, AV_LOG_ERROR, "nmean too large\n");
             return AVERROR_INVALIDDATA;
         }
-        if (s->blocksize + s->nwrap >= UINT_MAX / sizeof(int32_t) ||
-            s->blocksize + s->nwrap <= (unsigned)s->nwrap) {
+        if (s->blocksize + (uint64_t)s->nwrap >= UINT_MAX / sizeof(int32_t)) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "s->blocksize + s->nwrap too large\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if ((err = av_reallocp(&s->offset[chan],
-                               sizeof(int32_t) *
+        if ((err = av_reallocp_array(&s->offset[chan],
+                               sizeof(int32_t),
                                FFMAX(1, s->nmean))) < 0)
             return err;
 
-        if ((err = av_reallocp(&s->decoded_base[chan], (s->blocksize + s->nwrap) *
+        if ((err = av_reallocp_array(&s->decoded_base[chan], (s->blocksize + s->nwrap),
                                sizeof(s->decoded_base[0][0]))) < 0)
             return err;
         for (i = 0; i < s->nwrap; i++)
@@ -146,7 +152,7 @@ static int allocate_buffers(ShortenContext *s)
         s->decoded[chan] = s->decoded_base[chan] + s->nwrap;
     }
 
-    if ((err = av_reallocp(&s->coeffs, s->nwrap * sizeof(*s->coeffs))) < 0)
+    if ((err = av_reallocp_array(&s->coeffs, s->nwrap, sizeof(*s->coeffs))) < 0)
         return err;
 
     return 0;
@@ -154,18 +160,25 @@ static int allocate_buffers(ShortenContext *s)
 
 static inline unsigned int get_uint(ShortenContext *s, int k)
 {
-    if (s->version != 0)
-        k = get_ur_golomb_shorten(&s->bc, ULONGSIZE);
-    return get_ur_golomb_shorten(&s->bc, k);
+    if (s->version != 0) {
+        k = get_ur_golomb_shorten(&s->gb, ULONGSIZE);
+        if (k > 31U)
+            return AVERROR_INVALIDDATA;
+    }
+    return get_ur_golomb_shorten(&s->gb, k);
 }
 
 static void fix_bitshift(ShortenContext *s, int32_t *buffer)
 {
     int i;
 
-    if (s->bitshift != 0)
+    if (s->bitshift == 32) {
         for (i = 0; i < s->blocksize; i++)
-            buffer[i] <<= s->bitshift;
+            buffer[i] = 0;
+    } else if (s->bitshift != 0) {
+        for (i = 0; i < s->blocksize; i++)
+            buffer[i] *= 1U << s->bitshift;
+    }
 }
 
 static int init_offset(ShortenContext *s)
@@ -175,13 +188,17 @@ static int init_offset(ShortenContext *s)
     int nblock = FFMAX(1, s->nmean);
     /* initialise offset */
     switch (s->internal_ftype) {
+    case TYPE_U8:
+        s->avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
+        mean = 0x80;
+        break;
     case TYPE_S16HL:
     case TYPE_S16LH:
-        mean = 0;
+        s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         break;
     default:
-        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type");
-        return AVERROR_INVALIDDATA;
+        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type\n");
+        return AVERROR_PATCHWELCOME;
     }
 
     for (chan = 0; chan < s->channels; chan++)
@@ -190,10 +207,78 @@ static int init_offset(ShortenContext *s)
     return 0;
 }
 
+static int decode_aiff_header(AVCodecContext *avctx, const uint8_t *header,
+                              int header_size)
+{
+    ShortenContext *s = avctx->priv_data;
+    int len, bps, exp;
+    GetByteContext gb;
+    uint64_t val;
+    uint32_t tag;
+
+    bytestream2_init(&gb, header, header_size);
+
+    if (bytestream2_get_le32(&gb) != MKTAG('F', 'O', 'R', 'M')) {
+        av_log(avctx, AV_LOG_ERROR, "missing FORM tag\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_skip(&gb, 4); /* chunk size */
+
+    tag = bytestream2_get_le32(&gb);
+    if (tag != MKTAG('A', 'I', 'F', 'F') &&
+        tag != MKTAG('A', 'I', 'F', 'C')) {
+        av_log(avctx, AV_LOG_ERROR, "missing AIFF tag\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    while (bytestream2_get_le32(&gb) != MKTAG('C', 'O', 'M', 'M')) {
+        len = bytestream2_get_be32(&gb);
+        if (len < 0 || bytestream2_get_bytes_left(&gb) < 18LL + len + (len&1)) {
+            av_log(avctx, AV_LOG_ERROR, "no COMM chunk found\n");
+            return AVERROR_INVALIDDATA;
+        }
+        bytestream2_skip(&gb, len + (len & 1));
+    }
+    len = bytestream2_get_be32(&gb);
+
+    if (len < 18) {
+        av_log(avctx, AV_LOG_ERROR, "COMM chunk was too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_skip(&gb, 6);
+    bps = bytestream2_get_be16(&gb);
+    avctx->bits_per_coded_sample = bps;
+
+    s->swap = tag == MKTAG('A', 'I', 'F', 'C');
+
+    if (bps != 16 && bps != 8) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample: %d\n", bps);
+        return AVERROR(ENOSYS);
+    }
+
+    exp = bytestream2_get_be16(&gb) - 16383 - 63;
+    val = bytestream2_get_be64(&gb);
+    if (exp < -63 || exp > 63) {
+        av_log(avctx, AV_LOG_ERROR, "exp %d is out of range\n", exp);
+        return AVERROR_INVALIDDATA;
+    }
+    if (exp >= 0)
+        avctx->sample_rate = val << exp;
+    else
+        avctx->sample_rate = (val + (1ULL<<(-exp-1))) >> -exp;
+    len -= 18;
+    if (len > 0)
+        av_log(avctx, AV_LOG_INFO, "%d header bytes unparsed\n", len);
+
+    return 0;
+}
+
 static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
                               int header_size)
 {
-    int len;
+    int len, bps;
     short wave_format;
     GetByteContext gb;
 
@@ -214,7 +299,7 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     while (bytestream2_get_le32(&gb) != MKTAG('f', 'm', 't', ' ')) {
         len = bytestream2_get_le32(&gb);
         bytestream2_skip(&gb, len);
-        if (bytestream2_get_bytes_left(&gb) < 16) {
+        if (len < 0 || bytestream2_get_bytes_left(&gb) < 16) {
             av_log(avctx, AV_LOG_ERROR, "no fmt chunk found\n");
             return AVERROR_INVALIDDATA;
         }
@@ -240,10 +325,11 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     avctx->sample_rate = bytestream2_get_le32(&gb);
     bytestream2_skip(&gb, 4); // skip bit rate    (represents original uncompressed bit rate)
     bytestream2_skip(&gb, 2); // skip block align (not needed)
-    avctx->bits_per_coded_sample = bytestream2_get_le16(&gb);
+    bps = bytestream2_get_le16(&gb);
+    avctx->bits_per_coded_sample = bps;
 
-    if (avctx->bits_per_coded_sample != 16) {
-        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample\n");
+    if (bps != 16 && bps != 8) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample: %d\n", bps);
         return AVERROR(ENOSYS);
     }
 
@@ -254,18 +340,6 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     return 0;
 }
 
-static void output_buffer(int16_t **samples, int nchan, int blocksize,
-                          int32_t **buffer)
-{
-    int i, ch;
-    for (ch = 0; ch < nchan; ch++) {
-        int32_t *in  = buffer[ch];
-        int16_t *out = samples[ch];
-        for (i = 0; i < blocksize; i++)
-            out[i] = av_clip_int16(in[i]);
-    }
-}
-
 static const int fixed_coeffs[][3] = {
     { 0,  0,  0 },
     { 1,  0,  0 },
@@ -281,15 +355,15 @@ static int decode_subframe_lpc(ShortenContext *s, int command, int channel,
 
     if (command == FN_QLPC) {
         /* read/validate prediction order */
-        pred_order = get_ur_golomb_shorten(&s->bc, LPCQSIZE);
-        if (pred_order > s->nwrap) {
+        pred_order = get_ur_golomb_shorten(&s->gb, LPCQSIZE);
+        if ((unsigned)pred_order > s->nwrap) {
             av_log(s->avctx, AV_LOG_ERROR, "invalid pred_order %d\n",
                    pred_order);
             return AVERROR(EINVAL);
         }
         /* read LPC coefficients */
         for (i = 0; i < pred_order; i++)
-            s->coeffs[i] = get_sr_golomb_shorten(&s->bc, LPCQUANT);
+            s->coeffs[i] = get_sr_golomb_shorten(&s->gb, LPCQUANT);
         coeffs = s->coeffs;
 
         qshift = LPCQUANT;
@@ -308,22 +382,22 @@ static int decode_subframe_lpc(ShortenContext *s, int command, int channel,
     /* subtract offset from previous samples to use in prediction */
     if (command == FN_QLPC && coffset)
         for (i = -pred_order; i < 0; i++)
-            s->decoded[channel][i] -= coffset;
+            s->decoded[channel][i] -= (unsigned)coffset;
 
     /* decode residual and do LPC prediction */
     init_sum = pred_order ? (command == FN_QLPC ? s->lpcqoffset : 0) : coffset;
     for (i = 0; i < s->blocksize; i++) {
         sum = init_sum;
         for (j = 0; j < pred_order; j++)
-            sum += coeffs[j] * s->decoded[channel][i - j - 1];
-        s->decoded[channel][i] = get_sr_golomb_shorten(&s->bc, residual_size) +
-                                 (sum >> qshift);
+            sum += coeffs[j] * (unsigned)s->decoded[channel][i - j - 1];
+        s->decoded[channel][i] = get_sr_golomb_shorten(&s->gb, residual_size) +
+                                 (unsigned)(sum >> qshift);
     }
 
     /* add offset to current samples */
     if (command == FN_QLPC && coffset)
         for (i = 0; i < s->blocksize; i++)
-            s->decoded[channel][i] += coffset;
+            s->decoded[channel][i] += (unsigned)coffset;
 
     return 0;
 }
@@ -333,7 +407,7 @@ static int read_header(ShortenContext *s)
     int i, ret;
     int maxnlpc = 0;
     /* shorten signature */
-    if (bitstream_read(&s->bc, 32) != AV_RB32("ajkg")) {
+    if (get_bits_long(&s->gb, 32) != AV_RB32("ajkg")) {
         av_log(s->avctx, AV_LOG_ERROR, "missing shorten magic 'ajkg'\n");
         return AVERROR_INVALIDDATA;
     }
@@ -341,7 +415,7 @@ static int read_header(ShortenContext *s)
     s->lpcqoffset     = 0;
     s->blocksize      = DEFAULT_BLOCK_SIZE;
     s->nmean          = -1;
-    s->version        = bitstream_read(&s->bc, 8);
+    s->version        = get_bits(&s->gb, 8);
     s->internal_ftype = get_uint(s, TYPESIZE);
 
     s->channels = get_uint(s, CHANSIZE);
@@ -371,30 +445,40 @@ static int read_header(ShortenContext *s)
         s->blocksize = blocksize;
 
         maxnlpc  = get_uint(s, LPCQSIZE);
+        if (maxnlpc > 1024U) {
+            av_log(s->avctx, AV_LOG_ERROR, "maxnlpc is: %d\n", maxnlpc);
+            return AVERROR_INVALIDDATA;
+        }
         s->nmean = get_uint(s, 0);
+        if (s->nmean > 32768U) {
+            av_log(s->avctx, AV_LOG_ERROR, "nmean is: %d\n", s->nmean);
+            return AVERROR_INVALIDDATA;
+        }
 
         skip_bytes = get_uint(s, NSKIPSIZE);
+        if ((unsigned)skip_bytes > FFMAX(get_bits_left(&s->gb), 0)/8) {
+            av_log(s->avctx, AV_LOG_ERROR, "invalid skip_bytes: %d\n", skip_bytes);
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < skip_bytes; i++)
-            bitstream_skip(&s->bc, 8);
+            skip_bits(&s->gb, 8);
     }
     s->nwrap = FFMAX(NWRAP, maxnlpc);
 
-    if ((ret = allocate_buffers(s)) < 0)
-        return ret;
-
-    if ((ret = init_offset(s)) < 0)
-        return ret;
-
     if (s->version > 1)
         s->lpcqoffset = V2LPCQOFFSET;
 
-    if (get_ur_golomb_shorten(&s->bc, FNSIZE) != FN_VERBATIM) {
+    if (s->avctx->extradata_size > 0)
+        goto end;
+
+    if (get_ur_golomb_shorten(&s->gb, FNSIZE) != FN_VERBATIM) {
         av_log(s->avctx, AV_LOG_ERROR,
                "missing verbatim section at beginning of stream\n");
         return AVERROR_INVALIDDATA;
     }
 
-    s->header_size = get_ur_golomb_shorten(&s->bc, VERBATIM_CKSIZE_SIZE);
+    s->header_size = get_ur_golomb_shorten(&s->gb, VERBATIM_CKSIZE_SIZE);
     if (s->header_size >= OUT_BUFFER_SIZE ||
         s->header_size < CANONICAL_HEADER_SIZE) {
         av_log(s->avctx, AV_LOG_ERROR, "header is wrong size: %d\n",
@@ -403,9 +487,26 @@ static int read_header(ShortenContext *s)
     }
 
     for (i = 0; i < s->header_size; i++)
-        s->header[i] = (char)get_ur_golomb_shorten(&s->bc, VERBATIM_BYTE_SIZE);
+        s->header[i] = (char)get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
 
-    if ((ret = decode_wave_header(s->avctx, s->header, s->header_size)) < 0)
+    if (AV_RL32(s->header) == MKTAG('R','I','F','F')) {
+        if ((ret = decode_wave_header(s->avctx, s->header, s->header_size)) < 0)
+            return ret;
+    } else if (AV_RL32(s->header) == MKTAG('F','O','R','M')) {
+        if ((ret = decode_aiff_header(s->avctx, s->header, s->header_size)) < 0)
+            return ret;
+    } else {
+        avpriv_report_missing_feature(s->avctx, "unsupported bit packing %"
+                                      PRIX32, AV_RL32(s->header));
+        return AVERROR_PATCHWELCOME;
+    }
+
+end:
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if ((ret = init_offset(s)) < 0)
         return ret;
 
     s->cur_chan = 0;
@@ -429,51 +530,68 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
     /* allocate internal bitstream buffer */
     if (s->max_framesize == 0) {
         void *tmp_ptr;
-        s->max_framesize = 1024; // should hopefully be enough for the first header
+        s->max_framesize = 8192; // should hopefully be enough for the first header
         tmp_ptr = av_fast_realloc(s->bitstream, &s->allocated_bitstream_size,
                                   s->max_framesize + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!tmp_ptr) {
+            s->max_framesize = 0;
             av_log(avctx, AV_LOG_ERROR, "error allocating bitstream buffer\n");
             return AVERROR(ENOMEM);
         }
+        memset(tmp_ptr, 0, s->allocated_bitstream_size);
         s->bitstream = tmp_ptr;
     }
 
     /* append current packet data to bitstream buffer */
-    if (1 && s->max_framesize) { //FIXME truncated
-        buf_size       = FFMIN(buf_size, s->max_framesize - s->bitstream_size);
-        input_buf_size = buf_size;
-
-        if (s->bitstream_index + s->bitstream_size + buf_size >
-            s->allocated_bitstream_size) {
-            memmove(s->bitstream, &s->bitstream[s->bitstream_index],
-                    s->bitstream_size);
-            s->bitstream_index = 0;
-        }
-        if (buf)
-            memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], buf,
-                   buf_size);
-        buf               = &s->bitstream[s->bitstream_index];
-        buf_size         += s->bitstream_size;
-        s->bitstream_size = buf_size;
-
-        /* do not decode until buffer has at least max_framesize bytes or
-         * the end of the file has been reached */
-        if (buf_size < s->max_framesize && avpkt->data) {
-            *got_frame_ptr = 0;
-            return input_buf_size;
-        }
+    buf_size       = FFMIN(buf_size, s->max_framesize - s->bitstream_size);
+    input_buf_size = buf_size;
+
+    if (s->bitstream_index + s->bitstream_size + buf_size + AV_INPUT_BUFFER_PADDING_SIZE >
+        s->allocated_bitstream_size) {
+        memmove(s->bitstream, &s->bitstream[s->bitstream_index],
+                s->bitstream_size);
+        s->bitstream_index = 0;
+    }
+    if (buf)
+        memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], buf,
+               buf_size);
+    buf               = &s->bitstream[s->bitstream_index];
+    buf_size         += s->bitstream_size;
+    s->bitstream_size = buf_size;
+
+    /* do not decode until buffer has at least max_framesize bytes or
+     * the end of the file has been reached */
+    if (buf_size < s->max_framesize && avpkt->data) {
+        *got_frame_ptr = 0;
+        return input_buf_size;
     }
     /* init and position bitstream reader */
-    bitstream_init8(&s->bc, buf, buf_size);
-    bitstream_skip(&s->bc, s->bitindex);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
+    skip_bits(&s->gb, s->bitindex);
 
     /* process header or next subblock */
     if (!s->got_header) {
+
         if ((ret = read_header(s)) < 0)
             return ret;
-        *got_frame_ptr = 0;
-        goto finish_frame;
+
+        if (avpkt->size) {
+            int max_framesize;
+            void *tmp_ptr;
+
+            max_framesize = FFMAX(s->max_framesize, s->blocksize * s->channels * 8);
+            tmp_ptr = av_fast_realloc(s->bitstream, &s->allocated_bitstream_size,
+                                      max_framesize + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!tmp_ptr) {
+                av_log(avctx, AV_LOG_ERROR, "error allocating bitstream buffer\n");
+                return AVERROR(ENOMEM);
+            }
+            s->bitstream = tmp_ptr;
+            s->max_framesize = max_framesize;
+            *got_frame_ptr = 0;
+            goto finish_frame;
+        }
     }
 
     /* if quit command was read previously, don't decode anything */
@@ -487,12 +605,12 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
         unsigned cmd;
         int len;
 
-        if (bitstream_bits_left(&s->bc) < 3 + FNSIZE) {
+        if (get_bits_left(&s->gb) < 3 + FNSIZE) {
             *got_frame_ptr = 0;
             break;
         }
 
-        cmd = get_ur_golomb_shorten(&s->bc, FNSIZE);
+        cmd = get_ur_golomb_shorten(&s->gb, FNSIZE);
 
         if (cmd > FN_VERBATIM) {
             av_log(avctx, AV_LOG_ERROR, "unknown shorten function %d\n", cmd);
@@ -504,15 +622,25 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
             /* process non-audio command */
             switch (cmd) {
             case FN_VERBATIM:
-                len = get_ur_golomb_shorten(&s->bc, VERBATIM_CKSIZE_SIZE);
+                len = get_ur_golomb_shorten(&s->gb, VERBATIM_CKSIZE_SIZE);
+                if (len < 0 || len > get_bits_left(&s->gb)) {
+                    av_log(avctx, AV_LOG_ERROR, "verbatim length %d invalid\n",
+                           len);
+                    return AVERROR_INVALIDDATA;
+                }
                 while (len--)
-                    get_ur_golomb_shorten(&s->bc, VERBATIM_BYTE_SIZE);
+                    get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
                 break;
-            case FN_BITSHIFT:
-                s->bitshift = get_ur_golomb_shorten(&s->bc, BITSHIFTSIZE);
-                if (s->bitshift < 0)
+            case FN_BITSHIFT: {
+                unsigned bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE);
+                if (bitshift > 32) {
+                    av_log(avctx, AV_LOG_ERROR, "bitshift %d is invalid\n",
+                           bitshift);
                     return AVERROR_INVALIDDATA;
+                }
+                s->bitshift = bitshift;
                 break;
+            }
             case FN_BLOCKSIZE: {
                 unsigned blocksize = get_uint(s, av_log2(s->blocksize));
                 if (blocksize > s->blocksize) {
@@ -532,10 +660,8 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                 s->got_quit_command = 1;
                 break;
             }
-            if (cmd == FN_BLOCKSIZE || cmd == FN_QUIT) {
-                *got_frame_ptr = 0;
+            if (cmd == FN_QUIT)
                 break;
-            }
         } else {
             /* process audio command */
             int residual_size = 0;
@@ -544,11 +670,15 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
 
             /* get Rice code for residual decoding */
             if (cmd != FN_ZERO) {
-                residual_size = get_ur_golomb_shorten(&s->bc, ENERGYSIZE);
+                residual_size = get_ur_golomb_shorten(&s->gb, ENERGYSIZE);
                 /* This is a hack as version 0 differed in the definition
                  * of get_sr_golomb_shorten(). */
                 if (s->version == 0)
                     residual_size--;
+                if (residual_size > 30U) {
+                    av_log(avctx, AV_LOG_ERROR, "residual size unsupportd: %d\n", residual_size);
+                    return AVERROR_INVALIDDATA;
+                }
             }
 
             /* calculate sample offset using means from previous blocks */
@@ -557,10 +687,10 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
             else {
                 int32_t sum = (s->version < 2) ? 0 : s->nmean / 2;
                 for (i = 0; i < s->nmean; i++)
-                    sum += s->offset[channel][i];
+                    sum += (unsigned)s->offset[channel][i];
                 coffset = sum / s->nmean;
                 if (s->version >= 2)
-                    coffset >>= FFMIN(1, s->bitshift);
+                    coffset = s->bitshift == 0 ? coffset : coffset >> s->bitshift - 1 >> 1;
             }
 
             /* decode samples for this channel */
@@ -575,7 +705,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
 
             /* update means with info from the current block */
             if (s->nmean > 0) {
-                int32_t sum = (s->version < 2) ? 0 : s->blocksize / 2;
+                int64_t sum = (s->version < 2) ? 0 : s->blocksize / 2;
                 for (i = 0; i < s->blocksize; i++)
                     sum += s->decoded[channel][i];
 
@@ -585,7 +715,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                 if (s->version < 2)
                     s->offset[channel][s->nmean - 1] = sum / s->blocksize;
                 else
-                    s->offset[channel][s->nmean - 1] = (sum / s->blocksize) << s->bitshift;
+                    s->offset[channel][s->nmean - 1] = s->bitshift == 32 ? 0 : (sum / s->blocksize) * (1LL << s->bitshift);
             }
 
             /* copy wrap samples for use with next block */
@@ -599,15 +729,35 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
             /* if this is the last channel in the block, output the samples */
             s->cur_chan++;
             if (s->cur_chan == s->channels) {
+                uint8_t *samples_u8;
+                int16_t *samples_s16;
+                int chan;
+
                 /* get output buffer */
                 frame->nb_samples = s->blocksize;
-                if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-                    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
                     return ret;
+
+                for (chan = 0; chan < s->channels; chan++) {
+                    samples_u8  = ((uint8_t **)frame->extended_data)[chan];
+                    samples_s16 = ((int16_t **)frame->extended_data)[chan];
+                    for (i = 0; i < s->blocksize; i++) {
+                        switch (s->internal_ftype) {
+                        case TYPE_U8:
+                            *samples_u8++ = av_clip_uint8(s->decoded[chan][i]);
+                            break;
+                        case TYPE_S16HL:
+                        case TYPE_S16LH:
+                            *samples_s16++ = av_clip_int16(s->decoded[chan][i]);
+                            break;
+                        }
+                    }
+                    if (s->swap && s->internal_ftype != TYPE_U8)
+                        s->bdsp.bswap16_buf(((uint16_t **)frame->extended_data)[chan],
+                                            ((uint16_t **)frame->extended_data)[chan],
+                                            s->blocksize);
+
                 }
-                /* interleave output */
-                output_buffer((int16_t **)frame->extended_data, s->channels,
-                              s->blocksize, s->decoded);
 
                 *got_frame_ptr = 1;
             }
@@ -617,8 +767,8 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
         *got_frame_ptr = 0;
 
 finish_frame:
-    s->bitindex = bitstream_tell(&s->bc) - 8 * (bitstream_tell(&s->bc) / 8);
-    i           = bitstream_tell(&s->bc) / 8;
+    s->bitindex = get_bits_count(&s->gb) - 8 * (get_bits_count(&s->gb) / 8);
+    i           = get_bits_count(&s->gb) / 8;
     if (i > buf_size) {
         av_log(s->avctx, AV_LOG_ERROR, "overread: %d\n", i - buf_size);
         s->bitstream_size  = 0;
@@ -658,7 +808,8 @@ AVCodec ff_shorten_decoder = {
     .init           = shorten_decode_init,
     .close          = shorten_decode_close,
     .decode         = shorten_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                      AV_SAMPLE_FMT_U8P,
                                                       AV_SAMPLE_FMT_NONE },
 };
diff --git a/libavcodec/simple_idct.c b/libavcodec/simple_idct.c
index 9823a6b..3b2e736 100644
--- a/libavcodec/simple_idct.c
+++ b/libavcodec/simple_idct.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,13 +30,31 @@
 #include "mathops.h"
 #include "simple_idct.h"
 
+#define IN_IDCT_DEPTH 16
+
 #define BIT_DEPTH 8
 #include "simple_idct_template.c"
 #undef BIT_DEPTH
 
 #define BIT_DEPTH 10
 #include "simple_idct_template.c"
+
+#define EXTRA_SHIFT  2
+#include "simple_idct_template.c"
+
+#undef EXTRA_SHIFT
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "simple_idct_template.c"
+#undef BIT_DEPTH
+#undef IN_IDCT_DEPTH
+
+#define IN_IDCT_DEPTH 32
+#define BIT_DEPTH 10
+#include "simple_idct_template.c"
 #undef BIT_DEPTH
+#undef IN_IDCT_DEPTH
 
 /* 2x4x8 idct */
 
@@ -57,8 +75,8 @@ static inline void idct4col_put(uint8_t *dest, ptrdiff_t line_size, const int16_
     a1 = col[8*2];
     a2 = col[8*4];
     a3 = col[8*6];
-    c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
-    c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
+    c0 = ((a0 + a2) * (1 << CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
+    c2 = ((a0 - a2) * (1 << CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
     c1 = a1 * C1 + a3 * C2;
     c3 = a1 * C2 - a3 * C1;
     dest[0] = av_clip_uint8((c0 + c1) >> C_SHIFT);
@@ -106,7 +124,7 @@ void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 
     /* IDCT8 on each line */
     for(i=0; i<8; i++) {
-        idctRowCondDC_8(block + i*8, 0);
+        idctRowCondDC_int16_8bit(block + i*8, 0);
     }
 
     /* IDCT4 and store */
@@ -123,7 +141,7 @@ void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 #undef C1
 #undef C2
 #define CN_SHIFT 12
-#define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
+#define C_FIX(x) ((int)((x) * M_SQRT2 * (1 << CN_SHIFT) + 0.5))
 #define C1 C_FIX(0.6532814824)
 #define C2 C_FIX(0.2705980501)
 #define C3 C_FIX(0.5)
@@ -150,7 +168,7 @@ static inline void idct4col_add(uint8_t *dest, ptrdiff_t line_size, const int16_
 }
 
 #define RN_SHIFT 15
-#define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
+#define R_FIX(x) ((int)((x) * M_SQRT2 * (1 << RN_SHIFT) + 0.5))
 #define R1 R_FIX(0.6532814824)
 #define R2 R_FIX(0.2705980501)
 #define R3 R_FIX(0.5)
@@ -179,7 +197,7 @@ void ff_simple_idct84_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 
     /* IDCT8 on each line */
     for(i=0; i<4; i++) {
-        idctRowCondDC_8(block + i*8, 0);
+        idctRowCondDC_int16_8bit(block + i*8, 0);
     }
 
     /* IDCT4 and store */
@@ -199,7 +217,7 @@ void ff_simple_idct48_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 
     /* IDCT8 and store */
     for(i=0; i<4; i++){
-        idctSparseColAdd_8(dest + i, line_size, block + i);
+        idctSparseColAdd_int16_8bit(dest + i, line_size, block + i);
     }
 }
 
@@ -218,8 +236,7 @@ void ff_simple_idct44_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
     }
 }
 
-#if CONFIG_PRORES_DECODER
-void ff_prores_idct(int16_t *block, const int16_t *qmat)
+void ff_prores_idct_10(int16_t *block, const int16_t *qmat)
 {
     int i;
 
@@ -227,9 +244,26 @@ void ff_prores_idct(int16_t *block, const int16_t *qmat)
         block[i] *= qmat[i];
 
     for (i = 0; i < 8; i++)
-        idctRowCondDC_10(block + i*8, 2);
+        idctRowCondDC_extrashift_10(block + i*8, 2);
+
+    for (i = 0; i < 8; i++) {
+        block[i] += 8192;
+        idctSparseCol_extrashift_10(block + i);
+    }
+}
+
+void ff_prores_idct_12(int16_t *block, const int16_t *qmat)
+{
+    int i;
+
+    for (i = 0; i < 64; i++)
+        block[i] *= qmat[i];
 
     for (i = 0; i < 8; i++)
-        idctSparseCol_10(block + i);
+        idctRowCondDC_int16_12bit(block + i*8, 0);
+
+    for (i = 0; i < 8; i++) {
+        block[i] += 8192;
+        idctSparseCol_int16_12bit(block + i);
+    }
 }
-#endif /* CONFIG_PRORES_DECODER */
diff --git a/libavcodec/simple_idct.h b/libavcodec/simple_idct.h
index edc994d..20578b3 100644
--- a/libavcodec/simple_idct.h
+++ b/libavcodec/simple_idct.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,19 +31,29 @@
 #include <stddef.h>
 #include <stdint.h>
 
-void ff_simple_idct_put_8(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_add_8(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_8(int16_t *block);
+void ff_simple_idct_put_int16_8bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_int16_8bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_int16_8bit(int16_t *block);
+
+void ff_simple_idct_put_int16_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_int16_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_int16_10bit(int16_t *block);
+
+void ff_simple_idct_put_int32_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_int32_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_int32_10bit(int16_t *block);
+
+void ff_simple_idct_put_int16_12bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_add_int16_12bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_int16_12bit(int16_t *block);
 
-void ff_simple_idct_put_10(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_add_10(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_10(int16_t *block);
 /**
- * Special version of ff_simple_idct_10() which does dequantization
+ * Special version of ff_simple_idct_int16_10bit() which does dequantization
  * and scales by a factor of 2 more between the two IDCTs to account
  * for larger scale of input coefficients.
  */
-void ff_prores_idct(int16_t *block, const int16_t *qmat);
+void ff_prores_idct_10(int16_t *block, const int16_t *qmat);
+void ff_prores_idct_12(int16_t *block, const int16_t *qmat);
 
 void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c
index d10df31..35c3132 100644
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,22 +62,47 @@
 #define MUL(a, b)    MUL16(a, b)
 #define MAC(a, b, c) MAC16(a, b, c)
 
-#elif BIT_DEPTH == 10
-
-#define W1 90901
-#define W2 85627
-#define W3 77062
-#define W4 65535
-#define W5 51491
-#define W6 35468
-#define W7 18081
-
-#define ROW_SHIFT 15
-#define COL_SHIFT 20
-#define DC_SHIFT 1
-
-#define MUL(a, b)    ((a) * (b))
-#define MAC(a, b, c) ((a) += (b) * (c))
+#elif BIT_DEPTH == 10 || BIT_DEPTH == 12
+
+# if BIT_DEPTH == 10
+#define W1 22725 // 90901
+#define W2 21407 //  85627
+#define W3 19265 //  77062
+#define W4 16384 //  65535
+#define W5 12873 //  51491
+#define W6  8867 //  35468
+#define W7  4520 //  18081
+
+#   ifdef EXTRA_SHIFT
+#define ROW_SHIFT 13
+#define COL_SHIFT 18
+#define DC_SHIFT  1
+#   elif IN_IDCT_DEPTH == 32
+#define ROW_SHIFT 13
+#define COL_SHIFT 21
+#define DC_SHIFT  2
+#   else
+#define ROW_SHIFT 12
+#define COL_SHIFT 19
+#define DC_SHIFT  2
+#   endif
+
+# else
+#define W1 45451
+#define W2 42813
+#define W3 38531
+#define W4 32767
+#define W5 25746
+#define W6 17734
+#define W7 9041
+
+#define ROW_SHIFT 16
+#define COL_SHIFT 17
+#define DC_SHIFT -1
+# endif
+
+#define MUL(a, b)    ((int)((SUINT)(a) * (b)))
+#define MAC(a, b, c) ((a) += (SUINT)(b) * (c))
 
 #else
 
@@ -85,18 +110,24 @@
 
 #endif
 
-static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
+#ifdef EXTRA_SHIFT
+static inline void FUNC(idctRowCondDC_extrashift)(int16_t *row, int extra_shift)
+#else
+static inline void FUNC6(idctRowCondDC)(idctin *row, int extra_shift)
+#endif
 {
-    int a0, a1, a2, a3, b0, b1, b2, b3;
+    SUINT a0, a1, a2, a3, b0, b1, b2, b3;
 
+// TODO: Add DC-only support for int32_t input
+#if IN_IDCT_DEPTH == 16
 #if HAVE_FAST_64BIT
 #define ROW0_MASK (0xffffLL << 48 * HAVE_BIGENDIAN)
     if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) {
         uint64_t temp;
-        if (DC_SHIFT - extra_shift > 0) {
+        if (DC_SHIFT - extra_shift >= 0) {
             temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
         } else {
-            temp = (row[0] >> (extra_shift - DC_SHIFT)) & 0xffff;
+            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
         }
         temp += temp * (1 << 16);
         temp += temp * ((uint64_t) 1 << 32);
@@ -110,10 +141,10 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
           AV_RN32A(row+6) |
           row[1])) {
         uint32_t temp;
-        if (DC_SHIFT - extra_shift > 0) {
+        if (DC_SHIFT - extra_shift >= 0) {
             temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
         } else {
-            temp = (row[0] >> (extra_shift - DC_SHIFT)) & 0xffff;
+            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
         }
         temp += temp * (1 << 16);
         AV_WN32A(row, temp);
@@ -123,16 +154,17 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
         return;
     }
 #endif
+#endif
 
-    a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+    a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1));
     a1 = a0;
     a2 = a0;
     a3 = a0;
 
-    a0 += W2 * row[2];
-    a1 += W6 * row[2];
-    a2 -= W6 * row[2];
-    a3 -= W2 * row[2];
+    a0 += (SUINT)W2 * row[2];
+    a1 += (SUINT)W6 * row[2];
+    a2 -= (SUINT)W6 * row[2];
+    a3 -= (SUINT)W2 * row[2];
 
     b0 = MUL(W1, row[1]);
     MAC(b0, W3, row[3]);
@@ -143,11 +175,15 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
     b3 = MUL(W7, row[1]);
     MAC(b3, -W5, row[3]);
 
+#if IN_IDCT_DEPTH == 32
+    if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) {
+#else
     if (AV_RN64A(row + 4)) {
-        a0 +=   W4*row[4] + W6*row[6];
-        a1 += - W4*row[4] - W2*row[6];
-        a2 += - W4*row[4] + W2*row[6];
-        a3 +=   W4*row[4] - W6*row[6];
+#endif
+        a0 += (SUINT)  W4*row[4] + (SUINT)W6*row[6];
+        a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6];
+        a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6];
+        a3 += (SUINT)  W4*row[4] - (SUINT)W6*row[6];
 
         MAC(b0,  W5, row[5]);
         MAC(b0,  W7, row[7]);
@@ -162,26 +198,26 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
         MAC(b3, -W1, row[7]);
     }
 
-    row[0] = (a0 + b0) >> (ROW_SHIFT + extra_shift);
-    row[7] = (a0 - b0) >> (ROW_SHIFT + extra_shift);
-    row[1] = (a1 + b1) >> (ROW_SHIFT + extra_shift);
-    row[6] = (a1 - b1) >> (ROW_SHIFT + extra_shift);
-    row[2] = (a2 + b2) >> (ROW_SHIFT + extra_shift);
-    row[5] = (a2 - b2) >> (ROW_SHIFT + extra_shift);
-    row[3] = (a3 + b3) >> (ROW_SHIFT + extra_shift);
-    row[4] = (a3 - b3) >> (ROW_SHIFT + extra_shift);
+    row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift);
+    row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift);
+    row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift);
+    row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift);
+    row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift);
+    row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift);
+    row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift);
+    row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift);
 }
 
 #define IDCT_COLS do {                                  \
-        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); \
+        a0 = (SUINT)W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); \
         a1 = a0;                                        \
         a2 = a0;                                        \
         a3 = a0;                                        \
                                                         \
-        a0 +=  W2*col[8*2];                             \
-        a1 +=  W6*col[8*2];                             \
-        a2 += -W6*col[8*2];                             \
-        a3 += -W2*col[8*2];                             \
+        a0 += (SUINT) W2*col[8*2];                             \
+        a1 += (SUINT) W6*col[8*2];                             \
+        a2 += (SUINT)-W6*col[8*2];                             \
+        a3 += (SUINT)-W2*col[8*2];                             \
                                                         \
         b0 = MUL(W1, col[8*1]);                         \
         b1 = MUL(W3, col[8*1]);                         \
@@ -194,10 +230,10 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
         MAC(b3, -W5, col[8*3]);                         \
                                                         \
         if (col[8*4]) {                                 \
-            a0 +=  W4*col[8*4];                         \
-            a1 += -W4*col[8*4];                         \
-            a2 += -W4*col[8*4];                         \
-            a3 +=  W4*col[8*4];                         \
+            a0 += (SUINT) W4*col[8*4];                         \
+            a1 += (SUINT)-W4*col[8*4];                         \
+            a2 += (SUINT)-W4*col[8*4];                         \
+            a3 += (SUINT) W4*col[8*4];                         \
         }                                               \
                                                         \
         if (col[8*5]) {                                 \
@@ -208,10 +244,10 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
         }                                               \
                                                         \
         if (col[8*6]) {                                 \
-            a0 +=  W6*col[8*6];                         \
-            a1 += -W2*col[8*6];                         \
-            a2 +=  W2*col[8*6];                         \
-            a3 += -W6*col[8*6];                         \
+            a0 += (SUINT) W6*col[8*6];                         \
+            a1 += (SUINT)-W2*col[8*6];                         \
+            a2 += (SUINT) W2*col[8*6];                         \
+            a3 += (SUINT)-W6*col[8*6];                         \
         }                                               \
                                                         \
         if (col[8*7]) {                                 \
@@ -222,32 +258,35 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
         }                                               \
     } while (0)
 
-static inline void FUNC(idctSparseColPut)(pixel *dest, ptrdiff_t line_size,
-                                          int16_t *col)
+#ifdef EXTRA_SHIFT
+static inline void FUNC(idctSparseCol_extrashift)(int16_t *col)
+#else
+static inline void FUNC6(idctSparseColPut)(pixel *dest, ptrdiff_t line_size,
+                                          idctin *col)
 {
-    int a0, a1, a2, a3, b0, b1, b2, b3;
+    SUINT a0, a1, a2, a3, b0, b1, b2, b3;
 
     IDCT_COLS;
 
-    dest[0] = av_clip_pixel((a0 + b0) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a1 + b1) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a2 + b2) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a3 + b3) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a3 - b3) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a2 - b2) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a1 - b1) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT);
     dest += line_size;
-    dest[0] = av_clip_pixel((a0 - b0) >> COL_SHIFT);
+    dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT);
 }
 
-static inline void FUNC(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
-                                          int16_t *col)
+static inline void FUNC6(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
+                                          idctin *col)
 {
     int a0, a1, a2, a3, b0, b1, b2, b3;
 
@@ -270,7 +309,8 @@ static inline void FUNC(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
     dest[0] = av_clip_pixel(dest[0] + ((a0 - b0) >> COL_SHIFT));
 }
 
-static inline void FUNC(idctSparseCol)(int16_t *col)
+static inline void FUNC6(idctSparseCol)(idctin *col)
+#endif
 {
     int a0, a1, a2, a3, b0, b1, b2, b3;
 
@@ -286,21 +326,24 @@ static inline void FUNC(idctSparseCol)(int16_t *col)
     col[56] = ((a0 - b0) >> COL_SHIFT);
 }
 
-void FUNC(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
+#ifndef EXTRA_SHIFT
+void FUNC6(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block_)
 {
+    idctin *block = (idctin *)block_;
     pixel *dest = (pixel *)dest_;
     int i;
 
     line_size /= sizeof(pixel);
 
     for (i = 0; i < 8; i++)
-        FUNC(idctRowCondDC)(block + i*8, 0);
+        FUNC6(idctRowCondDC)(block + i*8, 0);
 
     for (i = 0; i < 8; i++)
-        FUNC(idctSparseColPut)(dest + i, line_size, block + i);
+        FUNC6(idctSparseColPut)(dest + i, line_size, block + i);
 }
 
-void FUNC(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
+#if IN_IDCT_DEPTH == 16
+void FUNC6(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
 {
     pixel *dest = (pixel *)dest_;
     int i;
@@ -308,19 +351,21 @@ void FUNC(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *bloc
     line_size /= sizeof(pixel);
 
     for (i = 0; i < 8; i++)
-        FUNC(idctRowCondDC)(block + i*8, 0);
+        FUNC6(idctRowCondDC)(block + i*8, 0);
 
     for (i = 0; i < 8; i++)
-        FUNC(idctSparseColAdd)(dest + i, line_size, block + i);
+        FUNC6(idctSparseColAdd)(dest + i, line_size, block + i);
 }
 
-void FUNC(ff_simple_idct)(int16_t *block)
+void FUNC6(ff_simple_idct)(int16_t *block)
 {
     int i;
 
     for (i = 0; i < 8; i++)
-        FUNC(idctRowCondDC)(block + i*8, 0);
+        FUNC6(idctRowCondDC)(block + i*8, 0);
 
     for (i = 0; i < 8; i++)
-        FUNC(idctSparseCol)(block + i);
+        FUNC6(idctSparseCol)(block + i);
 }
+#endif
+#endif
diff --git a/libavcodec/sinewin.c b/libavcodec/sinewin.c
index be38dbc..4532dc7 100644
--- a/libavcodec/sinewin.c
+++ b/libavcodec/sinewin.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define USE_FIXED 0
 #include "sinewin.h"
 #include "sinewin_tablegen.h"
diff --git a/libavcodec/sinewin.h b/libavcodec/sinewin.h
index 478036d..329e9bb 100644
--- a/libavcodec/sinewin.h
+++ b/libavcodec/sinewin.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Robert Swain
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 
 #include "config.h"
 #include "libavutil/mem.h"
+#include "libavcodec/aac_defines.h"
 
 #if CONFIG_HARDCODED_TABLES
 #   define SINETABLE_CONST const
@@ -30,31 +31,40 @@
 #   define SINETABLE_CONST
 #endif
 
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
 #define SINETABLE(size) \
-    SINETABLE_CONST DECLARE_ALIGNED(32, float, ff_sine_##size)[size]
+    SINETABLE_CONST DECLARE_ALIGNED(32, INTFLOAT, AAC_RENAME(ff_sine_##size))[size]
+
+#define SINETABLE120960(size) \
+    DECLARE_ALIGNED(32, INTFLOAT, AAC_RENAME(ff_sine_##size))[size]
 
 /**
  * Generate a sine window.
  * @param   window  pointer to half window
  * @param   n       size of half window
  */
-void ff_sine_window_init(float *window, int n);
+void AAC_RENAME(ff_sine_window_init)(INTFLOAT *window, int n);
 
 /**
  * initialize the specified entry of ff_sine_windows
  */
-void ff_init_ff_sine_windows(int index);
+void AAC_RENAME(ff_init_ff_sine_windows)(int index);
 
 extern SINETABLE(  32);
 extern SINETABLE(  64);
+extern SINETABLE120960(120);
 extern SINETABLE( 128);
 extern SINETABLE( 256);
 extern SINETABLE( 512);
+extern SINETABLE120960(960);
 extern SINETABLE(1024);
 extern SINETABLE(2048);
 extern SINETABLE(4096);
 extern SINETABLE(8192);
 
-extern SINETABLE_CONST float * const ff_sine_windows[14];
+extern SINETABLE_CONST INTFLOAT * const AAC_RENAME(ff_sine_windows)[16];
 
 #endif /* AVCODEC_SINEWIN_H */
diff --git a/libavcodec/sinewin_fixed.c b/libavcodec/sinewin_fixed.c
new file mode 100644
index 0000000..27ead29
--- /dev/null
+++ b/libavcodec/sinewin_fixed.c
@@ -0,0 +1,21 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "sinewin.h"
+#include "sinewin_tablegen.h"
diff --git a/libavcodec/sinewin_fixed_tablegen.c b/libavcodec/sinewin_fixed_tablegen.c
new file mode 100644
index 0000000..977e6f3c
--- /dev/null
+++ b/libavcodec/sinewin_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded sine windows
+ *
+ * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "sinewin_tablegen_template.c"
diff --git a/libavcodec/sinewin_tablegen.c b/libavcodec/sinewin_tablegen.c
index 90a75c2..dd60266 100644
--- a/libavcodec/sinewin_tablegen.c
+++ b/libavcodec/sinewin_tablegen.c
@@ -3,44 +3,22 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#define SINETABLE_CONST
-#define SINETABLE(size) \
-    float ff_sine_##size[size]
-#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
-#include "sinewin_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    int i;
-
-    write_fileheader();
-
-    for (i = 5; i <= 13; i++) {
-        ff_init_ff_sine_windows(i);
-        printf("SINETABLE(%4i) = {\n", 1 << i);
-        write_float_array(ff_sine_windows[i], 1 << i);
-        printf("};\n");
-    }
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "sinewin_tablegen_template.c"
diff --git a/libavcodec/sinewin_tablegen.h b/libavcodec/sinewin_tablegen.h
index 1ee225b..dc52234 100644
--- a/libavcodec/sinewin_tablegen.h
+++ b/libavcodec/sinewin_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,14 @@
 // do not use libavutil/libm.h since this is compiled both
 // for the host and the target and config.h is only valid for the target
 #include <math.h>
+#include "libavcodec/aac_defines.h"
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 
+#if !USE_FIXED
+SINETABLE120960(120);
+SINETABLE120960(960);
+#endif
 #if !CONFIG_HARDCODED_TABLES
 SINETABLE(  32);
 SINETABLE(  64);
@@ -41,26 +46,37 @@ SINETABLE(2048);
 SINETABLE(4096);
 SINETABLE(8192);
 #else
+#if USE_FIXED
+#include "libavcodec/sinewin_fixed_tables.h"
+#else
 #include "libavcodec/sinewin_tables.h"
 #endif
+#endif
+
+#if USE_FIXED
+#define SIN_FIX(a) (int)floor((a) * 0x80000000 + 0.5)
+#else
+#define SIN_FIX(a) a
+#endif
 
-SINETABLE_CONST float * const ff_sine_windows[] = {
+SINETABLE_CONST INTFLOAT * const AAC_RENAME(ff_sine_windows)[] = {
     NULL, NULL, NULL, NULL, NULL, // unused
-    ff_sine_32 , ff_sine_64 ,
-    ff_sine_128, ff_sine_256, ff_sine_512, ff_sine_1024, ff_sine_2048, ff_sine_4096, ff_sine_8192
+    AAC_RENAME(ff_sine_32) , AAC_RENAME(ff_sine_64), AAC_RENAME(ff_sine_128),
+    AAC_RENAME(ff_sine_256), AAC_RENAME(ff_sine_512), AAC_RENAME(ff_sine_1024),
+    AAC_RENAME(ff_sine_2048), AAC_RENAME(ff_sine_4096), AAC_RENAME(ff_sine_8192),
 };
 
 // Generate a sine window.
-av_cold void ff_sine_window_init(float *window, int n) {
+av_cold void AAC_RENAME(ff_sine_window_init)(INTFLOAT *window, int n) {
     int i;
     for(i = 0; i < n; i++)
-        window[i] = sinf((i + 0.5) * (M_PI / (2.0 * n)));
+        window[i] = SIN_FIX(sinf((i + 0.5) * (M_PI / (2.0 * n))));
 }
 
-av_cold void ff_init_ff_sine_windows(int index) {
-    assert(index >= 0 && index < FF_ARRAY_ELEMS(ff_sine_windows));
+av_cold void AAC_RENAME(ff_init_ff_sine_windows)(int index) {
+    assert(index >= 0 && index < FF_ARRAY_ELEMS(AAC_RENAME(ff_sine_windows)));
 #if !CONFIG_HARDCODED_TABLES
-    ff_sine_window_init(ff_sine_windows[index], 1 << index);
+    AAC_RENAME(ff_sine_window_init)(AAC_RENAME(ff_sine_windows)[index], 1 << index);
 #endif
 }
 
diff --git a/libavcodec/sinewin_tablegen_template.c b/libavcodec/sinewin_tablegen_template.c
new file mode 100644
index 0000000..b8eb407
--- /dev/null
+++ b/libavcodec/sinewin_tablegen_template.c
@@ -0,0 +1,56 @@
+/*
+ * Generate a header file for hardcoded sine windows
+ *
+ * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavcodec/aac_defines.h"
+#define CONFIG_HARDCODED_TABLES 0
+
+#if USE_FIXED
+#define WRITE_FUNC write_int32_t_array
+#else
+#define WRITE_FUNC write_float_array
+#endif
+
+#define SINETABLE_CONST
+#define SINETABLE(size) \
+    INTFLOAT AAC_RENAME(ff_sine_##size)[size]
+#define SINETABLE120960(size) \
+    INTFLOAT AAC_RENAME(ff_sine_##size)[size]
+#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
+#include "sinewin_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    int i;
+
+    write_fileheader();
+
+    for (i = 5; i <= 13; i++) {
+        AAC_RENAME(ff_init_ff_sine_windows)(i);
+        printf("SINETABLE(%4i) = {\n", 1 << i);
+        WRITE_FUNC(AAC_RENAME(ff_sine_windows)[i], 1 << i);
+        printf("};\n");
+    }
+
+    return 0;
+}
diff --git a/libavcodec/sipr.c b/libavcodec/sipr.c
index 5d2bdbd..1b6de25 100644
--- a/libavcodec/sipr.c
+++ b/libavcodec/sipr.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "lsp.h"
 #include "acelp_vectors.h"
@@ -188,28 +188,28 @@ static void pitch_sharpening(int pitch_lag_int, float beta,
 /**
  * Extract decoding parameters from the input bitstream.
  * @param parms          parameters structure
- * @param bc             pointer to initialized BitstreamContext structure
+ * @param pgb            pointer to initialized GetBitContext structure
  */
-static void decode_parameters(SiprParameters* parms, BitstreamContext *bc,
+static void decode_parameters(SiprParameters* parms, GetBitContext *pgb,
                               const SiprModeParam *p)
 {
     int i, j;
 
     if (p->ma_predictor_bits)
-        parms->ma_pred_switch = bitstream_read(bc, p->ma_predictor_bits);
+        parms->ma_pred_switch       = get_bits(pgb, p->ma_predictor_bits);
 
     for (i = 0; i < 5; i++)
-        parms->vq_indexes[i] = bitstream_read(bc, p->vq_indexes_bits[i]);
+        parms->vq_indexes[i]        = get_bits(pgb, p->vq_indexes_bits[i]);
 
     for (i = 0; i < p->subframe_count; i++) {
-        parms->pitch_delay[i] = bitstream_read(bc, p->pitch_delay_bits[i]);
+        parms->pitch_delay[i]       = get_bits(pgb, p->pitch_delay_bits[i]);
         if (p->gp_index_bits)
-            parms->gp_index[i] = bitstream_read(bc, p->gp_index_bits);
+            parms->gp_index[i]      = get_bits(pgb, p->gp_index_bits);
 
         for (j = 0; j < p->number_of_fc_indexes; j++)
-            parms->fc_indexes[i][j] = bitstream_read(bc, p->fc_index_bits[j]);
+            parms->fc_indexes[i][j] = get_bits(pgb, p->fc_index_bits[j]);
 
-        parms->gc_index[i] = bitstream_read(bc, p->gc_index_bits);
+        parms->gc_index[i]          = get_bits(pgb, p->gc_index_bits);
     }
 }
 
@@ -493,7 +493,7 @@ static av_cold int sipr_decoder_init(AVCodecContext * avctx)
         else if (avctx->bit_rate > 5750 ) ctx->mode = MODE_6k5;
         else                              ctx->mode = MODE_5k0;
         av_log(avctx, AV_LOG_WARNING,
-               "Invalid block_align: %d. Mode %s guessed based on bitrate: %d\n",
+               "Invalid block_align: %d. Mode %s guessed based on bitrate: %"PRId64"\n",
                avctx->block_align, modes[ctx->mode].mode_name, avctx->bit_rate);
     }
 
@@ -527,7 +527,7 @@ static int sipr_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf=avpkt->data;
     SiprParameters parm;
     const SiprModeParam *mode_par = &modes[ctx->mode];
-    BitstreamContext bc;
+    GetBitContext gb;
     float *samples;
     int subframe_size = ctx->mode == MODE_16k ? L_SUBFR_16k : SUBFR_SIZE;
     int i, ret;
@@ -537,22 +537,20 @@ static int sipr_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR,
                "Error processing packet: packet size (%d) too small\n",
                avpkt->size);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     /* get output buffer */
     frame->nb_samples = mode_par->frames_per_packet * subframe_size *
                         mode_par->subframe_count;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (float *)frame->data[0];
 
-    bitstream_init(&bc, buf, mode_par->bits_per_frame);
+    init_get_bits(&gb, buf, mode_par->bits_per_frame);
 
     for (i = 0; i < mode_par->frames_per_packet; i++) {
-        decode_parameters(&parm, &bc, mode_par);
+        decode_parameters(&parm, &gb, mode_par);
 
         ctx->decode_frame(ctx, &parm, samples);
 
diff --git a/libavcodec/sipr.h b/libavcodec/sipr.h
index 4cdea67..34f7f99 100644
--- a/libavcodec/sipr.h
+++ b/libavcodec/sipr.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sipr16k.c b/libavcodec/sipr16k.c
index 6a0ef8d..9c8f684 100644
--- a/libavcodec/sipr16k.c
+++ b/libavcodec/sipr16k.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,6 @@
 #include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "lsp.h"
-#include "celp_filters.h"
 #include "acelp_vectors.h"
 #include "acelp_pitch_delay.h"
 #include "acelp_filters.h"
diff --git a/libavcodec/sipr16kdata.h b/libavcodec/sipr16kdata.h
index 7677a69..16a653d 100644
--- a/libavcodec/sipr16kdata.h
+++ b/libavcodec/sipr16kdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sipr_parser.c b/libavcodec/sipr_parser.c
new file mode 100644
index 0000000..fba25e1
--- /dev/null
+++ b/libavcodec/sipr_parser.c
@@ -0,0 +1,74 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Sipr audio parser
+ */
+
+#include "parser.h"
+
+typedef struct SiprParserContext{
+    ParseContext pc;
+} SiprParserContext;
+
+static int sipr_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
+{
+    int next;
+
+    switch (avctx->block_align) {
+    case 20:
+    case 19:
+    case 29:
+    case 37: next = avctx->block_align; break;
+    default:
+        if      (avctx->bit_rate > 12200) next = 20;
+        else if (avctx->bit_rate > 7500 ) next = 19;
+        else if (avctx->bit_rate > 5750 ) next = 29;
+        else                              next = 37;
+    }
+
+    return FFMIN(next, buf_size);
+}
+
+static int sipr_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                      const uint8_t **poutbuf, int *poutbuf_size,
+                      const uint8_t *buf, int buf_size)
+{
+    SiprParserContext *s = s1->priv_data;
+    ParseContext *pc = &s->pc;
+    int next;
+
+    next = sipr_split(avctx, buf, buf_size);
+    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+        *poutbuf = NULL;
+        *poutbuf_size = 0;
+        return buf_size;
+    }
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_sipr_parser = {
+    .codec_ids      = { AV_CODEC_ID_SIPR },
+    .priv_data_size = sizeof(SiprParserContext),
+    .parser_parse   = sipr_parse,
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/siprdata.h b/libavcodec/siprdata.h
index 2644d59..0dbc113 100644
--- a/libavcodec/siprdata.h
+++ b/libavcodec/siprdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/smacker.c b/libavcodec/smacker.c
index 636e3b4..61e3169 100644
--- a/libavcodec/smacker.c
+++ b/libavcodec/smacker.c
@@ -2,20 +2,20 @@
  * Smacker decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,14 +35,14 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
-#include "vlc.h"
 
 #define SMKTREE_BITS 9
 #define SMK_NODE 0x80000000
+
 #define SMKTREE_DECODE_MAX_RECURSION 32
 #define SMKTREE_DECODE_BIG_MAX_RECURSION 500
 
@@ -95,16 +95,15 @@ enum SmkBlockTypes {
 /**
  * Decode local frame tree
  */
-static int smacker_decode_tree(BitstreamContext *bc, HuffContext *hc,
-                               uint32_t prefix, int length)
+static int smacker_decode_tree(GetBitContext *gb, HuffContext *hc, uint32_t prefix, int length)
 {
-    if (length > SMKTREE_DECODE_MAX_RECURSION) {
+    if (length > SMKTREE_DECODE_MAX_RECURSION || length > 3 * SMKTREE_BITS) {
         av_log(NULL, AV_LOG_ERROR, "Maximum tree recursion level exceeded.\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (!bitstream_read_bit(bc)) { // Leaf
-        if(hc->current >= 256){
+    if(!get_bits1(gb)){ //Leaf
+        if(hc->current >= hc->length){
             av_log(NULL, AV_LOG_ERROR, "Tree size exceeded!\n");
             return AVERROR_INVALIDDATA;
         }
@@ -115,7 +114,7 @@ static int smacker_decode_tree(BitstreamContext *bc, HuffContext *hc,
             hc->bits[hc->current] = 0;
             hc->lengths[hc->current] = 0;
         }
-        hc->values[hc->current] = bitstream_read(bc, 8);
+        hc->values[hc->current] = get_bits(gb, 8);
         hc->current++;
         if(hc->maxlength < length)
             hc->maxlength = length;
@@ -123,17 +122,17 @@ static int smacker_decode_tree(BitstreamContext *bc, HuffContext *hc,
     } else { //Node
         int r;
         length++;
-        r = smacker_decode_tree(bc, hc, prefix, length);
+        r = smacker_decode_tree(gb, hc, prefix, length);
         if(r)
             return r;
-        return smacker_decode_tree(bc, hc, prefix | (1 << (length - 1)), length);
+        return smacker_decode_tree(gb, hc, prefix | (1 << (length - 1)), length);
     }
 }
 
 /**
  * Decode header tree
  */
-static int smacker_decode_bigtree(BitstreamContext *bc, HuffContext *hc,
+static int smacker_decode_bigtree(GetBitContext *gb, HuffContext *hc,
                                   DBCtx *ctx, int length)
 {
     // Larger length can cause segmentation faults due to too deep recursion.
@@ -146,10 +145,10 @@ static int smacker_decode_bigtree(BitstreamContext *bc, HuffContext *hc,
         av_log(NULL, AV_LOG_ERROR, "Tree size exceeded!\n");
         return AVERROR_INVALIDDATA;
     }
-    if (!bitstream_read_bit(bc)) { // Leaf
+    if(!get_bits1(gb)){ //Leaf
         int val, i1, i2;
-        i1 = ctx->v1->table ? bitstream_read_vlc(bc, ctx->v1->table, SMKTREE_BITS, 3) : 0;
-        i2 = ctx->v2->table ? bitstream_read_vlc(bc, ctx->v2->table, SMKTREE_BITS, 3) : 0;
+        i1 = ctx->v1->table ? get_vlc2(gb, ctx->v1->table, SMKTREE_BITS, 3) : 0;
+        i2 = ctx->v2->table ? get_vlc2(gb, ctx->v2->table, SMKTREE_BITS, 3) : 0;
         if (i1 < 0 || i2 < 0)
             return AVERROR_INVALIDDATA;
         val = ctx->recode1[i1] | (ctx->recode2[i2] << 8);
@@ -170,12 +169,12 @@ static int smacker_decode_bigtree(BitstreamContext *bc, HuffContext *hc,
         int r = 0, r_new, t;
 
         t = hc->current++;
-        r = smacker_decode_bigtree(bc, hc, ctx, length + 1);
+        r = smacker_decode_bigtree(gb, hc, ctx, length + 1);
         if(r < 0)
             return r;
         hc->values[t] = SMK_NODE | r;
         r++;
-        r_new = smacker_decode_bigtree(bc, hc, ctx, length + 1);
+        r_new = smacker_decode_bigtree(gb, hc, ctx, length + 1);
         if (r_new < 0)
             return r_new;
         return r + r_new;
@@ -183,10 +182,9 @@ static int smacker_decode_bigtree(BitstreamContext *bc, HuffContext *hc,
 }
 
 /**
- * Store large tree as Libav's vlc codes
+ * Store large tree as FFmpeg's vlc codes
  */
-static int smacker_decode_header_tree(SmackVContext *smk, BitstreamContext *bc,
-                                      int **recodes, int *last, int size)
+static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int **recodes, int *last, int size)
 {
     int res;
     HuffContext huff;
@@ -220,41 +218,52 @@ static int smacker_decode_header_tree(SmackVContext *smk, BitstreamContext *bc,
         goto error;
     }
 
-    if (bitstream_read_bit(bc)) {
-        smacker_decode_tree(bc, &tmp1, 0, 0);
-        bitstream_skip(bc, 1);
-        res = init_vlc(&vlc[0], SMKTREE_BITS, tmp1.length,
-                    tmp1.lengths, sizeof(int), sizeof(int),
-                    tmp1.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
-        if(res < 0) {
-            av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+    if(get_bits1(gb)) {
+        res = smacker_decode_tree(gb, &tmp1, 0, 0);
+        if (res < 0) {
             err = res;
             goto error;
         }
-    } else {
+        skip_bits1(gb);
+        if(tmp1.current > 1) {
+            res = init_vlc(&vlc[0], SMKTREE_BITS, tmp1.length,
+                        tmp1.lengths, sizeof(int), sizeof(int),
+                        tmp1.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
+            if(res < 0) {
+                av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+                err = res;
+                goto error;
+            }
+        }
+    }
+    if (!vlc[0].table) {
         av_log(smk->avctx, AV_LOG_ERROR, "Skipping low bytes tree\n");
     }
-    if (bitstream_read_bit(bc)) {
-        smacker_decode_tree(bc, &tmp2, 0, 0);
-        bitstream_skip(bc, 1);
-        res = init_vlc(&vlc[1], SMKTREE_BITS, tmp2.length,
-                    tmp2.lengths, sizeof(int), sizeof(int),
-                    tmp2.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
-        if(res < 0) {
-            av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+    if(get_bits1(gb)){
+        res = smacker_decode_tree(gb, &tmp2, 0, 0);
+        if (res < 0) {
             err = res;
             goto error;
         }
-    } else {
+        skip_bits1(gb);
+        if(tmp2.current > 1) {
+            res = init_vlc(&vlc[1], SMKTREE_BITS, tmp2.length,
+                        tmp2.lengths, sizeof(int), sizeof(int),
+                        tmp2.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
+            if(res < 0) {
+                av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+                err = res;
+                goto error;
+            }
+        }
+    }
+    if (!vlc[1].table) {
         av_log(smk->avctx, AV_LOG_ERROR, "Skipping high bytes tree\n");
     }
 
-    escapes[0]  = bitstream_read(bc, 8);
-    escapes[0] |= bitstream_read(bc, 8) << 8;
-    escapes[1]  = bitstream_read(bc, 8);
-    escapes[1] |= bitstream_read(bc, 8) << 8;
-    escapes[2]  = bitstream_read(bc, 8);
-    escapes[2] |= bitstream_read(bc, 8) << 8;
+    escapes[0]  = get_bits(gb, 16);
+    escapes[1]  = get_bits(gb, 16);
+    escapes[2]  = get_bits(gb, 16);
 
     last[0] = last[1] = last[2] = -1;
 
@@ -270,15 +279,16 @@ static int smacker_decode_header_tree(SmackVContext *smk, BitstreamContext *bc,
     huff.length = ((size + 3) >> 2) + 4;
     huff.maxlength = 0;
     huff.current = 0;
-    huff.values = av_mallocz(huff.length * sizeof(int));
+    huff.values = av_mallocz_array(huff.length, sizeof(int));
     if (!huff.values) {
         err = AVERROR(ENOMEM);
         goto error;
     }
 
-    if ((res = smacker_decode_bigtree(bc, &huff, &ctx, 0)) < 0)
+    res = smacker_decode_bigtree(gb, &huff, &ctx, 0);
+    if (res < 0)
         err = res;
-    bitstream_skip(bc, 1);
+    skip_bits1(gb);
     if(ctx.last[0] == -1) ctx.last[0] = huff.current++;
     if(ctx.last[1] == -1) ctx.last[1] = huff.current++;
     if(ctx.last[2] == -1) ctx.last[2] = huff.current++;
@@ -307,7 +317,7 @@ error:
 }
 
 static int decode_header_trees(SmackVContext *smk) {
-    BitstreamContext bc;
+    GetBitContext gb;
     int mmap_size, mclr_size, full_size, type_size, ret;
 
     mmap_size = AV_RL32(smk->avctx->extradata);
@@ -315,9 +325,11 @@ static int decode_header_trees(SmackVContext *smk) {
     full_size = AV_RL32(smk->avctx->extradata + 8);
     type_size = AV_RL32(smk->avctx->extradata + 12);
 
-    bitstream_init8(&bc, smk->avctx->extradata + 16, smk->avctx->extradata_size - 16);
+    ret = init_get_bits8(&gb, smk->avctx->extradata + 16, smk->avctx->extradata_size - 16);
+    if (ret < 0)
+        return ret;
 
-    if (!bitstream_read_bit(&bc)) {
+    if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping MMAP tree\n");
         smk->mmap_tbl = av_malloc(sizeof(int) * 2);
         if (!smk->mmap_tbl)
@@ -325,10 +337,11 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->mmap_tbl[0] = 0;
         smk->mmap_last[0] = smk->mmap_last[1] = smk->mmap_last[2] = 1;
     } else {
-        if ((ret = smacker_decode_header_tree(smk, &bc, &smk->mmap_tbl, smk->mmap_last, mmap_size)) < 0)
+        ret = smacker_decode_header_tree(smk, &gb, &smk->mmap_tbl, smk->mmap_last, mmap_size);
+        if (ret < 0)
             return ret;
     }
-    if (!bitstream_read_bit(&bc)) {
+    if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping MCLR tree\n");
         smk->mclr_tbl = av_malloc(sizeof(int) * 2);
         if (!smk->mclr_tbl)
@@ -336,10 +349,11 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->mclr_tbl[0] = 0;
         smk->mclr_last[0] = smk->mclr_last[1] = smk->mclr_last[2] = 1;
     } else {
-        if ((ret = smacker_decode_header_tree(smk, &bc, &smk->mclr_tbl, smk->mclr_last, mclr_size)) < 0)
+        ret = smacker_decode_header_tree(smk, &gb, &smk->mclr_tbl, smk->mclr_last, mclr_size);
+        if (ret < 0)
             return ret;
     }
-    if (!bitstream_read_bit(&bc)) {
+    if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping FULL tree\n");
         smk->full_tbl = av_malloc(sizeof(int) * 2);
         if (!smk->full_tbl)
@@ -347,10 +361,11 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->full_tbl[0] = 0;
         smk->full_last[0] = smk->full_last[1] = smk->full_last[2] = 1;
     } else {
-        if ((ret = smacker_decode_header_tree(smk, &bc, &smk->full_tbl, smk->full_last, full_size)) < 0)
+        ret = smacker_decode_header_tree(smk, &gb, &smk->full_tbl, smk->full_last, full_size);
+        if (ret < 0)
             return ret;
     }
-    if (!bitstream_read_bit(&bc)) {
+    if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping TYPE tree\n");
         smk->type_tbl = av_malloc(sizeof(int) * 2);
         if (!smk->type_tbl)
@@ -358,7 +373,8 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->type_tbl[0] = 0;
         smk->type_last[0] = smk->type_last[1] = smk->type_last[2] = 1;
     } else {
-        if ((ret = smacker_decode_header_tree(smk, &bc, &smk->type_tbl, smk->type_last, type_size)) < 0)
+        ret = smacker_decode_header_tree(smk, &gb, &smk->type_tbl, smk->type_last, type_size);
+        if (ret < 0)
             return ret;
     }
 
@@ -370,14 +386,12 @@ static av_always_inline void last_reset(int *recode, int *last) {
 }
 
 /* get code and update history */
-static av_always_inline int smk_get_code(BitstreamContext *bc, int *recode,
-                                         int *last)
-{
+static av_always_inline int smk_get_code(GetBitContext *gb, int *recode, int *last) {
     register int *table = recode;
     int v;
 
     while(*table & SMK_NODE) {
-        if (bitstream_read_bit(bc))
+        if(get_bits1(gb))
             table += (*table) & (~SMK_NODE);
         table++;
     }
@@ -398,19 +412,17 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     uint8_t *out;
     uint32_t *pal;
     GetByteContext gb2;
-    BitstreamContext bc;
+    GetBitContext gb;
     int blocks, blk, bw, bh;
     int i, ret;
     int stride;
     int flags;
 
     if (avpkt->size <= 769)
-        return 0;
+        return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_reget_buffer(avctx, smk->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, smk->pic)) < 0)
         return ret;
-    }
 
     /* make the palette available on the way out */
     pal = (uint32_t*)smk->pic->data[1];
@@ -418,39 +430,39 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     flags = bytestream2_get_byteu(&gb2);
     smk->pic->palette_has_changed = flags & 1;
     smk->pic->key_frame = !!(flags & 2);
-    if(smk->pic->key_frame)
+    if (smk->pic->key_frame)
         smk->pic->pict_type = AV_PICTURE_TYPE_I;
     else
         smk->pic->pict_type = AV_PICTURE_TYPE_P;
 
     for(i = 0; i < 256; i++)
-        *pal++ = bytestream2_get_be24u(&gb2);
+        *pal++ = 0xFFU << 24 | bytestream2_get_be24u(&gb2);
 
     last_reset(smk->mmap_tbl, smk->mmap_last);
     last_reset(smk->mclr_tbl, smk->mclr_last);
     last_reset(smk->full_tbl, smk->full_last);
     last_reset(smk->type_tbl, smk->type_last);
-    bitstream_init8(&bc, avpkt->data + 769, avpkt->size - 769);
+    if ((ret = init_get_bits8(&gb, avpkt->data + 769, avpkt->size - 769)) < 0)
+        return ret;
 
     blk = 0;
     bw = avctx->width >> 2;
     bh = avctx->height >> 2;
     blocks = bw * bh;
-    out = smk->pic->data[0];
     stride = smk->pic->linesize[0];
     while(blk < blocks) {
         int type, run, mode;
         uint16_t pix;
 
-        type = smk_get_code(&bc, smk->type_tbl, smk->type_last);
+        type = smk_get_code(&gb, smk->type_tbl, smk->type_last);
         run = block_runs[(type >> 2) & 0x3F];
         switch(type & 3){
         case SMK_BLK_MONO:
             while(run-- && blk < blocks){
                 int clr, map;
                 int hi, lo;
-                clr = smk_get_code(&bc, smk->mclr_tbl, smk->mclr_last);
-                map = smk_get_code(&bc, smk->mmap_tbl, smk->mmap_last);
+                clr = smk_get_code(&gb, smk->mclr_tbl, smk->mclr_last);
+                map = smk_get_code(&gb, smk->mmap_tbl, smk->mmap_last);
                 out = smk->pic->data[0] + (blk / bw) * (stride * 4) + (blk % bw) * 4;
                 hi = clr >> 8;
                 lo = clr & 0xFF;
@@ -468,44 +480,41 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         case SMK_BLK_FULL:
             mode = 0;
             if(avctx->codec_tag == MKTAG('S', 'M', 'K', '4')) { // In case of Smacker v4 we have three modes
-                if (bitstream_read_bit(&bc))
-                    mode = 1;
-                else if (bitstream_read_bit(&bc))
-                    mode = 2;
+                if(get_bits1(&gb)) mode = 1;
+                else if(get_bits1(&gb)) mode = 2;
             }
             while(run-- && blk < blocks){
                 out = smk->pic->data[0] + (blk / bw) * (stride * 4) + (blk % bw) * 4;
                 switch(mode){
                 case 0:
                     for(i = 0; i < 4; i++) {
-                        pix = smk_get_code(&bc, smk->full_tbl, smk->full_last);
+                        pix = smk_get_code(&gb, smk->full_tbl, smk->full_last);
                         AV_WL16(out+2,pix);
-                        pix = smk_get_code(&bc, smk->full_tbl, smk->full_last);
+                        pix = smk_get_code(&gb, smk->full_tbl, smk->full_last);
                         AV_WL16(out,pix);
                         out += stride;
                     }
                     break;
                 case 1:
-                    pix = smk_get_code(&bc, smk->full_tbl, smk->full_last);
+                    pix = smk_get_code(&gb, smk->full_tbl, smk->full_last);
                     out[0] = out[1] = pix & 0xFF;
                     out[2] = out[3] = pix >> 8;
                     out += stride;
                     out[0] = out[1] = pix & 0xFF;
                     out[2] = out[3] = pix >> 8;
                     out += stride;
-                    pix = smk_get_code(&bc, smk->full_tbl, smk->full_last);
+                    pix = smk_get_code(&gb, smk->full_tbl, smk->full_last);
                     out[0] = out[1] = pix & 0xFF;
                     out[2] = out[3] = pix >> 8;
                     out += stride;
                     out[0] = out[1] = pix & 0xFF;
                     out[2] = out[3] = pix >> 8;
-                    out += stride;
                     break;
                 case 2:
                     for(i = 0; i < 2; i++) {
                         uint16_t pix1, pix2;
-                        pix2 = smk_get_code(&bc, smk->full_tbl, smk->full_last);
-                        pix1 = smk_get_code(&bc, smk->full_tbl, smk->full_last);
+                        pix2 = smk_get_code(&gb, smk->full_tbl, smk->full_last);
+                        pix1 = smk_get_code(&gb, smk->full_tbl, smk->full_last);
                         AV_WL16(out,pix1);
                         AV_WL16(out+2,pix2);
                         out += stride;
@@ -580,10 +589,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
     /* decode huffman trees from extradata */
     if(avctx->extradata_size < 16){
         av_log(avctx, AV_LOG_ERROR, "Extradata missing!\n");
-        return AVERROR_INVALIDDATA;
+        decode_end(avctx);
+        return AVERROR(EINVAL);
     }
 
-    if ((ret = decode_header_trees(c))) {
+    ret = decode_header_trees(c);
+    if (ret < 0) {
         decode_end(avctx);
         return ret;
     }
@@ -592,7 +603,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
 }
 
 
-
 static av_cold int smka_decode_init(AVCodecContext *avctx)
 {
     if (avctx->channels < 1 || avctx->channels > 2) {
@@ -614,7 +624,7 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
-    BitstreamContext bc;
+    GetBitContext gb;
     HuffContext h[4] = { { 0 } };
     VLC vlc[4]       = { { 0 } };
     int16_t *samples;
@@ -632,35 +642,39 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
 
     unp_size = AV_RL32(buf);
 
-    bitstream_init8(&bc, buf + 4, buf_size - 4);
+    if (unp_size > (1U<<24)) {
+        av_log(avctx, AV_LOG_ERROR, "packet is too big\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = init_get_bits8(&gb, buf + 4, buf_size - 4)) < 0)
+        return ret;
 
-    if (!bitstream_read_bit(&bc)) {
+    if(!get_bits1(&gb)){
         av_log(avctx, AV_LOG_INFO, "Sound: no data\n");
         *got_frame_ptr = 0;
         return 1;
     }
-    stereo = bitstream_read_bit(&bc);
-    bits   = bitstream_read_bit(&bc);
+    stereo = get_bits1(&gb);
+    bits = get_bits1(&gb);
     if (stereo ^ (avctx->channels != 1)) {
         av_log(avctx, AV_LOG_ERROR, "channels mismatch\n");
         return AVERROR_INVALIDDATA;
     }
-    if (bits && avctx->sample_fmt == AV_SAMPLE_FMT_U8) {
+    if (bits == (avctx->sample_fmt == AV_SAMPLE_FMT_U8)) {
         av_log(avctx, AV_LOG_ERROR, "sample format mismatch\n");
         return AVERROR_INVALIDDATA;
     }
+
+    /* get output buffer */
+    frame->nb_samples = unp_size / (avctx->channels * (bits + 1));
     if (unp_size % (avctx->channels * (bits + 1))) {
         av_log(avctx, AV_LOG_ERROR,
                "The buffer does not contain an integer number of samples\n");
         return AVERROR_INVALIDDATA;
     }
-
-    /* get output buffer */
-    frame->nb_samples = unp_size / (avctx->channels * (bits + 1));
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples  = (int16_t *)frame->data[0];
     samples8 =            frame->data[0];
 
@@ -676,12 +690,12 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
             ret = AVERROR(ENOMEM);
             goto error;
         }
-        bitstream_skip(&bc, 1);
-        if (smacker_decode_tree(&bc, &h[i], 0, 0) < 0) {
+        skip_bits1(&gb);
+        if (smacker_decode_tree(&gb, &h[i], 0, 0) < 0) {
             ret = AVERROR_INVALIDDATA;
             goto error;
         }
-        bitstream_skip(&bc, 1);
+        skip_bits1(&gb);
         if(h[i].current > 1) {
             res = init_vlc(&vlc[i], SMKTREE_BITS, h[i].length,
                     h[i].lengths, sizeof(int), sizeof(int),
@@ -696,33 +710,51 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
     /* this codec relies on wraparound instead of clipping audio */
     if(bits) { //decode 16-bit data
         for(i = stereo; i >= 0; i--)
-            pred[i] = sign_extend(av_bswap16(bitstream_read(&bc, 16)), 16);
+            pred[i] = sign_extend(av_bswap16(get_bits(&gb, 16)), 16);
         for(i = 0; i <= stereo; i++)
             *samples++ = pred[i];
         for(; i < unp_size / 2; i++) {
+            if(get_bits_left(&gb)<0)
+                return AVERROR_INVALIDDATA;
             if(i & stereo) {
                 if(vlc[2].table)
-                    res = bitstream_read_vlc(&bc, vlc[2].table, SMKTREE_BITS, 3);
+                    res = get_vlc2(&gb, vlc[2].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val  = h[2].values[res];
                 if(vlc[3].table)
-                    res = bitstream_read_vlc(&bc, vlc[3].table, SMKTREE_BITS, 3);
+                    res = get_vlc2(&gb, vlc[3].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val |= h[3].values[res] << 8;
                 pred[1] += sign_extend(val, 16);
                 *samples++ = pred[1];
             } else {
                 if(vlc[0].table)
-                    res = bitstream_read_vlc(&bc, vlc[0].table, SMKTREE_BITS, 3);
+                    res = get_vlc2(&gb, vlc[0].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val  = h[0].values[res];
                 if(vlc[1].table)
-                    res = bitstream_read_vlc(&bc, vlc[1].table, SMKTREE_BITS, 3);
+                    res = get_vlc2(&gb, vlc[1].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val |= h[1].values[res] << 8;
                 pred[0] += sign_extend(val, 16);
                 *samples++ = pred[0];
@@ -730,22 +762,32 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         }
     } else { //8-bit data
         for(i = stereo; i >= 0; i--)
-            pred[i] = bitstream_read(&bc, 8);
+            pred[i] = get_bits(&gb, 8);
         for(i = 0; i <= stereo; i++)
             *samples8++ = pred[i];
         for(; i < unp_size; i++) {
+            if(get_bits_left(&gb)<0)
+                return AVERROR_INVALIDDATA;
             if(i & stereo){
                 if(vlc[1].table)
-                    res = bitstream_read_vlc(&bc, vlc[1].table, SMKTREE_BITS, 3);
+                    res = get_vlc2(&gb, vlc[1].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 pred[1] += sign_extend(h[1].values[res], 8);
                 *samples8++ = pred[1];
             } else {
                 if(vlc[0].table)
-                    res = bitstream_read_vlc(&bc, vlc[0].table, SMKTREE_BITS, 3);
+                    res = get_vlc2(&gb, vlc[0].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 pred[0] += sign_extend(h[0].values[res], 8);
                 *samples8++ = pred[0];
             }
diff --git a/libavcodec/smc.c b/libavcodec/smc.c
index 92e522b..3cb4834 100644
--- a/libavcodec/smc.c
+++ b/libavcodec/smc.c
@@ -2,20 +2,20 @@
  * Quicktime Graphics (SMC) Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -84,7 +84,7 @@ static void smc_decode_stream(SmcContext *s)
     int stride = s->frame->linesize[0];
     int i;
     int chunk_size;
-    int buf_size = (int) (s->gb.buffer_end - s->gb.buffer_start);
+    int buf_size = bytestream2_size(&s->gb);
     unsigned char opcode;
     int n_blocks;
     unsigned int color_flags;
@@ -92,7 +92,7 @@ static void smc_decode_stream(SmcContext *s)
     unsigned int color_flags_b;
     unsigned int flag_mask;
 
-    unsigned char *pixels = s->frame->data[0];
+    unsigned char * const pixels = s->frame->data[0];
 
     int image_size = height * s->frame->linesize[0];
     int row_ptr = 0;
@@ -132,6 +132,10 @@ static void smc_decode_stream(SmcContext *s)
                 row_ptr, image_size);
             return;
         }
+        if (bytestream2_get_bytes_left(&s->gb) < 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "input too small\n");
+            return;
+        }
 
         opcode = bytestream2_get_byte(&s->gb);
         switch (opcode & 0xF0) {
@@ -431,19 +435,24 @@ static int smc_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     SmcContext *s = avctx->priv_data;
-    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
+    int pal_size;
+    const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &pal_size);
     int ret;
+    int total_blocks = ((s->avctx->width + 3) / 4) * ((s->avctx->height + 3) / 4);
+
+    if (total_blocks / 1024 > avpkt->size)
+        return AVERROR_INVALIDDATA;
 
     bytestream2_init(&s->gb, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
-    if (pal) {
+    if (pal && pal_size == AVPALETTE_SIZE) {
         s->frame->palette_has_changed = 1;
         memcpy(s->pal, pal, AVPALETTE_SIZE);
+    } else if (pal) {
+        av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", pal_size);
     }
 
     smc_decode_stream(s);
diff --git a/libavcodec/smvjpegdec.c b/libavcodec/smvjpegdec.c
new file mode 100644
index 0000000..7ea82eb
--- /dev/null
+++ b/libavcodec/smvjpegdec.c
@@ -0,0 +1,225 @@
+/*
+ * SMV JPEG decoder
+ * Copyright (c) 2013 Ash Hughes
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SMV JPEG decoder.
+ */
+
+// #define DEBUG
+#include "avcodec.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "mjpegdec.h"
+#include "internal.h"
+
+typedef struct SMVJpegDecodeContext {
+    MJpegDecodeContext jpg;
+    AVFrame *picture[2]; /* pictures array */
+    AVCodecContext* avctx;
+    int frames_per_jpeg;
+    int mjpeg_data_size;
+} SMVJpegDecodeContext;
+
+static inline void smv_img_pnt_plane(uint8_t      **dst, uint8_t *src,
+                                     int src_linesize, int height, int nlines)
+{
+    if (!dst || !src)
+        return;
+    src += (nlines) * src_linesize * height;
+    *dst = src;
+}
+
+static inline void smv_img_pnt(uint8_t *dst_data[4], uint8_t *src_data[4],
+                               const int src_linesizes[4],
+                               enum AVPixelFormat pix_fmt, int width, int height,
+                               int nlines)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    int i, planes_nb = 0;
+
+    if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL)
+        return;
+
+    for (i = 0; i < desc->nb_components; i++)
+        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+
+    for (i = 0; i < planes_nb; i++) {
+        int h = height;
+        if (i == 1 || i == 2) {
+            h = AV_CEIL_RSHIFT(height, desc->log2_chroma_h);
+        }
+        smv_img_pnt_plane(&dst_data[i], src_data[i],
+            src_linesizes[i], h, nlines);
+    }
+    if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
+        desc->flags & FF_PSEUDOPAL)
+        dst_data[1] = src_data[1];
+}
+
+static av_cold int smvjpeg_decode_end(AVCodecContext *avctx)
+{
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    MJpegDecodeContext *jpg = &s->jpg;
+    int ret;
+
+    jpg->picture_ptr = NULL;
+    av_frame_free(&s->picture[0]);
+    av_frame_free(&s->picture[1]);
+    ret = avcodec_close(s->avctx);
+    av_freep(&s->avctx);
+    return ret;
+}
+
+static av_cold int smvjpeg_decode_init(AVCodecContext *avctx)
+{
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    AVCodec *codec;
+    AVDictionary *thread_opt = NULL;
+    int ret = 0, r;
+
+    s->frames_per_jpeg = 0;
+
+    s->picture[0] = av_frame_alloc();
+    if (!s->picture[0])
+        return AVERROR(ENOMEM);
+
+    s->picture[1] = av_frame_alloc();
+    if (!s->picture[1]) {
+        av_frame_free(&s->picture[0]);
+        return AVERROR(ENOMEM);
+    }
+
+    s->jpg.picture_ptr      = s->picture[0];
+
+    if (avctx->extradata_size >= 4)
+        s->frames_per_jpeg = AV_RL32(avctx->extradata);
+
+    if (s->frames_per_jpeg <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of frames per jpeg.\n");
+        ret = AVERROR_INVALIDDATA;
+    }
+
+    codec = avcodec_find_decoder(AV_CODEC_ID_MJPEG);
+    if (!codec) {
+        av_log(avctx, AV_LOG_ERROR, "MJPEG codec not found\n");
+        smvjpeg_decode_end(avctx);
+        return AVERROR_DECODER_NOT_FOUND;
+    }
+
+    s->avctx = avcodec_alloc_context3(codec);
+
+    av_dict_set(&thread_opt, "threads", "1", 0);
+    s->avctx->refcounted_frames = 1;
+    s->avctx->flags = avctx->flags;
+    s->avctx->idct_algo = avctx->idct_algo;
+    if ((r = ff_codec_open2_recursive(s->avctx, codec, &thread_opt)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "MJPEG codec failed to open\n");
+        ret = r;
+    }
+    av_dict_free(&thread_opt);
+
+    if (ret < 0)
+        smvjpeg_decode_end(avctx);
+    return ret;
+}
+
+static int smvjpeg_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
+                            AVPacket *avpkt)
+{
+    const AVPixFmtDescriptor *desc;
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    AVFrame* mjpeg_data = s->picture[0];
+    int i, cur_frame = 0, ret = 0;
+
+    cur_frame = avpkt->pts % s->frames_per_jpeg;
+
+    /* cur_frame is later used to calculate the buffer offset, so it mustn't be negative */
+    if (cur_frame < 0)
+        cur_frame += s->frames_per_jpeg;
+
+    /* Are we at the start of a block? */
+    if (!cur_frame) {
+        av_frame_unref(mjpeg_data);
+        ret = avcodec_decode_video2(s->avctx, mjpeg_data, &s->mjpeg_data_size, avpkt);
+        if (ret < 0) {
+            s->mjpeg_data_size = 0;
+            return ret;
+        }
+    } else if (!s->mjpeg_data_size)
+        return AVERROR(EINVAL);
+
+    desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
+    av_assert0(desc);
+
+    if (mjpeg_data->height % (s->frames_per_jpeg << desc->log2_chroma_h)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid height\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*use the last lot... */
+    *data_size = s->mjpeg_data_size;
+
+    avctx->pix_fmt = s->avctx->pix_fmt;
+
+    /* We shouldn't get here if frames_per_jpeg <= 0 because this was rejected
+       in init */
+    ret = ff_set_dimensions(avctx, mjpeg_data->width, mjpeg_data->height / s->frames_per_jpeg);
+    if (ret < 0) {
+        av_log(s, AV_LOG_ERROR, "Failed to set dimensions\n");
+        return ret;
+    }
+
+    if (*data_size) {
+        s->picture[1]->extended_data = NULL;
+        s->picture[1]->width         = avctx->width;
+        s->picture[1]->height        = avctx->height;
+        s->picture[1]->format        = avctx->pix_fmt;
+        smv_img_pnt(s->picture[1]->data, mjpeg_data->data, mjpeg_data->linesize,
+                    avctx->pix_fmt, avctx->width, avctx->height, cur_frame);
+        for (i = 0; i < AV_NUM_DATA_POINTERS; i++)
+            s->picture[1]->linesize[i] = mjpeg_data->linesize[i];
+
+        ret = av_frame_ref(data, s->picture[1]);
+        if (ret < 0)
+            return ret;
+    }
+
+    return avpkt->size;
+}
+
+static const AVClass smvjpegdec_class = {
+    .class_name = "SMVJPEG decoder",
+    .item_name  = av_default_item_name,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_smvjpeg_decoder = {
+    .name           = "smvjpeg",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMV JPEG"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SMVJPEG,
+    .priv_data_size = sizeof(SMVJpegDecodeContext),
+    .init           = smvjpeg_decode_init,
+    .close          = smvjpeg_decode_end,
+    .decode         = smvjpeg_decode_frame,
+    .priv_class     = &smvjpegdec_class,
+};
diff --git a/libavcodec/snappy.c b/libavcodec/snappy.c
index df6c6b3..7900b0f 100644
--- a/libavcodec/snappy.c
+++ b/libavcodec/snappy.c
@@ -2,20 +2,20 @@
  * Snappy decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -148,7 +148,7 @@ int ff_snappy_uncompress(GetByteContext *gb, uint8_t *buf, int64_t *size)
         return len;
 
     if (len > *size)
-        return AVERROR_BUG;
+        return AVERROR_BUFFER_TOO_SMALL;
 
     *size = len;
     p     = buf;
diff --git a/libavcodec/snappy.h b/libavcodec/snappy.h
index 8d365c0..a65cb3a 100644
--- a/libavcodec/snappy.h
+++ b/libavcodec/snappy.h
@@ -2,20 +2,20 @@
  * Snappy module
  * Copyright (c) Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
new file mode 100644
index 0000000..a3e6afc
--- /dev/null
+++ b/libavcodec/snow.c
@@ -0,0 +1,733 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "me_cmp.h"
+#include "snow_dwt.h"
+#include "internal.h"
+#include "snow.h"
+#include "snowdata.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+#include "h263.h"
+
+
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    int y, x;
+    IDWTELEM * dst;
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+        const uint8_t *obmc1= obmc + y*obmc_stride;
+        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        dst = slice_buffer_get_line(sb, src_y + y);
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride]
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+
+            v <<= 8 - LOG2_OBMC_MAX;
+            if(FRAC_BITS != 8){
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+                v += dst[x + src_x];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                if(v&(~255)) v= ~(v>>31);
+                dst8[x + y*src_stride] = v;
+            }else{
+                dst[x + src_x] -= v;
+            }
+        }
+    }
+}
+
+int ff_snow_get_buffer(SnowContext *s, AVFrame *frame)
+{
+    int ret, i;
+    int edges_needed = av_codec_is_encoder(s->avctx->codec);
+
+    frame->width  = s->avctx->width ;
+    frame->height = s->avctx->height;
+    if (edges_needed) {
+        frame->width  += 2 * EDGE_WIDTH;
+        frame->height += 2 * EDGE_WIDTH;
+    }
+    if ((ret = ff_get_buffer(s->avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+    if (edges_needed) {
+        for (i = 0; frame->data[i]; i++) {
+            int offset = (EDGE_WIDTH >> (i ? s->chroma_v_shift : 0)) *
+                            frame->linesize[i] +
+                            (EDGE_WIDTH >> (i ? s->chroma_h_shift : 0));
+            frame->data[i] += offset;
+        }
+        frame->width  = s->avctx->width;
+        frame->height = s->avctx->height;
+    }
+
+    return 0;
+}
+
+void ff_snow_reset_contexts(SnowContext *s){ //FIXME better initial contexts
+    int plane_index, level, orientation;
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        for(level=0; level<MAX_DECOMPOSITIONS; level++){
+            for(orientation=level ? 1:0; orientation<4; orientation++){
+                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
+            }
+        }
+    }
+    memset(s->header_state, MID_STATE, sizeof(s->header_state));
+    memset(s->block_state, MID_STATE, sizeof(s->block_state));
+}
+
+int ff_snow_alloc_blocks(SnowContext *s){
+    int w= AV_CEIL_RSHIFT(s->avctx->width,  LOG2_MB_SIZE);
+    int h= AV_CEIL_RSHIFT(s->avctx->height, LOG2_MB_SIZE);
+
+    s->b_width = w;
+    s->b_height= h;
+
+    av_free(s->block);
+    s->block= av_mallocz_array(w * h,  sizeof(BlockNode) << (s->block_max_depth*2));
+    if (!s->block)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold void init_qexp(void){
+    int i;
+    double v=128;
+
+    for(i=0; i<QROOT; i++){
+        ff_qexp[i]= lrintf(v);
+        v *= pow(2, 1.0 / QROOT);
+    }
+}
+static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, int stride, int b_w, int b_h, int dx, int dy){
+    static const uint8_t weight[64]={
+    8,7,6,5,4,3,2,1,
+    7,7,0,0,0,0,0,1,
+    6,0,6,0,0,0,2,0,
+    5,0,0,5,0,3,0,0,
+    4,0,0,0,4,0,0,0,
+    3,0,0,5,0,3,0,0,
+    2,0,6,0,0,0,2,0,
+    1,7,0,0,0,0,0,1,
+    };
+
+    static const uint8_t brane[256]={
+    0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+    0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
+    0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
+    0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
+    0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
+    0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
+    0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
+    0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
+    0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
+    0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
+    0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
+    0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
+    0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
+    0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
+    0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
+    0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
+    };
+
+    static const uint8_t needs[16]={
+    0,1,0,0,
+    2,4,2,0,
+    0,1,0,0,
+    15
+    };
+
+    int x, y, b, r, l;
+    int16_t tmpIt   [64*(32+HTAPS_MAX)];
+    uint8_t tmp2t[3][64*(32+HTAPS_MAX)];
+    int16_t *tmpI= tmpIt;
+    uint8_t *tmp2= tmp2t[0];
+    const uint8_t *hpel[11];
+    av_assert2(dx<16 && dy<16);
+    r= brane[dx + 16*dy]&15;
+    l= brane[dx + 16*dy]>>4;
+
+    b= needs[l] | needs[r];
+    if(p && !p->diag_mc)
+        b= 15;
+
+    if(b&5){
+        for(y=0; y < b_h+HTAPS_MAX-1; y++){
+            for(x=0; x < b_w; x++){
+                int a_1=src[x + HTAPS_MAX/2-4];
+                int a0= src[x + HTAPS_MAX/2-3];
+                int a1= src[x + HTAPS_MAX/2-2];
+                int a2= src[x + HTAPS_MAX/2-1];
+                int a3= src[x + HTAPS_MAX/2+0];
+                int a4= src[x + HTAPS_MAX/2+1];
+                int a5= src[x + HTAPS_MAX/2+2];
+                int a6= src[x + HTAPS_MAX/2+3];
+                int am=0;
+                if(!p || p->fast_mc){
+                    am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+                    tmpI[x]= am;
+                    am= (am+16)>>5;
+                }else{
+                    am= p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6);
+                    tmpI[x]= am;
+                    am= (am+32)>>6;
+                }
+
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= 64;
+            src += stride;
+        }
+        src -= stride*y;
+    }
+    src += HTAPS_MAX/2 - 1;
+    tmp2= tmp2t[1];
+
+    if(b&2){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w+1; x++){
+                int a_1=src[x + (HTAPS_MAX/2-4)*stride];
+                int a0= src[x + (HTAPS_MAX/2-3)*stride];
+                int a1= src[x + (HTAPS_MAX/2-2)*stride];
+                int a2= src[x + (HTAPS_MAX/2-1)*stride];
+                int a3= src[x + (HTAPS_MAX/2+0)*stride];
+                int a4= src[x + (HTAPS_MAX/2+1)*stride];
+                int a5= src[x + (HTAPS_MAX/2+2)*stride];
+                int a6= src[x + (HTAPS_MAX/2+3)*stride];
+                int am=0;
+                if(!p || p->fast_mc)
+                    am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
+                else
+                    am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 32)>>6;
+
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            src += stride;
+            tmp2+= 64;
+        }
+        src -= stride*y;
+    }
+    src += stride*(HTAPS_MAX/2 - 1);
+    tmp2= tmp2t[2];
+    tmpI= tmpIt;
+    if(b&4){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
+                int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
+                int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
+                int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
+                int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
+                int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
+                int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
+                int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
+                int am=0;
+                if(!p || p->fast_mc)
+                    am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
+                else
+                    am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 2048)>>12;
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= 64;
+        }
+    }
+
+    hpel[ 0]= src;
+    hpel[ 1]= tmp2t[0] + 64*(HTAPS_MAX/2-1);
+    hpel[ 2]= src + 1;
+
+    hpel[ 4]= tmp2t[1];
+    hpel[ 5]= tmp2t[2];
+    hpel[ 6]= tmp2t[1] + 1;
+
+    hpel[ 8]= src + stride;
+    hpel[ 9]= hpel[1] + 64;
+    hpel[10]= hpel[8] + 1;
+
+#define MC_STRIDE(x) (needs[x] ? 64 : stride)
+
+    if(b==15){
+        int dxy = dx / 8 + dy / 8 * 4;
+        const uint8_t *src1 = hpel[dxy    ];
+        const uint8_t *src2 = hpel[dxy + 1];
+        const uint8_t *src3 = hpel[dxy + 4];
+        const uint8_t *src4 = hpel[dxy + 5];
+        int stride1 = MC_STRIDE(dxy);
+        int stride2 = MC_STRIDE(dxy + 1);
+        int stride3 = MC_STRIDE(dxy + 4);
+        int stride4 = MC_STRIDE(dxy + 5);
+        dx&=7;
+        dy&=7;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
+                         (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
+            }
+            src1+=stride1;
+            src2+=stride2;
+            src3+=stride3;
+            src4+=stride4;
+            dst +=stride;
+        }
+    }else{
+        const uint8_t *src1= hpel[l];
+        const uint8_t *src2= hpel[r];
+        int stride1 = MC_STRIDE(l);
+        int stride2 = MC_STRIDE(r);
+        int a= weight[((dx&7) + (8*(dy&7)))];
+        int b= 8-a;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
+            }
+            src1+=stride1;
+            src2+=stride2;
+            dst +=stride;
+        }
+    }
+}
+
+void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t stride, int sx, int sy, int b_w, int b_h, const BlockNode *block, int plane_index, int w, int h){
+    if(block->type & BLOCK_INTRA){
+        int x, y;
+        const unsigned color  = block->color[plane_index];
+        const unsigned color4 = color*0x01010101;
+        if(b_w==32){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+                *(uint32_t*)&dst[8 + y*stride]= color4;
+                *(uint32_t*)&dst[12+ y*stride]= color4;
+                *(uint32_t*)&dst[16+ y*stride]= color4;
+                *(uint32_t*)&dst[20+ y*stride]= color4;
+                *(uint32_t*)&dst[24+ y*stride]= color4;
+                *(uint32_t*)&dst[28+ y*stride]= color4;
+            }
+        }else if(b_w==16){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+                *(uint32_t*)&dst[8 + y*stride]= color4;
+                *(uint32_t*)&dst[12+ y*stride]= color4;
+            }
+        }else if(b_w==8){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+            }
+        }else if(b_w==4){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+            }
+        }else{
+            for(y=0; y < b_h; y++){
+                for(x=0; x < b_w; x++){
+                    dst[x + y*stride]= color;
+                }
+            }
+        }
+    }else{
+        uint8_t *src= s->last_picture[block->ref]->data[plane_index];
+        const int scale= plane_index ?  (2*s->mv_scale)>>s->chroma_h_shift : 2*s->mv_scale;
+        int mx= block->mx*scale;
+        int my= block->my*scale;
+        const int dx= mx&15;
+        const int dy= my&15;
+        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
+        sx += (mx>>4) - (HTAPS_MAX/2-1);
+        sy += (my>>4) - (HTAPS_MAX/2-1);
+        src += sx + sy*stride;
+        if(   (unsigned)sx >= FFMAX(w - b_w - (HTAPS_MAX-2), 0)
+           || (unsigned)sy >= FFMAX(h - b_h - (HTAPS_MAX-2), 0)){
+            s->vdsp.emulated_edge_mc(tmp + MB_SIZE, src,
+                                     stride, stride,
+                                     b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1,
+                                     sx, sy, w, h);
+            src= tmp + MB_SIZE;
+        }
+
+        av_assert2(s->chroma_h_shift == s->chroma_v_shift); // only one mv_scale
+
+        av_assert2((tab_index>=0 && tab_index<4) || b_w==32);
+        if(    (dx&3) || (dy&3)
+            || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h)
+            || (b_w&(b_w-1))
+            || b_w == 1
+            || b_h == 1
+            || !s->plane[plane_index].fast_mc )
+            mc_block(&s->plane[plane_index], dst, src, stride, b_w, b_h, dx, dy);
+        else if(b_w==32){
+            int y;
+            for(y=0; y<b_h; y+=16){
+                s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
+                s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
+            }
+        }else if(b_w==b_h)
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
+        else if(b_w==2*b_h){
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 3       + 3*stride,stride);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
+        }else{
+            av_assert2(2*b_w==b_h);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 3 + 3*stride           ,stride);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
+        }
+    }
+}
+
+#define mca(dx,dy,b_w)\
+static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h){\
+    av_assert2(h==b_w);\
+    mc_block(NULL, dst, src-(HTAPS_MAX/2-1)-(HTAPS_MAX/2-1)*stride, stride, b_w, b_w, dx, dy);\
+}
+
+mca( 0, 0,16)
+mca( 8, 0,16)
+mca( 0, 8,16)
+mca( 8, 8,16)
+mca( 0, 0,8)
+mca( 8, 0,8)
+mca( 0, 8,8)
+mca( 8, 8,8)
+
+av_cold int ff_snow_common_init(AVCodecContext *avctx){
+    SnowContext *s = avctx->priv_data;
+    int width, height;
+    int i, j;
+
+    s->avctx= avctx;
+    s->max_ref_frames=1; //just make sure it's not an invalid value in case of no initial keyframe
+    s->spatial_decomposition_count = 1;
+
+    ff_me_cmp_init(&s->mecc, avctx);
+    ff_hpeldsp_init(&s->hdsp, avctx->flags);
+    ff_videodsp_init(&s->vdsp, 8);
+    ff_dwt_init(&s->dwt);
+    ff_h264qpel_init(&s->h264qpel, 8);
+
+#define mcf(dx,dy)\
+    s->qdsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
+    s->qdsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
+        s->h264qpel.put_h264_qpel_pixels_tab[0][dy+dx/4];\
+    s->qdsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
+    s->qdsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
+        s->h264qpel.put_h264_qpel_pixels_tab[1][dy+dx/4];
+
+    mcf( 0, 0)
+    mcf( 4, 0)
+    mcf( 8, 0)
+    mcf(12, 0)
+    mcf( 0, 4)
+    mcf( 4, 4)
+    mcf( 8, 4)
+    mcf(12, 4)
+    mcf( 0, 8)
+    mcf( 4, 8)
+    mcf( 8, 8)
+    mcf(12, 8)
+    mcf( 0,12)
+    mcf( 4,12)
+    mcf( 8,12)
+    mcf(12,12)
+
+#define mcfh(dx,dy)\
+    s->hdsp.put_pixels_tab       [0][dy/4+dx/8]=\
+    s->hdsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 16;\
+    s->hdsp.put_pixels_tab       [1][dy/4+dx/8]=\
+    s->hdsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 8;
+
+    mcfh(0, 0)
+    mcfh(8, 0)
+    mcfh(0, 8)
+    mcfh(8, 8)
+
+    init_qexp();
+
+//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
+
+    width= s->avctx->width;
+    height= s->avctx->height;
+
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->spatial_idwt_buffer, width, height * sizeof(IDWTELEM), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->spatial_dwt_buffer,  width, height * sizeof(DWTELEM),  fail); //FIXME this does not belong here
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->temp_dwt_buffer,     width, sizeof(DWTELEM),  fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->temp_idwt_buffer,    width, sizeof(IDWTELEM), fail);
+    FF_ALLOC_ARRAY_OR_GOTO(avctx,  s->run_buffer,          ((width + 1) >> 1), ((height + 1) >> 1) * sizeof(*s->run_buffer), fail);
+
+    for(i=0; i<MAX_REF_FRAMES; i++) {
+        for(j=0; j<MAX_REF_FRAMES; j++)
+            ff_scale_mv_ref[i][j] = 256*(i+1)/(j+1);
+        s->last_picture[i] = av_frame_alloc();
+        if (!s->last_picture[i])
+            goto fail;
+    }
+
+    s->mconly_picture = av_frame_alloc();
+    s->current_picture = av_frame_alloc();
+    if (!s->mconly_picture || !s->current_picture)
+        goto fail;
+
+    return 0;
+fail:
+    return AVERROR(ENOMEM);
+}
+
+int ff_snow_common_init_after_header(AVCodecContext *avctx) {
+    SnowContext *s = avctx->priv_data;
+    int plane_index, level, orientation;
+    int ret, emu_buf_size;
+
+    if(!s->scratchbuf) {
+        if ((ret = ff_get_buffer(s->avctx, s->mconly_picture,
+                                 AV_GET_BUFFER_FLAG_REF)) < 0)
+            return ret;
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->scratchbuf, FFMAX(s->mconly_picture->linesize[0], 2*avctx->width+256), 7*MB_SIZE, fail);
+        emu_buf_size = FFMAX(s->mconly_picture->linesize[0], 2*avctx->width+256) * (2 * MB_SIZE + HTAPS_MAX - 1);
+        FF_ALLOC_OR_GOTO(avctx, s->emu_edge_buffer, emu_buf_size, fail);
+    }
+
+    if(s->mconly_picture->format != avctx->pix_fmt) {
+        av_log(avctx, AV_LOG_ERROR, "pixel format changed\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        int w= s->avctx->width;
+        int h= s->avctx->height;
+
+        if(plane_index){
+            w = AV_CEIL_RSHIFT(w, s->chroma_h_shift);
+            h = AV_CEIL_RSHIFT(h, s->chroma_v_shift);
+        }
+        s->plane[plane_index].width = w;
+        s->plane[plane_index].height= h;
+
+        for(level=s->spatial_decomposition_count-1; level>=0; level--){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+
+                b->buf= s->spatial_dwt_buffer;
+                b->level= level;
+                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
+                b->width = (w + !(orientation&1))>>1;
+                b->height= (h + !(orientation>1))>>1;
+
+                b->stride_line = 1 << (s->spatial_decomposition_count - level);
+                b->buf_x_offset = 0;
+                b->buf_y_offset = 0;
+
+                if(orientation&1){
+                    b->buf += (w+1)>>1;
+                    b->buf_x_offset = (w+1)>>1;
+                }
+                if(orientation>1){
+                    b->buf += b->stride>>1;
+                    b->buf_y_offset = b->stride_line >> 1;
+                }
+                b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer);
+
+                if(level)
+                    b->parent= &s->plane[plane_index].band[level-1][orientation];
+                //FIXME avoid this realloc
+                av_freep(&b->x_coeff);
+                b->x_coeff=av_mallocz_array(((b->width+1) * b->height+1), sizeof(x_and_coeff));
+                if (!b->x_coeff)
+                    goto fail;
+            }
+            w= (w+1)>>1;
+            h= (h+1)>>1;
+        }
+    }
+
+    return 0;
+fail:
+    return AVERROR(ENOMEM);
+}
+
+#define USE_HALFPEL_PLANE 0
+
+static int halfpel_interpol(SnowContext *s, uint8_t *halfpel[4][4], AVFrame *frame){
+    int p,x,y;
+
+    for(p=0; p < s->nb_planes; p++){
+        int is_chroma= !!p;
+        int w= is_chroma ? AV_CEIL_RSHIFT(s->avctx->width,  s->chroma_h_shift) : s->avctx->width;
+        int h= is_chroma ? AV_CEIL_RSHIFT(s->avctx->height, s->chroma_v_shift) : s->avctx->height;
+        int ls= frame->linesize[p];
+        uint8_t *src= frame->data[p];
+
+        halfpel[1][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        halfpel[2][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        halfpel[3][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        if (!halfpel[1][p] || !halfpel[2][p] || !halfpel[3][p]) {
+            av_freep(&halfpel[1][p]);
+            av_freep(&halfpel[2][p]);
+            av_freep(&halfpel[3][p]);
+            return AVERROR(ENOMEM);
+        }
+        halfpel[1][p] += EDGE_WIDTH * (1 + ls);
+        halfpel[2][p] += EDGE_WIDTH * (1 + ls);
+        halfpel[3][p] += EDGE_WIDTH * (1 + ls);
+
+        halfpel[0][p]= src;
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[1][p][i]= (20*(src[i] + src[i+1]) - 5*(src[i-1] + src[i+2]) + (src[i-2] + src[i+3]) + 16 )>>5;
+            }
+        }
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[2][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
+            }
+        }
+        src= halfpel[1][p];
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[3][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
+            }
+        }
+
+//FIXME border!
+    }
+    return 0;
+}
+
+void ff_snow_release_buffer(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+    int i;
+
+    if(s->last_picture[s->max_ref_frames-1]->data[0]){
+        av_frame_unref(s->last_picture[s->max_ref_frames-1]);
+        for(i=0; i<9; i++)
+            if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3]) {
+                av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture->linesize[i%3]));
+                s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] = NULL;
+            }
+    }
+}
+
+int ff_snow_frame_start(SnowContext *s){
+   AVFrame *tmp;
+   int i, ret;
+
+    ff_snow_release_buffer(s->avctx);
+
+    tmp= s->last_picture[s->max_ref_frames-1];
+    for(i=s->max_ref_frames-1; i>0; i--)
+        s->last_picture[i] = s->last_picture[i-1];
+    memmove(s->halfpel_plane+1, s->halfpel_plane, (s->max_ref_frames-1)*sizeof(void*)*4*4);
+    if(USE_HALFPEL_PLANE && s->current_picture->data[0]) {
+        if((ret = halfpel_interpol(s, s->halfpel_plane[0], s->current_picture)) < 0)
+            return ret;
+    }
+    s->last_picture[0] = s->current_picture;
+    s->current_picture = tmp;
+
+    if(s->keyframe){
+        s->ref_frames= 0;
+    }else{
+        int i;
+        for(i=0; i<s->max_ref_frames && s->last_picture[i]->data[0]; i++)
+            if(i && s->last_picture[i-1]->key_frame)
+                break;
+        s->ref_frames= i;
+        if(s->ref_frames==0){
+            av_log(s->avctx,AV_LOG_ERROR, "No reference frames\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    if ((ret = ff_snow_get_buffer(s, s->current_picture)) < 0)
+        return ret;
+
+    s->current_picture->key_frame= s->keyframe;
+
+    return 0;
+}
+
+av_cold void ff_snow_common_end(SnowContext *s)
+{
+    int plane_index, level, orientation, i;
+
+    av_freep(&s->spatial_dwt_buffer);
+    av_freep(&s->temp_dwt_buffer);
+    av_freep(&s->spatial_idwt_buffer);
+    av_freep(&s->temp_idwt_buffer);
+    av_freep(&s->run_buffer);
+
+    s->m.me.temp= NULL;
+    av_freep(&s->m.me.scratchpad);
+    av_freep(&s->m.me.map);
+    av_freep(&s->m.me.score_map);
+    av_freep(&s->m.sc.obmc_scratchpad);
+
+    av_freep(&s->block);
+    av_freep(&s->scratchbuf);
+    av_freep(&s->emu_edge_buffer);
+
+    for(i=0; i<MAX_REF_FRAMES; i++){
+        av_freep(&s->ref_mvs[i]);
+        av_freep(&s->ref_scores[i]);
+        if(s->last_picture[i] && s->last_picture[i]->data[0]) {
+            av_assert0(s->last_picture[i]->data[0] != s->current_picture->data[0]);
+        }
+        av_frame_free(&s->last_picture[i]);
+    }
+
+    for(plane_index=0; plane_index < MAX_PLANES; plane_index++){
+        for(level=MAX_DECOMPOSITIONS-1; level>=0; level--){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+
+                av_freep(&b->x_coeff);
+            }
+        }
+    }
+    av_frame_free(&s->mconly_picture);
+    av_frame_free(&s->current_picture);
+}
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
new file mode 100644
index 0000000..41a3bef
--- /dev/null
+++ b/libavcodec/snow.h
@@ -0,0 +1,710 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOW_H
+#define AVCODEC_SNOW_H
+
+#include "libavutil/motion_vector.h"
+
+#include "hpeldsp.h"
+#include "me_cmp.h"
+#include "qpeldsp.h"
+#include "snow_dwt.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h264qpel.h"
+
+#define FF_ME_ITER 3
+
+#define MID_STATE 128
+
+#define MAX_PLANES 4
+#define QSHIFT 5
+#define QROOT (1<<QSHIFT)
+#define LOSSLESS_QLOG -128
+#define FRAC_BITS 4
+#define MAX_REF_FRAMES 8
+
+#define LOG2_OBMC_MAX 8
+#define OBMC_MAX (1<<(LOG2_OBMC_MAX))
+typedef struct BlockNode{
+    int16_t mx;                 ///< Motion vector component X, see mv_scale
+    int16_t my;                 ///< Motion vector component Y, see mv_scale
+    uint8_t ref;                ///< Reference frame index
+    uint8_t color[3];           ///< Color for intra
+    uint8_t type;               ///< Bitfield of BLOCK_*
+//#define TYPE_SPLIT    1
+#define BLOCK_INTRA   1         ///< Intra block, inter otherwise
+#define BLOCK_OPT     2         ///< Block needs no checks in this round of iterative motion estiation
+//#define TYPE_NOCOLOR  4
+    uint8_t level; //FIXME merge into type?
+}BlockNode;
+
+static const BlockNode null_block= { //FIXME add border maybe
+    .color= {128,128,128},
+    .mx= 0,
+    .my= 0,
+    .ref= 0,
+    .type= 0,
+    .level= 0,
+};
+
+#define LOG2_MB_SIZE 4
+#define MB_SIZE (1<<LOG2_MB_SIZE)
+#define ENCODER_EXTRA_BITS 4
+#define HTAPS_MAX 8
+
+typedef struct x_and_coeff{
+    int16_t x;
+    uint16_t coeff;
+} x_and_coeff;
+
+typedef struct SubBand{
+    int level;
+    int stride;
+    int width;
+    int height;
+    int qlog;        ///< log(qscale)/log[2^(1/6)]
+    DWTELEM *buf;
+    IDWTELEM *ibuf;
+    int buf_x_offset;
+    int buf_y_offset;
+    int stride_line; ///< Stride measured in lines, not pixels.
+    x_and_coeff * x_coeff;
+    struct SubBand *parent;
+    uint8_t state[/*7*2*/ 7 + 512][32];
+}SubBand;
+
+typedef struct Plane{
+    int width;
+    int height;
+    SubBand band[MAX_DECOMPOSITIONS][4];
+
+    int htaps;
+    int8_t hcoeff[HTAPS_MAX/2];
+    int diag_mc;
+    int fast_mc;
+
+    int last_htaps;
+    int8_t last_hcoeff[HTAPS_MAX/2];
+    int last_diag_mc;
+}Plane;
+
+typedef struct SnowContext{
+    AVClass *class;
+    AVCodecContext *avctx;
+    RangeCoder c;
+    MECmpContext mecc;
+    HpelDSPContext hdsp;
+    QpelDSPContext qdsp;
+    VideoDSPContext vdsp;
+    H264QpelContext h264qpel;
+    MpegvideoEncDSPContext mpvencdsp;
+    SnowDWTContext dwt;
+    AVFrame *input_picture;              ///< new_picture with the internal linesizes
+    AVFrame *current_picture;
+    AVFrame *last_picture[MAX_REF_FRAMES];
+    uint8_t *halfpel_plane[MAX_REF_FRAMES][4][4];
+    AVFrame *mconly_picture;
+//     uint8_t q_context[16];
+    uint8_t header_state[32];
+    uint8_t block_state[128 + 32*128];
+    int keyframe;
+    int always_reset;
+    int version;
+    int spatial_decomposition_type;
+    int last_spatial_decomposition_type;
+    int temporal_decomposition_type;
+    int spatial_decomposition_count;
+    int last_spatial_decomposition_count;
+    int temporal_decomposition_count;
+    int max_ref_frames;
+    int ref_frames;
+    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
+    uint32_t *ref_scores[MAX_REF_FRAMES];
+    DWTELEM *spatial_dwt_buffer;
+    DWTELEM *temp_dwt_buffer;
+    IDWTELEM *spatial_idwt_buffer;
+    IDWTELEM *temp_idwt_buffer;
+    int *run_buffer;
+    int colorspace_type;
+    int chroma_h_shift;
+    int chroma_v_shift;
+    int spatial_scalability;
+    int qlog;
+    int last_qlog;
+    int lambda;
+    int lambda2;
+    int pass1_rc;
+    int mv_scale;
+    int last_mv_scale;
+    int qbias;
+    int last_qbias;
+#define QBIAS_SHIFT 3
+    int b_width;
+    int b_height;
+    int block_max_depth;
+    int last_block_max_depth;
+    int nb_planes;
+    Plane plane[MAX_PLANES];
+    BlockNode *block;
+#define ME_CACHE_SIZE 1024
+    unsigned me_cache[ME_CACHE_SIZE];
+    unsigned me_cache_generation;
+    slice_buffer sb;
+    int memc_only;
+    int no_bitstream;
+    int intra_penalty;
+    int motion_est;
+    int iterative_dia_size;
+    int scenechange_threshold;
+
+    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
+
+    uint8_t *scratchbuf;
+    uint8_t *emu_edge_buffer;
+
+    AVMotionVector *avmv;
+    int avmv_index;
+    uint64_t encoding_error[AV_NUM_DATA_POINTERS];
+
+    int pred;
+}SnowContext;
+
+/* Tables */
+extern const uint8_t * const ff_obmc_tab[4];
+extern uint8_t ff_qexp[QROOT];
+extern int ff_scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
+
+/* C bits used by mmx/sse2/altivec */
+
+static av_always_inline void snow_interleave_line_header(int * i, int width, IDWTELEM * low, IDWTELEM * high){
+    (*i) = (width) - 2;
+
+    if (width & 1){
+        low[(*i)+1] = low[((*i)+1)>>1];
+        (*i)--;
+    }
+}
+
+static av_always_inline void snow_interleave_line_footer(int * i, IDWTELEM * low, IDWTELEM * high){
+    for (; (*i)>=0; (*i)-=2){
+        low[(*i)+1] = high[(*i)>>1];
+        low[*i] = low[(*i)>>1];
+    }
+}
+
+static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, IDWTELEM * dst, IDWTELEM * src, IDWTELEM * ref, int width, int w, int lift_high, int mul, int add, int shift){
+    for(; i<w; i++){
+        dst[i] = src[i] - ((mul * (ref[i] + ref[i + 1]) + add) >> shift);
+    }
+
+    if((width^lift_high)&1){
+        dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
+    }
+}
+
+static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, IDWTELEM * dst, IDWTELEM * src, IDWTELEM * ref, int width, int w){
+        for(; i<w; i++){
+            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
+        }
+
+        if(width&1){
+            dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
+        }
+}
+
+/* common code */
+
+int ff_snow_common_init(AVCodecContext *avctx);
+int ff_snow_common_init_after_header(AVCodecContext *avctx);
+void ff_snow_common_end(SnowContext *s);
+void ff_snow_release_buffer(AVCodecContext *avctx);
+void ff_snow_reset_contexts(SnowContext *s);
+int ff_snow_alloc_blocks(SnowContext *s);
+int ff_snow_frame_start(SnowContext *s);
+void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t stride,
+                     int sx, int sy, int b_w, int b_h, const BlockNode *block,
+                     int plane_index, int w, int h);
+int ff_snow_get_buffer(SnowContext *s, AVFrame *frame);
+/* common inline functions */
+//XXX doublecheck all of them should stay inlined
+
+static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
+                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
+    if(s->ref_frames == 1){
+        *mx = mid_pred(left->mx, top->mx, tr->mx);
+        *my = mid_pred(left->my, top->my, tr->my);
+    }else{
+        const int *scale = ff_scale_mv_ref[ref];
+        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
+                       (top ->mx * scale[top ->ref] + 128) >>8,
+                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
+        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
+                       (top ->my * scale[top ->ref] + 128) >>8,
+                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
+    }
+}
+
+static av_always_inline int same_block(BlockNode *a, BlockNode *b){
+    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
+        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
+    }else{
+        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
+    }
+}
+
+//FIXME name cleanup (b_w, block_w, b_width stuff)
+//XXX should we really inline it?
+static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    BlockNode *lt= &s->block[b_x + b_y*b_stride];
+    BlockNode *rt= lt+1;
+    BlockNode *lb= lt+b_stride;
+    BlockNode *rb= lb+1;
+    uint8_t *block[4];
+    // When src_stride is large enough, it is possible to interleave the blocks.
+    // Otherwise the blocks are written sequentially in the tmp buffer.
+    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
+    uint8_t *tmp = s->scratchbuf;
+    uint8_t *ptmp;
+    int x,y;
+
+    if(b_x<0){
+        lt= rt;
+        lb= rb;
+    }else if(b_x + 1 >= b_width){
+        rt= lt;
+        rb= lb;
+    }
+    if(b_y<0){
+        lt= lb;
+        rt= rb;
+    }else if(b_y + 1 >= b_height){
+        lb= lt;
+        rb= rt;
+    }
+
+    if(src_x<0){ //FIXME merge with prev & always round internal width up to *16
+        obmc -= src_x;
+        b_w += src_x;
+        if(!sliced && !offset_dst)
+            dst -= src_x;
+        src_x=0;
+    }
+    if(src_x + b_w > w){
+        b_w = w - src_x;
+    }
+    if(src_y<0){
+        obmc -= src_y*obmc_stride;
+        b_h += src_y;
+        if(!sliced && !offset_dst)
+            dst -= src_y*dst_stride;
+        src_y=0;
+    }
+    if(src_y + b_h> h){
+        b_h = h - src_y;
+    }
+
+    if(b_w<=0 || b_h<=0) return;
+
+    if(!sliced && offset_dst)
+        dst += src_x + src_y*dst_stride;
+    dst8+= src_x + src_y*src_stride;
+//    src += src_x + src_y*src_stride;
+
+    ptmp= tmp + 3*tmp_step;
+    block[0]= ptmp;
+    ptmp+=tmp_step;
+    ff_snow_pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
+
+    if(same_block(lt, rt)){
+        block[1]= block[0];
+    }else{
+        block[1]= ptmp;
+        ptmp+=tmp_step;
+        ff_snow_pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+    }
+
+    if(same_block(lt, lb)){
+        block[2]= block[0];
+    }else if(same_block(rt, lb)){
+        block[2]= block[1];
+    }else{
+        block[2]= ptmp;
+        ptmp+=tmp_step;
+        ff_snow_pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+    }
+
+    if(same_block(lt, rb) ){
+        block[3]= block[0];
+    }else if(same_block(rt, rb)){
+        block[3]= block[1];
+    }else if(same_block(lb, rb)){
+        block[3]= block[2];
+    }else{
+        block[3]= ptmp;
+        ff_snow_pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+    }
+    if(sliced){
+        s->dwt.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    }else{
+        for(y=0; y<b_h; y++){
+            //FIXME ugly misuse of obmc_stride
+            const uint8_t *obmc1= obmc + y*obmc_stride;
+            const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+            const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+            const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+            for(x=0; x<b_w; x++){
+                int v=   obmc1[x] * block[3][x + y*src_stride]
+                        +obmc2[x] * block[2][x + y*src_stride]
+                        +obmc3[x] * block[1][x + y*src_stride]
+                        +obmc4[x] * block[0][x + y*src_stride];
+
+                v <<= 8 - LOG2_OBMC_MAX;
+                if(FRAC_BITS != 8){
+                    v >>= 8 - FRAC_BITS;
+                }
+                if(add){
+                    v += dst[x + y*dst_stride];
+                    v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*src_stride] = v;
+                }else{
+                    dst[x + y*dst_stride] -= v;
+                }
+            }
+        }
+    }
+}
+
+static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst8= s->current_picture->data[plane_index];
+    int w= p->width;
+    int h= p->height;
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); // obmc params assume squares
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+
+    for(mb_x=0; mb_x<=mb_w; mb_x++){
+        add_yblock(s, 0, NULL, buf, dst8, obmc,
+                   block_w*mb_x - block_w/2,
+                   block_h*mb_y - block_h/2,
+                   block_w, block_h,
+                   w, h,
+                   w, ref_stride, obmc_stride,
+                   mb_x - 1, mb_y - 1,
+                   add, 1, plane_index);
+    }
+}
+
+static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){
+    const int mb_h= s->b_height << s->block_max_depth;
+    int mb_y;
+    for(mb_y=0; mb_y<=mb_h; mb_y++)
+        predict_slice(s, buf, plane_index, add, mb_y);
+}
+
+static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<rem_depth;
+    const int block_h= 1<<rem_depth; //FIXME "w!=h"
+    BlockNode block;
+    int i,j;
+
+    block.color[0]= l;
+    block.color[1]= cb;
+    block.color[2]= cr;
+    block.mx= mx;
+    block.my= my;
+    block.ref= ref;
+    block.type= type;
+    block.level= level;
+
+    for(j=0; j<block_h; j++){
+        for(i=0; i<block_w; i++){
+            s->block[index + i + j*w]= block;
+        }
+    }
+}
+
+static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
+    SnowContext *s = c->avctx->priv_data;
+    const int offset[3]= {
+          y*c->  stride + x,
+        ((y*c->uvstride + x)>>s->chroma_h_shift),
+        ((y*c->uvstride + x)>>s->chroma_h_shift),
+    };
+    int i;
+    for(i=0; i<3; i++){
+        c->src[0][i]= src [i];
+        c->ref[0][i]= ref [i] + offset[i];
+    }
+    av_assert2(!ref_index);
+}
+
+
+/* bitstream functions */
+
+extern const int8_t ff_quant3bA[256];
+
+#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
+
+static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
+    int i;
+
+    if(v){
+        const int a= FFABS(v);
+        const int e= av_log2(a);
+        const int el= FFMIN(e, 10);
+        put_rac(c, state+0, 0);
+
+        for(i=0; i<el; i++){
+            put_rac(c, state+1+i, 1);  //1..10
+        }
+        for(; i<e; i++){
+            put_rac(c, state+1+9, 1);  //1..10
+        }
+        put_rac(c, state+1+FFMIN(i,9), 0);
+
+        for(i=e-1; i>=el; i--){
+            put_rac(c, state+22+9, (a>>i)&1); //22..31
+        }
+        for(; i>=0; i--){
+            put_rac(c, state+22+i, (a>>i)&1); //22..31
+        }
+
+        if(is_signed)
+            put_rac(c, state+11 + el, v < 0); //11..21
+    }else{
+        put_rac(c, state+0, 1);
+    }
+}
+
+static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    if(get_rac(c, state+0))
+        return 0;
+    else{
+        int i, e;
+        unsigned a;
+        e= 0;
+        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
+            e++;
+            if (e > 31)
+                return AVERROR_INVALIDDATA;
+        }
+
+        a= 1;
+        for(i=e-1; i>=0; i--){
+            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+        }
+
+        e= -(is_signed && get_rac(c, state+11 + FFMIN(e,10))); //11..21
+        return (a^e)-e;
+    }
+}
+
+static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1;
+
+    av_assert2(v>=0);
+    av_assert2(log2>=-4);
+
+    while(v >= r){
+        put_rac(c, state+4+log2, 1);
+        v -= r;
+        log2++;
+        if(log2>0) r+=r;
+    }
+    put_rac(c, state+4+log2, 0);
+
+    for(i=log2-1; i>=0; i--){
+        put_rac(c, state+31-i, (v>>i)&1);
+    }
+}
+
+static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1;
+    int v=0;
+
+    av_assert2(log2>=-4);
+
+    while(log2<28 && get_rac(c, state+4+log2)){
+        v+= r;
+        log2++;
+        if(log2>0) r+=r;
+    }
+
+    for(i=log2-1; i>=0; i--){
+        v+= get_rac(c, state+31-i)<<i;
+    }
+
+    return v;
+}
+
+static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    int run, runs;
+    x_and_coeff *xc= b->x_coeff;
+    x_and_coeff *prev_xc= NULL;
+    x_and_coeff *prev2_xc= xc;
+    x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
+    x_and_coeff *prev_parent_xc= parent_xc;
+
+    runs= get_symbol2(&s->c, b->state[30], 0);
+    if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+    else           run= INT_MAX;
+
+    for(y=0; y<h; y++){
+        int v=0;
+        int lt=0, t=0, rt=0;
+
+        if(y && prev_xc->x == 0){
+            rt= prev_xc->coeff;
+        }
+        for(x=0; x<w; x++){
+            int p=0;
+            const int l= v;
+
+            lt= t; t= rt;
+
+            if(y){
+                if(prev_xc->x <= x)
+                    prev_xc++;
+                if(prev_xc->x == x + 1)
+                    rt= prev_xc->coeff;
+                else
+                    rt=0;
+            }
+            if(parent_xc){
+                if(x>>1 > parent_xc->x){
+                    parent_xc++;
+                }
+                if(x>>1 == parent_xc->x){
+                    p= parent_xc->coeff;
+                }
+            }
+            if(/*ll|*/l|lt|t|rt|p){
+                int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
+
+                v=get_rac(&s->c, &b->state[0][context]);
+                if(v){
+                    v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
+                    v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + ff_quant3bA[l&0xFF] + 3*ff_quant3bA[t&0xFF]]);
+                    if ((uint16_t)v != v) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Coefficient damaged\n");
+                        v = 1;
+                    }
+                    xc->x=x;
+                    (xc++)->coeff= v;
+                }
+            }else{
+                if(!run){
+                    if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+                    else           run= INT_MAX;
+                    v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
+                    v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
+                    if ((uint16_t)v != v) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Coefficient damaged\n");
+                        v = 1;
+                    }
+
+                    xc->x=x;
+                    (xc++)->coeff= v;
+                }else{
+                    int max_run;
+                    run--;
+                    v=0;
+                    av_assert2(run >= 0);
+                    if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
+                    else  max_run= FFMIN(run, w-x-1);
+                    if(parent_xc)
+                        max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
+                    av_assert2(max_run >= 0 && max_run <= run);
+
+                    x+= max_run;
+                    run-= max_run;
+                }
+            }
+        }
+        (xc++)->x= w+1; //end marker
+        prev_xc= prev2_xc;
+        prev2_xc= xc;
+
+        if(parent_xc){
+            if(y&1){
+                while(parent_xc->x != parent->width+1)
+                    parent_xc++;
+                parent_xc++;
+                prev_parent_xc= parent_xc;
+            }else{
+                parent_xc= prev_parent_xc;
+            }
+        }
+    }
+
+    (xc++)->x= w+1; //end marker
+}
+
+#endif /* AVCODEC_SNOW_H */
diff --git a/libavcodec/snow_dwt.c b/libavcodec/snow_dwt.c
new file mode 100644
index 0000000..25681e7
--- /dev/null
+++ b/libavcodec/snow_dwt.c
@@ -0,0 +1,860 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "me_cmp.h"
+#include "snow_dwt.h"
+
+int ff_slice_buffer_init(slice_buffer *buf, int line_count,
+                         int max_allocated_lines, int line_width,
+                         IDWTELEM *base_buffer)
+{
+    int i;
+
+    buf->base_buffer = base_buffer;
+    buf->line_count  = line_count;
+    buf->line_width  = line_width;
+    buf->data_count  = max_allocated_lines;
+    buf->line        = av_mallocz_array(line_count, sizeof(IDWTELEM *));
+    if (!buf->line)
+        return AVERROR(ENOMEM);
+    buf->data_stack  = av_malloc_array(max_allocated_lines, sizeof(IDWTELEM *));
+    if (!buf->data_stack) {
+        av_freep(&buf->line);
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < max_allocated_lines; i++) {
+        buf->data_stack[i] = av_malloc_array(line_width, sizeof(IDWTELEM));
+        if (!buf->data_stack[i]) {
+            for (i--; i >=0; i--)
+                av_freep(&buf->data_stack[i]);
+            av_freep(&buf->data_stack);
+            av_freep(&buf->line);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    buf->data_stack_top = max_allocated_lines - 1;
+    return 0;
+}
+
+IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line)
+{
+    IDWTELEM *buffer;
+
+    av_assert0(buf->data_stack_top >= 0);
+//  av_assert1(!buf->line[line]);
+    if (buf->line[line])
+        return buf->line[line];
+
+    buffer = buf->data_stack[buf->data_stack_top];
+    buf->data_stack_top--;
+    buf->line[line] = buffer;
+
+    return buffer;
+}
+
+void ff_slice_buffer_release(slice_buffer *buf, int line)
+{
+    IDWTELEM *buffer;
+
+    av_assert1(line >= 0 && line < buf->line_count);
+    av_assert1(buf->line[line]);
+
+    buffer = buf->line[line];
+    buf->data_stack_top++;
+    buf->data_stack[buf->data_stack_top] = buffer;
+    buf->line[line]                      = NULL;
+}
+
+void ff_slice_buffer_flush(slice_buffer *buf)
+{
+    int i;
+
+    if (!buf->line)
+        return;
+
+    for (i = 0; i < buf->line_count; i++)
+        if (buf->line[i])
+            ff_slice_buffer_release(buf, i);
+}
+
+void ff_slice_buffer_destroy(slice_buffer *buf)
+{
+    int i;
+    ff_slice_buffer_flush(buf);
+
+    if (buf->data_stack)
+        for (i = buf->data_count - 1; i >= 0; i--)
+            av_freep(&buf->data_stack[i]);
+    av_freep(&buf->data_stack);
+    av_freep(&buf->line);
+}
+
+static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
+                                  int dst_step, int src_step, int ref_step,
+                                  int width, int mul, int add, int shift,
+                                  int highpass, int inverse)
+{
+    const int mirror_left  = !highpass;
+    const int mirror_right = (width & 1) ^ highpass;
+    const int w            = (width >> 1) - 1 + (highpass & width);
+    int i;
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? -(ref) : +(ref)))
+    if (mirror_left) {
+        dst[0] = LIFT(src[0], ((mul * 2 * ref[0] + add) >> shift), inverse);
+        dst   += dst_step;
+        src   += src_step;
+    }
+
+    for (i = 0; i < w; i++)
+        dst[i * dst_step] = LIFT(src[i * src_step],
+                                 ((mul * (ref[i * ref_step] +
+                                          ref[(i + 1) * ref_step]) +
+                                   add) >> shift),
+                                 inverse);
+
+    if (mirror_right)
+        dst[w * dst_step] = LIFT(src[w * src_step],
+                                 ((mul * 2 * ref[w * ref_step] + add) >> shift),
+                                 inverse);
+}
+
+static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
+                                   int dst_step, int src_step, int ref_step,
+                                   int width, int mul, int add, int shift,
+                                   int highpass, int inverse)
+{
+    const int mirror_left  = !highpass;
+    const int mirror_right = (width & 1) ^ highpass;
+    const int w            = (width >> 1) - 1 + (highpass & width);
+    int i;
+
+    av_assert1(shift == 4);
+#define LIFTS(src, ref, inv)                                            \
+    ((inv) ? (src) + (((ref) + 4 * (src)) >> shift)                     \
+           : -((-16 * (src) + (ref) + add /                             \
+                4 + 1 + (5 << 25)) / (5 * 4) - (1 << 23)))
+    if (mirror_left) {
+        dst[0] = LIFTS(src[0], mul * 2 * ref[0] + add, inverse);
+        dst   += dst_step;
+        src   += src_step;
+    }
+
+    for (i = 0; i < w; i++)
+        dst[i * dst_step] = LIFTS(src[i * src_step],
+                                  mul * (ref[i * ref_step] +
+                                         ref[(i + 1) * ref_step]) + add,
+                                  inverse);
+
+    if (mirror_right)
+        dst[w * dst_step] = LIFTS(src[w * src_step],
+                                  mul * 2 * ref[w * ref_step] + add,
+                                  inverse);
+}
+
+static void horizontal_decompose53i(DWTELEM *b, DWTELEM *temp, int width)
+{
+    const int width2 = width >> 1;
+    int x;
+    const int w2 = (width + 1) >> 1;
+
+    for (x = 0; x < width2; x++) {
+        temp[x]      = b[2 * x];
+        temp[x + w2] = b[2 * x + 1];
+    }
+    if (width & 1)
+        temp[x] = b[2 * x];
+    lift(b + w2, temp + w2, temp,   1, 1, 1, width, -1, 0, 1, 1, 0);
+    lift(b,      temp,      b + w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
+}
+
+static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i]) >> 1;
+}
+
+static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (b0[i] + b2[i] + 2) >> 2;
+}
+
+static void spatial_decompose53i(DWTELEM *buffer, DWTELEM *temp,
+                                 int width, int height, int stride)
+{
+    int y;
+    DWTELEM *b0 = buffer + avpriv_mirror(-2 - 1, height - 1) * stride;
+    DWTELEM *b1 = buffer + avpriv_mirror(-2,     height - 1) * stride;
+
+    for (y = -2; y < height; y += 2) {
+        DWTELEM *b2 = buffer + avpriv_mirror(y + 1, height - 1) * stride;
+        DWTELEM *b3 = buffer + avpriv_mirror(y + 2, height - 1) * stride;
+
+        if (y + 1 < (unsigned)height)
+            horizontal_decompose53i(b2, temp, width);
+        if (y + 2 < (unsigned)height)
+            horizontal_decompose53i(b3, temp, width);
+
+        if (y + 1 < (unsigned)height)
+            vertical_decompose53iH0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_decompose53iL0(b0, b1, b2, width);
+
+        b0 = b2;
+        b1 = b3;
+    }
+}
+
+static void horizontal_decompose97i(DWTELEM *b, DWTELEM *temp, int width)
+{
+    const int w2 = (width + 1) >> 1;
+
+    lift(temp + w2, b + 1, b,         1, 2, 2, width, W_AM, W_AO, W_AS, 1, 1);
+    liftS(temp,     b,     temp + w2, 1, 2, 1, width, W_BM, W_BO, W_BS, 0, 0);
+    lift(b + w2, temp + w2, temp,     1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0);
+    lift(b,      temp,      b + w2,   1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0);
+}
+
+static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+}
+
+static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_CM * (b0[i] + b2[i]) + W_CO) >> W_CS;
+}
+
+static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] = (16 * 4 * b1[i] - 4 * (b0[i] + b2[i]) + W_BO * 5 + (5 << 27)) /
+                (5 * 16) - (1 << 23);
+}
+
+static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_DM * (b0[i] + b2[i]) + W_DO) >> W_DS;
+}
+
+static void spatial_decompose97i(DWTELEM *buffer, DWTELEM *temp,
+                                 int width, int height, int stride)
+{
+    int y;
+    DWTELEM *b0 = buffer + avpriv_mirror(-4 - 1, height - 1) * stride;
+    DWTELEM *b1 = buffer + avpriv_mirror(-4,     height - 1) * stride;
+    DWTELEM *b2 = buffer + avpriv_mirror(-4 + 1, height - 1) * stride;
+    DWTELEM *b3 = buffer + avpriv_mirror(-4 + 2, height - 1) * stride;
+
+    for (y = -4; y < height; y += 2) {
+        DWTELEM *b4 = buffer + avpriv_mirror(y + 3, height - 1) * stride;
+        DWTELEM *b5 = buffer + avpriv_mirror(y + 4, height - 1) * stride;
+
+        if (y + 3 < (unsigned)height)
+            horizontal_decompose97i(b4, temp, width);
+        if (y + 4 < (unsigned)height)
+            horizontal_decompose97i(b5, temp, width);
+
+        if (y + 3 < (unsigned)height)
+            vertical_decompose97iH0(b3, b4, b5, width);
+        if (y + 2 < (unsigned)height)
+            vertical_decompose97iL0(b2, b3, b4, width);
+        if (y + 1 < (unsigned)height)
+            vertical_decompose97iH1(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_decompose97iL1(b0, b1, b2, width);
+
+        b0 = b2;
+        b1 = b3;
+        b2 = b4;
+        b3 = b5;
+    }
+}
+
+void ff_spatial_dwt(DWTELEM *buffer, DWTELEM *temp, int width, int height,
+                    int stride, int type, int decomposition_count)
+{
+    int level;
+
+    for (level = 0; level < decomposition_count; level++) {
+        switch (type) {
+        case DWT_97:
+            spatial_decompose97i(buffer, temp,
+                                 width >> level, height >> level,
+                                 stride << level);
+            break;
+        case DWT_53:
+            spatial_decompose53i(buffer, temp,
+                                 width >> level, height >> level,
+                                 stride << level);
+            break;
+        }
+    }
+}
+
+static void horizontal_compose53i(IDWTELEM *b, IDWTELEM *temp, int width)
+{
+    const int width2 = width >> 1;
+    const int w2     = (width + 1) >> 1;
+    int x;
+
+    for (x = 0; x < width2; x++) {
+        temp[2 * x]     = b[x];
+        temp[2 * x + 1] = b[x + w2];
+    }
+    if (width & 1)
+        temp[2 * x] = b[x];
+
+    b[0] = temp[0] - ((temp[1] + 1) >> 1);
+    for (x = 2; x < width - 1; x += 2) {
+        b[x]     = temp[x]     - ((temp[x - 1] + temp[x + 1] + 2) >> 2);
+        b[x - 1] = temp[x - 1] + ((b[x - 2]    + b[x]        + 1) >> 1);
+    }
+    if (width & 1) {
+        b[x]     = temp[x]     - ((temp[x - 1]     + 1) >> 1);
+        b[x - 1] = temp[x - 1] + ((b[x - 2] + b[x] + 1) >> 1);
+    } else
+        b[x - 1] = temp[x - 1] + b[x - 2];
+}
+
+static void vertical_compose53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (b0[i] + b2[i]) >> 1;
+}
+
+static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i] + 2) >> 2;
+}
+
+static void spatial_compose53i_buffered_init(DWTCompose *cs, slice_buffer *sb,
+                                             int height, int stride_line)
+{
+    cs->b0 = slice_buffer_get_line(sb,
+                                   avpriv_mirror(-1 - 1, height - 1) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, avpriv_mirror(-1, height - 1) * stride_line);
+    cs->y  = -1;
+}
+
+static void spatial_compose53i_init(DWTCompose *cs, IDWTELEM *buffer,
+                                    int height, int stride)
+{
+    cs->b0 = buffer + avpriv_mirror(-1 - 1, height - 1) * stride;
+    cs->b1 = buffer + avpriv_mirror(-1,     height - 1) * stride;
+    cs->y  = -1;
+}
+
+static void spatial_compose53i_dy_buffered(DWTCompose *cs, slice_buffer *sb,
+                                           IDWTELEM *temp,
+                                           int width, int height,
+                                           int stride_line)
+{
+    int y = cs->y;
+
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 1, height - 1) *
+                                         stride_line);
+    IDWTELEM *b3 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 2, height - 1) *
+                                         stride_line);
+
+    if (y + 1 < (unsigned)height && y < (unsigned)height) {
+        int x;
+
+        for (x = 0; x < width; x++) {
+            b2[x] -= (b1[x] + b3[x] + 2) >> 2;
+            b1[x] += (b0[x] + b2[x])     >> 1;
+        }
+    } else {
+        if (y + 1 < (unsigned)height)
+            vertical_compose53iL0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_compose53iH0(b0, b1, b2, width);
+    }
+
+    if (y - 1 < (unsigned)height)
+        horizontal_compose53i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        horizontal_compose53i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->y  += 2;
+}
+
+static void spatial_compose53i_dy(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride)
+{
+    int y        = cs->y;
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = buffer + avpriv_mirror(y + 1, height - 1) * stride;
+    IDWTELEM *b3 = buffer + avpriv_mirror(y + 2, height - 1) * stride;
+
+    if (y + 1 < (unsigned)height)
+        vertical_compose53iL0(b1, b2, b3, width);
+    if (y + 0 < (unsigned)height)
+        vertical_compose53iH0(b0, b1, b2, width);
+
+    if (y - 1 < (unsigned)height)
+        horizontal_compose53i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        horizontal_compose53i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->y  += 2;
+}
+
+void ff_snow_horizontal_compose97i(IDWTELEM *b, IDWTELEM *temp, int width)
+{
+    const int w2 = (width + 1) >> 1;
+    int x;
+
+    temp[0] = b[0] - ((3 * b[w2] + 2) >> 2);
+    for (x = 1; x < (width >> 1); x++) {
+        temp[2 * x]     = b[x] - ((3 * (b[x + w2 - 1] + b[x + w2]) + 4) >> 3);
+        temp[2 * x - 1] = b[x + w2 - 1] - temp[2 * x - 2] - temp[2 * x];
+    }
+    if (width & 1) {
+        temp[2 * x]     = b[x] - ((3 * b[x + w2 - 1] + 2) >> 2);
+        temp[2 * x - 1] = b[x + w2 - 1] - temp[2 * x - 2] - temp[2 * x];
+    } else
+        temp[2 * x - 1] = b[x + w2 - 1] - 2 * temp[2 * x - 2];
+
+    b[0] = temp[0] + ((2 * temp[0] + temp[1] + 4) >> 3);
+    for (x = 2; x < width - 1; x += 2) {
+        b[x]     = temp[x] + ((4 * temp[x] + temp[x - 1] + temp[x + 1] + 8) >> 4);
+        b[x - 1] = temp[x - 1] + ((3 * (b[x - 2] + b[x])) >> 1);
+    }
+    if (width & 1) {
+        b[x]     = temp[x] + ((2 * temp[x] + temp[x - 1] + 4) >> 3);
+        b[x - 1] = temp[x - 1] + ((3 * (b[x - 2] + b[x])) >> 1);
+    } else
+        b[x - 1] = temp[x - 1] + 3 * b[x - 2];
+}
+
+static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+}
+
+static void vertical_compose97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_CM * (b0[i] + b2[i]) + W_CO) >> W_CS;
+}
+
+static void vertical_compose97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_BM * (b0[i] + b2[i]) + 4 * b1[i] + W_BO) >> W_BS;
+}
+
+static void vertical_compose97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_DM * (b0[i] + b2[i]) + W_DO) >> W_DS;
+}
+
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                 IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                 int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++) {
+        b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
+        b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
+        b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
+        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+    }
+}
+
+static void spatial_compose97i_buffered_init(DWTCompose *cs, slice_buffer *sb,
+                                             int height, int stride_line)
+{
+    cs->b0 = slice_buffer_get_line(sb, avpriv_mirror(-3 - 1, height - 1) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, avpriv_mirror(-3,     height - 1) * stride_line);
+    cs->b2 = slice_buffer_get_line(sb, avpriv_mirror(-3 + 1, height - 1) * stride_line);
+    cs->b3 = slice_buffer_get_line(sb, avpriv_mirror(-3 + 2, height - 1) * stride_line);
+    cs->y  = -3;
+}
+
+static void spatial_compose97i_init(DWTCompose *cs, IDWTELEM *buffer, int height,
+                                    int stride)
+{
+    cs->b0 = buffer + avpriv_mirror(-3 - 1, height - 1) * stride;
+    cs->b1 = buffer + avpriv_mirror(-3,     height - 1) * stride;
+    cs->b2 = buffer + avpriv_mirror(-3 + 1, height - 1) * stride;
+    cs->b3 = buffer + avpriv_mirror(-3 + 2, height - 1) * stride;
+    cs->y  = -3;
+}
+
+static void spatial_compose97i_dy_buffered(SnowDWTContext *dsp, DWTCompose *cs,
+                                           slice_buffer * sb, IDWTELEM *temp,
+                                           int width, int height,
+                                           int stride_line)
+{
+    int y = cs->y;
+
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = cs->b2;
+    IDWTELEM *b3 = cs->b3;
+    IDWTELEM *b4 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 3, height - 1) *
+                                         stride_line);
+    IDWTELEM *b5 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 4, height - 1) *
+                                         stride_line);
+
+    if (y > 0 && y + 4 < height) {
+        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+    } else {
+        if (y + 3 < (unsigned)height)
+            vertical_compose97iL1(b3, b4, b5, width);
+        if (y + 2 < (unsigned)height)
+            vertical_compose97iH1(b2, b3, b4, width);
+        if (y + 1 < (unsigned)height)
+            vertical_compose97iL0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_compose97iH0(b0, b1, b2, width);
+    }
+
+    if (y - 1 < (unsigned)height)
+        dsp->horizontal_compose97i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        dsp->horizontal_compose97i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->b2  = b4;
+    cs->b3  = b5;
+    cs->y  += 2;
+}
+
+static void spatial_compose97i_dy(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride)
+{
+    int y        = cs->y;
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = cs->b2;
+    IDWTELEM *b3 = cs->b3;
+    IDWTELEM *b4 = buffer + avpriv_mirror(y + 3, height - 1) * stride;
+    IDWTELEM *b5 = buffer + avpriv_mirror(y + 4, height - 1) * stride;
+
+    if (y + 3 < (unsigned)height)
+        vertical_compose97iL1(b3, b4, b5, width);
+    if (y + 2 < (unsigned)height)
+        vertical_compose97iH1(b2, b3, b4, width);
+    if (y + 1 < (unsigned)height)
+        vertical_compose97iL0(b1, b2, b3, width);
+    if (y + 0 < (unsigned)height)
+        vertical_compose97iH0(b0, b1, b2, width);
+
+    if (y - 1 < (unsigned)height)
+        ff_snow_horizontal_compose97i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        ff_snow_horizontal_compose97i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->b2  = b4;
+    cs->b3  = b5;
+    cs->y  += 2;
+}
+
+void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
+                                   int height, int stride_line, int type,
+                                   int decomposition_count)
+{
+    int level;
+    for (level = decomposition_count - 1; level >= 0; level--) {
+        switch (type) {
+        case DWT_97:
+            spatial_compose97i_buffered_init(cs + level, sb, height >> level,
+                                             stride_line << level);
+            break;
+        case DWT_53:
+            spatial_compose53i_buffered_init(cs + level, sb, height >> level,
+                                             stride_line << level);
+            break;
+        }
+    }
+}
+
+void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
+                                    slice_buffer *slice_buf, IDWTELEM *temp,
+                                    int width, int height, int stride_line,
+                                    int type, int decomposition_count, int y)
+{
+    const int support = type == 1 ? 3 : 5;
+    int level;
+    if (type == 2)
+        return;
+
+    for (level = decomposition_count - 1; level >= 0; level--)
+        while (cs[level].y <= FFMIN((y >> level) + support, height >> level)) {
+            switch (type) {
+            case DWT_97:
+                spatial_compose97i_dy_buffered(dsp, cs + level, slice_buf, temp,
+                                               width >> level,
+                                               height >> level,
+                                               stride_line << level);
+                break;
+            case DWT_53:
+                spatial_compose53i_dy_buffered(cs + level, slice_buf, temp,
+                                               width >> level,
+                                               height >> level,
+                                               stride_line << level);
+                break;
+            }
+        }
+}
+
+static void spatial_idwt_init(DWTCompose *cs, IDWTELEM *buffer, int width,
+                                 int height, int stride, int type,
+                                 int decomposition_count)
+{
+    int level;
+    for (level = decomposition_count - 1; level >= 0; level--) {
+        switch (type) {
+        case DWT_97:
+            spatial_compose97i_init(cs + level, buffer, height >> level,
+                                    stride << level);
+            break;
+        case DWT_53:
+            spatial_compose53i_init(cs + level, buffer, height >> level,
+                                    stride << level);
+            break;
+        }
+    }
+}
+
+static void spatial_idwt_slice(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride, int type,
+                                  int decomposition_count, int y)
+{
+    const int support = type == 1 ? 3 : 5;
+    int level;
+    if (type == 2)
+        return;
+
+    for (level = decomposition_count - 1; level >= 0; level--)
+        while (cs[level].y <= FFMIN((y >> level) + support, height >> level)) {
+            switch (type) {
+            case DWT_97:
+                spatial_compose97i_dy(cs + level, buffer, temp, width >> level,
+                                      height >> level, stride << level);
+                break;
+            case DWT_53:
+                spatial_compose53i_dy(cs + level, buffer, temp, width >> level,
+                                      height >> level, stride << level);
+                break;
+            }
+        }
+}
+
+void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
+                     int stride, int type, int decomposition_count)
+{
+    DWTCompose cs[MAX_DECOMPOSITIONS];
+    int y;
+    spatial_idwt_init(cs, buffer, width, height, stride, type,
+                         decomposition_count);
+    for (y = 0; y < height; y += 4)
+        spatial_idwt_slice(cs, buffer, temp, width, height, stride, type,
+                              decomposition_count, y);
+}
+
+static inline int w_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size,
+                      int w, int h, int type)
+{
+    int s, i, j;
+    const int dec_count = w == 8 ? 3 : 4;
+    int tmp[32 * 32], tmp2[32];
+    int level, ori;
+    static const int scale[2][2][4][4] = {
+        {
+            { // 9/7 8x8 dec=3
+                { 268, 239, 239, 213 },
+                { 0,   224, 224, 152 },
+                { 0,   135, 135, 110 },
+            },
+            { // 9/7 16x16 or 32x32 dec=4
+                { 344, 310, 310, 280 },
+                { 0,   320, 320, 228 },
+                { 0,   175, 175, 136 },
+                { 0,   129, 129, 102 },
+            }
+        },
+        {
+            { // 5/3 8x8 dec=3
+                { 275, 245, 245, 218 },
+                { 0,   230, 230, 156 },
+                { 0,   138, 138, 113 },
+            },
+            { // 5/3 16x16 or 32x32 dec=4
+                { 352, 317, 317, 286 },
+                { 0,   328, 328, 233 },
+                { 0,   180, 180, 140 },
+                { 0,   132, 132, 105 },
+            }
+        }
+    };
+
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j += 4) {
+            tmp[32 * i + j + 0] = (pix1[j + 0] - pix2[j + 0]) << 4;
+            tmp[32 * i + j + 1] = (pix1[j + 1] - pix2[j + 1]) << 4;
+            tmp[32 * i + j + 2] = (pix1[j + 2] - pix2[j + 2]) << 4;
+            tmp[32 * i + j + 3] = (pix1[j + 3] - pix2[j + 3]) << 4;
+        }
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    ff_spatial_dwt(tmp, tmp2, w, h, 32, type, dec_count);
+
+    s = 0;
+    av_assert1(w == h);
+    for (level = 0; level < dec_count; level++)
+        for (ori = level ? 1 : 0; ori < 4; ori++) {
+            int size   = w >> (dec_count - level);
+            int sx     = (ori & 1) ? size : 0;
+            int stride = 32 << (dec_count - level);
+            int sy     = (ori & 2) ? stride >> 1 : 0;
+
+            for (i = 0; i < size; i++)
+                for (j = 0; j < size; j++) {
+                    int v = tmp[sx + sy + i * stride + j] *
+                            scale[type][dec_count - 3][level][ori];
+                    s += FFABS(v);
+                }
+        }
+    av_assert1(s >= 0);
+    return s >> 9;
+}
+
+static int w53_8_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 8, h, 1);
+}
+
+static int w97_8_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 8, h, 0);
+}
+
+static int w53_16_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 16, h, 1);
+}
+
+static int w97_16_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 16, h, 0);
+}
+
+int ff_w53_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 32, h, 1);
+}
+
+int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 32, h, 0);
+}
+
+av_cold void ff_dsputil_init_dwt(MECmpContext *c)
+{
+    c->w53[0] = w53_16_c;
+    c->w53[1] = w53_8_c;
+    c->w97[0] = w97_16_c;
+    c->w97[1] = w97_8_c;
+}
+
+av_cold void ff_dwt_init(SnowDWTContext *c)
+{
+    c->vertical_compose97i   = ff_snow_vertical_compose97i;
+    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
+    c->inner_add_yblock      = ff_snow_inner_add_yblock;
+
+    if (HAVE_MMX)
+        ff_dwt_init_x86(c);
+}
+
+
diff --git a/libavcodec/snow_dwt.h b/libavcodec/snow_dwt.h
new file mode 100644
index 0000000..ee699de
--- /dev/null
+++ b/libavcodec/snow_dwt.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOW_DWT_H
+#define AVCODEC_SNOW_DWT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct MpegEncContext;
+
+typedef int DWTELEM;
+typedef short IDWTELEM;
+
+#define MAX_DECOMPOSITIONS 8
+
+typedef struct DWTCompose {
+    IDWTELEM *b0;
+    IDWTELEM *b1;
+    IDWTELEM *b2;
+    IDWTELEM *b3;
+    int y;
+} DWTCompose;
+
+/** Used to minimize the amount of memory used in order to
+ *  optimize cache performance. **/
+typedef struct slice_buffer_s {
+    IDWTELEM **line;   ///< For use by idwt and predict_slices.
+    IDWTELEM **data_stack;   ///< Used for internal purposes.
+    int data_stack_top;
+    int line_count;
+    int line_width;
+    int data_count;
+    IDWTELEM *base_buffer;  ///< Buffer that this structure is caching.
+} slice_buffer;
+
+struct SnowDWTContext;
+
+typedef struct SnowDWTContext {
+    void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                int width);
+    void (*horizontal_compose97i)(IDWTELEM *b, IDWTELEM *temp, int width);
+    void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride,
+                             uint8_t **block, int b_w, int b_h, int src_x,
+                             int src_y, int src_stride, slice_buffer *sb,
+                             int add, uint8_t *dst8);
+} SnowDWTContext;
+
+
+#define DWT_97 0
+#define DWT_53 1
+
+#define liftS lift
+#define W_AM 3
+#define W_AO 0
+#define W_AS 1
+
+#undef liftS
+#define W_BM 1
+#define W_BO 8
+#define W_BS 4
+
+#define W_CM 1
+#define W_CO 0
+#define W_CS 0
+
+#define W_DM 3
+#define W_DO 4
+#define W_DS 3
+
+#define slice_buffer_get_line(slice_buf, line_num)                          \
+    ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num]              \
+                                 : ff_slice_buffer_load_line((slice_buf),   \
+                                                             (line_num)))
+
+int ff_slice_buffer_init(slice_buffer *buf, int line_count,
+                         int max_allocated_lines, int line_width,
+                         IDWTELEM *base_buffer);
+void ff_slice_buffer_release(slice_buffer *buf, int line);
+void ff_slice_buffer_flush(slice_buffer *buf);
+void ff_slice_buffer_destroy(slice_buffer *buf);
+IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line);
+
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                 IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                 int width);
+void ff_snow_horizontal_compose97i(IDWTELEM *b, IDWTELEM *temp, int width);
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride,
+                              uint8_t **block, int b_w, int b_h, int src_x,
+                              int src_y, int src_stride, slice_buffer *sb,
+                              int add, uint8_t *dst8);
+
+int ff_w53_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h);
+int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h);
+
+void ff_spatial_dwt(int *buffer, int *temp, int width, int height, int stride,
+                    int type, int decomposition_count);
+
+void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
+                                   int height, int stride_line, int type,
+                                   int decomposition_count);
+void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
+                                    slice_buffer *slice_buf, IDWTELEM *temp,
+                                    int width, int height, int stride_line,
+                                    int type, int decomposition_count, int y);
+void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
+                     int stride, int type, int decomposition_count);
+
+void ff_dwt_init(SnowDWTContext *c);
+void ff_dwt_init_x86(SnowDWTContext *c);
+
+#endif /* AVCODEC_DWT_H */
diff --git a/libavcodec/snowdata.h b/libavcodec/snowdata.h
new file mode 100644
index 0000000..490fdf8
--- /dev/null
+++ b/libavcodec/snowdata.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOWDATA_H
+#define AVCODEC_SNOWDATA_H
+
+#include "snow.h"
+
+static const uint8_t obmc32[1024]={
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+ //error:0.000020
+};
+static const uint8_t obmc16[256]={
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+//error:0.000015
+};
+
+//linear *64
+static const uint8_t obmc8[64]={
+  4, 12, 20, 28, 28, 20, 12,  4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+  4, 12, 20, 28, 28, 20, 12,  4,
+//error:0.000000
+};
+
+//linear *64
+static const uint8_t obmc4[16]={
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
+//error:0.000000
+};
+
+const int8_t ff_quant3bA[256]={
+ 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+};
+
+const uint8_t * const ff_obmc_tab[4]= {
+    obmc32, obmc16, obmc8, obmc4
+};
+
+/* runtime generated tables */
+uint8_t ff_qexp[QROOT];
+int ff_scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
+
+
+#endif /* AVCODEC_SNOW_H */
diff --git a/libavcodec/snowdec.c b/libavcodec/snowdec.c
new file mode 100644
index 0000000..59bd24e
--- /dev/null
+++ b/libavcodec/snowdec.c
@@ -0,0 +1,671 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "snow_dwt.h"
+#include "internal.h"
+#include "snow.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h263.h"
+
+static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst8= s->current_picture->data[plane_index];
+    int w= p->width;
+    int h= p->height;
+
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+//                DWTELEM * line = slice_buffer_get_line(sb, y);
+                IDWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++){
+//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+//                DWTELEM * line = slice_buffer_get_line(sb, y);
+                IDWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++){
+                    line[x] -= 128 << FRAC_BITS;
+//                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+
+    for(mb_x=0; mb_x<=mb_w; mb_x++){
+        add_yblock(s, 1, sb, old_buffer, dst8, obmc,
+                   block_w*mb_x - block_w/2,
+                   block_h*mb_y - block_h/2,
+                   block_w, block_h,
+                   w, h,
+                   w, ref_stride, obmc_stride,
+                   mb_x - 1, mb_y - 1,
+                   add, 0, plane_index);
+    }
+
+    if(s->avmv && mb_y < mb_h && plane_index == 0)
+        for(mb_x=0; mb_x<mb_w; mb_x++){
+            AVMotionVector *avmv = s->avmv + s->avmv_index;
+            const int b_width = s->b_width  << s->block_max_depth;
+            const int b_stride= b_width;
+            BlockNode *bn= &s->block[mb_x + mb_y*b_stride];
+
+            if (bn->type)
+                continue;
+
+            s->avmv_index++;
+
+            avmv->w = block_w;
+            avmv->h = block_h;
+            avmv->dst_x = block_w*mb_x - block_w/2;
+            avmv->dst_y = block_h*mb_y - block_h/2;
+            avmv->motion_scale = 8;
+            avmv->motion_x = bn->mx * s->mv_scale;
+            avmv->motion_y = bn->my * s->mv_scale;
+            avmv->src_x = avmv->dst_x + avmv->motion_x / 8;
+            avmv->src_y = avmv->dst_y + avmv->motion_y / 8;
+            avmv->source= -1 - bn->ref;
+            avmv->flags = 0;
+        }
+}
+
+static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
+    const int w= b->width;
+    int y;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int new_index = 0;
+
+    if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
+        qadd= 0;
+        qmul= 1<<QEXPSHIFT;
+    }
+
+    /* If we are on the second or later slice, restore our index. */
+    if (start_y != 0)
+        new_index = save_state[0];
+
+
+    for(y=start_y; y<h; y++){
+        int x = 0;
+        int v;
+        IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
+        memset(line, 0, b->width*sizeof(IDWTELEM));
+        v = b->x_coeff[new_index].coeff;
+        x = b->x_coeff[new_index++].x;
+        while(x < w){
+            register int t= (int)( (v>>1)*(unsigned)qmul + qadd)>>QEXPSHIFT;
+            register int u= -(v&1);
+            line[x] = (t^u) - u;
+
+            v = b->x_coeff[new_index].coeff;
+            x = b->x_coeff[new_index++].x;
+        }
+    }
+
+    /* Save our variables for the next slice. */
+    save_state[0] = new_index;
+
+    return;
+}
+
+static int decode_q_branch(SnowContext *s, int level, int x, int y){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    int trx= (x+1)<<rem_depth;
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int res;
+
+    if(s->keyframe){
+        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
+        return 0;
+    }
+
+    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
+        int type, mx, my;
+        int l = left->color[0];
+        int cb= left->color[1];
+        int cr= left->color[2];
+        unsigned ref = 0;
+        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
+        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
+
+        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
+        if(type){
+            int ld, cbd, crd;
+            pred_mv(s, &mx, &my, 0, left, top, tr);
+            ld = get_symbol(&s->c, &s->block_state[32], 1);
+            if (ld < -255 || ld > 255) {
+                return AVERROR_INVALIDDATA;
+            }
+            l += ld;
+            if (s->nb_planes > 2) {
+                cbd = get_symbol(&s->c, &s->block_state[64], 1);
+                crd = get_symbol(&s->c, &s->block_state[96], 1);
+                if (cbd < -255 || cbd > 255 || crd < -255 || crd > 255) {
+                    return AVERROR_INVALIDDATA;
+                }
+                cb += cbd;
+                cr += crd;
+            }
+        }else{
+            if(s->ref_frames > 1)
+                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
+            if (ref >= s->ref_frames) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid ref\n");
+                return AVERROR_INVALIDDATA;
+            }
+            pred_mv(s, &mx, &my, ref, left, top, tr);
+            mx+= (unsigned)get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
+            my+= (unsigned)get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
+        }
+        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
+    }else{
+        if ((res = decode_q_branch(s, level+1, 2*x+0, 2*y+0)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+1, 2*y+0)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+0, 2*y+1)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+1, 2*y+1)) < 0)
+            return res;
+    }
+    return 0;
+}
+
+static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
+    const int w= b->width;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+
+    if(s->qlog == LOSSLESS_QLOG) return;
+
+    for(y=start_y; y<end_y; y++){
+//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
+        IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            int i= line[x];
+            if(i<0){
+                line[x]= -((-i*(unsigned)qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                line[x]=  (( i*(unsigned)qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+}
+
+static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
+    const int w= b->width;
+    int x,y;
+
+    IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
+    IDWTELEM * prev;
+
+    if (start_y != 0)
+        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+
+    for(y=start_y; y<end_y; y++){
+        prev = line;
+//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
+        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
+                    else  line[x] += line[x - 1];
+                }else{
+                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
+                    else  line[x] += line[x - 1];
+                }
+            }else{
+                if(y) line[x] += prev[x];
+            }
+        }
+    }
+}
+
+static void decode_qlogs(SnowContext *s){
+    int plane_index, level, orientation;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1:0; orientation<4; orientation++){
+                int q;
+                if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
+                else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
+                else                    q= get_symbol(&s->c, s->header_state, 1);
+                s->plane[plane_index].band[level][orientation].qlog= q;
+            }
+        }
+    }
+}
+
+#define GET_S(dst, check) \
+    tmp= get_symbol(&s->c, s->header_state, 0);\
+    if(!(check)){\
+        av_log(s->avctx, AV_LOG_ERROR, "Error " #dst " is %d\n", tmp);\
+        return AVERROR_INVALIDDATA;\
+    }\
+    dst= tmp;
+
+static int decode_header(SnowContext *s){
+    int plane_index, tmp;
+    uint8_t kstate[32];
+
+    memset(kstate, MID_STATE, sizeof(kstate));
+
+    s->keyframe= get_rac(&s->c, kstate);
+    if(s->keyframe || s->always_reset){
+        ff_snow_reset_contexts(s);
+        s->spatial_decomposition_type=
+        s->qlog=
+        s->qbias=
+        s->mv_scale=
+        s->block_max_depth= 0;
+    }
+    if(s->keyframe){
+        GET_S(s->version, tmp <= 0U)
+        s->always_reset= get_rac(&s->c, s->header_state);
+        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
+        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
+        GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
+        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
+        if (s->colorspace_type == 1) {
+            s->avctx->pix_fmt= AV_PIX_FMT_GRAY8;
+            s->nb_planes = 1;
+        } else if(s->colorspace_type == 0) {
+            s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
+            s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
+
+            if(s->chroma_h_shift == 1 && s->chroma_v_shift==1){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+            }else if(s->chroma_h_shift == 0 && s->chroma_v_shift==0){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV444P;
+            }else if(s->chroma_h_shift == 2 && s->chroma_v_shift==2){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV410P;
+            } else {
+                av_log(s, AV_LOG_ERROR, "unsupported color subsample mode %d %d\n", s->chroma_h_shift, s->chroma_v_shift);
+                s->chroma_h_shift = s->chroma_v_shift = 1;
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+                return AVERROR_INVALIDDATA;
+            }
+            s->nb_planes = 3;
+        } else {
+            av_log(s, AV_LOG_ERROR, "unsupported color space\n");
+            s->chroma_h_shift = s->chroma_v_shift = 1;
+            s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+            return AVERROR_INVALIDDATA;
+        }
+
+
+        s->spatial_scalability= get_rac(&s->c, s->header_state);
+//        s->rate_scalability= get_rac(&s->c, s->header_state);
+        GET_S(s->max_ref_frames, tmp < (unsigned)MAX_REF_FRAMES)
+        s->max_ref_frames++;
+
+        decode_qlogs(s);
+    }
+
+    if(!s->keyframe){
+        if(get_rac(&s->c, s->header_state)){
+            for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+                int htaps, i, sum=0;
+                Plane *p= &s->plane[plane_index];
+                p->diag_mc= get_rac(&s->c, s->header_state);
+                htaps= get_symbol(&s->c, s->header_state, 0);
+                if((unsigned)htaps >= HTAPS_MAX/2 - 1)
+                    return AVERROR_INVALIDDATA;
+                htaps = htaps*2 + 2;
+                p->htaps= htaps;
+                for(i= htaps/2; i; i--){
+                    p->hcoeff[i]= get_symbol(&s->c, s->header_state, 0) * (1-2*(i&1));
+                    sum += p->hcoeff[i];
+                }
+                p->hcoeff[0]= 32-sum;
+            }
+            s->plane[2].diag_mc= s->plane[1].diag_mc;
+            s->plane[2].htaps  = s->plane[1].htaps;
+            memcpy(s->plane[2].hcoeff, s->plane[1].hcoeff, sizeof(s->plane[1].hcoeff));
+        }
+        if(get_rac(&s->c, s->header_state)){
+            GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
+            decode_qlogs(s);
+        }
+    }
+
+    s->spatial_decomposition_type+= (unsigned)get_symbol(&s->c, s->header_state, 1);
+    if(s->spatial_decomposition_type > 1U){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported\n", s->spatial_decomposition_type);
+        return AVERROR_INVALIDDATA;
+    }
+    if(FFMIN(s->avctx-> width>>s->chroma_h_shift,
+             s->avctx->height>>s->chroma_v_shift) >> (s->spatial_decomposition_count-1) <= 1){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d too large for size\n", s->spatial_decomposition_count);
+        return AVERROR_INVALIDDATA;
+    }
+    if (s->avctx->width > 65536-4) {
+        av_log(s->avctx, AV_LOG_ERROR, "Width %d is too large\n", s->avctx->width);
+        return AVERROR_INVALIDDATA;
+    }
+
+
+    s->qlog           += (unsigned)get_symbol(&s->c, s->header_state, 1);
+    s->mv_scale       += (unsigned)get_symbol(&s->c, s->header_state, 1);
+    s->qbias          += (unsigned)get_symbol(&s->c, s->header_state, 1);
+    s->block_max_depth+= (unsigned)get_symbol(&s->c, s->header_state, 1);
+    if(s->block_max_depth > 1 || s->block_max_depth < 0 || s->mv_scale > 256U){
+        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large\n", s->block_max_depth);
+        s->block_max_depth= 0;
+        s->mv_scale = 0;
+        return AVERROR_INVALIDDATA;
+    }
+    if (FFABS(s->qbias) > 127) {
+        av_log(s->avctx, AV_LOG_ERROR, "qbias %d is too large\n", s->qbias);
+        s->qbias = 0;
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    int ret;
+
+    if ((ret = ff_snow_common_init(avctx)) < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
+static int decode_blocks(SnowContext *s){
+    int x, y;
+    int w= s->b_width;
+    int h= s->b_height;
+    int res;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            if (s->c.bytestream >= s->c.bytestream_end)
+                return AVERROR_INVALIDDATA;
+            if ((res = decode_q_branch(s, 0, x, y)) < 0)
+                return res;
+        }
+    }
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    int bytes_read;
+    AVFrame *picture = data;
+    int level, orientation, plane_index;
+    int res;
+
+    ff_init_range_decoder(c, buf, buf_size);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
+
+    s->current_picture->pict_type= AV_PICTURE_TYPE_I; //FIXME I vs. P
+    if ((res = decode_header(s)) < 0)
+        return res;
+    if ((res=ff_snow_common_init_after_header(avctx)) < 0)
+        return res;
+
+    // realloc slice buffer for the case that spatial_decomposition_count changed
+    ff_slice_buffer_destroy(&s->sb);
+    if ((res = ff_slice_buffer_init(&s->sb, s->plane[0].height,
+                                    (MB_SIZE >> s->block_max_depth) +
+                                    s->spatial_decomposition_count * 11 + 1,
+                                    s->plane[0].width,
+                                    s->spatial_idwt_buffer)) < 0)
+        return res;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        p->fast_mc= p->diag_mc && p->htaps==6 && p->hcoeff[0]==40
+                                              && p->hcoeff[1]==-10
+                                              && p->hcoeff[2]==2;
+    }
+
+    ff_snow_alloc_blocks(s);
+
+    if((res = ff_snow_frame_start(s)) < 0)
+        return res;
+
+    s->current_picture->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    //keyframe flag duplication mess FIXME
+    if(avctx->debug&FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_ERROR,
+               "keyframe:%d qlog:%d qbias: %d mvscale: %d "
+               "decomposition_type:%d decomposition_count:%d\n",
+               s->keyframe, s->qlog, s->qbias, s->mv_scale,
+               s->spatial_decomposition_type,
+               s->spatial_decomposition_count
+              );
+
+    av_assert0(!s->avmv);
+    if (s->avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) {
+        s->avmv = av_malloc_array(s->b_width * s->b_height, sizeof(AVMotionVector) << (s->block_max_depth*2));
+    }
+    s->avmv_index = 0;
+
+    if ((res = decode_blocks(s)) < 0)
+        return res;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+        int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
+
+        if(s->avctx->debug&2048){
+            memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    int v= s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x];
+                    s->mconly_picture->data[plane_index][y*s->mconly_picture->linesize[plane_index] + x]= v;
+                }
+            }
+        }
+
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &p->band[level][orientation];
+                unpack_coeffs(s, b, b->parent, orientation);
+            }
+        }
+
+        {
+        const int mb_h= s->b_height << s->block_max_depth;
+        const int block_size = MB_SIZE >> s->block_max_depth;
+        const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+        int mb_y;
+        DWTCompose cs[MAX_DECOMPOSITIONS];
+        int yd=0, yq=0;
+        int y;
+        int end_y;
+
+        ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
+        for(mb_y=0; mb_y<=mb_h; mb_y++){
+
+            int slice_starty = block_h*mb_y;
+            int slice_h = block_h*(mb_y+1);
+
+            if (!(s->keyframe || s->avctx->debug&512)){
+                slice_starty = FFMAX(0, slice_starty - (block_h >> 1));
+                slice_h -= (block_h >> 1);
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+                    int start_y;
+                    int end_y;
+                    int our_mb_start = mb_y;
+                    int our_mb_end = (mb_y + 1);
+                    const int extra= 3;
+                    start_y = (mb_y ? ((block_h * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
+                    end_y = (((block_h * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
+                    if (!(s->keyframe || s->avctx->debug&512)){
+                        start_y = FFMAX(0, start_y - (block_h >> (1+s->spatial_decomposition_count - level)));
+                        end_y = FFMAX(0, end_y - (block_h >> (1+s->spatial_decomposition_count - level)));
+                    }
+                    start_y = FFMIN(b->height, start_y);
+                    end_y = FFMIN(b->height, end_y);
+
+                    if (start_y != end_y){
+                        if (orientation == 0){
+                            SubBand * correlate_band = &p->band[0][0];
+                            int correlate_end_y = FFMIN(b->height, end_y + 1);
+                            int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
+                            decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
+                            correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
+                            dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
+                        }
+                        else
+                            decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
+                    }
+                }
+            }
+
+            for(; yd<slice_h; yd+=4){
+                ff_spatial_idwt_buffered_slice(&s->dwt, cs, &s->sb, s->temp_idwt_buffer, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+            }
+
+            if(s->qlog == LOSSLESS_QLOG){
+                for(; yq<slice_h && yq<h; yq++){
+                    IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
+                    for(x=0; x<w; x++){
+                        line[x] *= 1<<FRAC_BITS;
+                    }
+                }
+            }
+
+            predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
+
+            y = FFMIN(p->height, slice_starty);
+            end_y = FFMIN(p->height, slice_h);
+            while(y < end_y)
+                ff_slice_buffer_release(&s->sb, y++);
+        }
+
+        ff_slice_buffer_flush(&s->sb);
+        }
+
+    }
+
+    emms_c();
+
+    ff_snow_release_buffer(avctx);
+
+    if(!(s->avctx->debug&2048))
+        res = av_frame_ref(picture, s->current_picture);
+    else
+        res = av_frame_ref(picture, s->mconly_picture);
+    if (res >= 0 && s->avmv_index) {
+        AVFrameSideData *sd;
+
+        sd = av_frame_new_side_data(picture, AV_FRAME_DATA_MOTION_VECTORS, s->avmv_index * sizeof(AVMotionVector));
+        if (!sd)
+            return AVERROR(ENOMEM);
+        memcpy(sd->data, s->avmv, s->avmv_index * sizeof(AVMotionVector));
+    }
+
+    av_freep(&s->avmv);
+
+    if (res < 0)
+        return res;
+
+    *got_frame = 1;
+
+    bytes_read= c->bytestream - c->bytestream_start;
+    if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
+
+    return bytes_read;
+}
+
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+
+    ff_slice_buffer_destroy(&s->sb);
+
+    ff_snow_common_end(s);
+
+    return 0;
+}
+
+AVCodec ff_snow_decoder = {
+    .name           = "snow",
+    .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SNOW,
+    .priv_data_size = sizeof(SnowContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c
new file mode 100644
index 0000000..df1729a
--- /dev/null
+++ b/libavcodec/snowenc.c
@@ -0,0 +1,1965 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/libm.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "snow_dwt.h"
+#include "snow.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h263.h"
+
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+    int plane_index, ret;
+    int i;
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->prediction_method)
+        s->pred = avctx->prediction_method;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    if(s->pred == DWT_97
+       && (avctx->flags & AV_CODEC_FLAG_QSCALE)
+       && avctx->global_quality == 0){
+        av_log(avctx, AV_LOG_ERROR, "The 9/7 wavelet is incompatible with lossless mode.\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->spatial_decomposition_type= s->pred; //FIXME add decorrelator type r transform_type
+
+    s->mv_scale       = (avctx->flags & AV_CODEC_FLAG_QPEL) ? 2 : 4;
+    s->block_max_depth= (avctx->flags & AV_CODEC_FLAG_4MV ) ? 1 : 0;
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        s->plane[plane_index].diag_mc= 1;
+        s->plane[plane_index].htaps= 6;
+        s->plane[plane_index].hcoeff[0]=  40;
+        s->plane[plane_index].hcoeff[1]= -10;
+        s->plane[plane_index].hcoeff[2]=   2;
+        s->plane[plane_index].fast_mc= 1;
+    }
+
+    if ((ret = ff_snow_common_init(avctx)) < 0) {
+        return ret;
+    }
+    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+
+    ff_snow_alloc_blocks(s);
+
+    s->version=0;
+
+    s->m.avctx   = avctx;
+    s->m.bit_rate= avctx->bit_rate;
+    s->m.lmin    = avctx->mb_lmin;
+    s->m.lmax    = avctx->mb_lmax;
+
+    s->m.me.temp      =
+    s->m.me.scratchpad= av_mallocz_array((avctx->width+64), 2*16*2*sizeof(uint8_t));
+    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.sc.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
+    if (!s->m.me.scratchpad || !s->m.me.map || !s->m.me.score_map || !s->m.sc.obmc_scratchpad)
+        return AVERROR(ENOMEM);
+
+    ff_h263_encode_init(&s->m); //mv_penalty
+
+    s->max_ref_frames = av_clip(avctx->refs, 1, MAX_REF_FRAMES);
+
+    if(avctx->flags&AV_CODEC_FLAG_PASS1){
+        if(!avctx->stats_out)
+            avctx->stats_out = av_mallocz(256);
+
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
+    }
+    if((avctx->flags&AV_CODEC_FLAG_PASS2) || !(avctx->flags&AV_CODEC_FLAG_QSCALE)){
+        ret = ff_rate_control_init(&s->m);
+        if(ret < 0)
+            return ret;
+    }
+    s->pass1_rc= !(avctx->flags & (AV_CODEC_FLAG_QSCALE|AV_CODEC_FLAG_PASS2));
+
+    switch(avctx->pix_fmt){
+    case AV_PIX_FMT_YUV444P:
+//    case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV420P:
+//    case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUV410P:
+        s->nb_planes = 3;
+        s->colorspace_type= 0;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        s->nb_planes = 1;
+        s->colorspace_type = 1;
+        break;
+/*    case AV_PIX_FMT_RGB32:
+        s->colorspace= 1;
+        break;*/
+    default:
+        av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ret = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift,
+                                           &s->chroma_v_shift);
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "pixel format invalid or unknown\n");
+        return ret;
+    }
+
+    ff_set_cmp(&s->mecc, s->mecc.me_cmp, s->avctx->me_cmp);
+    ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, s->avctx->me_sub_cmp);
+
+    s->input_picture = av_frame_alloc();
+    if (!s->input_picture)
+        return AVERROR(ENOMEM);
+
+    if ((ret = ff_snow_get_buffer(s, s->input_picture)) < 0)
+        return ret;
+
+    if(s->motion_est == FF_ME_ITER){
+        int size= s->b_width * s->b_height << 2*s->block_max_depth;
+        for(i=0; i<s->max_ref_frames; i++){
+            s->ref_mvs[i]= av_mallocz_array(size, sizeof(int16_t[2]));
+            s->ref_scores[i]= av_mallocz_array(size, sizeof(uint32_t));
+            if (!s->ref_mvs[i] || !s->ref_scores[i])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+//near copy & paste from dsputil, FIXME
+static int pix_sum(uint8_t * pix, int line_size, int w, int h)
+{
+    int s, i, j;
+
+    s = 0;
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j++) {
+            s += pix[0];
+            pix ++;
+        }
+        pix += line_size - w;
+    }
+    return s;
+}
+
+//near copy & paste from dsputil, FIXME
+static int pix_norm1(uint8_t * pix, int line_size, int w)
+{
+    int s, i, j;
+    const uint32_t *sq = ff_square_tab + 256;
+
+    s = 0;
+    for (i = 0; i < w; i++) {
+        for (j = 0; j < w; j ++) {
+            s += sq[pix[0]];
+            pix ++;
+        }
+        pix += line_size - w;
+    }
+    return s;
+}
+
+static inline int get_penalty_factor(int lambda, int lambda2, int type){
+    switch(type&0xFF){
+    default:
+    case FF_CMP_SAD:
+        return lambda>>FF_LAMBDA_SHIFT;
+    case FF_CMP_DCT:
+        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
+    case FF_CMP_W53:
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_W97:
+        return (2*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_SATD:
+    case FF_CMP_DCT264:
+        return (2*lambda)>>FF_LAMBDA_SHIFT;
+    case FF_CMP_RD:
+    case FF_CMP_PSNR:
+    case FF_CMP_SSE:
+    case FF_CMP_NSSE:
+        return lambda2>>FF_LAMBDA_SHIFT;
+    case FF_CMP_BIT:
+        return 1;
+    }
+}
+
+//FIXME copy&paste
+#define P_LEFT P[1]
+#define P_TOP P[2]
+#define P_TOPRIGHT P[3]
+#define P_MEDIAN P[4]
+#define P_MV1 P[9]
+#define FLAG_QPEL   1 //must be 1
+
+static int encode_q_branch(SnowContext *s, int level, int x, int y){
+    uint8_t p_buffer[1024];
+    uint8_t i_buffer[1024];
+    uint8_t p_state[sizeof(s->block_state)];
+    uint8_t i_state[sizeof(s->block_state)];
+    RangeCoder pc, ic;
+    uint8_t *pbbak= s->c.bytestream;
+    uint8_t *pbbak_start= s->c.bytestream_start;
+    int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
+    const int w= s->b_width  << s->block_max_depth;
+    const int h= s->b_height << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<(LOG2_MB_SIZE - level);
+    int trx= (x+1)<<rem_depth;
+    int try= (y+1)<<rem_depth;
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
+    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int pl = left->color[0];
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx, pmy;
+    int mx=0, my=0;
+    int l,cr,cb;
+    const int stride= s->current_picture->linesize[0];
+    const int uvstride= s->current_picture->linesize[1];
+    uint8_t *current_data[3]= { s->input_picture->data[0] + (x + y*  stride)*block_w,
+                                s->input_picture->data[1] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift),
+                                s->input_picture->data[2] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift)};
+    int P[10][2];
+    int16_t last_mv[3][2];
+    int qpel= !!(s->avctx->flags & AV_CODEC_FLAG_QPEL); //unused
+    const int shift= 1+qpel;
+    MotionEstContext *c= &s->m.me;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
+    int my_context= av_log2(2*FFABS(left->my - top->my));
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int ref, best_ref, ref_score, ref_mx, ref_my;
+
+    av_assert0(sizeof(s->block_state) >= 256);
+    if(s->keyframe){
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
+        return 0;
+    }
+
+//    clip predictors / edge ?
+
+    P_LEFT[0]= left->mx;
+    P_LEFT[1]= left->my;
+    P_TOP [0]= top->mx;
+    P_TOP [1]= top->my;
+    P_TOPRIGHT[0]= tr->mx;
+    P_TOPRIGHT[1]= tr->my;
+
+    last_mv[0][0]= s->block[index].mx;
+    last_mv[0][1]= s->block[index].my;
+    last_mv[1][0]= right->mx;
+    last_mv[1][1]= right->my;
+    last_mv[2][0]= bottom->mx;
+    last_mv[2][1]= bottom->my;
+
+    s->m.mb_stride=2;
+    s->m.mb_x=
+    s->m.mb_y= 0;
+    c->skip= 0;
+
+    av_assert1(c->  stride ==   stride);
+    av_assert1(c->uvstride == uvstride);
+
+    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
+    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
+    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
+    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_DMV;
+
+    c->xmin = - x*block_w - 16+3;
+    c->ymin = - y*block_w - 16+3;
+    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
+    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
+
+    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
+    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
+    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
+    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
+    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
+    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
+    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
+
+    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+    if (!y) {
+        c->pred_x= P_LEFT[0];
+        c->pred_y= P_LEFT[1];
+    } else {
+        c->pred_x = P_MEDIAN[0];
+        c->pred_y = P_MEDIAN[1];
+    }
+
+    score= INT_MAX;
+    best_ref= 0;
+    for(ref=0; ref<s->ref_frames; ref++){
+        init_ref(c, current_data, s->last_picture[ref]->data, NULL, block_w*x, block_w*y, 0);
+
+        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
+                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
+
+        av_assert2(ref_mx >= c->xmin);
+        av_assert2(ref_mx <= c->xmax);
+        av_assert2(ref_my >= c->ymin);
+        av_assert2(ref_my <= c->ymax);
+
+        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
+        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
+        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
+        if(s->ref_mvs[ref]){
+            s->ref_mvs[ref][index][0]= ref_mx;
+            s->ref_mvs[ref][index][1]= ref_my;
+            s->ref_scores[ref][index]= ref_score;
+        }
+        if(score > ref_score){
+            score= ref_score;
+            best_ref= ref;
+            mx= ref_mx;
+            my= ref_my;
+        }
+    }
+    //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
+
+  //  subpel search
+    base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
+    pc= s->c;
+    pc.bytestream_start=
+    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
+    memcpy(p_state, s->block_state, sizeof(s->block_state));
+
+    if(level!=s->block_max_depth)
+        put_rac(&pc, &p_state[4 + s_context], 1);
+    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
+    if(s->ref_frames > 1)
+        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
+    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
+    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
+    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
+    p_len= pc.bytestream - pc.bytestream_start;
+    score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
+
+    block_s= block_w*block_w;
+    sum = pix_sum(current_data[0], stride, block_w, block_w);
+    l= (sum + block_s/2)/block_s;
+    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
+
+    if (s->nb_planes > 2) {
+        block_s= block_w*block_w>>(s->chroma_h_shift + s->chroma_v_shift);
+        sum = pix_sum(current_data[1], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
+        cb= (sum + block_s/2)/block_s;
+    //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
+        sum = pix_sum(current_data[2], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
+        cr= (sum + block_s/2)/block_s;
+    //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
+    }else
+        cb = cr = 0;
+
+    ic= s->c;
+    ic.bytestream_start=
+    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
+    memcpy(i_state, s->block_state, sizeof(s->block_state));
+    if(level!=s->block_max_depth)
+        put_rac(&ic, &i_state[4 + s_context], 1);
+    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
+    put_symbol(&ic, &i_state[32],  l-pl , 1);
+    if (s->nb_planes > 2) {
+        put_symbol(&ic, &i_state[64], cb-pcb, 1);
+        put_symbol(&ic, &i_state[96], cr-pcr, 1);
+    }
+    i_len= ic.bytestream - ic.bytestream_start;
+    iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
+
+    av_assert1(iscore < 255*255*256 + s->lambda2*10);
+    av_assert1(iscore >= 0);
+    av_assert1(l>=0 && l<=255);
+    av_assert1(pl>=0 && pl<=255);
+
+    if(level==0){
+        int varc= iscore >> 8;
+        int vard= score >> 8;
+        if (vard <= 64 || vard < varc)
+            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+        else
+            c->scene_change_score+= s->m.qscale;
+    }
+
+    if(level!=s->block_max_depth){
+        put_rac(&s->c, &s->block_state[4 + s_context], 0);
+        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
+        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
+
+        if(score2 < score && score2 < iscore)
+            return score2;
+    }
+
+    if(iscore < score){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
+        memcpy(pbbak, i_buffer, i_len);
+        s->c= ic;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + i_len;
+        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
+        memcpy(s->block_state, i_state, sizeof(s->block_state));
+        return iscore;
+    }else{
+        memcpy(pbbak, p_buffer, p_len);
+        s->c= pc;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + p_len;
+        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
+        memcpy(s->block_state, p_state, sizeof(s->block_state));
+        return score;
+    }
+}
+
+static void encode_q_branch2(SnowContext *s, int level, int x, int y){
+    const int w= s->b_width  << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    int trx= (x+1)<<rem_depth;
+    BlockNode *b= &s->block[index];
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int pl = left->color[0];
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx, pmy;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
+    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+
+    if(s->keyframe){
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
+        return;
+    }
+
+    if(level!=s->block_max_depth){
+        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
+            put_rac(&s->c, &s->block_state[4 + s_context], 1);
+        }else{
+            put_rac(&s->c, &s->block_state[4 + s_context], 0);
+            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
+            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
+            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
+            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
+            return;
+        }
+    }
+    if(b->type & BLOCK_INTRA){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
+        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
+        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
+        if (s->nb_planes > 2) {
+            put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
+            put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
+        }
+        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
+    }else{
+        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
+        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
+        if(s->ref_frames > 1)
+            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
+        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
+        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
+        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
+    }
+}
+
+static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
+    int i, x2, y2;
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *src= s-> input_picture->data[plane_index];
+    IDWTELEM *dst= (IDWTELEM*)s->m.sc.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int index= mb_x + mb_y*b_stride;
+    BlockNode *b= &s->block[index];
+    BlockNode backup= *b;
+    int ab=0;
+    int aa=0;
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc stuff above
+
+    b->type|= BLOCK_INTRA;
+    b->color[plane_index]= 0;
+    memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
+
+    for(i=0; i<4; i++){
+        int mb_x2= mb_x + (i &1) - 1;
+        int mb_y2= mb_y + (i>>1) - 1;
+        int x= block_w*mb_x2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
+
+        add_yblock(s, 0, NULL, dst + (i&1)*block_w + (i>>1)*obmc_stride*block_h, NULL, obmc,
+                    x, y, block_w, block_h, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
+
+        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_h); y2++){
+            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
+                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_h*mb_y - block_h/2))*obmc_stride;
+                int obmc_v= obmc[index];
+                int d;
+                if(y<0) obmc_v += obmc[index + block_h*obmc_stride];
+                if(x<0) obmc_v += obmc[index + block_w];
+                if(y+block_h>h) obmc_v += obmc[index - block_h*obmc_stride];
+                if(x+block_w>w) obmc_v += obmc[index - block_w];
+                //FIXME precalculate this or simplify it somehow else
+
+                d = -dst[index] + (1<<(FRAC_BITS-1));
+                dst[index] = d;
+                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
+                aa += obmc_v * obmc_v; //FIXME precalculate this
+            }
+        }
+    }
+    *b= backup;
+
+    return av_clip_uint8( ROUNDED_DIV(ab<<LOG2_OBMC_MAX, aa) ); //FIXME we should not need clipping
+}
+
+static inline int get_block_bits(SnowContext *s, int x, int y, int w){
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int b_height = s->b_height<< s->block_max_depth;
+    int index= x + y*b_stride;
+    const BlockNode *b     = &s->block[index];
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
+    const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
+    int dmx, dmy;
+//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
+//  int my_context= av_log2(2*FFABS(left->my - top->my));
+
+    if(x<0 || x>=b_stride || y>=b_height)
+        return 0;
+/*
+1            0      0
+01X          1-2    1
+001XX        3-6    2-3
+0001XXX      7-14   4-7
+00001XXXX   15-30   8-15
+*/
+//FIXME try accurate rate
+//FIXME intra and inter predictors if surrounding blocks are not the same type
+    if(b->type & BLOCK_INTRA){
+        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
+                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
+                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
+    }else{
+        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
+        dmx-= b->mx;
+        dmy-= b->my;
+        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
+                    + av_log2(2*FFABS(dmy))
+                    + av_log2(2*b->ref));
+    }
+}
+
+static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, uint8_t (*obmc_edged)[MB_SIZE * 2]){
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst= s->current_picture->data[plane_index];
+    uint8_t *src= s->  input_picture->data[plane_index];
+    IDWTELEM *pred= (IDWTELEM*)s->m.sc.obmc_scratchpad + plane_index*block_size*block_size*4;
+    uint8_t *cur = s->scratchbuf;
+    uint8_t *tmp = s->emu_edge_buffer;
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int b_height = s->b_height<< s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int distortion;
+    int rate= 0;
+    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
+    int sx= block_w*mb_x - block_w/2;
+    int sy= block_h*mb_y - block_h/2;
+    int x0= FFMAX(0,-sx);
+    int y0= FFMAX(0,-sy);
+    int x1= FFMIN(block_w*2, w-sx);
+    int y1= FFMIN(block_h*2, h-sy);
+    int i,x,y;
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumtions below chckinhg only block_w
+
+    ff_snow_pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_h*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
+
+    for(y=y0; y<y1; y++){
+        const uint8_t *obmc1= obmc_edged[y];
+        const IDWTELEM *pred1 = pred + y*obmc_stride;
+        uint8_t *cur1 = cur + y*ref_stride;
+        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
+        for(x=x0; x<x1; x++){
+#if FRAC_BITS >= LOG2_OBMC_MAX
+            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
+#else
+            int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
+#endif
+            v = (v + pred1[x]) >> FRAC_BITS;
+            if(v&(~255)) v= ~(v>>31);
+            dst1[x] = v;
+        }
+    }
+
+    /* copy the regions where obmc[] = (uint8_t)256 */
+    if(LOG2_OBMC_MAX == 8
+        && (mb_x == 0 || mb_x == b_stride-1)
+        && (mb_y == 0 || mb_y == b_height-1)){
+        if(mb_x == 0)
+            x1 = block_w;
+        else
+            x0 = block_w;
+        if(mb_y == 0)
+            y1 = block_h;
+        else
+            y0 = block_h;
+        for(y=y0; y<y1; y++)
+            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
+    }
+
+    if(block_w==16){
+        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
+        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
+        /* FIXME cmps overlap but do not cover the wavelet's whole support.
+         * So improving the score of one block is not strictly guaranteed
+         * to improve the score of the whole frame, thus iterative motion
+         * estimation does not always converge. */
+        if(s->avctx->me_cmp == FF_CMP_W97)
+            distortion = ff_w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else if(s->avctx->me_cmp == FF_CMP_W53)
+            distortion = ff_w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else{
+            distortion = 0;
+            for(i=0; i<4; i++){
+                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
+                distortion += s->mecc.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
+            }
+        }
+    }else{
+        av_assert2(block_w==8);
+        distortion = s->mecc.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
+    }
+
+    if(plane_index==0){
+        for(i=0; i<4; i++){
+/* ..RRr
+ * .RXx.
+ * rxx..
+ */
+            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
+        }
+        if(mb_x == b_stride-2)
+            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
+    }
+    return distortion + rate*penalty_factor;
+}
+
+static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
+    int i, y2;
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst= s->current_picture->data[plane_index];
+    uint8_t *src= s-> input_picture->data[plane_index];
+    //FIXME zero_dst is const but add_yblock changes dst if add is 0 (this is never the case for dst=zero_dst
+    // const has only been removed from zero_dst to suppress a warning
+    static IDWTELEM zero_dst[4096]; //FIXME
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int distortion= 0;
+    int rate= 0;
+    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumtions below
+
+    for(i=0; i<9; i++){
+        int mb_x2= mb_x + (i%3) - 1;
+        int mb_y2= mb_y + (i/3) - 1;
+        int x= block_w*mb_x2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
+
+        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
+                   x, y, block_w, block_h, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
+
+        //FIXME find a cleaner/simpler way to skip the outside stuff
+        for(y2= y; y2<0; y2++)
+            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
+        for(y2= h; y2<y+block_h; y2++)
+            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
+        if(x<0){
+            for(y2= y; y2<y+block_h; y2++)
+                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
+        }
+        if(x+block_w > w){
+            for(y2= y; y2<y+block_h; y2++)
+                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
+        }
+
+        av_assert1(block_w== 8 || block_w==16);
+        distortion += s->mecc.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h);
+    }
+
+    if(plane_index==0){
+        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
+        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
+
+/* ..RRRr
+ * .RXXx.
+ * .RXXx.
+ * rxxx.
+ */
+        if(merged)
+            rate = get_block_bits(s, mb_x, mb_y, 2);
+        for(i=merged?4:0; i<9; i++){
+            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
+            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
+        }
+    }
+    return distortion + rate*penalty_factor;
+}
+
+static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src, const IDWTELEM *parent, int stride, int orientation){
+    const int w= b->width;
+    const int h= b->height;
+    int x, y;
+
+    if(1){
+        int run=0;
+        int *runs = s->run_buffer;
+        int run_index=0;
+        int max_index;
+
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height)
+                        p= parent[px + py*2*stride];
+                }
+                if(!(/*ll|*/l|lt|t|rt|p)){
+                    if(v){
+                        runs[run_index++]= run;
+                        run=0;
+                    }else{
+                        run++;
+                    }
+                }
+            }
+        }
+        max_index= run_index;
+        runs[run_index++]= run;
+        run_index=0;
+        run= runs[run_index++];
+
+        put_symbol2(&s->c, b->state[30], max_index, 0);
+        if(run_index <= max_index)
+            put_symbol2(&s->c, b->state[1], run, 3);
+
+        for(y=0; y<h; y++){
+            if(s->c.bytestream_end - s->c.bytestream < w*40){
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return AVERROR(ENOMEM);
+            }
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height)
+                        p= parent[px + py*2*stride];
+                }
+                if(/*ll|*/l|lt|t|rt|p){
+                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
+
+                    put_rac(&s->c, &b->state[0][context], !!v);
+                }else{
+                    if(!run){
+                        run= runs[run_index++];
+
+                        if(run_index <= max_index)
+                            put_symbol2(&s->c, b->state[1], run, 3);
+                        av_assert2(v);
+                    }else{
+                        run--;
+                        av_assert2(!v);
+                    }
+                }
+                if(v){
+                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
+                    int l2= 2*FFABS(l) + (l<0);
+                    int t2= 2*FFABS(t) + (t<0);
+
+                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
+                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + ff_quant3bA[l2&0xFF] + 3*ff_quant3bA[t2&0xFF]], v<0);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int encode_subband(SnowContext *s, SubBand *b, const IDWTELEM *src, const IDWTELEM *parent, int stride, int orientation){
+//    encode_subband_qtree(s, b, src, parent, stride, orientation);
+//    encode_subband_z0run(s, b, src, parent, stride, orientation);
+    return encode_subband_c0run(s, b, src, parent, stride, orientation);
+//    encode_subband_dzr(s, b, src, parent, stride, orientation);
+}
+
+static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
+    const int b_stride= s->b_width << s->block_max_depth;
+    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
+    BlockNode backup= *block;
+    unsigned value;
+    int rd, index;
+
+    av_assert2(mb_x>=0 && mb_y>=0);
+    av_assert2(mb_x<b_stride);
+
+    if(intra){
+        block->color[0] = p[0];
+        block->color[1] = p[1];
+        block->color[2] = p[2];
+        block->type |= BLOCK_INTRA;
+    }else{
+        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
+        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
+        if(s->me_cache[index] == value)
+            return 0;
+        s->me_cache[index]= value;
+
+        block->mx= p[0];
+        block->my= p[1];
+        block->type &= ~BLOCK_INTRA;
+    }
+
+    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged) + s->intra_penalty * !!intra;
+
+//FIXME chroma
+    if(rd < *best_rd){
+        *best_rd= rd;
+        return 1;
+    }else{
+        *block= backup;
+        return 0;
+    }
+}
+
+/* special case for int[2] args we discard afterwards,
+ * fixes compilation problem with gcc 2.95 */
+static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
+    int p[2] = {p0, p1};
+    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
+}
+
+static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
+    const int b_stride= s->b_width << s->block_max_depth;
+    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
+    BlockNode backup[4];
+    unsigned value;
+    int rd, index;
+
+    /* We don't initialize backup[] during variable declaration, because
+     * that fails to compile on MSVC: "cannot convert from 'BlockNode' to
+     * 'int16_t'". */
+    backup[0] = block[0];
+    backup[1] = block[1];
+    backup[2] = block[b_stride];
+    backup[3] = block[b_stride + 1];
+
+    av_assert2(mb_x>=0 && mb_y>=0);
+    av_assert2(mb_x<b_stride);
+    av_assert2(((mb_x|mb_y)&1) == 0);
+
+    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
+    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
+    if(s->me_cache[index] == value)
+        return 0;
+    s->me_cache[index]= value;
+
+    block->mx= p0;
+    block->my= p1;
+    block->ref= ref;
+    block->type &= ~BLOCK_INTRA;
+    block[1]= block[b_stride]= block[b_stride+1]= *block;
+
+    rd= get_4block_rd(s, mb_x, mb_y, 0);
+
+//FIXME chroma
+    if(rd < *best_rd){
+        *best_rd= rd;
+        return 1;
+    }else{
+        block[0]= backup[0];
+        block[1]= backup[1];
+        block[b_stride]= backup[2];
+        block[b_stride+1]= backup[3];
+        return 0;
+    }
+}
+
+static void iterative_me(SnowContext *s){
+    int pass, mb_x, mb_y;
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    int color[3];
+
+    {
+        RangeCoder r = s->c;
+        uint8_t state[sizeof(s->block_state)];
+        memcpy(state, s->block_state, sizeof(s->block_state));
+        for(mb_y= 0; mb_y<s->b_height; mb_y++)
+            for(mb_x= 0; mb_x<s->b_width; mb_x++)
+                encode_q_branch(s, 0, mb_x, mb_y);
+        s->c = r;
+        memcpy(s->block_state, state, sizeof(s->block_state));
+    }
+
+    for(pass=0; pass<25; pass++){
+        int change= 0;
+
+        for(mb_y= 0; mb_y<b_height; mb_y++){
+            for(mb_x= 0; mb_x<b_width; mb_x++){
+                int dia_change, i, j, ref;
+                int best_rd= INT_MAX, ref_rd;
+                BlockNode backup, ref_b;
+                const int index= mb_x + mb_y * b_stride;
+                BlockNode *block= &s->block[index];
+                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
+                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
+                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
+                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
+                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
+                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
+                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
+                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
+                const int b_w= (MB_SIZE >> s->block_max_depth);
+                uint8_t obmc_edged[MB_SIZE * 2][MB_SIZE * 2];
+
+                if(pass && (block->type & BLOCK_OPT))
+                    continue;
+                block->type |= BLOCK_OPT;
+
+                backup= *block;
+
+                if(!s->me_cache_generation)
+                    memset(s->me_cache, 0, sizeof(s->me_cache));
+                s->me_cache_generation += 1<<22;
+
+                //FIXME precalculate
+                {
+                    int x, y;
+                    for (y = 0; y < b_w * 2; y++)
+                        memcpy(obmc_edged[y], ff_obmc_tab[s->block_max_depth] + y * b_w * 2, b_w * 2);
+                    if(mb_x==0)
+                        for(y=0; y<b_w*2; y++)
+                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
+                    if(mb_x==b_stride-1)
+                        for(y=0; y<b_w*2; y++)
+                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
+                    if(mb_y==0){
+                        for(x=0; x<b_w*2; x++)
+                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
+                        for(y=1; y<b_w; y++)
+                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
+                    }
+                    if(mb_y==b_height-1){
+                        for(x=0; x<b_w*2; x++)
+                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
+                        for(y=b_w; y<b_w*2-1; y++)
+                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
+                    }
+                }
+
+                //skip stuff outside the picture
+                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1){
+                    uint8_t *src= s->  input_picture->data[0];
+                    uint8_t *dst= s->current_picture->data[0];
+                    const int stride= s->current_picture->linesize[0];
+                    const int block_w= MB_SIZE >> s->block_max_depth;
+                    const int block_h= MB_SIZE >> s->block_max_depth;
+                    const int sx= block_w*mb_x - block_w/2;
+                    const int sy= block_h*mb_y - block_h/2;
+                    const int w= s->plane[0].width;
+                    const int h= s->plane[0].height;
+                    int y;
+
+                    for(y=sy; y<0; y++)
+                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
+                    for(y=h; y<sy+block_h*2; y++)
+                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
+                    if(sx<0){
+                        for(y=sy; y<sy+block_h*2; y++)
+                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
+                    }
+                    if(sx+block_w*2 > w){
+                        for(y=sy; y<sy+block_h*2; y++)
+                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
+                    }
+                }
+
+                // intra(black) = neighbors' contribution to the current block
+                for(i=0; i < s->nb_planes; i++)
+                    color[i]= get_dc(s, mb_x, mb_y, i);
+
+                // get previous score (cannot be cached due to OBMC)
+                if(pass > 0 && (block->type&BLOCK_INTRA)){
+                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
+                    check_block(s, mb_x, mb_y, color0, 1, obmc_edged, &best_rd);
+                }else
+                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, obmc_edged, &best_rd);
+
+                ref_b= *block;
+                ref_rd= best_rd;
+                for(ref=0; ref < s->ref_frames; ref++){
+                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
+                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
+                        continue;
+                    block->ref= ref;
+                    best_rd= INT_MAX;
+
+                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, 0, 0, obmc_edged, &best_rd);
+                    if(tb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], obmc_edged, &best_rd);
+                    if(lb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], obmc_edged, &best_rd);
+                    if(rb)
+                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], obmc_edged, &best_rd);
+                    if(bb)
+                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], obmc_edged, &best_rd);
+
+                    /* fullpel ME */
+                    //FIXME avoid subpel interpolation / round to nearest integer
+                    do{
+                        int newx = block->mx;
+                        int newy = block->my;
+                        int dia_size = s->iterative_dia_size ? s->iterative_dia_size : FFMAX(s->avctx->dia_size, 1);
+                        dia_change=0;
+                        for(i=0; i < dia_size; i++){
+                            for(j=0; j<i; j++){
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx+4*(i-j), newy+(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx-4*(i-j), newy-(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx-(4*j), newy+4*(i-j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx+(4*j), newy-4*(i-j), obmc_edged, &best_rd);
+                            }
+                        }
+                    }while(dia_change);
+                    /* subpel ME */
+                    do{
+                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
+                        dia_change=0;
+                        for(i=0; i<8; i++)
+                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], obmc_edged, &best_rd);
+                    }while(dia_change);
+                    //FIXME or try the standard 2 pass qpel or similar
+
+                    mvr[0][0]= block->mx;
+                    mvr[0][1]= block->my;
+                    if(ref_rd > best_rd){
+                        ref_rd= best_rd;
+                        ref_b= *block;
+                    }
+                }
+                best_rd= ref_rd;
+                *block= ref_b;
+                check_block(s, mb_x, mb_y, color, 1, obmc_edged, &best_rd);
+                //FIXME RD style color selection
+                if(!same_block(block, &backup)){
+                    if(tb ) tb ->type &= ~BLOCK_OPT;
+                    if(lb ) lb ->type &= ~BLOCK_OPT;
+                    if(rb ) rb ->type &= ~BLOCK_OPT;
+                    if(bb ) bb ->type &= ~BLOCK_OPT;
+                    if(tlb) tlb->type &= ~BLOCK_OPT;
+                    if(trb) trb->type &= ~BLOCK_OPT;
+                    if(blb) blb->type &= ~BLOCK_OPT;
+                    if(brb) brb->type &= ~BLOCK_OPT;
+                    change ++;
+                }
+            }
+        }
+        av_log(s->avctx, AV_LOG_DEBUG, "pass:%d changed:%d\n", pass, change);
+        if(!change)
+            break;
+    }
+
+    if(s->block_max_depth == 1){
+        int change= 0;
+        for(mb_y= 0; mb_y<b_height; mb_y+=2){
+            for(mb_x= 0; mb_x<b_width; mb_x+=2){
+                int i;
+                int best_rd, init_rd;
+                const int index= mb_x + mb_y * b_stride;
+                BlockNode *b[4];
+
+                b[0]= &s->block[index];
+                b[1]= b[0]+1;
+                b[2]= b[0]+b_stride;
+                b[3]= b[2]+1;
+                if(same_block(b[0], b[1]) &&
+                   same_block(b[0], b[2]) &&
+                   same_block(b[0], b[3]))
+                    continue;
+
+                if(!s->me_cache_generation)
+                    memset(s->me_cache, 0, sizeof(s->me_cache));
+                s->me_cache_generation += 1<<22;
+
+                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
+
+                //FIXME more multiref search?
+                check_4block_inter(s, mb_x, mb_y,
+                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
+                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
+
+                for(i=0; i<4; i++)
+                    if(!(b[i]->type&BLOCK_INTRA))
+                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
+
+                if(init_rd != best_rd)
+                    change++;
+            }
+        }
+        av_log(s->avctx, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
+    }
+}
+
+static void encode_blocks(SnowContext *s, int search){
+    int x, y;
+    int w= s->b_width;
+    int h= s->b_height;
+
+    if(s->motion_est == FF_ME_ITER && !s->keyframe && search)
+        iterative_me(s);
+
+    for(y=0; y<h; y++){
+        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return;
+        }
+        for(x=0; x<w; x++){
+            if(s->motion_est == FF_ME_ITER || !search)
+                encode_q_branch2(s, 0, x, y);
+            else
+                encode_q_branch (s, 0, x, y);
+        }
+    }
+}
+
+static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
+    int x,y, thres1, thres2;
+
+    if(s->qlog == LOSSLESS_QLOG){
+        for(y=0; y<h; y++)
+            for(x=0; x<w; x++)
+                dst[x + y*stride]= src[x + y*stride];
+        return;
+    }
+
+    bias= bias ? 0 : (3*qmul)>>3;
+    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
+    thres2= 2*thres1;
+
+    if(!bias){
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= src[x + y*stride];
+
+                if((unsigned)(i+thres1) > thres2){
+                    if(i>=0){
+                        i<<= QEXPSHIFT;
+                        i/= qmul; //FIXME optimize
+                        dst[x + y*stride]=  i;
+                    }else{
+                        i= -i;
+                        i<<= QEXPSHIFT;
+                        i/= qmul; //FIXME optimize
+                        dst[x + y*stride]= -i;
+                    }
+                }else
+                    dst[x + y*stride]= 0;
+            }
+        }
+    }else{
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= src[x + y*stride];
+
+                if((unsigned)(i+thres1) > thres2){
+                    if(i>=0){
+                        i<<= QEXPSHIFT;
+                        i= (i + bias) / qmul; //FIXME optimize
+                        dst[x + y*stride]=  i;
+                    }else{
+                        i= -i;
+                        i<<= QEXPSHIFT;
+                        i= (i + bias) / qmul; //FIXME optimize
+                        dst[x + y*stride]= -i;
+                    }
+                }else
+                    dst[x + y*stride]= 0;
+            }
+        }
+    }
+}
+
+static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+
+    if(s->qlog == LOSSLESS_QLOG) return;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            int i= src[x + y*stride];
+            if(i<0){
+                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+}
+
+static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    for(y=h-1; y>=0; y--){
+        for(x=w-1; x>=0; x--){
+            int i= x + y*stride;
+
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
+                    else  src[i] -= src[i - 1];
+                }else{
+                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
+                    else  src[i] -= src[i - 1];
+                }
+            }else{
+                if(y) src[i] -= src[i - stride];
+            }
+        }
+    }
+}
+
+static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            int i= x + y*stride;
+
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
+                    else  src[i] += src[i - 1];
+                }else{
+                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
+                    else  src[i] += src[i - 1];
+                }
+            }else{
+                if(y) src[i] += src[i - stride];
+            }
+        }
+    }
+}
+
+static void encode_qlogs(SnowContext *s){
+    int plane_index, level, orientation;
+
+    for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1:0; orientation<4; orientation++){
+                if(orientation==2) continue;
+                put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
+            }
+        }
+    }
+}
+
+static void encode_header(SnowContext *s){
+    int plane_index, i;
+    uint8_t kstate[32];
+
+    memset(kstate, MID_STATE, sizeof(kstate));
+
+    put_rac(&s->c, kstate, s->keyframe);
+    if(s->keyframe || s->always_reset){
+        ff_snow_reset_contexts(s);
+        s->last_spatial_decomposition_type=
+        s->last_qlog=
+        s->last_qbias=
+        s->last_mv_scale=
+        s->last_block_max_depth= 0;
+        for(plane_index=0; plane_index<2; plane_index++){
+            Plane *p= &s->plane[plane_index];
+            p->last_htaps=0;
+            p->last_diag_mc=0;
+            memset(p->last_hcoeff, 0, sizeof(p->last_hcoeff));
+        }
+    }
+    if(s->keyframe){
+        put_symbol(&s->c, s->header_state, s->version, 0);
+        put_rac(&s->c, s->header_state, s->always_reset);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
+        if (s->nb_planes > 2) {
+            put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
+            put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
+        }
+        put_rac(&s->c, s->header_state, s->spatial_scalability);
+//        put_rac(&s->c, s->header_state, s->rate_scalability);
+        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
+
+        encode_qlogs(s);
+    }
+
+    if(!s->keyframe){
+        int update_mc=0;
+        for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+            Plane *p= &s->plane[plane_index];
+            update_mc |= p->last_htaps   != p->htaps;
+            update_mc |= p->last_diag_mc != p->diag_mc;
+            update_mc |= !!memcmp(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
+        }
+        put_rac(&s->c, s->header_state, update_mc);
+        if(update_mc){
+            for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+                Plane *p= &s->plane[plane_index];
+                put_rac(&s->c, s->header_state, p->diag_mc);
+                put_symbol(&s->c, s->header_state, p->htaps/2-1, 0);
+                for(i= p->htaps/2; i; i--)
+                    put_symbol(&s->c, s->header_state, FFABS(p->hcoeff[i]), 0);
+            }
+        }
+        if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
+            put_rac(&s->c, s->header_state, 1);
+            put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+            encode_qlogs(s);
+        }else
+            put_rac(&s->c, s->header_state, 0);
+    }
+
+    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
+    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
+    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
+    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
+    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
+
+}
+
+static void update_last_header_values(SnowContext *s){
+    int plane_index;
+
+    if(!s->keyframe){
+        for(plane_index=0; plane_index<2; plane_index++){
+            Plane *p= &s->plane[plane_index];
+            p->last_diag_mc= p->diag_mc;
+            p->last_htaps  = p->htaps;
+            memcpy(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
+        }
+    }
+
+    s->last_spatial_decomposition_type  = s->spatial_decomposition_type;
+    s->last_qlog                        = s->qlog;
+    s->last_qbias                       = s->qbias;
+    s->last_mv_scale                    = s->mv_scale;
+    s->last_block_max_depth             = s->block_max_depth;
+    s->last_spatial_decomposition_count = s->spatial_decomposition_count;
+}
+
+static int qscale2qlog(int qscale){
+    return lrint(QROOT*log2(qscale / (float)FF_QP2LAMBDA))
+           + 61*QROOT/8; ///< 64 > 60
+}
+
+static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
+{
+    /* Estimate the frame's complexity as a sum of weighted dwt coefficients.
+     * FIXME we know exact mv bits at this point,
+     * but ratecontrol isn't set up to include them. */
+    uint32_t coef_sum= 0;
+    int level, orientation, delta_qlog;
+
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &s->plane[0].band[level][orientation];
+            IDWTELEM *buf= b->ibuf;
+            const int w= b->width;
+            const int h= b->height;
+            const int stride= b->stride;
+            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
+            const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+            const int qdiv= (1<<16)/qmul;
+            int x, y;
+            //FIXME this is ugly
+            for(y=0; y<h; y++)
+                for(x=0; x<w; x++)
+                    buf[x+y*stride]= b->buf[x+y*stride];
+            if(orientation==0)
+                decorrelate(s, b, buf, stride, 1, 0);
+            for(y=0; y<h; y++)
+                for(x=0; x<w; x++)
+                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
+        }
+    }
+
+    /* ugly, ratecontrol just takes a sqrt again */
+    av_assert0(coef_sum < INT_MAX);
+    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
+
+    if(pict->pict_type == AV_PICTURE_TYPE_I){
+        s->m.current_picture.mb_var_sum= coef_sum;
+        s->m.current_picture.mc_mb_var_sum= 0;
+    }else{
+        s->m.current_picture.mc_mb_var_sum= coef_sum;
+        s->m.current_picture.mb_var_sum= 0;
+    }
+
+    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
+    if (pict->quality < 0)
+        return INT_MIN;
+    s->lambda= pict->quality * 3/2;
+    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
+    s->qlog+= delta_qlog;
+    return delta_qlog;
+}
+
+static void calculate_visual_weight(SnowContext *s, Plane *p){
+    int width = p->width;
+    int height= p->height;
+    int level, orientation, x, y;
+
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &p->band[level][orientation];
+            IDWTELEM *ibuf= b->ibuf;
+            int64_t error=0;
+
+            memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
+            ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
+                    error += d*d;
+                }
+            }
+
+            b->qlog= (int)(QROOT * log2(352256.0/sqrt(error)) + 0.5);
+        }
+    }
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    AVFrame *pic;
+    const int width= s->avctx->width;
+    const int height= s->avctx->height;
+    int level, orientation, plane_index, i, y, ret;
+    uint8_t rc_header_bak[sizeof(s->header_state)];
+    uint8_t rc_block_bak[sizeof(s->block_state)];
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->b_width*s->b_height*MB_SIZE*MB_SIZE*3 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    ff_init_range_encoder(c, pkt->data, pkt->size);
+    ff_build_rac_states(c, (1LL<<32)/20, 256-8);
+
+    for(i=0; i < s->nb_planes; i++){
+        int hshift= i ? s->chroma_h_shift : 0;
+        int vshift= i ? s->chroma_v_shift : 0;
+        for(y=0; y<AV_CEIL_RSHIFT(height, vshift); y++)
+            memcpy(&s->input_picture->data[i][y * s->input_picture->linesize[i]],
+                   &pict->data[i][y * pict->linesize[i]],
+                   AV_CEIL_RSHIFT(width, hshift));
+        s->mpvencdsp.draw_edges(s->input_picture->data[i], s->input_picture->linesize[i],
+                                AV_CEIL_RSHIFT(width, hshift), AV_CEIL_RSHIFT(height, vshift),
+                                EDGE_WIDTH >> hshift, EDGE_WIDTH >> vshift,
+                                EDGE_TOP | EDGE_BOTTOM);
+
+    }
+    emms_c();
+    pic = s->input_picture;
+    pic->pict_type = pict->pict_type;
+    pic->quality = pict->quality;
+
+    s->m.picture_number= avctx->frame_number;
+    if(avctx->flags&AV_CODEC_FLAG_PASS2){
+        s->m.pict_type = pic->pict_type = s->m.rc_context.entry[avctx->frame_number].new_pict_type;
+        s->keyframe = pic->pict_type == AV_PICTURE_TYPE_I;
+        if(!(avctx->flags&AV_CODEC_FLAG_QSCALE)) {
+            pic->quality = ff_rate_estimate_qscale(&s->m, 0);
+            if (pic->quality < 0)
+                return -1;
+        }
+    }else{
+        s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
+        s->m.pict_type = pic->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    }
+
+    if(s->pass1_rc && avctx->frame_number == 0)
+        pic->quality = 2*FF_QP2LAMBDA;
+    if (pic->quality) {
+        s->qlog   = qscale2qlog(pic->quality);
+        s->lambda = pic->quality * 3/2;
+    }
+    if (s->qlog < 0 || (!pic->quality && (avctx->flags & AV_CODEC_FLAG_QSCALE))) {
+        s->qlog= LOSSLESS_QLOG;
+        s->lambda = 0;
+    }//else keep previous frame's qlog until after motion estimation
+
+    if (s->current_picture->data[0]) {
+        int w = s->avctx->width;
+        int h = s->avctx->height;
+
+        s->mpvencdsp.draw_edges(s->current_picture->data[0],
+                                s->current_picture->linesize[0], w   , h   ,
+                                EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
+        if (s->current_picture->data[2]) {
+            s->mpvencdsp.draw_edges(s->current_picture->data[1],
+                                    s->current_picture->linesize[1], w>>s->chroma_h_shift, h>>s->chroma_v_shift,
+                                    EDGE_WIDTH>>s->chroma_h_shift, EDGE_WIDTH>>s->chroma_v_shift, EDGE_TOP | EDGE_BOTTOM);
+            s->mpvencdsp.draw_edges(s->current_picture->data[2],
+                                    s->current_picture->linesize[2], w>>s->chroma_h_shift, h>>s->chroma_v_shift,
+                                    EDGE_WIDTH>>s->chroma_h_shift, EDGE_WIDTH>>s->chroma_v_shift, EDGE_TOP | EDGE_BOTTOM);
+        }
+        emms_c();
+    }
+
+    ff_snow_frame_start(s);
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    av_frame_unref(avctx->coded_frame);
+    ret = av_frame_ref(avctx->coded_frame, s->current_picture);
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    if (ret < 0)
+        return ret;
+
+    s->m.current_picture_ptr= &s->m.current_picture;
+    s->m.current_picture.f = s->current_picture;
+    s->m.current_picture.f->pts = pict->pts;
+    if(pic->pict_type == AV_PICTURE_TYPE_P){
+        int block_width = (width +15)>>4;
+        int block_height= (height+15)>>4;
+        int stride= s->current_picture->linesize[0];
+
+        av_assert0(s->current_picture->data[0]);
+        av_assert0(s->last_picture[0]->data[0]);
+
+        s->m.avctx= s->avctx;
+        s->m.   last_picture.f = s->last_picture[0];
+        s->m.    new_picture.f = s->input_picture;
+        s->m.   last_picture_ptr= &s->m.   last_picture;
+        s->m.linesize = stride;
+        s->m.uvlinesize= s->current_picture->linesize[1];
+        s->m.width = width;
+        s->m.height= height;
+        s->m.mb_width = block_width;
+        s->m.mb_height= block_height;
+        s->m.mb_stride=   s->m.mb_width+1;
+        s->m.b8_stride= 2*s->m.mb_width+1;
+        s->m.f_code=1;
+        s->m.pict_type = pic->pict_type;
+        s->m.motion_est= s->motion_est;
+        s->m.me.scene_change_score=0;
+        s->m.me.dia_size = avctx->dia_size;
+        s->m.quarter_sample= (s->avctx->flags & AV_CODEC_FLAG_QPEL)!=0;
+        s->m.out_format= FMT_H263;
+        s->m.unrestricted_mv= 1;
+
+        s->m.lambda = s->lambda;
+        s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
+        s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
+
+        s->m.mecc= s->mecc; //move
+        s->m.qdsp= s->qdsp; //move
+        s->m.hdsp = s->hdsp;
+        ff_init_me(&s->m);
+        s->hdsp = s->m.hdsp;
+        s->mecc= s->m.mecc;
+    }
+
+    if(s->pass1_rc){
+        memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
+        memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
+    }
+
+redo_frame:
+
+    s->spatial_decomposition_count= 5;
+
+    while(   !(width >>(s->chroma_h_shift + s->spatial_decomposition_count))
+          || !(height>>(s->chroma_v_shift + s->spatial_decomposition_count)))
+        s->spatial_decomposition_count--;
+
+    if (s->spatial_decomposition_count <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Resolution too low\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->m.pict_type = pic->pict_type;
+    s->qbias = pic->pict_type == AV_PICTURE_TYPE_P ? 2 : 0;
+
+    ff_snow_common_init_after_header(avctx);
+
+    if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
+        for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+            calculate_visual_weight(s, &s->plane[plane_index]);
+        }
+    }
+
+    encode_header(s);
+    s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    encode_blocks(s, 1);
+    s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+//        int bits= put_bits_count(&s->c.pb);
+
+        if (!s->memc_only) {
+            //FIXME optimize
+            if(pict->data[plane_index]) //FIXME gray hack
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
+                    }
+                }
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+            if(s->avctx->scenechange_threshold)
+                s->scenechange_threshold = s->avctx->scenechange_threshold;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+            if(   plane_index==0
+               && pic->pict_type == AV_PICTURE_TYPE_P
+               && !(avctx->flags&AV_CODEC_FLAG_PASS2)
+               && s->m.me.scene_change_score > s->scenechange_threshold){
+                ff_init_range_encoder(c, pkt->data, pkt->size);
+                ff_build_rac_states(c, (1LL<<32)/20, 256-8);
+                pic->pict_type= AV_PICTURE_TYPE_I;
+                s->keyframe=1;
+                s->current_picture->key_frame=1;
+                goto redo_frame;
+            }
+
+            if(s->qlog == LOSSLESS_QLOG){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
+                    }
+                }
+            }else{
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
+                    }
+                }
+            }
+
+            ff_spatial_dwt(s->spatial_dwt_buffer, s->temp_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+
+            if(s->pass1_rc && plane_index==0){
+                int delta_qlog = ratecontrol_1pass(s, pic);
+                if (delta_qlog <= INT_MIN)
+                    return -1;
+                if(delta_qlog){
+                    //reordering qlog in the bitstream would eliminate this reset
+                    ff_init_range_encoder(c, pkt->data, pkt->size);
+                    memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
+                    memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
+                    encode_header(s);
+                    encode_blocks(s, 0);
+                }
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+
+                    quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
+                    if(orientation==0)
+                        decorrelate(s, b, b->ibuf, b->stride, pic->pict_type == AV_PICTURE_TYPE_P, 0);
+                    if (!s->no_bitstream)
+                    encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
+                    av_assert0(b->parent==NULL || b->parent->stride == b->stride*2);
+                    if(orientation==0)
+                        correlate(s, b, b->ibuf, b->stride, 1, 0);
+                }
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+
+                    dequantize(s, b, b->ibuf, b->stride);
+                }
+            }
+
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            if(s->qlog == LOSSLESS_QLOG){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
+                    }
+                }
+            }
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+        }else{
+            //ME/MC only
+            if(pic->pict_type == AV_PICTURE_TYPE_I){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x]=
+                            pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                    }
+                }
+            }else{
+                memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
+                predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+            }
+        }
+        if(s->avctx->flags&AV_CODEC_FLAG_PSNR){
+            int64_t error= 0;
+
+            if(pict->data[plane_index]) //FIXME gray hack
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        int d= s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                        error += d*d;
+                    }
+                }
+            s->avctx->error[plane_index] += error;
+            s->encoding_error[plane_index] = error;
+        }
+
+    }
+    emms_c();
+
+    update_last_header_values(s);
+
+    ff_snow_release_buffer(avctx);
+
+    s->current_picture->coded_picture_number = avctx->frame_number;
+    s->current_picture->pict_type = pic->pict_type;
+    s->current_picture->quality = pic->quality;
+    s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
+    s->m.current_picture.f->display_picture_number =
+    s->m.current_picture.f->coded_picture_number   = avctx->frame_number;
+    s->m.current_picture.f->quality                = pic->quality;
+    s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
+    if(s->pass1_rc)
+        if (ff_rate_estimate_qscale(&s->m, 0) < 0)
+            return -1;
+    if(avctx->flags&AV_CODEC_FLAG_PASS1)
+        ff_write_pass1_stats(&s->m);
+    s->m.last_pict_type = s->m.pict_type;
+#if FF_API_STAT_BITS
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->frame_bits = s->m.frame_bits;
+    avctx->mv_bits = s->m.mv_bits;
+    avctx->misc_bits = s->m.misc_bits;
+    avctx->p_tex_bits = s->m.p_tex_bits;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    emms_c();
+
+    ff_side_data_set_encoder_stats(pkt, s->current_picture->quality,
+                                   s->encoding_error,
+                                   (s->avctx->flags&AV_CODEC_FLAG_PSNR) ? 4 : 0,
+                                   s->current_picture->pict_type);
+
+#if FF_API_ERROR_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    memcpy(s->current_picture->error, s->encoding_error, sizeof(s->encoding_error));
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    pkt->size = ff_rac_terminate(c, 0);
+    if (s->current_picture->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int encode_end(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+
+    ff_snow_common_end(s);
+    ff_rate_control_uninit(&s->m);
+    av_frame_free(&s->input_picture);
+    av_freep(&avctx->stats_out);
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(SnowContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    {"motion_est", "motion estimation algorithm", OFFSET(motion_est), AV_OPT_TYPE_INT, {.i64 = FF_ME_EPZS }, FF_ME_ZERO, FF_ME_ITER, VE, "motion_est" },
+    { "zero", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_ZERO }, 0, 0, VE, "motion_est" },
+    { "epzs", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_EPZS }, 0, 0, VE, "motion_est" },
+    { "xone", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_XONE }, 0, 0, VE, "motion_est" },
+    { "iter", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_ITER }, 0, 0, VE, "motion_est" },
+    { "memc_only",      "Only do ME/MC (I frames -> ref, P frame -> ME+MC).",   OFFSET(memc_only), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "no_bitstream",   "Skip final bitstream writeout.",                    OFFSET(no_bitstream), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "intra_penalty",  "Penalty for intra blocks in block decission",      OFFSET(intra_penalty), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "iterative_dia_size",  "Dia size for the iterative ME",          OFFSET(iterative_dia_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "sc_threshold",   "Scene change threshold",                   OFFSET(scenechange_threshold), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, VE },
+    { "pred",           "Spatial decomposition type",                                OFFSET(pred), AV_OPT_TYPE_INT, { .i64 = 0 }, DWT_97, DWT_53, VE, "pred" },
+        { "dwt97", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, INT_MIN, INT_MAX, VE, "pred" },
+        { "dwt53", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, VE, "pred" },
+    { NULL },
+};
+
+static const AVClass snowenc_class = {
+    .class_name = "snow encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_snow_encoder = {
+    .name           = "snow",
+    .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SNOW,
+    .priv_data_size = sizeof(SnowContext),
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &snowenc_class,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/sonic.c b/libavcodec/sonic.c
new file mode 100644
index 0000000..34d2952
--- /dev/null
+++ b/libavcodec/sonic.c
@@ -0,0 +1,1126 @@
+/*
+ * Simple free lossless/lossy audio codec
+ * Copyright (c) 2004 Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "get_bits.h"
+#include "golomb.h"
+#include "internal.h"
+#include "rangecoder.h"
+
+
+/**
+ * @file
+ * Simple free lossless/lossy audio codec
+ * Based on Paul Francis Harrison's Bonk (http://www.logarithmic.net/pfh/bonk)
+ * Written and designed by Alex Beregszaszi
+ *
+ * TODO:
+ *  - CABAC put/get_symbol
+ *  - independent quantizer for channels
+ *  - >2 channels support
+ *  - more decorrelation types
+ *  - more tap_quant tests
+ *  - selectable intlist writers/readers (bonk-style, golomb, cabac)
+ */
+
+#define MAX_CHANNELS 2
+
+#define MID_SIDE 0
+#define LEFT_SIDE 1
+#define RIGHT_SIDE 2
+
+typedef struct SonicContext {
+    int version;
+    int minor_version;
+    int lossless, decorrelation;
+
+    int num_taps, downsampling;
+    double quantization;
+
+    int channels, samplerate, block_align, frame_size;
+
+    int *tap_quant;
+    int *int_samples;
+    int *coded_samples[MAX_CHANNELS];
+
+    // for encoding
+    int *tail;
+    int tail_size;
+    int *window;
+    int window_size;
+
+    // for decoding
+    int *predictor_k;
+    int *predictor_state[MAX_CHANNELS];
+} SonicContext;
+
+#define LATTICE_SHIFT   10
+#define SAMPLE_SHIFT    4
+#define LATTICE_FACTOR  (1 << LATTICE_SHIFT)
+#define SAMPLE_FACTOR   (1 << SAMPLE_SHIFT)
+
+#define BASE_QUANT      0.6
+#define RATE_VARIATION  3.0
+
+static inline int shift(int a,int b)
+{
+    return (a+(1<<(b-1))) >> b;
+}
+
+static inline int shift_down(int a,int b)
+{
+    return (a>>b)+(a<0);
+}
+
+static av_always_inline av_flatten void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed, uint64_t rc_stat[256][2], uint64_t rc_stat2[32][2]){
+    int i;
+
+#define put_rac(C,S,B) \
+do{\
+    if(rc_stat){\
+        rc_stat[*(S)][B]++;\
+        rc_stat2[(S)-state][B]++;\
+    }\
+    put_rac(C,S,B);\
+}while(0)
+
+    if(v){
+        const int a= FFABS(v);
+        const int e= av_log2(a);
+        put_rac(c, state+0, 0);
+        if(e<=9){
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+i, 1);  //1..10
+            }
+            put_rac(c, state+1+i, 0);
+
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+i, (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + e, v < 0); //11..21
+        }else{
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
+            }
+            put_rac(c, state+1+9, 0);
+
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + 10, v < 0); //11..21
+        }
+    }else{
+        put_rac(c, state+0, 1);
+    }
+#undef put_rac
+}
+
+static inline av_flatten int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    if(get_rac(c, state+0))
+        return 0;
+    else{
+        int i, e, a;
+        e= 0;
+        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
+            e++;
+        }
+
+        a= 1;
+        for(i=e-1; i>=0; i--){
+            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+        }
+
+        e= -(is_signed && get_rac(c, state+11 + FFMIN(e, 10))); //11..21
+        return (a^e)-e;
+    }
+}
+
+#if 1
+static inline int intlist_write(RangeCoder *c, uint8_t *state, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        put_symbol(c, state, buf[i], 1, NULL, NULL);
+
+    return 1;
+}
+
+static inline int intlist_read(RangeCoder *c, uint8_t *state, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        buf[i] = get_symbol(c, state, 1);
+
+    return 1;
+}
+#elif 1
+static inline int intlist_write(PutBitContext *pb, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        set_se_golomb(pb, buf[i]);
+
+    return 1;
+}
+
+static inline int intlist_read(GetBitContext *gb, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        buf[i] = get_se_golomb(gb);
+
+    return 1;
+}
+
+#else
+
+#define ADAPT_LEVEL 8
+
+static int bits_to_store(uint64_t x)
+{
+    int res = 0;
+
+    while(x)
+    {
+        res++;
+        x >>= 1;
+    }
+    return res;
+}
+
+static void write_uint_max(PutBitContext *pb, unsigned int value, unsigned int max)
+{
+    int i, bits;
+
+    if (!max)
+        return;
+
+    bits = bits_to_store(max);
+
+    for (i = 0; i < bits-1; i++)
+        put_bits(pb, 1, value & (1 << i));
+
+    if ( (value | (1 << (bits-1))) <= max)
+        put_bits(pb, 1, value & (1 << (bits-1)));
+}
+
+static unsigned int read_uint_max(GetBitContext *gb, int max)
+{
+    int i, bits, value = 0;
+
+    if (!max)
+        return 0;
+
+    bits = bits_to_store(max);
+
+    for (i = 0; i < bits-1; i++)
+        if (get_bits1(gb))
+            value += 1 << i;
+
+    if ( (value | (1<<(bits-1))) <= max)
+        if (get_bits1(gb))
+            value += 1 << (bits-1);
+
+    return value;
+}
+
+static int intlist_write(PutBitContext *pb, int *buf, int entries, int base_2_part)
+{
+    int i, j, x = 0, low_bits = 0, max = 0;
+    int step = 256, pos = 0, dominant = 0, any = 0;
+    int *copy, *bits;
+
+    copy = av_calloc(entries, sizeof(*copy));
+    if (!copy)
+        return AVERROR(ENOMEM);
+
+    if (base_2_part)
+    {
+        int energy = 0;
+
+        for (i = 0; i < entries; i++)
+            energy += abs(buf[i]);
+
+        low_bits = bits_to_store(energy / (entries * 2));
+        if (low_bits > 15)
+            low_bits = 15;
+
+        put_bits(pb, 4, low_bits);
+    }
+
+    for (i = 0; i < entries; i++)
+    {
+        put_bits(pb, low_bits, abs(buf[i]));
+        copy[i] = abs(buf[i]) >> low_bits;
+        if (copy[i] > max)
+            max = abs(copy[i]);
+    }
+
+    bits = av_calloc(entries*max, sizeof(*bits));
+    if (!bits)
+    {
+        av_free(copy);
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i <= max; i++)
+    {
+        for (j = 0; j < entries; j++)
+            if (copy[j] >= i)
+                bits[x++] = copy[j] > i;
+    }
+
+    // store bitstream
+    while (pos < x)
+    {
+        int steplet = step >> 8;
+
+        if (pos + steplet > x)
+            steplet = x - pos;
+
+        for (i = 0; i < steplet; i++)
+            if (bits[i+pos] != dominant)
+                any = 1;
+
+        put_bits(pb, 1, any);
+
+        if (!any)
+        {
+            pos += steplet;
+            step += step / ADAPT_LEVEL;
+        }
+        else
+        {
+            int interloper = 0;
+
+            while (((pos + interloper) < x) && (bits[pos + interloper] == dominant))
+                interloper++;
+
+            // note change
+            write_uint_max(pb, interloper, (step >> 8) - 1);
+
+            pos += interloper + 1;
+            step -= step / ADAPT_LEVEL;
+        }
+
+        if (step < 256)
+        {
+            step = 65536 / step;
+            dominant = !dominant;
+        }
+    }
+
+    // store signs
+    for (i = 0; i < entries; i++)
+        if (buf[i])
+            put_bits(pb, 1, buf[i] < 0);
+
+    av_free(bits);
+    av_free(copy);
+
+    return 0;
+}
+
+static int intlist_read(GetBitContext *gb, int *buf, int entries, int base_2_part)
+{
+    int i, low_bits = 0, x = 0;
+    int n_zeros = 0, step = 256, dominant = 0;
+    int pos = 0, level = 0;
+    int *bits = av_calloc(entries, sizeof(*bits));
+
+    if (!bits)
+        return AVERROR(ENOMEM);
+
+    if (base_2_part)
+    {
+        low_bits = get_bits(gb, 4);
+
+        if (low_bits)
+            for (i = 0; i < entries; i++)
+                buf[i] = get_bits(gb, low_bits);
+    }
+
+//    av_log(NULL, AV_LOG_INFO, "entries: %d, low bits: %d\n", entries, low_bits);
+
+    while (n_zeros < entries)
+    {
+        int steplet = step >> 8;
+
+        if (!get_bits1(gb))
+        {
+            for (i = 0; i < steplet; i++)
+                bits[x++] = dominant;
+
+            if (!dominant)
+                n_zeros += steplet;
+
+            step += step / ADAPT_LEVEL;
+        }
+        else
+        {
+            int actual_run = read_uint_max(gb, steplet-1);
+
+//            av_log(NULL, AV_LOG_INFO, "actual run: %d\n", actual_run);
+
+            for (i = 0; i < actual_run; i++)
+                bits[x++] = dominant;
+
+            bits[x++] = !dominant;
+
+            if (!dominant)
+                n_zeros += actual_run;
+            else
+                n_zeros++;
+
+            step -= step / ADAPT_LEVEL;
+        }
+
+        if (step < 256)
+        {
+            step = 65536 / step;
+            dominant = !dominant;
+        }
+    }
+
+    // reconstruct unsigned values
+    n_zeros = 0;
+    for (i = 0; n_zeros < entries; i++)
+    {
+        while(1)
+        {
+            if (pos >= entries)
+            {
+                pos = 0;
+                level += 1 << low_bits;
+            }
+
+            if (buf[pos] >= level)
+                break;
+
+            pos++;
+        }
+
+        if (bits[i])
+            buf[pos] += 1 << low_bits;
+        else
+            n_zeros++;
+
+        pos++;
+    }
+    av_free(bits);
+
+    // read signs
+    for (i = 0; i < entries; i++)
+        if (buf[i] && get_bits1(gb))
+            buf[i] = -buf[i];
+
+//    av_log(NULL, AV_LOG_INFO, "zeros: %d pos: %d\n", n_zeros, pos);
+
+    return 0;
+}
+#endif
+
+static void predictor_init_state(int *k, int *state, int order)
+{
+    int i;
+
+    for (i = order-2; i >= 0; i--)
+    {
+        int j, p, x = state[i];
+
+        for (j = 0, p = i+1; p < order; j++,p++)
+            {
+            int tmp = x + shift_down(k[j] * state[p], LATTICE_SHIFT);
+            state[p] += shift_down(k[j]*x, LATTICE_SHIFT);
+            x = tmp;
+        }
+    }
+}
+
+static int predictor_calc_error(int *k, int *state, int order, int error)
+{
+    int i, x = error - shift_down(k[order-1] * state[order-1], LATTICE_SHIFT);
+
+#if 1
+    int *k_ptr = &(k[order-2]),
+        *state_ptr = &(state[order-2]);
+    for (i = order-2; i >= 0; i--, k_ptr--, state_ptr--)
+    {
+        int k_value = *k_ptr, state_value = *state_ptr;
+        x -= shift_down(k_value * state_value, LATTICE_SHIFT);
+        state_ptr[1] = state_value + shift_down(k_value * x, LATTICE_SHIFT);
+    }
+#else
+    for (i = order-2; i >= 0; i--)
+    {
+        x -= shift_down(k[i] * state[i], LATTICE_SHIFT);
+        state[i+1] = state[i] + shift_down(k[i] * x, LATTICE_SHIFT);
+    }
+#endif
+
+    // don't drift too far, to avoid overflows
+    if (x >  (SAMPLE_FACTOR<<16)) x =  (SAMPLE_FACTOR<<16);
+    if (x < -(SAMPLE_FACTOR<<16)) x = -(SAMPLE_FACTOR<<16);
+
+    state[0] = x;
+
+    return x;
+}
+
+#if CONFIG_SONIC_ENCODER || CONFIG_SONIC_LS_ENCODER
+// Heavily modified Levinson-Durbin algorithm which
+// copes better with quantization, and calculates the
+// actual whitened result as it goes.
+
+static int modified_levinson_durbin(int *window, int window_entries,
+        int *out, int out_entries, int channels, int *tap_quant)
+{
+    int i;
+    int *state = av_calloc(window_entries, sizeof(*state));
+
+    if (!state)
+        return AVERROR(ENOMEM);
+
+    memcpy(state, window, 4* window_entries);
+
+    for (i = 0; i < out_entries; i++)
+    {
+        int step = (i+1)*channels, k, j;
+        double xx = 0.0, xy = 0.0;
+#if 1
+        int *x_ptr = &(window[step]);
+        int *state_ptr = &(state[0]);
+        j = window_entries - step;
+        for (;j>0;j--,x_ptr++,state_ptr++)
+        {
+            double x_value = *x_ptr;
+            double state_value = *state_ptr;
+            xx += state_value*state_value;
+            xy += x_value*state_value;
+        }
+#else
+        for (j = 0; j <= (window_entries - step); j++);
+        {
+            double stepval = window[step+j];
+            double stateval = window[j];
+//            xx += (double)window[j]*(double)window[j];
+//            xy += (double)window[step+j]*(double)window[j];
+            xx += stateval*stateval;
+            xy += stepval*stateval;
+        }
+#endif
+        if (xx == 0.0)
+            k = 0;
+        else
+            k = (int)(floor(-xy/xx * (double)LATTICE_FACTOR / (double)(tap_quant[i]) + 0.5));
+
+        if (k > (LATTICE_FACTOR/tap_quant[i]))
+            k = LATTICE_FACTOR/tap_quant[i];
+        if (-k > (LATTICE_FACTOR/tap_quant[i]))
+            k = -(LATTICE_FACTOR/tap_quant[i]);
+
+        out[i] = k;
+        k *= tap_quant[i];
+
+#if 1
+        x_ptr = &(window[step]);
+        state_ptr = &(state[0]);
+        j = window_entries - step;
+        for (;j>0;j--,x_ptr++,state_ptr++)
+        {
+            int x_value = *x_ptr;
+            int state_value = *state_ptr;
+            *x_ptr = x_value + shift_down(k*state_value,LATTICE_SHIFT);
+            *state_ptr = state_value + shift_down(k*x_value, LATTICE_SHIFT);
+        }
+#else
+        for (j=0; j <= (window_entries - step); j++)
+        {
+            int stepval = window[step+j];
+            int stateval=state[j];
+            window[step+j] += shift_down(k * stateval, LATTICE_SHIFT);
+            state[j] += shift_down(k * stepval, LATTICE_SHIFT);
+        }
+#endif
+    }
+
+    av_free(state);
+    return 0;
+}
+
+static inline int code_samplerate(int samplerate)
+{
+    switch (samplerate)
+    {
+        case 44100: return 0;
+        case 22050: return 1;
+        case 11025: return 2;
+        case 96000: return 3;
+        case 48000: return 4;
+        case 32000: return 5;
+        case 24000: return 6;
+        case 16000: return 7;
+        case 8000: return 8;
+    }
+    return AVERROR(EINVAL);
+}
+
+static av_cold int sonic_encode_init(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    PutBitContext pb;
+    int i;
+
+    s->version = 2;
+
+    if (avctx->channels > MAX_CHANNELS)
+    {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo streams are supported by now\n");
+        return AVERROR(EINVAL); /* only stereo or mono for now */
+    }
+
+    if (avctx->channels == 2)
+        s->decorrelation = MID_SIDE;
+    else
+        s->decorrelation = 3;
+
+    if (avctx->codec->id == AV_CODEC_ID_SONIC_LS)
+    {
+        s->lossless = 1;
+        s->num_taps = 32;
+        s->downsampling = 1;
+        s->quantization = 0.0;
+    }
+    else
+    {
+        s->num_taps = 128;
+        s->downsampling = 2;
+        s->quantization = 1.0;
+    }
+
+    // max tap 2048
+    if (s->num_taps < 32 || s->num_taps > 1024 || s->num_taps % 32) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of taps\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // generate taps
+    s->tap_quant = av_calloc(s->num_taps, sizeof(*s->tap_quant));
+    if (!s->tap_quant)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->num_taps; i++)
+        s->tap_quant[i] = ff_sqrt(i+1);
+
+    s->channels = avctx->channels;
+    s->samplerate = avctx->sample_rate;
+
+    s->block_align = 2048LL*s->samplerate/(44100*s->downsampling);
+    s->frame_size = s->channels*s->block_align*s->downsampling;
+
+    s->tail_size = s->num_taps*s->channels;
+    s->tail = av_calloc(s->tail_size, sizeof(*s->tail));
+    if (!s->tail)
+        return AVERROR(ENOMEM);
+
+    s->predictor_k = av_calloc(s->num_taps, sizeof(*s->predictor_k) );
+    if (!s->predictor_k)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->channels; i++)
+    {
+        s->coded_samples[i] = av_calloc(s->block_align, sizeof(**s->coded_samples));
+        if (!s->coded_samples[i])
+            return AVERROR(ENOMEM);
+    }
+
+    s->int_samples = av_calloc(s->frame_size, sizeof(*s->int_samples));
+
+    s->window_size = ((2*s->tail_size)+s->frame_size);
+    s->window = av_calloc(s->window_size, sizeof(*s->window));
+    if (!s->window || !s->int_samples)
+        return AVERROR(ENOMEM);
+
+    avctx->extradata = av_mallocz(16);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+    init_put_bits(&pb, avctx->extradata, 16*8);
+
+    put_bits(&pb, 2, s->version); // version
+    if (s->version >= 1)
+    {
+        if (s->version >= 2) {
+            put_bits(&pb, 8, s->version);
+            put_bits(&pb, 8, s->minor_version);
+        }
+        put_bits(&pb, 2, s->channels);
+        put_bits(&pb, 4, code_samplerate(s->samplerate));
+    }
+    put_bits(&pb, 1, s->lossless);
+    if (!s->lossless)
+        put_bits(&pb, 3, SAMPLE_SHIFT); // XXX FIXME: sample precision
+    put_bits(&pb, 2, s->decorrelation);
+    put_bits(&pb, 2, s->downsampling);
+    put_bits(&pb, 5, (s->num_taps >> 5)-1); // 32..1024
+    put_bits(&pb, 1, 0); // XXX FIXME: no custom tap quant table
+
+    flush_put_bits(&pb);
+    avctx->extradata_size = put_bits_count(&pb)/8;
+
+    av_log(avctx, AV_LOG_INFO, "Sonic: ver: %d.%d ls: %d dr: %d taps: %d block: %d frame: %d downsamp: %d\n",
+        s->version, s->minor_version, s->lossless, s->decorrelation, s->num_taps, s->block_align, s->frame_size, s->downsampling);
+
+    avctx->frame_size = s->block_align*s->downsampling;
+
+    return 0;
+}
+
+static av_cold int sonic_encode_close(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->channels; i++)
+        av_freep(&s->coded_samples[i]);
+
+    av_freep(&s->predictor_k);
+    av_freep(&s->tail);
+    av_freep(&s->tap_quant);
+    av_freep(&s->window);
+    av_freep(&s->int_samples);
+
+    return 0;
+}
+
+static int sonic_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                              const AVFrame *frame, int *got_packet_ptr)
+{
+    SonicContext *s = avctx->priv_data;
+    RangeCoder c;
+    int i, j, ch, quant = 0, x = 0;
+    int ret;
+    const short *samples = (const int16_t*)frame->data[0];
+    uint8_t state[32];
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->frame_size * 5 + 1000, 0)) < 0)
+        return ret;
+
+    ff_init_range_encoder(&c, avpkt->data, avpkt->size);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 256-8);
+    memset(state, 128, sizeof(state));
+
+    // short -> internal
+    for (i = 0; i < s->frame_size; i++)
+        s->int_samples[i] = samples[i];
+
+    if (!s->lossless)
+        for (i = 0; i < s->frame_size; i++)
+            s->int_samples[i] = s->int_samples[i] << SAMPLE_SHIFT;
+
+    switch(s->decorrelation)
+    {
+        case MID_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+            {
+                s->int_samples[i] += s->int_samples[i+1];
+                s->int_samples[i+1] -= shift(s->int_samples[i], 1);
+            }
+            break;
+        case LEFT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i+1] -= s->int_samples[i];
+            break;
+        case RIGHT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i] -= s->int_samples[i+1];
+            break;
+    }
+
+    memset(s->window, 0, 4* s->window_size);
+
+    for (i = 0; i < s->tail_size; i++)
+        s->window[x++] = s->tail[i];
+
+    for (i = 0; i < s->frame_size; i++)
+        s->window[x++] = s->int_samples[i];
+
+    for (i = 0; i < s->tail_size; i++)
+        s->window[x++] = 0;
+
+    for (i = 0; i < s->tail_size; i++)
+        s->tail[i] = s->int_samples[s->frame_size - s->tail_size + i];
+
+    // generate taps
+    ret = modified_levinson_durbin(s->window, s->window_size,
+                s->predictor_k, s->num_taps, s->channels, s->tap_quant);
+    if (ret < 0)
+        return ret;
+
+    if ((ret = intlist_write(&c, state, s->predictor_k, s->num_taps, 0)) < 0)
+        return ret;
+
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        x = s->tail_size+ch;
+        for (i = 0; i < s->block_align; i++)
+        {
+            int sum = 0;
+            for (j = 0; j < s->downsampling; j++, x += s->channels)
+                sum += s->window[x];
+            s->coded_samples[ch][i] = sum;
+        }
+    }
+
+    // simple rate control code
+    if (!s->lossless)
+    {
+        double energy1 = 0.0, energy2 = 0.0;
+        for (ch = 0; ch < s->channels; ch++)
+        {
+            for (i = 0; i < s->block_align; i++)
+            {
+                double sample = s->coded_samples[ch][i];
+                energy2 += sample*sample;
+                energy1 += fabs(sample);
+            }
+        }
+
+        energy2 = sqrt(energy2/(s->channels*s->block_align));
+        energy1 = M_SQRT2*energy1/(s->channels*s->block_align);
+
+        // increase bitrate when samples are like a gaussian distribution
+        // reduce bitrate when samples are like a two-tailed exponential distribution
+
+        if (energy2 > energy1)
+            energy2 += (energy2-energy1)*RATE_VARIATION;
+
+        quant = (int)(BASE_QUANT*s->quantization*energy2/SAMPLE_FACTOR);
+//        av_log(avctx, AV_LOG_DEBUG, "quant: %d energy: %f / %f\n", quant, energy1, energy2);
+
+        quant = av_clip(quant, 1, 65534);
+
+        put_symbol(&c, state, quant, 0, NULL, NULL);
+
+        quant *= SAMPLE_FACTOR;
+    }
+
+    // write out coded samples
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        if (!s->lossless)
+            for (i = 0; i < s->block_align; i++)
+                s->coded_samples[ch][i] = ROUNDED_DIV(s->coded_samples[ch][i], quant);
+
+        if ((ret = intlist_write(&c, state, s->coded_samples[ch], s->block_align, 1)) < 0)
+            return ret;
+    }
+
+//    av_log(avctx, AV_LOG_DEBUG, "used bytes: %d\n", (put_bits_count(&pb)+7)/8);
+
+    avpkt->size = ff_rac_terminate(&c, 0);
+    *got_packet_ptr = 1;
+    return 0;
+
+}
+#endif /* CONFIG_SONIC_ENCODER || CONFIG_SONIC_LS_ENCODER */
+
+#if CONFIG_SONIC_DECODER
+static const int samplerate_table[] =
+    { 44100, 22050, 11025, 96000, 48000, 32000, 24000, 16000, 8000 };
+
+static av_cold int sonic_decode_init(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    GetBitContext gb;
+    int i;
+    int ret;
+
+    s->channels = avctx->channels;
+    s->samplerate = avctx->sample_rate;
+
+    if (!avctx->extradata)
+    {
+        av_log(avctx, AV_LOG_ERROR, "No mandatory headers present\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size);
+    if (ret < 0)
+        return ret;
+
+    s->version = get_bits(&gb, 2);
+    if (s->version >= 2) {
+        s->version       = get_bits(&gb, 8);
+        s->minor_version = get_bits(&gb, 8);
+    }
+    if (s->version != 2)
+    {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported Sonic version, please report\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->version >= 1)
+    {
+        int sample_rate_index;
+        s->channels = get_bits(&gb, 2);
+        sample_rate_index = get_bits(&gb, 4);
+        if (sample_rate_index >= FF_ARRAY_ELEMS(samplerate_table)) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid sample_rate_index %d\n", sample_rate_index);
+            return AVERROR_INVALIDDATA;
+        }
+        s->samplerate = samplerate_table[sample_rate_index];
+        av_log(avctx, AV_LOG_INFO, "Sonicv2 chans: %d samprate: %d\n",
+            s->channels, s->samplerate);
+    }
+
+    if (s->channels > MAX_CHANNELS || s->channels < 1)
+    {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo streams are supported by now\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->channels = s->channels;
+
+    s->lossless = get_bits1(&gb);
+    if (!s->lossless)
+        skip_bits(&gb, 3); // XXX FIXME
+    s->decorrelation = get_bits(&gb, 2);
+    if (s->decorrelation != 3 && s->channels != 2) {
+        av_log(avctx, AV_LOG_ERROR, "invalid decorrelation %d\n", s->decorrelation);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->downsampling = get_bits(&gb, 2);
+    if (!s->downsampling) {
+        av_log(avctx, AV_LOG_ERROR, "invalid downsampling value\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->num_taps = (get_bits(&gb, 5)+1)<<5;
+    if (get_bits1(&gb)) // XXX FIXME
+        av_log(avctx, AV_LOG_INFO, "Custom quant table\n");
+
+    s->block_align = 2048LL*s->samplerate/(44100*s->downsampling);
+    s->frame_size = s->channels*s->block_align*s->downsampling;
+//    avctx->frame_size = s->block_align;
+
+    if (s->num_taps * s->channels > s->frame_size) {
+        av_log(avctx, AV_LOG_ERROR,
+               "number of taps times channels (%d * %d) larger than frame size %d\n",
+               s->num_taps, s->channels, s->frame_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    av_log(avctx, AV_LOG_INFO, "Sonic: ver: %d.%d ls: %d dr: %d taps: %d block: %d frame: %d downsamp: %d\n",
+        s->version, s->minor_version, s->lossless, s->decorrelation, s->num_taps, s->block_align, s->frame_size, s->downsampling);
+
+    // generate taps
+    s->tap_quant = av_calloc(s->num_taps, sizeof(*s->tap_quant));
+    if (!s->tap_quant)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->num_taps; i++)
+        s->tap_quant[i] = ff_sqrt(i+1);
+
+    s->predictor_k = av_calloc(s->num_taps, sizeof(*s->predictor_k));
+
+    for (i = 0; i < s->channels; i++)
+    {
+        s->predictor_state[i] = av_calloc(s->num_taps, sizeof(**s->predictor_state));
+        if (!s->predictor_state[i])
+            return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < s->channels; i++)
+    {
+        s->coded_samples[i] = av_calloc(s->block_align, sizeof(**s->coded_samples));
+        if (!s->coded_samples[i])
+            return AVERROR(ENOMEM);
+    }
+    s->int_samples = av_calloc(s->frame_size, sizeof(*s->int_samples));
+    if (!s->int_samples)
+        return AVERROR(ENOMEM);
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+}
+
+static av_cold int sonic_decode_close(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    int i;
+
+    av_freep(&s->int_samples);
+    av_freep(&s->tap_quant);
+    av_freep(&s->predictor_k);
+
+    for (i = 0; i < s->channels; i++)
+    {
+        av_freep(&s->predictor_state[i]);
+        av_freep(&s->coded_samples[i]);
+    }
+
+    return 0;
+}
+
+static int sonic_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_frame_ptr,
+                            AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    SonicContext *s = avctx->priv_data;
+    RangeCoder c;
+    uint8_t state[32];
+    int i, quant, ch, j, ret;
+    int16_t *samples;
+    AVFrame *frame = data;
+
+    if (buf_size == 0) return 0;
+
+    frame->nb_samples = s->frame_size / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    samples = (int16_t *)frame->data[0];
+
+//    av_log(NULL, AV_LOG_INFO, "buf_size: %d\n", buf_size);
+
+    memset(state, 128, sizeof(state));
+    ff_init_range_decoder(&c, buf, buf_size);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 256-8);
+
+    intlist_read(&c, state, s->predictor_k, s->num_taps, 0);
+
+    // dequantize
+    for (i = 0; i < s->num_taps; i++)
+        s->predictor_k[i] *= s->tap_quant[i];
+
+    if (s->lossless)
+        quant = 1;
+    else
+        quant = get_symbol(&c, state, 0) * SAMPLE_FACTOR;
+
+//    av_log(NULL, AV_LOG_INFO, "quant: %d\n", quant);
+
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        int x = ch;
+
+        predictor_init_state(s->predictor_k, s->predictor_state[ch], s->num_taps);
+
+        intlist_read(&c, state, s->coded_samples[ch], s->block_align, 1);
+
+        for (i = 0; i < s->block_align; i++)
+        {
+            for (j = 0; j < s->downsampling - 1; j++)
+            {
+                s->int_samples[x] = predictor_calc_error(s->predictor_k, s->predictor_state[ch], s->num_taps, 0);
+                x += s->channels;
+            }
+
+            s->int_samples[x] = predictor_calc_error(s->predictor_k, s->predictor_state[ch], s->num_taps, s->coded_samples[ch][i] * quant);
+            x += s->channels;
+        }
+
+        for (i = 0; i < s->num_taps; i++)
+            s->predictor_state[ch][i] = s->int_samples[s->frame_size - s->channels + ch - i*s->channels];
+    }
+
+    switch(s->decorrelation)
+    {
+        case MID_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+            {
+                s->int_samples[i+1] += shift(s->int_samples[i], 1);
+                s->int_samples[i] -= s->int_samples[i+1];
+            }
+            break;
+        case LEFT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i+1] += s->int_samples[i];
+            break;
+        case RIGHT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i] += s->int_samples[i+1];
+            break;
+    }
+
+    if (!s->lossless)
+        for (i = 0; i < s->frame_size; i++)
+            s->int_samples[i] = shift(s->int_samples[i], SAMPLE_SHIFT);
+
+    // internal -> short
+    for (i = 0; i < s->frame_size; i++)
+        samples[i] = av_clip_int16(s->int_samples[i]);
+
+    *got_frame_ptr = 1;
+
+    return buf_size;
+}
+
+AVCodec ff_sonic_decoder = {
+    .name           = "sonic",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_decode_init,
+    .close          = sonic_decode_close,
+    .decode         = sonic_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_EXPERIMENTAL,
+};
+#endif /* CONFIG_SONIC_DECODER */
+
+#if CONFIG_SONIC_ENCODER
+AVCodec ff_sonic_encoder = {
+    .name           = "sonic",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_encode_init,
+    .encode2        = sonic_encode_frame,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL,
+    .close          = sonic_encode_close,
+};
+#endif
+
+#if CONFIG_SONIC_LS_ENCODER
+AVCodec ff_sonic_ls_encoder = {
+    .name           = "sonicls",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC_LS,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_encode_init,
+    .encode2        = sonic_encode_frame,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL,
+    .close          = sonic_encode_close,
+};
+#endif
diff --git a/libavcodec/sp5x.h b/libavcodec/sp5x.h
index 090662b..21c4571 100644
--- a/libavcodec/sp5x.h
+++ b/libavcodec/sp5x.h
@@ -2,20 +2,20 @@
  * Sunplus JPEG tables
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sp5xdec.c b/libavcodec/sp5xdec.c
index 7f57b63..815f9ad 100644
--- a/libavcodec/sp5xdec.c
+++ b/libavcodec/sp5xdec.c
@@ -2,20 +2,20 @@
  * Sunplus JPEG decoder (SP5X)
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
         for (i = 2; i < buf_size-2 && j < buf_size+1024-2; i++)
             recoded[j++] = buf[i];
     else
-    for (i = 14; i < buf_size && j < buf_size+1024-2; i++)
+    for (i = 14; i < buf_size && j < buf_size+1024-3; i++)
     {
         recoded[j++] = buf[i];
         if (buf[i] == 0xff)
@@ -91,9 +91,10 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
 
     av_free(recoded);
 
-    return i;
+    return i < 0 ? i : avpkt->size;
 }
 
+#if CONFIG_SP5X_DECODER
 AVCodec ff_sp5x_decoder = {
     .name           = "sp5x",
     .long_name      = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
@@ -104,9 +105,11 @@ AVCodec ff_sp5x_decoder = {
     .close          = ff_mjpeg_decode_end,
     .decode         = sp5x_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
-
+#endif
+#if CONFIG_AMV_DECODER
 AVCodec ff_amv_decoder = {
     .name           = "amv",
     .long_name      = NULL_IF_CONFIG_SMALL("AMV Video"),
@@ -116,6 +119,8 @@ AVCodec ff_amv_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = sp5x_decode_frame,
+    .max_lowres     = 3,
     .capabilities   = AV_CODEC_CAP_DR1,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/sparc/README b/libavcodec/sparc/README
new file mode 100644
index 0000000..f9f2349
--- /dev/null
+++ b/libavcodec/sparc/README
@@ -0,0 +1,6 @@
+SPARC optimizations have been removed in
+commit b4dd424d96f09f9bafb88e47f37df65dc4529143
+The last revission with the optimizations is fb1b70c1ed50951c5fc1a309c3c446b2eaaf564b
+
+If you want to maintain these (or other) SPARC optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/speedhq.c b/libavcodec/speedhq.c
new file mode 100644
index 0000000..890b825
--- /dev/null
+++ b/libavcodec/speedhq.c
@@ -0,0 +1,685 @@
+/*
+ * NewTek SpeedHQ codec
+ * Copyright 2017 Steinar H. Gunderson
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * NewTek SpeedHQ decoder.
+ */
+
+#define BITSTREAM_READER_LE
+
+#include "libavutil/attributes.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "libavutil/thread.h"
+#include "mathops.h"
+#include "mpeg12.h"
+#include "mpeg12data.h"
+#include "mpeg12vlc.h"
+
+#define MAX_INDEX (64 - 1)
+
+/*
+ * 5 bits makes for very small tables, with no more than two lookups needed
+ * for the longest (10-bit) codes.
+ */
+#define ALPHA_VLC_BITS 5
+
+typedef struct SHQContext {
+    AVCodecContext *avctx;
+    BlockDSPContext bdsp;
+    IDCTDSPContext idsp;
+    ScanTable intra_scantable;
+    int quant_matrix[64];
+    enum { SHQ_SUBSAMPLING_420, SHQ_SUBSAMPLING_422, SHQ_SUBSAMPLING_444 }
+        subsampling;
+    enum { SHQ_NO_ALPHA, SHQ_RLE_ALPHA, SHQ_DCT_ALPHA } alpha_type;
+} SHQContext;
+
+
+/* AC codes: Very similar but not identical to MPEG-2. */
+static const uint16_t speedhq_vlc[123][2] = {
+    {0x0001,  2}, {0x0003,  3}, {0x000E,  4}, {0x0007,  5},
+    {0x0017,  5}, {0x0028,  6}, {0x0008,  6}, {0x006F,  7},
+    {0x001F,  7}, {0x00C4,  8}, {0x0044,  8}, {0x005F,  8},
+    {0x00DF,  8}, {0x007F,  8}, {0x00FF,  8}, {0x3E00, 14},
+    {0x1E00, 14}, {0x2E00, 14}, {0x0E00, 14}, {0x3600, 14},
+    {0x1600, 14}, {0x2600, 14}, {0x0600, 14}, {0x3A00, 14},
+    {0x1A00, 14}, {0x2A00, 14}, {0x0A00, 14}, {0x3200, 14},
+    {0x1200, 14}, {0x2200, 14}, {0x0200, 14}, {0x0C00, 15},
+    {0x7400, 15}, {0x3400, 15}, {0x5400, 15}, {0x1400, 15},
+    {0x6400, 15}, {0x2400, 15}, {0x4400, 15}, {0x0400, 15},
+    {0x0002,  3}, {0x000C,  5}, {0x004F,  7}, {0x00E4,  8},
+    {0x0004,  8}, {0x0D00, 13}, {0x1500, 13}, {0x7C00, 15},
+    {0x3C00, 15}, {0x5C00, 15}, {0x1C00, 15}, {0x6C00, 15},
+    {0x2C00, 15}, {0x4C00, 15}, {0xC800, 16}, {0x4800, 16},
+    {0x8800, 16}, {0x0800, 16}, {0x0300, 13}, {0x1D00, 13},
+    {0x0014,  5}, {0x0070,  7}, {0x003F,  8}, {0x00C0, 10},
+    {0x0500, 13}, {0x0180, 12}, {0x0280, 12}, {0x0C80, 12},
+    {0x0080, 12}, {0x0B00, 13}, {0x1300, 13}, {0x001C,  5},
+    {0x0064,  8}, {0x0380, 12}, {0x1900, 13}, {0x0D80, 12},
+    {0x0018,  6}, {0x00BF,  8}, {0x0480, 12}, {0x0B80, 12},
+    {0x0038,  6}, {0x0040,  9}, {0x0900, 13}, {0x0030,  7},
+    {0x0780, 12}, {0x2800, 16}, {0x0010,  7}, {0x0A80, 12},
+    {0x0050,  7}, {0x0880, 12}, {0x000F,  7}, {0x1100, 13},
+    {0x002F,  7}, {0x0100, 13}, {0x0084,  8}, {0x5800, 16},
+    {0x00A4,  8}, {0x9800, 16}, {0x0024,  8}, {0x1800, 16},
+    {0x0140,  9}, {0xE800, 16}, {0x01C0,  9}, {0x6800, 16},
+    {0x02C0, 10}, {0xA800, 16}, {0x0F80, 12}, {0x0580, 12},
+    {0x0980, 12}, {0x0E80, 12}, {0x0680, 12}, {0x1F00, 13},
+    {0x0F00, 13}, {0x1700, 13}, {0x0700, 13}, {0x1B00, 13},
+    {0xF800, 16}, {0x7800, 16}, {0xB800, 16}, {0x3800, 16},
+    {0xD800, 16},
+    {0x0020,  6}, /* escape */
+    {0x0006,  4}  /* EOB */
+};
+
+static const uint8_t speedhq_level[121] = {
+     1,  2,  3,  4,  5,  6,  7,  8,
+     9, 10, 11, 12, 13, 14, 15, 16,
+    17, 18, 19, 20, 21, 22, 23, 24,
+    25, 26, 27, 28, 29, 30, 31, 32,
+    33, 34, 35, 36, 37, 38, 39, 40,
+     1,  2,  3,  4,  5,  6,  7,  8,
+     9, 10, 11, 12, 13, 14, 15, 16,
+    17, 18, 19, 20,  1,  2,  3,  4,
+     5,  6,  7,  8,  9, 10, 11,  1,
+     2,  3,  4,  5,  1,  2,  3,  4,
+     1,  2,  3,  1,  2,  3,  1,  2,
+     1,  2,  1,  2,  1,  2,  1,  2,
+     1,  2,  1,  2,  1,  2,  1,  2,
+     1,  2,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,
+     1,
+};
+
+static const uint8_t speedhq_run[121] = {
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  3,
+     3,  3,  3,  3,  4,  4,  4,  4,
+     5,  5,  5,  6,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11,
+    12, 12, 13, 13, 14, 14, 15, 15,
+    16, 16, 17, 18, 19, 20, 21, 22,
+    23, 24, 25, 26, 27, 28, 29, 30,
+    31,
+};
+
+static RLTable ff_rl_speedhq = {
+    121,
+    121,
+    (const uint16_t (*)[])speedhq_vlc,
+    speedhq_run,
+    speedhq_level,
+};
+
+/* NOTE: The first element is always 16, unscaled. */
+static const uint8_t unscaled_quant_matrix[64] = {
+    16, 16, 19, 22, 26, 27, 29, 34,
+    16, 16, 22, 24, 27, 29, 34, 37,
+    19, 22, 26, 27, 29, 34, 34, 38,
+    22, 22, 26, 27, 29, 34, 37, 40,
+    22, 26, 27, 29, 32, 35, 40, 48,
+    26, 27, 29, 32, 35, 40, 48, 58,
+    26, 27, 29, 34, 38, 46, 56, 69,
+    27, 29, 35, 38, 46, 56, 69, 83
+};
+
+static uint8_t ff_speedhq_static_rl_table_store[2][2*MAX_RUN + MAX_LEVEL + 3];
+
+static VLC ff_dc_lum_vlc_le;
+static VLC ff_dc_chroma_vlc_le;
+static VLC ff_dc_alpha_run_vlc_le;
+static VLC ff_dc_alpha_level_vlc_le;
+
+static inline int decode_dc_le(GetBitContext *gb, int component)
+{
+    int code, diff;
+
+    if (component == 0 || component == 3) {
+        code = get_vlc2(gb, ff_dc_lum_vlc_le.table, DC_VLC_BITS, 2);
+    } else {
+        code = get_vlc2(gb, ff_dc_chroma_vlc_le.table, DC_VLC_BITS, 2);
+    }
+    if (code < 0) {
+        av_log(NULL, AV_LOG_ERROR, "invalid dc code at\n");
+        return 0xffff;
+    }
+    if (!code) {
+        diff = 0;
+    } else {
+        diff = get_xbits_le(gb, code);
+    }
+    return diff;
+}
+
+static inline int decode_alpha_block(const SHQContext *s, GetBitContext *gb, uint8_t last_alpha[16], uint8_t *dest, int linesize)
+{
+    uint8_t block[128];
+    int i = 0, x, y;
+
+    memset(block, 0, sizeof(block));
+
+    {
+        OPEN_READER(re, gb);
+
+        for ( ;; ) {
+            int run, level;
+
+            UPDATE_CACHE_LE(re, gb);
+            GET_VLC(run, re, gb, ff_dc_alpha_run_vlc_le.table, ALPHA_VLC_BITS, 2);
+
+            if (run < 0) break;
+            i += run;
+            if (i >= 128)
+                return AVERROR_INVALIDDATA;
+
+            UPDATE_CACHE_LE(re, gb);
+            GET_VLC(level, re, gb, ff_dc_alpha_level_vlc_le.table, ALPHA_VLC_BITS, 2);
+            block[i++] = level;
+        }
+
+        CLOSE_READER(re, gb);
+    }
+
+    for (y = 0; y < 8; y++) {
+        for (x = 0; x < 16; x++) {
+            last_alpha[x] -= block[y * 16 + x];
+        }
+        memcpy(dest, last_alpha, 16);
+        dest += linesize;
+    }
+
+    return 0;
+}
+
+static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int last_dc[4], int component, uint8_t *dest, int linesize)
+{
+    const int *quant_matrix = s->quant_matrix;
+    const uint8_t *scantable = s->intra_scantable.permutated;
+    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    int dc_offset;
+
+    s->bdsp.clear_block(block);
+
+    dc_offset = decode_dc_le(gb, component);
+    last_dc[component] -= dc_offset;  /* Note: Opposite of most codecs. */
+    block[scantable[0]] = last_dc[component];  /* quant_matrix[0] is always 16. */
+
+    /* Read AC coefficients. */
+    {
+        int i = 0;
+        OPEN_READER(re, gb);
+        for ( ;; ) {
+            int level, run;
+            UPDATE_CACHE_LE(re, gb);
+            GET_RL_VLC(level, run, re, gb, ff_rl_speedhq.rl_vlc[0],
+                       TEX_VLC_BITS, 2, 0);
+            if (level == 127) {
+                break;
+            } else if (level) {
+                i += run;
+                if (i > MAX_INDEX)
+                    return AVERROR_INVALIDDATA;
+                /* If next bit is 1, level = -level */
+                level = (level ^ SHOW_SBITS(re, gb, 1)) -
+                        SHOW_SBITS(re, gb, 1);
+                LAST_SKIP_BITS(re, gb, 1);
+            } else {
+                /* Escape. */
+#if MIN_CACHE_BITS < 6 + 6 + 12
+#error MIN_CACHE_BITS is too small for the escape code, add UPDATE_CACHE
+#endif
+                run = SHOW_UBITS(re, gb, 6) + 1;
+                SKIP_BITS(re, gb, 6);
+                level = SHOW_UBITS(re, gb, 12) - 2048;
+                LAST_SKIP_BITS(re, gb, 12);
+
+                i += run;
+                if (i > MAX_INDEX)
+                    return AVERROR_INVALIDDATA;
+            }
+
+            block[scantable[i]] = (level * quant_matrix[i]) >> 4;
+        }
+        CLOSE_READER(re, gb);
+    }
+
+    s->idsp.idct_put(dest, linesize, block);
+
+    return 0;
+}
+
+static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf, int buf_size, AVFrame *frame, int field_number, int start, int end, int line_stride)
+{
+    int ret, slice_number, slice_offsets[5];
+    int linesize_y  = frame->linesize[0] * line_stride;
+    int linesize_cb = frame->linesize[1] * line_stride;
+    int linesize_cr = frame->linesize[2] * line_stride;
+    int linesize_a;
+
+    if (s->alpha_type != SHQ_NO_ALPHA)
+        linesize_a = frame->linesize[3] * line_stride;
+
+    if (end < start || end - start < 3 || end > buf_size)
+        return AVERROR_INVALIDDATA;
+
+    slice_offsets[0] = start;
+    slice_offsets[4] = end;
+    for (slice_number = 1; slice_number < 4; slice_number++) {
+        uint32_t last_offset, slice_len;
+
+        last_offset = slice_offsets[slice_number - 1];
+        slice_len = AV_RL24(buf + last_offset);
+        slice_offsets[slice_number] = last_offset + slice_len;
+
+        if (slice_len < 3 || slice_offsets[slice_number] > end - 3)
+            return AVERROR_INVALIDDATA;
+    }
+
+    for (slice_number = 0; slice_number < 4; slice_number++) {
+        GetBitContext gb;
+        uint32_t slice_begin, slice_end;
+        int x, y;
+
+        slice_begin = slice_offsets[slice_number];
+        slice_end = slice_offsets[slice_number + 1];
+
+        if ((ret = init_get_bits8(&gb, buf + slice_begin + 3, slice_end - slice_begin - 3)) < 0)
+            return ret;
+
+        for (y = slice_number * 16 * line_stride; y < frame->height; y += line_stride * 64) {
+            uint8_t *dest_y, *dest_cb, *dest_cr, *dest_a;
+            int last_dc[4] = { 1024, 1024, 1024, 1024 };
+            uint8_t last_alpha[16];
+
+            memset(last_alpha, 255, sizeof(last_alpha));
+
+            dest_y = frame->data[0] + frame->linesize[0] * (y + field_number);
+            if (s->subsampling == SHQ_SUBSAMPLING_420) {
+                dest_cb = frame->data[1] + frame->linesize[1] * (y/2 + field_number);
+                dest_cr = frame->data[2] + frame->linesize[2] * (y/2 + field_number);
+            } else {
+                dest_cb = frame->data[1] + frame->linesize[1] * (y + field_number);
+                dest_cr = frame->data[2] + frame->linesize[2] * (y + field_number);
+            }
+            if (s->alpha_type != SHQ_NO_ALPHA) {
+                dest_a = frame->data[3] + frame->linesize[3] * (y + field_number);
+            }
+
+            for (x = 0; x < frame->width; x += 16) {
+                /* Decode the four luma blocks. */
+                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y, linesize_y)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8, linesize_y)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8 * linesize_y, linesize_y)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8 * linesize_y + 8, linesize_y)) < 0)
+                    return ret;
+
+                /*
+                 * Decode the first chroma block. For 4:2:0, this is the only one;
+                 * for 4:2:2, it's the top block; for 4:4:4, it's the top-left block.
+                 */
+                if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb, linesize_cb)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr, linesize_cr)) < 0)
+                    return ret;
+
+                if (s->subsampling != SHQ_SUBSAMPLING_420) {
+                    /* For 4:2:2, this is the bottom block; for 4:4:4, it's the bottom-left block. */
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8 * linesize_cb, linesize_cb)) < 0)
+                        return ret;
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8 * linesize_cr, linesize_cr)) < 0)
+                        return ret;
+
+                    if (s->subsampling == SHQ_SUBSAMPLING_444) {
+                        /* Top-right and bottom-right blocks. */
+                        if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8, linesize_cb)) < 0)
+                            return ret;
+                        if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8, linesize_cr)) < 0)
+                            return ret;
+                        if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8 * linesize_cb + 8, linesize_cb)) < 0)
+                            return ret;
+                        if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8 * linesize_cr + 8, linesize_cr)) < 0)
+                            return ret;
+
+                        dest_cb += 8;
+                        dest_cr += 8;
+                    }
+                }
+                dest_y += 16;
+                dest_cb += 8;
+                dest_cr += 8;
+
+                if (s->alpha_type == SHQ_RLE_ALPHA) {
+                    /* Alpha coded using 16x8 RLE blocks. */
+                    if ((ret = decode_alpha_block(s, &gb, last_alpha, dest_a, linesize_a)) < 0)
+                        return ret;
+                    if ((ret = decode_alpha_block(s, &gb, last_alpha, dest_a + 8 * linesize_a, linesize_a)) < 0)
+                        return ret;
+                    dest_a += 16;
+                } else if (s->alpha_type == SHQ_DCT_ALPHA) {
+                    /* Alpha encoded exactly like luma. */
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a, linesize_a)) < 0)
+                        return ret;
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8, linesize_a)) < 0)
+                        return ret;
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8 * linesize_a, linesize_a)) < 0)
+                        return ret;
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8 * linesize_a + 8, linesize_a)) < 0)
+                        return ret;
+                    dest_a += 16;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void compute_quant_matrix(int *output, int qscale)
+{
+    int i;
+    for (i = 0; i < 64; i++) output[i] = unscaled_quant_matrix[ff_zigzag_direct[i]] * qscale;
+}
+
+static int speedhq_decode_frame(AVCodecContext *avctx,
+                                void *data, int *got_frame,
+                                AVPacket *avpkt)
+{
+    SHQContext * const s = avctx->priv_data;
+    const uint8_t *buf   = avpkt->data;
+    int buf_size         = avpkt->size;
+    AVFrame *frame       = data;
+    uint8_t quality;
+    uint32_t second_field_offset;
+    int ret;
+
+    if (buf_size < 4)
+        return AVERROR_INVALIDDATA;
+
+    quality = buf[0];
+    if (quality >= 100) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    compute_quant_matrix(s->quant_matrix, 100 - quality);
+
+    second_field_offset = AV_RL24(buf + 1);
+    if (second_field_offset >= buf_size - 3) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    avctx->coded_width = FFALIGN(avctx->width, 16);
+    avctx->coded_height = FFALIGN(avctx->height, 16);
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
+        return ret;
+    }
+    frame->key_frame = 1;
+
+    if (second_field_offset == 4) {
+        /*
+         * Overlapping first and second fields is used to signal
+         * encoding only a single field. In this case, "height"
+         * is ambiguous; it could mean either the height of the
+         * frame as a whole, or of the field. The former would make
+         * more sense for compatibility with legacy decoders,
+         * but this matches the convention used in NDI, which is
+         * the primary user of this trick.
+         */
+        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 0, 4, buf_size, 1)) < 0)
+            return ret;
+    } else {
+        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 0, 4, second_field_offset, 2)) < 0)
+            return ret;
+        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 1, second_field_offset, buf_size, 2)) < 0)
+            return ret;
+    }
+
+    *got_frame = 1;
+    return buf_size;
+}
+
+/*
+ * Alpha VLC. Run and level are independently coded, and would be
+ * outside the default limits for MAX_RUN/MAX_LEVEL, so we don't
+ * bother with combining them into one table.
+ */
+static av_cold void compute_alpha_vlcs(void)
+{
+    uint16_t run_code[134], level_code[266];
+    uint8_t run_bits[134], level_bits[266];
+    int16_t run_symbols[134], level_symbols[266];
+    int entry, i, sign;
+
+    /* Initialize VLC for alpha run. */
+    entry = 0;
+
+    /* 0 -> 0. */
+    run_code[entry] = 0;
+    run_bits[entry] = 1;
+    run_symbols[entry] = 0;
+    ++entry;
+
+    /* 10xx -> xx plus 1. */
+    for (i = 0; i < 4; ++i) {
+        run_code[entry] = (i << 2) | 1;
+        run_bits[entry] = 4;
+        run_symbols[entry] = i + 1;
+        ++entry;
+    }
+
+    /* 111xxxxxxx -> xxxxxxx. */
+    for (i = 0; i < 128; ++i) {
+        run_code[entry] = (i << 3) | 7;
+        run_bits[entry] = 10;
+        run_symbols[entry] = i;
+        ++entry;
+    }
+
+    /* 110 -> EOB. */
+    run_code[entry] = 3;
+    run_bits[entry] = 3;
+    run_symbols[entry] = -1;
+    ++entry;
+
+    av_assert0(entry == FF_ARRAY_ELEMS(run_code));
+
+    INIT_LE_VLC_SPARSE_STATIC(&ff_dc_alpha_run_vlc_le, ALPHA_VLC_BITS,
+                              FF_ARRAY_ELEMS(run_code),
+                              run_bits, 1, 1,
+                              run_code, 2, 2,
+                              run_symbols, 2, 2, 160);
+
+    /* Initialize VLC for alpha level. */
+    entry = 0;
+
+    for (sign = 0; sign <= 1; ++sign) {
+        /* 1s -> -1 or +1 (depending on sign bit). */
+        level_code[entry] = (sign << 1) | 1;
+        level_bits[entry] = 2;
+        level_symbols[entry] = sign ? -1 : 1;
+        ++entry;
+
+        /* 01sxx -> xx plus 2 (2..5 or -2..-5, depending on sign bit). */
+        for (i = 0; i < 4; ++i) {
+            level_code[entry] = (i << 3) | (sign << 2) | 2;
+            level_bits[entry] = 5;
+            level_symbols[entry] = sign ? -(i + 2) : (i + 2);
+            ++entry;
+        }
+    }
+
+    /*
+     * 00xxxxxxxx -> xxxxxxxx, in two's complement. There are many codes
+     * here that would better be encoded in other ways (e.g. 0 would be
+     * encoded by increasing run, and +/- 1 would be encoded with a
+     * shorter code), but it doesn't hurt to allow everything.
+     */
+    for (i = 0; i < 256; ++i) {
+        level_code[entry] = i << 2;
+        level_bits[entry] = 10;
+        level_symbols[entry] = i;
+        ++entry;
+    }
+
+    av_assert0(entry == FF_ARRAY_ELEMS(level_code));
+
+    INIT_LE_VLC_SPARSE_STATIC(&ff_dc_alpha_level_vlc_le, ALPHA_VLC_BITS,
+                              FF_ARRAY_ELEMS(level_code),
+                              level_bits, 1, 1,
+                              level_code, 2, 2,
+                              level_symbols, 2, 2, 288);
+}
+
+static uint32_t reverse(uint32_t num, int bits)
+{
+    return bitswap_32(num) >> (32 - bits);
+}
+
+static void reverse_code(const uint16_t *code, const uint8_t *bits,
+                         uint16_t *reversed_code, int num_entries)
+{
+    int i;
+    for (i = 0; i < num_entries; i++) {
+        reversed_code[i] = reverse(code[i], bits[i]);
+    }
+}
+
+static av_cold void speedhq_static_init(void)
+{
+    uint16_t ff_mpeg12_vlc_dc_lum_code_reversed[12];
+    uint16_t ff_mpeg12_vlc_dc_chroma_code_reversed[12];
+
+    /* Exactly the same as MPEG-2, except little-endian. */
+    reverse_code(ff_mpeg12_vlc_dc_lum_code,
+                 ff_mpeg12_vlc_dc_lum_bits,
+                 ff_mpeg12_vlc_dc_lum_code_reversed,
+                 12);
+    INIT_LE_VLC_STATIC(&ff_dc_lum_vlc_le, DC_VLC_BITS, 12,
+                       ff_mpeg12_vlc_dc_lum_bits, 1, 1,
+                       ff_mpeg12_vlc_dc_lum_code_reversed, 2, 2, 512);
+    reverse_code(ff_mpeg12_vlc_dc_chroma_code,
+                 ff_mpeg12_vlc_dc_chroma_bits,
+                 ff_mpeg12_vlc_dc_chroma_code_reversed,
+                 12);
+    INIT_LE_VLC_STATIC(&ff_dc_chroma_vlc_le, DC_VLC_BITS, 12,
+                       ff_mpeg12_vlc_dc_chroma_bits, 1, 1,
+                       ff_mpeg12_vlc_dc_chroma_code_reversed, 2, 2, 514);
+
+    ff_rl_init(&ff_rl_speedhq, ff_speedhq_static_rl_table_store);
+    INIT_2D_VLC_RL(ff_rl_speedhq, 674, INIT_VLC_LE);
+
+    compute_alpha_vlcs();
+}
+
+static av_cold int speedhq_decode_init(AVCodecContext *avctx)
+{
+    int ret;
+    static AVOnce init_once = AV_ONCE_INIT;
+    SHQContext * const s = avctx->priv_data;
+
+    s->avctx = avctx;
+
+    ret = ff_thread_once(&init_once, speedhq_static_init);
+    if (ret)
+        return AVERROR_UNKNOWN;
+
+    ff_blockdsp_init(&s->bdsp, avctx);
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
+
+    switch (avctx->codec_tag) {
+    case MKTAG('S', 'H', 'Q', '0'):
+        s->subsampling = SHQ_SUBSAMPLING_420;
+        s->alpha_type = SHQ_NO_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        break;
+    case MKTAG('S', 'H', 'Q', '1'):
+        s->subsampling = SHQ_SUBSAMPLING_420;
+        s->alpha_type = SHQ_RLE_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+        break;
+    case MKTAG('S', 'H', 'Q', '2'):
+        s->subsampling = SHQ_SUBSAMPLING_422;
+        s->alpha_type = SHQ_NO_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        break;
+    case MKTAG('S', 'H', 'Q', '3'):
+        s->subsampling = SHQ_SUBSAMPLING_422;
+        s->alpha_type = SHQ_RLE_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+        break;
+    case MKTAG('S', 'H', 'Q', '4'):
+        s->subsampling = SHQ_SUBSAMPLING_444;
+        s->alpha_type = SHQ_NO_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        break;
+    case MKTAG('S', 'H', 'Q', '5'):
+        s->subsampling = SHQ_SUBSAMPLING_444;
+        s->alpha_type = SHQ_RLE_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        break;
+    case MKTAG('S', 'H', 'Q', '7'):
+        s->subsampling = SHQ_SUBSAMPLING_422;
+        s->alpha_type = SHQ_DCT_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+        break;
+    case MKTAG('S', 'H', 'Q', '9'):
+        s->subsampling = SHQ_SUBSAMPLING_444;
+        s->alpha_type = SHQ_DCT_ALPHA;
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown NewTek SpeedHQ FOURCC provided (%08X)\n",
+               avctx->codec_tag);
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* This matches what NDI's RGB -> Y'CbCr 4:2:2 converter uses. */
+    avctx->colorspace = AVCOL_SPC_BT470BG;
+    avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
+
+    return 0;
+}
+
+AVCodec ff_speedhq_decoder = {
+    .name           = "speedhq",
+    .long_name      = NULL_IF_CONFIG_SMALL("NewTek SpeedHQ"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SPEEDHQ,
+    .priv_data_size = sizeof(SHQContext),
+    .init           = speedhq_decode_init,
+    .decode         = speedhq_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/srtdec.c b/libavcodec/srtdec.c
index 4c5f6e5..ecc0801 100644
--- a/libavcodec/srtdec.c
+++ b/libavcodec/srtdec.c
@@ -2,241 +2,112 @@
  * SubRip subtitle decoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/parseutils.h"
 #include "avcodec.h"
 #include "ass.h"
+#include "htmlsubtitles.h"
 
-static int html_color_parse(AVCodecContext *avctx, const char *str)
+static int srt_to_ass(AVCodecContext *avctx, AVBPrint *dst,
+                       const char *in, int x1, int y1, int x2, int y2)
 {
-    uint8_t rgba[4];
-    if (av_parse_color(rgba, str, strcspn(str, "\" >"), avctx) < 0)
-        return -1;
-    return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
-}
-
-enum {
-    PARAM_UNKNOWN = -1,
-    PARAM_SIZE,
-    PARAM_COLOR,
-    PARAM_FACE,
-    PARAM_NUMBER
-};
-
-typedef struct SrtStack {
-    char tag[128];
-    char param[PARAM_NUMBER][128];
-} SrtStack;
-
-static const char *srt_to_ass(AVCodecContext *avctx, char *out, char *out_end,
-                              const char *in, int x1, int y1, int x2, int y2)
-{
-    char c, *param, buffer[128], tmp[128];
-    int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0;
-    SrtStack stack[16];
-
-    stack[0].tag[0] = 0;
-    strcpy(stack[0].param[PARAM_SIZE],  "{\\fs}");
-    strcpy(stack[0].param[PARAM_COLOR], "{\\c}");
-    strcpy(stack[0].param[PARAM_FACE],  "{\\fn}");
-
     if (x1 >= 0 && y1 >= 0) {
-        if (x2 >= 0 && y2 >= 0 && (x2 != x1 || y2 != y1))
-            out += snprintf(out, out_end-out,
-                            "{\\an1}{\\move(%d,%d,%d,%d)}", x1, y1, x2, y2);
-        else
-            out += snprintf(out, out_end-out, "{\\an1}{\\pos(%d,%d)}", x1, y1);
-    }
-
-    for (; out < out_end && !end && *in; in++) {
-        switch (*in) {
-        case '\r':
-            break;
-        case '\n':
-            if (line_start) {
-                end = 1;
-                break;
-            }
-            while (out[-1] == ' ')
-                out--;
-            out += snprintf(out, out_end-out, "\\N");
-            line_start = 1;
-            break;
-        case ' ':
-            if (!line_start)
-                *out++ = *in;
-            break;
-        case '{':    /* skip all {\xxx} substrings except for {\an%d}
-                        and all microdvd like styles such as {Y:xxx} */
-            an += sscanf(in, "{\\an%*1u}%c", &c) == 1;
-            if ((an != 1 && sscanf(in, "{\\%*[^}]}%n%c", &len, &c) > 0) ||
-                sscanf(in, "{%*1[CcFfoPSsYy]:%*[^}]}%n%c", &len, &c) > 0) {
-                in += len - 1;
-            } else
-                *out++ = *in;
-            break;
-        case '<':
-            tag_close = in[1] == '/';
-            if (sscanf(in+tag_close+1, "%127[^>]>%n%c", buffer, &len,&c) >= 2) {
-                if ((param = strchr(buffer, ' ')))
-                    *param++ = 0;
-                if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack)) ||
-                    ( tag_close && sptr > 0 && !strcmp(stack[sptr-1].tag, buffer))) {
-                    int i, j, unknown = 0;
-                    in += len + tag_close;
-                    if (!tag_close)
-                        memset(stack+sptr, 0, sizeof(*stack));
-                    if (!strcmp(buffer, "font")) {
-                        if (tag_close) {
-                            for (i=PARAM_NUMBER-1; i>=0; i--)
-                                if (stack[sptr-1].param[i][0])
-                                    for (j=sptr-2; j>=0; j--)
-                                        if (stack[j].param[i][0]) {
-                                            out += snprintf(out, out_end-out,
-                                                            "%s", stack[j].param[i]);
-                                            break;
-                                        }
-                        } else {
-                            while (param) {
-                                if (!strncmp(param, "size=", 5)) {
-                                    unsigned font_size;
-                                    param += 5 + (param[5] == '"');
-                                    if (sscanf(param, "%u", &font_size) == 1) {
-                                        snprintf(stack[sptr].param[PARAM_SIZE],
-                                             sizeof(stack[0].param[PARAM_SIZE]),
-                                             "{\\fs%u}", font_size);
-                                    }
-                                } else if (!strncmp(param, "color=", 6)) {
-                                    param += 6 + (param[6] == '"');
-                                    snprintf(stack[sptr].param[PARAM_COLOR],
-                                         sizeof(stack[0].param[PARAM_COLOR]),
-                                         "{\\c&H%X&}",
-                                         html_color_parse(avctx, param));
-                                } else if (!strncmp(param, "face=", 5)) {
-                                    param += 5 + (param[5] == '"');
-                                    len = strcspn(param,
-                                                  param[-1] == '"' ? "\"" :" ");
-                                    av_strlcpy(tmp, param,
-                                               FFMIN(sizeof(tmp), len+1));
-                                    param += len;
-                                    snprintf(stack[sptr].param[PARAM_FACE],
-                                             sizeof(stack[0].param[PARAM_FACE]),
-                                             "{\\fn%s}", tmp);
-                                }
-                                if ((param = strchr(param, ' ')))
-                                    param++;
-                            }
-                            for (i=0; i<PARAM_NUMBER; i++)
-                                if (stack[sptr].param[i][0])
-                                    out += snprintf(out, out_end-out,
-                                                    "%s", stack[sptr].param[i]);
-                        }
-                    } else if (!buffer[1] && strspn(buffer, "bisu") == 1) {
-                        out += snprintf(out, out_end-out,
-                                        "{\\%c%d}", buffer[0], !tag_close);
-                    } else {
-                        unknown = 1;
-                        snprintf(tmp, sizeof(tmp), "</%s>", buffer);
-                    }
-                    if (tag_close) {
-                        sptr--;
-                    } else if (unknown && !strstr(in, tmp)) {
-                        in -= len + tag_close;
-                        *out++ = *in;
-                    } else
-                        av_strlcpy(stack[sptr++].tag, buffer,
-                                   sizeof(stack[0].tag));
-                    break;
-                }
-            }
-        default:
-            *out++ = *in;
-            break;
+        /* XXX: here we rescale coordinate assuming they are in DVD resolution
+         * (720x480) since we don't have anything better */
+
+        if (x2 >= 0 && y2 >= 0 && (x2 != x1 || y2 != y1) && x2 >= x1 && y2 >= y1) {
+            /* text rectangle defined, write the text at the center of the rectangle */
+            const int cx = x1 + (x2 - x1)/2;
+            const int cy = y1 + (y2 - y1)/2;
+            const int scaled_x = cx * (int64_t)ASS_DEFAULT_PLAYRESX / 720;
+            const int scaled_y = cy * (int64_t)ASS_DEFAULT_PLAYRESY / 480;
+            av_bprintf(dst, "{\\an5}{\\pos(%d,%d)}", scaled_x, scaled_y);
+        } else {
+            /* only the top left corner, assume the text starts in that corner */
+            const int scaled_x = x1 * (int64_t)ASS_DEFAULT_PLAYRESX / 720;
+            const int scaled_y = y1 * (int64_t)ASS_DEFAULT_PLAYRESY / 480;
+            av_bprintf(dst, "{\\an1}{\\pos(%d,%d)}", scaled_x, scaled_y);
         }
-        if (*in != ' ' && *in != '\r' && *in != '\n')
-            line_start = 0;
     }
 
-    out = FFMIN(out, out_end-3);
-    while (!strncmp(out-2, "\\N", 2))
-        out -= 2;
-    while (out[-1] == ' ')
-        out--;
-    out += snprintf(out, out_end-out, "\r\n");
-    return in;
-}
-
-static const char *read_ts(const char *buf, int *ts_start, int *ts_end,
-                           int *x1, int *y1, int *x2, int *y2)
-{
-    int i, hs, ms, ss, he, me, se;
-
-    for (i=0; i<2; i++) {
-        /* try to read timestamps in either the first or second line */
-        int c = sscanf(buf, "%d:%2d:%2d%*1[,.]%3d --> %d:%2d:%2d%*1[,.]%3d"
-                       "%*[ ]X1:%d X2:%d Y1:%d Y2:%d",
-                       &hs, &ms, &ss, ts_start, &he, &me, &se, ts_end,
-                       x1, x2, y1, y2);
-        buf += strcspn(buf, "\n") + 1;
-        if (c >= 8) {
-            *ts_start = 100*(ss + 60*(ms + 60*hs)) + *ts_start/10;
-            *ts_end   = 100*(se + 60*(me + 60*he)) + *ts_end  /10;
-            return buf;
-        }
-    }
-    return NULL;
+    return ff_htmlmarkup_to_ass(avctx, dst, in);
 }
 
 static int srt_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_sub_ptr, AVPacket *avpkt)
 {
     AVSubtitle *sub = data;
-    int ts_start, ts_end, x1 = -1, y1 = -1, x2 = -1, y2 = -1;
-    char buffer[2048];
-    const char *ptr = avpkt->data;
-    const char *end = avpkt->data + avpkt->size;
+    AVBPrint buffer;
+    int x1 = -1, y1 = -1, x2 = -1, y2 = -1;
+    int size, ret;
+    const uint8_t *p = av_packet_get_side_data(avpkt, AV_PKT_DATA_SUBTITLE_POSITION, &size);
+    FFASSDecoderContext *s = avctx->priv_data;
+
+    if (p && size == 16) {
+        x1 = AV_RL32(p     );
+        y1 = AV_RL32(p +  4);
+        x2 = AV_RL32(p +  8);
+        y2 = AV_RL32(p + 12);
+    }
 
     if (avpkt->size <= 0)
         return avpkt->size;
 
-    ff_ass_init(sub);
+    av_bprint_init(&buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
 
-    while (ptr < end && *ptr) {
-        ptr = read_ts(ptr, &ts_start, &ts_end, &x1, &y1, &x2, &y2);
-        if (!ptr)
-            break;
-        ptr = srt_to_ass(avctx, buffer, buffer+sizeof(buffer), ptr,
-                         x1, y1, x2, y2);
-        ff_ass_add_rect(sub, buffer, ts_start, ts_end, 0);
-    }
+    ret = srt_to_ass(avctx, &buffer, avpkt->data, x1, y1, x2, y2);
+    if (ret >= 0)
+        ret = ff_ass_add_rect(sub, buffer.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buffer, NULL);
+    if (ret < 0)
+        return ret;
 
     *got_sub_ptr = sub->num_rects > 0;
     return avpkt->size;
 }
 
+#if CONFIG_SRT_DECODER
+/* deprecated decoder */
 AVCodec ff_srt_decoder = {
     .name         = "srt",
     .long_name    = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SRT,
+    .id           = AV_CODEC_ID_SUBRIP,
+    .init         = ff_ass_subtitle_header_default,
+    .decode       = srt_decode_frame,
+    .flush        = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
+#endif
+
+#if CONFIG_SUBRIP_DECODER
+AVCodec ff_subrip_decoder = {
+    .name         = "subrip",
+    .long_name    = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_SUBRIP,
     .init         = ff_ass_subtitle_header_default,
     .decode       = srt_decode_frame,
+    .flush        = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
 };
+#endif
diff --git a/libavcodec/srtenc.c b/libavcodec/srtenc.c
new file mode 100644
index 0000000..34f0f0d
--- /dev/null
+++ b/libavcodec/srtenc.c
@@ -0,0 +1,344 @@
+/*
+ * SubRip subtitle encoder
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "ass_split.h"
+#include "ass.h"
+
+
+#define SRT_STACK_SIZE 64
+
+typedef struct {
+    AVCodecContext *avctx;
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;
+    char stack[SRT_STACK_SIZE];
+    int stack_ptr;
+    int alignment_applied;
+} SRTContext;
+
+
+#ifdef __GNUC__
+__attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+static void srt_print(SRTContext *s, const char *str, ...)
+{
+    va_list vargs;
+    va_start(vargs, str);
+    av_vbprintf(&s->buffer, str, vargs);
+    va_end(vargs);
+}
+
+static int srt_stack_push(SRTContext *s, const char c)
+{
+    if (s->stack_ptr >= SRT_STACK_SIZE)
+        return -1;
+    s->stack[s->stack_ptr++] = c;
+    return 0;
+}
+
+static char srt_stack_pop(SRTContext *s)
+{
+    if (s->stack_ptr <= 0)
+        return 0;
+    return s->stack[--s->stack_ptr];
+}
+
+static int srt_stack_find(SRTContext *s, const char c)
+{
+    int i;
+    for (i = s->stack_ptr-1; i >= 0; i--)
+        if (s->stack[i] == c)
+            break;
+    return i;
+}
+
+static void srt_close_tag(SRTContext *s, char tag)
+{
+    srt_print(s, "</%c%s>", tag, tag == 'f' ? "ont" : "");
+}
+
+static void srt_stack_push_pop(SRTContext *s, const char c, int close)
+{
+    if (close) {
+        int i = c ? srt_stack_find(s, c) : 0;
+        if (i < 0)
+            return;
+        while (s->stack_ptr != i)
+            srt_close_tag(s, srt_stack_pop(s));
+    } else if (srt_stack_push(s, c) < 0)
+        av_log(s->avctx, AV_LOG_ERROR, "tag stack overflow\n");
+}
+
+static void srt_style_apply(SRTContext *s, const char *style)
+{
+    ASSStyle *st = ff_ass_style_get(s->ass_ctx, style);
+    if (st) {
+        int c = st->primary_color & 0xFFFFFF;
+        if (st->font_name && strcmp(st->font_name, ASS_DEFAULT_FONT) ||
+            st->font_size != ASS_DEFAULT_FONT_SIZE ||
+            c != ASS_DEFAULT_COLOR) {
+            srt_print(s, "<font");
+            if (st->font_name && strcmp(st->font_name, ASS_DEFAULT_FONT))
+                srt_print(s, " face=\"%s\"", st->font_name);
+            if (st->font_size != ASS_DEFAULT_FONT_SIZE)
+                srt_print(s, " size=\"%d\"", st->font_size);
+            if (c != ASS_DEFAULT_COLOR)
+                srt_print(s, " color=\"#%06x\"",
+                          (c & 0xFF0000) >> 16 | c & 0xFF00 | (c & 0xFF) << 16);
+            srt_print(s, ">");
+            srt_stack_push(s, 'f');
+        }
+        if (st->bold != ASS_DEFAULT_BOLD) {
+            srt_print(s, "<b>");
+            srt_stack_push(s, 'b');
+        }
+        if (st->italic != ASS_DEFAULT_ITALIC) {
+            srt_print(s, "<i>");
+            srt_stack_push(s, 'i');
+        }
+        if (st->underline != ASS_DEFAULT_UNDERLINE) {
+            srt_print(s, "<u>");
+            srt_stack_push(s, 'u');
+        }
+        if (st->alignment != ASS_DEFAULT_ALIGNMENT) {
+            srt_print(s, "{\\an%d}", st->alignment);
+            s->alignment_applied = 1;
+        }
+    }
+}
+
+
+static av_cold int srt_encode_init(AVCodecContext *avctx)
+{
+    SRTContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+static void srt_text_cb(void *priv, const char *text, int len)
+{
+    SRTContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+}
+
+static void srt_new_line_cb(void *priv, int forced)
+{
+    srt_print(priv, "\r\n");
+}
+
+static void srt_style_cb(void *priv, char style, int close)
+{
+    srt_stack_push_pop(priv, style, close);
+    if (!close)
+        srt_print(priv, "<%c>", style);
+}
+
+static void srt_color_cb(void *priv, unsigned int color, unsigned int color_id)
+{
+    if (color_id > 1)
+        return;
+    srt_stack_push_pop(priv, 'f', color == 0xFFFFFFFF);
+    if (color != 0xFFFFFFFF)
+        srt_print(priv, "<font color=\"#%06x\">",
+              (color & 0xFF0000) >> 16 | color & 0xFF00 | (color & 0xFF) << 16);
+}
+
+static void srt_font_name_cb(void *priv, const char *name)
+{
+    srt_stack_push_pop(priv, 'f', !name);
+    if (name)
+        srt_print(priv, "<font face=\"%s\">", name);
+}
+
+static void srt_font_size_cb(void *priv, int size)
+{
+    srt_stack_push_pop(priv, 'f', size < 0);
+    if (size >= 0)
+        srt_print(priv, "<font size=\"%d\">", size);
+}
+
+static void srt_alignment_cb(void *priv, int alignment)
+{
+    SRTContext *s = priv;
+    if (!s->alignment_applied && alignment >= 0) {
+        srt_print(s, "{\\an%d}", alignment);
+        s->alignment_applied = 1;
+    }
+}
+
+static void srt_cancel_overrides_cb(void *priv, const char *style)
+{
+    srt_stack_push_pop(priv, 0, 1);
+    srt_style_apply(priv, style);
+}
+
+static void srt_move_cb(void *priv, int x1, int y1, int x2, int y2,
+                        int t1, int t2)
+{
+    // TODO: add a AV_PKT_DATA_SUBTITLE_POSITION side data when a new subtitles
+    // encoding API passing the AVPacket is available.
+}
+
+static void srt_end_cb(void *priv)
+{
+    srt_stack_push_pop(priv, 0, 1);
+}
+
+static const ASSCodesCallbacks srt_callbacks = {
+    .text             = srt_text_cb,
+    .new_line         = srt_new_line_cb,
+    .style            = srt_style_cb,
+    .color            = srt_color_cb,
+    .font_name        = srt_font_name_cb,
+    .font_size        = srt_font_size_cb,
+    .alignment        = srt_alignment_cb,
+    .cancel_overrides = srt_cancel_overrides_cb,
+    .move             = srt_move_cb,
+    .end              = srt_end_cb,
+};
+
+static const ASSCodesCallbacks text_callbacks = {
+    .text             = srt_text_cb,
+    .new_line         = srt_new_line_cb,
+};
+
+static int encode_frame(AVCodecContext *avctx,
+                        unsigned char *buf, int bufsize, const AVSubtitle *sub,
+                        const ASSCodesCallbacks *cb)
+{
+    SRTContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i;
+
+    av_bprint_clear(&s->buffer);
+
+    for (i=0; i<sub->num_rects; i++) {
+        const char *ass = sub->rects[i]->ass;
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            int num;
+            dialog = ff_ass_split_dialog(s->ass_ctx, ass, 0, &num);
+            for (; dialog && num--; dialog++) {
+                s->alignment_applied = 0;
+                if (avctx->codec_id == AV_CODEC_ID_SUBRIP)
+                    srt_style_apply(s, dialog->style);
+                ff_ass_split_override_codes(cb, s, dialog->text);
+            }
+        } else {
+#endif
+            dialog = ff_ass_split_dialog2(s->ass_ctx, ass);
+            if (!dialog)
+                return AVERROR(ENOMEM);
+            s->alignment_applied = 0;
+            if (avctx->codec_id == AV_CODEC_ID_SUBRIP)
+                srt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(cb, s, dialog->text);
+            ff_ass_free_dialog(&dialog);
+#if FF_API_ASS_TIMING
+        }
+#endif
+    }
+
+    if (!av_bprint_is_complete(&s->buffer))
+        return AVERROR(ENOMEM);
+    if (!s->buffer.len)
+        return 0;
+
+    if (s->buffer.len > bufsize) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        return -1;
+    }
+    memcpy(buf, s->buffer.str, s->buffer.len);
+
+    return s->buffer.len;
+}
+
+static int srt_encode_frame(AVCodecContext *avctx,
+                               unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    return encode_frame(avctx, buf, bufsize, sub, &srt_callbacks);
+}
+
+static int text_encode_frame(AVCodecContext *avctx,
+                             unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    return encode_frame(avctx, buf, bufsize, sub, &text_callbacks);
+}
+
+static int srt_encode_close(AVCodecContext *avctx)
+{
+    SRTContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+#if CONFIG_SRT_ENCODER
+/* deprecated encoder */
+AVCodec ff_srt_encoder = {
+    .name           = "srt",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBRIP,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = srt_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
+
+#if CONFIG_SUBRIP_ENCODER
+AVCodec ff_subrip_encoder = {
+    .name           = "subrip",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBRIP,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = srt_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
+
+#if CONFIG_TEXT_ENCODER
+AVCodec ff_text_encoder = {
+    .name           = "text",
+    .long_name      = NULL_IF_CONFIG_SMALL("Raw text subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_TEXT,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = text_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index 5c1ec84..9efdffe 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h
index f38ce54..cfa02b0 100644
--- a/libavcodec/startcode.h
+++ b/libavcodec/startcode.h
@@ -1,21 +1,27 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ */
+
 #ifndef AVCODEC_STARTCODE_H
 #define AVCODEC_STARTCODE_H
 
diff --git a/libavcodec/subviewerdec.c b/libavcodec/subviewerdec.c
new file mode 100644
index 0000000..805c7dd
--- /dev/null
+++ b/libavcodec/subviewerdec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SubViewer subtitle decoder
+ * @see https://en.wikipedia.org/wiki/SubViewer
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static int subviewer_event_to_ass(AVBPrint *buf, const char *p)
+{
+    while (*p) {
+        if (!strncmp(p, "[br]", 4)) {
+            av_bprintf(buf, "\\N");
+            p += 4;
+        } else {
+            if (p[0] == '\n' && p[1])
+                av_bprintf(buf, "\\N");
+            else if (*p != '\n' && *p != '\r')
+                av_bprint_chars(buf, *p, 1);
+            p++;
+        }
+    }
+
+    return 0;
+}
+
+static int subviewer_decode_frame(AVCodecContext *avctx,
+                                  void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    FFASSDecoderContext *s = avctx->priv_data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && !subviewer_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect(sub, buf.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_subviewer_decoder = {
+    .name           = "subviewer",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubViewer subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBVIEWER,
+    .decode         = subviewer_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/sunrast.c b/libavcodec/sunrast.c
index 6a928bb..0af5626 100644
--- a/libavcodec/sunrast.c
+++ b/libavcodec/sunrast.c
@@ -2,20 +2,20 @@
  * Sun Rasterfile (.sun/.ras/im{1,8,24}/.sunras) image decoder
  * Copyright (c) 2007, 2008 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf_end   = avpkt->data + avpkt->size;
     AVFrame * const p        = data;
     unsigned int w, h, depth, type, maptype, maplength, stride, x, y, len, alen;
-    uint8_t *ptr;
+    uint8_t *ptr, *ptr2 = NULL;
     const uint8_t *bufstart = buf;
     int ret;
 
@@ -53,7 +53,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     maplength = AV_RB32(buf + 28);
     buf      += 32;
 
-    if (type == RT_FORMAT_TIFF || type == RT_FORMAT_IFF || type == RT_EXPERIMENTAL) {
+    if (type == RT_EXPERIMENTAL) {
         avpriv_request_sample(avctx, "TIFF/IFF/EXPERIMENTAL (compression) type");
         return AVERROR_PATCHWELCOME;
     }
@@ -70,10 +70,17 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    if (type == RT_FORMAT_TIFF || type == RT_FORMAT_IFF) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported (compression) type\n");
+        return -1;
+    }
 
     switch (depth) {
         case 1:
-            avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+            avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_MONOWHITE;
+            break;
+        case 4:
+            avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_NONE;
             break;
         case 8:
             avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
@@ -81,6 +88,9 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         case 24:
             avctx->pix_fmt = (type == RT_FORMAT_RGB) ? AV_PIX_FMT_RGB24 : AV_PIX_FMT_BGR24;
             break;
+        case 32:
+            avctx->pix_fmt = (type == RT_FORMAT_RGB) ? AV_PIX_FMT_0RGB : AV_PIX_FMT_0BGR;
+            break;
         default:
             av_log(avctx, AV_LOG_ERROR, "invalid depth\n");
             return AVERROR_INVALIDDATA;
@@ -90,17 +100,15 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
     if (buf_end - buf < maplength)
         return AVERROR_INVALIDDATA;
 
-    if (depth != 8 && maplength) {
+    if (depth > 8 && maplength) {
         av_log(avctx, AV_LOG_WARNING, "useless colormap found or file is corrupted, trying to recover\n");
 
     } else if (maplength) {
@@ -113,13 +121,20 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
 
         ptr = p->data[1];
         for (x = 0; x < len; x++, ptr += 4)
-            *(uint32_t *)ptr = (buf[x] << 16) + (buf[len + x] << 8) + buf[len + len + x];
+            *(uint32_t *)ptr = (0xFFU<<24) + (buf[x]<<16) + (buf[len+x]<<8) + buf[len+len+x];
     }
 
     buf += maplength;
 
+    if (maplength && depth < 8) {
+        ptr = ptr2 = av_malloc_array((w + 15), h);
+        if (!ptr)
+            return AVERROR(ENOMEM);
+        stride = (w + 15 >> 3) * depth;
+    } else {
     ptr    = p->data[0];
     stride = p->linesize[0];
+    }
 
     /* scanlines are aligned on 16 bit boundaries */
     len  = (depth * w + 7) >> 3;
@@ -153,13 +168,37 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         }
     } else {
         for (y = 0; y < h; y++) {
-            if (buf_end - buf < len)
+            if (buf_end - buf < alen)
                 break;
             memcpy(ptr, buf, len);
             ptr += stride;
             buf += alen;
         }
     }
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8 && depth < 8) {
+        uint8_t *ptr_free = ptr2;
+        ptr = p->data[0];
+        for (y=0; y<h; y++) {
+            for (x = 0; x < (w + 7 >> 3) * depth; x++) {
+                if (depth == 1) {
+                    ptr[8*x]   = ptr2[x] >> 7;
+                    ptr[8*x+1] = ptr2[x] >> 6 & 1;
+                    ptr[8*x+2] = ptr2[x] >> 5 & 1;
+                    ptr[8*x+3] = ptr2[x] >> 4 & 1;
+                    ptr[8*x+4] = ptr2[x] >> 3 & 1;
+                    ptr[8*x+5] = ptr2[x] >> 2 & 1;
+                    ptr[8*x+6] = ptr2[x] >> 1 & 1;
+                    ptr[8*x+7] = ptr2[x]      & 1;
+                } else {
+                    ptr[2*x]   = ptr2[x] >> 4;
+                    ptr[2*x+1] = ptr2[x] & 0xF;
+                }
+            }
+            ptr  += p->linesize[0];
+            ptr2 += (w + 15 >> 3) * depth;
+        }
+        av_freep(&ptr_free);
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/sunrast.h b/libavcodec/sunrast.h
index d9fe307..d162e63 100644
--- a/libavcodec/sunrast.h
+++ b/libavcodec/sunrast.h
@@ -2,20 +2,20 @@
  * Sun Rasterfile Image Format
  * Copyright (c) 2007, 2008 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sunrastenc.c b/libavcodec/sunrastenc.c
index 3a5f410..97b2242 100644
--- a/libavcodec/sunrastenc.c
+++ b/libavcodec/sunrastenc.c
@@ -2,20 +2,20 @@
  * Sun Rasterfile (.sun/.ras/im{1,8,24}/.sunras) image encoder
  * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,7 +59,7 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
 {
     SUNRASTContext *s = avctx->priv_data;
     const uint8_t *ptr;
-    int len, alen, x;
+    int len, alen, x, y;
 
     if (s->maplength) {     // palettized
         PutByteContext pb_r, pb_g;
@@ -86,33 +86,29 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
      if (s->type == RT_BYTE_ENCODED) {
         uint8_t value, value2;
         int run;
-        const uint8_t *start = linesize < 0 ? pixels + (avctx->height - 1) * linesize
-                                            : pixels;
-        const uint8_t *end   = linesize < 0 ? pixels - linesize
-                                            : pixels + avctx->height * linesize;
 
         ptr = pixels;
 
-#define GET_VALUE ptr >= end || ptr < start ? 0 : x >= len ? ptr[len-1] : ptr[x]
+#define GET_VALUE y >= avctx->height ? 0 : x >= len ? ptr[len-1] : ptr[x]
 
-        x = 0;
+        x = 0, y = 0;
         value2 = GET_VALUE;
-        while (ptr < end && ptr >= start) {
+        while (y < avctx->height) {
             run = 1;
             value = value2;
             x++;
             if (x >= alen) {
                 x = 0;
-                ptr += linesize;
+                ptr += linesize, y++;
             }
 
             value2 = GET_VALUE;
-            while (value2 == value && run < 256 && ptr < end && ptr >= start) {
+            while (value2 == value && run < 256 && y < avctx->height) {
                 x++;
                 run++;
                 if (x >= alen) {
                     x = 0;
-                    ptr += linesize;
+                    ptr += linesize, y++;
                 }
                 value2 = GET_VALUE;
             }
@@ -131,7 +127,6 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
         // update data length for header
         s->length = bytestream2_tell_p(&s->p) - 32 - s->maplength;
     } else {
-        int y;
         for (y = 0; y < avctx->height; y++) {
             bytestream2_put_buffer(&s->p, ptr, len);
             if (len < alen)
@@ -164,12 +159,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     // adjust boolean option to RT equivalent
     s->type++;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = 1;
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
     s->maptype                    = RMT_NONE;
     s->maplength                  = 0;
 
@@ -202,7 +191,7 @@ static int sunrast_encode_frame(AVCodecContext *avctx,  AVPacket *avpkt,
     SUNRASTContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, s->size)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->size, 0)) < 0)
         return ret;
 
     bytestream2_init_writer(&s->p, avpkt->data, avpkt->size);
@@ -248,12 +237,12 @@ AVCodec ff_sunrast_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_SUNRAST,
     .priv_data_size = sizeof(SUNRASTContext),
-    .priv_class     = &sunrast_class,
     .init           = sunrast_encode_init,
     .encode2        = sunrast_encode_frame,
 #if FF_API_CODER_TYPE
     .defaults       = sunrast_defaults,
 #endif
+    .priv_class     = &sunrast_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_BGR24,
                                                   AV_PIX_FMT_PAL8,
                                                   AV_PIX_FMT_GRAY8,
diff --git a/libavcodec/svq1.c b/libavcodec/svq1.c
index b9922a7..cc214f9 100644
--- a/libavcodec/svq1.c
+++ b/libavcodec/svq1.c
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1.h b/libavcodec/svq1.h
index 988a0a0..63c0479 100644
--- a/libavcodec/svq1.h
+++ b/libavcodec/svq1.h
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq13.c b/libavcodec/svq13.c
index e0d2154..b821a44 100644
--- a/libavcodec/svq13.c
+++ b/libavcodec/svq13.c
@@ -1,20 +1,20 @@
 /*
  * SVQ1/SVQ3 decoder common code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1_cb.h b/libavcodec/svq1_cb.h
index 396cdf7..11f7969 100644
--- a/libavcodec/svq1_cb.h
+++ b/libavcodec/svq1_cb.h
@@ -3,23 +3,23 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1_vlc.h b/libavcodec/svq1_vlc.h
index 073bb6d..06e3509 100644
--- a/libavcodec/svq1_vlc.h
+++ b/libavcodec/svq1_vlc.h
@@ -1,20 +1,20 @@
 /*
- * copyright (C) 2003 The FFmpeg project
+ * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1dec.c b/libavcodec/svq1dec.c
index bce04e8..d3e60c3 100644
--- a/libavcodec/svq1dec.c
+++ b/libavcodec/svq1dec.c
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "h263.h"
 #include "hpeldsp.h"
 #include "internal.h"
@@ -55,7 +55,7 @@ typedef struct svq1_pmv_s {
 
 typedef struct SVQ1Context {
     HpelDSPContext hdsp;
-    BitstreamContext bc;
+    GetBitContext gb;
     AVFrame *prev;
 
     uint8_t *pkt_swapped;
@@ -111,12 +111,11 @@ static const uint8_t string_table[256] = {
                 break;                                                  \
         }                                                               \
         /* divide block if next bit set */                              \
-        if (bitstream_read_bit(bc) == 0)                                \
+        if (!get_bits1(bitbuf))                                         \
             break;                                                      \
         /* add child nodes */                                           \
         list[n++] = list[i];                                            \
-        list[n++] = list[i] +                                           \
-                    (((level & 1) ? pitch : 1) << (level / 2 + 1));     \
+        list[n++] = list[i] + (((level & 1) ? pitch : 1) << ((level >> 1) + 1));\
     }
 
 #define SVQ1_ADD_CODEBOOK()                                             \
@@ -145,16 +144,16 @@ static const uint8_t string_table[256] = {
 #define SVQ1_CALC_CODEBOOK_ENTRIES(cbook)                               \
     codebook = (const uint32_t *)cbook[level];                          \
     if (stages > 0)                                                     \
-        bit_cache = bitstream_read(bc, 4 * stages);                     \
+        bit_cache = get_bits(bitbuf, 4 * stages);                       \
     /* calculate codebook entries for this vector */                    \
     for (j = 0; j < stages; j++) {                                      \
         entries[j] = (((bit_cache >> (4 * (stages - j - 1))) & 0xF) +   \
                       16 * j) << (level + 1);                           \
     }                                                                   \
     mean -= stages * 128;                                               \
-    n4    = mean + (mean >> 31) << 16 | (mean & 0xFFFF);
+    n4    = (mean << 16) + mean;
 
-static int svq1_decode_block_intra(BitstreamContext *bc, uint8_t *pixels,
+static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
                                    ptrdiff_t pitch)
 {
     uint32_t bit_cache;
@@ -163,7 +162,8 @@ static int svq1_decode_block_intra(BitstreamContext *bc, uint8_t *pixels,
     const uint32_t *codebook;
     int entries[6];
     int i, j, m, n;
-    int mean, stages;
+    int stages;
+    unsigned mean;
     unsigned x, y, width, height, level;
     uint32_t n1, n2, n3, n4;
 
@@ -180,7 +180,7 @@ static int svq1_decode_block_intra(BitstreamContext *bc, uint8_t *pixels,
         height = 1 << ((3 + level) / 2);
 
         /* get number of stages (-1 skips vector, 0 for mean only) */
-        stages = bitstream_read_vlc(bc, svq1_intra_multistage[level].table, 3, 3) - 1;
+        stages = get_vlc2(bitbuf, svq1_intra_multistage[level].table, 3, 3) - 1;
 
         if (stages == -1) {
             for (y = 0; y < height; y++)
@@ -188,14 +188,15 @@ static int svq1_decode_block_intra(BitstreamContext *bc, uint8_t *pixels,
             continue;   /* skip vector */
         }
 
-        if ((stages > 0 && level >= 4) || stages < 0) {
+        if ((stages > 0 && level >= 4)) {
             ff_dlog(NULL,
                     "Error (svq1_decode_block_intra): invalid vector: stages=%i level=%i\n",
                     stages, level);
             return AVERROR_INVALIDDATA;  /* invalid vector */
         }
+        av_assert0(stages >= 0);
 
-        mean = bitstream_read_vlc(bc, svq1_intra_mean.table, 8, 3);
+        mean = get_vlc2(bitbuf, svq1_intra_mean.table, 8, 3);
 
         if (stages == 0) {
             for (y = 0; y < height; y++)
@@ -219,7 +220,7 @@ static int svq1_decode_block_intra(BitstreamContext *bc, uint8_t *pixels,
     return 0;
 }
 
-static int svq1_decode_block_non_intra(BitstreamContext *bc, uint8_t *pixels,
+static int svq1_decode_block_non_intra(GetBitContext *bitbuf, uint8_t *pixels,
                                        ptrdiff_t pitch)
 {
     uint32_t bit_cache;
@@ -228,7 +229,8 @@ static int svq1_decode_block_non_intra(BitstreamContext *bc, uint8_t *pixels,
     const uint32_t *codebook;
     int entries[6];
     int i, j, m, n;
-    int mean, stages;
+    int stages;
+    unsigned mean;
     int x, y, width, height, level;
     uint32_t n1, n2, n3, n4;
 
@@ -245,19 +247,20 @@ static int svq1_decode_block_non_intra(BitstreamContext *bc, uint8_t *pixels,
         height = 1 << ((3 + level) / 2);
 
         /* get number of stages (-1 skips vector, 0 for mean only) */
-        stages = bitstream_read_vlc(bc, svq1_inter_multistage[level].table, 3, 2) - 1;
+        stages = get_vlc2(bitbuf, svq1_inter_multistage[level].table, 3, 2) - 1;
 
         if (stages == -1)
             continue;           /* skip vector */
 
-        if ((stages > 0 && level >= 4) || stages < 0) {
+        if ((stages > 0 && level >= 4)) {
             ff_dlog(NULL,
                     "Error (svq1_decode_block_non_intra): invalid vector: stages=%i level=%i\n",
                     stages, level);
             return AVERROR_INVALIDDATA;  /* invalid vector */
         }
+        av_assert0(stages >= 0);
 
-        mean = bitstream_read_vlc(bc, svq1_inter_mean.table, 9, 3) - 256;
+        mean = get_vlc2(bitbuf, svq1_inter_mean.table, 9, 3) - 256;
 
         SVQ1_CALC_CODEBOOK_ENTRIES(ff_svq1_inter_codebooks);
 
@@ -277,7 +280,7 @@ static int svq1_decode_block_non_intra(BitstreamContext *bc, uint8_t *pixels,
     return 0;
 }
 
-static int svq1_decode_motion_vector(BitstreamContext *bc, svq1_pmv *mv,
+static int svq1_decode_motion_vector(GetBitContext *bitbuf, svq1_pmv *mv,
                                      svq1_pmv **pmv)
 {
     int diff;
@@ -285,11 +288,11 @@ static int svq1_decode_motion_vector(BitstreamContext *bc, svq1_pmv *mv,
 
     for (i = 0; i < 2; i++) {
         /* get motion code */
-        diff = bitstream_read_vlc(bc, svq1_motion_component.table, 7, 2);
+        diff = get_vlc2(bitbuf, svq1_motion_component.table, 7, 2);
         if (diff < 0)
             return AVERROR_INVALIDDATA;
         else if (diff) {
-            if (bitstream_read_bit(bc))
+            if (get_bits1(bitbuf))
                 diff = -diff;
         }
 
@@ -320,7 +323,7 @@ static void svq1_skip_block(uint8_t *current, uint8_t *previous,
     }
 }
 
-static int svq1_motion_inter_block(HpelDSPContext *hdsp, BitstreamContext *bc,
+static int svq1_motion_inter_block(HpelDSPContext *hdsp, GetBitContext *bitbuf,
                                    uint8_t *current, uint8_t *previous,
                                    ptrdiff_t pitch, svq1_pmv *motion, int x, int y,
                                    int width, int height)
@@ -341,9 +344,8 @@ static int svq1_motion_inter_block(HpelDSPContext *hdsp, BitstreamContext *bc,
         pmv[2] = &motion[x / 8 + 4];
     }
 
-    result = svq1_decode_motion_vector(bc, &mv, pmv);
-
-    if (result != 0)
+    result = svq1_decode_motion_vector(bitbuf, &mv, pmv);
+    if (result)
         return result;
 
     motion[0].x         =
@@ -364,7 +366,7 @@ static int svq1_motion_inter_block(HpelDSPContext *hdsp, BitstreamContext *bc,
     return 0;
 }
 
-static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, BitstreamContext *bc,
+static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbuf,
                                       uint8_t *current, uint8_t *previous,
                                       ptrdiff_t pitch, svq1_pmv *motion, int x, int y,
                                       int width, int height)
@@ -385,9 +387,8 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, BitstreamContext *bc
         pmv[2] = &motion[(x / 8) + 4];
     }
 
-    result = svq1_decode_motion_vector(bc, &mv, pmv);
-
-    if (result != 0)
+    result = svq1_decode_motion_vector(bitbuf, &mv, pmv);
+    if (result)
         return result;
 
     /* predict and decode motion vector (1) */
@@ -398,27 +399,24 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, BitstreamContext *bc
     } else {
         pmv[1] = &motion[(x / 8) + 3];
     }
-    result = svq1_decode_motion_vector(bc, &motion[0], pmv);
-
-    if (result != 0)
+    result = svq1_decode_motion_vector(bitbuf, &motion[0], pmv);
+    if (result)
         return result;
 
     /* predict and decode motion vector (2) */
     pmv[1] = &motion[0];
     pmv[2] = &motion[(x / 8) + 1];
 
-    result = svq1_decode_motion_vector(bc, &motion[(x / 8) + 2], pmv);
-
-    if (result != 0)
+    result = svq1_decode_motion_vector(bitbuf, &motion[(x / 8) + 2], pmv);
+    if (result)
         return result;
 
     /* predict and decode motion vector (3) */
     pmv[2] = &motion[(x / 8) + 2];
     pmv[3] = &motion[(x / 8) + 3];
 
-    result = svq1_decode_motion_vector(bc, pmv[3], pmv);
-
-    if (result != 0)
+    result = svq1_decode_motion_vector(bitbuf, pmv[3], pmv);
+    if (result)
         return result;
 
     /* form predictions */
@@ -446,7 +444,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, BitstreamContext *bc
 }
 
 static int svq1_decode_delta_block(AVCodecContext *avctx, HpelDSPContext *hdsp,
-                                   BitstreamContext *bc,
+                                   GetBitContext *bitbuf,
                                    uint8_t *current, uint8_t *previous,
                                    ptrdiff_t pitch, svq1_pmv *motion, int x, int y,
                                    int width, int height)
@@ -455,7 +453,7 @@ static int svq1_decode_delta_block(AVCodecContext *avctx, HpelDSPContext *hdsp,
     int result = 0;
 
     /* get block type */
-    block_type = bitstream_read_vlc(bc, svq1_block_type.table, 2, 2);
+    block_type = get_vlc2(bitbuf, svq1_block_type.table, 2, 2);
 
     /* reset motion vectors */
     if (block_type == SVQ1_BLOCK_SKIP || block_type == SVQ1_BLOCK_INTRA) {
@@ -473,60 +471,63 @@ static int svq1_decode_delta_block(AVCodecContext *avctx, HpelDSPContext *hdsp,
         break;
 
     case SVQ1_BLOCK_INTER:
-        result = svq1_motion_inter_block(hdsp, bc, current, previous,
+        result = svq1_motion_inter_block(hdsp, bitbuf, current, previous,
                                          pitch, motion, x, y, width, height);
 
         if (result != 0) {
             ff_dlog(avctx, "Error in svq1_motion_inter_block %i\n", result);
             break;
         }
-        result = svq1_decode_block_non_intra(bc, current, pitch);
+        result = svq1_decode_block_non_intra(bitbuf, current, pitch);
         break;
 
     case SVQ1_BLOCK_INTER_4V:
-        result = svq1_motion_inter_4v_block(hdsp, bc, current, previous,
+        result = svq1_motion_inter_4v_block(hdsp, bitbuf, current, previous,
                                             pitch, motion, x, y, width, height);
 
         if (result != 0) {
             ff_dlog(avctx, "Error in svq1_motion_inter_4v_block %i\n", result);
             break;
         }
-        result = svq1_decode_block_non_intra(bc, current, pitch);
+        result = svq1_decode_block_non_intra(bitbuf, current, pitch);
         break;
 
     case SVQ1_BLOCK_INTRA:
-        result = svq1_decode_block_intra(bc, current, pitch);
+        result = svq1_decode_block_intra(bitbuf, current, pitch);
         break;
     }
 
     return result;
 }
 
-static void svq1_parse_string(BitstreamContext *bc, uint8_t *out)
+static void svq1_parse_string(GetBitContext *bitbuf, uint8_t out[257])
 {
     uint8_t seed;
     int i;
 
-    out[0] = bitstream_read(bc, 8);
+    out[0] = get_bits(bitbuf, 8);
     seed   = string_table[out[0]];
 
     for (i = 1; i <= out[0]; i++) {
-        out[i] = bitstream_read(bc, 8) ^ seed;
+        out[i] = get_bits(bitbuf, 8) ^ seed;
         seed   = string_table[out[i] ^ seed];
     }
+    out[i] = 0;
 }
 
 static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
 {
     SVQ1Context *s = avctx->priv_data;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *bitbuf = &s->gb;
     int frame_size_code;
+    int width  = s->width;
+    int height = s->height;
 
-    bitstream_skip(bc, 8); /* temporal_reference */
+    skip_bits(bitbuf, 8); /* temporal_reference */
 
     /* frame type */
     s->nonref = 0;
-    switch (bitstream_read(bc, 2)) {
+    switch (get_bits(bitbuf, 2)) {
     case 0:
         frame->pict_type = AV_PICTURE_TYPE_I;
         break;
@@ -543,9 +544,10 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
     if (frame->pict_type == AV_PICTURE_TYPE_I) {
         /* unknown fields */
         if (s->frame_code == 0x50 || s->frame_code == 0x60) {
-            int csum = bitstream_read(bc, 16);
+            int csum = get_bits(bitbuf, 16);
 
-            csum = ff_svq1_packet_checksum(bc->buffer, bc->size_in_bits >> 3,
+            csum = ff_svq1_packet_checksum(bitbuf->buffer,
+                                           bitbuf->size_in_bits >> 3,
                                            csum);
 
             ff_dlog(avctx, "%s checksum (%02x) for packet data\n",
@@ -553,54 +555,56 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
         }
 
         if ((s->frame_code ^ 0x10) >= 0x50) {
-            uint8_t msg[256];
+            uint8_t msg[257];
 
-            svq1_parse_string(bc, msg);
+            svq1_parse_string(bitbuf, msg);
 
             av_log(avctx, AV_LOG_INFO,
-                   "embedded message: \"%s\"\n", (char *)msg);
+                   "embedded message:\n%s\n", ((char *)msg) + 1);
         }
 
-        bitstream_skip(bc, 2);
-        bitstream_skip(bc, 2);
-        bitstream_skip(bc, 1);
+        skip_bits(bitbuf, 2);
+        skip_bits(bitbuf, 2);
+        skip_bits1(bitbuf);
 
         /* load frame size */
-        frame_size_code = bitstream_read(bc, 3);
+        frame_size_code = get_bits(bitbuf, 3);
 
         if (frame_size_code == 7) {
             /* load width, height (12 bits each) */
-            s->width  = bitstream_read(bc, 12);
-            s->height = bitstream_read(bc, 12);
+            width  = get_bits(bitbuf, 12);
+            height = get_bits(bitbuf, 12);
 
-            if (!s->width || !s->height)
+            if (!width || !height)
                 return AVERROR_INVALIDDATA;
         } else {
             /* get width, height from table */
-            s->width  = ff_svq1_frame_size_table[frame_size_code][0];
-            s->height = ff_svq1_frame_size_table[frame_size_code][1];
+            width  = ff_svq1_frame_size_table[frame_size_code][0];
+            height = ff_svq1_frame_size_table[frame_size_code][1];
         }
     }
 
     /* unknown fields */
-    if (bitstream_read_bit(bc) == 1) {
-        bitstream_skip(bc, 1); /* use packet checksum if (1) */
-        bitstream_skip(bc, 1); /* component checksums after image data if (1) */
+    if (get_bits1(bitbuf)) {
+        skip_bits1(bitbuf);    /* use packet checksum if (1) */
+        skip_bits1(bitbuf);    /* component checksums after image data if (1) */
 
-        if (bitstream_read(bc, 2) != 0)
+        if (get_bits(bitbuf, 2) != 0)
             return AVERROR_INVALIDDATA;
     }
 
-    if (bitstream_read_bit(bc) == 1) {
-        bitstream_skip(bc, 1);
-        bitstream_skip(bc, 4);
-        bitstream_skip(bc, 1);
-        bitstream_skip(bc, 2);
+    if (get_bits1(bitbuf)) {
+        skip_bits1(bitbuf);
+        skip_bits(bitbuf, 4);
+        skip_bits1(bitbuf);
+        skip_bits(bitbuf, 2);
 
-        while (bitstream_read_bit(bc) == 1)
-            bitstream_skip(bc, 8);
+        if (skip_1stop_8data_bits(bitbuf) < 0)
+            return AVERROR_INVALIDDATA;
     }
 
+    s->width  = width;
+    s->height = height;
     return 0;
 }
 
@@ -614,12 +618,15 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *current;
     int result, i, x, y, width, height;
     svq1_pmv *pmv;
+    int ret;
 
     /* initialize bit buffer */
-    bitstream_init8(&s->bc, buf, buf_size);
+    ret = init_get_bits8(&s->gb, buf, buf_size);
+    if (ret < 0)
+        return ret;
 
     /* decode frame header */
-    s->frame_code = bitstream_read(&s->bc, 22);
+    s->frame_code = get_bits(&s->gb, 22);
 
     if ((s->frame_code & ~0x70) || !(s->frame_code & 0x60))
         return AVERROR_INVALIDDATA;
@@ -641,18 +648,16 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
 
         memcpy(s->pkt_swapped, buf, buf_size);
         buf = s->pkt_swapped;
+        init_get_bits(&s->gb, buf, buf_size * 8);
+        skip_bits(&s->gb, 22);
 
         src = (uint32_t *)(s->pkt_swapped + 4);
 
         for (i = 0; i < 4; i++)
             src[i] = ((src[i] << 16) | (src[i] >> 16)) ^ src[7 - i];
-
-        bitstream_init8(&s->bc, buf, buf_size);
-        bitstream_skip(&s->bc, 22);
     }
 
     result = svq1_decode_frame_header(avctx, cur);
-
     if (result != 0) {
         ff_dlog(avctx, "Error in svq1_decode_frame_header %i\n", result);
         return result;
@@ -695,10 +700,10 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
             /* keyframe */
             for (y = 0; y < height; y += 16) {
                 for (x = 0; x < width; x += 16) {
-                    result = svq1_decode_block_intra(&s->bc, &current[x],
+                    result = svq1_decode_block_intra(&s->gb, &current[x],
                                                      linesize);
-                    if (result != 0) {
-                        av_log(avctx, AV_LOG_INFO,
+                    if (result) {
+                        av_log(avctx, AV_LOG_ERROR,
                                "Error in svq1_decode_block %i (keyframe)\n",
                                result);
                         goto err;
@@ -721,7 +726,7 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
             for (y = 0; y < height; y += 16) {
                 for (x = 0; x < width; x += 16) {
                     result = svq1_decode_delta_block(avctx, &s->hdsp,
-                                                     &s->bc, &current[x],
+                                                     &s->gb, &current[x],
                                                      previous, linesize,
                                                      pmv, x, y, width, height);
                     if (result != 0) {
@@ -816,6 +821,7 @@ static av_cold int svq1_decode_end(AVCodecContext *avctx)
 
     av_frame_free(&s->prev);
     av_freep(&s->pkt_swapped);
+    s->pkt_swapped_allocated = 0;
 
     return 0;
 }
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
index 963fbe4..80a8af1 100644
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -2,20 +2,20 @@
  * SVQ1 Encoder
  * Copyright (C) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,9 +36,8 @@
 #include "svq1.h"
 #include "svq1enc.h"
 #include "svq1enc_cb.h"
+#include "libavutil/avassert.h"
 
-#undef NDEBUG
-#include <assert.h>
 
 static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 {
@@ -59,7 +58,7 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
         /* output 5 unknown bits (2 + 2 + 1) */
         put_bits(&s->pb, 5, 2); /* 2 needed by quicktime decoder */
 
-        i = ff_match_2uint16(ff_svq1_frame_size_table,
+        i = ff_match_2uint16((void*)ff_svq1_frame_size_table,
                              FF_ARRAY_ELEMS(ff_svq1_frame_size_table),
                              s->frame_width, s->frame_height);
         put_bits(&s->pb, 3, i);
@@ -78,7 +77,7 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 #define THRESHOLD_MULTIPLIER 0.6
 
 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
-                               int size)
+                               intptr_t size)
 {
     int score = 0, i;
 
@@ -97,7 +96,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     int w            = 2 << (level + 2 >> 1);
     int h            = 2 << (level + 1 >> 1);
     int size         = w * h;
-    int16_t block[7][256];
+    int16_t (*block)[256] = s->encoded_block_levels[level];
     const int8_t *codebook_sum, *codebook;
     const uint16_t(*mean_vlc)[2];
     const uint8_t(*multistage_vlc)[2];
@@ -105,7 +104,9 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     best_score = 0;
     // FIXME: Optimize, this does not need to be done multiple times.
     if (intra) {
-        codebook_sum   = svq1_intra_codebook_sum[level];
+        // level is 5 when encode_block is called from svq1_encode_plane
+        // and always < 4 when called recursively from this function.
+        codebook_sum   = level < 4 ? svq1_intra_codebook_sum[level] : NULL;
         codebook       = ff_svq1_intra_codebooks[level];
         mean_vlc       = ff_svq1_intra_mean_vlc;
         multistage_vlc = ff_svq1_intra_multistage_vlc[level];
@@ -118,7 +119,8 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
             }
         }
     } else {
-        codebook_sum   = svq1_inter_codebook_sum[level];
+        // level is 5 or < 4, see above for details.
+        codebook_sum   = level < 4 ? svq1_inter_codebook_sum[level] : NULL;
         codebook       = ff_svq1_inter_codebooks[level];
         mean_vlc       = ff_svq1_inter_mean_vlc + 256;
         multistage_vlc = ff_svq1_inter_multistage_vlc[level];
@@ -153,7 +155,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                 score  = sqr - (diff * (int64_t)diff >> (level + 3)); // FIXME: 64 bits slooow
                 if (score < best_vector_score) {
                     int mean = diff + (size >> 1) >> (level + 3);
-                    assert(mean > -300 && mean < 300);
+                    av_assert2(mean > -300 && mean < 300);
                     mean               = av_clip(mean, intra ? 0 : -256, 255);
                     best_vector_score  = score;
                     best_vector[stage] = i;
@@ -161,7 +163,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                     best_vector_mean   = mean;
                 }
             }
-            assert(best_vector_mean != -999);
+            av_assert0(best_vector_mean != -999);
             vector = codebook + stage * size * 16 + best_vector[stage] * size;
             for (j = 0; j < size; j++)
                 block[stage + 1][j] = block[stage][j] - vector[j];
@@ -205,10 +207,10 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
         put_bits(&s->reorder_pb[level], 1, split);
 
     if (!split) {
-        assert(best_mean >= 0 && best_mean < 256 || !intra);
-        assert(best_mean >= -256 && best_mean < 256);
-        assert(best_count >= 0 && best_count < 7);
-        assert(level < 4 || best_count == 0);
+        av_assert1(best_mean >= 0 && best_mean < 256 || !intra);
+        av_assert1(best_mean >= -256 && best_mean < 256);
+        av_assert1(best_count >= 0 && best_count < 7);
+        av_assert1(level < 4 || best_count == 0);
 
         /* output the encoding */
         put_bits(&s->reorder_pb[level],
@@ -218,7 +220,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                  mean_vlc[best_mean][0]);
 
         for (i = 0; i < best_count; i++) {
-            assert(best_vector[i] >= 0 && best_vector[i] < 16);
+            av_assert2(best_vector[i] >= 0 && best_vector[i] < 16);
             put_bits(&s->reorder_pb[level], 4, best_vector[i]);
         }
 
@@ -232,6 +234,15 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     return best_score;
 }
 
+static void init_block_index(MpegEncContext *s){
+    s->block_index[0]= s->b8_stride*(s->mb_y*2    )     + s->mb_x*2;
+    s->block_index[1]= s->b8_stride*(s->mb_y*2    ) + 1 + s->mb_x*2;
+    s->block_index[2]= s->b8_stride*(s->mb_y*2 + 1)     + s->mb_x*2;
+    s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) + 1 + s->mb_x*2;
+    s->block_index[4]= s->mb_stride*(s->mb_y + 1)                + s->b8_stride*s->mb_height*2 + s->mb_x;
+    s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x;
+}
+
 static int svq1_encode_plane(SVQ1EncContext *s, int plane,
                              unsigned char *src_plane,
                              unsigned char *ref_plane,
@@ -243,7 +254,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
     int block_width, block_height;
     int level;
     int threshold[6];
-    uint8_t *src     = s->scratchbuf + stride * 16;
+    uint8_t *src     = s->scratchbuf + stride * 32;
     const int lambda = (s->quality * s->quality) >>
                        (2 * FF_LAMBDA_SHIFT);
 
@@ -327,8 +338,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
 
             for (x = 0; x < block_width; x++) {
                 s->m.mb_x = x;
-                ff_init_block_index(&s->m);
-                ff_update_block_index(&s->m);
+                init_block_index(&s->m);
 
                 ff_estimate_p_frame_motion(&s->m, x, y);
             }
@@ -353,8 +363,8 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
 
         s->m.mb_y = y;
         for (x = 0; x < block_width; x++) {
-            uint8_t reorder_buffer[3][6][7 * 32];
-            int count[3][6];
+            uint8_t reorder_buffer[2][6][7 * 32];
+            int count[2][6];
             int offset       = y * 16 * stride + x * 16;
             uint8_t *decoded = decoded_plane + offset;
             uint8_t *ref     = ref_plane + offset;
@@ -368,8 +378,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
             }
 
             s->m.mb_x = x;
-            ff_init_block_index(&s->m);
-            ff_update_block_index(&s->m);
+            init_block_index(&s->m);
 
             if (s->pict_type == AV_PICTURE_TYPE_I ||
                 (s->m.mb_type[x + y * s->m.mb_stride] &
@@ -410,23 +419,23 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
                     s->m.pb = s->reorder_pb[5];
                     mx      = motion_ptr[0];
                     my      = motion_ptr[1];
-                    assert(mx     >= -32 && mx     <= 31);
-                    assert(my     >= -32 && my     <= 31);
-                    assert(pred_x >= -32 && pred_x <= 31);
-                    assert(pred_y >= -32 && pred_y <= 31);
-                    ff_h263_encode_motion(&s->m, mx - pred_x, 1);
-                    ff_h263_encode_motion(&s->m, my - pred_y, 1);
+                    av_assert1(mx     >= -32 && mx     <= 31);
+                    av_assert1(my     >= -32 && my     <= 31);
+                    av_assert1(pred_x >= -32 && pred_x <= 31);
+                    av_assert1(pred_y >= -32 && pred_y <= 31);
+                    ff_h263_encode_motion(&s->m.pb, mx - pred_x, 1);
+                    ff_h263_encode_motion(&s->m.pb, my - pred_y, 1);
                     s->reorder_pb[5] = s->m.pb;
                     score[1]        += lambda * put_bits_count(&s->reorder_pb[5]);
 
                     dxy = (mx & 1) + 2 * (my & 1);
 
-                    s->hdsp.put_pixels_tab[0][dxy](temp + 16,
+                    s->hdsp.put_pixels_tab[0][dxy](temp + 16*stride,
                                                    ref + (mx >> 1) +
                                                    stride * (my >> 1),
                                                    stride, 16);
 
-                    score[1] += encode_block(s, src + 16 * x, temp + 16,
+                    score[1] += encode_block(s, src + 16 * x, temp + 16*stride,
                                              decoded, stride, 5, 64, lambda, 0);
                     best      = score[1] <= score[0];
 
@@ -437,8 +446,6 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
                     if (score[2] < score[best] && mx == 0 && my == 0) {
                         best = 2;
                         s->hdsp.put_pixels_tab[0][0](decoded, ref, stride, 16);
-                        for (i = 0; i < 6; i++)
-                            count[2][i] = 0;
                         put_bits(&s->pb, vlc[1], vlc[0]);
                     }
                 }
@@ -462,6 +469,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
 
             s->rd_total += score[best];
 
+            if (best != 2)
             for (i = 5; i >= 0; i--)
                 avpriv_copy_bits(&s->pb, reorder_buffer[best][i],
                                  count[best][i]);
@@ -508,6 +516,11 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
     SVQ1EncContext *const s = avctx->priv_data;
     int ret;
 
+    if (avctx->width >= 4096 || avctx->height >= 4096) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions too large, maximum is 4095x4095\n");
+        return AVERROR(EINVAL);
+    }
+
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
@@ -569,14 +582,10 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 {
     SVQ1EncContext *const s = avctx->priv_data;
     int i, ret;
-    uint8_t *sd;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, s->y_block_width * s->y_block_height *
-                             MAX_MB_BYTES * 3 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->y_block_width * s->y_block_height *
+                             MAX_MB_BYTES*3 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (avctx->pix_fmt != AV_PIX_FMT_YUV410P) {
         av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
@@ -584,9 +593,9 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     if (!s->current_picture->data[0]) {
-        ret = ff_get_buffer(avctx, s->current_picture, 0);
-        if (ret < 0)
+        if ((ret = ff_get_buffer(avctx, s->current_picture, 0)) < 0) {
             return ret;
+        }
     }
     if (!s->last_picture->data[0]) {
         ret = ff_get_buffer(avctx, s->last_picture, 0);
@@ -594,7 +603,7 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             return ret;
     }
     if (!s->scratchbuf) {
-        s->scratchbuf = av_malloc(s->current_picture->linesize[0] * 16 * 2);
+        s->scratchbuf = av_malloc_array(s->current_picture->linesize[0], 16 * 3);
         if (!s->scratchbuf)
             return AVERROR(ENOMEM);
     }
@@ -616,21 +625,20 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = pict->quality;
+    ff_side_data_set_encoder_stats(pkt, pict->quality, NULL, 0, s->pict_type);
 
     svq1_write_header(s, s->pict_type);
-    for (i = 0; i < 3; i++)
-        if (svq1_encode_plane(s, i,
+    for (i = 0; i < 3; i++) {
+        int ret = svq1_encode_plane(s, i,
                               pict->data[i],
                               s->last_picture->data[i],
                               s->current_picture->data[i],
                               s->frame_width  / (i ? 4 : 1),
                               s->frame_height / (i ? 4 : 1),
                               pict->linesize[i],
-                              s->current_picture->linesize[i]) < 0) {
+                              s->current_picture->linesize[i]);
+        emms_c();
+        if (ret < 0) {
             int j;
             for (j = 0; j < i; j++) {
                 av_freep(&s->motion_val8[j]);
@@ -639,6 +647,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_freep(&s->scratchbuf);
             return -1;
         }
+    }
 
     // avpriv_align_put_bits(&s->pb);
     while (put_bits_count(&s->pb) & 31)
diff --git a/libavcodec/svq1enc.h b/libavcodec/svq1enc.h
index 94458d6..b4ef763 100644
--- a/libavcodec/svq1enc.h
+++ b/libavcodec/svq1enc.h
@@ -1,20 +1,20 @@
 /*
  * SVQ1 encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,6 +62,8 @@ typedef struct SVQ1EncContext {
     int c_block_width;
     int c_block_height;
 
+    DECLARE_ALIGNED(16, int16_t, encoded_block_levels)[6][7][256];
+
     uint16_t *mb_type;
     uint32_t *dummy;
     int16_t (*motion_val8[3])[2];
@@ -74,7 +76,7 @@ typedef struct SVQ1EncContext {
     int motion_est;
 
     int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
-                             int size);
+                             intptr_t size);
 } SVQ1EncContext;
 
 void ff_svq1enc_init_ppc(SVQ1EncContext *c);
diff --git a/libavcodec/svq1enc_cb.h b/libavcodec/svq1enc_cb.h
index a5cd179..1edb4ec 100644
--- a/libavcodec/svq1enc_cb.h
+++ b/libavcodec/svq1enc_cb.h
@@ -2,20 +2,20 @@
  * SVQ1 Encoder
  * Copyright (C) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 667d390..18a4448 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2003 The Libav Project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,20 +37,18 @@
  *
  * You will know you have these parameters passed correctly when the decoder
  * correctly decodes this file:
- *  http://samples.libav.org/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
+ *  http://samples.mplayerhq.hu/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
  */
 
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
-
-#include "bitstream.h"
-#include "golomb.h"
 #include "internal.h"
 #include "avcodec.h"
 #include "mpegutils.h"
 #include "h264dec.h"
 #include "h264data.h"
+#include "golomb.h"
 #include "hpeldsp.h"
 #include "mathops.h"
 #include "rectangle.h"
@@ -93,14 +91,16 @@ typedef struct SVQ3Context {
     SVQ3Frame *cur_pic;
     SVQ3Frame *next_pic;
     SVQ3Frame *last_pic;
-    BitstreamContext bc;
-    BitstreamContext bc_slice;
+    GetBitContext gb;
+    GetBitContext gb_slice;
     uint8_t *slice_buf;
     int slice_size;
     int halfpel_flag;
     int thirdpel_flag;
-    int unknown_flag;
+    int has_watermark;
     uint32_t watermark_key;
+    uint8_t *buf;
+    int buf_size;
     int adaptive_quant;
     int next_p_frame_damaged;
     int h_edge_pos;
@@ -115,6 +115,7 @@ typedef struct SVQ3Context {
     int prev_frame_num;
 
     enum AVPictureType pict_type;
+    enum AVPictureType slice_type;
     int low_delay;
 
     int mb_x, mb_y;
@@ -218,9 +219,11 @@ static const uint32_t svq3_dequant_coeff[32] = {
     61694, 68745, 77615, 89113, 100253, 109366, 126635, 141533
 };
 
+static int svq3_decode_end(AVCodecContext *avctx);
+
 static void svq3_luma_dc_dequant_idct_c(int16_t *output, int16_t *input, int qp)
 {
-    const int qmul = svq3_dequant_coeff[qp];
+    const unsigned qmul = svq3_dequant_coeff[qp];
 #define stride 16
     int i;
     int temp[16];
@@ -245,10 +248,10 @@ static void svq3_luma_dc_dequant_idct_c(int16_t *output, int16_t *input, int qp)
         const int z2     =  7 *  temp[4 * 1 + i] - 17 * temp[4 * 3 + i];
         const int z3     = 17 *  temp[4 * 1 + i] +  7 * temp[4 * 3 + i];
 
-        output[stride *  0 + offset] = (z0 + z3) * qmul + 0x80000 >> 20;
-        output[stride *  2 + offset] = (z1 + z2) * qmul + 0x80000 >> 20;
-        output[stride *  8 + offset] = (z1 - z2) * qmul + 0x80000 >> 20;
-        output[stride * 10 + offset] = (z0 - z3) * qmul + 0x80000 >> 20;
+        output[stride *  0 + offset] = (int)((z0 + z3) * qmul + 0x80000) >> 20;
+        output[stride *  2 + offset] = (int)((z1 + z2) * qmul + 0x80000) >> 20;
+        output[stride *  8 + offset] = (int)((z1 - z2) * qmul + 0x80000) >> 20;
+        output[stride * 10 + offset] = (int)((z0 - z3) * qmul + 0x80000) >> 20;
     }
 }
 #undef stride
@@ -260,7 +263,7 @@ static void svq3_add_idct_c(uint8_t *dst, int16_t *block,
     int i;
 
     if (dc) {
-        dc       = 13 * 13 * (dc == 1 ? 1538 * block[0]
+        dc       = 13 * 13 * (dc == 1 ? 1538U* block[0]
                                       : qmul * (block[0] >> 3) / 2);
         block[0] = 0;
     }
@@ -278,36 +281,39 @@ static void svq3_add_idct_c(uint8_t *dst, int16_t *block,
     }
 
     for (i = 0; i < 4; i++) {
-        const int z0 = 13 * (block[i + 4 * 0] +      block[i + 4 * 2]);
-        const int z1 = 13 * (block[i + 4 * 0] -      block[i + 4 * 2]);
-        const int z2 =  7 *  block[i + 4 * 1] - 17 * block[i + 4 * 3];
-        const int z3 = 17 *  block[i + 4 * 1] +  7 * block[i + 4 * 3];
-        const int rr = (dc + 0x80000);
-
-        dst[i + stride * 0] = av_clip_uint8(dst[i + stride * 0] + ((z0 + z3) * qmul + rr >> 20));
-        dst[i + stride * 1] = av_clip_uint8(dst[i + stride * 1] + ((z1 + z2) * qmul + rr >> 20));
-        dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20));
-        dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20));
+        const unsigned z0 = 13 * (block[i + 4 * 0] +      block[i + 4 * 2]);
+        const unsigned z1 = 13 * (block[i + 4 * 0] -      block[i + 4 * 2]);
+        const unsigned z2 =  7 *  block[i + 4 * 1] - 17 * block[i + 4 * 3];
+        const unsigned z3 = 17 *  block[i + 4 * 1] +  7 * block[i + 4 * 3];
+        const int rr = (dc + 0x80000u);
+
+        dst[i + stride * 0] = av_clip_uint8(dst[i + stride * 0] + ((int)((z0 + z3) * qmul + rr) >> 20));
+        dst[i + stride * 1] = av_clip_uint8(dst[i + stride * 1] + ((int)((z1 + z2) * qmul + rr) >> 20));
+        dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((int)((z1 - z2) * qmul + rr) >> 20));
+        dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((int)((z0 - z3) * qmul + rr) >> 20));
     }
 
     memset(block, 0, 16 * sizeof(int16_t));
 }
 
-static inline int svq3_decode_block(BitstreamContext *bc, int16_t *block,
+static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
                                     int index, const int type)
 {
     static const uint8_t *const scan_patterns[4] = {
         luma_dc_zigzag_scan, ff_zigzag_scan, svq3_scan, ff_h264_chroma_dc_scan
     };
 
-    int run, level, limit;
+    int run, level, sign, limit;
     unsigned vlc;
     const int intra           = 3 * type >> 2;
     const uint8_t *const scan = scan_patterns[type];
 
     for (limit = (16 >> intra); index < 16; index = limit, limit += 8) {
-        for (; (vlc = get_interleaved_ue_golomb(bc)) != 0; index++) {
-            int sign = (vlc & 1) ? 0 : -1;
+        for (; (vlc = get_interleaved_ue_golomb(gb)) != 0; index++) {
+            if ((int32_t)vlc < 0)
+                return -1;
+
+            sign     = (vlc & 1) ? 0 : -1;
             vlc      = vlc + 1 >> 1;
 
             if (type == 3) {
@@ -322,20 +328,19 @@ static inline int svq3_decode_block(BitstreamContext *bc, int16_t *block,
                     level = (vlc + 9 >> 2) - run;
                 }
             } else {
-                if (vlc < 16) {
+                if (vlc < 16U) {
                     run   = svq3_dct_tables[intra][vlc].run;
                     level = svq3_dct_tables[intra][vlc].level;
                 } else if (intra) {
                     run   = vlc & 0x7;
-                    level = (vlc >> 3) +
-                            ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
+                    level = (vlc >> 3) + ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
                 } else {
                     run   = vlc & 0xF;
-                    level = (vlc >> 4) +
-                            ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
+                    level = (vlc >> 4) + ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
                 }
             }
 
+
             if ((index += run) >= limit)
                 return -1;
 
@@ -519,8 +524,8 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int mode,
             if (mode != PREDICT_MODE) {
                 svq3_pred_motion(s, k, part_width >> 2, dir, 1, &mx, &my);
             } else {
-                mx = s->next_pic->motion_val[0][b_xy][0] << 1;
-                my = s->next_pic->motion_val[0][b_xy][1] << 1;
+                mx = s->next_pic->motion_val[0][b_xy][0] * 2;
+                my = s->next_pic->motion_val[0][b_xy][1] * 2;
 
                 if (dir == 0) {
                     mx = mx * s->frame_num_offset /
@@ -543,10 +548,10 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int mode,
             if (mode == PREDICT_MODE) {
                 dx = dy = 0;
             } else {
-                dy = get_interleaved_se_golomb(&s->bc_slice);
-                dx = get_interleaved_se_golomb(&s->bc_slice);
+                dy = get_interleaved_se_golomb(&s->gb_slice);
+                dx = get_interleaved_se_golomb(&s->gb_slice);
 
-                if (dx == INVALID_VLC || dy == INVALID_VLC) {
+                if (dx != (int16_t)dx || dy != (int16_t)dy) {
                     av_log(s->avctx, AV_LOG_ERROR, "invalid MV vlc\n");
                     return -1;
                 }
@@ -557,8 +562,8 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int mode,
                 int fx, fy;
                 mx  = (mx + 1 >> 1) + dx;
                 my  = (my + 1 >> 1) + dy;
-                fx  = (unsigned)(mx + 0x3000) / 3 - 0x1000;
-                fy  = (unsigned)(my + 0x3000) / 3 - 0x1000;
+                fx  = (unsigned)(mx + 0x30000) / 3 - 0x10000;
+                fy  = (unsigned)(my + 0x30000) / 3 - 0x10000;
                 dxy = (mx - 3 * fx) + 4 * (my - 3 * fy);
 
                 svq3_mc_dir_part(s, x, y, part_width, part_height,
@@ -566,8 +571,8 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int mode,
                 mx += mx;
                 my += my;
             } else if (mode == HALFPEL_MODE || mode == PREDICT_MODE) {
-                mx  = (unsigned)(mx + 1 + 0x3000) / 3 + dx - 0x1000;
-                my  = (unsigned)(my + 1 + 0x3000) / 3 + dy - 0x1000;
+                mx  = (unsigned)(mx + 1 + 0x30000) / 3 + dx - 0x10000;
+                my  = (unsigned)(my + 1 + 0x30000) / 3 + dy - 0x10000;
                 dxy = (mx & 1) + 2 * (my & 1);
 
                 svq3_mc_dir_part(s, x, y, part_width, part_height,
@@ -575,8 +580,8 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int mode,
                 mx *= 3;
                 my *= 3;
             } else {
-                mx = (unsigned)(mx + 3 + 0x6000) / 6 + dx - 0x1000;
-                my = (unsigned)(my + 3 + 0x6000) / 6 + dy - 0x1000;
+                mx = (unsigned)(mx + 3 + 0x60000) / 6 + dx - 0x10000;
+                my = (unsigned)(my + 3 + 0x60000) / 6 + dy - 0x10000;
 
                 svq3_mc_dir_part(s, x, y, part_width, part_height,
                                  mx, my, 0, 0, dir, avg);
@@ -642,7 +647,7 @@ static av_always_inline void hl_decode_mb_predict_luma(SVQ3Context *s,
             int nnz, tr;
             if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
                 const int topright_avail = (s->topright_samples_available << i) & 0x8000;
-                assert(s->mb_y || linesize <= block_offset[i]);
+                av_assert2(s->mb_y || linesize <= block_offset[i]);
                 if (!topright_avail) {
                     tr       = ptr[3 - linesize] * 0x01010101u;
                     topright = (uint8_t *)&tr;
@@ -745,10 +750,10 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
             mb_type = MB_TYPE_16x16;
         }
     } else if (mb_type < 8) {     /* INTER */
-        if (s->thirdpel_flag && s->halfpel_flag == !bitstream_read_bit(&s->bc_slice))
+        if (s->thirdpel_flag && s->halfpel_flag == !get_bits1(&s->gb_slice))
             mode = THIRDPEL_MODE;
         else if (s->halfpel_flag &&
-                 s->thirdpel_flag == !bitstream_read_bit(&s->bc_slice))
+                 s->thirdpel_flag == !get_bits1(&s->gb_slice))
             mode = HALFPEL_MODE;
         else
             mode = FULLPEL_MODE;
@@ -850,9 +855,9 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
 
             /* decode prediction codes for luma blocks */
             for (i = 0; i < 16; i += 2) {
-                vlc = get_interleaved_ue_golomb(&s->bc_slice);
+                vlc = get_interleaved_ue_golomb(&s->gb_slice);
 
-                if (vlc >= 25) {
+                if (vlc >= 25U) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "luma prediction:%"PRIu32"\n", vlc);
                     return -1;
@@ -928,7 +933,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
 
     if (!IS_INTRA16x16(mb_type) &&
         (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B)) {
-        if ((vlc = get_interleaved_ue_golomb(&s->bc_slice)) >= 48) {
+        if ((vlc = get_interleaved_ue_golomb(&s->gb_slice)) >= 48U){
             av_log(s->avctx, AV_LOG_ERROR, "cbp_vlc=%"PRIu32"\n", vlc);
             return -1;
         }
@@ -938,7 +943,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
     }
     if (IS_INTRA16x16(mb_type) ||
         (s->pict_type != AV_PICTURE_TYPE_I && s->adaptive_quant && cbp)) {
-        s->qscale += get_interleaved_se_golomb(&s->bc_slice);
+        s->qscale += get_interleaved_se_golomb(&s->gb_slice);
 
         if (s->qscale > 31u) {
             av_log(s->avctx, AV_LOG_ERROR, "qscale:%d\n", s->qscale);
@@ -948,7 +953,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
     if (IS_INTRA16x16(mb_type)) {
         AV_ZERO128(s->mb_luma_dc[0] + 0);
         AV_ZERO128(s->mb_luma_dc[0] + 8);
-        if (svq3_decode_block(&s->bc_slice, s->mb_luma_dc[0], 0, 1)) {
+        if (svq3_decode_block(&s->gb_slice, s->mb_luma_dc[0], 0, 1)) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "error while decoding intra luma dc\n");
             return -1;
@@ -967,7 +972,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
                               : (4 * i + j);
                     s->non_zero_count_cache[scan8[k]] = 1;
 
-                    if (svq3_decode_block(&s->bc_slice, &s->mb[16 * k], index, type)) {
+                    if (svq3_decode_block(&s->gb_slice, &s->mb[16 * k], index, type)) {
                         av_log(s->avctx, AV_LOG_ERROR,
                                "error while decoding block\n");
                         return -1;
@@ -977,7 +982,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
 
         if ((cbp & 0x30)) {
             for (i = 1; i < 3; ++i)
-                if (svq3_decode_block(&s->bc_slice, &s->mb[16 * 16 * i], 0, 3)) {
+                if (svq3_decode_block(&s->gb_slice, &s->mb[16 * 16 * i], 0, 3)) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "error while decoding chroma dc block\n");
                     return -1;
@@ -989,7 +994,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
                         k                                 = 16 * i + j;
                         s->non_zero_count_cache[scan8[k]] = 1;
 
-                        if (svq3_decode_block(&s->bc_slice, &s->mb[16 * k], 1, 1)) {
+                        if (svq3_decode_block(&s->gb_slice, &s->mb[16 * k], 1, 1)) {
                             av_log(s->avctx, AV_LOG_ERROR,
                                    "error while decoding chroma ac block\n");
                             return -1;
@@ -1017,7 +1022,7 @@ static int svq3_decode_slice_header(AVCodecContext *avctx)
     int i, header;
     unsigned slice_id;
 
-    header = bitstream_read(&s->bc, 8);
+    header = get_bits(&s->gb, 8);
 
     if (((header & 0x9F) != 1 && (header & 0x9F) != 2) || (header & 0x60) == 0) {
         /* TODO: what? */
@@ -1027,62 +1032,64 @@ static int svq3_decode_slice_header(AVCodecContext *avctx)
         int slice_bits, slice_bytes, slice_length;
         int length = header >> 5 & 3;
 
-        slice_length = bitstream_peek(&s->bc, 8 * length);
+        slice_length = show_bits(&s->gb, 8 * length);
         slice_bits   = slice_length * 8;
         slice_bytes  = slice_length + length - 1;
 
-        bitstream_skip(&s->bc, 8);
+        skip_bits(&s->gb, 8);
 
         av_fast_malloc(&s->slice_buf, &s->slice_size, slice_bytes + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!s->slice_buf)
             return AVERROR(ENOMEM);
 
-        if (slice_bytes * 8 > bitstream_bits_left(&s->bc)) {
+        if (slice_bytes * 8LL > get_bits_left(&s->gb)) {
             av_log(avctx, AV_LOG_ERROR, "slice after bitstream end\n");
             return AVERROR_INVALIDDATA;
         }
-        memcpy(s->slice_buf, s->bc.buffer + bitstream_tell(&s->bc) / 8, slice_bytes);
+        memcpy(s->slice_buf, s->gb.buffer + s->gb.index / 8, slice_bytes);
 
         if (s->watermark_key) {
-            uint32_t header = AV_RL32(&s->bc_slice.buffer[1]);
-            AV_WL32(&s->bc_slice.buffer[1], header ^ s->watermark_key);
+            uint32_t header = AV_RL32(&s->slice_buf[1]);
+            AV_WL32(&s->slice_buf[1], header ^ s->watermark_key);
         }
+        init_get_bits(&s->gb_slice, s->slice_buf, slice_bits);
+
         if (length > 0) {
-            memcpy(s->slice_buf, &s->slice_buf[slice_length], length - 1);
+            memmove(s->slice_buf, &s->slice_buf[slice_length], length - 1);
         }
-        bitstream_skip(&s->bc, slice_bytes * 8);
-        bitstream_init(&s->bc_slice, s->slice_buf, slice_bits);
+        skip_bits_long(&s->gb, slice_bytes * 8);
     }
 
-    if ((slice_id = get_interleaved_ue_golomb(&s->bc_slice)) >= 3) {
+    if ((slice_id = get_interleaved_ue_golomb(&s->gb_slice)) >= 3) {
         av_log(s->avctx, AV_LOG_ERROR, "illegal slice type %u \n", slice_id);
         return -1;
     }
 
-    s->pict_type = ff_h264_golomb_to_pict_type[slice_id];
+    s->slice_type = ff_h264_golomb_to_pict_type[slice_id];
 
     if ((header & 0x9F) == 2) {
         i = (s->mb_num < 64) ? 6 : (1 + av_log2(s->mb_num - 1));
-        bitstream_read(&s->bc_slice, i);
-    } else {
-        bitstream_skip(&s->bc_slice, 1);
+        get_bits(&s->gb_slice, i);
+    } else if (get_bits1(&s->gb_slice)) {
+        avpriv_report_missing_feature(s->avctx, "Media key encryption");
+        return AVERROR_PATCHWELCOME;
     }
 
-    s->slice_num      = bitstream_read(&s->bc_slice, 8);
-    s->qscale         = bitstream_read(&s->bc_slice, 5);
-    s->adaptive_quant = bitstream_read_bit(&s->bc_slice);
+    s->slice_num      = get_bits(&s->gb_slice, 8);
+    s->qscale         = get_bits(&s->gb_slice, 5);
+    s->adaptive_quant = get_bits1(&s->gb_slice);
 
     /* unknown fields */
-    bitstream_skip(&s->bc_slice, 1);
+    skip_bits1(&s->gb_slice);
 
-    if (s->unknown_flag)
-        bitstream_skip(&s->bc_slice, 1);
+    if (s->has_watermark)
+        skip_bits1(&s->gb_slice);
 
-    bitstream_skip(&s->bc_slice, 1);
-    bitstream_skip(&s->bc_slice, 2);
+    skip_bits1(&s->gb_slice);
+    skip_bits(&s->gb_slice, 2);
 
-    while (bitstream_read_bit(&s->bc_slice))
-        bitstream_skip(&s->bc_slice, 8);
+    if (skip_1stop_8data_bits(&s->gb_slice) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* reset intra predictors and invalidate motion vector references */
     if (s->mb_x > 0) {
@@ -1124,15 +1131,14 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     unsigned char *extradata_end;
     unsigned int size;
     int marker_found = 0;
+    int ret;
 
     s->cur_pic  = av_mallocz(sizeof(*s->cur_pic));
     s->last_pic = av_mallocz(sizeof(*s->last_pic));
     s->next_pic = av_mallocz(sizeof(*s->next_pic));
     if (!s->next_pic || !s->last_pic || !s->cur_pic) {
-        av_freep(&s->cur_pic);
-        av_freep(&s->last_pic);
-        av_freep(&s->next_pic);
-        return AVERROR(ENOMEM);
+        ret = AVERROR(ENOMEM);
+        goto fail;
     }
 
     s->cur_pic->f  = av_frame_alloc();
@@ -1145,6 +1151,9 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_SVQ3, 8, 1);
     ff_videodsp_init(&s->vdsp, 8);
 
+
+    avctx->bits_per_raw_sample = 8;
+
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_tpeldsp_init(&s->tdsp);
 
@@ -1154,7 +1163,7 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     s->avctx         = avctx;
     s->halfpel_flag  = 1;
     s->thirdpel_flag = 1;
-    s->unknown_flag  = 0;
+    s->has_watermark = 0;
 
     /* prowl for the "SEQH" marker in the extradata */
     extradata     = (unsigned char *)avctx->extradata;
@@ -1171,16 +1180,19 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 
     /* if a match was found, parse the extra data */
     if (marker_found) {
-        BitstreamContext bc;
+        GetBitContext gb;
         int frame_size_code;
+        int unk0, unk1, unk2, unk3, unk4;
 
         size = AV_RB32(&extradata[4]);
-        if (size > extradata_end - extradata - 8)
-            return AVERROR_INVALIDDATA;
-        bitstream_init8(&bc, extradata + 8, size);
+        if (size > extradata_end - extradata - 8) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+        init_get_bits(&gb, extradata + 8, size * 8);
 
         /* 'frame size code' and optional 'width, height' */
-        frame_size_code = bitstream_read(&bc, 3);
+        frame_size_code = get_bits(&gb, 3);
         switch (frame_size_code) {
         case 0:
             avctx->width  = 160;
@@ -1211,48 +1223,59 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
             avctx->height = 240;
             break;
         case 7:
-            avctx->width  = bitstream_read(&bc, 12);
-            avctx->height = bitstream_read(&bc, 12);
+            avctx->width  = get_bits(&gb, 12);
+            avctx->height = get_bits(&gb, 12);
             break;
         }
 
-        s->halfpel_flag  = bitstream_read_bit(&bc);
-        s->thirdpel_flag = bitstream_read_bit(&bc);
+        s->halfpel_flag  = get_bits1(&gb);
+        s->thirdpel_flag = get_bits1(&gb);
 
         /* unknown fields */
-        bitstream_skip(&bc, 1);
-        bitstream_skip(&bc, 1);
-        bitstream_skip(&bc, 1);
-        bitstream_skip(&bc, 1);
+        unk0 = get_bits1(&gb);
+        unk1 = get_bits1(&gb);
+        unk2 = get_bits1(&gb);
+        unk3 = get_bits1(&gb);
 
-        s->low_delay = bitstream_read_bit(&bc);
+        s->low_delay = get_bits1(&gb);
 
         /* unknown field */
-        bitstream_skip(&bc, 1);
+        unk4 = get_bits1(&gb);
+
+        av_log(avctx, AV_LOG_DEBUG, "Unknown fields %d %d %d %d %d\n",
+               unk0, unk1, unk2, unk3, unk4);
 
-        while (bitstream_read_bit(&bc))
-            bitstream_skip(&bc, 8);
+        if (skip_1stop_8data_bits(&gb) < 0) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
 
-        s->unknown_flag  = bitstream_read_bit(&bc);
+        s->has_watermark  = get_bits1(&gb);
         avctx->has_b_frames = !s->low_delay;
-        if (s->unknown_flag) {
+        if (s->has_watermark) {
 #if CONFIG_ZLIB
-            unsigned watermark_width  = get_interleaved_ue_golomb(&bc);
-            unsigned watermark_height = get_interleaved_ue_golomb(&bc);
-            int u1                    = get_interleaved_ue_golomb(&bc);
-            int u2                    = bitstream_read(&bc, 8);
-            int u3                    = bitstream_read(&bc, 2);
-            int u4                    = get_interleaved_ue_golomb(&bc);
+            unsigned watermark_width  = get_interleaved_ue_golomb(&gb);
+            unsigned watermark_height = get_interleaved_ue_golomb(&gb);
+            int u1                    = get_interleaved_ue_golomb(&gb);
+            int u2                    = get_bits(&gb, 8);
+            int u3                    = get_bits(&gb, 2);
+            int u4                    = get_interleaved_ue_golomb(&gb);
             unsigned long buf_len     = watermark_width *
                                         watermark_height * 4;
-            int offset                = bitstream_tell(&bc) + 7 >> 3;
+            int offset                = get_bits_count(&gb) + 7 >> 3;
             uint8_t *buf;
 
-            if (watermark_height > 0 &&
-                (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height)
-                return -1;
+            if (watermark_height <= 0 ||
+                (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height) {
+                ret = -1;
+                goto fail;
+            }
 
             buf = av_malloc(buf_len);
+            if (!buf) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
             av_log(avctx, AV_LOG_DEBUG, "watermark size: %ux%u\n",
                    watermark_width, watermark_height);
             av_log(avctx, AV_LOG_DEBUG,
@@ -1263,7 +1286,8 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
                 av_log(avctx, AV_LOG_ERROR,
                        "could not uncompress watermark logo\n");
                 av_free(buf);
-                return -1;
+                ret = -1;
+                goto fail;
             }
             s->watermark_key = ff_svq1_packet_checksum(buf, buf_len, 0);
             s->watermark_key = s->watermark_key << 16 | s->watermark_key;
@@ -1273,7 +1297,8 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 #else
             av_log(avctx, AV_LOG_ERROR,
                    "this svq3 file contains watermark which need zlib support compiled in\n");
-            return -1;
+            ret = -1;
+            goto fail;
 #endif
         }
     }
@@ -1305,6 +1330,9 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     init_dequant4_coeff_table(s);
 
     return 0;
+fail:
+    svq3_decode_end(avctx);
+    return ret;
 }
 
 static void free_picture(AVCodecContext *avctx, SVQ3Frame *pic)
@@ -1356,7 +1384,7 @@ static int get_buffer(AVCodecContext *avctx, SVQ3Frame *pic)
         goto fail;
 
     if (!s->edge_emu_buffer) {
-        s->edge_emu_buffer = av_mallocz(pic->f->linesize[0] * 17);
+        s->edge_emu_buffer = av_mallocz_array(pic->f->linesize[0], 17);
         if (!s->edge_emu_buffer)
             return AVERROR(ENOMEM);
     }
@@ -1370,9 +1398,10 @@ fail:
 static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     SVQ3Context *s     = avctx->priv_data;
     int buf_size       = avpkt->size;
+    int left;
+    uint8_t *buf;
     int ret, m, i;
 
     /* special case for last picture */
@@ -1387,15 +1416,27 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
         return 0;
     }
 
-    ret = bitstream_init8(&s->bc, buf, buf_size);
+    s->mb_x = s->mb_y = s->mb_xy = 0;
+
+    if (s->watermark_key) {
+        av_fast_padded_malloc(&s->buf, &s->buf_size, buf_size);
+        if (!s->buf)
+            return AVERROR(ENOMEM);
+        memcpy(s->buf, avpkt->data, buf_size);
+        buf = s->buf;
+    } else {
+        buf = avpkt->data;
+    }
+
+    ret = init_get_bits(&s->gb, buf, 8 * buf_size);
     if (ret < 0)
         return ret;
 
-    s->mb_x = s->mb_y = s->mb_xy = 0;
-
     if (svq3_decode_slice_header(avctx))
         return -1;
 
+    s->pict_type = s->slice_type;
+
     if (s->pict_type != AV_PICTURE_TYPE_B)
         FFSWAP(SVQ3Frame*, s->next_pic, s->last_pic);
 
@@ -1423,6 +1464,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
     if (s->pict_type != AV_PICTURE_TYPE_I) {
         if (!s->last_pic->f->data[0]) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            av_frame_unref(s->last_pic->f);
             ret = get_buffer(avctx, s->last_pic);
             if (ret < 0)
                 return ret;
@@ -1435,6 +1477,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
 
         if (s->pict_type == AV_PICTURE_TYPE_B && !s->next_pic->f->data[0]) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            av_frame_unref(s->next_pic->f);
             ret = get_buffer(avctx, s->next_pic);
             if (ret < 0)
                 return ret;
@@ -1500,17 +1543,20 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
             unsigned mb_type;
             s->mb_xy = s->mb_x + s->mb_y * s->mb_stride;
 
-            if ((bitstream_bits_left(&s->bc_slice)) <= 7) {
-                if (((bitstream_tell(&s->bc_slice) & 7) == 0 ||
-                    bitstream_peek(&s->bc_slice, bitstream_bits_left(&s->bc_slice) & 7) == 0)) {
+            if ((get_bits_left(&s->gb_slice)) <= 7) {
+                if (((get_bits_count(&s->gb_slice) & 7) == 0 ||
+                    show_bits(&s->gb_slice, get_bits_left(&s->gb_slice) & 7) == 0)) {
 
                     if (svq3_decode_slice_header(avctx))
                         return -1;
                 }
+                if (s->slice_type != s->pict_type) {
+                    avpriv_request_sample(avctx, "non constant slice type");
+                }
                 /* TODO: support s->mb_skip_run */
             }
 
-            mb_type = get_interleaved_ue_golomb(&s->bc_slice);
+            mb_type = get_interleaved_ue_golomb(&s->gb_slice);
 
             if (s->pict_type == AV_PICTURE_TYPE_I)
                 mb_type += 8;
@@ -1522,7 +1568,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                 return -1;
             }
 
-            if (mb_type != 0)
+            if (mb_type != 0 || s->cbp)
                 hl_decode_mb(s);
 
             if (s->pict_type != AV_PICTURE_TYPE_B && !s->low_delay)
@@ -1536,6 +1582,18 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                            s->low_delay);
     }
 
+    left = buf_size*8 - get_bits_count(&s->gb_slice);
+
+    if (s->mb_y != s->mb_height || s->mb_x != s->mb_width) {
+        av_log(avctx, AV_LOG_INFO, "frame num %d incomplete pic x %d y %d left %d\n", avctx->frame_number, s->mb_y, s->mb_x, left);
+        //av_hex_dump(stderr, buf+buf_size-8, 8);
+    }
+
+    if (left < 0) {
+        av_log(avctx, AV_LOG_ERROR, "frame num %d left %d\n", avctx->frame_number, left);
+        return -1;
+    }
+
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay)
         ret = av_frame_ref(data, s->cur_pic->f);
     else if (s->last_pic->f->data[0])
@@ -1574,6 +1632,10 @@ static av_cold int svq3_decode_end(AVCodecContext *avctx)
     av_freep(&s->edge_emu_buffer);
     av_freep(&s->mb2br_xy);
 
+
+    av_freep(&s->buf);
+    s->buf_size = 0;
+
     return 0;
 }
 
diff --git a/libavcodec/synth_filter.c b/libavcodec/synth_filter.c
index 708bd4e..1c5dab5 100644
--- a/libavcodec/synth_filter.c
+++ b/libavcodec/synth_filter.c
@@ -1,64 +1,179 @@
 /*
  * copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "fft.h"
+#include "dcadct.h"
+#include "dcamath.h"
 #include "synth_filter.h"
 
 static void synth_filter_float(FFTContext *imdct,
-                           float *synth_buf_ptr, int *synth_buf_offset,
-                           float synth_buf2[32], const float window[512],
-                           float out[32], const float in[32], float scale)
+                               float *synth_buf_ptr, int *synth_buf_offset,
+                               float synth_buf2[32], const float window[512],
+                               float out[32], const float in[32], float scale)
 {
-    float *synth_buf= synth_buf_ptr + *synth_buf_offset;
+    float *synth_buf = synth_buf_ptr + *synth_buf_offset;
     int i, j;
 
     imdct->imdct_half(imdct, synth_buf, in);
 
-    for (i = 0; i < 16; i++){
-        float a= synth_buf2[i     ];
-        float b= synth_buf2[i + 16];
-        float c= 0;
-        float d= 0;
-        for (j = 0; j < 512 - *synth_buf_offset; j += 64){
-            a += window[i + j     ]*(-synth_buf[15 - i + j      ]);
-            b += window[i + j + 16]*( synth_buf[     i + j      ]);
-            c += window[i + j + 32]*( synth_buf[16 + i + j      ]);
-            d += window[i + j + 48]*( synth_buf[31 - i + j      ]);
+    for (i = 0; i < 16; i++) {
+        float a = synth_buf2[i     ];
+        float b = synth_buf2[i + 16];
+        float c = 0;
+        float d = 0;
+        for (j = 0; j < 512 - *synth_buf_offset; j += 64) {
+            a += window[i + j     ] * (-synth_buf[15 - i + j      ]);
+            b += window[i + j + 16] * ( synth_buf[     i + j      ]);
+            c += window[i + j + 32] * ( synth_buf[16 + i + j      ]);
+            d += window[i + j + 48] * ( synth_buf[31 - i + j      ]);
         }
-        for (     ; j < 512; j += 64){
-            a += window[i + j     ]*(-synth_buf[15 - i + j - 512]);
-            b += window[i + j + 16]*( synth_buf[     i + j - 512]);
-            c += window[i + j + 32]*( synth_buf[16 + i + j - 512]);
-            d += window[i + j + 48]*( synth_buf[31 - i + j - 512]);
+        for (     ; j < 512; j += 64) {
+            a += window[i + j     ] * (-synth_buf[15 - i + j - 512]);
+            b += window[i + j + 16] * ( synth_buf[     i + j - 512]);
+            c += window[i + j + 32] * ( synth_buf[16 + i + j - 512]);
+            d += window[i + j + 48] * ( synth_buf[31 - i + j - 512]);
         }
-        out[i     ] = a*scale;
-        out[i + 16] = b*scale;
+        out[i     ] = a * scale;
+        out[i + 16] = b * scale;
         synth_buf2[i     ] = c;
         synth_buf2[i + 16] = d;
     }
-    *synth_buf_offset= (*synth_buf_offset - 32)&511;
+
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511;
+}
+
+static void synth_filter_float_64(FFTContext *imdct,
+                                  float *synth_buf_ptr, int *synth_buf_offset,
+                                  float synth_buf2[64], const float window[1024],
+                                  float out[64], const float in[64], float scale)
+{
+    float *synth_buf = synth_buf_ptr + *synth_buf_offset;
+    int i, j;
+
+    imdct->imdct_half(imdct, synth_buf, in);
+
+    for (i = 0; i < 32; i++) {
+        float a = synth_buf2[i     ];
+        float b = synth_buf2[i + 32];
+        float c = 0;
+        float d = 0;
+        for (j = 0; j < 1024 - *synth_buf_offset; j += 128) {
+            a += window[i + j     ] * (-synth_buf[31 - i + j       ]);
+            b += window[i + j + 32] * ( synth_buf[     i + j       ]);
+            c += window[i + j + 64] * ( synth_buf[32 + i + j       ]);
+            d += window[i + j + 96] * ( synth_buf[63 - i + j       ]);
+        }
+        for (     ; j < 1024; j += 128) {
+            a += window[i + j     ] * (-synth_buf[31 - i + j - 1024]);
+            b += window[i + j + 32] * ( synth_buf[     i + j - 1024]);
+            c += window[i + j + 64] * ( synth_buf[32 + i + j - 1024]);
+            d += window[i + j + 96] * ( synth_buf[63 - i + j - 1024]);
+        }
+        out[i     ] = a * scale;
+        out[i + 32] = b * scale;
+        synth_buf2[i     ] = c;
+        synth_buf2[i + 32] = d;
+    }
+
+    *synth_buf_offset = (*synth_buf_offset - 64) & 1023;
+}
+
+static void synth_filter_fixed(DCADCTContext *imdct,
+                               int32_t *synth_buf_ptr, int *synth_buf_offset,
+                               int32_t synth_buf2[32], const int32_t window[512],
+                               int32_t out[32], const int32_t in[32])
+{
+    int32_t *synth_buf = synth_buf_ptr + *synth_buf_offset;
+    int i, j;
+
+    imdct->imdct_half[0](synth_buf, in);
+
+    for (i = 0; i < 16; i++) {
+        int64_t a = synth_buf2[i     ] * (INT64_C(1) << 21);
+        int64_t b = synth_buf2[i + 16] * (INT64_C(1) << 21);
+        int64_t c = 0;
+        int64_t d = 0;
+        for (j = 0; j < 512 - *synth_buf_offset; j += 64) {
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j      ];
+            b += (int64_t)window[i + j + 16] * synth_buf[15 - i + j      ];
+            c += (int64_t)window[i + j + 32] * synth_buf[16 + i + j      ];
+            d += (int64_t)window[i + j + 48] * synth_buf[31 - i + j      ];
+        }
+        for (     ; j < 512; j += 64) {
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j - 512];
+            b += (int64_t)window[i + j + 16] * synth_buf[15 - i + j - 512];
+            c += (int64_t)window[i + j + 32] * synth_buf[16 + i + j - 512];
+            d += (int64_t)window[i + j + 48] * synth_buf[31 - i + j - 512];
+        }
+        out[i     ] = clip23(norm21(a));
+        out[i + 16] = clip23(norm21(b));
+        synth_buf2[i     ] = norm21(c);
+        synth_buf2[i + 16] = norm21(d);
+    }
+
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511;
+}
+
+static void synth_filter_fixed_64(DCADCTContext *imdct,
+                                  int32_t *synth_buf_ptr, int *synth_buf_offset,
+                                  int32_t synth_buf2[64], const int32_t window[1024],
+                                  int32_t out[64], const int32_t in[64])
+{
+    int32_t *synth_buf = synth_buf_ptr + *synth_buf_offset;
+    int i, j;
+
+    imdct->imdct_half[1](synth_buf, in);
+
+    for (i = 0; i < 32; i++) {
+        int64_t a = synth_buf2[i     ] * (INT64_C(1) << 20);
+        int64_t b = synth_buf2[i + 32] * (INT64_C(1) << 20);
+        int64_t c = 0;
+        int64_t d = 0;
+        for (j = 0; j < 1024 - *synth_buf_offset; j += 128) {
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j       ];
+            b += (int64_t)window[i + j + 32] * synth_buf[31 - i + j       ];
+            c += (int64_t)window[i + j + 64] * synth_buf[32 + i + j       ];
+            d += (int64_t)window[i + j + 96] * synth_buf[63 - i + j       ];
+        }
+        for (     ; j < 1024; j += 128) {
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j - 1024];
+            b += (int64_t)window[i + j + 32] * synth_buf[31 - i + j - 1024];
+            c += (int64_t)window[i + j + 64] * synth_buf[32 + i + j - 1024];
+            d += (int64_t)window[i + j + 96] * synth_buf[63 - i + j - 1024];
+        }
+        out[i     ] = clip23(norm20(a));
+        out[i + 32] = clip23(norm20(b));
+        synth_buf2[i     ] = norm20(c);
+        synth_buf2[i + 32] = norm20(d);
+    }
+
+    *synth_buf_offset = (*synth_buf_offset - 64) & 1023;
 }
 
 av_cold void ff_synth_filter_init(SynthFilterContext *c)
 {
-    c->synth_filter_float = synth_filter_float;
+    c->synth_filter_float    = synth_filter_float;
+    c->synth_filter_float_64 = synth_filter_float_64;
+    c->synth_filter_fixed    = synth_filter_fixed;
+    c->synth_filter_fixed_64 = synth_filter_fixed_64;
 
     if (ARCH_AARCH64)
         ff_synth_filter_init_aarch64(c);
diff --git a/libavcodec/synth_filter.h b/libavcodec/synth_filter.h
index a93dc4f..df3589a 100644
--- a/libavcodec/synth_filter.h
+++ b/libavcodec/synth_filter.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 #define AVCODEC_SYNTH_FILTER_H
 
 #include "fft.h"
+#include "dcadct.h"
 
 typedef struct SynthFilterContext {
     void (*synth_filter_float)(FFTContext *imdct,
@@ -29,6 +30,18 @@ typedef struct SynthFilterContext {
                                float synth_buf2[32], const float window[512],
                                float out[32], const float in[32],
                                float scale);
+    void (*synth_filter_float_64)(FFTContext *imdct,
+                                  float *synth_buf_ptr, int *synth_buf_offset,
+                                  float synth_buf2[64], const float window[1024],
+                                  float out[64], const float in[64], float scale);
+    void (*synth_filter_fixed)(DCADCTContext *imdct,
+                               int32_t *synth_buf_ptr, int *synth_buf_offset,
+                               int32_t synth_buf2[32], const int32_t window[512],
+                               int32_t out[32], const int32_t in[32]);
+    void (*synth_filter_fixed_64)(DCADCTContext *imdct,
+                                  int32_t *synth_buf_ptr, int *synth_buf_offset,
+                                  int32_t synth_buf2[64], const int32_t window[1024],
+                                  int32_t out[64], const int32_t in[64]);
 } SynthFilterContext;
 
 void ff_synth_filter_init(SynthFilterContext *c);
diff --git a/libavcodec/tableprint.h b/libavcodec/tableprint.h
index daa89fe..6f61c71 100644
--- a/libavcodec/tableprint.h
+++ b/libavcodec/tableprint.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,6 +64,7 @@ void write_int8_t_array     (const int8_t   *, int);
 void write_uint8_t_array    (const uint8_t  *, int);
 void write_uint16_t_array   (const uint16_t *, int);
 void write_uint32_t_array   (const uint32_t *, int);
+void write_int32_t_array    (const int32_t  *, int);
 void write_float_array      (const float    *, int);
 void write_int8_t_2d_array  (const void *, int, int);
 void write_uint8_t_2d_array (const void *, int, int);
@@ -81,6 +82,16 @@ void write_float_2d_array   (const void *, int, int);
 #define FMT "zu"
 #endif
 
+#define WRITE_ARRAY_ALIGNED(prefix, align, type, name)  \
+    do {                                                \
+        const size_t array_size = FF_ARRAY_ELEMS(name); \
+        printf(prefix" DECLARE_ALIGNED("#align", "      \
+               #type", "#name")[%"FMT"] = {\n",         \
+               array_size);                             \
+        write_##type##_array(name, array_size);         \
+        printf("};\n");                                 \
+    } while(0)
+
 #define WRITE_ARRAY(prefix, type, name)                 \
     do {                                                \
         const size_t array_size = FF_ARRAY_ELEMS(name); \
@@ -104,7 +115,9 @@ void write_float_2d_array   (const void *, int, int);
 WRITE_1D_FUNC(int8_t,   "%3"PRIi8, 15)
 WRITE_1D_FUNC(uint8_t,  "0x%02"PRIx8, 15)
 WRITE_1D_FUNC(uint16_t, "0x%08"PRIx16, 7)
+WRITE_1D_FUNC(int16_t,  "%5"PRIi16, 7)
 WRITE_1D_FUNC(uint32_t, "0x%08"PRIx32, 7)
+WRITE_1D_FUNC(int32_t,  "0x%08"PRIx32, 7)
 WRITE_1D_FUNC(float,    "%.18e", 3)
 
 WRITE_2D_FUNC(int8_t)
diff --git a/libavcodec/tableprint_vlc.h b/libavcodec/tableprint_vlc.h
new file mode 100644
index 0000000..3004be3
--- /dev/null
+++ b/libavcodec/tableprint_vlc.h
@@ -0,0 +1,83 @@
+/*
+ * Helpers for generating hard-coded VLC tables
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TABLEPRINT_VLC_H
+#define AVCODEC_TABLEPRINT_VLC_H
+
+#define FFMPEG_CONFIG_H
+#define AVUTIL_LOG_H
+#define av_log(a, ...) while(0)
+#define ff_dlog(a, ...) while(0)
+#define AVUTIL_MEM_H
+#define av_malloc(s) NULL
+#define av_malloc_array(a, b) NULL
+#define av_realloc_f(p, o, n) NULL
+#define av_free(p) while(0)
+#define av_freep(p) while(0)
+#define AVCODEC_AVCODEC_H
+#define AVCODEC_INTERNAL_H
+#define AV_INPUT_BUFFER_PADDING_SIZE 64 // the value does not matter for this
+#include "tableprint.h"
+#include "get_bits.h"
+#include "mathtables.c"
+#include "libavutil/reverse.c"
+#include "bitstream.c"
+
+#define REPLACE_DEFINE2(type) write_##type##_array
+#define REPLACE_DEFINE(type) REPLACE_DEFINE2(type)
+static void write_VLC_TYPE_array(const VLC_TYPE *p, int s) {
+    REPLACE_DEFINE(VLC_TYPE)(p, s);
+}
+
+WRITE_2D_FUNC(VLC_TYPE)
+
+static void write_vlc_type(const VLC *vlc, VLC_TYPE (*base_table)[2], const char *base_table_name)
+{
+    printf("    .bits = %i,\n", vlc->bits);
+    // Unfortunately need to cast away const currently
+    printf("    .table = (VLC_TYPE (*)[2])(%s + 0x%x),\n", base_table_name, (int)(vlc->table - base_table));
+    printf("    .table_size = 0x%x,\n", vlc->table_size);
+    printf("    .table_allocated = 0x%x,\n", vlc->table_allocated);
+}
+
+#define WRITE_VLC_TYPE(prefix, name, base_table)        \
+    do {                                                \
+        printf(prefix" VLC "#name" = {\n");             \
+        write_vlc_type(&name, base_table, #base_table); \
+        printf("};\n");                                 \
+    } while(0)
+
+#define WRITE_VLC_ARRAY(prefix, name, base_table)       \
+    do {                                                \
+        int i;                                          \
+        const size_t array_size = FF_ARRAY_ELEMS(name); \
+        printf(prefix" VLC "#name"[%"FMT"] = {{\n",     \
+               array_size);                             \
+        for (i = 0; i < array_size; i++) {              \
+            write_vlc_type(name + i,                    \
+                           base_table, #base_table);    \
+            if (i != array_size - 1) printf("}, {\n");  \
+        }                                               \
+        printf("}};\n");                                \
+    } while(0)
+
+#endif /* AVCODEC_TABLEPRINT_VLC_H */
diff --git a/libavcodec/tak.c b/libavcodec/tak.c
index c90e55a..8aa956b 100644
--- a/libavcodec/tak.c
+++ b/libavcodec/tak.c
@@ -2,31 +2,51 @@
  * TAK common code
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/bswap.h"
 #include "libavutil/crc.h"
 #include "libavutil/intreadwrite.h"
 
 #define BITSTREAM_READER_LE
-#include "bitstream.h"
 #include "tak.h"
 
+static const int64_t tak_channel_layouts[] = {
+    0,
+    AV_CH_FRONT_LEFT,
+    AV_CH_FRONT_RIGHT,
+    AV_CH_FRONT_CENTER,
+    AV_CH_LOW_FREQUENCY,
+    AV_CH_BACK_LEFT,
+    AV_CH_BACK_RIGHT,
+    AV_CH_FRONT_LEFT_OF_CENTER,
+    AV_CH_FRONT_RIGHT_OF_CENTER,
+    AV_CH_BACK_CENTER,
+    AV_CH_SIDE_LEFT,
+    AV_CH_SIDE_RIGHT,
+    AV_CH_TOP_CENTER,
+    AV_CH_TOP_FRONT_LEFT,
+    AV_CH_TOP_FRONT_CENTER,
+    AV_CH_TOP_FRONT_RIGHT,
+    AV_CH_TOP_BACK_LEFT,
+    AV_CH_TOP_BACK_CENTER,
+    AV_CH_TOP_BACK_RIGHT,
+};
+
 static const uint16_t frame_duration_type_quants[] = {
     3, 4, 6, 8, 4096, 8192, 16384, 512, 1024, 2048,
 };
@@ -54,22 +74,6 @@ static int tak_get_nb_samples(int sample_rate, enum TAKFrameSizeType type)
     return nb_samples;
 }
 
-static int crc_init = 0;
-#if CONFIG_SMALL
-#define CRC_TABLE_SIZE 257
-#else
-#define CRC_TABLE_SIZE 1024
-#endif
-static AVCRC crc_24[CRC_TABLE_SIZE];
-
-av_cold void ff_tak_init_crc(void)
-{
-    if (!crc_init) {
-        av_crc_init(crc_24, 0, 24, 0x864CFBU, sizeof(crc_24));
-        crc_init = 1;
-    }
-}
-
 int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size)
 {
     uint32_t crc, CRC;
@@ -78,41 +82,41 @@ int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size)
         return AVERROR_INVALIDDATA;
     buf_size -= 3;
 
-    CRC = av_bswap32(AV_RL24(buf + buf_size)) >> 8;
-    crc = av_crc(crc_24, 0xCE04B7U, buf, buf_size);
+    CRC = AV_RB24(buf + buf_size);
+    crc = av_crc(av_crc_get_table(AV_CRC_24_IEEE), 0xCE04B7U, buf, buf_size);
     if (CRC != crc)
         return AVERROR_INVALIDDATA;
 
     return 0;
 }
 
-void avpriv_tak_parse_streaminfo(BitstreamContext *bc, TAKStreamInfo *s)
+void ff_tak_parse_streaminfo(TAKStreamInfo *s, GetBitContext *gb)
 {
     uint64_t channel_mask = 0;
     int frame_type, i;
 
-    s->codec = bitstream_read(bc, TAK_ENCODER_CODEC_BITS);
-    bitstream_skip(bc, TAK_ENCODER_PROFILE_BITS);
+    s->codec = get_bits(gb, TAK_ENCODER_CODEC_BITS);
+    skip_bits(gb, TAK_ENCODER_PROFILE_BITS);
 
-    frame_type = bitstream_read(bc, TAK_SIZE_FRAME_DURATION_BITS);
-    s->samples = bitstream_read_63(bc, TAK_SIZE_SAMPLES_NUM_BITS);
+    frame_type = get_bits(gb, TAK_SIZE_FRAME_DURATION_BITS);
+    s->samples = get_bits64(gb, TAK_SIZE_SAMPLES_NUM_BITS);
 
-    s->data_type   = bitstream_read(bc, TAK_FORMAT_DATA_TYPE_BITS);
-    s->sample_rate = bitstream_read(bc, TAK_FORMAT_SAMPLE_RATE_BITS) +
+    s->data_type   = get_bits(gb, TAK_FORMAT_DATA_TYPE_BITS);
+    s->sample_rate = get_bits(gb, TAK_FORMAT_SAMPLE_RATE_BITS) +
                      TAK_SAMPLE_RATE_MIN;
-    s->bps         = bitstream_read(bc, TAK_FORMAT_BPS_BITS) +
+    s->bps         = get_bits(gb, TAK_FORMAT_BPS_BITS) +
                      TAK_BPS_MIN;
-    s->channels    = bitstream_read(bc, TAK_FORMAT_CHANNEL_BITS) +
+    s->channels    = get_bits(gb, TAK_FORMAT_CHANNEL_BITS) +
                      TAK_CHANNELS_MIN;
 
-    if (bitstream_read_bit(bc)) {
-        bitstream_skip(bc, TAK_FORMAT_VALID_BITS);
-        if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
+        skip_bits(gb, TAK_FORMAT_VALID_BITS);
+        if (get_bits1(gb)) {
             for (i = 0; i < s->channels; i++) {
-                int value = bitstream_read(bc, TAK_FORMAT_CH_LAYOUT_BITS);
+                int value = get_bits(gb, TAK_FORMAT_CH_LAYOUT_BITS);
 
-                if (value > 0 && value <= 18)
-                    channel_mask |= 1 << (value - 1);
+                if (value < FF_ARRAY_ELEMS(tak_channel_layouts))
+                    channel_mask |= tak_channel_layouts[value];
             }
         }
     }
@@ -121,33 +125,49 @@ void avpriv_tak_parse_streaminfo(BitstreamContext *bc, TAKStreamInfo *s)
     s->frame_samples = tak_get_nb_samples(s->sample_rate, frame_type);
 }
 
-int ff_tak_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
+int avpriv_tak_parse_streaminfo(TAKStreamInfo *s, const uint8_t *buf, int size)
+{
+    GetBitContext gb;
+    int ret = init_get_bits8(&gb, buf, size);
+
+    if (ret < 0)
+        return AVERROR_INVALIDDATA;
+
+    ff_tak_parse_streaminfo(s, &gb);
+
+    return 0;
+}
+
+int ff_tak_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
                                TAKStreamInfo *ti, int log_level_offset)
 {
-    if (bitstream_read(bc, TAK_FRAME_HEADER_SYNC_ID_BITS) != TAK_FRAME_HEADER_SYNC_ID) {
+    if (get_bits(gb, TAK_FRAME_HEADER_SYNC_ID_BITS) != TAK_FRAME_HEADER_SYNC_ID) {
         av_log(avctx, AV_LOG_ERROR + log_level_offset, "missing sync id\n");
         return AVERROR_INVALIDDATA;
     }
 
-    ti->flags     = bitstream_read(bc, TAK_FRAME_HEADER_FLAGS_BITS);
-    ti->frame_num = bitstream_read(bc, TAK_FRAME_HEADER_NO_BITS);
+    ti->flags     = get_bits(gb, TAK_FRAME_HEADER_FLAGS_BITS);
+    ti->frame_num = get_bits(gb, TAK_FRAME_HEADER_NO_BITS);
 
     if (ti->flags & TAK_FRAME_FLAG_IS_LAST) {
-        ti->last_frame_samples = bitstream_read(bc, TAK_FRAME_HEADER_SAMPLE_COUNT_BITS) + 1;
-        bitstream_skip(bc, 2);
+        ti->last_frame_samples = get_bits(gb, TAK_FRAME_HEADER_SAMPLE_COUNT_BITS) + 1;
+        skip_bits(gb, 2);
     } else {
         ti->last_frame_samples = 0;
     }
 
     if (ti->flags & TAK_FRAME_FLAG_HAS_INFO) {
-        avpriv_tak_parse_streaminfo(bc, ti);
+        ff_tak_parse_streaminfo(ti, gb);
 
-        if (bitstream_read(bc, 6))
-            bitstream_skip(bc, 25);
-        bitstream_align(bc);
+        if (get_bits(gb, 6))
+            skip_bits(gb, 25);
+        align_get_bits(gb);
     }
 
-    bitstream_skip(bc, 24);
+    if (ti->flags & TAK_FRAME_FLAG_HAS_METADATA)
+        return AVERROR_INVALIDDATA;
+
+    skip_bits(gb, 24);
 
     return 0;
 }
diff --git a/libavcodec/tak.h b/libavcodec/tak.h
index c752062..dc45a8c 100644
--- a/libavcodec/tak.h
+++ b/libavcodec/tak.h
@@ -2,20 +2,20 @@
  * TAK decoder/demuxer common code
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include <stdint.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 
 #define TAK_FORMAT_DATA_TYPE_BITS               3
 #define TAK_FORMAT_SAMPLE_RATE_BITS            18
@@ -98,7 +98,7 @@
 
 enum TAKCodecType {
     TAK_CODEC_MONO_STEREO  = 2,
-    TAK_CODEC_MULTICHANNEL = 4
+    TAK_CODEC_MULTICHANNEL = 4,
 };
 
 enum TAKMetaDataType {
@@ -139,27 +139,28 @@ typedef struct TAKStreamInfo {
     int64_t           samples;
 } TAKStreamInfo;
 
-void ff_tak_init_crc(void);
-
 int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size);
 
 /**
  * Parse the Streaminfo metadata block.
- * @param[in]  bc pointer to BitstreamContext
  * @param[out] s  storage for parsed information
+ * @param[in]  buf   input buffer
+ * @param[in]  size  size of input buffer in bytes
+ * @return non-zero on error, 0 if OK
  */
-void avpriv_tak_parse_streaminfo(BitstreamContext *bc, TAKStreamInfo *s);
+int avpriv_tak_parse_streaminfo(TAKStreamInfo *s, const uint8_t *buf, int size);
+
+void ff_tak_parse_streaminfo(TAKStreamInfo *s, GetBitContext *gb);
 
 /**
  * Validate and decode a frame header.
  * @param      avctx             AVCodecContext to use as av_log() context
- * @param[in]  bc                BitstreamContext from which to read frame header
+ * @param[in]  gb                GetBitContext from which to read frame header
  * @param[out] s                 frame information
  * @param      log_level_offset  log level offset, can be used to silence
  *                               error messages.
  * @return non-zero on error, 0 if OK
  */
-int ff_tak_decode_frame_header(AVCodecContext *avctx, BitstreamContext *bc,
+int ff_tak_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
                                TAKStreamInfo *s, int log_level_offset);
-
 #endif /* AVCODEC_TAK_H */
diff --git a/libavcodec/tak_parser.c b/libavcodec/tak_parser.c
index 82fb7e8..835a47b 100644
--- a/libavcodec/tak_parser.c
+++ b/libavcodec/tak_parser.c
@@ -2,20 +2,20 @@
  * TAK parser
  * Copyright (c) 2012 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,6 @@
  **/
 
 #define BITSTREAM_READER_LE
-#include "bitstream.h"
 #include "parser.h"
 #include "tak.h"
 
@@ -35,12 +34,6 @@ typedef struct TAKParseContext {
     int           index;
 } TAKParseContext;
 
-static av_cold int tak_init(AVCodecParserContext *s)
-{
-    ff_tak_init_crc();
-    return 0;
-}
-
 static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                      const uint8_t **poutbuf, int *poutbuf_size,
                      const uint8_t *buf, int buf_size)
@@ -48,14 +41,16 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     TAKParseContext *t = s->priv_data;
     ParseContext *pc   = &t->pc;
     int next           = END_NOT_FOUND;
-    BitstreamContext bc;
+    GetBitContext gb;
     int consumed = 0;
     int needed   = buf_size ? TAK_MAX_FRAME_HEADER_BYTES : 8;
+    int ret;
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         TAKStreamInfo ti;
-        bitstream_init(&bc, buf, buf_size);
-        if (!ff_tak_decode_frame_header(avctx, &bc, &ti, 127))
+        if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+            return ret;
+        if (!ff_tak_decode_frame_header(avctx, &gb, &ti, 127))
             s->duration = t->ti.last_frame_samples ? t->ti.last_frame_samples
                                                    : t->ti.frame_samples;
         *poutbuf      = buf;
@@ -65,34 +60,35 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
     while (buf_size || t->index + needed <= pc->index) {
         if (buf_size && t->index + TAK_MAX_FRAME_HEADER_BYTES > pc->index) {
-            int tmp_buf_size       = FFMIN(2 * TAK_MAX_FRAME_HEADER_BYTES,
+            int tmp_buf_size       = FFMIN(TAK_MAX_FRAME_HEADER_BYTES,
                                            buf_size);
             const uint8_t *tmp_buf = buf;
 
-            ff_combine_frame(pc, END_NOT_FOUND, &tmp_buf, &tmp_buf_size);
+            if (ff_combine_frame(pc, END_NOT_FOUND, &tmp_buf, &tmp_buf_size) != -1)
+                return AVERROR(ENOMEM);
             consumed += tmp_buf_size;
             buf      += tmp_buf_size;
             buf_size -= tmp_buf_size;
         }
 
-        for (; t->index + needed <= pc->index; t->index++)
-            if (pc->buffer[t->index]     == 0xFF &&
-                pc->buffer[t->index + 1] == 0xA0) {
+        for (; t->index + needed <= pc->index; t->index++) {
+            if (pc->buffer[ t->index     ] == 0xFF &&
+                pc->buffer[ t->index + 1 ] == 0xA0) {
                 TAKStreamInfo ti;
 
-                bitstream_init8(&bc, pc->buffer + t->index,
-                                pc->index - t->index);
-                if (!ff_tak_decode_frame_header(avctx, &bc,
-                                                pc->frame_start_found ? &ti
-                                                                      : &t->ti,
-                                                127) &&
+                if ((ret = init_get_bits8(&gb, pc->buffer + t->index,
+                                          pc->index - t->index)) < 0)
+                    return ret;
+                if (!ff_tak_decode_frame_header(avctx, &gb,
+                        pc->frame_start_found ? &ti : &t->ti, 127) &&
                     !ff_tak_check_crc(pc->buffer + t->index,
-                                      bitstream_tell(&bc) / 8)) {
+                                      get_bits_count(&gb) / 8)) {
                     if (!pc->frame_start_found) {
                         pc->frame_start_found = 1;
                         s->duration           = t->ti.last_frame_samples ?
                                                 t->ti.last_frame_samples :
                                                 t->ti.frame_samples;
+                        s->key_frame          = !!(t->ti.flags & TAK_FRAME_FLAG_HAS_INFO);
                     } else {
                         pc->frame_start_found = 0;
                         next                  = t->index - pc->index;
@@ -101,9 +97,10 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                     }
                 }
             }
+        }
     }
-
 found:
+
     if (consumed && !buf_size && next == END_NOT_FOUND ||
         ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
         *poutbuf      = NULL;
@@ -124,7 +121,6 @@ found:
 AVCodecParser ff_tak_parser = {
     .codec_ids      = { AV_CODEC_ID_TAK },
     .priv_data_size = sizeof(TAKParseContext),
-    .parser_init    = tak_init,
     .parser_parse   = tak_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/takdec.c b/libavcodec/takdec.c
index 11c04f4..0439a3a 100644
--- a/libavcodec/takdec.c
+++ b/libavcodec/takdec.c
@@ -2,20 +2,20 @@
  * TAK decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,44 +30,50 @@
 
 #define BITSTREAM_READER_LE
 #include "audiodsp.h"
+#include "thread.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
 #include "unary.h"
 #include "tak.h"
+#include "takdsp.h"
 
-#define MAX_SUBFRAMES     8                         // max number of subframes per channel
+#define MAX_SUBFRAMES     8                         ///< max number of subframes per channel
 #define MAX_PREDICTORS  256
 
 typedef struct MCDParam {
-    int8_t present;                                 // decorrelation parameter availability for this channel
-    int8_t index;                                   // index into array of decorrelation types
+    int8_t present;                                 ///< decorrelation parameter availability for this channel
+    int8_t index;                                   ///< index into array of decorrelation types
     int8_t chan1;
     int8_t chan2;
 } MCDParam;
 
 typedef struct TAKDecContext {
-    AVCodecContext *avctx;                          // parent AVCodecContext
+    AVCodecContext *avctx;                          ///< parent AVCodecContext
     AudioDSPContext adsp;
+    TAKDSPContext   tdsp;
     TAKStreamInfo   ti;
-    BitstreamContext bc;                            // bitstream reader initialized to start at the current frame
+    GetBitContext   gb;                             ///< bitstream reader initialized to start at the current frame
 
     int             uval;
-    int             nb_samples;                     // number of samples in the current frame
+    int             nb_samples;                     ///< number of samples in the current frame
     uint8_t        *decode_buffer;
     unsigned int    decode_buffer_size;
-    int32_t        *decoded[TAK_MAX_CHANNELS];      // decoded samples for each channel
+    int32_t        *decoded[TAK_MAX_CHANNELS];      ///< decoded samples for each channel
 
     int8_t          lpc_mode[TAK_MAX_CHANNELS];
-    int8_t          sample_shift[TAK_MAX_CHANNELS]; // shift applied to every sample in the channel
+    int8_t          sample_shift[TAK_MAX_CHANNELS]; ///< shift applied to every sample in the channel
+    int16_t         predictors[MAX_PREDICTORS];
+    int             nb_subframes;                   ///< number of subframes in the current frame
+    int16_t         subframe_len[MAX_SUBFRAMES];    ///< subframe length in samples
     int             subframe_scale;
 
-    int8_t          dmode;                          // channel decorrelation type in the current frame
+    int8_t          dmode;                          ///< channel decorrelation type in the current frame
 
-    MCDParam        mcdparams[TAK_MAX_CHANNELS];    // multichannel decorrelation parameters
+    MCDParam        mcdparams[TAK_MAX_CHANNELS];    ///< multichannel decorrelation parameters
 
-    int16_t        *residues;
-    unsigned int    residues_buf_size;
+    int8_t          coding_mode[128];
+    DECLARE_ALIGNED(16, int16_t, filter)[MAX_PREDICTORS];
+    DECLARE_ALIGNED(16, int16_t, residues)[544];
 } TAKDecContext;
 
 static const int8_t mc_dmodes[] = { 1, 3, 4, 6, };
@@ -135,14 +141,9 @@ static const struct CParam {
     { 0x1A, 0x1800000, 0x1800000, 0x6800000, 0xC000000 },
 };
 
-static av_cold void tak_init_static_data(AVCodec *codec)
-{
-    ff_tak_init_crc();
-}
-
 static int set_bps_params(AVCodecContext *avctx)
 {
-    switch (avctx->bits_per_coded_sample) {
+    switch (avctx->bits_per_raw_sample) {
     case 8:
         avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
         break;
@@ -153,11 +154,10 @@ static int set_bps_params(AVCodecContext *avctx)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
         break;
     default:
-        av_log(avctx, AV_LOG_ERROR, "unsupported bits per sample: %d\n",
-               avctx->bits_per_coded_sample);
+        av_log(avctx, AV_LOG_ERROR, "invalid/unsupported bits per sample: %d\n",
+               avctx->bits_per_raw_sample);
         return AVERROR_INVALIDDATA;
     }
-    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     return 0;
 }
@@ -165,8 +165,17 @@ static int set_bps_params(AVCodecContext *avctx)
 static void set_sample_rate_params(AVCodecContext *avctx)
 {
     TAKDecContext *s  = avctx->priv_data;
-    int shift         = 3 - (avctx->sample_rate / 11025);
-    shift             = FFMAX(0, shift);
+    int shift;
+
+    if (avctx->sample_rate < 11025) {
+        shift = 3;
+    } else if (avctx->sample_rate < 22050) {
+        shift = 2;
+    } else if (avctx->sample_rate < 44100) {
+        shift = 1;
+    } else {
+        shift = 0;
+    }
     s->uval           = FFALIGN(avctx->sample_rate + 511 >> 9, 4) << shift;
     s->subframe_scale = FFALIGN(avctx->sample_rate + 511 >> 9, 4) << 1;
 }
@@ -176,8 +185,10 @@ static av_cold int tak_decode_init(AVCodecContext *avctx)
     TAKDecContext *s = avctx->priv_data;
 
     ff_audiodsp_init(&s->adsp);
+    ff_takdsp_init(&s->tdsp);
 
     s->avctx = avctx;
+    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     set_sample_rate_params(avctx);
 
@@ -192,24 +203,24 @@ static void decode_lpc(int32_t *coeffs, int mode, int length)
         return;
 
     if (mode == 1) {
-        int a1 = *coeffs++;
+        unsigned a1 = *coeffs++;
         for (i = 0; i < length - 1 >> 1; i++) {
             *coeffs   += a1;
-            coeffs[1] += *coeffs;
+            coeffs[1] += (unsigned)*coeffs;
             a1         = coeffs[1];
             coeffs    += 2;
         }
         if (length - 1 & 1)
             *coeffs += a1;
     } else if (mode == 2) {
-        int a1    = coeffs[1];
-        int a2    = a1 + *coeffs;
+        unsigned a1    = coeffs[1];
+        unsigned a2    = a1 + *coeffs;
         coeffs[1] = a2;
         if (length > 2) {
             coeffs += 2;
             for (i = 0; i < length - 2 >> 1; i++) {
-                int a3    = *coeffs + a1;
-                int a4    = a3 + a2;
+                unsigned a3    = *coeffs + a1;
+                unsigned a4    = a3 + a2;
                 *coeffs   = a4;
                 a1        = coeffs[1] + a3;
                 a2        = a1 + a4;
@@ -220,13 +231,14 @@ static void decode_lpc(int32_t *coeffs, int mode, int length)
                 *coeffs += a1 + a2;
         }
     } else if (mode == 3) {
-        int a1    = coeffs[1];
-        int a2    = a1 + *coeffs;
+        unsigned a1    = coeffs[1];
+        unsigned a2    = a1 + *coeffs;
         coeffs[1] = a2;
         if (length > 2) {
-            int a3  = coeffs[2];
-            int a4  = a3 + a1;
-            int a5  = a4 + a2;
+            unsigned a3  = coeffs[2];
+            unsigned a4  = a3 + a1;
+            unsigned a5  = a4 + a2;
+            coeffs[2] = a5;
             coeffs += 3;
             for (i = 0; i < length - 3; i++) {
                 a3     += *coeffs;
@@ -239,10 +251,10 @@ static void decode_lpc(int32_t *coeffs, int mode, int length)
     }
 }
 
-static int decode_segment(BitstreamContext *bc, int mode, int32_t *decoded,
-                          int len)
+static int decode_segment(TAKDecContext *s, int8_t mode, int32_t *decoded, int len)
 {
     struct CParam code;
+    GetBitContext *gb = &s->gb;
     int i;
 
     if (!mode) {
@@ -255,20 +267,20 @@ static int decode_segment(BitstreamContext *bc, int mode, int32_t *decoded,
     code = xcodes[mode - 1];
 
     for (i = 0; i < len; i++) {
-        int x = bitstream_read(bc, code.init);
-        if (x >= code.escape && bitstream_read_bit(bc)) {
+        unsigned x = get_bits_long(gb, code.init);
+        if (x >= code.escape && get_bits1(gb)) {
             x |= 1 << code.init;
             if (x >= code.aescape) {
-                int scale = get_unary(bc, 1, 9);
+                unsigned scale = get_unary(gb, 1, 9);
                 if (scale == 9) {
-                    int scale_bits = bitstream_read(bc, 3);
+                    int scale_bits = get_bits(gb, 3);
                     if (scale_bits > 0) {
                         if (scale_bits == 7) {
-                            scale_bits += bitstream_read(bc, 5);
+                            scale_bits += get_bits(gb, 5);
                             if (scale_bits > 29)
                                 return AVERROR_INVALIDDATA;
                         }
-                        scale = bitstream_read(bc, scale_bits) + 1;
+                        scale = get_bits_long(gb, scale_bits) + 1;
                         x    += code.scale * scale;
                     }
                     x += code.bias;
@@ -285,15 +297,14 @@ static int decode_segment(BitstreamContext *bc, int mode, int32_t *decoded,
 
 static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
 {
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *gb = &s->gb;
     int i, mode, ret;
 
     if (length > s->nb_samples)
         return AVERROR_INVALIDDATA;
 
-    if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
         int wlength, rval;
-        int coding_mode[128];
 
         wlength = length / s->uval;
 
@@ -307,21 +318,20 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
         if (wlength <= 1 || wlength > 128)
             return AVERROR_INVALIDDATA;
 
-        coding_mode[0] =
-        mode           = bitstream_read(bc, 6);
+        s->coding_mode[0] = mode = get_bits(gb, 6);
 
         for (i = 1; i < wlength; i++) {
-            int c = get_unary(bc, 1, 6);
+            int c = get_unary(gb, 1, 6);
 
             switch (c) {
             case 6:
-                mode = bitstream_read(bc, 6);
+                mode = get_bits(gb, 6);
                 break;
             case 5:
             case 4:
             case 3: {
                 /* mode += sign ? (1 - c) : (c - 1) */
-                int sign = bitstream_read_bit(bc);
+                int sign = get_bits1(gb);
                 mode    += (-sign ^ (c - 1)) + sign;
                 break;
             }
@@ -332,14 +342,14 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
                 mode--;
                 break;
             }
-            coding_mode[i] = mode;
+            s->coding_mode[i] = mode;
         }
 
         i = 0;
         while (i < wlength) {
             int len = 0;
 
-            mode = coding_mode[i];
+            mode = s->coding_mode[i];
             do {
                 if (i >= wlength - 1)
                     len += rval;
@@ -349,92 +359,43 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
 
                 if (i == wlength)
                     break;
-            } while (coding_mode[i] == mode);
+            } while (s->coding_mode[i] == mode);
 
-            if ((ret = decode_segment(bc, mode, decoded, len)) < 0)
+            if ((ret = decode_segment(s, mode, decoded, len)) < 0)
                 return ret;
             decoded += len;
         }
     } else {
-        mode = bitstream_read(bc, 6);
-        if ((ret = decode_segment(bc, mode, decoded, length)) < 0)
+        mode = get_bits(gb, 6);
+        if ((ret = decode_segment(s, mode, decoded, length)) < 0)
             return ret;
     }
 
     return 0;
 }
 
-static int bits_esc4(BitstreamContext *bc)
+static int get_bits_esc4(GetBitContext *gb)
 {
-    if (bitstream_read_bit(bc))
-        return bitstream_read(bc, 4) + 1;
+    if (get_bits1(gb))
+        return get_bits(gb, 4) + 1;
     else
         return 0;
 }
 
-static void decode_filter_coeffs(TAKDecContext *s, int filter_order, int size,
-                                 int filter_quant, int16_t *filter)
-{
-    BitstreamContext *bc = &s->bc;
-    int i, j, a, b;
-    int filter_tmp[MAX_PREDICTORS];
-    int16_t predictors[MAX_PREDICTORS];
-
-    predictors[0] = bitstream_read_signed(bc, 10);
-    predictors[1] = bitstream_read_signed(bc, 10);
-    predictors[2] = bitstream_read_signed(bc, size) << (10 - size);
-    predictors[3] = bitstream_read_signed(bc, size) << (10 - size);
-    if (filter_order > 4) {
-        int av_uninit(code_size);
-        int code_size_base = size - bitstream_read_bit(bc);
-
-        for (i = 4; i < filter_order; i++) {
-            if (!(i & 3))
-            code_size     = code_size_base - bitstream_read(bc, 2);
-            predictors[i] = bitstream_read_signed(bc, code_size) << (10 - size);
-        }
-    }
-
-    filter_tmp[0] = predictors[0] << 6;
-    for (i = 1; i < filter_order; i++) {
-        int *p1 = &filter_tmp[0];
-        int *p2 = &filter_tmp[i - 1];
-
-        for (j = 0; j < (i + 1) / 2; j++) {
-            int tmp = *p1 + (predictors[i] * *p2 + 256 >> 9);
-            *p2     = *p2 + (predictors[i] * *p1 + 256 >> 9);
-            *p1     = tmp;
-            p1++;
-            p2--;
-        }
-
-        filter_tmp[i] = predictors[i] << 6;
-    }
-
-    a = 1 << (32 - (15 - filter_quant));
-    b = 1 << ((15 - filter_quant) - 1);
-    for (i = 0, j = filter_order - 1; i < filter_order / 2; i++, j--) {
-        filter[j] = a - ((filter_tmp[i] + b) >> (15 - filter_quant));
-        filter[i] = a - ((filter_tmp[j] + b) >> (15 - filter_quant));
-    }
-}
-
 static int decode_subframe(TAKDecContext *s, int32_t *decoded,
                            int subframe_size, int prev_subframe_size)
 {
-    LOCAL_ALIGNED_16(int16_t, filter, [MAX_PREDICTORS]);
-    BitstreamContext *bc = &s->bc;
-    int i, ret;
+    GetBitContext *gb = &s->gb;
+    int x, y, i, j, ret = 0;
     int dshift, size, filter_quant, filter_order;
+    int tfilter[MAX_PREDICTORS];
 
-    memset(filter, 0, MAX_PREDICTORS * sizeof(*filter));
-
-    if (!bitstream_read_bit(bc))
+    if (!get_bits1(gb))
         return decode_residues(s, decoded, subframe_size);
 
-    filter_order = predictor_sizes[bitstream_read(bc, 4)];
+    filter_order = predictor_sizes[get_bits(gb, 4)];
 
-    if (prev_subframe_size > 0 && bitstream_read_bit(bc)) {
+    if (prev_subframe_size > 0 && get_bits1(gb)) {
         if (filter_order > prev_subframe_size)
             return AVERROR_INVALIDDATA;
 
@@ -449,7 +410,7 @@ static int decode_subframe(TAKDecContext *s, int32_t *decoded,
         if (filter_order > subframe_size)
             return AVERROR_INVALIDDATA;
 
-        lpc_mode = bitstream_read(bc, 2);
+        lpc_mode = get_bits(gb, 2);
         if (lpc_mode > 2)
             return AVERROR_INVALIDDATA;
 
@@ -460,40 +421,84 @@ static int decode_subframe(TAKDecContext *s, int32_t *decoded,
             decode_lpc(decoded, lpc_mode, filter_order);
     }
 
-    dshift = bits_esc4(bc);
-    size   = bitstream_read_bit(bc) + 6;
+    dshift = get_bits_esc4(gb);
+    size   = get_bits1(gb) + 6;
 
     filter_quant = 10;
-    if (bitstream_read_bit(bc)) {
-        filter_quant -= bitstream_read(bc, 3) + 1;
+    if (get_bits1(gb)) {
+        filter_quant -= get_bits(gb, 3) + 1;
         if (filter_quant < 3)
             return AVERROR_INVALIDDATA;
     }
 
-    decode_filter_coeffs(s, filter_order, size, filter_quant, filter);
+    s->predictors[0] = get_sbits(gb, 10);
+    s->predictors[1] = get_sbits(gb, 10);
+    s->predictors[2] = get_sbits(gb, size) * (1 << (10 - size));
+    s->predictors[3] = get_sbits(gb, size) * (1 << (10 - size));
+    if (filter_order > 4) {
+        int tmp = size - get_bits1(gb);
+
+        for (i = 4; i < filter_order; i++) {
+            if (!(i & 3))
+                x = tmp - get_bits(gb, 2);
+            s->predictors[i] = get_sbits(gb, x) * (1 << (10 - size));
+        }
+    }
+
+    tfilter[0] = s->predictors[0] * 64;
+    for (i = 1; i < filter_order; i++) {
+        uint32_t *p1 = &tfilter[0];
+        uint32_t *p2 = &tfilter[i - 1];
+
+        for (j = 0; j < (i + 1) / 2; j++) {
+            x     = *p1 + ((int32_t)(s->predictors[i] * *p2 + 256) >> 9);
+            *p2  += (int32_t)(s->predictors[i] * *p1 + 256) >> 9;
+            *p1++ = x;
+            p2--;
+        }
+
+        tfilter[i] = s->predictors[i] * 64;
+    }
+
+    x = 1 << (32 - (15 - filter_quant));
+    y = 1 << ((15 - filter_quant) - 1);
+    for (i = 0, j = filter_order - 1; i < filter_order / 2; i++, j--) {
+        s->filter[j] = x - ((tfilter[i] + y) >> (15 - filter_quant));
+        s->filter[i] = x - ((tfilter[j] + y) >> (15 - filter_quant));
+    }
 
     if ((ret = decode_residues(s, &decoded[filter_order],
                                subframe_size - filter_order)) < 0)
         return ret;
 
-    av_fast_malloc(&s->residues, &s->residues_buf_size,
-                   FFALIGN(subframe_size + 16, 16) * sizeof(*s->residues));
-    if (!s->residues)
-        return AVERROR(ENOMEM);
-    memset(s->residues, 0, s->residues_buf_size);
-
     for (i = 0; i < filter_order; i++)
         s->residues[i] = *decoded++ >> dshift;
 
-    for (i = 0; i < subframe_size - filter_order; i++) {
-        int v = 1 << (filter_quant - 1);
-
-        v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
-                                         FFALIGN(filter_order, 16));
+    y    = FF_ARRAY_ELEMS(s->residues) - filter_order;
+    x    = subframe_size - filter_order;
+    while (x > 0) {
+        int tmp = FFMIN(y, x);
+
+        for (i = 0; i < tmp; i++) {
+            int v = 1 << (filter_quant - 1);
+
+            if (filter_order & -16)
+                v += (unsigned)s->adsp.scalarproduct_int16(&s->residues[i], s->filter,
+                                                 filter_order & -16);
+            for (j = filter_order & -16; j < filter_order; j += 4) {
+                v += s->residues[i + j + 3] * (unsigned)s->filter[j + 3] +
+                     s->residues[i + j + 2] * (unsigned)s->filter[j + 2] +
+                     s->residues[i + j + 1] * (unsigned)s->filter[j + 1] +
+                     s->residues[i + j    ] * (unsigned)s->filter[j    ];
+            }
+            v = (av_clip_intp2(v >> filter_quant, 13) * (1 << dshift)) - (unsigned)*decoded;
+            *decoded++ = v;
+            s->residues[filter_order + i] = v >> dshift;
+        }
 
-        v = (av_clip_intp2(v >> filter_quant, 13) << dshift) - *decoded;
-        *decoded++ = v;
-        s->residues[filter_order + i] = v >> dshift;
+        x -= tmp;
+        if (x > 0)
+            memcpy(s->residues, &s->residues[y], 2 * filter_order);
     }
 
     emms_c();
@@ -504,53 +509,45 @@ static int decode_subframe(TAKDecContext *s, int32_t *decoded,
 static int decode_channel(TAKDecContext *s, int chan)
 {
     AVCodecContext *avctx = s->avctx;
-    BitstreamContext *bc  = &s->bc;
+    GetBitContext *gb     = &s->gb;
     int32_t *decoded      = s->decoded[chan];
     int left              = s->nb_samples - 1;
-    int i, prev, ret, nb_subframes;
-    int subframe_len[MAX_SUBFRAMES];
+    int i = 0, ret, prev = 0;
 
-    s->sample_shift[chan] = bits_esc4(bc);
-    if (s->sample_shift[chan] >= avctx->bits_per_coded_sample)
+    s->sample_shift[chan] = get_bits_esc4(gb);
+    if (s->sample_shift[chan] >= avctx->bits_per_raw_sample)
         return AVERROR_INVALIDDATA;
 
-    /* NOTE: TAK 2.2.0 appears to set the sample value to 0 if
-     *       bits_per_coded_sample - sample_shift is 1, but this produces
-     *       non-bit-exact output. Reading the 1 bit using bitstream_read_signed()
-     *       instead of skipping it produces bit-exact output. This has been
-     *       reported to the TAK author. */
-    *decoded++        = bitstream_read_signed(bc,
-                                              avctx->bits_per_coded_sample -
-                                              s->sample_shift[chan]);
-    s->lpc_mode[chan] = bitstream_read(bc, 2);
-    nb_subframes      = bitstream_read(bc, 3) + 1;
-
-    i = 0;
-    if (nb_subframes > 1) {
-        if (bitstream_bits_left(bc) < (nb_subframes - 1) * 6)
+    *decoded++ = get_sbits(gb, avctx->bits_per_raw_sample - s->sample_shift[chan]);
+    s->lpc_mode[chan] = get_bits(gb, 2);
+    s->nb_subframes   = get_bits(gb, 3) + 1;
+
+    if (s->nb_subframes > 1) {
+        if (get_bits_left(gb) < (s->nb_subframes - 1) * 6)
             return AVERROR_INVALIDDATA;
 
-        prev = 0;
-        for (; i < nb_subframes - 1; i++) {
-            int subframe_end = bitstream_read(bc, 6) * s->subframe_scale;
-            if (subframe_end <= prev)
+        for (; i < s->nb_subframes - 1; i++) {
+            int v = get_bits(gb, 6);
+
+            s->subframe_len[i] = (v - prev) * s->subframe_scale;
+            if (s->subframe_len[i] <= 0)
                 return AVERROR_INVALIDDATA;
-            subframe_len[i] = subframe_end - prev;
-            left           -= subframe_len[i];
-            prev            = subframe_end;
+
+            left -= s->subframe_len[i];
+            prev  = v;
         }
 
         if (left <= 0)
             return AVERROR_INVALIDDATA;
     }
-    subframe_len[i] = left;
+    s->subframe_len[i] = left;
 
     prev = 0;
-    for (i = 0; i < nb_subframes; i++) {
-        if ((ret = decode_subframe(s, decoded, subframe_len[i], prev)) < 0)
+    for (i = 0; i < s->nb_subframes; i++) {
+        if ((ret = decode_subframe(s, decoded, s->subframe_len[i], prev)) < 0)
             return ret;
-        decoded += subframe_len[i];
-        prev     = subframe_len[i];
+        decoded += s->subframe_len[i];
+        prev     = s->subframe_len[i];
     }
 
     return 0;
@@ -558,69 +555,52 @@ static int decode_channel(TAKDecContext *s, int chan)
 
 static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
 {
-    BitstreamContext *bc = &s->bc;
-    int32_t *p1       = s->decoded[c1] + 1;
-    int32_t *p2       = s->decoded[c2] + 1;
+    GetBitContext *gb = &s->gb;
+    int32_t *p1       = s->decoded[c1] + (s->dmode > 5);
+    int32_t *p2       = s->decoded[c2] + (s->dmode > 5);
+    int32_t bp1       = p1[0];
+    int32_t bp2       = p2[0];
     int i;
     int dshift, dfactor;
 
+    length += s->dmode < 6;
+
     switch (s->dmode) {
     case 1: /* left/side */
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            p2[i]     = a + b;
-        }
+        s->tdsp.decorrelate_ls(p1, p2, length);
         break;
     case 2: /* side/right */
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            p1[i]     = b - a;
-        }
+        s->tdsp.decorrelate_sr(p1, p2, length);
         break;
     case 3: /* side/mid */
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            a        -= b >> 1;
-            p1[i]     = a;
-            p2[i]     = a + b;
-        }
+        s->tdsp.decorrelate_sm(p1, p2, length);
         break;
     case 4: /* side/left with scale factor */
         FFSWAP(int32_t*, p1, p2);
+        FFSWAP(int32_t, bp1, bp2);
     case 5: /* side/right with scale factor */
-        dshift  = bits_esc4(bc);
-        dfactor = bitstream_read_signed(bc, 10);
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            b         = dfactor * (b >> dshift) + 128 >> 8 << dshift;
-            p1[i]     = b - a;
-        }
+        dshift  = get_bits_esc4(gb);
+        dfactor = get_sbits(gb, 10);
+        s->tdsp.decorrelate_sf(p1, p2, length, dshift, dfactor);
         break;
     case 6:
         FFSWAP(int32_t*, p1, p2);
     case 7: {
-        LOCAL_ALIGNED_16(int16_t, filter, [MAX_PREDICTORS]);
         int length2, order_half, filter_order, dval1, dval2;
-        int av_uninit(code_size);
-
-        memset(filter, 0, MAX_PREDICTORS * sizeof(*filter));
+        int tmp, x, code_size;
 
         if (length < 256)
             return AVERROR_INVALIDDATA;
 
-        dshift       = bits_esc4(bc);
-        filter_order = 8 << bitstream_read_bit(bc);
-        dval1        = bitstream_read_bit(bc);
-        dval2        = bitstream_read_bit(bc);
+        dshift       = get_bits_esc4(gb);
+        filter_order = 8 << get_bits1(gb);
+        dval1        = get_bits1(gb);
+        dval2        = get_bits1(gb);
 
         for (i = 0; i < filter_order; i++) {
             if (!(i & 3))
-                code_size = 14 - bitstream_read(bc, 3);
-            filter[i] = bitstream_read_signed(bc, code_size);
+                code_size = 14 - get_bits(gb, 3);
+            s->filter[i] = get_sbits(gb, code_size);
         }
 
         order_half = filter_order / 2;
@@ -644,24 +624,40 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
             }
         }
 
-        av_fast_malloc(&s->residues, &s->residues_buf_size,
-                       FFALIGN(length + 16, 16) * sizeof(*s->residues));
-        if (!s->residues)
-            return AVERROR(ENOMEM);
-        memset(s->residues, 0, s->residues_buf_size);
 
-        for (i = 0; i < length; i++)
-            s->residues[i] = p2[i] >> dshift;
+        for (i = 0; i < filter_order; i++)
+            s->residues[i] = *p2++ >> dshift;
 
         p1 += order_half;
+        x = FF_ARRAY_ELEMS(s->residues) - filter_order;
+        for (; length2 > 0; length2 -= tmp) {
+            tmp = FFMIN(length2, x);
+
+            for (i = 0; i < tmp - (tmp == length2); i++)
+                s->residues[filter_order + i] = *p2++ >> dshift;
+
+            for (i = 0; i < tmp; i++) {
+                int v = 1 << 9;
+
+                if (filter_order == 16) {
+                    v += s->adsp.scalarproduct_int16(&s->residues[i], s->filter,
+                                                     filter_order);
+                } else {
+                    v += s->residues[i + 7] * s->filter[7] +
+                         s->residues[i + 6] * s->filter[6] +
+                         s->residues[i + 5] * s->filter[5] +
+                         s->residues[i + 4] * s->filter[4] +
+                         s->residues[i + 3] * s->filter[3] +
+                         s->residues[i + 2] * s->filter[2] +
+                         s->residues[i + 1] * s->filter[1] +
+                         s->residues[i    ] * s->filter[0];
+                }
 
-        for (i = 0; i < length2; i++) {
-            int v = 1 << 9;
-
-            v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
-                                             FFALIGN(filter_order, 16));
+                v = av_clip_intp2(v >> 10, 13) * (1 << dshift) - *p1;
+                *p1++ = v;
+            }
 
-            p1[i] = (av_clip_intp2(v >> 10, 13) << dshift) - p1[i];
+            memmove(s->residues, &s->residues[tmp], 2 * filter_order);
         }
 
         emms_c();
@@ -669,6 +665,11 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
     }
     }
 
+    if (s->dmode > 0 && s->dmode < 6) {
+        p1[0] = bp1;
+        p2[0] = bp2;
+    }
+
     return 0;
 }
 
@@ -677,24 +678,21 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
 {
     TAKDecContext *s  = avctx->priv_data;
     AVFrame *frame    = data;
-    BitstreamContext *bc = &s->bc;
+    ThreadFrame tframe = { .f = data };
+    GetBitContext *gb = &s->gb;
     int chan, i, ret, hsize;
 
     if (pkt->size < TAK_MIN_FRAME_HEADER_BYTES)
         return AVERROR_INVALIDDATA;
 
-    bitstream_init8(bc, pkt->data, pkt->size);
-
-    if ((ret = ff_tak_decode_frame_header(avctx, bc, &s->ti, 0)) < 0)
+    if ((ret = init_get_bits8(gb, pkt->data, pkt->size)) < 0)
         return ret;
 
-    if (s->ti.flags & TAK_FRAME_FLAG_HAS_METADATA) {
-        avpriv_request_sample(avctx, "Frame metadata");
-        return AVERROR_PATCHWELCOME;
-    }
+    if ((ret = ff_tak_decode_frame_header(avctx, gb, &s->ti, 0)) < 0)
+        return ret;
 
-    hsize = bitstream_tell(bc) / 8;
-    if (avctx->err_recognition & AV_EF_CRCCHECK) {
+    hsize = get_bits_count(gb) / 8;
+    if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) {
         if (ff_tak_check_crc(pkt->data, hsize)) {
             av_log(avctx, AV_LOG_ERROR, "CRC error\n");
             if (avctx->err_recognition & AV_EF_EXPLODE)
@@ -728,11 +726,9 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->ti.bps != avctx->bits_per_coded_sample) {
-        avctx->bits_per_coded_sample = s->ti.bps;
-        if ((ret = set_bps_params(avctx)) < 0)
-            return ret;
-    }
+    avctx->bits_per_raw_sample = s->ti.bps;
+    if ((ret = set_bps_params(avctx)) < 0)
+        return ret;
     if (s->ti.sample_rate != avctx->sample_rate) {
         avctx->sample_rate = s->ti.sample_rate;
         set_sample_rate_params(avctx);
@@ -745,10 +741,11 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                                              : s->ti.frame_samples;
 
     frame->nb_samples = s->nb_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
+    ff_thread_finish_setup(avctx);
 
-    if (avctx->bits_per_coded_sample <= 16) {
+    if (avctx->bits_per_raw_sample <= 16) {
         int buf_size = av_samples_get_buffer_size(NULL, avctx->channels,
                                                   s->nb_samples,
                                                   AV_SAMPLE_FMT_S32P, 0);
@@ -771,7 +768,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         for (chan = 0; chan < avctx->channels; chan++) {
             int32_t *decoded = s->decoded[chan];
             for (i = 0; i < s->nb_samples; i++)
-                decoded[i] = bitstream_read_signed(bc, avctx->bits_per_coded_sample);
+                decoded[i] = get_sbits(gb, avctx->bits_per_raw_sample);
         }
     } else {
         if (s->ti.codec == TAK_CODEC_MONO_STEREO) {
@@ -780,25 +777,25 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                     return ret;
 
             if (avctx->channels == 2) {
-                if (bitstream_read_bit(bc)) {
-                    // some kind of subframe length, but it seems to be unused
-                    bitstream_skip(bc, 6);
+                s->nb_subframes = get_bits(gb, 1) + 1;
+                if (s->nb_subframes > 1) {
+                    s->subframe_len[1] = get_bits(gb, 6);
                 }
 
-                s->dmode = bitstream_read(bc, 3);
+                s->dmode = get_bits(gb, 3);
                 if (ret = decorrelate(s, 0, 1, s->nb_samples - 1))
                     return ret;
             }
         } else if (s->ti.codec == TAK_CODEC_MULTICHANNEL) {
-            if (bitstream_read_bit(bc)) {
+            if (get_bits1(gb)) {
                 int ch_mask = 0;
 
-                chan = bitstream_read(bc, 4) + 1;
+                chan = get_bits(gb, 4) + 1;
                 if (chan > avctx->channels)
                     return AVERROR_INVALIDDATA;
 
                 for (i = 0; i < chan; i++) {
-                    int nbit = bitstream_read(bc, 4);
+                    int nbit = get_bits(gb, 4);
 
                     if (nbit >= avctx->channels)
                         return AVERROR_INVALIDDATA;
@@ -806,10 +803,10 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                     if (ch_mask & 1 << nbit)
                         return AVERROR_INVALIDDATA;
 
-                    s->mcdparams[i].present = bitstream_read_bit(bc);
+                    s->mcdparams[i].present = get_bits1(gb);
                     if (s->mcdparams[i].present) {
-                        s->mcdparams[i].index = bitstream_read(bc, 2);
-                        s->mcdparams[i].chan2 = bitstream_read(bc, 4);
+                        s->mcdparams[i].index = get_bits(gb, 2);
+                        s->mcdparams[i].chan2 = get_bits(gb, 4);
                         if (s->mcdparams[i].chan2 >= avctx->channels) {
                             av_log(avctx, AV_LOG_ERROR,
                                    "invalid channel 2 (%d) for %d channel(s)\n",
@@ -865,20 +862,20 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
 
             if (s->sample_shift[chan] > 0)
                 for (i = 0; i < s->nb_samples; i++)
-                    decoded[i] <<= s->sample_shift[chan];
+                    decoded[i] *= 1U << s->sample_shift[chan];
         }
     }
 
-    bitstream_align(bc);
-    bitstream_skip(bc, 24);
-    if (bitstream_bits_left(bc) < 0)
+    align_get_bits(gb);
+    skip_bits(gb, 24);
+    if (get_bits_left(gb) < 0)
         av_log(avctx, AV_LOG_DEBUG, "overread\n");
-    else if (bitstream_bits_left(bc) > 0)
+    else if (get_bits_left(gb) > 0)
         av_log(avctx, AV_LOG_DEBUG, "underread\n");
 
-    if (avctx->err_recognition & AV_EF_CRCCHECK) {
+    if (avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_COMPLIANT)) {
         if (ff_tak_check_crc(pkt->data + hsize,
-                             bitstream_tell(bc) / 8 - hsize)) {
+                             get_bits_count(gb) / 8 - hsize)) {
             av_log(avctx, AV_LOG_ERROR, "CRC error\n");
             if (avctx->err_recognition & AV_EF_EXPLODE)
                 return AVERROR_INVALIDDATA;
@@ -892,7 +889,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
             uint8_t *samples = (uint8_t *)frame->extended_data[chan];
             int32_t *decoded = s->decoded[chan];
             for (i = 0; i < s->nb_samples; i++)
-                samples[i] = decoded[i] + 0x80;
+                samples[i] = decoded[i] + 0x80U;
         }
         break;
     case AV_SAMPLE_FMT_S16P:
@@ -907,7 +904,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         for (chan = 0; chan < avctx->channels; chan++) {
             int32_t *samples = (int32_t *)frame->extended_data[chan];
             for (i = 0; i < s->nb_samples; i++)
-                samples[i] <<= 8;
+                samples[i] *= 1U << 8;
         }
         break;
     }
@@ -917,12 +914,32 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
     return pkt->size;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    TAKDecContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return 0;
+}
+
+static int update_thread_context(AVCodecContext *dst,
+                                 const AVCodecContext *src)
+{
+    TAKDecContext *tsrc = src->priv_data;
+    TAKDecContext *tdst = dst->priv_data;
+
+    if (dst == src)
+        return 0;
+    memcpy(&tdst->ti, &tsrc->ti, sizeof(TAKStreamInfo));
+    return 0;
+}
+#endif
+
 static av_cold int tak_decode_close(AVCodecContext *avctx)
 {
     TAKDecContext *s = avctx->priv_data;
 
     av_freep(&s->decode_buffer);
-    av_freep(&s->residues);
 
     return 0;
 }
@@ -934,10 +951,11 @@ AVCodec ff_tak_decoder = {
     .id               = AV_CODEC_ID_TAK,
     .priv_data_size   = sizeof(TAKDecContext),
     .init             = tak_decode_init,
-    .init_static_data = tak_init_static_data,
     .close            = tak_decode_close,
     .decode           = tak_decode_frame,
-    .capabilities     = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .sample_fmts      = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                         AV_SAMPLE_FMT_S16P,
                                                         AV_SAMPLE_FMT_S32P,
diff --git a/libavcodec/takdsp.c b/libavcodec/takdsp.c
new file mode 100644
index 0000000..2441c2b
--- /dev/null
+++ b/libavcodec/takdsp.c
@@ -0,0 +1,82 @@
+/*
+ * TAK decoder
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "takdsp.h"
+#include "config.h"
+
+static void decorrelate_ls(int32_t *p1, int32_t *p2, int length)
+{
+    int i;
+
+    for (i = 0; i < length; i++) {
+        int32_t a = p1[i];
+        int32_t b = p2[i];
+        p2[i]     = a + b;
+    }
+}
+
+static void decorrelate_sr(int32_t *p1, int32_t *p2, int length)
+{
+    int i;
+
+    for (i = 0; i < length; i++) {
+        int32_t a = p1[i];
+        int32_t b = p2[i];
+        p1[i]     = b - a;
+    }
+}
+
+static void decorrelate_sm(int32_t *p1, int32_t *p2, int length)
+{
+    int i;
+
+    for (i = 0; i < length; i++) {
+        int32_t a = p1[i];
+        int32_t b = p2[i];
+        a        -= b >> 1;
+        p1[i]     = a;
+        p2[i]     = a + b;
+    }
+}
+
+static void decorrelate_sf(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor)
+{
+    int i;
+
+    for (i = 0; i < length; i++) {
+        int32_t a = p1[i];
+        int32_t b = p2[i];
+        b         = dfactor * (b >> dshift) + 128 >> 8 << dshift;
+        p1[i]     = b - a;
+    }
+}
+
+av_cold void ff_takdsp_init(TAKDSPContext *c)
+{
+    c->decorrelate_ls = decorrelate_ls;
+    c->decorrelate_sr = decorrelate_sr;
+    c->decorrelate_sm = decorrelate_sm;
+    c->decorrelate_sf = decorrelate_sf;
+
+    if (ARCH_X86)
+        ff_takdsp_init_x86(c);
+}
diff --git a/libavcodec/takdsp.h b/libavcodec/takdsp.h
new file mode 100644
index 0000000..c05b574
--- /dev/null
+++ b/libavcodec/takdsp.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TAKDSP_H
+#define AVCODEC_TAKDSP_H
+
+#include <stdint.h>
+
+typedef struct TAKDSPContext {
+    void (*decorrelate_ls)(int32_t *p1, int32_t *p2, int length);
+    void (*decorrelate_sr)(int32_t *p1, int32_t *p2, int length);
+    void (*decorrelate_sm)(int32_t *p1, int32_t *p2, int length);
+    void (*decorrelate_sf)(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+} TAKDSPContext;
+
+void ff_takdsp_init(TAKDSPContext *c);
+void ff_takdsp_init_x86(TAKDSPContext *c);
+
+#endif /* AVCODEC_TAKDSP_H */
diff --git a/libavcodec/targa.c b/libavcodec/targa.c
index ef8565f..93e0ef7 100644
--- a/libavcodec/targa.c
+++ b/libavcodec/targa.c
@@ -2,20 +2,20 @@
  * Targa (.tga) image decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,22 +28,37 @@
 
 typedef struct TargaContext {
     GetByteContext gb;
-
-    int color_type;
-    int compression_type;
 } TargaContext;
 
+static uint8_t *advance_line(uint8_t *start, uint8_t *line,
+                             int stride, int *y, int h, int interleave)
+{
+    *y += interleave;
+
+    if (*y < h) {
+        return line + interleave * stride;
+    } else {
+        *y = (*y + 1) & (interleave - 1);
+        if (*y && *y < h) {
+            return start + *y * stride;
+        } else {
+            return NULL;
+        }
+    }
+}
+
 static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
-                            uint8_t *dst, int w, int h, int stride, int bpp)
+                            uint8_t *start, int w, int h, int stride,
+                            int bpp, int interleave)
 {
     int x, y;
     int depth = (bpp + 1) >> 3;
     int type, count;
-    int diff;
+    uint8_t *line = start;
+    uint8_t *dst  = line;
 
-    diff = stride - w * depth;
-    x = y = 0;
-    while (y < h) {
+    x = y = count = 0;
+    while (dst) {
         if (bytestream2_get_bytes_left(&s->gb) <= 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "Ran ouf of data before end-of-image\n");
@@ -52,12 +67,6 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
         type  = bytestream2_get_byteu(&s->gb);
         count = (type & 0x7F) + 1;
         type &= 0x80;
-        if (x + count > w && x + count + 1 > (h - y) * w) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Packet went out of bounds: position (%i,%i) size %i\n",
-                   x, y, count);
-            return AVERROR_INVALIDDATA;
-        }
         if (!type) {
             do {
                 int n  = FFMIN(count, w - x);
@@ -67,10 +76,9 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
                 x     += n;
                 if (x == w) {
                     x    = 0;
-                    y++;
-                    dst += diff;
+                    dst = line = advance_line(start, line, stride, &y, h, interleave);
                 }
-            } while (count > 0);
+            } while (dst && count > 0);
         } else {
             uint8_t tmp[4];
             bytestream2_get_buffer(&s->gb, tmp, depth);
@@ -84,12 +92,17 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
                 } while (--n);
                 if (x == w) {
                     x    = 0;
-                    y++;
-                    dst += diff;
+                    dst = line = advance_line(start, line, stride, &y, h, interleave);
                 }
-            } while (count > 0);
+            } while (dst && count > 0);
         }
     }
+
+    if (count) {
+        av_log(avctx, AV_LOG_ERROR, "Packet went out of bounds\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -101,14 +114,15 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p = data;
     uint8_t *dst;
     int stride;
-    int idlen, compr, y, w, h, bpp, flags, ret;
+    int idlen, pal, compr, y, w, h, bpp, flags, ret;
     int first_clr, colors, csize;
+    int interleave;
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
     /* parse image header */
     idlen     = bytestream2_get_byte(&s->gb);
-    bytestream2_skip(&s->gb, 1); /* pal */
+    pal       = bytestream2_get_byte(&s->gb);
     compr     = bytestream2_get_byte(&s->gb);
     first_clr = bytestream2_get_le16(&s->gb);
     colors    = bytestream2_get_le16(&s->gb);
@@ -117,17 +131,29 @@ static int decode_frame(AVCodecContext *avctx,
     w         = bytestream2_get_le16(&s->gb);
     h         = bytestream2_get_le16(&s->gb);
     bpp       = bytestream2_get_byte(&s->gb);
+
+    if (bytestream2_get_bytes_left(&s->gb) <= idlen) {
+        av_log(avctx, AV_LOG_ERROR,
+                "Not enough data to read header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     flags     = bytestream2_get_byte(&s->gb);
+
+    if (!pal && (first_clr || colors || csize)) {
+        av_log(avctx, AV_LOG_WARNING, "File without colormap has colormap information set.\n");
+        // specification says we should ignore those value in this case
+        first_clr = colors = csize = 0;
+    }
+
     // skip identifier if any
     bytestream2_skip(&s->gb, idlen);
 
-    switch(bpp){
+    switch (bpp) {
     case 8:
         avctx->pix_fmt = ((compr & (~TGA_RLE)) == TGA_BW) ? AV_PIX_FMT_GRAY8 : AV_PIX_FMT_PAL8;
         break;
     case 15:
-        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
-        break;
     case 16:
         avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
         break;
@@ -142,28 +168,34 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    if (colors && (colors + first_clr) > 256) {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect palette: %i colors with offset %i\n", colors, first_clr);
+        return AVERROR_INVALIDDATA;
+    }
+
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
-    if(flags & 0x20){
+    p->pict_type = AV_PICTURE_TYPE_I;
+
+    if (flags & TGA_TOPTOBOTTOM) {
         dst = p->data[0];
         stride = p->linesize[0];
-    }else{ //image is upside-down
+    } else { //image is upside-down
         dst = p->data[0] + p->linesize[0] * (h - 1);
         stride = -p->linesize[0];
     }
 
-    if(colors){
+    interleave = flags & TGA_INTERLEAVE2 ? 2 :
+                 flags & TGA_INTERLEAVE4 ? 4 : 1;
+
+    if (colors) {
         int pal_size, pal_sample_size;
-        if((colors + first_clr) > 256){
-            av_log(avctx, AV_LOG_ERROR, "Incorrect palette: %i colors with offset %i\n", colors, first_clr);
-            return AVERROR_INVALIDDATA;
-        }
+
         switch (csize) {
+        case 32: pal_sample_size = 4; break;
         case 24: pal_sample_size = 3; break;
         case 16:
         case 15: pal_sample_size = 2; break;
@@ -172,9 +204,9 @@ static int decode_frame(AVCodecContext *avctx,
             return AVERROR_INVALIDDATA;
         }
         pal_size = colors * pal_sample_size;
-        if(avctx->pix_fmt != AV_PIX_FMT_PAL8)//should not occur but skip palette anyway
+        if (avctx->pix_fmt != AV_PIX_FMT_PAL8) //should not occur but skip palette anyway
             bytestream2_skip(&s->gb, pal_size);
-        else{
+        else {
             int t;
             uint32_t *pal = ((uint32_t *)p->data[1]) + first_clr;
 
@@ -184,10 +216,14 @@ static int decode_frame(AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             }
             switch (pal_sample_size) {
+            case 4:
+                for (t = 0; t < colors; t++)
+                    *pal++ = bytestream2_get_le32u(&s->gb);
+                break;
             case 3:
                 /* RGB24 */
                 for (t = 0; t < colors; t++)
-                    *pal++ = bytestream2_get_le24u(&s->gb);
+                    *pal++ = (0xffU<<24) | bytestream2_get_le24u(&s->gb);
                 break;
             case 2:
                 /* RGB555 */
@@ -198,34 +234,64 @@ static int decode_frame(AVCodecContext *avctx,
                         ((v & 0x001F) <<  3);
                     /* left bit replication */
                     v |= (v & 0xE0E0E0U) >> 5;
-                    *pal++ = v;
+                    *pal++ = (0xffU<<24) | v;
                 }
                 break;
             }
             p->palette_has_changed = 1;
         }
     }
+
     if ((compr & (~TGA_RLE)) == TGA_NODATA) {
         memset(p->data[0], 0, p->linesize[0] * h);
     } else {
-        if(compr & TGA_RLE){
-            int res = targa_decode_rle(avctx, s, dst, w, h, stride, bpp);
+        if (compr & TGA_RLE) {
+            int res = targa_decode_rle(avctx, s, dst, w, h, stride, bpp, interleave);
             if (res < 0)
                 return res;
         } else {
             size_t img_size = w * ((bpp + 1) >> 3);
+            uint8_t *line;
             if (bytestream2_get_bytes_left(&s->gb) < img_size * h) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Not enough data available for image\n");
                 return AVERROR_INVALIDDATA;
             }
+
+            line = dst;
+            y = 0;
+            do {
+                bytestream2_get_buffer(&s->gb, line, img_size);
+                line = advance_line(dst, line, stride, &y, h, interleave);
+            } while (line);
+        }
+
+        if (flags & TGA_RIGHTTOLEFT) { // right-to-left, needs horizontal flip
+            int x;
             for (y = 0; y < h; y++) {
-                bytestream2_get_bufferu(&s->gb, dst, img_size);
-                dst += stride;
+                void *line = &p->data[0][y * p->linesize[0]];
+                for (x = 0; x < w >> 1; x++) {
+                    switch (bpp) {
+                    case 32:
+                        FFSWAP(uint32_t, ((uint32_t *)line)[x], ((uint32_t *)line)[w - x - 1]);
+                        break;
+                    case 24:
+                        FFSWAP(uint8_t, ((uint8_t *)line)[3 * x    ], ((uint8_t *)line)[3 * w - 3 * x - 3]);
+                        FFSWAP(uint8_t, ((uint8_t *)line)[3 * x + 1], ((uint8_t *)line)[3 * w - 3 * x - 2]);
+                        FFSWAP(uint8_t, ((uint8_t *)line)[3 * x + 2], ((uint8_t *)line)[3 * w - 3 * x - 1]);
+                        break;
+                    case 16:
+                        FFSWAP(uint16_t, ((uint16_t *)line)[x], ((uint16_t *)line)[w - x - 1]);
+                        break;
+                    case 8:
+                        FFSWAP(uint8_t, ((uint8_t *)line)[x], ((uint8_t *)line)[w - x - 1]);
+                    }
+                }
             }
         }
     }
 
+
     *got_frame = 1;
 
     return avpkt->size;
diff --git a/libavcodec/targa.h b/libavcodec/targa.h
index f4ef553..c2f5224 100644
--- a/libavcodec/targa.h
+++ b/libavcodec/targa.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,4 +38,11 @@ enum TargaCompr {
     TGA_RLE    = 8, // flag pointing that data is RLE-coded
 };
 
+enum TargaFlags {
+    TGA_RIGHTTOLEFT = 0x10, // right-to-left (flipped horizontally)
+    TGA_TOPTOBOTTOM = 0x20, // top-to-bottom (NOT flipped vertically)
+    TGA_INTERLEAVE2 = 0x40, // 2-way interleave, odd then even lines
+    TGA_INTERLEAVE4 = 0x80, // 4-way interleave
+};
+
 #endif /* AVCODEC_TARGA_H */
diff --git a/libavcodec/targa_y216dec.c b/libavcodec/targa_y216dec.c
new file mode 100644
index 0000000..443d48a
--- /dev/null
+++ b/libavcodec/targa_y216dec.c
@@ -0,0 +1,84 @@
+/*
+ * Pinnacle TARGA CineWave YUV16 decoder
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y216_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt             = AV_PIX_FMT_YUV422P16;
+    avctx->bits_per_raw_sample = 14;
+
+    return 0;
+}
+
+static int y216_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint16_t *src = (uint16_t *)avpkt->data;
+    uint16_t *y, *u, *v;
+    int aligned_width = FFALIGN(avctx->width, 4);
+    int i, j, ret;
+
+    if (avpkt->size < 4 * avctx->height * aligned_width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = (uint16_t *)pic->data[0];
+    u = (uint16_t *)pic->data[1];
+    v = (uint16_t *)pic->data[2];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width >> 1; j++) {
+            u[    j    ] = src[4 * j    ] << 2 | src[4 * j    ] >> 14;
+            y[2 * j    ] = src[4 * j + 1] << 2 | src[4 * j + 1] >> 14;
+            v[    j    ] = src[4 * j + 2] << 2 | src[4 * j + 2] >> 14;
+            y[2 * j + 1] = src[4 * j + 3] << 2 | src[4 * j + 3] >> 14;
+        }
+
+        y += pic->linesize[0] >> 1;
+        u += pic->linesize[1] >> 1;
+        v += pic->linesize[2] >> 1;
+        src += aligned_width << 1;
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_targa_y216_decoder = {
+    .name         = "targa_y216",
+    .long_name    = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_TARGA_Y216,
+    .init         = y216_decode_init,
+    .decode       = y216_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/targaenc.c b/libavcodec/targaenc.c
index f0cee38..79030a0 100644
--- a/libavcodec/targaenc.c
+++ b/libavcodec/targaenc.c
@@ -2,20 +2,20 @@
  * Targa (.tga) image encoder
  * Copyright (c) 2007 Bobby Bingham
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -86,15 +86,13 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                               const AVFrame *p, int *got_packet)
 {
     TargaContext *s = avctx->priv_data;
-    int bpp, picsize, datasize = -1, ret;
+    int bpp, picsize, datasize = -1, ret, i;
     uint8_t *out;
 
     picsize = av_image_get_buffer_size(avctx->pix_fmt,
                                        avctx->width, avctx->height, 1);
-    if ((ret = ff_alloc_packet(pkt, picsize + 45)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, picsize + 45, 0)) < 0)
         return ret;
-    }
 
     /* zero out the header and only set applicable fields */
     memset(pkt->data, 0, 12);
@@ -103,13 +101,39 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* image descriptor byte: origin is always top-left, bits 0-3 specify alpha */
     pkt->data[17] = 0x20 | (avctx->pix_fmt == AV_PIX_FMT_BGRA ? 8 : 0);
 
+    out = pkt->data + 18;  /* skip past the header we write */
+
+    avctx->bits_per_coded_sample = av_get_bits_per_pixel(av_pix_fmt_desc_get(avctx->pix_fmt));
     switch(avctx->pix_fmt) {
+    case AV_PIX_FMT_PAL8: {
+        int pal_bpp = 24; /* Only write 32bit palette if there is transparency information */
+        for (i = 0; i < 256; i++)
+            if (AV_RN32(p->data[1] + 4 * i) >> 24 != 0xFF) {
+                pal_bpp = 32;
+                break;
+            }
+        pkt->data[1]  = 1;          /* palette present */
+        pkt->data[2]  = TGA_PAL;    /* uncompressed palettised image */
+        pkt->data[6]  = 1;          /* palette contains 256 entries */
+        pkt->data[7]  = pal_bpp;    /* palette contains pal_bpp bit entries */
+        pkt->data[16] = 8;          /* bpp */
+        for (i = 0; i < 256; i++)
+            if (pal_bpp == 32) {
+                AV_WL32(pkt->data + 18 + 4 * i, *(uint32_t *)(p->data[1] + i * 4));
+            } else {
+            AV_WL24(pkt->data + 18 + 3 * i, *(uint32_t *)(p->data[1] + i * 4));
+            }
+        out += 32 * pal_bpp;        /* skip past the palette we just output */
+        break;
+        }
     case AV_PIX_FMT_GRAY8:
         pkt->data[2]  = TGA_BW;     /* uncompressed grayscale image */
+        avctx->bits_per_coded_sample = 0x28;
         pkt->data[16] = 8;          /* bpp */
         break;
     case AV_PIX_FMT_RGB555LE:
-        pkt->data[2]  = TGA_RGB;    /* uncompresses true-color image */
+        pkt->data[2]  = TGA_RGB;    /* uncompressed true-color image */
+        avctx->bits_per_coded_sample =
         pkt->data[16] = 16;         /* bpp */
         break;
     case AV_PIX_FMT_BGR24:
@@ -127,7 +151,6 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
     bpp = pkt->data[16] >> 3;
 
-    out = pkt->data + 18;  /* skip past the header we just output */
 
 #if FF_API_CODER_TYPE
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -142,7 +165,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     /* if that worked well, mark the picture as RLE compressed */
     if(datasize >= 0)
-        pkt->data[2] |= 8;
+        pkt->data[2] |= TGA_RLE;
 
     /* if RLE didn't make it smaller, go back to no compression */
     else datasize = targa_encode_normal(out, p, bpp, avctx->width, avctx->height);
@@ -203,7 +226,7 @@ AVCodec ff_targa_encoder = {
     .init           = targa_encode_init,
     .encode2        = targa_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB555LE, AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB555LE, AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/tdsc.c b/libavcodec/tdsc.c
index 5b952b3..4182404 100644
--- a/libavcodec/tdsc.c
+++ b/libavcodec/tdsc.c
@@ -2,20 +2,20 @@
  * TDSC decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -124,8 +124,8 @@ static av_cold int tdsc_init(AVCodecContext *avctx)
     ctx->jpeg_avctx->flags = avctx->flags;
     ctx->jpeg_avctx->flags2 = avctx->flags2;
     ctx->jpeg_avctx->dct_algo = avctx->dct_algo;
-    ctx->jpeg_avctx->idct_algo = avctx->idct_algo;;
-    ret = avcodec_open2(ctx->jpeg_avctx, codec, NULL);
+    ctx->jpeg_avctx->idct_algo = avctx->idct_algo;
+    ret = ff_codec_open2_recursive(ctx->jpeg_avctx, codec, NULL);
     if (ret < 0)
         return ret;
 
@@ -612,7 +612,7 @@ static int tdsc_decode_frame(AVCodecContext *avctx, void *data,
     }
     *got_frame = 1;
 
-    return 0;
+    return avpkt->size;
 }
 
 AVCodec ff_tdsc_decoder = {
diff --git a/libavcodec/tests/.gitignore b/libavcodec/tests/.gitignore
index 488a3be..56ddb2c 100644
--- a/libavcodec/tests/.gitignore
+++ b/libavcodec/tests/.gitignore
@@ -1,7 +1,23 @@
+/avfft
+/avpacket
+/cabac
+/celp_math
+/codec_desc
 /dct
 /fft
 /fft-fixed
+/fft-fixed32
 /golomb
+/h264_levels
+/htmlsubtitles
 /iirfilter
+/imgconvert
+/jpeg2000dwt
+/mathops
+/mjpegenc_huffman
+/motion
 /mpeg12framerate
+/options
 /rangecoder
+/snowenc
+/utils
diff --git a/libavcodec/tests/aarch64/dct.c b/libavcodec/tests/aarch64/dct.c
new file mode 100644
index 0000000..032a963
--- /dev/null
+++ b/libavcodec/tests/aarch64/dct.c
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavcodec/aarch64/idct.h"
+
+static const struct algo fdct_tab_arch[] = {
+    { 0 }
+};
+
+static const struct algo idct_tab_arch[] = {
+    { "SIMPLE-NEON", ff_simple_idct_neon, FF_IDCT_PERM_PARTTRANS, AV_CPU_FLAG_NEON },
+    { 0 }
+};
diff --git a/libavcodec/tests/arm/dct.c b/libavcodec/tests/arm/dct.c
index d18cb52..596d369 100644
--- a/libavcodec/tests/arm/dct.c
+++ b/libavcodec/tests/arm/dct.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tests/avfft.c b/libavcodec/tests/avfft.c
new file mode 100644
index 0000000..22aa99a
--- /dev/null
+++ b/libavcodec/tests/avfft.c
@@ -0,0 +1,25 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * This test is similar to fft-fixed.c or fft-fixed32.c
+ */
+
+#define AVFFT 1
+#define FFT_FLOAT 1
+#include "fft.c"
diff --git a/libavcodec/tests/avpacket.c b/libavcodec/tests/avpacket.c
new file mode 100644
index 0000000..90b7234
--- /dev/null
+++ b/libavcodec/tests/avpacket.c
@@ -0,0 +1,128 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <string.h>
+#include "libavcodec/avcodec.h"
+#include "libavutil/error.h"
+
+
+
+static int setup_side_data_entry(AVPacket* avpkt)
+{
+    const uint8_t *data_name = NULL;
+    int ret = 0, bytes;
+    uint8_t *extra_data = NULL;
+
+
+    /* get side_data_name string */
+    data_name = av_packet_side_data_name(AV_PKT_DATA_NEW_EXTRADATA);
+
+    /* Allocate a memory bloc */
+    bytes = strlen(data_name);
+
+    if(!(extra_data = av_malloc(bytes))){
+        ret = AVERROR(ENOMEM);
+        fprintf(stderr, "Error occurred: %s\n", av_err2str(ret));
+        exit(1);
+    }
+    /* copy side_data_name to extra_data array */
+    memcpy(extra_data, data_name, bytes);
+
+    /* create side data for AVPacket */
+    ret = av_packet_add_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
+                                        extra_data, bytes);
+    if(ret < 0){
+        fprintf(stderr,
+                "Error occurred in av_packet_add_side_data: %s\n",
+                av_err2str(ret));
+    }
+
+    return ret;
+}
+
+static int initializations(AVPacket* avpkt)
+{
+    const static uint8_t* data = "selftest for av_packet_clone(...)";
+    int ret = 0;
+
+    /* initialize avpkt */
+    av_init_packet(avpkt);
+
+    /* set values for avpkt */
+    avpkt->pts = 17;
+    avpkt->dts = 2;
+    avpkt->data = (uint8_t*)data;
+    avpkt->size = strlen(data);
+    avpkt->flags = AV_PKT_FLAG_DISCARD;
+    avpkt->duration = 100;
+    avpkt->pos = 3;
+
+    ret = setup_side_data_entry(avpkt);
+
+    return ret;
+}
+
+int main(void)
+{
+    AVPacket avpkt;
+    AVPacket *avpkt_clone = NULL;
+    int ret = 0;
+
+    if(initializations(&avpkt) < 0){
+        printf("failed to initialize variables\n");
+        return 1;
+    }
+    /* test av_packet_clone*/
+    avpkt_clone = av_packet_clone(&avpkt);
+
+    if(!avpkt_clone) {
+        av_log(NULL, AV_LOG_ERROR,"av_packet_clone failed to clone AVPacket\n");
+        return 1;
+    }
+    /*test av_grow_packet*/
+    if(av_grow_packet(avpkt_clone, 20) < 0){
+        av_log(NULL, AV_LOG_ERROR, "av_grow_packet failed\n");
+        return 1;
+    }
+    if(av_grow_packet(avpkt_clone, INT_MAX) == 0){
+        printf( "av_grow_packet failed to return error "
+                "when \"grow_by\" parameter is too large.\n" );
+        ret = 1;
+    }
+    /* test size error check in av_new_packet*/
+    if(av_new_packet(avpkt_clone, INT_MAX) == 0){
+        printf( "av_new_packet failed to return error "
+                "when \"size\" parameter is too large.\n" );
+        ret = 1;
+    }
+    /*test size error check in av_packet_from_data*/
+    if(av_packet_from_data(avpkt_clone, avpkt_clone->data, INT_MAX) == 0){
+        printf("av_packet_from_data failed to return error "
+                "when \"size\" parameter is too large.\n" );
+        ret = 1;
+    }
+    /*clean up*/
+    av_packet_free(&avpkt_clone);
+    av_packet_unref(&avpkt);
+
+
+    return ret;
+}
diff --git a/libavcodec/tests/cabac.c b/libavcodec/tests/cabac.c
new file mode 100644
index 0000000..affe4eb
--- /dev/null
+++ b/libavcodec/tests/cabac.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/cabac.c"
+
+#define SIZE 10240
+
+#include "libavutil/lfg.h"
+#include "libavcodec/avcodec.h"
+
+static inline void put_cabac_bit(CABACContext *c, int b){
+    put_bits(&c->pb, 1, b);
+    for(;c->outstanding_count; c->outstanding_count--){
+        put_bits(&c->pb, 1, 1-b);
+    }
+}
+
+static inline void renorm_cabac_encoder(CABACContext *c){
+    while(c->range < 0x100){
+        //FIXME optimize
+        if(c->low<0x100){
+            put_cabac_bit(c, 0);
+        }else if(c->low<0x200){
+            c->outstanding_count++;
+            c->low -= 0x100;
+        }else{
+            put_cabac_bit(c, 1);
+            c->low -= 0x200;
+        }
+
+        c->range+= c->range;
+        c->low += c->low;
+    }
+}
+
+static void put_cabac(CABACContext *c, uint8_t * const state, int bit){
+    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state];
+
+    if(bit == ((*state)&1)){
+        c->range -= RangeLPS;
+        *state    = ff_h264_mlps_state[128 + *state];
+    }else{
+        c->low += c->range - RangeLPS;
+        c->range = RangeLPS;
+        *state= ff_h264_mlps_state[127 - *state];
+    }
+
+    renorm_cabac_encoder(c);
+}
+
+/**
+ * @param bit 0 -> write zero bit, !=0 write one bit
+ */
+static void put_cabac_bypass(CABACContext *c, int bit){
+    c->low += c->low;
+
+    if(bit){
+        c->low += c->range;
+    }
+//FIXME optimize
+    if(c->low<0x200){
+        put_cabac_bit(c, 0);
+    }else if(c->low<0x400){
+        c->outstanding_count++;
+        c->low -= 0x200;
+    }else{
+        put_cabac_bit(c, 1);
+        c->low -= 0x400;
+    }
+}
+
+/**
+ *
+ * @return the number of bytes written
+ */
+static int put_cabac_terminate(CABACContext *c, int bit){
+    c->range -= 2;
+
+    if(!bit){
+        renorm_cabac_encoder(c);
+    }else{
+        c->low += c->range;
+        c->range= 2;
+
+        renorm_cabac_encoder(c);
+
+        av_assert0(c->low <= 0x1FF);
+        put_cabac_bit(c, c->low>>9);
+        put_bits(&c->pb, 2, ((c->low>>7)&3)|1);
+
+        flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
+    }
+
+    return (put_bits_count(&c->pb)+7)>>3;
+}
+
+int main(void){
+    CABACContext c;
+    uint8_t b[9*SIZE];
+    uint8_t r[9*SIZE];
+    int i, ret = 0;
+    uint8_t state[10]= {0};
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1);
+    ff_init_cabac_encoder(&c, b, SIZE);
+
+    for(i=0; i<SIZE; i++){
+        if(2*i<SIZE) r[i] = av_lfg_get(&prng) % 7;
+        else         r[i] = (i>>8)&1;
+    }
+
+    for(i=0; i<SIZE; i++){
+        put_cabac_bypass(&c, r[i]&1);
+    }
+
+    for(i=0; i<SIZE; i++){
+        put_cabac(&c, state, r[i]&1);
+    }
+
+    i= put_cabac_terminate(&c, 1);
+    b[i++] = av_lfg_get(&prng);
+    b[i  ] = av_lfg_get(&prng);
+
+    ff_init_cabac_decoder(&c, b, SIZE);
+
+    memset(state, 0, sizeof(state));
+
+    for(i=0; i<SIZE; i++){
+        if( (r[i]&1) != get_cabac_bypass(&c) ) {
+            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
+            ret = 1;
+        }
+    }
+
+    for(i=0; i<SIZE; i++){
+        if( (r[i]&1) != get_cabac_noinline(&c, state) ) {
+            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
+            ret = 1;
+        }
+    }
+    if(!get_cabac_terminate(&c)) {
+        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
+        ret = 1;
+    }
+
+    return ret;
+}
diff --git a/libavcodec/tests/celp_math.c b/libavcodec/tests/celp_math.c
new file mode 100644
index 0000000..669ea70
--- /dev/null
+++ b/libavcodec/tests/celp_math.c
@@ -0,0 +1,49 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/libm.h"
+#include "libavcodec/celp_math.c"
+
+static inline void IsAlmostEqual(float A, float B, float epsilon)
+{
+    float diff = fabsf(A - B);
+    float absa = fabsf(A);
+    float absb = fabsf(B);
+    float largest = (absb > absa) ? absb : absa;
+    av_assert0(diff <= largest * epsilon);
+}
+
+int main(void)
+{
+    int i;
+    const float f1[3]   = {0.0,  1.1,  2.2};
+    const float f2[3]   = {3.3,  4.4,  5.5};
+    const int16_t i1[3] = {6,  7,  8};
+    const int16_t i2[3] = {9, 10, 11};
+
+    float   r = ff_dot_productf(f1, f2, FF_ARRAY_ELEMS(f1));
+    int64_t d = ff_dot_product(i1, i2, FF_ARRAY_ELEMS(i1));
+
+    IsAlmostEqual(16.94f, r, 0.000001f);
+    av_assert0(212 == d);
+
+    for (i = 1024; i >= 1; i/=2)
+        av_assert0(ff_log2_q15(i) == (1<<15)*((int)log2(i))+(1<<2));
+
+    return 0;
+}
diff --git a/libavcodec/tests/codec_desc.c b/libavcodec/tests/codec_desc.c
new file mode 100644
index 0000000..c9b3497
--- /dev/null
+++ b/libavcodec/tests/codec_desc.c
@@ -0,0 +1,45 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+
+int main(int argc, char **argv)
+{
+    const AVCodecDescriptor *old_desc = NULL, *desc;
+
+    while (desc = avcodec_descriptor_next(old_desc)) {
+        if (old_desc && old_desc->id >= desc->id) {
+            av_log(NULL, AV_LOG_FATAL, "Unsorted codec_descriptors '%s' and '%s'.\n", old_desc->name, desc->name);
+            return 1;
+        }
+
+        if (avcodec_descriptor_get(desc->id) != desc) {
+            av_log(NULL, AV_LOG_FATAL, "avcodec_descriptor_get() failed with '%s'.\n", desc->name);
+            return 1;
+        }
+
+        if (avcodec_descriptor_get_by_name(desc->name) != desc) {
+            av_log(NULL, AV_LOG_FATAL, "avcodec_descriptor_get_by_name() failed with '%s'.\n", desc->name);
+            return 1;
+        }
+
+        old_desc = desc;
+    }
+
+    return 0;
+}
diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
index 84ad5f3..2ca8039 100644
--- a/libavcodec/tests/dct.c
+++ b/libavcodec/tests/dct.c
@@ -2,20 +2,20 @@
  * (c) 2001 Fabrice Bellard
  *     2007 Marc Hoffman <marc.hoffman@analog.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,14 +40,14 @@
 #include "libavutil/lfg.h"
 #include "libavutil/time.h"
 
-#include "libavcodec/aandcttab.h"
 #include "libavcodec/dct.h"
-#include "libavcodec/dctref.h"
-#include "libavcodec/faandct.h"
-#include "libavcodec/faanidct.h"
 #include "libavcodec/idctdsp.h"
 #include "libavcodec/simple_idct.h"
 #include "libavcodec/xvididct.h"
+#include "libavcodec/aandcttab.h"
+#include "libavcodec/faandct.h"
+#include "libavcodec/faanidct.h"
+#include "libavcodec/dctref.h"
 
 struct algo {
     const char *name;
@@ -66,10 +66,26 @@ static const struct algo fdct_tab[] = {
 #endif /* CONFIG_FAANDCT */
 };
 
+static void ff_prores_idct_wrap(int16_t *dst){
+    LOCAL_ALIGNED(16, int16_t, qmat, [64]);
+    int i;
+
+    for(i=0; i<64; i++){
+        qmat[i]=4;
+    }
+    ff_prores_idct_10(dst, qmat);
+    for(i=0; i<64; i++) {
+         dst[i] -= 512;
+    }
+}
+
 static const struct algo idct_tab[] = {
     { "REF-DBL",     ff_ref_idct,          FF_IDCT_PERM_NONE },
     { "INT",         ff_j_rev_dct,         FF_IDCT_PERM_LIBMPEG2 },
-    { "SIMPLE-C",    ff_simple_idct_8,     FF_IDCT_PERM_NONE },
+    { "SIMPLE-C",    ff_simple_idct_int16_8bit,     FF_IDCT_PERM_NONE },
+    { "SIMPLE-C10",  ff_simple_idct_int16_10bit,    FF_IDCT_PERM_NONE },
+    { "SIMPLE-C12",  ff_simple_idct_int16_12bit,    FF_IDCT_PERM_NONE, 0, 1 },
+    { "PR-C",        ff_prores_idct_wrap,  FF_IDCT_PERM_NONE, 0, 1 },
 #if CONFIG_FAANIDCT
     { "FAANI",       ff_faanidct,          FF_IDCT_PERM_NONE },
 #endif /* CONFIG_FAANIDCT */
@@ -78,7 +94,9 @@ static const struct algo idct_tab[] = {
 #endif /* CONFIG_MPEG4_DECODER */
 };
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+#include "aarch64/dct.c"
+#elif ARCH_ARM
 #include "arm/dct.c"
 #elif ARCH_PPC
 #include "ppc/dct.c"
@@ -97,7 +115,7 @@ static const struct algo idct_tab_arch[] = { { 0 } };
 DECLARE_ALIGNED(16, static int16_t, block)[64];
 DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 
-static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
+static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 {
     int i, j;
 
@@ -106,7 +124,7 @@ static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
     switch (test) {
     case 0:
         for (i = 0; i < 64; i++)
-            block[i] = (av_lfg_get(prng) % 512) - 256;
+            block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
         if (is_idct) {
             ff_ref_fdct(block);
             for (i = 0; i < 64; i++)
@@ -115,11 +133,13 @@ static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
         break;
     case 1:
         j = av_lfg_get(prng) % 10 + 1;
-        for (i = 0; i < j; i++)
-            block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
+        for (i = 0; i < j; i++) {
+            int idx = av_lfg_get(prng) % 64;
+            block[idx] = av_lfg_get(prng) % (2*vals) -vals;
+        }
         break;
     case 2:
-        block[ 0] = av_lfg_get(prng) % 4096 - 2048;
+        block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
         block[63] = (block[0] & 1) ^ 1;
         break;
     }
@@ -144,6 +164,10 @@ static void permute(int16_t dst[64], const int16_t src[64],
         for (i = 0; i < 64; i++)
             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
         break;
+    case FF_IDCT_PERM_TRANSPOSE:
+        for (i = 0; i < 64; i++)
+            dst[(i>>3) | ((i<<3)&0x38)] = src[i];
+        break;
     default:
         for (i = 0; i < 64; i++)
             dst[i] = src[i];
@@ -151,16 +175,18 @@ static void permute(int16_t dst[64], const int16_t src[64],
     }
 }
 
-static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
+static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 {
     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
     int it, i, scale;
     int err_inf, v;
     int64_t err2, ti, ti1, it1, err_sum = 0;
     int64_t sysErr[64], sysErrMax = 0;
+    int64_t err2_matrix[64], err2_max = 0;
     int maxout = 0;
     int blockSumErrMax = 0, blockSumErr;
     AVLFG prng;
+    const int vals=1<<bits;
     double omse, ome;
     int spec_err;
 
@@ -169,9 +195,9 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
     err_inf = 0;
     err2 = 0;
     for (i = 0; i < 64; i++)
-        sysErr[i] = 0;
+        err2_matrix[i] = sysErr[i] = 0;
     for (it = 0; it < NB_ITS; it++) {
-        init_block(block1, test, is_idct, &prng);
+        init_block(block1, test, is_idct, &prng, vals);
         permute(block, block1, dct->perm_type);
 
         dct->func(block);
@@ -185,6 +211,9 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
         }
 
         ref(block1);
+        if (!strcmp(dct->name, "PR-SSE2"))
+            for (i = 0; i < 64; i++)
+                block1[i] = av_clip(block1[i], 4-512, 1019-512);
 
         blockSumErr = 0;
         for (i = 0; i < 64; i++) {
@@ -193,6 +222,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
             v = abs(err);
             if (v > err_inf)
                 err_inf = v;
+            err2_matrix[i] += v * v;
             err2 += v * v;
             sysErr[i] += block[i] - block1[i];
             blockSumErr += v;
@@ -202,8 +232,10 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
         if (blockSumErrMax < blockSumErr)
             blockSumErrMax = blockSumErr;
     }
-    for (i = 0; i < 64; i++)
+    for (i = 0; i < 64; i++) {
         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
+        err2_max  = FFMAX(err2_max , FFABS(err2_matrix[i]));
+    }
 
     for (i = 0; i < 64; i++) {
         if (i % 8 == 0)
@@ -216,20 +248,25 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
     ome  = (double) err_sum / NB_ITS / 64;
 
     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
+    if (test < 2)
+        spec_err = is_idct && ((double) err2_max / NB_ITS > 0.06 || (double) sysErrMax / NB_ITS > 0.015);
 
-    printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
+    printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
            omse, ome, (double) sysErrMax / NB_ITS,
            maxout, blockSumErrMax);
 
-    if (spec_err && !dct->nonspec)
+    if (spec_err && !dct->nonspec) {
+        printf("Failed!\n");
         return 1;
+    }
 
     if (!speed)
         return 0;
 
     /* speed test */
-    init_block(block, test, is_idct, &prng);
+
+    init_block(block, test, is_idct, &prng, vals);
     permute(block1, block, dct->perm_type);
 
     ti = av_gettime_relative();
@@ -239,10 +276,10 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
             memcpy(block, block1, sizeof(block));
             dct->func(block);
         }
+        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-    emms_c();
 
     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
            (double) it1 * 1000.0 / (double) ti1);
@@ -368,6 +405,25 @@ static void idct248_error(const char *name,
             if (v > err_max)
                 err_max = v;
         }
+#if 0
+        printf("ref=\n");
+        for(i=0;i<8;i++) {
+            int j;
+            for(j=0;j<8;j++) {
+                printf(" %3d", img_dest1[i*8+j]);
+            }
+            printf("\n");
+        }
+
+        printf("out=\n");
+        for(i=0;i<8;i++) {
+            int j;
+            for(j=0;j<8;j++) {
+                printf(" %3d", img_dest[i*8+j]);
+            }
+            printf("\n");
+        }
+#endif
     }
     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 
@@ -382,10 +438,10 @@ static void idct248_error(const char *name,
                 block[i] = block1[i];
             idct248_put(img_dest, 8, block);
         }
+        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-    emms_c();
 
     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
            (double) it1 * 1000.0 / (double) ti1);
@@ -393,10 +449,11 @@ static void idct248_error(const char *name,
 
 static void help(void)
 {
-    printf("dct-test [-i] [<test-number>]\n"
+    printf("dct-test [-i] [<test-number>] [<bits>]\n"
            "test-number 0 -> test with random matrixes\n"
            "            1 -> test with random sparse matrixes\n"
            "            2 -> do 3. test from MPEG-4 std\n"
+           "bits        Number of time domain bits to use, 8 is default\n"
            "-i          test IDCT implementations\n"
            "-4          test IDCT248 implementations\n"
            "-t          speed test\n");
@@ -413,6 +470,7 @@ int main(int argc, char **argv)
     int test = 1;
     int speed = 0;
     int err = 0;
+    int bits=8;
 
     ff_ref_dct_init();
 
@@ -439,8 +497,9 @@ int main(int argc, char **argv)
 
     if (optind < argc)
         test = atoi(argv[optind]);
+    if(optind+1 < argc) bits= atoi(argv[optind+1]);
 
-    printf("Libav DCT/IDCT test\n");
+    printf("ffmpeg DCT/IDCT test\n");
 
     if (test_248_dct) {
         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
@@ -448,20 +507,20 @@ int main(int argc, char **argv)
         const int cpu_flags = av_get_cpu_flags();
         if (test_idct) {
             for (i = 0; i < FF_ARRAY_ELEMS(idct_tab); i++)
-                err |= dct_error(&idct_tab[i], test, test_idct, speed);
+                err |= dct_error(&idct_tab[i], test, test_idct, speed, bits);
 
             for (i = 0; idct_tab_arch[i].name; i++)
                 if (!(~cpu_flags & idct_tab_arch[i].cpu_flag))
-                    err |= dct_error(&idct_tab_arch[i], test, test_idct, speed);
+                    err |= dct_error(&idct_tab_arch[i], test, test_idct, speed, bits);
         }
 #if CONFIG_FDCTDSP
         else {
             for (i = 0; i < FF_ARRAY_ELEMS(fdct_tab); i++)
-                err |= dct_error(&fdct_tab[i], test, test_idct, speed);
+                err |= dct_error(&fdct_tab[i], test, test_idct, speed, bits);
 
             for (i = 0; fdct_tab_arch[i].name; i++)
                 if (!(~cpu_flags & fdct_tab_arch[i].cpu_flag))
-                    err |= dct_error(&fdct_tab_arch[i], test, test_idct, speed);
+                    err |= dct_error(&fdct_tab_arch[i], test, test_idct, speed, bits);
         }
 #endif /* CONFIG_FDCTDSP */
     }
diff --git a/libavcodec/tests/fft-fixed.c b/libavcodec/tests/fft-fixed.c
index 6edd810..3c50bf1 100644
--- a/libavcodec/tests/fft-fixed.c
+++ b/libavcodec/tests/fft-fixed.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define AVFFT 0
 #include "fft.c"
diff --git a/libavcodec/tests/fft-fixed32.c b/libavcodec/tests/fft-fixed32.c
new file mode 100644
index 0000000..9fadd8a
--- /dev/null
+++ b/libavcodec/tests/fft-fixed32.c
@@ -0,0 +1,22 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#define AVFFT 0
+#include "fft.c"
diff --git a/libavcodec/tests/fft.c b/libavcodec/tests/fft.c
index db1ce98..83f2ff2 100644
--- a/libavcodec/tests/fft.c
+++ b/libavcodec/tests/fft.c
@@ -1,20 +1,20 @@
 /*
  * (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,10 @@
 
 #include "config.h"
 
+#ifndef AVFFT
+#define AVFFT 0
+#endif
+
 #include <math.h>
 #if HAVE_UNISTD_H
 #include <unistd.h>
@@ -39,7 +43,12 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/time.h"
 
+#if AVFFT
+#include "libavcodec/avfft.h"
+#else
 #include "libavcodec/fft.h"
+#endif
+
 #if FFT_FLOAT
 #include "libavcodec/dct.h"
 #include "libavcodec/rdft.h"
@@ -55,10 +64,14 @@
         pim += (MUL16(are, bim) + MUL16(bre, aim)); \
     }
 
-#if FFT_FLOAT
+#if FFT_FLOAT || AVFFT
 #define RANGE 1.0
 #define REF_SCALE(x, bits)  (x)
 #define FMT "%10.6f"
+#elif FFT_FIXED_32
+#define RANGE 8388608
+#define REF_SCALE(x, bits) (x)
+#define FMT "%6d"
 #else
 #define RANGE 16384
 #define REF_SCALE(x, bits) ((x) / (1 << (bits)))
@@ -73,7 +86,7 @@ static int fft_ref_init(int nbits, int inverse)
 {
     int i, n = 1 << nbits;
 
-    exptab = av_malloc((n / 2) * sizeof(*exptab));
+    exptab = av_malloc_array((n / 2), sizeof(*exptab));
     if (!exptab)
         return AVERROR(ENOMEM);
 
@@ -150,7 +163,7 @@ static void mdct_ref(FFTSample *output, FFTSample *input, int nbits)
 
 #if FFT_FLOAT
 #if CONFIG_DCT
-static void idct_ref(float *output, float *input, int nbits)
+static void idct_ref(FFTSample *output, FFTSample *input, int nbits)
 {
     int i, k, n = 1 << nbits;
 
@@ -165,7 +178,7 @@ static void idct_ref(float *output, float *input, int nbits)
     }
 }
 
-static void dct_ref(float *output, float *input, int nbits)
+static void dct_ref(FFTSample *output, FFTSample *input, int nbits)
 {
     int i, k, n = 1 << nbits;
 
@@ -203,10 +216,138 @@ static int check_diff(FFTSample *tab1, FFTSample *tab2, int n, double scale)
         if (e > max)
             max = e;
     }
-    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error) / n);
+    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error / n));
     return err;
 }
 
+static inline void fft_init(FFTContext **s, int nbits, int inverse)
+{
+#if AVFFT
+    *s = av_fft_init(nbits, inverse);
+#else
+    ff_fft_init(*s, nbits, inverse);
+#endif
+}
+
+static inline void mdct_init(FFTContext **s, int nbits, int inverse, double scale)
+{
+#if AVFFT
+    *s = av_mdct_init(nbits, inverse, scale);
+#else
+    ff_mdct_init(*s, nbits, inverse, scale);
+#endif
+}
+
+static inline void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+#if AVFFT
+    av_mdct_calc(s, output, input);
+#else
+    s->mdct_calc(s, output, input);
+#endif
+}
+
+static inline void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+#if AVFFT
+    av_imdct_calc(s, output, input);
+#else
+    s->imdct_calc(s, output, input);
+#endif
+}
+
+static inline void fft_permute(FFTContext *s, FFTComplex *z)
+{
+#if AVFFT
+    av_fft_permute(s, z);
+#else
+    s->fft_permute(s, z);
+#endif
+}
+
+static inline void fft_calc(FFTContext *s, FFTComplex *z)
+{
+#if AVFFT
+    av_fft_calc(s, z);
+#else
+    s->fft_calc(s, z);
+#endif
+}
+
+static inline void mdct_end(FFTContext *s)
+{
+#if AVFFT
+    av_mdct_end(s);
+#else
+    ff_mdct_end(s);
+#endif
+}
+
+static inline void fft_end(FFTContext *s)
+{
+#if AVFFT
+    av_fft_end(s);
+#else
+    ff_fft_end(s);
+#endif
+}
+
+#if FFT_FLOAT
+static inline void rdft_init(RDFTContext **r, int nbits, enum RDFTransformType trans)
+{
+#if AVFFT
+    *r = av_rdft_init(nbits, trans);
+#else
+    ff_rdft_init(*r, nbits, trans);
+#endif
+}
+
+static inline void dct_init(DCTContext **d, int nbits, enum DCTTransformType trans)
+{
+#if AVFFT
+    *d = av_dct_init(nbits, trans);
+#else
+    ff_dct_init(*d, nbits, trans);
+#endif
+}
+
+static inline void rdft_calc(RDFTContext *r, FFTSample *tab)
+{
+#if AVFFT
+    av_rdft_calc(r, tab);
+#else
+    r->rdft_calc(r, tab);
+#endif
+}
+
+static inline void dct_calc(DCTContext *d, FFTSample *data)
+{
+#if AVFFT
+    av_dct_calc(d, data);
+#else
+    d->dct_calc(d, data);
+#endif
+}
+
+static inline void rdft_end(RDFTContext *r)
+{
+#if AVFFT
+    av_rdft_end(r);
+#else
+    ff_rdft_end(r);
+#endif
+}
+
+static inline void dct_end(DCTContext *d)
+{
+#if AVFFT
+    av_dct_end(d);
+#else
+    ff_dct_end(d);
+#endif
+}
+#endif /* FFT_FLOAT */
+
 static void help(void)
 {
     av_log(NULL, AV_LOG_INFO,
@@ -237,10 +378,10 @@ int main(int argc, char **argv)
     FFTComplex *tab, *tab1, *tab_ref;
     FFTSample *tab2;
     enum tf_transform transform = TRANSFORM_FFT;
-    FFTContext m, s;
+    FFTContext *m, *s;
 #if FFT_FLOAT
-    RDFTContext r;
-    DCTContext d;
+    RDFTContext *r;
+    DCTContext *d;
 #endif /* FFT_FLOAT */
     int it, i, err = 1;
     int do_speed = 0, do_inverse = 0;
@@ -248,6 +389,16 @@ int main(int argc, char **argv)
     double scale = 1.0;
     AVLFG prng;
 
+#if !AVFFT
+    s = av_mallocz(sizeof(*s));
+    m = av_mallocz(sizeof(*m));
+#endif
+
+#if !AVFFT && FFT_FLOAT
+    r = av_mallocz(sizeof(*r));
+    d = av_mallocz(sizeof(*d));
+#endif
+
     av_lfg_init(&prng, 1);
 
     for (;;) {
@@ -281,20 +432,22 @@ int main(int argc, char **argv)
             break;
         case 'c':
         {
-            int cpuflags = av_parse_cpu_flags(optarg);
-            if (cpuflags < 0)
+            unsigned cpuflags = av_get_cpu_flags();
+
+            if (av_parse_cpu_caps(&cpuflags, optarg) < 0)
                 return 1;
-            av_set_cpu_flags_mask(cpuflags);
+
+            av_force_cpu_flags(cpuflags);
             break;
         }
         }
     }
 
     fft_size = 1 << fft_nbits;
-    tab      = av_malloc(fft_size * sizeof(FFTComplex));
-    tab1     = av_malloc(fft_size * sizeof(FFTComplex));
-    tab_ref  = av_malloc(fft_size * sizeof(FFTComplex));
-    tab2     = av_malloc(fft_size * sizeof(FFTSample));
+    tab      = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab1     = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab_ref  = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab2     = av_malloc_array(fft_size, sizeof(FFTSample));
 
     if (!(tab && tab1 && tab_ref && tab2))
         goto cleanup;
@@ -307,7 +460,7 @@ int main(int argc, char **argv)
             av_log(NULL, AV_LOG_INFO, "IMDCT");
         else
             av_log(NULL, AV_LOG_INFO, "MDCT");
-        ff_mdct_init(&m, fft_nbits, do_inverse, scale);
+        mdct_init(&m, fft_nbits, do_inverse, scale);
         break;
 #endif /* CONFIG_MDCT */
     case TRANSFORM_FFT:
@@ -315,31 +468,31 @@ int main(int argc, char **argv)
             av_log(NULL, AV_LOG_INFO, "IFFT");
         else
             av_log(NULL, AV_LOG_INFO, "FFT");
-        ff_fft_init(&s, fft_nbits, do_inverse);
-        if (err = fft_ref_init(fft_nbits, do_inverse) < 0)
+        fft_init(&s, fft_nbits, do_inverse);
+        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
             goto cleanup;
         break;
 #if FFT_FLOAT
-#if CONFIG_RDFT
+#    if CONFIG_RDFT
     case TRANSFORM_RDFT:
         if (do_inverse)
             av_log(NULL, AV_LOG_INFO, "IDFT_C2R");
         else
             av_log(NULL, AV_LOG_INFO, "DFT_R2C");
-        ff_rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C);
-        if (err = fft_ref_init(fft_nbits, do_inverse) < 0)
+        rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C);
+        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
             goto cleanup;
         break;
-#endif /* CONFIG_RDFT */
-#if CONFIG_DCT
+#    endif /* CONFIG_RDFT */
+#    if CONFIG_DCT
     case TRANSFORM_DCT:
         if (do_inverse)
             av_log(NULL, AV_LOG_INFO, "DCT_III");
         else
             av_log(NULL, AV_LOG_INFO, "DCT_II");
-        ff_dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II);
+        dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II);
         break;
-#endif /* CONFIG_DCT */
+#    endif /* CONFIG_DCT */
 #endif /* FFT_FLOAT */
     default:
         av_log(NULL, AV_LOG_ERROR, "Requested transform not supported\n");
@@ -362,19 +515,19 @@ int main(int argc, char **argv)
     case TRANSFORM_MDCT:
         if (do_inverse) {
             imdct_ref(&tab_ref->re, &tab1->re, fft_nbits);
-            m.imdct_calc(&m, tab2, &tab1->re);
+            imdct_calc(m, tab2, &tab1->re);
             err = check_diff(&tab_ref->re, tab2, fft_size, scale);
         } else {
             mdct_ref(&tab_ref->re, &tab1->re, fft_nbits);
-            m.mdct_calc(&m, tab2, &tab1->re);
+            mdct_calc(m, tab2, &tab1->re);
             err = check_diff(&tab_ref->re, tab2, fft_size / 2, scale);
         }
         break;
 #endif /* CONFIG_MDCT */
     case TRANSFORM_FFT:
         memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
-        s.fft_permute(&s, tab);
-        s.fft_calc(&s, tab);
+        fft_permute(s, tab);
+        fft_calc(s, tab);
 
         fft_ref(tab_ref, tab1, fft_nbits);
         err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 1.0);
@@ -395,7 +548,7 @@ int main(int argc, char **argv)
             memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
             tab2[1] = tab1[fft_size_2].re;
 
-            r.rdft_calc(&r, tab2);
+            rdft_calc(r, tab2);
             fft_ref(tab_ref, tab1, fft_nbits);
             for (i = 0; i < fft_size; i++) {
                 tab[i].re = tab2[i];
@@ -407,7 +560,7 @@ int main(int argc, char **argv)
                 tab2[i]    = tab1[i].re;
                 tab1[i].im = 0;
             }
-            r.rdft_calc(&r, tab2);
+            rdft_calc(r, tab2);
             fft_ref(tab_ref, tab1, fft_nbits);
             tab_ref[0].im = tab_ref[fft_size_2].re;
             err = check_diff(&tab_ref->re, tab2, fft_size, 1.0);
@@ -418,7 +571,7 @@ int main(int argc, char **argv)
 #if CONFIG_DCT
     case TRANSFORM_DCT:
         memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
-        d.dct_calc(&d, &tab->re);
+        dct_calc(d, &tab->re);
         if (do_inverse)
             idct_ref(&tab_ref->re, &tab1->re, fft_nbits);
         else
@@ -444,22 +597,22 @@ int main(int argc, char **argv)
                 switch (transform) {
                 case TRANSFORM_MDCT:
                     if (do_inverse)
-                        m.imdct_calc(&m, &tab->re, &tab1->re);
+                        imdct_calc(m, &tab->re, &tab1->re);
                     else
-                        m.mdct_calc(&m, &tab->re, &tab1->re);
+                        mdct_calc(m, &tab->re, &tab1->re);
                     break;
                 case TRANSFORM_FFT:
                     memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
-                    s.fft_calc(&s, tab);
+                    fft_calc(s, tab);
                     break;
 #if FFT_FLOAT
                 case TRANSFORM_RDFT:
                     memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
-                    r.rdft_calc(&r, tab2);
+                    rdft_calc(r, tab2);
                     break;
                 case TRANSFORM_DCT:
                     memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
-                    d.dct_calc(&d, tab2);
+                    dct_calc(d, tab2);
                     break;
 #endif /* FFT_FLOAT */
                 }
@@ -479,23 +632,23 @@ int main(int argc, char **argv)
     switch (transform) {
 #if CONFIG_MDCT
     case TRANSFORM_MDCT:
-        ff_mdct_end(&m);
+        mdct_end(m);
         break;
 #endif /* CONFIG_MDCT */
     case TRANSFORM_FFT:
-        ff_fft_end(&s);
+        fft_end(s);
         break;
 #if FFT_FLOAT
-#if CONFIG_RDFT
+#    if CONFIG_RDFT
     case TRANSFORM_RDFT:
-        ff_rdft_end(&r);
+        rdft_end(r);
         break;
-#endif /* CONFIG_RDFT */
-#if CONFIG_DCT
+#    endif /* CONFIG_RDFT */
+#    if CONFIG_DCT
     case TRANSFORM_DCT:
-        ff_dct_end(&d);
+        dct_end(d);
         break;
-#endif /* CONFIG_DCT */
+#    endif /* CONFIG_DCT */
 #endif /* FFT_FLOAT */
     }
 
@@ -506,6 +659,16 @@ cleanup:
     av_free(tab_ref);
     av_free(exptab);
 
+#if !AVFFT
+    av_free(s);
+    av_free(m);
+#endif
+
+#if !AVFFT && FFT_FLOAT
+    av_free(r);
+    av_free(d);
+#endif
+
     if (err)
         printf("Error: %d.\n", err);
 
diff --git a/libavcodec/tests/golomb.c b/libavcodec/tests/golomb.c
index d8fff71..85b8a93 100644
--- a/libavcodec/tests/golomb.c
+++ b/libavcodec/tests/golomb.c
@@ -1,29 +1,32 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 #include <stdio.h>
 
+#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
-#include "libavcodec/bitstream.h"
-#include "libavcodec/put_bits.h"
+#include "libavcodec/get_bits.h"
 #include "libavcodec/golomb.h"
+#include "libavcodec/put_bits.h"
 
 #define COUNT 8191
 #define SIZE (COUNT * 4)
@@ -33,7 +36,7 @@ int main(void)
     int i, ret = 0;
     uint8_t *temp;
     PutBitContext pb;
-    BitstreamContext bc;
+    GetBitContext gb;
 
     temp = av_malloc(SIZE);
     if (!temp)
@@ -44,11 +47,11 @@ int main(void)
         set_ue_golomb(&pb, i);
     flush_put_bits(&pb);
 
-    bitstream_init8(&bc, temp, SIZE);
+    init_get_bits(&gb, temp, 8 * SIZE);
     for (i = 0; i < COUNT; i++) {
-        int j, s = bitstream_peek(&bc, 25);
+        int j, s = show_bits(&gb, 25);
 
-        j = get_ue_golomb(&bc);
+        j = get_ue_golomb(&gb);
         if (j != i) {
             fprintf(stderr, "get_ue_golomb: expected %d, got %d. bits: %7x\n",
                     i, j, s);
@@ -56,17 +59,17 @@ int main(void)
         }
     }
 
-#define EXTEND(i) (i << 3 | i & 7)
+#define EXTEND(i) ((i) << 3 | (i) & 7)
     init_put_bits(&pb, temp, SIZE);
     for (i = 0; i < COUNT; i++)
         set_ue_golomb(&pb, EXTEND(i));
     flush_put_bits(&pb);
 
-    bitstream_init8(&bc, temp, SIZE);
+    init_get_bits(&gb, temp, 8 * SIZE);
     for (i = 0; i < COUNT; i++) {
-        int j, s = bitstream_peek(&bc, 32);
+        int j, s = show_bits_long(&gb, 32);
 
-        j = get_ue_golomb_long(&bc);
+        j = get_ue_golomb_long(&gb);
         if (j != EXTEND(i)) {
             fprintf(stderr, "get_ue_golomb_long: expected %d, got %d. "
                     "bits: %8x\n", EXTEND(i), j, s);
@@ -74,16 +77,34 @@ int main(void)
         }
     }
 
+#define EXTEND_L(i) ((i) << 4 | (i) & 15)
+    init_put_bits(&pb, temp, SIZE);
+    for (i = 0; i < COUNT; i++)
+        set_ue_golomb_long(&pb, EXTEND_L(i));
+    flush_put_bits(&pb);
+
+    init_get_bits(&gb, temp, 8 * SIZE);
+    for (i = 0; i < COUNT; i++) {
+        int j, s = show_bits_long(&gb, 32);
+
+        j = get_ue_golomb_long(&gb);
+        if (j != EXTEND_L(i)) {
+            fprintf(stderr, "get_ue_golomb_long: expected %d, got %d. "
+                    "bits: %8x\n", EXTEND_L(i), j, s);
+            ret = 1;
+        }
+    }
+
     init_put_bits(&pb, temp, SIZE);
     for (i = 0; i < COUNT; i++)
         set_se_golomb(&pb, i - COUNT / 2);
     flush_put_bits(&pb);
 
-    bitstream_init8(&bc, temp, SIZE);
+    init_get_bits(&gb, temp, 8 * SIZE);
     for (i = 0; i < COUNT; i++) {
-        int j, s = bitstream_peek(&bc, 25);
+        int j, s = show_bits(&gb, 25);
 
-        j = get_se_golomb(&bc);
+        j = get_se_golomb(&gb);
         if (j != i - COUNT / 2) {
             fprintf(stderr, "get_se_golomb: expected %d, got %d. bits: %7x\n",
                     i - COUNT / 2, j, s);
diff --git a/libavcodec/tests/h264_levels.c b/libavcodec/tests/h264_levels.c
new file mode 100644
index 0000000..0e00f05
--- /dev/null
+++ b/libavcodec/tests/h264_levels.c
@@ -0,0 +1,183 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavcodec/h264_levels.h"
+
+static const struct {
+    int width;
+    int height;
+    int level_idc;
+} test_sizes[] = {
+    // First level usable at some standard sizes.
+    // (From H.264 table A-6.)
+    {  176,  144, 10 }, // QCIF
+    {  352,  288, 11 }, // CIF
+    {  640,  480, 22 }, // VGA
+    {  720,  480, 22 }, // NTSC
+    {  720,  576, 22 }, // PAL
+    {  800,  600, 31 }, // SVGA
+    { 1280,  720, 31 }, // 720p
+    { 1280, 1024, 32 }, // SXGA
+    { 1920, 1080, 40 }, // 1080p
+    { 2048, 1080, 42 }, // 2Kx1080
+    { 2048, 1536, 50 }, // 4XGA
+    { 3840, 2160, 51 }, // 4K
+    { 7680, 4320, 60 }, // 8K
+
+    // Overly wide or tall sizes.
+    {    1,  256, 10 },
+    {    1,  512, 11 },
+    {    1, 1024, 21 },
+    {    1, 1808, 22 },
+    {    1, 1824, 31 },
+    {  256,    1, 10 },
+    {  512,    1, 11 },
+    { 1024,    1, 21 },
+    { 1808,    1, 22 },
+    { 1824,    1, 31 },
+    {  512, 4096, 40 },
+    {  256, 4112, 42 },
+    { 8688, 1024, 51 },
+    { 8704,  512, 60 },
+    { 16880,   1, 60 },
+    { 16896,   1,  0 },
+};
+
+static const struct {
+    int width;
+    int height;
+    int dpb_size;
+    int level_idc;
+} test_dpb[] = {
+    // First level usable for some DPB sizes.
+    // (From H.264 table A-7.)
+    {  176,  144,  4, 10 },
+    {  176,  144,  8, 11 },
+    {  176,  144, 16, 12 },
+    { 1280,  720,  1, 31 },
+    { 1280,  720,  5, 31 },
+    { 1280,  720,  9, 40 },
+    { 1280,  720, 10, 50 },
+    { 1920, 1080,  1, 40 },
+    { 1920, 1080,  5, 50 },
+    { 1920, 1080, 13, 50 },
+    { 1920, 1080, 14, 51 },
+    { 3840, 2160,  5, 51 },
+    { 3840, 2160,  6, 60 },
+    { 3840, 2160, 16, 60 },
+    { 7680, 4320,  5, 60 },
+    { 7680, 4320,  6,  0 },
+};
+
+static const struct {
+    int64_t bitrate;
+    int profile_idc;
+    int level_idc;
+} test_bitrate[] = {
+    // Values where profile affects level at a given bitrate.
+    {   2500000,  77, 21 },
+    {   2500000, 100, 20 },
+    {   2500000, 244, 13 },
+    { 100000000,  77, 50 },
+    { 100000000, 100, 50 },
+    { 100000000, 244, 41 },
+    { 999999999,  77,  0 },
+    { 999999999, 100, 62 },
+    // Check level 1b.
+    {  32 * 1200,  66, 10 },
+    {  32 * 1500, 100, 10 },
+    {  96 * 1200,  66, 11 },
+    {  96 * 1500, 100,  9 },
+    { 144 * 1200,  66, 11 },
+    { 144 * 1500, 100, 11 },
+};
+
+static const struct {
+    const char *name;
+    int profile_idc;
+    int64_t bitrate;
+    int width;
+    int height;
+    int dpb_frames;
+    int level_idc;
+} test_all[] = {
+    { "Bluray 1080p 40Mb/s", 100, 40000000, 1920, 1080, 4, 41 },
+    { "Bluray 1080p 24Mb/s", 100, 24000000, 1920, 1080, 4, 40 },
+    { "Bluray 720p 40Mb/s",  100, 40000000, 1280,  720, 6, 41 },
+    { "Bluray 720p 24Mb/s",  100, 24000000, 1280,  720, 6, 40 },
+    { "Bluray PAL 40Mb/s",   100, 40000000,  720,  576, 6, 41 },
+    { "Bluray PAL 24Mb/s",   100, 24000000,  720,  576, 6, 32 },
+    { "Bluray PAL 16Mb/s",   100, 16800000,  720,  576, 6, 31 },
+    { "Bluray PAL 12Mb/s",   100, 12000000,  720,  576, 5, 30 },
+    { "Bluray NTSC 40Mb/s",  100, 40000000,  720,  480, 6, 41 },
+    { "Bluray NTSC 24Mb/s",  100, 24000000,  720,  480, 6, 32 },
+    { "Bluray NTSC 16Mb/s",  100, 16800000,  720,  480, 6, 31 },
+    { "Bluray NTSC 12Mb/s",  100, 12000000,  720,  480, 6, 30 },
+};
+
+int main(void)
+{
+    const H264LevelDescriptor *level;
+    int i;
+
+#define CHECK(expected, format, ...) do { \
+        if (expected ? (!level || level->level_idc != expected) \
+                     : !!level) { \
+            av_log(NULL, AV_LOG_ERROR, "Incorrect level for " \
+                   format ": expected %d, got %d.\n", __VA_ARGS__, \
+                   expected, level ? level->level_idc : -1); \
+            return 1; \
+        } \
+    } while (0)
+
+    for (i = 0; i < FF_ARRAY_ELEMS(test_sizes); i++) {
+        level = ff_h264_guess_level(0, 0, test_sizes[i].width,
+                                    test_sizes[i].height, 0);
+        CHECK(test_sizes[i].level_idc, "size %dx%d",
+              test_sizes[i].width, test_sizes[i].height);
+    }
+
+    for (i = 0; i < FF_ARRAY_ELEMS(test_dpb); i++) {
+        level = ff_h264_guess_level(0, 0, test_dpb[i].width,
+                                    test_dpb[i].height,
+                                    test_dpb[i].dpb_size);
+        CHECK(test_dpb[i].level_idc, "size %dx%d dpb %d",
+              test_dpb[i].width, test_dpb[i].height,
+              test_dpb[i].dpb_size);
+    }
+
+    for (i = 0; i < FF_ARRAY_ELEMS(test_bitrate); i++) {
+        level = ff_h264_guess_level(test_bitrate[i].profile_idc,
+                                    test_bitrate[i].bitrate,
+                                    0, 0, 0);
+        CHECK(test_bitrate[i].level_idc, "bitrate %"PRId64" profile %d",
+              test_bitrate[i].bitrate, test_bitrate[i].profile_idc);
+    }
+
+    for (i = 0; i < FF_ARRAY_ELEMS(test_all); i++) {
+        level = ff_h264_guess_level(test_all[i].profile_idc,
+                                    test_all[i].bitrate,
+                                    test_all[i].width,
+                                    test_all[i].height,
+                                    test_all[i].dpb_frames);
+        CHECK(test_all[i].level_idc, "%s", test_all[i].name);
+    }
+
+    return 0;
+}
diff --git a/libavcodec/tests/htmlsubtitles.c b/libavcodec/tests/htmlsubtitles.c
new file mode 100644
index 0000000..7c89ee9
--- /dev/null
+++ b/libavcodec/tests/htmlsubtitles.c
@@ -0,0 +1,51 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavcodec/htmlsubtitles.c"
+
+static const char * const test_cases[] = {
+    /* latin guillemets and other < > garbage */
+    "<<hello>>",                            // guillemets
+    "<<<b>hello</b>>>",                     // guillemets + tags
+    "< hello < 2000 > world >",             // unlikely tags due to spaces
+    "<h1>TITLE</h1>",                       // likely unhandled tags
+    "< font color=red >red</font>",         // invalid format of valid tag
+    "Foo <foo@bar.com>",                    // not a tag (not alnum)
+
+    "<b> foo <I> bar </B> bla </i>",        // broken nesting
+
+    "A<br>B<BR/>C<br  / >D<  Br >E<brk><brk/>", // misc line breaks
+};
+
+int main(void)
+{
+    int i;
+    AVBPrint dst;
+
+    av_bprint_init(&dst, 0, AV_BPRINT_SIZE_UNLIMITED);
+    for (i = 0; i < FF_ARRAY_ELEMS(test_cases); i++) {
+        int ret = ff_htmlmarkup_to_ass(NULL, &dst, test_cases[i]);
+        if (ret < 0)
+            return ret;
+        printf("%s --> %s\n", test_cases[i], dst.str);
+        av_bprint_clear(&dst);
+    }
+    av_bprint_finalize(&dst, NULL);
+    return 0;
+}
diff --git a/libavcodec/tests/iirfilter.c b/libavcodec/tests/iirfilter.c
index a6001a3..60cc6fc 100644
--- a/libavcodec/tests/iirfilter.c
+++ b/libavcodec/tests/iirfilter.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,7 +48,7 @@ int main(void)
     for (i = 0; i < SIZE; i++)
         printf("%6d %6d\n", x[i], y[i]);
 
-    ff_iir_filter_free_coeffs(fcoeffs);
-    ff_iir_filter_free_state(fstate);
+    ff_iir_filter_free_coeffsp(&fcoeffs);
+    ff_iir_filter_free_statep(&fstate);
     return 0;
 }
diff --git a/libavcodec/tests/imgconvert.c b/libavcodec/tests/imgconvert.c
new file mode 100644
index 0000000..aefc324
--- /dev/null
+++ b/libavcodec/tests/imgconvert.c
@@ -0,0 +1,46 @@
+/*
+ * Misc image conversion routines
+ * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/imgconvert.c"
+
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+int main(void){
+    int i;
+    int err=0;
+    int skip = 0;
+
+    for (i=0; i<AV_PIX_FMT_NB*2; i++) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(i);
+        if(!desc || !desc->name) {
+            skip ++;
+            continue;
+        }
+        if (skip) {
+            av_log(NULL, AV_LOG_INFO, "%3d unused pixel format values\n", skip);
+            skip = 0;
+        }
+        av_log(NULL, AV_LOG_INFO, "pix fmt %s yuv_plan:%d avg_bpp:%d\n", desc->name, is_yuv_planar(desc), av_get_padded_bits_per_pixel(desc));
+    }
+    return err;
+}
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif /* FF_API_AVPICTURE */
diff --git a/libavcodec/tests/jpeg2000dwt.c b/libavcodec/tests/jpeg2000dwt.c
new file mode 100644
index 0000000..80b33be
--- /dev/null
+++ b/libavcodec/tests/jpeg2000dwt.c
@@ -0,0 +1,141 @@
+/*
+ * Discrete wavelet transform
+ * Copyright (c) 2007 Kamil Nowosad
+ * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/jpeg2000dwt.c"
+
+#include "libavutil/lfg.h"
+
+#define MAX_W 256
+
+static int test_dwt(int *array, int *ref, int border[2][2], int decomp_levels, int type, int max_diff) {
+    int ret, j;
+    DWTContext s1={{{0}}}, *s= &s1;
+    int64_t err2 = 0;
+
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, type);
+    if (ret < 0) {
+        fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
+        return 1;
+    }
+    ret = ff_dwt_encode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    ret = ff_dwt_decode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    for (j = 0; j<MAX_W * MAX_W; j++) {
+        if (FFABS(array[j] - ref[j]) > max_diff) {
+            fprintf(stderr, "missmatch at %d (%d != %d) decomp:%d border %d %d %d %d\n",
+                    j, array[j], ref[j],decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1]);
+            return 2;
+        }
+        err2 += (array[j] - ref[j]) * (array[j] - ref[j]);
+        array[j] = ref[j];
+    }
+    ff_dwt_destroy(s);
+
+    printf("%s, decomp:%2d border %3d %3d %3d %3d milli-err2:%9"PRId64"\n",
+           type == FF_DWT53 ? "5/3i" : "9/7i",
+           decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1],
+           1000*err2 / ((border[0][1] - border[0][0])*(border[1][1] - border[1][0])));
+
+    return 0;
+}
+
+static int test_dwtf(float *array, float *ref, int border[2][2], int decomp_levels, float max_diff) {
+    int ret, j;
+    DWTContext s1={{{0}}}, *s= &s1;
+    double err2 = 0;
+
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, FF_DWT97);
+    if (ret < 0) {
+        fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
+        return 1;
+    }
+    ret = ff_dwt_encode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    ret = ff_dwt_decode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    for (j = 0; j<MAX_W * MAX_W; j++) {
+        if (FFABS(array[j] - ref[j]) > max_diff) {
+            fprintf(stderr, "missmatch at %d (%f != %f) decomp:%d border %d %d %d %d\n",
+                    j, array[j], ref[j],decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1]);
+            return 2;
+        }
+        err2 += (array[j] - ref[j]) * (array[j] - ref[j]);
+        array[j] = ref[j];
+    }
+    ff_dwt_destroy(s);
+
+    printf("9/7f, decomp:%2d border %3d %3d %3d %3d err2:%20.3f\n",
+           decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1],
+           err2 / ((border[0][1] - border[0][0])*(border[1][1] - border[1][0])));
+
+    return 0;
+}
+
+static int array[MAX_W * MAX_W];
+static int ref  [MAX_W * MAX_W];
+static float arrayf[MAX_W * MAX_W];
+static float reff  [MAX_W * MAX_W];
+
+int main(void) {
+    AVLFG prng;
+    int i,j;
+    int border[2][2];
+    int ret, decomp_levels;
+
+    av_lfg_init(&prng, 1);
+
+    for (i = 0; i<MAX_W * MAX_W; i++)
+        arrayf[i] = reff[i] = array[i] = ref[i] =  av_lfg_get(&prng) % 2048;
+
+    for (i = 0; i < 100; i++) {
+        for (j=0; j<4; j++)
+            border[j>>1][j&1] = av_lfg_get(&prng) % MAX_W;
+        if (border[0][0] >= border[0][1] || border[1][0] >= border[1][1])
+            continue;
+        decomp_levels = av_lfg_get(&prng) % FF_DWT_MAX_DECLVLS;
+
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT53, 0);
+        if (ret)
+            return ret;
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT97_INT, FFMIN(7+5*decomp_levels, 15+3*decomp_levels));
+        if (ret)
+            return ret;
+        ret = test_dwtf(arrayf, reff, border, decomp_levels, 0.05);
+        if (ret)
+            return ret;
+    }
+
+    return 0;
+}
diff --git a/libavcodec/tests/mathops.c b/libavcodec/tests/mathops.c
new file mode 100644
index 0000000..33a059c
--- /dev/null
+++ b/libavcodec/tests/mathops.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/mathops.h"
+
+#include <stdlib.h>
+
+int main(void)
+{
+    unsigned u;
+
+    for(u=0; u<65536; u++) {
+        unsigned s = u*u;
+        unsigned root = ff_sqrt(s);
+        unsigned root_m1 = ff_sqrt(s-1);
+        if (s && root != u) {
+            fprintf(stderr, "ff_sqrt failed at %u with %u\n", s, root);
+            return 1;
+        }
+        if (u && root_m1 != u - 1) {
+            fprintf(stderr, "ff_sqrt failed at %u with %u\n", s, root);
+            return 1;
+        }
+    }
+    return 0;
+}
diff --git a/libavcodec/tests/mjpegenc_huffman.c b/libavcodec/tests/mjpegenc_huffman.c
new file mode 100644
index 0000000..2ed92d0
--- /dev/null
+++ b/libavcodec/tests/mjpegenc_huffman.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2016 William Ma, Sofia Kim, Dustin Woo
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Optimal Huffman Encoding tests.
+ */
+
+#include "libavcodec/avcodec.h"
+#include <stdlib.h>
+#include "libavcodec/mjpegenc.h"
+#include "libavcodec/mjpegenc_huffman.h"
+#include "libavcodec/mjpegenc_common.h"
+#include "libavcodec/mpegvideo.h"
+
+// Validate the computed lengths satisfy the JPEG restrictions and is optimal.
+static int check_lengths(int L, int expected_length,
+                         const int *probs, int nprobs)
+{
+    HuffTable lengths[256];
+    PTable val_counts[256];
+    int actual_length = 0, i, j, k, prob, length;
+    int ret = 0;
+    double cantor_measure = 0;
+    av_assert0(nprobs <= 256);
+
+    for (i = 0; i < nprobs; i++) {
+        val_counts[i] = (PTable){.value = i, .prob = probs[i]};
+    }
+
+    ff_mjpegenc_huffman_compute_bits(val_counts, lengths, nprobs, L);
+
+    for (i = 0; i < nprobs; i++) {
+        // Find the value's prob and length
+        for (j = 0; j < nprobs; j++)
+            if (val_counts[j].value == i) break;
+        for (k = 0; k < nprobs; k++)
+            if (lengths[k].code == i) break;
+        if (!(j < nprobs && k < nprobs)) return 1;
+        prob = val_counts[j].prob;
+        length = lengths[k].length;
+
+        if (prob) {
+            actual_length += prob * length;
+            cantor_measure += 1. / (1 << length);
+        }
+
+        if (length > L || length < 1) return 1;
+    }
+    // Check that the codes can be prefix-free.
+    if (cantor_measure > 1) ret = 1;
+    // Check that the total length is optimal
+    if (actual_length != expected_length) ret = 1;
+
+    if (ret == 1) {
+      fprintf(stderr,
+              "Cantor measure: %f\n"
+              "Actual length: %d\n"
+              "Expected length: %d\n",
+              cantor_measure, actual_length, expected_length);
+    }
+
+    return ret;
+}
+
+static const int probs_zeroes[] = {
+    6, 6, 0, 0, 0
+};
+
+static const int probs_skewed[] = {
+    2, 0, 0, 0, 0, 1, 0, 0, 20, 0, 2, 0, 10, 5, 1, 1, 9, 1, 1, 6, 0, 5, 0, 1, 0, 7, 6,
+    1, 1, 5, 0, 0, 0, 0, 11, 0, 0, 0, 51, 1, 0, 20, 0, 1, 0, 0, 0, 0, 6, 106, 1, 0, 1,
+    0, 2, 1, 16, 0, 0, 5, 0, 0, 0, 4, 3, 15, 4, 4, 0, 0, 0, 3, 0, 0, 1, 0, 3, 0, 3, 2,
+    2, 0, 0, 4, 3, 40, 1, 2, 0, 22, 0, 0, 0, 9, 0, 0, 0, 0, 1, 1, 0, 1, 6, 11, 4, 10,
+    28, 6, 1, 0, 0, 9, 9, 4, 0, 0, 0, 0, 8, 33844, 2, 0, 2, 1, 1, 5, 0, 0, 1, 9, 1, 0,
+    4, 14, 4, 0, 0, 3, 8, 0, 51, 9, 6, 1, 1, 2, 2, 3, 1, 5, 5, 29, 0, 0, 0, 0, 14, 29,
+    6, 4, 13, 12, 2, 3, 1, 0, 5, 4, 1, 1, 0, 0, 29, 1, 0, 0, 0, 0, 4, 0, 0, 1, 0, 1,
+    7, 0, 42, 0, 0, 0, 0, 0, 2, 0, 3, 9, 0, 0, 0, 2, 1, 0, 0, 6, 5, 6, 1, 2, 3, 0, 0,
+    0, 3, 0, 0, 28, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 23, 0, 0, 0, 0,
+    0, 21, 1, 0, 3, 24, 2, 0, 0, 7, 0, 0, 1, 5, 1, 2, 0, 5
+};
+
+static const int probs_sat[] = {
+    74, 8, 14, 7, 9345, 40, 0, 2014, 2, 1, 115, 0, 2, 1, 194, 388, 20, 0, 0, 2, 1, 121,
+    1, 1583, 0, 16, 21, 2, 132, 2, 15, 9, 13, 1, 0, 2293, 2, 8, 5, 2, 30, 0, 0, 4, 54,
+    783, 4, 1, 2, 4, 0, 22, 93, 1, 143, 19, 0, 36, 32, 4, 6, 33, 3, 45, 0, 8, 1, 0, 18,
+    17, 1, 0, 1, 0, 0, 1, 1004, 38, 3, 8, 90, 23, 0, 2819, 3, 0, 970, 158, 9, 6, 4, 48,
+    4, 0, 1, 0, 0, 60, 3, 62, 0, 2, 2, 2, 279, 66, 16, 1, 20, 0, 7, 9, 32, 1411, 6, 3,
+    27, 1, 5, 49, 0, 0, 0, 0, 0, 2, 10, 1, 1, 2, 3, 801, 3, 25, 5, 1, 1, 0, 632, 0, 14,
+    18, 5, 8, 200, 4, 4, 22, 12, 0, 4, 1, 0, 2, 4, 9, 3, 16, 7, 2, 2, 213, 0, 2, 620,
+    39303, 0, 1, 0, 2, 1, 183781, 1, 0, 0, 0, 94, 7, 3, 4, 0, 4, 306, 43, 352, 76, 34,
+    13, 11, 0, 51, 1, 13, 19, 0, 26, 0, 7276, 4, 207, 31, 1, 2, 4, 6, 19, 8, 17, 4, 6,
+    0, 1085, 0, 0, 0, 3, 489, 36, 1, 0, 1, 9420, 294, 28, 0, 57, 5, 0, 9, 2, 0, 1, 2,
+    2, 0, 0, 9, 2, 29, 2, 2, 7, 0, 5, 490, 0, 7, 5, 0, 1, 8, 0, 0, 23255, 0, 1
+};
+
+// Test the example given on @see
+// http://guru.multimedia.cx/small-tasks-for-ffmpeg/
+int main(int argc, char **argv)
+{
+    int i, ret = 0;
+    // Probabilities of symbols 0..4
+    PTable val_counts[] = {
+        {.value = 0, .prob = 1},
+        {.value = 1, .prob = 2},
+        {.value = 2, .prob = 5},
+        {.value = 3, .prob = 10},
+        {.value = 4, .prob = 21},
+    };
+    // Expected code lengths for each symbol
+    static const HuffTable expected[] = {
+        {.code = 0, .length = 3},
+        {.code = 1, .length = 3},
+        {.code = 2, .length = 3},
+        {.code = 3, .length = 3},
+        {.code = 4, .length = 1},
+    };
+    // Actual code lengths
+    HuffTable distincts[5];
+
+    // Build optimal huffman tree using an internal function, to allow for
+    // smaller-than-normal test cases. This mutates val_counts by sorting.
+    ff_mjpegenc_huffman_compute_bits(val_counts, distincts,
+                                     FF_ARRAY_ELEMS(distincts), 3);
+
+    for (i = 0; i < FF_ARRAY_ELEMS(distincts); i++) {
+        if (distincts[i].code != expected[i].code ||
+            distincts[i].length != expected[i].length) {
+            fprintf(stderr,
+                    "Built huffman does not equal expectations. "
+                    "Expected: code %d probability %d, "
+                    "Actual: code %d probability %d\n",
+                    expected[i].code, expected[i].length,
+                    distincts[i].code, distincts[i].length);
+            ret = 1;
+        }
+    }
+
+    // Check handling of zero probabilities
+    if (check_lengths(16, 18, probs_zeroes, FF_ARRAY_ELEMS(probs_zeroes)))
+        ret = 1;
+    // Check skewed distribution over 256 without saturated lengths
+    if (check_lengths(16, 41282, probs_skewed, FF_ARRAY_ELEMS(probs_skewed)))
+        ret = 1;
+    // Check skewed distribution over 256 with saturated lengths
+    if (check_lengths(16, 669904, probs_sat, FF_ARRAY_ELEMS(probs_sat)))
+        ret = 1;
+
+    return ret;
+}
diff --git a/libavcodec/tests/motion.c b/libavcodec/tests/motion.c
new file mode 100644
index 0000000..d89f940
--- /dev/null
+++ b/libavcodec/tests/motion.c
@@ -0,0 +1,152 @@
+/*
+ * (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * motion test.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "config.h"
+#include "libavcodec/me_cmp.h"
+#include "libavutil/internal.h"
+#include "libavutil/lfg.h"
+#include "libavutil/mem.h"
+#include "libavutil/time.h"
+
+#undef printf
+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];
+
+static void fill_random(uint8_t *tab, int size)
+{
+    int i;
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1);
+    for(i=0;i<size;i++) {
+        tab[i] = av_lfg_get(&prng) % 256;
+    }
+}
+
+static void help(void)
+{
+    printf("motion-test [-h]\n"
+           "test motion implementations\n");
+}
+
+#define NB_ITS 500
+
+int dummy;
+
+static void test_motion(const char *name,
+                 me_cmp_func test_func, me_cmp_func ref_func)
+{
+    int x, y, d1, d2, it;
+    uint8_t *ptr;
+    int64_t ti;
+    printf("testing '%s'\n", name);
+
+    /* test correctness */
+    for(it=0;it<20;it++) {
+
+        fill_random(img1, WIDTH * HEIGHT);
+        fill_random(img2, WIDTH * HEIGHT);
+
+        for(y=0;y<HEIGHT-17;y++) {
+            for(x=0;x<WIDTH-17;x++) {
+                ptr = img2 + y * WIDTH + x;
+                d1 = test_func(NULL, img1, ptr, WIDTH, 8);
+                d2 = ref_func(NULL, img1, ptr, WIDTH, 8);
+                if (d1 != d2) {
+                    printf("error: mmx=%d c=%d\n", d1, d2);
+                }
+            }
+        }
+    }
+    emms_c();
+
+    /* speed test */
+    ti = av_gettime_relative();
+    d1 = 0;
+    for(it=0;it<NB_ITS;it++) {
+        for(y=0;y<HEIGHT-17;y++) {
+            for(x=0;x<WIDTH-17;x++) {
+                ptr = img2 + y * WIDTH + x;
+                d1 += test_func(NULL, img1, ptr, WIDTH, 8);
+            }
+        }
+    }
+    emms_c();
+    dummy = d1; /* avoid optimization */
+    ti = av_gettime_relative() - ti;
+
+    printf("  %0.0f kop/s\n",
+           (double)NB_ITS * (WIDTH - 16) * (HEIGHT - 16) /
+           (double)(ti / 1000.0));
+}
+
+
+int main(int argc, char **argv)
+{
+    AVCodecContext *ctx;
+    int c;
+    MECmpContext cctx, mmxctx;
+    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT };
+    int flags_size = HAVE_MMXEXT ? 2 : 1;
+
+    if (argc > 1) {
+        help();
+        return 1;
+    }
+
+    printf("ffmpeg motion test\n");
+
+    ctx = avcodec_alloc_context3(NULL);
+    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+    av_force_cpu_flags(0);
+    memset(&cctx, 0, sizeof(cctx));
+    ff_me_cmp_init(&cctx, ctx);
+    for (c = 0; c < flags_size; c++) {
+        int x;
+        av_force_cpu_flags(flags[c]);
+        memset(&mmxctx, 0, sizeof(mmxctx));
+        ff_me_cmp_init(&mmxctx, ctx);
+
+        for (x = 0; x < 2; x++) {
+            printf("%s for %dx%d pixels\n", c ? "mmx2" : "mmx",
+                   x ? 8 : 16, x ? 8 : 16);
+            test_motion("mmx",     mmxctx.pix_abs[x][0], cctx.pix_abs[x][0]);
+            test_motion("mmx_x2",  mmxctx.pix_abs[x][1], cctx.pix_abs[x][1]);
+            test_motion("mmx_y2",  mmxctx.pix_abs[x][2], cctx.pix_abs[x][2]);
+            test_motion("mmx_xy2", mmxctx.pix_abs[x][3], cctx.pix_abs[x][3]);
+        }
+    }
+    av_free(ctx);
+
+    return 0;
+}
diff --git a/libavcodec/tests/mpeg12framerate.c b/libavcodec/tests/mpeg12framerate.c
index c493cd0..595bdb2 100644
--- a/libavcodec/tests/mpeg12framerate.c
+++ b/libavcodec/tests/mpeg12framerate.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tests/options.c b/libavcodec/tests/options.c
new file mode 100644
index 0000000..2e19a6e
--- /dev/null
+++ b/libavcodec/tests/options.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/options.c"
+
+static int dummy_init(AVCodecContext *ctx)
+{
+    //TODO: this code should set every possible pointer that could be set by codec and is not an option;
+    ctx->extradata_size = 8;
+    ctx->extradata = av_malloc(ctx->extradata_size);
+    return 0;
+}
+
+static int dummy_close(AVCodecContext *ctx)
+{
+    av_freep(&ctx->extradata);
+    ctx->extradata_size = 0;
+    return 0;
+}
+
+static int dummy_encode(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame, int *got_packet)
+{
+    return AVERROR(ENOSYS);
+}
+
+typedef struct Dummy12Context {
+    AVClass  *av_class;
+    int      num;
+    char*    str;
+} Dummy12Context;
+
+typedef struct Dummy3Context {
+    void     *fake_av_class;
+    int      num;
+    char*    str;
+} Dummy3Context;
+
+#define OFFSET(x) offsetof(Dummy12Context, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption dummy_options[] = {
+    { "str", "set str", OFFSET(str), AV_OPT_TYPE_STRING, { .str = "i'm src default value" }, 0, 0, VE},
+    { "num", "set num", OFFSET(num), AV_OPT_TYPE_INT,    { .i64 = 1500100900 },    0, INT_MAX, VE},
+    { NULL },
+};
+
+static const AVClass dummy_v1_class = {
+    .class_name = "dummy_v1_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass dummy_v2_class = {
+    .class_name = "dummy_v2_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* codec with options */
+static AVCodec dummy_v1_encoder = {
+    .name             = "dummy_v1_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 1,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v1_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with options, different class */
+static AVCodec dummy_v2_encoder = {
+    .name             = "dummy_v2_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 2,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v2_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with priv data, but no class */
+static AVCodec dummy_v3_encoder = {
+    .name             = "dummy_v3_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 3,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_data_size   = sizeof(Dummy3Context),
+};
+
+/* codec without priv data */
+static AVCodec dummy_v4_encoder = {
+    .name             = "dummy_v4_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 4,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+};
+
+static void test_copy_print_codec(const AVCodecContext *ctx)
+{
+    printf("%-14s: %dx%d prv: %s",
+           ctx->codec ? ctx->codec->name : "NULL",
+           ctx->width, ctx->height,
+           ctx->priv_data ? "set" : "null");
+    if (ctx->codec && ctx->codec->priv_class && ctx->codec->priv_data_size) {
+        int64_t i64;
+        char *str = NULL;
+        av_opt_get_int(ctx->priv_data, "num", 0, &i64);
+        av_opt_get(ctx->priv_data, "str", 0, (uint8_t**)&str);
+        printf(" opts: %"PRId64" %s", i64, str);
+        av_free(str);
+    }
+    printf("\n");
+}
+
+static void test_copy(const AVCodec *c1, const AVCodec *c2)
+{
+    AVCodecContext *ctx1, *ctx2;
+    printf("%s -> %s\nclosed:\n", c1 ? c1->name : "NULL", c2 ? c2->name : "NULL");
+    ctx1 = avcodec_alloc_context3(c1);
+    ctx2 = avcodec_alloc_context3(c2);
+    ctx1->width = ctx1->height = 128;
+    ctx1->time_base = (AVRational){12,34};
+    if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+        av_opt_set(ctx2->priv_data, "num", "667", 0);
+        av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+    }
+    avcodec_copy_context(ctx2, ctx1);
+    test_copy_print_codec(ctx1);
+    test_copy_print_codec(ctx2);
+    if (ctx1->codec) {
+        int ret;
+        printf("opened:\n");
+        ret = avcodec_open2(ctx1, ctx1->codec, NULL);
+        if (ret < 0) {
+            fprintf(stderr, "avcodec_open2 failed\n");
+            exit(1);
+        }
+        if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+            av_opt_set(ctx2->priv_data, "num", "667", 0);
+            av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+        }
+        avcodec_copy_context(ctx2, ctx1);
+        test_copy_print_codec(ctx1);
+        test_copy_print_codec(ctx2);
+        avcodec_close(ctx1);
+    }
+    avcodec_free_context(&ctx1);
+    avcodec_free_context(&ctx2);
+}
+
+int main(void)
+{
+    AVCodec *dummy_codec[] = {
+        &dummy_v1_encoder,
+        &dummy_v2_encoder,
+        &dummy_v3_encoder,
+        &dummy_v4_encoder,
+        NULL,
+    };
+    int i, j;
+
+    for (i = 0; dummy_codec[i]; i++)
+        avcodec_register(dummy_codec[i]);
+
+    printf("testing avcodec_copy_context()\n");
+    for (i = 0; i < FF_ARRAY_ELEMS(dummy_codec); i++)
+        for (j = 0; j < FF_ARRAY_ELEMS(dummy_codec); j++)
+            test_copy(dummy_codec[i], dummy_codec[j]);
+    return 0;
+}
diff --git a/libavcodec/tests/ppc/dct.c b/libavcodec/tests/ppc/dct.c
index 3d160d3..d95db52 100644
--- a/libavcodec/tests/ppc/dct.c
+++ b/libavcodec/tests/ppc/dct.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,7 +21,7 @@
 #include "libavcodec/ppc/fdct.h"
 
 static const struct algo fdct_tab_arch[] = {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     { "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
 #endif
     { 0 }
diff --git a/libavcodec/tests/rangecoder.c b/libavcodec/tests/rangecoder.c
index 26bb589..d6cf9ec 100644
--- a/libavcodec/tests/rangecoder.c
+++ b/libavcodec/tests/rangecoder.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,41 +24,53 @@
 
 #include "libavcodec/rangecoder.h"
 
-#define SIZE 10240
+#define SIZE 1240
 
 int main(void)
 {
     RangeCoder c;
-    uint8_t b[9 * SIZE];
+    uint8_t b[9 * SIZE] = {0};
     uint8_t r[9 * SIZE];
-    int i;
+    int i, p, actual_length, version;
     uint8_t state[10];
     AVLFG prng;
 
     av_lfg_init(&prng, 1);
+    for (version = 0; version < 2; version++) {
+        for (p = 0; p< 1024; p++) {
+            ff_init_range_encoder(&c, b, SIZE);
+            ff_build_rac_states(&c, (1LL << 32) / 20, 128 + 64 + 32 + 16);
 
-    ff_init_range_encoder(&c, b, SIZE);
-    ff_build_rac_states(&c, 0.05 * (1LL << 32), 128 + 64 + 32 + 16);
+            memset(state, 128, sizeof(state));
 
-    memset(state, 128, sizeof(state));
+            for (i = 0; i < SIZE; i++)
+                r[i] = av_lfg_get(&prng) % 7;
 
-    for (i = 0; i < SIZE; i++)
-        r[i] = av_lfg_get(&prng) % 7;
+            for (i = 0; i < SIZE; i++)
+                put_rac(&c, state, r[i] & 1);
 
-    for (i = 0; i < SIZE; i++)
-        put_rac(&c, state, r[i] & 1);
+            actual_length = ff_rac_terminate(&c, version);
 
-    ff_rac_terminate(&c);
+            ff_init_range_decoder(&c, b, version ? SIZE : actual_length);
 
-    ff_init_range_decoder(&c, b, SIZE);
+            memset(state, 128, sizeof(state));
 
-    memset(state, 128, sizeof(state));
+            for (i = 0; i < SIZE; i++)
+                if ((r[i] & 1) != get_rac(&c, state)) {
+                    av_log(NULL, AV_LOG_ERROR, "rac failure at %d pass %d version %d\n", i, p, version);
+                    return 1;
+                }
 
-    for (i = 0; i < SIZE; i++)
-        if ((r[i] & 1) != get_rac(&c, state)) {
-            av_log(NULL, AV_LOG_ERROR, "rac failure at %d\n", i);
-            return 1;
+            if (ff_rac_check_termination(&c, version) < 0) {
+                av_log(NULL, AV_LOG_ERROR, "rac failure at termination pass %d version %d\n", p, version);
+                return 1;
+            }
+            if (c.bytestream - c.bytestream_start - actual_length != version) {
+                av_log(NULL, AV_LOG_ERROR, "rac failure at pass %d version %d\n", p, version);
+                return 1;
+            }
         }
+    }
 
     return 0;
 }
diff --git a/libavcodec/tests/snowenc.c b/libavcodec/tests/snowenc.c
new file mode 100644
index 0000000..d5f94e8
--- /dev/null
+++ b/libavcodec/tests/snowenc.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/snowenc.c"
+
+#undef malloc
+#undef free
+#undef printf
+
+#include "libavutil/lfg.h"
+#include "libavutil/mathematics.h"
+
+int main(void){
+#define width  256
+#define height 256
+    int buffer[2][width*height];
+    SnowContext s;
+    int i;
+    AVLFG prng;
+    s.spatial_decomposition_count=6;
+    s.spatial_decomposition_type=1;
+
+    s.temp_dwt_buffer  = av_mallocz_array(width, sizeof(DWTELEM));
+    s.temp_idwt_buffer = av_mallocz_array(width, sizeof(IDWTELEM));
+
+    if (!s.temp_dwt_buffer || !s.temp_idwt_buffer) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        return 1;
+    }
+
+    av_lfg_init(&prng, 1);
+
+    printf("testing 5/3 DWT\n");
+    for(i=0; i<width*height; i++)
+        buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
+
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+
+    for(i=0; i<width*height; i++)
+        if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
+
+    printf("testing 9/7 DWT\n");
+    s.spatial_decomposition_type=0;
+    for(i=0; i<width*height; i++)
+        buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
+
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+
+    for(i=0; i<width*height; i++)
+        if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
+
+    {
+    int level, orientation, x, y;
+    int64_t errors[8][4];
+    int64_t g=0;
+
+        memset(errors, 0, sizeof(errors));
+        s.spatial_decomposition_count=3;
+        s.spatial_decomposition_type=0;
+        for(level=0; level<s.spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                int w= width  >> (s.spatial_decomposition_count-level);
+                int h= height >> (s.spatial_decomposition_count-level);
+                int stride= width  << (s.spatial_decomposition_count-level);
+                DWTELEM *buf= buffer[0];
+                int64_t error=0;
+
+                if(orientation&1) buf+=w;
+                if(orientation>1) buf+=stride>>1;
+
+                memset(buffer[0], 0, sizeof(int)*width*height);
+                buf[w/2 + h/2*stride]= 256*256;
+                ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+                for(y=0; y<height; y++){
+                    for(x=0; x<width; x++){
+                        int64_t d= buffer[0][x + y*width];
+                        error += d*d;
+                        if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
+                    }
+                    if(FFABS(height/2-y)<9 && level==2) printf("\n");
+                }
+                error= (int)(sqrt(error)+0.5);
+                errors[level][orientation]= error;
+                if(g) g=av_gcd(g, error);
+                else g= error;
+            }
+        }
+        printf("static int const visual_weight[][4]={\n");
+        for(level=0; level<s.spatial_decomposition_count; level++){
+            printf("  {");
+            for(orientation=0; orientation<4; orientation++){
+                printf("%8"PRId64",", errors[level][orientation]/g);
+            }
+            printf("},\n");
+        }
+        printf("};\n");
+        {
+            int level=2;
+            int w= width  >> (s.spatial_decomposition_count-level);
+            //int h= height >> (s.spatial_decomposition_count-level);
+            int stride= width  << (s.spatial_decomposition_count-level);
+            DWTELEM *buf= buffer[0];
+            int64_t error=0;
+
+            buf+=w;
+            buf+=stride>>1;
+
+            memset(buffer[0], 0, sizeof(int)*width*height);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int tab[4]={0,2,3,1};
+                    buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
+                }
+            }
+            ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= buffer[0][x + y*width];
+                    error += d*d;
+                    if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
+                }
+                if(FFABS(height/2-y)<9) printf("\n");
+            }
+        }
+
+    }
+    return 0;
+}
diff --git a/libavcodec/tests/utils.c b/libavcodec/tests/utils.c
new file mode 100644
index 0000000..f6ba7fe
--- /dev/null
+++ b/libavcodec/tests/utils.c
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+
+int main(void){
+    AVCodec *codec = NULL;
+    int ret = 0;
+
+    while (codec = av_codec_next(codec)) {
+        if (av_codec_is_encoder(codec)) {
+            if (codec->type == AVMEDIA_TYPE_AUDIO) {
+                if (!codec->sample_fmts) {
+                    av_log(NULL, AV_LOG_FATAL, "Encoder %s is missing the sample_fmts field\n", codec->name);
+                    ret = 1;
+                }
+            }
+        }
+    }
+    return ret;
+}
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 2ddb555..afb03b1 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,37 @@
 #include "libavcodec/x86/xvididct.h"
 #include "libavcodec/x86/simple_idct.h"
 
+#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_X86ASM
+void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
+                                int16_t *block, int16_t *qmat);
+
+#define PR_WRAP(INSN) \
+static void ff_prores_idct_put_10_##INSN##_wrap(int16_t *dst){ \
+    LOCAL_ALIGNED(16, int16_t, qmat, [64]); \
+    LOCAL_ALIGNED(16, int16_t, tmp, [64]); \
+    int i; \
+ \
+    for(i=0; i<64; i++){ \
+        qmat[i]=4; \
+        tmp[i]= dst[i]; \
+    } \
+    ff_prores_idct_put_10_##INSN (dst, 16, tmp, qmat); \
+ \
+    for(i=0; i<64; i++) { \
+         dst[i] -= 512; \
+    } \
+}
+
+PR_WRAP(sse2)
+
+# if HAVE_AVX_EXTERNAL
+void ff_prores_idct_put_10_avx(uint16_t *dst, int linesize,
+                               int16_t *block, int16_t *qmat);
+PR_WRAP(avx)
+# endif
+
+#endif
+
 static const struct algo fdct_tab_arch[] = {
 #if HAVE_MMX_INLINE
     { "MMX",    ff_fdct_mmx,    FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
@@ -36,24 +67,42 @@ static const struct algo fdct_tab_arch[] = {
 };
 
 static const struct algo idct_tab_arch[] = {
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
     { "SIMPLE-MMX",  ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX },
 #endif
-#if CONFIG_MPEG4_DECODER
-#if HAVE_MMX_INLINE
+#if CONFIG_MPEG4_DECODER && HAVE_X86ASM
+#if ARCH_X86_32
     { "XVID-MMX",    ff_xvid_idct_mmx,    FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMX,    1 },
-#endif
-#if HAVE_MMXEXT_INLINE
     { "XVID-MMXEXT", ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMXEXT, 1 },
 #endif
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
     { "XVID-SSE2",   ff_xvid_idct_sse2,   FF_IDCT_PERM_SSE2,   AV_CPU_FLAG_SSE2,   1 },
 #endif
-#endif /* CONFIG_MPEG4_DECODER */
+#endif /* CONFIG_MPEG4_DECODER && HAVE_X86ASM */
+#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_X86ASM
+    { "PR-SSE2",     ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
+# if HAVE_AVX_EXTERNAL
+    { "PR-AVX",      ff_prores_idct_put_10_avx_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
+# endif
+#endif
+#if HAVE_X86ASM
+#if ARCH_X86_64
+#if HAVE_SSE2_EXTERNAL
+    { "SIMPLE8-SSE2",   ff_simple_idct8_sse2,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
+    { "SIMPLE10-SSE2",  ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
+    { "SIMPLE12-SSE2",  ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
+#endif
+#if HAVE_AVX_EXTERNAL
+    { "SIMPLE8-AVX",    ff_simple_idct8_avx,   FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
+    { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
+    { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX,  1 },
+#endif
+#endif
+#endif
     { 0 }
 };
 
-static short idct_simple_mmx_perm[64] = {
+static const uint8_t idct_simple_mmx_perm[64] = {
     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
diff --git a/libavcodec/textdec.c b/libavcodec/textdec.c
new file mode 100644
index 0000000..964da72
--- /dev/null
+++ b/libavcodec/textdec.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Raw subtitles decoder
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *class;
+    const char *linebreaks;
+    int keep_ass_markup;
+    int readorder;
+} TextContext;
+
+#define OFFSET(x) offsetof(TextContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "keep_ass_markup", "Set if ASS tags must be escaped", OFFSET(keep_ass_markup), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, .flags=SD },
+    { NULL }
+};
+
+static int text_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVBPrint buf;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    TextContext *text = avctx->priv_data;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && *ptr) {
+        ff_ass_bprint_text_event(&buf, ptr, avpkt->size, text->linebreaks, text->keep_ass_markup);
+        ret = ff_ass_add_rect(sub, buf.str, text->readorder++, 0, NULL, NULL);
+    }
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static void text_flush(AVCodecContext *avctx)
+{
+    TextContext *text = avctx->priv_data;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        text->readorder = 0;
+}
+
+#define DECLARE_CLASS(decname) static const AVClass decname ## _decoder_class = {   \
+    .class_name = #decname " decoder",      \
+    .item_name  = av_default_item_name,     \
+    .option     = decname ## _options,      \
+    .version    = LIBAVUTIL_VERSION_INT,    \
+}
+
+#if CONFIG_TEXT_DECODER
+#define text_options options
+DECLARE_CLASS(text);
+
+AVCodec ff_text_decoder = {
+    .name           = "text",
+    .long_name      = NULL_IF_CONFIG_SMALL("Raw text subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_TEXT,
+    .decode         = text_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .priv_class     = &text_decoder_class,
+    .flush          = text_flush,
+};
+#endif
+
+#if CONFIG_VPLAYER_DECODER || CONFIG_PJS_DECODER || CONFIG_SUBVIEWER1_DECODER || CONFIG_STL_DECODER
+
+static int linebreak_init(AVCodecContext *avctx)
+{
+    TextContext *text = avctx->priv_data;
+    text->linebreaks = "|";
+    return ff_ass_subtitle_header_default(avctx);
+}
+
+#if CONFIG_VPLAYER_DECODER
+#define vplayer_options options
+DECLARE_CLASS(vplayer);
+
+AVCodec ff_vplayer_decoder = {
+    .name           = "vplayer",
+    .long_name      = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_VPLAYER,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &vplayer_decoder_class,
+    .flush          = text_flush,
+};
+#endif
+
+#if CONFIG_STL_DECODER
+#define stl_options options
+DECLARE_CLASS(stl);
+
+AVCodec ff_stl_decoder = {
+    .name           = "stl",
+    .long_name      = NULL_IF_CONFIG_SMALL("Spruce subtitle format"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_STL,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &stl_decoder_class,
+    .flush          = text_flush,
+};
+#endif
+
+#if CONFIG_PJS_DECODER
+#define pjs_options options
+DECLARE_CLASS(pjs);
+
+AVCodec ff_pjs_decoder = {
+    .name           = "pjs",
+    .long_name      = NULL_IF_CONFIG_SMALL("PJS subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_PJS,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &pjs_decoder_class,
+    .flush          = text_flush,
+};
+#endif
+
+#if CONFIG_SUBVIEWER1_DECODER
+#define subviewer1_options options
+DECLARE_CLASS(subviewer1);
+
+AVCodec ff_subviewer1_decoder = {
+    .name           = "subviewer1",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubViewer1 subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBVIEWER1,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &subviewer1_decoder_class,
+    .flush          = text_flush,
+};
+#endif
+
+#endif /* text subtitles with '|' line break */
diff --git a/libavcodec/texturedsp.c b/libavcodec/texturedsp.c
index 7b54a5d..b7dd8ba 100644
--- a/libavcodec/texturedsp.c
+++ b/libavcodec/texturedsp.c
@@ -28,13 +28,14 @@
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/libm.h"
 
 #include "texturedsp.h"
 
-#define RGBA(r, g, b, a) ((uint8_t)(r) <<  0) | \
-                         ((uint8_t)(g) <<  8) | \
-                         ((uint8_t)(b) << 16) | \
-                         ((uint8_t)(a) << 24)
+#define RGBA(r, g, b, a) (((uint8_t)(r) <<  0) | \
+                          ((uint8_t)(g) <<  8) | \
+                          ((uint8_t)(b) << 16) | \
+                          ((unsigned)(uint8_t)(a) << 24))
 
 static av_always_inline void extract_color(uint32_t colors[4],
                                            uint16_t color0,
@@ -157,7 +158,7 @@ static inline void dxt3_block_internal(uint8_t *dst, ptrdiff_t stride,
 
         for (x = 0; x < 4; x++) {
             uint8_t alpha = alpha_values[x];
-            uint32_t pixel = colors[code & 3] | (alpha << 24);
+            uint32_t pixel = colors[code & 3] | ((unsigned)alpha << 24);
             code >>= 2;
 
             AV_WL32(dst + x * 4, pixel);
@@ -290,7 +291,7 @@ static inline void dxt5_block_internal(uint8_t *dst, ptrdiff_t stride,
                     }
                 }
             }
-            pixel = colors[code & 3] | (alpha << 24);
+            pixel = colors[code & 3] | ((unsigned)alpha << 24);
             code >>= 2;
             AV_WL32(dst + x * 4, pixel);
         }
@@ -412,7 +413,7 @@ static int dxt5ys_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
 
 static inline void rgtc_block_internal(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *block,
-                                       const int *color_tab)
+                                       const int *color_tab, int mono, int offset, int pix_size)
 {
     uint8_t indices[16];
     int x, y;
@@ -428,14 +429,20 @@ static inline void rgtc_block_internal(uint8_t *dst, ptrdiff_t stride,
             int i = indices[x + y * 4];
             /* Interval expansion from [-1 1] or [0 1] to [0 255]. */
             int c = color_tab[i];
-            uint32_t pixel = RGBA(c, c, c, 255);
-            AV_WL32(dst + x * 4 + y * stride, pixel);
+
+            if (mono){
+                dst [x * pix_size + y * stride + offset] = (uint8_t)c;
+            }
+            else{
+                uint32_t pixel = RGBA(c, c, c, 255U);
+                AV_WL32(dst + x * pix_size + y * stride, pixel);
+            }
         }
     }
 }
 
 static inline void rgtc1_block_internal(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *block, int sign)
+                                        const uint8_t *block, int sign, int mono, int offset, int pix_size)
 {
     int color_table[8];
     int r0, r1;
@@ -471,7 +478,7 @@ static inline void rgtc1_block_internal(uint8_t *dst, ptrdiff_t stride,
         color_table[7] = 255;  /* max range */  // bit code 111
     }
 
-    rgtc_block_internal(dst, stride, block, color_table);
+    rgtc_block_internal(dst, stride, block, color_table, mono, offset, pix_size);
 }
 
 /**
@@ -485,7 +492,7 @@ static inline void rgtc1_block_internal(uint8_t *dst, ptrdiff_t stride,
  */
 static int rgtc1s_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
 {
-    rgtc1_block_internal(dst, stride, block, 1);
+    rgtc1_block_internal(dst, stride, block, 1, 0, 0, 4);
 
     return 8;
 }
@@ -501,7 +508,39 @@ static int rgtc1s_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
  */
 static int rgtc1u_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
 {
-    rgtc1_block_internal(dst, stride, block, 0);
+    rgtc1_block_internal(dst, stride, block, 0, 0, 0, 4);
+
+    return 8;
+}
+
+/**
+ * Decompress one block of a RGTC1 texture with unsigned components
+ * and overwrite the alpha component in 'dst' (RGBA data).
+ *
+ * @param dst    output buffer.
+ * @param stride scanline in bytes.
+ * @param block  block to decompress.
+ * @return how much texture data has been consumed.
+ */
+static int rgtc1u_alpha_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
+{
+    rgtc1_block_internal(dst, stride, block, 0, 1, 3, 4);
+
+    return 8;
+}
+
+/**
+ * Decompress one block of a RGTC1 texture with unsigned components
+ * to Gray 8.
+ *
+ * @param dst    output buffer.
+ * @param stride scanline in bytes.
+ * @param block  block to decompress.
+ * @return how much texture data has been consumed.
+ */
+static int rgtc1u_gray_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
+{
+    rgtc1_block_internal(dst, stride, block, 0, 1, 0, 1);
 
     return 8;
 }
@@ -515,8 +554,8 @@ static inline void rgtc2_block_internal(uint8_t *dst, ptrdiff_t stride,
     int x, y;
 
     /* Decompress the two channels separately and interleave them afterwards. */
-    rgtc1_block_internal(c0, 16, block, sign);
-    rgtc1_block_internal(c1, 16, block + 8, sign);
+    rgtc1_block_internal(c0, 16, block, sign, 0, 0, 4);
+    rgtc1_block_internal(c1, 16, block + 8, sign, 0, 0, 4);
 
     /* B is rebuilt exactly like a normal map. */
     for (y = 0; y < 4; y++) {
@@ -528,7 +567,7 @@ static inline void rgtc2_block_internal(uint8_t *dst, ptrdiff_t stride,
 
             int d = (255 * 255 - r * r - g * g) / 2;
             if (d > 0)
-                b = rint(sqrtf(d));
+                b = lrint(sqrtf(d));
 
             p[0] = r;
             p[1] = g;
@@ -597,17 +636,19 @@ static int dxn3dc_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
 
 av_cold void ff_texturedsp_init(TextureDSPContext *c)
 {
-    c->dxt1_block   = dxt1_block;
-    c->dxt1a_block  = dxt1a_block;
-    c->dxt2_block   = dxt2_block;
-    c->dxt3_block   = dxt3_block;
-    c->dxt4_block   = dxt4_block;
-    c->dxt5_block   = dxt5_block;
-    c->dxt5y_block  = dxt5y_block;
-    c->dxt5ys_block = dxt5ys_block;
-    c->rgtc1s_block = rgtc1s_block;
-    c->rgtc1u_block = rgtc1u_block;
-    c->rgtc2s_block = rgtc2s_block;
-    c->rgtc2u_block = rgtc2u_block;
-    c->dxn3dc_block = dxn3dc_block;
+    c->dxt1_block         = dxt1_block;
+    c->dxt1a_block        = dxt1a_block;
+    c->dxt2_block         = dxt2_block;
+    c->dxt3_block         = dxt3_block;
+    c->dxt4_block         = dxt4_block;
+    c->dxt5_block         = dxt5_block;
+    c->dxt5y_block        = dxt5y_block;
+    c->dxt5ys_block       = dxt5ys_block;
+    c->rgtc1s_block       = rgtc1s_block;
+    c->rgtc1u_block       = rgtc1u_block;
+    c->rgtc1u_gray_block  = rgtc1u_gray_block;
+    c->rgtc1u_alpha_block = rgtc1u_alpha_block;
+    c->rgtc2s_block       = rgtc2s_block;
+    c->rgtc2u_block       = rgtc2u_block;
+    c->dxn3dc_block       = dxn3dc_block;
 }
diff --git a/libavcodec/texturedsp.h b/libavcodec/texturedsp.h
index fcbe7a4..90ceb2b 100644
--- a/libavcodec/texturedsp.h
+++ b/libavcodec/texturedsp.h
@@ -2,20 +2,20 @@
  * Texture block module
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,19 +43,21 @@
 #define TEXTURE_BLOCK_H 4
 
 typedef struct TextureDSPContext {
-    int (*dxt1_block)  (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt1a_block) (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt2_block)  (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt3_block)  (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt4_block)  (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt5_block)  (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt5y_block) (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxt5ys_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*rgtc1s_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*rgtc1u_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*rgtc2s_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*rgtc2u_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
-    int (*dxn3dc_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt1_block)        (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt1a_block)       (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt2_block)        (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt3_block)        (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt4_block)        (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt5_block)        (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt5y_block)       (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxt5ys_block)      (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*rgtc1s_block)      (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*rgtc1u_block)      (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*rgtc1u_gray_block) (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*rgtc1u_alpha_block)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*rgtc2s_block)      (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*rgtc2u_block)      (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
+    int (*dxn3dc_block)      (uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
 } TextureDSPContext;
 
 void ff_texturedsp_init(TextureDSPContext *c);
diff --git a/libavcodec/texturedspenc.c b/libavcodec/texturedspenc.c
index 27aaa78..3d68e0c 100644
--- a/libavcodec/texturedspenc.c
+++ b/libavcodec/texturedspenc.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Based on public domain code by Fabian Giesen, Sean Barrett and Yann Collet.
  *
- * This file is part of Libav
+ * This file is part of FFmpeg
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -140,14 +140,14 @@ static const uint8_t match6[256][2] = {
 };
 
 /* Multiplication over 8 bit emulation */
-#define mul8(a, b) (a * b + 128 + ((a * b + 128) >> 8)) >> 8
+#define mul8(a, b) (((a) * (b) + 128 + (((a) * (b) + 128) >> 8)) >> 8)
 
 /* Conversion from rgb24 to rgb565 */
 #define rgb2rgb565(r, g, b) \
-    (mul8(r, 31) << 11) | (mul8(g, 63) << 5) | (mul8(b, 31) << 0)
+    ((mul8(r, 31) << 11) | (mul8(g, 63) << 5) | (mul8(b, 31) << 0))
 
 /* Linear interpolation at 1/3 point between a and b */
-#define lerp13(a, b) (2 * a + b) / 3
+#define lerp13(a, b) ((2 * (a) + (b)) / 3)
 
 /* Linear interpolation on an RGB pixel */
 static inline void lerp13rgb(uint8_t *out, uint8_t *p1, uint8_t *p2)
@@ -647,9 +647,26 @@ static int dxt5ys_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
     return 16;
 }
 
+/**
+ * Compress one block of RGBA pixels in a RGTC1U texture and store the
+ * resulting bytes in 'dst'. Use the alpha channel of the input image.
+ *
+ * @param dst    output buffer.
+ * @param stride scanline in bytes.
+ * @param block  block to compress.
+ * @return how much texture data has been written.
+ */
+static int rgtc1u_alpha_block(uint8_t *dst, ptrdiff_t stride, const uint8_t *block)
+{
+    compress_alpha(dst, stride, block);
+
+    return 8;
+}
+
 av_cold void ff_texturedspenc_init(TextureDSPContext *c)
 {
-    c->dxt1_block   = dxt1_block;
-    c->dxt5_block   = dxt5_block;
-    c->dxt5ys_block = dxt5ys_block;
+    c->dxt1_block         = dxt1_block;
+    c->dxt5_block         = dxt5_block;
+    c->dxt5ys_block       = dxt5ys_block;
+    c->rgtc1u_alpha_block = rgtc1u_alpha_block;
 }
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index b06958d..540135f 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Alexander Strange <astrange@ithinksw.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 typedef struct ThreadFrame {
     AVFrame *f;
-    AVCodecContext *owner;
+    AVCodecContext *owner[2];
     // progress->data is an array of 2 ints holding progress for top/bottom
     // fields
     AVBufferRef *progress;
@@ -97,6 +97,16 @@ void ff_thread_report_progress(ThreadFrame *f, int progress, int field);
 void ff_thread_await_progress(ThreadFrame *f, int progress, int field);
 
 /**
+ * Wrapper around get_format() for frame-multithreaded codecs.
+ * Call this function instead of avctx->get_format().
+ * Cannot be called after the codec has called ff_thread_finish_setup().
+ *
+ * @param avctx The current context.
+ * @param fmt The list of available formats.
+ */
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt);
+
+/**
  * Wrapper around get_buffer() for frame-multithreaded codecs.
  * Call this function instead of ff_get_buffer(f).
  * Cannot be called after the codec has called ff_thread_finish_setup().
@@ -122,6 +132,13 @@ void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f);
 int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src);
 
 int ff_thread_init(AVCodecContext *s);
+int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx,
+        int (*action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr),
+        int (*main_func)(AVCodecContext *c), void *arg, int *ret, int job_count);
 void ff_thread_free(AVCodecContext *s);
+int ff_alloc_entries(AVCodecContext *avctx, int count);
+void ff_reset_entries(AVCodecContext *avctx);
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n);
+void ff_thread_await_progress2(AVCodecContext *avctx,  int field, int thread, int shift);
 
 #endif /* AVCODEC_THREAD_H */
diff --git a/libavcodec/tiertexseqv.c b/libavcodec/tiertexseqv.c
index e24d401..af39f74 100644
--- a/libavcodec/tiertexseqv.c
+++ b/libavcodec/tiertexseqv.c
@@ -2,20 +2,20 @@
  * Tiertex Limited SEQ Video Decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 
@@ -41,18 +41,18 @@ static const unsigned char *seq_unpack_rle_block(const unsigned char *src,
                                                  unsigned char *dst, int dst_size)
 {
     int i, len, sz;
-    BitstreamContext bc;
+    GetBitContext gb;
     int code_table[64];
 
     /* get the rle codes */
-    bitstream_init8(&bc, src, src_end - src);
+    init_get_bits(&gb, src, (src_end - src) * 8);
     for (i = 0, sz = 0; i < 64 && sz < dst_size; i++) {
-        if (bitstream_bits_left(&bc) < 4)
+        if (get_bits_left(&gb) < 4)
             return NULL;
-        code_table[i] = bitstream_read_signed(&bc, 4);
+        code_table[i] = get_sbits(&gb, 4);
         sz += FFABS(code_table[i]);
     }
-    src += (bitstream_tell(&bc) + 7) / 8;
+    src += (get_bits_count(&gb) + 7) / 8;
 
     /* do the rle unpacking */
     for (i = 0; i < 64 && dst_size > 0; i++) {
@@ -81,7 +81,7 @@ static const unsigned char *seq_decode_op1(SeqVideoContext *seq,
 {
     const unsigned char *color_table;
     int b, i, len, bits;
-    BitstreamContext bc;
+    GetBitContext gb;
     unsigned char block[8 * 8];
 
     if (src_end - src < 1)
@@ -113,11 +113,10 @@ static const unsigned char *seq_decode_op1(SeqVideoContext *seq,
             return NULL;
         color_table = src;
         src += len;
-        bitstream_init8(&bc, src, bits * 8);
-        src += bits * 8;
+        init_get_bits(&gb, src, bits * 8 * 8); src += bits * 8;
         for (b = 0; b < 8; b++) {
             for (i = 0; i < 8; i++)
-                dst[i] = color_table[bitstream_read(&bc, bits)];
+                dst[i] = color_table[get_bits(&gb, bits)];
             dst += seq->frame->linesize[0];
         }
     }
@@ -165,7 +164,7 @@ static const unsigned char *seq_decode_op3(SeqVideoContext *seq,
 static int seqvideo_decode(SeqVideoContext *seq, const unsigned char *data, int data_size)
 {
     const unsigned char *data_end = data + data_size;
-    BitstreamContext bc;
+    GetBitContext gb;
     int flags, i, j, x, y, op;
     unsigned char c[3];
     unsigned char *dst;
@@ -180,7 +179,7 @@ static int seqvideo_decode(SeqVideoContext *seq, const unsigned char *data, int
         for (i = 0; i < 256; i++) {
             for (j = 0; j < 3; j++, data++)
                 c[j] = (*data << 2) | (*data >> 4);
-            palette[i] = AV_RB24(c);
+            palette[i] = 0xFFU << 24 | AV_RB24(c);
         }
         seq->frame->palette_has_changed = 1;
     }
@@ -188,12 +187,11 @@ static int seqvideo_decode(SeqVideoContext *seq, const unsigned char *data, int
     if (flags & 2) {
         if (data_end - data < 128)
             return AVERROR_INVALIDDATA;
-        bitstream_init8(&bc, data, 128);
-        data += 128;
+        init_get_bits(&gb, data, 128 * 8); data += 128;
         for (y = 0; y < 128; y += 8)
             for (x = 0; x < 256; x += 8) {
                 dst = &seq->frame->data[0][y * seq->frame->linesize[0] + x];
-                op = bitstream_read(&bc, 2);
+                op = get_bits(&gb, 2);
                 switch (op) {
                 case 1:
                     data = seq_decode_op1(seq, data, data_end, dst);
@@ -215,10 +213,15 @@ static int seqvideo_decode(SeqVideoContext *seq, const unsigned char *data, int
 static av_cold int seqvideo_decode_init(AVCodecContext *avctx)
 {
     SeqVideoContext *seq = avctx->priv_data;
+    int ret;
 
     seq->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    ret = ff_set_dimensions(avctx, 256, 128);
+    if (ret < 0)
+        return ret;
+
     seq->frame = av_frame_alloc();
     if (!seq->frame)
         return AVERROR(ENOMEM);
@@ -236,10 +239,8 @@ static int seqvideo_decode_frame(AVCodecContext *avctx,
 
     SeqVideoContext *seq = avctx->priv_data;
 
-    if ((ret = ff_reget_buffer(avctx, seq->frame)) < 0) {
-        av_log(seq->avctx, AV_LOG_ERROR, "tiertexseqvideo: reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, seq->frame)) < 0)
         return ret;
-    }
 
     if (seqvideo_decode(seq, buf, buf_size))
         return AVERROR_INVALIDDATA;
diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index 97b9d6f..112f5b5 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -1,21 +1,20 @@
 /*
- * TIFF image decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,10 +28,16 @@
 #if CONFIG_ZLIB
 #include <zlib.h>
 #endif
+#if CONFIG_LZMA
+#define LZMA_API_STATIC
+#include <lzma.h>
+#endif
 
 #include "libavutil/attributes.h"
+#include "libavutil/avstring.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "faxcompr.h"
@@ -40,11 +45,17 @@
 #include "lzw.h"
 #include "mathops.h"
 #include "tiff.h"
+#include "tiff_data.h"
+#include "thread.h"
+#include "get_bits.h"
 
 typedef struct TiffContext {
+    AVClass *class;
     AVCodecContext *avctx;
     GetByteContext gb;
 
+    int get_subimage;
+
     int width, height;
     unsigned int bpp, bppcount;
     uint32_t palette[256];
@@ -53,33 +64,290 @@ typedef struct TiffContext {
     enum TiffCompr compr;
     enum TiffPhotometric photometric;
     int planar;
+    int subsampling[2];
     int fax_opts;
     int predictor;
     int fill_order;
+    uint32_t res[4];
+
+    int is_bayer;
+    uint8_t pattern[4];
+    unsigned white_level;
+
+    uint32_t sub_ifd;
 
     int strips, rps, sstype;
     int sot;
     int stripsizesoff, stripsize, stripoff, strippos;
     LZWState *lzw;
+
+    uint8_t *deinvert_buf;
+    int deinvert_buf_size;
+    uint8_t *yuv_line;
+    unsigned int yuv_line_size;
+    uint8_t *fax_buffer;
+    unsigned int fax_buffer_size;
+
+    int geotag_count;
+    TiffGeoTag *geotags;
 } TiffContext;
 
-static unsigned tget_short(GetByteContext *gb, int le)
+static void free_geotags(TiffContext *const s)
+{
+    int i;
+    for (i = 0; i < s->geotag_count; i++) {
+        if (s->geotags[i].val)
+            av_freep(&s->geotags[i].val);
+    }
+    av_freep(&s->geotags);
+    s->geotag_count = 0;
+}
+
+#define RET_GEOKEY(TYPE, array, element)\
+    if (key >= TIFF_##TYPE##_KEY_ID_OFFSET &&\
+        key - TIFF_##TYPE##_KEY_ID_OFFSET < FF_ARRAY_ELEMS(ff_tiff_##array##_name_type_map))\
+        return ff_tiff_##array##_name_type_map[key - TIFF_##TYPE##_KEY_ID_OFFSET].element;
+
+static const char *get_geokey_name(int key)
+{
+    RET_GEOKEY(VERT, vert, name);
+    RET_GEOKEY(PROJ, proj, name);
+    RET_GEOKEY(GEOG, geog, name);
+    RET_GEOKEY(CONF, conf, name);
+
+    return NULL;
+}
+
+static int get_geokey_type(int key)
+{
+    RET_GEOKEY(VERT, vert, type);
+    RET_GEOKEY(PROJ, proj, type);
+    RET_GEOKEY(GEOG, geog, type);
+    RET_GEOKEY(CONF, conf, type);
+
+    return AVERROR_INVALIDDATA;
+}
+
+static int cmp_id_key(const void *id, const void *k)
 {
-    return le ? bytestream2_get_le16(gb) : bytestream2_get_be16(gb);
+    return *(const int*)id - ((const TiffGeoTagKeyName*)k)->key;
 }
 
-static unsigned tget_long(GetByteContext *gb, int le)
+static const char *search_keyval(const TiffGeoTagKeyName *keys, int n, int id)
 {
-    return le ? bytestream2_get_le32(gb) : bytestream2_get_be32(gb);
+    TiffGeoTagKeyName *r = bsearch(&id, keys, n, sizeof(keys[0]), cmp_id_key);
+    if(r)
+        return r->name;
+
+    return NULL;
 }
 
-static unsigned tget(GetByteContext *gb, int type, int le)
+static char *get_geokey_val(int key, int val)
 {
-    switch (type) {
-    case TIFF_BYTE:  return bytestream2_get_byte(gb);
-    case TIFF_SHORT: return tget_short(gb, le);
-    case TIFF_LONG:  return tget_long(gb, le);
-    default:         return UINT_MAX;
+    char *ap;
+
+    if (val == TIFF_GEO_KEY_UNDEFINED)
+        return av_strdup("undefined");
+    if (val == TIFF_GEO_KEY_USER_DEFINED)
+        return av_strdup("User-Defined");
+
+#define RET_GEOKEY_VAL(TYPE, array)\
+    if (val >= TIFF_##TYPE##_OFFSET &&\
+        val - TIFF_##TYPE##_OFFSET < FF_ARRAY_ELEMS(ff_tiff_##array##_codes))\
+        return av_strdup(ff_tiff_##array##_codes[val - TIFF_##TYPE##_OFFSET]);
+
+    switch (key) {
+    case TIFF_GT_MODEL_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GT_MODEL_TYPE, gt_model_type);
+        break;
+    case TIFF_GT_RASTER_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GT_RASTER_TYPE, gt_raster_type);
+        break;
+    case TIFF_GEOG_LINEAR_UNITS_GEOKEY:
+    case TIFF_PROJ_LINEAR_UNITS_GEOKEY:
+    case TIFF_VERTICAL_UNITS_GEOKEY:
+        RET_GEOKEY_VAL(LINEAR_UNIT, linear_unit);
+        break;
+    case TIFF_GEOG_ANGULAR_UNITS_GEOKEY:
+    case TIFF_GEOG_AZIMUTH_UNITS_GEOKEY:
+        RET_GEOKEY_VAL(ANGULAR_UNIT, angular_unit);
+        break;
+    case TIFF_GEOGRAPHIC_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GCS_TYPE, gcs_type);
+        RET_GEOKEY_VAL(GCSE_TYPE, gcse_type);
+        break;
+    case TIFF_GEOG_GEODETIC_DATUM_GEOKEY:
+        RET_GEOKEY_VAL(GEODETIC_DATUM, geodetic_datum);
+        RET_GEOKEY_VAL(GEODETIC_DATUM_E, geodetic_datum_e);
+        break;
+    case TIFF_GEOG_ELLIPSOID_GEOKEY:
+        RET_GEOKEY_VAL(ELLIPSOID, ellipsoid);
+        break;
+    case TIFF_GEOG_PRIME_MERIDIAN_GEOKEY:
+        RET_GEOKEY_VAL(PRIME_MERIDIAN, prime_meridian);
+        break;
+    case TIFF_PROJECTED_CS_TYPE_GEOKEY:
+        ap = av_strdup(search_keyval(ff_tiff_proj_cs_type_codes, FF_ARRAY_ELEMS(ff_tiff_proj_cs_type_codes), val));
+        if(ap) return ap;
+        break;
+    case TIFF_PROJECTION_GEOKEY:
+        ap = av_strdup(search_keyval(ff_tiff_projection_codes, FF_ARRAY_ELEMS(ff_tiff_projection_codes), val));
+        if(ap) return ap;
+        break;
+    case TIFF_PROJ_COORD_TRANS_GEOKEY:
+        RET_GEOKEY_VAL(COORD_TRANS, coord_trans);
+        break;
+    case TIFF_VERTICAL_CS_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(VERT_CS, vert_cs);
+        RET_GEOKEY_VAL(ORTHO_VERT_CS, ortho_vert_cs);
+        break;
+
+    }
+
+    ap = av_malloc(14);
+    if (ap)
+        snprintf(ap, 14, "Unknown-%d", val);
+    return ap;
+}
+
+static char *doubles2str(double *dp, int count, const char *sep)
+{
+    int i;
+    char *ap, *ap0;
+    uint64_t component_len;
+    if (!sep) sep = ", ";
+    component_len = 24LL + strlen(sep);
+    if (count >= (INT_MAX - 1)/component_len)
+        return NULL;
+    ap = av_malloc(component_len * count + 1);
+    if (!ap)
+        return NULL;
+    ap0   = ap;
+    ap[0] = '\0';
+    for (i = 0; i < count; i++) {
+        unsigned l = snprintf(ap, component_len, "%.15g%s", dp[i], sep);
+        if(l >= component_len) {
+            av_free(ap0);
+            return NULL;
+        }
+        ap += l;
+    }
+    ap0[strlen(ap0) - strlen(sep)] = '\0';
+    return ap0;
+}
+
+static int add_metadata(int count, int type,
+                        const char *name, const char *sep, TiffContext *s, AVFrame *frame)
+{
+    switch(type) {
+    case TIFF_DOUBLE: return ff_tadd_doubles_metadata(count, name, sep, &s->gb, s->le, &frame->metadata);
+    case TIFF_SHORT : return ff_tadd_shorts_metadata(count, name, sep, &s->gb, s->le, 0, &frame->metadata);
+    case TIFF_STRING: return ff_tadd_string_metadata(count, name, &s->gb, s->le, &frame->metadata);
+    default         : return AVERROR_INVALIDDATA;
+    };
+}
+
+static void av_always_inline horizontal_fill(TiffContext *s,
+                                             unsigned int bpp, uint8_t* dst,
+                                             int usePtr, const uint8_t *src,
+                                             uint8_t c, int width, int offset)
+{
+    switch (bpp) {
+    case 1:
+        while (--width >= 0) {
+            dst[(width+offset)*8+7] = (usePtr ? src[width] : c)      & 0x1;
+            dst[(width+offset)*8+6] = (usePtr ? src[width] : c) >> 1 & 0x1;
+            dst[(width+offset)*8+5] = (usePtr ? src[width] : c) >> 2 & 0x1;
+            dst[(width+offset)*8+4] = (usePtr ? src[width] : c) >> 3 & 0x1;
+            dst[(width+offset)*8+3] = (usePtr ? src[width] : c) >> 4 & 0x1;
+            dst[(width+offset)*8+2] = (usePtr ? src[width] : c) >> 5 & 0x1;
+            dst[(width+offset)*8+1] = (usePtr ? src[width] : c) >> 6 & 0x1;
+            dst[(width+offset)*8+0] = (usePtr ? src[width] : c) >> 7;
+        }
+        break;
+    case 2:
+        while (--width >= 0) {
+            dst[(width+offset)*4+3] = (usePtr ? src[width] : c) & 0x3;
+            dst[(width+offset)*4+2] = (usePtr ? src[width] : c) >> 2 & 0x3;
+            dst[(width+offset)*4+1] = (usePtr ? src[width] : c) >> 4 & 0x3;
+            dst[(width+offset)*4+0] = (usePtr ? src[width] : c) >> 6;
+        }
+        break;
+    case 4:
+        while (--width >= 0) {
+            dst[(width+offset)*2+1] = (usePtr ? src[width] : c) & 0xF;
+            dst[(width+offset)*2+0] = (usePtr ? src[width] : c) >> 4;
+        }
+        break;
+    case 12: {
+                 uint16_t *dst16 = (uint16_t *)dst;
+                 GetBitContext gb;
+                 init_get_bits8(&gb, src, width);
+                 for (int i = 0; i < s->width; i++) {
+                     dst16[i] = get_bits(&gb, 12) << 4;
+                 }
+             }
+        break;
+    default:
+        if (usePtr) {
+            memcpy(dst + offset, src, width);
+        } else {
+            memset(dst + offset, c, width);
+        }
+    }
+}
+
+static int deinvert_buffer(TiffContext *s, const uint8_t *src, int size)
+{
+    int i;
+
+    av_fast_padded_malloc(&s->deinvert_buf, &s->deinvert_buf_size, size);
+    if (!s->deinvert_buf)
+        return AVERROR(ENOMEM);
+    for (i = 0; i < size; i++)
+        s->deinvert_buf[i] = ff_reverse[src[i]];
+
+    return 0;
+}
+
+static void unpack_gray(TiffContext *s, AVFrame *p,
+                       const uint8_t *src, int lnum, int width, int bpp)
+{
+    GetBitContext gb;
+    uint16_t *dst = (uint16_t *)(p->data[0] + lnum * p->linesize[0]);
+
+    init_get_bits8(&gb, src, width);
+
+    for (int i = 0; i < s->width; i++) {
+        dst[i] = get_bits(&gb, bpp);
+    }
+}
+
+static void unpack_yuv(TiffContext *s, AVFrame *p,
+                       const uint8_t *src, int lnum)
+{
+    int i, j, k;
+    int w       = (s->width - 1) / s->subsampling[0] + 1;
+    uint8_t *pu = &p->data[1][lnum / s->subsampling[1] * p->linesize[1]];
+    uint8_t *pv = &p->data[2][lnum / s->subsampling[1] * p->linesize[2]];
+    if (s->width % s->subsampling[0] || s->height % s->subsampling[1]) {
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    p->data[0][FFMIN(lnum + j, s->height-1) * p->linesize[0] +
+                               FFMIN(i * s->subsampling[0] + k, s->width-1)] = *src++;
+            *pu++ = *src++;
+            *pv++ = *src++;
+        }
+    }else{
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    p->data[0][(lnum + j) * p->linesize[0] +
+                               i * s->subsampling[0] + k] = *src++;
+            *pu++ = *src++;
+            *pv++ = *src++;
+        }
     }
 }
 
@@ -90,7 +358,7 @@ static int tiff_uncompress(uint8_t *dst, unsigned long *len, const uint8_t *src,
     z_stream zstream = { 0 };
     int zret;
 
-    zstream.next_in   = src;
+    zstream.next_in   = (uint8_t *)src;
     zstream.avail_in  = size;
     zstream.next_out  = dst;
     zstream.avail_out = *len;
@@ -105,9 +373,9 @@ static int tiff_uncompress(uint8_t *dst, unsigned long *len, const uint8_t *src,
     return zret == Z_STREAM_END ? Z_OK : zret;
 }
 
-static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
-                            const uint8_t *src, int size,
-                            int width, int lines)
+static int tiff_unpack_zlib(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                            const uint8_t *src, int size, int width, int lines,
+                            int strip_start, int is_yuv)
 {
     uint8_t *zbuf;
     unsigned long outlen;
@@ -116,6 +384,13 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
     zbuf   = av_malloc(outlen);
     if (!zbuf)
         return AVERROR(ENOMEM);
+    if (s->fill_order) {
+        if ((ret = deinvert_buffer(s, src, size)) < 0) {
+            av_free(zbuf);
+            return ret;
+        }
+        src = s->deinvert_buf;
+    }
     ret = tiff_uncompress(zbuf, &outlen, src, size);
     if (ret != Z_OK) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -126,7 +401,15 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
     }
     src = zbuf;
     for (line = 0; line < lines; line++) {
-        memcpy(dst, src, width);
+        if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            horizontal_fill(s, s->bpp, dst, 1, src, 0, width, 0);
+        } else {
+            memcpy(dst, src, width);
+        }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
         dst += stride;
         src += width;
     }
@@ -135,24 +418,87 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
 }
 #endif
 
+#if CONFIG_LZMA
+static int tiff_uncompress_lzma(uint8_t *dst, uint64_t *len, const uint8_t *src,
+                                int size)
+{
+    lzma_stream stream = LZMA_STREAM_INIT;
+    lzma_ret ret;
+
+    stream.next_in   = (uint8_t *)src;
+    stream.avail_in  = size;
+    stream.next_out  = dst;
+    stream.avail_out = *len;
+    ret              = lzma_stream_decoder(&stream, UINT64_MAX, 0);
+    if (ret != LZMA_OK) {
+        av_log(NULL, AV_LOG_ERROR, "LZMA init error: %d\n", ret);
+        return ret;
+    }
+    ret = lzma_code(&stream, LZMA_RUN);
+    lzma_end(&stream);
+    *len = stream.total_out;
+    return ret == LZMA_STREAM_END ? LZMA_OK : ret;
+}
+
+static int tiff_unpack_lzma(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                            const uint8_t *src, int size, int width, int lines,
+                            int strip_start, int is_yuv)
+{
+    uint64_t outlen = width * (uint64_t)lines;
+    int ret, line;
+    uint8_t *buf = av_malloc(outlen);
+    if (!buf)
+        return AVERROR(ENOMEM);
+    if (s->fill_order) {
+        if ((ret = deinvert_buffer(s, src, size)) < 0) {
+            av_free(buf);
+            return ret;
+        }
+        src = s->deinvert_buf;
+    }
+    ret = tiff_uncompress_lzma(buf, &outlen, src, size);
+    if (ret != LZMA_OK) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Uncompressing failed (%"PRIu64" of %"PRIu64") with error %d\n", outlen,
+               (uint64_t)width * lines, ret);
+        av_free(buf);
+        return AVERROR_UNKNOWN;
+    }
+    src = buf;
+    for (line = 0; line < lines; line++) {
+        if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            horizontal_fill(s, s->bpp, dst, 1, src, 0, width, 0);
+        } else {
+            memcpy(dst, src, width);
+        }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
+        dst += stride;
+        src += width;
+    }
+    av_free(buf);
+    return 0;
+}
+#endif
 
 static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
-                           const uint8_t *src, int size, int lines)
+                           const uint8_t *src, int size, int width, int lines)
 {
     int i, ret = 0;
-    uint8_t *src2 = av_malloc((unsigned)size +
-                              AV_INPUT_BUFFER_PADDING_SIZE);
+    int line;
+    uint8_t *src2;
+
+    av_fast_padded_malloc(&s->fax_buffer, &s->fax_buffer_size, size);
+    src2 = s->fax_buffer;
 
     if (!src2) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Error allocating temporary buffer\n");
         return AVERROR(ENOMEM);
     }
-    if (s->fax_opts & 2) {
-        avpriv_request_sample(s->avctx, "Uncompressed fax mode");
-        av_free(src2);
-        return AVERROR_PATCHWELCOME;
-    }
+
     if (!s->fill_order) {
         memcpy(src2, src, size);
     } else {
@@ -162,16 +508,25 @@ static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
     memset(src2 + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     ret = ff_ccitt_unpack(s->avctx, src2, size, dst, lines, stride,
                           s->compr, s->fax_opts);
-    av_free(src2);
+    if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        for (line = 0; line < lines; line++) {
+            horizontal_fill(s, s->bpp, dst, 1, dst, 0, width, 0);
+            dst += stride;
+        }
     return ret;
 }
 
-static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
-                             const uint8_t *src, int size, int lines)
+static int tiff_unpack_strip(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                             const uint8_t *src, int size, int strip_start, int lines)
 {
     PutByteContext pb;
     int c, line, pixels, code, ret;
+    const uint8_t *ssrc = src;
     int width = ((s->width * s->bpp) + 7) >> 3;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(p->format);
+    int is_yuv = !(desc->flags & AV_PIX_FMT_FLAG_RGB) &&
+                 (desc->flags & AV_PIX_FMT_FLAG_PLANAR) &&
+                 desc->nb_components >= 3;
 
     if (s->planar)
         width /= s->bppcount;
@@ -179,9 +534,39 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
     if (size <= 0)
         return AVERROR_INVALIDDATA;
 
+    if (is_yuv) {
+        int bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
+                            s->subsampling[0] * s->subsampling[1] + 7) >> 3;
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, bytes_per_row);
+        if (s->yuv_line == NULL) {
+            av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
+            return AVERROR(ENOMEM);
+        }
+        dst = s->yuv_line;
+        stride = 0;
+
+        width = (s->width - 1) / s->subsampling[0] + 1;
+        width = width * s->subsampling[0] * s->subsampling[1] + 2*width;
+        av_assert0(width <= bytes_per_row);
+        av_assert0(s->bpp == 24);
+    }
+    if (s->is_bayer) {
+        width = (s->bpp * s->width + 7) >> 3;
+    }
+    if (p->format == AV_PIX_FMT_GRAY12) {
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, width);
+        if (s->yuv_line == NULL) {
+            av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
+            return AVERROR(ENOMEM);
+        }
+        dst = s->yuv_line;
+        stride = 0;
+    }
+
     if (s->compr == TIFF_DEFLATE || s->compr == TIFF_ADOBE_DEFLATE) {
 #if CONFIG_ZLIB
-        return tiff_unpack_zlib(s, dst, stride, src, size, width, lines);
+        return tiff_unpack_zlib(s, p, dst, stride, src, size, width, lines,
+                                strip_start, is_yuv);
 #else
         av_log(s->avctx, AV_LOG_ERROR,
                "zlib support not enabled, "
@@ -189,7 +574,25 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
         return AVERROR(ENOSYS);
 #endif
     }
+    if (s->compr == TIFF_LZMA) {
+#if CONFIG_LZMA
+        return tiff_unpack_lzma(s, p, dst, stride, src, size, width, lines,
+                                strip_start, is_yuv);
+#else
+        av_log(s->avctx, AV_LOG_ERROR,
+               "LZMA support not enabled\n");
+        return AVERROR(ENOSYS);
+#endif
+    }
     if (s->compr == TIFF_LZW) {
+        if (s->fill_order) {
+            if ((ret = deinvert_buffer(s, src, size)) < 0)
+                return ret;
+            ssrc = src = s->deinvert_buf;
+        }
+        if (size > 1 && !src[0] && (src[1]&1)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Old style LZW is unsupported\n");
+        }
         if ((ret = ff_lzw_decode_init(s->lzw, 8, src, size, FF_LZW_TIFF)) < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Error initializing LZW decoder\n");
             return ret;
@@ -201,6 +604,14 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
                        pixels, width);
                 return AVERROR_INVALIDDATA;
             }
+            if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+                horizontal_fill(s, s->bpp, dst, 1, dst, 0, width, 0);
+            if (is_yuv) {
+                unpack_yuv(s, p, dst, strip_start + line);
+                line += s->subsampling[1] - 1;
+            } else if (p->format == AV_PIX_FMT_GRAY12) {
+                unpack_gray(s, p, dst, strip_start + line, width, s->bpp);
+            }
             dst += stride;
         }
         return 0;
@@ -208,49 +619,93 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
     if (s->compr == TIFF_CCITT_RLE ||
         s->compr == TIFF_G3        ||
         s->compr == TIFF_G4) {
-        return tiff_unpack_fax(s, dst, stride, src, size, lines);
+        if (is_yuv || p->format == AV_PIX_FMT_GRAY12)
+            return AVERROR_INVALIDDATA;
+
+        return tiff_unpack_fax(s, dst, stride, src, size, width, lines);
     }
 
     bytestream2_init(&s->gb, src, size);
-    bytestream2_init_writer(&pb, dst, stride * lines);
+    bytestream2_init_writer(&pb, dst, is_yuv ? s->yuv_line_size : (stride * lines));
 
     for (line = 0; line < lines; line++) {
+        if (src - ssrc > size) {
+            av_log(s->avctx, AV_LOG_ERROR, "Source data overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (bytestream2_get_bytes_left(&s->gb) == 0 || bytestream2_get_eof(&pb))
             break;
         bytestream2_seek_p(&pb, stride * line, SEEK_SET);
         switch (s->compr) {
         case TIFF_RAW:
+            if (ssrc + size - src < width)
+                return AVERROR_INVALIDDATA;
+
             if (!s->fill_order) {
-                bytestream2_copy_buffer(&pb, &s->gb, width);
+                horizontal_fill(s, s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8 || s->is_bayer),
+                                dst, 1, src, 0, width, 0);
             } else {
                 int i;
                 for (i = 0; i < width; i++)
-                    bytestream2_put_byte(&pb, ff_reverse[bytestream2_get_byte(&s->gb)]);
+                    dst[i] = ff_reverse[src[i]];
             }
+            src += width;
             break;
         case TIFF_PACKBITS:
             for (pixels = 0; pixels < width;) {
-                code = ff_u8_to_s8(bytestream2_get_byte(&s->gb));
+                if (ssrc + size - src < 2) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Read went out of bounds\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                code = s->fill_order ? (int8_t) ff_reverse[*src++]: (int8_t) *src++;
                 if (code >= 0) {
                     code++;
-                    bytestream2_copy_buffer(&pb, &s->gb, code);
+                    if (pixels + code > width ||
+                        ssrc + size - src < code) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Copy went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    horizontal_fill(s, s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                    dst, 1, src, 0, code, pixels);
+                    src    += code;
                     pixels += code;
                 } else if (code != -128) { // -127..-1
                     code = (-code) + 1;
-                    c    = bytestream2_get_byte(&s->gb);
-                    bytestream2_set_buffer(&pb, c, code);
+                    if (pixels + code > width) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Run went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    c = *src++;
+                    horizontal_fill(s, s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                    dst, 0, NULL, c, code, pixels);
                     pixels += code;
                 }
             }
+            if (s->fill_order) {
+                int i;
+                for (i = 0; i < width; i++)
+                    dst[i] = ff_reverse[dst[i]];
+            }
             break;
         }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        } else if (p->format == AV_PIX_FMT_GRAY12) {
+            unpack_gray(s, p, dst, strip_start + line, width, s->bpp);
+        }
+        dst += stride;
     }
     return 0;
 }
 
-static int init_image(TiffContext *s, AVFrame *frame)
+static int init_image(TiffContext *s, ThreadFrame *frame)
 {
     int ret;
+    int create_gray_palette = 0;
 
     // make sure there is no aliasing in the following switch
     if (s->bpp >= 100 || s->bppcount >= 10) {
@@ -260,15 +715,105 @@ static int init_image(TiffContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     }
 
-    switch (s->planar * 1000 + s->bpp * 10 + s->bppcount) {
+    switch (s->planar * 1000 + s->bpp * 10 + s->bppcount + s->is_bayer * 10000) {
     case 11:
-        s->avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+        if (!s->palette_is_set) {
+            s->avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+            break;
+        }
+    case 21:
+    case 41:
+        s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        if (!s->palette_is_set) {
+            create_gray_palette = 1;
+        }
         break;
     case 81:
         s->avctx->pix_fmt = s->palette_is_set ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
         break;
+    case 121:
+        s->avctx->pix_fmt = AV_PIX_FMT_GRAY12;
+        break;
+    case 10081:
+        switch (AV_RL32(s->pattern)) {
+        case 0x02010100:
+            s->avctx->pix_fmt = AV_PIX_FMT_BAYER_RGGB8;
+            break;
+        case 0x00010102:
+            s->avctx->pix_fmt = AV_PIX_FMT_BAYER_BGGR8;
+            break;
+        case 0x01000201:
+            s->avctx->pix_fmt = AV_PIX_FMT_BAYER_GBRG8;
+            break;
+        case 0x01020001:
+            s->avctx->pix_fmt = AV_PIX_FMT_BAYER_GRBG8;
+            break;
+        default:
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported Bayer pattern: 0x%X\n",
+                   AV_RL32(s->pattern));
+            return AVERROR_PATCHWELCOME;
+        }
+        break;
+    case 10121:
+        switch (AV_RL32(s->pattern)) {
+        case 0x02010100:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_RGGB16LE : AV_PIX_FMT_BAYER_RGGB16BE;
+            break;
+        case 0x00010102:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_BGGR16LE : AV_PIX_FMT_BAYER_BGGR16BE;
+            break;
+        case 0x01000201:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_GBRG16LE : AV_PIX_FMT_BAYER_GBRG16BE;
+            break;
+        case 0x01020001:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_GRBG16LE : AV_PIX_FMT_BAYER_GRBG16BE;
+            break;
+        default:
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported Bayer pattern: 0x%X\n",
+                   AV_RL32(s->pattern));
+            return AVERROR_PATCHWELCOME;
+        }
+        break;
+    case 10161:
+        switch (AV_RL32(s->pattern)) {
+        case 0x02010100:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_RGGB16LE : AV_PIX_FMT_BAYER_RGGB16BE;
+            break;
+        case 0x00010102:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_BGGR16LE : AV_PIX_FMT_BAYER_BGGR16BE;
+            break;
+        case 0x01000201:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_GBRG16LE : AV_PIX_FMT_BAYER_GBRG16BE;
+            break;
+        case 0x01020001:
+            s->avctx->pix_fmt = s->le ? AV_PIX_FMT_BAYER_GRBG16LE : AV_PIX_FMT_BAYER_GRBG16BE;
+            break;
+        default:
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported Bayer pattern: 0x%X\n",
+                   AV_RL32(s->pattern));
+            return AVERROR_PATCHWELCOME;
+        }
+        break;
     case 243:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+            if (s->subsampling[0] == 1 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            } else if (s->subsampling[0] == 2 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            } else if (s->subsampling[0] == 4 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+            } else if (s->subsampling[0] == 1 && s->subsampling[1] == 2) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+            } else if (s->subsampling[0] == 2 && s->subsampling[1] == 2) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            } else if (s->subsampling[0] == 4 && s->subsampling[1] == 4) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV410P;
+            } else {
+                av_log(s->avctx, AV_LOG_ERROR, "Unsupported YCbCr subsampling\n");
+                return AVERROR_PATCHWELCOME;
+            }
+        } else
+            s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
         break;
     case 161:
         s->avctx->pix_fmt = s->le ? AV_PIX_FMT_GRAY16LE : AV_PIX_FMT_GRAY16BE;
@@ -280,13 +825,13 @@ static int init_image(TiffContext *s, AVFrame *frame)
         s->avctx->pix_fmt = s->le ? AV_PIX_FMT_YA16LE : AV_PIX_FMT_YA16BE;
         break;
     case 324:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        s->avctx->pix_fmt = s->photometric == TIFF_PHOTOMETRIC_SEPARATED ? AV_PIX_FMT_RGB0 : AV_PIX_FMT_RGBA;
         break;
     case 483:
-        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGB48LE : AV_PIX_FMT_RGB48BE;
+        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGB48LE  : AV_PIX_FMT_RGB48BE;
         break;
     case 644:
-        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGBA64LE : AV_PIX_FMT_RGBA64BE;
+        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGBA64LE  : AV_PIX_FMT_RGBA64BE;
         break;
     case 1243:
         s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
@@ -306,64 +851,89 @@ static int init_image(TiffContext *s, AVFrame *frame)
                s->bpp, s->bppcount);
         return AVERROR_INVALIDDATA;
     }
+
+    if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
+        if((desc->flags & AV_PIX_FMT_FLAG_RGB) ||
+           !(desc->flags & AV_PIX_FMT_FLAG_PLANAR) ||
+           desc->nb_components < 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported YCbCr variant\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     if (s->width != s->avctx->width || s->height != s->avctx->height) {
         ret = ff_set_dimensions(s->avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
-    if ((ret = ff_get_buffer(s->avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(s->avctx, frame, 0)) < 0)
         return ret;
-    }
     if (s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
-        memcpy(frame->data[1], s->palette, sizeof(s->palette));
+        if (!create_gray_palette)
+            memcpy(frame->f->data[1], s->palette, sizeof(s->palette));
+        else {
+            /* make default grayscale pal */
+            int i;
+            uint32_t *pal = (uint32_t *)frame->f->data[1];
+            for (i = 0; i < 1<<s->bpp; i++)
+                pal[i] = 0xFFU << 24 | i * 255 / ((1<<s->bpp) - 1) * 0x010101;
+        }
     }
     return 0;
 }
 
-static int tiff_decode_tag(TiffContext *s)
+static void set_sar(TiffContext *s, unsigned tag, unsigned num, unsigned den)
+{
+    int offset = tag == TIFF_YRES ? 2 : 0;
+    s->res[offset++] = num;
+    s->res[offset]   = den;
+    if (s->res[0] && s->res[1] && s->res[2] && s->res[3]) {
+        uint64_t num = s->res[2] * (uint64_t)s->res[1];
+        uint64_t den = s->res[0] * (uint64_t)s->res[3];
+        if (num > INT64_MAX || den > INT64_MAX) {
+            num = num >> 1;
+            den = den >> 1;
+        }
+        av_reduce(&s->avctx->sample_aspect_ratio.num, &s->avctx->sample_aspect_ratio.den,
+                  num, den, INT32_MAX);
+        if (!s->avctx->sample_aspect_ratio.den)
+            s->avctx->sample_aspect_ratio = (AVRational) {0, 1};
+    }
+}
+
+static int tiff_decode_tag(TiffContext *s, AVFrame *frame)
 {
-    unsigned tag, type, count, off, value = 0;
+    unsigned tag, type, count, off, value = 0, value2 = 0;
     int i, start;
+    int pos;
+    int ret;
+    double *dp;
 
-    if (bytestream2_get_bytes_left(&s->gb) < 12)
-        return AVERROR_INVALIDDATA;
-    tag   = tget_short(&s->gb, s->le);
-    type  = tget_short(&s->gb, s->le);
-    count = tget_long(&s->gb, s->le);
-    off   = tget_long(&s->gb, s->le);
-    start = bytestream2_tell(&s->gb);
-
-    if (type == 0 || type >= FF_ARRAY_ELEMS(type_sizes)) {
-        av_log(s->avctx, AV_LOG_DEBUG, "Unknown tiff type (%u) encountered\n",
-               type);
-        return 0;
+    ret = ff_tread_tag(&s->gb, s->le, &tag, &type, &count, &start);
+    if (ret < 0) {
+        goto end;
     }
 
+    off = bytestream2_tell(&s->gb);
     if (count == 1) {
         switch (type) {
         case TIFF_BYTE:
         case TIFF_SHORT:
-            bytestream2_seek(&s->gb, -4, SEEK_CUR);
-            value = tget(&s->gb, type, s->le);
-            break;
         case TIFF_LONG:
-            value = off;
+            value = ff_tget(&s->gb, type, s->le);
+            break;
+        case TIFF_RATIONAL:
+            value  = ff_tget(&s->gb, TIFF_LONG, s->le);
+            value2 = ff_tget(&s->gb, TIFF_LONG, s->le);
             break;
         case TIFF_STRING:
             if (count <= 4) {
-                bytestream2_seek(&s->gb, -4, SEEK_CUR);
                 break;
             }
         default:
             value = UINT_MAX;
-            bytestream2_seek(&s->gb, off, SEEK_SET);
         }
-    } else {
-        if (count <= 4 && type_sizes[type] * count <= 4)
-            bytestream2_seek(&s->gb, -4, SEEK_CUR);
-        else
-            bytestream2_seek(&s->gb, off, SEEK_SET);
     }
 
     switch (tag) {
@@ -374,26 +944,25 @@ static int tiff_decode_tag(TiffContext *s)
         s->height = value;
         break;
     case TIFF_BPP:
-        s->bppcount = count;
-        if (count > 4) {
+        if (count > 4U) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "This format is not supported (bpp=%d, %d components)\n",
-                   s->bpp, count);
+                   value, count);
             return AVERROR_INVALIDDATA;
         }
+        s->bppcount = count;
         if (count == 1)
             s->bpp = value;
         else {
             switch (type) {
             case TIFF_BYTE:
-                s->bpp = (off & 0xFF) + ((off >> 8) & 0xFF) +
-                         ((off >> 16) & 0xFF) + ((off >> 24) & 0xFF);
-                break;
             case TIFF_SHORT:
             case TIFF_LONG:
                 s->bpp = 0;
+                if (bytestream2_get_bytes_left(&s->gb) < type_sizes[type] * count)
+                    return AVERROR_INVALIDDATA;
                 for (i = 0; i < count; i++)
-                    s->bpp += tget(&s->gb, type, s->le);
+                    s->bpp += ff_tget(&s->gb, type, s->le);
                 break;
             default:
                 s->bpp = -1;
@@ -406,12 +975,18 @@ static int tiff_decode_tag(TiffContext *s)
                    "Samples per pixel requires a single value, many provided\n");
             return AVERROR_INVALIDDATA;
         }
+        if (value > 4U) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "Samples per pixel %d is too large\n", value);
+            return AVERROR_INVALIDDATA;
+        }
         if (s->bppcount == 1)
             s->bpp *= value;
         s->bppcount = value;
         break;
     case TIFF_COMPR:
         s->compr     = value;
+        av_log(s->avctx, AV_LOG_DEBUG, "compression: %d\n", s->compr);
         s->predictor = 0;
         switch (s->compr) {
         case TIFF_RAW:
@@ -436,8 +1011,12 @@ static int tiff_decode_tag(TiffContext *s)
             avpriv_report_missing_feature(s->avctx, "JPEG compression");
             return AVERROR_PATCHWELCOME;
         case TIFF_LZMA:
-            avpriv_report_missing_feature(s->avctx, "LZMA compression");
-            return AVERROR_PATCHWELCOME;
+#if CONFIG_LZMA
+            break;
+#else
+            av_log(s->avctx, AV_LOG_ERROR, "LZMA not compiled in\n");
+            return AVERROR(ENOSYS);
+#endif
         default:
             av_log(s->avctx, AV_LOG_ERROR, "Unknown compression method %i\n",
                    s->compr);
@@ -451,6 +1030,11 @@ static int tiff_decode_tag(TiffContext *s)
         break;
     case TIFF_STRIP_OFFS:
         if (count == 1) {
+            if (value > INT_MAX) {
+                av_log(s->avctx, AV_LOG_ERROR,
+                    "strippos %u too large\n", value);
+                return AVERROR_INVALIDDATA;
+            }
             s->strippos = 0;
             s->stripoff = value;
         } else
@@ -462,6 +1046,11 @@ static int tiff_decode_tag(TiffContext *s)
         break;
     case TIFF_STRIP_SIZE:
         if (count == 1) {
+            if (value > INT_MAX) {
+                av_log(s->avctx, AV_LOG_ERROR,
+                    "stripsize %u too large\n", value);
+                return AVERROR_INVALIDDATA;
+            }
             s->stripsizesoff = 0;
             s->stripsize     = value;
             s->strips        = 1;
@@ -471,24 +1060,55 @@ static int tiff_decode_tag(TiffContext *s)
         s->strips = count;
         s->sstype = type;
         break;
+    case TIFF_XRES:
+    case TIFF_YRES:
+        set_sar(s, tag, value, value2);
+        break;
+    case TIFF_TILE_BYTE_COUNTS:
+    case TIFF_TILE_LENGTH:
+    case TIFF_TILE_OFFSETS:
+    case TIFF_TILE_WIDTH:
+        av_log(s->avctx, AV_LOG_ERROR, "Tiled images are not supported\n");
+        return AVERROR_PATCHWELCOME;
+        break;
     case TIFF_PREDICTOR:
         s->predictor = value;
         break;
+    case TIFF_SUB_IFDS:
+        s->sub_ifd = value;
+        break;
+    case TIFF_WHITE_LEVEL:
+        s->white_level = value;
+        break;
+    case TIFF_CFA_PATTERN_DIM:
+        if (count != 2 || (ff_tget(&s->gb, type, s->le) != 2 &&
+                           ff_tget(&s->gb, type, s->le) != 2)) {
+            av_log(s->avctx, AV_LOG_ERROR, "CFA Pattern dimensions are not 2x2\n");
+            return AVERROR_INVALIDDATA;
+        }
+        break;
+    case TIFF_CFA_PATTERN:
+        s->is_bayer = 1;
+        s->pattern[0] = ff_tget(&s->gb, type, s->le);
+        s->pattern[1] = ff_tget(&s->gb, type, s->le);
+        s->pattern[2] = ff_tget(&s->gb, type, s->le);
+        s->pattern[3] = ff_tget(&s->gb, type, s->le);
+        break;
     case TIFF_PHOTOMETRIC:
         switch (value) {
         case TIFF_PHOTOMETRIC_WHITE_IS_ZERO:
         case TIFF_PHOTOMETRIC_BLACK_IS_ZERO:
         case TIFF_PHOTOMETRIC_RGB:
         case TIFF_PHOTOMETRIC_PALETTE:
+        case TIFF_PHOTOMETRIC_SEPARATED:
+        case TIFF_PHOTOMETRIC_YCBCR:
+        case TIFF_PHOTOMETRIC_CFA:
             s->photometric = value;
             break;
         case TIFF_PHOTOMETRIC_ALPHA_MASK:
-        case TIFF_PHOTOMETRIC_SEPARATED:
-        case TIFF_PHOTOMETRIC_YCBCR:
         case TIFF_PHOTOMETRIC_CIE_LAB:
         case TIFF_PHOTOMETRIC_ICC_LAB:
         case TIFF_PHOTOMETRIC_ITU_LAB:
-        case TIFF_PHOTOMETRIC_CFA:
         case TIFF_PHOTOMETRIC_LOG_L:
         case TIFF_PHOTOMETRIC_LOG_LUV:
         case TIFF_PHOTOMETRIC_LINEAR_RAW:
@@ -516,15 +1136,22 @@ static int tiff_decode_tag(TiffContext *s)
         if (count / 3 > 256 ||
             bytestream2_get_bytes_left(&s->gb) < count / 3 * off * 3)
             return AVERROR_INVALIDDATA;
+
         pal_gb[0] = pal_gb[1] = pal_gb[2] = s->gb;
         bytestream2_skip(&pal_gb[1], count / 3 * off);
         bytestream2_skip(&pal_gb[2], count / 3 * off * 2);
+
         off = (type_sizes[type] - 1) << 3;
+        if (off > 31U) {
+            av_log(s->avctx, AV_LOG_ERROR, "palette shift %d is out of range\n", off);
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < count / 3; i++) {
             uint32_t p = 0xFF000000;
-            p |= (tget(&pal_gb[0], type, s->le) >> off) << 16;
-            p |= (tget(&pal_gb[1], type, s->le) >> off) << 8;
-            p |=  tget(&pal_gb[2], type, s->le) >> off;
+            p |= (ff_tget(&pal_gb[0], type, s->le) >> off) << 16;
+            p |= (ff_tget(&pal_gb[1], type, s->le) >> off) << 8;
+            p |=  ff_tget(&pal_gb[2], type, s->le) >> off;
             s->palette[i] = p;
         }
         s->palette_is_set = 1;
@@ -533,6 +1160,20 @@ static int tiff_decode_tag(TiffContext *s)
     case TIFF_PLANAR:
         s->planar = value == 2;
         break;
+    case TIFF_YCBCR_SUBSAMPLING:
+        if (count != 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "subsample count invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i < count; i++) {
+            s->subsampling[i] = ff_tget(&s->gb, type, s->le);
+            if (s->subsampling[i] <= 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "subsampling %d is invalid\n", s->subsampling[i]);
+                s->subsampling[i] = 1;
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        break;
     case TIFF_T4OPTIONS:
         if (s->compr == TIFF_G3)
             s->fax_opts = value;
@@ -541,14 +1182,162 @@ static int tiff_decode_tag(TiffContext *s)
         if (s->compr == TIFF_G4)
             s->fax_opts = value;
         break;
+#define ADD_METADATA(count, name, sep)\
+    if ((ret = add_metadata(count, type, name, sep, s, frame)) < 0) {\
+        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");\
+        goto end;\
+    }
+    case TIFF_MODEL_PIXEL_SCALE:
+        ADD_METADATA(count, "ModelPixelScaleTag", NULL);
+        break;
+    case TIFF_MODEL_TRANSFORMATION:
+        ADD_METADATA(count, "ModelTransformationTag", NULL);
+        break;
+    case TIFF_MODEL_TIEPOINT:
+        ADD_METADATA(count, "ModelTiepointTag", NULL);
+        break;
+    case TIFF_GEO_KEY_DIRECTORY:
+        if (s->geotag_count) {
+            avpriv_request_sample(s->avctx, "Multiple geo key directories\n");
+            return AVERROR_INVALIDDATA;
+        }
+        ADD_METADATA(1, "GeoTIFF_Version", NULL);
+        ADD_METADATA(2, "GeoTIFF_Key_Revision", ".");
+        s->geotag_count   = ff_tget_short(&s->gb, s->le);
+        if (s->geotag_count > count / 4 - 1) {
+            s->geotag_count = count / 4 - 1;
+            av_log(s->avctx, AV_LOG_WARNING, "GeoTIFF key directory buffer shorter than specified\n");
+        }
+        if (   bytestream2_get_bytes_left(&s->gb) < s->geotag_count * sizeof(int16_t) * 4
+            || s->geotag_count == 0) {
+            s->geotag_count = 0;
+            return -1;
+        }
+        s->geotags = av_mallocz_array(s->geotag_count, sizeof(TiffGeoTag));
+        if (!s->geotags) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+            s->geotag_count = 0;
+            goto end;
+        }
+        for (i = 0; i < s->geotag_count; i++) {
+            s->geotags[i].key    = ff_tget_short(&s->gb, s->le);
+            s->geotags[i].type   = ff_tget_short(&s->gb, s->le);
+            s->geotags[i].count  = ff_tget_short(&s->gb, s->le);
+
+            if (!s->geotags[i].type)
+                s->geotags[i].val  = get_geokey_val(s->geotags[i].key, ff_tget_short(&s->gb, s->le));
+            else
+                s->geotags[i].offset = ff_tget_short(&s->gb, s->le);
+        }
+        break;
+    case TIFF_GEO_DOUBLE_PARAMS:
+        if (count >= INT_MAX / sizeof(int64_t))
+            return AVERROR_INVALIDDATA;
+        if (bytestream2_get_bytes_left(&s->gb) < count * sizeof(int64_t))
+            return AVERROR_INVALIDDATA;
+        dp = av_malloc_array(count, sizeof(double));
+        if (!dp) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+            goto end;
+        }
+        for (i = 0; i < count; i++)
+            dp[i] = ff_tget_double(&s->gb, s->le);
+        for (i = 0; i < s->geotag_count; i++) {
+            if (s->geotags[i].type == TIFF_GEO_DOUBLE_PARAMS) {
+                if (s->geotags[i].count == 0
+                    || s->geotags[i].offset + s->geotags[i].count > count) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid GeoTIFF key %d\n", s->geotags[i].key);
+                } else if (s->geotags[i].val) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Duplicate GeoTIFF key %d\n", s->geotags[i].key);
+                } else {
+                    char *ap = doubles2str(&dp[s->geotags[i].offset], s->geotags[i].count, ", ");
+                    if (!ap) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+                        av_freep(&dp);
+                        return AVERROR(ENOMEM);
+                    }
+                    s->geotags[i].val = ap;
+                }
+            }
+        }
+        av_freep(&dp);
+        break;
+    case TIFF_GEO_ASCII_PARAMS:
+        pos = bytestream2_tell(&s->gb);
+        for (i = 0; i < s->geotag_count; i++) {
+            if (s->geotags[i].type == TIFF_GEO_ASCII_PARAMS) {
+                if (s->geotags[i].count == 0
+                    || s->geotags[i].offset +  s->geotags[i].count > count) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid GeoTIFF key %d\n", s->geotags[i].key);
+                } else {
+                    char *ap;
+
+                    bytestream2_seek(&s->gb, pos + s->geotags[i].offset, SEEK_SET);
+                    if (bytestream2_get_bytes_left(&s->gb) < s->geotags[i].count)
+                        return AVERROR_INVALIDDATA;
+                    if (s->geotags[i].val)
+                        return AVERROR_INVALIDDATA;
+                    ap = av_malloc(s->geotags[i].count);
+                    if (!ap) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+                        return AVERROR(ENOMEM);
+                    }
+                    bytestream2_get_bufferu(&s->gb, ap, s->geotags[i].count);
+                    ap[s->geotags[i].count - 1] = '\0'; //replace the "|" delimiter with a 0 byte
+                    s->geotags[i].val = ap;
+                }
+            }
+        }
+        break;
+    case TIFF_ARTIST:
+        ADD_METADATA(count, "artist", NULL);
+        break;
+    case TIFF_COPYRIGHT:
+        ADD_METADATA(count, "copyright", NULL);
+        break;
+    case TIFF_DATE:
+        ADD_METADATA(count, "date", NULL);
+        break;
+    case TIFF_DOCUMENT_NAME:
+        ADD_METADATA(count, "document_name", NULL);
+        break;
+    case TIFF_HOST_COMPUTER:
+        ADD_METADATA(count, "computer", NULL);
+        break;
+    case TIFF_IMAGE_DESCRIPTION:
+        ADD_METADATA(count, "description", NULL);
+        break;
+    case TIFF_MAKE:
+        ADD_METADATA(count, "make", NULL);
+        break;
+    case TIFF_MODEL:
+        ADD_METADATA(count, "model", NULL);
+        break;
+    case TIFF_PAGE_NAME:
+        ADD_METADATA(count, "page_name", NULL);
+        break;
+    case TIFF_PAGE_NUMBER:
+        ADD_METADATA(count, "page_number", " / ");
+        break;
+    case TIFF_SOFTWARE_NAME:
+        ADD_METADATA(count, "software", NULL);
+        break;
     default:
         if (s->avctx->err_recognition & AV_EF_EXPLODE) {
             av_log(s->avctx, AV_LOG_ERROR,
-                   "Unknown or unsupported tag %d/0X%0X\n",
+                   "Unknown or unsupported tag %d/0x%0X\n",
                    tag, tag);
             return AVERROR_INVALIDDATA;
         }
     }
+end:
+    if (s->bpp > 64U) {
+        av_log(s->avctx, AV_LOG_ERROR,
+                "This format is not supported (bpp=%d, %d components)\n",
+                s->bpp, count);
+        s->bpp = 0;
+        return AVERROR_INVALIDDATA;
+    }
     bytestream2_seek(&s->gb, start, SEEK_SET);
     return 0;
 }
@@ -558,8 +1347,9 @@ static int decode_frame(AVCodecContext *avctx,
 {
     TiffContext *const s = avctx->priv_data;
     AVFrame *const p = data;
+    ThreadFrame frame = { .f = data };
     unsigned off;
-    int id, le, ret, plane, planes;
+    int le, ret, plane, planes;
     int i, j, entries, stride;
     unsigned soff, ssize;
     uint8_t *dst;
@@ -569,48 +1359,69 @@ static int decode_frame(AVCodecContext *avctx,
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
     // parse image header
-    if (avpkt->size < 8)
-        return AVERROR_INVALIDDATA;
-    id = bytestream2_get_le16(&s->gb);
-    if (id == 0x4949)
-        le = 1;
-    else if (id == 0x4D4D)
-        le = 0;
-    else {
-        av_log(avctx, AV_LOG_ERROR, "TIFF header not found\n");
+    if ((ret = ff_tdecode_header(&s->gb, &le, &off))) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid TIFF header\n");
+        return ret;
+    } else if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
+        av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
         return AVERROR_INVALIDDATA;
     }
     s->le          = le;
+    // TIFF_BPP is not a required tag and defaults to 1
+again:
+    s->bppcount    = s->bpp = 1;
     s->photometric = TIFF_PHOTOMETRIC_NONE;
     s->compr       = TIFF_RAW;
     s->fill_order  = 0;
-    // As TIFF 6.0 specification puts it "An arbitrary but carefully chosen number
-    // that further identifies the file as a TIFF file"
-    if (tget_short(&s->gb, le) != 42) {
-        av_log(avctx, AV_LOG_ERROR,
-               "The answer to life, universe and everything is not correct!\n");
-        return AVERROR_INVALIDDATA;
-    }
+    s->white_level = 0;
+    s->is_bayer    = 0;
+    free_geotags(s);
+
     // Reset these offsets so we can tell if they were set this frame
     s->stripsizesoff = s->strippos = 0;
     /* parse image file directory */
-    off = tget_long(&s->gb, le);
-    if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
-        av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
-        return AVERROR_INVALIDDATA;
-    }
     bytestream2_seek(&s->gb, off, SEEK_SET);
-    entries = tget_short(&s->gb, le);
+    entries = ff_tget_short(&s->gb, le);
+    if (bytestream2_get_bytes_left(&s->gb) < entries * 12)
+        return AVERROR_INVALIDDATA;
     for (i = 0; i < entries; i++) {
-        if ((ret = tiff_decode_tag(s)) < 0)
+        if ((ret = tiff_decode_tag(s, p)) < 0)
+            return ret;
+    }
+
+    if (s->sub_ifd && s->get_subimage) {
+        off = s->sub_ifd;
+        if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
+            av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->sub_ifd = 0;
+        goto again;
+    }
+
+    for (i = 0; i<s->geotag_count; i++) {
+        const char *keyname = get_geokey_name(s->geotags[i].key);
+        if (!keyname) {
+            av_log(avctx, AV_LOG_WARNING, "Unknown or unsupported GeoTIFF key %d\n", s->geotags[i].key);
+            continue;
+        }
+        if (get_geokey_type(s->geotags[i].key) != s->geotags[i].type) {
+            av_log(avctx, AV_LOG_WARNING, "Type of GeoTIFF key %d is wrong\n", s->geotags[i].key);
+            continue;
+        }
+        ret = av_dict_set(&p->metadata, keyname, s->geotags[i].val, 0);
+        if (ret<0) {
+            av_log(avctx, AV_LOG_ERROR, "Writing metadata with key '%s' failed\n", keyname);
             return ret;
+        }
     }
+
     if (!s->strippos && !s->stripoff) {
         av_log(avctx, AV_LOG_ERROR, "Image data is missing\n");
         return AVERROR_INVALIDDATA;
     }
     /* now we have the data and may start decoding */
-    if ((ret = init_image(s, p)) < 0)
+    if ((ret = init_image(s, &frame)) < 0)
         return ret;
 
     if (s->strips == 1 && !s->stripsize) {
@@ -619,65 +1430,90 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     if (s->stripsizesoff) {
-        if (s->stripsizesoff >= avpkt->size)
+        if (s->stripsizesoff >= (unsigned)avpkt->size)
             return AVERROR_INVALIDDATA;
         bytestream2_init(&stripsizes, avpkt->data + s->stripsizesoff,
                          avpkt->size - s->stripsizesoff);
     }
     if (s->strippos) {
-        if (s->strippos >= avpkt->size)
+        if (s->strippos >= (unsigned)avpkt->size)
             return AVERROR_INVALIDDATA;
         bytestream2_init(&stripdata, avpkt->data + s->strippos,
                          avpkt->size - s->strippos);
     }
 
+    if (s->rps <= 0 || s->rps % s->subsampling[1]) {
+        av_log(avctx, AV_LOG_ERROR, "rps %d invalid\n", s->rps);
+        return AVERROR_INVALIDDATA;
+    }
+
     planes = s->planar ? s->bppcount : 1;
     for (plane = 0; plane < planes; plane++) {
+        int remaining = avpkt->size;
+        int decoded_height;
         stride = p->linesize[plane];
         dst = p->data[plane];
         for (i = 0; i < s->height; i += s->rps) {
+            if (i)
+                dst += s->rps * stride;
             if (s->stripsizesoff)
-                ssize = tget(&stripsizes, s->sstype, le);
+                ssize = ff_tget(&stripsizes, s->sstype, le);
             else
                 ssize = s->stripsize;
 
             if (s->strippos)
-                soff = tget(&stripdata, s->sot, le);
+                soff = ff_tget(&stripdata, s->sot, le);
             else
                 soff = s->stripoff;
 
-            if (soff > avpkt->size || ssize > avpkt->size - soff) {
+            if (soff > avpkt->size || ssize > avpkt->size - soff || ssize > remaining) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid strip size/offset\n");
                 return AVERROR_INVALIDDATA;
             }
-            if ((ret = tiff_unpack_strip(s, dst, stride, avpkt->data + soff, ssize,
+            remaining -= ssize;
+            if ((ret = tiff_unpack_strip(s, p, dst, stride, avpkt->data + soff, ssize, i,
                                          FFMIN(s->rps, s->height - i))) < 0) {
                 if (avctx->err_recognition & AV_EF_EXPLODE)
                     return ret;
                 break;
             }
-            dst += s->rps * stride;
         }
+        decoded_height = FFMIN(i, s->height);
+
         if (s->predictor == 2) {
+            if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+                av_log(s->avctx, AV_LOG_ERROR, "predictor == 2 with YUV is unsupported");
+                return AVERROR_PATCHWELCOME;
+            }
             dst   = p->data[plane];
             soff  = s->bpp >> 3;
+            if (s->planar)
+                soff  = FFMAX(soff / s->bppcount, 1);
             ssize = s->width * soff;
             if (s->avctx->pix_fmt == AV_PIX_FMT_RGB48LE ||
-                s->avctx->pix_fmt == AV_PIX_FMT_RGBA64LE) {
-                for (i = 0; i < s->height; i++) {
+                s->avctx->pix_fmt == AV_PIX_FMT_RGBA64LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GRAY16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_YA16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GBRP16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GBRAP16LE) {
+                for (i = 0; i < decoded_height; i++) {
                     for (j = soff; j < ssize; j += 2)
                         AV_WL16(dst + j, AV_RL16(dst + j) + AV_RL16(dst + j - soff));
                     dst += stride;
                 }
             } else if (s->avctx->pix_fmt == AV_PIX_FMT_RGB48BE ||
-                       s->avctx->pix_fmt == AV_PIX_FMT_RGBA64BE) {
-                for (i = 0; i < s->height; i++) {
+                       s->avctx->pix_fmt == AV_PIX_FMT_RGBA64BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GRAY16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_YA16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GBRP16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GBRAP16BE) {
+                for (i = 0; i < decoded_height; i++) {
                     for (j = soff; j < ssize; j += 2)
                         AV_WB16(dst + j, AV_RB16(dst + j) + AV_RB16(dst + j - soff));
                     dst += stride;
                 }
             } else {
-                for (i = 0; i < s->height; i++) {
+                for (i = 0; i < decoded_height; i++) {
                     for (j = soff; j < ssize; j++)
                         dst[j] += dst[j - soff];
                     dst += stride;
@@ -686,13 +1522,32 @@ static int decode_frame(AVCodecContext *avctx,
         }
 
         if (s->photometric == TIFF_PHOTOMETRIC_WHITE_IS_ZERO) {
+            int c = (s->avctx->pix_fmt == AV_PIX_FMT_PAL8 ? (1<<s->bpp) - 1 : 255);
             dst = p->data[plane];
             for (i = 0; i < s->height; i++) {
                 for (j = 0; j < stride; j++)
-                    dst[j] = 255 - dst[j];
+                    dst[j] = c - dst[j];
                 dst += stride;
             }
         }
+
+        if (s->photometric == TIFF_PHOTOMETRIC_SEPARATED &&
+            s->avctx->pix_fmt == AV_PIX_FMT_RGB0) {
+            dst = p->data[plane];
+            for (i = 0; i < s->height; i++) {
+                for (j = 0; j < s->width; j++) {
+                    int k =  255 - dst[4 * j + 3];
+                    int r = (255 - dst[4 * j    ]) * k;
+                    int g = (255 - dst[4 * j + 1]) * k;
+                    int b = (255 - dst[4 * j + 2]) * k;
+                    dst[4 * j    ] = r * 257 >> 16;
+                    dst[4 * j + 1] = g * 257 >> 16;
+                    dst[4 * j + 2] = b * 257 >> 16;
+                    dst[4 * j + 3] = 255;
+                }
+                dst += p->linesize[plane];
+            }
+        }
     }
 
     if (s->planar && s->bppcount > 2) {
@@ -702,6 +1557,15 @@ static int decode_frame(AVCodecContext *avctx,
         FFSWAP(int,      p->linesize[0], p->linesize[1]);
     }
 
+    if (s->is_bayer && s->white_level && s->bpp == 16) {
+        uint16_t *dst = (uint16_t *)p->data[0];
+        for (i = 0; i < s->height; i++) {
+            for (j = 0; j < s->width; j++)
+                dst[j] = FFMIN((dst[j] / (float)s->white_level) * 65535, 65535);
+            dst += stride / 2;
+        }
+    }
+
     *got_frame = 1;
 
     return avpkt->size;
@@ -713,8 +1577,12 @@ static av_cold int tiff_init(AVCodecContext *avctx)
 
     s->width  = 0;
     s->height = 0;
+    s->subsampling[0] =
+    s->subsampling[1] = 1;
     s->avctx  = avctx;
     ff_lzw_decode_open(&s->lzw);
+    if (!s->lzw)
+        return AVERROR(ENOMEM);
     ff_ccitt_unpack_init();
 
     return 0;
@@ -724,10 +1592,31 @@ static av_cold int tiff_end(AVCodecContext *avctx)
 {
     TiffContext *const s = avctx->priv_data;
 
+    free_geotags(s);
+
     ff_lzw_decode_close(&s->lzw);
+    av_freep(&s->deinvert_buf);
+    s->deinvert_buf_size = 0;
+    av_freep(&s->yuv_line);
+    s->yuv_line_size = 0;
+    av_freep(&s->fax_buffer);
+    s->fax_buffer_size = 0;
     return 0;
 }
 
+#define OFFSET(x) offsetof(TiffContext, x)
+static const AVOption tiff_options[] = {
+    { "subimage", "decode subimage instead if available", OFFSET(get_subimage), AV_OPT_TYPE_BOOL, {.i64=0},  0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM },
+    { NULL },
+};
+
+static const AVClass tiff_decoder_class = {
+    .class_name = "TIFF decoder",
+    .item_name  = av_default_item_name,
+    .option     = tiff_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_tiff_decoder = {
     .name           = "tiff",
     .long_name      = NULL_IF_CONFIG_SMALL("TIFF image"),
@@ -737,5 +1626,7 @@ AVCodec ff_tiff_decoder = {
     .init           = tiff_init,
     .close          = tiff_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(tiff_init),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &tiff_decoder_class,
 };
diff --git a/libavcodec/tiff.h b/libavcodec/tiff.h
index 68ac695..4b08650 100644
--- a/libavcodec/tiff.h
+++ b/libavcodec/tiff.h
@@ -1,27 +1,29 @@
 /*
- * TIFF tables
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * TIFF tables
+ *
+ * For more information about the TIFF format, check the official docs at:
+ * http://partners.adobe.com/public/developer/tiff/index.html
  * @author Konstantin Shishkov
  */
 
@@ -29,6 +31,7 @@
 #define AVCODEC_TIFF_H
 
 #include <stdint.h>
+#include "tiff_common.h"
 
 /** abridged list of TIFF tags */
 enum TiffTags {
@@ -39,6 +42,10 @@ enum TiffTags {
     TIFF_COMPR,
     TIFF_PHOTOMETRIC        = 0x106,
     TIFF_FILL_ORDER         = 0x10A,
+    TIFF_DOCUMENT_NAME      = 0x10D,
+    TIFF_IMAGE_DESCRIPTION  = 0x10E,
+    TIFF_MAKE               = 0x10F,
+    TIFF_MODEL              = 0x110,
     TIFF_STRIP_OFFS         = 0x111,
     TIFF_SAMPLES_PER_PIXEL  = 0x115,
     TIFF_ROWSPERSTRIP       = 0x116,
@@ -46,18 +53,39 @@ enum TiffTags {
     TIFF_XRES               = 0x11A,
     TIFF_YRES               = 0x11B,
     TIFF_PLANAR             = 0x11C,
+    TIFF_PAGE_NAME          = 0x11D,
     TIFF_XPOS               = 0x11E,
     TIFF_YPOS               = 0x11F,
     TIFF_T4OPTIONS          = 0x124,
     TIFF_T6OPTIONS,
     TIFF_RES_UNIT           = 0x128,
+    TIFF_PAGE_NUMBER        = 0x129,
     TIFF_SOFTWARE_NAME      = 0x131,
+    TIFF_DATE               = 0x132,
+    TIFF_ARTIST             = 0x13B,
+    TIFF_HOST_COMPUTER      = 0x13C,
     TIFF_PREDICTOR          = 0x13D,
     TIFF_PAL                = 0x140,
+    TIFF_TILE_WIDTH         = 0x142,
+    TIFF_TILE_LENGTH        = 0x143,
+    TIFF_TILE_OFFSETS       = 0x144,
+    TIFF_TILE_BYTE_COUNTS   = 0x145,
+    TIFF_SUB_IFDS           = 0x14A,
+    TIFF_EXTRASAMPLES       = 0x152,
     TIFF_YCBCR_COEFFICIENTS = 0x211,
     TIFF_YCBCR_SUBSAMPLING  = 0x212,
     TIFF_YCBCR_POSITIONING  = 0x213,
     TIFF_REFERENCE_BW       = 0x214,
+    TIFF_CFA_PATTERN_DIM    = 0x828D,
+    TIFF_CFA_PATTERN        = 0x828E,
+    TIFF_COPYRIGHT          = 0x8298,
+    TIFF_MODEL_TIEPOINT     = 0x8482,
+    TIFF_MODEL_PIXEL_SCALE  = 0x830E,
+    TIFF_MODEL_TRANSFORMATION= 0x8480,
+    TIFF_GEO_KEY_DIRECTORY  = 0x87AF,
+    TIFF_GEO_DOUBLE_PARAMS  = 0x87B0,
+    TIFF_GEO_ASCII_PARAMS   = 0x87B1,
+    TIFF_WHITE_LEVEL        = 0xC61D,
 };
 
 /** list of TIFF compression types */
@@ -75,12 +103,52 @@ enum TiffCompr {
     TIFF_LZMA     = 0x886D,
 };
 
-enum TiffTypes {
-    TIFF_BYTE = 1,
-    TIFF_STRING,
-    TIFF_SHORT,
-    TIFF_LONG,
-    TIFF_RATIONAL,
+enum TiffGeoTagKey {
+    TIFF_GT_MODEL_TYPE_GEOKEY                = 1024,
+    TIFF_GT_RASTER_TYPE_GEOKEY               = 1025,
+    TIFF_GT_CITATION_GEOKEY                  = 1026,
+    TIFF_GEOGRAPHIC_TYPE_GEOKEY              = 2048,
+    TIFF_GEOG_CITATION_GEOKEY                = 2049,
+    TIFF_GEOG_GEODETIC_DATUM_GEOKEY          = 2050,
+    TIFF_GEOG_PRIME_MERIDIAN_GEOKEY          = 2051,
+    TIFF_GEOG_LINEAR_UNITS_GEOKEY            = 2052,
+    TIFF_GEOG_LINEAR_UNIT_SIZE_GEOKEY        = 2053,
+    TIFF_GEOG_ANGULAR_UNITS_GEOKEY           = 2054,
+    TIFF_GEOG_ANGULAR_UNIT_SIZE_GEOKEY       = 2055,
+    TIFF_GEOG_ELLIPSOID_GEOKEY               = 2056,
+    TIFF_GEOG_SEMI_MAJOR_AXIS_GEOKEY         = 2057,
+    TIFF_GEOG_SEMI_MINOR_AXIS_GEOKEY         = 2058,
+    TIFF_GEOG_INV_FLATTENING_GEOKEY          = 2059,
+    TIFF_GEOG_AZIMUTH_UNITS_GEOKEY           = 2060,
+    TIFF_GEOG_PRIME_MERIDIAN_LONG_GEOKEY     = 2061,
+    TIFF_PROJECTED_CS_TYPE_GEOKEY            = 3072,
+    TIFF_PCS_CITATION_GEOKEY                 = 3073,
+    TIFF_PROJECTION_GEOKEY                   = 3074,
+    TIFF_PROJ_COORD_TRANS_GEOKEY             = 3075,
+    TIFF_PROJ_LINEAR_UNITS_GEOKEY            = 3076,
+    TIFF_PROJ_LINEAR_UNIT_SIZE_GEOKEY        = 3077,
+    TIFF_PROJ_STD_PARALLEL1_GEOKEY           = 3078,
+    TIFF_PROJ_STD_PARALLEL2_GEOKEY           = 3079,
+    TIFF_PROJ_NAT_ORIGIN_LONG_GEOKEY         = 3080,
+    TIFF_PROJ_NAT_ORIGIN_LAT_GEOKEY          = 3081,
+    TIFF_PROJ_FALSE_EASTING_GEOKEY           = 3082,
+    TIFF_PROJ_FALSE_NORTHING_GEOKEY          = 3083,
+    TIFF_PROJ_FALSE_ORIGIN_LONG_GEOKEY       = 3084,
+    TIFF_PROJ_FALSE_ORIGIN_LAT_GEOKEY        = 3085,
+    TIFF_PROJ_FALSE_ORIGIN_EASTING_GEOKEY    = 3086,
+    TIFF_PROJ_FALSE_ORIGIN_NORTHING_GEOKEY   = 3087,
+    TIFF_PROJ_CENTER_LONG_GEOKEY             = 3088,
+    TIFF_PROJ_CENTER_LAT_GEOKEY              = 3089,
+    TIFF_PROJ_CENTER_EASTING_GEOKEY          = 3090,
+    TIFF_PROJ_CENTER_NORTHING_GEOKEY         = 3091,
+    TIFF_PROJ_SCALE_AT_NAT_ORIGIN_GEOKEY     = 3092,
+    TIFF_PROJ_SCALE_AT_CENTER_GEOKEY         = 3093,
+    TIFF_PROJ_AZIMUTH_ANGLE_GEOKEY           = 3094,
+    TIFF_PROJ_STRAIGHT_VERT_POLE_LONG_GEOKEY = 3095,
+    TIFF_VERTICAL_CS_TYPE_GEOKEY             = 4096,
+    TIFF_VERTICAL_CITATION_GEOKEY            = 4097,
+    TIFF_VERTICAL_DATUM_GEOKEY               = 4098,
+    TIFF_VERTICAL_UNITS_GEOKEY               = 4099
 };
 
 enum TiffPhotometric {
@@ -101,9 +169,28 @@ enum TiffPhotometric {
     TIFF_PHOTOMETRIC_LINEAR_RAW = 34892, /* Linear Raw (DNG) */
 };
 
-/** sizes of various TIFF field types (string size = 100)*/
-static const uint8_t type_sizes[6] = {
-    0, 1, 100, 2, 4, 8
+enum TiffGeoTagType {
+    GEOTIFF_SHORT  = 0,
+    GEOTIFF_DOUBLE = 34736,
+    GEOTIFF_STRING = 34737
 };
 
+typedef struct TiffGeoTag {
+    enum TiffGeoTagKey key;
+    enum TiffTags type;
+    int count;
+    int offset;
+    char *val;
+} TiffGeoTag;
+
+typedef struct TiffGeoTagKeyName {
+    const enum TiffGeoTagKey key;
+    const char *const name;
+} TiffGeoTagKeyName;
+
+typedef struct TiffGeoTagNameType {
+    const char *const name;
+    const enum TiffGeoTagType type;
+} TiffGeoTagNameType;
+
 #endif /* AVCODEC_TIFF_H */
diff --git a/libavcodec/tiff_common.c b/libavcodec/tiff_common.c
new file mode 100644
index 0000000..0af62ee
--- /dev/null
+++ b/libavcodec/tiff_common.c
@@ -0,0 +1,313 @@
+/*
+ * TIFF Common Routines
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF Common Routines
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#include "tiff_common.h"
+
+
+int ff_tis_ifd(unsigned tag)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(ifd_tags); i++) {
+        if (ifd_tags[i] == tag) {
+            return i + 1;
+        }
+    }
+    return 0;
+}
+
+
+unsigned ff_tget_short(GetByteContext *gb, int le)
+{
+    return le ? bytestream2_get_le16(gb) : bytestream2_get_be16(gb);
+}
+
+
+unsigned ff_tget_long(GetByteContext *gb, int le)
+{
+    return le ? bytestream2_get_le32(gb) : bytestream2_get_be32(gb);
+}
+
+
+double ff_tget_double(GetByteContext *gb, int le)
+{
+    av_alias64 i = { .u64 = le ? bytestream2_get_le64(gb) : bytestream2_get_be64(gb)};
+    return i.f64;
+}
+
+
+unsigned ff_tget(GetByteContext *gb, int type, int le)
+{
+    switch (type) {
+    case TIFF_BYTE:  return bytestream2_get_byte(gb);
+    case TIFF_SHORT: return ff_tget_short(gb, le);
+    case TIFF_LONG:  return ff_tget_long(gb, le);
+    default:         return UINT_MAX;
+    }
+}
+
+static const char *auto_sep(int count, const char *sep, int i, int columns)
+{
+    if (sep)
+        return i ? sep : "";
+    if (i && i%columns) {
+        return ", ";
+    } else
+        return columns < count ? "\n" : "";
+}
+
+int ff_tadd_rational_metadata(int count, const char *name, const char *sep,
+                              GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int32_t nom, denom;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int64_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int64_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        nom   = ff_tget_long(gb, le);
+        denom = ff_tget_long(gb, le);
+        av_bprintf(&bp, "%s%7"PRId32":%-7"PRId32, auto_sep(count, sep, i, 4), nom, denom);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_long_metadata(int count, const char *name, const char *sep,
+                          GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int32_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int32_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        av_bprintf(&bp, "%s%7i", auto_sep(count, sep, i, 8), ff_tget_long(gb, le));
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_doubles_metadata(int count, const char *name, const char *sep,
+                             GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int64_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int64_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, 100 * count);
+
+    for (i = 0; i < count; i++) {
+        av_bprintf(&bp, "%s%.15g", auto_sep(count, sep, i, 4), ff_tget_double(gb, le));
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_shorts_metadata(int count, const char *name, const char *sep,
+                            GetByteContext *gb, int le, int is_signed, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int16_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int16_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        int v = is_signed ? (int16_t)ff_tget_short(gb, le) :  ff_tget_short(gb, le);
+        av_bprintf(&bp, "%s%5i", auto_sep(count, sep, i, 8), v);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_bytes_metadata(int count, const char *name, const char *sep,
+                           GetByteContext *gb, int le, int is_signed, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int8_t) || count < 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int8_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        int v = is_signed ? (int8_t)bytestream2_get_byte(gb) :  bytestream2_get_byte(gb);
+        av_bprintf(&bp, "%s%3i", auto_sep(count, sep, i, 16), v);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+int ff_tadd_string_metadata(int count, const char *name,
+                            GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    char *value;
+
+    if (bytestream2_get_bytes_left(gb) < count || count < 0)
+        return AVERROR_INVALIDDATA;
+
+    value = av_malloc(count + 1);
+    if (!value)
+        return AVERROR(ENOMEM);
+
+    bytestream2_get_bufferu(gb, value, count);
+    value[count] = 0;
+
+    av_dict_set(metadata, name, value, AV_DICT_DONT_STRDUP_VAL);
+    return 0;
+}
+
+
+int ff_tdecode_header(GetByteContext *gb, int *le, int *ifd_offset)
+{
+    if (bytestream2_get_bytes_left(gb) < 8) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    *le = bytestream2_get_le16u(gb);
+    if (*le == AV_RB16("II")) {
+        *le = 1;
+    } else if (*le == AV_RB16("MM")) {
+        *le = 0;
+    } else {
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ff_tget_short(gb, *le) != 42) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    *ifd_offset = ff_tget_long(gb, *le);
+
+    return 0;
+}
+
+
+int ff_tread_tag(GetByteContext *gb, int le, unsigned *tag, unsigned *type,
+                 unsigned *count, int *next)
+{
+    int ifd_tag;
+    int valid_type;
+
+    *tag    = ff_tget_short(gb, le);
+    *type   = ff_tget_short(gb, le);
+    *count  = ff_tget_long (gb, le);
+
+    ifd_tag    = ff_tis_ifd(*tag);
+    valid_type = *type != 0 && *type < FF_ARRAY_ELEMS(type_sizes);
+
+    *next = bytestream2_tell(gb) + 4;
+
+    // check for valid type
+    if (!valid_type) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    // seek to offset if this is an IFD-tag or
+    // if count values do not fit into the offset value
+    if (ifd_tag || (*count > 4 || !(type_sizes[*type] * (*count) <= 4 || *type == TIFF_STRING))) {
+        bytestream2_seek(gb, ff_tget_long (gb, le), SEEK_SET);
+    }
+
+    return 0;
+}
diff --git a/libavcodec/tiff_common.h b/libavcodec/tiff_common.h
new file mode 100644
index 0000000..03558c3
--- /dev/null
+++ b/libavcodec/tiff_common.h
@@ -0,0 +1,152 @@
+/*
+ * TIFF Common Routines
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF Common Routines
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#ifndef AVCODEC_TIFF_COMMON_H
+#define AVCODEC_TIFF_COMMON_H
+
+#include "avcodec.h"
+#include "tiff.h"
+#include "bytestream.h"
+#include "libavutil/bprint.h"
+
+/** data type identifiers for TIFF tags */
+enum TiffTypes {
+    TIFF_BYTE = 1,
+    TIFF_STRING,
+    TIFF_SHORT,
+    TIFF_LONG,
+    TIFF_RATIONAL,
+    TIFF_SBYTE,
+    TIFF_UNDEFINED,
+    TIFF_SSHORT,
+    TIFF_SLONG,
+    TIFF_SRATIONAL,
+    TIFF_FLOAT,
+    TIFF_DOUBLE,
+    TIFF_IFD
+};
+
+/** sizes of various TIFF field types (string size = 100)*/
+static const uint8_t type_sizes[14] = {
+    0, 1, 100, 2, 4, 8, 1, 1, 2, 4, 8, 4, 8, 4
+};
+
+static const uint16_t ifd_tags[] = {
+    0x8769, // EXIF IFD
+    0x8825, // GPS IFD
+    0xA005  // Interoperability IFD
+};
+
+
+/** Returns a value > 0 if the tag is a known IFD-tag.
+ *  The return value is the array index + 1 within ifd_tags[].
+ */
+int ff_tis_ifd(unsigned tag);
+
+/** Reads a short from the bytestream using given endianness. */
+unsigned ff_tget_short(GetByteContext *gb, int le);
+
+/** Reads a long from the bytestream using given endianness. */
+unsigned ff_tget_long(GetByteContext *gb, int le);
+
+/** Reads a double from the bytestream using given endianness. */
+double   ff_tget_double(GetByteContext *gb, int le);
+
+/** Reads a byte from the bytestream using given endianness. */
+unsigned ff_tget(GetByteContext *gb, int type, int le);
+
+/** Returns an allocated string containing count
+ *  rational values using the given separator.
+ */
+char *ff_trationals2str(int *rp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  long values using the given separator.
+ */
+char *ff_tlongs2str(int32_t *lp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  double values using the given separator.
+ */
+char *ff_tdoubles2str(double *dp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  short values using the given separator.
+ */
+char *ff_tshorts2str(int16_t *sp, int count, const char *sep);
+
+/** Adds count rationals converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_rational_metadata(int count, const char *name, const char *sep,
+                              GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count longs converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_long_metadata(int count, const char *name, const char *sep,
+                          GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count doubles converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_doubles_metadata(int count, const char *name, const char *sep,
+                             GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count shorts converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_shorts_metadata(int count, const char *name, const char *sep,
+                            GetByteContext *gb, int le, int is_signed, AVDictionary **metadata);
+
+/** Adds count bytes converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_bytes_metadata(int count, const char *name, const char *sep,
+                           GetByteContext *gb, int le, int is_signed, AVDictionary **metadata);
+
+/** Adds a string of count characters
+ *  into the metadata dictionary.
+ */
+int ff_tadd_string_metadata(int count, const char *name,
+                            GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Decodes a TIFF header from the input bytestream
+ *  and sets the endianness in *le and the offset to
+ *  the first IFD in *ifd_offset accordingly.
+ */
+int ff_tdecode_header(GetByteContext *gb, int *le, int *ifd_offset);
+
+/** Reads the first 3 fields of a TIFF tag, which are
+ *  the tag id, the tag type and the count of values for that tag.
+ *  Afterwards the bytestream is located at the first value to read and
+ *  *next holds the bytestream offset of the following tag.
+ */
+int ff_tread_tag(GetByteContext *gb, int le, unsigned *tag, unsigned *type,
+                 unsigned *count, int *next);
+
+#endif /* AVCODEC_TIFF_COMMON_H */
diff --git a/libavcodec/tiff_data.c b/libavcodec/tiff_data.c
new file mode 100644
index 0000000..88c2256
--- /dev/null
+++ b/libavcodec/tiff_data.c
@@ -0,0 +1,1870 @@
+/*
+ * TIFF data tables
+ * Copyright (c) 2011 Thomas Kuehnel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF data tables
+ * @author Thomas Kuehnel
+ * @see GeoTIFF specification at
+ * http://www.remotesensing.org/geotiff/spec/geotiffhome.html
+ */
+
+#include "tiff_data.h"
+
+const TiffGeoTagNameType ff_tiff_conf_name_type_map[] = {
+    {"GTModelTypeGeoKey",              GEOTIFF_SHORT },
+    {"GTRasterTypeGeoKey",             GEOTIFF_SHORT },
+    {"GTCitationGeoKey",               GEOTIFF_STRING}
+};
+
+const TiffGeoTagNameType ff_tiff_geog_name_type_map[] = {
+    {"GeographicTypeGeoKey",           GEOTIFF_SHORT },
+    {"GeogCitationGeoKey",             GEOTIFF_STRING},
+    {"GeogGeodeticDatumGeoKey",        GEOTIFF_SHORT },
+    {"GeogPrimeMeridianGeoKey",        GEOTIFF_SHORT },
+    {"GeogLinearUnitsGeoKey",          GEOTIFF_SHORT },
+    {"GeogLinearUnitSizeGeoKey",       GEOTIFF_DOUBLE},
+    {"GeogAngularUnitsGeoKey",         GEOTIFF_SHORT },
+    {"GeogAngularUnitSizeGeoKey",      GEOTIFF_DOUBLE},
+    {"GeogEllipsoidGeoKey",            GEOTIFF_SHORT },
+    {"GeogSemiMajorAxisGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogSemiMinorAxisGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogInvFlatteningGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogAzimuthUnitsGeoKey",         GEOTIFF_SHORT },
+    {"GeogPrimeMeridianLongGeoKey",    GEOTIFF_DOUBLE}
+};
+
+const TiffGeoTagNameType ff_tiff_proj_name_type_map[] = {
+    {"ProjectedCSTypeGeoKey",          GEOTIFF_SHORT },
+    {"PCSCitationGeoKey",              GEOTIFF_STRING},
+    {"ProjectionGeoKey",               GEOTIFF_SHORT },
+    {"ProjCoordTransGeoKey",           GEOTIFF_SHORT },
+    {"ProjLinearUnitsGeoKey",          GEOTIFF_SHORT },
+    {"ProjLinearUnitSizeGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjStdParallel1GeoKey",         GEOTIFF_DOUBLE},
+    {"ProjStdParallel2GeoKey",         GEOTIFF_DOUBLE},
+    {"ProjNatOriginLongGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjNatOriginLatGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjFalseEastingGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjFalseNorthingGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjFalseOriginLongGeoKey",      GEOTIFF_DOUBLE},
+    {"ProjFalseOriginLatGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjFalseOriginEastingGeoKey",   GEOTIFF_DOUBLE},
+    {"ProjFalseOriginNorthingGeoKey",  GEOTIFF_DOUBLE},
+    {"ProjCenterLongGeoKey",           GEOTIFF_DOUBLE},
+    {"ProjCenterLatGeoKey",            GEOTIFF_DOUBLE},
+    {"ProjCenterEastingGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjCenterNorthingGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjScaleAtNatOriginGeoKey",     GEOTIFF_DOUBLE},
+    {"ProjScaleAtCenterGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjAzimuthAngleGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjStraightVertPoleLongGeoKey", GEOTIFF_DOUBLE}
+};
+
+const TiffGeoTagNameType ff_tiff_vert_name_type_map[] = {
+    {"VerticalCSTypeGeoKey",           GEOTIFF_SHORT },
+    {"VerticalCitationGeoKey",         GEOTIFF_STRING},
+    {"VerticalDatumGeoKey",            GEOTIFF_SHORT },
+    {"VerticalUnitsGeoKey",            GEOTIFF_SHORT }
+};
+
+const char *const ff_tiff_gt_model_type_codes[] = {
+    "ModelTypeProjected",
+    "ModelTypeGeographic",
+    "ModelTypeGeocentric"
+};
+
+const char *const ff_tiff_gt_raster_type_codes[] = {
+    "RasterPixelIsArea",
+    "RasterPixelIsPoint"
+};
+
+const char *const ff_tiff_linear_unit_codes[] = {
+    "Linear_Meter",
+    "Linear_Foot",
+    "Linear_Foot_US_Survey",
+    "Linear_Foot_Modified_American",
+    "Linear_Foot_Clarke",
+    "Linear_Foot_Indian",
+    "Linear_Link",
+    "Linear_Link_Benoit",
+    "Linear_Link_Sears",
+    "Linear_Chain_Benoit",
+    "Linear_Chain_Sears",
+    "Linear_Yard_Sears",
+    "Linear_Yard_Indian",
+    "Linear_Fathom",
+    "Linear_Mile_International_Nautical"
+};
+
+const char *const ff_tiff_angular_unit_codes[] = {
+    "Angular_Radian",
+    "Angular_Degree",
+    "Angular_Arc_Minute",
+    "Angular_Arc_Second",
+    "Angular_Grad",
+    "Angular_Gon",
+    "Angular_DMS",
+    "Angular_DMS_Hemisphere"
+};
+
+const char *const ff_tiff_gcs_type_codes[] = {
+    "GCS_Adindan",
+    "GCS_AGD66",
+    "GCS_AGD84",
+    "GCS_Ain_el_Abd",
+    "GCS_Afgooye",
+    "GCS_Agadez",
+    "GCS_Lisbon",
+    "GCS_Aratu",
+    "GCS_Arc_1950",
+    "GCS_Arc_1960",
+    "GCS_Batavia",
+    "GCS_Barbados",
+    "GCS_Beduaram",
+    "GCS_Beijing_1954",
+    "GCS_Belge_1950",
+    "GCS_Bermuda_1957",
+    "GCS_Bern_1898",
+    "GCS_Bogota",
+    "GCS_Bukit_Rimpah",
+    "GCS_Camacupa",
+    "GCS_Campo_Inchauspe",
+    "GCS_Cape",
+    "GCS_Carthage",
+    "GCS_Chua",
+    "GCS_Corrego_Alegre",
+    "GCS_Cote_d_Ivoire",
+    "GCS_Deir_ez_Zor",
+    "GCS_Douala",
+    "GCS_Egypt_1907",
+    "GCS_ED50",
+    "GCS_ED87",
+    "GCS_Fahud",
+    "GCS_Gandajika_1970",
+    "GCS_Garoua",
+    "GCS_Guyane_Francaise",
+    "GCS_Hu_Tzu_Shan",
+    "GCS_HD72",
+    "GCS_ID74",
+    "GCS_Indian_1954",
+    "GCS_Indian_1975",
+    "GCS_Jamaica_1875",
+    "GCS_JAD69",
+    "GCS_Kalianpur",
+    "GCS_Kandawala",
+    "GCS_Kertau",
+    "GCS_KOC",
+    "GCS_La_Canoa",
+    "GCS_PSAD56",
+    "GCS_Lake",
+    "GCS_Leigon",
+    "GCS_Liberia_1964",
+    "GCS_Lome",
+    "GCS_Luzon_1911",
+    "GCS_Hito_XVIII_1963",
+    "GCS_Herat_North",
+    "GCS_Mahe_1971",
+    "GCS_Makassar",
+    "GCS_EUREF89",
+    "GCS_Malongo_1987",
+    "GCS_Manoca",
+    "GCS_Merchich",
+    "GCS_Massawa",
+    "GCS_Minna",
+    "GCS_Mhast",
+    "GCS_Monte_Mario",
+    "GCS_M_poraloko",
+    "GCS_NAD27",
+    "GCS_NAD_Michigan",
+    "GCS_NAD83",
+    "GCS_Nahrwan_1967",
+    "GCS_Naparima_1972",
+    "GCS_GD49",
+    "GCS_NGO_1948",
+    "GCS_Datum_73",
+    "GCS_NTF",
+    "GCS_NSWC_9Z_2",
+    "GCS_OSGB_1936",
+    "GCS_OSGB70",
+    "GCS_OS_SN80",
+    "GCS_Padang",
+    "GCS_Palestine_1923",
+    "GCS_Pointe_Noire",
+    "GCS_GDA94",
+    "GCS_Pulkovo_1942",
+    "GCS_Qatar",
+    "GCS_Qatar_1948",
+    "GCS_Qornoq",
+    "GCS_Loma_Quintana",
+    "GCS_Amersfoort",
+    "GCS_RT38",
+    "GCS_SAD69",
+    "GCS_Sapper_Hill_1943",
+    "GCS_Schwarzeck",
+    "GCS_Segora",
+    "GCS_Serindung",
+    "GCS_Sudan",
+    "GCS_Tananarive",
+    "GCS_Timbalai_1948",
+    "GCS_TM65",
+    "GCS_TM75",
+    "GCS_Tokyo",
+    "GCS_Trinidad_1903",
+    "GCS_TC_1948",
+    "GCS_Voirol_1875",
+    "GCS_Voirol_Unifie",
+    "GCS_Bern_1938",
+    "GCS_Nord_Sahara_1959",
+    "GCS_Stockholm_1938",
+    "GCS_Yacare",
+    "GCS_Yoff",
+    "GCS_Zanderij",
+    "GCS_MGI",
+    "GCS_Belge_1972",
+    "GCS_DHDN",
+    "GCS_Conakry_1905",
+    "GCS_WGS_72",
+    "GCS_WGS_72BE",
+    "GCS_WGS_84",
+    "GCS_Bern_1898_Bern",
+    "GCS_Bogota_Bogota",
+    "GCS_Lisbon_Lisbon",
+    "GCS_Makassar_Jakarta",
+    "GCS_MGI_Ferro",
+    "GCS_Monte_Mario_Rome",
+    "GCS_NTF_Paris",
+    "GCS_Padang_Jakarta",
+    "GCS_Belge_1950_Brussels",
+    "GCS_Tananarive_Paris",
+    "GCS_Voirol_1875_Paris",
+    "GCS_Voirol_Unifie_Paris",
+    "GCS_Batavia_Jakarta",
+    "GCS_ATF_Paris",
+    "GCS_NDG_Paris"
+};
+
+const char *const ff_tiff_gcse_type_codes[] = {
+    "GCSE_Airy1830",
+    "GCSE_AiryModified1849",
+    "GCSE_AustralianNationalSpheroid",
+    "GCSE_Bessel1841",
+    "GCSE_BesselModified",
+    "GCSE_BesselNamibia",
+    "GCSE_Clarke1858",
+    "GCSE_Clarke1866",
+    "GCSE_Clarke1866Michigan",
+    "GCSE_Clarke1880_Benoit",
+    "GCSE_Clarke1880_IGN",
+    "GCSE_Clarke1880_RGS",
+    "GCSE_Clarke1880_Arc",
+    "GCSE_Clarke1880_SGA1922",
+    "GCSE_Everest1830_1937Adjustment",
+    "GCSE_Everest1830_1967Definition",
+    "GCSE_Everest1830_1975Definition",
+    "GCSE_Everest1830Modified",
+    "GCSE_GRS1980",
+    "GCSE_Helmert1906",
+    "GCSE_IndonesianNationalSpheroid",
+    "GCSE_International1924",
+    "GCSE_International1967",
+    "GCSE_Krassowsky1940",
+    "GCSE_NWL9D",
+    "GCSE_NWL10D",
+    "GCSE_Plessis1817",
+    "GCSE_Struve1860",
+    "GCSE_WarOffice",
+    "GCSE_WGS84",
+    "GCSE_GEM10C",
+    "GCSE_OSU86F",
+    "GCSE_OSU91A",
+    "GCSE_Clarke1880",
+    "GCSE_Sphere"
+};
+
+const char *const ff_tiff_geodetic_datum_codes[] = {
+    "Datum_Adindan",
+    "Datum_Australian_Geodetic_Datum_1966",
+    "Datum_Australian_Geodetic_Datum_1984",
+    "Datum_Ain_el_Abd_1970",
+    "Datum_Afgooye",
+    "Datum_Agadez",
+    "Datum_Lisbon",
+    "Datum_Aratu",
+    "Datum_Arc_1950",
+    "Datum_Arc_1960",
+    "Datum_Batavia",
+    "Datum_Barbados",
+    "Datum_Beduaram",
+    "Datum_Beijing_1954",
+    "Datum_Reseau_National_Belge_1950",
+    "Datum_Bermuda_1957",
+    "Datum_Bern_1898",
+    "Datum_Bogota",
+    "Datum_Bukit_Rimpah",
+    "Datum_Camacupa",
+    "Datum_Campo_Inchauspe",
+    "Datum_Cape",
+    "Datum_Carthage",
+    "Datum_Chua",
+    "Datum_Corrego_Alegre",
+    "Datum_Cote_d_Ivoire",
+    "Datum_Deir_ez_Zor",
+    "Datum_Douala",
+    "Datum_Egypt_1907",
+    "Datum_European_Datum_1950",
+    "Datum_European_Datum_1987",
+    "Datum_Fahud",
+    "Datum_Gandajika_1970",
+    "Datum_Garoua",
+    "Datum_Guyane_Francaise",
+    "Datum_Hu_Tzu_Shan",
+    "Datum_Hungarian_Datum_1972",
+    "Datum_Indonesian_Datum_1974",
+    "Datum_Indian_1954",
+    "Datum_Indian_1975",
+    "Datum_Jamaica_1875",
+    "Datum_Jamaica_1969",
+    "Datum_Kalianpur",
+    "Datum_Kandawala",
+    "Datum_Kertau",
+    "Datum_Kuwait_Oil_Company",
+    "Datum_La_Canoa",
+    "Datum_Provisional_S_American_Datum_1956",
+    "Datum_Lake",
+    "Datum_Leigon",
+    "Datum_Liberia_1964",
+    "Datum_Lome",
+    "Datum_Luzon_1911",
+    "Datum_Hito_XVIII_1963",
+    "Datum_Herat_North",
+    "Datum_Mahe_1971",
+    "Datum_Makassar",
+    "Datum_European_Reference_System_1989",
+    "Datum_Malongo_1987",
+    "Datum_Manoca",
+    "Datum_Merchich",
+    "Datum_Massawa",
+    "Datum_Minna",
+    "Datum_Mhast",
+    "Datum_Monte_Mario",
+    "Datum_M_poraloko",
+    "Datum_North_American_Datum_1927",
+    "Datum_NAD_Michigan",
+    "Datum_North_American_Datum_1983",
+    "Datum_Nahrwan_1967",
+    "Datum_Naparima_1972",
+    "Datum_New_Zealand_Geodetic_Datum_1949",
+    "Datum_NGO_1948",
+    "Datum_Datum_73",
+    "Datum_Nouvelle_Triangulation_Francaise",
+    "Datum_NSWC_9Z_2",
+    "Datum_OSGB_1936",
+    "Datum_OSGB_1970_SN",
+    "Datum_OS_SN_1980",
+    "Datum_Padang_1884",
+    "Datum_Palestine_1923",
+    "Datum_Pointe_Noire",
+    "Datum_Geocentric_Datum_of_Australia_1994",
+    "Datum_Pulkovo_1942",
+    "Datum_Qatar",
+    "Datum_Qatar_1948",
+    "Datum_Qornoq",
+    "Datum_Loma_Quintana",
+    "Datum_Amersfoort",
+    "Datum_RT38",
+    "Datum_South_American_Datum_1969",
+    "Datum_Sapper_Hill_1943",
+    "Datum_Schwarzeck",
+    "Datum_Segora",
+    "Datum_Serindung",
+    "Datum_Sudan",
+    "Datum_Tananarive_1925",
+    "Datum_Timbalai_1948",
+    "Datum_TM65",
+    "Datum_TM75",
+    "Datum_Tokyo",
+    "Datum_Trinidad_1903",
+    "Datum_Trucial_Coast_1948",
+    "Datum_Voirol_1875",
+    "Datum_Voirol_Unifie_1960",
+    "Datum_Bern_1938",
+    "Datum_Nord_Sahara_1959",
+    "Datum_Stockholm_1938",
+    "Datum_Yacare",
+    "Datum_Yoff",
+    "Datum_Zanderij",
+    "Datum_Militar_Geographische_Institut",
+    "Datum_Reseau_National_Belge_1972",
+    "Datum_Deutsche_Hauptdreiecksnetz",
+    "Datum_Conakry_1905",
+    "Datum_WGS72",
+    "Datum_WGS72_Transit_Broadcast_Ephemeris",
+    "Datum_WGS84",
+    "Datum_Ancienne_Triangulation_Francaise",
+    "Datum_Nord_de_Guerre"
+};
+
+const char *const ff_tiff_geodetic_datum_e_codes[] = {
+    "DatumE_Airy1830",
+    "DatumE_AiryModified1849",
+    "DatumE_AustralianNationalSpheroid",
+    "DatumE_Bessel1841",
+    "DatumE_BesselModified",
+    "DatumE_BesselNamibia",
+    "DatumE_Clarke1858",
+    "DatumE_Clarke1866",
+    "DatumE_Clarke1866Michigan",
+    "DatumE_Clarke1880_Benoit",
+    "DatumE_Clarke1880_IGN",
+    "DatumE_Clarke1880_RGS",
+    "DatumE_Clarke1880_Arc",
+    "DatumE_Clarke1880_SGA1922",
+    "DatumE_Everest1830_1937Adjustment",
+    "DatumE_Everest1830_1967Definition",
+    "DatumE_Everest1830_1975Definition",
+    "DatumE_Everest1830Modified",
+    "DatumE_GRS1980",
+    "DatumE_Helmert1906",
+    "DatumE_IndonesianNationalSpheroid",
+    "DatumE_International1924",
+    "DatumE_International1967",
+    "DatumE_Krassowsky1960",
+    "DatumE_NWL9D",
+    "DatumE_NWL10D",
+    "DatumE_Plessis1817",
+    "DatumE_Struve1860",
+    "DatumE_WarOffice",
+    "DatumE_WGS84",
+    "DatumE_GEM10C",
+    "DatumE_OSU86F",
+    "DatumE_OSU91A",
+    "DatumE_Clarke1880",
+    "DatumE_Sphere"
+};
+
+const char *const ff_tiff_ellipsoid_codes[] = {
+    "Ellipse_Airy_1830",
+    "Ellipse_Airy_Modified_1849",
+    "Ellipse_Australian_National_Spheroid",
+    "Ellipse_Bessel_1841",
+    "Ellipse_Bessel_Modified",
+    "Ellipse_Bessel_Namibia",
+    "Ellipse_Clarke_1858",
+    "Ellipse_Clarke_1866",
+    "Ellipse_Clarke_1866_Michigan",
+    "Ellipse_Clarke_1880_Benoit",
+    "Ellipse_Clarke_1880_IGN",
+    "Ellipse_Clarke_1880_RGS",
+    "Ellipse_Clarke_1880_Arc",
+    "Ellipse_Clarke_1880_SGA_1922",
+    "Ellipse_Everest_1830_1937_Adjustment",
+    "Ellipse_Everest_1830_1967_Definition",
+    "Ellipse_Everest_1830_1975_Definition",
+    "Ellipse_Everest_1830_Modified",
+    "Ellipse_GRS_1980",
+    "Ellipse_Helmert_1906",
+    "Ellipse_Indonesian_National_Spheroid",
+    "Ellipse_International_1924",
+    "Ellipse_International_1967",
+    "Ellipse_Krassowsky_1940",
+    "Ellipse_NWL_9D",
+    "Ellipse_NWL_10D",
+    "Ellipse_Plessis_1817",
+    "Ellipse_Struve_1860",
+    "Ellipse_War_Office",
+    "Ellipse_WGS_84",
+    "Ellipse_GEM_10C",
+    "Ellipse_OSU86F",
+    "Ellipse_OSU91A",
+    "Ellipse_Clarke_1880",
+    "Ellipse_Sphere"
+};
+
+const char *const ff_tiff_prime_meridian_codes[] = {
+    "PM_Greenwich",
+    "PM_Lisbon",
+    "PM_Paris",
+    "PM_Bogota",
+    "PM_Madrid",
+    "PM_Rome",
+    "PM_Bern",
+    "PM_Jakarta",
+    "PM_Ferro",
+    "PM_Brussels",
+    "PM_Stockholm"
+};
+
+const TiffGeoTagKeyName ff_tiff_proj_cs_type_codes[] = {
+    {20137, "PCS_Adindan_UTM_zone_37N"},
+    {20138, "PCS_Adindan_UTM_zone_38N"},
+    {20248, "PCS_AGD66_AMG_zone_48"},
+    {20249, "PCS_AGD66_AMG_zone_49"},
+    {20250, "PCS_AGD66_AMG_zone_50"},
+    {20251, "PCS_AGD66_AMG_zone_51"},
+    {20252, "PCS_AGD66_AMG_zone_52"},
+    {20253, "PCS_AGD66_AMG_zone_53"},
+    {20254, "PCS_AGD66_AMG_zone_54"},
+    {20255, "PCS_AGD66_AMG_zone_55"},
+    {20256, "PCS_AGD66_AMG_zone_56"},
+    {20257, "PCS_AGD66_AMG_zone_57"},
+    {20258, "PCS_AGD66_AMG_zone_58"},
+    {20348, "PCS_AGD84_AMG_zone_48"},
+    {20349, "PCS_AGD84_AMG_zone_49"},
+    {20350, "PCS_AGD84_AMG_zone_50"},
+    {20351, "PCS_AGD84_AMG_zone_51"},
+    {20352, "PCS_AGD84_AMG_zone_52"},
+    {20353, "PCS_AGD84_AMG_zone_53"},
+    {20354, "PCS_AGD84_AMG_zone_54"},
+    {20355, "PCS_AGD84_AMG_zone_55"},
+    {20356, "PCS_AGD84_AMG_zone_56"},
+    {20357, "PCS_AGD84_AMG_zone_57"},
+    {20358, "PCS_AGD84_AMG_zone_58"},
+    {20437, "PCS_Ain_el_Abd_UTM_zone_37N"},
+    {20438, "PCS_Ain_el_Abd_UTM_zone_38N"},
+    {20439, "PCS_Ain_el_Abd_UTM_zone_39N"},
+    {20499, "PCS_Ain_el_Abd_Bahrain_Grid"},
+    {20538, "PCS_Afgooye_UTM_zone_38N"},
+    {20539, "PCS_Afgooye_UTM_zone_39N"},
+    {20700, "PCS_Lisbon_Portugese_Grid"},
+    {20822, "PCS_Aratu_UTM_zone_22S"},
+    {20823, "PCS_Aratu_UTM_zone_23S"},
+    {20824, "PCS_Aratu_UTM_zone_24S"},
+    {20973, "PCS_Arc_1950_Lo13"},
+    {20975, "PCS_Arc_1950_Lo15"},
+    {20977, "PCS_Arc_1950_Lo17"},
+    {20979, "PCS_Arc_1950_Lo19"},
+    {20981, "PCS_Arc_1950_Lo21"},
+    {20983, "PCS_Arc_1950_Lo23"},
+    {20985, "PCS_Arc_1950_Lo25"},
+    {20987, "PCS_Arc_1950_Lo27"},
+    {20989, "PCS_Arc_1950_Lo29"},
+    {20991, "PCS_Arc_1950_Lo31"},
+    {20993, "PCS_Arc_1950_Lo33"},
+    {20995, "PCS_Arc_1950_Lo35"},
+    {21100, "PCS_Batavia_NEIEZ"},
+    {21148, "PCS_Batavia_UTM_zone_48S"},
+    {21149, "PCS_Batavia_UTM_zone_49S"},
+    {21150, "PCS_Batavia_UTM_zone_50S"},
+    {21413, "PCS_Beijing_Gauss_zone_13"},
+    {21414, "PCS_Beijing_Gauss_zone_14"},
+    {21415, "PCS_Beijing_Gauss_zone_15"},
+    {21416, "PCS_Beijing_Gauss_zone_16"},
+    {21417, "PCS_Beijing_Gauss_zone_17"},
+    {21418, "PCS_Beijing_Gauss_zone_18"},
+    {21419, "PCS_Beijing_Gauss_zone_19"},
+    {21420, "PCS_Beijing_Gauss_zone_20"},
+    {21421, "PCS_Beijing_Gauss_zone_21"},
+    {21422, "PCS_Beijing_Gauss_zone_22"},
+    {21423, "PCS_Beijing_Gauss_zone_23"},
+    {21473, "PCS_Beijing_Gauss_13N"},
+    {21474, "PCS_Beijing_Gauss_14N"},
+    {21475, "PCS_Beijing_Gauss_15N"},
+    {21476, "PCS_Beijing_Gauss_16N"},
+    {21477, "PCS_Beijing_Gauss_17N"},
+    {21478, "PCS_Beijing_Gauss_18N"},
+    {21479, "PCS_Beijing_Gauss_19N"},
+    {21480, "PCS_Beijing_Gauss_20N"},
+    {21481, "PCS_Beijing_Gauss_21N"},
+    {21482, "PCS_Beijing_Gauss_22N"},
+    {21483, "PCS_Beijing_Gauss_23N"},
+    {21500, "PCS_Belge_Lambert_50"},
+    {21790, "PCS_Bern_1898_Swiss_Old"},
+    {21817, "PCS_Bogota_UTM_zone_17N"},
+    {21818, "PCS_Bogota_UTM_zone_18N"},
+    {21891, "PCS_Bogota_Colombia_3W"},
+    {21892, "PCS_Bogota_Colombia_Bogota"},
+    {21893, "PCS_Bogota_Colombia_3E"},
+    {21894, "PCS_Bogota_Colombia_6E"},
+    {22032, "PCS_Camacupa_UTM_32S"},
+    {22033, "PCS_Camacupa_UTM_33S"},
+    {22191, "PCS_C_Inchauspe_Argentina_1"},
+    {22192, "PCS_C_Inchauspe_Argentina_2"},
+    {22193, "PCS_C_Inchauspe_Argentina_3"},
+    {22194, "PCS_C_Inchauspe_Argentina_4"},
+    {22195, "PCS_C_Inchauspe_Argentina_5"},
+    {22196, "PCS_C_Inchauspe_Argentina_6"},
+    {22197, "PCS_C_Inchauspe_Argentina_7"},
+    {22332, "PCS_Carthage_UTM_zone_32N"},
+    {22391, "PCS_Carthage_Nord_Tunisie"},
+    {22392, "PCS_Carthage_Sud_Tunisie"},
+    {22523, "PCS_Corrego_Alegre_UTM_23S"},
+    {22524, "PCS_Corrego_Alegre_UTM_24S"},
+    {22832, "PCS_Douala_UTM_zone_32N"},
+    {22992, "PCS_Egypt_1907_Red_Belt"},
+    {22993, "PCS_Egypt_1907_Purple_Belt"},
+    {22994, "PCS_Egypt_1907_Ext_Purple"},
+    {23028, "PCS_ED50_UTM_zone_28N"},
+    {23029, "PCS_ED50_UTM_zone_29N"},
+    {23030, "PCS_ED50_UTM_zone_30N"},
+    {23031, "PCS_ED50_UTM_zone_31N"},
+    {23032, "PCS_ED50_UTM_zone_32N"},
+    {23033, "PCS_ED50_UTM_zone_33N"},
+    {23034, "PCS_ED50_UTM_zone_34N"},
+    {23035, "PCS_ED50_UTM_zone_35N"},
+    {23036, "PCS_ED50_UTM_zone_36N"},
+    {23037, "PCS_ED50_UTM_zone_37N"},
+    {23038, "PCS_ED50_UTM_zone_38N"},
+    {23239, "PCS_Fahud_UTM_zone_39N"},
+    {23240, "PCS_Fahud_UTM_zone_40N"},
+    {23433, "PCS_Garoua_UTM_zone_33N"},
+    {23846, "PCS_ID74_UTM_zone_46N"},
+    {23847, "PCS_ID74_UTM_zone_47N"},
+    {23848, "PCS_ID74_UTM_zone_48N"},
+    {23849, "PCS_ID74_UTM_zone_49N"},
+    {23850, "PCS_ID74_UTM_zone_50N"},
+    {23851, "PCS_ID74_UTM_zone_51N"},
+    {23852, "PCS_ID74_UTM_zone_52N"},
+    {23853, "PCS_ID74_UTM_zone_53N"},
+    {23886, "PCS_ID74_UTM_zone_46S"},
+    {23887, "PCS_ID74_UTM_zone_47S"},
+    {23888, "PCS_ID74_UTM_zone_48S"},
+    {23889, "PCS_ID74_UTM_zone_49S"},
+    {23890, "PCS_ID74_UTM_zone_50S"},
+    {23891, "PCS_ID74_UTM_zone_51S"},
+    {23892, "PCS_ID74_UTM_zone_52S"},
+    {23893, "PCS_ID74_UTM_zone_53S"},
+    {23894, "PCS_ID74_UTM_zone_54S"},
+    {23947, "PCS_Indian_1954_UTM_47N"},
+    {23948, "PCS_Indian_1954_UTM_48N"},
+    {24047, "PCS_Indian_1975_UTM_47N"},
+    {24048, "PCS_Indian_1975_UTM_48N"},
+    {24100, "PCS_Jamaica_1875_Old_Grid"},
+    {24200, "PCS_JAD69_Jamaica_Grid"},
+    {24370, "PCS_Kalianpur_India_0"},
+    {24371, "PCS_Kalianpur_India_I"},
+    {24372, "PCS_Kalianpur_India_IIa"},
+    {24373, "PCS_Kalianpur_India_IIIa"},
+    {24374, "PCS_Kalianpur_India_IVa"},
+    {24382, "PCS_Kalianpur_India_IIb"},
+    {24383, "PCS_Kalianpur_India_IIIb"},
+    {24384, "PCS_Kalianpur_India_IVb"},
+    {24500, "PCS_Kertau_Singapore_Grid"},
+    {24547, "PCS_Kertau_UTM_zone_47N"},
+    {24548, "PCS_Kertau_UTM_zone_48N"},
+    {24720, "PCS_La_Canoa_UTM_zone_20N"},
+    {24721, "PCS_La_Canoa_UTM_zone_21N"},
+    {24818, "PCS_PSAD56_UTM_zone_18N"},
+    {24819, "PCS_PSAD56_UTM_zone_19N"},
+    {24820, "PCS_PSAD56_UTM_zone_20N"},
+    {24821, "PCS_PSAD56_UTM_zone_21N"},
+    {24877, "PCS_PSAD56_UTM_zone_17S"},
+    {24878, "PCS_PSAD56_UTM_zone_18S"},
+    {24879, "PCS_PSAD56_UTM_zone_19S"},
+    {24880, "PCS_PSAD56_UTM_zone_20S"},
+    {24891, "PCS_PSAD56_Peru_west_zone"},
+    {24892, "PCS_PSAD56_Peru_central"},
+    {24893, "PCS_PSAD56_Peru_east_zone"},
+    {25000, "PCS_Leigon_Ghana_Grid"},
+    {25231, "PCS_Lome_UTM_zone_31N"},
+    {25391, "PCS_Luzon_Philippines_I"},
+    {25392, "PCS_Luzon_Philippines_II"},
+    {25393, "PCS_Luzon_Philippines_III"},
+    {25394, "PCS_Luzon_Philippines_IV"},
+    {25395, "PCS_Luzon_Philippines_V"},
+    {25700, "PCS_Makassar_NEIEZ"},
+    {25932, "PCS_Malongo_1987_UTM_32S"},
+    {26191, "PCS_Merchich_Nord_Maroc"},
+    {26192, "PCS_Merchich_Sud_Maroc"},
+    {26193, "PCS_Merchich_Sahara"},
+    {26237, "PCS_Massawa_UTM_zone_37N"},
+    {26331, "PCS_Minna_UTM_zone_31N"},
+    {26332, "PCS_Minna_UTM_zone_32N"},
+    {26391, "PCS_Minna_Nigeria_West"},
+    {26392, "PCS_Minna_Nigeria_Mid_Belt"},
+    {26393, "PCS_Minna_Nigeria_East"},
+    {26432, "PCS_Mhast_UTM_zone_32S"},
+    {26591, "PCS_Monte_Mario_Italy_1"},
+    {26592, "PCS_Monte_Mario_Italy_2"},
+    {26632, "PCS_M_poraloko_UTM_32N"},
+    {26692, "PCS_M_poraloko_UTM_32S"},
+    {26703, "PCS_NAD27_UTM_zone_3N"},
+    {26704, "PCS_NAD27_UTM_zone_4N"},
+    {26705, "PCS_NAD27_UTM_zone_5N"},
+    {26706, "PCS_NAD27_UTM_zone_6N"},
+    {26707, "PCS_NAD27_UTM_zone_7N"},
+    {26708, "PCS_NAD27_UTM_zone_8N"},
+    {26709, "PCS_NAD27_UTM_zone_9N"},
+    {26710, "PCS_NAD27_UTM_zone_10N"},
+    {26711, "PCS_NAD27_UTM_zone_11N"},
+    {26712, "PCS_NAD27_UTM_zone_12N"},
+    {26713, "PCS_NAD27_UTM_zone_13N"},
+    {26714, "PCS_NAD27_UTM_zone_14N"},
+    {26715, "PCS_NAD27_UTM_zone_15N"},
+    {26716, "PCS_NAD27_UTM_zone_16N"},
+    {26717, "PCS_NAD27_UTM_zone_17N"},
+    {26718, "PCS_NAD27_UTM_zone_18N"},
+    {26719, "PCS_NAD27_UTM_zone_19N"},
+    {26720, "PCS_NAD27_UTM_zone_20N"},
+    {26721, "PCS_NAD27_UTM_zone_21N"},
+    {26722, "PCS_NAD27_UTM_zone_22N"},
+    {26729, "PCS_NAD27_Alabama_East"},
+    {26730, "PCS_NAD27_Alabama_West"},
+    {26731, "PCS_NAD27_Alaska_zone_1"},
+    {26732, "PCS_NAD27_Alaska_zone_2"},
+    {26733, "PCS_NAD27_Alaska_zone_3"},
+    {26734, "PCS_NAD27_Alaska_zone_4"},
+    {26735, "PCS_NAD27_Alaska_zone_5"},
+    {26736, "PCS_NAD27_Alaska_zone_6"},
+    {26737, "PCS_NAD27_Alaska_zone_7"},
+    {26738, "PCS_NAD27_Alaska_zone_8"},
+    {26739, "PCS_NAD27_Alaska_zone_9"},
+    {26740, "PCS_NAD27_Alaska_zone_10"},
+    {26741, "PCS_NAD27_California_I"},
+    {26742, "PCS_NAD27_California_II"},
+    {26743, "PCS_NAD27_California_III"},
+    {26744, "PCS_NAD27_California_IV"},
+    {26745, "PCS_NAD27_California_V"},
+    {26746, "PCS_NAD27_California_VI"},
+    {26747, "PCS_NAD27_California_VII"},
+    {26748, "PCS_NAD27_Arizona_East"},
+    {26749, "PCS_NAD27_Arizona_Central"},
+    {26750, "PCS_NAD27_Arizona_West"},
+    {26751, "PCS_NAD27_Arkansas_North"},
+    {26752, "PCS_NAD27_Arkansas_South"},
+    {26753, "PCS_NAD27_Colorado_North"},
+    {26754, "PCS_NAD27_Colorado_Central"},
+    {26755, "PCS_NAD27_Colorado_South"},
+    {26756, "PCS_NAD27_Connecticut"},
+    {26757, "PCS_NAD27_Delaware"},
+    {26758, "PCS_NAD27_Florida_East"},
+    {26759, "PCS_NAD27_Florida_West"},
+    {26760, "PCS_NAD27_Florida_North"},
+    {26761, "PCS_NAD27_Hawaii_zone_1"},
+    {26762, "PCS_NAD27_Hawaii_zone_2"},
+    {26763, "PCS_NAD27_Hawaii_zone_3"},
+    {26764, "PCS_NAD27_Hawaii_zone_4"},
+    {26765, "PCS_NAD27_Hawaii_zone_5"},
+    {26766, "PCS_NAD27_Georgia_East"},
+    {26767, "PCS_NAD27_Georgia_West"},
+    {26768, "PCS_NAD27_Idaho_East"},
+    {26769, "PCS_NAD27_Idaho_Central"},
+    {26770, "PCS_NAD27_Idaho_West"},
+    {26771, "PCS_NAD27_Illinois_East"},
+    {26772, "PCS_NAD27_Illinois_West"},
+    {26773, "PCS_NAD27_Indiana_East"},
+    {26774, "PCS_NAD27_BLM_14N_feet"},
+    {26774, "PCS_NAD27_Indiana_West"},
+    {26775, "PCS_NAD27_BLM_15N_feet"},
+    {26775, "PCS_NAD27_Iowa_North"},
+    {26776, "PCS_NAD27_BLM_16N_feet"},
+    {26776, "PCS_NAD27_Iowa_South"},
+    {26777, "PCS_NAD27_BLM_17N_feet"},
+    {26777, "PCS_NAD27_Kansas_North"},
+    {26778, "PCS_NAD27_Kansas_South"},
+    {26779, "PCS_NAD27_Kentucky_North"},
+    {26780, "PCS_NAD27_Kentucky_South"},
+    {26781, "PCS_NAD27_Louisiana_North"},
+    {26782, "PCS_NAD27_Louisiana_South"},
+    {26783, "PCS_NAD27_Maine_East"},
+    {26784, "PCS_NAD27_Maine_West"},
+    {26785, "PCS_NAD27_Maryland"},
+    {26786, "PCS_NAD27_Massachusetts"},
+    {26787, "PCS_NAD27_Massachusetts_Is"},
+    {26788, "PCS_NAD27_Michigan_North"},
+    {26789, "PCS_NAD27_Michigan_Central"},
+    {26790, "PCS_NAD27_Michigan_South"},
+    {26791, "PCS_NAD27_Minnesota_North"},
+    {26792, "PCS_NAD27_Minnesota_Cent"},
+    {26793, "PCS_NAD27_Minnesota_South"},
+    {26794, "PCS_NAD27_Mississippi_East"},
+    {26795, "PCS_NAD27_Mississippi_West"},
+    {26796, "PCS_NAD27_Missouri_East"},
+    {26797, "PCS_NAD27_Missouri_Central"},
+    {26798, "PCS_NAD27_Missouri_West"},
+    {26801, "PCS_NAD_Michigan_Michigan_East"},
+    {26802, "PCS_NAD_Michigan_Michigan_Old_Central"},
+    {26803, "PCS_NAD_Michigan_Michigan_West"},
+    {26903, "PCS_NAD83_UTM_zone_3N"},
+    {26904, "PCS_NAD83_UTM_zone_4N"},
+    {26905, "PCS_NAD83_UTM_zone_5N"},
+    {26906, "PCS_NAD83_UTM_zone_6N"},
+    {26907, "PCS_NAD83_UTM_zone_7N"},
+    {26908, "PCS_NAD83_UTM_zone_8N"},
+    {26909, "PCS_NAD83_UTM_zone_9N"},
+    {26910, "PCS_NAD83_UTM_zone_10N"},
+    {26911, "PCS_NAD83_UTM_zone_11N"},
+    {26912, "PCS_NAD83_UTM_zone_12N"},
+    {26913, "PCS_NAD83_UTM_zone_13N"},
+    {26914, "PCS_NAD83_UTM_zone_14N"},
+    {26915, "PCS_NAD83_UTM_zone_15N"},
+    {26916, "PCS_NAD83_UTM_zone_16N"},
+    {26917, "PCS_NAD83_UTM_zone_17N"},
+    {26918, "PCS_NAD83_UTM_zone_18N"},
+    {26919, "PCS_NAD83_UTM_zone_19N"},
+    {26920, "PCS_NAD83_UTM_zone_20N"},
+    {26921, "PCS_NAD83_UTM_zone_21N"},
+    {26922, "PCS_NAD83_UTM_zone_22N"},
+    {26923, "PCS_NAD83_UTM_zone_23N"},
+    {26929, "PCS_NAD83_Alabama_East"},
+    {26930, "PCS_NAD83_Alabama_West"},
+    {26931, "PCS_NAD83_Alaska_zone_1"},
+    {26932, "PCS_NAD83_Alaska_zone_2"},
+    {26933, "PCS_NAD83_Alaska_zone_3"},
+    {26934, "PCS_NAD83_Alaska_zone_4"},
+    {26935, "PCS_NAD83_Alaska_zone_5"},
+    {26936, "PCS_NAD83_Alaska_zone_6"},
+    {26937, "PCS_NAD83_Alaska_zone_7"},
+    {26938, "PCS_NAD83_Alaska_zone_8"},
+    {26939, "PCS_NAD83_Alaska_zone_9"},
+    {26940, "PCS_NAD83_Alaska_zone_10"},
+    {26941, "PCS_NAD83_California_1"},
+    {26942, "PCS_NAD83_California_2"},
+    {26943, "PCS_NAD83_California_3"},
+    {26944, "PCS_NAD83_California_4"},
+    {26945, "PCS_NAD83_California_5"},
+    {26946, "PCS_NAD83_California_6"},
+    {26948, "PCS_NAD83_Arizona_East"},
+    {26949, "PCS_NAD83_Arizona_Central"},
+    {26950, "PCS_NAD83_Arizona_West"},
+    {26951, "PCS_NAD83_Arkansas_North"},
+    {26952, "PCS_NAD83_Arkansas_South"},
+    {26953, "PCS_NAD83_Colorado_North"},
+    {26954, "PCS_NAD83_Colorado_Central"},
+    {26955, "PCS_NAD83_Colorado_South"},
+    {26956, "PCS_NAD83_Connecticut"},
+    {26957, "PCS_NAD83_Delaware"},
+    {26958, "PCS_NAD83_Florida_East"},
+    {26959, "PCS_NAD83_Florida_West"},
+    {26960, "PCS_NAD83_Florida_North"},
+    {26961, "PCS_NAD83_Hawaii_zone_1"},
+    {26962, "PCS_NAD83_Hawaii_zone_2"},
+    {26963, "PCS_NAD83_Hawaii_zone_3"},
+    {26964, "PCS_NAD83_Hawaii_zone_4"},
+    {26965, "PCS_NAD83_Hawaii_zone_5"},
+    {26966, "PCS_NAD83_Georgia_East"},
+    {26967, "PCS_NAD83_Georgia_West"},
+    {26968, "PCS_NAD83_Idaho_East"},
+    {26969, "PCS_NAD83_Idaho_Central"},
+    {26970, "PCS_NAD83_Idaho_West"},
+    {26971, "PCS_NAD83_Illinois_East"},
+    {26972, "PCS_NAD83_Illinois_West"},
+    {26973, "PCS_NAD83_Indiana_East"},
+    {26974, "PCS_NAD83_Indiana_West"},
+    {26975, "PCS_NAD83_Iowa_North"},
+    {26976, "PCS_NAD83_Iowa_South"},
+    {26977, "PCS_NAD83_Kansas_North"},
+    {26978, "PCS_NAD83_Kansas_South"},
+    {26979, "PCS_NAD83_Kentucky_North"},
+    {26980, "PCS_NAD83_Kentucky_South"},
+    {26981, "PCS_NAD83_Louisiana_North"},
+    {26982, "PCS_NAD83_Louisiana_South"},
+    {26983, "PCS_NAD83_Maine_East"},
+    {26984, "PCS_NAD83_Maine_West"},
+    {26985, "PCS_NAD83_Maryland"},
+    {26986, "PCS_NAD83_Massachusetts"},
+    {26987, "PCS_NAD83_Massachusetts_Is"},
+    {26988, "PCS_NAD83_Michigan_North"},
+    {26989, "PCS_NAD83_Michigan_Central"},
+    {26990, "PCS_NAD83_Michigan_South"},
+    {26991, "PCS_NAD83_Minnesota_North"},
+    {26992, "PCS_NAD83_Minnesota_Cent"},
+    {26993, "PCS_NAD83_Minnesota_South"},
+    {26994, "PCS_NAD83_Mississippi_East"},
+    {26995, "PCS_NAD83_Mississippi_West"},
+    {26996, "PCS_NAD83_Missouri_East"},
+    {26997, "PCS_NAD83_Missouri_Central"},
+    {26998, "PCS_NAD83_Missouri_West"},
+    {27038, "PCS_Nahrwan_1967_UTM_38N"},
+    {27039, "PCS_Nahrwan_1967_UTM_39N"},
+    {27040, "PCS_Nahrwan_1967_UTM_40N"},
+    {27120, "PCS_Naparima_UTM_20N"},
+    {27200, "PCS_GD49_NZ_Map_Grid"},
+    {27291, "PCS_GD49_North_Island_Grid"},
+    {27292, "PCS_GD49_South_Island_Grid"},
+    {27429, "PCS_Datum_73_UTM_zone_29N"},
+    {27500, "PCS_ATF_Nord_de_Guerre"},
+    {27581, "PCS_NTF_France_I"},
+    {27582, "PCS_NTF_France_II"},
+    {27583, "PCS_NTF_France_III"},
+    {27591, "PCS_NTF_Nord_France"},
+    {27592, "PCS_NTF_Centre_France"},
+    {27593, "PCS_NTF_Sud_France"},
+    {27700, "PCS_British_National_Grid"},
+    {28232, "PCS_Point_Noire_UTM_32S"},
+    {28348, "PCS_GDA94_MGA_zone_48"},
+    {28349, "PCS_GDA94_MGA_zone_49"},
+    {28350, "PCS_GDA94_MGA_zone_50"},
+    {28351, "PCS_GDA94_MGA_zone_51"},
+    {28352, "PCS_GDA94_MGA_zone_52"},
+    {28353, "PCS_GDA94_MGA_zone_53"},
+    {28354, "PCS_GDA94_MGA_zone_54"},
+    {28355, "PCS_GDA94_MGA_zone_55"},
+    {28356, "PCS_GDA94_MGA_zone_56"},
+    {28357, "PCS_GDA94_MGA_zone_57"},
+    {28358, "PCS_GDA94_MGA_zone_58"},
+    {28404, "PCS_Pulkovo_Gauss_zone_4"},
+    {28405, "PCS_Pulkovo_Gauss_zone_5"},
+    {28406, "PCS_Pulkovo_Gauss_zone_6"},
+    {28407, "PCS_Pulkovo_Gauss_zone_7"},
+    {28408, "PCS_Pulkovo_Gauss_zone_8"},
+    {28409, "PCS_Pulkovo_Gauss_zone_9"},
+    {28410, "PCS_Pulkovo_Gauss_zone_10"},
+    {28411, "PCS_Pulkovo_Gauss_zone_11"},
+    {28412, "PCS_Pulkovo_Gauss_zone_12"},
+    {28413, "PCS_Pulkovo_Gauss_zone_13"},
+    {28414, "PCS_Pulkovo_Gauss_zone_14"},
+    {28415, "PCS_Pulkovo_Gauss_zone_15"},
+    {28416, "PCS_Pulkovo_Gauss_zone_16"},
+    {28417, "PCS_Pulkovo_Gauss_zone_17"},
+    {28418, "PCS_Pulkovo_Gauss_zone_18"},
+    {28419, "PCS_Pulkovo_Gauss_zone_19"},
+    {28420, "PCS_Pulkovo_Gauss_zone_20"},
+    {28421, "PCS_Pulkovo_Gauss_zone_21"},
+    {28422, "PCS_Pulkovo_Gauss_zone_22"},
+    {28423, "PCS_Pulkovo_Gauss_zone_23"},
+    {28424, "PCS_Pulkovo_Gauss_zone_24"},
+    {28425, "PCS_Pulkovo_Gauss_zone_25"},
+    {28426, "PCS_Pulkovo_Gauss_zone_26"},
+    {28427, "PCS_Pulkovo_Gauss_zone_27"},
+    {28428, "PCS_Pulkovo_Gauss_zone_28"},
+    {28429, "PCS_Pulkovo_Gauss_zone_29"},
+    {28430, "PCS_Pulkovo_Gauss_zone_30"},
+    {28431, "PCS_Pulkovo_Gauss_zone_31"},
+    {28432, "PCS_Pulkovo_Gauss_zone_32"},
+    {28464, "PCS_Pulkovo_Gauss_4N"},
+    {28465, "PCS_Pulkovo_Gauss_5N"},
+    {28466, "PCS_Pulkovo_Gauss_6N"},
+    {28467, "PCS_Pulkovo_Gauss_7N"},
+    {28468, "PCS_Pulkovo_Gauss_8N"},
+    {28469, "PCS_Pulkovo_Gauss_9N"},
+    {28470, "PCS_Pulkovo_Gauss_10N"},
+    {28471, "PCS_Pulkovo_Gauss_11N"},
+    {28472, "PCS_Pulkovo_Gauss_12N"},
+    {28473, "PCS_Pulkovo_Gauss_13N"},
+    {28474, "PCS_Pulkovo_Gauss_14N"},
+    {28475, "PCS_Pulkovo_Gauss_15N"},
+    {28476, "PCS_Pulkovo_Gauss_16N"},
+    {28477, "PCS_Pulkovo_Gauss_17N"},
+    {28478, "PCS_Pulkovo_Gauss_18N"},
+    {28479, "PCS_Pulkovo_Gauss_19N"},
+    {28480, "PCS_Pulkovo_Gauss_20N"},
+    {28481, "PCS_Pulkovo_Gauss_21N"},
+    {28482, "PCS_Pulkovo_Gauss_22N"},
+    {28483, "PCS_Pulkovo_Gauss_23N"},
+    {28484, "PCS_Pulkovo_Gauss_24N"},
+    {28485, "PCS_Pulkovo_Gauss_25N"},
+    {28486, "PCS_Pulkovo_Gauss_26N"},
+    {28487, "PCS_Pulkovo_Gauss_27N"},
+    {28488, "PCS_Pulkovo_Gauss_28N"},
+    {28489, "PCS_Pulkovo_Gauss_29N"},
+    {28490, "PCS_Pulkovo_Gauss_30N"},
+    {28491, "PCS_Pulkovo_Gauss_31N"},
+    {28492, "PCS_Pulkovo_Gauss_32N"},
+    {28600, "PCS_Qatar_National_Grid"},
+    {28991, "PCS_RD_Netherlands_Old"},
+    {28992, "PCS_RD_Netherlands_New"},
+    {29118, "PCS_SAD69_UTM_zone_18N"},
+    {29119, "PCS_SAD69_UTM_zone_19N"},
+    {29120, "PCS_SAD69_UTM_zone_20N"},
+    {29121, "PCS_SAD69_UTM_zone_21N"},
+    {29122, "PCS_SAD69_UTM_zone_22N"},
+    {29177, "PCS_SAD69_UTM_zone_17S"},
+    {29178, "PCS_SAD69_UTM_zone_18S"},
+    {29179, "PCS_SAD69_UTM_zone_19S"},
+    {29180, "PCS_SAD69_UTM_zone_20S"},
+    {29181, "PCS_SAD69_UTM_zone_21S"},
+    {29182, "PCS_SAD69_UTM_zone_22S"},
+    {29183, "PCS_SAD69_UTM_zone_23S"},
+    {29184, "PCS_SAD69_UTM_zone_24S"},
+    {29185, "PCS_SAD69_UTM_zone_25S"},
+    {29220, "PCS_Sapper_Hill_UTM_20S"},
+    {29221, "PCS_Sapper_Hill_UTM_21S"},
+    {29333, "PCS_Schwarzeck_UTM_33S"},
+    {29635, "PCS_Sudan_UTM_zone_35N"},
+    {29636, "PCS_Sudan_UTM_zone_36N"},
+    {29700, "PCS_Tananarive_Laborde"},
+    {29738, "PCS_Tananarive_UTM_38S"},
+    {29739, "PCS_Tananarive_UTM_39S"},
+    {29800, "PCS_Timbalai_1948_Borneo"},
+    {29849, "PCS_Timbalai_1948_UTM_49N"},
+    {29850, "PCS_Timbalai_1948_UTM_50N"},
+    {29900, "PCS_TM65_Irish_Nat_Grid"},
+    {30200, "PCS_Trinidad_1903_Trinidad"},
+    {30339, "PCS_TC_1948_UTM_zone_39N"},
+    {30340, "PCS_TC_1948_UTM_zone_40N"},
+    {30491, "PCS_Voirol_N_Algerie_ancien"},
+    {30492, "PCS_Voirol_S_Algerie_ancien"},
+    {30591, "PCS_Voirol_Unifie_N_Algerie"},
+    {30592, "PCS_Voirol_Unifie_S_Algerie"},
+    {30600, "PCS_Bern_1938_Swiss_New"},
+    {30729, "PCS_Nord_Sahara_UTM_29N"},
+    {30730, "PCS_Nord_Sahara_UTM_30N"},
+    {30731, "PCS_Nord_Sahara_UTM_31N"},
+    {30732, "PCS_Nord_Sahara_UTM_32N"},
+    {31028, "PCS_Yoff_UTM_zone_28N"},
+    {31121, "PCS_Zanderij_UTM_zone_21N"},
+    {31291, "PCS_MGI_Austria_West"},
+    {31292, "PCS_MGI_Austria_Central"},
+    {31293, "PCS_MGI_Austria_East"},
+    {31300, "PCS_Belge_Lambert_72"},
+    {31491, "PCS_DHDN_Germany_zone_1"},
+    {31492, "PCS_DHDN_Germany_zone_2"},
+    {31493, "PCS_DHDN_Germany_zone_3"},
+    {31494, "PCS_DHDN_Germany_zone_4"},
+    {31495, "PCS_DHDN_Germany_zone_5"},
+    {32001, "PCS_NAD27_Montana_North"},
+    {32002, "PCS_NAD27_Montana_Central"},
+    {32003, "PCS_NAD27_Montana_South"},
+    {32005, "PCS_NAD27_Nebraska_North"},
+    {32006, "PCS_NAD27_Nebraska_South"},
+    {32007, "PCS_NAD27_Nevada_East"},
+    {32008, "PCS_NAD27_Nevada_Central"},
+    {32009, "PCS_NAD27_Nevada_West"},
+    {32010, "PCS_NAD27_New_Hampshire"},
+    {32011, "PCS_NAD27_New_Jersey"},
+    {32012, "PCS_NAD27_New_Mexico_East"},
+    {32013, "PCS_NAD27_New_Mexico_Cent"},
+    {32014, "PCS_NAD27_New_Mexico_West"},
+    {32015, "PCS_NAD27_New_York_East"},
+    {32016, "PCS_NAD27_New_York_Central"},
+    {32017, "PCS_NAD27_New_York_West"},
+    {32018, "PCS_NAD27_New_York_Long_Is"},
+    {32019, "PCS_NAD27_North_Carolina"},
+    {32020, "PCS_NAD27_North_Dakota_N"},
+    {32021, "PCS_NAD27_North_Dakota_S"},
+    {32022, "PCS_NAD27_Ohio_North"},
+    {32023, "PCS_NAD27_Ohio_South"},
+    {32024, "PCS_NAD27_Oklahoma_North"},
+    {32025, "PCS_NAD27_Oklahoma_South"},
+    {32026, "PCS_NAD27_Oregon_North"},
+    {32027, "PCS_NAD27_Oregon_South"},
+    {32028, "PCS_NAD27_Pennsylvania_N"},
+    {32029, "PCS_NAD27_Pennsylvania_S"},
+    {32030, "PCS_NAD27_Rhode_Island"},
+    {32031, "PCS_NAD27_South_Carolina_N"},
+    {32033, "PCS_NAD27_South_Carolina_S"},
+    {32034, "PCS_NAD27_South_Dakota_N"},
+    {32035, "PCS_NAD27_South_Dakota_S"},
+    {32036, "PCS_NAD27_Tennessee"},
+    {32037, "PCS_NAD27_Texas_North"},
+    {32038, "PCS_NAD27_Texas_North_Cen"},
+    {32039, "PCS_NAD27_Texas_Central"},
+    {32040, "PCS_NAD27_Texas_South_Cen"},
+    {32041, "PCS_NAD27_Texas_South"},
+    {32042, "PCS_NAD27_Utah_North"},
+    {32043, "PCS_NAD27_Utah_Central"},
+    {32044, "PCS_NAD27_Utah_South"},
+    {32045, "PCS_NAD27_Vermont"},
+    {32046, "PCS_NAD27_Virginia_North"},
+    {32047, "PCS_NAD27_Virginia_South"},
+    {32048, "PCS_NAD27_Washington_North"},
+    {32049, "PCS_NAD27_Washington_South"},
+    {32050, "PCS_NAD27_West_Virginia_N"},
+    {32051, "PCS_NAD27_West_Virginia_S"},
+    {32052, "PCS_NAD27_Wisconsin_North"},
+    {32053, "PCS_NAD27_Wisconsin_Cen"},
+    {32054, "PCS_NAD27_Wisconsin_South"},
+    {32055, "PCS_NAD27_Wyoming_East"},
+    {32056, "PCS_NAD27_Wyoming_E_Cen"},
+    {32057, "PCS_NAD27_Wyoming_W_Cen"},
+    {32058, "PCS_NAD27_Wyoming_West"},
+    {32059, "PCS_NAD27_Puerto_Rico"},
+    {32060, "PCS_NAD27_St_Croix"},
+    {32100, "PCS_NAD83_Montana"},
+    {32104, "PCS_NAD83_Nebraska"},
+    {32107, "PCS_NAD83_Nevada_East"},
+    {32108, "PCS_NAD83_Nevada_Central"},
+    {32109, "PCS_NAD83_Nevada_West"},
+    {32110, "PCS_NAD83_New_Hampshire"},
+    {32111, "PCS_NAD83_New_Jersey"},
+    {32112, "PCS_NAD83_New_Mexico_East"},
+    {32113, "PCS_NAD83_New_Mexico_Cent"},
+    {32114, "PCS_NAD83_New_Mexico_West"},
+    {32115, "PCS_NAD83_New_York_East"},
+    {32116, "PCS_NAD83_New_York_Central"},
+    {32117, "PCS_NAD83_New_York_West"},
+    {32118, "PCS_NAD83_New_York_Long_Is"},
+    {32119, "PCS_NAD83_North_Carolina"},
+    {32120, "PCS_NAD83_North_Dakota_N"},
+    {32121, "PCS_NAD83_North_Dakota_S"},
+    {32122, "PCS_NAD83_Ohio_North"},
+    {32123, "PCS_NAD83_Ohio_South"},
+    {32124, "PCS_NAD83_Oklahoma_North"},
+    {32125, "PCS_NAD83_Oklahoma_South"},
+    {32126, "PCS_NAD83_Oregon_North"},
+    {32127, "PCS_NAD83_Oregon_South"},
+    {32128, "PCS_NAD83_Pennsylvania_N"},
+    {32129, "PCS_NAD83_Pennsylvania_S"},
+    {32130, "PCS_NAD83_Rhode_Island"},
+    {32133, "PCS_NAD83_South_Carolina"},
+    {32134, "PCS_NAD83_South_Dakota_N"},
+    {32135, "PCS_NAD83_South_Dakota_S"},
+    {32136, "PCS_NAD83_Tennessee"},
+    {32137, "PCS_NAD83_Texas_North"},
+    {32138, "PCS_NAD83_Texas_North_Cen"},
+    {32139, "PCS_NAD83_Texas_Central"},
+    {32140, "PCS_NAD83_Texas_South_Cen"},
+    {32141, "PCS_NAD83_Texas_South"},
+    {32142, "PCS_NAD83_Utah_North"},
+    {32143, "PCS_NAD83_Utah_Central"},
+    {32144, "PCS_NAD83_Utah_South"},
+    {32145, "PCS_NAD83_Vermont"},
+    {32146, "PCS_NAD83_Virginia_North"},
+    {32147, "PCS_NAD83_Virginia_South"},
+    {32148, "PCS_NAD83_Washington_North"},
+    {32149, "PCS_NAD83_Washington_South"},
+    {32150, "PCS_NAD83_West_Virginia_N"},
+    {32151, "PCS_NAD83_West_Virginia_S"},
+    {32152, "PCS_NAD83_Wisconsin_North"},
+    {32153, "PCS_NAD83_Wisconsin_Cen"},
+    {32154, "PCS_NAD83_Wisconsin_South"},
+    {32155, "PCS_NAD83_Wyoming_East"},
+    {32156, "PCS_NAD83_Wyoming_E_Cen"},
+    {32157, "PCS_NAD83_Wyoming_W_Cen"},
+    {32158, "PCS_NAD83_Wyoming_West"},
+    {32161, "PCS_NAD83_Puerto_Rico_Virgin_Is"},
+    {32201, "PCS_WGS72_UTM_zone_1N"},
+    {32202, "PCS_WGS72_UTM_zone_2N"},
+    {32203, "PCS_WGS72_UTM_zone_3N"},
+    {32204, "PCS_WGS72_UTM_zone_4N"},
+    {32205, "PCS_WGS72_UTM_zone_5N"},
+    {32206, "PCS_WGS72_UTM_zone_6N"},
+    {32207, "PCS_WGS72_UTM_zone_7N"},
+    {32208, "PCS_WGS72_UTM_zone_8N"},
+    {32209, "PCS_WGS72_UTM_zone_9N"},
+    {32210, "PCS_WGS72_UTM_zone_10N"},
+    {32211, "PCS_WGS72_UTM_zone_11N"},
+    {32212, "PCS_WGS72_UTM_zone_12N"},
+    {32213, "PCS_WGS72_UTM_zone_13N"},
+    {32214, "PCS_WGS72_UTM_zone_14N"},
+    {32215, "PCS_WGS72_UTM_zone_15N"},
+    {32216, "PCS_WGS72_UTM_zone_16N"},
+    {32217, "PCS_WGS72_UTM_zone_17N"},
+    {32218, "PCS_WGS72_UTM_zone_18N"},
+    {32219, "PCS_WGS72_UTM_zone_19N"},
+    {32220, "PCS_WGS72_UTM_zone_20N"},
+    {32221, "PCS_WGS72_UTM_zone_21N"},
+    {32222, "PCS_WGS72_UTM_zone_22N"},
+    {32223, "PCS_WGS72_UTM_zone_23N"},
+    {32224, "PCS_WGS72_UTM_zone_24N"},
+    {32225, "PCS_WGS72_UTM_zone_25N"},
+    {32226, "PCS_WGS72_UTM_zone_26N"},
+    {32227, "PCS_WGS72_UTM_zone_27N"},
+    {32228, "PCS_WGS72_UTM_zone_28N"},
+    {32229, "PCS_WGS72_UTM_zone_29N"},
+    {32230, "PCS_WGS72_UTM_zone_30N"},
+    {32231, "PCS_WGS72_UTM_zone_31N"},
+    {32232, "PCS_WGS72_UTM_zone_32N"},
+    {32233, "PCS_WGS72_UTM_zone_33N"},
+    {32234, "PCS_WGS72_UTM_zone_34N"},
+    {32235, "PCS_WGS72_UTM_zone_35N"},
+    {32236, "PCS_WGS72_UTM_zone_36N"},
+    {32237, "PCS_WGS72_UTM_zone_37N"},
+    {32238, "PCS_WGS72_UTM_zone_38N"},
+    {32239, "PCS_WGS72_UTM_zone_39N"},
+    {32240, "PCS_WGS72_UTM_zone_40N"},
+    {32241, "PCS_WGS72_UTM_zone_41N"},
+    {32242, "PCS_WGS72_UTM_zone_42N"},
+    {32243, "PCS_WGS72_UTM_zone_43N"},
+    {32244, "PCS_WGS72_UTM_zone_44N"},
+    {32245, "PCS_WGS72_UTM_zone_45N"},
+    {32246, "PCS_WGS72_UTM_zone_46N"},
+    {32247, "PCS_WGS72_UTM_zone_47N"},
+    {32248, "PCS_WGS72_UTM_zone_48N"},
+    {32249, "PCS_WGS72_UTM_zone_49N"},
+    {32250, "PCS_WGS72_UTM_zone_50N"},
+    {32251, "PCS_WGS72_UTM_zone_51N"},
+    {32252, "PCS_WGS72_UTM_zone_52N"},
+    {32253, "PCS_WGS72_UTM_zone_53N"},
+    {32254, "PCS_WGS72_UTM_zone_54N"},
+    {32255, "PCS_WGS72_UTM_zone_55N"},
+    {32256, "PCS_WGS72_UTM_zone_56N"},
+    {32257, "PCS_WGS72_UTM_zone_57N"},
+    {32258, "PCS_WGS72_UTM_zone_58N"},
+    {32259, "PCS_WGS72_UTM_zone_59N"},
+    {32260, "PCS_WGS72_UTM_zone_60N"},
+    {32301, "PCS_WGS72_UTM_zone_1S"},
+    {32302, "PCS_WGS72_UTM_zone_2S"},
+    {32303, "PCS_WGS72_UTM_zone_3S"},
+    {32304, "PCS_WGS72_UTM_zone_4S"},
+    {32305, "PCS_WGS72_UTM_zone_5S"},
+    {32306, "PCS_WGS72_UTM_zone_6S"},
+    {32307, "PCS_WGS72_UTM_zone_7S"},
+    {32308, "PCS_WGS72_UTM_zone_8S"},
+    {32309, "PCS_WGS72_UTM_zone_9S"},
+    {32310, "PCS_WGS72_UTM_zone_10S"},
+    {32311, "PCS_WGS72_UTM_zone_11S"},
+    {32312, "PCS_WGS72_UTM_zone_12S"},
+    {32313, "PCS_WGS72_UTM_zone_13S"},
+    {32314, "PCS_WGS72_UTM_zone_14S"},
+    {32315, "PCS_WGS72_UTM_zone_15S"},
+    {32316, "PCS_WGS72_UTM_zone_16S"},
+    {32317, "PCS_WGS72_UTM_zone_17S"},
+    {32318, "PCS_WGS72_UTM_zone_18S"},
+    {32319, "PCS_WGS72_UTM_zone_19S"},
+    {32320, "PCS_WGS72_UTM_zone_20S"},
+    {32321, "PCS_WGS72_UTM_zone_21S"},
+    {32322, "PCS_WGS72_UTM_zone_22S"},
+    {32323, "PCS_WGS72_UTM_zone_23S"},
+    {32324, "PCS_WGS72_UTM_zone_24S"},
+    {32325, "PCS_WGS72_UTM_zone_25S"},
+    {32326, "PCS_WGS72_UTM_zone_26S"},
+    {32327, "PCS_WGS72_UTM_zone_27S"},
+    {32328, "PCS_WGS72_UTM_zone_28S"},
+    {32329, "PCS_WGS72_UTM_zone_29S"},
+    {32330, "PCS_WGS72_UTM_zone_30S"},
+    {32331, "PCS_WGS72_UTM_zone_31S"},
+    {32332, "PCS_WGS72_UTM_zone_32S"},
+    {32333, "PCS_WGS72_UTM_zone_33S"},
+    {32334, "PCS_WGS72_UTM_zone_34S"},
+    {32335, "PCS_WGS72_UTM_zone_35S"},
+    {32336, "PCS_WGS72_UTM_zone_36S"},
+    {32337, "PCS_WGS72_UTM_zone_37S"},
+    {32338, "PCS_WGS72_UTM_zone_38S"},
+    {32339, "PCS_WGS72_UTM_zone_39S"},
+    {32340, "PCS_WGS72_UTM_zone_40S"},
+    {32341, "PCS_WGS72_UTM_zone_41S"},
+    {32342, "PCS_WGS72_UTM_zone_42S"},
+    {32343, "PCS_WGS72_UTM_zone_43S"},
+    {32344, "PCS_WGS72_UTM_zone_44S"},
+    {32345, "PCS_WGS72_UTM_zone_45S"},
+    {32346, "PCS_WGS72_UTM_zone_46S"},
+    {32347, "PCS_WGS72_UTM_zone_47S"},
+    {32348, "PCS_WGS72_UTM_zone_48S"},
+    {32349, "PCS_WGS72_UTM_zone_49S"},
+    {32350, "PCS_WGS72_UTM_zone_50S"},
+    {32351, "PCS_WGS72_UTM_zone_51S"},
+    {32352, "PCS_WGS72_UTM_zone_52S"},
+    {32353, "PCS_WGS72_UTM_zone_53S"},
+    {32354, "PCS_WGS72_UTM_zone_54S"},
+    {32355, "PCS_WGS72_UTM_zone_55S"},
+    {32356, "PCS_WGS72_UTM_zone_56S"},
+    {32357, "PCS_WGS72_UTM_zone_57S"},
+    {32358, "PCS_WGS72_UTM_zone_58S"},
+    {32359, "PCS_WGS72_UTM_zone_59S"},
+    {32360, "PCS_WGS72_UTM_zone_60S"},
+    {32401, "PCS_WGS72BE_UTM_zone_1N"},
+    {32402, "PCS_WGS72BE_UTM_zone_2N"},
+    {32403, "PCS_WGS72BE_UTM_zone_3N"},
+    {32404, "PCS_WGS72BE_UTM_zone_4N"},
+    {32405, "PCS_WGS72BE_UTM_zone_5N"},
+    {32406, "PCS_WGS72BE_UTM_zone_6N"},
+    {32407, "PCS_WGS72BE_UTM_zone_7N"},
+    {32408, "PCS_WGS72BE_UTM_zone_8N"},
+    {32409, "PCS_WGS72BE_UTM_zone_9N"},
+    {32410, "PCS_WGS72BE_UTM_zone_10N"},
+    {32411, "PCS_WGS72BE_UTM_zone_11N"},
+    {32412, "PCS_WGS72BE_UTM_zone_12N"},
+    {32413, "PCS_WGS72BE_UTM_zone_13N"},
+    {32414, "PCS_WGS72BE_UTM_zone_14N"},
+    {32415, "PCS_WGS72BE_UTM_zone_15N"},
+    {32416, "PCS_WGS72BE_UTM_zone_16N"},
+    {32417, "PCS_WGS72BE_UTM_zone_17N"},
+    {32418, "PCS_WGS72BE_UTM_zone_18N"},
+    {32419, "PCS_WGS72BE_UTM_zone_19N"},
+    {32420, "PCS_WGS72BE_UTM_zone_20N"},
+    {32421, "PCS_WGS72BE_UTM_zone_21N"},
+    {32422, "PCS_WGS72BE_UTM_zone_22N"},
+    {32423, "PCS_WGS72BE_UTM_zone_23N"},
+    {32424, "PCS_WGS72BE_UTM_zone_24N"},
+    {32425, "PCS_WGS72BE_UTM_zone_25N"},
+    {32426, "PCS_WGS72BE_UTM_zone_26N"},
+    {32427, "PCS_WGS72BE_UTM_zone_27N"},
+    {32428, "PCS_WGS72BE_UTM_zone_28N"},
+    {32429, "PCS_WGS72BE_UTM_zone_29N"},
+    {32430, "PCS_WGS72BE_UTM_zone_30N"},
+    {32431, "PCS_WGS72BE_UTM_zone_31N"},
+    {32432, "PCS_WGS72BE_UTM_zone_32N"},
+    {32433, "PCS_WGS72BE_UTM_zone_33N"},
+    {32434, "PCS_WGS72BE_UTM_zone_34N"},
+    {32435, "PCS_WGS72BE_UTM_zone_35N"},
+    {32436, "PCS_WGS72BE_UTM_zone_36N"},
+    {32437, "PCS_WGS72BE_UTM_zone_37N"},
+    {32438, "PCS_WGS72BE_UTM_zone_38N"},
+    {32439, "PCS_WGS72BE_UTM_zone_39N"},
+    {32440, "PCS_WGS72BE_UTM_zone_40N"},
+    {32441, "PCS_WGS72BE_UTM_zone_41N"},
+    {32442, "PCS_WGS72BE_UTM_zone_42N"},
+    {32443, "PCS_WGS72BE_UTM_zone_43N"},
+    {32444, "PCS_WGS72BE_UTM_zone_44N"},
+    {32445, "PCS_WGS72BE_UTM_zone_45N"},
+    {32446, "PCS_WGS72BE_UTM_zone_46N"},
+    {32447, "PCS_WGS72BE_UTM_zone_47N"},
+    {32448, "PCS_WGS72BE_UTM_zone_48N"},
+    {32449, "PCS_WGS72BE_UTM_zone_49N"},
+    {32450, "PCS_WGS72BE_UTM_zone_50N"},
+    {32451, "PCS_WGS72BE_UTM_zone_51N"},
+    {32452, "PCS_WGS72BE_UTM_zone_52N"},
+    {32453, "PCS_WGS72BE_UTM_zone_53N"},
+    {32454, "PCS_WGS72BE_UTM_zone_54N"},
+    {32455, "PCS_WGS72BE_UTM_zone_55N"},
+    {32456, "PCS_WGS72BE_UTM_zone_56N"},
+    {32457, "PCS_WGS72BE_UTM_zone_57N"},
+    {32458, "PCS_WGS72BE_UTM_zone_58N"},
+    {32459, "PCS_WGS72BE_UTM_zone_59N"},
+    {32460, "PCS_WGS72BE_UTM_zone_60N"},
+    {32501, "PCS_WGS72BE_UTM_zone_1S"},
+    {32502, "PCS_WGS72BE_UTM_zone_2S"},
+    {32503, "PCS_WGS72BE_UTM_zone_3S"},
+    {32504, "PCS_WGS72BE_UTM_zone_4S"},
+    {32505, "PCS_WGS72BE_UTM_zone_5S"},
+    {32506, "PCS_WGS72BE_UTM_zone_6S"},
+    {32507, "PCS_WGS72BE_UTM_zone_7S"},
+    {32508, "PCS_WGS72BE_UTM_zone_8S"},
+    {32509, "PCS_WGS72BE_UTM_zone_9S"},
+    {32510, "PCS_WGS72BE_UTM_zone_10S"},
+    {32511, "PCS_WGS72BE_UTM_zone_11S"},
+    {32512, "PCS_WGS72BE_UTM_zone_12S"},
+    {32513, "PCS_WGS72BE_UTM_zone_13S"},
+    {32514, "PCS_WGS72BE_UTM_zone_14S"},
+    {32515, "PCS_WGS72BE_UTM_zone_15S"},
+    {32516, "PCS_WGS72BE_UTM_zone_16S"},
+    {32517, "PCS_WGS72BE_UTM_zone_17S"},
+    {32518, "PCS_WGS72BE_UTM_zone_18S"},
+    {32519, "PCS_WGS72BE_UTM_zone_19S"},
+    {32520, "PCS_WGS72BE_UTM_zone_20S"},
+    {32521, "PCS_WGS72BE_UTM_zone_21S"},
+    {32522, "PCS_WGS72BE_UTM_zone_22S"},
+    {32523, "PCS_WGS72BE_UTM_zone_23S"},
+    {32524, "PCS_WGS72BE_UTM_zone_24S"},
+    {32525, "PCS_WGS72BE_UTM_zone_25S"},
+    {32526, "PCS_WGS72BE_UTM_zone_26S"},
+    {32527, "PCS_WGS72BE_UTM_zone_27S"},
+    {32528, "PCS_WGS72BE_UTM_zone_28S"},
+    {32529, "PCS_WGS72BE_UTM_zone_29S"},
+    {32530, "PCS_WGS72BE_UTM_zone_30S"},
+    {32531, "PCS_WGS72BE_UTM_zone_31S"},
+    {32532, "PCS_WGS72BE_UTM_zone_32S"},
+    {32533, "PCS_WGS72BE_UTM_zone_33S"},
+    {32534, "PCS_WGS72BE_UTM_zone_34S"},
+    {32535, "PCS_WGS72BE_UTM_zone_35S"},
+    {32536, "PCS_WGS72BE_UTM_zone_36S"},
+    {32537, "PCS_WGS72BE_UTM_zone_37S"},
+    {32538, "PCS_WGS72BE_UTM_zone_38S"},
+    {32539, "PCS_WGS72BE_UTM_zone_39S"},
+    {32540, "PCS_WGS72BE_UTM_zone_40S"},
+    {32541, "PCS_WGS72BE_UTM_zone_41S"},
+    {32542, "PCS_WGS72BE_UTM_zone_42S"},
+    {32543, "PCS_WGS72BE_UTM_zone_43S"},
+    {32544, "PCS_WGS72BE_UTM_zone_44S"},
+    {32545, "PCS_WGS72BE_UTM_zone_45S"},
+    {32546, "PCS_WGS72BE_UTM_zone_46S"},
+    {32547, "PCS_WGS72BE_UTM_zone_47S"},
+    {32548, "PCS_WGS72BE_UTM_zone_48S"},
+    {32549, "PCS_WGS72BE_UTM_zone_49S"},
+    {32550, "PCS_WGS72BE_UTM_zone_50S"},
+    {32551, "PCS_WGS72BE_UTM_zone_51S"},
+    {32552, "PCS_WGS72BE_UTM_zone_52S"},
+    {32553, "PCS_WGS72BE_UTM_zone_53S"},
+    {32554, "PCS_WGS72BE_UTM_zone_54S"},
+    {32555, "PCS_WGS72BE_UTM_zone_55S"},
+    {32556, "PCS_WGS72BE_UTM_zone_56S"},
+    {32557, "PCS_WGS72BE_UTM_zone_57S"},
+    {32558, "PCS_WGS72BE_UTM_zone_58S"},
+    {32559, "PCS_WGS72BE_UTM_zone_59S"},
+    {32560, "PCS_WGS72BE_UTM_zone_60S"},
+    {32601, "PCS_WGS84_UTM_zone_1N"},
+    {32602, "PCS_WGS84_UTM_zone_2N"},
+    {32603, "PCS_WGS84_UTM_zone_3N"},
+    {32604, "PCS_WGS84_UTM_zone_4N"},
+    {32605, "PCS_WGS84_UTM_zone_5N"},
+    {32606, "PCS_WGS84_UTM_zone_6N"},
+    {32607, "PCS_WGS84_UTM_zone_7N"},
+    {32608, "PCS_WGS84_UTM_zone_8N"},
+    {32609, "PCS_WGS84_UTM_zone_9N"},
+    {32610, "PCS_WGS84_UTM_zone_10N"},
+    {32611, "PCS_WGS84_UTM_zone_11N"},
+    {32612, "PCS_WGS84_UTM_zone_12N"},
+    {32613, "PCS_WGS84_UTM_zone_13N"},
+    {32614, "PCS_WGS84_UTM_zone_14N"},
+    {32615, "PCS_WGS84_UTM_zone_15N"},
+    {32616, "PCS_WGS84_UTM_zone_16N"},
+    {32617, "PCS_WGS84_UTM_zone_17N"},
+    {32618, "PCS_WGS84_UTM_zone_18N"},
+    {32619, "PCS_WGS84_UTM_zone_19N"},
+    {32620, "PCS_WGS84_UTM_zone_20N"},
+    {32621, "PCS_WGS84_UTM_zone_21N"},
+    {32622, "PCS_WGS84_UTM_zone_22N"},
+    {32623, "PCS_WGS84_UTM_zone_23N"},
+    {32624, "PCS_WGS84_UTM_zone_24N"},
+    {32625, "PCS_WGS84_UTM_zone_25N"},
+    {32626, "PCS_WGS84_UTM_zone_26N"},
+    {32627, "PCS_WGS84_UTM_zone_27N"},
+    {32628, "PCS_WGS84_UTM_zone_28N"},
+    {32629, "PCS_WGS84_UTM_zone_29N"},
+    {32630, "PCS_WGS84_UTM_zone_30N"},
+    {32631, "PCS_WGS84_UTM_zone_31N"},
+    {32632, "PCS_WGS84_UTM_zone_32N"},
+    {32633, "PCS_WGS84_UTM_zone_33N"},
+    {32634, "PCS_WGS84_UTM_zone_34N"},
+    {32635, "PCS_WGS84_UTM_zone_35N"},
+    {32636, "PCS_WGS84_UTM_zone_36N"},
+    {32637, "PCS_WGS84_UTM_zone_37N"},
+    {32638, "PCS_WGS84_UTM_zone_38N"},
+    {32639, "PCS_WGS84_UTM_zone_39N"},
+    {32640, "PCS_WGS84_UTM_zone_40N"},
+    {32641, "PCS_WGS84_UTM_zone_41N"},
+    {32642, "PCS_WGS84_UTM_zone_42N"},
+    {32643, "PCS_WGS84_UTM_zone_43N"},
+    {32644, "PCS_WGS84_UTM_zone_44N"},
+    {32645, "PCS_WGS84_UTM_zone_45N"},
+    {32646, "PCS_WGS84_UTM_zone_46N"},
+    {32647, "PCS_WGS84_UTM_zone_47N"},
+    {32648, "PCS_WGS84_UTM_zone_48N"},
+    {32649, "PCS_WGS84_UTM_zone_49N"},
+    {32650, "PCS_WGS84_UTM_zone_50N"},
+    {32651, "PCS_WGS84_UTM_zone_51N"},
+    {32652, "PCS_WGS84_UTM_zone_52N"},
+    {32653, "PCS_WGS84_UTM_zone_53N"},
+    {32654, "PCS_WGS84_UTM_zone_54N"},
+    {32655, "PCS_WGS84_UTM_zone_55N"},
+    {32656, "PCS_WGS84_UTM_zone_56N"},
+    {32657, "PCS_WGS84_UTM_zone_57N"},
+    {32658, "PCS_WGS84_UTM_zone_58N"},
+    {32659, "PCS_WGS84_UTM_zone_59N"},
+    {32660, "PCS_WGS84_UTM_zone_60N"},
+    {32701, "PCS_WGS84_UTM_zone_1S"},
+    {32702, "PCS_WGS84_UTM_zone_2S"},
+    {32703, "PCS_WGS84_UTM_zone_3S"},
+    {32704, "PCS_WGS84_UTM_zone_4S"},
+    {32705, "PCS_WGS84_UTM_zone_5S"},
+    {32706, "PCS_WGS84_UTM_zone_6S"},
+    {32707, "PCS_WGS84_UTM_zone_7S"},
+    {32708, "PCS_WGS84_UTM_zone_8S"},
+    {32709, "PCS_WGS84_UTM_zone_9S"},
+    {32710, "PCS_WGS84_UTM_zone_10S"},
+    {32711, "PCS_WGS84_UTM_zone_11S"},
+    {32712, "PCS_WGS84_UTM_zone_12S"},
+    {32713, "PCS_WGS84_UTM_zone_13S"},
+    {32714, "PCS_WGS84_UTM_zone_14S"},
+    {32715, "PCS_WGS84_UTM_zone_15S"},
+    {32716, "PCS_WGS84_UTM_zone_16S"},
+    {32717, "PCS_WGS84_UTM_zone_17S"},
+    {32718, "PCS_WGS84_UTM_zone_18S"},
+    {32719, "PCS_WGS84_UTM_zone_19S"},
+    {32720, "PCS_WGS84_UTM_zone_20S"},
+    {32721, "PCS_WGS84_UTM_zone_21S"},
+    {32722, "PCS_WGS84_UTM_zone_22S"},
+    {32723, "PCS_WGS84_UTM_zone_23S"},
+    {32724, "PCS_WGS84_UTM_zone_24S"},
+    {32725, "PCS_WGS84_UTM_zone_25S"},
+    {32726, "PCS_WGS84_UTM_zone_26S"},
+    {32727, "PCS_WGS84_UTM_zone_27S"},
+    {32728, "PCS_WGS84_UTM_zone_28S"},
+    {32729, "PCS_WGS84_UTM_zone_29S"},
+    {32730, "PCS_WGS84_UTM_zone_30S"},
+    {32731, "PCS_WGS84_UTM_zone_31S"},
+    {32732, "PCS_WGS84_UTM_zone_32S"},
+    {32733, "PCS_WGS84_UTM_zone_33S"},
+    {32734, "PCS_WGS84_UTM_zone_34S"},
+    {32735, "PCS_WGS84_UTM_zone_35S"},
+    {32736, "PCS_WGS84_UTM_zone_36S"},
+    {32737, "PCS_WGS84_UTM_zone_37S"},
+    {32738, "PCS_WGS84_UTM_zone_38S"},
+    {32739, "PCS_WGS84_UTM_zone_39S"},
+    {32740, "PCS_WGS84_UTM_zone_40S"},
+    {32741, "PCS_WGS84_UTM_zone_41S"},
+    {32742, "PCS_WGS84_UTM_zone_42S"},
+    {32743, "PCS_WGS84_UTM_zone_43S"},
+    {32744, "PCS_WGS84_UTM_zone_44S"},
+    {32745, "PCS_WGS84_UTM_zone_45S"},
+    {32746, "PCS_WGS84_UTM_zone_46S"},
+    {32747, "PCS_WGS84_UTM_zone_47S"},
+    {32748, "PCS_WGS84_UTM_zone_48S"},
+    {32749, "PCS_WGS84_UTM_zone_49S"},
+    {32750, "PCS_WGS84_UTM_zone_50S"},
+    {32751, "PCS_WGS84_UTM_zone_51S"},
+    {32752, "PCS_WGS84_UTM_zone_52S"},
+    {32753, "PCS_WGS84_UTM_zone_53S"},
+    {32754, "PCS_WGS84_UTM_zone_54S"},
+    {32755, "PCS_WGS84_UTM_zone_55S"},
+    {32756, "PCS_WGS84_UTM_zone_56S"},
+    {32757, "PCS_WGS84_UTM_zone_57S"},
+    {32758, "PCS_WGS84_UTM_zone_58S"},
+    {32759, "PCS_WGS84_UTM_zone_59S"},
+    {32760, "PCS_WGS84_UTM_zone_60S"}
+};
+
+const TiffGeoTagKeyName ff_tiff_projection_codes[] = {
+    {10101, "Proj_Alabama_CS27_East"},
+    {10102, "Proj_Alabama_CS27_West"},
+    {10131, "Proj_Alabama_CS83_East"},
+    {10132, "Proj_Alabama_CS83_West"},
+    {10201, "Proj_Arizona_Coordinate_System_east"},
+    {10202, "Proj_Arizona_Coordinate_System_Central"},
+    {10203, "Proj_Arizona_Coordinate_System_west"},
+    {10231, "Proj_Arizona_CS83_east"},
+    {10232, "Proj_Arizona_CS83_Central"},
+    {10233, "Proj_Arizona_CS83_west"},
+    {10301, "Proj_Arkansas_CS27_North"},
+    {10302, "Proj_Arkansas_CS27_South"},
+    {10331, "Proj_Arkansas_CS83_North"},
+    {10332, "Proj_Arkansas_CS83_South"},
+    {10401, "Proj_California_CS27_I"},
+    {10402, "Proj_California_CS27_II"},
+    {10403, "Proj_California_CS27_III"},
+    {10404, "Proj_California_CS27_IV"},
+    {10405, "Proj_California_CS27_V"},
+    {10406, "Proj_California_CS27_VI"},
+    {10407, "Proj_California_CS27_VII"},
+    {10431, "Proj_California_CS83_1"},
+    {10432, "Proj_California_CS83_2"},
+    {10433, "Proj_California_CS83_3"},
+    {10434, "Proj_California_CS83_4"},
+    {10435, "Proj_California_CS83_5"},
+    {10436, "Proj_California_CS83_6"},
+    {10501, "Proj_Colorado_CS27_North"},
+    {10502, "Proj_Colorado_CS27_Central"},
+    {10503, "Proj_Colorado_CS27_South"},
+    {10531, "Proj_Colorado_CS83_North"},
+    {10532, "Proj_Colorado_CS83_Central"},
+    {10533, "Proj_Colorado_CS83_South"},
+    {10600, "Proj_Connecticut_CS27"},
+    {10630, "Proj_Connecticut_CS83"},
+    {10700, "Proj_Delaware_CS27"},
+    {10730, "Proj_Delaware_CS83"},
+    {10901, "Proj_Florida_CS27_East"},
+    {10902, "Proj_Florida_CS27_West"},
+    {10903, "Proj_Florida_CS27_North"},
+    {10931, "Proj_Florida_CS83_East"},
+    {10932, "Proj_Florida_CS83_West"},
+    {10933, "Proj_Florida_CS83_North"},
+    {11001, "Proj_Georgia_CS27_East"},
+    {11002, "Proj_Georgia_CS27_West"},
+    {11031, "Proj_Georgia_CS83_East"},
+    {11032, "Proj_Georgia_CS83_West"},
+    {11101, "Proj_Idaho_CS27_East"},
+    {11102, "Proj_Idaho_CS27_Central"},
+    {11103, "Proj_Idaho_CS27_West"},
+    {11131, "Proj_Idaho_CS83_East"},
+    {11132, "Proj_Idaho_CS83_Central"},
+    {11133, "Proj_Idaho_CS83_West"},
+    {11201, "Proj_Illinois_CS27_East"},
+    {11202, "Proj_Illinois_CS27_West"},
+    {11231, "Proj_Illinois_CS83_East"},
+    {11232, "Proj_Illinois_CS83_West"},
+    {11301, "Proj_Indiana_CS27_East"},
+    {11302, "Proj_Indiana_CS27_West"},
+    {11331, "Proj_Indiana_CS83_East"},
+    {11332, "Proj_Indiana_CS83_West"},
+    {11401, "Proj_Iowa_CS27_North"},
+    {11402, "Proj_Iowa_CS27_South"},
+    {11431, "Proj_Iowa_CS83_North"},
+    {11432, "Proj_Iowa_CS83_South"},
+    {11501, "Proj_Kansas_CS27_North"},
+    {11502, "Proj_Kansas_CS27_South"},
+    {11531, "Proj_Kansas_CS83_North"},
+    {11532, "Proj_Kansas_CS83_South"},
+    {11601, "Proj_Kentucky_CS27_North"},
+    {11602, "Proj_Kentucky_CS27_South"},
+    {11631, "Proj_Kentucky_CS83_North"},
+    {11632, "Proj_Kentucky_CS83_South"},
+    {11701, "Proj_Louisiana_CS27_North"},
+    {11702, "Proj_Louisiana_CS27_South"},
+    {11731, "Proj_Louisiana_CS83_North"},
+    {11732, "Proj_Louisiana_CS83_South"},
+    {11801, "Proj_Maine_CS27_East"},
+    {11802, "Proj_Maine_CS27_West"},
+    {11831, "Proj_Maine_CS83_East"},
+    {11832, "Proj_Maine_CS83_West"},
+    {11900, "Proj_Maryland_CS27"},
+    {11930, "Proj_Maryland_CS83"},
+    {12001, "Proj_Massachusetts_CS27_Mainland"},
+    {12002, "Proj_Massachusetts_CS27_Island"},
+    {12031, "Proj_Massachusetts_CS83_Mainland"},
+    {12032, "Proj_Massachusetts_CS83_Island"},
+    {12101, "Proj_Michigan_State_Plane_East"},
+    {12102, "Proj_Michigan_State_Plane_Old_Central"},
+    {12103, "Proj_Michigan_State_Plane_West"},
+    {12111, "Proj_Michigan_CS27_North"},
+    {12112, "Proj_Michigan_CS27_Central"},
+    {12113, "Proj_Michigan_CS27_South"},
+    {12141, "Proj_Michigan_CS83_North"},
+    {12142, "Proj_Michigan_CS83_Central"},
+    {12143, "Proj_Michigan_CS83_South"},
+    {12201, "Proj_Minnesota_CS27_North"},
+    {12202, "Proj_Minnesota_CS27_Central"},
+    {12203, "Proj_Minnesota_CS27_South"},
+    {12231, "Proj_Minnesota_CS83_North"},
+    {12232, "Proj_Minnesota_CS83_Central"},
+    {12233, "Proj_Minnesota_CS83_South"},
+    {12301, "Proj_Mississippi_CS27_East"},
+    {12302, "Proj_Mississippi_CS27_West"},
+    {12331, "Proj_Mississippi_CS83_East"},
+    {12332, "Proj_Mississippi_CS83_West"},
+    {12401, "Proj_Missouri_CS27_East"},
+    {12402, "Proj_Missouri_CS27_Central"},
+    {12403, "Proj_Missouri_CS27_West"},
+    {12431, "Proj_Missouri_CS83_East"},
+    {12432, "Proj_Missouri_CS83_Central"},
+    {12433, "Proj_Missouri_CS83_West"},
+    {12501, "Proj_Montana_CS27_North"},
+    {12502, "Proj_Montana_CS27_Central"},
+    {12503, "Proj_Montana_CS27_South"},
+    {12530, "Proj_Montana_CS83"},
+    {12601, "Proj_Nebraska_CS27_North"},
+    {12602, "Proj_Nebraska_CS27_South"},
+    {12630, "Proj_Nebraska_CS83"},
+    {12701, "Proj_Nevada_CS27_East"},
+    {12702, "Proj_Nevada_CS27_Central"},
+    {12703, "Proj_Nevada_CS27_West"},
+    {12731, "Proj_Nevada_CS83_East"},
+    {12732, "Proj_Nevada_CS83_Central"},
+    {12733, "Proj_Nevada_CS83_West"},
+    {12800, "Proj_New_Hampshire_CS27"},
+    {12830, "Proj_New_Hampshire_CS83"},
+    {12900, "Proj_New_Jersey_CS27"},
+    {12930, "Proj_New_Jersey_CS83"},
+    {13001, "Proj_New_Mexico_CS27_East"},
+    {13002, "Proj_New_Mexico_CS27_Central"},
+    {13003, "Proj_New_Mexico_CS27_West"},
+    {13031, "Proj_New_Mexico_CS83_East"},
+    {13032, "Proj_New_Mexico_CS83_Central"},
+    {13033, "Proj_New_Mexico_CS83_West"},
+    {13101, "Proj_New_York_CS27_East"},
+    {13102, "Proj_New_York_CS27_Central"},
+    {13103, "Proj_New_York_CS27_West"},
+    {13104, "Proj_New_York_CS27_Long_Island"},
+    {13131, "Proj_New_York_CS83_East"},
+    {13132, "Proj_New_York_CS83_Central"},
+    {13133, "Proj_New_York_CS83_West"},
+    {13134, "Proj_New_York_CS83_Long_Island"},
+    {13200, "Proj_North_Carolina_CS27"},
+    {13230, "Proj_North_Carolina_CS83"},
+    {13301, "Proj_North_Dakota_CS27_North"},
+    {13302, "Proj_North_Dakota_CS27_South"},
+    {13331, "Proj_North_Dakota_CS83_North"},
+    {13332, "Proj_North_Dakota_CS83_South"},
+    {13401, "Proj_Ohio_CS27_North"},
+    {13402, "Proj_Ohio_CS27_South"},
+    {13431, "Proj_Ohio_CS83_North"},
+    {13432, "Proj_Ohio_CS83_South"},
+    {13501, "Proj_Oklahoma_CS27_North"},
+    {13502, "Proj_Oklahoma_CS27_South"},
+    {13531, "Proj_Oklahoma_CS83_North"},
+    {13532, "Proj_Oklahoma_CS83_South"},
+    {13601, "Proj_Oregon_CS27_North"},
+    {13602, "Proj_Oregon_CS27_South"},
+    {13631, "Proj_Oregon_CS83_North"},
+    {13632, "Proj_Oregon_CS83_South"},
+    {13701, "Proj_Pennsylvania_CS27_North"},
+    {13702, "Proj_Pennsylvania_CS27_South"},
+    {13731, "Proj_Pennsylvania_CS83_North"},
+    {13732, "Proj_Pennsylvania_CS83_South"},
+    {13800, "Proj_Rhode_Island_CS27"},
+    {13830, "Proj_Rhode_Island_CS83"},
+    {13901, "Proj_South_Carolina_CS27_North"},
+    {13902, "Proj_South_Carolina_CS27_South"},
+    {13930, "Proj_South_Carolina_CS83"},
+    {14001, "Proj_South_Dakota_CS27_North"},
+    {14002, "Proj_South_Dakota_CS27_South"},
+    {14031, "Proj_South_Dakota_CS83_North"},
+    {14032, "Proj_South_Dakota_CS83_South"},
+    {14100, "Proj_Tennessee_CS27"},
+    {14130, "Proj_Tennessee_CS83"},
+    {14201, "Proj_Texas_CS27_North"},
+    {14202, "Proj_Texas_CS27_North_Central"},
+    {14203, "Proj_Texas_CS27_Central"},
+    {14204, "Proj_Texas_CS27_South_Central"},
+    {14205, "Proj_Texas_CS27_South"},
+    {14231, "Proj_Texas_CS83_North"},
+    {14232, "Proj_Texas_CS83_North_Central"},
+    {14233, "Proj_Texas_CS83_Central"},
+    {14234, "Proj_Texas_CS83_South_Central"},
+    {14235, "Proj_Texas_CS83_South"},
+    {14301, "Proj_Utah_CS27_North"},
+    {14302, "Proj_Utah_CS27_Central"},
+    {14303, "Proj_Utah_CS27_South"},
+    {14331, "Proj_Utah_CS83_North"},
+    {14332, "Proj_Utah_CS83_Central"},
+    {14333, "Proj_Utah_CS83_South"},
+    {14400, "Proj_Vermont_CS27"},
+    {14430, "Proj_Vermont_CS83"},
+    {14501, "Proj_Virginia_CS27_North"},
+    {14502, "Proj_Virginia_CS27_South"},
+    {14531, "Proj_Virginia_CS83_North"},
+    {14532, "Proj_Virginia_CS83_South"},
+    {14601, "Proj_Washington_CS27_North"},
+    {14602, "Proj_Washington_CS27_South"},
+    {14631, "Proj_Washington_CS83_North"},
+    {14632, "Proj_Washington_CS83_South"},
+    {14701, "Proj_West_Virginia_CS27_North"},
+    {14702, "Proj_West_Virginia_CS27_South"},
+    {14731, "Proj_West_Virginia_CS83_North"},
+    {14732, "Proj_West_Virginia_CS83_South"},
+    {14801, "Proj_Wisconsin_CS27_North"},
+    {14802, "Proj_Wisconsin_CS27_Central"},
+    {14803, "Proj_Wisconsin_CS27_South"},
+    {14831, "Proj_Wisconsin_CS83_North"},
+    {14832, "Proj_Wisconsin_CS83_Central"},
+    {14833, "Proj_Wisconsin_CS83_South"},
+    {14901, "Proj_Wyoming_CS27_East"},
+    {14902, "Proj_Wyoming_CS27_East_Central"},
+    {14903, "Proj_Wyoming_CS27_West_Central"},
+    {14904, "Proj_Wyoming_CS27_West"},
+    {14931, "Proj_Wyoming_CS83_East"},
+    {14932, "Proj_Wyoming_CS83_East_Central"},
+    {14933, "Proj_Wyoming_CS83_West_Central"},
+    {14934, "Proj_Wyoming_CS83_West"},
+    {15001, "Proj_Alaska_CS27_1"},
+    {15002, "Proj_Alaska_CS27_2"},
+    {15003, "Proj_Alaska_CS27_3"},
+    {15004, "Proj_Alaska_CS27_4"},
+    {15005, "Proj_Alaska_CS27_5"},
+    {15006, "Proj_Alaska_CS27_6"},
+    {15007, "Proj_Alaska_CS27_7"},
+    {15008, "Proj_Alaska_CS27_8"},
+    {15009, "Proj_Alaska_CS27_9"},
+    {15010, "Proj_Alaska_CS27_10"},
+    {15031, "Proj_Alaska_CS83_1"},
+    {15032, "Proj_Alaska_CS83_2"},
+    {15033, "Proj_Alaska_CS83_3"},
+    {15034, "Proj_Alaska_CS83_4"},
+    {15035, "Proj_Alaska_CS83_5"},
+    {15036, "Proj_Alaska_CS83_6"},
+    {15037, "Proj_Alaska_CS83_7"},
+    {15038, "Proj_Alaska_CS83_8"},
+    {15039, "Proj_Alaska_CS83_9"},
+    {15040, "Proj_Alaska_CS83_10"},
+    {15101, "Proj_Hawaii_CS27_1"},
+    {15102, "Proj_Hawaii_CS27_2"},
+    {15103, "Proj_Hawaii_CS27_3"},
+    {15104, "Proj_Hawaii_CS27_4"},
+    {15105, "Proj_Hawaii_CS27_5"},
+    {15131, "Proj_Hawaii_CS83_1"},
+    {15132, "Proj_Hawaii_CS83_2"},
+    {15133, "Proj_Hawaii_CS83_3"},
+    {15134, "Proj_Hawaii_CS83_4"},
+    {15135, "Proj_Hawaii_CS83_5"},
+    {15201, "Proj_Puerto_Rico_CS27"},
+    {15202, "Proj_St_Croix"},
+    {15230, "Proj_Puerto_Rico_Virgin_Is"},
+    {15914, "Proj_BLM_14N_feet"},
+    {15915, "Proj_BLM_15N_feet"},
+    {15916, "Proj_BLM_16N_feet"},
+    {15917, "Proj_BLM_17N_feet"},
+    {17348, "Proj_Map_Grid_of_Australia_48"},
+    {17349, "Proj_Map_Grid_of_Australia_49"},
+    {17350, "Proj_Map_Grid_of_Australia_50"},
+    {17351, "Proj_Map_Grid_of_Australia_51"},
+    {17352, "Proj_Map_Grid_of_Australia_52"},
+    {17353, "Proj_Map_Grid_of_Australia_53"},
+    {17354, "Proj_Map_Grid_of_Australia_54"},
+    {17355, "Proj_Map_Grid_of_Australia_55"},
+    {17356, "Proj_Map_Grid_of_Australia_56"},
+    {17357, "Proj_Map_Grid_of_Australia_57"},
+    {17358, "Proj_Map_Grid_of_Australia_58"},
+    {17448, "Proj_Australian_Map_Grid_48"},
+    {17449, "Proj_Australian_Map_Grid_49"},
+    {17450, "Proj_Australian_Map_Grid_50"},
+    {17451, "Proj_Australian_Map_Grid_51"},
+    {17452, "Proj_Australian_Map_Grid_52"},
+    {17453, "Proj_Australian_Map_Grid_53"},
+    {17454, "Proj_Australian_Map_Grid_54"},
+    {17455, "Proj_Australian_Map_Grid_55"},
+    {17456, "Proj_Australian_Map_Grid_56"},
+    {17457, "Proj_Australian_Map_Grid_57"},
+    {17458, "Proj_Australian_Map_Grid_58"},
+    {18031, "Proj_Argentina_1"},
+    {18032, "Proj_Argentina_2"},
+    {18033, "Proj_Argentina_3"},
+    {18034, "Proj_Argentina_4"},
+    {18035, "Proj_Argentina_5"},
+    {18036, "Proj_Argentina_6"},
+    {18037, "Proj_Argentina_7"},
+    {18051, "Proj_Colombia_3W"},
+    {18052, "Proj_Colombia_Bogota"},
+    {18053, "Proj_Colombia_3E"},
+    {18054, "Proj_Colombia_6E"},
+    {18072, "Proj_Egypt_Red_Belt"},
+    {18073, "Proj_Egypt_Purple_Belt"},
+    {18074, "Proj_Extended_Purple_Belt"},
+    {18141, "Proj_New_Zealand_North_Island_Nat_Grid"},
+    {18142, "Proj_New_Zealand_South_Island_Nat_Grid"},
+    {19900, "Proj_Bahrain_Grid"},
+    {19905, "Proj_Netherlands_E_Indies_Equatorial"},
+    {19912, "Proj_RSO_Borneo"}
+};
+
+const char *const ff_tiff_coord_trans_codes[] = {
+    "CT_TransverseMercator",
+    "CT_TransvMercator_Modified_Alaska",
+    "CT_ObliqueMercator",
+    "CT_ObliqueMercator_Laborde",
+    "CT_ObliqueMercator_Rosenmund",
+    "CT_ObliqueMercator_Spherical",
+    "CT_Mercator",
+    "CT_LambertConfConic_2SP",
+    "CT_LambertConfConic_Helmert",
+    "CT_LambertAzimEqualArea",
+    "CT_AlbersEqualArea",
+    "CT_AzimuthalEquidistant",
+    "CT_EquidistantConic",
+    "CT_Stereographic",
+    "CT_PolarStereographic",
+    "CT_ObliqueStereographic",
+    "CT_Equirectangular",
+    "CT_CassiniSoldner",
+    "CT_Gnomonic",
+    "CT_MillerCylindrical",
+    "CT_Orthographic",
+    "CT_Polyconic",
+    "CT_Robinson",
+    "CT_Sinusoidal",
+    "CT_VanDerGrinten",
+    "CT_NewZealandMapGrid",
+    "CT_TransvMercator_SouthOriented"
+};
+
+const char *const ff_tiff_vert_cs_codes[] = {
+    "VertCS_Airy_1830_ellipsoid",
+    "VertCS_Airy_Modified_1849_ellipsoid",
+    "VertCS_ANS_ellipsoid",
+    "VertCS_Bessel_1841_ellipsoid",
+    "VertCS_Bessel_Modified_ellipsoid",
+    "VertCS_Bessel_Namibia_ellipsoid",
+    "VertCS_Clarke_1858_ellipsoid",
+    "VertCS_Clarke_1866_ellipsoid",
+    "VertCS_Clarke_1880_Benoit_ellipsoid",
+    "VertCS_Clarke_1880_IGN_ellipsoid",
+    "VertCS_Clarke_1880_RGS_ellipsoid",
+    "VertCS_Clarke_1880_Arc_ellipsoid",
+    "VertCS_Clarke_1880_SGA_1922_ellipsoid",
+    "VertCS_Everest_1830_1937_Adjustment_ellipsoid",
+    "VertCS_Everest_1830_1967_Definition_ellipsoid",
+    "VertCS_Everest_1830_1975_Definition_ellipsoid",
+    "VertCS_Everest_1830_Modified_ellipsoid",
+    "VertCS_GRS_1980_ellipsoid",
+    "VertCS_Helmert_1906_ellipsoid",
+    "VertCS_INS_ellipsoid",
+    "VertCS_International_1924_ellipsoid",
+    "VertCS_International_1967_ellipsoid",
+    "VertCS_Krassowsky_1940_ellipsoid",
+    "VertCS_NWL_9D_ellipsoid",
+    "VertCS_NWL_10D_ellipsoid",
+    "VertCS_Plessis_1817_ellipsoid",
+    "VertCS_Struve_1860_ellipsoid",
+    "VertCS_War_Office_ellipsoid",
+    "VertCS_WGS_84_ellipsoid",
+    "VertCS_GEM_10C_ellipsoid",
+    "VertCS_OSU86F_ellipsoid",
+    "VertCS_OSU91A_ellipsoid"
+};
+
+const char *const ff_tiff_ortho_vert_cs_codes[] = {
+    "VertCS_Newlyn",
+    "VertCS_North_American_Vertical_Datum_1929",
+    "VertCS_North_American_Vertical_Datum_1988",
+    "VertCS_Yellow_Sea_1956",
+    "VertCS_Baltic_Sea",
+    "VertCS_Caspian_Sea"
+};
diff --git a/libavcodec/tiff_data.h b/libavcodec/tiff_data.h
new file mode 100644
index 0000000..57515f9
--- /dev/null
+++ b/libavcodec/tiff_data.h
@@ -0,0 +1,92 @@
+/*
+ * TIFF data tables
+ * Copyright (c) 2011 Thomas Kuehnel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF data tables
+ * @author Thomas Kuehnel
+ * @see GeoTIFF specification at
+ * http://www.remotesensing.org/geotiff/spec/geotiffhome.html
+ */
+
+#ifndef AVCODEC_TIFF_DATA_H
+#define AVCODEC_TIFF_DATA_H
+
+#include "tiff.h"
+
+#define TIFF_CONF_KEY_ID_OFFSET 1024
+extern const TiffGeoTagNameType ff_tiff_conf_name_type_map[3];
+
+#define TIFF_GEOG_KEY_ID_OFFSET 2048
+extern const TiffGeoTagNameType ff_tiff_geog_name_type_map[14];
+
+#define TIFF_PROJ_KEY_ID_OFFSET 3072
+extern const TiffGeoTagNameType ff_tiff_proj_name_type_map[24];
+
+#define TIFF_VERT_KEY_ID_OFFSET 4096
+extern const TiffGeoTagNameType ff_tiff_vert_name_type_map[4];
+
+#define TIFF_GEO_KEY_UNDEFINED    0
+#define TIFF_GEO_KEY_USER_DEFINED 32767
+
+#define TIFF_GT_MODEL_TYPE_OFFSET 1
+extern const char *const ff_tiff_gt_model_type_codes[3];
+
+#define TIFF_GT_RASTER_TYPE_OFFSET 1
+extern const char *const ff_tiff_gt_raster_type_codes[2];
+
+#define TIFF_LINEAR_UNIT_OFFSET 9001
+extern const char *const ff_tiff_linear_unit_codes[15];
+
+#define TIFF_ANGULAR_UNIT_OFFSET 9101
+extern const char *const ff_tiff_angular_unit_codes[8];
+
+#define TIFF_GCS_TYPE_OFFSET 4201
+extern const char *const ff_tiff_gcs_type_codes[133];
+
+#define TIFF_GCSE_TYPE_OFFSET 4001
+extern const char *const ff_tiff_gcse_type_codes[35];
+
+#define TIFF_GEODETIC_DATUM_OFFSET 6201
+extern const char *const ff_tiff_geodetic_datum_codes[120];
+
+#define TIFF_GEODETIC_DATUM_E_OFFSET 6001
+extern const char *const ff_tiff_geodetic_datum_e_codes[35];
+
+#define TIFF_ELLIPSOID_OFFSET 7001
+extern const char *const ff_tiff_ellipsoid_codes[35];
+
+#define TIFF_PRIME_MERIDIAN_OFFSET 8901
+extern const char *const ff_tiff_prime_meridian_codes[11];
+
+extern const TiffGeoTagKeyName ff_tiff_proj_cs_type_codes[978];
+
+extern const TiffGeoTagKeyName ff_tiff_projection_codes[298];
+
+#define TIFF_COORD_TRANS_OFFSET 1
+extern const char *const ff_tiff_coord_trans_codes[27];
+
+#define TIFF_VERT_CS_OFFSET 5001
+extern const char *const ff_tiff_vert_cs_codes[32];
+
+#define TIFF_ORTHO_VERT_CS_OFFSET 5101
+extern const char *const ff_tiff_ortho_vert_cs_codes[6];
+#endif
diff --git a/libavcodec/tiffenc.c b/libavcodec/tiffenc.c
index 8791c54..f59816e 100644
--- a/libavcodec/tiffenc.c
+++ b/libavcodec/tiffenc.c
@@ -2,20 +2,20 @@
  * TIFF image encoder
  * Copyright (c) 2007 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,11 +30,13 @@
 #include <zlib.h>
 #endif
 
+#include "libavutil/imgutils.h"
 #include "libavutil/log.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
 #include "lzw.h"
 #include "put_bits.h"
 #include "rle.h"
@@ -43,8 +45,8 @@
 #define TIFF_MAX_ENTRY 32
 
 /** sizes of various TIFF field types (string size = 1)*/
-static const uint8_t type_sizes2[6] = {
-    0, 1, 1, 2, 4, 8
+static const uint8_t type_sizes2[14] = {
+    0, 1, 1, 2, 4, 8, 1, 1, 2, 4, 8, 4, 8, 4
 };
 
 typedef struct TiffEncoderContext {
@@ -58,6 +60,12 @@ typedef struct TiffEncoderContext {
     int bpp_tab_size;                       ///< bpp_tab size
     enum TiffPhotometric photometric_interpretation;  ///< photometric interpretation
     int strips;                             ///< number of strips
+    uint32_t *strip_sizes;
+    unsigned int strip_sizes_size;
+    uint32_t *strip_offsets;
+    unsigned int strip_offsets_size;
+    uint8_t *yuv_line;
+    unsigned int yuv_line_size;
     int rps;                                ///< row per strip
     uint8_t entries[TIFF_MAX_ENTRY * 12];   ///< entries in header
     int num_entries;                        ///< number of entries
@@ -66,10 +74,12 @@ typedef struct TiffEncoderContext {
     int buf_size;                           ///< buffer size
     uint16_t subsampling[2];                ///< YUV subsampling factors
     struct LZWEncodeState *lzws;            ///< LZW encode state
+    uint32_t dpi;                           ///< image resolution in DPI
 } TiffEncoderContext;
 
 /**
- * Check free space in buffer
+ * Check free space in buffer.
+ *
  * @param s Tiff context
  * @param need Needed bytes
  * @return 0 - ok, 1 - no free space
@@ -85,13 +95,13 @@ static inline int check_size(TiffEncoderContext *s, uint64_t need)
 }
 
 /**
- * Put n values to buffer
+ * Put n values to buffer.
  *
- * @param p Pointer to pointer to output buffer
- * @param n Number of values
- * @param val Pointer to values
- * @param type Type of values
- * @param flip =0 - normal copy, >0 - flip
+ * @param p pointer to pointer to output buffer
+ * @param n number of values
+ * @param val pointer to values
+ * @param type type of values
+ * @param flip = 0 - normal copy, >0 - flip
  */
 static void tnput(uint8_t **p, int n, const uint8_t *val, enum TiffTypes type,
                   int flip)
@@ -106,28 +116,29 @@ static void tnput(uint8_t **p, int n, const uint8_t *val, enum TiffTypes type,
 
 /**
  * Add entry to directory in tiff header.
+ *
  * @param s Tiff context
- * @param tag Tag that identifies the entry
- * @param type Entry type
- * @param count The number of values
- * @param ptr_val Pointer to values
+ * @param tag tag that identifies the entry
+ * @param type entry type
+ * @param count the number of values
+ * @param ptr_val pointer to values
  */
 static int add_entry(TiffEncoderContext *s, enum TiffTags tag,
                      enum TiffTypes type, int count, const void *ptr_val)
 {
     uint8_t *entries_ptr = s->entries + 12 * s->num_entries;
 
-    assert(s->num_entries < TIFF_MAX_ENTRY);
+    av_assert0(s->num_entries < TIFF_MAX_ENTRY);
 
     bytestream_put_le16(&entries_ptr, tag);
     bytestream_put_le16(&entries_ptr, type);
     bytestream_put_le32(&entries_ptr, count);
 
-    if (type_sizes[type] * count <= 4) {
+    if (type_sizes[type] * (int64_t)count <= 4) {
         tnput(&entries_ptr, count, ptr_val, type, 0);
     } else {
         bytestream_put_le32(&entries_ptr, *s->buf - s->buf_start);
-        if (check_size(s, count * type_sizes2[type]))
+        if (check_size(s, count * (int64_t)type_sizes2[type]))
             return AVERROR_INVALIDDATA;
         tnput(s->buf, count, ptr_val, type, 0);
     }
@@ -146,14 +157,14 @@ static int add_entry1(TiffEncoderContext *s,
 }
 
 /**
- * Encode one strip in tiff file
+ * Encode one strip in tiff file.
  *
  * @param s Tiff context
- * @param src Input buffer
- * @param dst Output buffer
- * @param n Size of input buffer
- * @param compr Compression method
- * @return Number of output bytes. If an output error is encountered, a negative
+ * @param src input buffer
+ * @param dst output buffer
+ * @param n size of input buffer
+ * @param compr compression method
+ * @return number of output bytes. If an output error is encountered, a negative
  * value corresponding to an AVERROR error code is returned.
  */
 static int encode_strip(TiffEncoderContext *s, const int8_t *src,
@@ -167,7 +178,7 @@ static int encode_strip(TiffEncoderContext *s, const int8_t *src,
         unsigned long zlen = s->buf_size - (*s->buf - s->buf_start);
         if (compress(dst, &zlen, src, n) != Z_OK) {
             av_log(s->avctx, AV_LOG_ERROR, "Compressing failed\n");
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
         }
         return zlen;
     }
@@ -183,6 +194,8 @@ static int encode_strip(TiffEncoderContext *s, const int8_t *src,
     case TIFF_LZW:
         return ff_lzw_encode(s->lzws, src, n);
     default:
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported compression method: %d\n",
+               compr);
         return AVERROR(EINVAL);
     }
 }
@@ -194,13 +207,24 @@ static void pack_yuv(TiffEncoderContext *s, const AVFrame *p,
     int w       = (s->width - 1) / s->subsampling[0] + 1;
     uint8_t *pu = &p->data[1][lnum / s->subsampling[1] * p->linesize[1]];
     uint8_t *pv = &p->data[2][lnum / s->subsampling[1] * p->linesize[2]];
-    for (i = 0; i < w; i++) {
-        for (j = 0; j < s->subsampling[1]; j++)
-            for (k = 0; k < s->subsampling[0]; k++)
-                *dst++ = p->data[0][(lnum + j) * p->linesize[0] +
-                                    i * s->subsampling[0] + k];
-        *dst++ = *pu++;
-        *dst++ = *pv++;
+    if (s->width % s->subsampling[0] || s->height % s->subsampling[1]) {
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    *dst++ = p->data[0][FFMIN(lnum + j, s->height-1) * p->linesize[0] +
+                                        FFMIN(i * s->subsampling[0] + k, s->width-1)];
+            *dst++ = *pu++;
+            *dst++ = *pv++;
+        }
+    }else{
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    *dst++ = p->data[0][(lnum + j) * p->linesize[0] +
+                                        i * s->subsampling[0] + k];
+            *dst++ = *pu++;
+            *dst++ = *pv++;
+        }
     }
 }
 
@@ -209,86 +233,78 @@ static void pack_yuv(TiffEncoderContext *s, const AVFrame *p,
         ret = add_entry(s, tag, type, count, ptr_val);  \
         if (ret < 0)                                    \
             goto fail;                                  \
-    } while(0);
+    } while (0)
 
 #define ADD_ENTRY1(s, tag, type, val)           \
     do {                                        \
         ret = add_entry1(s, tag, type, val);    \
         if (ret < 0)                            \
             goto fail;                          \
-    } while(0);
+    } while (0)
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pict, int *got_packet)
 {
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
     TiffEncoderContext *s = avctx->priv_data;
     const AVFrame *const p = pict;
     int i;
     uint8_t *ptr;
     uint8_t *offset;
     uint32_t strips;
-    uint32_t *strip_sizes   = NULL;
-    uint32_t *strip_offsets = NULL;
     int bytes_per_row;
-    uint32_t res[2]    = { 72, 1 };     // image resolution (72/1)
-    uint16_t bpp_tab[] = { 8, 8, 8, 8 };
+    uint32_t res[2] = { s->dpi, 1 };    // image resolution (72/1)
+    uint16_t bpp_tab[4];
     int ret = 0;
-    int is_yuv = 0;
-    uint8_t *yuv_line = NULL;
+    int is_yuv = 0, alpha = 0;
     int shift_h, shift_v;
     int packet_size;
-    const AVPixFmtDescriptor *pfd;
-
-    s->avctx = avctx;
 
     s->width          = avctx->width;
     s->height         = avctx->height;
     s->subsampling[0] = 1;
     s->subsampling[1] = 1;
 
+    if (!desc)
+        return AVERROR(EINVAL);
+
+    avctx->bits_per_coded_sample =
+    s->bpp          = av_get_bits_per_pixel(desc);
+    s->bpp_tab_size = desc->nb_components;
+
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_RGBA64LE:
-    case AV_PIX_FMT_RGB48LE:
-    case AV_PIX_FMT_GRAY16LE:
     case AV_PIX_FMT_RGBA:
+        alpha = 1;
+    case AV_PIX_FMT_RGB48LE:
     case AV_PIX_FMT_RGB24:
-    case AV_PIX_FMT_GRAY8:
-    case AV_PIX_FMT_PAL8:
-        pfd = av_pix_fmt_desc_get(avctx->pix_fmt);
-        if (!pfd)
-            return AVERROR_BUG;
-        s->bpp = av_get_bits_per_pixel(pfd);
-        if (pfd->flags & AV_PIX_FMT_FLAG_PAL)
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_PALETTE;
-        else if (pfd->flags & AV_PIX_FMT_FLAG_RGB)
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_RGB;
-        else
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_BLACK_IS_ZERO;
-        s->bpp_tab_size = pfd->nb_components;
-        for (i = 0; i < s->bpp_tab_size; i++)
-            bpp_tab[i] = s->bpp / s->bpp_tab_size;
+        s->photometric_interpretation = TIFF_PHOTOMETRIC_RGB;
         break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->bits_per_coded_sample = 0x28;
+    case AV_PIX_FMT_GRAY8A:
+    case AV_PIX_FMT_YA16LE:
+        alpha = avctx->pix_fmt == AV_PIX_FMT_GRAY8A || avctx->pix_fmt == AV_PIX_FMT_YA16LE;
+    case AV_PIX_FMT_GRAY16LE:
     case AV_PIX_FMT_MONOBLACK:
-        s->bpp                        = 1;
         s->photometric_interpretation = TIFF_PHOTOMETRIC_BLACK_IS_ZERO;
-        s->bpp_tab_size               = 0;
+        break;
+    case AV_PIX_FMT_PAL8:
+        s->photometric_interpretation = TIFF_PHOTOMETRIC_PALETTE;
         break;
     case AV_PIX_FMT_MONOWHITE:
-        s->bpp                        = 1;
         s->photometric_interpretation = TIFF_PHOTOMETRIC_WHITE_IS_ZERO;
-        s->bpp_tab_size               = 0;
         break;
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV410P:
     case AV_PIX_FMT_YUV411P:
         av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &shift_h, &shift_v);
         s->photometric_interpretation = TIFF_PHOTOMETRIC_YCBCR;
-        s->bpp                        = 8 + (16 >> (shift_h + shift_v));
         s->subsampling[0]             = 1 << shift_h;
         s->subsampling[1]             = 1 << shift_v;
-        s->bpp_tab_size               = 3;
         is_yuv                        = 1;
         break;
     default:
@@ -297,6 +313,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         return AVERROR(EINVAL);
     }
 
+    for (i = 0; i < s->bpp_tab_size; i++)
+        bpp_tab[i] = desc->comp[i].depth;
+
     if (s->compr == TIFF_DEFLATE       ||
         s->compr == TIFF_ADOBE_DEFLATE ||
         s->compr == TIFF_LZW)
@@ -310,14 +329,13 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     strips = (s->height - 1) / s->rps + 1;
 
-    packet_size = avctx->height * ((avctx->width * s->bpp + 7) >> 3) * 2 +
+    bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
+                     s->subsampling[0] * s->subsampling[1] + 7) >> 3;
+    packet_size = avctx->height * bytes_per_row * 2 +
                   avctx->height * 4 + AV_INPUT_BUFFER_MIN_SIZE;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, packet_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, packet_size, 0)) < 0)
         return ret;
-    }
     ptr          = pkt->data;
     s->buf_start = pkt->data;
     s->buf       = &ptr;
@@ -335,18 +353,21 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     offset = ptr;
     bytestream_put_le32(&ptr, 0);
 
-    strip_sizes   = av_mallocz_array(strips, sizeof(*strip_sizes));
-    strip_offsets = av_mallocz_array(strips, sizeof(*strip_offsets));
-    if (!strip_sizes || !strip_offsets) {
+    if (strips > INT_MAX / FFMAX(sizeof(s->strip_sizes[0]), sizeof(s->strip_offsets[0]))) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    av_fast_padded_mallocz(&s->strip_sizes  , &s->strip_sizes_size  , sizeof(s->strip_sizes  [0]) * strips);
+    av_fast_padded_mallocz(&s->strip_offsets, &s->strip_offsets_size, sizeof(s->strip_offsets[0]) * strips);
+
+    if (!s->strip_sizes || !s->strip_offsets) {
         ret = AVERROR(ENOMEM);
         goto fail;
     }
 
-    bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
-                     s->subsampling[0] * s->subsampling[1] + 7) >> 3;
     if (is_yuv) {
-        yuv_line = av_malloc(bytes_per_row);
-        if (!yuv_line) {
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, bytes_per_row);
+        if (s->yuv_line == NULL) {
             av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
             ret = AVERROR(ENOMEM);
             goto fail;
@@ -365,12 +386,12 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             ret = AVERROR(ENOMEM);
             goto fail;
         }
-        strip_offsets[0] = ptr - pkt->data;
+        s->strip_offsets[0] = ptr - pkt->data;
         zn               = 0;
         for (j = 0; j < s->rps; j++) {
             if (is_yuv) {
-                pack_yuv(s, p, yuv_line, j);
-                memcpy(zbuf + zn, yuv_line, bytes_per_row);
+                pack_yuv(s, p, s->yuv_line, j);
+                memcpy(zbuf + zn, s->yuv_line, bytes_per_row);
                 j += s->subsampling[1] - 1;
             } else
                 memcpy(zbuf + j * bytes_per_row,
@@ -384,9 +405,10 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             goto fail;
         }
         ptr           += ret;
-        strip_sizes[0] = ptr - pkt->data - strip_offsets[0];
+        s->strip_sizes[0] = ptr - pkt->data - s->strip_offsets[0];
     } else
 #endif
+    {
     if (s->compr == TIFF_LZW) {
         s->lzws = av_malloc(ff_lzw_encode_state_size);
         if (!s->lzws) {
@@ -395,17 +417,17 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
     for (i = 0; i < s->height; i++) {
-        if (strip_sizes[i / s->rps] == 0) {
+        if (s->strip_sizes[i / s->rps] == 0) {
             if (s->compr == TIFF_LZW) {
                 ff_lzw_encode_init(s->lzws, ptr,
                                    s->buf_size - (*s->buf - s->buf_start),
                                    12, FF_LZW_TIFF, put_bits);
             }
-            strip_offsets[i / s->rps] = ptr - pkt->data;
+            s->strip_offsets[i / s->rps] = ptr - pkt->data;
         }
         if (is_yuv) {
-            pack_yuv(s, p, yuv_line, i);
-            ret = encode_strip(s, yuv_line, ptr, bytes_per_row, s->compr);
+            pack_yuv(s, p, s->yuv_line, i);
+            ret = encode_strip(s, s->yuv_line, ptr, bytes_per_row, s->compr);
             i  += s->subsampling[1] - 1;
         } else
             ret = encode_strip(s, p->data[0] + i * p->linesize[0],
@@ -414,17 +436,18 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             av_log(s->avctx, AV_LOG_ERROR, "Encode strip failed\n");
             goto fail;
         }
-        strip_sizes[i / s->rps] += ret;
+        s->strip_sizes[i / s->rps] += ret;
         ptr                     += ret;
         if (s->compr == TIFF_LZW &&
             (i == s->height - 1 || i % s->rps == s->rps - 1)) {
             ret = ff_lzw_encode_flush(s->lzws, flush_put_bits);
-            strip_sizes[(i / s->rps)] += ret;
-            ptr                       += ret;
+            s->strip_sizes[(i / s->rps)] += ret;
+            ptr                          += ret;
         }
     }
     if (s->compr == TIFF_LZW)
-        av_free(s->lzws);
+        av_freep(&s->lzws);
+    }
 
     s->num_entries = 0;
 
@@ -437,14 +460,21 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     ADD_ENTRY1(s, TIFF_COMPR,       TIFF_SHORT, s->compr);
     ADD_ENTRY1(s, TIFF_PHOTOMETRIC, TIFF_SHORT, s->photometric_interpretation);
-    ADD_ENTRY(s,  TIFF_STRIP_OFFS,  TIFF_LONG,  strips, strip_offsets);
+    ADD_ENTRY(s,  TIFF_STRIP_OFFS,  TIFF_LONG,  strips, s->strip_offsets);
 
     if (s->bpp_tab_size)
         ADD_ENTRY1(s, TIFF_SAMPLES_PER_PIXEL, TIFF_SHORT, s->bpp_tab_size);
 
     ADD_ENTRY1(s, TIFF_ROWSPERSTRIP, TIFF_LONG,     s->rps);
-    ADD_ENTRY(s,  TIFF_STRIP_SIZE,   TIFF_LONG,     strips, strip_sizes);
+    ADD_ENTRY(s,  TIFF_STRIP_SIZE,   TIFF_LONG,     strips, s->strip_sizes);
     ADD_ENTRY(s,  TIFF_XRES,         TIFF_RATIONAL, 1,      res);
+    if (avctx->sample_aspect_ratio.num > 0 &&
+        avctx->sample_aspect_ratio.den > 0) {
+        AVRational y = av_mul_q(av_make_q(s->dpi, 1),
+                                avctx->sample_aspect_ratio);
+        res[0] = y.num;
+        res[1] = y.den;
+    }
     ADD_ENTRY(s,  TIFF_YRES,         TIFF_RATIONAL, 1,      res);
     ADD_ENTRY1(s, TIFF_RES_UNIT,     TIFF_SHORT,    2);
 
@@ -462,10 +492,14 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
         ADD_ENTRY(s, TIFF_PAL, TIFF_SHORT, 256 * 3, pal);
     }
+    if (alpha)
+        ADD_ENTRY1(s,TIFF_EXTRASAMPLES,      TIFF_SHORT,            2);
     if (is_yuv) {
         /** according to CCIR Recommendation 601.1 */
         uint32_t refbw[12] = { 15, 1, 235, 1, 128, 1, 240, 1, 128, 1, 240, 1 };
         ADD_ENTRY(s, TIFF_YCBCR_SUBSAMPLING, TIFF_SHORT,    2, s->subsampling);
+        if (avctx->chroma_sample_location == AVCHROMA_LOC_TOPLEFT)
+            ADD_ENTRY1(s, TIFF_YCBCR_POSITIONING, TIFF_SHORT, 2);
         ADD_ENTRY(s, TIFF_REFERENCE_BW,      TIFF_RATIONAL, 6, refbw);
     }
     // write offset to dir
@@ -484,17 +518,14 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     *got_packet = 1;
 
 fail:
-    av_free(strip_sizes);
-    av_free(strip_offsets);
-    av_free(yuv_line);
-    return ret;
+    return ret < 0 ? ret : 0;
 }
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
-#if !CONFIG_ZLIB
     TiffEncoderContext *s = avctx->priv_data;
 
+#if !CONFIG_ZLIB
     if (s->compr == TIFF_DEFLATE) {
         av_log(avctx, AV_LOG_ERROR,
                "Deflate compression needs zlib compiled in\n");
@@ -508,6 +539,18 @@ FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->key_frame = 1;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+    s->avctx = avctx;
+
+    return 0;
+}
+
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+    TiffEncoderContext *s = avctx->priv_data;
+
+    av_freep(&s->strip_sizes);
+    av_freep(&s->strip_offsets);
+    av_freep(&s->yuv_line);
 
     return 0;
 }
@@ -515,6 +558,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define OFFSET(x) offsetof(TiffEncoderContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
+    {"dpi", "set the image resolution (in dpi)", OFFSET(dpi), AV_OPT_TYPE_INT, {.i64 = 72}, 1, 0x10000, AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_ENCODING_PARAM},
     { "compression_algo", NULL, OFFSET(compr), AV_OPT_TYPE_INT,   { .i64 = TIFF_PACKBITS }, TIFF_RAW, TIFF_DEFLATE, VE, "compression_algo" },
     { "packbits",         NULL, 0,             AV_OPT_TYPE_CONST, { .i64 = TIFF_PACKBITS }, 0,        0,            VE, "compression_algo" },
     { "raw",              NULL, 0,             AV_OPT_TYPE_CONST, { .i64 = TIFF_RAW      }, 0,        0,            VE, "compression_algo" },
@@ -537,13 +581,15 @@ AVCodec ff_tiff_encoder = {
     .id             = AV_CODEC_ID_TIFF,
     .priv_data_size = sizeof(TiffEncoderContext),
     .init           = encode_init,
+    .close          = encode_close,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .encode2        = encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB48LE, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_RGBA, AV_PIX_FMT_RGBA64LE,
-        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16LE,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A, AV_PIX_FMT_GRAY16LE, AV_PIX_FMT_YA16LE,
         AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_MONOWHITE,
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
         AV_PIX_FMT_NONE
     },
diff --git a/libavcodec/tmv.c b/libavcodec/tmv.c
index a9fcdf3..b738fcb 100644
--- a/libavcodec/tmv.c
+++ b/libavcodec/tmv.c
@@ -2,20 +2,20 @@
  * 8088flex TMV video decoder
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "libavutil/internal.h"
+#include "libavutil/xga_font_data.h"
 
 #include "cga_data.h"
 
@@ -45,10 +46,8 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
     unsigned x, y, fg, bg, c;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (avpkt->size < 2*char_rows*char_cols) {
         av_log(avctx, AV_LOG_ERROR,
@@ -63,6 +62,7 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
 
     frame->palette_has_changed = 1;
     memcpy(frame->data[1], ff_cga_palette, 16 * 4);
+    memset(frame->data[1] + 16 * 4, 0, AVPALETTE_SIZE - 16 * 4);
 
     for (y = 0; y < char_rows; y++) {
         for (x = 0; x < char_cols; x++) {
@@ -70,7 +70,7 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
             bg = *src  >> 4;
             fg = *src++ & 0xF;
             ff_draw_pc_font(dst + x * 8, frame->linesize[0],
-                            ff_cga_font, 8, c, fg, bg);
+                            avpriv_cga_font, 8, c, fg, bg);
         }
         dst += frame->linesize[0] * 8;
     }
diff --git a/libavcodec/tpeldsp.c b/libavcodec/tpeldsp.c
index 7ea1da4..cc4fed3 100644
--- a/libavcodec/tpeldsp.c
+++ b/libavcodec/tpeldsp.c
@@ -1,20 +1,20 @@
 /*
  * thirdpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tpeldsp.h b/libavcodec/tpeldsp.h
index 9c67d60..3732f17 100644
--- a/libavcodec/tpeldsp.h
+++ b/libavcodec/tpeldsp.h
@@ -1,20 +1,20 @@
 /*
  * thirdpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/trace_headers_bsf.c b/libavcodec/trace_headers_bsf.c
index 9c97dd4..3ec78fe 100644
--- a/libavcodec/trace_headers_bsf.c
+++ b/libavcodec/trace_headers_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 
 typedef struct TraceHeadersContext {
     CodedBitstreamContext *cbc;
+    CodedBitstreamFragment fragment;
 } TraceHeadersContext;
 
 
@@ -44,83 +45,69 @@ static int trace_headers_init(AVBSFContext *bsf)
     ctx->cbc->trace_level  = AV_LOG_INFO;
 
     if (bsf->par_in->extradata) {
-        CodedBitstreamFragment ps;
+        CodedBitstreamFragment *frag = &ctx->fragment;
 
         av_log(bsf, AV_LOG_INFO, "Extradata\n");
 
-        err = ff_cbs_read_extradata(ctx->cbc, &ps, bsf->par_in);
-        if (err < 0) {
-            av_log(bsf, AV_LOG_ERROR, "Failed to read extradata.\n");
-            return err;
-        }
+        err = ff_cbs_read_extradata(ctx->cbc, frag, bsf->par_in);
 
-        ff_cbs_fragment_uninit(ctx->cbc, &ps);
+        ff_cbs_fragment_reset(ctx->cbc, frag);
     }
 
-    return 0;
+    return err;
 }
 
 static void trace_headers_close(AVBSFContext *bsf)
 {
     TraceHeadersContext *ctx = bsf->priv_data;
 
+    ff_cbs_fragment_free(ctx->cbc, &ctx->fragment);
     ff_cbs_close(&ctx->cbc);
 }
 
-static int trace_headers(AVBSFContext *bsf, AVPacket *out)
+static int trace_headers(AVBSFContext *bsf, AVPacket *pkt)
 {
     TraceHeadersContext *ctx = bsf->priv_data;
-    CodedBitstreamFragment au;
-    AVPacket *in;
+    CodedBitstreamFragment *frag = &ctx->fragment;
     char tmp[256] = { 0 };
     int err;
 
-    err = ff_bsf_get_packet(bsf, &in);
+    err = ff_bsf_get_packet_ref(bsf, pkt);
     if (err < 0)
         return err;
 
-    if (in->flags & AV_PKT_FLAG_KEY)
+    if (pkt->flags & AV_PKT_FLAG_KEY)
         av_strlcat(tmp, ", key frame", sizeof(tmp));
-    if (in->flags & AV_PKT_FLAG_CORRUPT)
+    if (pkt->flags & AV_PKT_FLAG_CORRUPT)
         av_strlcat(tmp, ", corrupt", sizeof(tmp));
 
-    if (in->pts != AV_NOPTS_VALUE)
-        av_strlcatf(tmp, sizeof(tmp), ", pts %"PRId64, in->pts);
+    if (pkt->pts != AV_NOPTS_VALUE)
+        av_strlcatf(tmp, sizeof(tmp), ", pts %"PRId64, pkt->pts);
     else
         av_strlcat(tmp, ", no pts", sizeof(tmp));
-    if (in->dts != AV_NOPTS_VALUE)
-        av_strlcatf(tmp, sizeof(tmp), ", dts %"PRId64, in->dts);
+    if (pkt->dts != AV_NOPTS_VALUE)
+        av_strlcatf(tmp, sizeof(tmp), ", dts %"PRId64, pkt->dts);
     else
         av_strlcat(tmp, ", no dts", sizeof(tmp));
-    if (in->duration > 0)
-        av_strlcatf(tmp, sizeof(tmp), ", duration %"PRId64, in->duration);
+    if (pkt->duration > 0)
+        av_strlcatf(tmp, sizeof(tmp), ", duration %"PRId64, pkt->duration);
 
-    av_log(bsf, AV_LOG_INFO, "Packet: %d bytes%s.\n", in->size, tmp);
+    av_log(bsf, AV_LOG_INFO, "Packet: %d bytes%s.\n", pkt->size, tmp);
 
-    err = ff_cbs_read_packet(ctx->cbc, &au, in);
-    if (err < 0)
-        return err;
-
-    ff_cbs_fragment_uninit(ctx->cbc, &au);
+    err = ff_cbs_read_packet(ctx->cbc, frag, pkt);
 
-    av_packet_move_ref(out, in);
-    av_packet_free(&in);
+    ff_cbs_fragment_reset(ctx->cbc, frag);
 
-    return 0;
+    if (err < 0)
+        av_packet_unref(pkt);
+    return err;
 }
 
-static const enum AVCodecID trace_headers_codec_ids[] = {
-    AV_CODEC_ID_H264,
-    AV_CODEC_ID_HEVC,
-    AV_CODEC_ID_MPEG2VIDEO,
-    AV_CODEC_ID_NONE,
-};
-
 const AVBitStreamFilter ff_trace_headers_bsf = {
     .name           = "trace_headers",
     .priv_data_size = sizeof(TraceHeadersContext),
     .init           = &trace_headers_init,
     .close          = &trace_headers_close,
     .filter         = &trace_headers,
-    .codec_ids      = trace_headers_codec_ids,
+    .codec_ids      = ff_cbs_all_codec_ids,
 };
diff --git a/libavcodec/truehd_core_bsf.c b/libavcodec/truehd_core_bsf.c
new file mode 100644
index 0000000..be021af
--- /dev/null
+++ b/libavcodec/truehd_core_bsf.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "get_bits.h"
+#include "mlp_parser.h"
+#include "mlp.h"
+
+typedef struct AccessUnit {
+    uint8_t bits[4];
+    uint16_t offset;
+    uint16_t optional;
+} AccessUnit;
+
+typedef struct TrueHDCoreContext {
+    const AVClass *class;
+
+    MLPHeaderInfo hdr;
+} TrueHDCoreContext;
+
+static int truehd_core_filter(AVBSFContext *ctx, AVPacket *out)
+{
+    TrueHDCoreContext *s = ctx->priv_data;
+    GetBitContext gbc;
+    AccessUnit units[MAX_SUBSTREAMS];
+    AVPacket *in;
+    int ret, i, size, last_offset = 0;
+    int in_size, out_size;
+    int have_header = 0;
+    int substream_bits = 0;
+    int start, end;
+    uint16_t dts;
+
+    ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
+
+    if (in->size < 4)
+        goto fail;
+
+    ret = init_get_bits(&gbc, in->data, 32);
+    if (ret < 0)
+        goto fail;
+
+    skip_bits(&gbc, 4);
+    in_size = get_bits(&gbc, 12) * 2;
+    if (in_size < 4 || in_size > in->size)
+        goto fail;
+
+    out_size = in_size;
+    dts = get_bits(&gbc, 16);
+
+    ret = init_get_bits8(&gbc, in->data + 4, in->size - 4);
+    if (ret < 0)
+        goto fail;
+
+    if (show_bits_long(&gbc, 32) == 0xf8726fba) {
+        if ((ret = ff_mlp_read_major_sync(ctx, &s->hdr, &gbc)) != 0)
+            goto fail;
+        have_header = 1;
+    }
+
+    if (s->hdr.num_substreams > MAX_SUBSTREAMS)
+        goto fail;
+
+    start = get_bits_count(&gbc);
+    for (i = 0; i < s->hdr.num_substreams; i++) {
+        for (int j = 0; j < 4; j++)
+            units[i].bits[j] = get_bits1(&gbc);
+
+        units[i].offset = get_bits(&gbc, 12) * 2;
+        if (i < FFMIN(s->hdr.num_substreams, 3)) {
+            last_offset = units[i].offset;
+            substream_bits += 16;
+        }
+
+        if (units[i].bits[0]) {
+            units[i].optional = get_bits(&gbc, 16);
+            if (i < FFMIN(s->hdr.num_substreams, 3))
+                substream_bits += 16;
+        }
+    }
+    end = get_bits_count(&gbc);
+
+    size = ((end + 7) >> 3) + 4 + last_offset;
+    if (size >= 0 && size <= in->size)
+        out_size = size;
+    if (out_size < in_size) {
+        int bpos = 0, reduce = (end - start - substream_bits) >> 4;
+        uint16_t parity_nibble = 0;
+        uint16_t auheader;
+
+        ret = av_new_packet(out, out_size);
+        if (ret < 0)
+            goto fail;
+
+        AV_WB16(out->data + 2, dts);
+        parity_nibble = dts;
+        out->size -= reduce * 2;
+        parity_nibble ^= out->size / 2;
+
+        if (have_header) {
+            memcpy(out->data + 4, in->data + 4, 28);
+            out->data[16 + 4] = (out->data[16 + 4] & 0x0f) | (FFMIN(s->hdr.num_substreams, 3) << 4);
+            out->data[25 + 4] = out->data[25 + 4] & 0xfe;
+            out->data[26 + 4] = 0xff;
+            out->data[27 + 4] = 0xff;
+            AV_WL16(out->data + 4 + 26, ff_mlp_checksum16(out->data + 4, 26));
+        }
+
+        for (i = 0; i < FFMIN(s->hdr.num_substreams, 3); i++) {
+            uint16_t substr_hdr = 0;
+
+            substr_hdr |= (units[i].bits[0] << 15);
+            substr_hdr |= (units[i].bits[1] << 14);
+            substr_hdr |= (units[i].bits[2] << 13);
+            substr_hdr |= (units[i].bits[3] << 12);
+            substr_hdr |= (units[i].offset / 2) & 0x0FFF;
+
+            AV_WB16(out->data + have_header * 28 + 4 + bpos, substr_hdr);
+
+            parity_nibble ^= out->data[have_header * 28 + 4 + bpos++];
+            parity_nibble ^= out->data[have_header * 28 + 4 + bpos++];
+
+            if (units[i].bits[0]) {
+                AV_WB16(out->data + have_header * 28 + 4 + bpos, units[i].optional);
+
+                parity_nibble ^= out->data[have_header * 28 + 4 + bpos++];
+                parity_nibble ^= out->data[have_header * 28 + 4 + bpos++];
+            }
+        }
+
+        parity_nibble ^= parity_nibble >> 8;
+        parity_nibble ^= parity_nibble >> 4;
+        parity_nibble &= 0xF;
+
+        memcpy(out->data + have_header * 28 + 4 + bpos,
+               in->data + 4 + (end >> 3),
+               out_size - (4 + (end >> 3)));
+        auheader  = (parity_nibble ^ 0xF) << 12;
+        auheader |= (out->size / 2) & 0x0fff;
+        AV_WB16(out->data, auheader);
+
+        ret = av_packet_copy_props(out, in);
+    } else {
+        av_packet_move_ref(out, in);
+    }
+
+fail:
+    if (ret < 0)
+        av_packet_unref(out);
+    av_packet_free(&in);
+
+    return ret;
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_TRUEHD, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_truehd_core_bsf = {
+    .name           = "truehd_core",
+    .priv_data_size = sizeof(TrueHDCoreContext),
+    .filter         = truehd_core_filter,
+    .codec_ids      = codec_ids,
+};
diff --git a/libavcodec/truemotion1.c b/libavcodec/truemotion1.c
index 3eab33a..e182438 100644
--- a/libavcodec/truemotion1.c
+++ b/libavcodec/truemotion1.c
@@ -2,20 +2,20 @@
  * Duck TrueMotion 1.0 Decoder
  * Copyright (C) 2003 Alex Beregszaszi & Mike Melanson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -177,10 +177,10 @@ static int make_ydt15_entry(int p1, int p2, int16_t *ydt)
     int lo, hi;
 
     lo = ydt[p1];
-    lo += (lo << 5) + (lo << 10);
+    lo += (lo * 32) + (lo * 1024);
     hi = ydt[p2];
-    hi += (hi << 5) + (hi << 10);
-    return (lo + (hi << 16)) << 1;
+    hi += (hi * 32) + (hi * 1024);
+    return (lo + (hi * (1U << 16))) * 2;
 }
 
 static int make_cdt15_entry(int p1, int p2, int16_t *cdt)
@@ -188,9 +188,9 @@ static int make_cdt15_entry(int p1, int p2, int16_t *cdt)
     int r, b, lo;
 
     b = cdt[p2];
-    r = cdt[p1] << 10;
+    r = cdt[p1] * 1024;
     lo = b + r;
-    return (lo + (lo << 16)) << 1;
+    return (lo + (lo * (1U << 16))) * 2;
 }
 
 #if HAVE_BIGENDIAN
@@ -215,7 +215,7 @@ static int make_cdt16_entry(int p1, int p2, int16_t *cdt)
     b = cdt[p2];
     r = cdt[p1] << 11;
     lo = b + r;
-    return (lo + (lo << 16)) << 1;
+    return (lo + (lo * (1 << 16))) * 2;
 }
 
 static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
@@ -224,7 +224,7 @@ static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
 
     lo = ydt[p1];
     hi = ydt[p2];
-    return (lo + (hi << 8) + (hi << 16)) << 1;
+    return (lo + (hi * (1 << 8)) + (hi * (1 << 16))) * 2;
 }
 
 static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
@@ -232,8 +232,8 @@ static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
     int r, b;
 
     b = cdt[p2];
-    r = cdt[p1]<<16;
-    return (b+r) << 1;
+    r = cdt[p1] * (1 << 16);
+    return (b+r) * 2;
 }
 
 static void gen_vector_table15(TrueMotion1Context *s, const uint8_t *sel_vector_table)
@@ -396,12 +396,16 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     }
 
     if (compression_types[header.compression].algorithm == ALGO_RGB24H) {
-        new_pix_fmt = AV_PIX_FMT_RGB32;
+        new_pix_fmt = AV_PIX_FMT_0RGB32;
         width_shift = 1;
     } else
         new_pix_fmt = AV_PIX_FMT_RGB555; // RGB565 is supported as well
 
     s->w >>= width_shift;
+    if (s->w & 1) {
+        avpriv_request_sample(s->avctx, "Frame with odd width");
+        return AVERROR_PATCHWELCOME;
+    }
 
     if (s->w != s->avctx->width || s->h != s->avctx->height ||
         new_pix_fmt != s->avctx->pix_fmt) {
@@ -415,6 +419,8 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
         ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
 
         av_fast_malloc(&s->vert_pred, &s->vert_pred_size, s->avctx->width * sizeof(unsigned int));
+        if (!s->vert_pred)
+            return AVERROR(ENOMEM);
     }
 
     /* There is 1 change bit per 4 pixels, so each change byte represents
@@ -483,6 +489,10 @@ static av_cold int truemotion1_decode_init(AVCodecContext *avctx)
     /* there is a vertical predictor for each pixel in a line; each vertical
      * predictor is 0 to start with */
     av_fast_malloc(&s->vert_pred, &s->vert_pred_size, s->avctx->width * sizeof(unsigned int));
+    if (!s->vert_pred) {
+        av_frame_free(&s->frame);
+        return AVERROR(ENOMEM);
+    }
 
     return 0;
 }
@@ -637,7 +647,8 @@ static void truemotion1_decode_16bit(TrueMotion1Context *s)
         current_pixel_pair = (unsigned int *)current_line;
         vert_pred = s->vert_pred;
         mb_change_index = 0;
-        mb_change_byte = mb_change_bits[mb_change_index++];
+        if (!keyframe)
+            mb_change_byte = mb_change_bits[mb_change_index++];
         mb_change_byte_mask = 0x01;
         pixels_left = s->avctx->width;
 
@@ -871,10 +882,8 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
     if ((ret = truemotion1_decode_header(s)) < 0)
         return ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (compression_types[s->compression].algorithm == ALGO_RGB24H) {
         truemotion1_decode_24bit(s);
@@ -896,7 +905,7 @@ static av_cold int truemotion1_decode_end(AVCodecContext *avctx)
     TrueMotion1Context *s = avctx->priv_data;
 
     av_frame_free(&s->frame);
-    av_free(s->vert_pred);
+    av_freep(&s->vert_pred);
 
     return 0;
 }
diff --git a/libavcodec/truemotion1data.h b/libavcodec/truemotion1data.h
index e950450..3e58143 100644
--- a/libavcodec/truemotion1data.h
+++ b/libavcodec/truemotion1data.h
@@ -6,20 +6,20 @@
  * the GNU LGPL using the common understanding that data tables necessary
  * for decoding algorithms are not necessarily copyrightable.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_TRUEMOTION1DATA_H
diff --git a/libavcodec/truemotion2.c b/libavcodec/truemotion2.c
index cc3b0b7..4d27f0c 100644
--- a/libavcodec/truemotion2.c
+++ b/libavcodec/truemotion2.c
@@ -2,20 +2,20 @@
  * Duck/ON2 TrueMotion 2 Decoder
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,11 +27,10 @@
 #include <inttypes.h>
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bswapdsp.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
-#include "vlc.h"
 
 #define TM2_ESCAPE 0x80000000
 #define TM2_DELTAS 64
@@ -63,9 +62,13 @@ typedef struct TM2Context {
     AVCodecContext *avctx;
     AVFrame *pic;
 
-    BitstreamContext bc;
+    GetBitContext gb;
+    int error;
     BswapDSPContext bdsp;
 
+    uint8_t *buffer;
+    int buffer_size;
+
     /* TM2 streams */
     int *tokens[TM2_NUM_STREAMS];
     int tok_lens[TM2_NUM_STREAMS];
@@ -88,7 +91,7 @@ typedef struct TM2Context {
 * Huffman codes for each of streams
 */
 typedef struct TM2Codes {
-    VLC vlc; ///< table for Libav bitstream reader
+    VLC vlc; ///< table for FFmpeg bitstream reader
     int bits;
     int *recode; ///< table for converting from code indexes to values
     int length;
@@ -109,16 +112,20 @@ typedef struct TM2Huff {
     int *lens; ///< codelengths
 } TM2Huff;
 
+/**
+ *
+ * @returns the length of the longest code or an AVERROR code
+ */
 static int tm2_read_tree(TM2Context *ctx, uint32_t prefix, int length, TM2Huff *huff)
 {
-    int ret;
+    int ret, ret2;
     if (length > huff->max_bits) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Tree exceeded its given depth (%i)\n",
                huff->max_bits);
         return AVERROR_INVALIDDATA;
     }
 
-    if (!bitstream_read_bit(&ctx->bc)) { /* literal */
+    if (!get_bits1(&ctx->gb)) { /* literal */
         if (length == 0) {
             length = 1;
         }
@@ -126,18 +133,18 @@ static int tm2_read_tree(TM2Context *ctx, uint32_t prefix, int length, TM2Huff *
             av_log(ctx->avctx, AV_LOG_DEBUG, "Too many literals\n");
             return AVERROR_INVALIDDATA;
         }
-        huff->nums[huff->num] = bitstream_read(&ctx->bc, huff->val_bits);
+        huff->nums[huff->num] = get_bits_long(&ctx->gb, huff->val_bits);
         huff->bits[huff->num] = prefix;
         huff->lens[huff->num] = length;
         huff->num++;
-        return 0;
+        return length;
     } else { /* non-terminal node */
-        if ((ret = tm2_read_tree(ctx, prefix << 1, length + 1, huff)) < 0)
-            return ret;
+        if ((ret2 = tm2_read_tree(ctx, prefix << 1, length + 1, huff)) < 0)
+            return ret2;
         if ((ret = tm2_read_tree(ctx, (prefix << 1) | 1, length + 1, huff)) < 0)
             return ret;
     }
-    return 0;
+    return FFMAX(ret, ret2);
 }
 
 static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
@@ -145,10 +152,10 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
     TM2Huff huff;
     int res = 0;
 
-    huff.val_bits = bitstream_read(&ctx->bc, 5);
-    huff.max_bits = bitstream_read(&ctx->bc, 5);
-    huff.min_bits = bitstream_read(&ctx->bc, 5);
-    huff.nodes    = bitstream_read(&ctx->bc, 17);
+    huff.val_bits = get_bits(&ctx->gb, 5);
+    huff.max_bits = get_bits(&ctx->gb, 5);
+    huff.min_bits = get_bits(&ctx->gb, 5);
+    huff.nodes    = get_bits_long(&ctx->gb, 17);
     huff.num      = 0;
 
     /* check for correct codes parameters */
@@ -169,9 +176,10 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
 
     /* allocate space for codes - it is exactly ceil(nodes / 2) entries */
     huff.max_num = (huff.nodes + 1) >> 1;
-    huff.nums    = av_mallocz(huff.max_num * sizeof(int));
-    huff.bits    = av_mallocz(huff.max_num * sizeof(uint32_t));
-    huff.lens    = av_mallocz(huff.max_num * sizeof(int));
+    huff.nums    = av_calloc(huff.max_num, sizeof(int));
+    huff.bits    = av_calloc(huff.max_num, sizeof(uint32_t));
+    huff.lens    = av_calloc(huff.max_num, sizeof(int));
+
     if (!huff.nums || !huff.bits || !huff.lens) {
         res = AVERROR(ENOMEM);
         goto out;
@@ -179,6 +187,11 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
 
     res = tm2_read_tree(ctx, 0, 0, &huff);
 
+    if (res >= 0 && res != huff.max_bits) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Got less bits than expected: %i of %i\n",
+               res, huff.max_bits);
+        res = AVERROR_INVALIDDATA;
+    }
     if (huff.num != huff.max_num) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Got less codes than expected: %i of %i\n",
                huff.num, huff.max_num);
@@ -197,7 +210,7 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
         else {
             code->bits = huff.max_bits;
             code->length = huff.max_num;
-            code->recode = av_malloc(code->length * sizeof(int));
+            code->recode = av_malloc_array(code->length, sizeof(int));
             if (!code->recode) {
                 res = AVERROR(ENOMEM);
                 goto out;
@@ -223,10 +236,12 @@ static void tm2_free_codes(TM2Codes *code)
         ff_free_vlc(&code->vlc);
 }
 
-static inline int tm2_get_token(BitstreamContext *bc, TM2Codes *code)
+static inline int tm2_get_token(GetBitContext *gb, TM2Codes *code)
 {
     int val;
-    val = bitstream_read_vlc(bc, code->vlc.table, code->bits, 1);
+    val = get_vlc2(gb, code->vlc.table, code->bits, 1);
+    if(val<0)
+        return -1;
     return code->recode[val];
 }
 
@@ -255,18 +270,19 @@ static int tm2_read_deltas(TM2Context *ctx, int stream_id)
     int d, mb;
     int i, v;
 
-    d  = bitstream_read(&ctx->bc, 9);
-    mb = bitstream_read(&ctx->bc, 5);
+    d  = get_bits(&ctx->gb, 9);
+    mb = get_bits(&ctx->gb, 5);
 
-    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1) || (mb > 32)) {
+    av_assert2(mb < 32);
+    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1)) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect delta table: %i deltas x %i bits\n", d, mb);
         return AVERROR_INVALIDDATA;
     }
 
     for (i = 0; i < d; i++) {
-        v = bitstream_read(&ctx->bc, mb);
+        v = get_bits_long(&ctx->gb, mb);
         if (v & (1 << (mb - 1)))
-            ctx->deltas[stream_id][i] = v - (1 << mb);
+            ctx->deltas[stream_id][i] = v - (1U << mb);
         else
             ctx->deltas[stream_id][i] = v;
     }
@@ -284,18 +300,23 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
     TM2Codes codes;
     GetByteContext gb;
 
+    if (buf_size < 4) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "not enough space for len left\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* get stream length in dwords */
     bytestream2_init(&gb, buf, buf_size);
     len  = bytestream2_get_be32(&gb);
-    skip = len * 4 + 4;
 
     if (len == 0)
         return 4;
 
-    if (len >= INT_MAX / 4 - 1 || len < 0 || skip > buf_size) {
+    if (len >= INT_MAX / 4 - 1 || len < 0 || len * 4 + 4 > buf_size) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Error, invalid stream size.\n");
         return AVERROR_INVALIDDATA;
     }
+    skip = len * 4 + 4;
 
     toks = bytestream2_get_be32(&gb);
     if (toks & 1) {
@@ -307,10 +328,10 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
             pos = bytestream2_tell(&gb);
             if (skip <= pos)
                 return AVERROR_INVALIDDATA;
-            bitstream_init8(&ctx->bc, buf + pos, skip - pos);
+            init_get_bits(&ctx->gb, buf + pos, (skip - pos) * 8);
             if ((ret = tm2_read_deltas(ctx, stream_id)) < 0)
                 return ret;
-            bytestream2_skip(&gb, ((bitstream_tell(&ctx->bc) + 31) >> 5) << 2);
+            bytestream2_skip(&gb, ((get_bits_count(&ctx->gb) + 31) >> 5) << 2);
         }
     }
     /* skip unused fields */
@@ -324,61 +345,83 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
     pos = bytestream2_tell(&gb);
     if (skip <= pos)
         return AVERROR_INVALIDDATA;
-    bitstream_init8(&ctx->bc, buf + pos, skip - pos);
+    init_get_bits(&ctx->gb, buf + pos, (skip - pos) * 8);
     if ((ret = tm2_build_huff_table(ctx, &codes)) < 0)
         return ret;
-    bytestream2_skip(&gb, ((bitstream_tell(&ctx->bc) + 31) >> 5) << 2);
+    bytestream2_skip(&gb, ((get_bits_count(&ctx->gb) + 31) >> 5) << 2);
 
     toks >>= 1;
     /* check if we have sane number of tokens */
     if ((toks < 0) || (toks > 0xFFFFFF)) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect number of tokens: %i\n", toks);
-        tm2_free_codes(&codes);
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    ret = av_reallocp_array(&ctx->tokens[stream_id], toks, sizeof(int));
+    if (ret < 0) {
+        ctx->tok_lens[stream_id] = 0;
+        goto end;
     }
-    ctx->tokens[stream_id]   = av_realloc(ctx->tokens[stream_id], toks * sizeof(int));
     ctx->tok_lens[stream_id] = toks;
     len = bytestream2_get_be32(&gb);
     if (len > 0) {
         pos = bytestream2_tell(&gb);
-        if (skip <= pos)
-            return AVERROR_INVALIDDATA;
-        bitstream_init8(&ctx->bc, buf + pos, skip - pos);
+        if (skip <= pos) {
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
+        init_get_bits(&ctx->gb, buf + pos, (skip - pos) * 8);
         for (i = 0; i < toks; i++) {
-            if (bitstream_bits_left(&ctx->bc) <= 0) {
+            if (get_bits_left(&ctx->gb) <= 0) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect number of tokens: %i\n", toks);
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto end;
             }
-            ctx->tokens[stream_id][i] = tm2_get_token(&ctx->bc, &codes);
-            if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS) {
+            ctx->tokens[stream_id][i] = tm2_get_token(&ctx->gb, &codes);
+            if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS || ctx->tokens[stream_id][i]<0) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Invalid delta token index %d for type %d, n=%d\n",
                        ctx->tokens[stream_id][i], stream_id, i);
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto end;
             }
         }
     } else {
+        if (len < 0) {
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
         for (i = 0; i < toks; i++) {
             ctx->tokens[stream_id][i] = codes.recode[0];
             if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Invalid delta token index %d for type %d, n=%d\n",
                        ctx->tokens[stream_id][i], stream_id, i);
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto end;
             }
         }
     }
-    tm2_free_codes(&codes);
 
-    return skip;
+    ret = skip;
+
+end:
+    tm2_free_codes(&codes);
+    return ret;
 }
 
 static inline int GET_TOK(TM2Context *ctx,int type)
 {
     if (ctx->tok_ptrs[type] >= ctx->tok_lens[type]) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Read token from stream %i out of bounds (%i>=%i)\n", type, ctx->tok_ptrs[type], ctx->tok_lens[type]);
+        ctx->error = 1;
         return 0;
     }
-    if (type <= TM2_MOT)
+    if (type <= TM2_MOT) {
+        if (ctx->tokens[type][ctx->tok_ptrs[type]] >= TM2_DELTAS) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "token %d is too large\n", ctx->tokens[type][ctx->tok_ptrs[type]]);
+            return 0;
+        }
         return ctx->deltas[type][ctx->tokens[type][ctx->tok_ptrs[type]++]];
+    }
     return ctx->tokens[type][ctx->tok_ptrs[type]++];
 }
 
@@ -413,15 +456,15 @@ static inline int GET_TOK(TM2Context *ctx,int type)
 
 /* recalculate last and delta values for next blocks */
 #define TM2_RECALC_BLOCK(CHR, stride, last, CD) {\
-    CD[0] = CHR[1] - last[1];\
-    CD[1] = (int)CHR[stride + 1] - (int)CHR[1];\
+    CD[0] = (unsigned)CHR[         1] - (unsigned)last[1];\
+    CD[1] = (unsigned)CHR[stride + 1] - (unsigned) CHR[1];\
     last[0] = (int)CHR[stride + 0];\
     last[1] = (int)CHR[stride + 1];}
 
 /* common operations - add deltas to 4x4 block of luma or 2x2 blocks of chroma */
 static inline void tm2_apply_deltas(TM2Context *ctx, int* Y, int stride, int *deltas, int *last)
 {
-    int ct, d;
+    unsigned ct, d;
     int i, j;
 
     for (j = 0; j < 4; j++){
@@ -437,7 +480,7 @@ static inline void tm2_apply_deltas(TM2Context *ctx, int* Y, int stride, int *de
     }
 }
 
-static inline void tm2_high_chroma(int *data, int stride, int *last, int *CD, int *deltas)
+static inline void tm2_high_chroma(int *data, int stride, int *last, unsigned *CD, int *deltas)
 {
     int i, j;
     for (j = 0; j < 2; j++) {
@@ -450,7 +493,7 @@ static inline void tm2_high_chroma(int *data, int stride, int *last, int *CD, in
     }
 }
 
-static inline void tm2_low_chroma(int *data, int stride, int *clast, int *CD, int *deltas, int bx)
+static inline void tm2_low_chroma(int *data, int stride, int *clast, unsigned *CD, int *deltas, int bx)
 {
     int t;
     int l;
@@ -460,8 +503,8 @@ static inline void tm2_low_chroma(int *data, int stride, int *clast, int *CD, in
         prev = clast[-3];
     else
         prev = 0;
-    t        = (CD[0] + CD[1]) >> 1;
-    l        = (prev - CD[0] - CD[1] + clast[1]) >> 1;
+    t        = (int)(CD[0] + CD[1]) >> 1;
+    l        = (int)(prev - CD[0] - CD[1] + clast[1]) >> 1;
     CD[1]    = CD[0] + CD[1] - t;
     CD[0]    = t;
     clast[0] = l;
@@ -685,6 +728,11 @@ static inline void tm2_motion_block(TM2Context *ctx, AVFrame *pic, int bx, int b
     mx = av_clip(mx, -(bx * 4 + 4), ctx->avctx->width  - bx * 4);
     my = av_clip(my, -(by * 4 + 4), ctx->avctx->height - by * 4);
 
+    if (4*bx+mx<0 || 4*by+my<0 || 4*bx+mx+4 > ctx->avctx->width || 4*by+my+4 > ctx->avctx->height) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "MV out of picture\n");
+        return;
+    }
+
     Yo += my * oYstride + mx;
     Uo += (my >> 1) * oUstride + (mx >> 1);
     Vo += (my >> 1) * oVstride + (mx >> 1);
@@ -776,6 +824,8 @@ static int tm2_decode_blocks(TM2Context *ctx, AVFrame *p)
             default:
                 av_log(ctx->avctx, AV_LOG_ERROR, "Skipping unknown block type %i\n", type);
             }
+            if (ctx->error)
+                return AVERROR_INVALIDDATA;
         }
     }
 
@@ -855,37 +905,37 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p    = l->pic;
     int offset           = TM2_HEADER_SIZE;
     int i, t, ret;
-    uint8_t *swbuf;
 
-    swbuf = av_malloc(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!swbuf) {
+    l->error = 0;
+
+    av_fast_padded_malloc(&l->buffer, &l->buffer_size, buf_size);
+    if (!l->buffer) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
         return AVERROR(ENOMEM);
     }
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        av_free(swbuf);
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
 
-    l->bdsp.bswap_buf((uint32_t *) swbuf, (const uint32_t *) buf,
+    l->bdsp.bswap_buf((uint32_t *) l->buffer, (const uint32_t *) buf,
                       buf_size >> 2);
 
-    if ((ret = tm2_read_header(l, swbuf)) < 0) {
-        av_free(swbuf);
+    if ((ret = tm2_read_header(l, l->buffer)) < 0) {
         return ret;
     }
 
     for (i = 0; i < TM2_NUM_STREAMS; i++) {
         if (offset >= buf_size) {
-            av_free(swbuf);
+            av_log(avctx, AV_LOG_ERROR, "no space for tm2_read_stream\n");
             return AVERROR_INVALIDDATA;
         }
-        t = tm2_read_stream(l, swbuf + offset, tm2_stream_order[i],
+
+        t = tm2_read_stream(l, l->buffer + offset, tm2_stream_order[i],
                             buf_size - offset);
         if (t < 0) {
-            av_free(swbuf);
+            int j = tm2_stream_order[i];
+            if (l->tok_lens[j])
+                memset(l->tokens[j], 0, sizeof(**l->tokens) * l->tok_lens[j]);
             return t;
         }
         offset += t;
@@ -899,7 +949,6 @@ static int decode_frame(AVCodecContext *avctx,
     l->cur = !l->cur;
     *got_frame      = 1;
     ret = av_frame_ref(data, l->pic);
-    av_free(swbuf);
 
     return (ret < 0) ? ret : buf_size;
 }
@@ -923,8 +972,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     ff_bswapdsp_init(&l->bdsp);
 
-    l->last  = av_malloc(4 * sizeof(*l->last)  * (w >> 2));
-    l->clast = av_malloc(4 * sizeof(*l->clast) * (w >> 2));
+    l->last  = av_malloc_array(w >> 2, 4 * sizeof(*l->last) );
+    l->clast = av_malloc_array(w >> 2, 4 * sizeof(*l->clast));
 
     for (i = 0; i < TM2_NUM_STREAMS; i++) {
         l->tokens[i] = NULL;
@@ -933,15 +982,15 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     w += 8;
     h += 8;
-    l->Y1_base = av_malloc(sizeof(*l->Y1_base) * w * h);
-    l->Y2_base = av_malloc(sizeof(*l->Y2_base) * w * h);
+    l->Y1_base = av_calloc(w * h, sizeof(*l->Y1_base));
+    l->Y2_base = av_calloc(w * h, sizeof(*l->Y2_base));
     l->y_stride = w;
     w = (w + 1) >> 1;
     h = (h + 1) >> 1;
-    l->U1_base = av_malloc(sizeof(*l->U1_base) * w * h);
-    l->V1_base = av_malloc(sizeof(*l->V1_base) * w * h);
-    l->U2_base = av_malloc(sizeof(*l->U2_base) * w * h);
-    l->V2_base = av_malloc(sizeof(*l->V1_base) * w * h);
+    l->U1_base = av_calloc(w * h, sizeof(*l->U1_base));
+    l->V1_base = av_calloc(w * h, sizeof(*l->V1_base));
+    l->U2_base = av_calloc(w * h, sizeof(*l->U2_base));
+    l->V2_base = av_calloc(w * h, sizeof(*l->V1_base));
     l->uv_stride = w;
     l->cur = 0;
     if (!l->Y1_base || !l->Y2_base || !l->U1_base ||
@@ -955,6 +1004,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         av_freep(&l->V2_base);
         av_freep(&l->last);
         av_freep(&l->clast);
+        av_frame_free(&l->pic);
         return AVERROR(ENOMEM);
     }
     l->Y1 = l->Y1_base + l->y_stride  * 4 + 4;
@@ -975,15 +1025,17 @@ static av_cold int decode_end(AVCodecContext *avctx)
     av_free(l->last);
     av_free(l->clast);
     for (i = 0; i < TM2_NUM_STREAMS; i++)
-        av_free(l->tokens[i]);
+        av_freep(&l->tokens[i]);
     if (l->Y1) {
-        av_free(l->Y1_base);
-        av_free(l->U1_base);
-        av_free(l->V1_base);
-        av_free(l->Y2_base);
-        av_free(l->U2_base);
-        av_free(l->V2_base);
+        av_freep(&l->Y1_base);
+        av_freep(&l->U1_base);
+        av_freep(&l->V1_base);
+        av_freep(&l->Y2_base);
+        av_freep(&l->U2_base);
+        av_freep(&l->V2_base);
     }
+    av_freep(&l->buffer);
+    l->buffer_size = 0;
 
     av_frame_free(&l->pic);
 
diff --git a/libavcodec/truemotion2rt.c b/libavcodec/truemotion2rt.c
index e6dbad8..e3ab998 100644
--- a/libavcodec/truemotion2rt.c
+++ b/libavcodec/truemotion2rt.c
@@ -1,20 +1,20 @@
 /*
  * Duck TrueMotion 2.0 Real Time decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,11 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 typedef struct TrueMotion2RTContext {
-    BitstreamContext bc;
+    GetBitContext gb;
     int delta_size;
     int hscale;
 } TrueMotion2RTContext;
@@ -56,7 +56,7 @@ static const int16_t *const delta_tabs[] = {
 
 /* Returns the number of bytes consumed from the bytestream, or
  * AVERROR_INVALIDDATA if there was an error while decoding the header. */
-static int truemotion2rt_decode_header(AVCodecContext *avctx, AVPacket *avpkt)
+static int truemotion2rt_decode_header(AVCodecContext *avctx, const AVPacket *avpkt)
 {
     TrueMotion2RTContext *s = avctx->priv_data;
     int header_size;
@@ -107,7 +107,7 @@ static int truemotion2rt_decode_frame(AVCodecContext *avctx, void *data,
 {
     TrueMotion2RTContext *s = avctx->priv_data;
     AVFrame * const p = data;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *gb = &s->gb;
     uint8_t *dst;
     int x, y, delta_mode;
     int ret;
@@ -116,7 +116,10 @@ static int truemotion2rt_decode_frame(AVCodecContext *avctx, void *data,
     if (ret < 0)
         return ret;
 
-    ret = bitstream_init8(bc, avpkt->data + ret, avpkt->size - ret);
+    if ((avctx->width + s->hscale - 1)/ s->hscale * avctx->height * s->delta_size > avpkt->size * 8LL * 4)
+        return AVERROR_INVALIDDATA;
+
+    ret = init_get_bits8(gb, avpkt->data + ret, avpkt->size - ret);
     if (ret < 0)
         return ret;
 
@@ -124,13 +127,13 @@ static int truemotion2rt_decode_frame(AVCodecContext *avctx, void *data,
     if (ret < 0)
         return ret;
 
-    bitstream_skip(bc, 32);
+    skip_bits(gb, 32);
     delta_mode = s->delta_size - 2;
     dst = p->data[0];
     for (y = 0; y < avctx->height; y++) {
         int diff = 0;
         for (x = 0; x < avctx->width; x += s->hscale) {
-            diff  += delta_tabs[delta_mode][bitstream_read(bc, s->delta_size)];
+            diff  += delta_tabs[delta_mode][get_bits(gb, s->delta_size)];
             dst[x] = av_clip_uint8((y ? dst[x - p->linesize[0]] : 0) + diff);
         }
         dst += p->linesize[0];
@@ -156,7 +159,7 @@ static int truemotion2rt_decode_frame(AVCodecContext *avctx, void *data,
     for (y = 0; y < avctx->height >> 2; y++) {
         int diff = 0;
         for (x = 0; x < avctx->width >> 2; x += s->hscale) {
-            diff  += delta_tabs[delta_mode][bitstream_read(bc, s->delta_size)];
+            diff  += delta_tabs[delta_mode][get_bits(gb, s->delta_size)];
             dst[x] = av_clip_uint8((y ? dst[x - p->linesize[1]] : 128) + diff);
         }
         dst += p->linesize[1];
@@ -182,7 +185,7 @@ static int truemotion2rt_decode_frame(AVCodecContext *avctx, void *data,
     for (y = 0; y < avctx->height >> 2; y++) {
         int diff = 0;
         for (x = 0; x < avctx->width >> 2; x += s->hscale) {
-            diff  += delta_tabs[delta_mode][bitstream_read(bc, s->delta_size)];
+            diff  += delta_tabs[delta_mode][get_bits(gb, s->delta_size)];
             dst[x] = av_clip_uint8((y ? dst[x - p->linesize[2]] : 128) + diff);
         }
         dst += p->linesize[2];
diff --git a/libavcodec/truespeech.c b/libavcodec/truespeech.c
index 6b9afae..d4ddfcb 100644
--- a/libavcodec/truespeech.c
+++ b/libavcodec/truespeech.c
@@ -2,33 +2,31 @@
  * DSP Group TrueSpeech compatible decoder
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bswapdsp.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #include "truespeech_data.h"
-
 /**
  * @file
  * TrueSpeech decoder.
@@ -79,50 +77,50 @@ static av_cold int truespeech_decode_init(AVCodecContext * avctx)
 
 static void truespeech_read_frame(TSContext *dec, const uint8_t *input)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
 
     dec->bdsp.bswap_buf((uint32_t *) dec->buffer, (const uint32_t *) input, 8);
-    bitstream_init8(&bc, dec->buffer, 32);
-
-    dec->vector[7] = ts_codebook[7][bitstream_read(&bc, 3)];
-    dec->vector[6] = ts_codebook[6][bitstream_read(&bc, 3)];
-    dec->vector[5] = ts_codebook[5][bitstream_read(&bc, 3)];
-    dec->vector[4] = ts_codebook[4][bitstream_read(&bc, 4)];
-    dec->vector[3] = ts_codebook[3][bitstream_read(&bc, 4)];
-    dec->vector[2] = ts_codebook[2][bitstream_read(&bc, 4)];
-    dec->vector[1] = ts_codebook[1][bitstream_read(&bc, 5)];
-    dec->vector[0] = ts_codebook[0][bitstream_read(&bc, 5)];
-    dec->flag      = bitstream_read_bit(&bc);
-
-    dec->offset1[0] = bitstream_read(&bc, 4) << 4;
-    dec->offset2[3] = bitstream_read(&bc, 7);
-    dec->offset2[2] = bitstream_read(&bc, 7);
-    dec->offset2[1] = bitstream_read(&bc, 7);
-    dec->offset2[0] = bitstream_read(&bc, 7);
-
-    dec->offset1[1]  = bitstream_read(&bc, 4);
-    dec->pulseval[1] = bitstream_read(&bc, 14);
-    dec->pulseval[0] = bitstream_read(&bc, 14);
-
-    dec->offset1[1] |= bitstream_read(&bc, 4) << 4;
-    dec->pulseval[3] = bitstream_read(&bc, 14);
-    dec->pulseval[2] = bitstream_read(&bc, 14);
-
-    dec->offset1[0] |= bitstream_read_bit(&bc);
-    dec->pulsepos[0] = bitstream_read(&bc, 27);
-    dec->pulseoff[0] = bitstream_read(&bc, 4);
-
-    dec->offset1[0] |= bitstream_read_bit(&bc) << 1;
-    dec->pulsepos[1] = bitstream_read(&bc, 27);
-    dec->pulseoff[1] = bitstream_read(&bc, 4);
-
-    dec->offset1[0] |= bitstream_read_bit(&bc) << 2;
-    dec->pulsepos[2] = bitstream_read(&bc, 27);
-    dec->pulseoff[2] = bitstream_read(&bc, 4);
-
-    dec->offset1[0] |= bitstream_read_bit(&bc) << 3;
-    dec->pulsepos[3] = bitstream_read(&bc, 27);
-    dec->pulseoff[3] = bitstream_read(&bc, 4);
+    init_get_bits(&gb, dec->buffer, 32 * 8);
+
+    dec->vector[7] = ts_codebook[7][get_bits(&gb, 3)];
+    dec->vector[6] = ts_codebook[6][get_bits(&gb, 3)];
+    dec->vector[5] = ts_codebook[5][get_bits(&gb, 3)];
+    dec->vector[4] = ts_codebook[4][get_bits(&gb, 4)];
+    dec->vector[3] = ts_codebook[3][get_bits(&gb, 4)];
+    dec->vector[2] = ts_codebook[2][get_bits(&gb, 4)];
+    dec->vector[1] = ts_codebook[1][get_bits(&gb, 5)];
+    dec->vector[0] = ts_codebook[0][get_bits(&gb, 5)];
+    dec->flag      = get_bits1(&gb);
+
+    dec->offset1[0] = get_bits(&gb, 4) << 4;
+    dec->offset2[3] = get_bits(&gb, 7);
+    dec->offset2[2] = get_bits(&gb, 7);
+    dec->offset2[1] = get_bits(&gb, 7);
+    dec->offset2[0] = get_bits(&gb, 7);
+
+    dec->offset1[1]  = get_bits(&gb, 4);
+    dec->pulseval[1] = get_bits(&gb, 14);
+    dec->pulseval[0] = get_bits(&gb, 14);
+
+    dec->offset1[1] |= get_bits(&gb, 4) << 4;
+    dec->pulseval[3] = get_bits(&gb, 14);
+    dec->pulseval[2] = get_bits(&gb, 14);
+
+    dec->offset1[0] |= get_bits1(&gb);
+    dec->pulsepos[0] = get_bits_long(&gb, 27);
+    dec->pulseoff[0] = get_bits(&gb, 4);
+
+    dec->offset1[0] |= get_bits1(&gb) << 1;
+    dec->pulsepos[1] = get_bits_long(&gb, 27);
+    dec->pulseoff[1] = get_bits(&gb, 4);
+
+    dec->offset1[0] |= get_bits1(&gb) << 2;
+    dec->pulsepos[2] = get_bits_long(&gb, 27);
+    dec->pulseoff[2] = get_bits(&gb, 4);
+
+    dec->offset1[0] |= get_bits1(&gb) << 3;
+    dec->pulsepos[3] = get_bits_long(&gb, 27);
+    dec->pulseoff[3] = get_bits(&gb, 4);
 }
 
 static void truespeech_correlate_filter(TSContext *dec)
@@ -327,10 +325,8 @@ static int truespeech_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = iterations * 240;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     memset(samples, 0, iterations * 240 * sizeof(*samples));
diff --git a/libavcodec/truespeech_data.h b/libavcodec/truespeech_data.h
index 6e9806a..73ebda5 100644
--- a/libavcodec/truespeech_data.h
+++ b/libavcodec/truespeech_data.h
@@ -2,20 +2,20 @@
  * DSP Group TrueSpeech compatible decoder
  * copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tscc.c b/libavcodec/tscc.c
index 7c54473..fc1ec4d 100644
--- a/libavcodec/tscc.c
+++ b/libavcodec/tscc.c
@@ -2,20 +2,20 @@
  * TechSmith Camtasia decoder
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,6 +46,7 @@
 typedef struct TsccContext {
 
     AVCodecContext *avctx;
+    AVFrame *frame;
 
     // Bits per pixel
     int bpp;
@@ -66,12 +67,20 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     CamtasiaContext * const c = avctx->priv_data;
-    AVFrame *frame = data;
+    AVFrame *frame = c->frame;
     int ret;
+    int palette_has_changed = 0;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (c->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        int size;
+        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, &size);
+
+        if (pal && size == AVPALETTE_SIZE) {
+            palette_has_changed = 1;
+            memcpy(c->pal, pal, AVPALETTE_SIZE);
+        } else if (pal) {
+            av_log(avctx, AV_LOG_ERROR, "Palette size %d is wrong\n", size);
+        }
     }
 
     ret = inflateReset(&c->zstream);
@@ -85,11 +94,17 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     c->zstream.avail_out = c->decomp_size;
     ret = inflate(&c->zstream, Z_FINISH);
     // Z_DATA_ERROR means empty picture
+    if (ret == Z_DATA_ERROR && !palette_has_changed) {
+        return buf_size;
+    }
+
     if ((ret != Z_OK) && (ret != Z_STREAM_END) && (ret != Z_DATA_ERROR)) {
         av_log(avctx, AV_LOG_ERROR, "Inflate error: %d\n", ret);
         return AVERROR_UNKNOWN;
     }
 
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
+        return ret;
 
     if (ret != Z_DATA_ERROR) {
         bytestream2_init(&c->gb, c->decomp_buf,
@@ -99,15 +114,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     /* make the palette available on the way out */
     if (c->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
-        const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
-
-        if (pal) {
-            frame->palette_has_changed = 1;
-            memcpy(c->pal, pal, AVPALETTE_SIZE);
-        }
+        frame->palette_has_changed = palette_has_changed;
         memcpy(frame->data[1], c->pal, AVPALETTE_SIZE);
     }
 
+    if ((ret = av_frame_ref(data, frame)) < 0)
+        return ret;
     *got_frame      = 1;
 
     /* always report that the buffer was completely consumed */
@@ -131,9 +143,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     case 24:
              avctx->pix_fmt = AV_PIX_FMT_BGR24;
              break;
-    case 32: avctx->pix_fmt = AV_PIX_FMT_RGB32; break;
+    case 32: avctx->pix_fmt = AV_PIX_FMT_0RGB32; break;
     default: av_log(avctx, AV_LOG_ERROR, "Camtasia error: unknown depth %i bpp\n", avctx->bits_per_coded_sample);
-             return AVERROR_INVALIDDATA;
+             return AVERROR_PATCHWELCOME;
     }
     c->bpp = avctx->bits_per_coded_sample;
     // buffer size for RLE 'best' case when 2-byte code precedes each pixel and there may be padding after it too
@@ -156,6 +168,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
+    c->frame = av_frame_alloc();
+    if (!c->frame)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
@@ -164,6 +180,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
     CamtasiaContext * const c = avctx->priv_data;
 
     av_freep(&c->decomp_buf);
+    av_frame_free(&c->frame);
 
     inflateEnd(&c->zstream);
 
@@ -180,4 +197,5 @@ AVCodec ff_tscc_decoder = {
     .close          = decode_end,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
diff --git a/libavcodec/tscc2.c b/libavcodec/tscc2.c
index 28d6464..a8c7ee7 100644
--- a/libavcodec/tscc2.c
+++ b/libavcodec/tscc2.c
@@ -2,20 +2,20 @@
  * TechSmith Screen Codec 2 (aka Dora) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,11 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
 #include "tscc2data.h"
-#include "vlc.h"
 
 typedef struct TSCC2Context {
     AVCodecContext *avctx;
@@ -42,7 +41,7 @@ typedef struct TSCC2Context {
     uint8_t        *slice_quants;
     int            quant[2];
     int            q[2][3];
-    BitstreamContext bc;
+    GetBitContext  gb;
 
     VLC            dc_vlc, nc_vlc[NUM_VLC_SETS], ac_vlc[NUM_VLC_SETS];
     int            block[16];
@@ -92,14 +91,14 @@ static av_cold int init_vlcs(TSCC2Context *c)
     return 0;
 }
 
-#define DEQUANT(val, q) ((q * val + 0x80) >> 8)
+#define DEQUANT(val, q) (((q) * (val) + 0x80) >> 8)
 #define DCT1D(d0, d1, d2, d3, s0, s1, s2, s3, OP) \
     OP(d0, 5 * ((s0) + (s1) + (s2)) + 2 * (s3));  \
     OP(d1, 5 * ((s0) - (s2) - (s3)) + 2 * (s1));  \
     OP(d2, 5 * ((s0) - (s2) + (s3)) - 2 * (s1));  \
     OP(d3, 5 * ((s0) - (s1) + (s2)) - 2 * (s3));  \
 
-#define COL_OP(a, b)  a = b
+#define COL_OP(a, b)  a = (b)
 #define ROW_OP(a, b)  a = ((b) + 0x20) >> 6
 
 static void tscc2_idct4_put(int *in, int q[3], uint8_t *dst, int stride)
@@ -128,21 +127,21 @@ static void tscc2_idct4_put(int *in, int q[3], uint8_t *dst, int stride)
 static int tscc2_decode_mb(TSCC2Context *c, int *q, int vlc_set,
                            uint8_t *dst, int stride, int plane)
 {
-    BitstreamContext *bc = &c->bc;
+    GetBitContext *gb = &c->gb;
     int prev_dc, dc, nc, ac, bpos, val;
     int i, j, k, l;
 
-    if (bitstream_read_bit(bc)) {
-        if (bitstream_read_bit(bc)) {
-            val = bitstream_read(bc, 8);
+    if (get_bits1(gb)) {
+        if (get_bits1(gb)) {
+            val = get_bits(gb, 8);
             for (i = 0; i < 8; i++, dst += stride)
                 memset(dst, val, 16);
         } else {
-            if (bitstream_bits_left(bc) < 16 * 8 * 8)
+            if (get_bits_left(gb) < 16 * 8 * 8)
                 return AVERROR_INVALIDDATA;
             for (i = 0; i < 8; i++) {
                 for (j = 0; j < 16; j++)
-                    dst[j] = bitstream_read(bc, 8);
+                    dst[j] = get_bits(gb, 8);
                 dst += stride;
             }
         }
@@ -153,30 +152,30 @@ static int tscc2_decode_mb(TSCC2Context *c, int *q, int vlc_set,
     for (j = 0; j < 2; j++) {
         for (k = 0; k < 4; k++) {
             if (!(j | k)) {
-                dc = bitstream_read(bc, 8);
+                dc = get_bits(gb, 8);
             } else {
-                dc = bitstream_read_vlc(bc, c->dc_vlc.table, 9, 2);
+                dc = get_vlc2(gb, c->dc_vlc.table, 9, 2);
                 if (dc == -1)
                     return AVERROR_INVALIDDATA;
                 if (dc == 0x100)
-                    dc = bitstream_read(bc, 8);
+                    dc = get_bits(gb, 8);
             }
             dc          = (dc + prev_dc) & 0xFF;
             prev_dc     = dc;
             c->block[0] = dc;
 
-            nc = bitstream_read_vlc(bc, c->nc_vlc[vlc_set].table, 9, 1);
+            nc = get_vlc2(gb, c->nc_vlc[vlc_set].table, 9, 1);
             if (nc == -1)
                 return AVERROR_INVALIDDATA;
 
             bpos = 1;
             memset(c->block + 1, 0, 15 * sizeof(*c->block));
             for (l = 0; l < nc; l++) {
-                ac = bitstream_read_vlc(bc, c->ac_vlc[vlc_set].table, 9, 2);
+                ac = get_vlc2(gb, c->ac_vlc[vlc_set].table, 9, 2);
                 if (ac == -1)
                     return AVERROR_INVALIDDATA;
                 if (ac == 0x1000)
-                    ac = bitstream_read(bc, 12);
+                    ac = get_bits(gb, 12);
                 bpos += ac & 0xF;
                 if (bpos >= 16)
                     return AVERROR_INVALIDDATA;
@@ -196,7 +195,8 @@ static int tscc2_decode_slice(TSCC2Context *c, int mb_y,
     int i, mb_x, q, ret;
     int off;
 
-    bitstream_init8(&c->bc, buf, buf_size);
+    if ((ret = init_get_bits8(&c->gb, buf, buf_size)) < 0)
+        return ret;
 
     for (mb_x = 0; mb_x < c->mb_width; mb_x++) {
         q = c->slice_quants[mb_x + c->mb_width * mb_y];
@@ -235,19 +235,15 @@ static int tscc2_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-        return ret;
-    }
-
     if (frame_type == 0) {
-        *got_frame      = 1;
-        if ((ret = av_frame_ref(data, c->pic)) < 0)
-            return ret;
-
+        // Skip duplicate frames
         return buf_size;
     }
 
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
+        return ret;
+    }
+
     if (bytestream2_get_bytes_left(&gb) < 4) {
         av_log(avctx, AV_LOG_ERROR, "Frame is too short\n");
         return AVERROR_INVALIDDATA;
diff --git a/libavcodec/tscc2data.h b/libavcodec/tscc2data.h
index bcadc09..5e8d18b 100644
--- a/libavcodec/tscc2data.h
+++ b/libavcodec/tscc2data.h
@@ -2,25 +2,25 @@
  * TechSmith Screen Codec 2 (aka Dora) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_TSCC2_DATA_H
-#define AVCODEC_TSCC2_DATA_H
+#ifndef AVCODEC_TSCC2DATA_H
+#define AVCODEC_TSCC2DATA_H
 
 #include <stdint.h>
 
@@ -925,4 +925,4 @@ static const uint8_t * const tscc2_ac_vlc_bits[NUM_VLC_SETS] = {
     ac_vlc_descC_bits,
 };
 
-#endif /* AVCODEC_TSCC2_DATA_H */
+#endif /* AVCODEC_TSCC2DATA_H */
diff --git a/libavcodec/tta.c b/libavcodec/tta.c
index db5e094..8f097b3 100644
--- a/libavcodec/tta.c
+++ b/libavcodec/tta.c
@@ -2,20 +2,20 @@
  * TTA (The Lossless True Audio) decoder
  * Copyright (c) 2006 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,37 +30,24 @@
 #include <limits.h>
 
 #include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
 
 #define BITSTREAM_READER_LE
+#include "ttadata.h"
+#include "ttadsp.h"
 #include "avcodec.h"
-#include "bitstream.h"
-#include "internal.h"
+#include "get_bits.h"
+#include "thread.h"
 #include "unary.h"
+#include "internal.h"
 
 #define FORMAT_SIMPLE    1
 #define FORMAT_ENCRYPTED 2
 
-#define MAX_ORDER 16
-typedef struct TTAFilter {
-    int32_t shift, round, error;
-    int32_t qm[MAX_ORDER];
-    int32_t dx[MAX_ORDER];
-    int32_t dl[MAX_ORDER];
-} TTAFilter;
-
-typedef struct TTARice {
-    uint32_t k0, k1, sum0, sum1;
-} TTARice;
-
-typedef struct TTAChannel {
-    int32_t predictor;
-    TTAFilter filter;
-    TTARice rice;
-} TTAChannel;
-
 typedef struct TTAContext {
+    AVClass *class;
     AVCodecContext *avctx;
-    BitstreamContext bc;
     const AVCRC *crc_table;
 
     int format, channels, bps;
@@ -69,118 +56,65 @@ typedef struct TTAContext {
 
     int32_t *decode_buffer;
 
+    uint8_t crc_pass[8];
+    uint8_t *pass;
     TTAChannel *ch_ctx;
+    TTADSPContext dsp;
 } TTAContext;
 
-static const uint32_t shift_1[] = {
-    0x00000001, 0x00000002, 0x00000004, 0x00000008,
-    0x00000010, 0x00000020, 0x00000040, 0x00000080,
-    0x00000100, 0x00000200, 0x00000400, 0x00000800,
-    0x00001000, 0x00002000, 0x00004000, 0x00008000,
-    0x00010000, 0x00020000, 0x00040000, 0x00080000,
-    0x00100000, 0x00200000, 0x00400000, 0x00800000,
-    0x01000000, 0x02000000, 0x04000000, 0x08000000,
-    0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x80000000, 0x80000000, 0x80000000, 0x80000000,
-    0x80000000, 0x80000000, 0x80000000, 0x80000000
+static const int64_t tta_channel_layouts[7] = {
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_QUAD,
+    0,
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK|AV_CH_BACK_CENTER,
+    AV_CH_LAYOUT_7POINT1_WIDE
 };
 
-static const uint32_t * const shift_16 = shift_1 + 4;
-
-static const int32_t ttafilter_configs[4] = {
-    10,
-    9,
-    10,
-    12
-};
+static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
+{
+    uint32_t crc, CRC;
 
-static void ttafilter_init(TTAFilter *c, int32_t shift) {
-    memset(c, 0, sizeof(TTAFilter));
-    c->shift = shift;
-   c->round = shift_1[shift-1];
-//    c->round = 1 << (shift - 1);
-}
+    CRC = AV_RL32(buf + buf_size);
+    crc = av_crc(s->crc_table, 0xFFFFFFFFU, buf, buf_size);
+    if (CRC != (crc ^ 0xFFFFFFFFU)) {
+        av_log(s->avctx, AV_LOG_ERROR, "CRC error\n");
+        return AVERROR_INVALIDDATA;
+    }
 
-// FIXME: copy paste from original
-static inline void memshl(register int32_t *a, register int32_t *b) {
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a = *b;
+    return 0;
 }
 
-static inline void ttafilter_process(TTAFilter *c, int32_t *in)
+static uint64_t tta_check_crc64(uint8_t *pass)
 {
-    register int32_t *dl = c->dl, *qm = c->qm, *dx = c->dx, sum = c->round;
-
-    if (!c->error) {
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        dx += 8;
-    } else if(c->error < 0) {
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-    } else {
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
+    uint64_t crc = UINT64_MAX, poly = 0x42F0E1EBA9EA3693U;
+    uint8_t *end = pass + strlen(pass);
+    int i;
+
+    while (pass < end) {
+        crc ^= (uint64_t)*pass++ << 56;
+        for (i = 0; i < 8; i++)
+            crc = (crc << 1) ^ (poly & (((int64_t) crc) >> 63));
     }
 
-    *(dx-0) = ((*(dl-1) >> 30) | 1) << 2;
-    *(dx-1) = ((*(dl-2) >> 30) | 1) << 1;
-    *(dx-2) = ((*(dl-3) >> 30) | 1) << 1;
-    *(dx-3) = ((*(dl-4) >> 30) | 1);
-
-    c->error = *in;
-    *in += (sum >> c->shift);
-    *dl = *in;
-
-    *(dl-1) = *dl - *(dl-1);
-    *(dl-2) = *(dl-1) - *(dl-2);
-    *(dl-3) = *(dl-2) - *(dl-3);
-
-    memshl(c->dl, c->dl + 1);
-    memshl(c->dx, c->dx + 1);
+    return crc ^ UINT64_MAX;
 }
 
-static void rice_init(TTARice *c, uint32_t k0, uint32_t k1)
-{
-    c->k0 = k0;
-    c->k1 = k1;
-    c->sum0 = shift_16[k0];
-    c->sum1 = shift_16[k1];
-}
-
-static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
+static int allocate_buffers(AVCodecContext *avctx)
 {
-    uint32_t crc, CRC;
+    TTAContext *s = avctx->priv_data;
 
-    CRC = AV_RL32(buf + buf_size);
-    crc = av_crc(s->crc_table, 0xFFFFFFFFU, buf, buf_size);
-    if (CRC != (crc ^ 0xFFFFFFFFU)) {
-        av_log(s->avctx, AV_LOG_ERROR, "CRC error\n");
-        return AVERROR_INVALIDDATA;
+    if (s->bps < 3) {
+        s->decode_buffer = av_mallocz_array(sizeof(int32_t)*s->frame_length, s->channels);
+        if (!s->decode_buffer)
+            return AVERROR(ENOMEM);
+    } else
+        s->decode_buffer = NULL;
+    s->ch_ctx = av_malloc_array(avctx->channels, sizeof(*s->ch_ctx));
+    if (!s->ch_ctx) {
+        av_freep(&s->decode_buffer);
+        return AVERROR(ENOMEM);
     }
 
     return 0;
@@ -189,58 +123,63 @@ static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
 static av_cold int tta_decode_init(AVCodecContext * avctx)
 {
     TTAContext *s = avctx->priv_data;
+    GetBitContext gb;
     int total_frames;
+    int ret;
 
     s->avctx = avctx;
 
-    // 30bytes includes a seektable with one frame
-    if (avctx->extradata_size < 30)
-        return -1;
+    // 30bytes includes TTA1 header
+    if (avctx->extradata_size < 22)
+        return AVERROR_INVALIDDATA;
 
-    bitstream_init8(&s->bc, avctx->extradata, avctx->extradata_size);
-    if (bitstream_peek(&s->bc, 32) == AV_RL32("TTA1")) {
-        if (avctx->err_recognition & AV_EF_CRCCHECK) {
-            s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
-            tta_check_crc(s, avctx->extradata, 18);
-        }
+    s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size);
+    if (ret < 0)
+        return ret;
 
+    if (show_bits_long(&gb, 32) == AV_RL32("TTA1")) {
         /* signature */
-        bitstream_skip(&s->bc, 32);
+        skip_bits_long(&gb, 32);
 
-        s->format = bitstream_read(&s->bc, 16);
+        s->format = get_bits(&gb, 16);
         if (s->format > 2) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid format\n");
-            return -1;
+            av_log(avctx, AV_LOG_ERROR, "Invalid format\n");
+            return AVERROR_INVALIDDATA;
         }
         if (s->format == FORMAT_ENCRYPTED) {
-            avpriv_report_missing_feature(s->avctx, "Encrypted TTA");
-            return AVERROR_PATCHWELCOME;
+            if (!s->pass) {
+                av_log(avctx, AV_LOG_ERROR, "Missing password for encrypted stream. Please use the -password option\n");
+                return AVERROR(EINVAL);
+            }
+            AV_WL64(s->crc_pass, tta_check_crc64(s->pass));
         }
-        avctx->channels              =
-        s->channels                  = bitstream_read(&s->bc, 16);
-        avctx->bits_per_coded_sample = bitstream_read(&s->bc, 16);
-        s->bps = (avctx->bits_per_coded_sample + 7) / 8;
-        avctx->sample_rate           = bitstream_read(&s->bc, 32);
-        s->data_length               = bitstream_read(&s->bc, 32);
-        bitstream_skip(&s->bc, 32); // CRC32 of header
+        avctx->channels = s->channels = get_bits(&gb, 16);
+        if (s->channels > 1 && s->channels < 9)
+            avctx->channel_layout = tta_channel_layouts[s->channels-2];
+        avctx->bits_per_raw_sample = get_bits(&gb, 16);
+        s->bps = (avctx->bits_per_raw_sample + 7) / 8;
+        avctx->sample_rate = get_bits_long(&gb, 32);
+        s->data_length = get_bits_long(&gb, 32);
+        skip_bits_long(&gb, 32); // CRC32 of header
 
         if (s->channels == 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of channels\n");
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
             return AVERROR_INVALIDDATA;
         } else if (avctx->sample_rate == 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid samplerate\n");
+            av_log(avctx, AV_LOG_ERROR, "Invalid samplerate\n");
             return AVERROR_INVALIDDATA;
         }
 
         switch(s->bps) {
+        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8; break;
         case 2:
             avctx->sample_fmt = AV_SAMPLE_FMT_S16;
-            avctx->bits_per_raw_sample = 16;
             break;
         case 3:
             avctx->sample_fmt = AV_SAMPLE_FMT_S32;
-            avctx->bits_per_raw_sample = 24;
             break;
+        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32; break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported sample format.\n");
             return AVERROR_INVALIDDATA;
@@ -257,54 +196,35 @@ static av_cold int tta_decode_init(AVCodecContext * avctx)
         total_frames = s->data_length / s->frame_length +
                        (s->last_frame_length ? 1 : 0);
 
-        av_log(s->avctx, AV_LOG_DEBUG, "format: %d chans: %d bps: %d rate: %d block: %d\n",
+        av_log(avctx, AV_LOG_DEBUG, "format: %d chans: %d bps: %d rate: %d block: %d\n",
             s->format, avctx->channels, avctx->bits_per_coded_sample, avctx->sample_rate,
             avctx->block_align);
-        av_log(s->avctx, AV_LOG_DEBUG, "data_length: %d frame_length: %d last: %d total: %d\n",
+        av_log(avctx, AV_LOG_DEBUG, "data_length: %d frame_length: %d last: %d total: %d\n",
             s->data_length, s->frame_length, s->last_frame_length, total_frames);
 
-        // FIXME: seek table
-        if (avctx->extradata_size <= 26 || total_frames > INT_MAX / 4 ||
-            avctx->extradata_size - 26 < total_frames * 4)
-            av_log(avctx, AV_LOG_WARNING, "Seek table missing or too small\n");
-        else if (avctx->err_recognition & AV_EF_CRCCHECK) {
-            int ret = tta_check_crc(s, avctx->extradata + 22, total_frames * 4);
-            if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE)
-                return AVERROR_INVALIDDATA;
-        }
-        bitstream_skip(&s->bc, 32 * total_frames);
-        bitstream_skip(&s->bc, 32); // CRC32 of seektable
-
         if(s->frame_length >= UINT_MAX / (s->channels * sizeof(int32_t))){
             av_log(avctx, AV_LOG_ERROR, "frame_length too large\n");
-            return -1;
-        }
-
-        if (s->bps == 2) {
-            s->decode_buffer = av_mallocz(sizeof(int32_t)*s->frame_length*s->channels);
-            if (!s->decode_buffer)
-                return AVERROR(ENOMEM);
-        }
-        s->ch_ctx = av_malloc(avctx->channels * sizeof(*s->ch_ctx));
-        if (!s->ch_ctx) {
-            av_freep(&s->decode_buffer);
-            return AVERROR(ENOMEM);
+            return AVERROR_INVALIDDATA;
         }
     } else {
         av_log(avctx, AV_LOG_ERROR, "Wrong extradata present\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
-    return 0;
+    ff_ttadsp_init(&s->dsp);
+
+    return allocate_buffers(avctx);
 }
 
 static int tta_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
 {
     AVFrame *frame     = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     TTAContext *s = avctx->priv_data;
+    GetBitContext gb;
     int i, ret;
     int cur_chan = 0, framelen = s->frame_length;
     int32_t *p;
@@ -315,14 +235,13 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             return AVERROR_INVALIDDATA;
     }
 
-    bitstream_init8(&s->bc, buf, buf_size);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     /* get output buffer */
     frame->nb_samples = framelen;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
     // decode directly to output buffer for 24-bit sample format
     if (s->bps == 3)
@@ -330,9 +249,15 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
 
     // init per channel states
     for (i = 0; i < s->channels; i++) {
+        TTAFilter *filter = &s->ch_ctx[i].filter;
         s->ch_ctx[i].predictor = 0;
-        ttafilter_init(&s->ch_ctx[i].filter, ttafilter_configs[s->bps-1]);
-        rice_init(&s->ch_ctx[i].rice, 10, 10);
+        ff_tta_filter_init(filter, ff_tta_filter_configs[s->bps-1]);
+        if (s->format == FORMAT_ENCRYPTED) {
+            int i;
+            for (i = 0; i < 8; i++)
+                filter->qm[i] = sign_extend(s->crc_pass[i], 8);
+        }
+        ff_tta_rice_init(&s->ch_ctx[i].rice, 10, 10);
     }
 
     i = 0;
@@ -343,7 +268,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         uint32_t unary, depth, k;
         int32_t value;
 
-        unary = get_unary(&s->bc, 0, bitstream_bits_left(&s->bc));
+        unary = get_unary(&gb, 0, get_bits_left(&gb));
 
         if (unary == 0) {
             depth = 0;
@@ -354,17 +279,17 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             unary--;
         }
 
-        if (bitstream_bits_left(&s->bc) < k) {
+        if (get_bits_left(&gb) < k) {
             ret = AVERROR_INVALIDDATA;
             goto error;
         }
 
         if (k) {
-            if (k >= 32 || unary > INT32_MAX >> k) {
+            if (k > MIN_CACHE_BITS || unary > INT32_MAX >> k) {
                 ret = AVERROR_INVALIDDATA;
                 goto error;
             }
-            value = (unary << k) + bitstream_read(&s->bc, k);
+            value = (unary << k) + get_bits(&gb, k);
         } else
             value = unary;
 
@@ -372,16 +297,16 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         switch (depth) {
         case 1:
             rice->sum1 += value - (rice->sum1 >> 4);
-            if (rice->k1 > 0 && rice->sum1 < shift_16[rice->k1])
+            if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
                 rice->k1--;
-            else if(rice->sum1 > shift_16[rice->k1 + 1])
+            else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
                 rice->k1++;
-            value += shift_1[rice->k0];
+            value += ff_tta_shift_1[rice->k0];
         default:
             rice->sum0 += value - (rice->sum0 >> 4);
-            if (rice->k0 > 0 && rice->sum0 < shift_16[rice->k0])
+            if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
                 rice->k0--;
-            else if(rice->sum0 > shift_16[rice->k0 + 1])
+            else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
                 rice->k0++;
         }
 
@@ -389,10 +314,11 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         *p = 1 + ((value >> 1) ^ ((value & 1) - 1));
 
         // run hybrid filter
-        ttafilter_process(filter, p);
+        s->dsp.filter_process(filter->qm, filter->dx, filter->dl, &filter->error, p,
+                              filter->shift, filter->round);
 
         // fixed order prediction
-#define PRED(x, k) (int32_t)((((uint64_t)x << k) - x) >> k)
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
         switch (s->bps) {
         case 1: *p += PRED(*predictor, 4); break;
         case 2:
@@ -414,32 +340,43 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             cur_chan = 0;
             i++;
             // check for last frame
-            if (i == s->last_frame_length && bitstream_bits_left(&s->bc) / 8 == 4) {
+            if (i == s->last_frame_length && get_bits_left(&gb) / 8 == 4) {
                 frame->nb_samples = framelen = s->last_frame_length;
                 break;
             }
         }
     }
 
-    bitstream_align(&s->bc);
-    if (bitstream_bits_left(&s->bc) < 32) {
+    align_get_bits(&gb);
+    if (get_bits_left(&gb) < 32) {
         ret = AVERROR_INVALIDDATA;
         goto error;
     }
-    bitstream_skip(&s->bc, 32); // frame CRC
+    skip_bits_long(&gb, 32); // frame crc
 
     // convert to output buffer
-    if (s->bps == 2) {
+    switch (s->bps) {
+    case 1: {
+        uint8_t *samples = (uint8_t *)frame->data[0];
+        for (p = s->decode_buffer; p < s->decode_buffer + (framelen * s->channels); p++)
+            *samples++ = *p + 0x80;
+        break;
+        }
+    case 2: {
         int16_t *samples = (int16_t *)frame->data[0];
         for (p = s->decode_buffer; p < s->decode_buffer + (framelen * s->channels); p++)
             *samples++ = *p;
-    } else {
+        break;
+        }
+    case 3: {
         // shift samples for 24-bit sample format
         int32_t *samples = (int32_t *)frame->data[0];
         for (i = 0; i < framelen * s->channels; i++)
             *samples++ <<= 8;
         // reset decode buffer
         s->decode_buffer = NULL;
+        break;
+        }
     }
 
     *got_frame_ptr = 1;
@@ -452,15 +389,38 @@ error:
     return ret;
 }
 
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    TTAContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return allocate_buffers(avctx);
+}
+
 static av_cold int tta_decode_close(AVCodecContext *avctx) {
     TTAContext *s = avctx->priv_data;
 
-    av_free(s->decode_buffer);
+    if (s->bps < 3)
+        av_freep(&s->decode_buffer);
+    s->decode_buffer = NULL;
     av_freep(&s->ch_ctx);
 
     return 0;
 }
 
+#define OFFSET(x) offsetof(TTAContext, x)
+#define DEC (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
+static const AVOption options[] = {
+    { "password", "Set decoding password", OFFSET(pass), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, DEC },
+    { NULL },
+};
+
+static const AVClass tta_decoder_class = {
+    .class_name = "TTA Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_tta_decoder = {
     .name           = "tta",
     .long_name      = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
@@ -470,5 +430,7 @@ AVCodec ff_tta_decoder = {
     .init           = tta_decode_init,
     .close          = tta_decode_close,
     .decode         = tta_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &tta_decoder_class,
 };
diff --git a/libavcodec/ttadata.c b/libavcodec/ttadata.c
new file mode 100644
index 0000000..bf793a4
--- /dev/null
+++ b/libavcodec/ttadata.c
@@ -0,0 +1,52 @@
+/*
+ * TTA (The Lossless True Audio) data
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ttadata.h"
+
+const uint32_t ff_tta_shift_1[] = {
+    0x00000001, 0x00000002, 0x00000004, 0x00000008,
+    0x00000010, 0x00000020, 0x00000040, 0x00000080,
+    0x00000100, 0x00000200, 0x00000400, 0x00000800,
+    0x00001000, 0x00002000, 0x00004000, 0x00008000,
+    0x00010000, 0x00020000, 0x00040000, 0x00080000,
+    0x00100000, 0x00200000, 0x00400000, 0x00800000,
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000,
+    0x80000000, 0x80000000, 0x80000000, 0x80000000,
+    0x80000000, 0x80000000, 0x80000000, 0x80000000
+};
+
+const uint32_t * const ff_tta_shift_16 = ff_tta_shift_1 + 4;
+
+const uint8_t ff_tta_filter_configs[] = { 10, 9, 10, 12 };
+
+void ff_tta_rice_init(TTARice *c, uint32_t k0, uint32_t k1)
+{
+    c->k0 = k0;
+    c->k1 = k1;
+    c->sum0 = ff_tta_shift_16[k0];
+    c->sum1 = ff_tta_shift_16[k1];
+}
+
+void ff_tta_filter_init(TTAFilter *c, int32_t shift) {
+    memset(c, 0, sizeof(TTAFilter));
+    c->shift = shift;
+    c->round = ff_tta_shift_1[shift-1];
+}
diff --git a/libavcodec/ttadata.h b/libavcodec/ttadata.h
new file mode 100644
index 0000000..48c4cd0
--- /dev/null
+++ b/libavcodec/ttadata.h
@@ -0,0 +1,50 @@
+/*
+ * TTA (The Lossless True Audio) data
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTADATA_H
+#define AVCODEC_TTADATA_H
+
+#include "internal.h"
+
+#define MAX_ORDER 16
+typedef struct TTAFilter {
+    int32_t shift, round, error;
+    int32_t qm[MAX_ORDER];
+    int32_t dx[MAX_ORDER];
+    int32_t dl[MAX_ORDER];
+} TTAFilter;
+
+typedef struct TTARice {
+    uint32_t k0, k1, sum0, sum1;
+} TTARice;
+
+typedef struct TTAChannel {
+    int32_t predictor;
+    TTAFilter filter;
+    TTARice rice;
+} TTAChannel;
+
+extern const uint32_t ff_tta_shift_1[];
+extern const uint32_t * const ff_tta_shift_16;
+extern const uint8_t ff_tta_filter_configs[];
+
+void ff_tta_rice_init(TTARice *c, uint32_t k0, uint32_t k1);
+void ff_tta_filter_init(TTAFilter *c, int32_t shift);
+#endif /* AVCODEC_TTADATA_H */
diff --git a/libavcodec/ttadsp.c b/libavcodec/ttadsp.c
new file mode 100644
index 0000000..056a2c7
--- /dev/null
+++ b/libavcodec/ttadsp.c
@@ -0,0 +1,59 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "ttadsp.h"
+#include "config.h"
+
+static void tta_filter_process_c(int32_t *qm, int32_t *dx, int32_t *dl,
+                                 int32_t *error, int32_t *in, int32_t shift,
+                                 int32_t round) {
+    if (*error < 0) {
+        qm[0] -= dx[0]; qm[1] -= dx[1]; qm[2] -= dx[2]; qm[3] -= dx[3];
+        qm[4] -= dx[4]; qm[5] -= dx[5]; qm[6] -= dx[6]; qm[7] -= dx[7];
+    } else if (*error > 0) {
+        qm[0] += dx[0]; qm[1] += dx[1]; qm[2] += dx[2]; qm[3] += dx[3];
+        qm[4] += dx[4]; qm[5] += dx[5]; qm[6] += dx[6]; qm[7] += dx[7];
+    }
+
+    round += dl[0] * qm[0] + dl[1] * qm[1] + dl[2] * qm[2] + dl[3] * qm[3] +
+             dl[4] * qm[4] + dl[5] * qm[5] + dl[6] * qm[6] + dl[7] * qm[7];
+
+    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
+    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];
+
+    dx[4] = ((dl[4] >> 30) | 1);
+    dx[5] = ((dl[5] >> 30) | 2) & ~1;
+    dx[6] = ((dl[6] >> 30) | 2) & ~1;
+    dx[7] = ((dl[7] >> 30) | 4) & ~3;
+
+    *error = *in;
+    *in += (round >> shift);
+
+    dl[4] = -dl[5]; dl[5] = -dl[6];
+    dl[6] = *in - dl[7]; dl[7] = *in;
+    dl[5] += dl[6]; dl[4] += dl[5];
+}
+
+av_cold void ff_ttadsp_init(TTADSPContext *c)
+{
+    c->filter_process = tta_filter_process_c;
+
+    if (ARCH_X86)
+        ff_ttadsp_init_x86(c);
+}
diff --git a/libavcodec/ttadsp.h b/libavcodec/ttadsp.h
new file mode 100644
index 0000000..737d9bd
--- /dev/null
+++ b/libavcodec/ttadsp.h
@@ -0,0 +1,33 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTADSP_H
+#define AVCODEC_TTADSP_H
+
+#include <stdint.h>
+
+typedef struct TTADSPContext {
+    void (*filter_process)(int32_t *qm, int32_t *dx, int32_t *dl,
+                           int32_t *error, int32_t *in, int32_t shift,
+                           int32_t round);
+} TTADSPContext;
+
+void ff_ttadsp_init(TTADSPContext *c);
+void ff_ttadsp_init_x86(TTADSPContext *c);
+
+#endif /* AVCODEC_TTADSP_H */
diff --git a/libavcodec/ttaenc.c b/libavcodec/ttaenc.c
new file mode 100644
index 0000000..3cc54d7
--- /dev/null
+++ b/libavcodec/ttaenc.c
@@ -0,0 +1,217 @@
+/*
+ * TTA (The Lossless True Audio) encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_WRITER_LE
+#include "ttadata.h"
+#include "ttaencdsp.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "internal.h"
+#include "libavutil/crc.h"
+
+typedef struct TTAEncContext {
+    const AVCRC *crc_table;
+    int bps;
+    TTAChannel *ch_ctx;
+    TTAEncDSPContext dsp;
+} TTAEncContext;
+
+static av_cold int tta_encode_init(AVCodecContext *avctx)
+{
+    TTAEncContext *s = avctx->priv_data;
+
+    s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_U8:
+        avctx->bits_per_raw_sample = 8;
+        break;
+    case AV_SAMPLE_FMT_S16:
+        avctx->bits_per_raw_sample = 16;
+        break;
+    case AV_SAMPLE_FMT_S32:
+        if (avctx->bits_per_raw_sample > 24)
+            av_log(avctx, AV_LOG_WARNING, "encoding as 24 bits-per-sample\n");
+        avctx->bits_per_raw_sample = 24;
+    }
+
+    s->bps = avctx->bits_per_raw_sample >> 3;
+    avctx->frame_size = 256 * avctx->sample_rate / 245;
+
+    s->ch_ctx = av_malloc_array(avctx->channels, sizeof(*s->ch_ctx));
+    if (!s->ch_ctx)
+        return AVERROR(ENOMEM);
+
+    ff_ttaencdsp_init(&s->dsp);
+
+    return 0;
+}
+
+static int32_t get_sample(const AVFrame *frame, int sample,
+                          enum AVSampleFormat format)
+{
+    int32_t ret;
+
+    if (format == AV_SAMPLE_FMT_U8) {
+        ret = frame->data[0][sample] - 0x80;
+    } else if (format == AV_SAMPLE_FMT_S16) {
+        const int16_t *ptr = (const int16_t *)frame->data[0];
+        ret = ptr[sample];
+    } else {
+        const int32_t *ptr = (const int32_t *)frame->data[0];
+        ret = ptr[sample] >> 8;
+    }
+
+    return ret;
+}
+
+static int tta_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet_ptr)
+{
+    TTAEncContext *s = avctx->priv_data;
+    PutBitContext pb;
+    int ret, i, out_bytes, cur_chan, res, samples;
+    int64_t pkt_size =  frame->nb_samples * 2LL * avctx->channels * s->bps;
+
+pkt_alloc:
+    cur_chan = 0, res = 0, samples = 0;
+    if ((ret = ff_alloc_packet2(avctx, avpkt, pkt_size, 0)) < 0)
+        return ret;
+    init_put_bits(&pb, avpkt->data, avpkt->size);
+
+    // init per channel states
+    for (i = 0; i < avctx->channels; i++) {
+        s->ch_ctx[i].predictor = 0;
+        ff_tta_filter_init(&s->ch_ctx[i].filter, ff_tta_filter_configs[s->bps - 1]);
+        ff_tta_rice_init(&s->ch_ctx[i].rice, 10, 10);
+    }
+
+    for (i = 0; i < frame->nb_samples * avctx->channels; i++) {
+        TTAChannel *c = &s->ch_ctx[cur_chan];
+        TTAFilter *filter = &c->filter;
+        TTARice *rice = &c->rice;
+        uint32_t k, unary, outval;
+        int32_t value, temp;
+
+        value = get_sample(frame, samples++, avctx->sample_fmt);
+
+        if (avctx->channels > 1) {
+            if (cur_chan < avctx->channels - 1)
+                value  = res = get_sample(frame, samples, avctx->sample_fmt) - value;
+            else
+                value -= res / 2;
+        }
+
+        temp = value;
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
+        switch (s->bps) {
+        case 1: value -= PRED(c->predictor, 4); break;
+        case 2:
+        case 3: value -= PRED(c->predictor, 5); break;
+        }
+        c->predictor = temp;
+
+        s->dsp.filter_process(filter->qm, filter->dx, filter->dl, &filter->error, &value,
+                              filter->shift, filter->round);
+        outval = (value > 0) ? (value << 1) - 1: -value << 1;
+
+        k = rice->k0;
+
+        rice->sum0 += outval - (rice->sum0 >> 4);
+        if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
+            rice->k0--;
+        else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
+            rice->k0++;
+
+        if (outval >= ff_tta_shift_1[k]) {
+            outval -= ff_tta_shift_1[k];
+            k = rice->k1;
+
+            rice->sum1 += outval - (rice->sum1 >> 4);
+            if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
+                rice->k1--;
+            else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
+                rice->k1++;
+
+            unary = 1 + (outval >> k);
+            if (unary + 100LL > put_bits_left(&pb)) {
+                if (pkt_size < INT_MAX/2) {
+                    pkt_size *= 2;
+                    av_packet_unref(avpkt);
+                    goto pkt_alloc;
+                } else
+                    return AVERROR(ENOMEM);
+            }
+            do {
+                if (unary > 31) {
+                    put_bits(&pb, 31, 0x7FFFFFFF);
+                    unary -= 31;
+                } else {
+                    put_bits(&pb, unary, (1 << unary) - 1);
+                    unary = 0;
+                }
+            } while (unary);
+        }
+
+        put_bits(&pb, 1, 0);
+
+        if (k)
+            put_bits(&pb, k, outval & (ff_tta_shift_1[k] - 1));
+
+        if (cur_chan < avctx->channels - 1)
+            cur_chan++;
+        else
+            cur_chan = 0;
+    }
+
+    flush_put_bits(&pb);
+    out_bytes = put_bits_count(&pb) >> 3;
+    put_bits32(&pb, av_crc(s->crc_table, UINT32_MAX, avpkt->data, out_bytes) ^ UINT32_MAX);
+    flush_put_bits(&pb);
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = out_bytes + 4;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static av_cold int tta_encode_close(AVCodecContext *avctx)
+{
+    TTAEncContext *s = avctx->priv_data;
+    av_freep(&s->ch_ctx);
+    return 0;
+}
+
+AVCodec ff_tta_encoder = {
+    .name           = "tta",
+    .long_name      = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_TTA,
+    .priv_data_size = sizeof(TTAEncContext),
+    .init           = tta_encode_init,
+    .close          = tta_encode_close,
+    .encode2        = tta_encode_frame,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_LOSSLESS,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_U8,
+                                                     AV_SAMPLE_FMT_S16,
+                                                     AV_SAMPLE_FMT_S32,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/ttaencdsp.c b/libavcodec/ttaencdsp.c
new file mode 100644
index 0000000..6ba594e
--- /dev/null
+++ b/libavcodec/ttaencdsp.c
@@ -0,0 +1,59 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "ttaencdsp.h"
+#include "config.h"
+
+static void ttaenc_filter_process_c(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round) {
+    if (*error < 0) {
+        qm[0] -= dx[0]; qm[1] -= dx[1]; qm[2] -= dx[2]; qm[3] -= dx[3];
+        qm[4] -= dx[4]; qm[5] -= dx[5]; qm[6] -= dx[6]; qm[7] -= dx[7];
+    } else if (*error > 0) {
+        qm[0] += dx[0]; qm[1] += dx[1]; qm[2] += dx[2]; qm[3] += dx[3];
+        qm[4] += dx[4]; qm[5] += dx[5]; qm[6] += dx[6]; qm[7] += dx[7];
+    }
+
+    round += dl[0] * qm[0] + dl[1] * qm[1] + dl[2] * qm[2] + dl[3] * qm[3] +
+             dl[4] * qm[4] + dl[5] * qm[5] + dl[6] * qm[6] + dl[7] * qm[7];
+
+    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
+    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];
+
+    dx[4] = ((dl[4] >> 30) | 1);
+    dx[5] = ((dl[5] >> 30) | 2) & ~1;
+    dx[6] = ((dl[6] >> 30) | 2) & ~1;
+    dx[7] = ((dl[7] >> 30) | 4) & ~3;
+
+    dl[4] = -dl[5]; dl[5] = -dl[6];
+    dl[6] = *in - dl[7]; dl[7] = *in;
+    dl[5] += dl[6]; dl[4] += dl[5];
+
+    *in -= (round >> shift);
+    *error = *in;
+}
+
+av_cold void ff_ttaencdsp_init(TTAEncDSPContext *c)
+{
+    c->filter_process = ttaenc_filter_process_c;
+
+    if (ARCH_X86)
+        ff_ttaencdsp_init_x86(c);
+}
diff --git a/libavcodec/ttaencdsp.h b/libavcodec/ttaencdsp.h
new file mode 100644
index 0000000..4b00728
--- /dev/null
+++ b/libavcodec/ttaencdsp.h
@@ -0,0 +1,33 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTAENCDSP_H
+#define AVCODEC_TTAENCDSP_H
+
+#include <stdint.h>
+
+typedef struct TTAEncDSPContext {
+    void (*filter_process)(int32_t *qm, int32_t *dx, int32_t *dl,
+                           int32_t *error, int32_t *in, int32_t shift,
+                           int32_t round);
+} TTAEncDSPContext;
+
+void ff_ttaencdsp_init(TTAEncDSPContext *c);
+void ff_ttaencdsp_init_x86(TTAEncDSPContext *c);
+
+#endif /* AVCODEC_TTAENCDSP_H */
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 940def4..7b2e19e 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -358,7 +358,7 @@ static void imdct_and_window(TwinVQContext *tctx, enum TwinVQFrameType ftype,
 
         mdct->imdct_half(mdct, buf1 + bsize * j, in + bsize * j);
 
-        tctx->fdsp.vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
+        tctx->fdsp->vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
                                       buf1 + bsize * j,
                                       ff_sine_windows[av_log2(wsize)],
                                       wsize / 2);
@@ -405,7 +405,7 @@ static void imdct_output(TwinVQContext *tctx, enum TwinVQFrameType ftype,
                size1 * sizeof(*out2));
         memcpy(out2 + size1, &tctx->curr_frame[2 * mtab->size],
                size2 * sizeof(*out2));
-        tctx->fdsp.butterflies_float(out1, out2, mtab->size);
+        tctx->fdsp->butterflies_float(out1, out2, mtab->size);
     }
 }
 
@@ -446,7 +446,7 @@ static void read_and_decode_spectrum(TwinVQContext *tctx, float *out,
                                bits->bark_use_hist[i][j], i,
                                tctx->tmp_buf, gain[sub * i + j], ftype);
 
-            tctx->fdsp.vector_fmul(chunk + block_size * j,
+            tctx->fdsp->vector_fmul(chunk + block_size * j,
                                    chunk + block_size * j,
                                    tctx->tmp_buf, block_size);
         }
@@ -461,7 +461,7 @@ static void read_and_decode_spectrum(TwinVQContext *tctx, float *out,
         dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf);
 
         for (j = 0; j < mtab->fmode[ftype].sub; j++) {
-            tctx->fdsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
+            tctx->fdsp->vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
             chunk += block_size;
         }
     }
@@ -487,10 +487,8 @@ int ff_twinvq_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     if (tctx->discarded_packets >= 2) {
         frame->nb_samples = mtab->size * tctx->frames_per_packet;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
         out = (float **)frame->extended_data;
     }
 
@@ -548,24 +546,24 @@ static av_cold int init_mdct_win(TwinVQContext *tctx)
             return ret;
     }
 
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->tmp_buf,
-                     mtab->size * sizeof(*tctx->tmp_buf), alloc_fail);
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->tmp_buf,
+                     mtab->size, sizeof(*tctx->tmp_buf), alloc_fail);
 
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->spectrum,
-                     2 * mtab->size * channels * sizeof(*tctx->spectrum),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->spectrum,
+                     2 * mtab->size, channels * sizeof(*tctx->spectrum),
                      alloc_fail);
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->curr_frame,
-                     2 * mtab->size * channels * sizeof(*tctx->curr_frame),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->curr_frame,
+                     2 * mtab->size, channels * sizeof(*tctx->curr_frame),
                      alloc_fail);
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->prev_frame,
-                     2 * mtab->size * channels * sizeof(*tctx->prev_frame),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->prev_frame,
+                     2 * mtab->size, channels * sizeof(*tctx->prev_frame),
                      alloc_fail);
 
     for (i = 0; i < 3; i++) {
         int m       = 4 * mtab->size / mtab->fmode[i].sub;
         double freq = 2 * M_PI / m;
-        FF_ALLOC_OR_GOTO(tctx->avctx, tctx->cos_tabs[i],
-                         (m / 4) * sizeof(*tctx->cos_tabs[i]), alloc_fail);
+        FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->cos_tabs[i],
+                         (m / 4), sizeof(*tctx->cos_tabs[i]), alloc_fail);
 
         for (j = 0; j <= m / 8; j++)
             tctx->cos_tabs[i][j] = cos((2 * j + 1) * freq);
@@ -757,13 +755,14 @@ av_cold int ff_twinvq_decode_close(AVCodecContext *avctx)
 
     for (i = 0; i < 3; i++) {
         ff_mdct_end(&tctx->mdct_ctx[i]);
-        av_free(tctx->cos_tabs[i]);
+        av_freep(&tctx->cos_tabs[i]);
     }
 
-    av_free(tctx->curr_frame);
-    av_free(tctx->spectrum);
-    av_free(tctx->prev_frame);
-    av_free(tctx->tmp_buf);
+    av_freep(&tctx->curr_frame);
+    av_freep(&tctx->spectrum);
+    av_freep(&tctx->prev_frame);
+    av_freep(&tctx->tmp_buf);
+    av_freep(&tctx->fdsp);
 
     return 0;
 }
@@ -790,7 +789,11 @@ av_cold int ff_twinvq_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    avpriv_float_dsp_init(&tctx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    tctx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!tctx->fdsp) {
+        ff_twinvq_decode_close(avctx);
+        return AVERROR(ENOMEM);
+    }
     if ((ret = init_mdct_win(tctx))) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing MDCT\n");
         ff_twinvq_decode_close(avctx);
diff --git a/libavcodec/twinvq.h b/libavcodec/twinvq.h
index e810565..24e5ebc 100644
--- a/libavcodec/twinvq.h
+++ b/libavcodec/twinvq.h
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -136,7 +136,7 @@ typedef struct TwinVQModeTab {
 
 typedef struct TwinVQContext {
     AVCodecContext *avctx;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext mdct_ctx[3];
 
     const TwinVQModeTab *mtab;
diff --git a/libavcodec/twinvq_data.h b/libavcodec/twinvq_data.h
index cc7ba59..375acc2 100644
--- a/libavcodec/twinvq_data.h
+++ b/libavcodec/twinvq_data.h
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -130,7 +130,7 @@ static const uint16_t bark_tab_s44_128[] = {
 /**
  * TwinVQ codebooks. They are coded in a struct so we can use code such as
  *
- * float val = tab.fcb0808l[bitstream_read(bc, 12)];
+ * float val = tab.fcb0808l[get_bits(gb, 12)];
  *
  * without risking a segfault on malformed files.
  */
diff --git a/libavcodec/twinvqdec.c b/libavcodec/twinvqdec.c
index 8981d95..c2353f5 100644
--- a/libavcodec/twinvqdec.c
+++ b/libavcodec/twinvqdec.c
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,9 +23,8 @@
 #include <stdint.h>
 
 #include "libavutil/channel_layout.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "twinvq.h"
 #include "twinvq_data.h"
@@ -236,7 +235,7 @@ static void dec_bark_env(TwinVQContext *tctx, const uint8_t *in, int use_hist,
         }
 }
 
-static void read_cb_data(TwinVQContext *tctx, BitstreamContext *bc,
+static void read_cb_data(TwinVQContext *tctx, GetBitContext *gb,
                          uint8_t *dst, enum TwinVQFrameType ftype)
 {
     int i;
@@ -244,8 +243,8 @@ static void read_cb_data(TwinVQContext *tctx, BitstreamContext *bc,
     for (i = 0; i < tctx->n_div[ftype]; i++) {
         int bs_second_part = (i >= tctx->bits_main_spec_change[ftype]);
 
-        *dst++ = bitstream_read(bc, tctx->bits_main_spec[0][ftype][bs_second_part]);
-        *dst++ = bitstream_read(bc, tctx->bits_main_spec[1][ftype][bs_second_part]);
+        *dst++ = get_bits(gb, tctx->bits_main_spec[0][ftype][bs_second_part]);
+        *dst++ = get_bits(gb, tctx->bits_main_spec[1][ftype][bs_second_part]);
     }
 }
 
@@ -256,13 +255,14 @@ static int twinvq_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
     const TwinVQModeTab *mtab = tctx->mtab;
     int channels              = tctx->avctx->channels;
     int sub;
-    BitstreamContext bc;
-    int i, j, k;
+    GetBitContext gb;
+    int i, j, k, ret;
 
-    bitstream_init8(&bc, buf, buf_size);
-    bitstream_skip(&bc, bitstream_read(&bc, 8));
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
+    skip_bits(&gb, get_bits(&gb, 8));
 
-    bits->window_type = bitstream_read(&bc, TWINVQ_WINDOW_TYPE_BITS);
+    bits->window_type = get_bits(&gb, TWINVQ_WINDOW_TYPE_BITS);
 
     if (bits->window_type > 8) {
         av_log(avctx, AV_LOG_ERROR, "Invalid window type, broken sample?\n");
@@ -273,46 +273,47 @@ static int twinvq_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
 
     sub = mtab->fmode[bits->ftype].sub;
 
-    read_cb_data(tctx, &bc, bits->main_coeffs, bits->ftype);
+    read_cb_data(tctx, &gb, bits->main_coeffs, bits->ftype);
 
     for (i = 0; i < channels; i++)
         for (j = 0; j < sub; j++)
             for (k = 0; k < mtab->fmode[bits->ftype].bark_n_coef; k++)
                 bits->bark1[i][j][k] =
-                    bitstream_read(&bc, mtab->fmode[bits->ftype].bark_n_bit);
+                    get_bits(&gb, mtab->fmode[bits->ftype].bark_n_bit);
 
     for (i = 0; i < channels; i++)
         for (j = 0; j < sub; j++)
-            bits->bark_use_hist[i][j] = bitstream_read_bit(&bc);
+            bits->bark_use_hist[i][j] = get_bits1(&gb);
 
     if (bits->ftype == TWINVQ_FT_LONG) {
         for (i = 0; i < channels; i++)
-            bits->gain_bits[i] = bitstream_read(&bc, TWINVQ_GAIN_BITS);
+            bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS);
     } else {
         for (i = 0; i < channels; i++) {
-            bits->gain_bits[i] = bitstream_read(&bc, TWINVQ_GAIN_BITS);
+            bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS);
             for (j = 0; j < sub; j++)
-                bits->sub_gain_bits[i * sub + j] = bitstream_read(&bc, TWINVQ_SUB_GAIN_BITS);
+                bits->sub_gain_bits[i * sub + j] = get_bits(&gb,
+                                                       TWINVQ_SUB_GAIN_BITS);
         }
     }
 
     for (i = 0; i < channels; i++) {
-        bits->lpc_hist_idx[i] = bitstream_read(&bc, mtab->lsp_bit0);
-        bits->lpc_idx1[i]     = bitstream_read(&bc, mtab->lsp_bit1);
+        bits->lpc_hist_idx[i] = get_bits(&gb, mtab->lsp_bit0);
+        bits->lpc_idx1[i]     = get_bits(&gb, mtab->lsp_bit1);
 
         for (j = 0; j < mtab->lsp_split; j++)
-            bits->lpc_idx2[i][j] = bitstream_read(&bc, mtab->lsp_bit2);
+            bits->lpc_idx2[i][j] = get_bits(&gb, mtab->lsp_bit2);
     }
 
     if (bits->ftype == TWINVQ_FT_LONG) {
-        read_cb_data(tctx, &bc, bits->ppc_coeffs, 3);
+        read_cb_data(tctx, &gb, bits->ppc_coeffs, 3);
         for (i = 0; i < channels; i++) {
-            bits->p_coef[i] = bitstream_read(&bc, mtab->ppc_period_bit);
-            bits->g_coef[i] = bitstream_read(&bc, mtab->pgain_bit);
+            bits->p_coef[i] = get_bits(&gb, mtab->ppc_period_bit);
+            bits->g_coef[i] = get_bits(&gb, mtab->pgain_bit);
         }
     }
 
-    return 0;
+    return (get_bits_count(&gb) + 7) / 8;
 }
 
 static av_cold int twinvq_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/txd.c b/libavcodec/txd.c
index db1d954..8b20475 100644
--- a/libavcodec/txd.c
+++ b/libavcodec/txd.c
@@ -4,27 +4,27 @@
  *
  * See also: http://wiki.multimedia.cx/index.php?title=TXD
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
-#include "avcodec.h"
 #include "bytestream.h"
+#include "avcodec.h"
 #include "internal.h"
 #include "texturedsp.h"
 
@@ -75,10 +75,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     avctx->coded_width  = FFALIGN(w, 4);
     avctx->coded_height = FFALIGN(h, 4);
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -91,6 +89,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             v = bytestream2_get_be32(&gb);
             pal[y] = (v >> 8) + (v << 24);
         }
+        if (bytestream2_get_bytes_left(&gb) < w * h)
+            return AVERROR_INVALIDDATA;
         bytestream2_skip(&gb, 4);
         for (y=0; y<h; y++) {
             bytestream2_get_buffer(&gb, ptr, w);
@@ -103,6 +103,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             if (!(flags & 1))
                 goto unsupported;
         case TXD_DXT1:
+            if (bytestream2_get_bytes_left(&gb) < AV_CEIL_RSHIFT(w, 2) * AV_CEIL_RSHIFT(h, 2) * 8)
+                return AVERROR_INVALIDDATA;
             for (j = 0; j < avctx->height; j += 4) {
                 for (i = 0; i < avctx->width; i += 4) {
                     uint8_t *p = ptr + i * 4 + j * stride;
@@ -112,6 +114,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             }
             break;
         case TXD_DXT3:
+            if (bytestream2_get_bytes_left(&gb) < AV_CEIL_RSHIFT(w, 2) * AV_CEIL_RSHIFT(h, 2) * 16)
+                return AVERROR_INVALIDDATA;
             for (j = 0; j < avctx->height; j += 4) {
                 for (i = 0; i < avctx->width; i += 4) {
                     uint8_t *p = ptr + i * 4 + j * stride;
@@ -127,6 +131,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         switch (d3d_format) {
         case 0x15:
         case 0x16:
+            if (bytestream2_get_bytes_left(&gb) < h * w * 4)
+                return AVERROR_INVALIDDATA;
             for (y=0; y<h; y++) {
                 bytestream2_get_buffer(&gb, ptr, w * 4);
                 ptr += stride;
diff --git a/libavcodec/ulti.c b/libavcodec/ulti.c
index 46aa27d..9318af0 100644
--- a/libavcodec/ulti.c
+++ b/libavcodec/ulti.c
@@ -2,20 +2,20 @@
  * IBM Ultimotion Video Decoder
  * Copyright (C) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,6 +50,8 @@ static av_cold int ulti_decode_init(AVCodecContext *avctx)
     s->width = avctx->width;
     s->height = avctx->height;
     s->blocks = (s->width / 8) * (s->height / 8);
+    if (s->blocks == 0)
+        return AVERROR_INVALIDDATA;
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
     s->ulti_codebook = ulti_codebook;
 
@@ -60,7 +62,8 @@ static av_cold int ulti_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static av_cold int ulti_decode_end(AVCodecContext *avctx){
+static av_cold int ulti_decode_end(AVCodecContext *avctx)
+{
     UltimotionDecodeContext *s = avctx->priv_data;
 
     av_frame_free(&s->frame);
@@ -227,10 +230,8 @@ static int ulti_decode_frame(AVCodecContext *avctx,
     int skip;
     int tmp;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     bytestream2_init(&s->gb, buf, buf_size);
 
diff --git a/libavcodec/ulti_cb.h b/libavcodec/ulti_cb.h
index 0bd83ff..7061d83 100644
--- a/libavcodec/ulti_cb.h
+++ b/libavcodec/ulti_cb.h
@@ -2,20 +2,20 @@
  * IBM Ultimotion Video Decoder
  * copyright (C) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/unary.h b/libavcodec/unary.h
index 2992017..d57f9f7 100644
--- a/libavcodec/unary.h
+++ b/libavcodec/unary.h
@@ -1,57 +1,69 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_UNARY_H
 #define AVCODEC_UNARY_H
 
-#include "bitstream.h"
+#include "get_bits.h"
 
 /**
  * Get unary code of limited length
- * @param bc BitstreamContext
+ * @param gb GetBitContext
  * @param[in] stop The bitstop value (unary code of 1's or 0's)
  * @param[in] len Maximum length
- * @return Unary length/index
+ * @return unary 0 based code index. This is also the length in bits of the
+ * code excluding the stop bit.
+ * (in case len=1)
+ * 1            0
+ * 0            1
+ * (in case len=2)
+ * 1            0
+ * 01           1
+ * 00           2
+ * (in case len=3)
+ * 1            0
+ * 01           1
+ * 001          2
+ * 000          3
  */
-static inline int get_unary(BitstreamContext *bc, int stop, int len)
+static inline int get_unary(GetBitContext *gb, int stop, int len)
 {
     int i;
 
-    for (i = 0; i < len && bitstream_read_bit(bc) != stop; i++)
-        ;
+    for(i = 0; i < len && get_bits1(gb) != stop; i++);
     return i;
 }
 
 /**
  * Get unary code terminated by a 0 with a maximum length of 33
- * @param bc BitstreamContext
+ * @param gb GetBitContext
  * @return Unary length/index
  */
-static inline int get_unary_0_33(BitstreamContext *bc)
+static inline int get_unary_0_33(GetBitContext *gb)
 {
-    return get_unary(bc, 0, 33);
+    return get_unary(gb, 0, 33);
 }
 
-static inline int get_unary_0_9(BitstreamContext *bc)
+static inline int get_unary_0_9(GetBitContext *gb)
 {
-    return get_unary(bc, 0, 9);
+    return get_unary(gb, 0, 9);
 }
 
 #endif /* AVCODEC_UNARY_H */
diff --git a/libavcodec/unary_legacy.h b/libavcodec/unary_legacy.h
deleted file mode 100644
index d14929f..0000000
--- a/libavcodec/unary_legacy.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_UNARY_H
-#define AVCODEC_UNARY_H
-
-#include "get_bits.h"
-
-/**
- * Get unary code of limited length
- * @param gb GetBitContext
- * @param[in] stop The bitstop value (unary code of 1's or 0's)
- * @param[in] len Maximum length
- * @return Unary length/index
- */
-static inline int get_unary(GetBitContext *gb, int stop, int len)
-{
-    int i;
-
-    for(i = 0; i < len && get_bits1(gb) != stop; i++);
-    return i;
-}
-
-/**
- * Get unary code terminated by a 0 with a maximum length of 33
- * @param gb GetBitContext
- * @return Unary length/index
- */
-static inline int get_unary_0_33(GetBitContext *gb)
-{
-    return get_unary(gb, 0, 33);
-}
-
-static inline int get_unary_0_9(GetBitContext *gb)
-{
-    return get_unary(gb, 0, 9);
-}
-
-#endif /* AVCODEC_UNARY_H */
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 701ef50..cc04b73 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,70 +29,66 @@
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/crc.h"
 #include "libavutil/frame.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/internal.h"
 #include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/samplefmt.h"
 #include "libavutil/dict.h"
+#include "libavutil/thread.h"
 #include "avcodec.h"
 #include "decode.h"
 #include "hwaccel.h"
 #include "libavutil/opt.h"
-#include "me_cmp.h"
 #include "mpegvideo.h"
 #include "thread.h"
+#include "frame_thread_encoder.h"
 #include "internal.h"
+#include "raw.h"
 #include "bytestream.h"
 #include "version.h"
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdatomic.h>
 #include <limits.h>
 #include <float.h>
+#if CONFIG_ICONV
+# include <iconv.h>
+#endif
+
+#include "libavutil/ffversion.h"
+const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
 
-static int volatile entangled_thread_counter = 0;
-static int (*lockmgr_cb)(void **mutex, enum AVLockOp op);
-static void *codec_mutex;
-static void *avformat_mutex;
+static AVMutex codec_mutex = AV_MUTEX_INITIALIZER;
 
 void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size)
 {
-    void **p = ptr;
+    uint8_t **p = ptr;
     if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
         av_freep(p);
         *size = 0;
         return;
     }
-    av_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (*size)
-        memset((uint8_t *)*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-}
-
-/* encoder management */
-static AVCodec *first_avcodec = NULL;
-
-AVCodec *av_codec_next(const AVCodec *c)
-{
-    if (c)
-        return c->next;
-    else
-        return first_avcodec;
+    if (!ff_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE, 1))
+        memset(*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 }
 
-static av_cold void avcodec_init(void)
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size)
 {
-    static int initialized = 0;
-
-    if (initialized != 0)
+    uint8_t **p = ptr;
+    if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_freep(p);
+        *size = 0;
         return;
-    initialized = 1;
-
-    if (CONFIG_ME_CMP)
-        ff_me_cmp_init_static();
+    }
+    if (!ff_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE, 1))
+        memset(*p, 0, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
 }
 
 int av_codec_is_encoder(const AVCodec *codec)
@@ -105,28 +101,17 @@ int av_codec_is_decoder(const AVCodec *codec)
     return codec && (codec->decode || codec->receive_frame);
 }
 
-av_cold void avcodec_register(AVCodec *codec)
-{
-    AVCodec **p;
-    avcodec_init();
-    p = &first_avcodec;
-    while (*p)
-        p = &(*p)->next;
-    *p          = codec;
-    codec->next = NULL;
-
-    if (codec->init_static_data)
-        codec->init_static_data(codec);
-}
-
 int ff_set_dimensions(AVCodecContext *s, int width, int height)
 {
-    int ret = av_image_check_size(width, height, 0, s);
+    int ret = av_image_check_size2(width, height, s->max_pixels, AV_PIX_FMT_NONE, 0, s);
 
     if (ret < 0)
         width = height = 0;
-    s->width  = s->coded_width  = width;
-    s->height = s->coded_height = height;
+
+    s->coded_width  = width;
+    s->coded_height = height;
+    s->width        = AV_CEIL_RSHIFT(width,  s->lowres);
+    s->height       = AV_CEIL_RSHIFT(height, s->lowres);
 
     return ret;
 }
@@ -169,10 +154,15 @@ int ff_side_data_update_matrix_encoding(AVFrame *frame,
 void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
                                int linesize_align[AV_NUM_DATA_POINTERS])
 {
-    size_t max_align = av_cpu_max_align();
     int i;
     int w_align = 1;
     int h_align = 1;
+    AVPixFmtDescriptor const *desc = av_pix_fmt_desc_get(s->pix_fmt);
+
+    if (desc) {
+        w_align = 1 << desc->log2_chroma_w;
+        h_align = 1 << desc->log2_chroma_h;
+    }
 
     switch (s->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
@@ -198,49 +188,110 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
     case AV_PIX_FMT_YUV420P9BE:
     case AV_PIX_FMT_YUV420P10LE:
     case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV420P12LE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV420P14LE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV420P16LE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUVA420P9LE:
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA420P10LE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA420P16LE:
+    case AV_PIX_FMT_YUVA420P16BE:
     case AV_PIX_FMT_YUV422P9LE:
     case AV_PIX_FMT_YUV422P9BE:
     case AV_PIX_FMT_YUV422P10LE:
     case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV422P12LE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV422P14LE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV422P16LE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUVA422P9LE:
+    case AV_PIX_FMT_YUVA422P9BE:
     case AV_PIX_FMT_YUVA422P10LE:
     case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA422P12LE:
+    case AV_PIX_FMT_YUVA422P12BE:
+    case AV_PIX_FMT_YUVA422P16LE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUV440P10LE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV440P12LE:
+    case AV_PIX_FMT_YUV440P12BE:
     case AV_PIX_FMT_YUV444P9LE:
     case AV_PIX_FMT_YUV444P9BE:
     case AV_PIX_FMT_YUV444P10LE:
     case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV444P12LE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV444P14LE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV444P16LE:
+    case AV_PIX_FMT_YUV444P16BE:
+    case AV_PIX_FMT_YUVA444P9LE:
+    case AV_PIX_FMT_YUVA444P9BE:
     case AV_PIX_FMT_YUVA444P10LE:
     case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA444P12LE:
+    case AV_PIX_FMT_YUVA444P12BE:
+    case AV_PIX_FMT_YUVA444P16LE:
+    case AV_PIX_FMT_YUVA444P16BE:
     case AV_PIX_FMT_GBRP9LE:
     case AV_PIX_FMT_GBRP9BE:
     case AV_PIX_FMT_GBRP10LE:
     case AV_PIX_FMT_GBRP10BE:
+    case AV_PIX_FMT_GBRP12LE:
+    case AV_PIX_FMT_GBRP12BE:
+    case AV_PIX_FMT_GBRP14LE:
+    case AV_PIX_FMT_GBRP14BE:
+    case AV_PIX_FMT_GBRP16LE:
+    case AV_PIX_FMT_GBRP16BE:
     case AV_PIX_FMT_GBRAP12LE:
     case AV_PIX_FMT_GBRAP12BE:
+    case AV_PIX_FMT_GBRAP16LE:
+    case AV_PIX_FMT_GBRAP16BE:
         w_align = 16; //FIXME assume 16 pixel per macroblock
         h_align = 16 * 2; // interlaced needs 2 macroblocks height
         break;
     case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUVJ411P:
     case AV_PIX_FMT_UYYVYY411:
         w_align = 32;
-        h_align = 8;
+        h_align = 16 * 2;
         break;
     case AV_PIX_FMT_YUV410P:
         if (s->codec_id == AV_CODEC_ID_SVQ1) {
             w_align = 64;
             h_align = 64;
         }
+        break;
     case AV_PIX_FMT_RGB555:
         if (s->codec_id == AV_CODEC_ID_RPZA) {
             w_align = 4;
             h_align = 4;
         }
+        if (s->codec_id == AV_CODEC_ID_INTERPLAY_VIDEO) {
+            w_align = 8;
+            h_align = 8;
+        }
+        break;
     case AV_PIX_FMT_PAL8:
     case AV_PIX_FMT_BGR8:
     case AV_PIX_FMT_RGB8:
-        if (s->codec_id == AV_CODEC_ID_SMC) {
+        if (s->codec_id == AV_CODEC_ID_SMC ||
+            s->codec_id == AV_CODEC_ID_CINEPAK) {
             w_align = 4;
             h_align = 4;
         }
+        if (s->codec_id == AV_CODEC_ID_JV ||
+            s->codec_id == AV_CODEC_ID_INTERPLAY_VIDEO) {
+            w_align = 8;
+            h_align = 8;
+        }
         break;
     case AV_PIX_FMT_BGR24:
         if ((s->codec_id == AV_CODEC_ID_MSZH) ||
@@ -249,20 +300,39 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
             h_align = 4;
         }
         break;
+    case AV_PIX_FMT_RGB24:
+        if (s->codec_id == AV_CODEC_ID_CINEPAK) {
+            w_align = 4;
+            h_align = 4;
+        }
+        break;
     default:
-        w_align = 1;
-        h_align = 1;
         break;
     }
 
+    if (s->codec_id == AV_CODEC_ID_IFF_ILBM) {
+        w_align = FFMAX(w_align, 8);
+    }
+
     *width  = FFALIGN(*width, w_align);
     *height = FFALIGN(*height, h_align);
-    if (s->codec_id == AV_CODEC_ID_H264)
+    if (s->codec_id == AV_CODEC_ID_H264 || s->lowres ||
+        s->codec_id == AV_CODEC_ID_VP5  || s->codec_id == AV_CODEC_ID_VP6 ||
+        s->codec_id == AV_CODEC_ID_VP6F || s->codec_id == AV_CODEC_ID_VP6A
+    ) {
         // some of the optimized chroma MC reads one line too much
+        // which is also done in mpeg decoders with lowres > 0
         *height += 2;
 
+        // H.264 uses edge emulation for out of frame motion vectors, for this
+        // it requires a temporary area large enough to hold a 21x21 block,
+        // increasing witdth ensure that the temporary area is large enough,
+        // the next rounded up width is 32
+        *width = FFMAX(*width, 32);
+    }
+
     for (i = 0; i < 4; i++)
-        linesize_align[i] = max_align;
+        linesize_align[i] = STRIDE_ALIGN;
 }
 
 void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height)
@@ -280,6 +350,29 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height)
     *width              = FFALIGN(*width, align);
 }
 
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos)
+{
+    if (pos <= AVCHROMA_LOC_UNSPECIFIED || pos >= AVCHROMA_LOC_NB)
+        return AVERROR(EINVAL);
+    pos--;
+
+    *xpos = (pos&1) * 128;
+    *ypos = ((pos>>1)^(pos<4)) * 128;
+
+    return 0;
+}
+
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos)
+{
+    int pos, xout, yout;
+
+    for (pos = AVCHROMA_LOC_UNSPECIFIED + 1; pos < AVCHROMA_LOC_NB; pos++) {
+        if (avcodec_enum_to_chroma_pos(&xout, &yout, pos) == 0 && xout == xpos && yout == ypos)
+            return pos;
+    }
+    return AVCHROMA_LOC_UNSPECIFIED;
+}
+
 int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
                              enum AVSampleFormat sample_fmt, const uint8_t *buf,
                              int buf_size, int align)
@@ -294,7 +387,7 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
 
     planar = av_sample_fmt_is_planar(sample_fmt);
     if (planar && nb_channels > AV_NUM_DATA_POINTERS) {
-        if (!(frame->extended_data = av_mallocz(nb_channels *
+        if (!(frame->extended_data = av_mallocz_array(nb_channels,
                                                 sizeof(*frame->extended_data))))
             return AVERROR(ENOMEM);
     } else {
@@ -302,10 +395,10 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
     }
 
     if ((ret = av_samples_fill_arrays(frame->extended_data, &frame->linesize[0],
-                                      buf, nb_channels, frame->nb_samples,
+                                      (uint8_t *)(intptr_t)buf, nb_channels, frame->nb_samples,
                                       sample_fmt, align)) < 0) {
         if (frame->extended_data != frame->data)
-            av_free(frame->extended_data);
+            av_freep(&frame->extended_data);
         return ret;
     }
     if (frame->extended_data != frame->data) {
@@ -316,6 +409,29 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
     return ret;
 }
 
+void ff_color_frame(AVFrame *frame, const int c[4])
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+    int p, y, x;
+
+    av_assert0(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
+
+    for (p = 0; p<desc->nb_components; p++) {
+        uint8_t *dst = frame->data[p];
+        int is_chroma = p == 1 || p == 2;
+        int bytes  = is_chroma ? AV_CEIL_RSHIFT(frame->width,  desc->log2_chroma_w) : frame->width;
+        int height = is_chroma ? AV_CEIL_RSHIFT(frame->height, desc->log2_chroma_h) : frame->height;
+        for (y = 0; y < height; y++) {
+            if (desc->comp[0].depth >= 9) {
+                for (x = 0; x<bytes; x++)
+                    ((uint16_t*)dst)[x] = c[p];
+            }else
+                memset(dst, c[p], bytes);
+            dst += frame->linesize[p];
+        }
+    }
+}
+
 int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2), void *arg, int *ret, int count, int size)
 {
     int i;
@@ -325,6 +441,7 @@ int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, v
         if (ret)
             ret[i] = r;
     }
+    emms_c();
     return 0;
 }
 
@@ -337,24 +454,107 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
         if (ret)
             ret[i] = r;
     }
+    emms_c();
     return 0;
 }
 
+enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags,
+                                       unsigned int fourcc)
+{
+    while (tags->pix_fmt >= 0) {
+        if (tags->fourcc == fourcc)
+            return tags->pix_fmt;
+        tags++;
+    }
+    return AV_PIX_FMT_NONE;
+}
+
+#if FF_API_CODEC_GET_SET
+MAKE_ACCESSORS(AVCodecContext, codec, AVRational, pkt_timebase)
+MAKE_ACCESSORS(AVCodecContext, codec, const AVCodecDescriptor *, codec_descriptor)
+MAKE_ACCESSORS(AVCodecContext, codec, int, lowres)
+MAKE_ACCESSORS(AVCodecContext, codec, int, seek_preroll)
+MAKE_ACCESSORS(AVCodecContext, codec, uint16_t*, chroma_intra_matrix)
+
+unsigned av_codec_get_codec_properties(const AVCodecContext *codec)
+{
+    return codec->properties;
+}
+
+int av_codec_get_max_lowres(const AVCodec *codec)
+{
+    return codec->max_lowres;
+}
+#endif
+
+int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec){
+    return !!(codec->caps_internal & FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM);
+}
+
+static int64_t get_bit_rate(AVCodecContext *ctx)
+{
+    int64_t bit_rate;
+    int bits_per_sample;
+
+    switch (ctx->codec_type) {
+    case AVMEDIA_TYPE_VIDEO:
+    case AVMEDIA_TYPE_DATA:
+    case AVMEDIA_TYPE_SUBTITLE:
+    case AVMEDIA_TYPE_ATTACHMENT:
+        bit_rate = ctx->bit_rate;
+        break;
+    case AVMEDIA_TYPE_AUDIO:
+        bits_per_sample = av_get_bits_per_sample(ctx->codec_id);
+        bit_rate = bits_per_sample ? ctx->sample_rate * (int64_t)ctx->channels * bits_per_sample : ctx->bit_rate;
+        break;
+    default:
+        bit_rate = 0;
+        break;
+    }
+    return bit_rate;
+}
+
+
+static void ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec)
+{
+    if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init)
+        ff_mutex_lock(&codec_mutex);
+}
+
+static void ff_unlock_avcodec(const AVCodec *codec)
+{
+    if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init)
+        ff_mutex_unlock(&codec_mutex);
+}
+
+int attribute_align_arg ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
+{
+    int ret = 0;
+
+    ff_unlock_avcodec(codec);
+
+    ret = avcodec_open2(avctx, codec, options);
+
+    ff_lock_avcodec(avctx, codec);
+    return ret;
+}
+
 int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
 {
     int ret = 0;
     AVDictionary *tmp = NULL;
+    const AVPixFmtDescriptor *pixdesc;
 
     if (avcodec_is_open(avctx))
         return 0;
 
     if ((!codec && !avctx->codec)) {
-        av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2().\n");
+        av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2()\n");
         return AVERROR(EINVAL);
     }
     if ((codec && avctx->codec && codec != avctx->codec)) {
         av_log(avctx, AV_LOG_ERROR, "This AVCodecContext was allocated for %s, "
-                                    "but %s passed to avcodec_open2().\n", avctx->codec->name, codec->name);
+                                    "but %s passed to avcodec_open2()\n", avctx->codec->name, codec->name);
         return AVERROR(EINVAL);
     }
     if (!codec)
@@ -366,25 +566,9 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if (options)
         av_dict_copy(&tmp, *options, 0);
 
-    /* If there is a user-supplied mutex locking routine, call it. */
-    if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init) {
-        if (lockmgr_cb) {
-            if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_OBTAIN))
-                return -1;
-        }
-
-        entangled_thread_counter++;
-        if (entangled_thread_counter != 1) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Insufficient thread locking. At least %d threads are "
-                   "calling avcodec_open2() at the same time right now.\n",
-                   entangled_thread_counter);
-            ret = -1;
-            goto end;
-        }
-    }
+    ff_lock_avcodec(avctx, codec);
 
-    avctx->internal = av_mallocz(sizeof(AVCodecInternal));
+    avctx->internal = av_mallocz(sizeof(*avctx->internal));
     if (!avctx->internal) {
         ret = AVERROR(ENOMEM);
         goto end;
@@ -432,6 +616,8 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
         goto free_and_end;
     }
 
+    avctx->internal->skip_samples_multiplier = 1;
+
     if (codec->priv_data_size > 0) {
         if (!avctx->priv_data) {
             avctx->priv_data = av_mallocz(codec->priv_data_size);
@@ -452,17 +638,27 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if ((ret = av_opt_set_dict(avctx, &tmp)) < 0)
         goto free_and_end;
 
-    if (avctx->coded_width && avctx->coded_height && !avctx->width && !avctx->height)
+    if (avctx->codec_whitelist && av_match_list(codec->name, avctx->codec_whitelist, ',') <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Codec (%s) not on whitelist \'%s\'\n", codec->name, avctx->codec_whitelist);
+        ret = AVERROR(EINVAL);
+        goto free_and_end;
+    }
+
+    // only call ff_set_dimensions() for non H.264/VP6F/DXV codecs so as not to overwrite previously setup dimensions
+    if (!(avctx->coded_width && avctx->coded_height && avctx->width && avctx->height &&
+          (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_VP6F || avctx->codec_id == AV_CODEC_ID_DXV))) {
+    if (avctx->coded_width && avctx->coded_height)
         ret = ff_set_dimensions(avctx, avctx->coded_width, avctx->coded_height);
     else if (avctx->width && avctx->height)
         ret = ff_set_dimensions(avctx, avctx->width, avctx->height);
     if (ret < 0)
         goto free_and_end;
+    }
 
     if ((avctx->coded_width || avctx->coded_height || avctx->width || avctx->height)
-        && (  av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx) < 0
-           || av_image_check_size(avctx->width,       avctx->height,       0, avctx) < 0)) {
-        av_log(avctx, AV_LOG_WARNING, "ignoring invalid width/height values\n");
+        && (  av_image_check_size2(avctx->coded_width, avctx->coded_height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx) < 0
+           || av_image_check_size2(avctx->width,       avctx->height,       avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx) < 0)) {
+        av_log(avctx, AV_LOG_WARNING, "Ignoring invalid width/height values\n");
         ff_set_dimensions(avctx, 0, 0);
     }
 
@@ -482,6 +678,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
         av_freep(&avctx->subtitle_header);
 
     if (avctx->channels > FF_SANE_NB_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Too many channels: %d\n", avctx->channels);
         ret = AVERROR(EINVAL);
         goto free_and_end;
     }
@@ -494,14 +691,25 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     }
     if (avctx->codec_id != codec->id || (avctx->codec_type != codec->type
                                          && avctx->codec_type != AVMEDIA_TYPE_ATTACHMENT)) {
-        av_log(avctx, AV_LOG_ERROR, "codec type or id mismatches\n");
+        av_log(avctx, AV_LOG_ERROR, "Codec type or id mismatches\n");
         ret = AVERROR(EINVAL);
         goto free_and_end;
     }
     avctx->frame_number = 0;
+    avctx->codec_descriptor = avcodec_descriptor_get(avctx->codec_id);
 
     if ((avctx->codec->capabilities & AV_CODEC_CAP_EXPERIMENTAL) &&
         avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        const char *codec_string = av_codec_is_encoder(codec) ? "encoder" : "decoder";
+        AVCodec *codec2;
+        av_log(avctx, AV_LOG_ERROR,
+               "The %s '%s' is experimental but experimental codecs are not enabled, "
+               "add '-strict %d' if you want to use it.\n",
+               codec_string, codec->name, FF_COMPLIANCE_EXPERIMENTAL);
+        codec2 = av_codec_is_encoder(codec) ? avcodec_find_encoder(codec->id) : avcodec_find_decoder(codec->id);
+        if (!(codec2->capabilities & AV_CODEC_CAP_EXPERIMENTAL))
+            av_log(avctx, AV_LOG_ERROR, "Alternatively use the non experimental %s '%s'.\n",
+                codec_string, codec2->name);
         ret = AVERROR_EXPERIMENTAL;
         goto free_and_end;
     }
@@ -512,13 +720,25 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
         avctx->time_base.den = avctx->sample_rate;
     }
 
+    if (!HAVE_THREADS)
+        av_log(avctx, AV_LOG_WARNING, "Warning: not compiled with thread support, using thread emulation\n");
+
+    if (CONFIG_FRAME_THREAD_ENCODER && av_codec_is_encoder(avctx->codec)) {
+        ff_unlock_avcodec(codec); //we will instantiate a few encoders thus kick the counter to prevent false detection of a problem
+        ret = ff_frame_thread_encoder_init(avctx, options ? *options : NULL);
+        ff_lock_avcodec(avctx, codec);
+        if (ret < 0)
+            goto free_and_end;
+    }
+
     if (av_codec_is_decoder(avctx->codec)) {
         ret = ff_decode_bsfs_init(avctx);
         if (ret < 0)
             goto free_and_end;
     }
 
-    if (HAVE_THREADS) {
+    if (HAVE_THREADS
+        && !(avctx->internal->frame_thread_encoder && (avctx->active_thread_type&FF_THREAD_FRAME))) {
         ret = ff_thread_init(avctx);
         if (ret < 0) {
             goto free_and_end;
@@ -527,6 +747,12 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if (!HAVE_THREADS && !(codec->capabilities & AV_CODEC_CAP_AUTO_THREADS))
         avctx->thread_count = 1;
 
+    if (avctx->codec->max_lowres < avctx->lowres || avctx->lowres < 0) {
+        av_log(avctx, AV_LOG_WARNING, "The maximum value for lowres supported by the decoder is %d\n",
+               avctx->codec->max_lowres);
+        avctx->lowres = avctx->codec->max_lowres;
+    }
+
     if (av_codec_is_encoder(avctx->codec)) {
         int i;
 #if FF_API_CODED_FRAME
@@ -557,7 +783,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
             }
             if (avctx->codec->sample_fmts[i] == AV_SAMPLE_FMT_NONE) {
-                av_log(avctx, AV_LOG_ERROR, "Specified sample_fmt is not supported.\n");
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%d", avctx->sample_fmt);
+                av_log(avctx, AV_LOG_ERROR, "Specified sample format %s is invalid or not supported\n",
+                       (char *)av_x_if_null(av_get_sample_fmt_name(avctx->sample_fmt), buf));
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
@@ -566,12 +795,18 @@ FF_ENABLE_DEPRECATION_WARNINGS
             for (i = 0; avctx->codec->pix_fmts[i] != AV_PIX_FMT_NONE; i++)
                 if (avctx->pix_fmt == avctx->codec->pix_fmts[i])
                     break;
-            if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE) {
-                av_log(avctx, AV_LOG_ERROR, "Specified pix_fmt is not supported\n");
+            if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE
+                && !((avctx->codec_id == AV_CODEC_ID_MJPEG || avctx->codec_id == AV_CODEC_ID_LJPEG)
+                     && avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL)) {
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%d", avctx->pix_fmt);
+                av_log(avctx, AV_LOG_ERROR, "Specified pixel format %s is invalid or not supported\n",
+                       (char *)av_x_if_null(av_get_pix_fmt_name(avctx->pix_fmt), buf));
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
             if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ420P ||
+                avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ411P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ422P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ440P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ444P)
@@ -582,39 +817,77 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 if (avctx->sample_rate == avctx->codec->supported_samplerates[i])
                     break;
             if (avctx->codec->supported_samplerates[i] == 0) {
-                av_log(avctx, AV_LOG_ERROR, "Specified sample_rate is not supported\n");
+                av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n",
+                       avctx->sample_rate);
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
         }
+        if (avctx->sample_rate < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n",
+                    avctx->sample_rate);
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
         if (avctx->codec->channel_layouts) {
             if (!avctx->channel_layout) {
-                av_log(avctx, AV_LOG_WARNING, "channel_layout not specified\n");
+                av_log(avctx, AV_LOG_WARNING, "Channel layout not specified\n");
             } else {
                 for (i = 0; avctx->codec->channel_layouts[i] != 0; i++)
                     if (avctx->channel_layout == avctx->codec->channel_layouts[i])
                         break;
                 if (avctx->codec->channel_layouts[i] == 0) {
-                    av_log(avctx, AV_LOG_ERROR, "Specified channel_layout is not supported\n");
+                    char buf[512];
+                    av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+                    av_log(avctx, AV_LOG_ERROR, "Specified channel layout '%s' is not supported\n", buf);
                     ret = AVERROR(EINVAL);
                     goto free_and_end;
                 }
             }
         }
         if (avctx->channel_layout && avctx->channels) {
-            if (av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels) {
-                av_log(avctx, AV_LOG_ERROR, "channel layout does not match number of channels\n");
+            int channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
+            if (channels != avctx->channels) {
+                char buf[512];
+                av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+                av_log(avctx, AV_LOG_ERROR,
+                       "Channel layout '%s' with %d channels does not match number of specified channels %d\n",
+                       buf, channels, avctx->channels);
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
         } else if (avctx->channel_layout) {
             avctx->channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
         }
+        if (avctx->channels < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Specified number of channels %d is not supported\n",
+                    avctx->channels);
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
+        if(avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+            pixdesc = av_pix_fmt_desc_get(avctx->pix_fmt);
+            if (    avctx->bits_per_raw_sample < 0
+                || (avctx->bits_per_raw_sample > 8 && pixdesc->comp[0].depth <= 8)) {
+                av_log(avctx, AV_LOG_WARNING, "Specified bit depth %d not possible with the specified pixel formats depth %d\n",
+                    avctx->bits_per_raw_sample, pixdesc->comp[0].depth);
+                avctx->bits_per_raw_sample = pixdesc->comp[0].depth;
+            }
+            if (avctx->width <= 0 || avctx->height <= 0) {
+                av_log(avctx, AV_LOG_ERROR, "dimensions not set\n");
+                ret = AVERROR(EINVAL);
+                goto free_and_end;
+            }
+        }
+        if (   (avctx->codec_type == AVMEDIA_TYPE_VIDEO || avctx->codec_type == AVMEDIA_TYPE_AUDIO)
+            && avctx->bit_rate>0 && avctx->bit_rate<1000) {
+            av_log(avctx, AV_LOG_WARNING, "Bitrate %"PRId64" is extremely low, maybe you mean %"PRId64"k\n", avctx->bit_rate, avctx->bit_rate);
+        }
 
         if (!avctx->rc_initial_buffer_occupancy)
-            avctx->rc_initial_buffer_occupancy = avctx->rc_buffer_size * 3 / 4;
+            avctx->rc_initial_buffer_occupancy = avctx->rc_buffer_size * 3LL / 4;
 
-        if (avctx->ticks_per_frame &&
+        if (avctx->ticks_per_frame && avctx->time_base.num &&
             avctx->ticks_per_frame > INT_MAX / avctx->time_base.num) {
             av_log(avctx, AV_LOG_ERROR,
                    "ticks_per_frame %d too large for the timebase %d/%d.",
@@ -646,22 +919,41 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 
-    if (avctx->codec->init && !(avctx->active_thread_type & FF_THREAD_FRAME)) {
+    avctx->pts_correction_num_faulty_pts =
+    avctx->pts_correction_num_faulty_dts = 0;
+    avctx->pts_correction_last_pts =
+    avctx->pts_correction_last_dts = INT64_MIN;
+
+    if (   !CONFIG_GRAY && avctx->flags & AV_CODEC_FLAG_GRAY
+        && avctx->codec_descriptor->type == AVMEDIA_TYPE_VIDEO)
+        av_log(avctx, AV_LOG_WARNING,
+               "gray decoding requested but not enabled at configuration time\n");
+
+    if (   avctx->codec->init && (!(avctx->active_thread_type&FF_THREAD_FRAME)
+        || avctx->internal->frame_thread_encoder)) {
         ret = avctx->codec->init(avctx);
         if (ret < 0) {
             goto free_and_end;
         }
     }
 
+    ret=0;
+
     if (av_codec_is_decoder(avctx->codec)) {
+        if (!avctx->bit_rate)
+            avctx->bit_rate = get_bit_rate(avctx);
         /* validate channel layout from the decoder */
         if (avctx->channel_layout) {
             int channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
             if (!avctx->channels)
                 avctx->channels = channels;
             else if (channels != avctx->channels) {
+                char buf[512];
+                av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
                 av_log(avctx, AV_LOG_WARNING,
-                       "channel layout does not match number of channels\n");
+                       "Channel layout '%s' with %d channels does not match specified number of channels %d: "
+                       "ignoring specified channel layout\n",
+                       buf, channels, avctx->channels);
                 avctx->channel_layout = 0;
             }
         }
@@ -670,17 +962,55 @@ FF_ENABLE_DEPRECATION_WARNINGS
             ret = AVERROR(EINVAL);
             goto free_and_end;
         }
-    }
-end:
-    if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init) {
-        entangled_thread_counter--;
-
-        /* Release any user-supplied mutex. */
-        if (lockmgr_cb) {
-            (*lockmgr_cb)(&codec_mutex, AV_LOCK_RELEASE);
+        if (avctx->sub_charenc) {
+            if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
+                av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
+                       "supported with subtitles codecs\n");
+                ret = AVERROR(EINVAL);
+                goto free_and_end;
+            } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
+                av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
+                       "subtitles character encoding will be ignored\n",
+                       avctx->codec_descriptor->name);
+                avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
+            } else {
+                /* input character encoding is set for a text based subtitle
+                 * codec at this point */
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
+                    avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_PRE_DECODER;
+
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_PRE_DECODER) {
+#if CONFIG_ICONV
+                    iconv_t cd = iconv_open("UTF-8", avctx->sub_charenc);
+                    if (cd == (iconv_t)-1) {
+                        ret = AVERROR(errno);
+                        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
+                               "with input character encoding \"%s\"\n", avctx->sub_charenc);
+                        goto free_and_end;
+                    }
+                    iconv_close(cd);
+#else
+                    av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
+                           "conversion needs a libavcodec built with iconv support "
+                           "for this codec\n");
+                    ret = AVERROR(ENOSYS);
+                    goto free_and_end;
+#endif
+                }
+            }
         }
+
+#if FF_API_AVCTX_TIMEBASE
+        if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
+            avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+#endif
+    }
+    if (codec->priv_data_size > 0 && avctx->priv_data && codec->priv_class) {
+        av_assert0(*(const AVClass **)avctx->priv_data == codec->priv_class);
     }
 
+end:
+    ff_unlock_avcodec(codec);
     if (options) {
         av_dict_free(options);
         *options = tmp;
@@ -692,7 +1022,7 @@ free_and_end:
         (avctx->codec->caps_internal & FF_CODEC_CAP_INIT_CLEANUP))
         avctx->codec->close(avctx);
 
-    if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
+    if (codec->priv_class && codec->priv_data_size)
         av_opt_free(avctx->priv_data);
     av_opt_free(avctx);
 
@@ -737,20 +1067,28 @@ void avsubtitle_free(AVSubtitle *sub)
 
     av_freep(&sub->rects);
 
-    memset(sub, 0, sizeof(AVSubtitle));
+    memset(sub, 0, sizeof(*sub));
 }
 
 av_cold int avcodec_close(AVCodecContext *avctx)
 {
     int i;
 
+    if (!avctx)
+        return 0;
+
     if (avcodec_is_open(avctx)) {
         FramePool *pool = avctx->internal->pool;
-
+        if (CONFIG_FRAME_THREAD_ENCODER &&
+            avctx->internal->frame_thread_encoder && avctx->thread_count > 1) {
+            ff_frame_thread_encoder_free(avctx);
+        }
         if (HAVE_THREADS && avctx->internal->thread_ctx)
             ff_thread_free(avctx);
         if (avctx->codec && avctx->codec->close)
             avctx->codec->close(avctx);
+        avctx->internal->byte_buffer_size = 0;
+        av_freep(&avctx->internal->byte_buffer);
         av_frame_free(&avctx->internal->to_free);
         av_frame_free(&avctx->internal->compat_decode_frame);
         av_frame_free(&avctx->internal->buffer_frame);
@@ -798,82 +1136,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
-static AVCodec *find_encdec(enum AVCodecID id, int encoder)
+const char *avcodec_get_name(enum AVCodecID id)
 {
-    AVCodec *p, *experimental = NULL;
-    p = first_avcodec;
-    while (p) {
-        if ((encoder ? av_codec_is_encoder(p) : av_codec_is_decoder(p)) &&
-            p->id == id) {
-            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
-                experimental = p;
-            } else
-                return p;
-        }
-        p = p->next;
-    }
-    return experimental;
-}
-
-AVCodec *avcodec_find_encoder(enum AVCodecID id)
-{
-    return find_encdec(id, 1);
-}
-
-AVCodec *avcodec_find_encoder_by_name(const char *name)
-{
-    AVCodec *p;
-    if (!name)
-        return NULL;
-    p = first_avcodec;
-    while (p) {
-        if (av_codec_is_encoder(p) && strcmp(name, p->name) == 0)
-            return p;
-        p = p->next;
-    }
-    return NULL;
-}
-
-AVCodec *avcodec_find_decoder(enum AVCodecID id)
-{
-    return find_encdec(id, 0);
-}
-
-AVCodec *avcodec_find_decoder_by_name(const char *name)
-{
-    AVCodec *p;
-    if (!name)
-        return NULL;
-    p = first_avcodec;
-    while (p) {
-        if (av_codec_is_decoder(p) && strcmp(name, p->name) == 0)
-            return p;
-        p = p->next;
-    }
-    return NULL;
-}
-
-static int get_bit_rate(AVCodecContext *ctx)
-{
-    int bit_rate;
-    int bits_per_sample;
-
-    switch (ctx->codec_type) {
-    case AVMEDIA_TYPE_VIDEO:
-    case AVMEDIA_TYPE_DATA:
-    case AVMEDIA_TYPE_SUBTITLE:
-    case AVMEDIA_TYPE_ATTACHMENT:
-        bit_rate = ctx->bit_rate;
-        break;
-    case AVMEDIA_TYPE_AUDIO:
-        bits_per_sample = av_get_bits_per_sample(ctx->codec_id);
-        bit_rate = bits_per_sample ? ctx->sample_rate * ctx->channels * bits_per_sample : ctx->bit_rate;
-        break;
-    default:
-        bit_rate = 0;
-        break;
-    }
-    return bit_rate;
+    const AVCodecDescriptor *cd;
+    AVCodec *codec;
+
+    if (id == AV_CODEC_ID_NONE)
+        return "none";
+    cd = avcodec_descriptor_get(id);
+    if (cd)
+        return cd->name;
+    av_log(NULL, AV_LOG_WARNING, "Codec 0x%x is not in the full list.\n", id);
+    codec = avcodec_find_decoder(id);
+    if (codec)
+        return codec->name;
+    codec = avcodec_find_encoder(id);
+    if (codec)
+        return codec->name;
+    return "unknown_codec";
 }
 
 size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_tag)
@@ -883,7 +1163,7 @@ size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_ta
 #define TAG_PRINT(x)                                              \
     (((x) >= '0' && (x) <= '9') ||                                \
      ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z') ||  \
-     ((x) == '.' || (x) == ' '))
+     ((x) == '.' || (x) == ' ' || (x) == '-' || (x) == '_'))
 
     for (i = 0; i < 4; i++) {
         len = snprintf(buf, buf_size,
@@ -898,68 +1178,99 @@ size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_ta
 
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 {
+    const char *codec_type;
     const char *codec_name;
     const char *profile = NULL;
-    char buf1[32];
-    int bitrate;
+    int64_t bitrate;
     int new_line = 0;
     AVRational display_aspect_ratio;
-    const AVCodecDescriptor *desc = avcodec_descriptor_get(enc->codec_id);
+    const char *separator = enc->dump_separator ? (const char *)enc->dump_separator : ", ";
 
-    if (desc) {
-        codec_name = desc->name;
-        profile = avcodec_profile_name(enc->codec_id, enc->profile);
-    } else if (enc->codec_id == AV_CODEC_ID_MPEG2TS) {
-        /* fake mpeg2 transport stream codec (currently not
-         * registered) */
-        codec_name = "mpeg2ts";
-    } else {
-        /* output avi tags */
-        char tag_buf[32];
-        av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-        snprintf(buf1, sizeof(buf1), "%s / 0x%04X", tag_buf, enc->codec_tag);
-        codec_name = buf1;
-    }
+    if (!buf || buf_size <= 0)
+        return;
+    codec_type = av_get_media_type_string(enc->codec_type);
+    codec_name = avcodec_get_name(enc->codec_id);
+    profile = avcodec_profile_name(enc->codec_id, enc->profile);
+
+    snprintf(buf, buf_size, "%s: %s", codec_type ? codec_type : "unknown",
+             codec_name);
+    buf[0] ^= 'a' ^ 'A'; /* first letter in uppercase */
+
+    if (enc->codec && strcmp(enc->codec->name, codec_name))
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s)", enc->codec->name);
+
+    if (profile)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s)", profile);
+    if (   enc->codec_type == AVMEDIA_TYPE_VIDEO
+        && av_log_get_level() >= AV_LOG_VERBOSE
+        && enc->refs)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 ", %d reference frame%s",
+                 enc->refs, enc->refs > 1 ? "s" : "");
+
+    if (enc->codec_tag)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s / 0x%04X)",
+                 av_fourcc2str(enc->codec_tag), enc->codec_tag);
 
     switch (enc->codec_type) {
     case AVMEDIA_TYPE_VIDEO:
-        snprintf(buf, buf_size,
-                 "Video: %s%s",
-                 codec_name, enc->mb_decision ? " (hq)" : "");
-        if (profile)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " (%s)", profile);
-        if (enc->codec_tag) {
-            char tag_buf[32];
-            av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " [%s / 0x%04X]", tag_buf, enc->codec_tag);
-        }
+        {
+            char detail[256] = "(";
 
-        av_strlcat(buf, "\n      ", buf_size);
-        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+            av_strlcat(buf, separator, buf_size);
+
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
                  "%s", enc->pix_fmt == AV_PIX_FMT_NONE ? "none" :
                      av_get_pix_fmt_name(enc->pix_fmt));
+            if (enc->bits_per_raw_sample && enc->pix_fmt != AV_PIX_FMT_NONE &&
+                enc->bits_per_raw_sample < av_pix_fmt_desc_get(enc->pix_fmt)->comp[0].depth)
+                av_strlcatf(detail, sizeof(detail), "%d bpc, ", enc->bits_per_raw_sample);
+            if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_color_range_name(enc->color_range));
+
+            if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
+                enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
+                if (enc->colorspace != (int)enc->color_primaries ||
+                    enc->colorspace != (int)enc->color_trc) {
+                    new_line = 1;
+                    av_strlcatf(detail, sizeof(detail), "%s/%s/%s, ",
+                                av_color_space_name(enc->colorspace),
+                                av_color_primaries_name(enc->color_primaries),
+                                av_color_transfer_name(enc->color_trc));
+                } else
+                    av_strlcatf(detail, sizeof(detail), "%s, ",
+                                av_get_colorspace_name(enc->colorspace));
+            }
+
+            if (enc->field_order != AV_FIELD_UNKNOWN) {
+                const char *field_order = "progressive";
+                if (enc->field_order == AV_FIELD_TT)
+                    field_order = "top first";
+                else if (enc->field_order == AV_FIELD_BB)
+                    field_order = "bottom first";
+                else if (enc->field_order == AV_FIELD_TB)
+                    field_order = "top coded first (swapped)";
+                else if (enc->field_order == AV_FIELD_BT)
+                    field_order = "bottom coded first (swapped)";
+
+                av_strlcatf(detail, sizeof(detail), "%s, ", field_order);
+            }
+
+            if (av_log_get_level() >= AV_LOG_VERBOSE &&
+                enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_chroma_location_name(enc->chroma_sample_location));
 
-        if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s",
-                     av_color_range_name(enc->color_range));
-        if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
-            enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
-            enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
-            new_line = 1;
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s/%s/%s",
-                     av_color_space_name(enc->colorspace),
-                     av_color_primaries_name(enc->color_primaries),
-                     av_color_transfer_name(enc->color_trc));
+            if (strlen(detail) > 1) {
+                detail[strlen(detail) - 2] = 0;
+                av_strlcatf(buf, buf_size, "%s)", detail);
+            }
         }
-        if (av_log_get_level() >= AV_LOG_DEBUG &&
-            enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s",
-                     av_chroma_location_name(enc->chroma_sample_location));
 
         if (enc->width) {
-            av_strlcat(buf, new_line ? "\n      " : ", ", buf_size);
+            av_strlcat(buf, new_line ? separator : ", ", buf_size);
 
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      "%dx%d",
@@ -973,11 +1284,11 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 
             if (enc->sample_aspect_ratio.num) {
                 av_reduce(&display_aspect_ratio.num, &display_aspect_ratio.den,
-                          enc->width * enc->sample_aspect_ratio.num,
-                          enc->height * enc->sample_aspect_ratio.den,
+                          enc->width * (int64_t)enc->sample_aspect_ratio.num,
+                          enc->height * (int64_t)enc->sample_aspect_ratio.den,
                           1024 * 1024);
                 snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                         " [PAR %d:%d DAR %d:%d]",
+                         " [SAR %d:%d DAR %d:%d]",
                          enc->sample_aspect_ratio.num, enc->sample_aspect_ratio.den,
                          display_aspect_ratio.num, display_aspect_ratio.den);
             }
@@ -991,23 +1302,18 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
         if (encode) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", q=%d-%d", enc->qmin, enc->qmax);
+        } else {
+            if (enc->properties & FF_CODEC_PROPERTY_CLOSED_CAPTIONS)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", Closed Captions");
+            if (enc->properties & FF_CODEC_PROPERTY_LOSSLESS)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", lossless");
         }
         break;
     case AVMEDIA_TYPE_AUDIO:
-        snprintf(buf, buf_size,
-                 "Audio: %s",
-                 codec_name);
-        if (profile)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " (%s)", profile);
-        if (enc->codec_tag) {
-            char tag_buf[32];
-            av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " [%s / 0x%04X]", tag_buf, enc->codec_tag);
-        }
+        av_strlcat(buf, separator, buf_size);
 
-        av_strlcat(buf, "\n      ", buf_size);
         if (enc->sample_rate) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      "%d Hz, ", enc->sample_rate);
@@ -1017,18 +1323,34 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", %s", av_get_sample_fmt_name(enc->sample_fmt));
         }
+        if (   enc->bits_per_raw_sample > 0
+            && enc->bits_per_raw_sample != av_get_bytes_per_sample(enc->sample_fmt) * 8)
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                     " (%d bit)", enc->bits_per_raw_sample);
+        if (av_log_get_level() >= AV_LOG_VERBOSE) {
+            if (enc->initial_padding)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", delay %d", enc->initial_padding);
+            if (enc->trailing_padding)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", padding %d", enc->trailing_padding);
+        }
         break;
     case AVMEDIA_TYPE_DATA:
-        snprintf(buf, buf_size, "Data: %s", codec_name);
+        if (av_log_get_level() >= AV_LOG_DEBUG) {
+            int g = av_gcd(enc->time_base.num, enc->time_base.den);
+            if (g)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", %d/%d",
+                         enc->time_base.num / g, enc->time_base.den / g);
+        }
         break;
     case AVMEDIA_TYPE_SUBTITLE:
-        snprintf(buf, buf_size, "Subtitle: %s", codec_name);
-        break;
-    case AVMEDIA_TYPE_ATTACHMENT:
-        snprintf(buf, buf_size, "Attachment: %s", codec_name);
+        if (enc->width)
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                     ", %dx%d", enc->width, enc->height);
         break;
     default:
-        snprintf(buf, buf_size, "Invalid Codec type %d", enc->codec_type);
         return;
     }
     if (encode) {
@@ -1042,7 +1364,10 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
     bitrate = get_bit_rate(enc);
     if (bitrate != 0) {
         snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                 ", %d kb/s", bitrate / 1000);
+                 ", %"PRId64" kb/s", bitrate / 1000);
+    } else if (enc->rc_max_rate > 0) {
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 ", max. %"PRId64" kb/s", enc->rc_max_rate / 1000);
     }
 }
 
@@ -1076,35 +1401,51 @@ const char *avcodec_profile_name(enum AVCodecID codec_id, int profile)
 
 unsigned avcodec_version(void)
 {
+    av_assert0(AV_CODEC_ID_PCM_S8_PLANAR==65563);
+    av_assert0(AV_CODEC_ID_ADPCM_G722==69660);
+    av_assert0(AV_CODEC_ID_SRT==94216);
+    av_assert0(LIBAVCODEC_VERSION_MICRO >= 100);
+
     return LIBAVCODEC_VERSION_INT;
 }
 
 const char *avcodec_configuration(void)
 {
-    return LIBAV_CONFIGURATION;
+    return FFMPEG_CONFIGURATION;
 }
 
 const char *avcodec_license(void)
 {
 #define LICENSE_PREFIX "libavcodec license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
 int av_get_exact_bits_per_sample(enum AVCodecID codec_id)
 {
     switch (codec_id) {
+    case AV_CODEC_ID_8SVX_EXP:
+    case AV_CODEC_ID_8SVX_FIB:
     case AV_CODEC_ID_ADPCM_CT:
     case AV_CODEC_ID_ADPCM_IMA_APC:
     case AV_CODEC_ID_ADPCM_IMA_EA_SEAD:
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
     case AV_CODEC_ID_ADPCM_IMA_WS:
     case AV_CODEC_ID_ADPCM_G722:
     case AV_CODEC_ID_ADPCM_YAMAHA:
+    case AV_CODEC_ID_ADPCM_AICA:
         return 4;
+    case AV_CODEC_ID_DSD_LSBF:
+    case AV_CODEC_ID_DSD_MSBF:
+    case AV_CODEC_ID_DSD_LSBF_PLANAR:
+    case AV_CODEC_ID_DSD_MSBF_PLANAR:
     case AV_CODEC_ID_PCM_ALAW:
     case AV_CODEC_ID_PCM_MULAW:
+    case AV_CODEC_ID_PCM_VIDC:
     case AV_CODEC_ID_PCM_S8:
+    case AV_CODEC_ID_PCM_S8_PLANAR:
     case AV_CODEC_ID_PCM_U8:
     case AV_CODEC_ID_PCM_ZORK:
+    case AV_CODEC_ID_SDX2_DPCM:
         return 8;
     case AV_CODEC_ID_PCM_S16BE:
     case AV_CODEC_ID_PCM_S16BE_PLANAR:
@@ -1127,15 +1468,41 @@ int av_get_exact_bits_per_sample(enum AVCodecID codec_id)
     case AV_CODEC_ID_PCM_U32LE:
     case AV_CODEC_ID_PCM_F32BE:
     case AV_CODEC_ID_PCM_F32LE:
+    case AV_CODEC_ID_PCM_F24LE:
+    case AV_CODEC_ID_PCM_F16LE:
         return 32;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F64LE:
+    case AV_CODEC_ID_PCM_S64BE:
+    case AV_CODEC_ID_PCM_S64LE:
         return 64;
     default:
         return 0;
     }
 }
 
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be)
+{
+    static const enum AVCodecID map[AV_SAMPLE_FMT_NB][2] = {
+        [AV_SAMPLE_FMT_U8  ] = { AV_CODEC_ID_PCM_U8,    AV_CODEC_ID_PCM_U8    },
+        [AV_SAMPLE_FMT_S16 ] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE },
+        [AV_SAMPLE_FMT_S32 ] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE },
+        [AV_SAMPLE_FMT_FLT ] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE },
+        [AV_SAMPLE_FMT_DBL ] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE },
+        [AV_SAMPLE_FMT_U8P ] = { AV_CODEC_ID_PCM_U8,    AV_CODEC_ID_PCM_U8    },
+        [AV_SAMPLE_FMT_S16P] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE },
+        [AV_SAMPLE_FMT_S32P] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE },
+        [AV_SAMPLE_FMT_S64P] = { AV_CODEC_ID_PCM_S64LE, AV_CODEC_ID_PCM_S64BE },
+        [AV_SAMPLE_FMT_FLTP] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE },
+        [AV_SAMPLE_FMT_DBLP] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE },
+    };
+    if (fmt < 0 || fmt >= AV_SAMPLE_FMT_NB)
+        return AV_CODEC_ID_NONE;
+    if (be < 0 || be > 1)
+        be = AV_NE(1, 0);
+    return map[fmt][be];
+}
+
 int av_get_bits_per_sample(enum AVCodecID codec_id)
 {
     switch (codec_id) {
@@ -1155,13 +1522,15 @@ int av_get_bits_per_sample(enum AVCodecID codec_id)
 }
 
 static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
-                                    uint32_t tag, int bits_per_coded_sample, int frame_bytes)
+                                    uint32_t tag, int bits_per_coded_sample, int64_t bitrate,
+                                    uint8_t * extradata, int frame_size, int frame_bytes)
 {
     int bps = av_get_exact_bits_per_sample(id);
+    int framecount = (ba > 0 && frame_bytes / ba > 0) ? frame_bytes / ba : 1;
 
     /* codecs with an exact constant bits per sample */
-    if (bps > 0 && ch > 0 && frame_bytes > 0)
-        return (frame_bytes * 8) / (bps * ch);
+    if (bps > 0 && ch > 0 && frame_bytes > 0 && ch < 32768 && bps < 32768)
+        return (frame_bytes * 8LL) / (bps * ch);
     bps = bits_per_coded_sample;
 
     /* codecs with a fixed packet duration */
@@ -1170,16 +1539,17 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
     case AV_CODEC_ID_ADPCM_IMA_QT: return   64;
     case AV_CODEC_ID_ADPCM_EA_XAS: return  128;
     case AV_CODEC_ID_AMR_NB:
+    case AV_CODEC_ID_EVRC:
     case AV_CODEC_ID_GSM:
     case AV_CODEC_ID_QCELP:
-    case AV_CODEC_ID_RA_144:
     case AV_CODEC_ID_RA_288:       return  160;
-    case AV_CODEC_ID_IMC:          return  256;
     case AV_CODEC_ID_AMR_WB:
     case AV_CODEC_ID_GSM_MS:       return  320;
     case AV_CODEC_ID_MP1:          return  384;
     case AV_CODEC_ID_ATRAC1:       return  512;
-    case AV_CODEC_ID_ATRAC3:       return 1024;
+    case AV_CODEC_ID_ATRAC9:
+    case AV_CODEC_ID_ATRAC3:       return 1024 * framecount;
+    case AV_CODEC_ID_ATRAC3P:      return 2048;
     case AV_CODEC_ID_MP2:
     case AV_CODEC_ID_MUSEPACK7:    return 1152;
     case AV_CODEC_ID_AC3:          return 1536;
@@ -1189,6 +1559,8 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
         /* calc from sample rate */
         if (id == AV_CODEC_ID_TTA)
             return 256 * sr / 245;
+        else if (id == AV_CODEC_ID_DST)
+            return 588 * sr / 44100;
 
         if (ch > 0) {
             /* calc from sample rate and channels */
@@ -1223,23 +1595,36 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
             return 240 * (frame_bytes / 32);
         if (id == AV_CODEC_ID_NELLYMOSER)
             return 256 * (frame_bytes / 64);
+        if (id == AV_CODEC_ID_RA_144)
+            return 160 * (frame_bytes / 20);
 
         if (bps > 0) {
             /* calc from frame_bytes and bits_per_coded_sample */
-            if (id == AV_CODEC_ID_ADPCM_G726)
+            if (id == AV_CODEC_ID_ADPCM_G726 || id == AV_CODEC_ID_ADPCM_G726LE)
                 return frame_bytes * 8 / bps;
         }
 
-        if (ch > 0) {
+        if (ch > 0 && ch < INT_MAX/16) {
             /* calc from frame_bytes and channels */
             switch (id) {
+            case AV_CODEC_ID_ADPCM_AFC:
+                return frame_bytes / (9 * ch) * 16;
+            case AV_CODEC_ID_ADPCM_PSX:
+            case AV_CODEC_ID_ADPCM_DTK:
+                return frame_bytes / (16 * ch) * 28;
             case AV_CODEC_ID_ADPCM_4XM:
+            case AV_CODEC_ID_ADPCM_IMA_DAT4:
             case AV_CODEC_ID_ADPCM_IMA_ISS:
                 return (frame_bytes - 4 * ch) * 2 / ch;
             case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
                 return (frame_bytes - 4) * 2 / ch;
             case AV_CODEC_ID_ADPCM_IMA_AMV:
                 return (frame_bytes - 8) * 2 / ch;
+            case AV_CODEC_ID_ADPCM_THP:
+            case AV_CODEC_ID_ADPCM_THP_LE:
+                if (extradata)
+                    return frame_bytes * 14 / (8 * ch);
+                break;
             case AV_CODEC_ID_ADPCM_XA:
                 return (frame_bytes / 128) * 224 / ch;
             case AV_CODEC_ID_INTERPLAY_DPCM:
@@ -1254,6 +1639,9 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 return 6 * frame_bytes / ch;
             case AV_CODEC_ID_PCM_LXF:
                 return 2 * (frame_bytes / (5 * ch));
+            case AV_CODEC_ID_IAC:
+            case AV_CODEC_ID_IMC:
+                return 4 * frame_bytes / ch;
             }
 
             if (tag) {
@@ -1271,13 +1659,19 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 int blocks = frame_bytes / ba;
                 switch (id) {
                 case AV_CODEC_ID_ADPCM_IMA_WAV:
-                    return blocks * (1 + (ba - 4 * ch) / (4 * ch) * 8);
+                    if (bps < 2 || bps > 5)
+                        return 0;
+                    return blocks * (1 + (ba - 4 * ch) / (bps * ch) * 8);
                 case AV_CODEC_ID_ADPCM_IMA_DK3:
                     return blocks * (((ba - 16) * 2 / 3 * 4) / ch);
                 case AV_CODEC_ID_ADPCM_IMA_DK4:
                     return blocks * (1 + (ba - 4 * ch) * 2 / ch);
+                case AV_CODEC_ID_ADPCM_IMA_RAD:
+                    return blocks * ((ba - 4 * ch) * 2 / ch);
                 case AV_CODEC_ID_ADPCM_MS:
                     return blocks * (2 + (ba - 7 * ch) * 2 / ch);
+                case AV_CODEC_ID_ADPCM_MTAF:
+                    return blocks * (ba - 16) * 2 / ch;
                 }
             }
 
@@ -1285,9 +1679,13 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 /* calc from frame_bytes, channels, and bits_per_coded_sample */
                 switch (id) {
                 case AV_CODEC_ID_PCM_DVD:
-                    return 2 * (frame_bytes / ((bps * 2 / 8) * ch));
+                    if(bps<4 || frame_bytes<3)
+                        return 0;
+                    return 2 * ((frame_bytes - 3) / ((bps * 2 / 8) * ch));
                 case AV_CODEC_ID_PCM_BLURAY:
-                    return frame_bytes / ((FFALIGN(ch, 2) * bps) / 8);
+                    if(bps<4 || frame_bytes<4)
+                        return 0;
+                    return (frame_bytes - 4) / ((FFALIGN(ch, 2) * bps) / 8);
                 case AV_CODEC_ID_S302M:
                     return 2 * (frame_bytes / ((bps + 4) / 4)) / ch;
                 }
@@ -1295,6 +1693,17 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
         }
     }
 
+    /* Fall back on using frame_size */
+    if (frame_size > 1 && frame_bytes)
+        return frame_size;
+
+    //For WMA we currently have no other means to calculate duration thus we
+    //do it here by assuming CBR, which is true for all known cases.
+    if (bitrate > 0 && frame_bytes > 0 && sr > 0 && ba > 1) {
+        if (id == AV_CODEC_ID_WMAV1 || id == AV_CODEC_ID_WMAV2)
+            return  (frame_bytes * 8LL * sr) / bitrate;
+    }
+
     return 0;
 }
 
@@ -1303,6 +1712,7 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
     return get_audio_frame_duration(avctx->codec_id, avctx->sample_rate,
                                     avctx->channels, avctx->block_align,
                                     avctx->codec_tag, avctx->bits_per_coded_sample,
+                                    avctx->bit_rate, avctx->extradata, avctx->frame_size,
                                     frame_bytes);
 }
 
@@ -1311,6 +1721,7 @@ int av_get_audio_frame_duration2(AVCodecParameters *par, int frame_bytes)
     return get_audio_frame_duration(par->codec_id, par->sample_rate,
                                     par->channels, par->block_align,
                                     par->codec_tag, par->bits_per_coded_sample,
+                                    par->bit_rate, par->extradata, par->frame_size,
                                     frame_bytes);
 }
 
@@ -1365,77 +1776,37 @@ void av_register_hwaccel(AVHWAccel *hwaccel)
 }
 #endif
 
+#if FF_API_LOCKMGR
 int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op))
 {
-    if (lockmgr_cb) {
-        // There is no good way to rollback a failure to destroy the
-        // mutex, so we ignore failures.
-        lockmgr_cb(&codec_mutex,    AV_LOCK_DESTROY);
-        lockmgr_cb(&avformat_mutex, AV_LOCK_DESTROY);
-        lockmgr_cb     = NULL;
-        codec_mutex    = NULL;
-        avformat_mutex = NULL;
-    }
-
-    if (cb) {
-        void *new_codec_mutex    = NULL;
-        void *new_avformat_mutex = NULL;
-        int err;
-        if (err = cb(&new_codec_mutex, AV_LOCK_CREATE)) {
-            return err > 0 ? AVERROR_UNKNOWN : err;
-        }
-        if (err = cb(&new_avformat_mutex, AV_LOCK_CREATE)) {
-            // Ignore failures to destroy the newly created mutex.
-            cb(&new_codec_mutex, AV_LOCK_DESTROY);
-            return err > 0 ? AVERROR_UNKNOWN : err;
-        }
-        lockmgr_cb     = cb;
-        codec_mutex    = new_codec_mutex;
-        avformat_mutex = new_avformat_mutex;
-    }
-
-    return 0;
-}
-
-int avpriv_lock_avformat(void)
-{
-    if (lockmgr_cb) {
-        if ((*lockmgr_cb)(&avformat_mutex, AV_LOCK_OBTAIN))
-            return -1;
-    }
-    return 0;
-}
-
-int avpriv_unlock_avformat(void)
-{
-    if (lockmgr_cb) {
-        if ((*lockmgr_cb)(&avformat_mutex, AV_LOCK_RELEASE))
-            return -1;
-    }
     return 0;
 }
+#endif
 
 unsigned int avpriv_toupper4(unsigned int x)
 {
     return av_toupper(x & 0xFF) +
           (av_toupper((x >>  8) & 0xFF) << 8)  +
           (av_toupper((x >> 16) & 0xFF) << 16) +
-          (av_toupper((x >> 24) & 0xFF) << 24);
+((unsigned)av_toupper((x >> 24) & 0xFF) << 24);
 }
 
 int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
 {
     int ret;
 
-    dst->owner = src->owner;
+    dst->owner[0] = src->owner[0];
+    dst->owner[1] = src->owner[1];
 
     ret = av_frame_ref(dst->f, src->f);
     if (ret < 0)
         return ret;
 
+    av_assert0(!dst->progress);
+
     if (src->progress &&
         !(dst->progress = av_buffer_ref(src->progress))) {
-        ff_thread_release_buffer(dst->owner, dst);
+        ff_thread_release_buffer(dst->owner[0], dst);
         return AVERROR(ENOMEM);
     }
 
@@ -1444,9 +1815,14 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
 
 #if !HAVE_THREADS
 
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
+{
+    return ff_get_format(avctx, fmt);
+}
+
 int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
 {
-    f->owner = avctx;
+    f->owner[0] = f->owner[1] = avctx;
     return ff_get_buffer(avctx, f->f, flags);
 }
 
@@ -1468,6 +1844,28 @@ void ff_thread_await_progress(ThreadFrame *f, int progress, int field)
 {
 }
 
+int ff_thread_can_start_frame(AVCodecContext *avctx)
+{
+    return 1;
+}
+
+int ff_alloc_entries(AVCodecContext *avctx, int count)
+{
+    return 0;
+}
+
+void ff_reset_entries(AVCodecContext *avctx)
+{
+}
+
+void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift)
+{
+}
+
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n)
+{
+}
+
 #endif
 
 int avcodec_is_open(AVCodecContext *s)
@@ -1475,13 +1873,36 @@ int avcodec_is_open(AVCodecContext *s)
     return !!s->internal;
 }
 
-const uint8_t *avpriv_find_start_code(const uint8_t *restrict p,
+int avpriv_bprint_to_extradata(AVCodecContext *avctx, struct AVBPrint *buf)
+{
+    int ret;
+    char *str;
+
+    ret = av_bprint_finalize(buf, &str);
+    if (ret < 0)
+        return ret;
+    if (!av_bprint_is_complete(buf)) {
+        av_free(str);
+        return AVERROR(ENOMEM);
+    }
+
+    avctx->extradata = str;
+    /* Note: the string is NUL terminated (so extradata can be read as a
+     * string), but the ending character is not accounted in the size (in
+     * binary formats you are likely not supposed to mux that character). When
+     * extradata is copied, it is also padded with AV_INPUT_BUFFER_PADDING_SIZE
+     * zeros. */
+    avctx->extradata_size = buf->len;
+    return 0;
+}
+
+const uint8_t *avpriv_find_start_code(const uint8_t *av_restrict p,
                                       const uint8_t *end,
-                                      uint32_t * restrict state)
+                                      uint32_t *av_restrict state)
 {
     int i;
 
-    assert(p <= end);
+    av_assert0(p <= end);
     if (p >= end)
         return end;
 
@@ -1564,6 +1985,8 @@ static void codec_parameters_reset(AVCodecParameters *par)
     par->color_space         = AVCOL_SPC_UNSPECIFIED;
     par->chroma_location     = AVCHROMA_LOC_UNSPECIFIED;
     par->sample_aspect_ratio = (AVRational){ 0, 1 };
+    par->profile             = FF_PROFILE_UNKNOWN;
+    par->level               = FF_LEVEL_UNKNOWN;
 }
 
 AVCodecParameters *avcodec_parameters_alloc(void)
@@ -1616,6 +2039,7 @@ int avcodec_parameters_from_context(AVCodecParameters *par,
 
     par->bit_rate              = codec->bit_rate;
     par->bits_per_coded_sample = codec->bits_per_coded_sample;
+    par->bits_per_raw_sample   = codec->bits_per_raw_sample;
     par->profile               = codec->profile;
     par->level                 = codec->level;
 
@@ -1631,14 +2055,22 @@ int avcodec_parameters_from_context(AVCodecParameters *par,
         par->color_space         = codec->colorspace;
         par->chroma_location     = codec->chroma_sample_location;
         par->sample_aspect_ratio = codec->sample_aspect_ratio;
+        par->video_delay         = codec->has_b_frames;
         break;
     case AVMEDIA_TYPE_AUDIO:
-        par->format          = codec->sample_fmt;
-        par->channel_layout  = codec->channel_layout;
-        par->channels        = codec->channels;
-        par->sample_rate     = codec->sample_rate;
-        par->block_align     = codec->block_align;
-        par->initial_padding = codec->initial_padding;
+        par->format           = codec->sample_fmt;
+        par->channel_layout   = codec->channel_layout;
+        par->channels         = codec->channels;
+        par->sample_rate      = codec->sample_rate;
+        par->block_align      = codec->block_align;
+        par->frame_size       = codec->frame_size;
+        par->initial_padding  = codec->initial_padding;
+        par->trailing_padding = codec->trailing_padding;
+        par->seek_preroll     = codec->seek_preroll;
+        break;
+    case AVMEDIA_TYPE_SUBTITLE:
+        par->width  = codec->width;
+        par->height = codec->height;
         break;
     }
 
@@ -1662,6 +2094,7 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
 
     codec->bit_rate              = par->bit_rate;
     codec->bits_per_coded_sample = par->bits_per_coded_sample;
+    codec->bits_per_raw_sample   = par->bits_per_raw_sample;
     codec->profile               = par->profile;
     codec->level                 = par->level;
 
@@ -1677,14 +2110,23 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
         codec->colorspace             = par->color_space;
         codec->chroma_sample_location = par->chroma_location;
         codec->sample_aspect_ratio    = par->sample_aspect_ratio;
+        codec->has_b_frames           = par->video_delay;
         break;
     case AVMEDIA_TYPE_AUDIO:
-        codec->sample_fmt      = par->format;
-        codec->channel_layout  = par->channel_layout;
-        codec->channels        = par->channels;
-        codec->sample_rate     = par->sample_rate;
-        codec->block_align     = par->block_align;
-        codec->initial_padding = par->initial_padding;
+        codec->sample_fmt       = par->format;
+        codec->channel_layout   = par->channel_layout;
+        codec->channels         = par->channels;
+        codec->sample_rate      = par->sample_rate;
+        codec->block_align      = par->block_align;
+        codec->frame_size       = par->frame_size;
+        codec->delay            =
+        codec->initial_padding  = par->initial_padding;
+        codec->trailing_padding = par->trailing_padding;
+        codec->seek_preroll     = par->seek_preroll;
+        break;
+    case AVMEDIA_TYPE_SUBTITLE:
+        codec->width  = par->width;
+        codec->height = par->height;
         break;
     }
 
@@ -1699,3 +2141,86 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
 
     return 0;
 }
+
+int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len,
+                     void **data, size_t *sei_size)
+{
+    AVFrameSideData *side_data = NULL;
+    uint8_t *sei_data;
+
+    if (frame)
+        side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_A53_CC);
+
+    if (!side_data) {
+        *data = NULL;
+        return 0;
+    }
+
+    *sei_size = side_data->size + 11;
+    *data = av_mallocz(*sei_size + prefix_len);
+    if (!*data)
+        return AVERROR(ENOMEM);
+    sei_data = (uint8_t*)*data + prefix_len;
+
+    // country code
+    sei_data[0] = 181;
+    sei_data[1] = 0;
+    sei_data[2] = 49;
+
+    /**
+     * 'GA94' is standard in North America for ATSC, but hard coding
+     * this style may not be the right thing to do -- other formats
+     * do exist. This information is not available in the side_data
+     * so we are going with this right now.
+     */
+    AV_WL32(sei_data + 3, MKTAG('G', 'A', '9', '4'));
+    sei_data[7] = 3;
+    sei_data[8] = ((side_data->size/3) & 0x1f) | 0x40;
+    sei_data[9] = 0;
+
+    memcpy(sei_data + 10, side_data->data, side_data->size);
+
+    sei_data[side_data->size+10] = 255;
+
+    return 0;
+}
+
+int64_t ff_guess_coded_bitrate(AVCodecContext *avctx)
+{
+    AVRational framerate = avctx->framerate;
+    int bits_per_coded_sample = avctx->bits_per_coded_sample;
+    int64_t bitrate;
+
+    if (!(framerate.num && framerate.den))
+        framerate = av_inv_q(avctx->time_base);
+    if (!(framerate.num && framerate.den))
+        return 0;
+
+    if (!bits_per_coded_sample) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+        bits_per_coded_sample = av_get_bits_per_pixel(desc);
+    }
+    bitrate = (int64_t)bits_per_coded_sample * avctx->width * avctx->height *
+              framerate.num / framerate.den;
+
+    return bitrate;
+}
+
+int ff_int_from_list_or_default(void *ctx, const char * val_name, int val,
+                                const int * array_valid_values, int default_value)
+{
+    int i = 0, ref_val;
+
+    while (1) {
+        ref_val = array_valid_values[i];
+        if (ref_val == INT_MAX)
+            break;
+        if (val == ref_val)
+            return val;
+        i++;
+    }
+    /* val is not a valid value */
+    av_log(ctx, AV_LOG_DEBUG,
+           "%s %d are not supported. Set to default value : %d\n", val_name, val, default_value);
+    return default_value;
+}
diff --git a/libavcodec/utvideo.c b/libavcodec/utvideo.c
index 556b4de..5828d5e 100644
--- a/libavcodec/utvideo.c
+++ b/libavcodec/utvideo.c
@@ -2,20 +2,20 @@
  * Common Ut Video code
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,5 +43,5 @@ int ff_ut_huff_cmp_len(const void *a, const void *b)
 int ff_ut10_huff_cmp_len(const void *a, const void *b)
 {
     const HuffEntry *aa = a, *bb = b;
-    return (aa->len - bb->len) * 1024 + aa->sym - bb->sym;
+    return (aa->len - bb->len)*1024 + aa->sym - bb->sym;
 }
diff --git a/libavcodec/utvideo.h b/libavcodec/utvideo.h
index 2fa2b7c..cf0bb28 100644
--- a/libavcodec/utvideo.h
+++ b/libavcodec/utvideo.h
@@ -2,20 +2,20 @@
  * Common Ut Video header
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,9 @@
 #include "libavutil/common.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
-#include "huffyuvdsp.h"
-#include "huffyuvencdsp.h"
+#include "utvideodsp.h"
+#include "lossless_videodsp.h"
+#include "lossless_videoencdsp.h"
 
 enum {
     PRED_NONE = 0,
@@ -57,32 +58,37 @@ enum {
     UTVIDEO_RGBA = MKTAG(0x00, 0x00, 0x02, 0x18),
     UTVIDEO_420  = MKTAG('Y', 'V', '1', '2'),
     UTVIDEO_422  = MKTAG('Y', 'U', 'Y', '2'),
+    UTVIDEO_444  = MKTAG('Y', 'V', '2', '4'),
 };
 
 /* Mapping of libavcodec prediction modes to Ut Video's */
 extern const int ff_ut_pred_order[5];
 
-/* Order of RGB(A) planes in Ut Video */
-extern const int ff_ut_rgb_order[4];
-
 typedef struct UtvideoContext {
     const AVClass *class;
     AVCodecContext *avctx;
+    UTVideoDSPContext utdsp;
     BswapDSPContext bdsp;
-    HuffYUVDSPContext hdspdec;
-    HuffYUVEncDSPContext hdsp;
+    LLVidDSPContext llviddsp;
+    LLVidEncDSPContext llvidencdsp;
 
-    uint32_t frame_info_size, flags, frame_info;
+    uint32_t frame_info_size, flags, frame_info, offset;
     int      planes;
     int      slices;
     int      compression;
     int      interlaced;
     int      frame_pred;
     int      pro;
+    int      pack;
 
     ptrdiff_t slice_stride;
     uint8_t *slice_bits, *slice_buffer[4];
     int      slice_bits_size;
+
+    const uint8_t *packed_stream[4][256];
+    size_t packed_stream_size[4][256];
+    const uint8_t *control_stream[4][256];
+    size_t control_stream_size[4][256];
 } UtvideoContext;
 
 typedef struct HuffEntry {
diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c
index 26b991c..3891df3 100644
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -2,20 +2,20 @@
  * Ut Video decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,16 +27,18 @@
 #include <inttypes.h>
 #include <stdlib.h>
 
-#include "libavutil/intreadwrite.h"
+#define CACHED_BITSTREAM_READER !ARCH_X86_32
+#define UNCHECKED_BITSTREAM_READER 1
 
+#include "libavutil/intreadwrite.h"
+#include "libavutil/pixdesc.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bswapdsp.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "thread.h"
 #include "utvideo.h"
-#include "vlc.h"
 
 static int build_huff10(const uint8_t *src, VLC *vlc, int *fsym)
 {
@@ -75,8 +77,8 @@ static int build_huff10(const uint8_t *src, VLC *vlc, int *fsym)
         syms[i]  = he[i].sym;
         code += 0x80000000u >> (he[i].len - 1);
     }
-
-    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 11), last + 1,
+#define VLC_BITS 11
+    return ff_init_vlc_sparse(vlc, VLC_BITS, last + 1,
                               bits,  sizeof(*bits),  sizeof(*bits),
                               codes, sizeof(*codes), sizeof(*codes),
                               syms,  sizeof(*syms),  sizeof(*syms), 0);
@@ -103,13 +105,14 @@ static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
         *fsym = he[0].sym;
         return 0;
     }
-    if (he[0].len > 32)
-        return -1;
 
     last = 255;
     while (he[last].len == 255 && last)
         last--;
 
+    if (he[last].len > 32)
+        return -1;
+
     code = 1;
     for (i = last; i >= 0; i--) {
         codes[i] = code >> (32 - he[i].len);
@@ -118,22 +121,22 @@ static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
         code += 0x80000000u >> (he[i].len - 1);
     }
 
-    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 9), last + 1,
+    return ff_init_vlc_sparse(vlc, VLC_BITS, last + 1,
                               bits,  sizeof(*bits),  sizeof(*bits),
                               codes, sizeof(*codes), sizeof(*codes),
                               syms,  sizeof(*syms),  sizeof(*syms), 0);
 }
 
 static int decode_plane10(UtvideoContext *c, int plane_no,
-                          uint16_t *dst, int step, int stride,
+                          uint16_t *dst, ptrdiff_t stride,
                           int width, int height,
                           const uint8_t *src, const uint8_t *huff,
                           int use_pred)
 {
-    BitstreamContext bc;
     int i, j, slice, pix, ret;
     int sstart, send;
     VLC vlc;
+    GetBitContext gb;
     int prev, fsym;
 
     if ((ret = build_huff10(huff, &vlc, &fsym)) < 0) {
@@ -151,7 +154,7 @@ static int decode_plane10(UtvideoContext *c, int plane_no,
 
             prev = 0x200;
             for (j = sstart; j < send; j++) {
-                for (i = 0; i < width * step; i += step) {
+                for (i = 0; i < width; i++) {
                     pix = fsym;
                     if (use_pred) {
                         prev += pix;
@@ -186,23 +189,16 @@ static int decode_plane10(UtvideoContext *c, int plane_no,
             goto fail;
         }
 
-        memcpy(c->slice_bits, src + slice_data_start + c->slices * 4,
-               slice_size);
         memset(c->slice_bits + slice_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
         c->bdsp.bswap_buf((uint32_t *) c->slice_bits,
-                          (uint32_t *) c->slice_bits,
+                          (uint32_t *)(src + slice_data_start + c->slices * 4),
                           (slice_data_end - slice_data_start + 3) >> 2);
-        bitstream_init8(&bc, c->slice_bits, slice_size);
+        init_get_bits(&gb, c->slice_bits, slice_size * 8);
 
         prev = 0x200;
         for (j = sstart; j < send; j++) {
-            for (i = 0; i < width * step; i += step) {
-                if (bitstream_bits_left(&bc) <= 0) {
-                    av_log(c->avctx, AV_LOG_ERROR,
-                           "Slice decoding ran out of bits\n");
-                    goto fail;
-                }
-                pix = bitstream_read_vlc(&bc, vlc.table, vlc.bits, 3);
+            for (i = 0; i < width; i++) {
+                pix = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
                 if (pix < 0) {
                     av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
                     goto fail;
@@ -215,10 +211,15 @@ static int decode_plane10(UtvideoContext *c, int plane_no,
                 dest[i] = pix;
             }
             dest += stride;
+            if (get_bits_left(&gb) < 0) {
+                av_log(c->avctx, AV_LOG_ERROR,
+                        "Slice decoding ran out of bits\n");
+                goto fail;
+            }
         }
-        if (bitstream_bits_left(&bc) > 32)
+        if (get_bits_left(&gb) > 32)
             av_log(c->avctx, AV_LOG_WARNING,
-                   "%d bits left after decoding slice\n", bitstream_bits_left(&bc));
+                   "%d bits left after decoding slice\n", get_bits_left(&gb));
     }
 
     ff_free_vlc(&vlc);
@@ -229,7 +230,7 @@ fail:
     return AVERROR_INVALIDDATA;
 }
 
-static int compute_cmask(int plane_no, int interlaced, int pix_fmt)
+static int compute_cmask(int plane_no, int interlaced, enum AVPixelFormat pix_fmt)
 {
     const int is_luma = (pix_fmt == AV_PIX_FMT_YUV420P) && !plane_no;
 
@@ -240,17 +241,64 @@ static int compute_cmask(int plane_no, int interlaced, int pix_fmt)
 }
 
 static int decode_plane(UtvideoContext *c, int plane_no,
-                        uint8_t *dst, int step, ptrdiff_t stride,
+                        uint8_t *dst, ptrdiff_t stride,
                         int width, int height,
                         const uint8_t *src, int use_pred)
 {
     int i, j, slice, pix;
     int sstart, send;
     VLC vlc;
-    BitstreamContext bc;
-    int prev, fsym;
+    GetBitContext gb;
+    int ret, prev, fsym;
     const int cmask = compute_cmask(plane_no, c->interlaced, c->avctx->pix_fmt);
 
+    if (c->pack) {
+        send = 0;
+        for (slice = 0; slice < c->slices; slice++) {
+            GetBitContext cbit, pbit;
+            uint8_t *dest, *p;
+
+            ret = init_get_bits8(&cbit, c->control_stream[plane_no][slice], c->control_stream_size[plane_no][slice]);
+            if (ret < 0)
+                return ret;
+
+            ret = init_get_bits8(&pbit, c->packed_stream[plane_no][slice], c->packed_stream_size[plane_no][slice]);
+            if (ret < 0)
+                return ret;
+
+            sstart = send;
+            send   = (height * (slice + 1) / c->slices) & cmask;
+            dest   = dst + sstart * stride;
+
+            if (3 * ((dst + send * stride - dest + 7)/8) > get_bits_left(&cbit))
+                return AVERROR_INVALIDDATA;
+
+            for (p = dest; p < dst + send * stride; p += 8) {
+                int bits = get_bits_le(&cbit, 3);
+
+                if (bits == 0) {
+                    *(uint64_t *) p = 0;
+                } else {
+                    uint32_t sub = 0x80 >> (8 - (bits + 1)), add;
+                    int k;
+
+                    if ((bits + 1) * 8 > get_bits_left(&pbit))
+                        return AVERROR_INVALIDDATA;
+
+                    for (k = 0; k < 8; k++) {
+
+                        p[k] = get_bits_le(&pbit, bits + 1);
+                        add = (~p[k] & sub) << (8 - bits);
+                        p[k] -= sub;
+                        p[k] += add;
+                    }
+                }
+            }
+        }
+
+        return 0;
+    }
+
     if (build_huff(src, &vlc, &fsym)) {
         av_log(c->avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
         return AVERROR_INVALIDDATA;
@@ -266,7 +314,7 @@ static int decode_plane(UtvideoContext *c, int plane_no,
 
             prev = 0x80;
             for (j = sstart; j < send; j++) {
-                for (i = 0; i < width * step; i += step) {
+                for (i = 0; i < width; i++) {
                     pix = fsym;
                     if (use_pred) {
                         prev += pix;
@@ -302,23 +350,16 @@ static int decode_plane(UtvideoContext *c, int plane_no,
             goto fail;
         }
 
-        memcpy(c->slice_bits, src + slice_data_start + c->slices * 4,
-               slice_size);
         memset(c->slice_bits + slice_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
         c->bdsp.bswap_buf((uint32_t *) c->slice_bits,
-                          (uint32_t *) c->slice_bits,
+                          (uint32_t *)(src + slice_data_start + c->slices * 4),
                           (slice_data_end - slice_data_start + 3) >> 2);
-        bitstream_init8(&bc, c->slice_bits, slice_size);
+        init_get_bits(&gb, c->slice_bits, slice_size * 8);
 
         prev = 0x80;
         for (j = sstart; j < send; j++) {
-            for (i = 0; i < width * step; i += step) {
-                if (bitstream_bits_left(&bc) <= 0) {
-                    av_log(c->avctx, AV_LOG_ERROR,
-                           "Slice decoding ran out of bits\n");
-                    goto fail;
-                }
-                pix = bitstream_read_vlc(&bc, vlc.table, vlc.bits, 4);
+            for (i = 0; i < width; i++) {
+                pix = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
                 if (pix < 0) {
                     av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
                     goto fail;
@@ -329,11 +370,16 @@ static int decode_plane(UtvideoContext *c, int plane_no,
                 }
                 dest[i] = pix;
             }
+            if (get_bits_left(&gb) < 0) {
+                av_log(c->avctx, AV_LOG_ERROR,
+                        "Slice decoding ran out of bits\n");
+                goto fail;
+            }
             dest += stride;
         }
-        if (bitstream_bits_left(&bc) > 32)
+        if (get_bits_left(&gb) > 32)
             av_log(c->avctx, AV_LOG_WARNING,
-                   "%d bits left after decoding slice\n", bitstream_bits_left(&bc));
+                   "%d bits left after decoding slice\n", get_bits_left(&gb));
     }
 
     ff_free_vlc(&vlc);
@@ -344,49 +390,12 @@ fail:
     return AVERROR_INVALIDDATA;
 }
 
-static void restore_rgb_planes(uint8_t *src, int step, ptrdiff_t stride,
-                               int width, int height)
-{
-    int i, j;
-    uint8_t r, g, b;
-
-    for (j = 0; j < height; j++) {
-        for (i = 0; i < width * step; i += step) {
-            r = src[i];
-            g = src[i + 1];
-            b = src[i + 2];
-            src[i]     = r + g - 0x80;
-            src[i + 2] = b + g - 0x80;
-        }
-        src += stride;
-    }
-}
-
-static void restore_rgb_planes10(AVFrame *frame, int width, int height)
-{
-    uint16_t *src_r = (uint16_t *)frame->data[2];
-    uint16_t *src_g = (uint16_t *)frame->data[0];
-    uint16_t *src_b = (uint16_t *)frame->data[1];
-    int r, g, b;
-    int i, j;
+#undef A
+#undef B
+#undef C
 
-    for (j = 0; j < height; j++) {
-        for (i = 0; i < width; i++) {
-            r = src_r[i];
-            g = src_g[i];
-            b = src_b[i];
-            src_r[i] = (r + g - 0x200) & 0x3FF;
-            src_b[i] = (b + g - 0x200) & 0x3FF;
-        }
-        src_r += frame->linesize[2] / 2;
-        src_g += frame->linesize[0] / 2;
-        src_b += frame->linesize[1] / 2;
-    }
-}
-
-static void restore_median_planar(UtvideoContext *c, uint8_t *src,
-                                  ptrdiff_t stride, int width, int height,
-                                  int slices, int rmode)
+static void restore_median_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t stride,
+                                  int width, int height, int slices, int rmode)
 {
     int i, j, slice;
     int A, B, C;
@@ -405,7 +414,7 @@ static void restore_median_planar(UtvideoContext *c, uint8_t *src,
 
         // first line - left neighbour prediction
         bsrc[0] += 0x80;
-        c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0);
+        c->llviddsp.add_left_pred(bsrc, bsrc, width, 0);
         bsrc += stride;
         if (slice_height <= 1)
             continue;
@@ -413,16 +422,20 @@ static void restore_median_planar(UtvideoContext *c, uint8_t *src,
         C        = bsrc[-stride];
         bsrc[0] += C;
         A        = bsrc[0];
-        for (i = 1; i < width; i++) {
+        for (i = 1; i < FFMIN(width, 16); i++) { /* scalar loop (DSP need align 16) */
             B        = bsrc[i - stride];
             bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
             C        = B;
             A        = bsrc[i];
         }
+        if (width > 16)
+            c->llviddsp.add_median_pred(bsrc + 16, bsrc - stride + 16,
+                                        bsrc + 16, width - 16, &A, &B);
+
         bsrc += stride;
         // the rest of lines use continuous median prediction
         for (j = 2; j < slice_height; j++) {
-            c->hdspdec.add_hfyu_median_pred(bsrc, bsrc - stride,
+            c->llviddsp.add_median_pred(bsrc, bsrc - stride,
                                             bsrc, width, &A, &B);
             bsrc += stride;
         }
@@ -433,16 +446,15 @@ static void restore_median_planar(UtvideoContext *c, uint8_t *src,
  * so restoring function should take care of possible padding between
  * two parts of the same "line".
  */
-static void restore_median_planar_il(UtvideoContext *c, uint8_t *src,
-                                     ptrdiff_t stride, int width, int height,
-                                     int slices, int rmode)
+static void restore_median_planar_il(UtvideoContext *c, uint8_t *src, ptrdiff_t stride,
+                                     int width, int height, int slices, int rmode)
 {
     int i, j, slice;
     int A, B, C;
     uint8_t *bsrc;
     int slice_start, slice_height;
     const int cmask   = ~(rmode ? 3 : 1);
-    const int stride2 = stride << 1;
+    const ptrdiff_t stride2 = stride << 1;
 
     for (slice = 0; slice < slices; slice++) {
         slice_start    = ((slice * height) / slices) & cmask;
@@ -456,8 +468,8 @@ static void restore_median_planar_il(UtvideoContext *c, uint8_t *src,
 
         // first line - left neighbour prediction
         bsrc[0] += 0x80;
-        A = c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0);
-        c->hdspdec.add_hfyu_left_pred(bsrc + stride, bsrc + stride, width, A);
+        A = c->llviddsp.add_left_pred(bsrc, bsrc, width, 0);
+        c->llviddsp.add_left_pred(bsrc + stride, bsrc + stride, width, A);
         bsrc += stride2;
         if (slice_height <= 1)
             continue;
@@ -465,154 +477,30 @@ static void restore_median_planar_il(UtvideoContext *c, uint8_t *src,
         C        = bsrc[-stride2];
         bsrc[0] += C;
         A        = bsrc[0];
-        for (i = 1; i < width; i++) {
+        for (i = 1; i < FFMIN(width, 16); i++) { /* scalar loop (DSP need align 16) */
             B        = bsrc[i - stride2];
             bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
             C        = B;
             A        = bsrc[i];
         }
-        c->hdspdec.add_hfyu_median_pred(bsrc + stride, bsrc - stride,
+        if (width > 16)
+            c->llviddsp.add_median_pred(bsrc + 16, bsrc - stride2 + 16,
+                                        bsrc + 16, width - 16, &A, &B);
+
+        c->llviddsp.add_median_pred(bsrc + stride, bsrc - stride,
                                         bsrc + stride, width, &A, &B);
         bsrc += stride2;
         // the rest of lines use continuous median prediction
         for (j = 2; j < slice_height; j++) {
-            c->hdspdec.add_hfyu_median_pred(bsrc, bsrc - stride2,
+            c->llviddsp.add_median_pred(bsrc, bsrc - stride2,
                                             bsrc, width, &A, &B);
-            c->hdspdec.add_hfyu_median_pred(bsrc + stride, bsrc - stride,
+            c->llviddsp.add_median_pred(bsrc + stride, bsrc - stride,
                                             bsrc + stride, width, &A, &B);
             bsrc += stride2;
         }
     }
 }
 
-static void restore_median_packed(uint8_t *src, int step, ptrdiff_t stride,
-                                  int width, int height,
-                                  int slices, int rmode)
-{
-    int i, j, slice;
-    int A, B, C;
-    uint8_t *bsrc;
-    int slice_start, slice_height;
-    const int cmask = ~rmode;
-
-    for (slice = 0; slice < slices; slice++) {
-        slice_start  = ((slice * height) / slices) & cmask;
-        slice_height = ((((slice + 1) * height) / slices) & cmask) -
-                       slice_start;
-        if (!slice_height)
-            continue;
-
-        bsrc = src + slice_start * stride;
-
-        // first line - left neighbour prediction
-        bsrc[0] += 0x80;
-        A = bsrc[0];
-        for (i = step; i < width * step; i += step) {
-            bsrc[i] += A;
-            A        = bsrc[i];
-        }
-        bsrc += stride;
-        if (slice_height == 1)
-            continue;
-        // second line - first element has top prediction, the rest uses median
-        C        = bsrc[-stride];
-        bsrc[0] += C;
-        A        = bsrc[0];
-        for (i = step; i < width * step; i += step) {
-            B        = bsrc[i - stride];
-            bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
-            C        = B;
-            A        = bsrc[i];
-        }
-        bsrc += stride;
-        // the rest of lines use continuous median prediction
-        for (j = 2; j < slice_height; j++) {
-            for (i = 0; i < width * step; i += step) {
-                B        = bsrc[i - stride];
-                bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
-                C        = B;
-                A        = bsrc[i];
-            }
-            bsrc += stride;
-        }
-    }
-}
-
-/* UtVideo interlaced mode treats every two lines as a single one,
- * so restoring function should take care of possible padding between
- * two parts of the same "line".
- */
-static void restore_median_packed_il(uint8_t *src, int step, ptrdiff_t stride,
-                                     int width, int height,
-                                     int slices, int rmode)
-{
-    int i, j, slice;
-    int A, B, C;
-    uint8_t *bsrc;
-    int slice_start, slice_height;
-    const int cmask   = ~(rmode ? 3 : 1);
-    const ptrdiff_t stride2 = stride << 1;
-
-    for (slice = 0; slice < slices; slice++) {
-        slice_start    = ((slice * height) / slices) & cmask;
-        slice_height   = ((((slice + 1) * height) / slices) & cmask) -
-                         slice_start;
-        slice_height >>= 1;
-        if (!slice_height)
-            continue;
-
-        bsrc = src + slice_start * stride;
-
-        // first line - left neighbour prediction
-        bsrc[0] += 0x80;
-        A        = bsrc[0];
-        for (i = step; i < width * step; i += step) {
-            bsrc[i] += A;
-            A        = bsrc[i];
-        }
-        for (i = 0; i < width * step; i += step) {
-            bsrc[stride + i] += A;
-            A                 = bsrc[stride + i];
-        }
-        bsrc += stride2;
-        if (slice_height == 1)
-            continue;
-        // second line - first element has top prediction, the rest uses median
-        C        = bsrc[-stride2];
-        bsrc[0] += C;
-        A        = bsrc[0];
-        for (i = step; i < width * step; i += step) {
-            B        = bsrc[i - stride2];
-            bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
-            C        = B;
-            A        = bsrc[i];
-        }
-        for (i = 0; i < width * step; i += step) {
-            B                 = bsrc[i - stride];
-            bsrc[stride + i] += mid_pred(A, B, (uint8_t)(A + B - C));
-            C                 = B;
-            A                 = bsrc[stride + i];
-        }
-        bsrc += stride2;
-        // the rest of lines use continuous median prediction
-        for (j = 2; j < slice_height; j++) {
-            for (i = 0; i < width * step; i += step) {
-                B        = bsrc[i - stride2];
-                bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
-                C        = B;
-                A        = bsrc[i];
-            }
-            for (i = 0; i < width * step; i += step) {
-                B                 = bsrc[i - stride];
-                bsrc[i + stride] += mid_pred(A, B, (uint8_t)(A + B - C));
-                C                 = B;
-                A                 = bsrc[i + stride];
-            }
-            bsrc += stride2;
-        }
-    }
-}
-
 static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t stride,
                                     int width, int height, int slices, int rmode)
 {
@@ -621,6 +509,7 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
     uint8_t *bsrc;
     int slice_start, slice_height;
     const int cmask = ~rmode;
+    int min_width = FFMIN(width, 32);
 
     for (slice = 0; slice < slices; slice++) {
         slice_start  = ((slice * height) / slices) & cmask;
@@ -633,19 +522,21 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
 
         // first line - left neighbour prediction
         bsrc[0] += 0x80;
-        c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0);
+        c->llviddsp.add_left_pred(bsrc, bsrc, width, 0);
         bsrc += stride;
         if (slice_height <= 1)
             continue;
         for (j = 1; j < slice_height; j++) {
             // second line - first element has top prediction, the rest uses gradient
             bsrc[0] = (bsrc[0] + bsrc[-stride]) & 0xFF;
-            for (i = 1; i < width; i++) {
+            for (i = 1; i < min_width; i++) { /* dsp need align 32 */
                 A = bsrc[i - stride];
                 B = bsrc[i - (stride + 1)];
                 C = bsrc[i - 1];
                 bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
             }
+            if (width > 32)
+                c->llviddsp.add_gradient_pred(bsrc + 32, stride, width - 32);
             bsrc += stride;
         }
     }
@@ -660,6 +551,7 @@ static void restore_gradient_planar_il(UtvideoContext *c, uint8_t *src, ptrdiff_
     int slice_start, slice_height;
     const int cmask   = ~(rmode ? 3 : 1);
     const ptrdiff_t stride2 = stride << 1;
+    int min_width = FFMIN(width, 32);
 
     for (slice = 0; slice < slices; slice++) {
         slice_start    = ((slice * height) / slices) & cmask;
@@ -673,20 +565,23 @@ static void restore_gradient_planar_il(UtvideoContext *c, uint8_t *src, ptrdiff_
 
         // first line - left neighbour prediction
         bsrc[0] += 0x80;
-        A = c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0);
-        c->hdspdec.add_hfyu_left_pred(bsrc + stride, bsrc + stride, width, A);
+        A = c->llviddsp.add_left_pred(bsrc, bsrc, width, 0);
+        c->llviddsp.add_left_pred(bsrc + stride, bsrc + stride, width, A);
         bsrc += stride2;
         if (slice_height <= 1)
             continue;
         for (j = 1; j < slice_height; j++) {
             // second line - first element has top prediction, the rest uses gradient
             bsrc[0] = (bsrc[0] + bsrc[-stride2]) & 0xFF;
-            for (i = 1; i < width; i++) {
+            for (i = 1; i < min_width; i++) { /* dsp need align 32 */
                 A = bsrc[i - stride2];
                 B = bsrc[i - (stride2 + 1)];
                 C = bsrc[i - 1];
                 bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
             }
+            if (width > 32)
+                c->llviddsp.add_gradient_pred(bsrc + 32, stride2, width - 32);
+
             A = bsrc[-stride];
             B = bsrc[-(1 + stride + stride - width)];
             C = bsrc[width - 1];
@@ -702,108 +597,6 @@ static void restore_gradient_planar_il(UtvideoContext *c, uint8_t *src, ptrdiff_
     }
 }
 
-static void restore_gradient_packed(uint8_t *src, int step, ptrdiff_t stride,
-                                    int width, int height, int slices, int rmode)
-{
-    int i, j, slice;
-    int A, B, C;
-    uint8_t *bsrc;
-    int slice_start, slice_height;
-    const int cmask = ~rmode;
-
-    for (slice = 0; slice < slices; slice++) {
-        slice_start  = ((slice * height) / slices) & cmask;
-        slice_height = ((((slice + 1) * height) / slices) & cmask) -
-                       slice_start;
-
-        if (!slice_height)
-            continue;
-        bsrc = src + slice_start * stride;
-
-        // first line - left neighbour prediction
-        bsrc[0] += 0x80;
-        A = bsrc[0];
-        for (i = step; i < width * step; i += step) {
-            bsrc[i] += A;
-            A        = bsrc[i];
-        }
-        bsrc += stride;
-        if (slice_height <= 1)
-            continue;
-        for (j = 1; j < slice_height; j++) {
-            // second line - first element has top prediction, the rest uses gradient
-            C        = bsrc[-stride];
-            bsrc[0] += C;
-            for (i = step; i < width * step; i += step) {
-                A = bsrc[i - stride];
-                B = bsrc[i - (stride + step)];
-                C = bsrc[i - step];
-                bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
-            }
-            bsrc += stride;
-        }
-    }
-}
-
-static void restore_gradient_packed_il(uint8_t *src, int step, ptrdiff_t stride,
-                                       int width, int height, int slices, int rmode)
-{
-    int i, j, slice;
-    int A, B, C;
-    uint8_t *bsrc;
-    int slice_start, slice_height;
-    const int cmask   = ~(rmode ? 3 : 1);
-    const ptrdiff_t stride2 = stride << 1;
-
-    for (slice = 0; slice < slices; slice++) {
-        slice_start    = ((slice * height) / slices) & cmask;
-        slice_height   = ((((slice + 1) * height) / slices) & cmask) -
-                         slice_start;
-        slice_height >>= 1;
-        if (!slice_height)
-            continue;
-
-        bsrc = src + slice_start * stride;
-
-        // first line - left neighbour prediction
-        bsrc[0] += 0x80;
-        A        = bsrc[0];
-        for (i = step; i < width * step; i += step) {
-            bsrc[i] += A;
-            A        = bsrc[i];
-        }
-        for (i = 0; i < width * step; i += step) {
-            bsrc[stride + i] += A;
-            A                 = bsrc[stride + i];
-        }
-        bsrc += stride2;
-        if (slice_height <= 1)
-            continue;
-        for (j = 1; j < slice_height; j++) {
-            // second line - first element has top prediction, the rest uses gradient
-            C        = bsrc[-stride2];
-            bsrc[0] += C;
-            for (i = step; i < width * step; i += step) {
-                A = bsrc[i - stride2];
-                B = bsrc[i - (stride2 + step)];
-                C = bsrc[i - step];
-                bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
-            }
-            A = bsrc[-stride];
-            B = bsrc[-(step + stride + stride - width * step)];
-            C = bsrc[width * step - step];
-            bsrc[stride] = (A - B + C + bsrc[stride]) & 0xFF;
-            for (i = step; i < width * step; i += step) {
-                A = bsrc[i - stride];
-                B = bsrc[i - (step + stride)];
-                C = bsrc[i - step + stride];
-                bsrc[i + stride] = (A - B + C + bsrc[i + stride]) & 0xFF;
-            }
-            bsrc += stride2;
-        }
-    }
-}
-
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
@@ -817,16 +610,63 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     GetByteContext gb;
     ThreadFrame frame = { .f = data };
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
-
-    ff_thread_finish_setup(avctx);
 
     /* parse plane structure to get frame flags and validate slice offsets */
     bytestream2_init(&gb, buf, buf_size);
-    if (c->pro) {
+
+    if (c->pack) {
+        const uint8_t *packed_stream;
+        const uint8_t *control_stream;
+        GetByteContext pb;
+        uint32_t nb_cbs;
+        int left;
+
+        c->frame_info = PRED_GRADIENT << 8;
+
+        if (bytestream2_get_byte(&gb) != 1)
+            return AVERROR_INVALIDDATA;
+        bytestream2_skip(&gb, 3);
+        c->offset = bytestream2_get_le32(&gb);
+
+        if (buf_size <= c->offset + 8LL)
+            return AVERROR_INVALIDDATA;
+
+        bytestream2_init(&pb, buf + 8 + c->offset, buf_size - 8 - c->offset);
+
+        nb_cbs = bytestream2_get_le32(&pb);
+        if (nb_cbs > c->offset)
+            return AVERROR_INVALIDDATA;
+
+        packed_stream = buf + 8;
+        control_stream = packed_stream + (c->offset - nb_cbs);
+        left = control_stream - packed_stream;
+
+        for (i = 0; i < c->planes; i++) {
+            for (j = 0; j < c->slices; j++) {
+                c->packed_stream[i][j] = packed_stream;
+                c->packed_stream_size[i][j] = bytestream2_get_le32(&pb);
+                if (c->packed_stream_size[i][j] > left)
+                    return AVERROR_INVALIDDATA;
+                left -= c->packed_stream_size[i][j];
+                packed_stream += c->packed_stream_size[i][j];
+            }
+        }
+
+        left = buf + buf_size - control_stream;
+
+        for (i = 0; i < c->planes; i++) {
+            for (j = 0; j < c->slices; j++) {
+                c->control_stream[i][j] = control_stream;
+                c->control_stream_size[i][j] = bytestream2_get_le32(&pb);
+                if (c->control_stream_size[i][j] > left)
+                    return AVERROR_INVALIDDATA;
+                left -= c->control_stream_size[i][j];
+                control_stream += c->control_stream_size[i][j];
+            }
+        }
+    } else if (c->pro) {
         if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) {
             av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n");
             return AVERROR_INVALIDDATA;
@@ -844,7 +684,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             for (j = 0; j < c->slices; j++) {
                 slice_end   = bytestream2_get_le32u(&gb);
                 if (slice_end < 0 || slice_end < slice_start ||
-                    bytestream2_get_bytes_left(&gb) < slice_end) {
+                    bytestream2_get_bytes_left(&gb) < slice_end + 1024LL) {
                     av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n");
                     return AVERROR_INVALIDDATA;
                 }
@@ -893,56 +733,60 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     c->frame_pred = (c->frame_info >> 8) & 3;
 
-    av_fast_malloc(&c->slice_bits, &c->slice_bits_size,
-                   max_slice_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    max_slice_size += 4*avctx->width;
+
+    if (!c->pack) {
+        av_fast_malloc(&c->slice_bits, &c->slice_bits_size,
+                       max_slice_size + AV_INPUT_BUFFER_PADDING_SIZE);
 
-    if (!c->slice_bits) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
-        return AVERROR(ENOMEM);
+        if (!c->slice_bits) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
+            return AVERROR(ENOMEM);
+        }
     }
 
     switch (c->avctx->pix_fmt) {
-    case AV_PIX_FMT_RGB24:
-    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRAP:
         for (i = 0; i < c->planes; i++) {
-            ret = decode_plane(c, i, frame.f->data[0] + ff_ut_rgb_order[i],
-                               c->planes, frame.f->linesize[0], avctx->width,
+            ret = decode_plane(c, i, frame.f->data[i],
+                               frame.f->linesize[i], avctx->width,
                                avctx->height, plane_start[i],
                                c->frame_pred == PRED_LEFT);
             if (ret)
                 return ret;
             if (c->frame_pred == PRED_MEDIAN) {
                 if (!c->interlaced) {
-                    restore_median_packed(frame.f->data[0] + ff_ut_rgb_order[i],
-                                          c->planes, frame.f->linesize[0], avctx->width,
+                    restore_median_planar(c, frame.f->data[i],
+                                          frame.f->linesize[i], avctx->width,
                                           avctx->height, c->slices, 0);
                 } else {
-                    restore_median_packed_il(frame.f->data[0] + ff_ut_rgb_order[i],
-                                             c->planes, frame.f->linesize[0],
+                    restore_median_planar_il(c, frame.f->data[i],
+                                             frame.f->linesize[i],
                                              avctx->width, avctx->height, c->slices,
                                              0);
                 }
             } else if (c->frame_pred == PRED_GRADIENT) {
                 if (!c->interlaced) {
-                    restore_gradient_packed(frame.f->data[0] + ff_ut_rgb_order[i],
-                                            c->planes, frame.f->linesize[0],
-                                            avctx->width, avctx->height,
-                                            c->slices, 0);
+                    restore_gradient_planar(c, frame.f->data[i],
+                                            frame.f->linesize[i], avctx->width,
+                                            avctx->height, c->slices, 0);
                 } else {
-                    restore_gradient_packed_il(frame.f->data[0] + ff_ut_rgb_order[i],
-                                               c->planes, frame.f->linesize[0],
-                                               avctx->width, avctx->height,
-                                               c->slices, 0);
+                    restore_gradient_planar_il(c, frame.f->data[i],
+                                               frame.f->linesize[i],
+                                               avctx->width, avctx->height, c->slices,
+                                               0);
                 }
             }
         }
-        restore_rgb_planes(frame.f->data[0], c->planes, frame.f->linesize[0],
-                           avctx->width, avctx->height);
+        c->utdsp.restore_rgb_planes(frame.f->data[2], frame.f->data[0], frame.f->data[1],
+                                    frame.f->linesize[2], frame.f->linesize[0], frame.f->linesize[1],
+                                    avctx->width, avctx->height);
         break;
     case AV_PIX_FMT_GBRAP10:
     case AV_PIX_FMT_GBRP10:
         for (i = 0; i < c->planes; i++) {
-            ret = decode_plane10(c, i, (uint16_t *)frame.f->data[i], 1,
+            ret = decode_plane10(c, i, (uint16_t *)frame.f->data[i],
                                  frame.f->linesize[i] / 2, avctx->width,
                                  avctx->height, plane_start[i],
                                  plane_start[i + 1] - 1024,
@@ -950,11 +794,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             if (ret)
                 return ret;
         }
-        restore_rgb_planes10(frame.f, avctx->width, avctx->height);
+        c->utdsp.restore_rgb_planes10((uint16_t *)frame.f->data[2], (uint16_t *)frame.f->data[0], (uint16_t *)frame.f->data[1],
+                                      frame.f->linesize[2] / 2, frame.f->linesize[0] / 2, frame.f->linesize[1] / 2,
+                                      avctx->width, avctx->height);
         break;
     case AV_PIX_FMT_YUV420P:
         for (i = 0; i < 3; i++) {
-            ret = decode_plane(c, i, frame.f->data[i], 1, frame.f->linesize[i],
+            ret = decode_plane(c, i, frame.f->data[i], frame.f->linesize[i],
                                avctx->width >> !!i, avctx->height >> !!i,
                                plane_start[i], c->frame_pred == PRED_LEFT);
             if (ret)
@@ -973,8 +819,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             } else if (c->frame_pred == PRED_GRADIENT) {
                 if (!c->interlaced) {
                     restore_gradient_planar(c, frame.f->data[i], frame.f->linesize[i],
-                                            avctx->width >> !!i,
-                                            avctx->height >> !!i,
+                                            avctx->width >> !!i, avctx->height >> !!i,
                                             c->slices, !i);
                 } else {
                     restore_gradient_planar_il(c, frame.f->data[i], frame.f->linesize[i],
@@ -987,7 +832,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         break;
     case AV_PIX_FMT_YUV422P:
         for (i = 0; i < 3; i++) {
-            ret = decode_plane(c, i, frame.f->data[i], 1, frame.f->linesize[i],
+            ret = decode_plane(c, i, frame.f->data[i], frame.f->linesize[i],
                                avctx->width >> !!i, avctx->height,
                                plane_start[i], c->frame_pred == PRED_LEFT);
             if (ret)
@@ -1017,7 +862,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         break;
     case AV_PIX_FMT_YUV444P:
         for (i = 0; i < 3; i++) {
-            ret = decode_plane(c, i, frame.f->data[i], 1, frame.f->linesize[i],
+            ret = decode_plane(c, i, frame.f->data[i], frame.f->linesize[i],
                                avctx->width, avctx->height,
                                plane_start[i], c->frame_pred == PRED_LEFT);
             if (ret)
@@ -1047,7 +892,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         break;
     case AV_PIX_FMT_YUV422P10:
         for (i = 0; i < 3; i++) {
-            ret = decode_plane10(c, i, (uint16_t *)frame.f->data[i], 1, frame.f->linesize[i] / 2,
+            ret = decode_plane10(c, i, (uint16_t *)frame.f->data[i], frame.f->linesize[i] / 2,
                                  avctx->width >> !!i, avctx->height,
                                  plane_start[i], plane_start[i + 1] - 1024, c->frame_pred == PRED_LEFT);
             if (ret)
@@ -1069,53 +914,24 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     UtvideoContext * const c = avctx->priv_data;
+    int h_shift, v_shift;
 
     c->avctx = avctx;
 
+    ff_utvideodsp_init(&c->utdsp);
     ff_bswapdsp_init(&c->bdsp);
-    ff_huffyuvdsp_init(&c->hdspdec);
-
-    if (avctx->extradata_size >= 16) {
-        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
-               avctx->extradata[3], avctx->extradata[2],
-               avctx->extradata[1], avctx->extradata[0]);
-        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
-               AV_RB32(avctx->extradata + 4));
-        c->frame_info_size = AV_RL32(avctx->extradata + 8);
-        c->flags           = AV_RL32(avctx->extradata + 12);
-
-        if (c->frame_info_size != 4)
-            avpriv_request_sample(avctx, "Frame info not 4 bytes");
-        av_log(avctx, AV_LOG_DEBUG, "Encoding parameters %08"PRIX32"\n", c->flags);
-        c->slices      = (c->flags >> 24) + 1;
-        c->compression = c->flags & 1;
-        c->interlaced  = c->flags & 0x800;
-    } else if (avctx->extradata_size == 8) {
-        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
-               avctx->extradata[3], avctx->extradata[2],
-               avctx->extradata[1], avctx->extradata[0]);
-        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
-               AV_RB32(avctx->extradata + 4));
-        c->interlaced  = 0;
-        c->pro         = 1;
-        c->frame_info_size = 4;
-    } else {
-        av_log(avctx, AV_LOG_ERROR,
-               "Insufficient extradata size %d, should be at least 16\n",
-               avctx->extradata_size);
-        return AVERROR_INVALIDDATA;
-    }
+    ff_llviddsp_init(&c->llviddsp);
 
     c->slice_bits_size = 0;
 
     switch (avctx->codec_tag) {
     case MKTAG('U', 'L', 'R', 'G'):
         c->planes      = 3;
-        avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        avctx->pix_fmt = AV_PIX_FMT_GBRP;
         break;
     case MKTAG('U', 'L', 'R', 'A'):
         c->planes      = 4;
-        avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP;
         break;
     case MKTAG('U', 'L', 'Y', '0'):
         c->planes      = 3;
@@ -1134,14 +950,17 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case MKTAG('U', 'Q', 'Y', '2'):
         c->planes      = 3;
+        c->pro         = 1;
         avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
         break;
     case MKTAG('U', 'Q', 'R', 'G'):
         c->planes      = 3;
+        c->pro         = 1;
         avctx->pix_fmt = AV_PIX_FMT_GBRP10;
         break;
     case MKTAG('U', 'Q', 'R', 'A'):
         c->planes      = 4;
+        c->pro         = 1;
         avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
         break;
     case MKTAG('U', 'L', 'H', '0'):
@@ -1159,12 +978,93 @@ static av_cold int decode_init(AVCodecContext *avctx)
         avctx->pix_fmt = AV_PIX_FMT_YUV444P;
         avctx->colorspace = AVCOL_SPC_BT709;
         break;
+    case MKTAG('U', 'M', 'Y', '2'):
+        c->planes      = 3;
+        c->pack        = 1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        avctx->colorspace = AVCOL_SPC_BT470BG;
+        break;
+    case MKTAG('U', 'M', 'H', '2'):
+        c->planes      = 3;
+        c->pack        = 1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        avctx->colorspace = AVCOL_SPC_BT709;
+        break;
+    case MKTAG('U', 'M', 'Y', '4'):
+        c->planes      = 3;
+        c->pack        = 1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        avctx->colorspace = AVCOL_SPC_BT470BG;
+        break;
+    case MKTAG('U', 'M', 'H', '4'):
+        c->planes      = 3;
+        c->pack        = 1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        avctx->colorspace = AVCOL_SPC_BT709;
+        break;
+    case MKTAG('U', 'M', 'R', 'G'):
+        c->planes      = 3;
+        c->pack        = 1;
+        avctx->pix_fmt = AV_PIX_FMT_GBRP;
+        break;
+    case MKTAG('U', 'M', 'R', 'A'):
+        c->planes      = 4;
+        c->pack        = 1;
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+        break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unknown Ut Video FOURCC provided (%08X)\n",
                avctx->codec_tag);
         return AVERROR_INVALIDDATA;
     }
 
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &h_shift, &v_shift);
+    if ((avctx->width  & ((1<<h_shift)-1)) ||
+        (avctx->height & ((1<<v_shift)-1))) {
+        avpriv_request_sample(avctx, "Odd dimensions");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (c->pack && avctx->extradata_size >= 16) {
+        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
+               avctx->extradata[3], avctx->extradata[2],
+               avctx->extradata[1], avctx->extradata[0]);
+        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
+               AV_RB32(avctx->extradata + 4));
+        c->compression = avctx->extradata[8];
+        if (c->compression != 2)
+            avpriv_request_sample(avctx, "Unknown compression type");
+        c->slices      = avctx->extradata[9] + 1;
+    } else if (!c->pro && avctx->extradata_size >= 16) {
+        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
+               avctx->extradata[3], avctx->extradata[2],
+               avctx->extradata[1], avctx->extradata[0]);
+        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
+               AV_RB32(avctx->extradata + 4));
+        c->frame_info_size = AV_RL32(avctx->extradata + 8);
+        c->flags           = AV_RL32(avctx->extradata + 12);
+
+        if (c->frame_info_size != 4)
+            avpriv_request_sample(avctx, "Frame info not 4 bytes");
+        av_log(avctx, AV_LOG_DEBUG, "Encoding parameters %08"PRIX32"\n", c->flags);
+        c->slices      = (c->flags >> 24) + 1;
+        c->compression = c->flags & 1;
+        c->interlaced  = c->flags & 0x800;
+    } else if (c->pro && avctx->extradata_size == 8) {
+        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
+               avctx->extradata[3], avctx->extradata[2],
+               avctx->extradata[1], avctx->extradata[0]);
+        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
+               AV_RB32(avctx->extradata + 4));
+        c->interlaced  = 0;
+        c->frame_info_size = 4;
+    } else {
+        av_log(avctx, AV_LOG_ERROR,
+               "Insufficient extradata size %d, should be at least 16\n",
+               avctx->extradata_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
diff --git a/libavcodec/utvideodsp.c b/libavcodec/utvideodsp.c
new file mode 100644
index 0000000..0831a6b
--- /dev/null
+++ b/libavcodec/utvideodsp.c
@@ -0,0 +1,82 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "utvideodsp.h"
+
+static void restore_rgb_planes_c(uint8_t *src_r,
+                                 uint8_t *src_g,
+                                 uint8_t *src_b,
+                                 ptrdiff_t linesize_r,
+                                 ptrdiff_t linesize_g,
+                                 ptrdiff_t linesize_b,
+                                 int width, int height)
+{
+    uint8_t r, g, b;
+    int i, j;
+
+    for (j = 0; j < height; j++) {
+        for (i = 0; i < width; i++) {
+            r = src_r[i];
+            g = src_g[i];
+            b = src_b[i];
+            src_r[i] = r + g - 0x80;
+            src_b[i] = b + g - 0x80;
+        }
+        src_r += linesize_r;
+        src_g += linesize_g;
+        src_b += linesize_b;
+    }
+}
+
+static void restore_rgb_planes10_c(uint16_t *src_r,
+                                   uint16_t *src_g,
+                                   uint16_t *src_b,
+                                   ptrdiff_t linesize_r,
+                                   ptrdiff_t linesize_g,
+                                   ptrdiff_t linesize_b,
+                                   int width, int height)
+{
+    int r, g, b;
+    int i, j;
+
+    for (j = 0; j < height; j++) {
+        for (i = 0; i < width; i++) {
+            r = src_r[i];
+            g = src_g[i];
+            b = src_b[i];
+            src_r[i] = (r + g - 0x200) & 0x3FF;
+            src_b[i] = (b + g - 0x200) & 0x3FF;
+        }
+        src_r += linesize_r;
+        src_g += linesize_g;
+        src_b += linesize_b;
+    }
+}
+
+av_cold void ff_utvideodsp_init(UTVideoDSPContext *c)
+{
+    c->restore_rgb_planes   = restore_rgb_planes_c;
+    c->restore_rgb_planes10 = restore_rgb_planes10_c;
+
+    if (ARCH_X86)
+        ff_utvideodsp_init_x86(c);
+}
diff --git a/libavcodec/utvideodsp.h b/libavcodec/utvideodsp.h
new file mode 100644
index 0000000..a3d2550
--- /dev/null
+++ b/libavcodec/utvideodsp.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_UTVIDEODSP_H
+#define AVCODEC_UTVIDEODSP_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavutil/pixfmt.h"
+#include "config.h"
+
+typedef struct UTVideoDSPContext {
+    void (*restore_rgb_planes)(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
+                               ptrdiff_t linesize_r, ptrdiff_t linesize_g,
+                               ptrdiff_t linesize_b, int width, int height);
+    void (*restore_rgb_planes10)(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
+                                 ptrdiff_t linesize_r, ptrdiff_t linesize_g,
+                                 ptrdiff_t linesize_b, int width, int height);
+} UTVideoDSPContext;
+
+void ff_utvideodsp_init(UTVideoDSPContext *c);
+void ff_utvideodsp_init_x86(UTVideoDSPContext *c);
+
+#endif /* AVCODEC_UTVIDEODSP_H */
diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c
index ef51ed0..db00e1e 100644
--- a/libavcodec/utvideoenc.c
+++ b/libavcodec/utvideoenc.c
@@ -2,20 +2,20 @@
  * Ut Video encoder
  * Copyright (c) 2012 Jan Ekström
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,6 @@
 #include "bswapdsp.h"
 #include "bytestream.h"
 #include "put_bits.h"
-#include "huffyuvencdsp.h"
 #include "mathops.h"
 #include "utvideo.h"
 #include "huffman.h"
@@ -68,15 +67,16 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
     c->slice_stride    = FFALIGN(avctx->width, 32);
 
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_GBRP:
         c->planes        = 3;
         avctx->codec_tag = MKTAG('U', 'L', 'R', 'G');
         original_format  = UTVIDEO_RGB;
         break;
-    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_GBRAP:
         c->planes        = 4;
         avctx->codec_tag = MKTAG('U', 'L', 'R', 'A');
         original_format  = UTVIDEO_RGBA;
+        avctx->bits_per_coded_sample = 32;
         break;
     case AV_PIX_FMT_YUV420P:
         if (avctx->width & 1 || avctx->height & 1) {
@@ -104,6 +104,14 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
             avctx->codec_tag = MKTAG('U', 'L', 'Y', '2');
         original_format  = UTVIDEO_422;
         break;
+    case AV_PIX_FMT_YUV444P:
+        c->planes        = 3;
+        if (avctx->colorspace == AVCOL_SPC_BT709)
+            avctx->codec_tag = MKTAG('U', 'L', 'H', '4');
+        else
+            avctx->codec_tag = MKTAG('U', 'L', 'Y', '4');
+        original_format  = UTVIDEO_444;
+        break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unknown pixel format: %d\n",
                avctx->pix_fmt);
@@ -111,7 +119,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
     }
 
     ff_bswapdsp_init(&c->bdsp);
-    ff_huffyuvencdsp_init(&c->hdsp);
+    ff_llvidencdsp_init(&c->llvidencdsp);
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -235,55 +243,48 @@ FF_ENABLE_DEPRECATION_WARNINGS
 }
 
 static void mangle_rgb_planes(uint8_t *dst[4], ptrdiff_t dst_stride,
-                              uint8_t *src, int step, ptrdiff_t stride,
+                              uint8_t *const src[4], int planes, const int stride[4],
                               int width, int height)
 {
     int i, j;
     int k = 2 * dst_stride;
+    const uint8_t *sg = src[0];
+    const uint8_t *sb = src[1];
+    const uint8_t *sr = src[2];
+    const uint8_t *sa = src[3];
     unsigned int g;
 
     for (j = 0; j < height; j++) {
-        if (step == 3) {
-            for (i = 0; i < width * step; i += step) {
-                g         = src[i + 1];
+        if (planes == 3) {
+            for (i = 0; i < width; i++) {
+                g         = sg[i];
                 dst[0][k] = g;
                 g        += 0x80;
-                dst[1][k] = src[i + 2] - g;
-                dst[2][k] = src[i + 0] - g;
+                dst[1][k] = sb[i] - g;
+                dst[2][k] = sr[i] - g;
                 k++;
             }
         } else {
-            for (i = 0; i < width * step; i += step) {
-                g         = src[i + 1];
+            for (i = 0; i < width; i++) {
+                g         = sg[i];
                 dst[0][k] = g;
                 g        += 0x80;
-                dst[1][k] = src[i + 2] - g;
-                dst[2][k] = src[i + 0] - g;
-                dst[3][k] = src[i + 3];
+                dst[1][k] = sb[i] - g;
+                dst[2][k] = sr[i] - g;
+                dst[3][k] = sa[i];
                 k++;
             }
+            sa += stride[3];
         }
         k += dst_stride - width;
-        src += stride;
+        sg += stride[0];
+        sb += stride[1];
+        sr += stride[2];
     }
 }
 
-/* Write data to a plane with left prediction */
-static void left_predict(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                         int width, int height)
-{
-    int i, j;
-    uint8_t prev;
-
-    prev = 0x80; /* Set the initial value */
-    for (j = 0; j < height; j++) {
-        for (i = 0; i < width; i++) {
-            *dst++ = src[i] - prev;
-            prev   = src[i];
-        }
-        src += stride;
-    }
-}
+#undef A
+#undef B
 
 /* Write data to a plane with median prediction */
 static void median_predict(UtvideoContext *c, uint8_t *src, uint8_t *dst,
@@ -313,7 +314,7 @@ static void median_predict(UtvideoContext *c, uint8_t *src, uint8_t *dst,
 
     /* Rest of the coded part uses median prediction */
     for (j = 1; j < height; j++) {
-        c->hdsp.sub_hfyu_median_pred(dst, src - stride, src, width, &A, &B);
+        c->llvidencdsp.sub_median_pred(dst, src - stride, src, width, &A, &B);
         dst += width;
         src += stride;
     }
@@ -388,7 +389,7 @@ static int write_huff_codes(uint8_t *src, uint8_t *dst, int dst_size,
 }
 
 static int encode_plane(AVCodecContext *avctx, uint8_t *src,
-                        uint8_t *dst, ptrdiff_t stride,
+                        uint8_t *dst, ptrdiff_t stride, int plane_no,
                         int width, int height, PutByteContext *pb)
 {
     UtvideoContext *c        = avctx->priv_data;
@@ -398,15 +399,17 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     HuffEntry he[256];
 
     uint32_t offset = 0, slice_len = 0;
+    const int cmask = ~(!plane_no && avctx->pix_fmt == AV_PIX_FMT_YUV420P);
     int      i, sstart, send = 0;
     int      symbol;
+    int      ret;
 
     /* Do prediction / make planes */
     switch (c->frame_pred) {
     case PRED_NONE:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             av_image_copy_plane(dst + sstart * width, width,
                                 src + sstart * stride, stride,
                                 width, send - sstart);
@@ -415,15 +418,14 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     case PRED_LEFT:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
-            left_predict(src + sstart * stride, dst + sstart * width,
-                         stride, width, send - sstart);
+            send   = height * (i + 1) / c->slices & cmask;
+            c->llvidencdsp.sub_left_predict(dst + sstart * width, src + sstart * stride, stride, width, send - sstart);
         }
         break;
     case PRED_MEDIAN:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             median_predict(c, src + sstart * stride, dst + sstart * width,
                            stride, width, send - sstart);
         }
@@ -442,7 +444,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
         /* If non-zero count is found, see if it matches width * height */
         if (counts[symbol]) {
             /* Special case if only one symbol was used */
-            if (counts[symbol] == width * height) {
+            if (counts[symbol] == width * (int64_t)height) {
                 /*
                  * Write a zero for the single symbol
                  * used in the plane, else 0xFF.
@@ -466,7 +468,8 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     }
 
     /* Calculate huffman lengths */
-    ff_huff_gen_len_table(lengths, counts);
+    if ((ret = ff_huff_gen_len_table(lengths, counts, 256, 1)) < 0)
+        return ret;
 
     /*
      * Write the plane's header into the output packet:
@@ -486,14 +489,14 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     send = 0;
     for (i = 0; i < c->slices; i++) {
         sstart  = send;
-        send    = height * (i + 1) / c->slices;
+        send    = height * (i + 1) / c->slices & cmask;
 
         /*
          * Write the huffman codes to a buffer,
          * get the offset in bits and convert to bytes.
          */
         offset += write_huff_codes(dst + sstart * width, c->slice_bits,
-                                   width * (send - sstart), width,
+                                   width * height + 4, width,
                                    send - sstart, he) >> 3;
 
         slice_len = offset - slice_len;
@@ -540,22 +543,17 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int i, ret = 0;
 
     /* Allocate a new packet if needed, and set it to the pointer dst */
-    ret = ff_alloc_packet(pkt, (256 + 4 * c->slices + width * height) *
-                          c->planes + 4);
+    ret = ff_alloc_packet2(avctx, pkt, (256 + 4 * c->slices + width * height) *
+                           c->planes + 4, 0);
 
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Error allocating the output packet, or the provided packet "
-               "was too small.\n");
+    if (ret < 0)
         return ret;
-    }
 
     dst = pkt->data;
 
     bytestream2_init_writer(&pb, dst, pkt->size);
 
-    av_fast_malloc(&c->slice_bits, &c->slice_bits_size,
-                   width * height + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&c->slice_bits, &c->slice_bits_size, width * height + 4);
 
     if (!c->slice_bits) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer 2.\n");
@@ -563,17 +561,17 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     /* In case of RGB, mangle the planes to Ut Video's format */
-    if (avctx->pix_fmt == AV_PIX_FMT_RGBA || avctx->pix_fmt == AV_PIX_FMT_RGB24)
-        mangle_rgb_planes(c->slice_buffer, c->slice_stride, pic->data[0],
-                          c->planes, pic->linesize[0], width, height);
+    if (avctx->pix_fmt == AV_PIX_FMT_GBRAP || avctx->pix_fmt == AV_PIX_FMT_GBRP)
+        mangle_rgb_planes(c->slice_buffer, c->slice_stride, pic->data,
+                          c->planes, pic->linesize, width, height);
 
     /* Deal with the planes */
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGB24:
-    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRAP:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, c->slice_buffer[i] + 2 * c->slice_stride,
-                               c->slice_buffer[i], c->slice_stride,
+                               c->slice_buffer[i], c->slice_stride, i,
                                width, height, &pb);
 
             if (ret) {
@@ -582,10 +580,21 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             }
         }
         break;
+    case AV_PIX_FMT_YUV444P:
+        for (i = 0; i < c->planes; i++) {
+            ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
+                               pic->linesize[i], i, width, height, &pb);
+
+            if (ret) {
+                av_log(avctx, AV_LOG_ERROR, "Error encoding plane %d.\n", i);
+                return ret;
+            }
+        }
+        break;
     case AV_PIX_FMT_YUV422P:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
-                               pic->linesize[i], width >> !!i, height, &pb);
+                               pic->linesize[i], i, width >> !!i, height, &pb);
 
             if (ret) {
                 av_log(avctx, AV_LOG_ERROR, "Error encoding plane %d.\n", i);
@@ -596,7 +605,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV420P:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
-                               pic->linesize[i], width >> !!i, height >> !!i,
+                               pic->linesize[i], i, width >> !!i, height >> !!i,
                                &pb);
 
             if (ret) {
@@ -668,8 +677,9 @@ AVCodec ff_utvideo_encoder = {
     .init           = utvideo_encode_init,
     .encode2        = utvideo_encode_frame,
     .close          = utvideo_encode_close,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
-                          AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_YUV422P,
-                          AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE
+                          AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP, AV_PIX_FMT_YUV422P,
+                          AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE
                       },
 };
diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 74301fe..ddc5dbe 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -4,54 +4,106 @@
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste dot coudurier at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "internal.h"
+#include "v210dec.h"
 #include "libavutil/bswap.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
+#include "libavutil/intreadwrite.h"
 
-static av_cold int decode_init(AVCodecContext *avctx)
+#define READ_PIXELS(a, b, c)         \
+    do {                             \
+        val  = av_le2ne32(*src++);   \
+        *a++ =  val & 0x3FF;         \
+        *b++ = (val >> 10) & 0x3FF;  \
+        *c++ = (val >> 20) & 0x3FF;  \
+    } while (0)
+
+static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
 {
-    if (avctx->width & 1) {
-        av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
-        return AVERROR_INVALIDDATA;
+    uint32_t val;
+    int i;
+
+    for( i = 0; i < width-5; i += 6 ){
+        READ_PIXELS(u, y, v);
+        READ_PIXELS(y, u, y);
+        READ_PIXELS(v, y, u);
+        READ_PIXELS(y, v, y);
     }
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    V210DecContext *s = avctx->priv_data;
+
     avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
     avctx->bits_per_raw_sample = 10;
 
+    s->unpack_frame            = v210_planar_unpack_c;
+
+    if (HAVE_MMX)
+        ff_v210_x86_init(s);
+
     return 0;
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
-    int h, w, ret;
+    V210DecContext *s = avctx->priv_data;
+
+    int h, w, ret, stride, aligned_input;
     AVFrame *pic = data;
     const uint8_t *psrc = avpkt->data;
     uint16_t *y, *u, *v;
-    int aligned_width = ((avctx->width + 47) / 48) * 48;
-    int stride = aligned_width * 8 / 3;
+
+    if (s->custom_stride )
+        stride = s->custom_stride;
+    else {
+        int aligned_width = ((avctx->width + 47) / 48) * 48;
+        stride = aligned_width * 8 / 3;
+    }
 
     if (avpkt->size < stride * avctx->height) {
-        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
-        return AVERROR_INVALIDDATA;
+        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
+            stride = avpkt->size / avctx->height;
+            if (!s->stride_warning_shown)
+                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
+            s->stride_warning_shown = 1;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    if (   avctx->codec_tag == MKTAG('C', '2', '1', '0')
+        && avpkt->size > 64
+        && AV_RN32(psrc) == AV_RN32("INFO")
+        && avpkt->size - 64 >= stride * avctx->height)
+        psrc += 64;
+
+    aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
+    if (aligned_input != s->aligned_input) {
+        s->aligned_input = aligned_input;
+        if (HAVE_MMX)
+            ff_v210_x86_init(s);
     }
 
     if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
@@ -63,55 +115,73 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
 
-#define READ_PIXELS(a, b, c)         \
-    do {                             \
-        val  = av_le2ne32(*src++);   \
-        *a++ =  val & 0x3FF;         \
-        *b++ = (val >> 10) & 0x3FF;  \
-        *c++ = (val >> 20) & 0x3FF;  \
-    } while (0)
-
     for (h = 0; h < avctx->height; h++) {
         const uint32_t *src = (const uint32_t*)psrc;
         uint32_t val;
-        for (w = 0; w < avctx->width - 5; w += 6) {
-            READ_PIXELS(u, y, v);
-            READ_PIXELS(y, u, y);
-            READ_PIXELS(v, y, u);
-            READ_PIXELS(y, v, y);
-        }
+
+        w = (avctx->width / 6) * 6;
+        s->unpack_frame(src, y, u, v, w);
+
+        y += w;
+        u += w >> 1;
+        v += w >> 1;
+        src += (w << 1) / 3;
+
         if (w < avctx->width - 1) {
             READ_PIXELS(u, y, v);
 
             val  = av_le2ne32(*src++);
             *y++ =  val & 0x3FF;
-        }
-        if (w < avctx->width - 3) {
-            *u++ = (val >> 10) & 0x3FF;
-            *y++ = (val >> 20) & 0x3FF;
+            if (w < avctx->width - 3) {
+                *u++ = (val >> 10) & 0x3FF;
+                *y++ = (val >> 20) & 0x3FF;
 
-            val  = av_le2ne32(*src++);
-            *v++ =  val & 0x3FF;
-            *y++ = (val >> 10) & 0x3FF;
+                val  = av_le2ne32(*src++);
+                *v++ =  val & 0x3FF;
+                *y++ = (val >> 10) & 0x3FF;
+            }
         }
 
         psrc += stride;
-        y += pic->linesize[0] / 2 - avctx->width;
+        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
         u += pic->linesize[1] / 2 - avctx->width / 2;
         v += pic->linesize[2] / 2 - avctx->width / 2;
     }
 
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        /* we have interlaced material flagged in container */
+        pic->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            pic->top_field_first = 1;
+    }
+
     *got_frame      = 1;
 
     return avpkt->size;
 }
 
+#define V210DEC_FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+static const AVOption v210dec_options[] = {
+    {"custom_stride", "Custom V210 stride", offsetof(V210DecContext, custom_stride), AV_OPT_TYPE_INT,
+     {.i64 = 0}, INT_MIN, INT_MAX, V210DEC_FLAGS},
+    {NULL}
+};
+
+static const AVClass v210dec_class = {
+    .class_name = "V210 Decoder",
+    .item_name  = av_default_item_name,
+    .option     = v210dec_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_v210_decoder = {
     .name           = "v210",
     .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_V210,
+    .priv_data_size = sizeof(V210DecContext),
     .init           = decode_init,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &v210dec_class,
 };
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
new file mode 100644
index 0000000..533afc4
--- /dev/null
+++ b/libavcodec/v210dec.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210DEC_H
+#define AVCODEC_V210DEC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+
+
+typedef struct {
+    AVClass *av_class;
+    int custom_stride;
+    int aligned_input;
+    int stride_warning_shown;
+    void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+} V210DecContext;
+
+void ff_v210_x86_init(V210DecContext *s);
+
+#endif /* AVCODEC_V210DEC_H */
diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c
index 51c182c..b024806 100644
--- a/libavcodec/v210enc.c
+++ b/libavcodec/v210enc.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste dot coudurier at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -110,6 +110,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     ff_v210enc_init(s);
 
+    avctx->bits_per_coded_sample = 20;
+    avctx->bit_rate = ff_guess_coded_bitrate(avctx) * 16 / 15;
+
     return 0;
 }
 
@@ -120,10 +123,11 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int aligned_width = ((avctx->width + 47) / 48) * 48;
     int stride = aligned_width * 8 / 3;
     int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4;
+    AVFrameSideData *side_data;
     int h, w, ret;
     uint8_t *dst;
 
-    ret = ff_alloc_packet(pkt, avctx->height * stride);
+    ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride, avctx->height * stride);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
         return ret;
@@ -230,6 +234,22 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
+    side_data = av_frame_get_side_data(pic, AV_FRAME_DATA_A53_CC);
+    if (side_data && side_data->size) {
+        uint8_t *buf = av_packet_new_side_data(pkt, AV_PKT_DATA_A53_CC, side_data->size);
+        if (!buf)
+            return AVERROR(ENOMEM);
+        memcpy(buf, side_data->data, side_data->size);
+    }
+
+    side_data = av_frame_get_side_data(pic, AV_FRAME_DATA_AFD);
+    if (side_data && side_data->size) {
+        uint8_t *buf = av_packet_new_side_data(pkt, AV_PKT_DATA_AFD, side_data->size);
+        if (!buf)
+            return AVERROR(ENOMEM);
+        memcpy(buf, side_data->data, side_data->size);
+    }
+
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
     return 0;
diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h
index ee3637a..51305c1 100644
--- a/libavcodec/v210enc.h
+++ b/libavcodec/v210enc.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/v210x.c b/libavcodec/v210x.c
index 3f220ff..f6a453a 100644
--- a/libavcodec/v210x.c
+++ b/libavcodec/v210x.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/v308dec.c b/libavcodec/v308dec.c
new file mode 100644
index 0000000..dd53fbd
--- /dev/null
+++ b/libavcodec/v308dec.c
@@ -0,0 +1,83 @@
+/*
+ * v308 decoder
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v308_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+
+    if (avctx->width & 1)
+        av_log(avctx, AV_LOG_WARNING, "v308 requires width to be even.\n");
+
+    return 0;
+}
+
+static int v308_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 3 * avctx->height * avctx->width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+            v[j] = *src++;
+            y[j] = *src++;
+            u[j] = *src++;
+        }
+
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_v308_decoder = {
+    .name         = "v308",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V308,
+    .init         = v308_decode_init,
+    .decode       = v308_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/v308enc.c b/libavcodec/v308enc.c
new file mode 100644
index 0000000..e88f1f4
--- /dev/null
+++ b/libavcodec/v308enc.c
@@ -0,0 +1,86 @@
+/*
+ * v308 encoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v308_encode_init(AVCodecContext *avctx)
+{
+    if (avctx->width & 1) {
+        av_log(avctx, AV_LOG_ERROR, "v308 requires width to be even.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    avctx->bits_per_coded_sample = 24;
+    avctx->bit_rate = ff_guess_coded_bitrate(avctx);
+
+    return 0;
+}
+
+static int v308_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 3, 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+            *dst++ = v[j];
+            *dst++ = y[j];
+            *dst++ = u[j];
+        }
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int v308_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_v308_encoder = {
+    .name         = "v308",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V308,
+    .init         = v308_encode_init,
+    .encode2      = v308_encode_frame,
+    .close        = v308_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/v408dec.c b/libavcodec/v408dec.c
new file mode 100644
index 0000000..196c575
--- /dev/null
+++ b/libavcodec/v408dec.c
@@ -0,0 +1,105 @@
+/*
+ * v408 decoder
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v408_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+
+    return 0;
+}
+
+static int v408_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v, *a;
+    int i, j, ret;
+
+    if (avpkt->size < 4 * avctx->height * avctx->width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+    a = pic->data[3];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+            if (avctx->codec_id==AV_CODEC_ID_AYUV) {
+                v[j] = *src++;
+                u[j] = *src++;
+                y[j] = *src++;
+                a[j] = *src++;
+            } else {
+                u[j] = *src++;
+                y[j] = *src++;
+                v[j] = *src++;
+                a[j] = *src++;
+            }
+        }
+
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+        a += pic->linesize[3];
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+#if CONFIG_AYUV_DECODER
+AVCodec ff_ayuv_decoder = {
+    .name         = "ayuv",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AYUV,
+    .init         = v408_decode_init,
+    .decode       = v408_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
+#endif
+#if CONFIG_V408_DECODER
+AVCodec ff_v408_decoder = {
+    .name         = "v408",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V408,
+    .init         = v408_decode_init,
+    .decode       = v408_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
+#endif
diff --git a/libavcodec/v408enc.c b/libavcodec/v408enc.c
new file mode 100644
index 0000000..e12965b
--- /dev/null
+++ b/libavcodec/v408enc.c
@@ -0,0 +1,106 @@
+/*
+ * v408 encoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v408_encode_init(AVCodecContext *avctx)
+{
+    avctx->bits_per_coded_sample = 32;
+    avctx->bit_rate = ff_guess_coded_bitrate(avctx);
+
+    return 0;
+}
+
+static int v408_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v, *a;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 4, 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+    a = pic->data[3];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+           if (avctx->codec_id==AV_CODEC_ID_AYUV) {
+                *dst++ = v[j];
+                *dst++ = u[j];
+                *dst++ = y[j];
+                *dst++ = a[j];
+            } else {
+                *dst++ = u[j];
+                *dst++ = y[j];
+                *dst++ = v[j];
+                *dst++ = a[j];
+            }
+        }
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+        a += pic->linesize[3];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int v408_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+#if CONFIG_AYUV_ENCODER
+AVCodec ff_ayuv_encoder = {
+    .name         = "ayuv",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AYUV,
+    .init         = v408_encode_init,
+    .encode2      = v408_encode_frame,
+    .close        = v408_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
+#if CONFIG_V408_ENCODER
+AVCodec ff_v408_encoder = {
+    .name         = "v408",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V408,
+    .init         = v408_encode_init,
+    .encode2      = v408_encode_frame,
+    .close        = v408_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
index ca68e0b..48fab68 100644
--- a/libavcodec/v410dec.c
+++ b/libavcodec/v410dec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,17 +49,15 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *src = avpkt->data;
     uint16_t *y, *u, *v;
     uint32_t val;
-    int i, j;
+    int i, j, ret;
 
     if (avpkt->size < 4 * avctx->height * avctx->width) {
         av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
         return AVERROR(EINVAL);
     }
 
-    if (ff_get_buffer(avctx, pic, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
-    }
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
 
     pic->key_frame = 1;
     pic->pict_type = AV_PICTURE_TYPE_I;
diff --git a/libavcodec/v410enc.c b/libavcodec/v410enc.c
index 1e3f38f..5e24502 100644
--- a/libavcodec/v410enc.c
+++ b/libavcodec/v410enc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,13 @@
 static av_cold int v410_encode_init(AVCodecContext *avctx)
 {
     if (avctx->width & 1) {
-        av_log(avctx, AV_LOG_ERROR, "v410 requires even width.\n");
+        av_log(avctx, AV_LOG_ERROR, "v410 requires width to be even.\n");
         return AVERROR_INVALIDDATA;
     }
 
+    avctx->bits_per_coded_sample = 32;
+    avctx->bit_rate = ff_guess_coded_bitrate(avctx);
+
     return 0;
 }
 
@@ -43,10 +46,9 @@ static int v410_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     uint32_t val;
     int i, j, ret;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height * 4)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 4,
+                                            avctx->width * avctx->height * 4)) < 0)
         return ret;
-    }
     dst = pkt->data;
 
 #if FF_API_CODED_FRAME
diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
new file mode 100644
index 0000000..aef911f
--- /dev/null
+++ b/libavcodec/v4l2_buffers.c
@@ -0,0 +1,478 @@
+/*
+ * V4L2 buffer helper functions.
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/videodev2.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <poll.h>
+#include "libavcodec/avcodec.h"
+#include "libavcodec/internal.h"
+#include "v4l2_context.h"
+#include "v4l2_buffers.h"
+#include "v4l2_m2m.h"
+
+#define USEC_PER_SEC 1000000
+
+static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
+{
+    return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
+        container_of(buf->context, V4L2m2mContext, output) :
+        container_of(buf->context, V4L2m2mContext, capture);
+}
+
+static inline AVCodecContext *logger(V4L2Buffer *buf)
+{
+    return buf_to_m2mctx(buf)->avctx;
+}
+
+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
+{
+    V4L2m2mContext *s = buf_to_m2mctx(out);
+    AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+    int64_t v4l2_pts;
+
+    if (pts == AV_NOPTS_VALUE)
+        pts = 0;
+
+    /* convert pts to v4l2 timebase */
+    v4l2_pts = av_rescale_q(pts, s->avctx->time_base, v4l2_timebase);
+    out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
+    out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
+}
+
+static inline uint64_t v4l2_get_pts(V4L2Buffer *avbuf)
+{
+    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+    AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+    int64_t v4l2_pts;
+
+    /* convert pts back to encoder timebase */
+    v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
+                        avbuf->buf.timestamp.tv_usec;
+
+    return av_rescale_q(v4l2_pts, v4l2_timebase, s->avctx->time_base);
+}
+
+static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+{
+    enum v4l2_ycbcr_encoding ycbcr;
+    enum v4l2_colorspace cs;
+
+    cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.colorspace :
+        buf->context->format.fmt.pix.colorspace;
+
+    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.ycbcr_enc:
+        buf->context->format.fmt.pix.ycbcr_enc;
+
+    switch(ycbcr) {
+    case V4L2_YCBCR_ENC_XV709:
+    case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709;
+    case V4L2_YCBCR_ENC_XV601:
+    case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M;
+    default:
+        break;
+    }
+
+    switch(cs) {
+    case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG;
+    case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M;
+    case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M;
+    case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020;
+    default:
+        break;
+    }
+
+    return AVCOL_PRI_UNSPECIFIED;
+}
+
+static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+{
+    enum v4l2_quantization qt;
+
+    qt = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.quantization :
+        buf->context->format.fmt.pix.quantization;
+
+    switch (qt) {
+    case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
+    case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
+    default:
+        break;
+    }
+
+     return AVCOL_RANGE_UNSPECIFIED;
+}
+
+static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
+{
+    enum v4l2_ycbcr_encoding ycbcr;
+    enum v4l2_colorspace cs;
+
+    cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.colorspace :
+        buf->context->format.fmt.pix.colorspace;
+
+    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.ycbcr_enc:
+        buf->context->format.fmt.pix.ycbcr_enc;
+
+    switch(cs) {
+    case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
+    case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
+    case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
+    case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
+    case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
+    case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
+    case V4L2_COLORSPACE_BT2020:
+        if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
+            return AVCOL_SPC_BT2020_CL;
+        else
+             return AVCOL_SPC_BT2020_NCL;
+    default:
+        break;
+    }
+
+    return AVCOL_SPC_UNSPECIFIED;
+}
+
+static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
+{
+    enum v4l2_ycbcr_encoding ycbcr;
+    enum v4l2_xfer_func xfer;
+    enum v4l2_colorspace cs;
+
+    cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.colorspace :
+        buf->context->format.fmt.pix.colorspace;
+
+    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.ycbcr_enc:
+        buf->context->format.fmt.pix.ycbcr_enc;
+
+    xfer = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+        buf->context->format.fmt.pix_mp.xfer_func:
+        buf->context->format.fmt.pix.xfer_func;
+
+    switch (xfer) {
+    case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709;
+    case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1;
+    default:
+        break;
+    }
+
+    switch (cs) {
+    case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22;
+    case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28;
+    case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M;
+    case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M;
+    default:
+        break;
+    }
+
+    switch (ycbcr) {
+    case V4L2_YCBCR_ENC_XV709:
+    case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG;
+    default:
+        break;
+    }
+
+    return AVCOL_TRC_UNSPECIFIED;
+}
+
+static void v4l2_free_buffer(void *opaque, uint8_t *unused)
+{
+    V4L2Buffer* avbuf = opaque;
+    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+
+    if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
+        atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
+
+        if (s->reinit) {
+            if (!atomic_load(&s->refcount))
+                sem_post(&s->refsync);
+        } else {
+            if (s->draining) {
+                /* no need to queue more buffers to the driver */
+                avbuf->status = V4L2BUF_AVAILABLE;
+            }
+            else if (avbuf->context->streamon)
+                ff_v4l2_buffer_enqueue(avbuf);
+        }
+
+        av_buffer_unref(&avbuf->context_ref);
+    }
+}
+
+static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
+{
+    V4L2m2mContext *s = buf_to_m2mctx(in);
+
+    if (plane >= in->num_planes)
+        return AVERROR(EINVAL);
+
+    /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
+    *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
+                            in->plane_info[plane].length, v4l2_free_buffer, in, 0);
+    if (!*buf)
+        return AVERROR(ENOMEM);
+
+    if (in->context_ref)
+        atomic_fetch_add(&in->context_refcount, 1);
+    else {
+        in->context_ref = av_buffer_ref(s->self_ref);
+        if (!in->context_ref) {
+            av_buffer_unref(buf);
+            return AVERROR(ENOMEM);
+        }
+        in->context_refcount = 1;
+    }
+
+    in->status = V4L2BUF_RET_USER;
+    atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
+
+    return 0;
+}
+
+static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, AVBufferRef* bref)
+{
+    unsigned int bytesused, length;
+
+    if (plane >= out->num_planes)
+        return AVERROR(EINVAL);
+
+    bytesused = FFMIN(size, out->plane_info[plane].length);
+    length = out->plane_info[plane].length;
+
+    memcpy(out->plane_info[plane].mm_addr, data, FFMIN(size, out->plane_info[plane].length));
+
+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+        out->planes[plane].bytesused = bytesused;
+        out->planes[plane].length = length;
+    } else {
+        out->buf.bytesused = bytesused;
+        out->buf.length = length;
+    }
+
+    return 0;
+}
+
+/******************************************************************************
+ *
+ *              V4L2uffer interface
+ *
+ ******************************************************************************/
+
+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer* out)
+{
+    int i, ret;
+
+    for(i = 0; i < out->num_planes; i++) {
+        ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, frame->buf[i]);
+        if (ret)
+            return ret;
+    }
+
+    v4l2_set_pts(out, frame->pts);
+
+    return 0;
+}
+
+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+{
+    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+    int i, ret;
+
+    av_frame_unref(frame);
+
+    /* 1. get references to the actual data */
+    for (i = 0; i < avbuf->num_planes; i++) {
+        ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
+        if (ret)
+            return ret;
+
+        frame->linesize[i] = avbuf->plane_info[i].bytesperline;
+        frame->data[i] = frame->buf[i]->data;
+    }
+
+    /* 1.1 fixup special cases */
+    switch (avbuf->context->av_pix_fmt) {
+    case AV_PIX_FMT_NV12:
+        if (avbuf->num_planes > 1)
+            break;
+        frame->linesize[1] = avbuf->plane_info[0].bytesperline;
+        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+        break;
+    default:
+        break;
+    }
+
+    /* 2. get frame information */
+    frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
+    frame->format = avbuf->context->av_pix_fmt;
+    frame->color_primaries = v4l2_get_color_primaries(avbuf);
+    frame->colorspace = v4l2_get_color_space(avbuf);
+    frame->color_range = v4l2_get_color_range(avbuf);
+    frame->color_trc = v4l2_get_color_trc(avbuf);
+    frame->pts = v4l2_get_pts(avbuf);
+
+    /* these two values are updated also during re-init in v4l2_process_driver_event */
+    frame->height = s->output.height;
+    frame->width = s->output.width;
+
+    /* 3. report errors upstream */
+    if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
+        av_log(logger(avbuf), AV_LOG_ERROR, "%s: driver decode error\n", avbuf->context->name);
+        frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
+    }
+
+    return 0;
+}
+
+int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+{
+    int ret;
+
+    av_packet_unref(pkt);
+    ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
+    if (ret)
+        return ret;
+
+    pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
+    pkt->data = pkt->buf->data;
+
+    if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+
+    if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
+        av_log(logger(avbuf), AV_LOG_ERROR, "%s driver encode error\n", avbuf->context->name);
+        pkt->flags |= AV_PKT_FLAG_CORRUPT;
+    }
+
+    pkt->dts = pkt->pts = v4l2_get_pts(avbuf);
+
+    return 0;
+}
+
+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
+{
+    int ret;
+
+    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, pkt->buf);
+    if (ret)
+        return ret;
+
+    v4l2_set_pts(out, pkt->pts);
+
+    if (pkt->flags & AV_PKT_FLAG_KEY)
+        out->flags = V4L2_BUF_FLAG_KEYFRAME;
+
+    return 0;
+}
+
+int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+{
+    V4L2Context *ctx = avbuf->context;
+    int ret, i;
+
+    avbuf->buf.memory = V4L2_MEMORY_MMAP;
+    avbuf->buf.type = ctx->type;
+    avbuf->buf.index = index;
+
+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+        avbuf->buf.length = VIDEO_MAX_PLANES;
+        avbuf->buf.m.planes = avbuf->planes;
+    }
+
+    ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
+    if (ret < 0)
+        return AVERROR(errno);
+
+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+        avbuf->num_planes = 0;
+        for (;;) {
+            /* in MP, the V4L2 API states that buf.length means num_planes */
+            if (avbuf->num_planes >= avbuf->buf.length)
+                break;
+            if (avbuf->buf.m.planes[avbuf->num_planes].length)
+                avbuf->num_planes++;
+        }
+    } else
+        avbuf->num_planes = 1;
+
+    for (i = 0; i < avbuf->num_planes; i++) {
+
+        avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+            ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
+            ctx->format.fmt.pix.bytesperline;
+
+        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+            avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
+            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+                                           PROT_READ | PROT_WRITE, MAP_SHARED,
+                                           buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+        } else {
+            avbuf->plane_info[i].length = avbuf->buf.length;
+            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+                                          PROT_READ | PROT_WRITE, MAP_SHARED,
+                                          buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+        }
+
+        if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
+            return AVERROR(ENOMEM);
+    }
+
+    avbuf->status = V4L2BUF_AVAILABLE;
+
+    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+        return 0;
+
+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+        avbuf->buf.m.planes = avbuf->planes;
+        avbuf->buf.length   = avbuf->num_planes;
+
+    } else {
+        avbuf->buf.bytesused = avbuf->planes[0].bytesused;
+        avbuf->buf.length    = avbuf->planes[0].length;
+    }
+
+    return ff_v4l2_buffer_enqueue(avbuf);
+}
+
+int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
+{
+    int ret;
+
+    avbuf->buf.flags = avbuf->flags;
+
+    ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
+    if (ret < 0)
+        return AVERROR(errno);
+
+    avbuf->status = V4L2BUF_IN_DRIVER;
+
+    return 0;
+}
diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
new file mode 100644
index 0000000..7a57caf
--- /dev/null
+++ b/libavcodec/v4l2_buffers.h
@@ -0,0 +1,131 @@
+/*
+ * V4L2 buffer helper functions.
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V4L2_BUFFERS_H
+#define AVCODEC_V4L2_BUFFERS_H
+
+#include <stdatomic.h>
+#include <linux/videodev2.h>
+
+#include "avcodec.h"
+
+enum V4L2Buffer_status {
+    V4L2BUF_AVAILABLE,
+    V4L2BUF_IN_DRIVER,
+    V4L2BUF_RET_USER,
+};
+
+/**
+ * V4L2Buffer (wrapper for v4l2_buffer management)
+ */
+typedef struct V4L2Buffer {
+    /* each buffer needs to have a reference to its context */
+    struct V4L2Context *context;
+
+    /* This object is refcounted per-plane, so we need to keep track
+     * of how many context-refs we are holding. */
+    AVBufferRef *context_ref;
+    atomic_uint context_refcount;
+
+    /* keep track of the mmap address and mmap length */
+    struct V4L2Plane_info {
+        int bytesperline;
+        void * mm_addr;
+        size_t length;
+    } plane_info[VIDEO_MAX_PLANES];
+
+    int num_planes;
+
+    /* the v4l2_buffer buf.m.planes pointer uses the planes[] mem */
+    struct v4l2_buffer buf;
+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
+
+    int flags;
+    enum V4L2Buffer_status status;
+
+} V4L2Buffer;
+
+/**
+ * Extracts the data from a V4L2Buffer to an AVFrame
+ *
+ * @param[in] frame The AVFRame to push the information to
+ * @param[in] buf The V4L2Buffer to get the information from
+ *
+ * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
+ * AVERROR(ENOMEM) if the AVBufferRef can't be created.
+ */
+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf);
+
+/**
+ * Extracts the data from a V4L2Buffer to an AVPacket
+ *
+ * @param[in] pkt The AVPacket to push the information to
+ * @param[in] buf The V4L2Buffer to get the information from
+ *
+ * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
+ * AVERROR(ENOMEM) if the AVBufferRef can't be created.
+ *
+ */
+int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
+
+/**
+ * Extracts the data from an AVPacket to a V4L2Buffer
+ *
+ * @param[in]  frame AVPacket to get the data from
+ * @param[in]  avbuf V4L2Bfuffer to push the information to
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+
+/**
+ * Extracts the data from an AVFrame to a V4L2Buffer
+ *
+ * @param[in]  frame AVFrame to get the data from
+ * @param[in]  avbuf V4L2Bfuffer to push the information to
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer* out);
+
+/**
+ * Initializes a V4L2Buffer
+ *
+ * @param[in]  avbuf V4L2Bfuffer to initialize
+ * @param[in]  index v4l2 buffer id
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
+
+/**
+ * Enqueues a V4L2Buffer
+ *
+ * @param[in] avbuf V4L2Bfuffer to push to the driver
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+
+
+#endif // AVCODEC_V4L2_BUFFERS_H
diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
new file mode 100644
index 0000000..efcb042
--- /dev/null
+++ b/libavcodec/v4l2_context.c
@@ -0,0 +1,710 @@
+/*
+ * V4L2 context helper functions.
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/videodev2.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <poll.h>
+#include "libavcodec/avcodec.h"
+#include "libavcodec/internal.h"
+#include "v4l2_buffers.h"
+#include "v4l2_fmt.h"
+#include "v4l2_m2m.h"
+
+struct v4l2_format_update {
+    uint32_t v4l2_fmt;
+    int update_v4l2;
+
+    enum AVPixelFormat av_fmt;
+    int update_avfmt;
+};
+
+static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
+{
+    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+        container_of(ctx, V4L2m2mContext, output) :
+        container_of(ctx, V4L2m2mContext, capture);
+}
+
+static inline AVCodecContext *logger(V4L2Context *ctx)
+{
+    return ctx_to_m2mctx(ctx)->avctx;
+}
+
+static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
+{
+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
+}
+
+static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
+{
+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
+}
+
+static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
+{
+    struct v4l2_format *fmt1 = &ctx->format;
+    int ret =  V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+        fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+        fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+        :
+        fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+        fmt1->fmt.pix.height != fmt2->fmt.pix.height;
+
+    if (ret)
+        av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
+            ctx->name,
+            v4l2_get_width(fmt1), v4l2_get_height(fmt1),
+            v4l2_get_width(fmt2), v4l2_get_height(fmt2));
+
+    return ret;
+}
+
+static inline int v4l2_type_supported(V4L2Context *ctx)
+{
+    return ctx->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE ||
+        ctx->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE ||
+        ctx->type == V4L2_BUF_TYPE_VIDEO_CAPTURE ||
+        ctx->type == V4L2_BUF_TYPE_VIDEO_OUTPUT;
+}
+
+static inline int v4l2_get_framesize_compressed(V4L2Context* ctx, int width, int height)
+{
+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+    const int SZ_4K = 0x1000;
+    int size;
+
+    if (av_codec_is_decoder(s->avctx->codec))
+        return ((width * height * 3 / 2) / 2) + 128;
+
+    /* encoder */
+    size = FFALIGN(height, 32) * FFALIGN(width, 32) * 3 / 2 / 2;
+    return FFALIGN(size, SZ_4K);
+}
+
+static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_update *fmt)
+{
+    ctx->format.type = ctx->type;
+
+    if (fmt->update_avfmt)
+        ctx->av_pix_fmt = fmt->av_fmt;
+
+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+        /* update the sizes to handle the reconfiguration of the capture stream at runtime */
+        ctx->format.fmt.pix_mp.height = ctx->height;
+        ctx->format.fmt.pix_mp.width = ctx->width;
+        if (fmt->update_v4l2) {
+            ctx->format.fmt.pix_mp.pixelformat = fmt->v4l2_fmt;
+
+            /* s5p-mfc requires the user to specify a buffer size */
+            ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage =
+                v4l2_get_framesize_compressed(ctx, ctx->width, ctx->height);
+        }
+    } else {
+        ctx->format.fmt.pix.height = ctx->height;
+        ctx->format.fmt.pix.width = ctx->width;
+        if (fmt->update_v4l2) {
+            ctx->format.fmt.pix.pixelformat = fmt->v4l2_fmt;
+
+            /* s5p-mfc requires the user to specify a buffer size */
+            ctx->format.fmt.pix.sizeimage =
+                v4l2_get_framesize_compressed(ctx, ctx->width, ctx->height);
+        }
+    }
+}
+
+/**
+ * returns 1 if reinit was successful, negative if it failed
+ * returns 0 if reinit was not executed
+ */
+static int v4l2_handle_event(V4L2Context *ctx)
+{
+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+    struct v4l2_format cap_fmt = s->capture.format;
+    struct v4l2_format out_fmt = s->output.format;
+    struct v4l2_event evt = { 0 };
+    int full_reinit, reinit, ret;
+
+    ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
+    if (ret < 0) {
+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
+        return 0;
+    }
+
+    if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
+        return 0;
+
+    ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
+    if (ret) {
+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
+        return 0;
+    }
+
+    ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
+    if (ret) {
+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
+        return 0;
+    }
+
+    full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
+    if (full_reinit) {
+        s->output.height = v4l2_get_height(&out_fmt);
+        s->output.width = v4l2_get_width(&out_fmt);
+    }
+
+    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
+    if (reinit) {
+        s->capture.height = v4l2_get_height(&cap_fmt);
+        s->capture.width = v4l2_get_width(&cap_fmt);
+    }
+
+    if (full_reinit || reinit)
+        s->reinit = 1;
+
+    if (full_reinit) {
+        ret = ff_v4l2_m2m_codec_full_reinit(s);
+        if (ret) {
+            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
+            return -EINVAL;
+        }
+        goto reinit_run;
+    }
+
+    if (reinit) {
+        ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
+        if (ret < 0)
+            av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
+
+        ret = ff_v4l2_m2m_codec_reinit(s);
+        if (ret) {
+            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
+            return -EINVAL;
+        }
+        goto reinit_run;
+    }
+
+    /* dummy event received */
+    return 0;
+
+    /* reinit executed */
+reinit_run:
+    return 1;
+}
+
+static int v4l2_stop_decode(V4L2Context *ctx)
+{
+    struct v4l2_decoder_cmd cmd = {
+        .cmd = V4L2_DEC_CMD_STOP,
+        .flags = 0,
+    };
+    int ret;
+
+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd);
+    if (ret) {
+        /* DECODER_CMD is optional */
+        if (errno == ENOTTY)
+            return ff_v4l2_context_set_status(ctx, VIDIOC_STREAMOFF);
+    }
+
+    return 0;
+}
+
+static int v4l2_stop_encode(V4L2Context *ctx)
+{
+    struct v4l2_encoder_cmd cmd = {
+        .cmd = V4L2_ENC_CMD_STOP,
+        .flags = 0,
+    };
+    int ret;
+
+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENCODER_CMD, &cmd);
+    if (ret) {
+        /* ENCODER_CMD is optional */
+        if (errno == ENOTTY)
+            return ff_v4l2_context_set_status(ctx, VIDIOC_STREAMOFF);
+    }
+
+    return 0;
+}
+
+static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
+{
+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
+    struct v4l2_buffer buf = { 0 };
+    V4L2Buffer* avbuf = NULL;
+    struct pollfd pfd = {
+        .events =  POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
+        .fd = ctx_to_m2mctx(ctx)->fd,
+    };
+    int i, ret;
+
+    /* if we are draining and there are no more capture buffers queued in the driver we are done */
+    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
+        for (i = 0; i < ctx->num_buffers; i++) {
+            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+                goto start;
+        }
+        ctx->done = 1;
+        return NULL;
+    }
+
+start:
+    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+        pfd.events =  POLLOUT | POLLWRNORM;
+    else {
+        /* no need to listen to requests for more input while draining */
+        if (ctx_to_m2mctx(ctx)->draining)
+            pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
+    }
+
+    for (;;) {
+        ret = poll(&pfd, 1, timeout);
+        if (ret > 0)
+            break;
+        if (errno == EINTR)
+            continue;
+        return NULL;
+    }
+
+    /* 0. handle errors */
+    if (pfd.revents & POLLERR) {
+        /* if we are trying to get free buffers but none have been queued yet
+           no need to raise a warning */
+        if (timeout == 0) {
+            for (i = 0; i < ctx->num_buffers; i++) {
+                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+                    av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+            }
+        }
+        else
+            av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+
+        return NULL;
+    }
+
+    /* 1. handle resolution changes */
+    if (pfd.revents & POLLPRI) {
+        ret = v4l2_handle_event(ctx);
+        if (ret < 0) {
+            /* if re-init failed, abort */
+            ctx->done = 1;
+            return NULL;
+        }
+        if (ret) {
+            /* if re-init was successful drop the buffer (if there was one)
+             * since we had to reconfigure capture (unmap all buffers)
+             */
+            return NULL;
+        }
+    }
+
+    /* 2. dequeue the buffer */
+    if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
+
+        if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+            /* there is a capture buffer ready */
+            if (pfd.revents & (POLLIN | POLLRDNORM))
+                goto dequeue;
+
+            /* the driver is ready to accept more input; instead of waiting for the capture
+             * buffer to complete we return NULL so input can proceed (we are single threaded)
+             */
+            if (pfd.revents & (POLLOUT | POLLWRNORM))
+                return NULL;
+        }
+
+dequeue:
+        memset(&buf, 0, sizeof(buf));
+        buf.memory = V4L2_MEMORY_MMAP;
+        buf.type = ctx->type;
+        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+            memset(planes, 0, sizeof(planes));
+            buf.length = VIDEO_MAX_PLANES;
+            buf.m.planes = planes;
+        }
+
+        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
+        if (ret) {
+            if (errno != EAGAIN) {
+                ctx->done = 1;
+                if (errno != EPIPE)
+                    av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+                        ctx->name, av_err2str(AVERROR(errno)));
+            }
+            return NULL;
+        }
+
+        avbuf = &ctx->buffers[buf.index];
+        avbuf->status = V4L2BUF_AVAILABLE;
+        avbuf->buf = buf;
+        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+            memcpy(avbuf->planes, planes, sizeof(planes));
+            avbuf->buf.m.planes = avbuf->planes;
+        }
+        return avbuf;
+    }
+
+    return NULL;
+}
+
+static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+{
+    int timeout = 0; /* return when no more buffers to dequeue */
+    int i;
+
+    /* get back as many output buffers as possible */
+    if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+          do {
+          } while (v4l2_dequeue_v4l2buf(ctx, timeout));
+    }
+
+    for (i = 0; i < ctx->num_buffers; i++) {
+        if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
+            return &ctx->buffers[i];
+    }
+
+    return NULL;
+}
+
+static int v4l2_release_buffers(V4L2Context* ctx)
+{
+    struct v4l2_requestbuffers req = {
+        .memory = V4L2_MEMORY_MMAP,
+        .type = ctx->type,
+        .count = 0, /* 0 -> unmaps buffers from the driver */
+    };
+    int i, j;
+
+    for (i = 0; i < ctx->num_buffers; i++) {
+        V4L2Buffer *buffer = &ctx->buffers[i];
+
+        for (j = 0; j < buffer->num_planes; j++) {
+            struct V4L2Plane_info *p = &buffer->plane_info[j];
+            if (p->mm_addr && p->length)
+                if (munmap(p->mm_addr, p->length) < 0)
+                    av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
+        }
+    }
+
+    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
+}
+
+static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
+{
+    struct v4l2_format *fmt = &ctx->format;
+    uint32_t v4l2_fmt;
+    int ret;
+
+    v4l2_fmt = ff_v4l2_format_avfmt_to_v4l2(pixfmt);
+    if (!v4l2_fmt)
+        return AVERROR(EINVAL);
+
+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type))
+        fmt->fmt.pix_mp.pixelformat = v4l2_fmt;
+    else
+        fmt->fmt.pix.pixelformat = v4l2_fmt;
+
+    fmt->type = ctx->type;
+
+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_TRY_FMT, fmt);
+    if (ret)
+        return AVERROR(EINVAL);
+
+    return 0;
+}
+
+static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+{
+    enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
+    struct v4l2_fmtdesc fdesc;
+    int ret;
+
+    memset(&fdesc, 0, sizeof(fdesc));
+    fdesc.type = ctx->type;
+
+    if (pixfmt != AV_PIX_FMT_NONE) {
+        ret = v4l2_try_raw_format(ctx, pixfmt);
+        if (!ret)
+            return 0;
+    }
+
+    for (;;) {
+        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
+        if (ret)
+            return AVERROR(EINVAL);
+
+        pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
+        ret = v4l2_try_raw_format(ctx, pixfmt);
+        if (ret){
+            fdesc.index++;
+            continue;
+        }
+
+        *p = pixfmt;
+
+        return 0;
+    }
+
+    return AVERROR(EINVAL);
+}
+
+static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
+{
+    struct v4l2_fmtdesc fdesc;
+    uint32_t v4l2_fmt;
+    int ret;
+
+    /* translate to a valid v4l2 format */
+    v4l2_fmt = ff_v4l2_format_avcodec_to_v4l2(ctx->av_codec_id);
+    if (!v4l2_fmt)
+        return AVERROR(EINVAL);
+
+    /* check if the driver supports this format */
+    memset(&fdesc, 0, sizeof(fdesc));
+    fdesc.type = ctx->type;
+
+    for (;;) {
+        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
+        if (ret)
+            return AVERROR(EINVAL);
+
+        if (fdesc.pixelformat == v4l2_fmt)
+            break;
+
+        fdesc.index++;
+    }
+
+    *p = v4l2_fmt;
+
+    return 0;
+}
+
+ /*****************************************************************************
+  *
+  *             V4L2 Context Interface
+  *
+  *****************************************************************************/
+
+int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
+{
+    int type = ctx->type;
+    int ret;
+
+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
+    if (ret < 0)
+        return AVERROR(errno);
+
+    ctx->streamon = (cmd == VIDIOC_STREAMON);
+
+    return 0;
+}
+
+int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+{
+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+    V4L2Buffer* avbuf;
+    int ret;
+
+    if (!frame) {
+        ret = v4l2_stop_encode(ctx);
+        if (ret)
+            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+        s->draining= 1;
+        return 0;
+    }
+
+    avbuf = v4l2_getfree_v4l2buf(ctx);
+    if (!avbuf)
+        return AVERROR(ENOMEM);
+
+    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
+    if (ret)
+        return ret;
+
+    return ff_v4l2_buffer_enqueue(avbuf);
+}
+
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+{
+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+    V4L2Buffer* avbuf;
+    int ret;
+
+    if (!pkt->size) {
+        ret = v4l2_stop_decode(ctx);
+        if (ret)
+            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
+        s->draining = 1;
+        return 0;
+    }
+
+    avbuf = v4l2_getfree_v4l2buf(ctx);
+    if (!avbuf)
+        return AVERROR(ENOMEM);
+
+    ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
+    if (ret)
+        return ret;
+
+    return ff_v4l2_buffer_enqueue(avbuf);
+}
+
+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame)
+{
+    V4L2Buffer* avbuf = NULL;
+
+    /*
+     * blocks until:
+     *  1. decoded frame available
+     *  2. an input buffer is ready to be dequeued
+     */
+    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+    if (!avbuf) {
+        if (ctx->done)
+            return AVERROR_EOF;
+
+        return AVERROR(EAGAIN);
+    }
+
+    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
+}
+
+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
+{
+    V4L2Buffer* avbuf = NULL;
+
+    /*
+     * blocks until:
+     *  1. encoded packet available
+     *  2. an input buffer ready to be dequeued
+     */
+    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+    if (!avbuf) {
+        if (ctx->done)
+            return AVERROR_EOF;
+
+        return AVERROR(EAGAIN);
+    }
+
+    return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
+}
+
+int ff_v4l2_context_get_format(V4L2Context* ctx)
+{
+    struct v4l2_format_update fmt = { 0 };
+    int ret;
+
+    if  (ctx->av_codec_id == AV_CODEC_ID_RAWVIDEO) {
+        ret = v4l2_get_raw_format(ctx, &fmt.av_fmt);
+        if (ret)
+            return ret;
+
+        fmt.update_avfmt = 1;
+        v4l2_save_to_context(ctx, &fmt);
+
+        /* format has been tried already */
+        return ret;
+    }
+
+    ret = v4l2_get_coded_format(ctx, &fmt.v4l2_fmt);
+    if (ret)
+        return ret;
+
+    fmt.update_v4l2 = 1;
+    v4l2_save_to_context(ctx, &fmt);
+
+    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_TRY_FMT, &ctx->format);
+}
+
+int ff_v4l2_context_set_format(V4L2Context* ctx)
+{
+    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
+}
+
+void ff_v4l2_context_release(V4L2Context* ctx)
+{
+    int ret;
+
+    if (!ctx->buffers)
+        return;
+
+    ret = v4l2_release_buffers(ctx);
+    if (ret)
+        av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
+
+    av_free(ctx->buffers);
+    ctx->buffers = NULL;
+}
+
+int ff_v4l2_context_init(V4L2Context* ctx)
+{
+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+    struct v4l2_requestbuffers req;
+    int ret, i;
+
+    if (!v4l2_type_supported(ctx)) {
+        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+    if (ret)
+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
+
+    memset(&req, 0, sizeof(req));
+    req.count = ctx->num_buffers;
+    req.memory = V4L2_MEMORY_MMAP;
+    req.type = ctx->type;
+    ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
+    if (ret < 0)
+        return AVERROR(errno);
+
+    ctx->num_buffers = req.count;
+    ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
+    if (!ctx->buffers) {
+            av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
+            return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < req.count; i++) {
+        ctx->buffers[i].context = ctx;
+        ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
+        if (ret < 0) {
+            av_log(logger(ctx), AV_LOG_ERROR, "%s buffer initialization (%s)\n", ctx->name, av_err2str(ret));
+            av_free(ctx->buffers);
+            return ret;
+        }
+    }
+
+    av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
+        V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
+        req.count,
+        v4l2_get_width(&ctx->format),
+        v4l2_get_height(&ctx->format),
+        V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
+        V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
+
+    return 0;
+}
diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
new file mode 100644
index 0000000..632f1d0
--- /dev/null
+++ b/libavcodec/v4l2_context.h
@@ -0,0 +1,183 @@
+/*
+ * V4L2 context helper functions.
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V4L2_CONTEXT_H
+#define AVCODEC_V4L2_CONTEXT_H
+
+#include <stdatomic.h>
+#include <linux/videodev2.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/frame.h"
+#include "libavutil/buffer.h"
+#include "v4l2_buffers.h"
+
+typedef struct V4L2Context {
+    /**
+     * context name.
+     */
+    const char* name;
+
+    /**
+     * Type of this buffer context.
+     * See V4L2_BUF_TYPE_VIDEO_* in videodev2.h
+     * Readonly after init.
+     */
+    enum v4l2_buf_type type;
+
+    /**
+     * AVPixelFormat corresponding to this buffer context.
+     * AV_PIX_FMT_NONE means this is an encoded stream.
+     */
+    enum AVPixelFormat av_pix_fmt;
+
+    /**
+     * AVCodecID corresponding to this buffer context.
+     * AV_CODEC_ID_RAWVIDEO means this is a raw stream and av_pix_fmt must be set to a valid value.
+     */
+    enum AVCodecID av_codec_id;
+
+    /**
+     * Format returned by the driver after initializing the buffer context.
+     * Readonly after init.
+     */
+    struct v4l2_format format;
+
+    /**
+     * Width and height of the frames it produces (in case of a capture context, e.g. when decoding)
+     * or accepts (in case of an output context, e.g. when encoding).
+     */
+    int width, height;
+
+    /**
+     * Indexed array of V4L2Buffers
+     */
+    V4L2Buffer *buffers;
+
+    /**
+     * Readonly after init.
+     */
+    int num_buffers;
+
+    /**
+     * Whether the stream has been started (VIDIOC_STREAMON has been sent).
+     */
+    int streamon;
+
+    /**
+     *  Either no more buffers available or an unrecoverable error was notified
+     *  by the V4L2 kernel driver: once set the context has to be exited.
+     */
+    int done;
+
+} V4L2Context;
+
+/**
+ * Initializes a V4L2Context.
+ *
+ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context description for required variables.
+ * @return 0 in case of success, a negative value representing the error otherwise.
+ */
+int ff_v4l2_context_init(V4L2Context* ctx);
+
+/**
+ * Sets the V4L2Context format in the v4l2 driver.
+ *
+ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context description for required variables.
+ * @return 0 in case of success, a negative value representing the error otherwise.
+ */
+int ff_v4l2_context_set_format(V4L2Context* ctx);
+
+/**
+ * Queries the driver for a valid v4l2 format and copies it to the context.
+ *
+ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context description for required variables.
+ * @return 0 in case of success, a negative value representing the error otherwise.
+ */
+int ff_v4l2_context_get_format(V4L2Context* ctx);
+
+/**
+ * Releases a V4L2Context.
+ *
+ * @param[in] ctx A pointer to a V4L2Context.
+ *               The caller is reponsible for freeing it.
+ *               It must not be used after calling this function.
+ */
+void ff_v4l2_context_release(V4L2Context* ctx);
+
+/**
+ * Sets the status of a V4L2Context.
+ *
+ * @param[in] ctx A pointer to a V4L2Context.
+ * @param[in] cmd The status to set (VIDIOC_STREAMON or VIDIOC_STREAMOFF).
+ *                Warning: If VIDIOC_STREAMOFF is sent to a buffer context that still has some frames buffered,
+ *                those frames will be dropped.
+ * @return 0 in case of success, a negative value representing the error otherwise.
+ */
+int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd);
+
+/**
+ * Dequeues a buffer from a V4L2Context to an AVPacket.
+ *
+ * The pkt must be non NULL.
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[inout] pkt The AVPacket to dequeue to.
+ * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+ */
+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
+
+/**
+ * Dequeues a buffer from a V4L2Context to an AVFrame.
+ *
+ * The frame must be non NULL.
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[inout] f The AVFrame to dequeue to.
+ * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+ */
+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f);
+
+/**
+ * Enqueues a buffer to a V4L2Context from an AVPacket
+ *
+ * The packet must be non NULL.
+ * When the size of the pkt is null, the buffer is not queued but a V4L2_DEC_CMD_STOP command is sent instead to the driver.
+ *
+ * @param[in] ctx The V4L2Context to enqueue to.
+ * @param[in] pkt A pointer to an AVPacket.
+ * @return 0 in case of success, a negative error otherwise.
+ */
+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
+
+/**
+ * Enqueues a buffer to a V4L2Context from an AVFrame
+ *
+ * The frame must be non NULL.
+ *
+ * @param[in] ctx The V4L2Context to enqueue to.
+ * @param[in] f A pointer to an AVFrame to enqueue.
+ * @return 0 in case of success, a negative error otherwise.
+ */
+int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
+
+#endif // AVCODEC_V4L2_CONTEXT_H
diff --git a/libavcodec/v4l2_fmt.c b/libavcodec/v4l2_fmt.c
new file mode 100644
index 0000000..6df47e3
--- /dev/null
+++ b/libavcodec/v4l2_fmt.c
@@ -0,0 +1,141 @@
+/*
+ * V4L2 format helper functions
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/videodev2.h>
+#include <search.h>
+#include "v4l2_fmt.h"
+
+#define V4L2_FMT(x) V4L2_PIX_FMT_##x
+#define AV_CODEC(x) AV_CODEC_ID_##x
+#define AV_FMT(x)   AV_PIX_FMT_##x
+
+static const struct fmt_conversion {
+    enum AVPixelFormat avfmt;
+    enum AVCodecID avcodec;
+    uint32_t v4l2_fmt;
+} fmt_map[] = {
+    { AV_FMT(RGB555LE),    AV_CODEC(RAWVIDEO),    V4L2_FMT(RGB555) },
+    { AV_FMT(RGB555BE),    AV_CODEC(RAWVIDEO),    V4L2_FMT(RGB555X) },
+    { AV_FMT(RGB565LE),    AV_CODEC(RAWVIDEO),    V4L2_FMT(RGB565) },
+    { AV_FMT(RGB565BE),    AV_CODEC(RAWVIDEO),    V4L2_FMT(RGB565X) },
+    { AV_FMT(BGR24),       AV_CODEC(RAWVIDEO),    V4L2_FMT(BGR24) },
+    { AV_FMT(RGB24),       AV_CODEC(RAWVIDEO),    V4L2_FMT(RGB24) },
+    { AV_FMT(BGR0),        AV_CODEC(RAWVIDEO),    V4L2_FMT(BGR32) },
+    { AV_FMT(0RGB),        AV_CODEC(RAWVIDEO),    V4L2_FMT(RGB32) },
+    { AV_FMT(GRAY8),       AV_CODEC(RAWVIDEO),    V4L2_FMT(GREY) },
+    { AV_FMT(YUV420P),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YUV420) },
+    { AV_FMT(YUYV422),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YUYV) },
+    { AV_FMT(UYVY422),     AV_CODEC(RAWVIDEO),    V4L2_FMT(UYVY) },
+    { AV_FMT(YUV422P),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YUV422P) },
+    { AV_FMT(YUV411P),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YUV411P) },
+    { AV_FMT(YUV410P),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YUV410) },
+    { AV_FMT(YUV410P),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YVU410) },
+    { AV_FMT(NV12),        AV_CODEC(RAWVIDEO),    V4L2_FMT(NV12) },
+    { AV_FMT(NONE),        AV_CODEC(MJPEG),       V4L2_FMT(MJPEG) },
+    { AV_FMT(NONE),        AV_CODEC(MJPEG),       V4L2_FMT(JPEG) },
+#ifdef V4L2_PIX_FMT_SRGGB8
+    { AV_FMT(BAYER_BGGR8), AV_CODEC(RAWVIDEO),    V4L2_FMT(SBGGR8) },
+    { AV_FMT(BAYER_GBRG8), AV_CODEC(RAWVIDEO),    V4L2_FMT(SGBRG8) },
+    { AV_FMT(BAYER_GRBG8), AV_CODEC(RAWVIDEO),    V4L2_FMT(SGRBG8) },
+    { AV_FMT(BAYER_RGGB8), AV_CODEC(RAWVIDEO),    V4L2_FMT(SRGGB8) },
+#endif
+#ifdef V4L2_PIX_FMT_Y16
+    { AV_FMT(GRAY16LE),    AV_CODEC(RAWVIDEO),    V4L2_FMT(Y16) },
+#endif
+#ifdef V4L2_PIX_FMT_NV12M
+    { AV_FMT(NV12),        AV_CODEC(RAWVIDEO),    V4L2_FMT(NV12M) },
+#endif
+#ifdef V4L2_PIX_FMT_NV21M
+    { AV_FMT(NV21),        AV_CODEC(RAWVIDEO),    V4L2_FMT(NV21M) },
+#endif
+#ifdef V4L2_PIX_FMT_YUV420M
+    { AV_FMT(YUV420P),     AV_CODEC(RAWVIDEO),    V4L2_FMT(YUV420M) },
+#endif
+#ifdef V4L2_PIX_FMT_NV16M
+    { AV_FMT(NV16),        AV_CODEC(RAWVIDEO),    V4L2_FMT(NV16M) },
+#endif
+#ifdef V4L2_PIX_FMT_H263
+    { AV_FMT(NONE),        AV_CODEC(H263),        V4L2_FMT(H263) },
+#endif
+#ifdef V4L2_PIX_FMT_H264
+    { AV_FMT(NONE),        AV_CODEC(H264),        V4L2_FMT(H264) },
+#endif
+#ifdef V4L2_PIX_FMT_MPEG4
+    { AV_FMT(NONE),        AV_CODEC(MPEG4),       V4L2_FMT(MPEG4) },
+#endif
+#ifdef V4L2_PIX_FMT_CPIA1
+    { AV_FMT(NONE),        AV_CODEC(CPIA),        V4L2_FMT(CPIA1) },
+#endif
+#ifdef V4L2_PIX_FMT_DV
+    { AV_FMT(NONE),        AV_CODEC(DVVIDEO),     V4L2_FMT(DV) },
+#endif
+#ifdef V4L2_PIX_FMT_MPEG1
+    { AV_FMT(NONE),        AV_CODEC(MPEG1VIDEO),  V4L2_FMT(MPEG1) },
+#endif
+#ifdef V4L2_PIX_FMT_MPEG2
+    { AV_FMT(NONE),        AV_CODEC(MPEG2VIDEO),  V4L2_FMT(MPEG2) },
+#endif
+#ifdef V4L2_PIX_FMT_VP8
+    { AV_FMT(NONE),        AV_CODEC(VP8),         V4L2_FMT(VP8) },
+#endif
+#ifdef V4L2_PIX_FMT_VP9
+    { AV_FMT(NONE),        AV_CODEC(VP9),         V4L2_FMT(VP9) },
+#endif
+#ifdef V4L2_PIX_FMT_HEVC
+    { AV_FMT(NONE),        AV_CODEC(HEVC),        V4L2_FMT(HEVC) },
+#endif
+#ifdef V4L2_PIX_FMT_VC1_ANNEX_G
+    { AV_FMT(NONE),        AV_CODEC(VC1),         V4L2_FMT(VC1_ANNEX_G) },
+#endif
+};
+
+uint32_t ff_v4l2_format_avcodec_to_v4l2(enum AVCodecID avcodec)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(fmt_map); i++) {
+        if (fmt_map[i].avcodec == avcodec)
+            return fmt_map[i].v4l2_fmt;
+    }
+    return 0;
+}
+
+uint32_t ff_v4l2_format_avfmt_to_v4l2(enum AVPixelFormat avfmt)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(fmt_map); i++) {
+        if (fmt_map[i].avfmt == avfmt)
+            return fmt_map[i].v4l2_fmt;
+    }
+    return 0;
+}
+
+enum AVPixelFormat ff_v4l2_format_v4l2_to_avfmt(uint32_t v4l2_fmt, enum AVCodecID avcodec)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(fmt_map); i++) {
+        if (fmt_map[i].avcodec  == avcodec &&
+            fmt_map[i].v4l2_fmt == v4l2_fmt)
+            return fmt_map[i].avfmt;
+    }
+    return AV_PIX_FMT_NONE;
+}
diff --git a/libavcodec/v4l2_fmt.h b/libavcodec/v4l2_fmt.h
new file mode 100644
index 0000000..0136002
--- /dev/null
+++ b/libavcodec/v4l2_fmt.h
@@ -0,0 +1,34 @@
+/*
+ * V4L2 format helper functions
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V4L2_FMT_H
+#define AVCODEC_V4L2_FMT_H
+
+#include "libavcodec/avcodec.h"
+#include "libavutil/pixfmt.h"
+
+enum AVPixelFormat ff_v4l2_format_v4l2_to_avfmt(uint32_t v4l2_fmt, enum AVCodecID avcodec);
+uint32_t ff_v4l2_format_avcodec_to_v4l2(enum AVCodecID avcodec);
+uint32_t ff_v4l2_format_avfmt_to_v4l2(enum AVPixelFormat avfmt);
+
+#endif /* AVCODEC_V4L2_FMT_H*/
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
new file mode 100644
index 0000000..427e165
--- /dev/null
+++ b/libavcodec/v4l2_m2m.c
@@ -0,0 +1,406 @@
+/*
+ * V4L mem2mem
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/videodev2.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include "libavcodec/avcodec.h"
+#include "libavcodec/internal.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixfmt.h"
+#include "v4l2_context.h"
+#include "v4l2_fmt.h"
+#include "v4l2_m2m.h"
+
+static inline int v4l2_splane_video(struct v4l2_capability *cap)
+{
+    if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
+        cap->capabilities & V4L2_CAP_STREAMING)
+        return 1;
+
+    if (cap->capabilities & V4L2_CAP_VIDEO_M2M)
+        return 1;
+
+    return 0;
+}
+
+static inline int v4l2_mplane_video(struct v4l2_capability *cap)
+{
+    if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE_MPLANE | V4L2_CAP_VIDEO_OUTPUT_MPLANE) &&
+        cap->capabilities & V4L2_CAP_STREAMING)
+        return 1;
+
+    if (cap->capabilities & V4L2_CAP_VIDEO_M2M_MPLANE)
+        return 1;
+
+    return 0;
+}
+
+static int v4l2_prepare_contexts(V4L2m2mContext* s)
+{
+    struct v4l2_capability cap;
+    int ret;
+
+    s->capture.done = s->output.done = 0;
+    s->capture.name = "capture";
+    s->output.name = "output ";
+    atomic_init(&s->refcount, 0);
+    sem_init(&s->refsync, 0, 0);
+
+    memset(&cap, 0, sizeof(cap));
+    ret = ioctl(s->fd, VIDIOC_QUERYCAP, &cap);
+    if (ret < 0)
+        return ret;
+
+    av_log(s->avctx, AV_LOG_INFO, "driver '%s' on card '%s'\n", cap.driver, cap.card);
+
+    if (v4l2_mplane_video(&cap)) {
+        s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+        s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+        return 0;
+    }
+
+    if (v4l2_splane_video(&cap)) {
+        s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+        s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+        return 0;
+    }
+
+    return AVERROR(EINVAL);
+}
+
+static int v4l2_probe_driver(V4L2m2mContext* s)
+{
+    int ret;
+
+    s->fd = open(s->devname, O_RDWR | O_NONBLOCK, 0);
+    if (s->fd < 0)
+        return AVERROR(errno);
+
+    ret = v4l2_prepare_contexts(s);
+    if (ret < 0)
+        goto done;
+
+    ret = ff_v4l2_context_get_format(&s->output);
+    if (ret) {
+        av_log(s->avctx, AV_LOG_DEBUG, "v4l2 output format not supported\n");
+        goto done;
+    }
+
+    ret = ff_v4l2_context_get_format(&s->capture);
+    if (ret) {
+        av_log(s->avctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
+        goto done;
+    }
+
+done:
+    if (close(s->fd) < 0) {
+        ret = AVERROR(errno);
+        av_log(s->avctx, AV_LOG_ERROR, "failure closing %s (%s)\n", s->devname, av_err2str(AVERROR(errno)));
+    }
+
+    s->fd = -1;
+
+    return ret;
+}
+
+static int v4l2_configure_contexts(V4L2m2mContext* s)
+{
+    void *log_ctx = s->avctx;
+    int ret;
+
+    s->fd = open(s->devname, O_RDWR | O_NONBLOCK, 0);
+    if (s->fd < 0)
+        return AVERROR(errno);
+
+    ret = v4l2_prepare_contexts(s);
+    if (ret < 0)
+        goto error;
+
+    ret = ff_v4l2_context_set_format(&s->output);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_ERROR, "can't set v4l2 output format\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_set_format(&s->capture);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_ERROR, "can't to set v4l2 capture format\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_init(&s->output);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_ERROR, "no v4l2 output context's buffers\n");
+        goto error;
+    }
+
+    /* decoder's buffers need to be updated at a later stage */
+    if (!av_codec_is_decoder(s->avctx->codec)) {
+        ret = ff_v4l2_context_init(&s->capture);
+        if (ret) {
+            av_log(log_ctx, AV_LOG_ERROR, "no v4l2 capture context's buffers\n");
+            goto error;
+        }
+    }
+
+    return 0;
+
+error:
+    if (close(s->fd) < 0) {
+        ret = AVERROR(errno);
+        av_log(log_ctx, AV_LOG_ERROR, "error closing %s (%s)\n",
+            s->devname, av_err2str(AVERROR(errno)));
+    }
+    s->fd = -1;
+
+    return ret;
+}
+
+/******************************************************************************
+ *
+ *                  V4L2 M2M Interface
+ *
+ ******************************************************************************/
+int ff_v4l2_m2m_codec_reinit(V4L2m2mContext* s)
+{
+    int ret;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "reinit context\n");
+
+    /* 1. streamoff */
+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+    if (ret)
+        av_log(s->avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
+
+    /* 2. unmap the capture buffers (v4l2 and ffmpeg):
+     *    we must wait for all references to be released before being allowed
+     *    to queue new buffers.
+     */
+    av_log(s->avctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
+    if (atomic_load(&s->refcount))
+        while(sem_wait(&s->refsync) == -1 && errno == EINTR);
+
+    ff_v4l2_context_release(&s->capture);
+
+    /* 3. get the new capture format */
+    ret = ff_v4l2_context_get_format(&s->capture);
+    if (ret) {
+        av_log(s->avctx, AV_LOG_ERROR, "query the new capture format\n");
+        return ret;
+    }
+
+    /* 4. set the capture format */
+    ret = ff_v4l2_context_set_format(&s->capture);
+    if (ret) {
+        av_log(s->avctx, AV_LOG_ERROR, "setting capture format\n");
+        return ret;
+    }
+
+    /* 5. complete reinit */
+    s->draining = 0;
+    s->reinit = 0;
+
+    return 0;
+}
+
+int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s)
+{
+    void *log_ctx = s->avctx;
+    int ret;
+
+    av_log(log_ctx, AV_LOG_DEBUG, "%s full reinit\n", s->devname);
+
+    /* wait for pending buffer references */
+    if (atomic_load(&s->refcount))
+        while(sem_wait(&s->refsync) == -1 && errno == EINTR);
+
+    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
+    if (ret) {
+        av_log(s->avctx, AV_LOG_ERROR, "output VIDIOC_STREAMOFF\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+    if (ret) {
+            av_log(s->avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
+            goto error;
+    }
+
+    /* release and unmmap the buffers */
+    ff_v4l2_context_release(&s->output);
+    ff_v4l2_context_release(&s->capture);
+
+    /* start again now that we know the stream dimensions */
+    s->draining = 0;
+    s->reinit = 0;
+
+    ret = ff_v4l2_context_get_format(&s->output);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_DEBUG, "v4l2 output format not supported\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_get_format(&s->capture);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_set_format(&s->output);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_ERROR, "can't set v4l2 output format\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_set_format(&s->capture);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_ERROR, "can't to set v4l2 capture format\n");
+        goto error;
+    }
+
+    ret = ff_v4l2_context_init(&s->output);
+    if (ret) {
+        av_log(log_ctx, AV_LOG_ERROR, "no v4l2 output context's buffers\n");
+        goto error;
+    }
+
+    /* decoder's buffers need to be updated at a later stage */
+    if (!av_codec_is_decoder(s->avctx->codec)) {
+        ret = ff_v4l2_context_init(&s->capture);
+        if (ret) {
+            av_log(log_ctx, AV_LOG_ERROR, "no v4l2 capture context's buffers\n");
+            goto error;
+        }
+    }
+
+    return 0;
+
+error:
+    return ret;
+}
+
+static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
+{
+    V4L2m2mContext *s = (V4L2m2mContext*)context;
+
+    ff_v4l2_context_release(&s->capture);
+    sem_destroy(&s->refsync);
+
+    close(s->fd);
+
+    av_free(s);
+}
+
+int ff_v4l2_m2m_codec_end(AVCodecContext *avctx)
+{
+    V4L2m2mPriv *priv = avctx->priv_data;
+    V4L2m2mContext* s = priv->context;
+    int ret;
+
+    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
+    if (ret)
+            av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
+
+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+    if (ret)
+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
+
+    ff_v4l2_context_release(&s->output);
+
+    s->self_ref = NULL;
+    av_buffer_unref(&priv->context_ref);
+
+    return 0;
+}
+
+int ff_v4l2_m2m_codec_init(AVCodecContext *avctx)
+{
+    int ret = AVERROR(EINVAL);
+    struct dirent *entry;
+    char node[PATH_MAX];
+    DIR *dirp;
+
+    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+    s->avctx = avctx;
+
+    dirp = opendir("/dev");
+    if (!dirp)
+        return AVERROR(errno);
+
+    for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
+
+        if (strncmp(entry->d_name, "video", 5))
+            continue;
+
+        snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
+        av_log(s->avctx, AV_LOG_DEBUG, "probing device %s\n", node);
+        strncpy(s->devname, node, strlen(node) + 1);
+        ret = v4l2_probe_driver(s);
+        if (!ret)
+                break;
+    }
+
+    closedir(dirp);
+
+    if (ret) {
+        av_log(s->avctx, AV_LOG_ERROR, "Could not find a valid device\n");
+        memset(s->devname, 0, sizeof(s->devname));
+
+        return ret;
+    }
+
+    av_log(s->avctx, AV_LOG_INFO, "Using device %s\n", node);
+
+    return v4l2_configure_contexts(s);
+}
+
+int ff_v4l2_m2m_create_context(AVCodecContext *avctx, V4L2m2mContext **s)
+{
+    V4L2m2mPriv *priv = avctx->priv_data;
+
+    *s = av_mallocz(sizeof(V4L2m2mContext));
+    if (!*s)
+        return AVERROR(ENOMEM);
+
+    priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
+                                         &v4l2_m2m_destroy_context, NULL, 0);
+    if (!priv->context_ref) {
+        av_freep(s);
+        return AVERROR(ENOMEM);
+    }
+
+    /* assign the context */
+    priv->context = *s;
+
+    /* populate it */
+    priv->context->capture.num_buffers = priv->num_capture_buffers;
+    priv->context->output.num_buffers  = priv->num_output_buffers;
+    priv->context->self_ref = priv->context_ref;
+
+    return 0;
+}
diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
new file mode 100644
index 0000000..0d4671b
--- /dev/null
+++ b/libavcodec/v4l2_m2m.h
@@ -0,0 +1,126 @@
+/*
+ * V4L2 mem2mem helper functions
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V4L2_M2M_H
+#define AVCODEC_V4L2_M2M_H
+
+#include <semaphore.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <linux/videodev2.h>
+
+#include "libavcodec/avcodec.h"
+#include "v4l2_context.h"
+
+#define container_of(ptr, type, member) ({ \
+        const __typeof__(((type *)0)->member ) *__mptr = (ptr); \
+        (type *)((char *)__mptr - offsetof(type,member) );})
+
+#define V4L_M2M_DEFAULT_OPTS \
+    { "num_output_buffers", "Number of buffers in the output context",\
+        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
+
+typedef struct V4L2m2mContext {
+    char devname[PATH_MAX];
+    int fd;
+
+    /* the codec context queues */
+    V4L2Context capture;
+    V4L2Context output;
+
+    /* dynamic stream reconfig */
+    AVCodecContext *avctx;
+    sem_t refsync;
+    atomic_uint refcount;
+    int reinit;
+
+    /* null frame/packet received */
+    int draining;
+
+    /* Reference to self; only valid while codec is active. */
+    AVBufferRef *self_ref;
+} V4L2m2mContext;
+
+typedef struct V4L2m2mPriv
+{
+    AVClass *class;
+
+    V4L2m2mContext *context;
+    AVBufferRef    *context_ref;
+
+    int num_output_buffers;
+    int num_capture_buffers;
+} V4L2m2mPriv;
+
+/**
+ * Allocate a new context and references for a V4L2 M2M instance.
+ *
+ * @param[in] ctx The AVCodecContext instantiated by the encoder/decoder.
+ * @param[out] ctx The V4L2m2mContext.
+ *
+ * @returns 0 in success, a negative error code otherwise.
+ */
+int ff_v4l2_m2m_create_context(AVCodecContext *avctx, V4L2m2mContext **s);
+
+
+/**
+ * Probes the video nodes looking for the required codec capabilities.
+ *
+ * @param[in] ctx The AVCodecContext instantiated by the encoder/decoder.
+ *
+ * @returns 0 if a driver is found, a negative number otherwise.
+ */
+int ff_v4l2_m2m_codec_init(AVCodecContext *avctx);
+
+/**
+ * Releases all the codec resources if all AVBufferRefs have been returned to the
+ * ctx. Otherwise keep the driver open.
+ *
+ * @param[in] The AVCodecContext instantiated by the encoder/decoder.
+ *
+ * @returns 0
+ *
+ */
+int ff_v4l2_m2m_codec_end(AVCodecContext *avctx);
+
+/**
+ * Reinitializes the V4L2m2mContext when the driver cannot continue processing
+ * with the capture parameters.
+ *
+ * @param[in] ctx The V4L2m2mContext instantiated by the encoder/decoder.
+ *
+ * @returns 0 in case of success, negative number otherwise
+ */
+int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
+
+/**
+ * Reinitializes the V4L2m2mContext when the driver cannot continue processing
+ * with the  any of the current V4L2Contexts (ie, changes in output and capture).
+ *
+ * @param[in] ctx The V4L2m2mContext instantiated by the encoder/decoder.
+ *
+ * @returns 0 in case of success, negative number otherwise
+ */
+int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+
+#endif /* AVCODEC_V4L2_M2M_H */
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
new file mode 100644
index 0000000..710e40e
--- /dev/null
+++ b/libavcodec/v4l2_m2m_dec.c
@@ -0,0 +1,240 @@
+/*
+ * V4L2 mem2mem decoders
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/videodev2.h>
+#include <sys/ioctl.h>
+#include "libavutil/pixfmt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/decode.h"
+
+#include "v4l2_context.h"
+#include "v4l2_m2m.h"
+#include "v4l2_fmt.h"
+
+static int v4l2_try_start(AVCodecContext *avctx)
+{
+    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+    V4L2Context *const capture = &s->capture;
+    V4L2Context *const output = &s->output;
+    struct v4l2_selection selection;
+    int ret;
+
+    /* 1. start the output process */
+    if (!output->streamon) {
+        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
+            return ret;
+        }
+    }
+
+    if (capture->streamon)
+        return 0;
+
+    /* 2. get the capture format */
+    capture->format.type = capture->type;
+    ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
+    if (ret) {
+        av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
+        return ret;
+    }
+
+    /* 2.1 update the AVCodecContext */
+    avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
+    capture->av_pix_fmt = avctx->pix_fmt;
+
+    /* 3. set the crop parameters */
+    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    selection.r.height = avctx->coded_height;
+    selection.r.width = avctx->coded_width;
+    ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
+    if (!ret) {
+        ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+        if (ret) {
+            av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
+            /* update the size of the resulting frame */
+            capture->height = selection.r.height;
+            capture->width  = selection.r.width;
+        }
+    }
+
+    /* 4. init the capture context now that we have the capture format */
+    if (!capture->buffers) {
+        ret = ff_v4l2_context_init(capture);
+        if (ret) {
+            av_log(avctx, AV_LOG_DEBUG, "can't request output buffers\n");
+            return ret;
+        }
+    }
+
+    /* 5. start the capture process */
+    ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+    if (ret) {
+        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
+        return ret;
+    }
+
+    return 0;
+}
+
+static int v4l2_prepare_decoder(V4L2m2mContext *s)
+{
+    struct v4l2_event_subscription sub;
+    V4L2Context *output = &s->output;
+    int ret;
+
+    /**
+     * requirements
+     */
+    memset(&sub, 0, sizeof(sub));
+    sub.type = V4L2_EVENT_SOURCE_CHANGE;
+    ret = ioctl(s->fd, VIDIOC_SUBSCRIBE_EVENT, &sub);
+    if ( ret < 0) {
+        if (output->height == 0 || output->width == 0) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                "the v4l2 driver does not support VIDIOC_SUBSCRIBE_EVENT\n"
+                "you must provide codec_height and codec_width on input\n");
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+    V4L2Context *const capture = &s->capture;
+    V4L2Context *const output = &s->output;
+    AVPacket avpkt = {0};
+    int ret;
+
+    ret = ff_decode_get_packet(avctx, &avpkt);
+    if (ret < 0 && ret != AVERROR_EOF)
+        return ret;
+
+    if (s->draining)
+        goto dequeue;
+
+    ret = ff_v4l2_context_enqueue_packet(output, &avpkt);
+    if (ret < 0) {
+        if (ret != AVERROR(ENOMEM))
+           return ret;
+        /* no input buffers available, continue dequeing */
+    }
+
+    if (avpkt.size) {
+        ret = v4l2_try_start(avctx);
+        if (ret) {
+            av_packet_unref(&avpkt);
+            return 0;
+        }
+    }
+
+dequeue:
+    av_packet_unref(&avpkt);
+    return ff_v4l2_context_dequeue_frame(capture, frame);
+}
+
+static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+{
+    V4L2Context *capture, *output;
+    V4L2m2mContext *s;
+    int ret;
+
+    ret = ff_v4l2_m2m_create_context(avctx, &s);
+    if (ret < 0)
+        return ret;
+
+    capture = &s->capture;
+    output = &s->output;
+
+    /* if these dimensions are invalid (ie, 0 or too small) an event will be raised
+     * by the v4l2 driver; this event will trigger a full pipeline reconfig and
+     * the proper values will be retrieved from the kernel driver.
+     */
+    output->height = capture->height = avctx->coded_height;
+    output->width = capture->width = avctx->coded_width;
+
+    output->av_codec_id = avctx->codec_id;
+    output->av_pix_fmt  = AV_PIX_FMT_NONE;
+
+    capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+    capture->av_pix_fmt = avctx->pix_fmt;
+
+    ret = ff_v4l2_m2m_codec_init(avctx);
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n");
+        return ret;
+    }
+
+    return v4l2_prepare_decoder(s);
+}
+
+#define OFFSET(x) offsetof(V4L2m2mPriv, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+
+static const AVOption options[] = {
+    V4L_M2M_DEFAULT_OPTS,
+    { "num_capture_buffers", "Number of buffers in the capture context",
+        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
+    { NULL},
+};
+
+#define M2MDEC(NAME, LONGNAME, CODEC, bsf_name) \
+static const AVClass v4l2_m2m_ ## NAME ## _dec_class = {\
+    .class_name = #NAME "_v4l2_m2m_decoder",\
+    .item_name  = av_default_item_name,\
+    .option     = options,\
+    .version    = LIBAVUTIL_VERSION_INT,\
+};\
+\
+AVCodec ff_ ## NAME ## _v4l2m2m_decoder = { \
+    .name           = #NAME "_v4l2m2m" ,\
+    .long_name      = NULL_IF_CONFIG_SMALL("V4L2 mem2mem " LONGNAME " decoder wrapper"),\
+    .type           = AVMEDIA_TYPE_VIDEO,\
+    .id             = CODEC ,\
+    .priv_data_size = sizeof(V4L2m2mPriv),\
+    .priv_class     = &v4l2_m2m_ ## NAME ## _dec_class,\
+    .init           = v4l2_decode_init,\
+    .receive_frame  = v4l2_receive_frame,\
+    .close          = ff_v4l2_m2m_codec_end,\
+    .bsfs           = bsf_name, \
+    .capabilities   = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | \
+                      AV_CODEC_CAP_AVOID_PROBING, \
+    .wrapper_name   = "v4l2m2m", \
+};
+
+M2MDEC(h264,  "H.264", AV_CODEC_ID_H264,       "h264_mp4toannexb");
+M2MDEC(hevc,  "HEVC",  AV_CODEC_ID_HEVC,       "hevc_mp4toannexb");
+M2MDEC(mpeg1, "MPEG1", AV_CODEC_ID_MPEG1VIDEO, NULL);
+M2MDEC(mpeg2, "MPEG2", AV_CODEC_ID_MPEG2VIDEO, NULL);
+M2MDEC(mpeg4, "MPEG4", AV_CODEC_ID_MPEG4,      NULL);
+M2MDEC(h263,  "H.263", AV_CODEC_ID_H263,       NULL);
+M2MDEC(vc1 ,  "VC1",   AV_CODEC_ID_VC1,        NULL);
+M2MDEC(vp8,   "VP8",   AV_CODEC_ID_VP8,        NULL);
+M2MDEC(vp9,   "VP9",   AV_CODEC_ID_VP9,        NULL);
diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
new file mode 100644
index 0000000..636e1a9
--- /dev/null
+++ b/libavcodec/v4l2_m2m_enc.c
@@ -0,0 +1,352 @@
+/*
+ * V4L2 mem2mem encoders
+ *
+ * Copyright (C) 2017 Alexis Ballier <aballier@gentoo.org>
+ * Copyright (C) 2017 Jorge Ramirez <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/videodev2.h>
+#include <sys/ioctl.h>
+#include <search.h>
+#include "libavcodec/avcodec.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/opt.h"
+#include "v4l2_context.h"
+#include "v4l2_m2m.h"
+
+#define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
+#define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+
+static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
+{
+    struct v4l2_streamparm parm = { 0 };
+
+    parm.type = V4L2_TYPE_IS_MULTIPLANAR(s->output.type) ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT;
+    parm.parm.output.timeperframe.denominator = den;
+    parm.parm.output.timeperframe.numerator = num;
+
+    if (ioctl(s->fd, VIDIOC_S_PARM, &parm) < 0)
+        av_log(s->avctx, AV_LOG_WARNING, "Failed to set timeperframe");
+}
+
+static inline void v4l2_set_ext_ctrl(V4L2m2mContext *s, unsigned int id, signed int value, const char *name)
+{
+    struct v4l2_ext_controls ctrls = { { 0 } };
+    struct v4l2_ext_control ctrl = { 0 };
+
+    /* set ctrls */
+    ctrls.ctrl_class = V4L2_CTRL_CLASS_MPEG;
+    ctrls.controls = &ctrl;
+    ctrls.count = 1;
+
+    /* set ctrl*/
+    ctrl.value = value;
+    ctrl.id = id ;
+
+    if (ioctl(s->fd, VIDIOC_S_EXT_CTRLS, &ctrls) < 0)
+        av_log(s->avctx, AV_LOG_WARNING, "Failed to set %s\n", name);
+    else
+        av_log(s->avctx, AV_LOG_DEBUG, "Encoder: %s = %d\n", name, value);
+}
+
+static inline int v4l2_get_ext_ctrl(V4L2m2mContext *s, unsigned int id, signed int *value, const char *name)
+{
+    struct v4l2_ext_controls ctrls = { { 0 } };
+    struct v4l2_ext_control ctrl = { 0 };
+    int ret;
+
+    /* set ctrls */
+    ctrls.ctrl_class = V4L2_CTRL_CLASS_MPEG;
+    ctrls.controls = &ctrl;
+    ctrls.count = 1;
+
+    /* set ctrl*/
+    ctrl.id = id ;
+
+    ret = ioctl(s->fd, VIDIOC_G_EXT_CTRLS, &ctrls);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_WARNING, "Failed to set %s\n", name);
+        return ret;
+    }
+
+    *value = ctrl.value;
+
+    return 0;
+}
+
+static inline unsigned int v4l2_h264_profile_from_ff(int p)
+{
+    static const struct h264_profile  {
+        unsigned int ffmpeg_val;
+        unsigned int v4l2_val;
+    } profile[] = {
+        { FF_PROFILE_H264_CONSTRAINED_BASELINE, MPEG_VIDEO(H264_PROFILE_CONSTRAINED_BASELINE) },
+        { FF_PROFILE_H264_HIGH_444_PREDICTIVE, MPEG_VIDEO(H264_PROFILE_HIGH_444_PREDICTIVE) },
+        { FF_PROFILE_H264_HIGH_422_INTRA, MPEG_VIDEO(H264_PROFILE_HIGH_422_INTRA) },
+        { FF_PROFILE_H264_HIGH_444_INTRA, MPEG_VIDEO(H264_PROFILE_HIGH_444_INTRA) },
+        { FF_PROFILE_H264_HIGH_10_INTRA, MPEG_VIDEO(H264_PROFILE_HIGH_10_INTRA) },
+        { FF_PROFILE_H264_HIGH_422, MPEG_VIDEO(H264_PROFILE_HIGH_422) },
+        { FF_PROFILE_H264_BASELINE, MPEG_VIDEO(H264_PROFILE_BASELINE) },
+        { FF_PROFILE_H264_EXTENDED, MPEG_VIDEO(H264_PROFILE_EXTENDED) },
+        { FF_PROFILE_H264_HIGH_10, MPEG_VIDEO(H264_PROFILE_HIGH_10) },
+        { FF_PROFILE_H264_MAIN, MPEG_VIDEO(H264_PROFILE_MAIN) },
+        { FF_PROFILE_H264_HIGH, MPEG_VIDEO(H264_PROFILE_HIGH) },
+    };
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(profile); i++) {
+        if (profile[i].ffmpeg_val == p)
+            return profile[i].v4l2_val;
+    }
+    return AVERROR(ENOENT);
+}
+
+static inline int v4l2_mpeg4_profile_from_ff(int p)
+{
+    static const struct mpeg4_profile {
+        unsigned int ffmpeg_val;
+        unsigned int v4l2_val;
+    } profile[] = {
+        { FF_PROFILE_MPEG4_ADVANCED_CODING, MPEG_VIDEO(MPEG4_PROFILE_ADVANCED_CODING_EFFICIENCY) },
+        { FF_PROFILE_MPEG4_ADVANCED_SIMPLE, MPEG_VIDEO(MPEG4_PROFILE_ADVANCED_SIMPLE) },
+        { FF_PROFILE_MPEG4_SIMPLE_SCALABLE, MPEG_VIDEO(MPEG4_PROFILE_SIMPLE_SCALABLE) },
+        { FF_PROFILE_MPEG4_SIMPLE, MPEG_VIDEO(MPEG4_PROFILE_SIMPLE) },
+        { FF_PROFILE_MPEG4_CORE, MPEG_VIDEO(MPEG4_PROFILE_CORE) },
+    };
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(profile); i++) {
+        if (profile[i].ffmpeg_val == p)
+            return profile[i].v4l2_val;
+    }
+    return AVERROR(ENOENT);
+}
+
+static int v4l2_check_b_frame_support(V4L2m2mContext *s)
+{
+    if (s->avctx->max_b_frames)
+        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
+
+    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames");
+    v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames");
+    if (s->avctx->max_b_frames == 0)
+        return 0;
+
+    avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
+
+    return AVERROR_PATCHWELCOME;
+}
+
+static int v4l2_prepare_encoder(V4L2m2mContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    int qmin_cid, qmax_cid, qmin, qmax;
+    int ret, val;
+
+    /**
+     * requirements
+     */
+    ret = v4l2_check_b_frame_support(s);
+    if (ret)
+        return ret;
+
+    /**
+     * settingss
+     */
+    if (avctx->framerate.num || avctx->framerate.den)
+        v4l2_set_timeperframe(s, avctx->framerate.num, avctx->framerate.den);
+
+    /* set ext ctrls */
+    v4l2_set_ext_ctrl(s, MPEG_CID(HEADER_MODE), MPEG_VIDEO(HEADER_MODE_SEPARATE), "header mode");
+    v4l2_set_ext_ctrl(s, MPEG_CID(BITRATE) , avctx->bit_rate, "bit rate");
+    v4l2_set_ext_ctrl(s, MPEG_CID(GOP_SIZE), avctx->gop_size,"gop size");
+
+    av_log(avctx, AV_LOG_DEBUG,
+        "Encoder Context: id (%d), profile (%d), frame rate(%d/%d), number b-frames (%d), "
+        "gop size (%d), bit rate (%"PRId64"), qmin (%d), qmax (%d)\n",
+        avctx->codec_id, avctx->profile, avctx->framerate.num, avctx->framerate.den,
+        avctx->max_b_frames, avctx->gop_size, avctx->bit_rate, avctx->qmin, avctx->qmax);
+
+    switch (avctx->codec_id) {
+    case AV_CODEC_ID_H264:
+        val = v4l2_h264_profile_from_ff(avctx->profile);
+        if (val < 0)
+            av_log(avctx, AV_LOG_WARNING, "h264 profile not found\n");
+        else
+            v4l2_set_ext_ctrl(s, MPEG_CID(H264_PROFILE), val, "h264 profile");
+        qmin_cid = MPEG_CID(H264_MIN_QP);
+        qmax_cid = MPEG_CID(H264_MAX_QP);
+        qmin = 0;
+        qmax = 51;
+        break;
+    case AV_CODEC_ID_MPEG4:
+        val = v4l2_mpeg4_profile_from_ff(avctx->profile);
+        if (val < 0)
+            av_log(avctx, AV_LOG_WARNING, "mpeg4 profile not found\n");
+        else
+            v4l2_set_ext_ctrl(s, MPEG_CID(MPEG4_PROFILE), val, "mpeg4 profile");
+        qmin_cid = MPEG_CID(MPEG4_MIN_QP);
+        qmax_cid = MPEG_CID(MPEG4_MAX_QP);
+        if (avctx->flags & AV_CODEC_FLAG_QPEL)
+            v4l2_set_ext_ctrl(s, MPEG_CID(MPEG4_QPEL), 1, "qpel");
+        qmin = 1;
+        qmax = 31;
+        break;
+    case AV_CODEC_ID_H263:
+        qmin_cid = MPEG_CID(H263_MIN_QP);
+        qmax_cid = MPEG_CID(H263_MAX_QP);
+        qmin = 1;
+        qmax = 31;
+        break;
+    case AV_CODEC_ID_VP8:
+        qmin_cid = MPEG_CID(VPX_MIN_QP);
+        qmax_cid = MPEG_CID(VPX_MAX_QP);
+        qmin = 0;
+        qmax = 127;
+        break;
+    case AV_CODEC_ID_VP9:
+        qmin_cid = MPEG_CID(VPX_MIN_QP);
+        qmax_cid = MPEG_CID(VPX_MAX_QP);
+        qmin = 0;
+        qmax = 255;
+        break;
+    default:
+        return 0;
+    }
+
+    if (qmin != avctx->qmin || qmax != avctx->qmax)
+        av_log(avctx, AV_LOG_WARNING, "Encoder adjusted: qmin (%d), qmax (%d)\n", qmin, qmax);
+
+    v4l2_set_ext_ctrl(s, qmin_cid, qmin, "minimum video quantizer scale");
+    v4l2_set_ext_ctrl(s, qmax_cid, qmax, "maximum video quantizer scale");
+
+    return 0;
+}
+
+static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+{
+    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+    V4L2Context *const output = &s->output;
+
+    return ff_v4l2_context_enqueue_frame(output, frame);
+}
+
+static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+{
+    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+    V4L2Context *const capture = &s->capture;
+    V4L2Context *const output = &s->output;
+    int ret;
+
+    if (s->draining)
+        goto dequeue;
+
+    if (!output->streamon) {
+        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+        if (ret) {
+            av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF failed on output context\n");
+            return ret;
+        }
+    }
+
+    if (!capture->streamon) {
+        ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+        if (ret) {
+            av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON failed on capture context\n");
+            return ret;
+        }
+    }
+
+dequeue:
+    return ff_v4l2_context_dequeue_packet(capture, avpkt);
+}
+
+static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+{
+    V4L2Context *capture, *output;
+    V4L2m2mContext *s;
+    int ret;
+
+    ret = ff_v4l2_m2m_create_context(avctx, &s);
+    if (ret < 0)
+        return ret;
+
+    capture = &s->capture;
+    output  = &s->output;
+
+    /* common settings output/capture */
+    output->height = capture->height = avctx->height;
+    output->width = capture->width = avctx->width;
+
+    /* output context */
+    output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+    output->av_pix_fmt = avctx->pix_fmt;
+
+    /* capture context */
+    capture->av_codec_id = avctx->codec_id;
+    capture->av_pix_fmt = AV_PIX_FMT_NONE;
+
+    ret = ff_v4l2_m2m_codec_init(avctx);
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "can't configure encoder\n");
+        return ret;
+    }
+
+    return v4l2_prepare_encoder(s);
+}
+
+#define OFFSET(x) offsetof(V4L2m2mPriv, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+static const AVOption options[] = {
+    V4L_M2M_DEFAULT_OPTS,
+    { "num_capture_buffers", "Number of buffers in the capture context",
+        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS },
+    { NULL },
+};
+
+#define M2MENC(NAME, LONGNAME, CODEC) \
+static const AVClass v4l2_m2m_ ## NAME ## _enc_class = {\
+    .class_name = #NAME "_v4l2_m2m_encoder",\
+    .item_name  = av_default_item_name,\
+    .option     = options,\
+    .version    = LIBAVUTIL_VERSION_INT,\
+};\
+\
+AVCodec ff_ ## NAME ## _v4l2m2m_encoder = { \
+    .name           = #NAME "_v4l2m2m" ,\
+    .long_name      = NULL_IF_CONFIG_SMALL("V4L2 mem2mem " LONGNAME " encoder wrapper"),\
+    .type           = AVMEDIA_TYPE_VIDEO,\
+    .id             = CODEC ,\
+    .priv_data_size = sizeof(V4L2m2mPriv),\
+    .priv_class     = &v4l2_m2m_ ## NAME ##_enc_class,\
+    .init           = v4l2_encode_init,\
+    .send_frame     = v4l2_send_frame,\
+    .receive_packet = v4l2_receive_packet,\
+    .close          = ff_v4l2_m2m_codec_end,\
+    .capabilities   = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY, \
+    .wrapper_name   = "v4l2m2m", \
+};
+
+M2MENC(mpeg4,"MPEG4", AV_CODEC_ID_MPEG4);
+M2MENC(h263, "H.263", AV_CODEC_ID_H263);
+M2MENC(h264, "H.264", AV_CODEC_ID_H264);
+M2MENC(hevc, "HEVC",  AV_CODEC_ID_HEVC);
+M2MENC(vp8,  "VP8",   AV_CODEC_ID_VP8);
diff --git a/libavcodec/vaapi.h b/libavcodec/vaapi.h
index 391368c..2cf7da5 100644
--- a/libavcodec/vaapi.h
+++ b/libavcodec/vaapi.h
@@ -1,23 +1,23 @@
 /*
- * Video Acceleration API (shared data between Libav and the video player)
+ * Video Acceleration API (shared data between FFmpeg and the video player)
  * HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,12 +31,10 @@
  */
 
 #include <stdint.h>
-
 #include "libavutil/attributes.h"
-
 #include "version.h"
 
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
 
 /**
  * @defgroup lavc_codec_hwaccel_vaapi VA API Decoding
@@ -45,7 +43,7 @@
  */
 
 /**
- * This structure is used to share data between the Libav library and
+ * This structure is used to share data between the FFmpeg library and
  * the client video application.
  * This shall be zero-allocated and available as
  * AVCodecContext.hwaccel_context. All user members can be set once
@@ -79,105 +77,10 @@ struct attribute_deprecated vaapi_context {
      * - decoding: Set by user
      */
     uint32_t context_id;
-
-    /**
-     * VAPictureParameterBuffer ID
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    uint32_t pic_param_buf_id;
-
-    /**
-     * VAIQMatrixBuffer ID
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    uint32_t iq_matrix_buf_id;
-
-    /**
-     * VABitPlaneBuffer ID (for VC-1 decoding)
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    uint32_t bitplane_buf_id;
-
-    /**
-     * Slice parameter/data buffer IDs
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    uint32_t *slice_buf_ids;
-
-    /**
-     * Number of effective slice buffer IDs to send to the HW
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    unsigned int n_slice_buf_ids;
-
-    /**
-     * Size of pre-allocated slice_buf_ids
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    unsigned int slice_buf_ids_alloc;
-
-    /**
-     * Pointer to VASliceParameterBuffers
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    void *slice_params;
-
-    /**
-     * Size of a VASliceParameterBuffer element
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    unsigned int slice_param_size;
-
-    /**
-     * Size of pre-allocated slice_params
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    unsigned int slice_params_alloc;
-
-    /**
-     * Number of slices currently filled in
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    unsigned int slice_count;
-
-    /**
-     * Pointer to slice data buffer base
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    const uint8_t *slice_data;
-
-    /**
-     * Current size of slice data
-     *
-     * - encoding: unused
-     * - decoding: Set by libavcodec
-     */
-    uint32_t slice_data_size;
 };
 
 /* @} */
 
-#endif /* FF_API_VAAPI_CONTEXT */
+#endif /* FF_API_STRUCT_VAAPI_CONTEXT */
 
 #endif /* AVCODEC_VAAPI_H */
diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
index cc79875..69512e1 100644
--- a/libavcodec/vaapi_decode.c
+++ b/libavcodec/vaapi_decode.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -189,23 +189,19 @@ int ff_vaapi_decode_issue(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Failed to end picture decode "
                "issue: %d (%s).\n", vas, vaErrorStr(vas));
         err = AVERROR(EIO);
-        if (HAVE_VAAPI_1 || ctx->hwctx->driver_quirks &
+        if (CONFIG_VAAPI_1 || ctx->hwctx->driver_quirks &
             AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS)
             goto fail;
         else
             goto fail_at_end;
     }
 
-    if (HAVE_VAAPI_1 || ctx->hwctx->driver_quirks &
+    if (CONFIG_VAAPI_1 || ctx->hwctx->driver_quirks &
         AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS)
         ff_vaapi_decode_destroy_buffers(avctx, pic);
 
-    pic->nb_param_buffers = 0;
-    pic->nb_slices        = 0;
-    pic->slices_allocated = 0;
-    av_freep(&pic->slice_buffers);
-
-    return 0;
+    err = 0;
+    goto exit;
 
 fail_with_picture:
     vas = vaEndPicture(ctx->hwctx->display, ctx->va_context);
@@ -216,6 +212,12 @@ fail_with_picture:
 fail:
     ff_vaapi_decode_destroy_buffers(avctx, pic);
 fail_at_end:
+exit:
+    pic->nb_param_buffers = 0;
+    pic->nb_slices        = 0;
+    pic->slices_allocated = 0;
+    av_freep(&pic->slice_buffers);
+
     return err;
 }
 
@@ -233,6 +235,132 @@ int ff_vaapi_decode_cancel(AVCodecContext *avctx,
 }
 
 static const struct {
+    uint32_t fourcc;
+    enum AVPixelFormat pix_fmt;
+} vaapi_format_map[] = {
+#define MAP(va, av) { VA_FOURCC_ ## va, AV_PIX_FMT_ ## av }
+    // 4:0:0
+    MAP(Y800, GRAY8),
+    // 4:2:0
+    MAP(NV12, NV12),
+    MAP(YV12, YUV420P),
+    MAP(IYUV, YUV420P),
+#ifdef VA_FOURCC_I420
+    MAP(I420, YUV420P),
+#endif
+    MAP(IMC3, YUV420P),
+    // 4:1:1
+    MAP(411P, YUV411P),
+    // 4:2:2
+    MAP(422H, YUV422P),
+#ifdef VA_FOURCC_YV16
+    MAP(YV16, YUV422P),
+#endif
+    // 4:4:0
+    MAP(422V, YUV440P),
+    // 4:4:4
+    MAP(444P, YUV444P),
+    // 4:2:0 10-bit
+#ifdef VA_FOURCC_P010
+    MAP(P010, P010),
+#endif
+#ifdef VA_FOURCC_I010
+    MAP(I010, YUV420P10),
+#endif
+#undef MAP
+};
+
+static int vaapi_decode_find_best_format(AVCodecContext *avctx,
+                                         AVHWDeviceContext *device,
+                                         VAConfigID config_id,
+                                         AVHWFramesContext *frames)
+{
+    AVVAAPIDeviceContext *hwctx = device->hwctx;
+    VAStatus vas;
+    VASurfaceAttrib *attr;
+    enum AVPixelFormat source_format, best_format, format;
+    uint32_t best_fourcc, fourcc;
+    int i, j, nb_attr;
+
+    source_format = avctx->sw_pix_fmt;
+    av_assert0(source_format != AV_PIX_FMT_NONE);
+
+    vas = vaQuerySurfaceAttributes(hwctx->display, config_id,
+                                   NULL, &nb_attr);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query surface attributes: "
+               "%d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR(ENOSYS);
+    }
+
+    attr = av_malloc_array(nb_attr, sizeof(*attr));
+    if (!attr)
+        return AVERROR(ENOMEM);
+
+    vas = vaQuerySurfaceAttributes(hwctx->display, config_id,
+                                   attr, &nb_attr);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query surface attributes: "
+               "%d (%s).\n", vas, vaErrorStr(vas));
+        av_freep(&attr);
+        return AVERROR(ENOSYS);
+    }
+
+    best_format = AV_PIX_FMT_NONE;
+
+    for (i = 0; i < nb_attr; i++) {
+        if (attr[i].type != VASurfaceAttribPixelFormat)
+            continue;
+
+        fourcc = attr[i].value.value.i;
+        for (j = 0; j < FF_ARRAY_ELEMS(vaapi_format_map); j++) {
+            if (fourcc == vaapi_format_map[j].fourcc)
+                break;
+        }
+        if (j >= FF_ARRAY_ELEMS(vaapi_format_map)) {
+            av_log(avctx, AV_LOG_DEBUG, "Ignoring unknown format %#x.\n",
+                   fourcc);
+            continue;
+        }
+        format = vaapi_format_map[j].pix_fmt;
+        av_log(avctx, AV_LOG_DEBUG, "Considering format %#x -> %s.\n",
+               fourcc, av_get_pix_fmt_name(format));
+
+        best_format = av_find_best_pix_fmt_of_2(format, best_format,
+                                                source_format, 0, NULL);
+        if (format == best_format)
+            best_fourcc = fourcc;
+    }
+
+    av_freep(&attr);
+
+    if (best_format == AV_PIX_FMT_NONE) {
+        av_log(avctx, AV_LOG_ERROR, "No usable formats for decoding!\n");
+        return AVERROR(EINVAL);
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "Picked %s (%#x) as best match for %s.\n",
+           av_get_pix_fmt_name(best_format), best_fourcc,
+           av_get_pix_fmt_name(source_format));
+
+    frames->sw_format = best_format;
+    if (avctx->internal->hwaccel_priv_data) {
+        VAAPIDecodeContext    *ctx = avctx->internal->hwaccel_priv_data;
+        AVVAAPIFramesContext *avfc = frames->hwctx;
+
+        ctx->pixel_format_attribute = (VASurfaceAttrib) {
+            .type          = VASurfaceAttribPixelFormat,
+            .value.value.i = best_fourcc,
+        };
+
+        avfc->attributes    = &ctx->pixel_format_attribute;
+        avfc->nb_attributes = 1;
+    }
+
+    return 0;
+}
+
+static const struct {
     enum AVCodecID codec_id;
     int codec_profile;
     VAProfile va_profile;
@@ -253,6 +381,8 @@ static const struct {
     MAP(HEVC,        HEVC_MAIN,       HEVCMain    ),
     MAP(HEVC,        HEVC_MAIN_10,    HEVCMain10  ),
 #endif
+    MAP(MJPEG,       MJPEG_HUFFMAN_BASELINE_DCT,
+                                      JPEGBaseline),
     MAP(WMV3,        VC1_SIMPLE,      VC1Simple   ),
     MAP(WMV3,        VC1_MAIN,        VC1Main     ),
     MAP(WMV3,        VC1_COMPLEX,     VC1Advanced ),
@@ -261,12 +391,13 @@ static const struct {
     MAP(VC1,         VC1_MAIN,        VC1Main     ),
     MAP(VC1,         VC1_COMPLEX,     VC1Advanced ),
     MAP(VC1,         VC1_ADVANCED,    VC1Advanced ),
-#if VA_CHECK_VERSION(0, 35, 0)
     MAP(VP8,         UNKNOWN,       VP8Version0_3 ),
-#endif
 #if VA_CHECK_VERSION(0, 38, 0)
     MAP(VP9,         VP9_0,           VP9Profile0 ),
 #endif
+#if VA_CHECK_VERSION(0, 39, 0)
+    MAP(VP9,         VP9_2,           VP9Profile2 ),
+#endif
 #undef MAP
 };
 
@@ -286,7 +417,6 @@ static int vaapi_decode_make_config(AVCodecContext *avctx,
     const AVCodecDescriptor *codec_desc;
     VAProfile *profile_list = NULL, matched_va_profile;
     int profile_count, exact_match, matched_ff_profile;
-    const AVPixFmtDescriptor *sw_desc, *desc;
 
     AVHWDeviceContext    *device = (AVHWDeviceContext*)device_ref->data;
     AVVAAPIDeviceContext *hwctx = device->hwctx;
@@ -414,27 +544,10 @@ static int vaapi_decode_make_config(AVCodecContext *avctx,
         frames->width = avctx->coded_width;
         frames->height = avctx->coded_height;
 
-        // Find the first format in the list which matches the expected
-        // bit depth and subsampling.  If none are found (this can happen
-        // when 10-bit streams are decoded to 8-bit surfaces, for example)
-        // then just take the first format on the list.
-        frames->sw_format = constraints->valid_sw_formats[0];
-        sw_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
-        for (i = 0; constraints->valid_sw_formats[i] != AV_PIX_FMT_NONE; i++) {
-            desc = av_pix_fmt_desc_get(constraints->valid_sw_formats[i]);
-            if (desc->nb_components != sw_desc->nb_components ||
-                desc->log2_chroma_w != sw_desc->log2_chroma_w ||
-                desc->log2_chroma_h != sw_desc->log2_chroma_h)
-                continue;
-            for (j = 0; j < desc->nb_components; j++) {
-                if (desc->comp[j].depth != sw_desc->comp[j].depth)
-                    break;
-            }
-            if (j < desc->nb_components)
-                continue;
-            frames->sw_format = constraints->valid_sw_formats[i];
-            break;
-        }
+        err = vaapi_decode_find_best_format(avctx, device,
+                                            *va_config, frames);
+        if (err < 0)
+            goto fail;
 
         frames->initial_pool_size = 1;
         // Add per-codec number of surfaces used for storing reference frames.
@@ -503,7 +616,7 @@ int ff_vaapi_decode_init(AVCodecContext *avctx)
     ctx->va_config  = VA_INVALID_ID;
     ctx->va_context = VA_INVALID_ID;
 
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
     if (avctx->hwaccel_context) {
         av_log(avctx, AV_LOG_WARNING, "Using deprecated struct "
                "vaapi_context in decode.\n");
@@ -533,7 +646,7 @@ int ff_vaapi_decode_init(AVCodecContext *avctx)
     }
 #endif
 
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
     if (ctx->have_old_context) {
         ctx->va_config  = ctx->old_context->config_id;
         ctx->va_context = ctx->old_context->context_id;
@@ -572,7 +685,7 @@ int ff_vaapi_decode_init(AVCodecContext *avctx)
 
     av_log(avctx, AV_LOG_DEBUG, "Decode context initialised: "
            "%#x/%#x.\n", ctx->va_config, ctx->va_context);
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
     }
 #endif
 
@@ -588,7 +701,7 @@ int ff_vaapi_decode_uninit(AVCodecContext *avctx)
     VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data;
     VAStatus vas;
 
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
     if (ctx->have_old_context) {
         av_buffer_unref(&ctx->device_ref);
     } else {
@@ -611,7 +724,7 @@ int ff_vaapi_decode_uninit(AVCodecContext *avctx)
         }
     }
 
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
     }
 #endif
 
diff --git a/libavcodec/vaapi_decode.h b/libavcodec/vaapi_decode.h
index fda228b..6b415dd 100644
--- a/libavcodec/vaapi_decode.h
+++ b/libavcodec/vaapi_decode.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,12 +24,11 @@
 #include "libavutil/frame.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_vaapi.h"
-#include "libavutil/internal.h"
 
 #include "avcodec.h"
 
 #include "version.h"
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
 #include "vaapi.h"
 #endif
 
@@ -57,7 +56,7 @@ typedef struct VAAPIDecodeContext {
     VAConfigID            va_config;
     VAContextID           va_context;
 
-#if FF_API_VAAPI_CONTEXT
+#if FF_API_STRUCT_VAAPI_CONTEXT
 FF_DISABLE_DEPRECATION_WARNINGS
     int                   have_old_context;
     struct vaapi_context *old_context;
@@ -73,6 +72,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     enum AVPixelFormat    surface_format;
     int                   surface_count;
+
+    VASurfaceAttrib       pixel_format_attribute;
 } VAAPIDecodeContext;
 
 
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 398b8e1..2dda451 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -158,16 +158,10 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, ".\n");
     }
 
-    av_assert0(pic->input_available && !pic->encode_issued);
+    av_assert0(!pic->encode_issued);
     for (i = 0; i < pic->nb_refs; i++) {
         av_assert0(pic->refs[i]);
-        // If we are serialised then the references must have already
-        // completed.  If not, they must have been issued but need not
-        // have completed yet.
-        if (ctx->issue_mode == ISSUE_MODE_SERIALISE_EVERYTHING)
-            av_assert0(pic->refs[i]->encode_complete);
-        else
-            av_assert0(pic->refs[i]->encode_issued);
+        av_assert0(pic->refs[i]->encode_issued);
     }
 
     av_log(avctx, AV_LOG_DEBUG, "Input surface is %#x.\n", pic->input_surface);
@@ -207,9 +201,16 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
 
     pic->nb_param_buffers = 0;
 
-    if (pic->encode_order == 0) {
-        // Global parameter buffers are set on the first picture only.
+    if (pic->type == PICTURE_TYPE_IDR && ctx->codec->init_sequence_params) {
+        err = vaapi_encode_make_param_buffer(avctx, pic,
+                                             VAEncSequenceParameterBufferType,
+                                             ctx->codec_sequence_params,
+                                             ctx->codec->sequence_params_size);
+        if (err < 0)
+            goto fail;
+    }
 
+    if (pic->type == PICTURE_TYPE_IDR) {
         for (i = 0; i < ctx->nb_global_params; i++) {
             err = vaapi_encode_make_param_buffer(avctx, pic,
                                                  VAEncMiscParameterBufferType,
@@ -220,15 +221,6 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
         }
     }
 
-    if (pic->type == PICTURE_TYPE_IDR && ctx->codec->init_sequence_params) {
-        err = vaapi_encode_make_param_buffer(avctx, pic,
-                                             VAEncSequenceParameterBufferType,
-                                             ctx->codec_sequence_params,
-                                             ctx->codec->sequence_params_size);
-        if (err < 0)
-            goto fail;
-    }
-
     if (ctx->codec->init_picture_params) {
         err = ctx->codec->init_picture_params(avctx, pic);
         if (err < 0) {
@@ -321,16 +313,60 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
         }
     }
 
+    if (pic->nb_slices == 0)
+        pic->nb_slices = ctx->nb_slices;
     if (pic->nb_slices > 0) {
+        int rounding;
+
         pic->slices = av_mallocz_array(pic->nb_slices, sizeof(*pic->slices));
         if (!pic->slices) {
             err = AVERROR(ENOMEM);
             goto fail;
         }
+
+        for (i = 0; i < pic->nb_slices; i++)
+            pic->slices[i].row_size = ctx->slice_size;
+
+        rounding = ctx->slice_block_rows - ctx->nb_slices * ctx->slice_size;
+        if (rounding > 0) {
+            // Place rounding error at top and bottom of frame.
+            av_assert0(rounding < pic->nb_slices);
+            // Some Intel drivers contain a bug where the encoder will fail
+            // if the last slice is smaller than the one before it.  Since
+            // that's straightforward to avoid here, just do so.
+            if (rounding <= 2) {
+                for (i = 0; i < rounding; i++)
+                    ++pic->slices[i].row_size;
+            } else {
+                for (i = 0; i < (rounding + 1) / 2; i++)
+                    ++pic->slices[pic->nb_slices - i - 1].row_size;
+                for (i = 0; i < rounding / 2; i++)
+                    ++pic->slices[i].row_size;
+            }
+        } else if (rounding < 0) {
+            // Remove rounding error from last slice only.
+            av_assert0(rounding < ctx->slice_size);
+            pic->slices[pic->nb_slices - 1].row_size += rounding;
+        }
     }
     for (i = 0; i < pic->nb_slices; i++) {
         slice = &pic->slices[i];
         slice->index = i;
+        if (i == 0) {
+            slice->row_start   = 0;
+            slice->block_start = 0;
+        } else {
+            const VAAPIEncodeSlice *prev = &pic->slices[i - 1];
+            slice->row_start   = prev->row_start   + prev->row_size;
+            slice->block_start = prev->block_start + prev->block_size;
+        }
+        slice->block_size  = slice->row_size * ctx->slice_block_cols;
+
+        av_log(avctx, AV_LOG_DEBUG, "Slice %d: %d-%d (%d rows), "
+               "%d-%d (%d blocks).\n", i, slice->row_start,
+               slice->row_start + slice->row_size - 1, slice->row_size,
+               slice->block_start, slice->block_start + slice->block_size - 1,
+               slice->block_size);
 
         if (ctx->codec->slice_params_size > 0) {
             slice->codec_slice_params = av_mallocz(ctx->codec->slice_params_size);
@@ -343,7 +379,7 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
         if (ctx->codec->init_slice_params) {
             err = ctx->codec->init_slice_params(avctx, pic, slice);
             if (err < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Failed to initalise slice "
+                av_log(avctx, AV_LOG_ERROR, "Failed to initialise slice "
                        "parameters: %d.\n", err);
                 goto fail;
             }
@@ -401,14 +437,14 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
         err = AVERROR(EIO);
         // vaRenderPicture() has been called here, so we should not destroy
         // the parameter buffers unless separate destruction is required.
-        if (HAVE_VAAPI_1 || ctx->hwctx->driver_quirks &
+        if (CONFIG_VAAPI_1 || ctx->hwctx->driver_quirks &
             AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS)
             goto fail;
         else
             goto fail_at_end;
     }
 
-    if (HAVE_VAAPI_1 || ctx->hwctx->driver_quirks &
+    if (CONFIG_VAAPI_1 || ctx->hwctx->driver_quirks &
         AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS) {
         for (i = 0; i < pic->nb_param_buffers; i++) {
             vas = vaDestroyBuffer(ctx->hwctx->display,
@@ -424,10 +460,7 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
 
     pic->encode_issued = 1;
 
-    if (ctx->issue_mode == ISSUE_MODE_SERIALISE_EVERYTHING)
-        return vaapi_encode_wait(avctx, pic);
-    else
-        return 0;
+    return 0;
 
 fail_with_picture:
     vaEndPicture(ctx->hwctx->display, ctx->va_context);
@@ -528,14 +561,23 @@ static int vaapi_encode_discard(AVCodecContext *avctx,
     return 0;
 }
 
-static VAAPIEncodePicture *vaapi_encode_alloc(void)
+static VAAPIEncodePicture *vaapi_encode_alloc(AVCodecContext *avctx)
 {
+    VAAPIEncodeContext *ctx = avctx->priv_data;
     VAAPIEncodePicture *pic;
 
     pic = av_mallocz(sizeof(*pic));
     if (!pic)
         return NULL;
 
+    if (ctx->codec->picture_priv_data_size > 0) {
+        pic->priv_data = av_mallocz(ctx->codec->picture_priv_data_size);
+        if (!pic->priv_data) {
+            av_freep(&pic);
+            return NULL;
+        }
+    }
+
     pic->input_surface = VA_INVALID_ID;
     pic->recon_surface = VA_INVALID_ID;
     pic->output_buffer = VA_INVALID_ID;
@@ -575,315 +617,330 @@ static int vaapi_encode_free(AVCodecContext *avctx,
     return 0;
 }
 
-static int vaapi_encode_step(AVCodecContext *avctx,
-                             VAAPIEncodePicture *target)
+static void vaapi_encode_add_ref(AVCodecContext *avctx,
+                                 VAAPIEncodePicture *pic,
+                                 VAAPIEncodePicture *target,
+                                 int is_ref, int in_dpb, int prev)
 {
-    VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodePicture *pic;
-    int i, err;
-
-    if (ctx->issue_mode == ISSUE_MODE_SERIALISE_EVERYTHING ||
-        ctx->issue_mode == ISSUE_MODE_MINIMISE_LATENCY) {
-        // These two modes are equivalent, except that we wait for
-        // immediate completion on each operation if serialised.
-
-        if (!target) {
-            // No target, nothing to do yet.
-            return 0;
-        }
+    int refs = 0;
 
-        if (target->encode_complete) {
-            // Already done.
-            return 0;
-        }
-
-        pic = target;
-        for (i = 0; i < pic->nb_refs; i++) {
-            if (!pic->refs[i]->encode_complete) {
-                err = vaapi_encode_step(avctx, pic->refs[i]);
-                if (err < 0)
-                    return err;
-            }
-        }
-
-        err = vaapi_encode_issue(avctx, pic);
-        if (err < 0)
-            return err;
-
-    } else if (ctx->issue_mode == ISSUE_MODE_MAXIMISE_THROUGHPUT) {
-        int activity;
-
-        // Run through the list of all available pictures repeatedly
-        // and issue the first one found which has all dependencies
-        // available (including previously-issued but not necessarily
-        // completed pictures).
-        do {
-            activity = 0;
-            for (pic = ctx->pic_start; pic; pic = pic->next) {
-                if (!pic->input_available || pic->encode_issued)
-                    continue;
-                for (i = 0; i < pic->nb_refs; i++) {
-                    if (!pic->refs[i]->encode_issued)
-                        break;
-                }
-                if (i < pic->nb_refs)
-                    continue;
-                err = vaapi_encode_issue(avctx, pic);
-                if (err < 0)
-                    return err;
-                activity = 1;
-                // Start again from the beginning of the list,
-                // because issuing this picture may have satisfied
-                // forward dependencies of earlier ones.
-                break;
-            }
-        } while(activity);
+    if (is_ref) {
+        av_assert0(pic != target);
+        av_assert0(pic->nb_refs < MAX_PICTURE_REFERENCES);
+        pic->refs[pic->nb_refs++] = target;
+        ++refs;
+    }
 
-        // If we had a defined target for this step then it will
-        // always have been issued by now.
-        if (target) {
-            av_assert0(target->encode_issued && "broken dependencies?");
-        }
+    if (in_dpb) {
+        av_assert0(pic->nb_dpb_pics < MAX_DPB_SIZE);
+        pic->dpb[pic->nb_dpb_pics++] = target;
+        ++refs;
+    }
 
-    } else {
-        av_assert0(0);
+    if (prev) {
+        av_assert0(!pic->prev);
+        pic->prev = target;
+        ++refs;
     }
 
-    return 0;
+    target->ref_count[0] += refs;
+    target->ref_count[1] += refs;
 }
 
-static int vaapi_encode_get_next(AVCodecContext *avctx,
-                                 VAAPIEncodePicture **pic_out)
+static void vaapi_encode_remove_refs(AVCodecContext *avctx,
+                                     VAAPIEncodePicture *pic,
+                                     int level)
 {
-    VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodePicture *start, *end, *pic;
     int i;
 
-    for (pic = ctx->pic_start; pic; pic = pic->next) {
-        if (pic->next)
-            av_assert0(pic->display_order + 1 == pic->next->display_order);
-        if (pic->display_order == ctx->input_order) {
-            *pic_out = pic;
-            return 0;
-        }
+    if (pic->ref_removed[level])
+        return;
+
+    for (i = 0; i < pic->nb_refs; i++) {
+        av_assert0(pic->refs[i]);
+        --pic->refs[i]->ref_count[level];
+        av_assert0(pic->refs[i]->ref_count[level] >= 0);
     }
 
-    pic = vaapi_encode_alloc();
-    if (!pic)
-        return AVERROR(ENOMEM);
+    for (i = 0; i < pic->nb_dpb_pics; i++) {
+        av_assert0(pic->dpb[i]);
+        --pic->dpb[i]->ref_count[level];
+        av_assert0(pic->dpb[i]->ref_count[level] >= 0);
+    }
 
-    if (ctx->input_order == 0 || ctx->force_idr ||
-        ctx->gop_counter >= avctx->gop_size) {
-        pic->type = PICTURE_TYPE_IDR;
-        ctx->force_idr = 0;
-        ctx->gop_counter = 1;
-        ctx->p_counter = 0;
-    } else if (ctx->p_counter >= ctx->p_per_i) {
-        pic->type = PICTURE_TYPE_I;
-        ++ctx->gop_counter;
-        ctx->p_counter = 0;
-    } else {
-        pic->type = PICTURE_TYPE_P;
-        pic->refs[0] = ctx->pic_end;
-        pic->nb_refs = 1;
-        ++ctx->gop_counter;
-        ++ctx->p_counter;
+    av_assert0(pic->prev || pic->type == PICTURE_TYPE_IDR);
+    if (pic->prev) {
+        --pic->prev->ref_count[level];
+        av_assert0(pic->prev->ref_count[level] >= 0);
     }
-    start = end = pic;
 
-    if (pic->type != PICTURE_TYPE_IDR) {
-        // If that was not an IDR frame, add B-frames display-before and
-        // encode-after it, but not exceeding the GOP size.
+    pic->ref_removed[level] = 1;
+}
 
-        for (i = 0; i < ctx->b_per_p &&
-             ctx->gop_counter < avctx->gop_size; i++) {
-            pic = vaapi_encode_alloc();
-            if (!pic)
-                goto fail;
+static void vaapi_encode_set_b_pictures(AVCodecContext *avctx,
+                                        VAAPIEncodePicture *start,
+                                        VAAPIEncodePicture *end,
+                                        VAAPIEncodePicture *prev,
+                                        int current_depth,
+                                        VAAPIEncodePicture **last)
+{
+    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAAPIEncodePicture *pic, *next, *ref;
+    int i, len;
 
-            pic->type = PICTURE_TYPE_B;
-            pic->refs[0] = ctx->pic_end;
-            pic->refs[1] = end;
-            pic->nb_refs = 2;
+    av_assert0(start && end && start != end && start->next != end);
 
-            pic->next = start;
-            pic->display_order = ctx->input_order + ctx->b_per_p - i - 1;
-            pic->encode_order  = pic->display_order + 1;
-            start = pic;
+    // If we are at the maximum depth then encode all pictures as
+    // non-referenced B-pictures.  Also do this if there is exactly one
+    // picture left, since there will be nothing to reference it.
+    if (current_depth == ctx->max_b_depth || start->next->next == end) {
+        for (pic = start->next; pic; pic = pic->next) {
+            if (pic == end)
+                break;
+            pic->type    = PICTURE_TYPE_B;
+            pic->b_depth = current_depth;
+
+            vaapi_encode_add_ref(avctx, pic, start, 1, 1, 0);
+            vaapi_encode_add_ref(avctx, pic, end,   1, 1, 0);
+            vaapi_encode_add_ref(avctx, pic, prev,  0, 0, 1);
 
-            ++ctx->gop_counter;
+            for (ref = end->refs[1]; ref; ref = ref->refs[1])
+                vaapi_encode_add_ref(avctx, pic, ref, 0, 1, 0);
         }
-    }
+        *last = prev;
 
-    if (ctx->input_order == 0) {
-        pic->display_order = 0;
-        pic->encode_order  = 0;
+    } else {
+        // Split the current list at the midpoint with a referenced
+        // B-picture, then descend into each side separately.
+        len = 0;
+        for (pic = start->next; pic != end; pic = pic->next)
+            ++len;
+        for (pic = start->next, i = 1; 2 * i < len; pic = pic->next, i++);
 
-        ctx->pic_start = ctx->pic_end = pic;
+        pic->type    = PICTURE_TYPE_B;
+        pic->b_depth = current_depth;
 
-    } else {
-        for (i = 0, pic = start; pic; i++, pic = pic->next) {
-            pic->display_order = ctx->input_order + i;
-            if (end->type == PICTURE_TYPE_IDR)
-                pic->encode_order = ctx->input_order + i;
-            else if (pic == end)
-                pic->encode_order = ctx->input_order;
-            else
-                pic->encode_order = ctx->input_order + i + 1;
-        }
+        pic->is_reference = 1;
 
-        av_assert0(ctx->pic_end);
-        ctx->pic_end->next = start;
-        ctx->pic_end = end;
-    }
-    *pic_out = start;
+        vaapi_encode_add_ref(avctx, pic, pic,   0, 1, 0);
+        vaapi_encode_add_ref(avctx, pic, start, 1, 1, 0);
+        vaapi_encode_add_ref(avctx, pic, end,   1, 1, 0);
+        vaapi_encode_add_ref(avctx, pic, prev,  0, 0, 1);
 
-    av_log(avctx, AV_LOG_DEBUG, "Pictures:");
-    for (pic = ctx->pic_start; pic; pic = pic->next) {
-        av_log(avctx, AV_LOG_DEBUG, " %s (%"PRId64"/%"PRId64")",
-               picture_type_name[pic->type],
-               pic->display_order, pic->encode_order);
-    }
-    av_log(avctx, AV_LOG_DEBUG, "\n");
+        for (ref = end->refs[1]; ref; ref = ref->refs[1])
+            vaapi_encode_add_ref(avctx, pic, ref, 0, 1, 0);
 
-    return 0;
+        if (i > 1)
+            vaapi_encode_set_b_pictures(avctx, start, pic, pic,
+                                        current_depth + 1, &next);
+        else
+            next = pic;
 
-fail:
-    while (start) {
-        pic = start->next;
-        vaapi_encode_free(avctx, start);
-        start = pic;
+        vaapi_encode_set_b_pictures(avctx, pic, end, next,
+                                    current_depth + 1, last);
     }
-    return AVERROR(ENOMEM);
 }
 
-static int vaapi_encode_truncate_gop(AVCodecContext *avctx)
+static int vaapi_encode_pick_next(AVCodecContext *avctx,
+                                  VAAPIEncodePicture **pic_out)
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodePicture *pic, *last_pic, *next;
+    VAAPIEncodePicture *pic = NULL, *next, *start;
+    int i, b_counter, closed_gop_end;
 
-    // Find the last picture we actually have input for.
+    // If there are any B-frames already queued, the next one to encode
+    // is the earliest not-yet-issued frame for which all references are
+    // available.
     for (pic = ctx->pic_start; pic; pic = pic->next) {
-        if (!pic->input_available)
+        if (pic->encode_issued)
+            continue;
+        if (pic->type != PICTURE_TYPE_B)
+            continue;
+        for (i = 0; i < pic->nb_refs; i++) {
+            if (!pic->refs[i]->encode_issued)
+                break;
+        }
+        if (i == pic->nb_refs)
             break;
-        last_pic = pic;
     }
 
     if (pic) {
-        av_assert0(last_pic);
+        av_log(avctx, AV_LOG_DEBUG, "Pick B-picture at depth %d to "
+               "encode next.\n", pic->b_depth);
+        *pic_out = pic;
+        return 0;
+    }
 
-        if (last_pic->type == PICTURE_TYPE_B) {
-            // Some fixing up is required.  Change the type of this
-            // picture to P, then modify preceding B references which
-            // point beyond it to point at it instead.
+    // Find the B-per-Pth available picture to become the next picture
+    // on the top layer.
+    start = NULL;
+    b_counter = 0;
+    closed_gop_end = ctx->closed_gop ||
+                     ctx->idr_counter == ctx->gop_per_idr;
+    for (pic = ctx->pic_start; pic; pic = next) {
+        next = pic->next;
+        if (pic->encode_issued) {
+            start = pic;
+            continue;
+        }
+        // If the next available picture is force-IDR, encode it to start
+        // a new GOP immediately.
+        if (pic->force_idr)
+            break;
+        if (b_counter == ctx->b_per_p)
+            break;
+        // If this picture ends a closed GOP or starts a new GOP then it
+        // needs to be in the top layer.
+        if (ctx->gop_counter + b_counter + closed_gop_end >= ctx->gop_size)
+            break;
+        // If the picture after this one is force-IDR, we need to encode
+        // this one in the top layer.
+        if (next && next->force_idr)
+            break;
+        ++b_counter;
+    }
 
-            last_pic->type = PICTURE_TYPE_P;
-            last_pic->encode_order = last_pic->refs[1]->encode_order;
+    // At the end of the stream the last picture must be in the top layer.
+    if (!pic && ctx->end_of_stream) {
+        --b_counter;
+        pic = ctx->pic_end;
+        if (pic->encode_issued)
+            return AVERROR_EOF;
+    }
 
-            for (pic = ctx->pic_start; pic != last_pic; pic = pic->next) {
-                if (pic->type == PICTURE_TYPE_B &&
-                    pic->refs[1] == last_pic->refs[1])
-                    pic->refs[1] = last_pic;
-            }
+    if (!pic) {
+        av_log(avctx, AV_LOG_DEBUG, "Pick nothing to encode next - "
+               "need more input for reference pictures.\n");
+        return AVERROR(EAGAIN);
+    }
+    if (ctx->input_order <= ctx->decode_delay && !ctx->end_of_stream) {
+        av_log(avctx, AV_LOG_DEBUG, "Pick nothing to encode next - "
+               "need more input for timestamps.\n");
+        return AVERROR(EAGAIN);
+    }
 
-            last_pic->nb_refs = 1;
-            last_pic->refs[1] = NULL;
-        } else {
-            // We can use the current structure (no references point
-            // beyond the end), but there are unused pics to discard.
-        }
+    if (pic->force_idr) {
+        av_log(avctx, AV_LOG_DEBUG, "Pick forced IDR-picture to "
+               "encode next.\n");
+        pic->type = PICTURE_TYPE_IDR;
+        ctx->idr_counter = 1;
+        ctx->gop_counter = 1;
 
-        // Discard all following pics, they will never be used.
-        for (pic = last_pic->next; pic; pic = next) {
-            next = pic->next;
-            vaapi_encode_free(avctx, pic);
+    } else if (ctx->gop_counter + b_counter >= ctx->gop_size) {
+        if (ctx->idr_counter == ctx->gop_per_idr) {
+            av_log(avctx, AV_LOG_DEBUG, "Pick new-GOP IDR-picture to "
+                   "encode next.\n");
+            pic->type = PICTURE_TYPE_IDR;
+            ctx->idr_counter = 1;
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "Pick new-GOP I-picture to "
+                   "encode next.\n");
+            pic->type = PICTURE_TYPE_I;
+            ++ctx->idr_counter;
         }
-
-        last_pic->next = NULL;
-        ctx->pic_end = last_pic;
+        ctx->gop_counter = 1;
 
     } else {
-        // Input is available for all pictures, so we don't need to
-        // mangle anything.
+        if (ctx->gop_counter + b_counter + closed_gop_end == ctx->gop_size) {
+            av_log(avctx, AV_LOG_DEBUG, "Pick group-end P-picture to "
+                   "encode next.\n");
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "Pick normal P-picture to "
+                   "encode next.\n");
+        }
+        pic->type = PICTURE_TYPE_P;
+        av_assert0(start);
+        ctx->gop_counter += 1 + b_counter;
     }
+    pic->is_reference = 1;
+    *pic_out = pic;
 
-    av_log(avctx, AV_LOG_DEBUG, "Pictures ending truncated GOP:");
-    for (pic = ctx->pic_start; pic; pic = pic->next) {
-        av_log(avctx, AV_LOG_DEBUG, " %s (%"PRId64"/%"PRId64")",
-               picture_type_name[pic->type],
-               pic->display_order, pic->encode_order);
+    vaapi_encode_add_ref(avctx, pic, pic, 0, 1, 0);
+    if (pic->type != PICTURE_TYPE_IDR) {
+        vaapi_encode_add_ref(avctx, pic, start,
+                             pic->type == PICTURE_TYPE_P,
+                             b_counter > 0, 0);
+        vaapi_encode_add_ref(avctx, pic, ctx->next_prev, 0, 0, 1);
     }
-    av_log(avctx, AV_LOG_DEBUG, "\n");
+    if (ctx->next_prev)
+        --ctx->next_prev->ref_count[0];
 
+    if (b_counter > 0) {
+        vaapi_encode_set_b_pictures(avctx, start, pic, pic, 1,
+                                    &ctx->next_prev);
+    } else {
+        ctx->next_prev = pic;
+    }
+    ++ctx->next_prev->ref_count[0];
     return 0;
 }
 
 static int vaapi_encode_clear_old(AVCodecContext *avctx)
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodePicture *pic, *old;
-    int i;
+    VAAPIEncodePicture *pic, *prev, *next;
 
-    while (ctx->pic_start != ctx->pic_end) {
-        old = ctx->pic_start;
-        if (old->encode_order > ctx->output_order)
-            break;
+    av_assert0(ctx->pic_start);
 
-        for (pic = old->next; pic; pic = pic->next) {
-            if (pic->encode_complete)
-                continue;
-            for (i = 0; i < pic->nb_refs; i++) {
-                if (pic->refs[i] == old) {
-                    // We still need this picture because it's referred to
-                    // directly by a later one, so it and all following
-                    // pictures have to stay.
-                    return 0;
-                }
-            }
-        }
+    // Remove direct references once each picture is complete.
+    for (pic = ctx->pic_start; pic; pic = pic->next) {
+        if (pic->encode_complete && pic->next)
+            vaapi_encode_remove_refs(avctx, pic, 0);
+    }
 
-        pic = ctx->pic_start;
-        ctx->pic_start = pic->next;
-        vaapi_encode_free(avctx, pic);
+    // Remove indirect references once a picture has no direct references.
+    for (pic = ctx->pic_start; pic; pic = pic->next) {
+        if (pic->encode_complete && pic->ref_count[0] == 0)
+            vaapi_encode_remove_refs(avctx, pic, 1);
+    }
+
+    // Clear out all complete pictures with no remaining references.
+    prev = NULL;
+    for (pic = ctx->pic_start; pic; pic = next) {
+        next = pic->next;
+        if (pic->encode_complete && pic->ref_count[1] == 0) {
+            av_assert0(pic->ref_removed[0] && pic->ref_removed[1]);
+            if (prev)
+                prev->next = next;
+            else
+                ctx->pic_start = next;
+            vaapi_encode_free(avctx, pic);
+        } else {
+            prev = pic;
+        }
     }
 
     return 0;
 }
 
-int ff_vaapi_encode2(AVCodecContext *avctx, AVPacket *pkt,
-                     const AVFrame *input_image, int *got_packet)
+int ff_vaapi_encode_send_frame(AVCodecContext *avctx, const AVFrame *frame)
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
     VAAPIEncodePicture *pic;
     int err;
 
-    if (input_image) {
-        av_log(avctx, AV_LOG_DEBUG, "Encode frame: %ux%u (%"PRId64").\n",
-               input_image->width, input_image->height, input_image->pts);
+    if (frame) {
+        av_log(avctx, AV_LOG_DEBUG, "Input frame: %ux%u (%"PRId64").\n",
+               frame->width, frame->height, frame->pts);
 
-        if (input_image->pict_type == AV_PICTURE_TYPE_I) {
-            err = vaapi_encode_truncate_gop(avctx);
-            if (err < 0)
-                goto fail;
-            ctx->force_idr = 1;
-        }
-
-        err = vaapi_encode_get_next(avctx, &pic);
-        if (err) {
-            av_log(avctx, AV_LOG_ERROR, "Input setup failed: %d.\n", err);
-            return err;
-        }
+        pic = vaapi_encode_alloc(avctx);
+        if (!pic)
+            return AVERROR(ENOMEM);
 
         pic->input_image = av_frame_alloc();
         if (!pic->input_image) {
             err = AVERROR(ENOMEM);
             goto fail;
         }
-        err = av_frame_ref(pic->input_image, input_image);
+        err = av_frame_ref(pic->input_image, frame);
         if (err < 0)
             goto fail;
-        pic->input_surface = (VASurfaceID)(uintptr_t)input_image->data[3];
-        pic->pts = input_image->pts;
+
+        if (ctx->input_order == 0)
+            pic->force_idr = 1;
+
+        pic->input_surface = (VASurfaceID)(uintptr_t)frame->data[3];
+        pic->pts = frame->pts;
 
         if (ctx->input_order == 0)
             ctx->first_pts = pic->pts;
@@ -892,302 +949,665 @@ int ff_vaapi_encode2(AVCodecContext *avctx, AVPacket *pkt,
         if (ctx->output_delay > 0)
             ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
 
-        pic->input_available = 1;
+        pic->display_order = ctx->input_order;
+        ++ctx->input_order;
 
-    } else {
-        if (!ctx->end_of_stream) {
-            err = vaapi_encode_truncate_gop(avctx);
-            if (err < 0)
-                goto fail;
-            ctx->end_of_stream = 1;
+        if (ctx->pic_start) {
+            ctx->pic_end->next = pic;
+            ctx->pic_end       = pic;
+        } else {
+            ctx->pic_start     = pic;
+            ctx->pic_end       = pic;
         }
+
+    } else {
+        ctx->end_of_stream = 1;
+
+        // Fix timestamps if we hit end-of-stream before the initial decode
+        // delay has elapsed.
+        if (ctx->input_order < ctx->decode_delay)
+            ctx->dts_pts_diff = ctx->pic_end->pts - ctx->first_pts;
     }
 
-    ++ctx->input_order;
-    ++ctx->output_order;
-    av_assert0(ctx->output_order + ctx->output_delay + 1 == ctx->input_order);
+    return 0;
 
-    for (pic = ctx->pic_start; pic; pic = pic->next)
-        if (pic->encode_order == ctx->output_order)
-            break;
+fail:
+    return err;
+}
 
-    // pic can be null here if we don't have a specific target in this
-    // iteration.  We might still issue encodes if things can be overlapped,
-    // even though we don't intend to output anything.
+int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
+{
+    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAAPIEncodePicture *pic;
+    int err;
 
-    err = vaapi_encode_step(avctx, pic);
-    if (err < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
-        goto fail;
+    if (!ctx->pic_start) {
+        if (ctx->end_of_stream)
+            return AVERROR_EOF;
+        else
+            return AVERROR(EAGAIN);
     }
 
-    if (!pic) {
-        *got_packet = 0;
-    } else {
-        err = vaapi_encode_output(avctx, pic, pkt);
-        if (err < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err);
-            goto fail;
-        }
+    pic = NULL;
+    err = vaapi_encode_pick_next(avctx, &pic);
+    if (err < 0)
+        return err;
+    av_assert0(pic);
 
-        if (ctx->output_delay == 0) {
-            pkt->dts = pkt->pts;
-        } else if (ctx->output_order < ctx->decode_delay) {
-            if (ctx->ts_ring[ctx->output_order] < INT64_MIN + ctx->dts_pts_diff)
-                pkt->dts = INT64_MIN;
-            else
-                pkt->dts = ctx->ts_ring[ctx->output_order] - ctx->dts_pts_diff;
-        } else {
-            pkt->dts = ctx->ts_ring[(ctx->output_order - ctx->decode_delay) %
-                                    (3 * ctx->output_delay)];
-        }
+    pic->encode_order = ctx->encode_order++;
 
-        *got_packet = 1;
+    err = vaapi_encode_issue(avctx, pic);
+    if (err < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
+        return err;
     }
 
-    err = vaapi_encode_clear_old(avctx);
+    err = vaapi_encode_output(avctx, pic, pkt);
     if (err < 0) {
-        av_log(avctx, AV_LOG_ERROR, "List clearing failed: %d.\n", err);
-        goto fail;
+        av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err);
+        return err;
     }
 
+    if (ctx->output_delay == 0) {
+        pkt->dts = pkt->pts;
+    } else if (pic->encode_order < ctx->decode_delay) {
+        if (ctx->ts_ring[pic->encode_order] < INT64_MIN + ctx->dts_pts_diff)
+            pkt->dts = INT64_MIN;
+        else
+            pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
+    } else {
+        pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
+                                (3 * ctx->output_delay)];
+    }
+    av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n",
+           pkt->pts, pkt->dts);
+
+    ctx->output_order = pic->encode_order;
+    vaapi_encode_clear_old(avctx);
+
     return 0;
+}
 
-fail:
-    // Unclear what to clean up on failure.  There are probably some things we
-    // could do usefully clean up here, but for now just leave them for uninit()
-    // to do instead.
-    return err;
+int ff_vaapi_encode2(AVCodecContext *avctx, AVPacket *pkt,
+                     const AVFrame *input_image, int *got_packet)
+{
+    return AVERROR(ENOSYS);
 }
 
-static av_cold int vaapi_encode_config_attributes(AVCodecContext *avctx)
+static av_cold void vaapi_encode_add_global_param(AVCodecContext *avctx,
+                                                  VAEncMiscParameterBuffer *buffer,
+                                                  size_t size)
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
+
+    av_assert0(ctx->nb_global_params < MAX_GLOBAL_PARAMS);
+
+    ctx->global_params     [ctx->nb_global_params] = buffer;
+    ctx->global_params_size[ctx->nb_global_params] = size;
+
+    ++ctx->nb_global_params;
+}
+
+typedef struct VAAPIEncodeRTFormat {
+    const char *name;
+    unsigned int value;
+    int depth;
+    int nb_components;
+    int log2_chroma_w;
+    int log2_chroma_h;
+} VAAPIEncodeRTFormat;
+
+static const VAAPIEncodeRTFormat vaapi_encode_rt_formats[] = {
+    { "YUV400",    VA_RT_FORMAT_YUV400,        8, 1,      },
+    { "YUV420",    VA_RT_FORMAT_YUV420,        8, 3, 1, 1 },
+    { "YUV422",    VA_RT_FORMAT_YUV422,        8, 3, 1, 0 },
+    { "YUV444",    VA_RT_FORMAT_YUV444,        8, 3, 0, 0 },
+    { "YUV411",    VA_RT_FORMAT_YUV411,        8, 3, 2, 0 },
+#if VA_CHECK_VERSION(0, 38, 1)
+    { "YUV420_10", VA_RT_FORMAT_YUV420_10BPP, 10, 3, 1, 1 },
+#endif
+};
+
+static const VAEntrypoint vaapi_encode_entrypoints_normal[] = {
+    VAEntrypointEncSlice,
+    VAEntrypointEncPicture,
+#if VA_CHECK_VERSION(0, 39, 2)
+    VAEntrypointEncSliceLP,
+#endif
+    0
+};
+#if VA_CHECK_VERSION(0, 39, 2)
+static const VAEntrypoint vaapi_encode_entrypoints_low_power[] = {
+    VAEntrypointEncSliceLP,
+    0
+};
+#endif
+
+static av_cold int vaapi_encode_profile_entrypoint(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext      *ctx = avctx->priv_data;
+    VAProfile    *va_profiles    = NULL;
+    VAEntrypoint *va_entrypoints = NULL;
     VAStatus vas;
-    int i, n, err;
-    VAProfile    *profiles    = NULL;
-    VAEntrypoint *entrypoints = NULL;
-    VAConfigAttrib attr[] = {
-        { VAConfigAttribRTFormat         },
-        { VAConfigAttribRateControl      },
-        { VAConfigAttribEncMaxRefFrames  },
-        { VAConfigAttribEncPackedHeaders },
-    };
+    const VAEntrypoint *usable_entrypoints;
+    const VAAPIEncodeProfile *profile;
+    const AVPixFmtDescriptor *desc;
+    VAConfigAttrib rt_format_attr;
+    const VAAPIEncodeRTFormat *rt_format;
+    const char *profile_string, *entrypoint_string;
+    int i, j, n, depth, err;
+
+
+    if (ctx->low_power) {
+#if VA_CHECK_VERSION(0, 39, 2)
+        usable_entrypoints = vaapi_encode_entrypoints_low_power;
+#else
+        av_log(avctx, AV_LOG_ERROR, "Low-power encoding is not "
+               "supported with this VAAPI version.\n");
+        return AVERROR(EINVAL);
+#endif
+    } else {
+        usable_entrypoints = vaapi_encode_entrypoints_normal;
+    }
+
+    desc = av_pix_fmt_desc_get(ctx->input_frames->sw_format);
+    if (!desc) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid input pixfmt (%d).\n",
+               ctx->input_frames->sw_format);
+        return AVERROR(EINVAL);
+    }
+    depth = desc->comp[0].depth;
+    for (i = 1; i < desc->nb_components; i++) {
+        if (desc->comp[i].depth != depth) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid input pixfmt (%s).\n",
+                   desc->name);
+            return AVERROR(EINVAL);
+        }
+    }
+    av_log(avctx, AV_LOG_VERBOSE, "Input surface format is %s.\n",
+           desc->name);
 
     n = vaMaxNumProfiles(ctx->hwctx->display);
-    profiles = av_malloc_array(n, sizeof(VAProfile));
-    if (!profiles) {
+    va_profiles = av_malloc_array(n, sizeof(VAProfile));
+    if (!va_profiles) {
         err = AVERROR(ENOMEM);
         goto fail;
     }
-    vas = vaQueryConfigProfiles(ctx->hwctx->display, profiles, &n);
+    vas = vaQueryConfigProfiles(ctx->hwctx->display, va_profiles, &n);
     if (vas != VA_STATUS_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to query profiles: %d (%s).\n",
+        av_log(avctx, AV_LOG_ERROR, "Failed to query profiles: %d (%s).\n",
                vas, vaErrorStr(vas));
-        err = AVERROR(ENOSYS);
+        err = AVERROR_EXTERNAL;
         goto fail;
     }
-    for (i = 0; i < n; i++) {
-        if (profiles[i] == ctx->va_profile)
-            break;
+
+    av_assert0(ctx->codec->profiles);
+    for (i = 0; (ctx->codec->profiles[i].av_profile !=
+                 FF_PROFILE_UNKNOWN); i++) {
+        profile = &ctx->codec->profiles[i];
+        if (depth               != profile->depth ||
+            desc->nb_components != profile->nb_components)
+            continue;
+        if (desc->nb_components > 1 &&
+            (desc->log2_chroma_w != profile->log2_chroma_w ||
+             desc->log2_chroma_h != profile->log2_chroma_h))
+            continue;
+        if (avctx->profile != profile->av_profile &&
+            avctx->profile != FF_PROFILE_UNKNOWN)
+            continue;
+
+#if VA_CHECK_VERSION(1, 0, 0)
+        profile_string = vaProfileStr(profile->va_profile);
+#else
+        profile_string = "(no profile names)";
+#endif
+
+        for (j = 0; j < n; j++) {
+            if (va_profiles[j] == profile->va_profile)
+                break;
+        }
+        if (j >= n) {
+            av_log(avctx, AV_LOG_VERBOSE, "Compatible profile %s (%d) "
+                   "is not supported by driver.\n", profile_string,
+                   profile->va_profile);
+            continue;
+        }
+
+        ctx->profile = profile;
+        break;
     }
-    if (i >= n) {
-        av_log(ctx, AV_LOG_ERROR, "Encoding profile not found (%d).\n",
-               ctx->va_profile);
+    if (!ctx->profile) {
+        av_log(avctx, AV_LOG_ERROR, "No usable encoding profile found.\n");
         err = AVERROR(ENOSYS);
         goto fail;
     }
 
+    avctx->profile  = profile->av_profile;
+    ctx->va_profile = profile->va_profile;
+    av_log(avctx, AV_LOG_VERBOSE, "Using VAAPI profile %s (%d).\n",
+           profile_string, ctx->va_profile);
+
     n = vaMaxNumEntrypoints(ctx->hwctx->display);
-    entrypoints = av_malloc_array(n, sizeof(VAEntrypoint));
-    if (!entrypoints) {
+    va_entrypoints = av_malloc_array(n, sizeof(VAEntrypoint));
+    if (!va_entrypoints) {
         err = AVERROR(ENOMEM);
         goto fail;
     }
     vas = vaQueryConfigEntrypoints(ctx->hwctx->display, ctx->va_profile,
-                                   entrypoints, &n);
+                                   va_entrypoints, &n);
     if (vas != VA_STATUS_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to query entrypoints for "
-               "profile %u: %d (%s).\n", ctx->va_profile,
-               vas, vaErrorStr(vas));
-        err = AVERROR(ENOSYS);
+        av_log(avctx, AV_LOG_ERROR, "Failed to query entrypoints for "
+               "profile %s (%d): %d (%s).\n", profile_string,
+               ctx->va_profile, vas, vaErrorStr(vas));
+        err = AVERROR_EXTERNAL;
         goto fail;
     }
+
     for (i = 0; i < n; i++) {
-        if (entrypoints[i] == ctx->va_entrypoint)
+        for (j = 0; usable_entrypoints[j]; j++) {
+            if (va_entrypoints[i] == usable_entrypoints[j])
+                break;
+        }
+        if (usable_entrypoints[j])
             break;
     }
     if (i >= n) {
-        av_log(ctx, AV_LOG_ERROR, "Encoding entrypoint not found "
-               "(%d / %d).\n", ctx->va_profile, ctx->va_entrypoint);
+        av_log(avctx, AV_LOG_ERROR, "No usable encoding entrypoint found "
+               "for profile %s (%d).\n", profile_string, ctx->va_profile);
         err = AVERROR(ENOSYS);
         goto fail;
     }
 
+    ctx->va_entrypoint = va_entrypoints[i];
+#if VA_CHECK_VERSION(1, 0, 0)
+    entrypoint_string = vaEntrypointStr(ctx->va_entrypoint);
+#else
+    entrypoint_string = "(no entrypoint names)";
+#endif
+    av_log(avctx, AV_LOG_VERBOSE, "Using VAAPI entrypoint %s (%d).\n",
+           entrypoint_string, ctx->va_entrypoint);
+
+    for (i = 0; i < FF_ARRAY_ELEMS(vaapi_encode_rt_formats); i++) {
+        rt_format = &vaapi_encode_rt_formats[i];
+        if (rt_format->depth         == depth &&
+            rt_format->nb_components == profile->nb_components &&
+            rt_format->log2_chroma_w == profile->log2_chroma_w &&
+            rt_format->log2_chroma_h == profile->log2_chroma_h)
+            break;
+    }
+    if (i >= FF_ARRAY_ELEMS(vaapi_encode_rt_formats)) {
+        av_log(avctx, AV_LOG_ERROR, "No usable render target format "
+               "found for profile %s (%d) entrypoint %s (%d).\n",
+               profile_string, ctx->va_profile,
+               entrypoint_string, ctx->va_entrypoint);
+        err = AVERROR(ENOSYS);
+        goto fail;
+    }
+
+    rt_format_attr = (VAConfigAttrib) { VAConfigAttribRTFormat };
     vas = vaGetConfigAttributes(ctx->hwctx->display,
                                 ctx->va_profile, ctx->va_entrypoint,
-                                attr, FF_ARRAY_ELEMS(attr));
+                                &rt_format_attr, 1);
     if (vas != VA_STATUS_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Failed to fetch config "
-               "attributes: %d (%s).\n", vas, vaErrorStr(vas));
-        return AVERROR(EINVAL);
+        av_log(avctx, AV_LOG_ERROR, "Failed to query RT format "
+               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        err = AVERROR_EXTERNAL;
+        goto fail;
     }
 
-    for (i = 0; i < FF_ARRAY_ELEMS(attr); i++) {
-        if (attr[i].value == VA_ATTRIB_NOT_SUPPORTED) {
-            // Unfortunately we have to treat this as "don't know" and hope
-            // for the best, because the Intel MJPEG encoder returns this
-            // for all the interesting attributes.
-            continue;
-        }
-        switch (attr[i].type) {
-        case VAConfigAttribRTFormat:
-            if (!(ctx->va_rt_format & attr[i].value)) {
-                av_log(avctx, AV_LOG_ERROR, "Surface RT format %#x "
-                       "is not supported (mask %#x).\n",
-                       ctx->va_rt_format, attr[i].value);
-                err = AVERROR(EINVAL);
-                goto fail;
-            }
-            ctx->config_attributes[ctx->nb_config_attributes++] =
-                (VAConfigAttrib) {
-                .type  = VAConfigAttribRTFormat,
-                .value = ctx->va_rt_format,
-            };
-            break;
-        case VAConfigAttribRateControl:
-            // Hack for backward compatibility: CBR was the only
-            // usable RC mode for a long time, so old drivers will
-            // only have it.  Normal default options may now choose
-            // VBR and then fail, however, so override it here with
-            // CBR if that is the only supported mode.
-            if (ctx->va_rc_mode == VA_RC_VBR &&
-                !(attr[i].value & VA_RC_VBR) &&
-                (attr[i].value & VA_RC_CBR)) {
-                av_log(avctx, AV_LOG_WARNING, "VBR rate control is "
-                       "not supported with this driver version; "
-                       "using CBR instead.\n");
-                ctx->va_rc_mode = VA_RC_CBR;
-            }
-            if (!(ctx->va_rc_mode & attr[i].value)) {
-                av_log(avctx, AV_LOG_ERROR, "Rate control mode %#x "
-                       "is not supported (mask: %#x).\n",
-                       ctx->va_rc_mode, attr[i].value);
-                err = AVERROR(EINVAL);
-                goto fail;
-            }
-            ctx->config_attributes[ctx->nb_config_attributes++] =
-                (VAConfigAttrib) {
-                .type  = VAConfigAttribRateControl,
-                .value = ctx->va_rc_mode,
-            };
-            break;
-        case VAConfigAttribEncMaxRefFrames:
-        {
-            unsigned int ref_l0 = attr[i].value & 0xffff;
-            unsigned int ref_l1 = (attr[i].value >> 16) & 0xffff;
-
-            if (avctx->gop_size > 1 && ref_l0 < 1) {
-                av_log(avctx, AV_LOG_ERROR, "P frames are not "
-                       "supported (%#x).\n", attr[i].value);
-                err = AVERROR(EINVAL);
-                goto fail;
-            }
-            if (avctx->max_b_frames > 0 && ref_l1 < 1) {
-                av_log(avctx, AV_LOG_ERROR, "B frames are not "
-                       "supported (%#x).\n", attr[i].value);
-                err = AVERROR(EINVAL);
-                goto fail;
-            }
-        }
-        break;
-        case VAConfigAttribEncPackedHeaders:
-            if (ctx->va_packed_headers & ~attr[i].value) {
-                // This isn't fatal, but packed headers are always
-                // preferable because they are under our control.
-                // When absent, the driver is generating them and some
-                // features may not work (e.g. VUI or SEI in H.264).
-                av_log(avctx, AV_LOG_WARNING, "Warning: some packed "
-                       "headers are not supported (want %#x, got %#x).\n",
-                       ctx->va_packed_headers, attr[i].value);
-                ctx->va_packed_headers &= attr[i].value;
-            }
-            ctx->config_attributes[ctx->nb_config_attributes++] =
-                (VAConfigAttrib) {
-                .type  = VAConfigAttribEncPackedHeaders,
-                .value = ctx->va_packed_headers,
-            };
-            break;
-        default:
-            av_assert0(0 && "Unexpected config attribute.");
-        }
+    if (rt_format_attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        av_log(avctx, AV_LOG_VERBOSE, "RT format config attribute not "
+               "supported by driver: assuming surface RT format %s "
+               "is valid.\n", rt_format->name);
+    } else if (!(rt_format_attr.value & rt_format->value)) {
+        av_log(avctx, AV_LOG_ERROR, "Surface RT format %s not supported "
+               "by driver for encoding profile %s (%d) entrypoint %s (%d).\n",
+               rt_format->name, profile_string, ctx->va_profile,
+               entrypoint_string, ctx->va_entrypoint);
+        err = AVERROR(ENOSYS);
+        goto fail;
+    } else {
+        av_log(avctx, AV_LOG_VERBOSE, "Using VAAPI render target "
+               "format %s (%#x).\n", rt_format->name, rt_format->value);
+        ctx->config_attributes[ctx->nb_config_attributes++] =
+            (VAConfigAttrib) {
+            .type  = VAConfigAttribRTFormat,
+            .value = rt_format->value,
+        };
     }
 
     err = 0;
 fail:
-    av_freep(&profiles);
-    av_freep(&entrypoints);
+    av_freep(&va_profiles);
+    av_freep(&va_entrypoints);
     return err;
 }
 
+static const VAAPIEncodeRCMode vaapi_encode_rc_modes[] = {
+    //                                  Bitrate   Quality
+    //                                     | Maxrate | HRD/VBV
+    { 0 }, //                              |    |    |    |
+    { RC_MODE_CQP,  "CQP",  1, VA_RC_CQP,  0,   0,   1,   0 },
+    { RC_MODE_CBR,  "CBR",  1, VA_RC_CBR,  1,   0,   0,   1 },
+    { RC_MODE_VBR,  "VBR",  1, VA_RC_VBR,  1,   1,   0,   1 },
+#if VA_CHECK_VERSION(1, 1, 0)
+    { RC_MODE_ICQ,  "ICQ",  1, VA_RC_ICQ,  0,   0,   1,   0 },
+#else
+    { RC_MODE_ICQ,  "ICQ",  0 },
+#endif
+#if VA_CHECK_VERSION(1, 3, 0)
+    { RC_MODE_QVBR, "QVBR", 1, VA_RC_QVBR, 1,   1,   1,   1 },
+    { RC_MODE_AVBR, "AVBR", 0, VA_RC_AVBR, 1,   0,   0,   0 },
+#else
+    { RC_MODE_QVBR, "QVBR", 0 },
+    { RC_MODE_AVBR, "AVBR", 0 },
+#endif
+};
+
 static av_cold int vaapi_encode_init_rate_control(AVCodecContext *avctx)
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
-    int rc_bits_per_second;
-    int rc_target_percentage;
-    int rc_window_size;
-    int hrd_buffer_size;
-    int hrd_initial_buffer_fullness;
+    uint32_t supported_va_rc_modes;
+    const VAAPIEncodeRCMode *rc_mode;
+    int64_t rc_bits_per_second;
+    int     rc_target_percentage;
+    int     rc_window_size;
+    int     rc_quality;
+    int64_t hrd_buffer_size;
+    int64_t hrd_initial_buffer_fullness;
     int fr_num, fr_den;
+    VAConfigAttrib rc_attr = { VAConfigAttribRateControl };
+    VAStatus vas;
+    char supported_rc_modes_string[64];
 
-    if (avctx->rc_buffer_size)
-        hrd_buffer_size = avctx->rc_buffer_size;
-    else
-        hrd_buffer_size = avctx->bit_rate;
-    if (avctx->rc_initial_buffer_occupancy)
-        hrd_initial_buffer_fullness = avctx->rc_initial_buffer_occupancy;
-    else
-        hrd_initial_buffer_fullness = hrd_buffer_size * 3 / 4;
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile, ctx->va_entrypoint,
+                                &rc_attr, 1);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query rate control "
+               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    }
+    if (rc_attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        av_log(avctx, AV_LOG_VERBOSE, "Driver does not report any "
+               "supported rate control modes: assuming CQP only.\n");
+        supported_va_rc_modes = VA_RC_CQP;
+        strcpy(supported_rc_modes_string, "unknown");
+    } else {
+        char *str = supported_rc_modes_string;
+        size_t len = sizeof(supported_rc_modes_string);
+        int i, first = 1, res;
+
+        supported_va_rc_modes = rc_attr.value;
+        for (i = 0; i < FF_ARRAY_ELEMS(vaapi_encode_rc_modes); i++) {
+            rc_mode = &vaapi_encode_rc_modes[i];
+            if (supported_va_rc_modes & rc_mode->va_mode) {
+                res = snprintf(str, len, "%s%s",
+                               first ? "" : ", ", rc_mode->name);
+                first = 0;
+                if (res < 0) {
+                    *str = 0;
+                    break;
+                }
+                len -= res;
+                str += res;
+                if (len == 0)
+                    break;
+            }
+        }
 
-    if (ctx->va_rc_mode == VA_RC_CBR) {
-        rc_bits_per_second   = avctx->bit_rate;
-        rc_target_percentage = 100;
-        rc_window_size       = 1000;
+        av_log(avctx, AV_LOG_DEBUG, "Driver supports RC modes %s.\n",
+               supported_rc_modes_string);
+    }
+
+    // Rate control mode selection:
+    // * If the user has set a mode explicitly with the rc_mode option,
+    //   use it and fail if it is not available.
+    // * If an explicit QP option has been set, use CQP.
+    // * If the codec is CQ-only, use CQP.
+    // * If the QSCALE avcodec option is set, use CQP.
+    // * If bitrate and quality are both set, try QVBR.
+    // * If quality is set, try ICQ, then CQP.
+    // * If bitrate and maxrate are set and have the same value, try CBR.
+    // * If a bitrate is set, try AVBR, then VBR, then CBR.
+    // * If no bitrate is set, try ICQ, then CQP.
+
+#define TRY_RC_MODE(mode, fail) do { \
+        rc_mode = &vaapi_encode_rc_modes[mode]; \
+        if (!(rc_mode->va_mode & supported_va_rc_modes)) { \
+            if (fail) { \
+                av_log(avctx, AV_LOG_ERROR, "Driver does not support %s " \
+                       "RC mode (supported modes: %s).\n", rc_mode->name, \
+                       supported_rc_modes_string); \
+                return AVERROR(EINVAL); \
+            } \
+            av_log(avctx, AV_LOG_DEBUG, "Driver does not support %s " \
+                   "RC mode.\n", rc_mode->name); \
+            rc_mode = NULL; \
+        } else { \
+            goto rc_mode_found; \
+        } \
+    } while (0)
+
+    if (ctx->explicit_rc_mode)
+        TRY_RC_MODE(ctx->explicit_rc_mode, 1);
+
+    if (ctx->explicit_qp)
+        TRY_RC_MODE(RC_MODE_CQP, 1);
+
+    if (ctx->codec->flags & FLAG_CONSTANT_QUALITY_ONLY)
+        TRY_RC_MODE(RC_MODE_CQP, 1);
+
+    if (avctx->flags & AV_CODEC_FLAG_QSCALE)
+        TRY_RC_MODE(RC_MODE_CQP, 1);
+
+    if (avctx->bit_rate > 0 && avctx->global_quality > 0)
+        TRY_RC_MODE(RC_MODE_QVBR, 0);
+
+    if (avctx->global_quality > 0) {
+        TRY_RC_MODE(RC_MODE_ICQ, 0);
+        TRY_RC_MODE(RC_MODE_CQP, 0);
+    }
+
+    if (avctx->bit_rate > 0 && avctx->rc_max_rate == avctx->bit_rate)
+        TRY_RC_MODE(RC_MODE_CBR, 0);
+
+    if (avctx->bit_rate > 0) {
+        TRY_RC_MODE(RC_MODE_AVBR, 0);
+        TRY_RC_MODE(RC_MODE_VBR, 0);
+        TRY_RC_MODE(RC_MODE_CBR, 0);
     } else {
-        if (avctx->rc_max_rate < avctx->bit_rate) {
-            // Max rate is unset or invalid, just use the normal bitrate.
+        TRY_RC_MODE(RC_MODE_ICQ, 0);
+        TRY_RC_MODE(RC_MODE_CQP, 0);
+    }
+
+    av_log(avctx, AV_LOG_ERROR, "Driver does not support any "
+           "RC mode compatible with selected options "
+           "(supported modes: %s).\n", supported_rc_modes_string);
+    return AVERROR(EINVAL);
+
+rc_mode_found:
+    if (rc_mode->bitrate) {
+        if (avctx->bit_rate <= 0) {
+            av_log(avctx, AV_LOG_ERROR, "Bitrate must be set for %s "
+                   "RC mode.\n", rc_mode->name);
+            return AVERROR(EINVAL);
+        }
+
+        if (rc_mode->mode == RC_MODE_AVBR) {
+            // For maximum confusion AVBR is hacked into the existing API
+            // by overloading some of the fields with completely different
+            // meanings.
+
+            // Target percentage does not apply in AVBR mode.
+            rc_bits_per_second = avctx->bit_rate;
+
+            // Accuracy tolerance range for meeting the specified target
+            // bitrate.  It's very unclear how this is actually intended
+            // to work - since we do want to get the specified bitrate,
+            // set the accuracy to 100% for now.
+            rc_target_percentage = 100;
+
+            // Convergence period in frames.  The GOP size reflects the
+            // user's intended block size for cutting, so reusing that
+            // as the convergence period seems a reasonable default.
+            rc_window_size = avctx->gop_size > 0 ? avctx->gop_size : 60;
+
+        } else if (rc_mode->maxrate) {
+            if (avctx->rc_max_rate > 0) {
+                if (avctx->rc_max_rate < avctx->bit_rate) {
+                    av_log(avctx, AV_LOG_ERROR, "Invalid bitrate settings: "
+                           "bitrate (%"PRId64") must not be greater than "
+                           "maxrate (%"PRId64").\n", avctx->bit_rate,
+                           avctx->rc_max_rate);
+                    return AVERROR(EINVAL);
+                }
+                rc_bits_per_second   = avctx->rc_max_rate;
+                rc_target_percentage = (avctx->bit_rate * 100) /
+                                       avctx->rc_max_rate;
+            } else {
+                // We only have a target bitrate, but this mode requires
+                // that a maximum rate be supplied as well.  Since the
+                // user does not want this to be a constraint, arbitrarily
+                // pick a maximum rate of double the target rate.
+                rc_bits_per_second   = 2 * avctx->bit_rate;
+                rc_target_percentage = 50;
+            }
+        } else {
+            if (avctx->rc_max_rate > avctx->bit_rate) {
+                av_log(avctx, AV_LOG_WARNING, "Max bitrate is ignored "
+                       "in %s RC mode.\n", rc_mode->name);
+            }
             rc_bits_per_second   = avctx->bit_rate;
             rc_target_percentage = 100;
+        }
+    } else {
+        rc_bits_per_second   = 0;
+        rc_target_percentage = 100;
+    }
+
+    if (rc_mode->quality) {
+        if (ctx->explicit_qp) {
+            rc_quality = ctx->explicit_qp;
+        } else if (avctx->global_quality > 0) {
+            rc_quality = avctx->global_quality;
         } else {
-            rc_bits_per_second   = avctx->rc_max_rate;
-            rc_target_percentage = (avctx->bit_rate * 100) / rc_bits_per_second;
+            rc_quality = ctx->codec->default_quality;
+            av_log(avctx, AV_LOG_WARNING, "No quality level set; "
+                   "using default (%d).\n", rc_quality);
         }
-        rc_window_size = (hrd_buffer_size * 1000) / avctx->bit_rate;
+    } else {
+        rc_quality = 0;
     }
 
-    ctx->rc_params.misc.type = VAEncMiscParameterTypeRateControl;
-    ctx->rc_params.rc = (VAEncMiscParameterRateControl) {
-        .bits_per_second   = rc_bits_per_second,
-        .target_percentage = rc_target_percentage,
-        .window_size       = rc_window_size,
-        .initial_qp        = 0,
-        .min_qp            = (avctx->qmin > 0 ? avctx->qmin : 0),
-        .basic_unit_size   = 0,
-    };
-    ctx->global_params[ctx->nb_global_params] =
-        &ctx->rc_params.misc;
-    ctx->global_params_size[ctx->nb_global_params++] =
-        sizeof(ctx->rc_params);
-
-    ctx->hrd_params.misc.type = VAEncMiscParameterTypeHRD;
-    ctx->hrd_params.hrd = (VAEncMiscParameterHRD) {
-        .initial_buffer_fullness = hrd_initial_buffer_fullness,
-        .buffer_size             = hrd_buffer_size,
-    };
-    ctx->global_params[ctx->nb_global_params] =
-        &ctx->hrd_params.misc;
-    ctx->global_params_size[ctx->nb_global_params++] =
-        sizeof(ctx->hrd_params);
+    if (rc_mode->hrd) {
+        if (avctx->rc_buffer_size)
+            hrd_buffer_size = avctx->rc_buffer_size;
+        else if (avctx->rc_max_rate > 0)
+            hrd_buffer_size = avctx->rc_max_rate;
+        else
+            hrd_buffer_size = avctx->bit_rate;
+        if (avctx->rc_initial_buffer_occupancy) {
+            if (avctx->rc_initial_buffer_occupancy > hrd_buffer_size) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid RC buffer settings: "
+                       "must have initial buffer size (%d) <= "
+                       "buffer size (%"PRId64").\n",
+                       avctx->rc_initial_buffer_occupancy, hrd_buffer_size);
+                return AVERROR(EINVAL);
+            }
+            hrd_initial_buffer_fullness = avctx->rc_initial_buffer_occupancy;
+        } else {
+            hrd_initial_buffer_fullness = hrd_buffer_size * 3 / 4;
+        }
+
+        rc_window_size = (hrd_buffer_size * 1000) / rc_bits_per_second;
+    } else {
+        if (avctx->rc_buffer_size || avctx->rc_initial_buffer_occupancy) {
+            av_log(avctx, AV_LOG_WARNING, "Buffering settings are ignored "
+                   "in %s RC mode.\n", rc_mode->name);
+        }
+
+        hrd_buffer_size             = 0;
+        hrd_initial_buffer_fullness = 0;
+
+        if (rc_mode->mode != RC_MODE_AVBR) {
+            // Already set (with completely different meaning) for AVBR.
+            rc_window_size = 1000;
+        }
+    }
+
+    if (rc_bits_per_second          > UINT32_MAX ||
+        hrd_buffer_size             > UINT32_MAX ||
+        hrd_initial_buffer_fullness > UINT32_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "RC parameters of 2^32 or "
+               "greater are not supported by VAAPI.\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->rc_mode     = rc_mode;
+    ctx->rc_quality  = rc_quality;
+    ctx->va_rc_mode  = rc_mode->va_mode;
+    ctx->va_bit_rate = rc_bits_per_second;
+
+    av_log(avctx, AV_LOG_VERBOSE, "RC mode: %s.\n", rc_mode->name);
+    if (rc_attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        // This driver does not want the RC mode attribute to be set.
+    } else {
+        ctx->config_attributes[ctx->nb_config_attributes++] =
+            (VAConfigAttrib) {
+            .type  = VAConfigAttribRateControl,
+            .value = ctx->va_rc_mode,
+        };
+    }
+
+    if (rc_mode->quality)
+        av_log(avctx, AV_LOG_VERBOSE, "RC quality: %d.\n", rc_quality);
+
+    if (rc_mode->va_mode != VA_RC_CQP) {
+        if (rc_mode->mode == RC_MODE_AVBR) {
+            av_log(avctx, AV_LOG_VERBOSE, "RC target: %"PRId64" bps "
+                   "converging in %d frames with %d%% accuracy.\n",
+                   rc_bits_per_second, rc_window_size,
+                   rc_target_percentage);
+        } else if (rc_mode->bitrate) {
+            av_log(avctx, AV_LOG_VERBOSE, "RC target: %d%% of "
+                   "%"PRId64" bps over %d ms.\n", rc_target_percentage,
+                   rc_bits_per_second, rc_window_size);
+        }
+
+        ctx->rc_params.misc.type = VAEncMiscParameterTypeRateControl;
+        ctx->rc_params.rc = (VAEncMiscParameterRateControl) {
+            .bits_per_second    = rc_bits_per_second,
+            .target_percentage  = rc_target_percentage,
+            .window_size        = rc_window_size,
+            .initial_qp         = 0,
+            .min_qp             = (avctx->qmin > 0 ? avctx->qmin : 0),
+            .basic_unit_size    = 0,
+#if VA_CHECK_VERSION(1, 1, 0)
+            .ICQ_quality_factor = av_clip(rc_quality, 1, 51),
+            .max_qp             = (avctx->qmax > 0 ? avctx->qmax : 0),
+#endif
+#if VA_CHECK_VERSION(1, 3, 0)
+            .quality_factor     = rc_quality,
+#endif
+        };
+        vaapi_encode_add_global_param(avctx, &ctx->rc_params.misc,
+                                      sizeof(ctx->rc_params));
+    }
+
+    if (rc_mode->hrd) {
+        av_log(avctx, AV_LOG_VERBOSE, "RC buffer: %"PRId64" bits, "
+               "initial fullness %"PRId64" bits.\n",
+               hrd_buffer_size, hrd_initial_buffer_fullness);
+
+        ctx->hrd_params.misc.type = VAEncMiscParameterTypeHRD;
+        ctx->hrd_params.hrd = (VAEncMiscParameterHRD) {
+            .initial_buffer_fullness = hrd_initial_buffer_fullness,
+            .buffer_size             = hrd_buffer_size,
+        };
+        vaapi_encode_add_global_param(avctx, &ctx->hrd_params.misc,
+                                      sizeof(ctx->hrd_params));
+    }
 
     if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
         av_reduce(&fr_num, &fr_den,
@@ -1196,14 +1616,284 @@ static av_cold int vaapi_encode_init_rate_control(AVCodecContext *avctx)
         av_reduce(&fr_num, &fr_den,
                   avctx->time_base.den, avctx->time_base.num, 65535);
 
+    av_log(avctx, AV_LOG_VERBOSE, "RC framerate: %d/%d (%.2f fps).\n",
+           fr_num, fr_den, (double)fr_num / fr_den);
+
     ctx->fr_params.misc.type = VAEncMiscParameterTypeFrameRate;
     ctx->fr_params.fr.framerate = (unsigned int)fr_den << 16 | fr_num;
 
 #if VA_CHECK_VERSION(0, 40, 0)
-    ctx->global_params[ctx->nb_global_params] =
-        &ctx->fr_params.misc;
-    ctx->global_params_size[ctx->nb_global_params++] =
-        sizeof(ctx->fr_params);
+    vaapi_encode_add_global_param(avctx, &ctx->fr_params.misc,
+                                  sizeof(ctx->fr_params));
+#endif
+
+    return 0;
+}
+
+static av_cold int vaapi_encode_init_gop_structure(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAStatus vas;
+    VAConfigAttrib attr = { VAConfigAttribEncMaxRefFrames };
+    uint32_t ref_l0, ref_l1;
+
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                &attr, 1);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query reference frames "
+               "attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    }
+
+    if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        ref_l0 = ref_l1 = 0;
+    } else {
+        ref_l0 = attr.value       & 0xffff;
+        ref_l1 = attr.value >> 16 & 0xffff;
+    }
+
+    if (ctx->codec->flags & FLAG_INTRA_ONLY ||
+        avctx->gop_size <= 1) {
+        av_log(avctx, AV_LOG_VERBOSE, "Using intra frames only.\n");
+        ctx->gop_size = 1;
+    } else if (ref_l0 < 1) {
+        av_log(avctx, AV_LOG_ERROR, "Driver does not support any "
+               "reference frames.\n");
+        return AVERROR(EINVAL);
+    } else if (!(ctx->codec->flags & FLAG_B_PICTURES) ||
+               ref_l1 < 1 || avctx->max_b_frames < 1) {
+        av_log(avctx, AV_LOG_VERBOSE, "Using intra and P-frames "
+               "(supported references: %d / %d).\n", ref_l0, ref_l1);
+        ctx->gop_size = avctx->gop_size;
+        ctx->p_per_i  = INT_MAX;
+        ctx->b_per_p  = 0;
+    } else {
+        av_log(avctx, AV_LOG_VERBOSE, "Using intra, P- and B-frames "
+               "(supported references: %d / %d).\n", ref_l0, ref_l1);
+        ctx->gop_size = avctx->gop_size;
+        ctx->p_per_i  = INT_MAX;
+        ctx->b_per_p  = avctx->max_b_frames;
+        if (ctx->codec->flags & FLAG_B_PICTURE_REFERENCES) {
+            ctx->max_b_depth = FFMIN(ctx->desired_b_depth,
+                                     av_log2(ctx->b_per_p) + 1);
+        } else {
+            ctx->max_b_depth = 1;
+        }
+    }
+
+    if (ctx->codec->flags & FLAG_NON_IDR_KEY_PICTURES) {
+        ctx->closed_gop  = !!(avctx->flags & AV_CODEC_FLAG_CLOSED_GOP);
+        ctx->gop_per_idr = ctx->idr_interval + 1;
+    } else {
+        ctx->closed_gop  = 1;
+        ctx->gop_per_idr = 1;
+    }
+
+    return 0;
+}
+
+static av_cold int vaapi_encode_init_slice_structure(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAConfigAttrib attr[2] = { { VAConfigAttribEncMaxSlices },
+                               { VAConfigAttribEncSliceStructure } };
+    VAStatus vas;
+    uint32_t max_slices, slice_structure;
+    int req_slices;
+
+    if (!(ctx->codec->flags & FLAG_SLICE_CONTROL)) {
+        if (avctx->slices > 0) {
+            av_log(avctx, AV_LOG_WARNING, "Multiple slices were requested "
+                   "but this codec does not support controlling slices.\n");
+        }
+        return 0;
+    }
+
+    ctx->slice_block_rows = (avctx->height + ctx->slice_block_height - 1) /
+                             ctx->slice_block_height;
+    ctx->slice_block_cols = (avctx->width  + ctx->slice_block_width  - 1) /
+                             ctx->slice_block_width;
+
+    if (avctx->slices <= 1) {
+        ctx->nb_slices  = 1;
+        ctx->slice_size = ctx->slice_block_rows;
+        return 0;
+    }
+
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                attr, FF_ARRAY_ELEMS(attr));
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query slice "
+               "attributes: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    }
+    max_slices      = attr[0].value;
+    slice_structure = attr[1].value;
+    if (max_slices      == VA_ATTRIB_NOT_SUPPORTED ||
+        slice_structure == VA_ATTRIB_NOT_SUPPORTED) {
+        av_log(avctx, AV_LOG_ERROR, "Driver does not support encoding "
+               "pictures as multiple slices.\n.");
+        return AVERROR(EINVAL);
+    }
+
+    // For fixed-size slices currently we only support whole rows, making
+    // rectangular slices.  This could be extended to arbitrary runs of
+    // blocks, but since slices tend to be a conformance requirement and
+    // most cases (such as broadcast or bluray) want rectangular slices
+    // only it would need to be gated behind another option.
+    if (avctx->slices > ctx->slice_block_rows) {
+        av_log(avctx, AV_LOG_WARNING, "Not enough rows to use "
+               "configured number of slices (%d < %d); using "
+               "maximum.\n", ctx->slice_block_rows, avctx->slices);
+        req_slices = ctx->slice_block_rows;
+    } else {
+        req_slices = avctx->slices;
+    }
+    if (slice_structure & VA_ENC_SLICE_STRUCTURE_ARBITRARY_ROWS ||
+        slice_structure & VA_ENC_SLICE_STRUCTURE_ARBITRARY_MACROBLOCKS) {
+        ctx->nb_slices  = req_slices;
+        ctx->slice_size = ctx->slice_block_rows / ctx->nb_slices;
+    } else if (slice_structure & VA_ENC_SLICE_STRUCTURE_POWER_OF_TWO_ROWS) {
+        int k;
+        for (k = 1;; k *= 2) {
+            if (2 * k * (req_slices - 1) + 1 >= ctx->slice_block_rows)
+                break;
+        }
+        ctx->nb_slices  = (ctx->slice_block_rows + k - 1) / k;
+        ctx->slice_size = k;
+#if VA_CHECK_VERSION(1, 0, 0)
+    } else if (slice_structure & VA_ENC_SLICE_STRUCTURE_EQUAL_ROWS) {
+        ctx->nb_slices  = ctx->slice_block_rows;
+        ctx->slice_size = 1;
+#endif
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Driver does not support any usable "
+               "slice structure modes (%#x).\n", slice_structure);
+        return AVERROR(EINVAL);
+    }
+
+    if (ctx->nb_slices > avctx->slices) {
+        av_log(avctx, AV_LOG_WARNING, "Slice count rounded up to "
+               "%d (from %d) due to driver constraints on slice "
+               "structure.\n", ctx->nb_slices, avctx->slices);
+    }
+    if (ctx->nb_slices > max_slices) {
+        av_log(avctx, AV_LOG_ERROR, "Driver does not support "
+               "encoding with %d slices (max %"PRIu32").\n",
+               ctx->nb_slices, max_slices);
+        return AVERROR(EINVAL);
+    }
+
+    av_log(avctx, AV_LOG_VERBOSE, "Encoding pictures with %d slices "
+           "(default size %d block rows).\n",
+           ctx->nb_slices, ctx->slice_size);
+    return 0;
+}
+
+static av_cold int vaapi_encode_init_packed_headers(AVCodecContext *avctx)
+{
+    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAStatus vas;
+    VAConfigAttrib attr = { VAConfigAttribEncPackedHeaders };
+
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                &attr, 1);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query packed headers "
+               "attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    }
+
+    if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        if (ctx->desired_packed_headers) {
+            av_log(avctx, AV_LOG_WARNING, "Driver does not support any "
+                   "packed headers (wanted %#x).\n",
+                   ctx->desired_packed_headers);
+        } else {
+            av_log(avctx, AV_LOG_VERBOSE, "Driver does not support any "
+                   "packed headers (none wanted).\n");
+        }
+        ctx->va_packed_headers = 0;
+    } else {
+        if (ctx->desired_packed_headers & ~attr.value) {
+            av_log(avctx, AV_LOG_WARNING, "Driver does not support some "
+                   "wanted packed headers (wanted %#x, found %#x).\n",
+                   ctx->desired_packed_headers, attr.value);
+        } else {
+            av_log(avctx, AV_LOG_VERBOSE, "All wanted packed headers "
+                   "available (wanted %#x, found %#x).\n",
+                   ctx->desired_packed_headers, attr.value);
+        }
+        ctx->va_packed_headers = ctx->desired_packed_headers & attr.value;
+    }
+
+    if (ctx->va_packed_headers) {
+        ctx->config_attributes[ctx->nb_config_attributes++] =
+            (VAConfigAttrib) {
+            .type  = VAConfigAttribEncPackedHeaders,
+            .value = ctx->va_packed_headers,
+        };
+    }
+
+    if ( (ctx->desired_packed_headers & VA_ENC_PACKED_HEADER_SEQUENCE) &&
+        !(ctx->va_packed_headers      & VA_ENC_PACKED_HEADER_SEQUENCE) &&
+         (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)) {
+        av_log(avctx, AV_LOG_WARNING, "Driver does not support packed "
+               "sequence headers, but a global header is requested.\n");
+        av_log(avctx, AV_LOG_WARNING, "No global header will be written: "
+               "this may result in a stream which is not usable for some "
+               "purposes (e.g. not muxable to some containers).\n");
+    }
+
+    return 0;
+}
+
+static av_cold int vaapi_encode_init_quality(AVCodecContext *avctx)
+{
+#if VA_CHECK_VERSION(0, 36, 0)
+    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAStatus vas;
+    VAConfigAttrib attr = { VAConfigAttribEncQualityRange };
+    int quality = avctx->compression_level;
+
+    vas = vaGetConfigAttributes(ctx->hwctx->display,
+                                ctx->va_profile,
+                                ctx->va_entrypoint,
+                                &attr, 1);
+    if (vas != VA_STATUS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to query quality "
+               "config attribute: %d (%s).\n", vas, vaErrorStr(vas));
+        return AVERROR_EXTERNAL;
+    }
+
+    if (attr.value == VA_ATTRIB_NOT_SUPPORTED) {
+        if (quality != 0) {
+            av_log(avctx, AV_LOG_WARNING, "Quality attribute is not "
+                   "supported: will use default quality level.\n");
+        }
+    } else {
+        if (quality > attr.value) {
+            av_log(avctx, AV_LOG_WARNING, "Invalid quality level: "
+                   "valid range is 0-%d, using %d.\n",
+                   attr.value, attr.value);
+            quality = attr.value;
+        }
+
+        ctx->quality_params.misc.type = VAEncMiscParameterTypeQualityLevel;
+        ctx->quality_params.quality.quality_level = quality;
+
+        vaapi_encode_add_global_param(avctx, &ctx->quality_params.misc,
+                                      sizeof(ctx->quality_params));
+    }
+#else
+    av_log(avctx, AV_LOG_WARNING, "The encode quality option is "
+           "not supported with this VAAPI version.\n");
 #endif
 
     return 0;
@@ -1333,9 +2023,6 @@ static av_cold int vaapi_encode_create_recon_frames(AVCodecContext *avctx)
     ctx->recon_frames->sw_format = recon_format;
     ctx->recon_frames->width     = ctx->surface_width;
     ctx->recon_frames->height    = ctx->surface_height;
-    // At most three IDR/I/P frames and two runs of B frames can be in
-    // flight at any one time.
-    ctx->recon_frames->initial_pool_size = 3 + 2 * avctx->max_b_frames;
 
     err = av_hwframe_ctx_init(ctx->recon_frames_ref);
     if (err < 0) {
@@ -1364,17 +2051,9 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    ctx->codec_options = ctx->codec_options_data;
-
     ctx->va_config  = VA_INVALID_ID;
     ctx->va_context = VA_INVALID_ID;
 
-    ctx->priv_data = av_mallocz(ctx->codec->priv_data_size);
-    if (!ctx->priv_data) {
-        err = AVERROR(ENOMEM);
-        goto fail;
-    }
-
     ctx->input_frames_ref = av_buffer_ref(avctx->hw_frames_ctx);
     if (!ctx->input_frames_ref) {
         err = AVERROR(ENOMEM);
@@ -1390,10 +2069,32 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
     ctx->device = (AVHWDeviceContext*)ctx->device_ref->data;
     ctx->hwctx = ctx->device->hwctx;
 
-    err = vaapi_encode_config_attributes(avctx);
+    err = vaapi_encode_profile_entrypoint(avctx);
+    if (err < 0)
+        goto fail;
+
+    err = vaapi_encode_init_rate_control(avctx);
+    if (err < 0)
+        goto fail;
+
+    err = vaapi_encode_init_gop_structure(avctx);
     if (err < 0)
         goto fail;
 
+    err = vaapi_encode_init_slice_structure(avctx);
+    if (err < 0)
+        goto fail;
+
+    err = vaapi_encode_init_packed_headers(avctx);
+    if (err < 0)
+        goto fail;
+
+    if (avctx->compression_level >= 0) {
+        err = vaapi_encode_init_quality(avctx);
+        if (err < 0)
+            goto fail;
+    }
+
     vas = vaCreateConfig(ctx->hwctx->display,
                          ctx->va_profile, ctx->va_entrypoint,
                          ctx->config_attributes, ctx->nb_config_attributes,
@@ -1431,61 +2132,14 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
         goto fail;
     }
 
-    if (ctx->va_rc_mode & ~VA_RC_CQP) {
-        err = vaapi_encode_init_rate_control(avctx);
-        if (err < 0)
-            goto fail;
-    }
-
     if (ctx->codec->configure) {
         err = ctx->codec->configure(avctx);
         if (err < 0)
             goto fail;
     }
 
-    if (avctx->compression_level >= 0) {
-#if VA_CHECK_VERSION(0, 36, 0)
-        VAConfigAttrib attr = { VAConfigAttribEncQualityRange };
-
-        vas = vaGetConfigAttributes(ctx->hwctx->display,
-                                    ctx->va_profile,
-                                    ctx->va_entrypoint,
-                                    &attr, 1);
-        if (vas != VA_STATUS_SUCCESS) {
-            av_log(avctx, AV_LOG_WARNING, "Failed to query quality "
-                   "attribute: will use default compression level.\n");
-        } else {
-            if (avctx->compression_level > attr.value) {
-                av_log(avctx, AV_LOG_WARNING, "Invalid compression "
-                       "level: valid range is 0-%d, using %d.\n",
-                       attr.value, attr.value);
-                avctx->compression_level = attr.value;
-            }
-
-            ctx->quality_params.misc.type =
-                VAEncMiscParameterTypeQualityLevel;
-            ctx->quality_params.quality.quality_level =
-                avctx->compression_level;
-
-            ctx->global_params[ctx->nb_global_params] =
-                &ctx->quality_params.misc;
-            ctx->global_params_size[ctx->nb_global_params++] =
-                sizeof(ctx->quality_params);
-        }
-#else
-        av_log(avctx, AV_LOG_WARNING, "The encode compression level "
-               "option is not supported with this VAAPI version.\n");
-#endif
-    }
-
-    ctx->input_order  = 0;
-    ctx->output_delay = avctx->max_b_frames;
-    ctx->decode_delay = 1;
-    ctx->output_order = - ctx->output_delay - 1;
-
-    // Currently we never generate I frames, only IDR.
-    ctx->p_per_i = INT_MAX;
-    ctx->b_per_p = avctx->max_b_frames;
+    ctx->output_delay = ctx->b_per_p;
+    ctx->decode_delay = ctx->max_b_depth;
 
     if (ctx->codec->sequence_params_size > 0) {
         ctx->codec_sequence_params =
@@ -1513,12 +2167,9 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
         }
     }
 
-    // This should be configurable somehow.  (Needs testing on a machine
-    // where it actually overlaps properly, though.)
-    ctx->issue_mode = ISSUE_MODE_MAXIMISE_THROUGHPUT;
-
     if (ctx->va_packed_headers & VA_ENC_PACKED_HEADER_SEQUENCE &&
-        ctx->codec->write_sequence_header) {
+        ctx->codec->write_sequence_header &&
+        avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
         char data[MAX_PARAM_BUFFER_SIZE];
         size_t bit_len = 8 * sizeof(data);
 
@@ -1575,7 +2226,5 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
     av_buffer_unref(&ctx->input_frames_ref);
     av_buffer_unref(&ctx->device_ref);
 
-    av_freep(&ctx->priv_data);
-
     return 0;
 }
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index 31c3790..44a8db5 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,10 @@
 
 #include <va/va.h>
 
+#if VA_CHECK_VERSION(1, 0, 0)
+#include <va/va_str.h>
+#endif
+
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_vaapi.h"
 
@@ -34,6 +38,7 @@ struct VAAPIEncodePicture;
 enum {
     MAX_CONFIG_ATTRIBUTES  = 4,
     MAX_GLOBAL_PARAMS      = 4,
+    MAX_DPB_SIZE           = 16,
     MAX_PICTURE_REFERENCES = 2,
     MAX_REORDER_DELAY      = 16,
     MAX_PARAM_BUFFER_SIZE  = 1024,
@@ -48,6 +53,10 @@ enum {
 
 typedef struct VAAPIEncodeSlice {
     int             index;
+    int             row_start;
+    int             row_size;
+    int             block_start;
+    int             block_size;
     void           *priv_data;
     void           *codec_slice_params;
 } VAAPIEncodeSlice;
@@ -58,9 +67,10 @@ typedef struct VAAPIEncodePicture {
     int64_t         display_order;
     int64_t         encode_order;
     int64_t         pts;
+    int             force_idr;
 
     int             type;
-    int             input_available;
+    int             b_depth;
     int             encode_issued;
     int             encode_complete;
 
@@ -79,30 +89,103 @@ typedef struct VAAPIEncodePicture {
     void           *priv_data;
     void           *codec_picture_params;
 
-    int          nb_refs;
+    // Whether this picture is a reference picture.
+    int             is_reference;
+
+    // The contents of the DPB after this picture has been decoded.
+    // This will contain the picture itself if it is a reference picture,
+    // but not if it isn't.
+    int                     nb_dpb_pics;
+    struct VAAPIEncodePicture *dpb[MAX_DPB_SIZE];
+    // The reference pictures used in decoding this picture.  If they are
+    // used by later pictures they will also appear in the DPB.
+    int                     nb_refs;
     struct VAAPIEncodePicture *refs[MAX_PICTURE_REFERENCES];
+    // The previous reference picture in encode order.  Must be in at least
+    // one of the reference list and DPB list.
+    struct VAAPIEncodePicture *prev;
+    // Reference count for other pictures referring to this one through
+    // the above pointers, directly from incomplete pictures and indirectly
+    // through completed pictures.
+    int             ref_count[2];
+    int             ref_removed[2];
 
     int          nb_slices;
     VAAPIEncodeSlice *slices;
 } VAAPIEncodePicture;
 
+typedef struct VAAPIEncodeProfile {
+    // lavc profile value (FF_PROFILE_*).
+    int       av_profile;
+    // Supported bit depth.
+    int       depth;
+    // Number of components.
+    int       nb_components;
+    // Chroma subsampling in width dimension.
+    int       log2_chroma_w;
+    // Chroma subsampling in height dimension.
+    int       log2_chroma_h;
+    // VAAPI profile value.
+    VAProfile va_profile;
+} VAAPIEncodeProfile;
+
+enum {
+    RC_MODE_AUTO,
+    RC_MODE_CQP,
+    RC_MODE_CBR,
+    RC_MODE_VBR,
+    RC_MODE_ICQ,
+    RC_MODE_QVBR,
+    RC_MODE_AVBR,
+    RC_MODE_MAX = RC_MODE_AVBR,
+};
+
+typedef struct VAAPIEncodeRCMode {
+    // Mode from above enum (RC_MODE_*).
+    int mode;
+    // Name.
+    const char *name;
+    // Supported in the compile-time VAAPI version.
+    int supported;
+    // VA mode value (VA_RC_*).
+    uint32_t va_mode;
+    // Uses bitrate parameters.
+    int bitrate;
+    // Supports maxrate distinct from bitrate.
+    int maxrate;
+    // Uses quality value.
+    int quality;
+    // Supports HRD/VBV parameters.
+    int hrd;
+} VAAPIEncodeRCMode;
+
 typedef struct VAAPIEncodeContext {
     const AVClass *class;
 
     // Codec-specific hooks.
     const struct VAAPIEncodeType *codec;
 
-    // Encoding profile (VAProfileXXX).
-    VAProfile       va_profile;
-    // Encoding entrypoint (usually VAEntryointEncSlice).
-    VAEntrypoint    va_entrypoint;
-    // Surface colour/sampling format (usually VA_RT_FORMAT_YUV420).
-    unsigned int    va_rt_format;
-    // Rate control mode.
-    unsigned int    va_rc_mode;
-    // Supported packed headers (initially the desired set, modified
-    // later to what is actually supported).
-    unsigned int    va_packed_headers;
+    // Global options.
+
+    // Use low power encoding mode.
+    int             low_power;
+
+    // Number of I frames between IDR frames.
+    int             idr_interval;
+
+    // Desired B frame reference depth.
+    int             desired_b_depth;
+
+    // Explicitly set RC mode (otherwise attempt to pick from
+    // available modes).
+    int             explicit_rc_mode;
+
+    // Explicitly-set QP, for use with the "qp" options.
+    // (Forces CQP mode when set, overriding everything else.)
+    int             explicit_qp;
+
+    // Desired packed headers.
+    unsigned int    desired_packed_headers;
 
     // The required size of surfaces.  This is probably the input
     // size (AVCodecContext.width|height) aligned up to whatever
@@ -110,11 +193,32 @@ typedef struct VAAPIEncodeContext {
     int             surface_width;
     int             surface_height;
 
+    // The block size for slice calculations.
+    int             slice_block_width;
+    int             slice_block_height;
+
     // Everything above this point must be set before calling
     // ff_vaapi_encode_init().
 
-    // Codec-specific state.
-    void *priv_data;
+    // Chosen encoding profile details.
+    const VAAPIEncodeProfile *profile;
+
+    // Chosen rate control mode details.
+    const VAAPIEncodeRCMode *rc_mode;
+    // RC quality level - meaning depends on codec and RC mode.
+    // In CQP mode this sets the fixed quantiser value.
+    int             rc_quality;
+
+    // Encoding profile (VAProfile*).
+    VAProfile       va_profile;
+    // Encoding entrypoint (VAEntryoint*).
+    VAEntrypoint    va_entrypoint;
+    // Rate control mode.
+    unsigned int    va_rc_mode;
+    // Bitrate for codec-specific encoder parameters.
+    unsigned int    va_bit_rate;
+    // Packed headers which will actually be sent.
+    unsigned int    va_packed_headers;
 
     // Configuration attributes to use when creating va_config.
     VAConfigAttrib  config_attributes[MAX_CONFIG_ATTRIBUTES];
@@ -173,55 +277,82 @@ typedef struct VAAPIEncodeContext {
 
     // Current encoding window, in display (input) order.
     VAAPIEncodePicture *pic_start, *pic_end;
+    // The next picture to use as the previous reference picture in
+    // encoding order.
+    VAAPIEncodePicture *next_prev;
 
     // Next input order index (display order).
     int64_t         input_order;
     // Number of frames that output is behind input.
     int64_t         output_delay;
+    // Next encode order index.
+    int64_t         encode_order;
     // Number of frames decode output will need to be delayed.
     int64_t         decode_delay;
-    // Next output order index (encode order).
+    // Next output order index (in encode order).
     int64_t         output_order;
 
-    enum {
-        // All encode operations are done independently (synchronise
-        // immediately after every operation).
-        ISSUE_MODE_SERIALISE_EVERYTHING = 0,
-        // Overlap as many operations as possible.
-        ISSUE_MODE_MAXIMISE_THROUGHPUT,
-        // Overlap operations only when satisfying parallel dependencies.
-        ISSUE_MODE_MINIMISE_LATENCY,
-    } issue_mode;
-
     // Timestamp handling.
     int64_t         first_pts;
     int64_t         dts_pts_diff;
     int64_t         ts_ring[MAX_REORDER_DELAY * 3];
 
+    // Slice structure.
+    int slice_block_rows;
+    int slice_block_cols;
+    int nb_slices;
+    int slice_size;
+
     // Frame type decision.
+    int gop_size;
+    int closed_gop;
+    int gop_per_idr;
     int p_per_i;
+    int max_b_depth;
     int b_per_p;
     int force_idr;
+    int idr_counter;
     int gop_counter;
-    int p_counter;
     int end_of_stream;
-
-    // Codec-local options are allocated to follow this structure in
-    // memory (in the AVCodec definition, set priv_data_size to
-    // sizeof(VAAPIEncodeContext) + sizeof(VAAPIEncodeFooOptions)).
-    void *codec_options;
-    char codec_options_data[0];
 } VAAPIEncodeContext;
 
+enum {
+    // Codec supports controlling the subdivision of pictures into slices.
+    FLAG_SLICE_CONTROL         = 1 << 0,
+    // Codec only supports constant quality (no rate control).
+    FLAG_CONSTANT_QUALITY_ONLY = 1 << 1,
+    // Codec is intra-only.
+    FLAG_INTRA_ONLY            = 1 << 2,
+    // Codec supports B-pictures.
+    FLAG_B_PICTURES            = 1 << 3,
+    // Codec supports referencing B-pictures.
+    FLAG_B_PICTURE_REFERENCES  = 1 << 4,
+    // Codec supports non-IDR key pictures (that is, key pictures do
+    // not necessarily empty the DPB).
+    FLAG_NON_IDR_KEY_PICTURES  = 1 << 5,
+};
 
 typedef struct VAAPIEncodeType {
-    size_t priv_data_size;
+    // List of supported profiles and corresponding VAAPI profiles.
+    // (Must end with FF_PROFILE_UNKNOWN.)
+    const VAAPIEncodeProfile *profiles;
+
+    // Codec feature flags.
+    int flags;
+
+    // Default quality for this codec - used as quantiser or RC quality
+    // factor depending on RC mode.
+    int default_quality;
 
     // Perform any extra codec-specific configuration after the
     // codec context is initialised (set up the private data and
     // add any necessary global parameters).
     int (*configure)(AVCodecContext *avctx);
 
+    // The size of any private data structure associated with each
+    // picture (can be zero if not required).
+    size_t picture_priv_data_size;
+
     // The size of the parameter structures:
     // sizeof(VAEnc{type}ParameterBuffer{codec}).
     size_t sequence_params_size;
@@ -277,7 +408,44 @@ typedef struct VAAPIEncodeType {
 int ff_vaapi_encode2(AVCodecContext *avctx, AVPacket *pkt,
                      const AVFrame *input_image, int *got_packet);
 
+int ff_vaapi_encode_send_frame(AVCodecContext *avctx, const AVFrame *frame);
+int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt);
+
 int ff_vaapi_encode_init(AVCodecContext *avctx);
 int ff_vaapi_encode_close(AVCodecContext *avctx);
 
+
+#define VAAPI_ENCODE_COMMON_OPTIONS \
+    { "low_power", \
+      "Use low-power encoding mode (only available on some platforms; " \
+      "may not support all encoding features)", \
+      OFFSET(common.low_power), AV_OPT_TYPE_BOOL, \
+      { .i64 = 0 }, 0, 1, FLAGS }, \
+    { "idr_interval", \
+      "Distance (in I-frames) between IDR frames", \
+      OFFSET(common.idr_interval), AV_OPT_TYPE_INT, \
+      { .i64 = 0 }, 0, INT_MAX, FLAGS }, \
+    { "b_depth", \
+      "Maximum B-frame reference depth", \
+      OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
+      { .i64 = 1 }, 1, INT_MAX, FLAGS }
+
+#define VAAPI_ENCODE_RC_MODE(name, desc) \
+    { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
+      0, 0, FLAGS, "rc_mode" }
+#define VAAPI_ENCODE_RC_OPTIONS \
+    { "rc_mode",\
+      "Set rate control mode", \
+      OFFSET(common.explicit_rc_mode), AV_OPT_TYPE_INT, \
+      { .i64 = RC_MODE_AUTO }, RC_MODE_AUTO, RC_MODE_MAX, FLAGS, "rc_mode" }, \
+    { "auto", "Choose mode automatically based on other parameters", \
+      0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_AUTO }, 0, 0, FLAGS, "rc_mode" }, \
+    VAAPI_ENCODE_RC_MODE(CQP,  "Constant-quality"), \
+    VAAPI_ENCODE_RC_MODE(CBR,  "Constant-bitrate"), \
+    VAAPI_ENCODE_RC_MODE(VBR,  "Variable-bitrate"), \
+    VAAPI_ENCODE_RC_MODE(ICQ,  "Intelligent constant-quality"), \
+    VAAPI_ENCODE_RC_MODE(QVBR, "Quality-defined variable-bitrate"), \
+    VAAPI_ENCODE_RC_MODE(AVBR, "Average variable-bitrate")
+
+
 #endif /* AVCODEC_VAAPI_ENCODE_H */
diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
index 74a6417..91be33f 100644
--- a/libavcodec/vaapi_encode_h264.c
+++ b/libavcodec/vaapi_encode_h264.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 #include "cbs.h"
 #include "cbs_h264.h"
 #include "h264.h"
+#include "h264_levels.h"
 #include "h264_sei.h"
 #include "internal.h"
 #include "vaapi_encode.h"
@@ -46,62 +47,69 @@ static const uint8_t vaapi_encode_h264_sei_identifier_uuid[16] = {
     0x96, 0x75, 0x19, 0xd4, 0x1f, 0xea, 0xa9, 0x4d,
 };
 
-typedef struct VAAPIEncodeH264Context {
-    int mb_width;
-    int mb_height;
-
-    int fixed_qp_idr;
-    int fixed_qp_p;
-    int fixed_qp_b;
-
-    H264RawAUD aud;
-    H264RawSPS sps;
-    H264RawPPS pps;
-    H264RawSEI sei;
-    H264RawSlice slice;
-
-    H264RawSEIBufferingPeriod buffering_period;
-    H264RawSEIPicTiming pic_timing;
-    H264RawSEIRecoveryPoint recovery_point;
-    H264RawSEIUserDataUnregistered identifier;
-    char *identifier_string;
-
+typedef struct VAAPIEncodeH264Picture {
     int frame_num;
     int pic_order_cnt;
-    int next_frame_num;
+
     int64_t last_idr_frame;
-    int64_t idr_pic_count;
+    uint16_t idr_pic_id;
 
     int primary_pic_type;
     int slice_type;
 
     int cpb_delay;
     int dpb_delay;
+} VAAPIEncodeH264Picture;
 
-    CodedBitstreamContext *cbc;
-    CodedBitstreamFragment current_access_unit;
-    int aud_needed;
-    int sei_needed;
-    int sei_cbr_workaround_needed;
-} VAAPIEncodeH264Context;
+typedef struct VAAPIEncodeH264Context {
+    VAAPIEncodeContext common;
 
-typedef struct VAAPIEncodeH264Options {
+    // User options.
     int qp;
     int quality;
-    int low_power;
+    int coder;
     int aud;
     int sei;
     int profile;
     int level;
-} VAAPIEncodeH264Options;
+
+    // Derived settings.
+    int mb_width;
+    int mb_height;
+
+    int fixed_qp_idr;
+    int fixed_qp_p;
+    int fixed_qp_b;
+
+    int dpb_frames;
+
+    // Writer structures.
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment current_access_unit;
+
+    H264RawAUD   raw_aud;
+    H264RawSPS   raw_sps;
+    H264RawPPS   raw_pps;
+    H264RawSEI   raw_sei;
+    H264RawSlice raw_slice;
+
+    H264RawSEIBufferingPeriod      sei_buffering_period;
+    H264RawSEIPicTiming            sei_pic_timing;
+    H264RawSEIRecoveryPoint        sei_recovery_point;
+    H264RawSEIUserDataUnregistered sei_identifier;
+    char                          *sei_identifier_string;
+
+    int aud_needed;
+    int sei_needed;
+    int sei_cbr_workaround_needed;
+} VAAPIEncodeH264Context;
 
 
 static int vaapi_encode_h264_write_access_unit(AVCodecContext *avctx,
                                                char *data, size_t *data_len,
                                                CodedBitstreamFragment *au)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_write_fragment_data(priv->cbc, au);
@@ -127,8 +135,7 @@ static int vaapi_encode_h264_add_nal(AVCodecContext *avctx,
                                      CodedBitstreamFragment *au,
                                      void *nal_unit)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
     H264RawNALUnitHeader *header = nal_unit;
     int err;
 
@@ -146,29 +153,28 @@ static int vaapi_encode_h264_add_nal(AVCodecContext *avctx,
 static int vaapi_encode_h264_write_sequence_header(AVCodecContext *avctx,
                                                    char *data, size_t *data_len)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
     CodedBitstreamFragment   *au = &priv->current_access_unit;
     int err;
 
     if (priv->aud_needed) {
-        err = vaapi_encode_h264_add_nal(avctx, au, &priv->aud);
+        err = vaapi_encode_h264_add_nal(avctx, au, &priv->raw_aud);
         if (err < 0)
             goto fail;
         priv->aud_needed = 0;
     }
 
-    err = vaapi_encode_h264_add_nal(avctx, au, &priv->sps);
+    err = vaapi_encode_h264_add_nal(avctx, au, &priv->raw_sps);
     if (err < 0)
         goto fail;
 
-    err = vaapi_encode_h264_add_nal(avctx, au, &priv->pps);
+    err = vaapi_encode_h264_add_nal(avctx, au, &priv->raw_pps);
     if (err < 0)
         goto fail;
 
     err = vaapi_encode_h264_write_access_unit(avctx, data, data_len, au);
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, au);
+    ff_cbs_fragment_reset(priv->cbc, au);
     return err;
 }
 
@@ -177,25 +183,24 @@ static int vaapi_encode_h264_write_slice_header(AVCodecContext *avctx,
                                                 VAAPIEncodeSlice *slice,
                                                 char *data, size_t *data_len)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
     CodedBitstreamFragment   *au = &priv->current_access_unit;
     int err;
 
     if (priv->aud_needed) {
-        err = vaapi_encode_h264_add_nal(avctx, au, &priv->aud);
+        err = vaapi_encode_h264_add_nal(avctx, au, &priv->raw_aud);
         if (err < 0)
             goto fail;
         priv->aud_needed = 0;
     }
 
-    err = vaapi_encode_h264_add_nal(avctx, au, &priv->slice);
+    err = vaapi_encode_h264_add_nal(avctx, au, &priv->raw_slice);
     if (err < 0)
         goto fail;
 
     err = vaapi_encode_h264_write_access_unit(avctx, data, data_len, au);
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, au);
+    ff_cbs_fragment_reset(priv->cbc, au);
     return err;
 }
 
@@ -204,49 +209,53 @@ static int vaapi_encode_h264_write_extra_header(AVCodecContext *avctx,
                                                 int index, int *type,
                                                 char *data, size_t *data_len)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
-    VAAPIEncodeH264Options  *opt = ctx->codec_options;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
     CodedBitstreamFragment   *au = &priv->current_access_unit;
     int err, i;
 
     if (priv->sei_needed) {
+        H264RawSEI *sei = &priv->raw_sei;
+
         if (priv->aud_needed) {
-            err = vaapi_encode_h264_add_nal(avctx, au, &priv->aud);
+            err = vaapi_encode_h264_add_nal(avctx, au, &priv->raw_aud);
             if (err < 0)
                 goto fail;
             priv->aud_needed = 0;
         }
 
-        memset(&priv->sei, 0, sizeof(priv->sei));
-        priv->sei.nal_unit_header.nal_unit_type = H264_NAL_SEI;
+        *sei = (H264RawSEI) {
+            .nal_unit_header = {
+                .nal_unit_type = H264_NAL_SEI,
+            },
+        };
 
         i = 0;
-        if (pic->encode_order == 0 && opt->sei & SEI_IDENTIFIER) {
-            priv->sei.payload[i].payload_type = H264_SEI_TYPE_USER_DATA_UNREGISTERED;
-            priv->sei.payload[i].payload.user_data_unregistered = priv->identifier;
+
+        if (priv->sei_needed & SEI_IDENTIFIER) {
+            sei->payload[i].payload_type = H264_SEI_TYPE_USER_DATA_UNREGISTERED;
+            sei->payload[i].payload.user_data_unregistered = priv->sei_identifier;
             ++i;
         }
-        if (opt->sei & SEI_TIMING) {
+        if (priv->sei_needed & SEI_TIMING) {
             if (pic->type == PICTURE_TYPE_IDR) {
-                priv->sei.payload[i].payload_type = H264_SEI_TYPE_BUFFERING_PERIOD;
-                priv->sei.payload[i].payload.buffering_period = priv->buffering_period;
+                sei->payload[i].payload_type = H264_SEI_TYPE_BUFFERING_PERIOD;
+                sei->payload[i].payload.buffering_period = priv->sei_buffering_period;
                 ++i;
             }
-            priv->sei.payload[i].payload_type = H264_SEI_TYPE_PIC_TIMING;
-            priv->sei.payload[i].payload.pic_timing = priv->pic_timing;
+            sei->payload[i].payload_type = H264_SEI_TYPE_PIC_TIMING;
+            sei->payload[i].payload.pic_timing = priv->sei_pic_timing;
             ++i;
         }
-        if (opt->sei & SEI_RECOVERY_POINT && pic->type == PICTURE_TYPE_I) {
-            priv->sei.payload[i].payload_type = H264_SEI_TYPE_RECOVERY_POINT;
-            priv->sei.payload[i].payload.recovery_point = priv->recovery_point;
+        if (priv->sei_needed & SEI_RECOVERY_POINT) {
+            sei->payload[i].payload_type = H264_SEI_TYPE_RECOVERY_POINT;
+            sei->payload[i].payload.recovery_point = priv->sei_recovery_point;
             ++i;
         }
 
-        priv->sei.payload_count = i;
-        av_assert0(priv->sei.payload_count > 0);
+        sei->payload_count = i;
+        av_assert0(sei->payload_count > 0);
 
-        err = vaapi_encode_h264_add_nal(avctx, au, &priv->sei);
+        err = vaapi_encode_h264_add_nal(avctx, au, sei);
         if (err < 0)
             goto fail;
         priv->sei_needed = 0;
@@ -255,12 +264,12 @@ static int vaapi_encode_h264_write_extra_header(AVCodecContext *avctx,
         if (err < 0)
             goto fail;
 
-        ff_cbs_fragment_uninit(priv->cbc, au);
+        ff_cbs_fragment_reset(priv->cbc, au);
 
         *type = VAEncPackedHeaderRawData;
         return 0;
 
-#if !HAVE_VAAPI_1
+#if !CONFIG_VAAPI_1
     } else if (priv->sei_cbr_workaround_needed) {
         // Insert a zero-length header using the old SEI type.  This is
         // required to avoid triggering broken behaviour on Intel platforms
@@ -277,23 +286,19 @@ static int vaapi_encode_h264_write_extra_header(AVCodecContext *avctx,
     }
 
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, au);
+    ff_cbs_fragment_reset(priv->cbc, au);
     return err;
 }
 
 static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
 {
     VAAPIEncodeContext                *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context           *priv = ctx->priv_data;
-    VAAPIEncodeH264Options            *opt = ctx->codec_options;
-    H264RawSPS                        *sps = &priv->sps;
-    H264RawPPS                        *pps = &priv->pps;
+    VAAPIEncodeH264Context           *priv = avctx->priv_data;
+    H264RawSPS                        *sps = &priv->raw_sps;
+    H264RawPPS                        *pps = &priv->raw_pps;
     VAEncSequenceParameterBufferH264 *vseq = ctx->codec_sequence_params;
     VAEncPictureParameterBufferH264  *vpic = ctx->codec_picture_params;
 
-    memset(&priv->current_access_unit, 0,
-           sizeof(priv->current_access_unit));
-
     memset(sps, 0, sizeof(*sps));
     memset(pps, 0, sizeof(*pps));
 
@@ -301,24 +306,55 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
     sps->nal_unit_header.nal_unit_type = H264_NAL_SPS;
 
     sps->profile_idc = avctx->profile & 0xff;
-    sps->constraint_set1_flag =
-        !!(avctx->profile & FF_PROFILE_H264_CONSTRAINED);
-    sps->constraint_set3_flag =
-        !!(avctx->profile & FF_PROFILE_H264_INTRA);
 
-    sps->level_idc = avctx->level;
+    if (avctx->profile == FF_PROFILE_H264_CONSTRAINED_BASELINE ||
+        avctx->profile == FF_PROFILE_H264_MAIN)
+        sps->constraint_set1_flag = 1;
+
+    if (avctx->profile == FF_PROFILE_H264_HIGH)
+        sps->constraint_set3_flag = ctx->gop_size == 1;
+
+    if (avctx->profile == FF_PROFILE_H264_MAIN ||
+        avctx->profile == FF_PROFILE_H264_HIGH) {
+        sps->constraint_set4_flag = 1;
+        sps->constraint_set5_flag = ctx->b_per_p == 0;
+    }
+
+    if (ctx->gop_size == 1)
+        priv->dpb_frames = 0;
+    else
+        priv->dpb_frames = 1 + ctx->max_b_depth;
+
+    if (avctx->level != FF_LEVEL_UNKNOWN) {
+        sps->level_idc = avctx->level;
+    } else {
+        const H264LevelDescriptor *level;
+
+        level = ff_h264_guess_level(sps->profile_idc,
+                                    avctx->bit_rate,
+                                    priv->mb_width  * 16,
+                                    priv->mb_height * 16,
+                                    priv->dpb_frames);
+        if (level) {
+            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n", level->name);
+            if (level->constraint_set3_flag)
+                sps->constraint_set3_flag = 1;
+            sps->level_idc = level->level_idc;
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "Stream will not conform "
+                   "to any level: using level 6.2.\n");
+            sps->level_idc = 62;
+        }
+    }
 
     sps->seq_parameter_set_id = 0;
     sps->chroma_format_idc    = 1;
 
     sps->log2_max_frame_num_minus4 = 4;
     sps->pic_order_cnt_type        = 0;
-    sps->log2_max_pic_order_cnt_lsb_minus4 =
-        av_clip(av_log2(ctx->b_per_p + 1) - 2, 0, 12);
+    sps->log2_max_pic_order_cnt_lsb_minus4 = 4;
 
-    sps->max_num_ref_frames =
-        (avctx->profile & FF_PROFILE_H264_INTRA) ? 0 :
-        1 + (ctx->b_per_p > 0);
+    sps->max_num_ref_frames = priv->dpb_frames;
 
     sps->pic_width_in_mbs_minus1        = priv->mb_width  - 1;
     sps->pic_height_in_map_units_minus1 = priv->mb_height - 1;
@@ -351,18 +387,20 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
             {  80, 33 }, {  18, 11 }, {  15, 11 }, {  64, 33 },
             { 160, 99 }, {   4,  3 }, {   3,  2 }, {   2,  1 },
         };
-        int i;
+        int num, den, i;
+        av_reduce(&num, &den, avctx->sample_aspect_ratio.num,
+                  avctx->sample_aspect_ratio.den, 65535);
         for (i = 0; i < FF_ARRAY_ELEMS(sar_idc); i++) {
-            if (avctx->sample_aspect_ratio.num == sar_idc[i].num &&
-                avctx->sample_aspect_ratio.den == sar_idc[i].den) {
+            if (num == sar_idc[i].num &&
+                den == sar_idc[i].den) {
                 sps->vui.aspect_ratio_idc = i;
                 break;
             }
         }
         if (i >= FF_ARRAY_ELEMS(sar_idc)) {
             sps->vui.aspect_ratio_idc = 255;
-            sps->vui.sar_width  = avctx->sample_aspect_ratio.num;
-            sps->vui.sar_height = avctx->sample_aspect_ratio.den;
+            sps->vui.sar_width  = num;
+            sps->vui.sar_height = den;
         }
         sps->vui.aspect_ratio_info_present_flag = 1;
     }
@@ -410,8 +448,9 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
         sps->vui.fixed_frame_rate_flag = 0;
     }
 
-    if (opt->sei & SEI_TIMING) {
+    if (priv->sei & SEI_TIMING) {
         H264RawHRD *hrd = &sps->vui.nal_hrd_parameters;
+        H264RawSEIBufferingPeriod *bp = &priv->sei_buffering_period;
 
         sps->vui.nal_hrd_parameters_present_flag = 1;
 
@@ -420,9 +459,9 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
         // Try to scale these to a sensible range so that the
         // golomb encode of the value is not overlong.
         hrd->bit_rate_scale =
-            av_clip_uintp2(av_log2(avctx->bit_rate) - 15 - 6, 4);
+            av_clip_uintp2(av_log2(ctx->va_bit_rate) - 15 - 6, 4);
         hrd->bit_rate_value_minus1[0] =
-            (avctx->bit_rate >> hrd->bit_rate_scale + 6) - 1;
+            (ctx->va_bit_rate >> hrd->bit_rate_scale + 6) - 1;
 
         hrd->cpb_size_scale =
             av_clip_uintp2(av_log2(ctx->hrd_params.hrd.buffer_size) - 15 - 4, 4);
@@ -438,13 +477,13 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
         hrd->dpb_output_delay_length_minus1          = 7;
         hrd->time_offset_length                      = 0;
 
-        priv->buffering_period.seq_parameter_set_id = sps->seq_parameter_set_id;
+        bp->seq_parameter_set_id = sps->seq_parameter_set_id;
 
         // This calculation can easily overflow 32 bits.
-        priv->buffering_period.nal.initial_cpb_removal_delay[0] = 90000 *
+        bp->nal.initial_cpb_removal_delay[0] = 90000 *
             (uint64_t)ctx->hrd_params.hrd.initial_buffer_fullness /
             ctx->hrd_params.hrd.buffer_size;
-        priv->buffering_period.nal.initial_cpb_removal_delay_offset[0] = 0;
+        bp->nal.initial_cpb_removal_delay_offset[0] = 0;
     } else {
         sps->vui.nal_hrd_parameters_present_flag = 0;
         sps->vui.low_delay_hrd_flag = 1 - sps->vui.fixed_frame_rate_flag;
@@ -452,10 +491,10 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
 
     sps->vui.bitstream_restriction_flag    = 1;
     sps->vui.motion_vectors_over_pic_boundaries_flag = 1;
-    sps->vui.log2_max_mv_length_horizontal = 16;
-    sps->vui.log2_max_mv_length_vertical   = 16;
-    sps->vui.max_num_reorder_frames        = (ctx->b_per_p > 0);
-    sps->vui.max_dec_frame_buffering       = sps->max_num_ref_frames;
+    sps->vui.log2_max_mv_length_horizontal = 15;
+    sps->vui.log2_max_mv_length_vertical   = 15;
+    sps->vui.max_num_reorder_frames        = ctx->max_b_depth;
+    sps->vui.max_dec_frame_buffering       = ctx->max_b_depth + 1;
 
     pps->nal_unit_header.nal_ref_idc = 3;
     pps->nal_unit_header.nal_unit_type = H264_NAL_PPS;
@@ -467,6 +506,8 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
         !(sps->profile_idc == FF_PROFILE_H264_BASELINE ||
           sps->profile_idc == FF_PROFILE_H264_EXTENDED ||
           sps->profile_idc == FF_PROFILE_H264_CAVLC_444);
+    if (!priv->coder && pps->entropy_coding_mode_flag)
+        pps->entropy_coding_mode_flag = 0;
 
     pps->num_ref_idx_l0_default_active_minus1 = 0;
     pps->num_ref_idx_l1_default_active_minus1 = 0;
@@ -486,11 +527,11 @@ static int vaapi_encode_h264_init_sequence_params(AVCodecContext *avctx)
     *vseq = (VAEncSequenceParameterBufferH264) {
         .seq_parameter_set_id = sps->seq_parameter_set_id,
         .level_idc        = sps->level_idc,
-        .intra_period     = avctx->gop_size,
-        .intra_idr_period = avctx->gop_size,
+        .intra_period     = ctx->gop_size,
+        .intra_idr_period = ctx->gop_size,
         .ip_period        = ctx->b_per_p + 1,
 
-        .bits_per_second       = avctx->bit_rate,
+        .bits_per_second       = ctx->va_bit_rate,
         .max_num_ref_frames    = sps->max_num_ref_frames,
         .picture_width_in_mbs  = sps->pic_width_in_mbs_minus1 + 1,
         .picture_height_in_mbs = sps->pic_height_in_map_units_minus1 + 1,
@@ -573,102 +614,106 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
                                                  VAAPIEncodePicture *pic)
 {
     VAAPIEncodeContext               *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context          *priv = ctx->priv_data;
-    VAAPIEncodeH264Options           *opt = ctx->codec_options;
-    H264RawSPS                       *sps = &priv->sps;
+    VAAPIEncodeH264Context          *priv = avctx->priv_data;
+    VAAPIEncodeH264Picture          *hpic = pic->priv_data;
+    VAAPIEncodePicture              *prev = pic->prev;
+    VAAPIEncodeH264Picture         *hprev = prev ? prev->priv_data : NULL;
     VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
     int i;
 
-    memset(&priv->current_access_unit, 0,
-           sizeof(priv->current_access_unit));
-
     if (pic->type == PICTURE_TYPE_IDR) {
         av_assert0(pic->display_order == pic->encode_order);
-        priv->frame_num      = 0;
-        priv->next_frame_num = 1;
-        priv->cpb_delay      = 0;
-        priv->last_idr_frame = pic->display_order;
-        ++priv->idr_pic_count;
-
-        priv->slice_type       = 7;
-        priv->primary_pic_type = 0;
+
+        hpic->frame_num      = 0;
+        hpic->last_idr_frame = pic->display_order;
+        hpic->idr_pic_id     = hprev ? hprev->idr_pic_id + 1 : 0;
+
+        hpic->primary_pic_type = 0;
+        hpic->slice_type       = 7;
     } else {
-        priv->frame_num      = priv->next_frame_num;
+        av_assert0(prev);
 
-        if (pic->type != PICTURE_TYPE_B) {
-            // Reference picture, so frame_num advances.
-            priv->next_frame_num = (priv->frame_num + 1) &
-                ((1 << (4 + sps->log2_max_frame_num_minus4)) - 1);
-        }
-        ++priv->cpb_delay;
+        hpic->frame_num = hprev->frame_num + prev->is_reference;
+
+        hpic->last_idr_frame = hprev->last_idr_frame;
+        hpic->idr_pic_id     = hprev->idr_pic_id;
 
         if (pic->type == PICTURE_TYPE_I) {
-            priv->slice_type       = 7;
-            priv->primary_pic_type = 0;
+            hpic->slice_type       = 7;
+            hpic->primary_pic_type = 0;
         } else if (pic->type == PICTURE_TYPE_P) {
-            priv->slice_type       = 5;
-            priv->primary_pic_type = 1;
+            hpic->slice_type       = 5;
+            hpic->primary_pic_type = 1;
         } else {
-            priv->slice_type       = 6;
-            priv->primary_pic_type = 2;
+            hpic->slice_type       = 6;
+            hpic->primary_pic_type = 2;
         }
     }
-    priv->pic_order_cnt = pic->display_order - priv->last_idr_frame;
-    priv->dpb_delay     = pic->display_order - pic->encode_order + 1;
+    hpic->pic_order_cnt = pic->display_order - hpic->last_idr_frame;
+    hpic->dpb_delay     = pic->display_order - pic->encode_order + ctx->max_b_depth;
+    hpic->cpb_delay     = pic->encode_order - hpic->last_idr_frame;
 
-    if (opt->aud) {
+    if (priv->aud) {
         priv->aud_needed = 1;
-        priv->aud.nal_unit_header.nal_unit_type = H264_NAL_AUD;
-        priv->aud.primary_pic_type = priv->primary_pic_type;
+        priv->raw_aud = (H264RawAUD) {
+            .nal_unit_header = {
+                .nal_unit_type = H264_NAL_AUD,
+            },
+            .primary_pic_type  = hpic->primary_pic_type,
+        };
     } else {
         priv->aud_needed = 0;
     }
 
-    if (opt->sei & SEI_IDENTIFIER && pic->encode_order == 0)
-        priv->sei_needed = 1;
-#if !HAVE_VAAPI_1
+    priv->sei_needed = 0;
+
+    if (priv->sei & SEI_IDENTIFIER && pic->encode_order == 0)
+        priv->sei_needed |= SEI_IDENTIFIER;
+#if !CONFIG_VAAPI_1
     if (ctx->va_rc_mode == VA_RC_CBR)
         priv->sei_cbr_workaround_needed = 1;
 #endif
 
-    if (opt->sei & SEI_TIMING) {
-        memset(&priv->pic_timing, 0, sizeof(priv->pic_timing));
-
-        priv->pic_timing.cpb_removal_delay = 2 * priv->cpb_delay;
-        priv->pic_timing.dpb_output_delay  = 2 * priv->dpb_delay;
+    if (priv->sei & SEI_TIMING) {
+        priv->sei_pic_timing = (H264RawSEIPicTiming) {
+            .cpb_removal_delay = 2 * hpic->cpb_delay,
+            .dpb_output_delay  = 2 * hpic->dpb_delay,
+        };
 
-        priv->sei_needed = 1;
+        priv->sei_needed |= SEI_TIMING;
     }
 
-    if (opt->sei & SEI_RECOVERY_POINT && pic->type == PICTURE_TYPE_I) {
-        priv->recovery_point.recovery_frame_cnt = 0;
-        priv->recovery_point.exact_match_flag   = 1;
-        priv->recovery_point.broken_link_flag   = ctx->b_per_p > 0;
+    if (priv->sei & SEI_RECOVERY_POINT && pic->type == PICTURE_TYPE_I) {
+        priv->sei_recovery_point = (H264RawSEIRecoveryPoint) {
+            .recovery_frame_cnt = 0,
+            .exact_match_flag   = 1,
+            .broken_link_flag   = ctx->b_per_p > 0,
+        };
 
-        priv->sei_needed = 1;
+        priv->sei_needed |= SEI_RECOVERY_POINT;
     }
 
     vpic->CurrPic = (VAPictureH264) {
         .picture_id          = pic->recon_surface,
-        .frame_idx           = priv->frame_num,
+        .frame_idx           = hpic->frame_num,
         .flags               = 0,
-        .TopFieldOrderCnt    = priv->pic_order_cnt,
-        .BottomFieldOrderCnt = priv->pic_order_cnt,
+        .TopFieldOrderCnt    = hpic->pic_order_cnt,
+        .BottomFieldOrderCnt = hpic->pic_order_cnt,
     };
 
     for (i = 0; i < pic->nb_refs; i++) {
-        VAAPIEncodePicture *ref = pic->refs[i];
-        unsigned int frame_num = (ref->encode_order - priv->last_idr_frame) &
-            ((1 << (4 + sps->log2_max_frame_num_minus4)) - 1);
-        unsigned int pic_order_cnt = ref->display_order - priv->last_idr_frame;
+        VAAPIEncodePicture      *ref = pic->refs[i];
+        VAAPIEncodeH264Picture *href;
 
         av_assert0(ref && ref->encode_order < pic->encode_order);
+        href = ref->priv_data;
+
         vpic->ReferenceFrames[i] = (VAPictureH264) {
             .picture_id          = ref->recon_surface,
-            .frame_idx           = frame_num,
+            .frame_idx           = href->frame_num,
             .flags               = VA_PICTURE_H264_SHORT_TERM_REFERENCE,
-            .TopFieldOrderCnt    = pic_order_cnt,
-            .BottomFieldOrderCnt = pic_order_cnt,
+            .TopFieldOrderCnt    = href->pic_order_cnt,
+            .BottomFieldOrderCnt = href->pic_order_cnt,
         };
     }
     for (; i < FF_ARRAY_ELEMS(vpic->ReferenceFrames); i++) {
@@ -680,47 +725,142 @@ static int vaapi_encode_h264_init_picture_params(AVCodecContext *avctx,
 
     vpic->coded_buf = pic->output_buffer;
 
-    vpic->frame_num = priv->frame_num;
+    vpic->frame_num = hpic->frame_num;
 
     vpic->pic_fields.bits.idr_pic_flag       = (pic->type == PICTURE_TYPE_IDR);
     vpic->pic_fields.bits.reference_pic_flag = (pic->type != PICTURE_TYPE_B);
 
-    pic->nb_slices = 1;
-
     return 0;
 }
 
+static void vaapi_encode_h264_default_ref_pic_list(AVCodecContext *avctx,
+                                                   VAAPIEncodePicture *pic,
+                                                   VAAPIEncodePicture **rpl0,
+                                                   VAAPIEncodePicture **rpl1,
+                                                   int *rpl_size)
+{
+    VAAPIEncodePicture *prev;
+    VAAPIEncodeH264Picture *hp, *hn, *hc;
+    int i, j, n = 0;
+
+    prev = pic->prev;
+    av_assert0(prev);
+    hp = pic->priv_data;
+
+    for (i = 0; i < pic->prev->nb_dpb_pics; i++) {
+        hn = prev->dpb[i]->priv_data;
+        av_assert0(hn->frame_num < hp->frame_num);
+
+        if (pic->type == PICTURE_TYPE_P) {
+            for (j = n; j > 0; j--) {
+                hc = rpl0[j - 1]->priv_data;
+                av_assert0(hc->frame_num != hn->frame_num);
+                if (hc->frame_num > hn->frame_num)
+                    break;
+                rpl0[j] = rpl0[j - 1];
+            }
+            rpl0[j] = prev->dpb[i];
+
+        } else if (pic->type == PICTURE_TYPE_B) {
+            for (j = n; j > 0; j--) {
+                hc = rpl0[j - 1]->priv_data;
+                av_assert0(hc->pic_order_cnt != hp->pic_order_cnt);
+                if (hc->pic_order_cnt < hp->pic_order_cnt) {
+                    if (hn->pic_order_cnt > hp->pic_order_cnt ||
+                        hn->pic_order_cnt < hc->pic_order_cnt)
+                        break;
+                } else {
+                    if (hn->pic_order_cnt > hc->pic_order_cnt)
+                        break;
+                }
+                rpl0[j] = rpl0[j - 1];
+            }
+            rpl0[j] = prev->dpb[i];
+
+            for (j = n; j > 0; j--) {
+                hc = rpl1[j - 1]->priv_data;
+                av_assert0(hc->pic_order_cnt != hp->pic_order_cnt);
+                if (hc->pic_order_cnt > hp->pic_order_cnt) {
+                    if (hn->pic_order_cnt < hp->pic_order_cnt ||
+                        hn->pic_order_cnt > hc->pic_order_cnt)
+                        break;
+                } else {
+                    if (hn->pic_order_cnt < hc->pic_order_cnt)
+                        break;
+                }
+                rpl1[j] = rpl1[j - 1];
+            }
+            rpl1[j] = prev->dpb[i];
+        }
+
+        ++n;
+    }
+
+    if (pic->type == PICTURE_TYPE_B) {
+        for (i = 0; i < n; i++) {
+            if (rpl0[i] != rpl1[i])
+                break;
+        }
+        if (i == n)
+            FFSWAP(VAAPIEncodePicture*, rpl1[0], rpl1[1]);
+    }
+
+    if (pic->type == PICTURE_TYPE_P ||
+        pic->type == PICTURE_TYPE_B) {
+        av_log(avctx, AV_LOG_DEBUG, "Default RefPicList0 for fn=%d/poc=%d:",
+               hp->frame_num, hp->pic_order_cnt);
+        for (i = 0; i < n; i++) {
+            hn = rpl0[i]->priv_data;
+            av_log(avctx, AV_LOG_DEBUG, "  fn=%d/poc=%d",
+                   hn->frame_num, hn->pic_order_cnt);
+        }
+        av_log(avctx, AV_LOG_DEBUG, "\n");
+    }
+    if (pic->type == PICTURE_TYPE_B) {
+        av_log(avctx, AV_LOG_DEBUG, "Default RefPicList1 for fn=%d/poc=%d:",
+               hp->frame_num, hp->pic_order_cnt);
+        for (i = 0; i < n; i++) {
+            hn = rpl1[i]->priv_data;
+            av_log(avctx, AV_LOG_DEBUG, "  fn=%d/poc=%d",
+                   hn->frame_num, hn->pic_order_cnt);
+        }
+        av_log(avctx, AV_LOG_DEBUG, "\n");
+    }
+
+    *rpl_size = n;
+}
+
 static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
                                                VAAPIEncodePicture *pic,
                                                VAAPIEncodeSlice *slice)
 {
-    VAAPIEncodeContext               *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context          *priv = ctx->priv_data;
-    H264RawSPS                       *sps = &priv->sps;
-    H264RawPPS                       *pps = &priv->pps;
-    H264RawSliceHeader                *sh = &priv->slice.header;
+    VAAPIEncodeH264Context          *priv = avctx->priv_data;
+    VAAPIEncodeH264Picture          *hpic = pic->priv_data;
+    VAAPIEncodePicture              *prev = pic->prev;
+    H264RawSPS                       *sps = &priv->raw_sps;
+    H264RawPPS                       *pps = &priv->raw_pps;
+    H264RawSliceHeader                *sh = &priv->raw_slice.header;
     VAEncPictureParameterBufferH264 *vpic = pic->codec_picture_params;
     VAEncSliceParameterBufferH264 *vslice = slice->codec_slice_params;
-    int i;
+    int i, j;
 
     if (pic->type == PICTURE_TYPE_IDR) {
         sh->nal_unit_header.nal_unit_type = H264_NAL_IDR_SLICE;
         sh->nal_unit_header.nal_ref_idc   = 3;
     } else {
         sh->nal_unit_header.nal_unit_type = H264_NAL_SLICE;
-        sh->nal_unit_header.nal_ref_idc   = pic->type != PICTURE_TYPE_B;
+        sh->nal_unit_header.nal_ref_idc   = pic->is_reference;
     }
 
-    // Only one slice per frame.
-    sh->first_mb_in_slice = 0;
-    sh->slice_type        = priv->slice_type;
+    sh->first_mb_in_slice = slice->block_start;
+    sh->slice_type        = hpic->slice_type;
 
     sh->pic_parameter_set_id = pps->pic_parameter_set_id;
 
-    sh->frame_num  = priv->frame_num;
-    sh->idr_pic_id = priv->idr_pic_count;
-
-    sh->pic_order_cnt_lsb = priv->pic_order_cnt &
+    sh->frame_num = hpic->frame_num &
+        ((1 << (4 + sps->log2_max_frame_num_minus4)) - 1);
+    sh->idr_pic_id = hpic->idr_pic_id;
+    sh->pic_order_cnt_lsb = hpic->pic_order_cnt &
         ((1 << (4 + sps->log2_max_pic_order_cnt_lsb_minus4)) - 1);
 
     sh->direct_spatial_mv_pred_flag = 1;
@@ -732,9 +872,149 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
     else
         sh->slice_qp_delta = priv->fixed_qp_idr - (pps->pic_init_qp_minus26 + 26);
 
+    if (pic->is_reference && pic->type != PICTURE_TYPE_IDR) {
+        VAAPIEncodePicture *discard_list[MAX_DPB_SIZE];
+        int discard = 0, keep = 0;
 
-    vslice->macroblock_address = sh->first_mb_in_slice;
-    vslice->num_macroblocks    = priv->mb_width * priv->mb_height;
+        // Discard everything which is in the DPB of the previous frame but
+        // not in the DPB of this one.
+        for (i = 0; i < prev->nb_dpb_pics; i++) {
+            for (j = 0; j < pic->nb_dpb_pics; j++) {
+                if (prev->dpb[i] == pic->dpb[j])
+                    break;
+            }
+            if (j == pic->nb_dpb_pics) {
+                discard_list[discard] = prev->dpb[i];
+                ++discard;
+            } else {
+                ++keep;
+            }
+        }
+        av_assert0(keep <= priv->dpb_frames);
+
+        if (discard == 0) {
+            sh->adaptive_ref_pic_marking_mode_flag = 0;
+        } else {
+            sh->adaptive_ref_pic_marking_mode_flag = 1;
+            for (i = 0; i < discard; i++) {
+                VAAPIEncodeH264Picture *old = discard_list[i]->priv_data;
+                av_assert0(old->frame_num < hpic->frame_num);
+                sh->mmco[i].memory_management_control_operation = 1;
+                sh->mmco[i].difference_of_pic_nums_minus1 =
+                    hpic->frame_num - old->frame_num - 1;
+            }
+            sh->mmco[i].memory_management_control_operation = 0;
+        }
+    }
+
+    // If the intended references are not the first entries of RefPicListN
+    // by default, use ref-pic-list-modification to move them there.
+    if (pic->type == PICTURE_TYPE_P || pic->type == PICTURE_TYPE_B) {
+        VAAPIEncodePicture *def_l0[MAX_DPB_SIZE], *def_l1[MAX_DPB_SIZE];
+        VAAPIEncodeH264Picture *href;
+        int n;
+
+        vaapi_encode_h264_default_ref_pic_list(avctx, pic,
+                                               def_l0, def_l1, &n);
+
+        if (pic->type == PICTURE_TYPE_P) {
+            int need_rplm = 0;
+            for (i = 0; i < pic->nb_refs; i++) {
+                av_assert0(pic->refs[i]);
+                if (pic->refs[i] != def_l0[i])
+                    need_rplm = 1;
+            }
+
+            sh->ref_pic_list_modification_flag_l0 = need_rplm;
+            if (need_rplm) {
+                int pic_num = hpic->frame_num;
+                for (i = 0; i < pic->nb_refs; i++) {
+                    href = pic->refs[i]->priv_data;
+                    av_assert0(href->frame_num != pic_num);
+                    if (href->frame_num < pic_num) {
+                        sh->rplm_l0[i].modification_of_pic_nums_idc = 0;
+                        sh->rplm_l0[i].abs_diff_pic_num_minus1 =
+                            pic_num - href->frame_num - 1;
+                    } else {
+                        sh->rplm_l0[i].modification_of_pic_nums_idc = 1;
+                        sh->rplm_l0[i].abs_diff_pic_num_minus1 =
+                            href->frame_num - pic_num - 1;
+                    }
+                    pic_num = href->frame_num;
+                }
+                sh->rplm_l0[i].modification_of_pic_nums_idc = 3;
+            }
+
+        } else {
+            int need_rplm_l0 = 0, need_rplm_l1 = 0;
+            int n0 = 0, n1 = 0;
+            for (i = 0; i < pic->nb_refs; i++) {
+                av_assert0(pic->refs[i]);
+                href = pic->refs[i]->priv_data;
+                av_assert0(href->pic_order_cnt != hpic->pic_order_cnt);
+                if (href->pic_order_cnt < hpic->pic_order_cnt) {
+                    if (pic->refs[i] != def_l0[n0])
+                        need_rplm_l0 = 1;
+                    ++n0;
+                } else {
+                    if (pic->refs[i] != def_l1[n1])
+                        need_rplm_l1 = 1;
+                    ++n1;
+                }
+            }
+
+            sh->ref_pic_list_modification_flag_l0 = need_rplm_l0;
+            if (need_rplm_l0) {
+                int pic_num = hpic->frame_num;
+                for (i = j = 0; i < pic->nb_refs; i++) {
+                    href = pic->refs[i]->priv_data;
+                    if (href->pic_order_cnt > hpic->pic_order_cnt)
+                        continue;
+                    av_assert0(href->frame_num != pic_num);
+                    if (href->frame_num < pic_num) {
+                        sh->rplm_l0[j].modification_of_pic_nums_idc = 0;
+                        sh->rplm_l0[j].abs_diff_pic_num_minus1 =
+                            pic_num - href->frame_num - 1;
+                    } else {
+                        sh->rplm_l0[j].modification_of_pic_nums_idc = 1;
+                        sh->rplm_l0[j].abs_diff_pic_num_minus1 =
+                            href->frame_num - pic_num - 1;
+                    }
+                    pic_num = href->frame_num;
+                    ++j;
+                }
+                av_assert0(j == n0);
+                sh->rplm_l0[j].modification_of_pic_nums_idc = 3;
+            }
+
+            sh->ref_pic_list_modification_flag_l1 = need_rplm_l1;
+            if (need_rplm_l1) {
+                int pic_num = hpic->frame_num;
+                for (i = j = 0; i < pic->nb_refs; i++) {
+                    href = pic->refs[i]->priv_data;
+                    if (href->pic_order_cnt < hpic->pic_order_cnt)
+                        continue;
+                    av_assert0(href->frame_num != pic_num);
+                    if (href->frame_num < pic_num) {
+                        sh->rplm_l1[j].modification_of_pic_nums_idc = 0;
+                        sh->rplm_l1[j].abs_diff_pic_num_minus1 =
+                            pic_num - href->frame_num - 1;
+                    } else {
+                        sh->rplm_l1[j].modification_of_pic_nums_idc = 1;
+                        sh->rplm_l1[j].abs_diff_pic_num_minus1 =
+                            href->frame_num - pic_num - 1;
+                    }
+                    pic_num = href->frame_num;
+                    ++j;
+                }
+                av_assert0(j == n1);
+                sh->rplm_l1[j].modification_of_pic_nums_idc = 3;
+            }
+        }
+    }
+
+    vslice->macroblock_address = slice->block_start;
+    vslice->num_macroblocks    = slice->block_size;
 
     vslice->macroblock_info = VA_INVALID_ID;
 
@@ -774,8 +1054,7 @@ static int vaapi_encode_h264_init_slice_params(AVCodecContext *avctx,
 static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
 {
     VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
-    VAAPIEncodeH264Options  *opt = ctx->codec_options;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_init(&priv->cbc, AV_CODEC_ID_H264, avctx);
@@ -786,51 +1065,45 @@ static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
     priv->mb_height = FFALIGN(avctx->height, 16) / 16;
 
     if (ctx->va_rc_mode == VA_RC_CQP) {
-        priv->fixed_qp_p = opt->qp;
+        priv->fixed_qp_p = av_clip(ctx->rc_quality, 1, 51);
         if (avctx->i_quant_factor > 0.0)
-            priv->fixed_qp_idr = (int)((priv->fixed_qp_p * avctx->i_quant_factor +
-                                        avctx->i_quant_offset) + 0.5);
+            priv->fixed_qp_idr =
+                av_clip((avctx->i_quant_factor * priv->fixed_qp_p +
+                         avctx->i_quant_offset) + 0.5, 1, 51);
         else
             priv->fixed_qp_idr = priv->fixed_qp_p;
         if (avctx->b_quant_factor > 0.0)
-            priv->fixed_qp_b = (int)((priv->fixed_qp_p * avctx->b_quant_factor +
-                                      avctx->b_quant_offset) + 0.5);
+            priv->fixed_qp_b =
+                av_clip((avctx->b_quant_factor * priv->fixed_qp_p +
+                         avctx->b_quant_offset) + 0.5, 1, 51);
         else
             priv->fixed_qp_b = priv->fixed_qp_p;
 
-        opt->sei &= ~SEI_TIMING;
-
         av_log(avctx, AV_LOG_DEBUG, "Using fixed QP = "
                "%d / %d / %d for IDR- / P- / B-frames.\n",
                priv->fixed_qp_idr, priv->fixed_qp_p, priv->fixed_qp_b);
 
-    } else if (ctx->va_rc_mode == VA_RC_CBR ||
-               ctx->va_rc_mode == VA_RC_VBR) {
+    } else {
         // These still need to be  set for pic_init_qp/slice_qp_delta.
         priv->fixed_qp_idr = 26;
         priv->fixed_qp_p   = 26;
         priv->fixed_qp_b   = 26;
-
-        av_log(avctx, AV_LOG_DEBUG, "Using %s-bitrate = %d bps.\n",
-               ctx->va_rc_mode == VA_RC_CBR ? "constant" : "variable",
-               avctx->bit_rate);
-
-    } else {
-        av_assert0(0 && "Invalid RC mode.");
     }
 
-    if (avctx->compression_level == FF_COMPRESSION_DEFAULT)
-        avctx->compression_level = opt->quality;
+    if (!ctx->rc_mode->hrd) {
+        // Timing SEI requires a mode respecting HRD parameters.
+        priv->sei &= ~SEI_TIMING;
+    }
 
-    if (opt->sei & SEI_IDENTIFIER) {
+    if (priv->sei & SEI_IDENTIFIER) {
         const char *lavc  = LIBAVCODEC_IDENT;
         const char *vaapi = VA_VERSION_S;
         const char *driver;
         int len;
 
-        memcpy(priv->identifier.uuid_iso_iec_11578,
+        memcpy(priv->sei_identifier.uuid_iso_iec_11578,
                vaapi_encode_h264_sei_identifier_uuid,
-               sizeof(priv->identifier.uuid_iso_iec_11578));
+               sizeof(priv->sei_identifier.uuid_iso_iec_11578));
 
         driver = vaQueryVendorString(ctx->hwctx->display);
         if (!driver)
@@ -838,26 +1111,43 @@ static av_cold int vaapi_encode_h264_configure(AVCodecContext *avctx)
 
         len = snprintf(NULL, 0, "%s / VAAPI %s / %s", lavc, vaapi, driver);
         if (len >= 0) {
-            priv->identifier_string = av_malloc(len + 1);
-            if (!priv->identifier_string)
+            priv->sei_identifier_string = av_malloc(len + 1);
+            if (!priv->sei_identifier_string)
                 return AVERROR(ENOMEM);
 
-            snprintf(priv->identifier_string, len + 1,
+            snprintf(priv->sei_identifier_string, len + 1,
                      "%s / VAAPI %s / %s", lavc, vaapi, driver);
 
-            priv->identifier.data = priv->identifier_string;
-            priv->identifier.data_length = len + 1;
+            priv->sei_identifier.data        = priv->sei_identifier_string;
+            priv->sei_identifier.data_length = len + 1;
         }
     }
 
     return 0;
 }
 
+static const VAAPIEncodeProfile vaapi_encode_h264_profiles[] = {
+    { FF_PROFILE_H264_HIGH, 8, 3, 1, 1, VAProfileH264High },
+    { FF_PROFILE_H264_MAIN, 8, 3, 1, 1, VAProfileH264Main },
+    { FF_PROFILE_H264_CONSTRAINED_BASELINE,
+                            8, 3, 1, 1, VAProfileH264ConstrainedBaseline },
+    { FF_PROFILE_UNKNOWN }
+};
+
 static const VAAPIEncodeType vaapi_encode_type_h264 = {
-    .priv_data_size        = sizeof(VAAPIEncodeH264Context),
+    .profiles              = vaapi_encode_h264_profiles,
+
+    .flags                 = FLAG_SLICE_CONTROL |
+                             FLAG_B_PICTURES |
+                             FLAG_B_PICTURE_REFERENCES |
+                             FLAG_NON_IDR_KEY_PICTURES,
+
+    .default_quality       = 20,
 
     .configure             = &vaapi_encode_h264_configure,
 
+    .picture_priv_data_size = sizeof(VAAPIEncodeH264Picture),
+
     .sequence_params_size  = sizeof(VAEncSequenceParameterBufferH264),
     .init_sequence_params  = &vaapi_encode_h264_init_sequence_params,
 
@@ -878,36 +1168,29 @@ static const VAAPIEncodeType vaapi_encode_type_h264 = {
 
 static av_cold int vaapi_encode_h264_init(AVCodecContext *avctx)
 {
-    VAAPIEncodeContext     *ctx = avctx->priv_data;
-    VAAPIEncodeH264Options *opt =
-        (VAAPIEncodeH264Options*)ctx->codec_options_data;
+    VAAPIEncodeContext      *ctx = avctx->priv_data;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
 
     ctx->codec = &vaapi_encode_type_h264;
 
     if (avctx->profile == FF_PROFILE_UNKNOWN)
-        avctx->profile = opt->profile;
+        avctx->profile = priv->profile;
     if (avctx->level == FF_LEVEL_UNKNOWN)
-        avctx->level = opt->level;
+        avctx->level = priv->level;
+    if (avctx->compression_level == FF_COMPRESSION_DEFAULT)
+        avctx->compression_level = priv->quality;
 
+    // Reject unsupported profiles.
     switch (avctx->profile) {
     case FF_PROFILE_H264_BASELINE:
         av_log(avctx, AV_LOG_WARNING, "H.264 baseline profile is not "
                "supported, using constrained baseline profile instead.\n");
         avctx->profile = FF_PROFILE_H264_CONSTRAINED_BASELINE;
-    case FF_PROFILE_H264_CONSTRAINED_BASELINE:
-        ctx->va_profile = VAProfileH264ConstrainedBaseline;
-        break;
-    case FF_PROFILE_H264_MAIN:
-        ctx->va_profile = VAProfileH264Main;
         break;
     case FF_PROFILE_H264_EXTENDED:
         av_log(avctx, AV_LOG_ERROR, "H.264 extended profile "
                "is not supported.\n");
         return AVERROR_PATCHWELCOME;
-    case FF_PROFILE_UNKNOWN:
-    case FF_PROFILE_H264_HIGH:
-        ctx->va_profile = VAProfileH264High;
-        break;
     case FF_PROFILE_H264_HIGH_10:
     case FF_PROFILE_H264_HIGH_10_INTRA:
         av_log(avctx, AV_LOG_ERROR, "H.264 10-bit profiles "
@@ -922,35 +1205,15 @@ static av_cold int vaapi_encode_h264_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "H.264 non-4:2:0 profiles "
                "are not supported.\n");
         return AVERROR_PATCHWELCOME;
-    default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown H.264 profile %d.\n",
-               avctx->profile);
-        return AVERROR(EINVAL);
     }
-    if (opt->low_power) {
-#if VA_CHECK_VERSION(0, 39, 2)
-        ctx->va_entrypoint = VAEntrypointEncSliceLP;
-#else
-        av_log(avctx, AV_LOG_ERROR, "Low-power encoding is not "
-               "supported with this VAAPI version.\n");
+
+    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0xff) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid level %d: must fit "
+               "in 8-bit unsigned integer.\n", avctx->level);
         return AVERROR(EINVAL);
-#endif
-    } else {
-        ctx->va_entrypoint = VAEntrypointEncSlice;
     }
 
-    // Only 8-bit encode is supported.
-    ctx->va_rt_format = VA_RT_FORMAT_YUV420;
-
-    if (avctx->bit_rate > 0) {
-        if (avctx->rc_max_rate == avctx->bit_rate)
-            ctx->va_rc_mode = VA_RC_CBR;
-        else
-            ctx->va_rc_mode = VA_RC_VBR;
-    } else
-        ctx->va_rc_mode = VA_RC_CQP;
-
-    ctx->va_packed_headers =
+    ctx->desired_packed_headers =
         VA_ENC_PACKED_HEADER_SEQUENCE | // SPS and PPS.
         VA_ENC_PACKED_HEADER_SLICE    | // Slice headers.
         VA_ENC_PACKED_HEADER_MISC;      // SEI.
@@ -958,36 +1221,44 @@ static av_cold int vaapi_encode_h264_init(AVCodecContext *avctx)
     ctx->surface_width  = FFALIGN(avctx->width,  16);
     ctx->surface_height = FFALIGN(avctx->height, 16);
 
+    ctx->slice_block_height = ctx->slice_block_width = 16;
+
+    if (priv->qp > 0)
+        ctx->explicit_qp = priv->qp;
+
     return ff_vaapi_encode_init(avctx);
 }
 
 static av_cold int vaapi_encode_h264_close(AVCodecContext *avctx)
 {
-    VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodeH264Context *priv = ctx->priv_data;
+    VAAPIEncodeH264Context *priv = avctx->priv_data;
 
-    if (priv) {
-        ff_cbs_close(&priv->cbc);
-        av_freep(&priv->identifier_string);
-    }
+    ff_cbs_fragment_free(priv->cbc, &priv->current_access_unit);
+    ff_cbs_close(&priv->cbc);
+    av_freep(&priv->sei_identifier_string);
 
     return ff_vaapi_encode_close(avctx);
 }
 
-#define OFFSET(x) (offsetof(VAAPIEncodeContext, codec_options_data) + \
-                   offsetof(VAAPIEncodeH264Options, x))
+#define OFFSET(x) offsetof(VAAPIEncodeH264Context, x)
 #define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 static const AVOption vaapi_encode_h264_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+    VAAPI_ENCODE_RC_OPTIONS,
+
     { "qp", "Constant QP (for P-frames; scaled by qfactor/qoffset for I/B)",
-      OFFSET(qp), AV_OPT_TYPE_INT, { .i64 = 20 }, 0, 52, FLAGS },
+      OFFSET(qp), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 52, FLAGS },
     { "quality", "Set encode quality (trades off against speed, higher is faster)",
-      OFFSET(quality), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 8, FLAGS },
-    { "low_power", "Use low-power encoding mode (experimental: only supported "
-      "on some platforms, does not support all features)",
-      OFFSET(low_power), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+      OFFSET(quality), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, FLAGS },
+    { "coder", "Entropy coder type",
+      OFFSET(coder), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, FLAGS, "coder" },
+        { "cavlc", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, INT_MIN, INT_MAX, FLAGS, "coder" },
+        { "cabac", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, FLAGS, "coder" },
+        { "vlc",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, INT_MIN, INT_MAX, FLAGS, "coder" },
+        { "ac",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, FLAGS, "coder" },
 
     { "aud", "Include AUD",
-      OFFSET(aud), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+      OFFSET(aud), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
 
     { "sei", "Set SEI to include",
       OFFSET(sei), AV_OPT_TYPE_FLAGS,
@@ -1005,7 +1276,7 @@ static const AVOption vaapi_encode_h264_options[] = {
 
     { "profile", "Set profile (profile_idc and constraint_set*_flag)",
       OFFSET(profile), AV_OPT_TYPE_INT,
-      { .i64 = FF_PROFILE_H264_HIGH }, 0x0000, 0xffff, FLAGS, "profile" },
+      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xffff, FLAGS, "profile" },
 
 #define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
       { .i64 = value }, 0, 0, FLAGS, "profile"
@@ -1016,7 +1287,7 @@ static const AVOption vaapi_encode_h264_options[] = {
 
     { "level", "Set level (level_idc)",
       OFFSET(level), AV_OPT_TYPE_INT,
-      { .i64 = 51 }, 0x00, 0xff, FLAGS, "level" },
+      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0xff, FLAGS, "level" },
 
 #define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
       { .i64 = value }, 0, 0, FLAGS, "level"
@@ -1048,11 +1319,12 @@ static const AVCodecDefault vaapi_encode_h264_defaults[] = {
     { "b",              "0"   },
     { "bf",             "2"   },
     { "g",              "120" },
-    { "i_qfactor",      "1.0" },
-    { "i_qoffset",      "0.0" },
-    { "b_qfactor",      "1.2" },
-    { "b_qoffset",      "0.0" },
-    { "qmin",           "0"   },
+    { "i_qfactor",      "1"   },
+    { "i_qoffset",      "0"   },
+    { "b_qfactor",      "6/5" },
+    { "b_qoffset",      "0"   },
+    { "qmin",           "-1"  },
+    { "qmax",           "-1"  },
     { NULL },
 };
 
@@ -1068,10 +1340,10 @@ AVCodec ff_h264_vaapi_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("H.264/AVC (VAAPI)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
-    .priv_data_size = (sizeof(VAAPIEncodeContext) +
-                       sizeof(VAAPIEncodeH264Options)),
+    .priv_data_size = sizeof(VAAPIEncodeH264Context),
     .init           = &vaapi_encode_h264_init,
-    .encode2        = &ff_vaapi_encode2,
+    .send_frame     = &ff_vaapi_encode_send_frame,
+    .receive_packet = &ff_vaapi_encode_receive_packet,
     .close          = &vaapi_encode_h264_close,
     .priv_class     = &vaapi_encode_h264_class,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
diff --git a/libavcodec/vaapi_encode_h265.c b/libavcodec/vaapi_encode_h265.c
index 9080aa8..758bd40 100644
--- a/libavcodec/vaapi_encode_h265.c
+++ b/libavcodec/vaapi_encode_h265.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,57 +23,74 @@
 
 #include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/pixdesc.h"
 #include "libavutil/opt.h"
+#include "libavutil/mastering_display_metadata.h"
 
 #include "avcodec.h"
 #include "cbs.h"
 #include "cbs_h265.h"
+#include "h265_profile_level.h"
 #include "hevc.h"
+#include "hevc_sei.h"
 #include "internal.h"
 #include "put_bits.h"
 #include "vaapi_encode.h"
 
+enum {
+    SEI_MASTERING_DISPLAY       = 0x08,
+    SEI_CONTENT_LIGHT_LEVEL     = 0x10,
+};
 
-typedef struct VAAPIEncodeH265Context {
-    unsigned int ctu_width;
-    unsigned int ctu_height;
-
-    int fixed_qp_idr;
-    int fixed_qp_p;
-    int fixed_qp_b;
-
-    H265RawAUD aud;
-    H265RawVPS vps;
-    H265RawSPS sps;
-    H265RawPPS pps;
-    H265RawSlice slice;
+typedef struct VAAPIEncodeH265Picture {
+    int pic_order_cnt;
 
     int64_t last_idr_frame;
-    int pic_order_cnt;
 
     int slice_nal_unit;
     int slice_type;
     int pic_type;
+} VAAPIEncodeH265Picture;
 
-    CodedBitstreamContext *cbc;
-    CodedBitstreamFragment current_access_unit;
-    int aud_needed;
-} VAAPIEncodeH265Context;
+typedef struct VAAPIEncodeH265Context {
+    VAAPIEncodeContext common;
 
-typedef struct VAAPIEncodeH265Options {
+    // User options.
     int qp;
     int aud;
     int profile;
+    int tier;
     int level;
-} VAAPIEncodeH265Options;
+    int sei;
+
+    // Derived settings.
+    int fixed_qp_idr;
+    int fixed_qp_p;
+    int fixed_qp_b;
+
+    // Writer structures.
+    H265RawAUD   raw_aud;
+    H265RawVPS   raw_vps;
+    H265RawSPS   raw_sps;
+    H265RawPPS   raw_pps;
+    H265RawSEI   raw_sei;
+    H265RawSlice raw_slice;
+
+    H265RawSEIMasteringDisplayColourVolume sei_mastering_display;
+    H265RawSEIContentLightLevelInfo        sei_content_light_level;
+
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment current_access_unit;
+    int aud_needed;
+    int sei_needed;
+} VAAPIEncodeH265Context;
 
 
 static int vaapi_encode_h265_write_access_unit(AVCodecContext *avctx,
                                                char *data, size_t *data_len,
                                                CodedBitstreamFragment *au)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context *priv = ctx->priv_data;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_write_fragment_data(priv->cbc, au);
@@ -99,8 +116,7 @@ static int vaapi_encode_h265_add_nal(AVCodecContext *avctx,
                                      CodedBitstreamFragment *au,
                                      void *nal_unit)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context *priv = ctx->priv_data;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
     H265RawNALUnitHeader *header = nal_unit;
     int err;
 
@@ -118,33 +134,32 @@ static int vaapi_encode_h265_add_nal(AVCodecContext *avctx,
 static int vaapi_encode_h265_write_sequence_header(AVCodecContext *avctx,
                                                    char *data, size_t *data_len)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context *priv = ctx->priv_data;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
     CodedBitstreamFragment   *au = &priv->current_access_unit;
     int err;
 
     if (priv->aud_needed) {
-        err = vaapi_encode_h265_add_nal(avctx, au, &priv->aud);
+        err = vaapi_encode_h265_add_nal(avctx, au, &priv->raw_aud);
         if (err < 0)
             goto fail;
         priv->aud_needed = 0;
     }
 
-    err = vaapi_encode_h265_add_nal(avctx, au, &priv->vps);
+    err = vaapi_encode_h265_add_nal(avctx, au, &priv->raw_vps);
     if (err < 0)
         goto fail;
 
-    err = vaapi_encode_h265_add_nal(avctx, au, &priv->sps);
+    err = vaapi_encode_h265_add_nal(avctx, au, &priv->raw_sps);
     if (err < 0)
         goto fail;
 
-    err = vaapi_encode_h265_add_nal(avctx, au, &priv->pps);
+    err = vaapi_encode_h265_add_nal(avctx, au, &priv->raw_pps);
     if (err < 0)
         goto fail;
 
     err = vaapi_encode_h265_write_access_unit(avctx, data, data_len, au);
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, au);
+    ff_cbs_fragment_reset(priv->cbc, au);
     return err;
 }
 
@@ -153,48 +168,133 @@ static int vaapi_encode_h265_write_slice_header(AVCodecContext *avctx,
                                                 VAAPIEncodeSlice *slice,
                                                 char *data, size_t *data_len)
 {
-    VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context *priv = ctx->priv_data;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
     CodedBitstreamFragment   *au = &priv->current_access_unit;
     int err;
 
     if (priv->aud_needed) {
-        err = vaapi_encode_h265_add_nal(avctx, au, &priv->aud);
+        err = vaapi_encode_h265_add_nal(avctx, au, &priv->raw_aud);
         if (err < 0)
             goto fail;
         priv->aud_needed = 0;
     }
 
-    err = vaapi_encode_h265_add_nal(avctx, au, &priv->slice);
+    err = vaapi_encode_h265_add_nal(avctx, au, &priv->raw_slice);
     if (err < 0)
         goto fail;
 
     err = vaapi_encode_h265_write_access_unit(avctx, data, data_len, au);
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, au);
+    ff_cbs_fragment_reset(priv->cbc, au);
+    return err;
+}
+
+static int vaapi_encode_h265_write_extra_header(AVCodecContext *avctx,
+                                                VAAPIEncodePicture *pic,
+                                                int index, int *type,
+                                                char *data, size_t *data_len)
+{
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
+    CodedBitstreamFragment   *au = &priv->current_access_unit;
+    int err, i;
+
+    if (priv->sei_needed) {
+        H265RawSEI *sei = &priv->raw_sei;
+
+        if (priv->aud_needed) {
+            err = vaapi_encode_h265_add_nal(avctx, au, &priv->aud);
+            if (err < 0)
+                goto fail;
+            priv->aud_needed = 0;
+        }
+
+        *sei = (H265RawSEI) {
+            .nal_unit_header = {
+                .nal_unit_type         = HEVC_NAL_SEI_PREFIX,
+                .nuh_layer_id          = 0,
+                .nuh_temporal_id_plus1 = 1,
+            },
+        };
+
+        i = 0;
+
+        if (priv->sei_needed & SEI_MASTERING_DISPLAY) {
+            sei->payload[i].payload_type = HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO;
+            sei->payload[i].payload.mastering_display = priv->sei_mastering_display;
+            ++i;
+        }
+
+        if (priv->sei_needed & SEI_CONTENT_LIGHT_LEVEL) {
+            sei->payload[i].payload_type = HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO;
+            sei->payload[i].payload.content_light_level = priv->sei_content_light_level;
+            ++i;
+        }
+
+        sei->payload_count = i;
+        av_assert0(sei->payload_count > 0);
+
+        err = vaapi_encode_h265_add_nal(avctx, au, sei);
+        if (err < 0)
+            goto fail;
+        priv->sei_needed = 0;
+
+        err = vaapi_encode_h265_write_access_unit(avctx, data, data_len, au);
+        if (err < 0)
+            goto fail;
+
+        ff_cbs_fragment_reset(priv->cbc, au);
+
+        *type = VAEncPackedHeaderRawData;
+        return 0;
+    } else {
+        return AVERROR_EOF;
+    }
+
+fail:
+    ff_cbs_fragment_reset(priv->cbc, au);
     return err;
 }
 
 static int vaapi_encode_h265_init_sequence_params(AVCodecContext *avctx)
 {
     VAAPIEncodeContext                *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context           *priv = ctx->priv_data;
-    H265RawVPS                        *vps = &priv->vps;
-    H265RawSPS                        *sps = &priv->sps;
-    H265RawPPS                        *pps = &priv->pps;
+    VAAPIEncodeH265Context           *priv = avctx->priv_data;
+    H265RawVPS                        *vps = &priv->raw_vps;
+    H265RawSPS                        *sps = &priv->raw_sps;
+    H265RawPPS                        *pps = &priv->raw_pps;
+    H265RawProfileTierLevel           *ptl = &vps->profile_tier_level;
     H265RawVUI                        *vui = &sps->vui;
     VAEncSequenceParameterBufferHEVC *vseq = ctx->codec_sequence_params;
     VAEncPictureParameterBufferHEVC  *vpic = ctx->codec_picture_params;
+    const AVPixFmtDescriptor *desc;
+    int chroma_format, bit_depth;
     int i;
 
-    memset(&priv->current_access_unit, 0,
-           sizeof(priv->current_access_unit));
-
     memset(vps, 0, sizeof(*vps));
     memset(sps, 0, sizeof(*sps));
     memset(pps, 0, sizeof(*pps));
 
 
+    desc = av_pix_fmt_desc_get(priv->common.input_frames->sw_format);
+    av_assert0(desc);
+    if (desc->nb_components == 1) {
+        chroma_format = 0;
+    } else {
+        if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
+            chroma_format = 1;
+        } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
+            chroma_format = 2;
+        } else if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
+            chroma_format = 3;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "Chroma format of input pixel format "
+                   "%s is not supported.\n", desc->name);
+            return AVERROR(EINVAL);
+        }
+    }
+    bit_depth = desc->comp[0].depth;
+
+
     // VPS
 
     vps->nal_unit_header = (H265RawNALUnitHeader) {
@@ -211,23 +311,57 @@ static int vaapi_encode_h265_init_sequence_params(AVCodecContext *avctx)
     vps->vps_max_sub_layers_minus1     = 0;
     vps->vps_temporal_id_nesting_flag  = 1;
 
-    vps->profile_tier_level = (H265RawProfileTierLevel) {
-        .general_profile_space = 0,
-        .general_profile_idc   = avctx->profile,
-        .general_tier_flag     = 0,
+    ptl->general_profile_space = 0;
+    ptl->general_profile_idc   = avctx->profile;
+    ptl->general_tier_flag     = priv->tier;
 
-        .general_progressive_source_flag    = 1,
-        .general_interlaced_source_flag     = 0,
-        .general_non_packed_constraint_flag = 1,
-        .general_frame_only_constraint_flag = 1,
+    if (chroma_format == 1) {
+        ptl->general_profile_compatibility_flag[1] = bit_depth ==  8;
+        ptl->general_profile_compatibility_flag[2] = bit_depth <= 10;
+    }
+    ptl->general_profile_compatibility_flag[4] = 1;
 
-        .general_level_idc     = avctx->level,
-    };
-    vps->profile_tier_level.general_profile_compatibility_flag[avctx->profile & 31] = 1;
+    ptl->general_progressive_source_flag    = 1;
+    ptl->general_interlaced_source_flag     = 0;
+    ptl->general_non_packed_constraint_flag = 1;
+    ptl->general_frame_only_constraint_flag = 1;
+
+    ptl->general_max_12bit_constraint_flag = bit_depth <= 12;
+    ptl->general_max_10bit_constraint_flag = bit_depth <= 10;
+    ptl->general_max_8bit_constraint_flag  = bit_depth ==  8;
+
+    ptl->general_max_422chroma_constraint_flag  = chroma_format <= 2;
+    ptl->general_max_420chroma_constraint_flag  = chroma_format <= 1;
+    ptl->general_max_monochrome_constraint_flag = chroma_format == 0;
+
+    ptl->general_intra_constraint_flag = ctx->gop_size == 1;
+
+    ptl->general_lower_bit_rate_constraint_flag = 1;
+
+    if (avctx->level != FF_LEVEL_UNKNOWN) {
+        ptl->general_level_idc = avctx->level;
+    } else {
+        const H265LevelDescriptor *level;
+
+        level = ff_h265_guess_level(ptl, avctx->bit_rate,
+                                    ctx->surface_width, ctx->surface_height,
+                                    ctx->nb_slices, 1, 1,
+                                    (ctx->b_per_p > 0) + 1);
+        if (level) {
+            av_log(avctx, AV_LOG_VERBOSE, "Using level %s.\n", level->name);
+            ptl->general_level_idc = level->level_idc;
+        } else {
+            av_log(avctx, AV_LOG_VERBOSE, "Stream will not conform to "
+                   "any normal level; using level 8.5.\n");
+            ptl->general_level_idc = 255;
+            // The tier flag must be set in level 8.5.
+            ptl->general_tier_flag = 1;
+        }
+    }
 
     vps->vps_sub_layer_ordering_info_present_flag = 0;
-    vps->vps_max_dec_pic_buffering_minus1[0]      = (ctx->b_per_p > 0) + 1;
-    vps->vps_max_num_reorder_pics[0]              = (ctx->b_per_p > 0);
+    vps->vps_max_dec_pic_buffering_minus1[0]      = ctx->max_b_depth + 1;
+    vps->vps_max_num_reorder_pics[0]              = ctx->max_b_depth;
     vps->vps_max_latency_increase_plus1[0]        = 0;
 
     vps->vps_max_layer_id             = 0;
@@ -265,7 +399,7 @@ static int vaapi_encode_h265_init_sequence_params(AVCodecContext *avctx)
 
     sps->sps_seq_parameter_set_id = 0;
 
-    sps->chroma_format_idc          = 1; // YUV 4:2:0.
+    sps->chroma_format_idc          = chroma_format;
     sps->separate_colour_plane_flag = 0;
 
     sps->pic_width_in_luma_samples  = ctx->surface_width;
@@ -284,9 +418,8 @@ static int vaapi_encode_h265_init_sequence_params(AVCodecContext *avctx)
         sps->conformance_window_flag = 0;
     }
 
-    sps->bit_depth_luma_minus8 =
-        avctx->profile == FF_PROFILE_HEVC_MAIN_10 ? 2 : 0;
-    sps->bit_depth_chroma_minus8 = sps->bit_depth_luma_minus8;
+    sps->bit_depth_luma_minus8   = bit_depth - 8;
+    sps->bit_depth_chroma_minus8 = bit_depth - 8;
 
     sps->log2_max_pic_order_cnt_lsb_minus4 = 8;
 
@@ -338,18 +471,20 @@ static int vaapi_encode_h265_init_sequence_params(AVCodecContext *avctx)
             {  80, 33 }, {  18, 11 }, {  15, 11 }, {  64, 33 },
             { 160, 99 }, {   4,  3 }, {   3,  2 }, {   2,  1 },
         };
-        int i;
+        int num, den, i;
+        av_reduce(&num, &den, avctx->sample_aspect_ratio.num,
+                  avctx->sample_aspect_ratio.den, 65535);
         for (i = 0; i < FF_ARRAY_ELEMS(sar_idc); i++) {
-            if (avctx->sample_aspect_ratio.num == sar_idc[i].num &&
-                avctx->sample_aspect_ratio.den == sar_idc[i].den) {
+            if (num == sar_idc[i].num &&
+                den == sar_idc[i].den) {
                 vui->aspect_ratio_idc = i;
                 break;
             }
         }
         if (i >= FF_ARRAY_ELEMS(sar_idc)) {
             vui->aspect_ratio_idc = 255;
-            vui->sar_width  = avctx->sample_aspect_ratio.num;
-            vui->sar_height = avctx->sample_aspect_ratio.den;
+            vui->sar_width  = num;
+            vui->sar_height = den;
         }
         vui->aspect_ratio_info_present_flag = 1;
     }
@@ -431,10 +566,10 @@ static int vaapi_encode_h265_init_sequence_params(AVCodecContext *avctx)
         .general_level_idc   = vps->profile_tier_level.general_level_idc,
         .general_tier_flag   = vps->profile_tier_level.general_tier_flag,
 
-        .intra_period     = avctx->gop_size,
-        .intra_idr_period = avctx->gop_size,
+        .intra_period     = ctx->gop_size,
+        .intra_idr_period = ctx->gop_size,
         .ip_period        = ctx->b_per_p + 1,
-        .bits_per_second   = avctx->bit_rate,
+        .bits_per_second  = ctx->va_bit_rate,
 
         .pic_width_in_luma_samples  = sps->pic_width_in_luma_samples,
         .pic_height_in_luma_samples = sps->pic_height_in_luma_samples,
@@ -538,68 +673,156 @@ static int vaapi_encode_h265_init_picture_params(AVCodecContext *avctx,
                                                  VAAPIEncodePicture *pic)
 {
     VAAPIEncodeContext               *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context          *priv = ctx->priv_data;
-    VAAPIEncodeH265Options           *opt = ctx->codec_options;
+    VAAPIEncodeH265Context          *priv = avctx->priv_data;
+    VAAPIEncodeH265Picture          *hpic = pic->priv_data;
+    VAAPIEncodePicture              *prev = pic->prev;
+    VAAPIEncodeH265Picture         *hprev = prev ? prev->priv_data : NULL;
     VAEncPictureParameterBufferHEVC *vpic = pic->codec_picture_params;
     int i;
 
     if (pic->type == PICTURE_TYPE_IDR) {
         av_assert0(pic->display_order == pic->encode_order);
 
-        priv->last_idr_frame = pic->display_order;
+        hpic->last_idr_frame = pic->display_order;
 
-        priv->slice_nal_unit = HEVC_NAL_IDR_W_RADL;
-        priv->slice_type     = HEVC_SLICE_I;
-        priv->pic_type       = 0;
+        hpic->slice_nal_unit = HEVC_NAL_IDR_W_RADL;
+        hpic->slice_type     = HEVC_SLICE_I;
+        hpic->pic_type       = 0;
     } else {
-        av_assert0(pic->encode_order > priv->last_idr_frame);
+        av_assert0(prev);
+        hpic->last_idr_frame = hprev->last_idr_frame;
 
         if (pic->type == PICTURE_TYPE_I) {
-            priv->slice_nal_unit = HEVC_NAL_CRA_NUT;
-            priv->slice_type     = HEVC_SLICE_I;
-            priv->pic_type       = 0;
+            hpic->slice_nal_unit = HEVC_NAL_CRA_NUT;
+            hpic->slice_type     = HEVC_SLICE_I;
+            hpic->pic_type       = 0;
         } else if (pic->type == PICTURE_TYPE_P) {
             av_assert0(pic->refs[0]);
-            priv->slice_nal_unit = HEVC_NAL_TRAIL_R;
-            priv->slice_type     = HEVC_SLICE_P;
-            priv->pic_type       = 1;
+            hpic->slice_nal_unit = HEVC_NAL_TRAIL_R;
+            hpic->slice_type     = HEVC_SLICE_P;
+            hpic->pic_type       = 1;
         } else {
+            VAAPIEncodePicture *irap_ref;
             av_assert0(pic->refs[0] && pic->refs[1]);
-            if (pic->refs[1]->type == PICTURE_TYPE_I)
-                priv->slice_nal_unit = HEVC_NAL_RASL_N;
-            else
-                priv->slice_nal_unit = HEVC_NAL_TRAIL_N;
-            priv->slice_type = HEVC_SLICE_B;
-            priv->pic_type   = 2;
+            for (irap_ref = pic; irap_ref; irap_ref = irap_ref->refs[1]) {
+                if (irap_ref->type == PICTURE_TYPE_I)
+                    break;
+            }
+            if (pic->b_depth == ctx->max_b_depth) {
+                hpic->slice_nal_unit = irap_ref ? HEVC_NAL_RASL_N
+                                                : HEVC_NAL_TRAIL_N;
+            } else {
+                hpic->slice_nal_unit = irap_ref ? HEVC_NAL_RASL_R
+                                                : HEVC_NAL_TRAIL_R;
+            }
+            hpic->slice_type = HEVC_SLICE_B;
+            hpic->pic_type   = 2;
         }
     }
-    priv->pic_order_cnt = pic->display_order - priv->last_idr_frame;
+    hpic->pic_order_cnt = pic->display_order - hpic->last_idr_frame;
 
-    if (opt->aud) {
+    if (priv->aud) {
         priv->aud_needed = 1;
-        priv->aud.nal_unit_header = (H265RawNALUnitHeader) {
-            .nal_unit_type         = HEVC_NAL_AUD,
-            .nuh_layer_id          = 0,
-            .nuh_temporal_id_plus1 = 1,
+        priv->raw_aud = (H265RawAUD) {
+            .nal_unit_header = {
+                .nal_unit_type         = HEVC_NAL_AUD,
+                .nuh_layer_id          = 0,
+                .nuh_temporal_id_plus1 = 1,
+            },
+            .pic_type = hpic->pic_type,
         };
-        priv->aud.pic_type = priv->pic_type;
     } else {
         priv->aud_needed = 0;
     }
 
+    priv->sei_needed = 0;
+
+    // Only look for the metadata on I/IDR frame on the output. We
+    // may force an IDR frame on the output where the medadata gets
+    // changed on the input frame.
+    if ((priv->sei & SEI_MASTERING_DISPLAY) &&
+        (pic->type == PICTURE_TYPE_I || pic->type == PICTURE_TYPE_IDR)) {
+        AVFrameSideData *sd =
+            av_frame_get_side_data(pic->input_image,
+                                   AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
+
+        if (sd) {
+            AVMasteringDisplayMetadata *mdm =
+                (AVMasteringDisplayMetadata *)sd->data;
+
+            // SEI is needed when both the primaries and luminance are set
+            if (mdm->has_primaries && mdm->has_luminance) {
+                H265RawSEIMasteringDisplayColourVolume *mdcv =
+                    &priv->sei_mastering_display;
+                const int mapping[3] = {1, 2, 0};
+                const int chroma_den = 50000;
+                const int luma_den   = 10000;
+
+                for (i = 0; i < 3; i++) {
+                    const int j = mapping[i];
+                    mdcv->display_primaries_x[i] =
+                        FFMIN(lrint(chroma_den *
+                                    av_q2d(mdm->display_primaries[j][0])),
+                              chroma_den);
+                    mdcv->display_primaries_y[i] =
+                        FFMIN(lrint(chroma_den *
+                                    av_q2d(mdm->display_primaries[j][1])),
+                              chroma_den);
+                }
+
+                mdcv->white_point_x =
+                    FFMIN(lrint(chroma_den * av_q2d(mdm->white_point[0])),
+                          chroma_den);
+                mdcv->white_point_y =
+                    FFMIN(lrint(chroma_den * av_q2d(mdm->white_point[1])),
+                          chroma_den);
+
+                mdcv->max_display_mastering_luminance =
+                    lrint(luma_den * av_q2d(mdm->max_luminance));
+                mdcv->min_display_mastering_luminance =
+                    FFMIN(lrint(luma_den * av_q2d(mdm->min_luminance)),
+                          mdcv->max_display_mastering_luminance);
+
+                priv->sei_needed |= SEI_MASTERING_DISPLAY;
+            }
+        }
+    }
+
+    if ((priv->sei & SEI_CONTENT_LIGHT_LEVEL) &&
+        (pic->type == PICTURE_TYPE_I || pic->type == PICTURE_TYPE_IDR)) {
+        AVFrameSideData *sd =
+            av_frame_get_side_data(pic->input_image,
+                                   AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
+
+        if (sd) {
+            AVContentLightMetadata *clm =
+                (AVContentLightMetadata *)sd->data;
+            H265RawSEIContentLightLevelInfo *clli =
+                &priv->sei_content_light_level;
+
+            clli->max_content_light_level     = FFMIN(clm->MaxCLL,  65535);
+            clli->max_pic_average_light_level = FFMIN(clm->MaxFALL, 65535);
+
+            priv->sei_needed |= SEI_CONTENT_LIGHT_LEVEL;
+        }
+    }
+
     vpic->decoded_curr_pic = (VAPictureHEVC) {
         .picture_id    = pic->recon_surface,
-        .pic_order_cnt = priv->pic_order_cnt,
+        .pic_order_cnt = hpic->pic_order_cnt,
         .flags         = 0,
     };
 
     for (i = 0; i < pic->nb_refs; i++) {
-        VAAPIEncodePicture *ref = pic->refs[i];
+        VAAPIEncodePicture      *ref = pic->refs[i];
+        VAAPIEncodeH265Picture *href;
+
         av_assert0(ref && ref->encode_order < pic->encode_order);
+        href = ref->priv_data;
 
         vpic->reference_frames[i] = (VAPictureHEVC) {
             .picture_id    = ref->recon_surface,
-            .pic_order_cnt = ref->display_order - priv->last_idr_frame,
+            .pic_order_cnt = href->pic_order_cnt,
             .flags = (ref->display_order < pic->display_order ?
                       VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE : 0) |
                      (ref->display_order > pic->display_order ?
@@ -615,7 +838,7 @@ static int vaapi_encode_h265_init_picture_params(AVCodecContext *avctx,
 
     vpic->coded_buf = pic->output_buffer;
 
-    vpic->nal_unit_type = priv->slice_nal_unit;
+    vpic->nal_unit_type = hpic->slice_nal_unit;
 
     switch (pic->type) {
     case PICTURE_TYPE_IDR:
@@ -642,8 +865,6 @@ static int vaapi_encode_h265_init_picture_params(AVCodecContext *avctx,
         av_assert0(0 && "invalid picture type");
     }
 
-    pic->nb_slices = 1;
-
     return 0;
 }
 
@@ -651,87 +872,105 @@ static int vaapi_encode_h265_init_slice_params(AVCodecContext *avctx,
                                                VAAPIEncodePicture *pic,
                                                VAAPIEncodeSlice *slice)
 {
-    VAAPIEncodeContext                *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context           *priv = ctx->priv_data;
-    const H265RawSPS                  *sps = &priv->sps;
-    const H265RawPPS                  *pps = &priv->pps;
-    H265RawSliceHeader                 *sh = &priv->slice.header;
+    VAAPIEncodeH265Context           *priv = avctx->priv_data;
+    VAAPIEncodeH265Picture           *hpic = pic->priv_data;
+    const H265RawSPS                  *sps = &priv->raw_sps;
+    const H265RawPPS                  *pps = &priv->raw_pps;
+    H265RawSliceHeader                 *sh = &priv->raw_slice.header;
     VAEncPictureParameterBufferHEVC  *vpic = pic->codec_picture_params;
     VAEncSliceParameterBufferHEVC  *vslice = slice->codec_slice_params;
     int i;
 
     sh->nal_unit_header = (H265RawNALUnitHeader) {
-        .nal_unit_type         = priv->slice_nal_unit,
+        .nal_unit_type         = hpic->slice_nal_unit,
         .nuh_layer_id          = 0,
         .nuh_temporal_id_plus1 = 1,
     };
 
     sh->slice_pic_parameter_set_id      = pps->pps_pic_parameter_set_id;
 
-    // Currently we only support one slice per frame.
-    sh->first_slice_segment_in_pic_flag = 1;
-    sh->slice_segment_address           = 0;
+    sh->first_slice_segment_in_pic_flag = slice->index == 0;
+    sh->slice_segment_address           = slice->block_start;
 
-    sh->slice_type = priv->slice_type;
+    sh->slice_type = hpic->slice_type;
 
-    sh->slice_pic_order_cnt_lsb = priv->pic_order_cnt &
+    sh->slice_pic_order_cnt_lsb = hpic->pic_order_cnt &
         (1 << (sps->log2_max_pic_order_cnt_lsb_minus4 + 4)) - 1;
 
     if (pic->type != PICTURE_TYPE_IDR) {
         H265RawSTRefPicSet *rps;
-        VAAPIEncodePicture *st;
-        int used;
+        const VAAPIEncodeH265Picture *strp;
+        int rps_poc[MAX_DPB_SIZE];
+        int rps_used[MAX_DPB_SIZE];
+        int i, j, poc, rps_pics;
 
         sh->short_term_ref_pic_set_sps_flag = 0;
 
         rps = &sh->short_term_ref_pic_set;
         memset(rps, 0, sizeof(*rps));
 
-        for (st = ctx->pic_start; st; st = st->next) {
-            if (st->encode_order >= pic->encode_order) {
-                // Not yet in DPB.
+        rps_pics = 0;
+        for (i = 0; i < pic->nb_refs; i++) {
+            strp = pic->refs[i]->priv_data;
+            rps_poc[rps_pics]  = strp->pic_order_cnt;
+            rps_used[rps_pics] = 1;
+            ++rps_pics;
+        }
+        for (i = 0; i < pic->nb_dpb_pics; i++) {
+            if (pic->dpb[i] == pic)
                 continue;
+            for (j = 0; j < pic->nb_refs; j++) {
+                if (pic->dpb[i] == pic->refs[j])
+                    break;
             }
-            used = 0;
-            for (i = 0; i < pic->nb_refs; i++) {
-                if (pic->refs[i] == st)
-                    used = 1;
-            }
-            if (!used) {
-                // Usually each picture always uses all of the others in the
-                // DPB as references.  The one case we have to treat here is
-                // a non-IDR IRAP picture, which may need to hold unused
-                // references across itself to be used for the decoding of
-                // following RASL pictures.  This looks for such an RASL
-                // picture, and keeps the reference if there is one.
-                VAAPIEncodePicture *rp;
-                for (rp = ctx->pic_start; rp; rp = rp->next) {
-                    if (rp->encode_order < pic->encode_order)
-                        continue;
-                    if (rp->type != PICTURE_TYPE_B)
-                        continue;
-                    if (rp->refs[0] == st && rp->refs[1] == pic)
-                        break;
-                }
-                if (!rp)
-                    continue;
-            }
-            // This only works for one instance of each (delta_poc_sN_minus1
-            // is relative to the previous frame in the list, not relative to
-            // the current frame directly).
-            if (st->display_order < pic->display_order) {
-                rps->delta_poc_s0_minus1[rps->num_negative_pics] =
-                    pic->display_order - st->display_order - 1;
-                rps->used_by_curr_pic_s0_flag[rps->num_negative_pics] = used;
-                ++rps->num_negative_pics;
-            } else {
-                rps->delta_poc_s1_minus1[rps->num_positive_pics] =
-                    st->display_order - pic->display_order - 1;
-                rps->used_by_curr_pic_s1_flag[rps->num_positive_pics] = used;
-                ++rps->num_positive_pics;
+            if (j < pic->nb_refs)
+                continue;
+            strp = pic->dpb[i]->priv_data;
+            rps_poc[rps_pics]  = strp->pic_order_cnt;
+            rps_used[rps_pics] = 0;
+            ++rps_pics;
+        }
+
+        for (i = 1; i < rps_pics; i++) {
+            for (j = i; j > 0; j--) {
+                if (rps_poc[j] > rps_poc[j - 1])
+                    break;
+                av_assert0(rps_poc[j] != rps_poc[j - 1]);
+                FFSWAP(int, rps_poc[j],  rps_poc[j - 1]);
+                FFSWAP(int, rps_used[j], rps_used[j - 1]);
             }
         }
 
+        av_log(avctx, AV_LOG_DEBUG, "RPS for POC %d:",
+               hpic->pic_order_cnt);
+        for (i = 0; i < rps_pics; i++) {
+            av_log(avctx, AV_LOG_DEBUG, " (%d,%d)",
+                   rps_poc[i], rps_used[i]);
+        }
+        av_log(avctx, AV_LOG_DEBUG, "\n");
+
+        for (i = 0; i < rps_pics; i++) {
+            av_assert0(rps_poc[i] != hpic->pic_order_cnt);
+            if (rps_poc[i] > hpic->pic_order_cnt)
+                break;
+        }
+
+        rps->num_negative_pics = i;
+        poc = hpic->pic_order_cnt;
+        for (j = i - 1; j >= 0; j--) {
+            rps->delta_poc_s0_minus1[i - 1 - j] = poc - rps_poc[j] - 1;
+            rps->used_by_curr_pic_s0_flag[i - 1 - j] = rps_used[j];
+            poc = rps_poc[j];
+        }
+
+        rps->num_positive_pics = rps_pics - i;
+        poc = hpic->pic_order_cnt;
+        for (j = i; j < rps_pics; j++) {
+            rps->delta_poc_s1_minus1[j - i] = rps_poc[j] - poc - 1;
+            rps->used_by_curr_pic_s1_flag[j - i] = rps_used[j];
+            poc = rps_poc[j];
+        }
+
         sh->num_long_term_sps  = 0;
         sh->num_long_term_pics = 0;
 
@@ -760,7 +999,7 @@ static int vaapi_encode_h265_init_slice_params(AVCodecContext *avctx,
 
     *vslice = (VAEncSliceParameterBufferHEVC) {
         .slice_segment_address = sh->slice_segment_address,
-        .num_ctu_in_slice      = priv->ctu_width * priv->ctu_height,
+        .num_ctu_in_slice      = slice->block_size,
 
         .slice_type                 = sh->slice_type,
         .slice_pic_parameter_set_id = sh->slice_pic_parameter_set_id,
@@ -781,7 +1020,7 @@ static int vaapi_encode_h265_init_slice_params(AVCodecContext *avctx,
         .slice_tc_offset_div2   = sh->slice_tc_offset_div2,
 
         .slice_fields.bits = {
-            .last_slice_of_pic_flag       = 1,
+            .last_slice_of_pic_flag       = slice->index == pic->nb_slices - 1,
             .dependent_slice_segment_flag = sh->dependent_slice_segment_flag,
             .colour_plane_id              = sh->colour_plane_id,
             .slice_temporal_mvp_enabled_flag =
@@ -826,31 +1065,29 @@ static int vaapi_encode_h265_init_slice_params(AVCodecContext *avctx,
 static av_cold int vaapi_encode_h265_configure(AVCodecContext *avctx)
 {
     VAAPIEncodeContext      *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context *priv = ctx->priv_data;
-    VAAPIEncodeH265Options  *opt = ctx->codec_options;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_init(&priv->cbc, AV_CODEC_ID_HEVC, avctx);
     if (err < 0)
         return err;
 
-    priv->ctu_width     = FFALIGN(ctx->surface_width,  32) / 32;
-    priv->ctu_height    = FFALIGN(ctx->surface_height, 32) / 32;
-
-    av_log(avctx, AV_LOG_VERBOSE, "Input %ux%u -> Surface %ux%u -> CTU %ux%u.\n",
-           avctx->width, avctx->height, ctx->surface_width,
-           ctx->surface_height, priv->ctu_width, priv->ctu_height);
-
     if (ctx->va_rc_mode == VA_RC_CQP) {
-        priv->fixed_qp_p = opt->qp;
+        // Note that VAAPI only supports positive QP values - the range is
+        // therefore always bounded below by 1, even in 10-bit mode where
+        // it should go down to -12.
+
+        priv->fixed_qp_p = av_clip(ctx->rc_quality, 1, 51);
         if (avctx->i_quant_factor > 0.0)
-            priv->fixed_qp_idr = (int)((priv->fixed_qp_p * avctx->i_quant_factor +
-                                        avctx->i_quant_offset) + 0.5);
+            priv->fixed_qp_idr =
+                av_clip((avctx->i_quant_factor * priv->fixed_qp_p +
+                         avctx->i_quant_offset) + 0.5, 1, 51);
         else
             priv->fixed_qp_idr = priv->fixed_qp_p;
         if (avctx->b_quant_factor > 0.0)
-            priv->fixed_qp_b = (int)((priv->fixed_qp_p * avctx->b_quant_factor +
-                                      avctx->b_quant_offset) + 0.5);
+            priv->fixed_qp_b =
+                av_clip((avctx->b_quant_factor * priv->fixed_qp_p +
+                         avctx->b_quant_offset) + 0.5, 1, 51);
         else
             priv->fixed_qp_b = priv->fixed_qp_p;
 
@@ -858,29 +1095,40 @@ static av_cold int vaapi_encode_h265_configure(AVCodecContext *avctx)
                "%d / %d / %d for IDR- / P- / B-frames.\n",
                priv->fixed_qp_idr, priv->fixed_qp_p, priv->fixed_qp_b);
 
-    } else if (ctx->va_rc_mode == VA_RC_CBR ||
-               ctx->va_rc_mode == VA_RC_VBR) {
-        // These still need to be  set for pic_init_qp/slice_qp_delta.
+    } else {
+        // These still need to be set for init_qp/slice_qp_delta.
         priv->fixed_qp_idr = 30;
         priv->fixed_qp_p   = 30;
         priv->fixed_qp_b   = 30;
-
-        av_log(avctx, AV_LOG_DEBUG, "Using %s-bitrate = %d bps.\n",
-               ctx->va_rc_mode == VA_RC_CBR ? "constant" : "variable",
-               avctx->bit_rate);
-
-    } else {
-        av_assert0(0 && "Invalid RC mode.");
     }
 
     return 0;
 }
 
+static const VAAPIEncodeProfile vaapi_encode_h265_profiles[] = {
+    { FF_PROFILE_HEVC_MAIN,     8, 3, 1, 1, VAProfileHEVCMain       },
+    { FF_PROFILE_HEVC_REXT,     8, 3, 1, 1, VAProfileHEVCMain       },
+#if VA_CHECK_VERSION(0, 37, 0)
+    { FF_PROFILE_HEVC_MAIN_10, 10, 3, 1, 1, VAProfileHEVCMain10     },
+    { FF_PROFILE_HEVC_REXT,    10, 3, 1, 1, VAProfileHEVCMain10     },
+#endif
+    { FF_PROFILE_UNKNOWN }
+};
+
 static const VAAPIEncodeType vaapi_encode_type_h265 = {
-    .priv_data_size        = sizeof(VAAPIEncodeH265Context),
+    .profiles              = vaapi_encode_h265_profiles,
+
+    .flags                 = FLAG_SLICE_CONTROL |
+                             FLAG_B_PICTURES |
+                             FLAG_B_PICTURE_REFERENCES |
+                             FLAG_NON_IDR_KEY_PICTURES,
+
+    .default_quality       = 25,
 
     .configure             = &vaapi_encode_h265_configure,
 
+    .picture_priv_data_size = sizeof(VAAPIEncodeH265Picture),
+
     .sequence_params_size  = sizeof(VAEncSequenceParameterBufferHEVC),
     .init_sequence_params  = &vaapi_encode_h265_init_sequence_params,
 
@@ -895,96 +1143,89 @@ static const VAAPIEncodeType vaapi_encode_type_h265 = {
 
     .slice_header_type     = VAEncPackedHeaderHEVC_Slice,
     .write_slice_header    = &vaapi_encode_h265_write_slice_header,
+
+    .write_extra_header    = &vaapi_encode_h265_write_extra_header,
 };
 
 static av_cold int vaapi_encode_h265_init(AVCodecContext *avctx)
 {
-    VAAPIEncodeContext     *ctx = avctx->priv_data;
-    VAAPIEncodeH265Options *opt =
-        (VAAPIEncodeH265Options*)ctx->codec_options_data;
+    VAAPIEncodeContext      *ctx = avctx->priv_data;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
 
     ctx->codec = &vaapi_encode_type_h265;
 
     if (avctx->profile == FF_PROFILE_UNKNOWN)
-        avctx->profile = opt->profile;
+        avctx->profile = priv->profile;
     if (avctx->level == FF_LEVEL_UNKNOWN)
-        avctx->level = opt->level;
+        avctx->level = priv->level;
 
-    switch (avctx->profile) {
-    case FF_PROFILE_HEVC_MAIN:
-    case FF_PROFILE_UNKNOWN:
-        ctx->va_profile = VAProfileHEVCMain;
-        ctx->va_rt_format = VA_RT_FORMAT_YUV420;
-        break;
-    case FF_PROFILE_HEVC_MAIN_10:
-#ifdef VA_RT_FORMAT_YUV420_10BPP
-        ctx->va_profile = VAProfileHEVCMain10;
-        ctx->va_rt_format = VA_RT_FORMAT_YUV420_10BPP;
-        break;
-#else
-        av_log(avctx, AV_LOG_ERROR, "10-bit encoding is not "
-               "supported with this VAAPI version.\n");
-        return AVERROR(ENOSYS);
-#endif
-    default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown H.265 profile %d.\n",
-               avctx->profile);
+    if (avctx->level != FF_LEVEL_UNKNOWN && avctx->level & ~0xff) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid level %d: must fit "
+               "in 8-bit unsigned integer.\n", avctx->level);
         return AVERROR(EINVAL);
     }
-    ctx->va_entrypoint = VAEntrypointEncSlice;
-
-    if (avctx->bit_rate > 0) {
-        if (avctx->rc_max_rate == avctx->bit_rate)
-            ctx->va_rc_mode = VA_RC_CBR;
-        else
-            ctx->va_rc_mode = VA_RC_VBR;
-    } else
-        ctx->va_rc_mode = VA_RC_CQP;
 
-    ctx->va_packed_headers =
+    ctx->desired_packed_headers =
         VA_ENC_PACKED_HEADER_SEQUENCE | // VPS, SPS and PPS.
-        VA_ENC_PACKED_HEADER_SLICE;     // Slice headers.
+        VA_ENC_PACKED_HEADER_SLICE    | // Slice headers.
+        VA_ENC_PACKED_HEADER_MISC;      // SEI
 
     ctx->surface_width  = FFALIGN(avctx->width,  16);
     ctx->surface_height = FFALIGN(avctx->height, 16);
 
+    // CTU size is currently hard-coded to 32.
+    ctx->slice_block_width = ctx->slice_block_height = 32;
+
+    if (priv->qp > 0)
+        ctx->explicit_qp = priv->qp;
+
     return ff_vaapi_encode_init(avctx);
 }
 
 static av_cold int vaapi_encode_h265_close(AVCodecContext *avctx)
 {
-    VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodeH265Context *priv = ctx->priv_data;
+    VAAPIEncodeH265Context *priv = avctx->priv_data;
 
-    if (priv)
-        ff_cbs_close(&priv->cbc);
+    ff_cbs_fragment_free(priv->cbc, &priv->current_access_unit);
+    ff_cbs_close(&priv->cbc);
 
     return ff_vaapi_encode_close(avctx);
 }
 
-#define OFFSET(x) (offsetof(VAAPIEncodeContext, codec_options_data) + \
-                   offsetof(VAAPIEncodeH265Options, x))
+#define OFFSET(x) offsetof(VAAPIEncodeH265Context, x)
 #define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 static const AVOption vaapi_encode_h265_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+    VAAPI_ENCODE_RC_OPTIONS,
+
     { "qp", "Constant QP (for P-frames; scaled by qfactor/qoffset for I/B)",
-      OFFSET(qp), AV_OPT_TYPE_INT, { .i64 = 25 }, 0, 52, FLAGS },
+      OFFSET(qp), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 52, FLAGS },
 
     { "aud", "Include AUD",
-      OFFSET(aud), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+      OFFSET(aud), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
 
     { "profile", "Set profile (general_profile_idc)",
       OFFSET(profile), AV_OPT_TYPE_INT,
-      { .i64 = FF_PROFILE_HEVC_MAIN }, 0x00, 0xff, FLAGS, "profile" },
+      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 0xff, FLAGS, "profile" },
 
 #define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
       { .i64 = value }, 0, 0, FLAGS, "profile"
     { PROFILE("main",               FF_PROFILE_HEVC_MAIN) },
     { PROFILE("main10",             FF_PROFILE_HEVC_MAIN_10) },
+    { PROFILE("rext",               FF_PROFILE_HEVC_REXT) },
 #undef PROFILE
 
+    { "tier", "Set tier (general_tier_flag)",
+      OFFSET(tier), AV_OPT_TYPE_INT,
+      { .i64 = 0 }, 0, 1, FLAGS, "tier" },
+    { "main", NULL, 0, AV_OPT_TYPE_CONST,
+      { .i64 = 0 }, 0, 0, FLAGS, "tier" },
+    { "high", NULL, 0, AV_OPT_TYPE_CONST,
+      { .i64 = 1 }, 0, 0, FLAGS, "tier" },
+
     { "level", "Set level (general_level_idc)",
       OFFSET(level), AV_OPT_TYPE_INT,
-      { .i64 = 153 }, 0x00, 0xff, FLAGS, "level" },
+      { .i64 = FF_LEVEL_UNKNOWN }, FF_LEVEL_UNKNOWN, 0xff, FLAGS, "level" },
 
 #define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
       { .i64 = value }, 0, 0, FLAGS, "level"
@@ -1003,6 +1244,17 @@ static const AVOption vaapi_encode_h265_options[] = {
     { LEVEL("6.2", 186) },
 #undef LEVEL
 
+    { "sei", "Set SEI to include",
+      OFFSET(sei), AV_OPT_TYPE_FLAGS,
+      { .i64 = SEI_MASTERING_DISPLAY | SEI_CONTENT_LIGHT_LEVEL },
+      0, INT_MAX, FLAGS, "sei" },
+    { "hdr",
+      "Include HDR metadata for mastering display colour volume "
+      "and content light level information",
+      0, AV_OPT_TYPE_CONST,
+      { .i64 = SEI_MASTERING_DISPLAY | SEI_CONTENT_LIGHT_LEVEL },
+      INT_MIN, INT_MAX, FLAGS, "sei" },
+
     { NULL },
 };
 
@@ -1010,10 +1262,12 @@ static const AVCodecDefault vaapi_encode_h265_defaults[] = {
     { "b",              "0"   },
     { "bf",             "2"   },
     { "g",              "120" },
-    { "i_qfactor",      "1.0" },
-    { "i_qoffset",      "0.0" },
-    { "b_qfactor",      "1.2" },
-    { "b_qoffset",      "0.0" },
+    { "i_qfactor",      "1"   },
+    { "i_qoffset",      "0"   },
+    { "b_qfactor",      "6/5" },
+    { "b_qoffset",      "0"   },
+    { "qmin",           "-1"  },
+    { "qmax",           "-1"  },
     { NULL },
 };
 
@@ -1029,10 +1283,10 @@ AVCodec ff_hevc_vaapi_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("H.265/HEVC (VAAPI)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HEVC,
-    .priv_data_size = (sizeof(VAAPIEncodeContext) +
-                       sizeof(VAAPIEncodeH265Options)),
+    .priv_data_size = sizeof(VAAPIEncodeH265Context),
     .init           = &vaapi_encode_h265_init,
-    .encode2        = &ff_vaapi_encode2,
+    .send_frame     = &ff_vaapi_encode_send_frame,
+    .receive_packet = &ff_vaapi_encode_receive_packet,
     .close          = &vaapi_encode_h265_close,
     .priv_class     = &vaapi_encode_h265_class,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
diff --git a/libavcodec/vaapi_encode_mjpeg.c b/libavcodec/vaapi_encode_mjpeg.c
index a3cd8ca..4dcdc3d 100644
--- a/libavcodec/vaapi_encode_mjpeg.c
+++ b/libavcodec/vaapi_encode_mjpeg.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,9 +23,12 @@
 #include "libavutil/common.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
-#include "libavutil/pixfmt.h"
+#include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
+#include "bytestream.h"
+#include "cbs.h"
+#include "cbs_jpeg.h"
 #include "internal.h"
 #include "jpegtables.h"
 #include "mjpeg.h"
@@ -56,257 +59,350 @@ static const unsigned char vaapi_encode_mjpeg_quant_chrominance[64] = {
 };
 
 typedef struct VAAPIEncodeMJPEGContext {
+    VAAPIEncodeContext common;
+
+    // User options.
+    int jfif;
+    int huffman;
+
+    // Derived settings.
     int quality;
-    int component_subsample_h[3];
-    int component_subsample_v[3];
+    uint8_t jfif_data[14];
 
-    VAQMatrixBufferJPEG quant_tables;
-    VAHuffmanTableBufferJPEGBaseline huffman_tables;
+    // Writer structures.
+    JPEGRawFrameHeader     frame_header;
+    JPEGRawScan            scan;
+    JPEGRawApplicationData jfif_header;
+    JPEGRawQuantisationTableSpecification quant_tables;
+    JPEGRawHuffmanTableSpecification      huffman_tables;
+
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment current_fragment;
 } VAAPIEncodeMJPEGContext;
 
-static av_cold void vaapi_encode_mjpeg_copy_huffman(unsigned char *dst_lengths,
-                                                    unsigned char *dst_values,
-                                                    const unsigned char *src_lengths,
-                                                    const unsigned char *src_values)
+static int vaapi_encode_mjpeg_write_image_header(AVCodecContext *avctx,
+                                                 VAAPIEncodePicture *pic,
+                                                 VAAPIEncodeSlice *slice,
+                                                 char *data, size_t *data_len)
 {
-    int i, mt;
-
-    ++src_lengths;
+    VAAPIEncodeMJPEGContext *priv = avctx->priv_data;
+    CodedBitstreamFragment  *frag = &priv->current_fragment;
+    int err;
+
+    if (priv->jfif) {
+        err = ff_cbs_insert_unit_content(priv->cbc, frag, -1,
+                                         JPEG_MARKER_APPN + 0,
+                                         &priv->jfif_header, NULL);
+        if (err < 0)
+            goto fail;
+    }
 
-    mt = 0;
-    for (i = 0; i < 16; i++)
-        mt += (dst_lengths[i] = src_lengths[i]);
+    err = ff_cbs_insert_unit_content(priv->cbc, frag, -1,
+                                     JPEG_MARKER_DQT,
+                                     &priv->quant_tables, NULL);
+    if (err < 0)
+        goto fail;
+
+    err = ff_cbs_insert_unit_content(priv->cbc, frag, -1,
+                                     JPEG_MARKER_SOF0,
+                                     &priv->frame_header, NULL);
+    if (err < 0)
+        goto fail;
+
+    if (priv->huffman) {
+        err = ff_cbs_insert_unit_content(priv->cbc, frag, -1,
+                                         JPEG_MARKER_DHT,
+                                         &priv->huffman_tables, NULL);
+        if (err < 0)
+            goto fail;
+    }
 
-    for (i = 0; i < mt; i++)
-        dst_values[i] = src_values[i];
-}
+    err = ff_cbs_insert_unit_content(priv->cbc, frag, -1,
+                                     JPEG_MARKER_SOS,
+                                     &priv->scan, NULL);
+    if (err < 0)
+        goto fail;
 
-static av_cold void vaapi_encode_mjpeg_init_tables(AVCodecContext *avctx)
-{
-    VAAPIEncodeContext                *ctx = avctx->priv_data;
-    VAAPIEncodeMJPEGContext          *priv = ctx->priv_data;
-    VAQMatrixBufferJPEG             *quant = &priv->quant_tables;
-    VAHuffmanTableBufferJPEGBaseline *huff = &priv->huffman_tables;
-    int i;
-
-    quant->load_lum_quantiser_matrix = 1;
-    quant->load_chroma_quantiser_matrix = 1;
+    err = ff_cbs_write_fragment_data(priv->cbc, frag);
+    if (err < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to write image header.\n");
+        goto fail;
+    }
 
-    for (i = 0; i < 64; i++) {
-        quant->lum_quantiser_matrix[i] =
-            vaapi_encode_mjpeg_quant_luminance[i];
-        quant->chroma_quantiser_matrix[i] =
-            vaapi_encode_mjpeg_quant_chrominance[i];
+    if (*data_len < 8 * frag->data_size) {
+        av_log(avctx, AV_LOG_ERROR, "Image header too large: "
+               "%zu < %zu.\n", *data_len, 8 * frag->data_size);
+        err = AVERROR(ENOSPC);
+        goto fail;
     }
 
-    huff->load_huffman_table[0] = 1;
-    vaapi_encode_mjpeg_copy_huffman(huff->huffman_table[0].num_dc_codes,
-                                    huff->huffman_table[0].dc_values,
-                                    avpriv_mjpeg_bits_dc_luminance,
-                                    avpriv_mjpeg_val_dc);
-    vaapi_encode_mjpeg_copy_huffman(huff->huffman_table[0].num_ac_codes,
-                                    huff->huffman_table[0].ac_values,
-                                    avpriv_mjpeg_bits_ac_luminance,
-                                    avpriv_mjpeg_val_ac_luminance);
-    memset(huff->huffman_table[0].pad, 0, sizeof(huff->huffman_table[0].pad));
-
-    huff->load_huffman_table[1] = 1;
-    vaapi_encode_mjpeg_copy_huffman(huff->huffman_table[1].num_dc_codes,
-                                    huff->huffman_table[1].dc_values,
-                                    avpriv_mjpeg_bits_dc_chrominance,
-                                    avpriv_mjpeg_val_dc);
-    vaapi_encode_mjpeg_copy_huffman(huff->huffman_table[1].num_ac_codes,
-                                    huff->huffman_table[1].ac_values,
-                                    avpriv_mjpeg_bits_ac_chrominance,
-                                    avpriv_mjpeg_val_ac_chrominance);
-    memset(huff->huffman_table[1].pad, 0, sizeof(huff->huffman_table[1].pad));
-}
+    // Remove the EOI at the end of the fragment.
+    memcpy(data, frag->data, frag->data_size - 2);
+    *data_len = 8 * (frag->data_size - 2);
 
-static void vaapi_encode_mjpeg_write_marker(PutBitContext *pbc, int marker)
-{
-    put_bits(pbc, 8, 0xff);
-    put_bits(pbc, 8, marker);
+    err = 0;
+fail:
+    ff_cbs_fragment_reset(priv->cbc, frag);
+    return err;
 }
 
-static int vaapi_encode_mjpeg_write_image_header(AVCodecContext *avctx,
+static int vaapi_encode_mjpeg_write_extra_buffer(AVCodecContext *avctx,
                                                  VAAPIEncodePicture *pic,
-                                                 VAAPIEncodeSlice *slice,
+                                                 int index, int *type,
                                                  char *data, size_t *data_len)
 {
-    VAAPIEncodeContext               *ctx = avctx->priv_data;
-    VAEncPictureParameterBufferJPEG *vpic = pic->codec_picture_params;
-    VAEncSliceParameterBufferJPEG *vslice = slice->codec_slice_params;
-    VAAPIEncodeMJPEGContext         *priv = ctx->priv_data;
-    PutBitContext pbc;
-    int t, i, quant_scale;
+    VAAPIEncodeMJPEGContext *priv = avctx->priv_data;
+    int t, i, k;
 
-    init_put_bits(&pbc, data, *data_len);
+    if (index == 0) {
+        // Write quantisation tables.
+        JPEGRawFrameHeader                     *fh = &priv->frame_header;
+        JPEGRawQuantisationTableSpecification *dqt = &priv->quant_tables;
+        VAQMatrixBufferJPEG *quant;
+
+        if (*data_len < sizeof(*quant))
+            return AVERROR(ENOSPC);
+        *type     = VAQMatrixBufferType;
+        *data_len = sizeof(*quant);
+
+        quant = (VAQMatrixBufferJPEG*)data;
+        memset(quant, 0, sizeof(*quant));
+
+        quant->load_lum_quantiser_matrix = 1;
+        for (i = 0; i < 64; i++)
+            quant->lum_quantiser_matrix[i] = dqt->table[fh->Tq[0]].Q[i];
+
+        if (fh->Nf > 1) {
+            quant->load_chroma_quantiser_matrix = 1;
+            for (i = 0; i < 64; i++)
+                quant->chroma_quantiser_matrix[i] =
+                    dqt->table[fh->Tq[1]].Q[i];
+        }
 
-    vaapi_encode_mjpeg_write_marker(&pbc, SOI);
+    } else if (index == 1) {
+        // Write huffman tables.
+        JPEGRawScanHeader                 *sh = &priv->scan.header;
+        JPEGRawHuffmanTableSpecification *dht = &priv->huffman_tables;
+        VAHuffmanTableBufferJPEGBaseline *huff;
+
+        if (*data_len < sizeof(*huff))
+            return AVERROR(ENOSPC);
+        *type     = VAHuffmanTableBufferType;
+        *data_len = sizeof(*huff);
+
+        huff = (VAHuffmanTableBufferJPEGBaseline*)data;
+        memset(huff, 0, sizeof(*huff));
+
+        for (t = 0; t < 1 + (sh->Ns > 1); t++) {
+            const JPEGRawHuffmanTable *ht;
+
+            huff->load_huffman_table[t] = 1;
+
+            ht = &dht->table[2 * t];
+            for (i = k = 0; i < 16; i++)
+                k += (huff->huffman_table[t].num_dc_codes[i] = ht->L[i]);
+            av_assert0(k <= sizeof(huff->huffman_table[t].dc_values));
+            for (i = 0; i < k; i++)
+                huff->huffman_table[t].dc_values[i] = ht->V[i];
+
+            ht = &dht->table[2 * t + 1];
+            for (i = k = 0; i < 16; i++)
+                k += (huff->huffman_table[t].num_ac_codes[i] = ht->L[i]);
+            av_assert0(k <= sizeof(huff->huffman_table[t].ac_values));
+            for (i = 0; i < k; i++)
+                huff->huffman_table[t].ac_values[i] = ht->V[i];
+        }
 
-    // Quantisation table coefficients are scaled for quality by the driver,
-    // so we also need to do it ourselves here so that headers match.
-    if (priv->quality < 50)
-        quant_scale = 5000 / priv->quality;
-    else
-        quant_scale = 200 - 2 * priv->quality;
+    } else {
+        return AVERROR_EOF;
+    }
+    return 0;
+}
 
-    for (t = 0; t < 2; t++) {
-        int q;
+static int vaapi_encode_mjpeg_init_picture_params(AVCodecContext *avctx,
+                                                  VAAPIEncodePicture *pic)
+{
+    VAAPIEncodeMJPEGContext         *priv = avctx->priv_data;
+    JPEGRawFrameHeader                *fh = &priv->frame_header;
+    JPEGRawScanHeader                 *sh = &priv->scan.header;
+    VAEncPictureParameterBufferJPEG *vpic = pic->codec_picture_params;
+    const AVPixFmtDescriptor *desc;
+    const uint8_t *components;
+    int t, i, quant_scale, len;
 
-        vaapi_encode_mjpeg_write_marker(&pbc, DQT);
+    av_assert0(pic->type == PICTURE_TYPE_IDR);
 
-        put_bits(&pbc, 16, 3 + 64); // Lq
-        put_bits(&pbc, 4, 0); // Pq
-        put_bits(&pbc, 4, t); // Tq
+    desc = av_pix_fmt_desc_get(priv->common.input_frames->sw_format);
+    av_assert0(desc);
+    if (desc->flags & AV_PIX_FMT_FLAG_RGB)
+        components = (uint8_t[3]) { 'R', 'G', 'B' };
+    else
+        components = (uint8_t[3]) {  1,   2,   3  };
 
-        for (i = 0; i < 64; i++) {
-            q = i[t ? priv->quant_tables.chroma_quantiser_matrix
-                    : priv->quant_tables.lum_quantiser_matrix];
-            q = (q * quant_scale) / 100;
-            if (q < 1)   q = 1;
-            if (q > 255) q = 255;
-            put_bits(&pbc, 8, q);
-        }
-    }
+    // Frame header.
 
-    vaapi_encode_mjpeg_write_marker(&pbc, SOF0);
+    fh->P  = 8;
+    fh->Y  = avctx->height;
+    fh->X  = avctx->width;
+    fh->Nf = desc->nb_components;
 
-    put_bits(&pbc, 16, 8 + 3 * vpic->num_components); // Lf
-    put_bits(&pbc, 8,  vpic->sample_bit_depth); // P
-    put_bits(&pbc, 16, vpic->picture_height);   // Y
-    put_bits(&pbc, 16, vpic->picture_width);    // X
-    put_bits(&pbc, 8,  vpic->num_components);   // Nf
+    for (i = 0; i < fh->Nf; i++) {
+        fh->C[i] = components[i];
+        fh->H[i] = 1 + (i == 0 ? desc->log2_chroma_w : 0);
+        fh->V[i] = 1 + (i == 0 ? desc->log2_chroma_h : 0);
 
-    for (i = 0; i < vpic->num_components; i++) {
-        put_bits(&pbc, 8, vpic->component_id[i]); // Ci
-        put_bits(&pbc, 4, priv->component_subsample_h[i]); // Hi
-        put_bits(&pbc, 4, priv->component_subsample_v[i]); // Vi
-        put_bits(&pbc, 8, vpic->quantiser_table_selector[i]); // Tqi
+        fh->Tq[i] = !!i;
     }
 
-    for (t = 0; t < 4; t++) {
-        int mt;
-        unsigned char *lengths, *values;
+    fh->Lf = 8 + 3 * fh->Nf;
+
+    // JFIF header.
+    if (priv->jfif) {
+        JPEGRawApplicationData *app = &priv->jfif_header;
+        AVRational sar = pic->input_image->sample_aspect_ratio;
+        int sar_w, sar_h;
+        PutByteContext pbc;
 
-        vaapi_encode_mjpeg_write_marker(&pbc, DHT);
+        bytestream2_init_writer(&pbc, priv->jfif_data,
+                                sizeof(priv->jfif_data));
 
-        if ((t & 1) == 0) {
-            lengths = priv->huffman_tables.huffman_table[t / 2].num_dc_codes;
-            values  = priv->huffman_tables.huffman_table[t / 2].dc_values;
+        bytestream2_put_buffer(&pbc, "JFIF", 5);
+        bytestream2_put_be16(&pbc, 0x0102);
+        bytestream2_put_byte(&pbc, 0);
+
+        av_reduce(&sar_w, &sar_h, sar.num, sar.den, 65535);
+        if (sar_w && sar_h) {
+            bytestream2_put_be16(&pbc, sar_w);
+            bytestream2_put_be16(&pbc, sar_h);
         } else {
-            lengths = priv->huffman_tables.huffman_table[t / 2].num_ac_codes;
-            values  = priv->huffman_tables.huffman_table[t / 2].ac_values;
+            bytestream2_put_be16(&pbc, 1);
+            bytestream2_put_be16(&pbc, 1);
         }
 
-        mt = 0;
-        for (i = 0; i < 16; i++)
-            mt += lengths[i];
+        bytestream2_put_byte(&pbc, 0);
+        bytestream2_put_byte(&pbc, 0);
 
-        put_bits(&pbc, 16, 2 + 17 + mt); // Lh
-        put_bits(&pbc, 4, t & 1); // Tc
-        put_bits(&pbc, 4, t / 2); // Th
+        av_assert0(bytestream2_get_bytes_left_p(&pbc) == 0);
 
-        for (i = 0; i < 16; i++)
-            put_bits(&pbc, 8, lengths[i]);
-        for (i = 0; i < mt; i++)
-            put_bits(&pbc, 8, values[i]);
+        app->Lp     = 2 + sizeof(priv->jfif_data);
+        app->Ap     = priv->jfif_data;
+        app->Ap_ref = NULL;
     }
 
-    vaapi_encode_mjpeg_write_marker(&pbc, SOS);
+    // Quantisation tables.
 
-    av_assert0(vpic->num_components == vslice->num_components);
+    if (priv->quality < 50)
+        quant_scale = 5000 / priv->quality;
+    else
+        quant_scale = 200 - 2 * priv->quality;
 
-    put_bits(&pbc, 16, 6 + 2 * vslice->num_components); // Ls
-    put_bits(&pbc, 8,  vslice->num_components); // Ns
+    len = 2;
 
-    for (i = 0; i < vslice->num_components; i++) {
-        put_bits(&pbc, 8, vslice->components[i].component_selector); // Csj
-        put_bits(&pbc, 4, vslice->components[i].dc_table_selector);  // Tdj
-        put_bits(&pbc, 4, vslice->components[i].ac_table_selector);  // Taj
-    }
+    for (t = 0; t < 1 + (fh->Nf > 1); t++) {
+        JPEGRawQuantisationTable *quant = &priv->quant_tables.table[t];
+        const uint8_t *data = t == 0 ?
+            vaapi_encode_mjpeg_quant_luminance :
+            vaapi_encode_mjpeg_quant_chrominance;
 
-    put_bits(&pbc, 8, 0); // Ss
-    put_bits(&pbc, 8, 63); // Se
-    put_bits(&pbc, 4, 0); // Ah
-    put_bits(&pbc, 4, 0); // Al
+        quant->Pq = 0;
+        quant->Tq = t;
+        for (i = 0; i < 64; i++)
+            quant->Q[i] = av_clip(data[i] * quant_scale / 100, 1, 255);
 
-    *data_len = put_bits_count(&pbc);
-    flush_put_bits(&pbc);
+        len += 65;
+    }
 
-    return 0;
-}
+    priv->quant_tables.Lq = len;
+
+    // Huffman tables.
+
+    len = 2;
+
+    for (t = 0; t < 2 + 2 * (fh->Nf > 1); t++) {
+        JPEGRawHuffmanTable *huff = &priv->huffman_tables.table[t];
+        const uint8_t *lengths, *values;
+        int k;
+
+        switch (t) {
+        case 0:
+            lengths = avpriv_mjpeg_bits_dc_luminance + 1;
+            values  = avpriv_mjpeg_val_dc;
+            break;
+        case 1:
+            lengths = avpriv_mjpeg_bits_ac_luminance + 1;
+            values  = avpriv_mjpeg_val_ac_luminance;
+            break;
+        case 2:
+            lengths = avpriv_mjpeg_bits_dc_chrominance + 1;
+            values  = avpriv_mjpeg_val_dc;
+            break;
+        case 3:
+            lengths = avpriv_mjpeg_bits_ac_chrominance + 1;
+            values  = avpriv_mjpeg_val_ac_chrominance;
+            break;
+        }
 
-static int vaapi_encode_mjpeg_write_extra_buffer(AVCodecContext *avctx,
-                                                 VAAPIEncodePicture *pic,
-                                                 int index, int *type,
-                                                 char *data, size_t *data_len)
-{
-    VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMJPEGContext *priv = ctx->priv_data;
+        huff->Tc = t % 2;
+        huff->Th = t / 2;
 
-    if (index == 0) {
-        // Write quantisation tables.
-        if (*data_len < sizeof(priv->quant_tables))
-            return AVERROR(EINVAL);
-        *type = VAQMatrixBufferType;
-        memcpy(data, &priv->quant_tables,
-               *data_len = sizeof(priv->quant_tables));
+        for (i = k = 0; i < 16; i++)
+            k += (huff->L[i] = lengths[i]);
 
-    } else if (index == 1) {
-        // Write huffman tables.
-        if (*data_len < sizeof(priv->huffman_tables))
-            return AVERROR(EINVAL);
-        *type = VAHuffmanTableBufferType;
-        memcpy(data, &priv->huffman_tables,
-               *data_len = sizeof(priv->huffman_tables));
+        for (i = 0; i < k; i++)
+            huff->V[i] = values[i];
 
-    } else {
-        return AVERROR_EOF;
+        len += 17 + k;
     }
-    return 0;
-}
 
-static int vaapi_encode_mjpeg_init_picture_params(AVCodecContext *avctx,
-                                                  VAAPIEncodePicture *pic)
-{
-    VAAPIEncodeContext               *ctx = avctx->priv_data;
-    VAEncPictureParameterBufferJPEG *vpic = pic->codec_picture_params;
-    VAAPIEncodeMJPEGContext         *priv = ctx->priv_data;
+    priv->huffman_tables.Lh = len;
+
+    // Scan header.
+
+    sh->Ns = fh->Nf;
 
-    vpic->reconstructed_picture = pic->recon_surface;
-    vpic->coded_buf = pic->output_buffer;
+    for (i = 0; i < fh->Nf; i++) {
+        sh->Cs[i] = fh->C[i];
+        sh->Td[i] = i > 0;
+        sh->Ta[i] = i > 0;
+    }
+
+    sh->Ss = 0;
+    sh->Se = 63;
+    sh->Ah = 0;
+    sh->Al = 0;
 
-    vpic->picture_width  = avctx->width;
-    vpic->picture_height = avctx->height;
+    sh->Ls = 6 + 2 * sh->Ns;
 
-    vpic->pic_flags.bits.profile      = 0;
-    vpic->pic_flags.bits.progressive  = 0;
-    vpic->pic_flags.bits.huffman      = 1;
-    vpic->pic_flags.bits.interleaved  = 0;
-    vpic->pic_flags.bits.differential = 0;
 
-    vpic->sample_bit_depth = 8;
-    vpic->num_scan = 1;
+    *vpic = (VAEncPictureParameterBufferJPEG) {
+        .reconstructed_picture = pic->recon_surface,
+        .coded_buf             = pic->output_buffer,
 
-    vpic->num_components = 3;
+        .picture_width  = fh->X,
+        .picture_height = fh->Y,
 
-    vpic->component_id[0] = 1;
-    vpic->component_id[1] = 2;
-    vpic->component_id[2] = 3;
+        .pic_flags.bits = {
+            .profile      = 0,
+            .progressive  = 0,
+            .huffman      = 1,
+            .interleaved  = 0,
+            .differential = 0,
+        },
 
-    priv->component_subsample_h[0] = 2;
-    priv->component_subsample_v[0] = 2;
-    priv->component_subsample_h[1] = 1;
-    priv->component_subsample_v[1] = 1;
-    priv->component_subsample_h[2] = 1;
-    priv->component_subsample_v[2] = 1;
+        .sample_bit_depth = fh->P,
+        .num_scan         = 1,
+        .num_components   = fh->Nf,
 
-    vpic->quantiser_table_selector[0] = 0;
-    vpic->quantiser_table_selector[1] = 1;
-    vpic->quantiser_table_selector[2] = 1;
+        // The driver modifies the provided quantisation tables according
+        // to this quality value; the middle value of 50 makes that the
+        // identity so that they are used unchanged.
+        .quality = 50,
+    };
 
-    vpic->quality = priv->quality;
+    for (i = 0; i < fh->Nf; i++) {
+        vpic->component_id[i]             = fh->C[i];
+        vpic->quantiser_table_selector[i] = fh->Tq[i];
+    }
 
     pic->nb_slices = 1;
 
@@ -317,17 +413,20 @@ static int vaapi_encode_mjpeg_init_slice_params(AVCodecContext *avctx,
                                                 VAAPIEncodePicture *pic,
                                                 VAAPIEncodeSlice *slice)
 {
-    VAEncPictureParameterBufferJPEG *vpic = pic->codec_picture_params;
+    VAAPIEncodeMJPEGContext         *priv = avctx->priv_data;
+    JPEGRawScanHeader                 *sh = &priv->scan.header;
     VAEncSliceParameterBufferJPEG *vslice = slice->codec_slice_params;
     int i;
 
-    vslice->restart_interval = 0;
+    *vslice = (VAEncSliceParameterBufferJPEG) {
+        .restart_interval = 0,
+        .num_components   = sh->Ns,
+    };
 
-    vslice->num_components = vpic->num_components;
-    for (i = 0; i < vslice->num_components; i++) {
-        vslice->components[i].component_selector = i + 1;
-        vslice->components[i].dc_table_selector = (i > 0);
-        vslice->components[i].ac_table_selector = (i > 0);
+    for (i = 0; i < sh->Ns; i++) {
+        vslice->components[i].component_selector = sh->Cs[i];
+        vslice->components[i].dc_table_selector  = sh->Td[i];
+        vslice->components[i].ac_table_selector  = sh->Ta[i];
     }
 
     return 0;
@@ -336,9 +435,10 @@ static int vaapi_encode_mjpeg_init_slice_params(AVCodecContext *avctx,
 static av_cold int vaapi_encode_mjpeg_configure(AVCodecContext *avctx)
 {
     VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMJPEGContext *priv = ctx->priv_data;
+    VAAPIEncodeMJPEGContext *priv = avctx->priv_data;
+    int err;
 
-    priv->quality = avctx->global_quality;
+    priv->quality = ctx->rc_quality;
     if (priv->quality < 1 || priv->quality > 100) {
         av_log(avctx, AV_LOG_ERROR, "Invalid quality value %d "
                "(must be 1-100).\n", priv->quality);
@@ -356,16 +456,35 @@ static av_cold int vaapi_encode_mjpeg_configure(AVCodecContext *avctx)
         ctx->va_packed_headers |=  VA_ENC_PACKED_HEADER_SLICE;
     }
 
-    vaapi_encode_mjpeg_init_tables(avctx);
+    err = ff_cbs_init(&priv->cbc, AV_CODEC_ID_MJPEG, avctx);
+    if (err < 0)
+        return err;
 
     return 0;
 }
 
+static const VAAPIEncodeProfile vaapi_encode_mjpeg_profiles[] = {
+    { FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT,
+            8, 1, 0, 0, VAProfileJPEGBaseline },
+    { FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT,
+            8, 3, 1, 1, VAProfileJPEGBaseline },
+    { FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT,
+            8, 3, 1, 0, VAProfileJPEGBaseline },
+    { FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT,
+            8, 3, 0, 0, VAProfileJPEGBaseline },
+    { FF_PROFILE_UNKNOWN }
+};
+
 static const VAAPIEncodeType vaapi_encode_type_mjpeg = {
-    .priv_data_size        = sizeof(VAAPIEncodeMJPEGContext),
+    .profiles              = vaapi_encode_mjpeg_profiles,
+
+    .flags                 = FLAG_CONSTANT_QUALITY_ONLY |
+                             FLAG_INTRA_ONLY,
 
     .configure             = &vaapi_encode_mjpeg_configure,
 
+    .default_quality       = 80,
+
     .picture_params_size   = sizeof(VAEncPictureParameterBufferJPEG),
     .init_picture_params   = &vaapi_encode_mjpeg_init_picture_params,
 
@@ -384,15 +503,8 @@ static av_cold int vaapi_encode_mjpeg_init(AVCodecContext *avctx)
 
     ctx->codec = &vaapi_encode_type_mjpeg;
 
-    ctx->va_profile    = VAProfileJPEGBaseline;
-    ctx->va_entrypoint = VAEntrypointEncPicture;
-
-    ctx->va_rt_format = VA_RT_FORMAT_YUV420;
-
-    ctx->va_rc_mode = VA_RC_CQP;
-
     // The JPEG image header - see note above.
-    ctx->va_packed_headers =
+    ctx->desired_packed_headers =
         VA_ENC_PACKED_HEADER_RAW_DATA;
 
     ctx->surface_width  = FFALIGN(avctx->width,  8);
@@ -401,14 +513,40 @@ static av_cold int vaapi_encode_mjpeg_init(AVCodecContext *avctx)
     return ff_vaapi_encode_init(avctx);
 }
 
+static av_cold int vaapi_encode_mjpeg_close(AVCodecContext *avctx)
+{
+    VAAPIEncodeMJPEGContext *priv = avctx->priv_data;
+
+    ff_cbs_fragment_free(priv->cbc, &priv->current_fragment);
+    ff_cbs_close(&priv->cbc);
+
+    return ff_vaapi_encode_close(avctx);
+}
+
+#define OFFSET(x) offsetof(VAAPIEncodeMJPEGContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
+static const AVOption vaapi_encode_mjpeg_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+
+    { "jfif", "Include JFIF header",
+      OFFSET(jfif), AV_OPT_TYPE_BOOL,
+      { .i64 = 0 }, 0, 1, FLAGS },
+    { "huffman", "Include huffman tables",
+      OFFSET(huffman), AV_OPT_TYPE_BOOL,
+      { .i64 = 1 }, 0, 1, FLAGS },
+
+    { NULL },
+};
+
 static const AVCodecDefault vaapi_encode_mjpeg_defaults[] = {
-    { "global_quality", "80" },
+    { "b",              "0"  },
     { NULL },
 };
 
 static const AVClass vaapi_encode_mjpeg_class = {
     .class_name = "mjpeg_vaapi",
     .item_name  = av_default_item_name,
+    .option     = vaapi_encode_mjpeg_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
@@ -417,12 +555,14 @@ AVCodec ff_mjpeg_vaapi_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("MJPEG (VAAPI)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_MJPEG,
-    .priv_data_size = sizeof(VAAPIEncodeContext),
+    .priv_data_size = sizeof(VAAPIEncodeMJPEGContext),
     .init           = &vaapi_encode_mjpeg_init,
-    .encode2        = &ff_vaapi_encode2,
-    .close          = &ff_vaapi_encode_close,
+    .send_frame     = &ff_vaapi_encode_send_frame,
+    .receive_packet = &ff_vaapi_encode_receive_packet,
+    .close          = &vaapi_encode_mjpeg_close,
     .priv_class     = &vaapi_encode_mjpeg_class,
-    .capabilities   = AV_CODEC_CAP_HARDWARE,
+    .capabilities   = AV_CODEC_CAP_HARDWARE |
+                      AV_CODEC_CAP_INTRA_ONLY,
     .defaults       = vaapi_encode_mjpeg_defaults,
     .pix_fmts = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_VAAPI,
diff --git a/libavcodec/vaapi_encode_mpeg2.c b/libavcodec/vaapi_encode_mpeg2.c
index df26ed4..fb1ef71 100644
--- a/libavcodec/vaapi_encode_mpeg2.c
+++ b/libavcodec/vaapi_encode_mpeg2.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,22 +28,17 @@
 #include "vaapi_encode.h"
 
 typedef struct VAAPIEncodeMPEG2Context {
-    int mb_width;
-    int mb_height;
+    VAAPIEncodeContext common;
 
+    // User options.
+    int profile;
+    int level;
+
+    // Derived settings.
     int quant_i;
     int quant_p;
     int quant_b;
 
-    MPEG2RawSequenceHeader sequence_header;
-    MPEG2RawExtensionData  sequence_extension;
-    MPEG2RawExtensionData  sequence_display_extension;
-    MPEG2RawGroupOfPicturesHeader gop_header;
-    MPEG2RawPictureHeader  picture_header;
-    MPEG2RawExtensionData  picture_coding_extension;
-
-    int64_t last_i_frame;
-
     unsigned int bit_rate;
     unsigned int vbv_buffer_size;
 
@@ -52,6 +47,17 @@ typedef struct VAAPIEncodeMPEG2Context {
     unsigned int f_code_horizontal;
     unsigned int f_code_vertical;
 
+    // Stream state.
+    int64_t last_i_frame;
+
+    // Writer structures.
+    MPEG2RawSequenceHeader sequence_header;
+    MPEG2RawExtensionData  sequence_extension;
+    MPEG2RawExtensionData  sequence_display_extension;
+    MPEG2RawGroupOfPicturesHeader gop_header;
+    MPEG2RawPictureHeader  picture_header;
+    MPEG2RawExtensionData  picture_coding_extension;
+
     CodedBitstreamContext *cbc;
     CodedBitstreamFragment current_fragment;
 } VAAPIEncodeMPEG2Context;
@@ -61,8 +67,7 @@ static int vaapi_encode_mpeg2_write_fragment(AVCodecContext *avctx,
                                              char *data, size_t *data_len,
                                              CodedBitstreamFragment *frag)
 {
-    VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_write_fragment_data(priv->cbc, frag);
@@ -88,8 +93,7 @@ static int vaapi_encode_mpeg2_add_header(AVCodecContext *avctx,
                                          CodedBitstreamFragment *frag,
                                          int type, void *header)
 {
-    VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_insert_unit_content(priv->cbc, frag, -1, type, header, NULL);
@@ -105,8 +109,7 @@ static int vaapi_encode_mpeg2_add_header(AVCodecContext *avctx,
 static int vaapi_encode_mpeg2_write_sequence_header(AVCodecContext *avctx,
                                                     char *data, size_t *data_len)
 {
-    VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
     CodedBitstreamFragment  *frag = &priv->current_fragment;
     int err;
 
@@ -132,7 +135,7 @@ static int vaapi_encode_mpeg2_write_sequence_header(AVCodecContext *avctx,
 
     err = vaapi_encode_mpeg2_write_fragment(avctx, data, data_len, frag);
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, frag);
+    ff_cbs_fragment_reset(priv->cbc, frag);
     return 0;
 }
 
@@ -140,8 +143,7 @@ static int vaapi_encode_mpeg2_write_picture_header(AVCodecContext *avctx,
                                                    VAAPIEncodePicture *pic,
                                                    char *data, size_t *data_len)
 {
-    VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
     CodedBitstreamFragment  *frag = &priv->current_fragment;
     int err;
 
@@ -157,14 +159,14 @@ static int vaapi_encode_mpeg2_write_picture_header(AVCodecContext *avctx,
 
     err = vaapi_encode_mpeg2_write_fragment(avctx, data, data_len, frag);
 fail:
-    ff_cbs_fragment_uninit(priv->cbc, frag);
+    ff_cbs_fragment_reset(priv->cbc, frag);
     return 0;
 }
 
 static int vaapi_encode_mpeg2_init_sequence_params(AVCodecContext *avctx)
 {
     VAAPIEncodeContext                 *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context           *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context           *priv = avctx->priv_data;
     MPEG2RawSequenceHeader              *sh = &priv->sequence_header;
     MPEG2RawSequenceExtension           *se = &priv->sequence_extension.data.sequence;
     MPEG2RawSequenceDisplayExtension   *sde = &priv->sequence_display_extension.data.sequence_display;
@@ -183,8 +185,8 @@ static int vaapi_encode_mpeg2_init_sequence_params(AVCodecContext *avctx)
     memset(pce,  0, sizeof(*pce));
 
 
-    if (avctx->bit_rate > 0) {
-        priv->bit_rate = (avctx->bit_rate + 399) / 400;
+    if (ctx->va_bit_rate > 0) {
+        priv->bit_rate = (ctx->va_bit_rate + 399) / 400;
     } else {
         // Unknown (not a bitrate-targetting mode), so just use the
         // highest value.
@@ -311,7 +313,8 @@ static int vaapi_encode_mpeg2_init_sequence_params(AVCodecContext *avctx)
 
     goph->group_start_code = MPEG2_START_GROUP;
 
-    goph->time_code   = 0;
+    // Marker bit in the middle of time_code.
+    goph->time_code   = 1 << 12;
     goph->closed_gop  = 1;
     goph->broken_link = 0;
 
@@ -350,13 +353,13 @@ static int vaapi_encode_mpeg2_init_sequence_params(AVCodecContext *avctx)
 
 
     *vseq = (VAEncSequenceParameterBufferMPEG2) {
-        .intra_period = avctx->gop_size,
+        .intra_period = ctx->gop_size,
         .ip_period    = ctx->b_per_p + 1,
 
         .picture_width  = avctx->width,
         .picture_height = avctx->height,
 
-        .bits_per_second          = avctx->bit_rate,
+        .bits_per_second          = ctx->va_bit_rate,
         .frame_rate               = av_q2d(priv->frame_rate),
         .aspect_ratio_information = sh->aspect_ratio_information,
         .vbv_buffer_size          = priv->vbv_buffer_size,
@@ -416,8 +419,7 @@ static int vaapi_encode_mpeg2_init_sequence_params(AVCodecContext *avctx)
 static int vaapi_encode_mpeg2_init_picture_params(AVCodecContext *avctx,
                                                  VAAPIEncodePicture *pic)
 {
-    VAAPIEncodeContext                *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context          *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context          *priv = avctx->priv_data;
     MPEG2RawPictureHeader              *ph = &priv->picture_header;
     MPEG2RawPictureCodingExtension    *pce = &priv->picture_coding_extension.data.picture_coding;
     VAEncPictureParameterBufferMPEG2 *vpic = pic->codec_picture_params;
@@ -473,8 +475,6 @@ static int vaapi_encode_mpeg2_init_picture_params(AVCodecContext *avctx,
     vpic->f_code[1][0]       = pce->f_code[1][0];
     vpic->f_code[1][1]       = pce->f_code[1][1];
 
-    pic->nb_slices = priv->mb_height;
-
     return 0;
 }
 
@@ -482,13 +482,12 @@ static int vaapi_encode_mpeg2_init_slice_params(AVCodecContext *avctx,
                                                VAAPIEncodePicture *pic,
                                                VAAPIEncodeSlice *slice)
 {
-    VAAPIEncodeContext                  *ctx = avctx->priv_data;
+    VAAPIEncodeMPEG2Context            *priv = avctx->priv_data;
     VAEncSliceParameterBufferMPEG2   *vslice = slice->codec_slice_params;
-    VAAPIEncodeMPEG2Context            *priv = ctx->priv_data;
     int qp;
 
-    vslice->macroblock_address = priv->mb_width * slice->index;
-    vslice->num_macroblocks    = priv->mb_width;
+    vslice->macroblock_address = slice->block_start;
+    vslice->num_macroblocks    = slice->block_size;
 
     switch (pic->type) {
     case PICTURE_TYPE_IDR:
@@ -515,30 +514,25 @@ static int vaapi_encode_mpeg2_init_slice_params(AVCodecContext *avctx,
 static av_cold int vaapi_encode_mpeg2_configure(AVCodecContext *avctx)
 {
     VAAPIEncodeContext       *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
     int err;
 
     err = ff_cbs_init(&priv->cbc, AV_CODEC_ID_MPEG2VIDEO, avctx);
     if (err < 0)
         return err;
 
-    priv->mb_width  = FFALIGN(avctx->width,  16) / 16;
-    priv->mb_height = FFALIGN(avctx->height, 16) / 16;
-
     if (ctx->va_rc_mode == VA_RC_CQP) {
-        priv->quant_p = av_clip(avctx->global_quality, 1, 31);
+        priv->quant_p = av_clip(ctx->rc_quality, 1, 31);
         if (avctx->i_quant_factor > 0.0)
-            priv->quant_i = av_clip((avctx->global_quality *
-                                     avctx->i_quant_factor +
-                                     avctx->i_quant_offset) + 0.5,
-                                    1, 31);
+            priv->quant_i =
+                av_clip((avctx->i_quant_factor * priv->quant_p +
+                         avctx->i_quant_offset) + 0.5, 1, 31);
         else
             priv->quant_i = priv->quant_p;
         if (avctx->b_quant_factor > 0.0)
-            priv->quant_b = av_clip((avctx->global_quality *
-                                     avctx->b_quant_factor +
-                                     avctx->b_quant_offset) + 0.5,
-                                    1, 31);
+            priv->quant_b =
+                av_clip((avctx->b_quant_factor * priv->quant_p +
+                         avctx->b_quant_offset) + 0.5, 1, 31);
         else
             priv->quant_b = priv->quant_p;
 
@@ -547,17 +541,35 @@ static av_cold int vaapi_encode_mpeg2_configure(AVCodecContext *avctx)
                priv->quant_i, priv->quant_p, priv->quant_b);
 
     } else {
-        av_assert0(0 && "Invalid RC mode.");
+        priv->quant_i = 16;
+        priv->quant_p = 16;
+        priv->quant_b = 16;
     }
 
+    ctx->slice_block_rows = FFALIGN(avctx->height, 16) / 16;
+    ctx->slice_block_cols = FFALIGN(avctx->width,  16) / 16;
+
+    ctx->nb_slices  = ctx->slice_block_rows;
+    ctx->slice_size = 1;
+
     return 0;
 }
 
+static const VAAPIEncodeProfile vaapi_encode_mpeg2_profiles[] = {
+    { FF_PROFILE_MPEG2_MAIN,   8, 3, 1, 1, VAProfileMPEG2Main   },
+    { FF_PROFILE_MPEG2_SIMPLE, 8, 3, 1, 1, VAProfileMPEG2Simple },
+    { FF_PROFILE_UNKNOWN }
+};
+
 static const VAAPIEncodeType vaapi_encode_type_mpeg2 = {
-    .priv_data_size        = sizeof(VAAPIEncodeMPEG2Context),
+    .profiles              = vaapi_encode_mpeg2_profiles,
+
+    .flags                 = FLAG_B_PICTURES,
 
     .configure             = &vaapi_encode_mpeg2_configure,
 
+    .default_quality       = 10,
+
     .sequence_params_size  = sizeof(VAEncSequenceParameterBufferMPEG2),
     .init_sequence_params  = &vaapi_encode_mpeg2_init_sequence_params,
 
@@ -576,35 +588,18 @@ static const VAAPIEncodeType vaapi_encode_type_mpeg2 = {
 
 static av_cold int vaapi_encode_mpeg2_init(AVCodecContext *avctx)
 {
-    VAAPIEncodeContext *ctx = avctx->priv_data;
+    VAAPIEncodeContext       *ctx = avctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
 
     ctx->codec = &vaapi_encode_type_mpeg2;
 
-    switch (avctx->profile) {
-    case FF_PROFILE_MPEG2_SIMPLE:
-        ctx->va_profile = VAProfileMPEG2Simple;
-        break;
-    case FF_PROFILE_MPEG2_MAIN:
-        ctx->va_profile = VAProfileMPEG2Main;
-        break;
-    case FF_PROFILE_MPEG2_422:
-        av_log(avctx, AV_LOG_ERROR, "MPEG-2 4:2:2 profile "
-               "is not supported.\n");
-        return AVERROR_PATCHWELCOME;
-    case FF_PROFILE_MPEG2_HIGH:
-        av_log(avctx, AV_LOG_ERROR, "MPEG-2 high profile "
-               "is not supported.\n");
-        return AVERROR_PATCHWELCOME;
-    case FF_PROFILE_MPEG2_SS:
-    case FF_PROFILE_MPEG2_SNR_SCALABLE:
-        av_log(avctx, AV_LOG_ERROR, "MPEG-2 scalable profiles "
-               "are not supported.\n");
-        return AVERROR_PATCHWELCOME;
-    default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown MPEG-2 profile %d.\n",
-               avctx->profile);
-        return AVERROR(EINVAL);
-    }
+    if (avctx->profile == FF_PROFILE_UNKNOWN)
+        avctx->profile = priv->profile;
+    if (avctx->level == FF_LEVEL_UNKNOWN)
+        avctx->level = priv->level;
+
+    // Reject unknown levels (these are required to set f_code for
+    // motion vector encoding).
     switch (avctx->level) {
     case 4: // High
     case 6: // High 1440
@@ -623,12 +618,8 @@ static av_cold int vaapi_encode_mpeg2_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    ctx->va_entrypoint = VAEntrypointEncSlice;
-    ctx->va_rt_format  = VA_RT_FORMAT_YUV420;
-    ctx->va_rc_mode    = VA_RC_CQP;
-
-    ctx->va_packed_headers = VA_ENC_PACKED_HEADER_SEQUENCE |
-                             VA_ENC_PACKED_HEADER_PICTURE;
+    ctx->desired_packed_headers = VA_ENC_PACKED_HEADER_SEQUENCE |
+                                  VA_ENC_PACKED_HEADER_PICTURE;
 
     ctx->surface_width  = FFALIGN(avctx->width,  16);
     ctx->surface_height = FFALIGN(avctx->height, 16);
@@ -638,37 +629,76 @@ static av_cold int vaapi_encode_mpeg2_init(AVCodecContext *avctx)
 
 static av_cold int vaapi_encode_mpeg2_close(AVCodecContext *avctx)
 {
-    VAAPIEncodeContext *ctx = avctx->priv_data;
-    VAAPIEncodeMPEG2Context *priv = ctx->priv_data;
+    VAAPIEncodeMPEG2Context *priv = avctx->priv_data;
 
-    if (priv)
-        ff_cbs_close(&priv->cbc);
+    ff_cbs_fragment_free(priv->cbc, &priv->current_fragment);
+    ff_cbs_close(&priv->cbc);
 
     return ff_vaapi_encode_close(avctx);
 }
 
+#define OFFSET(x) offsetof(VAAPIEncodeMPEG2Context, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
+static const AVOption vaapi_encode_mpeg2_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+    VAAPI_ENCODE_RC_OPTIONS,
+
+    { "profile", "Set profile (in profile_and_level_indication)",
+      OFFSET(profile), AV_OPT_TYPE_INT,
+      { .i64 = FF_PROFILE_UNKNOWN }, FF_PROFILE_UNKNOWN, 7, FLAGS, "profile" },
+
+#define PROFILE(name, value)  name, NULL, 0, AV_OPT_TYPE_CONST, \
+      { .i64 = value }, 0, 0, FLAGS, "profile"
+    { PROFILE("simple", FF_PROFILE_MPEG2_SIMPLE) },
+    { PROFILE("main",   FF_PROFILE_MPEG2_MAIN)   },
+#undef PROFILE
+
+    { "level", "Set level (in profile_and_level_indication)",
+      OFFSET(level), AV_OPT_TYPE_INT,
+      { .i64 = 4 }, 0, 15, FLAGS, "level" },
+
+#define LEVEL(name, value) name, NULL, 0, AV_OPT_TYPE_CONST, \
+      { .i64 = value }, 0, 0, FLAGS, "level"
+    { LEVEL("low",       10) },
+    { LEVEL("main",       8) },
+    { LEVEL("high_1440",  6) },
+    { LEVEL("high",       4) },
+#undef LEVEL
+
+    { NULL },
+};
+
 static const AVCodecDefault vaapi_encode_mpeg2_defaults[] = {
-    { "profile",        "4"   },
-    { "level",          "4"   },
+    { "b",              "0"   },
     { "bf",             "1"   },
     { "g",              "120" },
-    { "i_qfactor",      "1.0" },
-    { "i_qoffset",      "0.0" },
-    { "b_qfactor",      "1.2" },
-    { "b_qoffset",      "0.0" },
-    { "global_quality", "10"  },
+    { "i_qfactor",      "1"   },
+    { "i_qoffset",      "0"   },
+    { "b_qfactor",      "6/5" },
+    { "b_qoffset",      "0"   },
+    { "qmin",           "-1"  },
+    { "qmax",           "-1"  },
     { NULL },
 };
 
+static const AVClass vaapi_encode_mpeg2_class = {
+    .class_name = "mpeg2_vaapi",
+    .item_name  = av_default_item_name,
+    .option     = vaapi_encode_mpeg2_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_mpeg2_vaapi_encoder = {
     .name           = "mpeg2_vaapi",
     .long_name      = NULL_IF_CONFIG_SMALL("MPEG-2 (VAAPI)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_MPEG2VIDEO,
-    .priv_data_size = sizeof(VAAPIEncodeContext),
+    .priv_data_size = sizeof(VAAPIEncodeMPEG2Context),
     .init           = &vaapi_encode_mpeg2_init,
-    .encode2        = &ff_vaapi_encode2,
+    .send_frame     = &ff_vaapi_encode_send_frame,
+    .receive_packet = &ff_vaapi_encode_receive_packet,
     .close          = &vaapi_encode_mpeg2_close,
+    .priv_class     = &vaapi_encode_mpeg2_class,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
     .defaults       = vaapi_encode_mpeg2_defaults,
     .pix_fmts = (const enum AVPixelFormat[]) {
diff --git a/libavcodec/vaapi_encode_vp8.c b/libavcodec/vaapi_encode_vp8.c
index 857054d..ddbe4c9 100644
--- a/libavcodec/vaapi_encode_vp8.c
+++ b/libavcodec/vaapi_encode_vp8.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,17 +28,20 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "vaapi_encode.h"
+#include "vp8.h"
 
 
 typedef struct VAAPIEncodeVP8Context {
-    int q_index_i;
-    int q_index_p;
-} VAAPIEncodeVP8Context;
+    VAAPIEncodeContext common;
 
-typedef struct VAAPIEncodeVP8Options {
+    // User options.
     int loop_filter_level;
     int loop_filter_sharpness;
-} VAAPIEncodeVP8Options;
+
+    // Derived settings.
+    int q_index_i;
+    int q_index_p;
+} VAAPIEncodeVP8Context;
 
 
 #define vseq_var(name)     vseq->name, name
@@ -62,8 +65,8 @@ static int vaapi_encode_vp8_init_sequence_params(AVCodecContext *avctx)
     vseq->kf_auto = 0;
 
     if (!(ctx->va_rc_mode & VA_RC_CQP)) {
-        vseq->bits_per_second = avctx->bit_rate;
-        vseq->intra_period    = avctx->gop_size;
+        vseq->bits_per_second = ctx->va_bit_rate;
+        vseq->intra_period    = ctx->gop_size;
     }
 
     return 0;
@@ -72,9 +75,8 @@ static int vaapi_encode_vp8_init_sequence_params(AVCodecContext *avctx)
 static int vaapi_encode_vp8_init_picture_params(AVCodecContext *avctx,
                                                 VAAPIEncodePicture *pic)
 {
-    VAAPIEncodeContext              *ctx = avctx->priv_data;
+    VAAPIEncodeVP8Context          *priv = avctx->priv_data;
     VAEncPictureParameterBufferVP8 *vpic = pic->codec_picture_params;
-    VAAPIEncodeVP8Options           *opt = ctx->codec_options;
     int i;
 
     vpic->reconstructed_frame = pic->recon_surface;
@@ -115,8 +117,8 @@ static int vaapi_encode_vp8_init_picture_params(AVCodecContext *avctx,
     vpic->pic_flags.bits.version = 0;
     vpic->pic_flags.bits.loop_filter_type = 0;
     for (i = 0; i < 4; i++)
-        vpic->loop_filter_level[i] = opt->loop_filter_level;
-    vpic->sharpness_level = opt->loop_filter_sharpness;
+        vpic->loop_filter_level[i] = priv->loop_filter_level;
+    vpic->sharpness_level = priv->loop_filter_sharpness;
 
     vpic->clamp_qindex_low  = 0;
     vpic->clamp_qindex_high = 127;
@@ -129,8 +131,7 @@ static int vaapi_encode_vp8_write_quant_table(AVCodecContext *avctx,
                                               int index, int *type,
                                               char *data, size_t *data_len)
 {
-    VAAPIEncodeContext     *ctx = avctx->priv_data;
-    VAAPIEncodeVP8Context *priv = ctx->priv_data;
+    VAAPIEncodeVP8Context *priv = avctx->priv_data;
     VAQMatrixBufferVP8 quant;
     int i, q;
 
@@ -142,6 +143,8 @@ static int vaapi_encode_vp8_write_quant_table(AVCodecContext *avctx,
     *type     = VAQMatrixBufferType;
     *data_len = sizeof(quant);
 
+    memset(&quant, 0, sizeof(quant));
+
     if (pic->type == PICTURE_TYPE_P)
         q = priv->q_index_p;
     else
@@ -159,24 +162,31 @@ static int vaapi_encode_vp8_write_quant_table(AVCodecContext *avctx,
 static av_cold int vaapi_encode_vp8_configure(AVCodecContext *avctx)
 {
     VAAPIEncodeContext     *ctx = avctx->priv_data;
-    VAAPIEncodeVP8Context *priv = ctx->priv_data;
+    VAAPIEncodeVP8Context *priv = avctx->priv_data;
 
-    priv->q_index_p = av_clip(avctx->global_quality, 0, 127);
+    priv->q_index_p = av_clip(ctx->rc_quality, 0, VP8_MAX_QUANT);
     if (avctx->i_quant_factor > 0.0)
-        priv->q_index_i = av_clip((avctx->global_quality *
-                                   avctx->i_quant_factor +
-                                   avctx->i_quant_offset) + 0.5,
-                                  0, 127);
+        priv->q_index_i =
+            av_clip((avctx->i_quant_factor * priv->q_index_p  +
+                     avctx->i_quant_offset) + 0.5,
+                    0, VP8_MAX_QUANT);
     else
         priv->q_index_i = priv->q_index_p;
 
     return 0;
 }
 
+static const VAAPIEncodeProfile vaapi_encode_vp8_profiles[] = {
+    { 0 /* VP8 has no profiles */, 8, 3, 1, 1, VAProfileVP8Version0_3 },
+    { FF_PROFILE_UNKNOWN }
+};
+
 static const VAAPIEncodeType vaapi_encode_type_vp8 = {
+    .profiles              = vaapi_encode_vp8_profiles,
+
     .configure             = &vaapi_encode_vp8_configure,
 
-    .priv_data_size        = sizeof(VAAPIEncodeVP8Context),
+    .default_quality       = 40,
 
     .sequence_params_size  = sizeof(VAEncSequenceParameterBufferVP8),
     .init_sequence_params  = &vaapi_encode_vp8_init_sequence_params,
@@ -191,30 +201,12 @@ static av_cold int vaapi_encode_vp8_init(AVCodecContext *avctx)
 {
     VAAPIEncodeContext *ctx = avctx->priv_data;
 
-    if (avctx->max_b_frames > 0) {
-        av_log(avctx, AV_LOG_ERROR, "B-frames are not supported.\n");
-        return AVERROR_PATCHWELCOME;
-    }
-
     ctx->codec = &vaapi_encode_type_vp8;
 
-    ctx->va_profile    = VAProfileVP8Version0_3;
-    ctx->va_entrypoint = VAEntrypointEncSlice;
-    ctx->va_rt_format  = VA_RT_FORMAT_YUV420;
-
-    if (avctx->flags & AV_CODEC_FLAG_QSCALE) {
-        ctx->va_rc_mode = VA_RC_CQP;
-    } else if (avctx->bit_rate > 0) {
-        if (avctx->rc_max_rate == avctx->bit_rate)
-            ctx->va_rc_mode = VA_RC_CBR;
-        else
-            ctx->va_rc_mode = VA_RC_VBR;
-    } else {
-        ctx->va_rc_mode = VA_RC_CQP;
-    }
-
-    // Packed headers are not currently supported.
-    ctx->va_packed_headers = 0;
+    // No packed headers are currently desired.  VP8 has no metadata
+    // which would be useful to write, and no existing driver supports
+    // adding them anyway.
+    ctx->desired_packed_headers = 0;
 
     ctx->surface_width  = FFALIGN(avctx->width,  16);
     ctx->surface_height = FFALIGN(avctx->height, 16);
@@ -222,10 +214,12 @@ static av_cold int vaapi_encode_vp8_init(AVCodecContext *avctx)
     return ff_vaapi_encode_init(avctx);
 }
 
-#define OFFSET(x) (offsetof(VAAPIEncodeContext, codec_options_data) + \
-                   offsetof(VAAPIEncodeVP8Options, x))
+#define OFFSET(x) offsetof(VAAPIEncodeVP8Context, x)
 #define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 static const AVOption vaapi_encode_vp8_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+    VAAPI_ENCODE_RC_OPTIONS,
+
     { "loop_filter_level", "Loop filter level",
       OFFSET(loop_filter_level), AV_OPT_TYPE_INT, { .i64 = 16 }, 0, 63, FLAGS },
     { "loop_filter_sharpness", "Loop filter sharpness",
@@ -237,7 +231,8 @@ static const AVCodecDefault vaapi_encode_vp8_defaults[] = {
     { "b",              "0"   },
     { "bf",             "0"   },
     { "g",              "120" },
-    { "global_quality", "40"  },
+    { "qmin",           "-1"  },
+    { "qmax",           "-1"  },
     { NULL },
 };
 
@@ -253,10 +248,10 @@ AVCodec ff_vp8_vaapi_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("VP8 (VAAPI)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VP8,
-    .priv_data_size = (sizeof(VAAPIEncodeContext) +
-                       sizeof(VAAPIEncodeVP8Options)),
+    .priv_data_size = sizeof(VAAPIEncodeVP8Context),
     .init           = &vaapi_encode_vp8_init,
-    .encode2        = &ff_vaapi_encode2,
+    .send_frame     = &ff_vaapi_encode_send_frame,
+    .receive_packet = &ff_vaapi_encode_receive_packet,
     .close          = &ff_vaapi_encode_close,
     .priv_class     = &vaapi_encode_vp8_class,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
diff --git a/libavcodec/vaapi_encode_vp9.c b/libavcodec/vaapi_encode_vp9.c
index 6d9899d..f89fd0d 100644
--- a/libavcodec/vaapi_encode_vp9.c
+++ b/libavcodec/vaapi_encode_vp9.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,28 +29,25 @@
 #include "internal.h"
 #include "vaapi_encode.h"
 
+#define VP9_MAX_QUANT 255
 
-typedef struct VAAPIEncodeVP9Context {
-    int q_idx_idr;
-    int q_idx_p;
-    int q_idx_b;
 
-    // Reference direction for B-like frames:
-    // 0 - most recent P/IDR frame is last.
-    // 1 - most recent P frame is golden.
-    int last_ref_dir;
-} VAAPIEncodeVP9Context;
+typedef struct VAAPIEncodeVP9Picture {
+    int slot;
+} VAAPIEncodeVP9Picture;
 
-typedef struct VAAPIEncodeVP9Options {
+typedef struct VAAPIEncodeVP9Context {
+    VAAPIEncodeContext common;
+
+    // User options.
     int loop_filter_level;
     int loop_filter_sharpness;
-} VAAPIEncodeVP9Options;
-
 
-#define vseq_var(name)     vseq->name, name
-#define vseq_field(name)   vseq->seq_fields.bits.name, name
-#define vpic_var(name)     vpic->name, name
-#define vpic_field(name)   vpic->pic_fields.bits.name, name
+    // Derived settings.
+    int q_idx_idr;
+    int q_idx_p;
+    int q_idx_b;
+} VAAPIEncodeVP9Context;
 
 
 static int vaapi_encode_vp9_init_sequence_params(AVCodecContext *avctx)
@@ -65,8 +62,8 @@ static int vaapi_encode_vp9_init_sequence_params(AVCodecContext *avctx)
     vseq->kf_auto = 0;
 
     if (!(ctx->va_rc_mode & VA_RC_CQP)) {
-        vseq->bits_per_second = avctx->bit_rate;
-        vseq->intra_period    = avctx->gop_size;
+        vseq->bits_per_second = ctx->va_bit_rate;
+        vseq->intra_period    = ctx->gop_size;
     }
 
     vpic->frame_width_src  = avctx->width;
@@ -81,9 +78,9 @@ static int vaapi_encode_vp9_init_picture_params(AVCodecContext *avctx,
                                                 VAAPIEncodePicture *pic)
 {
     VAAPIEncodeContext              *ctx = avctx->priv_data;
+    VAAPIEncodeVP9Context          *priv = avctx->priv_data;
+    VAAPIEncodeVP9Picture          *hpic = pic->priv_data;
     VAEncPictureParameterBufferVP9 *vpic = pic->codec_picture_params;
-    VAAPIEncodeVP9Context          *priv = ctx->priv_data;
-    VAAPIEncodeVP9Options           *opt = ctx->codec_options;
     int i;
 
     vpic->reconstructed_frame = pic->recon_surface;
@@ -93,65 +90,71 @@ static int vaapi_encode_vp9_init_picture_params(AVCodecContext *avctx,
     case PICTURE_TYPE_IDR:
         av_assert0(pic->nb_refs == 0);
         vpic->ref_flags.bits.force_kf = 1;
-        vpic->refresh_frame_flags = 0x01;
-        priv->last_ref_dir = 0;
+        vpic->refresh_frame_flags = 0xff;
+        hpic->slot = 0;
         break;
     case PICTURE_TYPE_P:
         av_assert0(pic->nb_refs == 1);
-        if (avctx->max_b_frames > 0) {
-            if (priv->last_ref_dir) {
-                vpic->ref_flags.bits.ref_frame_ctrl_l0  = 2;
-                vpic->ref_flags.bits.ref_gf_idx         = 1;
-                vpic->ref_flags.bits.ref_gf_sign_bias   = 1;
-                vpic->refresh_frame_flags = 0x01;
+        {
+            VAAPIEncodeVP9Picture *href = pic->refs[0]->priv_data;
+            av_assert0(href->slot == 0 || href->slot == 1);
+
+            if (ctx->max_b_depth > 0) {
+                hpic->slot = !href->slot;
+                vpic->refresh_frame_flags = 1 << hpic->slot | 0xfc;
             } else {
-                vpic->ref_flags.bits.ref_frame_ctrl_l0  = 1;
-                vpic->ref_flags.bits.ref_last_idx       = 0;
-                vpic->ref_flags.bits.ref_last_sign_bias = 1;
-                vpic->refresh_frame_flags = 0x02;
+                hpic->slot = 0;
+                vpic->refresh_frame_flags = 0xff;
             }
-        } else {
             vpic->ref_flags.bits.ref_frame_ctrl_l0  = 1;
-            vpic->ref_flags.bits.ref_last_idx       = 0;
+            vpic->ref_flags.bits.ref_last_idx       = href->slot;
             vpic->ref_flags.bits.ref_last_sign_bias = 1;
-            vpic->refresh_frame_flags = 0x01;
         }
         break;
     case PICTURE_TYPE_B:
         av_assert0(pic->nb_refs == 2);
-        if (priv->last_ref_dir) {
+        {
+            VAAPIEncodeVP9Picture *href0 = pic->refs[0]->priv_data,
+                                  *href1 = pic->refs[1]->priv_data;
+            av_assert0(href0->slot < pic->b_depth + 1 &&
+                       href1->slot < pic->b_depth + 1);
+
+            if (pic->b_depth == ctx->max_b_depth) {
+                // Unreferenced frame.
+                vpic->refresh_frame_flags = 0x00;
+                hpic->slot = 8;
+            } else {
+                vpic->refresh_frame_flags = 0xfe << pic->b_depth & 0xff;
+                hpic->slot = 1 + pic->b_depth;
+            }
             vpic->ref_flags.bits.ref_frame_ctrl_l0  = 1;
             vpic->ref_flags.bits.ref_frame_ctrl_l1  = 2;
-            vpic->ref_flags.bits.ref_last_idx       = 0;
+            vpic->ref_flags.bits.ref_last_idx       = href0->slot;
             vpic->ref_flags.bits.ref_last_sign_bias = 1;
-            vpic->ref_flags.bits.ref_gf_idx         = 1;
+            vpic->ref_flags.bits.ref_gf_idx         = href1->slot;
             vpic->ref_flags.bits.ref_gf_sign_bias   = 0;
-        } else {
-            vpic->ref_flags.bits.ref_frame_ctrl_l0  = 2;
-            vpic->ref_flags.bits.ref_frame_ctrl_l1  = 1;
-            vpic->ref_flags.bits.ref_last_idx       = 0;
-            vpic->ref_flags.bits.ref_last_sign_bias = 0;
-            vpic->ref_flags.bits.ref_gf_idx         = 1;
-            vpic->ref_flags.bits.ref_gf_sign_bias   = 1;
         }
-        vpic->refresh_frame_flags = 0x00;
         break;
     default:
         av_assert0(0 && "invalid picture type");
     }
+    if (vpic->refresh_frame_flags == 0x00) {
+        av_log(avctx, AV_LOG_DEBUG, "Pic %"PRId64" not stored.\n",
+               pic->display_order);
+    } else {
+        av_log(avctx, AV_LOG_DEBUG, "Pic %"PRId64" stored in slot %d.\n",
+               pic->display_order, hpic->slot);
+    }
 
     for (i = 0; i < FF_ARRAY_ELEMS(vpic->reference_frames); i++)
         vpic->reference_frames[i] = VA_INVALID_SURFACE;
-    if (pic->type == PICTURE_TYPE_P) {
-        av_assert0(pic->refs[0]);
-        vpic->reference_frames[priv->last_ref_dir] =
-            pic->refs[0]->recon_surface;
-    } else if (pic->type == PICTURE_TYPE_B) {
-        av_assert0(pic->refs[0] && pic->refs[1]);
-        vpic->reference_frames[!priv->last_ref_dir] =
-            pic->refs[0]->recon_surface;
-        vpic->reference_frames[priv->last_ref_dir] =
-            pic->refs[1]->recon_surface;
+
+    for (i = 0; i < pic->nb_refs; i++) {
+        VAAPIEncodePicture *ref_pic = pic->refs[i];
+        int slot;
+        slot = ((VAAPIEncodeVP9Picture*)ref_pic->priv_data)->slot;
+        av_assert0(vpic->reference_frames[slot] == VA_INVALID_SURFACE);
+        vpic->reference_frames[slot] = ref_pic->recon_surface;
     }
 
     vpic->pic_flags.bits.frame_type = (pic->type != PICTURE_TYPE_IDR);
@@ -167,11 +170,8 @@ static int vaapi_encode_vp9_init_picture_params(AVCodecContext *avctx,
     vpic->chroma_ac_qindex_delta = 0;
     vpic->chroma_dc_qindex_delta = 0;
 
-    vpic->filter_level    = opt->loop_filter_level;
-    vpic->sharpness_level = opt->loop_filter_sharpness;
-
-    if (avctx->max_b_frames > 0 && pic->type == PICTURE_TYPE_P)
-        priv->last_ref_dir = !priv->last_ref_dir;
+    vpic->filter_level    = priv->loop_filter_level;
+    vpic->sharpness_level = priv->loop_filter_sharpness;
 
     return 0;
 }
@@ -179,31 +179,49 @@ static int vaapi_encode_vp9_init_picture_params(AVCodecContext *avctx,
 static av_cold int vaapi_encode_vp9_configure(AVCodecContext *avctx)
 {
     VAAPIEncodeContext     *ctx = avctx->priv_data;
-    VAAPIEncodeVP9Context *priv = ctx->priv_data;
-
-    priv->q_idx_p = av_clip(avctx->global_quality, 0, 255);
-    if (avctx->i_quant_factor > 0.0)
-        priv->q_idx_idr = av_clip((avctx->global_quality *
-                                   avctx->i_quant_factor +
-                                   avctx->i_quant_offset) + 0.5,
-                                  0, 255);
-    else
-        priv->q_idx_idr = priv->q_idx_p;
-    if (avctx->b_quant_factor > 0.0)
-        priv->q_idx_b = av_clip((avctx->global_quality *
-                                 avctx->b_quant_factor +
-                                 avctx->b_quant_offset) + 0.5,
-                                0, 255);
-    else
-        priv->q_idx_b = priv->q_idx_p;
+    VAAPIEncodeVP9Context *priv = avctx->priv_data;
+
+    if (ctx->rc_mode->quality) {
+        priv->q_idx_p = av_clip(ctx->rc_quality, 0, VP9_MAX_QUANT);
+        if (avctx->i_quant_factor > 0.0)
+            priv->q_idx_idr =
+                av_clip((avctx->i_quant_factor * priv->q_idx_p  +
+                         avctx->i_quant_offset) + 0.5,
+                        0, VP9_MAX_QUANT);
+        else
+            priv->q_idx_idr = priv->q_idx_p;
+        if (avctx->b_quant_factor > 0.0)
+            priv->q_idx_b =
+                av_clip((avctx->b_quant_factor * priv->q_idx_p  +
+                         avctx->b_quant_offset) + 0.5,
+                        0, VP9_MAX_QUANT);
+        else
+            priv->q_idx_b = priv->q_idx_p;
+    } else {
+        // Arbitrary value.
+        priv->q_idx_idr = priv->q_idx_p = priv->q_idx_b = 100;
+    }
 
     return 0;
 }
 
+static const VAAPIEncodeProfile vaapi_encode_vp9_profiles[] = {
+    { FF_PROFILE_VP9_0,  8, 3, 1, 1, VAProfileVP9Profile0 },
+    { FF_PROFILE_VP9_2, 10, 3, 1, 1, VAProfileVP9Profile2 },
+    { FF_PROFILE_UNKNOWN }
+};
+
 static const VAAPIEncodeType vaapi_encode_type_vp9 = {
-    .configure             = &vaapi_encode_vp9_configure,
+    .profiles              = vaapi_encode_vp9_profiles,
+
+    .flags                 = FLAG_B_PICTURES |
+                             FLAG_B_PICTURE_REFERENCES,
+
+    .default_quality       = 100,
 
-    .priv_data_size        = sizeof(VAAPIEncodeVP9Context),
+    .picture_priv_data_size = sizeof(VAAPIEncodeVP9Picture),
+
+    .configure             = &vaapi_encode_vp9_configure,
 
     .sequence_params_size  = sizeof(VAEncSequenceParameterBufferVP9),
     .init_sequence_params  = &vaapi_encode_vp9_init_sequence_params,
@@ -218,44 +236,10 @@ static av_cold int vaapi_encode_vp9_init(AVCodecContext *avctx)
 
     ctx->codec = &vaapi_encode_type_vp9;
 
-    switch (avctx->profile) {
-    case FF_PROFILE_VP9_0:
-    case FF_PROFILE_UNKNOWN:
-        ctx->va_profile = VAProfileVP9Profile0;
-        ctx->va_rt_format = VA_RT_FORMAT_YUV420;
-        break;
-    case FF_PROFILE_VP9_1:
-        av_log(avctx, AV_LOG_ERROR, "VP9 profile 1 is not "
-               "supported.\n");
-        return AVERROR_PATCHWELCOME;
-    case FF_PROFILE_VP9_2:
-        ctx->va_profile = VAProfileVP9Profile2;
-        ctx->va_rt_format = VA_RT_FORMAT_YUV420_10BPP;
-        break;
-    case FF_PROFILE_VP9_3:
-        av_log(avctx, AV_LOG_ERROR, "VP9 profile 3 is not "
-               "supported.\n");
-        return AVERROR_PATCHWELCOME;
-    default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown VP9 profile %d.\n",
-               avctx->profile);
-        return AVERROR(EINVAL);
-    }
-    ctx->va_entrypoint = VAEntrypointEncSlice;
-
-    if (avctx->flags & AV_CODEC_FLAG_QSCALE) {
-        ctx->va_rc_mode = VA_RC_CQP;
-    } else if (avctx->bit_rate > 0) {
-        if (avctx->bit_rate == avctx->rc_max_rate)
-            ctx->va_rc_mode = VA_RC_CBR;
-        else
-            ctx->va_rc_mode = VA_RC_VBR;
-    } else {
-        ctx->va_rc_mode = VA_RC_CQP;
-    }
-
-    // Packed headers are not currently supported.
-    ctx->va_packed_headers = 0;
+    // No packed headers are currently desired.  They could be written,
+    // but there isn't any reason to do so - the one usable driver (i965)
+    // can write its own headers and there is no metadata to include.
+    ctx->desired_packed_headers = 0;
 
     // Surfaces must be aligned to superblock boundaries.
     ctx->surface_width  = FFALIGN(avctx->width,  64);
@@ -264,10 +248,12 @@ static av_cold int vaapi_encode_vp9_init(AVCodecContext *avctx)
     return ff_vaapi_encode_init(avctx);
 }
 
-#define OFFSET(x) (offsetof(VAAPIEncodeContext, codec_options_data) + \
-                   offsetof(VAAPIEncodeVP9Options, x))
+#define OFFSET(x) offsetof(VAAPIEncodeVP9Context, x)
 #define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 static const AVOption vaapi_encode_vp9_options[] = {
+    VAAPI_ENCODE_COMMON_OPTIONS,
+    VAAPI_ENCODE_RC_OPTIONS,
+
     { "loop_filter_level", "Loop filter level",
       OFFSET(loop_filter_level), AV_OPT_TYPE_INT, { .i64 = 16 }, 0, 63, FLAGS },
     { "loop_filter_sharpness", "Loop filter sharpness",
@@ -276,11 +262,11 @@ static const AVOption vaapi_encode_vp9_options[] = {
 };
 
 static const AVCodecDefault vaapi_encode_vp9_defaults[] = {
-    { "profile",        "0"   },
     { "b",              "0"   },
     { "bf",             "0"   },
     { "g",              "250" },
-    { "global_quality", "100" },
+    { "qmin",           "-1"  },
+    { "qmax",           "-1"  },
     { NULL },
 };
 
@@ -296,10 +282,10 @@ AVCodec ff_vp9_vaapi_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("VP9 (VAAPI)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VP9,
-    .priv_data_size = (sizeof(VAAPIEncodeContext) +
-                       sizeof(VAAPIEncodeVP9Options)),
+    .priv_data_size = sizeof(VAAPIEncodeVP9Context),
     .init           = &vaapi_encode_vp9_init,
-    .encode2        = &ff_vaapi_encode2,
+    .send_frame     = &ff_vaapi_encode_send_frame,
+    .receive_packet = &ff_vaapi_encode_receive_packet,
     .close          = &ff_vaapi_encode_close,
     .priv_class     = &vaapi_encode_vp9_class,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
diff --git a/libavcodec/vaapi_h264.c b/libavcodec/vaapi_h264.c
index 97d4387..5854587 100644
--- a/libavcodec/vaapi_h264.c
+++ b/libavcodec/vaapi_h264.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 /**
  * @file
- * This file implements the glue code between Libav's and VA API's
+ * This file implements the glue code between FFmpeg's and VA API's
  * structures for H.264 decoding.
  */
 
@@ -45,10 +45,10 @@ static void init_vaapi_pic(VAPictureH264 *va_pic)
 }
 
 /**
- * Translate an Libav Picture into its VA API form.
+ * Translate an FFmpeg Picture into its VA API form.
  *
  * @param[out] va_pic          A pointer to VA API's own picture struct
- * @param[in]  pic             A pointer to the Libav picture struct to convert
+ * @param[in]  pic             A pointer to the FFmpeg picture struct to convert
  * @param[in]  pic_structure   The picture field type (as defined in mpegvideo.h),
  *                             supersedes pic's field type if nonzero.
  */
@@ -149,11 +149,11 @@ static int fill_vaapi_ReferenceFrames(VAPictureParameterBufferH264 *pic_param,
 }
 
 /**
- * Fill in VA API reference picture lists from the Libav reference
+ * Fill in VA API reference picture lists from the FFmpeg reference
  * picture list.
  *
  * @param[out] RefPicList  VA API internal reference picture list
- * @param[in]  ref_list    A pointer to the Libav reference list
+ * @param[in]  ref_list    A pointer to the FFmpeg reference list
  * @param[in]  ref_count   The number of reference pictures in ref_list
  */
 static void fill_vaapi_RefPicList(VAPictureH264 RefPicList[32],
diff --git a/libavcodec/vaapi_hevc.c b/libavcodec/vaapi_hevc.c
index 71fab77..19aabcd 100644
--- a/libavcodec/vaapi_hevc.c
+++ b/libavcodec/vaapi_hevc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2015 Timo Rothenpieler <timo@rothenpieler.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -232,11 +232,11 @@ static int vaapi_hevc_start_frame(AVCodecContext          *avctx,
                 iq_matrix.ScalingList8x8[i][j]   = scaling_list->sl[1][i][j];
                 iq_matrix.ScalingList16x16[i][j] = scaling_list->sl[2][i][j];
                 if (i < 2)
-                    iq_matrix.ScalingList32x32[i][j] = scaling_list->sl[3][i][j];
+                    iq_matrix.ScalingList32x32[i][j] = scaling_list->sl[3][i * 3][j];
             }
             iq_matrix.ScalingListDC16x16[i] = scaling_list->sl_dc[0][i];
             if (i < 2)
-                iq_matrix.ScalingListDC32x32[i] = scaling_list->sl_dc[1][i];
+                iq_matrix.ScalingListDC32x32[i] = scaling_list->sl_dc[1][i * 3];
         }
 
         err = ff_vaapi_decode_make_param_buffer(avctx, &pic->pic,
@@ -379,7 +379,7 @@ static int vaapi_hevc_decode_slice(AVCodecContext *avctx,
         .slice_data_flag               = VA_SLICE_DATA_FLAG_ALL,
         /* Add 1 to the bits count here to account for the byte_alignment bit, which
          * always is at least one bit and not accounted for otherwise. */
-        .slice_data_byte_offset        = (get_bits_count(&h->HEVClc.gb) + 1 + 7) / 8,
+        .slice_data_byte_offset        = (get_bits_count(&h->HEVClc->gb) + 1 + 7) / 8,
         .slice_segment_address         = sh->slice_segment_addr,
         .slice_qp_delta                = sh->slice_qp_delta,
         .slice_cb_qp_offset            = sh->slice_cb_qp_offset,
diff --git a/libavcodec/vaapi_mjpeg.c b/libavcodec/vaapi_mjpeg.c
new file mode 100644
index 0000000..14e0206
--- /dev/null
+++ b/libavcodec/vaapi_mjpeg.c
@@ -0,0 +1,159 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <va/va.h>
+#include <va/va_dec_jpeg.h>
+
+#include "hwaccel.h"
+#include "vaapi_decode.h"
+#include "mjpegdec.h"
+
+static int vaapi_mjpeg_start_frame(AVCodecContext          *avctx,
+                                   av_unused const uint8_t *buffer,
+                                   av_unused uint32_t       size)
+{
+    const MJpegDecodeContext *s = avctx->priv_data;
+    VAAPIDecodePicture *pic = s->hwaccel_picture_private;
+    VAPictureParameterBufferJPEGBaseline pp;
+    int err, i;
+
+    pic->output_surface = ff_vaapi_get_surface_id(s->picture_ptr);
+
+    pp = (VAPictureParameterBufferJPEGBaseline) {
+        .picture_width  = avctx->width,
+        .picture_height = avctx->height,
+
+        .num_components = s->nb_components,
+    };
+
+    for (i = 0; i < s->nb_components; i++) {
+        pp.components[i].component_id             = s->component_id[i];
+        pp.components[i].h_sampling_factor        = s->h_count[i];
+        pp.components[i].v_sampling_factor        = s->v_count[i];
+        pp.components[i].quantiser_table_selector = s->quant_index[i];
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAPictureParameterBufferType,
+                                            &pp, sizeof(pp));
+    if (err < 0)
+        goto fail;
+
+    return 0;
+
+fail:
+    ff_vaapi_decode_cancel(avctx, pic);
+    return err;
+}
+
+static int vaapi_mjpeg_end_frame(AVCodecContext *avctx)
+{
+    const MJpegDecodeContext *s = avctx->priv_data;
+    VAAPIDecodePicture *pic = s->hwaccel_picture_private;
+
+    return ff_vaapi_decode_issue(avctx, pic);
+}
+
+static int vaapi_mjpeg_decode_slice(AVCodecContext *avctx,
+                                    const uint8_t  *buffer,
+                                    uint32_t        size)
+{
+    const MJpegDecodeContext *s = avctx->priv_data;
+    VAAPIDecodePicture *pic = s->hwaccel_picture_private;
+    VAHuffmanTableBufferJPEGBaseline huff;
+    VAIQMatrixBufferJPEGBaseline quant;
+    VASliceParameterBufferJPEGBaseline sp;
+    int err, i, j;
+
+    memset(&huff, 0, sizeof(huff));
+    for (i = 0; i < 2; i++) {
+        huff.load_huffman_table[i] = 1;
+        for (j = 0; j < 16; j++)
+            huff.huffman_table[i].num_dc_codes[j] = s->raw_huffman_lengths[0][i][j];
+        for (j = 0; j < 12; j++)
+            huff.huffman_table[i].dc_values[j] = s->raw_huffman_values[0][i][j];
+        for (j = 0; j < 16; j++)
+            huff.huffman_table[i].num_ac_codes[j] = s->raw_huffman_lengths[1][i][j];
+        for (j = 0; j < 162; j++)
+            huff.huffman_table[i].ac_values[j] = s->raw_huffman_values[1][i][j];
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAHuffmanTableBufferType,
+                                            &huff, sizeof(huff));
+    if (err < 0)
+        goto fail;
+
+    memset(&quant, 0, sizeof(quant));
+    for (i = 0; i < 4; i++) {
+        quant.load_quantiser_table[i] = 1;
+        for (j = 0; j < 64; j++)
+            quant.quantiser_table[i][j] = s->quant_matrixes[i][j];
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAIQMatrixBufferType,
+                                            &quant, sizeof(quant));
+    if (err < 0)
+        goto fail;
+
+    sp = (VASliceParameterBufferJPEGBaseline) {
+        .slice_data_size   = size,
+        .slice_data_offset = 0,
+        .slice_data_flag   = VA_SLICE_DATA_FLAG_ALL,
+
+        .slice_horizontal_position = 0,
+        .slice_vertical_position   = 0,
+
+        .restart_interval          = s->restart_interval,
+        .num_mcus                  = s->mb_width * s->mb_height,
+    };
+
+    sp.num_components = s->nb_components;
+    for (i = 0; i < s->nb_components; i++) {
+        sp.components[i].component_selector = s->component_id[s->comp_index[i]];
+        sp.components[i].dc_table_selector  = s->dc_index[i];
+        sp.components[i].ac_table_selector  = s->ac_index[i];
+    }
+
+    err = ff_vaapi_decode_make_slice_buffer(avctx, pic, &sp, sizeof(sp), buffer, size);
+    if (err)
+        goto fail;
+
+    return 0;
+
+fail:
+    ff_vaapi_decode_cancel(avctx, pic);
+    return err;
+}
+
+const AVHWAccel ff_mjpeg_vaapi_hwaccel = {
+    .name                 = "mjpeg_vaapi",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_MJPEG,
+    .pix_fmt              = AV_PIX_FMT_VAAPI,
+    .start_frame          = &vaapi_mjpeg_start_frame,
+    .end_frame            = &vaapi_mjpeg_end_frame,
+    .decode_slice         = &vaapi_mjpeg_decode_slice,
+    .frame_priv_data_size = sizeof(VAAPIDecodePicture),
+    .init                 = &ff_vaapi_decode_init,
+    .uninit               = &ff_vaapi_decode_uninit,
+    .frame_params         = &ff_vaapi_common_frame_params,
+    .priv_data_size       = sizeof(VAAPIDecodeContext),
+    .caps_internal        = HWACCEL_CAP_ASYNC_SAFE,
+};
diff --git a/libavcodec/vaapi_mpeg2.c b/libavcodec/vaapi_mpeg2.c
index 4cca00c..aaed434 100644
--- a/libavcodec/vaapi_mpeg2.c
+++ b/libavcodec/vaapi_mpeg2.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -145,8 +145,8 @@ static int vaapi_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer
     intra_slice_flag = get_bits1(&gb);
     if (intra_slice_flag) {
         skip_bits(&gb, 8);
-        while (get_bits1(&gb) != 0)
-            skip_bits(&gb, 8);
+        if (skip_1stop_8data_bits(&gb) < 0)
+            return AVERROR_INVALIDDATA;
     }
     macroblock_offset = get_bits_count(&gb);
 
@@ -169,7 +169,6 @@ static int vaapi_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer
         return err;
     }
 
-
     return 0;
 }
 
diff --git a/libavcodec/vaapi_mpeg4.c b/libavcodec/vaapi_mpeg4.c
index 5dc94a4..11860ff 100644
--- a/libavcodec/vaapi_mpeg4.c
+++ b/libavcodec/vaapi_mpeg4.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -157,26 +157,15 @@ static int vaapi_mpeg4_decode_slice(AVCodecContext *avctx, const uint8_t *buffer
     VASliceParameterBufferMPEG4 slice_param;
     int err;
 
-    /* video_plane_with_short_video_header() contains all GOBs
-     * in-order, and this is what VA API (Intel backend) expects: only
-     * a single slice param. So fake macroblock_number for Libav so
-     * that we don't call vaapi_mpeg4_decode_slice() again
-     */
-    if (avctx->codec->id == AV_CODEC_ID_H263)
-        size = s->gb.buffer_end - buffer;
-
     slice_param = (VASliceParameterBufferMPEG4) {
         .slice_data_size   = size,
         .slice_data_offset = 0,
         .slice_data_flag   = VA_SLICE_DATA_FLAG_ALL,
         .macroblock_offset = get_bits_count(&s->gb) % 8,
-        .macroblock_number = s->mb_y * s->mb_width + s->mb_x,
+        .macroblock_number = 0,
         .quant_scale       = s->qscale,
     };
 
-    if (avctx->codec->id == AV_CODEC_ID_H263)
-        s->mb_y = s->mb_height;
-
     err = ff_vaapi_decode_make_slice_buffer(avctx, pic,
                                             &slice_param, sizeof(slice_param),
                                             buffer, size);
diff --git a/libavcodec/vaapi_vc1.c b/libavcodec/vaapi_vc1.c
index 8b7d49f..921ca63 100644
--- a/libavcodec/vaapi_vc1.c
+++ b/libavcodec/vaapi_vc1.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 #include "vc1.h"
 #include "vc1data.h"
 
-/** Translate Libav MV modes to VA API */
+/** Translate FFmpeg MV modes to VA API */
 static int get_VAMvModeVC1(enum MVModes mv_mode)
 {
     switch (mv_mode) {
@@ -44,7 +44,8 @@ static inline int vc1_has_MVTYPEMB_bitplane(const VC1Context *v)
 {
     if (v->mv_type_is_raw)
         return 0;
-    return v->s.pict_type == AV_PICTURE_TYPE_P &&
+    return v->fcm == PROGRESSIVE &&
+           (v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) &&
            (v->mv_mode == MV_PMODE_MIXED_MV ||
             (v->mv_mode == MV_PMODE_INTENSITY_COMP &&
              v->mv_mode2 == MV_PMODE_MIXED_MV));
@@ -55,8 +56,9 @@ static inline int vc1_has_SKIPMB_bitplane(const VC1Context *v)
 {
     if (v->skip_is_raw)
         return 0;
-    return v->s.pict_type == AV_PICTURE_TYPE_P ||
-           (v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type);
+    return (v->fcm == PROGRESSIVE || v->fcm == ILACE_FRAME) &&
+           ((v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) ||
+            (v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type));
 }
 
 /** Check whether the DIRECTMB bitplane is present */
@@ -64,7 +66,8 @@ static inline int vc1_has_DIRECTMB_bitplane(const VC1Context *v)
 {
     if (v->dmb_is_raw)
         return 0;
-    return v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type;
+    return (v->fcm == PROGRESSIVE || v->fcm == ILACE_FRAME) &&
+           (v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type);
 }
 
 /** Check whether the ACPRED bitplane is present */
@@ -89,6 +92,25 @@ static inline int vc1_has_OVERFLAGS_bitplane(const VC1Context *v)
            v->condover == CONDOVER_SELECT;
 }
 
+/** Check whether the FIELDTX bitplane is present */
+static inline int vc1_has_FIELDTX_bitplane(const VC1Context *v)
+{
+    if (v->fieldtx_is_raw)
+        return 0;
+    return v->fcm == ILACE_FRAME &&
+           (v->s.pict_type == AV_PICTURE_TYPE_I ||
+            (v->s.pict_type == AV_PICTURE_TYPE_B && v->bi_type));
+}
+
+/** Check whether the FORWARDMB bitplane is present */
+static inline int vc1_has_FORWARDMB_bitplane(const VC1Context *v)
+{
+    if (v->fmb_is_raw)
+        return 0;
+    return v->fcm == ILACE_FIELD &&
+           (v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type);
+}
+
 /** Reconstruct bitstream PTYPE (7.1.1.4, index into Table-35) */
 static int vc1_get_PTYPE(const VC1Context *v)
 {
@@ -101,11 +123,24 @@ static int vc1_get_PTYPE(const VC1Context *v)
     return 0;
 }
 
+/** Reconstruct bitstream FPTYPE (9.1.1.42, index into Table-105) */
+static int vc1_get_FPTYPE(const VC1Context *v)
+{
+    const MpegEncContext *s = &v->s;
+    switch (s->pict_type) {
+    case AV_PICTURE_TYPE_I: return 0;
+    case AV_PICTURE_TYPE_P: return 3;
+    case AV_PICTURE_TYPE_B: return v->bi_type ? 7 : 4;
+    }
+    return 0;
+}
+
 /** Reconstruct bitstream MVMODE (7.1.1.32) */
 static inline VAMvModeVC1 vc1_get_MVMODE(const VC1Context *v)
 {
-    if (v->s.pict_type == AV_PICTURE_TYPE_P ||
-        (v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type))
+    if ((v->fcm == PROGRESSIVE || v->fcm == ILACE_FIELD) &&
+        ((v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) ||
+         (v->s.pict_type == AV_PICTURE_TYPE_B && !v->bi_type)))
         return get_VAMvModeVC1(v->mv_mode);
     return 0;
 }
@@ -113,11 +148,78 @@ static inline VAMvModeVC1 vc1_get_MVMODE(const VC1Context *v)
 /** Reconstruct bitstream MVMODE2 (7.1.1.33) */
 static inline VAMvModeVC1 vc1_get_MVMODE2(const VC1Context *v)
 {
-    if (v->s.pict_type == AV_PICTURE_TYPE_P && v->mv_mode == MV_PMODE_INTENSITY_COMP)
+    if ((v->fcm == PROGRESSIVE || v->fcm == ILACE_FIELD) &&
+        (v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) &&
+        v->mv_mode == MV_PMODE_INTENSITY_COMP)
         return get_VAMvModeVC1(v->mv_mode2);
     return 0;
 }
 
+av_unused static inline int vc1_get_INTCOMPFIELD(const VC1Context *v)
+{
+    if ((v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) &&
+        v->fcm == ILACE_FIELD &&
+        v->mv_mode == MV_PMODE_INTENSITY_COMP)
+        switch (v->intcompfield) {
+        case 1: return 1;
+        case 2: return 2;
+        case 3: return 0;
+        }
+    return 0;
+}
+
+static inline int vc1_get_LUMSCALE(const VC1Context *v)
+{
+    if (v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) {
+        if ((v->fcm == PROGRESSIVE && v->mv_mode == MV_PMODE_INTENSITY_COMP) ||
+            (v->fcm == ILACE_FRAME && v->intcomp))
+            return v->lumscale;
+        else if (v->fcm == ILACE_FIELD && v->mv_mode == MV_PMODE_INTENSITY_COMP)
+            switch (v->intcompfield) {
+            case 1: return v->lumscale;
+            case 2: return v->lumscale2;
+            case 3: return v->lumscale;
+        }
+    }
+    return 0;
+}
+
+static inline int vc1_get_LUMSHIFT(const VC1Context *v)
+{
+    if (v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) {
+        if ((v->fcm == PROGRESSIVE && v->mv_mode == MV_PMODE_INTENSITY_COMP) ||
+            (v->fcm == ILACE_FRAME && v->intcomp))
+            return v->lumshift;
+        else if (v->fcm == ILACE_FIELD && v->mv_mode == MV_PMODE_INTENSITY_COMP)
+            switch (v->intcompfield) {
+            case 1: return v->lumshift;
+            case 2: return v->lumshift2;
+            case 3: return v->lumshift;
+        }
+    }
+    return 0;
+}
+
+av_unused static inline int vc1_get_LUMSCALE2(const VC1Context *v)
+{
+    if ((v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) &&
+        v->fcm == ILACE_FIELD &&
+        v->mv_mode == MV_PMODE_INTENSITY_COMP &&
+        v->intcompfield == 3)
+        return v->lumscale2;
+    return 0;
+}
+
+av_unused static inline int vc1_get_LUMSHIFT2(const VC1Context *v)
+{
+    if ((v->s.pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) &&
+        v->fcm == ILACE_FIELD &&
+        v->mv_mode == MV_PMODE_INTENSITY_COMP &&
+        v->intcompfield == 3)
+        return v->lumshift2;
+    return 0;
+}
+
 /** Reconstruct bitstream TTFRM (7.1.1.41, Table-53) */
 static inline int vc1_get_TTFRM(const VC1Context *v)
 {
@@ -130,7 +232,7 @@ static inline int vc1_get_TTFRM(const VC1Context *v)
     return 0;
 }
 
-/** Pack Libav bitplanes into a VABitPlaneBuffer element */
+/** Pack FFmpeg bitplanes into a VABitPlaneBuffer element */
 static inline void vc1_pack_bitplanes(uint8_t *bitplane, int n, const uint8_t *ff_bp[3], int x, int y, int stride)
 {
     const int bitplane_index = n / 2;
@@ -189,27 +291,32 @@ static int vaapi_vc1_start_frame(AVCodecContext *avctx, av_unused const uint8_t
             .chroma                        = v->range_mapuv,
         },
         .b_picture_fraction                = v->bfraction_lut_index,
-        .cbp_table                         = v->cbpcy_vlc ? v->cbpcy_vlc - ff_vc1_cbpcy_p_vlc : 0,
-        .mb_mode_table                     = 0, /* XXX: interlaced frame */
+        .cbp_table                         = (v->fcm == PROGRESSIVE ? v->cbptab : v->icbptab),
+        .mb_mode_table                     = v->mbmodetab,
         .range_reduction_frame             = v->rangeredfrm,
         .rounding_control                  = v->rnd,
         .post_processing                   = v->postproc,
         .picture_resolution_index          = v->respic,
-        .luma_scale                        = v->lumscale,
-        .luma_shift                        = v->lumshift,
         .picture_fields.bits = {
-            .picture_type                  = vc1_get_PTYPE(v),
+            .picture_type                  = (v->fcm == ILACE_FIELD ? vc1_get_FPTYPE(v) : vc1_get_PTYPE(v)),
             .frame_coding_mode             = v->fcm,
             .top_field_first               = v->tff,
-            .is_first_field                = v->fcm == 0, /* XXX: interlaced frame */
-            .intensity_compensation        = v->mv_mode == MV_PMODE_INTENSITY_COMP,
+            .is_first_field                = !v->second_field,
+            .intensity_compensation        = v->intcomp,
         },
+        .luma_scale                        = vc1_get_LUMSCALE(v),
+        .luma_shift                        = vc1_get_LUMSHIFT(v),
+#if VA_CHECK_VERSION(1, 1, 0)
+        .luma_scale2                       = vc1_get_LUMSCALE2(v),
+        .luma_shift2                       = vc1_get_LUMSHIFT2(v),
+        .intensity_compensation_field      = vc1_get_INTCOMPFIELD(v),
+#endif
         .raw_coding.flags = {
             .mv_type_mb                    = v->mv_type_is_raw,
             .direct_mb                     = v->dmb_is_raw,
             .skip_mb                       = v->skip_is_raw,
-            .field_tx                      = 0, /* XXX: interlaced frame */
-            .forward_mb                    = 0, /* XXX: interlaced frame */
+            .field_tx                      = v->fieldtx_is_raw,
+            .forward_mb                    = v->fmb_is_raw,
             .ac_pred                       = v->acpred_is_raw,
             .overflags                     = v->overflg_is_raw,
         },
@@ -217,28 +324,28 @@ static int vaapi_vc1_start_frame(AVCodecContext *avctx, av_unused const uint8_t
             .bp_mv_type_mb                 = vc1_has_MVTYPEMB_bitplane(v),
             .bp_direct_mb                  = vc1_has_DIRECTMB_bitplane(v),
             .bp_skip_mb                    = vc1_has_SKIPMB_bitplane(v),
-            .bp_field_tx                   = 0, /* XXX: interlaced frame */
-            .bp_forward_mb                 = 0, /* XXX: interlaced frame */
+            .bp_field_tx                   = vc1_has_FIELDTX_bitplane(v),
+            .bp_forward_mb                 = vc1_has_FORWARDMB_bitplane(v),
             .bp_ac_pred                    = vc1_has_ACPRED_bitplane(v),
             .bp_overflags                  = vc1_has_OVERFLAGS_bitplane(v),
         },
         .reference_fields.bits = {
             .reference_distance_flag       = v->refdist_flag,
-            .reference_distance            = 0, /* XXX: interlaced frame */
-            .num_reference_pictures        = 0, /* XXX: interlaced frame */
-            .reference_field_pic_indicator = 0, /* XXX: interlaced frame */
+            .reference_distance            = v->refdist,
+            .num_reference_pictures        = v->numref,
+            .reference_field_pic_indicator = v->reffield,
         },
         .mv_fields.bits = {
             .mv_mode                       = vc1_get_MVMODE(v),
             .mv_mode2                      = vc1_get_MVMODE2(v),
-            .mv_table                      = s->mv_table_index,
-            .two_mv_block_pattern_table    = 0, /* XXX: interlaced frame */
-            .four_mv_switch                = 0, /* XXX: interlaced frame */
-            .four_mv_block_pattern_table   = 0, /* XXX: interlaced frame */
+            .mv_table                      = (v->fcm == PROGRESSIVE ? s->mv_table_index : v->imvtab),
+            .two_mv_block_pattern_table    = v->twomvbptab,
+            .four_mv_switch                = v->fourmvswitch,
+            .four_mv_block_pattern_table   = v->fourmvbptab,
             .extended_mv_flag              = v->extended_mv,
             .extended_mv_range             = v->mvrange,
             .extended_dmv_flag             = v->extended_dmv,
-            .extended_dmv_range            = 0, /* XXX: interlaced frame */
+            .extended_dmv_range            = v->dmvrange,
         },
         .pic_quantizer_fields.bits = {
             .dquant                        = v->dquant,
@@ -278,7 +385,7 @@ static int vaapi_vc1_start_frame(AVCodecContext *avctx, av_unused const uint8_t
     if (err)
         goto fail;
 
-    if (pic_param.bitplane_present.value) {
+    if (pic_param.bitplane_present.value & 0x7f) {
         uint8_t *bitplane;
         const uint8_t *ff_bp[3];
         int x, y, n;
@@ -298,14 +405,14 @@ static int vaapi_vc1_start_frame(AVCodecContext *avctx, av_unused const uint8_t
             break;
         case AV_PICTURE_TYPE_B:
             if (!v->bi_type) {
-                ff_bp[0] = pic_param.bitplane_present.flags.bp_direct_mb ? v->direct_mb_plane : NULL;
-                ff_bp[1] = pic_param.bitplane_present.flags.bp_skip_mb   ? s->mbskip_table    : NULL;
-                ff_bp[2] = NULL; /* XXX: interlaced frame (FORWARD plane) */
+                ff_bp[0] = pic_param.bitplane_present.flags.bp_direct_mb  ? v->direct_mb_plane  : NULL;
+                ff_bp[1] = pic_param.bitplane_present.flags.bp_skip_mb    ? s->mbskip_table     : NULL;
+                ff_bp[2] = pic_param.bitplane_present.flags.bp_forward_mb ? v->forward_mb_plane : NULL;
                 break;
             }
             /* fall-through (BI-type) */
         case AV_PICTURE_TYPE_I:
-            ff_bp[0] = NULL; /* XXX: interlaced frame (FIELDTX plane) */
+            ff_bp[0] = pic_param.bitplane_present.flags.bp_field_tx   ? v->fieldtx_plane      : NULL;
             ff_bp[1] = pic_param.bitplane_present.flags.bp_ac_pred    ? v->acpred_plane       : NULL;
             ff_bp[2] = pic_param.bitplane_present.flags.bp_overflags  ? v->over_flags_plane   : NULL;
             break;
@@ -360,6 +467,7 @@ static int vaapi_vc1_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
     const MpegEncContext *s = &v->s;
     VAAPIDecodePicture *pic = s->current_picture_ptr->hwaccel_picture_private;
     VASliceParameterBufferVC1 slice_param;
+    int mb_height;
     int err;
 
     /* Current bit buffer is beyond any marker for VC-1, so skip it */
@@ -368,12 +476,17 @@ static int vaapi_vc1_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
         size -= 4;
     }
 
+    if (v->fcm == ILACE_FIELD)
+        mb_height = avctx->coded_height + 31 >> 5;
+    else
+        mb_height = avctx->coded_height + 15 >> 4;
+
     slice_param = (VASliceParameterBufferVC1) {
         .slice_data_size         = size,
         .slice_data_offset       = 0,
         .slice_data_flag         = VA_SLICE_DATA_FLAG_ALL,
         .macroblock_offset       = get_bits_count(&s->gb),
-        .slice_vertical_position = s->mb_y,
+        .slice_vertical_position = s->mb_y % mb_height,
     };
 
     err = ff_vaapi_decode_make_slice_buffer(avctx, pic,
diff --git a/libavcodec/vaapi_vp8.c b/libavcodec/vaapi_vp8.c
index e18b1cb..2426b30 100644
--- a/libavcodec/vaapi_vp8.c
+++ b/libavcodec/vaapi_vp8.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -231,7 +231,7 @@ const AVHWAccel ff_vp8_vaapi_hwaccel = {
     .frame_priv_data_size = sizeof(VAAPIDecodePicture),
     .init                 = &ff_vaapi_decode_init,
     .uninit               = &ff_vaapi_decode_uninit,
-    .priv_data_size       = sizeof(VAAPIDecodeContext),
     .frame_params         = &ff_vaapi_common_frame_params,
+    .priv_data_size       = sizeof(VAAPIDecodeContext),
     .caps_internal        = HWACCEL_CAP_ASYNC_SAFE,
 };
diff --git a/libavcodec/vaapi_vp9.c b/libavcodec/vaapi_vp9.c
new file mode 100644
index 0000000..f384ba7
--- /dev/null
+++ b/libavcodec/vaapi_vp9.c
@@ -0,0 +1,185 @@
+/*
+ * VP9 HW decode acceleration through VA API
+ *
+ * Copyright (C) 2015 Timo Rothenpieler <timo@rothenpieler.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+
+#include "hwaccel.h"
+#include "vaapi_decode.h"
+#include "vp9shared.h"
+
+static VASurfaceID vaapi_vp9_surface_id(const VP9Frame *vf)
+{
+    if (vf)
+        return ff_vaapi_get_surface_id(vf->tf.f);
+    else
+        return VA_INVALID_SURFACE;
+}
+
+static int vaapi_vp9_start_frame(AVCodecContext          *avctx,
+                                 av_unused const uint8_t *buffer,
+                                 av_unused uint32_t       size)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    VAAPIDecodePicture *pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+    VADecPictureParameterBufferVP9 pic_param;
+    const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+    int err, i;
+
+    pic->output_surface = vaapi_vp9_surface_id(&h->frames[CUR_FRAME]);
+
+    pic_param = (VADecPictureParameterBufferVP9) {
+        .frame_width                      = avctx->width,
+        .frame_height                     = avctx->height,
+
+        .pic_fields.bits = {
+            .subsampling_x                = pixdesc->log2_chroma_w,
+            .subsampling_y                = pixdesc->log2_chroma_h,
+            .frame_type                   = !h->h.keyframe,
+            .show_frame                   = !h->h.invisible,
+            .error_resilient_mode         = h->h.errorres,
+            .intra_only                   = h->h.intraonly,
+            .allow_high_precision_mv      = h->h.keyframe ? 0 : h->h.highprecisionmvs,
+            .mcomp_filter_type            = h->h.filtermode ^ (h->h.filtermode <= 1),
+            .frame_parallel_decoding_mode = h->h.parallelmode,
+            .reset_frame_context          = h->h.resetctx,
+            .refresh_frame_context        = h->h.refreshctx,
+            .frame_context_idx            = h->h.framectxid,
+
+            .segmentation_enabled          = h->h.segmentation.enabled,
+            .segmentation_temporal_update  = h->h.segmentation.temporal,
+            .segmentation_update_map       = h->h.segmentation.update_map,
+
+            .last_ref_frame                = h->h.refidx[0],
+            .last_ref_frame_sign_bias      = h->h.signbias[0],
+            .golden_ref_frame              = h->h.refidx[1],
+            .golden_ref_frame_sign_bias    = h->h.signbias[1],
+            .alt_ref_frame                 = h->h.refidx[2],
+            .alt_ref_frame_sign_bias       = h->h.signbias[2],
+            .lossless_flag                 = h->h.lossless,
+        },
+
+        .filter_level                      = h->h.filter.level,
+        .sharpness_level                   = h->h.filter.sharpness,
+        .log2_tile_rows                    = h->h.tiling.log2_tile_rows,
+        .log2_tile_columns                 = h->h.tiling.log2_tile_cols,
+
+        .frame_header_length_in_bytes      = h->h.uncompressed_header_size,
+        .first_partition_size              = h->h.compressed_header_size,
+
+        .profile                           = h->h.profile,
+        .bit_depth                         = h->h.bpp,
+    };
+
+    for (i = 0; i < 7; i++)
+        pic_param.mb_segment_tree_probs[i] = h->h.segmentation.prob[i];
+
+    if (h->h.segmentation.temporal) {
+        for (i = 0; i < 3; i++)
+            pic_param.segment_pred_probs[i] = h->h.segmentation.pred_prob[i];
+    } else {
+        memset(pic_param.segment_pred_probs, 255, sizeof(pic_param.segment_pred_probs));
+    }
+
+    for (i = 0; i < 8; i++) {
+        if (h->refs[i].f->buf[0])
+            pic_param.reference_frames[i] = ff_vaapi_get_surface_id(h->refs[i].f);
+        else
+            pic_param.reference_frames[i] = VA_INVALID_ID;
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAPictureParameterBufferType,
+                                            &pic_param, sizeof(pic_param));
+    if (err < 0) {
+        ff_vaapi_decode_cancel(avctx, pic);
+        return err;
+    }
+
+    return 0;
+}
+
+static int vaapi_vp9_end_frame(AVCodecContext *avctx)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    VAAPIDecodePicture *pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+
+    return ff_vaapi_decode_issue(avctx, pic);
+}
+
+static int vaapi_vp9_decode_slice(AVCodecContext *avctx,
+                                  const uint8_t  *buffer,
+                                  uint32_t        size)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    VAAPIDecodePicture *pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+    VASliceParameterBufferVP9 slice_param;
+    int err, i;
+
+    slice_param = (VASliceParameterBufferVP9) {
+        .slice_data_size   = size,
+        .slice_data_offset = 0,
+        .slice_data_flag   = VA_SLICE_DATA_FLAG_ALL,
+    };
+
+    for (i = 0; i < 8; i++) {
+        slice_param.seg_param[i] = (VASegmentParameterVP9) {
+            .segment_flags.fields = {
+                .segment_reference_enabled = h->h.segmentation.feat[i].ref_enabled,
+                .segment_reference         = h->h.segmentation.feat[i].ref_val,
+                .segment_reference_skipped = h->h.segmentation.feat[i].skip_enabled,
+            },
+
+            .luma_dc_quant_scale           = h->h.segmentation.feat[i].qmul[0][0],
+            .luma_ac_quant_scale           = h->h.segmentation.feat[i].qmul[0][1],
+            .chroma_dc_quant_scale         = h->h.segmentation.feat[i].qmul[1][0],
+            .chroma_ac_quant_scale         = h->h.segmentation.feat[i].qmul[1][1],
+        };
+
+        memcpy(slice_param.seg_param[i].filter_level, h->h.segmentation.feat[i].lflvl, sizeof(slice_param.seg_param[i].filter_level));
+    }
+
+    err = ff_vaapi_decode_make_slice_buffer(avctx, pic,
+                                            &slice_param, sizeof(slice_param),
+                                            buffer, size);
+    if (err) {
+        ff_vaapi_decode_cancel(avctx, pic);
+        return err;
+    }
+
+    return 0;
+}
+
+const AVHWAccel ff_vp9_vaapi_hwaccel = {
+    .name                 = "vp9_vaapi",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_VP9,
+    .pix_fmt              = AV_PIX_FMT_VAAPI,
+    .start_frame          = vaapi_vp9_start_frame,
+    .end_frame            = vaapi_vp9_end_frame,
+    .decode_slice         = vaapi_vp9_decode_slice,
+    .frame_priv_data_size = sizeof(VAAPIDecodePicture),
+    .init                 = ff_vaapi_decode_init,
+    .uninit               = ff_vaapi_decode_uninit,
+    .frame_params         = ff_vaapi_common_frame_params,
+    .priv_data_size       = sizeof(VAAPIDecodeContext),
+    .caps_internal        = HWACCEL_CAP_ASYNC_SAFE,
+};
diff --git a/libavcodec/vb.c b/libavcodec/vb.c
index 43954c1..c6dd6fb 100644
--- a/libavcodec/vb.c
+++ b/libavcodec/vb.c
@@ -2,20 +2,20 @@
  * Beam Software VB decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static void vb_decode_palette(VBDecContext *c, int data_size)
         return;
     }
     for (i = start; i <= start + size; i++)
-        c->pal[i] = bytestream2_get_be24(&c->stream);
+        c->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&c->stream);
 }
 
 static inline int check_pixel(uint8_t *buf, uint8_t *start, uint8_t *end)
@@ -107,6 +107,10 @@ static int vb_decode_framedata(VBDecContext *c, int offset)
     blk2   = 0;
     for (blk = 0; blk < blocks; blk++) {
         if (!(blk & 3)) {
+            if (bytestream2_get_bytes_left(&g) < 1) {
+                av_log(c->avctx, AV_LOG_ERROR, "Insufficient data\n");
+                return AVERROR_INVALIDDATA;
+            }
             blocktypes = bytestream2_get_byte(&g);
         }
         switch (blocktypes & 0xC0) {
@@ -197,20 +201,26 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     bytestream2_init(&c->stream, avpkt->data, avpkt->size);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     flags = bytestream2_get_le16(&c->stream);
 
     if (flags & VB_HAS_GMC) {
         i = (int16_t)bytestream2_get_le16(&c->stream);
         j = (int16_t)bytestream2_get_le16(&c->stream);
+        if (FFABS(j) > avctx->height) {
+            av_log(avctx, AV_LOG_ERROR, "GMV out of range\n");
+            return AVERROR_INVALIDDATA;
+        }
         offset = i + j * avctx->width;
     }
     if (flags & VB_HAS_VIDEO) {
         size = bytestream2_get_le32(&c->stream);
+        if(size > bytestream2_get_bytes_left(&c->stream)+4 || size<4){
+            av_log(avctx, AV_LOG_ERROR, "Frame size invalid\n");
+            return -1;
+        }
         vb_decode_framedata(c, offset);
         bytestream2_skip(&c->stream, size - 4);
     }
@@ -249,6 +259,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->frame      = av_mallocz(avctx->width * avctx->height);
     c->prev_frame = av_mallocz(avctx->width * avctx->height);
 
+    if (!c->frame || !c->prev_frame) {
+        av_freep(&c->frame);
+        av_freep(&c->prev_frame);
+        return AVERROR(ENOMEM);
+    }
+
     return 0;
 }
 
diff --git a/libavcodec/vble.c b/libavcodec/vble.c
index c3451cb..c25ee98 100644
--- a/libavcodec/vble.c
+++ b/libavcodec/vble.c
@@ -2,20 +2,20 @@
  * VBLE Decoder
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,66 +28,63 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
-#include "huffyuvdsp.h"
+#include "get_bits.h"
 #include "internal.h"
+#include "lossless_videodsp.h"
 #include "mathops.h"
+#include "thread.h"
 
 typedef struct VBLEContext {
     AVCodecContext *avctx;
-    HuffYUVDSPContext hdsp;
+    LLVidDSPContext llviddsp;
 
     int            size;
-    uint8_t        *val; /* First holds the lengths of vlc symbols and then their values */
+    uint8_t        *val; ///< This array first holds the lengths of vlc symbols and then their value.
 } VBLEContext;
 
-static uint8_t vble_read_reverse_unary(BitstreamContext *bc)
-{
-    /* At most we need to read 9 bits total to get indices up to 8 */
-    uint8_t val = bitstream_peek(bc, 8);
-
-    if (val) {
-        val = 7 - av_log2_16bit(ff_reverse[val]);
-        bitstream_skip(bc, val + 1);
-        return val;
-    } else {
-        bitstream_skip(bc, 8);
-        if (bitstream_read_bit(bc))
-            return 8;
-    }
-
-    /* Return something larger than 8 on error */
-    return UINT8_MAX;
-}
-
-static int vble_unpack(VBLEContext *ctx, BitstreamContext *bc)
+static int vble_unpack(VBLEContext *ctx, GetBitContext *gb)
 {
     int i;
+    int allbits = 0;
+    static const uint8_t LUT[256] = {
+        8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+    };
 
     /* Read all the lengths in first */
     for (i = 0; i < ctx->size; i++) {
-        ctx->val[i] = vble_read_reverse_unary(bc);
-
-        if (ctx->val[i] == UINT8_MAX)
-            return -1;
-    }
-
-    for (i = 0; i < ctx->size; i++) {
-        /* Check we have enough bits left */
-        if (bitstream_bits_left(bc) < ctx->val[i])
-            return -1;
-
-        /* get_bits can't take a length of 0 */
-        if (ctx->val[i])
-            ctx->val[i] = (1 << ctx->val[i]) + bitstream_read(bc, ctx->val[i]) - 1;
+        /* At most we need to read 9 bits total to get indices up to 8 */
+        int val = show_bits(gb, 8);
+
+        // read reverse unary
+        if (val) {
+            val = LUT[val];
+            skip_bits(gb, val + 1);
+            ctx->val[i] = val;
+        } else {
+            skip_bits(gb, 8);
+            if (!get_bits1(gb))
+                return -1;
+            ctx->val[i] = 8;
+        }
+        allbits += ctx->val[i];
     }
 
+    /* Check we have enough bits left */
+    if (get_bits_left(gb) < allbits)
+        return -1;
     return 0;
 }
 
 static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
-                               int plane, int offset,
-                               int width, int height)
+                               GetBitContext *gb, int plane,
+                               int offset, int width, int height)
 {
     uint8_t *dst = pic->data[plane];
     uint8_t *val = ctx->val + offset;
@@ -95,14 +92,18 @@ static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
     int i, j, left, left_top;
 
     for (i = 0; i < height; i++) {
-        for (j = 0; j < width; j++)
-            val[j] = (val[j] >> 1) ^ -(val[j] & 1);
-
+        for (j = 0; j < width; j++) {
+            /* get_bits can't take a length of 0 */
+            if (val[j]) {
+                int v = (1 << val[j]) + get_bits(gb, val[j]) - 1;
+                val[j] = (v >> 1) ^ -(v & 1);
+            }
+        }
         if (i) {
             left = 0;
             left_top = dst[-stride];
-            ctx->hdsp.add_hfyu_median_pred(dst, dst - stride, val,
-                                           width, &left, &left_top);
+            ctx->llviddsp.add_median_pred(dst, dst - stride, val,
+                                          width, &left, &left_top);
         } else {
             dst[0] = val[0];
             for (j = 1; j < width; j++)
@@ -118,18 +119,23 @@ static int vble_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 {
     VBLEContext *ctx = avctx->priv_data;
     AVFrame *pic     = data;
-    BitstreamContext bc;
+    GetBitContext gb;
     const uint8_t *src = avpkt->data;
     int version;
     int offset = 0;
     int width_uv = avctx->width / 2, height_uv = avctx->height / 2;
+    int ret;
+    ThreadFrame frame = { .f = data };
 
-    /* Allocate buffer */
-    if (ff_get_buffer(avctx, pic, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
+    if (avpkt->size < 4 || avpkt->size - 4 > INT_MAX/8) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid packet size\n");
+        return AVERROR_INVALIDDATA;
     }
 
+    /* Allocate buffer */
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
     /* Set flags */
     pic->key_frame = 1;
     pic->pict_type = AV_PICTURE_TYPE_I;
@@ -140,24 +146,24 @@ static int vble_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (version != 1)
         av_log(avctx, AV_LOG_WARNING, "Unsupported VBLE Version: %d\n", version);
 
-    bitstream_init8(&bc, src + 4, avpkt->size - 4);
+    init_get_bits(&gb, src + 4, (avpkt->size - 4) * 8);
 
     /* Unpack */
-    if (vble_unpack(ctx, &bc) < 0) {
+    if (vble_unpack(ctx, &gb) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Invalid Code\n");
         return AVERROR_INVALIDDATA;
     }
 
     /* Restore planes. Should be almost identical to Huffyuv's. */
-    vble_restore_plane(ctx, pic, 0, offset, avctx->width, avctx->height);
+    vble_restore_plane(ctx, pic, &gb, 0, offset, avctx->width, avctx->height);
 
     /* Chroma */
     if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
         offset += avctx->width * avctx->height;
-        vble_restore_plane(ctx, pic, 1, offset, width_uv, height_uv);
+        vble_restore_plane(ctx, pic, &gb, 1, offset, width_uv, height_uv);
 
         offset += width_uv * height_uv;
-        vble_restore_plane(ctx, pic, 2, offset, width_uv, height_uv);
+        vble_restore_plane(ctx, pic, &gb, 2, offset, width_uv, height_uv);
     }
 
     *got_frame       = 1;
@@ -179,7 +185,7 @@ static av_cold int vble_decode_init(AVCodecContext *avctx)
 
     /* Stash for later use */
     ctx->avctx = avctx;
-    ff_huffyuvdsp_init(&ctx->hdsp);
+    ff_llviddsp_init(&ctx->llviddsp);
 
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->bits_per_raw_sample = 8;
@@ -187,7 +193,7 @@ static av_cold int vble_decode_init(AVCodecContext *avctx)
     ctx->size = av_image_get_buffer_size(avctx->pix_fmt,
                                          avctx->width, avctx->height, 1);
 
-    ctx->val = av_malloc(ctx->size * sizeof(*ctx->val));
+    ctx->val = av_malloc_array(ctx->size, sizeof(*ctx->val));
 
     if (!ctx->val) {
         av_log(avctx, AV_LOG_ERROR, "Could not allocate values buffer.\n");
@@ -207,6 +213,7 @@ AVCodec ff_vble_decoder = {
     .init           = vble_decode_init,
     .close          = vble_decode_close,
     .decode         = vble_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(vble_decode_init),
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 8eba65c..e102b93 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,10 +30,10 @@
 #include "internal.h"
 #include "avcodec.h"
 #include "mpegvideo.h"
-#include "unary_legacy.h"
 #include "vc1.h"
 #include "vc1data.h"
 #include "wmv2data.h"
+#include "unary.h"
 #include "simple_idct.h"
 
 /***********************************************************************/
@@ -43,21 +43,6 @@
  * @{
  */
 
-/**
- * Imode types
- * @{
- */
-enum Imode {
-    IMODE_RAW,
-    IMODE_NORM2,
-    IMODE_DIFF2,
-    IMODE_NORM6,
-    IMODE_DIFF6,
-    IMODE_ROWSKIP,
-    IMODE_COLSKIP
-};
-/** @} */ //imode defines
-
 /** Decode rows by checking if they are skipped
  * @param plane Buffer to store decoded bits
  * @param[in] width Width of this buffer
@@ -133,12 +118,16 @@ static int bitplane_decoding(uint8_t* data, int *raw_flag, VC1Context *v)
     case IMODE_NORM2:
         if ((height * width) & 1) {
             *planep++ = get_bits1(gb);
-            offset    = 1;
+            y = offset = 1;
+            if (offset == width) {
+                offset = 0;
+                planep += stride - width;
+            }
         }
         else
-            offset = 0;
+            y = offset = 0;
         // decode bitplane as one long line
-        for (y = offset; y < height * width; y += 2) {
+        for (; y < height * width; y += 2) {
             code = get_vlc2(gb, ff_vc1_norm2_vlc.table, VC1_NORM2_VLC_BITS, 1);
             *planep++ = code & 1;
             offset++;
@@ -244,37 +233,34 @@ static int vop_dquant_decoding(VC1Context *v)
     int pqdiff;
 
     //variable size
-    if (v->dquant == 2) {
-        pqdiff = get_bits(gb, 3);
-        if (pqdiff == 7)
-            v->altpq = get_bits(gb, 5);
-        else
-            v->altpq = v->pq + pqdiff + 1;
-    } else {
+    if (v->dquant != 2) {
         v->dquantfrm = get_bits1(gb);
-        if (v->dquantfrm) {
-            v->dqprofile = get_bits(gb, 2);
-            switch (v->dqprofile) {
-            case DQPROFILE_SINGLE_EDGE:
-            case DQPROFILE_DOUBLE_EDGES:
-                v->dqsbedge = get_bits(gb, 2);
-                break;
-            case DQPROFILE_ALL_MBS:
-                v->dqbilevel = get_bits1(gb);
-                if (!v->dqbilevel)
-                    v->halfpq = 0;
-            default:
-                break; //Forbidden ?
-            }
-            if (v->dqbilevel || v->dqprofile != DQPROFILE_ALL_MBS) {
-                pqdiff = get_bits(gb, 3);
-                if (pqdiff == 7)
-                    v->altpq = get_bits(gb, 5);
-                else
-                    v->altpq = v->pq + pqdiff + 1;
+        if (!v->dquantfrm)
+            return 0;
+
+        v->dqprofile = get_bits(gb, 2);
+        switch (v->dqprofile) {
+        case DQPROFILE_SINGLE_EDGE:
+        case DQPROFILE_DOUBLE_EDGES:
+            v->dqsbedge = get_bits(gb, 2);
+            break;
+        case DQPROFILE_ALL_MBS:
+            v->dqbilevel = get_bits1(gb);
+            if (!v->dqbilevel) {
+                v->halfpq = 0;
+                return 0;
             }
+        default:
+            break; //Forbidden ?
         }
     }
+
+    pqdiff = get_bits(gb, 3);
+    if (pqdiff == 7)
+        v->altpq = get_bits(gb, 5);
+    else
+        v->altpq = v->pq + pqdiff + 1;
+
     return 0;
 }
 
@@ -289,7 +275,7 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);
  */
 int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitContext *gb)
 {
-    av_log(avctx, AV_LOG_DEBUG, "Header: %0X\n", show_bits(gb, 32));
+    av_log(avctx, AV_LOG_DEBUG, "Header: %0X\n", show_bits_long(gb, 32));
     v->profile = get_bits(gb, 2);
     if (v->profile == PROFILE_COMPLEX) {
         av_log(avctx, AV_LOG_WARNING, "WMV3 Complex Profile is not fully supported\n");
@@ -300,6 +286,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
         v->zz_4x8 = ff_vc1_adv_progressive_4x8_zz;
         return decode_sequence_header_adv(v, gb);
     } else {
+        v->chromaformat = 1;
         v->zz_8x4 = ff_wmv2_scantableA;
         v->zz_4x8 = ff_wmv2_scantableB;
         v->res_y411   = get_bits1(gb);
@@ -327,11 +314,11 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->multires        = get_bits1(gb);
     v->res_fasttx      = get_bits1(gb);
     if (!v->res_fasttx) {
-        v->vc1dsp.vc1_inv_trans_8x8    = ff_simple_idct_8;
+        v->vc1dsp.vc1_inv_trans_8x8    = ff_simple_idct_int16_8bit;
         v->vc1dsp.vc1_inv_trans_8x4    = ff_simple_idct84_add;
         v->vc1dsp.vc1_inv_trans_4x8    = ff_simple_idct48_add;
         v->vc1dsp.vc1_inv_trans_4x4    = ff_simple_idct44_add;
-        v->vc1dsp.vc1_inv_trans_8x8_dc = ff_simple_idct_add_8;
+        v->vc1dsp.vc1_inv_trans_8x8_dc = ff_simple_idct_add_int16_8bit;
         v->vc1dsp.vc1_inv_trans_8x4_dc = ff_simple_idct84_add;
         v->vc1dsp.vc1_inv_trans_4x8_dc = ff_simple_idct48_add;
         v->vc1dsp.vc1_inv_trans_4x4_dc = ff_simple_idct44_add;
@@ -344,8 +331,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
         return -1;
     }
     v->extended_mv     = get_bits1(gb); //common
-    if (!v->profile && v->extended_mv)
-    {
+    if (!v->profile && v->extended_mv) {
         av_log(avctx, AV_LOG_ERROR,
                "Extended MVs unavailable in Simple Profile\n");
         return -1;
@@ -354,8 +340,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->vstransform     = get_bits1(gb); //common
 
     v->res_transtab    = get_bits1(gb);
-    if (v->res_transtab)
-    {
+    if (v->res_transtab) {
         av_log(avctx, AV_LOG_ERROR,
                "1 for reserved RES_TRANSTAB is forbidden\n");
         return -1;
@@ -376,8 +361,13 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->finterpflag = get_bits1(gb); //common
 
     if (v->res_sprite) {
-        v->s.avctx->width  = v->s.avctx->coded_width  = get_bits(gb, 11);
-        v->s.avctx->height = v->s.avctx->coded_height = get_bits(gb, 11);
+        int w = get_bits(gb, 11);
+        int h = get_bits(gb, 11);
+        int ret = ff_set_dimensions(v->s.avctx, w, h);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", w, h);
+            return ret;
+        }
         skip_bits(gb, 5); //frame rate
         v->res_x8 = get_bits1(gb);
         if (get_bits1(gb)) { // something to do with DC VLC selection
@@ -389,11 +379,6 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     } else {
         v->res_rtm_flag = get_bits1(gb); //reserved
     }
-    if (!v->res_rtm_flag) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Old WMV3 version detected, some frames may be decoded incorrectly\n");
-        //return -1;
-    }
     //TODO: figure out what they mean (always 0x402F)
     if (!v->res_fasttx)
         skip_bits(gb, 16);
@@ -429,10 +414,8 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
     v->bitrtq_postproc       = get_bits(gb, 5); //common
     v->postprocflag          = get_bits1(gb);   //common
 
-    v->s.avctx->coded_width  = (get_bits(gb, 12) + 1) << 1;
-    v->s.avctx->coded_height = (get_bits(gb, 12) + 1) << 1;
-    v->s.avctx->width        = v->s.avctx->coded_width;
-    v->s.avctx->height       = v->s.avctx->coded_height;
+    v->max_coded_width       = (get_bits(gb, 12) + 1) << 1;
+    v->max_coded_height      = (get_bits(gb, 12) + 1) << 1;
     v->broadcast             = get_bits1(gb);
     v->interlace             = get_bits1(gb);
     v->tfcntrflag            = get_bits1(gb);
@@ -493,7 +476,6 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
                 }
             }
             if (v->broadcast) { // Pulldown may be present
-                v->s.avctx->framerate.num  *= 2;
                 v->s.avctx->ticks_per_frame = 2;
             }
         }
@@ -522,6 +504,8 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
 int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContext *gb)
 {
     int i;
+    int w,h;
+    int ret;
 
     av_log(avctx, AV_LOG_DEBUG, "Entry point: %08X\n", show_bits_long(gb, 32));
     v->broken_link    = get_bits1(gb);
@@ -529,6 +513,8 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
     v->panscanflag    = get_bits1(gb);
     v->refdist_flag   = get_bits1(gb);
     v->s.loop_filter  = get_bits1(gb);
+    if (v->s.avctx->skip_loop_filter >= AVDISCARD_ALL)
+        v->s.loop_filter = 0;
     v->fastuvmc       = get_bits1(gb);
     v->extended_mv    = get_bits1(gb);
     v->dquant         = get_bits(gb, 2);
@@ -542,10 +528,18 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
         }
     }
 
-    if (get_bits1(gb)) {
-        avctx->width  = avctx->coded_width  = (get_bits(gb, 12) + 1) << 1;
-        avctx->height = avctx->coded_height = (get_bits(gb, 12) + 1) << 1;
+    if(get_bits1(gb)){
+        w = (get_bits(gb, 12)+1)<<1;
+        h = (get_bits(gb, 12)+1)<<1;
+    } else {
+        w = v->max_coded_width;
+        h = v->max_coded_height;
+    }
+    if ((ret = ff_set_dimensions(avctx, w, h)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", w, h);
+        return ret;
     }
+
     if (v->extended_mv)
         v->extended_dmv = get_bits1(gb);
     if ((v->range_mapy_flag = get_bits1(gb))) {
@@ -572,13 +566,13 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
         int scale, shift, i;                                                  \
         if (!lumscale) {                                                      \
             scale = -64;                                                      \
-            shift = (255 - lumshift * 2) << 6;                                \
+            shift = (255 - lumshift * 2) * 64;                                \
             if (lumshift > 31)                                                \
                 shift += 128 << 6;                                            \
         } else {                                                              \
             scale = lumscale + 32;                                            \
             if (lumshift > 31)                                                \
-                shift = (lumshift - 64) << 6;                                 \
+                shift = (lumshift - 64) * 64;                                 \
             else                                                              \
                 shift = lumshift << 6;                                        \
         }                                                                     \
@@ -597,32 +591,44 @@ static void rotate_luts(VC1Context *v)
             C = A;                                            \
         } else {                                              \
             DEF;                                              \
-            memcpy(&tmp, &L  , sizeof(tmp));                  \
-            memcpy(&L  , &N  , sizeof(tmp));                  \
-            memcpy(&N  , &tmp, sizeof(tmp));                  \
+            memcpy(&tmp, L   , sizeof(tmp));                  \
+            memcpy(L   , N   , sizeof(tmp));                  \
+            memcpy(N   , &tmp, sizeof(tmp));                  \
             C = N;                                            \
         }                                                     \
     } while(0)
 
-    ROTATE(int tmp,             v->last_use_ic, v->next_use_ic, v->curr_use_ic, v->aux_use_ic);
+    ROTATE(int tmp,             &v->last_use_ic, &v->next_use_ic, v->curr_use_ic, &v->aux_use_ic);
     ROTATE(uint8_t tmp[2][256], v->last_luty,   v->next_luty,   v->curr_luty,   v->aux_luty);
     ROTATE(uint8_t tmp[2][256], v->last_lutuv,  v->next_lutuv,  v->curr_lutuv,  v->aux_lutuv);
 
     INIT_LUT(32, 0, v->curr_luty[0], v->curr_lutuv[0], 0);
     INIT_LUT(32, 0, v->curr_luty[1], v->curr_lutuv[1], 0);
-    v->curr_use_ic = 0;
-    if (v->curr_luty == v->next_luty) {
-        // If we just initialized next_lut, clear next_use_ic to match.
-        v->next_use_ic = 0;
+    *v->curr_use_ic = 0;
+}
+
+static int read_bfraction(VC1Context *v, GetBitContext* gb) {
+    int bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
+
+    if (bfraction_lut_index == 21 || bfraction_lut_index < 0) {
+        av_log(v->s.avctx, AV_LOG_ERROR, "bfraction invalid\n");
+        return AVERROR_INVALIDDATA;
     }
+    v->bfraction_lut_index = bfraction_lut_index;
+    v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+    return 0;
 }
 
 int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 {
     int pqindex, lowquant, status;
 
+    v->field_mode = 0;
+    v->fcm = PROGRESSIVE;
     if (v->finterpflag)
         v->interpfrm = get_bits1(gb);
+    if (!v->s.avctx->codec)
+        return -1;
     if (v->s.avctx->codec_id == AV_CODEC_ID_MSS2)
         v->respic   =
         v->rangered =
@@ -632,22 +638,19 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
     v->rangeredfrm = 0;
     if (v->rangered)
         v->rangeredfrm = get_bits1(gb);
-    v->s.pict_type = get_bits1(gb);
-    if (v->s.avctx->max_b_frames) {
-        if (!v->s.pict_type) {
-            if (get_bits1(gb))
-                v->s.pict_type = AV_PICTURE_TYPE_I;
-            else
-                v->s.pict_type = AV_PICTURE_TYPE_B;
+    if (get_bits1(gb)) {
+        v->s.pict_type = AV_PICTURE_TYPE_P;
+    } else {
+        if (v->s.avctx->max_b_frames && !get_bits1(gb)) {
+            v->s.pict_type = AV_PICTURE_TYPE_B;
         } else
-            v->s.pict_type = AV_PICTURE_TYPE_P;
-    } else
-        v->s.pict_type = v->s.pict_type ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+            v->s.pict_type = AV_PICTURE_TYPE_I;
+    }
 
     v->bi_type = 0;
     if (v->s.pict_type == AV_PICTURE_TYPE_B) {
-        v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-        v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+        if (read_bfraction(v, gb) < 0)
+            return AVERROR_INVALIDDATA;
         if (v->bfraction == 0) {
             v->s.pict_type = AV_PICTURE_TYPE_BI;
         }
@@ -672,19 +675,25 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
         v->pq = ff_vc1_pquant_table[0][pqindex];
     else
         v->pq = ff_vc1_pquant_table[1][pqindex];
-
-    v->pquantizer = 1;
-    if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
-        v->pquantizer = pqindex < 9;
-    if (v->quantizer_mode == QUANT_NON_UNIFORM)
-        v->pquantizer = 0;
     v->pqindex = pqindex;
     if (pqindex < 9)
         v->halfpq = get_bits1(gb);
     else
         v->halfpq = 0;
-    if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
+    switch (v->quantizer_mode) {
+    case QUANT_FRAME_IMPLICIT:
+        v->pquantizer = pqindex < 9;
+        break;
+    case QUANT_NON_UNIFORM:
+        v->pquantizer = 0;
+        break;
+    case QUANT_FRAME_EXPLICIT:
         v->pquantizer = get_bits1(gb);
+        break;
+    default:
+        v->pquantizer = 1;
+        break;
+    }
     v->dquantfrm = 0;
     if (v->extended_mv == 1)
         v->mvrange = get_unary(gb, 0, 3);
@@ -708,9 +717,7 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 
     switch (v->s.pict_type) {
     case AV_PICTURE_TYPE_P:
-        if (v->pq < 5)       v->tt_index = 0;
-        else if (v->pq < 13) v->tt_index = 1;
-        else                 v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         lowquant = (v->pq > 12) ? 0 : 1;
         v->mv_mode = ff_vc1_mv_pmode_table[lowquant][get_unary(gb, 1, 4)];
@@ -724,16 +731,15 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             INIT_LUT(v->lumscale, v->lumshift, v->last_luty[1], v->last_lutuv[1], 1);
         }
         v->qs_last = v->s.quarter_sample;
-        if (v->mv_mode == MV_PMODE_1MV_HPEL || v->mv_mode == MV_PMODE_1MV_HPEL_BILIN)
-            v->s.quarter_sample = 0;
-        else if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
-            if (v->mv_mode2 == MV_PMODE_1MV_HPEL || v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN)
-                v->s.quarter_sample = 0;
-            else
-                v->s.quarter_sample = 1;
-        } else
-            v->s.quarter_sample = 1;
-        v->s.mspel = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN || (v->mv_mode == MV_PMODE_INTENSITY_COMP && v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN));
+        if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
+            v->s.quarter_sample = (v->mv_mode2 != MV_PMODE_1MV_HPEL &&
+                                   v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+            v->s.mspel          = (v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+        } else {
+            v->s.quarter_sample = (v->mv_mode != MV_PMODE_1MV_HPEL &&
+                                   v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+            v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+        }
 
         if ((v->mv_mode  == MV_PMODE_INTENSITY_COMP &&
              v->mv_mode2 == MV_PMODE_MIXED_MV)      ||
@@ -755,28 +761,27 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 
         /* Hopefully this is correct for P-frames */
         v->s.mv_table_index = get_bits(gb, 2); //but using ff_vc1_ tables
-        v->cbpcy_vlc = &ff_vc1_cbpcy_p_vlc[get_bits(gb, 2)];
+        v->cbptab = get_bits(gb, 2);
+        v->cbpcy_vlc = &ff_vc1_cbpcy_p_vlc[v->cbptab];
 
         if (v->dquant) {
             av_log(v->s.avctx, AV_LOG_DEBUG, "VOP DQuant info\n");
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0; //FIXME Is that so ?
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0; //FIXME Is that so ?
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
         }
         break;
     case AV_PICTURE_TYPE_B:
-        if (v->pq < 5)       v->tt_index = 0;
-        else if (v->pq < 13) v->tt_index = 1;
-        else                 v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         v->mv_mode          = get_bits1(gb) ? MV_PMODE_1MV : MV_PMODE_1MV_HPEL_BILIN;
         v->qs_last          = v->s.quarter_sample;
@@ -795,19 +800,20 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
                "Imode: %i, Invert: %i\n", status>>1, status&1);
 
         v->s.mv_table_index = get_bits(gb, 2);
-        v->cbpcy_vlc        = &ff_vc1_cbpcy_p_vlc[get_bits(gb, 2)];
+        v->cbptab           = get_bits(gb, 2);
+        v->cbpcy_vlc        = &ff_vc1_cbpcy_p_vlc[v->cbptab];
 
         if (v->dquant) {
             av_log(v->s.avctx, AV_LOG_DEBUG, "VOP DQuant info\n");
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0;
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0;
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -836,15 +842,17 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
 {
     int pqindex, lowquant;
     int status;
-    int mbmodetab, imvtab, icbptab, twomvbptab, fourmvbptab; /* useful only for debugging */
     int field_mode, fcm;
 
     v->numref          = 0;
     v->p_frame_skipped = 0;
     if (v->second_field) {
-        v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+        if (v->fcm != ILACE_FIELD || v->field_mode!=1)
+            return -1;
         if (v->fptype & 4)
             v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_BI : AV_PICTURE_TYPE_B;
+        else
+            v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
         v->s.current_picture_ptr->f->pict_type = v->s.pict_type;
         if (!v->pic_header_flag)
             goto parse_common_info;
@@ -865,12 +873,15 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->field_mode = field_mode;
     v->fcm = fcm;
 
+    av_assert0(    v->s.mb_height == v->s.height + 15 >> 4
+                || v->s.mb_height == FFALIGN(v->s.height + 15 >> 4, 2));
     if (v->field_mode) {
         v->s.mb_height = FFALIGN(v->s.height + 15 >> 4, 2);
         v->fptype = get_bits(gb, 3);
-        v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
         if (v->fptype & 4) // B-picture
             v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_BI : AV_PICTURE_TYPE_B;
+        else
+            v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
     } else {
         v->s.mb_height = v->s.height + 15 >> 4;
         switch (get_unary(gb, 0, 4)) {
@@ -901,6 +912,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             v->tff = get_bits1(gb);
             v->rff = get_bits1(gb);
         }
+    } else {
+        v->tff = 1;
     }
     if (v->panscanflag) {
         avpriv_report_missing_feature(v->s.avctx, "Pan-scan");
@@ -912,6 +925,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->rnd = get_bits1(gb);
     if (v->interlace)
         v->uvsamp = get_bits1(gb);
+    if(!ff_vc1_bfraction_vlc.table)
+        return 0; //parsing only, vlc tables havnt been allocated
     if (v->field_mode) {
         if (!v->refdist_flag)
             v->refdist = 0;
@@ -921,8 +936,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 v->refdist += get_unary(gb, 0, 16);
         }
         if ((v->s.pict_type == AV_PICTURE_TYPE_B) || (v->s.pict_type == AV_PICTURE_TYPE_BI)) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             v->frfd = (v->bfraction * v->refdist) >> 8;
             v->brfd = v->refdist - v->frfd - 1;
             if (v->brfd < 0)
@@ -934,8 +949,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         if (v->finterpflag)
             v->interpfrm = get_bits1(gb);
         if (v->s.pict_type == AV_PICTURE_TYPE_B) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             if (v->bfraction == 0) {
                 v->s.pict_type = AV_PICTURE_TYPE_BI; /* XXX: should not happen here */
             }
@@ -948,24 +963,30 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     pqindex = get_bits(gb, 5);
     if (!pqindex)
         return -1;
-    v->pqindex = pqindex;
     if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
         v->pq = ff_vc1_pquant_table[0][pqindex];
     else
         v->pq = ff_vc1_pquant_table[1][pqindex];
-
-    v->pquantizer = 1;
-    if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
-        v->pquantizer = pqindex < 9;
-    if (v->quantizer_mode == QUANT_NON_UNIFORM)
-        v->pquantizer = 0;
     v->pqindex = pqindex;
     if (pqindex < 9)
         v->halfpq = get_bits1(gb);
     else
         v->halfpq = 0;
-    if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
+    switch (v->quantizer_mode) {
+    case QUANT_FRAME_IMPLICIT:
+        v->pquantizer = pqindex < 9;
+        break;
+    case QUANT_NON_UNIFORM:
+        v->pquantizer = 0;
+        break;
+    case QUANT_FRAME_EXPLICIT:
         v->pquantizer = get_bits1(gb);
+        break;
+    default:
+        v->pquantizer = 1;
+        break;
+    }
+    v->dquantfrm = 0;
     if (v->postprocflag)
         v->postproc = get_bits(gb, 2);
 
@@ -984,7 +1005,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 return -1;
             av_log(v->s.avctx, AV_LOG_DEBUG, "FIELDTX plane encoding: "
                    "Imode: %i, Invert: %i\n", status>>1, status&1);
-        }
+        } else
+            v->fieldtx_is_raw = 0;
         status = bitplane_decoding(v->acpred_plane, &v->acpred_is_raw, v);
         if (status < 0)
             return -1;
@@ -1030,23 +1052,25 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                     v->last_use_ic = 1;
                 }
                 status = bitplane_decoding(v->s.mbskip_table, &v->skip_is_raw, v);
+                if (status < 0)
+                    return -1;
                 av_log(v->s.avctx, AV_LOG_DEBUG, "SKIPMB plane encoding: "
                        "Imode: %i, Invert: %i\n", status>>1, status&1);
-                mbmodetab = get_bits(gb, 2);
+                v->mbmodetab = get_bits(gb, 2);
                 if (v->fourmvswitch)
-                    v->mbmode_vlc = &ff_vc1_intfr_4mv_mbmode_vlc[mbmodetab];
+                    v->mbmode_vlc = &ff_vc1_intfr_4mv_mbmode_vlc[v->mbmodetab];
                 else
-                    v->mbmode_vlc = &ff_vc1_intfr_non4mv_mbmode_vlc[mbmodetab];
-                imvtab         = get_bits(gb, 2);
-                v->imv_vlc     = &ff_vc1_1ref_mvdata_vlc[imvtab];
+                    v->mbmode_vlc = &ff_vc1_intfr_non4mv_mbmode_vlc[v->mbmodetab];
+                v->imvtab      = get_bits(gb, 2);
+                v->imv_vlc     = &ff_vc1_1ref_mvdata_vlc[v->imvtab];
                 // interlaced p-picture cbpcy range is [1, 63]
-                icbptab        = get_bits(gb, 3);
-                v->cbpcy_vlc   = &ff_vc1_icbpcy_vlc[icbptab];
-                twomvbptab     = get_bits(gb, 2);
-                v->twomvbp_vlc = &ff_vc1_2mv_block_pattern_vlc[twomvbptab];
+                v->icbptab     = get_bits(gb, 3);
+                v->cbpcy_vlc   = &ff_vc1_icbpcy_vlc[v->icbptab];
+                v->twomvbptab     = get_bits(gb, 2);
+                v->twomvbp_vlc = &ff_vc1_2mv_block_pattern_vlc[v->twomvbptab];
                 if (v->fourmvswitch) {
-                    fourmvbptab     = get_bits(gb, 2);
-                    v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[fourmvbptab];
+                    v->fourmvbptab     = get_bits(gb, 2);
+                    v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[v->fourmvbptab];
                 }
             }
         }
@@ -1055,12 +1079,7 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         v->range_x = 1 << (v->k_x - 1);
         v->range_y = 1 << (v->k_y - 1);
 
-        if (v->pq < 5)
-            v->tt_index = 0;
-        else if (v->pq < 13)
-            v->tt_index = 1;
-        else
-            v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
         if (v->fcm != ILACE_FRAME) {
             int mvmode;
             mvmode     = get_unary(gb, 1, 4);
@@ -1096,7 +1115,7 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                         INIT_LUT(v->lumscale2, v->lumshift2, v->curr_luty[v->cur_field_type^1], v->curr_lutuv[v->cur_field_type^1], 0);
                         INIT_LUT(v->lumscale , v->lumshift , v->last_luty[v->cur_field_type  ], v->last_lutuv[v->cur_field_type  ], 1);
                     }
-                    v->next_use_ic = v->curr_use_ic = 1;
+                    v->next_use_ic = *v->curr_use_ic = 1;
                 } else {
                     INIT_LUT(v->lumscale , v->lumshift , v->last_luty[0], v->last_lutuv[0], 1);
                     INIT_LUT(v->lumscale2, v->lumshift2, v->last_luty[1], v->last_lutuv[1], 1);
@@ -1104,18 +1123,15 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 v->last_use_ic = 1;
             }
             v->qs_last = v->s.quarter_sample;
-            if (v->mv_mode == MV_PMODE_1MV_HPEL || v->mv_mode == MV_PMODE_1MV_HPEL_BILIN)
-                v->s.quarter_sample = 0;
-            else if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
-                if (v->mv_mode2 == MV_PMODE_1MV_HPEL || v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN)
-                    v->s.quarter_sample = 0;
-                else
-                    v->s.quarter_sample = 1;
-            } else
-                v->s.quarter_sample = 1;
-            v->s.mspel = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN
-                           || (v->mv_mode == MV_PMODE_INTENSITY_COMP
-                               && v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN));
+            if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
+                v->s.quarter_sample = (v->mv_mode2 != MV_PMODE_1MV_HPEL &&
+                                       v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+                v->s.mspel          = (v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+            } else {
+                v->s.quarter_sample = (v->mv_mode != MV_PMODE_1MV_HPEL &&
+                                       v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+                v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+            }
         }
         if (v->fcm == PROGRESSIVE) { // progressive
             if ((v->mv_mode == MV_PMODE_INTENSITY_COMP &&
@@ -1138,27 +1154,28 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
 
             /* Hopefully this is correct for P-frames */
             v->s.mv_table_index = get_bits(gb, 2); //but using ff_vc1_ tables
-            v->cbpcy_vlc        = &ff_vc1_cbpcy_p_vlc[get_bits(gb, 2)];
+            v->cbptab           = get_bits(gb, 2);
+            v->cbpcy_vlc        = &ff_vc1_cbpcy_p_vlc[v->cbptab];
         } else if (v->fcm == ILACE_FRAME) { // frame interlaced
             v->qs_last          = v->s.quarter_sample;
             v->s.quarter_sample = 1;
             v->s.mspel          = 1;
         } else {    // field interlaced
-            mbmodetab = get_bits(gb, 3);
-            imvtab = get_bits(gb, 2 + v->numref);
+            v->mbmodetab = get_bits(gb, 3);
+            v->imvtab = get_bits(gb, 2 + v->numref);
             if (!v->numref)
-                v->imv_vlc = &ff_vc1_1ref_mvdata_vlc[imvtab];
+                v->imv_vlc = &ff_vc1_1ref_mvdata_vlc[v->imvtab];
             else
-                v->imv_vlc = &ff_vc1_2ref_mvdata_vlc[imvtab];
-            icbptab = get_bits(gb, 3);
-            v->cbpcy_vlc = &ff_vc1_icbpcy_vlc[icbptab];
+                v->imv_vlc = &ff_vc1_2ref_mvdata_vlc[v->imvtab];
+            v->icbptab = get_bits(gb, 3);
+            v->cbpcy_vlc = &ff_vc1_icbpcy_vlc[v->icbptab];
             if ((v->mv_mode == MV_PMODE_INTENSITY_COMP &&
                 v->mv_mode2 == MV_PMODE_MIXED_MV) || v->mv_mode == MV_PMODE_MIXED_MV) {
-                fourmvbptab     = get_bits(gb, 2);
-                v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[fourmvbptab];
-                v->mbmode_vlc = &ff_vc1_if_mmv_mbmode_vlc[mbmodetab];
+                v->fourmvbptab     = get_bits(gb, 2);
+                v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[v->fourmvbptab];
+                v->mbmode_vlc = &ff_vc1_if_mmv_mbmode_vlc[v->mbmodetab];
             } else {
-                v->mbmode_vlc = &ff_vc1_if_1mv_mbmode_vlc[mbmodetab];
+                v->mbmode_vlc = &ff_vc1_if_1mv_mbmode_vlc[v->mbmodetab];
             }
         }
         if (v->dquant) {
@@ -1166,12 +1183,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0; //FIXME Is that so ?
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0; //FIXME Is that so ?
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -1179,8 +1196,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         break;
     case AV_PICTURE_TYPE_B:
         if (v->fcm == ILACE_FRAME) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             if (v->bfraction == 0) {
                 return -1;
             }
@@ -1194,15 +1211,11 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         v->range_x = 1 << (v->k_x - 1);
         v->range_y = 1 << (v->k_y - 1);
 
-        if (v->pq < 5)
-            v->tt_index = 0;
-        else if (v->pq < 13)
-            v->tt_index = 1;
-        else
-            v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         if (v->field_mode) {
             int mvmode;
+            av_log(v->s.avctx, AV_LOG_DEBUG, "B Fields\n");
             if (v->extended_dmv)
                 v->dmvrange = get_unary(gb, 0, 3);
             mvmode = get_unary(gb, 1, 3);
@@ -1210,24 +1223,24 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             v->mv_mode          = ff_vc1_mv_pmode_table2[lowquant][mvmode];
             v->qs_last          = v->s.quarter_sample;
             v->s.quarter_sample = (v->mv_mode == MV_PMODE_1MV || v->mv_mode == MV_PMODE_MIXED_MV);
-            v->s.mspel          = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN || v->mv_mode == MV_PMODE_1MV_HPEL);
+            v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
             status = bitplane_decoding(v->forward_mb_plane, &v->fmb_is_raw, v);
             if (status < 0)
                 return -1;
             av_log(v->s.avctx, AV_LOG_DEBUG, "MB Forward Type plane encoding: "
                    "Imode: %i, Invert: %i\n", status>>1, status&1);
-            mbmodetab = get_bits(gb, 3);
+            v->mbmodetab = get_bits(gb, 3);
             if (v->mv_mode == MV_PMODE_MIXED_MV)
-                v->mbmode_vlc = &ff_vc1_if_mmv_mbmode_vlc[mbmodetab];
+                v->mbmode_vlc = &ff_vc1_if_mmv_mbmode_vlc[v->mbmodetab];
             else
-                v->mbmode_vlc = &ff_vc1_if_1mv_mbmode_vlc[mbmodetab];
-            imvtab       = get_bits(gb, 3);
-            v->imv_vlc   = &ff_vc1_2ref_mvdata_vlc[imvtab];
-            icbptab      = get_bits(gb, 3);
-            v->cbpcy_vlc = &ff_vc1_icbpcy_vlc[icbptab];
+                v->mbmode_vlc = &ff_vc1_if_1mv_mbmode_vlc[v->mbmodetab];
+            v->imvtab     = get_bits(gb, 3);
+            v->imv_vlc   = &ff_vc1_2ref_mvdata_vlc[v->imvtab];
+            v->icbptab   = get_bits(gb, 3);
+            v->cbpcy_vlc = &ff_vc1_icbpcy_vlc[v->icbptab];
             if (v->mv_mode == MV_PMODE_MIXED_MV) {
-                fourmvbptab     = get_bits(gb, 2);
-                v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[fourmvbptab];
+                v->fourmvbptab     = get_bits(gb, 2);
+                v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[v->fourmvbptab];
             }
             v->numref = 1; // interlaced field B pictures are always 2-ref
         } else if (v->fcm == ILACE_FRAME) {
@@ -1251,17 +1264,17 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 return -1;
             av_log(v->s.avctx, AV_LOG_DEBUG, "MB Skip plane encoding: "
                    "Imode: %i, Invert: %i\n", status>>1, status&1);
-            mbmodetab       = get_bits(gb, 2);
-            v->mbmode_vlc   = &ff_vc1_intfr_non4mv_mbmode_vlc[mbmodetab];
-            imvtab          = get_bits(gb, 2);
-            v->imv_vlc      = &ff_vc1_1ref_mvdata_vlc[imvtab];
+            v->mbmodetab       = get_bits(gb, 2);
+            v->mbmode_vlc   = &ff_vc1_intfr_non4mv_mbmode_vlc[v->mbmodetab];
+            v->imvtab       = get_bits(gb, 2);
+            v->imv_vlc      = &ff_vc1_1ref_mvdata_vlc[v->imvtab];
             // interlaced p/b-picture cbpcy range is [1, 63]
-            icbptab         = get_bits(gb, 3);
-            v->cbpcy_vlc    = &ff_vc1_icbpcy_vlc[icbptab];
-            twomvbptab      = get_bits(gb, 2);
-            v->twomvbp_vlc  = &ff_vc1_2mv_block_pattern_vlc[twomvbptab];
-            fourmvbptab     = get_bits(gb, 2);
-            v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[fourmvbptab];
+            v->icbptab      = get_bits(gb, 3);
+            v->cbpcy_vlc    = &ff_vc1_icbpcy_vlc[v->icbptab];
+            v->twomvbptab      = get_bits(gb, 2);
+            v->twomvbp_vlc  = &ff_vc1_2mv_block_pattern_vlc[v->twomvbptab];
+            v->fourmvbptab     = get_bits(gb, 2);
+            v->fourmvbp_vlc = &ff_vc1_4mv_block_pattern_vlc[v->fourmvbptab];
         } else {
             v->mv_mode          = get_bits1(gb) ? MV_PMODE_1MV : MV_PMODE_1MV_HPEL_BILIN;
             v->qs_last          = v->s.quarter_sample;
@@ -1278,7 +1291,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             av_log(v->s.avctx, AV_LOG_DEBUG, "MB Skip plane encoding: "
                    "Imode: %i, Invert: %i\n", status>>1, status&1);
             v->s.mv_table_index = get_bits(gb, 2);
-            v->cbpcy_vlc = &ff_vc1_cbpcy_p_vlc[get_bits(gb, 2)];
+            v->cbptab = get_bits(gb, 2);
+            v->cbpcy_vlc = &ff_vc1_cbpcy_p_vlc[v->cbptab];
         }
 
         if (v->dquant) {
@@ -1286,12 +1300,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0;
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0;
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -1317,11 +1331,10 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         vop_dquant_decoding(v);
     }
 
-    v->bi_type = 0;
-    if (v->s.pict_type == AV_PICTURE_TYPE_BI) {
+    v->bi_type = (v->s.pict_type == AV_PICTURE_TYPE_BI);
+    if (v->bi_type)
         v->s.pict_type = AV_PICTURE_TYPE_B;
-        v->bi_type = 1;
-    }
+
     return 0;
 }
 
diff --git a/libavcodec/vc1.h b/libavcodec/vc1.h
index 5087b7c..69f6ca9 100644
--- a/libavcodec/vc1.h
+++ b/libavcodec/vc1.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -151,6 +151,21 @@ enum FrameCodingMode {
     ILACE_FIELD         ///<  in the bitstream is reported as 11b
 };
 
+/**
+ * Imode types
+ * @{
+ */
+enum Imode {
+    IMODE_RAW,
+    IMODE_NORM2,
+    IMODE_DIFF2,
+    IMODE_NORM6,
+    IMODE_DIFF6,
+    IMODE_ROWSKIP,
+    IMODE_COLSKIP
+};
+/** @} */ //imode defines
+
 /** The VC1 Context
  * @todo Change size wherever another size is more efficient
  * Many members are only used for Advanced Profile
@@ -201,8 +216,9 @@ typedef struct VC1Context{
      */
     //@{
     int profile;          ///< 2 bits, Profile
-    int frmrtq_postproc;  ///< 3 bits
+    int frmrtq_postproc;  ///< 3 bits,
     int bitrtq_postproc;  ///< 5 bits, quantized framerate-based postprocessing strength
+    int max_coded_width, max_coded_height;
     int fastuvmc;         ///< Rounding of qpel vector to hpel ? (not in Simple)
     int extended_mv;      ///< Ext MV in P/B (not in Simple)
     int dquant;           ///< How qscale varies with MBs, 2 bits (not in Simple)
@@ -278,8 +294,9 @@ typedef struct VC1Context{
     uint8_t  aux_luty[2][256],  aux_lutuv[2][256];  ///< lookup tables used for intensity compensation
     uint8_t next_luty[2][256], next_lutuv[2][256];  ///< lookup tables used for intensity compensation
     uint8_t (*curr_luty)[256]  ,(*curr_lutuv)[256];
-    int last_use_ic, curr_use_ic, next_use_ic, aux_use_ic;
+    int last_use_ic, *curr_use_ic, next_use_ic, aux_use_ic;
     int rnd;                        ///< rounding control
+    int cbptab;
 
     /** Frame decoding info for S/M profiles only */
     //@{
@@ -329,7 +346,7 @@ typedef struct VC1Context{
     uint8_t fourmvbp;
     uint8_t* fieldtx_plane;
     int fieldtx_is_raw;
-    int8_t zzi_8x8[64];
+    uint8_t zzi_8x8[64];
     uint8_t *blk_mv_type_base, *blk_mv_type;    ///< 0: frame MV, 1: field MV (interlaced frame)
     uint8_t *mv_f_base, *mv_f[2];               ///< 0: MV obtained from same field, 1: opposite field
     uint8_t *mv_f_next_base, *mv_f_next[2];
@@ -351,6 +368,11 @@ typedef struct VC1Context{
     int frfd, brfd;         ///< reference frame distance (forward or backward)
     int first_pic_header_flag;
     int pic_header_flag;
+    int mbmodetab;
+    int icbptab;
+    int imvtab;
+    int twomvbptab;
+    int fourmvbptab;
 
     /** Frame decoding info for sprite modes */
     //@{
@@ -400,10 +422,12 @@ void ff_vc1_init_transposed_scantables(VC1Context *v);
 int  ff_vc1_decode_end(AVCodecContext *avctx);
 void ff_vc1_decode_blocks(VC1Context *v);
 
-void ff_vc1_loop_filter_iblk(VC1Context *v, int pq);
-void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq);
-void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v);
-void ff_vc1_apply_p_loop_filter(VC1Context *v);
+void ff_vc1_i_overlap_filter(VC1Context *v);
+void ff_vc1_p_overlap_filter(VC1Context *v);
+void ff_vc1_i_loop_filter(VC1Context *v);
+void ff_vc1_p_loop_filter(VC1Context *v);
+void ff_vc1_p_intfr_loop_filter(VC1Context *v);
+void ff_vc1_b_intfi_loop_filter(VC1Context *v);
 
 void ff_vc1_mc_1mv(VC1Context *v, int dir);
 void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg);
diff --git a/libavcodec/vc1_block.c b/libavcodec/vc1_block.c
index d8f45f9..86320db 100644
--- a/libavcodec/vc1_block.c
+++ b/libavcodec/vc1_block.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "msmpeg4data.h"
-#include "unary_legacy.h"
+#include "unary.h"
 #include "vc1.h"
 #include "vc1_pred.h"
 #include "vc1acdata.h"
@@ -40,8 +40,13 @@
 #define DC_VLC_BITS 9
 
 // offset tables for interlaced picture MVDATA decoding
-static const int offset_table1[9] = {  0,  1,  2,  4,  8, 16, 32,  64, 128 };
-static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
+static const uint8_t offset_table[2][9] = {
+    {  0,  1,  2,  4,  8, 16, 32,  64, 128 },
+    {  0,  1,  3,  7, 15, 31, 63, 127, 255 },
+};
+
+// mapping table for internal block representation
+static const int block_map[6] = {0, 2, 1, 3, 4, 5};
 
 /***********************************************************************/
 /**
@@ -50,22 +55,8 @@ static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
  * @{
  */
 
-/**
- * Imode types
- * @{
- */
-enum Imode {
-    IMODE_RAW,
-    IMODE_NORM2,
-    IMODE_DIFF2,
-    IMODE_NORM6,
-    IMODE_DIFF6,
-    IMODE_ROWSKIP,
-    IMODE_COLSKIP
-};
-/** @} */ //imode defines
 
-static void init_block_index(VC1Context *v)
+static inline void init_block_index(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     ff_init_block_index(s);
@@ -78,72 +69,97 @@ static void init_block_index(VC1Context *v)
 
 /** @} */ //Bitplane group
 
-static void vc1_put_signed_blocks_clamped(VC1Context *v)
+static void vc1_put_blocks_clamped(VC1Context *v, int put_signed)
 {
     MpegEncContext *s = &v->s;
-    int topleft_mb_pos, top_mb_pos;
-    int stride_y, fieldtx = 0;
-    int v_dist;
-
-    /* The put pixels loop is always one MB row behind the decoding loop,
-     * because we can only put pixels when overlap filtering is done, and
-     * for filtering of the bottom edge of a MB, we need the next MB row
-     * present as well.
-     * Within the row, the put pixels loop is also one MB col behind the
-     * decoding loop. The reason for this is again, because for filtering
-     * of the right MB edge, we need the next MB present. */
-    if (!s->first_slice_line) {
+    uint8_t *dest;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    int fieldtx = 0;
+    int i;
+
+    /* The put pixels loop is one MB row and one MB column behind the decoding
+     * loop because we can only put pixels when overlap filtering is done. For
+     * interlaced frame pictures, however, the put pixels loop is only one
+     * column behind the decoding loop as interlaced frame pictures only need
+     * horizontal overlap filtering. */
+    if (!s->first_slice_line && v->fcm != ILACE_FRAME) {
+        if (s->mb_x) {
+            for (i = 0; i < block_count; i++) {
+                if (i > 3 ? v->mb_type[0][s->block_index[i] - s->block_wrap[i] - 1] :
+                            v->mb_type[0][s->block_index[i] - 2 * s->block_wrap[i] - 2]) {
+                    dest = s->dest[0] + ((i & 2) - 4) * 4 * s->linesize + ((i & 1) - 2) * 8;
+                    if (put_signed)
+                        s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][block_map[i]],
+                                                          i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest,
+                                                          i > 3 ? s->uvlinesize : s->linesize);
+                    else
+                        s->idsp.put_pixels_clamped(v->block[v->topleft_blk_idx][block_map[i]],
+                                                   i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest,
+                                                   i > 3 ? s->uvlinesize : s->linesize);
+                }
+            }
+        }
+        if (s->mb_x == v->end_mb_x - 1) {
+            for (i = 0; i < block_count; i++) {
+                if (i > 3 ? v->mb_type[0][s->block_index[i] - s->block_wrap[i]] :
+                            v->mb_type[0][s->block_index[i] - 2 * s->block_wrap[i]]) {
+                    dest = s->dest[0] + ((i & 2) - 4) * 4 * s->linesize + (i & 1) * 8;
+                    if (put_signed)
+                        s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][block_map[i]],
+                                                          i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest,
+                                                          i > 3 ? s->uvlinesize : s->linesize);
+                    else
+                        s->idsp.put_pixels_clamped(v->block[v->top_blk_idx][block_map[i]],
+                                                   i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest,
+                                                   i > 3 ? s->uvlinesize : s->linesize);
+                }
+            }
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1 || v->fcm == ILACE_FRAME) {
         if (s->mb_x) {
-            topleft_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x - 1;
             if (v->fcm == ILACE_FRAME)
-                fieldtx = v->fieldtx_plane[topleft_mb_pos];
-            stride_y       = s->linesize << fieldtx;
-            v_dist         = (16 - fieldtx) >> (fieldtx == 0);
-            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
-                                              s->dest[0] - 16 * s->linesize - 16,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1],
-                                              s->dest[0] - 16 * s->linesize - 8,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2],
-                                              s->dest[0] - v_dist * s->linesize - 16,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
-                                              s->dest[0] - v_dist * s->linesize - 8,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
-                                              s->dest[1] - 8 * s->uvlinesize - 8,
-                                              s->uvlinesize);
-            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
-                                              s->dest[2] - 8 * s->uvlinesize - 8,
-                                              s->uvlinesize);
+                fieldtx = v->fieldtx_plane[s->mb_y * s->mb_stride + s->mb_x - 1];
+            for (i = 0; i < block_count; i++) {
+                if (i > 3 ? v->mb_type[0][s->block_index[i] - 1] :
+                            v->mb_type[0][s->block_index[i] - 2]) {
+                    if (fieldtx)
+                        dest = s->dest[0] + ((i & 2) >> 1) * s->linesize + ((i & 1) - 2) * 8;
+                    else
+                        dest = s->dest[0] + (i & 2) * 4 * s->linesize + ((i & 1) - 2) * 8;
+                    if (put_signed)
+                        s->idsp.put_signed_pixels_clamped(v->block[v->left_blk_idx][block_map[i]],
+                                                          i > 3 ? s->dest[i - 3] - 8 : dest,
+                                                          i > 3 ? s->uvlinesize : s->linesize << fieldtx);
+                    else
+                        s->idsp.put_pixels_clamped(v->block[v->left_blk_idx][block_map[i]],
+                                                   i > 3 ? s->dest[i - 3] - 8 : dest,
+                                                   i > 3 ? s->uvlinesize : s->linesize << fieldtx);
+                }
+            }
         }
-        if (s->mb_x == s->mb_width - 1) {
-            top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x;
+        if (s->mb_x == v->end_mb_x - 1) {
             if (v->fcm == ILACE_FRAME)
-                fieldtx = v->fieldtx_plane[top_mb_pos];
-            stride_y   = s->linesize << fieldtx;
-            v_dist     = fieldtx ? 15 : 8;
-            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
-                                              s->dest[0] - 16 * s->linesize,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1],
-                                              s->dest[0] - 16 * s->linesize + 8,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2],
-                                              s->dest[0] - v_dist * s->linesize,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
-                                              s->dest[0] - v_dist * s->linesize + 8,
-                                              stride_y);
-            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
-                                              s->dest[1] - 8 * s->uvlinesize,
-                                              s->uvlinesize);
-            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
-                                              s->dest[2] - 8 * s->uvlinesize,
-                                              s->uvlinesize);
+                fieldtx = v->fieldtx_plane[s->mb_y * s->mb_stride + s->mb_x];
+            for (i = 0; i < block_count; i++) {
+                if (v->mb_type[0][s->block_index[i]]) {
+                    if (fieldtx)
+                        dest = s->dest[0] + ((i & 2) >> 1) * s->linesize + (i & 1) * 8;
+                    else
+                        dest = s->dest[0] + (i & 2) * 4 * s->linesize + (i & 1) * 8;
+                    if (put_signed)
+                        s->idsp.put_signed_pixels_clamped(v->block[v->cur_blk_idx][block_map[i]],
+                                                          i > 3 ? s->dest[i - 3] : dest,
+                                                          i > 3 ? s->uvlinesize : s->linesize << fieldtx);
+                    else
+                        s->idsp.put_pixels_clamped(v->block[v->cur_blk_idx][block_map[i]],
+                                                   i > 3 ? s->dest[i - 3] : dest,
+                                                   i > 3 ? s->uvlinesize : s->linesize << fieldtx);
+                }
+            }
         }
     }
+}
 
 #define inc_blk_idx(idx) do { \
         idx++; \
@@ -151,12 +167,6 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
             idx = 0; \
     } while (0)
 
-    inc_blk_idx(v->topleft_blk_idx);
-    inc_blk_idx(v->top_blk_idx);
-    inc_blk_idx(v->left_blk_idx);
-    inc_blk_idx(v->cur_blk_idx);
-}
-
 /***********************************************************************/
 /**
  * @name VC-1 Block-level functions
@@ -173,13 +183,13 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         int edges = 0;                                         \
         if (v->dqprofile == DQPROFILE_ALL_MBS) {               \
             if (v->dqbilevel) {                                \
-                mquant = (get_bits1(gb)) ? v->altpq : v->pq;   \
+                mquant = (get_bits1(gb)) ? -v->altpq : v->pq;  \
             } else {                                           \
                 mqdiff = get_bits(gb, 3);                      \
                 if (mqdiff != 7)                               \
-                    mquant = v->pq + mqdiff;                   \
+                    mquant = -v->pq - mqdiff;                  \
                 else                                           \
-                    mquant = get_bits(gb, 5);                  \
+                    mquant = -get_bits(gb, 5);                 \
             }                                                  \
         }                                                      \
         if (v->dqprofile == DQPROFILE_SINGLE_EDGE)             \
@@ -189,14 +199,15 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         else if (v->dqprofile == DQPROFILE_FOUR_EDGES)         \
             edges = 15;                                        \
         if ((edges&1) && !s->mb_x)                             \
-            mquant = v->altpq;                                 \
-        if ((edges&2) && s->first_slice_line)                  \
-            mquant = v->altpq;                                 \
+            mquant = -v->altpq;                                \
+        if ((edges&2) && !s->mb_y)                             \
+            mquant = -v->altpq;                                \
         if ((edges&4) && s->mb_x == (s->mb_width - 1))         \
-            mquant = v->altpq;                                 \
-        if ((edges&8) && s->mb_y == (s->mb_height - 1))        \
-            mquant = v->altpq;                                 \
-        if (!mquant || mquant > 31) {                          \
+            mquant = -v->altpq;                                \
+        if ((edges&8) &&                                       \
+            s->mb_y == ((s->mb_height >> v->field_mode) - 1))  \
+            mquant = -v->altpq;                                \
+        if (!mquant || mquant > 31 || mquant < -31) {                          \
             av_log(v->s.avctx, AV_LOG_ERROR,                   \
                    "Overriding invalid mquant %d\n", mquant);  \
             mquant = 1;                                        \
@@ -230,33 +241,32 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         s->mb_intra = 1;                                                \
     } else {                                                            \
         index1 = index % 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val&1);                                             \
-        _dmv_x = (sign ^ ((val>>1) + offset_table[index1])) - sign;     \
+        _dmv_x = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_x = (sign ^ ((val >> 1) + _dmv_x)) - sign;             \
+        }                                                               \
                                                                         \
         index1 = index / 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val & 1);                                           \
-        _dmv_y = (sign ^ ((val >> 1) + offset_table[index1])) - sign;   \
+        _dmv_y = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_y = (sign ^ ((val >> 1) + _dmv_y)) - sign;             \
+        }                                                               \
     }
 
 static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
                                                    int *dmv_y, int *pred_flag)
 {
     int index, index1;
-    int extend_x = 0, extend_y = 0;
+    int extend_x, extend_y;
     GetBitContext *gb = &v->s.gb;
     int bits, esc;
     int val, sign;
-    const int* offs_tab;
 
     if (v->numref) {
         bits = VC1_2REF_MVDATA_VLC_BITS;
@@ -265,51 +275,32 @@ static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
         bits = VC1_1REF_MVDATA_VLC_BITS;
         esc  = 71;
     }
-    switch (v->dmvrange) {
-    case 1:
-        extend_x = 1;
-        break;
-    case 2:
-        extend_y = 1;
-        break;
-    case 3:
-        extend_x = extend_y = 1;
-        break;
-    }
+    extend_x = v->dmvrange & 1;
+    extend_y = (v->dmvrange >> 1) & 1;
     index = get_vlc2(gb, v->imv_vlc->table, bits, 3);
     if (index == esc) {
         *dmv_x = get_bits(gb, v->k_x);
         *dmv_y = get_bits(gb, v->k_y);
         if (v->numref) {
-            if (pred_flag) {
+            if (pred_flag)
                 *pred_flag = *dmv_y & 1;
-                *dmv_y     = (*dmv_y + *pred_flag) >> 1;
-            } else {
-                *dmv_y     = (*dmv_y + (*dmv_y & 1)) >> 1;
-            }
+            *dmv_y = (*dmv_y + (*dmv_y & 1)) >> 1;
         }
     }
     else {
-        if (extend_x)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
+        av_assert0(index < esc);
         index1 = (index + 1) % 9;
         if (index1 != 0) {
             val    = get_bits(gb, index1 + extend_x);
-            sign   = 0 -(val & 1);
-            *dmv_x = (sign ^ ((val >> 1) + offs_tab[index1])) - sign;
+            sign   = 0 - (val & 1);
+            *dmv_x = (sign ^ ((val >> 1) + offset_table[extend_x][index1])) - sign;
         } else
             *dmv_x = 0;
-        if (extend_y)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
         index1 = (index + 1) / 9;
         if (index1 > v->numref) {
-            val    = get_bits(gb, (index1 + (extend_y << v->numref)) >> v->numref);
+            val    = get_bits(gb, (index1 >> v->numref) + extend_y);
             sign   = 0 - (val & 1);
-            *dmv_y = (sign ^ ((val >> 1) + offs_tab[index1 >> v->numref])) - sign;
+            *dmv_y = (sign ^ ((val >> 1) + offset_table[extend_y][index1 >> v->numref])) - sign;
         } else
             *dmv_y = 0;
         if (v->numref && pred_flag)
@@ -420,6 +411,12 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     int q1, q2 = 0;
     int dqscale_index;
 
+    /* scale predictors if needed */
+    q1 = FFABS(s->current_picture.qscale_table[mb_pos]);
+    dqscale_index = s->y_dc_scale_table[q1] - 1;
+    if (dqscale_index < 0)
+        return 0;
+
     wrap = s->block_wrap[n];
     dc_val = s->dc_val[0] + s->block_index[n];
 
@@ -429,18 +426,14 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     c = dc_val[ - 1];
     b = dc_val[ - 1 - wrap];
     a = dc_val[ - wrap];
-    /* scale predictors if needed */
-    q1 = s->current_picture.qscale_table[mb_pos];
-    dqscale_index = s->y_dc_scale_table[q1] - 1;
-    if (dqscale_index < 0)
-        return 0;
+
     if (c_avail && (n != 1 && n != 3)) {
-        q2 = s->current_picture.qscale_table[mb_pos - 1];
+        q2 = FFABS(s->current_picture.qscale_table[mb_pos - 1]);
         if (q2 && q2 != q1)
             c = (c * s->y_dc_scale_table[q2] * ff_vc1_dqscale[dqscale_index] + 0x20000) >> 18;
     }
     if (a_avail && (n != 2 && n != 3)) {
-        q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
+        q2 = FFABS(s->current_picture.qscale_table[mb_pos - s->mb_stride]);
         if (q2 && q2 != q1)
             a = (a * s->y_dc_scale_table[q2] * ff_vc1_dqscale[dqscale_index] + 0x20000) >> 18;
     }
@@ -450,25 +443,17 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
             off--;
         if (n != 2)
             off -= s->mb_stride;
-        q2 = s->current_picture.qscale_table[off];
+        q2 = FFABS(s->current_picture.qscale_table[off]);
         if (q2 && q2 != q1)
             b = (b * s->y_dc_scale_table[q2] * ff_vc1_dqscale[dqscale_index] + 0x20000) >> 18;
     }
 
-    if (a_avail && c_avail) {
-        if (abs(a - b) <= abs(b - c)) {
-            pred     = c;
-            *dir_ptr = 1; // left
-        } else {
-            pred     = a;
-            *dir_ptr = 0; // top
-        }
+    if (c_avail && (!a_avail || abs(a - b) <= abs(b - c))) {
+        pred     = c;
+        *dir_ptr = 1; // left
     } else if (a_avail) {
         pred     = a;
         *dir_ptr = 0; // top
-    } else if (c_avail) {
-        pred     = c;
-        *dir_ptr = 1; // left
     } else {
         pred     = 0;
         *dir_ptr = 1; // left
@@ -527,17 +512,16 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                                 int *value, int codingset)
 {
     GetBitContext *gb = &v->s.gb;
-    int index, escape, run = 0, level = 0, lst = 0;
+    int index, run, level, lst, sign;
 
     index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
     if (index != ff_vc1_ac_sizes[codingset] - 1) {
         run   = vc1_index_decode_table[codingset][index][0];
         level = vc1_index_decode_table[codingset][index][1];
         lst   = index >= vc1_last_decode_table[codingset] || get_bits_left(gb) < 0;
-        if (get_bits1(gb))
-            level = -level;
+        sign  = get_bits1(gb);
     } else {
-        escape = decode210(gb);
+        int escape = decode210(gb);
         if (escape != 2) {
             index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
             run   = vc1_index_decode_table[codingset][index][0];
@@ -554,10 +538,8 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                 else
                     run += vc1_delta_run_table[codingset][level] + 1;
             }
-            if (get_bits1(gb))
-                level = -level;
+            sign = get_bits1(gb);
         } else {
-            int sign;
             lst = get_bits1(gb);
             if (v->s.esc3_level_length == 0) {
                 if (v->pq < 8 || v->dquantfrm) { // table 59
@@ -572,14 +554,12 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
             run   = get_bits(gb, v->s.esc3_run_length);
             sign  = get_bits1(gb);
             level = get_bits(gb, v->s.esc3_level_length);
-            if (sign)
-                level = -level;
         }
     }
 
     *last  = lst;
     *skip  = run;
-    *value = level;
+    *value = (level ^ -sign) + sign;
 }
 
 /** Decode intra block in intra frames - should be faster than decode_intra_block
@@ -598,7 +578,7 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     int i;
     int16_t *dc_val;
     int16_t *ac_val, *ac_val2;
-    int dcdiff;
+    int dcdiff, scale;
 
     /* Get DC differential */
     if (n < 4) {
@@ -611,16 +591,12 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (v->pq == 1 || v->pq == 2) ? 3 - v->pq : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (v->pq == 1)      dcdiff = get_bits(gb, 10);
-            else if (v->pq == 2) dcdiff = get_bits(gb, 9);
-            else                 dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (v->pq == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (v->pq == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -631,27 +607,29 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-    /* Skip ? */
-    if (!coded) {
-        goto not_coded;
-    }
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
-    // AC Decoding
-    i = 1;
+    ac_val  = s->ac_val[0][s->block_index[n]];
+    ac_val2 = ac_val;
+    if (dc_pred_dir) // left
+        ac_val -= 16;
+    else // top
+        ac_val -= 16 * s->block_wrap[n];
+
+    scale = v->pq * 2 + v->halfpq;
 
-    {
+    //AC Decoding
+    i = !!coded;
+
+    if (coded) {
         int last = 0, skip, value;
         const uint8_t *zz_table;
-        int scale;
         int k;
 
-        scale = v->pq * 2 + v->halfpq;
-
         if (v->s.ac_pred) {
             if (!dc_pred_dir)
                 zz_table = v->zz_8x8[2];
@@ -660,13 +638,6 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         } else
             zz_table = v->zz_8x8[1];
 
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
-        if (dc_pred_dir) // left
-            ac_val -= 16;
-        else // top
-            ac_val -= 16 * s->block_wrap[n];
-
         while (!last) {
             vc1_decode_ac_coeff(v, &last, &skip, &value, codingset);
             i += skip;
@@ -677,13 +648,15 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++)
-                    block[k << v->left_blk_sh] += ac_val[k];
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++)
-                    block[k << v->top_blk_sh] += ac_val[k + 8];
+                sh = v->top_blk_sh;
+                ac_val += 8;
             }
+            for (k = 1; k < 8; k++)
+                block[k << sh] += ac_val[k];
         }
         /* save AC coeffs for further prediction */
         for (k = 1; k < 8; k++) {
@@ -699,46 +672,30 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -v->pq : v->pq;
             }
 
-        if (s->ac_pred) i = 63;
-    }
-
-not_coded:
-    if (!coded) {
-        int k, scale;
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
+    } else {
+        int k;
 
-        i = 0;
-        scale = v->pq * 2 + v->halfpq;
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            ac_val -= 16;
-            if (s->ac_pred)
-                memcpy(ac_val2, ac_val, 8 * 2);
-        } else { // top
-            ac_val -= 16 * s->block_wrap[n];
-            if (s->ac_pred)
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-        }
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { //left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -v->pq : v->pq;
             }
-            i = 63;
         }
     }
+    if (s->ac_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -759,7 +716,7 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int a_avail = v->a_avail, c_avail = v->c_avail;
@@ -767,6 +724,7 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     int scale;
     int q1, q2 = 0;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    int quant = FFABS(mquant);
 
     /* Get DC differential */
     if (n < 4) {
@@ -779,59 +737,58 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (quant == 1 || quant == 2) ? 3 - quant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
     }
 
     /* Prediction */
-    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, mquant, n, v->a_avail, v->c_avail, &dc_val, &dc_pred_dir);
+    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, quant, n, v->a_avail, v->c_avail, &dc_val, &dc_pred_dir);
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-
-    //AC Decoding
-    i = 1;
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
     /* check if AC is needed at all */
     if (!a_avail && !c_avail)
         use_pred = 0;
-    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-    ac_val2 = ac_val;
 
-    scale = mquant * 2 + ((mquant == v->pq) ? v->halfpq : 0);
+    scale = quant * 2 + ((mquant < 0) ? 0 : v->halfpq);
 
+    ac_val  = s->ac_val[0][s->block_index[n]];
+    ac_val2 = ac_val;
     if (dc_pred_dir) // left
         ac_val -= 16;
     else // top
         ac_val -= 16 * s->block_wrap[n];
 
     q1 = s->current_picture.qscale_table[mb_pos];
-    if (dc_pred_dir && c_avail && mb_pos)
-        q2 = s->current_picture.qscale_table[mb_pos - 1];
-    if (!dc_pred_dir && a_avail && mb_pos >= s->mb_stride)
-        q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
-    if (dc_pred_dir && n == 1)
-        q2 = q1;
-    if (!dc_pred_dir && n == 2)
-        q2 = q1;
     if (n == 3)
         q2 = q1;
+    else if (dc_pred_dir) {
+        if (n == 1)
+            q2 = q1;
+        else if (c_avail && mb_pos)
+            q2 = s->current_picture.qscale_table[mb_pos - 1];
+    } else {
+        if (n == 2)
+            q2 = q1;
+        else if (a_avail && mb_pos >= s->mb_stride)
+            q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
+    }
+
+    //AC Decoding
+    i = 1;
 
     if (coded) {
         int last = 0, skip, value;
@@ -864,28 +821,25 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
+            if (dc_pred_dir) { // left
+                sh = v->left_blk_sh;
+            } else { // top
+                sh = v->top_blk_sh;
+                ac_val += 8;
+            }
             /* scale predictors if needed*/
+            q1 = FFABS(q1) * 2 + ((q1 < 0) ? 0 : v->halfpq) - 1;
+            if (q1 < 1)
+                return AVERROR_INVALIDDATA;
+            if (q2)
+                q2 = FFABS(q2) * 2 + ((q2 < 0) ? 0 : v->halfpq) - 1;
             if (q2 && q1 != q2) {
-                q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-
-                if (q1 < 1)
-                    return AVERROR_INVALIDDATA;
-                if (dc_pred_dir) { // left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                } else { // top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += (ac_val[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
             } else {
-                if (dc_pred_dir) { //left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += ac_val[k];
-                } else { //top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += ac_val[k + 8];
-                }
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += ac_val[k];
             }
         }
         /* save AC coeffs for further prediction */
@@ -899,58 +853,42 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
             if (block[k]) {
                 block[k] *= scale;
                 if (!v->pquantizer)
-                    block[k] += (block[k] < 0) ? -mquant : mquant;
+                    block[k] += (block[k] < 0) ? -quant : quant;
             }
 
-        if (use_pred) i = 63;
     } else { // no AC coeffs
         int k;
 
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            if (use_pred) {
-                memcpy(ac_val2, ac_val, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        } else { // top
-            if (use_pred) {
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k + 8] = (ac_val2[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        }
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val2[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val2[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            q1 = FFABS(q1) * 2 + ((q1 < 0) ? 0 : v->halfpq) - 1;
+            if (q1 < 1)
+                return AVERROR_INVALIDDATA;
+            if (q2)
+                q2 = FFABS(q2) * 2 + ((q2 < 0) ? 0 : v->halfpq) - 1;
+            if (q2 && q1 != q2) {
+                for (k = 1; k < 8; k++)
+                    ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
+            }
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val2[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -quant : quant;
             }
-            i = 63;
         }
     }
+    if (use_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -971,7 +909,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
@@ -979,15 +917,16 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     int use_pred = s->ac_pred;
     int scale;
     int q1, q2 = 0;
+    int quant = FFABS(mquant);
 
     s->bdsp.clear_block(block);
 
     /* XXX: Guard against dumb values of mquant */
-    mquant = (mquant < 1) ? 0 : ((mquant > 31) ? 31 : mquant);
+    quant = av_clip_uintp2(quant, 5);
 
     /* Set DC scale - y and c use the same */
-    s->y_dc_scale = s->y_dc_scale_table[mquant];
-    s->c_dc_scale = s->c_dc_scale_table[mquant];
+    s->y_dc_scale = s->y_dc_scale_table[quant];
+    s->c_dc_scale = s->c_dc_scale_table[quant];
 
     /* Get DC differential */
     if (n < 4) {
@@ -1000,23 +939,19 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (quant == 1 || quant == 2) ? 3 - quant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
     }
 
     /* Prediction */
-    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, mquant, n, a_avail, c_avail, &dc_val, &dc_pred_dir);
+    dcdiff += ff_vc1_pred_dc(&v->s, v->overlap, quant, n, a_avail, c_avail, &dc_val, &dc_pred_dir);
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
@@ -1034,10 +969,10 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     if (!a_avail) dc_pred_dir = 1;
     if (!c_avail) dc_pred_dir = 0;
     if (!a_avail && !c_avail) use_pred = 0;
-    ac_val = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val = s->ac_val[0][s->block_index[n]];
     ac_val2 = ac_val;
 
-    scale = mquant * 2 + v->halfpq;
+    scale = quant * 2 + ((mquant < 0) ? 0 : v->halfpq);
 
     if (dc_pred_dir) //left
         ac_val -= 16;
@@ -1081,12 +1016,12 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
         /* apply AC prediction if needed */
         if (use_pred) {
             /* scale predictors if needed*/
+            q1 = FFABS(q1) * 2 + ((q1 < 0) ? 0 : v->halfpq) - 1;
+            if (q1 < 1)
+                return AVERROR_INVALIDDATA;
+            if (q2)
+                q2 = FFABS(q2) * 2 + ((q2 < 0) ? 0 : v->halfpq) - 1;
             if (q2 && q1 != q2) {
-                q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-
-                if (q1 < 1)
-                    return AVERROR_INVALIDDATA;
                 if (dc_pred_dir) { // left
                     for (k = 1; k < 8; k++)
                         block[k << v->left_blk_sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
@@ -1115,7 +1050,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
             if (block[k]) {
                 block[k] *= scale;
                 if (!v->pquantizer)
-                    block[k] += (block[k] < 0) ? -mquant : mquant;
+                    block[k] += (block[k] < 0) ? -quant : quant;
             }
 
         if (use_pred) i = 63;
@@ -1126,11 +1061,12 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
         if (dc_pred_dir) { // left
             if (use_pred) {
                 memcpy(ac_val2, ac_val, 8 * 2);
+                q1 = FFABS(q1) * 2 + ((q1 < 0) ? 0 : v->halfpq) - 1;
+                if (q1 < 1)
+                    return AVERROR_INVALIDDATA;
+                if (q2)
+                    q2 = FFABS(q2) * 2 + ((q2 < 0) ? 0 : v->halfpq) - 1;
                 if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
                     for (k = 1; k < 8; k++)
                         ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
                 }
@@ -1138,11 +1074,12 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
         } else { // top
             if (use_pred) {
                 memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
+                q1 = FFABS(q1) * 2 + ((q1 < 0) ? 0 : v->halfpq) - 1;
+                if (q1 < 1)
+                    return AVERROR_INVALIDDATA;
+                if (q2)
+                    q2 = FFABS(q2) * 2 + ((q2 < 0) ? 0 : v->halfpq) - 1;
                 if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
                     for (k = 1; k < 8; k++)
                         ac_val2[k + 8] = (ac_val2[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
                 }
@@ -1155,13 +1092,13 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
                 for (k = 1; k < 8; k++) {
                     block[k << v->left_blk_sh] = ac_val2[k] * scale;
                     if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -mquant : mquant;
+                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -quant : quant;
                 }
             } else { // top
                 for (k = 1; k < 8; k++) {
                     block[k << v->top_blk_sh] = ac_val2[k + 8] * scale;
                     if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -mquant : mquant;
+                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -quant : quant;
                 }
             }
             i = 63;
@@ -1186,6 +1123,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
     int scale, off, idx, last, skip, value;
     int ttblk = ttmb & 7;
     int pat = 0;
+    int quant = FFABS(mquant);
 
     s->bdsp.clear_block(block);
 
@@ -1206,7 +1144,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
         if (ttblk == TT_4X8_RIGHT || ttblk == TT_4X8_LEFT)
             ttblk = TT_4X8;
     }
-    scale = 2 * mquant + ((v->pq == mquant) ? v->halfpq : 0);
+    scale = quant * 2 + ((mquant < 0) ? 0 : v->halfpq);
 
     // convert transforms like 8X4_TOP to generic TT and SUBBLKPAT
     if (ttblk == TT_8X4_TOP || ttblk == TT_8X4_BOTTOM) {
@@ -1233,7 +1171,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                 idx = v->zzi_8x8[i++];
             block[idx] = value * scale;
             if (!v->pquantizer)
-                block[idx] += (block[idx] < 0) ? -mquant : mquant;
+                block[idx] += (block[idx] < 0) ? -quant : quant;
         }
         if (!skip_block) {
             if (i == 1)
@@ -1261,7 +1199,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                     idx = ff_vc1_adv_interlaced_4x4_zz[i++];
                 block[idx + off] = value * scale;
                 if (!v->pquantizer)
-                    block[idx + off] += (block[idx + off] < 0) ? -mquant : mquant;
+                    block[idx + off] += (block[idx + off] < 0) ? -quant : quant;
             }
             if (!(subblkpat & (1 << (3 - j))) && !skip_block) {
                 if (i == 1)
@@ -1288,7 +1226,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                     idx = ff_vc1_adv_interlaced_8x4_zz[i++] + off;
                 block[idx] = value * scale;
                 if (!v->pquantizer)
-                    block[idx] += (block[idx] < 0) ? -mquant : mquant;
+                    block[idx] += (block[idx] < 0) ? -quant : quant;
             }
             if (!(subblkpat & (1 << (1 - j))) && !skip_block) {
                 if (i == 1)
@@ -1315,7 +1253,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                     idx = ff_vc1_adv_interlaced_4x8_zz[i++] + off;
                 block[idx] = value * scale;
                 if (!v->pquantizer)
-                    block[idx] += (block[idx] < 0) ? -mquant : mquant;
+                    block[idx] += (block[idx] < 0) ? -quant : quant;
             }
             if (!(subblkpat & (1 << (1 - j))) && !skip_block) {
                 if (i == 1)
@@ -1333,8 +1271,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
 
 /** @} */ // Macroblock group
 
-static const int size_table  [6] = { 0, 2, 3, 4,  5,  8 };
-static const int offset_table[6] = { 0, 1, 3, 7, 15, 31 };
+static const uint8_t size_table[6] = { 0, 2, 3, 4,  5,  8 };
 
 /** Decode one P-frame MB
  */
@@ -1414,30 +1351,20 @@ static int vc1_decode_p_mb(VC1Context *v)
                     if (i == 1 || i == 3 || s->mb_x)
                         v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-                    vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                    vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
                                            (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                         continue;
-                    v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
+                    v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
-                            s->block[i][j] <<= 1;
-                    s->idsp.put_signed_pixels_clamped(s->block[i],
-                                                      s->dest[dst_idx] + off,
-                                                      i & 4 ? s->uvlinesize
-                                                            : s->linesize);
-                    if (v->pq >= 9 && v->overlap) {
-                        if (v->c_avail)
-                            v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
-                        if (v->a_avail)
-                            v->vc1dsp.vc1_v_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
-                    }
+                            v->block[v->cur_blk_idx][block_map[i]][j] <<= 1;
                     block_cbp   |= 0xF << (i << 2);
                     block_intra |= 1 << i;
                 } else if (val) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block,
+                    pat = vc1_decode_p_block(v, v->block[v->cur_blk_idx][block_map[i]], i, mquant, ttmb, first_block,
                                              s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1525,31 +1452,21 @@ static int vc1_decode_p_mb(VC1Context *v)
                     if (i == 1 || i == 3 || s->mb_x)
                         v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-                    vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant,
+                    vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, is_coded[i], mquant,
                                            (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                         continue;
-                    v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
+                    v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
-                            s->block[i][j] <<= 1;
-                    s->idsp.put_signed_pixels_clamped(s->block[i],
-                                                      s->dest[dst_idx] + off,
-                                                      (i & 4) ? s->uvlinesize
-                                                              : s->linesize);
-                    if (v->pq >= 9 && v->overlap) {
-                        if (v->c_avail)
-                            v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
-                        if (v->a_avail)
-                            v->vc1dsp.vc1_v_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
-                    }
+                            v->block[v->cur_blk_idx][block_map[i]][j] <<= 1;
                     block_cbp   |= 0xF << (i << 2);
                     block_intra |= 1 << i;
                 } else if (is_coded[i]) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                    pat = vc1_decode_p_block(v, v->block[v->cur_blk_idx][block_map[i]], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
                                              &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
@@ -1573,6 +1490,10 @@ static int vc1_decode_p_mb(VC1Context *v)
         }
     }
 end:
+    if (v->overlap && v->pq >= 9)
+        ff_vc1_p_overlap_filter(v);
+    vc1_put_blocks_clamped(v, 1);
+
     v->cbp[s->mb_x]      = block_cbp;
     v->ttblk[s->mb_x]    = block_tt;
     v->is_intra[s->mb_x] = block_intra;
@@ -1600,7 +1521,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
     int skipped, fourmv = 0, twomv = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0, mvbp;
-    int stride_y, fieldtx;
+    int fieldtx;
 
     mquant = v->pq; /* Lossy initialization */
 
@@ -1659,8 +1580,8 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
             GET_MQUANT();
             s->current_picture.qscale_table[mb_pos] = mquant;
             /* Set DC scale - y and c use the same (not sure if necessary here) */
-            s->y_dc_scale = s->y_dc_scale_table[mquant];
-            s->c_dc_scale = s->c_dc_scale_table[mquant];
+            s->y_dc_scale = s->y_dc_scale_table[FFABS(mquant)];
+            s->c_dc_scale = s->c_dc_scale_table[FFABS(mquant)];
             dst_idx = 0;
             for (i = 0; i < 6; i++) {
                 v->a_avail = v->c_avail          = 0;
@@ -1673,22 +1594,16 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                 if (i == 1 || i == 3 || s->mb_x)
                     v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-                vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+                vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
                                        (i & 4) ? v->codingset2 : v->codingset);
-                if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
-                v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
-                if (i < 4) {
-                    stride_y = s->linesize << fieldtx;
+                v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
+                if (i < 4)
                     off = (fieldtx) ? ((i & 1) * 8) + ((i & 2) >> 1) * s->linesize : (i & 1) * 8 + 4 * (i & 2) * s->linesize;
-                } else {
-                    stride_y = s->uvlinesize;
+                else
                     off = 0;
-                }
-                s->idsp.put_signed_pixels_clamped(s->block[i],
-                                                  s->dest[dst_idx] + off,
-                                                  stride_y);
-                //TODO: loop filter
+                block_cbp |= 0xf << (i << 2);
             }
 
         } else { // inter MB
@@ -1711,19 +1626,14 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
             dst_idx = 0;
             if (fourmv) {
                 mvbp = v->fourmvbp;
-                for (i = 0; i < 6; i++) {
-                    if (i < 4) {
-                        dmv_x = dmv_y = 0;
-                        val   = ((mvbp >> (3 - i)) & 1);
-                        if (val) {
-                            get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                        }
-                        ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
-                        ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                    } else if (i == 4) {
-                        ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
-                    }
+                for (i = 0; i < 4; i++) {
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & (8 >> i))
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
+                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
                 }
+                ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
             } else if (twomv) {
                 mvbp  = v->twomvbp;
                 dmv_x = dmv_y = 0;
@@ -1764,10 +1674,10 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                 else
                     off = (i & 4) ? 0 : ((i & 1) * 8 + ((i > 1) * s->linesize));
                 if (val) {
-                    pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                    pat = vc1_decode_p_block(v, v->block[v->cur_blk_idx][block_map[i]], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1789,9 +1699,15 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
         v->blk_mv_type[s->block_index[3]] = 0;
         ff_vc1_pred_mv_intfr(v, 0, 0, 0, 1, v->range_x, v->range_y, v->mb_type[0], 0);
         ff_vc1_mc_1mv(v, 0);
+        v->fieldtx_plane[mb_pos] = 0;
     }
-    if (s->mb_x == s->mb_width - 1)
-        memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0])*s->mb_stride);
+    if (v->overlap && v->pq >= 9)
+        ff_vc1_p_overlap_filter(v);
+    vc1_put_blocks_clamped(v, 1);
+
+    v->cbp[s->mb_x]      = block_cbp;
+    v->ttblk[s->mb_x]    = block_tt;
+
     return 0;
 }
 
@@ -1810,7 +1726,7 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
     int val; /* temp values */
     int first_block = 1;
     int dst_idx, off;
-    int pred_flag;
+    int pred_flag = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0;
 
@@ -1826,8 +1742,8 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
         GET_MQUANT();
         s->current_picture.qscale_table[mb_pos] = mquant;
         /* Set DC scale - y and c use the same (not sure if necessary here) */
-        s->y_dc_scale = s->y_dc_scale_table[mquant];
-        s->c_dc_scale = s->c_dc_scale_table[mquant];
+        s->y_dc_scale = s->y_dc_scale_table[FFABS(mquant)];
+        s->c_dc_scale = s->c_dc_scale_table[FFABS(mquant)];
         v->s.ac_pred  = v->acpred_plane[mb_pos] = get_bits1(gb);
         mb_has_coeffs = idx_mbmode & 1;
         if (mb_has_coeffs)
@@ -1844,22 +1760,19 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
             if (i == 1 || i == 3 || s->mb_x)
                 v->c_avail = v->mb_type[0][s->block_index[i] - 1];
 
-            vc1_decode_intra_block(v, s->block[i], i, val, mquant,
+            vc1_decode_intra_block(v, v->block[v->cur_blk_idx][block_map[i]], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
-            v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
+            v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[i]]);
             off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-            s->idsp.put_signed_pixels_clamped(s->block[i],
-                                              s->dest[dst_idx] + off,
-                                              (i & 4) ? s->uvlinesize
-                                                      : s->linesize);
-            // TODO: loop filter
+            block_cbp |= 0xf << (i << 2);
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (idx_mbmode <= 5) { // 1-MV
             dmv_x = dmv_y = pred_flag = 0;
             if (idx_mbmode & 1) {
@@ -1870,18 +1783,14 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
             mb_has_coeffs = !(idx_mbmode & 2);
         } else { // 4-MV
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x = dmv_y = pred_flag = 0;
-                    val   = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
-                    }
-                    ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
-                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, 0);
+            for (i = 0; i < 4; i++) {
+                dmv_x = dmv_y = pred_flag = 0;
+                if (v->fourmvbp & (8 >> i))
+                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
+                ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
+                ff_vc1_mc_4mv_luma(v, i, 0, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, 0);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -1900,19 +1809,25 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
             val = ((cbp >> (5 - i)) & 1);
             off = (i & 4) ? 0 : (i & 1) * 8 + (i & 2) * 4 * s->linesize;
             if (val) {
-                pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                pat = vc1_decode_p_block(v, v->block[v->cur_blk_idx][block_map[i]], i, mquant, ttmb,
                                          first_block, s->dest[dst_idx] + off,
                                          (i & 4) ? s->uvlinesize : s->linesize,
-                                         (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
+                                         CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
                                          &block_tt);
                 block_cbp |= pat << (i << 2);
-                if (!v->ttmbf && ttmb < 8) ttmb = -1;
+                if (!v->ttmbf && ttmb < 8)
+                    ttmb = -1;
                 first_block = 0;
             }
         }
     }
-    if (s->mb_x == s->mb_width - 1)
-        memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0]) * s->mb_stride);
+    if (v->overlap && v->pq >= 9)
+        ff_vc1_p_overlap_filter(v);
+    vc1_put_blocks_clamped(v, 1);
+
+    v->cbp[s->mb_x]      = block_cbp;
+    v->ttblk[s->mb_x]    = block_tt;
+
     return 0;
 }
 
@@ -2049,7 +1964,7 @@ static void vc1_decode_b_mb(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
@@ -2063,7 +1978,7 @@ static void vc1_decode_b_mb(VC1Context *v)
             vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                first_block, s->dest[dst_idx] + off,
                                (i & 4) ? s->uvlinesize : s->linesize,
-                               (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
+                               CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
             if (!v->ttmbf && ttmb < 8)
                 ttmb = -1;
             first_block = 0;
@@ -2089,7 +2004,8 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
     int fwd;
     int dmv_x[2], dmv_y[2], pred_flag[2];
     int bmvtype = BMV_TYPE_BACKWARD;
-    int idx_mbmode, interpmvp;
+    int block_cbp = 0, pat, block_tt = 0;
+    int idx_mbmode;
 
     mquant      = v->pq; /* Lossy initialization */
     s->mb_intra = 0;
@@ -2104,8 +2020,8 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
         GET_MQUANT();
         s->current_picture.qscale_table[mb_pos] = mquant;
         /* Set DC scale - y and c use the same (not sure if necessary here) */
-        s->y_dc_scale = s->y_dc_scale_table[mquant];
-        s->c_dc_scale = s->c_dc_scale_table[mquant];
+        s->y_dc_scale = s->y_dc_scale_table[FFABS(mquant)];
+        s->c_dc_scale = s->c_dc_scale_table[FFABS(mquant)];
         v->s.ac_pred  = v->acpred_plane[mb_pos] = get_bits1(gb);
         mb_has_coeffs = idx_mbmode & 1;
         if (mb_has_coeffs)
@@ -2124,7 +2040,7 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
@@ -2135,17 +2051,18 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                                               s->dest[dst_idx] + off,
                                               (i & 4) ? s->uvlinesize
                                                       : s->linesize);
-            // TODO: yet to perform loop filter
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (v->fmb_is_raw)
             fwd = v->forward_mb_plane[mb_pos] = get_bits1(gb);
         else
             fwd = v->forward_mb_plane[mb_pos];
         if (idx_mbmode <= 5) { // 1-MV
+            int interpmvp = 0;
             dmv_x[0]     = dmv_x[1] = dmv_y[0] = dmv_y[1] = 0;
             pred_flag[0] = pred_flag[1] = 0;
             if (fwd)
@@ -2168,12 +2085,16 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
             if (bmvtype != BMV_TYPE_DIRECT && idx_mbmode & 1) {
                 get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD], &dmv_y[bmvtype == BMV_TYPE_BACKWARD], &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
             }
-            if (bmvtype == BMV_TYPE_INTERPOLATED && interpmvp) {
+            if (interpmvp) {
                 get_mvdata_interlaced(v, &dmv_x[1], &dmv_y[1], &pred_flag[1]);
             }
             if (bmvtype == BMV_TYPE_DIRECT) {
                 dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
                 dmv_x[1] = dmv_y[1] = pred_flag[0] = 0;
+                if (!s->next_picture_ptr->field_picture) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Mixed field/frame direct mode not supported\n");
+                    return;
+                }
             }
             ff_vc1_pred_b_mv_intfi(v, 0, dmv_x, dmv_y, 1, pred_flag);
             vc1_b_mc(v, dmv_x, dmv_y, (bmvtype == BMV_TYPE_DIRECT), bmvtype);
@@ -2183,21 +2104,18 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 bmvtype = BMV_TYPE_FORWARD;
             v->bmvtype  = bmvtype;
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
-                    dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
-                    val = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
-                                                 &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
-                                             &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
-                    }
-                    ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
-                    ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
+            for (i = 0; i < 4; i++) {
+                dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
+                dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
+                if (v->fourmvbp & (8 >> i)) {
+                    get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
+                                             &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
+                                         &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
+                }
+                ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
+                ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -2216,16 +2134,19 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
             val = ((cbp >> (5 - i)) & 1);
             off = (i & 4) ? 0 : (i & 1) * 8 + (i & 2) * 4 * s->linesize;
             if (val) {
-                vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
-                                   first_block, s->dest[dst_idx] + off,
-                                   (i & 4) ? s->uvlinesize : s->linesize,
-                                   (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
+                pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
+                                         first_block, s->dest[dst_idx] + off,
+                                         (i & 4) ? s->uvlinesize : s->linesize,
+                                         CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                block_cbp |= pat << (i << 2);
                 if (!v->ttmbf && ttmb < 8)
                     ttmb = -1;
                 first_block = 0;
             }
         }
     }
+    v->cbp[s->mb_x]      = block_cbp;
+    v->ttblk[s->mb_x]    = block_tt;
 }
 
 /** Decode one B-frame MB (in interlaced frame B picture)
@@ -2275,39 +2196,6 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
         }
     }
 
-    if (v->dmb_is_raw)
-        direct = get_bits1(gb);
-    else
-        direct = v->direct_mb_plane[mb_pos];
-
-    if (direct) {
-        s->mv[0][0][0] = s->current_picture.motion_val[0][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 0, s->quarter_sample);
-        s->mv[0][0][1] = s->current_picture.motion_val[0][s->block_index[0]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][1], v->bfraction, 0, s->quarter_sample);
-        s->mv[1][0][0] = s->current_picture.motion_val[1][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 1, s->quarter_sample);
-        s->mv[1][0][1] = s->current_picture.motion_val[1][s->block_index[0]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][1], v->bfraction, 1, s->quarter_sample);
-
-        if (twomv) {
-            s->mv[0][2][0] = s->current_picture.motion_val[0][s->block_index[2]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][0], v->bfraction, 0, s->quarter_sample);
-            s->mv[0][2][1] = s->current_picture.motion_val[0][s->block_index[2]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][1], v->bfraction, 0, s->quarter_sample);
-            s->mv[1][2][0] = s->current_picture.motion_val[1][s->block_index[2]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][0], v->bfraction, 1, s->quarter_sample);
-            s->mv[1][2][1] = s->current_picture.motion_val[1][s->block_index[2]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][1], v->bfraction, 1, s->quarter_sample);
-
-            for (i = 1; i < 4; i += 2) {
-                s->mv[0][i][0] = s->current_picture.motion_val[0][s->block_index[i]][0] = s->mv[0][i-1][0];
-                s->mv[0][i][1] = s->current_picture.motion_val[0][s->block_index[i]][1] = s->mv[0][i-1][1];
-                s->mv[1][i][0] = s->current_picture.motion_val[1][s->block_index[i]][0] = s->mv[1][i-1][0];
-                s->mv[1][i][1] = s->current_picture.motion_val[1][s->block_index[i]][1] = s->mv[1][i-1][1];
-            }
-        } else {
-            for (i = 1; i < 4; i++) {
-                s->mv[0][i][0] = s->current_picture.motion_val[0][s->block_index[i]][0] = s->mv[0][0][0];
-                s->mv[0][i][1] = s->current_picture.motion_val[0][s->block_index[i]][1] = s->mv[0][0][1];
-                s->mv[1][i][0] = s->current_picture.motion_val[1][s->block_index[i]][0] = s->mv[1][0][0];
-                s->mv[1][i][1] = s->current_picture.motion_val[1][s->block_index[i]][1] = s->mv[1][0][1];
-            }
-        }
-    }
-
     if (ff_vc1_mbmode_intfrp[0][idx_mbmode][0] == MV_PMODE_INTFR_INTRA) { // intra MB
         for (i = 0; i < 4; i++) {
             s->mv[0][i][0] = s->current_picture.motion_val[0][s->block_index[i]][0] = 0;
@@ -2326,8 +2214,8 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
         GET_MQUANT();
         s->current_picture.qscale_table[mb_pos] = mquant;
         /* Set DC scale - y and c use the same (not sure if necessary here) */
-        s->y_dc_scale = s->y_dc_scale_table[mquant];
-        s->c_dc_scale = s->c_dc_scale_table[mquant];
+        s->y_dc_scale = s->y_dc_scale_table[FFABS(mquant)];
+        s->c_dc_scale = s->c_dc_scale_table[FFABS(mquant)];
         dst_idx = 0;
         for (i = 0; i < 6; i++) {
             v->a_avail = v->c_avail          = 0;
@@ -2342,7 +2230,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if (i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (i < 4) {
@@ -2358,6 +2246,42 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
+
+        if (v->dmb_is_raw)
+            direct = get_bits1(gb);
+        else
+            direct = v->direct_mb_plane[mb_pos];
+
+        if (direct) {
+            if (s->next_picture_ptr->field_picture)
+                av_log(s->avctx, AV_LOG_WARNING, "Mixed frame/field direct mode not supported\n");
+            s->mv[0][0][0] = s->current_picture.motion_val[0][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 0, s->quarter_sample);
+            s->mv[0][0][1] = s->current_picture.motion_val[0][s->block_index[0]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][1], v->bfraction, 0, s->quarter_sample);
+            s->mv[1][0][0] = s->current_picture.motion_val[1][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 1, s->quarter_sample);
+            s->mv[1][0][1] = s->current_picture.motion_val[1][s->block_index[0]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][1], v->bfraction, 1, s->quarter_sample);
+
+            if (twomv) {
+                s->mv[0][2][0] = s->current_picture.motion_val[0][s->block_index[2]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][0], v->bfraction, 0, s->quarter_sample);
+                s->mv[0][2][1] = s->current_picture.motion_val[0][s->block_index[2]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][1], v->bfraction, 0, s->quarter_sample);
+                s->mv[1][2][0] = s->current_picture.motion_val[1][s->block_index[2]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][0], v->bfraction, 1, s->quarter_sample);
+                s->mv[1][2][1] = s->current_picture.motion_val[1][s->block_index[2]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[2]][1], v->bfraction, 1, s->quarter_sample);
+
+                for (i = 1; i < 4; i += 2) {
+                    s->mv[0][i][0] = s->current_picture.motion_val[0][s->block_index[i]][0] = s->mv[0][i-1][0];
+                    s->mv[0][i][1] = s->current_picture.motion_val[0][s->block_index[i]][1] = s->mv[0][i-1][1];
+                    s->mv[1][i][0] = s->current_picture.motion_val[1][s->block_index[i]][0] = s->mv[1][i-1][0];
+                    s->mv[1][i][1] = s->current_picture.motion_val[1][s->block_index[i]][1] = s->mv[1][i-1][1];
+                }
+            } else {
+                for (i = 1; i < 4; i++) {
+                    s->mv[0][i][0] = s->current_picture.motion_val[0][s->block_index[i]][0] = s->mv[0][0][0];
+                    s->mv[0][i][1] = s->current_picture.motion_val[0][s->block_index[i]][1] = s->mv[0][0][1];
+                    s->mv[1][i][0] = s->current_picture.motion_val[1][s->block_index[i]][0] = s->mv[1][0][0];
+                    s->mv[1][i][1] = s->current_picture.motion_val[1][s->block_index[i]][1] = s->mv[1][0][1];
+                }
+            }
+        }
+
         if (!direct) {
             if (skipped || !s->mb_intra) {
                 bmvtype = decode012(gb);
@@ -2508,7 +2432,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -2564,12 +2488,12 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
             if (direct || bmvtype == BMV_TYPE_INTERPOLATED) {
                 ff_vc1_interp_mc(v);
             }
+            v->fieldtx_plane[mb_pos] = 0;
         }
     }
-    if (s->mb_x == s->mb_width - 1)
-        memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0]) * s->mb_stride);
     v->cbp[s->mb_x]      = block_cbp;
     v->ttblk[s->mb_x]    = block_tt;
+
     return 0;
 }
 
@@ -2616,30 +2540,27 @@ static void vc1_decode_i_blocks(VC1Context *v)
     s->mb_x = s->mb_y = 0;
     s->mb_intra         = 1;
     s->first_slice_line = 1;
-    for (s->mb_y = 0; s->mb_y < s->end_mb_y; s->mb_y++) {
+    for (s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         init_block_index(v);
         for (; s->mb_x < v->end_mb_x; s->mb_x++) {
-            uint8_t *dst[6];
             ff_update_block_index(s);
-            dst[0] = s->dest[0];
-            dst[1] = dst[0] + 8;
-            dst[2] = s->dest[0] + s->linesize * 8;
-            dst[3] = dst[2] + 8;
-            dst[4] = s->dest[1];
-            dst[5] = s->dest[2];
-            s->bdsp.clear_blocks(s->block[0]);
+            s->bdsp.clear_blocks(v->block[v->cur_blk_idx][0]);
             mb_pos = s->mb_x + s->mb_y * s->mb_width;
             s->current_picture.mb_type[mb_pos]                     = MB_TYPE_INTRA;
             s->current_picture.qscale_table[mb_pos]                = v->pq;
-            s->current_picture.motion_val[1][s->block_index[0]][0] = 0;
-            s->current_picture.motion_val[1][s->block_index[0]][1] = 0;
+            for (int i = 0; i < 4; i++) {
+                s->current_picture.motion_val[1][s->block_index[i]][0] = 0;
+                s->current_picture.motion_val[1][s->block_index[i]][1] = 0;
+            }
 
             // do actual MB decoding and displaying
             cbp = get_vlc2(&v->s.gb, ff_msmp4_mb_i_vlc.table, MB_INTRA_VLC_BITS, 2);
             v->s.ac_pred = get_bits1(&v->s.gb);
 
             for (k = 0; k < 6; k++) {
+                v->mb_type[0][s->block_index[k]] = 1;
+
                 val = ((cbp >> (5 - k)) & 1);
 
                 if (k < 4) {
@@ -2649,52 +2570,30 @@ static void vc1_decode_i_blocks(VC1Context *v)
                 }
                 cbp |= val << (5 - k);
 
-                vc1_decode_i_block(v, s->block[k], k, val, (k < 4) ? v->codingset : v->codingset2);
+                vc1_decode_i_block(v, v->block[v->cur_blk_idx][block_map[k]], k, val, (k < 4) ? v->codingset : v->codingset2);
 
-                if (k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
-                v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
-                if (v->pq >= 9 && v->overlap) {
-                    if (v->rangeredfrm)
+                v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[k]]);
+            }
+
+            if (v->overlap && v->pq >= 9) {
+                ff_vc1_i_overlap_filter(v);
+                if (v->rangeredfrm)
+                    for (k = 0; k < 6; k++)
                         for (j = 0; j < 64; j++)
-                            s->block[k][j] <<= 1;
-                    s->idsp.put_signed_pixels_clamped(s->block[k], dst[k],
-                                                      k & 4 ? s->uvlinesize
-                                                            : s->linesize);
-                } else {
-                    if (v->rangeredfrm)
+                            v->block[v->cur_blk_idx][block_map[k]][j] <<= 1;
+                vc1_put_blocks_clamped(v, 1);
+            } else {
+                if (v->rangeredfrm)
+                    for (k = 0; k < 6; k++)
                         for (j = 0; j < 64; j++)
-                            s->block[k][j] = (s->block[k][j] - 64) << 1;
-                    s->idsp.put_pixels_clamped(s->block[k], dst[k],
-                                               k & 4 ? s->uvlinesize
-                                                     : s->linesize);
-                }
+                            v->block[v->cur_blk_idx][block_map[k]][j] = (v->block[v->cur_blk_idx][block_map[k]][j] - 64) << 1;
+                vc1_put_blocks_clamped(v, 0);
             }
 
-            if (v->pq >= 9 && v->overlap) {
-                if (s->mb_x) {
-                    v->vc1dsp.vc1_h_overlap(s->dest[0], s->linesize);
-                    v->vc1dsp.vc1_h_overlap(s->dest[0] + 8 * s->linesize, s->linesize);
-                    if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-                        v->vc1dsp.vc1_h_overlap(s->dest[1], s->uvlinesize);
-                        v->vc1dsp.vc1_h_overlap(s->dest[2], s->uvlinesize);
-                    }
-                }
-                v->vc1dsp.vc1_h_overlap(s->dest[0] + 8, s->linesize);
-                v->vc1dsp.vc1_h_overlap(s->dest[0] + 8 * s->linesize + 8, s->linesize);
-                if (!s->first_slice_line) {
-                    v->vc1dsp.vc1_v_overlap(s->dest[0], s->linesize);
-                    v->vc1dsp.vc1_v_overlap(s->dest[0] + 8, s->linesize);
-                    if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-                        v->vc1dsp.vc1_v_overlap(s->dest[1], s->uvlinesize);
-                        v->vc1dsp.vc1_v_overlap(s->dest[2], s->uvlinesize);
-                    }
-                }
-                v->vc1dsp.vc1_v_overlap(s->dest[0] + 8 * s->linesize, s->linesize);
-                v->vc1dsp.vc1_v_overlap(s->dest[0] + 8 * s->linesize + 8, s->linesize);
-            }
             if (v->s.loop_filter)
-                ff_vc1_loop_filter_iblk(v, v->pq);
+                ff_vc1_i_loop_filter(v);
 
             if (get_bits_count(&s->gb) > v->bits) {
                 ff_er_add_slice(&s->er, 0, 0, s->mb_x, s->mb_y, ER_MB_ERROR);
@@ -2702,6 +2601,11 @@ static void vc1_decode_i_blocks(VC1Context *v)
                        get_bits_count(&s->gb), v->bits);
                 return;
             }
+
+            v->topleft_blk_idx = (v->topleft_blk_idx + 1) % (v->end_mb_x + 2);
+            v->top_blk_idx = (v->top_blk_idx + 1) % (v->end_mb_x + 2);
+            v->left_blk_idx = (v->left_blk_idx + 1) % (v->end_mb_x + 2);
+            v->cur_blk_idx = (v->cur_blk_idx + 1) % (v->end_mb_x + 2);
         }
         if (!v->s.loop_filter)
             ff_mpeg_draw_horiz_band(s, s->mb_y * 16, 16);
@@ -2727,7 +2631,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
     int cbp, val;
     uint8_t *coded_val;
     int mb_pos;
-    int mquant = v->pq;
+    int mquant;
     int mqdiff;
     GetBitContext *gb = &s->gb;
 
@@ -2771,13 +2675,15 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
         s->mb_x = 0;
         init_block_index(v);
         for (;s->mb_x < s->mb_width; s->mb_x++) {
-            int16_t (*block)[64] = v->block[v->cur_blk_idx];
+            mquant = v->pq;
             ff_update_block_index(s);
-            s->bdsp.clear_blocks(block[0]);
+            s->bdsp.clear_blocks(v->block[v->cur_blk_idx][0]);
             mb_pos = s->mb_x + s->mb_y * s->mb_stride;
             s->current_picture.mb_type[mb_pos + v->mb_off]                         = MB_TYPE_INTRA;
-            s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0;
-            s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = 0;
+            for (int i = 0; i < 4; i++) {
+                s->current_picture.motion_val[1][s->block_index[i] + v->blocks_off][0] = 0;
+                s->current_picture.motion_val[1][s->block_index[i] + v->blocks_off][1] = 0;
+            }
 
             // do actual MB decoding and displaying
             if (v->fieldtx_is_raw)
@@ -2795,10 +2701,12 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
 
             s->current_picture.qscale_table[mb_pos] = mquant;
             /* Set DC scale - y and c use the same */
-            s->y_dc_scale = s->y_dc_scale_table[mquant];
-            s->c_dc_scale = s->c_dc_scale_table[mquant];
+            s->y_dc_scale = s->y_dc_scale_table[FFABS(mquant)];
+            s->c_dc_scale = s->c_dc_scale_table[FFABS(mquant)];
 
             for (k = 0; k < 6; k++) {
+                v->mb_type[0][s->block_index[k]] = 1;
+
                 val = ((cbp >> (5 - k)) & 1);
 
                 if (k < 4) {
@@ -2811,18 +2719,19 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                 v->a_avail = !s->first_slice_line || (k == 2 || k == 3);
                 v->c_avail = !!s->mb_x || (k == 1 || k == 3);
 
-                vc1_decode_i_block_adv(v, block[k], k, val,
+                vc1_decode_i_block_adv(v, v->block[v->cur_blk_idx][block_map[k]], k, val,
                                        (k < 4) ? v->codingset : v->codingset2, mquant);
 
-                if (k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
-                v->vc1dsp.vc1_inv_trans_8x8(block[k]);
+                v->vc1dsp.vc1_inv_trans_8x8(v->block[v->cur_blk_idx][block_map[k]]);
             }
 
-            ff_vc1_smooth_overlap_filter_iblk(v);
-            vc1_put_signed_blocks_clamped(v);
+            if (v->overlap && (v->pq >= 9 || v->condover != CONDOVER_NONE))
+                ff_vc1_i_overlap_filter(v);
+            vc1_put_blocks_clamped(v, 1);
             if (v->s.loop_filter)
-                ff_vc1_loop_filter_iblk_delayed(v, v->pq);
+                ff_vc1_i_loop_filter(v);
 
             if (get_bits_count(&s->gb) > v->bits) {
                 // TODO: may need modification to handle slice coding
@@ -2831,6 +2740,10 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                        get_bits_count(&s->gb), v->bits);
                 return;
             }
+            inc_blk_idx(v->topleft_blk_idx);
+            inc_blk_idx(v->top_blk_idx);
+            inc_blk_idx(v->left_blk_idx);
+            inc_blk_idx(v->cur_blk_idx);
         }
         if (!v->s.loop_filter)
             ff_mpeg_draw_horiz_band(s, s->mb_y * 16, 16);
@@ -2839,18 +2752,8 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
         s->first_slice_line = 0;
     }
 
-    /* raw bottom MB row */
-    s->mb_x = 0;
-    init_block_index(v);
-
-    for (;s->mb_x < s->mb_width; s->mb_x++) {
-        ff_update_block_index(s);
-        vc1_put_signed_blocks_clamped(v);
-        if (v->s.loop_filter)
-            ff_vc1_loop_filter_iblk_delayed(v, v->pq);
-    }
     if (v->s.loop_filter)
-        ff_mpeg_draw_horiz_band(s, (s->end_mb_y-1)*16, 16);
+        ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
     ff_er_add_slice(&s->er, 0, s->start_mb_y << v->field_mode, s->mb_width - 1,
                     (s->end_mb_y << v->field_mode) - 1, ER_MB_END);
 }
@@ -2885,23 +2788,28 @@ static void vc1_decode_p_blocks(VC1Context *v)
         break;
     }
 
-    apply_loop_filter   = s->loop_filter && !(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY) &&
-                          v->fcm == PROGRESSIVE;
+    apply_loop_filter   = s->loop_filter && !(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY);
     s->first_slice_line = 1;
-    memset(v->cbp_base, 0, sizeof(v->cbp_base[0])*2*s->mb_stride);
+    memset(v->cbp_base, 0, sizeof(v->cbp_base[0]) * 3 * s->mb_stride);
     for (s->mb_y = s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
         s->mb_x = 0;
         init_block_index(v);
         for (; s->mb_x < s->mb_width; s->mb_x++) {
             ff_update_block_index(s);
 
-            if (v->fcm == ILACE_FIELD)
+            if (v->fcm == ILACE_FIELD) {
                 vc1_decode_p_mb_intfi(v);
-            else if (v->fcm == ILACE_FRAME)
+                if (apply_loop_filter)
+                    ff_vc1_p_loop_filter(v);
+            } else if (v->fcm == ILACE_FRAME) {
                 vc1_decode_p_mb_intfr(v);
-            else vc1_decode_p_mb(v);
-            if (s->mb_y != s->start_mb_y && apply_loop_filter)
-                ff_vc1_apply_p_loop_filter(v);
+                if (apply_loop_filter)
+                    ff_vc1_p_intfr_loop_filter(v);
+            } else {
+                vc1_decode_p_mb(v);
+                if (apply_loop_filter)
+                    ff_vc1_p_loop_filter(v);
+            }
             if (get_bits_count(&s->gb) > v->bits || get_bits_count(&s->gb) < 0) {
                 // TODO: may need modification to handle slice coding
                 ff_er_add_slice(&s->er, 0, s->start_mb_y, s->mb_x, s->mb_y, ER_MB_ERROR);
@@ -2909,22 +2817,27 @@ static void vc1_decode_p_blocks(VC1Context *v)
                        get_bits_count(&s->gb), v->bits, s->mb_x, s->mb_y);
                 return;
             }
+            inc_blk_idx(v->topleft_blk_idx);
+            inc_blk_idx(v->top_blk_idx);
+            inc_blk_idx(v->left_blk_idx);
+            inc_blk_idx(v->cur_blk_idx);
         }
-        memmove(v->cbp_base,      v->cbp,      sizeof(v->cbp_base[0])      * s->mb_stride);
-        memmove(v->ttblk_base,    v->ttblk,    sizeof(v->ttblk_base[0])    * s->mb_stride);
-        memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0]) * s->mb_stride);
-        memmove(v->luma_mv_base,  v->luma_mv,  sizeof(v->luma_mv_base[0])  * s->mb_stride);
-        if (s->mb_y != s->start_mb_y) ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
+        memmove(v->cbp_base,
+                v->cbp - s->mb_stride,
+                sizeof(v->cbp_base[0]) * 2 * s->mb_stride);
+        memmove(v->ttblk_base,
+                v->ttblk - s->mb_stride,
+                sizeof(v->ttblk_base[0]) * 2 * s->mb_stride);
+        memmove(v->is_intra_base,
+                v->is_intra - s->mb_stride,
+                sizeof(v->is_intra_base[0]) * 2 * s->mb_stride);
+        memmove(v->luma_mv_base,
+                v->luma_mv - s->mb_stride,
+                sizeof(v->luma_mv_base[0]) * 2 * s->mb_stride);
+        if (s->mb_y != s->start_mb_y)
+            ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
         s->first_slice_line = 0;
     }
-    if (apply_loop_filter) {
-        s->mb_x = 0;
-        init_block_index(v);
-        for (; s->mb_x < s->mb_width; s->mb_x++) {
-            ff_update_block_index(s);
-            ff_vc1_apply_p_loop_filter(v);
-        }
-    }
     if (s->end_mb_y >= s->start_mb_y)
         ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
     ff_er_add_slice(&s->er, 0, s->start_mb_y << v->field_mode, s->mb_width - 1,
@@ -2967,12 +2880,19 @@ static void vc1_decode_b_blocks(VC1Context *v)
         for (; s->mb_x < s->mb_width; s->mb_x++) {
             ff_update_block_index(s);
 
-            if (v->fcm == ILACE_FIELD)
+            if (v->fcm == ILACE_FIELD) {
                 vc1_decode_b_mb_intfi(v);
-            else if (v->fcm == ILACE_FRAME)
+                if (v->s.loop_filter)
+                    ff_vc1_b_intfi_loop_filter(v);
+            } else if (v->fcm == ILACE_FRAME) {
                 vc1_decode_b_mb_intfr(v);
-            else
+                if (v->s.loop_filter)
+                    ff_vc1_p_intfr_loop_filter(v);
+            } else {
                 vc1_decode_b_mb(v);
+                if (v->s.loop_filter)
+                    ff_vc1_i_loop_filter(v);
+            }
             if (get_bits_count(&s->gb) > v->bits || get_bits_count(&s->gb) < 0) {
                 // TODO: may need modification to handle slice coding
                 ff_er_add_slice(&s->er, 0, s->start_mb_y, s->mb_x, s->mb_y, ER_MB_ERROR);
@@ -2980,9 +2900,16 @@ static void vc1_decode_b_blocks(VC1Context *v)
                        get_bits_count(&s->gb), v->bits, s->mb_x, s->mb_y);
                 return;
             }
-            if (v->s.loop_filter)
-                ff_vc1_loop_filter_iblk(v, v->pq);
         }
+        memmove(v->cbp_base,
+                v->cbp - s->mb_stride,
+                sizeof(v->cbp_base[0]) * 2 * s->mb_stride);
+        memmove(v->ttblk_base,
+                v->ttblk - s->mb_stride,
+                sizeof(v->ttblk_base[0]) * 2 * s->mb_stride);
+        memmove(v->is_intra_base,
+                v->is_intra - s->mb_stride,
+                sizeof(v->is_intra_base[0]) * 2 * s->mb_stride);
         if (!v->s.loop_filter)
             ff_mpeg_draw_horiz_band(s, s->mb_y * 16, 16);
         else if (s->mb_y)
diff --git a/libavcodec/vc1_common.h b/libavcodec/vc1_common.h
index 788d324..b46c33f 100644
--- a/libavcodec/vc1_common.h
+++ b/libavcodec/vc1_common.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "internal.h"
 
 /** Markers used in VC-1 AP frame data */
 //@{
@@ -57,12 +58,9 @@ enum Profile {
  */
 static av_always_inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
 {
-    uint32_t mrk = 0xFFFFFFFF;
-
-    if (end-src < 4)
-        return end;
-    while (src < end) {
-        mrk = (mrk << 8) | *src++;
+    if (end - src >= 4) {
+        uint32_t mrk = 0xFFFFFFFF;
+        src = avpriv_find_start_code(src, end, &mrk);
         if (IS_MARKER(mrk))
             return src - 4;
     }
diff --git a/libavcodec/vc1_loopfilter.c b/libavcodec/vc1_loopfilter.c
index 52cff1e..0f990cc 100644
--- a/libavcodec/vc1_loopfilter.c
+++ b/libavcodec/vc1_loopfilter.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,323 +31,1211 @@
 #include "vc1.h"
 #include "vc1dsp.h"
 
-void ff_vc1_loop_filter_iblk(VC1Context *v, int pq)
+static av_always_inline void vc1_h_overlap_filter(VC1Context *v, int16_t (*left_block)[64],
+                                                  int16_t (*right_block)[64], int left_fieldtx,
+                                                  int right_fieldtx, int block_num)
+{
+    switch (block_num) {
+    case 0:
+        v->vc1dsp.vc1_h_s_overlap(left_block[2],
+                                  right_block[0],
+                                  left_fieldtx ^ right_fieldtx ? 16 - 8 * left_fieldtx : 8,
+                                  left_fieldtx ^ right_fieldtx ? 16 - 8 * right_fieldtx : 8,
+                                  left_fieldtx || right_fieldtx ? 0 : 1);
+        break;
+
+    case 1:
+        v->vc1dsp.vc1_h_s_overlap(right_block[0],
+                                  right_block[2],
+                                  8,
+                                  8,
+                                  right_fieldtx ? 0 : 1);
+        break;
+
+    case 2:
+        v->vc1dsp.vc1_h_s_overlap(!left_fieldtx && right_fieldtx ? left_block[2] + 8 : left_block[3],
+                                  left_fieldtx && !right_fieldtx ? right_block[0] + 8 : right_block[1],
+                                  left_fieldtx ^ right_fieldtx ? 16 - 8 * left_fieldtx : 8,
+                                  left_fieldtx ^ right_fieldtx ? 16 - 8 * right_fieldtx : 8,
+                                  left_fieldtx || right_fieldtx ? 2 : 1);
+        break;
+
+    case 3:
+        v->vc1dsp.vc1_h_s_overlap(right_block[1],
+                                  right_block[3],
+                                  8,
+                                  8,
+                                  right_fieldtx ? 2 : 1);
+        break;
+
+    case 4:
+    case 5:
+        v->vc1dsp.vc1_h_s_overlap(left_block[block_num], right_block[block_num], 8, 8, 1);
+        break;
+    }
+}
+
+static av_always_inline void vc1_v_overlap_filter(VC1Context *v, int16_t (*top_block)[64],
+                                                  int16_t (*bottom_block)[64], int block_num)
+{
+    switch (block_num) {
+    case 0:
+        v->vc1dsp.vc1_v_s_overlap(top_block[1], bottom_block[0]);
+        break;
+
+    case 1:
+        v->vc1dsp.vc1_v_s_overlap(top_block[3], bottom_block[2]);
+        break;
+
+    case 2:
+        v->vc1dsp.vc1_v_s_overlap(bottom_block[0], bottom_block[1]);
+        break;
+
+    case 3:
+        v->vc1dsp.vc1_v_s_overlap(bottom_block[2], bottom_block[3]);
+        break;
+
+    case 4:
+    case 5:
+        v->vc1dsp.vc1_v_s_overlap(top_block[block_num], bottom_block[block_num]);
+        break;
+    }
+}
+
+void ff_vc1_i_overlap_filter(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
-    int j;
-    if (!s->first_slice_line) {
-        v->vc1dsp.vc1_v_loop_filter16(s->dest[0], s->linesize, pq);
-        if (s->mb_x)
-            v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
-        v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
-        for (j = 0; j < 2; j++) {
-            v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1], s->uvlinesize, pq);
-            if (s->mb_x)
-                v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
+    int16_t (*topleft_blk)[64], (*top_blk)[64], (*left_blk)[64], (*cur_blk)[64];
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    int i;
+
+    topleft_blk = v->block[v->topleft_blk_idx];
+    top_blk = v->block[v->top_blk_idx];
+    left_blk = v->block[v->left_blk_idx];
+    cur_blk = v->block[v->cur_blk_idx];
+
+    /* Within a MB, the horizontal overlap always runs before the vertical.
+     * To accomplish that, we run the H on the left and internal vertical
+     * borders of the currently decoded MB. Then, we wait for the next overlap
+     * iteration to do H overlap on the right edge of this MB, before moving
+     * over and running the V overlap on the top and internal horizontal
+     * borders. Therefore, the H overlap trails by one MB col and the
+     * V overlap trails by one MB row. This is reflected in the time at which
+     * we run the put_pixels loop, i.e. delayed by one row and one column. */
+    for (i = 0; i < block_count; i++) {
+        if (s->mb_x == 0 && (i & 5) != 1)
+            continue;
+
+        if (v->pq >= 9 || (v->profile == PROFILE_ADVANCED &&
+                           (v->condover == CONDOVER_ALL ||
+                            (v->over_flags_plane[mb_pos] &&
+                             ((i & 5) == 1 || v->over_flags_plane[mb_pos - 1])))))
+            vc1_h_overlap_filter(v,
+                                 s->mb_x ? left_blk : cur_blk, cur_blk,
+                                 v->fcm == ILACE_FRAME && s->mb_x && v->fieldtx_plane[mb_pos - 1],
+                                 v->fcm == ILACE_FRAME && v->fieldtx_plane[mb_pos],
+                                 i);
+    }
+
+    if (v->fcm != ILACE_FRAME)
+        for (i = 0; i < block_count; i++) {
+            if (s->first_slice_line && !(i & 2))
+                continue;
+
+            if (s->mb_x &&
+                (v->pq >= 9 || (v->profile == PROFILE_ADVANCED &&
+                                (v->condover == CONDOVER_ALL ||
+                                 (v->over_flags_plane[mb_pos - 1] &&
+                                  ((i & 2) || v->over_flags_plane[mb_pos - 1 - s->mb_stride]))))))
+                vc1_v_overlap_filter(v, s->first_slice_line ? left_blk : topleft_blk, left_blk, i);
+            if (s->mb_x == s->mb_width - 1 &&
+                (v->pq >= 9 || (v->profile == PROFILE_ADVANCED &&
+                                (v->condover == CONDOVER_ALL ||
+                                 (v->over_flags_plane[mb_pos] &&
+                                  ((i & 2) || v->over_flags_plane[mb_pos - s->mb_stride]))))))
+                vc1_v_overlap_filter(v, s->first_slice_line ? cur_blk : top_blk, cur_blk, i);
         }
+}
+
+void ff_vc1_p_overlap_filter(VC1Context *v)
+{
+    MpegEncContext *s = &v->s;
+    int16_t (*topleft_blk)[64], (*top_blk)[64], (*left_blk)[64], (*cur_blk)[64];
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    int i;
+
+    topleft_blk = v->block[v->topleft_blk_idx];
+    top_blk = v->block[v->top_blk_idx];
+    left_blk = v->block[v->left_blk_idx];
+    cur_blk = v->block[v->cur_blk_idx];
+
+    for (i = 0; i < block_count; i++) {
+        if (s->mb_x == 0 && (i & 5) != 1)
+            continue;
+
+        if (v->mb_type[0][s->block_index[i]] && v->mb_type[0][s->block_index[i] - 1])
+            vc1_h_overlap_filter(v,
+                                 s->mb_x ? left_blk : cur_blk, cur_blk,
+                                 v->fcm == ILACE_FRAME && s->mb_x && v->fieldtx_plane[mb_pos - 1],
+                                 v->fcm == ILACE_FRAME && v->fieldtx_plane[mb_pos],
+                                 i);
     }
-    v->vc1dsp.vc1_v_loop_filter16(s->dest[0] + 8 * s->linesize, s->linesize, pq);
 
-    if (s->mb_y == s->end_mb_y - 1) {
-        if (s->mb_x) {
-            v->vc1dsp.vc1_h_loop_filter16(s->dest[0], s->linesize, pq);
-            v->vc1dsp.vc1_h_loop_filter8(s->dest[1], s->uvlinesize, pq);
-            v->vc1dsp.vc1_h_loop_filter8(s->dest[2], s->uvlinesize, pq);
+    if (v->fcm != ILACE_FRAME)
+        for (i = 0; i < block_count; i++) {
+            if (s->first_slice_line && !(i & 2))
+                continue;
+
+            if (s->mb_x && v->mb_type[0][s->block_index[i] - 2 + (i > 3)] &&
+                v->mb_type[0][s->block_index[i] - s->block_wrap[i] - 2 + (i > 3)])
+                vc1_v_overlap_filter(v, s->first_slice_line ? left_blk : topleft_blk, left_blk, i);
+            if (s->mb_x == s->mb_width - 1)
+                if (v->mb_type[0][s->block_index[i]] &&
+                    v->mb_type[0][s->block_index[i] - s->block_wrap[i]])
+                    vc1_v_overlap_filter(v, s->first_slice_line ? cur_blk : top_blk, cur_blk, i);
         }
-        v->vc1dsp.vc1_h_loop_filter16(s->dest[0] + 8, s->linesize, pq);
+}
+
+#define LEFT_EDGE   (1 << 0)
+#define RIGHT_EDGE  (1 << 1)
+#define TOP_EDGE    (1 << 2)
+#define BOTTOM_EDGE (1 << 3)
+
+static av_always_inline void vc1_i_h_loop_filter(VC1Context *v, uint8_t *dest,
+                                                 uint32_t flags, int block_num)
+{
+    MpegEncContext *s  = &v->s;
+    int pq = v->pq;
+    uint8_t *dst;
+
+    if (block_num & 2)
+        return;
+
+    if (!(flags & LEFT_EDGE) || (block_num & 5) == 1) {
+        if (block_num > 3)
+            dst = dest;
+        else
+            dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
+
+        if (v->fcm == ILACE_FRAME)
+            if (block_num > 3) {
+                v->vc1dsp.vc1_h_loop_filter4(dst, 2 * s->uvlinesize, pq);
+                v->vc1dsp.vc1_h_loop_filter4(dst + s->uvlinesize, 2 * s->uvlinesize, pq);
+            } else {
+                v->vc1dsp.vc1_h_loop_filter8(dst, 2 * s->linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter8(dst + s->linesize, 2 * s->linesize, pq);
+            }
+        else
+            if (block_num > 3)
+                v->vc1dsp.vc1_h_loop_filter8(dst, s->uvlinesize, pq);
+            else
+                v->vc1dsp.vc1_h_loop_filter16(dst, s->linesize, pq);
+    }
+}
+
+static av_always_inline void vc1_i_v_loop_filter(VC1Context *v, uint8_t *dest,
+                                                 uint32_t flags, uint8_t fieldtx,
+                                                 int block_num)
+{
+    MpegEncContext *s  = &v->s;
+    int pq = v->pq;
+    uint8_t *dst;
+
+    if ((block_num & 5) == 1)
+        return;
+
+    if (!(flags & TOP_EDGE) || block_num & 2) {
+        if (block_num > 3)
+            dst = dest;
+        else
+            dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
+
+        if (v->fcm == ILACE_FRAME) {
+            if (block_num > 3) {
+                v->vc1dsp.vc1_v_loop_filter8(dst, 2 * s->uvlinesize, pq);
+                v->vc1dsp.vc1_v_loop_filter8(dst + s->uvlinesize, 2 * s->uvlinesize, pq);
+            } else if (block_num < 2 || !fieldtx) {
+                v->vc1dsp.vc1_v_loop_filter16(dst, 2 * s->linesize, pq);
+                v->vc1dsp.vc1_v_loop_filter16(dst + s->linesize, 2 * s->linesize, pq);
+            }
+        } else
+            if (block_num > 3)
+                v->vc1dsp.vc1_v_loop_filter8(dst, s->uvlinesize, pq);
+            else
+                v->vc1dsp.vc1_v_loop_filter16(dst, s->linesize, pq);
     }
 }
 
-void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
+void ff_vc1_i_loop_filter(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
-    int j;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    uint8_t *dest, fieldtx;
+    uint32_t flags = 0;
+    int i;
 
-    /* The loopfilter runs 1 row and 1 column behind the overlap filter, which
-     * means it runs two rows/cols behind the decoding loop. */
+    /* Within a MB, the vertical loop filter always runs before the horizontal.
+     * To accomplish that, we run the V loop filter on top and internal
+     * horizontal borders of the last overlap filtered MB. Then, we wait for
+     * the loop filter iteration on the next row to do V loop filter on the
+     * bottom edge of this MB, before moving over and running the H loop
+     * filter on the left and internal vertical borders. Therefore, the loop
+     * filter trails by one row and one column relative to the overlap filter
+     * and two rows and two columns relative to the decoding loop. */
     if (!s->first_slice_line) {
+        dest = s->dest[0] - 16 * s->linesize - 16;
+        flags = s->mb_y == s->start_mb_y + 1 ? TOP_EDGE : 0;
         if (s->mb_x) {
-            if (s->mb_y >= s->start_mb_y + 2) {
-                v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 16 * s->linesize - 16, s->linesize, pq);
-
-                if (s->mb_x >= 2)
-                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 16, s->linesize, pq);
-                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 8, s->linesize, pq);
-                for (j = 0; j < 2; j++) {
-                    v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
-                    if (s->mb_x >= 2) {
-                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 16 * s->uvlinesize - 8, s->uvlinesize, pq);
-                    }
-                }
-            }
-            v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 8 * s->linesize - 16, s->linesize, pq);
+            fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride - 1];
+            for (i = 0; i < block_count; i++)
+                vc1_i_v_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest, flags, fieldtx, i);
         }
-
-        if (s->mb_x == s->mb_width - 1) {
-            if (s->mb_y >= s->start_mb_y + 2) {
-                v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
-
-                if (s->mb_x)
-                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize, s->linesize, pq);
-                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize + 8, s->linesize, pq);
-                for (j = 0; j < 2; j++) {
-                    v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
-                    if (s->mb_x >= 2) {
-                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 16 * s->uvlinesize, s->uvlinesize, pq);
-                    }
-                }
-            }
-            v->vc1dsp.vc1_v_loop_filter16(s->dest[0] - 8 * s->linesize, s->linesize, pq);
+        if (s->mb_x == v->end_mb_x - 1) {
+            dest += 16;
+            fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride];
+            for (i = 0; i < block_count; i++)
+                vc1_i_v_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest, flags, fieldtx, i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        dest = s->dest[0] - 16;
+        flags = s->first_slice_line ? TOP_EDGE | BOTTOM_EDGE : BOTTOM_EDGE;
+        if (s->mb_x) {
+            fieldtx = v->fieldtx_plane[mb_pos - 1];
+            for (i = 0; i < block_count; i++)
+                vc1_i_v_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 : dest, flags, fieldtx, i);
+        }
+        if (s->mb_x == v->end_mb_x - 1) {
+            dest += 16;
+            fieldtx = v->fieldtx_plane[mb_pos];
+            for (i = 0; i < block_count; i++)
+                vc1_i_v_loop_filter(v, i > 3 ? s->dest[i - 3] : dest, flags, fieldtx, i);
         }
+    }
 
-        if (s->mb_y == s->end_mb_y) {
+    if (s->mb_y >= s->start_mb_y + 2) {
+        dest = s->dest[0] - 32 * s->linesize - 16;
+        if (s->mb_x) {
+            flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_i_h_loop_filter(v, i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize - 8 : dest, flags, i);
+        }
+        if (s->mb_x == v->end_mb_x - 1) {
+            dest += 16;
+            flags = s->mb_x == 0 ? LEFT_EDGE | RIGHT_EDGE : RIGHT_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_i_h_loop_filter(v, i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize : dest, flags, i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        if (s->mb_y >= s->start_mb_y + 1) {
+            dest = s->dest[0] - 16 * s->linesize - 16;
             if (s->mb_x) {
-                if (s->mb_x >= 2)
-                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 16, s->linesize, pq);
-                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 8, s->linesize, pq);
-                if (s->mb_x >= 2) {
-                    for (j = 0; j < 2; j++) {
-                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
-                    }
-                }
+                flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                for (i = 0; i < block_count; i++)
+                    vc1_i_h_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest, flags, i);
             }
-
-            if (s->mb_x == s->mb_width - 1) {
-                if (s->mb_x)
-                    v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
-                v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
-                if (s->mb_x) {
-                    for (j = 0; j < 2; j++) {
-                        v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
-                    }
-                }
+            if (s->mb_x == v->end_mb_x - 1) {
+                flags = s->mb_x == 0 ? LEFT_EDGE | RIGHT_EDGE : RIGHT_EDGE;
+                dest += 16;
+                for (i = 0; i < block_count; i++)
+                    vc1_i_h_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest, flags, i);
             }
         }
+        dest = s->dest[0] - 16;
+        if (s->mb_x) {
+            flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_i_h_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 : dest, flags, i);
+        }
+        if (s->mb_x == v->end_mb_x - 1) {
+            dest += 16;
+            flags = s->mb_x == 0 ? LEFT_EDGE | RIGHT_EDGE : RIGHT_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_i_h_loop_filter(v, i > 3 ? s->dest[i - 3] : dest, flags, i);
+        }
     }
 }
 
-void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
+static av_always_inline void vc1_p_h_loop_filter(VC1Context *v, uint8_t *dest, uint32_t *cbp,
+                                                 uint8_t *is_intra, int16_t (*mv)[2], uint8_t *mv_f,
+                                                 int *ttblk, uint32_t flags, int block_num)
 {
-    MpegEncContext *s = &v->s;
-    int mb_pos;
+    MpegEncContext *s  = &v->s;
+    int pq = v->pq;
+    uint32_t left_cbp = cbp[0] >> (block_num * 4), right_cbp;
+    uint8_t left_is_intra, right_is_intra;
+    int tt;
+    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
+    uint8_t *dst;
 
-    if (v->condover == CONDOVER_NONE)
-        return;
+    if (block_num > 3)
+        dst = dest;
+    else
+        dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
 
-    mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    if (!(flags & RIGHT_EDGE) || !(block_num & 5)) {
+        left_is_intra = is_intra[0] & (1 << block_num);
 
-    /* Within a MB, the horizontal overlap always runs before the vertical.
-     * To accomplish that, we run the H on left and internal borders of the
-     * currently decoded MB. Then, we wait for the next overlap iteration
-     * to do H overlap on the right edge of this MB, before moving over and
-     * running the V overlap. Therefore, the V overlap makes us trail by one
-     * MB col and the H overlap filter makes us trail by one MB row. This
-     * is reflected in the time at which we run the put_pixels loop. */
-    if (v->condover == CONDOVER_ALL || v->pq >= 9 || v->over_flags_plane[mb_pos]) {
-        if (s->mb_x && (v->condover == CONDOVER_ALL || v->pq >= 9 ||
-                        v->over_flags_plane[mb_pos - 1])) {
-            v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][1],
-                                      v->block[v->cur_blk_idx][0]);
-            v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][3],
-                                      v->block[v->cur_blk_idx][2]);
-            if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-                v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][4],
-                                          v->block[v->cur_blk_idx][4]);
-                v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][5],
-                                          v->block[v->cur_blk_idx][5]);
-            }
+        if (block_num > 3) {
+            right_is_intra = is_intra[1] & (1 << block_num);
+            right_cbp = cbp[1] >> (block_num * 4);
+        } else if (block_num & 1) {
+            right_is_intra = is_intra[1] & (1 << block_num - 1);
+            right_cbp = cbp[1] >> ((block_num - 1) * 4);
+        } else {
+            right_is_intra = is_intra[0] & (1 << block_num + 1);
+            right_cbp = cbp[0] >> ((block_num + 1) * 4);
         }
-        v->vc1dsp.vc1_h_s_overlap(v->block[v->cur_blk_idx][0],
-                                  v->block[v->cur_blk_idx][1]);
-        v->vc1dsp.vc1_h_s_overlap(v->block[v->cur_blk_idx][2],
-                                  v->block[v->cur_blk_idx][3]);
 
-        if (s->mb_x == s->mb_width - 1) {
-            if (!s->first_slice_line && (v->condover == CONDOVER_ALL || v->pq >= 9 ||
-                                         v->over_flags_plane[mb_pos - s->mb_stride])) {
-                v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][2],
-                                          v->block[v->cur_blk_idx][0]);
-                v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][3],
-                                          v->block[v->cur_blk_idx][1]);
-                if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-                    v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][4],
-                                              v->block[v->cur_blk_idx][4]);
-                    v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][5],
-                                              v->block[v->cur_blk_idx][5]);
-                }
-            }
-            v->vc1dsp.vc1_v_s_overlap(v->block[v->cur_blk_idx][0],
-                                      v->block[v->cur_blk_idx][2]);
-            v->vc1dsp.vc1_v_s_overlap(v->block[v->cur_blk_idx][1],
-                                      v->block[v->cur_blk_idx][3]);
-        }
-    }
-    if (s->mb_x && (v->condover == CONDOVER_ALL || v->over_flags_plane[mb_pos - 1])) {
-        if (!s->first_slice_line && (v->condover == CONDOVER_ALL || v->pq >= 9 ||
-                                     v->over_flags_plane[mb_pos - s->mb_stride - 1])) {
-            v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][2],
-                                      v->block[v->left_blk_idx][0]);
-            v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][3],
-                                      v->block[v->left_blk_idx][1]);
-            if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-                v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][4],
-                                          v->block[v->left_blk_idx][4]);
-                v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][5],
-                                          v->block[v->left_blk_idx][5]);
-            }
+        if (left_is_intra || right_is_intra ||
+            mv[0][0] != mv[1][0] || mv[0][1] != mv[1][1] ||
+            (v->fcm == ILACE_FIELD && mv_f[0] != mv_f[1]))
+            v->vc1dsp.vc1_h_loop_filter8(dst + 8, linesize, pq);
+        else {
+            idx = (left_cbp | (right_cbp >> 1)) & 5;
+            if (idx & 1)
+                v->vc1dsp.vc1_h_loop_filter4(dst + 4 * linesize + 8, linesize, pq);
+            if (idx & 4)
+                v->vc1dsp.vc1_h_loop_filter4(dst + 8, linesize, pq);
         }
-        v->vc1dsp.vc1_v_s_overlap(v->block[v->left_blk_idx][0],
-                                  v->block[v->left_blk_idx][2]);
-        v->vc1dsp.vc1_v_s_overlap(v->block[v->left_blk_idx][1],
-                                  v->block[v->left_blk_idx][3]);
+    }
+
+    tt = ttblk[0] >> (block_num * 4) & 0xf;
+    if (tt == TT_4X4 || tt == TT_4X8) {
+        if (left_cbp & 3)
+            v->vc1dsp.vc1_h_loop_filter4(dst + 4 * linesize + 4, linesize, pq);
+        if (left_cbp & 12)
+            v->vc1dsp.vc1_h_loop_filter4(dst + 4, linesize, pq);
     }
 }
 
-static av_always_inline void vc1_apply_p_v_loop_filter(VC1Context *v, int block_num)
+static av_always_inline void vc1_p_v_loop_filter(VC1Context *v, uint8_t *dest, uint32_t *cbp,
+                                                 uint8_t *is_intra, int16_t (*mv)[2], uint8_t *mv_f,
+                                                 int *ttblk, uint32_t flags, int block_num)
 {
     MpegEncContext *s  = &v->s;
-    int mb_cbp         = v->cbp[s->mb_x - s->mb_stride],
-        block_cbp      = mb_cbp      >> (block_num * 4), bottom_cbp,
-        mb_is_intra    = v->is_intra[s->mb_x - s->mb_stride],
-        block_is_intra = mb_is_intra >> (block_num * 4), bottom_is_intra;
-    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize, ttblk;
+    int pq = v->pq;
+    uint32_t top_cbp = cbp[0] >> (block_num * 4), bottom_cbp;
+    uint8_t top_is_intra, bottom_is_intra;
+    int tt;
+    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
     uint8_t *dst;
 
-    if (block_num > 3) {
-        dst      = s->dest[block_num - 3];
-    } else {
-        dst      = s->dest[0] + (block_num & 1) * 8 + ((block_num & 2) * 4 - 8) * linesize;
-    }
-    if (s->mb_y != s->end_mb_y || block_num < 2) {
-        int16_t (*mv)[2];
-        int mv_stride;
+    if (block_num > 3)
+        dst = dest;
+    else
+        dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
+
+    if(!(flags & BOTTOM_EDGE) || block_num < 2) {
+        top_is_intra = is_intra[0] & (1 << block_num);
 
         if (block_num > 3) {
-            bottom_cbp      = v->cbp[s->mb_x]      >> (block_num * 4);
-            bottom_is_intra = v->is_intra[s->mb_x] >> block_num;
-            mv              = &v->luma_mv[s->mb_x - s->mb_stride];
-            mv_stride       = s->mb_stride;
+            bottom_is_intra = is_intra[s->mb_stride] & (1 << block_num);
+            bottom_cbp = cbp[s->mb_stride] >> (block_num * 4);
+        } else if (block_num < 2) {
+            bottom_is_intra = is_intra[0] & (1 << block_num + 2);
+            bottom_cbp = cbp[0] >> ((block_num + 2) * 4);
         } else {
-            bottom_cbp      = (block_num < 2) ? (mb_cbp               >> ((block_num + 2) * 4))
-                                              : (v->cbp[s->mb_x]      >> ((block_num - 2) * 4));
-            bottom_is_intra = (block_num < 2) ? (mb_is_intra          >> (block_num + 2))
-                                              : (v->is_intra[s->mb_x] >> (block_num - 2));
-            mv_stride       = s->b8_stride;
-            mv              = &s->current_picture.motion_val[0][s->block_index[block_num] - 2 * mv_stride];
+            bottom_is_intra = is_intra[s->mb_stride] & (1 << block_num - 2);
+            bottom_cbp = cbp[s->mb_stride] >> ((block_num - 2) * 4);
         }
 
-        if (bottom_is_intra & 1 || block_is_intra & 1 ||
-            mv[0][0] != mv[mv_stride][0] || mv[0][1] != mv[mv_stride][1]) {
-            v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
-        } else {
-            idx = ((bottom_cbp >> 2) | block_cbp) & 3;
-            if (idx == 3) {
-                v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
-            } else if (idx) {
-                if (idx == 1)
-                    v->vc1dsp.vc1_v_loop_filter4(dst + 4, linesize, v->pq);
-                else
-                    v->vc1dsp.vc1_v_loop_filter4(dst,     linesize, v->pq);
+        if (top_is_intra || bottom_is_intra ||
+            mv[0][0] != mv[block_num > 3 ? s->mb_stride : s->b8_stride][0] ||
+            mv[0][1] != mv[block_num > 3 ? s->mb_stride : s->b8_stride][1] ||
+            (v->fcm == ILACE_FIELD && mv_f[0] != mv_f[block_num > 3 ? s->mb_stride : s->b8_stride]))
+            v->vc1dsp.vc1_v_loop_filter8(dst + 8 * linesize, linesize, pq);
+        else {
+            idx = (top_cbp | (bottom_cbp >> 2)) & 3;
+            if (idx & 1)
+                v->vc1dsp.vc1_v_loop_filter4(dst + 8 * linesize + 4, linesize, pq);
+            if (idx & 2)
+                v->vc1dsp.vc1_v_loop_filter4(dst + 8 * linesize, linesize, pq);
+        }
+    }
+
+    tt = ttblk[0] >> (block_num * 4) & 0xf;
+    if (tt == TT_4X4 || tt == TT_8X4) {
+        if (top_cbp & 5)
+            v->vc1dsp.vc1_v_loop_filter4(dst + 4 * linesize + 4, linesize, pq);
+        if (top_cbp & 10)
+            v->vc1dsp.vc1_v_loop_filter4(dst + 4 * linesize, linesize, pq);
+    }
+}
+
+void ff_vc1_p_loop_filter(VC1Context *v)
+{
+    MpegEncContext *s = &v->s;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    uint8_t *dest;
+    uint32_t *cbp;
+    uint8_t *is_intra;
+    int16_t (*uvmv)[2];
+    int *ttblk;
+    uint32_t flags;
+    int i;
+
+    /* Within a MB, the vertical loop filter always runs before the horizontal.
+     * To accomplish that, we run the V loop filter on all applicable
+     * horizontal borders of the MB above the last overlap filtered MB. Then,
+     * we wait for the next loop filter iteration to do H loop filter on all
+     * applicable vertical borders of this MB. Therefore, the loop filter
+     * trails by one row and one column relative to the overlap filter and two
+     * rows and two columns relative to the decoding loop. */
+    if (s->mb_y >= s->start_mb_y + 2) {
+        if (s->mb_x) {
+            dest = s->dest[0] - 32 * s->linesize - 16;
+            cbp = &v->cbp[s->mb_x - 2 * s->mb_stride - 1];
+            is_intra = &v->is_intra[s->mb_x - 2 * s->mb_stride - 1];
+            uvmv = &v->luma_mv[s->mb_x - 2 * s->mb_stride - 1];
+            ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride - 1];
+            flags = s->mb_y == s->start_mb_y + 2 ? TOP_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize - 8 : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] - 4 * s->b8_stride - 2 + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] - 2 * s->mb_stride - 1 + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] - 4 * s->b8_stride - 2 + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            dest = s->dest[0] - 32 * s->linesize;
+            cbp = &v->cbp[s->mb_x - 2 * s->mb_stride];
+            is_intra = &v->is_intra[s->mb_x - 2 * s->mb_stride];
+            uvmv = &v->luma_mv[s->mb_x - 2 * s->mb_stride];
+            ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride];
+            flags = s->mb_y == s->start_mb_y + 2 ? TOP_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] - 4 * s->b8_stride + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] - 2 * s->mb_stride + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] - 4 * s->b8_stride + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        if (s->mb_x) {
+            if (s->mb_y >= s->start_mb_y + 1) {
+                dest = s->dest[0] - 16 * s->linesize - 16;
+                cbp = &v->cbp[s->mb_x - s->mb_stride - 1];
+                is_intra = &v->is_intra[s->mb_x - s->mb_stride - 1];
+                uvmv = &v->luma_mv[s->mb_x - s->mb_stride - 1];
+                ttblk = &v->ttblk[s->mb_x - s->mb_stride - 1];
+                flags = s->mb_y == s->start_mb_y + 1 ? TOP_EDGE : 0;
+                for (i = 0; i < block_count; i++)
+                    vc1_p_v_loop_filter(v,
+                                        i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest,
+                                        cbp,
+                                        is_intra,
+                                        i > 3 ? uvmv :
+                                                &s->current_picture.motion_val[0][s->block_index[i] - 2 * s->b8_stride - 2 + v->blocks_off],
+                                        i > 3 ? &v->mv_f[0][s->block_index[i] - s->mb_stride - 1 + v->mb_off] :
+                                                &v->mv_f[0][s->block_index[i] - 2 * s->b8_stride - 2 + v->blocks_off],
+                                        ttblk,
+                                        flags,
+                                        i);
             }
+            dest = s->dest[0] - 16;
+            cbp = &v->cbp[s->mb_x - 1];
+            is_intra = &v->is_intra[s->mb_x - 1];
+            uvmv = &v->luma_mv[s->mb_x - 1];
+            ttblk = &v->ttblk[s->mb_x - 1];
+            flags = s->mb_y == s->start_mb_y ? TOP_EDGE | BOTTOM_EDGE : BOTTOM_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] - 8 : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] - 2 + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] - 1 + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] - 2 + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            if (s->mb_y >= s->start_mb_y + 1) {
+                dest = s->dest[0] - 16 * s->linesize;
+                cbp = &v->cbp[s->mb_x - s->mb_stride];
+                is_intra = &v->is_intra[s->mb_x - s->mb_stride];
+                uvmv = &v->luma_mv[s->mb_x - s->mb_stride];
+                ttblk = &v->ttblk[s->mb_x - s->mb_stride];
+                flags = s->mb_y == s->start_mb_y + 1 ? TOP_EDGE : 0;
+                for (i = 0; i < block_count; i++)
+                    vc1_p_v_loop_filter(v,
+                                        i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest,
+                                        cbp,
+                                        is_intra,
+                                        i > 3 ? uvmv :
+                                                &s->current_picture.motion_val[0][s->block_index[i] - 2 * s->b8_stride + v->blocks_off],
+                                        i > 3 ? &v->mv_f[0][s->block_index[i] - s->mb_stride + v->mb_off] :
+                                                &v->mv_f[0][s->block_index[i] - 2 * s->b8_stride + v->blocks_off],
+                                        ttblk,
+                                        flags,
+                                        i);
+            }
+            dest = s->dest[0];
+            cbp = &v->cbp[s->mb_x];
+            is_intra = &v->is_intra[s->mb_x];
+            uvmv = &v->luma_mv[s->mb_x];
+            ttblk = &v->ttblk[s->mb_x];
+            flags = s->mb_y == s->start_mb_y ? TOP_EDGE | BOTTOM_EDGE : BOTTOM_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
         }
     }
 
-    dst -= 4 * linesize;
-    ttblk = (v->ttblk[s->mb_x - s->mb_stride] >> (block_num * 4)) & 0xF;
-    if (ttblk == TT_4X4 || ttblk == TT_8X4) {
-        idx = (block_cbp | (block_cbp >> 2)) & 3;
-        if (idx == 3) {
-            v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
-        } else if (idx) {
-            if (idx == 1)
-                v->vc1dsp.vc1_v_loop_filter4(dst + 4, linesize, v->pq);
-            else
-                v->vc1dsp.vc1_v_loop_filter4(dst,     linesize, v->pq);
+    if (s->mb_y >= s->start_mb_y + 2) {
+        if (s->mb_x >= 2) {
+            dest = s->dest[0] - 32 * s->linesize - 32;
+            cbp = &v->cbp[s->mb_x - 2 * s->mb_stride - 2];
+            is_intra = &v->is_intra[s->mb_x - 2 * s->mb_stride - 2];
+            uvmv = &v->luma_mv[s->mb_x - 2 * s->mb_stride - 2];
+            ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride - 2];
+            flags = s->mb_x == 2 ? LEFT_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize - 16 : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] - 4 * s->b8_stride - 4 + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] - 2 * s->mb_stride - 2 + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] - 4 * s->b8_stride - 4 + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            if (s->mb_x >= 1) {
+                dest = s->dest[0] - 32 * s->linesize - 16;
+                cbp = &v->cbp[s->mb_x - 2 * s->mb_stride - 1];
+                is_intra = &v->is_intra[s->mb_x - 2 * s->mb_stride - 1];
+                uvmv = &v->luma_mv[s->mb_x - 2 * s->mb_stride - 1];
+                ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride - 1];
+                flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                for (i = 0; i < block_count; i++)
+                        vc1_p_h_loop_filter(v,
+                                            i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize - 8 : dest,
+                                            cbp,
+                                            is_intra,
+                                            i > 3 ? uvmv :
+                                                    &s->current_picture.motion_val[0][s->block_index[i] - 4 * s->b8_stride - 2 + v->blocks_off],
+                                            i > 3 ? &v->mv_f[0][s->block_index[i] - 2 * s->mb_stride - 1 + v->mb_off] :
+                                                    &v->mv_f[0][s->block_index[i] - 4 * s->b8_stride - 2 + v->blocks_off],
+                                            ttblk,
+                                            flags,
+                                            i);
+            }
+            dest = s->dest[0] - 32 * s->linesize;
+            cbp = &v->cbp[s->mb_x - 2 * s->mb_stride];
+            is_intra = &v->is_intra[s->mb_x - 2 * s->mb_stride];
+            uvmv = &v->luma_mv[s->mb_x - 2 * s->mb_stride];
+            ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride];
+            flags = s->mb_x ? RIGHT_EDGE : LEFT_EDGE | RIGHT_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] - 4 * s->b8_stride + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] - 2 * s->mb_stride + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] - 4 * s->b8_stride + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        if (s->mb_y >= s->start_mb_y + 1) {
+            if (s->mb_x >= 2) {
+                dest = s->dest[0] - 16 * s->linesize - 32;
+                cbp = &v->cbp[s->mb_x - s->mb_stride - 2];
+                is_intra = &v->is_intra[s->mb_x - s->mb_stride - 2];
+                uvmv = &v->luma_mv[s->mb_x - s->mb_stride - 2];
+                ttblk = &v->ttblk[s->mb_x - s->mb_stride - 2];
+                flags = s->mb_x == 2 ? LEFT_EDGE : 0;
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_loop_filter(v,
+                                        i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 16 : dest,
+                                        cbp,
+                                        is_intra,
+                                        i > 3 ? uvmv :
+                                                &s->current_picture.motion_val[0][s->block_index[i] - 2 * s->b8_stride - 4 + v->blocks_off],
+                                        i > 3 ? &v->mv_f[0][s->block_index[i] - s->mb_stride - 2 + v->mb_off] :
+                                                &v->mv_f[0][s->block_index[i] - 2 * s->b8_stride - 4 + v->blocks_off],
+                                        ttblk,
+                                        flags,
+                                        i);
+            }
+            if (s->mb_x == s->mb_width - 1) {
+                if (s->mb_x >= 1) {
+                    dest = s->dest[0] - 16 * s->linesize - 16;
+                    cbp = &v->cbp[s->mb_x - s->mb_stride - 1];
+                    is_intra = &v->is_intra[s->mb_x - s->mb_stride - 1];
+                    uvmv = &v->luma_mv[s->mb_x - s->mb_stride - 1];
+                    ttblk = &v->ttblk[s->mb_x - s->mb_stride - 1];
+                    flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                    for (i = 0; i < block_count; i++)
+                            vc1_p_h_loop_filter(v,
+                                                i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest,
+                                                cbp,
+                                                is_intra,
+                                                i > 3 ? uvmv :
+                                                        &s->current_picture.motion_val[0][s->block_index[i] - 2 * s->b8_stride - 2 + v->blocks_off],
+                                                i > 3 ? &v->mv_f[0][s->block_index[i] - s->mb_stride - 1 + v->mb_off] :
+                                                        &v->mv_f[0][s->block_index[i] - 2 * s->b8_stride - 2 + v->blocks_off],
+                                                ttblk,
+                                                flags,
+                                                i);
+                }
+                dest = s->dest[0] - 16 * s->linesize;
+                cbp = &v->cbp[s->mb_x - s->mb_stride];
+                is_intra = &v->is_intra[s->mb_x - s->mb_stride];
+                uvmv = &v->luma_mv[s->mb_x - s->mb_stride];
+                ttblk = &v->ttblk[s->mb_x - s->mb_stride];
+                flags = s->mb_x ? RIGHT_EDGE : LEFT_EDGE | RIGHT_EDGE;
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_loop_filter(v,
+                                        i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest,
+                                        cbp,
+                                        is_intra,
+                                        i > 3 ? uvmv :
+                                                &s->current_picture.motion_val[0][s->block_index[i] - 2 * s->b8_stride + v->blocks_off],
+                                        i > 3 ? &v->mv_f[0][s->block_index[i] - s->mb_stride + v->mb_off] :
+                                                &v->mv_f[0][s->block_index[i] - 2 * s->b8_stride + v->blocks_off],
+                                        ttblk,
+                                        flags,
+                                        i);
+            }
+        }
+        if (s->mb_x >= 2) {
+            dest = s->dest[0] - 32;
+            cbp = &v->cbp[s->mb_x - 2];
+            is_intra = &v->is_intra[s->mb_x - 2];
+            uvmv = &v->luma_mv[s->mb_x - 2];
+            ttblk = &v->ttblk[s->mb_x - 2];
+            flags = s->mb_x == 2 ? LEFT_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] - 16 : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] - 4 + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] - 2 + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] - 4 + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            if (s->mb_x >= 1) {
+                dest = s->dest[0] - 16;
+                cbp = &v->cbp[s->mb_x - 1];
+                is_intra = &v->is_intra[s->mb_x - 1];
+                uvmv = &v->luma_mv[s->mb_x - 1];
+                ttblk = &v->ttblk[s->mb_x - 1];
+                flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_loop_filter(v,
+                                        i > 3 ? s->dest[i - 3] - 8 : dest,
+                                        cbp,
+                                        is_intra,
+                                        i > 3 ? uvmv :
+                                                &s->current_picture.motion_val[0][s->block_index[i] - 2 + v->blocks_off],
+                                        i > 3 ? &v->mv_f[0][s->block_index[i] - 1 + v->mb_off] :
+                                                &v->mv_f[0][s->block_index[i] - 2 + v->blocks_off],
+                                        ttblk,
+                                        flags,
+                                        i);
+            }
+            dest = s->dest[0];
+            cbp = &v->cbp[s->mb_x];
+            is_intra = &v->is_intra[s->mb_x];
+            uvmv = &v->luma_mv[s->mb_x];
+            ttblk = &v->ttblk[s->mb_x];
+            flags = s->mb_x ? RIGHT_EDGE : LEFT_EDGE | RIGHT_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_loop_filter(v,
+                                    i > 3 ? s->dest[i - 3] : dest,
+                                    cbp,
+                                    is_intra,
+                                    i > 3 ? uvmv :
+                                            &s->current_picture.motion_val[0][s->block_index[i] + v->blocks_off],
+                                    i > 3 ? &v->mv_f[0][s->block_index[i] + v->mb_off] :
+                                            &v->mv_f[0][s->block_index[i] + v->blocks_off],
+                                    ttblk,
+                                    flags,
+                                    i);
         }
     }
 }
 
-static av_always_inline void vc1_apply_p_h_loop_filter(VC1Context *v, int block_num)
+static av_always_inline void vc1_p_h_intfr_loop_filter(VC1Context *v, uint8_t *dest, int *ttblk,
+                                                       uint32_t flags, uint8_t fieldtx, int block_num)
 {
     MpegEncContext *s  = &v->s;
-    int mb_cbp         = v->cbp[s->mb_x - 1 - s->mb_stride],
-        block_cbp      = mb_cbp      >> (block_num * 4), right_cbp,
-        mb_is_intra    = v->is_intra[s->mb_x - 1 - s->mb_stride],
-        block_is_intra = mb_is_intra >> block_num, right_is_intra;
-    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize, ttblk;
+    int pq = v->pq;
+    int tt;
+    int linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
     uint8_t *dst;
 
-    if (block_num > 3) {
-        dst = s->dest[block_num - 3] - 8 * linesize;
+    if (block_num > 3)
+        dst = dest;
+    else
+        dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
+
+    tt = ttblk[0] >> (block_num * 4) & 0xf;
+    if (block_num < 4) {
+        if (fieldtx) {
+            if (block_num < 2) {
+                if (tt == TT_4X4 || tt == TT_4X8)
+                    v->vc1dsp.vc1_h_loop_filter8(dst + 4, 2 * linesize, pq);
+                if (!(flags & RIGHT_EDGE) || block_num == 0)
+                    v->vc1dsp.vc1_h_loop_filter8(dst + 8, 2 * linesize, pq);
+            } else {
+                if (tt == TT_4X4 || tt == TT_4X8)
+                    v->vc1dsp.vc1_h_loop_filter8(dst - 7 * linesize + 4, 2 * linesize, pq);
+                if (!(flags & RIGHT_EDGE) || block_num == 2)
+                    v->vc1dsp.vc1_h_loop_filter8(dst - 7 * linesize + 8, 2 * linesize, pq);
+            }
+        } else {
+            if(tt == TT_4X4 || tt == TT_4X8) {
+                v->vc1dsp.vc1_h_loop_filter4(dst + 4, 2 * linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter4(dst + linesize + 4, 2 * linesize, pq);
+            }
+            if (!(flags & RIGHT_EDGE) || !(block_num & 5)) {
+                v->vc1dsp.vc1_h_loop_filter4(dst + 8, 2 * linesize, pq);
+                v->vc1dsp.vc1_h_loop_filter4(dst + linesize + 8, 2 * linesize, pq);
+            }
+        }
     } else {
-        dst = s->dest[0] + (block_num & 1) * 8 + ((block_num & 2) * 4 - 16) * linesize - 8;
+        if (tt == TT_4X4 || tt == TT_4X8) {
+            v->vc1dsp.vc1_h_loop_filter4(dst + 4, 2 * linesize, pq);
+            v->vc1dsp.vc1_h_loop_filter4(dst + linesize + 4, 2 * linesize, pq);
+        }
+        if (!(flags & RIGHT_EDGE)) {
+            v->vc1dsp.vc1_h_loop_filter4(dst + 8, 2 * linesize, pq);
+            v->vc1dsp.vc1_h_loop_filter4(dst + linesize + 8, 2 * linesize, pq);
+        }
     }
+}
+
+static av_always_inline void vc1_p_v_intfr_loop_filter(VC1Context *v, uint8_t *dest, int *ttblk,
+                                                       uint32_t flags, uint8_t fieldtx, int block_num)
+{
+    MpegEncContext *s  = &v->s;
+    int pq = v->pq;
+    int tt;
+    int linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
+    uint8_t *dst;
 
-    if (s->mb_x != s->mb_width || !(block_num & 5)) {
-        int16_t (*mv)[2];
+    if (block_num > 3)
+        dst = dest;
+    else
+        dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
 
-        if (block_num > 3) {
-            right_cbp      = v->cbp[s->mb_x - s->mb_stride] >> (block_num * 4);
-            right_is_intra = v->is_intra[s->mb_x - s->mb_stride] >> block_num;
-            mv             = &v->luma_mv[s->mb_x - s->mb_stride - 1];
-        } else {
-            right_cbp      = (block_num & 1) ? (v->cbp[s->mb_x - s->mb_stride]      >> ((block_num - 1) * 4))
-                                             : (mb_cbp                              >> ((block_num + 1) * 4));
-            right_is_intra = (block_num & 1) ? (v->is_intra[s->mb_x - s->mb_stride] >> (block_num - 1))
-                                             : (mb_is_intra                         >> (block_num + 1));
-            mv             = &s->current_picture.motion_val[0][s->block_index[block_num] - s->b8_stride * 2 - 2];
-        }
-        if (block_is_intra & 1 || right_is_intra & 1 || mv[0][0] != mv[1][0] || mv[0][1] != mv[1][1]) {
-            v->vc1dsp.vc1_h_loop_filter8(dst, linesize, v->pq);
+    tt = ttblk[0] >> (block_num * 4) & 0xf;
+    if (block_num < 4) {
+        if (fieldtx) {
+            if (block_num < 2) {
+                if (tt == TT_4X4 || tt == TT_8X4)
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 8 * linesize, 2 * linesize, pq);
+                if (!(flags & BOTTOM_EDGE))
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 16 * linesize, 2 * linesize, pq);
+            } else {
+                if (tt == TT_4X4 || tt == TT_8X4)
+                    v->vc1dsp.vc1_v_loop_filter8(dst + linesize, 2 * linesize, pq);
+                if (!(flags & BOTTOM_EDGE))
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 9 * linesize, 2 * linesize, pq);
+            }
         } else {
-            idx = ((right_cbp >> 1) | block_cbp) & 5; // FIXME check
-            if (idx == 5) {
-                v->vc1dsp.vc1_h_loop_filter8(dst, linesize, v->pq);
-            } else if (idx) {
-                if (idx == 1)
-                    v->vc1dsp.vc1_h_loop_filter4(dst + 4 * linesize, linesize, v->pq);
-                else
-                    v->vc1dsp.vc1_h_loop_filter4(dst,                linesize, v->pq);
+            if (block_num < 2) {
+                if (!(flags & TOP_EDGE) && (tt == TT_4X4 || tt == TT_8X4)) {
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 4 * linesize, 2 * linesize, pq);
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 5 * linesize, 2 * linesize, pq);
+                }
+                v->vc1dsp.vc1_v_loop_filter8(dst + 8 * linesize, 2 * linesize, pq);
+                v->vc1dsp.vc1_v_loop_filter8(dst + 9 * linesize, 2 * linesize, pq);
+            } else if (!(flags & BOTTOM_EDGE)) {
+                if (tt == TT_4X4 || tt == TT_8X4) {
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 4 * linesize, 2 * linesize, pq);
+                    v->vc1dsp.vc1_v_loop_filter8(dst + 5 * linesize, 2 * linesize, pq);
+                }
+                v->vc1dsp.vc1_v_loop_filter8(dst + 8 * linesize, 2 * linesize, pq);
+                v->vc1dsp.vc1_v_loop_filter8(dst + 9 * linesize, 2 * linesize, pq);
             }
         }
+    } else {
+        if (!(flags & BOTTOM_EDGE)) {
+            if (!(flags & TOP_EDGE) && (tt == TT_4X4 || tt == TT_8X4)) {
+                v->vc1dsp.vc1_v_loop_filter8(dst + 4 * linesize, 2 * linesize, pq);
+                v->vc1dsp.vc1_v_loop_filter8(dst + 5 * linesize, 2 * linesize, pq);
+            }
+                v->vc1dsp.vc1_v_loop_filter8(dst + 8 * linesize, 2 * linesize, pq);
+                v->vc1dsp.vc1_v_loop_filter8(dst + 9 * linesize, 2 * linesize, pq);
+        }
     }
+}
 
-    dst -= 4;
-    ttblk = (v->ttblk[s->mb_x - s->mb_stride - 1] >> (block_num * 4)) & 0xf;
-    if (ttblk == TT_4X4 || ttblk == TT_4X8) {
-        idx = (block_cbp | (block_cbp >> 1)) & 5;
-        if (idx == 5) {
-            v->vc1dsp.vc1_h_loop_filter8(dst, linesize, v->pq);
-        } else if (idx) {
-            if (idx == 1)
-                v->vc1dsp.vc1_h_loop_filter4(dst + linesize * 4, linesize, v->pq);
-            else
-                v->vc1dsp.vc1_h_loop_filter4(dst,                linesize, v->pq);
+void ff_vc1_p_intfr_loop_filter(VC1Context *v)
+{
+    MpegEncContext *s = &v->s;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    uint8_t *dest;
+    int *ttblk;
+    uint32_t flags;
+    uint8_t fieldtx;
+    int i;
+
+    /* Within a MB, the vertical loop filter always runs before the horizontal.
+     * To accomplish that, we run the V loop filter on all applicable
+     * horizontal borders of the MB above the last overlap filtered MB. Then,
+     * we wait for the loop filter iteration on the next row and next column to
+     * do H loop filter on all applicable vertical borders of this MB.
+     * Therefore, the loop filter trails by two rows and one column relative to
+     * the overlap filter and two rows and two columns relative to the decoding
+     * loop. */
+    if (s->mb_x) {
+        if (s->mb_y >= s->start_mb_y + 1) {
+            dest = s->dest[0] - 16 * s->linesize - 16;
+            ttblk = &v->ttblk[s->mb_x - s->mb_stride - 1];
+            flags = s->mb_y == s->start_mb_y + 1 ? TOP_EDGE : 0;
+            fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride - 1];
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
         }
     }
+    if (s->mb_x == s->mb_width - 1) {
+        if (s->mb_y >= s->start_mb_y + 1) {
+            dest = s->dest[0] - 16 * s->linesize;
+            ttblk = &v->ttblk[s->mb_x - s->mb_stride];
+            flags = s->mb_y == s->start_mb_y + 1 ? TOP_EDGE : 0;
+            fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride];
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        if (s->mb_x) {
+            dest = s->dest[0] - 16;
+            ttblk = &v->ttblk[s->mb_x - 1];
+            flags = s->mb_y == s->start_mb_y ? TOP_EDGE | BOTTOM_EDGE : BOTTOM_EDGE;
+            fieldtx = v->fieldtx_plane[mb_pos - 1];
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] - 8 : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            dest = s->dest[0];
+            ttblk = &v->ttblk[s->mb_x];
+            flags = s->mb_y == s->start_mb_y ? TOP_EDGE | BOTTOM_EDGE : BOTTOM_EDGE;
+            fieldtx = v->fieldtx_plane[mb_pos];
+            for (i = 0; i < block_count; i++)
+                vc1_p_v_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+    }
+
+    if (s->mb_y >= s->start_mb_y + 2) {
+        if (s->mb_x >= 2) {
+            dest = s->dest[0] - 32 * s->linesize - 32;
+            ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride - 2];
+            flags = s->mb_x == 2 ? LEFT_EDGE : 0;
+            fieldtx = v->fieldtx_plane[mb_pos - 2 * s->mb_stride - 2];
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize - 16 : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            if (s->mb_x >= 1) {
+                dest = s->dest[0] - 32 * s->linesize - 16;
+                ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride - 1];
+                flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                fieldtx = v->fieldtx_plane[mb_pos - 2 * s->mb_stride - 1];
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_intfr_loop_filter(v,
+                                              i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize - 8 : dest,
+                                              ttblk,
+                                              flags,
+                                              fieldtx,
+                                              i);
+            }
+            dest = s->dest[0] - 32 * s->linesize;
+            ttblk = &v->ttblk[s->mb_x - 2 * s->mb_stride];
+            flags = s->mb_x ? RIGHT_EDGE : LEFT_EDGE | RIGHT_EDGE;
+            fieldtx = v->fieldtx_plane[mb_pos - 2 * s->mb_stride];
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] - 16 * s->uvlinesize : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        if (s->mb_y >= s->start_mb_y + 1) {
+            if (s->mb_x >= 2) {
+                dest = s->dest[0] - 16 * s->linesize - 32;
+                ttblk = &v->ttblk[s->mb_x - s->mb_stride - 2];
+                flags = s->mb_x == 2 ? LEFT_EDGE : 0;
+                fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride - 2];
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_intfr_loop_filter(v,
+                                              i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 16 : dest,
+                                              ttblk,
+                                              flags,
+                                              fieldtx,
+                                              i);
+            }
+            if (s->mb_x == s->mb_width - 1) {
+                if (s->mb_x >= 1) {
+                    dest = s->dest[0] - 16 * s->linesize - 16;
+                    ttblk = &v->ttblk[s->mb_x - s->mb_stride - 1];
+                    flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                    fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride - 1];
+                    for (i = 0; i < block_count; i++)
+                        vc1_p_h_intfr_loop_filter(v,
+                                                  i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest,
+                                                  ttblk,
+                                                  flags,
+                                                  fieldtx,
+                                                  i);
+                }
+                dest = s->dest[0] - 16 * s->linesize;
+                ttblk = &v->ttblk[s->mb_x - s->mb_stride];
+                flags = s->mb_x ? RIGHT_EDGE : LEFT_EDGE | RIGHT_EDGE;
+                fieldtx = v->fieldtx_plane[mb_pos - s->mb_stride];
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_intfr_loop_filter(v,
+                                              i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest,
+                                              ttblk,
+                                              flags,
+                                              fieldtx,
+                                              i);
+            }
+        }
+        if (s->mb_x >= 2) {
+            dest = s->dest[0] - 32;
+            ttblk = &v->ttblk[s->mb_x - 2];
+            flags = s->mb_x == 2 ? LEFT_EDGE : 0;
+            fieldtx = v->fieldtx_plane[mb_pos - 2];
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] - 16 : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            if (s->mb_x >= 1) {
+                dest = s->dest[0] - 16;
+                ttblk = &v->ttblk[s->mb_x - 1];
+                flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+                fieldtx = v->fieldtx_plane[mb_pos - 1];
+                for (i = 0; i < block_count; i++)
+                    vc1_p_h_intfr_loop_filter(v,
+                                              i > 3 ? s->dest[i - 3] - 8 : dest,
+                                              ttblk,
+                                              flags,
+                                              fieldtx,
+                                              i);
+            }
+            dest = s->dest[0];
+            ttblk = &v->ttblk[s->mb_x];
+            flags = s->mb_x ? RIGHT_EDGE : LEFT_EDGE | RIGHT_EDGE;
+            fieldtx = v->fieldtx_plane[mb_pos];
+            for (i = 0; i < block_count; i++)
+                vc1_p_h_intfr_loop_filter(v,
+                                          i > 3 ? s->dest[i - 3] : dest,
+                                          ttblk,
+                                          flags,
+                                          fieldtx,
+                                          i);
+        }
+    }
+}
+
+static av_always_inline void vc1_b_h_intfi_loop_filter(VC1Context *v, uint8_t *dest, uint32_t *cbp,
+                                                       int *ttblk, uint32_t flags, int block_num)
+{
+    MpegEncContext *s  = &v->s;
+    int pq = v->pq;
+    uint8_t *dst;
+    uint32_t block_cbp = cbp[0] >> (block_num * 4);
+    int tt;
+    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
+
+    if (block_num > 3)
+        dst = dest;
+    else
+        dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
+
+    if (!(flags & RIGHT_EDGE) || !(block_num & 5)) {
+        if (block_num > 3)
+            v->vc1dsp.vc1_h_loop_filter8(dst + 8, linesize, pq);
+        else
+            v->vc1dsp.vc1_h_loop_filter8(dst + 8, linesize, pq);
+    }
+
+    tt = ttblk[0] >> (block_num * 4) & 0xf;
+    if (tt == TT_4X4 || tt == TT_4X8) {
+        idx = (block_cbp | (block_cbp >> 1)) & 5;
+        if (idx & 1)
+            v->vc1dsp.vc1_h_loop_filter4(dst + 4 * linesize + 4, linesize, pq);
+        if (idx & 4)
+            v->vc1dsp.vc1_h_loop_filter4(dst + 4, linesize, pq);
+    }
+}
+
+static av_always_inline void vc1_b_v_intfi_loop_filter(VC1Context *v, uint8_t *dest, uint32_t *cbp,
+                                                       int *ttblk, uint32_t flags, int block_num)
+{
+    MpegEncContext *s  = &v->s;
+    int pq = v->pq;
+    uint8_t *dst;
+    uint32_t block_cbp = cbp[0] >> (block_num * 4);
+    int tt;
+    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
+
+    if (block_num > 3)
+        dst = dest;
+    else
+        dst = dest + (block_num & 2) * 4 * s->linesize + (block_num & 1) * 8;
+
+    if(!(flags & BOTTOM_EDGE) || block_num < 2)
+        v->vc1dsp.vc1_v_loop_filter8(dst + 8 * linesize, linesize, pq);
+
+    tt = ttblk[0] >> (block_num * 4) & 0xf;
+    if (tt == TT_4X4 || tt == TT_8X4) {
+        idx = (block_cbp | (block_cbp >> 2)) & 3;
+        if (idx & 1)
+            v->vc1dsp.vc1_v_loop_filter4(dst + 4 * linesize + 4, linesize, pq);
+        if (idx & 2)
+            v->vc1dsp.vc1_v_loop_filter4(dst + 4 * linesize, linesize, pq);
+    }
 }
 
-void ff_vc1_apply_p_loop_filter(VC1Context *v)
+void ff_vc1_b_intfi_loop_filter(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
+    uint8_t *dest;
+    uint32_t *cbp;
+    int *ttblk;
+    uint32_t flags = 0;
     int i;
 
-    for (i = 0; i < 6; i++) {
-        vc1_apply_p_v_loop_filter(v, i);
+    /* Within a MB, the vertical loop filter always runs before the horizontal.
+     * To accomplish that, we run the V loop filter on all applicable
+     * horizontal borders of the MB above the currently decoded MB. Then,
+     * we wait for the next loop filter iteration to do H loop filter on all
+     * applicable vertical borders of this MB. Therefore, the loop filter
+     * trails by one row and one column relative to the decoding loop. */
+    if (!s->first_slice_line) {
+        dest = s->dest[0] - 16 * s->linesize;
+        cbp = &v->cbp[s->mb_x - s->mb_stride];
+        ttblk = &v->ttblk[s->mb_x - s->mb_stride];
+        flags = s->mb_y == s->start_mb_y + 1 ? TOP_EDGE : 0;
+        for (i = 0; i < block_count; i++)
+            vc1_b_v_intfi_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest, cbp, ttblk, flags, i);
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        dest = s->dest[0];
+        cbp = &v->cbp[s->mb_x];
+        ttblk = &v->ttblk[s->mb_x];
+        flags = s->first_slice_line ? TOP_EDGE | BOTTOM_EDGE : BOTTOM_EDGE;
+        for (i = 0; i < block_count; i++)
+            vc1_b_v_intfi_loop_filter(v, i > 3 ? s->dest[i - 3] : dest, cbp, ttblk, flags, i);
     }
 
-    /* V always precedes H, therefore we run H one MB before V;
-     * at the end of a row, we catch up to complete the row */
-    if (s->mb_x) {
-        for (i = 0; i < 6; i++) {
-            vc1_apply_p_h_loop_filter(v, i);
+    if (!s->first_slice_line) {
+        dest = s->dest[0] - 16 * s->linesize - 16;
+        cbp = &v->cbp[s->mb_x - s->mb_stride - 1];
+        ttblk = &v->ttblk[s->mb_x - s->mb_stride - 1];
+        if (s->mb_x) {
+            flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_b_h_intfi_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize - 8 : dest, cbp, ttblk, flags, i);
         }
         if (s->mb_x == s->mb_width - 1) {
-            s->mb_x++;
-            ff_update_block_index(s);
-            for (i = 0; i < 6; i++) {
-                vc1_apply_p_h_loop_filter(v, i);
-            }
+            dest += 16;
+            cbp++;
+            ttblk++;
+            flags = s->mb_x == 0 ? LEFT_EDGE | RIGHT_EDGE : RIGHT_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_b_h_intfi_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 * s->uvlinesize : dest, cbp, ttblk, flags, i);
+        }
+    }
+    if (s->mb_y == s->end_mb_y - 1) {
+        dest = s->dest[0] - 16;
+        cbp = &v->cbp[s->mb_x - 1];
+        ttblk = &v->ttblk[s->mb_x - 1];
+        if (s->mb_x) {
+            flags = s->mb_x == 1 ? LEFT_EDGE : 0;
+            for (i = 0; i < block_count; i++)
+                vc1_b_h_intfi_loop_filter(v, i > 3 ? s->dest[i - 3] - 8 : dest, cbp, ttblk, flags, i);
+        }
+        if (s->mb_x == s->mb_width - 1) {
+            dest += 16;
+            cbp++;
+            ttblk++;
+            flags = s->mb_x == 0 ? LEFT_EDGE | RIGHT_EDGE : RIGHT_EDGE;
+            for (i = 0; i < block_count; i++)
+                vc1_b_h_intfi_loop_filter(v, i > 3 ? s->dest[i - 3] : dest, cbp, ttblk, flags, i);
         }
     }
 }
diff --git a/libavcodec/vc1_mc.c b/libavcodec/vc1_mc.c
index 18ac47a..1b8d879 100644
--- a/libavcodec/vc1_mc.c
+++ b/libavcodec/vc1_mc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,140 @@
 #include "mpegvideo.h"
 #include "vc1.h"
 
+static av_always_inline void vc1_scale_luma(uint8_t *srcY,
+                                            int k, int linesize)
+{
+    int i, j;
+    for (j = 0; j < k; j++) {
+        for (i = 0; i < k; i++)
+            srcY[i] = ((srcY[i] - 128) >> 1) + 128;
+        srcY += linesize;
+    }
+}
+
+static av_always_inline void vc1_scale_chroma(uint8_t *srcU, uint8_t *srcV,
+                                              int k, int uvlinesize)
+{
+    int i, j;
+    for (j = 0; j < k; j++) {
+        for (i = 0; i < k; i++) {
+            srcU[i] = ((srcU[i] - 128) >> 1) + 128;
+            srcV[i] = ((srcV[i] - 128) >> 1) + 128;
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+    }
+}
+
+static av_always_inline void vc1_lut_scale_luma(uint8_t *srcY,
+                                                uint8_t *lut1, uint8_t *lut2,
+                                                int k, int linesize)
+{
+    int i, j;
+
+    for (j = 0; j < k; j += 2) {
+        for (i = 0; i < k; i++)
+            srcY[i] = lut1[srcY[i]];
+        srcY += linesize;
+
+        if (j + 1 == k)
+            break;
+
+        for (i = 0; i < k; i++)
+            srcY[i] = lut2[srcY[i]];
+        srcY += linesize;
+    }
+}
+
+static av_always_inline void vc1_lut_scale_chroma(uint8_t *srcU, uint8_t *srcV,
+                                                  uint8_t *lut1, uint8_t *lut2,
+                                                  int k, int uvlinesize)
+{
+    int i, j;
+
+    for (j = 0; j < k; j += 2) {
+        for (i = 0; i < k; i++) {
+            srcU[i] = lut1[srcU[i]];
+            srcV[i] = lut1[srcV[i]];
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+
+        if (j + 1 == k)
+            break;
+
+        for (i = 0; i < k; i++) {
+            srcU[i] = lut2[srcU[i]];
+            srcV[i] = lut2[srcV[i]];
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+    }
+}
+
+static const uint8_t popcount4[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+
+static av_always_inline int get_luma_mv(VC1Context *v, int dir, int16_t *tx, int16_t *ty)
+{
+    MpegEncContext *s = &v->s;
+    int idx = v->mv_f[dir][s->block_index[0] + v->blocks_off] |
+             (v->mv_f[dir][s->block_index[1] + v->blocks_off] << 1) |
+             (v->mv_f[dir][s->block_index[2] + v->blocks_off] << 2) |
+             (v->mv_f[dir][s->block_index[3] + v->blocks_off] << 3);
+    static const uint8_t index2[16] = { 0, 0, 0, 0x23, 0, 0x13, 0x03, 0, 0, 0x12, 0x02, 0, 0x01, 0, 0, 0 };
+    int opp_count = popcount4[idx];
+
+    switch (opp_count) {
+    case 0:
+    case 4:
+        *tx = median4(s->mv[dir][0][0], s->mv[dir][1][0], s->mv[dir][2][0], s->mv[dir][3][0]);
+        *ty = median4(s->mv[dir][0][1], s->mv[dir][1][1], s->mv[dir][2][1], s->mv[dir][3][1]);
+        break;
+    case 1:
+        *tx = mid_pred(s->mv[dir][idx < 2][0], s->mv[dir][1 + (idx < 4)][0], s->mv[dir][2 + (idx < 8)][0]);
+        *ty = mid_pred(s->mv[dir][idx < 2][1], s->mv[dir][1 + (idx < 4)][1], s->mv[dir][2 + (idx < 8)][1]);
+        break;
+    case 3:
+        *tx = mid_pred(s->mv[dir][idx > 0xd][0], s->mv[dir][1 + (idx > 0xb)][0], s->mv[dir][2 + (idx > 0x7)][0]);
+        *ty = mid_pred(s->mv[dir][idx > 0xd][1], s->mv[dir][1 + (idx > 0xb)][1], s->mv[dir][2 + (idx > 0x7)][1]);
+        break;
+    case 2:
+        *tx = (s->mv[dir][index2[idx] >> 4][0] + s->mv[dir][index2[idx] & 0xf][0]) / 2;
+        *ty = (s->mv[dir][index2[idx] >> 4][1] + s->mv[dir][index2[idx] & 0xf][1]) / 2;
+        break;
+    }
+    return opp_count;
+}
+
+static av_always_inline int get_chroma_mv(VC1Context *v, int dir, int16_t *tx, int16_t *ty)
+{
+    MpegEncContext *s = &v->s;
+    int idx = !v->mb_type[0][s->block_index[0]] |
+             (!v->mb_type[0][s->block_index[1]] << 1) |
+             (!v->mb_type[0][s->block_index[2]] << 2) |
+             (!v->mb_type[0][s->block_index[3]] << 3);
+    static const uint8_t index2[16] = { 0, 0, 0, 0x01, 0, 0x02, 0x12, 0, 0, 0x03, 0x13, 0, 0x23, 0, 0, 0 };
+    int valid_count = popcount4[idx];
+
+    switch (valid_count) {
+    case 4:
+        *tx = median4(s->mv[dir][0][0], s->mv[dir][1][0], s->mv[dir][2][0], s->mv[dir][3][0]);
+        *ty = median4(s->mv[dir][0][1], s->mv[dir][1][1], s->mv[dir][2][1], s->mv[dir][3][1]);
+        break;
+    case 3:
+        *tx = mid_pred(s->mv[dir][idx > 0xd][0], s->mv[dir][1 + (idx > 0xb)][0], s->mv[dir][2 + (idx > 0x7)][0]);
+        *ty = mid_pred(s->mv[dir][idx > 0xd][1], s->mv[dir][1 + (idx > 0xb)][1], s->mv[dir][2 + (idx > 0x7)][1]);
+        break;
+    case 2:
+        *tx = (s->mv[dir][index2[idx] >> 4][0] + s->mv[dir][index2[idx] & 0xf][0]) / 2;
+        *ty = (s->mv[dir][index2[idx] >> 4][1] + s->mv[dir][index2[idx] & 0xf][1]) / 2;
+        break;
+    default:
+        return 0;
+    }
+    return valid_count;
+}
+
 /** Do motion compensation over 1 macroblock
  * Mostly adapted hpel_motion and qpel_motion from mpegvideo.c
  */
@@ -45,12 +179,17 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
     int i;
     uint8_t (*luty)[256], (*lutuv)[256];
     int use_ic;
+    int interlace;
+    int linesize, uvlinesize;
 
     if ((!v->field_mode ||
          (v->ref_field_type[dir] == 1 && v->cur_field_type == 1)) &&
         !v->s.last_picture.f->data[0])
         return;
 
+    linesize = s->current_picture_ptr->f->linesize[0];
+    uvlinesize = s->current_picture_ptr->f->linesize[1];
+
     mx = s->mv[dir][0][0];
     my = s->mv[dir][0][1];
 
@@ -85,7 +224,8 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             srcV = s->current_picture.f->data[2];
             luty  = v->curr_luty;
             lutuv = v->curr_lutuv;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
+            interlace = 1;
         } else {
             srcY = s->last_picture.f->data[0];
             srcU = s->last_picture.f->data[1];
@@ -93,6 +233,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             luty  = v->last_luty;
             lutuv = v->last_lutuv;
             use_ic = v->last_use_ic;
+            interlace = s->last_picture.f->interlaced_frame;
         }
     } else {
         srcY = s->next_picture.f->data[0];
@@ -101,6 +242,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
         luty  = v->next_luty;
         lutuv = v->next_lutuv;
         use_ic = v->next_use_ic;
+        interlace = s->next_picture.f->interlaced_frame;
     }
 
     if (!srcY || !srcU) {
@@ -120,9 +262,14 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
         uvsrc_y = av_clip(uvsrc_y,  -8, s->mb_height *  8);
     } else {
         src_x   = av_clip(  src_x, -17, s->avctx->coded_width);
-        src_y   = av_clip(  src_y, -18, s->avctx->coded_height + 1);
         uvsrc_x = av_clip(uvsrc_x,  -8, s->avctx->coded_width  >> 1);
-        uvsrc_y = av_clip(uvsrc_y,  -8, s->avctx->coded_height >> 1);
+        if (v->fcm == ILACE_FRAME) {
+            src_y = av_clip(src_y, -18 + (src_y & 1), s->avctx->coded_height + (src_y & 1));
+            uvsrc_y = av_clip(uvsrc_y, -8 + (uvsrc_y & 1), (s->avctx->coded_height >> 1) + (uvsrc_y & 1));
+        } else {
+            src_y = av_clip(src_y, -18, s->avctx->coded_height + 1);
+            uvsrc_y = av_clip(uvsrc_y, -8, s->avctx->coded_height >> 1);
+        }
     }
 
     srcY += src_y   * s->linesize   + src_x;
@@ -130,13 +277,13 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
     srcV += uvsrc_y * s->uvlinesize + uvsrc_x;
 
     if (v->field_mode && v->ref_field_type[dir]) {
-        srcY += s->current_picture_ptr->f->linesize[0];
-        srcU += s->current_picture_ptr->f->linesize[1];
-        srcV += s->current_picture_ptr->f->linesize[2];
+        srcY += linesize;
+        srcU += uvlinesize;
+        srcV += uvlinesize;
     }
 
     /* for grayscale we should not try to read from unknown area */
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY) {
         srcU = s->sc.edge_emu_buffer + 18 * s->linesize;
         srcV = s->sc.edge_emu_buffer + 18 * s->linesize;
     }
@@ -145,81 +292,135 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
         || s->h_edge_pos < 22 || v_edge_pos < 22
         || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx&3) - 16 - s->mspel * 3
         || (unsigned)(src_y - 1)        > v_edge_pos    - (my&3) - 16 - 3) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *ubuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+        const int k = 17 + s->mspel * 2;
 
         srcY -= s->mspel * (1 + s->linesize);
-        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
-                                 s->linesize, s->linesize,
-                                 17 + s->mspel * 2, 17 + s->mspel * 2,
-                                 src_x - s->mspel, src_y - s->mspel,
-                                 s->h_edge_pos, v_edge_pos);
+        if (interlace) {
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcY,
+                                     linesize << 1,
+                                     linesize << 1,
+                                     k,
+                                     v->field_mode ? k : k + 1 >> 1,
+                                     src_x - s->mspel,
+                                     src_y - s->mspel >> !v->field_mode,
+                                     s->h_edge_pos,
+                                     s->v_edge_pos >> 1);
+            if (!v->field_mode)
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + linesize,
+                                         srcY + linesize,
+                                         linesize << 1,
+                                         linesize << 1,
+                                         k,
+                                         k >> 1,
+                                         src_x - s->mspel,
+                                         src_y - s->mspel + 1 >> 1,
+                                         s->h_edge_pos,
+                                         s->v_edge_pos >> 1);
+        } else
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcY,
+                                     linesize,
+                                     linesize,
+                                     k,
+                                     v->field_mode ? (k << 1) - 1 : k,
+                                     src_x - s->mspel,
+                                     v->field_mode ? 2 * (src_y - s->mspel) + v->ref_field_type[dir] :
+                                                     src_y - s->mspel,
+                                     s->h_edge_pos,
+                                     s->v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
-                                 s->uvlinesize, s->uvlinesize,
-                                 8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
-                                 s->uvlinesize, s->uvlinesize,
-                                 8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+        if (interlace) {
+            s->vdsp.emulated_edge_mc(ubuf,
+                                     srcU,
+                                     uvlinesize << 1,
+                                     uvlinesize << 1,
+                                     9,
+                                     v->field_mode ? 9 : 5,
+                                     uvsrc_x,
+                                     uvsrc_y >> !v->field_mode,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 2);
+            s->vdsp.emulated_edge_mc(vbuf,
+                                     srcV,
+                                     uvlinesize << 1,
+                                     uvlinesize << 1,
+                                     9,
+                                     v->field_mode ? 9 : 5,
+                                     uvsrc_x,
+                                     uvsrc_y >> !v->field_mode,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 2);
+            if (!v->field_mode) {
+                s->vdsp.emulated_edge_mc(ubuf + uvlinesize,
+                                         srcU + uvlinesize,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         9,
+                                         4,
+                                         uvsrc_x,
+                                         uvsrc_y + 1 >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+                s->vdsp.emulated_edge_mc(vbuf + uvlinesize,
+                                         srcV + uvlinesize,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         9,
+                                         4,
+                                         uvsrc_x,
+                                         uvsrc_y + 1 >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+            }
+        } else {
+            s->vdsp.emulated_edge_mc(ubuf,
+                                     srcU,
+                                     uvlinesize,
+                                     uvlinesize,
+                                     9,
+                                     v->field_mode ? 17 : 9,
+                                     uvsrc_x,
+                                     v->field_mode ? 2 * uvsrc_y + v->ref_field_type[dir] : uvsrc_y,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf,
+                                     srcV,
+                                     uvlinesize,
+                                     uvlinesize,
+                                     9,
+                                     v->field_mode ? 17 : 9,
+                                     uvsrc_x,
+                                     v->field_mode ? 2 * uvsrc_y + v->ref_field_type[dir] : uvsrc_y,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 1);
+        }
+        srcU = ubuf;
+        srcV = vbuf;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_luma(srcY, k, s->linesize);
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : ((j + src_y - s->mspel) & 1) ;
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : ((j + uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[dir] : ((0 + src_y - s->mspel) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[dir] : ((1 + src_y - s->mspel) & 1)],
+                               k, s->linesize);
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? v->ref_field_type[dir] : ((0 + uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? v->ref_field_type[dir] : ((1 + uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
         srcY += s->mspel * (1 + s->linesize);
     }
 
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0]    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8, srcY + 8, s->linesize, v->rnd);
-        srcY += s->linesize * 8;
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8 * s->linesize    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8 * s->linesize + 8, srcY + 8, s->linesize, v->rnd);
+        v->vc1dsp.put_vc1_mspel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, v->rnd);
     } else { // hpel mc - always used for luma
         dxy = (my & 2) | ((mx & 2) >> 1);
         if (!v->rnd)
@@ -228,7 +429,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             s->hdsp.put_no_rnd_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
     }
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
     /* Chroma MC always uses qpel bilinear */
     uvmx = (uvmx & 3) << 1;
@@ -240,16 +441,9 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
         v->vc1dsp.put_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
         v->vc1dsp.put_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     }
-}
-
-static inline int median4(int a, int b, int c, int d)
-{
-    if (a < b) {
-        if (c < d) return (FFMIN(b, d) + FFMAX(a, c)) / 2;
-        else       return (FFMIN(b, c) + FFMAX(a, d)) / 2;
-    } else {
-        if (c < d) return (FFMIN(a, d) + FFMAX(b, c)) / 2;
-        else       return (FFMIN(a, c) + FFMAX(b, d)) / 2;
+    if (v->field_mode) {
+        v->mv_f[dir][s->block_index[4] + v->mb_off] = v->cur_field_type != v->ref_field_type[dir];
+        v->mv_f[dir][s->block_index[5] + v->mb_off] = v->cur_field_type != v->ref_field_type[dir];
     }
 }
 
@@ -265,12 +459,16 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     uint8_t (*luty)[256];
     int use_ic;
+    int interlace;
+    int linesize;
 
     if ((!v->field_mode ||
          (v->ref_field_type[dir] == 1 && v->cur_field_type == 1)) &&
         !v->s.last_picture.f->data[0])
         return;
 
+    linesize = s->current_picture_ptr->f->linesize[0];
+
     mx = s->mv[dir][n][0];
     my = s->mv[dir][n][1];
 
@@ -278,16 +476,19 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
         if (v->field_mode && (v->cur_field_type != v->ref_field_type[dir]) && v->second_field) {
             srcY = s->current_picture.f->data[0];
             luty = v->curr_luty;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
+            interlace = 1;
         } else {
             srcY = s->last_picture.f->data[0];
             luty = v->last_luty;
             use_ic = v->last_use_ic;
+            interlace = s->last_picture.f->interlaced_frame;
         }
     } else {
         srcY = s->next_picture.f->data[0];
         luty = v->next_luty;
         use_ic = v->next_use_ic;
+        interlace = s->next_picture.f->interlaced_frame;
     }
 
     if (!srcY) {
@@ -301,35 +502,10 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     }
 
     if (s->pict_type == AV_PICTURE_TYPE_P && n == 3 && v->field_mode) {
-        int same_count = 0, opp_count = 0, k;
-        int chosen_mv[2][4][2], f;
-        int tx = 0, ty = 0;
-        for (k = 0; k < 4; k++) {
-            f = v->mv_f[0][s->block_index[k] + v->blocks_off];
-            chosen_mv[f][f ? opp_count : same_count][0] = s->mv[0][k][0];
-            chosen_mv[f][f ? opp_count : same_count][1] = s->mv[0][k][1];
-            opp_count  += f;
-            same_count += 1 - f;
-        }
-        f = opp_count > same_count;
-        switch (f ? opp_count : same_count) {
-        case 4:
-            tx = median4(chosen_mv[f][0][0], chosen_mv[f][1][0],
-                         chosen_mv[f][2][0], chosen_mv[f][3][0]);
-            ty = median4(chosen_mv[f][0][1], chosen_mv[f][1][1],
-                         chosen_mv[f][2][1], chosen_mv[f][3][1]);
-            break;
-        case 3:
-            tx = mid_pred(chosen_mv[f][0][0], chosen_mv[f][1][0], chosen_mv[f][2][0]);
-            ty = mid_pred(chosen_mv[f][0][1], chosen_mv[f][1][1], chosen_mv[f][2][1]);
-            break;
-        case 2:
-            tx = (chosen_mv[f][0][0] + chosen_mv[f][1][0]) / 2;
-            ty = (chosen_mv[f][0][1] + chosen_mv[f][1][1]) / 2;
-            break;
-        }
-        s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = tx;
-        s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = ty;
+        int opp_count = get_luma_mv(v, 0,
+                                    &s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                    &s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1]);
+        int k, f = opp_count > 2;
         for (k = 0; k < 4; k++)
             v->mv_f[1][s->block_index[k] + v->blocks_off] = f;
     }
@@ -371,60 +547,69 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
         src_y = av_clip(src_y, -16, s->mb_height * 16);
     } else {
         src_x = av_clip(src_x, -17, s->avctx->coded_width);
-        if (v->fcm == ILACE_FRAME) {
-            if (src_y & 1)
-                src_y = av_clip(src_y, -17, s->avctx->coded_height + 1);
-            else
-                src_y = av_clip(src_y, -18, s->avctx->coded_height);
-        } else {
+        if (v->fcm == ILACE_FRAME)
+            src_y = av_clip(src_y, -18 + (src_y & 1), s->avctx->coded_height + (src_y & 1));
+        else
             src_y = av_clip(src_y, -18, s->avctx->coded_height + 1);
-        }
     }
 
     srcY += src_y * s->linesize + src_x;
     if (v->field_mode && v->ref_field_type[dir])
-        srcY += s->current_picture_ptr->f->linesize[0];
+        srcY += linesize;
 
-    if (fieldmv && !(src_y & 1))
-        v_edge_pos--;
-    if (fieldmv && (src_y & 1) && src_y < 4)
-        src_y--;
     if (v->rangeredfrm || use_ic
         || s->h_edge_pos < 13 || v_edge_pos < 23
         || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 8 - s->mspel * 2
         || (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) {
+        const int k = 9 + s->mspel * 2;
+
         srcY -= s->mspel * (1 + (s->linesize << fieldmv));
         /* check emulate edge stride and offset */
-        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
-                                 s->linesize, s->linesize,
-                                 9 + s->mspel * 2, (9 + s->mspel * 2) << fieldmv,
-                                 src_x - s->mspel, src_y - (s->mspel << fieldmv),
-                                 s->h_edge_pos, v_edge_pos);
+        if (interlace) {
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcY,
+                                     linesize << 1,
+                                     linesize << 1,
+                                     k,
+                                     v->field_mode ? k : (k << fieldmv) + 1 >> 1,
+                                     src_x - s->mspel,
+                                     src_y - (s->mspel << fieldmv) >> !v->field_mode,
+                                     s->h_edge_pos,
+                                     s->v_edge_pos >> 1);
+            if (!v->field_mode && !fieldmv)
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + linesize,
+                                         srcY + linesize,
+                                         linesize << 1,
+                                         linesize << 1,
+                                         k,
+                                         k >> 1,
+                                         src_x - s->mspel,
+                                         src_y - s->mspel + 1 >> 1,
+                                         s->h_edge_pos,
+                                         s->v_edge_pos >> 1);
+        } else
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcY,
+                                     linesize,
+                                     linesize,
+                                     k,
+                                     v->field_mode ? (k << 1) - 1 : k << fieldmv,
+                                     src_x - s->mspel,
+                                     v->field_mode ? 2 * (src_y - s->mspel) + v->ref_field_type[dir] :
+                                                     src_y - (s->mspel << fieldmv),
+                                     s->h_edge_pos,
+                                     s->v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src;
-
-            src = srcY;
-            for (j = 0; j < 9 + s->mspel * 2; j++) {
-                for (i = 0; i < 9 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize << fieldmv;
-            }
+            vc1_scale_luma(srcY, k, s->linesize << fieldmv);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src;
-
-            src = srcY;
-            for (j = 0; j < 9 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : (((j<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1);
-                for (i = 0; i < 9 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize << fieldmv;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[dir] : (((0<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[dir] : (((1<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1)],
+                               k, s->linesize << fieldmv);
         }
         srcY += s->mspel * (1 + (s->linesize << fieldmv));
     }
@@ -432,9 +617,9 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
         if (avg)
-            v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
+            v->vc1dsp.avg_vc1_mspel_pixels_tab[1][dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
         else
-            v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
+            v->vc1dsp.put_vc1_mspel_pixels_tab[1][dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
     } else { // hpel mc - always used for luma
         dxy = (my & 2) | ((mx & 2) >> 1);
         if (!v->rnd)
@@ -444,59 +629,6 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     }
 }
 
-static av_always_inline int get_chroma_mv(int *mvx, int *mvy, int *a, int flag, int *tx, int *ty)
-{
-    int idx, i;
-    static const int count[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-    idx =  ((a[3] != flag) << 3)
-         | ((a[2] != flag) << 2)
-         | ((a[1] != flag) << 1)
-         |  (a[0] != flag);
-    if (!idx) {
-        *tx = median4(mvx[0], mvx[1], mvx[2], mvx[3]);
-        *ty = median4(mvy[0], mvy[1], mvy[2], mvy[3]);
-        return 4;
-    } else if (count[idx] == 1) {
-        switch (idx) {
-        case 0x1:
-            *tx = mid_pred(mvx[1], mvx[2], mvx[3]);
-            *ty = mid_pred(mvy[1], mvy[2], mvy[3]);
-            return 3;
-        case 0x2:
-            *tx = mid_pred(mvx[0], mvx[2], mvx[3]);
-            *ty = mid_pred(mvy[0], mvy[2], mvy[3]);
-            return 3;
-        case 0x4:
-            *tx = mid_pred(mvx[0], mvx[1], mvx[3]);
-            *ty = mid_pred(mvy[0], mvy[1], mvy[3]);
-            return 3;
-        case 0x8:
-            *tx = mid_pred(mvx[0], mvx[1], mvx[2]);
-            *ty = mid_pred(mvy[0], mvy[1], mvy[2]);
-            return 3;
-        }
-    } else if (count[idx] == 2) {
-        int t1 = 0, t2 = 0;
-        for (i = 0; i < 3; i++)
-            if (!a[i]) {
-                t1 = i;
-                break;
-            }
-        for (i = t1 + 1; i < 4; i++)
-            if (!a[i]) {
-                t2 = i;
-                break;
-            }
-        *tx = (mvx[t1] + mvx[t2]) / 2;
-        *ty = (mvy[t1] + mvy[t2]) / 2;
-        return 2;
-    } else {
-        return 0;
-    }
-    return -1;
-}
-
 /** Do motion compensation for 4-MV macroblock - both chroma blocks
  */
 void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
@@ -505,49 +637,40 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
     H264ChromaContext *h264chroma = &v->h264chroma;
     uint8_t *srcU, *srcV;
     int uvmx, uvmy, uvsrc_x, uvsrc_y;
-    int k, tx = 0, ty = 0;
-    int mvx[4], mvy[4], intra[4], mv_f[4];
-    int valid_count;
-    int chroma_ref_type = v->cur_field_type;
+    int16_t tx, ty;
+    int chroma_ref_type;
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     uint8_t (*lutuv)[256];
     int use_ic;
+    int interlace;
+    int uvlinesize;
 
     if (!v->field_mode && !v->s.last_picture.f->data[0])
         return;
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    for (k = 0; k < 4; k++) {
-        mvx[k] = s->mv[dir][k][0];
-        mvy[k] = s->mv[dir][k][1];
-        intra[k] = v->mb_type[0][s->block_index[k]];
-        if (v->field_mode)
-            mv_f[k] = v->mv_f[dir][s->block_index[k] + v->blocks_off];
-    }
-
     /* calculate chroma MV vector from four luma MVs */
-    if (!v->field_mode || (v->field_mode && !v->numref)) {
-        valid_count = get_chroma_mv(mvx, mvy, intra, 0, &tx, &ty);
-        chroma_ref_type = v->reffield;
+    if (!v->field_mode || !v->numref) {
+        int valid_count = get_chroma_mv(v, dir, &tx, &ty);
         if (!valid_count) {
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0;
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = 0;
             v->luma_mv[s->mb_x][0] = v->luma_mv[s->mb_x][1] = 0;
             return; //no need to do MC for intra blocks
         }
+        chroma_ref_type = v->ref_field_type[dir];
     } else {
-        int dominant = 0;
-        if (mv_f[0] + mv_f[1] + mv_f[2] + mv_f[3] > 2)
-            dominant = 1;
-        valid_count = get_chroma_mv(mvx, mvy, mv_f, dominant, &tx, &ty);
-        if (dominant)
-            chroma_ref_type = !v->cur_field_type;
+        int opp_count = get_luma_mv(v, dir, &tx, &ty);
+        chroma_ref_type = v->cur_field_type ^ (opp_count > 2);
     }
     if (v->field_mode && chroma_ref_type == 1 && v->cur_field_type == 1 && !v->s.last_picture.f->data[0])
         return;
     s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = tx;
     s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = ty;
+
+    uvlinesize = s->current_picture_ptr->f->linesize[1];
+
     uvmx = (tx + ((tx & 3) == 3)) >> 1;
     uvmy = (ty + ((ty & 3) == 3)) >> 1;
 
@@ -578,18 +701,21 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
             srcU = s->current_picture.f->data[1];
             srcV = s->current_picture.f->data[2];
             lutuv = v->curr_lutuv;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
+            interlace = 1;
         } else {
             srcU = s->last_picture.f->data[1];
             srcV = s->last_picture.f->data[2];
             lutuv = v->last_lutuv;
             use_ic = v->last_use_ic;
+            interlace = s->last_picture.f->interlaced_frame;
         }
     } else {
         srcU = s->next_picture.f->data[1];
         srcV = s->next_picture.f->data[2];
         lutuv = v->next_lutuv;
         use_ic = v->next_use_ic;
+        interlace = s->next_picture.f->interlaced_frame;
     }
 
     if (!srcU) {
@@ -602,8 +728,8 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
 
     if (v->field_mode) {
         if (chroma_ref_type) {
-            srcU += s->current_picture_ptr->f->linesize[1];
-            srcV += s->current_picture_ptr->f->linesize[2];
+            srcU += uvlinesize;
+            srcV += uvlinesize;
         }
     }
 
@@ -611,49 +737,84 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
         || s->h_edge_pos < 18 || v_edge_pos < 18
         || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 9
         || (unsigned)uvsrc_y > (v_edge_pos    >> 1) - 9) {
-        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcU,
-                                 s->uvlinesize, s->uvlinesize,
-                                 8 + 1, 8 + 1, uvsrc_x, uvsrc_y,
-                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16, srcV,
-                                 s->uvlinesize, s->uvlinesize,
-                                 8 + 1, 8 + 1, uvsrc_x, uvsrc_y,
-                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        if (interlace) {
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcU,
+                                     uvlinesize << 1,
+                                     uvlinesize << 1,
+                                     9,
+                                     v->field_mode ? 9 : 5,
+                                     uvsrc_x,
+                                     uvsrc_y >> !v->field_mode,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 2);
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16,
+                                     srcV,
+                                     uvlinesize << 1,
+                                     uvlinesize << 1,
+                                     9,
+                                     v->field_mode ? 9 : 5,
+                                     uvsrc_x,
+                                     uvsrc_y >> !v->field_mode,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 2);
+            if (!v->field_mode) {
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + uvlinesize,
+                                         srcU + uvlinesize,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         9,
+                                         4,
+                                         uvsrc_x,
+                                         uvsrc_y + 1 >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16 + uvlinesize,
+                                         srcV + uvlinesize,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         9,
+                                         4,
+                                         uvsrc_x,
+                                         uvsrc_y + 1 >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+            }
+        } else {
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcU,
+                                     uvlinesize,
+                                     uvlinesize,
+                                     9,
+                                     v->field_mode ? 17 : 9,
+                                     uvsrc_x,
+                                     v->field_mode ? 2 * uvsrc_y + chroma_ref_type : uvsrc_y,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16,
+                                     srcV,
+                                     uvlinesize,
+                                     uvlinesize,
+                                     9,
+                                     v->field_mode ? 17 : 9,
+                                     uvsrc_x,
+                                     v->field_mode ? 2 * uvsrc_y + chroma_ref_type : uvsrc_y,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 1);
+        }
         srcU = s->sc.edge_emu_buffer;
         srcV = s->sc.edge_emu_buffer + 16;
 
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? chroma_ref_type : ((j + uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? chroma_ref_type : ((0 + uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? chroma_ref_type : ((1 + uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
     }
 
@@ -667,6 +828,10 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
         v->vc1dsp.put_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
         v->vc1dsp.put_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     }
+    if (v->field_mode) {
+        v->mv_f[dir][s->block_index[4] + v->mb_off] = v->cur_field_type != chroma_ref_type;
+        v->mv_f[dir][s->block_index[5] + v->mb_off] = v->cur_field_type != chroma_ref_type;
+    }
 }
 
 /** Do motion compensation for 4-MV interlaced frame chroma macroblock (both U and V)
@@ -680,19 +845,18 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
     int uvmx_field[4], uvmy_field[4];
     int i, off, tx, ty;
     int fieldmv = v->blk_mv_type[s->block_index[0]];
-    static const int s_rndtblfield[16] = { 0, 0, 1, 2, 4, 4, 5, 6, 2, 2, 3, 8, 6, 6, 7, 12 };
+    static const uint8_t s_rndtblfield[16] = { 0, 0, 1, 2, 4, 4, 5, 6, 2, 2, 3, 8, 6, 6, 7, 12 };
     int v_dist = fieldmv ? 1 : 4; // vertical offset for lower sub-blocks
     int v_edge_pos = s->v_edge_pos >> 1;
     int use_ic;
+    int interlace;
+    int uvlinesize;
     uint8_t (*lutuv)[256];
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    if (!s->last_picture.f->data[1]) {
-      av_log(s->avctx, AV_LOG_ERROR, "Bad data in last picture frame.\n");
-      return;
-    }
+    uvlinesize = s->current_picture_ptr->f->linesize[1];
 
     for (i = 0; i < 4; i++) {
         int d = i < 2 ? dir: dir2;
@@ -711,56 +875,108 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
         uvsrc_y = s->mb_y * 8 + ((i & 2) ? v_dist : 0) + (uvmy_field[i] >> 2);
         // FIXME: implement proper pull-back (see vc1cropmv.c, vc1CROPMV_ChromaPullBack())
         uvsrc_x = av_clip(uvsrc_x, -8, s->avctx->coded_width  >> 1);
-        uvsrc_y = av_clip(uvsrc_y, -8, s->avctx->coded_height >> 1);
+        if (v->fcm == ILACE_FRAME)
+            uvsrc_y = av_clip(uvsrc_y, -8 + (uvsrc_y & 1), (s->avctx->coded_height >> 1) + (uvsrc_y & 1));
+        else
+            uvsrc_y = av_clip(uvsrc_y, -8, s->avctx->coded_height >> 1);
         if (i < 2 ? dir : dir2) {
-            srcU = s->next_picture.f->data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
-            srcV = s->next_picture.f->data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;
+            srcU = s->next_picture.f->data[1];
+            srcV = s->next_picture.f->data[2];
             lutuv  = v->next_lutuv;
             use_ic = v->next_use_ic;
+            interlace = s->next_picture.f->interlaced_frame;
         } else {
-            srcU = s->last_picture.f->data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
-            srcV = s->last_picture.f->data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;
+            srcU = s->last_picture.f->data[1];
+            srcV = s->last_picture.f->data[2];
             lutuv  = v->last_lutuv;
             use_ic = v->last_use_ic;
+            interlace = s->last_picture.f->interlaced_frame;
         }
+        if (!srcU)
+            return;
+        srcU += uvsrc_y * s->uvlinesize + uvsrc_x;
+        srcV += uvsrc_y * s->uvlinesize + uvsrc_x;
         uvmx_field[i] = (uvmx_field[i] & 3) << 1;
         uvmy_field[i] = (uvmy_field[i] & 3) << 1;
 
-        if (fieldmv && !(uvsrc_y & 1))
-            v_edge_pos--;
-        if (fieldmv && (uvsrc_y & 1) && uvsrc_y < 2)
-            uvsrc_y--;
         if (use_ic
             || s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv)
             || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5
             || (unsigned)uvsrc_y > v_edge_pos - (5 << fieldmv)) {
-            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcU,
-                                     s->uvlinesize, s->uvlinesize,
-                                     5, (5 << fieldmv), uvsrc_x, uvsrc_y,
-                                     s->h_edge_pos >> 1, v_edge_pos);
-            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16, srcV,
-                                     s->uvlinesize, s->uvlinesize,
-                                     5, (5 << fieldmv), uvsrc_x, uvsrc_y,
-                                     s->h_edge_pos >> 1, v_edge_pos);
+            if (interlace) {
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                         srcU,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         5,
+                                         (5 << fieldmv) + 1 >> 1,
+                                         uvsrc_x,
+                                         uvsrc_y >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16,
+                                         srcV,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         5,
+                                         (5 << fieldmv) + 1 >> 1,
+                                         uvsrc_x,
+                                         uvsrc_y >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+                if (!fieldmv) {
+                    s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + uvlinesize,
+                                             srcU + uvlinesize,
+                                             uvlinesize << 1,
+                                             uvlinesize << 1,
+                                             5,
+                                             2,
+                                             uvsrc_x,
+                                             uvsrc_y + 1 >> 1,
+                                             s->h_edge_pos >> 1,
+                                             s->v_edge_pos >> 2);
+                    s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16 + uvlinesize,
+                                             srcV + uvlinesize,
+                                             uvlinesize << 1,
+                                             uvlinesize << 1,
+                                             5,
+                                             2,
+                                             uvsrc_x,
+                                             uvsrc_y + 1 >> 1,
+                                             s->h_edge_pos >> 1,
+                                             s->v_edge_pos >> 2);
+                }
+            } else {
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                         srcU,
+                                         uvlinesize,
+                                         uvlinesize,
+                                         5,
+                                         5 << fieldmv,
+                                         uvsrc_x,
+                                         uvsrc_y,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 1);
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + 16,
+                                         srcV,
+                                         uvlinesize,
+                                         uvlinesize,
+                                         5,
+                                         5 << fieldmv,
+                                         uvsrc_x,
+                                         uvsrc_y,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 1);
+            }
             srcU = s->sc.edge_emu_buffer;
             srcV = s->sc.edge_emu_buffer + 16;
 
             /* if we deal with intensity compensation we need to scale source blocks */
             if (use_ic) {
-                int i, j;
-                uint8_t *src, *src2;
-
-                src  = srcU;
-                src2 = srcV;
-                for (j = 0; j < 5; j++) {
-                    int f = (uvsrc_y + (j << fieldmv)) & 1;
-                    for (i = 0; i < 5; i++) {
-                        src[i]  = lutuv[f][src[i]];
-                        src2[i] = lutuv[f][src2[i]];
-                    }
-                    src  += s->uvlinesize << fieldmv;
-                    src2 += s->uvlinesize << fieldmv;
-                }
+                vc1_lut_scale_chroma(srcU, srcV,
+                                     lutuv[(uvsrc_y + (0 << fieldmv)) & 1],
+                                     lutuv[(uvsrc_y + (1 << fieldmv)) & 1],
+                                     5, s->uvlinesize << fieldmv);
             }
         }
         if (avg) {
@@ -791,13 +1007,17 @@ void ff_vc1_interp_mc(VC1Context *v)
     H264ChromaContext *h264chroma = &v->h264chroma;
     uint8_t *srcY, *srcU, *srcV;
     int dxy, mx, my, uvmx, uvmy, src_x, src_y, uvsrc_x, uvsrc_y;
-    int off, off_uv;
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     int use_ic = v->next_use_ic;
+    int interlace;
+    int linesize, uvlinesize;
 
     if (!v->field_mode && !v->s.next_picture.f->data[0])
         return;
 
+    linesize = s->current_picture_ptr->f->linesize[0];
+    uvlinesize = s->current_picture_ptr->f->linesize[1];
+
     mx   = s->mv[1][0][0];
     my   = s->mv[1][0][1];
     uvmx = (mx + ((mx & 3) == 3)) >> 1;
@@ -814,6 +1034,8 @@ void ff_vc1_interp_mc(VC1Context *v)
     srcU = s->next_picture.f->data[1];
     srcV = s->next_picture.f->data[2];
 
+    interlace = s->next_picture.f->interlaced_frame;
+
     src_x   = s->mb_x * 16 + (mx   >> 2);
     src_y   = s->mb_y * 16 + (my   >> 2);
     uvsrc_x = s->mb_x *  8 + (uvmx >> 2);
@@ -826,9 +1048,14 @@ void ff_vc1_interp_mc(VC1Context *v)
         uvsrc_y = av_clip(uvsrc_y,  -8, s->mb_height *  8);
     } else {
         src_x   = av_clip(  src_x, -17, s->avctx->coded_width);
-        src_y   = av_clip(  src_y, -18, s->avctx->coded_height + 1);
         uvsrc_x = av_clip(uvsrc_x,  -8, s->avctx->coded_width  >> 1);
-        uvsrc_y = av_clip(uvsrc_y,  -8, s->avctx->coded_height >> 1);
+        if (v->fcm == ILACE_FRAME) {
+            src_y = av_clip(src_y, -18 + (src_y & 1), s->avctx->coded_height + (src_y & 1));
+            uvsrc_y = av_clip(uvsrc_y, -8 + (uvsrc_y & 1), (s->avctx->coded_height >> 1) + (uvsrc_y & 1));
+        } else {
+            src_y = av_clip(src_y, -18, s->avctx->coded_height + 1);
+            uvsrc_y = av_clip(uvsrc_y,  -8, s->avctx->coded_height >> 1);
+        }
     }
 
     srcY += src_y   * s->linesize   + src_x;
@@ -836,13 +1063,13 @@ void ff_vc1_interp_mc(VC1Context *v)
     srcV += uvsrc_y * s->uvlinesize + uvsrc_x;
 
     if (v->field_mode && v->ref_field_type[1]) {
-        srcY += s->current_picture_ptr->f->linesize[0];
-        srcU += s->current_picture_ptr->f->linesize[1];
-        srcV += s->current_picture_ptr->f->linesize[2];
+        srcY += linesize;
+        srcU += uvlinesize;
+        srcV += uvlinesize;
     }
 
     /* for grayscale we should not try to read from unknown area */
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY) {
         srcU = s->sc.edge_emu_buffer + 18 * s->linesize;
         srcV = s->sc.edge_emu_buffer + 18 * s->linesize;
     }
@@ -850,105 +1077,156 @@ void ff_vc1_interp_mc(VC1Context *v)
     if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22 || use_ic
         || (unsigned)(src_x - 1) > s->h_edge_pos - (mx & 3) - 16 - 3
         || (unsigned)(src_y - 1) > v_edge_pos    - (my & 3) - 16 - 3) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *ubuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+        const int k = 17 + s->mspel * 2;
 
         srcY -= s->mspel * (1 + s->linesize);
-        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
-                                 s->linesize, s->linesize,
-                                 17 + s->mspel * 2, 17 + s->mspel * 2,
-                                 src_x - s->mspel, src_y - s->mspel,
-                                 s->h_edge_pos, v_edge_pos);
+        if (interlace) {
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcY,
+                                     linesize << 1,
+                                     linesize << 1,
+                                     k,
+                                     v->field_mode ? k : (k + 1 >> 1),
+                                     src_x - s->mspel,
+                                     src_y - s->mspel >> !v->field_mode,
+                                     s->h_edge_pos,
+                                     s->v_edge_pos >> 1);
+            if (!v->field_mode)
+                s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer + linesize,
+                                         srcY + linesize,
+                                         linesize << 1,
+                                         linesize << 1,
+                                         k,
+                                         k >> 1,
+                                         src_x - s->mspel,
+                                         src_y - s->mspel + 1 >> 1,
+                                         s->h_edge_pos,
+                                         s->v_edge_pos >> 1);
+        } else
+            s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer,
+                                     srcY,
+                                     linesize,
+                                     linesize,
+                                     k,
+                                     v->field_mode ? (k << 1) - 1 : k,
+                                     src_x - s->mspel,
+                                     v->field_mode ? 2 * (src_y - s->mspel) + v->ref_field_type[1] :
+                                                     src_y - s->mspel,
+                                     s->h_edge_pos,
+                                     s->v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
-                                 s->uvlinesize, s->uvlinesize,
-                                 8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
-                                 s->uvlinesize, s->uvlinesize,
-                                 8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+        if (interlace) {
+            s->vdsp.emulated_edge_mc(ubuf,
+                                     srcU,
+                                     uvlinesize << 1,
+                                     uvlinesize << 1,
+                                     9,
+                                     v->field_mode ? 9 : 5,
+                                     uvsrc_x,
+                                     uvsrc_y >> !v->field_mode,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 2);
+            s->vdsp.emulated_edge_mc(vbuf,
+                                     srcV,
+                                     uvlinesize << 1,
+                                     uvlinesize << 1,
+                                     9,
+                                     v->field_mode ? 9 : 5,
+                                     uvsrc_x,
+                                     uvsrc_y >> !v->field_mode,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 2);
+            if (!v->field_mode) {
+                s->vdsp.emulated_edge_mc(ubuf + uvlinesize,
+                                         srcU + uvlinesize,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         9,
+                                         4,
+                                         uvsrc_x,
+                                         uvsrc_y + 1 >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+                s->vdsp.emulated_edge_mc(vbuf + uvlinesize,
+                                         srcV + uvlinesize,
+                                         uvlinesize << 1,
+                                         uvlinesize << 1,
+                                         9,
+                                         4,
+                                         uvsrc_x,
+                                         uvsrc_y + 1 >> 1,
+                                         s->h_edge_pos >> 1,
+                                         s->v_edge_pos >> 2);
+            }
+        } else {
+            s->vdsp.emulated_edge_mc(ubuf,
+                                     srcU,
+                                     uvlinesize,
+                                     uvlinesize,
+                                     9,
+                                     v->field_mode ? 17 : 9,
+                                     uvsrc_x,
+                                     v->field_mode ? 2 * uvsrc_y + v->ref_field_type[1] : uvsrc_y,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf,
+                                     srcV,
+                                     uvlinesize,
+                                     uvlinesize,
+                                     9,
+                                     v->field_mode ? 17 : 9,
+                                     uvsrc_x,
+                                     v->field_mode ? 2 * uvsrc_y + v->ref_field_type[1] : uvsrc_y,
+                                     s->h_edge_pos >> 1,
+                                     s->v_edge_pos >> 1);
+        }
+        srcU = ubuf;
+        srcV = vbuf;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize;
-            }
-            src = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_luma(srcY, k, s->linesize);
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
 
         if (use_ic) {
             uint8_t (*luty )[256] = v->next_luty;
             uint8_t (*lutuv)[256] = v->next_lutuv;
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[1] : ((j+src_y - s->mspel) & 1);
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? v->ref_field_type[1] : ((j+uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[1] : ((0+src_y - s->mspel) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[1] : ((1+src_y - s->mspel) & 1)],
+                               k, s->linesize);
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? v->ref_field_type[1] : ((0+uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? v->ref_field_type[1] : ((1+uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
         srcY += s->mspel * (1 + s->linesize);
     }
 
-    off    = 0;
-    off_uv = 0;
-
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8, srcY + 8, s->linesize, v->rnd);
-        srcY += s->linesize * 8;
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8 * s->linesize    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8 * s->linesize + 8, srcY + 8, s->linesize, v->rnd);
+        v->vc1dsp.avg_vc1_mspel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, v->rnd);
     } else { // hpel mc
         dxy = (my & 2) | ((mx & 2) >> 1);
 
         if (!v->rnd)
-            s->hdsp.avg_pixels_tab[0][dxy](s->dest[0] + off, srcY, s->linesize, 16);
+            s->hdsp.avg_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
         else
-            s->hdsp.avg_no_rnd_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize, 16);
+            s->hdsp.avg_no_rnd_pixels_tab[dxy](s->dest[0], srcY, s->linesize, 16);
     }
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
     /* Chroma MC always uses qpel bilinear */
     uvmx = (uvmx & 3) << 1;
     uvmy = (uvmy & 3) << 1;
     if (!v->rnd) {
-        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[1] + off_uv, srcU, s->uvlinesize, 8, uvmx, uvmy);
-        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[2] + off_uv, srcV, s->uvlinesize, 8, uvmx, uvmy);
+        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
+        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     } else {
-        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1] + off_uv, srcU, s->uvlinesize, 8, uvmx, uvmy);
-        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2] + off_uv, srcV, s->uvlinesize, 8, uvmx, uvmy);
+        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
+        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     }
 }
diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
index 7234db6..493ffde 100644
--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 #include "parser.h"
 #include "vc1.h"
 #include "get_bits.h"
+#include "internal.h"
 
 /** The maximum number of bytes of a sequence, entry point or
  *  frame header whose values we pay any attention to */
@@ -63,9 +64,10 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
     /* Parse the header we just finished unescaping */
     VC1ParseContext *vpc = s->priv_data;
     GetBitContext gb;
+    int ret;
     vpc->v.s.avctx = avctx;
     vpc->v.parse_only = 1;
-    init_get_bits(&gb, buf, buf_size * 8);
+    init_get_bits8(&gb, buf, buf_size);
     switch (vpc->prev_start_code) {
     case VC1_CODE_SEQHDR & 0xFF:
         ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
@@ -75,9 +77,12 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
         break;
     case VC1_CODE_FRAME & 0xFF:
         if(vpc->v.profile < PROFILE_ADVANCED)
-            ff_vc1_parse_frame_header    (&vpc->v, &gb);
+            ret = ff_vc1_parse_frame_header    (&vpc->v, &gb);
         else
-            ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+            ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+
+        if (ret < 0)
+            break;
 
         /* keep AV_PICTURE_TYPE_BI internal to VC1 */
         if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
@@ -108,6 +113,8 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
 
         break;
     }
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
     s->format = vpc->v.chromaformat == 1 ? AV_PIX_FMT_YUV420P
                                          : AV_PIX_FMT_NONE;
     if (avctx->width && avctx->height) {
@@ -241,7 +248,7 @@ static int vc1_parse(AVCodecParserContext *s,
      * the start code we've already seen, or cause extra bytes to be
      * inserted at the start of the unescaped buffer. */
     vpc->bytes_to_skip = 4;
-    if (next < 0 && start_code_found)
+    if (next < 0 && next != END_NOT_FOUND)
         vpc->bytes_to_skip += next;
 
     *poutbuf = buf;
@@ -252,20 +259,18 @@ static int vc1_parse(AVCodecParserContext *s,
 static int vc1_split(AVCodecContext *avctx,
                            const uint8_t *buf, int buf_size)
 {
-    int i;
-    uint32_t state= -1;
-    int charged=0;
+    uint32_t state = -1;
+    int charged = 0;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
 
-    for(i=0; i<buf_size; i++){
-        state= (state<<8) | buf[i];
-        if(IS_MARKER(state)){
-            if(state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT){
-                charged=1;
-            }else if(charged){
-                return i-3;
-            }
-        }
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if (state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT) {
+            charged = 1;
+        } else if (charged && IS_MARKER(state))
+            return ptr - 4 - buf;
     }
+
     return 0;
 }
 
@@ -273,6 +278,7 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
 {
     VC1ParseContext *vpc = s->priv_data;
     vpc->v.s.slice_context_count = 1;
+    vpc->v.first_pic_header_flag = 1;
     vpc->prev_start_code = 0;
     vpc->bytes_to_skip = 0;
     vpc->unesc_index = 0;
diff --git a/libavcodec/vc1_pred.c b/libavcodec/vc1_pred.c
index 25be787..9e29b44 100644
--- a/libavcodec/vc1_pred.c
+++ b/libavcodec/vc1_pred.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -170,9 +170,9 @@ static av_always_inline int scaleforsame(VC1Context *v, int i, int n /* MV */,
     n >>= hpel;
     if (v->s.pict_type != AV_PICTURE_TYPE_B || v->second_field || !dir) {
         if (dim)
-            n = scaleforsame_y(v, i, n, dir) << hpel;
+            n = scaleforsame_y(v, i, n, dir) * (1 << hpel);
         else
-            n = scaleforsame_x(v, n, dir) << hpel;
+            n = scaleforsame_x(v, n, dir) * (1 << hpel);
         return n;
     }
     brfd      = FFMIN(v->brfd, 3);
@@ -202,7 +202,7 @@ static av_always_inline int scaleforopp(VC1Context *v, int n /* MV */,
         refdist = dir ? v->brfd : v->frfd;
     scaleopp = ff_vc1_field_mvpred_scales[dir ^ v->second_field][0][refdist];
 
-    n = (n * scaleopp >> 8) << hpel;
+    n = (n * scaleopp >> 8) * (1 << hpel);
     return n;
 }
 
@@ -231,8 +231,10 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     else
         mixedmv_pic = 0;
     /* scale MV difference to be quad-pel */
-    dmv_x <<= 1 - s->quarter_sample;
-    dmv_y <<= 1 - s->quarter_sample;
+    if (!s->quarter_sample) {
+        dmv_x *= 2;
+        dmv_y *= 2;
+    }
 
     wrap = s->b8_stride;
     xy   = s->block_index[n];
@@ -252,7 +254,7 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
             v->luma_mv[s->mb_x][0] = v->luma_mv[s->mb_x][1] = 0;
             s->current_picture.motion_val[1][xy + 1 + v->blocks_off][0]        = 0;
             s->current_picture.motion_val[1][xy + 1 + v->blocks_off][1]        = 0;
-            s->current_picture.motion_val[1][xy + wrap][0]                     = 0;
+            s->current_picture.motion_val[1][xy + wrap + v->blocks_off][0]     = 0;
             s->current_picture.motion_val[1][xy + wrap + v->blocks_off][1]     = 0;
             s->current_picture.motion_val[1][xy + wrap + 1 + v->blocks_off][0] = 0;
             s->current_picture.motion_val[1][xy + wrap + 1 + v->blocks_off][1] = 0;
@@ -260,18 +262,23 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
         return;
     }
 
-    C = s->current_picture.motion_val[dir][xy -    1 + v->blocks_off];
-    A = s->current_picture.motion_val[dir][xy - wrap + v->blocks_off];
+    a_valid = !s->first_slice_line || (n == 2 || n == 3);
+    b_valid = a_valid;
+    c_valid = s->mb_x || (n == 1 || n == 3);
     if (mv1) {
         if (v->field_mode && mixedmv_pic)
             off = (s->mb_x == (s->mb_width - 1)) ? -2 : 2;
         else
             off = (s->mb_x == (s->mb_width - 1)) ? -1 : 2;
+        b_valid = b_valid && s->mb_width > 1;
     } else {
         //in 4-MV mode different blocks have different B predictor position
         switch (n) {
         case 0:
-            off = (s->mb_x > 0) ? -1 : 1;
+            if (v->res_rtm_flag)
+                off = s->mb_x ? -1 : 1;
+            else
+                off = s->mb_x ? -1 : 2 * s->mb_width - wrap - 1;
             break;
         case 1:
             off = (s->mb_x == (s->mb_width - 1)) ? -1 : 1;
@@ -282,12 +289,10 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
         case 3:
             off = -1;
         }
+        if (v->field_mode && s->mb_width == 1)
+            b_valid = b_valid && c_valid;
     }
-    B = s->current_picture.motion_val[dir][xy - wrap + off + v->blocks_off];
 
-    a_valid = !s->first_slice_line || (n == 2 || n == 3);
-    b_valid = a_valid && (s->mb_width > 1);
-    c_valid = s->mb_x || (n == 1 || n == 3);
     if (v->field_mode) {
         a_valid = a_valid && !is_intra[xy - wrap];
         b_valid = b_valid && !is_intra[xy - wrap + off];
@@ -295,6 +300,7 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     }
 
     if (a_valid) {
+        A = s->current_picture.motion_val[dir][xy - wrap + v->blocks_off];
         a_f = v->mv_f[dir][xy - wrap + v->blocks_off];
         num_oppfield  += a_f;
         num_samefield += 1 - a_f;
@@ -305,6 +311,7 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
         a_f = 0;
     }
     if (b_valid) {
+        B = s->current_picture.motion_val[dir][xy - wrap + off + v->blocks_off];
         b_f = v->mv_f[dir][xy - wrap + off + v->blocks_off];
         num_oppfield  += b_f;
         num_samefield += 1 - b_f;
@@ -315,6 +322,7 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
         b_f = 0;
     }
     if (c_valid) {
+        C = s->current_picture.motion_val[dir][xy - 1 + v->blocks_off];
         c_f = v->mv_f[dir][xy - 1 + v->blocks_off];
         num_oppfield  += c_f;
         num_samefield += 1 - c_f;
@@ -339,6 +347,8 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     } else
         opposite = 0;
     if (opposite) {
+        v->mv_f[dir][xy + v->blocks_off] = 1;
+        v->ref_field_type[dir] = !v->cur_field_type;
         if (a_valid && !a_f) {
             field_predA[0] = scaleforopp(v, field_predA[0], 0, dir);
             field_predA[1] = scaleforopp(v, field_predA[1], 1, dir);
@@ -351,9 +361,9 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
             field_predC[0] = scaleforopp(v, field_predC[0], 0, dir);
             field_predC[1] = scaleforopp(v, field_predC[1], 1, dir);
         }
-        v->mv_f[dir][xy + v->blocks_off] = 1;
-        v->ref_field_type[dir] = !v->cur_field_type;
     } else {
+        v->mv_f[dir][xy + v->blocks_off] = 0;
+        v->ref_field_type[dir] = v->cur_field_type;
         if (a_valid && a_f) {
             field_predA[0] = scaleforsame(v, n, field_predA[0], 0, dir);
             field_predA[1] = scaleforsame(v, n, field_predA[1], 1, dir);
@@ -366,8 +376,6 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
             field_predC[0] = scaleforsame(v, n, field_predC[0], 0, dir);
             field_predC[1] = scaleforsame(v, n, field_predC[1], 1, dir);
         }
-        v->mv_f[dir][xy + v->blocks_off] = 0;
-        v->ref_field_type[dir] = v->cur_field_type;
     }
 
     if (a_valid) {
@@ -392,17 +400,13 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     /* Pullback MV as specified in 8.3.5.3.4 */
     if (!v->field_mode) {
         int qx, qy, X, Y;
+        int MV = mv1 ? -60 : -28;
         qx = (s->mb_x << 6) + ((n == 1 || n == 3) ? 32 : 0);
         qy = (s->mb_y << 6) + ((n == 2 || n == 3) ? 32 : 0);
         X  = (s->mb_width  << 6) - 4;
         Y  = (s->mb_height << 6) - 4;
-        if (mv1) {
-            if (qx + px < -60) px = -60 - qx;
-            if (qy + py < -60) py = -60 - qy;
-        } else {
-            if (qx + px < -28) px = -28 - qx;
-            if (qy + py < -28) py = -28 - qy;
-        }
+        if (qx + px < MV) px = MV - qx;
+        if (qy + py < MV) py = MV - qy;
         if (qx + px > X) px = X - qx;
         if (qy + py > Y) py = Y - qy;
     }
@@ -602,9 +606,9 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 px = mid_pred(A[0], B[0], C[0]);
                 py = mid_pred(A[1], B[1], C[1]);
             } else if (total_valid) {
-                if (a_valid) { px = A[0]; py = A[1]; }
-                if (b_valid) { px = B[0]; py = B[1]; }
-                if (c_valid) { px = C[0]; py = C[1]; }
+                if      (a_valid) { px = A[0]; py = A[1]; }
+                else if (b_valid) { px = B[0]; py = B[1]; }
+                else              { px = C[0]; py = C[1]; }
             }
         }
     } else {
@@ -644,7 +648,8 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 } else if (!field_b && b_valid) {
                     px = B[0];
                     py = B[1];
-                } else if (c_valid) {
+                } else /*if (c_valid)*/ {
+                    av_assert1(c_valid);
                     px = C[0];
                     py = C[1];
                 }
@@ -652,7 +657,8 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 if (field_a && a_valid) {
                     px = A[0];
                     py = A[1];
-                } else if (field_b && b_valid) {
+                } else /*if (field_b && b_valid)*/ {
+                    av_assert1(field_b && b_valid);
                     px = B[0];
                     py = B[1];
                 }
@@ -692,25 +698,31 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
     int r_x, r_y;
     const uint8_t *is_intra = v->mb_type[0];
 
+    av_assert0(!v->field_mode);
+
     r_x = v->range_x;
     r_y = v->range_y;
     /* scale MV difference to be quad-pel */
-    dmv_x[0] <<= 1 - s->quarter_sample;
-    dmv_y[0] <<= 1 - s->quarter_sample;
-    dmv_x[1] <<= 1 - s->quarter_sample;
-    dmv_y[1] <<= 1 - s->quarter_sample;
+    if (!s->quarter_sample) {
+        dmv_x[0] *= 2;
+        dmv_y[0] *= 2;
+        dmv_x[1] *= 2;
+        dmv_y[1] *= 2;
+    }
 
     wrap = s->b8_stride;
     xy = s->block_index[0];
 
     if (s->mb_intra) {
-        s->current_picture.motion_val[0][xy + v->blocks_off][0] =
-        s->current_picture.motion_val[0][xy + v->blocks_off][1] =
-        s->current_picture.motion_val[1][xy + v->blocks_off][0] =
-        s->current_picture.motion_val[1][xy + v->blocks_off][1] = 0;
+        s->current_picture.motion_val[0][xy][0] =
+        s->current_picture.motion_val[0][xy][1] =
+        s->current_picture.motion_val[1][xy][0] =
+        s->current_picture.motion_val[1][xy][1] = 0;
         return;
     }
-    if (!v->field_mode) {
+        if (direct && s->next_picture_ptr->field_picture)
+            av_log(s->avctx, AV_LOG_WARNING, "Mixed frame/field direct mode not supported\n");
+
         s->mv[0][0][0] = scale_mv(s->next_picture.motion_val[1][xy][0], v->bfraction, 0, s->quarter_sample);
         s->mv[0][0][1] = scale_mv(s->next_picture.motion_val[1][xy][1], v->bfraction, 0, s->quarter_sample);
         s->mv[1][0][0] = scale_mv(s->next_picture.motion_val[1][xy][0], v->bfraction, 1, s->quarter_sample);
@@ -721,12 +733,11 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         s->mv[0][0][1] = av_clip(s->mv[0][0][1], -60 - (s->mb_y << 6), (s->mb_height << 6) - 4 - (s->mb_y << 6));
         s->mv[1][0][0] = av_clip(s->mv[1][0][0], -60 - (s->mb_x << 6), (s->mb_width  << 6) - 4 - (s->mb_x << 6));
         s->mv[1][0][1] = av_clip(s->mv[1][0][1], -60 - (s->mb_y << 6), (s->mb_height << 6) - 4 - (s->mb_y << 6));
-    }
     if (direct) {
-        s->current_picture.motion_val[0][xy + v->blocks_off][0] = s->mv[0][0][0];
-        s->current_picture.motion_val[0][xy + v->blocks_off][1] = s->mv[0][0][1];
-        s->current_picture.motion_val[1][xy + v->blocks_off][0] = s->mv[1][0][0];
-        s->current_picture.motion_val[1][xy + v->blocks_off][1] = s->mv[1][0][1];
+        s->current_picture.motion_val[0][xy][0] = s->mv[0][0][0];
+        s->current_picture.motion_val[0][xy][1] = s->mv[0][0][1];
+        s->current_picture.motion_val[1][xy][0] = s->mv[1][0][0];
+        s->current_picture.motion_val[1][xy][1] = s->mv[1][0][1];
         return;
     }
 
@@ -754,25 +765,16 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         /* Pullback MV as specified in 8.3.5.3.4 */
         {
             int qx, qy, X, Y;
-            if (v->profile < PROFILE_ADVANCED) {
-                qx = (s->mb_x << 5);
-                qy = (s->mb_y << 5);
-                X  = (s->mb_width  << 5) - 4;
-                Y  = (s->mb_height << 5) - 4;
-                if (qx + px < -28) px = -28 - qx;
-                if (qy + py < -28) py = -28 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            } else {
-                qx = (s->mb_x << 6);
-                qy = (s->mb_y << 6);
-                X  = (s->mb_width  << 6) - 4;
-                Y  = (s->mb_height << 6) - 4;
-                if (qx + px < -60) px = -60 - qx;
-                if (qy + py < -60) py = -60 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            }
+            int sh = v->profile < PROFILE_ADVANCED ? 5 : 6;
+            int MV = 4 - (1 << sh);
+            qx = (s->mb_x << sh);
+            qy = (s->mb_y << sh);
+            X  = (s->mb_width  << sh) - 4;
+            Y  = (s->mb_height << sh) - 4;
+            if (qx + px < MV) px = MV - qx;
+            if (qy + py < MV) py = MV - qy;
+            if (qx + px > X) px = X - qx;
+            if (qy + py > Y) py = Y - qy;
         }
         /* Calculate hybrid prediction as specified in 8.3.5.3.5 */
         if (0 && !s->first_slice_line && s->mb_x) {
@@ -833,25 +835,16 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         /* Pullback MV as specified in 8.3.5.3.4 */
         {
             int qx, qy, X, Y;
-            if (v->profile < PROFILE_ADVANCED) {
-                qx = (s->mb_x << 5);
-                qy = (s->mb_y << 5);
-                X  = (s->mb_width  << 5) - 4;
-                Y  = (s->mb_height << 5) - 4;
-                if (qx + px < -28) px = -28 - qx;
-                if (qy + py < -28) py = -28 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            } else {
-                qx = (s->mb_x << 6);
-                qy = (s->mb_y << 6);
-                X  = (s->mb_width  << 6) - 4;
-                Y  = (s->mb_height << 6) - 4;
-                if (qx + px < -60) px = -60 - qx;
-                if (qy + py < -60) py = -60 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            }
+            int sh = v->profile < PROFILE_ADVANCED ? 5 : 6;
+            int MV = 4 - (1 << sh);
+            qx = (s->mb_x << sh);
+            qy = (s->mb_y << sh);
+            X  = (s->mb_width  << sh) - 4;
+            Y  = (s->mb_height << sh) - 4;
+            if (qx + px < MV) px = MV - qx;
+            if (qy + py < MV) py = MV - qy;
+            if (qx + px > X) px = X - qx;
+            if (qy + py > Y) py = Y - qy;
         }
         /* Calculate hybrid prediction as specified in 8.3.5.3.5 */
         if (0 && !s->first_slice_line && s->mb_x) {
diff --git a/libavcodec/vc1_pred.h b/libavcodec/vc1_pred.h
index 34c9c1a..4d47f86 100644
--- a/libavcodec/vc1_pred.h
+++ b/libavcodec/vc1_pred.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vc1acdata.h b/libavcodec/vc1acdata.h
index 73ebe40..a70b44a 100644
--- a/libavcodec/vc1acdata.h
+++ b/libavcodec/vc1acdata.h
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder
  * copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vc1data.c b/libavcodec/vc1data.c
index 70cead8..19f1cad 100644
--- a/libavcodec/vc1data.c
+++ b/libavcodec/vc1data.c
@@ -4,20 +4,20 @@
  * copyright (c) 2006 Konstantin Shishkov
  * (c) 2005 anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ const uint8_t ff_vc1_mbmode_intfrp[2][15][4] = {
         { MV_PMODE_INTFR_1MV      , 1, 0, 1 },
         { MV_PMODE_INTFR_2MV_FIELD, 0, 0, 1 },
         { MV_PMODE_INTFR_2MV_FIELD, 1, 0, 1 },
-        { MV_PMODE_INTFR_2MV_FIELD, 0, 0, 0 },
+        { MV_PMODE_INTFR_2MV_FIELD, 1, 0, 0 },
         { MV_PMODE_INTFR_INTRA    , 0, 0, 0 }
     },
     {
@@ -73,13 +73,13 @@ const uint8_t ff_vc1_mbmode_intfrp[2][15][4] = {
         { MV_PMODE_INTFR_1MV      , 1, 0, 1 },
         { MV_PMODE_INTFR_2MV_FIELD, 0, 0, 1 },
         { MV_PMODE_INTFR_2MV_FIELD, 1, 0, 1 },
-        { MV_PMODE_INTFR_2MV_FIELD, 0, 0, 0 },
+        { MV_PMODE_INTFR_2MV_FIELD, 1, 0, 0 },
         { MV_PMODE_INTFR_4MV      , 0, 0, 1 },
         { MV_PMODE_INTFR_4MV      , 1, 0, 1 },
         { MV_PMODE_INTFR_4MV      , 0, 0, 0 },
         { MV_PMODE_INTFR_4MV_FIELD, 0, 0, 1 },
         { MV_PMODE_INTFR_4MV_FIELD, 1, 0, 1 },
-        { MV_PMODE_INTFR_4MV_FIELD, 0, 0, 0 },
+        { MV_PMODE_INTFR_4MV_FIELD, 1, 0, 0 },
         { MV_PMODE_INTFR_INTRA    , 0, 0, 0 }
     }
 };
@@ -1019,21 +1019,21 @@ const uint8_t ff_vc1_mv_diff_bits[4][73] = {
 /* DC differentials low+hi-mo, p217 are the same as in msmpeg4data .h */
 
 /* Table 232 */
-const int8_t ff_vc1_simple_progressive_4x4_zz [16] = {
+const uint8_t ff_vc1_simple_progressive_4x4_zz [16] = {
      0,     8,    16,     1,
      9,    24,    17,     2,
     10,    18,    25,     3,
     11,    26,    19,    27
 };
 
-const int8_t ff_vc1_adv_progressive_8x4_zz [32] = { /* Table 233 */
+const uint8_t ff_vc1_adv_progressive_8x4_zz [32] = { /* Table 233 */
      0,     8,     1,    16,     2,     9,    10,     3,
     24,    17,     4,    11,    18,    12,     5,    19,
     25,    13,    20,    26,    27,     6,    21,    28,
     14,    22,    29,     7,    30,    15,    23,    31
 };
 
-const int8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
+const uint8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
      0,     1,     8,     2,
      9,    16,    17,    24,
     10,    32,    25,    18,
@@ -1044,7 +1044,7 @@ const int8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
     35,    43,    51,    59
 };
 
-const int8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
+const uint8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
      0,     8,     1,    16,    24,     9,     2,    32,
     40,    48,    56,    17,    10,     3,    25,    18,
     11,     4,    33,    41,    49,    57,    26,    34,
@@ -1055,14 +1055,14 @@ const int8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
     61,    62,    54,    46,    39,    47,    55,    63
 };
 
-const int8_t ff_vc1_adv_interlaced_8x4_zz [32] = { /* Table 236 */
+const uint8_t ff_vc1_adv_interlaced_8x4_zz [32] = { /* Table 236 */
      0,     8,    16,    24,     1,     9,     2,    17,
     25,    10,     3,    18,    26,     4,    11,    19,
     12,     5,    13,    20,    27,     6,    21,    28,
     14,    22,    29,     7,    30,    15,    23,    31
 };
 
-const int8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
+const uint8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
      0,     1,     2,     8,
     16,     9,    24,    17,
     10,     3,    32,    40,
@@ -1073,7 +1073,7 @@ const int8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
     35,    43,    51,    59
 };
 
-const int8_t ff_vc1_adv_interlaced_4x4_zz [16] = { /* Table 238 */
+const uint8_t ff_vc1_adv_interlaced_4x4_zz [16] = { /* Table 238 */
      0,     8,    16,    24,
      1,     9,    17,     2,
     25,    10,    18,     3,
@@ -1090,7 +1090,7 @@ const int32_t ff_vc1_dqscale[63] = {
      0x1F08,  0x1E1E,  0x1D42,  0x1C72, 0x1BAD, 0x1AF3, 0x1A42, 0x199A,
      0x18FA,  0x1862,  0x17D0,  0x1746, 0x16C1, 0x1643, 0x15CA, 0x1555,
      0x14E6,  0x147B,  0x1414,  0x13B1, 0x1352, 0x12F7, 0x129E, 0x1249,
-     0x11F7,  0x11A8,  0x115B,  0x1111, 0x10C9, 0x1084, 0x1000
+     0x11F7,  0x11A8,  0x115B,  0x1111, 0x10C9, 0x1084, 0x1041
 };
 
 /* P Interlaced field picture MV predictor scaling values (Table 114) */
diff --git a/libavcodec/vc1data.h b/libavcodec/vc1data.h
index 79a434f..90dd8ba 100644
--- a/libavcodec/vc1data.h
+++ b/libavcodec/vc1data.h
@@ -3,20 +3,20 @@
  * copyright (c) 2006 Konstantin Shishkov
  * (c) 2005 anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -183,15 +183,15 @@ extern const uint8_t ff_vc1_2ref_mvdata_bits[8][126];
 /* DC differentials low+hi-mo, p217 are the same as in msmpeg4data .h */
 
 /* Scantables/ZZ scan are at 11.9 (p262) and 8.1.1.12 (p10) */
-extern const int8_t ff_vc1_simple_progressive_4x4_zz [16];
-extern const int8_t ff_vc1_adv_progressive_8x4_zz [32];
-extern const int8_t ff_vc1_adv_progressive_4x8_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_8x8_zz [64];
-extern const int8_t ff_vc1_adv_interlaced_8x4_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_4x8_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_4x4_zz [16];
-extern const int8_t ff_vc1_intra_horz_8x8_zz [64];
-extern const int8_t ff_vc1_intra_vert_8x8_zz [64];
+extern const uint8_t ff_vc1_simple_progressive_4x4_zz [16];
+extern const uint8_t ff_vc1_adv_progressive_8x4_zz [32];
+extern const uint8_t ff_vc1_adv_progressive_4x8_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_8x8_zz [64];
+extern const uint8_t ff_vc1_adv_interlaced_8x4_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_4x8_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_4x4_zz [16];
+extern const uint8_t ff_vc1_intra_horz_8x8_zz [64];
+extern const uint8_t ff_vc1_intra_vert_8x8_zz [64];
 
 /* DQScale as specified in 8.1.3.9 - almost identical to 0x40000/i */
 extern const int32_t ff_vc1_dqscale[63];
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 5005a21..9519864 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,8 @@
 #include "profiles.h"
 #include "vc1.h"
 #include "vc1data.h"
+#include "libavutil/avassert.h"
+
 
 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
 
@@ -99,7 +101,7 @@ static void vc1_sprite_parse_transform(GetBitContext* gb, int c[7])
         c[6] = 1 << 16;
 }
 
-static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
+static int vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
 {
     AVCodecContext *avctx = v->s.avctx;
     int sprite, i;
@@ -143,7 +145,7 @@ static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
         sd->effect_pcount2 = get_bits(gb, 16);
         if (sd->effect_pcount2 > 10) {
             av_log(avctx, AV_LOG_ERROR, "Too many effect parameters\n");
-            return;
+            return AVERROR_INVALIDDATA;
         } else if (sd->effect_pcount2) {
             i = -1;
             av_log(avctx, AV_LOG_DEBUG, "Effect params 2: ");
@@ -160,10 +162,14 @@ static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
         av_log(avctx, AV_LOG_DEBUG, "Effect flag set\n");
 
     if (get_bits_count(gb) >= gb->size_in_bits +
-       (avctx->codec_id == AV_CODEC_ID_WMV3IMAGE ? 64 : 0))
+       (avctx->codec_id == AV_CODEC_ID_WMV3IMAGE ? 64 : 0)) {
         av_log(avctx, AV_LOG_ERROR, "Buffer overrun\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (get_bits_count(gb) < gb->size_in_bits - 8)
         av_log(avctx, AV_LOG_WARNING, "Buffer not fully read\n");
+
+    return 0;
 }
 
 static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
@@ -175,7 +181,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
     int ysub[2];
     MpegEncContext *s = &v->s;
 
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i <= v->two_sprites; i++) {
         xoff[i] = av_clip(sd->coefs[i][2], 0, v->sprite_width-1 << 16);
         xadv[i] = sd->coefs[i][0];
         if (xadv[i] != 1<<16 || (v->sprite_width << 16) - (v->output_width << 16) - xoff[i])
@@ -186,7 +192,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
     }
     alpha = av_clip_uint16(sd->coefs[1][6]);
 
-    for (plane = 0; plane < (s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++) {
+    for (plane = 0; plane < (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++) {
         int width = v->output_width>>!!plane;
 
         for (row = 0; row < v->output_height>>!!plane; row++) {
@@ -253,7 +259,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
         }
 
         if (!plane) {
-            for (i = 0; i < 2; i++) {
+            for (i = 0; i <= v->two_sprites; i++) {
                 xoff[i] >>= 1;
                 yoff[i] >>= 1;
             }
@@ -265,15 +271,20 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
 
 static int vc1_decode_sprites(VC1Context *v, GetBitContext* gb)
 {
+    int ret;
     MpegEncContext *s     = &v->s;
     AVCodecContext *avctx = s->avctx;
     SpriteData sd;
 
-    vc1_parse_sprites(v, gb, &sd);
+    memset(&sd, 0, sizeof(sd));
+
+    ret = vc1_parse_sprites(v, gb, &sd);
+    if (ret < 0)
+        return ret;
 
     if (!s->current_picture.f || !s->current_picture.f->data[0]) {
         av_log(avctx, AV_LOG_ERROR, "Got no sprites\n");
-        return -1;
+        return AVERROR_UNKNOWN;
     }
 
     if (v->two_sprites && (!s->last_picture_ptr || !s->last_picture.f->data[0])) {
@@ -282,10 +293,8 @@ static int vc1_decode_sprites(VC1Context *v, GetBitContext* gb)
     }
 
     av_frame_unref(v->sprite_output_frame);
-    if (ff_get_buffer(avctx, v->sprite_output_frame, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, v->sprite_output_frame, 0)) < 0)
+        return ret;
 
     vc1_draw_sprites(v, &sd);
 
@@ -304,7 +313,7 @@ static void vc1_sprite_flush(AVCodecContext *avctx)
        wrong but it looks better than doing nothing. */
 
     if (f && f->data[0])
-        for (plane = 0; plane < (s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++)
+        for (plane = 0; plane < (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++)
             for (i = 0; i < v->sprite_height>>!!plane; i++)
                 memset(f->data[plane] + i * f->linesize[plane],
                        plane ? 128 : 0, f->linesize[plane]);
@@ -331,22 +340,22 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
 
     v->n_allocated_blks = s->mb_width + 2;
     v->block            = av_malloc(sizeof(*v->block) * v->n_allocated_blks);
-    v->cbp_base         = av_malloc(sizeof(v->cbp_base[0]) * 2 * s->mb_stride);
+    v->cbp_base         = av_malloc(sizeof(v->cbp_base[0]) * 3 * s->mb_stride);
     if (!v->block || !v->cbp_base)
         goto error;
-    v->cbp              = v->cbp_base + s->mb_stride;
-    v->ttblk_base       = av_malloc(sizeof(v->ttblk_base[0]) * 2 * s->mb_stride);
+    v->cbp              = v->cbp_base + 2 * s->mb_stride;
+    v->ttblk_base       = av_malloc(sizeof(v->ttblk_base[0]) * 3 * s->mb_stride);
     if (!v->ttblk_base)
         goto error;
-    v->ttblk            = v->ttblk_base + s->mb_stride;
-    v->is_intra_base    = av_mallocz(sizeof(v->is_intra_base[0]) * 2 * s->mb_stride);
+    v->ttblk            = v->ttblk_base + 2 * s->mb_stride;
+    v->is_intra_base    = av_mallocz(sizeof(v->is_intra_base[0]) * 3 * s->mb_stride);
     if (!v->is_intra_base)
         goto error;
-    v->is_intra         = v->is_intra_base + s->mb_stride;
-    v->luma_mv_base     = av_malloc(sizeof(v->luma_mv_base[0]) * 2 * s->mb_stride);
+    v->is_intra         = v->is_intra_base + 2 * s->mb_stride;
+    v->luma_mv_base     = av_mallocz(sizeof(v->luma_mv_base[0]) * 3 * s->mb_stride);
     if (!v->luma_mv_base)
         goto error;
-    v->luma_mv          = v->luma_mv_base + s->mb_stride;
+    v->luma_mv          = v->luma_mv_base + 2 * s->mb_stride;
 
     /* allocate block type info in that way so it could be used with s->block_index[] */
     v->mb_type_base = av_malloc(s->b8_stride * (mb_height * 2 + 1) + s->mb_stride * (mb_height + 1) * 2);
@@ -373,11 +382,9 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
     v->mv_f_next[1]     = v->mv_f_next[0] + (s->b8_stride * (mb_height * 2 + 1) + s->mb_stride * (mb_height + 1) * 2);
 
     if (s->avctx->codec_id == AV_CODEC_ID_WMV3IMAGE || s->avctx->codec_id == AV_CODEC_ID_VC1IMAGE) {
-        for (i = 0; i < 4; i++) {
-            v->sr_rows[i >> 1][i & 1] = av_malloc(v->output_width);
-            if (!v->sr_rows[i >> 1][i & 1])
-                goto error;
-        }
+        for (i = 0; i < 4; i++)
+            if (!(v->sr_rows[i >> 1][i & 1] = av_malloc(v->output_width)))
+                return AVERROR(ENOMEM);
     }
 
     ret = ff_intrax8_common_init(s->avctx, &v->x8, &s->idsp,
@@ -397,7 +404,7 @@ av_cold void ff_vc1_init_transposed_scantables(VC1Context *v)
 {
     int i;
     for (i = 0; i < 64; i++) {
-#define transpose(x) ((x >> 3) | ((x & 7) << 3))
+#define transpose(x) (((x) >> 3) | (((x) & 7) << 3))
         v->zz_8x8[0][i] = transpose(ff_wmv1_scantable[0][i]);
         v->zz_8x8[1][i] = transpose(ff_wmv1_scantable[1][i]);
         v->zz_8x8[2][i] = transpose(ff_wmv1_scantable[2][i]);
@@ -417,6 +424,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
     VC1Context *v = avctx->priv_data;
     MpegEncContext *s = &v->s;
     GetBitContext gb;
+    int ret;
 
     /* save the container output size for WMImage */
     v->output_width  = avctx->width;
@@ -424,17 +432,10 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 
     if (!avctx->extradata_size || !avctx->extradata)
         return -1;
-    if (!(avctx->flags & AV_CODEC_FLAG_GRAY))
-        avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-    else
-        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
     v->s.avctx = avctx;
 
-    if (ff_vc1_init_common(v) < 0)
-        return -1;
-    ff_blockdsp_init(&s->bdsp);
-    ff_h264chroma_init(&v->h264chroma, 8);
-    ff_qpeldsp_init(&s->qdsp);
+    if ((ret = ff_vc1_init_common(v)) < 0)
+        return ret;
 
     if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) {
         int count = 0;
@@ -446,8 +447,8 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 
         init_get_bits(&gb, avctx->extradata, avctx->extradata_size*8);
 
-        if (ff_vc1_decode_sequence_header(avctx, v, &gb) < 0)
-          return -1;
+        if ((ret = ff_vc1_decode_sequence_header(avctx, v, &gb)) < 0)
+          return ret;
 
         count = avctx->extradata_size*8 - get_bits_count(&gb);
         if (count > 0) {
@@ -470,6 +471,9 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
         }
 
         buf2  = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!buf2)
+            return AVERROR(ENOMEM);
+
         start = find_next_marker(start, end); // in WVC1 extradata first byte is its size, but can be 0 in mkv
         next  = start;
         for (; next < end; start = next) {
@@ -481,16 +485,16 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
             init_get_bits(&gb, buf2, buf2_size * 8);
             switch (AV_RB32(start)) {
             case VC1_CODE_SEQHDR:
-                if (ff_vc1_decode_sequence_header(avctx, v, &gb) < 0) {
+                if ((ret = ff_vc1_decode_sequence_header(avctx, v, &gb)) < 0) {
                     av_free(buf2);
-                    return -1;
+                    return ret;
                 }
                 seq_initialized = 1;
                 break;
             case VC1_CODE_ENTRYPOINT:
-                if (ff_vc1_decode_entry_point(avctx, v, &gb) < 0) {
+                if ((ret = ff_vc1_decode_entry_point(avctx, v, &gb)) < 0) {
                     av_free(buf2);
-                    return -1;
+                    return ret;
                 }
                 ep_initialized = 1;
                 break;
@@ -504,14 +508,38 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
         v->res_sprite = (avctx->codec_id == AV_CODEC_ID_VC1IMAGE);
     }
 
-    v->sprite_output_frame = av_frame_alloc();
-    if (!v->sprite_output_frame)
-        return AVERROR(ENOMEM);
-
     avctx->profile = v->profile;
     if (v->profile == PROFILE_ADVANCED)
         avctx->level = v->level;
 
+    if (!CONFIG_GRAY || !(avctx->flags & AV_CODEC_FLAG_GRAY))
+        avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
+    else {
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        if (avctx->color_range == AVCOL_RANGE_UNSPECIFIED)
+            avctx->color_range = AVCOL_RANGE_MPEG;
+    }
+
+    // ensure static VLC tables are initialized
+    if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
+        return ret;
+    if ((ret = ff_vc1_decode_init_alloc_tables(v)) < 0)
+        return ret;
+    // Hack to ensure the above functions will be called
+    // again once we know all necessary settings.
+    // That this is necessary might indicate a bug.
+    ff_vc1_decode_end(avctx);
+
+    ff_blockdsp_init(&s->bdsp, avctx);
+    ff_h264chroma_init(&v->h264chroma, 8);
+    ff_qpeldsp_init(&s->qdsp);
+
+    // Must happen after calling ff_vc1_decode_end
+    // to avoid de-allocating the sprite_output_frame
+    v->sprite_output_frame = av_frame_alloc();
+    if (!v->sprite_output_frame)
+        return AVERROR(ENOMEM);
+
     avctx->has_b_frames = !!avctx->max_b_frames;
 
     if (v->color_prim == 1 || v->color_prim == 5 || v->color_prim == 6)
@@ -544,6 +572,11 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
             v->sprite_height > 1 << 14 ||
             v->output_width  > 1 << 14 ||
             v->output_height > 1 << 14) return -1;
+
+        if ((v->sprite_width&1) || (v->sprite_height&1)) {
+            avpriv_request_sample(avctx, "odd sprites support");
+            return AVERROR_PATCHWELCOME;
+        }
     }
     return 0;
 }
@@ -595,14 +628,21 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     MpegEncContext *s = &v->s;
     AVFrame *pict = data;
     uint8_t *buf2 = NULL;
-    const uint8_t *buf_start = buf;
-    int mb_height, n_slices1;
+    const uint8_t *buf_start = buf, *buf_start_second_field = NULL;
+    int mb_height, n_slices1=-1;
     struct {
         uint8_t *buf;
         GetBitContext gb;
         int mby_start;
+        const uint8_t *rawbuf;
+        int raw_size;
     } *slices = NULL, *tmp;
 
+    v->second_field = 0;
+
+    if(s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)
+        s->low_delay = 1;
+
     /* no supplementary picture */
     if (buf_size == 0 || (buf_size == 4 && AV_RB32(buf) == VC1_CODE_ENDOFSEQ)) {
         /* special case for last picture */
@@ -614,13 +654,15 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             *got_frame = 1;
         }
 
-        return 0;
+        return buf_size;
     }
 
     //for advanced profile we may need to parse and unescape data
     if (avctx->codec_id == AV_CODEC_ID_VC1 || avctx->codec_id == AV_CODEC_ID_VC1IMAGE) {
         int buf_size2 = 0;
         buf2 = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!buf2)
+            return AVERROR(ENOMEM);
 
         if (IS_MARKER(AV_RB32(buf))) { /* frame starts with marker and needs to be parsed */
             const uint8_t *start, *end, *next;
@@ -639,20 +681,26 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 case VC1_CODE_FIELD: {
                     int buf_size3;
-                    tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                    if (!tmp)
+                    if (avctx->hwaccel)
+                        buf_start_second_field = start;
+                    tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                    if (!tmp) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     slices = tmp;
                     slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                    if (!slices[n_slices].buf)
+                    if (!slices[n_slices].buf) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     buf_size3 = vc1_unescape_buffer(start + 4, size,
                                                     slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
-                    /* assuming that the field marker is at the exact middle,
-                       hope it's correct */
-                    slices[n_slices].mby_start = s->mb_height + 1 >> 1;
+                    slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
+                    slices[n_slices].rawbuf = start;
+                    slices[n_slices].raw_size = size + 4;
                     n_slices1 = n_slices - 1; // index of the last slice of the first field
                     n_slices++;
                     break;
@@ -664,18 +712,24 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 case VC1_CODE_SLICE: {
                     int buf_size3;
-                    tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                    if (!tmp)
+                    tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                    if (!tmp) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     slices = tmp;
                     slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                    if (!slices[n_slices].buf)
+                    if (!slices[n_slices].buf) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     buf_size3 = vc1_unescape_buffer(start + 4, size,
                                                     slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
+                    slices[n_slices].rawbuf = start;
+                    slices[n_slices].raw_size = size + 4;
                     n_slices++;
                     break;
                 }
@@ -688,19 +742,28 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             divider = find_next_marker(buf, buf + buf_size);
             if ((divider == (buf + buf_size)) || AV_RB32(divider) != VC1_CODE_FIELD) {
                 av_log(avctx, AV_LOG_ERROR, "Error in WVC1 interlaced frame\n");
+                ret = AVERROR_INVALIDDATA;
                 goto err;
             } else { // found field marker, unescape second field
-                tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                if (!tmp)
+                if (avctx->hwaccel)
+                    buf_start_second_field = divider;
+                tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                if (!tmp) {
+                    ret = AVERROR(ENOMEM);
                     goto err;
+                }
                 slices = tmp;
                 slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                if (!slices[n_slices].buf)
+                if (!slices[n_slices].buf) {
+                    ret = AVERROR(ENOMEM);
                     goto err;
+                }
                 buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                               buf_size3 << 3);
                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
+                slices[n_slices].rawbuf = divider;
+                slices[n_slices].raw_size = buf + buf_size - divider;
                 n_slices1 = n_slices - 1;
                 n_slices++;
             }
@@ -737,9 +800,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (!s->context_initialized) {
-        if (ff_msmpeg4_decode_init(avctx) < 0)
+        if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
             goto err;
-        if (ff_vc1_decode_init_alloc_tables(v) < 0) {
+        if ((ret = ff_vc1_decode_init_alloc_tables(v)) < 0) {
             ff_mpv_common_end(s);
             goto err;
         }
@@ -747,6 +810,10 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
         s->low_delay = !avctx->has_b_frames || v->res_sprite;
 
         if (v->profile == PROFILE_ADVANCED) {
+            if(avctx->coded_width<=1 || avctx->coded_height<=1) {
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
             s->h_edge_pos = avctx->coded_width;
             s->v_edge_pos = avctx->coded_height;
         }
@@ -756,19 +823,29 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     v->pic_header_flag = 0;
     v->first_pic_header_flag = 1;
     if (v->profile < PROFILE_ADVANCED) {
-        if (ff_vc1_parse_frame_header(v, &s->gb) < 0) {
+        if ((ret = ff_vc1_parse_frame_header(v, &s->gb)) < 0) {
             goto err;
         }
     } else {
-        if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+        if ((ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
             goto err;
         }
     }
     v->first_pic_header_flag = 0;
 
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(v->s.avctx, AV_LOG_DEBUG, "pict_type: %c\n", av_get_picture_type_char(s->pict_type));
+
     if ((avctx->codec_id == AV_CODEC_ID_WMV3IMAGE || avctx->codec_id == AV_CODEC_ID_VC1IMAGE)
         && s->pict_type != AV_PICTURE_TYPE_I) {
         av_log(v->s.avctx, AV_LOG_ERROR, "Sprite decoder: expected I-frame\n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    if ((s->mb_height >> v->field_mode) == 0) {
+        av_log(v->s.avctx, AV_LOG_ERROR, "image too short\n");
+        ret = AVERROR_INVALIDDATA;
         goto err;
     }
 
@@ -778,6 +855,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
 
     /* skip B-frames if we don't have reference frames */
     if (!s->last_picture_ptr && (s->pict_type == AV_PICTURE_TYPE_B || s->droppable)) {
+        av_log(v->s.avctx, AV_LOG_DEBUG, "Skipping B frame without reference frames\n");
         goto end;
     }
     if ((avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type == AV_PICTURE_TYPE_B) ||
@@ -793,10 +871,14 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             s->next_p_frame_damaged = 0;
     }
 
-    if (ff_mpv_frame_start(s, avctx) < 0) {
+    if ((ret = ff_mpv_frame_start(s, avctx)) < 0) {
         goto err;
     }
 
+    v->s.current_picture_ptr->field_picture = v->field_mode;
+    v->s.current_picture_ptr->f->interlaced_frame = (v->fcm != PROGRESSIVE);
+    v->s.current_picture_ptr->f->top_field_first  = v->tff;
+
     // process pulldown flags
     s->current_picture_ptr->f->repeat_pict = 0;
     // Pulldown flags are only valid when 'broadcast' has been set.
@@ -813,12 +895,127 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     s->me.qpel_avg = s->qdsp.avg_qpel_pixels_tab;
 
     if (avctx->hwaccel) {
-        if (avctx->hwaccel->start_frame(avctx, buf, buf_size) < 0)
-            goto err;
-        if (avctx->hwaccel->decode_slice(avctx, buf_start, (buf + buf_size) - buf_start) < 0)
-            goto err;
-        if (avctx->hwaccel->end_frame(avctx) < 0)
-            goto err;
+        s->mb_y = 0;
+        if (v->field_mode && buf_start_second_field) {
+            // decode first field
+            s->picture_structure = PICT_BOTTOM_FIELD - v->tff;
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start, buf_start_second_field - buf_start)) < 0)
+                goto err;
+
+            if (n_slices1 == -1) {
+                // no slices, decode the field as-is
+                if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, buf_start_second_field - buf_start)) < 0)
+                    goto err;
+            } else {
+                if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, slices[0].rawbuf - buf_start)) < 0)
+                    goto err;
+
+                for (i = 0 ; i < n_slices1 + 1; i++) {
+                    s->gb = slices[i].gb;
+                    s->mb_y = slices[i].mby_start;
+
+                    v->pic_header_flag = get_bits1(&s->gb);
+                    if (v->pic_header_flag) {
+                        if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+                            av_log(v->s.avctx, AV_LOG_ERROR, "Slice header damaged\n");
+                            ret = AVERROR_INVALIDDATA;
+                            if (avctx->err_recognition & AV_EF_EXPLODE)
+                                goto err;
+                            continue;
+                        }
+                    }
+
+                    if ((ret = avctx->hwaccel->decode_slice(avctx, slices[i].rawbuf, slices[i].raw_size)) < 0)
+                        goto err;
+                }
+            }
+
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+
+            // decode second field
+            s->gb = slices[n_slices1 + 1].gb;
+            s->mb_y = slices[n_slices1 + 1].mby_start;
+            s->picture_structure = PICT_TOP_FIELD + v->tff;
+            v->second_field = 1;
+            v->pic_header_flag = 0;
+            if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "parsing header for second field failed");
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+            v->s.current_picture_ptr->f->pict_type = v->s.pict_type;
+
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start_second_field, (buf + buf_size) - buf_start_second_field)) < 0)
+                goto err;
+
+            if (n_slices - n_slices1 == 2) {
+                // no slices, decode the field as-is
+                if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start_second_field, (buf + buf_size) - buf_start_second_field)) < 0)
+                    goto err;
+            } else {
+                if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start_second_field, slices[n_slices1 + 2].rawbuf - buf_start_second_field)) < 0)
+                    goto err;
+
+                for (i = n_slices1 + 2; i < n_slices; i++) {
+                    s->gb = slices[i].gb;
+                    s->mb_y = slices[i].mby_start;
+
+                    v->pic_header_flag = get_bits1(&s->gb);
+                    if (v->pic_header_flag) {
+                        if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+                            av_log(v->s.avctx, AV_LOG_ERROR, "Slice header damaged\n");
+                            ret = AVERROR_INVALIDDATA;
+                            if (avctx->err_recognition & AV_EF_EXPLODE)
+                                goto err;
+                            continue;
+                        }
+                    }
+
+                    if ((ret = avctx->hwaccel->decode_slice(avctx, slices[i].rawbuf, slices[i].raw_size)) < 0)
+                        goto err;
+                }
+            }
+
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+        } else {
+            s->picture_structure = PICT_FRAME;
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start, (buf + buf_size) - buf_start)) < 0)
+                goto err;
+
+            if (n_slices == 0) {
+                // no slices, decode the frame as-is
+                if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, (buf + buf_size) - buf_start)) < 0)
+                    goto err;
+            } else {
+                // decode the frame part as the first slice
+                if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, slices[0].rawbuf - buf_start)) < 0)
+                    goto err;
+
+                // and process the slices as additional slices afterwards
+                for (i = 0 ; i < n_slices; i++) {
+                    s->gb = slices[i].gb;
+                    s->mb_y = slices[i].mby_start;
+
+                    v->pic_header_flag = get_bits1(&s->gb);
+                    if (v->pic_header_flag) {
+                        if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+                            av_log(v->s.avctx, AV_LOG_ERROR, "Slice header damaged\n");
+                            ret = AVERROR_INVALIDDATA;
+                            if (avctx->err_recognition & AV_EF_EXPLODE)
+                                goto err;
+                            continue;
+                        }
+                    }
+
+                    if ((ret = avctx->hwaccel->decode_slice(avctx, slices[i].rawbuf, slices[i].raw_size)) < 0)
+                        goto err;
+                }
+            }
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+        }
     } else {
         int header_ret = 0;
 
@@ -835,10 +1032,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
         }
         mb_height = s->mb_height >> v->field_mode;
 
-        if (!mb_height) {
-            av_log(v->s.avctx, AV_LOG_ERROR, "Invalid mb_height.\n");
-            goto err;
-        }
+        av_assert0 (mb_height > 0);
 
         for (i = 0; i <= n_slices; i++) {
             if (i > 0 &&  slices[i - 1].mby_start >= mb_height) {
@@ -849,7 +1043,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     continue;
                 }
                 v->second_field = 1;
-                v->blocks_off   = s->mb_width  * s->mb_height << 1;
+                av_assert0((s->mb_height & 1) == 0);
+                v->blocks_off   = s->b8_stride * (s->mb_height&~1);
                 v->mb_off       = s->mb_stride * s->mb_height >> 1;
             } else {
                 v->second_field = 0;
@@ -861,6 +1056,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 if (v->field_mode && i == n_slices1 + 2) {
                     if ((header_ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
                         av_log(v->s.avctx, AV_LOG_ERROR, "Field header damaged\n");
+                        ret = AVERROR_INVALIDDATA;
                         if (avctx->err_recognition & AV_EF_EXPLODE)
                             goto err;
                         continue;
@@ -869,6 +1065,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     v->pic_header_flag = 1;
                     if ((header_ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
                         av_log(v->s.avctx, AV_LOG_ERROR, "Slice header damaged\n");
+                        ret = AVERROR_INVALIDDATA;
                         if (avctx->err_recognition & AV_EF_EXPLODE)
                             goto err;
                         continue;
@@ -880,14 +1077,23 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             s->start_mb_y = (i == 0) ? 0 : FFMAX(0, slices[i-1].mby_start % mb_height);
             if (!v->field_mode || v->second_field)
                 s->end_mb_y = (i == n_slices     ) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
-            else
-                s->end_mb_y = (i <= n_slices1 + 1) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
-
+            else {
+                if (i >= n_slices) {
+                    av_log(v->s.avctx, AV_LOG_ERROR, "first field slice count too large\n");
+                    continue;
+                }
+                s->end_mb_y = (i == n_slices1 + 1) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
+            }
             if (s->end_mb_y <= s->start_mb_y) {
-                av_log(v->s.avctx, AV_LOG_ERROR, "Invalid slice size\n");
-                goto err;
+                av_log(v->s.avctx, AV_LOG_ERROR, "end mb y %d %d invalid\n", s->end_mb_y, s->start_mb_y);
+                continue;
+            }
+            if (((s->pict_type == AV_PICTURE_TYPE_P && !v->p_frame_skipped) ||
+                 (s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type)) &&
+                !v->cbpcy_vlc) {
+                av_log(v->s.avctx, AV_LOG_ERROR, "missing cbpcy_vlc\n");
+                continue;
             }
-
             ff_vc1_decode_blocks(v);
             if (i != n_slices)
                 s->gb = slices[i].gb;
@@ -908,6 +1114,10 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 get_bits_count(&s->gb), s->gb.size_in_bits);
 //  if (get_bits_count(&s->gb) > buf_size * 8)
 //      return -1;
+        if(s->er.error_occurred && s->pict_type == AV_PICTURE_TYPE_B) {
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
         if (!v->field_mode)
             ff_er_frame_end(&s->er);
     }
@@ -921,7 +1131,7 @@ image:
         if (avctx->skip_frame >= AVDISCARD_NONREF)
             goto end;
 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
-        if (vc1_decode_sprites(v, &s->gb))
+        if ((ret = vc1_decode_sprites(v, &s->gb)) < 0)
             goto err;
 #endif
         if ((ret = av_frame_ref(pict, v->sprite_output_frame)) < 0)
@@ -931,12 +1141,12 @@ image:
         if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
             if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
                 goto err;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
             *got_frame = 1;
         } else if (s->last_picture_ptr) {
             if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
                 goto err;
-            ff_print_debug_info(s, s->last_picture_ptr);
+            ff_print_debug_info(s, s->last_picture_ptr, pict);
             *got_frame = 1;
         }
     }
@@ -953,7 +1163,7 @@ err:
     for (i = 0; i < n_slices; i++)
         av_free(slices[i].buf);
     av_free(slices);
-    return -1;
+    return ret;
 }
 
 
@@ -965,6 +1175,9 @@ static const enum AVPixelFormat vc1_hwaccel_pixfmt_list_420[] = {
     AV_PIX_FMT_D3D11VA_VLD,
     AV_PIX_FMT_D3D11,
 #endif
+#if CONFIG_VC1_NVDEC_HWACCEL
+    AV_PIX_FMT_CUDA,
+#endif
 #if CONFIG_VC1_VAAPI_HWACCEL
     AV_PIX_FMT_VAAPI,
 #endif
@@ -997,6 +1210,9 @@ AVCodec ff_vc1_decoder = {
 #if CONFIG_VC1_D3D11VA2_HWACCEL
                         HWACCEL_D3D11VA2(vc1),
 #endif
+#if CONFIG_VC1_NVDEC_HWACCEL
+                        HWACCEL_NVDEC(vc1),
+#endif
 #if CONFIG_VC1_VAAPI_HWACCEL
                         HWACCEL_VAAPI(vc1),
 #endif
@@ -1031,6 +1247,9 @@ AVCodec ff_wmv3_decoder = {
 #if CONFIG_WMV3_D3D11VA2_HWACCEL
                         HWACCEL_D3D11VA2(wmv3),
 #endif
+#if CONFIG_WMV3_NVDEC_HWACCEL
+                        HWACCEL_NVDEC(wmv3),
+#endif
 #if CONFIG_WMV3_VAAPI_HWACCEL
                         HWACCEL_VAAPI(wmv3),
 #endif
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 571309b..778b811 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,12 @@
  * VC-1 and WMV3 decoder
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "h264chroma.h"
 #include "qpeldsp.h"
+#include "rnd_avg.h"
 #include "vc1dsp.h"
 #include "startcode.h"
 
@@ -104,12 +107,13 @@ static void vc1_v_s_overlap_c(int16_t *top, int16_t *bottom)
     }
 }
 
-static void vc1_h_s_overlap_c(int16_t *left, int16_t *right)
+static void vc1_h_s_overlap_c(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
 {
     int i;
     int a, b, c, d;
     int d1, d2;
-    int rnd1 = 4, rnd2 = 3;
+    int rnd1 = flags & 2 ? 3 : 4;
+    int rnd2 = 7 - rnd1;
     for (i = 0; i < 8; i++) {
         a  = left[6];
         b  = left[7];
@@ -123,10 +127,12 @@ static void vc1_h_s_overlap_c(int16_t *left, int16_t *right)
         right[0] = ((c << 3) + d2 + rnd1) >> 3;
         right[1] = ((d << 3) + d1 + rnd2) >> 3;
 
-        right += 8;
-        left  += 8;
-        rnd2   = 7 - rnd2;
-        rnd1   = 7 - rnd1;
+        right += right_stride;
+        left  += left_stride;
+        if (flags & 1) {
+            rnd2   = 7 - rnd2;
+            rnd1   = 7 - rnd1;
+        }
     }
 }
 
@@ -581,10 +587,10 @@ static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
 }
 
 /* Function used to do motion compensation with bicubic interpolation */
-#define VC1_MSPEL_MC(OP, OPNAME)                                              \
+#define VC1_MSPEL_MC(OP, OP4, OPNAME)                                         \
 static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
                                                     const uint8_t *src,       \
-                                                    int stride,               \
+                                                    ptrdiff_t stride,         \
                                                     int hmode,                \
                                                     int vmode,                \
                                                     int rnd)                  \
@@ -639,13 +645,93 @@ static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
         dst += stride;                                                        \
         src += stride;                                                        \
     }                                                                         \
+}\
+static av_always_inline void OPNAME ## vc1_mspel_mc_16(uint8_t *dst,          \
+                                                       const uint8_t *src,    \
+                                                       ptrdiff_t stride,      \
+                                                       int hmode,             \
+                                                       int vmode,             \
+                                                       int rnd)               \
+{                                                                             \
+    int i, j;                                                                 \
+                                                                              \
+    if (vmode) { /* Horizontal filter to apply */                             \
+        int r;                                                                \
+                                                                              \
+        if (hmode) { /* Vertical filter to apply, output to tmp */            \
+            static const int shift_value[] = { 0, 5, 1, 5 };                  \
+            int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;       \
+            int16_t tmp[19 * 16], *tptr = tmp;                                \
+                                                                              \
+            r = (1 << (shift - 1)) + rnd - 1;                                 \
+                                                                              \
+            src -= 1;                                                         \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 19; i++)                                      \
+                    tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode) + r) >> shift; \
+                src  += stride;                                               \
+                tptr += 19;                                                   \
+            }                                                                 \
+                                                                              \
+            r    = 64 - rnd;                                                  \
+            tptr = tmp + 1;                                                   \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 16; i++)                                      \
+                    OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode) + r) >> 7); \
+                dst  += stride;                                               \
+                tptr += 19;                                                   \
+            }                                                                 \
+                                                                              \
+            return;                                                           \
+        } else { /* No horizontal filter, output 8 lines to dst */            \
+            r = 1 - rnd;                                                      \
+                                                                              \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 16; i++)                                      \
+                    OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));  \
+                src += stride;                                                \
+                dst += stride;                                                \
+            }                                                                 \
+            return;                                                           \
+        }                                                                     \
+    }                                                                         \
+                                                                              \
+    /* Horizontal mode with no vertical mode */                               \
+    for (j = 0; j < 16; j++) {                                                \
+        for (i = 0; i < 16; i++)                                              \
+            OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));             \
+        dst += stride;                                                        \
+        src += stride;                                                        \
+    }                                                                         \
+}\
+static void OPNAME ## pixels8x8_c(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd){\
+    int i;\
+    for(i=0; i<8; i++){\
+        OP4(*(uint32_t*)(block  ), AV_RN32(pixels  ));\
+        OP4(*(uint32_t*)(block+4), AV_RN32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## pixels16x16_c(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd){\
+    int i;\
+    for(i=0; i<16; i++){\
+        OP4(*(uint32_t*)(block   ), AV_RN32(pixels   ));\
+        OP4(*(uint32_t*)(block+ 4), AV_RN32(pixels+ 4));\
+        OP4(*(uint32_t*)(block+ 8), AV_RN32(pixels+ 8));\
+        OP4(*(uint32_t*)(block+12), AV_RN32(pixels+12));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
 }
 
-#define op_put(a, b) a = av_clip_uint8(b)
-#define op_avg(a, b) a = (a + av_clip_uint8(b) + 1) >> 1
+#define op_put(a, b) (a) = av_clip_uint8(b)
+#define op_avg(a, b) (a) = ((a) + av_clip_uint8(b) + 1) >> 1
+#define op4_avg(a, b) (a) = rnd_avg32(a, b)
+#define op4_put(a, b) (a) = (b)
 
-VC1_MSPEL_MC(op_put, put_)
-VC1_MSPEL_MC(op_avg, avg_)
+VC1_MSPEL_MC(op_put, op4_put, put_)
+VC1_MSPEL_MC(op_avg, op4_avg, avg_)
 
 /* pixel functions - really are entry points to vc1_mspel_mc */
 
@@ -661,6 +747,18 @@ static void avg_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                    \
                                              ptrdiff_t stride, int rnd)       \
 {                                                                             \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
+}                                                                             \
+static void put_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                 \
+                                                const uint8_t *src,           \
+                                                ptrdiff_t stride, int rnd)    \
+{                                                                             \
+    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
+}                                                                             \
+static void avg_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                 \
+                                                const uint8_t *src,           \
+                                                ptrdiff_t stride, int rnd)    \
+{                                                                             \
+    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
 }
 
 PUT_VC1_MSPEL(1, 0)
@@ -682,19 +780,6 @@ PUT_VC1_MSPEL(1, 3)
 PUT_VC1_MSPEL(2, 3)
 PUT_VC1_MSPEL(3, 3)
 
-
-static void put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride, int rnd)
-{
-    ff_put_pixels8x8_c(dst, src, stride);
-}
-
-static void avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride, int rnd)
-{
-    ff_avg_pixels8x8_c(dst, src, stride);
-}
-
 #define chroma_mc(a) \
     ((A * src[a] + B * src[a + 1] + \
       C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
@@ -708,7 +793,7 @@ static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = chroma_mc(0);
@@ -733,7 +818,7 @@ static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = chroma_mc(0);
@@ -756,7 +841,7 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = avg2(dst[0], chroma_mc(0));
@@ -782,7 +867,7 @@ static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
     const int D = (    x) * (    y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = avg2(dst[0], chroma_mc(0));
@@ -877,6 +962,11 @@ static void sprite_v_double_twoscale_c(uint8_t *dst,
 }
 
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+#define FN_ASSIGN(X, Y) \
+    dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = put_vc1_mspel_mc##X##Y##_c; \
+    dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = put_vc1_mspel_mc##X##Y##_16_c; \
+    dsp->avg_vc1_mspel_pixels_tab[1][X+4*Y] = avg_vc1_mspel_mc##X##Y##_c; \
+    dsp->avg_vc1_mspel_pixels_tab[0][X+4*Y] = avg_vc1_mspel_mc##X##Y##_16_c
 
 av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 {
@@ -901,39 +991,28 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
     dsp->vc1_v_loop_filter16  = vc1_v_loop_filter16_c;
     dsp->vc1_h_loop_filter16  = vc1_h_loop_filter16_c;
 
-    dsp->put_vc1_mspel_pixels_tab[0]  = put_vc1_mspel_mc00_c;
-    dsp->put_vc1_mspel_pixels_tab[1]  = put_vc1_mspel_mc10_c;
-    dsp->put_vc1_mspel_pixels_tab[2]  = put_vc1_mspel_mc20_c;
-    dsp->put_vc1_mspel_pixels_tab[3]  = put_vc1_mspel_mc30_c;
-    dsp->put_vc1_mspel_pixels_tab[4]  = put_vc1_mspel_mc01_c;
-    dsp->put_vc1_mspel_pixels_tab[5]  = put_vc1_mspel_mc11_c;
-    dsp->put_vc1_mspel_pixels_tab[6]  = put_vc1_mspel_mc21_c;
-    dsp->put_vc1_mspel_pixels_tab[7]  = put_vc1_mspel_mc31_c;
-    dsp->put_vc1_mspel_pixels_tab[8]  = put_vc1_mspel_mc02_c;
-    dsp->put_vc1_mspel_pixels_tab[9]  = put_vc1_mspel_mc12_c;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_c;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_c;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_c;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_c;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c;
-
-    dsp->avg_vc1_mspel_pixels_tab[0]  = avg_vc1_mspel_mc00_c;
-    dsp->avg_vc1_mspel_pixels_tab[1]  = avg_vc1_mspel_mc10_c;
-    dsp->avg_vc1_mspel_pixels_tab[2]  = avg_vc1_mspel_mc20_c;
-    dsp->avg_vc1_mspel_pixels_tab[3]  = avg_vc1_mspel_mc30_c;
-    dsp->avg_vc1_mspel_pixels_tab[4]  = avg_vc1_mspel_mc01_c;
-    dsp->avg_vc1_mspel_pixels_tab[5]  = avg_vc1_mspel_mc11_c;
-    dsp->avg_vc1_mspel_pixels_tab[6]  = avg_vc1_mspel_mc21_c;
-    dsp->avg_vc1_mspel_pixels_tab[7]  = avg_vc1_mspel_mc31_c;
-    dsp->avg_vc1_mspel_pixels_tab[8]  = avg_vc1_mspel_mc02_c;
-    dsp->avg_vc1_mspel_pixels_tab[9]  = avg_vc1_mspel_mc12_c;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_c;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_c;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_c;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_c;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_c;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_c;
+    dsp->put_vc1_mspel_pixels_tab[0][0] = put_pixels16x16_c;
+    dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_pixels16x16_c;
+    dsp->put_vc1_mspel_pixels_tab[1][0] = put_pixels8x8_c;
+    dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_pixels8x8_c;
+    FN_ASSIGN(0, 1);
+    FN_ASSIGN(0, 2);
+    FN_ASSIGN(0, 3);
+
+    FN_ASSIGN(1, 0);
+    FN_ASSIGN(1, 1);
+    FN_ASSIGN(1, 2);
+    FN_ASSIGN(1, 3);
+
+    FN_ASSIGN(2, 0);
+    FN_ASSIGN(2, 1);
+    FN_ASSIGN(2, 2);
+    FN_ASSIGN(2, 3);
+
+    FN_ASSIGN(3, 0);
+    FN_ASSIGN(3, 1);
+    FN_ASSIGN(3, 2);
+    FN_ASSIGN(3, 3);
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_c;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_c;
@@ -958,4 +1037,6 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
         ff_vc1dsp_init_ppc(dsp);
     if (ARCH_X86)
         ff_vc1dsp_init_x86(dsp);
+    if (ARCH_MIPS)
+        ff_vc1dsp_init_mips(dsp);
 }
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index a9bd712..75db62b 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 #include "hpeldsp.h"
 #include "h264chroma.h"
 
+typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int h);
+
 typedef struct VC1DSPContext {
     /* vc1 functions */
     void (*vc1_inv_trans_8x8)(int16_t *b);
@@ -43,7 +45,7 @@ typedef struct VC1DSPContext {
     void (*vc1_v_overlap)(uint8_t *src, int stride);
     void (*vc1_h_overlap)(uint8_t *src, int stride);
     void (*vc1_v_s_overlap)(int16_t *top,  int16_t *bottom);
-    void (*vc1_h_s_overlap)(int16_t *left, int16_t *right);
+    void (*vc1_h_s_overlap)(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags);
     void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
     void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
     void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
@@ -54,8 +56,8 @@ typedef struct VC1DSPContext {
     /* put 8x8 block with bicubic interpolation and quarterpel precision
      * last argument is actually round value instead of height
      */
-    op_pixels_func put_vc1_mspel_pixels_tab[16];
-    op_pixels_func avg_vc1_mspel_pixels_tab[16];
+    vc1op_pixels_func put_vc1_mspel_pixels_tab[2][16];
+    vc1op_pixels_func avg_vc1_mspel_pixels_tab[2][16];
 
     /* This is really one func used in VC-1 decoding */
     h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
@@ -85,5 +87,6 @@ void ff_vc1dsp_init_aarch64(VC1DSPContext* dsp);
 void ff_vc1dsp_init_arm(VC1DSPContext* dsp);
 void ff_vc1dsp_init_ppc(VC1DSPContext *c);
 void ff_vc1dsp_init_x86(VC1DSPContext* dsp);
+void ff_vc1dsp_init_mips(VC1DSPContext* dsp);
 
 #endif /* AVCODEC_VC1DSP_H */
diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
new file mode 100644
index 0000000..d0101e0
--- /dev/null
+++ b/libavcodec/vc2enc.c
@@ -0,0 +1,1242 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+#include "dirac.h"
+#include "put_bits.h"
+#include "internal.h"
+#include "version.h"
+
+#include "vc2enc_dwt.h"
+#include "diractab.h"
+
+/* The limited size resolution of each slice forces us to do this */
+#define SSIZE_ROUND(b) (FFALIGN((b), s->size_scaler) + 4 + s->prefix_bytes)
+
+/* Decides the cutoff point in # of slices to distribute the leftover bytes */
+#define SLICE_REDIST_TOTAL 150
+
+typedef struct VC2BaseVideoFormat {
+    enum AVPixelFormat pix_fmt;
+    AVRational time_base;
+    int width, height, interlaced, level;
+    const char *name;
+} VC2BaseVideoFormat;
+
+static const VC2BaseVideoFormat base_video_fmts[] = {
+    { 0 }, /* Custom format, here just to make indexing equal to base_vf */
+    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  176,  120, 0, 1,     "QSIF525" },
+    { AV_PIX_FMT_YUV420P,   {    2,    25 },  176,  144, 0, 1,     "QCIF"    },
+    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  352,  240, 0, 1,     "SIF525"  },
+    { AV_PIX_FMT_YUV420P,   {    2,    25 },  352,  288, 0, 1,     "CIF"     },
+    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  704,  480, 0, 1,     "4SIF525" },
+    { AV_PIX_FMT_YUV420P,   {    2,    25 },  704,  576, 0, 1,     "4CIF"    },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  480, 1, 2,   "SD480I-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    25 },  720,  576, 1, 2,   "SD576I-50" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1280,  720, 0, 3,  "HD720P-60"  },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1280,  720, 0, 3,  "HD720P-50"  },
+    { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 1920, 1080, 1, 3,  "HD1080I-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    25 }, 1920, 1080, 1, 3,  "HD1080I-50" },
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1920, 1080, 0, 3,  "HD1080P-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1920, 1080, 0, 3,  "HD1080P-50" },
+
+    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 2048, 1080, 0, 4,        "DC2K" },
+    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 4096, 2160, 0, 5,        "DC4K" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 3840, 2160, 0, 6, "UHDTV 4K-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 3840, 2160, 0, 6, "UHDTV 4K-50" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 7680, 4320, 0, 7, "UHDTV 8K-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 7680, 4320, 0, 7, "UHDTV 8K-50" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 24000 }, 1920, 1080, 0, 3,  "HD1080P-24" },
+    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  486, 1, 2,  "SD Pro486"  },
+};
+static const int base_video_fmts_len = FF_ARRAY_ELEMS(base_video_fmts);
+
+enum VC2_QM {
+    VC2_QM_DEF = 0,
+    VC2_QM_COL,
+    VC2_QM_FLAT,
+
+    VC2_QM_NB
+};
+
+typedef struct SubBand {
+    dwtcoef *buf;
+    ptrdiff_t stride;
+    int width;
+    int height;
+} SubBand;
+
+typedef struct Plane {
+    SubBand band[MAX_DWT_LEVELS][4];
+    dwtcoef *coef_buf;
+    int width;
+    int height;
+    int dwt_width;
+    int dwt_height;
+    ptrdiff_t coef_stride;
+} Plane;
+
+typedef struct SliceArgs {
+    PutBitContext pb;
+    int cache[DIRAC_MAX_QUANT_INDEX];
+    void *ctx;
+    int x;
+    int y;
+    int quant_idx;
+    int bits_ceil;
+    int bits_floor;
+    int bytes;
+} SliceArgs;
+
+typedef struct TransformArgs {
+    void *ctx;
+    Plane *plane;
+    void *idata;
+    ptrdiff_t istride;
+    int field;
+    VC2TransformContext t;
+} TransformArgs;
+
+typedef struct VC2EncContext {
+    AVClass *av_class;
+    PutBitContext pb;
+    Plane plane[3];
+    AVCodecContext *avctx;
+    DiracVersionInfo ver;
+
+    SliceArgs *slice_args;
+    TransformArgs transform_args[3];
+
+    /* For conversion from unsigned pixel values to signed */
+    int diff_offset;
+    int bpp;
+    int bpp_idx;
+
+    /* Picture number */
+    uint32_t picture_number;
+
+    /* Base video format */
+    int base_vf;
+    int level;
+    int profile;
+
+    /* Quantization matrix */
+    uint8_t quant[MAX_DWT_LEVELS][4];
+    int custom_quant_matrix;
+
+    /* Division LUT */
+    uint32_t qmagic_lut[116][2];
+
+    int num_x; /* #slices horizontally */
+    int num_y; /* #slices vertically */
+    int prefix_bytes;
+    int size_scaler;
+    int chroma_x_shift;
+    int chroma_y_shift;
+
+    /* Rate control stuff */
+    int frame_max_bytes;
+    int slice_max_bytes;
+    int slice_min_bytes;
+    int q_ceil;
+    int q_avg;
+
+    /* Options */
+    double tolerance;
+    int wavelet_idx;
+    int wavelet_depth;
+    int strict_compliance;
+    int slice_height;
+    int slice_width;
+    int interlaced;
+    enum VC2_QM quant_matrix;
+
+    /* Parse code state */
+    uint32_t next_parse_offset;
+    enum DiracParseCodes last_parse_code;
+} VC2EncContext;
+
+static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
+{
+    int i;
+    int pbits = 0, bits = 0, topbit = 1, maxval = 1;
+
+    if (!val++) {
+        put_bits(pb, 1, 1);
+        return;
+    }
+
+    while (val > maxval) {
+        topbit <<= 1;
+        maxval <<= 1;
+        maxval |=  1;
+    }
+
+    bits = ff_log2(topbit);
+
+    for (i = 0; i < bits; i++) {
+        topbit >>= 1;
+        pbits <<= 2;
+        if (val & topbit)
+            pbits |= 0x1;
+    }
+
+    put_bits(pb, bits*2 + 1, (pbits << 1) | 1);
+}
+
+static av_always_inline int count_vc2_ue_uint(uint32_t val)
+{
+    int topbit = 1, maxval = 1;
+
+    if (!val++)
+        return 1;
+
+    while (val > maxval) {
+        topbit <<= 1;
+        maxval <<= 1;
+        maxval |=  1;
+    }
+
+    return ff_log2(topbit)*2 + 1;
+}
+
+/* VC-2 10.4 - parse_info() */
+static void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode)
+{
+    uint32_t cur_pos, dist;
+
+    avpriv_align_put_bits(&s->pb);
+
+    cur_pos = put_bits_count(&s->pb) >> 3;
+
+    /* Magic string */
+    avpriv_put_string(&s->pb, "BBCD", 0);
+
+    /* Parse code */
+    put_bits(&s->pb, 8, pcode);
+
+    /* Next parse offset */
+    dist = cur_pos - s->next_parse_offset;
+    AV_WB32(s->pb.buf + s->next_parse_offset + 5, dist);
+    s->next_parse_offset = cur_pos;
+    put_bits32(&s->pb, pcode == DIRAC_PCODE_END_SEQ ? 13 : 0);
+
+    /* Last parse offset */
+    put_bits32(&s->pb, s->last_parse_code == DIRAC_PCODE_END_SEQ ? 13 : dist);
+
+    s->last_parse_code = pcode;
+}
+
+/* VC-2 11.1 - parse_parameters()
+ * The level dictates what the decoder should expect in terms of resolution
+ * and allows it to quickly reject whatever it can't support. Remember,
+ * this codec kinda targets cheapo FPGAs without much memory. Unfortunately
+ * it also limits us greatly in our choice of formats, hence the flag to disable
+ * strict_compliance */
+static void encode_parse_params(VC2EncContext *s)
+{
+    put_vc2_ue_uint(&s->pb, s->ver.major); /* VC-2 demands this to be 2 */
+    put_vc2_ue_uint(&s->pb, s->ver.minor); /* ^^ and this to be 0       */
+    put_vc2_ue_uint(&s->pb, s->profile);   /* 3 to signal HQ profile    */
+    put_vc2_ue_uint(&s->pb, s->level);     /* 3 - 1080/720, 6 - 4K      */
+}
+
+/* VC-2 11.3 - frame_size() */
+static void encode_frame_size(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance) {
+        AVCodecContext *avctx = s->avctx;
+        put_vc2_ue_uint(&s->pb, avctx->width);
+        put_vc2_ue_uint(&s->pb, avctx->height);
+    }
+}
+
+/* VC-2 11.3.3 - color_diff_sampling_format() */
+static void encode_sample_fmt(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance) {
+        int idx;
+        if (s->chroma_x_shift == 1 && s->chroma_y_shift == 0)
+            idx = 1; /* 422 */
+        else if (s->chroma_x_shift == 1 && s->chroma_y_shift == 1)
+            idx = 2; /* 420 */
+        else
+            idx = 0; /* 444 */
+        put_vc2_ue_uint(&s->pb, idx);
+    }
+}
+
+/* VC-2 11.3.4 - scan_format() */
+static void encode_scan_format(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance)
+        put_vc2_ue_uint(&s->pb, s->interlaced);
+}
+
+/* VC-2 11.3.5 - frame_rate() */
+static void encode_frame_rate(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance) {
+        AVCodecContext *avctx = s->avctx;
+        put_vc2_ue_uint(&s->pb, 0);
+        put_vc2_ue_uint(&s->pb, avctx->time_base.den);
+        put_vc2_ue_uint(&s->pb, avctx->time_base.num);
+    }
+}
+
+/* VC-2 11.3.6 - aspect_ratio() */
+static void encode_aspect_ratio(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance) {
+        AVCodecContext *avctx = s->avctx;
+        put_vc2_ue_uint(&s->pb, 0);
+        put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.num);
+        put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.den);
+    }
+}
+
+/* VC-2 11.3.7 - clean_area() */
+static void encode_clean_area(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, 0);
+}
+
+/* VC-2 11.3.8 - signal_range() */
+static void encode_signal_range(VC2EncContext *s)
+{
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance)
+        put_vc2_ue_uint(&s->pb, s->bpp_idx);
+}
+
+/* VC-2 11.3.9 - color_spec() */
+static void encode_color_spec(VC2EncContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (!s->strict_compliance) {
+        int val;
+        put_vc2_ue_uint(&s->pb, 0);
+
+        /* primaries */
+        put_bits(&s->pb, 1, 1);
+        if (avctx->color_primaries == AVCOL_PRI_BT470BG)
+            val = 2;
+        else if (avctx->color_primaries == AVCOL_PRI_SMPTE170M)
+            val = 1;
+        else if (avctx->color_primaries == AVCOL_PRI_SMPTE240M)
+            val = 1;
+        else
+            val = 0;
+        put_vc2_ue_uint(&s->pb, val);
+
+        /* color matrix */
+        put_bits(&s->pb, 1, 1);
+        if (avctx->colorspace == AVCOL_SPC_RGB)
+            val = 3;
+        else if (avctx->colorspace == AVCOL_SPC_YCOCG)
+            val = 2;
+        else if (avctx->colorspace == AVCOL_SPC_BT470BG)
+            val = 1;
+        else
+            val = 0;
+        put_vc2_ue_uint(&s->pb, val);
+
+        /* transfer function */
+        put_bits(&s->pb, 1, 1);
+        if (avctx->color_trc == AVCOL_TRC_LINEAR)
+            val = 2;
+        else if (avctx->color_trc == AVCOL_TRC_BT1361_ECG)
+            val = 1;
+        else
+            val = 0;
+        put_vc2_ue_uint(&s->pb, val);
+    }
+}
+
+/* VC-2 11.3 - source_parameters() */
+static void encode_source_params(VC2EncContext *s)
+{
+    encode_frame_size(s);
+    encode_sample_fmt(s);
+    encode_scan_format(s);
+    encode_frame_rate(s);
+    encode_aspect_ratio(s);
+    encode_clean_area(s);
+    encode_signal_range(s);
+    encode_color_spec(s);
+}
+
+/* VC-2 11 - sequence_header() */
+static void encode_seq_header(VC2EncContext *s)
+{
+    avpriv_align_put_bits(&s->pb);
+    encode_parse_params(s);
+    put_vc2_ue_uint(&s->pb, s->base_vf);
+    encode_source_params(s);
+    put_vc2_ue_uint(&s->pb, s->interlaced); /* Frames or fields coding */
+}
+
+/* VC-2 12.1 - picture_header() */
+static void encode_picture_header(VC2EncContext *s)
+{
+    avpriv_align_put_bits(&s->pb);
+    put_bits32(&s->pb, s->picture_number++);
+}
+
+/* VC-2 12.3.4.1 - slice_parameters() */
+static void encode_slice_params(VC2EncContext *s)
+{
+    put_vc2_ue_uint(&s->pb, s->num_x);
+    put_vc2_ue_uint(&s->pb, s->num_y);
+    put_vc2_ue_uint(&s->pb, s->prefix_bytes);
+    put_vc2_ue_uint(&s->pb, s->size_scaler);
+}
+
+/* 1st idx = LL, second - vertical, third - horizontal, fourth - total */
+const uint8_t vc2_qm_col_tab[][4] = {
+    {20,  9, 15,  4},
+    { 0,  6,  6,  4},
+    { 0,  3,  3,  5},
+    { 0,  3,  5,  1},
+    { 0, 11, 10, 11}
+};
+
+const uint8_t vc2_qm_flat_tab[][4] = {
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0}
+};
+
+static void init_quant_matrix(VC2EncContext *s)
+{
+    int level, orientation;
+
+    if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) {
+        s->custom_quant_matrix = 0;
+        for (level = 0; level < s->wavelet_depth; level++) {
+            s->quant[level][0] = ff_dirac_default_qmat[s->wavelet_idx][level][0];
+            s->quant[level][1] = ff_dirac_default_qmat[s->wavelet_idx][level][1];
+            s->quant[level][2] = ff_dirac_default_qmat[s->wavelet_idx][level][2];
+            s->quant[level][3] = ff_dirac_default_qmat[s->wavelet_idx][level][3];
+        }
+        return;
+    }
+
+    s->custom_quant_matrix = 1;
+
+    if (s->quant_matrix == VC2_QM_DEF) {
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = 0; orientation < 4; orientation++) {
+                if (level <= 3)
+                    s->quant[level][orientation] = ff_dirac_default_qmat[s->wavelet_idx][level][orientation];
+                else
+                    s->quant[level][orientation] = vc2_qm_col_tab[level][orientation];
+            }
+        }
+    } else if (s->quant_matrix == VC2_QM_COL) {
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = 0; orientation < 4; orientation++) {
+                s->quant[level][orientation] = vc2_qm_col_tab[level][orientation];
+            }
+        }
+    } else {
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = 0; orientation < 4; orientation++) {
+                s->quant[level][orientation] = vc2_qm_flat_tab[level][orientation];
+            }
+        }
+    }
+}
+
+/* VC-2 12.3.4.2 - quant_matrix() */
+static void encode_quant_matrix(VC2EncContext *s)
+{
+    int level;
+    put_bits(&s->pb, 1, s->custom_quant_matrix);
+    if (s->custom_quant_matrix) {
+        put_vc2_ue_uint(&s->pb, s->quant[0][0]);
+        for (level = 0; level < s->wavelet_depth; level++) {
+            put_vc2_ue_uint(&s->pb, s->quant[level][1]);
+            put_vc2_ue_uint(&s->pb, s->quant[level][2]);
+            put_vc2_ue_uint(&s->pb, s->quant[level][3]);
+        }
+    }
+}
+
+/* VC-2 12.3 - transform_parameters() */
+static void encode_transform_params(VC2EncContext *s)
+{
+    put_vc2_ue_uint(&s->pb, s->wavelet_idx);
+    put_vc2_ue_uint(&s->pb, s->wavelet_depth);
+
+    encode_slice_params(s);
+    encode_quant_matrix(s);
+}
+
+/* VC-2 12.2 - wavelet_transform() */
+static void encode_wavelet_transform(VC2EncContext *s)
+{
+    encode_transform_params(s);
+    avpriv_align_put_bits(&s->pb);
+}
+
+/* VC-2 12 - picture_parse() */
+static void encode_picture_start(VC2EncContext *s)
+{
+    avpriv_align_put_bits(&s->pb);
+    encode_picture_header(s);
+    avpriv_align_put_bits(&s->pb);
+    encode_wavelet_transform(s);
+}
+
+#define QUANT(c, mul, add, shift) (((mul) * (c) + (add)) >> (shift))
+
+/* VC-2 13.5.5.2 - slice_band() */
+static void encode_subband(VC2EncContext *s, PutBitContext *pb, int sx, int sy,
+                           SubBand *b, int quant)
+{
+    int x, y;
+
+    const int left   = b->width  * (sx+0) / s->num_x;
+    const int right  = b->width  * (sx+1) / s->num_x;
+    const int top    = b->height * (sy+0) / s->num_y;
+    const int bottom = b->height * (sy+1) / s->num_y;
+
+    dwtcoef *coeff = b->buf + top * b->stride;
+    const uint64_t q_m = ((uint64_t)(s->qmagic_lut[quant][0])) << 2;
+    const uint64_t q_a = s->qmagic_lut[quant][1];
+    const int q_s = av_log2(ff_dirac_qscale_tab[quant]) + 32;
+
+    for (y = top; y < bottom; y++) {
+        for (x = left; x < right; x++) {
+            uint32_t c_abs = QUANT(FFABS(coeff[x]), q_m, q_a, q_s);
+            put_vc2_ue_uint(pb, c_abs);
+            if (c_abs)
+                put_bits(pb, 1, coeff[x] < 0);
+        }
+        coeff += b->stride;
+    }
+}
+
+static int count_hq_slice(SliceArgs *slice, int quant_idx)
+{
+    int x, y;
+    uint8_t quants[MAX_DWT_LEVELS][4];
+    int bits = 0, p, level, orientation;
+    VC2EncContext *s = slice->ctx;
+
+    if (slice->cache[quant_idx])
+        return slice->cache[quant_idx];
+
+    bits += 8*s->prefix_bytes;
+    bits += 8; /* quant_idx */
+
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++)
+            quants[level][orientation] = FFMAX(quant_idx - s->quant[level][orientation], 0);
+
+    for (p = 0; p < 3; p++) {
+        int bytes_start, bytes_len, pad_s, pad_c;
+        bytes_start = bits >> 3;
+        bits += 8;
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = !!level; orientation < 4; orientation++) {
+                SubBand *b = &s->plane[p].band[level][orientation];
+
+                const int q_idx = quants[level][orientation];
+                const uint64_t q_m = ((uint64_t)s->qmagic_lut[q_idx][0]) << 2;
+                const uint64_t q_a = s->qmagic_lut[q_idx][1];
+                const int q_s = av_log2(ff_dirac_qscale_tab[q_idx]) + 32;
+
+                const int left   = b->width  * slice->x    / s->num_x;
+                const int right  = b->width  *(slice->x+1) / s->num_x;
+                const int top    = b->height * slice->y    / s->num_y;
+                const int bottom = b->height *(slice->y+1) / s->num_y;
+
+                dwtcoef *buf = b->buf + top * b->stride;
+
+                for (y = top; y < bottom; y++) {
+                    for (x = left; x < right; x++) {
+                        uint32_t c_abs = QUANT(FFABS(buf[x]), q_m, q_a, q_s);
+                        bits += count_vc2_ue_uint(c_abs);
+                        bits += !!c_abs;
+                    }
+                    buf += b->stride;
+                }
+            }
+        }
+        bits += FFALIGN(bits, 8) - bits;
+        bytes_len = (bits >> 3) - bytes_start - 1;
+        pad_s = FFALIGN(bytes_len, s->size_scaler)/s->size_scaler;
+        pad_c = (pad_s*s->size_scaler) - bytes_len;
+        bits += pad_c*8;
+    }
+
+    slice->cache[quant_idx] = bits;
+
+    return bits;
+}
+
+/* Approaches the best possible quantizer asymptotically, its kinda exaustive
+ * but we have a LUT to get the coefficient size in bits. Guaranteed to never
+ * overshoot, which is apparently very important when streaming */
+static int rate_control(AVCodecContext *avctx, void *arg)
+{
+    SliceArgs *slice_dat = arg;
+    VC2EncContext *s = slice_dat->ctx;
+    const int top = slice_dat->bits_ceil;
+    const int bottom = slice_dat->bits_floor;
+    int quant_buf[2] = {-1, -1};
+    int quant = slice_dat->quant_idx, step = 1;
+    int bits_last, bits = count_hq_slice(slice_dat, quant);
+    while ((bits > top) || (bits < bottom)) {
+        const int signed_step = bits > top ? +step : -step;
+        quant  = av_clip(quant + signed_step, 0, s->q_ceil-1);
+        bits   = count_hq_slice(slice_dat, quant);
+        if (quant_buf[1] == quant) {
+            quant = FFMAX(quant_buf[0], quant);
+            bits  = quant == quant_buf[0] ? bits_last : bits;
+            break;
+        }
+        step         = av_clip(step/2, 1, (s->q_ceil-1)/2);
+        quant_buf[1] = quant_buf[0];
+        quant_buf[0] = quant;
+        bits_last    = bits;
+    }
+    slice_dat->quant_idx = av_clip(quant, 0, s->q_ceil-1);
+    slice_dat->bytes = SSIZE_ROUND(bits >> 3);
+    return 0;
+}
+
+static int calc_slice_sizes(VC2EncContext *s)
+{
+    int i, j, slice_x, slice_y, bytes_left = 0;
+    int bytes_top[SLICE_REDIST_TOTAL] = {0};
+    int64_t total_bytes_needed = 0;
+    int slice_redist_range = FFMIN(SLICE_REDIST_TOTAL, s->num_x*s->num_y);
+    SliceArgs *enc_args = s->slice_args;
+    SliceArgs *top_loc[SLICE_REDIST_TOTAL] = {NULL};
+
+    init_quant_matrix(s);
+
+    for (slice_y = 0; slice_y < s->num_y; slice_y++) {
+        for (slice_x = 0; slice_x < s->num_x; slice_x++) {
+            SliceArgs *args = &enc_args[s->num_x*slice_y + slice_x];
+            args->ctx = s;
+            args->x   = slice_x;
+            args->y   = slice_y;
+            args->bits_ceil  = s->slice_max_bytes << 3;
+            args->bits_floor = s->slice_min_bytes << 3;
+            memset(args->cache, 0, s->q_ceil*sizeof(*args->cache));
+        }
+    }
+
+    /* First pass - determine baseline slice sizes w.r.t. max_slice_size */
+    s->avctx->execute(s->avctx, rate_control, enc_args, NULL, s->num_x*s->num_y,
+                      sizeof(SliceArgs));
+
+    for (i = 0; i < s->num_x*s->num_y; i++) {
+        SliceArgs *args = &enc_args[i];
+        bytes_left += args->bytes;
+        for (j = 0; j < slice_redist_range; j++) {
+            if (args->bytes > bytes_top[j]) {
+                bytes_top[j] = args->bytes;
+                top_loc[j]   = args;
+                break;
+            }
+        }
+    }
+
+    bytes_left = s->frame_max_bytes - bytes_left;
+
+    /* Second pass - distribute leftover bytes */
+    while (bytes_left > 0) {
+        int distributed = 0;
+        for (i = 0; i < slice_redist_range; i++) {
+            SliceArgs *args;
+            int bits, bytes, diff, prev_bytes, new_idx;
+            if (bytes_left <= 0)
+                break;
+            if (!top_loc[i] || !top_loc[i]->quant_idx)
+                break;
+            args = top_loc[i];
+            prev_bytes = args->bytes;
+            new_idx = FFMAX(args->quant_idx - 1, 0);
+            bits  = count_hq_slice(args, new_idx);
+            bytes = SSIZE_ROUND(bits >> 3);
+            diff  = bytes - prev_bytes;
+            if ((bytes_left - diff) > 0) {
+                args->quant_idx = new_idx;
+                args->bytes = bytes;
+                bytes_left -= diff;
+                distributed++;
+            }
+        }
+        if (!distributed)
+            break;
+    }
+
+    for (i = 0; i < s->num_x*s->num_y; i++) {
+        SliceArgs *args = &enc_args[i];
+        total_bytes_needed += args->bytes;
+        s->q_avg = (s->q_avg + args->quant_idx)/2;
+    }
+
+    return total_bytes_needed;
+}
+
+/* VC-2 13.5.3 - hq_slice */
+static int encode_hq_slice(AVCodecContext *avctx, void *arg)
+{
+    SliceArgs *slice_dat = arg;
+    VC2EncContext *s = slice_dat->ctx;
+    PutBitContext *pb = &slice_dat->pb;
+    const int slice_x = slice_dat->x;
+    const int slice_y = slice_dat->y;
+    const int quant_idx = slice_dat->quant_idx;
+    const int slice_bytes_max = slice_dat->bytes;
+    uint8_t quants[MAX_DWT_LEVELS][4];
+    int p, level, orientation;
+
+    /* The reference decoder ignores it, and its typical length is 0 */
+    memset(put_bits_ptr(pb), 0, s->prefix_bytes);
+    skip_put_bytes(pb, s->prefix_bytes);
+
+    put_bits(pb, 8, quant_idx);
+
+    /* Slice quantization (slice_quantizers() in the specs) */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++)
+            quants[level][orientation] = FFMAX(quant_idx - s->quant[level][orientation], 0);
+
+    /* Luma + 2 Chroma planes */
+    for (p = 0; p < 3; p++) {
+        int bytes_start, bytes_len, pad_s, pad_c;
+        bytes_start = put_bits_count(pb) >> 3;
+        put_bits(pb, 8, 0);
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = !!level; orientation < 4; orientation++) {
+                encode_subband(s, pb, slice_x, slice_y,
+                               &s->plane[p].band[level][orientation],
+                               quants[level][orientation]);
+            }
+        }
+        avpriv_align_put_bits(pb);
+        bytes_len = (put_bits_count(pb) >> 3) - bytes_start - 1;
+        if (p == 2) {
+            int len_diff = slice_bytes_max - (put_bits_count(pb) >> 3);
+            pad_s = FFALIGN((bytes_len + len_diff), s->size_scaler)/s->size_scaler;
+            pad_c = (pad_s*s->size_scaler) - bytes_len;
+        } else {
+            pad_s = FFALIGN(bytes_len, s->size_scaler)/s->size_scaler;
+            pad_c = (pad_s*s->size_scaler) - bytes_len;
+        }
+        pb->buf[bytes_start] = pad_s;
+        flush_put_bits(pb);
+        /* vc2-reference uses that padding that decodes to '0' coeffs */
+        memset(put_bits_ptr(pb), 0xFF, pad_c);
+        skip_put_bytes(pb, pad_c);
+    }
+
+    return 0;
+}
+
+/* VC-2 13.5.1 - low_delay_transform_data() */
+static int encode_slices(VC2EncContext *s)
+{
+    uint8_t *buf;
+    int slice_x, slice_y, skip = 0;
+    SliceArgs *enc_args = s->slice_args;
+
+    avpriv_align_put_bits(&s->pb);
+    flush_put_bits(&s->pb);
+    buf = put_bits_ptr(&s->pb);
+
+    for (slice_y = 0; slice_y < s->num_y; slice_y++) {
+        for (slice_x = 0; slice_x < s->num_x; slice_x++) {
+            SliceArgs *args = &enc_args[s->num_x*slice_y + slice_x];
+            init_put_bits(&args->pb, buf + skip, args->bytes+s->prefix_bytes);
+            skip += args->bytes;
+        }
+    }
+
+    s->avctx->execute(s->avctx, encode_hq_slice, enc_args, NULL, s->num_x*s->num_y,
+                      sizeof(SliceArgs));
+
+    skip_put_bytes(&s->pb, skip);
+
+    return 0;
+}
+
+/*
+ * Transform basics for a 3 level transform
+ * |---------------------------------------------------------------------|
+ * |  LL-0  | HL-0  |                 |                                  |
+ * |--------|-------|      HL-1       |                                  |
+ * |  LH-0  | HH-0  |                 |                                  |
+ * |----------------|-----------------|              HL-2                |
+ * |                |                 |                                  |
+ * |     LH-1       |      HH-1       |                                  |
+ * |                |                 |                                  |
+ * |----------------------------------|----------------------------------|
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |              LH-2                |              HH-2                |
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |---------------------------------------------------------------------|
+ *
+ * DWT transforms are generally applied by splitting the image in two vertically
+ * and applying a low pass transform on the left part and a corresponding high
+ * pass transform on the right hand side. This is known as the horizontal filter
+ * stage.
+ * After that, the same operation is performed except the image is divided
+ * horizontally, with the high pass on the lower and the low pass on the higher
+ * side.
+ * Therefore, you're left with 4 subdivisions - known as  low-low, low-high,
+ * high-low and high-high. They're referred to as orientations in the decoder
+ * and encoder.
+ *
+ * The LL (low-low) area contains the original image downsampled by the amount
+ * of levels. The rest of the areas can be thought as the details needed
+ * to restore the image perfectly to its original size.
+ */
+static int dwt_plane(AVCodecContext *avctx, void *arg)
+{
+    TransformArgs *transform_dat = arg;
+    VC2EncContext *s = transform_dat->ctx;
+    const void *frame_data = transform_dat->idata;
+    const ptrdiff_t linesize = transform_dat->istride;
+    const int field = transform_dat->field;
+    const Plane *p = transform_dat->plane;
+    VC2TransformContext *t = &transform_dat->t;
+    dwtcoef *buf = p->coef_buf;
+    const int idx = s->wavelet_idx;
+    const int skip = 1 + s->interlaced;
+
+    int x, y, level, offset;
+    ptrdiff_t pix_stride = linesize >> (s->bpp - 1);
+
+    if (field == 1) {
+        offset = 0;
+        pix_stride <<= 1;
+    } else if (field == 2) {
+        offset = pix_stride;
+        pix_stride <<= 1;
+    } else {
+        offset = 0;
+    }
+
+    if (s->bpp == 1) {
+        const uint8_t *pix = (const uint8_t *)frame_data + offset;
+        for (y = 0; y < p->height*skip; y+=skip) {
+            for (x = 0; x < p->width; x++) {
+                buf[x] = pix[x] - s->diff_offset;
+            }
+            buf += p->coef_stride;
+            pix += pix_stride;
+        }
+    } else {
+        const uint16_t *pix = (const uint16_t *)frame_data + offset;
+        for (y = 0; y < p->height*skip; y+=skip) {
+            for (x = 0; x < p->width; x++) {
+                buf[x] = pix[x] - s->diff_offset;
+            }
+            buf += p->coef_stride;
+            pix += pix_stride;
+        }
+    }
+
+    memset(buf, 0, p->coef_stride * (p->dwt_height - p->height) * sizeof(dwtcoef));
+
+    for (level = s->wavelet_depth-1; level >= 0; level--) {
+        const SubBand *b = &p->band[level][0];
+        t->vc2_subband_dwt[idx](t, p->coef_buf, p->coef_stride,
+                                b->width, b->height);
+    }
+
+    return 0;
+}
+
+static int encode_frame(VC2EncContext *s, AVPacket *avpkt, const AVFrame *frame,
+                        const char *aux_data, const int header_size, int field)
+{
+    int i, ret;
+    int64_t max_frame_bytes;
+
+     /* Threaded DWT transform */
+    for (i = 0; i < 3; i++) {
+        s->transform_args[i].ctx   = s;
+        s->transform_args[i].field = field;
+        s->transform_args[i].plane = &s->plane[i];
+        s->transform_args[i].idata = frame->data[i];
+        s->transform_args[i].istride = frame->linesize[i];
+    }
+    s->avctx->execute(s->avctx, dwt_plane, s->transform_args, NULL, 3,
+                      sizeof(TransformArgs));
+
+    /* Calculate per-slice quantizers and sizes */
+    max_frame_bytes = header_size + calc_slice_sizes(s);
+
+    if (field < 2) {
+        ret = ff_alloc_packet2(s->avctx, avpkt,
+                               max_frame_bytes << s->interlaced,
+                               max_frame_bytes << s->interlaced);
+        if (ret) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+            return ret;
+        }
+        init_put_bits(&s->pb, avpkt->data, avpkt->size);
+    }
+
+    /* Sequence header */
+    encode_parse_info(s, DIRAC_PCODE_SEQ_HEADER);
+    encode_seq_header(s);
+
+    /* Encoder version */
+    if (aux_data) {
+        encode_parse_info(s, DIRAC_PCODE_AUX);
+        avpriv_put_string(&s->pb, aux_data, 1);
+    }
+
+    /* Picture header */
+    encode_parse_info(s, DIRAC_PCODE_PICTURE_HQ);
+    encode_picture_start(s);
+
+    /* Encode slices */
+    encode_slices(s);
+
+    /* End sequence */
+    encode_parse_info(s, DIRAC_PCODE_END_SEQ);
+
+    return 0;
+}
+
+static av_cold int vc2_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                      const AVFrame *frame, int *got_packet)
+{
+    int ret = 0;
+    int slice_ceil, sig_size = 256;
+    VC2EncContext *s = avctx->priv_data;
+    const int bitexact = avctx->flags & AV_CODEC_FLAG_BITEXACT;
+    const char *aux_data = bitexact ? "Lavc" : LIBAVCODEC_IDENT;
+    const int aux_data_size = bitexact ? sizeof("Lavc") : sizeof(LIBAVCODEC_IDENT);
+    const int header_size = 100 + aux_data_size;
+    int64_t r_bitrate = avctx->bit_rate >> (s->interlaced);
+
+    s->avctx = avctx;
+    s->size_scaler = 2;
+    s->prefix_bytes = 0;
+    s->last_parse_code = 0;
+    s->next_parse_offset = 0;
+
+    /* Rate control */
+    s->frame_max_bytes = (av_rescale(r_bitrate, s->avctx->time_base.num,
+                                     s->avctx->time_base.den) >> 3) - header_size;
+    s->slice_max_bytes = slice_ceil = av_rescale(s->frame_max_bytes, 1, s->num_x*s->num_y);
+
+    /* Find an appropriate size scaler */
+    while (sig_size > 255) {
+        int r_size = SSIZE_ROUND(s->slice_max_bytes);
+        if (r_size > slice_ceil) {
+            s->slice_max_bytes -= r_size - slice_ceil;
+            r_size = SSIZE_ROUND(s->slice_max_bytes);
+        }
+        sig_size = r_size/s->size_scaler; /* Signalled slize size */
+        s->size_scaler <<= 1;
+    }
+
+    s->slice_min_bytes = s->slice_max_bytes - s->slice_max_bytes*(s->tolerance/100.0f);
+
+    ret = encode_frame(s, avpkt, frame, aux_data, header_size, s->interlaced);
+    if (ret)
+        return ret;
+    if (s->interlaced) {
+        ret = encode_frame(s, avpkt, frame, aux_data, header_size, 2);
+        if (ret)
+            return ret;
+    }
+
+    flush_put_bits(&s->pb);
+    avpkt->size = put_bits_count(&s->pb) >> 3;
+
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int vc2_encode_end(AVCodecContext *avctx)
+{
+    int i;
+    VC2EncContext *s = avctx->priv_data;
+
+    av_log(avctx, AV_LOG_INFO, "Qavg: %i\n", s->q_avg);
+
+    for (i = 0; i < 3; i++) {
+        ff_vc2enc_free_transforms(&s->transform_args[i].t);
+        av_freep(&s->plane[i].coef_buf);
+    }
+
+    av_freep(&s->slice_args);
+
+    return 0;
+}
+
+static av_cold int vc2_encode_init(AVCodecContext *avctx)
+{
+    Plane *p;
+    SubBand *b;
+    int i, level, o, shift, ret;
+    const AVPixFmtDescriptor *fmt = av_pix_fmt_desc_get(avctx->pix_fmt);
+    const int depth = fmt->comp[0].depth;
+    VC2EncContext *s = avctx->priv_data;
+
+    s->picture_number = 0;
+
+    /* Total allowed quantization range */
+    s->q_ceil    = DIRAC_MAX_QUANT_INDEX;
+
+    s->ver.major = 2;
+    s->ver.minor = 0;
+    s->profile   = 3;
+    s->level     = 3;
+
+    s->base_vf   = -1;
+    s->strict_compliance = 1;
+
+    s->q_avg = 0;
+    s->slice_max_bytes = 0;
+    s->slice_min_bytes = 0;
+
+    /* Mark unknown as progressive */
+    s->interlaced = !((avctx->field_order == AV_FIELD_UNKNOWN) ||
+                      (avctx->field_order == AV_FIELD_PROGRESSIVE));
+
+    for (i = 0; i < base_video_fmts_len; i++) {
+        const VC2BaseVideoFormat *fmt = &base_video_fmts[i];
+        if (avctx->pix_fmt != fmt->pix_fmt)
+            continue;
+        if (avctx->time_base.num != fmt->time_base.num)
+            continue;
+        if (avctx->time_base.den != fmt->time_base.den)
+            continue;
+        if (avctx->width != fmt->width)
+            continue;
+        if (avctx->height != fmt->height)
+            continue;
+        if (s->interlaced != fmt->interlaced)
+            continue;
+        s->base_vf = i;
+        s->level   = base_video_fmts[i].level;
+        break;
+    }
+
+    if (s->interlaced)
+        av_log(avctx, AV_LOG_WARNING, "Interlacing enabled!\n");
+
+    if ((s->slice_width  & (s->slice_width  - 1)) ||
+        (s->slice_height & (s->slice_height - 1))) {
+        av_log(avctx, AV_LOG_ERROR, "Slice size is not a power of two!\n");
+        return AVERROR_UNKNOWN;
+    }
+
+    if ((s->slice_width > avctx->width) ||
+        (s->slice_height > avctx->height)) {
+        av_log(avctx, AV_LOG_ERROR, "Slice size is bigger than the image!\n");
+        return AVERROR_UNKNOWN;
+    }
+
+    if (s->base_vf <= 0) {
+        if (avctx->strict_std_compliance < FF_COMPLIANCE_STRICT) {
+            s->strict_compliance = s->base_vf = 0;
+            av_log(avctx, AV_LOG_WARNING, "Format does not strictly comply with VC2 specs\n");
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "Given format does not strictly comply with "
+                   "the specifications, decrease strictness to use it.\n");
+            return AVERROR_UNKNOWN;
+        }
+    } else {
+        av_log(avctx, AV_LOG_INFO, "Selected base video format = %i (%s)\n",
+               s->base_vf, base_video_fmts[s->base_vf].name);
+    }
+
+    /* Chroma subsampling */
+    ret = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
+    if (ret)
+        return ret;
+
+    /* Bit depth and color range index */
+    if (depth == 8 && avctx->color_range == AVCOL_RANGE_JPEG) {
+        s->bpp = 1;
+        s->bpp_idx = 1;
+        s->diff_offset = 128;
+    } else if (depth == 8 && (avctx->color_range == AVCOL_RANGE_MPEG ||
+               avctx->color_range == AVCOL_RANGE_UNSPECIFIED)) {
+        s->bpp = 1;
+        s->bpp_idx = 2;
+        s->diff_offset = 128;
+    } else if (depth == 10) {
+        s->bpp = 2;
+        s->bpp_idx = 3;
+        s->diff_offset = 512;
+    } else {
+        s->bpp = 2;
+        s->bpp_idx = 4;
+        s->diff_offset = 2048;
+    }
+
+    /* Planes initialization */
+    for (i = 0; i < 3; i++) {
+        int w, h;
+        p = &s->plane[i];
+        p->width      = avctx->width  >> (i ? s->chroma_x_shift : 0);
+        p->height     = avctx->height >> (i ? s->chroma_y_shift : 0);
+        if (s->interlaced)
+            p->height >>= 1;
+        p->dwt_width  = w = FFALIGN(p->width,  (1 << s->wavelet_depth));
+        p->dwt_height = h = FFALIGN(p->height, (1 << s->wavelet_depth));
+        p->coef_stride = FFALIGN(p->dwt_width, 32);
+        p->coef_buf = av_mallocz(p->coef_stride*p->dwt_height*sizeof(dwtcoef));
+        if (!p->coef_buf)
+            goto alloc_fail;
+        for (level = s->wavelet_depth-1; level >= 0; level--) {
+            w = w >> 1;
+            h = h >> 1;
+            for (o = 0; o < 4; o++) {
+                b = &p->band[level][o];
+                b->width  = w;
+                b->height = h;
+                b->stride = p->coef_stride;
+                shift = (o > 1)*b->height*b->stride + (o & 1)*b->width;
+                b->buf = p->coef_buf + shift;
+            }
+        }
+
+        /* DWT init */
+        if (ff_vc2enc_init_transforms(&s->transform_args[i].t,
+                                      s->plane[i].coef_stride,
+                                      s->plane[i].dwt_height,
+                                      s->slice_width, s->slice_height))
+            goto alloc_fail;
+    }
+
+    /* Slices */
+    s->num_x = s->plane[0].dwt_width/s->slice_width;
+    s->num_y = s->plane[0].dwt_height/s->slice_height;
+
+    s->slice_args = av_calloc(s->num_x*s->num_y, sizeof(SliceArgs));
+    if (!s->slice_args)
+        goto alloc_fail;
+
+    for (i = 0; i < 116; i++) {
+        const uint64_t qf = ff_dirac_qscale_tab[i];
+        const uint32_t m = av_log2(qf);
+        const uint32_t t = (1ULL << (m + 32)) / qf;
+        const uint32_t r = (t*qf + qf) & UINT32_MAX;
+        if (!(qf & (qf - 1))) {
+            s->qmagic_lut[i][0] = 0xFFFFFFFF;
+            s->qmagic_lut[i][1] = 0xFFFFFFFF;
+        } else if (r <= 1 << m) {
+            s->qmagic_lut[i][0] = t + 1;
+            s->qmagic_lut[i][1] = 0;
+        } else {
+            s->qmagic_lut[i][0] = t;
+            s->qmagic_lut[i][1] = t;
+        }
+    }
+
+    return 0;
+
+alloc_fail:
+    vc2_encode_end(avctx);
+    av_log(avctx, AV_LOG_ERROR, "Unable to allocate memory!\n");
+    return AVERROR(ENOMEM);
+}
+
+#define VC2ENC_FLAGS (AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption vc2enc_options[] = {
+    {"tolerance",     "Max undershoot in percent", offsetof(VC2EncContext, tolerance), AV_OPT_TYPE_DOUBLE, {.dbl = 5.0f}, 0.0f, 45.0f, VC2ENC_FLAGS, "tolerance"},
+    {"slice_width",   "Slice width",  offsetof(VC2EncContext, slice_width), AV_OPT_TYPE_INT, {.i64 = 32}, 32, 1024, VC2ENC_FLAGS, "slice_width"},
+    {"slice_height",  "Slice height", offsetof(VC2EncContext, slice_height), AV_OPT_TYPE_INT, {.i64 = 16}, 8, 1024, VC2ENC_FLAGS, "slice_height"},
+    {"wavelet_depth", "Transform depth", offsetof(VC2EncContext, wavelet_depth), AV_OPT_TYPE_INT, {.i64 = 4}, 1, 5, VC2ENC_FLAGS, "wavelet_depth"},
+    {"wavelet_type",  "Transform type",  offsetof(VC2EncContext, wavelet_idx), AV_OPT_TYPE_INT, {.i64 = VC2_TRANSFORM_9_7}, 0, VC2_TRANSFORMS_NB, VC2ENC_FLAGS, "wavelet_idx"},
+        {"9_7",          "Deslauriers-Dubuc (9,7)", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_9_7},    INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+        {"5_3",          "LeGall (5,3)",            0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_5_3},    INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+        {"haar",         "Haar (with shift)",       0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR_S}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+        {"haar_noshift", "Haar (without shift)",    0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR},   INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+    {"qm", "Custom quantization matrix", offsetof(VC2EncContext, quant_matrix), AV_OPT_TYPE_INT, {.i64 = VC2_QM_DEF}, 0, VC2_QM_NB, VC2ENC_FLAGS, "quant_matrix"},
+        {"default",   "Default from the specifications", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_DEF}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "quant_matrix"},
+        {"color",     "Prevents low bitrate discoloration", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_COL}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "quant_matrix"},
+        {"flat",      "Optimize for PSNR", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_FLAT}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "quant_matrix"},
+    {NULL}
+};
+
+static const AVClass vc2enc_class = {
+    .class_name = "SMPTE VC-2 encoder",
+    .category = AV_CLASS_CATEGORY_ENCODER,
+    .option = vc2enc_options,
+    .item_name = av_default_item_name,
+    .version = LIBAVUTIL_VERSION_INT
+};
+
+static const AVCodecDefault vc2enc_defaults[] = {
+    { "b",              "600000000"   },
+    { NULL },
+};
+
+static const enum AVPixelFormat allowed_pix_fmts[] = {
+    AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
+    AV_PIX_FMT_NONE
+};
+
+AVCodec ff_vc2_encoder = {
+    .name           = "vc2",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE VC-2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DIRAC,
+    .priv_data_size = sizeof(VC2EncContext),
+    .init           = vc2_encode_init,
+    .close          = vc2_encode_end,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .encode2        = vc2_encode_frame,
+    .priv_class     = &vc2enc_class,
+    .defaults       = vc2enc_defaults,
+    .pix_fmts       = allowed_pix_fmts
+};
diff --git a/libavcodec/vc2enc_dwt.c b/libavcodec/vc2enc_dwt.c
new file mode 100644
index 0000000..d22af8a
--- /dev/null
+++ b/libavcodec/vc2enc_dwt.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+#include "vc2enc_dwt.h"
+
+/* Since the transforms spit out interleaved coefficients, this function
+ * rearranges the coefficients into the more traditional subdivision,
+ * making it easier to encode and perform another level. */
+static av_always_inline void deinterleave(dwtcoef *linell, ptrdiff_t stride,
+                                          int width, int height, dwtcoef *synthl)
+{
+    int x, y;
+    ptrdiff_t synthw = width << 1;
+    dwtcoef *linehl = linell + width;
+    dwtcoef *linelh = linell + height*stride;
+    dwtcoef *linehh = linelh + width;
+
+    /* Deinterleave the coefficients. */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            linell[x] = synthl[(x << 1)];
+            linehl[x] = synthl[(x << 1) + 1];
+            linelh[x] = synthl[(x << 1) + synthw];
+            linehh[x] = synthl[(x << 1) + synthw + 1];
+        }
+        synthl += synthw << 1;
+        linell += stride;
+        linelh += stride;
+        linehl += stride;
+        linehh += stride;
+    }
+}
+
+static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
+                               ptrdiff_t stride, int width, int height)
+{
+    int x, y;
+    dwtcoef *datal = data, *synth = t->buffer, *synthl = synth;
+    const ptrdiff_t synth_width  = width  << 1;
+    const ptrdiff_t synth_height = height << 1;
+
+    /*
+     * Shift in one bit that is used for additional precision and copy
+     * the data to the buffer.
+     */
+    for (y = 0; y < synth_height; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] = datal[x] << 1;
+        synthl += synth_width;
+        datal += stride;
+    }
+
+    /* Horizontal synthesis. */
+    synthl = synth;
+    for (y = 0; y < synth_height; y++) {
+        /* Lifting stage 2. */
+        synthl[1] -= (8*synthl[0] + 9*synthl[2] - synthl[4] + 8) >> 4;
+        for (x = 1; x < width - 2; x++)
+            synthl[2*x + 1] -= (9*synthl[2*x] + 9*synthl[2*x + 2] - synthl[2*x + 4] -
+                                synthl[2 * x - 2] + 8) >> 4;
+        synthl[synth_width - 1] -= (17*synthl[synth_width - 2] -
+                                    synthl[synth_width - 4] + 8) >> 4;
+        synthl[synth_width - 3] -= (8*synthl[synth_width - 2] +
+                                    9*synthl[synth_width - 4] -
+                                    synthl[synth_width - 6] + 8) >> 4;
+        /* Lifting stage 1. */
+        synthl[0] += (synthl[1] + synthl[1] + 2) >> 2;
+        for (x = 1; x < width - 1; x++)
+            synthl[2*x] += (synthl[2*x - 1] + synthl[2*x + 1] + 2) >> 2;
+
+        synthl[synth_width - 2] += (synthl[synth_width - 3] +
+                                    synthl[synth_width - 1] + 2) >> 2;
+        synthl += synth_width;
+    }
+
+    /* Vertical synthesis: Lifting stage 2. */
+    synthl = synth + synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] -= (8*synthl[x - synth_width] + 9*synthl[x + synth_width] -
+                      synthl[x + 3 * synth_width] + 8) >> 4;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 2; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x + synth_width] -= (9*synthl[x] +
+                                        9*synthl[x + 2 * synth_width] -
+                                        synthl[x - 2 * synth_width] -
+                                        synthl[x + 4 * synth_width] + 8) >> 4;
+        synthl += synth_width << 1;
+    }
+
+    synthl = synth + (synth_height - 1) * synth_width;
+    for (x = 0; x < synth_width; x++) {
+        synthl[x] -= (17*synthl[x - synth_width] -
+                      synthl[x - 3*synth_width] + 8) >> 4;
+                      synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
+                      8*synthl[x - 1*synth_width] - synthl[x - 5*synth_width] + 8) >> 4;
+    }
+
+    /* Vertical synthesis: Lifting stage 1. */
+    synthl = synth;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (synthl[x + synth_width] + synthl[x + synth_width] + 2) >> 2;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 1; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] += (synthl[x - synth_width] + synthl[x + synth_width] + 2) >> 2;
+        synthl += synth_width << 1;
+    }
+
+    synthl = synth + (synth_height - 2) * synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (synthl[x - synth_width] + synthl[x + synth_width] + 2) >> 2;
+
+    deinterleave(data, stride, width, height, synth);
+}
+
+static void vc2_subband_dwt_53(VC2TransformContext *t, dwtcoef *data,
+                               ptrdiff_t stride, int width, int height)
+{
+    int x, y;
+    dwtcoef *synth = t->buffer, *synthl = synth, *datal = data;
+    const ptrdiff_t synth_width  = width  << 1;
+    const ptrdiff_t synth_height = height << 1;
+
+    /*
+     * Shift in one bit that is used for additional precision and copy
+     * the data to the buffer.
+     */
+    for (y = 0; y < synth_height; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] = datal[x] << 1;
+        synthl += synth_width;
+        datal  += stride;
+    }
+
+    /* Horizontal synthesis. */
+    synthl = synth;
+    for (y = 0; y < synth_height; y++) {
+        /* Lifting stage 2. */
+        for (x = 0; x < width - 1; x++)
+            synthl[2 * x + 1] -= (synthl[2 * x] + synthl[2 * x + 2] + 1) >> 1;
+
+        synthl[synth_width - 1] -= (2*synthl[synth_width - 2] + 1) >> 1;
+
+        /* Lifting stage 1. */
+        synthl[0] += (2*synthl[1] + 2) >> 2;
+        for (x = 1; x < width - 1; x++)
+            synthl[2 * x] += (synthl[2 * x - 1] + synthl[2 * x + 1] + 2) >> 2;
+
+        synthl[synth_width - 2] += (synthl[synth_width - 3] + synthl[synth_width - 1] + 2) >> 2;
+
+        synthl += synth_width;
+    }
+
+    /* Vertical synthesis: Lifting stage 2. */
+    synthl = synth + synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] -= (synthl[x - synth_width] + synthl[x + synth_width] + 1) >> 1;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 1; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x + synth_width] -= (synthl[x] + synthl[x + synth_width * 2] + 1) >> 1;
+        synthl += (synth_width << 1);
+    }
+
+    synthl = synth + (synth_height - 1) * synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] -= (2*synthl[x - synth_width] + 1) >> 1;
+
+    /* Vertical synthesis: Lifting stage 1. */
+    synthl = synth;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (2*synthl[synth_width + x] + 2) >> 2;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 1; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] += (synthl[x + synth_width] + synthl[x - synth_width] + 2) >> 2;
+        synthl += (synth_width << 1);
+    }
+
+    synthl = synth + (synth_height - 2)*synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (synthl[x - synth_width] + synthl[x + synth_width] + 2) >> 2;
+
+
+    deinterleave(data, stride, width, height, synth);
+}
+
+static av_always_inline void dwt_haar(VC2TransformContext *t, dwtcoef *data,
+                                      ptrdiff_t stride, int width, int height,
+                                      const int s)
+{
+    int x, y;
+    dwtcoef *synth = t->buffer, *synthl = synth, *datal = data;
+    const ptrdiff_t synth_width  = width  << 1;
+    const ptrdiff_t synth_height = height << 1;
+
+    /* Horizontal synthesis. */
+    for (y = 0; y < synth_height; y++) {
+        for (x = 0; x < synth_width; x += 2) {
+            synthl[y*synth_width + x + 1] = (datal[y*stride + x + 1] << s) -
+                                            (datal[y*stride + x] << s);
+            synthl[y*synth_width + x] = (datal[y*stride + x + 0] << s) +
+                                        ((synthl[y*synth_width + x + 1] + 1) >> 1);
+        }
+    }
+
+    /* Vertical synthesis. */
+    for (x = 0; x < synth_width; x++) {
+        for (y = 0; y < synth_height; y += 2) {
+            synthl[(y + 1)*synth_width + x] = synthl[(y + 1)*synth_width + x] -
+                                              synthl[y*synth_width + x];
+            synthl[y*synth_width + x] = synthl[y*synth_width + x] +
+                                        ((synthl[(y + 1)*synth_width + x] + 1) >> 1);
+        }
+    }
+
+    deinterleave(data, stride, width, height, synth);
+}
+
+static void vc2_subband_dwt_haar(VC2TransformContext *t, dwtcoef *data,
+                                 ptrdiff_t stride, int width, int height)
+{
+    dwt_haar(t, data, stride, width, height, 0);
+}
+
+static void vc2_subband_dwt_haar_shift(VC2TransformContext *t, dwtcoef *data,
+                                       ptrdiff_t stride, int width, int height)
+{
+    dwt_haar(t, data, stride, width, height, 1);
+}
+
+av_cold int ff_vc2enc_init_transforms(VC2TransformContext *s, int p_stride,
+                                      int p_height, int slice_w, int slice_h)
+{
+    s->vc2_subband_dwt[VC2_TRANSFORM_9_7]    = vc2_subband_dwt_97;
+    s->vc2_subband_dwt[VC2_TRANSFORM_5_3]    = vc2_subband_dwt_53;
+    s->vc2_subband_dwt[VC2_TRANSFORM_HAAR]   = vc2_subband_dwt_haar;
+    s->vc2_subband_dwt[VC2_TRANSFORM_HAAR_S] = vc2_subband_dwt_haar_shift;
+
+    /* Pad by the slice size, only matters for non-Haar wavelets */
+    s->buffer = av_calloc((p_stride + slice_w)*(p_height + slice_h), sizeof(dwtcoef));
+    if (!s->buffer)
+        return 1;
+
+    s->padding = (slice_h >> 1)*p_stride + (slice_w >> 1);
+    s->buffer += s->padding;
+
+    return 0;
+}
+
+av_cold void ff_vc2enc_free_transforms(VC2TransformContext *s)
+{
+    av_free(s->buffer - s->padding);
+    s->buffer = NULL;
+}
diff --git a/libavcodec/vc2enc_dwt.h b/libavcodec/vc2enc_dwt.h
new file mode 100644
index 0000000..a6932bc
--- /dev/null
+++ b/libavcodec/vc2enc_dwt.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VC2ENC_DWT_H
+#define AVCODEC_VC2ENC_DWT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef int32_t dwtcoef;
+
+enum VC2TransformType {
+    VC2_TRANSFORM_9_7    = 0,   /* Deslauriers-Dubuc (9,7)  */
+    VC2_TRANSFORM_5_3    = 1,   /* LeGall (5,3)             */
+    VC2_TRANSFORM_13_7   = 2,   /* Deslauriers-Dubuc (13,7) */
+    VC2_TRANSFORM_HAAR   = 3,   /* Haar without shift       */
+    VC2_TRANSFORM_HAAR_S = 4,   /* Haar with 1 shift/lvl    */
+    VC2_TRANSFORM_FIDEL  = 5,   /* Fidelity filter          */
+    VC2_TRANSFORM_9_7_I  = 6,   /* Daubechies (9,7)         */
+
+    VC2_TRANSFORMS_NB
+};
+
+typedef struct VC2TransformContext {
+    dwtcoef *buffer;
+    int padding;
+    void (*vc2_subband_dwt[VC2_TRANSFORMS_NB])(struct VC2TransformContext *t,
+                                               dwtcoef *data, ptrdiff_t stride,
+                                               int width, int height);
+} VC2TransformContext;
+
+int  ff_vc2enc_init_transforms(VC2TransformContext *t, int p_stride, int p_height,
+                               int slice_w, int slice_h);
+void ff_vc2enc_free_transforms(VC2TransformContext *t);
+
+#endif /* AVCODEC_VC2ENC_DWT_H */
diff --git a/libavcodec/vcr1.c b/libavcodec/vcr1.c
index 76c47eb..28a5eec 100644
--- a/libavcodec/vcr1.c
+++ b/libavcodec/vcr1.c
@@ -2,20 +2,20 @@
  * ATI VCR1 codec
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 
 typedef struct VCR1Context {
@@ -37,8 +38,8 @@ static av_cold int vcr1_decode_init(AVCodecContext *avctx)
 {
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
 
-    if (avctx->width & 7) {
-        av_log(avctx, AV_LOG_ERROR, "Width %d is not divisble by 8.\n", avctx->width);
+    if (avctx->width % 8 || avctx->height%4) {
+        avpriv_request_sample(avctx, "odd dimensions (%d x %d) support", avctx->width, avctx->height);
         return AVERROR_INVALIDDATA;
     }
 
@@ -48,27 +49,25 @@ static av_cold int vcr1_decode_init(AVCodecContext *avctx)
 static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf        = avpkt->data;
-    int buf_size              = avpkt->size;
     VCR1Context *const a      = avctx->priv_data;
     AVFrame *const p          = data;
-    const uint8_t *bytestream = buf;
+    const uint8_t *bytestream = avpkt->data;
+    const uint8_t *bytestream_end = bytestream + avpkt->size;
     int i, x, y, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if(avpkt->size < 32 + avctx->height + avctx->width*avctx->height*5/8){
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data. %d < %d\n", avpkt->size ,  32 + avctx->height + avctx->width*avctx->height*5/8);
+        return AVERROR(EINVAL);
     }
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
-    if (buf_size < 32)
-        goto packet_small;
-
     for (i = 0; i < 16; i++) {
         a->delta[i] = *bytestream++;
         bytestream++;
-        buf_size--;
     }
 
     for (y = 0; y < avctx->height; y++) {
@@ -79,12 +78,10 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
             uint8_t *cb = &p->data[1][(y >> 2) * p->linesize[1]];
             uint8_t *cr = &p->data[2][(y >> 2) * p->linesize[2]];
 
-            if (buf_size < 4 + avctx->width)
-                goto packet_small;
+            av_assert0 (bytestream_end - bytestream >= 4 + avctx->width);
 
             for (i = 0; i < 4; i++)
                 a->offset[i] = *bytestream++;
-            buf_size -= 4;
 
             offset = a->offset[0] - a->delta[bytestream[2] & 0xF];
             for (x = 0; x < avctx->width; x += 4) {
@@ -98,11 +95,9 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                 *cr++       = bytestream[1];
 
                 bytestream += 4;
-                buf_size   -= 4;
             }
         } else {
-            if (buf_size < avctx->width / 2)
-                goto packet_small;
+            av_assert0 (bytestream_end - bytestream >= avctx->width / 2);
 
             offset = a->offset[y & 3] - a->delta[bytestream[2] & 0xF];
 
@@ -117,17 +112,13 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                 luma[7]     = offset += a->delta[bytestream[1] >>  4];
                 luma       += 8;
                 bytestream += 4;
-                buf_size   -= 4;
             }
         }
     }
 
     *got_frame = 1;
 
-    return buf_size;
-packet_small:
-    av_log(avctx, AV_LOG_ERROR, "Input packet too small.\n");
-    return AVERROR_INVALIDDATA;
+    return bytestream - avpkt->data;
 }
 
 AVCodec ff_vcr1_decoder = {
diff --git a/libavcodec/vda.c b/libavcodec/vda.c
deleted file mode 100644
index eb4b998..0000000
--- a/libavcodec/vda.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#include "vda.h"
-#include "vda_internal.h"
-
-#if CONFIG_H264_VDA_HWACCEL
-AVVDAContext *av_vda_alloc_context(void)
-{
-    AVVDAContext *ret = av_mallocz(sizeof(*ret));
-
-    if (ret) {
-        ret->output_callback = ff_vda_output_callback;
-        ret->cv_pix_fmt_type = kCVPixelFormatType_422YpCbCr8;
-    }
-
-    return ret;
-}
-
-int av_vda_default_init(AVCodecContext *avctx)
-{
-    return av_vda_default_init2(avctx, NULL);
-}
-
-int av_vda_default_init2(AVCodecContext *avctx, AVVDAContext *vdactx)
-{
-    avctx->hwaccel_context = vdactx ?: av_vda_alloc_context();
-    if (!avctx->hwaccel_context)
-        return AVERROR(ENOMEM);
-    return ff_vda_default_init(avctx);
-}
-
-void av_vda_default_free(AVCodecContext *avctx)
-{
-    ff_vda_default_free(avctx);
-    av_freep(&avctx->hwaccel_context);
-}
-
-void ff_vda_default_free(AVCodecContext *avctx)
-{
-    AVVDAContext *vda = avctx->hwaccel_context;
-    if (vda && vda->decoder)
-        VDADecoderDestroy(vda->decoder);
-}
-
-#else
-AVVDAContext *av_vda_alloc_context(void)
-{
-    return NULL;
-}
-
-int av_vda_default_init(AVCodecContext *avctx)
-{
-    return AVERROR(ENOSYS);
-}
-
-void av_vda_default_free(AVCodecContext *ctx)
-{
-}
-#endif
diff --git a/libavcodec/vda.h b/libavcodec/vda.h
deleted file mode 100644
index 5e7228c..0000000
--- a/libavcodec/vda.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * VDA HW acceleration
- *
- * copyright (c) 2011 Sebastien Zwickert
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_VDA_H
-#define AVCODEC_VDA_H
-
-/**
- * @file
- * @ingroup lavc_codec_hwaccel_vda
- * Public libavcodec VDA header.
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/version.h"
-
-#include <stdint.h>
-
-// emmintrin.h is unable to compile with -std=c99 -Werror=missing-prototypes
-// http://openradar.appspot.com/8026390
-#undef __GNUC_STDC_INLINE__
-
-#define Picture QuickdrawPicture
-#include <VideoDecodeAcceleration/VDADecoder.h>
-#undef Picture
-
-/**
- * @defgroup lavc_codec_hwaccel_vda VDA
- * @ingroup lavc_codec_hwaccel
- *
- * @{
- */
-
-/**
- * This structure is used to provide the necessary configurations and data
- * to the VDA Libav HWAccel implementation.
- *
- * The application must make it available as AVCodecContext.hwaccel_context.
- */
-struct vda_context {
-    /**
-     * VDA decoder object.
-     *
-     * - encoding: unused
-     * - decoding: Set/Unset by libavcodec.
-     */
-    VDADecoder          decoder;
-
-    /**
-     * The Core Video pixel buffer that contains the current image data.
-     *
-     * encoding: unused
-     * decoding: Set by libavcodec. Unset by user.
-     */
-    CVPixelBufferRef    cv_buffer;
-
-    /**
-     * Use the hardware decoder in synchronous mode.
-     *
-     * encoding: unused
-     * decoding: Set by user.
-     */
-    int                 use_sync_decoding;
-
-    /**
-     * The frame width.
-     *
-     * - encoding: unused
-     * - decoding: Set/Unset by user.
-     */
-    int                 width;
-
-    /**
-     * The frame height.
-     *
-     * - encoding: unused
-     * - decoding: Set/Unset by user.
-     */
-    int                 height;
-
-    /**
-     * The frame format.
-     *
-     * - encoding: unused
-     * - decoding: Set/Unset by user.
-     */
-    int                 format;
-
-    /**
-     * The pixel format for output image buffers.
-     *
-     * - encoding: unused
-     * - decoding: Set/Unset by user.
-     */
-    OSType              cv_pix_fmt_type;
-
-    /**
-     * unused
-     */
-    uint8_t             *priv_bitstream;
-
-    /**
-     * unused
-     */
-    int                 priv_bitstream_size;
-
-    /**
-     * unused
-     */
-    int                 priv_allocated_size;
-};
-
-/** Create the video decoder. */
-int ff_vda_create_decoder(struct vda_context *vda_ctx,
-                          uint8_t *extradata,
-                          int extradata_size);
-
-/** Destroy the video decoder. */
-int ff_vda_destroy_decoder(struct vda_context *vda_ctx);
-
-/**
- * This struct holds all the information that needs to be passed
- * between the caller and libavcodec for initializing VDA decoding.
- * Its size is not a part of the public ABI, it must be allocated with
- * av_vda_alloc_context() and freed with av_free().
- */
-typedef struct AVVDAContext {
-    /**
-     * VDA decoder object. Created and freed by the caller.
-     */
-    VDADecoder decoder;
-
-    /**
-     * The output callback that must be passed to VDADecoderCreate.
-     * Set by av_vda_alloc_context().
-     */
-    VDADecoderOutputCallback output_callback;
-
-    /**
-     * CVPixelBuffer Format Type that VDA will use for decoded frames; set by
-     * the caller.
-     */
-    OSType cv_pix_fmt_type;
-} AVVDAContext;
-
-/**
- * Allocate and initialize a VDA context.
- *
- * This function should be called from the get_format() callback when the caller
- * selects the AV_PIX_FMT_VDA format. The caller must then create the decoder
- * object (using the output callback provided by libavcodec) that will be used
- * for VDA-accelerated decoding.
- *
- * When decoding with VDA is finished, the caller must destroy the decoder
- * object and free the VDA context using av_free().
- *
- * @return the newly allocated context or NULL on failure
- */
-AVVDAContext *av_vda_alloc_context(void);
-
-/**
- * This is a convenience function that creates and sets up the VDA context using
- * an internal implementation.
- *
- * @param avctx the corresponding codec context
- *
- * @return >= 0 on success, a negative AVERROR code on failure
- */
-int av_vda_default_init(AVCodecContext *avctx);
-
-/**
- * This is a convenience function that creates and sets up the VDA context using
- * an internal implementation.
- *
- * @param avctx the corresponding codec context
- * @param vdactx the VDA context to use
- *
- * @return >= 0 on success, a negative AVERROR code on failure
- */
-int av_vda_default_init2(AVCodecContext *avctx, AVVDAContext *vdactx);
-
-/**
- * This function must be called to free the VDA context initialized with
- * av_vda_default_init().
- *
- * @param avctx the corresponding codec context
- */
-void av_vda_default_free(AVCodecContext *avctx);
-
-/**
- * @}
- */
-
-#endif /* AVCODEC_VDA_H */
diff --git a/libavcodec/vda_h264.c b/libavcodec/vda_h264.c
deleted file mode 100644
index 4dfe532..0000000
--- a/libavcodec/vda_h264.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * VDA H.264 hardware acceleration
- *
- * copyright (c) 2011 Sebastien Zwickert
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <CoreFoundation/CFNumber.h>
-#include <CoreFoundation/CFData.h>
-#include <CoreFoundation/CFString.h>
-
-#include "libavutil/avutil.h"
-#include "h264dec.h"
-#include "internal.h"
-#include "vda.h"
-#include "vda_internal.h"
-
-typedef struct VDAContext {
-    // The current bitstream buffer.
-    uint8_t             *bitstream;
-
-    // The current size of the bitstream.
-    int                  bitstream_size;
-
-    // The reference size used for fast reallocation.
-    int                  allocated_size;
-
-    CVImageBufferRef frame;
-} VDAContext;
-
-/* Decoder callback that adds the VDA frame to the queue in display order. */
-static void vda_decoder_callback(void *vda_hw_ctx,
-                                 CFDictionaryRef user_info,
-                                 OSStatus status,
-                                 uint32_t infoFlags,
-                                 CVImageBufferRef image_buffer)
-{
-    struct vda_context *vda_ctx = vda_hw_ctx;
-
-    if (!image_buffer)
-        return;
-
-    if (vda_ctx->cv_pix_fmt_type != CVPixelBufferGetPixelFormatType(image_buffer))
-        return;
-
-    vda_ctx->cv_buffer = CVPixelBufferRetain(image_buffer);
-}
-
-static int vda_sync_decode(VDAContext *ctx, struct vda_context *vda_ctx)
-{
-    OSStatus status;
-    CFDataRef coded_frame;
-    uint32_t flush_flags = 1 << 0; ///< kVDADecoderFlush_emitFrames
-
-    coded_frame = CFDataCreate(kCFAllocatorDefault,
-                               ctx->bitstream,
-                               ctx->bitstream_size);
-
-    status = VDADecoderDecode(vda_ctx->decoder, 0, coded_frame, NULL);
-
-    if (kVDADecoderNoErr == status)
-        status = VDADecoderFlush(vda_ctx->decoder, flush_flags);
-
-    CFRelease(coded_frame);
-
-    return status;
-}
-
-
-static int vda_old_h264_start_frame(AVCodecContext *avctx,
-                                av_unused const uint8_t *buffer,
-                                av_unused uint32_t size)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
-
-    if (!vda_ctx->decoder)
-        return -1;
-
-    vda->bitstream_size = 0;
-
-    return 0;
-}
-
-static int vda_old_h264_decode_slice(AVCodecContext *avctx,
-                                 const uint8_t *buffer,
-                                 uint32_t size)
-{
-    VDAContext *vda                     = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
-    void *tmp;
-
-    if (!vda_ctx->decoder)
-        return -1;
-
-    tmp = av_fast_realloc(vda->bitstream,
-                          &vda->allocated_size,
-                          vda->bitstream_size + size + 4);
-    if (!tmp)
-        return AVERROR(ENOMEM);
-
-    vda->bitstream = tmp;
-
-    AV_WB32(vda->bitstream + vda->bitstream_size, size);
-    memcpy(vda->bitstream + vda->bitstream_size + 4, buffer, size);
-
-    vda->bitstream_size += size + 4;
-
-    return 0;
-}
-
-static int vda_old_h264_end_frame(AVCodecContext *avctx)
-{
-    H264Context *h                      = avctx->priv_data;
-    VDAContext *vda                     = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
-    AVFrame *frame                      = h->cur_pic_ptr->f;
-    int status;
-
-    if (!vda_ctx->decoder || !vda->bitstream)
-        return -1;
-
-    status = vda_sync_decode(vda, vda_ctx);
-    frame->data[3] = (void*)vda_ctx->cv_buffer;
-
-    if (status)
-        av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%d)\n", status);
-
-    return status;
-}
-
-int ff_vda_create_decoder(struct vda_context *vda_ctx,
-                          uint8_t *extradata,
-                          int extradata_size)
-{
-    OSStatus status = kVDADecoderNoErr;
-    CFNumberRef height;
-    CFNumberRef width;
-    CFNumberRef format;
-    CFDataRef avc_data;
-    CFMutableDictionaryRef config_info;
-    CFMutableDictionaryRef buffer_attributes;
-    CFMutableDictionaryRef io_surface_properties;
-    CFNumberRef cv_pix_fmt;
-
-    /* Each VCL NAL in the bitstream sent to the decoder
-     * is preceded by a 4 bytes length header.
-     * Change the avcC atom header if needed, to signal headers of 4 bytes. */
-    if (extradata_size >= 4 && (extradata[4] & 0x03) != 0x03) {
-        uint8_t *rw_extradata;
-
-        if (!(rw_extradata = av_malloc(extradata_size)))
-            return AVERROR(ENOMEM);
-
-        memcpy(rw_extradata, extradata, extradata_size);
-
-        rw_extradata[4] |= 0x03;
-
-        avc_data = CFDataCreate(kCFAllocatorDefault, rw_extradata, extradata_size);
-
-        av_freep(&rw_extradata);
-    } else {
-        avc_data = CFDataCreate(kCFAllocatorDefault, extradata, extradata_size);
-    }
-
-    config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
-                                            4,
-                                            &kCFTypeDictionaryKeyCallBacks,
-                                            &kCFTypeDictionaryValueCallBacks);
-
-    height   = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &vda_ctx->height);
-    width    = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &vda_ctx->width);
-    format   = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &vda_ctx->format);
-
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_Height, height);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_Width, width);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_SourceFormat, format);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_avcCData, avc_data);
-
-    buffer_attributes = CFDictionaryCreateMutable(kCFAllocatorDefault,
-                                                  2,
-                                                  &kCFTypeDictionaryKeyCallBacks,
-                                                  &kCFTypeDictionaryValueCallBacks);
-    io_surface_properties = CFDictionaryCreateMutable(kCFAllocatorDefault,
-                                                      0,
-                                                      &kCFTypeDictionaryKeyCallBacks,
-                                                      &kCFTypeDictionaryValueCallBacks);
-    cv_pix_fmt      = CFNumberCreate(kCFAllocatorDefault,
-                                     kCFNumberSInt32Type,
-                                     &vda_ctx->cv_pix_fmt_type);
-    CFDictionarySetValue(buffer_attributes,
-                         kCVPixelBufferPixelFormatTypeKey,
-                         cv_pix_fmt);
-    CFDictionarySetValue(buffer_attributes,
-                         kCVPixelBufferIOSurfacePropertiesKey,
-                         io_surface_properties);
-
-    status = VDADecoderCreate(config_info,
-                              buffer_attributes,
-                              (VDADecoderOutputCallback *)vda_decoder_callback,
-                              vda_ctx,
-                              &vda_ctx->decoder);
-
-    CFRelease(height);
-    CFRelease(width);
-    CFRelease(format);
-    CFRelease(avc_data);
-    CFRelease(config_info);
-    CFRelease(io_surface_properties);
-    CFRelease(cv_pix_fmt);
-    CFRelease(buffer_attributes);
-
-    return status;
-}
-
-int ff_vda_destroy_decoder(struct vda_context *vda_ctx)
-{
-    OSStatus status = kVDADecoderNoErr;
-
-    if (vda_ctx->decoder)
-        status = VDADecoderDestroy(vda_ctx->decoder);
-
-    return status;
-}
-
-static int vda_h264_uninit(AVCodecContext *avctx)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-    av_freep(&vda->bitstream);
-    if (vda->frame)
-        CVPixelBufferRelease(vda->frame);
-    return 0;
-}
-
-const AVHWAccel ff_h264_vda_old_hwaccel = {
-    .name           = "h264_vda",
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_H264,
-    .pix_fmt        = AV_PIX_FMT_VDA_VLD,
-    .start_frame    = vda_old_h264_start_frame,
-    .decode_slice   = vda_old_h264_decode_slice,
-    .end_frame      = vda_old_h264_end_frame,
-    .uninit         = vda_h264_uninit,
-    .priv_data_size = sizeof(VDAContext),
-};
-
-void ff_vda_output_callback(void *opaque,
-                            CFDictionaryRef user_info,
-                            OSStatus status,
-                            uint32_t infoFlags,
-                            CVImageBufferRef image_buffer)
-{
-    AVCodecContext *ctx = opaque;
-    VDAContext *vda = ctx->internal->hwaccel_priv_data;
-
-
-    if (vda->frame) {
-        CVPixelBufferRelease(vda->frame);
-        vda->frame = NULL;
-    }
-
-    if (!image_buffer)
-        return;
-
-    vda->frame = CVPixelBufferRetain(image_buffer);
-}
-
-static int vda_h264_start_frame(AVCodecContext *avctx,
-                                const uint8_t *buffer,
-                                uint32_t size)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-
-    vda->bitstream_size = 0;
-
-    return 0;
-}
-
-static int vda_h264_decode_slice(AVCodecContext *avctx,
-                                 const uint8_t *buffer,
-                                 uint32_t size)
-{
-    VDAContext *vda       = avctx->internal->hwaccel_priv_data;
-    void *tmp;
-
-    tmp = av_fast_realloc(vda->bitstream,
-                          &vda->allocated_size,
-                          vda->bitstream_size + size + 4);
-    if (!tmp)
-        return AVERROR(ENOMEM);
-
-    vda->bitstream = tmp;
-
-    AV_WB32(vda->bitstream + vda->bitstream_size, size);
-    memcpy(vda->bitstream + vda->bitstream_size + 4, buffer, size);
-
-    vda->bitstream_size += size + 4;
-
-    return 0;
-}
-
-static void release_buffer(void *opaque, uint8_t *data)
-{
-    CVImageBufferRef frame = (CVImageBufferRef)data;
-    CVPixelBufferRelease(frame);
-}
-
-static int vda_h264_end_frame(AVCodecContext *avctx)
-{
-    H264Context *h        = avctx->priv_data;
-    VDAContext *vda       = avctx->internal->hwaccel_priv_data;
-    AVVDAContext *vda_ctx = avctx->hwaccel_context;
-    AVFrame *frame        = h->cur_pic_ptr->f;
-    uint32_t flush_flags  = 1 << 0; ///< kVDADecoderFlush_emitFrames
-    CFDataRef coded_frame;
-    OSStatus status;
-
-    if (!vda->bitstream_size)
-        return AVERROR_INVALIDDATA;
-
-
-    coded_frame = CFDataCreate(kCFAllocatorDefault,
-                               vda->bitstream,
-                               vda->bitstream_size);
-
-    status = VDADecoderDecode(vda_ctx->decoder, 0, coded_frame, NULL);
-
-    if (status == kVDADecoderNoErr)
-        status = VDADecoderFlush(vda_ctx->decoder, flush_flags);
-
-    CFRelease(coded_frame);
-
-    if (!vda->frame)
-        return AVERROR_UNKNOWN;
-
-    if (status != kVDADecoderNoErr) {
-        av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%d)\n", status);
-        return AVERROR_UNKNOWN;
-    }
-
-    av_buffer_unref(&frame->buf[0]);
-
-    frame->buf[0] = av_buffer_create((uint8_t*)vda->frame,
-                                     sizeof(vda->frame),
-                                     release_buffer, NULL,
-                                     AV_BUFFER_FLAG_READONLY);
-    if (!frame->buf[0])
-        return AVERROR(ENOMEM);
-
-    frame->data[3] = (uint8_t*)vda->frame;
-    vda->frame = NULL;
-
-    return 0;
-}
-
-int ff_vda_default_init(AVCodecContext *avctx)
-{
-    AVVDAContext *vda_ctx = avctx->hwaccel_context;
-    OSStatus status = kVDADecoderNoErr;
-    CFNumberRef height;
-    CFNumberRef width;
-    CFNumberRef format;
-    CFDataRef avc_data;
-    CFMutableDictionaryRef config_info;
-    CFMutableDictionaryRef buffer_attributes;
-    CFMutableDictionaryRef io_surface_properties;
-    CFNumberRef cv_pix_fmt;
-    int32_t fmt = 'avc1', pix_fmt = vda_ctx->cv_pix_fmt_type;
-
-    // kCVPixelFormatType_420YpCbCr8Planar;
-
-    /* Each VCL NAL in the bitstream sent to the decoder
-     * is preceded by a 4 bytes length header.
-     * Change the avcC atom header if needed, to signal headers of 4 bytes. */
-    if (avctx->extradata_size >= 4 && (avctx->extradata[4] & 0x03) != 0x03) {
-        uint8_t *rw_extradata;
-
-        if (!(rw_extradata = av_malloc(avctx->extradata_size)))
-            return AVERROR(ENOMEM);
-
-        memcpy(rw_extradata, avctx->extradata, avctx->extradata_size);
-
-        rw_extradata[4] |= 0x03;
-
-        avc_data = CFDataCreate(kCFAllocatorDefault, rw_extradata, avctx->extradata_size);
-
-        av_freep(&rw_extradata);
-    } else {
-        avc_data = CFDataCreate(kCFAllocatorDefault,
-                                avctx->extradata, avctx->extradata_size);
-    }
-
-    config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
-                                            4,
-                                            &kCFTypeDictionaryKeyCallBacks,
-                                            &kCFTypeDictionaryValueCallBacks);
-
-    height = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &avctx->height);
-    width  = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &avctx->width);
-    format = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &fmt);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_Height, height);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_Width, width);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_avcCData, avc_data);
-    CFDictionarySetValue(config_info, kVDADecoderConfiguration_SourceFormat, format);
-
-    buffer_attributes = CFDictionaryCreateMutable(kCFAllocatorDefault,
-                                                  2,
-                                                  &kCFTypeDictionaryKeyCallBacks,
-                                                  &kCFTypeDictionaryValueCallBacks);
-    io_surface_properties = CFDictionaryCreateMutable(kCFAllocatorDefault,
-                                                      0,
-                                                      &kCFTypeDictionaryKeyCallBacks,
-                                                      &kCFTypeDictionaryValueCallBacks);
-    cv_pix_fmt      = CFNumberCreate(kCFAllocatorDefault,
-                                     kCFNumberSInt32Type,
-                                     &pix_fmt);
-
-    CFDictionarySetValue(buffer_attributes,
-                         kCVPixelBufferPixelFormatTypeKey,
-                         cv_pix_fmt);
-    CFDictionarySetValue(buffer_attributes,
-                         kCVPixelBufferIOSurfacePropertiesKey,
-                         io_surface_properties);
-
-    status = VDADecoderCreate(config_info,
-                              buffer_attributes,
-                              (VDADecoderOutputCallback *)ff_vda_output_callback,
-                              avctx,
-                              &vda_ctx->decoder);
-
-    CFRelease(format);
-    CFRelease(height);
-    CFRelease(width);
-    CFRelease(avc_data);
-    CFRelease(config_info);
-    CFRelease(cv_pix_fmt);
-    CFRelease(io_surface_properties);
-    CFRelease(buffer_attributes);
-
-    if (status != kVDADecoderNoErr) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot initialize VDA %d\n", status);
-    }
-
-    switch (status) {
-    case kVDADecoderHardwareNotSupportedErr:
-    case kVDADecoderFormatNotSupportedErr:
-        return AVERROR(ENOSYS);
-    case kVDADecoderConfigurationError:
-        return AVERROR(EINVAL);
-    case kVDADecoderDecoderFailedErr:
-        return AVERROR_INVALIDDATA;
-    case kVDADecoderNoErr:
-        return 0;
-    default:
-        return AVERROR_UNKNOWN;
-    }
-}
-
-static int vda_h264_alloc_frame(AVCodecContext *avctx, AVFrame *frame)
-{
-    frame->width  = avctx->width;
-    frame->height = avctx->height;
-    frame->format = avctx->pix_fmt;
-    frame->buf[0] = av_buffer_alloc(1);
-
-    if (!frame->buf[0])
-        return AVERROR(ENOMEM);
-    return 0;
-}
-
-const AVHWAccel ff_h264_vda_hwaccel = {
-    .name           = "h264_vda",
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_H264,
-    .pix_fmt        = AV_PIX_FMT_VDA,
-    .alloc_frame    = vda_h264_alloc_frame,
-    .start_frame    = vda_h264_start_frame,
-    .decode_slice   = vda_h264_decode_slice,
-    .end_frame      = vda_h264_end_frame,
-    .uninit         = vda_h264_uninit,
-    .priv_data_size = sizeof(VDAContext),
-};
diff --git a/libavcodec/vda_internal.h b/libavcodec/vda_internal.h
deleted file mode 100644
index 9d0ed80..0000000
--- a/libavcodec/vda_internal.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_VDA_INTERNAL_H
-#define AVCODEC_VDA_INTERNAL_H
-
-#include "vda.h"
-
-void ff_vda_output_callback(void *vda_hw_ctx,
-                            CFDictionaryRef user_info,
-                            OSStatus status,
-                            uint32_t infoFlags,
-                            CVImageBufferRef image_buffer);
-
-int ff_vda_default_init(AVCodecContext *avctx);
-void ff_vda_default_free(AVCodecContext *avctx);
-
-#endif /* AVCODEC_VDA_INTERNAL_H */
diff --git a/libavcodec/vdpau.c b/libavcodec/vdpau.c
index da6fc1e..167f06d 100644
--- a/libavcodec/vdpau.c
+++ b/libavcodec/vdpau.c
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,9 @@
 #include "vdpau.h"
 #include "vdpau_internal.h"
 
+// XXX: at the time of adding this ifdefery, av_assert* wasn't use outside.
+// When dropping it, make sure other av_assert* were not added since then.
+
 /**
  * @addtogroup VDPAU_Decoding
  *
@@ -61,6 +64,13 @@ static int vdpau_error(VdpStatus status)
     }
 }
 
+AVVDPAUContext *av_alloc_vdpaucontext(void)
+{
+    return av_vdpau_alloc_context();
+}
+
+MAKE_ACCESSORS(AVVDPAUContext, vdpau_hwaccel, AVVDPAU_Render2, render2)
+
 int av_vdpau_get_surface_parameters(AVCodecContext *avctx,
                                     VdpChromaType *type,
                                     uint32_t *width, uint32_t *height)
@@ -128,6 +138,8 @@ int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
     VdpVideoSurfaceQueryCapabilities *surface_query_caps;
     VdpDecoderQueryCapabilities *decoder_query_caps;
     VdpDecoderCreate *create;
+    VdpGetInformationString *info;
+    const char *info_string;
     void *func;
     VdpStatus status;
     VdpBool supported;
@@ -184,6 +196,27 @@ int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
         return AVERROR(ENOTSUP);
 
     status = vdctx->get_proc_address(vdctx->device,
+                                     VDP_FUNC_ID_GET_INFORMATION_STRING,
+                                     &func);
+    if (status != VDP_STATUS_OK)
+        return vdpau_error(status);
+    else
+        info = func;
+
+    status = info(&info_string);
+    if (status != VDP_STATUS_OK)
+        return vdpau_error(status);
+    if (avctx->codec_id == AV_CODEC_ID_HEVC && strncmp(info_string, "NVIDIA ", 7) == 0 &&
+        !(avctx->hwaccel_flags & AV_HWACCEL_FLAG_ALLOW_PROFILE_MISMATCH)) {
+        int driver_version = 0;
+        sscanf(info_string, "NVIDIA VDPAU Driver Shared Library  %d", &driver_version);
+        if (driver_version < 410) {
+            av_log(avctx, AV_LOG_VERBOSE, "HEVC with NVIDIA VDPAU drivers is buggy, skipping.\n");
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    status = vdctx->get_proc_address(vdctx->device,
                                      VDP_FUNC_ID_VIDEO_SURFACE_QUERY_CAPABILITIES,
                                      &func);
     if (status != VDP_STATUS_OK)
@@ -300,6 +333,7 @@ int ff_vdpau_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
                               struct vdpau_picture_context *pic_ctx)
 {
     VDPAUContext *vdctx = avctx->internal->hwaccel_priv_data;
+    AVVDPAUContext *hwctx = avctx->hwaccel_context;
     VdpVideoSurface surf = ff_vdpau_get_surface_id(frame);
     VdpStatus status;
     int val;
@@ -308,11 +342,16 @@ int ff_vdpau_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
     if (val < 0)
         return val;
 
+    if (hwctx && !hwctx->render && hwctx->render2) {
+        status = hwctx->render2(avctx, frame, (void *)&pic_ctx->info,
+                                pic_ctx->bitstream_buffers_used, pic_ctx->bitstream_buffers);
+    } else
     status = vdctx->render(vdctx->decoder, surf, &pic_ctx->info,
                            pic_ctx->bitstream_buffers_used,
                            pic_ctx->bitstream_buffers);
 
     av_freep(&pic_ctx->bitstream_buffers);
+
     return vdpau_error(status);
 }
 
@@ -405,7 +444,7 @@ do {                                       \
 
 AVVDPAUContext *av_vdpau_alloc_context(void)
 {
-    return av_mallocz(sizeof(AVVDPAUContext));
+    return av_mallocz(sizeof(VDPAUHWContext));
 }
 
 int av_vdpau_bind_context(AVCodecContext *avctx, VdpDevice device,
diff --git a/libavcodec/vdpau.h b/libavcodec/vdpau.h
index a5d31cb..4d99943 100644
--- a/libavcodec/vdpau.h
+++ b/libavcodec/vdpau.h
@@ -4,20 +4,20 @@
  *
  * Copyright (C) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,7 +39,7 @@
  * - VDPAU decoding
  * - VDPAU presentation
  *
- * The VDPAU decoding module parses all headers using Libav
+ * The VDPAU decoding module parses all headers using FFmpeg
  * parsing mechanisms and uses VDPAU for the actual decoding.
  *
  * As per the current implementation, the actual decoding
@@ -51,15 +51,24 @@
 
 #include <vdpau/vdpau.h>
 
+#include "libavutil/avconfig.h"
 #include "libavutil/attributes.h"
 
 #include "avcodec.h"
 #include "version.h"
 
+struct AVCodecContext;
+struct AVFrame;
+
+typedef int (*AVVDPAU_Render2)(struct AVCodecContext *, struct AVFrame *,
+                               const VdpPictureInfo *, uint32_t,
+                               const VdpBitstreamBuffer *);
+
 /**
  * This structure is used to share data between the libavcodec library and
  * the client video application.
- * The user shall zero-allocate the structure and make it available as
+ * The user shall allocate the structure via the av_alloc_vdpau_hwaccel
+ * function and make it available as
  * AVCodecContext.hwaccel_context. Members can be set by the user once
  * during initialization or through each AVCodecContext.get_buffer()
  * function call. In any case, they must be valid prior to calling
@@ -83,9 +92,21 @@ typedef struct AVVDPAUContext {
      * Set by the user.
      */
     VdpDecoderRender *render;
+
+    AVVDPAU_Render2 render2;
 } AVVDPAUContext;
 
 /**
+ * @brief allocation function for AVVDPAUContext
+ *
+ * Allows extending the struct without breaking API/ABI
+ */
+AVVDPAUContext *av_alloc_vdpaucontext(void);
+
+AVVDPAU_Render2 av_vdpau_hwaccel_get_render2(const AVVDPAUContext *);
+void av_vdpau_hwaccel_set_render2(AVVDPAUContext *, AVVDPAU_Render2);
+
+/**
  * Associate a VDPAU device with a codec context for hardware acceleration.
  * This function is meant to be called from the get_format() codec callback,
  * or earlier. It can also be called after avcodec_flush_buffers() to change
diff --git a/libavcodec/vdpau_h264.c b/libavcodec/vdpau_h264.c
index 7c4c977..2a260f7 100644
--- a/libavcodec/vdpau_h264.c
+++ b/libavcodec/vdpau_h264.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_hevc.c b/libavcodec/vdpau_hevc.c
index 3b575eb..421135b 100644
--- a/libavcodec/vdpau_hevc.c
+++ b/libavcodec/vdpau_hevc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Philip Langdale
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -236,7 +236,7 @@ static int vdpau_hevc_start_frame(AVCodecContext *avctx,
         const HEVCFrame *frame = &h->DPB[i];
         if (frame != h->ref && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF |
                                                 HEVC_FRAME_FLAG_SHORT_REF))) {
-            if (j > 16) {
+            if (j > 15) {
                 av_log(avctx, AV_LOG_WARNING,
                      "VDPAU only supports up to 16 references in the DPB. "
                      "This frame may not be decoded correctly.\n");
diff --git a/libavcodec/vdpau_internal.h b/libavcodec/vdpau_internal.h
index 8194a9c..4d63e50 100644
--- a/libavcodec/vdpau_internal.h
+++ b/libavcodec/vdpau_internal.h
@@ -4,20 +4,20 @@
  *
  * Copyright (C) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_mpeg12.c b/libavcodec/vdpau_mpeg12.c
index bc9ff5d..d286e7e 100644
--- a/libavcodec/vdpau_mpeg12.c
+++ b/libavcodec/vdpau_mpeg12.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_mpeg4.c b/libavcodec/vdpau_mpeg4.c
index 87db6a7..96f8302 100644
--- a/libavcodec/vdpau_mpeg4.c
+++ b/libavcodec/vdpau_mpeg4.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -97,6 +97,9 @@ static int vdpau_mpeg4_init(AVCodecContext *avctx)
     case FF_PROFILE_MPEG4_SIMPLE:
         profile = VDP_DECODER_PROFILE_MPEG4_PART2_SP;
         break;
+    // As any ASP decoder must be able to decode SP, this
+    // should be a safe fallback if profile is unknown/unspecified.
+    case FF_PROFILE_UNKNOWN:
     case FF_PROFILE_MPEG4_ADVANCED_SIMPLE:
         profile = VDP_DECODER_PROFILE_MPEG4_PART2_ASP;
         break;
diff --git a/libavcodec/vdpau_vc1.c b/libavcodec/vdpau_vc1.c
index 1b55588..671baf9 100644
--- a/libavcodec/vdpau_vc1.c
+++ b/libavcodec/vdpau_vc1.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,14 +45,18 @@ static int vdpau_vc1_start_frame(AVCodecContext *avctx,
 
     switch (s->pict_type) {
     case AV_PICTURE_TYPE_B:
+        if (s->next_picture_ptr) {
         ref = ff_vdpau_get_surface_id(s->next_picture.f);
         assert(ref != VDP_INVALID_HANDLE);
         info->backward_reference = ref;
+        }
         /* fall-through */
     case AV_PICTURE_TYPE_P:
+        if (s->last_picture_ptr) {
         ref = ff_vdpau_get_surface_id(s->last_picture.f);
         assert(ref != VDP_INVALID_HANDLE);
         info->forward_reference  = ref;
+        }
     }
 
     info->slice_count       = 0;
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 831a249..309e411 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,9 @@
 
 #include "libavutil/version.h"
 
-#define LIBAVCODEC_VERSION_MAJOR 58
-#define LIBAVCODEC_VERSION_MINOR 12
-#define LIBAVCODEC_VERSION_MICRO  1
+#define LIBAVCODEC_VERSION_MAJOR  58
+#define LIBAVCODEC_VERSION_MINOR  47
+#define LIBAVCODEC_VERSION_MICRO 103
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                                LIBAVCODEC_VERSION_MINOR, \
@@ -45,8 +45,21 @@
  * FF_API_* defines may be placed below to indicate public API that will be
  * dropped at a future version bump. The defines themselves are not part of
  * the public API and may change, break or disappear at any time.
+ *
+ * @note, when bumping the major version it is recommended to manually
+ * disable each FF_API_* in its own commit instead of disabling them all
+ * at once through the bump. This improves the git bisect-ability of the change.
  */
 
+#ifndef FF_API_LOWRES
+#define FF_API_LOWRES            (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_DEBUG_MV
+#define FF_API_DEBUG_MV          (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AVCTX_TIMEBASE
+#define FF_API_AVCTX_TIMEBASE    (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
 #ifndef FF_API_CODED_FRAME
 #define FF_API_CODED_FRAME       (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
@@ -77,12 +90,12 @@
 #ifndef FF_API_STAT_BITS
 #define FF_API_STAT_BITS         (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
-#ifndef FF_API_NVENC_OLD_NAME
-#define FF_API_NVENC_OLD_NAME    (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
 #ifndef FF_API_PRIVATE_OPT
 #define FF_API_PRIVATE_OPT      (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
+#ifndef FF_API_ASS_TIMING
+#define FF_API_ASS_TIMING       (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
 #ifndef FF_API_OLD_BSF
 #define FF_API_OLD_BSF          (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
@@ -92,11 +105,36 @@
 #ifndef FF_API_GET_CONTEXT_DEFAULTS
 #define FF_API_GET_CONTEXT_DEFAULTS (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
-#ifndef FF_API_VAAPI_CONTEXT
-#define FF_API_VAAPI_CONTEXT    (LIBAVCODEC_VERSION_MAJOR < 59)
+#ifndef FF_API_NVENC_OLD_NAME
+#define FF_API_NVENC_OLD_NAME    (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_STRUCT_VAAPI_CONTEXT
+#define FF_API_STRUCT_VAAPI_CONTEXT (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_MERGE_SD_API
+#define FF_API_MERGE_SD_API      (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_TAG_STRING
+#define FF_API_TAG_STRING        (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_GETCHROMA
+#define FF_API_GETCHROMA         (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_CODEC_GET_SET
+#define FF_API_CODEC_GET_SET     (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
 #ifndef FF_API_USER_VISIBLE_AVHWACCEL
-#define FF_API_USER_VISIBLE_AVHWACCEL (LIBAVCODEC_VERSION_MAJOR < 60)
+#define FF_API_USER_VISIBLE_AVHWACCEL (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
+#ifndef FF_API_LOCKMGR
+#define FF_API_LOCKMGR (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_NEXT
+#define FF_API_NEXT              (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_UNSANITIZED_BITRATES
+#define FF_API_UNSANITIZED_BITRATES (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+
 
 #endif /* AVCODEC_VERSION_H */
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c
index e6d9303..ce9e9eb 100644
--- a/libavcodec/videodsp.c
+++ b/libavcodec/videodsp.c
@@ -1,24 +1,25 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "videodsp.h"
 
@@ -51,4 +52,6 @@ av_cold void ff_videodsp_init(VideoDSPContext *ctx, int bpc)
         ff_videodsp_init_ppc(ctx, bpc);
     if (ARCH_X86)
         ff_videodsp_init_x86(ctx, bpc);
+    if (ARCH_MIPS)
+        ff_videodsp_init_mips(ctx, bpc);
 }
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h
index 04c012a..c0545f2 100644
--- a/libavcodec/videodsp.h
+++ b/libavcodec/videodsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,14 +29,25 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#define EMULATED_EDGE(depth) \
+void ff_emulated_edge_mc_ ## depth(uint8_t *dst, const uint8_t *src, \
+                                   ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                                   int block_w, int block_h,\
+                                   int src_x, int src_y, int w, int h);
+
+EMULATED_EDGE(8)
+EMULATED_EDGE(16)
+
 typedef struct VideoDSPContext {
     /**
      * Copy a rectangular area of samples to a temporary buffer and replicate
      * the border samples.
      *
-     * @param buf destination buffer
+     * @param dst destination buffer
+     * @param dst_stride number of bytes between 2 vertically adjacent samples
+     *                   in destination buffer
      * @param src source buffer
-     * @param buf_linesize number of bytes between 2 vertically adjacent
+     * @param dst_linesize number of bytes between 2 vertically adjacent
      *                     samples in the destination buffer
      * @param src_linesize number of bytes between 2 vertically adjacent
      *                     samples in both the source buffer
@@ -49,8 +60,8 @@ typedef struct VideoDSPContext {
      * @param w width of the source buffer
      * @param h height of the source buffer
      */
-    void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
-                             ptrdiff_t buf_linesize,
+    void (*emulated_edge_mc)(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t dst_linesize,
                              ptrdiff_t src_linesize,
                              int block_w, int block_h,
                              int src_x, int src_y, int w, int h);
@@ -72,5 +83,6 @@ void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc);
 void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc);
+void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc);
 
 #endif /* AVCODEC_VIDEODSP_H */
diff --git a/libavcodec/videodsp_template.c b/libavcodec/videodsp_template.c
index 28b8c32..94c1b71 100644
--- a/libavcodec/videodsp_template.c
+++ b/libavcodec/videodsp_template.c
@@ -1,42 +1,46 @@
 /*
- * Copyright (c) 2002-2004 Michael Niedermayer
+ * Copyright (c) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
-
 #include "bit_depth_template.c"
-
-static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
-                                      ptrdiff_t buf_linesize,
-                                      ptrdiff_t src_linesize,
-                                      int block_w, int block_h,
-                                      int src_x, int src_y, int w, int h)
+void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
+                               ptrdiff_t buf_linesize,
+                               ptrdiff_t src_linesize,
+                               int block_w, int block_h,
+                               int src_x, int src_y, int w, int h)
 {
     int x, y;
     int start_y, start_x, end_y, end_x;
 
+    if (!w || !h)
+        return;
+
+    av_assert2(block_w * sizeof(pixel) <= FFABS(buf_linesize));
+
     if (src_y >= h) {
-        src  += (h - 1 - src_y) * src_linesize;
+        src -= src_y * src_linesize;
+        src += (h - 1) * src_linesize;
         src_y = h - 1;
     } else if (src_y <= -block_h) {
-        src  += (1 - block_h - src_y) * src_linesize;
+        src -= src_y * src_linesize;
+        src += (1 - block_h) * src_linesize;
         src_y = 1 - block_h;
     }
     if (src_x >= w) {
@@ -51,8 +55,8 @@ static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y = FFMIN(block_h, h-src_y);
     end_x = FFMIN(block_w, w-src_x);
-    assert(start_y < end_y && block_h);
-    assert(start_x < end_x && block_w);
+    av_assert2(start_y < end_y && block_h);
+    av_assert2(start_x < end_x && block_w);
 
     w    = end_x - start_x;
     src += start_y * src_linesize + start_x * sizeof(pixel);
diff --git a/libavcodec/videotoolbox.c b/libavcodec/videotoolbox.c
new file mode 100644
index 0000000..da7236f
--- /dev/null
+++ b/libavcodec/videotoolbox.c
@@ -0,0 +1,1234 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "videotoolbox.h"
+#include "libavutil/hwcontext_videotoolbox.h"
+#include "vt_internal.h"
+#include "libavutil/avutil.h"
+#include "libavutil/hwcontext.h"
+#include "bytestream.h"
+#include "decode.h"
+#include "h264dec.h"
+#include "hevcdec.h"
+#include "mpegvideo.h"
+#include <TargetConditionals.h>
+
+#ifndef kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder
+#  define kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder CFSTR("RequireHardwareAcceleratedVideoDecoder")
+#endif
+#ifndef kVTVideoDecoderSpecification_EnableHardwareAcceleratedVideoDecoder
+#  define kVTVideoDecoderSpecification_EnableHardwareAcceleratedVideoDecoder CFSTR("EnableHardwareAcceleratedVideoDecoder")
+#endif
+
+#if !HAVE_KCMVIDEOCODECTYPE_HEVC
+enum { kCMVideoCodecType_HEVC = 'hvc1' };
+#endif
+
+#define VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING  12
+
+typedef struct VTHWFrame {
+    CVPixelBufferRef pixbuf;
+    AVBufferRef *hw_frames_ctx;
+} VTHWFrame;
+
+static void videotoolbox_buffer_release(void *opaque, uint8_t *data)
+{
+    VTHWFrame *ref = (VTHWFrame *)data;
+    av_buffer_unref(&ref->hw_frames_ctx);
+    CVPixelBufferRelease(ref->pixbuf);
+
+    av_free(data);
+}
+
+static int videotoolbox_buffer_copy(VTContext *vtctx,
+                                    const uint8_t *buffer,
+                                    uint32_t size)
+{
+    void *tmp;
+
+    tmp = av_fast_realloc(vtctx->bitstream,
+                         &vtctx->allocated_size,
+                         size);
+
+    if (!tmp)
+        return AVERROR(ENOMEM);
+
+    vtctx->bitstream = tmp;
+    memcpy(vtctx->bitstream, buffer, size);
+    vtctx->bitstream_size = size;
+
+    return 0;
+}
+
+static int videotoolbox_postproc_frame(void *avctx, AVFrame *frame)
+{
+    VTHWFrame *ref = (VTHWFrame *)frame->buf[0]->data;
+
+    if (!ref->pixbuf) {
+        av_log(avctx, AV_LOG_ERROR, "No frame decoded?\n");
+        av_frame_unref(frame);
+        return AVERROR_EXTERNAL;
+    }
+
+    frame->data[3] = (uint8_t*)ref->pixbuf;
+
+    if (ref->hw_frames_ctx) {
+        av_buffer_unref(&frame->hw_frames_ctx);
+        frame->hw_frames_ctx = av_buffer_ref(ref->hw_frames_ctx);
+        if (!frame->hw_frames_ctx)
+            return AVERROR(ENOMEM);
+    }
+
+    return 0;
+}
+
+int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    size_t      size = sizeof(VTHWFrame);
+    uint8_t    *data = NULL;
+    AVBufferRef *buf = NULL;
+    int ret = ff_attach_decode_data(frame);
+    FrameDecodeData *fdd;
+    if (ret < 0)
+        return ret;
+
+    data = av_mallocz(size);
+    if (!data)
+        return AVERROR(ENOMEM);
+    buf = av_buffer_create(data, size, videotoolbox_buffer_release, NULL, 0);
+    if (!buf) {
+        av_freep(&data);
+        return AVERROR(ENOMEM);
+    }
+    frame->buf[0] = buf;
+
+    fdd = (FrameDecodeData*)frame->private_ref->data;
+    fdd->post_process = videotoolbox_postproc_frame;
+
+    frame->width  = avctx->width;
+    frame->height = avctx->height;
+    frame->format = avctx->pix_fmt;
+
+    return 0;
+}
+
+#define AV_W8(p, v) *(p) = (v)
+
+CFDataRef ff_videotoolbox_avcc_extradata_create(AVCodecContext *avctx)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h = avctx->priv_data;
+    CFDataRef data = NULL;
+    uint8_t *p;
+    int vt_extradata_size = 6 + 2 + h->ps.sps->data_size + 3 + h->ps.pps->data_size;
+    uint8_t *vt_extradata = av_malloc(vt_extradata_size);
+    if (!vt_extradata)
+        return NULL;
+
+    p = vt_extradata;
+
+    AV_W8(p + 0, 1); /* version */
+    AV_W8(p + 1, h->ps.sps->data[1]); /* profile */
+    AV_W8(p + 2, h->ps.sps->data[2]); /* profile compat */
+    AV_W8(p + 3, h->ps.sps->data[3]); /* level */
+    AV_W8(p + 4, 0xff); /* 6 bits reserved (111111) + 2 bits nal size length - 3 (11) */
+    AV_W8(p + 5, 0xe1); /* 3 bits reserved (111) + 5 bits number of sps (00001) */
+    AV_WB16(p + 6, h->ps.sps->data_size);
+    memcpy(p + 8, h->ps.sps->data, h->ps.sps->data_size);
+    p += 8 + h->ps.sps->data_size;
+    AV_W8(p + 0, 1); /* number of pps */
+    AV_WB16(p + 1, h->ps.pps->data_size);
+    memcpy(p + 3, h->ps.pps->data, h->ps.pps->data_size);
+
+    p += 3 + h->ps.pps->data_size;
+    av_assert0(p - vt_extradata == vt_extradata_size);
+
+    // save sps header (profile/level) used to create decoder session,
+    // so we can detect changes and recreate it.
+    if (vtctx)
+        memcpy(vtctx->sps, h->ps.sps->data + 1, 3);
+
+    data = CFDataCreate(kCFAllocatorDefault, vt_extradata, vt_extradata_size);
+    av_free(vt_extradata);
+    return data;
+}
+
+CFDataRef ff_videotoolbox_hvcc_extradata_create(AVCodecContext *avctx)
+{
+    HEVCContext *h = avctx->priv_data;
+    const HEVCVPS *vps = (const HEVCVPS *)h->ps.vps_list[0]->data;
+    const HEVCSPS *sps = (const HEVCSPS *)h->ps.sps_list[0]->data;
+    int i, num_pps = 0;
+    const HEVCPPS *pps = h->ps.pps;
+    PTLCommon ptlc = vps->ptl.general_ptl;
+    VUI vui = sps->vui;
+    uint8_t parallelismType;
+    CFDataRef data = NULL;
+    uint8_t *p;
+    int vt_extradata_size = 23 + 5 + vps->data_size + 5 + sps->data_size + 3;
+    uint8_t *vt_extradata;
+
+    for (i = 0; i < HEVC_MAX_PPS_COUNT; i++) {
+        if (h->ps.pps_list[i]) {
+            const HEVCPPS *pps = (const HEVCPPS *)h->ps.pps_list[i]->data;
+            vt_extradata_size += 2 + pps->data_size;
+            num_pps++;
+        }
+    }
+
+    vt_extradata = av_malloc(vt_extradata_size);
+    if (!vt_extradata)
+        return NULL;
+    p = vt_extradata;
+
+    /* unsigned int(8) configurationVersion = 1; */
+    AV_W8(p + 0, 1);
+
+    /*
+     * unsigned int(2) general_profile_space;
+     * unsigned int(1) general_tier_flag;
+     * unsigned int(5) general_profile_idc;
+     */
+    AV_W8(p + 1, ptlc.profile_space << 6 |
+                 ptlc.tier_flag     << 5 |
+                 ptlc.profile_idc);
+
+    /* unsigned int(32) general_profile_compatibility_flags; */
+    memcpy(p + 2, ptlc.profile_compatibility_flag, 4);
+
+    /* unsigned int(48) general_constraint_indicator_flags; */
+    AV_W8(p + 6, ptlc.progressive_source_flag    << 7 |
+                 ptlc.interlaced_source_flag     << 6 |
+                 ptlc.non_packed_constraint_flag << 5 |
+                 ptlc.frame_only_constraint_flag << 4);
+    AV_W8(p + 7, 0);
+    AV_WN32(p + 8, 0);
+
+    /* unsigned int(8) general_level_idc; */
+    AV_W8(p + 12, ptlc.level_idc);
+
+    /*
+     * bit(4) reserved = ‘1111’b;
+     * unsigned int(12) min_spatial_segmentation_idc;
+     */
+    AV_W8(p + 13, 0xf0 | (vui.min_spatial_segmentation_idc >> 4));
+    AV_W8(p + 14, vui.min_spatial_segmentation_idc & 0xff);
+
+    /*
+     * bit(6) reserved = ‘111111’b;
+     * unsigned int(2) parallelismType;
+     */
+    if (!vui.min_spatial_segmentation_idc)
+        parallelismType = 0;
+    else if (pps->entropy_coding_sync_enabled_flag && pps->tiles_enabled_flag)
+        parallelismType = 0;
+    else if (pps->entropy_coding_sync_enabled_flag)
+        parallelismType = 3;
+    else if (pps->tiles_enabled_flag)
+        parallelismType = 2;
+    else
+        parallelismType = 1;
+    AV_W8(p + 15, 0xfc | parallelismType);
+
+    /*
+     * bit(6) reserved = ‘111111’b;
+     * unsigned int(2) chromaFormat;
+     */
+    AV_W8(p + 16, sps->chroma_format_idc | 0xfc);
+
+    /*
+     * bit(5) reserved = ‘11111’b;
+     * unsigned int(3) bitDepthLumaMinus8;
+     */
+    AV_W8(p + 17, (sps->bit_depth - 8) | 0xfc);
+
+    /*
+     * bit(5) reserved = ‘11111’b;
+     * unsigned int(3) bitDepthChromaMinus8;
+     */
+    AV_W8(p + 18, (sps->bit_depth_chroma - 8) | 0xfc);
+
+    /* bit(16) avgFrameRate; */
+    AV_WB16(p + 19, 0);
+
+    /*
+     * bit(2) constantFrameRate;
+     * bit(3) numTemporalLayers;
+     * bit(1) temporalIdNested;
+     * unsigned int(2) lengthSizeMinusOne;
+     */
+    AV_W8(p + 21, 0                             << 6 |
+                  sps->max_sub_layers           << 3 |
+                  sps->temporal_id_nesting_flag << 2 |
+                  3);
+
+    /* unsigned int(8) numOfArrays; */
+    AV_W8(p + 22, 3);
+
+    p += 23;
+    /* vps */
+    /*
+     * bit(1) array_completeness;
+     * unsigned int(1) reserved = 0;
+     * unsigned int(6) NAL_unit_type;
+     */
+    AV_W8(p, 1 << 7 |
+             HEVC_NAL_VPS & 0x3f);
+    /* unsigned int(16) numNalus; */
+    AV_WB16(p + 1, 1);
+    /* unsigned int(16) nalUnitLength; */
+    AV_WB16(p + 3, vps->data_size);
+    /* bit(8*nalUnitLength) nalUnit; */
+    memcpy(p + 5, vps->data, vps->data_size);
+    p += 5 + vps->data_size;
+
+    /* sps */
+    AV_W8(p, 1 << 7 |
+             HEVC_NAL_SPS & 0x3f);
+    AV_WB16(p + 1, 1);
+    AV_WB16(p + 3, sps->data_size);
+    memcpy(p + 5, sps->data, sps->data_size);
+    p += 5 + sps->data_size;
+
+    /* pps */
+    AV_W8(p, 1 << 7 |
+             HEVC_NAL_PPS & 0x3f);
+    AV_WB16(p + 1, num_pps);
+    p += 3;
+    for (i = 0; i < HEVC_MAX_PPS_COUNT; i++) {
+        if (h->ps.pps_list[i]) {
+            const HEVCPPS *pps = (const HEVCPPS *)h->ps.pps_list[i]->data;
+            AV_WB16(p, pps->data_size);
+            memcpy(p + 2, pps->data, pps->data_size);
+            p += 2 + pps->data_size;
+        }
+    }
+
+    av_assert0(p - vt_extradata == vt_extradata_size);
+
+    data = CFDataCreate(kCFAllocatorDefault, vt_extradata, vt_extradata_size);
+    av_free(vt_extradata);
+    return data;
+}
+
+int ff_videotoolbox_h264_start_frame(AVCodecContext *avctx,
+                                     const uint8_t *buffer,
+                                     uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h = avctx->priv_data;
+
+    if (h->is_avc == 1) {
+        return videotoolbox_buffer_copy(vtctx, buffer, size);
+    }
+
+    return 0;
+}
+
+static int videotoolbox_h264_decode_params(AVCodecContext *avctx,
+                                           int type,
+                                           const uint8_t *buffer,
+                                           uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h = avctx->priv_data;
+
+    // save sps header (profile/level) used to create decoder session
+    if (!vtctx->sps[0])
+        memcpy(vtctx->sps, h->ps.sps->data + 1, 3);
+
+    if (type == H264_NAL_SPS) {
+        if (size > 4 && memcmp(vtctx->sps, buffer + 1, 3) != 0) {
+            vtctx->reconfig_needed = true;
+            memcpy(vtctx->sps, buffer + 1, 3);
+        }
+    }
+
+    // pass-through SPS/PPS changes to the decoder
+    return ff_videotoolbox_h264_decode_slice(avctx, buffer, size);
+}
+
+static int videotoolbox_common_decode_slice(AVCodecContext *avctx,
+                                            const uint8_t *buffer,
+                                            uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    void *tmp;
+
+    tmp = av_fast_realloc(vtctx->bitstream,
+                          &vtctx->allocated_size,
+                          vtctx->bitstream_size+size+4);
+    if (!tmp)
+        return AVERROR(ENOMEM);
+
+    vtctx->bitstream = tmp;
+
+    AV_WB32(vtctx->bitstream + vtctx->bitstream_size, size);
+    memcpy(vtctx->bitstream + vtctx->bitstream_size + 4, buffer, size);
+
+    vtctx->bitstream_size += size + 4;
+
+    return 0;
+}
+
+int ff_videotoolbox_h264_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t *buffer,
+                                      uint32_t size)
+{
+    H264Context *h = avctx->priv_data;
+
+    if (h->is_avc == 1)
+        return 0;
+
+    return videotoolbox_common_decode_slice(avctx, buffer, size);
+}
+
+int ff_videotoolbox_uninit(AVCodecContext *avctx)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    if (vtctx) {
+        av_freep(&vtctx->bitstream);
+        if (vtctx->frame)
+            CVPixelBufferRelease(vtctx->frame);
+    }
+
+    return 0;
+}
+
+#if CONFIG_VIDEOTOOLBOX
+// Return the AVVideotoolboxContext that matters currently. Where it comes from
+// depends on the API used.
+static AVVideotoolboxContext *videotoolbox_get_context(AVCodecContext *avctx)
+{
+    // Somewhat tricky because the user can call av_videotoolbox_default_free()
+    // at any time, even when the codec is closed.
+    if (avctx->internal && avctx->internal->hwaccel_priv_data) {
+        VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+        if (vtctx->vt_ctx)
+            return vtctx->vt_ctx;
+    }
+    return avctx->hwaccel_context;
+}
+
+static int videotoolbox_buffer_create(AVCodecContext *avctx, AVFrame *frame)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    CVPixelBufferRef pixbuf = (CVPixelBufferRef)vtctx->frame;
+    OSType pixel_format = CVPixelBufferGetPixelFormatType(pixbuf);
+    enum AVPixelFormat sw_format = av_map_videotoolbox_format_to_pixfmt(pixel_format);
+    int width = CVPixelBufferGetWidth(pixbuf);
+    int height = CVPixelBufferGetHeight(pixbuf);
+    AVHWFramesContext *cached_frames;
+    VTHWFrame *ref;
+    int ret;
+
+    if (!frame->buf[0] || frame->data[3]) {
+        av_log(avctx, AV_LOG_ERROR, "videotoolbox: invalid state\n");
+        av_frame_unref(frame);
+        return AVERROR_EXTERNAL;
+    }
+
+    ref = (VTHWFrame *)frame->buf[0]->data;
+
+    if (ref->pixbuf)
+        CVPixelBufferRelease(ref->pixbuf);
+    ref->pixbuf = vtctx->frame;
+    vtctx->frame = NULL;
+
+    // Old API code path.
+    if (!vtctx->cached_hw_frames_ctx)
+        return 0;
+
+    cached_frames = (AVHWFramesContext*)vtctx->cached_hw_frames_ctx->data;
+
+    if (cached_frames->sw_format != sw_format ||
+        cached_frames->width != width ||
+        cached_frames->height != height) {
+        AVBufferRef *hw_frames_ctx = av_hwframe_ctx_alloc(cached_frames->device_ref);
+        AVHWFramesContext *hw_frames;
+        if (!hw_frames_ctx)
+            return AVERROR(ENOMEM);
+
+        hw_frames = (AVHWFramesContext*)hw_frames_ctx->data;
+        hw_frames->format = cached_frames->format;
+        hw_frames->sw_format = sw_format;
+        hw_frames->width = width;
+        hw_frames->height = height;
+
+        ret = av_hwframe_ctx_init(hw_frames_ctx);
+        if (ret < 0) {
+            av_buffer_unref(&hw_frames_ctx);
+            return ret;
+        }
+
+        av_buffer_unref(&vtctx->cached_hw_frames_ctx);
+        vtctx->cached_hw_frames_ctx = hw_frames_ctx;
+    }
+
+    av_buffer_unref(&ref->hw_frames_ctx);
+    ref->hw_frames_ctx = av_buffer_ref(vtctx->cached_hw_frames_ctx);
+    if (!ref->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static void videotoolbox_write_mp4_descr_length(PutByteContext *pb, int length)
+{
+    int i;
+    uint8_t b;
+
+    for (i = 3; i >= 0; i--) {
+        b = (length >> (i * 7)) & 0x7F;
+        if (i != 0)
+            b |= 0x80;
+
+        bytestream2_put_byteu(pb, b);
+    }
+}
+
+static CFDataRef videotoolbox_esds_extradata_create(AVCodecContext *avctx)
+{
+    CFDataRef data;
+    uint8_t *rw_extradata;
+    PutByteContext pb;
+    int full_size = 3 + 5 + 13 + 5 + avctx->extradata_size + 3;
+    // ES_DescrTag data + DecoderConfigDescrTag + data + DecSpecificInfoTag + size + SLConfigDescriptor
+    int config_size = 13 + 5 + avctx->extradata_size;
+    int s;
+
+    if (!(rw_extradata = av_mallocz(full_size + VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING)))
+        return NULL;
+
+    bytestream2_init_writer(&pb, rw_extradata, full_size + VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING);
+    bytestream2_put_byteu(&pb, 0);        // version
+    bytestream2_put_ne24(&pb, 0);         // flags
+
+    // elementary stream descriptor
+    bytestream2_put_byteu(&pb, 0x03);     // ES_DescrTag
+    videotoolbox_write_mp4_descr_length(&pb, full_size);
+    bytestream2_put_ne16(&pb, 0);         // esid
+    bytestream2_put_byteu(&pb, 0);        // stream priority (0-32)
+
+    // decoder configuration descriptor
+    bytestream2_put_byteu(&pb, 0x04);     // DecoderConfigDescrTag
+    videotoolbox_write_mp4_descr_length(&pb, config_size);
+    bytestream2_put_byteu(&pb, 32);       // object type indication. 32 = AV_CODEC_ID_MPEG4
+    bytestream2_put_byteu(&pb, 0x11);     // stream type
+    bytestream2_put_ne24(&pb, 0);         // buffer size
+    bytestream2_put_ne32(&pb, 0);         // max bitrate
+    bytestream2_put_ne32(&pb, 0);         // avg bitrate
+
+    // decoder specific descriptor
+    bytestream2_put_byteu(&pb, 0x05);     ///< DecSpecificInfoTag
+    videotoolbox_write_mp4_descr_length(&pb, avctx->extradata_size);
+
+    bytestream2_put_buffer(&pb, avctx->extradata, avctx->extradata_size);
+
+    // SLConfigDescriptor
+    bytestream2_put_byteu(&pb, 0x06);     // SLConfigDescrTag
+    bytestream2_put_byteu(&pb, 0x01);     // length
+    bytestream2_put_byteu(&pb, 0x02);     //
+
+    s = bytestream2_size_p(&pb);
+
+    data = CFDataCreate(kCFAllocatorDefault, rw_extradata, s);
+
+    av_freep(&rw_extradata);
+    return data;
+}
+
+static CMSampleBufferRef videotoolbox_sample_buffer_create(CMFormatDescriptionRef fmt_desc,
+                                                           void *buffer,
+                                                           int size)
+{
+    OSStatus status;
+    CMBlockBufferRef  block_buf;
+    CMSampleBufferRef sample_buf;
+
+    block_buf  = NULL;
+    sample_buf = NULL;
+
+    status = CMBlockBufferCreateWithMemoryBlock(kCFAllocatorDefault,// structureAllocator
+                                                buffer,             // memoryBlock
+                                                size,               // blockLength
+                                                kCFAllocatorNull,   // blockAllocator
+                                                NULL,               // customBlockSource
+                                                0,                  // offsetToData
+                                                size,               // dataLength
+                                                0,                  // flags
+                                                &block_buf);
+
+    if (!status) {
+        status = CMSampleBufferCreate(kCFAllocatorDefault,  // allocator
+                                      block_buf,            // dataBuffer
+                                      TRUE,                 // dataReady
+                                      0,                    // makeDataReadyCallback
+                                      0,                    // makeDataReadyRefcon
+                                      fmt_desc,             // formatDescription
+                                      1,                    // numSamples
+                                      0,                    // numSampleTimingEntries
+                                      NULL,                 // sampleTimingArray
+                                      0,                    // numSampleSizeEntries
+                                      NULL,                 // sampleSizeArray
+                                      &sample_buf);
+    }
+
+    if (block_buf)
+        CFRelease(block_buf);
+
+    return sample_buf;
+}
+
+static void videotoolbox_decoder_callback(void *opaque,
+                                          void *sourceFrameRefCon,
+                                          OSStatus status,
+                                          VTDecodeInfoFlags flags,
+                                          CVImageBufferRef image_buffer,
+                                          CMTime pts,
+                                          CMTime duration)
+{
+    AVCodecContext *avctx = opaque;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    if (vtctx->frame) {
+        CVPixelBufferRelease(vtctx->frame);
+        vtctx->frame = NULL;
+    }
+
+    if (!image_buffer) {
+        av_log(NULL, AV_LOG_DEBUG, "vt decoder cb: output image buffer is null\n");
+        return;
+    }
+
+    vtctx->frame = CVPixelBufferRetain(image_buffer);
+}
+
+static OSStatus videotoolbox_session_decode_frame(AVCodecContext *avctx)
+{
+    OSStatus status;
+    CMSampleBufferRef sample_buf;
+    AVVideotoolboxContext *videotoolbox = videotoolbox_get_context(avctx);
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    sample_buf = videotoolbox_sample_buffer_create(videotoolbox->cm_fmt_desc,
+                                                   vtctx->bitstream,
+                                                   vtctx->bitstream_size);
+
+    if (!sample_buf)
+        return -1;
+
+    status = VTDecompressionSessionDecodeFrame(videotoolbox->session,
+                                               sample_buf,
+                                               0,       // decodeFlags
+                                               NULL,    // sourceFrameRefCon
+                                               0);      // infoFlagsOut
+    if (status == noErr)
+        status = VTDecompressionSessionWaitForAsynchronousFrames(videotoolbox->session);
+
+    CFRelease(sample_buf);
+
+    return status;
+}
+
+static CMVideoFormatDescriptionRef videotoolbox_format_desc_create(CMVideoCodecType codec_type,
+                                                                   CFDictionaryRef decoder_spec,
+                                                                   int width,
+                                                                   int height)
+{
+    CMFormatDescriptionRef cm_fmt_desc;
+    OSStatus status;
+
+    status = CMVideoFormatDescriptionCreate(kCFAllocatorDefault,
+                                            codec_type,
+                                            width,
+                                            height,
+                                            decoder_spec, // Dictionary of extension
+                                            &cm_fmt_desc);
+
+    if (status)
+        return NULL;
+
+    return cm_fmt_desc;
+}
+
+static CFDictionaryRef videotoolbox_buffer_attributes_create(int width,
+                                                             int height,
+                                                             OSType pix_fmt)
+{
+    CFMutableDictionaryRef buffer_attributes;
+    CFMutableDictionaryRef io_surface_properties;
+    CFNumberRef cv_pix_fmt;
+    CFNumberRef w;
+    CFNumberRef h;
+
+    w = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &width);
+    h = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &height);
+    cv_pix_fmt = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &pix_fmt);
+
+    buffer_attributes = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                  4,
+                                                  &kCFTypeDictionaryKeyCallBacks,
+                                                  &kCFTypeDictionaryValueCallBacks);
+    io_surface_properties = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                      0,
+                                                      &kCFTypeDictionaryKeyCallBacks,
+                                                      &kCFTypeDictionaryValueCallBacks);
+
+    if (pix_fmt)
+        CFDictionarySetValue(buffer_attributes, kCVPixelBufferPixelFormatTypeKey, cv_pix_fmt);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferIOSurfacePropertiesKey, io_surface_properties);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferWidthKey, w);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferHeightKey, h);
+#if TARGET_OS_IPHONE
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferOpenGLESCompatibilityKey, kCFBooleanTrue);
+#else
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferIOSurfaceOpenGLTextureCompatibilityKey, kCFBooleanTrue);
+#endif
+
+    CFRelease(io_surface_properties);
+    CFRelease(cv_pix_fmt);
+    CFRelease(w);
+    CFRelease(h);
+
+    return buffer_attributes;
+}
+
+static CFDictionaryRef videotoolbox_decoder_config_create(CMVideoCodecType codec_type,
+                                                          AVCodecContext *avctx)
+{
+    CFMutableDictionaryRef config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                                   0,
+                                                                   &kCFTypeDictionaryKeyCallBacks,
+                                                                   &kCFTypeDictionaryValueCallBacks);
+
+    CFDictionarySetValue(config_info,
+                         codec_type == kCMVideoCodecType_HEVC ?
+                            kVTVideoDecoderSpecification_EnableHardwareAcceleratedVideoDecoder :
+                            kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder,
+                         kCFBooleanTrue);
+
+    CFMutableDictionaryRef avc_info;
+    CFDataRef data = NULL;
+
+    avc_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                         1,
+                                         &kCFTypeDictionaryKeyCallBacks,
+                                         &kCFTypeDictionaryValueCallBacks);
+
+    switch (codec_type) {
+    case kCMVideoCodecType_MPEG4Video :
+        if (avctx->extradata_size)
+            data = videotoolbox_esds_extradata_create(avctx);
+        if (data)
+            CFDictionarySetValue(avc_info, CFSTR("esds"), data);
+        break;
+    case kCMVideoCodecType_H264 :
+        data = ff_videotoolbox_avcc_extradata_create(avctx);
+        if (data)
+            CFDictionarySetValue(avc_info, CFSTR("avcC"), data);
+        break;
+    case kCMVideoCodecType_HEVC :
+        data = ff_videotoolbox_hvcc_extradata_create(avctx);
+        if (data)
+            CFDictionarySetValue(avc_info, CFSTR("hvcC"), data);
+        break;
+    default:
+        break;
+    }
+
+    CFDictionarySetValue(config_info,
+            kCMFormatDescriptionExtension_SampleDescriptionExtensionAtoms,
+            avc_info);
+
+    if (data)
+        CFRelease(data);
+
+    CFRelease(avc_info);
+    return config_info;
+}
+
+static int videotoolbox_start(AVCodecContext *avctx)
+{
+    AVVideotoolboxContext *videotoolbox = videotoolbox_get_context(avctx);
+    OSStatus status;
+    VTDecompressionOutputCallbackRecord decoder_cb;
+    CFDictionaryRef decoder_spec;
+    CFDictionaryRef buf_attr;
+
+    if (!videotoolbox) {
+        av_log(avctx, AV_LOG_ERROR, "hwaccel context is not set\n");
+        return -1;
+    }
+
+    switch( avctx->codec_id ) {
+    case AV_CODEC_ID_H263 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_H263;
+        break;
+    case AV_CODEC_ID_H264 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_H264;
+        break;
+    case AV_CODEC_ID_HEVC :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_HEVC;
+        break;
+    case AV_CODEC_ID_MPEG1VIDEO :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG1Video;
+        break;
+    case AV_CODEC_ID_MPEG2VIDEO :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG2Video;
+        break;
+    case AV_CODEC_ID_MPEG4 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG4Video;
+        break;
+    default :
+        break;
+    }
+
+    decoder_spec = videotoolbox_decoder_config_create(videotoolbox->cm_codec_type, avctx);
+
+    if (!decoder_spec) {
+        av_log(avctx, AV_LOG_ERROR, "decoder specification creation failed\n");
+        return -1;
+    }
+
+    videotoolbox->cm_fmt_desc = videotoolbox_format_desc_create(videotoolbox->cm_codec_type,
+                                                                decoder_spec,
+                                                                avctx->width,
+                                                                avctx->height);
+    if (!videotoolbox->cm_fmt_desc) {
+        if (decoder_spec)
+            CFRelease(decoder_spec);
+
+        av_log(avctx, AV_LOG_ERROR, "format description creation failed\n");
+        return -1;
+    }
+
+    buf_attr = videotoolbox_buffer_attributes_create(avctx->width,
+                                                     avctx->height,
+                                                     videotoolbox->cv_pix_fmt_type);
+
+    decoder_cb.decompressionOutputCallback = videotoolbox_decoder_callback;
+    decoder_cb.decompressionOutputRefCon   = avctx;
+
+    status = VTDecompressionSessionCreate(NULL,                      // allocator
+                                          videotoolbox->cm_fmt_desc, // videoFormatDescription
+                                          decoder_spec,              // videoDecoderSpecification
+                                          buf_attr,                  // destinationImageBufferAttributes
+                                          &decoder_cb,               // outputCallback
+                                          &videotoolbox->session);   // decompressionSessionOut
+
+    if (decoder_spec)
+        CFRelease(decoder_spec);
+    if (buf_attr)
+        CFRelease(buf_attr);
+
+    switch (status) {
+    case kVTVideoDecoderNotAvailableNowErr:
+        av_log(avctx, AV_LOG_VERBOSE, "VideoToolbox session not available.\n");
+        return AVERROR(ENOSYS);
+    case kVTVideoDecoderUnsupportedDataFormatErr:
+        av_log(avctx, AV_LOG_VERBOSE, "VideoToolbox does not support this format.\n");
+        return AVERROR(ENOSYS);
+    case kVTCouldNotFindVideoDecoderErr:
+        av_log(avctx, AV_LOG_VERBOSE, "VideoToolbox decoder for this format not found.\n");
+        return AVERROR(ENOSYS);
+    case kVTVideoDecoderMalfunctionErr:
+        av_log(avctx, AV_LOG_VERBOSE, "VideoToolbox malfunction.\n");
+        return AVERROR(EINVAL);
+    case kVTVideoDecoderBadDataErr:
+        av_log(avctx, AV_LOG_VERBOSE, "VideoToolbox reported invalid data.\n");
+        return AVERROR_INVALIDDATA;
+    case 0:
+        return 0;
+    default:
+        av_log(avctx, AV_LOG_VERBOSE, "Unknown VideoToolbox session creation error %d\n", (int)status);
+        return AVERROR_UNKNOWN;
+    }
+}
+
+static void videotoolbox_stop(AVCodecContext *avctx)
+{
+    AVVideotoolboxContext *videotoolbox = videotoolbox_get_context(avctx);
+    if (!videotoolbox)
+        return;
+
+    if (videotoolbox->cm_fmt_desc) {
+        CFRelease(videotoolbox->cm_fmt_desc);
+        videotoolbox->cm_fmt_desc = NULL;
+    }
+
+    if (videotoolbox->session) {
+        VTDecompressionSessionInvalidate(videotoolbox->session);
+        CFRelease(videotoolbox->session);
+        videotoolbox->session = NULL;
+    }
+}
+
+static const char *videotoolbox_error_string(OSStatus status)
+{
+    switch (status) {
+        case kVTVideoDecoderBadDataErr:
+            return "bad data";
+        case kVTVideoDecoderMalfunctionErr:
+            return "decoder malfunction";
+        case kVTInvalidSessionErr:
+            return "invalid session";
+    }
+    return "unknown";
+}
+
+static int videotoolbox_common_end_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    OSStatus status;
+    AVVideotoolboxContext *videotoolbox = videotoolbox_get_context(avctx);
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    frame->crop_right = 0;
+    frame->crop_left = 0;
+    frame->crop_top = 0;
+    frame->crop_bottom = 0;
+
+    if (vtctx->reconfig_needed == true) {
+        vtctx->reconfig_needed = false;
+        av_log(avctx, AV_LOG_VERBOSE, "VideoToolbox decoder needs reconfig, restarting..\n");
+        videotoolbox_stop(avctx);
+        if (videotoolbox_start(avctx) != 0) {
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (!videotoolbox->session || !vtctx->bitstream || !vtctx->bitstream_size)
+        return AVERROR_INVALIDDATA;
+
+    status = videotoolbox_session_decode_frame(avctx);
+    if (status != noErr) {
+        if (status == kVTVideoDecoderMalfunctionErr || status == kVTInvalidSessionErr)
+            vtctx->reconfig_needed = true;
+        av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%s, %d)\n", videotoolbox_error_string(status), (int)status);
+        return AVERROR_UNKNOWN;
+    }
+
+    if (!vtctx->frame) {
+        vtctx->reconfig_needed = true;
+        return AVERROR_UNKNOWN;
+    }
+
+    return videotoolbox_buffer_create(avctx, frame);
+}
+
+static int videotoolbox_h264_end_frame(AVCodecContext *avctx)
+{
+    H264Context *h = avctx->priv_data;
+    AVFrame *frame = h->cur_pic_ptr->f;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    int ret = videotoolbox_common_end_frame(avctx, frame);
+    vtctx->bitstream_size = 0;
+    return ret;
+}
+
+static int videotoolbox_hevc_start_frame(AVCodecContext *avctx,
+                                         const uint8_t *buffer,
+                                         uint32_t size)
+{
+    return 0;
+}
+
+static int videotoolbox_hevc_decode_slice(AVCodecContext *avctx,
+                                          const uint8_t *buffer,
+                                          uint32_t size)
+{
+    return videotoolbox_common_decode_slice(avctx, buffer, size);
+}
+
+
+static int videotoolbox_hevc_decode_params(AVCodecContext *avctx,
+                                           int type,
+                                           const uint8_t *buffer,
+                                           uint32_t size)
+{
+    return videotoolbox_common_decode_slice(avctx, buffer, size);
+}
+
+static int videotoolbox_hevc_end_frame(AVCodecContext *avctx)
+{
+    HEVCContext *h = avctx->priv_data;
+    AVFrame *frame = h->ref->frame;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    h->output_frame->crop_right = 0;
+    h->output_frame->crop_left = 0;
+    h->output_frame->crop_top = 0;
+    h->output_frame->crop_bottom = 0;
+
+    int ret = videotoolbox_common_end_frame(avctx, frame);
+    vtctx->bitstream_size = 0;
+    return ret;
+}
+
+static int videotoolbox_mpeg_start_frame(AVCodecContext *avctx,
+                                         const uint8_t *buffer,
+                                         uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    return videotoolbox_buffer_copy(vtctx, buffer, size);
+}
+
+static int videotoolbox_mpeg_decode_slice(AVCodecContext *avctx,
+                                          const uint8_t *buffer,
+                                          uint32_t size)
+{
+    return 0;
+}
+
+static int videotoolbox_mpeg_end_frame(AVCodecContext *avctx)
+{
+    MpegEncContext *s = avctx->priv_data;
+    AVFrame *frame = s->current_picture_ptr->f;
+
+    return videotoolbox_common_end_frame(avctx, frame);
+}
+
+static int videotoolbox_uninit(AVCodecContext *avctx)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    if (!vtctx)
+        return 0;
+
+    ff_videotoolbox_uninit(avctx);
+
+    if (vtctx->vt_ctx)
+        videotoolbox_stop(avctx);
+
+    av_buffer_unref(&vtctx->cached_hw_frames_ctx);
+    av_freep(&vtctx->vt_ctx);
+
+    return 0;
+}
+
+static int videotoolbox_common_init(AVCodecContext *avctx)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    AVHWFramesContext *hw_frames;
+    int err;
+
+    // Old API - do nothing.
+    if (avctx->hwaccel_context)
+        return 0;
+
+    if (!avctx->hw_frames_ctx && !avctx->hw_device_ctx) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Either hw_frames_ctx or hw_device_ctx must be set.\n");
+        return AVERROR(EINVAL);
+    }
+
+    vtctx->vt_ctx = av_videotoolbox_alloc_context();
+    if (!vtctx->vt_ctx) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    if (avctx->hw_frames_ctx) {
+        hw_frames = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+    } else {
+        avctx->hw_frames_ctx = av_hwframe_ctx_alloc(avctx->hw_device_ctx);
+        if (!avctx->hw_frames_ctx) {
+            err = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        hw_frames = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+        hw_frames->format = AV_PIX_FMT_VIDEOTOOLBOX;
+        hw_frames->sw_format = AV_PIX_FMT_NV12; // same as av_videotoolbox_alloc_context()
+        hw_frames->width = avctx->width;
+        hw_frames->height = avctx->height;
+
+        err = av_hwframe_ctx_init(avctx->hw_frames_ctx);
+        if (err < 0) {
+            av_buffer_unref(&avctx->hw_frames_ctx);
+            goto fail;
+        }
+    }
+
+    vtctx->cached_hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
+    if (!vtctx->cached_hw_frames_ctx) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    vtctx->vt_ctx->cv_pix_fmt_type =
+        av_map_videotoolbox_format_from_pixfmt(hw_frames->sw_format);
+    if (!vtctx->vt_ctx->cv_pix_fmt_type) {
+        av_log(avctx, AV_LOG_ERROR, "Unknown sw_format.\n");
+        err = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    err = videotoolbox_start(avctx);
+    if (err < 0)
+        goto fail;
+
+    return 0;
+
+fail:
+    videotoolbox_uninit(avctx);
+    return err;
+}
+
+static int videotoolbox_frame_params(AVCodecContext *avctx,
+                                     AVBufferRef *hw_frames_ctx)
+{
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)hw_frames_ctx->data;
+
+    frames_ctx->format            = AV_PIX_FMT_VIDEOTOOLBOX;
+    frames_ctx->width             = avctx->coded_width;
+    frames_ctx->height            = avctx->coded_height;
+    frames_ctx->sw_format         = AV_PIX_FMT_NV12;
+
+    return 0;
+}
+
+const AVHWAccel ff_h263_videotoolbox_hwaccel = {
+    .name           = "h263_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H263,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .frame_params   = videotoolbox_frame_params,
+    .init           = videotoolbox_common_init,
+    .uninit         = videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+const AVHWAccel ff_hevc_videotoolbox_hwaccel = {
+    .name           = "hevc_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_HEVC,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_hevc_start_frame,
+    .decode_slice   = videotoolbox_hevc_decode_slice,
+    .decode_params  = videotoolbox_hevc_decode_params,
+    .end_frame      = videotoolbox_hevc_end_frame,
+    .frame_params   = videotoolbox_frame_params,
+    .init           = videotoolbox_common_init,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+const AVHWAccel ff_h264_videotoolbox_hwaccel = {
+    .name           = "h264_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = ff_videotoolbox_h264_start_frame,
+    .decode_slice   = ff_videotoolbox_h264_decode_slice,
+    .decode_params  = videotoolbox_h264_decode_params,
+    .end_frame      = videotoolbox_h264_end_frame,
+    .frame_params   = videotoolbox_frame_params,
+    .init           = videotoolbox_common_init,
+    .uninit         = videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+const AVHWAccel ff_mpeg1_videotoolbox_hwaccel = {
+    .name           = "mpeg1_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .frame_params   = videotoolbox_frame_params,
+    .init           = videotoolbox_common_init,
+    .uninit         = videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+const AVHWAccel ff_mpeg2_videotoolbox_hwaccel = {
+    .name           = "mpeg2_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .frame_params   = videotoolbox_frame_params,
+    .init           = videotoolbox_common_init,
+    .uninit         = videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+const AVHWAccel ff_mpeg4_videotoolbox_hwaccel = {
+    .name           = "mpeg4_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .frame_params   = videotoolbox_frame_params,
+    .init           = videotoolbox_common_init,
+    .uninit         = videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void)
+{
+    AVVideotoolboxContext *ret = av_mallocz(sizeof(*ret));
+
+    if (ret) {
+        ret->output_callback = videotoolbox_decoder_callback;
+        ret->cv_pix_fmt_type = kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange;
+    }
+
+    return ret;
+}
+
+int av_videotoolbox_default_init(AVCodecContext *avctx)
+{
+    return av_videotoolbox_default_init2(avctx, NULL);
+}
+
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx)
+{
+    avctx->hwaccel_context = vtctx ?: av_videotoolbox_alloc_context();
+    if (!avctx->hwaccel_context)
+        return AVERROR(ENOMEM);
+    return videotoolbox_start(avctx);
+}
+
+void av_videotoolbox_default_free(AVCodecContext *avctx)
+{
+
+    videotoolbox_stop(avctx);
+    av_freep(&avctx->hwaccel_context);
+}
+#endif /* CONFIG_VIDEOTOOLBOX */
diff --git a/libavcodec/videotoolbox.h b/libavcodec/videotoolbox.h
new file mode 100644
index 0000000..af2db0d
--- /dev/null
+++ b/libavcodec/videotoolbox.h
@@ -0,0 +1,127 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VIDEOTOOLBOX_H
+#define AVCODEC_VIDEOTOOLBOX_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_videotoolbox
+ * Public libavcodec Videotoolbox header.
+ */
+
+#include <stdint.h>
+
+#define Picture QuickdrawPicture
+#include <VideoToolbox/VideoToolbox.h>
+#undef Picture
+
+#include "libavcodec/avcodec.h"
+
+/**
+ * This struct holds all the information that needs to be passed
+ * between the caller and libavcodec for initializing Videotoolbox decoding.
+ * Its size is not a part of the public ABI, it must be allocated with
+ * av_videotoolbox_alloc_context() and freed with av_free().
+ */
+typedef struct AVVideotoolboxContext {
+    /**
+     * Videotoolbox decompression session object.
+     * Created and freed the caller.
+     */
+    VTDecompressionSessionRef session;
+
+    /**
+     * The output callback that must be passed to the session.
+     * Set by av_videottoolbox_default_init()
+     */
+    VTDecompressionOutputCallback output_callback;
+
+    /**
+     * CVPixelBuffer Format Type that Videotoolbox will use for decoded frames.
+     * set by the caller. If this is set to 0, then no specific format is
+     * requested from the decoder, and its native format is output.
+     */
+    OSType cv_pix_fmt_type;
+
+    /**
+     * CoreMedia Format Description that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    CMVideoFormatDescriptionRef cm_fmt_desc;
+
+    /**
+     * CoreMedia codec type that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    int cm_codec_type;
+} AVVideotoolboxContext;
+
+/**
+ * Allocate and initialize a Videotoolbox context.
+ *
+ * This function should be called from the get_format() callback when the caller
+ * selects the AV_PIX_FMT_VIDETOOLBOX format. The caller must then create
+ * the decoder object (using the output callback provided by libavcodec) that
+ * will be used for Videotoolbox-accelerated decoding.
+ *
+ * When decoding with Videotoolbox is finished, the caller must destroy the decoder
+ * object and free the Videotoolbox context using av_free().
+ *
+ * @return the newly allocated context or NULL on failure
+ */
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init(AVCodecContext *avctx);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ * @param vtctx the Videotoolbox context to use
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx);
+
+/**
+ * This function must be called to free the Videotoolbox context initialized with
+ * av_videotoolbox_default_init().
+ *
+ * @param avctx the corresponding codec context
+ */
+void av_videotoolbox_default_free(AVCodecContext *avctx);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_VIDEOTOOLBOX_H */
diff --git a/libavcodec/videotoolboxenc.c b/libavcodec/videotoolboxenc.c
new file mode 100644
index 0000000..50aba2d
--- /dev/null
+++ b/libavcodec/videotoolboxenc.c
@@ -0,0 +1,2638 @@
+/*
+ * copyright (c) 2015 Rick Kern <kernrj@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <VideoToolbox/VideoToolbox.h>
+#include <CoreVideo/CoreVideo.h>
+#include <CoreMedia/CoreMedia.h>
+#include <TargetConditionals.h>
+#include <Availability.h>
+#include "avcodec.h"
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavcodec/avcodec.h"
+#include "libavutil/pixdesc.h"
+#include "internal.h"
+#include <pthread.h>
+#include "h264.h"
+#include "h264_sei.h"
+#include <dlfcn.h>
+
+#if !HAVE_KCMVIDEOCODECTYPE_HEVC
+enum { kCMVideoCodecType_HEVC = 'hvc1' };
+#endif
+
+typedef OSStatus (*getParameterSetAtIndex)(CMFormatDescriptionRef videoDesc,
+                                           size_t parameterSetIndex,
+                                           const uint8_t **parameterSetPointerOut,
+                                           size_t *parameterSetSizeOut,
+                                           size_t *parameterSetCountOut,
+                                           int *NALUnitHeaderLengthOut);
+
+//These symbols may not be present
+static struct{
+    CFStringRef kCVImageBufferColorPrimaries_ITU_R_2020;
+    CFStringRef kCVImageBufferTransferFunction_ITU_R_2020;
+    CFStringRef kCVImageBufferYCbCrMatrix_ITU_R_2020;
+
+    CFStringRef kVTCompressionPropertyKey_H264EntropyMode;
+    CFStringRef kVTH264EntropyMode_CAVLC;
+    CFStringRef kVTH264EntropyMode_CABAC;
+
+    CFStringRef kVTProfileLevel_H264_Baseline_4_0;
+    CFStringRef kVTProfileLevel_H264_Baseline_4_2;
+    CFStringRef kVTProfileLevel_H264_Baseline_5_0;
+    CFStringRef kVTProfileLevel_H264_Baseline_5_1;
+    CFStringRef kVTProfileLevel_H264_Baseline_5_2;
+    CFStringRef kVTProfileLevel_H264_Baseline_AutoLevel;
+    CFStringRef kVTProfileLevel_H264_Main_4_2;
+    CFStringRef kVTProfileLevel_H264_Main_5_1;
+    CFStringRef kVTProfileLevel_H264_Main_5_2;
+    CFStringRef kVTProfileLevel_H264_Main_AutoLevel;
+    CFStringRef kVTProfileLevel_H264_High_3_0;
+    CFStringRef kVTProfileLevel_H264_High_3_1;
+    CFStringRef kVTProfileLevel_H264_High_3_2;
+    CFStringRef kVTProfileLevel_H264_High_4_0;
+    CFStringRef kVTProfileLevel_H264_High_4_1;
+    CFStringRef kVTProfileLevel_H264_High_4_2;
+    CFStringRef kVTProfileLevel_H264_High_5_1;
+    CFStringRef kVTProfileLevel_H264_High_5_2;
+    CFStringRef kVTProfileLevel_H264_High_AutoLevel;
+
+    CFStringRef kVTProfileLevel_HEVC_Main_AutoLevel;
+    CFStringRef kVTProfileLevel_HEVC_Main10_AutoLevel;
+
+    CFStringRef kVTCompressionPropertyKey_RealTime;
+
+    CFStringRef kVTVideoEncoderSpecification_EnableHardwareAcceleratedVideoEncoder;
+    CFStringRef kVTVideoEncoderSpecification_RequireHardwareAcceleratedVideoEncoder;
+
+    getParameterSetAtIndex CMVideoFormatDescriptionGetHEVCParameterSetAtIndex;
+} compat_keys;
+
+#define GET_SYM(symbol, defaultVal)                                     \
+do{                                                                     \
+    CFStringRef* handle = (CFStringRef*)dlsym(RTLD_DEFAULT, #symbol);   \
+    if(!handle)                                                         \
+        compat_keys.symbol = CFSTR(defaultVal);                         \
+    else                                                                \
+        compat_keys.symbol = *handle;                                   \
+}while(0)
+
+static pthread_once_t once_ctrl = PTHREAD_ONCE_INIT;
+
+static void loadVTEncSymbols(){
+    compat_keys.CMVideoFormatDescriptionGetHEVCParameterSetAtIndex =
+        (getParameterSetAtIndex)dlsym(
+            RTLD_DEFAULT,
+            "CMVideoFormatDescriptionGetHEVCParameterSetAtIndex"
+        );
+
+    GET_SYM(kCVImageBufferColorPrimaries_ITU_R_2020,   "ITU_R_2020");
+    GET_SYM(kCVImageBufferTransferFunction_ITU_R_2020, "ITU_R_2020");
+    GET_SYM(kCVImageBufferYCbCrMatrix_ITU_R_2020,      "ITU_R_2020");
+
+    GET_SYM(kVTCompressionPropertyKey_H264EntropyMode, "H264EntropyMode");
+    GET_SYM(kVTH264EntropyMode_CAVLC, "CAVLC");
+    GET_SYM(kVTH264EntropyMode_CABAC, "CABAC");
+
+    GET_SYM(kVTProfileLevel_H264_Baseline_4_0,       "H264_Baseline_4_0");
+    GET_SYM(kVTProfileLevel_H264_Baseline_4_2,       "H264_Baseline_4_2");
+    GET_SYM(kVTProfileLevel_H264_Baseline_5_0,       "H264_Baseline_5_0");
+    GET_SYM(kVTProfileLevel_H264_Baseline_5_1,       "H264_Baseline_5_1");
+    GET_SYM(kVTProfileLevel_H264_Baseline_5_2,       "H264_Baseline_5_2");
+    GET_SYM(kVTProfileLevel_H264_Baseline_AutoLevel, "H264_Baseline_AutoLevel");
+    GET_SYM(kVTProfileLevel_H264_Main_4_2,           "H264_Main_4_2");
+    GET_SYM(kVTProfileLevel_H264_Main_5_1,           "H264_Main_5_1");
+    GET_SYM(kVTProfileLevel_H264_Main_5_2,           "H264_Main_5_2");
+    GET_SYM(kVTProfileLevel_H264_Main_AutoLevel,     "H264_Main_AutoLevel");
+    GET_SYM(kVTProfileLevel_H264_High_3_0,           "H264_High_3_0");
+    GET_SYM(kVTProfileLevel_H264_High_3_1,           "H264_High_3_1");
+    GET_SYM(kVTProfileLevel_H264_High_3_2,           "H264_High_3_2");
+    GET_SYM(kVTProfileLevel_H264_High_4_0,           "H264_High_4_0");
+    GET_SYM(kVTProfileLevel_H264_High_4_1,           "H264_High_4_1");
+    GET_SYM(kVTProfileLevel_H264_High_4_2,           "H264_High_4_2");
+    GET_SYM(kVTProfileLevel_H264_High_5_1,           "H264_High_5_1");
+    GET_SYM(kVTProfileLevel_H264_High_5_2,           "H264_High_5_2");
+    GET_SYM(kVTProfileLevel_H264_High_AutoLevel,     "H264_High_AutoLevel");
+
+    GET_SYM(kVTProfileLevel_HEVC_Main_AutoLevel,     "HEVC_Main_AutoLevel");
+    GET_SYM(kVTProfileLevel_HEVC_Main10_AutoLevel,   "HEVC_Main10_AutoLevel");
+
+    GET_SYM(kVTCompressionPropertyKey_RealTime, "RealTime");
+
+    GET_SYM(kVTVideoEncoderSpecification_EnableHardwareAcceleratedVideoEncoder,
+            "EnableHardwareAcceleratedVideoEncoder");
+    GET_SYM(kVTVideoEncoderSpecification_RequireHardwareAcceleratedVideoEncoder,
+            "RequireHardwareAcceleratedVideoEncoder");
+}
+
+typedef enum VT_H264Profile {
+    H264_PROF_AUTO,
+    H264_PROF_BASELINE,
+    H264_PROF_MAIN,
+    H264_PROF_HIGH,
+    H264_PROF_COUNT
+} VT_H264Profile;
+
+typedef enum VTH264Entropy{
+    VT_ENTROPY_NOT_SET,
+    VT_CAVLC,
+    VT_CABAC
+} VTH264Entropy;
+
+typedef enum VT_HEVCProfile {
+    HEVC_PROF_AUTO,
+    HEVC_PROF_MAIN,
+    HEVC_PROF_MAIN10,
+    HEVC_PROF_COUNT
+} VT_HEVCProfile;
+
+static const uint8_t start_code[] = { 0, 0, 0, 1 };
+
+typedef struct ExtraSEI {
+  void *data;
+  size_t size;
+} ExtraSEI;
+
+typedef struct BufNode {
+    CMSampleBufferRef cm_buffer;
+    ExtraSEI *sei;
+    struct BufNode* next;
+    int error;
+} BufNode;
+
+typedef struct VTEncContext {
+    AVClass *class;
+    enum AVCodecID codec_id;
+    VTCompressionSessionRef session;
+    CFStringRef ycbcr_matrix;
+    CFStringRef color_primaries;
+    CFStringRef transfer_function;
+    getParameterSetAtIndex get_param_set_func;
+
+    pthread_mutex_t lock;
+    pthread_cond_t  cv_sample_sent;
+
+    int async_error;
+
+    BufNode *q_head;
+    BufNode *q_tail;
+
+    int64_t frame_ct_out;
+    int64_t frame_ct_in;
+
+    int64_t first_pts;
+    int64_t dts_delta;
+
+    int64_t profile;
+    int64_t level;
+    int64_t entropy;
+    int64_t realtime;
+    int64_t frames_before;
+    int64_t frames_after;
+
+    int64_t allow_sw;
+
+    bool flushing;
+    bool has_b_frames;
+    bool warned_color_range;
+    bool a53_cc;
+} VTEncContext;
+
+static int vtenc_populate_extradata(AVCodecContext   *avctx,
+                                    CMVideoCodecType codec_type,
+                                    CFStringRef      profile_level,
+                                    CFNumberRef      gamma_level,
+                                    CFDictionaryRef  enc_info,
+                                    CFDictionaryRef  pixel_buffer_info);
+
+/**
+ * NULL-safe release of *refPtr, and sets value to NULL.
+ */
+static void vt_release_num(CFNumberRef* refPtr){
+    if (!*refPtr) {
+        return;
+    }
+
+    CFRelease(*refPtr);
+    *refPtr = NULL;
+}
+
+static void set_async_error(VTEncContext *vtctx, int err)
+{
+    BufNode *info;
+
+    pthread_mutex_lock(&vtctx->lock);
+
+    vtctx->async_error = err;
+
+    info = vtctx->q_head;
+    vtctx->q_head = vtctx->q_tail = NULL;
+
+    while (info) {
+        BufNode *next = info->next;
+        CFRelease(info->cm_buffer);
+        av_free(info);
+        info = next;
+    }
+
+    pthread_mutex_unlock(&vtctx->lock);
+}
+
+static void clear_frame_queue(VTEncContext *vtctx)
+{
+    set_async_error(vtctx, 0);
+}
+
+static int vtenc_q_pop(VTEncContext *vtctx, bool wait, CMSampleBufferRef *buf, ExtraSEI **sei)
+{
+    BufNode *info;
+
+    pthread_mutex_lock(&vtctx->lock);
+
+    if (vtctx->async_error) {
+        pthread_mutex_unlock(&vtctx->lock);
+        return vtctx->async_error;
+    }
+
+    if (vtctx->flushing && vtctx->frame_ct_in == vtctx->frame_ct_out) {
+        *buf = NULL;
+
+        pthread_mutex_unlock(&vtctx->lock);
+        return 0;
+    }
+
+    while (!vtctx->q_head && !vtctx->async_error && wait) {
+        pthread_cond_wait(&vtctx->cv_sample_sent, &vtctx->lock);
+    }
+
+    if (!vtctx->q_head) {
+        pthread_mutex_unlock(&vtctx->lock);
+        *buf = NULL;
+        return 0;
+    }
+
+    info = vtctx->q_head;
+    vtctx->q_head = vtctx->q_head->next;
+    if (!vtctx->q_head) {
+        vtctx->q_tail = NULL;
+    }
+
+    pthread_mutex_unlock(&vtctx->lock);
+
+    *buf = info->cm_buffer;
+    if (sei && *buf) {
+        *sei = info->sei;
+    } else if (info->sei) {
+        if (info->sei->data) av_free(info->sei->data);
+        av_free(info->sei);
+    }
+    av_free(info);
+
+    vtctx->frame_ct_out++;
+
+    return 0;
+}
+
+static void vtenc_q_push(VTEncContext *vtctx, CMSampleBufferRef buffer, ExtraSEI *sei)
+{
+    BufNode *info = av_malloc(sizeof(BufNode));
+    if (!info) {
+        set_async_error(vtctx, AVERROR(ENOMEM));
+        return;
+    }
+
+    CFRetain(buffer);
+    info->cm_buffer = buffer;
+    info->sei = sei;
+    info->next = NULL;
+
+    pthread_mutex_lock(&vtctx->lock);
+    pthread_cond_signal(&vtctx->cv_sample_sent);
+
+    if (!vtctx->q_head) {
+        vtctx->q_head = info;
+    } else {
+        vtctx->q_tail->next = info;
+    }
+
+    vtctx->q_tail = info;
+
+    pthread_mutex_unlock(&vtctx->lock);
+}
+
+static int count_nalus(size_t length_code_size,
+                       CMSampleBufferRef sample_buffer,
+                       int *count)
+{
+    size_t offset = 0;
+    int status;
+    int nalu_ct = 0;
+    uint8_t size_buf[4];
+    size_t src_size = CMSampleBufferGetTotalSampleSize(sample_buffer);
+    CMBlockBufferRef block = CMSampleBufferGetDataBuffer(sample_buffer);
+
+    if (length_code_size > 4)
+        return AVERROR_INVALIDDATA;
+
+    while (offset < src_size) {
+        size_t curr_src_len;
+        size_t box_len = 0;
+        size_t i;
+
+        status = CMBlockBufferCopyDataBytes(block,
+                                            offset,
+                                            length_code_size,
+                                            size_buf);
+
+        for (i = 0; i < length_code_size; i++) {
+            box_len <<= 8;
+            box_len |= size_buf[i];
+        }
+
+        curr_src_len = box_len + length_code_size;
+        offset += curr_src_len;
+
+        nalu_ct++;
+    }
+
+    *count = nalu_ct;
+    return 0;
+}
+
+static CMVideoCodecType get_cm_codec_type(enum AVCodecID id)
+{
+    switch (id) {
+    case AV_CODEC_ID_H264: return kCMVideoCodecType_H264;
+    case AV_CODEC_ID_HEVC: return kCMVideoCodecType_HEVC;
+    default:               return 0;
+    }
+}
+
+/**
+ * Get the parameter sets from a CMSampleBufferRef.
+ * @param dst If *dst isn't NULL, the parameters are copied into existing
+ *            memory. *dst_size must be set accordingly when *dst != NULL.
+ *            If *dst is NULL, it will be allocated.
+ *            In all cases, *dst_size is set to the number of bytes used starting
+ *            at *dst.
+ */
+static int get_params_size(
+    AVCodecContext              *avctx,
+    CMVideoFormatDescriptionRef vid_fmt,
+    size_t                      *size)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    size_t total_size = 0;
+    size_t ps_count;
+    int is_count_bad = 0;
+    size_t i;
+    int status;
+    status = vtctx->get_param_set_func(vid_fmt,
+                                       0,
+                                       NULL,
+                                       NULL,
+                                       &ps_count,
+                                       NULL);
+    if (status) {
+        is_count_bad = 1;
+        ps_count     = 0;
+        status       = 0;
+    }
+
+    for (i = 0; i < ps_count || is_count_bad; i++) {
+        const uint8_t *ps;
+        size_t ps_size;
+        status = vtctx->get_param_set_func(vid_fmt,
+                                           i,
+                                           &ps,
+                                           &ps_size,
+                                           NULL,
+                                           NULL);
+        if (status) {
+            /*
+             * When ps_count is invalid, status != 0 ends the loop normally
+             * unless we didn't get any parameter sets.
+             */
+            if (i > 0 && is_count_bad) status = 0;
+
+            break;
+        }
+
+        total_size += ps_size + sizeof(start_code);
+    }
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting parameter set sizes: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    *size = total_size;
+    return 0;
+}
+
+static int copy_param_sets(
+    AVCodecContext              *avctx,
+    CMVideoFormatDescriptionRef vid_fmt,
+    uint8_t                     *dst,
+    size_t                      dst_size)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    size_t ps_count;
+    int is_count_bad = 0;
+    int status;
+    size_t offset = 0;
+    size_t i;
+
+    status = vtctx->get_param_set_func(vid_fmt,
+                                       0,
+                                       NULL,
+                                       NULL,
+                                       &ps_count,
+                                       NULL);
+    if (status) {
+        is_count_bad = 1;
+        ps_count     = 0;
+        status       = 0;
+    }
+
+
+    for (i = 0; i < ps_count || is_count_bad; i++) {
+        const uint8_t *ps;
+        size_t ps_size;
+        size_t next_offset;
+
+        status = vtctx->get_param_set_func(vid_fmt,
+                                           i,
+                                           &ps,
+                                           &ps_size,
+                                           NULL,
+                                           NULL);
+        if (status) {
+            if (i > 0 && is_count_bad) status = 0;
+
+            break;
+        }
+
+        next_offset = offset + sizeof(start_code) + ps_size;
+        if (dst_size < next_offset) {
+            av_log(avctx, AV_LOG_ERROR, "Error: buffer too small for parameter sets.\n");
+            return AVERROR_BUFFER_TOO_SMALL;
+        }
+
+        memcpy(dst + offset, start_code, sizeof(start_code));
+        offset += sizeof(start_code);
+
+        memcpy(dst + offset, ps, ps_size);
+        offset = next_offset;
+    }
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting parameter set data: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+static int set_extradata(AVCodecContext *avctx, CMSampleBufferRef sample_buffer)
+{
+    CMVideoFormatDescriptionRef vid_fmt;
+    size_t total_size;
+    int status;
+
+    vid_fmt = CMSampleBufferGetFormatDescription(sample_buffer);
+    if (!vid_fmt) {
+        av_log(avctx, AV_LOG_ERROR, "No video format.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    status = get_params_size(avctx, vid_fmt, &total_size);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Could not get parameter sets.\n");
+        return status;
+    }
+
+    avctx->extradata = av_mallocz(total_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata) {
+        return AVERROR(ENOMEM);
+    }
+    avctx->extradata_size = total_size;
+
+    status = copy_param_sets(avctx, vid_fmt, avctx->extradata, total_size);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Could not copy param sets.\n");
+        return status;
+    }
+
+    return 0;
+}
+
+static void vtenc_output_callback(
+    void *ctx,
+    void *sourceFrameCtx,
+    OSStatus status,
+    VTEncodeInfoFlags flags,
+    CMSampleBufferRef sample_buffer)
+{
+    AVCodecContext *avctx = ctx;
+    VTEncContext   *vtctx = avctx->priv_data;
+    ExtraSEI *sei = sourceFrameCtx;
+
+    if (vtctx->async_error) {
+        if(sample_buffer) CFRelease(sample_buffer);
+        return;
+    }
+
+    if (status || !sample_buffer) {
+        av_log(avctx, AV_LOG_ERROR, "Error encoding frame: %d\n", (int)status);
+        set_async_error(vtctx, AVERROR_EXTERNAL);
+        return;
+    }
+
+    if (!avctx->extradata && (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)) {
+        int set_status = set_extradata(avctx, sample_buffer);
+        if (set_status) {
+            set_async_error(vtctx, set_status);
+            return;
+        }
+    }
+
+    vtenc_q_push(vtctx, sample_buffer, sei);
+}
+
+static int get_length_code_size(
+    AVCodecContext    *avctx,
+    CMSampleBufferRef sample_buffer,
+    size_t            *size)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    CMVideoFormatDescriptionRef vid_fmt;
+    int isize;
+    int status;
+
+    vid_fmt = CMSampleBufferGetFormatDescription(sample_buffer);
+    if (!vid_fmt) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting buffer format description.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    status = vtctx->get_param_set_func(vid_fmt,
+                                       0,
+                                       NULL,
+                                       NULL,
+                                       NULL,
+                                       &isize);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting length code size: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    *size = isize;
+    return 0;
+}
+
+/*
+ * Returns true on success.
+ *
+ * If profile_level_val is NULL and this method returns true, don't specify the
+ * profile/level to the encoder.
+ */
+static bool get_vt_h264_profile_level(AVCodecContext *avctx,
+                                      CFStringRef    *profile_level_val)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    int64_t profile = vtctx->profile;
+
+    if (profile == H264_PROF_AUTO && vtctx->level) {
+        //Need to pick a profile if level is not auto-selected.
+        profile = vtctx->has_b_frames ? H264_PROF_MAIN : H264_PROF_BASELINE;
+    }
+
+    *profile_level_val = NULL;
+
+    switch (profile) {
+        case H264_PROF_AUTO:
+            return true;
+
+        case H264_PROF_BASELINE:
+            switch (vtctx->level) {
+                case  0: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Baseline_AutoLevel; break;
+                case 13: *profile_level_val = kVTProfileLevel_H264_Baseline_1_3;       break;
+                case 30: *profile_level_val = kVTProfileLevel_H264_Baseline_3_0;       break;
+                case 31: *profile_level_val = kVTProfileLevel_H264_Baseline_3_1;       break;
+                case 32: *profile_level_val = kVTProfileLevel_H264_Baseline_3_2;       break;
+                case 40: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Baseline_4_0;       break;
+                case 41: *profile_level_val = kVTProfileLevel_H264_Baseline_4_1;       break;
+                case 42: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Baseline_4_2;       break;
+                case 50: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Baseline_5_0;       break;
+                case 51: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Baseline_5_1;       break;
+                case 52: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Baseline_5_2;       break;
+            }
+            break;
+
+        case H264_PROF_MAIN:
+            switch (vtctx->level) {
+                case  0: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Main_AutoLevel; break;
+                case 30: *profile_level_val = kVTProfileLevel_H264_Main_3_0;       break;
+                case 31: *profile_level_val = kVTProfileLevel_H264_Main_3_1;       break;
+                case 32: *profile_level_val = kVTProfileLevel_H264_Main_3_2;       break;
+                case 40: *profile_level_val = kVTProfileLevel_H264_Main_4_0;       break;
+                case 41: *profile_level_val = kVTProfileLevel_H264_Main_4_1;       break;
+                case 42: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Main_4_2;       break;
+                case 50: *profile_level_val = kVTProfileLevel_H264_Main_5_0;       break;
+                case 51: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Main_5_1;       break;
+                case 52: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_Main_5_2;       break;
+            }
+            break;
+
+        case H264_PROF_HIGH:
+            switch (vtctx->level) {
+                case  0: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_AutoLevel; break;
+                case 30: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_3_0;       break;
+                case 31: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_3_1;       break;
+                case 32: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_3_2;       break;
+                case 40: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_4_0;       break;
+                case 41: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_4_1;       break;
+                case 42: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_4_2;       break;
+                case 50: *profile_level_val = kVTProfileLevel_H264_High_5_0;       break;
+                case 51: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_5_1;       break;
+                case 52: *profile_level_val =
+                                  compat_keys.kVTProfileLevel_H264_High_5_2;       break;
+            }
+            break;
+    }
+
+    if (!*profile_level_val) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid Profile/Level.\n");
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Returns true on success.
+ *
+ * If profile_level_val is NULL and this method returns true, don't specify the
+ * profile/level to the encoder.
+ */
+static bool get_vt_hevc_profile_level(AVCodecContext *avctx,
+                                      CFStringRef    *profile_level_val)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    int64_t profile = vtctx->profile;
+
+    *profile_level_val = NULL;
+
+    switch (profile) {
+        case HEVC_PROF_AUTO:
+            return true;
+        case HEVC_PROF_MAIN:
+            *profile_level_val =
+                compat_keys.kVTProfileLevel_HEVC_Main_AutoLevel;
+            break;
+        case HEVC_PROF_MAIN10:
+            *profile_level_val =
+                compat_keys.kVTProfileLevel_HEVC_Main10_AutoLevel;
+            break;
+    }
+
+    if (!*profile_level_val) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid Profile/Level.\n");
+        return false;
+    }
+
+    return true;
+}
+
+static int get_cv_pixel_format(AVCodecContext* avctx,
+                               enum AVPixelFormat fmt,
+                               enum AVColorRange range,
+                               int* av_pixel_format,
+                               int* range_guessed)
+{
+    if (range_guessed) *range_guessed = range != AVCOL_RANGE_MPEG &&
+                                        range != AVCOL_RANGE_JPEG;
+
+    //MPEG range is used when no range is set
+    if (fmt == AV_PIX_FMT_NV12) {
+        *av_pixel_format = range == AVCOL_RANGE_JPEG ?
+                                        kCVPixelFormatType_420YpCbCr8BiPlanarFullRange :
+                                        kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange;
+    } else if (fmt == AV_PIX_FMT_YUV420P) {
+        *av_pixel_format = range == AVCOL_RANGE_JPEG ?
+                                        kCVPixelFormatType_420YpCbCr8PlanarFullRange :
+                                        kCVPixelFormatType_420YpCbCr8Planar;
+    } else {
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static void add_color_attr(AVCodecContext *avctx, CFMutableDictionaryRef dict) {
+    VTEncContext *vtctx = avctx->priv_data;
+
+    if (vtctx->color_primaries) {
+        CFDictionarySetValue(dict,
+                             kCVImageBufferColorPrimariesKey,
+                             vtctx->color_primaries);
+    }
+
+    if (vtctx->transfer_function) {
+        CFDictionarySetValue(dict,
+                             kCVImageBufferTransferFunctionKey,
+                             vtctx->transfer_function);
+    }
+
+    if (vtctx->ycbcr_matrix) {
+        CFDictionarySetValue(dict,
+                             kCVImageBufferYCbCrMatrixKey,
+                             vtctx->ycbcr_matrix);
+    }
+}
+
+static int create_cv_pixel_buffer_info(AVCodecContext* avctx,
+                                       CFMutableDictionaryRef* dict)
+{
+    CFNumberRef cv_color_format_num = NULL;
+    CFNumberRef width_num = NULL;
+    CFNumberRef height_num = NULL;
+    CFMutableDictionaryRef pixel_buffer_info = NULL;
+    int cv_color_format;
+    int status = get_cv_pixel_format(avctx,
+                                     avctx->pix_fmt,
+                                     avctx->color_range,
+                                     &cv_color_format,
+                                     NULL);
+    if (status) return status;
+
+    pixel_buffer_info = CFDictionaryCreateMutable(
+                            kCFAllocatorDefault,
+                            20,
+                            &kCFCopyStringDictionaryKeyCallBacks,
+                            &kCFTypeDictionaryValueCallBacks);
+
+    if (!pixel_buffer_info) goto pbinfo_nomem;
+
+    cv_color_format_num = CFNumberCreate(kCFAllocatorDefault,
+                                         kCFNumberSInt32Type,
+                                         &cv_color_format);
+    if (!cv_color_format_num) goto pbinfo_nomem;
+
+    CFDictionarySetValue(pixel_buffer_info,
+                         kCVPixelBufferPixelFormatTypeKey,
+                         cv_color_format_num);
+    vt_release_num(&cv_color_format_num);
+
+    width_num = CFNumberCreate(kCFAllocatorDefault,
+                               kCFNumberSInt32Type,
+                               &avctx->width);
+    if (!width_num) return AVERROR(ENOMEM);
+
+    CFDictionarySetValue(pixel_buffer_info,
+                         kCVPixelBufferWidthKey,
+                         width_num);
+    vt_release_num(&width_num);
+
+    height_num = CFNumberCreate(kCFAllocatorDefault,
+                                kCFNumberSInt32Type,
+                                &avctx->height);
+    if (!height_num) goto pbinfo_nomem;
+
+    CFDictionarySetValue(pixel_buffer_info,
+                         kCVPixelBufferHeightKey,
+                         height_num);
+    vt_release_num(&height_num);
+
+    add_color_attr(avctx, pixel_buffer_info);
+
+    *dict = pixel_buffer_info;
+    return 0;
+
+pbinfo_nomem:
+    vt_release_num(&cv_color_format_num);
+    vt_release_num(&width_num);
+    vt_release_num(&height_num);
+    if (pixel_buffer_info) CFRelease(pixel_buffer_info);
+
+    return AVERROR(ENOMEM);
+}
+
+static int get_cv_color_primaries(AVCodecContext *avctx,
+                                  CFStringRef *primaries)
+{
+    enum AVColorPrimaries pri = avctx->color_primaries;
+    switch (pri) {
+        case AVCOL_PRI_UNSPECIFIED:
+            *primaries = NULL;
+            break;
+
+        case AVCOL_PRI_BT709:
+            *primaries = kCVImageBufferColorPrimaries_ITU_R_709_2;
+            break;
+
+        case AVCOL_PRI_BT2020:
+            *primaries = compat_keys.kCVImageBufferColorPrimaries_ITU_R_2020;
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Color primaries %s is not supported.\n", av_color_primaries_name(pri));
+            *primaries = NULL;
+            return -1;
+    }
+
+    return 0;
+}
+
+static int get_cv_transfer_function(AVCodecContext *avctx,
+                                    CFStringRef *transfer_fnc,
+                                    CFNumberRef *gamma_level)
+{
+    enum AVColorTransferCharacteristic trc = avctx->color_trc;
+    Float32 gamma;
+    *gamma_level = NULL;
+
+    switch (trc) {
+        case AVCOL_TRC_UNSPECIFIED:
+            *transfer_fnc = NULL;
+            break;
+
+        case AVCOL_TRC_BT709:
+            *transfer_fnc = kCVImageBufferTransferFunction_ITU_R_709_2;
+            break;
+
+        case AVCOL_TRC_SMPTE240M:
+            *transfer_fnc = kCVImageBufferTransferFunction_SMPTE_240M_1995;
+            break;
+
+        case AVCOL_TRC_GAMMA22:
+            gamma = 2.2;
+            *transfer_fnc = kCVImageBufferTransferFunction_UseGamma;
+            *gamma_level = CFNumberCreate(NULL, kCFNumberFloat32Type, &gamma);
+            break;
+
+        case AVCOL_TRC_GAMMA28:
+            gamma = 2.8;
+            *transfer_fnc = kCVImageBufferTransferFunction_UseGamma;
+            *gamma_level = CFNumberCreate(NULL, kCFNumberFloat32Type, &gamma);
+            break;
+
+        case AVCOL_TRC_BT2020_10:
+        case AVCOL_TRC_BT2020_12:
+            *transfer_fnc = compat_keys.kCVImageBufferTransferFunction_ITU_R_2020;
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Transfer function %s is not supported.\n", av_color_transfer_name(trc));
+            return -1;
+    }
+
+    return 0;
+}
+
+static int get_cv_ycbcr_matrix(AVCodecContext *avctx, CFStringRef *matrix) {
+    switch(avctx->colorspace) {
+        case AVCOL_SPC_BT709:
+            *matrix = kCVImageBufferYCbCrMatrix_ITU_R_709_2;
+            break;
+
+        case AVCOL_SPC_UNSPECIFIED:
+            *matrix = NULL;
+            break;
+
+        case AVCOL_SPC_BT470BG:
+        case AVCOL_SPC_SMPTE170M:
+            *matrix = kCVImageBufferYCbCrMatrix_ITU_R_601_4;
+            break;
+
+        case AVCOL_SPC_SMPTE240M:
+            *matrix = kCVImageBufferYCbCrMatrix_SMPTE_240M_1995;
+            break;
+
+        case AVCOL_SPC_BT2020_NCL:
+            *matrix = compat_keys.kCVImageBufferYCbCrMatrix_ITU_R_2020;
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Color space %s is not supported.\n", av_color_space_name(avctx->colorspace));
+            return -1;
+    }
+
+    return 0;
+}
+
+static int vtenc_create_encoder(AVCodecContext   *avctx,
+                                CMVideoCodecType codec_type,
+                                CFStringRef      profile_level,
+                                CFNumberRef      gamma_level,
+                                CFDictionaryRef  enc_info,
+                                CFDictionaryRef  pixel_buffer_info,
+                                VTCompressionSessionRef *session)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    SInt32       bit_rate = avctx->bit_rate;
+    SInt32       max_rate = avctx->rc_max_rate;
+    CFNumberRef  bit_rate_num;
+    CFNumberRef  bytes_per_second;
+    CFNumberRef  one_second;
+    CFArrayRef   data_rate_limits;
+    int64_t      bytes_per_second_value = 0;
+    int64_t      one_second_value = 0;
+    void         *nums[2];
+
+    int status = VTCompressionSessionCreate(kCFAllocatorDefault,
+                                            avctx->width,
+                                            avctx->height,
+                                            codec_type,
+                                            enc_info,
+                                            pixel_buffer_info,
+                                            kCFAllocatorDefault,
+                                            vtenc_output_callback,
+                                            avctx,
+                                            session);
+
+    if (status || !vtctx->session) {
+        av_log(avctx, AV_LOG_ERROR, "Error: cannot create compression session: %d\n", status);
+
+#if !TARGET_OS_IPHONE
+        if (!vtctx->allow_sw) {
+            av_log(avctx, AV_LOG_ERROR, "Try -allow_sw 1. The hardware encoder may be busy, or not supported.\n");
+        }
+#endif
+
+        return AVERROR_EXTERNAL;
+    }
+
+    bit_rate_num = CFNumberCreate(kCFAllocatorDefault,
+                                  kCFNumberSInt32Type,
+                                  &bit_rate);
+    if (!bit_rate_num) return AVERROR(ENOMEM);
+
+    status = VTSessionSetProperty(vtctx->session,
+                                  kVTCompressionPropertyKey_AverageBitRate,
+                                  bit_rate_num);
+    CFRelease(bit_rate_num);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting bitrate property: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    if (vtctx->codec_id == AV_CODEC_ID_H264 && max_rate > 0) {
+        // kVTCompressionPropertyKey_DataRateLimits is not available for HEVC
+        bytes_per_second_value = max_rate >> 3;
+        bytes_per_second = CFNumberCreate(kCFAllocatorDefault,
+                                          kCFNumberSInt64Type,
+                                          &bytes_per_second_value);
+        if (!bytes_per_second) {
+            return AVERROR(ENOMEM);
+        }
+        one_second_value = 1;
+        one_second = CFNumberCreate(kCFAllocatorDefault,
+                                    kCFNumberSInt64Type,
+                                    &one_second_value);
+        if (!one_second) {
+            CFRelease(bytes_per_second);
+            return AVERROR(ENOMEM);
+        }
+        nums[0] = (void *)bytes_per_second;
+        nums[1] = (void *)one_second;
+        data_rate_limits = CFArrayCreate(kCFAllocatorDefault,
+                                         (const void **)nums,
+                                         2,
+                                         &kCFTypeArrayCallBacks);
+
+        if (!data_rate_limits) {
+            CFRelease(bytes_per_second);
+            CFRelease(one_second);
+            return AVERROR(ENOMEM);
+        }
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_DataRateLimits,
+                                      data_rate_limits);
+
+        CFRelease(bytes_per_second);
+        CFRelease(one_second);
+        CFRelease(data_rate_limits);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting max bitrate property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (vtctx->codec_id == AV_CODEC_ID_H264) {
+        // kVTCompressionPropertyKey_ProfileLevel is not available for HEVC
+        if (profile_level) {
+            status = VTSessionSetProperty(vtctx->session,
+                                        kVTCompressionPropertyKey_ProfileLevel,
+                                        profile_level);
+            if (status) {
+                av_log(avctx, AV_LOG_ERROR, "Error setting profile/level property: %d\n", status);
+            }
+        }
+    }
+
+    if (avctx->gop_size > 0) {
+        CFNumberRef interval = CFNumberCreate(kCFAllocatorDefault,
+                                              kCFNumberIntType,
+                                              &avctx->gop_size);
+        if (!interval) {
+            return AVERROR(ENOMEM);
+        }
+
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_MaxKeyFrameInterval,
+                                      interval);
+        CFRelease(interval);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting 'max key-frame interval' property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (vtctx->frames_before) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_MoreFramesBeforeStart,
+                                      kCFBooleanTrue);
+
+        if (status == kVTPropertyNotSupportedErr) {
+            av_log(avctx, AV_LOG_WARNING, "frames_before property is not supported on this device. Ignoring.\n");
+        } else if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting frames_before property: %d\n", status);
+        }
+    }
+
+    if (vtctx->frames_after) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_MoreFramesAfterEnd,
+                                      kCFBooleanTrue);
+
+        if (status == kVTPropertyNotSupportedErr) {
+            av_log(avctx, AV_LOG_WARNING, "frames_after property is not supported on this device. Ignoring.\n");
+        } else if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting frames_after property: %d\n", status);
+        }
+    }
+
+    if (avctx->sample_aspect_ratio.num != 0) {
+        CFNumberRef num;
+        CFNumberRef den;
+        CFMutableDictionaryRef par;
+        AVRational *avpar = &avctx->sample_aspect_ratio;
+
+        av_reduce(&avpar->num, &avpar->den,
+                   avpar->num,  avpar->den,
+                  0xFFFFFFFF);
+
+        num = CFNumberCreate(kCFAllocatorDefault,
+                             kCFNumberIntType,
+                             &avpar->num);
+
+        den = CFNumberCreate(kCFAllocatorDefault,
+                             kCFNumberIntType,
+                             &avpar->den);
+
+
+
+        par = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                        2,
+                                        &kCFCopyStringDictionaryKeyCallBacks,
+                                        &kCFTypeDictionaryValueCallBacks);
+
+        if (!par || !num || !den) {
+            if (par) CFRelease(par);
+            if (num) CFRelease(num);
+            if (den) CFRelease(den);
+
+            return AVERROR(ENOMEM);
+        }
+
+        CFDictionarySetValue(
+            par,
+            kCMFormatDescriptionKey_PixelAspectRatioHorizontalSpacing,
+            num);
+
+        CFDictionarySetValue(
+            par,
+            kCMFormatDescriptionKey_PixelAspectRatioVerticalSpacing,
+            den);
+
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_PixelAspectRatio,
+                                      par);
+
+        CFRelease(par);
+        CFRelease(num);
+        CFRelease(den);
+
+        if (status) {
+            av_log(avctx,
+                   AV_LOG_ERROR,
+                   "Error setting pixel aspect ratio to %d:%d: %d.\n",
+                   avctx->sample_aspect_ratio.num,
+                   avctx->sample_aspect_ratio.den,
+                   status);
+
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+
+    if (vtctx->transfer_function) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_TransferFunction,
+                                      vtctx->transfer_function);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set transfer function: %d\n", status);
+        }
+    }
+
+
+    if (vtctx->ycbcr_matrix) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_YCbCrMatrix,
+                                      vtctx->ycbcr_matrix);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set ycbcr matrix: %d\n", status);
+        }
+    }
+
+
+    if (vtctx->color_primaries) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_ColorPrimaries,
+                                      vtctx->color_primaries);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set color primaries: %d\n", status);
+        }
+    }
+
+    if (gamma_level) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kCVImageBufferGammaLevelKey,
+                                      gamma_level);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set gamma level: %d\n", status);
+        }
+    }
+
+    if (!vtctx->has_b_frames) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_AllowFrameReordering,
+                                      kCFBooleanFalse);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting 'allow frame reordering' property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (vtctx->entropy != VT_ENTROPY_NOT_SET) {
+        CFStringRef entropy = vtctx->entropy == VT_CABAC ?
+                                compat_keys.kVTH264EntropyMode_CABAC:
+                                compat_keys.kVTH264EntropyMode_CAVLC;
+
+        status = VTSessionSetProperty(vtctx->session,
+                                      compat_keys.kVTCompressionPropertyKey_H264EntropyMode,
+                                      entropy);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting entropy property: %d\n", status);
+        }
+    }
+
+    if (vtctx->realtime) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      compat_keys.kVTCompressionPropertyKey_RealTime,
+                                      kCFBooleanTrue);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting realtime property: %d\n", status);
+        }
+    }
+
+    status = VTCompressionSessionPrepareToEncodeFrames(vtctx->session);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error: cannot prepare encoder: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+static int vtenc_configure_encoder(AVCodecContext *avctx)
+{
+    CFMutableDictionaryRef enc_info;
+    CFMutableDictionaryRef pixel_buffer_info;
+    CMVideoCodecType       codec_type;
+    VTEncContext           *vtctx = avctx->priv_data;
+    CFStringRef            profile_level;
+    CFNumberRef            gamma_level = NULL;
+    int                    status;
+
+    codec_type = get_cm_codec_type(avctx->codec_id);
+    if (!codec_type) {
+        av_log(avctx, AV_LOG_ERROR, "Error: no mapping for AVCodecID %d\n", avctx->codec_id);
+        return AVERROR(EINVAL);
+    }
+
+    vtctx->codec_id = avctx->codec_id;
+
+    if (vtctx->codec_id == AV_CODEC_ID_H264) {
+        vtctx->get_param_set_func = CMVideoFormatDescriptionGetH264ParameterSetAtIndex;
+
+        vtctx->has_b_frames = avctx->max_b_frames > 0;
+        if(vtctx->has_b_frames && vtctx->profile == H264_PROF_BASELINE){
+            av_log(avctx, AV_LOG_WARNING, "Cannot use B-frames with baseline profile. Output will not contain B-frames.\n");
+            vtctx->has_b_frames = false;
+        }
+
+        if (vtctx->entropy == VT_CABAC && vtctx->profile == H264_PROF_BASELINE) {
+            av_log(avctx, AV_LOG_WARNING, "CABAC entropy requires 'main' or 'high' profile, but baseline was requested. Encode will not use CABAC entropy.\n");
+            vtctx->entropy = VT_ENTROPY_NOT_SET;
+        }
+
+        if (!get_vt_h264_profile_level(avctx, &profile_level)) return AVERROR(EINVAL);
+    } else {
+        vtctx->get_param_set_func = compat_keys.CMVideoFormatDescriptionGetHEVCParameterSetAtIndex;
+        if (!vtctx->get_param_set_func) return AVERROR(EINVAL);
+        if (!get_vt_hevc_profile_level(avctx, &profile_level)) return AVERROR(EINVAL);
+    }
+
+    enc_info = CFDictionaryCreateMutable(
+        kCFAllocatorDefault,
+        20,
+        &kCFCopyStringDictionaryKeyCallBacks,
+        &kCFTypeDictionaryValueCallBacks
+    );
+
+    if (!enc_info) return AVERROR(ENOMEM);
+
+#if !TARGET_OS_IPHONE
+    if (!vtctx->allow_sw) {
+        CFDictionarySetValue(enc_info,
+                             compat_keys.kVTVideoEncoderSpecification_RequireHardwareAcceleratedVideoEncoder,
+                             kCFBooleanTrue);
+    } else {
+        CFDictionarySetValue(enc_info,
+                             compat_keys.kVTVideoEncoderSpecification_EnableHardwareAcceleratedVideoEncoder,
+                             kCFBooleanTrue);
+    }
+#endif
+
+    if (avctx->pix_fmt != AV_PIX_FMT_VIDEOTOOLBOX) {
+        status = create_cv_pixel_buffer_info(avctx, &pixel_buffer_info);
+        if (status)
+            goto init_cleanup;
+    } else {
+        pixel_buffer_info = NULL;
+    }
+
+    vtctx->dts_delta = vtctx->has_b_frames ? -1 : 0;
+
+    get_cv_transfer_function(avctx, &vtctx->transfer_function, &gamma_level);
+    get_cv_ycbcr_matrix(avctx, &vtctx->ycbcr_matrix);
+    get_cv_color_primaries(avctx, &vtctx->color_primaries);
+
+
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+        status = vtenc_populate_extradata(avctx,
+                                          codec_type,
+                                          profile_level,
+                                          gamma_level,
+                                          enc_info,
+                                          pixel_buffer_info);
+        if (status)
+            goto init_cleanup;
+    }
+
+    status = vtenc_create_encoder(avctx,
+                                  codec_type,
+                                  profile_level,
+                                  gamma_level,
+                                  enc_info,
+                                  pixel_buffer_info,
+                                  &vtctx->session);
+
+init_cleanup:
+    if (gamma_level)
+        CFRelease(gamma_level);
+
+    if (pixel_buffer_info)
+        CFRelease(pixel_buffer_info);
+
+    CFRelease(enc_info);
+
+    return status;
+}
+
+static av_cold int vtenc_init(AVCodecContext *avctx)
+{
+    VTEncContext    *vtctx = avctx->priv_data;
+    CFBooleanRef    has_b_frames_cfbool;
+    int             status;
+
+    pthread_once(&once_ctrl, loadVTEncSymbols);
+
+    pthread_mutex_init(&vtctx->lock, NULL);
+    pthread_cond_init(&vtctx->cv_sample_sent, NULL);
+
+    vtctx->session = NULL;
+    status = vtenc_configure_encoder(avctx);
+    if (status) return status;
+
+    status = VTSessionCopyProperty(vtctx->session,
+                                   kVTCompressionPropertyKey_AllowFrameReordering,
+                                   kCFAllocatorDefault,
+                                   &has_b_frames_cfbool);
+
+    if (!status && has_b_frames_cfbool) {
+        //Some devices don't output B-frames for main profile, even if requested.
+        vtctx->has_b_frames = CFBooleanGetValue(has_b_frames_cfbool);
+        CFRelease(has_b_frames_cfbool);
+    }
+    avctx->has_b_frames = vtctx->has_b_frames;
+
+    return 0;
+}
+
+static void vtenc_get_frame_info(CMSampleBufferRef buffer, bool *is_key_frame)
+{
+    CFArrayRef      attachments;
+    CFDictionaryRef attachment;
+    CFBooleanRef    not_sync;
+    CFIndex         len;
+
+    attachments = CMSampleBufferGetSampleAttachmentsArray(buffer, false);
+    len = !attachments ? 0 : CFArrayGetCount(attachments);
+
+    if (!len) {
+        *is_key_frame = true;
+        return;
+    }
+
+    attachment = CFArrayGetValueAtIndex(attachments, 0);
+
+    if (CFDictionaryGetValueIfPresent(attachment,
+                                      kCMSampleAttachmentKey_NotSync,
+                                      (const void **)&not_sync))
+    {
+        *is_key_frame = !CFBooleanGetValue(not_sync);
+    } else {
+        *is_key_frame = true;
+    }
+}
+
+static int is_post_sei_nal_type(int nal_type){
+    return nal_type != H264_NAL_SEI &&
+           nal_type != H264_NAL_SPS &&
+           nal_type != H264_NAL_PPS &&
+           nal_type != H264_NAL_AUD;
+}
+
+/*
+ * Finds the sei message start/size of type find_sei_type.
+ * If more than one of that type exists, the last one is returned.
+ */
+static int find_sei_end(AVCodecContext *avctx,
+                        uint8_t        *nal_data,
+                        size_t          nal_size,
+                        uint8_t       **sei_end)
+{
+    int nal_type;
+    size_t sei_payload_size = 0;
+    int sei_payload_type = 0;
+    *sei_end = NULL;
+    uint8_t *nal_start = nal_data;
+
+    if (!nal_size)
+        return 0;
+
+    nal_type = *nal_data & 0x1F;
+    if (nal_type != H264_NAL_SEI)
+        return 0;
+
+    nal_data++;
+    nal_size--;
+
+    if (nal_data[nal_size - 1] == 0x80)
+        nal_size--;
+
+    while (nal_size > 0 && *nal_data > 0) {
+        do{
+            sei_payload_type += *nal_data;
+            nal_data++;
+            nal_size--;
+        } while (nal_size > 0 && *nal_data == 0xFF);
+
+        if (!nal_size) {
+            av_log(avctx, AV_LOG_ERROR, "Unexpected end of SEI NAL Unit parsing type.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        do{
+            sei_payload_size += *nal_data;
+            nal_data++;
+            nal_size--;
+        } while (nal_size > 0 && *nal_data == 0xFF);
+
+        if (nal_size < sei_payload_size) {
+            av_log(avctx, AV_LOG_ERROR, "Unexpected end of SEI NAL Unit parsing size.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        nal_data += sei_payload_size;
+        nal_size -= sei_payload_size;
+    }
+
+    *sei_end = nal_data;
+
+    return nal_data - nal_start + 1;
+}
+
+/**
+ * Copies the data inserting emulation prevention bytes as needed.
+ * Existing data in the destination can be taken into account by providing
+ * dst with a dst_offset > 0.
+ *
+ * @return The number of bytes copied on success. On failure, the negative of
+ *         the number of bytes needed to copy src is returned.
+ */
+static int copy_emulation_prev(const uint8_t *src,
+                               size_t         src_size,
+                               uint8_t       *dst,
+                               ssize_t        dst_offset,
+                               size_t         dst_size)
+{
+    int zeros = 0;
+    int wrote_bytes;
+    uint8_t* dst_start;
+    uint8_t* dst_end = dst + dst_size;
+    const uint8_t* src_end = src + src_size;
+    int start_at = dst_offset > 2 ? dst_offset - 2 : 0;
+    int i;
+    for (i = start_at; i < dst_offset && i < dst_size; i++) {
+        if (!dst[i])
+            zeros++;
+        else
+            zeros = 0;
+    }
+
+    dst += dst_offset;
+    dst_start = dst;
+    for (; src < src_end; src++, dst++) {
+        if (zeros == 2) {
+            int insert_ep3_byte = *src <= 3;
+            if (insert_ep3_byte) {
+                if (dst < dst_end)
+                    *dst = 3;
+                dst++;
+            }
+
+            zeros = 0;
+        }
+
+        if (dst < dst_end)
+            *dst = *src;
+
+        if (!*src)
+            zeros++;
+        else
+            zeros = 0;
+    }
+
+    wrote_bytes = dst - dst_start;
+
+    if (dst > dst_end)
+        return -wrote_bytes;
+
+    return wrote_bytes;
+}
+
+static int write_sei(const ExtraSEI *sei,
+                     int             sei_type,
+                     uint8_t        *dst,
+                     size_t          dst_size)
+{
+    uint8_t *sei_start = dst;
+    size_t remaining_sei_size = sei->size;
+    size_t remaining_dst_size = dst_size;
+    int header_bytes;
+    int bytes_written;
+    ssize_t offset;
+
+    if (!remaining_dst_size)
+        return AVERROR_BUFFER_TOO_SMALL;
+
+    while (sei_type && remaining_dst_size != 0) {
+        int sei_byte = sei_type > 255 ? 255 : sei_type;
+        *dst = sei_byte;
+
+        sei_type -= sei_byte;
+        dst++;
+        remaining_dst_size--;
+    }
+
+    if (!dst_size)
+        return AVERROR_BUFFER_TOO_SMALL;
+
+    while (remaining_sei_size && remaining_dst_size != 0) {
+        int size_byte = remaining_sei_size > 255 ? 255 : remaining_sei_size;
+        *dst = size_byte;
+
+        remaining_sei_size -= size_byte;
+        dst++;
+        remaining_dst_size--;
+    }
+
+    if (remaining_dst_size < sei->size)
+        return AVERROR_BUFFER_TOO_SMALL;
+
+    header_bytes = dst - sei_start;
+
+    offset = header_bytes;
+    bytes_written = copy_emulation_prev(sei->data,
+                                        sei->size,
+                                        sei_start,
+                                        offset,
+                                        dst_size);
+    if (bytes_written < 0)
+        return AVERROR_BUFFER_TOO_SMALL;
+
+    bytes_written += header_bytes;
+    return bytes_written;
+}
+
+/**
+ * Copies NAL units and replaces length codes with
+ * H.264 Annex B start codes. On failure, the contents of
+ * dst_data may have been modified.
+ *
+ * @param length_code_size Byte length of each length code
+ * @param sample_buffer NAL units prefixed with length codes.
+ * @param sei Optional A53 closed captions SEI data.
+ * @param dst_data Must be zeroed before calling this function.
+ *                 Contains the copied NAL units prefixed with
+ *                 start codes when the function returns
+ *                 successfully.
+ * @param dst_size Length of dst_data
+ * @return 0 on success
+ *         AVERROR_INVALIDDATA if length_code_size is invalid
+ *         AVERROR_BUFFER_TOO_SMALL if dst_data is too small
+ *         or if a length_code in src_data specifies data beyond
+ *         the end of its buffer.
+ */
+static int copy_replace_length_codes(
+    AVCodecContext *avctx,
+    size_t        length_code_size,
+    CMSampleBufferRef sample_buffer,
+    ExtraSEI      *sei,
+    uint8_t       *dst_data,
+    size_t        dst_size)
+{
+    size_t src_size = CMSampleBufferGetTotalSampleSize(sample_buffer);
+    size_t remaining_src_size = src_size;
+    size_t remaining_dst_size = dst_size;
+    size_t src_offset = 0;
+    int wrote_sei = 0;
+    int status;
+    uint8_t size_buf[4];
+    uint8_t nal_type;
+    CMBlockBufferRef block = CMSampleBufferGetDataBuffer(sample_buffer);
+
+    if (length_code_size > 4) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    while (remaining_src_size > 0) {
+        size_t curr_src_len;
+        size_t curr_dst_len;
+        size_t box_len = 0;
+        size_t i;
+
+        uint8_t       *dst_box;
+
+        status = CMBlockBufferCopyDataBytes(block,
+                                            src_offset,
+                                            length_code_size,
+                                            size_buf);
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot copy length: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+
+        status = CMBlockBufferCopyDataBytes(block,
+                                            src_offset + length_code_size,
+                                            1,
+                                            &nal_type);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot copy type: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+
+        nal_type &= 0x1F;
+
+        for (i = 0; i < length_code_size; i++) {
+            box_len <<= 8;
+            box_len |= size_buf[i];
+        }
+
+        if (sei && !wrote_sei && is_post_sei_nal_type(nal_type)) {
+            //No SEI NAL unit - insert.
+            int wrote_bytes;
+
+            memcpy(dst_data, start_code, sizeof(start_code));
+            dst_data += sizeof(start_code);
+            remaining_dst_size -= sizeof(start_code);
+
+            *dst_data = H264_NAL_SEI;
+            dst_data++;
+            remaining_dst_size--;
+
+            wrote_bytes = write_sei(sei,
+                                    H264_SEI_TYPE_USER_DATA_REGISTERED,
+                                    dst_data,
+                                    remaining_dst_size);
+
+            if (wrote_bytes < 0)
+                return wrote_bytes;
+
+            remaining_dst_size -= wrote_bytes;
+            dst_data += wrote_bytes;
+
+            if (remaining_dst_size <= 0)
+                return AVERROR_BUFFER_TOO_SMALL;
+
+            *dst_data = 0x80;
+
+            dst_data++;
+            remaining_dst_size--;
+
+            wrote_sei = 1;
+        }
+
+        curr_src_len = box_len + length_code_size;
+        curr_dst_len = box_len + sizeof(start_code);
+
+        if (remaining_src_size < curr_src_len) {
+            return AVERROR_BUFFER_TOO_SMALL;
+        }
+
+        if (remaining_dst_size < curr_dst_len) {
+            return AVERROR_BUFFER_TOO_SMALL;
+        }
+
+        dst_box = dst_data + sizeof(start_code);
+
+        memcpy(dst_data, start_code, sizeof(start_code));
+        status = CMBlockBufferCopyDataBytes(block,
+                                            src_offset + length_code_size,
+                                            box_len,
+                                            dst_box);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot copy data: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+
+        if (sei && !wrote_sei && nal_type == H264_NAL_SEI) {
+            //Found SEI NAL unit - append.
+            int wrote_bytes;
+            int old_sei_length;
+            int extra_bytes;
+            uint8_t *new_sei;
+            old_sei_length = find_sei_end(avctx, dst_box, box_len, &new_sei);
+            if (old_sei_length < 0)
+                return status;
+
+            wrote_bytes = write_sei(sei,
+                                    H264_SEI_TYPE_USER_DATA_REGISTERED,
+                                    new_sei,
+                                    remaining_dst_size - old_sei_length);
+            if (wrote_bytes < 0)
+                return wrote_bytes;
+
+            if (new_sei + wrote_bytes >= dst_data + remaining_dst_size)
+                return AVERROR_BUFFER_TOO_SMALL;
+
+            new_sei[wrote_bytes++] = 0x80;
+            extra_bytes = wrote_bytes - (dst_box + box_len - new_sei);
+
+            dst_data += extra_bytes;
+            remaining_dst_size -= extra_bytes;
+
+            wrote_sei = 1;
+        }
+
+        src_offset += curr_src_len;
+        dst_data += curr_dst_len;
+
+        remaining_src_size -= curr_src_len;
+        remaining_dst_size -= curr_dst_len;
+    }
+
+    return 0;
+}
+
+/**
+ * Returns a sufficient number of bytes to contain the sei data.
+ * It may be greater than the minimum required.
+ */
+static int get_sei_msg_bytes(const ExtraSEI* sei, int type){
+    int copied_size;
+    if (sei->size == 0)
+        return 0;
+
+    copied_size = -copy_emulation_prev(sei->data,
+                                       sei->size,
+                                       NULL,
+                                       0,
+                                       0);
+
+    if ((sei->size % 255) == 0) //may result in an extra byte
+        copied_size++;
+
+    return copied_size + sei->size / 255 + 1 + type / 255 + 1;
+}
+
+static int vtenc_cm_to_avpacket(
+    AVCodecContext    *avctx,
+    CMSampleBufferRef sample_buffer,
+    AVPacket          *pkt,
+    ExtraSEI          *sei)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+
+    int     status;
+    bool    is_key_frame;
+    bool    add_header;
+    size_t  length_code_size;
+    size_t  header_size = 0;
+    size_t  in_buf_size;
+    size_t  out_buf_size;
+    size_t  sei_nalu_size = 0;
+    int64_t dts_delta;
+    int64_t time_base_num;
+    int nalu_count;
+    CMTime  pts;
+    CMTime  dts;
+    CMVideoFormatDescriptionRef vid_fmt;
+
+
+    vtenc_get_frame_info(sample_buffer, &is_key_frame);
+    status = get_length_code_size(avctx, sample_buffer, &length_code_size);
+    if (status) return status;
+
+    add_header = is_key_frame && !(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
+
+    if (add_header) {
+        vid_fmt = CMSampleBufferGetFormatDescription(sample_buffer);
+        if (!vid_fmt) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot get format description.\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        int status = get_params_size(avctx, vid_fmt, &header_size);
+        if (status) return status;
+    }
+
+    status = count_nalus(length_code_size, sample_buffer, &nalu_count);
+    if(status)
+        return status;
+
+    if (sei) {
+        size_t msg_size = get_sei_msg_bytes(sei,
+                                            H264_SEI_TYPE_USER_DATA_REGISTERED);
+
+        sei_nalu_size = sizeof(start_code) + 1 + msg_size + 1;
+    }
+
+    in_buf_size = CMSampleBufferGetTotalSampleSize(sample_buffer);
+    out_buf_size = header_size +
+                   in_buf_size +
+                   sei_nalu_size +
+                   nalu_count * ((int)sizeof(start_code) - (int)length_code_size);
+
+    status = ff_alloc_packet2(avctx, pkt, out_buf_size, out_buf_size);
+    if (status < 0)
+        return status;
+
+    if (add_header) {
+        status = copy_param_sets(avctx, vid_fmt, pkt->data, out_buf_size);
+        if(status) return status;
+    }
+
+    status = copy_replace_length_codes(
+        avctx,
+        length_code_size,
+        sample_buffer,
+        sei,
+        pkt->data + header_size,
+        pkt->size - header_size
+    );
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error copying packet data: %d\n", status);
+        return status;
+    }
+
+    if (is_key_frame) {
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    }
+
+    pts = CMSampleBufferGetPresentationTimeStamp(sample_buffer);
+    dts = CMSampleBufferGetDecodeTimeStamp      (sample_buffer);
+
+    if (CMTIME_IS_INVALID(dts)) {
+        if (!vtctx->has_b_frames) {
+            dts = pts;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "DTS is invalid.\n");
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    dts_delta = vtctx->dts_delta >= 0 ? vtctx->dts_delta : 0;
+    time_base_num = avctx->time_base.num;
+    pkt->pts = pts.value / time_base_num;
+    pkt->dts = dts.value / time_base_num - dts_delta;
+    pkt->size = out_buf_size;
+
+    return 0;
+}
+
+/*
+ * contiguous_buf_size is 0 if not contiguous, and the size of the buffer
+ * containing all planes if so.
+ */
+static int get_cv_pixel_info(
+    AVCodecContext *avctx,
+    const AVFrame  *frame,
+    int            *color,
+    int            *plane_count,
+    size_t         *widths,
+    size_t         *heights,
+    size_t         *strides,
+    size_t         *contiguous_buf_size)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    int av_format       = frame->format;
+    int av_color_range  = frame->color_range;
+    int i;
+    int range_guessed;
+    int status;
+
+    status = get_cv_pixel_format(avctx, av_format, av_color_range, color, &range_guessed);
+    if (status) {
+        av_log(avctx,
+            AV_LOG_ERROR,
+            "Could not get pixel format for color format '%s' range '%s'.\n",
+            av_get_pix_fmt_name(av_format),
+            av_color_range > AVCOL_RANGE_UNSPECIFIED &&
+            av_color_range < AVCOL_RANGE_NB ?
+               av_color_range_name(av_color_range) :
+               "Unknown");
+
+        return AVERROR(EINVAL);
+    }
+
+    if (range_guessed) {
+        if (!vtctx->warned_color_range) {
+            vtctx->warned_color_range = true;
+            av_log(avctx,
+                   AV_LOG_WARNING,
+                   "Color range not set for %s. Using MPEG range.\n",
+                   av_get_pix_fmt_name(av_format));
+        }
+    }
+
+    switch (av_format) {
+    case AV_PIX_FMT_NV12:
+        *plane_count = 2;
+
+        widths [0] = avctx->width;
+        heights[0] = avctx->height;
+        strides[0] = frame ? frame->linesize[0] : avctx->width;
+
+        widths [1] = (avctx->width  + 1) / 2;
+        heights[1] = (avctx->height + 1) / 2;
+        strides[1] = frame ? frame->linesize[1] : (avctx->width + 1) & -2;
+        break;
+
+    case AV_PIX_FMT_YUV420P:
+        *plane_count = 3;
+
+        widths [0] = avctx->width;
+        heights[0] = avctx->height;
+        strides[0] = frame ? frame->linesize[0] : avctx->width;
+
+        widths [1] = (avctx->width  + 1) / 2;
+        heights[1] = (avctx->height + 1) / 2;
+        strides[1] = frame ? frame->linesize[1] : (avctx->width + 1) / 2;
+
+        widths [2] = (avctx->width  + 1) / 2;
+        heights[2] = (avctx->height + 1) / 2;
+        strides[2] = frame ? frame->linesize[2] : (avctx->width + 1) / 2;
+        break;
+
+    default:
+        av_log(
+               avctx,
+               AV_LOG_ERROR,
+               "Could not get frame format info for color %d range %d.\n",
+               av_format,
+               av_color_range);
+
+        return AVERROR(EINVAL);
+    }
+
+    *contiguous_buf_size = 0;
+    for (i = 0; i < *plane_count; i++) {
+        if (i < *plane_count - 1 &&
+            frame->data[i] + strides[i] * heights[i] != frame->data[i + 1]) {
+            *contiguous_buf_size = 0;
+            break;
+        }
+
+        *contiguous_buf_size += strides[i] * heights[i];
+    }
+
+    return 0;
+}
+
+#if !TARGET_OS_IPHONE
+//Not used on iOS - frame is always copied.
+static void free_avframe(
+    void       *release_ctx,
+    const void *data,
+    size_t      size,
+    size_t      plane_count,
+    const void *plane_addresses[])
+{
+    AVFrame *frame = release_ctx;
+    av_frame_free(&frame);
+}
+#else
+//Not used on OSX - frame is never copied.
+static int copy_avframe_to_pixel_buffer(AVCodecContext   *avctx,
+                                        const AVFrame    *frame,
+                                        CVPixelBufferRef cv_img,
+                                        const size_t     *plane_strides,
+                                        const size_t     *plane_rows)
+{
+    int i, j;
+    size_t plane_count;
+    int status;
+    int rows;
+    int src_stride;
+    int dst_stride;
+    uint8_t *src_addr;
+    uint8_t *dst_addr;
+    size_t copy_bytes;
+
+    status = CVPixelBufferLockBaseAddress(cv_img, 0);
+    if (status) {
+        av_log(
+            avctx,
+            AV_LOG_ERROR,
+            "Error: Could not lock base address of CVPixelBuffer: %d.\n",
+            status
+        );
+    }
+
+    if (CVPixelBufferIsPlanar(cv_img)) {
+        plane_count = CVPixelBufferGetPlaneCount(cv_img);
+        for (i = 0; frame->data[i]; i++) {
+            if (i == plane_count) {
+                CVPixelBufferUnlockBaseAddress(cv_img, 0);
+                av_log(avctx,
+                    AV_LOG_ERROR,
+                    "Error: different number of planes in AVFrame and CVPixelBuffer.\n"
+                );
+
+                return AVERROR_EXTERNAL;
+            }
+
+            dst_addr = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(cv_img, i);
+            src_addr = (uint8_t*)frame->data[i];
+            dst_stride = CVPixelBufferGetBytesPerRowOfPlane(cv_img, i);
+            src_stride = plane_strides[i];
+            rows = plane_rows[i];
+
+            if (dst_stride == src_stride) {
+                memcpy(dst_addr, src_addr, src_stride * rows);
+            } else {
+                copy_bytes = dst_stride < src_stride ? dst_stride : src_stride;
+
+                for (j = 0; j < rows; j++) {
+                    memcpy(dst_addr + j * dst_stride, src_addr + j * src_stride, copy_bytes);
+                }
+            }
+        }
+    } else {
+        if (frame->data[1]) {
+            CVPixelBufferUnlockBaseAddress(cv_img, 0);
+            av_log(avctx,
+                AV_LOG_ERROR,
+                "Error: different number of planes in AVFrame and non-planar CVPixelBuffer.\n"
+            );
+
+            return AVERROR_EXTERNAL;
+        }
+
+        dst_addr = (uint8_t*)CVPixelBufferGetBaseAddress(cv_img);
+        src_addr = (uint8_t*)frame->data[0];
+        dst_stride = CVPixelBufferGetBytesPerRow(cv_img);
+        src_stride = plane_strides[0];
+        rows = plane_rows[0];
+
+        if (dst_stride == src_stride) {
+            memcpy(dst_addr, src_addr, src_stride * rows);
+        } else {
+            copy_bytes = dst_stride < src_stride ? dst_stride : src_stride;
+
+            for (j = 0; j < rows; j++) {
+                memcpy(dst_addr + j * dst_stride, src_addr + j * src_stride, copy_bytes);
+            }
+        }
+    }
+
+    status = CVPixelBufferUnlockBaseAddress(cv_img, 0);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error: Could not unlock CVPixelBuffer base address: %d.\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+#endif //!TARGET_OS_IPHONE
+
+static int create_cv_pixel_buffer(AVCodecContext   *avctx,
+                                  const AVFrame    *frame,
+                                  CVPixelBufferRef *cv_img)
+{
+    int plane_count;
+    int color;
+    size_t widths [AV_NUM_DATA_POINTERS];
+    size_t heights[AV_NUM_DATA_POINTERS];
+    size_t strides[AV_NUM_DATA_POINTERS];
+    int status;
+    size_t contiguous_buf_size;
+#if TARGET_OS_IPHONE
+    CVPixelBufferPoolRef pix_buf_pool;
+    VTEncContext* vtctx = avctx->priv_data;
+#else
+    CFMutableDictionaryRef pix_buf_attachments = CFDictionaryCreateMutable(
+                                                   kCFAllocatorDefault,
+                                                   10,
+                                                   &kCFCopyStringDictionaryKeyCallBacks,
+                                                   &kCFTypeDictionaryValueCallBacks);
+
+    if (!pix_buf_attachments) return AVERROR(ENOMEM);
+#endif
+
+    if (avctx->pix_fmt == AV_PIX_FMT_VIDEOTOOLBOX) {
+        av_assert0(frame->format == AV_PIX_FMT_VIDEOTOOLBOX);
+
+        *cv_img = (CVPixelBufferRef)frame->data[3];
+        av_assert0(*cv_img);
+
+        CFRetain(*cv_img);
+        return 0;
+    }
+
+    memset(widths,  0, sizeof(widths));
+    memset(heights, 0, sizeof(heights));
+    memset(strides, 0, sizeof(strides));
+
+    status = get_cv_pixel_info(
+        avctx,
+        frame,
+        &color,
+        &plane_count,
+        widths,
+        heights,
+        strides,
+        &contiguous_buf_size
+    );
+
+    if (status) {
+        av_log(
+            avctx,
+            AV_LOG_ERROR,
+            "Error: Cannot convert format %d color_range %d: %d\n",
+            frame->format,
+            frame->color_range,
+            status
+        );
+
+        return AVERROR_EXTERNAL;
+    }
+
+#if TARGET_OS_IPHONE
+    pix_buf_pool = VTCompressionSessionGetPixelBufferPool(vtctx->session);
+    if (!pix_buf_pool) {
+        /* On iOS, the VT session is invalidated when the APP switches from
+         * foreground to background and vice versa. Fetch the actual error code
+         * of the VT session to detect that case and restart the VT session
+         * accordingly. */
+        OSStatus vtstatus;
+
+        vtstatus = VTCompressionSessionPrepareToEncodeFrames(vtctx->session);
+        if (vtstatus == kVTInvalidSessionErr) {
+            CFRelease(vtctx->session);
+            vtctx->session = NULL;
+            status = vtenc_configure_encoder(avctx);
+            if (status == 0)
+                pix_buf_pool = VTCompressionSessionGetPixelBufferPool(vtctx->session);
+        }
+        if (!pix_buf_pool) {
+            av_log(avctx, AV_LOG_ERROR, "Could not get pixel buffer pool.\n");
+            return AVERROR_EXTERNAL;
+        }
+        else
+            av_log(avctx, AV_LOG_WARNING, "VT session restarted because of a "
+                   "kVTInvalidSessionErr error.\n");
+    }
+
+    status = CVPixelBufferPoolCreatePixelBuffer(NULL,
+                                                pix_buf_pool,
+                                                cv_img);
+
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Could not create pixel buffer from pool: %d.\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    status = copy_avframe_to_pixel_buffer(avctx, frame, *cv_img, strides, heights);
+    if (status) {
+        CFRelease(*cv_img);
+        *cv_img = NULL;
+        return status;
+    }
+#else
+    AVFrame *enc_frame = av_frame_alloc();
+    if (!enc_frame) return AVERROR(ENOMEM);
+
+    status = av_frame_ref(enc_frame, frame);
+    if (status) {
+        av_frame_free(&enc_frame);
+        return status;
+    }
+
+    status = CVPixelBufferCreateWithPlanarBytes(
+        kCFAllocatorDefault,
+        enc_frame->width,
+        enc_frame->height,
+        color,
+        NULL,
+        contiguous_buf_size,
+        plane_count,
+        (void **)enc_frame->data,
+        widths,
+        heights,
+        strides,
+        free_avframe,
+        enc_frame,
+        NULL,
+        cv_img
+    );
+
+    add_color_attr(avctx, pix_buf_attachments);
+    CVBufferSetAttachments(*cv_img, pix_buf_attachments, kCVAttachmentMode_ShouldPropagate);
+    CFRelease(pix_buf_attachments);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error: Could not create CVPixelBuffer: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+#endif
+
+    return 0;
+}
+
+static int create_encoder_dict_h264(const AVFrame *frame,
+                                    CFDictionaryRef* dict_out)
+{
+    CFDictionaryRef dict = NULL;
+    if (frame->pict_type == AV_PICTURE_TYPE_I) {
+        const void *keys[] = { kVTEncodeFrameOptionKey_ForceKeyFrame };
+        const void *vals[] = { kCFBooleanTrue };
+
+        dict = CFDictionaryCreate(NULL, keys, vals, 1, NULL, NULL);
+        if(!dict) return AVERROR(ENOMEM);
+    }
+
+    *dict_out = dict;
+    return 0;
+}
+
+static int vtenc_send_frame(AVCodecContext *avctx,
+                            VTEncContext   *vtctx,
+                            const AVFrame  *frame)
+{
+    CMTime time;
+    CFDictionaryRef frame_dict;
+    CVPixelBufferRef cv_img = NULL;
+    AVFrameSideData *side_data = NULL;
+    ExtraSEI *sei = NULL;
+    int status = create_cv_pixel_buffer(avctx, frame, &cv_img);
+
+    if (status) return status;
+
+    status = create_encoder_dict_h264(frame, &frame_dict);
+    if (status) {
+        CFRelease(cv_img);
+        return status;
+    }
+
+    side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_A53_CC);
+    if (vtctx->a53_cc && side_data && side_data->size) {
+        sei = av_mallocz(sizeof(*sei));
+        if (!sei) {
+            av_log(avctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+        } else {
+            int ret = ff_alloc_a53_sei(frame, 0, &sei->data, &sei->size);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+                av_free(sei);
+                sei = NULL;
+            }
+        }
+    }
+
+    time = CMTimeMake(frame->pts * avctx->time_base.num, avctx->time_base.den);
+    status = VTCompressionSessionEncodeFrame(
+        vtctx->session,
+        cv_img,
+        time,
+        kCMTimeInvalid,
+        frame_dict,
+        sei,
+        NULL
+    );
+
+    if (frame_dict) CFRelease(frame_dict);
+    CFRelease(cv_img);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error: cannot encode frame: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+static av_cold int vtenc_frame(
+    AVCodecContext *avctx,
+    AVPacket       *pkt,
+    const AVFrame  *frame,
+    int            *got_packet)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    bool get_frame;
+    int status;
+    CMSampleBufferRef buf = NULL;
+    ExtraSEI *sei = NULL;
+
+    if (frame) {
+        status = vtenc_send_frame(avctx, vtctx, frame);
+
+        if (status) {
+            status = AVERROR_EXTERNAL;
+            goto end_nopkt;
+        }
+
+        if (vtctx->frame_ct_in == 0) {
+            vtctx->first_pts = frame->pts;
+        } else if(vtctx->frame_ct_in == 1 && vtctx->has_b_frames) {
+            vtctx->dts_delta = frame->pts - vtctx->first_pts;
+        }
+
+        vtctx->frame_ct_in++;
+    } else if(!vtctx->flushing) {
+        vtctx->flushing = true;
+
+        status = VTCompressionSessionCompleteFrames(vtctx->session,
+                                                    kCMTimeIndefinite);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error flushing frames: %d\n", status);
+            status = AVERROR_EXTERNAL;
+            goto end_nopkt;
+        }
+    }
+
+    *got_packet = 0;
+    get_frame = vtctx->dts_delta >= 0 || !frame;
+    if (!get_frame) {
+        status = 0;
+        goto end_nopkt;
+    }
+
+    status = vtenc_q_pop(vtctx, !frame, &buf, &sei);
+    if (status) goto end_nopkt;
+    if (!buf)   goto end_nopkt;
+
+    status = vtenc_cm_to_avpacket(avctx, buf, pkt, sei);
+    if (sei) {
+        if (sei->data) av_free(sei->data);
+        av_free(sei);
+    }
+    CFRelease(buf);
+    if (status) goto end_nopkt;
+
+    *got_packet = 1;
+    return 0;
+
+end_nopkt:
+    av_packet_unref(pkt);
+    return status;
+}
+
+static int vtenc_populate_extradata(AVCodecContext   *avctx,
+                                    CMVideoCodecType codec_type,
+                                    CFStringRef      profile_level,
+                                    CFNumberRef      gamma_level,
+                                    CFDictionaryRef  enc_info,
+                                    CFDictionaryRef  pixel_buffer_info)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    AVFrame *frame = av_frame_alloc();
+    int y_size = avctx->width * avctx->height;
+    int chroma_size = (avctx->width / 2) * (avctx->height / 2);
+    CMSampleBufferRef buf = NULL;
+    int status;
+
+    if (!frame)
+        return AVERROR(ENOMEM);
+
+    frame->buf[0] = av_buffer_alloc(y_size + 2 * chroma_size);
+
+    if(!frame->buf[0]){
+        status = AVERROR(ENOMEM);
+        goto pe_cleanup;
+    }
+
+    status = vtenc_create_encoder(avctx,
+                                  codec_type,
+                                  profile_level,
+                                  gamma_level,
+                                  enc_info,
+                                  pixel_buffer_info,
+                                  &vtctx->session);
+    if (status)
+        goto pe_cleanup;
+
+    frame->data[0] = frame->buf[0]->data;
+    memset(frame->data[0],   0,      y_size);
+
+    frame->data[1] = frame->buf[0]->data + y_size;
+    memset(frame->data[1], 128, chroma_size);
+
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
+        frame->data[2] = frame->buf[0]->data + y_size + chroma_size;
+        memset(frame->data[2], 128, chroma_size);
+    }
+
+    frame->linesize[0] = avctx->width;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
+        frame->linesize[1] =
+        frame->linesize[2] = (avctx->width + 1) / 2;
+    } else {
+        frame->linesize[1] = (avctx->width + 1) / 2;
+    }
+
+    frame->format          = avctx->pix_fmt;
+    frame->width           = avctx->width;
+    frame->height          = avctx->height;
+    frame->colorspace      = avctx->colorspace;
+    frame->color_range     = avctx->color_range;
+    frame->color_trc       = avctx->color_trc;
+    frame->color_primaries = avctx->color_primaries;
+
+    frame->pts = 0;
+    status = vtenc_send_frame(avctx, vtctx, frame);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error sending frame: %d\n", status);
+        goto pe_cleanup;
+    }
+
+    //Populates extradata - output frames are flushed and param sets are available.
+    status = VTCompressionSessionCompleteFrames(vtctx->session,
+                                                kCMTimeIndefinite);
+
+    if (status)
+        goto pe_cleanup;
+
+    status = vtenc_q_pop(vtctx, 0, &buf, NULL);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "popping: %d\n", status);
+        goto pe_cleanup;
+    }
+
+    CFRelease(buf);
+
+
+
+pe_cleanup:
+    if(vtctx->session)
+        CFRelease(vtctx->session);
+
+    vtctx->session = NULL;
+    vtctx->frame_ct_out = 0;
+
+    av_frame_unref(frame);
+    av_frame_free(&frame);
+
+    av_assert0(status != 0 || (avctx->extradata && avctx->extradata_size > 0));
+
+    return status;
+}
+
+static av_cold int vtenc_close(AVCodecContext *avctx)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+
+    pthread_cond_destroy(&vtctx->cv_sample_sent);
+    pthread_mutex_destroy(&vtctx->lock);
+
+    if(!vtctx->session) return 0;
+
+    VTCompressionSessionCompleteFrames(vtctx->session,
+                                       kCMTimeIndefinite);
+    clear_frame_queue(vtctx);
+    CFRelease(vtctx->session);
+    vtctx->session = NULL;
+
+    if (vtctx->color_primaries) {
+        CFRelease(vtctx->color_primaries);
+        vtctx->color_primaries = NULL;
+    }
+
+    if (vtctx->transfer_function) {
+        CFRelease(vtctx->transfer_function);
+        vtctx->transfer_function = NULL;
+    }
+
+    if (vtctx->ycbcr_matrix) {
+        CFRelease(vtctx->ycbcr_matrix);
+        vtctx->ycbcr_matrix = NULL;
+    }
+
+    return 0;
+}
+
+static const enum AVPixelFormat pix_fmts[] = {
+    AV_PIX_FMT_VIDEOTOOLBOX,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+#define COMMON_OPTIONS \
+    { "allow_sw", "Allow software encoding", OFFSET(allow_sw), AV_OPT_TYPE_BOOL, \
+        { .i64 = 0 }, 0, 1, VE }, \
+    { "realtime", "Hint that encoding should happen in real-time if not faster (e.g. capturing from camera).", \
+        OFFSET(realtime), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
+    { "frames_before", "Other frames will come before the frames in this session. This helps smooth concatenation issues.", \
+        OFFSET(frames_before), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
+    { "frames_after", "Other frames will come after the frames in this session. This helps smooth concatenation issues.", \
+        OFFSET(frames_after), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+
+#define OFFSET(x) offsetof(VTEncContext, x)
+static const AVOption h264_options[] = {
+    { "profile", "Profile", OFFSET(profile), AV_OPT_TYPE_INT, { .i64 = H264_PROF_AUTO }, H264_PROF_AUTO, H264_PROF_COUNT, VE, "profile" },
+    { "baseline", "Baseline Profile", 0, AV_OPT_TYPE_CONST, { .i64 = H264_PROF_BASELINE }, INT_MIN, INT_MAX, VE, "profile" },
+    { "main",     "Main Profile",     0, AV_OPT_TYPE_CONST, { .i64 = H264_PROF_MAIN     }, INT_MIN, INT_MAX, VE, "profile" },
+    { "high",     "High Profile",     0, AV_OPT_TYPE_CONST, { .i64 = H264_PROF_HIGH     }, INT_MIN, INT_MAX, VE, "profile" },
+
+    { "level", "Level", OFFSET(level), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 52, VE, "level" },
+    { "1.3", "Level 1.3, only available with Baseline Profile", 0, AV_OPT_TYPE_CONST, { .i64 = 13 }, INT_MIN, INT_MAX, VE, "level" },
+    { "3.0", "Level 3.0", 0, AV_OPT_TYPE_CONST, { .i64 = 30 }, INT_MIN, INT_MAX, VE, "level" },
+    { "3.1", "Level 3.1", 0, AV_OPT_TYPE_CONST, { .i64 = 31 }, INT_MIN, INT_MAX, VE, "level" },
+    { "3.2", "Level 3.2", 0, AV_OPT_TYPE_CONST, { .i64 = 32 }, INT_MIN, INT_MAX, VE, "level" },
+    { "4.0", "Level 4.0", 0, AV_OPT_TYPE_CONST, { .i64 = 40 }, INT_MIN, INT_MAX, VE, "level" },
+    { "4.1", "Level 4.1", 0, AV_OPT_TYPE_CONST, { .i64 = 41 }, INT_MIN, INT_MAX, VE, "level" },
+    { "4.2", "Level 4.2", 0, AV_OPT_TYPE_CONST, { .i64 = 42 }, INT_MIN, INT_MAX, VE, "level" },
+    { "5.0", "Level 5.0", 0, AV_OPT_TYPE_CONST, { .i64 = 50 }, INT_MIN, INT_MAX, VE, "level" },
+    { "5.1", "Level 5.1", 0, AV_OPT_TYPE_CONST, { .i64 = 51 }, INT_MIN, INT_MAX, VE, "level" },
+    { "5.2", "Level 5.2", 0, AV_OPT_TYPE_CONST, { .i64 = 52 }, INT_MIN, INT_MAX, VE, "level" },
+
+    { "coder", "Entropy coding", OFFSET(entropy), AV_OPT_TYPE_INT, { .i64 = VT_ENTROPY_NOT_SET }, VT_ENTROPY_NOT_SET, VT_CABAC, VE, "coder" },
+    { "cavlc", "CAVLC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CAVLC }, INT_MIN, INT_MAX, VE, "coder" },
+    { "vlc",   "CAVLC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CAVLC }, INT_MIN, INT_MAX, VE, "coder" },
+    { "cabac", "CABAC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CABAC }, INT_MIN, INT_MAX, VE, "coder" },
+    { "ac",    "CABAC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CABAC }, INT_MIN, INT_MAX, VE, "coder" },
+
+    { "a53cc", "Use A53 Closed Captions (if available)", OFFSET(a53_cc), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, VE },
+
+    COMMON_OPTIONS
+    { NULL },
+};
+
+static const AVClass h264_videotoolbox_class = {
+    .class_name = "h264_videotoolbox",
+    .item_name  = av_default_item_name,
+    .option     = h264_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_videotoolbox_encoder = {
+    .name             = "h264_videotoolbox",
+    .long_name        = NULL_IF_CONFIG_SMALL("VideoToolbox H.264 Encoder"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_H264,
+    .priv_data_size   = sizeof(VTEncContext),
+    .pix_fmts         = pix_fmts,
+    .init             = vtenc_init,
+    .encode2          = vtenc_frame,
+    .close            = vtenc_close,
+    .capabilities     = AV_CODEC_CAP_DELAY,
+    .priv_class       = &h264_videotoolbox_class,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+};
+
+static const AVOption hevc_options[] = {
+    { "profile", "Profile", OFFSET(profile), AV_OPT_TYPE_INT, { .i64 = HEVC_PROF_AUTO }, HEVC_PROF_AUTO, HEVC_PROF_COUNT, VE, "profile" },
+    { "main",     "Main Profile",     0, AV_OPT_TYPE_CONST, { .i64 = HEVC_PROF_MAIN   }, INT_MIN, INT_MAX, VE, "profile" },
+    { "main10",   "Main10 Profile",   0, AV_OPT_TYPE_CONST, { .i64 = HEVC_PROF_MAIN10 }, INT_MIN, INT_MAX, VE, "profile" },
+
+    COMMON_OPTIONS
+    { NULL },
+};
+
+static const AVClass hevc_videotoolbox_class = {
+    .class_name = "hevc_videotoolbox",
+    .item_name  = av_default_item_name,
+    .option     = hevc_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_hevc_videotoolbox_encoder = {
+    .name             = "hevc_videotoolbox",
+    .long_name        = NULL_IF_CONFIG_SMALL("VideoToolbox H.265 Encoder"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_HEVC,
+    .priv_data_size   = sizeof(VTEncContext),
+    .pix_fmts         = pix_fmts,
+    .init             = vtenc_init,
+    .encode2          = vtenc_frame,
+    .close            = vtenc_close,
+    .capabilities     = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE,
+    .priv_class       = &hevc_videotoolbox_class,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+    .wrapper_name     = "videotoolbox",
+};
diff --git a/libavcodec/vima.c b/libavcodec/vima.c
index 0db1897..b4620ac 100644
--- a/libavcodec/vima.c
+++ b/libavcodec/vima.c
@@ -2,20 +2,20 @@
  * LucasArts VIMA decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 
 #include "adpcm_data.h"
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 static int predict_table_init = 0;
@@ -118,7 +118,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static int decode_frame(AVCodecContext *avctx, void *data,
                         int *got_frame_ptr, AVPacket *pkt)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     AVFrame *frame = data;
     int16_t pcm_data[2];
     uint32_t samples;
@@ -129,19 +129,19 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (pkt->size < 13)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = bitstream_init8(&bc, pkt->data, pkt->size)) < 0)
+    if ((ret = init_get_bits8(&gb, pkt->data, pkt->size)) < 0)
         return ret;
 
-    samples = bitstream_read(&bc, 32);
+    samples = get_bits_long(&gb, 32);
     if (samples == 0xffffffff) {
-        bitstream_skip(&bc, 32);
-        samples = bitstream_read(&bc, 32);
+        skip_bits_long(&gb, 32);
+        samples = get_bits_long(&gb, 32);
     }
 
     if (samples > pkt->size * 2)
         return AVERROR_INVALIDDATA;
 
-    channel_hint[0] = bitstream_read_signed(&bc, 8);
+    channel_hint[0] = get_sbits(&gb, 8);
     if (channel_hint[0] & 0x80) {
         channel_hint[0] = ~channel_hint[0];
         channels = 2;
@@ -149,10 +149,10 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     avctx->channels = channels;
     avctx->channel_layout = (channels == 2) ? AV_CH_LAYOUT_STEREO
                                             : AV_CH_LAYOUT_MONO;
-    pcm_data[0] = bitstream_read_signed(&bc, 16);
+    pcm_data[0] = get_sbits(&gb, 16);
     if (channels > 1) {
-        channel_hint[1] = bitstream_read_signed(&bc, 8);
-        pcm_data[1]     = bitstream_read_signed(&bc, 16);
+        channel_hint[1] = get_sbits(&gb, 8);
+        pcm_data[1]     = get_sbits(&gb, 16);
     }
 
     frame->nb_samples = samples;
@@ -170,7 +170,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
             step_index  = av_clip(step_index, 0, 88);
             lookup_size = size_table[step_index];
-            lookup      = bitstream_read(&bc, lookup_size);
+            lookup      = get_bits(&gb, lookup_size);
             highbit     = 1 << (lookup_size - 1);
             lowbits     = highbit - 1;
 
@@ -180,7 +180,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
                 highbit = 0;
 
             if (lookup == lowbits) {
-                output = bitstream_read_signed(&bc, 16);
+                output = get_sbits(&gb, 16);
             } else {
                 int predict_index, diff;
 
diff --git a/libavcodec/vlc.h b/libavcodec/vlc.h
index 8ac5238..42ccddf 100644
--- a/libavcodec/vlc.h
+++ b/libavcodec/vlc.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,8 +21,6 @@
 
 #include <stdint.h>
 
-#include "bitstream.h"
-
 #define VLC_TYPE int16_t
 
 typedef struct VLC {
@@ -56,84 +54,28 @@ void ff_free_vlc(VLC *vlc);
 #define INIT_VLC_LE             2
 #define INIT_VLC_USE_NEW_STATIC 4
 
-#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)       \
+#define INIT_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, h, i, j, static_size) \
     do {                                                                   \
         static VLC_TYPE table[static_size][2];                             \
         (vlc)->table           = table;                                    \
         (vlc)->table_allocated = static_size;                              \
-        init_vlc(vlc, bits, a, b, c, d, e, f, g, INIT_VLC_USE_NEW_STATIC); \
+        ff_init_vlc_sparse(vlc, bits, a, b, c, d, e, f, g, h, i, j,        \
+            INIT_VLC_USE_NEW_STATIC);                                      \
     } while (0)
 
-/* Return the LUT element for the given bitstream configuration. */
-static inline int set_idx(BitstreamContext *bc, int code, int *n, int *nb_bits,
-                          VLC_TYPE (*table)[2])
-{
-    unsigned idx;
-
-    *nb_bits = -*n;
-    idx = bitstream_peek(bc, *nb_bits) + code;
-    *n = table[idx][1];
-
-    return table[idx][0];
-}
-
-/**
- * Parse a VLC code.
- * @param bits      is the number of bits which will be read at once, must be
- *                  identical to nb_bits in init_vlc()
- * @param max_depth is the number of times bits bits must be read to completely
- *                  read the longest VLC code
- *                  = (max_vlc_length + bits - 1) / bits
- * If the VLC code is invalid and max_depth = 1, then no bits will be removed.
- * If the VLC code is invalid and max_depth > 1, then the number of bits removed
- * is undefined. */
-static inline int bitstream_read_vlc(BitstreamContext *bc, VLC_TYPE (*table)[2],
-                                     int bits, int max_depth)
-{
-    int nb_bits;
-    unsigned idx = bitstream_peek(bc, bits);
-    int code = table[idx][0];
-    int n    = table[idx][1];
-
-    if (max_depth > 1 && n < 0) {
-        skip_remaining(bc, bits);
-        code = set_idx(bc, code, &n, &nb_bits, table);
-        if (max_depth > 2 && n < 0) {
-            skip_remaining(bc, nb_bits);
-            code = set_idx(bc, code, &n, &nb_bits, table);
-        }
-    }
-    skip_remaining(bc, n);
+#define INIT_LE_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, h, i, j, static_size) \
+    do {                                                                   \
+        static VLC_TYPE table[static_size][2];                             \
+        (vlc)->table           = table;                                    \
+        (vlc)->table_allocated = static_size;                              \
+        ff_init_vlc_sparse(vlc, bits, a, b, c, d, e, f, g, h, i, j,        \
+            INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);                        \
+    } while (0)
 
-    return code;
-}
+#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)       \
+    INIT_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, NULL, 0, 0, static_size)
 
-#define BITSTREAM_RL_VLC(level, run, bc, table, bits, max_depth) \
-    do {                                                         \
-        int n, nb_bits;                                          \
-        unsigned index = bitstream_peek(bc, bits);               \
-        level = table[index].level;                              \
-        n     = table[index].len;                                \
-                                                                 \
-        if (max_depth > 1 && n < 0) {                            \
-            bitstream_skip(bc, bits);                            \
-                                                                 \
-            nb_bits = -n;                                        \
-                                                                 \
-            index = bitstream_peek(bc, nb_bits) + level;         \
-            level = table[index].level;                          \
-            n     = table[index].len;                            \
-            if (max_depth > 2 && n < 0) {                        \
-                bitstream_skip(bc, nb_bits);                     \
-                nb_bits = -n;                                    \
-                                                                 \
-                index = bitstream_peek(bc, nb_bits) + level;     \
-                level = table[index].level;                      \
-                n     = table[index].len;                        \
-            }                                                    \
-        }                                                        \
-        run = table[index].run;                                  \
-        bitstream_skip(bc, n);                                   \
-    } while (0)
+#define INIT_LE_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size) \
+    INIT_LE_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, NULL, 0, 0, static_size)
 
 #endif /* AVCODEC_VLC_H */
diff --git a/libavcodec/vmdaudio.c b/libavcodec/vmdaudio.c
index 9e02ba7..e8c8a06 100644
--- a/libavcodec/vmdaudio.c
+++ b/libavcodec/vmdaudio.c
@@ -1,20 +1,21 @@
 /*
  * Sierra VMD audio decoder
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +35,7 @@
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
@@ -74,7 +76,7 @@ static av_cold int vmdaudio_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
         return AVERROR(EINVAL);
     }
-    if (avctx->block_align < 1) {
+    if (avctx->block_align < 1 || avctx->block_align % avctx->channels) {
         av_log(avctx, AV_LOG_ERROR, "invalid block align\n");
         return AVERROR(EINVAL);
     }
@@ -180,17 +182,16 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     frame->nb_samples = ((silent_chunks + audio_chunks) * avctx->block_align) /
                         avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples_u8  =            frame->data[0];
     output_samples_s16 = (int16_t *)frame->data[0];
 
     /* decode silent chunks */
     if (silent_chunks > 0) {
-        int silent_size = FFMIN(avctx->block_align * silent_chunks,
-                                frame->nb_samples * avctx->channels);
+        int silent_size = avctx->block_align * silent_chunks;
+        av_assert0(avctx->block_align * silent_chunks <= frame->nb_samples * avctx->channels);
+
         if (s->out_bps == 2) {
             memset(output_samples_s16, 0x00, silent_size * 2);
             output_samples_s16 += silent_size;
@@ -202,8 +203,9 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, void *data,
 
     /* decode audio chunks */
     if (audio_chunks > 0) {
-        buf_end = buf + (buf_size & ~(avctx->channels > 1));
-        while (buf + s->chunk_size <= buf_end) {
+        buf_end = buf + buf_size;
+        av_assert0((buf_size & (avctx->channels > 1)) == 0);
+        while (buf_end - buf >= s->chunk_size) {
             if (s->out_bps == 2) {
                 decode_audio_s16(output_samples_s16, buf, s->chunk_size,
                                  avctx->channels);
diff --git a/libavcodec/vmdvideo.c b/libavcodec/vmdvideo.c
index 2e91c06..b97032f 100644
--- a/libavcodec/vmdvideo.c
+++ b/libavcodec/vmdvideo.c
@@ -1,20 +1,21 @@
 /*
  * Sierra VMD video decoder
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +63,7 @@ typedef struct VmdVideoContext {
 #define QUEUE_SIZE 0x1000
 #define QUEUE_MASK 0x0FFF
 
-static void lz_unpack(const unsigned char *src, int src_len,
+static int lz_unpack(const unsigned char *src, int src_len,
                       unsigned char *dest, int dest_len)
 {
     unsigned char *d;
@@ -83,9 +84,9 @@ static void lz_unpack(const unsigned char *src, int src_len,
     dataleft = bytestream2_get_le32(&gb);
     memset(queue, 0x20, QUEUE_SIZE);
     if (bytestream2_get_bytes_left(&gb) < 4)
-        return;
+        return AVERROR_INVALIDDATA;
     if (bytestream2_peek_le32(&gb) == 0x56781234) {
-        bytestream2_get_le32(&gb);
+        bytestream2_skipu(&gb, 4);
         qpos = 0x111;
         speclen = 0xF + 3;
     } else {
@@ -96,8 +97,8 @@ static void lz_unpack(const unsigned char *src, int src_len,
     while (dataleft > 0 && bytestream2_get_bytes_left(&gb) > 0) {
         tag = bytestream2_get_byteu(&gb);
         if ((tag == 0xFF) && (dataleft > 8)) {
-            if (d + 8 > d_end || bytestream2_get_bytes_left(&gb) < 8)
-                return;
+            if (d_end - d < 8 || bytestream2_get_bytes_left(&gb) < 8)
+                return AVERROR_INVALIDDATA;
             for (i = 0; i < 8; i++) {
                 queue[qpos++] = *d++ = bytestream2_get_byteu(&gb);
                 qpos &= QUEUE_MASK;
@@ -108,9 +109,9 @@ static void lz_unpack(const unsigned char *src, int src_len,
                 if (dataleft == 0)
                     break;
                 if (tag & 0x01) {
-                    if (d + 1 > d_end || bytestream2_get_bytes_left(&gb) < 1)
-                        return;
-                    queue[qpos++] = *d++ = bytestream2_get_byte(&gb);
+                    if (d_end - d < 1 || bytestream2_get_bytes_left(&gb) < 1)
+                        return AVERROR_INVALIDDATA;
+                    queue[qpos++] = *d++ = bytestream2_get_byteu(&gb);
                     qpos &= QUEUE_MASK;
                     dataleft--;
                 } else {
@@ -120,8 +121,8 @@ static void lz_unpack(const unsigned char *src, int src_len,
                     if (chainlen == speclen) {
                         chainlen = bytestream2_get_byte(&gb) + 0xF + 3;
                     }
-                    if (d + chainlen > d_end)
-                        return;
+                    if (d_end - d < chainlen)
+                        return AVERROR_INVALIDDATA;
                     for (j = 0; j < chainlen; j++) {
                         *d = queue[chainofs++ & QUEUE_MASK];
                         queue[qpos++] = *d++;
@@ -133,10 +134,10 @@ static void lz_unpack(const unsigned char *src, int src_len,
             }
         }
     }
+    return d - dest;
 }
-
 static int rle_unpack(const unsigned char *src, unsigned char *dest,
-    int src_count, int src_size, int dest_len)
+                      int src_count, int src_size, int dest_len)
 {
     unsigned char *pd;
     int i, l, used = 0;
@@ -159,12 +160,12 @@ static int rle_unpack(const unsigned char *src, unsigned char *dest,
         l = bytestream2_get_byteu(&gb);
         if (l & 0x80) {
             l = (l & 0x7F) * 2;
-            if (pd + l > dest_end || bytestream2_get_bytes_left(&gb) < l)
+            if (dest_end - pd < l || bytestream2_get_bytes_left(&gb) < l)
                 return bytestream2_tell(&gb);
-            bytestream2_get_buffer(&gb, pd, l);
+            bytestream2_get_bufferu(&gb, pd, l);
             pd += l;
         } else {
-            if (pd + l > dest_end || bytestream2_get_bytes_left(&gb) < 2)
+            if (dest_end - pd < 2*l || bytestream2_get_bytes_left(&gb) < 2)
                 return bytestream2_tell(&gb);
             run_val = bytestream2_get_ne16(&gb);
             for (i = 0; i < l; i++) {
@@ -200,6 +201,16 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
     frame_y = AV_RL16(&s->buf[8]);
     frame_width = AV_RL16(&s->buf[10]) - frame_x + 1;
     frame_height = AV_RL16(&s->buf[12]) - frame_y + 1;
+
+    if ((frame_width == s->avctx->width && frame_height == s->avctx->height) &&
+        (frame_x || frame_y)) {
+
+        s->x_off = frame_x;
+        s->y_off = frame_y;
+    }
+    frame_x -= s->x_off;
+    frame_y -= s->y_off;
+
     if (frame_x < 0 || frame_width < 0 ||
         frame_x >= s->avctx->width ||
         frame_width > s->avctx->width ||
@@ -219,15 +230,6 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     }
 
-    if ((frame_width == s->avctx->width && frame_height == s->avctx->height) &&
-        (frame_x || frame_y)) {
-
-        s->x_off = frame_x;
-        s->y_off = frame_y;
-    }
-    frame_x -= s->x_off;
-    frame_y -= s->y_off;
-
     /* if only a certain region will be updated, copy the entire previous
      * frame before the decode */
     if (s->prev_frame->data[0] &&
@@ -248,13 +250,13 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                 r = bytestream2_get_byteu(&gb) * 4;
                 g = bytestream2_get_byteu(&gb) * 4;
                 b = bytestream2_get_byteu(&gb) * 4;
-                palette32[i] = (r << 16) | (g << 8) | (b);
+                palette32[i] = 0xFFU << 24 | (r << 16) | (g << 8) | (b);
+                palette32[i] |= palette32[i] >> 6 & 0x30303;
             }
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "Incomplete palette\n");
             return AVERROR_INVALIDDATA;
         }
-        s->size -= PALETTE_COUNT * 3 + 2;
     }
 
     if (!s->size)
@@ -265,15 +267,18 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     meth = bytestream2_get_byteu(&gb);
     if (meth & 0x80) {
+        int size;
         if (!s->unpack_buffer_size) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Trying to unpack LZ-compressed frame with no LZ buffer\n");
             return AVERROR_INVALIDDATA;
         }
-        lz_unpack(gb.buffer, bytestream2_get_bytes_left(&gb),
-                  s->unpack_buffer, s->unpack_buffer_size);
+        size = lz_unpack(gb.buffer, bytestream2_get_bytes_left(&gb),
+                         s->unpack_buffer, s->unpack_buffer_size);
+        if (size < 0)
+            return size;
         meth &= 0x7F;
-        bytestream2_init(&gb, s->unpack_buffer, s->unpack_buffer_size);
+        bytestream2_init(&gb, s->unpack_buffer, size);
     }
 
     dp = &frame->data[0][frame_y * frame->linesize[0] + frame_x];
@@ -289,7 +294,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                     if (ofs + len > frame_width ||
                         bytestream2_get_bytes_left(&gb) < len)
                         return AVERROR_INVALIDDATA;
-                    bytestream2_get_buffer(&gb, &dp[ofs], len);
+                    bytestream2_get_bufferu(&gb, &dp[ofs], len);
                     ofs += len;
                 } else {
                     /* interframe pixel copy */
@@ -301,7 +306,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
             } while (ofs < frame_width);
             if (ofs > frame_width) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "VMD video: offset > width (%d > %d)\n",
+                       "offset > width (%d > %d)\n",
                        ofs, frame_width);
                 return AVERROR_INVALIDDATA;
             }
@@ -334,6 +339,9 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                         ofs += slen;
                         bytestream2_skip(&gb, len);
                     } else {
+                        if (ofs + len > frame_width ||
+                            bytestream2_get_bytes_left(&gb) < len)
+                            return AVERROR_INVALIDDATA;
                         bytestream2_get_buffer(&gb, &dp[ofs], len);
                         ofs += len;
                     }
@@ -347,7 +355,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
             } while (ofs < frame_width);
             if (ofs > frame_width) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "VMD video: offset > width (%d > %d)\n",
+                       "offset > width (%d > %d)\n",
                        ofs, frame_width);
                 return AVERROR_INVALIDDATA;
             }
@@ -364,7 +372,8 @@ static av_cold int vmdvideo_decode_end(AVCodecContext *avctx)
     VmdVideoContext *s = avctx->priv_data;
 
     av_frame_free(&s->prev_frame);
-    av_free(s->unpack_buffer);
+    av_freep(&s->unpack_buffer);
+    s->unpack_buffer_size = 0;
 
     return 0;
 }
@@ -384,9 +393,9 @@ static av_cold int vmdvideo_decode_init(AVCodecContext *avctx)
 
     /* make sure the VMD header made it */
     if (s->avctx->extradata_size != VMD_HEADER_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR, "VMD video: expected extradata size of %d\n",
+        av_log(s->avctx, AV_LOG_ERROR, "expected extradata size of %d\n",
             VMD_HEADER_SIZE);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     vmd_header = (unsigned char *)avctx->extradata;
 
@@ -404,7 +413,8 @@ static av_cold int vmdvideo_decode_init(AVCodecContext *avctx)
         r = raw_palette[palette_index++] * 4;
         g = raw_palette[palette_index++] * 4;
         b = raw_palette[palette_index++] * 4;
-        palette32[i] = (r << 16) | (g << 8) | (b);
+        palette32[i] = 0xFFU << 24 | (r << 16) | (g << 8) | (b);
+        palette32[i] |= palette32[i] >> 6 & 0x30303;
     }
 
     s->prev_frame = av_frame_alloc();
@@ -432,10 +442,8 @@ static int vmdvideo_decode_frame(AVCodecContext *avctx,
     if (buf_size < 16)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "VMD Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if ((ret = vmd_decode(s, frame)) < 0)
         return ret;
diff --git a/libavcodec/vmnc.c b/libavcodec/vmnc.c
index 7a01f1e..30b1414 100644
--- a/libavcodec/vmnc.c
+++ b/libavcodec/vmnc.c
@@ -2,20 +2,20 @@
  * VMware Screen Codec (VMnc) decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -298,8 +298,8 @@ static int decode_hextile(VmncContext *c, uint8_t* dst, GetByteContext *gb,
                     rect_w = (wh >> 4) + 1;
                     rect_h = (wh & 0xF) + 1;
 
-                    if (rect_x + rect_w > bw || rect_y + rect_h > bh) {
-                        av_log(c->avctx, AV_LOG_ERROR, "Invalid subrect\n");
+                    if (rect_x + rect_w > w - i || rect_y + rect_h > h - j) {
+                        av_log(c->avctx, AV_LOG_ERROR, "Rectangle outside picture\n");
                         return AVERROR_INVALIDDATA;
                     }
 
@@ -319,6 +319,8 @@ static void reset_buffers(VmncContext *c)
     av_freep(&c->curmask);
     av_freep(&c->screendta);
     c->cur_w = c->cur_h = 0;
+    c->cur_hx = c->cur_hy = 0;
+
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
@@ -331,10 +333,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     uint8_t *outptr;
     int dx, dy, w, h, depth, enc, chunks, res, size_left, ret;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
 
     bytestream2_init(gb, buf, buf_size);
 
@@ -372,15 +372,29 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     bytestream2_skip(gb, 2);
     chunks = bytestream2_get_be16(gb);
     while (chunks--) {
+        if (bytestream2_get_bytes_left(gb) < 12) {
+            av_log(avctx, AV_LOG_ERROR, "Premature end of data!\n");
+            return -1;
+        }
         dx  = bytestream2_get_be16(gb);
         dy  = bytestream2_get_be16(gb);
         w   = bytestream2_get_be16(gb);
         h   = bytestream2_get_be16(gb);
         enc = bytestream2_get_be32(gb);
+        if ((dx + w > c->width) || (dy + h > c->height)) {
+            av_log(avctx, AV_LOG_ERROR,
+                    "Incorrect frame size: %ix%i+%ix%i of %ix%i\n",
+                    w, h, dx, dy, c->width, c->height);
+            return AVERROR_INVALIDDATA;
+        }
         outptr = c->pic->data[0] + dx * c->bpp2 + dy * c->pic->linesize[0];
         size_left = bytestream2_get_bytes_left(gb);
         switch (enc) {
         case MAGIC_WMVd: // cursor
+            if (w*(int64_t)h*c->bpp2 > INT_MAX/2 - 2) {
+                av_log(avctx, AV_LOG_ERROR, "dimensions too large\n");
+                return AVERROR_INVALIDDATA;
+            }
             if (size_left < 2 + w * h * c->bpp2 * 2) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Premature end of data! (need %i got %i)\n",
@@ -431,18 +445,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             c->pic->pict_type = AV_PICTURE_TYPE_I;
             depth = bytestream2_get_byte(gb);
             if (depth != c->bpp) {
-                av_log(avctx, AV_LOG_WARNING, "Depth mismatch. "
-                       "Container %i bpp / Codec %i bpp\n", c->bpp, depth);
-
-                if (depth != 8 && depth != 16 && depth != 32) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Unsupported codec bitdepth %i\n", depth);
-                    return AVERROR_INVALIDDATA;
-                }
-
-                /* reset values */
-                c->bpp  = depth;
-                c->bpp2 = c->bpp / 8;
+                av_log(avctx, AV_LOG_INFO,
+                       "Depth mismatch. Container %i bpp, "
+                       "Frame data: %i bpp\n",
+                       c->bpp, depth);
             }
             bytestream2_skip(gb, 1);
             c->bigendian = bytestream2_get_byte(gb);
@@ -458,12 +464,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             bytestream2_skip(gb, 2);
             break;
         case 0x00000000: // raw rectangle data
-            if ((dx + w > c->width) || (dy + h > c->height)) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Incorrect frame size: %ix%i+%ix%i of %ix%i\n",
-                       w, h, dx, dy, c->width, c->height);
-                return AVERROR_INVALIDDATA;
-            }
             if (size_left < w * h * c->bpp2) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Premature end of data! (need %i got %i)\n",
@@ -474,12 +474,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                       c->pic->linesize[0]);
             break;
         case 0x00000005: // HexTile encoded rectangle
-            if ((dx + w > c->width) || (dy + h > c->height)) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Incorrect frame size: %ix%i+%ix%i of %ix%i\n",
-                       w, h, dx, dy, c->width, c->height);
-                return AVERROR_INVALIDDATA;
-            }
             res = decode_hextile(c, outptr, gb, w, h, c->pic->linesize[0]);
             if (res < 0)
                 return res;
@@ -535,7 +529,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->width  = avctx->width;
     c->height = avctx->height;
     c->bpp    = avctx->bits_per_coded_sample;
-    c->bpp2   = c->bpp / 8;
 
     switch (c->bpp) {
     case 8:
@@ -546,14 +539,16 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case 24:
         /* 24 bits is not technically supported, but some clients might
-         * mistakenly set it -- delay the actual check until decode_frame() */
+         * mistakenly set it, so let's assume they actually meant 32 bits */
+        c->bpp = 32;
     case 32:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_0RGB32;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unsupported bitdepth %i\n", c->bpp);
         return AVERROR_INVALIDDATA;
     }
+    c->bpp2 = c->bpp / 8;
 
     c->pic = av_frame_alloc();
     if (!c->pic)
@@ -568,9 +563,9 @@ static av_cold int decode_end(AVCodecContext *avctx)
 
     av_frame_free(&c->pic);
 
-    av_free(c->curbits);
-    av_free(c->curmask);
-    av_free(c->screendta);
+    av_freep(&c->curbits);
+    av_freep(&c->curmask);
+    av_freep(&c->screendta);
     return 0;
 }
 
diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c
index 2c54320..cca2aa7 100644
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -1,18 +1,22 @@
-/*
- * This file is part of Libav.
+/**
+ * @file
+ * Common code for Vorbis I encoder and decoder
+ * @author Denes Balatoni  ( dbalatoni programozo hu )
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,22 +58,27 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
     uint32_t exit_at_level[33] = { 404 };
     unsigned i, j, p, code;
 
-    for (p = 0; (bits[p] == 0) && (p < num); ++p)
+    for (p = 0; (p < num) && (bits[p] == 0); ++p)
         ;
     if (p == num)
         return 0;
 
     codes[p] = 0;
     if (bits[p] > 32)
-        return 1;
+        return AVERROR_INVALIDDATA;
     for (i = 0; i < bits[p]; ++i)
-        exit_at_level[i+1] = 1 << i;
+        exit_at_level[i+1] = 1u << i;
 
     ++p;
 
+    for (i = p; (i < num) && (bits[i] == 0); ++i)
+        ;
+    if (i == num)
+        return 0;
+
     for (; p < num; ++p) {
         if (bits[p] > 32)
-             return 1;
+             return AVERROR_INVALIDDATA;
         if (bits[p] == 0)
              continue;
         // find corresponding exit(node which the tree can grow further from)
@@ -77,19 +86,19 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
             if (exit_at_level[i])
                 break;
         if (!i) // overspecified tree
-             return 1;
+             return AVERROR_INVALIDDATA;
         code = exit_at_level[i];
         exit_at_level[i] = 0;
         // construct code (append 0s to end) and introduce new exits
         for (j = i + 1 ;j <= bits[p]; ++j)
-            exit_at_level[j] = code + (1 << (j - 1));
+            exit_at_level[j] = code + (1u << (j - 1));
         codes[p] = code;
     }
 
     //no exits should be left (underspecified tree - ie. unused valid vlcs - not allowed by SPEC)
     for (p = 1; p < 33; p++)
         if (exit_at_level[p])
-            return 1;
+            return AVERROR_INVALIDDATA;
 
     return 0;
 }
diff --git a/libavcodec/vorbis.h b/libavcodec/vorbis.h
index 5ae20ac..98dd14f 100644
--- a/libavcodec/vorbis.h
+++ b/libavcodec/vorbis.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbis_data.c b/libavcodec/vorbis_data.c
index bafb77b..063a075 100644
--- a/libavcodec/vorbis_data.c
+++ b/libavcodec/vorbis_data.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2005 Denes Balatoni ( dbalatoni programozo hu )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbis_enc_data.h b/libavcodec/vorbis_enc_data.h
index a1e743e..a51aaec 100644
--- a/libavcodec/vorbis_enc_data.h
+++ b/libavcodec/vorbis_enc_data.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -492,13 +492,13 @@ static const struct {
     int dim;
     int subclass;
     int masterbook;
-    const int *nbooks;
+    const int nbooks[4];
 } floor_classes[] = {
-    { 3, 0, 0, (const int[]){  4             } },
-    { 4, 1, 0, (const int[]){  5,  6         } },
-    { 3, 1, 1, (const int[]){  7,  8         } },
-    { 4, 2, 2, (const int[]){ -1,  9, 10, 11 } },
-    { 3, 2, 3, (const int[]){ -1, 12, 13, 14 } },
+    { 3, 0, 0, {  4             } },
+    { 4, 1, 0, {  5,  6         } },
+    { 3, 1, 1, {  7,  8         } },
+    { 4, 2, 2, { -1,  9, 10, 11 } },
+    { 3, 2, 3, { -1, 12, 13, 14 } },
 };
 
 #endif /* AVCODEC_VORBIS_ENC_DATA_H */
diff --git a/libavcodec/vorbis_parser.c b/libavcodec/vorbis_parser.c
index 054635d..0b2c97c 100644
--- a/libavcodec/vorbis_parser.c
+++ b/libavcodec/vorbis_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -173,7 +173,7 @@ static int parse_setup_header(AVVorbisParseContext *s,
     skip_bits_long(&gb, got_framing_bit);
     for (i = mode_count - 1; i >= 0; i--) {
         skip_bits_long(&gb, 40);
-        s->mode_blocksize[i] = s->blocksize[get_bits1(&gb)];
+        s->mode_blocksize[i] = get_bits1(&gb);
     }
 
 bad_header:
@@ -184,7 +184,7 @@ bad_header:
 static int vorbis_parse_init(AVVorbisParseContext *s,
                              const uint8_t *extradata, int extradata_size)
 {
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     int ret;
 
@@ -205,13 +205,13 @@ static int vorbis_parse_init(AVVorbisParseContext *s,
         return ret;
 
     s->valid_extradata = 1;
-    s->previous_blocksize = s->mode_blocksize[0];
+    s->previous_blocksize = s->blocksize[s->mode_blocksize[0]];
 
     return 0;
 }
 
-int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                          int buf_size)
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags)
 {
     int duration = 0;
 
@@ -220,6 +220,24 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
         int previous_blocksize = s->previous_blocksize;
 
         if (buf[0] & 1) {
+            /* If the user doesn't care about special packets, it's a bad one. */
+            if (!flags)
+                goto bad_packet;
+
+            /* Set the flag for which kind of special packet it is. */
+            if (buf[0] == 1)
+                *flags |= VORBIS_FLAG_HEADER;
+            else if (buf[0] == 3)
+                *flags |= VORBIS_FLAG_COMMENT;
+            else if (buf[0] == 5)
+                *flags |= VORBIS_FLAG_SETUP;
+            else
+                goto bad_packet;
+
+            /* Special packets have no duration. */
+            return 0;
+
+bad_packet:
             av_log(s, AV_LOG_ERROR, "Invalid packet\n");
             return AVERROR_INVALIDDATA;
         }
@@ -231,11 +249,11 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
             av_log(s, AV_LOG_ERROR, "Invalid mode in packet\n");
             return AVERROR_INVALIDDATA;
         }
-        if (mode) {
+        if(s->mode_blocksize[mode]){
             int flag = !!(buf[0] & s->prev_mask);
             previous_blocksize = s->blocksize[flag];
         }
-        current_blocksize     = s->mode_blocksize[mode];
+        current_blocksize     = s->blocksize[s->mode_blocksize[mode]];
         duration              = (previous_blocksize + current_blocksize) >> 2;
         s->previous_blocksize = current_blocksize;
     }
@@ -243,10 +261,16 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
     return duration;
 }
 
+int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
+                          int buf_size)
+{
+    return av_vorbis_parse_frame_flags(s, buf, buf_size, NULL);
+}
+
 void av_vorbis_parse_reset(AVVorbisParseContext *s)
 {
     if (s->valid_extradata)
-        s->previous_blocksize = s->mode_blocksize[0];
+        s->previous_blocksize = s->blocksize[0];
 }
 
 void av_vorbis_parse_free(AVVorbisParseContext **s)
@@ -272,22 +296,6 @@ AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
     return s;
 }
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-int avpriv_vorbis_parse_extradata(AVCodecContext *avctx, AVVorbisParseContext *s)
-{
-    return vorbis_parse_init(s, avctx->extradata, avctx->extradata_size);
-}
-void avpriv_vorbis_parse_reset(AVVorbisParseContext *s)
-{
-    av_vorbis_parse_reset(s);
-}
-int avpriv_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                              int buf_size)
-{
-    return av_vorbis_parse_frame(s, buf, buf_size);
-}
-#endif
-
 #if CONFIG_VORBIS_PARSER
 
 typedef struct VorbisParseContext {
diff --git a/libavcodec/vorbis_parser.h b/libavcodec/vorbis_parser.h
index 88d4d59..789932a 100644
--- a/libavcodec/vorbis_parser.h
+++ b/libavcodec/vorbis_parser.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,8 @@
  * Determines the duration for each packet.
  */
 
-#ifndef AVCODEC_VORBIS_PARSE_H
-#define AVCODEC_VORBIS_PARSE_H
+#ifndef AVCODEC_VORBIS_PARSER_H
+#define AVCODEC_VORBIS_PARSER_H
 
 #include <stdint.h>
 
@@ -41,6 +41,24 @@ AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
  */
 void av_vorbis_parse_free(AVVorbisParseContext **s);
 
+#define VORBIS_FLAG_HEADER  0x00000001
+#define VORBIS_FLAG_COMMENT 0x00000002
+#define VORBIS_FLAG_SETUP   0x00000004
+
+/**
+ * Get the duration for a Vorbis packet.
+ *
+ * If @p flags is @c NULL,
+ * special frames are considered invalid.
+ *
+ * @param s        Vorbis parser context
+ * @param buf      buffer containing a Vorbis frame
+ * @param buf_size size of the buffer
+ * @param flags    flags for special frames
+ */
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags);
+
 /**
  * Get the duration for a Vorbis packet.
  *
@@ -53,4 +71,4 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
 
 void av_vorbis_parse_reset(AVVorbisParseContext *s);
 
-#endif /* AVCODEC_VORBIS_PARSE_H */
+#endif /* AVCODEC_VORBIS_PARSER_H */
diff --git a/libavcodec/vorbis_parser_internal.h b/libavcodec/vorbis_parser_internal.h
index 8f76af7..691a842 100644
--- a/libavcodec/vorbis_parser_internal.h
+++ b/libavcodec/vorbis_parser_internal.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,8 @@
  * Determines the duration for each packet.
  */
 
-#ifndef AVCODEC_VORBIS_PARSER_H
-#define AVCODEC_VORBIS_PARSER_H
+#ifndef AVCODEC_VORBIS_PARSER_INTERNAL_H
+#define AVCODEC_VORBIS_PARSER_INTERNAL_H
 
 #include "avcodec.h"
 #include "vorbis_parser.h"
@@ -43,29 +43,4 @@ struct AVVorbisParseContext {
     int prev_mask;              ///< bitmask used to get the previous mode flag in each packet
 };
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-/**
- * Initialize the Vorbis parser using headers in the extradata.
- *
- * @param avctx codec context
- * @param s     Vorbis parser context
- */
-int avpriv_vorbis_parse_extradata(AVCodecContext *avctx, AVVorbisParseContext *s);
-
-/**
- * Get the duration for a Vorbis packet.
- *
- * avpriv_vorbis_parse_extradata() must have been successfully called prior to
- * this in order for a correct duration to be returned.
- *
- * @param s        Vorbis parser context
- * @param buf      buffer containing a Vorbis frame
- * @param buf_size size of the buffer
- */
-int avpriv_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                              int buf_size);
-
-void avpriv_vorbis_parse_reset(AVVorbisParseContext *s);
-#endif
-
-#endif /* AVCODEC_VORBIS_PARSER_H */
+#endif /* AVCODEC_VORBIS_PARSER_INTERNAL_H */
diff --git a/libavcodec/vorbisdec.c b/libavcodec/vorbisdec.c
index cc0f6f4..00e9cd8 100644
--- a/libavcodec/vorbisdec.c
+++ b/libavcodec/vorbisdec.c
@@ -1,18 +1,22 @@
-/*
- * This file is part of Libav.
+/**
+ * @file
+ * Vorbis I decoder
+ * @author Denes Balatoni  ( dbalatoni programozo hu )
+ *
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +29,7 @@
 #include <inttypes.h>
 #include <math.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/float_dsp.h"
 
 #define BITSTREAM_READER_LE
@@ -32,7 +37,6 @@
 #include "fft.h"
 #include "get_bits.h"
 #include "internal.h"
-#include "vlc.h"
 #include "vorbis.h"
 #include "vorbisdsp.h"
 #include "xiph.h"
@@ -122,7 +126,7 @@ typedef struct vorbis_context_s {
     AVCodecContext *avctx;
     GetBitContext gb;
     VorbisDSPContext dsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
     FFTContext mdct[2];
     uint8_t       first_frame;
@@ -145,7 +149,7 @@ typedef struct vorbis_context_s {
     uint8_t       mode_count;
     vorbis_mode  *modes;
     uint8_t       mode_number; // mode number for the current packet
-    uint8_t       previous_window;
+    int8_t       previous_window;
     float        *channel_residues;
     float        *saved;
 } vorbis_context;
@@ -187,38 +191,43 @@ static void vorbis_free(vorbis_context *vc)
 
     av_freep(&vc->channel_residues);
     av_freep(&vc->saved);
+    av_freep(&vc->fdsp);
 
-    for (i = 0; i < vc->residue_count; i++)
-        av_free(vc->residues[i].classifs);
+    if (vc->residues)
+        for (i = 0; i < vc->residue_count; i++)
+            av_freep(&vc->residues[i].classifs);
     av_freep(&vc->residues);
     av_freep(&vc->modes);
 
     ff_mdct_end(&vc->mdct[0]);
     ff_mdct_end(&vc->mdct[1]);
 
-    for (i = 0; i < vc->codebook_count; ++i) {
-        av_free(vc->codebooks[i].codevectors);
-        ff_free_vlc(&vc->codebooks[i].vlc);
-    }
+    if (vc->codebooks)
+        for (i = 0; i < vc->codebook_count; ++i) {
+            av_freep(&vc->codebooks[i].codevectors);
+            ff_free_vlc(&vc->codebooks[i].vlc);
+        }
     av_freep(&vc->codebooks);
 
-    for (i = 0; i < vc->floor_count; ++i) {
-        if (vc->floors[i].floor_type == 0) {
-            av_free(vc->floors[i].data.t0.map[0]);
-            av_free(vc->floors[i].data.t0.map[1]);
-            av_free(vc->floors[i].data.t0.book_list);
-            av_free(vc->floors[i].data.t0.lsp);
-        } else {
-            av_free(vc->floors[i].data.t1.list);
+    if (vc->floors)
+        for (i = 0; i < vc->floor_count; ++i) {
+            if (vc->floors[i].floor_type == 0) {
+                av_freep(&vc->floors[i].data.t0.map[0]);
+                av_freep(&vc->floors[i].data.t0.map[1]);
+                av_freep(&vc->floors[i].data.t0.book_list);
+                av_freep(&vc->floors[i].data.t0.lsp);
+            } else {
+                av_freep(&vc->floors[i].data.t1.list);
+            }
         }
-    }
     av_freep(&vc->floors);
 
-    for (i = 0; i < vc->mapping_count; ++i) {
-        av_free(vc->mappings[i].magnitude);
-        av_free(vc->mappings[i].angle);
-        av_free(vc->mappings[i].mux);
-    }
+    if (vc->mappings)
+        for (i = 0; i < vc->mapping_count; ++i) {
+            av_freep(&vc->mappings[i].magnitude);
+            av_freep(&vc->mappings[i].angle);
+            av_freep(&vc->mappings[i].mux);
+        }
     av_freep(&vc->mappings);
 }
 
@@ -370,10 +379,12 @@ static int vorbis_parse_setup_hdr_codebooks(vorbis_context *vc)
 // Weed out unused vlcs and build codevector vector
             if (used_entries) {
                 codebook_setup->codevectors =
-                    av_mallocz(used_entries * codebook_setup->dimensions *
+                    av_mallocz_array(used_entries, codebook_setup->dimensions *
                                sizeof(*codebook_setup->codevectors));
-                if (!codebook_setup->codevectors)
-                    return AVERROR(ENOMEM);
+                if (!codebook_setup->codevectors) {
+                    ret = AVERROR(ENOMEM);
+                    goto error;
+                }
             } else
                 codebook_setup->codevectors = NULL;
 
@@ -556,12 +567,17 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc)
             for (j = 0; j < floor_setup->data.t1.partitions; ++j)
                 floor_setup->data.t1.x_list_dim+=floor_setup->data.t1.class_dimensions[floor_setup->data.t1.partition_class[j]];
 
-            floor_setup->data.t1.list = av_mallocz(floor_setup->data.t1.x_list_dim *
+            floor_setup->data.t1.list = av_mallocz_array(floor_setup->data.t1.x_list_dim,
                                                    sizeof(*floor_setup->data.t1.list));
             if (!floor_setup->data.t1.list)
                 return AVERROR(ENOMEM);
 
             rangebits = get_bits(gb, 4);
+            if (!rangebits && floor_setup->data.t1.partitions) {
+                av_log(vc->avctx, AV_LOG_ERROR,
+                       "A rangebits value of 0 is not compliant with the Vorbis I specification.\n");
+                return AVERROR_INVALIDDATA;
+            }
             rangemax = (1 << rangebits);
             if (rangemax > vc->blocksize[1] / 2) {
                 av_log(vc->avctx, AV_LOG_ERROR,
@@ -635,8 +651,8 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc)
             /* codebook dim is for padding if codebook dim doesn't *
              * divide order+1 then we need to read more data       */
             floor_setup->data.t0.lsp =
-                av_malloc((floor_setup->data.t0.order + 1 + max_codebook_dim)
-                          * sizeof(*floor_setup->data.t0.lsp));
+                av_malloc_array((floor_setup->data.t0.order + 1 + max_codebook_dim),
+                                sizeof(*floor_setup->data.t0.lsp));
             if (!floor_setup->data.t0.lsp)
                 return AVERROR(ENOMEM);
 
@@ -696,8 +712,7 @@ static int vorbis_parse_setup_hdr_residues(vorbis_context *vc)
         res_setup->partition_size = get_bits(gb, 24) + 1;
         /* Validations to prevent a buffer overflow later. */
         if (res_setup->begin>res_setup->end ||
-            res_setup->end > (res_setup->type == 2 ? vc->avctx->channels : 1) * vc->blocksize[1] / 2 ||
-            (res_setup->end-res_setup->begin) / res_setup->partition_size > V_MAX_PARTITIONS) {
+            (res_setup->end-res_setup->begin) / res_setup->partition_size > FFMIN(V_MAX_PARTITIONS, 65535)) {
             av_log(vc->avctx, AV_LOG_ERROR,
                    "partition out of bounds: type, begin, end, size, blocksize: %"PRIu16", %"PRIu32", %"PRIu32", %u, %"PRIu32"\n",
                    res_setup->type, res_setup->begin, res_setup->end,
@@ -710,7 +725,7 @@ static int vorbis_parse_setup_hdr_residues(vorbis_context *vc)
 
         res_setup->ptns_to_read =
             (res_setup->end - res_setup->begin) / res_setup->partition_size;
-        res_setup->classifs = av_malloc(res_setup->ptns_to_read *
+        res_setup->classifs = av_malloc_array(res_setup->ptns_to_read,
                                         vc->audio_channels *
                                         sizeof(*res_setup->classifs));
         if (!res_setup->classifs)
@@ -779,6 +794,11 @@ static int vorbis_parse_setup_hdr_mappings(vorbis_context *vc)
 
         if (get_bits1(gb)) {
             mapping_setup->coupling_steps = get_bits(gb, 8) + 1;
+            if (vc->audio_channels < 2) {
+                av_log(vc->avctx, AV_LOG_ERROR,
+                       "Square polar channel mapping with less than two channels is not compliant with the Vorbis I specification.\n");
+                return AVERROR_INVALIDDATA;
+            }
             mapping_setup->magnitude      = av_mallocz(mapping_setup->coupling_steps *
                                                        sizeof(*mapping_setup->magnitude));
             mapping_setup->angle          = av_mallocz(mapping_setup->coupling_steps *
@@ -803,7 +823,7 @@ static int vorbis_parse_setup_hdr_mappings(vorbis_context *vc)
         }
 
         if (mapping_setup->submaps>1) {
-            mapping_setup->mux = av_mallocz(vc->audio_channels *
+            mapping_setup->mux = av_mallocz_array(vc->audio_channels,
                                             sizeof(*mapping_setup->mux));
             if (!mapping_setup->mux)
                 return AVERROR(ENOMEM);
@@ -838,7 +858,7 @@ static int create_map(vorbis_context *vc, unsigned floor_number)
     for (blockflag = 0; blockflag < 2; ++blockflag) {
         n = vc->blocksize[blockflag] / 2;
         floors[floor_number].data.t0.map[blockflag] =
-            av_malloc((n + 1) * sizeof(int32_t)); // n + sentinel
+            av_malloc_array(n + 1, sizeof(int32_t)); // n + sentinel
         if (!floors[floor_number].data.t0.map[blockflag])
             return AVERROR(ENOMEM);
 
@@ -965,12 +985,12 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
     vc->bitrate_minimum = get_bits_long(gb, 32);
     bl0 = get_bits(gb, 4);
     bl1 = get_bits(gb, 4);
-    vc->blocksize[0] = (1 << bl0);
-    vc->blocksize[1] = (1 << bl1);
     if (bl0 > 13 || bl0 < 6 || bl1 > 13 || bl1 < 6 || bl1 < bl0) {
         av_log(vc->avctx, AV_LOG_ERROR, " Vorbis id header packet corrupt (illegal blocksize). \n");
         return AVERROR_INVALIDDATA;
     }
+    vc->blocksize[0] = (1 << bl0);
+    vc->blocksize[1] = (1 << bl1);
     vc->win[0] = ff_vorbis_vwin[bl0 - 6];
     vc->win[1] = ff_vorbis_vwin[bl1 - 6];
 
@@ -979,15 +999,18 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
         return AVERROR_INVALIDDATA;
     }
 
-    vc->channel_residues =  av_malloc((vc->blocksize[1]  / 2) * vc->audio_channels * sizeof(*vc->channel_residues));
-    vc->saved            =  av_mallocz((vc->blocksize[1] / 4) * vc->audio_channels * sizeof(*vc->saved));
+    vc->channel_residues =  av_malloc_array(vc->blocksize[1]  / 2, vc->audio_channels * sizeof(*vc->channel_residues));
+    vc->saved            =  av_mallocz_array(vc->blocksize[1] / 4, vc->audio_channels * sizeof(*vc->saved));
     if (!vc->channel_residues || !vc->saved)
         return AVERROR(ENOMEM);
 
-    vc->previous_window  = 0;
+    vc->previous_window  = -1;
 
     ff_mdct_init(&vc->mdct[0], bl0, 1, -1.0);
     ff_mdct_init(&vc->mdct[1], bl1, 1, -1.0);
+    vc->fdsp = avpriv_float_dsp_alloc(vc->avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!vc->fdsp)
+        return AVERROR(ENOMEM);
 
     ff_dlog(NULL, " vorbis version %"PRIu32" \n audio_channels %"PRIu8" \n audio_samplerate %"PRIu32" \n bitrate_max %"PRIu32" \n bitrate_nom %"PRIu32" \n bitrate_min %"PRIu32" \n blk_0 %"PRIu32" blk_1 %"PRIu32" \n ",
             vc->version, vc->audio_channels, vc->audio_samplerate, vc->bitrate_maximum, vc->bitrate_nominal, vc->bitrate_minimum, vc->blocksize[0], vc->blocksize[1]);
@@ -1009,14 +1032,13 @@ static av_cold int vorbis_decode_init(AVCodecContext *avctx)
     vorbis_context *vc = avctx->priv_data;
     uint8_t *headers   = avctx->extradata;
     int headers_len    = avctx->extradata_size;
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     GetBitContext *gb = &vc->gb;
     int hdr_type, ret;
 
     vc->avctx = avctx;
     ff_vorbisdsp_init(&vc->dsp);
-    avpriv_float_dsp_init(&vc->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
@@ -1188,7 +1210,7 @@ static int vorbis_floor1_decode(vorbis_context *vc,
     uint16_t floor1_Y[258];
     uint16_t floor1_Y_final[258];
     int floor1_flag[258];
-    unsigned class, cdim, cbits, csub, cval, offset, i, j;
+    unsigned partition_class, cdim, cbits, csub, cval, offset, i, j;
     int book, adx, ady, dy, off, predicted, err;
 
 
@@ -1204,28 +1226,31 @@ static int vorbis_floor1_decode(vorbis_context *vc,
 
     offset = 2;
     for (i = 0; i < vf->partitions; ++i) {
-        class = vf->partition_class[i];
-        cdim   = vf->class_dimensions[class];
-        cbits  = vf->class_subclasses[class];
+        partition_class = vf->partition_class[i];
+        cdim   = vf->class_dimensions[partition_class];
+        cbits  = vf->class_subclasses[partition_class];
         csub = (1 << cbits) - 1;
         cval = 0;
 
         ff_dlog(NULL, "Cbits %u\n", cbits);
 
         if (cbits) // this reads all subclasses for this partition's class
-            cval = get_vlc2(gb, vc->codebooks[vf->class_masterbook[class]].vlc.table,
-                            vc->codebooks[vf->class_masterbook[class]].nb_bits, 3);
+            cval = get_vlc2(gb, vc->codebooks[vf->class_masterbook[partition_class]].vlc.table,
+                            vc->codebooks[vf->class_masterbook[partition_class]].nb_bits, 3);
 
         for (j = 0; j < cdim; ++j) {
-            book = vf->subclass_books[class][cval & csub];
+            book = vf->subclass_books[partition_class][cval & csub];
 
             ff_dlog(NULL, "book %d Cbits %u cval %u  bits:%d\n",
                     book, cbits, cval, get_bits_count(gb));
 
             cval = cval >> cbits;
             if (book > -1) {
-                floor1_Y[offset+j] = get_vlc2(gb, vc->codebooks[book].vlc.table,
-                vc->codebooks[book].nb_bits, 3);
+                int v = get_vlc2(gb, vc->codebooks[book].vlc.table,
+                                 vc->codebooks[book].nb_bits, 3);
+                if (v < 0)
+                    return AVERROR_INVALIDDATA;
+                floor1_Y[offset+j] = v;
             } else {
                 floor1_Y[offset+j] = 0;
             }
@@ -1306,7 +1331,9 @@ static av_always_inline int setup_classifs(vorbis_context *vc,
                                            vorbis_residue *vr,
                                            uint8_t *do_not_decode,
                                            unsigned ch_used,
-                                           int partition_count)
+                                           int partition_count,
+                                           int ptns_to_read
+                                          )
 {
     vorbis_codebook *codebook = vc->codebooks + vr->classbook;
     int p, j, i;
@@ -1320,21 +1347,25 @@ static av_always_inline int setup_classifs(vorbis_context *vc,
 
             ff_dlog(NULL, "Classword: %u\n", temp);
 
+            av_assert0(temp < 65536);
+
             if (temp < 0) {
                 av_log(vc->avctx, AV_LOG_ERROR,
                        "Invalid vlc code decoding %d channel.", j);
                 return AVERROR_INVALIDDATA;
             }
 
+            av_assert0(vr->classifications > 1); //needed for inverse[]
+
             for (i = partition_count + c_p_c - 1; i >= partition_count; i--) {
                 temp2 = (((uint64_t)temp) * inverse_class) >> 32;
 
-                if (i < vr->ptns_to_read)
+                if (i < ptns_to_read)
                     vr->classifs[p + i] = temp - temp2 * vr->classifications;
                 temp = temp2;
             }
         }
-        p += vr->ptns_to_read;
+        p += ptns_to_read;
     }
     return 0;
 }
@@ -1355,6 +1386,7 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
     unsigned pass, ch_used, i, j, k, l;
     unsigned max_output = (ch - 1) * vlen;
     int ptns_to_read = vr->ptns_to_read;
+    int libvorbis_bug = 0;
 
     if (vr_type == 2) {
         for (j = 1; j < ch; ++j)
@@ -1369,8 +1401,13 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
     }
 
     if (max_output > ch_left * vlen) {
-        av_log(vc->avctx, AV_LOG_ERROR, "Insufficient output buffer\n");
-        return AVERROR_INVALIDDATA;
+        if (max_output <= ch_left * vlen + vr->partition_size*ch_used/ch) {
+            ptns_to_read--;
+            libvorbis_bug = 1;
+        } else {
+            av_log(vc->avctx, AV_LOG_ERROR, "Insufficient output buffer\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     ff_dlog(NULL, " residue type 0/1/2 decode begin, ch: %d  cpc %d  \n", ch, c_p_c);
@@ -1381,7 +1418,7 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
         voffset = vr->begin;
         for (partition_count = 0; partition_count < ptns_to_read;) {  // SPEC        error
             if (!pass) {
-                int ret = setup_classifs(vc, vr, do_not_decode, ch_used, partition_count);
+                int ret = setup_classifs(vc, vr, do_not_decode, ch_used, partition_count, ptns_to_read);
                 if (ret < 0)
                     return ret;
             }
@@ -1479,6 +1516,14 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
                 voffset += vr->partition_size;
             }
         }
+        if (libvorbis_bug && !pass) {
+            for (j = 0; j < ch_used; ++j) {
+                if (!do_not_decode[j]) {
+                    get_vlc2(&vc->gb, vc->codebooks[vr->classbook].vlc.table,
+                                vc->codebooks[vr->classbook].nb_bits, 3);
+                }
+            }
+        }
     }
     return 0;
 }
@@ -1531,7 +1576,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
 {
     GetBitContext *gb = &vc->gb;
     FFTContext *mdct;
-    unsigned previous_window = vc->previous_window;
+    int previous_window = vc->previous_window;
     unsigned mode_number, blockflag, blocksize;
     int i, j;
     uint8_t no_residue[255];
@@ -1564,9 +1609,11 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
     blocksize = vc->blocksize[blockflag];
     vlen = blocksize / 2;
     if (blockflag) {
-        previous_window = get_bits(gb, 1);
-        skip_bits1(gb); // next_window
-    }
+        int code = get_bits(gb, 2);
+        if (previous_window < 0)
+            previous_window = code>>1;
+    } else if (previous_window < 0)
+        previous_window = 0;
 
     memset(ch_res_ptr,   0, sizeof(float) * vc->audio_channels * vlen); //FIXME can this be removed ?
     for (i = 0; i < vc->audio_channels; ++i)
@@ -1654,7 +1701,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
 
     for (j = vc->audio_channels-1;j >= 0; j--) {
         ch_res_ptr   = vc->channel_residues + res_chan[j] * blocksize / 2;
-        vc->fdsp.vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
+        vc->fdsp->vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
         mdct->imdct_half(mdct, ch_res_ptr, floor_ptr[j]);
     }
 
@@ -1671,13 +1718,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
         const float *win  = vc->win[blockflag & previous_window];
 
         if (blockflag == previous_window) {
-            vc->fdsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
+            vc->fdsp->vector_fmul_window(ret, saved, buf, win, blocksize / 4);
         } else if (blockflag > previous_window) {
-            vc->fdsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
+            vc->fdsp->vector_fmul_window(ret, saved, buf, win, bs0 / 4);
             memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float));
         } else {
             memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float));
-            vc->fdsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
+            vc->fdsp->vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
         }
         memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float));
     }
@@ -1701,12 +1748,53 @@ static int vorbis_decode_frame(AVCodecContext *avctx, void *data,
 
     ff_dlog(NULL, "packet length %d \n", buf_size);
 
+    if (*buf == 1 && buf_size > 7) {
+        if ((ret = init_get_bits8(gb, buf + 1, buf_size - 1)) < 0)
+            return ret;
+
+        vorbis_free(vc);
+        if ((ret = vorbis_parse_id_hdr(vc))) {
+            av_log(avctx, AV_LOG_ERROR, "Id header corrupt.\n");
+            vorbis_free(vc);
+            return ret;
+        }
+
+        if (vc->audio_channels > 8)
+            avctx->channel_layout = 0;
+        else
+            avctx->channel_layout = ff_vorbis_channel_layouts[vc->audio_channels - 1];
+
+        avctx->channels    = vc->audio_channels;
+        avctx->sample_rate = vc->audio_samplerate;
+        return buf_size;
+    }
+
+    if (*buf == 3 && buf_size > 7) {
+        av_log(avctx, AV_LOG_DEBUG, "Ignoring comment header\n");
+        return buf_size;
+    }
+
+    if (*buf == 5 && buf_size > 7 && vc->channel_residues && !vc->modes) {
+        if ((ret = init_get_bits8(gb, buf + 1, buf_size - 1)) < 0)
+            return ret;
+
+        if ((ret = vorbis_parse_setup_hdr(vc))) {
+            av_log(avctx, AV_LOG_ERROR, "Setup header corrupt.\n");
+            vorbis_free(vc);
+            return ret;
+        }
+        return buf_size;
+    }
+
+    if (!vc->channel_residues || !vc->modes) {
+        av_log(avctx, AV_LOG_ERROR, "Data packet before valid headers\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* get output buffer */
     frame->nb_samples = vc->blocksize[1] / 2;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (vc->audio_channels > 8) {
         for (i = 0; i < vc->audio_channels; i++)
@@ -1718,7 +1806,8 @@ static int vorbis_decode_frame(AVCodecContext *avctx, void *data,
         }
     }
 
-    init_get_bits(gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(gb, buf, buf_size)) < 0)
+        return ret;
 
     if ((len = vorbis_parse_audio_packet(vc, channel_ptrs)) <= 0)
         return len;
@@ -1758,7 +1847,8 @@ static av_cold void vorbis_decode_flush(AVCodecContext *avctx)
         memset(vc->saved, 0, (vc->blocksize[1] / 4) * vc->audio_channels *
                              sizeof(*vc->saved));
     }
-    vc->previous_window = 0;
+    vc->previous_window = -1;
+    vc->first_frame = 0;
 }
 
 AVCodec ff_vorbis_decoder = {
@@ -1772,6 +1862,7 @@ AVCodec ff_vorbis_decoder = {
     .decode          = vorbis_decode_frame,
     .flush           = vorbis_decode_flush,
     .capabilities    = AV_CODEC_CAP_DR1,
+    .caps_internal   = FF_CODEC_CAP_INIT_CLEANUP,
     .channel_layouts = ff_vorbis_channel_layouts,
     .sample_fmts     = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                        AV_SAMPLE_FMT_NONE },
diff --git a/libavcodec/vorbisdsp.c b/libavcodec/vorbisdsp.c
index c37e2c4..362a276 100644
--- a/libavcodec/vorbisdsp.c
+++ b/libavcodec/vorbisdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbisdsp.h b/libavcodec/vorbisdsp.h
index ea41c40..7abec4e 100644
--- a/libavcodec/vorbisdsp.h
+++ b/libavcodec/vorbisdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index 35bdd57..18a679f 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  */
 
 #include <float.h>
+#include "libavutil/float_dsp.h"
 
 #include "avcodec.h"
 #include "internal.h"
@@ -33,6 +34,9 @@
 #include "vorbis.h"
 #include "vorbis_enc_data.h"
 
+#include "audio_frame_queue.h"
+#include "libavfilter/bufferqueue.h"
+
 #define BITSTREAM_WRITER_LE
 #include "put_bits.h"
 
@@ -108,8 +112,12 @@ typedef struct vorbis_enc_context {
     float *samples;
     float *floor;  // also used for tmp values for mdct
     float *coeffs; // also used for residue after floor
+    float *scratch; // used for tmp values for psy model
     float quality;
 
+    AudioFrameQueue afq;
+    struct FFBufQueue bufqueue;
+
     int ncodebooks;
     vorbis_enc_codebook *codebooks;
 
@@ -126,6 +134,8 @@ typedef struct vorbis_enc_context {
     vorbis_enc_mode *modes;
 
     int64_t next_pts;
+
+    AVFloatDSPContext *fdsp;
 } vorbis_enc_context;
 
 #define MAX_CHANNELS     2
@@ -142,9 +152,9 @@ typedef struct vorbis_enc_context {
 static inline int put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
                                int entry)
 {
-    assert(entry >= 0);
-    assert(entry < cb->nentries);
-    assert(cb->lens[entry]);
+    av_assert2(entry >= 0);
+    av_assert2(entry < cb->nentries);
+    av_assert2(cb->lens[entry]);
     if (pb->size_in_bits - put_bits_count(pb) < cb->lens[entry])
         return AVERROR(EINVAL);
     put_bits(pb, cb->lens[entry], cb->codewords[entry]);
@@ -170,8 +180,8 @@ static int ready_codebook(vorbis_enc_codebook *cb)
         cb->pow2 = cb->dimensions = NULL;
     } else {
         int vals = cb_lookup_vals(cb->lookup, cb->ndimensions, cb->nentries);
-        cb->dimensions = av_malloc(sizeof(float) * cb->nentries * cb->ndimensions);
-        cb->pow2 = av_mallocz(sizeof(float) * cb->nentries);
+        cb->dimensions = av_malloc_array(cb->nentries, sizeof(float) * cb->ndimensions);
+        cb->pow2 = av_mallocz_array(cb->nentries, sizeof(float));
         if (!cb->dimensions || !cb->pow2)
             return AVERROR(ENOMEM);
         for (i = 0; i < cb->nentries; i++) {
@@ -200,8 +210,8 @@ static int ready_codebook(vorbis_enc_codebook *cb)
 static int ready_residue(vorbis_enc_residue *rc, vorbis_enc_context *venc)
 {
     int i;
-    assert(rc->type == 2);
-    rc->maxes = av_mallocz(sizeof(float[2]) * rc->classifications);
+    av_assert0(rc->type == 2);
+    rc->maxes = av_mallocz_array(rc->classifications, sizeof(float[2]));
     if (!rc->maxes)
         return AVERROR(ENOMEM);
     for (i = 0; i < rc->classifications; i++) {
@@ -236,6 +246,26 @@ static int ready_residue(vorbis_enc_residue *rc, vorbis_enc_context *venc)
     return 0;
 }
 
+static av_cold int dsp_init(AVCodecContext *avctx, vorbis_enc_context *venc)
+{
+    int ret = 0;
+
+    venc->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!venc->fdsp)
+        return AVERROR(ENOMEM);
+
+    // init windows
+    venc->win[0] = ff_vorbis_vwin[venc->log2_blocksize[0] - 6];
+    venc->win[1] = ff_vorbis_vwin[venc->log2_blocksize[1] - 6];
+
+    if ((ret = ff_mdct_init(&venc->mdct[0], venc->log2_blocksize[0], 0, 1.0)) < 0)
+        return ret;
+    if ((ret = ff_mdct_init(&venc->mdct[1], venc->log2_blocksize[1], 0, 1.0)) < 0)
+        return ret;
+
+    return 0;
+}
+
 static int create_vorbis_context(vorbis_enc_context *venc,
                                  AVCodecContext *avctx)
 {
@@ -266,8 +296,8 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         cb->lookup      = cvectors[book].lookup;
         cb->seq_p       = 0;
 
-        cb->lens      = av_malloc(sizeof(uint8_t)  * cb->nentries);
-        cb->codewords = av_malloc(sizeof(uint32_t) * cb->nentries);
+        cb->lens      = av_malloc_array(cb->nentries, sizeof(uint8_t));
+        cb->codewords = av_malloc_array(cb->nentries, sizeof(uint32_t));
         if (!cb->lens || !cb->codewords)
             return AVERROR(ENOMEM);
         memcpy(cb->lens, cvectors[book].clens, cvectors[book].len);
@@ -275,7 +305,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
 
         if (cb->lookup) {
             vals = cb_lookup_vals(cb->lookup, cb->ndimensions, cb->nentries);
-            cb->quantlist = av_malloc(sizeof(int) * vals);
+            cb->quantlist = av_malloc_array(vals, sizeof(int));
             if (!cb->quantlist)
                 return AVERROR(ENOMEM);
             for (i = 0; i < vals; i++)
@@ -305,7 +335,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         fc->nclasses = FFMAX(fc->nclasses, fc->partition_to_class[i]);
     }
     fc->nclasses++;
-    fc->classes = av_malloc(sizeof(vorbis_enc_floor_class) * fc->nclasses);
+    fc->classes = av_malloc_array(fc->nclasses, sizeof(vorbis_enc_floor_class));
     if (!fc->classes)
         return AVERROR(ENOMEM);
     for (i = 0; i < fc->nclasses; i++) {
@@ -315,20 +345,20 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         c->subclass   = floor_classes[i].subclass;
         c->masterbook = floor_classes[i].masterbook;
         books         = (1 << c->subclass);
-        c->books      = av_malloc(sizeof(int) * books);
+        c->books      = av_malloc_array(books, sizeof(int));
         if (!c->books)
             return AVERROR(ENOMEM);
         for (j = 0; j < books; j++)
             c->books[j] = floor_classes[i].nbooks[j];
     }
     fc->multiplier = 2;
-    fc->rangebits  = venc->log2_blocksize[0] - 1;
+    fc->rangebits  = venc->log2_blocksize[1] - 1;
 
     fc->values = 2;
     for (i = 0; i < fc->partitions; i++)
         fc->values += fc->classes[fc->partition_to_class[i]].dim;
 
-    fc->list = av_malloc(sizeof(vorbis_floor1_entry) * fc->values);
+    fc->list = av_malloc_array(fc->values, sizeof(vorbis_floor1_entry));
     if (!fc->list)
         return AVERROR(ENOMEM);
     fc->list[0].x = 0;
@@ -409,29 +439,29 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         mc->angle[0]     = 1;
     }
 
-    venc->nmodes = 1;
+    venc->nmodes = 2;
     venc->modes  = av_malloc(sizeof(vorbis_enc_mode) * venc->nmodes);
     if (!venc->modes)
         return AVERROR(ENOMEM);
 
-    // single mode
+    // Short block
     venc->modes[0].blockflag = 0;
     venc->modes[0].mapping   = 0;
+    // Long block
+    venc->modes[1].blockflag = 1;
+    venc->modes[1].mapping   = 0;
 
     venc->have_saved = 0;
-    venc->saved      = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    venc->samples    = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]));
-    venc->floor      = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    venc->coeffs     = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    if (!venc->saved || !venc->samples || !venc->floor || !venc->coeffs)
-        return AVERROR(ENOMEM);
+    venc->saved      = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->samples    = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]));
+    venc->floor      = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->coeffs     = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->scratch    = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]));
 
-    venc->win[0] = ff_vorbis_vwin[venc->log2_blocksize[0] - 6];
-    venc->win[1] = ff_vorbis_vwin[venc->log2_blocksize[1] - 6];
+    if (!venc->saved || !venc->samples || !venc->floor || !venc->coeffs || !venc->scratch)
+        return AVERROR(ENOMEM);
 
-    if ((ret = ff_mdct_init(&venc->mdct[0], venc->log2_blocksize[0], 0, 1.0)) < 0)
-        return ret;
-    if ((ret = ff_mdct_init(&venc->mdct[1], venc->log2_blocksize[1], 0, 1.0)) < 0)
+    if ((ret = dsp_init(avctx, venc)) < 0)
         return ret;
 
     return 0;
@@ -585,9 +615,11 @@ static int put_main_header(vorbis_enc_context *venc, uint8_t **out)
 {
     int i;
     PutBitContext pb;
-    uint8_t buffer[50000] = {0}, *p = buffer;
-    int buffer_len = sizeof buffer;
     int len, hlens[3];
+    int buffer_len = 50000;
+    uint8_t *buffer = av_mallocz(buffer_len), *p = buffer;
+    if (!buffer)
+        return AVERROR(ENOMEM);
 
     // identification header
     init_put_bits(&pb, p, buffer_len);
@@ -710,6 +742,7 @@ static int put_main_header(vorbis_enc_context *venc, uint8_t **out)
         buffer_len += hlens[i];
     }
 
+    av_freep(&buffer);
     return p - *out;
 }
 
@@ -880,8 +913,8 @@ static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
     int classes[MAX_CHANNELS][NUM_RESIDUE_PARTITIONS];
     int classwords = venc->codebooks[rc->classbook].ndimensions;
 
-    assert(rc->type == 2);
-    assert(real_ch == 2);
+    av_assert0(rc->type == 2);
+    av_assert0(real_ch == 2);
     for (p = 0; p < partitions; p++) {
         float max1 = 0.0, max2 = 0.0;
         int s = rc->begin + p * psize;
@@ -964,78 +997,142 @@ static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
     return 0;
 }
 
-static int apply_window_and_mdct(vorbis_enc_context *venc,
-                                 float **audio, int samples)
+static int apply_window_and_mdct(vorbis_enc_context *venc)
 {
-    int i, channel;
-    const float * win = venc->win[0];
-    int window_len = 1 << (venc->log2_blocksize[0] - 1);
-    float n = (float)(1 << venc->log2_blocksize[0]) / 4.0;
-    // FIXME use dsp
+    int channel;
+    const float * win = venc->win[1];
+    int window_len = 1 << (venc->log2_blocksize[1] - 1);
+    float n = (float)(1 << venc->log2_blocksize[1]) / 4.0;
+    AVFloatDSPContext *fdsp = venc->fdsp;
 
-    if (!venc->have_saved && !samples)
-        return 0;
+    for (channel = 0; channel < venc->channels; channel++) {
+        float *offset = venc->samples + channel * window_len * 2;
 
-    if (venc->have_saved) {
-        for (channel = 0; channel < venc->channels; channel++)
-            memcpy(venc->samples + channel * window_len * 2,
-                   venc->saved + channel * window_len, sizeof(float) * window_len);
-    } else {
-        for (channel = 0; channel < venc->channels; channel++)
-            memset(venc->samples + channel * window_len * 2, 0,
-                   sizeof(float) * window_len);
+        fdsp->vector_fmul(offset, offset, win, window_len);
+        fdsp->vector_fmul_scalar(offset, offset, 1/n, window_len);
+
+        offset += window_len;
+
+        fdsp->vector_fmul_reverse(offset, offset, win, window_len);
+        fdsp->vector_fmul_scalar(offset, offset, 1/n, window_len);
+
+        venc->mdct[1].mdct_calc(&venc->mdct[1], venc->coeffs + channel * window_len,
+                     venc->samples + channel * window_len * 2);
     }
+    return 1;
+}
 
-    if (samples) {
-        for (channel = 0; channel < venc->channels; channel++) {
-            float * offset = venc->samples + channel*window_len*2 + window_len;
-            for (i = 0; i < samples; i++)
-                offset[i] = audio[channel][i] / n * win[window_len - i - 1];
-        }
-    } else {
-        for (channel = 0; channel < venc->channels; channel++)
-            memset(venc->samples + channel * window_len * 2 + window_len,
-                   0, sizeof(float) * window_len);
+/* Used for padding the last encoded packet */
+static AVFrame *spawn_empty_frame(AVCodecContext *avctx, int channels)
+{
+    AVFrame *f = av_frame_alloc();
+    int ch;
+
+    if (!f)
+        return NULL;
+
+    f->format = avctx->sample_fmt;
+    f->nb_samples = avctx->frame_size;
+    f->channel_layout = avctx->channel_layout;
+
+    if (av_frame_get_buffer(f, 4)) {
+        av_frame_free(&f);
+        return NULL;
     }
 
-    for (channel = 0; channel < venc->channels; channel++)
-        venc->mdct[0].mdct_calc(&venc->mdct[0], venc->coeffs + channel * window_len,
-                     venc->samples + channel * window_len * 2);
+    for (ch = 0; ch < channels; ch++) {
+        size_t bps = av_get_bytes_per_sample(f->format);
+        memset(f->extended_data[ch], 0, bps * f->nb_samples);
+    }
+    return f;
+}
 
-    if (samples) {
-        for (channel = 0; channel < venc->channels; channel++) {
-            float *offset = venc->saved + channel * window_len;
-            for (i = 0; i < samples; i++)
-                offset[i] = audio[channel][i] / n * win[i];
+/* Set up audio samples for psy analysis and window/mdct */
+static void move_audio(vorbis_enc_context *venc, int sf_size)
+{
+    AVFrame *cur = NULL;
+    int frame_size = 1 << (venc->log2_blocksize[1] - 1);
+    int subframes = frame_size / sf_size;
+    int sf, ch;
+
+    /* Copy samples from last frame into current frame */
+    if (venc->have_saved)
+        for (ch = 0; ch < venc->channels; ch++)
+            memcpy(venc->samples + 2 * ch * frame_size,
+                   venc->saved + ch * frame_size, sizeof(float) * frame_size);
+    else
+        for (ch = 0; ch < venc->channels; ch++)
+            memset(venc->samples + 2 * ch * frame_size, 0, sizeof(float) * frame_size);
+
+    for (sf = 0; sf < subframes; sf++) {
+        cur = ff_bufqueue_get(&venc->bufqueue);
+
+        for (ch = 0; ch < venc->channels; ch++) {
+            float *offset = venc->samples + 2 * ch * frame_size + frame_size;
+            float *save = venc->saved + ch * frame_size;
+            const float *input = (float *) cur->extended_data[ch];
+            const size_t len  = cur->nb_samples * sizeof(float);
+
+            memcpy(offset + sf*sf_size, input, len);
+            memcpy(save + sf*sf_size, input, len);   // Move samples for next frame
         }
-        venc->have_saved = 1;
-    } else {
-        venc->have_saved = 0;
+        av_frame_free(&cur);
     }
-    return 1;
+    venc->have_saved = 1;
+    memcpy(venc->scratch, venc->samples, 2 * venc->channels * frame_size);
 }
 
-
 static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                const AVFrame *frame, int *got_packet_ptr)
 {
     vorbis_enc_context *venc = avctx->priv_data;
-    float **audio = frame ? (float **)frame->extended_data : NULL;
-    int samples = frame ? frame->nb_samples : 0;
+    int i, ret, need_more;
+    int frame_size = 1 << (venc->log2_blocksize[1] - 1);
     vorbis_enc_mode *mode;
     vorbis_enc_mapping *mapping;
     PutBitContext pb;
-    int i, ret;
 
-    if (!apply_window_and_mdct(venc, audio, samples))
+    if (frame) {
+        AVFrame *clone;
+        if ((ret = ff_af_queue_add(&venc->afq, frame)) < 0)
+            return ret;
+        clone = av_frame_clone(frame);
+        if (!clone)
+            return AVERROR(ENOMEM);
+        ff_bufqueue_add(avctx, &venc->bufqueue, clone);
+    } else
+        if (!venc->afq.remaining_samples)
+            return 0;
+
+    need_more = venc->bufqueue.available * avctx->frame_size < frame_size;
+    need_more = frame && need_more;
+    if (need_more)
         return 0;
-    samples = 1 << (venc->log2_blocksize[0] - 1);
 
-    if ((ret = ff_alloc_packet(avpkt, 8192))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
-        return ret;
+    /* Pad the bufqueue with empty frames for encoding the last packet. */
+    if (!frame) {
+        if (venc->bufqueue.available * avctx->frame_size < frame_size) {
+            int frames_needed = (frame_size/avctx->frame_size) - venc->bufqueue.available;
+            int i;
+
+            for (i = 0; i < frames_needed; i++) {
+               AVFrame *empty = spawn_empty_frame(avctx, venc->channels);
+               if (!empty)
+                   return AVERROR(ENOMEM);
+
+               ff_bufqueue_add(avctx, &venc->bufqueue, empty);
+            }
+        }
     }
 
+    move_audio(venc, avctx->frame_size);
+
+    if (!apply_window_and_mdct(venc))
+        return 0;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 8192, 0)) < 0)
+        return ret;
+
     init_put_bits(&pb, avpkt->data, avpkt->size);
 
     if (pb.size_in_bits - put_bits_count(&pb) < 1 + ilog(venc->nmodes - 1)) {
@@ -1045,33 +1142,33 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     put_bits(&pb, 1, 0); // magic bit
 
-    put_bits(&pb, ilog(venc->nmodes - 1), 0); // 0 bits, the mode
+    put_bits(&pb, ilog(venc->nmodes - 1), 1); // Mode for current frame
 
-    mode    = &venc->modes[0];
+    mode    = &venc->modes[1];
     mapping = &venc->mappings[mode->mapping];
     if (mode->blockflag) {
-        put_bits(&pb, 1, 0);
-        put_bits(&pb, 1, 0);
+        put_bits(&pb, 1, 1); // Previous windowflag
+        put_bits(&pb, 1, 1); // Next windowflag
     }
 
     for (i = 0; i < venc->channels; i++) {
         vorbis_enc_floor *fc = &venc->floors[mapping->floor[mapping->mux[i]]];
         uint16_t posts[MAX_FLOOR_VALUES];
-        floor_fit(venc, fc, &venc->coeffs[i * samples], posts, samples);
-        if (floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples)) {
+        floor_fit(venc, fc, &venc->coeffs[i * frame_size], posts, frame_size);
+        if (floor_encode(venc, fc, &pb, posts, &venc->floor[i * frame_size], frame_size)) {
             av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n");
             return AVERROR(EINVAL);
         }
     }
 
-    for (i = 0; i < venc->channels * samples; i++)
+    for (i = 0; i < venc->channels * frame_size; i++)
         venc->coeffs[i] /= venc->floor[i];
 
     for (i = 0; i < mapping->coupling_steps; i++) {
-        float *mag = venc->coeffs + mapping->magnitude[i] * samples;
-        float *ang = venc->coeffs + mapping->angle[i]     * samples;
+        float *mag = venc->coeffs + mapping->magnitude[i] * frame_size;
+        float *ang = venc->coeffs + mapping->angle[i]     * frame_size;
         int j;
-        for (j = 0; j < samples; j++) {
+        for (j = 0; j < frame_size; j++) {
             float a = ang[j];
             ang[j] -= mag[j];
             if (mag[j] > 0)
@@ -1082,7 +1179,7 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
 
     if (residue_encode(venc, &venc->residues[mapping->residue[mapping->mux[0]]],
-                       &pb, venc->coeffs, samples, venc->channels)) {
+                       &pb, venc->coeffs, frame_size, venc->channels)) {
         av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n");
         return AVERROR(EINVAL);
     }
@@ -1090,15 +1187,14 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     flush_put_bits(&pb);
     avpkt->size = put_bits_count(&pb) >> 3;
 
-    avpkt->duration = ff_samples_to_time_base(avctx, avctx->frame_size);
-    if (frame) {
-        if (frame->pts != AV_NOPTS_VALUE)
-            avpkt->pts = ff_samples_to_time_base(avctx, frame->pts);
-    } else {
-        avpkt->pts = venc->next_pts;
+    ff_af_queue_remove(&venc->afq, frame_size, &avpkt->pts, &avpkt->duration);
+
+    if (frame_size > avpkt->duration) {
+        uint8_t *side = av_packet_new_side_data(avpkt, AV_PKT_DATA_SKIP_SAMPLES, 10);
+        if (!side)
+            return AVERROR(ENOMEM);
+        AV_WL32(&side[4], frame_size - avpkt->duration);
     }
-    if (avpkt->pts != AV_NOPTS_VALUE)
-        venc->next_pts = avpkt->pts + avpkt->duration;
 
     *got_packet_ptr = 1;
     return 0;
@@ -1155,9 +1251,13 @@ static av_cold int vorbis_encode_close(AVCodecContext *avctx)
     av_freep(&venc->samples);
     av_freep(&venc->floor);
     av_freep(&venc->coeffs);
+    av_freep(&venc->scratch);
+    av_freep(&venc->fdsp);
 
     ff_mdct_end(&venc->mdct[0]);
     ff_mdct_end(&venc->mdct[1]);
+    ff_af_queue_close(&venc->afq);
+    ff_bufqueue_discard_all(&venc->bufqueue);
 
     av_freep(&avctx->extradata);
 
@@ -1170,7 +1270,7 @@ static av_cold int vorbis_encode_init(AVCodecContext *avctx)
     int ret;
 
     if (avctx->channels != 2) {
-        av_log(avctx, AV_LOG_ERROR, "Current Libav Vorbis encoder only supports 2 channels.\n");
+        av_log(avctx, AV_LOG_ERROR, "Current FFmpeg Vorbis encoder only supports 2 channels.\n");
         return -1;
     }
 
@@ -1181,14 +1281,16 @@ static av_cold int vorbis_encode_init(AVCodecContext *avctx)
     if (avctx->flags & AV_CODEC_FLAG_QSCALE)
         venc->quality = avctx->global_quality / (float)FF_QP2LAMBDA;
     else
-        venc->quality = 3.0;
+        venc->quality = 8;
     venc->quality *= venc->quality;
 
     if ((ret = put_main_header(venc, (uint8_t**)&avctx->extradata)) < 0)
         goto error;
     avctx->extradata_size = ret;
 
-    avctx->frame_size = 1 << (venc->log2_blocksize[0] - 1);
+    avctx->frame_size = 64;
+
+    ff_af_queue_init(avctx, &venc->afq);
 
     return 0;
 error:
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index cb8925b..b248c90 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003-2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,6 +77,10 @@ typedef struct Vp3Fragment {
 /* special internal mode */
 #define MODE_COPY             8
 
+static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb);
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb);
+
+
 /* There are 6 preset schemes, plus a free-form scheme */
 static const int ModeAlphabet[6][CODING_MODE_COUNT] = {
     /* scheme 1: Last motion vector dominates */
@@ -127,7 +131,7 @@ static const uint8_t hilbert_offset[16][2] = {
 
 typedef struct Vp3DecodeContext {
     AVCodecContext *avctx;
-    int theora, theora_tables;
+    int theora, theora_tables, theora_header;
     int version;
     int width, height;
     int chroma_x_shift, chroma_y_shift;
@@ -173,6 +177,7 @@ typedef struct Vp3DecodeContext {
     int data_offset[3];
     uint8_t offset_x;
     uint8_t offset_y;
+    int offset_x_warned;
 
     int8_t (*motion_val[2])[2];
 
@@ -204,8 +209,8 @@ typedef struct Vp3DecodeContext {
     int16_t *dct_tokens[3][64];
     int16_t *dct_tokens_base;
 #define TOKEN_EOB(eob_run)              ((eob_run) << 2)
-#define TOKEN_ZERO_RUN(coeff, zero_run) (((coeff) << 9) + ((zero_run) << 2) + 1)
-#define TOKEN_COEFF(coeff)              (((coeff) << 2) + 2)
+#define TOKEN_ZERO_RUN(coeff, zero_run) (((coeff) * 512) + ((zero_run) << 2) + 1)
+#define TOKEN_COEFF(coeff)              (((coeff) * 4) + 2)
 
     /**
      * number of blocks that contain DCT coefficients at
@@ -218,6 +223,10 @@ typedef struct Vp3DecodeContext {
      * which of the fragments are coded */
     int *coded_fragment_list[3];
 
+    int *kf_coded_fragment_list;
+    int *nkf_coded_fragment_list;
+    int num_kf_coded_fragment[3];
+
     VLC dc_vlc[16];
     VLC ac_vlc_1[16];
     VLC ac_vlc_2[16];
@@ -260,6 +269,21 @@ typedef struct Vp3DecodeContext {
  * VP3 specific functions
  ************************************************************************/
 
+static av_cold void free_tables(AVCodecContext *avctx)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+
+    av_freep(&s->superblock_coding);
+    av_freep(&s->all_fragments);
+    av_freep(&s->nkf_coded_fragment_list);
+    av_freep(&s->kf_coded_fragment_list);
+    av_freep(&s->dct_tokens_base);
+    av_freep(&s->superblock_fragments);
+    av_freep(&s->macroblock_coding);
+    av_freep(&s->motion_val[0]);
+    av_freep(&s->motion_val[1]);
+}
+
 static void vp3_decode_flush(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
@@ -277,16 +301,11 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     int i;
 
-    av_freep(&s->superblock_coding);
-    av_freep(&s->all_fragments);
-    av_freep(&s->coded_fragment_list[0]);
-    av_freep(&s->dct_tokens_base);
-    av_freep(&s->superblock_fragments);
-    av_freep(&s->macroblock_coding);
-    av_freep(&s->motion_val[0]);
-    av_freep(&s->motion_val[1]);
+    free_tables(avctx);
     av_freep(&s->edge_emu_buffer);
 
+    s->theora_tables = 0;
+
     /* release all frames */
     vp3_decode_flush(avctx);
     av_frame_free(&s->current_frame.f);
@@ -312,7 +331,7 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     return 0;
 }
 
-/*
+/**
  * This function sets up all of the various blocks mappings:
  * superblocks <-> fragments, macroblocks <-> fragments,
  * superblocks <-> macroblocks
@@ -397,27 +416,7 @@ static void init_dequantizer(Vp3DecodeContext *s, int qpi)
  */
 static void init_loop_filter(Vp3DecodeContext *s)
 {
-    int *bounding_values = s->bounding_values_array + 127;
-    int filter_limit;
-    int x;
-    int value;
-
-    filter_limit = s->filter_limit_values[s->qps[0]];
-    assert(filter_limit < 128);
-
-    /* set up the bounding values */
-    memset(s->bounding_values_array, 0, 256 * sizeof(int));
-    for (x = 0; x < filter_limit; x++) {
-        bounding_values[-x] = -x;
-        bounding_values[x] = x;
-    }
-    for (x = value = filter_limit; x < 128 && value; x++, value--) {
-        bounding_values[ x] =  value;
-        bounding_values[-x] = -value;
-    }
-    if (value)
-        bounding_values[128] = value;
-    bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202;
+    ff_vp3dsp_set_bounding_values(s->bounding_values_array, s->filter_limit_values[s->qps[0]]);
 }
 
 /*
@@ -437,6 +436,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
     int i, j;
     int current_fragment;
     int plane;
+    int plane0_num_coded_frags = 0;
 
     if (s->keyframe) {
         memset(s->superblock_coding, SB_FULLY_CODED, s->superblock_count);
@@ -456,7 +456,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
             if (current_run == 34)
                 current_run += get_bits(gb, 12);
 
-            if (current_superblock + current_run > s->superblock_count) {
+            if (current_run > s->superblock_count - current_superblock) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Invalid partially coded superblock run length\n");
                 return -1;
@@ -523,45 +523,71 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
     s->total_num_coded_frags = 0;
     memset(s->macroblock_coding, MODE_COPY, s->macroblock_count);
 
+    s->coded_fragment_list[0] = s->keyframe ? s->kf_coded_fragment_list
+                                            : s->nkf_coded_fragment_list;
+
     for (plane = 0; plane < 3; plane++) {
         int sb_start = superblock_starts[plane];
         int sb_end   = sb_start + (plane ? s->c_superblock_count
                                          : s->y_superblock_count);
         int num_coded_frags = 0;
 
-        for (i = sb_start; i < sb_end && get_bits_left(gb) > 0; i++) {
-            /* iterate through all 16 fragments in a superblock */
-            for (j = 0; j < 16; j++) {
-                /* if the fragment is in bounds, check its coding status */
-                current_fragment = s->superblock_fragments[i * 16 + j];
-                if (current_fragment != -1) {
-                    int coded = s->superblock_coding[i];
-
-                    if (s->superblock_coding[i] == SB_PARTIALLY_CODED) {
-                        /* fragment may or may not be coded; this is the case
-                         * that cares about the fragment coding runs */
-                        if (current_run-- == 0) {
-                            bit        ^= 1;
-                            current_run = get_vlc2(gb, s->fragment_run_length_vlc.table, 5, 2);
+        if (s->keyframe) {
+            if (s->num_kf_coded_fragment[plane] == -1) {
+                for (i = sb_start; i < sb_end; i++) {
+                    /* iterate through all 16 fragments in a superblock */
+                    for (j = 0; j < 16; j++) {
+                        /* if the fragment is in bounds, check its coding status */
+                        current_fragment = s->superblock_fragments[i * 16 + j];
+                        if (current_fragment != -1) {
+                            s->coded_fragment_list[plane][num_coded_frags++] =
+                                current_fragment;
                         }
-                        coded = bit;
                     }
+                }
+                s->num_kf_coded_fragment[plane] = num_coded_frags;
+            } else
+                num_coded_frags = s->num_kf_coded_fragment[plane];
+        } else {
+            for (i = sb_start; i < sb_end && get_bits_left(gb) > 0; i++) {
+                if (get_bits_left(gb) < plane0_num_coded_frags >> 2) {
+                    return AVERROR_INVALIDDATA;
+                }
+                /* iterate through all 16 fragments in a superblock */
+                for (j = 0; j < 16; j++) {
+                    /* if the fragment is in bounds, check its coding status */
+                    current_fragment = s->superblock_fragments[i * 16 + j];
+                    if (current_fragment != -1) {
+                        int coded = s->superblock_coding[i];
+
+                        if (coded == SB_PARTIALLY_CODED) {
+                            /* fragment may or may not be coded; this is the case
+                             * that cares about the fragment coding runs */
+                            if (current_run-- == 0) {
+                                bit        ^= 1;
+                                current_run = get_vlc2(gb, s->fragment_run_length_vlc.table, 5, 2);
+                            }
+                            coded = bit;
+                        }
 
-                    if (coded) {
-                        /* default mode; actual mode will be decoded in
-                         * the next phase */
-                        s->all_fragments[current_fragment].coding_method =
-                            MODE_INTER_NO_MV;
-                        s->coded_fragment_list[plane][num_coded_frags++] =
-                            current_fragment;
-                    } else {
-                        /* not coded; copy this fragment from the prior frame */
-                        s->all_fragments[current_fragment].coding_method =
-                            MODE_COPY;
+                        if (coded) {
+                            /* default mode; actual mode will be decoded in
+                             * the next phase */
+                            s->all_fragments[current_fragment].coding_method =
+                                MODE_INTER_NO_MV;
+                            s->coded_fragment_list[plane][num_coded_frags++] =
+                                current_fragment;
+                        } else {
+                            /* not coded; copy this fragment from the prior frame */
+                            s->all_fragments[current_fragment].coding_method =
+                                MODE_COPY;
+                        }
                     }
                 }
             }
         }
+        if (!plane)
+            plane0_num_coded_frags = num_coded_frags;
         s->total_num_coded_frags += num_coded_frags;
         for (i = 0; i < 64; i++)
             s->num_coded_frags[plane][i] = num_coded_frags;
@@ -937,9 +963,11 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
     Vp3Fragment *all_fragments = s->all_fragments;
     VLC_TYPE(*vlc_table)[2] = table->table;
 
-    if (num_coeffs < 0)
+    if (num_coeffs < 0) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Invalid number of coefficients at level %d\n", coeff_index);
+        return AVERROR_INVALIDDATA;
+    }
 
     if (eob_run > num_coeffs) {
         coeff_i      =
@@ -964,6 +992,9 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
             if (eob_run_get_bits[token])
                 eob_run += get_bits(gb, eob_run_get_bits[token]);
 
+            if (!eob_run)
+                eob_run = INT_MAX;
+
             // record only the number of blocks ended in this plane,
             // any spill will be recorded in the next plane.
             if (eob_run > num_coeffs - coeff_i) {
@@ -1057,6 +1088,9 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
 
     s->dct_tokens[0][0] = s->dct_tokens_base;
 
+    if (get_bits_left(gb) < 16)
+        return AVERROR_INVALIDDATA;
+
     /* fetch the DC table indexes */
     dc_y_table = get_bits(gb, 4);
     dc_c_table = get_bits(gb, 4);
@@ -1066,6 +1100,8 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
                                    0, residual_eob_run);
     if (residual_eob_run < 0)
         return residual_eob_run;
+    if (get_bits_left(gb) < 8)
+        return AVERROR_INVALIDDATA;
 
     /* reverse prediction of the Y-plane DC coefficients */
     reverse_dc_prediction(s, 0, s->fragment_width[0], s->fragment_height[0]);
@@ -1088,6 +1124,8 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
                               s->fragment_width[1], s->fragment_height[1]);
     }
 
+    if (get_bits_left(gb) < 8)
+        return AVERROR_INVALIDDATA;
     /* fetch the AC table indexes */
     ac_y_table = get_bits(gb, 4);
     ac_c_table = get_bits(gb, 4);
@@ -1600,20 +1638,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                         /* invert DCT and place (or add) in final output */
 
                         if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                            int index;
-                            index = vp3_dequant(s, s->all_fragments + i,
-                                                plane, 0, block);
-                            if (index > 63)
-                                continue;
+                            vp3_dequant(s, s->all_fragments + i,
+                                        plane, 0, block);
                             s->vp3dsp.idct_put(output_plane + first_pixel,
                                                stride,
                                                block);
                         } else {
-                            int index = vp3_dequant(s, s->all_fragments + i,
-                                                    plane, 1, block);
-                            if (index > 63)
-                                continue;
-                            if (index > 0) {
+                            if (vp3_dequant(s, s->all_fragments + i,
+                                            plane, 1, block)) {
                                 s->vp3dsp.idct_add(output_plane + first_pixel,
                                                    stride,
                                                    block);
@@ -1657,25 +1689,30 @@ static av_cold int allocate_tables(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     int y_fragment_count, c_fragment_count;
 
+    free_tables(avctx);
+
     y_fragment_count = s->fragment_width[0] * s->fragment_height[0];
     c_fragment_count = s->fragment_width[1] * s->fragment_height[1];
 
-    s->superblock_coding = av_malloc(s->superblock_count);
-    s->all_fragments     = av_malloc(s->fragment_count * sizeof(Vp3Fragment));
+    s->superblock_coding = av_mallocz(s->superblock_count);
+    s->all_fragments     = av_mallocz_array(s->fragment_count, sizeof(Vp3Fragment));
 
-    s->coded_fragment_list[0] = av_malloc(s->fragment_count * sizeof(int));
+    s-> kf_coded_fragment_list = av_mallocz_array(s->fragment_count, sizeof(int));
+    s->nkf_coded_fragment_list = av_mallocz_array(s->fragment_count, sizeof(int));
+    memset(s-> num_kf_coded_fragment, -1, sizeof(s-> num_kf_coded_fragment));
 
-    s->dct_tokens_base = av_malloc(64 * s->fragment_count *
-                                   sizeof(*s->dct_tokens_base));
-    s->motion_val[0] = av_malloc(y_fragment_count * sizeof(*s->motion_val[0]));
-    s->motion_val[1] = av_malloc(c_fragment_count * sizeof(*s->motion_val[1]));
+    s->dct_tokens_base = av_mallocz_array(s->fragment_count,
+                                          64 * sizeof(*s->dct_tokens_base));
+    s->motion_val[0] = av_mallocz_array(y_fragment_count, sizeof(*s->motion_val[0]));
+    s->motion_val[1] = av_mallocz_array(c_fragment_count, sizeof(*s->motion_val[1]));
 
     /* work out the block mapping tables */
-    s->superblock_fragments = av_malloc(s->superblock_count * 16 * sizeof(int));
-    s->macroblock_coding    = av_malloc(s->macroblock_count + 1);
+    s->superblock_fragments = av_mallocz_array(s->superblock_count, 16 * sizeof(int));
+    s->macroblock_coding    = av_mallocz(s->macroblock_count + 1);
 
     if (!s->superblock_coding    || !s->all_fragments          ||
-        !s->dct_tokens_base      || !s->coded_fragment_list[0] ||
+        !s->dct_tokens_base      || !s->kf_coded_fragment_list ||
+        !s->nkf_coded_fragment_list ||
         !s->superblock_fragments || !s->macroblock_coding      ||
         !s->motion_val[0]        || !s->motion_val[1]) {
         vp3_decode_end(avctx);
@@ -1725,7 +1762,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     s->avctx  = avctx;
     s->width  = FFALIGN(avctx->coded_width, 16);
     s->height = FFALIGN(avctx->coded_height, 16);
-    if (avctx->pix_fmt == AV_PIX_FMT_NONE)
+    if (avctx->codec_id != AV_CODEC_ID_THEORA)
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     ff_hpeldsp_init(&s->hdsp, avctx->flags | AV_CODEC_FLAG_BITEXACT);
@@ -1733,7 +1770,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
 
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_permutation[i] = TRANSPOSE(i);
         s->idct_scantable[i]   = TRANSPOSE(ff_zigzag_direct[i]);
 #undef TRANSPOSE
@@ -1744,8 +1781,9 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 3; i++)
         s->qps[i] = -1;
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift,
-                                     &s->chroma_y_shift);
+    ret = av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
+    if (ret)
+        return ret;
 
     s->y_superblock_width  = (s->width  + 31) / 32;
     s->y_superblock_height = (s->height + 31) / 32;
@@ -1903,6 +1941,7 @@ fail:
     return ret;
 }
 
+#if HAVE_THREADS
 static int ref_frame(Vp3DecodeContext *s, ThreadFrame *dst, ThreadFrame *src)
 {
     ff_thread_release_buffer(s->avctx, dst);
@@ -1938,6 +1977,8 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
     }
 
     if (s != s1) {
+        if (!s->current_frame.f)
+            return AVERROR(ENOMEM);
         // init tables if the first frame hasn't been decoded
         if (!s->current_frame.f->data[0]) {
             int y_fragment_count, c_fragment_count;
@@ -1978,6 +2019,7 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
 
     return update_frames(dst);
 }
+#endif
 
 static int vp3_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_frame,
@@ -1990,15 +2032,52 @@ static int vp3_decode_frame(AVCodecContext *avctx,
     GetBitContext gb;
     int i, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
+#if CONFIG_THEORA_DECODER
     if (s->theora && get_bits1(&gb)) {
+        int type = get_bits(&gb, 7);
+        skip_bits_long(&gb, 6*8); /* "theora" */
+
+        if (s->avctx->active_thread_type&FF_THREAD_FRAME) {
+            av_log(avctx, AV_LOG_ERROR, "midstream reconfiguration with multithreading is unsupported, try -threads 1\n");
+            return AVERROR_PATCHWELCOME;
+        }
+        if (type == 0) {
+            vp3_decode_end(avctx);
+            ret = theora_decode_header(avctx, &gb);
+
+            if (ret >= 0)
+                ret = vp3_decode_init(avctx);
+            if (ret < 0) {
+                vp3_decode_end(avctx);
+                return ret;
+            }
+            return buf_size;
+        } else if (type == 2) {
+            vp3_decode_end(avctx);
+            ret = theora_decode_tables(avctx, &gb);
+            if (ret >= 0)
+                ret = vp3_decode_init(avctx);
+            if (ret < 0) {
+                vp3_decode_end(avctx);
+                return ret;
+            }
+            return buf_size;
+        }
+
         av_log(avctx, AV_LOG_ERROR,
                "Header packet passed to frame decoder, skipping\n");
         return -1;
     }
+#endif
 
     s->keyframe = !get_bits1(&gb);
+    if (!s->all_fragments) {
+        av_log(avctx, AV_LOG_ERROR, "Data packet without prior valid headers\n");
+        return -1;
+    }
     if (!s->theora)
         skip_bits(&gb, 1);
     for (i = 0; i < 3; i++)
@@ -2033,10 +2112,9 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
     s->current_frame.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                                 : AV_PICTURE_TYPE_P;
-    if (ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    s->current_frame.f->key_frame = s->keyframe;
+    if (ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF) < 0)
         goto error;
-    }
 
     if (!s->edge_emu_buffer)
         s->edge_emu_buffer = av_malloc(9 * FFABS(s->current_frame.f->linesize[0]));
@@ -2065,10 +2143,8 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
             s->golden_frame.f->pict_type = AV_PICTURE_TYPE_I;
             if (ff_thread_get_buffer(avctx, &s->golden_frame,
-                                     AV_GET_BUFFER_FLAG_REF) < 0) {
-                av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                                     AV_GET_BUFFER_FLAG_REF) < 0)
                 goto error;
-            }
             ff_thread_release_buffer(avctx, &s->last_frame);
             if ((ret = ff_thread_ref_frame(&s->last_frame,
                                            &s->golden_frame)) < 0)
@@ -2182,6 +2258,7 @@ static int read_huffman_tree(AVCodecContext *avctx, GetBitContext *gb)
     return 0;
 }
 
+#if HAVE_THREADS
 static int vp3_init_thread_copy(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
@@ -2189,6 +2266,8 @@ static int vp3_init_thread_copy(AVCodecContext *avctx)
     s->superblock_coding      = NULL;
     s->all_fragments          = NULL;
     s->coded_fragment_list[0] = NULL;
+    s-> kf_coded_fragment_list= NULL;
+    s->nkf_coded_fragment_list= NULL;
     s->dct_tokens_base        = NULL;
     s->superblock_fragments   = NULL;
     s->macroblock_coding      = NULL;
@@ -2198,6 +2277,7 @@ static int vp3_init_thread_copy(AVCodecContext *avctx)
 
     return init_frames(s);
 }
+#endif
 
 #if CONFIG_THEORA_DECODER
 static const enum AVPixelFormat theora_pix_fmts[4] = {
@@ -2212,6 +2292,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     int ret;
     AVRational fps, aspect;
 
+    s->theora_header = 0;
     s->theora = get_bits_long(gb, 24);
     av_log(avctx, AV_LOG_DEBUG, "Theora bitstream version %X\n", s->theora);
 
@@ -2240,7 +2321,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     if (av_image_check_size(visible_width, visible_height, 0, avctx) < 0 ||
         visible_width  + offset_x > s->width ||
         visible_height + offset_y > s->height) {
-        av_log(s, AV_LOG_ERROR,
+        av_log(avctx, AV_LOG_ERROR,
                "Invalid frame dimensions - w:%d h:%d x:%d y:%d (%dx%d).\n",
                visible_width, visible_height, offset_x, offset_y,
                s->width, s->height);
@@ -2277,14 +2358,18 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     if (s->theora >= 0x030200) {
         skip_bits(gb, 5); /* keyframe frequency force */
         avctx->pix_fmt = theora_pix_fmts[get_bits(gb, 2)];
+        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid pixel format\n");
+            return AVERROR_INVALIDDATA;
+        }
         skip_bits(gb, 3); /* reserved */
-    }
+    } else
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     ret = ff_set_dimensions(avctx, s->width, s->height);
     if (ret < 0)
         return ret;
-    if (!(avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) &&
-        (visible_width != s->width || visible_height != s->height)) {
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP)) {
         avctx->width  = visible_width;
         avctx->height = visible_height;
         // translate offsets from theora axis ([0,0] lower left)
@@ -2303,6 +2388,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
         avctx->color_trc  = AVCOL_TRC_BT709;
     }
 
+    s->theora_header = 1;
     return 0;
 }
 
@@ -2311,6 +2397,9 @@ static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb)
     Vp3DecodeContext *s = avctx->priv_data;
     int i, n, matrices, inter, plane;
 
+    if (!s->theora_header)
+        return AVERROR_INVALIDDATA;
+
     if (s->theora >= 0x030200) {
         n = get_bits(gb, 3);
         /* loop filter limit values table */
@@ -2420,9 +2509,12 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     GetBitContext gb;
     int ptype;
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     int i;
+    int ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     s->theora = 1;
 
@@ -2440,7 +2532,9 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 3; i++) {
         if (header_len[i] <= 0)
             continue;
-        init_get_bits(&gb, header_start[i], header_len[i] * 8);
+        ret = init_get_bits8(&gb, header_start[i], header_len[i]);
+        if (ret < 0)
+            return ret;
 
         ptype = get_bits(&gb, 8);
 
@@ -2454,7 +2548,8 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
 
         switch (ptype) {
         case 0x80:
-            theora_decode_header(avctx, &gb);
+            if (theora_decode_header(avctx, &gb) < 0)
+                return -1;
             break;
         case 0x81:
 // FIXME: is this needed? it breaks sometimes
diff --git a/libavcodec/vp3_parser.c b/libavcodec/vp3_parser.c
index e8fdcca..7ee046c 100644
--- a/libavcodec/vp3_parser.c
+++ b/libavcodec/vp3_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp3data.h b/libavcodec/vp3data.h
index 5603d3b..c82b1b3 100644
--- a/libavcodec/vp3data.h
+++ b/libavcodec/vp3data.h
@@ -1,20 +1,20 @@
 /*
- * copyright (C) 2003 The FFmpeg project
+ * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 
 /* these coefficients dequantize intraframe Y plane coefficients
  * (note: same as JPEG) */
-static const int16_t vp31_intra_y_dequant[64] = {
+static const int8_t vp31_intra_y_dequant[64] = {
     16, 11, 10, 16,  24,  40,  51,  61,
     12, 12, 14, 19,  26,  58,  60,  55,
     14, 13, 16, 24,  40,  57,  69,  56,
@@ -39,7 +39,7 @@ static const int16_t vp31_intra_y_dequant[64] = {
 
 /* these coefficients dequantize intraframe C plane coefficients
  * (note: same as JPEG) */
-static const int16_t vp31_intra_c_dequant[64] = {
+static const int8_t vp31_intra_c_dequant[64] = {
     17, 18, 24, 47, 99, 99, 99, 99,
     18, 21, 26, 66, 99, 99, 99, 99,
     24, 26, 56, 99, 99, 99, 99, 99,
@@ -51,7 +51,7 @@ static const int16_t vp31_intra_c_dequant[64] = {
 };
 
 /* these coefficients dequantize interframe coefficients (all planes) */
-static const int16_t vp31_inter_dequant[64] = {
+static const int8_t vp31_inter_dequant[64] = {
     16, 16, 16, 20, 24, 28,  32,  40,
     16, 16, 20, 24, 28, 32,  40,  48,
     16, 20, 24, 28, 32, 40,  48,  64,
@@ -62,7 +62,7 @@ static const int16_t vp31_inter_dequant[64] = {
     40, 48, 64, 64, 64, 96, 128, 128
 };
 
-static const int16_t vp31_dc_scale_factor[64] = {
+static const uint8_t vp31_dc_scale_factor[64] = {
     220, 200, 190, 180, 170, 170, 160, 160,
     150, 150, 140, 140, 130, 130, 120, 120,
     110, 110, 100, 100,  90,  90,  90,  80,
@@ -73,7 +73,7 @@ static const int16_t vp31_dc_scale_factor[64] = {
      20,  10,  10,  10,  10,  10,  10,  10
 };
 
-static const uint32_t vp31_ac_scale_factor[64] = {
+static const uint16_t vp31_ac_scale_factor[64] = {
     500, 450, 400, 370, 340, 310, 285, 265,
     245, 225, 210, 195, 185, 180, 170, 160,
     150, 145, 135, 130, 125, 115, 110, 107,
@@ -176,7 +176,7 @@ static const uint8_t motion_vector_vlc_table[63][2] = {
     { 0xFC, 8 }, { 0xFD, 8 }, { 0xFE, 8 }, { 0xFF, 8 }
 };
 
-static const int motion_vector_table[63] = {
+static const int8_t motion_vector_table[63] = {
      0,   1, -1,
      2,  -2,
      3,  -3,
@@ -198,21 +198,21 @@ static const int8_t fixed_motion_vector_table[64] = {
 };
 
 /* only tokens 0..6 indicate eob runs */
-static const int eob_run_base[7] = {
+static const uint8_t eob_run_base[7] = {
     1, 2, 3, 4, 8, 16, 0
 };
-static const int eob_run_get_bits[7] = {
+static const uint8_t eob_run_get_bits[7] = {
     0, 0, 0, 2, 3, 4, 12
 };
 
-static const int zero_run_base[32] = {
+static const uint8_t zero_run_base[32] = {
     0,  0, 0, 0, 0, 0, 0,   /* 0..6 are never used */
     0,  0,                  /* 7..8 */
     0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9..22 */
     1,  2, 3, 4, 5,         /* 23..27 */
     6, 10, 1, 2             /* 28..31 */
 };
-static const int zero_run_get_bits[32] = {
+static const uint8_t zero_run_get_bits[32] = {
     0, 0, 0, 0, 0, 0, 0,    /* 0..6 are never used */
     3, 6,                   /* 7..8 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9..22 */
@@ -220,7 +220,7 @@ static const int zero_run_get_bits[32] = {
     2, 3, 0, 1              /* 28..31 */
 };
 
-static const int coeff_get_bits[32] = {
+static const uint8_t coeff_get_bits[32] = {
     0, 0, 0, 0, 0, 0, 0,    /* 0..6 are never used */
     0, 0, 0, 0, 0, 0,       /* 7..12 use constant coeffs */
     1, 1, 1, 1,             /* 13..16 are constants but still need sign bit */
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 459441e..ac4c574 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,9 @@
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/intreadwrite.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/avassert.h"
 
 #include "avcodec.h"
 #include "rnd_avg.h"
@@ -42,7 +42,7 @@
 #define xC6S2 25080
 #define xC7S1 12785
 
-#define M(a, b) (((a) * (b)) >> 16)
+#define M(a, b) ((int)((SUINT)(a) * (b)) >> 16)
 
 static av_always_inline void idct(uint8_t *dst, ptrdiff_t stride,
                                   int16_t *input, int type)
@@ -195,6 +195,158 @@ static av_always_inline void idct(uint8_t *dst, ptrdiff_t stride,
     }
 }
 
+static av_always_inline void idct10(uint8_t *dst, ptrdiff_t stride,
+                                    int16_t *input, int type)
+{
+    int16_t *ip = input;
+
+    int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
+    int Ed, Gd, Add, Bdd, Fd, Hd;
+
+    int i;
+
+    /* Inverse DCT on the rows now */
+    for (i = 0; i < 4; i++) {
+        /* Check for non-zero values */
+        if (ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8]) {
+            A =  M(xC1S7, ip[1 * 8]);
+            B =  M(xC7S1, ip[1 * 8]);
+            C =  M(xC3S5, ip[3 * 8]);
+            D = -M(xC5S3, ip[3 * 8]);
+
+            Ad = M(xC4S4, (A - C));
+            Bd = M(xC4S4, (B - D));
+
+            Cd = A + C;
+            Dd = B + D;
+
+            E = M(xC4S4, ip[0 * 8]);
+            F = E;
+
+            G = M(xC2S6, ip[2 * 8]);
+            H = M(xC6S2, ip[2 * 8]);
+
+            Ed = E - G;
+            Gd = E + G;
+
+            Add = F + Ad;
+            Bdd = Bd - H;
+
+            Fd = F - Ad;
+            Hd = Bd + H;
+
+            /* Final sequence of operations over-write original inputs */
+            ip[0 * 8] = Gd + Cd;
+            ip[7 * 8] = Gd - Cd;
+
+            ip[1 * 8] = Add + Hd;
+            ip[2 * 8] = Add - Hd;
+
+            ip[3 * 8] = Ed + Dd;
+            ip[4 * 8] = Ed - Dd;
+
+            ip[5 * 8] = Fd + Bdd;
+            ip[6 * 8] = Fd - Bdd;
+
+        }
+
+        ip += 1;
+    }
+
+    ip = input;
+
+    for (i = 0; i < 8; i++) {
+        /* Check for non-zero values (bitwise or faster than ||) */
+        if (ip[0] | ip[1] | ip[2] | ip[3]) {
+            A =  M(xC1S7, ip[1]);
+            B =  M(xC7S1, ip[1]);
+            C =  M(xC3S5, ip[3]);
+            D = -M(xC5S3, ip[3]);
+
+            Ad = M(xC4S4, (A - C));
+            Bd = M(xC4S4, (B - D));
+
+            Cd = A + C;
+            Dd = B + D;
+
+            E = M(xC4S4, ip[0]);
+            if (type == 1)
+                E += 16 * 128;
+            F = E;
+
+            G = M(xC2S6, ip[2]);
+            H = M(xC6S2, ip[2]);
+
+            Ed = E - G;
+            Gd = E + G;
+
+            Add = F + Ad;
+            Bdd = Bd - H;
+
+            Fd = F - Ad;
+            Hd = Bd + H;
+
+            Gd += 8;
+            Add += 8;
+            Ed += 8;
+            Fd += 8;
+
+            /* Final sequence of operations over-write original inputs. */
+            if (type == 1) {
+                dst[0 * stride] = av_clip_uint8((Gd + Cd) >> 4);
+                dst[7 * stride] = av_clip_uint8((Gd - Cd) >> 4);
+
+                dst[1 * stride] = av_clip_uint8((Add + Hd) >> 4);
+                dst[2 * stride] = av_clip_uint8((Add - Hd) >> 4);
+
+                dst[3 * stride] = av_clip_uint8((Ed + Dd) >> 4);
+                dst[4 * stride] = av_clip_uint8((Ed - Dd) >> 4);
+
+                dst[5 * stride] = av_clip_uint8((Fd + Bdd) >> 4);
+                dst[6 * stride] = av_clip_uint8((Fd - Bdd) >> 4);
+            } else {
+                dst[0 * stride] = av_clip_uint8(dst[0 * stride] + ((Gd + Cd) >> 4));
+                dst[7 * stride] = av_clip_uint8(dst[7 * stride] + ((Gd - Cd) >> 4));
+
+                dst[1 * stride] = av_clip_uint8(dst[1 * stride] + ((Add + Hd) >> 4));
+                dst[2 * stride] = av_clip_uint8(dst[2 * stride] + ((Add - Hd) >> 4));
+
+                dst[3 * stride] = av_clip_uint8(dst[3 * stride] + ((Ed + Dd) >> 4));
+                dst[4 * stride] = av_clip_uint8(dst[4 * stride] + ((Ed - Dd) >> 4));
+
+                dst[5 * stride] = av_clip_uint8(dst[5 * stride] + ((Fd + Bdd) >> 4));
+                dst[6 * stride] = av_clip_uint8(dst[6 * stride] + ((Fd - Bdd) >> 4));
+            }
+        } else {
+            if (type == 1) {
+                dst[0*stride] =
+                dst[1*stride] =
+                dst[2*stride] =
+                dst[3*stride] =
+                dst[4*stride] =
+                dst[5*stride] =
+                dst[6*stride] =
+                dst[7*stride] = 128;
+            }
+        }
+
+        ip += 8;
+        dst++;
+    }
+}
+
+void ff_vp3dsp_idct10_put(uint8_t *dest, ptrdiff_t stride, int16_t *block)
+{
+    idct10(dest, stride, block, 1);
+    memset(block, 0, sizeof(*block) * 64);
+}
+
+void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block)
+{
+    idct10(dest, stride, block, 2);
+    memset(block, 0, sizeof(*block) * 64);
+}
+
 static void vp3_idct_put_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                            int16_t *block /* align 16 */)
 {
@@ -228,14 +380,14 @@ static void vp3_idct_dc_add_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
     block[0] = 0;
 }
 
-static void vp3_v_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
-                                int *bounding_values)
+static av_always_inline void vp3_v_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
+                                                 int *bounding_values, int count)
 {
     unsigned char *end;
     int filter_value;
     const ptrdiff_t nstride = -stride;
 
-    for (end = first_pixel + 8; first_pixel < end; first_pixel++) {
+    for (end = first_pixel + count; first_pixel < end; first_pixel++) {
         filter_value = (first_pixel[2 * nstride] - first_pixel[stride]) +
                        (first_pixel[0] - first_pixel[nstride]) * 3;
         filter_value = bounding_values[(filter_value + 4) >> 3];
@@ -245,13 +397,13 @@ static void vp3_v_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
     }
 }
 
-static void vp3_h_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
-                                int *bounding_values)
+static av_always_inline void vp3_h_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
+                                                 int *bounding_values, int count)
 {
     unsigned char *end;
     int filter_value;
 
-    for (end = first_pixel + 8 * stride; first_pixel != end; first_pixel += stride) {
+    for (end = first_pixel + count * stride; first_pixel != end; first_pixel += stride) {
         filter_value = (first_pixel[-2] - first_pixel[1]) +
                        (first_pixel[ 0] - first_pixel[-1]) * 3;
         filter_value = bounding_values[(filter_value + 4) >> 3];
@@ -261,6 +413,18 @@ static void vp3_h_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
     }
 }
 
+#define LOOP_FILTER(prefix, suffix, dim, count) \
+void prefix##_##dim##_loop_filter_##count##suffix(uint8_t *first_pixel, ptrdiff_t stride, \
+                                int *bounding_values) \
+{ \
+    vp3_##dim##_loop_filter_c(first_pixel, stride, bounding_values, count); \
+}
+
+static LOOP_FILTER(vp3,_c, v, 8)
+static LOOP_FILTER(vp3,_c, h, 8)
+LOOP_FILTER(ff_vp3dsp, , v, 12)
+LOOP_FILTER(ff_vp3dsp, , h, 12)
+
 static void put_no_rnd_pixels_l2(uint8_t *dst, const uint8_t *src1,
                                  const uint8_t *src2, ptrdiff_t stride, int h)
 {
@@ -285,8 +449,8 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
     c->idct_put      = vp3_idct_put_c;
     c->idct_add      = vp3_idct_add_c;
     c->idct_dc_add   = vp3_idct_dc_add_c;
-    c->v_loop_filter = vp3_v_loop_filter_c;
-    c->h_loop_filter = vp3_h_loop_filter_c;
+    c->v_loop_filter = vp3_v_loop_filter_8_c;
+    c->h_loop_filter = vp3_h_loop_filter_8_c;
 
     if (ARCH_ARM)
         ff_vp3dsp_init_arm(c, flags);
@@ -294,4 +458,37 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
         ff_vp3dsp_init_ppc(c, flags);
     if (ARCH_X86)
         ff_vp3dsp_init_x86(c, flags);
+    if (ARCH_MIPS)
+        ff_vp3dsp_init_mips(c, flags);
+}
+
+/*
+ * This function initializes the loop filter boundary limits if the frame's
+ * quality index is different from the previous frame's.
+ *
+ * where sizeof(bounding_values_array) is 256 * sizeof(int)
+ *
+ * The filter_limit_values may not be larger than 127.
+ */
+void ff_vp3dsp_set_bounding_values(int * bounding_values_array, int filter_limit)
+{
+    int *bounding_values = bounding_values_array + 127;
+    int x;
+    int value;
+
+    av_assert0(filter_limit < 128U);
+
+    /* set up the bounding values */
+    memset(bounding_values_array, 0, 256 * sizeof(int));
+    for (x = 0; x < filter_limit; x++) {
+        bounding_values[-x] = -x;
+        bounding_values[x] = x;
+    }
+    for (x = value = filter_limit; x < 128 && value; x++, value--) {
+        bounding_values[ x] =  value;
+        bounding_values[-x] = -value;
+    }
+    if (value)
+        bounding_values[128] = value;
+    bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202;
 }
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 010f905..32b2cad0 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,9 +45,18 @@ typedef struct VP3DSPContext {
     void (*h_loop_filter)(uint8_t *src, ptrdiff_t stride, int *bounding_values);
 } VP3DSPContext;
 
+void ff_vp3dsp_v_loop_filter_12(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values);
+void ff_vp3dsp_h_loop_filter_12(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values);
+
+void ff_vp3dsp_idct10_put(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
 void ff_vp3dsp_init(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);
+void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags);
+
+void ff_vp3dsp_set_bounding_values(int * bound_values_array, int filter_limit);
 
 #endif /* AVCODEC_VP3DSP_H */
diff --git a/libavcodec/vp5.c b/libavcodec/vp5.c
index e5036e6..49988b8 100644
--- a/libavcodec/vp5.c
+++ b/libavcodec/vp5.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,13 +34,15 @@
 #include "vp5data.h"
 
 
-static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
-                            int *golden_frame)
+static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size)
 {
     VP56RangeCoder *c = &s->c;
     int rows, cols;
+    int ret;
 
-    ff_vp56_init_range_decoder(&s->c, buf, buf_size);
+    ret = ff_vp56_init_range_decoder(&s->c, buf, buf_size);
+    if (ret < 0)
+        return ret;
     s->frames[VP56_FRAME_CURRENT]->key_frame = !vp56_rac_get(c);
     vp56_rac_get(c);
     ff_vp56_init_dequant(s, vp56_rac_gets(c, 6));
@@ -85,7 +87,7 @@ static void vp5_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
 
     for (comp=0; comp<2; comp++) {
         int delta = 0;
-        if (vp56_rac_get_prob(c, model->vector_dct[comp])) {
+        if (vp56_rac_get_prob_branchy(c, model->vector_dct[comp])) {
             int sign = vp56_rac_get_prob(c, model->vector_sig[comp]);
             di  = vp56_rac_get_prob(c, model->vector_pdi[comp][0]);
             di |= vp56_rac_get_prob(c, model->vector_pdi[comp][1]) << 1;
@@ -108,19 +110,19 @@ static void vp5_parse_vector_models(VP56Context *s)
     int comp, node;
 
     for (comp=0; comp<2; comp++) {
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][0]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][0]))
             model->vector_dct[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][1]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][1]))
             model->vector_sig[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][2]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][2]))
             model->vector_pdi[comp][0] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][3]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][3]))
             model->vector_pdi[comp][1] = vp56_rac_gets_nn(c, 7);
     }
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<7; node++)
-            if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][4 + node]))
+            if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][4 + node]))
                 model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
 }
 
@@ -137,7 +139,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
 
     for (pt=0; pt<2; pt++)
         for (node=0; node<11; node++)
-            if (vp56_rac_get_prob(c, vp5_dccv_pct[pt][node])) {
+            if (vp56_rac_get_prob_branchy(c, vp5_dccv_pct[pt][node])) {
                 def_prob[node] = vp56_rac_gets_nn(c, 7);
                 model->coeff_dccv[pt][node] = def_prob[node];
             } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -148,7 +150,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
         for (pt=0; pt<2; pt++)
             for (cg=0; cg<6; cg++)
                 for (node=0; node<11; node++)
-                    if (vp56_rac_get_prob(c, vp5_ract_pct[ct][pt][cg][node])) {
+                    if (vp56_rac_get_prob_branchy(c, vp5_ract_pct[ct][pt][cg][node])) {
                         def_prob[node] = vp56_rac_gets_nn(c, 7);
                         model->coeff_ract[pt][ct][cg][node] = def_prob[node];
                     } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -171,7 +173,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
     return 0;
 }
 
-static void vp5_parse_coeff(VP56Context *s)
+static int vp5_parse_coeff(VP56Context *s)
 {
     VP56RangeCoder *c = &s->c;
     VP56Model *model = s->modelp;
@@ -181,6 +183,11 @@ static void vp5_parse_coeff(VP56Context *s)
     int b, i, cg, idx, ctx, ctx_last;
     int pt = 0;    /* plane type (0 for Y, 1 for U or V) */
 
+    if (c->end <= c->buffer && c->bits >= 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "End of AC stream reached in vp5_parse_coeff\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     for (b=0; b<6; b++) {
         int ct = 1;    /* code type */
 
@@ -193,9 +200,9 @@ static void vp5_parse_coeff(VP56Context *s)
 
         coeff_idx = 0;
         for (;;) {
-            if (vp56_rac_get_prob(c, model2[0])) {
-                if (vp56_rac_get_prob(c, model2[2])) {
-                    if (vp56_rac_get_prob(c, model2[3])) {
+            if (vp56_rac_get_prob_branchy(c, model2[0])) {
+                if (vp56_rac_get_prob_branchy(c, model2[2])) {
+                    if (vp56_rac_get_prob_branchy(c, model2[3])) {
                         s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 4;
                         idx = vp56_rac_get_tree(c, ff_vp56_pc_tree, model1);
                         sign = vp56_rac_get(c);
@@ -203,7 +210,7 @@ static void vp5_parse_coeff(VP56Context *s)
                         for (i=ff_vp56_coeff_bit_length[idx]; i>=0; i--)
                             coeff += vp56_rac_get_prob(c, ff_vp56_coeff_parse_table[idx][i]) << i;
                     } else {
-                        if (vp56_rac_get_prob(c, model2[4])) {
+                        if (vp56_rac_get_prob_branchy(c, model2[4])) {
                             coeff = 3 + vp56_rac_get_prob(c, model1[5]);
                             s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 3;
                         } else {
@@ -224,7 +231,7 @@ static void vp5_parse_coeff(VP56Context *s)
                     coeff *= s->dequant_ac;
                 s->block_coeff[b][permute[coeff_idx]] = coeff;
             } else {
-                if (ct && !vp56_rac_get_prob(c, model2[1]))
+                if (ct && !vp56_rac_get_prob_branchy(c, model2[1]))
                     break;
                 ct = 0;
                 s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 0;
@@ -245,7 +252,9 @@ static void vp5_parse_coeff(VP56Context *s)
             for (i=coeff_idx; i<=ctx_last; i++)
                 s->coeff_ctx[ff_vp56_b6to4[b]][i] = 5;
         s->above_blocks[s->above_block_idx[b]].not_null_dc = s->coeff_ctx[ff_vp56_b6to4[b]][0];
+        s->idct_selector[b] = 63;
     }
+    return 0;
 }
 
 static void vp5_default_models_init(VP56Context *s)
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index 39d82e9..72fea37 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,8 @@
 
 void ff_vp56_init_dequant(VP56Context *s, int quantizer)
 {
+    if (s->quantizer != quantizer)
+        ff_vp3dsp_set_bounding_values(s->bounding_values_array, ff_vp56_filter_threshold[quantizer]);
     s->quantizer = quantizer;
     s->dequant_dc = ff_vp56_dc_dequant[quantizer] << 2;
     s->dequant_ac = ff_vp56_ac_dequant[quantizer] << 2;
@@ -83,16 +85,16 @@ static void vp56_parse_mb_type_models(VP56Context *s)
     int i, ctx, type;
 
     for (ctx=0; ctx<3; ctx++) {
-        if (vp56_rac_get_prob(c, 174)) {
+        if (vp56_rac_get_prob_branchy(c, 174)) {
             int idx = vp56_rac_gets(c, 4);
             memcpy(model->mb_types_stats[ctx],
                    ff_vp56_pre_def_mb_type_stats[idx][ctx],
                    sizeof(model->mb_types_stats[ctx]));
         }
-        if (vp56_rac_get_prob(c, 254)) {
+        if (vp56_rac_get_prob_branchy(c, 254)) {
             for (type=0; type<10; type++) {
                 for(i=0; i<2; i++) {
-                    if (vp56_rac_get_prob(c, 205)) {
+                    if (vp56_rac_get_prob_branchy(c, 205)) {
                         int delta, sign = vp56_rac_get(c);
 
                         delta = vp56_rac_get_tree(c, ff_vp56_pmbtm_tree,
@@ -153,7 +155,7 @@ static VP56mb vp56_parse_mb_type(VP56Context *s,
     uint8_t *mb_type_model = s->modelp->mb_type[ctx][prev_type];
     VP56RangeCoder *c = &s->c;
 
-    if (vp56_rac_get_prob(c, mb_type_model[0]))
+    if (vp56_rac_get_prob_branchy(c, mb_type_model[0]))
         return prev_type;
     else
         return vp56_rac_get_tree(c, ff_vp56_pmbt_tree, mb_type_model);
@@ -196,12 +198,8 @@ static void vp56_decode_4mv(VP56Context *s, int row, int col)
     s->macroblocks[row * s->mb_width + col].mv = s->mv[3];
 
     /* chroma vectors are average luma vectors */
-    if (s->avctx->codec->id == AV_CODEC_ID_VP5) {
-        s->mv[4].x = s->mv[5].x = RSHIFT(mv.x,2);
-        s->mv[4].y = s->mv[5].y = RSHIFT(mv.y,2);
-    } else {
-        s->mv[4] = s->mv[5] = (VP56mv) {mv.x/4, mv.y/4};
-    }
+    s->mv[4].x = s->mv[5].x = RSHIFT(mv.x,2);
+    s->mv[4].y = s->mv[5].y = RSHIFT(mv.y,2);
 }
 
 static VP56mb vp56_decode_mv(VP56Context *s, int row, int col)
@@ -261,6 +259,25 @@ static VP56mb vp56_decode_mv(VP56Context *s, int row, int col)
     return s->mb_type;
 }
 
+static VP56mb vp56_conceal_mv(VP56Context *s, int row, int col)
+{
+    VP56mv *mv, vect = {0,0};
+    int b;
+
+    s->mb_type = VP56_MB_INTER_NOVEC_PF;
+    s->macroblocks[row * s->mb_width + col].type = s->mb_type;
+
+    mv = &vect;
+
+    s->macroblocks[row*s->mb_width + col].mv = *mv;
+
+    /* same vector for all blocks */
+    for (b=0; b<6; b++)
+        s->mv[b] = *mv;
+
+    return s->mb_type;
+}
+
 static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame)
 {
     int idx = s->idct_scantable[0];
@@ -305,9 +322,17 @@ static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame)
 static void vp56_deblock_filter(VP56Context *s, uint8_t *yuv,
                                 ptrdiff_t stride, int dx, int dy)
 {
+    if (s->avctx->codec->id == AV_CODEC_ID_VP5) {
     int t = ff_vp56_filter_threshold[s->quantizer];
     if (dx)  s->vp56dsp.edge_filter_hor(yuv +         10-dx , stride, t);
     if (dy)  s->vp56dsp.edge_filter_ver(yuv + stride*(10-dy), stride, t);
+    } else {
+        int * bounding_values = s->bounding_values_array + 127;
+        if (dx)
+            ff_vp3dsp_h_loop_filter_12(yuv +         10-dx, stride, bounding_values);
+        if (dy)
+            ff_vp3dsp_v_loop_filter_12(yuv + stride*(10-dy), stride, bounding_values);
+    }
 }
 
 static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
@@ -340,11 +365,11 @@ static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
     if (x<0 || x+12>=s->plane_width[plane] ||
         y<0 || y+12>=s->plane_height[plane]) {
         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                            src + s->block_offset[b] + (dy-2)*stride + (dx-2),
-                            stride, stride,
-                            12, 12, x, y,
-                            s->plane_width[plane],
-                            s->plane_height[plane]);
+                                 src + s->block_offset[b] + (dy-2)*stride + (dx-2),
+                                 stride, stride,
+                                 12, 12, x, y,
+                                 s->plane_width[plane],
+                                 s->plane_height[plane]);
         src_block = s->edge_emu_buffer;
         src_offset = 2 + 2*stride;
     } else if (deblock_filtering) {
@@ -381,20 +406,29 @@ static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
     }
 }
 
-static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
+static void vp56_idct_put(VP56Context *s, uint8_t * dest, ptrdiff_t stride, int16_t *block, int selector)
 {
-    AVFrame *frame_current, *frame_ref;
-    VP56mb mb_type;
-    VP56Frame ref_frame;
-    int b, ab, b_max, plane, off;
+    if (selector > 10 || selector == 1)
+        s->vp3dsp.idct_put(dest, stride, block);
+    else
+        ff_vp3dsp_idct10_put(dest, stride, block);
+}
 
-    if (s->frames[VP56_FRAME_CURRENT]->key_frame)
-        mb_type = VP56_MB_INTRA;
+static void vp56_idct_add(VP56Context *s, uint8_t * dest, ptrdiff_t stride, int16_t *block, int selector)
+{
+    if (selector > 10)
+        s->vp3dsp.idct_add(dest, stride, block);
+    else if (selector > 1)
+        ff_vp3dsp_idct10_add(dest, stride, block);
     else
-        mb_type = vp56_decode_mv(s, row, col);
-    ref_frame = ff_vp56_reference_frame[mb_type];
+        s->vp3dsp.idct_dc_add(dest, stride, block);
+}
 
-    s->parse_coeff(s);
+static av_always_inline void vp56_render_mb(VP56Context *s, int row, int col, int is_alpha, VP56mb mb_type)
+{
+    int b, ab, b_max, plane, off;
+    AVFrame *frame_current, *frame_ref;
+    VP56Frame ref_frame = ff_vp56_reference_frame[mb_type];
 
     vp56_add_predictors_dc(s, ref_frame);
 
@@ -410,8 +444,8 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
         case VP56_MB_INTRA:
             for (b=0; b<b_max; b++) {
                 plane = ff_vp56_b2p[b+ab];
-                s->vp3dsp.idct_put(frame_current->data[plane] + s->block_offset[b],
-                                s->stride[plane], s->block_coeff[b]);
+                vp56_idct_put(s, frame_current->data[plane] + s->block_offset[b],
+                                s->stride[plane], s->block_coeff[b], s->idct_selector[b]);
             }
             break;
 
@@ -423,8 +457,8 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
                 s->hdsp.put_pixels_tab[1][0](frame_current->data[plane] + off,
                                              frame_ref->data[plane] + off,
                                              s->stride[plane], 8);
-                s->vp3dsp.idct_add(frame_current->data[plane] + off,
-                                s->stride[plane], s->block_coeff[b]);
+                vp56_idct_add(s, frame_current->data[plane] + off,
+                              s->stride[plane], s->block_coeff[b], s->idct_selector[b]);
             }
             break;
 
@@ -441,8 +475,8 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
                 plane = ff_vp56_b2p[b+ab];
                 vp56_mc(s, b, plane, frame_ref->data[plane], s->stride[plane],
                         16*col+x_off, 16*row+y_off);
-                s->vp3dsp.idct_add(frame_current->data[plane] + s->block_offset[b],
-                                s->stride[plane], s->block_coeff[b]);
+                vp56_idct_add(s, frame_current->data[plane] + s->block_offset[b],
+                              s->stride[plane], s->block_coeff[b], s->idct_selector[b]);
             }
             break;
     }
@@ -453,9 +487,42 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
     }
 }
 
-static int vp56_size_changed(AVCodecContext *avctx)
+static int vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
 {
-    VP56Context *s = avctx->priv_data;
+    VP56mb mb_type;
+    int ret;
+
+    if (s->frames[VP56_FRAME_CURRENT]->key_frame)
+        mb_type = VP56_MB_INTRA;
+    else
+        mb_type = vp56_decode_mv(s, row, col);
+
+    ret = s->parse_coeff(s);
+    if (ret < 0)
+        return ret;
+
+    vp56_render_mb(s, row, col, is_alpha, mb_type);
+
+    return 0;
+}
+
+static int vp56_conceal_mb(VP56Context *s, int row, int col, int is_alpha)
+{
+    VP56mb mb_type;
+
+    if (s->frames[VP56_FRAME_CURRENT]->key_frame)
+        mb_type = VP56_MB_INTRA;
+    else
+        mb_type = vp56_conceal_mv(s, row, col);
+
+    vp56_render_mb(s, row, col, is_alpha, mb_type);
+
+    return 0;
+}
+
+static int vp56_size_changed(VP56Context *s)
+{
+    AVCodecContext *avctx = s->avctx;
     int stride = s->frames[VP56_FRAME_CURRENT]->linesize[0];
     int i;
 
@@ -464,6 +531,8 @@ static int vp56_size_changed(AVCodecContext *avctx)
     s->plane_height[0] = s->plane_height[3] = avctx->coded_height;
     s->plane_height[1] = s->plane_height[2] = avctx->coded_height/2;
 
+    s->have_undamaged_frame = 0;
+
     for (i=0; i<4; i++)
         s->stride[i] = s->flip * s->frames[VP56_FRAME_CURRENT]->linesize[i];
 
@@ -476,19 +545,26 @@ static int vp56_size_changed(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    s->above_blocks = av_realloc(s->above_blocks,
-                                 (4*s->mb_width+6) * sizeof(*s->above_blocks));
-    s->macroblocks = av_realloc(s->macroblocks,
-                                s->mb_width*s->mb_height*sizeof(*s->macroblocks));
+    av_reallocp_array(&s->above_blocks, 4*s->mb_width+6,
+                      sizeof(*s->above_blocks));
+    av_reallocp_array(&s->macroblocks, s->mb_width*s->mb_height,
+                      sizeof(*s->macroblocks));
     av_free(s->edge_emu_buffer_alloc);
     s->edge_emu_buffer_alloc = av_malloc(16*stride);
     s->edge_emu_buffer = s->edge_emu_buffer_alloc;
+    if (!s->above_blocks || !s->macroblocks || !s->edge_emu_buffer_alloc)
+        return AVERROR(ENOMEM);
     if (s->flip < 0)
         s->edge_emu_buffer += 15 * stride;
 
+    if (s->alpha_context)
+        return vp56_size_changed(s->alpha_context);
+
     return 0;
 }
 
+static int ff_vp56_decode_mbs(AVCodecContext *avctx, void *, int, int);
+
 int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                          AVPacket *avpkt)
 {
@@ -496,8 +572,9 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     VP56Context *s = avctx->priv_data;
     AVFrame *const p = s->frames[VP56_FRAME_CURRENT];
     int remaining_buf_size = avpkt->size;
-    int is_alpha, av_uninit(alpha_offset);
-    int res;
+    int av_uninit(alpha_offset);
+    int i, res;
+    int ret;
 
     if (s->has_alpha) {
         if (remaining_buf_size < 3)
@@ -508,163 +585,215 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
     }
 
-    for (is_alpha=0; is_alpha < 1+s->has_alpha; is_alpha++) {
-        int mb_row, mb_col, mb_row_flip, mb_offset = 0;
-        int block, y, uv;
-        ptrdiff_t stride_y, stride_uv;
-        int golden_frame = 0;
-
-        s->modelp = &s->models[is_alpha];
+    res = s->parse_header(s, buf, remaining_buf_size);
+    if (res < 0)
+        return res;
 
-        res = s->parse_header(s, buf, remaining_buf_size, &golden_frame);
-        if (res < 0) {
-            int i;
-            for (i = 0; i < 4; i++)
-                av_frame_unref(s->frames[i]);
-            return res;
+    if (res == VP56_SIZE_CHANGE) {
+        for (i = 0; i < 4; i++) {
+            av_frame_unref(s->frames[i]);
+            if (s->alpha_context)
+                av_frame_unref(s->alpha_context->frames[i]);
         }
+    }
 
-        if (res == VP56_SIZE_CHANGE) {
-            int i;
-            for (i = 0; i < 4; i++)
-                av_frame_unref(s->frames[i]);
-            if (is_alpha) {
+    ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF);
+    if (ret < 0) {
+        if (res == VP56_SIZE_CHANGE)
+            ff_set_dimensions(avctx, 0, 0);
+        return ret;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
+        av_frame_unref(s->alpha_context->frames[VP56_FRAME_CURRENT]);
+        if ((ret = av_frame_ref(s->alpha_context->frames[VP56_FRAME_CURRENT], p)) < 0) {
+            av_frame_unref(p);
+            if (res == VP56_SIZE_CHANGE)
                 ff_set_dimensions(avctx, 0, 0);
-                return AVERROR_INVALIDDATA;
-            }
+            return ret;
         }
+    }
 
-        if (!is_alpha) {
-            int ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF);
-            if (ret < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                return ret;
-            }
-
-            if (res == VP56_SIZE_CHANGE)
-                if (vp56_size_changed(avctx)) {
-                    av_frame_unref(p);
-                    return AVERROR_INVALIDDATA;
-                }
+    if (res == VP56_SIZE_CHANGE) {
+        if (vp56_size_changed(s)) {
+            av_frame_unref(p);
+            return AVERROR_INVALIDDATA;
         }
+    }
 
-        if (p->key_frame) {
-            p->pict_type = AV_PICTURE_TYPE_I;
-            s->default_models_init(s);
-            for (block=0; block<s->mb_height*s->mb_width; block++)
-                s->macroblocks[block].type = VP56_MB_INTRA;
-        } else {
-            p->pict_type = AV_PICTURE_TYPE_P;
-            vp56_parse_mb_type_models(s);
-            s->parse_vector_models(s);
-            s->mb_type = VP56_MB_INTER_NOVEC_PF;
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
+        int bak_w = avctx->width;
+        int bak_h = avctx->height;
+        int bak_cw = avctx->coded_width;
+        int bak_ch = avctx->coded_height;
+        buf += alpha_offset;
+        remaining_buf_size -= alpha_offset;
+
+        res = s->alpha_context->parse_header(s->alpha_context, buf, remaining_buf_size);
+        if (res != 0) {
+            if(res==VP56_SIZE_CHANGE) {
+                av_log(avctx, AV_LOG_ERROR, "Alpha reconfiguration\n");
+                avctx->width  = bak_w;
+                avctx->height = bak_h;
+                avctx->coded_width  = bak_cw;
+                avctx->coded_height = bak_ch;
+            }
+            av_frame_unref(p);
+            return AVERROR_INVALIDDATA;
         }
+    }
 
-        if (s->parse_coeff_models(s))
-            goto next;
+    s->discard_frame = 0;
+    avctx->execute2(avctx, ff_vp56_decode_mbs, 0, 0, (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) + 1);
 
-        memset(s->prev_dc, 0, sizeof(s->prev_dc));
-        s->prev_dc[1][VP56_FRAME_CURRENT] = 128;
-        s->prev_dc[2][VP56_FRAME_CURRENT] = 128;
+    if (s->discard_frame)
+        return AVERROR_INVALIDDATA;
 
-        for (block=0; block < 4*s->mb_width+6; block++) {
-            s->above_blocks[block].ref_frame = VP56_FRAME_NONE;
-            s->above_blocks[block].dc_coeff = 0;
-            s->above_blocks[block].not_null_dc = 0;
-        }
-        s->above_blocks[2*s->mb_width + 2].ref_frame = VP56_FRAME_CURRENT;
-        s->above_blocks[3*s->mb_width + 4].ref_frame = VP56_FRAME_CURRENT;
+    if ((res = av_frame_ref(data, p)) < 0)
+        return res;
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static int ff_vp56_decode_mbs(AVCodecContext *avctx, void *data,
+                              int jobnr, int threadnr)
+{
+    VP56Context *s0 = avctx->priv_data;
+    int is_alpha = (jobnr == 1);
+    VP56Context *s = is_alpha ? s0->alpha_context : s0;
+    AVFrame *const p = s->frames[VP56_FRAME_CURRENT];
+    int mb_row, mb_col, mb_row_flip, mb_offset = 0;
+    int block, y, uv;
+    ptrdiff_t stride_y, stride_uv;
+    int res;
+    int damaged = 0;
+
+    if (p->key_frame) {
+        p->pict_type = AV_PICTURE_TYPE_I;
+        s->default_models_init(s);
+        for (block=0; block<s->mb_height*s->mb_width; block++)
+            s->macroblocks[block].type = VP56_MB_INTRA;
+    } else {
+        p->pict_type = AV_PICTURE_TYPE_P;
+        vp56_parse_mb_type_models(s);
+        s->parse_vector_models(s);
+        s->mb_type = VP56_MB_INTER_NOVEC_PF;
+    }
 
-        stride_y  = p->linesize[0];
-        stride_uv = p->linesize[1];
+    if (s->parse_coeff_models(s))
+        goto next;
 
+    memset(s->prev_dc, 0, sizeof(s->prev_dc));
+    s->prev_dc[1][VP56_FRAME_CURRENT] = 128;
+    s->prev_dc[2][VP56_FRAME_CURRENT] = 128;
+
+    for (block=0; block < 4*s->mb_width+6; block++) {
+        s->above_blocks[block].ref_frame = VP56_FRAME_NONE;
+        s->above_blocks[block].dc_coeff = 0;
+        s->above_blocks[block].not_null_dc = 0;
+    }
+    s->above_blocks[2*s->mb_width + 2].ref_frame = VP56_FRAME_CURRENT;
+    s->above_blocks[3*s->mb_width + 4].ref_frame = VP56_FRAME_CURRENT;
+
+    stride_y  = p->linesize[0];
+    stride_uv = p->linesize[1];
+
+    if (s->flip < 0)
+        mb_offset = 7;
+
+    /* main macroblocks loop */
+    for (mb_row=0; mb_row<s->mb_height; mb_row++) {
         if (s->flip < 0)
-            mb_offset = 7;
-
-        /* main macroblocks loop */
-        for (mb_row=0; mb_row<s->mb_height; mb_row++) {
-            if (s->flip < 0)
-                mb_row_flip = s->mb_height - mb_row - 1;
-            else
-                mb_row_flip = mb_row;
-
-            for (block=0; block<4; block++) {
-                s->left_block[block].ref_frame = VP56_FRAME_NONE;
-                s->left_block[block].dc_coeff = 0;
-                s->left_block[block].not_null_dc = 0;
-            }
-            memset(s->coeff_ctx, 0, sizeof(s->coeff_ctx));
-            memset(s->coeff_ctx_last, 24, sizeof(s->coeff_ctx_last));
-
-            s->above_block_idx[0] = 1;
-            s->above_block_idx[1] = 2;
-            s->above_block_idx[2] = 1;
-            s->above_block_idx[3] = 2;
-            s->above_block_idx[4] = 2*s->mb_width + 2 + 1;
-            s->above_block_idx[5] = 3*s->mb_width + 4 + 1;
-
-            s->block_offset[s->frbi] = (mb_row_flip*16 + mb_offset) * stride_y;
-            s->block_offset[s->srbi] = s->block_offset[s->frbi] + 8*stride_y;
-            s->block_offset[1] = s->block_offset[0] + 8;
-            s->block_offset[3] = s->block_offset[2] + 8;
-            s->block_offset[4] = (mb_row_flip*8 + mb_offset) * stride_uv;
-            s->block_offset[5] = s->block_offset[4];
-
-            for (mb_col=0; mb_col<s->mb_width; mb_col++) {
-                vp56_decode_mb(s, mb_row, mb_col, is_alpha);
-
-                for (y=0; y<4; y++) {
-                    s->above_block_idx[y] += 2;
-                    s->block_offset[y] += 16;
-                }
+            mb_row_flip = s->mb_height - mb_row - 1;
+        else
+            mb_row_flip = mb_row;
 
-                for (uv=4; uv<6; uv++) {
-                    s->above_block_idx[uv] += 1;
-                    s->block_offset[uv] += 8;
+        for (block=0; block<4; block++) {
+            s->left_block[block].ref_frame = VP56_FRAME_NONE;
+            s->left_block[block].dc_coeff = 0;
+            s->left_block[block].not_null_dc = 0;
+        }
+        memset(s->coeff_ctx, 0, sizeof(s->coeff_ctx));
+        memset(s->coeff_ctx_last, 24, sizeof(s->coeff_ctx_last));
+
+        s->above_block_idx[0] = 1;
+        s->above_block_idx[1] = 2;
+        s->above_block_idx[2] = 1;
+        s->above_block_idx[3] = 2;
+        s->above_block_idx[4] = 2*s->mb_width + 2 + 1;
+        s->above_block_idx[5] = 3*s->mb_width + 4 + 1;
+
+        s->block_offset[s->frbi] = (mb_row_flip*16 + mb_offset) * stride_y;
+        s->block_offset[s->srbi] = s->block_offset[s->frbi] + 8*stride_y;
+        s->block_offset[1] = s->block_offset[0] + 8;
+        s->block_offset[3] = s->block_offset[2] + 8;
+        s->block_offset[4] = (mb_row_flip*8 + mb_offset) * stride_uv;
+        s->block_offset[5] = s->block_offset[4];
+
+        for (mb_col=0; mb_col<s->mb_width; mb_col++) {
+            if (!damaged) {
+                int ret = vp56_decode_mb(s, mb_row, mb_col, is_alpha);
+                if (ret < 0) {
+                    damaged = 1;
+                    if (!s->have_undamaged_frame || !avctx->error_concealment) {
+                        s->discard_frame = 1;
+                        return AVERROR_INVALIDDATA;
+                    }
                 }
             }
-        }
+            if (damaged)
+                vp56_conceal_mb(s, mb_row, mb_col, is_alpha);
 
-    next:
-        if (p->key_frame || golden_frame) {
-            av_frame_unref(s->frames[VP56_FRAME_GOLDEN]);
-            if ((res = av_frame_ref(s->frames[VP56_FRAME_GOLDEN], p)) < 0)
-                return res;
-        }
+            for (y=0; y<4; y++) {
+                s->above_block_idx[y] += 2;
+                s->block_offset[y] += 16;
+            }
 
-        if (s->has_alpha) {
-            FFSWAP(AVFrame *, s->frames[VP56_FRAME_GOLDEN],
-                              s->frames[VP56_FRAME_GOLDEN2]);
-            buf += alpha_offset;
-            remaining_buf_size -= alpha_offset;
+            for (uv=4; uv<6; uv++) {
+                s->above_block_idx[uv] += 1;
+                s->block_offset[uv] += 8;
+            }
         }
     }
 
+    if (!damaged)
+        s->have_undamaged_frame = 1;
+
+next:
+    if (p->key_frame || s->golden_frame) {
+        av_frame_unref(s->frames[VP56_FRAME_GOLDEN]);
+        if ((res = av_frame_ref(s->frames[VP56_FRAME_GOLDEN], p)) < 0)
+            return res;
+    }
+
     av_frame_unref(s->frames[VP56_FRAME_PREVIOUS]);
     FFSWAP(AVFrame *, s->frames[VP56_FRAME_CURRENT],
                       s->frames[VP56_FRAME_PREVIOUS]);
-
-    if ((res = av_frame_ref(data, p)) < 0)
-        return res;
-    *got_frame = 1;
-
-    return avpkt->size;
+    return 0;
 }
 
 av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
 {
     VP56Context *s = avctx->priv_data;
+    return ff_vp56_init_context(avctx, s, flip, has_alpha);
+}
+
+av_cold int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
+                                  int flip, int has_alpha)
+{
     int i;
 
     s->avctx = avctx;
     avctx->pix_fmt = has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
+    if (avctx->skip_alpha) avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     ff_h264chroma_init(&s->h264chroma, 8);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_videodsp_init(&s->vdsp, 8);
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_scantable[i] = TRANSPOSE(ff_zigzag_direct[i]);
 #undef TRANSPOSE
     }
@@ -682,10 +811,14 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
     s->macroblocks = NULL;
     s->quantizer = -1;
     s->deblock_filtering = 1;
+    s->golden_frame = 0;
 
     s->filter = NULL;
 
     s->has_alpha = has_alpha;
+
+    s->modelp = &s->model;
+
     if (flip) {
         s->flip = -1;
         s->frbi = 2;
@@ -702,6 +835,11 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
 av_cold int ff_vp56_free(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
+    return ff_vp56_free_context(s);
+}
+
+av_cold int ff_vp56_free_context(VP56Context *s)
+{
     int i;
 
     av_freep(&s->above_blocks);
diff --git a/libavcodec/vp56.h b/libavcodec/vp56.h
index 8b7806e..84b2f6c 100644
--- a/libavcodec/vp56.h
+++ b/libavcodec/vp56.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,12 +75,12 @@ typedef void (*VP56ParseVectorAdjustment)(VP56Context *s,
 typedef void (*VP56Filter)(VP56Context *s, uint8_t *dst, uint8_t *src,
                            int offset1, int offset2, ptrdiff_t stride,
                            VP56mv mv, int mask, int select, int luma);
-typedef void (*VP56ParseCoeff)(VP56Context *s);
+typedef int  (*VP56ParseCoeff)(VP56Context *s);
 typedef void (*VP56DefaultModelsInit)(VP56Context *s);
 typedef void (*VP56ParseVectorModels)(VP56Context *s);
 typedef int  (*VP56ParseCoeffModels)(VP56Context *s);
 typedef int  (*VP56ParseHeader)(VP56Context *s, const uint8_t *buf,
-                                int buf_size, int *golden_frame);
+                                int buf_size);
 
 typedef struct VP56RangeCoder {
     int high;
@@ -105,6 +105,7 @@ typedef struct VP56Macroblock {
 typedef struct VP56Model {
     uint8_t coeff_reorder[64];       /* used in vp6 only */
     uint8_t coeff_index_to_pos[64];  /* used in vp6 only */
+    uint8_t coeff_index_to_idct_selector[64]; /* used in vp6 only */
     uint8_t vector_sig[2];           /* delta sign */
     uint8_t vector_dct[2];           /* delta coding types */
     uint8_t vector_pdi[2][2];        /* predefined delta init */
@@ -136,6 +137,7 @@ struct vp56_context {
     int sub_version;
 
     /* frame info */
+    int golden_frame;
     int plane_width[4];
     int plane_height[4];
     int mb_width;   /* number of horizontal MB */
@@ -156,6 +158,7 @@ struct vp56_context {
     VP56mb mb_type;
     VP56Macroblock *macroblocks;
     DECLARE_ALIGNED(16, int16_t, block_coeff)[6][64];
+    int idct_selector[6];
 
     /* motion vectors */
     VP56mv mv[6];  /* vectors for each block in MB */
@@ -169,6 +172,7 @@ struct vp56_context {
     int filter_mode;
     int max_vector_length;
     int sample_variance_threshold;
+    DECLARE_ALIGNED(8, int, bounding_values_array)[256];
 
     uint8_t coeff_ctx[4][64];              /* used in vp5 only */
     uint8_t coeff_ctx_last[4];             /* used in vp5 only */
@@ -190,8 +194,11 @@ struct vp56_context {
     VP56ParseCoeffModels parse_coeff_models;
     VP56ParseHeader parse_header;
 
+    /* for "slice" parallelism between YUV and A */
+    VP56Context *alpha_context;
+
     VP56Model *modelp;
-    VP56Model models[2];
+    VP56Model model;
 
     /* huffman decoding */
     int use_huffman;
@@ -200,11 +207,17 @@ struct vp56_context {
     VLC runv_vlc[2];
     VLC ract_vlc[2][3][6];
     unsigned int nb_null[2][2];       /* number of consecutive NULL DC/AC */
+
+    int have_undamaged_frame;
+    int discard_frame;
 };
 
 
 int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha);
+int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
+                          int flip, int has_alpha);
 int ff_vp56_free(AVCodecContext *avctx);
+int ff_vp56_free_context(VP56Context *s);
 void ff_vp56_init_dequant(VP56Context *s, int quantizer);
 int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                          AVPacket *avpkt);
@@ -215,7 +228,15 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
  */
 
 extern const uint8_t ff_vp56_norm_shift[256];
-void ff_vp56_init_range_decoder(VP56RangeCoder *c, const uint8_t *buf, int buf_size);
+int ff_vp56_init_range_decoder(VP56RangeCoder *c, const uint8_t *buf, int buf_size);
+
+/**
+ * vp5689 returns 1 if the end of the stream has been reached, 0 otherwise.
+ */
+static av_always_inline int vpX_rac_is_end(VP56RangeCoder *c)
+{
+    return c->end <= c->buffer && c->bits >= 0;
+}
 
 static av_always_inline unsigned int vp56_rac_renorm(VP56RangeCoder *c)
 {
@@ -357,7 +378,7 @@ int vp56_rac_get_tree(VP56RangeCoder *c,
                       const uint8_t *probs)
 {
     while (tree->val > 0) {
-        if (vp56_rac_get_prob(c, probs[tree->prob_idx]))
+        if (vp56_rac_get_prob_branchy(c, probs[tree->prob_idx]))
             tree += tree->val;
         else
             tree++;
@@ -365,15 +386,13 @@ int vp56_rac_get_tree(VP56RangeCoder *c,
     return -tree->val;
 }
 
-/**
- * This is identical to vp8_rac_get_tree except for the possibility of starting
- * on a node other than the root node, needed for coeff decode where this is
- * used to save a bit after a 0 token (by disallowing EOB to immediately follow.)
- */
-static av_always_inline
-int vp8_rac_get_tree_with_offset(VP56RangeCoder *c, const int8_t (*tree)[2],
-                                 const uint8_t *probs, int i)
+// how probabilities are associated with decisions is different I think
+// well, the new scheme fits in the old but this way has one fewer branches per decision
+static av_always_inline int vp8_rac_get_tree(VP56RangeCoder *c, const int8_t (*tree)[2],
+                                   const uint8_t *probs)
 {
+    int i = 0;
+
     do {
         i = tree[i][vp56_rac_get_prob(c, probs[i])];
     } while (i > 0);
@@ -381,15 +400,6 @@ int vp8_rac_get_tree_with_offset(VP56RangeCoder *c, const int8_t (*tree)[2],
     return -i;
 }
 
-// how probabilities are associated with decisions is different I think
-// well, the new scheme fits in the old but this way has one fewer branches per decision
-static av_always_inline
-int vp8_rac_get_tree(VP56RangeCoder *c, const int8_t (*tree)[2],
-                     const uint8_t *probs)
-{
-    return vp8_rac_get_tree_with_offset(c, tree, probs, 0);
-}
-
 // DCTextra
 static av_always_inline int vp8_rac_get_coeff(VP56RangeCoder *c, const uint8_t *prob)
 {
diff --git a/libavcodec/vp56data.c b/libavcodec/vp56data.c
index 989c76a..0080370 100644
--- a/libavcodec/vp56data.c
+++ b/libavcodec/vp56data.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56data.h b/libavcodec/vp56data.h
index 21907bd..3be268c 100644
--- a/libavcodec/vp56data.h
+++ b/libavcodec/vp56data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56dsp.c b/libavcodec/vp56dsp.c
index fb3b37c..e8d93d6 100644
--- a/libavcodec/vp56dsp.c
+++ b/libavcodec/vp56dsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2006 Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,27 +72,8 @@ av_cold void ff_vp5dsp_init(VP56DSPContext *s)
 #endif /* CONFIG_VP5_DECODER */
 
 #if CONFIG_VP6_DECODER
-static int vp6_adjust(int v, int t)
-{
-    int V = v, s = v >> 31;
-    V ^= s;
-    V -= s;
-    if (V-t-1 >= (unsigned)(t-1))
-        return v;
-    V = 2*t - V;
-    V += s;
-    V ^= s;
-    return V;
-}
-
-VP56_EDGE_FILTER(vp6, hor, 1, stride)
-VP56_EDGE_FILTER(vp6, ver, stride, 1)
-
 av_cold void ff_vp6dsp_init(VP56DSPContext *s)
 {
-    s->edge_filter_hor = vp6_edge_filter_hor;
-    s->edge_filter_ver = vp6_edge_filter_ver;
-
     s->vp6_filter_diag4 = ff_vp6_filter_diag4_c;
 
     if (ARCH_ARM)
diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h
index 8bf7a46..e35e232 100644
--- a/libavcodec/vp56dsp.h
+++ b/libavcodec/vp56dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56rac.c b/libavcodec/vp56rac.c
index 270a3ca..e70302b 100644
--- a/libavcodec/vp56rac.c
+++ b/libavcodec/vp56rac.c
@@ -2,20 +2,20 @@
  * VP5/6/8 decoder
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,11 +37,14 @@ const uint8_t ff_vp56_norm_shift[256]= {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 
-void ff_vp56_init_range_decoder(VP56RangeCoder *c, const uint8_t *buf, int buf_size)
+int ff_vp56_init_range_decoder(VP56RangeCoder *c, const uint8_t *buf, int buf_size)
 {
     c->high = 255;
     c->bits = -16;
     c->buffer = buf;
     c->end = buf + buf_size;
+    if (buf_size < 1)
+        return AVERROR_INVALIDDATA;
     c->code_word = bytestream_get_be24(&c->buffer);
+    return 0;
 }
diff --git a/libavcodec/vp5data.h b/libavcodec/vp5data.h
index b11b99d..e16ff2d 100644
--- a/libavcodec/vp5data.h
+++ b/libavcodec/vp5data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp6.c b/libavcodec/vp6.c
index 40948ad..977fcb7 100644
--- a/libavcodec/vp6.c
+++ b/libavcodec/vp6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,11 +40,10 @@
 
 #define VP6_MAX_HUFF_SIZE 12
 
-static void vp6_parse_coeff(VP56Context *s);
-static void vp6_parse_coeff_huffman(VP56Context *s);
+static int vp6_parse_coeff(VP56Context *s);
+static int vp6_parse_coeff_huffman(VP56Context *s);
 
-static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
-                            int *golden_frame)
+static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size)
 {
     VP56RangeCoder *c = &s->c;
     int parse_filter_info = 0;
@@ -53,6 +52,7 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
     int sub_version;
     int rows, cols;
     int res = 0;
+    int ret;
     int separated_coeff = buf[0] & 1;
 
     s->frames[VP56_FRAME_CURRENT]->key_frame = !(buf[0] & 0x80);
@@ -94,7 +94,7 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
                 s->avctx->coded_width  = 16 * cols;
                 s->avctx->coded_height = 16 * rows;
             } else {
-                int ret = ff_set_dimensions(s->avctx, 16 * cols, 16 * rows);
+                ret = ff_set_dimensions(s->avctx, 16 * cols, 16 * rows);
                 if (ret < 0)
                     return ret;
 
@@ -106,13 +106,16 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
             res = VP56_SIZE_CHANGE;
         }
 
-        ff_vp56_init_range_decoder(c, buf+6, buf_size-6);
+        ret = ff_vp56_init_range_decoder(c, buf+6, buf_size-6);
+        if (ret < 0)
+            goto fail;
         vp56_rac_gets(c, 2);
 
         parse_filter_info = s->filter_header;
         if (sub_version < 8)
             vrt_shift = 5;
         s->sub_version = sub_version;
+        s->golden_frame = 0;
     } else {
         if (!s->sub_version || !s->avctx->coded_width || !s->avctx->coded_height)
             return AVERROR_INVALIDDATA;
@@ -122,9 +125,11 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
             buf += 2;
             buf_size -= 2;
         }
-        ff_vp56_init_range_decoder(c, buf+1, buf_size-1);
+        ret = ff_vp56_init_range_decoder(c, buf+1, buf_size-1);
+        if (ret < 0)
+            return ret;
 
-        *golden_frame = vp56_rac_get(c);
+        s->golden_frame = vp56_rac_get(c);
         if (s->filter_header) {
             s->deblock_filtering = vp56_rac_get(c);
             if (s->deblock_filtering)
@@ -157,15 +162,16 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
         buf      += coeff_offset;
         buf_size -= coeff_offset;
         if (buf_size < 0) {
-            if (s->frames[VP56_FRAME_CURRENT]->key_frame)
-                ff_set_dimensions(s->avctx, 0, 0);
-            return AVERROR_INVALIDDATA;
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
         }
         if (s->use_huffman) {
             s->parse_coeff = vp6_parse_coeff_huffman;
             init_get_bits(&s->gb, buf, buf_size<<3);
         } else {
-            ff_vp56_init_range_decoder(&s->cc, buf, buf_size);
+            ret = ff_vp56_init_range_decoder(&s->cc, buf, buf_size);
+            if (ret < 0)
+                goto fail;
             s->ccp = &s->cc;
         }
     } else {
@@ -173,6 +179,10 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
     }
 
     return res;
+fail:
+    if (res == VP56_SIZE_CHANGE)
+        ff_set_dimensions(s->avctx, 0, 0);
+    return ret;
 }
 
 static void vp6_coeff_order_table_init(VP56Context *s)
@@ -184,6 +194,18 @@ static void vp6_coeff_order_table_init(VP56Context *s)
         for (pos=1; pos<64; pos++)
             if (s->modelp->coeff_reorder[pos] == i)
                 s->modelp->coeff_index_to_pos[idx++] = pos;
+
+    for (idx = 0; idx < 64; idx++) {
+        int max = 0;
+        for (i = 0; i <= idx; i++) {
+            int v = s->modelp->coeff_index_to_pos[i];
+            if (v > max)
+                max = v;
+        }
+        if (s->sub_version > 6)
+            max++;
+        s->modelp->coeff_index_to_idct_selector[idx] = max;
+    }
 }
 
 static void vp6_default_models_init(VP56Context *s)
@@ -211,20 +233,20 @@ static void vp6_parse_vector_models(VP56Context *s)
     int comp, node;
 
     for (comp=0; comp<2; comp++) {
-        if (vp56_rac_get_prob(c, vp6_sig_dct_pct[comp][0]))
+        if (vp56_rac_get_prob_branchy(c, vp6_sig_dct_pct[comp][0]))
             model->vector_dct[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp6_sig_dct_pct[comp][1]))
+        if (vp56_rac_get_prob_branchy(c, vp6_sig_dct_pct[comp][1]))
             model->vector_sig[comp] = vp56_rac_gets_nn(c, 7);
     }
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<7; node++)
-            if (vp56_rac_get_prob(c, vp6_pdv_pct[comp][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_pdv_pct[comp][node]))
                 model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<8; node++)
-            if (vp56_rac_get_prob(c, vp6_fdv_pct[comp][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_fdv_pct[comp][node]))
                 model->vector_fdv[comp][node] = vp56_rac_gets_nn(c, 7);
 }
 
@@ -270,7 +292,7 @@ static int vp6_parse_coeff_models(VP56Context *s)
 
     for (pt=0; pt<2; pt++)
         for (node=0; node<11; node++)
-            if (vp56_rac_get_prob(c, vp6_dccv_pct[pt][node])) {
+            if (vp56_rac_get_prob_branchy(c, vp6_dccv_pct[pt][node])) {
                 def_prob[node] = vp56_rac_gets_nn(c, 7);
                 model->coeff_dccv[pt][node] = def_prob[node];
             } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -279,21 +301,21 @@ static int vp6_parse_coeff_models(VP56Context *s)
 
     if (vp56_rac_get(c)) {
         for (pos=1; pos<64; pos++)
-            if (vp56_rac_get_prob(c, vp6_coeff_reorder_pct[pos]))
+            if (vp56_rac_get_prob_branchy(c, vp6_coeff_reorder_pct[pos]))
                 model->coeff_reorder[pos] = vp56_rac_gets(c, 4);
         vp6_coeff_order_table_init(s);
     }
 
     for (cg=0; cg<2; cg++)
         for (node=0; node<14; node++)
-            if (vp56_rac_get_prob(c, vp6_runv_pct[cg][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_runv_pct[cg][node]))
                 model->coeff_runv[cg][node] = vp56_rac_gets_nn(c, 7);
 
     for (ct=0; ct<3; ct++)
         for (pt=0; pt<2; pt++)
             for (cg=0; cg<6; cg++)
                 for (node=0; node<11; node++)
-                    if (vp56_rac_get_prob(c, vp6_ract_pct[ct][pt][cg][node])) {
+                    if (vp56_rac_get_prob_branchy(c, vp6_ract_pct[ct][pt][cg][node])) {
                         def_prob[node] = vp56_rac_gets_nn(c, 7);
                         model->coeff_ract[pt][ct][cg][node] = def_prob[node];
                     } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -339,7 +361,7 @@ static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
     for (comp=0; comp<2; comp++) {
         int i, delta = 0;
 
-        if (vp56_rac_get_prob(c, model->vector_dct[comp])) {
+        if (vp56_rac_get_prob_branchy(c, model->vector_dct[comp])) {
             static const uint8_t prob_order[] = {0, 1, 2, 7, 6, 5, 4};
             for (i=0; i<sizeof(prob_order); i++) {
                 int j = prob_order[i];
@@ -354,7 +376,7 @@ static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
                                       model->vector_pdv[comp]);
         }
 
-        if (delta && vp56_rac_get_prob(c, model->vector_sig[comp]))
+        if (delta && vp56_rac_get_prob_branchy(c, model->vector_sig[comp]))
             delta = -delta;
 
         if (!comp)
@@ -380,7 +402,7 @@ static unsigned vp6_get_nb_null(VP56Context *s)
     return val;
 }
 
-static void vp6_parse_coeff_huffman(VP56Context *s)
+static int vp6_parse_coeff_huffman(VP56Context *s)
 {
     VP56Model *model = s->modelp;
     uint8_t *permute = s->idct_scantable;
@@ -402,7 +424,7 @@ static void vp6_parse_coeff_huffman(VP56Context *s)
                     break;
             } else {
                 if (get_bits_left(&s->gb) <= 0)
-                    return;
+                    return AVERROR_INVALIDDATA;
                 coeff = get_vlc2(&s->gb, vlc_coeff->table, FF_HUFFMAN_BITS, 3);
                 if (coeff == 0) {
                     if (coeff_idx) {
@@ -436,10 +458,12 @@ static void vp6_parse_coeff_huffman(VP56Context *s)
             cg = FFMIN(vp6_coeff_groups[coeff_idx], 3);
             vlc_coeff = &s->ract_vlc[pt][ct][cg];
         }
+        s->idct_selector[b] = model->coeff_index_to_idct_selector[FFMIN(coeff_idx, 63)];
     }
+    return 0;
 }
 
-static void vp6_parse_coeff(VP56Context *s)
+static int vp6_parse_coeff(VP56Context *s)
 {
     VP56RangeCoder *c = s->ccp;
     VP56Model *model = s->modelp;
@@ -449,6 +473,11 @@ static void vp6_parse_coeff(VP56Context *s)
     int b, i, cg, idx, ctx;
     int pt = 0;    /* plane type (0 for Y, 1 for U or V) */
 
+    if (c->end <= c->buffer && c->bits >= 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "End of AC stream reached in vp6_parse_coeff\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     for (b=0; b<6; b++) {
         int ct = 1;    /* code type */
         int run = 1;
@@ -462,16 +491,16 @@ static void vp6_parse_coeff(VP56Context *s)
 
         coeff_idx = 0;
         for (;;) {
-            if ((coeff_idx>1 && ct==0) || vp56_rac_get_prob(c, model2[0])) {
+            if ((coeff_idx>1 && ct==0) || vp56_rac_get_prob_branchy(c, model2[0])) {
                 /* parse a coeff */
-                if (vp56_rac_get_prob(c, model2[2])) {
-                    if (vp56_rac_get_prob(c, model2[3])) {
+                if (vp56_rac_get_prob_branchy(c, model2[2])) {
+                    if (vp56_rac_get_prob_branchy(c, model2[3])) {
                         idx = vp56_rac_get_tree(c, ff_vp56_pc_tree, model1);
                         coeff = ff_vp56_coeff_bias[idx+5];
                         for (i=ff_vp56_coeff_bit_length[idx]; i>=0; i--)
                             coeff += vp56_rac_get_prob(c, ff_vp56_coeff_parse_table[idx][i]) << i;
                     } else {
-                        if (vp56_rac_get_prob(c, model2[4]))
+                        if (vp56_rac_get_prob_branchy(c, model2[4]))
                             coeff = 3 + vp56_rac_get_prob(c, model1[5]);
                         else
                             coeff = 2;
@@ -492,7 +521,7 @@ static void vp6_parse_coeff(VP56Context *s)
                 /* parse a run */
                 ct = 0;
                 if (coeff_idx > 0) {
-                    if (!vp56_rac_get_prob(c, model2[1]))
+                    if (!vp56_rac_get_prob_branchy(c, model2[1]))
                         break;
 
                     model3 = model->coeff_runv[coeff_idx >= 6];
@@ -511,7 +540,9 @@ static void vp6_parse_coeff(VP56Context *s)
 
         s->left_block[ff_vp56_b6to4[b]].not_null_dc =
         s->above_blocks[s->above_block_idx[b]].not_null_dc = !!s->block_coeff[b][0];
+        s->idct_selector[b] = model->coeff_index_to_idct_selector[FFMIN(coeff_idx, 63)];
     }
+    return 0;
 }
 
 static int vp6_block_variance(uint8_t *src, ptrdiff_t stride)
@@ -604,6 +635,8 @@ static void vp6_filter(VP56Context *s, uint8_t *dst, uint8_t *src,
     }
 }
 
+static av_cold void vp6_decode_init_context(VP56Context *s);
+
 static av_cold int vp6_decode_init(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
@@ -614,6 +647,22 @@ static av_cold int vp6_decode_init(AVCodecContext *avctx)
         return ret;
     ff_vp6dsp_init(&s->vp56dsp);
 
+    vp6_decode_init_context(s);
+
+    if (s->has_alpha) {
+        s->alpha_context = av_mallocz(sizeof(VP56Context));
+        ff_vp56_init_context(avctx, s->alpha_context,
+                             s->flip == -1, s->has_alpha);
+        ff_vp6dsp_init(&s->alpha_context->vp56dsp);
+        vp6_decode_init_context(s->alpha_context);
+    }
+
+    return 0;
+}
+
+static av_cold void vp6_decode_init_context(VP56Context *s)
+{
+    s->deblock_filtering = 0;
     s->vp56_coord_div = vp6_coord_div;
     s->parse_vector_adjustment = vp6_parse_vector_adjustment;
     s->filter = vp6_filter;
@@ -621,16 +670,29 @@ static av_cold int vp6_decode_init(AVCodecContext *avctx)
     s->parse_vector_models = vp6_parse_vector_models;
     s->parse_coeff_models = vp6_parse_coeff_models;
     s->parse_header = vp6_parse_header;
-
-    return 0;
 }
 
+static av_cold void vp6_decode_free_context(VP56Context *s);
+
 static av_cold int vp6_decode_free(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
-    int pt, ct, cg;
 
     ff_vp56_free(avctx);
+    vp6_decode_free_context(s);
+
+    if (s->alpha_context) {
+        ff_vp56_free_context(s->alpha_context);
+        vp6_decode_free_context(s->alpha_context);
+        av_freep(&s->alpha_context);
+    }
+
+    return 0;
+}
+
+static av_cold void vp6_decode_free_context(VP56Context *s)
+{
+    int pt, ct, cg;
 
     for (pt=0; pt<2; pt++) {
         ff_free_vlc(&s->dccv_vlc[pt]);
@@ -639,7 +701,6 @@ static av_cold int vp6_decode_free(AVCodecContext *avctx)
             for (cg=0; cg<6; cg++)
                 ff_free_vlc(&s->ract_vlc[pt][ct][cg]);
     }
-    return 0;
 }
 
 AVCodec ff_vp6_decoder = {
@@ -677,5 +738,5 @@ AVCodec ff_vp6a_decoder = {
     .init           = vp6_decode_init,
     .close          = vp6_decode_free,
     .decode         = ff_vp56_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
 };
diff --git a/libavcodec/vp6data.h b/libavcodec/vp6data.h
index 2de90e7..539e19a 100644
--- a/libavcodec/vp6data.h
+++ b/libavcodec/vp6data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp6dsp.c b/libavcodec/vp6dsp.c
index c9968f2..f7f6856 100644
--- a/libavcodec/vp6dsp.c
+++ b/libavcodec/vp6dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 5c0b474..ba79e5f 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -7,20 +7,20 @@
  * Copyright (C) 2012 Daniel Kang
  * Copyright (C) 2014 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,14 @@
 #   include "arm/vp8.h"
 #endif
 
+#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
+#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
+#elif CONFIG_VP7_DECODER
+#define VPX(vp7, f) vp7_ ## f
+#else // CONFIG_VP8_DECODER
+#define VPX(vp7, f) vp8_ ## f
+#endif
+
 static void free_buffers(VP8Context *s)
 {
     int i;
@@ -159,13 +167,29 @@ static VP8Frame *vp8_find_free_buffer(VP8Context *s)
     return frame;
 }
 
+static enum AVPixelFormat get_pixel_format(VP8Context *s)
+{
+    enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_VP8_VAAPI_HWACCEL
+        AV_PIX_FMT_VAAPI,
+#endif
+#if CONFIG_VP8_NVDEC_HWACCEL
+        AV_PIX_FMT_CUDA,
+#endif
+        AV_PIX_FMT_YUV420P,
+        AV_PIX_FMT_NONE,
+    };
+
+    return ff_get_format(s->avctx, pix_fmts);
+}
+
 static av_always_inline
 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 {
     AVCodecContext *avctx = s->avctx;
     int i, ret;
 
-    if (width  != s->avctx->width ||
+    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
         height != s->avctx->height) {
         vp8_decode_flush_impl(s->avctx, 1);
 
@@ -174,11 +198,18 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
             return ret;
     }
 
+    if (!s->actually_webp && !is_vp7) {
+        s->pix_fmt = get_pixel_format(s);
+        if (s->pix_fmt < 0)
+            return AVERROR(EINVAL);
+        avctx->pix_fmt = s->pix_fmt;
+    }
+
     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
     s->mb_height = (s->avctx->coded_height + 15) / 16;
 
     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
-                   FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
+                   avctx->thread_count > 1;
     if (!s->mb_layout) { // Frame threading and one thread
         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                                sizeof(*s->macroblocks));
@@ -224,6 +255,7 @@ static int vp8_update_dimensions(VP8Context *s, int width, int height)
     return update_dimensions(s, width, height, IS_VP8);
 }
 
+
 static void parse_segment_info(VP8Context *s)
 {
     VP56RangeCoder *c = &s->c;
@@ -274,6 +306,7 @@ static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 {
     const uint8_t *sizes = buf;
     int i;
+    int ret;
 
     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 
@@ -288,7 +321,9 @@ static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
             return -1;
         s->coeff_partition_size[i] = size;
 
-        ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
+        ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
+        if (ret < 0)
+            return ret;
         buf      += size;
         buf_size -= size;
     }
@@ -318,7 +353,7 @@ static void vp7_get_quants(VP8Context *s)
     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 }
 
-static void get_quants(VP8Context *s)
+static void vp8_get_quants(VP8Context *s)
 {
     VP56RangeCoder *c = &s->c;
     int i, base_qi;
@@ -439,7 +474,7 @@ static void update_refs(VP8Context *s)
     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 }
 
-static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
+static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 {
     int i, j;
 
@@ -450,16 +485,18 @@ static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
     }
 }
 
-static void fade(uint8_t *dst, uint8_t *src,
-                 int width, int height, ptrdiff_t linesize,
+static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
+                 const uint8_t *src, ptrdiff_t src_linesize,
+                 int width, int height,
                  int alpha, int beta)
 {
     int i, j;
-
     for (j = 0; j < height; j++) {
+        const uint8_t *src2 = src + j * src_linesize;
+        uint8_t *dst2 = dst + j * dst_linesize;
         for (i = 0; i < width; i++) {
-            uint8_t y = src[j * linesize + i];
-            dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
+            uint8_t y = src2[i];
+            dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
         }
     }
 }
@@ -470,13 +507,19 @@ static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
     int ret;
 
+    if (c->end <= c->buffer && c->bits >= 0)
+        return AVERROR_INVALIDDATA;
+
     if (!s->keyframe && (alpha || beta)) {
         int width  = s->mb_width * 16;
         int height = s->mb_height * 16;
         AVFrame *src, *dst;
 
-        if (!s->framep[VP56_FRAME_PREVIOUS])
+        if (!s->framep[VP56_FRAME_PREVIOUS] ||
+            !s->framep[VP56_FRAME_GOLDEN]) {
+            av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
             return AVERROR_INVALIDDATA;
+        }
 
         dst =
         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
@@ -485,15 +528,16 @@ static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
-               return ret;
+                return ret;
 
             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 
-            copy_luma(dst, src, width, height);
+            copy_chroma(dst, src, width, height);
         }
 
-        fade(dst->data[0], src->data[0],
-             width, height, dst->linesize[0], alpha, beta);
+        fade(dst->data[0], dst->linesize[0],
+             src->data[0], src->linesize[0],
+             width, height, alpha, beta);
     }
 
     return 0;
@@ -520,16 +564,19 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     s->invisible = 0;
     part1_size   = AV_RL24(buf) >> 4;
 
-    buf      += 4 - s->profile;
-    buf_size -= 4 - s->profile;
-
-    if (buf_size < part1_size) {
+    if (buf_size < 4 - s->profile + part1_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
         return AVERROR_INVALIDDATA;
     }
 
+    buf      += 4 - s->profile;
+    buf_size -= 4 - s->profile;
+
     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 
-    ff_vp56_init_range_decoder(c, buf, part1_size);
+    ret = ff_vp56_init_range_decoder(c, buf, part1_size);
+    if (ret < 0)
+        return ret;
     buf      += part1_size;
     buf_size -= part1_size;
 
@@ -572,7 +619,7 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
              if (vp7_feature_value_size[s->profile][i])
                  for (j = 0; j < 4; j++)
                      s->feature_value[i][j] =
-                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
+                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
         }
     }
 
@@ -581,7 +628,9 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     s->lf_delta.enabled        = 0;
 
     s->num_coeff_partitions = 1;
-    ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
+    ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
+    if (ret < 0)
+        return ret;
 
     if (!s->macroblocks_base || /* first frame */
         width != s->avctx->width || height != s->avctx->height ||
@@ -612,6 +661,8 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
             s->fade_present = vp8_rac_get(c);
     }
 
+    if (c->end <= c->buffer && c->bits >= 0)
+        return AVERROR_INVALIDDATA;
     /* E. Fading information for previous frame */
     if (s->fade_present && vp8_rac_get(c)) {
         if ((ret = vp7_fade_frame(s ,c)) < 0)
@@ -655,6 +706,11 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     int width  = s->avctx->width;
     int height = s->avctx->height;
 
+    if (buf_size < 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     s->keyframe  = !(buf[0] & 1);
     s->profile   =  (buf[0]>>1) & 7;
     s->invisible = !(buf[0] & 0x10);
@@ -707,7 +763,9 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
     }
 
-    ff_vp56_init_range_decoder(c, buf, header_size);
+    ret = ff_vp56_init_range_decoder(c, buf, header_size);
+    if (ret < 0)
+        return ret;
     buf      += header_size;
     buf_size -= header_size;
 
@@ -739,11 +797,12 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     }
 
     if (!s->macroblocks_base || /* first frame */
-        width != s->avctx->width || height != s->avctx->height)
+        width != s->avctx->width || height != s->avctx->height ||
+        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
             return ret;
 
-    get_quants(s);
+    vp8_get_quants(s);
 
     if (!s->keyframe) {
         update_refs(s);
@@ -781,16 +840,18 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
 }
 
 static av_always_inline
-void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
+void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 {
-    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
-    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
+    dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
+                             av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
+    dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
+                             av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 }
 
 /**
  * Motion vector coding, 17.1.
  */
-static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
+static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 {
     int bit, x = 0;
 
@@ -818,6 +879,16 @@ static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 }
 
+static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
+{
+    return read_mv_component(c, p, 1);
+}
+
+static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
+{
+    return read_mv_component(c, p, 0);
+}
+
 static av_always_inline
 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 {
@@ -1008,8 +1079,8 @@ void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                     mb->mode = VP8_MVMODE_SPLIT;
                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                 } else {
-                    mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
-                    mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
+                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
+                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                     mb->bmv[0] = mb->mv;
                 }
             } else {
@@ -1028,7 +1099,7 @@ void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 }
 
 static av_always_inline
-void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
+void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                     int mb_x, int mb_y, int layout)
 {
     VP8Macroblock *mb_edge[3] = { 0      /* top */,
@@ -1099,7 +1170,7 @@ void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                 /* Choose the best mv out of 0,0 and the nearest mv */
-                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
+                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
@@ -1108,16 +1179,16 @@ void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                     mb->mode = VP8_MVMODE_SPLIT;
                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                 } else {
-                    mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
-                    mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
+                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
+                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                     mb->bmv[0] = mb->mv;
                 }
             } else {
-                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
+                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                 mb->bmv[0] = mb->mv;
             }
         } else {
-            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
+            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
             mb->bmv[0] = mb->mv;
         }
     } else {
@@ -1133,7 +1204,7 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 {
     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 
-    if (layout == 1) {
+    if (layout) {
         VP8Macroblock *mb_top = mb - s->mb_width - 1;
         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
     }
@@ -1141,7 +1212,7 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
         int x, y;
         uint8_t *top;
         uint8_t *const left = s->intra4x4_pred_mode_left;
-        if (layout == 1)
+        if (layout)
             top = mb->intra4x4_pred_mode_top;
         else
             top = s->intra4x4_pred_mode_top + 4 * mb_x;
@@ -1163,7 +1234,8 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 }
 
 static av_always_inline
-void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
+void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
+                    VP8Macroblock *mb, int mb_x, int mb_y,
                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
 {
     VP56RangeCoder *c = &s->c;
@@ -1176,7 +1248,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         *segment = 0;
         for (i = 0; i < 4; i++) {
             if (s->feature_enabled[i]) {
-                if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
+                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                    s->feature_index_prob[i]);
                       av_log(s->avctx, AV_LOG_WARNING,
@@ -1185,9 +1257,10 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                 }
            }
         }
-    } else if (s->segmentation.update_map)
-        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
-    else if (s->segmentation.enabled)
+    } else if (s->segmentation.update_map) {
+        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
+        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
+    } else if (s->segmentation.enabled)
         *segment = ref ? *ref : *segment;
     mb->segment = *segment;
 
@@ -1202,7 +1275,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         } else {
             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
-            if (s->mb_layout == 1)
+            if (s->mb_layout)
                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
             else
                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
@@ -1226,7 +1299,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         if (is_vp7)
             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
         else
-            vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
+            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
     } else {
         // intra MB, 16.1
         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
@@ -1366,6 +1439,7 @@ static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
  * @param zero_nhood the initial prediction context for number of surrounding
  *                   all-zero blocks (only left/top, so 0-2)
  * @param qmul       array holding the dc/ac dequant factor at position 0/1
+ * @param scan       scan pattern (VP7 only)
  *
  * @return 0 if no coeffs were decoded
  *         otherwise, the index of the last coeff decoded plus one
@@ -1630,7 +1704,7 @@ void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                 int copy = 0;
                 ptrdiff_t linesize = s->linesize;
                 uint8_t *dst = ptr + 4 * x;
-                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
+                LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
 
                 if ((y == 0 || x == 3) && mb_y == 0) {
                     topright = tr_top;
@@ -1736,8 +1810,8 @@ void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
     if (AV_RN32A(mv)) {
         ptrdiff_t src_linesize = linesize;
 
-        int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
-        int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
+        int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
+        int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
 
         x_off += mv->x >> 2;
         y_off += mv->y >> 2;
@@ -1807,7 +1881,8 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                      src1 - my_idx * linesize - mx_idx,
                                      EDGE_EMU_LINESIZE, linesize,
-                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                     block_w + subpel_idx[1][mx],
+                                     block_h + subpel_idx[1][my],
                                      x_off - mx_idx, y_off - my_idx, width, height);
             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
@@ -1815,7 +1890,8 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                      src2 - my_idx * linesize - mx_idx,
                                      EDGE_EMU_LINESIZE, linesize,
-                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                     block_w + subpel_idx[1][mx],
+                                     block_h + subpel_idx[1][my],
                                      x_off - mx_idx, y_off - my_idx, width, height);
             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
@@ -2192,14 +2268,14 @@ void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
 
 #define MARGIN (16 << 2)
 static av_always_inline
-void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
+int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
                                     VP8Frame *prev_frame, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
     int mb_x, mb_y;
 
-    s->mv_min.y = -MARGIN;
-    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
+    s->mv_bounds.mv_min.y = -MARGIN;
+    s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         VP8Macroblock *mb = s->macroblocks_base +
                             ((s->mb_width + 1) * (mb_y + 1) + 1);
@@ -2207,51 +2283,56 @@ void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
 
         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
 
-        s->mv_min.x = -MARGIN;
-        s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
+        s->mv_bounds.mv_min.x = -MARGIN;
+        s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
+
+        if (vpX_rac_is_end(&s->c)) {
+            return AVERROR_INVALIDDATA;
+        }
         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
             if (mb_y == 0)
                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
                          DC_PRED * 0x01010101);
-            decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
+            decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                            prev_frame && prev_frame->seg_map ?
                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
-            s->mv_min.x -= 64;
-            s->mv_max.x -= 64;
+            s->mv_bounds.mv_min.x -= 64;
+            s->mv_bounds.mv_max.x -= 64;
         }
-        s->mv_min.y -= 64;
-        s->mv_max.y -= 64;
+        s->mv_bounds.mv_min.y -= 64;
+        s->mv_bounds.mv_max.y -= 64;
     }
+    return 0;
 }
 
-static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
+static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                    VP8Frame *prev_frame)
 {
-    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
+    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
 }
 
-static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
+static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                    VP8Frame *prev_frame)
 {
-    vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
+    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
 }
 
 #if HAVE_THREADS
 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
     do {                                                                      \
         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
-        if (otd->thread_mb_pos < tmp) {                                       \
+        if (atomic_load(&otd->thread_mb_pos) < tmp) {                         \
             pthread_mutex_lock(&otd->lock);                                   \
-            td->wait_mb_pos = tmp;                                            \
+            atomic_store(&td->wait_mb_pos, tmp);                              \
             do {                                                              \
-                if (otd->thread_mb_pos >= tmp)                                \
+                if (atomic_load(&otd->thread_mb_pos) >= tmp)                  \
                     break;                                                    \
                 pthread_cond_wait(&otd->cond, &otd->lock);                    \
             } while (1);                                                      \
-            td->wait_mb_pos = INT_MAX;                                        \
+            atomic_store(&td->wait_mb_pos, INT_MAX);                          \
             pthread_mutex_unlock(&otd->lock);                                 \
         }                                                                     \
-    } while (0);
+    } while (0)
 
 #define update_pos(td, mb_y, mb_x)                                            \
     do {                                                                      \
@@ -2259,29 +2340,27 @@ static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
         int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                                (num_jobs > 1);                                \
         int is_null          = !next_td || !prev_td;                          \
-        int pos_check        = (is_null) ? 1                                  \
-                                         : (next_td != td &&                  \
-                                            pos >= next_td->wait_mb_pos) ||   \
-                                           (prev_td != td &&                  \
-                                            pos >= prev_td->wait_mb_pos);     \
-        td->thread_mb_pos = pos;                                              \
+        int pos_check        = (is_null) ? 1 :                                \
+            (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) ||   \
+            (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos));     \
+        atomic_store(&td->thread_mb_pos, pos);                                \
         if (sliced_threading && pos_check) {                                  \
             pthread_mutex_lock(&td->lock);                                    \
             pthread_cond_broadcast(&td->cond);                                \
             pthread_mutex_unlock(&td->lock);                                  \
         }                                                                     \
-    } while (0);
+    } while (0)
 #else
-#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
-#define update_pos(td, mb_y, mb_x)
+#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
+#define update_pos(td, mb_y, mb_x) while(0)
 #endif
 
-static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                         int jobnr, int threadnr, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
-    int mb_y = td->thread_mb_pos >> 16;
+    int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
     int mb_x, mb_xy = mb_y * s->mb_width;
     int num_jobs = s->num_jobs;
     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
@@ -2292,6 +2371,10 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
     };
+
+    if (c->end <= c->buffer && c->bits >= 0)
+         return AVERROR_INVALIDDATA;
+
     if (mb_y == 0)
         prev_td = td;
     else
@@ -2316,10 +2399,12 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
     if (!is_vp7 || mb_y == 0)
         memset(td->left_nnz, 0, sizeof(td->left_nnz));
 
-    s->mv_min.x = -MARGIN;
-    s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
+    td->mv_bounds.mv_min.x = -MARGIN;
+    td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
 
     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
+        if (c->end <= c->buffer && c->bits >= 0)
+            return AVERROR_INVALIDDATA;
         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
         if (prev_td != td) {
             if (threadnr != 0) {
@@ -2339,7 +2424,7 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                          dst[2] - dst[1], 2);
 
         if (!s->mb_layout)
-            decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
+            decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                            prev_frame && prev_frame->seg_map ?
                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
 
@@ -2386,8 +2471,8 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
         dst[0]      += 16;
         dst[1]      += 8;
         dst[2]      += 8;
-        s->mv_min.x -= 64;
-        s->mv_max.x -= 64;
+        td->mv_bounds.mv_min.x -= 64;
+        td->mv_bounds.mv_max.x -= 64;
 
         if (mb_x == s->mb_width + 1) {
             update_pos(td, mb_y, s->mb_width + 3);
@@ -2395,14 +2480,27 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
             update_pos(td, mb_y, mb_x);
         }
     }
+    return 0;
 }
 
-static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
+static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+                                        int jobnr, int threadnr)
+{
+    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
+}
+
+static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+                                        int jobnr, int threadnr)
+{
+    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
+}
+
+static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                               int jobnr, int threadnr, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
     VP8ThreadData *td = &s->thread_data[threadnr];
-    int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
+    int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
     AVFrame *curframe = s->curframe->tf.f;
     VP8Macroblock *mb;
     VP8ThreadData *prev_td, *next_td;
@@ -2456,6 +2554,18 @@ static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
     }
 }
 
+static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
+                              int jobnr, int threadnr)
+{
+    filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
+}
+
+static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
+                              int jobnr, int threadnr)
+{
+    filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
+}
+
 static av_always_inline
 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
                               int threadnr, int is_vp7)
@@ -2465,19 +2575,24 @@ int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
     VP8ThreadData *next_td = NULL, *prev_td = NULL;
     VP8Frame *curframe = s->curframe;
     int mb_y, num_jobs = s->num_jobs;
+    int ret;
 
     td->thread_nr = threadnr;
+    td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
+    td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
-        if (mb_y >= s->mb_height)
-            break;
-        td->thread_mb_pos = mb_y << 16;
-        vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
+        atomic_store(&td->thread_mb_pos, mb_y << 16);
+        ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
+        if (ret < 0) {
+            update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
+            return ret;
+        }
         if (s->deblock_filter)
-            vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
+            s->filter_mb_row(avctx, tdata, jobnr, threadnr);
         update_pos(td, mb_y, INT_MAX & 0xFFFF);
 
-        s->mv_min.y -= 64;
-        s->mv_max.y -= 64;
+        td->mv_bounds.mv_min.y -= 64 * num_jobs;
+        td->mv_bounds.mv_max.y -= 64 * num_jobs;
 
         if (avctx->active_thread_type == FF_THREAD_FRAME)
             ff_thread_report_progress(&curframe->tf, mb_y, 0);
@@ -2518,15 +2633,7 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (s->actually_webp) {
         // avctx->pix_fmt already set in caller.
     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
-        enum AVPixelFormat pix_fmts[] = {
-#if CONFIG_VP8_VAAPI_HWACCEL
-            AV_PIX_FMT_VAAPI,
-#endif
-            AV_PIX_FMT_YUV420P,
-            AV_PIX_FMT_NONE,
-        };
-
-        s->pix_fmt = ff_get_format(s->avctx, pix_fmts);
+        s->pix_fmt = get_pixel_format(s);
         if (s->pix_fmt < 0) {
             ret = AVERROR(EINVAL);
             goto err;
@@ -2552,7 +2659,7 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // release no longer referenced frames
     for (i = 0; i < 5; i++)
-        if (s->frames[i].tf.f->data[0] &&
+        if (s->frames[i].tf.f->buf[0] &&
             &s->frames[i] != prev_frame &&
             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
@@ -2584,10 +2691,8 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     curframe->tf.f->key_frame = s->keyframe;
     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                             : AV_PICTURE_TYPE_P;
-    if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
+    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
         goto err;
-    }
 
     // check if golden and altref are swapped
     if (s->update_altref != VP56_FRAME_NONE)
@@ -2644,9 +2749,11 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 !s->segmentation.update_map)
                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
             if (is_vp7)
-                vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+                ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
             else
-                vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+                ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+            if (ret < 0)
+                goto err;
         }
 
         if (avctx->active_thread_type == FF_THREAD_FRAME)
@@ -2656,13 +2763,13 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         s->num_jobs   = num_jobs;
         s->curframe   = curframe;
         s->prev_frame = prev_frame;
-        s->mv_min.y   = -MARGIN;
-        s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
+        s->mv_bounds.mv_min.y   = -MARGIN;
+        s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
         for (i = 0; i < MAX_THREADS; i++) {
-            s->thread_data[i].thread_mb_pos = 0;
-            s->thread_data[i].wait_mb_pos   = INT_MAX;
+            VP8ThreadData *td = &s->thread_data[i];
+            atomic_init(&td->thread_mb_pos, 0);
+            atomic_init(&td->wait_mb_pos, INT_MAX);
         }
-
         if (is_vp7)
             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                             num_jobs);
@@ -2711,6 +2818,9 @@ av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
     VP8Context *s = avctx->priv_data;
     int i;
 
+    if (!s)
+        return 0;
+
     vp8_decode_flush_impl(avctx, 1);
     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
         av_frame_free(&s->frames[i].tf.f);
@@ -2736,6 +2846,7 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     int ret;
 
     s->avctx = avctx;
+    s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
     s->pix_fmt = AV_PIX_FMT_NONE;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->internal->allocate_progress = 1;
@@ -2746,9 +2857,13 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     if (CONFIG_VP7_DECODER && is_vp7) {
         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
         ff_vp7dsp_init(&s->vp8dsp);
+        s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
+        s->filter_mb_row           = vp7_filter_mb_row;
     } else if (CONFIG_VP8_DECODER && !is_vp7) {
         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
         ff_vp8dsp_init(&s->vp8dsp);
+        s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
+        s->filter_mb_row           = vp8_filter_mb_row;
     }
 
     /* does not change for VP8 */
@@ -2775,6 +2890,7 @@ av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
 }
 
 #if CONFIG_VP8_DECODER
+#if HAVE_THREADS
 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
 {
     VP8Context *s = avctx->priv_data;
@@ -2790,7 +2906,7 @@ static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
     return 0;
 }
 
-#define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
+#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
 
 static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                             const AVCodecContext *src)
@@ -2805,13 +2921,14 @@ static int vp8_decode_update_thread_context(AVCodecContext *dst,
         s->mb_height = s_src->mb_height;
     }
 
+    s->pix_fmt      = s_src->pix_fmt;
     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
     s->segmentation = s_src->segmentation;
     s->lf_delta     = s_src->lf_delta;
     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
 
     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
-        if (s_src->frames[i].tf.f->data[0]) {
+        if (s_src->frames[i].tf.f->buf[0]) {
             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
             if (ret < 0)
                 return ret;
@@ -2825,6 +2942,7 @@ static int vp8_decode_update_thread_context(AVCodecContext *dst,
 
     return 0;
 }
+#endif /* HAVE_THREADS */
 #endif /* CONFIG_VP8_DECODER */
 
 #if CONFIG_VP7_DECODER
@@ -2854,14 +2972,17 @@ AVCodec ff_vp8_decoder = {
     .decode                = ff_vp8_decode_frame,
     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                              AV_CODEC_CAP_SLICE_THREADS,
+    .flush                 = vp8_decode_flush,
+    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
     .hw_configs            = (const AVCodecHWConfigInternal*[]) {
 #if CONFIG_VP8_VAAPI_HWACCEL
                                HWACCEL_VAAPI(vp8),
 #endif
+#if CONFIG_VP8_NVDEC_HWACCEL
+                               HWACCEL_NVDEC(vp8),
+#endif
                                NULL
                            },
-    .flush                 = vp8_decode_flush,
-    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
-    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
 };
 #endif /* CONFIG_VP7_DECODER */
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 1bf7561..70d21e3 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -6,39 +6,36 @@
  * Copyright (C) 2010 Fiona Glaser
  * Copyright (C) 2012 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_VP8_H
 #define AVCODEC_VP8_H
 
+#include <stdatomic.h>
+
 #include "libavutil/buffer.h"
+#include "libavutil/thread.h"
 
 #include "h264pred.h"
 #include "thread.h"
 #include "vp56.h"
 #include "vp8dsp.h"
 
-#if HAVE_PTHREADS
-#   include <pthread.h>
-#elif HAVE_W32THREADS
-#   include "compat/w32pthreads.h"
-#endif
-
 #define VP8_MAX_QUANT 127
 
 enum dct_token {
@@ -96,6 +93,16 @@ typedef struct VP8Macroblock {
     VP56mv bmv[16];
 } VP8Macroblock;
 
+typedef struct VP8intmv {
+    int x;
+    int y;
+} VP8intmv;
+
+typedef struct VP8mvbounds {
+    VP8intmv mv_min;
+    VP8intmv mv_max;
+} VP8mvbounds;
+
 typedef struct VP8ThreadData {
     DECLARE_ALIGNED(16, int16_t, block)[6][4][16];
     DECLARE_ALIGNED(16, int16_t, block_dc)[16];
@@ -119,12 +126,13 @@ typedef struct VP8ThreadData {
     pthread_mutex_t lock;
     pthread_cond_t cond;
 #endif
-    int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF)
-    int wait_mb_pos; // What the current thread is waiting on.
+    atomic_int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF)
+    atomic_int wait_mb_pos; // What the current thread is waiting on.
 
 #define EDGE_EMU_LINESIZE 32
     DECLARE_ALIGNED(16, uint8_t, edge_emu_buffer)[21 * EDGE_EMU_LINESIZE];
     VP8FilterStrength *filter_strength;
+    VP8mvbounds mv_bounds;
 } VP8ThreadData;
 
 typedef struct VP8Frame {
@@ -156,8 +164,7 @@ typedef struct VP8Context {
     uint8_t deblock_filter;
     uint8_t mbskip_enabled;
     uint8_t profile;
-    VP56mv mv_min;
-    VP56mv mv_max;
+    VP8mvbounds mv_bounds;
 
     int8_t sign_bias[4]; ///< one state [0, 1] per ref frame type
     int ref_count[3];
@@ -308,6 +315,11 @@ typedef struct VP8Context {
      */
     int mb_layout;
 
+    int (*decode_mb_row_no_filter)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr);
+    void (*filter_mb_row)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr);
+
+    int vp7;
+
     /**
      * Fade bit present in bitstream (VP7)
      */
diff --git a/libavcodec/vp8_parser.c b/libavcodec/vp8_parser.c
index fad652d..7ce35e7 100644
--- a/libavcodec/vp8_parser.c
+++ b/libavcodec/vp8_parser.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,14 +28,17 @@ static int parse(AVCodecParserContext *s,
     unsigned int frame_type;
     unsigned int profile;
 
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+
     if (buf_size < 3)
-        return AVERROR_INVALIDDATA;
+        return buf_size;
 
     frame_type = buf[0] & 1;
     profile    = (buf[0] >> 1) & 7;
     if (profile > 3) {
         av_log(avctx, AV_LOG_ERROR, "Invalid profile %u.\n", profile);
-        return AVERROR_INVALIDDATA;
+        return buf_size;
     }
 
     avctx->profile = profile;
@@ -50,12 +53,12 @@ static int parse(AVCodecParserContext *s,
         unsigned int width, height;
 
         if (buf_size < 10)
-            return AVERROR_INVALIDDATA;
+            return buf_size;
 
         sync_code = AV_RL24(buf + 3);
         if (sync_code != 0x2a019d) {
             av_log(avctx, AV_LOG_ERROR, "Invalid sync code %06x.\n", sync_code);
-            return AVERROR_INVALIDDATA;
+            return buf_size;
         }
 
         width  = AV_RL16(buf + 6) & 0x3fff;
@@ -67,8 +70,6 @@ static int parse(AVCodecParserContext *s,
         s->coded_height = FFALIGN(height, 16);
     }
 
-    *poutbuf      = buf;
-    *poutbuf_size = buf_size;
     return buf_size;
 }
 
diff --git a/libavcodec/vp8data.h b/libavcodec/vp8data.h
index f8f9fff..5e6dea7 100644
--- a/libavcodec/vp8data.h
+++ b/libavcodec/vp8data.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index ac9a6af..4ff63d0 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2014 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 #include "mathops.h"
 #include "vp8dsp.h"
@@ -52,7 +53,8 @@ static void name ## _idct_dc_add4y_c(uint8_t *dst, int16_t block[4][16],      \
 #if CONFIG_VP7_DECODER
 static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
 {
-    int i, a1, b1, c1, d1;
+    int i;
+    unsigned a1, b1, c1, d1;
     int16_t tmp[16];
 
     for (i = 0; i < 4; i++) {
@@ -60,10 +62,10 @@ static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         b1 = (dc[i * 4 + 0] - dc[i * 4 + 2]) * 23170;
         c1 = dc[i * 4 + 1] * 12540 - dc[i * 4 + 3] * 30274;
         d1 = dc[i * 4 + 1] * 30274 + dc[i * 4 + 3] * 12540;
-        tmp[i * 4 + 0] = (a1 + d1) >> 14;
-        tmp[i * 4 + 3] = (a1 - d1) >> 14;
-        tmp[i * 4 + 1] = (b1 + c1) >> 14;
-        tmp[i * 4 + 2] = (b1 - c1) >> 14;
+        tmp[i * 4 + 0] = (int)(a1 + d1) >> 14;
+        tmp[i * 4 + 3] = (int)(a1 - d1) >> 14;
+        tmp[i * 4 + 1] = (int)(b1 + c1) >> 14;
+        tmp[i * 4 + 2] = (int)(b1 - c1) >> 14;
     }
 
     for (i = 0; i < 4; i++) {
@@ -71,14 +73,11 @@ static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         b1 = (tmp[i + 0] - tmp[i + 8]) * 23170;
         c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274;
         d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540;
-        dc[i * 4 + 0] = 0;
-        dc[i * 4 + 1] = 0;
-        dc[i * 4 + 2] = 0;
-        dc[i * 4 + 3] = 0;
-        block[0][i][0] = (a1 + d1 + 0x20000) >> 18;
-        block[3][i][0] = (a1 - d1 + 0x20000) >> 18;
-        block[1][i][0] = (b1 + c1 + 0x20000) >> 18;
-        block[2][i][0] = (b1 - c1 + 0x20000) >> 18;
+        AV_ZERO64(dc + i * 4);
+        block[0][i][0] = (int)(a1 + d1 + 0x20000) >> 18;
+        block[3][i][0] = (int)(a1 - d1 + 0x20000) >> 18;
+        block[1][i][0] = (int)(b1 + c1 + 0x20000) >> 18;
+        block[2][i][0] = (int)(b1 - c1 + 0x20000) >> 18;
     }
 }
 
@@ -97,7 +96,8 @@ static void vp7_luma_dc_wht_dc_c(int16_t block[4][4][16], int16_t dc[16])
 
 static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 {
-    int i, a1, b1, c1, d1;
+    int i;
+    unsigned a1, b1, c1, d1;
     int16_t tmp[16];
 
     for (i = 0; i < 4; i++) {
@@ -105,14 +105,11 @@ static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         b1 = (block[i * 4 + 0] - block[i * 4 + 2]) * 23170;
         c1 = block[i * 4 + 1] * 12540 - block[i * 4 + 3] * 30274;
         d1 = block[i * 4 + 1] * 30274 + block[i * 4 + 3] * 12540;
-        block[i * 4 + 0] = 0;
-        block[i * 4 + 1] = 0;
-        block[i * 4 + 2] = 0;
-        block[i * 4 + 3] = 0;
-        tmp[i * 4 + 0] = (a1 + d1) >> 14;
-        tmp[i * 4 + 3] = (a1 - d1) >> 14;
-        tmp[i * 4 + 1] = (b1 + c1) >> 14;
-        tmp[i * 4 + 2] = (b1 - c1) >> 14;
+        AV_ZERO64(block + i * 4);
+        tmp[i * 4 + 0] = (int)(a1 + d1) >> 14;
+        tmp[i * 4 + 3] = (int)(a1 - d1) >> 14;
+        tmp[i * 4 + 1] = (int)(b1 + c1) >> 14;
+        tmp[i * 4 + 2] = (int)(b1 - c1) >> 14;
     }
 
     for (i = 0; i < 4; i++) {
@@ -121,13 +118,13 @@ static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274;
         d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540;
         dst[0 * stride + i] = av_clip_uint8(dst[0 * stride + i] +
-                                            ((a1 + d1 + 0x20000) >> 18));
+                                            ((int)(a1 + d1 + 0x20000) >> 18));
         dst[3 * stride + i] = av_clip_uint8(dst[3 * stride + i] +
-                                            ((a1 - d1 + 0x20000) >> 18));
+                                            ((int)(a1 - d1 + 0x20000) >> 18));
         dst[1 * stride + i] = av_clip_uint8(dst[1 * stride + i] +
-                                            ((b1 + c1 + 0x20000) >> 18));
+                                            ((int)(b1 + c1 + 0x20000) >> 18));
         dst[2 * stride + i] = av_clip_uint8(dst[2 * stride + i] +
-                                            ((b1 - c1 + 0x20000) >> 18));
+                                            ((int)(b1 - c1 + 0x20000) >> 18));
     }
 }
 
@@ -171,10 +168,7 @@ static void vp8_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         t1 = dc[i * 4 + 1] + dc[i * 4 + 2];
         t2 = dc[i * 4 + 1] - dc[i * 4 + 2];
         t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3; // rounding
-        dc[i * 4 + 0] = 0;
-        dc[i * 4 + 1] = 0;
-        dc[i * 4 + 2] = 0;
-        dc[i * 4 + 3] = 0;
+        AV_ZERO64(dc + i * 4);
 
         block[i][0][0] = (t0 + t1) >> 3;
         block[i][1][0] = (t3 + t2) >> 3;
@@ -262,7 +256,7 @@ MK_IDCT_DC_ADD4_C(vp8)
     int av_unused q2 = p[ 2 * stride];                                        \
     int av_unused q3 = p[ 3 * stride];
 
-#define clip_int8(n) (cm[n + 0x80] - 0x80)
+#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
 
 static av_always_inline void filter_common(uint8_t *p, ptrdiff_t stride,
                                            int is4tap, int is_vp7)
@@ -747,5 +741,7 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
         ff_vp8dsp_init_arm(dsp);
     if (ARCH_X86)
         ff_vp8dsp_init_x86(dsp);
+    if (ARCH_MIPS)
+        ff_vp8dsp_init_mips(dsp);
 }
 #endif /* CONFIG_VP8_DECODER */
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
index 65d8418..cfe1524 100644
--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -100,6 +100,7 @@ void ff_vp8dsp_init(VP8DSPContext *c);
 void ff_vp8dsp_init_aarch64(VP8DSPContext *c);
 void ff_vp8dsp_init_arm(VP8DSPContext *c);
 void ff_vp8dsp_init_x86(VP8DSPContext *c);
+void ff_vp8dsp_init_mips(VP8DSPContext *c);
 
 #define IS_VP7 1
 #define IS_VP8 0
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 48f8afe..acf3ffc 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -4,43 +4,102 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/avassert.h"
-
 #include "avcodec.h"
 #include "get_bits.h"
+#include "hwaccel.h"
 #include "internal.h"
+#include "profiles.h"
+#include "thread.h"
 #include "videodsp.h"
 #include "vp56.h"
 #include "vp9.h"
 #include "vp9data.h"
+#include "vp9dec.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
 
 #define VP9_SYNCCODE 0x498342
-#define MAX_PROB 255
+
+#if HAVE_THREADS
+static void vp9_free_entries(AVCodecContext *avctx) {
+    VP9Context *s = avctx->priv_data;
+
+    if (avctx->active_thread_type & FF_THREAD_SLICE)  {
+        pthread_mutex_destroy(&s->progress_mutex);
+        pthread_cond_destroy(&s->progress_cond);
+        av_freep(&s->entries);
+    }
+}
+
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) {
+    VP9Context *s = avctx->priv_data;
+    int i;
+
+    if (avctx->active_thread_type & FF_THREAD_SLICE)  {
+        if (s->entries)
+            av_freep(&s->entries);
+
+        s->entries = av_malloc_array(n, sizeof(atomic_int));
+
+        if (!s->entries) {
+            av_freep(&s->entries);
+            return AVERROR(ENOMEM);
+        }
+
+        for (i  = 0; i < n; i++)
+            atomic_init(&s->entries[i], 0);
+
+        pthread_mutex_init(&s->progress_mutex, NULL);
+        pthread_cond_init(&s->progress_cond, NULL);
+    }
+    return 0;
+}
+
+static void vp9_report_tile_progress(VP9Context *s, int field, int n) {
+    pthread_mutex_lock(&s->progress_mutex);
+    atomic_fetch_add_explicit(&s->entries[field], n, memory_order_release);
+    pthread_cond_signal(&s->progress_cond);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+
+static void vp9_await_tile_progress(VP9Context *s, int field, int n) {
+    if (atomic_load_explicit(&s->entries[field], memory_order_acquire) >= n)
+        return;
+
+    pthread_mutex_lock(&s->progress_mutex);
+    while (atomic_load_explicit(&s->entries[field], memory_order_relaxed) != n)
+        pthread_cond_wait(&s->progress_cond, &s->progress_mutex);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+#else
+static void vp9_free_entries(AVCodecContext *avctx) {}
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) { return 0; }
+#endif
 
 static void vp9_frame_unref(AVCodecContext *avctx, VP9Frame *f)
 {
     ff_thread_release_buffer(avctx, &f->tf);
-    av_buffer_unref(&f->segmentation_map_buf);
-    av_buffer_unref(&f->mv_buf);
+    av_buffer_unref(&f->extradata);
+    av_buffer_unref(&f->hwaccel_priv_buf);
     f->segmentation_map = NULL;
-    f->mv               = NULL;
+    f->hwaccel_picture_private = NULL;
 }
 
 static int vp9_frame_alloc(AVCodecContext *avctx, VP9Frame *f)
@@ -53,139 +112,230 @@ static int vp9_frame_alloc(AVCodecContext *avctx, VP9Frame *f)
         return ret;
 
     sz = 64 * s->sb_cols * s->sb_rows;
-    f->segmentation_map_buf = av_buffer_allocz(sz * sizeof(*f->segmentation_map));
-    f->mv_buf               = av_buffer_allocz(sz * sizeof(*f->mv));
-    if (!f->segmentation_map_buf || !f->mv_buf) {
-        vp9_frame_unref(avctx, f);
-        return AVERROR(ENOMEM);
+    f->extradata = av_buffer_allocz(sz * (1 + sizeof(VP9mvrefPair)));
+    if (!f->extradata) {
+        goto fail;
     }
 
-    f->segmentation_map = f->segmentation_map_buf->data;
-    f->mv               = (VP9MVRefPair*)f->mv_buf->data;
-
-    if (s->segmentation.enabled && !s->segmentation.update_map &&
-        !s->keyframe && !s->intraonly && !s->errorres)
-        memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
+    f->segmentation_map = f->extradata->data;
+    f->mv = (VP9mvrefPair *) (f->extradata->data + sz);
+
+    if (avctx->hwaccel) {
+        const AVHWAccel *hwaccel = avctx->hwaccel;
+        av_assert0(!f->hwaccel_picture_private);
+        if (hwaccel->frame_priv_data_size) {
+            f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
+            if (!f->hwaccel_priv_buf)
+                goto fail;
+            f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
+        }
+    }
 
     return 0;
+
+fail:
+    vp9_frame_unref(avctx, f);
+    return AVERROR(ENOMEM);
 }
 
-static int vp9_frame_ref(VP9Frame *dst, VP9Frame *src)
+static int vp9_frame_ref(AVCodecContext *avctx, VP9Frame *dst, VP9Frame *src)
 {
     int ret;
 
-    dst->segmentation_map_buf = av_buffer_ref(src->segmentation_map_buf);
-    dst->mv_buf               = av_buffer_ref(src->mv_buf);
-    if (!dst->segmentation_map_buf || !dst->mv_buf) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-
     ret = ff_thread_ref_frame(&dst->tf, &src->tf);
     if (ret < 0)
+        return ret;
+
+    dst->extradata = av_buffer_ref(src->extradata);
+    if (!dst->extradata)
         goto fail;
 
     dst->segmentation_map = src->segmentation_map;
-    dst->mv               = src->mv;
+    dst->mv = src->mv;
+    dst->uses_2pass = src->uses_2pass;
+
+    if (src->hwaccel_picture_private) {
+        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
+        if (!dst->hwaccel_priv_buf)
+            goto fail;
+        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
+    }
 
     return 0;
+
 fail:
-    av_buffer_unref(&dst->segmentation_map_buf);
-    av_buffer_unref(&dst->mv_buf);
-    return ret;
+    vp9_frame_unref(avctx, dst);
+    return AVERROR(ENOMEM);
 }
 
-static void vp9_decode_flush(AVCodecContext *avctx)
+static int update_size(AVCodecContext *avctx, int w, int h)
 {
+#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \
+                     CONFIG_VP9_D3D11VA_HWACCEL * 2 + \
+                     CONFIG_VP9_NVDEC_HWACCEL + \
+                     CONFIG_VP9_VAAPI_HWACCEL)
+    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
     VP9Context *s = avctx->priv_data;
-    int i;
+    uint8_t *p;
+    int bytesperpixel = s->bytesperpixel, ret, cols, rows;
+    int lflvl_len, i;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
-        vp9_frame_unref(avctx, &s->frames[i]);
+    av_assert0(w > 0 && h > 0);
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++)
-        ff_thread_release_buffer(avctx, &s->refs[i]);
+    if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
+        if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
+            return ret;
 
-    s->use_last_frame_mvs = 0;
+        switch (s->pix_fmt) {
+        case AV_PIX_FMT_YUV420P:
+        case AV_PIX_FMT_YUV420P10:
+#if CONFIG_VP9_DXVA2_HWACCEL
+            *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
+#endif
+#if CONFIG_VP9_D3D11VA_HWACCEL
+            *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
+            *fmtp++ = AV_PIX_FMT_D3D11;
+#endif
+#if CONFIG_VP9_NVDEC_HWACCEL
+            *fmtp++ = AV_PIX_FMT_CUDA;
+#endif
+#if CONFIG_VP9_VAAPI_HWACCEL
+            *fmtp++ = AV_PIX_FMT_VAAPI;
+#endif
+            break;
+        case AV_PIX_FMT_YUV420P12:
+#if CONFIG_VP9_NVDEC_HWACCEL
+            *fmtp++ = AV_PIX_FMT_CUDA;
+#endif
+#if CONFIG_VP9_VAAPI_HWACCEL
+            *fmtp++ = AV_PIX_FMT_VAAPI;
+#endif
+            break;
+        }
 
-    s->alloc_width  = 0;
-    s->alloc_height = 0;
-}
+        *fmtp++ = s->pix_fmt;
+        *fmtp = AV_PIX_FMT_NONE;
 
-static int update_size(AVCodecContext *avctx, int w, int h)
-{
-    VP9Context *s = avctx->priv_data;
-    uint8_t *p;
-    int nb_blocks, nb_superblocks;
+        ret = ff_thread_get_format(avctx, pix_fmts);
+        if (ret < 0)
+            return ret;
 
-    if (s->above_partition_ctx && w == s->alloc_width && h == s->alloc_height)
-        return 0;
+        avctx->pix_fmt = ret;
+        s->gf_fmt  = s->pix_fmt;
+        s->w = w;
+        s->h = h;
+    }
 
-    vp9_decode_flush(avctx);
+    cols = (w + 7) >> 3;
+    rows = (h + 7) >> 3;
 
-    if (w <= 0 || h <= 0)
-        return AVERROR_INVALIDDATA;
+    if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
+        return 0;
 
-    avctx->width  = w;
-    avctx->height = h;
-    s->sb_cols    = (w + 63) >> 6;
-    s->sb_rows    = (h + 63) >> 6;
-    s->cols       = (w +  7) >> 3;
-    s->rows       = (h +  7) >> 3;
-
-#define assign(var, type, n) var = (type)p; p += s->sb_cols * n * sizeof(*var)
-    av_free(s->above_partition_ctx);
-    p = av_malloc(s->sb_cols *
-                  (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+    s->last_fmt  = s->pix_fmt;
+    s->sb_cols   = (w + 63) >> 6;
+    s->sb_rows   = (h + 63) >> 6;
+    s->cols      = (w + 7) >> 3;
+    s->rows      = (h + 7) >> 3;
+    lflvl_len    = avctx->active_thread_type == FF_THREAD_SLICE ? s->sb_rows : 1;
+
+#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
+    av_freep(&s->intra_pred_data[0]);
+    // FIXME we slightly over-allocate here for subsampled chroma, but a little
+    // bit of padding shouldn't affect performance...
+    p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
+                                lflvl_len * sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
-    assign(s->above_partition_ctx, uint8_t *,     8);
-    assign(s->above_skip_ctx,      uint8_t *,     8);
-    assign(s->above_txfm_ctx,      uint8_t *,     8);
-    assign(s->above_mode_ctx,      uint8_t *,    16);
-    assign(s->above_y_nnz_ctx,     uint8_t *,    16);
-    assign(s->above_uv_nnz_ctx[0], uint8_t *,     8);
-    assign(s->above_uv_nnz_ctx[1], uint8_t *,     8);
-    assign(s->intra_pred_data[0],  uint8_t *,    64);
-    assign(s->intra_pred_data[1],  uint8_t *,    32);
-    assign(s->intra_pred_data[2],  uint8_t *,    32);
-    assign(s->above_segpred_ctx,   uint8_t *,     8);
-    assign(s->above_intra_ctx,     uint8_t *,     8);
-    assign(s->above_comp_ctx,      uint8_t *,     8);
-    assign(s->above_ref_ctx,       uint8_t *,     8);
-    assign(s->above_filter_ctx,    uint8_t *,     8);
-    assign(s->lflvl,               VP9Filter *,   1);
-    assign(s->above_mv_ctx,        VP56mv(*)[2], 16);
+    assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
+    assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
+    assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
+    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
+    assign(s->above_mode_ctx,      uint8_t *,             16);
+    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
+    assign(s->above_partition_ctx, uint8_t *,              8);
+    assign(s->above_skip_ctx,      uint8_t *,              8);
+    assign(s->above_txfm_ctx,      uint8_t *,              8);
+    assign(s->above_segpred_ctx,   uint8_t *,              8);
+    assign(s->above_intra_ctx,     uint8_t *,              8);
+    assign(s->above_comp_ctx,      uint8_t *,              8);
+    assign(s->above_ref_ctx,       uint8_t *,              8);
+    assign(s->above_filter_ctx,    uint8_t *,              8);
+    assign(s->lflvl,               VP9Filter *,            lflvl_len);
 #undef assign
 
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
+    if (s->td) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            av_freep(&s->td[i].b_base);
+            av_freep(&s->td[i].block_base);
+        }
+    }
 
-    if (avctx->active_thread_type & FF_THREAD_FRAME) {
-        nb_blocks      = s->cols * s->rows;
-        nb_superblocks = s->sb_cols * s->sb_rows;
-    } else {
-        nb_blocks = nb_superblocks = 1;
+    if (s->s.h.bpp != s->last_bpp) {
+        ff_vp9dsp_init(&s->dsp, s->s.h.bpp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+        ff_videodsp_init(&s->vdsp, s->s.h.bpp);
+        s->last_bpp = s->s.h.bpp;
     }
 
-    s->b_base     = av_malloc_array(nb_blocks, sizeof(*s->b_base));
-    s->block_base = av_mallocz_array(nb_superblocks, (64 * 64 + 128) * 3);
-    if (!s->b_base || !s->block_base)
-        return AVERROR(ENOMEM);
-    s->uvblock_base[0] = s->block_base      + nb_superblocks * 64 * 64;
-    s->uvblock_base[1] = s->uvblock_base[0] + nb_superblocks * 32 * 32;
-    s->eob_base        = (uint8_t *)(s->uvblock_base[1] + nb_superblocks * 32 * 32);
-    s->uveob_base[0]   = s->eob_base + nb_superblocks * 256;
-    s->uveob_base[1]   = s->uveob_base[0] + nb_superblocks * 64;
+    return 0;
+}
 
-    s->alloc_width  = w;
-    s->alloc_height = h;
+static int update_block_buffers(AVCodecContext *avctx)
+{
+    int i;
+    VP9Context *s = avctx->priv_data;
+    int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
+    VP9TileData *td = &s->td[0];
+
+    if (td->b_base && td->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
+        return 0;
+
+    av_free(td->b_base);
+    av_free(td->block_base);
+    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
+    chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
+    if (s->s.frames[CUR_FRAME].uses_2pass) {
+        int sbs = s->sb_cols * s->sb_rows;
+
+        td->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
+        td->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                    16 * 16 + 2 * chroma_eobs) * sbs);
+        if (!td->b_base || !td->block_base)
+            return AVERROR(ENOMEM);
+        td->uvblock_base[0] = td->block_base + sbs * 64 * 64 * bytesperpixel;
+        td->uvblock_base[1] = td->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
+        td->eob_base = (uint8_t *) (td->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
+        td->uveob_base[0] = td->eob_base + 16 * 16 * sbs;
+        td->uveob_base[1] = td->uveob_base[0] + chroma_eobs * sbs;
+    } else {
+        for (i = 1; i < s->active_tile_cols; i++) {
+            if (s->td[i].b_base && s->td[i].block_base) {
+                av_free(s->td[i].b_base);
+                av_free(s->td[i].block_base);
+            }
+        }
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].b_base = av_malloc(sizeof(VP9Block));
+            s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                       16 * 16 + 2 * chroma_eobs);
+            if (!s->td[i].b_base || !s->td[i].block_base)
+                return AVERROR(ENOMEM);
+            s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel;
+            s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel;
+            s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel);
+            s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16;
+            s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs;
+        }
+    }
+    s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
 
     return 0;
 }
 
 // The sign bit is at the end, not the start, of a bit sequence
-static av_always_inline int get_bits_with_sign(GetBitContext *gb, int n)
+static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 {
     int v = get_bits(gb, n);
     return get_bits1(gb) ? -v : v;
@@ -203,7 +353,7 @@ static av_always_inline int inv_recenter_nonneg(int v, int m)
 // differential forward probability updates
 static int update_prob(VP56RangeCoder *c, int p)
 {
-    static const int inv_map_table[MAX_PROB - 1] = {
+    static const int inv_map_table[255] = {
           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
@@ -222,7 +372,7 @@ static int update_prob(VP56RangeCoder *c, int p)
         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
-        252, 253,
+        252, 253, 253,
     };
     int d;
 
@@ -249,16 +399,76 @@ static int update_prob(VP56RangeCoder *c, int p)
         d = vp8_rac_get_uint(c, 5) + 32;
     } else {
         d = vp8_rac_get_uint(c, 7);
-        if (d >= 65) {
+        if (d >= 65)
             d = (d << 1) - 65 + vp8_rac_get(c);
-            d = av_clip(d, 0, MAX_PROB - 65 - 1);
-        }
         d += 64;
+        av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
+    }
+
+    return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
+                    255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
+}
+
+static int read_colorspace_details(AVCodecContext *avctx)
+{
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+    VP9Context *s = avctx->priv_data;
+    int bits = avctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
+
+    s->bpp_index = bits;
+    s->s.h.bpp = 8 + bits * 2;
+    s->bytesperpixel = (7 + s->s.h.bpp) >> 3;
+    avctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
+    if (avctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
+        static const enum AVPixelFormat pix_fmt_rgb[3] = {
+            AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
+        };
+        s->ss_h = s->ss_v = 0;
+        avctx->color_range = AVCOL_RANGE_JPEG;
+        s->pix_fmt = pix_fmt_rgb[bits];
+        if (avctx->profile & 1) {
+            if (get_bits1(&s->gb)) {
+                av_log(avctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
+                   avctx->profile);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
+            { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
+              { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
+            { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
+              { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
+            { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
+              { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
+        };
+        avctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
+        if (avctx->profile & 1) {
+            s->ss_h = get_bits1(&s->gb);
+            s->ss_v = get_bits1(&s->gb);
+            s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
+            if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
+                av_log(avctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
+                       avctx->profile);
+                return AVERROR_INVALIDDATA;
+            } else if (get_bits1(&s->gb)) {
+                av_log(avctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
+                       avctx->profile);
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            s->ss_h = s->ss_v = 1;
+            s->pix_fmt = pix_fmt_for_ss[bits][1][1];
+        }
     }
 
-    return p <= 128
-           ?   1 + inv_recenter_nonneg(inv_map_table[d], p - 1)
-           : 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
+    return 0;
 }
 
 static int decode_frame_header(AVCodecContext *avctx,
@@ -278,273 +488,338 @@ static int decode_frame_header(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Invalid frame marker\n");
         return AVERROR_INVALIDDATA;
     }
-    s->profile = get_bits1(&s->gb);
-    if (get_bits1(&s->gb)) { // reserved bit
-        av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
+    avctx->profile  = get_bits1(&s->gb);
+    avctx->profile |= get_bits1(&s->gb) << 1;
+    if (avctx->profile == 3) avctx->profile += get_bits1(&s->gb);
+    if (avctx->profile > 3) {
+        av_log(avctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", avctx->profile);
         return AVERROR_INVALIDDATA;
     }
+    s->s.h.profile = avctx->profile;
     if (get_bits1(&s->gb)) {
         *ref = get_bits(&s->gb, 3);
         return 0;
     }
 
-    s->last_keyframe = s->keyframe;
-    s->keyframe      = !get_bits1(&s->gb);
+    s->last_keyframe  = s->s.h.keyframe;
+    s->s.h.keyframe   = !get_bits1(&s->gb);
 
-    last_invisible = s->invisible;
-    s->invisible   = !get_bits1(&s->gb);
-    s->errorres    = get_bits1(&s->gb);
-    s->use_last_frame_mvs = !s->errorres && !last_invisible;
+    last_invisible   = s->s.h.invisible;
+    s->s.h.invisible = !get_bits1(&s->gb);
+    s->s.h.errorres  = get_bits1(&s->gb);
+    s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
 
-    if (s->keyframe) {
+    if (s->s.h.keyframe) {
         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
             av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
             return AVERROR_INVALIDDATA;
         }
-        s->colorspace = get_bits(&s->gb, 3);
-        if (s->colorspace == 7) { // RGB = profile 1
-            av_log(avctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->fullrange = get_bits1(&s->gb);
-
-        // subsampling bits
-        if (s->profile == 1 || s->profile == 3) {
-            s->sub_x = get_bits1(&s->gb);
-            s->sub_y = get_bits1(&s->gb);
-            if (s->sub_x && s->sub_y) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "4:2:0 color not supported in profile 1 or 3\n");
-                return AVERROR_INVALIDDATA;
-            }
-            if (get_bits1(&s->gb)) { // reserved bit
-                av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
-                return AVERROR_INVALIDDATA;
-            }
-        } else {
-            s->sub_x = s->sub_y = 1;
-        }
-        if (!s->sub_x || !s->sub_y) {
-            avpriv_report_missing_feature(avctx, "Subsampling %d:%d",
-                                          s->sub_x, s->sub_y);
-            return AVERROR_PATCHWELCOME;
-        }
-
-        s->refreshrefmask = 0xff;
+        if ((ret = read_colorspace_details(avctx)) < 0)
+            return ret;
+        // for profile 1, here follows the subsampling bits
+        s->s.h.refreshrefmask = 0xff;
         w = get_bits(&s->gb, 16) + 1;
         h = get_bits(&s->gb, 16) + 1;
         if (get_bits1(&s->gb)) // display size
             skip_bits(&s->gb, 32);
     } else {
-        s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
-        s->resetctx  = s->errorres ? 0 : get_bits(&s->gb, 2);
-        if (s->intraonly) {
+        s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
+        s->s.h.resetctx  = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
+        if (s->s.h.intraonly) {
             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
                 av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
                 return AVERROR_INVALIDDATA;
             }
-            s->refreshrefmask = get_bits(&s->gb, 8);
+            if (avctx->profile >= 1) {
+                if ((ret = read_colorspace_details(avctx)) < 0)
+                    return ret;
+            } else {
+                s->ss_h = s->ss_v = 1;
+                s->s.h.bpp = 8;
+                s->bpp_index = 0;
+                s->bytesperpixel = 1;
+                s->pix_fmt = AV_PIX_FMT_YUV420P;
+                avctx->colorspace = AVCOL_SPC_BT470BG;
+                avctx->color_range = AVCOL_RANGE_MPEG;
+            }
+            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
             w = get_bits(&s->gb, 16) + 1;
             h = get_bits(&s->gb, 16) + 1;
             if (get_bits1(&s->gb)) // display size
                 skip_bits(&s->gb, 32);
         } else {
-            s->refreshrefmask = get_bits(&s->gb, 8);
-            s->refidx[0]      = get_bits(&s->gb, 3);
-            s->signbias[0]    = get_bits1(&s->gb);
-            s->refidx[1]      = get_bits(&s->gb, 3);
-            s->signbias[1]    = get_bits1(&s->gb);
-            s->refidx[2]      = get_bits(&s->gb, 3);
-            s->signbias[2]    = get_bits1(&s->gb);
-            if (!s->refs[s->refidx[0]].f->buf[0] ||
-                !s->refs[s->refidx[1]].f->buf[0] ||
-                !s->refs[s->refidx[2]].f->buf[0]) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Not all references are available\n");
+            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
+            s->s.h.refidx[0]      = get_bits(&s->gb, 3);
+            s->s.h.signbias[0]    = get_bits1(&s->gb) && !s->s.h.errorres;
+            s->s.h.refidx[1]      = get_bits(&s->gb, 3);
+            s->s.h.signbias[1]    = get_bits1(&s->gb) && !s->s.h.errorres;
+            s->s.h.refidx[2]      = get_bits(&s->gb, 3);
+            s->s.h.signbias[2]    = get_bits1(&s->gb) && !s->s.h.errorres;
+            if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
+                !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
+                !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
+                av_log(avctx, AV_LOG_ERROR, "Not all references are available\n");
                 return AVERROR_INVALIDDATA;
             }
             if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[0]].f->width;
-                h = s->refs[s->refidx[0]].f->height;
+                w = s->s.refs[s->s.h.refidx[0]].f->width;
+                h = s->s.refs[s->s.h.refidx[0]].f->height;
             } else if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[1]].f->width;
-                h = s->refs[s->refidx[1]].f->height;
+                w = s->s.refs[s->s.h.refidx[1]].f->width;
+                h = s->s.refs[s->s.h.refidx[1]].f->height;
             } else if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[2]].f->width;
-                h = s->refs[s->refidx[2]].f->height;
+                w = s->s.refs[s->s.h.refidx[2]].f->width;
+                h = s->s.refs[s->s.h.refidx[2]].f->height;
             } else {
                 w = get_bits(&s->gb, 16) + 1;
                 h = get_bits(&s->gb, 16) + 1;
             }
+            // Note that in this code, "CUR_FRAME" is actually before we
+            // have formally allocated a frame, and thus actually represents
+            // the _last_ frame
+            s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
+                                       s->s.frames[CUR_FRAME].tf.f->height == h;
             if (get_bits1(&s->gb)) // display size
                 skip_bits(&s->gb, 32);
-            s->highprecisionmvs = get_bits1(&s->gb);
-            s->filtermode       = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
-                                  get_bits(&s->gb, 2);
-            s->allowcompinter   = s->signbias[0] != s->signbias[1] ||
-                                  s->signbias[0] != s->signbias[2];
-            if (s->allowcompinter) {
-                if (s->signbias[0] == s->signbias[1]) {
-                    s->fixcompref    = 2;
-                    s->varcompref[0] = 0;
-                    s->varcompref[1] = 1;
-                } else if (s->signbias[0] == s->signbias[2]) {
-                    s->fixcompref    = 1;
-                    s->varcompref[0] = 0;
-                    s->varcompref[1] = 2;
+            s->s.h.highprecisionmvs = get_bits1(&s->gb);
+            s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
+                                                  get_bits(&s->gb, 2);
+            s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
+                                  s->s.h.signbias[0] != s->s.h.signbias[2];
+            if (s->s.h.allowcompinter) {
+                if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
+                    s->s.h.fixcompref    = 2;
+                    s->s.h.varcompref[0] = 0;
+                    s->s.h.varcompref[1] = 1;
+                } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
+                    s->s.h.fixcompref    = 1;
+                    s->s.h.varcompref[0] = 0;
+                    s->s.h.varcompref[1] = 2;
                 } else {
-                    s->fixcompref    = 0;
-                    s->varcompref[0] = 1;
-                    s->varcompref[1] = 2;
+                    s->s.h.fixcompref    = 0;
+                    s->s.h.varcompref[0] = 1;
+                    s->s.h.varcompref[1] = 2;
                 }
             }
         }
     }
-
-    s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
-    s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
-    s->framectxid   = c = get_bits(&s->gb, 2);
+    s->s.h.refreshctx   = s->s.h.errorres ? 0 : get_bits1(&s->gb);
+    s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
+    s->s.h.framectxid   = c = get_bits(&s->gb, 2);
+    if (s->s.h.keyframe || s->s.h.intraonly)
+        s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes
 
     /* loopfilter header data */
-    s->filter.level = get_bits(&s->gb, 6);
-    sharp           = get_bits(&s->gb, 3);
-    /* If sharpness changed, reinit lim/mblim LUTs. if it didn't change,
-     * keep the old cache values since they are still valid. */
-    if (s->filter.sharpness != sharp)
-        memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
-    s->filter.sharpness = sharp;
-    if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
-        if (get_bits1(&s->gb)) {
+    if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
+        // reset loopfilter defaults
+        s->s.h.lf_delta.ref[0] = 1;
+        s->s.h.lf_delta.ref[1] = 0;
+        s->s.h.lf_delta.ref[2] = -1;
+        s->s.h.lf_delta.ref[3] = -1;
+        s->s.h.lf_delta.mode[0] = 0;
+        s->s.h.lf_delta.mode[1] = 0;
+        memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
+    }
+    s->s.h.filter.level = get_bits(&s->gb, 6);
+    sharp = get_bits(&s->gb, 3);
+    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
+    // the old cache values since they are still valid
+    if (s->s.h.filter.sharpness != sharp) {
+        for (i = 1; i <= 63; i++) {
+            int limit = i;
+
+            if (sharp > 0) {
+                limit >>= (sharp + 3) >> 2;
+                limit = FFMIN(limit, 9 - sharp);
+            }
+            limit = FFMAX(limit, 1);
+
+            s->filter_lut.lim_lut[i] = limit;
+            s->filter_lut.mblim_lut[i] = 2 * (i + 2) + limit;
+        }
+    }
+    s->s.h.filter.sharpness = sharp;
+    if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
+        if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
             for (i = 0; i < 4; i++)
                 if (get_bits1(&s->gb))
-                    s->lf_delta.ref[i] = get_bits_with_sign(&s->gb, 6);
+                    s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
             for (i = 0; i < 2; i++)
                 if (get_bits1(&s->gb))
-                    s->lf_delta.mode[i] = get_bits_with_sign(&s->gb, 6);
+                    s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
         }
-    } else {
-        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
     }
 
     /* quantization header data */
-    s->yac_qi      = get_bits(&s->gb, 8);
-    s->ydc_qdelta  = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->uvdc_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->uvac_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
-                     s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
+    s->s.h.yac_qi      = get_bits(&s->gb, 8);
+    s->s.h.ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->s.h.lossless    = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
+                       s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
+    if (s->s.h.lossless)
+        avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
 
     /* segmentation header info */
-    if ((s->segmentation.enabled = get_bits1(&s->gb))) {
-        if ((s->segmentation.update_map = get_bits1(&s->gb))) {
+    if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
+        if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
             for (i = 0; i < 7; i++)
-                s->prob.seg[i] = get_bits1(&s->gb) ?
+                s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
                                  get_bits(&s->gb, 8) : 255;
-            if ((s->segmentation.temporal = get_bits1(&s->gb)))
+            if ((s->s.h.segmentation.temporal = get_bits1(&s->gb)))
                 for (i = 0; i < 3; i++)
-                    s->prob.segpred[i] = get_bits1(&s->gb) ?
+                    s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
                                          get_bits(&s->gb, 8) : 255;
         }
 
         if (get_bits1(&s->gb)) {
-            s->segmentation.absolute_vals = get_bits1(&s->gb);
+            s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
             for (i = 0; i < 8; i++) {
-                if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].q_val = get_bits_with_sign(&s->gb, 8);
-                if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].lf_val = get_bits_with_sign(&s->gb, 6);
-                if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
-                s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
+                if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
+                    s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
+                if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
+                    s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
+                if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
+                    s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
+                s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
             }
         }
-    } else {
-        s->segmentation.feat[0].q_enabled    = 0;
-        s->segmentation.feat[0].lf_enabled   = 0;
-        s->segmentation.feat[0].skip_enabled = 0;
-        s->segmentation.feat[0].ref_enabled  = 0;
     }
 
     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
-    for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
+    for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
         int qyac, qydc, quvac, quvdc, lflvl, sh;
 
-        if (s->segmentation.feat[i].q_enabled) {
-            if (s->segmentation.absolute_vals)
-                qyac = s->segmentation.feat[i].q_val;
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
+            if (s->s.h.segmentation.absolute_vals)
+                qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
             else
-                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
+                qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
         } else {
-            qyac = s->yac_qi;
+            qyac  = s->s.h.yac_qi;
         }
-        qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
-        quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
-        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
+        qydc  = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
+        quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
+        quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
         qyac  = av_clip_uintp2(qyac, 8);
 
-        s->segmentation.feat[i].qmul[0][0] = ff_vp9_dc_qlookup[qydc];
-        s->segmentation.feat[i].qmul[0][1] = ff_vp9_ac_qlookup[qyac];
-        s->segmentation.feat[i].qmul[1][0] = ff_vp9_dc_qlookup[quvdc];
-        s->segmentation.feat[i].qmul[1][1] = ff_vp9_ac_qlookup[quvac];
+        s->s.h.segmentation.feat[i].qmul[0][0] = ff_vp9_dc_qlookup[s->bpp_index][qydc];
+        s->s.h.segmentation.feat[i].qmul[0][1] = ff_vp9_ac_qlookup[s->bpp_index][qyac];
+        s->s.h.segmentation.feat[i].qmul[1][0] = ff_vp9_dc_qlookup[s->bpp_index][quvdc];
+        s->s.h.segmentation.feat[i].qmul[1][1] = ff_vp9_ac_qlookup[s->bpp_index][quvac];
 
-        sh = s->filter.level >= 32;
-        if (s->segmentation.feat[i].lf_enabled) {
-            if (s->segmentation.absolute_vals)
-                lflvl = s->segmentation.feat[i].lf_val;
+        sh = s->s.h.filter.level >= 32;
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
+            if (s->s.h.segmentation.absolute_vals)
+                lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
             else
-                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
+                lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
         } else {
-            lflvl = s->filter.level;
+            lflvl  = s->s.h.filter.level;
         }
-        s->segmentation.feat[i].lflvl[0][0] =
-        s->segmentation.feat[i].lflvl[0][1] =
-            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
-        for (j = 1; j < 4; j++) {
-            s->segmentation.feat[i].lflvl[j][0] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[0]) << sh), 6);
-            s->segmentation.feat[i].lflvl[j][1] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[1]) << sh), 6);
+        if (s->s.h.lf_delta.enabled) {
+            s->s.h.segmentation.feat[i].lflvl[0][0] =
+            s->s.h.segmentation.feat[i].lflvl[0][1] =
+                av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
+            for (j = 1; j < 4; j++) {
+                s->s.h.segmentation.feat[i].lflvl[j][0] =
+                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
+                                             s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
+                s->s.h.segmentation.feat[i].lflvl[j][1] =
+                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
+                                             s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
+            }
+        } else {
+            memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
+                   sizeof(s->s.h.segmentation.feat[i].lflvl));
         }
     }
 
     /* tiling info */
     if ((ret = update_size(avctx, w, h)) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Failed to initialize decoder for %dx%d\n", w, h);
+        av_log(avctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
+               w, h, s->pix_fmt);
         return ret;
     }
-    for (s->tiling.log2_tile_cols = 0;
-         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
-         s->tiling.log2_tile_cols++) ;
+    for (s->s.h.tiling.log2_tile_cols = 0;
+         s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
+         s->s.h.tiling.log2_tile_cols++) ;
     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
     max = FFMAX(0, max - 1);
-    while (max > s->tiling.log2_tile_cols) {
+    while (max > s->s.h.tiling.log2_tile_cols) {
         if (get_bits1(&s->gb))
-            s->tiling.log2_tile_cols++;
+            s->s.h.tiling.log2_tile_cols++;
         else
             break;
     }
-    s->tiling.log2_tile_rows = decode012(&s->gb);
-    s->tiling.tile_rows      = 1 << s->tiling.log2_tile_rows;
-    if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
-        s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
-        s->c_b              = av_fast_realloc(s->c_b, &s->c_b_size,
-                                              sizeof(VP56RangeCoder) *
-                                              s->tiling.tile_cols);
-        if (!s->c_b) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Ran out of memory during range coder init\n");
+    s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
+    s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
+    if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
+        int n_range_coders;
+        VP56RangeCoder *rc;
+
+        if (s->td) {
+            for (i = 0; i < s->active_tile_cols; i++) {
+                av_free(s->td[i].b_base);
+                av_free(s->td[i].block_base);
+            }
+            av_free(s->td);
+        }
+
+        s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
+        vp9_free_entries(avctx);
+        s->active_tile_cols = avctx->active_thread_type == FF_THREAD_SLICE ?
+                              s->s.h.tiling.tile_cols : 1;
+        vp9_alloc_entries(avctx, s->sb_rows);
+        if (avctx->active_thread_type == FF_THREAD_SLICE) {
+            n_range_coders = 4; // max_tile_rows
+        } else {
+            n_range_coders = s->s.h.tiling.tile_cols;
+        }
+        s->td = av_mallocz_array(s->active_tile_cols, sizeof(VP9TileData) +
+                                 n_range_coders * sizeof(VP56RangeCoder));
+        if (!s->td)
             return AVERROR(ENOMEM);
+        rc = (VP56RangeCoder *) &s->td[s->active_tile_cols];
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].s = s;
+            s->td[i].c_b = rc;
+            rc += n_range_coders;
         }
     }
 
-    if (s->keyframe || s->errorres || s->intraonly) {
-        s->prob_ctx[0].p =
-        s->prob_ctx[1].p =
-        s->prob_ctx[2].p =
-        s->prob_ctx[3].p = ff_vp9_default_probs;
+    /* check reference frames */
+    if (!s->s.h.keyframe && !s->s.h.intraonly) {
+        for (i = 0; i < 3; i++) {
+            AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
+            int refw = ref->width, refh = ref->height;
+
+            if (ref->format != avctx->pix_fmt) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Ref pixfmt (%s) did not match current frame (%s)",
+                       av_get_pix_fmt_name(ref->format),
+                       av_get_pix_fmt_name(avctx->pix_fmt));
+                return AVERROR_INVALIDDATA;
+            } else if (refw == w && refh == h) {
+                s->mvscale[i][0] = s->mvscale[i][1] = 0;
+            } else {
+                if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
+                    av_log(avctx, AV_LOG_ERROR,
+                           "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
+                           refw, refh, w, h);
+                    return AVERROR_INVALIDDATA;
+                }
+                s->mvscale[i][0] = (refw << 14) / w;
+                s->mvscale[i][1] = (refh << 14) / h;
+                s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
+                s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
+            }
+        }
+    }
+
+    if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
+        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
+                           s->prob_ctx[3].p = ff_vp9_default_probs;
         memcpy(s->prob_ctx[0].coef, ff_vp9_default_coef_probs,
                sizeof(ff_vp9_default_coef_probs));
         memcpy(s->prob_ctx[1].coef, ff_vp9_default_coef_probs,
@@ -553,26 +828,38 @@ static int decode_frame_header(AVCodecContext *avctx,
                sizeof(ff_vp9_default_coef_probs));
         memcpy(s->prob_ctx[3].coef, ff_vp9_default_coef_probs,
                sizeof(ff_vp9_default_coef_probs));
+    } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
+        s->prob_ctx[c].p = ff_vp9_default_probs;
+        memcpy(s->prob_ctx[c].coef, ff_vp9_default_coef_probs,
+               sizeof(ff_vp9_default_coef_probs));
     }
 
     // next 16 bits is size of the rest of the header (arith-coded)
-    size2 = get_bits(&s->gb, 16);
+    s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
+    s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
+
     data2 = align_get_bits(&s->gb);
     if (size2 > size - (data2 - data)) {
         av_log(avctx, AV_LOG_ERROR, "Invalid compressed header size\n");
         return AVERROR_INVALIDDATA;
     }
-    ff_vp56_init_range_decoder(&s->c, data2, size2);
+    ret = ff_vp56_init_range_decoder(&s->c, data2, size2);
+    if (ret < 0)
+        return ret;
+
     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
         av_log(avctx, AV_LOG_ERROR, "Marker bit was set\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->keyframe || s->intraonly)
-        memset(s->counts.coef, 0,
-               sizeof(s->counts.coef) + sizeof(s->counts.eob));
-    else
-        memset(&s->counts, 0, sizeof(s->counts));
+    for (i = 0; i < s->active_tile_cols; i++) {
+        if (s->s.h.keyframe || s->s.h.intraonly) {
+            memset(s->td[i].counts.coef, 0, sizeof(s->td[0].counts.coef));
+            memset(s->td[i].counts.eob,  0, sizeof(s->td[0].counts.eob));
+        } else {
+            memset(&s->td[i].counts, 0, sizeof(s->td[0].counts));
+        }
+    }
 
     /* FIXME is it faster to not copy here, but do it down in the fw updates
      * as explicit copies if the fw update is missing (and skip the copy upon
@@ -580,14 +867,14 @@ static int decode_frame_header(AVCodecContext *avctx,
     s->prob.p = s->prob_ctx[c].p;
 
     // txfm updates
-    if (s->lossless) {
-        s->txfmmode = TX_4X4;
+    if (s->s.h.lossless) {
+        s->s.h.txfmmode = TX_4X4;
     } else {
-        s->txfmmode = vp8_rac_get_uint(&s->c, 2);
-        if (s->txfmmode == 3)
-            s->txfmmode += vp8_rac_get(&s->c);
+        s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
+        if (s->s.h.txfmmode == 3)
+            s->s.h.txfmmode += vp8_rac_get(&s->c);
 
-        if (s->txfmmode == TX_SWITCHABLE) {
+        if (s->s.h.txfmmode == TX_SWITCHABLE) {
             for (i = 0; i < 2; i++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
@@ -622,7 +909,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                                 else
                                     p[n] = r[n];
                             }
-                            p[3] = 0;
+                            memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
                         }
         } else {
             for (j = 0; j < 2; j++)
@@ -634,10 +921,10 @@ static int decode_frame_header(AVCodecContext *avctx,
                             if (m > 3 && l == 0) // dc only has 3 pt
                                 break;
                             memcpy(p, r, 3);
-                            p[3] = 0;
+                            memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
                         }
         }
-        if (s->txfmmode == i)
+        if (s->s.h.txfmmode == i)
             break;
     }
 
@@ -645,14 +932,14 @@ static int decode_frame_header(AVCodecContext *avctx,
     for (i = 0; i < 3; i++)
         if (vp56_rac_get_prob_branchy(&s->c, 252))
             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
-    if (!s->keyframe && !s->intraonly) {
+    if (!s->s.h.keyframe && !s->s.h.intraonly) {
         for (i = 0; i < 7; i++)
             for (j = 0; j < 3; j++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.mv_mode[i][j] =
                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 
-        if (s->filtermode == FILTER_SWITCHABLE)
+        if (s->s.h.filtermode == FILTER_SWITCHABLE)
             for (i = 0; i < 4; i++)
                 for (j = 0; j < 2; j++)
                     if (vp56_rac_get_prob_branchy(&s->c, 252))
@@ -663,20 +950,20 @@ static int decode_frame_header(AVCodecContext *avctx,
             if (vp56_rac_get_prob_branchy(&s->c, 252))
                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 
-        if (s->allowcompinter) {
-            s->comppredmode = vp8_rac_get(&s->c);
-            if (s->comppredmode)
-                s->comppredmode += vp8_rac_get(&s->c);
-            if (s->comppredmode == PRED_SWITCHABLE)
+        if (s->s.h.allowcompinter) {
+            s->s.h.comppredmode = vp8_rac_get(&s->c);
+            if (s->s.h.comppredmode)
+                s->s.h.comppredmode += vp8_rac_get(&s->c);
+            if (s->s.h.comppredmode == PRED_SWITCHABLE)
                 for (i = 0; i < 5; i++)
                     if (vp56_rac_get_prob_branchy(&s->c, 252))
                         s->prob.p.comp[i] =
                             update_prob(&s->c, s->prob.p.comp[i]);
         } else {
-            s->comppredmode = PRED_SINGLEREF;
+            s->s.h.comppredmode = PRED_SINGLEREF;
         }
 
-        if (s->comppredmode != PRED_COMPREF) {
+        if (s->s.h.comppredmode != PRED_COMPREF) {
             for (i = 0; i < 5; i++) {
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.single_ref[i][0] =
@@ -687,7 +974,7 @@ static int decode_frame_header(AVCodecContext *avctx,
             }
         }
 
-        if (s->comppredmode != PRED_SINGLEREF) {
+        if (s->s.h.comppredmode != PRED_SINGLEREF) {
             for (i = 0; i < 5; i++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.comp_ref[i] =
@@ -746,7 +1033,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
         }
 
-        if (s->highprecisionmvs) {
+        if (s->s.h.highprecisionmvs) {
             for (i = 0; i < 2; i++) {
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.mv_comp[i].class0_hp =
@@ -762,672 +1049,635 @@ static int decode_frame_header(AVCodecContext *avctx,
     return (data2 - data) + size2;
 }
 
-static int decode_subblock(AVCodecContext *avctx, int row, int col,
-                           VP9Filter *lflvl,
-                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+static void decode_sb(VP9TileData *td, int row, int col, VP9Filter *lflvl,
+                      ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
 {
-    VP9Context *s = avctx->priv_data;
-    AVFrame    *f = s->frames[CUR_FRAME].tf.f;
-    int c = ((s->above_partition_ctx[col]       >> (3 - bl)) & 1) |
-            (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
-    int ret;
-    const uint8_t *p = s->keyframe ? ff_vp9_default_kf_partition_probs[bl][c]
-                                   : s->prob.p.partition[bl][c];
+    const VP9Context *s = td->s;
+    int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
+            (((td->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
+    const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? ff_vp9_default_kf_partition_probs[bl][c] :
+                                                     s->prob.p.partition[bl][c];
     enum BlockPartition bp;
     ptrdiff_t hbs = 4 >> bl;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
+    int bytesperpixel = s->bytesperpixel;
 
     if (bl == BL_8X8) {
-        bp  = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
-        ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
-    } else if (col + hbs < s->cols) {
-        if (row + hbs < s->rows) {
-            bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+        bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
+    } else if (col + hbs < s->cols) { // FIXME why not <=?
+        if (row + hbs < s->rows) { // FIXME why not <=?
+            bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
             switch (bp) {
             case PARTITION_NONE:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_H:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
-                if (!ret) {
-                    yoff  += hbs * 8 * f->linesize[0];
-                    uvoff += hbs * 4 * f->linesize[1];
-                    ret    = ff_vp9_decode_block(avctx, row + hbs, col, lflvl,
-                                                 yoff, uvoff, bl, bp);
-                }
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_V:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
-                if (!ret) {
-                    yoff  += hbs * 8;
-                    uvoff += hbs * 4;
-                    ret    = ff_vp9_decode_block(avctx, row, col + hbs, lflvl,
-                                                 yoff, uvoff, bl, bp);
-                }
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
+                yoff  += hbs * 8 * bytesperpixel;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+                ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_SPLIT:
-                ret = decode_subblock(avctx, row, col, lflvl,
-                                      yoff, uvoff, bl + 1);
-                if (!ret) {
-                    ret = decode_subblock(avctx, row, col + hbs, lflvl,
-                                          yoff + 8 * hbs, uvoff + 4 * hbs,
-                                          bl + 1);
-                    if (!ret) {
-                        yoff  += hbs * 8 * f->linesize[0];
-                        uvoff += hbs * 4 * f->linesize[1];
-                        ret    = decode_subblock(avctx, row + hbs, col, lflvl,
-                                                 yoff, uvoff, bl + 1);
-                        if (!ret) {
-                            ret = decode_subblock(avctx, row + hbs, col + hbs,
-                                                  lflvl, yoff + 8 * hbs,
-                                                  uvoff + 4 * hbs, bl + 1);
-                        }
-                    }
-                }
+                decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(td, row, col + hbs, lflvl,
+                          yoff + 8 * hbs * bytesperpixel,
+                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(td, row + hbs, col + hbs, lflvl,
+                          yoff + 8 * hbs * bytesperpixel,
+                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 break;
             default:
-                av_log(avctx, AV_LOG_ERROR, "Unexpected partition %d.", bp);
-                return AVERROR_INVALIDDATA;
+                av_assert0(0);
             }
-        } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
-            bp  = PARTITION_SPLIT;
-            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            if (!ret)
-                ret = decode_subblock(avctx, row, col + hbs, lflvl,
-                                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+        } else if (vp56_rac_get_prob_branchy(td->c, p[1])) {
+            bp = PARTITION_SPLIT;
+            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row, col + hbs, lflvl,
+                      yoff + 8 * hbs * bytesperpixel,
+                      uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
         } else {
-            bp  = PARTITION_H;
-            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                      bl, bp);
+            bp = PARTITION_H;
+            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
         }
-    } else if (row + hbs < s->rows) {
-        if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
-            bp  = PARTITION_SPLIT;
-            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            if (!ret) {
-                yoff  += hbs * 8 * f->linesize[0];
-                uvoff += hbs * 4 * f->linesize[1];
-                ret    = decode_subblock(avctx, row + hbs, col, lflvl,
-                                         yoff, uvoff, bl + 1);
-            }
+    } else if (row + hbs < s->rows) { // FIXME why not <=?
+        if (vp56_rac_get_prob_branchy(td->c, p[2])) {
+            bp = PARTITION_SPLIT;
+            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         } else {
-            bp  = PARTITION_V;
-            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                      bl, bp);
+            bp = PARTITION_V;
+            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
         }
     } else {
-        bp  = PARTITION_SPLIT;
-        ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        bp = PARTITION_SPLIT;
+        decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
     }
-    s->counts.partition[bl][c][bp]++;
-
-    return ret;
+    td->counts.partition[bl][c][bp]++;
 }
 
-static int decode_superblock_mem(AVCodecContext *avctx, int row, int col, struct VP9Filter *lflvl,
-                                 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+static void decode_sb_mem(VP9TileData *td, int row, int col, VP9Filter *lflvl,
+                          ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
 {
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
+    const VP9Context *s = td->s;
+    VP9Block *b = td->b;
     ptrdiff_t hbs = 4 >> bl;
-    AVFrame *f = s->frames[CUR_FRAME].tf.f;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
-    int res;
+    int bytesperpixel = s->bytesperpixel;
 
     if (bl == BL_8X8) {
         av_assert2(b->bl == BL_8X8);
-        res = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
-    } else if (s->b->bl == bl) {
-        if ((res = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp)) < 0)
-            return res;
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+    } else if (td->b->bl == bl) {
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
         if (b->bp == PARTITION_H && row + hbs < s->rows) {
             yoff  += hbs * 8 * y_stride;
-            uvoff += hbs * 4 * uv_stride;
-            res = ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
-            yoff  += hbs * 8;
-            uvoff += hbs * 4;
-            res = ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
+            yoff  += hbs * 8 * bytesperpixel;
+            uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+            ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
         }
     } else {
-        if ((res = decode_superblock_mem(avctx, row, col, lflvl, yoff, uvoff, bl + 1)) < 0)
-            return res;
+        decode_sb_mem(td, row, col, lflvl, yoff, uvoff, bl + 1);
         if (col + hbs < s->cols) { // FIXME why not <=?
             if (row + hbs < s->rows) {
-                if ((res = decode_superblock_mem(avctx, row, col + hbs, lflvl, yoff + 8 * hbs,
-                                                 uvoff + 4 * hbs, bl + 1)) < 0)
-                    return res;
+                decode_sb_mem(td, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 yoff  += hbs * 8 * y_stride;
-                uvoff += hbs * 4 * uv_stride;
-                if ((res = decode_superblock_mem(avctx, row + hbs, col, lflvl, yoff,
-                                                 uvoff, bl + 1)) < 0)
-                    return res;
-                res = decode_superblock_mem(avctx, row + hbs, col + hbs, lflvl,
-                                            yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(td, row + hbs, col + hbs, lflvl,
+                              yoff + 8 * hbs * bytesperpixel,
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
             } else {
-                yoff  += hbs * 8;
-                uvoff += hbs * 4;
-                res = decode_superblock_mem(avctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
+                yoff  += hbs * 8 * bytesperpixel;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+                decode_sb_mem(td, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
             }
         } else if (row + hbs < s->rows) {
             yoff  += hbs * 8 * y_stride;
-            uvoff += hbs * 4 * uv_stride;
-            res = decode_superblock_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         }
     }
+}
+
+static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+{
+    int sb_start = ( idx      * n) >> log2_n;
+    int sb_end   = ((idx + 1) * n) >> log2_n;
+    *start = FFMIN(sb_start, n) << 3;
+    *end   = FFMIN(sb_end,   n) << 3;
+}
+
+static void free_buffers(VP9Context *s)
+{
+    int i;
+
+    av_freep(&s->intra_pred_data[0]);
+    for (i = 0; i < s->active_tile_cols; i++) {
+        av_freep(&s->td[i].b_base);
+        av_freep(&s->td[i].block_base);
+    }
+}
+
+static av_cold int vp9_decode_free(AVCodecContext *avctx)
+{
+    VP9Context *s = avctx->priv_data;
+    int i;
 
-    return res;
+    for (i = 0; i < 3; i++) {
+        if (s->s.frames[i].tf.f->buf[0])
+            vp9_frame_unref(avctx, &s->s.frames[i]);
+        av_frame_free(&s->s.frames[i].tf.f);
+    }
+    for (i = 0; i < 8; i++) {
+        if (s->s.refs[i].f->buf[0])
+            ff_thread_release_buffer(avctx, &s->s.refs[i]);
+        av_frame_free(&s->s.refs[i].f);
+        if (s->next_refs[i].f->buf[0])
+            ff_thread_release_buffer(avctx, &s->next_refs[i]);
+        av_frame_free(&s->next_refs[i].f);
+    }
+
+    free_buffers(s);
+    vp9_free_entries(avctx);
+    av_freep(&s->td);
+    return 0;
 }
 
-static void loopfilter_subblock(AVCodecContext *avctx, VP9Filter *lflvl,
-                                int row, int col,
-                                ptrdiff_t yoff, ptrdiff_t uvoff)
+static int decode_tiles(AVCodecContext *avctx,
+                        const uint8_t *data, int size)
 {
     VP9Context *s = avctx->priv_data;
-    AVFrame    *f = s->frames[CUR_FRAME].tf.f;
-    uint8_t *dst   = f->data[0] + yoff;
-    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
-    uint8_t *lvl = lflvl->level;
-    int y, x, p;
-
-    /* FIXME: In how far can we interleave the v/h loopfilter calls? E.g.
-     * if you think of them as acting on a 8x8 block max, we can interleave
-     * each v/h within the single x loop, but that only works if we work on
-     * 8 pixel blocks, and we won't always do that (we want at least 16px
-     * to use SSE2 optimizations, perhaps 32 for AVX2). */
-
-    // filter edges between columns, Y plane (e.g. block1 | block2)
-    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
-        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
-        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
-        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
-        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
-        unsigned hm  = hm1 | hm2 | hm13 | hm23;
-
-        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
-            if (hm1 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (col || x > 1) {
-                    if (hmask1[0] & x) {
-                        if (hmask2[0] & x) {
-                            av_assert2(l[8] == L);
-                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
-                        }
-                    } else if (hm2 & x) {
-                        L  = l[8];
-                        H |= (L >> 4) << 8;
-                        E |= s->filter.mblim_lut[L] << 8;
-                        I |= s->filter.lim_lut[L] << 8;
-                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
-                                               [!!(hmask2[1] & x)]
-                                               [0](ptr, ls_y, E, I, H);
-                    } else {
-                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                            [0](ptr, ls_y, E, I, H);
-                    }
-                }
-            } else if (hm2 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+    VP9TileData *td = &s->td[0];
+    int row, col, tile_row, tile_col, ret;
+    int bytesperpixel;
+    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    AVFrame *f;
+    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
 
-                if (col || x > 1) {
-                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
-                }
-            }
-            if (hm13 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (hm23 & x) {
-                    L  = l[8];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
-                }
-            } else if (hm23 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
+    bytesperpixel = s->bytesperpixel;
+
+    yoff = uvoff = 0;
+    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+        set_tile_offset(&tile_row_start, &tile_row_end,
+                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+
+        for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+            int64_t tile_size;
 
-                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
+            if (tile_col == s->s.h.tiling.tile_cols - 1 &&
+                tile_row == s->s.h.tiling.tile_rows - 1) {
+                tile_size = size;
+            } else {
+                tile_size = AV_RB32(data);
+                data += 4;
+                size -= 4;
+            }
+            if (tile_size > size) {
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return AVERROR_INVALIDDATA;
+            }
+            ret = ff_vp56_init_range_decoder(&td->c_b[tile_col], data, tile_size);
+            if (ret < 0)
+                return ret;
+            if (vp56_rac_get_prob_branchy(&td->c_b[tile_col], 128)) { // marker bit
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return AVERROR_INVALIDDATA;
             }
+            data += tile_size;
+            size -= tile_size;
         }
-    }
 
-    //                                          block1
-    // filter edges between rows, Y plane (e.g. ------)
-    //                                          block2
-    dst = f->data[0] + yoff;
-    lvl = lflvl->level;
-    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
-        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
-        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
-
-        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
-            if (row || y) {
-                if (vm & x) {
-                    int L = *l, H = L >> 4;
-                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                    if (vmask[0] & x) {
-                        if (vmask[0] & (x << 1)) {
-                            av_assert2(l[1] == L);
-                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
-                        }
-                    } else if (vm & (x << 1)) {
-                        L  = l[1];
-                        H |= (L >> 4) << 8;
-                        E |= s->filter.mblim_lut[L] << 8;
-                        I |= s->filter.lim_lut[L] << 8;
-                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                               [!!(vmask[1] & (x << 1))]
-                                               [1](ptr, ls_y, E, I, H);
+        for (row = tile_row_start; row < tile_row_end;
+             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+            VP9Filter *lflvl_ptr = s->lflvl;
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+            for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+                set_tile_offset(&tile_col_start, &tile_col_end,
+                                tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+                td->tile_col_start = tile_col_start;
+                if (s->pass != 2) {
+                    memset(td->left_partition_ctx, 0, 8);
+                    memset(td->left_skip_ctx, 0, 8);
+                    if (s->s.h.keyframe || s->s.h.intraonly) {
+                        memset(td->left_mode_ctx, DC_PRED, 16);
                     } else {
-                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                            [1](ptr, ls_y, E, I, H);
+                        memset(td->left_mode_ctx, NEARESTMV, 8);
                     }
-                } else if (vm & (x << 1)) {
-                    int L = l[1], H = L >> 4;
-                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                    memset(td->left_y_nnz_ctx, 0, 16);
+                    memset(td->left_uv_nnz_ctx, 0, 32);
+                    memset(td->left_segpred_ctx, 0, 8);
 
-                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
-                                        [1](ptr + 8, ls_y, E, I, H);
-                }
-            }
-            if (vm3 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (vm3 & (x << 1)) {
-                    L  = l[1];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
+                    td->c = &td->c_b[tile_col];
                 }
-            } else if (vm3 & (x << 1)) {
-                int L = l[1], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
-            }
-        }
-    }
+                for (col = tile_col_start;
+                     col < tile_col_end;
+                     col += 8, yoff2 += 64 * bytesperpixel,
+                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                    // FIXME integrate with lf code (i.e. zero after each
+                    // use, similar to invtxfm coefficients, or similar)
+                    if (s->pass != 1) {
+                        memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                    }
 
-    // same principle but for U/V planes
-    for (p = 0; p < 2; p++) {
-        lvl = lflvl->level;
-        dst = f->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
-            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
-            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
-            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
-            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
-
-            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
-                if (col || x > 1) {
-                    if (hm1 & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
-
-                        if (hmask1[0] & x) {
-                            if (hmask2[0] & x) {
-                                av_assert2(l[16] == L);
-                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (hm2 & x) {
-                            L  = l[16];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
-                                                   [!!(hmask2[1] & x)]
-                                                   [0](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                                [0](ptr, ls_uv, E, I, H);
+                    if (s->pass == 2) {
+                        decode_sb_mem(td, row, col, lflvl_ptr,
+                                      yoff2, uvoff2, BL_64X64);
+                    } else {
+                        if (vpX_rac_is_end(td->c)) {
+                            return AVERROR_INVALIDDATA;
                         }
-                    } else if (hm2 & x) {
-                        int L = l[16], H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
-
-                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
+                        decode_sb(td, row, col, lflvl_ptr,
+                                  yoff2, uvoff2, BL_64X64);
                     }
                 }
-                if (x & 0xAA)
-                    l += 2;
             }
-        }
-        lvl = lflvl->level;
-        dst = f->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
-            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
-            unsigned vm = vmask[0] | vmask[1] | vmask[2];
-
-            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
-                if (row || y) {
-                    if (vm & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
-
-                        if (vmask[0] & x) {
-                            if (vmask[0] & (x << 2)) {
-                                av_assert2(l[2] == L);
-                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (vm & (x << 2)) {
-                            L  = l[2];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                                   [!!(vmask[1] & (x << 2))]
-                                                   [1](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                                [1](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (vm & (x << 2)) {
-                        int L = l[2], H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
 
-                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
-                                            [1](ptr + 8, ls_uv, E, I, H);
-                    }
+            if (s->pass == 1)
+                continue;
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0],
+                       f->data[0] + yoff + 63 * ls_y,
+                       8 * s->cols * bytesperpixel);
+                memcpy(s->intra_pred_data[1],
+                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * s->cols * bytesperpixel >> s->ss_h);
+                memcpy(s->intra_pred_data[2],
+                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * s->cols * bytesperpixel >> s->ss_h);
+            }
+
+            // loopfilter one row
+            if (s->s.h.filter.level) {
+                yoff2 = yoff;
+                uvoff2 = uvoff;
+                lflvl_ptr = s->lflvl;
+                for (col = 0; col < s->cols;
+                     col += 8, yoff2 += 64 * bytesperpixel,
+                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                    ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
+                                         yoff2, uvoff2);
                 }
             }
-            if (y & 1)
-                lvl += 16;
+
+            // FIXME maybe we can make this more finegrained by running the
+            // loopfilter per-block instead of after each sbrow
+            // In fact that would also make intra pred left preparation easier?
+            ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
         }
     }
+    return 0;
 }
 
-static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+#if HAVE_THREADS
+static av_always_inline
+int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
+                              int threadnr)
 {
-    int sb_start =  (idx      * n) >> log2_n;
-    int sb_end   = ((idx + 1) * n) >> log2_n;
-    *start = FFMIN(sb_start, n) << 3;
-    *end   = FFMIN(sb_end,   n) << 3;
+    VP9Context *s = avctx->priv_data;
+    VP9TileData *td = &s->td[jobnr];
+    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
+    int bytesperpixel = s->bytesperpixel, row, col, tile_row;
+    unsigned tile_cols_len;
+    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    VP9Filter *lflvl_ptr_base;
+    AVFrame *f;
+
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
+
+    set_tile_offset(&tile_col_start, &tile_col_end,
+                    jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+    td->tile_col_start  = tile_col_start;
+    uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
+    yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
+    lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);
+
+    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+        set_tile_offset(&tile_row_start, &tile_row_end,
+                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+
+        td->c = &td->c_b[tile_row];
+        for (row = tile_row_start; row < tile_row_end;
+             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+            VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3);
+
+            memset(td->left_partition_ctx, 0, 8);
+            memset(td->left_skip_ctx, 0, 8);
+            if (s->s.h.keyframe || s->s.h.intraonly) {
+                memset(td->left_mode_ctx, DC_PRED, 16);
+            } else {
+                memset(td->left_mode_ctx, NEARESTMV, 8);
+            }
+            memset(td->left_y_nnz_ctx, 0, 16);
+            memset(td->left_uv_nnz_ctx, 0, 32);
+            memset(td->left_segpred_ctx, 0, 8);
+
+            for (col = tile_col_start;
+                 col < tile_col_end;
+                 col += 8, yoff2 += 64 * bytesperpixel,
+                 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                // FIXME integrate with lf code (i.e. zero after each
+                // use, similar to invtxfm coefficients, or similar)
+                memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                decode_sb(td, row, col, lflvl_ptr,
+                            yoff2, uvoff2, BL_64X64);
+            }
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            tile_cols_len = tile_col_end - tile_col_start;
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * bytesperpixel),
+                       f->data[0] + yoff + 63 * ls_y,
+                       8 * tile_cols_len * bytesperpixel);
+                memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
+                memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
+            }
+
+            vp9_report_tile_progress(s, row >> 3, 1);
+        }
+    }
+    return 0;
 }
 
-static int update_refs(AVCodecContext *avctx)
+static av_always_inline
+int loopfilter_proc(AVCodecContext *avctx)
 {
     VP9Context *s = avctx->priv_data;
-    int i, ret;
+    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
+    VP9Filter *lflvl_ptr;
+    int bytesperpixel = s->bytesperpixel, col, i;
+    AVFrame *f;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++)
-        if (s->refreshrefmask & (1 << i)) {
-            ff_thread_release_buffer(avctx, &s->refs[i]);
-            ret = ff_thread_ref_frame(&s->refs[i], &s->frames[CUR_FRAME].tf);
-            if (ret < 0)
-                return ret;
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
+
+    for (i = 0; i < s->sb_rows; i++) {
+        vp9_await_tile_progress(s, i, s->s.h.tiling.tile_cols);
+
+        if (s->s.h.filter.level) {
+            yoff = (ls_y * 64)*i;
+            uvoff =  (ls_uv * 64 >> s->ss_v)*i;
+            lflvl_ptr = s->lflvl+s->sb_cols*i;
+            for (col = 0; col < s->cols;
+                 col += 8, yoff += 64 * bytesperpixel,
+                 uvoff += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                ff_vp9_loopfilter_sb(avctx, lflvl_ptr, i << 3, col,
+                                     yoff, uvoff);
+            }
         }
-
+    }
     return 0;
 }
+#endif
 
-static int vp9_decode_frame(AVCodecContext *avctx, void *output,
+static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
                             int *got_frame, AVPacket *pkt)
 {
-    VP9Context *s = avctx->priv_data;
-    AVFrame      *frame = output;
     const uint8_t *data = pkt->data;
-    int            size = pkt->size;
+    int size = pkt->size;
+    VP9Context *s = avctx->priv_data;
+    int ret, i, j, ref;
+    int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
+                            (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
     AVFrame *f;
-    int ret, tile_row, tile_col, i, ref = -1, row, col;
-
-    s->setup_finished = 0;
 
-    ret = decode_frame_header(avctx, data, size, &ref);
-    if (ret < 0) {
+    if ((ret = decode_frame_header(avctx, data, size, &ref)) < 0) {
         return ret;
-    } else if (!ret) {
-        if (!s->refs[ref].f->buf[0]) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Requested reference %d not available\n", ref);
+    } else if (ret == 0) {
+        if (!s->s.refs[ref].f->buf[0]) {
+            av_log(avctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
             return AVERROR_INVALIDDATA;
         }
-
-        ret = av_frame_ref(frame, s->refs[ref].f);
-        if (ret < 0)
+        if ((ret = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
             return ret;
+        ((AVFrame *)frame)->pts = pkt->pts;
+#if FF_API_PKT_PTS
+FF_DISABLE_DEPRECATION_WARNINGS
+        ((AVFrame *)frame)->pkt_pts = pkt->pts;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+        ((AVFrame *)frame)->pkt_dts = pkt->dts;
+        for (i = 0; i < 8; i++) {
+            if (s->next_refs[i].f->buf[0])
+                ff_thread_release_buffer(avctx, &s->next_refs[i]);
+            if (s->s.refs[i].f->buf[0] &&
+                (ret = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
+                return ret;
+        }
         *got_frame = 1;
         return pkt->size;
     }
     data += ret;
     size -= ret;
 
-    vp9_frame_unref(avctx, &s->frames[LAST_FRAME]);
-    if (!s->keyframe && s->frames[CUR_FRAME].tf.f->buf[0]) {
-        ret = vp9_frame_ref(&s->frames[LAST_FRAME], &s->frames[CUR_FRAME]);
-        if (ret < 0)
+    if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
+        if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
+            vp9_frame_unref(avctx, &s->s.frames[REF_FRAME_SEGMAP]);
+        if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
+            (ret = vp9_frame_ref(avctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
             return ret;
     }
-
-    vp9_frame_unref(avctx, &s->frames[CUR_FRAME]);
-    ret = vp9_frame_alloc(avctx, &s->frames[CUR_FRAME]);
-    if (ret < 0)
+    if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
+        vp9_frame_unref(avctx, &s->s.frames[REF_FRAME_MVPAIR]);
+    if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
+        (ret = vp9_frame_ref(avctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
         return ret;
+    if (s->s.frames[CUR_FRAME].tf.f->buf[0])
+        vp9_frame_unref(avctx, &s->s.frames[CUR_FRAME]);
+    if ((ret = vp9_frame_alloc(avctx, &s->s.frames[CUR_FRAME])) < 0)
+        return ret;
+    f = s->s.frames[CUR_FRAME].tf.f;
+    f->key_frame = s->s.h.keyframe;
+    f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
+        (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
+         s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
+        vp9_frame_unref(avctx, &s->s.frames[REF_FRAME_SEGMAP]);
+    }
 
-    f = s->frames[CUR_FRAME].tf.f;
-    f->key_frame = s->keyframe;
-    f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
-
-    if (s->fullrange)
-        avctx->color_range = AVCOL_RANGE_JPEG;
-    else
-        avctx->color_range = AVCOL_RANGE_MPEG;
-
-    switch (s->colorspace) {
-    case 1: avctx->colorspace = AVCOL_SPC_BT470BG; break;
-    case 2: avctx->colorspace = AVCOL_SPC_BT709; break;
-    case 3: avctx->colorspace = AVCOL_SPC_SMPTE170M; break;
-    case 4: avctx->colorspace = AVCOL_SPC_SMPTE240M; break;
+    // ref frame setup
+    for (i = 0; i < 8; i++) {
+        if (s->next_refs[i].f->buf[0])
+            ff_thread_release_buffer(avctx, &s->next_refs[i]);
+        if (s->s.h.refreshrefmask & (1 << i)) {
+            ret = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
+        } else if (s->s.refs[i].f->buf[0]) {
+            ret = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
+        }
+        if (ret < 0)
+            return ret;
     }
 
-    s->pass = s->uses_2pass =
-        avctx->active_thread_type & FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
+    if (avctx->hwaccel) {
+        ret = avctx->hwaccel->start_frame(avctx, NULL, 0);
+        if (ret < 0)
+            return ret;
+        ret = avctx->hwaccel->decode_slice(avctx, pkt->data, pkt->size);
+        if (ret < 0)
+            return ret;
+        ret = avctx->hwaccel->end_frame(avctx);
+        if (ret < 0)
+            return ret;
+        goto finish;
+    }
 
-    if (s->refreshctx && s->parallelmode) {
+    // main tile decode loop
+    memset(s->above_partition_ctx, 0, s->cols);
+    memset(s->above_skip_ctx, 0, s->cols);
+    if (s->s.h.keyframe || s->s.h.intraonly) {
+        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
+    } else {
+        memset(s->above_mode_ctx, NEARESTMV, s->cols);
+    }
+    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
+    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
+    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
+    memset(s->above_segpred_ctx, 0, s->cols);
+    s->pass = s->s.frames[CUR_FRAME].uses_2pass =
+        avctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
+    if ((ret = update_block_buffers(avctx)) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Failed to allocate block buffers\n");
+        return ret;
+    }
+    if (s->s.h.refreshctx && s->s.h.parallelmode) {
         int j, k, l, m;
+
         for (i = 0; i < 4; i++) {
             for (j = 0; j < 2; j++)
                 for (k = 0; k < 2; k++)
                     for (l = 0; l < 6; l++)
                         for (m = 0; m < 6; m++)
-                            memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
+                            memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
                                    s->prob.coef[i][j][k][l][m], 3);
-            if (s->txfmmode == i)
+            if (s->s.h.txfmmode == i)
                 break;
         }
-        s->prob_ctx[s->framectxid].p = s->prob.p;
-    }
-    if ((s->parallelmode || !s->refreshctx) &&
-        avctx->active_thread_type & FF_THREAD_FRAME) {
+        s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
+        ff_thread_finish_setup(avctx);
+    } else if (!s->s.h.refreshctx) {
         ff_thread_finish_setup(avctx);
-        s->setup_finished = 1;
     }
 
-    // main tile decode loop
-    memset(s->above_partition_ctx, 0, s->cols);
-    memset(s->above_skip_ctx, 0, s->cols);
-    if (s->keyframe || s->intraonly)
-        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
-    else
-        memset(s->above_mode_ctx, NEARESTMV, s->cols);
-    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
-    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
-    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
-    memset(s->above_segpred_ctx, 0, s->cols);
+#if HAVE_THREADS
+    if (avctx->active_thread_type & FF_THREAD_SLICE) {
+        for (i = 0; i < s->sb_rows; i++)
+            atomic_store(&s->entries[i], 0);
+    }
+#endif
 
     do {
-        ptrdiff_t yoff = 0, uvoff = 0;
-        s->b          = s->b_base;
-        s->block      = s->block_base;
-        s->uvblock[0] = s->uvblock_base[0];
-        s->uvblock[1] = s->uvblock_base[1];
-        s->eob        = s->eob_base;
-        s->uveob[0]   = s->uveob_base[0];
-        s->uveob[1]   = s->uveob_base[1];
-
-        for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
-            set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
-                            tile_row, s->tiling.log2_tile_rows, s->sb_rows);
-
-            if (s->pass != 2) {
-                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].b = s->td[i].b_base;
+            s->td[i].block = s->td[i].block_base;
+            s->td[i].uvblock[0] = s->td[i].uvblock_base[0];
+            s->td[i].uvblock[1] = s->td[i].uvblock_base[1];
+            s->td[i].eob = s->td[i].eob_base;
+            s->td[i].uveob[0] = s->td[i].uveob_base[0];
+            s->td[i].uveob[1] = s->td[i].uveob_base[1];
+        }
+
+#if HAVE_THREADS
+        if (avctx->active_thread_type == FF_THREAD_SLICE) {
+            int tile_row, tile_col;
+
+            av_assert1(!s->pass);
+
+            for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                     int64_t tile_size;
 
-                    if (tile_col == s->tiling.tile_cols - 1 &&
-                        tile_row == s->tiling.tile_rows - 1) {
+                    if (tile_col == s->s.h.tiling.tile_cols - 1 &&
+                        tile_row == s->s.h.tiling.tile_rows - 1) {
                         tile_size = size;
                     } else {
                         tile_size = AV_RB32(data);
-                        data     += 4;
-                        size     -= 4;
-                    }
-                    if (tile_size > size) {
-                        ret = AVERROR_INVALIDDATA;
-                        goto fail;
-                    }
-                    ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
-                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
-                        ret = AVERROR_INVALIDDATA;
-                        goto fail;
+                        data += 4;
+                        size -= 4;
                     }
+                    if (tile_size > size)
+                        return AVERROR_INVALIDDATA;
+                    ret = ff_vp56_init_range_decoder(&s->td[tile_col].c_b[tile_row], data, tile_size);
+                    if (ret < 0)
+                        return ret;
+                    if (vp56_rac_get_prob_branchy(&s->td[tile_col].c_b[tile_row], 128)) // marker bit
+                        return AVERROR_INVALIDDATA;
                     data += tile_size;
                     size -= tile_size;
                 }
             }
 
-            for (row = s->tiling.tile_row_start;
-                 row < s->tiling.tile_row_end;
-                 row += 8, yoff += f->linesize[0] * 64,
-                 uvoff += f->linesize[1] * 32) {
-                VP9Filter *lflvl = s->lflvl;
-                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
-
-                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
-                    set_tile_offset(&s->tiling.tile_col_start,
-                                    &s->tiling.tile_col_end,
-                                    tile_col, s->tiling.log2_tile_cols, s->sb_cols);
-
-                    memset(s->left_partition_ctx, 0, 8);
-                    memset(s->left_skip_ctx, 0, 8);
-                    if (s->keyframe || s->intraonly)
-                        memset(s->left_mode_ctx, DC_PRED, 16);
-                    else
-                        memset(s->left_mode_ctx, NEARESTMV, 8);
-                    memset(s->left_y_nnz_ctx, 0, 16);
-                    memset(s->left_uv_nnz_ctx, 0, 16);
-                    memset(s->left_segpred_ctx, 0, 8);
-
-                    memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
-                    for (col = s->tiling.tile_col_start;
-                         col < s->tiling.tile_col_end;
-                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl++) {
-                        // FIXME integrate with lf code (i.e. zero after each
-                        // use, similar to invtxfm coefficients, or similar)
-                        if (s->pass != 1)
-                            memset(lflvl->mask, 0, sizeof(lflvl->mask));
-
-                        if (s->pass == 2) {
-                            ret = decode_superblock_mem(avctx, row, col, lflvl,
-                                                        yoff2, uvoff2, BL_64X64);
-                        } else {
-                            ret = decode_subblock(avctx, row, col, lflvl,
-                                                  yoff2, uvoff2, BL_64X64);
-                        }
-                        if (ret < 0)
-                            goto fail;
-                    }
-                    if (s->pass != 2)
-                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
-                }
-
-                if (s->pass == 1)
-                    continue;
-
-                // backup pre-loopfilter reconstruction data for intra
-                // prediction of next row of sb64s
-                if (row + 8 < s->rows) {
-                    memcpy(s->intra_pred_data[0],
-                           f->data[0] + yoff +
-                           63 * f->linesize[0],
-                           8 * s->cols);
-                    memcpy(s->intra_pred_data[1],
-                           f->data[1] + uvoff +
-                           31 * f->linesize[1],
-                           4 * s->cols);
-                    memcpy(s->intra_pred_data[2],
-                           f->data[2] + uvoff +
-                           31 * f->linesize[2],
-                           4 * s->cols);
-                }
-
-                // loopfilter one row
-                if (s->filter.level) {
-                    yoff2  = yoff;
-                    uvoff2 = uvoff;
-                    lflvl  = s->lflvl;
-                    for (col = 0; col < s->cols;
-                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl++)
-                        loopfilter_subblock(avctx, lflvl, row, col, yoff2, uvoff2);
-                }
-
-                // FIXME maybe we can make this more finegrained by running the
-                // loopfilter per-block instead of after each sbrow
-                // In fact that would also make intra pred left preparation easier?
-                ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
+            ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols);
+        } else
+#endif
+        {
+            ret = decode_tiles(avctx, data, size);
+            if (ret < 0) {
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return ret;
             }
         }
 
-        if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
+        // Sum all counts fields into td[0].counts for tile threading
+        if (avctx->active_thread_type == FF_THREAD_SLICE)
+            for (i = 1; i < s->s.h.tiling.tile_cols; i++)
+                for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
+                    ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];
+
+        if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
             ff_vp9_adapt_probs(s);
-            if (avctx->active_thread_type & FF_THREAD_FRAME) {
-                ff_thread_finish_setup(avctx);
-                s->setup_finished = 1;
-            }
+            ff_thread_finish_setup(avctx);
         }
     } while (s->pass++ == 1);
-fail:
-    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
-    if (ret < 0)
-        return ret;
+    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
 
+finish:
     // ref frame setup
-    if (!s->setup_finished) {
-        ret = update_refs(avctx);
-        if (ret < 0)
+    for (i = 0; i < 8; i++) {
+        if (s->s.refs[i].f->buf[0])
+            ff_thread_release_buffer(avctx, &s->s.refs[i]);
+        if (s->next_refs[i].f->buf[0] &&
+            (ret = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
             return ret;
     }
 
-    if (!s->invisible) {
-        av_frame_unref(frame);
-        ret = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f);
-        if (ret < 0)
+    if (!s->s.h.invisible) {
+        if ((ret = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
             return ret;
         *got_frame = 1;
     }
@@ -1435,25 +1685,39 @@ fail:
     return pkt->size;
 }
 
-static av_cold int vp9_decode_free(AVCodecContext *avctx)
+static void vp9_decode_flush(AVCodecContext *avctx)
 {
     VP9Context *s = avctx->priv_data;
     int i;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
-        vp9_frame_unref(avctx, &s->frames[i]);
-        av_frame_free(&s->frames[i].tf.f);
-    }
+    for (i = 0; i < 3; i++)
+        vp9_frame_unref(avctx, &s->s.frames[i]);
+    for (i = 0; i < 8; i++)
+        ff_thread_release_buffer(avctx, &s->s.refs[i]);
+}
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++) {
-        ff_thread_release_buffer(avctx, &s->refs[i]);
-        av_frame_free(&s->refs[i].f);
-    }
+static int init_frames(AVCodecContext *avctx)
+{
+    VP9Context *s = avctx->priv_data;
+    int i;
 
-    av_freep(&s->c_b);
-    av_freep(&s->above_partition_ctx);
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
+    for (i = 0; i < 3; i++) {
+        s->s.frames[i].tf.f = av_frame_alloc();
+        if (!s->s.frames[i].tf.f) {
+            vp9_decode_free(avctx);
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
+            return AVERROR(ENOMEM);
+        }
+    }
+    for (i = 0; i < 8; i++) {
+        s->s.refs[i].f = av_frame_alloc();
+        s->next_refs[i].f = av_frame_alloc();
+        if (!s->s.refs[i].f || !s->next_refs[i].f) {
+            vp9_decode_free(avctx);
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
+            return AVERROR(ENOMEM);
+        }
+    }
 
     return 0;
 }
@@ -1461,78 +1725,65 @@ static av_cold int vp9_decode_free(AVCodecContext *avctx)
 static av_cold int vp9_decode_init(AVCodecContext *avctx)
 {
     VP9Context *s = avctx->priv_data;
-    int i;
-
-    memset(s, 0, sizeof(*s));
 
     avctx->internal->allocate_progress = 1;
+    s->last_bpp = 0;
+    s->s.h.filter.sharpness = -1;
 
-    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-
-    ff_vp9dsp_init(&s->dsp);
-    ff_videodsp_init(&s->vdsp, 8);
-
-    s->frames[0].tf.f = av_frame_alloc();
-    s->frames[1].tf.f = av_frame_alloc();
-    if (!s->frames[0].tf.f || !s->frames[1].tf.f)
-        goto fail;
-
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++) {
-        s->refs[i].f = av_frame_alloc();
-        if (!s->refs[i].f)
-            goto fail;
-    }
-
-    s->filter.sharpness = -1;
+    return init_frames(avctx);
+}
 
-    return 0;
-fail:
-    vp9_decode_free(avctx);
-    return AVERROR(ENOMEM);
+#if HAVE_THREADS
+static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
+{
+    return init_frames(avctx);
 }
 
 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
 {
-    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
     int i, ret;
+    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
 
-    ret = update_size(dst, ssrc->alloc_width, ssrc->alloc_height);
-    if (ret < 0)
-        return ret;
-
-    for (i = 0; i < 2; i++) {
-        if (s->frames[i].tf.f->data[0])
-            vp9_frame_unref(dst, &s->frames[i]);
-        if (ssrc->frames[i].tf.f->data[0]) {
-            if ((ret = vp9_frame_ref(&s->frames[i], &ssrc->frames[i])) < 0)
+    for (i = 0; i < 3; i++) {
+        if (s->s.frames[i].tf.f->buf[0])
+            vp9_frame_unref(dst, &s->s.frames[i]);
+        if (ssrc->s.frames[i].tf.f->buf[0]) {
+            if ((ret = vp9_frame_ref(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
                 return ret;
         }
     }
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++) {
-        ff_thread_release_buffer(dst, &s->refs[i]);
-        if (ssrc->refs[i].f->buf[0]) {
-            ret = ff_thread_ref_frame(&s->refs[i], &ssrc->refs[i]);
-            if (ret < 0)
+    for (i = 0; i < 8; i++) {
+        if (s->s.refs[i].f->buf[0])
+            ff_thread_release_buffer(dst, &s->s.refs[i]);
+        if (ssrc->next_refs[i].f->buf[0]) {
+            if ((ret = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
                 return ret;
         }
     }
 
-    s->refreshrefmask = ssrc->refreshrefmask;
-    ret = update_refs(dst);
-    if (ret < 0)
-        return ret;
-
-    s->invisible       = ssrc->invisible;
-    s->keyframe        = ssrc->keyframe;
-    s->last_uses_2pass = ssrc->uses_2pass;
-
+    s->s.h.invisible = ssrc->s.h.invisible;
+    s->s.h.keyframe = ssrc->s.h.keyframe;
+    s->s.h.intraonly = ssrc->s.h.intraonly;
+    s->ss_v = ssrc->ss_v;
+    s->ss_h = ssrc->ss_h;
+    s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
+    s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
+    s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
+    s->bytesperpixel = ssrc->bytesperpixel;
+    s->gf_fmt = ssrc->gf_fmt;
+    s->w = ssrc->w;
+    s->h = ssrc->h;
+    s->s.h.bpp = ssrc->s.h.bpp;
+    s->bpp_index = ssrc->bpp_index;
+    s->pix_fmt = ssrc->pix_fmt;
     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
-    memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
-    memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
-           sizeof(s->segmentation.feat));
+    memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
+    memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat,
+           sizeof(s->s.h.segmentation.feat));
 
     return 0;
 }
+#endif
 
 AVCodec ff_vp9_decoder = {
     .name                  = "vp9",
@@ -1541,11 +1792,31 @@ AVCodec ff_vp9_decoder = {
     .id                    = AV_CODEC_ID_VP9,
     .priv_data_size        = sizeof(VP9Context),
     .init                  = vp9_decode_init,
+    .close                 = vp9_decode_free,
     .decode                = vp9_decode_frame,
+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal         = FF_CODEC_CAP_SLICE_THREAD_HAS_MF,
     .flush                 = vp9_decode_flush,
-    .close                 = vp9_decode_free,
-    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
-    .init_thread_copy      = vp9_decode_init,
-    .update_thread_context = vp9_decode_update_thread_context,
+    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
+    .profiles              = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
     .bsfs                  = "vp9_superframe_split",
+    .hw_configs            = (const AVCodecHWConfigInternal*[]) {
+#if CONFIG_VP9_DXVA2_HWACCEL
+                               HWACCEL_DXVA2(vp9),
+#endif
+#if CONFIG_VP9_D3D11VA_HWACCEL
+                               HWACCEL_D3D11VA(vp9),
+#endif
+#if CONFIG_VP9_D3D11VA2_HWACCEL
+                               HWACCEL_D3D11VA2(vp9),
+#endif
+#if CONFIG_VP9_NVDEC_HWACCEL
+                               HWACCEL_NVDEC(vp9),
+#endif
+#if CONFIG_VP9_VAAPI_HWACCEL
+                               HWACCEL_VAAPI(vp9),
+#endif
+                               NULL
+                           },
 };
diff --git a/libavcodec/vp9.h b/libavcodec/vp9.h
index 6fd5ba1..c8d07ad 100644
--- a/libavcodec/vp9.h
+++ b/libavcodec/vp9.h
@@ -4,36 +4,26 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_VP9_H
 #define AVCODEC_VP9_H
 
-#include <stddef.h>
-#include <stdint.h>
-
-#include "libavutil/buffer.h"
-#include "libavutil/internal.h"
-
-#include "avcodec.h"
-#include "thread.h"
-#include "vp56.h"
-
 enum TxfmMode {
     TX_4X4,
     TX_8X8,
@@ -76,375 +66,8 @@ enum FilterMode {
     FILTER_8TAP_REGULAR,
     FILTER_8TAP_SHARP,
     FILTER_BILINEAR,
-    FILTER_SWITCHABLE,
-};
-
-enum BlockPartition {
-    PARTITION_NONE,    // [ ] <-.
-    PARTITION_H,       // [-]   |
-    PARTITION_V,       // [|]   |
-    PARTITION_SPLIT,   // [+] --'
-};
-
-enum InterPredMode {
-    NEARESTMV = 10,
-    NEARMV    = 11,
-    ZEROMV    = 12,
-    NEWMV     = 13,
-};
-
-enum MVJoint {
-    MV_JOINT_ZERO,
-    MV_JOINT_H,
-    MV_JOINT_V,
-    MV_JOINT_HV,
-};
-
-typedef struct ProbContext {
-    uint8_t y_mode[4][9];
-    uint8_t uv_mode[10][9];
-    uint8_t filter[4][2];
-    uint8_t mv_mode[7][3];
-    uint8_t intra[4];
-    uint8_t comp[5];
-    uint8_t single_ref[5][2];
-    uint8_t comp_ref[5];
-    uint8_t tx32p[2][3];
-    uint8_t tx16p[2][2];
-    uint8_t tx8p[2];
-    uint8_t skip[3];
-    uint8_t mv_joint[3];
-    struct {
-        uint8_t sign;
-        uint8_t classes[10];
-        uint8_t class0;
-        uint8_t bits[10];
-        uint8_t class0_fp[2][3];
-        uint8_t fp[3];
-        uint8_t class0_hp;
-        uint8_t hp;
-    } mv_comp[2];
-    uint8_t partition[4][4][3];
-} ProbContext;
-
-typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
-                            const uint8_t *ref, ptrdiff_t ref_stride,
-                            int h, int mx, int my);
-
-typedef struct VP9DSPContext {
-    /*
-     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
-     * dimension 2: intra prediction modes
-     *
-     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
-     * stride is aligned by 16 pixels
-     * top[-1] is top/left; top[4,7] is top-right for 4x4
-     */
-    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
-    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
-    // also needs to fit in with what H.264/VP8/etc do
-    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
-                                                         ptrdiff_t stride,
-                                                         const uint8_t *left,
-                                                         const uint8_t *top);
-
-    /*
-     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
-     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
-     *
-     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
-     * stride is aligned by 16 pixels
-     * block is 16-byte aligned
-     * eob indicates the position (+1) of the last non-zero coefficient,
-     * in scan-order. This can be used to write faster versions, e.g. a
-     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
-     * etc.
-     */
-    // FIXME also write idct_add_block() versions for whole (inter) pred
-    // blocks, so we can do 2 4x4s at once
-    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
-                                                      ptrdiff_t stride,
-                                                      int16_t *block, int eob);
-
-    /*
-     * dimension 1: width of filter (0=4, 1=8, 2=16)
-     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * dst/stride are aligned by 8
-     */
-    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
-                                int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * The width of filter is assumed to be 16; dst/stride are aligned by 16
-     */
-    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
-                              int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
-     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * dst/stride are aligned by operation size
-     * this basically calls loop_filter[d1][d3][0](), followed by
-     * loop_filter[d2][d3][0]() on the next 8 pixels
-     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
-     * integer.
-     */
-    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
-    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
-                                      int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
-     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
-     * dimension 3: averaging type (0: put, 1: avg)
-     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
-     * dimension 5: y subpel interpolation (1: none, 1: 8tap/bilin)
-     *
-     * dst/stride are aligned by hsize
-     */
-    vp9_mc_func mc[5][4][2][2][2];
-} VP9DSPContext;
-
-enum CompPredMode {
-    PRED_SINGLEREF,
-    PRED_COMPREF,
-    PRED_SWITCHABLE,
+    N_FILTERS,
+    FILTER_SWITCHABLE = N_FILTERS,
 };
 
-typedef struct VP9MVRefPair {
-    VP56mv mv[2];
-    int8_t ref[2];
-} VP9MVRefPair;
-
-typedef struct VP9Filter {
-    uint8_t level[8 * 8];
-    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
-                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
-} VP9Filter;
-
-typedef struct VP9Frame {
-    ThreadFrame tf;
-
-    uint8_t *segmentation_map;
-    VP9MVRefPair *mv;
-
-    AVBufferRef *segmentation_map_buf;
-    AVBufferRef *mv_buf;
-} VP9Frame;
-
-enum BlockLevel {
-    BL_64X64,
-    BL_32X32,
-    BL_16X16,
-    BL_8X8,
-};
-
-enum BlockSize {
-    BS_64x64,
-    BS_64x32,
-    BS_32x64,
-    BS_32x32,
-    BS_32x16,
-    BS_16x32,
-    BS_16x16,
-    BS_16x8,
-    BS_8x16,
-    BS_8x8,
-    BS_8x4,
-    BS_4x8,
-    BS_4x4,
-    N_BS_SIZES,
-};
-
-typedef struct VP9Block {
-    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
-    enum FilterMode filter;
-    VP56mv mv[4 /* b_idx */][2 /* ref */];
-    enum BlockSize bs;
-    enum TxfmMode tx, uvtx;
-
-    int row, row7, col, col7;
-    uint8_t *dst[3];
-    ptrdiff_t y_stride, uv_stride;
-
-    enum BlockLevel bl;
-    enum BlockPartition bp;
-} VP9Block;
-
-typedef struct VP9Context {
-    VP9DSPContext dsp;
-    VideoDSPContext vdsp;
-    GetBitContext gb;
-    VP56RangeCoder c;
-    VP56RangeCoder *c_b;
-    unsigned c_b_size;
-    VP9Block *b;
-    VP9Block *b_base;
-
-    int alloc_width;
-    int alloc_height;
-
-    int pass;
-    int uses_2pass;
-    int last_uses_2pass;
-    int setup_finished;
-
-    // bitstream header
-    uint8_t profile;
-    uint8_t keyframe, last_keyframe;
-    uint8_t invisible;
-    uint8_t use_last_frame_mvs;
-    uint8_t errorres;
-    uint8_t colorspace;
-    uint8_t sub_x;
-    uint8_t sub_y;
-    uint8_t fullrange;
-    uint8_t intraonly;
-    uint8_t resetctx;
-    uint8_t refreshrefmask;
-    uint8_t highprecisionmvs;
-    enum FilterMode filtermode;
-    uint8_t allowcompinter;
-    uint8_t fixcompref;
-    uint8_t refreshctx;
-    uint8_t parallelmode;
-    uint8_t framectxid;
-    uint8_t refidx[3];
-    uint8_t signbias[3];
-    uint8_t varcompref[2];
-
-    ThreadFrame refs[8];
-
-#define CUR_FRAME 0
-#define LAST_FRAME 1
-    VP9Frame frames[2];
-
-    struct {
-        uint8_t level;
-        int8_t sharpness;
-        uint8_t lim_lut[64];
-        uint8_t mblim_lut[64];
-    } filter;
-    struct {
-        uint8_t enabled;
-        int8_t mode[2];
-        int8_t ref[4];
-    } lf_delta;
-    uint8_t yac_qi;
-    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
-    uint8_t lossless;
-    struct {
-        uint8_t enabled;
-        uint8_t temporal;
-        uint8_t absolute_vals;
-        uint8_t update_map;
-        #define MAX_SEGMENT 8
-        struct {
-            uint8_t q_enabled;
-            uint8_t lf_enabled;
-            uint8_t ref_enabled;
-            uint8_t skip_enabled;
-            uint8_t ref_val;
-            int16_t q_val;
-            int8_t lf_val;
-            int16_t qmul[2][2];
-            uint8_t lflvl[4][2];
-        } feat[MAX_SEGMENT];
-    } segmentation;
-    struct {
-        unsigned log2_tile_cols, log2_tile_rows;
-        unsigned tile_cols, tile_rows;
-        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
-    } tiling;
-    unsigned sb_cols, sb_rows, rows, cols;
-    struct {
-        ProbContext p;
-        uint8_t coef[4][2][2][6][6][3];
-    } prob_ctx[4];
-    struct {
-        ProbContext p;
-        uint8_t coef[4][2][2][6][6][11];
-        uint8_t seg[7];
-        uint8_t segpred[3];
-    } prob;
-    struct {
-        unsigned y_mode[4][10];
-        unsigned uv_mode[10][10];
-        unsigned filter[4][3];
-        unsigned mv_mode[7][4];
-        unsigned intra[4][2];
-        unsigned comp[5][2];
-        unsigned single_ref[5][2][2];
-        unsigned comp_ref[5][2];
-        unsigned tx32p[2][4];
-        unsigned tx16p[2][3];
-        unsigned tx8p[2][2];
-        unsigned skip[3][2];
-        unsigned mv_joint[4];
-        struct {
-            unsigned sign[2];
-            unsigned classes[11];
-            unsigned class0[2];
-            unsigned bits[10][2];
-            unsigned class0_fp[2][4];
-            unsigned fp[4];
-            unsigned class0_hp[2];
-            unsigned hp[2];
-        } mv_comp[2];
-        unsigned partition[4][4][4];
-        unsigned coef[4][2][2][6][6][3];
-        unsigned eob[4][2][2][6][6][2];
-    } counts;
-    enum TxfmMode txfmmode;
-    enum CompPredMode comppredmode;
-
-    // contextual (left/above) cache
-    uint8_t left_partition_ctx[8], *above_partition_ctx;
-    uint8_t left_mode_ctx[16], *above_mode_ctx;
-    // FIXME maybe merge some of the below in a flags field?
-    uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
-    uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
-    uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
-    uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
-    uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
-    uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
-    uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
-    uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
-    uint8_t left_filter_ctx[8], *above_filter_ctx;
-    VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
-
-    // whole-frame cache
-    uint8_t *intra_pred_data[3];
-    VP9Filter *lflvl;
-    // This requires 64 + 8 rows, with 80 bytes stride
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[72 * 80];
-
-    // block reconstruction intermediates
-    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
-    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
-    struct { int x, y; } min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32 * 32];
-} VP9Context;
-
-extern const int8_t ff_vp9_subpel_filters[3][15][8];
-
-void ff_vp9dsp_init(VP9DSPContext *dsp);
-
-void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp);
-void ff_vp9dsp_init_arm(VP9DSPContext *dsp);
-void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
-
-void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb);
-
-void ff_vp9_adapt_probs(VP9Context *s);
-
-int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
-                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
-                        enum BlockLevel bl, enum BlockPartition bp);
-
 #endif /* AVCODEC_VP9_H */
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
new file mode 100644
index 0000000..31e692f
--- /dev/null
+++ b/libavcodec/vp9_mc_template.c
@@ -0,0 +1,439 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define ROUNDED_DIV_MVx2(a, b) \
+    (VP56mv) { .x = ROUNDED_DIV(a.x + b.x, 2), .y = ROUNDED_DIV(a.y + b.y, 2) }
+#define ROUNDED_DIV_MVx4(a, b, c, d) \
+    (VP56mv) { .x = ROUNDED_DIV(a.x + b.x + c.x + d.x, 4), \
+               .y = ROUNDED_DIV(a.y + b.y + c.y + d.y, 4) }
+
+static void FN(inter_pred)(VP9TileData *td)
+{
+    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
+        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
+    };
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col;
+    ThreadFrame *tref1 = &s->s.refs[s->s.h.refidx[b->ref[0]]], *tref2;
+    AVFrame *ref1 = tref1->f, *ref2;
+    int w1 = ref1->width, h1 = ref1->height, w2, h2;
+    ptrdiff_t ls_y = td->y_stride, ls_uv = td->uv_stride;
+    int bytesperpixel = BYTES_PER_PIXEL;
+
+    if (b->comp) {
+        tref2 = &s->s.refs[s->s.h.refidx[b->ref[1]]];
+        ref2 = tref2->f;
+        w2 = ref2->width;
+        h2 = ref2->height;
+    }
+
+    // y inter pred
+    if (b->bs > BS_8x8) {
+        VP56mv uvmv;
+
+#if SCALED == 0
+        if (b->bs == BS_8x4) {
+            mc_luma_dir(td, mc[3][b->filter][0], td->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 8, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[3][b->filter][0],
+                        td->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],,,,, 8, 4, w1, h1, 0);
+            w1 = (w1 + s->ss_h) >> s->ss_h;
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 2, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            } else {
+                mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 3, col << (3 - s->ss_h),
+                              &b->mv[0][0],,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+                // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                // to get the motion vector for the bottom 4x4 block
+                // https://code.google.com/p/webm/issues/detail?id=993
+                if (s->ss_h == 0) {
+                    uvmv = b->mv[2][0];
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                }
+                mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][0],
+                              td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              (row << 3) + 4, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            }
+
+            if (b->comp) {
+                mc_luma_dir(td, mc[3][b->filter][1], td->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 8, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[3][b->filter][1],
+                            td->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1],,,,, 8, 4, w2, h2, 1);
+                w2 = (w2 + s->ss_h) >> s->ss_h;
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 2, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 3, col << (3 - s->ss_h),
+                                  &b->mv[0][1],,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                    // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                    // to get the motion vector for the bottom 4x4 block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    if (s->ss_h == 0) {
+                        uvmv = b->mv[2][1];
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    }
+                    mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][1],
+                                  td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  (row << 3) + 4, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                }
+            }
+        } else if (b->bs == BS_4x8) {
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 4, 8, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],,,,, 4, 8, w1, h1, 0);
+            h1 = (h1 + s->ss_v) >> s->ss_v;
+            if (s->ss_h) {
+                w1 = (w1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                mc_chroma_dir(td, mc[4][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 2,
+                              &uvmv,,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            } else {
+                mc_chroma_dir(td, mc[4][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 3,
+                              &b->mv[0][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+                mc_chroma_dir(td, mc[4][b->filter][0],
+                              td->dst[1] + 4 * bytesperpixel,
+                              td->dst[2] + 4 * bytesperpixel, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), (col << 3) + 4,
+                              &b->mv[1][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            }
+
+            if (b->comp) {
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 4, 8, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1],,,,, 4, 8, w2, h2, 1);
+                h2 = (h2 + s->ss_v) >> s->ss_v;
+                if (s->ss_h) {
+                    w2 = (w2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                    mc_chroma_dir(td, mc[4][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 2,
+                                  &uvmv,,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(td, mc[4][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 3,
+                                  &b->mv[0][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                    mc_chroma_dir(td, mc[4][b->filter][1],
+                                  td->dst[1] + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), (col << 3) + 4,
+                                  &b->mv[1][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                }
+            }
+        } else
+#endif
+        {
+#if SCALED == 0
+            av_assert2(b->bs == BS_4x4);
+#endif
+
+            // FIXME if two horizontally adjacent blocks have the same MV,
+            // do a w8 instead of a w4 call
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],
+                        0, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],
+                        4, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0],
+                        td->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],
+                        0, 4, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0],
+                        td->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0],
+                        4, 4, 8, 8, 4, 4, w1, h1, 0);
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx4(b->mv[0][0], b->mv[1][0],
+                                            b->mv[2][0], b->mv[3][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 2,
+                                  &uvmv, 0, 0, 4, 4, 4, 4, w1, h1, 0);
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 3,
+                                  &uvmv, 0, 0, 8, 4, 4, 4, w1, h1, 0);
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, (col << 3) + 4,
+                                  &uvmv, 4, 0, 8, 4, 4, 4, w1, h1, 0);
+                }
+            } else {
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 2,
+                                  &uvmv, 0, 0, 4, 8, 4, 4, w1, h1, 0);
+                    // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                    // bottom block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[2][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 2,
+                                  &uvmv, 0, 4, 4, 8, 4, 4, w1, h1, 0);
+                } else {
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 3,
+                                  &b->mv[0][0], 0, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, (col << 3) + 4,
+                                  &b->mv[1][0], 4, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 3,
+                                  &b->mv[2][0], 0, 4, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, (col << 3) + 4,
+                                  &b->mv[3][0], 4, 4, 8, 8, 4, 4, w1, h1, 0);
+                }
+            }
+
+            if (b->comp) {
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1],
+                            td->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1],
+                            td->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx4(b->mv[0][1], b->mv[1][1],
+                                                b->mv[2][1], b->mv[3][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 2,
+                                      &uvmv, 0, 0, 4, 4, 4, 4, w2, h2, 1);
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 3,
+                                      &uvmv, 0, 0, 8, 4, 4, 4, w2, h2, 1);
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * bytesperpixel,
+                                      td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, (col << 3) + 4,
+                                      &uvmv, 4, 0, 8, 4, 4, 4, w2, h2, 1);
+                    }
+                } else {
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 2,
+                                      &uvmv, 0, 0, 4, 8, 4, 4, w2, h2, 1);
+                        // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                        // bottom block
+                        // https://code.google.com/p/webm/issues/detail?id=993
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[2][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 2,
+                                      &uvmv, 0, 4, 4, 8, 4, 4, w2, h2, 1);
+                    } else {
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 3,
+                                      &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * bytesperpixel,
+                                      td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, (col << 3) + 4,
+                                      &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 3,
+                                      &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                      td->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, (col << 3) + 4,
+                                      &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                    }
+                }
+            }
+        }
+    } else {
+        int bwl = bwlog_tab[0][b->bs];
+        int bw = ff_vp9_bwh_tab[0][b->bs][0] * 4;
+        int bh = ff_vp9_bwh_tab[0][b->bs][1] * 4;
+        int uvbw = ff_vp9_bwh_tab[s->ss_h][b->bs][0] * 4;
+        int uvbh = ff_vp9_bwh_tab[s->ss_v][b->bs][1] * 4;
+
+        mc_luma_dir(td, mc[bwl][b->filter][0], td->dst[0], ls_y,
+                    ref1->data[0], ref1->linesize[0], tref1,
+                    row << 3, col << 3, &b->mv[0][0], 0, 0, bw, bh, bw, bh, w1, h1, 0);
+        w1 = (w1 + s->ss_h) >> s->ss_h;
+        h1 = (h1 + s->ss_v) >> s->ss_v;
+        mc_chroma_dir(td, mc[bwl + s->ss_h][b->filter][0],
+                      td->dst[1], td->dst[2], ls_uv,
+                      ref1->data[1], ref1->linesize[1],
+                      ref1->data[2], ref1->linesize[2], tref1,
+                      row << (3 - s->ss_v), col << (3 - s->ss_h),
+                      &b->mv[0][0], 0, 0, uvbw, uvbh, uvbw, uvbh, w1, h1, 0);
+
+        if (b->comp) {
+            mc_luma_dir(td, mc[bwl][b->filter][1], td->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0], tref2,
+                        row << 3, col << 3, &b->mv[0][1], 0, 0, bw, bh, bw, bh, w2, h2, 1);
+            w2 = (w2 + s->ss_h) >> s->ss_h;
+            h2 = (h2 + s->ss_v) >> s->ss_v;
+            mc_chroma_dir(td, mc[bwl + s->ss_h][b->filter][1],
+                          td->dst[1], td->dst[2], ls_uv,
+                          ref2->data[1], ref2->linesize[1],
+                          ref2->data[2], ref2->linesize[2], tref2,
+                          row << (3 - s->ss_v), col << (3 - s->ss_h),
+                          &b->mv[0][1], 0, 0, uvbw, uvbh, uvbw, uvbh, w2, h2, 1);
+        }
+    }
+}
diff --git a/libavcodec/vp9_metadata_bsf.c b/libavcodec/vp9_metadata_bsf.c
new file mode 100644
index 0000000..b79f08a
--- /dev/null
+++ b/libavcodec/vp9_metadata_bsf.c
@@ -0,0 +1,164 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+
+#include "bsf.h"
+#include "cbs.h"
+#include "cbs_vp9.h"
+
+typedef struct VP9MetadataContext {
+    const AVClass *class;
+
+    CodedBitstreamContext *cbc;
+    CodedBitstreamFragment fragment;
+
+    int color_space;
+    int color_range;
+
+    int color_range_rgb_warned;
+} VP9MetadataContext;
+
+
+static int vp9_metadata_filter(AVBSFContext *bsf, AVPacket *out)
+{
+    VP9MetadataContext *ctx = bsf->priv_data;
+    AVPacket *in = NULL;
+    CodedBitstreamFragment *frag = &ctx->fragment;
+    int err, i;
+
+    err = ff_bsf_get_packet(bsf, &in);
+    if (err < 0)
+        return err;
+
+    err = ff_cbs_read_packet(ctx->cbc, frag, in);
+    if (err < 0) {
+        av_log(bsf, AV_LOG_ERROR, "Failed to read packet.\n");
+        goto fail;
+    }
+
+    for (i = 0; i < frag->nb_units; i++) {
+        VP9RawFrame *frame = frag->units[i].content;
+        VP9RawFrameHeader *header = &frame->header;
+
+        if (ctx->color_space >= 0) {
+            header->color_space = ctx->color_space;
+        }
+        if (ctx->color_range >= 0) {
+            if (ctx->color_range == 0 &&
+                header->color_space == VP9_CS_RGB &&
+                !ctx->color_range_rgb_warned) {
+                av_log(bsf, AV_LOG_WARNING, "Warning: color_range cannot "
+                       "be set to limited in RGB streams.\n");
+                ctx->color_range_rgb_warned = 1;
+            } else {
+                header->color_range = ctx->color_range;
+            }
+        }
+    }
+
+    err = ff_cbs_write_packet(ctx->cbc, out, frag);
+    if (err < 0) {
+        av_log(bsf, AV_LOG_ERROR, "Failed to write packet.\n");
+        goto fail;
+    }
+
+    err = av_packet_copy_props(out, in);
+    if (err < 0)
+        goto fail;
+
+    err = 0;
+fail:
+    ff_cbs_fragment_reset(ctx->cbc, frag);
+
+    if (err < 0)
+        av_packet_unref(out);
+    av_packet_free(&in);
+
+    return err;
+}
+
+static int vp9_metadata_init(AVBSFContext *bsf)
+{
+    VP9MetadataContext *ctx = bsf->priv_data;
+
+    return ff_cbs_init(&ctx->cbc, AV_CODEC_ID_VP9, bsf);
+}
+
+static void vp9_metadata_close(AVBSFContext *bsf)
+{
+    VP9MetadataContext *ctx = bsf->priv_data;
+
+    ff_cbs_fragment_free(ctx->cbc, &ctx->fragment);
+    ff_cbs_close(&ctx->cbc);
+}
+
+#define OFFSET(x) offsetof(VP9MetadataContext, x)
+#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_BSF_PARAM)
+static const AVOption vp9_metadata_options[] = {
+    { "color_space", "Set colour space (section 7.2.2)",
+        OFFSET(color_space), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, VP9_CS_RGB, FLAGS, "cs" },
+    { "unknown",  "Unknown/unspecified",  0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_UNKNOWN   }, .flags = FLAGS, .unit = "cs" },
+    { "bt601",    "ITU-R BT.601-7",       0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_BT_601    }, .flags = FLAGS, .unit = "cs" },
+    { "bt709",    "ITU-R BT.709-6",       0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_BT_709    }, .flags = FLAGS, .unit = "cs" },
+    { "smpte170", "SMPTE-170",            0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_SMPTE_170 }, .flags = FLAGS, .unit = "cs" },
+    { "smpte240", "SMPTE-240",            0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_SMPTE_240 }, .flags = FLAGS, .unit = "cs" },
+    { "bt2020",   "ITU-R BT.2020-2",      0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_BT_2020   }, .flags = FLAGS, .unit = "cs" },
+    { "rgb",      "sRGB / IEC 61966-2-1", 0, AV_OPT_TYPE_CONST,
+        { .i64 = VP9_CS_RGB       }, .flags = FLAGS, .unit = "cs" },
+
+    { "color_range", "Set colour range (section 7.2.2)",
+        OFFSET(color_range), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, 1, FLAGS, "cr" },
+    { "tv", "TV (limited) range", 0, AV_OPT_TYPE_CONST,
+        { .i64 = 0 }, .flags = FLAGS, .unit = "cr" },
+    { "pc", "PC (full) range",    0, AV_OPT_TYPE_CONST,
+        { .i64 = 1 }, .flags = FLAGS, .unit = "cr" },
+
+    { NULL }
+};
+
+static const AVClass vp9_metadata_class = {
+    .class_name = "vp9_metadata_bsf",
+    .item_name  = av_default_item_name,
+    .option     = vp9_metadata_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const enum AVCodecID vp9_metadata_codec_ids[] = {
+    AV_CODEC_ID_VP9, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_vp9_metadata_bsf = {
+    .name           = "vp9_metadata",
+    .priv_data_size = sizeof(VP9MetadataContext),
+    .priv_class     = &vp9_metadata_class,
+    .init           = &vp9_metadata_init,
+    .close          = &vp9_metadata_close,
+    .filter         = &vp9_metadata_filter,
+    .codec_ids      = vp9_metadata_codec_ids,
+};
diff --git a/libavcodec/vp9_parser.c b/libavcodec/vp9_parser.c
new file mode 100644
index 0000000..c957a75
--- /dev/null
+++ b/libavcodec/vp9_parser.c
@@ -0,0 +1,70 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavcodec/get_bits.h"
+#include "parser.h"
+
+static int parse(AVCodecParserContext *ctx,
+                 AVCodecContext *avctx,
+                 const uint8_t **out_data, int *out_size,
+                 const uint8_t *data, int size)
+{
+    GetBitContext gb;
+    int res, profile, keyframe;
+
+    *out_data = data;
+    *out_size = size;
+
+    if (!size || (res = init_get_bits8(&gb, data, size)) < 0)
+        return size; // parsers can't return errors
+    get_bits(&gb, 2); // frame marker
+    profile  = get_bits1(&gb);
+    profile |= get_bits1(&gb) << 1;
+    if (profile == 3) profile += get_bits1(&gb);
+    if (profile > 3)
+        return size;
+
+    avctx->profile = profile;
+
+    if (get_bits1(&gb)) {
+        keyframe = 0;
+    } else {
+        keyframe  = !get_bits1(&gb);
+    }
+
+    if (!keyframe) {
+        ctx->pict_type = AV_PICTURE_TYPE_P;
+        ctx->key_frame = 0;
+    } else {
+        ctx->pict_type = AV_PICTURE_TYPE_I;
+        ctx->key_frame = 1;
+    }
+
+    return size;
+}
+
+AVCodecParser ff_vp9_parser = {
+    .codec_ids      = { AV_CODEC_ID_VP9 },
+    .parser_parse   = parse,
+};
diff --git a/libavcodec/vp9_raw_reorder_bsf.c b/libavcodec/vp9_raw_reorder_bsf.c
index f5a5e49..01f3dad 100644
--- a/libavcodec/vp9_raw_reorder_bsf.c
+++ b/libavcodec/vp9_raw_reorder_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,8 +22,8 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 
-#include "bitstream.h"
 #include "bsf.h"
+#include "get_bits.h"
 #include "put_bits.h"
 
 #define FRAME_SLOTS 8
@@ -73,7 +73,7 @@ static void vp9_raw_reorder_clear_slot(VP9RawReorderContext *ctx, int s)
 
 static int vp9_raw_reorder_frame_parse(AVBSFContext *bsf, VP9RawReorderFrame *frame)
 {
-    BitstreamContext bc;
+    GetBitContext bc;
     int err;
 
     unsigned int frame_marker;
@@ -81,22 +81,22 @@ static int vp9_raw_reorder_frame_parse(AVBSFContext *bsf, VP9RawReorderFrame *fr
     unsigned int error_resilient_mode;
     unsigned int frame_sync_code;
 
-    err = bitstream_init8(&bc, frame->packet->data, frame->packet->size);
+    err = init_get_bits(&bc, frame->packet->data, 8 * frame->packet->size);
     if (err)
         return err;
 
-    frame_marker = bitstream_read(&bc, 2);
+    frame_marker = get_bits(&bc, 2);
     if (frame_marker != 2) {
         av_log(bsf, AV_LOG_ERROR, "Invalid frame marker: %u.\n",
                frame_marker);
         return AVERROR_INVALIDDATA;
     }
 
-    profile_low_bit  = bitstream_read_bit(&bc);
-    profile_high_bit = bitstream_read_bit(&bc);
+    profile_low_bit  = get_bits1(&bc);
+    profile_high_bit = get_bits1(&bc);
     frame->profile = (profile_high_bit << 1) | profile_low_bit;
     if (frame->profile == 3) {
-        reserved_zero = bitstream_read_bit(&bc);
+        reserved_zero = get_bits1(&bc);
         if (reserved_zero != 0) {
             av_log(bsf, AV_LOG_ERROR, "Profile reserved_zero bit set: "
                    "unsupported profile or invalid bitstream.\n");
@@ -104,18 +104,18 @@ static int vp9_raw_reorder_frame_parse(AVBSFContext *bsf, VP9RawReorderFrame *fr
         }
     }
 
-    frame->show_existing_frame = bitstream_read_bit(&bc);
+    frame->show_existing_frame = get_bits1(&bc);
     if (frame->show_existing_frame) {
-        frame->frame_to_show = bitstream_read(&bc, 3);
+        frame->frame_to_show = get_bits(&bc, 3);
         return 0;
     }
 
-    frame->frame_type = bitstream_read_bit(&bc);
-    frame->show_frame = bitstream_read_bit(&bc);
-    error_resilient_mode = bitstream_read_bit(&bc);
+    frame->frame_type = get_bits1(&bc);
+    frame->show_frame = get_bits1(&bc);
+    error_resilient_mode = get_bits1(&bc);
 
     if (frame->frame_type == 0) {
-        frame_sync_code = bitstream_read(&bc, 24);
+        frame_sync_code = get_bits(&bc, 24);
         if (frame_sync_code != 0x498342) {
             av_log(bsf, AV_LOG_ERROR, "Invalid frame sync code: %06x.\n",
                    frame_sync_code);
@@ -126,15 +126,15 @@ static int vp9_raw_reorder_frame_parse(AVBSFContext *bsf, VP9RawReorderFrame *fr
         unsigned int intra_only;
 
         if (frame->show_frame == 0)
-            intra_only = bitstream_read_bit(&bc);
+            intra_only = get_bits1(&bc);
         else
             intra_only = 0;
         if (error_resilient_mode == 0) {
             // reset_frame_context
-            bitstream_skip(&bc, 2);
+            skip_bits(&bc, 2);
         }
         if (intra_only) {
-            frame_sync_code = bitstream_read(&bc, 24);
+            frame_sync_code = get_bits(&bc, 24);
             if (frame_sync_code != 0x498342) {
                 av_log(bsf, AV_LOG_ERROR, "Invalid frame sync code: "
                        "%06x.\n", frame_sync_code);
@@ -144,24 +144,24 @@ static int vp9_raw_reorder_frame_parse(AVBSFContext *bsf, VP9RawReorderFrame *fr
                 unsigned int color_space;
                 if (frame->profile >= 2) {
                     // ten_or_twelve_bit
-                    bitstream_skip(&bc, 1);
+                    skip_bits(&bc, 1);
                 }
-                color_space = bitstream_read(&bc, 3);
+                color_space = get_bits(&bc, 3);
                 if (color_space != 7 /* CS_RGB */) {
                     // color_range
-                    bitstream_skip(&bc, 1);
+                    skip_bits(&bc, 1);
                     if (frame->profile == 1 || frame->profile == 3) {
                         // subsampling
-                        bitstream_skip(&bc, 3);
+                        skip_bits(&bc, 3);
                     }
                 } else {
                     if (frame->profile == 1 || frame->profile == 3)
-                        bitstream_skip(&bc, 1);
+                        skip_bits(&bc, 1);
                 }
             }
-            frame->refresh_frame_flags = bitstream_read(&bc, 8);
+            frame->refresh_frame_flags = get_bits(&bc, 8);
         } else {
-            frame->refresh_frame_flags = bitstream_read(&bc, 8);
+            frame->refresh_frame_flags = get_bits(&bc, 8);
         }
     }
 
diff --git a/libavcodec/vp9_superframe_bsf.c b/libavcodec/vp9_superframe_bsf.c
index 04b158f..ea67507 100644
--- a/libavcodec/vp9_superframe_bsf.c
+++ b/libavcodec/vp9_superframe_bsf.c
@@ -1,28 +1,28 @@
 /*
- * VP9 invisible (alt-ref) frame to superframe merge bitstream filter
+ * Vp9 invisible (alt-ref) frame to superframe merge bitstream filter
  * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/avassert.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bsf.h"
+#include "get_bits.h"
 
 #define MAX_CACHE 8
 typedef struct VP9BSFContext {
@@ -67,7 +67,8 @@ static int merge_superframe(AVPacket * const *in, int n_in, AVPacket *out)
         ptr += in[n]->size;
     }
 
-#define wloop(mag, wr) do { \
+#define wloop(mag, wr) \
+    do { \
         for (n = 0; n < n_in; n++) { \
             wr; \
             ptr += mag + 1; \
@@ -98,7 +99,7 @@ static int merge_superframe(AVPacket * const *in, int n_in, AVPacket *out)
 
 static int vp9_superframe_filter(AVBSFContext *ctx, AVPacket *out)
 {
-    BitstreamContext bc;
+    GetBitContext gb;
     VP9BSFContext *s = ctx->priv_data;
     AVPacket *in;
     int res, invisible, profile, marker, uses_superframe_syntax = 0, n;
@@ -115,21 +116,19 @@ static int vp9_superframe_filter(AVBSFContext *ctx, AVPacket *out)
         uses_superframe_syntax = in->size >= idx_sz && in->data[in->size - idx_sz] == marker;
     }
 
-    res = bitstream_init8(&bc, in->data, in->size);
-    if (res < 0)
+    if ((res = init_get_bits8(&gb, in->data, in->size)) < 0)
         goto done;
 
-    bitstream_read(&bc, 2); // frame marker
-    profile  = bitstream_read(&bc, 1);
-    profile |= bitstream_read(&bc, 1) << 1;
-    if (profile == 3)
-        profile += bitstream_read(&bc, 1);
+    get_bits(&gb, 2); // frame marker
+    profile  = get_bits1(&gb);
+    profile |= get_bits1(&gb) << 1;
+    if (profile == 3) profile += get_bits1(&gb);
 
-    if (bitstream_read(&bc, 1)) {
+    if (get_bits1(&gb)) {
         invisible = 0;
     } else {
-        bitstream_read(&bc, 1); // keyframe
-        invisible = !bitstream_read(&bc, 1);
+        get_bits1(&gb); // keyframe
+        invisible = !get_bits1(&gb);
     }
 
     if (uses_superframe_syntax && s->n_cache > 0) {
@@ -148,9 +147,8 @@ static int vp9_superframe_filter(AVBSFContext *ctx, AVPacket *out)
         goto done;
     }
 
-    res = av_packet_ref(s->cache[s->n_cache++], in);
-    if (res < 0)
-        goto done;
+    av_packet_move_ref(s->cache[s->n_cache++], in);
+
     if (invisible) {
         res = AVERROR(EAGAIN);
         goto done;
diff --git a/libavcodec/vp9_superframe_split_bsf.c b/libavcodec/vp9_superframe_split_bsf.c
index 0d31123..13e85c3 100644
--- a/libavcodec/vp9_superframe_split_bsf.c
+++ b/libavcodec/vp9_superframe_split_bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,8 @@
 
 #include "avcodec.h"
 #include "bsf.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 
 typedef struct VP9SFSplitContext {
     AVPacket *buffer_pkt;
@@ -43,10 +43,10 @@ static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
     VP9SFSplitContext *s = ctx->priv_data;
     AVPacket *in;
     int i, j, ret, marker;
-    int is_superframe = !!s->buffer_pkt;
+    int is_superframe = !!s->buffer_pkt->data;
 
-    if (!s->buffer_pkt) {
-        ret = ff_bsf_get_packet(ctx, &s->buffer_pkt);
+    if (!s->buffer_pkt->data) {
+        ret = ff_bsf_get_packet_ref(ctx, s->buffer_pkt);
         if (ret < 0)
             return ret;
         in = s->buffer_pkt;
@@ -59,7 +59,7 @@ static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
 
             if (in->size >= idx_size && in->data[in->size - idx_size] == marker) {
                 GetByteContext bc;
-                int total_size = 0;
+                int64_t total_size = 0;
 
                 bytestream2_init(&bc, in->data + in->size + 1 - idx_size,
                                  nb_frames * length_size);
@@ -70,7 +70,7 @@ static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
                         frame_size |= bytestream2_get_byte(&bc) << (j * 8);
 
                     total_size += frame_size;
-                    if (total_size > in->size - idx_size) {
+                    if (frame_size < 0 || total_size > in->size - idx_size) {
                         av_log(ctx, AV_LOG_ERROR,
                                "Invalid frame size in a superframe: %d\n", frame_size);
                         ret = AVERROR(EINVAL);
@@ -87,7 +87,7 @@ static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
     }
 
     if (is_superframe) {
-        BitstreamContext bc;
+        GetBitContext gb;
         int profile, invisible = 0;
 
         ret = av_packet_ref(out, s->buffer_pkt);
@@ -101,20 +101,20 @@ static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
         s->next_frame++;
 
         if (s->next_frame >= s->nb_frames)
-            av_packet_free(&s->buffer_pkt);
+            av_packet_unref(s->buffer_pkt);
 
-        ret = bitstream_init8(&bc, out->data, out->size);
+        ret = init_get_bits8(&gb, out->data, out->size);
         if (ret < 0)
             goto fail;
 
-        bitstream_read(&bc, 2); // frame_marker
-        profile  = bitstream_read(&bc, 1);
-        profile |= bitstream_read(&bc, 1) << 1;
+        get_bits(&gb, 2); // frame_marker
+        profile  = get_bits1(&gb);
+        profile |= get_bits1(&gb) << 1;
         if (profile == 3)
-            bitstream_read(&bc, 1);
-        if (!bitstream_read(&bc, 1)) {
-            bitstream_read(&bc, 1);
-            invisible = !bitstream_read(&bc, 1);
+            get_bits1(&gb);
+        if (!get_bits1(&gb)) {
+            get_bits1(&gb);
+            invisible = !get_bits1(&gb);
         }
 
         if (invisible)
@@ -122,19 +122,31 @@ static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
 
     } else {
         av_packet_move_ref(out, s->buffer_pkt);
-        av_packet_free(&s->buffer_pkt);
     }
 
     return 0;
 fail:
-    av_packet_free(&s->buffer_pkt);
+    if (ret < 0)
+        av_packet_unref(out);
+    av_packet_unref(s->buffer_pkt);
     return ret;
 }
 
+static int vp9_superframe_split_init(AVBSFContext *ctx)
+{
+    VP9SFSplitContext *s = ctx->priv_data;
+
+    s->buffer_pkt = av_packet_alloc();
+    if (!s->buffer_pkt)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
 static void vp9_superframe_split_flush(AVBSFContext *ctx)
 {
     VP9SFSplitContext *s = ctx->priv_data;
-    av_packet_free(&s->buffer_pkt);
+    av_packet_unref(s->buffer_pkt);
 }
 
 static void vp9_superframe_split_uninit(AVBSFContext *ctx)
@@ -146,6 +158,7 @@ static void vp9_superframe_split_uninit(AVBSFContext *ctx)
 const AVBitStreamFilter ff_vp9_superframe_split_bsf = {
     .name = "vp9_superframe_split",
     .priv_data_size = sizeof(VP9SFSplitContext),
+    .init           = vp9_superframe_split_init,
     .flush          = vp9_superframe_split_flush,
     .close          = vp9_superframe_split_uninit,
     .filter         = vp9_superframe_split_filter,
diff --git a/libavcodec/vp9block.c b/libavcodec/vp9block.c
index 35c9c27..1c3f7a7 100644
--- a/libavcodec/vp9block.c
+++ b/libavcodec/vp9block.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,19 +29,55 @@
 #include "vp56.h"
 #include "vp9.h"
 #include "vp9data.h"
+#include "vp9dec.h"
 
-static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
-    {
-        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
-        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
-    },  {
-        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
-        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
+                                       ptrdiff_t stride, int v)
+{
+    switch (w) {
+    case 1:
+        do {
+            *ptr = v;
+            ptr += stride;
+        } while (--h);
+        break;
+    case 2: {
+        int v16 = v * 0x0101;
+        do {
+            AV_WN16A(ptr, v16);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 4: {
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr, v32);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 8: {
+#if HAVE_FAST_64BIT
+        uint64_t v64 = v * 0x0101010101010101ULL;
+        do {
+            AV_WN64A(ptr, v64);
+            ptr += stride;
+        } while (--h);
+#else
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr,     v32);
+            AV_WN32A(ptr + 4, v32);
+            ptr += stride;
+        } while (--h);
+#endif
+        break;
     }
-};
+    }
+}
 
-// differential forward probability updates
-static void decode_mode(VP9Context *s, VP9Block *const b)
+static void decode_mode(VP9TileData *td)
 {
     static const uint8_t left_ctx[N_BS_SIZES] = {
         0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
@@ -53,133 +89,133 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
         TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
         TX_16X16, TX_8X8,   TX_8X8,   TX_8X8,   TX_4X4,   TX_4X4,  TX_4X4
     };
-    int row = b->row, col = b->col, row7 = b->row7;
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col, row7 = td->row7;
     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
-    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
-    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]);
-    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
-    int y;
+    int bw4 = ff_vp9_bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
+    int bh4 = ff_vp9_bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
+    int have_a = row > 0, have_l = col > td->tile_col_start;
+    int vref, filter_id;
 
-    if (!s->segmentation.enabled) {
+    if (!s->s.h.segmentation.enabled) {
         b->seg_id = 0;
-    } else if (s->keyframe || s->intraonly) {
-        b->seg_id = s->segmentation.update_map ?
-                    vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree, s->prob.seg) : 0;
-    } else if (!s->segmentation.update_map ||
-               (s->segmentation.temporal &&
-                vp56_rac_get_prob_branchy(&s->c,
-                                          s->prob.segpred[s->above_segpred_ctx[col] +
-                                                          s->left_segpred_ctx[row7]]))) {
-        if (!s->errorres) {
-            uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
-            int pred = MAX_SEGMENT - 1;
-            int x;
-
-            if (!s->last_uses_2pass)
-                ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
-
-            for (y = 0; y < h4; y++)
+    } else if (s->s.h.keyframe || s->s.h.intraonly) {
+        b->seg_id = !s->s.h.segmentation.update_map ? 0 :
+                    vp8_rac_get_tree(td->c, ff_vp9_segmentation_tree, s->s.h.segmentation.prob);
+    } else if (!s->s.h.segmentation.update_map ||
+               (s->s.h.segmentation.temporal &&
+                vp56_rac_get_prob_branchy(td->c,
+                    s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
+                                    td->left_segpred_ctx[row7]]))) {
+        if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
+            int pred = 8, x;
+            uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
+
+            if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
+                ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
+            for (y = 0; y < h4; y++) {
+                int idx_base = (y + row) * 8 * s->sb_cols + col;
                 for (x = 0; x < w4; x++)
-                    pred = FFMIN(pred,
-                                 refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
+                    pred = FFMIN(pred, refsegmap[idx_base + x]);
+            }
+            av_assert1(pred < 8);
             b->seg_id = pred;
         } else {
             b->seg_id = 0;
         }
 
         memset(&s->above_segpred_ctx[col], 1, w4);
-        memset(&s->left_segpred_ctx[row7], 1, h4);
+        memset(&td->left_segpred_ctx[row7], 1, h4);
     } else {
-        b->seg_id = vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree,
-                                     s->prob.seg);
+        b->seg_id = vp8_rac_get_tree(td->c, ff_vp9_segmentation_tree,
+                                     s->s.h.segmentation.prob);
 
         memset(&s->above_segpred_ctx[col], 0, w4);
-        memset(&s->left_segpred_ctx[row7], 0, h4);
+        memset(&td->left_segpred_ctx[row7], 0, h4);
     }
-    if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
-        uint8_t *segmap = s->frames[CUR_FRAME].segmentation_map;
-
-        for (y = 0; y < h4; y++)
-            memset(&segmap[(y + row) * 8 * s->sb_cols + col],
-                   b->seg_id, w4);
+    if (s->s.h.segmentation.enabled &&
+        (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
+        setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
+                  bw4, bh4, 8 * s->sb_cols, b->seg_id);
     }
 
-    b->skip = s->segmentation.enabled &&
-              s->segmentation.feat[b->seg_id].skip_enabled;
+    b->skip = s->s.h.segmentation.enabled &&
+        s->s.h.segmentation.feat[b->seg_id].skip_enabled;
     if (!b->skip) {
-        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
-        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
-        s->counts.skip[c][b->skip]++;
+        int c = td->left_skip_ctx[row7] + s->above_skip_ctx[col];
+        b->skip = vp56_rac_get_prob(td->c, s->prob.p.skip[c]);
+        td->counts.skip[c][b->skip]++;
     }
 
-    if (s->keyframe || s->intraonly) {
+    if (s->s.h.keyframe || s->s.h.intraonly) {
         b->intra = 1;
-    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
-        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
+    } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
+        b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
     } else {
         int c, bit;
 
         if (have_a && have_l) {
-            c  = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
+            c = s->above_intra_ctx[col] + td->left_intra_ctx[row7];
             c += (c == 2);
         } else {
             c = have_a ? 2 * s->above_intra_ctx[col] :
-                have_l ? 2 * s->left_intra_ctx[row7] : 0;
+                have_l ? 2 * td->left_intra_ctx[row7] : 0;
         }
-        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
-        s->counts.intra[c][bit]++;
+        bit = vp56_rac_get_prob(td->c, s->prob.p.intra[c]);
+        td->counts.intra[c][bit]++;
         b->intra = !bit;
     }
 
-    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
+    if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
         int c;
         if (have_a) {
             if (have_l) {
                 c = (s->above_skip_ctx[col] ? max_tx :
                      s->above_txfm_ctx[col]) +
-                    (s->left_skip_ctx[row7] ? max_tx :
-                     s->left_txfm_ctx[row7]) > max_tx;
+                    (td->left_skip_ctx[row7] ? max_tx :
+                     td->left_txfm_ctx[row7]) > max_tx;
             } else {
                 c = s->above_skip_ctx[col] ? 1 :
                     (s->above_txfm_ctx[col] * 2 > max_tx);
             }
         } else if (have_l) {
-            c = s->left_skip_ctx[row7] ? 1 :
-                (s->left_txfm_ctx[row7] * 2 > max_tx);
+            c = td->left_skip_ctx[row7] ? 1 :
+                (td->left_txfm_ctx[row7] * 2 > max_tx);
         } else {
             c = 1;
         }
         switch (max_tx) {
         case TX_32X32:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
+            b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][0]);
             if (b->tx) {
-                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
+                b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][1]);
                 if (b->tx == 2)
-                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
+                    b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][2]);
             }
-            s->counts.tx32p[c][b->tx]++;
+            td->counts.tx32p[c][b->tx]++;
             break;
         case TX_16X16:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
+            b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx16p[c][0]);
             if (b->tx)
-                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
-            s->counts.tx16p[c][b->tx]++;
+                b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx16p[c][1]);
+            td->counts.tx16p[c][b->tx]++;
             break;
         case TX_8X8:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
-            s->counts.tx8p[c][b->tx]++;
+            b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx8p[c]);
+            td->counts.tx8p[c][b->tx]++;
             break;
         case TX_4X4:
             b->tx = TX_4X4;
             break;
         }
     } else {
-        b->tx = FFMIN(max_tx, s->txfmmode);
+        b->tx = FFMIN(max_tx, s->s.h.txfmmode);
     }
 
-    if (s->keyframe || s->intraonly) {
+    if (s->s.h.keyframe || s->s.h.intraonly) {
         uint8_t *a = &s->above_mode_ctx[col * 2];
-        uint8_t *l = &s->left_mode_ctx[(row7) << 1];
+        uint8_t *l = &td->left_mode_ctx[(row7) << 1];
 
         b->comp = 0;
         if (b->bs > BS_8x8) {
@@ -187,10 +223,10 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
             // necessary, they're just there to make the code slightly
             // simpler for now
             b->mode[0] =
-            a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+            a[0]       = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                           ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
             if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                               ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
                 l[0]       =
                 a[1]       = b->mode[1];
@@ -201,10 +237,10 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
             }
             if (b->bs != BS_4x8) {
                 b->mode[2] =
-                a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                a[0]       = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                               ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
                 if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                    b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                                   ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
                     l[1]       =
                     a[1]       = b->mode[3];
@@ -220,38 +256,38 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                 b->mode[3] = b->mode[1];
             }
         } else {
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                           ff_vp9_default_kf_ymode_probs[*a][*l]);
             b->mode[3] =
             b->mode[2] =
             b->mode[1] = b->mode[0];
             // FIXME this can probably be optimized
-            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
-            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
+            memset(a, b->mode[0], ff_vp9_bwh_tab[0][b->bs][0]);
+            memset(l, b->mode[0], ff_vp9_bwh_tab[0][b->bs][1]);
         }
-        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+        b->uvmode = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                      ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
     } else if (b->intra) {
         b->comp = 0;
         if (b->bs > BS_8x8) {
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                           s->prob.p.y_mode[0]);
-            s->counts.y_mode[0][b->mode[0]]++;
+            td->counts.y_mode[0][b->mode[0]]++;
             if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                               s->prob.p.y_mode[0]);
-                s->counts.y_mode[0][b->mode[1]]++;
+                td->counts.y_mode[0][b->mode[1]]++;
             } else {
                 b->mode[1] = b->mode[0];
             }
             if (b->bs != BS_4x8) {
-                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                b->mode[2] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                               s->prob.p.y_mode[0]);
-                s->counts.y_mode[0][b->mode[2]]++;
+                td->counts.y_mode[0][b->mode[2]]++;
                 if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                    b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                                   s->prob.p.y_mode[0]);
-                    s->counts.y_mode[0][b->mode[3]]++;
+                    td->counts.y_mode[0][b->mode[3]]++;
                 } else {
                     b->mode[3] = b->mode[2];
                 }
@@ -265,16 +301,16 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
             };
             int sz = size_group[b->bs];
 
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                           s->prob.p.y_mode[sz]);
             b->mode[1] =
             b->mode[2] =
             b->mode[3] = b->mode[0];
-            s->counts.y_mode[sz][b->mode[3]]++;
+            td->counts.y_mode[sz][b->mode[3]]++;
         }
-        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+        b->uvmode = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                      s->prob.p.uv_mode[b->mode[3]]);
-        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
+        td->counts.uv_mode[b->mode[3]][b->uvmode]++;
     } else {
         static const uint8_t inter_mode_ctx_lut[14][14] = {
             { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
@@ -293,91 +329,91 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
             { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
         };
 
-        if (s->segmentation.feat[b->seg_id].ref_enabled) {
-            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
-            b->comp   = 0;
-            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
+            av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
+            b->comp = 0;
+            b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
         } else {
             // read comp_pred flag
-            if (s->comppredmode != PRED_SWITCHABLE) {
-                b->comp = s->comppredmode == PRED_COMPREF;
+            if (s->s.h.comppredmode != PRED_SWITCHABLE) {
+                b->comp = s->s.h.comppredmode == PRED_COMPREF;
             } else {
                 int c;
 
                 // FIXME add intra as ref=0xff (or -1) to make these easier?
                 if (have_a) {
                     if (have_l) {
-                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
+                        if (s->above_comp_ctx[col] && td->left_comp_ctx[row7]) {
                             c = 4;
                         } else if (s->above_comp_ctx[col]) {
-                            c = 2 + (s->left_intra_ctx[row7] ||
-                                     s->left_ref_ctx[row7] == s->fixcompref);
-                        } else if (s->left_comp_ctx[row7]) {
+                            c = 2 + (td->left_intra_ctx[row7] ||
+                                     td->left_ref_ctx[row7] == s->s.h.fixcompref);
+                        } else if (td->left_comp_ctx[row7]) {
                             c = 2 + (s->above_intra_ctx[col] ||
-                                     s->above_ref_ctx[col] == s->fixcompref);
+                                     s->above_ref_ctx[col] == s->s.h.fixcompref);
                         } else {
                             c = (!s->above_intra_ctx[col] &&
-                                 s->above_ref_ctx[col] == s->fixcompref) ^
-                                (!s->left_intra_ctx[row7] &&
-                                 s->left_ref_ctx[row & 7] == s->fixcompref);
+                                 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
+                                (!td->left_intra_ctx[row7] &&
+                                 td->left_ref_ctx[row & 7] == s->s.h.fixcompref);
                         }
                     } else {
                         c = s->above_comp_ctx[col] ? 3 :
-                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
+                        (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
                     }
                 } else if (have_l) {
-                    c = s->left_comp_ctx[row7] ? 3 :
-                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
+                    c = td->left_comp_ctx[row7] ? 3 :
+                    (!td->left_intra_ctx[row7] && td->left_ref_ctx[row7] == s->s.h.fixcompref);
                 } else {
                     c = 1;
                 }
-                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
-                s->counts.comp[c][b->comp]++;
+                b->comp = vp56_rac_get_prob(td->c, s->prob.p.comp[c]);
+                td->counts.comp[c][b->comp]++;
             }
 
             // read actual references
             // FIXME probably cache a few variables here to prevent repetitive
             // memory accesses below
             if (b->comp) { /* two references */
-                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
+                int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
 
-                b->ref[fix_idx] = s->fixcompref;
+                b->ref[fix_idx] = s->s.h.fixcompref;
                 // FIXME can this codeblob be replaced by some sort of LUT?
                 if (have_a) {
                     if (have_l) {
                         if (s->above_intra_ctx[col]) {
-                            if (s->left_intra_ctx[row7]) {
+                            if (td->left_intra_ctx[row7]) {
                                 c = 2;
                             } else {
-                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                                c = 1 + 2 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                             }
-                        } else if (s->left_intra_ctx[row7]) {
-                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        } else if (td->left_intra_ctx[row7]) {
+                            c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                         } else {
-                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
+                            int refl = td->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
 
-                            if (refl == refa && refa == s->varcompref[1]) {
+                            if (refl == refa && refa == s->s.h.varcompref[1]) {
                                 c = 0;
-                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
-                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
-                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
+                            } else if (!td->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
+                                if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
+                                    (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
                                     c = 4;
                                 } else {
                                     c = (refa == refl) ? 3 : 1;
                                 }
-                            } else if (!s->left_comp_ctx[row7]) {
-                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
+                            } else if (!td->left_comp_ctx[row7]) {
+                                if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
                                     c = 1;
                                 } else {
-                                    c = (refl == s->varcompref[1] &&
-                                         refa != s->varcompref[1]) ? 2 : 4;
+                                    c = (refl == s->s.h.varcompref[1] &&
+                                         refa != s->s.h.varcompref[1]) ? 2 : 4;
                                 }
                             } else if (!s->above_comp_ctx[col]) {
-                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
+                                if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
                                     c = 1;
                                 } else {
-                                    c = (refa == s->varcompref[1] &&
-                                         refl != s->varcompref[1]) ? 2 : 4;
+                                    c = (refa == s->s.h.varcompref[1] &&
+                                         refl != s->s.h.varcompref[1]) ? 2 : 4;
                                 }
                             } else {
                                 c = (refl == refa) ? 4 : 2;
@@ -387,75 +423,75 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                         if (s->above_intra_ctx[col]) {
                             c = 2;
                         } else if (s->above_comp_ctx[col]) {
-                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                            c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                         } else {
-                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                            c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                         }
                     }
                 } else if (have_l) {
-                    if (s->left_intra_ctx[row7]) {
+                    if (td->left_intra_ctx[row7]) {
                         c = 2;
-                    } else if (s->left_comp_ctx[row7]) {
-                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                    } else if (td->left_comp_ctx[row7]) {
+                        c = 4 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                     } else {
-                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                        c = 3 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                     }
                 } else {
                     c = 2;
                 }
-                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
-                b->ref[var_idx] = s->varcompref[bit];
-                s->counts.comp_ref[c][bit]++;
-            } else { /* single reference */
+                bit = vp56_rac_get_prob(td->c, s->prob.p.comp_ref[c]);
+                b->ref[var_idx] = s->s.h.varcompref[bit];
+                td->counts.comp_ref[c][bit]++;
+            } else /* single reference */ {
                 int bit, c;
 
                 if (have_a && !s->above_intra_ctx[col]) {
-                    if (have_l && !s->left_intra_ctx[row7]) {
-                        if (s->left_comp_ctx[row7]) {
+                    if (have_l && !td->left_intra_ctx[row7]) {
+                        if (td->left_comp_ctx[row7]) {
                             if (s->above_comp_ctx[col]) {
-                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
+                                c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7] ||
                                          !s->above_ref_ctx[col]);
                             } else {
                                 c = (3 * !s->above_ref_ctx[col]) +
-                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
+                                    (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
                             }
                         } else if (s->above_comp_ctx[col]) {
-                            c = (3 * !s->left_ref_ctx[row7]) +
-                                (!s->fixcompref || !s->above_ref_ctx[col]);
+                            c = (3 * !td->left_ref_ctx[row7]) +
+                                (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
                         } else {
-                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
+                            c = 2 * !td->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
                         }
                     } else if (s->above_intra_ctx[col]) {
                         c = 2;
                     } else if (s->above_comp_ctx[col]) {
-                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
+                        c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
                     } else {
                         c = 4 * (!s->above_ref_ctx[col]);
                     }
-                } else if (have_l && !s->left_intra_ctx[row7]) {
-                    if (s->left_intra_ctx[row7]) {
+                } else if (have_l && !td->left_intra_ctx[row7]) {
+                    if (td->left_intra_ctx[row7]) {
                         c = 2;
-                    } else if (s->left_comp_ctx[row7]) {
-                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
+                    } else if (td->left_comp_ctx[row7]) {
+                        c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
                     } else {
-                        c = 4 * (!s->left_ref_ctx[row7]);
+                        c = 4 * (!td->left_ref_ctx[row7]);
                     }
                 } else {
                     c = 2;
                 }
-                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
-                s->counts.single_ref[c][0][bit]++;
+                bit = vp56_rac_get_prob(td->c, s->prob.p.single_ref[c][0]);
+                td->counts.single_ref[c][0][bit]++;
                 if (!bit) {
                     b->ref[0] = 0;
                 } else {
                     // FIXME can this codeblob be replaced by some sort of LUT?
                     if (have_a) {
                         if (have_l) {
-                            if (s->left_intra_ctx[row7]) {
+                            if (td->left_intra_ctx[row7]) {
                                 if (s->above_intra_ctx[col]) {
                                     c = 2;
                                 } else if (s->above_comp_ctx[col]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                                  s->above_ref_ctx[col] == 1);
                                 } else if (!s->above_ref_ctx[col]) {
                                     c = 3;
@@ -463,49 +499,49 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                                     c = 4 * (s->above_ref_ctx[col] == 1);
                                 }
                             } else if (s->above_intra_ctx[col]) {
-                                if (s->left_intra_ctx[row7]) {
+                                if (td->left_intra_ctx[row7]) {
                                     c = 2;
-                                } else if (s->left_comp_ctx[row7]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                } else if (!s->left_ref_ctx[row7]) {
+                                } else if (td->left_comp_ctx[row7]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 td->left_ref_ctx[row7] == 1);
+                                } else if (!td->left_ref_ctx[row7]) {
                                     c = 3;
                                 } else {
-                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                    c = 4 * (td->left_ref_ctx[row7] == 1);
                                 }
                             } else if (s->above_comp_ctx[col]) {
-                                if (s->left_comp_ctx[row7]) {
-                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
-                                        c = 3 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
+                                if (td->left_comp_ctx[row7]) {
+                                    if (td->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
+                                        c = 3 * (s->s.h.fixcompref == 1 ||
+                                                 td->left_ref_ctx[row7] == 1);
                                     } else {
                                         c = 2;
                                     }
-                                } else if (!s->left_ref_ctx[row7]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                } else if (!td->left_ref_ctx[row7]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                                  s->above_ref_ctx[col] == 1);
                                 } else {
-                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
-                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                                    c = 3 * (td->left_ref_ctx[row7] == 1) +
+                                    (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
                                 }
-                            } else if (s->left_comp_ctx[row7]) {
+                            } else if (td->left_comp_ctx[row7]) {
                                 if (!s->above_ref_ctx[col]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 td->left_ref_ctx[row7] == 1);
                                 } else {
                                     c = 3 * (s->above_ref_ctx[col] == 1) +
-                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                                    (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
                                 }
                             } else if (!s->above_ref_ctx[col]) {
-                                if (!s->left_ref_ctx[row7]) {
+                                if (!td->left_ref_ctx[row7]) {
                                     c = 3;
                                 } else {
-                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                    c = 4 * (td->left_ref_ctx[row7] == 1);
                                 }
-                            } else if (!s->left_ref_ctx[row7]) {
+                            } else if (!td->left_ref_ctx[row7]) {
                                 c = 4 * (s->above_ref_ctx[col] == 1);
                             } else {
-                                c = 2 * (s->left_ref_ctx[row7] == 1) +
+                                c = 2 * (td->left_ref_ctx[row7] == 1) +
                                     2 * (s->above_ref_ctx[col] == 1);
                             }
                         } else {
@@ -513,32 +549,32 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                                 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
                                 c = 2;
                             } else if (s->above_comp_ctx[col]) {
-                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                                c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
                             } else {
                                 c = 4 * (s->above_ref_ctx[col] == 1);
                             }
                         }
                     } else if (have_l) {
-                        if (s->left_intra_ctx[row7] ||
-                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
+                        if (td->left_intra_ctx[row7] ||
+                            (!td->left_comp_ctx[row7] && !td->left_ref_ctx[row7])) {
                             c = 2;
-                        } else if (s->left_comp_ctx[row7]) {
-                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                        } else if (td->left_comp_ctx[row7]) {
+                            c = 3 * (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
                         } else {
-                            c = 4 * (s->left_ref_ctx[row7] == 1);
+                            c = 4 * (td->left_ref_ctx[row7] == 1);
                         }
                     } else {
                         c = 2;
                     }
-                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
-                    s->counts.single_ref[c][1][bit]++;
+                    bit = vp56_rac_get_prob(td->c, s->prob.p.single_ref[c][1]);
+                    td->counts.single_ref[c][1][bit]++;
                     b->ref[0] = 1 + bit;
                 }
             }
         }
 
         if (b->bs <= BS_8x8) {
-            if (s->segmentation.feat[b->seg_id].skip_enabled) {
+            if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
                 b->mode[0] =
                 b->mode[1] =
                 b->mode[2] =
@@ -551,53 +587,54 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                 // FIXME this needs to use the LUT tables from find_ref_mvs
                 // because not all are -1,0/0,-1
                 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
-                                          [s->left_mode_ctx[row7 + off[b->bs]]];
+                                          [td->left_mode_ctx[row7 + off[b->bs]]];
 
-                b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                               s->prob.p.mv_mode[c]);
                 b->mode[1] =
                 b->mode[2] =
                 b->mode[3] = b->mode[0];
-                s->counts.mv_mode[c][b->mode[0] - 10]++;
+                td->counts.mv_mode[c][b->mode[0] - 10]++;
             }
         }
 
-        if (s->filtermode == FILTER_SWITCHABLE) {
+        if (s->s.h.filtermode == FILTER_SWITCHABLE) {
             int c;
 
             if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
-                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
-                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
-                        s->left_filter_ctx[row7] : 3;
+                if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
+                    c = s->above_filter_ctx[col] == td->left_filter_ctx[row7] ?
+                        td->left_filter_ctx[row7] : 3;
                 } else {
                     c = s->above_filter_ctx[col];
                 }
-            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
-                c = s->left_filter_ctx[row7];
+            } else if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
+                c = td->left_filter_ctx[row7];
             } else {
                 c = 3;
             }
 
-            b->filter = vp8_rac_get_tree(&s->c, ff_vp9_filter_tree,
+            filter_id = vp8_rac_get_tree(td->c, ff_vp9_filter_tree,
                                          s->prob.p.filter[c]);
-            s->counts.filter[c][b->filter]++;
+            td->counts.filter[c][filter_id]++;
+            b->filter = ff_vp9_filter_lut[filter_id];
         } else {
-            b->filter = s->filtermode;
+            b->filter = s->s.h.filtermode;
         }
 
         if (b->bs > BS_8x8) {
-            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
+            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][td->left_mode_ctx[row7]];
 
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                           s->prob.p.mv_mode[c]);
-            s->counts.mv_mode[c][b->mode[0] - 10]++;
-            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], 0);
+            td->counts.mv_mode[c][b->mode[0] - 10]++;
+            ff_vp9_fill_mv(td, b->mv[0], b->mode[0], 0);
 
             if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                               s->prob.p.mv_mode[c]);
-                s->counts.mv_mode[c][b->mode[1] - 10]++;
-                ff_vp9_fill_mv(s, b->mv[1], b->mode[1], 1);
+                td->counts.mv_mode[c][b->mode[1] - 10]++;
+                ff_vp9_fill_mv(td, b->mv[1], b->mode[1], 1);
             } else {
                 b->mode[1] = b->mode[0];
                 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
@@ -605,16 +642,16 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
             }
 
             if (b->bs != BS_4x8) {
-                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                b->mode[2] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                               s->prob.p.mv_mode[c]);
-                s->counts.mv_mode[c][b->mode[2] - 10]++;
-                ff_vp9_fill_mv(s, b->mv[2], b->mode[2], 2);
+                td->counts.mv_mode[c][b->mode[2] - 10]++;
+                ff_vp9_fill_mv(td, b->mv[2], b->mode[2], 2);
 
                 if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                    b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                                   s->prob.p.mv_mode[c]);
-                    s->counts.mv_mode[c][b->mode[3] - 10]++;
-                    ff_vp9_fill_mv(s, b->mv[3], b->mode[3], 3);
+                    td->counts.mv_mode[c][b->mode[3] - 10]++;
+                    ff_vp9_fill_mv(td, b->mv[3], b->mode[3], 3);
                 } else {
                     b->mode[3] = b->mode[2];
                     AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
@@ -629,7 +666,7 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
             }
         } else {
-            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], -1);
+            ff_vp9_fill_mv(td, b->mv[0], b->mode[0], -1);
             AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
             AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
             AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
@@ -637,34 +674,87 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
             AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
             AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
         }
+
+        vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
     }
 
-    // FIXME this can probably be optimized
-    memset(&s->above_skip_ctx[col], b->skip, w4);
-    memset(&s->left_skip_ctx[row7], b->skip, h4);
-    memset(&s->above_txfm_ctx[col], b->tx, w4);
-    memset(&s->left_txfm_ctx[row7], b->tx, h4);
-    memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
-    memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
-    if (!s->keyframe && !s->intraonly) {
-        memset(&s->above_intra_ctx[col], b->intra, w4);
-        memset(&s->left_intra_ctx[row7], b->intra, h4);
-        memset(&s->above_comp_ctx[col], b->comp, w4);
-        memset(&s->left_comp_ctx[row7], b->comp, h4);
-        memset(&s->above_mode_ctx[col], b->mode[3], w4);
-        memset(&s->left_mode_ctx[row7], b->mode[3], h4);
-        if (s->filtermode == FILTER_SWITCHABLE && !b->intra) {
-            memset(&s->above_filter_ctx[col], b->filter, w4);
-            memset(&s->left_filter_ctx[row7], b->filter, h4);
-            b->filter = ff_vp9_filter_lut[b->filter];
-        }
+#if HAVE_FAST_64BIT
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                                    break; \
+    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
+    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
+    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
+    case 16: { \
+        uint64_t v64 = val * 0x0101010101010101ULL; \
+        AV_WN64A(              &var,     v64); \
+        AV_WN64A(&((uint8_t *) &var)[8], v64); \
+        break; \
+    } \
+    }
+#else
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                         break; \
+    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
+    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
+    case 8: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,     v32); \
+        AV_WN32A(&((uint8_t *) &var)[4], v32); \
+        break; \
+    } \
+    case 16: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,      v32); \
+        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[12], v32); \
+        break; \
+    } \
+    }
+#endif
+
+    switch (ff_vp9_bwh_tab[1][b->bs][0]) {
+#define SET_CTXS(perf, dir, off, n) \
+    do { \
+        SPLAT_CTX(perf->dir##_skip_ctx[off],      b->skip,          n); \
+        SPLAT_CTX(perf->dir##_txfm_ctx[off],      b->tx,            n); \
+        SPLAT_CTX(perf->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
+        if (!s->s.h.keyframe && !s->s.h.intraonly) { \
+            SPLAT_CTX(perf->dir##_intra_ctx[off], b->intra,   n); \
+            SPLAT_CTX(perf->dir##_comp_ctx[off],  b->comp,    n); \
+            SPLAT_CTX(perf->dir##_mode_ctx[off],  b->mode[3], n); \
+            if (!b->intra) { \
+                SPLAT_CTX(perf->dir##_ref_ctx[off], vref, n); \
+                if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
+                    SPLAT_CTX(perf->dir##_filter_ctx[off], filter_id, n); \
+                } \
+            } \
+        } \
+    } while (0)
+    case 1: SET_CTXS(s, above, col, 1); break;
+    case 2: SET_CTXS(s, above, col, 2); break;
+    case 4: SET_CTXS(s, above, col, 4); break;
+    case 8: SET_CTXS(s, above, col, 8); break;
+    }
+    switch (ff_vp9_bwh_tab[1][b->bs][1]) {
+    case 1: SET_CTXS(td, left, row7, 1); break;
+    case 2: SET_CTXS(td, left, row7, 2); break;
+    case 4: SET_CTXS(td, left, row7, 4); break;
+    case 8: SET_CTXS(td, left, row7, 8); break;
+    }
+#undef SPLAT_CTX
+#undef SET_CTXS
+
+    if (!s->s.h.keyframe && !s->s.h.intraonly) {
         if (b->bs > BS_8x8) {
             int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
 
-            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
-            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
-            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
-            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
+            AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
+            AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
+            AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][0], mv0);
+            AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][1], mv1);
             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
             AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
             AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
@@ -677,24 +767,16 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
                 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
             }
             for (n = 0; n < h4 * 2; n++) {
-                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
-                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
+                AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][0], mv0);
+                AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][1], mv1);
             }
         }
-
-        if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
-                         // as a direct check in above branches
-            int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
-
-            memset(&s->above_ref_ctx[col], vref, w4);
-            memset(&s->left_ref_ctx[row7], vref, h4);
-        }
     }
 
     // FIXME kinda ugly
     for (y = 0; y < h4; y++) {
         int x, o = (row + y) * s->sb_cols * 8 + col;
-        VP9MVRefPair *mv = &s->frames[CUR_FRAME].mv[o];
+        VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
 
         if (b->intra) {
             for (x = 0; x < w4; x++) {
@@ -718,16 +800,16 @@ static void decode_mode(VP9Context *s, VP9Block *const b)
     }
 }
 
-// FIXME remove tx argument, and merge cnt/eob arguments?
-static int decode_block_coeffs(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
-                               enum TxfmMode tx, unsigned (*cnt)[6][3],
-                               unsigned (*eob)[6][2], uint8_t(*p)[6][11],
-                               int nnz, const int16_t *scan,
-                               const int16_t(*nb)[2],
-                               const int16_t *band_counts, const int16_t *qmul)
+// FIXME merge cnt/eob arguments?
+static av_always_inline int
+decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
+                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
+                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
+                        const int16_t *band_counts, int16_t *qmul)
 {
     int i = 0, band = 0, band_left = band_counts[band];
-    uint8_t *tp = p[0][nnz];
+    const uint8_t *tp = p[0][nnz];
     uint8_t cache[1024];
 
     do {
@@ -757,10 +839,6 @@ skip_eob:
             val       = 1;
             cache[rc] = 1;
         } else {
-            // fill in p[3-10] (model fill) - only once per frame for each pos
-            if (!tp[3])
-                memcpy(&tp[3], ff_vp9_model_pareto8[tp[2]], 8);
-
             cnt[band][nnz][2]++;
             if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
                 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
@@ -781,14 +859,14 @@ skip_eob:
                 cache[rc] = 5;
                 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
                     if (!vp56_rac_get_prob_branchy(c, tp[9])) {
-                        val  = (vp56_rac_get_prob(c, 173) << 2) + 11;
-                        val += (vp56_rac_get_prob(c, 148) << 1);
-                        val +=  vp56_rac_get_prob(c, 140);
+                        val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
+                        val +=      (vp56_rac_get_prob(c, 148) << 1);
+                        val +=       vp56_rac_get_prob(c, 140);
                     } else {
-                        val  = (vp56_rac_get_prob(c, 176) << 3) + 19;
-                        val += (vp56_rac_get_prob(c, 155) << 2);
-                        val += (vp56_rac_get_prob(c, 140) << 1);
-                        val +=  vp56_rac_get_prob(c, 135);
+                        val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
+                        val +=      (vp56_rac_get_prob(c, 155) << 2);
+                        val +=      (vp56_rac_get_prob(c, 140) << 1);
+                        val +=       vp56_rac_get_prob(c, 135);
                     }
                 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                     val  = (vp56_rac_get_prob(c, 180) << 4) + 35;
@@ -797,7 +875,16 @@ skip_eob:
                     val += (vp56_rac_get_prob(c, 134) << 1);
                     val +=  vp56_rac_get_prob(c, 130);
                 } else {
-                    val  = (vp56_rac_get_prob(c, 254) << 13) + 67;
+                    val = 67;
+                    if (!is8bitsperpixel) {
+                        if (bpp == 12) {
+                            val += vp56_rac_get_prob(c, 255) << 17;
+                            val += vp56_rac_get_prob(c, 255) << 16;
+                        }
+                        val +=  (vp56_rac_get_prob(c, 255) << 15);
+                        val +=  (vp56_rac_get_prob(c, 255) << 14);
+                    }
+                    val += (vp56_rac_get_prob(c, 254) << 13);
                     val += (vp56_rac_get_prob(c, 254) << 12);
                     val += (vp56_rac_get_prob(c, 254) << 11);
                     val += (vp56_rac_get_prob(c, 252) << 10);
@@ -814,643 +901,251 @@ skip_eob:
                 }
             }
         }
+#define STORE_COEF(c, i, v) do { \
+    if (is8bitsperpixel) { \
+        c[i] = v; \
+    } else { \
+        AV_WN32A(&c[i * 2], v); \
+    } \
+} while (0)
         if (!--band_left)
             band_left = band_counts[++band];
-        if (tx == TX_32X32) // FIXME slow
-            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
+        if (is_tx32x32)
+            STORE_COEF(coef, rc, (int)((vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]) / 2);
         else
-            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
+            STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]);
         nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
-        tp  = p[band][nnz];
+        tp = p[band][nnz];
     } while (++i < n_coeffs);
 
     return i;
 }
 
-static int decode_coeffs(AVCodecContext *avctx)
+static int decode_coeffs_b_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                const int16_t (*nb)[2], const int16_t *band_counts,
+                                int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                  const int16_t (*nb)[2], const int16_t *band_counts,
+                                  int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                 const int16_t (*nb)[2], const int16_t *band_counts,
+                                 int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 0, td->s->s.h.bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                   const int16_t (*nb)[2], const int16_t *band_counts,
+                                   int16_t *qmul)
 {
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
-    int row = b->row, col = b->col;
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 0, td->s->s.h.bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static av_always_inline int decode_coeffs(VP9TileData *td, int is8bitsperpixel)
+{
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col;
     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
-    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
-    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
-    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
+    unsigned (*c)[6][3] = td->counts.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*e)[6][2] = td->counts.eob[b->tx][0 /* y */][!b->intra];
+    int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1;
     int end_x = FFMIN(2 * (s->cols - col), w4);
     int end_y = FFMIN(2 * (s->rows - row), h4);
-    int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
-    int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), ret;
-    int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
-    int tx = 4 * s->lossless + b->tx;
+    int n, pl, x, y, ret;
+    int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
+    int tx = 4 * s->s.h.lossless + b->tx;
     const int16_t * const *yscans = ff_vp9_scans[tx];
     const int16_t (* const * ynbs)[2] = ff_vp9_scans_nb[tx];
     const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
     const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
     uint8_t *a = &s->above_y_nnz_ctx[col * 2];
-    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
+    uint8_t *l = &td->left_y_nnz_ctx[(row & 7) << 1];
     static const int16_t band_counts[4][8] = {
-        { 1, 2, 3, 4,  3,   16 - 13, 0 },
-        { 1, 2, 3, 4, 11,   64 - 21, 0 },
-        { 1, 2, 3, 4, 11,  256 - 21, 0 },
-        { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+        { 1, 2, 3, 4,  3,   16 - 13 },
+        { 1, 2, 3, 4, 11,   64 - 21 },
+        { 1, 2, 3, 4, 11,  256 - 21 },
+        { 1, 2, 3, 4, 11, 1024 - 21 },
     };
-    const int16_t *y_band_counts  = band_counts[b->tx];
+    const int16_t *y_band_counts = band_counts[b->tx];
     const int16_t *uv_band_counts = band_counts[b->uvtx];
-
-    /* y tokens */
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            for (x = 1; x < step1d; x++)
-                l[y] |= l[y + x];
-        for (x = 0; x < end_x; x += step1d)
-            for (y = 1; y < step1d; y++)
-                a[x] |= a[x + y];
-    }
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        for (x = 0; x < end_x; x += step1d, n += step) {
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
-                                                                b->bs > BS_8x8 ?
-                                                                n : 0]];
-            int nnz = a[x] + l[y];
-            if ((ret = decode_block_coeffs(&s->c, s->block + 16 * n, 16 * step,
-                                           b->tx, c, e, p, nnz, yscans[txtp],
-                                           ynbs[txtp], y_band_counts,
-                                           qmul[0])) < 0)
-                return ret;
-            a[x] = l[y] = !!ret;
-            if (b->tx > TX_8X8)
-                AV_WN16A(&s->eob[n], ret);
-            else
-                s->eob[n] = ret;
-        }
-    }
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
-        for (x = 0; x < end_x; x += step1d)
-            memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
+    int bytesperpixel = is8bitsperpixel ? 1 : 2;
+    int total_coeff = 0;
+
+#define MERGE(la, end, step, rd) \
+    for (n = 0; n < end; n += step) \
+        la[n] = !!rd(&la[n])
+#define MERGE_CTX(step, rd) \
+    do { \
+        MERGE(l, end_y, step, rd); \
+        MERGE(a, end_x, step, rd); \
+    } while (0)
+
+#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[mode_index]]; \
+            ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (td, td->block + 16 * n * bytesperpixel, 16 * step * step, \
+                                     c, e, p, a[x] + l[y], yscans[txtp], \
+                                     ynbs[txtp], y_band_counts, qmul[0]); \
+            a[x] = l[y] = !!ret; \
+            total_coeff |= !!ret; \
+            if (step >= 4) { \
+                AV_WN16A(&td->eob[n], ret); \
+            } else { \
+                td->eob[n] = ret; \
+            } \
+        } \
     }
 
-    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
-    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
-    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
-    w4    >>= 1;
-    h4    >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
-    for (pl = 0; pl < 2; pl++) {
-        a = &s->above_uv_nnz_ctx[pl][col];
-        l = &s->left_uv_nnz_ctx[pl][row & 7];
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                for (x = 1; x < uvstep1d; x++)
-                    l[y] |= l[y + x];
-            for (x = 0; x < end_x; x += uvstep1d)
-                for (y = 1; y < uvstep1d; y++)
-                    a[x] |= a[x + y];
-        }
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
-                int nnz = a[x] + l[y];
-                if ((ret = decode_block_coeffs(&s->c, s->uvblock[pl] + 16 * n,
-                                               16 * uvstep, b->uvtx, c, e, p,
-                                               nnz, uvscan, uvnb,
-                                               uv_band_counts, qmul[1])) < 0)
-                    return ret;
-                a[x] = l[y] = !!ret;
-                if (b->uvtx > TX_8X8)
-                    AV_WN16A(&s->uveob[pl][n], ret);
-                else
-                    s->uveob[pl][n] = ret;
-            }
-        }
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
-            for (x = 0; x < end_x; x += uvstep1d)
-                memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
-        }
+#define SPLAT(la, end, step, cond) \
+    if (step == 2) { \
+        for (n = 1; n < end; n += step) \
+            la[n] = la[n - 1]; \
+    } else if (step == 4) { \
+        if (cond) { \
+            for (n = 0; n < end; n += step) \
+                AV_WN32A(&la[n], la[n] * 0x01010101); \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
+        } \
+    } else /* step == 8 */ { \
+        if (cond) { \
+            if (HAVE_FAST_64BIT) { \
+                for (n = 0; n < end; n += step) \
+                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
+            } else { \
+                for (n = 0; n < end; n += step) { \
+                    uint32_t v32 = la[n] * 0x01010101; \
+                    AV_WN32A(&la[n],     v32); \
+                    AV_WN32A(&la[n + 4], v32); \
+                } \
+            } \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
+        } \
     }
+#define SPLAT_CTX(step) \
+    do { \
+        SPLAT(a, end_x, step, end_x == w4); \
+        SPLAT(l, end_y, step, end_y == h4); \
+    } while (0)
 
-    return 0;
-}
-
-static av_always_inline int check_intra_mode(VP9Context *s, int mode,
-                                             uint8_t **a,
-                                             uint8_t *dst_edge,
-                                             ptrdiff_t stride_edge,
-                                             uint8_t *dst_inner,
-                                             ptrdiff_t stride_inner,
-                                             uint8_t *l, int col, int x, int w,
-                                             int row, int y, enum TxfmMode tx,
-                                             int p)
-{
-    int have_top   = row > 0 || y > 0;
-    int have_left  = col > s->tiling.tile_col_start || x > 0;
-    int have_right = x < w - 1;
-    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
-        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
-                                   { DC_127_PRED,          VERT_PRED            } },
-        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
-                                   { HOR_PRED,             HOR_PRED             } },
-        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
-                                   { LEFT_DC_PRED,         DC_PRED              } },
-        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
-                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
-        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
-                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
-        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
-                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
-        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
-                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
-        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
-                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
-        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
-                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
-        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
-                                   { HOR_PRED,             TM_VP8_PRED          } },
-    };
-    static const struct {
-        uint8_t needs_left:1;
-        uint8_t needs_top:1;
-        uint8_t needs_topleft:1;
-        uint8_t needs_topright:1;
-    } edges[N_INTRA_PRED_MODES] = {
-        [VERT_PRED]            = { .needs_top  = 1 },
-        [HOR_PRED]             = { .needs_left = 1 },
-        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
-        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
-        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
-        [HOR_UP_PRED]          = { .needs_left = 1 },
-        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [LEFT_DC_PRED]         = { .needs_left = 1 },
-        [TOP_DC_PRED]          = { .needs_top  = 1 },
-        [DC_128_PRED]          = { 0 },
-        [DC_127_PRED]          = { 0 },
-        [DC_129_PRED]          = { 0 }
-    };
-
-    av_assert2(mode >= 0 && mode < 10);
-    mode = mode_conv[mode][have_left][have_top];
-    if (edges[mode].needs_top) {
-        uint8_t *top = NULL, *topleft = NULL;
-        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
-        int n_px_need_tr = 0;
-
-        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
-            n_px_need_tr = 4;
-
-        // if top of sb64-row, use s->intra_pred_data[] instead of
-        // dst[-stride] for intra prediction (it contains pre- instead of
-        // post-loopfilter data)
-        if (have_top) {
-            top = !(row & 7) && !y ?
-                  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
-                  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
-            if (have_left)
-                topleft = !(row & 7) && !y ?
-                          s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
-                          y == 0 || x == 0 ? &dst_edge[-stride_edge] :
-                          &dst_inner[-stride_inner];
-        }
-
-        if (have_top &&
-            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
-            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
-            n_px_need + n_px_need_tr <= n_px_have) {
-            *a = top;
-        } else {
-            if (have_top) {
-                if (n_px_need <= n_px_have) {
-                    memcpy(*a, top, n_px_need);
-                } else {
-                    memcpy(*a, top, n_px_have);
-                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
-                           n_px_need - n_px_have);
-                }
-            } else {
-                memset(*a, 127, n_px_need);
-            }
-            if (edges[mode].needs_topleft) {
-                if (have_left && have_top)
-                    (*a)[-1] = topleft[-1];
-                else
-                    (*a)[-1] = have_top ? 129 : 127;
-            }
-            if (tx == TX_4X4 && edges[mode].needs_topright) {
-                if (have_top && have_right &&
-                    n_px_need + n_px_need_tr <= n_px_have) {
-                    memcpy(&(*a)[4], &top[4], 4);
-                } else {
-                    memset(&(*a)[4], (*a)[3], 4);
-                }
-            }
-        }
-    }
-    if (edges[mode].needs_left) {
-        if (have_left) {
-            int i;
-            int n_px_need = 4 << tx;
-            int n_px_have = (((s->rows - row) << !p) - y) * 4;
-            uint8_t *dst     = x == 0 ? dst_edge : dst_inner;
-            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
-
-            if (n_px_need <= n_px_have) {
-                for (i = 0; i < n_px_need; i++)
-                    l[i] = dst[i * stride - 1];
-            } else {
-                for (i = 0; i < n_px_have; i++)
-                    l[i] = dst[i * stride - 1];
-                memset(&l[i], l[i - 1], n_px_need - n_px_have);
-            }
-        } else {
-            memset(l, 129, 4 << tx);
-        }
+    /* y tokens */
+    switch (b->tx) {
+    case TX_4X4:
+        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
+        break;
+    case TX_8X8:
+        MERGE_CTX(2, AV_RN16A);
+        DECODE_Y_COEF_LOOP(2, 0,);
+        SPLAT_CTX(2);
+        break;
+    case TX_16X16:
+        MERGE_CTX(4, AV_RN32A);
+        DECODE_Y_COEF_LOOP(4, 0,);
+        SPLAT_CTX(4);
+        break;
+    case TX_32X32:
+        MERGE_CTX(8, AV_RN64A);
+        DECODE_Y_COEF_LOOP(8, 0, 32);
+        SPLAT_CTX(8);
+        break;
     }
 
-    return mode;
-}
-
-static void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
-    AVFrame *f = s->frames[CUR_FRAME].tf.f;
-    int row = b->row, col = b->col;
-    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
-    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
-    int end_x = FFMIN(2 * (s->cols - col), w4);
-    int end_y = FFMIN(2 * (s->rows - row), h4);
-    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
-    int uvstep1d = 1 << b->uvtx, p;
-    uint8_t *dst = b->dst[0], *dst_r = f->data[0] + y_off;
-
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        uint8_t *ptr = dst, *ptr_r = dst_r;
-        for (x = 0; x < end_x;
-             x += step1d, ptr += 4 * step1d, ptr_r += 4 * step1d, n += step) {
-            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
-                               y * 2 + x : 0];
-            LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-            uint8_t *a = &a_buf[16], l[32];
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
-            int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
-
-            mode = check_intra_mode(s, mode, &a, ptr_r,
-                                    f->linesize[0],
-                                    ptr, b->y_stride, l,
-                                    col, x, w4, row, y, b->tx, 0);
-            s->dsp.intra_pred[b->tx][mode](ptr, b->y_stride, l, a);
-            if (eob)
-                s->dsp.itxfm_add[tx][txtp](ptr, b->y_stride,
-                                           s->block + 16 * n, eob);
-        }
-        dst_r += 4 * f->linesize[0] * step1d;
-        dst   += 4 * b->y_stride * step1d;
+#define DECODE_UV_COEF_LOOP(step, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (td, td->uvblock[pl] + 16 * n * bytesperpixel, \
+                                     16 * step * step, c, e, p, a[x] + l[y], \
+                                     uvscan, uvnb, uv_band_counts, qmul[1]); \
+            a[x] = l[y] = !!ret; \
+            total_coeff |= !!ret; \
+            if (step >= 4) { \
+                AV_WN16A(&td->uveob[pl][n], ret); \
+            } else { \
+                td->uveob[pl][n] = ret; \
+            } \
+        } \
     }
 
-    // U/V
-    h4    >>= 1;
-    w4    >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
-    step    = 1 << (b->uvtx * 2);
-    for (p = 0; p < 2; p++) {
-        dst   = b->dst[1 + p];
-        dst_r = f->data[1 + p] + uv_off;
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            uint8_t *ptr = dst, *ptr_r = dst_r;
-            for (x = 0; x < end_x;
-                 x += uvstep1d, ptr += 4 * uvstep1d,
-                 ptr_r += 4 * uvstep1d, n += step) {
-                int mode = b->uvmode;
-                LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-                uint8_t *a = &a_buf[16], l[32];
-                int eob    = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
-                                              : s->uveob[p][n];
-
-                mode = check_intra_mode(s, mode, &a, ptr_r,
-                                        f->linesize[1],
-                                        ptr, b->uv_stride, l,
-                                        col, x, w4, row, y, b->uvtx, p + 1);
-                s->dsp.intra_pred[b->uvtx][mode](ptr, b->uv_stride, l, a);
-                if (eob)
-                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
-                                                    s->uvblock[p] + 16 * n,
-                                                    eob);
-            }
-            dst_r += 4 * uvstep1d * f->linesize[1];
-            dst   += 4 * uvstep1d * b->uv_stride;
+    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
+    c = td->counts.coef[b->uvtx][1 /* uv */][!b->intra];
+    e = td->counts.eob[b->uvtx][1 /* uv */][!b->intra];
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    h4 >>= s->ss_v;
+    end_y >>= s->ss_v;
+    for (pl = 0; pl < 2; pl++) {
+        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
+        l = &td->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
+        switch (b->uvtx) {
+        case TX_4X4:
+            DECODE_UV_COEF_LOOP(1,);
+            break;
+        case TX_8X8:
+            MERGE_CTX(2, AV_RN16A);
+            DECODE_UV_COEF_LOOP(2,);
+            SPLAT_CTX(2);
+            break;
+        case TX_16X16:
+            MERGE_CTX(4, AV_RN32A);
+            DECODE_UV_COEF_LOOP(4,);
+            SPLAT_CTX(4);
+            break;
+        case TX_32X32:
+            MERGE_CTX(8, AV_RN64A);
+            DECODE_UV_COEF_LOOP(8, 32);
+            SPLAT_CTX(8);
+            break;
         }
     }
-}
 
-static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
-                                         uint8_t *dst, ptrdiff_t dst_stride,
-                                         const uint8_t *ref,
-                                         ptrdiff_t ref_stride,
-                                         ThreadFrame *ref_frame,
-                                         ptrdiff_t y, ptrdiff_t x,
-                                         const VP56mv *mv,
-                                         int bw, int bh, int w, int h)
-{
-    int mx = mv->x, my = mv->y;
-    int th;
-
-    y   += my >> 3;
-    x   += mx >> 3;
-    ref += y * ref_stride + x;
-    mx  &= 7;
-    my  &= 7;
-
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
-    // the longest loopfilter of the next sbrow
-    th = (y + bh + 4 * !!my + 7) >> 6;
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
-
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    // The arm/aarch64 _hv filters read one more row than what actually is
-    // needed, so switch to emulated edge one pixel sooner vertically
-    // (!!my * 5) than horizontally (!!mx * 4).
-    if (x < !!mx * 3 || y < !!my * 3 ||
-        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref - !!my * 3 * ref_stride - !!mx * 3,
-                                 80,
-                                 ref_stride,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref        = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        ref_stride = 80;
-    }
-    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
+    return total_coeff;
 }
 
-static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
-                                           uint8_t *dst_u, uint8_t *dst_v,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *ref_u,
-                                           ptrdiff_t src_stride_u,
-                                           const uint8_t *ref_v,
-                                           ptrdiff_t src_stride_v,
-                                           ThreadFrame *ref_frame,
-                                           ptrdiff_t y, ptrdiff_t x,
-                                           const VP56mv *mv,
-                                           int bw, int bh, int w, int h)
+static int decode_coeffs_8bpp(VP9TileData *td)
 {
-    int mx = mv->x, my = mv->y;
-    int th;
-
-    y     += my >> 4;
-    x     += mx >> 4;
-    ref_u += y * src_stride_u + x;
-    ref_v += y * src_stride_v + x;
-    mx    &= 15;
-    my    &= 15;
-
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
-    // the longest loopfilter of the next sbrow
-    th = (y + bh + 4 * !!my + 7) >> 5;
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
-
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    // The arm/aarch64 _hv filters read one more row than what actually is
-    // needed, so switch to emulated edge one pixel sooner vertically
-    // (!!my * 5) than horizontally (!!mx * 4).
-    if (x < !!mx * 3 || y < !!my * 3 ||
-        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
-                                 80,
-                                 src_stride_u,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
-
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
-                                 80,
-                                 src_stride_v,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
-    } else {
-        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
-        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
-    }
+    return decode_coeffs(td, 1);
 }
 
-static int inter_recon(AVCodecContext *avctx)
+static int decode_coeffs_16bpp(VP9TileData *td)
 {
-    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
-        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
-        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
-    };
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
-    int row = b->row, col = b->col;
-
-    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]];
-    ThreadFrame *tref2 = b->comp ? &s->refs[s->refidx[b->ref[1]]] : NULL;
-    AVFrame      *ref1 = tref1->f;
-    AVFrame      *ref2 = tref2 ? tref2->f : NULL;
-
-    int w = avctx->width, h = avctx->height;
-    ptrdiff_t ls_y = b->y_stride, ls_uv = b->uv_stride;
-
-    if (!ref1->data[0] || (b->comp && !ref2->data[0]))
-        return AVERROR_INVALIDDATA;
-
-    // y inter pred
-    if (b->bs > BS_8x8) {
-        if (b->bs == BS_8x4) {
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
-                        b->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
-                            b->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
-            }
-        } else if (b->bs == BS_4x8) {
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
-            }
-        } else {
-            av_assert2(b->bs == BS_4x4);
-
-            // FIXME if two horizontally adjacent blocks have the same MV,
-            // do a w8 instead of a w4 call
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        b->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        b->dst[0] + 4 * ls_y + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            b->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            b->dst[0] + 4 * ls_y + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
-            }
-        }
-    } else {
-        int bwl = bwlog_tab[0][b->bs];
-        int bw  = bwh_tab[0][b->bs][0] * 4;
-        int bh  = bwh_tab[0][b->bs][1] * 4;
-
-        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], b->dst[0], ls_y,
-                    ref1->data[0], ref1->linesize[0], tref1,
-                    row << 3, col << 3, &b->mv[0][0], bw, bh, w, h);
-
-        if (b->comp)
-            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], b->dst[0], ls_y,
-                        ref2->data[0], ref2->linesize[0], tref2,
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
-    }
-
-    // uv inter pred
-    {
-        int bwl = bwlog_tab[1][b->bs];
-        int bw  = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
-        VP56mv mvuv;
-
-        w = (w + 1) >> 1;
-        h = (h + 1) >> 1;
-        if (b->bs > BS_8x8) {
-            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x +
-                                 b->mv[2][0].x + b->mv[3][0].x, 4);
-            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y +
-                                 b->mv[2][0].y + b->mv[3][0].y, 4);
-        } else {
-            mvuv = b->mv[0][0];
-        }
-
-        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
-                      b->dst[1], b->dst[2], ls_uv,
-                      ref1->data[1], ref1->linesize[1],
-                      ref1->data[2], ref1->linesize[2], tref1,
-                      row << 2, col << 2, &mvuv, bw, bh, w, h);
-
-        if (b->comp) {
-            if (b->bs > BS_8x8) {
-                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x +
-                                     b->mv[2][1].x + b->mv[3][1].x, 4);
-                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y +
-                                     b->mv[2][1].y + b->mv[3][1].y, 4);
-            } else {
-                mvuv = b->mv[0][1];
-            }
-            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
-                          b->dst[1], b->dst[2], ls_uv,
-                          ref2->data[1], ref2->linesize[1],
-                          ref2->data[2], ref2->linesize[2], tref2,
-                          row << 2, col << 2, &mvuv, bw, bh, w, h);
-        }
-    }
-
-    if (!b->skip) {
-        /* mostly copied intra_reconn() */
-
-        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
-        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
-        int end_x = FFMIN(2 * (s->cols - col), w4);
-        int end_y = FFMIN(2 * (s->rows - row), h4);
-        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
-        int uvstep1d = 1 << b->uvtx, p;
-        uint8_t *dst = b->dst[0];
-
-        // y itxfm add
-        for (n = 0, y = 0; y < end_y; y += step1d) {
-            uint8_t *ptr = dst;
-            for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
-                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
-
-                if (eob)
-                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, b->y_stride,
-                                                  s->block + 16 * n, eob);
-            }
-            dst += 4 * b->y_stride * step1d;
-        }
-
-        // uv itxfm add
-        h4    >>= 1;
-        w4    >>= 1;
-        end_x >>= 1;
-        end_y >>= 1;
-        step    = 1 << (b->uvtx * 2);
-        for (p = 0; p < 2; p++) {
-            dst = b->dst[p + 1];
-            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-                uint8_t *ptr = dst;
-                for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
-                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
-                                               : s->uveob[p][n];
-                    if (eob)
-                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
-                                                        s->uvblock[p] + 16 * n, eob);
-                }
-                dst += 4 * uvstep1d * b->uv_stride;
-            }
-        }
-    }
-    return 0;
+    return decode_coeffs(td, 0);
 }
 
-static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
+static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
                                         int row_and_7, int col_and_7,
                                         int w, int h, int col_end, int row_end,
                                         enum TxfmMode tx, int skip_inter)
 {
+    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
+    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
+
     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
@@ -1461,14 +1156,14 @@ static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
     // a time, and we only use the topleft block's mode information to set
     // things like block strength. Thus, for any block size smaller than
     // 16x16, ignore the odd portion of the block.
-    if (tx == TX_4X4 && is_uv) {
-        if (h == 1) {
+    if (tx == TX_4X4 && (ss_v | ss_h)) {
+        if (h == ss_v) {
             if (row_and_7 & 1)
                 return;
             if (!row_end)
                 h += 1;
         }
-        if (w == 1) {
+        if (w == ss_h) {
             if (col_and_7 & 1)
                 return;
             if (!col_end)
@@ -1478,44 +1173,36 @@ static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
 
     if (tx == TX_4X4 && !skip_inter) {
         int t = 1 << col_and_7, m_col = (t << w) - t, y;
-        int m_col_odd = (t << (w - 1)) - t;
-
         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
-        if (is_uv) {
-            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 7);
-
-                lflvl->mask[is_uv][0][y][1] |= m_row_8;
-                lflvl->mask[is_uv][0][y][2] |= m_row_4;
-                // for odd lines, if the odd col is not being filtered,
-                // skip odd row also:
-                // .---. <-- a
-                // |   |
-                // |___| <-- b
-                // ^   ^
-                // c   d
-                //
-                // if a/c are even row/col and b/d are odd, and d is skipped,
-                // e.g. right edge of size-66x66.webm, then skip b also (bug)
-                if ((col_end & 1) && (y & 1)) {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
-                } else {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
-                }
+        int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
+
+        for (y = row_and_7; y < h + row_and_7; y++) {
+            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
+
+            mask[0][y][1] |= m_row_8;
+            mask[0][y][2] |= m_row_4;
+            // for odd lines, if the odd col is not being filtered,
+            // skip odd row also:
+            // .---. <-- a
+            // |   |
+            // |___| <-- b
+            // ^   ^
+            // c   d
+            //
+            // if a/c are even row/col and b/d are odd, and d is skipped,
+            // e.g. right edge of size-66x66.webm, then skip b also (bug)
+            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
+                mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
+            } else {
+                mask[1][y][col_mask_id] |= m_col;
             }
-        } else {
-            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 3);
-
-                lflvl->mask[is_uv][0][y][1]           |= m_row_8; // row edge
-                lflvl->mask[is_uv][0][y][2]           |= m_row_4;
-                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
-                lflvl->mask[is_uv][0][y][3]           |= m_col;
-                lflvl->mask[is_uv][1][y][3]           |= m_col;
+            if (!ss_h)
+                mask[0][y][3] |= m_col;
+            if (!ss_v) {
+                if (ss_h && (col_end & 1))
+                    mask[1][y][3] |= (t << (w - 1)) - t;
+                else
+                    mask[1][y][3] |= m_col;
             }
         }
     } else {
@@ -1523,227 +1210,240 @@ static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
 
         if (!skip_inter) {
             int mask_id = (tx == TX_8X8);
-            int l2 = tx + is_uv - 1, step1d = 1 << l2;
+            int l2 = tx + ss_h - 1, step1d;
             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
             int m_row = m_col & masks[l2];
 
             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
             // 8wd loopfilter to prevent going off the visible edge.
-            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
+            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
-                int m_row_8  = m_row - m_row_16;
+                int m_row_8 = m_row - m_row_16;
 
                 for (y = row_and_7; y < h + row_and_7; y++) {
-                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
-                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
+                    mask[0][y][0] |= m_row_16;
+                    mask[0][y][1] |= m_row_8;
                 }
             } else {
                 for (y = row_and_7; y < h + row_and_7; y++)
-                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
+                    mask[0][y][mask_id] |= m_row;
             }
 
-            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
+            l2 = tx + ss_v - 1;
+            step1d = 1 << l2;
+            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
-                    lflvl->mask[is_uv][1][y][0] |= m_col;
+                    mask[1][y][0] |= m_col;
                 if (y - row_and_7 == h - 1)
-                    lflvl->mask[is_uv][1][y][1] |= m_col;
+                    mask[1][y][1] |= m_col;
             } else {
                 for (y = row_and_7; y < h + row_and_7; y += step1d)
-                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
+                    mask[1][y][mask_id] |= m_col;
             }
         } else if (tx != TX_4X4) {
             int mask_id;
 
-            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
-            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
-            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
+            mask_id = (tx == TX_8X8) || (h == ss_v);
+            mask[1][row_and_7][mask_id] |= m_col;
+            mask_id = (tx == TX_8X8) || (w == ss_h);
             for (y = row_and_7; y < h + row_and_7; y++)
-                lflvl->mask[is_uv][0][y][mask_id] |= t;
-        } else if (is_uv) {
-            int t8 = t & 0x01, t4 = t - t8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
-            }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
+                mask[0][y][mask_id] |= t;
         } else {
-            int t8 = t & 0x11, t4 = t - t8;
+            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
 
             for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
+                mask[0][y][2] |= t4;
+                mask[0][y][1] |= t8;
             }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
+            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
         }
     }
 }
 
-int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
-                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
-                        enum BlockLevel bl, enum BlockPartition bp)
+void ff_vp9_decode_block(VP9TileData *td, int row, int col,
+                         VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
+                         enum BlockLevel bl, enum BlockPartition bp)
 {
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
-    AVFrame *f = s->frames[CUR_FRAME].tf.f;
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
     enum BlockSize bs = bl * 3 + bp;
-    int ret, y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
+    int bytesperpixel = s->bytesperpixel;
+    int w4 = ff_vp9_bwh_tab[1][bs][0], h4 = ff_vp9_bwh_tab[1][bs][1], lvl;
     int emu[2];
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
 
-    b->row  = row;
-    b->row7 = row & 7;
-    b->col  = col;
-    b->col7 = col & 7;
+    td->row = row;
+    td->row7 = row & 7;
+    td->col = col;
+    td->col7 = col & 7;
 
-    s->min_mv.x = -(128 + col * 64);
-    s->min_mv.y = -(128 + row * 64);
-    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
-    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
+    td->min_mv.x = -(128 + col * 64);
+    td->min_mv.y = -(128 + row * 64);
+    td->max_mv.x = 128 + (s->cols - col - w4) * 64;
+    td->max_mv.y = 128 + (s->rows - row - h4) * 64;
 
     if (s->pass < 2) {
         b->bs = bs;
         b->bl = bl;
         b->bp = bp;
-        decode_mode(s, b);
-        b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
+        decode_mode(td);
+        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
+                           (s->ss_v && h4 * 2 == (1 << b->tx)));
 
         if (!b->skip) {
-            if ((ret = decode_coeffs(avctx)) < 0)
-                return ret;
-        } else {
-            int pl;
+            int has_coeffs;
 
-            memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
-            memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
-            for (pl = 0; pl < 2; pl++) {
-                memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
-                memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
+            if (bytesperpixel == 1) {
+                has_coeffs = decode_coeffs_8bpp(td);
+            } else {
+                has_coeffs = decode_coeffs_16bpp(td);
+            }
+            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
+                b->skip = 1;
+                memset(&s->above_skip_ctx[col], 1, w4);
+                memset(&td->left_skip_ctx[td->row7], 1, h4);
+            }
+        } else {
+            int row7 = td->row7;
+
+#define SPLAT_ZERO_CTX(v, n) \
+    switch (n) { \
+    case 1:  v = 0;          break; \
+    case 2:  AV_ZERO16(&v);  break; \
+    case 4:  AV_ZERO32(&v);  break; \
+    case 8:  AV_ZERO64(&v);  break; \
+    case 16: AV_ZERO128(&v); break; \
+    }
+#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
+    do { \
+        SPLAT_ZERO_CTX(dir##_y_##var[off * 2], n * 2); \
+        if (s->ss_##dir2) { \
+            SPLAT_ZERO_CTX(dir##_uv_##var[0][off], n); \
+            SPLAT_ZERO_CTX(dir##_uv_##var[1][off], n); \
+        } else { \
+            SPLAT_ZERO_CTX(dir##_uv_##var[0][off * 2], n * 2); \
+            SPLAT_ZERO_CTX(dir##_uv_##var[1][off * 2], n * 2); \
+        } \
+    } while (0)
+
+            switch (w4) {
+            case 1: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 1, h); break;
+            case 2: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 2, h); break;
+            case 4: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 4, h); break;
+            case 8: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 8, h); break;
+            }
+            switch (h4) {
+            case 1: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 1, v); break;
+            case 2: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 2, v); break;
+            case 4: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 4, v); break;
+            case 8: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 8, v); break;
             }
         }
 
         if (s->pass == 1) {
-            s->b++;
-            s->block      += w4 * h4 * 64;
-            s->uvblock[0] += w4 * h4 * 16;
-            s->uvblock[1] += w4 * h4 * 16;
-            s->eob        += w4 * h4 * 4;
-            s->uveob[0]   += w4 * h4;
-            s->uveob[1]   += w4 * h4;
-
-            return 0;
+            s->td[0].b++;
+            s->td[0].block += w4 * h4 * 64 * bytesperpixel;
+            s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
+            s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
+            s->td[0].eob += 4 * w4 * h4;
+            s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+            s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+
+            return;
         }
     }
 
-    /* Emulated overhangs if the stride of the target buffer can't hold.
-     * This allows to support emu-edge and so on even if we have large
-     * block overhangs. */
-    emu[0] = (col + w4) * 8 > f->linesize[0] ||
+    // emulated overhangs if the stride of the target buffer can't hold. This
+    // makes it possible to support emu-edge and so on even if we have large block
+    // overhangs
+    emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
              (row + h4) > s->rows;
-    emu[1] = (col + w4) * 4 > f->linesize[1] ||
+    emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
              (row + h4) > s->rows;
     if (emu[0]) {
-        b->dst[0]   = s->tmp_y;
-        b->y_stride = 64;
+        td->dst[0] = td->tmp_y;
+        td->y_stride = 128;
     } else {
-        b->dst[0]   = f->data[0] + yoff;
-        b->y_stride = f->linesize[0];
+        td->dst[0] = f->data[0] + yoff;
+        td->y_stride = f->linesize[0];
     }
     if (emu[1]) {
-        b->dst[1]    = s->tmp_uv[0];
-        b->dst[2]    = s->tmp_uv[1];
-        b->uv_stride = 32;
+        td->dst[1] = td->tmp_uv[0];
+        td->dst[2] = td->tmp_uv[1];
+        td->uv_stride = 128;
     } else {
-        b->dst[1]    = f->data[1] + uvoff;
-        b->dst[2]    = f->data[2] + uvoff;
-        b->uv_stride = f->linesize[1];
+        td->dst[1] = f->data[1] + uvoff;
+        td->dst[2] = f->data[2] + uvoff;
+        td->uv_stride = f->linesize[1];
     }
     if (b->intra) {
-        intra_recon(avctx, yoff, uvoff);
+        if (s->s.h.bpp > 8) {
+            ff_vp9_intra_recon_16bpp(td, yoff, uvoff);
+        } else {
+            ff_vp9_intra_recon_8bpp(td, yoff, uvoff);
+        }
     } else {
-        if ((ret = inter_recon(avctx)) < 0)
-            return ret;
+        if (s->s.h.bpp > 8) {
+            ff_vp9_inter_recon_16bpp(td);
+        } else {
+            ff_vp9_inter_recon_8bpp(td);
+        }
     }
     if (emu[0]) {
-        int w = FFMIN(s->cols - col, w4) * 8;
-        int h = FFMIN(s->rows - row, h4) * 8;
-        int n, o = 0;
+        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
 
         for (n = 0; o < w; n++) {
             int bw = 64 >> n;
 
             av_assert2(n <= 4);
             if (w & bw) {
-                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o,
-                                         f->linesize[0],
-                                         s->tmp_y + o,
-                                         64, h, 0, 0);
+                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
+                                         td->tmp_y + o * bytesperpixel, 128, h, 0, 0);
                 o += bw;
             }
         }
     }
     if (emu[1]) {
-        int w = FFMIN(s->cols - col, w4) * 4;
-        int h = FFMIN(s->rows - row, h4) * 4;
-        int n, o = 0;
+        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
+        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
 
-        for (n = 1; o < w; n++) {
+        for (n = s->ss_h; o < w; n++) {
             int bw = 64 >> n;
 
             av_assert2(n <= 4);
             if (w & bw) {
-                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o,
-                                         f->linesize[1],
-                                         s->tmp_uv[0] + o,
-                                         32, h, 0, 0);
-                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o,
-                                         f->linesize[2],
-                                         s->tmp_uv[1] + o,
-                                         32, h, 0, 0);
+                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
+                                         td->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
+                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
+                                         td->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
                 o += bw;
             }
         }
     }
 
     // pick filter level and find edges to apply filter to
-    if (s->filter.level &&
-        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
-                                                    [b->mode[3] != ZEROMV]) > 0) {
-        int x_end = FFMIN(s->cols - col, w4);
-        int y_end = FFMIN(s->rows - row, h4);
-        int skip_inter = !b->intra && b->skip;
-
-        for (y = 0; y < h4; y++)
-            memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
-        mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
-        mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
-                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
-                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
-                   b->uvtx, skip_inter);
-
-        if (!s->filter.lim_lut[lvl]) {
-            int sharp = s->filter.sharpness;
-            int limit = lvl;
-
-            if (sharp > 0) {
-                limit >>= (sharp + 3) >> 2;
-                limit   = FFMIN(limit, 9 - sharp);
-            }
-            limit = FFMAX(limit, 1);
-
-            s->filter.lim_lut[lvl]   = limit;
-            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
-        }
+    if (s->s.h.filter.level &&
+        (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
+                                                      [b->mode[3] != ZEROMV]) > 0) {
+        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
+        int skip_inter = !b->intra && b->skip, col7 = td->col7, row7 = td->row7;
+
+        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
+        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
+        if (s->ss_h || s->ss_v)
+            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
+                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
+                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
+                       b->uvtx, skip_inter);
     }
 
     if (s->pass == 2) {
-        s->b++;
-        s->block      += w4 * h4 * 64;
-        s->uvblock[0] += w4 * h4 * 16;
-        s->uvblock[1] += w4 * h4 * 16;
-        s->eob        += w4 * h4 * 4;
-        s->uveob[0]   += w4 * h4;
-        s->uveob[1]   += w4 * h4;
+        s->td[0].b++;
+        s->td[0].block += w4 * h4 * 64 * bytesperpixel;
+        s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
+        s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
+        s->td[0].eob += 4 * w4 * h4;
+        s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+        s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
     }
-
-    return 0;
 }
diff --git a/libavcodec/vp9data.c b/libavcodec/vp9data.c
index 2b67878..7af8a97 100644
--- a/libavcodec/vp9data.c
+++ b/libavcodec/vp9data.c
@@ -2,30 +2,40 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "vp9.h"
 #include "vp9data.h"
 
+const uint8_t ff_vp9_bwh_tab[2][N_BS_SIZES][2] = {
+    {
+        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
+        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
+    }, {
+        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
+        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+    }
+};
+
 const int8_t ff_vp9_partition_tree[3][2] = {
-    { -PARTITION_NONE,                1 }, // '0'
-    {    -PARTITION_H,                2 }, // '10'
-    {    -PARTITION_V, -PARTITION_SPLIT }, // '110', '111'
+    { -PARTITION_NONE, 1 },                      // '0'
+        { -PARTITION_H, 2 },                     // '10'
+            { -PARTITION_V, -PARTITION_SPLIT },  // '110', '111'
 };
 
 const uint8_t ff_vp9_default_kf_partition_probs[4][4][3] = {
@@ -53,25 +63,25 @@ const uint8_t ff_vp9_default_kf_partition_probs[4][4][3] = {
 };
 
 const int8_t ff_vp9_segmentation_tree[7][2] = {
-    {  1,  2 },
-    {  3,  4 },
-    {  5,  6 },
-    { -0, -1 }, // '00x'
-    { -2, -3 }, // '01x'
-    { -4, -5 }, // '10x'
-    { -6, -7 }, // '11x'
+    { 1, 2 },
+        { 3, 4 },
+        { 5, 6 },
+            { -0, -1 },  // '00x'
+            { -2, -3 },  // '01x'
+            { -4, -5 },  // '10x'
+            { -6, -7 },  // '11x'
 };
 
 const int8_t ff_vp9_intramode_tree[9][2] = {
-    {              -DC_PRED,                1 }, // '0'
-    {          -TM_VP8_PRED,                2 }, // '10'
-    {            -VERT_PRED,                3 }, // '110'
-    {                     4,                6 },
-    {             -HOR_PRED,                5 }, // '11100'
-    { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x'
-    {  -DIAG_DOWN_LEFT_PRED,                7 }, // '11110'
-    {       -VERT_LEFT_PRED,                8 }, // '111110'
-    {        -HOR_DOWN_PRED,     -HOR_UP_PRED }, // '111111x'
+    { -DC_PRED, 1 },                                                  // '0'
+        { -TM_VP8_PRED, 2 },                                          // '10'
+            { -VERT_PRED, 3 },                                        // '110'
+                { 4, 6 },
+                    { -HOR_PRED, 5 },                                 // '11100'
+                        { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED },  // '11101x'
+                    { -DIAG_DOWN_LEFT_PRED, 7 },                      // '11110'
+                        { -VERT_LEFT_PRED, 8 },                       // '111110'
+                            { -HOR_DOWN_PRED, -HOR_UP_PRED },         // '111111x'
 };
 
 const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9] = {
@@ -202,14 +212,14 @@ const uint8_t ff_vp9_default_kf_uvmode_probs[10][9] = {
 };
 
 const int8_t ff_vp9_inter_mode_tree[3][2] = {
-    {    -ZEROMV,      1 }, // '0'
-    { -NEARESTMV,      2 }, // '10'
-    {    -NEARMV, -NEWMV }, // '11x'
+    { -ZEROMV, 1 },               // '0'
+        { -NEARESTMV, 2 },        // '10'
+            { -NEARMV, -NEWMV },  // '11x'
 };
 
 const int8_t ff_vp9_filter_tree[2][2] = {
-    { -0,  1 },  // '0'
-    { -1, -2 },  // '1x'
+    { -0,  1 },     // '0'
+        { -1, -2 }, // '1x'
 };
 
 const enum FilterMode ff_vp9_filter_lut[3] = {
@@ -218,74 +228,210 @@ const enum FilterMode ff_vp9_filter_lut[3] = {
     FILTER_8TAP_SHARP,
 };
 
-const int16_t ff_vp9_dc_qlookup[256] = {
-       4,    8,    8,    9,   10,   11,   12,   12,
-      13,   14,   15,   16,   17,   18,   19,   19,
-      20,   21,   22,   23,   24,   25,   26,   26,
-      27,   28,   29,   30,   31,   32,   32,   33,
-      34,   35,   36,   37,   38,   38,   39,   40,
-      41,   42,   43,   43,   44,   45,   46,   47,
-      48,   48,   49,   50,   51,   52,   53,   53,
-      54,   55,   56,   57,   57,   58,   59,   60,
-      61,   62,   62,   63,   64,   65,   66,   66,
-      67,   68,   69,   70,   70,   71,   72,   73,
-      74,   74,   75,   76,   77,   78,   78,   79,
-      80,   81,   81,   82,   83,   84,   85,   85,
-      87,   88,   90,   92,   93,   95,   96,   98,
-      99,  101,  102,  104,  105,  107,  108,  110,
-     111,  113,  114,  116,  117,  118,  120,  121,
-     123,  125,  127,  129,  131,  134,  136,  138,
-     140,  142,  144,  146,  148,  150,  152,  154,
-     156,  158,  161,  164,  166,  169,  172,  174,
-     177,  180,  182,  185,  187,  190,  192,  195,
-     199,  202,  205,  208,  211,  214,  217,  220,
-     223,  226,  230,  233,  237,  240,  243,  247,
-     250,  253,  257,  261,  265,  269,  272,  276,
-     280,  284,  288,  292,  296,  300,  304,  309,
-     313,  317,  322,  326,  330,  335,  340,  344,
-     349,  354,  359,  364,  369,  374,  379,  384,
-     389,  395,  400,  406,  411,  417,  423,  429,
-     435,  441,  447,  454,  461,  467,  475,  482,
-     489,  497,  505,  513,  522,  530,  539,  549,
-     559,  569,  579,  590,  602,  614,  626,  640,
-     654,  668,  684,  700,  717,  736,  755,  775,
-     796,  819,  843,  869,  896,  925,  955,  988,
-    1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+const int16_t ff_vp9_dc_qlookup[3][256] = {
+    {
+            4,     8,     8,     9,    10,    11,    12,    12,
+           13,    14,    15,    16,    17,    18,    19,    19,
+           20,    21,    22,    23,    24,    25,    26,    26,
+           27,    28,    29,    30,    31,    32,    32,    33,
+           34,    35,    36,    37,    38,    38,    39,    40,
+           41,    42,    43,    43,    44,    45,    46,    47,
+           48,    48,    49,    50,    51,    52,    53,    53,
+           54,    55,    56,    57,    57,    58,    59,    60,
+           61,    62,    62,    63,    64,    65,    66,    66,
+           67,    68,    69,    70,    70,    71,    72,    73,
+           74,    74,    75,    76,    77,    78,    78,    79,
+           80,    81,    81,    82,    83,    84,    85,    85,
+           87,    88,    90,    92,    93,    95,    96,    98,
+           99,   101,   102,   104,   105,   107,   108,   110,
+          111,   113,   114,   116,   117,   118,   120,   121,
+          123,   125,   127,   129,   131,   134,   136,   138,
+          140,   142,   144,   146,   148,   150,   152,   154,
+          156,   158,   161,   164,   166,   169,   172,   174,
+          177,   180,   182,   185,   187,   190,   192,   195,
+          199,   202,   205,   208,   211,   214,   217,   220,
+          223,   226,   230,   233,   237,   240,   243,   247,
+          250,   253,   257,   261,   265,   269,   272,   276,
+          280,   284,   288,   292,   296,   300,   304,   309,
+          313,   317,   322,   326,   330,   335,   340,   344,
+          349,   354,   359,   364,   369,   374,   379,   384,
+          389,   395,   400,   406,   411,   417,   423,   429,
+          435,   441,   447,   454,   461,   467,   475,   482,
+          489,   497,   505,   513,   522,   530,   539,   549,
+          559,   569,   579,   590,   602,   614,   626,   640,
+          654,   668,   684,   700,   717,   736,   755,   775,
+          796,   819,   843,   869,   896,   925,   955,   988,
+         1022,  1058,  1098,  1139,  1184,  1232,  1282,  1336,
+    }, {
+            4,     9,    10,    13,    15,    17,    20,    22,
+           25,    28,    31,    34,    37,    40,    43,    47,
+           50,    53,    57,    60,    64,    68,    71,    75,
+           78,    82,    86,    90,    93,    97,   101,   105,
+          109,   113,   116,   120,   124,   128,   132,   136,
+          140,   143,   147,   151,   155,   159,   163,   166,
+          170,   174,   178,   182,   185,   189,   193,   197,
+          200,   204,   208,   212,   215,   219,   223,   226,
+          230,   233,   237,   241,   244,   248,   251,   255,
+          259,   262,   266,   269,   273,   276,   280,   283,
+          287,   290,   293,   297,   300,   304,   307,   310,
+          314,   317,   321,   324,   327,   331,   334,   337,
+          343,   350,   356,   362,   369,   375,   381,   387,
+          394,   400,   406,   412,   418,   424,   430,   436,
+          442,   448,   454,   460,   466,   472,   478,   484,
+          490,   499,   507,   516,   525,   533,   542,   550,
+          559,   567,   576,   584,   592,   601,   609,   617,
+          625,   634,   644,   655,   666,   676,   687,   698,
+          708,   718,   729,   739,   749,   759,   770,   782,
+          795,   807,   819,   831,   844,   856,   868,   880,
+          891,   906,   920,   933,   947,   961,   975,   988,
+         1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+         1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+         1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+         1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+         1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+         1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+         1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+         2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+         2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+         3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+         4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+    }, {
+            4,    12,    18,    25,    33,    41,    50,    60,
+           70,    80,    91,   103,   115,   127,   140,   153,
+          166,   180,   194,   208,   222,   237,   251,   266,
+          281,   296,   312,   327,   343,   358,   374,   390,
+          405,   421,   437,   453,   469,   484,   500,   516,
+          532,   548,   564,   580,   596,   611,   627,   643,
+          659,   674,   690,   706,   721,   737,   752,   768,
+          783,   798,   814,   829,   844,   859,   874,   889,
+          904,   919,   934,   949,   964,   978,   993,  1008,
+         1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+         1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+         1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+         1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+         1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+         1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+         1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+         2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+         2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+         2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+         3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+         3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+         4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+         4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+         5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+         5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+         6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+         6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+         7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+         8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+        10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+        12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+        16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+    }
 };
 
-const int16_t ff_vp9_ac_qlookup[256] = {
-       4,    8,    9,   10,   11,   12,   13,   14,
-      15,   16,   17,   18,   19,   20,   21,   22,
-      23,   24,   25,   26,   27,   28,   29,   30,
-      31,   32,   33,   34,   35,   36,   37,   38,
-      39,   40,   41,   42,   43,   44,   45,   46,
-      47,   48,   49,   50,   51,   52,   53,   54,
-      55,   56,   57,   58,   59,   60,   61,   62,
-      63,   64,   65,   66,   67,   68,   69,   70,
-      71,   72,   73,   74,   75,   76,   77,   78,
-      79,   80,   81,   82,   83,   84,   85,   86,
-      87,   88,   89,   90,   91,   92,   93,   94,
-      95,   96,   97,   98,   99,  100,  101,  102,
-     104,  106,  108,  110,  112,  114,  116,  118,
-     120,  122,  124,  126,  128,  130,  132,  134,
-     136,  138,  140,  142,  144,  146,  148,  150,
-     152,  155,  158,  161,  164,  167,  170,  173,
-     176,  179,  182,  185,  188,  191,  194,  197,
-     200,  203,  207,  211,  215,  219,  223,  227,
-     231,  235,  239,  243,  247,  251,  255,  260,
-     265,  270,  275,  280,  285,  290,  295,  300,
-     305,  311,  317,  323,  329,  335,  341,  347,
-     353,  359,  366,  373,  380,  387,  394,  401,
-     408,  416,  424,  432,  440,  448,  456,  465,
-     474,  483,  492,  501,  510,  520,  530,  540,
-     550,  560,  571,  582,  593,  604,  615,  627,
-     639,  651,  663,  676,  689,  702,  715,  729,
-     743,  757,  771,  786,  801,  816,  832,  848,
-     864,  881,  898,  915,  933,  951,  969,  988,
-    1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
-    1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
-    1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
-    1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+const int16_t ff_vp9_ac_qlookup[3][256] = {
+    {
+            4,     8,     9,    10,    11,    12,    13,    14,
+           15,    16,    17,    18,    19,    20,    21,    22,
+           23,    24,    25,    26,    27,    28,    29,    30,
+           31,    32,    33,    34,    35,    36,    37,    38,
+           39,    40,    41,    42,    43,    44,    45,    46,
+           47,    48,    49,    50,    51,    52,    53,    54,
+           55,    56,    57,    58,    59,    60,    61,    62,
+           63,    64,    65,    66,    67,    68,    69,    70,
+           71,    72,    73,    74,    75,    76,    77,    78,
+           79,    80,    81,    82,    83,    84,    85,    86,
+           87,    88,    89,    90,    91,    92,    93,    94,
+           95,    96,    97,    98,    99,   100,   101,   102,
+          104,   106,   108,   110,   112,   114,   116,   118,
+          120,   122,   124,   126,   128,   130,   132,   134,
+          136,   138,   140,   142,   144,   146,   148,   150,
+          152,   155,   158,   161,   164,   167,   170,   173,
+          176,   179,   182,   185,   188,   191,   194,   197,
+          200,   203,   207,   211,   215,   219,   223,   227,
+          231,   235,   239,   243,   247,   251,   255,   260,
+          265,   270,   275,   280,   285,   290,   295,   300,
+          305,   311,   317,   323,   329,   335,   341,   347,
+          353,   359,   366,   373,   380,   387,   394,   401,
+          408,   416,   424,   432,   440,   448,   456,   465,
+          474,   483,   492,   501,   510,   520,   530,   540,
+          550,   560,   571,   582,   593,   604,   615,   627,
+          639,   651,   663,   676,   689,   702,   715,   729,
+          743,   757,   771,   786,   801,   816,   832,   848,
+          864,   881,   898,   915,   933,   951,   969,   988,
+         1007,  1026,  1046,  1066,  1087,  1108,  1129,  1151,
+         1173,  1196,  1219,  1243,  1267,  1292,  1317,  1343,
+         1369,  1396,  1423,  1451,  1479,  1508,  1537,  1567,
+         1597,  1628,  1660,  1692,  1725,  1759,  1793,  1828,
+    }, {
+            4,     9,    11,    13,    16,    18,    21,    24,
+           27,    30,    33,    37,    40,    44,    48,    51,
+           55,    59,    63,    67,    71,    75,    79,    83,
+           88,    92,    96,   100,   105,   109,   114,   118,
+          122,   127,   131,   136,   140,   145,   149,   154,
+          158,   163,   168,   172,   177,   181,   186,   190,
+          195,   199,   204,   208,   213,   217,   222,   226,
+          231,   235,   240,   244,   249,   253,   258,   262,
+          267,   271,   275,   280,   284,   289,   293,   297,
+          302,   306,   311,   315,   319,   324,   328,   332,
+          337,   341,   345,   349,   354,   358,   362,   367,
+          371,   375,   379,   384,   388,   392,   396,   401,
+          409,   417,   425,   433,   441,   449,   458,   466,
+          474,   482,   490,   498,   506,   514,   523,   531,
+          539,   547,   555,   563,   571,   579,   588,   596,
+          604,   616,   628,   640,   652,   664,   676,   688,
+          700,   713,   725,   737,   749,   761,   773,   785,
+          797,   809,   825,   841,   857,   873,   889,   905,
+          922,   938,   954,   970,   986,  1002,  1018,  1038,
+         1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+         1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+         1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+         1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+         1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+         2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+         2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+         2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+         3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+         4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+         4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+         5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+         6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+    }, {
+            4,    13,    19,    27,    35,    44,    54,    64,
+           75,    87,    99,   112,   126,   139,   154,   168,
+          183,   199,   214,   230,   247,   263,   280,   297,
+          314,   331,   349,   366,   384,   402,   420,   438,
+          456,   475,   493,   511,   530,   548,   567,   586,
+          604,   623,   642,   660,   679,   698,   716,   735,
+          753,   772,   791,   809,   828,   846,   865,   884,
+          902,   920,   939,   957,   976,   994,  1012,  1030,
+         1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+         1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+         1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+         1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+         1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+         1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+         2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+         2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+         2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+         3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+         3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+         4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+         4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+         5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+         6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+         7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+         8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+        10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+        11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+        13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+        16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+        18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+        21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+        25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+    }
 };
 
 const enum TxfmType ff_vp9_intra_txfm_type[14] = {
@@ -417,70 +563,38 @@ const int16_t ff_vp9_row_scan_16x16[256] = {
 };
 
 const int16_t ff_vp9_default_scan_32x32[1024] = {
-       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,
-      67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
-      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,
-     103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
-     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,
-     198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
-     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,
-     321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
-      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,
-     235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
-     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,
-      18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
-      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,
-     450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
-     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,
-      20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
-     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,
-     580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
-     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,
-     210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
-     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,
-      24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
-     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,
-     460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
-     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,
-     709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
-     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,
-     370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
-      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,
-     525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
-     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,
-      31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
-     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,
-      63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
-     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,
-     591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
-     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,
-     684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
-     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,
-     345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
-     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,
-     594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
-     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,
-     284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
-     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,
-     844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
-     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,
-     474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
-     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,
-     412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
-     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,
-     477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
-     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974,
-    1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
-     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,
-     697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
-     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,
-     885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
-     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,
-     949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
-     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,
-     796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
-     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,
-     924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
+       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,   67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
+      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,  103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
+     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,  198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
+     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,  321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
+      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,  235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
+     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,   18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
+      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,  450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
+     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,   20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
+     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,  580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
+     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,  210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
+     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,   24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
+     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,  460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
+     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
+     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,  370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
+      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,  525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
+     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
+     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,   63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
+     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,  591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
+     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,  684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
+     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
+     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,  594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
+     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,  284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
+     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,  844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
+     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,  474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
+     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,  412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
+     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
+     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974, 1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
+     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,  697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
+     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,  885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
+     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,  949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
+     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,  796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
+     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,  924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
 };
 
 const int16_t * const ff_vp9_scans[5][4] = {
@@ -2108,26 +2222,26 @@ const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3] = {
 };
 
 const int8_t ff_vp9_mv_joint_tree[3][2] = {
-    { -MV_JOINT_ZERO,            1 }, // '0'
-    {    -MV_JOINT_H,            2 }, // '10'
-    {    -MV_JOINT_V, -MV_JOINT_HV }, // '11x'
+    { -MV_JOINT_ZERO, 1 },                  // '0'
+        { -MV_JOINT_H, 2 },                 // '10'
+            { -MV_JOINT_V, -MV_JOINT_HV },  // '11x'
 };
 
 const int8_t ff_vp9_mv_class_tree[10][2] = {
-    { -0,   1 }, // '0'
-    { -1,   2 }, // '10'
-    {  3,   4 },
-    { -2,  -3 }, // '110x'
-    {  5,   6 },
-    { -4,  -5 }, // '1110x'
-    { -6,   7 }, // '11110'
-    {  8,   9 },
-    { -7,  -8 }, // '111110x'
-    { -9, -10 }, // '111111x'
+    { -0,   1 },                          // '0'
+        { -1,   2 },                      // '10'
+            {  3,   4 },
+                { -2,  -3 },              // '110x'
+                {  5,   6 },
+                    { -4,  -5 },          // '1110x'
+                    { -6,   7 },          // '11110'
+                        {  8,   9 },
+                            { -7,  -8 },  // '111110x'
+                            { -9, -10 },  // '111111x'
 };
 
 const int8_t ff_vp9_mv_fp_tree[3][2] = {
-    { -0,  1 },   // '0'
-    { -1,  2 },   // '10'
-    { -2, -3 },   // '11x'
+    { -0,  1 },          // '0'
+        { -1,  2 },      // '10'
+            { -2, -3 },  // '11x'
 };
diff --git a/libavcodec/vp9data.h b/libavcodec/vp9data.h
index f9ad911..086dbde 100644
--- a/libavcodec/vp9data.h
+++ b/libavcodec/vp9data.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,9 @@
 
 #include <stdint.h>
 
-#include "vp9.h"
+#include "vp9dec.h"
 
+extern const uint8_t ff_vp9_bwh_tab[2][N_BS_SIZES][2];
 extern const int8_t ff_vp9_partition_tree[3][2];
 extern const uint8_t ff_vp9_default_kf_partition_probs[4][4][3];
 extern const int8_t ff_vp9_segmentation_tree[7][2];
@@ -35,8 +36,8 @@ extern const uint8_t ff_vp9_default_kf_uvmode_probs[10][9];
 extern const int8_t ff_vp9_inter_mode_tree[3][2];
 extern const int8_t ff_vp9_filter_tree[2][2];
 extern const enum FilterMode ff_vp9_filter_lut[3];
-extern const int16_t ff_vp9_dc_qlookup[256];
-extern const int16_t ff_vp9_ac_qlookup[256];
+extern const int16_t ff_vp9_dc_qlookup[3][256];
+extern const int16_t ff_vp9_ac_qlookup[3][256];
 extern const enum TxfmType ff_vp9_intra_txfm_type[14];
 extern const int16_t ff_vp9_default_scan_4x4[16];
 extern const int16_t ff_vp9_col_scan_4x4[16];
diff --git a/libavcodec/vp9dec.h b/libavcodec/vp9dec.h
new file mode 100644
index 0000000..66573ed
--- /dev/null
+++ b/libavcodec/vp9dec.h
@@ -0,0 +1,240 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9DEC_H
+#define AVCODEC_VP9DEC_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdatomic.h>
+
+#include "libavutil/buffer.h"
+#include "libavutil/thread.h"
+#include "libavutil/internal.h"
+
+#include "vp9.h"
+#include "vp9dsp.h"
+#include "vp9shared.h"
+
+enum MVJoint {
+    MV_JOINT_ZERO,
+    MV_JOINT_H,
+    MV_JOINT_V,
+    MV_JOINT_HV,
+};
+
+typedef struct ProbContext {
+    uint8_t y_mode[4][9];
+    uint8_t uv_mode[10][9];
+    uint8_t filter[4][2];
+    uint8_t mv_mode[7][3];
+    uint8_t intra[4];
+    uint8_t comp[5];
+    uint8_t single_ref[5][2];
+    uint8_t comp_ref[5];
+    uint8_t tx32p[2][3];
+    uint8_t tx16p[2][2];
+    uint8_t tx8p[2];
+    uint8_t skip[3];
+    uint8_t mv_joint[3];
+    struct {
+        uint8_t sign;
+        uint8_t classes[10];
+        uint8_t class0;
+        uint8_t bits[10];
+        uint8_t class0_fp[2][3];
+        uint8_t fp[3];
+        uint8_t class0_hp;
+        uint8_t hp;
+    } mv_comp[2];
+    uint8_t partition[4][4][3];
+} ProbContext;
+
+typedef struct VP9Filter {
+    uint8_t level[8 * 8];
+    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
+                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
+} VP9Filter;
+
+typedef struct VP9Block {
+    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
+    enum FilterMode filter;
+    VP56mv mv[4 /* b_idx */][2 /* ref */];
+    enum BlockSize bs;
+    enum TxfmMode tx, uvtx;
+    enum BlockLevel bl;
+    enum BlockPartition bp;
+} VP9Block;
+
+typedef struct VP9TileData VP9TileData;
+
+typedef struct VP9Context {
+    VP9SharedContext s;
+    VP9TileData *td;
+
+    VP9DSPContext dsp;
+    VideoDSPContext vdsp;
+    GetBitContext gb;
+    VP56RangeCoder c;
+    int pass, active_tile_cols;
+
+#if HAVE_THREADS
+    pthread_mutex_t progress_mutex;
+    pthread_cond_t progress_cond;
+    atomic_int *entries;
+#endif
+
+    uint8_t ss_h, ss_v;
+    uint8_t last_bpp, bpp_index, bytesperpixel;
+    uint8_t last_keyframe;
+    // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
+    // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
+    // and are therefore per-stream. pix_fmt represents the value in the header
+    // of the currently processed frame.
+    int w, h;
+    enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
+    unsigned sb_cols, sb_rows, rows, cols;
+    ThreadFrame next_refs[8];
+
+    struct {
+        uint8_t lim_lut[64];
+        uint8_t mblim_lut[64];
+    } filter_lut;
+    struct {
+        ProbContext p;
+        uint8_t coef[4][2][2][6][6][3];
+    } prob_ctx[4];
+    struct {
+        ProbContext p;
+        uint8_t coef[4][2][2][6][6][11];
+    } prob;
+
+    // contextual (above) cache
+    uint8_t *above_partition_ctx;
+    uint8_t *above_mode_ctx;
+    // FIXME maybe merge some of the below in a flags field?
+    uint8_t *above_y_nnz_ctx;
+    uint8_t *above_uv_nnz_ctx[2];
+    uint8_t *above_skip_ctx; // 1bit
+    uint8_t *above_txfm_ctx; // 2bit
+    uint8_t *above_segpred_ctx; // 1bit
+    uint8_t *above_intra_ctx; // 1bit
+    uint8_t *above_comp_ctx; // 1bit
+    uint8_t *above_ref_ctx; // 2bit
+    uint8_t *above_filter_ctx;
+    VP56mv (*above_mv_ctx)[2];
+
+    // whole-frame cache
+    uint8_t *intra_pred_data[3];
+    VP9Filter *lflvl;
+
+    // block reconstruction intermediates
+    int block_alloc_using_2pass;
+    uint16_t mvscale[3][2];
+    uint8_t mvstep[3][2];
+} VP9Context;
+
+struct VP9TileData {
+    //VP9Context should be const, but because of the threading API(generates
+    //a lot of warnings) it's not.
+    VP9Context *s;
+    VP56RangeCoder *c_b;
+    VP56RangeCoder *c;
+    int row, row7, col, col7;
+    uint8_t *dst[3];
+    ptrdiff_t y_stride, uv_stride;
+    VP9Block *b_base, *b;
+    unsigned tile_col_start;
+
+    struct {
+        unsigned y_mode[4][10];
+        unsigned uv_mode[10][10];
+        unsigned filter[4][3];
+        unsigned mv_mode[7][4];
+        unsigned intra[4][2];
+        unsigned comp[5][2];
+        unsigned single_ref[5][2][2];
+        unsigned comp_ref[5][2];
+        unsigned tx32p[2][4];
+        unsigned tx16p[2][3];
+        unsigned tx8p[2][2];
+        unsigned skip[3][2];
+        unsigned mv_joint[4];
+        struct {
+            unsigned sign[2];
+            unsigned classes[11];
+            unsigned class0[2];
+            unsigned bits[10][2];
+            unsigned class0_fp[2][4];
+            unsigned fp[4];
+            unsigned class0_hp[2];
+            unsigned hp[2];
+        } mv_comp[2];
+        unsigned partition[4][4][4];
+        unsigned coef[4][2][2][6][6][3];
+        unsigned eob[4][2][2][6][6][2];
+    } counts;
+
+    // whole-frame cache
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
+
+    // contextual (left) cache
+    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
+    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
+    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
+    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
+    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
+    // block reconstruction intermediates
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
+    struct { int x, y; } min_mv, max_mv;
+    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
+    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
+};
+
+void ff_vp9_fill_mv(VP9TileData *td, VP56mv *mv, int mode, int sb);
+
+void ff_vp9_adapt_probs(VP9Context *s);
+
+void ff_vp9_decode_block(VP9TileData *td, int row, int col,
+                         VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
+                         enum BlockLevel bl, enum BlockPartition bp);
+
+void ff_vp9_loopfilter_sb(AVCodecContext *avctx, VP9Filter *lflvl,
+                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff);
+
+void ff_vp9_intra_recon_8bpp(VP9TileData *td,
+                             ptrdiff_t y_off, ptrdiff_t uv_off);
+void ff_vp9_intra_recon_16bpp(VP9TileData *td,
+                              ptrdiff_t y_off, ptrdiff_t uv_off);
+void ff_vp9_inter_recon_8bpp(VP9TileData *td);
+void ff_vp9_inter_recon_16bpp(VP9TileData *td);
+
+#endif /* AVCODEC_VP9DEC_H */
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index 7f86394..f6d73f7 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -4,1806 +4,30 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
-#include "libavutil/intreadwrite.h"
+#include "vp9dsp.h"
 
-#include "rnd_avg.h"
-#include "vp9.h"
-
-// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
-// back with h264pred.[ch]
-
-static void vert_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    unsigned p4 = AV_RN32A(top);
-
-    AV_WN32A(dst + stride * 0, p4);
-    AV_WN32A(dst + stride * 1, p4);
-    AV_WN32A(dst + stride * 2, p4);
-    AV_WN32A(dst + stride * 3, p4);
-}
-
-static void vert_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8 = AV_RN64A(top);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, p8);
-        dst += stride;
-    }
-}
-
-static void vert_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8a = AV_RN64A(top + 0), p8b = AV_RN64A(top + 8);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, p8a);
-        AV_WN64A(dst + 8, p8b);
-        dst += stride;
-    }
-}
-
-static void vert_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8a = AV_RN64A(top + 0),  p8b = AV_RN64A(top + 8),
-             p8c = AV_RN64A(top + 16), p8d = AV_RN64A(top + 24);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, p8a);
-        AV_WN64A(dst +  8, p8b);
-        AV_WN64A(dst + 16, p8c);
-        AV_WN64A(dst + 24, p8d);
-        dst += stride;
-    }
-}
-
-static void hor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                      const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, left[0] * 0x01010101U);
-    AV_WN32A(dst + stride * 1, left[1] * 0x01010101U);
-    AV_WN32A(dst + stride * 2, left[2] * 0x01010101U);
-    AV_WN32A(dst + stride * 3, left[3] * 0x01010101U);
-}
-
-static void hor_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                      const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, left[y] * 0x0101010101010101ULL);
-        dst += stride;
-    }
-}
-
-static void hor_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                        const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        uint64_t p8 = left[y] * 0x0101010101010101ULL;
-
-        AV_WN64A(dst + 0, p8);
-        AV_WN64A(dst + 8, p8);
-        dst += stride;
-    }
-}
-
-static void hor_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                        const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        uint64_t p8 = left[y] * 0x0101010101010101ULL;
-
-        AV_WN64A(dst +  0, p8);
-        AV_WN64A(dst +  8, p8);
-        AV_WN64A(dst + 16, p8);
-        AV_WN64A(dst + 24, p8);
-        dst += stride;
-    }
-}
-
-static void tm_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 4; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0] = av_clip_uint8(top[0] + l_m_tl);
-        dst[1] = av_clip_uint8(top[1] + l_m_tl);
-        dst[2] = av_clip_uint8(top[2] + l_m_tl);
-        dst[3] = av_clip_uint8(top[3] + l_m_tl);
-        dst   += stride;
-    }
-}
-
-static void tm_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 8; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0] = av_clip_uint8(top[0] + l_m_tl);
-        dst[1] = av_clip_uint8(top[1] + l_m_tl);
-        dst[2] = av_clip_uint8(top[2] + l_m_tl);
-        dst[3] = av_clip_uint8(top[3] + l_m_tl);
-        dst[4] = av_clip_uint8(top[4] + l_m_tl);
-        dst[5] = av_clip_uint8(top[5] + l_m_tl);
-        dst[6] = av_clip_uint8(top[6] + l_m_tl);
-        dst[7] = av_clip_uint8(top[7] + l_m_tl);
-        dst   += stride;
-    }
-}
-
-static void tm_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 16; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0]  = av_clip_uint8(top[0]  + l_m_tl);
-        dst[1]  = av_clip_uint8(top[1]  + l_m_tl);
-        dst[2]  = av_clip_uint8(top[2]  + l_m_tl);
-        dst[3]  = av_clip_uint8(top[3]  + l_m_tl);
-        dst[4]  = av_clip_uint8(top[4]  + l_m_tl);
-        dst[5]  = av_clip_uint8(top[5]  + l_m_tl);
-        dst[6]  = av_clip_uint8(top[6]  + l_m_tl);
-        dst[7]  = av_clip_uint8(top[7]  + l_m_tl);
-        dst[8]  = av_clip_uint8(top[8]  + l_m_tl);
-        dst[9]  = av_clip_uint8(top[9]  + l_m_tl);
-        dst[10] = av_clip_uint8(top[10] + l_m_tl);
-        dst[11] = av_clip_uint8(top[11] + l_m_tl);
-        dst[12] = av_clip_uint8(top[12] + l_m_tl);
-        dst[13] = av_clip_uint8(top[13] + l_m_tl);
-        dst[14] = av_clip_uint8(top[14] + l_m_tl);
-        dst[15] = av_clip_uint8(top[15] + l_m_tl);
-        dst    += stride;
-    }
-}
-
-static void tm_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 32; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0]  = av_clip_uint8(top[0]  + l_m_tl);
-        dst[1]  = av_clip_uint8(top[1]  + l_m_tl);
-        dst[2]  = av_clip_uint8(top[2]  + l_m_tl);
-        dst[3]  = av_clip_uint8(top[3]  + l_m_tl);
-        dst[4]  = av_clip_uint8(top[4]  + l_m_tl);
-        dst[5]  = av_clip_uint8(top[5]  + l_m_tl);
-        dst[6]  = av_clip_uint8(top[6]  + l_m_tl);
-        dst[7]  = av_clip_uint8(top[7]  + l_m_tl);
-        dst[8]  = av_clip_uint8(top[8]  + l_m_tl);
-        dst[9]  = av_clip_uint8(top[9]  + l_m_tl);
-        dst[10] = av_clip_uint8(top[10] + l_m_tl);
-        dst[11] = av_clip_uint8(top[11] + l_m_tl);
-        dst[12] = av_clip_uint8(top[12] + l_m_tl);
-        dst[13] = av_clip_uint8(top[13] + l_m_tl);
-        dst[14] = av_clip_uint8(top[14] + l_m_tl);
-        dst[15] = av_clip_uint8(top[15] + l_m_tl);
-        dst[16] = av_clip_uint8(top[16] + l_m_tl);
-        dst[17] = av_clip_uint8(top[17] + l_m_tl);
-        dst[18] = av_clip_uint8(top[18] + l_m_tl);
-        dst[19] = av_clip_uint8(top[19] + l_m_tl);
-        dst[20] = av_clip_uint8(top[20] + l_m_tl);
-        dst[21] = av_clip_uint8(top[21] + l_m_tl);
-        dst[22] = av_clip_uint8(top[22] + l_m_tl);
-        dst[23] = av_clip_uint8(top[23] + l_m_tl);
-        dst[24] = av_clip_uint8(top[24] + l_m_tl);
-        dst[25] = av_clip_uint8(top[25] + l_m_tl);
-        dst[26] = av_clip_uint8(top[26] + l_m_tl);
-        dst[27] = av_clip_uint8(top[27] + l_m_tl);
-        dst[28] = av_clip_uint8(top[28] + l_m_tl);
-        dst[29] = av_clip_uint8(top[29] + l_m_tl);
-        dst[30] = av_clip_uint8(top[30] + l_m_tl);
-        dst[31] = av_clip_uint8(top[31] + l_m_tl);
-        dst    += stride;
-    }
-}
-
-static void dc_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    top[0]  + top[1]  + top[2]  + top[3]  + 4) >> 3);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    left[4] + left[5] + left[6] + left[7] +
-                    top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    top[0]   + top[1]   + top[2]   + top[3]   +
-                    top[4]   + top[5]   + top[6]   + top[7]   +
-                    top[8]   + top[9]   + top[10]  + top[11]  +
-                    top[12]  + top[13]  + top[14]  + top[15]  + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    left[16] + left[17] + left[18] + left[19] +
-                    left[20] + left[21] + left[22] + left[23] +
-                    left[24] + left[25] + left[26] + left[27] +
-                    left[28] + left[29] + left[30] + left[31] +
-                    top[0]   + top[1]   + top[2]   + top[3]   +
-                    top[4]   + top[5]   + top[6]   + top[7]   +
-                    top[8]   + top[9]   + top[10]  + top[11]  +
-                    top[12]  + top[13]  + top[14]  + top[15]  +
-                    top[16]  + top[17]  + top[18]  + top[19]  +
-                    top[20]  + top[21]  + top[22]  + top[23]  +
-                    top[24]  + top[25]  + top[26]  + top[27]  +
-                    top[28]  + top[29]  + top[30]  + top[31]  + 32) >> 6);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                          const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U *
-                  ((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_left_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                          const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    left[4] + left[5] + left[6] + left[7] + 4) >> 3);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    left[16] + left[17] + left[18] + left[19] +
-                    left[20] + left[21] + left[22] + left[23] +
-                    left[24] + left[25] + left[26] + left[27] +
-                    left[28] + left[29] + left[30] + left[31] + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U * ((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_top_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0] + top[1] + top[2] + top[3] +
-                    top[4] + top[5] + top[6] + top[7] + 4) >> 3);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  +
-                    top[8]  + top[9]  + top[10] + top[11] +
-                    top[12] + top[13] + top[14] + top[15] + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  +
-                    top[8]  + top[9]  + top[10] + top[11] +
-                    top[12] + top[13] + top[14] + top[15] +
-                    top[16] + top[17] + top[18] + top[19] +
-                    top[20] + top[21] + top[22] + top[23] +
-                    top[24] + top[25] + top[26] + top[27] +
-                    top[28] + top[29] + top[30] + top[31] + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_128_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x80808080U);
-    AV_WN32A(dst + stride * 1, 0x80808080U);
-    AV_WN32A(dst + stride * 2, 0x80808080U);
-    AV_WN32A(dst + stride * 3, 0x80808080U);
-}
-
-static void dc_128_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_128_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x8080808080808080ULL);
-        AV_WN64A(dst + 8, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_128_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x8080808080808080ULL);
-        AV_WN64A(dst +  8, 0x8080808080808080ULL);
-        AV_WN64A(dst + 16, 0x8080808080808080ULL);
-        AV_WN64A(dst + 24, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 1, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 2, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 3, 0x7F7F7F7FU);
-}
-
-static void dc_127_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 8, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst +  8, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 16, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 24, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x81818181U);
-    AV_WN32A(dst + stride * 1, 0x81818181U);
-    AV_WN32A(dst + stride * 2, 0x81818181U);
-    AV_WN32A(dst + stride * 3, 0x81818181U);
-}
-
-static void dc_129_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x8181818181818181ULL);
-        AV_WN64A(dst + 8, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x8181818181818181ULL);
-        AV_WN64A(dst +  8, 0x8181818181818181ULL);
-        AV_WN64A(dst + 16, 0x8181818181818181ULL);
-        AV_WN64A(dst + 24, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-#define DST(x, y) dst[(x) + (y) * stride]
-
-static void diag_downleft_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *left, const uint8_t *top)
-{
-    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
-
-    DST(0, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(1, 0) =
-    DST(0, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
-    DST(2, 0) =
-    DST(1, 1) =
-    DST(0, 2) = (a2 + a3 * 2 + a4 + 2) >> 2;
-    DST(3, 0) =
-    DST(2, 1) =
-    DST(1, 2) =
-    DST(0, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
-    DST(3, 1) =
-    DST(2, 2) =
-    DST(1, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
-    DST(3, 2) =
-    DST(2, 3) = (a5 + a6 * 2 + a7 + 2) >> 2;
-    DST(3, 3) = a7;  // note: this is different from vp8 and such
-}
-
-#define def_diag_downleft(size)                                             \
-static void diag_downleft_ ## size ## x ## size ## _c(uint8_t *dst,         \
-                                                      ptrdiff_t stride,     \
-                                                      const uint8_t *left,  \
-                                                      const uint8_t *top)   \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size - 1];                                                    \
-                                                                            \
-    for (i = 0; i < size - 2; i++)                                          \
-        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2;             \
-    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;             \
-                                                                            \
-    for (j = 0; j < size; j++) {                                            \
-        memcpy(dst + j * stride, v + j, size - 1 - j);                      \
-        memset(dst + j * stride + size - 1 - j, top[size - 1], j + 1);      \
-    }                                                                       \
-}
-
-def_diag_downleft(8)
-def_diag_downleft(16)
-def_diag_downleft(32)
-
-static void diag_downright_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *left, const uint8_t *top)
-{
-    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
-
-    DST(0, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
-    DST(0, 2) =
-    DST(1, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 1) =
-    DST(1, 2) =
-    DST(2, 3) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 0) =
-    DST(1, 1) =
-    DST(2, 2) =
-    DST(3, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
-    DST(1, 0) =
-    DST(2, 1) =
-    DST(3, 2) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(2, 0) =
-    DST(3, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(3, 0) = (a1 + a2 * 2 + a3 + 2) >> 2;
-}
-
-#define def_diag_downright(size)                                            \
-static void diag_downright_ ## size ## x ## size ## _c(uint8_t *dst,        \
-                                                       ptrdiff_t stride,    \
-                                                       const uint8_t *left, \
-                                                       const uint8_t *top)  \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size + size - 1];                                             \
-                                                                            \
-    for (i = 0; i < size - 2; i++) {                                        \
-        v[i]            = (left[size - 1 - i] +                             \
-                           left[size - 2 - i] * 2 +                         \
-                           left[size - 3 - i] + 2) >> 2;                    \
-        v[size + 1 + i] = (top[i]             +                             \
-                           top[i + 1]         * 2 +                         \
-                           top[i + 2]         + 2) >> 2;                    \
-    }                                                                       \
-    v[size - 2] = (left[1] + left[0] * 2 + top[-1] + 2) >> 2;               \
-    v[size - 1] = (left[0] + top[-1] * 2 + top[0]  + 2) >> 2;               \
-    v[size]     = (top[-1] + top[0]  * 2 + top[1]  + 2) >> 2;               \
-                                                                            \
-    for (j = 0; j < size; j++)                                              \
-        memcpy(dst + j * stride, v + size - 1 - j, size);                   \
-}
-
-def_diag_downright(8)
-def_diag_downright(16)
-def_diag_downright(32)
-
-static void vert_right_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *left, const uint8_t *top)
-{
-    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        l0 = left[0], l1 = left[1], l2 = left[2];
-
-    DST(0, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 0) =
-    DST(1, 2) = (tl + a0          + 1) >> 1;
-    DST(0, 1) =
-    DST(1, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
-    DST(1, 0) =
-    DST(2, 2) = (a0 + a1          + 1) >> 1;
-    DST(1, 1) =
-    DST(2, 3) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(2, 0) =
-    DST(3, 2) = (a1 + a2          + 1) >> 1;
-    DST(2, 1) =
-    DST(3, 3) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(3, 0) = (a2 + a3          + 1) >> 1;
-    DST(3, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
-}
-
-#define def_vert_right(size)                                                \
-static void vert_right_ ## size ## x ## size ## _c(uint8_t *dst,            \
-                                                   ptrdiff_t stride,        \
-                                                   const uint8_t *left,     \
-                                                   const uint8_t *top)      \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t ve[size + size / 2 - 1], vo[size + size / 2 - 1];               \
-                                                                            \
-    for (i = 0; i < size / 2 - 2; i++) {                                    \
-        vo[i] = (left[size - 4 - i * 2] +                                   \
-                 left[size - 3 - i * 2] * 2 +                               \
-                 left[size - 2 - i * 2] + 2) >> 2;                          \
-        ve[i] = (left[size - 5 - i * 2] +                                   \
-                 left[size - 4 - i * 2] * 2 +                               \
-                 left[size - 3 - i * 2] + 2) >> 2;                          \
-    }                                                                       \
-    vo[size / 2 - 2] = (left[0] + left[1] * 2 + left[2] + 2) >> 2;          \
-    ve[size / 2 - 2] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;          \
-                                                                            \
-    ve[size / 2 - 1] = (top[-1] + top[0] + 1) >> 1;                         \
-    vo[size / 2 - 1] = (left[0] + top[-1] * 2 + top[0] + 2) >> 2;           \
-    for (i = 0; i < size - 1; i++) {                                        \
-        ve[size / 2 + i] = (top[i] + top[i + 1] + 1) >> 1;                  \
-        vo[size / 2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
-    }                                                                       \
-                                                                            \
-    for (j = 0; j < size / 2; j++) {                                        \
-        memcpy(dst +  j * 2      * stride, ve + size / 2 - 1 - j, size);    \
-        memcpy(dst + (j * 2 + 1) * stride, vo + size / 2 - 1 - j, size);    \
-    }                                                                       \
-}
-
-def_vert_right(8)
-def_vert_right(16)
-def_vert_right(32)
-
-static void hor_down_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3],
-        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
-
-    DST(2, 0) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(3, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(0, 0) =
-    DST(2, 1) = (tl + l0          + 1) >> 1;
-    DST(1, 0) =
-    DST(3, 1) = (a0 + tl * 2 + l0 + 2) >> 2;
-    DST(0, 1) =
-    DST(2, 2) = (l0 + l1          + 1) >> 1;
-    DST(1, 1) =
-    DST(3, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 2) =
-    DST(2, 3) = (l1 + l2          + 1) >> 1;
-    DST(1, 2) =
-    DST(3, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 3) = (l2 + l3          + 1) >> 1;
-    DST(1, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
-}
-
-#define def_hor_down(size)                                              \
-static void hor_down_ ## size ## x ## size ## _c(uint8_t *dst,          \
-                                                 ptrdiff_t stride,      \
-                                                 const uint8_t *left,   \
-                                                 const uint8_t *top)    \
-{                                                                       \
-    int i, j;                                                           \
-    uint8_t v[size * 3 - 2];                                            \
-                                                                        \
-    for (i = 0; i < size - 2; i++) {                                    \
-        v[i * 2]        = (left[size - 2 - i] +                         \
-                           left[size - 1 - i] + 1) >> 1;                \
-        v[i * 2    + 1] = (left[size - 3 - i] +                         \
-                           left[size - 2 - i] * 2 +                     \
-                           left[size - 1 - i] + 2) >> 2;                \
-        v[size * 2 + i] = (top[i - 1] +                                 \
-                           top[i] * 2 +                                 \
-                           top[i + 1] + 2) >> 2;                        \
-    }                                                                   \
-    v[size * 2 - 2] = (top[-1] + left[0] + 1) >> 1;                     \
-    v[size * 2 - 4] = (left[0] + left[1] + 1) >> 1;                     \
-    v[size * 2 - 1] = (top[0]  + top[-1] * 2 + left[0] + 2) >> 2;       \
-    v[size * 2 - 3] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;       \
-                                                                        \
-    for (j = 0; j < size; j++)                                          \
-        memcpy(dst + j * stride, v + size * 2 - 2 - j * 2, size);       \
-}
-
-def_hor_down(8)
-def_hor_down(16)
-def_hor_down(32)
-
-static void vert_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        a4 = top[4], a5 = top[5], a6 = top[6];
-
-    DST(0, 0) = (a0 + a1          + 1) >> 1;
-    DST(0, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(1, 0) =
-    DST(0, 2) = (a1 + a2          + 1) >> 1;
-    DST(1, 1) =
-    DST(0, 3) = (a1 + a2 * 2 + a3 + 2) >> 2;
-    DST(2, 0) =
-    DST(1, 2) = (a2 + a3          + 1) >> 1;
-    DST(2, 1) =
-    DST(1, 3) = (a2 + a3 * 2 + a4 + 2) >> 2;
-    DST(3, 0) =
-    DST(2, 2) = (a3 + a4          + 1) >> 1;
-    DST(3, 1) =
-    DST(2, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
-    DST(3, 2) = (a4 + a5          + 1) >> 1;
-    DST(3, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
-}
-
-#define def_vert_left(size)                                             \
-static void vert_left_ ## size ## x ## size ## _c(uint8_t *dst,         \
-                                                  ptrdiff_t stride,     \
-                                                  const uint8_t *left,  \
-                                                  const uint8_t *top)   \
-{                                                                       \
-    int i, j;                                                           \
-    uint8_t ve[size - 1], vo[size - 1];                                 \
-                                                                        \
-    for (i = 0; i < size - 2; i++) {                                    \
-        ve[i] = (top[i] + top[i + 1] + 1) >> 1;                         \
-        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2;        \
-    }                                                                   \
-    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1;            \
-    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;        \
-                                                                        \
-    for (j = 0; j < size / 2; j++) {                                    \
-        memcpy(dst +  j * 2      * stride, ve + j, size - (j + 1));     \
-        memset(dst +  j * 2      * stride + size - j - 1,               \
-               top[size - 1], j + 1);                                   \
-        memcpy(dst + (j * 2 + 1) * stride, vo + j, size - (j + 1));     \
-        memset(dst + (j * 2 + 1) * stride + size - j - 1,               \
-               top[size - 1], j + 1);                                   \
-    }                                                                   \
-}
-
-def_vert_left(8)
-def_vert_left(16)
-def_vert_left(32)
-
-static void hor_up_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
-
-    DST(0, 0) = (l0 + l1          + 1) >> 1;
-    DST(1, 0) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 1) =
-    DST(2, 0) = (l1 + l2          + 1) >> 1;
-    DST(1, 1) =
-    DST(3, 0) = (l1 + l2 * 2 + l3 + 2) >> 2;
-    DST(0, 2) =
-    DST(2, 1) = (l2 + l3          + 1) >> 1;
-    DST(1, 2) =
-    DST(3, 1) = (l2 + l3 * 3      + 2) >> 2;
-    DST(0, 3) =
-    DST(1, 3) =
-    DST(2, 2) =
-    DST(2, 3) =
-    DST(3, 2) =
-    DST(3, 3) = l3;
-}
-
-#define def_hor_up(size)                                                    \
-static void hor_up_ ## size ## x ## size ## _c(uint8_t *dst,                \
-                                               ptrdiff_t stride,            \
-                                               const uint8_t *left,         \
-                                               const uint8_t *top)          \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size * 2 - 2];                                                \
-                                                                            \
-    for (i = 0; i < size - 2; i++) {                                        \
-        v[i * 2]     = (left[i] + left[i + 1] + 1) >> 1;                    \
-        v[i * 2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2;  \
-    }                                                                       \
-    v[size * 2 - 4] = (left[size - 2] + left[size - 1]     + 1) >> 1;       \
-    v[size * 2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2;       \
-                                                                            \
-    for (j = 0; j < size / 2; j++)                                          \
-        memcpy(dst + j * stride, v + j * 2, size);                          \
-    for (j = size / 2; j < size; j++) {                                     \
-        memcpy(dst + j * stride, v + j * 2, size * 2 - 2 - j * 2);          \
-        memset(dst + j * stride + size * 2 - 2 - j * 2, left[size - 1],     \
-               2 + j * 2 - size);                                           \
-    }                                                                       \
-}
-
-def_hor_up(8)
-def_hor_up(16)
-def_hor_up(32)
-
-#undef DST
-
-static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
-{
-#define init_intra_pred(tx, sz)                                              \
-    dsp->intra_pred[tx][VERT_PRED]            = vert_           ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_PRED]             = hor_            ## sz ## _c; \
-    dsp->intra_pred[tx][DC_PRED]              = dc_             ## sz ## _c; \
-    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_  ## sz ## _c; \
-    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_ ## sz ## _c; \
-    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_     ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_       ## sz ## _c; \
-    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_      ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_         ## sz ## _c; \
-    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_             ## sz ## _c; \
-    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_        ## sz ## _c; \
-    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_         ## sz ## _c
-
-    init_intra_pred(TX_4X4,   4x4);
-    init_intra_pred(TX_8X8,   8x8);
-    init_intra_pred(TX_16X16, 16x16);
-    init_intra_pred(TX_32X32, 32x32);
-
-#undef init_intra_pred
-}
-
-#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly)                 \
-static void                                                                 \
-type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
-                                                      ptrdiff_t stride,     \
-                                                      int16_t *block,       \
-                                                      int eob)              \
-{                                                                           \
-    int i, j;                                                               \
-    int16_t tmp[sz * sz], out[sz];                                          \
-                                                                            \
-    if (has_dconly && eob == 1) {                                           \
-        const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)              \
-                                   * 11585 + (1 << 13)) >> 14;              \
-        block[0] = 0;                                                       \
-        for (i = 0; i < sz; i++) {                                          \
-            for (j = 0; j < sz; j++)                                        \
-                dst[j * stride] =                                           \
-                    av_clip_uint8(dst[j * stride] +                         \
-                                  (bits ? (t + (1 << (bits - 1))) >> bits   \
-                                        : t));                              \
-            dst++;                                                          \
-        }                                                                   \
-        return;                                                             \
-    }                                                                       \
-                                                                            \
-    for (i = 0; i < sz; i++)                                                \
-        type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);                \
-    memset(block, 0, sz * sz * sizeof(*block));                             \
-    for (i = 0; i < sz; i++) {                                              \
-        type_b ## sz ## _1d(out, tmp + i, sz, 1);                           \
-        for (j = 0; j < sz; j++)                                            \
-            dst[j * stride] =                                               \
-                av_clip_uint8(dst[j * stride] +                             \
-                              (bits ? (out[j] + (1 << (bits - 1))) >> bits  \
-                                    : out[j]));                             \
-        dst++;                                                              \
-    }                                                                       \
-}
-
-#define itxfm_wrap(sz, bits)                 \
-    itxfm_wrapper(idct,  idct,  sz, bits, 1) \
-    itxfm_wrapper(iadst, idct,  sz, bits, 0) \
-    itxfm_wrapper(idct,  iadst, sz, bits, 0) \
-    itxfm_wrapper(iadst, iadst, sz, bits, 0)
-
-#define IN(x) in[x * stride]
-
-static av_always_inline void idct4_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3;
-
-    t0 = ((IN(0)        + IN(2)) * 11585 + (1 << 13)) >> 14;
-    t1 = ((IN(0)        - IN(2)) * 11585 + (1 << 13)) >> 14;
-    t2 = (IN(1) *  6270 - IN(3)  * 15137 + (1 << 13)) >> 14;
-    t3 = (IN(1) * 15137 + IN(3)  *  6270 + (1 << 13)) >> 14;
-
-    out[0] = t0 + t3;
-    out[1] = t1 + t2;
-    out[2] = t1 - t2;
-    out[3] = t0 - t3;
-}
-
-static av_always_inline void iadst4_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3;
-
-    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
-    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
-    t2 = 13377 * (IN(0) - IN(2) + IN(3));
-    t3 = 13377 * IN(1);
-
-    out[0] = (t0 + t3      + (1 << 13)) >> 14;
-    out[1] = (t1 + t3      + (1 << 13)) >> 14;
-    out[2] = (t2           + (1 << 13)) >> 14;
-    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
-}
-
-itxfm_wrap(4, 4)
-
-static av_always_inline void idct8_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
-
-    t0a = ((IN(0)        + IN(4)) * 11585 + (1 << 13)) >> 14;
-    t1a = ((IN(0)        - IN(4)) * 11585 + (1 << 13)) >> 14;
-    t2a = (IN(2) *  6270 - IN(6)  * 15137 + (1 << 13)) >> 14;
-    t3a = (IN(2) * 15137 + IN(6)  *  6270 + (1 << 13)) >> 14;
-    t4a = (IN(1) *  3196 - IN(7)  * 16069 + (1 << 13)) >> 14;
-    t5a = (IN(5) * 13623 - IN(3)  *  9102 + (1 << 13)) >> 14;
-    t6a = (IN(5) *  9102 + IN(3)  * 13623 + (1 << 13)) >> 14;
-    t7a = (IN(1) * 16069 + IN(7)  *  3196 + (1 << 13)) >> 14;
-
-    t0  = t0a + t3a;
-    t1  = t1a + t2a;
-    t2  = t1a - t2a;
-    t3  = t0a - t3a;
-    t4  = t4a + t5a;
-    t5a = t4a - t5a;
-    t7  = t7a + t6a;
-    t6a = t7a - t6a;
-
-    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
-    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
-
-    out[0] = t0 + t7;
-    out[1] = t1 + t6;
-    out[2] = t2 + t5;
-    out[3] = t3 + t4;
-    out[4] = t3 - t4;
-    out[5] = t2 - t5;
-    out[6] = t1 - t6;
-    out[7] = t0 - t7;
-}
-
-static av_always_inline void iadst8_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
-
-    t0a = 16305 * IN(7) +  1606 * IN(0);
-    t1a =  1606 * IN(7) - 16305 * IN(0);
-    t2a = 14449 * IN(5) +  7723 * IN(2);
-    t3a =  7723 * IN(5) - 14449 * IN(2);
-    t4a = 10394 * IN(3) + 12665 * IN(4);
-    t5a = 12665 * IN(3) - 10394 * IN(4);
-    t6a =  4756 * IN(1) + 15679 * IN(6);
-    t7a = 15679 * IN(1) -  4756 * IN(6);
-
-    t0  = (t0a + t4a + (1 << 13)) >> 14;
-    t1  = (t1a + t5a + (1 << 13)) >> 14;
-    t2  = (t2a + t6a + (1 << 13)) >> 14;
-    t3  = (t3a + t7a + (1 << 13)) >> 14;
-    t4  = (t0a - t4a + (1 << 13)) >> 14;
-    t5  = (t1a - t5a + (1 << 13)) >> 14;
-    t6  = (t2a - t6a + (1 << 13)) >> 14;
-    t7  = (t3a - t7a + (1 << 13)) >> 14;
-
-    t4a = 15137 * t4 +  6270 * t5;
-    t5a =  6270 * t4 - 15137 * t5;
-    t6a = 15137 * t7 -  6270 * t6;
-    t7a =  6270 * t7 + 15137 * t6;
-
-    out[0] =   t0 + t2;
-    out[7] = -(t1 + t3);
-    t2     =   t0 - t2;
-    t3     =   t1 - t3;
-
-    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
-    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
-    t6     =   (t4a - t6a + (1 << 13)) >> 14;
-    t7     =   (t5a - t7a + (1 << 13)) >> 14;
-
-    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
-    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
-    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
-    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
-}
-
-itxfm_wrap(8, 5)
-
-static av_always_inline void idct16_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
-    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
-    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
-
-    t0a  = ((IN(0)         + IN(8)) * 11585 + (1 << 13)) >> 14;
-    t1a  = ((IN(0)         - IN(8)) * 11585 + (1 << 13)) >> 14;
-    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
-    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
-    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
-    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
-    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
-    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
-    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
-    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
-    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
-    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
-    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
-    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
-    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
-    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
-
-    t0   = t0a  + t3a;
-    t1   = t1a  + t2a;
-    t2   = t1a  - t2a;
-    t3   = t0a  - t3a;
-    t4   = t4a  + t5a;
-    t5   = t4a  - t5a;
-    t6   = t7a  - t6a;
-    t7   = t7a  + t6a;
-    t8   = t8a  + t9a;
-    t9   = t8a  - t9a;
-    t10  = t11a - t10a;
-    t11  = t11a + t10a;
-    t12  = t12a + t13a;
-    t13  = t12a - t13a;
-    t14  = t15a - t14a;
-    t15  = t15a + t14a;
-
-    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
-    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
-    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
-    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
-    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
-    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
-
-    t0a  = t0   + t7;
-    t1a  = t1   + t6a;
-    t2a  = t2   + t5a;
-    t3a  = t3   + t4;
-    t4   = t3   - t4;
-    t5   = t2   - t5a;
-    t6   = t1   - t6a;
-    t7   = t0   - t7;
-    t8a  = t8   + t11;
-    t9   = t9a  + t10a;
-    t10  = t9a  - t10a;
-    t11a = t8   - t11;
-    t12a = t15  - t12;
-    t13  = t14a - t13a;
-    t14  = t14a + t13a;
-    t15a = t15  + t12;
-
-    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
-    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
-    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
-    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
-
-    out[0]  = t0a + t15a;
-    out[1]  = t1a + t14;
-    out[2]  = t2a + t13a;
-    out[3]  = t3a + t12;
-    out[4]  = t4  + t11;
-    out[5]  = t5  + t10a;
-    out[6]  = t6  + t9;
-    out[7]  = t7  + t8a;
-    out[8]  = t7  - t8a;
-    out[9]  = t6  - t9;
-    out[10] = t5  - t10a;
-    out[11] = t4  - t11;
-    out[12] = t3a - t12;
-    out[13] = t2a - t13a;
-    out[14] = t1a - t14;
-    out[15] = t0a - t15a;
-}
-
-static av_always_inline void iadst16_1d(int16_t *out, const int16_t *in,
-                                        ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
-    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
-    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
-
-    t0  = IN(15) * 16364 + IN(0)  *   804;
-    t1  = IN(15) *   804 - IN(0)  * 16364;
-    t2  = IN(13) * 15893 + IN(2)  *  3981;
-    t3  = IN(13) *  3981 - IN(2)  * 15893;
-    t4  = IN(11) * 14811 + IN(4)  *  7005;
-    t5  = IN(11) *  7005 - IN(4)  * 14811;
-    t6  = IN(9)  * 13160 + IN(6)  *  9760;
-    t7  = IN(9)  *  9760 - IN(6)  * 13160;
-    t8  = IN(7)  * 11003 + IN(8)  * 12140;
-    t9  = IN(7)  * 12140 - IN(8)  * 11003;
-    t10 = IN(5)  *  8423 + IN(10) * 14053;
-    t11 = IN(5)  * 14053 - IN(10) *  8423;
-    t12 = IN(3)  *  5520 + IN(12) * 15426;
-    t13 = IN(3)  * 15426 - IN(12) *  5520;
-    t14 = IN(1)  *  2404 + IN(14) * 16207;
-    t15 = IN(1)  * 16207 - IN(14) *  2404;
-
-    t0a  = (t0 + t8  + (1 << 13)) >> 14;
-    t1a  = (t1 + t9  + (1 << 13)) >> 14;
-    t2a  = (t2 + t10 + (1 << 13)) >> 14;
-    t3a  = (t3 + t11 + (1 << 13)) >> 14;
-    t4a  = (t4 + t12 + (1 << 13)) >> 14;
-    t5a  = (t5 + t13 + (1 << 13)) >> 14;
-    t6a  = (t6 + t14 + (1 << 13)) >> 14;
-    t7a  = (t7 + t15 + (1 << 13)) >> 14;
-    t8a  = (t0 - t8  + (1 << 13)) >> 14;
-    t9a  = (t1 - t9  + (1 << 13)) >> 14;
-    t10a = (t2 - t10 + (1 << 13)) >> 14;
-    t11a = (t3 - t11 + (1 << 13)) >> 14;
-    t12a = (t4 - t12 + (1 << 13)) >> 14;
-    t13a = (t5 - t13 + (1 << 13)) >> 14;
-    t14a = (t6 - t14 + (1 << 13)) >> 14;
-    t15a = (t7 - t15 + (1 << 13)) >> 14;
-
-    t8   = t8a  * 16069 + t9a  *  3196;
-    t9   = t8a  *  3196 - t9a  * 16069;
-    t10  = t10a *  9102 + t11a * 13623;
-    t11  = t10a * 13623 - t11a *  9102;
-    t12  = t13a * 16069 - t12a *  3196;
-    t13  = t13a *  3196 + t12a * 16069;
-    t14  = t15a *  9102 - t14a * 13623;
-    t15  = t15a * 13623 + t14a *  9102;
-
-    t0   = t0a  + t4a;
-    t1   = t1a  + t5a;
-    t2   = t2a  + t6a;
-    t3   = t3a  + t7a;
-    t4   = t0a  - t4a;
-    t5   = t1a  - t5a;
-    t6   = t2a  - t6a;
-    t7   = t3a  - t7a;
-    t8a  = (t8  + t12 + (1 << 13)) >> 14;
-    t9a  = (t9  + t13 + (1 << 13)) >> 14;
-    t10a = (t10 + t14 + (1 << 13)) >> 14;
-    t11a = (t11 + t15 + (1 << 13)) >> 14;
-    t12a = (t8  - t12 + (1 << 13)) >> 14;
-    t13a = (t9  - t13 + (1 << 13)) >> 14;
-    t14a = (t10 - t14 + (1 << 13)) >> 14;
-    t15a = (t11 - t15 + (1 << 13)) >> 14;
-
-    t4a  = t4   * 15137 + t5   *  6270;
-    t5a  = t4   *  6270 - t5   * 15137;
-    t6a  = t7   * 15137 - t6   *  6270;
-    t7a  = t7   *  6270 + t6   * 15137;
-    t12  = t12a * 15137 + t13a *  6270;
-    t13  = t12a *  6270 - t13a * 15137;
-    t14  = t15a * 15137 - t14a *  6270;
-    t15  = t15a *  6270 + t14a * 15137;
-
-    out[0]  =     t0 + t2;
-    out[15] =   -(t1 + t3);
-    t2a     =     t0 - t2;
-    t3a     =     t1 - t3;
-    out[3]  = -((t4a + t6a + (1 << 13)) >> 14);
-    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
-    t6      =   (t4a - t6a + (1 << 13)) >> 14;
-    t7      =   (t5a - t7a + (1 << 13)) >> 14;
-    out[1]  =  -(t8a + t10a);
-    out[14] =    t9a + t11a;
-    t10     =    t8a - t10a;
-    t11     =    t9a - t11a;
-    out[2]  =   (t12 + t14 + (1 << 13)) >> 14;
-    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
-    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
-    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
-
-    out[7]  = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
-    out[8]  = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
-    out[4]  = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
-    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
-    out[6]  = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
-    out[9]  = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
-    out[5]  = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
-    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
-}
-
-itxfm_wrap(16, 6)
-
-static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0a  = ((IN(0)         + IN(16)) * 11585 + (1 << 13)) >> 14;
-    int t1a  = ((IN(0)         - IN(16)) * 11585 + (1 << 13)) >> 14;
-    int t2a  = (IN(8)  *  6270 - IN(24)  * 15137 + (1 << 13)) >> 14;
-    int t3a  = (IN(8)  * 15137 + IN(24)  *  6270 + (1 << 13)) >> 14;
-    int t4a  = (IN(4)  *  3196 - IN(28)  * 16069 + (1 << 13)) >> 14;
-    int t7a  = (IN(4)  * 16069 + IN(28)  *  3196 + (1 << 13)) >> 14;
-    int t5a  = (IN(20) * 13623 - IN(12)  *  9102 + (1 << 13)) >> 14;
-    int t6a  = (IN(20) *  9102 + IN(12)  * 13623 + (1 << 13)) >> 14;
-    int t8a  = (IN(2)  *  1606 - IN(30)  * 16305 + (1 << 13)) >> 14;
-    int t15a = (IN(2)  * 16305 + IN(30)  *  1606 + (1 << 13)) >> 14;
-    int t9a  = (IN(18) * 12665 - IN(14)  * 10394 + (1 << 13)) >> 14;
-    int t14a = (IN(18) * 10394 + IN(14)  * 12665 + (1 << 13)) >> 14;
-    int t10a = (IN(10) *  7723 - IN(22)  * 14449 + (1 << 13)) >> 14;
-    int t13a = (IN(10) * 14449 + IN(22)  *  7723 + (1 << 13)) >> 14;
-    int t11a = (IN(26) * 15679 - IN(6)   *  4756 + (1 << 13)) >> 14;
-    int t12a = (IN(26) *  4756 + IN(6)   * 15679 + (1 << 13)) >> 14;
-    int t16a = (IN(1)  *   804 - IN(31)  * 16364 + (1 << 13)) >> 14;
-    int t31a = (IN(1)  * 16364 + IN(31)  *   804 + (1 << 13)) >> 14;
-    int t17a = (IN(17) * 12140 - IN(15)  * 11003 + (1 << 13)) >> 14;
-    int t30a = (IN(17) * 11003 + IN(15)  * 12140 + (1 << 13)) >> 14;
-    int t18a = (IN(9)  *  7005 - IN(23)  * 14811 + (1 << 13)) >> 14;
-    int t29a = (IN(9)  * 14811 + IN(23)  *  7005 + (1 << 13)) >> 14;
-    int t19a = (IN(25) * 15426 - IN(7)   *  5520 + (1 << 13)) >> 14;
-    int t28a = (IN(25) *  5520 + IN(7)   * 15426 + (1 << 13)) >> 14;
-    int t20a = (IN(5)  *  3981 - IN(27)  * 15893 + (1 << 13)) >> 14;
-    int t27a = (IN(5)  * 15893 + IN(27)  *  3981 + (1 << 13)) >> 14;
-    int t21a = (IN(21) * 14053 - IN(11)  *  8423 + (1 << 13)) >> 14;
-    int t26a = (IN(21) *  8423 + IN(11)  * 14053 + (1 << 13)) >> 14;
-    int t22a = (IN(13) *  9760 - IN(19)  * 13160 + (1 << 13)) >> 14;
-    int t25a = (IN(13) * 13160 + IN(19)  *  9760 + (1 << 13)) >> 14;
-    int t23a = (IN(29) * 16207 - IN(3)   *  2404 + (1 << 13)) >> 14;
-    int t24a = (IN(29) *  2404 + IN(3)   * 16207 + (1 << 13)) >> 14;
-
-    int t0  = t0a  + t3a;
-    int t1  = t1a  + t2a;
-    int t2  = t1a  - t2a;
-    int t3  = t0a  - t3a;
-    int t4  = t4a  + t5a;
-    int t5  = t4a  - t5a;
-    int t6  = t7a  - t6a;
-    int t7  = t7a  + t6a;
-    int t8  = t8a  + t9a;
-    int t9  = t8a  - t9a;
-    int t10 = t11a - t10a;
-    int t11 = t11a + t10a;
-    int t12 = t12a + t13a;
-    int t13 = t12a - t13a;
-    int t14 = t15a - t14a;
-    int t15 = t15a + t14a;
-    int t16 = t16a + t17a;
-    int t17 = t16a - t17a;
-    int t18 = t19a - t18a;
-    int t19 = t19a + t18a;
-    int t20 = t20a + t21a;
-    int t21 = t20a - t21a;
-    int t22 = t23a - t22a;
-    int t23 = t23a + t22a;
-    int t24 = t24a + t25a;
-    int t25 = t24a - t25a;
-    int t26 = t27a - t26a;
-    int t27 = t27a + t26a;
-    int t28 = t28a + t29a;
-    int t29 = t28a - t29a;
-    int t30 = t31a - t30a;
-    int t31 = t31a + t30a;
-
-    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
-    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
-    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
-    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
-    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
-    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
-    t17a =   (t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
-    t30a =   (t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
-    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
-    t29a =   (t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
-    t21a =   (t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
-    t26a =   (t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
-    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
-    t25a =   (t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
-
-    t0a  = t0   + t7;
-    t1a  = t1   + t6a;
-    t2a  = t2   + t5a;
-    t3a  = t3   + t4;
-    t4a  = t3   - t4;
-    t5   = t2   - t5a;
-    t6   = t1   - t6a;
-    t7a  = t0   - t7;
-    t8a  = t8   + t11;
-    t9   = t9a  + t10a;
-    t10  = t9a  - t10a;
-    t11a = t8   - t11;
-    t12a = t15  - t12;
-    t13  = t14a - t13a;
-    t14  = t14a + t13a;
-    t15a = t15  + t12;
-    t16a = t16  + t19;
-    t17  = t17a + t18a;
-    t18  = t17a - t18a;
-    t19a = t16  - t19;
-    t20a = t23  - t20;
-    t21  = t22a - t21a;
-    t22  = t22a + t21a;
-    t23a = t23  + t20;
-    t24a = t24  + t27;
-    t25  = t25a + t26a;
-    t26  = t25a - t26a;
-    t27a = t24  - t27;
-    t28a = t31  - t28;
-    t29  = t30a - t29a;
-    t30  = t30a + t29a;
-    t31a = t31  + t28;
-
-    t10a = ((t13           - t10)  * 11585  + (1 << 13)) >> 14;
-    t13a = ((t13           + t10)  * 11585  + (1 << 13)) >> 14;
-    t11  = ((t12a          - t11a) * 11585  + (1 << 13)) >> 14;
-    t12  = ((t12a          + t11a) * 11585  + (1 << 13)) >> 14;
-    t18a =   (t29  *  6270 - t18   * 15137  + (1 << 13)) >> 14;
-    t29a =   (t29  * 15137 + t18   *  6270  + (1 << 13)) >> 14;
-    t19  =   (t28a *  6270 - t19a  * 15137  + (1 << 13)) >> 14;
-    t28  =   (t28a * 15137 + t19a  *  6270  + (1 << 13)) >> 14;
-    t20  = (-(t27a * 15137 + t20a  *  6270) + (1 << 13)) >> 14;
-    t27  =   (t27a *  6270 - t20a  * 15137  + (1 << 13)) >> 14;
-    t21a = (-(t26  * 15137 + t21   *  6270) + (1 << 13)) >> 14;
-    t26a =   (t26  *  6270 - t21   * 15137  + (1 << 13)) >> 14;
-
-    t0   = t0a  + t15a;
-    t1   = t1a  + t14;
-    t2   = t2a  + t13a;
-    t3   = t3a  + t12;
-    t4   = t4a  + t11;
-    t5a  = t5   + t10a;
-    t6a  = t6   + t9;
-    t7   = t7a  + t8a;
-    t8   = t7a  - t8a;
-    t9a  = t6   - t9;
-    t10  = t5   - t10a;
-    t11a = t4a  - t11;
-    t12a = t3a  - t12;
-    t13  = t2a  - t13a;
-    t14a = t1a  - t14;
-    t15  = t0a  - t15a;
-    t16  = t16a + t23a;
-    t17a = t17  + t22;
-    t18  = t18a + t21a;
-    t19a = t19  + t20;
-    t20a = t19  - t20;
-    t21  = t18a - t21a;
-    t22a = t17  - t22;
-    t23  = t16a - t23a;
-    t24  = t31a - t24a;
-    t25a = t30  - t25;
-    t26  = t29a - t26a;
-    t27a = t28  - t27;
-    t28a = t28  + t27;
-    t29  = t29a + t26a;
-    t30a = t30  + t25;
-    t31  = t31a + t24a;
-
-    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
-    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
-    t21a = ((t26  - t21)  * 11585 + (1 << 13)) >> 14;
-    t26a = ((t26  + t21)  * 11585 + (1 << 13)) >> 14;
-    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
-    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
-    t23a = ((t24  - t23)  * 11585 + (1 << 13)) >> 14;
-    t24a = ((t24  + t23)  * 11585 + (1 << 13)) >> 14;
-
-    out[0]  = t0   + t31;
-    out[1]  = t1   + t30a;
-    out[2]  = t2   + t29;
-    out[3]  = t3   + t28a;
-    out[4]  = t4   + t27;
-    out[5]  = t5a  + t26a;
-    out[6]  = t6a  + t25;
-    out[7]  = t7   + t24a;
-    out[8]  = t8   + t23a;
-    out[9]  = t9a  + t22;
-    out[10] = t10  + t21a;
-    out[11] = t11a + t20;
-    out[12] = t12a + t19a;
-    out[13] = t13  + t18;
-    out[14] = t14a + t17a;
-    out[15] = t15  + t16;
-    out[16] = t15  - t16;
-    out[17] = t14a - t17a;
-    out[18] = t13  - t18;
-    out[19] = t12a - t19a;
-    out[20] = t11a - t20;
-    out[21] = t10  - t21a;
-    out[22] = t9a  - t22;
-    out[23] = t8   - t23a;
-    out[24] = t7   - t24a;
-    out[25] = t6a  - t25;
-    out[26] = t5a  - t26a;
-    out[27] = t4   - t27;
-    out[28] = t3   - t28a;
-    out[29] = t2   - t29;
-    out[30] = t1   - t30a;
-    out[31] = t0   - t31;
-}
-
-itxfm_wrapper(idct, idct, 32, 6, 1)
-
-static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4;
-
-    if (pass == 0) {
-        t0 = IN(0) >> 2;
-        t1 = IN(3) >> 2;
-        t2 = IN(1) >> 2;
-        t3 = IN(2) >> 2;
-    } else {
-        t0 = IN(0);
-        t1 = IN(3);
-        t2 = IN(1);
-        t3 = IN(2);
-    }
-
-    t0 += t2;
-    t3 -= t1;
-    t4 = (t0 - t3) >> 1;
-    t1 = t4 - t1;
-    t2 = t4 - t2;
-    t0 -= t1;
-    t3 += t2;
-
-    out[0] = t0;
-    out[1] = t1;
-    out[2] = t2;
-    out[3] = t3;
-}
-
-itxfm_wrapper(iwht, iwht, 4, 0, 0)
-
-#undef IN
-#undef itxfm_wrapper
-#undef itxfm_wrap
-
-static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
-{
-#define init_itxfm(tx, sz)                                        \
-    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_   ## sz ## _add_c; \
-    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_  ## sz ## _add_c; \
-    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_  ## sz ## _add_c; \
-    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_ ## sz ## _add_c
-
-#define init_idct(tx, nm)                               \
-    dsp->itxfm_add[tx][DCT_DCT]   =                     \
-    dsp->itxfm_add[tx][ADST_DCT]  =                     \
-    dsp->itxfm_add[tx][DCT_ADST]  =                     \
-    dsp->itxfm_add[tx][ADST_ADST] = nm ## _add_c
-
-    init_itxfm(TX_4X4, 4x4);
-    init_itxfm(TX_8X8, 8x8);
-    init_itxfm(TX_16X16, 16x16);
-    init_idct(TX_32X32, idct_idct_32x32);
-    init_idct(4 /* lossless */, iwht_iwht_4x4);
-
-#undef init_itxfm
-#undef init_idct
-}
-
-static av_always_inline void loop_filter(uint8_t *dst, ptrdiff_t stride,
-                                         int E, int I, int H,
-                                         ptrdiff_t stridea, ptrdiff_t strideb,
-                                         int wd)
-{
-    int i;
-
-    for (i = 0; i < 8; i++, dst += stridea) {
-        int p7, p6, p5, p4;
-        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
-        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
-        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
-        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
-        int q4, q5, q6, q7;
-        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
-                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
-                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
-                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
-        int flat8out, flat8in;
-
-        if (!fm)
-            continue;
-
-        if (wd >= 16) {
-            p7 = dst[strideb * -8];
-            p6 = dst[strideb * -7];
-            p5 = dst[strideb * -6];
-            p4 = dst[strideb * -5];
-            q4 = dst[strideb * +4];
-            q5 = dst[strideb * +5];
-            q6 = dst[strideb * +6];
-            q7 = dst[strideb * +7];
-
-            flat8out = FFABS(p7 - p0) <= 1 && FFABS(p6 - p0) <= 1 &&
-                       FFABS(p5 - p0) <= 1 && FFABS(p4 - p0) <= 1 &&
-                       FFABS(q4 - q0) <= 1 && FFABS(q5 - q0) <= 1 &&
-                       FFABS(q6 - q0) <= 1 && FFABS(q7 - q0) <= 1;
-        }
-
-        if (wd >= 8)
-            flat8in = FFABS(p3 - p0) <= 1 && FFABS(p2 - p0) <= 1 &&
-                      FFABS(p1 - p0) <= 1 && FFABS(q1 - q0) <= 1 &&
-                      FFABS(q2 - q0) <= 1 && FFABS(q3 - q0) <= 1;
-
-        if (wd >= 16 && flat8out && flat8in) {
-            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
-                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
-                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
-                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
-                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
-                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
-                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
-                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
-            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
-                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
-            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
-                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
-                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
-                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
-                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
-                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-        } else if (wd >= 8 && flat8in) {
-            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
-            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
-            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
-            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
-            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
-            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
-        } else {
-            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
-
-            if (hev) {
-                int f = av_clip_int8(3 * (q0 - p0) + av_clip_int8(p1 - q1));
-                int f1 = FFMIN(f + 4, 127) >> 3;
-                int f2 = FFMIN(f + 3, 127) >> 3;
-
-                dst[strideb * -1] = av_clip_uint8(p0 + f2);
-                dst[strideb * +0] = av_clip_uint8(q0 - f1);
-            } else {
-                int f = av_clip_int8(3 * (q0 - p0));
-                int f1 = FFMIN(f + 4, 127) >> 3;
-                int f2 = FFMIN(f + 3, 127) >> 3;
-
-                dst[strideb * -1] = av_clip_uint8(p0 + f2);
-                dst[strideb * +0] = av_clip_uint8(q0 - f1);
-
-                f = (f1 + 1) >> 1;
-                dst[strideb * -2] = av_clip_uint8(p1 + f);
-                dst[strideb * +1] = av_clip_uint8(q1 - f);
-            }
-        }
-    }
-}
-
-#define lf_8_fn(dir, wd, stridea, strideb)                                  \
-static void loop_filter_ ## dir ## _ ## wd  ## _8_c(uint8_t *dst,           \
-                                                    ptrdiff_t stride,       \
-                                                    int E, int I, int H)    \
-{                                                                           \
-    loop_filter(dst, stride, E, I, H, stridea, strideb, wd);                \
-}
-
-#define lf_8_fns(wd)          \
-    lf_8_fn(h, wd, stride, 1) \
-    lf_8_fn(v, wd, 1, stride)
-
-lf_8_fns(4)
-lf_8_fns(8)
-lf_8_fns(16)
-
-#undef lf_8_fn
-#undef lf_8_fns
-
-#define lf_16_fn(dir, stridea)                                          \
-static void loop_filter_ ## dir ## _16_16_c(uint8_t *dst,               \
-                                            ptrdiff_t stride,           \
-                                            int E, int I, int H)        \
-{                                                                       \
-    loop_filter_ ## dir ## _16_8_c(dst, stride, E, I, H);               \
-    loop_filter_ ## dir ## _16_8_c(dst + 8 * stridea, stride, E, I, H); \
-}
-
-lf_16_fn(h, stride)
-lf_16_fn(v, 1)
-
-#undef lf_16_fn
-
-#define lf_mix_fn(dir, wd1, wd2, stridea)                                     \
-static void loop_filter_ ## dir ## _ ## wd1 ## wd2 ## _16_c(uint8_t *dst,     \
-                                                            ptrdiff_t stride, \
-                                                            int E, int I,     \
-                                                            int H)            \
-{                                                                             \
-    loop_filter_ ## dir ## _ ## wd1 ## _8_c(dst, stride, E & 0xff,            \
-                                            I & 0xff, H & 0xff);              \
-    loop_filter_ ## dir ## _ ## wd2 ## _8_c(dst + 8 * stridea, stride,        \
-                                            E >> 8, I >> 8, H >> 8);          \
-}
-
-#define lf_mix_fns(wd1, wd2)       \
-    lf_mix_fn(h, wd1, wd2, stride) \
-    lf_mix_fn(v, wd1, wd2, 1)
-
-lf_mix_fns(4, 4)
-lf_mix_fns(4, 8)
-lf_mix_fns(8, 4)
-lf_mix_fns(8, 8)
-
-#undef lf_mix_fn
-#undef lf_mix_fns
-
-static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
-{
-    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
-    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
-    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
-    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
-    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
-    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
-
-    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
-    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
-
-    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
-    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
-    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
-    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
-    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
-    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
-    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
-    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
-}
-
-static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
-                                    const uint8_t *src, ptrdiff_t src_stride,
-                                    int w, int h)
-{
-    do {
-        memcpy(dst, src, w);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-static av_always_inline void avg_c(uint8_t *dst, ptrdiff_t dst_stride,
-                                   const uint8_t *src, ptrdiff_t src_stride,
-                                   int w, int h)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x += 4)
-            AV_WN32A(&dst[x], rnd_avg32(AV_RN32A(&dst[x]), AV_RN32(&src[x])));
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define fpel_fn(type, sz)                                              \
-static void type ## sz ## _c(uint8_t *dst, ptrdiff_t dst_stride,       \
-                             const uint8_t *src, ptrdiff_t src_stride, \
-                             int h, int mx, int my)                    \
-{                                                                      \
-    type ## _c(dst, dst_stride, src, src_stride, sz, h);               \
-}
-
-#define copy_avg_fn(sz) \
-    fpel_fn(copy, sz)   \
-    fpel_fn(avg, sz)
-
-copy_avg_fn(64)
-copy_avg_fn(32)
-copy_avg_fn(16)
-copy_avg_fn(8)
-copy_avg_fn(4)
-
-#undef fpel_fn
-#undef copy_avg_fn
-
-const DECLARE_ALIGNED(8, int8_t, ff_vp9_subpel_filters)[3][15][8] = {
+const DECLARE_ALIGNED(16, int16_t, ff_vp9_subpel_filters)[3][16][8] = {
     [FILTER_8TAP_REGULAR] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
         {  0,  1,  -5, 126,   8,  -3,  1,  0 },
         { -1,  3, -10, 122,  18,  -6,  2,  0 },
         { -1,  4, -13, 118,  27,  -9,  3, -1 },
@@ -1820,6 +44,7 @@ const DECLARE_ALIGNED(8, int8_t, ff_vp9_subpel_filters)[3][15][8] = {
         {  0,  2,  -6,  18, 122, -10,  3, -1 },
         {  0,  1,  -3,   8, 126,  -5,  1,  0 },
     }, [FILTER_8TAP_SHARP] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
         { -1,  3,  -7, 127,   8,  -3,  1,  0 },
         { -2,  5, -13, 125,  17,  -6,  3, -1 },
         { -3,  7, -17, 121,  27, -10,  5, -2 },
@@ -1836,6 +61,7 @@ const DECLARE_ALIGNED(8, int8_t, ff_vp9_subpel_filters)[3][15][8] = {
         { -1,  3,  -6,  17, 125, -13,  5, -2 },
         {  0,  1,  -3,   8, 127,  -7,  3, -1 },
     }, [FILTER_8TAP_SMOOTH] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
         { -3, -1,  32,  64,  38,   1, -3,  0 },
         { -2, -2,  29,  63,  41,   2, -3,  0 },
         { -2, -2,  26,  63,  43,   4, -4,  0 },
@@ -1854,336 +80,20 @@ const DECLARE_ALIGNED(8, int8_t, ff_vp9_subpel_filters)[3][15][8] = {
     }
 };
 
-#define FILTER_8TAP(src, x, F, stride)              \
-    av_clip_uint8((F[0] * src[x + -3 * stride] +    \
-                   F[1] * src[x + -2 * stride] +    \
-                   F[2] * src[x + -1 * stride] +    \
-                   F[3] * src[x + +0 * stride] +    \
-                   F[4] * src[x + +1 * stride] +    \
-                   F[5] * src[x + +2 * stride] +    \
-                   F[6] * src[x + +3 * stride] +    \
-                   F[7] * src[x + +4 * stride] + 64) >> 7)
-
-static av_always_inline void do_8tap_1d_c(uint8_t *dst, ptrdiff_t dst_stride,
-                                          const uint8_t *src, ptrdiff_t src_stride,
-                                          int w, int h, ptrdiff_t ds,
-                                          const int8_t *filter, int avg)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
-            else
-                dst[x] = FILTER_8TAP(src, x, filter, ds);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define filter_8tap_1d_fn(opn, opa, dir, ds)                                \
-static av_noinline void opn ## _8tap_1d_ ## dir ## _c(uint8_t *dst,         \
-                                                      ptrdiff_t dst_stride, \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t src_stride, \
-                                                      int w, int h,         \
-                                                      const int8_t *filter) \
-{                                                                           \
-    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa);  \
-}
-
-filter_8tap_1d_fn(put, 0, v, src_stride)
-filter_8tap_1d_fn(put, 0, h, 1)
-filter_8tap_1d_fn(avg, 1, v, src_stride)
-filter_8tap_1d_fn(avg, 1, h, 1)
-
-#undef filter_8tap_1d_fn
 
-static av_always_inline void do_8tap_2d_c(uint8_t *dst, ptrdiff_t dst_stride,
-                                          const uint8_t *src, ptrdiff_t src_stride,
-                                          int w, int h, const int8_t *filterx,
-                                          const int8_t *filtery, int avg)
+av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
 {
-    int tmp_h = h + 7;
-    uint8_t tmp[64 * 71], *tmp_ptr = tmp;
-
-    src -= src_stride * 3;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
-
-        tmp_ptr += 64;
-        src     += src_stride;
-    } while (--tmp_h);
-
-    tmp_ptr = tmp + 64 * 3;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
-            else
-                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
-
-        tmp_ptr += 64;
-        dst += dst_stride;
-    } while (--h);
-}
-
-#define filter_8tap_2d_fn(opn, opa)                                     \
-static av_noinline void opn ## _8tap_2d_hv_c(uint8_t *dst,              \
-                                             ptrdiff_t dst_stride,      \
-                                             const uint8_t *src,        \
-                                             ptrdiff_t src_stride,      \
-                                             int w, int h,              \
-                                             const int8_t *filterx,     \
-                                             const int8_t *filtery)     \
-{                                                                       \
-    do_8tap_2d_c(dst, dst_stride, src, src_stride,                      \
-                 w, h, filterx, filtery, opa);                          \
-}
-
-filter_8tap_2d_fn(put, 0)
-filter_8tap_2d_fn(avg, 1)
-
-#undef filter_8tap_2d_fn
-
-#undef FILTER_8TAP
-
-#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg)                   \
-static void                                                                 \
-avg ## _8tap_ ## type ## _ ## sz ## dir ## _c(uint8_t *dst,                 \
-                                              ptrdiff_t dst_stride,         \
-                                              const uint8_t *src,           \
-                                              ptrdiff_t src_stride,         \
-                                              int h, int mx, int my)        \
-{                                                                           \
-    avg ## _8tap_1d_ ## dir ## _c(dst, dst_stride, src, src_stride, sz, h,  \
-                                  ff_vp9_subpel_filters[type_idx][dir_m - 1]); \
-}
-
-#define filter_fn_2d(sz, type, type_idx, avg)                               \
-static void avg ## _8tap_ ## type ## _ ## sz ## hv_c(uint8_t *dst,          \
-                                                     ptrdiff_t dst_stride,  \
-                                                     const uint8_t *src,    \
-                                                     ptrdiff_t src_stride,  \
-                                                     int h, int mx, int my) \
-{                                                                           \
-    avg ## _8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h,           \
-                         ff_vp9_subpel_filters[type_idx][mx - 1],           \
-                         ff_vp9_subpel_filters[type_idx][my - 1]);          \
-}
-
-#define FILTER_BILIN(src, x, mxy, stride)                       \
-    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
-
-static av_always_inline void do_bilin_1d_c(uint8_t *dst,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *src,
-                                           ptrdiff_t src_stride,
-                                           int w, int h, ptrdiff_t ds,
-                                           int mxy, int avg)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
-            else
-                dst[x] = FILTER_BILIN(src, x, mxy, ds);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define bilin_1d_fn(opn, opa, dir, ds)                                        \
-static av_noinline void opn ## _bilin_1d_ ## dir ## _c(uint8_t *dst,          \
-                                                       ptrdiff_t dst_stride,  \
-                                                       const uint8_t *src,    \
-                                                       ptrdiff_t src_stride,  \
-                                                       int w, int h, int mxy) \
-{                                                                             \
-    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa);      \
-}
-
-bilin_1d_fn(put, 0, v, src_stride)
-bilin_1d_fn(put, 0, h, 1)
-bilin_1d_fn(avg, 1, v, src_stride)
-bilin_1d_fn(avg, 1, h, 1)
-
-#undef bilin_1d_fn
-
-static av_always_inline void do_bilin_2d_c(uint8_t *dst,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *src,
-                                           ptrdiff_t src_stride,
-                                           int w, int h, int mx, int my,
-                                           int avg)
-{
-    uint8_t tmp[64 * 65], *tmp_ptr = tmp;
-    int tmp_h = h + 1;
-
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-        tmp_ptr += 64;
-        src     += src_stride;
-    } while (--tmp_h);
-
-    tmp_ptr = tmp;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
-            else
-                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
-
-        tmp_ptr += 64;
-        dst += dst_stride;
-    } while (--h);
-}
-
-#define bilin_2d_fn(opn, opa)                                           \
-static av_noinline void opn ## _bilin_2d_hv_c(uint8_t *dst,             \
-                                              ptrdiff_t dst_stride,     \
-                                              const uint8_t *src,       \
-                                              ptrdiff_t src_stride,     \
-                                              int w, int h,             \
-                                              int mx, int my)           \
-{                                                                       \
-    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
-}
-
-bilin_2d_fn(put, 0)
-bilin_2d_fn(avg, 1)
-
-#undef bilin_2d_fn
-
-#undef FILTER_BILIN
-
-#define bilinf_fn_1d(sz, dir, dir_m, avg)                               \
-static void avg ## _bilin_ ## sz ## dir ## _c(uint8_t *dst,             \
-                                              ptrdiff_t dst_stride,     \
-                                              const uint8_t *src,       \
-                                              ptrdiff_t src_stride,     \
-                                              int h, int mx, int my)    \
-{                                                                       \
-    avg ## _bilin_1d_ ## dir ## _c(dst, dst_stride, src, src_stride,    \
-                                   sz, h, dir_m);                       \
-}
-
-#define bilinf_fn_2d(sz, avg)                                        \
-static void avg ## _bilin_ ## sz ## hv_c(uint8_t *dst,               \
-                                         ptrdiff_t dst_stride,       \
-                                         const uint8_t *src,         \
-                                         ptrdiff_t src_stride,       \
-                                         int h, int mx, int my)      \
-{                                                                    \
-    avg ## _bilin_2d_hv_c(dst, dst_stride, src, src_stride,          \
-                          sz, h, mx, my);                            \
-}
-
-#define filter_fn(sz, avg)                                     \
-    filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
-    filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
-    filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg)        \
-    filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg)   \
-    filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg)   \
-    filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg)          \
-    filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg)     \
-    filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg)     \
-    filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg)            \
-    bilinf_fn_1d(sz, h, mx, avg)                               \
-    bilinf_fn_1d(sz, v, my, avg)                               \
-    bilinf_fn_2d(sz, avg)
-
-#define filter_fn_set(avg) \
-    filter_fn(64, avg)     \
-    filter_fn(32, avg)     \
-    filter_fn(16, avg)     \
-    filter_fn(8, avg)      \
-    filter_fn(4, avg)
-
-filter_fn_set(put)
-filter_fn_set(avg)
-
-#undef filter_fn
-#undef filter_fn_set
-#undef filter_fn_1d
-#undef filter_fn_2d
-#undef bilinf_fn_1d
-#undef bilinf_fn_2d
-
-static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp)
-{
-#define init_fpel(idx1, idx2, sz, type)                                \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][0][0]  = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][0][0]   = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_BILINEAR][idx2][0][0]     = type ## sz ## _c
-
-#define init_copy_avg(idx, sz)          \
-    init_fpel(idx, 0, sz, copy);        \
-    init_fpel(idx, 1, sz, avg)
-
-    init_copy_avg(0, 64);
-    init_copy_avg(1, 32);
-    init_copy_avg(2, 16);
-    init_copy_avg(3,  8);
-    init_copy_avg(4,  4);
-
-#undef init_copy_avg
-#undef init_fpel
-
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)             \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_BILINEAR][idx2][idxh][idxv]     = type ## _bilin_        ## sz ## dir ## _c
-
-#define init_subpel2(idx, idxh, idxv, dir, type)     \
-    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
-    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
-    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
-    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
-    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
-
-#define init_subpel3(idx, type)         \
-    init_subpel2(idx, 1, 1, hv, type);  \
-    init_subpel2(idx, 0, 1, v, type);   \
-    init_subpel2(idx, 1, 0, h, type)
-
-    init_subpel3(0, put);
-    init_subpel3(1, avg);
-
-#undef init_subpel1
-#undef init_subpel2
-#undef init_subpel3
-}
-
-av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
-{
-    vp9dsp_intrapred_init(dsp);
-    vp9dsp_itxfm_init(dsp);
-    vp9dsp_loopfilter_init(dsp);
-    vp9dsp_mc_init(dsp);
+    if (bpp == 8) {
+        ff_vp9dsp_init_8(dsp);
+    } else if (bpp == 10) {
+        ff_vp9dsp_init_10(dsp);
+    } else {
+        av_assert0(bpp == 12);
+        ff_vp9dsp_init_12(dsp);
+    }
 
-    if (ARCH_AARCH64)
-        ff_vp9dsp_init_aarch64(dsp);
-    if (ARCH_ARM)
-        ff_vp9dsp_init_arm(dsp);
-    if (ARCH_X86)
-        ff_vp9dsp_init_x86(dsp);
+    if (ARCH_AARCH64) ff_vp9dsp_init_aarch64(dsp, bpp);
+    if (ARCH_ARM) ff_vp9dsp_init_arm(dsp, bpp);
+    if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp, bitexact);
+    if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp);
 }
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
new file mode 100644
index 0000000..e225631
--- /dev/null
+++ b/libavcodec/vp9dsp.h
@@ -0,0 +1,136 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9DSP_H
+#define AVCODEC_VP9DSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavcodec/vp9.h"
+
+typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                            const uint8_t *ref, ptrdiff_t ref_stride,
+                            int h, int mx, int my);
+typedef void (*vp9_scaled_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                                   const uint8_t *ref, ptrdiff_t ref_stride,
+                                   int h, int mx, int my, int dx, int dy);
+
+typedef struct VP9DSPContext {
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
+     * dimension 2: intra prediction modes
+     *
+     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * top[-1] is top/left; top[4,7] is top-right for 4x4
+     */
+    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
+    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
+    // also needs to fit in with what H.264/VP8/etc do
+    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
+                                                         ptrdiff_t stride,
+                                                         const uint8_t *left,
+                                                         const uint8_t *top);
+
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
+     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
+     *
+     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * block is 16-byte aligned
+     * eob indicates the position (+1) of the last non-zero coefficient,
+     * in scan-order. This can be used to write faster versions, e.g. a
+     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
+     * etc.
+     */
+    // FIXME also write idct_add_block() versions for whole (inter) pred
+    // blocks, so we can do 2 4x4s at once
+    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
+                                                      ptrdiff_t stride,
+                                                      int16_t *block, int eob);
+
+    /*
+     * dimension 1: width of filter (0=4, 1=8, 2=16)
+     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by 8
+     */
+    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
+                                int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * The width of filter is assumed to be 16; dst/stride are aligned by 16
+     */
+    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
+                              int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
+     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by operation size
+     * this basically calls loop_filter[d1][d3][0](), followed by
+     * loop_filter[d2][d3][0]() on the next 8 pixels
+     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
+     * integer.
+     */
+    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
+    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
+                                      int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
+     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
+     * dimension 3: averaging type (0: put, 1: avg)
+     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
+     * dimension 5: y subpel interpolation (0: none, 1: 8tap/bilin)
+     *
+     * dst/stride are aligned by hsize
+     */
+    vp9_mc_func mc[5][N_FILTERS][2][2][2];
+
+    /*
+     * for scalable MC, first 3 dimensions identical to above, the other two
+     * don't exist since it changes per stepsize.
+     */
+    vp9_scaled_mc_func smc[5][N_FILTERS][2];
+} VP9DSPContext;
+
+extern const int16_t ff_vp9_subpel_filters[3][16][8];
+
+void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact);
+
+void ff_vp9dsp_init_8(VP9DSPContext *dsp);
+void ff_vp9dsp_init_10(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12(VP9DSPContext *dsp);
+
+void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp);
+void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp);
+void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact);
+void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp);
+
+#endif /* AVCODEC_VP9DSP_H */
diff --git a/libavcodec/vp9dsp_10bpp.c b/libavcodec/vp9dsp_10bpp.c
new file mode 100644
index 0000000..62ce182
--- /dev/null
+++ b/libavcodec/vp9dsp_10bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 10
+#define dctint int64_t
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_12bpp.c b/libavcodec/vp9dsp_12bpp.c
new file mode 100644
index 0000000..2f36471
--- /dev/null
+++ b/libavcodec/vp9dsp_12bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 12
+#define dctint int64_t
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_8bpp.c b/libavcodec/vp9dsp_8bpp.c
new file mode 100644
index 0000000..4b219b0
--- /dev/null
+++ b/libavcodec/vp9dsp_8bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 8
+#define dctint int
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_template.c b/libavcodec/vp9dsp_template.c
new file mode 100644
index 0000000..bb54561
--- /dev/null
+++ b/libavcodec/vp9dsp_template.c
@@ -0,0 +1,2546 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "bit_depth_template.c"
+#include "vp9dsp.h"
+
+#if BIT_DEPTH != 12
+
+// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
+// back with h264pred.[ch]
+
+static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4 = AV_RN4PA(top);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, p4);
+    AV_WN4PA(dst + stride * 1, p4);
+    AV_WN4PA(dst + stride * 2, p4);
+    AV_WN4PA(dst + stride * 3, p4);
+}
+
+static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top + 0);
+    pixel4 p4b = AV_RN4PA(top + 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, p4a);
+        AV_WN4PA(dst + 4, p4b);
+        dst += stride;
+    }
+}
+
+static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top +  0);
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        dst += stride;
+    }
+}
+
+static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top +  0);
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    pixel4 p4e = AV_RN4PA(top + 16);
+    pixel4 p4f = AV_RN4PA(top + 20);
+    pixel4 p4g = AV_RN4PA(top + 24);
+    pixel4 p4h = AV_RN4PA(top + 28);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        AV_WN4PA(dst + 16, p4e);
+        AV_WN4PA(dst + 20, p4f);
+        AV_WN4PA(dst + 24, p4g);
+        AV_WN4PA(dst + 28, p4h);
+        dst += stride;
+    }
+}
+
+static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
+    AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
+    AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
+    AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
+}
+
+static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
+
+        AV_WN4PA(dst + 0, p4);
+        AV_WN4PA(dst + 4, p4);
+        dst += stride;
+    }
+}
+
+static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        dst += stride;
+    }
+}
+
+static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        AV_WN4PA(dst + 16, p4);
+        AV_WN4PA(dst + 20, p4);
+        AV_WN4PA(dst + 24, p4);
+        AV_WN4PA(dst + 28, p4);
+        dst += stride;
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++) {
+        int l_m_tl = left[3 - y] - tl;
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        int l_m_tl = left[7 - y] - tl;
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst[4] = av_clip_pixel(top[4] + l_m_tl);
+        dst[5] = av_clip_pixel(top[5] + l_m_tl);
+        dst[6] = av_clip_pixel(top[6] + l_m_tl);
+        dst[7] = av_clip_pixel(top[7] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        int l_m_tl = left[15 - y] - tl;
+
+        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
+        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
+        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
+        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
+        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
+        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
+        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
+        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
+        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
+        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
+        dst[10] = av_clip_pixel(top[10] + l_m_tl);
+        dst[11] = av_clip_pixel(top[11] + l_m_tl);
+        dst[12] = av_clip_pixel(top[12] + l_m_tl);
+        dst[13] = av_clip_pixel(top[13] + l_m_tl);
+        dst[14] = av_clip_pixel(top[14] + l_m_tl);
+        dst[15] = av_clip_pixel(top[15] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        int l_m_tl = left[31 - y] - tl;
+
+        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
+        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
+        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
+        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
+        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
+        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
+        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
+        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
+        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
+        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
+        dst[10] = av_clip_pixel(top[10] + l_m_tl);
+        dst[11] = av_clip_pixel(top[11] + l_m_tl);
+        dst[12] = av_clip_pixel(top[12] + l_m_tl);
+        dst[13] = av_clip_pixel(top[13] + l_m_tl);
+        dst[14] = av_clip_pixel(top[14] + l_m_tl);
+        dst[15] = av_clip_pixel(top[15] + l_m_tl);
+        dst[16] = av_clip_pixel(top[16] + l_m_tl);
+        dst[17] = av_clip_pixel(top[17] + l_m_tl);
+        dst[18] = av_clip_pixel(top[18] + l_m_tl);
+        dst[19] = av_clip_pixel(top[19] + l_m_tl);
+        dst[20] = av_clip_pixel(top[20] + l_m_tl);
+        dst[21] = av_clip_pixel(top[21] + l_m_tl);
+        dst[22] = av_clip_pixel(top[22] + l_m_tl);
+        dst[23] = av_clip_pixel(top[23] + l_m_tl);
+        dst[24] = av_clip_pixel(top[24] + l_m_tl);
+        dst[25] = av_clip_pixel(top[25] + l_m_tl);
+        dst[26] = av_clip_pixel(top[26] + l_m_tl);
+        dst[27] = av_clip_pixel(top[27] + l_m_tl);
+        dst[28] = av_clip_pixel(top[28] + l_m_tl);
+        dst[29] = av_clip_pixel(top[29] + l_m_tl);
+        dst[30] = av_clip_pixel(top[30] + l_m_tl);
+        dst[31] = av_clip_pixel(top[31] + l_m_tl);
+        dst += stride;
+    }
+}
+
+#if BIT_DEPTH != 12
+
+static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] +
+                                top[0] + top[1] + top[2] + top[3] + 4) >> 3);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, dc);
+    AV_WN4PA(dst + stride * 1, dc);
+    AV_WN4PA(dst + stride * 2, dc);
+    AV_WN4PA(dst + stride * 3, dc);
+}
+
+static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
+          left[6] + left[7] + top[0] + top[1] + top[2] + top[3] +
+          top[4] + top[5] + top[6] + top[7] + 8) >> 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, dc);
+        AV_WN4PA(dst + 4, dc);
+        dst += stride;
+    }
+}
+
+static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
+          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
+          left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] +
+          top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] +
+          top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        dst += stride;
+    }
+}
+
+static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
+          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
+          left[13] + left[14] + left[15] + left[16] + left[17] + left[18] +
+          left[19] + left[20] + left[21] + left[22] + left[23] + left[24] +
+          left[25] + left[26] + left[27] + left[28] + left[29] + left[30] +
+          left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
+          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] +
+          top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] +
+          top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] +
+          top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        AV_WN4PA(dst + 16, dc);
+        AV_WN4PA(dst + 20, dc);
+        AV_WN4PA(dst + 24, dc);
+        AV_WN4PA(dst + 28, dc);
+        dst += stride;
+    }
+}
+
+static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, dc);
+    AV_WN4PA(dst + stride * 1, dc);
+    AV_WN4PA(dst + stride * 2, dc);
+    AV_WN4PA(dst + stride * 3, dc);
+}
+
+static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] +
+          left[4] + left[5] + left[6] + left[7] + 4) >> 3);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, dc);
+        AV_WN4PA(dst + 4, dc);
+        dst += stride;
+    }
+}
+
+static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
+          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
+          left[12] + left[13] + left[14] + left[15] + 8) >> 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        dst += stride;
+    }
+}
+
+static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
+          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
+          left[12] + left[13] + left[14] + left[15] + left[16] + left[17] +
+          left[18] + left[19] + left[20] + left[21] + left[22] + left[23] +
+          left[24] + left[25] + left[26] + left[27] + left[28] + left[29] +
+          left[30] + left[31] + 16) >> 5);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        AV_WN4PA(dst + 16, dc);
+        AV_WN4PA(dst + 20, dc);
+        AV_WN4PA(dst + 24, dc);
+        AV_WN4PA(dst + 28, dc);
+        dst += stride;
+    }
+}
+
+static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, dc);
+    AV_WN4PA(dst + stride * 1, dc);
+    AV_WN4PA(dst + stride * 2, dc);
+    AV_WN4PA(dst + stride * 3, dc);
+}
+
+static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((top[0] + top[1] + top[2] + top[3] +
+          top[4] + top[5] + top[6] + top[7] + 4) >> 3);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, dc);
+        AV_WN4PA(dst + 4, dc);
+        dst += stride;
+    }
+}
+
+static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
+          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
+          top[12] + top[13] + top[14] + top[15] + 8) >> 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        dst += stride;
+    }
+}
+
+static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
+          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
+          top[12] + top[13] + top[14] + top[15] + top[16] + top[17] +
+          top[18] + top[19] + top[20] + top[21] + top[22] + top[23] +
+          top[24] + top[25] + top[26] + top[27] + top[28] + top[29] +
+          top[30] + top[31] + 16) >> 5);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        AV_WN4PA(dst + 16, dc);
+        AV_WN4PA(dst + 20, dc);
+        AV_WN4PA(dst + 24, dc);
+        AV_WN4PA(dst + 28, dc);
+        dst += stride;
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, val);
+    AV_WN4PA(dst + stride * 1, val);
+    AV_WN4PA(dst + stride * 2, val);
+    AV_WN4PA(dst + stride * 3, val);
+}
+
+static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, val);
+        AV_WN4PA(dst + 4, val);
+        dst += stride;
+    }
+}
+
+static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        dst += stride;
+    }
+}
+
+static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        AV_WN4PA(dst + 16, val);
+        AV_WN4PA(dst + 20, val);
+        AV_WN4PA(dst + 24, val);
+        AV_WN4PA(dst + 28, val);
+        dst += stride;
+    }
+}
+
+static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, val);
+    AV_WN4PA(dst + stride * 1, val);
+    AV_WN4PA(dst + stride * 2, val);
+    AV_WN4PA(dst + stride * 3, val);}
+
+static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, val);
+        AV_WN4PA(dst + 4, val);
+        dst += stride;
+    }
+}
+
+static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        dst += stride;
+    }
+}
+
+static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        AV_WN4PA(dst + 16, val);
+        AV_WN4PA(dst + 20, val);
+        AV_WN4PA(dst + 24, val);
+        AV_WN4PA(dst + 28, val);
+        dst += stride;
+    }
+}
+
+static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, val);
+    AV_WN4PA(dst + stride * 1, val);
+    AV_WN4PA(dst + stride * 2, val);
+    AV_WN4PA(dst + stride * 3, val);
+}
+
+static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, val);
+        AV_WN4PA(dst + 4, val);
+        dst += stride;
+    }
+}
+
+static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        dst += stride;
+    }
+}
+
+static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        AV_WN4PA(dst + 16, val);
+        AV_WN4PA(dst + 20, val);
+        AV_WN4PA(dst + 24, val);
+        AV_WN4PA(dst + 28, val);
+        dst += stride;
+    }
+}
+
+#if BIT_DEPTH != 12
+
+#if BIT_DEPTH == 8
+#define memset_bpc memset
+#else
+static inline void memset_bpc(uint16_t *dst, int val, int len) {
+    int n;
+    for (n = 0; n < len; n++) {
+        dst[n] = val;
+    }
+}
+#endif
+
+#define DST(x, y) dst[(x) + (y) * stride]
+
+static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
+
+    stride /= sizeof(pixel);
+    DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
+    DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
+    DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
+    DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
+    DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
+    DST(3,3) = a7;  // note: this is different from vp8 and such
+}
+
+#define def_diag_downleft(size) \
+static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                              const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel v[size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) \
+        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) { \
+        memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \
+    } \
+}
+
+def_diag_downleft(8)
+def_diag_downleft(16)
+def_diag_downleft(32)
+
+static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                 const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];
+
+    stride /= sizeof(pixel);
+    DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
+    DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
+    DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
+}
+
+#define def_diag_downright(size) \
+static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                               const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size + size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i           ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+        v[size + 1 + i] = (top[i]  + top[i + 1]  * 2 + top[i + 2]  + 2) >> 2; \
+    } \
+    v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
+    v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
+    v[size    ] = (top[-1] + top[0]  * 2 + top[ 1] + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \
+}
+
+def_diag_downright(8)
+def_diag_downright(16)
+def_diag_downright(32)
+
+static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                             const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        l0 = left[3], l1 = left[2], l2 = left[1];
+
+    stride /= sizeof(pixel);
+    DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
+    DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
+    DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
+    DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
+    DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(3,0) = (a2 + a3 + 1) >> 1;
+    DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
+}
+
+#define def_vert_right(size) \
+static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                           const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size/2 - 2; i++) { \
+        vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \
+        ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \
+    } \
+    vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
+    ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+\
+    ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \
+    vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
+    for (i = 0; i < size - 1; i++) { \
+        ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+\
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2     *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \
+        memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \
+    } \
+}
+
+def_vert_right(8)
+def_vert_right(16)
+def_vert_right(32)
+
+static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
+        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
+
+    stride /= sizeof(pixel);
+    DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
+    DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
+    DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
+    DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
+    DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,3) = (l2 + l3 + 1) >> 1;
+    DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
+}
+
+#define def_hor_down(size) \
+static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                         const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size * 3 - 2]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2       ] = (left[i + 1] + left[i + 0] + 1) >> 1; \
+        v[i*2    + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \
+        v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+    v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
+    v[size*2 - 1] = (top[0]  + top[-1] * 2 + left[size - 1] + 2) >> 2; \
+    v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \
+}
+
+def_hor_down(8)
+def_hor_down(16)
+def_hor_down(32)
+
+static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        a4 = top[4], a5 = top[5], a6 = top[6];
+
+    stride /= sizeof(pixel);
+    DST(0,0) = (a0 + a1 + 1) >> 1;
+    DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
+    DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
+    DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
+    DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
+    DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
+    DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
+    DST(3,2) = (a4 + a5 + 1) >> 1;
+    DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
+}
+
+#define def_vert_left(size) \
+static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                          const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel ve[size - 1], vo[size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    } \
+    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
+    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2      * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst +  j*2      * stride + size - j - 1, top[size - 1], j + 1); \
+        memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
+    } \
+}
+
+def_vert_left(8)
+def_vert_left(16)
+def_vert_left(32)
+
+static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+
+    stride /= sizeof(pixel);
+    DST(0,0) = (l0 + l1 + 1) >> 1;
+    DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
+    DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
+    DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
+    DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
+    DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
+}
+
+#define def_hor_up(size) \
+static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                       const uint8_t *_left, const uint8_t *top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size*2 - 2]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2    ] = (left[i] + left[i + 1] + 1) >> 1; \
+        v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+    } \
+    v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size / 2; j++) \
+        memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
+    for (j = size / 2; j < size; j++) { \
+        memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
+                   2 + j*2 - size); \
+    } \
+}
+
+def_hor_up(8)
+def_hor_up(16)
+def_hor_up(32)
+
+#undef DST
+
+#endif /* BIT_DEPTH != 12 */
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
+{
+#define init_intra_pred_bd_aware(tx, sz) \
+    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_##sz##_c; \
+    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_##sz##_c; \
+    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_##sz##_c; \
+    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_##sz##_c
+
+#if BIT_DEPTH == 12
+    ff_vp9dsp_intrapred_init_10(dsp);
+#define init_intra_pred(tx, sz) \
+    init_intra_pred_bd_aware(tx, sz)
+#else
+    #define init_intra_pred(tx, sz) \
+    dsp->intra_pred[tx][VERT_PRED]            = vert_##sz##_c; \
+    dsp->intra_pred[tx][HOR_PRED]             = hor_##sz##_c; \
+    dsp->intra_pred[tx][DC_PRED]              = dc_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
+    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_##sz##_c; \
+    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_##sz##_c; \
+    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_##sz##_c; \
+    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_##sz##_c; \
+    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_##sz##_c; \
+    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_##sz##_c; \
+    init_intra_pred_bd_aware(tx, sz)
+#endif
+
+    init_intra_pred(TX_4X4,   4x4);
+    init_intra_pred(TX_8X8,   8x8);
+    init_intra_pred(TX_16X16, 16x16);
+    init_intra_pred(TX_32X32, 32x32);
+
+#undef init_intra_pred
+#undef init_intra_pred_bd_aware
+}
+
+#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
+static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
+                                                    ptrdiff_t stride, \
+                                                    int16_t *_block, int eob) \
+{ \
+    int i, j; \
+    pixel *dst = (pixel *) _dst; \
+    dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
+\
+    stride /= sizeof(pixel); \
+    if (has_dconly && eob == 1) { \
+        const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
+                                            * 11585 + (1 << 13)) >> 14; \
+        block[0] = 0; \
+        for (i = 0; i < sz; i++) { \
+            for (j = 0; j < sz; j++) \
+                dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                                (bits ? \
+                                                 (t + (1 << (bits - 1))) >> bits : \
+                                                 t)); \
+            dst++; \
+        } \
+        return; \
+    } \
+\
+    for (i = 0; i < sz; i++) \
+        type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \
+    memset(block, 0, sz * sz * sizeof(*block)); \
+    for (i = 0; i < sz; i++) { \
+        type_b##sz##_1d(tmp + i, sz, out, 1); \
+        for (j = 0; j < sz; j++) \
+            dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                            (bits ? \
+                                             (out[j] + (1 << (bits - 1))) >> bits : \
+                                             out[j])); \
+        dst++; \
+    } \
+}
+
+#define itxfm_wrap(sz, bits) \
+itxfm_wrapper(idct,  idct,  sz, bits, 1) \
+itxfm_wrapper(iadst, idct,  sz, bits, 0) \
+itxfm_wrapper(idct,  iadst, sz, bits, 0) \
+itxfm_wrapper(iadst, iadst, sz, bits, 0)
+
+#define IN(x) ((dctint) in[(x) * stride])
+
+static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3;
+
+    t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
+    t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
+    t2 = (IN(1) *  6270 - IN(3) * 15137 + (1 << 13)) >> 14;
+    t3 = (IN(1) * 15137 + IN(3) *  6270 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t3;
+    out[1] = t1 + t2;
+    out[2] = t1 - t2;
+    out[3] = t0 - t3;
+}
+
+static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3;
+
+    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
+    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
+    t2 = 13377 * (IN(0) - IN(2) + IN(3));
+    t3 = 13377 * IN(1);
+
+    out[0] = (t0 + t3      + (1 << 13)) >> 14;
+    out[1] = (t1 + t3      + (1 << 13)) >> 14;
+    out[2] = (t2           + (1 << 13)) >> 14;
+    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(4, 4)
+
+static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+
+    t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
+    t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
+    t2a = (IN(2) *  6270 - IN(6) * 15137 + (1 << 13)) >> 14;
+    t3a = (IN(2) * 15137 + IN(6) *  6270 + (1 << 13)) >> 14;
+    t4a = (IN(1) *  3196 - IN(7) * 16069 + (1 << 13)) >> 14;
+    t5a = (IN(5) * 13623 - IN(3) *  9102 + (1 << 13)) >> 14;
+    t6a = (IN(5) *  9102 + IN(3) * 13623 + (1 << 13)) >> 14;
+    t7a = (IN(1) * 16069 + IN(7) *  3196 + (1 << 13)) >> 14;
+
+    t0  = t0a + t3a;
+    t1  = t1a + t2a;
+    t2  = t1a - t2a;
+    t3  = t0a - t3a;
+    t4  = t4a + t5a;
+    t5a = t4a - t5a;
+    t7  = t7a + t6a;
+    t6a = t7a - t6a;
+
+    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
+    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t7;
+    out[1] = t1 + t6;
+    out[2] = t2 + t5;
+    out[3] = t3 + t4;
+    out[4] = t3 - t4;
+    out[5] = t2 - t5;
+    out[6] = t1 - t6;
+    out[7] = t0 - t7;
+}
+
+static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+
+    t0a = 16305 * IN(7) +  1606 * IN(0);
+    t1a =  1606 * IN(7) - 16305 * IN(0);
+    t2a = 14449 * IN(5) +  7723 * IN(2);
+    t3a =  7723 * IN(5) - 14449 * IN(2);
+    t4a = 10394 * IN(3) + 12665 * IN(4);
+    t5a = 12665 * IN(3) - 10394 * IN(4);
+    t6a =  4756 * IN(1) + 15679 * IN(6);
+    t7a = 15679 * IN(1) -  4756 * IN(6);
+
+    t0 = (t0a + t4a + (1 << 13)) >> 14;
+    t1 = (t1a + t5a + (1 << 13)) >> 14;
+    t2 = (t2a + t6a + (1 << 13)) >> 14;
+    t3 = (t3a + t7a + (1 << 13)) >> 14;
+    t4 = (t0a - t4a + (1 << 13)) >> 14;
+    t5 = (t1a - t5a + (1 << 13)) >> 14;
+    t6 = (t2a - t6a + (1 << 13)) >> 14;
+    t7 = (t3a - t7a + (1 << 13)) >> 14;
+
+    t4a = 15137 * t4 +  6270 * t5;
+    t5a =  6270 * t4 - 15137 * t5;
+    t6a = 15137 * t7 -  6270 * t6;
+    t7a =  6270 * t7 + 15137 * t6;
+
+    out[0] =   t0 + t2;
+    out[7] = -(t1 + t3);
+    t2     =   t0 - t2;
+    t3     =   t1 - t3;
+
+    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6     =   (t4a - t6a + (1 << 13)) >> 14;
+    t7     =   (t5a - t7a + (1 << 13)) >> 14;
+
+    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
+    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
+    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
+    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
+}
+
+itxfm_wrap(8, 5)
+
+static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+
+    t0a  = ((IN(0) + IN(8)) * 11585 + (1 << 13)) >> 14;
+    t1a  = ((IN(0) - IN(8)) * 11585 + (1 << 13)) >> 14;
+    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
+    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
+    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
+    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
+    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
+    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
+    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
+    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
+    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
+    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
+    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
+    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
+    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
+    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
+
+    t0  = t0a  + t3a;
+    t1  = t1a  + t2a;
+    t2  = t1a  - t2a;
+    t3  = t0a  - t3a;
+    t4  = t4a  + t5a;
+    t5  = t4a  - t5a;
+    t6  = t7a  - t6a;
+    t7  = t7a  + t6a;
+    t8  = t8a  + t9a;
+    t9  = t8a  - t9a;
+    t10 = t11a - t10a;
+    t11 = t11a + t10a;
+    t12 = t12a + t13a;
+    t13 = t12a - t13a;
+    t14 = t15a - t14a;
+    t15 = t15a + t14a;
+
+    t5a  = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
+    t6a  = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
+    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4   = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7   = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+
+    out[ 0] = t0a + t15a;
+    out[ 1] = t1a + t14;
+    out[ 2] = t2a + t13a;
+    out[ 3] = t3a + t12;
+    out[ 4] = t4  + t11;
+    out[ 5] = t5  + t10a;
+    out[ 6] = t6  + t9;
+    out[ 7] = t7  + t8a;
+    out[ 8] = t7  - t8a;
+    out[ 9] = t6  - t9;
+    out[10] = t5  - t10a;
+    out[11] = t4  - t11;
+    out[12] = t3a - t12;
+    out[13] = t2a - t13a;
+    out[14] = t1a - t14;
+    out[15] = t0a - t15a;
+}
+
+static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
+                                        dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+
+    t0  = IN(15) * 16364 + IN(0)  *   804;
+    t1  = IN(15) *   804 - IN(0)  * 16364;
+    t2  = IN(13) * 15893 + IN(2)  *  3981;
+    t3  = IN(13) *  3981 - IN(2)  * 15893;
+    t4  = IN(11) * 14811 + IN(4)  *  7005;
+    t5  = IN(11) *  7005 - IN(4)  * 14811;
+    t6  = IN(9)  * 13160 + IN(6)  *  9760;
+    t7  = IN(9)  *  9760 - IN(6)  * 13160;
+    t8  = IN(7)  * 11003 + IN(8)  * 12140;
+    t9  = IN(7)  * 12140 - IN(8)  * 11003;
+    t10 = IN(5)  *  8423 + IN(10) * 14053;
+    t11 = IN(5)  * 14053 - IN(10) *  8423;
+    t12 = IN(3)  *  5520 + IN(12) * 15426;
+    t13 = IN(3)  * 15426 - IN(12) *  5520;
+    t14 = IN(1)  *  2404 + IN(14) * 16207;
+    t15 = IN(1)  * 16207 - IN(14) *  2404;
+
+    t0a  = (t0 + t8  + (1 << 13)) >> 14;
+    t1a  = (t1 + t9  + (1 << 13)) >> 14;
+    t2a  = (t2 + t10 + (1 << 13)) >> 14;
+    t3a  = (t3 + t11 + (1 << 13)) >> 14;
+    t4a  = (t4 + t12 + (1 << 13)) >> 14;
+    t5a  = (t5 + t13 + (1 << 13)) >> 14;
+    t6a  = (t6 + t14 + (1 << 13)) >> 14;
+    t7a  = (t7 + t15 + (1 << 13)) >> 14;
+    t8a  = (t0 - t8  + (1 << 13)) >> 14;
+    t9a  = (t1 - t9  + (1 << 13)) >> 14;
+    t10a = (t2 - t10 + (1 << 13)) >> 14;
+    t11a = (t3 - t11 + (1 << 13)) >> 14;
+    t12a = (t4 - t12 + (1 << 13)) >> 14;
+    t13a = (t5 - t13 + (1 << 13)) >> 14;
+    t14a = (t6 - t14 + (1 << 13)) >> 14;
+    t15a = (t7 - t15 + (1 << 13)) >> 14;
+
+    t8   = t8a  * 16069 + t9a  *  3196;
+    t9   = t8a  *  3196 - t9a  * 16069;
+    t10  = t10a *  9102 + t11a * 13623;
+    t11  = t10a * 13623 - t11a *  9102;
+    t12  = t13a * 16069 - t12a *  3196;
+    t13  = t13a *  3196 + t12a * 16069;
+    t14  = t15a *  9102 - t14a * 13623;
+    t15  = t15a * 13623 + t14a *  9102;
+
+    t0   = t0a + t4a;
+    t1   = t1a + t5a;
+    t2   = t2a + t6a;
+    t3   = t3a + t7a;
+    t4   = t0a - t4a;
+    t5   = t1a - t5a;
+    t6   = t2a - t6a;
+    t7   = t3a - t7a;
+    t8a  = (t8  + t12 + (1 << 13)) >> 14;
+    t9a  = (t9  + t13 + (1 << 13)) >> 14;
+    t10a = (t10 + t14 + (1 << 13)) >> 14;
+    t11a = (t11 + t15 + (1 << 13)) >> 14;
+    t12a = (t8  - t12 + (1 << 13)) >> 14;
+    t13a = (t9  - t13 + (1 << 13)) >> 14;
+    t14a = (t10 - t14 + (1 << 13)) >> 14;
+    t15a = (t11 - t15 + (1 << 13)) >> 14;
+
+    t4a  = t4 * 15137 + t5 *  6270;
+    t5a  = t4 *  6270 - t5 * 15137;
+    t6a  = t7 * 15137 - t6 *  6270;
+    t7a  = t7 *  6270 + t6 * 15137;
+    t12  = t12a * 15137 + t13a *  6270;
+    t13  = t12a *  6270 - t13a * 15137;
+    t14  = t15a * 15137 - t14a *  6270;
+    t15  = t15a *  6270 + t14a * 15137;
+
+    out[ 0] =   t0 + t2;
+    out[15] = -(t1 + t3);
+    t2a     =   t0 - t2;
+    t3a     =   t1 - t3;
+    out[ 3] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6      =   (t4a - t6a + (1 << 13)) >> 14;
+    t7      =   (t5a - t7a + (1 << 13)) >> 14;
+    out[ 1] = -(t8a + t10a);
+    out[14] =   t9a + t11a;
+    t10     =   t8a - t10a;
+    t11     =   t9a - t11a;
+    out[ 2] =   (t12 + t14 + (1 << 13)) >> 14;
+    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
+    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
+    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
+
+    out[ 7] = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
+    out[ 8] = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
+    out[ 4] = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
+    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
+    out[ 6] = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
+    out[ 9] = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
+    out[ 5] = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
+    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(16, 6)
+
+static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0a  = ((IN(0) + IN(16)) * 11585 + (1 << 13)) >> 14;
+    dctint t1a  = ((IN(0) - IN(16)) * 11585 + (1 << 13)) >> 14;
+    dctint t2a  = (IN( 8) *  6270 - IN(24) * 15137 + (1 << 13)) >> 14;
+    dctint t3a  = (IN( 8) * 15137 + IN(24) *  6270 + (1 << 13)) >> 14;
+    dctint t4a  = (IN( 4) *  3196 - IN(28) * 16069 + (1 << 13)) >> 14;
+    dctint t7a  = (IN( 4) * 16069 + IN(28) *  3196 + (1 << 13)) >> 14;
+    dctint t5a  = (IN(20) * 13623 - IN(12) *  9102 + (1 << 13)) >> 14;
+    dctint t6a  = (IN(20) *  9102 + IN(12) * 13623 + (1 << 13)) >> 14;
+    dctint t8a  = (IN( 2) *  1606 - IN(30) * 16305 + (1 << 13)) >> 14;
+    dctint t15a = (IN( 2) * 16305 + IN(30) *  1606 + (1 << 13)) >> 14;
+    dctint t9a  = (IN(18) * 12665 - IN(14) * 10394 + (1 << 13)) >> 14;
+    dctint t14a = (IN(18) * 10394 + IN(14) * 12665 + (1 << 13)) >> 14;
+    dctint t10a = (IN(10) *  7723 - IN(22) * 14449 + (1 << 13)) >> 14;
+    dctint t13a = (IN(10) * 14449 + IN(22) *  7723 + (1 << 13)) >> 14;
+    dctint t11a = (IN(26) * 15679 - IN( 6) *  4756 + (1 << 13)) >> 14;
+    dctint t12a = (IN(26) *  4756 + IN( 6) * 15679 + (1 << 13)) >> 14;
+    dctint t16a = (IN( 1) *   804 - IN(31) * 16364 + (1 << 13)) >> 14;
+    dctint t31a = (IN( 1) * 16364 + IN(31) *   804 + (1 << 13)) >> 14;
+    dctint t17a = (IN(17) * 12140 - IN(15) * 11003 + (1 << 13)) >> 14;
+    dctint t30a = (IN(17) * 11003 + IN(15) * 12140 + (1 << 13)) >> 14;
+    dctint t18a = (IN( 9) *  7005 - IN(23) * 14811 + (1 << 13)) >> 14;
+    dctint t29a = (IN( 9) * 14811 + IN(23) *  7005 + (1 << 13)) >> 14;
+    dctint t19a = (IN(25) * 15426 - IN( 7) *  5520 + (1 << 13)) >> 14;
+    dctint t28a = (IN(25) *  5520 + IN( 7) * 15426 + (1 << 13)) >> 14;
+    dctint t20a = (IN( 5) *  3981 - IN(27) * 15893 + (1 << 13)) >> 14;
+    dctint t27a = (IN( 5) * 15893 + IN(27) *  3981 + (1 << 13)) >> 14;
+    dctint t21a = (IN(21) * 14053 - IN(11) *  8423 + (1 << 13)) >> 14;
+    dctint t26a = (IN(21) *  8423 + IN(11) * 14053 + (1 << 13)) >> 14;
+    dctint t22a = (IN(13) *  9760 - IN(19) * 13160 + (1 << 13)) >> 14;
+    dctint t25a = (IN(13) * 13160 + IN(19) *  9760 + (1 << 13)) >> 14;
+    dctint t23a = (IN(29) * 16207 - IN( 3) *  2404 + (1 << 13)) >> 14;
+    dctint t24a = (IN(29) *  2404 + IN( 3) * 16207 + (1 << 13)) >> 14;
+
+    dctint t0  = t0a  + t3a;
+    dctint t1  = t1a  + t2a;
+    dctint t2  = t1a  - t2a;
+    dctint t3  = t0a  - t3a;
+    dctint t4  = t4a  + t5a;
+    dctint t5  = t4a  - t5a;
+    dctint t6  = t7a  - t6a;
+    dctint t7  = t7a  + t6a;
+    dctint t8  = t8a  + t9a;
+    dctint t9  = t8a  - t9a;
+    dctint t10 = t11a - t10a;
+    dctint t11 = t11a + t10a;
+    dctint t12 = t12a + t13a;
+    dctint t13 = t12a - t13a;
+    dctint t14 = t15a - t14a;
+    dctint t15 = t15a + t14a;
+    dctint t16 = t16a + t17a;
+    dctint t17 = t16a - t17a;
+    dctint t18 = t19a - t18a;
+    dctint t19 = t19a + t18a;
+    dctint t20 = t20a + t21a;
+    dctint t21 = t20a - t21a;
+    dctint t22 = t23a - t22a;
+    dctint t23 = t23a + t22a;
+    dctint t24 = t24a + t25a;
+    dctint t25 = t24a - t25a;
+    dctint t26 = t27a - t26a;
+    dctint t27 = t27a + t26a;
+    dctint t28 = t28a + t29a;
+    dctint t29 = t28a - t29a;
+    dctint t30 = t31a - t30a;
+    dctint t31 = t31a + t30a;
+
+    t5a = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
+    t6a = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
+    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+    t17a = (  t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
+    t30a = (  t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
+    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
+    t29a = (  t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
+    t21a = (  t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
+    t26a = (  t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
+    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
+    t25a = (  t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
+
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4a  = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7a  = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    t16a = t16  + t19;
+    t17  = t17a + t18a;
+    t18  = t17a - t18a;
+    t19a = t16  - t19;
+    t20a = t23  - t20;
+    t21  = t22a - t21a;
+    t22  = t22a + t21a;
+    t23a = t23  + t20;
+    t24a = t24  + t27;
+    t25  = t25a + t26a;
+    t26  = t25a - t26a;
+    t27a = t24  - t27;
+    t28a = t31  - t28;
+    t29  = t30a - t29a;
+    t30  = t30a + t29a;
+    t31a = t31  + t28;
+
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+    t18a = (  t29  *  6270 - t18  * 15137  + (1 << 13)) >> 14;
+    t29a = (  t29  * 15137 + t18  *  6270  + (1 << 13)) >> 14;
+    t19  = (  t28a *  6270 - t19a * 15137  + (1 << 13)) >> 14;
+    t28  = (  t28a * 15137 + t19a *  6270  + (1 << 13)) >> 14;
+    t20  = (-(t27a * 15137 + t20a *  6270) + (1 << 13)) >> 14;
+    t27  = (  t27a *  6270 - t20a * 15137  + (1 << 13)) >> 14;
+    t21a = (-(t26  * 15137 + t21  *  6270) + (1 << 13)) >> 14;
+    t26a = (  t26  *  6270 - t21  * 15137  + (1 << 13)) >> 14;
+
+    t0   = t0a + t15a;
+    t1   = t1a + t14;
+    t2   = t2a + t13a;
+    t3   = t3a + t12;
+    t4   = t4a + t11;
+    t5a  = t5  + t10a;
+    t6a  = t6  + t9;
+    t7   = t7a + t8a;
+    t8   = t7a - t8a;
+    t9a  = t6  - t9;
+    t10  = t5  - t10a;
+    t11a = t4a - t11;
+    t12a = t3a - t12;
+    t13  = t2a - t13a;
+    t14a = t1a - t14;
+    t15  = t0a - t15a;
+    t16  = t16a + t23a;
+    t17a = t17  + t22;
+    t18  = t18a + t21a;
+    t19a = t19  + t20;
+    t20a = t19  - t20;
+    t21  = t18a - t21a;
+    t22a = t17  - t22;
+    t23  = t16a - t23a;
+    t24  = t31a - t24a;
+    t25a = t30  - t25;
+    t26  = t29a - t26a;
+    t27a = t28  - t27;
+    t28a = t28  + t27;
+    t29  = t29a + t26a;
+    t30a = t30  + t25;
+    t31  = t31a + t24a;
+
+    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
+    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
+    t21a = ((t26  - t21 ) * 11585 + (1 << 13)) >> 14;
+    t26a = ((t26  + t21 ) * 11585 + (1 << 13)) >> 14;
+    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
+    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
+    t23a = ((t24  - t23 ) * 11585 + (1 << 13)) >> 14;
+    t24a = ((t24  + t23 ) * 11585 + (1 << 13)) >> 14;
+
+    out[ 0] = t0   + t31;
+    out[ 1] = t1   + t30a;
+    out[ 2] = t2   + t29;
+    out[ 3] = t3   + t28a;
+    out[ 4] = t4   + t27;
+    out[ 5] = t5a  + t26a;
+    out[ 6] = t6a  + t25;
+    out[ 7] = t7   + t24a;
+    out[ 8] = t8   + t23a;
+    out[ 9] = t9a  + t22;
+    out[10] = t10  + t21a;
+    out[11] = t11a + t20;
+    out[12] = t12a + t19a;
+    out[13] = t13  + t18;
+    out[14] = t14a + t17a;
+    out[15] = t15  + t16;
+    out[16] = t15  - t16;
+    out[17] = t14a - t17a;
+    out[18] = t13  - t18;
+    out[19] = t12a - t19a;
+    out[20] = t11a - t20;
+    out[21] = t10  - t21a;
+    out[22] = t9a  - t22;
+    out[23] = t8   - t23a;
+    out[24] = t7   - t24a;
+    out[25] = t6a  - t25;
+    out[26] = t5a  - t26a;
+    out[27] = t4   - t27;
+    out[28] = t3   - t28a;
+    out[29] = t2   - t29;
+    out[30] = t1   - t30a;
+    out[31] = t0   - t31;
+}
+
+itxfm_wrapper(idct, idct, 32, 6, 1)
+
+static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    int t0, t1, t2, t3, t4;
+
+    if (pass == 0) {
+        t0 = IN(0) >> 2;
+        t1 = IN(3) >> 2;
+        t2 = IN(1) >> 2;
+        t3 = IN(2) >> 2;
+    } else {
+        t0 = IN(0);
+        t1 = IN(3);
+        t2 = IN(1);
+        t3 = IN(2);
+    }
+
+    t0 += t2;
+    t3 -= t1;
+    t4 = (t0 - t3) >> 1;
+    t1 = t4 - t1;
+    t2 = t4 - t2;
+    t0 -= t1;
+    t3 += t2;
+
+    out[0] = t0;
+    out[1] = t1;
+    out[2] = t2;
+    out[3] = t3;
+}
+
+itxfm_wrapper(iwht, iwht, 4, 0, 0)
+
+#undef IN
+#undef itxfm_wrapper
+#undef itxfm_wrap
+
+static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
+{
+#define init_itxfm(tx, sz) \
+    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_##sz##_add_c; \
+    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_##sz##_add_c; \
+    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_##sz##_add_c; \
+    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c
+
+#define init_idct(tx, nm) \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c
+
+    init_itxfm(TX_4X4,   4x4);
+    init_itxfm(TX_8X8,   8x8);
+    init_itxfm(TX_16X16, 16x16);
+    init_idct(TX_32X32,  idct_idct_32x32);
+    init_idct(4 /* lossless */, iwht_iwht_4x4);
+
+#undef init_itxfm
+#undef init_idct
+}
+
+static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
+                                         ptrdiff_t stridea, ptrdiff_t strideb,
+                                         int wd)
+{
+    int i, F = 1 << (BIT_DEPTH - 8);
+
+    E <<= (BIT_DEPTH - 8);
+    I <<= (BIT_DEPTH - 8);
+    H <<= (BIT_DEPTH - 8);
+    for (i = 0; i < 8; i++, dst += stridea) {
+        int p7, p6, p5, p4;
+        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
+        int q4, q5, q6, q7;
+        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
+                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
+                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
+                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
+        int flat8out, flat8in;
+
+        if (!fm)
+            continue;
+
+        if (wd >= 16) {
+            p7 = dst[strideb * -8];
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+            q7 = dst[strideb * +7];
+
+            flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
+                       FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
+                       FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
+                       FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
+        }
+
+        if (wd >= 8)
+            flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
+                      FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
+                      FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;
+
+        if (wd >= 16 && flat8out && flat8in) {
+            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
+                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
+                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
+                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
+                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
+                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
+            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
+            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
+                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
+                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
+                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
+                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) {
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else {
+            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
+
+            if (hev) {
+                int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
+                f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+            } else {
+                int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = av_clip_pixel(p1 + f);
+                dst[strideb * +1] = av_clip_pixel(q1 - f);
+            }
+        }
+    }
+}
+
+#define lf_8_fn(dir, wd, stridea, strideb) \
+static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
+                                           ptrdiff_t stride, \
+                                           int E, int I, int H) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    stride /= sizeof(pixel); \
+    loop_filter(dst, E, I, H, stridea, strideb, wd); \
+}
+
+#define lf_8_fns(wd) \
+lf_8_fn(h, wd, stride, 1) \
+lf_8_fn(v, wd, 1, stride)
+
+lf_8_fns(4)
+lf_8_fns(8)
+lf_8_fns(16)
+
+#undef lf_8_fn
+#undef lf_8_fns
+
+#define lf_16_fn(dir, stridea) \
+static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
+                                        ptrdiff_t stride, \
+                                        int E, int I, int H) \
+{ \
+    loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
+    loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \
+}
+
+lf_16_fn(h, stride)
+lf_16_fn(v, sizeof(pixel))
+
+#undef lf_16_fn
+
+#define lf_mix_fn(dir, wd1, wd2, stridea) \
+static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  int E, int I, int H) \
+{ \
+    loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
+    loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
+}
+
+#define lf_mix_fns(wd1, wd2) \
+lf_mix_fn(h, wd1, wd2, stride) \
+lf_mix_fn(v, wd1, wd2, sizeof(pixel))
+
+lf_mix_fns(4, 4)
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
+
+#undef lf_mix_fn
+#undef lf_mix_fns
+
+static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
+{
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
+
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
+
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
+                                    const uint8_t *src, ptrdiff_t src_stride,
+                                    int w, int h)
+{
+    do {
+        memcpy(dst, src, w * sizeof(pixel));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                   const uint8_t *_src, ptrdiff_t src_stride,
+                                   int w, int h)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x += 4)
+            AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define fpel_fn(type, sz) \
+static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                         const uint8_t *src, ptrdiff_t src_stride, \
+                         int h, int mx, int my) \
+{ \
+    type##_c(dst, dst_stride, src, src_stride, sz, h); \
+}
+
+#define copy_avg_fn(sz) \
+fpel_fn(copy, sz) \
+fpel_fn(avg,  sz)
+
+copy_avg_fn(64)
+copy_avg_fn(32)
+copy_avg_fn(16)
+copy_avg_fn(8)
+copy_avg_fn(4)
+
+#undef fpel_fn
+#undef copy_avg_fn
+
+#endif /* BIT_DEPTH != 12 */
+
+#define FILTER_8TAP(src, x, F, stride) \
+    av_clip_pixel((F[0] * src[x + -3 * stride] + \
+                   F[1] * src[x + -2 * stride] + \
+                   F[2] * src[x + -1 * stride] + \
+                   F[3] * src[x + +0 * stride] + \
+                   F[4] * src[x + +1 * stride] + \
+                   F[5] * src[x + +2 * stride] + \
+                   F[6] * src[x + +3 * stride] + \
+                   F[7] * src[x + +4 * stride] + 64) >> 7)
+
+static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, ptrdiff_t ds,
+                                          const int16_t *filter, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(src, x, filter, ds);
+            }
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define filter_8tap_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src, ptrdiff_t src_stride, \
+                                                int w, int h, const int16_t *filter) \
+{ \
+    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
+}
+
+filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(put, 0, h, 1)
+filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(avg, 1, h, 1)
+
+#undef filter_8tap_1d_fn
+
+static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, const int16_t *filterx,
+                                          const int16_t *filtery, int avg)
+{
+    int tmp_h = h + 7;
+    pixel tmp[64 * 71], *tmp_ptr = tmp;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define filter_8tap_2d_fn(opn, opa) \
+static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int w, int h, const int16_t *filterx, \
+                                           const int16_t *filtery) \
+{ \
+    do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
+}
+
+filter_8tap_2d_fn(put, 0)
+filter_8tap_2d_fn(avg, 1)
+
+#undef filter_8tap_2d_fn
+
+#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                              const uint8_t *src, ptrdiff_t src_stride, \
+                                              int h, int mx, int my) \
+{ \
+    avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
+                            ff_vp9_subpel_filters[type_idx][dir_m]); \
+}
+
+#define filter_fn_2d(sz, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my) \
+{ \
+    avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
+                       ff_vp9_subpel_filters[type_idx][mx], \
+                       ff_vp9_subpel_filters[type_idx][my]); \
+}
+
+#if BIT_DEPTH != 12
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
+
+static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(src, x, mxy, ds);
+            }
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define bilin_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                 const uint8_t *src, ptrdiff_t src_stride, \
+                                                 int w, int h, int mxy) \
+{ \
+    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
+}
+
+bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+bilin_1d_fn(put, 0, h, 1)
+bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+bilin_1d_fn(avg, 1, h, 1)
+
+#undef bilin_1d_fn
+
+static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, int mx, int my, int avg)
+{
+    pixel tmp[64 * 65], *tmp_ptr = tmp;
+    int tmp_h = h + 1;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define bilin_2d_fn(opn, opa) \
+static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my) \
+{ \
+    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
+}
+
+bilin_2d_fn(put, 0)
+bilin_2d_fn(avg, 1)
+
+#undef bilin_2d_fn
+
+#define bilinf_fn_1d(sz, dir, dir_m, avg) \
+static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my) \
+{ \
+    avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \
+}
+
+#define bilinf_fn_2d(sz, avg) \
+static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my) \
+{ \
+    avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
+}
+
+#else
+
+#define bilinf_fn_1d(a, b, c, d)
+#define bilinf_fn_2d(a, b)
+
+#endif
+
+#define filter_fn(sz, avg) \
+filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+bilinf_fn_1d(sz, h, mx,                               avg) \
+bilinf_fn_1d(sz, v, my,                               avg) \
+bilinf_fn_2d(sz,                                      avg)
+
+#define filter_fn_set(avg) \
+filter_fn(64, avg) \
+filter_fn(32, avg) \
+filter_fn(16, avg) \
+filter_fn(8,  avg) \
+filter_fn(4,  avg)
+
+filter_fn_set(put)
+filter_fn_set(avg)
+
+#undef filter_fn
+#undef filter_fn_set
+#undef filter_fn_1d
+#undef filter_fn_2d
+#undef bilinf_fn_1d
+#undef bilinf_fn_2d
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
+{
+#if BIT_DEPTH == 12
+    ff_vp9dsp_mc_init_10(dsp);
+#else /* BIT_DEPTH == 12 */
+
+#define init_fpel(idx1, idx2, sz, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = type##sz##_c
+
+#define init_copy_avg(idx, sz) \
+    init_fpel(idx, 0, sz, copy); \
+    init_fpel(idx, 1, sz, avg)
+
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_copy_avg(4,  4);
+
+#undef init_copy_avg
+#undef init_fpel
+
+#endif /* BIT_DEPTH == 12 */
+
+#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
+
+#if BIT_DEPTH == 12
+#define init_subpel1 init_subpel1_bd_aware
+#else
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
+    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
+#endif
+
+#define init_subpel2(idx, idxh, idxv, dir, type) \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type) \
+    init_subpel2(idx, 1, 1, hv, type); \
+    init_subpel2(idx, 0, 1, v, type); \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+#undef init_subpel1_bd_aware
+}
+
+static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                              const uint8_t *_src, ptrdiff_t src_stride,
+                                              int w, int h, int mx, int my,
+                                              int dx, int dy, int avg,
+                                              const int16_t (*filters)[8])
+{
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 8;
+    pixel tmp[64 * 135], *tmp_ptr = tmp;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
+            imx += dx;
+            ioff += imx >> 4;
+            imx &= 0xf;
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+        const int16_t *filter = filters[my];
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
+            }
+
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_filter_8tap_fn(opn, opa) \
+static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my, int dx, int dy, \
+                                            const int16_t (*filters)[8]) \
+{ \
+    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                     opa, filters); \
+}
+
+scaled_filter_8tap_fn(put, 0)
+scaled_filter_8tap_fn(avg, 1)
+
+#undef scaled_filter_8tap_fn
+
+#undef FILTER_8TAP
+
+#define scaled_filter_fn(sz, type, type_idx, avg) \
+static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
+                        ff_vp9_subpel_filters[type_idx]); \
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                               const uint8_t *_src, ptrdiff_t src_stride,
+                                               int w, int h, int mx, int my,
+                                               int dx, int dy, int avg)
+{
+    pixel tmp[64 * 129], *tmp_ptr = tmp;
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 2;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
+            imx += dx;
+            ioff += imx >> 4;
+            imx &= 0xf;
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_bilin_fn(opn, opa) \
+static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                             const uint8_t *src, ptrdiff_t src_stride, \
+                                             int w, int h, int mx, int my, int dx, int dy) \
+{ \
+    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
+}
+
+scaled_bilin_fn(put, 0)
+scaled_bilin_fn(avg, 1)
+
+#undef scaled_bilin_fn
+
+#undef FILTER_BILIN
+
+#define scaled_bilinf_fn(sz, avg) \
+static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
+}
+
+#else
+
+#define scaled_bilinf_fn(a, b)
+
+#endif
+
+#define scaled_filter_fns(sz, avg) \
+scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+scaled_bilinf_fn(sz,                                      avg)
+
+#define scaled_filter_fn_set(avg) \
+scaled_filter_fns(64, avg) \
+scaled_filter_fns(32, avg) \
+scaled_filter_fns(16, avg) \
+scaled_filter_fns(8,  avg) \
+scaled_filter_fns(4,  avg)
+
+scaled_filter_fn_set(put)
+scaled_filter_fn_set(avg)
+
+#undef scaled_filter_fns
+#undef scaled_filter_fn_set
+#undef scaled_filter_fn
+#undef scaled_bilinf_fn
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
+{
+#define init_scaled_bd_aware(idx1, idx2, sz, type) \
+    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c
+
+#if BIT_DEPTH == 12
+    ff_vp9dsp_scaled_mc_init_10(dsp);
+#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
+#else
+#define init_scaled(idx1, idx2, sz, type) \
+    init_scaled_bd_aware(idx1, idx2, sz, type); \
+    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
+#endif
+
+#define init_scaled_put_avg(idx, sz) \
+    init_scaled(idx, 0, sz, put); \
+    init_scaled(idx, 1, sz, avg)
+
+    init_scaled_put_avg(0, 64);
+    init_scaled_put_avg(1, 32);
+    init_scaled_put_avg(2, 16);
+    init_scaled_put_avg(3,  8);
+    init_scaled_put_avg(4,  4);
+
+#undef init_scaled_put_avg
+#undef init_scaled
+#undef init_scaled_bd_aware
+}
+
+av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
+{
+    FUNC(ff_vp9dsp_intrapred_init)(dsp);
+    vp9dsp_itxfm_init(dsp);
+    vp9dsp_loopfilter_init(dsp);
+    FUNC(ff_vp9dsp_mc_init)(dsp);
+    FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
+}
diff --git a/libavcodec/vp9lpf.c b/libavcodec/vp9lpf.c
new file mode 100644
index 0000000..414cede
--- /dev/null
+++ b/libavcodec/vp9lpf.c
@@ -0,0 +1,202 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp9dec.h"
+
+static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
+{
+    int y, x, bytesperpixel = s->bytesperpixel;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
+        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
+        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
+        unsigned hm = hm1 | hm2 | hm13 | hm23;
+
+        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
+            if (col || x > 1) {
+                if (hm1 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    if (hmask1[0] & x) {
+                        if (hmask2[0] & x) {
+                            av_assert2(l[8 << ss_v] == L);
+                            s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
+                        }
+                    } else if (hm2 & x) {
+                        L = l[8 << ss_v];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
+                                               [!!(hmask2[1] & x)]
+                                               [0](ptr, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
+                                            [0](ptr, ls, E, I, H);
+                    }
+                } else if (hm2 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
+                                        [0](ptr + 8 * ls, ls, E, I, H);
+                }
+            }
+            if (ss_h) {
+                if (x & 0xAA)
+                    l += 2;
+            } else {
+                if (hm13 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    if (hm23 & x) {
+                        L = l[8 << ss_v];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
+                    }
+                } else if (hm23 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
+                }
+                l++;
+            }
+        }
+    }
+}
+
+static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
+{
+    int y, x, bytesperpixel = s->bytesperpixel;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
+        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
+
+        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
+            if (row || y) {
+                if (vm & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    if (vmask[0] & x) {
+                        if (vmask[0] & (x << (1 + ss_h))) {
+                            av_assert2(l[1 + ss_h] == L);
+                            s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
+                        }
+                    } else if (vm & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
+                                               [!!(vmask[1] & (x << (1 + ss_h)))]
+                                               [1](ptr, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
+                                            [1](ptr, ls, E, I, H);
+                    }
+                } else if (vm & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
+                                        [1](ptr + 8 * bytesperpixel, ls, E, I, H);
+                }
+            }
+            if (!ss_v) {
+                if (vm3 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    if (vm3 & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
+                    }
+                } else if (vm3 & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
+
+                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
+                }
+            }
+        }
+        if (ss_v) {
+            if (y & 1)
+                lvl += 16;
+        } else {
+            lvl += 8;
+        }
+    }
+}
+
+void ff_vp9_loopfilter_sb(AVCodecContext *avctx, VP9Filter *lflvl,
+                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
+{
+    VP9Context *s = avctx->priv_data;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    uint8_t *dst = f->data[0] + yoff;
+    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
+    uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
+    int p;
+
+    /* FIXME: In how far can we interleave the v/h loopfilter calls? E.g.
+     * if you think of them as acting on a 8x8 block max, we can interleave
+     * each v/h within the single x loop, but that only works if we work on
+     * 8 pixel blocks, and we won't always do that (we want at least 16px
+     * to use SSE2 optimizations, perhaps 32 for AVX2) */
+
+    filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
+    filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
+
+    for (p = 0; p < 2; p++) {
+        dst = f->data[1 + p] + uvoff;
+        filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
+        filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
+    }
+}
diff --git a/libavcodec/vp9mvs.c b/libavcodec/vp9mvs.c
index dde0e84..88db1c3 100644
--- a/libavcodec/vp9mvs.c
+++ b/libavcodec/vp9mvs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,15 +25,16 @@
 #include "vp56.h"
 #include "vp9.h"
 #include "vp9data.h"
+#include "vp9dec.h"
 
 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
-                                      VP9Context *s)
+                                      VP9TileData *td)
 {
-    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
-    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
+    dst->x = av_clip(src->x, td->min_mv.x, td->max_mv.x);
+    dst->y = av_clip(src->y, td->min_mv.y, td->max_mv.y);
 }
 
-static void find_ref_mvs(VP9Context *s,
+static void find_ref_mvs(VP9TileData *td,
                          VP56mv *pmv, int ref, int z, int idx, int sb)
 {
     static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
@@ -64,11 +65,12 @@ static void find_ref_mvs(VP9Context *s,
         [BS_4x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                        { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
     };
-    VP9Block *b = s->b;
-    int row = b->row, col = b->col, row7 = b->row7;
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col, row7 = td->row7;
     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
 #define INVALID_MV 0x80008000U
-    uint32_t mem = INVALID_MV;
+    uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
     int i;
 
 #define RETURN_DIRECT_MV(mv)                    \
@@ -94,64 +96,72 @@ static void find_ref_mvs(VP9Context *s,
             RETURN_DIRECT_MV(b->mv[0][z]);
         }
 
-#define RETURN_MV(mv)                           \
-    do {                                        \
-        if (sb > 0) {                           \
-            VP56mv tmp;                         \
-            uint32_t m;                         \
-            clamp_mv(&tmp, &mv, s);             \
-            m = AV_RN32A(&tmp);                 \
-            if (!idx) {                         \
-                AV_WN32A(pmv, m);               \
-                return;                         \
-            } else if (mem == INVALID_MV) {     \
-                mem = m;                        \
-            } else if (m != mem) {              \
-                AV_WN32A(pmv, m);               \
-                return;                         \
-            }                                   \
-        } else {                                \
-            uint32_t m = AV_RN32A(&mv);         \
-            if (!idx) {                         \
-                clamp_mv(pmv, &mv, s);          \
-                return;                         \
-            } else if (mem == INVALID_MV) {     \
-                mem = m;                        \
-            } else if (m != mem) {              \
-                clamp_mv(pmv, &mv, s);          \
-                return;                         \
-            }                                   \
-        }                                       \
+#define RETURN_MV(mv)                                                  \
+    do {                                                               \
+        if (sb > 0) {                                                  \
+            VP56mv tmp;                                                \
+            uint32_t m;                                                \
+            av_assert2(idx == 1);                                      \
+            av_assert2(mem != INVALID_MV);                             \
+            if (mem_sub8x8 == INVALID_MV) {                            \
+                clamp_mv(&tmp, &mv, td);                               \
+                m = AV_RN32A(&tmp);                                    \
+                if (m != mem) {                                        \
+                    AV_WN32A(pmv, m);                                  \
+                    return;                                            \
+                }                                                      \
+                mem_sub8x8 = AV_RN32A(&mv);                            \
+            } else if (mem_sub8x8 != AV_RN32A(&mv)) {                  \
+                clamp_mv(&tmp, &mv, td);                               \
+                m = AV_RN32A(&tmp);                                    \
+                if (m != mem) {                                        \
+                    AV_WN32A(pmv, m);                                  \
+                } else {                                               \
+                    /* BUG I'm pretty sure this isn't the intention */ \
+                    AV_WN32A(pmv, 0);                                  \
+                }                                                      \
+                return;                                                \
+            }                                                          \
+        } else {                                                       \
+            uint32_t m = AV_RN32A(&mv);                                \
+            if (!idx) {                                                \
+                clamp_mv(pmv, &mv, td);                                \
+                return;                                                \
+            } else if (mem == INVALID_MV) {                            \
+                mem = m;                                               \
+            } else if (m != mem) {                                     \
+                clamp_mv(pmv, &mv, td);                                \
+                return;                                                \
+            }                                                          \
+        }                                                              \
     } while (0)
 
         if (row > 0) {
-            VP9MVRefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
-
+            VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
             if (mv->ref[0] == ref)
                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
             else if (mv->ref[1] == ref)
                 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
         }
-        if (col > s->tiling.tile_col_start) {
-            VP9MVRefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
-
+        if (col > td->tile_col_start) {
+            VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
             if (mv->ref[0] == ref)
-                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
+                RETURN_MV(td->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
             else if (mv->ref[1] == ref)
-                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
+                RETURN_MV(td->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
         }
         i = 2;
     } else {
         i = 0;
     }
 
-    // previously coded MVs in the neighborhood, using same reference frame
+    // previously coded MVs in this neighborhood, using same reference frame
     for (; i < 8; i++) {
         int c = p[i][0] + col, r = p[i][1] + row;
 
-        if (c >= s->tiling.tile_col_start && c < s->cols &&
+        if (c >= td->tile_col_start && c < s->cols &&
             r >= 0 && r < s->rows) {
-            VP9MVRefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
+            VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
 
             if (mv->ref[0] == ref)
                 RETURN_MV(mv->mv[0]);
@@ -161,12 +171,11 @@ static void find_ref_mvs(VP9Context *s,
     }
 
     // MV at this position in previous frame, using same reference frame
-    if (s->use_last_frame_mvs) {
-        VP9MVRefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
-
-        if (!s->last_uses_2pass)
-            ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
+    if (s->s.h.use_last_frame_mvs) {
+        VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
 
+        if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
+            ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
         if (mv->ref[0] == ref)
             RETURN_MV(mv->mv[0]);
         else if (mv->ref[1] == ref)
@@ -183,119 +192,118 @@ static void find_ref_mvs(VP9Context *s,
         }                                       \
     } while (0)
 
-    // previously coded MVs in the neighborhood, using different reference frame
+    // previously coded MVs in this neighborhood, using different reference frame
     for (i = 0; i < 8; i++) {
         int c = p[i][0] + col, r = p[i][1] + row;
 
-        if (c >= s->tiling.tile_col_start && c < s->cols &&
-            r >= 0 && r < s->rows) {
-            VP9MVRefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
+        if (c >= td->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
+            VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
 
             if (mv->ref[0] != ref && mv->ref[0] >= 0)
                 RETURN_SCALE_MV(mv->mv[0],
-                                s->signbias[mv->ref[0]] != s->signbias[ref]);
+                                s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
             if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
                 // BUG - libvpx has this condition regardless of whether
                 // we used the first ref MV and pre-scaling
                 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
-                RETURN_SCALE_MV(mv->mv[1],
-                                s->signbias[mv->ref[1]] != s->signbias[ref]);
+                RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
             }
         }
     }
 
     // MV at this position in previous frame, using different reference frame
-    if (s->use_last_frame_mvs) {
-        VP9MVRefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
+    if (s->s.h.use_last_frame_mvs) {
+        VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
 
         // no need to await_progress, because we already did that above
         if (mv->ref[0] != ref && mv->ref[0] >= 0)
-            RETURN_SCALE_MV(mv->mv[0],
-                            s->signbias[mv->ref[0]] != s->signbias[ref]);
+            RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
         if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
             // BUG - libvpx has this condition regardless of whether
             // we used the first ref MV and pre-scaling
             AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
-            RETURN_SCALE_MV(mv->mv[1],
-                            s->signbias[mv->ref[1]] != s->signbias[ref]);
+            RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
         }
     }
 
     AV_ZERO32(pmv);
+    clamp_mv(pmv, pmv, td);
 #undef INVALID_MV
 #undef RETURN_MV
 #undef RETURN_SCALE_MV
 }
 
-static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
+static av_always_inline int read_mv_component(VP9TileData *td, int idx, int hp)
 {
-    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
-    int n, c = vp8_rac_get_tree(&s->c, ff_vp9_mv_class_tree,
+    VP9Context *s = td->s;
+    int bit, sign = vp56_rac_get_prob(td->c, s->prob.p.mv_comp[idx].sign);
+    int n, c = vp8_rac_get_tree(td->c, ff_vp9_mv_class_tree,
                                 s->prob.p.mv_comp[idx].classes);
 
-    s->counts.mv_comp[idx].sign[sign]++;
-    s->counts.mv_comp[idx].classes[c]++;
+    td->counts.mv_comp[idx].sign[sign]++;
+    td->counts.mv_comp[idx].classes[c]++;
     if (c) {
         int m;
 
         for (n = 0, m = 0; m < c; m++) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
-            n  |= bit << m;
-            s->counts.mv_comp[idx].bits[m][bit]++;
+            bit = vp56_rac_get_prob(td->c, s->prob.p.mv_comp[idx].bits[m]);
+            n |= bit << m;
+            td->counts.mv_comp[idx].bits[m][bit]++;
         }
         n <<= 3;
-        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
+        bit = vp8_rac_get_tree(td->c, ff_vp9_mv_fp_tree,
                                s->prob.p.mv_comp[idx].fp);
         n  |= bit << 1;
-        s->counts.mv_comp[idx].fp[bit]++;
+        td->counts.mv_comp[idx].fp[bit]++;
         if (hp) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
-            s->counts.mv_comp[idx].hp[bit]++;
+            bit = vp56_rac_get_prob(td->c, s->prob.p.mv_comp[idx].hp);
+            td->counts.mv_comp[idx].hp[bit]++;
             n |= bit;
         } else {
             n |= 1;
             // bug in libvpx - we count for bw entropy purposes even if the
             // bit wasn't coded
-            s->counts.mv_comp[idx].hp[1]++;
+            td->counts.mv_comp[idx].hp[1]++;
         }
         n += 8 << c;
     } else {
-        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
-        s->counts.mv_comp[idx].class0[n]++;
-        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
+        n = vp56_rac_get_prob(td->c, s->prob.p.mv_comp[idx].class0);
+        td->counts.mv_comp[idx].class0[n]++;
+        bit = vp8_rac_get_tree(td->c, ff_vp9_mv_fp_tree,
                                s->prob.p.mv_comp[idx].class0_fp[n]);
-        s->counts.mv_comp[idx].class0_fp[n][bit]++;
+        td->counts.mv_comp[idx].class0_fp[n][bit]++;
         n = (n << 3) | (bit << 1);
         if (hp) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
-            s->counts.mv_comp[idx].class0_hp[bit]++;
+            bit = vp56_rac_get_prob(td->c, s->prob.p.mv_comp[idx].class0_hp);
+            td->counts.mv_comp[idx].class0_hp[bit]++;
             n |= bit;
         } else {
             n |= 1;
             // bug in libvpx - we count for bw entropy purposes even if the
             // bit wasn't coded
-            s->counts.mv_comp[idx].class0_hp[1]++;
+            td->counts.mv_comp[idx].class0_hp[1]++;
         }
     }
 
     return sign ? -(n + 1) : (n + 1);
 }
 
-void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb)
+void ff_vp9_fill_mv(VP9TileData *td, VP56mv *mv, int mode, int sb)
 {
-    VP9Block *b = s->b;
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
 
     if (mode == ZEROMV) {
-        memset(mv, 0, sizeof(*mv) * 2);
+        AV_ZERO64(mv);
     } else {
         int hp;
 
         // FIXME cache this value and reuse for other subblocks
-        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
+        find_ref_mvs(td, &mv[0], b->ref[0], 0, mode == NEARMV,
                      mode == NEWMV ? -1 : sb);
         // FIXME maybe move this code into find_ref_mvs()
         if ((mode == NEWMV || sb == -1) &&
-            !(hp = s->highprecisionmvs &&
+            !(hp = s->s.h.highprecisionmvs &&
               abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
             if (mv[0].y & 1) {
                 if (mv[0].y < 0)
@@ -311,22 +319,22 @@ void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb)
             }
         }
         if (mode == NEWMV) {
-            enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
+            enum MVJoint j = vp8_rac_get_tree(td->c, ff_vp9_mv_joint_tree,
                                               s->prob.p.mv_joint);
 
-            s->counts.mv_joint[j]++;
+            td->counts.mv_joint[j]++;
             if (j >= MV_JOINT_V)
-                mv[0].y += read_mv_component(s, 0, hp);
+                mv[0].y += read_mv_component(td, 0, hp);
             if (j & 1)
-                mv[0].x += read_mv_component(s, 1, hp);
+                mv[0].x += read_mv_component(td, 1, hp);
         }
 
         if (b->comp) {
             // FIXME cache this value and reuse for other subblocks
-            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
+            find_ref_mvs(td, &mv[1], b->ref[1], 1, mode == NEARMV,
                          mode == NEWMV ? -1 : sb);
             if ((mode == NEWMV || sb == -1) &&
-                !(hp = s->highprecisionmvs &&
+                !(hp = s->s.h.highprecisionmvs &&
                   abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
                 if (mv[1].y & 1) {
                     if (mv[1].y < 0)
@@ -342,14 +350,14 @@ void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb)
                 }
             }
             if (mode == NEWMV) {
-                enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
+                enum MVJoint j = vp8_rac_get_tree(td->c, ff_vp9_mv_joint_tree,
                                                   s->prob.p.mv_joint);
 
-                s->counts.mv_joint[j]++;
+                td->counts.mv_joint[j]++;
                 if (j >= MV_JOINT_V)
-                    mv[1].y += read_mv_component(s, 0, hp);
+                    mv[1].y += read_mv_component(td, 0, hp);
                 if (j & 1)
-                    mv[1].x += read_mv_component(s, 1, hp);
+                    mv[1].x += read_mv_component(td, 1, hp);
             }
         }
     }
diff --git a/libavcodec/vp9prob.c b/libavcodec/vp9prob.c
index b8a7c22..fb295b4 100644
--- a/libavcodec/vp9prob.c
+++ b/libavcodec/vp9prob.c
@@ -4,26 +4,27 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "vp56.h"
 #include "vp9.h"
 #include "vp9data.h"
+#include "vp9dec.h"
 
 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
                                         int max_count, int update_factor)
@@ -33,11 +34,10 @@ static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
     if (!ct)
         return;
 
+    update_factor = FASTDIV(update_factor * FFMIN(ct, max_count), max_count);
     p1 = *p;
-    p2 = ((ct0 << 8) + (ct >> 1)) / ct;
+    p2 = ((((int64_t) ct0) << 8) + (ct >> 1)) / ct;
     p2 = av_clip(p2, 1, 255);
-    ct = FFMIN(ct, max_count);
-    update_factor = FASTDIV(update_factor * ct, max_count);
 
     // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
     *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
@@ -46,8 +46,8 @@ static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
 void ff_vp9_adapt_probs(VP9Context *s)
 {
     int i, j, k, l, m;
-    ProbContext *p = &s->prob_ctx[s->framectxid].p;
-    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
+    ProbContext *p = &s->prob_ctx[s->s.h.framectxid].p;
+    int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
 
     // coefficients
     for (i = 0; i < 4; i++)
@@ -55,9 +55,9 @@ void ff_vp9_adapt_probs(VP9Context *s)
             for (k = 0; k < 2; k++)
                 for (l = 0; l < 6; l++)
                     for (m = 0; m < 6; m++) {
-                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
-                        unsigned *e = s->counts.eob[i][j][k][l][m];
-                        unsigned *c = s->counts.coef[i][j][k][l][m];
+                        uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
+                        unsigned *e = s->td[0].counts.eob[i][j][k][l][m];
+                        unsigned *c = s->td[0].counts.coef[i][j][k][l][m];
 
                         if (l == 0 && m >= 3) // dc only has 3 pt
                             break;
@@ -67,7 +67,7 @@ void ff_vp9_adapt_probs(VP9Context *s)
                         adapt_prob(&pp[2], c[1], c[2], 24, uf);
                     }
 
-    if (s->keyframe || s->intraonly) {
+    if (s->s.h.keyframe || s->s.h.intraonly) {
         memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
         memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
         memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
@@ -77,32 +77,32 @@ void ff_vp9_adapt_probs(VP9Context *s)
 
     // skip flag
     for (i = 0; i < 3; i++)
-        adapt_prob(&p->skip[i], s->counts.skip[i][0],
-                   s->counts.skip[i][1], 20, 128);
+        adapt_prob(&p->skip[i], s->td[0].counts.skip[i][0],
+                   s->td[0].counts.skip[i][1], 20, 128);
 
     // intra/inter flag
     for (i = 0; i < 4; i++)
-        adapt_prob(&p->intra[i], s->counts.intra[i][0],
-                   s->counts.intra[i][1], 20, 128);
+        adapt_prob(&p->intra[i], s->td[0].counts.intra[i][0],
+                   s->td[0].counts.intra[i][1], 20, 128);
 
     // comppred flag
-    if (s->comppredmode == PRED_SWITCHABLE) {
+    if (s->s.h.comppredmode == PRED_SWITCHABLE) {
         for (i = 0; i < 5; i++)
-            adapt_prob(&p->comp[i], s->counts.comp[i][0],
-                       s->counts.comp[i][1], 20, 128);
+            adapt_prob(&p->comp[i], s->td[0].counts.comp[i][0],
+                       s->td[0].counts.comp[i][1], 20, 128);
     }
 
     // reference frames
-    if (s->comppredmode != PRED_SINGLEREF) {
+    if (s->s.h.comppredmode != PRED_SINGLEREF) {
         for (i = 0; i < 5; i++)
-            adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
-                       s->counts.comp_ref[i][1], 20, 128);
+            adapt_prob(&p->comp_ref[i], s->td[0].counts.comp_ref[i][0],
+                       s->td[0].counts.comp_ref[i][1], 20, 128);
     }
 
-    if (s->comppredmode != PRED_COMPREF) {
+    if (s->s.h.comppredmode != PRED_COMPREF) {
         for (i = 0; i < 5; i++) {
             uint8_t *pp = p->single_ref[i];
-            unsigned (*c)[2] = s->counts.single_ref[i];
+            unsigned (*c)[2] = s->td[0].counts.single_ref[i];
 
             adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
             adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
@@ -113,7 +113,7 @@ void ff_vp9_adapt_probs(VP9Context *s)
     for (i = 0; i < 4; i++)
         for (j = 0; j < 4; j++) {
             uint8_t *pp = p->partition[i][j];
-            unsigned *c = s->counts.partition[i][j];
+            unsigned *c = s->td[0].counts.partition[i][j];
 
             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
@@ -121,12 +121,12 @@ void ff_vp9_adapt_probs(VP9Context *s)
         }
 
     // tx size
-    if (s->txfmmode == TX_SWITCHABLE) {
+    if (s->s.h.txfmmode == TX_SWITCHABLE) {
         for (i = 0; i < 2; i++) {
-            unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
+            unsigned *c16 = s->td[0].counts.tx16p[i], *c32 = s->td[0].counts.tx32p[i];
 
-            adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0],
-                       s->counts.tx8p[i][1], 20, 128);
+            adapt_prob(&p->tx8p[i], s->td[0].counts.tx8p[i][0],
+                       s->td[0].counts.tx8p[i][1], 20, 128);
             adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
             adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
             adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
@@ -136,10 +136,10 @@ void ff_vp9_adapt_probs(VP9Context *s)
     }
 
     // interpolation filter
-    if (s->filtermode == FILTER_SWITCHABLE) {
+    if (s->s.h.filtermode == FILTER_SWITCHABLE) {
         for (i = 0; i < 4; i++) {
             uint8_t *pp = p->filter[i];
-            unsigned *c = s->counts.filter[i];
+            unsigned *c = s->td[0].counts.filter[i];
 
             adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
             adapt_prob(&pp[1], c[1], c[2], 20, 128);
@@ -149,7 +149,7 @@ void ff_vp9_adapt_probs(VP9Context *s)
     // inter modes
     for (i = 0; i < 7; i++) {
         uint8_t *pp = p->mv_mode[i];
-        unsigned *c = s->counts.mv_mode[i];
+        unsigned *c = s->td[0].counts.mv_mode[i];
 
         adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
         adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
@@ -159,7 +159,7 @@ void ff_vp9_adapt_probs(VP9Context *s)
     // mv joints
     {
         uint8_t *pp = p->mv_joint;
-        unsigned *c = s->counts.mv_joint;
+        unsigned *c = s->td[0].counts.mv_joint;
 
         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
@@ -171,11 +171,11 @@ void ff_vp9_adapt_probs(VP9Context *s)
         uint8_t *pp;
         unsigned *c, (*c2)[2], sum;
 
-        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
-                   s->counts.mv_comp[i].sign[1], 20, 128);
+        adapt_prob(&p->mv_comp[i].sign, s->td[0].counts.mv_comp[i].sign[0],
+                   s->td[0].counts.mv_comp[i].sign[1], 20, 128);
 
         pp  = p->mv_comp[i].classes;
-        c   = s->counts.mv_comp[i].classes;
+        c   = s->td[0].counts.mv_comp[i].classes;
         sum = c[1] + c[2] + c[3] + c[4] + c[5] +
               c[6] + c[7] + c[8] + c[9] + c[10];
         adapt_prob(&pp[0], c[0], sum, 20, 128);
@@ -193,39 +193,39 @@ void ff_vp9_adapt_probs(VP9Context *s)
         adapt_prob(&pp[8], c[7], c[8], 20, 128);
         adapt_prob(&pp[9], c[9], c[10], 20, 128);
 
-        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
-                   s->counts.mv_comp[i].class0[1], 20, 128);
+        adapt_prob(&p->mv_comp[i].class0, s->td[0].counts.mv_comp[i].class0[0],
+                   s->td[0].counts.mv_comp[i].class0[1], 20, 128);
         pp = p->mv_comp[i].bits;
-        c2 = s->counts.mv_comp[i].bits;
+        c2 = s->td[0].counts.mv_comp[i].bits;
         for (j = 0; j < 10; j++)
             adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
 
         for (j = 0; j < 2; j++) {
             pp = p->mv_comp[i].class0_fp[j];
-            c  = s->counts.mv_comp[i].class0_fp[j];
+            c  = s->td[0].counts.mv_comp[i].class0_fp[j];
             adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
             adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
             adapt_prob(&pp[2], c[2], c[3], 20, 128);
         }
         pp = p->mv_comp[i].fp;
-        c  = s->counts.mv_comp[i].fp;
+        c  = s->td[0].counts.mv_comp[i].fp;
         adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
         adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
         adapt_prob(&pp[2], c[2], c[3], 20, 128);
 
-        if (s->highprecisionmvs) {
+        if (s->s.h.highprecisionmvs) {
             adapt_prob(&p->mv_comp[i].class0_hp,
-                       s->counts.mv_comp[i].class0_hp[0],
-                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
-            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
-                       s->counts.mv_comp[i].hp[1], 20, 128);
+                       s->td[0].counts.mv_comp[i].class0_hp[0],
+                       s->td[0].counts.mv_comp[i].class0_hp[1], 20, 128);
+            adapt_prob(&p->mv_comp[i].hp, s->td[0].counts.mv_comp[i].hp[0],
+                       s->td[0].counts.mv_comp[i].hp[1], 20, 128);
         }
     }
 
     // y intra modes
     for (i = 0; i < 4; i++) {
         uint8_t *pp = p->y_mode[i];
-        unsigned *c = s->counts.y_mode[i], sum, s2;
+        unsigned *c = s->td[0].counts.y_mode[i], sum, s2;
 
         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
@@ -250,7 +250,7 @@ void ff_vp9_adapt_probs(VP9Context *s)
     // uv intra modes
     for (i = 0; i < 10; i++) {
         uint8_t *pp = p->uv_mode[i];
-        unsigned *c = s->counts.uv_mode[i], sum, s2;
+        unsigned *c = s->td[0].counts.uv_mode[i], sum, s2;
 
         sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
         adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
diff --git a/libavcodec/vp9recon.c b/libavcodec/vp9recon.c
new file mode 100644
index 0000000..49bb04e
--- /dev/null
+++ b/libavcodec/vp9recon.c
@@ -0,0 +1,644 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "videodsp.h"
+#include "vp9data.h"
+#include "vp9dec.h"
+
+static av_always_inline int check_intra_mode(VP9TileData *td, int mode, uint8_t **a,
+                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
+                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
+                                             uint8_t *l, int col, int x, int w,
+                                             int row, int y, enum TxfmMode tx,
+                                             int p, int ss_h, int ss_v, int bytesperpixel)
+{
+    VP9Context *s = td->s;
+    int have_top = row > 0 || y > 0;
+    int have_left = col > td->tile_col_start || x > 0;
+    int have_right = x < w - 1;
+    int bpp = s->s.h.bpp;
+    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
+        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
+                                   { DC_127_PRED,          VERT_PRED            } },
+        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
+                                   { HOR_PRED,             HOR_PRED             } },
+        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
+                                   { LEFT_DC_PRED,         DC_PRED              } },
+        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
+                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
+        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
+                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
+        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
+                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
+        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
+                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
+        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
+                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
+        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
+                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
+        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
+                                   { HOR_PRED,             TM_VP8_PRED          } },
+    };
+    static const struct {
+        uint8_t needs_left:1;
+        uint8_t needs_top:1;
+        uint8_t needs_topleft:1;
+        uint8_t needs_topright:1;
+        uint8_t invert_left:1;
+    } edges[N_INTRA_PRED_MODES] = {
+        [VERT_PRED]            = { .needs_top  = 1 },
+        [HOR_PRED]             = { .needs_left = 1 },
+        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
+        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
+        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
+        [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
+        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [LEFT_DC_PRED]         = { .needs_left = 1 },
+        [TOP_DC_PRED]          = { .needs_top  = 1 },
+        [DC_128_PRED]          = { 0 },
+        [DC_127_PRED]          = { 0 },
+        [DC_129_PRED]          = { 0 }
+    };
+
+    av_assert2(mode >= 0 && mode < 10);
+    mode = mode_conv[mode][have_left][have_top];
+    if (edges[mode].needs_top) {
+        uint8_t *top, *topleft;
+        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
+        int n_px_need_tr = 0;
+
+        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
+            n_px_need_tr = 4;
+
+        // if top of sb64-row, use s->intra_pred_data[] instead of
+        // dst[-stride] for intra prediction (it contains pre- instead of
+        // post-loopfilter data)
+        if (have_top) {
+            top = !(row & 7) && !y ?
+                s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
+                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
+            if (have_left)
+                topleft = !(row & 7) && !y ?
+                    s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
+                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
+                    &dst_inner[-stride_inner];
+        }
+
+        if (have_top &&
+            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
+            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
+            n_px_need + n_px_need_tr <= n_px_have) {
+            *a = top;
+        } else {
+            if (have_top) {
+                if (n_px_need <= n_px_have) {
+                    memcpy(*a, top, n_px_need * bytesperpixel);
+                } else {
+#define memset_bpp(c, i1, v, i2, num) do { \
+    if (bytesperpixel == 1) { \
+        memset(&(c)[(i1)], (v)[(i2)], (num)); \
+    } else { \
+        int n, val = AV_RN16A(&(v)[(i2) * 2]); \
+        for (n = 0; n < (num); n++) { \
+            AV_WN16A(&(c)[((i1) + n) * 2], val); \
+        } \
+    } \
+} while (0)
+                    memcpy(*a, top, n_px_have * bytesperpixel);
+                    memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
+                }
+            } else {
+#define memset_val(c, val, num) do { \
+    if (bytesperpixel == 1) { \
+        memset((c), (val), (num)); \
+    } else { \
+        int n; \
+        for (n = 0; n < (num); n++) { \
+            AV_WN16A(&(c)[n * 2], (val)); \
+        } \
+    } \
+} while (0)
+                memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
+            }
+            if (edges[mode].needs_topleft) {
+                if (have_left && have_top) {
+#define assign_bpp(c, i1, v, i2) do { \
+    if (bytesperpixel == 1) { \
+        (c)[(i1)] = (v)[(i2)]; \
+    } else { \
+        AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
+    } \
+} while (0)
+                    assign_bpp(*a, -1, topleft, -1);
+                } else {
+#define assign_val(c, i, v) do { \
+    if (bytesperpixel == 1) { \
+        (c)[(i)] = (v); \
+    } else { \
+        AV_WN16A(&(c)[(i) * 2], (v)); \
+    } \
+} while (0)
+                    assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
+                }
+            }
+            if (tx == TX_4X4 && edges[mode].needs_topright) {
+                if (have_top && have_right &&
+                    n_px_need + n_px_need_tr <= n_px_have) {
+                    memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
+                } else {
+                    memset_bpp(*a, 4, *a, 3, 4);
+                }
+            }
+        }
+    }
+    if (edges[mode].needs_left) {
+        if (have_left) {
+            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
+            uint8_t *dst = x == 0 ? dst_edge : dst_inner;
+            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
+
+            if (edges[mode].invert_left) {
+                if (n_px_need <= n_px_have) {
+                    for (i = 0; i < n_px_need; i++)
+                        assign_bpp(l, i, &dst[i * stride], -1);
+                } else {
+                    for (i = 0; i < n_px_have; i++)
+                        assign_bpp(l, i, &dst[i * stride], -1);
+                    memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
+                }
+            } else {
+                if (n_px_need <= n_px_have) {
+                    for (i = 0; i < n_px_need; i++)
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
+                } else {
+                    for (i = 0; i < n_px_have; i++)
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
+                    memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
+                }
+            }
+        } else {
+            memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
+        }
+    }
+
+    return mode;
+}
+
+static av_always_inline void intra_recon(VP9TileData *td, ptrdiff_t y_off,
+                                         ptrdiff_t uv_off, int bytesperpixel)
+{
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col;
+    int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
+    int h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
+    int end_x = FFMIN(2 * (s->cols - col), w4);
+    int end_y = FFMIN(2 * (s->rows - row), h4);
+    int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
+    int uvstep1d = 1 << b->uvtx, p;
+    uint8_t *dst = td->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
+    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
+    LOCAL_ALIGNED_32(uint8_t, l, [64]);
+
+    for (n = 0, y = 0; y < end_y; y += step1d) {
+        uint8_t *ptr = dst, *ptr_r = dst_r;
+        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
+                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
+            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
+                               y * 2 + x : 0];
+            uint8_t *a = &a_buf[32];
+            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
+            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&td->eob[n]) : td->eob[n];
+
+            mode = check_intra_mode(td, mode, &a, ptr_r,
+                                    s->s.frames[CUR_FRAME].tf.f->linesize[0],
+                                    ptr, td->y_stride, l,
+                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
+            s->dsp.intra_pred[b->tx][mode](ptr, td->y_stride, l, a);
+            if (eob)
+                s->dsp.itxfm_add[tx][txtp](ptr, td->y_stride,
+                                           td->block + 16 * n * bytesperpixel, eob);
+        }
+        dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
+        dst   += 4 * step1d * td->y_stride;
+    }
+
+    // U/V
+    w4    >>= s->ss_h;
+    end_x >>= s->ss_h;
+    end_y >>= s->ss_v;
+    step = 1 << (b->uvtx * 2);
+    for (p = 0; p < 2; p++) {
+        dst   = td->dst[1 + p];
+        dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
+        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+            uint8_t *ptr = dst, *ptr_r = dst_r;
+            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
+                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
+                int mode = b->uvmode;
+                uint8_t *a = &a_buf[32];
+                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&td->uveob[p][n]) : td->uveob[p][n];
+
+                mode = check_intra_mode(td, mode, &a, ptr_r,
+                                        s->s.frames[CUR_FRAME].tf.f->linesize[1],
+                                        ptr, td->uv_stride, l, col, x, w4, row, y,
+                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
+                s->dsp.intra_pred[b->uvtx][mode](ptr, td->uv_stride, l, a);
+                if (eob)
+                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, td->uv_stride,
+                                                    td->uvblock[p] + 16 * n * bytesperpixel, eob);
+            }
+            dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
+            dst   += 4 * uvstep1d * td->uv_stride;
+        }
+    }
+}
+
+void ff_vp9_intra_recon_8bpp(VP9TileData *td, ptrdiff_t y_off, ptrdiff_t uv_off)
+{
+    intra_recon(td, y_off, uv_off, 1);
+}
+
+void ff_vp9_intra_recon_16bpp(VP9TileData *td, ptrdiff_t y_off, ptrdiff_t uv_off)
+{
+    intra_recon(td, y_off, uv_off, 2);
+}
+
+static av_always_inline void mc_luma_unscaled(VP9TileData *td, vp9_mc_func (*mc)[2],
+                                              uint8_t *dst, ptrdiff_t dst_stride,
+                                              const uint8_t *ref, ptrdiff_t ref_stride,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h, int bytesperpixel)
+{
+    VP9Context *s = td->s;
+    int mx = mv->x, my = mv->y, th;
+
+    y += my >> 3;
+    x += mx >> 3;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 7;
+    my &= 7;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + bh + 4 * !!my + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    // The arm/aarch64 _hv filters read one more row than what actually is
+    // needed, so switch to emulated edge one pixel sooner vertically
+    // (!!my * 5) than horizontally (!!mx * 4).
+    if (x < !!mx * 3 || y < !!my * 3 ||
+        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
+        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
+                                 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
+                                 160, ref_stride,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref = td->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        ref_stride = 160;
+    }
+    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
+}
+
+static av_always_inline void mc_chroma_unscaled(VP9TileData *td, vp9_mc_func (*mc)[2],
+                                                uint8_t *dst_u, uint8_t *dst_v,
+                                                ptrdiff_t dst_stride,
+                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                                ThreadFrame *ref_frame,
+                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                                int bw, int bh, int w, int h, int bytesperpixel)
+{
+    VP9Context *s = td->s;
+    int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
+
+    y += my >> 4;
+    x += mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    // The arm/aarch64 _hv filters read one more row than what actually is
+    // needed, so switch to emulated edge one pixel sooner vertically
+    // (!!my * 5) than horizontally (!!mx * 4).
+    if (x < !!mx * 3 || y < !!my * 3 ||
+        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
+        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
+                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
+                                 160, src_stride_u,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref_u = td->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
+
+        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
+                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
+                                 160, src_stride_v,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref_v = td->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
+    } else {
+        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
+        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
+    }
+}
+
+#define mc_luma_dir(td, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_unscaled(td, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                     mv, bw, bh, w, h, bytesperpixel)
+#define mc_chroma_dir(td, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_unscaled(td, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                       row, col, mv, bw, bh, w, h, bytesperpixel)
+#define SCALED 0
+#define FN(x) x##_8bpp
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c"
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_16bpp
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c"
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void mc_luma_scaled(VP9TileData *td, vp9_scaled_mc_func smc,
+                                            vp9_mc_func (*mc)[2],
+                                            uint8_t *dst, ptrdiff_t dst_stride,
+                                            const uint8_t *ref, ptrdiff_t ref_stride,
+                                            ThreadFrame *ref_frame,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                            int px, int py, int pw, int ph,
+                                            int bw, int bh, int w, int h, int bytesperpixel,
+                                            const uint16_t *scale, const uint8_t *step)
+{
+    VP9Context *s = td->s;
+    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_luma_unscaled(td, mc, dst, dst_stride, ref, ref_stride, ref_frame,
+                         y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
+    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
+    // BUG libvpx seems to scale the two components separately. This introduces
+    // rounding errors but we have to reproduce them to be exactly compatible
+    // with the output from libvpx...
+    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
+
+    y = my >> 4;
+    x = mx >> 4;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    // The arm/aarch64 _hv filters read one more row than what actually is
+    // needed, so switch to emulated edge one pixel sooner vertically
+    // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
+                                 ref - 3 * ref_stride - 3 * bytesperpixel,
+                                 288, ref_stride,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref = td->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        ref_stride = 288;
+    }
+    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
+    }
+}
+
+static av_always_inline void mc_chroma_scaled(VP9TileData *td, vp9_scaled_mc_func smc,
+                                              vp9_mc_func (*mc)[2],
+                                              uint8_t *dst_u, uint8_t *dst_v,
+                                              ptrdiff_t dst_stride,
+                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                              int px, int py, int pw, int ph,
+                                              int bw, int bh, int w, int h, int bytesperpixel,
+                                              const uint16_t *scale, const uint8_t *step)
+{
+    VP9Context *s = td->s;
+    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_chroma_unscaled(td, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
+                           ref_v, src_stride_v, ref_frame,
+                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    if (s->ss_h) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
+        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    } else {
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
+        mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    }
+    if (s->ss_v) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
+        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+    } else {
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
+        my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
+    }
+#undef scale_mv
+    y = my >> 4;
+    x = mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    // The arm/aarch64 _hv filters read one more row than what actually is
+    // needed, so switch to emulated edge one pixel sooner vertically
+    // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
+                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
+                                 288, src_stride_u,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_u = td->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
+
+        s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
+                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
+                                 288, src_stride_v,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_v = td->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
+    } else {
+        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
+        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
+    }
+    }
+}
+
+#define mc_luma_dir(td, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_scaled(td, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define mc_chroma_dir(td, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_scaled(td, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define SCALED 1
+#define FN(x) x##_scaled_8bpp
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c"
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_scaled_16bpp
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c"
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void inter_recon(VP9TileData *td, int bytesperpixel)
+{
+    VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col;
+
+    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
+        if (bytesperpixel == 1) {
+            inter_pred_scaled_8bpp(td);
+        } else {
+            inter_pred_scaled_16bpp(td);
+        }
+    } else {
+        if (bytesperpixel == 1) {
+            inter_pred_8bpp(td);
+        } else {
+            inter_pred_16bpp(td);
+        }
+    }
+
+    if (!b->skip) {
+        /* mostly copied intra_recon() */
+
+        int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
+        int h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
+        int end_x = FFMIN(2 * (s->cols - col), w4);
+        int end_y = FFMIN(2 * (s->rows - row), h4);
+        int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
+        int uvstep1d = 1 << b->uvtx, p;
+        uint8_t *dst = td->dst[0];
+
+        // y itxfm add
+        for (n = 0, y = 0; y < end_y; y += step1d) {
+            uint8_t *ptr = dst;
+            for (x = 0; x < end_x; x += step1d,
+                 ptr += 4 * step1d * bytesperpixel, n += step) {
+                int eob = b->tx > TX_8X8 ? AV_RN16A(&td->eob[n]) : td->eob[n];
+
+                if (eob)
+                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, td->y_stride,
+                                                  td->block + 16 * n * bytesperpixel, eob);
+            }
+            dst += 4 * td->y_stride * step1d;
+        }
+
+        // uv itxfm add
+        end_x >>= s->ss_h;
+        end_y >>= s->ss_v;
+        step = 1 << (b->uvtx * 2);
+        for (p = 0; p < 2; p++) {
+            dst = td->dst[p + 1];
+            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+                uint8_t *ptr = dst;
+                for (x = 0; x < end_x; x += uvstep1d,
+                     ptr += 4 * uvstep1d * bytesperpixel, n += step) {
+                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&td->uveob[p][n]) : td->uveob[p][n];
+
+                    if (eob)
+                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, td->uv_stride,
+                                                        td->uvblock[p] + 16 * n * bytesperpixel, eob);
+                }
+                dst += 4 * uvstep1d * td->uv_stride;
+            }
+        }
+    }
+}
+
+void ff_vp9_inter_recon_8bpp(VP9TileData *td)
+{
+    inter_recon(td, 1);
+}
+
+void ff_vp9_inter_recon_16bpp(VP9TileData *td)
+{
+    inter_recon(td, 2);
+}
diff --git a/libavcodec/vp9shared.h b/libavcodec/vp9shared.h
new file mode 100644
index 0000000..54726df
--- /dev/null
+++ b/libavcodec/vp9shared.h
@@ -0,0 +1,169 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9SHARED_H
+#define AVCODEC_VP9SHARED_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "vp9.h"
+#include "thread.h"
+#include "vp56.h"
+
+enum BlockPartition {
+    PARTITION_NONE,    // [ ] <-.
+    PARTITION_H,       // [-]   |
+    PARTITION_V,       // [|]   |
+    PARTITION_SPLIT,   // [+] --'
+};
+
+enum InterPredMode {
+    NEARESTMV = 10,
+    NEARMV    = 11,
+    ZEROMV    = 12,
+    NEWMV     = 13,
+};
+
+enum CompPredMode {
+    PRED_SINGLEREF,
+    PRED_COMPREF,
+    PRED_SWITCHABLE,
+};
+
+typedef struct VP9mvrefPair {
+    VP56mv mv[2];
+    int8_t ref[2];
+} VP9mvrefPair;
+
+typedef struct VP9Frame {
+    ThreadFrame tf;
+    AVBufferRef *extradata;
+    uint8_t *segmentation_map;
+    VP9mvrefPair *mv;
+    int uses_2pass;
+
+    AVBufferRef *hwaccel_priv_buf;
+    void *hwaccel_picture_private;
+} VP9Frame;
+
+enum BlockLevel {
+    BL_64X64,
+    BL_32X32,
+    BL_16X16,
+    BL_8X8,
+};
+
+enum BlockSize {
+    BS_64x64,
+    BS_64x32,
+    BS_32x64,
+    BS_32x32,
+    BS_32x16,
+    BS_16x32,
+    BS_16x16,
+    BS_16x8,
+    BS_8x16,
+    BS_8x8,
+    BS_8x4,
+    BS_4x8,
+    BS_4x4,
+    N_BS_SIZES,
+};
+
+typedef struct VP9BitstreamHeader {
+    // bitstream header
+    uint8_t profile;
+    uint8_t bpp;
+    uint8_t keyframe;
+    uint8_t invisible;
+    uint8_t errorres;
+    uint8_t intraonly;
+    uint8_t resetctx;
+    uint8_t refreshrefmask;
+    uint8_t highprecisionmvs;
+    enum FilterMode filtermode;
+    uint8_t allowcompinter;
+    uint8_t refreshctx;
+    uint8_t parallelmode;
+    uint8_t framectxid;
+    uint8_t use_last_frame_mvs;
+    uint8_t refidx[3];
+    uint8_t signbias[3];
+    uint8_t fixcompref;
+    uint8_t varcompref[2];
+    struct {
+        uint8_t level;
+        int8_t sharpness;
+    } filter;
+    struct {
+        uint8_t enabled;
+        uint8_t updated;
+        int8_t mode[2];
+        int8_t ref[4];
+    } lf_delta;
+    uint8_t yac_qi;
+    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
+    uint8_t lossless;
+#define MAX_SEGMENT 8
+    struct {
+        uint8_t enabled;
+        uint8_t temporal;
+        uint8_t absolute_vals;
+        uint8_t update_map;
+        uint8_t prob[7];
+        uint8_t pred_prob[3];
+        struct {
+            uint8_t q_enabled;
+            uint8_t lf_enabled;
+            uint8_t ref_enabled;
+            uint8_t skip_enabled;
+            uint8_t ref_val;
+            int16_t q_val;
+            int8_t lf_val;
+            int16_t qmul[2][2];
+            uint8_t lflvl[4][2];
+        } feat[MAX_SEGMENT];
+    } segmentation;
+    enum TxfmMode txfmmode;
+    enum CompPredMode comppredmode;
+    struct {
+        unsigned log2_tile_cols, log2_tile_rows;
+        unsigned tile_cols, tile_rows;
+    } tiling;
+
+    int uncompressed_header_size;
+    int compressed_header_size;
+} VP9BitstreamHeader;
+
+typedef struct VP9SharedContext {
+    VP9BitstreamHeader h;
+
+    ThreadFrame refs[8];
+#define CUR_FRAME 0
+#define REF_FRAME_MVPAIR 1
+#define REF_FRAME_SEGMAP 2
+    VP9Frame frames[3];
+} VP9SharedContext;
+
+#endif /* AVCODEC_VP9SHARED_H */
diff --git a/libavcodec/vqavideo.c b/libavcodec/vqavideo.c
index 0d0d59a..0e70be1 100644
--- a/libavcodec/vqavideo.c
+++ b/libavcodec/vqavideo.c
@@ -2,20 +2,20 @@
  * Westwood Studios VQA Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
 
     /* make sure the extradata made it */
     if (s->avctx->extradata_size != VQA_HEADER_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: expected extradata size of %d\n", VQA_HEADER_SIZE);
+        av_log(s->avctx, AV_LOG_ERROR, "expected extradata size of %d\n", VQA_HEADER_SIZE);
         return AVERROR(EINVAL);
     }
 
@@ -162,8 +162,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->width  & (s->vector_width  - 1) ||
-        s->height & (s->vector_height - 1)) {
+    if (s->width % s->vector_width || s->height % s->vector_height) {
         av_log(avctx, AV_LOG_ERROR, "Image size not multiple of block size\n");
         return AVERROR_INVALIDDATA;
     }
@@ -180,7 +179,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
     /* allocate decode buffer */
     s->decode_buffer_size = (s->width / s->vector_width) *
         (s->height / s->vector_height) * 2;
-    s->decode_buffer = av_malloc(s->decode_buffer_size);
+    s->decode_buffer = av_mallocz(s->decode_buffer_size);
     if (!s->decode_buffer)
         goto fail;
 
@@ -208,22 +207,22 @@ fail:
 
 #define CHECK_COUNT() \
     if (dest_index + count > dest_size) { \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: next op would overflow dest_index\n"); \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: current dest_index = %d, count = %d, dest_size = %d\n", \
+        av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: next op would overflow dest_index\n"); \
+        av_log(s->avctx, AV_LOG_ERROR, "current dest_index = %d, count = %d, dest_size = %d\n", \
             dest_index, count, dest_size); \
         return AVERROR_INVALIDDATA; \
     }
 
 #define CHECK_COPY(idx) \
     if (idx < 0 || idx + count > dest_size) { \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: next op would overflow dest_index\n"); \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: current src_pos = %d, count = %d, dest_size = %d\n", \
+        av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: next op would overflow dest_index\n"); \
+        av_log(s->avctx, AV_LOG_ERROR, "current src_pos = %d, count = %d, dest_size = %d\n", \
             src_pos, count, dest_size); \
         return AVERROR_INVALIDDATA; \
     }
 
 
-static int decode_format80(GetByteContext *gb, int src_size,
+static int decode_format80(VqaContext *s, int src_size,
     unsigned char *dest, int dest_size, int check_size) {
 
     int dest_index = 0;
@@ -232,26 +231,32 @@ static int decode_format80(GetByteContext *gb, int src_size,
     unsigned char color;
     int i;
 
-    start = bytestream2_tell(gb);
-    while (bytestream2_tell(gb) - start < src_size) {
-        opcode = bytestream2_get_byte(gb);
-        ff_dlog(NULL, "      opcode %02X: ", opcode);
+    if (src_size < 0 || src_size > bytestream2_get_bytes_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Chunk size %d is out of range\n",
+               src_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    start = bytestream2_tell(&s->gb);
+    while (bytestream2_tell(&s->gb) - start < src_size) {
+        opcode = bytestream2_get_byte(&s->gb);
+        ff_tlog(s->avctx, "opcode %02X: ", opcode);
 
         /* 0x80 means that frame is finished */
         if (opcode == 0x80)
-            return 0;
+            break;
 
         if (dest_index >= dest_size) {
-            av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: dest_index (%d) exceeded dest_size (%d)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: dest_index (%d) exceeded dest_size (%d)\n",
                 dest_index, dest_size);
             return AVERROR_INVALIDDATA;
         }
 
         if (opcode == 0xFF) {
 
-            count   = bytestream2_get_le16(gb);
-            src_pos = bytestream2_get_le16(gb);
-            ff_dlog(NULL, "(1) copy %X bytes from absolute pos %X\n", count, src_pos);
+            count   = bytestream2_get_le16(&s->gb);
+            src_pos = bytestream2_get_le16(&s->gb);
+            ff_tlog(s->avctx, "(1) copy %X bytes from absolute pos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(src_pos);
             for (i = 0; i < count; i++)
@@ -260,9 +265,9 @@ static int decode_format80(GetByteContext *gb, int src_size,
 
         } else if (opcode == 0xFE) {
 
-            count = bytestream2_get_le16(gb);
-            color = bytestream2_get_byte(gb);
-            ff_dlog(NULL, "(2) set %X bytes to %02X\n", count, color);
+            count = bytestream2_get_le16(&s->gb);
+            color = bytestream2_get_byte(&s->gb);
+            ff_tlog(s->avctx, "(2) set %X bytes to %02X\n", count, color);
             CHECK_COUNT();
             memset(&dest[dest_index], color, count);
             dest_index += count;
@@ -270,8 +275,8 @@ static int decode_format80(GetByteContext *gb, int src_size,
         } else if ((opcode & 0xC0) == 0xC0) {
 
             count = (opcode & 0x3F) + 3;
-            src_pos = bytestream2_get_le16(gb);
-            ff_dlog(NULL, "(3) copy %X bytes from absolute pos %X\n", count, src_pos);
+            src_pos = bytestream2_get_le16(&s->gb);
+            ff_tlog(s->avctx, "(3) copy %X bytes from absolute pos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(src_pos);
             for (i = 0; i < count; i++)
@@ -281,16 +286,16 @@ static int decode_format80(GetByteContext *gb, int src_size,
         } else if (opcode > 0x80) {
 
             count = opcode & 0x3F;
-            ff_dlog(NULL, "(4) copy %X bytes from source to dest\n", count);
+            ff_tlog(s->avctx, "(4) copy %X bytes from source to dest\n", count);
             CHECK_COUNT();
-            bytestream2_get_buffer(gb, &dest[dest_index], count);
+            bytestream2_get_buffer(&s->gb, &dest[dest_index], count);
             dest_index += count;
 
         } else {
 
             count = ((opcode & 0x70) >> 4) + 3;
-            src_pos = bytestream2_get_byte(gb) | ((opcode & 0x0F) << 8);
-            ff_dlog(NULL, "(5) copy %X bytes from relpos %X\n", count, src_pos);
+            src_pos = bytestream2_get_byte(&s->gb) | ((opcode & 0x0F) << 8);
+            ff_tlog(s->avctx, "(5) copy %X bytes from relpos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(dest_index - src_pos);
             for (i = 0; i < count; i++)
@@ -304,9 +309,11 @@ static int decode_format80(GetByteContext *gb, int src_size,
      * codebook entry; it is not important for compressed codebooks because
      * not every entry needs to be filled */
     if (check_size)
-        if (dest_index < dest_size)
-            av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: decode finished with dest_index (%d) < dest_size (%d)\n",
+        if (dest_index < dest_size) {
+            av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: decode finished with dest_index (%d) < dest_size (%d)\n",
                 dest_index, dest_size);
+            memset(dest + dest_index, 0, dest_size - dest_index);
+        }
 
     return 0; // let's display what we decoded anyway
 }
@@ -377,12 +384,8 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
             break;
 
         default:
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: Found unknown chunk type: %c%c%c%c (%08X)\n",
-            (chunk_type >> 24) & 0xFF,
-            (chunk_type >> 16) & 0xFF,
-            (chunk_type >>  8) & 0xFF,
-            (chunk_type >>  0) & 0xFF,
-            chunk_type);
+            av_log(s->avctx, AV_LOG_ERROR, "Found unknown chunk type: %s (%08X)\n",
+                   av_fourcc2str(av_bswap32(chunk_type)), chunk_type);
             break;
         }
 
@@ -394,7 +397,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if ((cpl0_chunk != -1) && (cplz_chunk != -1)) {
 
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CPL0 and CPLZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CPL0 and CPLZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -412,7 +415,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         chunk_size = bytestream2_get_be32(&s->gb);
         /* sanity check the palette size */
         if (chunk_size / 3 > 256 || chunk_size > bytestream2_get_bytes_left(&s->gb)) {
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found a palette chunk with %d colors\n",
+            av_log(s->avctx, AV_LOG_ERROR, "problem: found a palette chunk with %d colors\n",
                 chunk_size / 3);
             return AVERROR_INVALIDDATA;
         }
@@ -421,7 +424,8 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
             r = bytestream2_get_byteu(&s->gb) * 4;
             g = bytestream2_get_byteu(&s->gb) * 4;
             b = bytestream2_get_byteu(&s->gb) * 4;
-            s->palette[i] = (r << 16) | (g << 8) | (b);
+            s->palette[i] = 0xFFU << 24 | r << 16 | g << 8 | b;
+            s->palette[i] |= s->palette[i] >> 6 & 0x30303;
         }
     }
 
@@ -429,7 +433,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if ((cbf0_chunk != -1) && (cbfz_chunk != -1)) {
 
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CBF0 and CBFZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CBF0 and CBFZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -438,7 +442,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
 
         bytestream2_seek(&s->gb, cbfz_chunk, SEEK_SET);
         chunk_size = bytestream2_get_be32(&s->gb);
-        if ((res = decode_format80(&s->gb, chunk_size, s->codebook,
+        if ((res = decode_format80(s, chunk_size, s->codebook,
                                    s->codebook_size, 0)) < 0)
             return res;
     }
@@ -450,7 +454,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         chunk_size = bytestream2_get_be32(&s->gb);
         /* sanity check the full codebook size */
         if (chunk_size > MAX_CODEBOOK_SIZE) {
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: CBF0 chunk too large (0x%X bytes)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "problem: CBF0 chunk too large (0x%X bytes)\n",
                 chunk_size);
             return AVERROR_INVALIDDATA;
         }
@@ -462,13 +466,13 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if (vptz_chunk == -1) {
 
         /* something is wrong if there is no VPTZ chunk */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: no VPTZ chunk found\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: no VPTZ chunk found\n");
         return AVERROR_INVALIDDATA;
     }
 
     bytestream2_seek(&s->gb, vptz_chunk, SEEK_SET);
     chunk_size = bytestream2_get_be32(&s->gb);
-    if ((res = decode_format80(&s->gb, chunk_size,
+    if ((res = decode_format80(s, chunk_size,
                                s->decode_buffer, s->decode_buffer_size, 1)) < 0)
         return res;
 
@@ -531,7 +535,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     /* handle partial codebook */
     if ((cbp0_chunk != -1) && (cbpz_chunk != -1)) {
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CBP0 and CBPZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CBP0 and CBPZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -552,7 +556,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         s->next_codebook_buffer_index += chunk_size;
 
         s->partial_countdown--;
-        if (s->partial_countdown == 0) {
+        if (s->partial_countdown <= 0) {
 
             /* time to replace codebook */
             memcpy(s->codebook, s->next_codebook_buffer,
@@ -581,12 +585,10 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         s->next_codebook_buffer_index += chunk_size;
 
         s->partial_countdown--;
-        if (s->partial_countdown == 0) {
-            GetByteContext gb;
-
-            bytestream2_init(&gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
+        if (s->partial_countdown <= 0) {
+            bytestream2_init(&s->gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
             /* decompress codebook */
-            if ((res = decode_format80(&gb, s->next_codebook_buffer_index,
+            if ((res = decode_format80(s, s->next_codebook_buffer_index,
                                        s->codebook, s->codebook_size, 0)) < 0)
                 return res;
 
@@ -607,10 +609,8 @@ static int vqa_decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     int res;
 
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA Video: get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
     if ((res = vqa_decode_chunk(s, frame)) < 0)
diff --git a/libavcodec/vt_internal.h b/libavcodec/vt_internal.h
new file mode 100644
index 0000000..fb64735
--- /dev/null
+++ b/libavcodec/vt_internal.h
@@ -0,0 +1,58 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VT_INTERNAL_H
+#define AVCODEC_VT_INTERNAL_H
+
+typedef struct VTContext {
+    // The current bitstream buffer.
+    uint8_t                     *bitstream;
+
+    // The current size of the bitstream.
+    int                         bitstream_size;
+
+    // The reference size used for fast reallocation.
+    int                         allocated_size;
+
+    // The core video buffer
+    CVImageBufferRef            frame;
+
+    // Current dummy frames context (depends on exact CVImageBufferRef params).
+    struct AVBufferRef         *cached_hw_frames_ctx;
+
+    // Non-NULL if the new hwaccel API is used. This is only a separate struct
+    // to ease compatibility with the old API.
+    struct AVVideotoolboxContext *vt_ctx;
+
+    // Current H264 parameters (used to trigger decoder restart on SPS changes).
+    uint8_t                     sps[3];
+    bool                        reconfig_needed;
+} VTContext;
+
+int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame);
+int ff_videotoolbox_uninit(AVCodecContext *avctx);
+int ff_videotoolbox_h264_start_frame(AVCodecContext *avctx,
+                                     const uint8_t *buffer,
+                                     uint32_t size);
+int ff_videotoolbox_h264_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t *buffer,
+                                      uint32_t size);
+CFDataRef ff_videotoolbox_avcc_extradata_create(AVCodecContext *avctx);
+CFDataRef ff_videotoolbox_hvcc_extradata_create(AVCodecContext *avctx);
+
+#endif /* AVCODEC_VT_INTERNAL_H */
diff --git a/libavcodec/wavpack.c b/libavcodec/wavpack.c
index 66430b9..d024280 100644
--- a/libavcodec/wavpack.c
+++ b/libavcodec/wavpack.c
@@ -2,20 +2,20 @@
  * WavPack lossless audio decoder
  * Copyright (c) 2006,2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,62 +23,18 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
+#include "thread.h"
 #include "unary.h"
+#include "wavpack.h"
 
 /**
  * @file
  * WavPack lossless audio decoder
  */
 
-#define WV_HEADER_SIZE    32
-
-#define WV_MONO           0x00000004
-#define WV_JOINT_STEREO   0x00000010
-#define WV_FALSE_STEREO   0x40000000
-
-#define WV_HYBRID_MODE    0x00000008
-#define WV_HYBRID_SHAPE   0x00000008
-#define WV_HYBRID_BITRATE 0x00000200
-#define WV_HYBRID_BALANCE 0x00000400
-#define WV_INITIAL_BLOCK  0x00000800
-#define WV_FINAL_BLOCK    0x00001000
-
-#define WV_SINGLE_BLOCK (WV_INITIAL_BLOCK | WV_FINAL_BLOCK)
-
-#define WV_FLT_SHIFT_ONES 0x01
-#define WV_FLT_SHIFT_SAME 0x02
-#define WV_FLT_SHIFT_SENT 0x04
-#define WV_FLT_ZERO_SENT  0x08
-#define WV_FLT_ZERO_SIGN  0x10
-
-enum WP_ID_Flags {
-    WP_IDF_MASK   = 0x3F,
-    WP_IDF_IGNORE = 0x20,
-    WP_IDF_ODD    = 0x40,
-    WP_IDF_LONG   = 0x80
-};
-
-enum WP_ID {
-    WP_ID_DUMMY = 0,
-    WP_ID_ENCINFO,
-    WP_ID_DECTERMS,
-    WP_ID_DECWEIGHTS,
-    WP_ID_DECSAMPLES,
-    WP_ID_ENTROPY,
-    WP_ID_HYBRID,
-    WP_ID_SHAPING,
-    WP_ID_FLOATINFO,
-    WP_ID_INT32INFO,
-    WP_ID_DATA,
-    WP_ID_CORR,
-    WP_ID_EXTRABITS,
-    WP_ID_CHANINFO,
-    WP_ID_SAMPLE_RATE = 0x27,
-};
-
 typedef struct SavedContext {
     int offset;
     int size;
@@ -86,33 +42,16 @@ typedef struct SavedContext {
     uint32_t crc;
 } SavedContext;
 
-#define MAX_TERMS 16
-
-typedef struct Decorr {
-    int delta;
-    int value;
-    int weightA;
-    int weightB;
-    int samplesA[8];
-    int samplesB[8];
-} Decorr;
-
-typedef struct WvChannel {
-    int median[3];
-    int slow_level, error_limit;
-    int bitrate_acc, bitrate_delta;
-} WvChannel;
-
 typedef struct WavpackFrameContext {
     AVCodecContext *avctx;
     int frame_flags;
     int stereo, stereo_in;
     int joint;
     uint32_t CRC;
-    BitstreamContext bc;
+    GetBitContext gb;
     int got_extra_bits;
     uint32_t crc_extra_bits;
-    BitstreamContext bc_extra_bits;
+    GetBitContext gb_extra_bits;
     int data_size; // in bits
     int samples;
     int terms;
@@ -144,103 +83,9 @@ typedef struct WavpackContext {
     int ch_offset;
 } WavpackContext;
 
-static const int wv_rates[16] = {
-     6000,  8000,  9600, 11025, 12000, 16000,  22050, 24000,
-    32000, 44100, 48000, 64000, 88200, 96000, 192000,     0
-};
-
-// exponent table copied from WavPack source
-static const uint8_t wp_exp2_table[256] = {
-    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
-    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
-    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
-    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
-    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
-    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
-    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
-    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
-    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
-    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
-    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
-    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
-    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
-};
-
-static const uint8_t wp_log2_table [] = {
-    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
-    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
-    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
-    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
-    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
-    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
-    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
-    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
-    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
-    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
-    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
-    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
-    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
-    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
-    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
-    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
-};
-
-static av_always_inline int wp_exp2(int16_t val)
-{
-    int res, neg = 0;
-
-    if (val < 0) {
-        val = -val;
-        neg = 1;
-    }
-
-    res   = wp_exp2_table[val & 0xFF] | 0x100;
-    val >>= 8;
-    res   = (val > 9) ? (res << (val - 9)) : (res >> (9 - val));
-    return neg ? -res : res;
-}
-
-static av_always_inline int wp_log2(int32_t val)
-{
-    int bits;
-
-    if (!val)
-        return 0;
-    if (val == 1)
-        return 256;
-    val += val >> 9;
-    bits = av_log2(val) + 1;
-    if (bits < 9)
-        return (bits << 8) + wp_log2_table[(val << (9 - bits)) & 0xFF];
-    else
-        return (bits << 8) + wp_log2_table[(val >> (bits - 9)) & 0xFF];
-}
-
-#define LEVEL_DECAY(a)  ((a + 0x80) >> 8)
-
-// macros for manipulating median values
-#define GET_MED(n) ((c->median[n] >> 4) + 1)
-#define DEC_MED(n) c->median[n] -= ((c->median[n] + (128 >> n) - 2) / (128 >> n)) * 2
-#define INC_MED(n) c->median[n] += ((c->median[n] + (128 >> n)    ) / (128 >> n)) * 5
-
-// macros for applying weight
-#define UPDATE_WEIGHT_CLIP(weight, delta, samples, in) \
-    if (samples && in) { \
-        if ((samples ^ in) < 0) { \
-            weight -= delta; \
-            if (weight < -1024) \
-                weight = -1024; \
-        } else { \
-            weight += delta; \
-            if (weight > 1024) \
-                weight = 1024; \
-        } \
-    }
+#define LEVEL_DECAY(a)  (((a) + 0x80) >> 8)
 
-static av_always_inline int get_tail(BitstreamContext *bc, int k)
+static av_always_inline unsigned get_tail(GetBitContext *gb, int k)
 {
     int p, e, res;
 
@@ -248,17 +93,19 @@ static av_always_inline int get_tail(BitstreamContext *bc, int k)
         return 0;
     p   = av_log2(k);
     e   = (1 << (p + 1)) - k - 1;
-    res = bitstream_read(bc, p);
+    res = get_bitsz(gb, p);
     if (res >= e)
-        res = (res << 1) - e + bitstream_read_bit(bc);
+        res = (res << 1) - e + get_bits1(gb);
     return res;
 }
 
-static void update_error_limit(WavpackFrameContext *ctx)
+static int update_error_limit(WavpackFrameContext *ctx)
 {
     int i, br[2], sl[2];
 
     for (i = 0; i <= ctx->stereo_in; i++) {
+        if (ctx->ch[i].bitrate_acc > UINT_MAX - ctx->ch[i].bitrate_delta)
+            return AVERROR_INVALIDDATA;
         ctx->ch[i].bitrate_acc += ctx->ch[i].bitrate_delta;
         br[i]                   = ctx->ch[i].bitrate_acc >> 16;
         sl[i]                   = LEVEL_DECAY(ctx->ch[i].slow_level);
@@ -266,10 +113,10 @@ static void update_error_limit(WavpackFrameContext *ctx)
     if (ctx->stereo_in && ctx->hybrid_bitrate) {
         int balance = (sl[1] - sl[0] + br[1] + 1) >> 1;
         if (balance > br[0]) {
-            br[1] = br[0] << 1;
+            br[1] = br[0] * 2;
             br[0] = 0;
         } else if (-balance > br[0]) {
-            br[0] <<= 1;
+            br[0]  *= 2;
             br[1]   = 0;
         } else {
             br[1] = br[0] + balance;
@@ -286,9 +133,11 @@ static void update_error_limit(WavpackFrameContext *ctx)
             ctx->ch[i].error_limit = wp_exp2(br[i]);
         }
     }
+
+    return 0;
 }
 
-static int wv_get_value(WavpackFrameContext *ctx, BitstreamContext *bc,
+static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
                         int channel, int *last)
 {
     int t, t2;
@@ -306,13 +155,13 @@ static int wv_get_value(WavpackFrameContext *ctx, BitstreamContext *bc,
                 return 0;
             }
         } else {
-            t = get_unary_0_33(bc);
+            t = get_unary_0_33(gb);
             if (t >= 2) {
-                if (bitstream_bits_left(bc) < t - 1)
+                if (t >= 32 || get_bits_left(gb) < t - 1)
                     goto error;
-                t = bitstream_read(bc, t - 1) | (1 << (t - 1));
+                t = get_bits_long(gb, t - 1) | (1 << (t - 1));
             } else {
-                if (bitstream_bits_left(bc) < 0)
+                if (get_bits_left(gb) < 0)
                     goto error;
             }
             ctx->zeroes = t;
@@ -329,19 +178,19 @@ static int wv_get_value(WavpackFrameContext *ctx, BitstreamContext *bc,
         t         = 0;
         ctx->zero = 0;
     } else {
-        t = get_unary_0_33(bc);
-        if (bitstream_bits_left(bc) < 0)
+        t = get_unary_0_33(gb);
+        if (get_bits_left(gb) < 0)
             goto error;
         if (t == 16) {
-            t2 = get_unary_0_33(bc);
+            t2 = get_unary_0_33(gb);
             if (t2 < 2) {
-                if (bitstream_bits_left(bc) < 0)
+                if (get_bits_left(gb) < 0)
                     goto error;
                 t += t2;
             } else {
-                if (bitstream_bits_left(bc) < t2 - 1)
+                if (t2 >= 32 || get_bits_left(gb) < t2 - 1)
                     goto error;
-                t += bitstream_read(bc, t2 - 1) | (1 << (t2 - 1));
+                t += get_bits_long(gb, t2 - 1) | (1 << (t2 - 1));
             }
         }
 
@@ -355,8 +204,10 @@ static int wv_get_value(WavpackFrameContext *ctx, BitstreamContext *bc,
         ctx->zero = !ctx->one;
     }
 
-    if (ctx->hybrid && !channel)
-        update_error_limit(ctx);
+    if (ctx->hybrid && !channel) {
+        if (update_error_limit(ctx) < 0)
+            goto error;
+    }
 
     if (!t) {
         base = 0;
@@ -374,51 +225,59 @@ static int wv_get_value(WavpackFrameContext *ctx, BitstreamContext *bc,
         INC_MED(1);
         DEC_MED(2);
     } else {
-        base = GET_MED(0) + GET_MED(1) + GET_MED(2) * (t - 2);
+        base = GET_MED(0) + GET_MED(1) + GET_MED(2) * (t - 2U);
         add  = GET_MED(2) - 1;
         INC_MED(0);
         INC_MED(1);
         INC_MED(2);
     }
     if (!c->error_limit) {
-        ret = base + get_tail(bc, add);
-        if (bitstream_bits_left(bc) <= 0)
+        if (add >= 0x2000000U) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "k %d is too large\n", add);
+            goto error;
+        }
+        ret = base + get_tail(gb, add);
+        if (get_bits_left(gb) <= 0)
             goto error;
     } else {
-        int mid = (base * 2 + add + 1) >> 1;
+        int mid = (base * 2U + add + 1) >> 1;
         while (add > c->error_limit) {
-            if (bitstream_bits_left(bc) <= 0)
+            if (get_bits_left(gb) <= 0)
                 goto error;
-            if (bitstream_read_bit(bc)) {
-                add -= (mid - base);
+            if (get_bits1(gb)) {
+                add -= (mid - (unsigned)base);
                 base = mid;
             } else
-                add = mid - base - 1;
-            mid = (base * 2 + add + 1) >> 1;
+                add = mid - (unsigned)base - 1;
+            mid = (base * 2U + add + 1) >> 1;
         }
         ret = mid;
     }
-    sign = bitstream_read_bit(bc);
+    sign = get_bits1(gb);
     if (ctx->hybrid_bitrate)
         c->slow_level += wp_log2(ret) - LEVEL_DECAY(c->slow_level);
     return sign ? ~ret : ret;
 
 error:
+    ret = get_bits_left(gb);
+    if (ret <= 0) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Too few bits (%d) left\n", ret);
+    }
     *last = 1;
     return 0;
 }
 
 static inline int wv_get_value_integer(WavpackFrameContext *s, uint32_t *crc,
-                                       int S)
+                                       unsigned S)
 {
-    int bit;
+    unsigned bit;
 
     if (s->extra_bits) {
-        S <<= s->extra_bits;
+        S *= 1 << s->extra_bits;
 
         if (s->got_extra_bits &&
-            bitstream_bits_left(&s->bc_extra_bits) >= s->extra_bits) {
-            S   |= bitstream_read(&s->bc_extra_bits, s->extra_bits);
+            get_bits_left(&s->gb_extra_bits) >= s->extra_bits) {
+            S   |= get_bits_long(&s->gb_extra_bits, s->extra_bits);
             *crc = *crc * 9 + (S & 0xffff) * 3 + ((unsigned)S >> 16);
         }
     }
@@ -444,20 +303,20 @@ static float wv_get_value_float(WavpackFrameContext *s, uint32_t *crc, int S)
 
     if (s->got_extra_bits) {
         const int max_bits  = 1 + 23 + 8 + 1;
-        const int left_bits = bitstream_bits_left(&s->bc_extra_bits);
+        const int left_bits = get_bits_left(&s->gb_extra_bits);
 
         if (left_bits + 8 * AV_INPUT_BUFFER_PADDING_SIZE < max_bits)
             return 0.0;
     }
 
     if (S) {
-        S  <<= s->float_shift;
+        S  *= 1U << s->float_shift;
         sign = S < 0;
         if (sign)
-            S = -S;
-        if (S >= 0x1000000) {
-            if (s->got_extra_bits && bitstream_read_bit(&s->bc_extra_bits))
-                S = bitstream_read(&s->bc_extra_bits, 23);
+            S = -(unsigned)S;
+        if (S >= 0x1000000U) {
+            if (s->got_extra_bits && get_bits1(&s->gb_extra_bits))
+                S = get_bits(&s->gb_extra_bits, 23);
             else
                 S = 0;
             exp = 255;
@@ -473,11 +332,11 @@ static float wv_get_value_float(WavpackFrameContext *s, uint32_t *crc, int S)
                 if ((s->float_flag & WV_FLT_SHIFT_ONES) ||
                     (s->got_extra_bits &&
                      (s->float_flag & WV_FLT_SHIFT_SAME) &&
-                     bitstream_read_bit(&s->bc_extra_bits))) {
+                     get_bits1(&s->gb_extra_bits))) {
                     S |= (1 << shift) - 1;
                 } else if (s->got_extra_bits &&
                            (s->float_flag & WV_FLT_SHIFT_SENT)) {
-                    S |= bitstream_read(&s->bc_extra_bits, shift);
+                    S |= get_bits(&s->gb_extra_bits, shift);
                 }
             }
         } else {
@@ -488,14 +347,14 @@ static float wv_get_value_float(WavpackFrameContext *s, uint32_t *crc, int S)
         sign = 0;
         exp  = 0;
         if (s->got_extra_bits && (s->float_flag & WV_FLT_ZERO_SENT)) {
-            if (bitstream_read_bit(&s->bc_extra_bits)) {
-                S = bitstream_read(&s->bc_extra_bits, 23);
+            if (get_bits1(&s->gb_extra_bits)) {
+                S = get_bits(&s->gb_extra_bits, 23);
                 if (s->float_max_exp >= 25)
-                    exp = bitstream_read(&s->bc_extra_bits, 8);
-                sign = bitstream_read_bit(&s->bc_extra_bits);
+                    exp = get_bits(&s->gb_extra_bits, 8);
+                sign = get_bits1(&s->gb_extra_bits);
             } else {
                 if (s->float_flag & WV_FLT_ZERO_SIGN)
-                    sign = bitstream_read_bit(&s->bc_extra_bits);
+                    sign = get_bits1(&s->gb_extra_bits);
             }
         }
     }
@@ -527,7 +386,7 @@ static inline int wv_check_crc(WavpackFrameContext *s, uint32_t crc,
     return 0;
 }
 
-static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
+static inline int wv_unpack_stereo(WavpackFrameContext *s, GetBitContext *gb,
                                    void *dst_l, void *dst_r, const int type)
 {
     int i, j, count = 0;
@@ -545,10 +404,10 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
 
     s->one = s->zero = s->zeroes = 0;
     do {
-        L = wv_get_value(s, bc, 0, &last);
+        L = wv_get_value(s, gb, 0, &last);
         if (last)
             break;
-        R = wv_get_value(s, bc, 1, &last);
+        R = wv_get_value(s, gb, 1, &last);
         if (last)
             break;
         for (i = 0; i < s->terms; i++) {
@@ -556,11 +415,11 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
             if (t > 0) {
                 if (t > 8) {
                     if (t & 1) {
-                        A = 2 * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1];
-                        B = 2 * s->decorr[i].samplesB[0] - s->decorr[i].samplesB[1];
+                        A = 2U * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1];
+                        B = 2U * s->decorr[i].samplesB[0] - s->decorr[i].samplesB[1];
                     } else {
-                        A = (3 * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1]) >> 1;
-                        B = (3 * s->decorr[i].samplesB[0] - s->decorr[i].samplesB[1]) >> 1;
+                        A = (int)(3U * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1]) >> 1;
+                        B = (int)(3U * s->decorr[i].samplesB[0] - s->decorr[i].samplesB[1]) >> 1;
                     }
                     s->decorr[i].samplesA[1] = s->decorr[i].samplesA[0];
                     s->decorr[i].samplesB[1] = s->decorr[i].samplesB[0];
@@ -574,8 +433,8 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
                     L2 = L + ((s->decorr[i].weightA * (int64_t)A + 512) >> 10);
                     R2 = R + ((s->decorr[i].weightB * (int64_t)B + 512) >> 10);
                 } else {
-                    L2 = L + ((s->decorr[i].weightA * A + 512) >> 10);
-                    R2 = R + ((s->decorr[i].weightB * B + 512) >> 10);
+                    L2 = L + (unsigned)((int)(s->decorr[i].weightA * (unsigned)A + 512) >> 10);
+                    R2 = R + (unsigned)((int)(s->decorr[i].weightB * (unsigned)B + 512) >> 10);
                 }
                 if (A && L)
                     s->decorr[i].weightA -= ((((L ^ A) >> 30) & 2) - 1) * s->decorr[i].delta;
@@ -587,13 +446,13 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
                 if (type != AV_SAMPLE_FMT_S16P)
                     L2 = L + ((s->decorr[i].weightA * (int64_t)s->decorr[i].samplesA[0] + 512) >> 10);
                 else
-                    L2 = L + ((s->decorr[i].weightA * s->decorr[i].samplesA[0] + 512) >> 10);
+                    L2 = L + (unsigned)((int)(s->decorr[i].weightA * (unsigned)s->decorr[i].samplesA[0] + 512) >> 10);
                 UPDATE_WEIGHT_CLIP(s->decorr[i].weightA, s->decorr[i].delta, s->decorr[i].samplesA[0], L);
                 L = L2;
                 if (type != AV_SAMPLE_FMT_S16P)
                     R2 = R + ((s->decorr[i].weightB * (int64_t)L2 + 512) >> 10);
                 else
-                    R2 = R + ((s->decorr[i].weightB * L2 + 512) >> 10);
+                    R2 = R + (unsigned)((int)(s->decorr[i].weightB * (unsigned)L2 + 512) >> 10);
                 UPDATE_WEIGHT_CLIP(s->decorr[i].weightB, s->decorr[i].delta, L2, R);
                 R                        = R2;
                 s->decorr[i].samplesA[0] = R;
@@ -601,7 +460,7 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
                 if (type != AV_SAMPLE_FMT_S16P)
                     R2 = R + ((s->decorr[i].weightB * (int64_t)s->decorr[i].samplesB[0] + 512) >> 10);
                 else
-                    R2 = R + ((s->decorr[i].weightB * s->decorr[i].samplesB[0] + 512) >> 10);
+                    R2 = R + (unsigned)((int)(s->decorr[i].weightB * (unsigned)s->decorr[i].samplesB[0] + 512) >> 10);
                 UPDATE_WEIGHT_CLIP(s->decorr[i].weightB, s->decorr[i].delta, s->decorr[i].samplesB[0], R);
                 R = R2;
 
@@ -613,15 +472,23 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
                 if (type != AV_SAMPLE_FMT_S16P)
                     L2 = L + ((s->decorr[i].weightA * (int64_t)R2 + 512) >> 10);
                 else
-                    L2 = L + ((s->decorr[i].weightA * R2 + 512) >> 10);
+                    L2 = L + (unsigned)((int)(s->decorr[i].weightA * (unsigned)R2 + 512) >> 10);
                 UPDATE_WEIGHT_CLIP(s->decorr[i].weightA, s->decorr[i].delta, R2, L);
                 L                        = L2;
                 s->decorr[i].samplesB[0] = L;
             }
         }
+
+        if (type == AV_SAMPLE_FMT_S16P) {
+            if (FFABS((int64_t)L) + FFABS((int64_t)R) > (1<<19)) {
+                av_log(s->avctx, AV_LOG_ERROR, "sample %d %d too large\n", L, R);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+
         pos = (pos + 1) & 7;
         if (s->joint)
-            L += (R -= (L >> 1));
+            L += (unsigned)(R -= (unsigned)(L >> 1));
         crc = (crc * 3 + L) * 3 + R;
 
         if (type == AV_SAMPLE_FMT_FLTP) {
@@ -638,6 +505,13 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
     } while (!last && count < s->samples);
 
     wv_reset_saved_context(s);
+
+    if (last && count < s->samples) {
+        int size = av_get_bytes_per_sample(type);
+        memset((uint8_t*)dst_l + count*size, 0, (s->samples-count)*size);
+        memset((uint8_t*)dst_r + count*size, 0, (s->samples-count)*size);
+    }
+
     if ((s->avctx->err_recognition & AV_EF_CRCCHECK) &&
         wv_check_crc(s, crc, crc_extra_bits))
         return AVERROR_INVALIDDATA;
@@ -645,7 +519,7 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, BitstreamContext *bc,
     return 0;
 }
 
-static inline int wv_unpack_mono(WavpackFrameContext *s, BitstreamContext *bc,
+static inline int wv_unpack_mono(WavpackFrameContext *s, GetBitContext *gb,
                                  void *dst, const int type)
 {
     int i, j, count = 0;
@@ -660,7 +534,7 @@ static inline int wv_unpack_mono(WavpackFrameContext *s, BitstreamContext *bc,
 
     s->one = s->zero = s->zeroes = 0;
     do {
-        T = wv_get_value(s, bc, 0, &last);
+        T = wv_get_value(s, gb, 0, &last);
         S = 0;
         if (last)
             break;
@@ -668,9 +542,9 @@ static inline int wv_unpack_mono(WavpackFrameContext *s, BitstreamContext *bc,
             t = s->decorr[i].value;
             if (t > 8) {
                 if (t & 1)
-                    A =  2 * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1];
+                    A =  2U * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1];
                 else
-                    A = (3 * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1]) >> 1;
+                    A = (int)(3U * s->decorr[i].samplesA[0] - s->decorr[i].samplesA[1]) >> 1;
                 s->decorr[i].samplesA[1] = s->decorr[i].samplesA[0];
                 j                        = 0;
             } else {
@@ -680,7 +554,7 @@ static inline int wv_unpack_mono(WavpackFrameContext *s, BitstreamContext *bc,
             if (type != AV_SAMPLE_FMT_S16P)
                 S = T + ((s->decorr[i].weightA * (int64_t)A + 512) >> 10);
             else
-                S = T + ((s->decorr[i].weightA * A + 512) >> 10);
+                S = T + (unsigned)((int)(s->decorr[i].weightA * (unsigned)A + 512) >> 10);
             if (A && T)
                 s->decorr[i].weightA -= ((((T ^ A) >> 30) & 2) - 1) * s->decorr[i].delta;
             s->decorr[i].samplesA[j] = T = S;
@@ -699,6 +573,12 @@ static inline int wv_unpack_mono(WavpackFrameContext *s, BitstreamContext *bc,
     } while (!last && count < s->samples);
 
     wv_reset_saved_context(s);
+
+    if (last && count < s->samples) {
+        int size = av_get_bytes_per_sample(type);
+        memset((uint8_t*)dst + count*size, 0, (s->samples-count)*size);
+    }
+
     if (s->avctx->err_recognition & AV_EF_CRCCHECK) {
         int ret = wv_check_crc(s, crc, crc_extra_bits);
         if (ret < 0 && s->avctx->err_recognition & AV_EF_EXPLODE)
@@ -723,6 +603,15 @@ static av_cold int wv_alloc_frame_context(WavpackContext *c)
     return 0;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    WavpackContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return 0;
+}
+#endif
+
 static av_cold int wavpack_decode_init(AVCodecContext *avctx)
 {
     WavpackContext *s = avctx->priv_data;
@@ -750,9 +639,10 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                                 AVFrame *frame, const uint8_t *buf, int buf_size)
 {
     WavpackContext *wc = avctx->priv_data;
+    ThreadFrame tframe = { .f = frame };
     WavpackFrameContext *s;
     GetByteContext gb;
-    void *samples_l, *samples_r;
+    void *samples_l = NULL, *samples_r = NULL;
     int ret;
     int got_terms   = 0, got_weights = 0, got_samples = 0,
         got_entropy = 0, got_bs      = 0, got_float   = 0, got_hybrid = 0;
@@ -797,8 +687,11 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
     s->hybrid         =   s->frame_flags & WV_HYBRID_MODE;
     s->hybrid_bitrate =   s->frame_flags & WV_HYBRID_BITRATE;
     s->post_shift     = bpp * 8 - orig_bpp + ((s->frame_flags >> 13) & 0x1f);
+    if (s->post_shift < 0 || s->post_shift > 31) {
+        return AVERROR_INVALIDDATA;
+    }
     s->hybrid_maxclip =  ((1LL << (orig_bpp - 1)) - 1);
-    s->hybrid_minclip = ((-1LL << (orig_bpp - 1)));
+    s->hybrid_minclip = ((-1UL << (orig_bpp - 1)));
     s->CRC            = bytestream2_get_le32(&gb);
 
     // parse metadata blocks
@@ -852,13 +745,13 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
             }
             for (i = 0; i < weights; i++) {
                 t = (int8_t)bytestream2_get_byte(&gb);
-                s->decorr[s->terms - i - 1].weightA = t << 3;
+                s->decorr[s->terms - i - 1].weightA = t * (1 << 3);
                 if (s->decorr[s->terms - i - 1].weightA > 0)
                     s->decorr[s->terms - i - 1].weightA +=
                         (s->decorr[s->terms - i - 1].weightA + 64) >> 7;
                 if (s->stereo_in) {
                     t = (int8_t)bytestream2_get_byte(&gb);
-                    s->decorr[s->terms - i - 1].weightB = t << 3;
+                    s->decorr[s->terms - i - 1].weightB = t * (1 << 3);
                     if (s->decorr[s->terms - i - 1].weightB > 0)
                         s->decorr[s->terms - i - 1].weightB +=
                             (s->decorr[s->terms - i - 1].weightB + 64) >> 7;
@@ -910,7 +803,7 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         case WP_ID_ENTROPY:
             if (size != 6 * (s->stereo_in + 1)) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Entropy vars size should be %i, got %i",
+                       "Entropy vars size should be %i, got %i.\n",
                        6 * (s->stereo_in + 1), size);
                 bytestream2_skip(&gb, ssize);
                 continue;
@@ -953,7 +846,11 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 continue;
             }
             bytestream2_get_buffer(&gb, val, 4);
-            if (val[0]) {
+            if (val[0] > 30) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Invalid INT32INFO, extra_bits = %d (> 30)\n", val[0]);
+                continue;
+            } else if (val[0]) {
                 s->extra_bits = val[0];
             } else if (val[1]) {
                 s->shift = val[1];
@@ -964,6 +861,12 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 s->and   = 1;
                 s->shift = val[3];
             }
+            if (s->shift > 31) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Invalid INT32INFO, shift = %d (> 31)\n", s->shift);
+                s->and = s->or = s->shift = 0;
+                continue;
+            }
             /* original WavPack decoder forces 32-bit lossy sound to be treated
              * as 24-bit one in order to have proper clipping */
             if (s->hybrid && bpp == 4 && s->post_shift < 8 && s->shift > 8) {
@@ -984,13 +887,20 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
             s->float_flag    = bytestream2_get_byte(&gb);
             s->float_shift   = bytestream2_get_byte(&gb);
             s->float_max_exp = bytestream2_get_byte(&gb);
+            if (s->float_shift > 31) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Invalid FLOATINFO, shift = %d (> 31)\n", s->float_shift);
+                s->float_shift = 0;
+                continue;
+            }
             got_float        = 1;
             bytestream2_skip(&gb, 1);
             break;
         case WP_ID_DATA:
             s->sc.offset = bytestream2_tell(&gb);
             s->sc.size   = size * 8;
-            bitstream_init8(&s->bc, gb.buffer, size);
+            if ((ret = init_get_bits8(&s->gb, gb.buffer, size)) < 0)
+                return ret;
             s->data_size = size * 8;
             bytestream2_skip(&gb, size);
             got_bs       = 1;
@@ -1004,8 +914,9 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
             }
             s->extra_sc.offset = bytestream2_tell(&gb);
             s->extra_sc.size   = size * 8;
-            bitstream_init8(&s->bc_extra_bits, gb.buffer, size);
-            s->crc_extra_bits  = bitstream_read(&s->bc_extra_bits, 32);
+            if ((ret = init_get_bits8(&s->gb_extra_bits, gb.buffer, size)) < 0)
+                return ret;
+            s->crc_extra_bits  = get_bits_long(&s->gb_extra_bits, 32);
             bytestream2_skip(&gb, size);
             s->got_extra_bits  = 1;
             break;
@@ -1027,12 +938,25 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 chmask = bytestream2_get_le24(&gb);
                 break;
             case 3:
-                chmask = bytestream2_get_le32(&gb);;
+                chmask = bytestream2_get_le32(&gb);
+                break;
+            case 4:
+                size = bytestream2_get_byte(&gb);
+                chan  |= (bytestream2_get_byte(&gb) & 0xF) << 8;
+                chan  += 1;
+                if (avctx->channels != chan)
+                    av_log(avctx, AV_LOG_WARNING, "%i channels signalled"
+                           " instead of %i.\n", chan, avctx->channels);
+                chmask = bytestream2_get_le24(&gb);
                 break;
             case 5:
-                bytestream2_skip(&gb, 1);
+                size = bytestream2_get_byte(&gb);
                 chan  |= (bytestream2_get_byte(&gb) & 0xF) << 8;
-                chmask = bytestream2_get_le16(&gb);
+                chan  += 1;
+                if (avctx->channels != chan)
+                    av_log(avctx, AV_LOG_WARNING, "%i channels signalled"
+                           " instead of %i.\n", chan, avctx->channels);
+                chmask = bytestream2_get_le32(&gb);
                 break;
             default:
                 av_log(avctx, AV_LOG_ERROR, "Invalid channel info size %d\n",
@@ -1084,7 +1008,7 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         return AVERROR_INVALIDDATA;
     }
     if (s->got_extra_bits && avctx->sample_fmt != AV_SAMPLE_FMT_FLTP) {
-        const int size   = bitstream_bits_left(&s->bc_extra_bits);
+        const int size   = get_bits_left(&s->gb_extra_bits);
         const int wanted = s->samples * s->extra_bits << s->stereo_in;
         if (size < wanted) {
             av_log(avctx, AV_LOG_ERROR, "Too small EXTRABITS\n");
@@ -1115,16 +1039,15 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         }
 
         /* get output buffer */
-        frame->nb_samples = s->samples;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        frame->nb_samples = s->samples + 1;
+        if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
             return ret;
-        }
+        frame->nb_samples = s->samples;
     }
 
     if (wc->ch_offset + s->stereo >= avctx->channels) {
         av_log(avctx, AV_LOG_WARNING, "Too many channels coded in a packet.\n");
-        return (avctx->err_recognition & AV_EF_EXPLODE) ? AVERROR_INVALIDDATA : 0;
+        return ((avctx->err_recognition & AV_EF_EXPLODE) || !wc->ch_offset) ? AVERROR_INVALIDDATA : 0;
     }
 
     samples_l = frame->extended_data[wc->ch_offset];
@@ -1134,11 +1057,11 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
     wc->ch_offset += 1 + s->stereo;
 
     if (s->stereo_in) {
-        ret = wv_unpack_stereo(s, &s->bc, samples_l, samples_r, avctx->sample_fmt);
+        ret = wv_unpack_stereo(s, &s->gb, samples_l, samples_r, avctx->sample_fmt);
         if (ret < 0)
             return ret;
     } else {
-        ret = wv_unpack_mono(s, &s->bc, samples_l, avctx->sample_fmt);
+        ret = wv_unpack_mono(s, &s->gb, samples_l, avctx->sample_fmt);
         if (ret < 0)
             return ret;
 
@@ -1176,7 +1099,7 @@ static int wavpack_decode_frame(AVCodecContext *avctx, void *data,
     /* determine number of samples */
     s->samples  = AV_RL32(buf + 20);
     frame_flags = AV_RL32(buf + 24);
-    if (s->samples <= 0) {
+    if (s->samples <= 0 || s->samples > WV_MAX_SAMPLES) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of samples: %d\n",
                s->samples);
         return AVERROR_INVALIDDATA;
@@ -1234,5 +1157,6 @@ AVCodec ff_wavpack_decoder = {
     .close          = wavpack_decode_end,
     .decode         = wavpack_decode_frame,
     .flush          = wavpack_decode_flush,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/wavpack.h b/libavcodec/wavpack.h
new file mode 100644
index 0000000..6caad03
--- /dev/null
+++ b/libavcodec/wavpack.h
@@ -0,0 +1,196 @@
+/*
+ * WavPack decoder/encoder common code
+ * Copyright (c) 2006,2011 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_WAVPACK_H
+#define AVCODEC_WAVPACK_H
+
+#include "libavutil/common.h"
+
+#define MAX_TERMS      16
+#define MAX_TERM        8
+
+#define WV_HEADER_SIZE    32
+
+#define WV_MONO           0x00000004
+#define WV_JOINT_STEREO   0x00000010
+#define WV_CROSS_DECORR   0x00000020
+#define WV_FLOAT_DATA     0x00000080
+#define WV_INT32_DATA     0x00000100
+#define WV_FALSE_STEREO   0x40000000
+
+#define WV_HYBRID_MODE    0x00000008
+#define WV_HYBRID_SHAPE   0x00000008
+#define WV_HYBRID_BITRATE 0x00000200
+#define WV_HYBRID_BALANCE 0x00000400
+#define WV_INITIAL_BLOCK  0x00000800
+#define WV_FINAL_BLOCK    0x00001000
+
+#define WV_MONO_DATA    (WV_MONO | WV_FALSE_STEREO)
+
+#define WV_SINGLE_BLOCK (WV_INITIAL_BLOCK | WV_FINAL_BLOCK)
+
+#define WV_FLT_SHIFT_ONES 0x01
+#define WV_FLT_SHIFT_SAME 0x02
+#define WV_FLT_SHIFT_SENT 0x04
+#define WV_FLT_ZERO_SENT  0x08
+#define WV_FLT_ZERO_SIGN  0x10
+
+#define WV_MAX_SAMPLES    150000
+
+enum WP_ID_Flags {
+    WP_IDF_MASK   = 0x3F,
+    WP_IDF_IGNORE = 0x20,
+    WP_IDF_ODD    = 0x40,
+    WP_IDF_LONG   = 0x80
+};
+
+enum WP_ID {
+    WP_ID_DUMMY = 0,
+    WP_ID_ENCINFO,
+    WP_ID_DECTERMS,
+    WP_ID_DECWEIGHTS,
+    WP_ID_DECSAMPLES,
+    WP_ID_ENTROPY,
+    WP_ID_HYBRID,
+    WP_ID_SHAPING,
+    WP_ID_FLOATINFO,
+    WP_ID_INT32INFO,
+    WP_ID_DATA,
+    WP_ID_CORR,
+    WP_ID_EXTRABITS,
+    WP_ID_CHANINFO,
+    WP_ID_SAMPLE_RATE = 0x27,
+};
+
+typedef struct Decorr {
+    int delta;
+    int value;
+    int weightA;
+    int weightB;
+    int samplesA[MAX_TERM];
+    int samplesB[MAX_TERM];
+    int sumA;
+    int sumB;
+} Decorr;
+
+typedef struct WvChannel {
+    int median[3];
+    int slow_level, error_limit;
+    unsigned bitrate_acc, bitrate_delta;
+} WvChannel;
+
+// macros for manipulating median values
+#define GET_MED(n) ((c->median[n] >> 4) + 1)
+#define DEC_MED(n) c->median[n] -= ((int)(c->median[n] + (128U >> (n)) - 2) / (128 >> (n))) * 2U
+#define INC_MED(n) c->median[n] += ((int)(c->median[n] + (128U >> (n))    ) / (128 >> (n))) * 5U
+
+// macros for applying weight
+#define UPDATE_WEIGHT_CLIP(weight, delta, samples, in) \
+    if ((samples) && (in)) { \
+        if (((samples) ^ (in)) < 0) { \
+            (weight) -= (delta); \
+            if ((weight) < -1024) \
+                (weight) = -1024; \
+        } else { \
+            (weight) += (delta); \
+            if ((weight) > 1024) \
+                (weight) = 1024; \
+        } \
+    }
+
+static const int wv_rates[16] = {
+     6000,  8000,  9600, 11025, 12000, 16000,  22050, 24000,
+    32000, 44100, 48000, 64000, 88200, 96000, 192000,     0
+};
+
+// exponent table copied from WavPack source
+static const uint8_t wp_exp2_table[256] = {
+    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
+    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
+    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
+    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
+    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
+    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
+    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
+    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
+    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
+    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
+    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
+    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
+    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
+};
+
+static const uint8_t wp_log2_table [] = {
+    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
+    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
+    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
+    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
+    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
+    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
+    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
+    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
+    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
+    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
+    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
+    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
+    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
+    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
+};
+
+static av_always_inline int wp_exp2(int16_t val)
+{
+    int res, neg = 0;
+
+    if (val < 0) {
+        val = -val;
+        neg = 1;
+    }
+
+    res   = wp_exp2_table[val & 0xFF] | 0x100;
+    val >>= 8;
+    if (val > 31U)
+        return INT_MIN;
+    res   = (val > 9) ? (res << (val - 9)) : (res >> (9 - val));
+    return neg ? -res : res;
+}
+
+static av_always_inline int wp_log2(uint32_t val)
+{
+    int bits;
+
+    if (!val)
+        return 0;
+    if (val == 1)
+        return 256;
+    val += val >> 9;
+    bits = av_log2(val) + 1;
+    if (bits < 9)
+        return (bits << 8) + wp_log2_table[(val << (9 - bits)) & 0xFF];
+    else
+        return (bits << 8) + wp_log2_table[(val >> (bits - 9)) & 0xFF];
+}
+
+#endif /* AVCODEC_WAVPACK_H */
diff --git a/libavcodec/wavpackenc.c b/libavcodec/wavpackenc.c
new file mode 100644
index 0000000..979b921
--- /dev/null
+++ b/libavcodec/wavpackenc.c
@@ -0,0 +1,2990 @@
+/*
+ * WavPack lossless audio encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_WRITER_LE
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+#include "wavpackenc.h"
+#include "wavpack.h"
+
+#define UPDATE_WEIGHT(weight, delta, source, result) \
+    if ((source) && (result)) { \
+        int32_t s = (int32_t) ((source) ^ (result)) >> 31; \
+        weight = ((delta) ^ s) + ((weight) - s); \
+    }
+
+#define APPLY_WEIGHT_F(weight, sample) ((((((sample) & 0xffff) * (weight)) >> 9) + \
+    ((((sample) & ~0xffff) >> 9) * (weight)) + 1) >> 1)
+
+#define APPLY_WEIGHT_I(weight, sample) (((weight) * (sample) + 512) >> 10)
+
+#define APPLY_WEIGHT(weight, sample) ((sample) != (short) (sample) ? \
+    APPLY_WEIGHT_F(weight, sample) : APPLY_WEIGHT_I (weight, sample))
+
+#define CLEAR(destin) memset(&destin, 0, sizeof(destin));
+
+#define SHIFT_LSB       13
+#define SHIFT_MASK      (0x1FU << SHIFT_LSB)
+
+#define MAG_LSB         18
+#define MAG_MASK        (0x1FU << MAG_LSB)
+
+#define SRATE_LSB       23
+#define SRATE_MASK      (0xFU << SRATE_LSB)
+
+#define EXTRA_TRY_DELTAS     1
+#define EXTRA_ADJUST_DELTAS  2
+#define EXTRA_SORT_FIRST     4
+#define EXTRA_BRANCHES       8
+#define EXTRA_SORT_LAST     16
+
+typedef struct WavPackExtraInfo {
+    struct Decorr dps[MAX_TERMS];
+    int nterms, log_limit, gt16bit;
+    uint32_t best_bits;
+} WavPackExtraInfo;
+
+typedef struct WavPackWords {
+    int pend_data, holding_one, zeros_acc;
+    int holding_zero, pend_count;
+    WvChannel c[2];
+} WavPackWords;
+
+typedef struct WavPackEncodeContext {
+    AVClass *class;
+    AVCodecContext *avctx;
+    PutBitContext pb;
+    int block_samples;
+    int buffer_size;
+    int sample_index;
+    int stereo, stereo_in;
+    int ch_offset;
+
+    int32_t *samples[2];
+    int samples_size[2];
+
+    int32_t *sampleptrs[MAX_TERMS+2][2];
+    int sampleptrs_size[MAX_TERMS+2][2];
+
+    int32_t *temp_buffer[2][2];
+    int temp_buffer_size[2][2];
+
+    int32_t *best_buffer[2];
+    int best_buffer_size[2];
+
+    int32_t *js_left, *js_right;
+    int js_left_size, js_right_size;
+
+    int32_t *orig_l, *orig_r;
+    int orig_l_size, orig_r_size;
+
+    unsigned extra_flags;
+    int optimize_mono;
+    int decorr_filter;
+    int joint;
+    int num_branches;
+
+    uint32_t flags;
+    uint32_t crc_x;
+    WavPackWords w;
+
+    uint8_t int32_sent_bits, int32_zeros, int32_ones, int32_dups;
+    uint8_t float_flags, float_shift, float_max_exp, max_exp;
+    int32_t shifted_ones, shifted_zeros, shifted_both;
+    int32_t false_zeros, neg_zeros, ordata;
+
+    int num_terms, shift, joint_stereo, false_stereo;
+    int num_decorrs, num_passes, best_decorr, mask_decorr;
+    struct Decorr decorr_passes[MAX_TERMS];
+    const WavPackDecorrSpec *decorr_specs;
+    float delta_decay;
+} WavPackEncodeContext;
+
+static av_cold int wavpack_encode_init(AVCodecContext *avctx)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+
+    s->avctx = avctx;
+
+    if (avctx->channels > 255) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid channel count: %d\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+
+    if (!avctx->frame_size) {
+        int block_samples;
+        if (!(avctx->sample_rate & 1))
+            block_samples = avctx->sample_rate / 2;
+        else
+            block_samples = avctx->sample_rate;
+
+        while (block_samples * avctx->channels > WV_MAX_SAMPLES)
+            block_samples /= 2;
+
+        while (block_samples * avctx->channels < 40000)
+            block_samples *= 2;
+        avctx->frame_size = block_samples;
+    } else if (avctx->frame_size && (avctx->frame_size < 128 ||
+                              avctx->frame_size > WV_MAX_SAMPLES)) {
+        av_log(avctx, AV_LOG_ERROR, "invalid block size: %d\n", avctx->frame_size);
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->compression_level != FF_COMPRESSION_DEFAULT) {
+        if (avctx->compression_level >= 3) {
+            s->decorr_filter = 3;
+            s->num_passes = 9;
+            if      (avctx->compression_level >= 8) {
+                s->num_branches = 4;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_SORT_LAST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 7) {
+                s->num_branches = 3;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 6) {
+                s->num_branches = 2;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 5) {
+                s->num_branches = 1;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 4) {
+                s->num_branches = 1;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_BRANCHES;
+            }
+        } else if (avctx->compression_level == 2) {
+            s->decorr_filter = 2;
+            s->num_passes = 4;
+        } else if (avctx->compression_level == 1) {
+            s->decorr_filter = 1;
+            s->num_passes = 2;
+        } else if (avctx->compression_level < 1) {
+            s->decorr_filter = 0;
+            s->num_passes = 0;
+        }
+    }
+
+    s->num_decorrs = decorr_filter_sizes[s->decorr_filter];
+    s->decorr_specs = decorr_filters[s->decorr_filter];
+
+    s->delta_decay = 2.0;
+
+    return 0;
+}
+
+static void shift_mono(int32_t *samples, int nb_samples, int shift)
+{
+    int i;
+    for (i = 0; i < nb_samples; i++)
+        samples[i] >>= shift;
+}
+
+static void shift_stereo(int32_t *left, int32_t *right,
+                         int nb_samples, int shift)
+{
+    int i;
+    for (i = 0; i < nb_samples; i++) {
+        left [i] >>= shift;
+        right[i] >>= shift;
+    }
+}
+
+#define FLOAT_SHIFT_ONES 1
+#define FLOAT_SHIFT_SAME 2
+#define FLOAT_SHIFT_SENT 4
+#define FLOAT_ZEROS_SENT 8
+#define FLOAT_NEG_ZEROS  0x10
+#define FLOAT_EXCEPTIONS 0x20
+
+#define get_mantissa(f)     ((f) & 0x7fffff)
+#define get_exponent(f)     (((f) >> 23) & 0xff)
+#define get_sign(f)         (((f) >> 31) & 0x1)
+
+static void process_float(WavPackEncodeContext *s, int32_t *sample)
+{
+    int32_t shift_count, value, f = *sample;
+
+    if (get_exponent(f) == 255) {
+        s->float_flags |= FLOAT_EXCEPTIONS;
+        value = 0x1000000;
+        shift_count = 0;
+    } else if (get_exponent(f)) {
+        shift_count = s->max_exp - get_exponent(f);
+        value = 0x800000 + get_mantissa(f);
+    } else {
+        shift_count = s->max_exp ? s->max_exp - 1 : 0;
+        value = get_mantissa(f);
+    }
+
+    if (shift_count < 25)
+        value >>= shift_count;
+    else
+        value = 0;
+
+    if (!value) {
+        if (get_exponent(f) || get_mantissa(f))
+            s->false_zeros++;
+        else if (get_sign(f))
+            s->neg_zeros++;
+    } else if (shift_count) {
+        int32_t mask = (1 << shift_count) - 1;
+
+        if (!(get_mantissa(f) & mask))
+            s->shifted_zeros++;
+        else if ((get_mantissa(f) & mask) == mask)
+            s->shifted_ones++;
+        else
+            s->shifted_both++;
+    }
+
+    s->ordata |= value;
+    *sample = get_sign(f) ? -value : value;
+}
+
+static int scan_float(WavPackEncodeContext *s,
+                      int32_t *samples_l, int32_t *samples_r,
+                      int nb_samples)
+{
+    uint32_t crc = 0xffffffffu;
+    int i;
+
+    s->shifted_ones = s->shifted_zeros = s->shifted_both = s->ordata = 0;
+    s->float_shift = s->float_flags = 0;
+    s->false_zeros = s->neg_zeros = 0;
+    s->max_exp = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t f = samples_l[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t f;
+
+            f = samples_l[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+
+            f = samples_r[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+        }
+    }
+
+    s->crc_x = crc;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            process_float(s, &samples_l[i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            process_float(s, &samples_l[i]);
+            process_float(s, &samples_r[i]);
+        }
+    }
+
+    s->float_max_exp = s->max_exp;
+
+    if (s->shifted_both)
+        s->float_flags |= FLOAT_SHIFT_SENT;
+    else if (s->shifted_ones && !s->shifted_zeros)
+        s->float_flags |= FLOAT_SHIFT_ONES;
+    else if (s->shifted_ones && s->shifted_zeros)
+        s->float_flags |= FLOAT_SHIFT_SAME;
+    else if (s->ordata && !(s->ordata & 1)) {
+        do {
+            s->float_shift++;
+            s->ordata >>= 1;
+        } while (!(s->ordata & 1));
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, s->float_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, s->float_shift);
+    }
+
+    s->flags &= ~MAG_MASK;
+
+    while (s->ordata) {
+        s->flags += 1 << MAG_LSB;
+        s->ordata >>= 1;
+    }
+
+    if (s->false_zeros || s->neg_zeros)
+        s->float_flags |= FLOAT_ZEROS_SENT;
+
+    if (s->neg_zeros)
+        s->float_flags |= FLOAT_NEG_ZEROS;
+
+    return s->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT |
+                             FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME);
+}
+
+static void scan_int23(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    uint32_t magdata = 0, ordata = 0, xordata = 0, anddata = ~0;
+    int i, total_shift = 0;
+
+    s->int32_sent_bits = s->int32_zeros = s->int32_ones = s->int32_dups = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t M = samples_l[i];
+
+            magdata |= (M < 0) ? ~M : M;
+            xordata |= M ^ -(M & 1);
+            anddata &= M;
+            ordata  |= M;
+
+            if ((ordata & 1) && !(anddata & 1) && (xordata & 2))
+                return;
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t L = samples_l[i];
+            int32_t R = samples_r[i];
+
+            magdata |= (L < 0) ? ~L : L;
+            magdata |= (R < 0) ? ~R : R;
+            xordata |= L ^ -(L & 1);
+            xordata |= R ^ -(R & 1);
+            anddata &= L & R;
+            ordata  |= L | R;
+
+            if ((ordata & 1) && !(anddata & 1) && (xordata & 2))
+                return;
+        }
+    }
+
+    s->flags &= ~MAG_MASK;
+
+    while (magdata) {
+        s->flags += 1 << MAG_LSB;
+        magdata >>= 1;
+    }
+
+    if (!(s->flags & MAG_MASK))
+        return;
+
+    if (!(ordata & 1)) {
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_zeros++;
+            total_shift++;
+            ordata >>= 1;
+        } while (!(ordata & 1));
+    } else if (anddata & 1) {
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_ones++;
+            total_shift++;
+            anddata >>= 1;
+        } while (anddata & 1);
+    } else if (!(xordata & 2)) {
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_dups++;
+            total_shift++;
+            xordata >>= 1;
+        } while (!(xordata & 2));
+    }
+
+    if (total_shift) {
+        s->flags |= WV_INT32_DATA;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, total_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, total_shift);
+    }
+}
+
+static int scan_int32(WavPackEncodeContext *s,
+                      int32_t *samples_l, int32_t *samples_r,
+                      int nb_samples)
+{
+    uint32_t magdata = 0, ordata = 0, xordata = 0, anddata = ~0;
+    uint32_t crc = 0xffffffffu;
+    int i, total_shift = 0;
+
+    s->int32_sent_bits = s->int32_zeros = s->int32_ones = s->int32_dups = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t M = samples_l[i];
+
+            crc = crc * 9 + (M & 0xffff) * 3 + ((M >> 16) & 0xffff);
+            magdata |= (M < 0) ? ~M : M;
+            xordata |= M ^ -(M & 1);
+            anddata &= M;
+            ordata  |= M;
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t L = samples_l[i];
+            int32_t R = samples_r[i];
+
+            crc = crc * 9 + (L & 0xffff) * 3 + ((L >> 16) & 0xffff);
+            crc = crc * 9 + (R & 0xffff) * 3 + ((R >> 16) & 0xffff);
+            magdata |= (L < 0) ? ~L : L;
+            magdata |= (R < 0) ? ~R : R;
+            xordata |= L ^ -(L & 1);
+            xordata |= R ^ -(R & 1);
+            anddata &= L & R;
+            ordata  |= L | R;
+        }
+    }
+
+    s->crc_x = crc;
+    s->flags &= ~MAG_MASK;
+
+    while (magdata) {
+        s->flags += 1 << MAG_LSB;
+        magdata >>= 1;
+    }
+
+    if (!((s->flags & MAG_MASK) >> MAG_LSB)) {
+        s->flags &= ~WV_INT32_DATA;
+        return 0;
+    }
+
+    if (!(ordata & 1))
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_zeros++;
+            total_shift++;
+            ordata >>= 1;
+        } while (!(ordata & 1));
+    else if (anddata & 1)
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_ones++;
+            total_shift++;
+            anddata >>= 1;
+        } while (anddata & 1);
+    else if (!(xordata & 2))
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_dups++;
+            total_shift++;
+            xordata >>= 1;
+        } while (!(xordata & 2));
+
+    if (((s->flags & MAG_MASK) >> MAG_LSB) > 23) {
+        s->int32_sent_bits = (uint8_t)(((s->flags & MAG_MASK) >> MAG_LSB) - 23);
+        total_shift += s->int32_sent_bits;
+        s->flags &= ~MAG_MASK;
+        s->flags += 23 << MAG_LSB;
+    }
+
+    if (total_shift) {
+        s->flags |= WV_INT32_DATA;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, total_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, total_shift);
+    }
+
+    return s->int32_sent_bits;
+}
+
+static int8_t store_weight(int weight)
+{
+    weight = av_clip(weight, -1024, 1024);
+    if (weight > 0)
+        weight -= (weight + 64) >> 7;
+
+    return (weight + 4) >> 3;
+}
+
+static int restore_weight(int8_t weight)
+{
+    int result;
+
+    if ((result = (int) weight << 3) > 0)
+        result += (result + 64) >> 7;
+
+    return result;
+}
+
+static int log2s(int32_t value)
+{
+    return (value < 0) ? -wp_log2(-value) : wp_log2(value);
+}
+
+static void decorr_mono(int32_t *in_samples, int32_t *out_samples,
+                        int nb_samples, struct Decorr *dpp, int dir)
+{
+    int m = 0, i;
+
+    dpp->sumA = 0;
+
+    if (dir < 0) {
+        out_samples += (nb_samples - 1);
+        in_samples  += (nb_samples - 1);
+    }
+
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+
+    for (i = 0; i < MAX_TERM; i++)
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+
+    if (dpp->value > MAX_TERM) {
+        while (nb_samples--) {
+            int32_t left, sam_A;
+
+            sam_A = ((3 - (dpp->value & 1)) * dpp->samplesA[0] - dpp->samplesA[1]) >> !(dpp->value & 1);
+
+            dpp->samplesA[1] = dpp->samplesA[0];
+            dpp->samplesA[0] = left = in_samples[0];
+
+            left -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam_A, left);
+            dpp->sumA += dpp->weightA;
+            out_samples[0] = left;
+            in_samples += dir;
+            out_samples += dir;
+        }
+    } else if (dpp->value > 0) {
+        while (nb_samples--) {
+            int k = (m + dpp->value) & (MAX_TERM - 1);
+            int32_t left, sam_A;
+
+            sam_A = dpp->samplesA[m];
+            dpp->samplesA[k] = left = in_samples[0];
+            m = (m + 1) & (MAX_TERM - 1);
+
+            left -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam_A, left);
+            dpp->sumA += dpp->weightA;
+            out_samples[0] = left;
+            in_samples += dir;
+            out_samples += dir;
+        }
+    }
+
+    if (m && dpp->value > 0 && dpp->value <= MAX_TERM) {
+        int32_t temp_A[MAX_TERM];
+
+        memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+
+        for (i = 0; i < MAX_TERM; i++) {
+            dpp->samplesA[i] = temp_A[m];
+            m = (m + 1) & (MAX_TERM - 1);
+        }
+    }
+}
+
+static void reverse_mono_decorr(struct Decorr *dpp)
+{
+    if (dpp->value > MAX_TERM) {
+        int32_t sam_A;
+
+        if (dpp->value & 1)
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+        else
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+        dpp->samplesA[1] = dpp->samplesA[0];
+        dpp->samplesA[0] = sam_A;
+
+        if (dpp->value & 1)
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+        else
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+        dpp->samplesA[1] = sam_A;
+    } else if (dpp->value > 1) {
+        int i, j, k;
+
+        for (i = 0, j = dpp->value - 1, k = 0; k < dpp->value / 2; i++, j--, k++) {
+            i &= (MAX_TERM - 1);
+            j &= (MAX_TERM - 1);
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+            dpp->samplesA[j] ^= dpp->samplesA[i];
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+        }
+    }
+}
+
+static uint32_t log2sample(uint32_t v, int limit, uint32_t *result)
+{
+    uint32_t dbits;
+
+    if ((v += v >> 9) < (1 << 8)) {
+        dbits = nbits_table[v];
+        *result += (dbits << 8) + wp_log2_table[(v << (9 - dbits)) & 0xff];
+    } else {
+        if (v < (1 << 16))
+            dbits = nbits_table[v >> 8] + 8;
+        else if (v < (1 << 24))
+            dbits = nbits_table[v >> 16] + 16;
+        else
+            dbits = nbits_table[v >> 24] + 24;
+
+        *result += dbits = (dbits << 8) + wp_log2_table[(v >> (dbits - 9)) & 0xff];
+
+        if (limit && dbits >= limit)
+            return 1;
+    }
+
+    return 0;
+}
+
+static uint32_t log2mono(int32_t *samples, int nb_samples, int limit)
+{
+    uint32_t result = 0;
+    while (nb_samples--) {
+        if (log2sample(abs(*samples++), limit, &result))
+            return UINT32_MAX;
+    }
+    return result;
+}
+
+static uint32_t log2stereo(int32_t *samples_l, int32_t *samples_r,
+                           int nb_samples, int limit)
+{
+    uint32_t result = 0;
+    while (nb_samples--) {
+        if (log2sample(abs(*samples_l++), limit, &result) ||
+            log2sample(abs(*samples_r++), limit, &result))
+            return UINT32_MAX;
+    }
+    return result;
+}
+
+static void decorr_mono_buffer(int32_t *samples, int32_t *outsamples,
+                               int nb_samples, struct Decorr *dpp,
+                               int tindex)
+{
+    struct Decorr dp, *dppi = dpp + tindex;
+    int delta = dppi->delta, pre_delta, term = dppi->value;
+
+    if (delta == 7)
+        pre_delta = 7;
+    else if (delta < 2)
+        pre_delta = 3;
+    else
+        pre_delta = delta + 1;
+
+    CLEAR(dp);
+    dp.value = term;
+    dp.delta = pre_delta;
+    decorr_mono(samples, outsamples, FFMIN(2048, nb_samples), &dp, -1);
+    dp.delta = delta;
+
+    if (tindex == 0)
+        reverse_mono_decorr(&dp);
+    else
+        CLEAR(dp.samplesA);
+
+    memcpy(dppi->samplesA, dp.samplesA, sizeof(dp.samplesA));
+    dppi->weightA = dp.weightA;
+
+    if (delta == 0) {
+        dp.delta = 1;
+        decorr_mono(samples, outsamples, nb_samples, &dp, 1);
+        dp.delta = 0;
+        memcpy(dp.samplesA, dppi->samplesA, sizeof(dp.samplesA));
+        dppi->weightA = dp.weightA = dp.sumA / nb_samples;
+    }
+
+    decorr_mono(samples, outsamples, nb_samples, &dp, 1);
+}
+
+static void recurse_mono(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                         int depth, int delta, uint32_t input_bits)
+{
+    int term, branches = s->num_branches - depth;
+    int32_t *samples, *outsamples;
+    uint32_t term_bits[22], bits;
+
+    if (branches < 1 || depth + 1 == info->nterms)
+        branches = 1;
+
+    CLEAR(term_bits);
+    samples = s->sampleptrs[depth][0];
+    outsamples = s->sampleptrs[depth + 1][0];
+
+    for (term = 1; term <= 18; term++) {
+        if (term == 17 && branches == 1 && depth + 1 < info->nterms)
+            continue;
+
+        if (term > 8 && term < 17)
+            continue;
+
+        if (!s->extra_flags && (term > 4 && term < 17))
+            continue;
+
+        info->dps[depth].value = term;
+        info->dps[depth].delta = delta;
+        decorr_mono_buffer(samples, outsamples, s->block_samples, info->dps, depth);
+        bits = log2mono(outsamples, s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * (depth + 1));
+            memcpy(s->sampleptrs[info->nterms + 1][0],
+                   s->sampleptrs[depth + 1][0], s->block_samples * 4);
+        }
+
+        term_bits[term + 3] = bits;
+    }
+
+    while (depth + 1 < info->nterms && branches--) {
+        uint32_t local_best_bits = input_bits;
+        int best_term = 0, i;
+
+        for (i = 0; i < 22; i++)
+            if (term_bits[i] && term_bits[i] < local_best_bits) {
+                local_best_bits = term_bits[i];
+                best_term = i - 3;
+            }
+
+        if (!best_term)
+            break;
+
+        term_bits[best_term + 3] = 0;
+
+        info->dps[depth].value = best_term;
+        info->dps[depth].delta = delta;
+        decorr_mono_buffer(samples, outsamples, s->block_samples, info->dps, depth);
+
+        recurse_mono(s, info, depth + 1, delta, local_best_bits);
+    }
+}
+
+static void sort_mono(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int reversed = 1;
+    uint32_t bits;
+
+    while (reversed) {
+        int ri, i;
+
+        memcpy(info->dps, s->decorr_passes, sizeof(s->decorr_passes));
+        reversed = 0;
+
+        for (ri = 0; ri < info->nterms && s->decorr_passes[ri].value; ri++) {
+
+            if (ri + 1 >= info->nterms || !s->decorr_passes[ri+1].value)
+                break;
+
+            if (s->decorr_passes[ri].value == s->decorr_passes[ri+1].value) {
+                decorr_mono_buffer(s->sampleptrs[ri][0], s->sampleptrs[ri+1][0],
+                                   s->block_samples, info->dps, ri);
+                continue;
+            }
+
+            info->dps[ri  ] = s->decorr_passes[ri+1];
+            info->dps[ri+1] = s->decorr_passes[ri  ];
+
+            for (i = ri; i < info->nterms && s->decorr_passes[i].value; i++)
+                decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                                   s->block_samples, info->dps, i);
+
+            bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+            if (bits < info->best_bits) {
+                reversed = 1;
+                info->best_bits = bits;
+                CLEAR(s->decorr_passes);
+                memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+                memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+                       s->block_samples * 4);
+            } else {
+                info->dps[ri  ] = s->decorr_passes[ri];
+                info->dps[ri+1] = s->decorr_passes[ri+1];
+                decorr_mono_buffer(s->sampleptrs[ri][0], s->sampleptrs[ri+1][0],
+                                   s->block_samples, info->dps, ri);
+            }
+        }
+    }
+}
+
+static void delta_mono(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int lower = 0, delta, d;
+    uint32_t bits;
+
+    if (!s->decorr_passes[0].value)
+        return;
+    delta = s->decorr_passes[0].delta;
+
+    for (d = delta - 1; d >= 0; d--) {
+        int i;
+
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                               s->block_samples, info->dps, i);
+        }
+
+        bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+
+        lower = 1;
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0],  s->sampleptrs[i][0],
+               s->block_samples * 4);
+    }
+
+    for (d = delta + 1; !lower && d <= 7; d++) {
+        int i;
+
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                               s->block_samples, info->dps, i);
+        }
+
+        bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+               s->block_samples * 4);
+    }
+}
+
+static int allocate_buffers2(WavPackEncodeContext *s, int nterms)
+{
+    int i;
+
+    for (i = 0; i < nterms + 2; i++) {
+        av_fast_padded_malloc(&s->sampleptrs[i][0], &s->sampleptrs_size[i][0],
+                              s->block_samples * 4);
+        if (!s->sampleptrs[i][0])
+            return AVERROR(ENOMEM);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->sampleptrs[i][1], &s->sampleptrs_size[i][1],
+                                  s->block_samples * 4);
+            if (!s->sampleptrs[i][1])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+static int allocate_buffers(WavPackEncodeContext *s)
+{
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        av_fast_padded_malloc(&s->best_buffer[0], &s->best_buffer_size[0],
+                              s->block_samples * 4);
+        if (!s->best_buffer[0])
+            return AVERROR(ENOMEM);
+
+        av_fast_padded_malloc(&s->temp_buffer[i][0], &s->temp_buffer_size[i][0],
+                              s->block_samples * 4);
+        if (!s->temp_buffer[i][0])
+            return AVERROR(ENOMEM);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->best_buffer[1], &s->best_buffer_size[1],
+                                  s->block_samples * 4);
+            if (!s->best_buffer[1])
+                return AVERROR(ENOMEM);
+
+            av_fast_padded_malloc(&s->temp_buffer[i][1], &s->temp_buffer_size[i][1],
+                                  s->block_samples * 4);
+            if (!s->temp_buffer[i][1])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+static void analyze_mono(WavPackEncodeContext *s, int32_t *samples, int do_samples)
+{
+    WavPackExtraInfo info;
+    int i;
+
+    info.log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    info.log_limit = FFMIN(6912, info.log_limit);
+
+    info.nterms = s->num_terms;
+
+    if (allocate_buffers2(s, s->num_terms))
+        return;
+
+    memcpy(info.dps, s->decorr_passes, sizeof(info.dps));
+    memcpy(s->sampleptrs[0][0], samples, s->block_samples * 4);
+
+    for (i = 0; i < info.nterms && info.dps[i].value; i++)
+        decorr_mono(s->sampleptrs[i][0], s->sampleptrs[i + 1][0],
+                    s->block_samples, info.dps + i, 1);
+
+    info.best_bits = log2mono(s->sampleptrs[info.nterms][0], s->block_samples, 0) * 1;
+    memcpy(s->sampleptrs[info.nterms + 1][0], s->sampleptrs[i][0], s->block_samples * 4);
+
+    if (s->extra_flags & EXTRA_BRANCHES)
+        recurse_mono(s, &info, 0, (int) floor(s->delta_decay + 0.5),
+                     log2mono(s->sampleptrs[0][0], s->block_samples, 0));
+
+    if (s->extra_flags & EXTRA_SORT_FIRST)
+        sort_mono(s, &info);
+
+    if (s->extra_flags & EXTRA_TRY_DELTAS) {
+        delta_mono(s, &info);
+
+        if ((s->extra_flags & EXTRA_ADJUST_DELTAS) && s->decorr_passes[0].value)
+            s->delta_decay = (float)((s->delta_decay * 2.0 + s->decorr_passes[0].delta) / 3.0);
+        else
+            s->delta_decay = 2.0;
+    }
+
+    if (s->extra_flags & EXTRA_SORT_LAST)
+        sort_mono(s, &info);
+
+    if (do_samples)
+        memcpy(samples, s->sampleptrs[info.nterms + 1][0], s->block_samples * 4);
+
+    for (i = 0; i < info.nterms; i++)
+        if (!s->decorr_passes[i].value)
+            break;
+
+    s->num_terms = i;
+}
+
+static void scan_word(WavPackEncodeContext *s, WvChannel *c,
+                      int32_t *samples, int nb_samples, int dir)
+{
+    if (dir < 0)
+        samples += nb_samples - 1;
+
+    while (nb_samples--) {
+        uint32_t low, value = labs(samples[0]);
+
+        if (value < GET_MED(0)) {
+            DEC_MED(0);
+        } else {
+            low = GET_MED(0);
+            INC_MED(0);
+
+            if (value - low < GET_MED(1)) {
+                DEC_MED(1);
+            } else {
+                low += GET_MED(1);
+                INC_MED(1);
+
+                if (value - low < GET_MED(2)) {
+                    DEC_MED(2);
+                } else {
+                    INC_MED(2);
+                }
+            }
+        }
+        samples += dir;
+    }
+}
+
+static int wv_mono(WavPackEncodeContext *s, int32_t *samples,
+                   int no_history, int do_samples)
+{
+    struct Decorr temp_decorr_pass, save_decorr_passes[MAX_TERMS] = {{0}};
+    int nb_samples = s->block_samples;
+    int buf_size = sizeof(int32_t) * nb_samples;
+    uint32_t best_size = UINT32_MAX, size;
+    int log_limit, pi, i, ret;
+
+    for (i = 0; i < nb_samples; i++)
+        if (samples[i])
+            break;
+
+    if (i == nb_samples) {
+        CLEAR(s->decorr_passes);
+        CLEAR(s->w);
+        s->num_terms = 0;
+        return 0;
+    }
+
+    log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    log_limit = FFMIN(6912, log_limit);
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if (no_history || s->num_passes >= 7)
+        s->best_decorr = s->mask_decorr = 0;
+
+    for (pi = 0; pi < s->num_passes;) {
+        const WavPackDecorrSpec *wpds;
+        int nterms, c, j;
+
+        if (!pi) {
+            c = s->best_decorr;
+        } else {
+            if (s->mask_decorr == 0)
+                c = 0;
+            else
+                c = (s->best_decorr & (s->mask_decorr - 1)) | s->mask_decorr;
+
+            if (c == s->best_decorr) {
+                s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+                continue;
+            }
+        }
+
+        wpds = &s->decorr_specs[c];
+        nterms = decorr_filter_nterms[s->decorr_filter];
+
+        while (1) {
+        memcpy(s->temp_buffer[0][0], samples, buf_size);
+        CLEAR(save_decorr_passes);
+
+        for (j = 0; j < nterms; j++) {
+            CLEAR(temp_decorr_pass);
+            temp_decorr_pass.delta = wpds->delta;
+            temp_decorr_pass.value = wpds->terms[j];
+
+            if (temp_decorr_pass.value < 0)
+                temp_decorr_pass.value = 1;
+
+            decorr_mono(s->temp_buffer[j&1][0], s->temp_buffer[~j&1][0],
+                        FFMIN(nb_samples, 2048), &temp_decorr_pass, -1);
+
+            if (j) {
+                CLEAR(temp_decorr_pass.samplesA);
+            } else {
+                reverse_mono_decorr(&temp_decorr_pass);
+            }
+
+            memcpy(save_decorr_passes + j, &temp_decorr_pass, sizeof(struct Decorr));
+            decorr_mono(s->temp_buffer[j&1][0], s->temp_buffer[~j&1][0],
+                        nb_samples, &temp_decorr_pass, 1);
+        }
+
+        size = log2mono(s->temp_buffer[j&1][0], nb_samples, log_limit);
+        if (size != UINT32_MAX || !nterms)
+            break;
+        nterms >>= 1;
+        }
+
+        if (size < best_size) {
+            memcpy(s->best_buffer[0], s->temp_buffer[j&1][0], buf_size);
+            memcpy(s->decorr_passes, save_decorr_passes, sizeof(struct Decorr) * MAX_TERMS);
+            s->num_terms = nterms;
+            s->best_decorr = c;
+            best_size = size;
+        }
+
+        if (pi++)
+            s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+    }
+
+    if (s->extra_flags)
+        analyze_mono(s, samples, do_samples);
+    else if (do_samples)
+        memcpy(samples, s->best_buffer[0], buf_size);
+
+    if (no_history || s->extra_flags) {
+        CLEAR(s->w);
+        scan_word(s, &s->w.c[0], s->best_buffer[0], nb_samples, -1);
+    }
+    return 0;
+}
+
+static void decorr_stereo(int32_t *in_left, int32_t *in_right,
+                          int32_t *out_left, int32_t *out_right,
+                          int nb_samples, struct Decorr *dpp, int dir)
+{
+    int m = 0, i;
+
+    dpp->sumA = dpp->sumB = 0;
+
+    if (dir < 0) {
+        out_left  += nb_samples - 1;
+        out_right += nb_samples - 1;
+        in_left   += nb_samples - 1;
+        in_right  += nb_samples - 1;
+    }
+
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+    dpp->weightB = restore_weight(store_weight(dpp->weightB));
+
+    for (i = 0; i < MAX_TERM; i++) {
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+        dpp->samplesB[i] = wp_exp2(log2s(dpp->samplesB[i]));
+    }
+
+    switch (dpp->value) {
+    case 2:
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0];
+            dpp->samplesA[0] = dpp->samplesA[1];
+            out_left[0] = tmp = (dpp->samplesA[1] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[0];
+            dpp->samplesB[0] = dpp->samplesB[1];
+            out_right[0] = tmp = (dpp->samplesB[1] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case 17:
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[0] = tmp = (dpp->samplesA[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[0] = tmp = (dpp->samplesB[0] = in_right[0]) - APPLY_WEIGHT (dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case 18:
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[0] = tmp = (dpp->samplesA[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[0] = tmp = (dpp->samplesB[0] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    default: {
+        int k = dpp->value & (MAX_TERM - 1);
+
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            out_left[0] = tmp = (dpp->samplesA[k] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[m];
+            out_right[0] = tmp = (dpp->samplesB[k] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+            int k;
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+        }
+    case -1:
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            out_left[0] = tmp = (sam_B = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            out_right[0] = tmp = (dpp->samplesA[0] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case -2:
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            out_right[0] = tmp = (sam_A = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            out_left[0] = tmp = (dpp->samplesB[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case -3:
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = in_right[0];
+            out_right[0] = tmp -= APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            dpp->samplesB[0] = tmp = in_left[0];
+            out_left[0] = tmp -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    }
+}
+
+static void reverse_decorr(struct Decorr *dpp)
+{
+    if (dpp->value > MAX_TERM) {
+        int32_t sam_A, sam_B;
+
+        if (dpp->value & 1) {
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            sam_B = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+        } else {
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+            sam_B = (3 * dpp->samplesB[0] - dpp->samplesB[1]) >> 1;
+        }
+
+        dpp->samplesA[1] = dpp->samplesA[0];
+        dpp->samplesB[1] = dpp->samplesB[0];
+        dpp->samplesA[0] = sam_A;
+        dpp->samplesB[0] = sam_B;
+
+        if (dpp->value & 1) {
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            sam_B = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+        } else {
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+            sam_B = (3 * dpp->samplesB[0] - dpp->samplesB[1]) >> 1;
+        }
+
+        dpp->samplesA[1] = sam_A;
+        dpp->samplesB[1] = sam_B;
+    } else if (dpp->value > 1) {
+        int i, j, k;
+
+        for (i = 0, j = dpp->value - 1, k = 0; k < dpp->value / 2; i++, j--, k++) {
+            i &= (MAX_TERM - 1);
+            j &= (MAX_TERM - 1);
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+            dpp->samplesA[j] ^= dpp->samplesA[i];
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+            dpp->samplesB[i] ^= dpp->samplesB[j];
+            dpp->samplesB[j] ^= dpp->samplesB[i];
+            dpp->samplesB[i] ^= dpp->samplesB[j];
+        }
+    }
+}
+
+static void decorr_stereo_quick(int32_t *in_left,  int32_t *in_right,
+                                int32_t *out_left, int32_t *out_right,
+                                int nb_samples, struct Decorr *dpp)
+{
+    int m = 0, i;
+
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+    dpp->weightB = restore_weight(store_weight(dpp->weightB));
+
+    for (i = 0; i < MAX_TERM; i++) {
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+        dpp->samplesB[i] = wp_exp2(log2s(dpp->samplesB[i]));
+    }
+
+    switch (dpp->value) {
+    case 2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0];
+            dpp->samplesA[0] = dpp->samplesA[1];
+            out_left[i] = tmp = (dpp->samplesA[1] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0];
+            dpp->samplesB[0] = dpp->samplesB[1];
+            out_right[i] = tmp = (dpp->samplesB[1] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 17:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[i] = tmp = (dpp->samplesA[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[i] = tmp = (dpp->samplesB[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[i] = tmp = (dpp->samplesA[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[i] = tmp = (dpp->samplesB[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default: {
+        int k = dpp->value & (MAX_TERM - 1);
+
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            out_left[i] = tmp = (dpp->samplesA[k] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            out_right[i] = tmp = (dpp->samplesB[k] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+            int k;
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    }
+    case -1:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            out_left[i] = tmp = (sam_B = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            out_right[i] = tmp = (dpp->samplesA[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            out_right[i] = tmp = (sam_A = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            out_left[i] = tmp = (dpp->samplesB[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = in_right[i];
+            out_right[i] = tmp -= APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = in_left[i];
+            out_left[i] = tmp -= APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+static void decorr_stereo_buffer(WavPackExtraInfo *info,
+                                 int32_t *in_left,  int32_t *in_right,
+                                 int32_t *out_left, int32_t *out_right,
+                                 int nb_samples, int tindex)
+{
+    struct Decorr dp = {0}, *dppi = info->dps + tindex;
+    int delta = dppi->delta, pre_delta;
+    int term = dppi->value;
+
+    if (delta == 7)
+        pre_delta = 7;
+    else if (delta < 2)
+        pre_delta = 3;
+    else
+        pre_delta = delta + 1;
+
+    dp.value = term;
+    dp.delta = pre_delta;
+    decorr_stereo(in_left, in_right, out_left, out_right,
+                  FFMIN(2048, nb_samples), &dp, -1);
+    dp.delta = delta;
+
+    if (tindex == 0) {
+        reverse_decorr(&dp);
+    } else {
+        CLEAR(dp.samplesA);
+        CLEAR(dp.samplesB);
+    }
+
+    memcpy(dppi->samplesA, dp.samplesA, sizeof(dp.samplesA));
+    memcpy(dppi->samplesB, dp.samplesB, sizeof(dp.samplesB));
+    dppi->weightA = dp.weightA;
+    dppi->weightB = dp.weightB;
+
+    if (delta == 0) {
+        dp.delta = 1;
+        decorr_stereo(in_left, in_right, out_left, out_right, nb_samples, &dp, 1);
+        dp.delta = 0;
+        memcpy(dp.samplesA, dppi->samplesA, sizeof(dp.samplesA));
+        memcpy(dp.samplesB, dppi->samplesB, sizeof(dp.samplesB));
+        dppi->weightA = dp.weightA = dp.sumA / nb_samples;
+        dppi->weightB = dp.weightB = dp.sumB / nb_samples;
+    }
+
+    if (info->gt16bit)
+        decorr_stereo(in_left, in_right, out_left, out_right,
+                           nb_samples, &dp, 1);
+    else
+        decorr_stereo_quick(in_left, in_right, out_left, out_right,
+                            nb_samples, &dp);
+}
+
+static void sort_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int reversed = 1;
+    uint32_t bits;
+
+    while (reversed) {
+        int ri, i;
+
+        memcpy(info->dps, s->decorr_passes, sizeof(s->decorr_passes));
+        reversed = 0;
+
+        for (ri = 0; ri < info->nterms && s->decorr_passes[ri].value; ri++) {
+
+            if (ri + 1 >= info->nterms || !s->decorr_passes[ri+1].value)
+                break;
+
+            if (s->decorr_passes[ri].value == s->decorr_passes[ri+1].value) {
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[ri  ][0], s->sampleptrs[ri  ][1],
+                                     s->sampleptrs[ri+1][0], s->sampleptrs[ri+1][1],
+                                     s->block_samples, ri);
+                continue;
+            }
+
+            info->dps[ri  ] = s->decorr_passes[ri+1];
+            info->dps[ri+1] = s->decorr_passes[ri  ];
+
+            for (i = ri; i < info->nterms && s->decorr_passes[i].value; i++)
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                     s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                     s->block_samples, i);
+
+            bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                              s->block_samples, info->log_limit);
+
+            if (bits < info->best_bits) {
+                reversed = 1;
+                info->best_bits = bits;
+                CLEAR(s->decorr_passes);
+                memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+                memcpy(s->sampleptrs[info->nterms + 1][0],
+                       s->sampleptrs[i][0], s->block_samples * 4);
+                memcpy(s->sampleptrs[info->nterms + 1][1],
+                       s->sampleptrs[i][1], s->block_samples * 4);
+            } else {
+                info->dps[ri  ] = s->decorr_passes[ri  ];
+                info->dps[ri+1] = s->decorr_passes[ri+1];
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[ri  ][0], s->sampleptrs[ri  ][1],
+                                     s->sampleptrs[ri+1][0], s->sampleptrs[ri+1][1],
+                                     s->block_samples, ri);
+            }
+        }
+    }
+}
+
+static void delta_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int lower = 0, delta, d, i;
+    uint32_t bits;
+
+    if (!s->decorr_passes[0].value)
+        return;
+    delta = s->decorr_passes[0].delta;
+
+    for (d = delta - 1; d >= 0; d--) {
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_stereo_buffer(info,
+                                 s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                 s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                 s->block_samples, i);
+        }
+
+        bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                          s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+        lower = 1;
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+               s->block_samples * 4);
+        memcpy(s->sampleptrs[info->nterms + 1][1], s->sampleptrs[i][1],
+               s->block_samples * 4);
+    }
+
+    for (d = delta + 1; !lower && d <= 7; d++) {
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_stereo_buffer(info,
+                                 s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                 s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                 s->block_samples, i);
+        }
+
+        bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                          s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+            memcpy(s->sampleptrs[info->nterms + 1][0],
+                   s->sampleptrs[i][0], s->block_samples * 4);
+            memcpy(s->sampleptrs[info->nterms + 1][1],
+                   s->sampleptrs[i][1], s->block_samples * 4);
+        }
+        else
+            break;
+    }
+}
+
+static void recurse_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                           int depth, int delta, uint32_t input_bits)
+{
+    int term, branches = s->num_branches - depth;
+    int32_t *in_left, *in_right, *out_left, *out_right;
+    uint32_t term_bits[22], bits;
+
+    if (branches < 1 || depth + 1 == info->nterms)
+        branches = 1;
+
+    CLEAR(term_bits);
+    in_left   = s->sampleptrs[depth    ][0];
+    in_right  = s->sampleptrs[depth    ][1];
+    out_left  = s->sampleptrs[depth + 1][0];
+    out_right = s->sampleptrs[depth + 1][1];
+
+    for (term = -3; term <= 18; term++) {
+        if (!term || (term > 8 && term < 17))
+            continue;
+
+        if (term == 17 && branches == 1 && depth + 1 < info->nterms)
+            continue;
+
+        if (term == -1 || term == -2)
+            if (!(s->flags & WV_CROSS_DECORR))
+                continue;
+
+        if (!s->extra_flags && (term > 4 && term < 17))
+            continue;
+
+        info->dps[depth].value = term;
+        info->dps[depth].delta = delta;
+        decorr_stereo_buffer(info, in_left, in_right, out_left, out_right,
+                             s->block_samples, depth);
+        bits = log2stereo(out_left, out_right, s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * (depth + 1));
+            memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[depth + 1][0],
+                   s->block_samples * 4);
+            memcpy(s->sampleptrs[info->nterms + 1][1], s->sampleptrs[depth + 1][1],
+                   s->block_samples * 4);
+        }
+
+        term_bits[term + 3] = bits;
+    }
+
+    while (depth + 1 < info->nterms && branches--) {
+        uint32_t local_best_bits = input_bits;
+        int best_term = 0, i;
+
+        for (i = 0; i < 22; i++)
+            if (term_bits[i] && term_bits[i] < local_best_bits) {
+                local_best_bits = term_bits[i];
+                best_term = i - 3;
+            }
+
+        if (!best_term)
+            break;
+
+        term_bits[best_term + 3] = 0;
+
+        info->dps[depth].value = best_term;
+        info->dps[depth].delta = delta;
+        decorr_stereo_buffer(info, in_left, in_right, out_left, out_right,
+                             s->block_samples, depth);
+
+        recurse_stereo(s, info, depth + 1, delta, local_best_bits);
+    }
+}
+
+static void analyze_stereo(WavPackEncodeContext *s,
+                           int32_t *in_left, int32_t *in_right,
+                           int do_samples)
+{
+    WavPackExtraInfo info;
+    int i;
+
+    info.gt16bit = ((s->flags & MAG_MASK) >> MAG_LSB) >= 16;
+
+    info.log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    info.log_limit = FFMIN(6912, info.log_limit);
+
+    info.nterms = s->num_terms;
+
+    if (allocate_buffers2(s, s->num_terms))
+        return;
+
+    memcpy(info.dps, s->decorr_passes, sizeof(info.dps));
+    memcpy(s->sampleptrs[0][0], in_left,  s->block_samples * 4);
+    memcpy(s->sampleptrs[0][1], in_right, s->block_samples * 4);
+
+    for (i = 0; i < info.nterms && info.dps[i].value; i++)
+        if (info.gt16bit)
+            decorr_stereo(s->sampleptrs[i    ][0], s->sampleptrs[i    ][1],
+                          s->sampleptrs[i + 1][0], s->sampleptrs[i + 1][1],
+                          s->block_samples, info.dps + i, 1);
+        else
+            decorr_stereo_quick(s->sampleptrs[i    ][0], s->sampleptrs[i    ][1],
+                                s->sampleptrs[i + 1][0], s->sampleptrs[i + 1][1],
+                                s->block_samples, info.dps + i);
+
+    info.best_bits = log2stereo(s->sampleptrs[info.nterms][0], s->sampleptrs[info.nterms][1],
+                                s->block_samples, 0);
+
+    memcpy(s->sampleptrs[info.nterms + 1][0], s->sampleptrs[i][0], s->block_samples * 4);
+    memcpy(s->sampleptrs[info.nterms + 1][1], s->sampleptrs[i][1], s->block_samples * 4);
+
+    if (s->extra_flags & EXTRA_BRANCHES)
+        recurse_stereo(s, &info, 0, (int) floor(s->delta_decay + 0.5),
+                       log2stereo(s->sampleptrs[0][0], s->sampleptrs[0][1],
+                                  s->block_samples, 0));
+
+    if (s->extra_flags & EXTRA_SORT_FIRST)
+        sort_stereo(s, &info);
+
+    if (s->extra_flags & EXTRA_TRY_DELTAS) {
+        delta_stereo(s, &info);
+
+        if ((s->extra_flags & EXTRA_ADJUST_DELTAS) && s->decorr_passes[0].value)
+            s->delta_decay = (float)((s->delta_decay * 2.0 + s->decorr_passes[0].delta) / 3.0);
+        else
+            s->delta_decay = 2.0;
+    }
+
+    if (s->extra_flags & EXTRA_SORT_LAST)
+        sort_stereo(s, &info);
+
+    if (do_samples) {
+        memcpy(in_left,  s->sampleptrs[info.nterms + 1][0], s->block_samples * 4);
+        memcpy(in_right, s->sampleptrs[info.nterms + 1][1], s->block_samples * 4);
+    }
+
+    for (i = 0; i < info.nterms; i++)
+        if (!s->decorr_passes[i].value)
+            break;
+
+    s->num_terms = i;
+}
+
+static int wv_stereo(WavPackEncodeContext *s,
+                     int32_t *samples_l, int32_t *samples_r,
+                     int no_history, int do_samples)
+{
+    struct Decorr temp_decorr_pass, save_decorr_passes[MAX_TERMS] = {{0}};
+    int nb_samples = s->block_samples, ret;
+    int buf_size = sizeof(int32_t) * nb_samples;
+    int log_limit, force_js = 0, force_ts = 0, got_js = 0, pi, i;
+    uint32_t best_size = UINT32_MAX, size;
+
+    for (i = 0; i < nb_samples; i++)
+        if (samples_l[i] || samples_r[i])
+            break;
+
+    if (i == nb_samples) {
+        s->flags &= ~((uint32_t) WV_JOINT_STEREO);
+        CLEAR(s->decorr_passes);
+        CLEAR(s->w);
+        s->num_terms = 0;
+        return 0;
+    }
+
+    log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    log_limit = FFMIN(6912, log_limit);
+
+    if (s->joint != -1) {
+        force_js =  s->joint;
+        force_ts = !s->joint;
+    }
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if (no_history || s->num_passes >= 7)
+        s->best_decorr = s->mask_decorr = 0;
+
+    for (pi = 0; pi < s->num_passes;) {
+        const WavPackDecorrSpec *wpds;
+        int nterms, c, j;
+
+        if (!pi)
+            c = s->best_decorr;
+        else {
+            if (s->mask_decorr == 0)
+                c = 0;
+            else
+                c = (s->best_decorr & (s->mask_decorr - 1)) | s->mask_decorr;
+
+            if (c == s->best_decorr) {
+                s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+                continue;
+            }
+        }
+
+        wpds = &s->decorr_specs[c];
+        nterms = decorr_filter_nterms[s->decorr_filter];
+
+        while (1) {
+            if (force_js || (wpds->joint_stereo && !force_ts)) {
+                if (!got_js) {
+                    av_fast_padded_malloc(&s->js_left,  &s->js_left_size,  buf_size);
+                    av_fast_padded_malloc(&s->js_right, &s->js_right_size, buf_size);
+                    memcpy(s->js_left,  samples_l, buf_size);
+                    memcpy(s->js_right, samples_r, buf_size);
+
+                    for (i = 0; i < nb_samples; i++)
+                        s->js_right[i] += ((s->js_left[i] -= s->js_right[i]) >> 1);
+                    got_js = 1;
+                }
+
+                memcpy(s->temp_buffer[0][0], s->js_left,  buf_size);
+                memcpy(s->temp_buffer[0][1], s->js_right, buf_size);
+            } else {
+                memcpy(s->temp_buffer[0][0], samples_l, buf_size);
+                memcpy(s->temp_buffer[0][1], samples_r, buf_size);
+            }
+
+            CLEAR(save_decorr_passes);
+
+            for (j = 0; j < nterms; j++) {
+                CLEAR(temp_decorr_pass);
+                temp_decorr_pass.delta = wpds->delta;
+                temp_decorr_pass.value = wpds->terms[j];
+
+                if (temp_decorr_pass.value < 0 && !(s->flags & WV_CROSS_DECORR))
+                    temp_decorr_pass.value = -3;
+
+                decorr_stereo(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                              s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                              FFMIN(2048, nb_samples), &temp_decorr_pass, -1);
+
+                if (j) {
+                    CLEAR(temp_decorr_pass.samplesA);
+                    CLEAR(temp_decorr_pass.samplesB);
+                } else {
+                    reverse_decorr(&temp_decorr_pass);
+                }
+
+                memcpy(save_decorr_passes + j, &temp_decorr_pass, sizeof(struct Decorr));
+
+                if (((s->flags & MAG_MASK) >> MAG_LSB) >= 16)
+                    decorr_stereo(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                                  s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                                  nb_samples, &temp_decorr_pass, 1);
+                else
+                    decorr_stereo_quick(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                                        s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                                        nb_samples, &temp_decorr_pass);
+            }
+
+            size = log2stereo(s->temp_buffer[j&1][0], s->temp_buffer[j&1][1],
+                              nb_samples, log_limit);
+            if (size != UINT32_MAX || !nterms)
+                break;
+            nterms >>= 1;
+        }
+
+        if (size < best_size) {
+            memcpy(s->best_buffer[0], s->temp_buffer[j&1][0], buf_size);
+            memcpy(s->best_buffer[1], s->temp_buffer[j&1][1], buf_size);
+            memcpy(s->decorr_passes, save_decorr_passes, sizeof(struct Decorr) * MAX_TERMS);
+            s->num_terms = nterms;
+            s->best_decorr = c;
+            best_size = size;
+        }
+
+        if (pi++)
+            s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+    }
+
+    if (force_js || (s->decorr_specs[s->best_decorr].joint_stereo && !force_ts))
+        s->flags |= WV_JOINT_STEREO;
+    else
+        s->flags &= ~((uint32_t) WV_JOINT_STEREO);
+
+    if (s->extra_flags) {
+        if (s->flags & WV_JOINT_STEREO) {
+            analyze_stereo(s, s->js_left, s->js_right, do_samples);
+
+            if (do_samples) {
+                memcpy(samples_l, s->js_left,  buf_size);
+                memcpy(samples_r, s->js_right, buf_size);
+            }
+        } else
+            analyze_stereo(s, samples_l, samples_r, do_samples);
+    } else if (do_samples) {
+        memcpy(samples_l, s->best_buffer[0], buf_size);
+        memcpy(samples_r, s->best_buffer[1], buf_size);
+    }
+
+    if (s->extra_flags || no_history ||
+        s->joint_stereo != s->decorr_specs[s->best_decorr].joint_stereo) {
+        s->joint_stereo = s->decorr_specs[s->best_decorr].joint_stereo;
+        CLEAR(s->w);
+        scan_word(s, &s->w.c[0], s->best_buffer[0], nb_samples, -1);
+        scan_word(s, &s->w.c[1], s->best_buffer[1], nb_samples, -1);
+    }
+    return 0;
+}
+
+#define count_bits(av) ( \
+ (av) < (1 << 8) ? nbits_table[av] : \
+  ( \
+   (av) < (1 << 16) ? nbits_table[(av) >> 8] + 8 : \
+   ((av) < (1 << 24) ? nbits_table[(av) >> 16] + 16 : nbits_table[(av) >> 24] + 24) \
+  ) \
+)
+
+static void encode_flush(WavPackEncodeContext *s)
+{
+    WavPackWords *w = &s->w;
+    PutBitContext *pb = &s->pb;
+
+    if (w->zeros_acc) {
+        int cbits = count_bits(w->zeros_acc);
+
+        do {
+            if (cbits > 31) {
+                put_bits(pb, 31, 0x7FFFFFFF);
+                cbits -= 31;
+            } else {
+                put_bits(pb, cbits, (1 << cbits) - 1);
+                cbits = 0;
+            }
+        } while (cbits);
+
+        put_bits(pb, 1, 0);
+
+        while (w->zeros_acc > 1) {
+            put_bits(pb, 1, w->zeros_acc & 1);
+            w->zeros_acc >>= 1;
+        }
+
+        w->zeros_acc = 0;
+    }
+
+    if (w->holding_one) {
+        if (w->holding_one >= 16) {
+            int cbits;
+
+            put_bits(pb, 16, (1 << 16) - 1);
+            put_bits(pb, 1, 0);
+            w->holding_one -= 16;
+            cbits = count_bits(w->holding_one);
+
+            do {
+                if (cbits > 31) {
+                    put_bits(pb, 31, 0x7FFFFFFF);
+                    cbits -= 31;
+                } else {
+                    put_bits(pb, cbits, (1 << cbits) - 1);
+                    cbits = 0;
+                }
+            } while (cbits);
+
+            put_bits(pb, 1, 0);
+
+            while (w->holding_one > 1) {
+                put_bits(pb, 1, w->holding_one & 1);
+                w->holding_one >>= 1;
+            }
+
+            w->holding_zero = 0;
+        } else {
+            put_bits(pb, w->holding_one, (1 << w->holding_one) - 1);
+        }
+
+        w->holding_one = 0;
+    }
+
+    if (w->holding_zero) {
+        put_bits(pb, 1, 0);
+        w->holding_zero = 0;
+    }
+
+    if (w->pend_count) {
+        put_bits(pb, w->pend_count, w->pend_data);
+        w->pend_data = w->pend_count = 0;
+    }
+}
+
+static void wavpack_encode_sample(WavPackEncodeContext *s, WvChannel *c, int32_t sample)
+{
+    WavPackWords *w = &s->w;
+    uint32_t ones_count, low, high;
+    int sign = sample < 0;
+
+    if (s->w.c[0].median[0] < 2 && !s->w.holding_zero && s->w.c[1].median[0] < 2) {
+        if (w->zeros_acc) {
+            if (sample)
+                encode_flush(s);
+            else {
+                w->zeros_acc++;
+                return;
+            }
+        } else if (sample) {
+            put_bits(&s->pb, 1, 0);
+        } else {
+            CLEAR(s->w.c[0].median);
+            CLEAR(s->w.c[1].median);
+            w->zeros_acc = 1;
+            return;
+        }
+    }
+
+    if (sign)
+        sample = ~sample;
+
+    if (sample < (int32_t) GET_MED(0)) {
+        ones_count = low = 0;
+        high = GET_MED(0) - 1;
+        DEC_MED(0);
+    } else {
+        low = GET_MED(0);
+        INC_MED(0);
+
+        if (sample - low < GET_MED(1)) {
+            ones_count = 1;
+            high = low + GET_MED(1) - 1;
+            DEC_MED(1);
+        } else {
+            low += GET_MED(1);
+            INC_MED(1);
+
+            if (sample - low < GET_MED(2)) {
+                ones_count = 2;
+                high = low + GET_MED(2) - 1;
+                DEC_MED(2);
+            } else {
+                ones_count = 2 + (sample - low) / GET_MED(2);
+                low += (ones_count - 2) * GET_MED(2);
+                high = low + GET_MED(2) - 1;
+                INC_MED(2);
+            }
+        }
+    }
+
+    if (w->holding_zero) {
+        if (ones_count)
+            w->holding_one++;
+
+        encode_flush(s);
+
+        if (ones_count) {
+            w->holding_zero = 1;
+            ones_count--;
+        } else
+            w->holding_zero = 0;
+    } else
+        w->holding_zero = 1;
+
+    w->holding_one = ones_count * 2;
+
+    if (high != low) {
+        uint32_t maxcode = high - low, code = sample - low;
+        int bitcount = count_bits(maxcode);
+        uint32_t extras = (1 << bitcount) - maxcode - 1;
+
+        if (code < extras) {
+            w->pend_data |= code << w->pend_count;
+            w->pend_count += bitcount - 1;
+        } else {
+            w->pend_data |= ((code + extras) >> 1) << w->pend_count;
+            w->pend_count += bitcount - 1;
+            w->pend_data |= ((code + extras) & 1) << w->pend_count++;
+        }
+    }
+
+    w->pend_data |= ((int32_t) sign << w->pend_count++);
+
+    if (!w->holding_zero)
+        encode_flush(s);
+}
+
+static void pack_int32(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    const int sent_bits = s->int32_sent_bits;
+    PutBitContext *pb = &s->pb;
+    int i, pre_shift;
+
+    pre_shift = s->int32_zeros + s->int32_ones + s->int32_dups;
+
+    if (!sent_bits)
+        return;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            put_sbits(pb, sent_bits, samples_l[i] >> pre_shift);
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            put_sbits(pb, sent_bits, samples_l[i] >> pre_shift);
+            put_sbits(pb, sent_bits, samples_r[i] >> pre_shift);
+        }
+    }
+}
+
+static void pack_float_sample(WavPackEncodeContext *s, int32_t *sample)
+{
+    const int max_exp = s->float_max_exp;
+    PutBitContext *pb = &s->pb;
+    int32_t value, shift_count;
+
+    if (get_exponent(*sample) == 255) {
+        if (get_mantissa(*sample)) {
+            put_bits(pb, 1, 1);
+            put_bits(pb, 23, get_mantissa(*sample));
+        } else {
+            put_bits(pb, 1, 0);
+        }
+
+        value = 0x1000000;
+        shift_count = 0;
+    } else if (get_exponent(*sample)) {
+        shift_count = max_exp - get_exponent(*sample);
+        value = 0x800000 + get_mantissa(*sample);
+    } else {
+        shift_count = max_exp ? max_exp - 1 : 0;
+        value = get_mantissa(*sample);
+    }
+
+    if (shift_count < 25)
+        value >>= shift_count;
+    else
+        value = 0;
+
+    if (!value) {
+        if (s->float_flags & FLOAT_ZEROS_SENT) {
+            if (get_exponent(*sample) || get_mantissa(*sample)) {
+                put_bits(pb, 1, 1);
+                put_bits(pb, 23, get_mantissa(*sample));
+
+                if (max_exp >= 25)
+                    put_bits(pb, 8, get_exponent(*sample));
+
+                put_bits(pb, 1, get_sign(*sample));
+            } else {
+                put_bits(pb, 1, 0);
+
+                if (s->float_flags & FLOAT_NEG_ZEROS)
+                    put_bits(pb, 1, get_sign(*sample));
+            }
+        }
+    } else if (shift_count) {
+        if (s->float_flags & FLOAT_SHIFT_SENT) {
+            put_sbits(pb, shift_count, get_mantissa(*sample));
+        } else if (s->float_flags & FLOAT_SHIFT_SAME) {
+            put_bits(pb, 1, get_mantissa(*sample) & 1);
+        }
+    }
+}
+
+static void pack_float(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    int i;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            pack_float_sample(s, &samples_l[i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            pack_float_sample(s, &samples_l[i]);
+            pack_float_sample(s, &samples_r[i]);
+        }
+    }
+}
+
+static void decorr_stereo_pass2(struct Decorr *dpp,
+                                int32_t *samples_l, int32_t *samples_r,
+                                int nb_samples)
+{
+    int i, m, k;
+
+    switch (dpp->value) {
+    case 17:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default:
+        for (m = 0, k = dpp->value & (MAX_TERM - 1), i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            samples_l[i] = tmp = (dpp->samplesA[k] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            samples_r[i] = tmp = (dpp->samplesB[k] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+
+            memcpy(temp_A, dpp->samplesA, sizeof (dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof (dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    case -1:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            samples_l[i] = tmp = (sam_B = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            samples_r[i] = tmp = (dpp->samplesA[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            samples_r[i] = tmp = (sam_A = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            samples_l[i] = tmp = (dpp->samplesB[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = samples_r[i];
+            samples_r[i] = tmp -= APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = samples_l[i];
+            samples_l[i] = tmp -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+#define update_weight_d2(weight, delta, source, result) \
+    if (source && result) \
+        weight -= (((source ^ result) >> 29) & 4) - 2;
+
+#define update_weight_clip_d2(weight, delta, source, result) \
+    if (source && result) { \
+        const int32_t s = (source ^ result) >> 31; \
+        if ((weight = (weight ^ s) + (2 - s)) > 1024) weight = 1024; \
+        weight = (weight ^ s) - s; \
+    }
+
+static void decorr_stereo_pass_id2(struct Decorr *dpp,
+                                   int32_t *samples_l, int32_t *samples_r,
+                                   int nb_samples)
+{
+    int i, m, k;
+
+    switch (dpp->value) {
+    case 17:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default:
+        for (m = 0, k = dpp->value & (MAX_TERM - 1), i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            samples_l[i] = tmp = (dpp->samplesA[k] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            samples_r[i] = tmp = (dpp->samplesB[k] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    case -1:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            samples_l[i] = tmp = (sam_B = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            samples_r[i] = tmp = (dpp->samplesA[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            samples_r[i] = tmp = (sam_A = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            samples_l[i] = tmp = (dpp->samplesB[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = samples_r[i];
+            samples_r[i] = tmp -= APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = samples_l[i];
+            samples_l[i] = tmp -= APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+static void put_metadata_block(PutByteContext *pb, int flags, int size)
+{
+    if (size & 1)
+        flags |= WP_IDF_ODD;
+
+    bytestream2_put_byte(pb, flags);
+    bytestream2_put_byte(pb, (size + 1) >> 1);
+}
+
+static int wavpack_encode_block(WavPackEncodeContext *s,
+                                int32_t *samples_l, int32_t *samples_r,
+                                uint8_t *out, int out_size)
+{
+    int block_size, start, end, data_size, tcount, temp, m = 0;
+    int i, j, ret = 0, got_extra = 0, nb_samples = s->block_samples;
+    uint32_t crc = 0xffffffffu;
+    struct Decorr *dpp;
+    PutByteContext pb;
+
+    if (s->flags & WV_MONO_DATA) {
+        CLEAR(s->w);
+    }
+    if (!(s->flags & WV_MONO) && s->optimize_mono) {
+        int32_t lor = 0, diff = 0;
+
+        for (i = 0; i < nb_samples; i++) {
+            lor  |= samples_l[i] | samples_r[i];
+            diff |= samples_l[i] - samples_r[i];
+
+            if (lor && diff)
+                break;
+        }
+
+        if (i == nb_samples && lor && !diff) {
+            s->flags &= ~(WV_JOINT_STEREO | WV_CROSS_DECORR);
+            s->flags |= WV_FALSE_STEREO;
+
+            if (!s->false_stereo) {
+                s->false_stereo = 1;
+                s->num_terms = 0;
+                CLEAR(s->w);
+            }
+        } else if (s->false_stereo) {
+            s->false_stereo = 0;
+            s->num_terms = 0;
+            CLEAR(s->w);
+        }
+    }
+
+    if (s->flags & SHIFT_MASK) {
+        int shift = (s->flags & SHIFT_MASK) >> SHIFT_LSB;
+        int mag = (s->flags & MAG_MASK) >> MAG_LSB;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, shift);
+
+        if ((mag -= shift) < 0)
+            s->flags &= ~MAG_MASK;
+        else
+            s->flags -= (1 << MAG_LSB) * shift;
+    }
+
+    if ((s->flags & WV_FLOAT_DATA) || (s->flags & MAG_MASK) >> MAG_LSB >= 24) {
+        av_fast_padded_malloc(&s->orig_l, &s->orig_l_size, sizeof(int32_t) * nb_samples);
+        memcpy(s->orig_l, samples_l, sizeof(int32_t) * nb_samples);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->orig_r, &s->orig_r_size, sizeof(int32_t) * nb_samples);
+            memcpy(s->orig_r, samples_r, sizeof(int32_t) * nb_samples);
+        }
+
+        if (s->flags & WV_FLOAT_DATA)
+            got_extra = scan_float(s, samples_l, samples_r, nb_samples);
+        else
+            got_extra = scan_int32(s, samples_l, samples_r, nb_samples);
+        s->num_terms = 0;
+    } else {
+        scan_int23(s, samples_l, samples_r, nb_samples);
+        if (s->shift != s->int32_zeros + s->int32_ones + s->int32_dups) {
+            s->shift = s->int32_zeros + s->int32_ones + s->int32_dups;
+            s->num_terms = 0;
+        }
+    }
+
+    if (!s->num_passes && !s->num_terms) {
+        s->num_passes = 1;
+
+        if (s->flags & WV_MONO_DATA)
+            ret = wv_mono(s, samples_l, 1, 0);
+        else
+            ret = wv_stereo(s, samples_l, samples_r, 1, 0);
+
+        s->num_passes = 0;
+    }
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            crc += (crc << 1) + samples_l[i];
+
+        if (s->num_passes)
+            ret = wv_mono(s, samples_l, !s->num_terms, 1);
+    } else {
+        for (i = 0; i < nb_samples; i++)
+            crc += (crc << 3) + (samples_l[i] << 1) + samples_l[i] + samples_r[i];
+
+        if (s->num_passes)
+            ret = wv_stereo(s, samples_l, samples_r, !s->num_terms, 1);
+    }
+    if (ret < 0)
+        return ret;
+
+    if (!s->ch_offset)
+        s->flags |= WV_INITIAL_BLOCK;
+
+    s->ch_offset += 1 + !(s->flags & WV_MONO);
+
+    if (s->ch_offset == s->avctx->channels)
+        s->flags |= WV_FINAL_BLOCK;
+
+    bytestream2_init_writer(&pb, out, out_size);
+    bytestream2_put_le32(&pb, MKTAG('w', 'v', 'p', 'k'));
+    bytestream2_put_le32(&pb, 0);
+    bytestream2_put_le16(&pb, 0x410);
+    bytestream2_put_le16(&pb, 0);
+    bytestream2_put_le32(&pb, 0);
+    bytestream2_put_le32(&pb, s->sample_index);
+    bytestream2_put_le32(&pb, nb_samples);
+    bytestream2_put_le32(&pb, s->flags);
+    bytestream2_put_le32(&pb, crc);
+
+    if (s->flags & WV_INITIAL_BLOCK &&
+        s->avctx->channel_layout != AV_CH_LAYOUT_MONO &&
+        s->avctx->channel_layout != AV_CH_LAYOUT_STEREO) {
+        put_metadata_block(&pb, WP_ID_CHANINFO, 5);
+        bytestream2_put_byte(&pb, s->avctx->channels);
+        bytestream2_put_le32(&pb, s->avctx->channel_layout);
+        bytestream2_put_byte(&pb, 0);
+    }
+
+    if ((s->flags & SRATE_MASK) == SRATE_MASK) {
+        put_metadata_block(&pb, WP_ID_SAMPLE_RATE, 3);
+        bytestream2_put_le24(&pb, s->avctx->sample_rate);
+        bytestream2_put_byte(&pb, 0);
+    }
+
+    put_metadata_block(&pb, WP_ID_DECTERMS, s->num_terms);
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        bytestream2_put_byte(&pb, ((dpp->value + 5) & 0x1f) | ((dpp->delta << 5) & 0xe0));
+    }
+    if (s->num_terms & 1)
+        bytestream2_put_byte(&pb, 0);
+
+#define WRITE_DECWEIGHT(type) do {            \
+        temp = store_weight(type);    \
+        bytestream2_put_byte(&pb, temp);      \
+        type = restore_weight(temp);  \
+    } while (0)
+
+    bytestream2_put_byte(&pb, WP_ID_DECWEIGHTS);
+    bytestream2_put_byte(&pb, 0);
+    start = bytestream2_tell_p(&pb);
+    for (i = s->num_terms - 1; i >= 0; --i) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+
+        if (store_weight(dpp->weightA) ||
+            (!(s->flags & WV_MONO_DATA) && store_weight(dpp->weightB)))
+                break;
+    }
+    tcount = i + 1;
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        if (i < tcount) {
+            WRITE_DECWEIGHT(dpp->weightA);
+            if (!(s->flags & WV_MONO_DATA))
+                WRITE_DECWEIGHT(dpp->weightB);
+        } else {
+            dpp->weightA = dpp->weightB = 0;
+        }
+    }
+    end = bytestream2_tell_p(&pb);
+    out[start - 2] = WP_ID_DECWEIGHTS | (((end - start) & 1) ? WP_IDF_ODD: 0);
+    out[start - 1] = (end - start + 1) >> 1;
+    if ((end - start) & 1)
+        bytestream2_put_byte(&pb, 0);
+
+#define WRITE_DECSAMPLE(type) do {        \
+        temp = log2s(type);               \
+        type = wp_exp2(temp);             \
+        bytestream2_put_le16(&pb, temp);  \
+    } while (0)
+
+    bytestream2_put_byte(&pb, WP_ID_DECSAMPLES);
+    bytestream2_put_byte(&pb, 0);
+    start = bytestream2_tell_p(&pb);
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        if (i == 0) {
+            if (dpp->value > MAX_TERM) {
+                WRITE_DECSAMPLE(dpp->samplesA[0]);
+                WRITE_DECSAMPLE(dpp->samplesA[1]);
+                if (!(s->flags & WV_MONO_DATA)) {
+                    WRITE_DECSAMPLE(dpp->samplesB[0]);
+                    WRITE_DECSAMPLE(dpp->samplesB[1]);
+                }
+            } else if (dpp->value < 0) {
+                WRITE_DECSAMPLE(dpp->samplesA[0]);
+                WRITE_DECSAMPLE(dpp->samplesB[0]);
+            } else {
+                for (j = 0; j < dpp->value; j++) {
+                    WRITE_DECSAMPLE(dpp->samplesA[j]);
+                    if (!(s->flags & WV_MONO_DATA))
+                        WRITE_DECSAMPLE(dpp->samplesB[j]);
+                }
+            }
+        } else {
+            CLEAR(dpp->samplesA);
+            CLEAR(dpp->samplesB);
+        }
+    }
+    end = bytestream2_tell_p(&pb);
+    out[start - 1] = (end - start) >> 1;
+
+#define WRITE_CHAN_ENTROPY(chan) do {               \
+        for (i = 0; i < 3; i++) {                   \
+            temp = wp_log2(s->w.c[chan].median[i]); \
+            bytestream2_put_le16(&pb, temp);        \
+            s->w.c[chan].median[i] = wp_exp2(temp); \
+        }                                           \
+    } while (0)
+
+    put_metadata_block(&pb, WP_ID_ENTROPY, 6 * (1 + (!(s->flags & WV_MONO_DATA))));
+    WRITE_CHAN_ENTROPY(0);
+    if (!(s->flags & WV_MONO_DATA))
+        WRITE_CHAN_ENTROPY(1);
+
+    if (s->flags & WV_FLOAT_DATA) {
+        put_metadata_block(&pb, WP_ID_FLOATINFO, 4);
+        bytestream2_put_byte(&pb, s->float_flags);
+        bytestream2_put_byte(&pb, s->float_shift);
+        bytestream2_put_byte(&pb, s->float_max_exp);
+        bytestream2_put_byte(&pb, 127);
+    }
+
+    if (s->flags & WV_INT32_DATA) {
+        put_metadata_block(&pb, WP_ID_INT32INFO, 4);
+        bytestream2_put_byte(&pb, s->int32_sent_bits);
+        bytestream2_put_byte(&pb, s->int32_zeros);
+        bytestream2_put_byte(&pb, s->int32_ones);
+        bytestream2_put_byte(&pb, s->int32_dups);
+    }
+
+    if (s->flags & WV_MONO_DATA && !s->num_passes) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t code = samples_l[i];
+
+            for (tcount = s->num_terms, dpp = s->decorr_passes; tcount--; dpp++) {
+                int32_t sam;
+
+                if (dpp->value > MAX_TERM) {
+                    if (dpp->value & 1)
+                        sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+                    else
+                        sam = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+                    dpp->samplesA[1] = dpp->samplesA[0];
+                    dpp->samplesA[0] = code;
+                } else {
+                    sam = dpp->samplesA[m];
+                    dpp->samplesA[(m + dpp->value) & (MAX_TERM - 1)] = code;
+                }
+
+                code -= APPLY_WEIGHT(dpp->weightA, sam);
+                UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, code);
+            }
+
+            m = (m + 1) & (MAX_TERM - 1);
+            samples_l[i] = code;
+        }
+        if (m) {
+            for (tcount = s->num_terms, dpp = s->decorr_passes; tcount--; dpp++)
+                if (dpp->value > 0 && dpp->value <= MAX_TERM) {
+                int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+                int k;
+
+                memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+                memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+                for (k = 0; k < MAX_TERM; k++) {
+                    dpp->samplesA[k] = temp_A[m];
+                    dpp->samplesB[k] = temp_B[m];
+                    m = (m + 1) & (MAX_TERM - 1);
+                }
+            }
+        }
+    } else if (!s->num_passes) {
+        if (s->flags & WV_JOINT_STEREO) {
+            for (i = 0; i < nb_samples; i++)
+                samples_r[i] += ((samples_l[i] -= samples_r[i]) >> 1);
+        }
+
+        for (i = 0; i < s->num_terms; i++) {
+            struct Decorr *dpp = &s->decorr_passes[i];
+            if (((s->flags & MAG_MASK) >> MAG_LSB) >= 16 || dpp->delta != 2)
+                decorr_stereo_pass2(dpp, samples_l, samples_r, nb_samples);
+            else
+                decorr_stereo_pass_id2(dpp, samples_l, samples_r, nb_samples);
+        }
+    }
+
+    bytestream2_put_byte(&pb, WP_ID_DATA | WP_IDF_LONG);
+    init_put_bits(&s->pb, pb.buffer + 3, bytestream2_get_bytes_left_p(&pb));
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            wavpack_encode_sample(s, &s->w.c[0], s->samples[0][i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            wavpack_encode_sample(s, &s->w.c[0], s->samples[0][i]);
+            wavpack_encode_sample(s, &s->w.c[1], s->samples[1][i]);
+        }
+    }
+    encode_flush(s);
+    flush_put_bits(&s->pb);
+    data_size = put_bits_count(&s->pb) >> 3;
+    bytestream2_put_le24(&pb, (data_size + 1) >> 1);
+    bytestream2_skip_p(&pb, data_size);
+    if (data_size & 1)
+        bytestream2_put_byte(&pb, 0);
+
+    if (got_extra) {
+        bytestream2_put_byte(&pb, WP_ID_EXTRABITS | WP_IDF_LONG);
+        init_put_bits(&s->pb, pb.buffer + 7, bytestream2_get_bytes_left_p(&pb));
+        if (s->flags & WV_FLOAT_DATA)
+            pack_float(s, s->orig_l, s->orig_r, nb_samples);
+        else
+            pack_int32(s, s->orig_l, s->orig_r, nb_samples);
+        flush_put_bits(&s->pb);
+        data_size = put_bits_count(&s->pb) >> 3;
+        bytestream2_put_le24(&pb, (data_size + 5) >> 1);
+        bytestream2_put_le32(&pb, s->crc_x);
+        bytestream2_skip_p(&pb, data_size);
+        if (data_size & 1)
+            bytestream2_put_byte(&pb, 0);
+    }
+
+    block_size = bytestream2_tell_p(&pb);
+    AV_WL32(out + 4, block_size - 8);
+
+    av_assert0(!bytestream2_get_eof(&pb));
+
+    return block_size;
+}
+
+static void fill_buffer(WavPackEncodeContext *s,
+                        const int8_t *src, int32_t *dst,
+                        int nb_samples)
+{
+    int i;
+
+#define COPY_SAMPLES(type, offset, shift) do {            \
+        const type *sptr = (const type *)src;             \
+        for (i = 0; i < nb_samples; i++)                  \
+            dst[i] = (sptr[i] - offset) >> shift;         \
+    } while (0)
+
+    switch (s->avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_U8P:
+        COPY_SAMPLES(int8_t, 0x80, 0);
+        break;
+    case AV_SAMPLE_FMT_S16P:
+        COPY_SAMPLES(int16_t, 0, 0);
+        break;
+    case AV_SAMPLE_FMT_S32P:
+        if (s->avctx->bits_per_raw_sample <= 24) {
+            COPY_SAMPLES(int32_t, 0, 8);
+            break;
+        }
+    case AV_SAMPLE_FMT_FLTP:
+        memcpy(dst, src, nb_samples * 4);
+    }
+}
+
+static void set_samplerate(WavPackEncodeContext *s)
+{
+    int i;
+
+    for (i = 0; i < 15; i++) {
+        if (wv_rates[i] == s->avctx->sample_rate)
+            break;
+    }
+
+    s->flags = i << SRATE_LSB;
+}
+
+static int wavpack_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                const AVFrame *frame, int *got_packet_ptr)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+    int buf_size, ret;
+    uint8_t *buf;
+
+    s->block_samples = frame->nb_samples;
+    av_fast_padded_malloc(&s->samples[0], &s->samples_size[0],
+                          sizeof(int32_t) * s->block_samples);
+    if (!s->samples[0])
+        return AVERROR(ENOMEM);
+    if (avctx->channels > 1) {
+        av_fast_padded_malloc(&s->samples[1], &s->samples_size[1],
+                              sizeof(int32_t) * s->block_samples);
+        if (!s->samples[1])
+            return AVERROR(ENOMEM);
+    }
+
+    buf_size = s->block_samples * avctx->channels * 8
+             + 200 * avctx->channels /* for headers */;
+    if ((ret = ff_alloc_packet2(avctx, avpkt, buf_size, 0)) < 0)
+        return ret;
+    buf = avpkt->data;
+
+    for (s->ch_offset = 0; s->ch_offset < avctx->channels;) {
+        set_samplerate(s);
+
+        switch (s->avctx->sample_fmt) {
+        case AV_SAMPLE_FMT_S16P: s->flags |= 1; break;
+        case AV_SAMPLE_FMT_S32P: s->flags |= 3 - (s->avctx->bits_per_raw_sample <= 24); break;
+        case AV_SAMPLE_FMT_FLTP: s->flags |= 3 | WV_FLOAT_DATA;
+        }
+
+        fill_buffer(s, frame->extended_data[s->ch_offset], s->samples[0], s->block_samples);
+        if (avctx->channels - s->ch_offset == 1) {
+            s->flags |= WV_MONO;
+        } else {
+            s->flags |= WV_CROSS_DECORR;
+            fill_buffer(s, frame->extended_data[s->ch_offset + 1], s->samples[1], s->block_samples);
+        }
+
+        s->flags += (1 << MAG_LSB) * ((s->flags & 3) * 8 + 7);
+
+        if ((ret = wavpack_encode_block(s, s->samples[0], s->samples[1],
+                                        buf, buf_size)) < 0)
+            return ret;
+
+        buf      += ret;
+        buf_size -= ret;
+    }
+    s->sample_index += frame->nb_samples;
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = buf - avpkt->data;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static av_cold int wavpack_encode_close(AVCodecContext *avctx)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < MAX_TERMS + 2; i++) {
+        av_freep(&s->sampleptrs[i][0]);
+        av_freep(&s->sampleptrs[i][1]);
+        s->sampleptrs_size[i][0] = s->sampleptrs_size[i][1] = 0;
+    }
+
+    for (i = 0; i < 2; i++) {
+        av_freep(&s->samples[i]);
+        s->samples_size[i] = 0;
+
+        av_freep(&s->best_buffer[i]);
+        s->best_buffer_size[i] = 0;
+
+        av_freep(&s->temp_buffer[i][0]);
+        av_freep(&s->temp_buffer[i][1]);
+        s->temp_buffer_size[i][0] = s->temp_buffer_size[i][1] = 0;
+    }
+
+    av_freep(&s->js_left);
+    av_freep(&s->js_right);
+    s->js_left_size = s->js_right_size = 0;
+
+    av_freep(&s->orig_l);
+    av_freep(&s->orig_r);
+    s->orig_l_size = s->orig_r_size = 0;
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(WavPackEncodeContext, x)
+#define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+static const AVOption options[] = {
+    { "joint_stereo",  "", OFFSET(joint), AV_OPT_TYPE_BOOL, {.i64=-1}, -1, 1, FLAGS },
+    { "optimize_mono", "", OFFSET(optimize_mono), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
+    { NULL },
+};
+
+static const AVClass wavpack_encoder_class = {
+    .class_name = "WavPack encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_wavpack_encoder = {
+    .name           = "wavpack",
+    .long_name      = NULL_IF_CONFIG_SMALL("WavPack"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_WAVPACK,
+    .priv_data_size = sizeof(WavPackEncodeContext),
+    .priv_class     = &wavpack_encoder_class,
+    .init           = wavpack_encode_init,
+    .encode2        = wavpack_encode_frame,
+    .close          = wavpack_encode_close,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_U8P,
+                                                     AV_SAMPLE_FMT_S16P,
+                                                     AV_SAMPLE_FMT_S32P,
+                                                     AV_SAMPLE_FMT_FLTP,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/wavpackenc.h b/libavcodec/wavpackenc.h
new file mode 100644
index 0000000..9dd2a01
--- /dev/null
+++ b/libavcodec/wavpackenc.h
@@ -0,0 +1,664 @@
+/*
+ * WavPack lossless audio encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_WAVPACKENC_H
+#define AVCODEC_WAVPACKENC_H
+
+#include "wavpack.h"
+
+typedef struct WavPackDecorrSpec {
+    int8_t joint_stereo, delta, terms[MAX_TERMS+1];
+} WavPackDecorrSpec;
+
+static const WavPackDecorrSpec fast_specs[] = {
+ { 1, 2, { 18,17 } }, { 1, 1, { 17,17 } }, { 0, 2, { 18,17 } },
+ { 0, 1, { 17,17 } }, { 1, 3, {  1,18 } }, { 1, 1, { 17, 1 } },
+ { 0, 1, {  1,17 } }, { 0, 1, { -2,17 } }, { 0, 2, { -1,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 3, { 18,18 } }, { 0, 1, { 17, 1 } },
+ { 1, 6, {  1, 2 } }, { 1, 1, { 17, 3 } }, { 0, 1, { -2, 3 } },
+ { 0, 1, {  2,17 } }, { 0, 1, { 18,-2 } }, { 0, 1, { -1,17 } },
+ { 0, 1, { 18,17 } }, { 0, 1, { 17, 2 } }, { 1, 2, { 18,-2 } },
+ { 1, 1, {  1,17 } }, { 0, 3, { 18, 2 } }, { 0, 1, { 17,-2 } },
+ { 0, 1, { 18,-2 } }, { 1, 2, { 17,-3 } }, { 0, 1, { 18, 3 } },
+ { 0, 1, { 18,18 } }, { 1, 1, {  1, 3 } }, { 1, 1, { 18, 3 } },
+ { 1, 1, {  1, 3 } }, { 0, 2, { 18,17 } }, { 1, 1, {  1,17 } },
+ { 1, 1, { 17, 3 } }, { 0, 3, { 18,17 } }, { 0, 1, { 18,18 } },
+ { 1, 1, {  1, 3 } }, { 1, 1, {  1,18 } }, { 0, 1, { 18,-2 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { -1,18 } }, { 1, 1, { 17, 3 } },
+ { 0, 1, { 17, 2 } }, { 0, 1, { 17, 3 } }, { 1, 1, { 18, 2 } },
+ { 1, 1, { 17,-2 } }, { 0, 1, {  1,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 1, { 17,-2 } }, { 1, 1, { 17,-2 } }, { 0, 1, { 18, 3 } },
+ { 0, 1, {  2,17 } }, { 1, 2, { 18,-3 } }, { 1, 2, {  1,18 } },
+ { 1, 2, { 18, 2 } }, { 0, 1, { 17,-1 } }, { 0, 1, { 17,-2 } },
+ { 1, 1, { 17,-2 } }, { 1, 1, {  1, 3 } }, { 0, 1, {  1,17 } },
+ { 1, 2, { 18,-2 } }, { 1, 2, { 17,-3 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17, 2 } }, { 1, 2, { 18,18 } },
+ { 0, 1, { 17, 2 } }, { 0, 1, { 18,17 } }, { 1, 1, {  1,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,18 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 6, {  1, 2 } }, { 0, 3, { 17,17 } },
+ { 0, 1, {  1,18 } }, { 0, 1, {  1,-2 } }, { 1, 1, { 17, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 1, { 18, 3 } },
+ { 1, 2, { 17,-3 } }, { 0, 1, { 17, 2 } }, { 0, 1, { 17, 3 } },
+ { 0, 1, { 18,-2 } }, { 1, 1, { 18,18 } }, { 1, 6, {  1, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -1,17 } },
+ { 1, 1, { 18, 3 } }, { 0, 1, { 17,18 } }, { 1, 1, { 17, 3 } },
+ { 0, 1, { 18, 3 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 18, 2 } }, { 0, 1, { -2, 3 } }, { 0, 4, { 18,-1 } },
+ { 0, 2, { 18,18 } }, { 0, 1, { -2, 3 } }, { 1, 1, { 17,-2 } },
+ { 0, 1, { 17, 3 } }, { 0, 2, { 18,17 } }, { 0, 2, { -1,18 } },
+ { 1, 1, {  2,17 } }, { 0, 2, { 17,-2 } }, { 0, 1, { 17, 2 } },
+ { 1, 2, { 18,-3 } }, { 0, 1, { 17,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17,-2 } }, { 1, 2, { 17,-3 } },
+ { 1, 1, {  1, 3 } }, { 1, 1, {  2,17 } }, { 1, 2, { 18, 2 } },
+ { 1, 1, {  2,17 } }, { 1, 1, { 18, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { 17,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 17,-1 } }, { 0, 2, { 18,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 1, {  1, 3 } },
+ { 0, 2, { -2,17 } }, { 0, 2, { 18,-2 } }, { 0, 2, { 17,-2 } },
+ { 1, 1, {  2,17 } }, { 1, 1, {  1, 3 } }, { 0, 1, {  2,17 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { -1,17 } }, { 1, 1, {  2,17 } },
+ { 0, 2, { 18,18 } }, { 0, 1, { 17, 2 } }, { 1, 4, { 18,-3 } },
+ { 1, 1, { 18, 1 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 18,-1 } }, { 0, 1, { -1,18 } }, { 1, 6, {  1, 2 } },
+ { 1, 1, { 17, 2 } }, { 1, 4, { 18, 3 } }, { 0, 1, {  1,17 } },
+ { 0, 1, { 18, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17, 2 } }, { 0, 2, { 18,-2 } }, { 0, 1, {  1,18 } },
+ { 1, 2, { 18,-3 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 2, { 18,18 } }, { 1, 3, { 17,17 } },
+ { 0, 1, { -2,17 } }, { 0, 1, { 17,18 } }, { 0, 1, { -1, 3 } },
+ { 1, 1, {  2,17 } }, { 0, 2, { 18,-1 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17,-2 } }, { 1, 2, { 17, 2 } },
+ { 1, 1, { 18, 3 } }, { 0, 1, { 18, 2 } }, { 1, 2, { 17,-3 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -2,17 } },
+ { 0, 1, { 17,-1 } }, { 0, 1, { 18,-1 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 1, {  1,18 } }, { 1, 3, { 18, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { 18,18 } }, { 0, 1, {  1,-2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 1, { 18,18 } }, { 0, 2, { 18, 2 } },
+ { 0, 1, { 17,18 } }, { 1, 2, { 18, 2 } }, { 1, 1, { 17,-2 } },
+ { 0, 2, { 17,-1 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 1, {  1,-2 } }, { 0, 1, { 18, 1 } },
+ { 1, 2, { 18,-2 } }, { 0, 1, { 17, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17, 3 } }, { 0, 1, { 17,-1 } },
+ { 0, 1, { 18, 2 } }, { 1, 1, { 17, 3 } }, { 1, 1, { 17,-2 } },
+ { 0, 1, { 18,18 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17,18 } }, { 0, 1, { -2, 3 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 2, { 18,-3 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { 18, 2 } }, { 0, 1, {  1,18 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { 17,-1 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -2, 3 } },
+ { 0, 3, { 17,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } },
+};
+
+static const WavPackDecorrSpec default_specs[] = {
+ { 1, 2, { 18,18, 2,17, 3 } }, { 0, 2, { 18,17,-1, 3, 2 } },
+ { 1, 1, { 17,18,18,-2, 2 } }, { 0, 2, { 18,17, 3,-2,17 } },
+ { 1, 2, { 18,17, 2,17, 3 } }, { 0, 1, { 18,18,-1, 2,17 } },
+ { 0, 1, { 17,17,-2, 2, 3 } }, { 0, 1, { 18,-2,18, 2,17 } },
+ { 1, 2, { 18,18,-1, 2, 3 } }, { 0, 2, { 18,17, 3, 2, 5 } },
+ { 1, 1, { 18,17,18, 2, 5 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 0, 1, { 17,-2,17, 2,-3 } },
+ { 1, 1, { 17,-2,17, 1, 2 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 0, 2, { 18,17, 3, 2, 5 } },
+ { 0, 1, { 18,18,18, 2,17 } }, { 0, 1, { 18,17,-1, 2,18 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 0, 2, { 18,-2,18, 2, 3 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 0, 3, { 18,17, 2, 3,17 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 0, 1, { 17,18,-2, 2,17 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 0, 1, { 18,17, 1, 4, 6 } }, { 1, 1, {  3,17,18, 2,17 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 0, 1, { 18,17,-1, 2, 3 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 1, 2, { 18,17,-1,17, 3 } },
+ { 1, 2, { 18,17, 2, 3,-1 } }, { 0, 2, { 18,18,-2, 2,17 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 5, { -2,18,18,18, 2 } },
+ { 1, 1, { 18,18,-1, 6, 3 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 1, 1, { 18,17,18, 2,17 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 0, 1, { -2,18, 2, 2,18 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 0, 1, { 17,18,-2, 2,17 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 1, 3, { 18,-3,18, 2, 3 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 1, 1, { 18,18, 3, 5, 2 } },
+ { 0, 2, { 18,18,-1, 2,17 } }, { 0, 1, { 18,-1,17,18, 2 } },
+ { 0, 1, { 17,-1, 2, 3, 6 } }, { 0, 1, { 18,-2,18, 2, 5 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 0, 3, { 18,18, 2, 3,17 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 0, 1, { -1, 3, 5, 4, 7 } }, { 0, 3, { 18,18, 3, 2, 5 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 1, { 18,17,-2,18, 3 } },
+ { 0, 2, { 18,18,-2, 2,17 } }, { 0, 3, { 18,17,-2, 2, 3 } },
+ { 1, 1, { 18,18,-2, 2,17 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 1, 2, {  3,18,17, 2,17 } }, { 1, 2, { 18,18, 2,-2,18 } },
+ { 1, 2, { 18,18,-1,18, 2 } }, { 0, 2, { 18,18,-2, 2,17 } },
+ { 1, 3, { 18,18, 2, 3,-2 } }, { 0, 3, { 18,18, 3, 2, 5 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 1, 1, { 17, 3, 2, 1, 7 } },
+ { 1, 3, { 18,18,-2, 2,18 } }, { 1, 1, { 17,18,18,-2, 2 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 0, 2, { 18,-2,18, 2, 3 } },
+ { 0, 1, { -1, 3, 4, 5, 7 } }, { 1, 1, { 17,17, 2,-1, 7 } },
+ { 0, 1, { 18,-1,-1, 2,-2 } }, { 0, 2, { 18,17, 2, 3,17 } },
+ { 0, 1, { 18,17, 2,18, 2 } }, { 0, 2, { 18,17,-1, 2,17 } },
+ { 0, 1, {  1,18, 3, 2, 5 } }, { 0, 2, { 18,-2, 4,18, 2 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 1, { 18,17,18, 2, 5 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 4, { 18,18,-2, 2,18 } },
+ { 1, 1, { 18,18, 3, 2, 5 } }, { 1, 1, { 17,17, 2, 1, 4 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18, 2, 1, 3 } }, { 1, 1, { 17,17, 2, 1, 4 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 1, { 18,17, 1, 4, 6 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 0, 1, { 18,-2,18, 2, 5 } },
+ { 1, 1, { 17, 2,18, 2,17 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18, 3, 6,-1 } }, { 0, 1, { 18,17, 2,18, 3 } },
+ { 0, 1, { 18,17,-2, 2,17 } }, { 1, 1, {  3,17,18, 2,17 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 1, 3, { 18,18,-3,18, 2 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 1, { 17,-2,17, 2,-3 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 0, 1, { 18,-1,-1, 2,-2 } }, { 1, 1, { 18, 3, 1, 5, 4 } },
+ { 0, 3, { 18,17,-1, 2,17 } }, { 1, 3, { 18,17, 2,18,-2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 4, {  3,18,18, 2,17 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 1, { 18,17,-1,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18,18, 3, 2 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 0, 1, { 17,-1, 2, 3, 6 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 1, 3, { 18,17, 2,-2,18 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 0, 1, { 18,18, 2,18,-2 } }, { 0, 2, { 18,-2, 4,18, 2 } },
+ { 0, 1, { -2,18, 2, 2,18 } }, { 0, 2, { 18,17, 3, 6, 2 } },
+ { 0, 1, { 18,17,18, 2, 5 } }, { 0, 3, { 18,18,-2, 3, 2 } },
+ { 1, 1, { 18,18, 2,18, 5 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 1, 4, { 18,18, 2, 3,-2 } }, { 0, 2, { 18,17,18, 2,-2 } },
+ { 0, 1, {  1,18, 3, 2, 5 } }, { 1, 4, { 18,-2,18, 2, 3 } },
+ { 1, 2, { 18, 2,18, 3,-2 } }, { 0, 2, { 18,18,18, 2, 4 } },
+ { 0, 2, {  3,17,18, 2,17 } }, { 1, 1, { 18,-1,18, 2,17 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 0, 3, {  3,18,18, 2,17 } },
+ { 0, 1, { 18,-1,17,18, 2 } }, { 0, 1, { 18,17, 2,18, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, { 18,17, 2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 1, { 18,17,-2, 2, 3 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 1, 4, { 18,-2,18, 2, 3 } },
+ { 1, 3, { 18,17, 2, 3, 6 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 0, 2, { 18,17, 2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 0, 2, { 18,18,-2, 2, 3 } },
+ { 1, 2, { 18,17, 2,17, 3 } }, { 0, 1, { 18,17, 2, 3,18 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 1, { 17,-2,17, 2,-3 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 1, 1, { 18,18,18, 2, 4 } }, { 1, 2, { 18, 2,18, 3,-2 } },
+ { 1, 1, { 18,18,-2, 2,17 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18, 2,17, 3 } }, { 0, 2, { 18,18,18, 2, 4 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, { 18,-2,18, 3, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18,-2, 2,17 } }, { 0, 3, { 18,17, 2, 3,17 } },
+ { 1, 2, { 18,18, 2,-2,18 } }, { 0, 1, { -1, 3, 5, 4, 7 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 1, 1, { 18,18,-2,18, 3 } },
+ { 0, 2, { 18,17,18, 2,-2 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 1, 2, { 18, 2,18, 3,-2 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 1, 3, { 18,17, 2, 3, 6 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 18,17,-2,-1,17 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2, 2, 3 } },
+ { 1, 1, { 18,18,18, 2, 5 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,17, 3, 6, 2 } },
+ { 0, 2, { 18,17,18, 2, 3 } }, { 0, 3, { 18,17,-3,18, 2 } },
+ { 0, 1, { 18,18,18, 2, 3 } }, { 0, 1, { 18,-2,-3, 2, 6 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 1, { 18,17,18, 2, 5 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,17,18, 2, 5 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18,18, 2, 3 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 1, 1, { 17,17, 2,-1, 7 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 0, 1, {  1,18, 3, 2, 5 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18, 3, 6, 2 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 18,-2,18, 3, 2 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 1, 1, { 18,18, 3, 2, 5 } }, { 0, 1, { 18,18,-1, 2, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,17,18, 2, 5 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, {  3,18,18, 2,17 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+};
+
+static const WavPackDecorrSpec high_specs[] = {
+ { 1, 2, { 18,18,18,-2, 2, 3, 5,-1,17, 4 } }, { 0, 1, { 18,17,-2, 2,18, 3, 7, 2, 5, 4 } },
+ { 1, 2, {  1,18, 3, 6,-2,18, 2, 3, 4, 5 } }, { 0, 2, { 18,18,-2, 2,18, 3, 6, 2,17, 4 } },
+ { 1, 2, { 18,18, 2,18, 3, 2,-1, 4,18, 5 } }, { 1, 1, {  7, 6, 5, 3, 4, 2, 5, 4, 3, 7 } },
+ { 1, 1, { 17, 3,18, 7, 2, 6, 1, 4, 3, 5 } }, { 1, 1, { -2,18,18,18, 3,-2, 6, 5, 2, 1 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 6,-2,17, 5 } }, { 0, 1, { 17,17,18, 3, 6, 4, 5, 2,18,-2 } },
+ { 1, 2, {  1,18,-2, 3, 5, 2, 4,-1, 6, 1 } }, { 0, 2, { 18,18, 3, 6,18, 2, 4, 8, 5, 3 } },
+ { 0, 1, { -2, 1,18, 2,-2, 7,18, 2,-1, 5 } }, { 1, 1, {  4, 3, 8, 1, 5, 2, 5, 6, 2, 8 } },
+ { 1, 1, { 17,18, 2, 6, 3, 4,-1, 1, 8, 6 } }, { 0, 1, { 18,18, 3, 6, 3,-2, 2, 5,-1, 1 } },
+ { 0, 1, { 18,18,17,-1, 2,-2,18, 3, 4, 5 } }, { 1, 2, { 18,17, 2,-2,18, 3, 5, 7, 2, 4 } },
+ { 1, 2, { 18,18, 3, 6,-2,18, 2, 5, 8, 3 } }, { 0, 1, { 18,17, 2,18,18, 2, 6, 5,17, 7 } },
+ { 1, 2, { 18,17, 2,18, 3, 2, 6,18,-1, 4 } }, { 1, 1, {  5, 3, 6, 5, 3, 4, 1, 2, 4, 7 } },
+ { 1, 1, {  5, 3, 6, 5, 3, 4, 1, 2, 4, 7 } }, { 0, 1, { -2,18,18,18,-2, 3, 2, 4, 6, 5 } },
+ { 1, 2, { 18,17,-3, 3,-1,18, 2, 3, 6, 5 } }, { 0, 1, { 17,18, 7, 3,-2, 7, 1, 2, 4, 5 } },
+ { 1, 1, {  2,18,18,-2, 2, 4,-1,18, 3, 6 } }, { 0, 3, {  1,18, 4, 3, 5, 2, 4,18, 2, 3 } },
+ { 0, 1, { -2,18, 2,18, 3, 7,18, 2, 6,-2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 5, 1, 4, 3 } }, { 1, 1, { 18, 3, 6, 5, 7, 8, 2, 3, 1,-1 } },
+ { 1, 1, { 18,18,18, 2,-2, 3, 5,18, 2, 8 } }, { 0, 2, { 18,17,-2, 2, 3,18,-3, 5, 2, 7 } },
+ { 1, 1, {  1, 1,-1, 8,17, 3,-2, 2, 6,17 } }, { 0, 2, { 18,18,17, 2,-2, 3, 2, 4,18, 5 } },
+ { 1, 1, { 17,18, 2,-1, 5, 7,18, 3, 4, 6 } }, { 1, 1, {  5, 4, 5,17, 3, 6, 3, 4, 7, 2 } },
+ { 0, 1, { 17, 3, 1, 7, 4, 2, 5,-2,18, 6 } }, { 0, 1, { 17,18, 2,18, 4, 3, 5, 7,-3, 6 } },
+ { 1, 2, { 17,17,-3,-2, 2, 8,18,-1, 3, 5 } }, { 0, 1, { 17,17,18, 2, 3, 6,-2, 8, 1, 7 } },
+ { 1, 1, {  1, 2, 6,-2,18, 2, 5,-3, 7,-2 } }, { 0, 1, { 18,18, 3,18, 6, 8,-2, 2, 3, 5 } },
+ { 0, 1, { 18,17, 2,18,-2, 3, 7, 6, 2, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18, 2,-1, 3, 6, 1, 3, 4, 8 } }, { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } },
+ { 0, 1, { 18,17,-3,18, 2, 4,-2, 3, 6,17 } }, { 1, 3, {  1, 2,17, 3,18, 7,-1, 5, 2, 4 } },
+ { 1, 1, { 18, 3,18, 6, 8,18,-2, 5, 7, 2 } }, { 0, 1, { 17, 2,18, 6, 3, 2, 5, 4, 8, 1 } },
+ { 0, 1, { 18,17,-1, 2, 3,18,18, 2, 3,17 } }, { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 4 } },
+ { 1, 1, {  6,17, 3, 8, 1, 5, 7,-1, 2, 1 } }, { 1, 1, { 18,-2,18, 3,-2, 2, 7, 4, 6,18 } },
+ { 1, 3, { 18,-3,18, 2, 3,18,-1, 7, 2, 5 } }, { 0, 2, { 18,-2, 7, 1, 3, 2, 4, 6,-3, 7 } },
+ { 1, 1, { 18,-2, 2,-3,18,-2,17,-1, 4, 2 } }, { 0, 3, { 17,17, 2, 5, 3, 7,18, 6, 4, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,17, 4, 6, 6, 4, 5, 3, 4, 1 } }, { 0, 1, { 18, 5, 3, 6, 2, 3, 8, 1, 3, 7 } },
+ { 1, 2, { 18,17,-2, 2,18, 3, 5, 7,-1, 2 } }, { 0, 1, {  1,18,18, 3, 6,-1, 4, 8, 5, 2 } },
+ { 1, 1, {  1, 5, 3, 4, 1, 1, 3, 5, 7, 3 } }, { 0, 1, {  3,18,18, 2,18,18,-1, 2, 3,18 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 4, 6,18, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18, 3, 1, 4, 5, 2, 7, 1, 3, 6 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 5,-2, 6, 8 } }, { 1, 1, { 17,18, 4, 8, 3, 2, 5, 2, 7, 6 } },
+ { 1, 4, {  1, 2, 5,18,-2, 2, 3, 7,-1, 4 } }, { 0, 2, { 18,17,-1, 3, 6,18, 2, 3, 7, 5 } },
+ { 0, 1, { -2,18, 2,-3, 6,18, 4, 3,-2, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17,17, 6, 2, 4, 8, 3, 5,-1,17 } }, { 1, 1, { 18, 3,18, 6, 8,18,-2, 5, 7, 2 } },
+ { 1, 2, { 17,17,-3, 2,18,-2, 8, 3, 6,-1 } }, { 1, 1, { 18,-2,17,18, 2, 3,-2, 6, 5, 4 } },
+ { 1, 2, { 18,17,-1, 3,18, 2, 5, 3, 6,-3 } }, { 0, 1, { 18,17, 2,18, 7,18, 2, 4, 3,17 } },
+ { 1, 3, { 18,18, 5, 6, 4, 3, 4,18, 6, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, {  7, 6, 5, 3, 4, 2, 5, 4, 3, 7 } }, { 0, 1, { -2,18,18,18, 3, 6, 4, 2, 5, 2 } },
+ { 0, 3, { 18,17,-3,18, 3, 2, 5,-1,17, 3 } }, { 1, 1, { 17,18, 7, 3, 1, 7, 4, 2, 6, 5 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 3, { 18,18,-1, 3, 2, 7, 5,18, 4, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18, 2,-2, 4, 8,18, 3, 6, 5 } }, { 0, 2, { 18,17, 3, 5,-2, 7, 2,18, 3,-1 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 2, {  3,17,18,18, 2, 5, 7, 6,18, 3 } },
+ { 1, 1, { 17,18,18, 4, 3, 2,18, 7, 8,-1 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17, 1, 2, 3, 5, 6, 1, 4, 8,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,17,-1,18,-3, 2, 8, 3, 6,17 } }, { 1, 1, { 17,17, 1, 2, 4, 5,-1, 2, 1, 6 } },
+ { 1, 1, {  1, 2, 6,-2,18, 2,-3, 3,-2, 5 } }, { 0, 1, { 18, 3,18, 6,18, 5, 2, 4,-1, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18,-1, 2,18, 3, 6, 4,-2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { -1,18,18,18, 2,-2, 4, 7, 2, 3 } }, { 0, 3, {  3,17,-2, 5, 2, 7,18, 6, 4, 5 } },
+ { 0, 1, { 17, 6,18, 3, 8, 4, 5, 3, 8,18 } }, { 0, 2, { 18, 2, 6, 2,18, 3, 2, 4, 5, 8 } },
+ { 0, 1, {  3,18,18, 2,18,-1, 2,18, 2,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  3, 6,17,-2, 5, 1, 2, 7, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 3, {  3,18,17, 5, 6, 2, 7,-2, 8,18 } }, { 1, 1, { 18,-1, 3, 1, 7, 2,-1, 4, 6,17 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 2, { 18, 1, 2,18, 3, 6, 5, 2, 4, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,-2, 2,18,18, 8, 5, 3, 2, 6 } }, { 0, 1, { 18,17, 2,18, 3, 2, 7,-2,18, 4 } },
+ { 1, 2, {  1,18, 2, 3,-1, 5, 6, 4, 7,17 } }, { 0, 2, { 18,17, 3, 6,-2, 2, 3, 8, 5,17 } },
+ { 0, 2, { 18,18, 3, 2,18,-1, 2, 4, 3,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 17,-1,18, 2, 3,-2, 5,18, 2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,-3,18, 2, 3,-2,18, 5, 6,-3 } }, { 0, 2, { 18,17, 3, 5,-2, 7, 2,18, 3,-1 } },
+ { 1, 1, {  1,18,-1, 2, 3, 1,-2, 8, 2, 5 } }, { 0, 1, { 18,18, 3, 6,18, 2, 3, 4, 8, 5 } },
+ { 0, 1, { -2, 1,18, 2,-2, 5, 7,18, 2,-1 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18,-1, 2, 8, 3, 4, 5, 1, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18,-1, 2,18, 3,-2, 5, 4, 2 } }, { 1, 1, { 18,17, 2,18, 3, 8, 5, 2, 7,17 } },
+ { 0, 1, { 18,18, 3,18, 6, 8,-2, 2, 3, 5 } }, { 0, 1, { 18,18, 2,18, 2, 6,18, 2,17, 7 } },
+ { 1, 3, { 18,17,18, 2, 8,18, 5,-1, 3, 6 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,17,-1, 3, 6,18, 2, 5, 8, 3 } }, { 0, 1, { 17,18,18, 4, 7, 2, 3,-2,18, 5 } },
+ { 1, 2, { 18, 1, 2, 6, 2, 5,18, 2, 4, 8 } }, { 0, 4, { 18, 4, 1, 2, 3, 5, 4, 1, 2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 2, { 18,17, 2,-1,18, 3,-3, 5, 2, 4 } },
+ { 0, 1, { 17,17, 3, 6, 3, 5,-2, 2,18,-1 } }, { 0, 2, { 18,18, 3,-2,18, 2,-3, 5, 3, 6 } },
+ { 1, 1, { 17,17, 2, 4, 1, 3, 5, 2, 6,-3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17, 1, 3, 2, 7, 1, 6, 3, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17,-1,18, 2, 1, 5, 3, 8,-1,-2 } }, { 1, 1, { 17,18,-1, 8, 2, 5, 3, 4, 1, 6 } },
+ { 1, 2, {  1,18, 3,-1, 5, 1, 2, 4, 7, 6 } }, { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  1,18,-1, 3, 8, 5, 6, 1, 2, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18, 2, 3, 6,18,-1, 4, 2, 3 } }, { 1, 1, {  1, 3, 5,18, 2, 6, 7, 2, 3, 1 } },
+ { 1, 1, {  1, 3, 8,18, 5, 2, 7, 1, 3,-2 } }, { 0, 2, { 17, 2,18, 3, 6, 2, 4, 5, 8, 3 } },
+ { 0, 1, { 18,17, 2,18, 3, 2, 7,-2,18, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,-3,18,-1, 3,-2, 5, 7, 1, 2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 3, { 18,18, 2, 6,18, 5,18, 2, 3,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 3, {  1,-1, 1, 3,-2, 2, 5, 7,-3,18 } }, { 1, 2, { 18, 7, 3,-3, 2, 8, 2, 5, 4,17 } },
+ { 1, 1, {  1, 4, 5, 1, 3, 4, 6, 7, 8, 3 } }, { 0, 1, { 18,17, 2,18,-1, 2, 3,18, 2, 4 } },
+ { 0, 2, { 18,18,-2,18, 2, 3, 4, 7, 5,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18, 2, 1, 3, 2, 5, 1, 2, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18,-1, 2, 3, 5, 8, 6, 1,-2 } }, { 0, 1, { 17,18, 8, 3, 4, 6, 5, 2, 8, 7 } },
+ { 1, 2, {  1, 3,-2,18, 2, 5, 1, 7,-1,-2 } }, { 0, 3, { 18,17,-1, 3,18, 2, 3, 6, 4,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,18, 4,18, 6, 7, 8, 3,18, 2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 17,-3,17, 2,-2, 8, 3,18, 4,-3 } }, { 1, 1, { 18,17, 3, 5, 6, 2, 8, 1, 3, 7 } },
+ { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } }, { 0, 3, { 18,18, 2, 6,18, 5,18, 2, 3,17 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 5, 1, 4, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, {  3,17,18,-3, 2, 5,18, 6,-1, 7 } }, { 1, 1, { 17,18, 3, 2, 5,-1, 6, 8, 4, 7 } },
+ { 1, 1, { 18, 1,-2, 3, 2, 1, 7, 6, 3, 4 } }, { 0, 3, {  1, 2,17, 3,18, 2, 7, 5, 4,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,-2, 2,18,18, 8, 5, 3, 2, 6 } }, { 0, 2, { 18, 5,18, 2, 3, 7,-2, 1, 6, 8 } },
+ { 0, 1, {  2,-1,18,-1, 2, 4,-3, 5,18, 3 } }, { 0, 1, {  3,17,18, 5, 2,18, 7, 3, 6, 5 } },
+ { 1, 4, {  1, 2, 5,18,-2, 2, 3, 7,-1, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  1,18, 2, 1, 3, 4, 1, 5, 2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { 17,17,18, 2, 4, 5,18,-2, 6, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 2, { 18,18,-1, 3, 5, 6, 8,18, 2, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 18,18, 4, 6, 8,18, 7, 3, 2, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { -1,18,18,18, 2, 4,-2, 2, 3, 6 } }, { 0, 2, { 18,-2, 7, 1, 3, 2, 4, 6,-3, 7 } },
+ { 1, 1, { 17,18, 8, 3, 4, 6,-2, 5, 3, 8 } }, { 0, 2, { 18, 1, 2, 6, 2, 8, 3,18, 5, 4 } },
+ { 1, 1, {  3,18,18, 2,18, 2,18, 3, 2,18 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, {  3,17,18, 5, 2, 6, 7, 1, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+};
+
+static const WavPackDecorrSpec very_high_specs[] = {
+ { 1, 2, { 18,18, 2, 3,-2,18, 2, 4, 7, 5, 3, 6, 8,-1,18, 2 } },
+ { 0, 1, { 18,18,-1,18, 2, 3, 4, 6, 5, 7,18,-3, 8, 2,-1, 3 } },
+ { 1, 2, {  1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 } },
+ { 0, 1, { 17,17, 2, 3, 4,18,-1, 5, 6, 7,18, 2, 8,17, 3,-2 } },
+ { 1, 1, { 18,18, 2,18, 3, 2,18, 4,-1, 3,18, 2, 6, 8,17, 5 } },
+ { 0, 2, { 18,17, 2, 3,-2, 5,18,-3, 2, 4, 7, 3, 6, 8, 5,17 } },
+ { 1, 1, { 18,-2, 2,-3,18, 5,-2,18, 2, 3, 6, 2,17, 4, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3,-2, 2, 5, 4,18, 6, 3, 8, 7, 2, 5, 4 } },
+ { 0, 2, { 18,17,-2, 2,18, 3, 2, 5,-3, 4, 7,18, 3, 8, 6, 2 } },
+ { 1, 1, {  3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 } },
+ { 1, 2, {  1,18, 3, 2,-2, 1, 5, 4, 6, 2, 7, 1, 8, 3,-1, 1 } },
+ { 0, 1, { 18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 } },
+ { 0, 1, { -2,18, 2,18, 7, 2, 6,-2, 3, 4,18,18, 2,-3, 8, 5 } },
+ { 0, 2, { 18,18,18, 2, 4, 3,18, 5, 3, 6,-2, 2, 4,18, 8, 7 } },
+ { 0, 1, { -2, 1,18, 2,-2,18,-1, 5, 7, 2, 3, 4,18, 2, 6, 2 } },
+ { 1, 1, { 17,18, 3, 2, 1, 7,-1, 2, 4, 3, 5, 6,-2,18, 7, 8 } },
+ { 1, 1, { 18,18, 2,18, 3, 4, 6,-2,18, 5, 8, 2, 3, 7, 4,-1 } },
+ { 0, 1, { 18,18,18,-1, 2, 3, 4, 6, 8,18, 3, 5, 2, 6, 7, 4 } },
+ { 1, 1, { 17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 } },
+ { 0, 1, { 17,17,18, 2, 3, 6,-2, 8, 1, 7, 5, 2, 3, 1, 4, 8 } },
+ { 1, 1, { 17,17, 3, 2, 7, 1, 4, 3, 6, 2, 5,-2, 8, 7,18, 6 } },
+ { 0, 1, { 18,17,-2, 2,18, 3,-3, 7, 6, 5, 2, 4,-1, 8, 3,17 } },
+ { 1, 1, {  2,18,18,-2, 2, 4,-1, 5,18, 3, 8, 6, 2, 7,17, 4 } },
+ { 0, 1, { 17, 3, 6, 8, 5, 4, 3, 8, 1,18, 7, 2, 4, 5, 6, 3 } },
+ { 1, 2, { 17,18, 4, 8, 3, 2, 5, 7, 6, 8, 2, 7,-2,18, 3, 4 } },
+ { 1, 1, {  6, 5, 5, 3, 4, 7, 3, 2, 4, 6, 3, 7, 1, 5, 2, 4 } },
+ { 1, 1, {  1,18,-1, 2, 1, 3, 8,-2, 2, 5, 6, 3, 8, 7,18, 4 } },
+ { 0, 1, {  1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 } },
+ { 0, 1, { 18, 2,18,18, 2,18, 6,-2,18, 7, 5, 4, 3, 2,18,-2 } },
+ { 0, 3, {  1, 4,18, 3, 2, 4, 1, 5, 2, 3, 6,18, 8, 7, 2, 4 } },
+ { 0, 1, { 17,-2, 1,-3, 2,18, 3,-2, 4,18, 3, 6, 7,-3, 2, 8 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 1, 2, { 18,-1,17,18, 2, 3,-2,18, 5, 8, 2, 4, 3, 7, 6,-1 } },
+ { 1, 1, { 18,18,18,-2, 4, 2, 3,18, 5, 8, 2, 4, 6, 7,-2, 3 } },
+ { 1, 2, { 18,18,-2,18,-1, 3, 2, 5,18,-2, 7, 2, 3, 4, 6, 8 } },
+ { 0, 1, { 17,18,-1, 2, 4,18, 8, 3, 6, 5, 7,-3, 2, 4, 3,17 } },
+ { 1, 1, { 18,18,17, 2,-1,18, 3, 2,18, 6, 5, 4,18, 7, 2,-1 } },
+ { 0, 2, {  1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 } },
+ { 1, 1, {  1,17,-2, 2,-3, 6, 3, 5, 1, 2, 7, 6, 8,-2, 4, 1 } },
+ { 0, 1, { 17,-1, 5, 1, 4, 3, 6, 2,-2,18, 3, 2, 4, 5, 8,-1 } },
+ { 0, 2, { 18,18,17, 2, 3,-2, 5,18, 2, 4, 7, 8, 6,17, 3, 5 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 1, 2, {  1,-1, 3, 2,18, 7,-2, 5, 2, 6, 4, 3,-1,18, 8, 7 } },
+ { 0, 2, { 18,17, 3,18, 2, 5, 4, 3, 6, 2, 7, 8,18, 3, 4, 5 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 2, { 18,18, 3,-3,18, 2, 6, 5, 3, 7,18, 4,-2, 8, 2, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 1, 1, {  3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 } },
+ { 0, 1, { 18,18,18, 2, 4,-1,18, 8,-1, 2, 3, 4, 6,-2, 1, 7 } },
+ { 1, 1, { 18,-2,17,18, 2, 6, 3,-2, 5, 4, 7, 1,-3, 8, 2, 6 } },
+ { 0, 1, { 17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 1, 5, 4, 3, 2, 5, 6, 1, 4, 5 } },
+ { 0, 1, { 18,18,-2,18, 2,-3, 3, 8, 5,18, 6, 4, 3,-1, 7, 2 } },
+ { 1, 1, { 18, 2,-2,-3,18, 5, 2, 3,-2, 4, 6, 1,-3, 2, 7, 8 } },
+ { 0, 1, { 18, 3, 5, 8, 2, 6, 7, 3, 1, 5, 2,-1, 8, 6, 7, 4 } },
+ { 1, 1, {  4, 3, 8, 1, 5, 6, 2, 5, 8,-2, 2, 7, 3,18, 5, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 3,18,18, 7, 2, 4,18, 6, 2, 3,-1, 8, 5,18,-3 } },
+ { 0, 1, {  3,17,18, 2,18, 6, 7,-3,18, 2, 5, 6, 3, 8, 7,-1 } },
+ { 1, 1, { 18,18, 2,18,18, 2,-1, 7, 3,18, 5, 2, 6, 4,-1,18 } },
+ { 0, 3, { 18, 3, 4, 1, 5, 2,18, 4, 2, 3,18, 7, 6, 1, 2, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1,18, 2, 3, 6, 4, 5, 7,18, 3, 8, 2, 4,-2,17 } },
+ { 1, 2, { 18,17, 2, 3, 5,18, 6,-2, 7, 3, 2, 4,18, 8,-1, 5 } },
+ { 0, 2, {  1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 } },
+ { 1, 1, {  1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 } },
+ { 0, 1, { 18,18, 2,18, 2,18, 7, 6,18, 2,-2, 3, 5, 4,18, 8 } },
+ { 1, 2, { 18,17, 2, 3,18,-1, 2, 3, 6,18, 5, 4, 3, 7, 2, 8 } },
+ { 1, 2, { 18,18, 3,-2, 4,18, 5, 7, 6, 2, 4,-3, 8, 5,18, 3 } },
+ { 1, 1, { 17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 } },
+ { 1, 1, {  3,17,18, 5, 7, 2, 4, 6, 1, 8,-1, 3, 7, 4, 1, 2 } },
+ { 0, 2, {  1,-2, 2,18, 3, 5, 2, 4, 7,-1, 2, 3, 5,18,-2, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1, 2,-2, 6,18,-3, 2, 7, 3,-2, 5, 6, 1, 8, 2, 4 } },
+ { 0, 1, { 18,18,18, 3,-2, 6,18, 2, 4, 3, 5, 8, 7, 6, 2,-2 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 0, 1, {  3,17,18, 2, 5,18, 6, 7, 5,-2, 2, 4,18, 3, 6, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 17,-1,18, 2, 4,-1, 8, 3,18, 7,-3, 4, 5, 1, 2,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 } },
+ { 1, 1, { 18,18, 3, 6, 4, 8,-2, 2, 5, 3, 7,18, 6, 8, 4, 2 } },
+ { 1, 1, { 17,18,18,-2, 5, 2, 3, 1, 4,-1, 8, 6, 5, 3, 2,18 } },
+ { 1, 1, { 17,17, 1, 2, 4, 5, 2, 6,-1, 3, 1, 1,-2, 4, 2, 7 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 0, 1, { 18,17,-2,-3, 1, 2, 3, 2, 5, 4, 7,-3, 6,-2, 2, 1 } },
+ { 1, 1, {  1, 3, 5,18, 1, 2, 7, 3, 6, 2, 5, 8,-1, 1, 4, 7 } },
+ { 1, 1, { 17, 3, 6, 8, 1, 4, 5, 3,-2, 7, 2, 8, 5, 6,18, 3 } },
+ { 1, 1, { 17,18, 2, 4, 8,-2, 3, 1, 5, 6, 7, 1, 2, 3, 4, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 1, 8,18, 5, 2, 3,18, 6, 7,-2, 4, 3, 2, 8,18 } },
+ { 0, 1, { 18,17, 2,18, 3, 4,-1,18, 7, 6, 2, 8, 4,18,18, 5 } },
+ { 0, 1, { 18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 } },
+ { 1, 2, { 18,17,18, 2, 3, 5,-2,18, 6,-1, 2, 3, 7, 4, 8,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 } },
+ { 1, 2, { 18,18,-2,17, 2,18, 3, 4,18, 8, 7,-1, 2, 4, 5,17 } },
+ { 0, 2, { 17,-3,17, 3, 2,-2,18, 8, 4,-3, 2,18, 5, 3,-2, 6 } },
+ { 0, 1, { 18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 } },
+ { 0, 2, {  1,18,-1, 3, 5, 2,-3,18, 7, 3,-1, 6, 4, 2,17, 5 } },
+ { 1, 1, { 17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 } },
+ { 1, 1, {  1,18, 1, 3, 5, 8, 6, 2, 3,-1, 7, 1, 4, 8, 5,-3 } },
+ { 0, 2, {  3,18,18, 2,18,-2, 6, 5, 7, 2, 4,18, 3, 6,-3, 5 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 4, { 18, 2,17, 3,18,-2, 2, 6,18, 2, 7, 3, 5, 4, 8,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 2, 5, 3,-2, 1, 4, 3, 7, 6,-3, 2, 1, 1, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18,18,-2,18,-2, 2, 3, 6,18, 4,-1, 2, 3, 8, 1, 4 } },
+ { 1, 1, { 17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 } },
+ { 0, 1, { 17,17,18, 3, 2,18,18, 6, 8, 2,-2, 3, 5, 4,17,18 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 1, 1, {  1, 3,-3,18,18, 6, 5,18, 2,-1, 3, 8, 7,-3, 4,17 } },
+ { 1, 1, { 18, 1, 2, 1, 3, 8, 7, 4, 1, 5, 2,-1,-3,18, 6, 2 } },
+ { 0, 1, { 18, 3, 5, 2, 6, 8,18, 5, 7, 2, 3,-1, 6, 7, 8, 5 } },
+ { 0, 2, { 18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 3, {  1, 1, 2, 5, 2, 7, 4, 3,-1,18,-2, 8, 2, 1, 6, 7 } },
+ { 0, 1, {  3,17,18, 5, 2, 6, 7,18, 4, 5, 3, 6,18, 2, 7, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, {  1,18, 1, 2, 3, 5, 1, 2, 6, 7, 4, 3, 8, 1,17, 5 } },
+ { 1, 2, { 17,-1,18,-2, 2, 3, 5,18, 2, 4, 6, 7, 3,-1, 5, 8 } },
+ { 1, 1, { 18,18,-3,18,-2, 2, 3,-2,18, 6, 4, 5, 8, 3,17,-3 } },
+ { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 7, 3, 4,-3, 6,18, 8 } },
+ { 0, 2, { 18,18, 2, 3, 5,18, 2, 4, 3, 6,18, 7, 8,-1, 5, 2 } },
+ { 0, 1, { 18,17,-1, 2,18, 3, 2,18, 4, 3,18, 2, 6, 5, 8,17 } },
+ { 0, 2, { 18,17, 2, 3,18, 5,-1, 6, 7, 8, 2, 3, 4, 5,18, 6 } },
+ { 1, 2, { 18,-3,18, 2, 3,-2,-3, 5,18, 7, 6, 2, 4, 3, 8,-2 } },
+ { 1, 1, { 17,18,18,-2, 2, 3, 5, 4, 8,18,-1, 5, 3, 6,-2, 7 } },
+ { 1, 2, { 18,17, 2,-2,18, 3,-1, 4,18, 2, 7, 5, 3, 8, 6, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 0, 2, { 18,18, 3, 3,-2, 2, 5,18, 6, 3,-1, 4, 7,-1, 1, 2 } },
+ { 0, 1, { -2, 1,18, 2,-2, 5, 7,18, 3, 2, 6, 2,-1, 4,-2,17 } },
+ { 0, 2, { 18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17,18,-1, 3, 2, 5, 1, 3, 2, 8, 4, 7, 6, 2,-1, 5 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 0, 1, { 18,18,-2,18, 2, 3, 4, 5, 6,18, 8, 2, 3, 7,-2, 4 } },
+ { 0, 1, { 18,-2,18,18,-3,-2, 2, 3, 5, 8, 1, 2, 6, 4, 7,-1 } },
+ { 0, 1, { 18,17, 2,18, 3,-2, 2, 7, 6, 4,18, 3, 8, 7, 4, 2 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 1, 1, { 18,17,18, 2, 5, 3,-2,18, 6, 2, 3, 4, 8, 7, 5,-1 } },
+ { 0, 1, {  2,-1,18,-1, 2, 4,-3,18, 5, 3, 6,18, 2, 4, 7, 8 } },
+ { 1, 1, { 17,18, 8, 3, 6, 4,-1, 5, 2, 7, 3, 8, 6, 5,18, 4 } },
+ { 0, 2, { 18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 } },
+ { 1, 2, { 18,-1,18, 3,-2,18, 2, 5, 3, 6, 7, 2,-1,18, 8, 4 } },
+ { 1, 2, {  1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 } },
+ { 1, 2, {  1,18,-3, 2, 3,18,-1, 5, 6, 2, 8, 3, 4, 1,-2, 7 } },
+ { 0, 1, {  1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 } },
+ { 1, 1, { 18,17,18, 4, 3, 5, 1, 2, 6, 3, 4, 7, 1, 8, 5, 2 } },
+ { 0, 1, { 18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18,-2, 2, 5, 3, 7,18, 2, 4,-3, 5, 6, 3, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 3, {  3,18,-1, 5, 2, 7,18, 6, 5, 2, 4, 3,-1, 7,18, 6 } },
+ { 0, 2, { 18,18,18, 4, 3, 2, 6, 4, 8,18, 5, 3, 2, 7,-2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18,18, 2, 4, 6,-2, 2, 8, 3, 4,18, 7,-1, 6 } },
+ { 0, 1, { 18, 1,-2, 2, 4, 1, 3,-1, 2, 5, 7, 1, 6, 8,-2,17 } },
+ { 0, 1, { 17,17,18, 2, 5, 4,18, 3, 8, 7, 4, 6, 8, 1, 5, 2 } },
+ { 1, 2, { 18,18, 5, 4, 6, 3, 4,18, 8, 4,-1, 7, 5, 3, 6, 2 } },
+ { 0, 1, { 18,18,-3,18, 3, 6, 2, 5, 7,18, 3, 8,-1, 4, 5, 2 } },
+ { 1, 1, { 18, 2,-2,-3,18, 5, 2,-2, 4, 3, 6,18, 8,-1, 2, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 17,18, 3,18, 2, 5, 4, 7,-3, 6, 3, 2,18, 4, 7, 3 } },
+ { 1, 1, {  1, 7, 4, 5, 3, 4, 5, 1, 3, 6, 3, 2, 4, 8,-2, 7 } },
+ { 0, 1, {  1,18,-1,-2,18, 3, 2,-1, 6, 7, 4, 5, 3,18, 2,-3 } },
+ { 1, 1, { 18,18,-1, 3, 6,18, 5, 4, 8, 2, 3, 6,18, 7, 4,-2 } },
+ { 0, 2, { 18,18, 2, 6,18, 2,18, 5, 3,18, 2, 4, 7, 8, 3,18 } },
+ { 1, 1, {  3,18,18, 5,18, 6, 2, 4, 7,-2,18, 5, 8, 6, 3, 2 } },
+ { 0, 1, { 18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 } },
+ { 1, 1, { 18,-2,18, 2, 5,18, 3,-2, 4, 7, 2,-1, 8, 6, 5, 1 } },
+ { 1, 1, { 17,17, 5,18, 4, 1, 2, 8, 6, 4,-2, 3, 5,-1, 1, 8 } },
+ { 0, 2, {  1, 2,17, 3, 7,18, 2,-1, 4, 5,18, 2, 7, 3, 6, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18, 2,-2, 3, 6, 4, 8,18, 2, 5, 7, 4, 3, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18, 1, 8, 3, 5, 6, 4,-1, 8, 3, 7,18, 2, 5, 8, 4 } },
+ { 1, 1, { 17,18, 5, 2, 4, 3, 1, 6,-2, 1, 3, 2, 4, 5,-1,17 } },
+ { 1, 1, { 18,17, 2,18, 3,-3, 7, 2, 6, 4, 3, 5,18, 8, 2,-2 } },
+ { 1, 1, { 18,17,18, 4, 3, 5,-1,18, 2, 7, 8, 4, 6, 3,18, 5 } },
+ { 0, 1, { 18,17,18,-2, 2,-3, 3, 4, 8, 5, 2,18, 6, 3, 7,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17,18, 8, 3, 4, 6,18, 5,-2, 3, 8, 5, 2, 4, 7, 6 } },
+ { 0, 1, { 18,-2, 3, 5, 1, 7, 3, 2, 6,-3, 4, 1, 5, 8, 3,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3,17,18, 5,-1,18, 2, 6, 7,18, 5, 3,-3,-1, 6, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 3, { 18,17,-2, 3,-1,18, 2, 5, 3, 7, 6, 2, 4, 8,18, 5 } },
+ { 0, 1, { 18,-1,18, 2,18, 3, 5,18, 2, 8,18, 5, 4,-1, 6, 2 } },
+ { 1, 2, { 18,-2,18,18, 2, 3, 4,-3, 2, 5,18, 7, 4, 3, 8, 6 } },
+ { 0, 2, { 17,-1,18, 2,-1, 1, 7, 3, 8, 5,-2, 4, 1, 2,-3, 6 } },
+ { 0, 1, { 18,17, 2,18, 2,18, 6, 7, 4, 3,18, 5, 2,-2,17, 8 } },
+ { 0, 3, { 18,17, 2, 3,-3,-1,18, 2, 4, 5,18, 7, 3, 2,-3, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, {  3,18,18,18, 2, 6, 5,18, 7, 2, 4, 6,18, 5, 3, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18, 3, 6, 3,-2, 2,18, 5,-1, 7, 3, 4,-2, 2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18,17,18,18,-2, 2, 3,-3,18, 6, 4, 2,-2, 8, 3, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18,18, 4, 2, 7, 8,18, 3, 2,-2, 4, 7, 6,17, 5 } },
+ { 1, 1, { 18,18,-1,-2, 8, 3,18, 6, 3, 5, 8, 2, 4, 7, 1, 6 } },
+ { 1, 1, {  1,-3, 3,18,18, 2,-1, 3, 6, 5,18, 4, 7,-2, 8, 3 } },
+ { 1, 1, {  1,18, 4, 2, 5,18, 1, 3,-1, 6, 1, 4, 8, 2, 5, 1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+};
+
+static const WavPackDecorrSpec * const decorr_filters[] = {
+    &fast_specs[0], &default_specs[0], &high_specs[0], &very_high_specs[0],
+};
+
+static const uint16_t decorr_filter_sizes[] = {
+    FF_ARRAY_ELEMS(fast_specs),
+    FF_ARRAY_ELEMS(default_specs),
+    FF_ARRAY_ELEMS(high_specs),
+    FF_ARRAY_ELEMS(very_high_specs),
+};
+
+static const uint8_t decorr_filter_nterms[] = { 2, 5, 10, 16 };
+
+static const int8_t nbits_table[] = {
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
+};
+
+#endif /* AVCODEC_WAVPACKENC_H */
diff --git a/libavcodec/wcmv.c b/libavcodec/wcmv.c
new file mode 100644
index 0000000..2988c15
--- /dev/null
+++ b/libavcodec/wcmv.c
@@ -0,0 +1,266 @@
+/*
+ * WinCAM Motion Video decoder
+ *
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/imgutils.h"
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+#include <zlib.h>
+
+typedef struct WCMVContext {
+    int         bpp;
+    z_stream    zstream;
+    AVFrame    *prev_frame;
+    uint8_t     block_data[65536*8];
+} WCMVContext;
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    WCMVContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    int skip, blocks, zret, ret, intra = 0, bpp = s->bpp;
+    GetByteContext gb;
+    uint8_t *dst;
+
+    ret = inflateReset(&s->zstream);
+    if (ret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", ret);
+        return AVERROR_EXTERNAL;
+    }
+
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
+    blocks = bytestream2_get_le16(&gb);
+    if (!blocks)
+        return avpkt->size;
+
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+
+    if (blocks > 5) {
+        GetByteContext bgb;
+        int x = 0, size;
+
+        if (blocks * 8 >= 0xFFFF)
+            size = bytestream2_get_le24(&gb);
+        else if (blocks * 8 >= 0xFF)
+            size = bytestream2_get_le16(&gb);
+        else
+            size = bytestream2_get_byte(&gb);
+
+        skip = bytestream2_tell(&gb);
+        if (size > avpkt->size - skip)
+            return AVERROR_INVALIDDATA;
+
+        s->zstream.next_in  = avpkt->data + skip;
+        s->zstream.avail_in = size;
+        s->zstream.next_out  = s->block_data;
+        s->zstream.avail_out = sizeof(s->block_data);
+
+        zret = inflate(&s->zstream, Z_FINISH);
+        if (zret != Z_STREAM_END) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Inflate failed with return code: %d.\n", zret);
+            return AVERROR_INVALIDDATA;
+        }
+
+        ret = inflateReset(&s->zstream);
+        if (ret != Z_OK) {
+            av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", ret);
+            return AVERROR_EXTERNAL;
+        }
+
+        bytestream2_skip(&gb, size);
+        bytestream2_init(&bgb, s->block_data, blocks * 8);
+
+        for (int i = 0; i < blocks; i++) {
+            int w, h;
+
+            bytestream2_skip(&bgb, 4);
+            w = bytestream2_get_le16(&bgb);
+            h = bytestream2_get_le16(&bgb);
+            if (x + bpp * (int64_t)w * h > INT_MAX)
+                return AVERROR_INVALIDDATA;
+            x += bpp * w * h;
+        }
+
+        if (x >= 0xFFFF)
+            bytestream2_skip(&gb, 3);
+        else if (x >= 0xFF)
+            bytestream2_skip(&gb, 2);
+        else
+            bytestream2_skip(&gb, 1);
+
+        skip = bytestream2_tell(&gb);
+
+        s->zstream.next_in  = avpkt->data + skip;
+        s->zstream.avail_in = avpkt->size - skip;
+
+        bytestream2_init(&gb, s->block_data, blocks * 8);
+    } else if (blocks) {
+        int x = 0;
+
+        bytestream2_seek(&gb, 2, SEEK_SET);
+
+        for (int i = 0; i < blocks; i++) {
+            int w, h;
+
+            bytestream2_skip(&gb, 4);
+            w = bytestream2_get_le16(&gb);
+            h = bytestream2_get_le16(&gb);
+            if (x + bpp * (int64_t)w * h > INT_MAX)
+                return AVERROR_INVALIDDATA;
+            x += bpp * w * h;
+        }
+
+        if (x >= 0xFFFF)
+            bytestream2_skip(&gb, 3);
+        else if (x >= 0xFF)
+            bytestream2_skip(&gb, 2);
+        else
+            bytestream2_skip(&gb, 1);
+
+        skip = bytestream2_tell(&gb);
+
+        s->zstream.next_in  = avpkt->data + skip;
+        s->zstream.avail_in = avpkt->size - skip;
+
+        bytestream2_seek(&gb, 2, SEEK_SET);
+    }
+
+    if (s->prev_frame->data[0]) {
+        ret = av_frame_copy(frame, s->prev_frame);
+        if (ret < 0)
+            return ret;
+    } else {
+        ptrdiff_t linesize[4] = { frame->linesize[0], 0, 0, 0 };
+        av_image_fill_black(frame->data, linesize, avctx->pix_fmt, 0,
+                            avctx->width, avctx->height);
+    }
+
+    for (int block = 0; block < blocks; block++) {
+        int x, y, w, h;
+
+        x = bytestream2_get_le16(&gb);
+        y = bytestream2_get_le16(&gb);
+        w = bytestream2_get_le16(&gb);
+        h = bytestream2_get_le16(&gb);
+
+        if (blocks == 1 && x == 0 && y == 0 && w == avctx->width && h == avctx->height)
+            intra = 1;
+
+        if (x + w > avctx->width || y + h > avctx->height)
+            return AVERROR_INVALIDDATA;
+
+        if (w > avctx->width || h > avctx->height)
+            return AVERROR_INVALIDDATA;
+
+        dst = frame->data[0] + (avctx->height - y - 1) * frame->linesize[0] + x * bpp;
+        for (int i = 0; i < h; i++) {
+            s->zstream.next_out  = dst;
+            s->zstream.avail_out = w * bpp;
+
+            zret = inflate(&s->zstream, Z_SYNC_FLUSH);
+            if (zret != Z_OK && zret != Z_STREAM_END) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Inflate failed with return code: %d.\n", zret);
+                return AVERROR_INVALIDDATA;
+            }
+
+            dst -= frame->linesize[0];
+        }
+    }
+
+    frame->key_frame = intra;
+    frame->pict_type = intra ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    av_frame_unref(s->prev_frame);
+    if ((ret = av_frame_ref(s->prev_frame, frame)) < 0)
+        return ret;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    WCMVContext *s = avctx->priv_data;
+    int zret;
+
+    switch (avctx->bits_per_coded_sample) {
+    case 16: avctx->pix_fmt = AV_PIX_FMT_RGB565LE; break;
+    case 24: avctx->pix_fmt = AV_PIX_FMT_BGR24;  break;
+    case 32: avctx->pix_fmt = AV_PIX_FMT_BGRA;   break;
+    default: av_log(avctx, AV_LOG_ERROR, "Unsupported bits_per_coded_sample: %d\n",
+                    avctx->bits_per_coded_sample);
+             return AVERROR_PATCHWELCOME;
+    }
+
+    s->bpp = avctx->bits_per_coded_sample >> 3;
+
+    s->zstream.zalloc = Z_NULL;
+    s->zstream.zfree = Z_NULL;
+    s->zstream.opaque = Z_NULL;
+    zret = inflateInit(&s->zstream);
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate init error: %d\n", zret);
+        return AVERROR_EXTERNAL;
+    }
+
+    s->prev_frame = av_frame_alloc();
+    if (!s->prev_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    WCMVContext *s = avctx->priv_data;
+
+    av_frame_free(&s->prev_frame);
+    inflateEnd(&s->zstream);
+
+    return 0;
+}
+
+AVCodec ff_wcmv_decoder = {
+    .name             = "wcmv",
+    .long_name        = NULL_IF_CONFIG_SMALL("WinCAM Motion Video"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_WCMV,
+    .priv_data_size   = sizeof(WCMVContext),
+    .init             = decode_init,
+    .close            = decode_close,
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1,
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/webp.c b/libavcodec/webp.c
index 0e769c3..077bb06 100644
--- a/libavcodec/webp.c
+++ b/libavcodec/webp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2013 Aneesh Dogra <aneesh@sugarlabs.org>
  * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,18 +31,22 @@
  * Lossless decoder
  * Compressed alpha for lossy
  *
+ * @author James Almer <jamrial@gmail.com>
+ * Exif metadata
+ * ICC profile
+ *
  * Unimplemented:
  *   - Animation
- *   - ICC profile
- *   - Exif and XMP metadata
+ *   - XMP metadata
  */
 
 #include "libavutil/imgutils.h"
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "exif.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "thread.h"
 #include "vp8.h"
@@ -183,7 +187,7 @@ typedef struct ImageContext {
 
 typedef struct WebPContext {
     VP8Context v;                       /* VP8 Context used for lossy decoding */
-    BitstreamContext bc;                /* bitstream reader for main image chunk */
+    GetBitContext gb;                   /* bitstream reader for main image chunk */
     AVFrame *alpha_frame;               /* AVFrame for alpha data decompressed from VP8L */
     AVCodecContext *avctx;              /* parent AVCodecContext */
     int initialized;                    /* set once the VP8 context is initialized */
@@ -192,6 +196,8 @@ typedef struct WebPContext {
     enum AlphaFilter alpha_filter;      /* filtering method for alpha chunk */
     uint8_t *alpha_data;                /* alpha chunk data */
     int alpha_data_size;                /* alpha chunk data size */
+    int has_exif;                       /* set after an EXIF chunk has been processed */
+    int has_iccp;                       /* set after an ICCP chunk has been processed */
     int width;                          /* image width */
     int height;                         /* image height */
     int lossless;                       /* indicates lossless or lossy */
@@ -232,41 +238,47 @@ static void image_ctx_free(ImageContext *img)
  *   - assumes 8-bit table to make reversal simpler
  *   - assumes max depth of 2 since the max code length for WebP is 15
  */
-static av_always_inline int webp_get_vlc(BitstreamContext *bc, VLC_TYPE (*table)[2])
+static av_always_inline int webp_get_vlc(GetBitContext *gb, VLC_TYPE (*table)[2])
 {
     int n, nb_bits;
     unsigned int index;
     int code;
 
-    index = bitstream_peek(bc, 8);
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+
+    index = SHOW_UBITS(re, gb, 8);
     index = ff_reverse[index];
     code  = table[index][0];
     n     = table[index][1];
 
     if (n < 0) {
-        bitstream_skip(bc, 8);
+        LAST_SKIP_BITS(re, gb, 8);
+        UPDATE_CACHE(re, gb);
 
         nb_bits = -n;
 
-        index = bitstream_peek(bc, nb_bits);
+        index = SHOW_UBITS(re, gb, nb_bits);
         index = (ff_reverse[index] >> (8 - nb_bits)) + code;
         code  = table[index][0];
         n     = table[index][1];
     }
-    bitstream_skip(bc, n);
+    SKIP_BITS(re, gb, n);
+
+    CLOSE_READER(re, gb);
 
     return code;
 }
 
-static int huff_reader_get_symbol(HuffReader *r, BitstreamContext *bc)
+static int huff_reader_get_symbol(HuffReader *r, GetBitContext *gb)
 {
     if (r->simple) {
         if (r->nb_symbols == 1)
             return r->simple_symbols[0];
         else
-            return r->simple_symbols[bitstream_read_bit(bc)];
+            return r->simple_symbols[get_bits1(gb)];
     } else
-        return webp_get_vlc(bc, r->vlc.table);
+        return webp_get_vlc(gb, r->vlc.table);
 }
 
 static int huff_reader_build_canonical(HuffReader *r, int *code_lengths,
@@ -298,7 +310,7 @@ static int huff_reader_build_canonical(HuffReader *r, int *code_lengths,
     if (max_code_length == 0 || max_code_length > MAX_HUFFMAN_CODE_LENGTH)
         return AVERROR(EINVAL);
 
-    codes = av_malloc(alphabet_size * sizeof(*codes));
+    codes = av_malloc_array(alphabet_size, sizeof(*codes));
     if (!codes)
         return AVERROR(ENOMEM);
 
@@ -333,15 +345,15 @@ static int huff_reader_build_canonical(HuffReader *r, int *code_lengths,
 
 static void read_huffman_code_simple(WebPContext *s, HuffReader *hc)
 {
-    hc->nb_symbols = bitstream_read_bit(&s->bc) + 1;
+    hc->nb_symbols = get_bits1(&s->gb) + 1;
 
-    if (bitstream_read_bit(&s->bc))
-        hc->simple_symbols[0] = bitstream_read(&s->bc, 8);
+    if (get_bits1(&s->gb))
+        hc->simple_symbols[0] = get_bits(&s->gb, 8);
     else
-        hc->simple_symbols[0] = bitstream_read_bit(&s->bc);
+        hc->simple_symbols[0] = get_bits1(&s->gb);
 
     if (hc->nb_symbols == 2)
-        hc->simple_symbols[1] = bitstream_read(&s->bc, 8);
+        hc->simple_symbols[1] = get_bits(&s->gb, 8);
 
     hc->simple = 1;
 }
@@ -353,13 +365,13 @@ static int read_huffman_code_normal(WebPContext *s, HuffReader *hc,
     int *code_lengths = NULL;
     int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 };
     int i, symbol, max_symbol, prev_code_len, ret;
-    int num_codes = 4 + bitstream_read(&s->bc, 4);
+    int num_codes = 4 + get_bits(&s->gb, 4);
 
     if (num_codes > NUM_CODE_LENGTH_CODES)
         return AVERROR_INVALIDDATA;
 
     for (i = 0; i < num_codes; i++)
-        code_length_code_lengths[code_length_code_order[i]] = bitstream_read(&s->bc, 3);
+        code_length_code_lengths[code_length_code_order[i]] = get_bits(&s->gb, 3);
 
     ret = huff_reader_build_canonical(&code_len_hc, code_length_code_lengths,
                                       NUM_CODE_LENGTH_CODES);
@@ -372,9 +384,9 @@ static int read_huffman_code_normal(WebPContext *s, HuffReader *hc,
         goto finish;
     }
 
-    if (bitstream_read_bit(&s->bc)) {
-        int bits   = 2 + 2 * bitstream_read(&s->bc, 3);
-        max_symbol = 2 + bitstream_read(&s->bc, bits);
+    if (get_bits1(&s->gb)) {
+        int bits   = 2 + 2 * get_bits(&s->gb, 3);
+        max_symbol = 2 + get_bits(&s->gb, bits);
         if (max_symbol > alphabet_size) {
             av_log(s->avctx, AV_LOG_ERROR, "max symbol %d > alphabet size %d\n",
                    max_symbol, alphabet_size);
@@ -392,7 +404,7 @@ static int read_huffman_code_normal(WebPContext *s, HuffReader *hc,
 
         if (!max_symbol--)
             break;
-        code_len = huff_reader_get_symbol(&code_len_hc, &s->bc);
+        code_len = huff_reader_get_symbol(&code_len_hc, &s->gb);
         if (code_len < 16) {
             /* Code length code [0..15] indicates literal code lengths. */
             code_lengths[symbol++] = code_len;
@@ -405,18 +417,18 @@ static int read_huffman_code_normal(WebPContext *s, HuffReader *hc,
                 /* Code 16 repeats the previous non-zero value [3..6] times,
                  * i.e., 3 + ReadBits(2) times. If code 16 is used before a
                  * non-zero value has been emitted, a value of 8 is repeated. */
-                repeat = 3 + bitstream_read(&s->bc, 2);
+                repeat = 3 + get_bits(&s->gb, 2);
                 length = prev_code_len;
                 break;
             case 17:
                 /* Code 17 emits a streak of zeros [3..10], i.e.,
                  * 3 + ReadBits(3) times. */
-                repeat = 3 + bitstream_read(&s->bc, 3);
+                repeat = 3 + get_bits(&s->gb, 3);
                 break;
             case 18:
                 /* Code 18 emits a streak of zeros of length [11..138], i.e.,
                  * 11 + ReadBits(7) times. */
-                repeat = 11 + bitstream_read(&s->bc, 7);
+                repeat = 11 + get_bits(&s->gb, 7);
                 break;
             }
             if (symbol + repeat > alphabet_size) {
@@ -443,7 +455,7 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
                                       int w, int h);
 
 #define PARSE_BLOCK_SIZE(w, h) do {                                         \
-    block_bits = bitstream_read(&s->bc, 3) + 2;                                   \
+    block_bits = get_bits(&s->gb, 3) + 2;                                   \
     blocks_w   = FFALIGN((w), 1 << block_bits) >> block_bits;               \
     blocks_h   = FFALIGN((h), 1 << block_bits) >> block_bits;               \
 } while (0)
@@ -520,7 +532,7 @@ static int parse_transform_color_indexing(WebPContext *s)
     int width_bits, index_size, ret, x;
     uint8_t *ct;
 
-    index_size = bitstream_read(&s->bc, 8) + 1;
+    index_size = get_bits(&s->gb, 8) + 1;
 
     if (index_size <= 2)
         width_bits = 3;
@@ -600,8 +612,8 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
     if (ret < 0)
         return ret;
 
-    if (bitstream_read_bit(&s->bc)) {
-        img->color_cache_bits = bitstream_read(&s->bc, 4);
+    if (get_bits1(&s->gb)) {
+        img->color_cache_bits = get_bits(&s->gb, 4);
         if (img->color_cache_bits < 1 || img->color_cache_bits > 11) {
             av_log(s->avctx, AV_LOG_ERROR, "invalid color cache bits: %d\n",
                    img->color_cache_bits);
@@ -616,7 +628,7 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
     }
 
     img->nb_huffman_groups = 1;
-    if (role == IMAGE_ROLE_ARGB && bitstream_read_bit(&s->bc)) {
+    if (role == IMAGE_ROLE_ARGB && get_bits1(&s->gb)) {
         ret = decode_entropy_image(s);
         if (ret < 0)
             return ret;
@@ -635,7 +647,7 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
             if (!j && img->color_cache_bits > 0)
                 alphabet_size += 1 << img->color_cache_bits;
 
-            if (bitstream_read_bit(&s->bc)) {
+            if (get_bits1(&s->gb)) {
                 read_huffman_code_simple(s, &hg[j]);
             } else {
                 ret = read_huffman_code_normal(s, &hg[j], alphabet_size);
@@ -654,14 +666,14 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
         int v;
 
         hg = get_huffman_group(s, img, x, y);
-        v = huff_reader_get_symbol(&hg[HUFF_IDX_GREEN], &s->bc);
+        v = huff_reader_get_symbol(&hg[HUFF_IDX_GREEN], &s->gb);
         if (v < NUM_LITERAL_CODES) {
             /* literal pixel values */
             uint8_t *p = GET_PIXEL(img->frame, x, y);
             p[2] = v;
-            p[1] = huff_reader_get_symbol(&hg[HUFF_IDX_RED],   &s->bc);
-            p[3] = huff_reader_get_symbol(&hg[HUFF_IDX_BLUE],  &s->bc);
-            p[0] = huff_reader_get_symbol(&hg[HUFF_IDX_ALPHA], &s->bc);
+            p[1] = huff_reader_get_symbol(&hg[HUFF_IDX_RED],   &s->gb);
+            p[3] = huff_reader_get_symbol(&hg[HUFF_IDX_BLUE],  &s->gb);
+            p[0] = huff_reader_get_symbol(&hg[HUFF_IDX_ALPHA], &s->gb);
             if (img->color_cache_bits)
                 color_cache_put(img, AV_RB32(p));
             x++;
@@ -680,10 +692,10 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
             } else {
                 int extra_bits = (prefix_code - 2) >> 1;
                 int offset     = 2 + (prefix_code & 1) << extra_bits;
-                length = offset + bitstream_read(&s->bc, extra_bits) + 1;
+                length = offset + get_bits(&s->gb, extra_bits) + 1;
             }
-            prefix_code = huff_reader_get_symbol(&hg[HUFF_IDX_DIST], &s->bc);
-            if (prefix_code > 39) {
+            prefix_code = huff_reader_get_symbol(&hg[HUFF_IDX_DIST], &s->gb);
+            if (prefix_code > 39U) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "distance prefix code too large: %d\n", prefix_code);
                 return AVERROR_INVALIDDATA;
@@ -693,7 +705,7 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
             } else {
                 int extra_bits = prefix_code - 2 >> 1;
                 int offset     = 2 + (prefix_code & 1) << extra_bits;
-                distance = offset + bitstream_read(&s->bc, extra_bits) + 1;
+                distance = offset + get_bits(&s->gb, extra_bits) + 1;
             }
 
             /* find reference location */
@@ -1022,32 +1034,32 @@ static int apply_color_indexing_transform(WebPContext *s)
     ImageContext *img;
     ImageContext *pal;
     int i, x, y;
-    uint8_t *p, *pi;
+    uint8_t *p;
 
     img = &s->image[IMAGE_ROLE_ARGB];
     pal = &s->image[IMAGE_ROLE_COLOR_INDEXING];
 
     if (pal->size_reduction > 0) {
-        BitstreamContext bc_g;
+        GetBitContext gb_g;
         uint8_t *line;
         int pixel_bits = 8 >> pal->size_reduction;
 
-        line = av_malloc(img->frame->linesize[0]);
+        line = av_malloc(img->frame->linesize[0] + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!line)
             return AVERROR(ENOMEM);
 
         for (y = 0; y < img->frame->height; y++) {
             p = GET_PIXEL(img->frame, 0, y);
             memcpy(line, p, img->frame->linesize[0]);
-            bitstream_init8(&bc_g, line, img->frame->linesize[0]);
-            bitstream_skip(&bc_g, 16);
+            init_get_bits(&gb_g, line, img->frame->linesize[0] * 8);
+            skip_bits(&gb_g, 16);
             i = 0;
             for (x = 0; x < img->frame->width; x++) {
                 p    = GET_PIXEL(img->frame, x, y);
-                p[2] = bitstream_read(&bc_g, pixel_bits);
+                p[2] = get_bits(&gb_g, pixel_bits);
                 i++;
                 if (i == 1 << pal->size_reduction) {
-                    bitstream_skip(&bc_g, 24);
+                    skip_bits(&gb_g, 24);
                     i = 0;
                 }
             }
@@ -1055,22 +1067,54 @@ static int apply_color_indexing_transform(WebPContext *s)
         av_free(line);
     }
 
-    for (y = 0; y < img->frame->height; y++) {
-        for (x = 0; x < img->frame->width; x++) {
-            p = GET_PIXEL(img->frame, x, y);
-            i = p[2];
-            if (i >= pal->frame->width) {
-                av_log(s->avctx, AV_LOG_ERROR, "invalid palette index %d\n", i);
-                return AVERROR_INVALIDDATA;
+    // switch to local palette if it's worth initializing it
+    if (img->frame->height * img->frame->width > 300) {
+        uint8_t palette[256 * 4];
+        const int size = pal->frame->width * 4;
+        av_assert0(size <= 1024U);
+        memcpy(palette, GET_PIXEL(pal->frame, 0, 0), size);   // copy palette
+        // set extra entries to transparent black
+        memset(palette + size, 0, 256 * 4 - size);
+        for (y = 0; y < img->frame->height; y++) {
+            for (x = 0; x < img->frame->width; x++) {
+                p = GET_PIXEL(img->frame, x, y);
+                i = p[2];
+                AV_COPY32(p, &palette[i * 4]);
+            }
+        }
+    } else {
+        for (y = 0; y < img->frame->height; y++) {
+            for (x = 0; x < img->frame->width; x++) {
+                p = GET_PIXEL(img->frame, x, y);
+                i = p[2];
+                if (i >= pal->frame->width) {
+                    AV_WB32(p, 0x00000000);
+                } else {
+                    const uint8_t *pi = GET_PIXEL(pal->frame, i, 0);
+                    AV_COPY32(p, pi);
+                }
             }
-            pi = GET_PIXEL(pal->frame, i, 0);
-            AV_COPY32(p, pi);
         }
     }
 
     return 0;
 }
 
+static void update_canvas_size(AVCodecContext *avctx, int w, int h)
+{
+    WebPContext *s = avctx->priv_data;
+    if (s->width && s->width != w) {
+        av_log(avctx, AV_LOG_WARNING, "Width mismatch. %d != %d\n",
+               s->width, w);
+    }
+    s->width = w;
+    if (s->height && s->height != h) {
+        av_log(avctx, AV_LOG_WARNING, "Height mismatch. %d != %d\n",
+               s->height, h);
+    }
+    s->height = h;
+}
+
 static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
                                      int *got_frame, uint8_t *data_start,
                                      unsigned int data_size, int is_alpha_chunk)
@@ -1083,36 +1127,28 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
     }
 
-    ret = bitstream_init8(&s->bc, data_start, data_size);
+    ret = init_get_bits8(&s->gb, data_start, data_size);
     if (ret < 0)
         return ret;
 
     if (!is_alpha_chunk) {
-        if (bitstream_read(&s->bc, 8) != 0x2F) {
+        if (get_bits(&s->gb, 8) != 0x2F) {
             av_log(avctx, AV_LOG_ERROR, "Invalid WebP Lossless signature\n");
             return AVERROR_INVALIDDATA;
         }
 
-        w = bitstream_read(&s->bc, 14) + 1;
-        h = bitstream_read(&s->bc, 14) + 1;
-        if (s->width && s->width != w) {
-            av_log(avctx, AV_LOG_WARNING, "Width mismatch. %d != %d\n",
-                   s->width, w);
-        }
-        s->width = w;
-        if (s->height && s->height != h) {
-            av_log(avctx, AV_LOG_WARNING, "Height mismatch. %d != %d\n",
-                   s->width, w);
-        }
-        s->height = h;
+        w = get_bits(&s->gb, 14) + 1;
+        h = get_bits(&s->gb, 14) + 1;
+
+        update_canvas_size(avctx, w, h);
 
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
 
-        s->has_alpha = bitstream_read_bit(&s->bc);
+        s->has_alpha = get_bits1(&s->gb);
 
-        if (bitstream_read(&s->bc, 3) != 0x0) {
+        if (get_bits(&s->gb, 3) != 0x0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid WebP Lossless version\n");
             return AVERROR_INVALIDDATA;
         }
@@ -1127,9 +1163,8 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
     s->nb_transforms = 0;
     s->reduced_width = 0;
     used = 0;
-    while (bitstream_read_bit(&s->bc)) {
-        enum TransformType transform = bitstream_read(&s->bc, 2);
-        s->transforms[s->nb_transforms++] = transform;
+    while (get_bits1(&s->gb)) {
+        enum TransformType transform = get_bits(&s->gb, 2);
         if (used & (1 << transform)) {
             av_log(avctx, AV_LOG_ERROR, "Transform %d used more than once\n",
                    transform);
@@ -1137,6 +1172,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
             goto free_and_return;
         }
         used |= (1 << transform);
+        s->transforms[s->nb_transforms++] = transform;
         switch (transform) {
         case PREDICTOR_TRANSFORM:
             ret = parse_transform_predictor(s);
@@ -1300,11 +1336,8 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
         ff_vp8_decode_init(avctx);
         s->initialized = 1;
         s->v.actually_webp = 1;
-        if (s->has_alpha)
-            avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     }
+    avctx->pix_fmt = s->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
     s->lossless = 0;
 
     if (data_size > INT_MAX) {
@@ -1317,6 +1350,14 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
     pkt.size = data_size;
 
     ret = ff_vp8_decode_frame(avctx, p, got_frame, &pkt);
+    if (ret < 0)
+        return ret;
+
+    if (!*got_frame)
+        return AVERROR_INVALIDDATA;
+
+    update_canvas_size(avctx, avctx->width, avctx->height);
+
     if (s->has_alpha) {
         ret = vp8_lossy_decode_alpha(avctx, p, s->alpha_data,
                                      s->alpha_data_size);
@@ -1341,6 +1382,8 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     s->height    = 0;
     *got_frame   = 0;
     s->has_alpha = 0;
+    s->has_exif  = 0;
+    s->has_iccp  = 0;
     bytestream2_init(&gb, avpkt->data, avpkt->size);
 
     if (bytestream2_get_bytes_left(&gb) < 12)
@@ -1390,10 +1433,15 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                                                 chunk_size, 0);
                 if (ret < 0)
                     return ret;
+                avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             }
             bytestream2_skip(&gb, chunk_size);
             break;
         case MKTAG('V', 'P', '8', 'X'):
+            if (s->width || s->height || *got_frame) {
+                av_log(avctx, AV_LOG_ERROR, "Canvas dimensions are already set\n");
+                return AVERROR_INVALIDDATA;
+            }
             vp8x_flags = bytestream2_get_byte(&gb);
             bytestream2_skip(&gb, 3);
             s->width  = bytestream2_get_le24(&gb) + 1;
@@ -1433,13 +1481,68 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
             break;
         }
-        case MKTAG('I', 'C', 'C', 'P'):
+        case MKTAG('E', 'X', 'I', 'F'): {
+            int le, ifd_offset, exif_offset = bytestream2_tell(&gb);
+            AVDictionary *exif_metadata = NULL;
+            GetByteContext exif_gb;
+
+            if (s->has_exif) {
+                av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra EXIF chunk\n");
+                goto exif_end;
+            }
+            if (!(vp8x_flags & VP8X_FLAG_EXIF_METADATA))
+                av_log(avctx, AV_LOG_WARNING,
+                       "EXIF chunk present, but Exif bit not set in the "
+                       "VP8X header\n");
+
+            s->has_exif = 1;
+            bytestream2_init(&exif_gb, avpkt->data + exif_offset,
+                             avpkt->size - exif_offset);
+            if (ff_tdecode_header(&exif_gb, &le, &ifd_offset) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "invalid TIFF header "
+                       "in Exif data\n");
+                goto exif_end;
+            }
+
+            bytestream2_seek(&exif_gb, ifd_offset, SEEK_SET);
+            if (ff_exif_decode_ifd(avctx, &exif_gb, le, 0, &exif_metadata) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "error decoding Exif data\n");
+                goto exif_end;
+            }
+
+            av_dict_copy(&((AVFrame *) data)->metadata, exif_metadata, 0);
+
+exif_end:
+            av_dict_free(&exif_metadata);
+            bytestream2_skip(&gb, chunk_size);
+            break;
+        }
+        case MKTAG('I', 'C', 'C', 'P'): {
+            AVFrameSideData *sd;
+
+            if (s->has_iccp) {
+                av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra ICCP chunk\n");
+                bytestream2_skip(&gb, chunk_size);
+                break;
+            }
+            if (!(vp8x_flags & VP8X_FLAG_ICC))
+                av_log(avctx, AV_LOG_WARNING,
+                       "ICCP chunk present, but ICC Profile bit not set in the "
+                       "VP8X header\n");
+
+            s->has_iccp = 1;
+            sd = av_frame_new_side_data(p, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
+            if (!sd)
+                return AVERROR(ENOMEM);
+
+            bytestream2_get_buffer(&gb, sd->data, chunk_size);
+            break;
+        }
         case MKTAG('A', 'N', 'I', 'M'):
         case MKTAG('A', 'N', 'M', 'F'):
-        case MKTAG('E', 'X', 'I', 'F'):
         case MKTAG('X', 'M', 'P', ' '):
             AV_WL32(chunk_str, chunk_type);
-            av_log(avctx, AV_LOG_VERBOSE, "skipping unsupported chunk: %s\n",
+            av_log(avctx, AV_LOG_WARNING, "skipping unsupported chunk: %s\n",
                    chunk_str);
             bytestream2_skip(&gb, chunk_size);
             break;
diff --git a/libavcodec/webvttdec.c b/libavcodec/webvttdec.c
new file mode 100644
index 0000000..7b2d175
--- /dev/null
+++ b/libavcodec/webvttdec.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebVTT subtitle decoder
+ * @see http://dev.w3.org/html5/webvtt/
+ * @todo need to support extended markups and cue settings
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static const struct {
+    const char *from;
+    const char *to;
+} webvtt_tag_replace[] = {
+    {"<i>", "{\\i1}"}, {"</i>", "{\\i0}"},
+    {"<b>", "{\\b1}"}, {"</b>", "{\\b0}"},
+    {"<u>", "{\\u1}"}, {"</u>", "{\\u0}"},
+    {"{", "\\{"}, {"}", "\\}"}, // escape to avoid ASS markup conflicts
+    {"&gt;", ">"}, {"&lt;", "<"},
+    {"&lrm;", ""}, {"&rlm;", ""}, // FIXME: properly honor bidi marks
+    {"&amp;", "&"}, {"&nbsp;", "\\h"},
+};
+
+static int webvtt_event_to_ass(AVBPrint *buf, const char *p)
+{
+    int i, again = 0, skip = 0;
+
+    while (*p) {
+
+        for (i = 0; i < FF_ARRAY_ELEMS(webvtt_tag_replace); i++) {
+            const char *from = webvtt_tag_replace[i].from;
+            const size_t len = strlen(from);
+            if (!strncmp(p, from, len)) {
+                av_bprintf(buf, "%s", webvtt_tag_replace[i].to);
+                p += len;
+                again = 1;
+                break;
+            }
+        }
+        if (!*p)
+            break;
+
+        if (again) {
+            again = 0;
+            skip = 0;
+            continue;
+        }
+        if (*p == '<')
+            skip = 1;
+        else if (*p == '>')
+            skip = 0;
+        else if (p[0] == '\n' && p[1])
+            av_bprintf(buf, "\\N");
+        else if (!skip && *p != '\r')
+            av_bprint_chars(buf, *p, 1);
+        p++;
+    }
+    return 0;
+}
+
+static int webvtt_decode_frame(AVCodecContext *avctx,
+                               void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    FFASSDecoderContext *s = avctx->priv_data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && !webvtt_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect(sub, buf.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_webvtt_decoder = {
+    .name           = "webvtt",
+    .long_name      = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_WEBVTT,
+    .decode         = webvtt_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/webvttenc.c b/libavcodec/webvttenc.c
new file mode 100644
index 0000000..c84bbf4
--- /dev/null
+++ b/libavcodec/webvttenc.c
@@ -0,0 +1,236 @@
+/*
+ * WebVTT subtitle encoder
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (c) 2014  Aman Gupta <ffmpeg@tmm1.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "ass_split.h"
+#include "ass.h"
+
+#define WEBVTT_STACK_SIZE 64
+typedef struct {
+    AVCodecContext *avctx;
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;
+    unsigned timestamp_end;
+    int count;
+    char stack[WEBVTT_STACK_SIZE];
+    int stack_ptr;
+} WebVTTContext;
+
+#ifdef __GNUC__
+__attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+static void webvtt_print(WebVTTContext *s, const char *str, ...)
+{
+    va_list vargs;
+    va_start(vargs, str);
+    av_vbprintf(&s->buffer, str, vargs);
+    va_end(vargs);
+}
+
+static int webvtt_stack_push(WebVTTContext *s, const char c)
+{
+    if (s->stack_ptr >= WEBVTT_STACK_SIZE)
+        return -1;
+    s->stack[s->stack_ptr++] = c;
+    return 0;
+}
+
+static char webvtt_stack_pop(WebVTTContext *s)
+{
+    if (s->stack_ptr <= 0)
+        return 0;
+    return s->stack[--s->stack_ptr];
+}
+
+static int webvtt_stack_find(WebVTTContext *s, const char c)
+{
+    int i;
+    for (i = s->stack_ptr-1; i >= 0; i--)
+        if (s->stack[i] == c)
+            break;
+    return i;
+}
+
+static void webvtt_close_tag(WebVTTContext *s, char tag)
+{
+    webvtt_print(s, "</%c>", tag);
+}
+
+static void webvtt_stack_push_pop(WebVTTContext *s, const char c, int close)
+{
+    if (close) {
+        int i = c ? webvtt_stack_find(s, c) : 0;
+        if (i < 0)
+            return;
+        while (s->stack_ptr != i)
+            webvtt_close_tag(s, webvtt_stack_pop(s));
+    } else if (webvtt_stack_push(s, c) < 0)
+        av_log(s->avctx, AV_LOG_ERROR, "tag stack overflow\n");
+}
+
+static void webvtt_style_apply(WebVTTContext *s, const char *style)
+{
+    ASSStyle *st = ff_ass_style_get(s->ass_ctx, style);
+    if (st) {
+        if (st->bold != ASS_DEFAULT_BOLD) {
+            webvtt_print(s, "<b>");
+            webvtt_stack_push(s, 'b');
+        }
+        if (st->italic != ASS_DEFAULT_ITALIC) {
+            webvtt_print(s, "<i>");
+            webvtt_stack_push(s, 'i');
+        }
+        if (st->underline != ASS_DEFAULT_UNDERLINE) {
+            webvtt_print(s, "<u>");
+            webvtt_stack_push(s, 'u');
+        }
+    }
+}
+
+static void webvtt_text_cb(void *priv, const char *text, int len)
+{
+    WebVTTContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+}
+
+static void webvtt_new_line_cb(void *priv, int forced)
+{
+    webvtt_print(priv, "\n");
+}
+
+static void webvtt_style_cb(void *priv, char style, int close)
+{
+    if (style == 's') // strikethrough unsupported
+        return;
+
+    webvtt_stack_push_pop(priv, style, close);
+    if (!close)
+        webvtt_print(priv, "<%c>", style);
+}
+
+static void webvtt_cancel_overrides_cb(void *priv, const char *style)
+{
+    webvtt_stack_push_pop(priv, 0, 1);
+    webvtt_style_apply(priv, style);
+}
+
+static void webvtt_end_cb(void *priv)
+{
+    webvtt_stack_push_pop(priv, 0, 1);
+}
+
+static const ASSCodesCallbacks webvtt_callbacks = {
+    .text             = webvtt_text_cb,
+    .new_line         = webvtt_new_line_cb,
+    .style            = webvtt_style_cb,
+    .color            = NULL,
+    .font_name        = NULL,
+    .font_size        = NULL,
+    .alignment        = NULL,
+    .cancel_overrides = webvtt_cancel_overrides_cb,
+    .move             = NULL,
+    .end              = webvtt_end_cb,
+};
+
+static int webvtt_encode_frame(AVCodecContext *avctx,
+                               unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    WebVTTContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i;
+
+    av_bprint_clear(&s->buffer);
+
+    for (i=0; i<sub->num_rects; i++) {
+        const char *ass = sub->rects[i]->ass;
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            int num;
+            dialog = ff_ass_split_dialog(s->ass_ctx, ass, 0, &num);
+            // TODO reindent
+        for (; dialog && num--; dialog++) {
+            webvtt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(&webvtt_callbacks, s, dialog->text);
+        }
+        } else {
+#endif
+            dialog = ff_ass_split_dialog2(s->ass_ctx, ass);
+            if (!dialog)
+                return AVERROR(ENOMEM);
+            webvtt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(&webvtt_callbacks, s, dialog->text);
+            ff_ass_free_dialog(&dialog);
+#if FF_API_ASS_TIMING
+        }
+#endif
+    }
+
+    if (!av_bprint_is_complete(&s->buffer))
+        return AVERROR(ENOMEM);
+    if (!s->buffer.len)
+        return 0;
+
+    if (s->buffer.len > bufsize) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        return -1;
+    }
+    memcpy(buf, s->buffer.str, s->buffer.len);
+
+    return s->buffer.len;
+}
+
+static int webvtt_encode_close(AVCodecContext *avctx)
+{
+    WebVTTContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+static av_cold int webvtt_encode_init(AVCodecContext *avctx)
+{
+    WebVTTContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+AVCodec ff_webvtt_encoder = {
+    .name           = "webvtt",
+    .long_name      = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_WEBVTT,
+    .priv_data_size = sizeof(WebVTTContext),
+    .init           = webvtt_encode_init,
+    .encode_sub     = webvtt_encode_frame,
+    .close          = webvtt_encode_close,
+};
diff --git a/libavcodec/wma.c b/libavcodec/wma.c
index 697b41b..b499209 100644
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -1,28 +1,27 @@
 /*
  * WMA compatible codec
- * Copyright (c) 2002-2007 The Libav Project
+ * Copyright (c) 2002-2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
 #include "sinewin.h"
 #include "wma.h"
@@ -46,10 +45,10 @@ static av_cold int init_coef_vlc(VLC *vlc, uint16_t **prun_table,
 
     init_vlc(vlc, VLCBITS, n, table_bits, 1, 1, table_codes, 4, 4, 0);
 
-    run_table    = av_malloc(n * sizeof(uint16_t));
-    level_table  = av_malloc(n * sizeof(uint16_t));
-    flevel_table = av_malloc(n * sizeof(*flevel_table));
-    int_table    = av_malloc(n * sizeof(uint16_t));
+    run_table    = av_malloc_array(n, sizeof(uint16_t));
+    level_table  = av_malloc_array(n, sizeof(uint16_t));
+    flevel_table = av_malloc_array(n, sizeof(*flevel_table));
+    int_table    = av_malloc_array(n, sizeof(uint16_t));
     if (!run_table || !level_table || !flevel_table || !int_table) {
         av_freep(&run_table);
         av_freep(&level_table);
@@ -93,7 +92,6 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
         avctx->bit_rate    <= 0)
         return -1;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     if (avctx->codec->id == AV_CODEC_ID_WMAV1)
         s->version = 1;
@@ -142,6 +140,10 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
     bps                 = (float) avctx->bit_rate /
                           (float) (avctx->channels * avctx->sample_rate);
     s->byte_offset_bits = av_log2((int) (bps * s->frame_len / 8.0 + 0.5)) + 2;
+    if (s->byte_offset_bits + 3 > MIN_CACHE_BITS) {
+        av_log(avctx, AV_LOG_ERROR, "byte_offset_bits %d is too large\n", s->byte_offset_bits);
+        return AVERROR_PATCHWELCOME;
+    }
 
     /* compute high frequency value and choose if noise coding should
      * be activated */
@@ -183,7 +185,7 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
             high_freq = high_freq * 0.5;
     }
     ff_dlog(s->avctx, "flags2=0x%x\n", flags2);
-    ff_dlog(s->avctx, "version=%d channels=%d sample_rate=%d bitrate=%d block_align=%d\n",
+    ff_dlog(s->avctx, "version=%d channels=%d sample_rate=%d bitrate=%"PRId64" block_align=%d\n",
             s->version, avctx->channels, avctx->sample_rate, avctx->bit_rate,
             avctx->block_align);
     ff_dlog(s->avctx, "bps=%f bps1=%f high_freq=%f bitoffset=%d\n",
@@ -326,6 +328,10 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
 #endif /* TRACE */
     }
 
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+
     /* choose the VLC tables for the coefficients */
     coef_vlc_table = 2;
     if (avctx->sample_rate >= 32000) {
@@ -373,40 +379,41 @@ int ff_wma_end(AVCodecContext *avctx)
         ff_free_vlc(&s->hgain_vlc);
     for (i = 0; i < 2; i++) {
         ff_free_vlc(&s->coef_vlc[i]);
-        av_free(s->run_table[i]);
-        av_free(s->level_table[i]);
-        av_free(s->int_table[i]);
+        av_freep(&s->run_table[i]);
+        av_freep(&s->level_table[i]);
+        av_freep(&s->int_table[i]);
     }
+    av_freep(&s->fdsp);
 
     return 0;
 }
 
 /**
  * Decode an uncompressed coefficient.
- * @param bc BitstreamContext
+ * @param gb GetBitContext
  * @return the decoded coefficient
  */
-unsigned int ff_wma_get_large_val(BitstreamContext *bc)
+unsigned int ff_wma_get_large_val(GetBitContext *gb)
 {
     /** consumes up to 34 bits */
     int n_bits = 8;
     /** decode length */
-    if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
         n_bits += 8;
-        if (bitstream_read_bit(bc)) {
+        if (get_bits1(gb)) {
             n_bits += 8;
-            if (bitstream_read_bit(bc))
+            if (get_bits1(gb))
                 n_bits += 7;
         }
     }
-    return bitstream_read(bc, n_bits);
+    return get_bits_long(gb, n_bits);
 }
 
 /**
  * Decode run level compressed coefficients.
  * @param avctx codec context
- * @param bc bitstream reader context
- * @param vlc VLC table for bitstream_read_vlc
+ * @param gb bitstream reader context
+ * @param vlc vlc table for get_vlc2
  * @param level_table level codes
  * @param run_table run codes
  * @param version 0 for wma1,2 1 for wmapro
@@ -418,7 +425,7 @@ unsigned int ff_wma_get_large_val(BitstreamContext *bc)
  * @param coef_nb_bits number of bits for escaped level codes
  * @return 0 on success, -1 otherwise
  */
-int ff_wma_run_level_decode(AVCodecContext *avctx, BitstreamContext *bc,
+int ff_wma_run_level_decode(AVCodecContext *avctx, GetBitContext *gb,
                             VLC *vlc, const float *level_table,
                             const uint16_t *run_table, int version,
                             WMACoef *ptr, int offset, int num_coefs,
@@ -430,44 +437,48 @@ int ff_wma_run_level_decode(AVCodecContext *avctx, BitstreamContext *bc,
     uint32_t *iptr = (uint32_t *) ptr;
     const unsigned int coef_mask = block_len - 1;
     for (; offset < num_coefs; offset++) {
-        code = bitstream_read_vlc(bc, vlc->table, VLCBITS, VLCMAX);
+        code = get_vlc2(gb, vlc->table, VLCBITS, VLCMAX);
         if (code > 1) {
             /** normal code */
             offset                  += run_table[code];
-            sign                     = bitstream_read_bit(bc) - 1;
-            iptr[offset & coef_mask] = ilvl[code] ^ sign << 31;
+            sign                     = get_bits1(gb) - 1;
+            iptr[offset & coef_mask] = ilvl[code] ^ (sign & 0x80000000);
         } else if (code == 1) {
             /** EOB */
             break;
         } else {
             /** escape */
             if (!version) {
-                level = bitstream_read(bc, coef_nb_bits);
+                level = get_bits(gb, coef_nb_bits);
                 /** NOTE: this is rather suboptimal. reading
                  *  block_len_bits would be better */
-                offset += bitstream_read(bc, frame_len_bits);
+                offset += get_bits(gb, frame_len_bits);
             } else {
-                level = ff_wma_get_large_val(bc);
+                level = ff_wma_get_large_val(gb);
                 /** escape decode */
-                if (bitstream_read_bit(bc)) {
-                    if (bitstream_read_bit(bc)) {
-                        if (bitstream_read_bit(bc)) {
+                if (get_bits1(gb)) {
+                    if (get_bits1(gb)) {
+                        if (get_bits1(gb)) {
                             av_log(avctx, AV_LOG_ERROR,
                                    "broken escape sequence\n");
                             return -1;
                         } else
-                            offset += bitstream_read(bc, frame_len_bits) + 4;
+                            offset += get_bits(gb, frame_len_bits) + 4;
                     } else
-                        offset += bitstream_read(bc, 2) + 1;
+                        offset += get_bits(gb, 2) + 1;
                 }
             }
-            sign                    = bitstream_read_bit(bc) - 1;
+            sign                    = get_bits1(gb) - 1;
             ptr[offset & coef_mask] = (level ^ sign) - sign;
         }
     }
     /** NOTE: EOB can be omitted */
     if (offset > num_coefs) {
-        av_log(avctx, AV_LOG_ERROR, "overflow in spectral RLE, ignoring\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "overflow (%d > %d) in spectral RLE, ignoring\n",
+               offset,
+               num_coefs
+              );
         return -1;
     }
 
diff --git a/libavcodec/wma.h b/libavcodec/wma.h
index 80b8286..325f03c 100644
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -1,21 +1,21 @@
 /*
  * WMA compatible codec
- * Copyright (c) 2002-2007 The Libav Project
+ * Copyright (c) 2002-2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,10 +25,9 @@
 #include "libavutil/float_dsp.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "fft.h"
+#include "get_bits.h"
 #include "put_bits.h"
-#include "vlc.h"
 
 /* size of blocks */
 #define BLOCK_MIN_BITS 7
@@ -43,7 +42,7 @@
 #define NB_LSP_COEFS 10
 
 /* XXX: is it a suitable value ? */
-#define MAX_CODED_SUPERFRAME_SIZE 16384
+#define MAX_CODED_SUPERFRAME_SIZE 32768
 
 #define MAX_CHANNELS 2
 
@@ -67,7 +66,7 @@ typedef struct CoefVLCTable {
 
 typedef struct WMACodecContext {
     AVCodecContext *avctx;
-    BitstreamContext bc;
+    GetBitContext gb;
     PutBitContext pb;
     int version;                            ///< 1 = 0x160 (WMAV1), 2 = 0x161 (WMAV2)
     int use_bit_reservoir;
@@ -117,7 +116,7 @@ typedef struct WMACodecContext {
     DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
     DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
     FFTContext mdct_ctx[BLOCK_NB_SIZES];
-    float *windows[BLOCK_NB_SIZES];
+    const float *windows[BLOCK_NB_SIZES];
     /* output buffer for one frame and the last for IMDCT windowing */
     DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
     /* last frame info */
@@ -132,7 +131,7 @@ typedef struct WMACodecContext {
     float lsp_pow_e_table[256];
     float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
     float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
 #ifdef TRACE
     int frame_count;
@@ -145,11 +144,13 @@ extern const float ff_wma_lsp_codebook[NB_LSP_COEFS][16];
 extern const uint32_t ff_aac_scalefactor_code[121];
 extern const uint8_t  ff_aac_scalefactor_bits[121];
 
+av_warn_unused_result
 int ff_wma_init(AVCodecContext *avctx, int flags2);
+
 int ff_wma_total_gain_to_bits(int total_gain);
 int ff_wma_end(AVCodecContext *avctx);
-unsigned int ff_wma_get_large_val(BitstreamContext *bc);
-int ff_wma_run_level_decode(AVCodecContext *avctx, BitstreamContext *bc,
+unsigned int ff_wma_get_large_val(GetBitContext *gb);
+int ff_wma_run_level_decode(AVCodecContext *avctx, GetBitContext *gb,
                             VLC *vlc, const float *level_table,
                             const uint16_t *run_table, int version,
                             WMACoef *ptr, int offset, int num_coefs,
diff --git a/libavcodec/wma_common.c b/libavcodec/wma_common.c
index cf76f5c..c01e0f4 100644
--- a/libavcodec/wma_common.c
+++ b/libavcodec/wma_common.c
@@ -1,20 +1,20 @@
 /*
  * common code shared by all WMA variants
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_common.h b/libavcodec/wma_common.h
index 61b1a35..55404af 100644
--- a/libavcodec/wma_common.h
+++ b/libavcodec/wma_common.h
@@ -1,20 +1,20 @@
 /*
  * common code shared by all WMA variants
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_freqs.c b/libavcodec/wma_freqs.c
index 82cef3b..03a283f 100644
--- a/libavcodec/wma_freqs.c
+++ b/libavcodec/wma_freqs.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_freqs.h b/libavcodec/wma_freqs.h
index d40ab65..6fd93e4 100644
--- a/libavcodec/wma_freqs.h
+++ b/libavcodec/wma_freqs.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmadata.h b/libavcodec/wmadata.h
index 58bffed..641cb18 100644
--- a/libavcodec/wmadata.h
+++ b/libavcodec/wmadata.h
@@ -1,21 +1,21 @@
 /*
  * WMA compatible decoder
- * copyright (c) 2002 The Libav Project
+ * copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index fcbac93..78b51e5 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -1,21 +1,21 @@
 /*
  * WMA compatible decoder
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,9 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
 #include "wma.h"
 
@@ -92,6 +92,16 @@ static av_cold int wma_decode_init(AVCodecContext *avctx)
     s->use_bit_reservoir      = flags2 & 0x0002;
     s->use_variable_block_len = flags2 & 0x0004;
 
+    if (avctx->codec->id == AV_CODEC_ID_WMAV2 && avctx->extradata_size >= 8){
+        if (AV_RL16(extradata+4)==0xd && s->use_variable_block_len){
+            av_log(avctx, AV_LOG_WARNING, "Disabling use_variable_block_len, if this fails contact the ffmpeg developers and send us the file\n");
+            s->use_variable_block_len= 0; // this fixes issue1503
+        }
+    }
+
+    for (i=0; i<MAX_CHANNELS; i++)
+        s->max_exponent[i] = 1.0;
+
     if (ff_wma_init(avctx, flags2) < 0)
         return -1;
 
@@ -154,7 +164,7 @@ static av_cold void wma_lsp_to_curve_init(WMACodecContext *s, int frame_len)
     /* tables for x^-0.25 computation */
     for (i = 0; i < 256; i++) {
         e                     = i - 126;
-        s->lsp_pow_e_table[i] = pow(2.0, e * -0.25);
+        s->lsp_pow_e_table[i] = exp2f(e * -0.25);
     }
 
     /* NOTE: these two tables are needed to avoid two operations in
@@ -163,7 +173,7 @@ static av_cold void wma_lsp_to_curve_init(WMACodecContext *s, int frame_len)
     for (i = (1 << LSP_POW_BITS) - 1; i >= 0; i--) {
         m                      = (1 << LSP_POW_BITS) + i;
         a                      = (float) m * (0.5 / (1 << LSP_POW_BITS));
-        a                      = pow(a, -0.25);
+        a                      = 1/sqrt(sqrt(a));
         s->lsp_pow_m_table1[i] = 2 * a - b;
         s->lsp_pow_m_table2[i] = b - a;
         b                      = a;
@@ -210,9 +220,9 @@ static void decode_exp_lsp(WMACodecContext *s, int ch)
 
     for (i = 0; i < NB_LSP_COEFS; i++) {
         if (i == 0 || i >= 8)
-            val = bitstream_read(&s->bc, 3);
+            val = get_bits(&s->gb, 3);
         else
-            val = bitstream_read(&s->bc, 4);
+            val = get_bits(&s->gb, 4);
         lsp_coefs[i] = ff_wma_lsp_codebook[i][val];
     }
 
@@ -319,7 +329,7 @@ static int decode_exp_vlc(WMACodecContext *s, int ch)
     q_end     = q + s->block_len;
     max_scale = 0;
     if (s->version == 1) {
-        last_exp  = bitstream_read(&s->bc, 5) + 10;
+        last_exp  = get_bits(&s->gb, 5) + 10;
         v         = ptab[last_exp];
         iv        = iptab[last_exp];
         max_scale = v;
@@ -334,7 +344,7 @@ static int decode_exp_vlc(WMACodecContext *s, int ch)
         last_exp = 36;
 
     while (q < q_end) {
-        code = bitstream_read_vlc(&s->bc, s->exp_vlc.table, EXPVLCBITS, EXPMAX);
+        code = get_vlc2(&s->gb, s->exp_vlc.table, EXPVLCBITS, EXPMAX);
         if (code < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Exponent vlc invalid\n");
             return -1;
@@ -378,14 +388,14 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize     = s->frame_len_bits - s->block_len_bits;
 
-        s->fdsp.vector_fmul_add(out, in, s->windows[bsize],
+        s->fdsp->vector_fmul_add(out, in, s->windows[bsize],
                                 out, block_len);
     } else {
         block_len = 1 << s->prev_block_len_bits;
         n         = (s->block_len - block_len) / 2;
         bsize     = s->frame_len_bits - s->prev_block_len_bits;
 
-        s->fdsp.vector_fmul_add(out + n, in + n, s->windows[bsize],
+        s->fdsp->vector_fmul_add(out + n, in + n, s->windows[bsize],
                                 out + n, block_len);
 
         memcpy(out + n + block_len, in + n + block_len, n * sizeof(float));
@@ -399,7 +409,7 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize     = s->frame_len_bits - s->block_len_bits;
 
-        s->fdsp.vector_fmul_reverse(out, in, s->windows[bsize], block_len);
+        s->fdsp->vector_fmul_reverse(out, in, s->windows[bsize], block_len);
     } else {
         block_len = 1 << s->next_block_len_bits;
         n         = (s->block_len - block_len) / 2;
@@ -407,7 +417,7 @@ static void wma_window(WMACodecContext *s, float *out)
 
         memcpy(out, in, n * sizeof(float));
 
-        s->fdsp.vector_fmul_reverse(out + n, in + n, s->windows[bsize],
+        s->fdsp->vector_fmul_reverse(out + n, in + n, s->windows[bsize],
                                     block_len);
 
         memset(out + n + block_len, 0, n * sizeof(float));
@@ -437,7 +447,7 @@ static int wma_decode_block(WMACodecContext *s)
 
         if (s->reset_block_lengths) {
             s->reset_block_lengths = 0;
-            v                      = bitstream_read(&s->bc, n);
+            v                      = get_bits(&s->gb, n);
             if (v >= s->nb_block_sizes) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "prev_block_len_bits %d out of range\n",
@@ -445,7 +455,7 @@ static int wma_decode_block(WMACodecContext *s)
                 return -1;
             }
             s->prev_block_len_bits = s->frame_len_bits - v;
-            v                      = bitstream_read(&s->bc, n);
+            v                      = get_bits(&s->gb, n);
             if (v >= s->nb_block_sizes) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "block_len_bits %d out of range\n",
@@ -458,7 +468,7 @@ static int wma_decode_block(WMACodecContext *s)
             s->prev_block_len_bits = s->block_len_bits;
             s->block_len_bits      = s->next_block_len_bits;
         }
-        v = bitstream_read(&s->bc, n);
+        v = get_bits(&s->gb, n);
         if (v >= s->nb_block_sizes) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "next_block_len_bits %d out of range\n",
@@ -473,6 +483,11 @@ static int wma_decode_block(WMACodecContext *s)
         s->block_len_bits      = s->frame_len_bits;
     }
 
+    if (s->frame_len_bits - s->block_len_bits >= s->nb_block_sizes){
+        av_log(s->avctx, AV_LOG_ERROR, "block_len_bits not initialized to a valid value\n");
+        return -1;
+    }
+
     /* now check if the block length is coherent with the frame length */
     s->block_len = 1 << s->block_len_bits;
     if ((s->block_pos + s->block_len) > s->frame_len) {
@@ -481,10 +496,10 @@ static int wma_decode_block(WMACodecContext *s)
     }
 
     if (s->avctx->channels == 2)
-        s->ms_stereo = bitstream_read_bit(&s->bc);
+        s->ms_stereo = get_bits1(&s->gb);
     v = 0;
     for (ch = 0; ch < s->avctx->channels; ch++) {
-        a                    = bitstream_read_bit(&s->bc);
+        a                    = get_bits1(&s->gb);
         s->channel_coded[ch] = a;
         v                   |= a;
     }
@@ -500,7 +515,11 @@ static int wma_decode_block(WMACodecContext *s)
      * coef escape coding */
     total_gain = 1;
     for (;;) {
-        a           = bitstream_read(&s->bc, 7);
+        if (get_bits_left(&s->gb) < 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "total_gain overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+        a           = get_bits(&s->gb, 7);
         total_gain += a;
         if (a != 127)
             break;
@@ -520,7 +539,7 @@ static int wma_decode_block(WMACodecContext *s)
                 int i, n, a;
                 n = s->exponent_high_sizes[bsize];
                 for (i = 0; i < n; i++) {
-                    a                         = bitstream_read_bit(&s->bc);
+                    a                         = get_bits1(&s->gb);
                     s->high_band_coded[ch][i] = a;
                     /* if noise coding, the coefficients are not transmitted */
                     if (a)
@@ -537,11 +556,10 @@ static int wma_decode_block(WMACodecContext *s)
                 for (i = 0; i < n; i++) {
                     if (s->high_band_coded[ch][i]) {
                         if (val == (int) 0x80000000) {
-                            val = bitstream_read(&s->bc, 7) - 19;
+                            val = get_bits(&s->gb, 7) - 19;
                         } else {
-                            code = bitstream_read_vlc(&s->bc,
-                                                      s->hgain_vlc.table,
-                                                      HGAINVLCBITS, HGAINMAX);
+                            code = get_vlc2(&s->gb, s->hgain_vlc.table,
+                                            HGAINVLCBITS, HGAINMAX);
                             if (code < 0) {
                                 av_log(s->avctx, AV_LOG_ERROR,
                                        "hgain vlc invalid\n");
@@ -557,7 +575,7 @@ static int wma_decode_block(WMACodecContext *s)
     }
 
     /* exponents can be reused in short blocks. */
-    if ((s->block_len_bits == s->frame_len_bits) || bitstream_read_bit(&s->bc)) {
+    if ((s->block_len_bits == s->frame_len_bits) || get_bits1(&s->gb)) {
         for (ch = 0; ch < s->avctx->channels; ch++) {
             if (s->channel_coded[ch]) {
                 if (s->use_exp_vlc) {
@@ -581,13 +599,13 @@ static int wma_decode_block(WMACodecContext *s)
              * there is potentially less energy there */
             tindex = (ch == 1 && s->ms_stereo);
             memset(ptr, 0, s->block_len * sizeof(WMACoef));
-            ff_wma_run_level_decode(s->avctx, &s->bc, &s->coef_vlc[tindex],
+            ff_wma_run_level_decode(s->avctx, &s->gb, &s->coef_vlc[tindex],
                                     s->level_table[tindex], s->run_table[tindex],
                                     0, ptr, 0, nb_coefs[ch],
                                     s->block_len, s->frame_len_bits, coef_nb_bits);
         }
         if (s->version == 1 && s->avctx->channels >= 2)
-            bitstream_align(&s->bc);
+            align_get_bits(&s->gb);
     }
 
     /* normalize */
@@ -609,7 +627,7 @@ static int wma_decode_block(WMACodecContext *s)
             coefs1    = s->coefs1[ch];
             exponents = s->exponents[ch];
             esize     = s->exponents_bsize[ch];
-            mult      = pow(10, total_gain * 0.05) / s->max_exponent[ch];
+            mult      = ff_exp10(total_gain * 0.05) / s->max_exponent[ch];
             mult     *= mdct_norm;
             coefs     = s->coefs[ch];
             if (s->use_noise_coding) {
@@ -657,7 +675,7 @@ static int wma_decode_block(WMACodecContext *s)
                         /* use noise with specified power */
                         mult1 = sqrt(exp_power[j] / exp_power[last_high_band]);
                         /* XXX: use a table */
-                        mult1  = mult1 * pow(10, s->high_band_values[ch][j] * 0.05);
+                        mult1  = mult1 * ff_exp10(s->high_band_values[ch][j] * 0.05);
                         mult1  = mult1 / (s->max_exponent[ch] * s->noise_mult);
                         mult1 *= mdct_norm;
                         for (i = 0; i < n; i++) {
@@ -680,7 +698,7 @@ static int wma_decode_block(WMACodecContext *s)
 
                 /* very high freqs : noise */
                 n     = s->block_len - s->coefs_end[bsize];
-                mult1 = mult * exponents[((-1 << bsize)) >> esize];
+                mult1 = mult * exponents[(-(1 << bsize)) >> esize];
                 for (i = 0; i < n; i++) {
                     *coefs++       = s->noise_table[s->noise_index] * mult1;
                     s->noise_index = (s->noise_index + 1) & (NOISE_TAB_SIZE - 1);
@@ -718,7 +736,7 @@ static int wma_decode_block(WMACodecContext *s)
             s->channel_coded[0] = 1;
         }
 
-        s->fdsp.butterflies_float(s->coefs[0], s->coefs[1], s->block_len);
+        s->fdsp->butterflies_float(s->coefs[0], s->coefs[1], s->block_len);
     }
 
 next:
@@ -810,32 +828,56 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
                buf_size, avctx->block_align);
         return AVERROR_INVALIDDATA;
     }
-    buf_size = avctx->block_align;
+    if (avctx->block_align)
+        buf_size = avctx->block_align;
 
-    bitstream_init8(&s->bc, buf, buf_size);
+    init_get_bits(&s->gb, buf, buf_size * 8);
 
     if (s->use_bit_reservoir) {
         /* read super frame header */
-        bitstream_skip(&s->bc, 4); /* super frame index */
-        nb_frames = bitstream_read(&s->bc, 4) - (s->last_superframe_len <= 0);
+        skip_bits(&s->gb, 4); /* super frame index */
+        nb_frames = get_bits(&s->gb, 4) - (s->last_superframe_len <= 0);
+        if (nb_frames <= 0) {
+            int is_error = nb_frames < 0 || get_bits_left(&s->gb) <= 8;
+            av_log(avctx, is_error ? AV_LOG_ERROR : AV_LOG_WARNING,
+                   "nb_frames is %d bits left %d\n",
+                   nb_frames, get_bits_left(&s->gb));
+            if (is_error)
+                return AVERROR_INVALIDDATA;
+
+            if ((s->last_superframe_len + buf_size - 1) >
+                MAX_CODED_SUPERFRAME_SIZE)
+                goto fail;
+
+            q   = s->last_superframe + s->last_superframe_len;
+            len = buf_size - 1;
+            while (len > 0) {
+                *q++ = get_bits (&s->gb, 8);
+                len --;
+            }
+            memset(q, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+            s->last_superframe_len += 8*buf_size - 8;
+//             s->reset_block_lengths = 1; //XXX is this needed ?
+            *got_frame_ptr = 0;
+            return buf_size;
+        }
     } else
         nb_frames = 1;
 
     /* get output buffer */
     frame->nb_samples = nb_frames * s->frame_len;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples        = (float **) frame->extended_data;
     samples_offset = 0;
 
     if (s->use_bit_reservoir) {
-        bit_offset = bitstream_read(&s->bc, s->byte_offset_bits + 3);
-        if (bit_offset > bitstream_bits_left(&s->bc)) {
+        bit_offset = get_bits(&s->gb, s->byte_offset_bits + 3);
+        if (bit_offset > get_bits_left(&s->gb)) {
             av_log(avctx, AV_LOG_ERROR,
                    "Invalid last frame bit offset %d > buf size %d (%d)\n",
-                   bit_offset, bitstream_bits_left(&s->bc), buf_size);
+                   bit_offset, get_bits_left(&s->gb), buf_size);
             goto fail;
         }
 
@@ -847,19 +889,19 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
             q   = s->last_superframe + s->last_superframe_len;
             len = bit_offset;
             while (len > 7) {
-                *q++ = bitstream_read(&s->bc, 8);
+                *q++ = (get_bits) (&s->gb, 8);
                 len -= 8;
             }
             if (len > 0)
-                *q++ = bitstream_read(&s->bc, len) << (8 - len);
+                *q++ = (get_bits) (&s->gb, len) << (8 - len);
             memset(q, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
             /* XXX: bit_offset bits into last frame */
-            bitstream_init(&s->bc, s->last_superframe,
-                           s->last_superframe_len * 8 + bit_offset);
+            init_get_bits(&s->gb, s->last_superframe,
+                          s->last_superframe_len * 8 + bit_offset);
             /* skip unused bits */
             if (s->last_bitoffset > 0)
-                bitstream_skip(&s->bc, s->last_bitoffset);
+                skip_bits(&s->gb, s->last_bitoffset);
             /* this frame is stored in the last superframe and in the
              * current one */
             if (wma_decode_frame(s, samples, samples_offset) < 0)
@@ -872,10 +914,10 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         pos = bit_offset + 4 + 4 + s->byte_offset_bits + 3;
         if (pos >= MAX_CODED_SUPERFRAME_SIZE * 8 || pos > buf_size * 8)
             return AVERROR_INVALIDDATA;
-        bitstream_init8(&s->bc, buf + (pos >> 3), buf_size - (pos >> 3));
+        init_get_bits(&s->gb, buf + (pos >> 3), (buf_size - (pos >> 3)) * 8);
         len = pos & 7;
         if (len > 0)
-            bitstream_skip(&s->bc, len);
+            skip_bits(&s->gb, len);
 
         s->reset_block_lengths = 1;
         for (i = 0; i < nb_frames; i++) {
@@ -885,7 +927,7 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         }
 
         /* we copy the end of the frame in the last frame buffer */
-        pos               = bitstream_tell(&s->bc) +
+        pos               = get_bits_count(&s->gb) +
                             ((bit_offset + 4 + 4 + s->byte_offset_bits + 3) & ~7);
         s->last_bitoffset = pos & 7;
         pos             >>= 3;
@@ -903,13 +945,13 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         samples_offset += s->frame_len;
     }
 
-    ff_dlog(s->avctx, "%d %d %d %d outbytes:%td eaten:%d\n",
+    ff_dlog(s->avctx, "%d %d %d %d outbytes:%"PTRDIFF_SPECIFIER" eaten:%d\n",
             s->frame_len_bits, s->block_len_bits, s->frame_len, s->block_len,
             (int8_t *) samples - (int8_t *) data, avctx->block_align);
 
     *got_frame_ptr = 1;
 
-    return avctx->block_align;
+    return buf_size;
 
 fail:
     /* when error, we reset the bit reservoir */
@@ -925,6 +967,7 @@ static av_cold void flush(AVCodecContext *avctx)
     s->last_superframe_len = 0;
 }
 
+#if CONFIG_WMAV1_DECODER
 AVCodec ff_wmav1_decoder = {
     .name           = "wmav1",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 1"),
@@ -939,7 +982,8 @@ AVCodec ff_wmav1_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_WMAV2_DECODER
 AVCodec ff_wmav2_decoder = {
     .name           = "wmav2",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"),
@@ -954,3 +998,4 @@ AVCodec ff_wmav2_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index 800c000..091bc2a 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -2,31 +2,30 @@
  * WMA compatible encoder
  * Copyright (c) 2007 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
 
 #include "avcodec.h"
 #include "internal.h"
 #include "wma.h"
-
-#undef NDEBUG
-#include <assert.h>
+#include "libavutil/avassert.h"
 
 
 static av_cold int encode_init(AVCodecContext *avctx)
@@ -34,25 +33,26 @@ static av_cold int encode_init(AVCodecContext *avctx)
     WMACodecContext *s = avctx->priv_data;
     int i, flags1, flags2, block_align;
     uint8_t *extradata;
+    int ret;
 
     s->avctx = avctx;
 
     if (avctx->channels > MAX_CHANNELS) {
         av_log(avctx, AV_LOG_ERROR,
-               "too many channels: got %i, need %i or fewer",
+               "too many channels: got %i, need %i or fewer\n",
                avctx->channels, MAX_CHANNELS);
         return AVERROR(EINVAL);
     }
 
     if (avctx->sample_rate > 48000) {
-        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz",
+        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz\n",
                avctx->sample_rate);
         return AVERROR(EINVAL);
     }
 
     if (avctx->bit_rate < 24 * 1000) {
         av_log(avctx, AV_LOG_ERROR,
-               "bitrate too low: got %i, need 24000 or higher\n",
+               "bitrate too low: got %"PRId64", need 24000 or higher\n",
                avctx->bit_rate);
         return AVERROR(EINVAL);
     }
@@ -75,7 +75,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
         AV_WL32(extradata, flags1);
         AV_WL16(extradata + 4, flags2);
     } else {
-        assert(0);
+        av_assert0(0);
     }
     avctx->extradata          = extradata;
     s->use_exp_vlc            = flags2 & 0x0001;
@@ -84,7 +84,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
     if (avctx->channels == 2)
         s->ms_stereo = 1;
 
-    ff_wma_init(avctx, flags2);
+    if ((ret = ff_wma_init(avctx, flags2)) < 0)
+        return ret;
 
     /* init MDCT */
     for (i = 0; i < s->nb_block_sizes; i++)
@@ -94,14 +95,12 @@ static av_cold int encode_init(AVCodecContext *avctx)
                          (avctx->sample_rate * 8);
     block_align        = FFMIN(block_align, MAX_CODED_SUPERFRAME_SIZE);
     avctx->block_align = block_align;
-    avctx->bit_rate    = avctx->block_align * 8LL * avctx->sample_rate /
-                         s->frame_len;
     avctx->frame_size = avctx->initial_padding = s->frame_len;
 
     return 0;
 }
 
-static void apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
+static int apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
 {
     WMACodecContext *s = avctx->priv_data;
     float **audio      = (float **) frame->extended_data;
@@ -115,12 +114,18 @@ static void apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
 
     for (ch = 0; ch < avctx->channels; ch++) {
         memcpy(s->output, s->frame_out[ch], window_len * sizeof(*s->output));
-        s->fdsp.vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len);
-        s->fdsp.vector_fmul_reverse(&s->output[window_len], s->frame_out[ch],
+        s->fdsp->vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len);
+        s->fdsp->vector_fmul_reverse(&s->output[window_len], s->frame_out[ch],
                                     win, len);
-        s->fdsp.vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len);
+        s->fdsp->vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len);
         mdct->mdct_calc(mdct, s->coefs[ch], s->output);
+        if (!isfinite(s->coefs[ch][0])) {
+            av_log(avctx, AV_LOG_ERROR, "Input contains NaN/+-Inf\n");
+            return AVERROR(EINVAL);
+        }
     }
+
+    return 0;
 }
 
 // FIXME use for decoding too
@@ -136,7 +141,7 @@ static void init_exp(WMACodecContext *s, int ch, const int *exp_param)
     max_scale = 0;
     while (q < q_end) {
         /* XXX: use a table */
-        v         = pow(10, *exp_param++ *(1.0 / 16.0));
+        v         = ff_exp10(*exp_param++ *(1.0 / 16.0));
         max_scale = FFMAX(max_scale, v);
         n         = *ptr++;
         do {
@@ -157,7 +162,7 @@ static void encode_exp_vlc(WMACodecContext *s, int ch, const int *exp_param)
     q_end = q + s->block_len;
     if (s->version == 1) {
         last_exp = *exp_param++;
-        assert(last_exp - 10 >= 0 && last_exp - 10 < 32);
+        av_assert0(last_exp - 10 >= 0 && last_exp - 10 < 32);
         put_bits(&s->pb, 5, last_exp - 10);
         q += *ptr++;
     } else
@@ -165,7 +170,7 @@ static void encode_exp_vlc(WMACodecContext *s, int ch, const int *exp_param)
     while (q < q_end) {
         int exp  = *exp_param++;
         int code = exp - last_exp + 60;
-        assert(code >= 0 && code < 120);
+        av_assert1(code >= 0 && code < 120);
         put_bits(&s->pb, ff_aac_scalefactor_bits[code],
                  ff_aac_scalefactor_code[code]);
         /* XXX: use a table */
@@ -190,7 +195,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
 
     // FIXME remove duplication relative to decoder
     if (s->use_variable_block_len) {
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
     } else {
         /* fixed block len */
         s->next_block_len_bits = s->frame_len_bits;
@@ -199,7 +204,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     }
 
     s->block_len = 1 << s->block_len_bits;
-//     assert((s->block_pos + s->block_len) <= s->frame_len);
+//     av_assert0((s->block_pos + s->block_len) <= s->frame_len);
     bsize = s->frame_len_bits - s->block_len_bits;
 
     // FIXME factor
@@ -231,11 +236,11 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
 
             coefs1    = s->coefs1[ch];
             exponents = s->exponents[ch];
-            mult      = pow(10, total_gain * 0.05) / s->max_exponent[ch];
+            mult      = ff_exp10(total_gain * 0.05) / s->max_exponent[ch];
             mult     *= mdct_norm;
             coefs     = src_coefs[ch];
             if (s->use_noise_coding && 0) {
-                assert(0); // FIXME not implemented
+                av_assert0(0); // FIXME not implemented
             } else {
                 coefs += s->coefs_start;
                 n      = nb_coefs[ch];
@@ -290,13 +295,13 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                 if (s->use_exp_vlc) {
                     encode_exp_vlc(s, ch, fixed_exp);
                 } else {
-                    assert(0); // FIXME not implemented
+                    av_assert0(0); // FIXME not implemented
 //                    encode_exp_lsp(s, ch);
                 }
             }
         }
     } else
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
 
     for (ch = 0; ch < s->avctx->channels; ch++) {
         if (s->channel_coded[ch]) {
@@ -316,7 +321,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                         if (run < s->coef_vlcs[tindex]->levels[abs_level - 1])
                             code = run + s->int_table[tindex][abs_level - 1];
 
-                    assert(code < s->coef_vlcs[tindex]->n);
+                    av_assert2(code < s->coef_vlcs[tindex]->n);
                     put_bits(&s->pb, s->coef_vlcs[tindex]->huffbits[code],
                              s->coef_vlcs[tindex]->huffcodes[code]);
 
@@ -349,7 +354,7 @@ static int encode_frame(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     init_put_bits(&s->pb, buf, buf_size);
 
     if (s->use_bit_reservoir)
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
     else if (encode_block(s, src_coefs, total_gain) < 0)
         return INT_MAX;
 
@@ -362,12 +367,15 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
                              const AVFrame *frame, int *got_packet_ptr)
 {
     WMACodecContext *s = avctx->priv_data;
-    int i, total_gain, ret;
+    int i, total_gain, ret, error;
 
     s->block_len_bits = s->frame_len_bits; // required by non variable block len
     s->block_len      = 1 << s->block_len_bits;
 
-    apply_window_and_mdct(avctx, frame);
+    ret = apply_window_and_mdct(avctx, frame);
+
+    if (ret < 0)
+        return ret;
 
     if (s->ms_stereo) {
         float a, b;
@@ -381,29 +389,32 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
         }
     }
 
-    if ((ret = ff_alloc_packet(avpkt, 2 * MAX_CODED_SUPERFRAME_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 2 * MAX_CODED_SUPERFRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     total_gain = 128;
     for (i = 64; i; i >>= 1) {
-        int error = encode_frame(s, s->coefs, avpkt->data, avpkt->size,
+        error = encode_frame(s, s->coefs, avpkt->data, avpkt->size,
                                  total_gain - i);
-        if (error < 0)
+        if (error <= 0)
             total_gain -= i;
     }
 
-    if ((i = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain)) >= 0) {
-        av_log(avctx, AV_LOG_ERROR, "required frame size too large. please "
-                                    "use a higher bit rate.\n");
+    while(total_gain <= 128 && error > 0)
+        error = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain++);
+    if (error > 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid input data or requested bitrate too low, cannot encode\n");
+        avpkt->size = 0;
         return AVERROR(EINVAL);
     }
-    assert((put_bits_count(&s->pb) & 7) == 0);
-    while (i++)
+    av_assert0((put_bits_count(&s->pb) & 7) == 0);
+    i= avctx->block_align - (put_bits_count(&s->pb)+7)/8;
+    av_assert0(i>=0);
+    while(i--)
         put_bits(&s->pb, 8, 'N');
 
     flush_put_bits(&s->pb);
+    av_assert0(put_bits_ptr(&s->pb) - s->pb.buf == avctx->block_align);
 
     if (frame->pts != AV_NOPTS_VALUE)
         avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);
@@ -413,6 +424,7 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
     return 0;
 }
 
+#if CONFIG_WMAV1_ENCODER
 AVCodec ff_wmav1_encoder = {
     .name           = "wmav1",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 1"),
@@ -425,7 +437,8 @@ AVCodec ff_wmav1_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_WMAV2_ENCODER
 AVCodec ff_wmav2_encoder = {
     .name           = "wmav2",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"),
@@ -438,3 +451,4 @@ AVCodec ff_wmav2_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c
index b829987..eb1db61 100644
--- a/libavcodec/wmalosslessdec.c
+++ b/libavcodec/wmalosslessdec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2011 Andreas Öman
  * Copyright (c) 2011 - 2012 Mashiat Sarker Shakkhar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,10 @@
 #include "libavutil/avassert.h"
 
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "put_bits.h"
+#include "lossless_audiodsp.h"
 #include "wma.h"
 #include "wma_common.h"
 
@@ -46,6 +47,7 @@
 #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS)    ///< maximum block size
 #define WMALL_BLOCK_SIZES    (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes
 
+#define WMALL_COEFF_PAD_SIZE   16                       ///< pad coef buffers with 0 for use with SIMD
 
 /**
  * @brief frame-specific decoder context for a single channel
@@ -69,7 +71,9 @@ typedef struct WmallDecodeCtx {
     /* generic decoder variables */
     AVCodecContext  *avctx;
     AVFrame         *frame;
-    uint8_t         frame_data[MAX_FRAMESIZE + AV_INPUT_BUFFER_PADDING_SIZE];  ///< compressed frame data
+    LLAudDSPContext dsp;                           ///< accelerated DSP functions
+    uint8_t         *frame_data;                    ///< compressed frame data
+    int             max_frame_size;                 ///< max bitstream size
     PutBitContext   pb;                             ///< context for filling the frame_data buffer
 
     /* frame size dependent frame information (set during initialization) */
@@ -87,7 +91,7 @@ typedef struct WmallDecodeCtx {
     uint16_t        min_samples_per_subframe;
 
     /* packet decode state */
-    BitstreamContext pbc;                           ///< bitstream reader context for the packet
+    GetBitContext   pgb;                            ///< bitstream reader context for the packet
     int             next_packet_start;              ///< start offset of the next WMA packet in the demuxer packet
     uint8_t         packet_offset;                  ///< offset to the frame in the packet
     uint8_t         packet_sequence_number;         ///< current packet number
@@ -99,7 +103,7 @@ typedef struct WmallDecodeCtx {
 
     /* frame decode state */
     uint32_t        frame_num;                      ///< current frame number (not used for decoding)
-    BitstreamContext bc;                            ///< bitstream reader context
+    GetBitContext   gb;                             ///< bitstream reader context
     int             buf_bit_size;                   ///< buffer size in bits
     int16_t         *samples_16[WMALL_MAX_CHANNELS]; ///< current sample buffer pointer (16-bit)
     int32_t         *samples_32[WMALL_MAX_CHANNELS]; ///< current sample buffer pointer (24-bit)
@@ -124,15 +128,15 @@ typedef struct WmallDecodeCtx {
 
     int8_t  acfilter_order;
     int8_t  acfilter_scaling;
-    int64_t acfilter_coeffs[16];
-    int     acfilter_prevvalues[2][16];
+    int16_t acfilter_coeffs[16];
+    int     acfilter_prevvalues[WMALL_MAX_CHANNELS][16];
 
     int8_t  mclms_order;
     int8_t  mclms_scaling;
     int16_t mclms_coeffs[WMALL_MAX_CHANNELS * WMALL_MAX_CHANNELS * 32];
     int16_t mclms_coeffs_cur[WMALL_MAX_CHANNELS * WMALL_MAX_CHANNELS];
-    int16_t mclms_prevvalues[WMALL_MAX_CHANNELS * 2 * 32];
-    int16_t mclms_updates[WMALL_MAX_CHANNELS * 2 * 32];
+    int32_t mclms_prevvalues[WMALL_MAX_CHANNELS * 2 * 32];
+    int32_t mclms_updates[WMALL_MAX_CHANNELS * 2 * 32];
     int     mclms_recent;
 
     int     movave_scaling;
@@ -143,35 +147,35 @@ typedef struct WmallDecodeCtx {
         int scaling;
         int coefsend;
         int bitsend;
-        int16_t coefs[MAX_ORDER];
-        int16_t lms_prevvalues[MAX_ORDER * 2];
-        int16_t lms_updates[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int32_t, lms_prevvalues)[MAX_ORDER * 2 + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2 + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
         int recent;
-    } cdlms[2][9];
+    } cdlms[WMALL_MAX_CHANNELS][9];
 
-    int cdlms_ttl[2];
+    int cdlms_ttl[WMALL_MAX_CHANNELS];
 
     int bV3RTM;
 
-    int is_channel_coded[2];
-    int update_speed[2];
+    int is_channel_coded[WMALL_MAX_CHANNELS];
+    int update_speed[WMALL_MAX_CHANNELS];
 
-    int transient[2];
-    int transient_pos[2];
+    int transient[WMALL_MAX_CHANNELS];
+    int transient_pos[WMALL_MAX_CHANNELS];
     int seekable_tile;
 
-    int ave_sum[2];
+    int ave_sum[WMALL_MAX_CHANNELS];
 
-    int channel_residues[2][WMALL_BLOCK_MAX_SIZE];
+    int channel_residues[WMALL_MAX_CHANNELS][WMALL_BLOCK_MAX_SIZE];
 
-    int lpc_coefs[2][40];
+    int lpc_coefs[WMALL_MAX_CHANNELS][40];
     int lpc_order;
     int lpc_scaling;
     int lpc_intbits;
-
-    int channel_coeffs[2][WMALL_BLOCK_MAX_SIZE];
 } WmallDecodeCtx;
 
+/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */
+#define WMASIGN(x) (((x) > 0) - ((x) < 0))
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
@@ -180,8 +184,19 @@ static av_cold int decode_init(AVCodecContext *avctx)
     unsigned int channel_mask;
     int i, log2_max_num_subframes;
 
+    if (!avctx->block_align) {
+        av_log(avctx, AV_LOG_ERROR, "block_align is not set\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->max_frame_size = MAX_FRAMESIZE * avctx->channels;
+    s->frame_data = av_mallocz(s->max_frame_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!s->frame_data)
+        return AVERROR(ENOMEM);
+
     s->avctx = avctx;
-    init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+    ff_llauddsp_init(&s->dsp);
+    init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
 
     if (avctx->extradata_size >= 18) {
         s->decode_flags    = AV_RL16(edata_ptr + 14);
@@ -191,8 +206,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
             avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         else if (s->bits_per_sample == 24) {
             avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-            avpriv_report_missing_feature(avctx, "Bit-depth higher than 16");
-            return AVERROR_PATCHWELCOME;
+            avctx->bits_per_raw_sample = 24;
         } else {
             av_log(avctx, AV_LOG_ERROR, "Unknown bit-depth: %"PRIu8"\n",
                    s->bits_per_sample);
@@ -286,7 +300,7 @@ static int decode_subframe_length(WmallDecodeCtx *s, int offset)
         return s->min_samples_per_subframe;
 
     len             = av_log2(s->max_num_subframes - 1) + 1;
-    frame_len_ratio = bitstream_read(&s->bc, len);
+    frame_len_ratio = get_bits(&s->gb, len);
     subframe_len    = s->min_samples_per_subframe * (frame_len_ratio + 1);
 
     /* sanity check the length */
@@ -332,7 +346,7 @@ static int decode_tilehdr(WmallDecodeCtx *s)
     for (c = 0; c < s->num_channels; c++)
         s->channel[c].num_subframes = 0;
 
-    tile_aligned = bitstream_read_bit(&s->bc);
+    tile_aligned = get_bits1(&s->gb);
     if (s->max_num_subframes == 1 || tile_aligned)
         fixed_channel_layout = 1;
 
@@ -345,11 +359,11 @@ static int decode_tilehdr(WmallDecodeCtx *s)
             if (num_samples[c] == min_channel_len) {
                 if (fixed_channel_layout || channels_for_cur_subframe == 1 ||
                    (min_channel_len == s->samples_per_frame - s->min_samples_per_subframe)) {
-                    contains_subframe[c] = in_use = 1;
+                    contains_subframe[c] = 1;
                 } else {
-                    if (bitstream_read_bit(&s->bc))
-                        contains_subframe[c] = in_use = 1;
+                    contains_subframe[c] = get_bits1(&s->gb);
                 }
+                in_use |= contains_subframe[c];
             } else
                 contains_subframe[c] = 0;
         }
@@ -407,32 +421,32 @@ static int decode_tilehdr(WmallDecodeCtx *s)
 static void decode_ac_filter(WmallDecodeCtx *s)
 {
     int i;
-    s->acfilter_order   = bitstream_read(&s->bc, 4) + 1;
-    s->acfilter_scaling = bitstream_read(&s->bc, 4);
+    s->acfilter_order   = get_bits(&s->gb, 4) + 1;
+    s->acfilter_scaling = get_bits(&s->gb, 4);
 
     for (i = 0; i < s->acfilter_order; i++)
-        s->acfilter_coeffs[i] = bitstream_read(&s->bc, s->acfilter_scaling) + 1;
+        s->acfilter_coeffs[i] = get_bitsz(&s->gb, s->acfilter_scaling) + 1;
 }
 
 static void decode_mclms(WmallDecodeCtx *s)
 {
-    s->mclms_order   = (bitstream_read(&s->bc, 4) + 1) * 2;
-    s->mclms_scaling =  bitstream_read(&s->bc, 4);
-    if (bitstream_read_bit(&s->bc)) {
+    s->mclms_order   = (get_bits(&s->gb, 4) + 1) * 2;
+    s->mclms_scaling = get_bits(&s->gb, 4);
+    if (get_bits1(&s->gb)) {
         int i, send_coef_bits;
         int cbits = av_log2(s->mclms_scaling + 1);
         if (1 << cbits < s->mclms_scaling + 1)
             cbits++;
 
-        send_coef_bits = bitstream_read(&s->bc, cbits) + 2;
+        send_coef_bits = get_bitsz(&s->gb, cbits) + 2;
 
         for (i = 0; i < s->mclms_order * s->num_channels * s->num_channels; i++)
-            s->mclms_coeffs[i] = bitstream_read(&s->bc, send_coef_bits);
+            s->mclms_coeffs[i] = get_bits(&s->gb, send_coef_bits);
 
         for (i = 0; i < s->num_channels; i++) {
             int c;
             for (c = 0; c < i; c++)
-                s->mclms_coeffs_cur[i * s->num_channels + c] = bitstream_read(&s->bc, send_coef_bits);
+                s->mclms_coeffs_cur[i * s->num_channels + c] = get_bits(&s->gb, send_coef_bits);
         }
     }
 }
@@ -440,12 +454,12 @@ static void decode_mclms(WmallDecodeCtx *s)
 static int decode_cdlms(WmallDecodeCtx *s)
 {
     int c, i;
-    int cdlms_send_coef = bitstream_read_bit(&s->bc);
+    int cdlms_send_coef = get_bits1(&s->gb);
 
     for (c = 0; c < s->num_channels; c++) {
-        s->cdlms_ttl[c] = bitstream_read(&s->bc, 3) + 1;
+        s->cdlms_ttl[c] = get_bits(&s->gb, 3) + 1;
         for (i = 0; i < s->cdlms_ttl[c]; i++) {
-            s->cdlms[c][i].order = (bitstream_read(&s->bc, 7) + 1) * 8;
+            s->cdlms[c][i].order = (get_bits(&s->gb, 7) + 1) * 8;
             if (s->cdlms[c][i].order > MAX_ORDER) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Order[%d][%d] %d > max (%d), not supported\n",
@@ -453,10 +467,17 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 s->cdlms[0][0].order = 0;
                 return AVERROR_INVALIDDATA;
             }
+            if(s->cdlms[c][i].order & 8 && s->bits_per_sample == 16) {
+                static int warned;
+                if(!warned)
+                    avpriv_request_sample(s->avctx, "CDLMS of order %d",
+                                          s->cdlms[c][i].order);
+                warned = 1;
+            }
         }
 
         for (i = 0; i < s->cdlms_ttl[c]; i++)
-            s->cdlms[c][i].scaling = bitstream_read(&s->bc, 4);
+            s->cdlms[c][i].scaling = get_bits(&s->gb, 4);
 
         if (cdlms_send_coef) {
             for (i = 0; i < s->cdlms_ttl[c]; i++) {
@@ -464,20 +485,24 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 cbits = av_log2(s->cdlms[c][i].order);
                 if ((1 << cbits) < s->cdlms[c][i].order)
                     cbits++;
-                s->cdlms[c][i].coefsend = bitstream_read(&s->bc, cbits) + 1;
+                s->cdlms[c][i].coefsend = get_bits(&s->gb, cbits) + 1;
 
                 cbits = av_log2(s->cdlms[c][i].scaling + 1);
                 if ((1 << cbits) < s->cdlms[c][i].scaling + 1)
                     cbits++;
 
-                s->cdlms[c][i].bitsend = bitstream_read(&s->bc, cbits) + 2;
+                s->cdlms[c][i].bitsend = get_bitsz(&s->gb, cbits) + 2;
                 shift_l = 32 - s->cdlms[c][i].bitsend;
                 shift_r = 32 - s->cdlms[c][i].scaling - 2;
                 for (j = 0; j < s->cdlms[c][i].coefsend; j++)
                     s->cdlms[c][i].coefs[j] =
-                        (bitstream_read(&s->bc, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
+                        (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
             }
         }
+
+        for (i = 0; i < s->cdlms_ttl[c]; i++)
+            memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order,
+                   0, WMALL_COEFF_PAD_SIZE);
     }
 
     return 0;
@@ -487,9 +512,9 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
 {
     int i = 0;
     unsigned int ave_mean;
-    s->transient[ch] = bitstream_read_bit(&s->bc);
+    s->transient[ch] = get_bits1(&s->gb);
     if (s->transient[ch]) {
-        s->transient_pos[ch] = bitstream_read(&s->bc, av_log2(tile_size));
+        s->transient_pos[ch] = get_bits(&s->gb, av_log2(tile_size));
         if (s->transient_pos[ch])
             s->transient[ch] = 0;
         s->channel[ch].transient_counter =
@@ -498,43 +523,40 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
         s->transient[ch] = 1;
 
     if (s->seekable_tile) {
-        ave_mean = bitstream_read(&s->bc, s->bits_per_sample);
+        ave_mean = get_bits(&s->gb, s->bits_per_sample);
         s->ave_sum[ch] = ave_mean << (s->movave_scaling + 1);
     }
 
     if (s->seekable_tile) {
         if (s->do_inter_ch_decorr)
-            s->channel_residues[ch][0] = bitstream_read_signed(&s->bc, s->bits_per_sample + 1);
+            s->channel_residues[ch][0] = get_sbits_long(&s->gb, s->bits_per_sample + 1);
         else
-            s->channel_residues[ch][0] = bitstream_read_signed(&s->bc, s->bits_per_sample);
+            s->channel_residues[ch][0] = get_sbits_long(&s->gb, s->bits_per_sample);
         i++;
     }
     for (; i < tile_size; i++) {
         int quo = 0, rem, rem_bits, residue;
-        while (bitstream_read_bit(&s->bc)) {
+        while(get_bits1(&s->gb)) {
             quo++;
-            if (bitstream_bits_left(&s->bc) <= 0)
+            if (get_bits_left(&s->gb) <= 0)
                 return -1;
         }
         if (quo >= 32)
-            quo += bitstream_read(&s->bc, bitstream_read(&s->bc, 5) + 1);
+            quo += get_bits_long(&s->gb, get_bits(&s->gb, 5) + 1);
 
         ave_mean = (s->ave_sum[ch] + (1 << s->movave_scaling)) >> (s->movave_scaling + 1);
         if (ave_mean <= 1)
             residue = quo;
         else {
             rem_bits = av_ceil_log2(ave_mean);
-            rem      = rem_bits ? bitstream_read(&s->bc, rem_bits) : 0;
+            rem      = get_bits_long(&s->gb, rem_bits);
             residue  = (quo << rem_bits) + rem;
         }
 
         s->ave_sum[ch] = residue + s->ave_sum[ch] -
                          (s->ave_sum[ch] >> s->movave_scaling);
 
-        if (residue & 1)
-            residue = -(residue >> 1) - 1;
-        else
-            residue = residue >> 1;
+        residue = (residue >> 1) ^ -(residue & 1);
         s->channel_residues[ch][i] = residue;
     }
 
@@ -545,13 +567,13 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
 static void decode_lpc(WmallDecodeCtx *s)
 {
     int ch, i, cbits;
-    s->lpc_order   = bitstream_read(&s->bc, 5) + 1;
-    s->lpc_scaling = bitstream_read(&s->bc, 4);
-    s->lpc_intbits = bitstream_read(&s->bc, 3) + 1;
+    s->lpc_order   = get_bits(&s->gb, 5) + 1;
+    s->lpc_scaling = get_bits(&s->gb, 4);
+    s->lpc_intbits = get_bits(&s->gb, 3) + 1;
     cbits = s->lpc_scaling + s->lpc_intbits;
     for (ch = 0; ch < s->num_channels; ch++)
         for (i = 0; i < s->lpc_order; i++)
-            s->lpc_coefs[ch][i] = bitstream_read_signed(&s->bc, cbits);
+            s->lpc_coefs[ch][i] = get_sbits(&s->gb, cbits);
 }
 
 static void clear_codec_buffers(WmallDecodeCtx *s)
@@ -611,47 +633,31 @@ static void mclms_update(WmallDecodeCtx *s, int icoef, int *pred)
             for (i = 0; i < order * num_channels; i++)
                 s->mclms_coeffs[i + ich * order * num_channels] +=
                     s->mclms_updates[s->mclms_recent + i];
-            for (j = 0; j < ich; j++) {
-                if (s->channel_residues[j][icoef] > 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] += 1;
-                else if (s->channel_residues[j][icoef] < 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] -= 1;
-            }
+            for (j = 0; j < ich; j++)
+                s->mclms_coeffs_cur[ich * num_channels + j] += WMASIGN(s->channel_residues[j][icoef]);
         } else if (pred_error < 0) {
             for (i = 0; i < order * num_channels; i++)
                 s->mclms_coeffs[i + ich * order * num_channels] -=
                     s->mclms_updates[s->mclms_recent + i];
-            for (j = 0; j < ich; j++) {
-                if (s->channel_residues[j][icoef] > 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] -= 1;
-                else if (s->channel_residues[j][icoef] < 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] += 1;
-            }
+            for (j = 0; j < ich; j++)
+                s->mclms_coeffs_cur[ich * num_channels + j] -= WMASIGN(s->channel_residues[j][icoef]);
         }
     }
 
     for (ich = num_channels - 1; ich >= 0; ich--) {
         s->mclms_recent--;
-        s->mclms_prevvalues[s->mclms_recent] = s->channel_residues[ich][icoef];
-        if (s->channel_residues[ich][icoef] > range - 1)
-            s->mclms_prevvalues[s->mclms_recent] = range - 1;
-        else if (s->channel_residues[ich][icoef] < -range)
-            s->mclms_prevvalues[s->mclms_recent] = -range;
-
-        s->mclms_updates[s->mclms_recent] = 0;
-        if (s->channel_residues[ich][icoef] > 0)
-            s->mclms_updates[s->mclms_recent] = 1;
-        else if (s->channel_residues[ich][icoef] < 0)
-            s->mclms_updates[s->mclms_recent] = -1;
+        s->mclms_prevvalues[s->mclms_recent] = av_clip(s->channel_residues[ich][icoef],
+            -range, range - 1);
+        s->mclms_updates[s->mclms_recent] = WMASIGN(s->channel_residues[ich][icoef]);
     }
 
     if (s->mclms_recent == 0) {
         memcpy(&s->mclms_prevvalues[order * num_channels],
                s->mclms_prevvalues,
-               2 * order * num_channels);
+               sizeof(int32_t) * order * num_channels);
         memcpy(&s->mclms_updates[order * num_channels],
                s->mclms_updates,
-               2 * order * num_channels);
+               sizeof(int32_t) * order * num_channels);
         s->mclms_recent = num_channels * order;
     }
 }
@@ -667,10 +673,10 @@ static void mclms_predict(WmallDecodeCtx *s, int icoef, int *pred)
         if (!s->is_channel_coded[ich])
             continue;
         for (i = 0; i < order * num_channels; i++)
-            pred[ich] += s->mclms_prevvalues[i + s->mclms_recent] *
+            pred[ich] += (uint32_t)s->mclms_prevvalues[i + s->mclms_recent] *
                          s->mclms_coeffs[i + order * num_channels * ich];
         for (i = 0; i < ich; i++)
-            pred[ich] += s->channel_residues[i][icoef] *
+            pred[ich] += (uint32_t)s->channel_residues[i][icoef] *
                          s->mclms_coeffs_cur[i + num_channels * ich];
         pred[ich] += 1 << s->mclms_scaling - 1;
         pred[ich] >>= s->mclms_scaling;
@@ -687,60 +693,6 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
     }
 }
 
-static int lms_predict(WmallDecodeCtx *s, int ich, int ilms)
-{
-    int pred = 0, icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-
-    for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-        pred += s->cdlms[ich][ilms].coefs[icoef] *
-                s->cdlms[ich][ilms].lms_prevvalues[icoef + recent];
-
-    return pred;
-}
-
-static void lms_update(WmallDecodeCtx *s, int ich, int ilms,
-                       int input, int residue)
-{
-    int icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-    int range  = 1 << s->bits_per_sample - 1;
-
-    if (residue < 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] -=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    } else if (residue > 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] +=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    }
-
-    if (recent)
-        recent--;
-    else {
-        memcpy(&s->cdlms[ich][ilms].lms_prevvalues[s->cdlms[ich][ilms].order],
-               s->cdlms[ich][ilms].lms_prevvalues,
-               2 * s->cdlms[ich][ilms].order);
-        memcpy(&s->cdlms[ich][ilms].lms_updates[s->cdlms[ich][ilms].order],
-               s->cdlms[ich][ilms].lms_updates,
-               2 * s->cdlms[ich][ilms].order);
-        recent = s->cdlms[ich][ilms].order - 1;
-    }
-
-    s->cdlms[ich][ilms].lms_prevvalues[recent] = av_clip(input, -range, range - 1);
-    if (!input)
-        s->cdlms[ich][ilms].lms_updates[recent] = 0;
-    else if (input < 0)
-        s->cdlms[ich][ilms].lms_updates[recent] = -s->update_speed[ich];
-    else
-        s->cdlms[ich][ilms].lms_updates[recent] = s->update_speed[ich];
-
-    s->cdlms[ich][ilms].lms_updates[recent + (s->cdlms[ich][ilms].order >> 4)] >>= 2;
-    s->cdlms[ich][ilms].lms_updates[recent + (s->cdlms[ich][ilms].order >> 3)] >>= 1;
-    s->cdlms[ich][ilms].recent = recent;
-}
-
 static void use_high_update_speed(WmallDecodeCtx *s, int ich)
 {
     int ilms, recent, icoef;
@@ -776,24 +728,63 @@ static void use_normal_update_speed(WmallDecodeCtx *s, int ich)
     s->update_speed[ich] = 8;
 }
 
-static void revert_cdlms(WmallDecodeCtx *s, int ch,
-                         int coef_begin, int coef_end)
-{
-    int icoef, pred, ilms, num_lms, residue, input;
-
-    num_lms = s->cdlms_ttl[ch];
-    for (ilms = num_lms - 1; ilms >= 0; ilms--) {
-        for (icoef = coef_begin; icoef < coef_end; icoef++) {
-            pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
-            residue = s->channel_residues[ch][icoef];
-            pred += lms_predict(s, ch, ilms);
-            input = residue + (pred >> s->cdlms[ch][ilms].scaling);
-            lms_update(s, ch, ilms, input, residue);
-            s->channel_residues[ch][icoef] = input;
-        }
-    }
+#define CD_LMS(bits, ROUND) \
+static void lms_update ## bits (WmallDecodeCtx *s, int ich, int ilms, int input) \
+{ \
+    int recent = s->cdlms[ich][ilms].recent; \
+    int range  = 1 << s->bits_per_sample - 1; \
+    int order  = s->cdlms[ich][ilms].order; \
+    int ##bits##_t *prev = (int##bits##_t *)s->cdlms[ich][ilms].lms_prevvalues; \
+ \
+    if (recent) \
+        recent--; \
+    else { \
+        memcpy(prev + order, prev, (bits/8) * order); \
+        memcpy(s->cdlms[ich][ilms].lms_updates + order, \
+               s->cdlms[ich][ilms].lms_updates, \
+               sizeof(*s->cdlms[ich][ilms].lms_updates) * order); \
+        recent = order - 1; \
+    } \
+ \
+    prev[recent] = av_clip(input, -range, range - 1); \
+    s->cdlms[ich][ilms].lms_updates[recent] = WMASIGN(input) * s->update_speed[ich]; \
+ \
+    s->cdlms[ich][ilms].lms_updates[recent + (order >> 4)] >>= 2; \
+    s->cdlms[ich][ilms].lms_updates[recent + (order >> 3)] >>= 1; \
+    s->cdlms[ich][ilms].recent = recent; \
+    memset(s->cdlms[ich][ilms].lms_updates + recent + order, 0, \
+           sizeof(s->cdlms[ich][ilms].lms_updates) - \
+           sizeof(*s->cdlms[ich][ilms].lms_updates)*(recent+order)); \
+} \
+ \
+static void revert_cdlms ## bits (WmallDecodeCtx *s, int ch, \
+                                  int coef_begin, int coef_end) \
+{ \
+    int icoef, pred, ilms, num_lms, residue, input; \
+ \
+    num_lms = s->cdlms_ttl[ch]; \
+    for (ilms = num_lms - 1; ilms >= 0; ilms--) { \
+        for (icoef = coef_begin; icoef < coef_end; icoef++) { \
+            int##bits##_t *prevvalues = (int##bits##_t *)s->cdlms[ch][ilms].lms_prevvalues; \
+            pred = 1 << (s->cdlms[ch][ilms].scaling - 1); \
+            residue = s->channel_residues[ch][icoef]; \
+            pred += s->dsp.scalarproduct_and_madd_int## bits (s->cdlms[ch][ilms].coefs, \
+                                                        prevvalues + s->cdlms[ch][ilms].recent, \
+                                                        s->cdlms[ch][ilms].lms_updates + \
+                                                        s->cdlms[ch][ilms].recent, \
+                                                        FFALIGN(s->cdlms[ch][ilms].order, ROUND), \
+                                                        WMASIGN(residue)); \
+            input = residue + (pred >> s->cdlms[ch][ilms].scaling); \
+            lms_update ## bits(s, ch, ilms, input); \
+            s->channel_residues[ch][icoef] = input; \
+        } \
+    } \
+    if (bits <= 16) emms_c(); \
 }
 
+CD_LMS(16, WMALL_COEFF_PAD_SIZE)
+CD_LMS(32, 8)
+
 static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
 {
     if (s->num_channels != 2)
@@ -810,7 +801,7 @@ static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
 static void revert_acfilter(WmallDecodeCtx *s, int tile_size)
 {
     int ich, pred, i, j;
-    int64_t *filter_coeffs = s->acfilter_coeffs;
+    int16_t *filter_coeffs = s->acfilter_coeffs;
     int scaling            = s->acfilter_scaling;
     int order              = s->acfilter_order;
 
@@ -830,7 +821,7 @@ static void revert_acfilter(WmallDecodeCtx *s, int tile_size)
         for (i = order; i < tile_size; i++) {
             pred = 0;
             for (j = 0; j < order; j++)
-                pred += s->channel_residues[ich][i - j - 1] * filter_coeffs[j];
+                pred += (uint32_t)s->channel_residues[ich][i - j - 1] * filter_coeffs[j];
             pred >>= scaling;
             s->channel_residues[ich][i] += pred;
         }
@@ -846,7 +837,7 @@ static int decode_subframe(WmallDecodeCtx *s)
     int total_samples = s->samples_per_frame * s->num_channels;
     int i, j, rawpcm_tile, padding_zeroes, res;
 
-    s->subframe_offset = bitstream_tell(&s->bc);
+    s->subframe_offset = get_bits_count(&s->gb);
 
     /* reset channel context and find the next block offset and size
         == the next block of the channel with the smallest number of
@@ -883,18 +874,18 @@ static int decode_subframe(WmallDecodeCtx *s)
         s->parsed_all_subframes = 1;
 
 
-    s->seekable_tile = bitstream_read_bit(&s->bc);
+    s->seekable_tile = get_bits1(&s->gb);
     if (s->seekable_tile) {
         clear_codec_buffers(s);
 
-        s->do_arith_coding    = bitstream_read_bit(&s->bc);
+        s->do_arith_coding    = get_bits1(&s->gb);
         if (s->do_arith_coding) {
             avpriv_request_sample(s->avctx, "Arithmetic coding");
             return AVERROR_PATCHWELCOME;
         }
-        s->do_ac_filter       = bitstream_read_bit(&s->bc);
-        s->do_inter_ch_decorr = bitstream_read_bit(&s->bc);
-        s->do_mclms           = bitstream_read_bit(&s->bc);
+        s->do_ac_filter       = get_bits1(&s->gb);
+        s->do_inter_ch_decorr = get_bits1(&s->gb);
+        s->do_mclms           = get_bits1(&s->gb);
 
         if (s->do_ac_filter)
             decode_ac_filter(s);
@@ -904,29 +895,32 @@ static int decode_subframe(WmallDecodeCtx *s)
 
         if ((res = decode_cdlms(s)) < 0)
             return res;
-        s->movave_scaling = bitstream_read(&s->bc, 3);
-        s->quant_stepsize = bitstream_read(&s->bc, 8) + 1;
+        s->movave_scaling = get_bits(&s->gb, 3);
+        s->quant_stepsize = get_bits(&s->gb, 8) + 1;
 
         reset_codec(s);
-    } else if (!s->cdlms[0][0].order) {
+    }
+
+    rawpcm_tile = get_bits1(&s->gb);
+
+    if (!rawpcm_tile && !s->cdlms[0][0].order) {
         av_log(s->avctx, AV_LOG_DEBUG,
                "Waiting for seekable tile\n");
         av_frame_unref(s->frame);
         return -1;
     }
 
-    rawpcm_tile = bitstream_read_bit(&s->bc);
 
     for (i = 0; i < s->num_channels; i++)
         s->is_channel_coded[i] = 1;
 
     if (!rawpcm_tile) {
         for (i = 0; i < s->num_channels; i++)
-            s->is_channel_coded[i] = bitstream_read_bit(&s->bc);
+            s->is_channel_coded[i] = get_bits1(&s->gb);
 
         if (s->bV3RTM) {
             // LPC
-            s->do_lpc = bitstream_read_bit(&s->bc);
+            s->do_lpc = get_bits1(&s->gb);
             if (s->do_lpc) {
                 decode_lpc(s);
                 avpriv_request_sample(s->avctx, "Expect wrong output since "
@@ -937,8 +931,8 @@ static int decode_subframe(WmallDecodeCtx *s)
     }
 
 
-    if (bitstream_read_bit(&s->bc))
-        padding_zeroes = bitstream_read(&s->bc, 5);
+    if (get_bits1(&s->gb))
+        padding_zeroes = get_bits(&s->gb, 5);
     else
         padding_zeroes = 0;
 
@@ -951,35 +945,40 @@ static int decode_subframe(WmallDecodeCtx *s)
         }
         ff_dlog(s->avctx, "RAWPCM %d bits per sample. "
                 "total %d bits, remain=%d\n", bits,
-                bits * s->num_channels * subframe_len, bitstream_tell(&s->bc));
+                bits * s->num_channels * subframe_len, get_bits_count(&s->gb));
         for (i = 0; i < s->num_channels; i++)
             for (j = 0; j < subframe_len; j++)
-                s->channel_coeffs[i][j] = bitstream_read_signed(&s->bc, bits);
+                s->channel_residues[i][j] = get_sbits_long(&s->gb, bits);
     } else {
-        for (i = 0; i < s->num_channels; i++)
+        for (i = 0; i < s->num_channels; i++) {
             if (s->is_channel_coded[i]) {
                 decode_channel_residues(s, i, subframe_len);
                 if (s->seekable_tile)
                     use_high_update_speed(s, i);
                 else
                     use_normal_update_speed(s, i);
-                revert_cdlms(s, i, 0, subframe_len);
+                if (s->bits_per_sample > 16)
+                    revert_cdlms32(s, i, 0, subframe_len);
+                else
+                    revert_cdlms16(s, i, 0, subframe_len);
             } else {
                 memset(s->channel_residues[i], 0, sizeof(**s->channel_residues) * subframe_len);
             }
+        }
+
+        if (s->do_mclms)
+            revert_mclms(s, subframe_len);
+        if (s->do_inter_ch_decorr)
+            revert_inter_ch_decorr(s, subframe_len);
+        if (s->do_ac_filter)
+            revert_acfilter(s, subframe_len);
+
+        /* Dequantize */
+        if (s->quant_stepsize != 1)
+            for (i = 0; i < s->num_channels; i++)
+                for (j = 0; j < subframe_len; j++)
+                    s->channel_residues[i][j] *= s->quant_stepsize;
     }
-    if (s->do_mclms)
-        revert_mclms(s, subframe_len);
-    if (s->do_inter_ch_decorr)
-        revert_inter_ch_decorr(s, subframe_len);
-    if (s->do_ac_filter)
-        revert_acfilter(s, subframe_len);
-
-    /* Dequantize */
-    if (s->quant_stepsize != 1)
-        for (i = 0; i < s->num_channels; i++)
-            for (j = 0; j < subframe_len; j++)
-                s->channel_residues[i][j] *= s->quant_stepsize;
 
     /* Write to proper output buffer depending on bit-depth */
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
@@ -990,7 +989,7 @@ static int decode_subframe(WmallDecodeCtx *s)
             if (s->bits_per_sample == 16) {
                 *s->samples_16[c]++ = (int16_t) s->channel_residues[c][j] << padding_zeroes;
             } else {
-                *s->samples_32[c]++ = s->channel_residues[c][j] << padding_zeroes;
+                *s->samples_32[c]++ = s->channel_residues[c][j] << (padding_zeroes + 8);
             }
         }
     }
@@ -1015,15 +1014,14 @@ static int decode_subframe(WmallDecodeCtx *s)
  */
 static int decode_frame(WmallDecodeCtx *s)
 {
-    BitstreamContext *bc = &s->bc;
+    GetBitContext* gb = &s->gb;
     int more_frames = 0, len = 0, i, ret;
 
     s->frame->nb_samples = s->samples_per_frame;
     if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) {
         /* return an error if no frame could be decoded at all */
-        av_log(s->avctx, AV_LOG_ERROR,
-               "not enough space for the output samples\n");
         s->packet_loss = 1;
+        s->frame->nb_samples = 0;
         return ret;
     }
     for (i = 0; i < s->num_channels; i++) {
@@ -1033,33 +1031,37 @@ static int decode_frame(WmallDecodeCtx *s)
 
     /* get frame length */
     if (s->len_prefix)
-        len = bitstream_read(bc, s->log2_frame_size);
+        len = get_bits(gb, s->log2_frame_size);
 
     /* decode tile information */
-    if (decode_tilehdr(s)) {
+    if ((ret = decode_tilehdr(s))) {
         s->packet_loss = 1;
-        return 0;
+        av_frame_unref(s->frame);
+        return ret;
     }
 
     /* read drc info */
     if (s->dynamic_range_compression)
-        s->drc_gain = bitstream_read(bc, 8);
+        s->drc_gain = get_bits(gb, 8);
 
     /* no idea what these are for, might be the number of samples
        that need to be skipped at the beginning or end of a stream */
-    if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
         int av_unused skip;
 
         /* usually true for the first frame */
-        if (bitstream_read_bit(bc)) {
-            skip = bitstream_read(bc, av_log2(s->samples_per_frame * 2));
+        if (get_bits1(gb)) {
+            skip = get_bits(gb, av_log2(s->samples_per_frame * 2));
             ff_dlog(s->avctx, "start skip: %i\n", skip);
         }
 
         /* sometimes true for the last frame */
-        if (bitstream_read_bit(bc)) {
-            skip = bitstream_read(bc, av_log2(s->samples_per_frame * 2));
+        if (get_bits1(gb)) {
+            skip = get_bits(gb, av_log2(s->samples_per_frame * 2));
             ff_dlog(s->avctx, "end skip: %i\n", skip);
+            s->frame->nb_samples -= skip;
+            if (s->frame->nb_samples <= 0)
+                return AVERROR_INVALIDDATA;
         }
 
     }
@@ -1073,34 +1075,36 @@ static int decode_frame(WmallDecodeCtx *s)
 
     /* decode all subframes */
     while (!s->parsed_all_subframes) {
+        int decoded_samples = s->channel[0].decoded_samples;
         if (decode_subframe(s) < 0) {
             s->packet_loss = 1;
+            if (s->frame->nb_samples)
+                s->frame->nb_samples = decoded_samples;
             return 0;
         }
     }
 
     ff_dlog(s->avctx, "Frame done\n");
 
-    if (s->skip_frame)
-        s->skip_frame = 0;
+    s->skip_frame = 0;
 
     if (s->len_prefix) {
-        if (len != (bitstream_tell(bc) - s->frame_offset) + 2) {
+        if (len != (get_bits_count(gb) - s->frame_offset) + 2) {
             /* FIXME: not sure if this is always an error */
             av_log(s->avctx, AV_LOG_ERROR,
                    "frame[%"PRIu32"] would have to skip %i bits\n",
                    s->frame_num,
-                   len - (bitstream_tell(bc) - s->frame_offset) - 1);
+                   len - (get_bits_count(gb) - s->frame_offset) - 1);
             s->packet_loss = 1;
             return 0;
         }
 
         /* skip the rest of the frame data */
-        bitstream_skip(bc, len - (bitstream_tell(bc) - s->frame_offset) - 1);
+        skip_bits_long(gb, len - (get_bits_count(gb) - s->frame_offset) - 1);
     }
 
     /* decode trailer bit */
-    more_frames = bitstream_read_bit(bc);
+    more_frames = get_bits1(gb);
     ++s->frame_num;
     return more_frames;
 }
@@ -1108,22 +1112,22 @@ static int decode_frame(WmallDecodeCtx *s)
 /**
  * @brief Calculate remaining input buffer length.
  * @param s  codec context
- * @param bc bitstream reader context
+ * @param gb bitstream reader context
  * @return remaining size in bits
  */
-static int remaining_bits(WmallDecodeCtx *s, BitstreamContext *bc)
+static int remaining_bits(WmallDecodeCtx *s, GetBitContext *gb)
 {
-    return s->buf_bit_size - bitstream_tell(bc);
+    return s->buf_bit_size - get_bits_count(gb);
 }
 
 /**
  * @brief Fill the bit reservoir with a (partial) frame.
  * @param s      codec context
- * @param bc     bitstream reader context
+ * @param gb     bitstream reader context
  * @param len    length of the partial frame
  * @param append decides whether to reset the buffer or not
  */
-static void save_bits(WmallDecodeCtx *s, BitstreamContext *bc, int len,
+static void save_bits(WmallDecodeCtx *s, GetBitContext* gb, int len,
                       int append)
 {
     int buflen;
@@ -1134,71 +1138,75 @@ static void save_bits(WmallDecodeCtx *s, BitstreamContext *bc, int len,
         and skipped later so that a fast byte copy is possible */
 
     if (!append) {
-        s->frame_offset   = bitstream_tell(bc) & 7;
+        s->frame_offset   = get_bits_count(gb) & 7;
         s->num_saved_bits = s->frame_offset;
-        init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+        init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
     }
 
     buflen = (s->num_saved_bits + len + 8) >> 3;
 
-    if (len <= 0 || buflen > MAX_FRAMESIZE) {
+    if (len <= 0 || buflen > s->max_frame_size) {
         avpriv_request_sample(s->avctx, "Too small input buffer");
         s->packet_loss = 1;
+        s->num_saved_bits = 0;
         return;
     }
 
     s->num_saved_bits += len;
     if (!append) {
-        avpriv_copy_bits(&s->pb, bc->buffer + (bitstream_tell(bc) >> 3),
+        avpriv_copy_bits(&s->pb, gb->buffer + (get_bits_count(gb) >> 3),
                          s->num_saved_bits);
     } else {
-        int align = 8 - (bitstream_tell(bc) & 7);
+        int align = 8 - (get_bits_count(gb) & 7);
         align = FFMIN(align, len);
-        put_bits(&s->pb, align, bitstream_read(bc, align));
+        put_bits(&s->pb, align, get_bits(gb, align));
         len -= align;
-        avpriv_copy_bits(&s->pb, bc->buffer + (bitstream_tell(bc) >> 3), len);
+        avpriv_copy_bits(&s->pb, gb->buffer + (get_bits_count(gb) >> 3), len);
     }
-    bitstream_skip(bc, len);
+    skip_bits_long(gb, len);
 
     tmp = s->pb;
     flush_put_bits(&tmp);
 
-    bitstream_init(&s->bc, s->frame_data, s->num_saved_bits);
-    bitstream_skip(&s->bc, s->frame_offset);
+    init_get_bits(&s->gb, s->frame_data, s->num_saved_bits);
+    skip_bits(&s->gb, s->frame_offset);
 }
 
 static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
                          AVPacket* avpkt)
 {
     WmallDecodeCtx *s = avctx->priv_data;
-    BitstreamContext *bc = &s->pbc;
+    GetBitContext* gb  = &s->pgb;
     const uint8_t* buf = avpkt->data;
     int buf_size       = avpkt->size;
     int num_bits_prev_frame, packet_sequence_number, spliced_packet;
 
     s->frame->nb_samples = 0;
 
-    if (s->packet_done || s->packet_loss) {
+    if (!buf_size && s->num_saved_bits > get_bits_count(&s->gb)) {
+        s->packet_done = 0;
+        if (!decode_frame(s))
+            s->num_saved_bits = 0;
+    } else if (s->packet_done || s->packet_loss) {
         s->packet_done = 0;
 
-        /* sanity check for the buffer length */
-        if (buf_size < avctx->block_align)
+        if (!buf_size)
             return 0;
 
-        s->next_packet_start = buf_size - avctx->block_align;
-        buf_size             = avctx->block_align;
+        s->next_packet_start = buf_size - FFMIN(avctx->block_align, buf_size);
+        buf_size             = FFMIN(avctx->block_align, buf_size);
         s->buf_bit_size      = buf_size << 3;
 
         /* parse packet header */
-        bitstream_init(bc, buf, s->buf_bit_size);
-        packet_sequence_number = bitstream_read(bc, 4);
-        bitstream_skip(bc, 1); // Skip seekable_frame_in_packet, currently ununused
-        spliced_packet = bitstream_read_bit(bc);
+        init_get_bits(gb, buf, s->buf_bit_size);
+        packet_sequence_number = get_bits(gb, 4);
+        skip_bits(gb, 1);   // Skip seekable_frame_in_packet, currently unused
+        spliced_packet = get_bits1(gb);
         if (spliced_packet)
             avpriv_request_sample(avctx, "Bitstream splicing");
 
         /* get number of bits that need to be added to the previous frame */
-        num_bits_prev_frame = bitstream_read(bc, s->log2_frame_size);
+        num_bits_prev_frame = get_bits(gb, s->log2_frame_size);
 
         /* check for packet loss */
         if (!s->packet_loss &&
@@ -1211,7 +1219,7 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
         s->packet_sequence_number = packet_sequence_number;
 
         if (num_bits_prev_frame > 0) {
-            int remaining_packet_bits = s->buf_bit_size - bitstream_tell(bc);
+            int remaining_packet_bits = s->buf_bit_size - get_bits_count(gb);
             if (num_bits_prev_frame >= remaining_packet_bits) {
                 num_bits_prev_frame = remaining_packet_bits;
                 s->packet_done = 1;
@@ -1219,7 +1227,7 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 
             /* Append the previous frame data to the remaining data from the
              * previous packet to create a full frame. */
-            save_bits(s, bc, num_bits_prev_frame, 1);
+            save_bits(s, gb, num_bits_prev_frame, 1);
 
             /* decode the cross packet frame if it is valid */
             if (num_bits_prev_frame < remaining_packet_bits && !s->packet_loss)
@@ -1234,23 +1242,25 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
              * to decode incomplete frames in the s->len_prefix == 0 case. */
             s->num_saved_bits = 0;
             s->packet_loss    = 0;
-            init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+            init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
         }
 
     } else {
         int frame_size;
 
         s->buf_bit_size = (avpkt->size - s->next_packet_start) << 3;
-        bitstream_init(bc, avpkt->data, s->buf_bit_size);
-        bitstream_skip(bc, s->packet_offset);
+        init_get_bits(gb, avpkt->data, s->buf_bit_size);
+        skip_bits(gb, s->packet_offset);
 
-        if (s->len_prefix && remaining_bits(s, bc) > s->log2_frame_size &&
-            (frame_size = bitstream_peek(bc, s->log2_frame_size)) &&
-            frame_size <= remaining_bits(s, bc)) {
-            save_bits(s, bc, frame_size, 0);
-            s->packet_done = !decode_frame(s);
+        if (s->len_prefix && remaining_bits(s, gb) > s->log2_frame_size &&
+            (frame_size = show_bits(gb, s->log2_frame_size)) &&
+            frame_size <= remaining_bits(s, gb)) {
+            save_bits(s, gb, frame_size, 0);
+
+            if (!s->packet_loss)
+                s->packet_done = !decode_frame(s);
         } else if (!s->len_prefix
-                   && s->num_saved_bits > bitstream_tell(&s->bc)) {
+                   && s->num_saved_bits > get_bits_count(&s->gb)) {
             /* when the frames do not have a length prefix, we don't know the
              * compressed length of the individual frames however, we know what
              * part of a new packet belongs to the previous frame therefore we
@@ -1263,19 +1273,24 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
         }
     }
 
+    if (remaining_bits(s, gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Overread %d\n", -remaining_bits(s, gb));
+        s->packet_loss = 1;
+    }
+
     if (s->packet_done && !s->packet_loss &&
-        remaining_bits(s, bc) > 0) {
+        remaining_bits(s, gb) > 0) {
         /* save the rest of the data so that it can be decoded
          * with the next packet */
-        save_bits(s, bc, remaining_bits(s, bc), 0);
+        save_bits(s, gb, remaining_bits(s, gb), 0);
     }
 
     *got_frame_ptr   = s->frame->nb_samples > 0;
     av_frame_move_ref(data, s->frame);
 
-    s->packet_offset = bitstream_tell(bc) & 7;
+    s->packet_offset = get_bits_count(gb) & 7;
 
-    return (s->packet_loss) ? AVERROR_INVALIDDATA : bitstream_tell(bc) >> 3;
+    return (s->packet_loss) ? AVERROR_INVALIDDATA : buf_size ? get_bits_count(gb) >> 3 : 0;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -1288,7 +1303,7 @@ static void flush(AVCodecContext *avctx)
     s->next_packet_start = 0;
     s->cdlms[0][0].order = 0;
     s->frame->nb_samples = 0;
-    init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+    init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
 }
 
 static av_cold int decode_close(AVCodecContext *avctx)
@@ -1296,6 +1311,7 @@ static av_cold int decode_close(AVCodecContext *avctx)
     WmallDecodeCtx *s = avctx->priv_data;
 
     av_frame_free(&s->frame);
+    av_freep(&s->frame_data);
 
     return 0;
 }
diff --git a/libavcodec/wmaprodata.h b/libavcodec/wmaprodata.h
index f8a52bf..5382479 100644
--- a/libavcodec/wmaprodata.h
+++ b/libavcodec/wmaprodata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Baptiste Coudurier, Benjamin Larsson, Ulion
  * Copyright (c) 2008 - 2009 Sascha Sommer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index 4eaeed6..d0fa974 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Baptiste Coudurier, Benjamin Larsson, Ulion
  * Copyright (c) 2008 - 2011 Sascha Sommer, Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -88,13 +88,13 @@
 
 #include <inttypes.h>
 
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/intreadwrite.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "put_bits.h"
 #include "wmaprodata.h"
 #include "sinewin.h"
@@ -106,6 +106,9 @@
 #define MAX_SUBFRAMES  32                                    ///< max number of subframes per channel
 #define MAX_BANDS      29                                    ///< max number of scale factor bands
 #define MAX_FRAMESIZE  32768                                 ///< maximum compressed frame size
+#define XMA_MAX_STREAMS         8
+#define XMA_MAX_CHANNELS_STREAM 2
+#define XMA_MAX_CHANNELS        (XMA_MAX_STREAMS * XMA_MAX_CHANNELS_STREAM)
 
 #define WMAPRO_BLOCK_MIN_BITS  6                                           ///< log2 of min block size
 #define WMAPRO_BLOCK_MAX_BITS 13                                           ///< log2 of max block size
@@ -172,13 +175,13 @@ typedef struct WMAProChannelGrp {
 typedef struct WMAProDecodeCtx {
     /* generic decoder variables */
     AVCodecContext*  avctx;                         ///< codec context for av_log
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     uint8_t          frame_data[MAX_FRAMESIZE +
                       AV_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
     PutBitContext    pb;                            ///< context for filling the frame_data buffer
     FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
     DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
-    float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
+    const float*     windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
 
     /* frame size dependent frame information (set during initialization) */
     uint32_t         decode_flags;                  ///< used compression features
@@ -198,7 +201,7 @@ typedef struct WMAProDecodeCtx {
     int16_t          subwoofer_cutoffs[WMAPRO_BLOCK_SIZES]; ///< subwoofer cutoff values
 
     /* packet decode state */
-    BitstreamContext pbc;                           ///< bitstream reader context for the packet
+    GetBitContext    pgb;                           ///< bitstream reader context for the packet
     int              next_packet_start;             ///< start offset of the next wma packet in the demuxer packet
     uint8_t          packet_offset;                 ///< frame offset in the packet
     uint8_t          packet_sequence_number;        ///< current packet number
@@ -207,17 +210,20 @@ typedef struct WMAProDecodeCtx {
     int              subframe_offset;               ///< subframe offset in the bit reservoir
     uint8_t          packet_loss;                   ///< set in case of bitstream error
     uint8_t          packet_done;                   ///< set when a packet is fully decoded
+    uint8_t          eof_done;                      ///< set when EOF reached and extra subframe is written (XMA1/2)
 
     /* frame decode state */
     uint32_t         frame_num;                     ///< current frame number (not used for decoding)
-    BitstreamContext bc;                            ///< bitstream reader context
+    GetBitContext    gb;                            ///< bitstream reader context
     int              buf_bit_size;                  ///< buffer size in bits
     uint8_t          drc_gain;                      ///< gain for the DRC tool
     int8_t           skip_frame;                    ///< skip output step
     int8_t           parsed_all_subframes;          ///< all subframes decoded?
+    uint8_t          skip_packets;                  ///< packets to skip to find next packet in a stream (XMA1/2)
 
     /* subframe/block decode state */
     int16_t          subframe_len;                  ///< current subframe length
+    int8_t           nb_channels;                   ///< number of channels in stream (XMA1/2)
     int8_t           channels_for_cur_subframe;     ///< number of channels that contain the subframe
     int8_t           channel_indexes_for_cur_subframe[WMAPRO_MAX_CHANNELS];
     int8_t           num_bands;                     ///< number of scale factor bands
@@ -232,6 +238,15 @@ typedef struct WMAProDecodeCtx {
     WMAProChannelCtx channel[WMAPRO_MAX_CHANNELS];  ///< per channel data
 } WMAProDecodeCtx;
 
+typedef struct XMADecodeCtx {
+    WMAProDecodeCtx xma[XMA_MAX_STREAMS];
+    AVFrame *frames[XMA_MAX_STREAMS];
+    int current_stream;
+    int num_streams;
+    float samples[XMA_MAX_CHANNELS][512 * 64];
+    int offset[XMA_MAX_STREAMS];
+    int start_channel[XMA_MAX_STREAMS];
+} XMADecodeCtx;
 
 /**
  *@brief helper function to print the most important members of the context
@@ -248,7 +263,7 @@ static av_cold void dump_context(WMAProDecodeCtx *s)
     PRINT("log2 frame size",     s->log2_frame_size);
     PRINT("max num subframes",   s->max_num_subframes);
     PRINT("len prefix",          s->len_prefix);
-    PRINT("num channels",        s->avctx->channels);
+    PRINT("num channels",        s->nb_channels);
 }
 
 /**
@@ -256,52 +271,103 @@ static av_cold void dump_context(WMAProDecodeCtx *s)
  *@param avctx codec context
  *@return 0 on success, < 0 otherwise
  */
-static av_cold int decode_end(AVCodecContext *avctx)
+static av_cold int decode_end(WMAProDecodeCtx *s)
 {
-    WMAProDecodeCtx *s = avctx->priv_data;
     int i;
 
+    av_freep(&s->fdsp);
+
     for (i = 0; i < WMAPRO_BLOCK_SIZES; i++)
         ff_mdct_end(&s->mdct_ctx[i]);
 
     return 0;
 }
 
+static av_cold int wmapro_decode_end(AVCodecContext *avctx)
+{
+    WMAProDecodeCtx *s = avctx->priv_data;
+
+    decode_end(s);
+
+    return 0;
+}
+
+static av_cold int get_rate(AVCodecContext *avctx)
+{
+    if (avctx->codec_id != AV_CODEC_ID_WMAPRO) { // XXX: is this really only for XMA?
+        if (avctx->sample_rate > 44100)
+            return 48000;
+        else if (avctx->sample_rate > 32000)
+            return 44100;
+        else if (avctx->sample_rate > 24000)
+            return 32000;
+        return 24000;
+    }
+
+    return avctx->sample_rate;
+}
+
 /**
  *@brief Initialize the decoder.
  *@param avctx codec context
  *@return 0 on success, -1 otherwise
  */
-static av_cold int decode_init(AVCodecContext *avctx)
+static av_cold int decode_init(WMAProDecodeCtx *s, AVCodecContext *avctx, int num_stream)
 {
-    WMAProDecodeCtx *s = avctx->priv_data;
     uint8_t *edata_ptr = avctx->extradata;
     unsigned int channel_mask;
     int i, bits;
     int log2_max_num_subframes;
     int num_possible_block_sizes;
 
+    if (avctx->codec_id == AV_CODEC_ID_XMA1 || avctx->codec_id == AV_CODEC_ID_XMA2)
+        avctx->block_align = 2048;
+
     if (!avctx->block_align) {
         av_log(avctx, AV_LOG_ERROR, "block_align is not set\n");
         return AVERROR(EINVAL);
     }
 
     s->avctx = avctx;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
-    if (avctx->extradata_size >= 18) {
+    /** dump the extradata */
+    av_log(avctx, AV_LOG_DEBUG, "extradata:\n");
+    for (i = 0; i < avctx->extradata_size; i++)
+        av_log(avctx, AV_LOG_DEBUG, "[%x] ", avctx->extradata[i]);
+    av_log(avctx, AV_LOG_DEBUG, "\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_XMA2 && avctx->extradata_size == 34) { /* XMA2WAVEFORMATEX */
+        s->decode_flags    = 0x10d6;
+        s->bits_per_sample = 16;
+        channel_mask       = 0; //AV_RL32(edata_ptr+2); /* not always in expected order */
+        if ((num_stream+1) * XMA_MAX_CHANNELS_STREAM > avctx->channels) /* stream config is 2ch + 2ch + ... + 1/2ch */
+            s->nb_channels = 1;
+        else
+            s->nb_channels = 2;
+    } else if (avctx->codec_id == AV_CODEC_ID_XMA2) { /* XMA2WAVEFORMAT */
+        s->decode_flags    = 0x10d6;
+        s->bits_per_sample = 16;
+        channel_mask       = 0; /* would need to aggregate from all streams */
+        s->nb_channels = edata_ptr[32 + ((edata_ptr[0]==3)?0:8) + 4*num_stream + 0]; /* nth stream config */
+    } else if (avctx->codec_id == AV_CODEC_ID_XMA1) { /* XMAWAVEFORMAT */
+        s->decode_flags    = 0x10d6;
+        s->bits_per_sample = 16;
+        channel_mask       = 0; /* would need to aggregate from all streams */
+        s->nb_channels     = edata_ptr[8 + 20*num_stream + 17]; /* nth stream config */
+    } else if (avctx->codec_id == AV_CODEC_ID_WMAPRO && avctx->extradata_size >= 18) {
         s->decode_flags    = AV_RL16(edata_ptr+14);
         channel_mask       = AV_RL32(edata_ptr+2);
         s->bits_per_sample = AV_RL16(edata_ptr);
-        /** dump the extradata */
-        for (i = 0; i < avctx->extradata_size; i++)
-            ff_dlog(avctx, "[%x] ", avctx->extradata[i]);
-        ff_dlog(avctx, "\n");
+        s->nb_channels     = avctx->channels;
 
+        if (s->bits_per_sample > 32 || s->bits_per_sample < 1) {
+            avpriv_request_sample(avctx, "bits per sample is %d", s->bits_per_sample);
+            return AVERROR_PATCHWELCOME;
+        }
     } else {
         avpriv_request_sample(avctx, "Unknown extradata size");
         return AVERROR_PATCHWELCOME;
@@ -309,19 +375,31 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     /** generic init */
     s->log2_frame_size = av_log2(avctx->block_align) + 4;
+    if (s->log2_frame_size > 25) {
+        avpriv_request_sample(avctx, "Large block align");
+        return AVERROR_PATCHWELCOME;
+    }
 
     /** frame info */
-    s->skip_frame  = 1; /* skip first frame */
+    if (avctx->codec_id != AV_CODEC_ID_WMAPRO)
+        s->skip_frame = 0;
+    else
+        s->skip_frame = 1; /* skip first frame */
+
     s->packet_loss = 1;
     s->len_prefix  = (s->decode_flags & 0x40);
 
     /** get frame len */
-    bits = ff_wma_get_frame_len_bits(avctx->sample_rate, 3, s->decode_flags);
-    if (bits > WMAPRO_BLOCK_MAX_BITS) {
-        avpriv_request_sample(avctx, "14-bit block sizes");
-        return AVERROR_PATCHWELCOME;
+    if (avctx->codec_id == AV_CODEC_ID_WMAPRO) {
+        bits = ff_wma_get_frame_len_bits(avctx->sample_rate, 3, s->decode_flags);
+        if (bits > WMAPRO_BLOCK_MAX_BITS) {
+            avpriv_request_sample(avctx, "14-bit block sizes");
+            return AVERROR_PATCHWELCOME;
+        }
+        s->samples_per_frame = 1 << bits;
+    } else {
+        s->samples_per_frame = 512;
     }
-    s->samples_per_frame = 1 << bits;
 
     /** subframe info */
     log2_max_num_subframes       = ((s->decode_flags & 0x38) >> 3);
@@ -341,8 +419,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     if (s->min_samples_per_subframe < WMAPRO_BLOCK_MIN_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid minimum block size %"PRId8"\n",
-               s->max_num_subframes);
+        av_log(avctx, AV_LOG_ERROR, "min_samples_per_subframe of %d too small\n",
+               s->min_samples_per_subframe);
         return AVERROR_INVALIDDATA;
     }
 
@@ -351,18 +429,22 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (avctx->channels < 0) {
+    if (s->nb_channels <= 0) {
         av_log(avctx, AV_LOG_ERROR, "invalid number of channels %d\n",
-               avctx->channels);
+               s->nb_channels);
+        return AVERROR_INVALIDDATA;
+    } else if (avctx->codec_id != AV_CODEC_ID_WMAPRO && s->nb_channels > XMA_MAX_CHANNELS_STREAM) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels per XMA stream %d\n",
+               s->nb_channels);
         return AVERROR_INVALIDDATA;
-    } else if (avctx->channels > WMAPRO_MAX_CHANNELS) {
+    } else if (s->nb_channels > WMAPRO_MAX_CHANNELS) {
         avpriv_request_sample(avctx,
                               "More than %d channels", WMAPRO_MAX_CHANNELS);
         return AVERROR_PATCHWELCOME;
     }
 
     /** init previous block len */
-    for (i = 0; i < avctx->channels; i++)
+    for (i = 0; i < s->nb_channels; i++)
         s->channel[i].prev_block_len = s->samples_per_frame;
 
     /** extract lfe channel position */
@@ -410,18 +492,25 @@ static av_cold int decode_init(AVCodecContext *avctx)
         int subframe_len = s->samples_per_frame >> i;
         int x;
         int band = 1;
+        int rate = get_rate(avctx);
 
         s->sfb_offsets[i][0] = 0;
 
         for (x = 0; x < MAX_BANDS-1 && s->sfb_offsets[i][band - 1] < subframe_len; x++) {
-            int offset = (subframe_len * 2 * critical_freq[x])
-                          / s->avctx->sample_rate + 2;
+            int offset = (subframe_len * 2 * critical_freq[x]) / rate + 2;
             offset &= ~3;
             if (offset > s->sfb_offsets[i][band - 1])
                 s->sfb_offsets[i][band++] = offset;
+
+            if (offset >= subframe_len)
+                break;
         }
         s->sfb_offsets[i][band - 1] = subframe_len;
         s->num_sfb[i]               = band - 1;
+        if (s->num_sfb[i] <= 0) {
+            av_log(avctx, AV_LOG_ERROR, "num_sfb invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
 
@@ -438,14 +527,19 @@ static av_cold int decode_init(AVCodecContext *avctx)
                            + s->sfb_offsets[i][b + 1] - 1) << i) >> 1;
             for (x = 0; x < num_possible_block_sizes; x++) {
                 int v = 0;
-                while (s->sfb_offsets[x][v + 1] << x < offset)
-                    if (++v >= MAX_BANDS)
-                        return AVERROR_INVALIDDATA;
+                while (s->sfb_offsets[x][v + 1] << x < offset) {
+                    v++;
+                    av_assert0(v < MAX_BANDS);
+                }
                 s->sf_offsets[i][x][b] = v;
             }
         }
     }
 
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+
     /** init MDCT, FIXME: only init needed sizes */
     for (i = 0; i < WMAPRO_BLOCK_SIZES; i++)
         ff_mdct_init(&s->mdct_ctx[i], WMAPRO_BLOCK_MIN_BITS+1+i, 1,
@@ -462,7 +556,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     /** calculate subwoofer cutoff values */
     for (i = 0; i < num_possible_block_sizes; i++) {
         int block_size = s->samples_per_frame >> i;
-        int cutoff = (440*block_size + 3 * (s->avctx->sample_rate >> 1) - 1)
+        int cutoff = (440*block_size + 3LL * (s->avctx->sample_rate >> 1) - 1)
                      / s->avctx->sample_rate;
         s->subwoofer_cutoffs[i] = av_clip(cutoff, 4, block_size);
     }
@@ -480,6 +574,18 @@ static av_cold int decode_init(AVCodecContext *avctx)
 }
 
 /**
+ *@brief Initialize the decoder.
+ *@param avctx codec context
+ *@return 0 on success, -1 otherwise
+ */
+static av_cold int wmapro_decode_init(AVCodecContext *avctx)
+{
+    WMAProDecodeCtx *s = avctx->priv_data;
+
+    return decode_init(s, avctx, 0);
+}
+
+/**
  *@brief Decode the subframe length.
  *@param s context
  *@param offset sample offset in the frame
@@ -494,13 +600,15 @@ static int decode_subframe_length(WMAProDecodeCtx *s, int offset)
     if (offset == s->samples_per_frame - s->min_samples_per_subframe)
         return s->min_samples_per_subframe;
 
+    if (get_bits_left(&s->gb) < 1)
+        return AVERROR_INVALIDDATA;
+
     /** 1 bit indicates if the subframe is of maximum length */
     if (s->max_subframe_len_bit) {
-        if (bitstream_read_bit(&s->bc))
-            frame_len_shift = 1 + bitstream_read(&s->bc,
-                                                 s->subframe_len_bits - 1);
+        if (get_bits1(&s->gb))
+            frame_len_shift = 1 + get_bits(&s->gb, s->subframe_len_bits-1);
     } else
-        frame_len_shift = bitstream_read(&s->bc, s->subframe_len_bits);
+        frame_len_shift = get_bits(&s->gb, s->subframe_len_bits);
 
     subframe_len = s->samples_per_frame >> frame_len_shift;
 
@@ -538,7 +646,7 @@ static int decode_tilehdr(WMAProDecodeCtx *s)
 {
     uint16_t num_samples[WMAPRO_MAX_CHANNELS] = { 0 };/**< sum of samples for all currently known subframes of a channel */
     uint8_t  contains_subframe[WMAPRO_MAX_CHANNELS];  /**< flag indicating if a channel contains the current subframe */
-    int channels_for_cur_subframe = s->avctx->channels; /**< number of channels that contain the current subframe */
+    int channels_for_cur_subframe = s->nb_channels;   /**< number of channels that contain the current subframe */
     int fixed_channel_layout = 0;                     /**< flag indicating that all channels use the same subframe offsets and sizes */
     int min_channel_len = 0;                          /**< smallest sum of samples (channels with this length will be processed first) */
     int c;
@@ -550,10 +658,10 @@ static int decode_tilehdr(WMAProDecodeCtx *s)
      */
 
     /** reset tiling information */
-    for (c = 0; c < s->avctx->channels; c++)
+    for (c = 0; c < s->nb_channels; c++)
         s->channel[c].num_subframes = 0;
 
-    if (s->max_num_subframes == 1 || bitstream_read_bit(&s->bc))
+    if (s->max_num_subframes == 1 || get_bits1(&s->gb))
         fixed_channel_layout = 1;
 
     /** loop until the frame data is split between the subframes */
@@ -561,13 +669,13 @@ static int decode_tilehdr(WMAProDecodeCtx *s)
         int subframe_len;
 
         /** check which channels contain the subframe */
-        for (c = 0; c < s->avctx->channels; c++) {
+        for (c = 0; c < s->nb_channels; c++) {
             if (num_samples[c] == min_channel_len) {
                 if (fixed_channel_layout || channels_for_cur_subframe == 1 ||
                    (min_channel_len == s->samples_per_frame - s->min_samples_per_subframe))
                     contains_subframe[c] = 1;
                 else
-                    contains_subframe[c] = bitstream_read_bit(&s->bc);
+                    contains_subframe[c] = get_bits1(&s->gb);
             } else
                 contains_subframe[c] = 0;
         }
@@ -578,7 +686,7 @@ static int decode_tilehdr(WMAProDecodeCtx *s)
 
         /** add subframes to the individual channels and find new min_channel_len */
         min_channel_len += subframe_len;
-        for (c = 0; c < s->avctx->channels; c++) {
+        for (c = 0; c < s->nb_channels; c++) {
             WMAProChannelCtx* chan = &s->channel[c];
 
             if (contains_subframe[c]) {
@@ -605,12 +713,12 @@ static int decode_tilehdr(WMAProDecodeCtx *s)
         }
     } while (min_channel_len < s->samples_per_frame);
 
-    for (c = 0; c < s->avctx->channels; c++) {
+    for (c = 0; c < s->nb_channels; c++) {
         int i;
         int offset = 0;
         for (i = 0; i < s->channel[c].num_subframes; i++) {
-            ff_dlog(s->avctx, "frame[%"PRIi32"] channel[%i] subframe[%i]"
-                    " len %"PRIu16"\n", s->frame_num, c, i,
+            ff_dlog(s->avctx, "frame[%"PRIu32"] channel[%i] subframe[%i]"
+                    " len %i\n", s->frame_num, c, i,
                     s->channel[c].subframe_len[i]);
             s->channel[c].subframe_offset[i] = offset;
             offset += s->channel[c].subframe_len[i];
@@ -631,15 +739,15 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s,
     int i;
     int offset = 0;
     int8_t rotation_offset[WMAPRO_MAX_CHANNELS * WMAPRO_MAX_CHANNELS];
-    memset(chgroup->decorrelation_matrix, 0, s->avctx->channels *
-           s->avctx->channels * sizeof(*chgroup->decorrelation_matrix));
+    memset(chgroup->decorrelation_matrix, 0, s->nb_channels *
+           s->nb_channels * sizeof(*chgroup->decorrelation_matrix));
 
     for (i = 0; i < chgroup->num_channels * (chgroup->num_channels - 1) >> 1; i++)
-        rotation_offset[i] = bitstream_read(&s->bc, 6);
+        rotation_offset[i] = get_bits(&s->gb, 6);
 
     for (i = 0; i < chgroup->num_channels; i++)
         chgroup->decorrelation_matrix[chgroup->num_channels * i + i] =
-            bitstream_read_bit(&s->bc) ? 1.0 : -1.0;
+            get_bits1(&s->gb) ? 1.0 : -1.0;
 
     for (i = 1; i < chgroup->num_channels; i++) {
         int x;
@@ -673,7 +781,7 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s,
 /**
  *@brief Decode channel transformation parameters
  *@param s codec context
- *@return 0 in case of success, < 0 in case of bitstream errors
+ *@return >= 0 in case of success, < 0 in case of bitstream errors
  */
 static int decode_channel_transform(WMAProDecodeCtx* s)
 {
@@ -685,10 +793,10 @@ static int decode_channel_transform(WMAProDecodeCtx* s)
 
     /** in the one channel case channel transforms are pointless */
     s->num_chgroups = 0;
-    if (s->avctx->channels > 1) {
+    if (s->nb_channels > 1) {
         int remaining_channels = s->channels_for_cur_subframe;
 
-        if (bitstream_read_bit(&s->bc)) {
+        if (get_bits1(&s->gb)) {
             avpriv_request_sample(s->avctx,
                                   "Channel transform bit");
             return AVERROR_PATCHWELCOME;
@@ -706,7 +814,7 @@ static int decode_channel_transform(WMAProDecodeCtx* s)
                 for (i = 0; i < s->channels_for_cur_subframe; i++) {
                     int channel_idx = s->channel_indexes_for_cur_subframe[i];
                     if (!s->channel[channel_idx].grouped
-                        && bitstream_read_bit(&s->bc)) {
+                        && get_bits1(&s->gb)) {
                         ++chgroup->num_channels;
                         s->channel[channel_idx].grouped = 1;
                         *channel_data++ = s->channel[channel_idx].coeffs;
@@ -724,15 +832,15 @@ static int decode_channel_transform(WMAProDecodeCtx* s)
 
             /** decode transform type */
             if (chgroup->num_channels == 2) {
-                if (bitstream_read_bit(&s->bc)) {
-                    if (bitstream_read_bit(&s->bc)) {
+                if (get_bits1(&s->gb)) {
+                    if (get_bits1(&s->gb)) {
                         avpriv_request_sample(s->avctx,
                                               "Unknown channel transform type");
                         return AVERROR_PATCHWELCOME;
                     }
                 } else {
                     chgroup->transform = 1;
-                    if (s->avctx->channels == 2) {
+                    if (s->nb_channels == 2) {
                         chgroup->decorrelation_matrix[0] =  1.0;
                         chgroup->decorrelation_matrix[1] = -1.0;
                         chgroup->decorrelation_matrix[2] =  1.0;
@@ -746,9 +854,9 @@ static int decode_channel_transform(WMAProDecodeCtx* s)
                     }
                 }
             } else if (chgroup->num_channels > 2) {
-                if (bitstream_read_bit(&s->bc)) {
+                if (get_bits1(&s->gb)) {
                     chgroup->transform = 1;
-                    if (bitstream_read_bit(&s->bc)) {
+                    if (get_bits1(&s->gb)) {
                         decode_decorrelation_matrix(s, chgroup);
                     } else {
                         /** FIXME: more than 6 coupled channels not supported */
@@ -767,11 +875,11 @@ static int decode_channel_transform(WMAProDecodeCtx* s)
 
             /** decode transform on / off */
             if (chgroup->transform) {
-                if (!bitstream_read_bit(&s->bc)) {
+                if (!get_bits1(&s->gb)) {
                     int i;
                     /** transform can be enabled for individual bands */
                     for (i = 0; i < s->num_bands; i++) {
-                        chgroup->transform_band[i] = bitstream_read_bit(&s->bc);
+                        chgroup->transform_band[i] = get_bits1(&s->gb);
                     }
                 } else {
                     memset(chgroup->transform_band, 1, s->num_bands);
@@ -811,7 +919,7 @@ static int decode_coeffs(WMAProDecodeCtx *s, int c)
 
     ff_dlog(s->avctx, "decode coefficients for channel %i\n", c);
 
-    vlctable = bitstream_read_bit(&s->bc);
+    vlctable = get_bits1(&s->gb);
     vlc = &coef_vlc[vlctable];
 
     if (vlctable) {
@@ -830,19 +938,19 @@ static int decode_coeffs(WMAProDecodeCtx *s, int c)
         int i;
         unsigned int idx;
 
-        idx = bitstream_read_vlc(&s->bc, vec4_vlc.table, VLCBITS, VEC4MAXDEPTH);
+        idx = get_vlc2(&s->gb, vec4_vlc.table, VLCBITS, VEC4MAXDEPTH);
 
         if (idx == HUFF_VEC4_SIZE - 1) {
             for (i = 0; i < 4; i += 2) {
-                idx = bitstream_read_vlc(&s->bc, vec2_vlc.table, VLCBITS, VEC2MAXDEPTH);
+                idx = get_vlc2(&s->gb, vec2_vlc.table, VLCBITS, VEC2MAXDEPTH);
                 if (idx == HUFF_VEC2_SIZE - 1) {
                     uint32_t v0, v1;
-                    v0 = bitstream_read_vlc(&s->bc, vec1_vlc.table, VLCBITS, VEC1MAXDEPTH);
+                    v0 = get_vlc2(&s->gb, vec1_vlc.table, VLCBITS, VEC1MAXDEPTH);
                     if (v0 == HUFF_VEC1_SIZE - 1)
-                        v0 += ff_wma_get_large_val(&s->bc);
-                    v1 = bitstream_read_vlc(&s->bc, vec1_vlc.table, VLCBITS, VEC1MAXDEPTH);
+                        v0 += ff_wma_get_large_val(&s->gb);
+                    v1 = get_vlc2(&s->gb, vec1_vlc.table, VLCBITS, VEC1MAXDEPTH);
                     if (v1 == HUFF_VEC1_SIZE - 1)
-                        v1 += ff_wma_get_large_val(&s->bc);
+                        v1 += ff_wma_get_large_val(&s->gb);
                     vals[i  ] = av_float2int(v0);
                     vals[i+1] = av_float2int(v1);
                 } else {
@@ -860,7 +968,7 @@ static int decode_coeffs(WMAProDecodeCtx *s, int c)
         /** decode sign */
         for (i = 0; i < 4; i++) {
             if (vals[i]) {
-                uint32_t sign = bitstream_read_bit(&s->bc) - 1;
+                uint32_t sign = get_bits1(&s->gb) - 1;
                 AV_WN32A(&ci->coeffs[cur_coeff], vals[i] ^ sign << 31);
                 num_zeros = 0;
             } else {
@@ -877,7 +985,7 @@ static int decode_coeffs(WMAProDecodeCtx *s, int c)
     if (cur_coeff < s->subframe_len) {
         memset(&ci->coeffs[cur_coeff], 0,
                sizeof(*ci->coeffs) * (s->subframe_len - cur_coeff));
-        if (ff_wma_run_level_decode(s->avctx, &s->bc, vlc,
+        if (ff_wma_run_level_decode(s->avctx, &s->gb, vlc,
                                     level, run, 1, ci->coeffs,
                                     cur_coeff, s->subframe_len,
                                     s->subframe_len, s->esc_len, 0))
@@ -920,14 +1028,15 @@ static int decode_scale_factors(WMAProDecodeCtx* s)
                     s->channel[c].saved_scale_factors[s->channel[c].scale_factor_idx][*sf_offsets++];
         }
 
-        if (!s->channel[c].cur_subframe || bitstream_read_bit(&s->bc)) {
+        if (!s->channel[c].cur_subframe || get_bits1(&s->gb)) {
+
             if (!s->channel[c].reuse_sf) {
                 int val;
                 /** decode DPCM coded scale factors */
-                s->channel[c].scale_factor_step = bitstream_read(&s->bc, 2) + 1;
+                s->channel[c].scale_factor_step = get_bits(&s->gb, 2) + 1;
                 val = 45 / s->channel[c].scale_factor_step;
                 for (sf = s->channel[c].scale_factors; sf < sf_end; sf++) {
-                    val += bitstream_read_vlc(&s->bc, sf_vlc.table, SCALEVLCBITS, SCALEMAXDEPTH) - 60;
+                    val += get_vlc2(&s->gb, sf_vlc.table, SCALEVLCBITS, SCALEMAXDEPTH) - 60;
                     *sf = val;
                 }
             } else {
@@ -939,10 +1048,10 @@ static int decode_scale_factors(WMAProDecodeCtx* s)
                     int val;
                     int sign;
 
-                    idx = bitstream_read_vlc(&s->bc, sf_rl_vlc.table, VLCBITS, SCALERLMAXDEPTH);
+                    idx = get_vlc2(&s->gb, sf_rl_vlc.table, VLCBITS, SCALERLMAXDEPTH);
 
                     if (!idx) {
-                        uint32_t code = bitstream_read(&s->bc, 14);
+                        uint32_t code = get_bits(&s->gb, 14);
                         val  =  code >> 6;
                         sign = (code & 1) - 1;
                         skip = (code & 0x3f) >> 1;
@@ -951,7 +1060,7 @@ static int decode_scale_factors(WMAProDecodeCtx* s)
                     } else {
                         skip = scale_rl_run[idx];
                         val  = scale_rl_level[idx];
-                        sign = bitstream_read_bit(&s->bc)-1;
+                        sign = get_bits1(&s->gb)-1;
                     }
 
                     i += skip;
@@ -1021,12 +1130,12 @@ static void inverse_channel_transform(WMAProDecodeCtx *s)
                             (*ch)[y] = sum;
                         }
                     }
-                } else if (s->avctx->channels == 2) {
+                } else if (s->nb_channels == 2) {
                     int len = FFMIN(sfb[1], s->subframe_len) - sfb[0];
-                    s->fdsp.vector_fmul_scalar(ch_data[0] + sfb[0],
+                    s->fdsp->vector_fmul_scalar(ch_data[0] + sfb[0],
                                                ch_data[0] + sfb[0],
                                                181.0 / 128, len);
-                    s->fdsp.vector_fmul_scalar(ch_data[1] + sfb[0],
+                    s->fdsp->vector_fmul_scalar(ch_data[1] + sfb[0],
                                                ch_data[1] + sfb[0],
                                                181.0 / 128, len);
                 }
@@ -1044,7 +1153,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
     int i;
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
         int c = s->channel_indexes_for_cur_subframe[i];
-        float* window;
+        const float* window;
         int winlen = s->channel[c].prev_block_len;
         float* start = s->channel[c].coeffs - (winlen >> 1);
 
@@ -1057,7 +1166,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
 
         winlen >>= 1;
 
-        s->fdsp.vector_fmul_window(start, start, start + winlen,
+        s->fdsp->vector_fmul_window(start, start, start + winlen,
                                    window, winlen);
 
         s->channel[c].prev_block_len = s->subframe_len;
@@ -1074,17 +1183,17 @@ static int decode_subframe(WMAProDecodeCtx *s)
     int offset = s->samples_per_frame;
     int subframe_len = s->samples_per_frame;
     int i;
-    int total_samples   = s->samples_per_frame * s->avctx->channels;
+    int total_samples   = s->samples_per_frame * s->nb_channels;
     int transmit_coeffs = 0;
     int cur_subwoofer_cutoff;
 
-    s->subframe_offset = bitstream_tell(&s->bc);
+    s->subframe_offset = get_bits_count(&s->gb);
 
     /** reset channel context and find the next block offset and size
         == the next block of the channel with the smallest number of
         decoded samples
     */
-    for (i = 0; i < s->avctx->channels; i++) {
+    for (i = 0; i < s->nb_channels; i++) {
         s->channel[i].grouped = 0;
         if (offset > s->channel[i].decoded_samples) {
             offset = s->channel[i].decoded_samples;
@@ -1098,7 +1207,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
 
     /** get a list of all channels that contain the estimated block */
     s->channels_for_cur_subframe = 0;
-    for (i = 0; i < s->avctx->channels; i++) {
+    for (i = 0; i < s->nb_channels; i++) {
         const int cur_subframe = s->channel[i].cur_subframe;
         /** subtract already processed samples */
         total_samples -= s->channel[i].decoded_samples;
@@ -1142,25 +1251,25 @@ static int decode_subframe(WMAProDecodeCtx *s)
     s->esc_len = av_log2(s->subframe_len - 1) + 1;
 
     /** skip extended header if any */
-    if (bitstream_read_bit(&s->bc)) {
+    if (get_bits1(&s->gb)) {
         int num_fill_bits;
-        if (!(num_fill_bits = bitstream_read(&s->bc, 2))) {
-            int len       = bitstream_read(&s->bc, 4);
-            num_fill_bits = bitstream_read(&s->bc, len) + 1;
+        if (!(num_fill_bits = get_bits(&s->gb, 2))) {
+            int len = get_bits(&s->gb, 4);
+            num_fill_bits = get_bitsz(&s->gb, len) + 1;
         }
 
         if (num_fill_bits >= 0) {
-            if (bitstream_tell(&s->bc) + num_fill_bits > s->num_saved_bits) {
+            if (get_bits_count(&s->gb) + num_fill_bits > s->num_saved_bits) {
                 av_log(s->avctx, AV_LOG_ERROR, "invalid number of fill bits\n");
                 return AVERROR_INVALIDDATA;
             }
 
-            bitstream_skip(&s->bc, num_fill_bits);
+            skip_bits_long(&s->gb, num_fill_bits);
         }
     }
 
     /** no idea for what the following bit is used */
-    if (bitstream_read_bit(&s->bc)) {
+    if (get_bits1(&s->gb)) {
         avpriv_request_sample(s->avctx, "Reserved bit");
         return AVERROR_PATCHWELCOME;
     }
@@ -1172,24 +1281,26 @@ static int decode_subframe(WMAProDecodeCtx *s)
 
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
         int c = s->channel_indexes_for_cur_subframe[i];
-        if ((s->channel[c].transmit_coefs = bitstream_read_bit(&s->bc)))
+        if ((s->channel[c].transmit_coefs = get_bits1(&s->gb)))
             transmit_coeffs = 1;
     }
 
+    av_assert0(s->subframe_len <= WMAPRO_BLOCK_MAX_SIZE);
     if (transmit_coeffs) {
         int step;
         int quant_step = 90 * s->bits_per_sample >> 4;
 
         /** decode number of vector coded coefficients */
-        if ((s->transmit_num_vec_coeffs = bitstream_read_bit(&s->bc))) {
+        if ((s->transmit_num_vec_coeffs = get_bits1(&s->gb))) {
             int num_bits = av_log2((s->subframe_len + 3)/4) + 1;
             for (i = 0; i < s->channels_for_cur_subframe; i++) {
                 int c = s->channel_indexes_for_cur_subframe[i];
-                int num_vec_coeffs = bitstream_read(&s->bc, num_bits) << 2;
-                if (num_vec_coeffs + offset > FF_ARRAY_ELEMS(s->channel[c].out)) {
+                int num_vec_coeffs = get_bits(&s->gb, num_bits) << 2;
+                if (num_vec_coeffs > s->subframe_len) {
                     av_log(s->avctx, AV_LOG_ERROR, "num_vec_coeffs %d is too large\n", num_vec_coeffs);
                     return AVERROR_INVALIDDATA;
                 }
+                av_assert0(num_vec_coeffs + offset <= FF_ARRAY_ELEMS(s->channel[c].out));
                 s->channel[c].num_vec_coeffs = num_vec_coeffs;
             }
         } else {
@@ -1199,13 +1310,13 @@ static int decode_subframe(WMAProDecodeCtx *s)
             }
         }
         /** decode quantization step */
-        step = bitstream_read_signed(&s->bc, 6);
+        step = get_sbits(&s->gb, 6);
         quant_step += step;
         if (step == -32 || step == 31) {
             const int sign = (step == 31) - 1;
             int quant = 0;
-            while (bitstream_tell(&s->bc) + 5 < s->num_saved_bits &&
-                   (step = bitstream_read(&s->bc, 5)) == 31) {
+            while (get_bits_count(&s->gb) + 5 < s->num_saved_bits &&
+                   (step = get_bits(&s->gb, 5)) == 31) {
                 quant += 31;
             }
             quant_step += ((quant + step) ^ sign) - sign;
@@ -1219,13 +1330,13 @@ static int decode_subframe(WMAProDecodeCtx *s)
         if (s->channels_for_cur_subframe == 1) {
             s->channel[s->channel_indexes_for_cur_subframe[0]].quant_step = quant_step;
         } else {
-            int modifier_len = bitstream_read(&s->bc, 3);
+            int modifier_len = get_bits(&s->gb, 3);
             for (i = 0; i < s->channels_for_cur_subframe; i++) {
                 int c = s->channel_indexes_for_cur_subframe[i];
                 s->channel[c].quant_step = quant_step;
-                if (bitstream_read_bit(&s->bc)) {
+                if (get_bits1(&s->gb)) {
                     if (modifier_len) {
-                        s->channel[c].quant_step += bitstream_read(&s->bc, modifier_len) + 1;
+                        s->channel[c].quant_step += get_bits(&s->gb, modifier_len) + 1;
                     } else
                         ++s->channel[c].quant_step;
                 }
@@ -1238,13 +1349,13 @@ static int decode_subframe(WMAProDecodeCtx *s)
     }
 
     ff_dlog(s->avctx, "BITSTREAM: subframe header length was %i\n",
-            bitstream_tell(&s->bc) - s->subframe_offset);
+            get_bits_count(&s->gb) - s->subframe_offset);
 
     /** parse coefficients */
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
         int c = s->channel_indexes_for_cur_subframe[i];
         if (s->channel[c].transmit_coefs &&
-            bitstream_tell(&s->bc) < s->num_saved_bits) {
+            get_bits_count(&s->gb) < s->num_saved_bits) {
             decode_coeffs(s, c);
         } else
             memset(s->channel[c].coeffs, 0,
@@ -1252,7 +1363,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
     }
 
     ff_dlog(s->avctx, "BITSTREAM: subframe length was %i\n",
-            bitstream_tell(&s->bc) - s->subframe_offset);
+            get_bits_count(&s->gb) - s->subframe_offset);
 
     if (transmit_coeffs) {
         FFTContext *mdct = &s->mdct_ctx[av_log2(subframe_len) - WMAPRO_BLOCK_MIN_BITS];
@@ -1273,9 +1384,9 @@ static int decode_subframe(WMAProDecodeCtx *s)
                 const int exp = s->channel[c].quant_step -
                             (s->channel[c].max_scale_factor - *sf++) *
                             s->channel[c].scale_factor_step;
-                const float quant = pow(10.0, exp / 20.0);
+                const float quant = ff_exp10(exp / 20.0);
                 int start = s->cur_sfb_offsets[b];
-                s->fdsp.vector_fmul_scalar(s->tmp + start,
+                s->fdsp->vector_fmul_scalar(s->tmp + start,
                                            s->channel[c].coeffs + start,
                                            quant, end - start);
             }
@@ -1309,15 +1420,14 @@ static int decode_subframe(WMAProDecodeCtx *s)
  */
 static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
 {
-    AVCodecContext *avctx = s->avctx;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext* gb = &s->gb;
     int more_frames = 0;
     int len = 0;
-    int i, ret;
+    int i;
 
     /** get frame length */
     if (s->len_prefix)
-        len = bitstream_read(bc, s->log2_frame_size);
+        len = get_bits(gb, s->log2_frame_size);
 
     ff_dlog(s->avctx, "decoding frame with length %x\n", len);
 
@@ -1328,44 +1438,44 @@ static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
     }
 
     /** read postproc transform */
-    if (s->avctx->channels > 1 && bitstream_read_bit(bc)) {
-        if (bitstream_read_bit(bc)) {
-            for (i = 0; i < avctx->channels * avctx->channels; i++)
-                bitstream_skip(bc, 4);
+    if (s->nb_channels > 1 && get_bits1(gb)) {
+        if (get_bits1(gb)) {
+            for (i = 0; i < s->nb_channels * s->nb_channels; i++)
+                skip_bits(gb, 4);
         }
     }
 
     /** read drc info */
     if (s->dynamic_range_compression) {
-        s->drc_gain = bitstream_read(bc, 8);
+        s->drc_gain = get_bits(gb, 8);
         ff_dlog(s->avctx, "drc_gain %i\n", s->drc_gain);
     }
 
     /** no idea what these are for, might be the number of samples
         that need to be skipped at the beginning or end of a stream */
-    if (bitstream_read_bit(bc)) {
+    if (get_bits1(gb)) {
         int av_unused skip;
 
         /** usually true for the first frame */
-        if (bitstream_read_bit(bc)) {
-            skip = bitstream_read(bc, av_log2(s->samples_per_frame * 2));
+        if (get_bits1(gb)) {
+            skip = get_bits(gb, av_log2(s->samples_per_frame * 2));
             ff_dlog(s->avctx, "start skip: %i\n", skip);
         }
 
         /** sometimes true for the last frame */
-        if (bitstream_read_bit(bc)) {
-            skip = bitstream_read(bc, av_log2(s->samples_per_frame * 2));
+        if (get_bits1(gb)) {
+            skip = get_bits(gb, av_log2(s->samples_per_frame * 2));
             ff_dlog(s->avctx, "end skip: %i\n", skip);
         }
 
     }
 
     ff_dlog(s->avctx, "BITSTREAM: frame header length was %i\n",
-            bitstream_tell(bc) - s->frame_offset);
+            get_bits_count(gb) - s->frame_offset);
 
     /** reset subframe states */
     s->parsed_all_subframes = 0;
-    for (i = 0; i < avctx->channels; i++) {
+    for (i = 0; i < s->nb_channels; i++) {
         s->channel[i].decoded_samples = 0;
         s->channel[i].cur_subframe    = 0;
         s->channel[i].reuse_sf        = 0;
@@ -1379,20 +1489,12 @@ static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
         }
     }
 
-    /* get output buffer */
-    frame->nb_samples = s->samples_per_frame;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        s->packet_loss = 1;
-        return 0;
-    }
-
     /** copy samples to the output buffer */
-    for (i = 0; i < avctx->channels; i++)
+    for (i = 0; i < s->nb_channels; i++)
         memcpy(frame->extended_data[i], s->channel[i].out,
                s->samples_per_frame * sizeof(*s->channel[i].out));
 
-    for (i = 0; i < avctx->channels; i++) {
+    for (i = 0; i < s->nb_channels; i++) {
         /** reuse second half of the IMDCT output for the next frame */
         memcpy(&s->channel[i].out[0],
                &s->channel[i].out[s->samples_per_frame],
@@ -1408,25 +1510,25 @@ static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
     }
 
     if (s->len_prefix) {
-        if (len != (bitstream_tell(bc) - s->frame_offset) + 2) {
+        if (len != (get_bits_count(gb) - s->frame_offset) + 2) {
             /** FIXME: not sure if this is always an error */
             av_log(s->avctx, AV_LOG_ERROR,
                    "frame[%"PRIu32"] would have to skip %i bits\n",
                    s->frame_num,
-                   len - (bitstream_tell(bc) - s->frame_offset) - 1);
+                   len - (get_bits_count(gb) - s->frame_offset) - 1);
             s->packet_loss = 1;
             return 0;
         }
 
         /** skip the rest of the frame data */
-        bitstream_skip(bc, len - (bitstream_tell(bc) - s->frame_offset) - 1);
+        skip_bits_long(gb, len - (get_bits_count(gb) - s->frame_offset) - 1);
     } else {
-        while (bitstream_tell(bc) < s->num_saved_bits && bitstream_read_bit(bc) == 0) {
+        while (get_bits_count(gb) < s->num_saved_bits && get_bits1(gb) == 0) {
         }
     }
 
     /** decode trailer bit */
-    more_frames = bitstream_read_bit(bc);
+    more_frames = get_bits1(gb);
 
     ++s->frame_num;
     return more_frames;
@@ -1435,22 +1537,22 @@ static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
 /**
  *@brief Calculate remaining input buffer length.
  *@param s codec context
- *@param bc bitstream reader context
+ *@param gb bitstream reader context
  *@return remaining size in bits
  */
-static int remaining_bits(WMAProDecodeCtx *s, BitstreamContext *bc)
+static int remaining_bits(WMAProDecodeCtx *s, GetBitContext *gb)
 {
-    return s->buf_bit_size - bitstream_tell(bc);
+    return s->buf_bit_size - get_bits_count(gb);
 }
 
 /**
  *@brief Fill the bit reservoir with a (partial) frame.
  *@param s codec context
- *@param bc bitstream reader context
+ *@param gb bitstream reader context
  *@param len length of the partial frame
  *@param append decides whether to reset the buffer or not
  */
-static void save_bits(WMAProDecodeCtx *s, BitstreamContext *bc, int len,
+static void save_bits(WMAProDecodeCtx *s, GetBitContext* gb, int len,
                       int append)
 {
     int buflen;
@@ -1460,12 +1562,12 @@ static void save_bits(WMAProDecodeCtx *s, BitstreamContext *bc, int len,
         and skipped later so that a fast byte copy is possible */
 
     if (!append) {
-        s->frame_offset = bitstream_tell(bc) & 7;
+        s->frame_offset = get_bits_count(gb) & 7;
         s->num_saved_bits = s->frame_offset;
         init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
     }
 
-    buflen = (s->num_saved_bits + len + 8) >> 3;
+    buflen = (put_bits_count(&s->pb) + len + 8) >> 3;
 
     if (len <= 0 || buflen > MAX_FRAMESIZE) {
         avpriv_request_sample(s->avctx, "Too small input buffer");
@@ -1473,48 +1575,34 @@ static void save_bits(WMAProDecodeCtx *s, BitstreamContext *bc, int len,
         return;
     }
 
-    if (len > put_bits_left(&s->pb)) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "Cannot append %d bits, only %d bits available.\n",
-               len, put_bits_left(&s->pb));
-        s->packet_loss = 1;
-        return;
-    }
+    av_assert0(len <= put_bits_left(&s->pb));
 
     s->num_saved_bits += len;
     if (!append) {
-        avpriv_copy_bits(&s->pb, bc->buffer + (bitstream_tell(bc) >> 3),
-                         s->num_saved_bits);
+        avpriv_copy_bits(&s->pb, gb->buffer + (get_bits_count(gb) >> 3),
+                     s->num_saved_bits);
     } else {
-        int align = 8 - (bitstream_tell(bc) & 7);
+        int align = 8 - (get_bits_count(gb) & 7);
         align = FFMIN(align, len);
-        put_bits(&s->pb, align, bitstream_read(bc, align));
+        put_bits(&s->pb, align, get_bits(gb, align));
         len -= align;
-        avpriv_copy_bits(&s->pb, bc->buffer + (bitstream_tell(bc) >> 3), len);
+        avpriv_copy_bits(&s->pb, gb->buffer + (get_bits_count(gb) >> 3), len);
     }
-    bitstream_skip(bc, len);
+    skip_bits_long(gb, len);
 
     {
         PutBitContext tmp = s->pb;
         flush_put_bits(&tmp);
     }
 
-    bitstream_init(&s->bc, s->frame_data, s->num_saved_bits);
-    bitstream_skip(&s->bc, s->frame_offset);
+    init_get_bits(&s->gb, s->frame_data, s->num_saved_bits);
+    skip_bits(&s->gb, s->frame_offset);
 }
 
-/**
- *@brief Decode a single WMA packet.
- *@param avctx codec context
- *@param data the output buffer
- *@param avpkt input packet
- *@return number of bytes that were read from the input buffer
- */
-static int decode_packet(AVCodecContext *avctx, void *data,
-                         int *got_frame_ptr, AVPacket* avpkt)
+static int decode_packet(AVCodecContext *avctx, WMAProDecodeCtx *s,
+                         void *data, int *got_frame_ptr, AVPacket *avpkt)
 {
-    WMAProDecodeCtx *s = avctx->priv_data;
-    BitstreamContext *bc = &s->pbc;
+    GetBitContext* gb  = &s->pgb;
     const uint8_t* buf = avpkt->data;
     int buf_size       = avpkt->size;
     int num_bits_prev_frame;
@@ -1522,32 +1610,76 @@ static int decode_packet(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 0;
 
-    if (s->packet_done || s->packet_loss) {
+    if (!buf_size) {
+        AVFrame *frame = data;
+        int i;
+
+        /** Must output remaining samples after stream end. WMAPRO 5.1 created
+         * by XWMA encoder don't though (maybe only 1/2ch streams need it). */
+        s->packet_done = 0;
+        if (s->eof_done)
+            return 0;
+
+        /** clean output buffer and copy last IMDCT samples */
+        for (i = 0; i < s->nb_channels; i++) {
+            memset(frame->extended_data[i], 0,
+            s->samples_per_frame * sizeof(*s->channel[i].out));
+
+            memcpy(frame->extended_data[i], s->channel[i].out,
+                   s->samples_per_frame * sizeof(*s->channel[i].out) >> 1);
+        }
+
+        /* TODO: XMA should output 128 samples only (instead of 512) and WMAPRO
+         * maybe 768 (with 2048), XMA needs changes in multi-stream handling though. */
+
+        s->eof_done = 1;
+        s->packet_done = 1;
+        *got_frame_ptr = 1;
+        return 0;
+    }
+    else if (s->packet_done || s->packet_loss) {
         s->packet_done = 0;
 
         /** sanity check for the buffer length */
-        if (buf_size < avctx->block_align) {
+        if (avctx->codec_id == AV_CODEC_ID_WMAPRO && buf_size < avctx->block_align) {
             av_log(avctx, AV_LOG_ERROR, "Input packet too small (%d < %d)\n",
                    buf_size, avctx->block_align);
             return AVERROR_INVALIDDATA;
         }
 
-        s->next_packet_start = buf_size - avctx->block_align;
-        buf_size = avctx->block_align;
+        if (avctx->codec_id == AV_CODEC_ID_WMAPRO) {
+            s->next_packet_start = buf_size - avctx->block_align;
+            buf_size = avctx->block_align;
+        } else {
+            s->next_packet_start = buf_size - FFMIN(buf_size, avctx->block_align);
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        }
         s->buf_bit_size = buf_size << 3;
 
         /** parse packet header */
-        bitstream_init(bc, buf, s->buf_bit_size);
-        packet_sequence_number = bitstream_read(bc, 4);
-        bitstream_skip(bc, 2);
+        init_get_bits(gb, buf, s->buf_bit_size);
+        if (avctx->codec_id != AV_CODEC_ID_XMA2) {
+            packet_sequence_number = get_bits(gb, 4);
+            skip_bits(gb, 2);
+        } else {
+            int num_frames = get_bits(gb, 6);
+            ff_dlog(avctx, "packet[%d]: number of frames %d\n", avctx->frame_number, num_frames);
+            packet_sequence_number = 0;
+        }
 
         /** get number of bits that need to be added to the previous frame */
-        num_bits_prev_frame = bitstream_read(bc, s->log2_frame_size);
+        num_bits_prev_frame = get_bits(gb, s->log2_frame_size);
+        if (avctx->codec_id != AV_CODEC_ID_WMAPRO) {
+            skip_bits(gb, 3);
+            s->skip_packets = get_bits(gb, 8);
+            ff_dlog(avctx, "packet[%d]: skip packets %d\n", avctx->frame_number, s->skip_packets);
+        }
+
         ff_dlog(avctx, "packet[%d]: nbpf %x\n", avctx->frame_number,
                 num_bits_prev_frame);
 
         /** check for packet loss */
-        if (!s->packet_loss &&
+        if (avctx->codec_id == AV_CODEC_ID_WMAPRO && !s->packet_loss &&
             ((s->packet_sequence_number + 1) & 0xF) != packet_sequence_number) {
             s->packet_loss = 1;
             av_log(avctx, AV_LOG_ERROR,
@@ -1557,7 +1689,7 @@ static int decode_packet(AVCodecContext *avctx, void *data,
         s->packet_sequence_number = packet_sequence_number;
 
         if (num_bits_prev_frame > 0) {
-            int remaining_packet_bits = s->buf_bit_size - bitstream_tell(bc);
+            int remaining_packet_bits = s->buf_bit_size - get_bits_count(gb);
             if (num_bits_prev_frame >= remaining_packet_bits) {
                 num_bits_prev_frame = remaining_packet_bits;
                 s->packet_done = 1;
@@ -1565,7 +1697,7 @@ static int decode_packet(AVCodecContext *avctx, void *data,
 
             /** append the previous frame data to the remaining data from the
                 previous packet to create a full frame */
-            save_bits(s, bc, num_bits_prev_frame, 1);
+            save_bits(s, gb, num_bits_prev_frame, 1);
             ff_dlog(avctx, "accumulated %x bits of frame data\n",
                     s->num_saved_bits - s->frame_offset);
 
@@ -1584,19 +1716,19 @@ static int decode_packet(AVCodecContext *avctx, void *data,
             s->num_saved_bits = 0;
             s->packet_loss = 0;
         }
-
     } else {
         int frame_size;
         s->buf_bit_size = (avpkt->size - s->next_packet_start) << 3;
-        bitstream_init(bc, avpkt->data, s->buf_bit_size);
-        bitstream_skip(bc, s->packet_offset);
-        if (s->len_prefix && remaining_bits(s, bc) > s->log2_frame_size &&
-            (frame_size = bitstream_peek(bc, s->log2_frame_size)) &&
-            frame_size <= remaining_bits(s, bc)) {
-            save_bits(s, bc, frame_size, 0);
-            s->packet_done = !decode_frame(s, data, got_frame_ptr);
+        init_get_bits(gb, avpkt->data, s->buf_bit_size);
+        skip_bits(gb, s->packet_offset);
+        if (s->len_prefix && remaining_bits(s, gb) > s->log2_frame_size &&
+            (frame_size = show_bits(gb, s->log2_frame_size)) &&
+            frame_size <= remaining_bits(s, gb)) {
+            save_bits(s, gb, frame_size, 0);
+            if (!s->packet_loss)
+                s->packet_done = !decode_frame(s, data, got_frame_ptr);
         } else if (!s->len_prefix
-                   && s->num_saved_bits > bitstream_tell(&s->bc)) {
+                   && s->num_saved_bits > get_bits_count(&s->gb)) {
             /** when the frames do not have a length prefix, we don't know
                 the compressed length of the individual frames
                 however, we know what part of a new packet belongs to the
@@ -1605,38 +1737,244 @@ static int decode_packet(AVCodecContext *avctx, void *data,
                 the "previous frame" data from the next packet so that
                 we get a buffer that only contains full frames */
             s->packet_done = !decode_frame(s, data, got_frame_ptr);
-        } else
+        } else {
             s->packet_done = 1;
+        }
+    }
+
+    if (remaining_bits(s, gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Overread %d\n", -remaining_bits(s, gb));
+        s->packet_loss = 1;
     }
 
     if (s->packet_done && !s->packet_loss &&
-        remaining_bits(s, bc) > 0) {
+        remaining_bits(s, gb) > 0) {
         /** save the rest of the data so that it can be decoded
             with the next packet */
-        save_bits(s, bc, remaining_bits(s, bc), 0);
+        save_bits(s, gb, remaining_bits(s, gb), 0);
     }
 
-    s->packet_offset = bitstream_tell(bc) & 7;
+    s->packet_offset = get_bits_count(gb) & 7;
     if (s->packet_loss)
         return AVERROR_INVALIDDATA;
 
-    return bitstream_tell(bc) >> 3;
+    return get_bits_count(gb) >> 3;
 }
 
 /**
- *@brief Clear decoder buffers (for seeking).
+ *@brief Decode a single WMA packet.
  *@param avctx codec context
+ *@param data the output buffer
+ *@param avpkt input packet
+ *@return number of bytes that were read from the input buffer
  */
-static void flush(AVCodecContext *avctx)
+static int wmapro_decode_packet(AVCodecContext *avctx, void *data,
+                                int *got_frame_ptr, AVPacket *avpkt)
 {
     WMAProDecodeCtx *s = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret;
+
+    /* get output buffer */
+    frame->nb_samples = s->samples_per_frame;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
+        s->packet_loss = 1;
+        return 0;
+    }
+
+    return decode_packet(avctx, s, data, got_frame_ptr, avpkt);
+}
+
+static int xma_decode_packet(AVCodecContext *avctx, void *data,
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    XMADecodeCtx *s = avctx->priv_data;
+    int got_stream_frame_ptr = 0;
+    AVFrame *frame = data;
+    int i, ret, offset = INT_MAX;
+
+    /* decode current stream packet */
+    ret = decode_packet(avctx, &s->xma[s->current_stream], s->frames[s->current_stream],
+                        &got_stream_frame_ptr, avpkt);
+
+    /* copy stream samples (1/2ch) to sample buffer (Nch) */
+    if (got_stream_frame_ptr) {
+        int start_ch = s->start_channel[s->current_stream];
+        memcpy(&s->samples[start_ch + 0][s->offset[s->current_stream] * 512],
+               s->frames[s->current_stream]->extended_data[0], 512 * 4);
+        if (s->xma[s->current_stream].nb_channels > 1)
+            memcpy(&s->samples[start_ch + 1][s->offset[s->current_stream] * 512],
+                   s->frames[s->current_stream]->extended_data[1], 512 * 4);
+        s->offset[s->current_stream]++;
+    } else if (ret < 0) {
+        memset(s->offset, 0, sizeof(s->offset));
+        s->current_stream = 0;
+        return ret;
+    }
+
+    /* find next XMA packet's owner stream, and update.
+     * XMA streams find their packets following packet_skips
+     * (at start there is one packet per stream, then interleave non-linearly). */
+    if (s->xma[s->current_stream].packet_done ||
+        s->xma[s->current_stream].packet_loss) {
+
+        /* select stream with 0 skip_packets (= uses next packet) */
+        if (s->xma[s->current_stream].skip_packets != 0) {
+            int min[2];
+
+            min[0] = s->xma[0].skip_packets;
+            min[1] = i = 0;
+
+            for (i = 1; i < s->num_streams; i++) {
+                if (s->xma[i].skip_packets < min[0]) {
+                    min[0] = s->xma[i].skip_packets;
+                    min[1] = i;
+                }
+            }
+
+            s->current_stream = min[1];
+        }
+
+        /* all other streams skip next packet */
+        for (i = 0; i < s->num_streams; i++) {
+            s->xma[i].skip_packets = FFMAX(0, s->xma[i].skip_packets - 1);
+        }
+
+        /* copy samples from buffer to output if possible */
+        for (i = 0; i < s->num_streams; i++) {
+            offset = FFMIN(offset, s->offset[i]);
+        }
+        if (offset > 0) {
+            int bret;
+
+            frame->nb_samples = 512 * offset;
+            if ((bret = ff_get_buffer(avctx, frame, 0)) < 0)
+                return bret;
+
+            /* copy samples buffer (Nch) to frame samples (Nch), move unconsumed samples */
+            for (i = 0; i < s->num_streams; i++) {
+                int start_ch = s->start_channel[i];
+                memcpy(frame->extended_data[start_ch + 0], s->samples[start_ch + 0], frame->nb_samples * 4);
+                if (s->xma[i].nb_channels > 1)
+                    memcpy(frame->extended_data[start_ch + 1], s->samples[start_ch + 1], frame->nb_samples * 4);
+
+                s->offset[i] -= offset;
+                if (s->offset[i]) {
+                    memmove(s->samples[start_ch + 0], s->samples[start_ch + 0] + frame->nb_samples, s->offset[i] * 4 * 512);
+                    if (s->xma[i].nb_channels > 1)
+                        memmove(s->samples[start_ch + 1], s->samples[start_ch + 1] + frame->nb_samples, s->offset[i] * 4 * 512);
+                }
+            }
+
+            *got_frame_ptr = 1;
+        }
+    }
+
+    return ret;
+}
+
+static av_cold int xma_decode_init(AVCodecContext *avctx)
+{
+    XMADecodeCtx *s = avctx->priv_data;
+    int i, ret, start_channels = 0;
+
+    if (avctx->channels <= 0 || avctx->extradata_size == 0)
+        return AVERROR_INVALIDDATA;
+
+    /* get stream config */
+    if (avctx->codec_id == AV_CODEC_ID_XMA2 && avctx->extradata_size == 34) { /* XMA2WAVEFORMATEX */
+        s->num_streams = (avctx->channels + 1) / 2;
+    } else if (avctx->codec_id == AV_CODEC_ID_XMA2 && avctx->extradata_size >= 2) { /* XMA2WAVEFORMAT */
+        s->num_streams = avctx->extradata[1];
+        if (avctx->extradata_size != (32 + ((avctx->extradata[0]==3)?0:8) + 4*s->num_streams)) {
+            av_log(avctx, AV_LOG_ERROR, "Incorrect XMA2 extradata size\n");
+            return AVERROR(EINVAL);
+        }
+    } else if (avctx->codec_id == AV_CODEC_ID_XMA1 && avctx->extradata_size >= 4) { /* XMAWAVEFORMAT */
+        s->num_streams = avctx->extradata[4];
+        if (avctx->extradata_size != (8 + 20*s->num_streams)) {
+            av_log(avctx, AV_LOG_ERROR, "Incorrect XMA1 extradata size\n");
+            return AVERROR(EINVAL);
+        }
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect XMA config\n");
+        return AVERROR(EINVAL);
+    }
+
+    /* encoder supports up to 64 streams / 64*2 channels (would have to alloc arrays) */
+    if (avctx->channels > XMA_MAX_CHANNELS || s->num_streams > XMA_MAX_STREAMS) {
+        avpriv_request_sample(avctx, "More than %d channels in %d streams", XMA_MAX_CHANNELS, s->num_streams);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    /* init all streams (several streams of 1/2ch make Nch files) */
+    for (i = 0; i < s->num_streams; i++) {
+        ret = decode_init(&s->xma[i], avctx, i);
+        if (ret < 0)
+            return ret;
+        s->frames[i] = av_frame_alloc();
+        if (!s->frames[i])
+            return AVERROR(ENOMEM);
+        s->frames[i]->nb_samples = 512;
+        if ((ret = ff_get_buffer(avctx, s->frames[i], 0)) < 0) {
+            return AVERROR(ENOMEM);
+        }
+
+        s->start_channel[i] = start_channels;
+        start_channels += s->xma[i].nb_channels;
+    }
+
+    return ret;
+}
+
+static av_cold int xma_decode_end(AVCodecContext *avctx)
+{
+    XMADecodeCtx *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->num_streams; i++) {
+        decode_end(&s->xma[i]);
+        av_frame_free(&s->frames[i]);
+    }
+
+    return 0;
+}
+
+static void flush(WMAProDecodeCtx *s)
+{
     int i;
     /** reset output buffer as a part of it is used during the windowing of a
         new frame */
-    for (i = 0; i < avctx->channels; i++)
+    for (i = 0; i < s->nb_channels; i++)
         memset(s->channel[i].out, 0, s->samples_per_frame *
                sizeof(*s->channel[i].out));
     s->packet_loss = 1;
+    s->skip_packets = 0;
+    s->eof_done = 0;
+}
+
+
+/**
+ *@brief Clear decoder buffers (for seeking).
+ *@param avctx codec context
+ */
+static void wmapro_flush(AVCodecContext *avctx)
+{
+    WMAProDecodeCtx *s = avctx->priv_data;
+
+    flush(s);
+}
+
+static void xma_flush(AVCodecContext *avctx)
+{
+    XMADecodeCtx *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->num_streams; i++)
+        flush(&s->xma[i]);
+
+    memset(s->offset, 0, sizeof(s->offset));
+    s->current_stream = 0;
 }
 
 
@@ -1649,11 +1987,40 @@ AVCodec ff_wmapro_decoder = {
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_WMAPRO,
     .priv_data_size = sizeof(WMAProDecodeCtx),
-    .init           = decode_init,
-    .close          = decode_end,
-    .decode         = decode_packet,
+    .init           = wmapro_decode_init,
+    .close          = wmapro_decode_end,
+    .decode         = wmapro_decode_packet,
     .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
-    .flush          = flush,
+    .flush          = wmapro_flush,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+};
+
+AVCodec ff_xma1_decoder = {
+    .name           = "xma1",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xbox Media Audio 1"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_XMA1,
+    .priv_data_size = sizeof(XMADecodeCtx),
+    .init           = xma_decode_init,
+    .close          = xma_decode_end,
+    .decode         = xma_decode_packet,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+};
+
+AVCodec ff_xma2_decoder = {
+    .name           = "xma2",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xbox Media Audio 2"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_XMA2,
+    .priv_data_size = sizeof(XMADecodeCtx),
+    .init           = xma_decode_init,
+    .close          = xma_decode_end,
+    .decode         = xma_decode_packet,
+    .flush          = xma_flush,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index 8c7ec7b..68bb659 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -2,20 +2,20 @@
  * Windows Media Audio Voice decoder.
  * Copyright (c) 2009 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,10 +30,10 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/mem.h"
-
+#include "libavutil/thread.h"
 #include "avcodec.h"
-#include "bitstream.h"
 #include "internal.h"
+#include "get_bits.h"
 #include "put_bits.h"
 #include "wmavoice_data.h"
 #include "celp_filters.h"
@@ -43,7 +43,6 @@
 #include "dct.h"
 #include "rdft.h"
 #include "sinewin.h"
-#include "vlc.h"
 
 #define MAX_BLOCKS           8   ///< maximum number of blocks per frame
 #define MAX_LSPS             16  ///< maximum filter order
@@ -106,26 +105,24 @@ static const struct frame_type_desc {
     uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
                           ///< (rather than just one single pulse)
                           ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
-    uint16_t frame_size;  ///< the amount of bits that make up the block
-                          ///< data (per frame)
 } frame_descs[17] = {
-    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
-    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
-    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
-    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
-    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
-    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
-    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
-    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
-    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
-    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
-    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
-    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
-    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
-    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
-    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
-    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
-    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
+    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
+    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
+    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
+    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
+    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
+    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
+    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
+    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
+    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
+    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
+    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
+    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
+    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
+    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
+    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
+    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
+    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
 };
 
 /**
@@ -136,7 +133,7 @@ typedef struct WMAVoiceContext {
      * @name Global values specified in the stream header / extradata or used all over.
      * @{
      */
-    BitstreamContext bc;          ///< packet bitreader. During decoder init,
+    GetBitContext gb;             ///< packet bitreader. During decoder init,
                                   ///< it contains the extradata from the
                                   ///< demuxer. During decoding, it contains
                                   ///< packet data.
@@ -162,10 +159,6 @@ typedef struct WMAVoiceContext {
     int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
     int lsp_def_mode;             ///< defines different sets of LSP defaults
                                   ///< [0, 1]
-    int frame_lsp_bitsize;        ///< size (in bits) of LSPs, when encoded
-                                  ///< per-frame (independent coding)
-    int sframe_lsp_bitsize;       ///< size (in bits) of LSPs, when encoded
-                                  ///< per superframe (residual coding)
 
     int min_pitch_val;            ///< base value for pitch parsing code
     int max_pitch_val;            ///< max value + 1 for pitch parsing
@@ -253,6 +246,7 @@ typedef struct WMAVoiceContext {
 
     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
                                   ///< only used for comfort noise in #pRNG()
+    int nb_superframes;           ///< number of superframes in current packet
     float gain_pred_err[6];       ///< cache for gain prediction
     float excitation_history[MAX_SIGNAL_HISTORY];
                                   ///< cache of the signal of previous
@@ -296,20 +290,20 @@ typedef struct WMAVoiceContext {
 
 /**
  * Set up the variable bit mode (VBM) tree from container extradata.
- * @param bc bit I/O context.
- *           The bit context (s->bc) should be loaded with byte 23-46 of the
+ * @param gb bit I/O context.
+ *           The bit context (s->gb) should be loaded with byte 23-46 of the
  *           container extradata (i.e. the ones containing the VBM tree).
  * @param vbm_tree pointer to array to which the decoded VBM tree will be
  *                 written.
  * @return 0 on success, <0 on error.
  */
-static av_cold int decode_vbmtree(BitstreamContext *bc, int8_t vbm_tree[25])
+static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
 {
     int cntr[8] = { 0 }, n, res;
 
     memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
     for (n = 0; n < 17; n++) {
-        res = bitstream_read(bc, 3);
+        res = get_bits(gb, 3);
         if (cntr[res] > 3) // should be >= 3 + (res == 7))
             return -1;
         vbm_tree[res * 3 + cntr[res]++] = n;
@@ -317,7 +311,7 @@ static av_cold int decode_vbmtree(BitstreamContext *bc, int8_t vbm_tree[25])
     return 0;
 }
 
-static av_cold void wmavoice_init_static_data(AVCodec *codec)
+static av_cold void wmavoice_init_static_data(void)
 {
     static const uint8_t bits[] = {
          2,  2,  2,  4,  4,  4,
@@ -339,14 +333,45 @@ static av_cold void wmavoice_init_static_data(AVCodec *codec)
                     bits, 1, 1, codes, 2, 2, 132);
 }
 
+static av_cold void wmavoice_flush(AVCodecContext *ctx)
+{
+    WMAVoiceContext *s = ctx->priv_data;
+    int n;
+
+    s->postfilter_agc    = 0;
+    s->sframe_cache_size = 0;
+    s->skip_bits_next    = 0;
+    for (n = 0; n < s->lsps; n++)
+        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
+    memset(s->excitation_history, 0,
+           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
+    memset(s->synth_history,      0,
+           sizeof(*s->synth_history)      * MAX_LSPS);
+    memset(s->gain_pred_err,      0,
+           sizeof(s->gain_pred_err));
+
+    if (s->do_apf) {
+        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
+               sizeof(*s->synth_filter_out_buf) * s->lsps);
+        memset(s->dcf_mem,              0,
+               sizeof(*s->dcf_mem)              * 2);
+        memset(s->zero_exc_pf,          0,
+               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
+        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
+    }
+}
+
 /**
  * Set up decoder with parameters from demuxer (extradata etc.).
  */
 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
 {
+    static AVOnce init_static_once = AV_ONCE_INIT;
     int n, flags, pitch_range, lsp16_flag;
     WMAVoiceContext *s = ctx->priv_data;
 
+    ff_thread_once(&init_static_once, wmavoice_init_static_data);
+
     /**
      * Extradata layout:
      * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
@@ -361,6 +386,11 @@ static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
                ctx->extradata_size);
         return AVERROR_INVALIDDATA;
     }
+    if (ctx->block_align <= 0) {
+        av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
+        return AVERROR_INVALIDDATA;
+    }
+
     flags                = AV_RL32(ctx->extradata + 18);
     s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
     s->do_apf            =    flags & 0x1;
@@ -391,18 +421,14 @@ static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
     lsp16_flag           =    flags & 0x1000;
     if (lsp16_flag) {
         s->lsps               = 16;
-        s->frame_lsp_bitsize  = 34;
-        s->sframe_lsp_bitsize = 60;
     } else {
         s->lsps               = 10;
-        s->frame_lsp_bitsize  = 24;
-        s->sframe_lsp_bitsize = 48;
     }
     for (n = 0; n < s->lsps; n++)
         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 
-    bitstream_init8(&s->bc, ctx->extradata + 22, ctx->extradata_size - 22);
-    if (decode_vbmtree(&s->bc, s->vbm_tree) < 0) {
+    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
+    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
         av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
         return AVERROR_INVALIDDATA;
     }
@@ -485,7 +511,8 @@ static void adaptive_gain_control(float *out, const float *in,
         speech_energy     += fabsf(speech_synth[i]);
         postfilter_energy += fabsf(in[i]);
     }
-    gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
+    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
+                        (1.0 - alpha) * speech_energy / postfilter_energy;
 
     for (i = 0; i < size; i++) {
         mem = alpha * mem + gain_scale_factor;
@@ -520,7 +547,7 @@ static int kalman_smoothen(WMAVoiceContext *s, int pitch,
     float optimal_gain = 0, dot;
     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
-                *best_hist_ptr;
+                *best_hist_ptr = NULL;
 
     /* find best fitting point in history */
     do {
@@ -780,7 +807,7 @@ static void postfilter(WMAVoiceContext *s, const float *synth,
           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
           *synth_filter_in = zero_exc_pf;
 
-    assert(size <= MAX_FRAMESIZE / 2);
+    av_assert0(size <= MAX_FRAMESIZE / 2);
 
     /* generate excitation from input signal */
     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
@@ -849,7 +876,6 @@ static void dequant_lsps(double *lsps, int num,
 /**
  * @name LSP dequantization routines
  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
- * @note we assume enough bits are available, caller should check.
  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
  * @{
@@ -857,7 +883,7 @@ static void dequant_lsps(double *lsps, int num,
 /**
  * Parse 10 independently-coded LSPs.
  */
-static void dequant_lsp10i(BitstreamContext *bc, double *lsps)
+static void dequant_lsp10i(GetBitContext *gb, double *lsps)
 {
     static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
     static const double mul_lsf[4] = {
@@ -870,10 +896,10 @@ static void dequant_lsp10i(BitstreamContext *bc, double *lsps)
     };
     uint16_t v[4];
 
-    v[0] = bitstream_read(bc, 8);
-    v[1] = bitstream_read(bc, 6);
-    v[2] = bitstream_read(bc, 5);
-    v[3] = bitstream_read(bc, 5);
+    v[0] = get_bits(gb, 8);
+    v[1] = get_bits(gb, 6);
+    v[2] = get_bits(gb, 5);
+    v[3] = get_bits(gb, 5);
 
     dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
                  mul_lsf, base_lsf);
@@ -883,7 +909,7 @@ static void dequant_lsp10i(BitstreamContext *bc, double *lsps)
  * Parse 10 independently-coded LSPs, and then derive the tables to
  * generate LSPs for the other frames from them (residual coding).
  */
-static void dequant_lsp10r(BitstreamContext *bc,
+static void dequant_lsp10r(GetBitContext *gb,
                            double *i_lsps, const double *old,
                            double *a1, double *a2, int q_mode)
 {
@@ -899,12 +925,12 @@ static void dequant_lsp10r(BitstreamContext *bc,
     uint16_t interpol, v[3];
     int n;
 
-    dequant_lsp10i(bc, i_lsps);
+    dequant_lsp10i(gb, i_lsps);
 
-    interpol = bitstream_read(bc, 5);
-    v[0]     = bitstream_read(bc, 7);
-    v[1]     = bitstream_read(bc, 6);
-    v[2]     = bitstream_read(bc, 6);
+    interpol = get_bits(gb, 5);
+    v[0]     = get_bits(gb, 7);
+    v[1]     = get_bits(gb, 6);
+    v[2]     = get_bits(gb, 6);
 
     for (n = 0; n < 10; n++) {
         double delta = old[n] - i_lsps[n];
@@ -919,7 +945,7 @@ static void dequant_lsp10r(BitstreamContext *bc,
 /**
  * Parse 16 independently-coded LSPs.
  */
-static void dequant_lsp16i(BitstreamContext *bc, double *lsps)
+static void dequant_lsp16i(GetBitContext *gb, double *lsps)
 {
     static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
     static const double mul_lsf[5] = {
@@ -934,11 +960,11 @@ static void dequant_lsp16i(BitstreamContext *bc, double *lsps)
     };
     uint16_t v[5];
 
-    v[0] = bitstream_read(bc, 8);
-    v[1] = bitstream_read(bc, 6);
-    v[2] = bitstream_read(bc, 7);
-    v[3] = bitstream_read(bc, 6);
-    v[4] = bitstream_read(bc, 7);
+    v[0] = get_bits(gb, 8);
+    v[1] = get_bits(gb, 6);
+    v[2] = get_bits(gb, 7);
+    v[3] = get_bits(gb, 6);
+    v[4] = get_bits(gb, 7);
 
     dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
                  wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
@@ -952,7 +978,7 @@ static void dequant_lsp16i(BitstreamContext *bc, double *lsps)
  * Parse 16 independently-coded LSPs, and then derive the tables to
  * generate LSPs for the other frames from them (residual coding).
  */
-static void dequant_lsp16r(BitstreamContext *bc,
+static void dequant_lsp16r(GetBitContext *gb,
                            double *i_lsps, const double *old,
                            double *a1, double *a2, int q_mode)
 {
@@ -968,12 +994,12 @@ static void dequant_lsp16r(BitstreamContext *bc,
     uint16_t interpol, v[3];
     int n;
 
-    dequant_lsp16i(bc, i_lsps);
+    dequant_lsp16i(gb, i_lsps);
 
-    interpol = bitstream_read(bc, 5);
-    v[0]     = bitstream_read(bc, 7);
-    v[1]     = bitstream_read(bc, 7);
-    v[2]     = bitstream_read(bc, 7);
+    interpol = get_bits(gb, 5);
+    v[0]     = get_bits(gb, 7);
+    v[1]     = get_bits(gb, 7);
+    v[2]     = get_bits(gb, 7);
 
     for (n = 0; n < 16; n++) {
         double delta = old[n] - i_lsps[n];
@@ -999,10 +1025,10 @@ static void dequant_lsp16r(BitstreamContext *bc,
  * Parse the offset of the first pitch-adaptive window pulses, and
  * the distribution of pulses between the two blocks in this frame.
  * @param s WMA Voice decoding context private data
- * @param bc bit I/O context
+ * @param gb bit I/O context
  * @param pitch pitch for each block in this frame
  */
-static void aw_parse_coords(WMAVoiceContext *s, BitstreamContext *bc,
+static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
                             const int *pitch)
 {
     static const int16_t start_offset[94] = {
@@ -1019,9 +1045,9 @@ static void aw_parse_coords(WMAVoiceContext *s, BitstreamContext *bc,
 
     /* position of pulse */
     s->aw_idx_is_ext = 0;
-    if ((bits = bitstream_read(bc, 6)) >= 54) {
+    if ((bits = get_bits(gb, 6)) >= 54) {
         s->aw_idx_is_ext = 1;
-        bits += (bits - 54) * 3 + bitstream_read(bc, 2);
+        bits += (bits - 54) * 3 + get_bits(gb, 2);
     }
 
     /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
@@ -1049,12 +1075,12 @@ static void aw_parse_coords(WMAVoiceContext *s, BitstreamContext *bc,
 /**
  * Apply second set of pitch-adaptive window pulses.
  * @param s WMA Voice decoding context private data
- * @param bc bit I/O context
+ * @param gb bit I/O context
  * @param block_idx block index in frame [0, 1]
  * @param fcb structure containing fixed codebook vector info
  * @return -1 on error, 0 otherwise
  */
-static int aw_pulse_set2(WMAVoiceContext *s, BitstreamContext *bc,
+static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
                          int block_idx, AMRFixed *fcb)
 {
     uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
@@ -1108,7 +1134,7 @@ static int aw_pulse_set2(WMAVoiceContext *s, BitstreamContext *bc,
         }
 
     /* find the 'aidx'th offset that is not excluded */
-    aidx = bitstream_read(bc, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
+    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
     for (n = 0; n <= aidx; pulse_start++) {
         for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
         if (idx >= MAX_FRAMESIZE / 2) { // find from zero
@@ -1128,7 +1154,7 @@ static int aw_pulse_set2(WMAVoiceContext *s, BitstreamContext *bc,
     }
 
     fcb->x[fcb->n] = start_off;
-    fcb->y[fcb->n] = bitstream_read_bit(bc) ? -1.0 : 1.0;
+    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
     fcb->n++;
 
     /* set offset for next block, relative to start of that block */
@@ -1140,14 +1166,14 @@ static int aw_pulse_set2(WMAVoiceContext *s, BitstreamContext *bc,
 /**
  * Apply first set of pitch-adaptive window pulses.
  * @param s WMA Voice decoding context private data
- * @param bc bit I/O context
+ * @param gb bit I/O context
  * @param block_idx block index in frame [0, 1]
  * @param fcb storage location for fixed codebook pulse info
  */
-static void aw_pulse_set1(WMAVoiceContext *s, BitstreamContext *bc,
+static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
                           int block_idx, AMRFixed *fcb)
 {
-    int val = bitstream_read(bc, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
+    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
     float v;
 
     if (s->aw_n_pulses[block_idx] > 0) {
@@ -1241,7 +1267,7 @@ static int pRNG(int frame_cntr, int block_num, int block_size)
  * Parse hardcoded signal for a single block.
  * @note see #synth_block().
  */
-static void synth_block_hardcoded(WMAVoiceContext *s, BitstreamContext *bc,
+static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
                                  int block_idx, int size,
                                  const struct frame_type_desc *frame_desc,
                                  float *excitation)
@@ -1249,15 +1275,15 @@ static void synth_block_hardcoded(WMAVoiceContext *s, BitstreamContext *bc,
     float gain;
     int n, r_idx;
 
-    assert(size <= MAX_FRAMESIZE);
+    av_assert0(size <= MAX_FRAMESIZE);
 
     /* Set the offset from which we start reading wmavoice_std_codebook */
     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
         r_idx = pRNG(s->frame_cntr, block_idx, size);
         gain  = s->silence_gain;
     } else /* FCB_TYPE_HARDCODED */ {
-        r_idx = bitstream_read(bc, 8);
-        gain  = wmavoice_gain_universal[bitstream_read(bc, 6)];
+        r_idx = get_bits(gb, 8);
+        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
     }
 
     /* Clear gain prediction parameters */
@@ -1272,7 +1298,7 @@ static void synth_block_hardcoded(WMAVoiceContext *s, BitstreamContext *bc,
  * Parse FCB/ACB signal for a single block.
  * @note see #synth_block().
  */
-static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
+static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
                                 int block_idx, int size,
                                 int block_pitch_sh2,
                                 const struct frame_type_desc *frame_desc,
@@ -1285,7 +1311,7 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
     int n, idx, gain_weight;
     AMRFixed fcb;
 
-    assert(size <= MAX_FRAMESIZE / 2);
+    av_assert0(size <= MAX_FRAMESIZE / 2);
     memset(pulses, 0, sizeof(*pulses) * size);
 
     fcb.pitch_lag      = block_pitch_sh2 >> 2;
@@ -1296,8 +1322,8 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
     /* For the other frame types, this is where we apply the innovation
      * (fixed) codebook pulses of the speech signal. */
     if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
-        aw_pulse_set1(s, bc, block_idx, &fcb);
-        if (aw_pulse_set2(s, bc, block_idx, &fcb)) {
+        aw_pulse_set1(s, gb, block_idx, &fcb);
+        if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
             /* Conceal the block with silence and return.
              * Skip the correct amount of bits to read the next
              * block from the correct offset. */
@@ -1306,7 +1332,7 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
             for (n = 0; n < size; n++)
                 excitation[n] =
                     wmavoice_std_codebook[r_idx + n] * s->silence_gain;
-            bitstream_skip(bc, 7 + 1);
+            skip_bits(gb, 7 + 1);
             return;
         }
     } else /* FCB_TYPE_EXC_PULSES */ {
@@ -1319,12 +1345,12 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
             float sign;
             int pos1, pos2;
 
-            sign           = bitstream_read_bit(bc) ? 1.0 : -1.0;
-            pos1           = bitstream_read(bc, offset_nbits);
+            sign           = get_bits1(gb) ? 1.0 : -1.0;
+            pos1           = get_bits(gb, offset_nbits);
             fcb.x[fcb.n]   = n + 5 * pos1;
             fcb.y[fcb.n++] = sign;
             if (n < frame_desc->dbl_pulses) {
-                pos2           = bitstream_read(bc, offset_nbits);
+                pos2           = get_bits(gb, offset_nbits);
                 fcb.x[fcb.n]   = n + 5 * pos2;
                 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
             }
@@ -1334,7 +1360,7 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
 
     /* Calculate gain for adaptive & fixed codebook signal.
      * see ff_amr_set_fixed_gain(). */
-    idx = bitstream_read(bc, 7);
+    idx = get_bits(gb, 7);
     fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
                                                  gain_coeff, 6) -
                     5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
@@ -1393,10 +1419,9 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
 
 /**
  * Parse data in a single block.
- * @note we assume enough bits are available, caller should check.
  *
  * @param s WMA Voice decoding context private data
- * @param bc bit I/O context
+ * @param gb bit I/O context
  * @param block_idx index of the to-be-read block
  * @param size amount of samples to be read in this block
  * @param block_pitch_sh2 pitch for this block << 2
@@ -1407,7 +1432,7 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, BitstreamContext *bc,
  * @param synth target memory for the speech synthesis filter output
  * @return 0 on success, <0 on error.
  */
-static void synth_block(WMAVoiceContext *s, BitstreamContext *bc,
+static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
                         int block_idx, int size,
                         int block_pitch_sh2,
                         const double *lsps, const double *prev_lsps,
@@ -1420,9 +1445,9 @@ static void synth_block(WMAVoiceContext *s, BitstreamContext *bc,
     int n;
 
     if (frame_desc->acb_type == ACB_TYPE_NONE)
-        synth_block_hardcoded(s, bc, block_idx, size, frame_desc, excitation);
+        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
     else
-        synth_block_fcb_acb(s, bc, block_idx, size, block_pitch_sh2,
+        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
                             frame_desc, excitation);
 
     /* convert interpolated LSPs to LPCs */
@@ -1437,10 +1462,9 @@ static void synth_block(WMAVoiceContext *s, BitstreamContext *bc,
 
 /**
  * Synthesize output samples for a single frame.
- * @note we assume enough bits are available, caller should check.
  *
  * @param ctx WMA Voice decoder context
- * @param bc bit I/O context (s->bc or one for cross-packet superframes)
+ * @param gb bit I/O context (s->gb or one for cross-packet superframes)
  * @param frame_idx Frame number within superframe [0-2]
  * @param samples pointer to output sample buffer, has space for at least 160
  *                samples
@@ -1450,17 +1474,17 @@ static void synth_block(WMAVoiceContext *s, BitstreamContext *bc,
  * @param synth target buffer for synthesized speech data
  * @return 0 on success, <0 on error.
  */
-static int synth_frame(AVCodecContext *ctx, BitstreamContext *bc,
-                       int frame_idx, float *samples,
+static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
+                       float *samples,
                        const double *lsps, const double *prev_lsps,
                        float *excitation, float *synth)
 {
     WMAVoiceContext *s = ctx->priv_data;
-    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
-    int pitch[MAX_BLOCKS], last_block_pitch;
+    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
+    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
 
     /* Parse frame type ("frame header"), see frame_descs */
-    int bd_idx = s->vbm_tree[bitstream_read_vlc(bc, frame_type_vlc.table, 6, 3)], block_nsamples;
+    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
 
     if (bd_idx < 0) {
         av_log(ctx, AV_LOG_ERROR,
@@ -1478,7 +1502,7 @@ static int synth_frame(AVCodecContext *ctx, BitstreamContext *bc,
          * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
         n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
         log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
-        cur_pitch_val    = s->min_pitch_val + bitstream_read(bc, s->pitch_nbits);
+        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
         cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
         if (s->last_acb_type == ACB_TYPE_NONE ||
             20 * abs(cur_pitch_val - s->last_pitch_val) >
@@ -1502,10 +1526,10 @@ static int synth_frame(AVCodecContext *ctx, BitstreamContext *bc,
     /* Global gain (if silence) and pitch-adaptive window coordinates */
     switch (frame_descs[bd_idx].fcb_type) {
     case FCB_TYPE_SILENCE:
-        s->silence_gain = wmavoice_gain_silence[bitstream_read(bc, 8)];
+        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
         break;
     case FCB_TYPE_AW_PULSES:
-        aw_parse_coords(s, bc, pitch);
+        aw_parse_coords(s, gb, pitch);
         break;
     }
 
@@ -1526,10 +1550,10 @@ static int synth_frame(AVCodecContext *ctx, BitstreamContext *bc,
                 t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
 
             if (n == 0) {
-                block_pitch = bitstream_read(bc, s->block_pitch_nbits);
+                block_pitch = get_bits(gb, s->block_pitch_nbits);
             } else
                 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
-                              bitstream_read(bc, s->block_delta_pitch_nbits);
+                                 get_bits(gb, s->block_delta_pitch_nbits);
             /* Convert last_ so that any next delta is within _range */
             last_block_pitch = av_clip(block_pitch,
                                        s->block_delta_pitch_hrange,
@@ -1567,7 +1591,7 @@ static int synth_frame(AVCodecContext *ctx, BitstreamContext *bc,
             break;
         }
 
-        synth_block(s, bc, n, block_nsamples, bl_pitch_sh2,
+        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
                     lsps, prev_lsps, &frame_descs[bd_idx],
                     &excitation[n * block_nsamples],
                     &synth[n * block_nsamples]);
@@ -1656,85 +1680,9 @@ static void stabilize_lsps(double *lsps, int num)
 }
 
 /**
- * Test if there's enough bits to read 1 superframe.
- *
- * @param orig_bc bit I/O context used for reading. This function
- *                does not modify the state of the bitreader; it
- *                only uses it to copy the current stream position
- * @param s WMA Voice decoding context private data
- * @return < 0 on error, 1 on not enough bits or 0 if OK.
- */
-static int check_bits_for_superframe(BitstreamContext *orig_bc,
-                                     WMAVoiceContext *s)
-{
-    BitstreamContext s_bc, *bc = &s_bc;
-    int n, need_bits, bd_idx;
-    const struct frame_type_desc *frame_desc;
-
-    /* initialize a copy */
-    *bc = *orig_bc;
-
-    /* superframe header */
-    if (bitstream_bits_left(bc) < 14)
-        return 1;
-    if (!bitstream_read_bit(bc))
-        return AVERROR(ENOSYS);           // WMAPro-in-WMAVoice superframe
-    if (bitstream_read_bit(bc)) bitstream_skip(bc, 12); // number of  samples in superframe
-    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
-        if (bitstream_bits_left(bc) < s->sframe_lsp_bitsize)
-            return 1;
-        bitstream_skip(bc, s->sframe_lsp_bitsize);
-    }
-
-    /* frames */
-    for (n = 0; n < MAX_FRAMES; n++) {
-        int aw_idx_is_ext = 0;
-
-        if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
-           if (bitstream_bits_left(bc) < s->frame_lsp_bitsize)
-               return 1;
-           bitstream_skip(bc, s->frame_lsp_bitsize);
-        }
-        bd_idx = s->vbm_tree[bitstream_read_vlc(bc, frame_type_vlc.table, 6, 3)];
-        if (bd_idx < 0)
-            return AVERROR_INVALIDDATA; // invalid frame type VLC code
-        frame_desc = &frame_descs[bd_idx];
-        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
-            if (bitstream_bits_left(bc) < s->pitch_nbits)
-                return 1;
-            bitstream_skip(bc, s->pitch_nbits);
-        }
-        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
-            bitstream_skip(bc, 8);
-        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
-            int tmp = bitstream_read(bc, 6);
-            if (tmp >= 0x36) {
-                bitstream_skip(bc, 2);
-                aw_idx_is_ext = 1;
-            }
-        }
-
-        /* blocks */
-        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
-            need_bits = s->block_pitch_nbits +
-                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
-        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
-            need_bits = 2 * !aw_idx_is_ext;
-        } else
-            need_bits = 0;
-        need_bits += frame_desc->frame_size;
-        if (bitstream_bits_left(bc) < need_bits)
-            return 1;
-        bitstream_skip(bc, need_bits);
-    }
-
-    return 0;
-}
-
-/**
  * Synthesize output samples for a single superframe. If we have any data
  * cached in s->sframe_cache, that will be used instead of whatever is loaded
- * in s->bc.
+ * in s->gb.
  *
  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
  * to give a total of 480 samples per frame. See #synth_frame() for frame
@@ -1752,8 +1700,8 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
                             int *got_frame_ptr)
 {
     WMAVoiceContext *s = ctx->priv_data;
-    BitstreamContext *bc = &s->bc, s_bc;
-    int n, res, n_samples = 480;
+    GetBitContext *gb = &s->gb, s_gb;
+    int n, res, n_samples = MAX_SFRAMESIZE;
     double lsps[MAX_FRAMES][MAX_LSPS];
     const double *mean_lsf = s->lsps == 16 ?
         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
@@ -1767,35 +1715,30 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
            s->history_nsamples * sizeof(*excitation));
 
     if (s->sframe_cache_size > 0) {
-        bc = &s_bc;
-        bitstream_init(bc, s->sframe_cache, s->sframe_cache_size);
+        gb = &s_gb;
+        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
         s->sframe_cache_size = 0;
     }
 
-    if ((res = check_bits_for_superframe(bc, s)) == 1) {
-        *got_frame_ptr = 0;
-        return 1;
-    } else if (res < 0)
-        return res;
-
     /* First bit is speech/music bit, it differentiates between WMAVoice
      * speech samples (the actual codec) and WMAVoice music samples, which
      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
      * the wild yet. */
-    if (!bitstream_read_bit(bc)) {
+    if (!get_bits1(gb)) {
         avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
         return AVERROR_PATCHWELCOME;
     }
 
     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
-    if (bitstream_read_bit(bc)) {
-        if ((n_samples = bitstream_read(bc, 12)) > 480) {
+    if (get_bits1(gb)) {
+        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
             av_log(ctx, AV_LOG_ERROR,
-                   "Superframe encodes >480 samples (%d), not allowed\n",
-                   n_samples);
+                   "Superframe encodes > %d samples (%d), not allowed\n",
+                   MAX_SFRAMESIZE, n_samples);
             return AVERROR_INVALIDDATA;
         }
     }
+
     /* Parse LSPs, if global for the superframe (can also be per-frame). */
     if (s->has_residual_lsps) {
         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
@@ -1804,9 +1747,9 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
             prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
 
         if (s->lsps == 10) {
-            dequant_lsp10r(bc, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
+            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
         } else /* s->lsps == 16 */
-            dequant_lsp16r(bc, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
+            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
 
         for (n = 0; n < s->lsps; n++) {
             lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
@@ -1817,12 +1760,14 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
             stabilize_lsps(lsps[n], s->lsps);
     }
 
+    /* synth_superframe can run multiple times per packet
+     * free potential previous frame */
+    av_frame_unref(frame);
+
     /* get output buffer */
-    frame->nb_samples = 480;
-    if ((res = ff_get_buffer(ctx, frame, 0)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = MAX_SFRAMESIZE;
+    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
         return res;
-    }
     frame->nb_samples = n_samples;
     samples = (float *)frame->data[0];
 
@@ -1832,16 +1777,16 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
             int m;
 
             if (s->lsps == 10) {
-                dequant_lsp10i(bc, lsps[n]);
+                dequant_lsp10i(gb, lsps[n]);
             } else /* s->lsps == 16 */
-                dequant_lsp16i(bc, lsps[n]);
+                dequant_lsp16i(gb, lsps[n]);
 
             for (m = 0; m < s->lsps; m++)
                 lsps[n][m] += mean_lsf[m];
             stabilize_lsps(lsps[n], s->lsps);
         }
 
-        if ((res = synth_frame(ctx, bc, n,
+        if ((res = synth_frame(ctx, gb, n,
                                &samples[n * MAX_FRAMESIZE],
                                lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
                                &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
@@ -1854,9 +1799,14 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
     /* Statistics? FIXME - we don't check for length, a slight overrun
      * will be caught by internal buffer padding, and anything else
      * will be skipped, not read. */
-    if (bitstream_read_bit(bc)) {
-        res = bitstream_read(bc, 4);
-        bitstream_skip(bc, 10 * (res + 1));
+    if (get_bits1(gb)) {
+        res = get_bits(gb, 4);
+        skip_bits(gb, 10 * (res + 1));
+    }
+
+    if (get_bits_left(gb) < 0) {
+        wmavoice_flush(ctx);
+        return AVERROR_INVALIDDATA;
     }
 
     *got_frame_ptr = 1;
@@ -1880,35 +1830,32 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
  * decoder).
  *
  * @param s WMA Voice decoding context private data
- * @return 1 if not enough bits were available, or 0 on success.
+ * @return <0 on error, nb_superframes on success.
  */
 static int parse_packet_header(WMAVoiceContext *s)
 {
-    BitstreamContext *bc = &s->bc;
-    unsigned int res;
+    GetBitContext *gb = &s->gb;
+    unsigned int res, n_superframes = 0;
 
-    if (bitstream_bits_left(bc) < 11)
-        return 1;
-    bitstream_skip(bc, 4);      // packet sequence number
-    s->has_residual_lsps = bitstream_read_bit(bc);
+    skip_bits(gb, 4);          // packet sequence number
+    s->has_residual_lsps = get_bits1(gb);
     do {
-        res = bitstream_read(bc, 6); // number of superframes per packet
-                                     // (minus first one if there is spillover)
-        if (bitstream_bits_left(bc) < 6 * (res == 0x3F) + s->spillover_bitsize)
-            return 1;
+        res = get_bits(gb, 6); // number of superframes per packet
+                               // (minus first one if there is spillover)
+        n_superframes += res;
     } while (res == 0x3F);
-    s->spillover_nbits = bitstream_read(bc, s->spillover_bitsize);
+    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
 
-    return 0;
+    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
 }
 
 /**
- * Copy (unaligned) bits from bc/data/size to pb.
+ * Copy (unaligned) bits from gb/data/size to pb.
  *
  * @param pb target buffer to copy bits into
  * @param data source buffer to copy bits from
  * @param size size of the source data, in bytes
- * @param bc bit I/O context specifying the current position in the source.
+ * @param gb bit I/O context specifying the current position in the source.
  *           data. This function might use this to align the bit position to
  *           a whole-byte boundary before calling #avpriv_copy_bits() on aligned
  *           source data
@@ -1919,18 +1866,18 @@ static int parse_packet_header(WMAVoiceContext *s)
  */
 static void copy_bits(PutBitContext *pb,
                       const uint8_t *data, int size,
-                      BitstreamContext *bc, int nbits)
+                      GetBitContext *gb, int nbits)
 {
     int rmn_bytes, rmn_bits;
 
-    rmn_bits = rmn_bytes = bitstream_bits_left(bc);
+    rmn_bits = rmn_bytes = get_bits_left(gb);
     if (rmn_bits < nbits)
         return;
     if (nbits > pb->size_in_bits - put_bits_count(pb))
         return;
     rmn_bits &= 7; rmn_bytes >>= 3;
     if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
-        put_bits(pb, rmn_bits, bitstream_read(bc, rmn_bits));
+        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
     avpriv_copy_bits(pb, data + size - rmn_bytes,
                  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
 }
@@ -1950,70 +1897,76 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                                   int *got_frame_ptr, AVPacket *avpkt)
 {
     WMAVoiceContext *s = ctx->priv_data;
-    BitstreamContext *bc = &s->bc;
+    GetBitContext *gb = &s->gb;
     int size, res, pos;
 
     /* Packets are sometimes a multiple of ctx->block_align, with a packet
-     * header at each ctx->block_align bytes. However, Libav's ASF demuxer
+     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
      * feeds us ASF packets, which may concatenate multiple "codec" packets
      * in a single "muxer" packet, so we artificially emulate that by
      * capping the packet size at ctx->block_align. */
     for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
-    if (!size) {
-        *got_frame_ptr = 0;
-        return 0;
-    }
-    bitstream_init8(&s->bc, avpkt->data, size);
+    init_get_bits8(&s->gb, avpkt->data, size);
 
     /* size == ctx->block_align is used to indicate whether we are dealing with
      * a new packet or a packet of which we already read the packet header
      * previously. */
-    if (size == ctx->block_align) { // new packet header
-        if ((res = parse_packet_header(s)) < 0)
-            return res;
+    if (!(size % ctx->block_align)) { // new packet header
+        if (!size) {
+            s->spillover_nbits = 0;
+            s->nb_superframes = 0;
+        } else {
+            if ((res = parse_packet_header(s)) < 0)
+                return res;
+            s->nb_superframes = res;
+        }
 
         /* If the packet header specifies a s->spillover_nbits, then we want
          * to push out all data of the previous packet (+ spillover) before
          * continuing to parse new superframes in the current packet. */
-        if (s->spillover_nbits > 0) {
-            if (s->sframe_cache_size > 0) {
-                int cnt = bitstream_tell(bc);
-                copy_bits(&s->pb, avpkt->data, size, bc, s->spillover_nbits);
-                flush_put_bits(&s->pb);
-                s->sframe_cache_size += s->spillover_nbits;
-                if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
-                    *got_frame_ptr) {
-                    cnt += s->spillover_nbits;
-                    s->skip_bits_next = cnt & 7;
-                    return cnt >> 3;
-                } else
-                    bitstream_skip (bc, s->spillover_nbits - cnt +
-                                    bitstream_tell(bc)); // resync
+        if (s->sframe_cache_size > 0) {
+            int cnt = get_bits_count(gb);
+            if (cnt + s->spillover_nbits > avpkt->size * 8) {
+                s->spillover_nbits = avpkt->size * 8 - cnt;
+            }
+            copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
+            flush_put_bits(&s->pb);
+            s->sframe_cache_size += s->spillover_nbits;
+            if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
+                *got_frame_ptr) {
+                cnt += s->spillover_nbits;
+                s->skip_bits_next = cnt & 7;
+                res = cnt >> 3;
+                return res;
             } else
-                bitstream_skip(bc, s->spillover_nbits);  // resync
+                skip_bits_long (gb, s->spillover_nbits - cnt +
+                                get_bits_count(gb)); // resync
+        } else if (s->spillover_nbits) {
+            skip_bits_long(gb, s->spillover_nbits);  // resync
         }
     } else if (s->skip_bits_next)
-        bitstream_skip(bc, s->skip_bits_next);
+        skip_bits(gb, s->skip_bits_next);
 
     /* Try parsing superframes in current packet */
     s->sframe_cache_size = 0;
     s->skip_bits_next = 0;
-    pos = bitstream_bits_left(bc);
-    if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
-        return res;
-    } else if (*got_frame_ptr) {
-        int cnt = bitstream_tell(bc);
-        s->skip_bits_next = cnt & 7;
-        return cnt >> 3;
+    pos = get_bits_left(gb);
+    if (s->nb_superframes-- == 0) {
+        *got_frame_ptr = 0;
+        return size;
+    } else if (s->nb_superframes > 0) {
+        if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
+            return res;
+        } else if (*got_frame_ptr) {
+            int cnt = get_bits_count(gb);
+            s->skip_bits_next = cnt & 7;
+            res = cnt >> 3;
+            return res;
+        }
     } else if ((s->sframe_cache_size = pos) > 0) {
-        /* rewind bit reader to start of last (incomplete) superframe... */
-        bitstream_init8(bc, avpkt->data, size);
-        bitstream_skip(bc, (size << 3) - pos);
-        assert(bitstream_bits_left(bc) == pos);
-
-        /* ...and cache it for spillover in next packet */
+        /* ... cache it for spillover in next packet */
         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
-        copy_bits(&s->pb, avpkt->data, size, bc, s->sframe_cache_size);
+        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
         // FIXME bad - just copy bytes as whole and add use the
         // skip_bits_next field
     }
@@ -2035,34 +1988,6 @@ static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
     return 0;
 }
 
-static av_cold void wmavoice_flush(AVCodecContext *ctx)
-{
-    WMAVoiceContext *s = ctx->priv_data;
-    int n;
-
-    s->postfilter_agc    = 0;
-    s->sframe_cache_size = 0;
-    s->skip_bits_next    = 0;
-    for (n = 0; n < s->lsps; n++)
-        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
-    memset(s->excitation_history, 0,
-           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
-    memset(s->synth_history,      0,
-           sizeof(*s->synth_history)      * MAX_LSPS);
-    memset(s->gain_pred_err,      0,
-           sizeof(s->gain_pred_err));
-
-    if (s->do_apf) {
-        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
-               sizeof(*s->synth_filter_out_buf) * s->lsps);
-        memset(s->dcf_mem,              0,
-               sizeof(*s->dcf_mem)              * 2);
-        memset(s->zero_exc_pf,          0,
-               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
-        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
-    }
-}
-
 AVCodec ff_wmavoice_decoder = {
     .name             = "wmavoice",
     .long_name        = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
@@ -2070,9 +1995,8 @@ AVCodec ff_wmavoice_decoder = {
     .id               = AV_CODEC_ID_WMAVOICE,
     .priv_data_size   = sizeof(WMAVoiceContext),
     .init             = wmavoice_decode_init,
-    .init_static_data = wmavoice_init_static_data,
     .close            = wmavoice_decode_end,
     .decode           = wmavoice_decode_packet,
-    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .flush            = wmavoice_flush,
 };
diff --git a/libavcodec/wmavoice_data.h b/libavcodec/wmavoice_data.h
index 7f14fb8..cbf65b0 100644
--- a/libavcodec/wmavoice_data.h
+++ b/libavcodec/wmavoice_data.h
@@ -2,20 +2,20 @@
  * Windows Media Voice (WMAVoice) tables.
  * Copyright (c) 2009 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c
index 93bb642..327c5bd 100644
--- a/libavcodec/wmv2.c
+++ b/libavcodec/wmv2.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@ av_cold void ff_wmv2_common_init(Wmv2Context *w)
 {
     MpegEncContext *const s = &w->s;
 
-    ff_blockdsp_init(&s->bdsp);
+    ff_blockdsp_init(&s->bdsp, s->avctx);
     ff_wmv2dsp_init(&w->wdsp);
     s->idsp.perm_type = w->wdsp.idct_perm;
     ff_init_scantable_permutation(s->idsp.idct_permutation,
@@ -104,8 +104,8 @@ void ff_mspel_motion(MpegEncContext *s, uint8_t *dest_y,
 {
     Wmv2Context *const w = (Wmv2Context *) s;
     uint8_t *ptr;
-    int dxy, offset, mx, my, src_x, src_y, v_edge_pos;
-    ptrdiff_t linesize, uvlinesize;
+    int dxy, mx, my, src_x, src_y, v_edge_pos;
+    ptrdiff_t offset, linesize, uvlinesize;
     int emu = 0;
 
     dxy   = ((motion_y & 1) << 1) | (motion_x & 1);
@@ -145,21 +145,13 @@ void ff_mspel_motion(MpegEncContext *s, uint8_t *dest_y,
     if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    if (s->out_format == FMT_H263) {
-        dxy = 0;
-        if ((motion_x & 3) != 0)
-            dxy |= 1;
-        if ((motion_y & 3) != 0)
-            dxy |= 2;
-        mx = motion_x >> 2;
-        my = motion_y >> 2;
-    } else {
-        mx   = motion_x / 2;
-        my   = motion_y / 2;
-        dxy  = ((my & 1) << 1) | (mx & 1);
-        mx >>= 1;
-        my >>= 1;
-    }
+    dxy = 0;
+    if ((motion_x & 3) != 0)
+        dxy |= 1;
+    if ((motion_y & 3) != 0)
+        dxy |= 2;
+    mx = motion_x >> 2;
+    my = motion_y >> 2;
 
     src_x = s->mb_x * 8 + mx;
     src_y = s->mb_y * 8 + my;
diff --git a/libavcodec/wmv2.h b/libavcodec/wmv2.h
index b77dd98..0f459ae 100644
--- a/libavcodec/wmv2.h
+++ b/libavcodec/wmv2.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,7 +51,7 @@ typedef struct Wmv2Context {
     int hshift;
 
     ScanTable abt_scantable[2];
-    DECLARE_ALIGNED(16, int16_t, abt_block2)[6][64];
+    DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64];
 } Wmv2Context;
 
 void ff_wmv2_common_init(Wmv2Context *w);
@@ -70,4 +70,16 @@ void ff_mspel_motion(MpegEncContext *s,
                      uint8_t **ref_picture, op_pixels_func (*pix_op)[4],
                      int motion_x, int motion_y, int h);
 
+
+static av_always_inline int wmv2_get_cbp_table_index(MpegEncContext *s, int cbp_index)
+{
+    static const uint8_t map[3][3] = {
+        { 0, 2, 1 },
+        { 1, 0, 2 },
+        { 2, 1, 0 },
+    };
+
+    return map[(s->qscale > 10) + (s->qscale > 20)][cbp_index];
+}
+
 #endif /* AVCODEC_WMV2_H */
diff --git a/libavcodec/wmv2data.c b/libavcodec/wmv2data.c
index bbb07bb..e858572 100644
--- a/libavcodec/wmv2data.c
+++ b/libavcodec/wmv2data.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2data.h b/libavcodec/wmv2data.h
index 8914e57..178346a 100644
--- a/libavcodec/wmv2data.h
+++ b/libavcodec/wmv2data.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2dec.c b/libavcodec/wmv2dec.c
index e1f86d8..92daa16 100644
--- a/libavcodec/wmv2dec.c
+++ b/libavcodec/wmv2dec.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "wmv2.h"
 
 
-static void parse_mb_skip(Wmv2Context *w)
+static int parse_mb_skip(Wmv2Context *w)
 {
     int mb_x, mb_y;
     MpegEncContext *const s = &w->s;
@@ -45,6 +45,8 @@ static void parse_mb_skip(Wmv2Context *w)
                     MB_TYPE_16x16 | MB_TYPE_L0;
         break;
     case SKIP_TYPE_MPEG:
+        if (get_bits_left(&s->gb) < s->mb_height * s->mb_width)
+            return AVERROR_INVALIDDATA;
         for (mb_y = 0; mb_y < s->mb_height; mb_y++)
             for (mb_x = 0; mb_x < s->mb_width; mb_x++)
                 mb_type[mb_y * s->mb_stride + mb_x] =
@@ -52,6 +54,8 @@ static void parse_mb_skip(Wmv2Context *w)
         break;
     case SKIP_TYPE_ROW:
         for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
+            if (get_bits_left(&s->gb) < 1)
+                return AVERROR_INVALIDDATA;
             if (get_bits1(&s->gb)) {
                 for (mb_x = 0; mb_x < s->mb_width; mb_x++)
                     mb_type[mb_y * s->mb_stride + mb_x] =
@@ -65,6 +69,8 @@ static void parse_mb_skip(Wmv2Context *w)
         break;
     case SKIP_TYPE_COL:
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+            if (get_bits_left(&s->gb) < 1)
+                return AVERROR_INVALIDDATA;
             if (get_bits1(&s->gb)) {
                 for (mb_y = 0; mb_y < s->mb_height; mb_y++)
                     mb_type[mb_y * s->mb_stride + mb_x] =
@@ -77,6 +83,7 @@ static void parse_mb_skip(Wmv2Context *w)
         }
         break;
     }
+    return 0;
 }
 
 static int decode_ext_header(Wmv2Context *w)
@@ -108,7 +115,7 @@ static int decode_ext_header(Wmv2Context *w)
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG,
-               "fps:%d, br:%d, qpbit:%d, abt_flag:%d, j_type_bit:%d, "
+               "fps:%d, br:%"PRId64", qpbit:%d, abt_flag:%d, j_type_bit:%d, "
                "tl_mv_flag:%d, mbrl_bit:%d, code:%d, loop_filter:%d, "
                "slices:%d\n",
                fps, s->bit_rate, w->mspel_bit, w->abt_flag, w->j_type_bit,
@@ -134,6 +141,21 @@ int ff_wmv2_decode_picture_header(MpegEncContext *s)
     if (s->qscale <= 0)
         return AVERROR_INVALIDDATA;
 
+    if (s->pict_type != AV_PICTURE_TYPE_I && show_bits(&s->gb, 1)) {
+        GetBitContext gb = s->gb;
+        int skip_type = get_bits(&gb, 2);
+        int run = skip_type == SKIP_TYPE_COL ? s->mb_width : s->mb_height;
+
+        while (run > 0) {
+            int block = FFMIN(run, 25);
+            if (get_bits(&gb, block) + 1 != 1<<block)
+                break;
+            run -= block;
+        }
+        if (!run)
+            return FRAME_SKIPPED;
+    }
+
     return 0;
 }
 
@@ -159,6 +181,14 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext *s)
             }
 
             s->dc_table_index = get_bits1(&s->gb);
+
+            // at minimum one bit per macroblock is required at least in a valid frame,
+            // we discard frames much smaller than this. Frames smaller than 1/8 of the
+            // smallest "black/skip" frame generally contain not much recoverable content
+            // while at the same time they have the highest computational requirements
+            // per byte
+            if (get_bits_left(&s->gb) * 8LL < (s->width+15)/16 * ((s->height+15)/16))
+                return AVERROR_INVALIDDATA;
         }
         s->inter_intra_pred = 0;
         s->no_rounding      = 1;
@@ -170,20 +200,14 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext *s)
         }
     } else {
         int cbp_index;
+        int ret;
         w->j_type = 0;
 
-        parse_mb_skip(w);
+        ret = parse_mb_skip(w);
+        if (ret < 0)
+            return ret;
         cbp_index = decode012(&s->gb);
-        if (s->qscale <= 10) {
-            int map[3]         = { 0, 2, 1 };
-            w->cbp_table_index = map[cbp_index];
-        } else if (s->qscale <= 20) {
-            int map[3]         = { 1, 0, 2 };
-            w->cbp_table_index = map[cbp_index];
-        } else {
-            int map[3]         = {2,1,0};
-            w->cbp_table_index = map[cbp_index];
-        }
+        w->cbp_table_index = wmv2_get_cbp_table_index(s, cbp_index);
 
         if (w->mspel_bit)
             s->mspel = get_bits1(&s->gb);
@@ -368,6 +392,8 @@ int ff_wmv2_decode_mb(MpegEncContext *s, int16_t block[6][64])
             w->hshift      = 0;
             return 0;
         }
+        if (get_bits_left(&s->gb) <= 0)
+            return AVERROR_INVALIDDATA;
 
         code = get_vlc2(&s->gb, ff_mb_non_intra_vlc[w->cbp_table_index].table,
                         MB_NON_INTRA_VLC_BITS, 3);
@@ -378,6 +404,8 @@ int ff_wmv2_decode_mb(MpegEncContext *s, int16_t block[6][64])
         cbp = code & 0x3f;
     } else {
         s->mb_intra = 1;
+        if (get_bits_left(&s->gb) <= 0)
+            return AVERROR_INVALIDDATA;
         code = get_vlc2(&s->gb, ff_msmp4_mb_i_vlc.table, MB_INTRA_VLC_BITS, 2);
         if (code < 0) {
             av_log(s->avctx, AV_LOG_ERROR,
diff --git a/libavcodec/wmv2dsp.c b/libavcodec/wmv2dsp.c
index 90073e4..543f01b 100644
--- a/libavcodec/wmv2dsp.c
+++ b/libavcodec/wmv2dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,8 +48,8 @@ static void wmv2_idct_row(short * b)
     a4 = W0 * b[0] - W0 * b[4];
 
     /* step 2 */
-    s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8; // 1, 3, 5, 7
-    s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;
+    s1 = (int)(181U * (a1 - a5 + a7 - a3) + 128) >> 8; // 1, 3, 5, 7
+    s2 = (int)(181U * (a1 - a5 - a7 + a3) + 128) >> 8;
 
     /* step 3 */
     b[0] = (a0 + a2 + a1 + a5 + (1 << 7)) >> 8;
@@ -78,8 +78,8 @@ static void wmv2_idct_col(short * b)
     a4 = (W0 * b[8 * 0] - W0 * b[8 * 4]    ) >> 3;
 
     /* step 2 */
-    s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8;
-    s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;
+    s1 = (int)(181U * (a1 - a5 + a7 - a3) + 128) >> 8;
+    s2 = (int)(181U * (a1 - a5 - a7 + a3) + 128) >> 8;
 
     /* step 3 */
     b[8 * 0] = (a0 + a2 + a1 + a5 + (1 << 13)) >> 14;
@@ -262,4 +262,7 @@ av_cold void ff_wmv2dsp_init(WMV2DSPContext *c)
     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
+
+    if (ARCH_MIPS)
+        ff_wmv2dsp_init_mips(c);
 }
diff --git a/libavcodec/wmv2dsp.h b/libavcodec/wmv2dsp.h
index b38f4dc..5e40b30 100644
--- a/libavcodec/wmv2dsp.h
+++ b/libavcodec/wmv2dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,5 +33,6 @@ typedef struct WMV2DSPContext {
 } WMV2DSPContext;
 
 void ff_wmv2dsp_init(WMV2DSPContext *c);
+void ff_wmv2dsp_init_mips(WMV2DSPContext *c);
 
 #endif /* AVCODEC_WMV2DSP_H */
diff --git a/libavcodec/wmv2enc.c b/libavcodec/wmv2enc.c
index b09942e..74ae12b 100644
--- a/libavcodec/wmv2enc.c
+++ b/libavcodec/wmv2enc.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +62,7 @@ static av_cold int wmv2_encode_init(AVCodecContext *avctx)
     ff_wmv2_common_init(w);
 
     avctx->extradata_size = 4;
-    avctx->extradata      = av_mallocz(avctx->extradata_size + 10);
+    avctx->extradata      = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
@@ -88,10 +88,10 @@ int ff_wmv2_encode_picture_header(MpegEncContext *s, int picture_number)
     w->abt_type        = 0;
     w->j_type          = 0;
 
-    assert(s->flipflop_rounding);
+    av_assert0(s->flipflop_rounding);
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
-        assert(s->no_rounding == 1);
+        av_assert0(s->no_rounding == 1);
         if (w->j_type_bit)
             put_bits(&s->pb, 1, w->j_type);
 
@@ -112,16 +112,7 @@ int ff_wmv2_encode_picture_header(MpegEncContext *s, int picture_number)
         put_bits(&s->pb, 2, SKIP_TYPE_NONE);
 
         ff_msmpeg4_code012(&s->pb, cbp_index = 0);
-        if (s->qscale <= 10) {
-            int map[3]         = { 0, 2, 1 };
-            w->cbp_table_index = map[cbp_index];
-        } else if (s->qscale <= 20) {
-            int map[3]         = { 1, 0, 2 };
-            w->cbp_table_index = map[cbp_index];
-        } else {
-            int map[3]         = { 2, 1, 0 };
-            w->cbp_table_index = map[cbp_index];
-        }
+        w->cbp_table_index = wmv2_get_cbp_table_index(s, cbp_index);
 
         if (w->mspel_bit)
             put_bits(&s->pb, 1, s->mspel);
@@ -174,10 +165,12 @@ void ff_wmv2_encode_mb(MpegEncContext *s, int16_t block[6][64],
                  ff_wmv2_inter_table[w->cbp_table_index][cbp + 64][1],
                  ff_wmv2_inter_table[w->cbp_table_index][cbp + 64][0]);
 
+        s->misc_bits += get_bits_diff(s);
         /* motion vector */
         ff_h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
         ff_msmpeg4_encode_motion(s, motion_x - pred_x,
                                  motion_y - pred_y);
+        s->mv_bits += get_bits_diff(s);
     } else {
         /* compute cbp */
         cbp       = 0;
@@ -210,10 +203,15 @@ void ff_wmv2_encode_mb(MpegEncContext *s, int16_t block[6][64],
                      ff_table_inter_intra[s->h263_aic_dir][1],
                      ff_table_inter_intra[s->h263_aic_dir][0]);
         }
+        s->misc_bits += get_bits_diff(s);
     }
 
     for (i = 0; i < 6; i++)
         ff_msmpeg4_encode_block(s, block[i], i);
+    if (s->mb_intra)
+        s->i_tex_bits += get_bits_diff(s);
+    else
+        s->p_tex_bits += get_bits_diff(s);
 }
 
 static const AVClass wmv2_class = {
diff --git a/libavcodec/wnv1.c b/libavcodec/wnv1.c
index f498c20..915e9c7 100644
--- a/libavcodec/wnv1.c
+++ b/libavcodec/wnv1.c
@@ -2,20 +2,20 @@
  * Winnov WNV1 codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,17 +25,14 @@
  */
 
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
-#include "vlc.h"
 
 
 typedef struct WNV1Context {
-    AVCodecContext *avctx;
-
     int shift;
-    BitstreamContext bc;
+    GetBitContext gb;
 } WNV1Context;
 
 static const uint16_t code_tab[16][2] = {
@@ -50,12 +47,12 @@ static VLC code_vlc;
 /* returns modified base_value */
 static inline int wnv1_get_code(WNV1Context *w, int base_value)
 {
-    int v = bitstream_read_vlc(&w->bc, code_vlc.table, CODE_VLC_BITS, 1);
+    int v = get_vlc2(&w->gb, code_vlc.table, CODE_VLC_BITS, 1);
 
     if (v == 15)
-        return ff_reverse[bitstream_read(&w->bc, 8 - w->shift)];
+        return ff_reverse[get_bits(&w->gb, 8 - w->shift)];
     else
-        return base_value + ((v - 7) << w->shift);
+        return base_value + ((v - 7U) << w->shift);
 }
 
 static int decode_frame(AVCodecContext *avctx,
@@ -71,8 +68,8 @@ static int decode_frame(AVCodecContext *avctx,
     int prev_y = 0, prev_u = 0, prev_v = 0;
     uint8_t *rbuf;
 
-    if (buf_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Packet is too short\n");
+    if (buf_size < 8 + avctx->height * (avctx->width/2)/8) {
+        av_log(avctx, AV_LOG_ERROR, "Packet size %d is too small\n", buf_size);
         return AVERROR_INVALIDDATA;
     }
 
@@ -81,9 +78,9 @@ static int decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
         return AVERROR(ENOMEM);
     }
+    memset(rbuf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
     if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         av_free(rbuf);
         return ret;
     }
@@ -91,7 +88,9 @@ static int decode_frame(AVCodecContext *avctx,
 
     for (i = 8; i < buf_size; i++)
         rbuf[i] = ff_reverse[buf[i]];
-    bitstream_init8(&l->bc, rbuf + 8, buf_size - 8);
+
+    if ((ret = init_get_bits8(&l->gb, rbuf + 8, buf_size - 8)) < 0)
+        return ret;
 
     if (buf[2] >> 4 == 6)
         l->shift = 2;
@@ -135,10 +134,8 @@ static int decode_frame(AVCodecContext *avctx,
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    WNV1Context * const l = avctx->priv_data;
     static VLC_TYPE code_table[1 << CODE_VLC_BITS][2];
 
-    l->avctx       = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV422P;
 
     code_vlc.table           = code_table;
diff --git a/libavcodec/wrapped_avframe.c b/libavcodec/wrapped_avframe.c
index e1273e4..85ff32d 100644
--- a/libavcodec/wrapped_avframe.c
+++ b/libavcodec/wrapped_avframe.c
@@ -2,20 +2,20 @@
  * AVFrame wrapper
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  */
 
 #include "avcodec.h"
+#include "decode.h"
 #include "internal.h"
 
 #include "libavutil/internal.h"
@@ -43,19 +44,31 @@ static int wrapped_avframe_encode(AVCodecContext *avctx, AVPacket *pkt,
                      const AVFrame *frame, int *got_packet)
 {
     AVFrame *wrapped = av_frame_clone(frame);
+    uint8_t *data;
+    int size = sizeof(*wrapped) + AV_INPUT_BUFFER_PADDING_SIZE;
 
     if (!wrapped)
         return AVERROR(ENOMEM);
 
-    pkt->buf = av_buffer_create((uint8_t *)wrapped, sizeof(*wrapped),
+    data = av_mallocz(size);
+    if (!data) {
+        av_frame_free(&wrapped);
+        return AVERROR(ENOMEM);
+    }
+
+    pkt->buf = av_buffer_create(data, size,
                                 wrapped_avframe_release_buffer, NULL,
                                 AV_BUFFER_FLAG_READONLY);
     if (!pkt->buf) {
         av_frame_free(&wrapped);
+        av_freep(&data);
         return AVERROR(ENOMEM);
     }
 
-    pkt->data = (uint8_t *)wrapped;
+    av_frame_move_ref((AVFrame*)data, wrapped);
+    av_frame_free(&wrapped);
+
+    pkt->data = data;
     pkt->size = sizeof(*wrapped);
 
     pkt->flags |= AV_PKT_FLAG_KEY;
@@ -63,6 +76,39 @@ static int wrapped_avframe_encode(AVCodecContext *avctx, AVPacket *pkt,
     return 0;
 }
 
+static int wrapped_avframe_decode(AVCodecContext *avctx, void *data,
+                                  int *got_frame, AVPacket *pkt)
+{
+    AVFrame *in, *out;
+    int err;
+
+    if (!(pkt->flags & AV_PKT_FLAG_TRUSTED)) {
+        // This decoder is not usable with untrusted input.
+        return AVERROR(EPERM);
+    }
+
+    if (pkt->size < sizeof(AVFrame))
+        return AVERROR(EINVAL);
+
+    in  = (AVFrame*)pkt->data;
+    out = data;
+
+    err = ff_decode_frame_props(avctx, out);
+    if (err < 0)
+        return err;
+
+    av_frame_move_ref(out, in);
+
+    err = ff_attach_decode_data(out);
+    if (err < 0) {
+        av_frame_unref(out);
+        return err;
+    }
+
+    *got_frame = 1;
+    return 0;
+}
+
 AVCodec ff_wrapped_avframe_encoder = {
     .name           = "wrapped_avframe",
     .long_name      = NULL_IF_CONFIG_SMALL("AVFrame to AVPacket passthrough"),
@@ -71,3 +117,12 @@ AVCodec ff_wrapped_avframe_encoder = {
     .encode2        = wrapped_avframe_encode,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+
+AVCodec ff_wrapped_avframe_decoder = {
+    .name           = "wrapped_avframe",
+    .long_name      = NULL_IF_CONFIG_SMALL("AVPacket to AVFrame passthrough"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WRAPPED_AVFRAME,
+    .decode         = wrapped_avframe_decode,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
diff --git a/libavcodec/ws-snd1.c b/libavcodec/ws-snd1.c
index 11b7289..0f00580 100644
--- a/libavcodec/ws-snd1.c
+++ b/libavcodec/ws-snd1.c
@@ -2,20 +2,20 @@
  * Westwood SNDx codecs
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,15 +76,13 @@ static int ws_snd_decode_frame(AVCodecContext *avctx, void *data,
 
     if (in_size > buf_size) {
         av_log(avctx, AV_LOG_ERROR, "Frame data is larger than input buffer\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     /* get output buffer */
     frame->nb_samples = out_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples     = frame->data[0];
     samples_end = samples + out_size;
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index deff63c..2350c8b 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,11 +3,14 @@ OBJS                                   += x86/constants.o               \
 # subsystems
 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
+OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o           \
+                                          x86/dirac_dwt_init.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
+OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
 OBJS-$(CONFIG_FMTCONVERT)              += x86/fmtconvert_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
 OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
@@ -15,11 +18,14 @@ OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
+OBJS-$(CONFIG_LLVIDENCDSP)             += x86/lossless_videoencdsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
-OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
+OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
-OBJS-$(CONFIG_MDCT)                    += x86/mdct_init.o
+OBJS-$(CONFIG_MDCT15)                  += x86/mdct15_init.o
 OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
@@ -36,44 +42,60 @@ OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 # decoders/encoders
-OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
+OBJS-$(CONFIG_AAC_DECODER)             += x86/aacpsdsp_init.o          \
+                                          x86/sbrdsp_init.o
+OBJS-$(CONFIG_AAC_ENCODER)             += x86/aacencdsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
+OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
+OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
+OBJS-$(CONFIG_OPUS_DECODER)            += x86/opus_dsp_init.o
+OBJS-$(CONFIG_OPUS_ENCODER)            += x86/opus_dsp_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
+OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
 OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_SBC_ENCODER)             += x86/sbcdsp_init.o
+OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
+OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
+OBJS-$(CONFIG_TTA_ENCODER)             += x86/ttaencdsp_init.o
+OBJS-$(CONFIG_UTVIDEO_DECODER)         += x86/utvideodsp_init.o
+OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
 OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP3_DECODER)             += x86/hpeldsp_vp3_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o            \
+                                          x86/vp9dsp_init_10bpp.o      \
+                                          x86/vp9dsp_init_12bpp.o      \
+                                          x86/vp9dsp_init_16bpp.o
+OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
 
 
 # GCC inline assembly optimizations
 # subsystems
 MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
-MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
-                                          x86/hpeldsp_mmx.o
-MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
-                                          x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
 MMX-OBJS-$(CONFIG_VC1DSP)              += x86/vc1dsp_mmx.o
 
 # decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o            \
-                                          x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
 
 # subsystems
 X86ASM-OBJS-$(CONFIG_AC3DSP)           += x86/ac3dsp.o                  \
                                           x86/ac3dsp_downmix.o
 X86ASM-OBJS-$(CONFIG_AUDIODSP)         += x86/audiodsp.o
+X86ASM-OBJS-$(CONFIG_BLOCKDSP)         += x86/blockdsp.o
 X86ASM-OBJS-$(CONFIG_BSWAPDSP)         += x86/bswapdsp.o
 X86ASM-OBJS-$(CONFIG_DCT)              += x86/dct32.o
 X86ASM-OBJS-$(CONFIG_FFT)              += x86/fft.o
@@ -96,35 +118,79 @@ X86ASM-OBJS-$(CONFIG_H264QPEL)         += x86/h264_qpel_8bit.o          \
 X86ASM-OBJS-$(CONFIG_HPELDSP)          += x86/fpel.o                    \
                                           x86/hpeldsp.o
 X86ASM-OBJS-$(CONFIG_HUFFYUVDSP)       += x86/huffyuvdsp.o
+X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP)    += x86/huffyuvencdsp.o
+X86ASM-OBJS-$(CONFIG_IDCTDSP)          += x86/idctdsp.o
+X86ASM-OBJS-$(CONFIG_LLAUDDSP)         += x86/lossless_audiodsp.o
+X86ASM-OBJS-$(CONFIG_LLVIDDSP)         += x86/lossless_videodsp.o
+X86ASM-OBJS-$(CONFIG_LLVIDENCDSP)      += x86/lossless_videoencdsp.o
+X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
 X86ASM-OBJS-$(CONFIG_ME_CMP)           += x86/me_cmp.o
 X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)     += x86/imdct36.o
 X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)     += x86/mpegvideoencdsp.o
+X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)     += x86/opus_pvq_search.o
 X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP)      += x86/pixblockdsp.o
 X86ASM-OBJS-$(CONFIG_QPELDSP)          += x86/qpeldsp.o                 \
                                           x86/fpel.o                    \
                                           x86/qpel.o
 X86ASM-OBJS-$(CONFIG_RV34DSP)          += x86/rv34dsp.o
-X86ASM-OBJS-$(CONFIG_VC1DSP)           += x86/vc1dsp.o
+X86ASM-OBJS-$(CONFIG_VC1DSP)           += x86/vc1dsp_loopfilter.o       \
+                                          x86/vc1dsp_mc.o
+X86ASM-OBJS-$(CONFIG_IDCTDSP)          += x86/simple_idct10.o           \
+                                          x86/simple_idct.o
 X86ASM-OBJS-$(CONFIG_VIDEODSP)         += x86/videodsp.o
 X86ASM-OBJS-$(CONFIG_VP3DSP)           += x86/vp3dsp.o
 X86ASM-OBJS-$(CONFIG_VP8DSP)           += x86/vp8dsp.o                  \
                                           x86/vp8dsp_loopfilter.o
 
 # decoders/encoders
-X86ASM-OBJS-$(CONFIG_AAC_DECODER)      += x86/sbrdsp.o
-X86ASM-OBJS-$(CONFIG_APE_DECODER)      += x86/apedsp.o
-X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o
+X86ASM-OBJS-$(CONFIG_AAC_DECODER)      += x86/aacpsdsp.o                \
+                                          x86/sbrdsp.o
+X86ASM-OBJS-$(CONFIG_AAC_ENCODER)      += x86/aacencdsp.o
+X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
+X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
+X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
+X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
+                                          x86/dirac_dwt.o
 X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
+X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
+X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
+ifdef CONFIG_GPL
+X86ASM-OBJS-$(CONFIG_FLAC_ENCODER)     += x86/flac_dsp_gpl.o
+endif
 X86ASM-OBJS-$(CONFIG_HEVC_DECODER)     += x86/hevc_add_res.o            \
                                           x86/hevc_deblock.o            \
                                           x86/hevc_idct.o               \
-                                          x86/hevc_mc.o
+                                          x86/hevc_mc.o                 \
+                                          x86/hevc_sao.o                \
+                                          x86/hevc_sao_10bit.o
+X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
+X86ASM-OBJS-$(CONFIG_MLP_DECODER)      += x86/mlpdsp.o
+X86ASM-OBJS-$(CONFIG_MPEG4_DECODER)    += x86/xvididct.o
 X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_PRORES_DECODER)   += x86/proresdsp.o
+X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 X86ASM-OBJS-$(CONFIG_RV40_DECODER)     += x86/rv40dsp.o
+X86ASM-OBJS-$(CONFIG_SBC_ENCODER)      += x86/sbcdsp.o
+X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER)     += x86/svq1enc.o
+X86ASM-OBJS-$(CONFIG_TAK_DECODER)      += x86/takdsp.o
+X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER)   += x86/mlpdsp.o
+X86ASM-OBJS-$(CONFIG_TTA_DECODER)      += x86/ttadsp.o
+X86ASM-OBJS-$(CONFIG_TTA_ENCODER)      += x86/ttaencdsp.o
+X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER)  += x86/utvideodsp.o
 X86ASM-OBJS-$(CONFIG_V210_ENCODER)     += x86/v210enc.o
+X86ASM-OBJS-$(CONFIG_V210_DECODER)     += x86/v210.o
 X86ASM-OBJS-$(CONFIG_VORBIS_DECODER)   += x86/vorbisdsp.o
 X86ASM-OBJS-$(CONFIG_VP3_DECODER)      += x86/hpeldsp_vp3.o
 X86ASM-OBJS-$(CONFIG_VP6_DECODER)      += x86/vp6dsp.o
-X86ASM-OBJS-$(CONFIG_VP9_DECODER)      += x86/vp9mc.o                   \
-                                          x86/vp9lpf.o
+X86ASM-OBJS-$(CONFIG_VP9_DECODER)      += x86/vp9intrapred.o            \
+                                          x86/vp9intrapred_16bpp.o      \
+                                          x86/vp9itxfm.o                \
+                                          x86/vp9itxfm_16bpp.o          \
+                                          x86/vp9lpf.o                  \
+                                          x86/vp9lpf_16bpp.o            \
+                                          x86/vp9mc.o                   \
+                                          x86/vp9mc_16bpp.o
+X86ASM-OBJS-$(CONFIG_WEBP_DECODER)     += x86/vp8dsp.o
diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
new file mode 100644
index 0000000..97af571
--- /dev/null
+++ b/libavcodec/x86/aacencdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD optimized AAC encoder DSP functions
+;*
+;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+float_abs_mask: times 4 dd 0x7fffffff
+
+SECTION .text
+
+;*******************************************************************
+;void ff_abs_pow34(float *out, const float *in, const int size);
+;*******************************************************************
+INIT_XMM sse
+cglobal abs_pow34, 3, 3, 3, out, in, size
+    mova   m2, [float_abs_mask]
+    shl    sizeq, 2
+    add    inq, sizeq
+    add    outq, sizeq
+    neg    sizeq
+.loop:
+    andps  m0, m2, [inq+sizeq]
+    sqrtps m1, m0
+    mulps  m0, m1
+    sqrtps m0, m0
+    mova   [outq+sizeq], m0
+    add    sizeq, mmsize
+    jl    .loop
+    RET
+
+;*******************************************************************
+;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
+;                           int size, int is_signed, int maxval, const float Q34,
+;                           const float rounding)
+;*******************************************************************
+INIT_XMM sse2
+cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
+%if UNIX64 == 0
+    movss     m0, Q34m
+    movss     m1, roundingm
+    cvtsi2ss  m3, dword maxvalm
+%else
+    cvtsi2ss  m3, maxvald
+%endif
+    shufps    m0, m0, 0
+    shufps    m1, m1, 0
+    shufps    m3, m3, 0
+    shl       is_signedd, 31
+    movd      m4, is_signedd
+    shufps    m4, m4, 0
+    shl       sized,   2
+    add       inq, sizeq
+    add       outq, sizeq
+    add       scaledq, sizeq
+    neg       sizeq
+.loop:
+    mulps     m2, m0, [scaledq+sizeq]
+    addps     m2, m1
+    minps     m2, m3
+    andps     m5, m4, [inq+sizeq]
+    orps      m2, m5
+    cvttps2dq m2, m2
+    mova      [outq+sizeq], m2
+    add       sizeq, mmsize
+    jl       .loop
+    RET
diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c
new file mode 100644
index 0000000..d761c3c
--- /dev/null
+++ b/libavcodec/x86/aacencdsp_init.c
@@ -0,0 +1,43 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_sse(float *out, const float *in, const int size);
+
+void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
+                                int size, int is_signed, int maxval, const float Q34,
+                                const float rounding);
+
+av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags))
+        s->abs_pow34   = ff_abs_pow34_sse;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->quant_bands = ff_aac_quantize_bands_sse2;
+}
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
new file mode 100644
index 0000000..4acd087
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -0,0 +1,487 @@
+;******************************************************************************
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+SECTION .text
+
+;*************************************************************************
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
+;*************************************************************************
+%macro PS_ADD_SQUARES 1
+cglobal ps_add_squares, 3, 3, %1, dst, src, n
+    shl    nd, 3
+    add  srcq, nq
+    neg    nq
+
+align 16
+.loop:
+    movaps m0, [srcq+nq]
+    movaps m1, [srcq+nq+mmsize]
+    mulps  m0, m0
+    mulps  m1, m1
+    HADDPS m0, m1, m2
+    addps  m0, [dstq]
+    movaps [dstq], m0
+    add  dstq, mmsize
+    add    nq, mmsize*2
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_ADD_SQUARES 2
+INIT_XMM sse3
+PS_ADD_SQUARES 3
+
+;*******************************************************************
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
+;                                   float *src1, int n);
+;*******************************************************************
+INIT_XMM sse
+cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
+    shl      nd, 3
+    add   src1q, nq
+    add    dstq, nq
+    neg      nq
+
+align 16
+.loop:
+    movu     m0, [src1q+nq]
+    movu     m1, [src1q+nq+mmsize]
+    mova     m2, [src2q]
+    mova     m3, m2
+    unpcklps m2, m2
+    unpckhps m3, m3
+    mulps    m0, m2
+    mulps    m1, m3
+    mova [dstq+nq], m0
+    mova [dstq+nq+mmsize], m1
+    add   src2q, mmsize
+    add      nq, mmsize*2
+    jl .loop
+    REP_RET
+
+;***********************************************************************
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+;                                   float h[2][4], float h_step[2][4],
+;                                   int len);
+;***********************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
+    movaps   m0, [hq]
+    movaps   m1, [h_stepq]
+    unpcklps m4, m0, m0
+    unpckhps m0, m0
+    unpcklps m5, m1, m1
+    unpckhps m1, m1
+    shl      nd, 3
+    add      lq, nq
+    add      rq, nq
+    neg      nq
+
+align 16
+.loop:
+    addps    m4, m5
+    addps    m0, m1
+    movddup  m2, [lq+nq]
+    movddup  m3, [rq+nq]
+    mulps    m2, m4
+    mulps    m3, m0
+    addps    m2, m3
+    movsd  [lq+nq], m2
+    movhps [rq+nq], m2
+    add      nq, 8
+    jl .loop
+    REP_RET
+
+;***************************************************************************
+;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+;                                       float h[2][4], float h_step[2][4],
+;                                       int len);
+;***************************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
+    movaps   m0, [hq]
+    movaps   m1, [hq+mmsize]
+%if ARCH_X86_64
+    movaps   m8, [h_stepq]
+    movaps   m9, [h_stepq+mmsize]
+    %define  H_STEP0 m8
+    %define  H_STEP1 m9
+%else
+    %define  H_STEP0 [h_stepq]
+    %define  H_STEP1 [h_stepq+mmsize]
+%endif
+    shl      nd, 3
+    add      lq, nq
+    add      rq, nq
+    neg      nq
+
+align 16
+.loop:
+    addps    m0, H_STEP0
+    addps    m1, H_STEP1
+    movddup  m2, [lq+nq]
+    movddup  m3, [rq+nq]
+    shufps   m4, m2, m2, q2301
+    shufps   m5, m3, m3, q2301
+    unpcklps m6, m0, m0
+    unpckhps m7, m0, m0
+    mulps    m2, m6
+    mulps    m3, m7
+    unpcklps m6, m1, m1
+    unpckhps m7, m1, m1
+    mulps    m4, m6
+    mulps    m5, m7
+    addps    m2, m3
+    addsubps m2, m4
+    addsubps m2, m5
+    movsd  [lq+nq], m2
+    movhps [rq+nq], m2
+    add      nq, 8
+    jl .loop
+    REP_RET
+
+;**********************************************************
+;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
+;                                   float (*in)[32][2],
+;                                   int i, int len)
+;**********************************************************
+INIT_XMM sse
+cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
+    movsxdifnidn        iq, id
+    mov               lend, 32 << 3
+    lea                inq, [inq+iq*4]
+    mov               tmpd, id
+    shl               tmpd, 8
+    add               outq, tmpq
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd
+
+    test                id, 1
+    jne .loop4
+    test                id, 2
+    jne .loop8
+
+align 16
+.loop16:
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q
+    mov               tmpd, lend
+
+.inner_loop16:
+    movaps              m0, [in0q]
+    movaps              m1, [in1q]
+    movaps              m2, [in0q+lenq]
+    movaps              m3, [in1q+lenq]
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4
+    movaps          [outq], m0
+    movaps     [outq+lenq], m1
+    movaps   [outq+lenq*2], m2
+    movaps [outq+3*32*2*4], m3
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add                inq, 16
+    add               outq, 3*32*2*4
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q
+    mov               tmpd, lend
+
+.inner_loop8:
+    movlps              m0, [in0q]
+    movlps              m1, [in1q]
+    movhps              m0, [in0q+lenq]
+    movhps              m1, [in1q+lenq]
+    SBUTTERFLYPS 0, 1, 2
+    SBUTTERFLYPD 0, 1, 2
+    movaps          [outq], m0
+    movaps     [outq+lenq], m1
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add                inq, 8
+    add               outq, lenq
+    sub                 id, 2
+    jg .loop16
+    RET
+
+align 16
+.loop4:
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q
+    mov               tmpd, lend
+
+.inner_loop4:
+    movss               m0, [in0q]
+    movss               m1, [in1q]
+    movss               m2, [in0q+lenq]
+    movss               m3, [in1q+lenq]
+    movlhps             m0, m1
+    movlhps             m2, m3
+    shufps              m0, m2, q2020
+    movaps          [outq], m0
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add                inq, 4
+    sub                 id, 1
+    test                id, 2
+    jne .loop8
+    cmp                 id, 4
+    jge .loop16
+    RET
+
+;***********************************************************
+;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
+;                                    float (*in)[32][2],
+;                                    int i, int len)
+;***********************************************************
+%macro HYBRID_SYNTHESIS_DEINT 0
+cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
+%if cpuflag(sse4)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    movsxdifnidn        iq, id
+    mov               lend, 32 << 3
+    lea               outq, [outq+iq*4]
+    mov               tmpd, id
+    shl               tmpd, 8
+    add                inq, tmpq
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd
+
+    test                id, 1
+    jne .loop4
+    test                id, 2
+    jne .loop8
+
+align 16
+.loop16:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop16:
+    movaps              m0, [inq]
+    movaps              m1, [inq+lenq]
+    movaps              m2, [inq+lenq*2]
+    movaps              m3, [inq+3*32*2*4]
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4
+    movaps         [out0q], m0
+    movaps         [out1q], m1
+    movaps    [out0q+lenq], m2
+    movaps    [out1q+lenq], m3
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add               outq, 16
+    add                inq, 3*32*2*4
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop8:
+    movaps              m0, [inq]
+    movaps              m1, [inq+lenq]
+    SBUTTERFLYPS 0, 1, 2
+    SBUTTERFLYPD 0, 1, 2
+    MOVH           [out0q], m0
+    MOVH           [out1q], m1
+    movhps    [out0q+lenq], m0
+    movhps    [out1q+lenq], m1
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add               outq, 8
+    add                inq, lenq
+    sub                 id, 2
+    jg .loop16
+    RET
+
+align 16
+.loop4:
+    mov              out0q, outq
+    mov              out1q, 38*64*4
+    add              out1q, out0q
+    mov               tmpd, lend
+
+.inner_loop4:
+    movaps              m0, [inq]
+    movss          [out0q], m0
+%if cpuflag(sse4)
+    extractps      [out1q], m0, 1
+    extractps [out0q+lenq], m0, 2
+    extractps [out1q+lenq], m0, 3
+%else
+    movhlps             m1, m0
+    movss     [out0q+lenq], m1
+    shufps              m0, m0, 0xb1
+    movss          [out1q], m0
+    movhlps             m1, m0
+    movss     [out1q+lenq], m1
+%endif
+    lea              out0q, [out0q+lenq*2]
+    lea              out1q, [out1q+lenq*2]
+    add                inq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add               outq, 4
+    sub                 id, 1
+    test                id, 2
+    jne .loop8
+    cmp                 id, 4
+    jge .loop16
+    RET
+%endmacro
+
+INIT_XMM sse
+HYBRID_SYNTHESIS_DEINT
+INIT_XMM sse4
+HYBRID_SYNTHESIS_DEINT
+
+;*******************************************************************
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
+;                                 const float (*filter)[8][2],
+;                                 ptrdiff_t stride, int n);
+;*******************************************************************
+%macro PS_HYBRID_ANALYSIS_LOOP 3
+    movu     %1, [inq+mmsize*%3]
+    movu     m1, [inq+mmsize*(5-%3)+8]
+%if cpuflag(sse3)
+    pshufd   %2, %1, q2301
+    pshufd   m4, m1, q0123
+    pshufd   m1, m1, q1032
+    pshufd   m2, [filterq+nq+mmsize*%3], q2301
+    addsubps %2, m4
+    addsubps %1, m1
+%else
+    mova     m2, [filterq+nq+mmsize*%3]
+    mova     %2, %1
+    mova     m4, m1
+    shufps   %2, %2, q2301
+    shufps   m4, m4, q0123
+    shufps   m1, m1, q1032
+    shufps   m2, m2, q2301
+    xorps    m4, m7
+    xorps    m1, m7
+    subps    %2, m4
+    subps    %1, m1
+%endif
+    mulps    %2, m2
+    mulps    %1, m2
+%if %3
+    addps    m3, %2
+    addps    m0, %1
+%endif
+%endmacro
+
+%macro PS_HYBRID_ANALYSIS 0
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
+%if cpuflag(sse3)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    shl strideq, 3
+    shl nd, 6
+    add filterq, nq
+    neg nq
+    mova m7, [ps_p1m1p1m1]
+
+align 16
+.loop:
+    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
+
+%if cpuflag(sse3)
+    pshufd   m3, m3, q2301
+    xorps    m0, m7
+    hsubps   m3, m0
+    pshufd   m1, m3, q0020
+    pshufd   m3, m3, q0031
+    addps    m1, m3
+    movsd    m2, [inq+6*8]
+%else
+    mova     m1, m3
+    mova     m2, m0
+    shufps   m1, m1, q2301
+    shufps   m2, m2, q2301
+    subps    m1, m3
+    addps    m2, m0
+    unpcklps m3, m1, m2
+    unpckhps m1, m2
+    addps    m1, m3
+    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
+%endif
+    movss    m3, [filterq+nq+8*6]
+    SPLATD   m3
+    mulps    m2, m3
+    addps    m1, m2
+    MOVH [outq], m1
+    add    outq, strideq
+    add      nq, 64
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_HYBRID_ANALYSIS
+INIT_XMM sse3
+PS_HYBRID_ANALYSIS
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
new file mode 100644
index 0000000..21f00ef
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -0,0 +1,72 @@
+/*
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_sse  (float *dst, const float (*src)[2], int n);
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
+                                float *src1, int n);
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                ptrdiff_t stride, int n);
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                ptrdiff_t stride, int n);
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+                                   float h[2][4], float h_step[2][4],
+                                   int len);
+void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+                                          float h[2][4], float h_step[2][4],
+                                          int len);
+void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
+                                      int i, int len);
+void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
+                                       int i, int len);
+void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
+                                      int i, int len);
+
+av_cold void ff_psdsp_init_x86(PSDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->add_squares            = ff_ps_add_squares_sse;
+        s->mul_pair_single        = ff_ps_mul_pair_single_sse;
+        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
+        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
+    }
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->add_squares            = ff_ps_add_squares_sse3;
+        s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_sse3;
+        s->stereo_interpolate[1]  = ff_ps_stereo_interpolate_ipdopd_sse3;
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse3;
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
+    }
+}
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a3..675ade3 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized AC-3 DSP functions
 ;* Copyright (c) 2011 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -32,7 +32,7 @@ pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
 
 ; used in ff_ac3_extract_exponents()
-pd_1:   times 4 dd 1
+cextern pd_1
 pd_151: times 4 dd 151
 
 ; used in ff_apply_window_int16()
diff --git a/libavcodec/x86/ac3dsp_downmix.asm b/libavcodec/x86/ac3dsp_downmix.asm
index b085035..057cc60 100644
--- a/libavcodec/x86/ac3dsp_downmix.asm
+++ b/libavcodec/x86/ac3dsp_downmix.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized AC-3 downmixing
 ;* Copyright (c) 2012 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 6d049b3..2e7e2fb 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
  * x86-optimized AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/alacdsp.asm b/libavcodec/x86/alacdsp.asm
new file mode 100644
index 0000000..bb2069f
--- /dev/null
+++ b/libavcodec/x86/alacdsp.asm
@@ -0,0 +1,133 @@
+;******************************************************************************
+;* ALAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
+%else
+cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
+%define  buf1q  r2q
+%endif
+    movd    m6, shiftm
+    movd    m7, weightm
+    SPLATD  m7
+    shl   lend, 2
+    mov  buf1q, [buf0q + gprsize]
+    mov  buf0q, [buf0q]
+    add  buf1q, lenq
+    add  buf0q, lenq
+    neg  lenq
+
+align 16
+.loop:
+    mova    m0, [buf0q + lenq]
+    mova    m1, [buf0q + lenq + mmsize]
+    mova    m2, [buf1q + lenq]
+    mova    m3, [buf1q + lenq + mmsize]
+    pmulld  m4, m2, m7
+    pmulld  m5, m3, m7
+    psrad   m4, m6
+    psrad   m5, m6
+    psubd   m0, m4
+    psubd   m1, m5
+    paddd   m2, m0
+    paddd   m3, m1
+    mova [buf1q + lenq], m0
+    mova [buf1q + lenq + mmsize], m1
+    mova [buf0q + lenq], m2
+    mova [buf0q + lenq + mmsize], m3
+
+    add   lenq, mmsize*2
+    jl .loop
+    RET
+
+INIT_XMM sse2
+cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
+    movifnidn lend, lenm
+    movd      m4, r2m ; exbits
+    shl     lend, 2
+    mov    buf1q, [buf0q + gprsize]
+    mov    buf0q, [buf0q]
+    mov  exbuf1q, [exbuf0q + gprsize]
+    mov  exbuf0q, [exbuf0q]
+    add    buf1q, lenq
+    add    buf0q, lenq
+    add  exbuf1q, lenq
+    add  exbuf0q, lenq
+    neg lenq
+
+align 16
+.loop:
+    mova      m0, [buf0q + lenq]
+    mova      m1, [buf0q + lenq + mmsize]
+    pslld     m0, m4
+    pslld     m1, m4
+    mova      m2, [buf1q + lenq]
+    mova      m3, [buf1q + lenq + mmsize]
+    pslld     m2, m4
+    pslld     m3, m4
+    por       m0, [exbuf0q + lenq]
+    por       m1, [exbuf0q + lenq + mmsize]
+    por       m2, [exbuf1q + lenq]
+    por       m3, [exbuf1q + lenq + mmsize]
+    mova [buf0q + lenq         ], m0
+    mova [buf0q + lenq + mmsize], m1
+    mova [buf1q + lenq         ], m2
+    mova [buf1q + lenq + mmsize], m3
+
+    add     lenq, mmsize*2
+    jl .loop
+    REP_RET
+
+%if ARCH_X86_64
+cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
+%else
+cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
+%define exbitsm r2m
+%endif
+    movifnidn lend, r4m
+    movd     m2, exbitsm
+    shl    lend, 2
+    mov    bufq, [bufq]
+    mov  exbufq, [exbufq]
+    add    bufq, lenq
+    add  exbufq, lenq
+    neg lenq
+
+align 16
+.loop:
+    mova      m0, [bufq + lenq]
+    mova      m1, [bufq + lenq + mmsize]
+    pslld     m0, m2
+    pslld     m1, m2
+    por       m0, [exbufq + lenq]
+    por       m1, [exbufq + lenq + mmsize]
+    mova [bufq + lenq], m0
+    mova [bufq + lenq + mmsize], m1
+
+    add     lenq, mmsize*2
+    jl .loop
+    REP_RET
diff --git a/libavcodec/x86/alacdsp_init.c b/libavcodec/x86/alacdsp_init.c
new file mode 100644
index 0000000..18f7308
--- /dev/null
+++ b/libavcodec/x86/alacdsp_init.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/alacdsp.h"
+#include "config.h"
+
+void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
+                                     int decorr_shift, int decorr_left_weight);
+void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                           int extra_bits, int channels, int nb_samples);
+void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                         int extra_bits, int channels, int nb_samples);
+
+av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
+        c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->decorrelate_stereo   = ff_alac_decorrelate_stereo_sse4;
+    }
+#endif /* HAVE_X86ASM */
+}
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index 0f183c5..de395e5 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
 ;* optimized audio functions
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m1
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m2
-    paddd   m2, m0
-    pshuflw m0, m2, 0x4e
-%else
-    pshufw  m0, m2, 0x4e
-%endif
-    paddd   m2, m0
+    HADDD   m2, m0
     movd   eax, m2
+%if mmsize == 8
+    emms
+%endif
     RET
 %endmacro
 
@@ -141,7 +137,8 @@ cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
     VBROADCASTSS m0, minm
     VBROADCASTSS m1, maxm
 %elif WIN64
-    VBROADCASTSS m0, m3
+    SWAP 0, 3
+    VBROADCASTSS m0, m0
     VBROADCASTSS m1, maxm
 %else ; 64bit sysv
     VBROADCASTSS m0, m0
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 093f3f0..98e296c 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,6 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
-
 void ff_vector_clipf_sse(float *dst, const float *src,
                          int len, float min, float max);
 
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000..9d203df
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,88 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* AVX version by Jokyo Images
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+    ZERO  m0, m0, m0
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+%assign %%i %%i+4
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 4
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 2
+INIT_YMM avx
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+    add   blocksq, 768
+    mov      lenq, -768
+    ZERO       m0, m0, m0
+.loop:
+    mova  [blocksq+lenq+mmsize*0], m0
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
+INIT_YMM avx
+CLEAR_BLOCKS 1
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index b047e19..0000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-
-#include "libavcodec/blockdsp.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n)                           \
-static void name(int16_t *blocks)                       \
-{                                                       \
-    __asm__ volatile (                                  \
-        "pxor %%mm7, %%mm7                 \n\t"        \
-        "mov     %1,        %%"FF_REG_a"   \n\t"        \
-        "1:                                \n\t"        \
-        "movq %%mm7,   (%0, %%"FF_REG_a")  \n\t"        \
-        "movq %%mm7,  8(%0, %%"FF_REG_a")  \n\t"        \
-        "movq %%mm7, 16(%0, %%"FF_REG_a")  \n\t"        \
-        "movq %%mm7, 24(%0, %%"FF_REG_a")  \n\t"        \
-        "add    $32, %%"FF_REG_a"          \n\t"        \
-        "js      1b                        \n\t"        \
-        :: "r"(((uint8_t *) blocks) + 128 * n),         \
-           "i"(-128 * n)                                \
-        : "%"FF_REG_a);                                 \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0          \n"
-        "movaps %%xmm0,    (%0)         \n"
-        "movaps %%xmm0,  16(%0)         \n"
-        "movaps %%xmm0,  32(%0)         \n"
-        "movaps %%xmm0,  48(%0)         \n"
-        "movaps %%xmm0,  64(%0)         \n"
-        "movaps %%xmm0,  80(%0)         \n"
-        "movaps %%xmm0,  96(%0)         \n"
-        "movaps %%xmm0, 112(%0)         \n"
-        :: "r" (block)
-        : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0                 \n"
-        "mov        %1,         %%"FF_REG_a"   \n"
-        "1:                                    \n"
-        "movaps %%xmm0,    (%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0,  16(%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0,  32(%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0,  48(%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0,  64(%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0,  80(%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0,  96(%0, %%"FF_REG_a")  \n"
-        "movaps %%xmm0, 112(%0, %%"FF_REG_a")  \n"
-        "add      $128,         %%"FF_REG_a"   \n"
-        "js         1b                         \n"
-        :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
-        : "%"FF_REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c)
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (INLINE_MMX(cpu_flags)) {
-        c->clear_block  = clear_block_mmx;
-        c->clear_blocks = clear_blocks_mmx;
-    }
-
-    if (INLINE_SSE(cpu_flags)) {
-        c->clear_block  = clear_block_sse;
-        c->clear_blocks = clear_blocks_sse;
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000..8b01a44
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_block_avx(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+void ff_clear_blocks_avx(int16_t *blocks);
+
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
+                                  AVCodecContext *avctx)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->clear_block  = ff_clear_block_mmx;
+        c->clear_blocks = ff_clear_blocks_mmx;
+    }
+
+    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+        return;
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->clear_block  = ff_clear_block_sse;
+        c->clear_blocks = ff_clear_blocks_sse;
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        c->clear_block  = ff_clear_block_avx;
+        c->clear_blocks = ff_clear_blocks_avx;
+    }
+#endif /* HAVE_X86ASM */
+}
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 4810867..31c6c48 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
 ;******************************************************************************
 ;* optimized bswap buffer functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -24,6 +26,8 @@
 SECTION_RODATA
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+cextern pb_80
+
 SECTION .text
 
 ; %1 = aligned/unaligned
@@ -31,14 +35,18 @@ SECTION .text
     mov      r3d, r2d
     sar      r2d, 3
     jz       .left4_%1
+%if cpuflag(avx2)
+    sar      r2d, 1
+    jz       .left8_%1
+%endif
 .loop8_%1:
     mov%1    m0, [r1 +  0]
-    mov%1    m1, [r1 + 16]
-%if cpuflag(ssse3)
+    mov%1    m1, [r1 + mmsize]
+%if cpuflag(ssse3)||cpuflag(avx2)
     pshufb   m0, m2
     pshufb   m1, m2
     mov%1    [r0 +  0], m0
-    mov%1    [r0 + 16], m1
+    mov%1    [r0 + mmsize], m1
 %else
     pshuflw  m0, m0, 10110001b
     pshuflw  m1, m1, 10110001b
@@ -55,18 +63,29 @@ SECTION .text
     mov%1    [r0 +  0], m2
     mov%1    [r0 + 16], m3
 %endif
-    add      r0, 32
-    add      r1, 32
+    add      r0, mmsize*2
+    add      r1, mmsize*2
     dec      r2d
     jnz      .loop8_%1
+%if cpuflag(avx2)
+.left8_%1:
+    mov      r2d, r3d
+    test     r3d, 8
+    jz       .left4_%1
+    mov%1    m0, [r1]
+    pshufb   m0, m2
+    mov%1    [r0 +  0], m0
+    add r1, mmsize
+    add r0, mmsize
+%endif
 .left4_%1:
     mov      r2d, r3d
     test     r3d, 4
     jz       .left
-    mov%1    m0, [r1]
+    mov%1    xm0, [r1]
 %if cpuflag(ssse3)
-    pshufb   m0, m2
-    mov%1    [r0], m0
+    pshufb   xm0, xm2
+    mov%1    [r0], xm0
 %else
     pshuflw  m0, m0, 10110001b
     pshufhw  m0, m0, 10110001b
@@ -82,13 +101,16 @@ SECTION .text
 
 ; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
 %macro BSWAP32_BUF 0
-%if cpuflag(ssse3)
+%if cpuflag(ssse3)||cpuflag(avx2)
 cglobal bswap32_buf, 3,4,3
-    mova     m2, [pb_bswap32]
+    mov      r3, r1
+    VBROADCASTI128  m2, [pb_bswap32]
 %else
 cglobal bswap32_buf, 3,4,5
+    mov      r3, r1
 %endif
-    test     r1, 15
+    or       r3, r0
+    test     r3, mmsize - 1
     jz       .start_align
     BSWAP_LOOPS  u
     jmp      .left
@@ -98,9 +120,9 @@ cglobal bswap32_buf, 3,4,5
 %if cpuflag(ssse3)
     test     r2d, 2
     jz       .left1
-    movq     m0, [r1]
-    pshufb   m0, m2
-    movq     [r0], m0
+    movq     xm0, [r1]
+    pshufb   xm0, xm2
+    movq     [r0], xm0
     add      r1, 8
     add      r0, 8
 .left1:
@@ -130,3 +152,8 @@ BSWAP32_BUF
 
 INIT_XMM ssse3
 BSWAP32_BUF
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+BSWAP32_BUF
+%endif
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2d..877bab1 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
 
 av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
 {
@@ -34,4 +35,6 @@ av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
         c->bswap_buf = ff_bswap32_buf_sse2;
     if (EXTERNAL_SSSE3(cpu_flags))
         c->bswap_buf = ff_bswap32_buf_ssse3;
+    if (EXTERNAL_AVX2_FAST(cpu_flags))
+        c->bswap_buf = ff_bswap32_buf_avx2;
 }
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 205511e..cfd3b75 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,28 @@
 #include "libavutil/x86/asm.h"
 #include "config.h"
 
+#if   (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+   || (                  !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+   || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+#       define BROKEN_COMPILER 1
+#else
+#       define BROKEN_COMPILER 0
+#endif
+
 #if HAVE_INLINE_ASM
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+        "cmp    "end"       , %%"FF_REG_c"                              \n\t"\
+        "jge    1f                                                      \n\t"
+#endif
+
 #ifdef BROKEN_RELOCATIONS
 #define TABLES_ARG , "r"(tables)
 
@@ -73,11 +93,10 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         "jnz    2f                                                      \n\t"\
         "mov    "byte"      , %%"FF_REG_c"                              \n\t"\
-        "cmp    "end"       , %%"FF_REG_c"                              \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"FF_OPSIZE" $2  , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
-        "movzwl (%%"FF_REG_c"), "tmp"                                   \n\t"\
+        "movzwl (%%"FF_REG_c") , "tmp"                                  \n\t"\
         "lea    -1("low")   , %%ecx                                     \n\t"\
         "xor    "low"       , %%ecx                                     \n\t"\
         "shr    $15         , %%ecx                                     \n\t"\
@@ -92,7 +111,8 @@
         "2:                                                             \n\t"
 
 #else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
 
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         " jnz   2f                                                      \n\t"\
         "mov    "byte"      , %%"FF_REG_c"                              \n\t"\
-        "cmp    "end"       , %%"FF_REG_c"                              \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"FF_OPSIZE" $2  , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"FF_REG_c") , "tmp"                                  \n\t"\
@@ -154,8 +173,7 @@
 
 #endif /* BROKEN_RELOCATIONS */
 
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                              AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%8")
-        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
         : "r"(state), "r"(c),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end))
           TABLES_ARG
+          ,"1"(c->low), "2"(c->range)
         : "%"FF_REG_c, "memory"
     );
     return bit & 1;
 }
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
 
+#if !BROKEN_COMPILER
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "xor           %%edx, %%ecx     \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "movzwl         (%1), %%edx     \n\t"
         "bswap         %%edx            \n\t"
         "shrl            $15, %%edx     \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "add              $2, %1        \n\t"
+        "addl          %%edx, %%eax     \n\t"
+        "mov              %1, %c4(%2)   \n\t"
+#else
         "addl          %%edx, %%eax     \n\t"
         "cmp         %c5(%2), %1        \n\t"
         "jge              1f            \n\t"
         "add"FF_OPSIZE"   $2, %c4(%2)   \n\t"
+#endif
         "1:                             \n\t"
         "movl          %%eax, %c3(%2)   \n\t"
 
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "inc           %%edx            \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
     );
     return res;
 }
+#endif /* !BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_CABAC_H */
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index b5406ef..becb3a4 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
  * MMX-optimized DSP functions, based on H.264 optimizations by
  * Michael Niedermayer and Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,172 +34,28 @@
 #include "idctdsp.h"
 #include "config.h"
 
-#if HAVE_MMX_INLINE
 
-/* in/out: mma=mma+mmb, mmb=mmb-mma */
-#define SUMSUB_BA( a, b ) \
-    "paddw "#b", "#a" \n\t"\
-    "paddw "#b", "#b" \n\t"\
-    "psubw "#a", "#b" \n\t"
+#if HAVE_MMX_EXTERNAL
 
-/*****************************************************************************
- *
- * inverse transform
- *
- ****************************************************************************/
+void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
 
-static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
+static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
 {
-    __asm__ volatile(
-        "movq 112(%0), %%mm4  \n\t" /* mm4 = src7 */
-        "movq  16(%0), %%mm5  \n\t" /* mm5 = src1 */
-        "movq  80(%0), %%mm2  \n\t" /* mm2 = src5 */
-        "movq  48(%0), %%mm7  \n\t" /* mm7 = src3 */
-        "movq   %%mm4, %%mm0  \n\t"
-        "movq   %%mm5, %%mm3  \n\t"
-        "movq   %%mm2, %%mm6  \n\t"
-        "movq   %%mm7, %%mm1  \n\t"
-
-        "paddw  %%mm4, %%mm4  \n\t" /* mm4 = 2*src7 */
-        "paddw  %%mm3, %%mm3  \n\t" /* mm3 = 2*src1 */
-        "paddw  %%mm6, %%mm6  \n\t" /* mm6 = 2*src5 */
-        "paddw  %%mm1, %%mm1  \n\t" /* mm1 = 2*src3 */
-        "paddw  %%mm4, %%mm0  \n\t" /* mm0 = 3*src7 */
-        "paddw  %%mm3, %%mm5  \n\t" /* mm5 = 3*src1 */
-        "paddw  %%mm6, %%mm2  \n\t" /* mm2 = 3*src5 */
-        "paddw  %%mm1, %%mm7  \n\t" /* mm7 = 3*src3 */
-        "psubw  %%mm4, %%mm5  \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
-        "paddw  %%mm6, %%mm7  \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
-        "psubw  %%mm2, %%mm1  \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
-        "paddw  %%mm0, %%mm3  \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
-
-        "movq   %%mm5, %%mm4  \n\t"
-        "movq   %%mm7, %%mm6  \n\t"
-        "movq   %%mm3, %%mm0  \n\t"
-        "movq   %%mm1, %%mm2  \n\t"
-        SUMSUB_BA( %%mm7, %%mm5 )   /* mm7 = a0 + a1  mm5 = a0 - a1 */
-        "paddw  %%mm3, %%mm7  \n\t" /* mm7 = a0 + a1 + a3 */
-        "paddw  %%mm1, %%mm5  \n\t" /* mm5 = a0 - a1 + a2 */
-        "paddw  %%mm7, %%mm7  \n\t"
-        "paddw  %%mm5, %%mm5  \n\t"
-        "paddw  %%mm6, %%mm7  \n\t" /* mm7 = b4 */
-        "paddw  %%mm4, %%mm5  \n\t" /* mm5 = b5 */
-
-        SUMSUB_BA( %%mm1, %%mm3 )   /* mm1 = a3 + a2  mm3 = a3 - a2 */
-        "psubw  %%mm1, %%mm4  \n\t" /* mm4 = a0 - a2 - a3 */
-        "movq   %%mm4, %%mm1  \n\t" /* mm1 = a0 - a2 - a3 */
-        "psubw  %%mm6, %%mm3  \n\t" /* mm3 = a3 - a2 - a1 */
-        "paddw  %%mm1, %%mm1  \n\t"
-        "paddw  %%mm3, %%mm3  \n\t"
-        "psubw  %%mm2, %%mm1  \n\t" /* mm1 = b7 */
-        "paddw  %%mm0, %%mm3  \n\t" /* mm3 = b6 */
-
-        "movq  32(%0), %%mm2  \n\t" /* mm2 = src2 */
-        "movq  96(%0), %%mm6  \n\t" /* mm6 = src6 */
-        "movq   %%mm2, %%mm4  \n\t"
-        "movq   %%mm6, %%mm0  \n\t"
-        "psllw  $2,    %%mm4  \n\t" /* mm4 = 4*src2 */
-        "psllw  $2,    %%mm6  \n\t" /* mm6 = 4*src6 */
-        "paddw  %%mm4, %%mm2  \n\t" /* mm2 = 5*src2 */
-        "paddw  %%mm6, %%mm0  \n\t" /* mm0 = 5*src6 */
-        "paddw  %%mm2, %%mm2  \n\t"
-        "paddw  %%mm0, %%mm0  \n\t"
-        "psubw  %%mm0, %%mm4  \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
-        "paddw  %%mm2, %%mm6  \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
-
-        "movq    (%0), %%mm2  \n\t" /* mm2 = src0 */
-        "movq  64(%0), %%mm0  \n\t" /* mm0 = src4 */
-        SUMSUB_BA( %%mm0, %%mm2 )   /* mm0 = src0+src4  mm2 = src0-src4 */
-        "psllw  $3,    %%mm0  \n\t"
-        "psllw  $3,    %%mm2  \n\t"
-        "paddw  %1,    %%mm0  \n\t" /* add rounding bias */
-        "paddw  %1,    %%mm2  \n\t" /* add rounding bias */
-
-        SUMSUB_BA( %%mm6, %%mm0 )   /* mm6 = a4 + a6  mm0 = a4 - a6 */
-        SUMSUB_BA( %%mm4, %%mm2 )   /* mm4 = a5 + a7  mm2 = a5 - a7 */
-        SUMSUB_BA( %%mm7, %%mm6 )   /* mm7 = dst0  mm6 = dst7 */
-        SUMSUB_BA( %%mm5, %%mm4 )   /* mm5 = dst1  mm4 = dst6 */
-        SUMSUB_BA( %%mm3, %%mm2 )   /* mm3 = dst2  mm2 = dst5 */
-        SUMSUB_BA( %%mm1, %%mm0 )   /* mm1 = dst3  mm0 = dst4 */
-        :: "r"(block), "m"(bias)
-    );
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
+    ff_cavs_idct8_mmx(b2, block);
+    ff_add_pixels_clamped_mmx(b2, dst, stride);
 }
 
-#define SBUTTERFLY(a,b,t,n,m)\
-    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
-    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
-    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
-
-#define TRANSPOSE4(a,b,c,d,t)\
-    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
-    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
-    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
-    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
+void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
 
-static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
+static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
 {
-    int i;
-    DECLARE_ALIGNED(8, int16_t, b2)[64];
-
-    for(i=0; i<2; i++){
-        cavs_idct8_1d(block + 4 * i, ff_pw_4.a);
-
-        __asm__ volatile(
-            "psraw     $3, %%mm7  \n\t"
-            "psraw     $3, %%mm6  \n\t"
-            "psraw     $3, %%mm5  \n\t"
-            "psraw     $3, %%mm4  \n\t"
-            "psraw     $3, %%mm3  \n\t"
-            "psraw     $3, %%mm2  \n\t"
-            "psraw     $3, %%mm1  \n\t"
-            "psraw     $3, %%mm0  \n\t"
-            "movq   %%mm7,  (%0)  \n\t"
-            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
-            "movq   %%mm0,  8(%0)  \n\t"
-            "movq   %%mm6, 24(%0)  \n\t"
-            "movq   %%mm7, 40(%0)  \n\t"
-            "movq   %%mm4, 56(%0)  \n\t"
-            "movq    (%0),  %%mm7  \n\t"
-            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
-            "movq   %%mm7,   (%0)  \n\t"
-            "movq   %%mm1, 16(%0)  \n\t"
-            "movq   %%mm0, 32(%0)  \n\t"
-            "movq   %%mm3, 48(%0)  \n\t"
-            :
-            : "r"(b2 + 32 * i)
-            : "memory"
-        );
-    }
-
-    for(i=0; i<2; i++){
-        cavs_idct8_1d(b2+4*i, ff_pw_64.a);
-
-        __asm__ volatile(
-            "psraw     $7, %%mm7  \n\t"
-            "psraw     $7, %%mm6  \n\t"
-            "psraw     $7, %%mm5  \n\t"
-            "psraw     $7, %%mm4  \n\t"
-            "psraw     $7, %%mm3  \n\t"
-            "psraw     $7, %%mm2  \n\t"
-            "psraw     $7, %%mm1  \n\t"
-            "psraw     $7, %%mm0  \n\t"
-            "movq   %%mm7,    (%0)  \n\t"
-            "movq   %%mm5,  16(%0)  \n\t"
-            "movq   %%mm3,  32(%0)  \n\t"
-            "movq   %%mm1,  48(%0)  \n\t"
-            "movq   %%mm0,  64(%0)  \n\t"
-            "movq   %%mm2,  80(%0)  \n\t"
-            "movq   %%mm4,  96(%0)  \n\t"
-            "movq   %%mm6, 112(%0)  \n\t"
-            :: "r"(b2+4*i)
-            : "memory"
-        );
-    }
-
-    ff_add_pixels_clamped_mmx(b2, dst, stride);
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
+    ff_cavs_idct8_sse2(b2, block);
+    ff_add_pixels_clamped_sse2(b2, dst, stride);
 }
 
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_EXTERNAL */
 
 #if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
 
@@ -210,10 +66,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
  ****************************************************************************/
 
 /* vertical filter [-1 -2 96 42 -7  0]  */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
         "psllw $3, "#E"             \n\t"\
@@ -228,35 +84,35 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
         "psubw "#B", %%mm6          \n\t"\
         "psraw $1, "#B"             \n\t"\
         "psubw "#A", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -1  5  5 -1  0]  */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "paddw "#D", %%mm6          \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "add %2, %0                 \n\t"\
         "punpcklbw %%mm7, "#F"      \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psubw "#E", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $3, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -7 42 96 -2 -1]  */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
-        "pmullw %5, %%mm7           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
         "psllw $3, "#B"             \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psraw $3, "#B"             \n\t"\
@@ -269,7 +125,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
         "psubw "#E", %%mm6          \n\t"\
         "psraw $1, "#E"             \n\t"\
         "psubw "#F", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
@@ -298,32 +154,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
         "punpcklbw %%mm7, %%mm2     \n\t"\
         "punpcklbw %%mm7, %%mm3     \n\t"\
         "punpcklbw %%mm7, %%mm4     \n\t"\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
         \
         : "+a"(src), "+c"(dst)\
-        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
         : "memory"\
      );\
      if(h==16){\
         __asm__ volatile(\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
             \
            : "+a"(src), "+c"(dst)\
-           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD),  "m"(MUL1)\
+           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+             NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
            : "memory"\
         );\
      }\
@@ -337,7 +195,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
     int h=8;\
     __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
-        "movq %5, %%mm6             \n\t"\
+        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
         "1:                         \n\t"\
         "movq    (%0), %%mm0        \n\t"\
         "movq   1(%0), %%mm2        \n\t"\
@@ -363,7 +221,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
         "paddw %%mm3, %%mm5         \n\t"\
         "psubw %%mm2, %%mm0         \n\t"\
         "psubw %%mm5, %%mm1         \n\t"\
-        "movq %6, %%mm5             \n\t"\
+        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
         "paddw %%mm5, %%mm0         \n\t"\
         "paddw %%mm5, %%mm1         \n\t"\
         "psraw $3, %%mm0            \n\t"\
@@ -375,7 +233,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptr
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
-        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
         : "memory"\
     );\
 }\
@@ -387,7 +246,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8
 \
 static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
 {                                                                       \
-  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
+  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42)        \
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
@@ -468,7 +327,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 
 #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
 
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
 static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
 {
@@ -481,6 +340,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels8_mmx(dst, src, stride, 8);
 }
 
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
 static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
 {
@@ -493,9 +358,29 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels16_mmx(dst, src, stride, 16);
 }
 
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride)
+{
+    ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
 static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                      AVCodecContext *avctx)
 {
+#if HAVE_MMX_EXTERNAL
     c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
     c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
@@ -503,8 +388,8 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
 
     c->cavs_idct8_add = cavs_idct8_add_mmx;
     c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
+#endif /* HAVE_MMX_EXTERNAL */
 }
-#endif /* HAVE_MMX_INLINE */
 
 #define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
     c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -520,15 +405,6 @@ CAVS_MC(put_,  8, mmxext)
 CAVS_MC(put_, 16, mmxext)
 CAVS_MC(avg_,  8, mmxext)
 CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
-                                        AVCodecContext *avctx)
-{
-    DSPFUNC(put, 0, 16, mmxext);
-    DSPFUNC(put, 1,  8, mmxext);
-    DSPFUNC(avg, 0, 16, mmxext);
-    DSPFUNC(avg, 1,  8, mmxext);
-}
 #endif /* HAVE_MMXEXT_INLINE */
 
 #if HAVE_AMD3DNOW_INLINE
@@ -552,18 +428,36 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
 
 av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
 {
-#if HAVE_MMX_INLINE
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
+    if (X86_MMX(cpu_flags))
         cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+
 #if HAVE_AMD3DNOW_INLINE
     if (INLINE_AMD3DNOW(cpu_flags))
         cavsdsp_init_3dnow(c, avctx);
 #endif /* HAVE_AMD3DNOW_INLINE */
 #if HAVE_MMXEXT_INLINE
-    if (INLINE_MMXEXT(cpu_flags))
-        cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
+    if (INLINE_MMXEXT(cpu_flags)) {
+        DSPFUNC(put, 0, 16, mmxext);
+        DSPFUNC(put, 1,  8, mmxext);
+        DSPFUNC(avg, 0, 16, mmxext);
+        DSPFUNC(avg, 1,  8, mmxext);
+    }
+#endif
+#if HAVE_MMX_EXTERNAL
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+    }
+#endif
+#if HAVE_SSE2_EXTERNAL
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+
+        c->cavs_idct8_add = cavs_idct8_add_sse2;
+        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
+    }
+#endif
 }
diff --git a/libavcodec/x86/cavsidct.asm b/libavcodec/x86/cavsidct.asm
new file mode 100644
index 0000000..6c768c2
--- /dev/null
+++ b/libavcodec/x86/cavsidct.asm
@@ -0,0 +1,211 @@
+; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
+; Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
+;
+; MMX-optimized DSP functions, based on H.264 optimizations by
+; Michael Niedermayer and Loren Merritt
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Ronald S. Bultje <rsbultje@gmail.com>
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_4
+cextern pw_64
+
+SECTION .text
+
+%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
+%if %3 == 1
+    mova            m4, [%1+7*16]       ; m4 = src7
+    mova            m5, [%1+1*16]       ; m5 = src1
+    mova            m2, [%1+5*16]       ; m2 = src5
+    mova            m7, [%1+3*16]       ; m7 = src3
+%else
+    SWAP             1, 7
+    SWAP             4, 6
+%endif
+    mova            m0, m4
+    mova            m3, m5
+    mova            m6, m2
+    mova            m1, m7
+
+    paddw           m4, m4              ; m4 = 2*src7
+    paddw           m3, m3              ; m3 = 2*src1
+    paddw           m6, m6              ; m6 = 2*src5
+    paddw           m1, m1              ; m1 = 2*src3
+    paddw           m0, m4              ; m0 = 3*src7
+    paddw           m5, m3              ; m5 = 3*src1
+    paddw           m2, m6              ; m2 = 3*src5
+    paddw           m7, m1              ; m7 = 3*src3
+    psubw           m5, m4              ; m5 = 3*src1 - 2*src7 = a0
+    paddw           m7, m6              ; m7 = 3*src3 - 2*src5 = a1
+    psubw           m1, m2              ; m1 = 2*src3 - 3*src5 = a2
+    paddw           m3, m0              ; m3 = 2*src1 - 3*src7 = a3
+
+    mova            m4, m5
+    mova            m6, m7
+    mova            m0, m3
+    mova            m2, m1
+    SUMSUB_BA     w, 7, 5               ; m7 = a0 + a1, m5 = a0 - a1
+    paddw           m7, m3              ; m7 = a0 + a1 + a3
+    paddw           m5, m1              ; m5 = a0 - a1 + a2
+    paddw           m7, m7
+    paddw           m5, m5
+    paddw           m7, m6              ; m7 = b4
+    paddw           m5, m4              ; m5 = b5
+
+    SUMSUB_BA     w, 1, 3               ; m1 = a3 + a2, m3 = a3 - a2
+    psubw           m4, m1              ; m4 = a0 - a2 - a3
+    mova            m1, m4              ; m1 = a0 - a2 - a3
+    psubw           m3, m6              ; m3 = a3 - a2 - a1
+    paddw           m1, m1
+    paddw           m3, m3
+    psubw           m1, m2              ; m1 = b7
+    paddw           m3, m0              ; m3 = b6
+
+    mova            m2, [%1+2*16]       ; m2 = src2
+    mova            m6, [%1+6*16]       ; m6 = src6
+    mova            m4, m2
+    mova            m0, m6
+    psllw           m4, 2               ; m4 = 4*src2
+    psllw           m6, 2               ; m6 = 4*src6
+    paddw           m2, m4              ; m2 = 5*src2
+    paddw           m0, m6              ; m0 = 5*src6
+    paddw           m2, m2
+    paddw           m0, m0
+    psubw           m4, m0              ; m4 = 4*src2 - 10*src6 = a7
+    paddw           m6, m2              ; m6 = 4*src6 + 10*src2 = a6
+
+    mova            m2, [%1+0*16]       ; m2 = src0
+    mova            m0, [%1+4*16]       ; m0 = src4
+    SUMSUB_BA     w, 0, 2               ; m0 = src0 + src4, m2 = src0 - src4
+    psllw           m0, 3
+    psllw           m2, 3
+    paddw           m0, %2              ; add rounding bias
+    paddw           m2, %2              ; add rounding bias
+
+    SUMSUB_BA     w, 6, 0               ; m6 = a4 + a6, m0 = a4 - a6
+    SUMSUB_BA     w, 4, 2               ; m4 = a5 + a7, m2 = a5 - a7
+    SUMSUB_BA     w, 7, 6               ; m7 = dst0, m6 = dst7
+    SUMSUB_BA     w, 5, 4               ; m5 = dst1, m4 = dst6
+    SUMSUB_BA     w, 3, 2               ; m3 = dst2, m2 = dst5
+    SUMSUB_BA     w, 1, 0               ; m1 = dst3, m0 = dst4
+%endmacro
+
+INIT_MMX mmx
+cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
+    mov           cntd, 2
+    mov           tmpq, rsp
+
+.loop_1:
+    CAVS_IDCT8_1D  inq, [pw_4]
+    psraw           m7, 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+    mova        [tmpq], m7
+    TRANSPOSE4x4W    0, 2, 4, 6, 7
+    mova    [tmpq+1*8], m0
+    mova    [tmpq+3*8], m2
+    mova    [tmpq+5*8], m4
+    mova    [tmpq+7*8], m6
+    mova            m7, [tmpq]
+    TRANSPOSE4x4W    7, 5, 3, 1, 0
+    mova    [tmpq+0*8], m7
+    mova    [tmpq+2*8], m5
+    mova    [tmpq+4*8], m3
+    mova    [tmpq+6*8], m1
+
+    add            inq, mmsize
+    add           tmpq, 64
+    dec           cntd
+    jg .loop_1
+
+    mov           cntd, 2
+    mov           tmpq, rsp
+.loop_2:
+    CAVS_IDCT8_1D tmpq, [pw_64]
+    psraw           m7, 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova   [outq+0*16], m7
+    mova   [outq+1*16], m5
+    mova   [outq+2*16], m3
+    mova   [outq+3*16], m1
+    mova   [outq+4*16], m0
+    mova   [outq+5*16], m2
+    mova   [outq+6*16], m4
+    mova   [outq+7*16], m6
+
+    add           outq, mmsize
+    add           tmpq, mmsize
+    dec           cntd
+    jg .loop_2
+
+    RET
+
+INIT_XMM sse2
+cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
+    CAVS_IDCT8_1D  inq, [pw_4]
+    psraw           m7, 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+%if ARCH_X86_64
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, 8
+    mova    [rsp+4*16], m0
+%else
+    mova    [rsp+0*16], m4
+    TRANSPOSE8x8W    7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
+%endif
+    mova    [rsp+0*16], m7
+    mova    [rsp+2*16], m3
+    mova    [rsp+6*16], m4
+    CAVS_IDCT8_1D  rsp, [pw_64], 0
+    psraw           m7, 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova   [outq+0*16], m7
+    mova   [outq+1*16], m5
+    mova   [outq+2*16], m3
+    mova   [outq+3*16], m1
+    mova   [outq+4*16], m0
+    mova   [outq+5*16], m2
+    mova   [outq+6*16], m4
+    mova   [outq+7*16], m6
+    RET
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 6f7dd73..4bfb78c 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,35 +22,73 @@
 #include "libavutil/x86/asm.h" // for xmm_reg
 #include "constants.h"
 
-DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+                                                    0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+                                                    0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
+DECLARE_ASM_ALIGNED(32, const ymm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+                                                    0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
+DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
+DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+                                                    0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
 DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
                                                     0x0100010001000100ULL, 0x0100010001000100ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+                                                    0x0200020002000200ULL, 0x0200020002000200ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+                                                    0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+                                                    0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+                                                    0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+                                                    0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+                                                    0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+                                                    0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+                                                    0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
 
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+                                                    0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                    0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_2)    = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+                                                    0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                    0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg,  ff_pb_15)   = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL,
+                                                    0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
+                                                    0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+                                                    0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+                                                    0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+                                                    0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+                                                    0x0000200000002000ULL, 0x0000200000002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+                                                    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 59ff947..85da38b 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
 /*
  * MMX/SSE constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,30 +25,48 @@
 
 #include "libavutil/x86/asm.h"
 
-extern const uint64_t ff_wtwo;
-
+extern const ymm_reg  ff_pw_1;
+extern const ymm_reg  ff_pw_2;
 extern const xmm_reg  ff_pw_3;
-extern const xmm_reg  ff_pw_4;
+extern const ymm_reg  ff_pw_4;
 extern const xmm_reg  ff_pw_5;
 extern const xmm_reg  ff_pw_8;
+extern const xmm_reg  ff_pw_9;
 extern const uint64_t ff_pw_15;
 extern const xmm_reg  ff_pw_16;
 extern const xmm_reg  ff_pw_18;
-extern const uint64_t ff_pw_20;
+extern const xmm_reg  ff_pw_20;
 extern const xmm_reg  ff_pw_32;
 extern const uint64_t ff_pw_42;
 extern const uint64_t ff_pw_53;
 extern const xmm_reg  ff_pw_64;
 extern const uint64_t ff_pw_96;
 extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
+extern const ymm_reg  ff_pw_255;
 extern const ymm_reg  ff_pw_256;
-extern const xmm_reg  ff_pw_512;
-extern const xmm_reg  ff_pw_m1;
+extern const ymm_reg  ff_pw_512;
+extern const ymm_reg  ff_pw_1023;
+extern const ymm_reg  ff_pw_1024;
+extern const ymm_reg  ff_pw_2048;
+extern const ymm_reg  ff_pw_4095;
+extern const ymm_reg  ff_pw_4096;
+extern const ymm_reg  ff_pw_8192;
+extern const ymm_reg  ff_pw_m1;
 
-extern const xmm_reg  ff_pb_1;
-extern const xmm_reg  ff_pb_3;
-extern const xmm_reg  ff_pb_F8;
+extern const ymm_reg  ff_pb_0;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_2;
+extern const ymm_reg  ff_pb_3;
+extern const ymm_reg  ff_pb_80;
+extern const ymm_reg  ff_pb_FE;
 extern const uint64_t ff_pb_FC;
 
+extern const xmm_reg  ff_ps_neg;
+
+extern const ymm_reg  ff_pd_1;
+extern const ymm_reg  ff_pd_16;
+extern const ymm_reg  ff_pd_32;
+extern const ymm_reg  ff_pd_8192;
+extern const ymm_reg  ff_pd_65535;
+
 #endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 11d45ae..0000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DCA_H
-#define AVCODEC_X86_DCA_H
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
-    __asm__ volatile (
-        "cvtsi2ss        %2, %%xmm0 \n\t"
-        "mulss           %3, %%xmm0 \n\t"
-        "movq          (%1), %%xmm1 \n\t"
-        "punpcklbw   %%xmm1, %%xmm1 \n\t"
-        "movaps      %%xmm1, %%xmm2 \n\t"
-        "punpcklwd   %%xmm1, %%xmm1 \n\t"
-        "punpckhwd   %%xmm2, %%xmm2 \n\t"
-        "psrad          $24, %%xmm1 \n\t"
-        "psrad          $24, %%xmm2 \n\t"
-        "shufps  $0, %%xmm0, %%xmm0 \n\t"
-        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
-        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
-        "mulps       %%xmm0, %%xmm1 \n\t"
-        "mulps       %%xmm0, %%xmm2 \n\t"
-        "movaps      %%xmm1,  0(%0) \n\t"
-        "movaps      %%xmm2, 16(%0) \n\t"
-        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
-        XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
-    );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
-
-#endif /* AVCODEC_X86_DCA_H */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fa8d3cb..055361a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -1,336 +1,301 @@
 ;******************************************************************************
-;* SSE-optimized functions for the DCA decoder
-;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pf_inv16:  times 4 dd 0x3D800000 ; 1/16
-
 SECTION .text
 
-; %1=v0/v1  %2=in1  %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va          m1
-%define vb          m2
-%if %1
-%define OFFSET      0
-%else
-%define OFFSET      NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
-    mova        va, [cf0q + OFFSET]
-    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
-    mova        m4, [cf0q + OFFSET + mmsize]
-    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
-    mulps       va, %2
-    mulps       vb, %2
-%if %0 == 3
-    mulps       m4, %3
-    mulps       m0, %3
-    addps       va, m4
-    addps       vb, m0
-%endif
-    ; va = va1 va2 va3 va4
-    ; vb = vb1 vb2 vb3 vb4
-%if %1
-    SWAP        va, vb
-%endif
-    mova        m4, va
-    unpcklps    va, vb ; va3 vb3 va4 vb4
-    unpckhps    m4, vb ; va1 vb1 va2 vb2
-    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
-    movhlps     vb, m4 ; va1+3  vb1+3
-    addps       vb, m4 ; va0..4 vb0..4
-    movlps  [outq + count], vb
-%if %1
-    sub       cf0q, 8*NUM_COEF
-%endif
-    add      count, 8
-    jl   .loop%1
-%endmacro
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3))
 
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1       m3
-%define IN2       m5
-%define count     inq
-%define NUM_COEF  4*(2-%1)
-%define NUM_OUT   32*(%1+1)
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 1
+    sub     lfeq, 7*sizeof_float
+    mov    cnt1d, 32*sizeof_float
+    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    lea   coeffq, [coeffq+cnt1q*8]
+    add samplesq, cnt1q
+    neg    cnt1q
 
-    movu     IN1, [inq + 4 - 1*mmsize]
-    shufps   IN1, IN1, q0123
-%if %1 == 0
-    movu     IN2, [inq + 4 - 2*mmsize]
-    shufps   IN2, IN2, q0123
-%endif
-
-    mov    count, -4*NUM_OUT
-    add     cf0q, 4*NUM_COEF*NUM_OUT
-    add     outq, 4*NUM_OUT
-    ; compute v0 first
-%if %1 == 0
-    FIR_LOOP   0, IN1, IN2
-%else
-    FIR_LOOP   0, IN1
-%endif
-    shufps   IN1, IN1, q0123
-    mov    count, -4*NUM_OUT
-    ; cf1 already correctly positioned
-    add     outq, 4*NUM_OUT          ; outq now at out2
-    sub     cf0q, 8*NUM_COEF
-%if %1 == 0
-    shufps   IN2, IN2, q0123
-    FIR_LOOP   1, IN2, IN1
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq+16]
+    cvtdq2ps  m5, [lfeq   ]
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
+%elif cpuflag(sse2)
+    movu      m4, [lfeq+16]
+    movu      m5, [lfeq   ]
+    cvtdq2ps  m4, m4
+    cvtdq2ps  m5, m5
+    pshufd    m7, m4, q0123
+    pshufd    m6, m5, q0123
 %else
-    FIR_LOOP   1, IN1
+    cvtpi2ps  m4, [lfeq+16]
+    cvtpi2ps  m0, [lfeq+24]
+    cvtpi2ps  m5, [lfeq   ]
+    cvtpi2ps  m1, [lfeq+8 ]
+    shufps    m4, m0, q1010
+    shufps    m5, m1, q1010
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
 %endif
-    RET
-%endmacro
 
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
+.inner_loop:
+%if ARCH_X86_64
+    movaps    m8, [coeffq+cnt1q*8   ]
+    movaps    m9, [coeffq+cnt1q*8+16]
+    movaps   m10, [coeffq+cnt1q*8+32]
+    movaps   m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+    movaps   m12, [coeffq+cnt1q*8+64]
+    movaps   m13, [coeffq+cnt1q*8+80]
+    movaps   m14, [coeffq+cnt1q*8+96]
+    movaps   m15, [coeffq+cnt1q*8+112]
+    mulps     m0, m7, m8
+    mulps     m1, m7, m10
+    mulps     m2, m7, m12
+    mulps     m3, m7, m14
+    fmaddps   m0, m6, m9, m0
+    fmaddps   m1, m6, m11, m1
+    fmaddps   m2, m6, m13, m2
+    fmaddps   m3, m6, m15, m3
 
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
-    pxor          %1, %1
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
 %else
-    xorps         %1, %1, %1
-%endif
-%endmacro
+    mulps     m0, m7, m8
+    mulps     m1, m6, m9
+    mulps     m2, m7, m10
+    mulps     m3, m6, m11
+    addps     m0, m1
+    addps     m2, m3
 
-%macro SHUF 3
-%if cpuflag(avx)
-    mova          %3, [%2 - 16]
-    vperm2f128    %1, %3, %3, 1
-    vshufps       %1, %1, %1, q0123
-%elif cpuflag(sse2)
-    pshufd        %1, [%2], q0123
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m7, [coeffq+cnt1q*8    ]
+    mulps     m1, m7, [coeffq+cnt1q*8+32 ]
+    mulps     m2, m7, [coeffq+cnt1q*8+64 ]
+    mulps     m3, m7, [coeffq+cnt1q*8+96 ]
+    fmaddps   m0, m6, [coeffq+cnt1q*8+16 ], m0
+    fmaddps   m1, m6, [coeffq+cnt1q*8+48 ], m1
+    fmaddps   m2, m6, [coeffq+cnt1q*8+80 ], m2
+    fmaddps   m3, m6, [coeffq+cnt1q*8+112], m3
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
 %else
-    mova          %1, [%2]
-    shufps        %1, %1, q0123
-%endif
-%endmacro
+    mulps     m0, m7, [coeffq+cnt1q*8   ]
+    mulps     m1, m6, [coeffq+cnt1q*8+16]
+    mulps     m2, m7, [coeffq+cnt1q*8+32]
+    mulps     m3, m6, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif
+%endif; ARCH
 
-%macro INNER_LOOP   1
-    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
-    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
-    ;~ b += window[i + j + 16] * (synth_buf[i + j])
-    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
-    mova          m6, [ptr1 + j]
-%if ARCH_X86_64
-    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
-    mova         m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
-    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
-    fnmaddps      m1, m5,  [win + %1 + j], m1
-%if ARCH_X86_64
-    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
-    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
-    mulps         m6, m6,  [win + %1 + j + 16 * 4]
-    mulps         m5, m5,  [win + %1 + j]
-%if ARCH_X86_64
-    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
-    mulps        m11, m11, [win + %1 + j + mmsize]
-%endif
-    addps         m2, m2, m6
-    subps         m1, m1, m5
-%if ARCH_X86_64
-    addps         m8, m8, m12
-    subps         m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
-    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
-    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
-    mova          m5, [ptr1 + j + 16 * 4]
 %if ARCH_X86_64
-    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
-    mova         m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
 %if cpuflag(fma3)
-    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
-    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
-    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
-    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
-    mulps         m5, m5,  [win + %1 + j + 32 * 4]
-    mulps         m6, m6,  [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
-    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
-    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
-    addps         m3, m3, m5
-    addps         m4, m4, m6
-%if ARCH_X86_64
-    addps         m9, m9, m11
-    addps        m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
-    sub            j, 64 * 4
-%endmacro
+    mulps     m8, m5
+    mulps    m10, m5
+    mulps    m12, m5
+    mulps    m14, m5
+    fmaddps   m8, m4, m9, m8
+    fmaddps  m10, m4, m11, m10
+    fmaddps  m12, m4, m13, m12
+    fmaddps  m14, m4, m15, m14
 
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-;                                  const float window[512], float out[32],
-;                                  intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
-                              synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
-    movd       scale, scalem
-    SPLATD        m0
-%else
-    VBROADCASTSS  m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ  r4q
+    haddps   m10, m8
+    haddps   m14, m12
+    haddps   m14, m10
+    movaps [samplesq+cnt2q], m14
 %else
-    SPLATD      xmm0
-%if cpuflag(avx)
-    vinsertf128   m0, m0, xmm0, 1
-%endif
-%define OFFQ  offq
-%endif
-    ; prepare inner counter limit 1
-    mov          r5q, 480
-    sub          r5q, offmp
-    and          r5q, -64
-    shl          r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
-    mov         OFFQ, r5q
-%define i        r5q
-    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
+    mulps     m8, m5
+    mulps     m9, m4
+    mulps    m10, m5
+    mulps    m11, m4
+    addps     m8, m9
+    addps    m10, m11
+
+    unpckhps m11, m10, m8
+    unpcklps m10, m8
+    addps    m11, m10
+    movhlps   m8, m11
+    addps     m8, m11
+    movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m5, [coeffq+cnt1q*8    ]
+    mulps     m1, m5, [coeffq+cnt1q*8+32 ]
+    mulps     m2, m5, [coeffq+cnt1q*8+64 ]
+    mulps     m3, m5, [coeffq+cnt1q*8+96 ]
+    fmaddps   m0, m4, [coeffq+cnt1q*8+16 ], m0
+    fmaddps   m1, m4, [coeffq+cnt1q*8+48 ], m1
+    fmaddps   m2, m4, [coeffq+cnt1q*8+80 ], m2
+    fmaddps   m3, m4, [coeffq+cnt1q*8+112], m3
+
+    haddps    m1, m0
+    haddps    m3, m2
+    haddps    m3, m1
+    movaps [samplesq+cnt2q], m3
 %else
-%define i 0
-%define OFFQ  r5q
-%endif
+    mulps     m0, m5, [coeffq+cnt1q*8   ]
+    mulps     m1, m4, [coeffq+cnt1q*8+16]
+    mulps     m2, m5, [coeffq+cnt1q*8+32]
+    mulps     m3, m4, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+
+    unpckhps  m3, m2, m0
+    unpcklps  m2, m0
+    addps     m3, m2
+    movhlps   m0, m3
+    addps     m0, m3
+    movlps [samplesq+cnt2q], m0
+%endif
+%endif; ARCH
+
+    sub    cnt2d, 8 + FMA3_OFFSET
+    add    cnt1q, 8 + FMA3_OFFSET
+    jl .inner_loop
+
+    add     lfeq, 4
+    add samplesq,  64*sizeof_float
+    mov    cnt1q, -32*sizeof_float
+    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
+    sub nblocksd, 1
+    jg .loop
+    RET
+%endmacro
 
-%define buf2     synth_buf2q
-%if ARCH_X86_32
-    mov         buf2, synth_buf2mp
-%endif
-.mainloop:
-    ; m1 = a  m2 = b  m3 = c  m4 = d
-    SETZERO       m3
-    SETZERO       m4
-    mova          m1, [buf2 + i]
-    mova          m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
-%define ptr1     r0q
-%define ptr2     r1q
-%define win      r2q
-%define j        r3q
-    mov          win, windowm
-    mov         ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
-    add          win, i
-    add         ptr1, i
+INIT_XMM sse
+LFE_FIR0_FLOAT
 %endif
-%else ; ARCH_X86_64
-%define ptr1     r6q
-%define ptr2     r7q ; must be loaded
-%define win      r8q
-%define j        r9q
-    SETZERO       m9
-    SETZERO      m10
-    mova          m7, [buf2 + i + mmsize]
-    mova          m8, [buf2 + i + mmsize + 16 * 4]
-    lea          win, [windowq + i]
-    lea         ptr1, [synth_bufq + i]
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
 %endif
-    mov         ptr2, synth_bufmp
-    ; prepare the inner loop counter
-    mov            j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub         ptr2, i
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
 %endif
-.loop1:
-    INNER_LOOP  0
-    jge       .loop1
 
-    mov            j, 448 * 4
-    sub            j, OFFQ
-    jz          .end
-    sub         ptr1, j
-    sub         ptr2, j
-    add          win, OFFQ ; now at j-64, so define OFFSET
-    sub            j, 64 * 4
-.loop2:
-    INNER_LOOP  64 * 4
-    jge       .loop2
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 2
+    sub     lfeq, 3*sizeof_float
+    mov    cnt1d, 64*sizeof_float
+    mov    cnt2d, 64*sizeof_float-16
+    lea   coeffq, [coeffq+cnt1q*4]
+    add samplesq, cnt1q
+    neg    cnt1q
 
-.end:
-%if ARCH_X86_32
-    mov         buf2, synth_buf2m ; needed for next iteration anyway
-    mov         outq, outmp       ; j, which will be set again during it
-%endif
-    ;~ out[i]      = a * scale;
-    ;~ out[i + 16] = b * scale;
-    mulps         m1, m1, scale
-    mulps         m2, m2, scale
-%if ARCH_X86_64
-    mulps         m7, m7, scale
-    mulps         m8, m8, scale
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq]
+    shufps    m5, m4, m4, q0123
+%elif cpuflag(sse2)
+    movu      m4, [lfeq]
+    cvtdq2ps  m4, m4
+    pshufd    m5, m4, q0123
 %endif
-    ;~ synth_buf2[i]      = c;
-    ;~ synth_buf2[i + 16] = d;
-    mova   [buf2 + i +  0 * 4], m3
-    mova   [buf2 + i + 16 * 4], m4
+
+.inner_loop:
+    movaps    m6, [coeffq+cnt1q*4   ]
+    movaps    m7, [coeffq+cnt1q*4+16]
+    mulps     m0, m5, m6
+    mulps     m1, m5, m7
 %if ARCH_X86_64
-    mova   [buf2 + i +  0 * 4 + mmsize], m9
-    mova   [buf2 + i + 16 * 4 + mmsize], m10
+    movaps    m8, [coeffq+cnt1q*4+32]
+    movaps    m9, [coeffq+cnt1q*4+48]
+    mulps     m2, m5, m8
+    mulps     m3, m5, m9
+%else
+    mulps     m2, m5, [coeffq+cnt1q*4+32]
+    mulps     m3, m5, [coeffq+cnt1q*4+48]
 %endif
-    ;~ out[i]      = a;
-    ;~ out[i + 16] = a;
-    mova   [outq + i +  0 * 4], m1
-    mova   [outq + i + 16 * 4], m2
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
+
+    mulps     m6, m4
+    mulps     m7, m4
 %if ARCH_X86_64
-    mova   [outq + i +  0 * 4 + mmsize], m7
-    mova   [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub            i, (ARCH_X86_64 + 1) * mmsize
-    jge    .mainloop
+    mulps     m8, m4
+    mulps     m9, m4
+
+    haddps    m6, m7
+    haddps    m8, m9
+    haddps    m6, m8
+%else
+    mulps     m2, m4, [coeffq+cnt1q*4+32]
+    mulps     m3, m4, [coeffq+cnt1q*4+48]
+
+    haddps    m6, m7
+    haddps    m2, m3
+    haddps    m6, m2
 %endif
+    movaps [samplesq+cnt2q], m6
+
+    sub    cnt2d, 16
+    add    cnt1q, 16
+    jl .inner_loop
+
+    add     lfeq, sizeof_float
+    add samplesq, 128*sizeof_float
+    mov    cnt1q, -64*sizeof_float
+    mov    cnt2d,  64*sizeof_float-16
+    sub nblocksd, 1
+    jg .loop
     RET
 %endmacro
 
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
 %endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 01a89fe..fc10fb8 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,18 @@
 /*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,66 +21,32 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
-
-av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
-        s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
-    }
-}
-
+#define LFE_FIR_FLOAT_FUNC(opt)                                               \
+void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks);
 
-#define SYNTH_FILTER_FUNC(opt)                                                 \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
-                                 const float window[512],                      \
-                                 float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct,                              \
-                               float *synth_buf_ptr, int *synth_buf_offset,    \
-                               float synth_buf2[32], const float window[512],  \
-                               float out[32], const float in[32], float scale) \
-{                                                                              \
-    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
-                                                                               \
-    imdct->imdct_half(imdct, synth_buf, in);                                   \
-                                                                               \
-    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
-                                out, *synth_buf_offset, scale);                \
-                                                                               \
-    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
-}                                                                              \
+LFE_FIR_FLOAT_FUNC(sse)
+LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
+LFE_FIR_FLOAT_FUNC(avx)
+LFE_FIR_FLOAT_FUNC(fma3)
 
-#if HAVE_X86ASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_X86ASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 {
-#if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->synth_filter_float = synth_filter_sse;
-    }
-#endif
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        s->synth_filter_float = synth_filter_sse2;
-    }
-    if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        s->synth_filter_float = synth_filter_avx;
-    }
-    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
-        s->synth_filter_float = synth_filter_fma3;
+    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
+    if (EXTERNAL_SSE3(cpu_flags))
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+    if (EXTERNAL_AVX(cpu_flags)) {
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
     }
-#endif /* HAVE_X86ASM */
+    if (EXTERNAL_FMA3(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
 }
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index cfd5f52..21e2f21 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
 ;* 32 point SSE-optimized DCT transform
 ;* Copyright (c) 2010 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -190,6 +190,7 @@ ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
 
 INIT_YMM avx
 SECTION .text
+%if HAVE_AVX_EXTERNAL
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
 cglobal dct32_float, 2,3,8, out, in, tmp
     ; pass 1
@@ -262,6 +263,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
 INIT_XMM
     PASS6_AND_PERMUTE
     RET
+%endif
 
 %if ARCH_X86_64
 %define SPILL SWAP
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index b2e43a9..c31ef92 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dirac_dwt.asm b/libavcodec/x86/dirac_dwt.asm
new file mode 100644
index 0000000..22a5c2b
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* x86 optimized discrete wavelet trasnform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+SECTION .text
+
+; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3
+    paddw   %2, %4
+    psraw   %2, 2
+    psubw   %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3
+    paddw   m1, %2
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4
+%endif
+    psrad   m1, 4
+    psrad   m2, 4
+    packssdw m1, m2
+    paddw   m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]
+    paddw   m0, m1
+    psraw   m0, 1
+    paddw   m0, [b1q+2*widthq]
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+    mova    [b2q+2*widthq], m1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]
+    paddw   m0, [b4q+2*widthq]
+    paddw   m1, [b3q+2*widthq]
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2
+    psubw   m5, m1
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+    mov     %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+    mov     [tmpq+2*w2q+2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xq, xq
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    mova    m3, [pw_1]
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [tmpq + 2*xq], m0
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .lowpass_loop
+
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)
+    cmp    w2q, mmsize/2
+    jl      .end
+
+.highpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [tmpq  + 2*xq]
+    paddw   m1, m0
+
+    ; shift and interleave
+%if %2 == 1
+    paddw   m0, m3
+    paddw   m1, m3
+    psraw   m0, 1
+    psraw   m1, 1
+%endif
+    mova    m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m0
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .highpass_loop
+.end:
+    REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xd, xd
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    movu    m4, [bq+wq]
+    mova    m7, [pw_2]
+    pslldq  m4, 14
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    mova    m2, m1
+    palignr m1, m4, 14
+    mova    m4, m2
+    COMPOSE_53iL0 m0, m1, m2, m7
+    mova    [tmpq + 2*xq], m0
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .lowpass_loop
+
+    EDGE_EXTENSION 1, 2, xw
+    ; leave the last up to 7 (sse) or 3 (mmx) values for C
+    xor     xd, xd
+    and    w2d, ~(mmsize/2 - 1)
+    cmp    w2d, mmsize/2
+    jl      .end
+
+    mova    m7, [tmpq-mmsize]
+    mova    m0, [tmpq]
+    mova    m5, [pw_1]
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+.highpass_loop:
+    mova    m6, m0
+    palignr m0, m7, 14
+    mova    m7, [tmpq + 2*xq + 16]
+    mova    m1, m7
+    mova    m2, m7
+    palignr m1, m6, 2
+    palignr m2, m6, 4
+    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+    mova    m0, m7
+    mova    m7, m6
+
+    ; shift and interleave
+    paddw   m6, m5
+    paddw   m1, m5
+    psraw   m6, 1
+    psraw   m1, 1
+    mova    m2, m6
+    punpcklwd m6, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m6
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .highpass_loop
+.end:
+    REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/libavcodec/x86/dirac_dwt_init.c b/libavcodec/x86/dirac_dwt_init.c
new file mode 100644
index 0000000..49a6380
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init.c
@@ -0,0 +1,229 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+                                           uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    int16_t *b3 = (int16_t *)_b3; \
+    int16_t *b4 = (int16_t *)_b4; \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+                                          uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    int16_t *b3 = (int16_t *)_b3; \
+    int16_t *b4 = (int16_t *)_b4; \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+\
+    for(i=width_align; i<width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+    } \
+\
+    ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    int16_t *b = (int16_t *)_b; \
+    int16_t *tmp = (int16_t *)_tmp; \
+\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    int16_t *b = (int16_t *)_b; \
+    int16_t *tmp = (int16_t *)_tmp; \
+\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
+\
+
+#if HAVE_X86ASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    int w2= w>>1;
+    int x= w2 - (w2&7);
+    int16_t *b = (int16_t *)_b;
+    int16_t *tmp = (int16_t *)_tmp;
+
+    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+    for (; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+#endif
+
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_X86ASM
+  int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+    if (!(mm_flags & AV_CPU_FLAG_MMX))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
+        break;
+    }
+#endif
+
+    if (!(mm_flags & AV_CPU_FLAG_SSE2))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
+        break;
+    }
+
+    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+        break;
+    }
+#endif // HAVE_X86ASM
+}
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
new file mode 100644
index 0000000..cc8a26f
--- /dev/null
+++ b/libavcodec/x86/diracdsp.asm
@@ -0,0 +1,347 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit:                times 8 dw 0x3ff
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+SECTION .text
+
+%macro UNPACK_ADD 6
+    mov%5   %1, %3
+    mov%6   m5, %4
+    mova    m4, %1
+    mova    %2, m5
+    punpcklbw %1, m7
+    punpcklbw m5, m7
+    punpckhbw m4, m7
+    punpckhbw %2, m7
+    paddw   %1, m5
+    paddw   %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov     src0q, srcq
+    lea     stridex3q, [3*strideq]
+    sub     src0q, stridex3q
+    pxor    m7, m7
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq], m0
+    add     dstq, mmsize
+    add     srcq, mmsize
+    add     src0q, mmsize
+    sub     widthd, mmsize
+    jg      .loop
+    RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec     widthd
+    pxor    m7, m7
+    and     widthd, ~(mmsize-1)
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq + widthq], m0
+    sub     widthd, mmsize
+    jge     .loop
+    RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+    mova    m0, [pb_80]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+    mov   r7d, r5m
+    mov   r8d, wd
+    %define wspill r8d
+    %define hd r7d
+%else
+    mov    r4m, wd
+    %define wspill r4m
+    %define hd r5mp
+%endif
+
+.loopy:
+    lea     src2q, [srcq+src_strideq]
+    lea     dst2q, [dstq+dst_strideq]
+.loopx:
+    sub      wd, mmsize
+    mova     m1, [srcq +2*wq]
+    mova     m2, [src2q+2*wq]
+    packsswb m1, [srcq +2*wq+mmsize]
+    packsswb m2, [src2q+2*wq+mmsize]
+    paddb    m1, m0
+    paddb    m2, m0
+    mova    [dstq +wq], m1
+    mova    [dst2q+wq], m2
+    jg      .loopx
+
+    lea   srcq, [srcq+src_strideq*2]
+    lea   dstq, [dstq+dst_strideq*2]
+    sub     hd, 2
+    mov     wd, wspill
+    jg      .loopy
+    RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+    mova    m0, [pw_32]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   strideq, strided
+    movsxd   idwt_strideq, idwt_strided
+    mov   r8d, wd
+    %define wspill r8d
+%else
+    mov    r5m, wd
+    %define wspill r5m
+%endif
+
+.loop:
+    sub     wd, mmsize
+    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
+    paddw   m1, m0
+    psraw   m1, 6
+    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+    paddw   m2, m0
+    psraw   m2, 6
+    paddw   m1, [idwtq+2*wq]
+    paddw   m2, [idwtq+2*wq+mmsize]
+    packuswb m1, m2
+    mova    [dstq +wq], m1
+    jg      .loop
+
+    lea   srcq, [srcq + 2*strideq]
+    add   dstq, strideq
+    lea  idwtq, [idwtq+ 2*idwt_strideq]
+    sub     hd, 1
+    mov     wd, wspill
+    jg      .loop
+    RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+    pxor        m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+    mova        m0, [srcq+i]
+    mova        m1, m0
+    punpcklbw   m0, m4
+    punpckhbw   m1, m4
+    mova        m2, [obmcq+i]
+    mova        m3, m2
+   punpcklbw   m2, m4
+    punpckhbw   m3, m4
+    pmullw      m0, m2
+    pmullw      m1, m3
+    movu        m2, [dstq+2*i]
+    movu        m3, [dstq+2*i+mmsize]
+    paddw       m0, m2
+    paddw       m1, m3
+    movu        [dstq+2*i], m0
+    movu        [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+    lea         srcq, [srcq+strideq]
+    lea         dstq, [dstq+2*strideq]
+    add         obmcq, 32
+    sub         yblend, 1
+    jg          .loop
+    RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
+
+INIT_XMM sse4
+
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
+cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
+    movd   m2, qfd
+    movd   m3, qsd
+    SPLATD m2
+    SPLATD m3
+    mov    r4, tot_hq
+    mov    r3, dstq
+
+    .loop_v:
+    mov    tot_hq, r4
+    mov    dstq,   r3
+
+    .loop_h:
+    movu   m0, [srcq]
+
+    pabsd  m1, m0
+    pmulld m1, m2
+    paddd  m1, m3
+    psrld  m1,  2
+    psignd m1, m0
+
+    movu   [dstq], m1
+
+    add    srcq, mmsize
+    add    dstq, mmsize
+    sub    tot_hd, 4
+    jg     .loop_h
+
+    add    r3, strideq
+    dec    tot_vd
+    jg     .loop_v
+
+    RET
+
+INIT_XMM sse4
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+%if ARCH_X86_64
+cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
+%else
+cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
+    %define  hd  r5mp
+%endif
+    shl      wd, 2
+    add    srcq, wq
+    neg      wq
+    mov     t2q, dstq
+    mov     t1q, wq
+    pxor     m2, m2
+    mova     m3, [clip_10bit]
+    mova     m4, [convert_to_unsigned_10bit]
+
+    .loop_h:
+    mov    dstq, t2q
+    mov      wq, t1q
+
+    .loop_w:
+    movu     m0, [srcq+wq+0*mmsize]
+    movu     m1, [srcq+wq+1*mmsize]
+
+    paddd    m0, m4
+    paddd    m1, m4
+    packusdw m0, m0, m1
+    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
+
+    movu     [dstq], m0
+
+    add      dstq, 1*mmsize
+    add      wq,   2*mmsize
+    jl       .loop_w
+
+    add    srcq, src_strideq
+    add     t2q, dst_strideq
+    sub      hd, 1
+    jg       .loop_h
+
+    RET
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
new file mode 100644
index 0000000..8cb84eb
--- /dev/null
+++ b/libavcodec/x86/diracdsp_init.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/diracdsp.h"
+#include "fpel.h"
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+
+void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
+#if HAVE_X86ASM
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                             \
+    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int);               \
+    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);                    \
+                                                                                             \
+    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
+                                          const uint8_t *src, int stride, int width, int height)   \
+    {                                                                                        \
+        while( height-- )                                                                    \
+        {                                                                                    \
+            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
+            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
+                                                                                             \
+            dsth += stride;                                                                  \
+            dstv += stride;                                                                  \
+            dstc += stride;                                                                  \
+            src  += stride;                                                                  \
+        }                                                                                    \
+    }
+
+#define PIXFUNC(PFX, IDX, EXT)                                                   \
+    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
+    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3) {\
+        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+    } else {\
+        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_put_dirac_pixels16_c(dst, src, stride, h);
+    else
+        ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_avg_dirac_pixels16_c(dst, src, stride, h);
+    else
+        ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_put_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+        ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
+        ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_avg_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+        ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
+        ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+
+#else // HAVE_X86ASM
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                     \
+    void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,              \
+                                   const uint8_t *src, int stride, int width, int height);
+
+#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
+
+#endif // HAVE_X86ASM
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+void ff_diracdsp_init_x86(DiracDSPContext* c)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(mm_flags)) {
+        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+        c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+        c->add_rect_clamped = ff_add_rect_clamped_mmx;
+        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
+#endif
+        PIXFUNC(put, 0, mmx);
+        PIXFUNC(avg, 0, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(mm_flags)) {
+        PIXFUNC(avg, 0, mmxext);
+    }
+
+    if (EXTERNAL_SSE2(mm_flags)) {
+        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+        c->add_rect_clamped = ff_add_rect_clamped_sse2;
+        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+    }
+
+    if (EXTERNAL_SSE4(mm_flags)) {
+        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
+        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+    }
+}
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index 091f322..b4f7595 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 ;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd..fd6f150 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm
new file mode 100644
index 0000000..3bf240c
--- /dev/null
+++ b/libavcodec/x86/exrdsp.asm
@@ -0,0 +1,118 @@
+;******************************************************************************
+;* X86 Optimized functions for Open Exr Decoder
+;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
+;*
+;* reorder_pixels, predictor based on patch by John Loy
+;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
+;*
+;* predictor AVX/AVX2 by Henrik Gramner
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pb_15
+cextern pb_80
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+%macro REORDER_PIXELS 0
+cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
+    lea                              src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
+    add                               dstq, sizeq         ; dst offset by size
+    shr                              sizeq, 1             ; half_size
+    add                              src1q, sizeq         ; offset src by half_size
+    neg                              sizeq                ; size = offset for dst, src1, src2
+.loop:
+
+    mova                                m0, [src1q+sizeq]        ; load first part
+    movu                                m1, [src2q+sizeq]        ; load second part
+    SBUTTERFLY bw, 0, 1, 2                                       ; interleaved
+    mova                 [dstq+2*sizeq   ], xm0                  ; copy to dst
+    mova                 [dstq+2*sizeq+16], xm1
+%if cpuflag(avx2)
+    vperm2i128                          m0, m0, m1, q0301
+    mova                 [dstq+2*sizeq+32], m0
+%endif
+    add     sizeq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+REORDER_PIXELS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+REORDER_PIXELS
+%endif
+
+
+;------------------------------------------------------------------------------
+; void ff_predictor(uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+%macro PREDICTOR 0
+cglobal predictor, 2,2,5, src, size
+    mova             m0, [pb_80]
+    mova            xm1, [pb_15]
+    mova            xm2, xm0
+    add            srcq, sizeq
+    neg           sizeq
+.loop:
+    pxor             m3, m0, [srcq + sizeq]
+    pslldq           m4, m3, 1
+    paddb            m3, m4
+    pslldq           m4, m3, 2
+    paddb            m3, m4
+    pslldq           m4, m3, 4
+    paddb            m3, m4
+    pslldq           m4, m3, 8
+%if mmsize == 32
+    paddb            m3, m4
+    paddb           xm2, xm3
+    vextracti128    xm4, m3, 1
+    mova [srcq + sizeq], xm2
+    pshufb          xm2, xm1
+    paddb           xm2, xm4
+    mova [srcq + sizeq + 16], xm2
+%else
+    paddb            m2, m3
+    paddb            m2, m4
+    mova [srcq + sizeq], m2
+%endif
+    pshufb          xm2, xm1
+    add           sizeq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+PREDICTOR
+
+INIT_XMM avx
+PREDICTOR
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+PREDICTOR
+%endif
diff --git a/libavcodec/x86/exrdsp_init.c b/libavcodec/x86/exrdsp_init.c
new file mode 100644
index 0000000..63b3480
--- /dev/null
+++ b/libavcodec/x86/exrdsp_init.c
@@ -0,0 +1,52 @@
+/*
+ * OpenEXR (.exr) image decoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/exrdsp.h"
+
+void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+
+void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+
+void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);
+
+void ff_predictor_avx(uint8_t *src, ptrdiff_t size);
+
+void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);
+
+av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        dsp->reorder_pixels = ff_reorder_pixels_sse2;
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        dsp->predictor = ff_predictor_ssse3;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        dsp->predictor = ff_predictor_avx;
+    }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        dsp->reorder_pixels = ff_reorder_pixels_avx2;
+        dsp->predictor      = ff_predictor_avx2;
+    }
+}
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index e01509e..112566d 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h
index c94a977..648cdc5 100644
--- a/libavcodec/x86/fdct.h
+++ b/libavcodec/x86/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c
index 4e8e4eb..0cb5fd6 100644
--- a/libavcodec/x86/fdctdsp_init.c
+++ b/libavcodec/x86/fdctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index 63e92f7..a671e8f 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -68,11 +68,12 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
 ps_m1p1: dd 1<<31, 0
 
+cextern ps_neg
+
 %assign i 16
-%rep 13
+%rep 14
 cextern cos_ %+ i
 %assign i i<<1
 %endrep
@@ -198,7 +199,7 @@ SECTION .text
     vextractf128  %4 %+ H(%5), %3, 0
     vextractf128   %4(%5 + 1), %2, 1
     vextractf128  %4 %+ H(%5 + 1), %3, 1
-%elif cpuflag(sse)
+%elif cpuflag(sse) || cpuflag(3dnow)
     mova     %3, %2
     unpcklps %2, %1
     unpckhps %3, %1
@@ -321,6 +322,7 @@ IF%1 mova  Z(1), m5
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 align 16
 fft8_avx:
     mova      m0, Z(0)
@@ -410,6 +412,8 @@ fft32_interleave_avx:
     jg .deint_loop
     ret
 
+%endif
+
 INIT_XMM sse
 
 align 16
@@ -553,6 +557,7 @@ DEFINE_ARGS zc, w, n, o1, o3
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 DECL_PASS pass_avx, PASS_BIG 1
 DECL_PASS pass_interleave_avx, PASS_BIG 0
 
@@ -563,6 +568,7 @@ cglobal fft_calc, 2,5,8
     FFT_DISPATCH _interleave %+ SUFFIX, r1
     REP_RET
 
+%endif
 
 INIT_XMM sse
 
@@ -650,6 +656,68 @@ cglobal fft_permute, 2,7,1
     jl      .loopcopy
     REP_RET
 
+%macro IMDCT_CALC_FUNC 0
+cglobal imdct_calc, 3,5,3
+    mov     r3d, [r0 + FFTContext.mdctsize]
+    mov     r4,  [r0 + FFTContext.imdcthalf]
+    add     r1,  r3
+    PUSH    r3
+    PUSH    r1
+%if ARCH_X86_32
+    push    r2
+    push    r1
+    push    r0
+%else
+    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
+%endif
+    call    r4
+%if ARCH_X86_32
+    add     esp, 12
+%else
+    add     rsp, 8+32*WIN64
+%endif
+    POP     r1
+    POP     r3
+    lea     r0, [r1 + 2*r3]
+    mov     r2, r3
+    sub     r3, mmsize
+    neg     r2
+    mova    m2, [ps_neg]
+.loop:
+%if mmsize == 8
+    PSWAPD  m0, [r1 + r3]
+    PSWAPD  m1, [r0 + r2]
+    pxor    m0, m2
+%else
+    mova    m0, [r1 + r3]
+    mova    m1, [r0 + r2]
+    shufps  m0, m0, 0x1b
+    shufps  m1, m1, 0x1b
+    xorps   m0, m2
+%endif
+    mova [r0 + r3], m1
+    mova [r1 + r2], m0
+    sub     r3, mmsize
+    add     r2, mmsize
+    jl      .loop
+%if cpuflag(3dnow)
+    femms
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC
+
 %if ARCH_X86_32
 INIT_MMX 3dnow
 %define mulps pfmul
@@ -684,7 +752,7 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
 %endif
 
 %assign n 1<<%1
-%rep 17-%1
+%rep 18-%1
 %assign n2 n/2
 %assign n4 n/4
 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
@@ -709,9 +777,11 @@ align 8
 dispatch_tab %+ fullsuffix: pointer list_of_fft
 %endmacro ; DECL_FFT
 
+%if HAVE_AVX_EXTERNAL
 INIT_YMM avx
 DECL_FFT 6
 DECL_FFT 6, _interleave
+%endif
 INIT_XMM sse
 DECL_FFT 5
 DECL_FFT 5, _interleave
@@ -724,70 +794,6 @@ DECL_FFT 4
 DECL_FFT 4, _interleave
 %endif
 
-%if CONFIG_MDCT
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
-    mov     r3d, [r0 + FFTContext.mdctsize]
-    mov     r4,  [r0 + FFTContext.imdcthalf]
-    add     r1,  r3
-    PUSH    r3
-    PUSH    r1
-%if ARCH_X86_32
-    push    r2
-    push    r1
-    push    r0
-%else
-    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
-    call    r4
-%if ARCH_X86_32
-    add     esp, 12
-%else
-    add     rsp, 8+32*WIN64
-%endif
-    POP     r1
-    POP     r3
-    lea     r0, [r1 + 2*r3]
-    mov     r2, r3
-    sub     r3, mmsize
-    neg     r2
-    mova    m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
-    PSWAPD  m0, [r1 + r3]
-    PSWAPD  m1, [r0 + r2]
-    pxor    m0, m2
-%else
-    mova    m0, [r1 + r3]
-    mova    m1, [r0 + r2]
-    shufps  m0, m0, 0x1b
-    shufps  m1, m1, 0x1b
-    xorps   m0, m2
-%endif
-    mova [r0 + r3], m1
-    mova [r1 + r2], m0
-    sub     r3, mmsize
-    add     r2, mmsize
-    jl      .loop
-%if cpuflag(3dnow)
-    femms
-    RET
-%else
-    REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
 INIT_XMM sse
 %undef mulps
 %undef addps
@@ -985,7 +991,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     sub   r4, r3
 %endif
 %if notcpuflag(3dnowext) && mmsize == 8
-    movd  m7, [ps_m1m1m1m1]
+    movd  m7, [ps_neg]
 %endif
 .pre:
 %if ARCH_X86_64 == 0
@@ -1073,6 +1079,7 @@ DECL_IMDCT
 %endif
 
 INIT_YMM avx
-DECL_IMDCT
 
-%endif ; CONFIG_MDCT
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 94405d0..398091e 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,4 +27,12 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 #endif /* AVCODEC_X86_FFT_H */
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index ed12909..928f1dc 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,23 +28,33 @@ av_cold void ff_fft_init_x86(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (s->nbits > 16)
+        return;
+
 #if ARCH_X86_32
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+        s->imdct_calc = ff_imdct_calc_3dnow;
+        s->imdct_half = ff_imdct_half_3dnow;
         s->fft_calc   = ff_fft_calc_3dnow;
     }
 
     if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+        s->imdct_calc = ff_imdct_calc_3dnowext;
+        s->imdct_half = ff_imdct_half_3dnowext;
         s->fft_calc   = ff_fft_calc_3dnowext;
     }
 #endif /* ARCH_X86_32 */
 
     if (EXTERNAL_SSE(cpu_flags)) {
+        s->imdct_calc  = ff_imdct_calc_sse;
+        s->imdct_half  = ff_imdct_half_sse;
         s->fft_permute = ff_fft_permute_sse;
         s->fft_calc    = ff_fft_calc_sse;
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
     }
 
     if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
+        s->imdct_half      = ff_imdct_half_avx;
         s->fft_calc        = ff_fft_calc_avx;
         s->fft_permutation = FF_FFT_PERM_AVX;
     }
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000..e285158
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd
+%else
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter]
+    movu [resq+iter],  m0
+    %assign iter iter+mmsize
+%endrep
+
+lea  resq,   [resq+orderq*4]
+lea  smpq,   [smpq+orderq*4]
+lea  coefsq, [coefsq+orderq*4]
+sub  length,  orderd
+movd m3,      r5m
+neg  orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
+        paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder
+
+    psrad  m0,     m3              ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
+    movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
+    psubd  m1,     m0              ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
+    movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
+
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
+jg .looplen
+RET
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000..7138611
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro PMACSDQL 5
+%if cpuflag(xop)
+    pmacsdql %1, %2, %3, %1
+%else
+    pmuldq   %2, %3
+    paddq    %1, %2
+%endif
+%endmacro
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+    sub    lend, pred_orderd
+    jle .ret
+    lea    decodedq, [decodedq+pred_orderq*4-8]
+    lea    coeffsq, [coeffsq+pred_orderq*4]
+    neg    pred_orderq
+    movd   m4, qlevelm
+ALIGN 16
+.loop_sample:
+    movd   m0, [decodedq+pred_orderq*4+8]
+    add    decodedq, 8
+    movd   m1, [coeffsq+pred_orderq*4]
+    pxor   m2, m2
+    pxor   m3, m3
+    lea    jq, [pred_orderq+1]
+    test   jq, jq
+    jz .end_order
+.loop_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    movd   m0, [decodedq+jq*4]
+    PMACSDQL m3, m1, m0, m3, m1
+    movd   m1, [coeffsq+jq*4]
+    inc    jq
+    jl .loop_order
+.end_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    psrlq  m2, m4
+    movd   m0, [decodedq]
+    paddd  m0, m2
+    movd   [decodedq], m0
+    sub  lend, 2
+    jl .ret
+    PMACSDQL m3, m1, m0, m3, m1
+    psrlq  m3, m4
+    movd   m1, [decodedq+4]
+    paddd  m1, m3
+    movd   [decodedq+4], m1
+    jg .loop_sample
+.ret:
+    REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+;                                                   int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm
+%endif
+    movd       m3, r4m
+    shl      lend, 2
+    mov      in1q, [in0q + gprsize]
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+    add      in1q, lenq
+    add      in0q, lenq
+    add      outq, lenq
+    neg      lenq
+
+align 16
+.loop:
+    mova       m0, [in0q + lenq]
+    mova       m1, [in1q + lenq]
+%ifidn %1, ms
+    psrad      m2, m1, 1
+    psubd      m0, m2
+%endif
+%ifnidn %1, indep2
+    p%4d       m2, m0, m1
+%endif
+    packssdw  m%2, m%2
+    packssdw  m%3, m%3
+    punpcklwd m%2, m%3
+    psllw     m%2, m3
+    mova [outq + lenq], m%2
+    add      lenq, 16
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm
+%endif
+    movd       m3, r4m
+    mov      in1q, [in0q + gprsize]
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+    sub      in1q, in0q
+
+align 16
+.loop:
+    mova       m0, [in0q]
+    mova       m1, [in0q + in1q]
+%ifidn %1, ms
+    psrad      m2, m1, 1
+    psubd      m0, m2
+%endif
+    p%5d       m2, m0, m1
+    pslld     m%2, m3
+    pslld     m%3, m3
+
+    SBUTTERFLY dq, %2, %3, %4
+
+    mova  [outq         ], m%2
+    mova  [outq + mmsize], m%3
+
+    add      in0q, mmsize
+    add      outq, mmsize*2
+    sub      lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+;                                            int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+    %define  lend  dword r3m
+%else
+    mov      lend, lenm
+%endif
+%endif
+    movd      m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+    sub      in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+    mova       m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+
+    punpcklqdq m6, m0, m2
+    punpckhqdq m2, m4
+    shufps     m4, m0, 0xe4
+    punpcklqdq m0, m1, m3
+    punpckhqdq m3, m5
+    shufps     m5, m1, 0xe4
+    SWAP 0,6,1,4,5,3
+%elif %2 == 4
+    TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+    packssdw   m0, [in0q + in4q]
+    packssdw   m1, [in0q + in5q]
+    packssdw   m2, [in0q + in6q]
+    packssdw   m3, [in0q + in7q]
+    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+    packssdw   m0, [in0q + in3q]
+    packssdw   m1, [in0q + in4q]
+    packssdw   m2, [in0q + in5q]
+    pshufd     m3, m0,     q1032
+    punpcklwd  m0, m1
+    punpckhwd  m1, m2
+    punpcklwd  m2, m3
+
+    shufps     m3, m0, m2, q2020
+    shufps     m0, m1,     q2031
+    shufps     m2, m1,     q3131
+    shufps     m1, m2, m3, q3120
+    shufps     m3, m0,     q0220
+    shufps     m0, m2,     q3113
+    SWAP 2, 0, 3
+%else ; %2 == 4
+    packssdw   m0, [in0q + in2q]
+    packssdw   m1, [in0q + in3q]
+    SBUTTERFLY wd, 0, 1, 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+    psll%4   m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+    mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+    add      in0q, mmsize
+    add      outq, mmsize*REPCOUNT
+    sub      lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000..1971f81
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+                        int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt)                                                      \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                             int len, int shift);                        \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16,  avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32,  avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+                                 int bps)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_sse4;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+        }
+    }
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_xop;
+    }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+    }
+#endif
+#endif /* HAVE_X86ASM */
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 2a3e4a5..8f62a0a 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -75,3 +75,50 @@ INIT_XMM sse
 INT32_TO_FLOAT_FMUL_SCALAR 5
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+;                                    const float *mul, int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+    shl     lend, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+    movss     m0, [mulq]
+    SPLATD    m0
+%if cpuflag(sse2)
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     mulq, 4
+    add     lenq, 32
+    jl .loop
+%if notcpuflag(sse2)
+    ;; cvtpi2ps switches to MMX even if the source is a memory location
+    ;; possible an error in documentation since every tested CPU disagrees with
+    ;; that. Use emms anyway since the vast majority of machines will use the
+    ;; SSE2 variant
+    emms
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 6306c8b..df09705 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,10 @@
 
 void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
 void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
 
 #endif /* HAVE_X86ASM */
 
@@ -41,9 +45,11 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
 
     if (EXTERNAL_SSE(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
     }
 #endif /* HAVE_X86ASM */
 }
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b1be289..961a158 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,84 +25,82 @@
 
 SECTION .text
 
-INIT_MMX mmxext
+%macro PAVGB_MMX 4
+    LOAD   %3, %1
+    por    %3, %2
+    pxor   %2, %1
+    pand   %2, %4
+    psrlq  %2, 1
+    psubb  %3, %2
+    SWAP   %2, %3
+%endmacro
+
 ; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN  mmsize
 %else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN  %2
 %endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
     lea          r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    pcmpeqd      m6, m6
+    paddb        m6, m6
+%endif
+%endif
 .loop:
-    OP           m0, [r1]
-    OP           m1, [r1+r2]
-    OP           m2, [r1+r2*2]
-    OP           m3, [r1+r4]
-    lea          r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+    LOAD         m0, [r1 + %%i]
+    LOAD         m1, [r1+r2 + %%i]
+    LOAD         m2, [r1+r2*2 + %%i]
+    LOAD         m3, [r1+r4 + %%i]
 %ifidn %1, avg
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
+%if notcpuflag(mmxext)
+    PAVGB_MMX    [r0 + %%i], m0, m4, m6
+    PAVGB_MMX    [r0+r2 + %%i], m1, m5, m6
+    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
+    PAVGB_MMX    [r0+r4 + %%i], m3, m5, m6
+%else
+    pavgb        m0, [r0 + %%i]
+    pavgb        m1, [r0+r2 + %%i]
+    pavgb        m2, [r0+r2*2 + %%i]
+    pavgb        m3, [r0+r4 + %%i]
+%endif
 %endif
-    OP         [r0], m0
-    OP      [r0+r2], m1
-    OP    [r0+r2*2], m2
-    OP      [r0+r4], m3
+    SAVE       [r0 + %%i], m0
+    SAVE    [r0+r2 + %%i], m1
+    SAVE  [r0+r2*2 + %%i], m2
+    SAVE    [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
     sub         r3d, 4
+    lea          r1, [r1+r2*4]
     lea          r0, [r0+r2*4]
     jne       .loop
     RET
 %endmacro
 
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
 
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
 
 INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415..4e83cf7 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,18 +22,24 @@
 #include <stddef.h>
 #include <stdint.h>
 
+void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
 void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
 void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
-void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
 void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index 813bcc2..0000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             "movq  8%0, %%mm0          \n\t"
-             "movq  8%1, %%mm1          \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, 8%0          \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"FF_REG_a"   \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add %%"FF_REG_a", %1           \n\t"
-        "add %%"FF_REG_a", %2           \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add %%"FF_REG_a", %1           \n\t"
-        "add %%"FF_REG_a", %2           \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"FF_REG_a, "memory"
-        );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"FF_REG_a"   \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add %%"FF_REG_a", %1           \n\t"
-        "add %%"FF_REG_a", %2           \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add %%"FF_REG_a", %1           \n\t"
-        "add %%"FF_REG_a", %2           \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"FF_REG_a, "memory"
-        );
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/g722dsp.asm b/libavcodec/x86/g722dsp.asm
new file mode 100644
index 0000000..a529422
--- /dev/null
+++ b/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs:  dw   3, -210,  -11, -805,  -11,  951,  53, 3876
+pw_qmf_coeffs2: dw  12, 3876, -156,  951,   32, -805, 362, -210
+pw_qmf_coeffs3: dw 362,    0 ,  32,    0, -156,    0,  12,    0
+pw_qmf_coeffs4: dw  53,    0,  -11,    0,  -11,    0,   3,    0
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+    movu m0, [prevq+mmsize*0]
+    movu m1, [prevq+mmsize*1]
+    movu m2, [prevq+mmsize*2]
+    punpcklwd m3, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m4, m2, m2
+    punpckhwd m2, m2
+    pmaddwd   m3, [pw_qmf_coeffs ]
+    pmaddwd   m0, [pw_qmf_coeffs2]
+    pmaddwd   m4, [pw_qmf_coeffs3]
+    pmaddwd   m2, [pw_qmf_coeffs4]
+    paddd     m0, m3
+    paddd     m2, m4
+    paddd     m0, m2
+    pshufd    m2, m0, q0032
+    paddd     m0, m2
+    pshufd    m0, m0, q0001
+    movq  [outq], m0
+    RET
diff --git a/libavcodec/x86/g722dsp_init.c b/libavcodec/x86/g722dsp_init.c
new file mode 100644
index 0000000..6146951
--- /dev/null
+++ b/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index cd726ba..77c8cf1 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
 ;******************************************************************************
 ;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab98..ab81063 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_cabac.c b/libavcodec/x86/h264_cabac.c
index 475c450..2edc6d7 100644
--- a/libavcodec/x86/h264_cabac.c
+++ b/libavcodec/x86/h264_cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,9 +33,15 @@
 
 #if HAVE_INLINE_ASM
 
+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
 //FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
 //as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define decode_significance decode_significance_x86
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
@@ -52,6 +58,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
     __asm__ volatile(
         "lea   "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -127,6 +134,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -135,7 +143,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "3:                                     \n\t"
 
         "mov %10, %0                            \n\t"
-        "movzbl (%0, %6), %k6                   \n\t"
+        "movzb (%0, %6), %6                     \n\t"
         "add %9, %6                             \n\t"
 
         BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
@@ -146,14 +154,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%15")
 
-        "mov %1, %k6                            \n\t"
+        "mov %1, %6                             \n\t"
         "test $1, %4                            \n\t"
         " jz 4f                                 \n\t"
 
 #ifdef BROKEN_RELOCATIONS
-        "movzbl %c14(%15, %q6), %k6\n\t"
+        "movzb %c14(%15, %q6), %6\n\t"
 #else
-        "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
 #endif
         "add %11, %6                            \n\t"
 
@@ -166,8 +174,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
                              "%15")
 
         "mov %2, %0                             \n\t"
-        "mov %1, %k6                            \n\t"
-        "movl %k6, (%0)                         \n\t"
+        "mov %1, %6                             \n\t"
+        "mov %k6, (%0)                          \n\t"
 
         "test $1, %4                            \n\t"
         " jnz 5f                                \n\t"
@@ -175,19 +183,19 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "add"FF_OPSIZE"  $4, %2                 \n\t"
 
         "4:                                     \n\t"
-        "addl $1, %k6                           \n\t"
-        "mov %k6, %1                            \n\t"
-        "cmpl $63, %k6                          \n\t"
+        "add $1, %6                             \n\t"
+        "mov %6, %1                             \n\t"
+        "cmp $63, %6                            \n\t"
         " jb 3b                                 \n\t"
         "mov %2, %0                             \n\t"
-        "movl %k6, (%0)                         \n\t"
+        "mov %k6, (%0)                          \n\t"
         "5:                                     \n\t"
         "addl %8, %k0                           \n\t"
         "shr $2, %k0                            \n\t"
-        : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
           "=&r"(bit), "+&r"(c->range), "=&r"(state)
         : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
-          "m"(sig_off), "m"(last_coeff_ctx_base),
+          REG64(sig_off), REG64(last_coeff_ctx_base),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end)),
           "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
@@ -195,6 +203,6 @@ static int decode_significance_8x8_x86(CABACContext *c,
     );
     return coeff_count;
 }
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index a9cac59..b5a78b5 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 ;*               2005-2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index ff53b91..34bc419 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -249,8 +249,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
 %define CHROMAMC_AVG  NOTHING
 INIT_XMM sse2
 CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 put
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 put
 CHROMA_MC2 put
@@ -258,8 +260,10 @@ CHROMA_MC2 put
 %define CHROMAMC_AVG  AVG
 INIT_XMM sse2
 CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 avg
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 avg
 CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 4b9cf85..6702ae9 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -37,11 +37,6 @@ cextern pb_0
 cextern pb_1
 cextern pb_3
 
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
-    [base], [base+stride], [base+stride*2], [base3], \
-    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
 %define PASS8ROWS(base, base3, stride, stride3, offset) \
     PASS8ROWS(base+offset, base3+offset, stride, stride3)
 
@@ -287,19 +282,18 @@ cextern pb_3
 ;                        int8_t *tc0)
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma_8, 5,5,10
-    movsxdifnidn  r1, r1d
+cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
     movd    m8, [r4] ; tc0
-    lea     r4, [r1*3]
-    dec     r2d        ; alpha-1
+    lea     r4, [stride_q*3]
+    dec     alpha_d        ; alpha-1
     neg     r4
-    dec     r3d        ; beta-1
-    add     r4, r0     ; pix-3*stride
+    dec     beta_d        ; beta-1
+    add     base3_q, pix_q     ; pix-3*stride
 
-    mova    m0, [r4+r1]   ; p1
-    mova    m1, [r4+2*r1] ; p0
-    mova    m2, [r0]      ; q0
-    mova    m3, [r0+r1]   ; q1
+    mova    m0, [base3_q + stride_q]   ; p1
+    mova    m1, [base3_q + 2*stride_q] ; p0
+    mova    m2, [pix_q]      ; q0
+    mova    m3, [pix_q + stride_q]   ; q1
     LOAD_MASK r2d, r3d
 
     punpcklbw m8, m8
@@ -309,24 +303,24 @@ cglobal deblock_v_luma_8, 5,5,10
     pandn   m9, m7
     pand    m8, m9
 
-    movdqa  m3, [r4] ; p2
+    movdqa  m3, [base3_q] ; p2
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand    m6, m9
     psubb   m7, m8, m6
     pand    m6, m8
-    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4
 
-    movdqa  m4, [r0+2*r1] ; q2
+    movdqa  m4, [pix_q + 2*stride_q] ; q2
     DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
     pand    m6, m9
     pand    m8, m6
     psubb   m7, m6
-    mova    m3, [r0+r1]
-    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+    mova    m3, [pix_q + stride_q]
+    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6
 
     DEBLOCK_P0_Q0
-    mova    [r4+2*r1], m1
-    mova    [r0], m2
+    mova    [base3_q + 2*stride_q], m1
+    mova    [pix_q], m2
     RET
 
 ;-----------------------------------------------------------------------------
@@ -336,7 +330,6 @@ cglobal deblock_v_luma_8, 5,5,10
 INIT_MMX cpuname
 cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     movsxd r7,  r1d
-    movsxdifnidn  r1, r1d
     lea    r8,  [r7+r7*2]
     lea    r6,  [r0-4]
     lea    r5,  [r0-4+r8]
@@ -384,10 +377,101 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     RET
 %endmacro
 
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
+    movsxd stride_q,   stride_d
+    dec    alpha_d
+    dec    beta_d
+    mov    base3_q,    pix_q
+    lea    stride3_q, [3*stride_q]
+    add    base3_q,    stride3_q
+
+    movq m0, [pix_q - 4]
+    movq m1, [pix_q + stride_q - 4]
+    movq m2, [pix_q + 2*stride_q - 4]
+    movq m3, [base3_q - 4]
+    movq m4, [base3_q + stride_q - 4]
+    movq m5, [base3_q + 2*stride_q - 4]
+    movq m6, [base3_q + stride3_q - 4]
+    movq m7, [base3_q + 4*stride_q - 4]
+
+    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+
+    %assign i 0
+    %rep 8
+        movq [rsp + 16*i], m %+ i
+        %assign i i+1
+    %endrep
+
+    ; p2 = m1 [rsp + 16]
+    ; p1 = m2 [rsp + 32]
+    ; p0 = m3 [rsp + 48]
+    ; q0 = m4 [rsp + 64]
+    ; q1 = m5 [rsp + 80]
+    ; q2 = m6 [rsp + 96]
+
+    SWAP 0, 2
+    SWAP 1, 3
+    SWAP 2, 4
+    SWAP 3, 5
+
+    LOAD_MASK alpha_d, beta_d
+    movd m8, [tc0_q]
+    punpcklbw m8, m8
+    pcmpeqb m9, m9
+    pcmpeqb m9, m8
+    pandn   m9, m7
+    pand    m8, m9
+
+    movdqa  m3, [rsp + 16] ; p2
+    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+    pand    m6, m9
+    psubb   m7, m8, m6
+    pand    m6, m8
+    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+    movdqa  m4, [rsp + 96] ; q2
+    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+    pand    m6, m9
+    pand    m8, m6
+    psubb   m7, m6
+    mova    m3, [rsp + 80]
+    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+    DEBLOCK_P0_Q0
+    SWAP 1, 3
+    SWAP 2, 4
+    movq m0, [rsp]
+    movq m1, [rsp + 16]
+    movq m2, [rsp + 32]
+    movq m5, [rsp + 80]
+    movq m6, [rsp + 96]
+    movq m7, [rsp + 112]
+
+    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
+    movq [pix_q - 4], m0
+    movq [pix_q + stride_q - 4], m1
+    movq [pix_q + 2*stride_q - 4], m2
+    movq [base3_q - 4], m3
+    movq [base3_q + stride_q - 4], m4
+    movq [base3_q + 2*stride_q - 4], m5
+    movq [base3_q + stride3_q - 4], m6
+    movq [base3_q + 4*stride_q - 4], m7
+
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
 DEBLOCK_LUMA
+
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
 DEBLOCK_LUMA
+%endif
 
 %else
 
@@ -397,7 +481,6 @@ DEBLOCK_LUMA
 ;                         int8_t *tc0)
 ;-----------------------------------------------------------------------------
 cglobal deblock_%1_luma_8, 5,5,8,2*%2
-    movsxdifnidn  r1, r1d
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
@@ -448,7 +531,6 @@ cglobal deblock_%1_luma_8, 5,5,8,2*%2
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
 cglobal deblock_h_luma_8, 0,5,8,0x60+12
-    movsxdifnidn  r1, r1d
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
@@ -503,8 +585,10 @@ INIT_MMX mmxext
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA v, 16
+%endif
 
 %endif ; ARCH
 
@@ -650,7 +734,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
 %else
 cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
 %endif
-    movsxdifnidn  r1, r1d
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
     dec     r2d        ; alpha-1
@@ -708,7 +791,6 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_luma_intra_8, 4,9,0,0x80
     movsxd r7,  r1d
-    movsxdifnidn  r1, r1d
     lea    r8,  [r7*3]
     lea    r6,  [r0-4]
     lea    r5,  [r0-4+r8]
@@ -778,8 +860,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
+%endif
 %if ARCH_X86_64 == 0
 INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
@@ -788,7 +872,6 @@ DEBLOCK_LUMA_INTRA v8
 INIT_MMX mmxext
 
 %macro CHROMA_V_START 0
-    movsxdifnidn  r1, r1d
     dec    r2d      ; alpha-1
     dec    r3d      ; beta-1
     mov    t5, r0
@@ -797,7 +880,6 @@ INIT_MMX mmxext
 %endmacro
 
 %macro CHROMA_H_START 0
-    movsxdifnidn  r1, r1d
     dec    r2d
     dec    r3d
     sub    r0, 2
@@ -844,7 +926,11 @@ cglobal deblock_h_chroma_8, 5,7
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call ff_chroma_inter_body_mmxext
+    LOAD_MASK  r2d, r3d
+    movd       m6, [r4] ; tc0
+    punpcklbw  m6, m6
+    pand       m7, m6
+    DEBLOCK_P0_Q0
     movq  m0, buf0
     movq  m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
@@ -862,7 +948,52 @@ ff_chroma_inter_body_mmxext:
     DEBLOCK_P0_Q0
     ret
 
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6
+    SUB rsp, (1+ARCH_X86_64*2)*mmsize
+    %if ARCH_X86_64
+        %define buf0 [rsp+16]
+        %define buf1 [rsp+8]
+    %else
+        %define buf0 r0m
+        %define buf1 r2m
+    %endif
+
+    movd m6, [r4]
+    punpcklbw m6, m6
+    movq [rsp], m6
+    CHROMA_H_START
 
+    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+    movq buf0, m0
+    movq buf1, m3
+    LOAD_MASK r2d, r3d
+    movd m6, [rsp]
+    punpcklwd m6, m6
+    pand m7, m6
+    DEBLOCK_P0_Q0
+    movq m0, buf0
+    movq m3, buf1
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+    lea r0, [r0+r1*8]
+    lea t5, [t5+r1*8]
+
+    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+    movq buf0, m0
+    movq buf1, m3
+    LOAD_MASK r2d, r3d
+    movd m6, [rsp+4]
+    punpcklwd m6, m6
+    pand m7, m6
+    DEBLOCK_P0_Q0
+    movq m0, buf0
+    movq m3, buf1
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+    ADD rsp, (1+ARCH_X86_64*2)*mmsize
+RET
 
 ; in: %1=p0 %2=p1 %3=q1
 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
@@ -875,9 +1006,6 @@ ff_chroma_inter_body_mmxext:
     pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
 %endmacro
 
-%define t5 r4
-%define t6 r5
-
 ;------------------------------------------------------------------------------
 ; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
 ;------------------------------------------------------------------------------
@@ -902,6 +1030,20 @@ cglobal deblock_h_chroma_intra_8, 4,6
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 
+cglobal deblock_h_chroma422_intra_8, 4, 6
+    CHROMA_H_START
+    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+    lea r0, [r0+r1*8]
+    lea t5, [t5+r1*8]
+
+    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+RET
+
 ALIGN 16
 ff_chroma_intra_body_mmxext:
     LOAD_MASK r2d, r3d
@@ -917,6 +1059,202 @@ ff_chroma_intra_body_mmxext:
     paddb  m2, m6
     ret
 
+%macro LOAD_8_ROWS 8
+    movd m0, %1
+    movd m1, %2
+    movd m2, %3
+    movd m3, %4
+    movd m4, %5
+    movd m5, %6
+    movd m6, %7
+    movd m7, %8
+%endmacro
+
+%macro STORE_8_ROWS 8
+    movd %1, m0
+    movd %2, m1
+    movd %3, m2
+    movd %4, m3
+    movd %5, m4
+    movd %6, m5
+    movd %7, m6
+    movd %8, m7
+%endmacro
+
+%macro TRANSPOSE_8x4B_XMM 0
+    punpcklbw m0, m1
+    punpcklbw m2, m3
+    punpcklbw m4, m5
+    punpcklbw m6, m7
+    punpcklwd m0, m2
+    punpcklwd m4, m6
+    punpckhdq m2, m0, m4
+    punpckldq m0, m4
+    MOVHL m1, m0
+    MOVHL m3, m2
+%endmacro
+
+%macro TRANSPOSE_4x8B_XMM 0
+    punpcklbw m0, m1
+    punpcklbw m2, m3
+    punpckhwd m4, m0, m2
+    punpcklwd m0, m2
+    MOVHL m6, m4
+    MOVHL m2, m0
+    pshufd m1, m0, 1
+    pshufd m3, m2, 1
+    pshufd m5, m4, 1
+    pshufd m7, m6, 1
+%endmacro
+
+%macro CHROMA_INTER_BODY_XMM 1
+    LOAD_MASK alpha_d, beta_d
+    movd m6, [tc0_q]
+    %rep %1
+        punpcklbw m6, m6
+    %endrep
+    pand m7, m6
+    DEBLOCK_P0_Q0
+%endmacro
+
+%macro CHROMA_INTRA_BODY_XMM 0
+    LOAD_MASK alpha_d, beta_d
+    mova    m5,  m1
+    mova    m6,  m2
+    pxor    m4,  m1, m3
+    pand    m4, [pb_1]
+    pavgb   m1,  m3
+    psubusb m1,  m4
+    pavgb   m1,  m0
+    pxor    m4,  m2, m0
+    pand    m4, [pb_1]
+    pavgb   m2,  m0
+    psubusb m2,  m4
+    pavgb   m2,  m3
+    psubb   m1,  m5
+    psubb   m2,  m6
+    pand    m1,  m7
+    pand    m2,  m7
+    paddb   m1,  m5
+    paddb   m2,  m6
+%endmacro
+
+%macro CHROMA_V_START_XMM 1
+    movsxdifnidn stride_q, stride_d
+    dec alpha_d
+    dec beta_d
+    mov %1, pix_q
+    sub %1, stride_q
+    sub %1, stride_q
+%endmacro
+
+%macro CHROMA_H_START_XMM 2
+    movsxdifnidn stride_q, stride_d
+    dec alpha_d
+    dec beta_d
+    lea %2, [3*stride_q]
+    mov %1,  pix_q
+    add %1,  %2
+%endmacro
+
+%macro DEBLOCK_CHROMA_XMM 1
+
+INIT_XMM %1
+
+cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
+    CHROMA_V_START_XMM r5
+    movq m0, [r5]
+    movq m1, [r5 + stride_q]
+    movq m2, [pix_q]
+    movq m3, [pix_q + stride_q]
+    CHROMA_INTER_BODY_XMM 1
+    movq [r5 + stride_q], m1
+    movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
+    CHROMA_H_START_XMM r5, r6
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 1
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_,
+    CHROMA_H_START_XMM r5, r6
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 2
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+
+    lea pix_q, [pix_q + 8*stride_q]
+    lea r5,    [r5    + 8*stride_q]
+    add tc0_q,  2
+
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+    TRANSPOSE_8x4B_XMM
+    movq [rsp], m0
+    movq [rsp + 8], m3
+    CHROMA_INTER_BODY_XMM 2
+    movq m0, [rsp]
+    movq m3, [rsp + 8]
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
+RET
+
+cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
+    CHROMA_V_START_XMM r4
+    movq m0, [r4]
+    movq m1, [r4 + stride_q]
+    movq m2, [pix_q]
+    movq m3, [pix_q + stride_q]
+    CHROMA_INTRA_BODY_XMM
+    movq [r4 + stride_q], m1
+    movq [pix_q], m2
+RET
+
+cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+    CHROMA_H_START_XMM r4, r5
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+    TRANSPOSE_8x4B_XMM
+    CHROMA_INTRA_BODY_XMM
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
+    CHROMA_H_START_XMM r4, r5
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+    TRANSPOSE_8x4B_XMM
+    CHROMA_INTRA_BODY_XMM
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+
+    lea pix_q, [pix_q + 8*stride_q]
+    lea r4,    [r4    + 8*stride_q]
+
+    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+    TRANSPOSE_8x4B_XMM
+    CHROMA_INTRA_BODY_XMM
+    TRANSPOSE_4x8B_XMM
+    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
+RET
+
+%endmacro ; DEBLOCK_CHROMA_XMM
+
+DEBLOCK_CHROMA_XMM sse2
+DEBLOCK_CHROMA_XMM avx
+
 ;-----------------------------------------------------------------------------
 ; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
 ;                                   int8_t ref[2][40], int16_t mv[2][40][2],
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 1a424b7..1af3257 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,34 +7,32 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
 SECTION .text
 
 cextern pw_2
 cextern pw_3
 cextern pw_4
+cextern pw_1023
+%define pw_pixel_max pw_1023
 
 ; out: %4 = |%1-%2|-%3
 ; clobbers: %5
@@ -162,7 +160,6 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
     %define ms2 [rsp+mmsize*2]
     %define am  [rsp+mmsize*3]
     %define bm  [rsp+mmsize*4]
-    movsxdifnidn  r1, r1d
     SUB        rsp, pad
     shl        r2d, 2
     shl        r3d, 2
@@ -220,7 +217,6 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
     %define p2m [rsp+mmsize*4]
     %define am  [rsp+mmsize*5]
     %define bm  [rsp+mmsize*6]
-    movsxdifnidn  r1, r1d
     SUB        rsp, pad
     shl        r2d, 2
     shl        r3d, 2
@@ -351,7 +347,6 @@ cglobal deblock_v_luma_10, 5,5,15
     %define mask0 m7
     %define mask1 m10
     %define mask2 m11
-    movsxdifnidn  r1, r1d
     shl        r2d, 2
     shl        r3d, 2
     LOAD_AB    m12, m13, r2d, r3d
@@ -380,7 +375,6 @@ cglobal deblock_v_luma_10, 5,5,15
     REP_RET
 
 cglobal deblock_h_luma_10, 5,7,15
-    movsxdifnidn  r1, r1d
     shl        r2d, 2
     shl        r3d, 2
     LOAD_AB    m12, m13, r2d, r3d
@@ -422,9 +416,11 @@ cglobal deblock_h_luma_10, 5,7,15
 
 INIT_XMM sse2
 DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_64
 %endif
+%endif
 
 %macro SWAPMOVA 2
 %ifid %1
@@ -496,7 +492,6 @@ DEBLOCK_LUMA_64
     CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
     %assign i i+1
 %endrep
-    movsxdifnidn  r1, r1d
     SUB    rsp, pad
 %endmacro
 
@@ -620,7 +615,6 @@ cglobal deblock_v_luma_intra_10, 4,7,16
     %define q2 m13
     %define aa m5
     %define bb m14
-    movsxdifnidn  r1, r1d
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
     neg     r4
@@ -674,7 +668,6 @@ cglobal deblock_h_luma_intra_10, 4,7,16
     %define p3 m4
     %define spill [rsp]
     %assign pad 24-(stack_offset&15)
-    movsxdifnidn  r1, r1d
     SUB     rsp, pad
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
@@ -722,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA_64
+%endif
 
 %endif
 
@@ -809,10 +804,12 @@ DEBLOCK_LUMA_INTRA
 INIT_XMM sse2
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
 %endif
+%endif
 
 ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
 ; out: %1=p0', %2=q0'
@@ -846,6 +843,83 @@ DEBLOCK_LUMA_INTRA
     mova [r0+2*r1], m2
 %endmacro
 
+; in: 8 rows of 4 words in %4..%11
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+    movq             m0, %1
+    movq             m2, %2
+    movq             m1, %3
+    movq             m3, %4
+
+    punpcklwd        m0, m2
+    punpcklwd        m1, m3
+    punpckhdq        m2, m0, m1
+    punpckldq        m0, m1
+
+    movq             m4, %5
+    movq             m6, %6
+    movq             m5, %7
+    movq             m3, %8
+
+    punpcklwd        m4, m6
+    punpcklwd        m5, m3
+    punpckhdq        m6, m4, m5
+    punpckldq        m4, m5
+
+    punpckhqdq       m1, m0, m4
+    punpcklqdq       m0, m4
+    punpckhqdq       m3, m2, m6
+    punpcklqdq       m2, m6
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 8
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
+    movq             %1, m0
+    movhps           %2, m0
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to store p1 and q1 values
+%macro CHROMA_H_LOAD 4
+    %if mmsize == 8
+        movq m0, [pix_q - 4]
+        movq m1, [pix_q +   stride_q - 4]
+        movq m2, [pix_q + 2*stride_q - 4]
+        movq m3, [%1 - 4]
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+    %endif
+    mova %3, m0
+    mova %4, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to load p1 and q1 values
+%macro CHROMA_H_STORE 4
+    mova m0, %3
+    mova m3, %4
+    %if mmsize == 8
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+        movq [pix_q - 4],              m0
+        movq [pix_q +   stride_q - 4], m1
+        movq [pix_q + 2*stride_q - 4], m2
+        movq [%1 - 4],                 m3
+    %else
+        TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+    %endif
+%endmacro
+
 %macro CHROMA_V_LOAD_TC 2
     movd        %1, [%2]
     punpcklbw   %1, %1
@@ -859,7 +933,6 @@ DEBLOCK_LUMA_INTRA
 ;                             int8_t *tc0)
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
-    movsxdifnidn  r1, r1d
     mov         r5, r0
     sub         r0, r1
     sub         r0, r1
@@ -895,7 +968,6 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
 ;                                   int beta)
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
-    movsxdifnidn  r1, r1d
     mov         r4, r0
     sub         r0, r1
     sub         r0, r1
@@ -919,6 +991,81 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
 %else
     RET
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
+;                             int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_
+    shl alpha_d,  2
+    shl beta_d,   2
+    mov r5,       pix_q
+    lea r6,      [3*stride_q]
+    add r5,       r6
+%if mmsize == 8
+    mov r6d,      2
+    .loop:
+%endif
+
+        CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
+        LOAD_AB          m4,  m5, alpha_d, beta_d
+        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
+        pxor             m4,  m4
+        CHROMA_V_LOAD_TC m6,  tc0_q
+        psubw            m6, [pw_3]
+        pmaxsw           m6,  m4
+        pand             m7,  m6
+        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
+        CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
+
+%if mmsize == 8
+        lea pix_q, [pix_q + 4*stride_q]
+        lea r5,    [r5 + 4*stride_q]
+        add tc0_q,  2
+        dec r6d
+    jg .loop
+%endif
+RET
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
+;                                int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_
+    shl alpha_d,  2
+    shl beta_d,   2
+
+    movd m0, [tc0_q]
+    punpcklbw m0, m0
+    psraw m0, 6
+    movq [rsp], m0
+
+    mov r5,       pix_q
+    lea r6,      [3*stride_q]
+    add r5,       r6
+
+    mov r4, -8
+    .loop:
+
+        CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
+        LOAD_AB          m4,  m5, alpha_d, beta_d
+        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
+        pxor             m4,  m4
+        movd             m6, [rsp + r4 + 8]
+        punpcklwd        m6,  m6
+        punpcklwd        m6,  m6
+        psubw            m6, [pw_3]
+        pmaxsw           m6,  m4
+        pand             m7,  m6
+        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
+        CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
+
+        lea pix_q, [pix_q + (mmsize/2)*stride_q]
+        lea r5,    [r5 +    (mmsize/2)*stride_q]
+        add r4, (mmsize/4)
+    jl .loop
+RET
+
 %endmacro
 
 %if ARCH_X86_64 == 0
@@ -927,5 +1074,7 @@ DEBLOCK_CHROMA
 %endif
 INIT_XMM sse2
 DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 73631ff..c54f9f1 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
 ;*          Holger Lubitz <hal@duncan.ol.sub.de>
 ;*          Min Chen <chenm001.163.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
@@ -65,7 +65,15 @@ SECTION .text
 
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     mova         m6, [pw_32]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %if mmsize == 8
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        punpcklwd m0, m1
+        punpcklwd m2, m3
+        SBUTTERFLY dq, 0, 2, 4
+        MOVHL m1, m0
+        MOVHL m3, m2
+    %endif
     paddw        m0, m6
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     pxor         m7, m7
@@ -87,10 +95,9 @@ cglobal h264_idct_add_8, 3, 3, 0
     RET
 
 %macro IDCT8_1D 2
-    mova         m0, m1
-    psraw        m1, 1
-    mova         m4, m5
-    psraw        m4, 1
+    psraw        m0, m1, 1
+    SWAP 0, 1
+    psraw        m4, m5, 1
     paddw        m4, m5
     paddw        m1, m0
     paddw        m4, m7
@@ -107,10 +114,9 @@ cglobal h264_idct_add_8, 3, 3, 0
     psubw        m0, m3
     psubw        m5, m7
 
-    mova         m7, m1
-    psraw        m1, 2
-    mova         m3, m4
-    psraw        m3, 2
+    psraw        m7, m1, 2
+    SWAP 7,1
+    psraw        m3, m4, 2
     paddw        m3, m0
     psraw        m0, 2
     paddw        m1, m5
@@ -118,10 +124,9 @@ cglobal h264_idct_add_8, 3, 3, 0
     psubw        m0, m4
     psubw        m7, m5
 
-    mova         m5, m6
-    psraw        m6, 1
-    mova         m4, m2
-    psraw        m4, 1
+    psraw        m5, m6, 1
+    SWAP 5,6
+    psraw        m4, m2, 1
     paddw        m6, m2
     psubw        m4, m5
 
@@ -695,7 +700,39 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
     add        r0mp, gprsize
 %endif
     call         h264_idct_add8_mmx_plane
-    RET
+    RET ; TODO: check rep ret after a function call
+
+cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
+%ifdef PIC
+    lea     picregq, [scan8_mem]
+%endif
+%if ARCH_X86_64
+    mov       dst2q, r0
+%endif
+
+    mov          r5, 16  ; i
+    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
+
+    call         h264_idct_add8_mmx_plane
+    add r5, 4
+    call         h264_idct_add8_mmx_plane
+
+%if ARCH_X86_64
+    add       dst2q, gprsize ; dest[1]
+%else
+    add        r0mp, gprsize
+%endif
+
+    add r5, 4   ; set to 32
+    add r2, 256 ; set to i * 16 * sizeof(dctcoef)
+
+    call         h264_idct_add8_mmx_plane
+    add r5, 4
+    call         h264_idct_add8_mmx_plane
+
+    RET ; TODO: check rep ret after a function call
 
 h264_idct_add8_mmxext_plane:
     movsxdifnidn r3, r3d
@@ -763,7 +800,7 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
     add        r0mp, gprsize
 %endif
     call h264_idct_add8_mmxext_plane
-    RET
+    RET ; TODO: check rep ret after a function call
 
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
@@ -846,7 +883,7 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
     add16_sse2_cycle 5, 0x24
     add16_sse2_cycle 6, 0x1e
     add16_sse2_cycle 7, 0x26
-    RET
+REP_RET
 
 %macro add16intra_sse2_cycle 2
     movzx       r0, word [r4+%2]
@@ -893,7 +930,7 @@ cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
     add16intra_sse2_cycle 5, 0x24
     add16intra_sse2_cycle 6, 0x1e
     add16intra_sse2_cycle 7, 0x26
-    RET
+REP_RET
 
 %macro add8_sse2_cycle 2
     movzx       r0, word [r4+%2]
@@ -948,7 +985,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
 %endif
     add8_sse2_cycle 2, 0x5c
     add8_sse2_cycle 3, 0x64
-    RET
+REP_RET
 
 ;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
 
@@ -1106,3 +1143,57 @@ INIT_MMX mmx
 IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
+
+%ifdef __NASM_VER__
+%if __NASM_MAJOR__ >= 2 && __NASM_MINOR__ >= 4
+%unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
+%endif
+%endif
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movd       %3, [%7]
+    movd       %4, [%7+%8]
+    psraw      %1, %6
+    psraw      %2, %6
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
+    paddw      %3, %1
+    paddw      %4, %2
+    packuswb   %3, %5
+    packuswb   %4, %5
+    movd     [%7], %3
+    movd  [%7+%8], %4
+%endmacro
+
+%macro DC_ADD_INIT 1
+    add      %1d, 32
+    sar      %1d, 6
+    movd     m0, %1d
+    pshuflw  m0, m0, 0
+    lea      %1, [3*stride_q]
+    pxor     m1, m1
+    psubw    m1, m0
+    packuswb m0, m0
+    packuswb m1, m1
+%endmacro
+
+%macro IDCT_XMM 1
+
+INIT_XMM %1
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    IDCT4_ADD    dst_q, block_q, stride_q
+RET
+
+cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    movsx             r3d, word [block_q]
+    mov   dword [block_q], 0
+    DC_ADD_INIT r3
+    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
+RET
+
+%endmacro
+
+IDCT_XMM sse2
+IDCT_XMM avx
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 432d74b..9fd05ab 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,32 +5,31 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pd_32:        times 4 dd 32
-
 SECTION .text
 
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pd_32
+
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
@@ -84,8 +83,10 @@ cglobal h264_idct_add_10, 3,3
 
 INIT_XMM sse2
 IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -118,9 +119,11 @@ add4x4_idct %+ SUFFIX:
 INIT_XMM sse2
 ALIGN 16
 ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 ALIGN 16
 ADD4x4IDCT
+%endif
 
 %macro ADD16_OP 2
     cmp          byte [r4+%2], 0
@@ -157,8 +160,10 @@ cglobal h264_idct_add16_10, 5,6
 
 INIT_XMM sse2
 IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -224,8 +229,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
 
 INIT_XMM sse2
 IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_DC_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -298,8 +305,10 @@ cglobal h264_idct_add16intra_10,5,7,8
 
 INIT_XMM sse2
 IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16INTRA_10
+%endif
 
 %assign last_block 36
 ;-----------------------------------------------------------------------------
@@ -336,8 +345,63 @@ cglobal h264_idct_add8_10,5,8,7
 
 INIT_XMM sse2
 IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD8
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
+;                               int16_t *block, int stride,
+;                               const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%assign last_block 44
+
+%macro IDCT_ADD8_422 0
+
+cglobal h264_idct_add8_422_10, 5, 8, 7
+    movsxdifnidn r3, r3d
+%if ARCH_X86_64
+    mov      r7, r0
+%endif
+
+    add      r2, 1024
+    mov      r0, [r0]
+    ADD16_OP_INTRA 16, 4+ 6*8
+    ADD16_OP_INTRA 18, 4+ 7*8
+    ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
+    ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
+    add      r2, 1024-128*4
+
+%if ARCH_X86_64
+    mov      r0, [r7+gprsize]
+%else
+    mov      r0, r0m
+    mov      r0, [r0+gprsize]
+%endif
+
+    ADD16_OP_INTRA 32, 4+11*8
+    ADD16_OP_INTRA 34, 4+12*8
+    ADD16_OP_INTRA 40, 4+13*8 ; i+4
+    ADD16_OP_INTRA 42, 4+14*8 ; i+4
+REP_RET
+    AC 16
+    AC 18
+    AC 24 ; i+4
+    AC 26 ; i+4
+    AC 32
+    AC 34
+    AC 40 ; i+4
+    AC 42 ; i+4
+
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD8_422
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD8_422
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -544,8 +608,10 @@ h264_idct8_add1_10 %+ SUFFIX:
 
 INIT_XMM sse2
 IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -585,5 +651,7 @@ cglobal h264_idct8_add4_10, 0,7,16
 
 INIT_XMM sse2
 IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 1ea97fa..f3aa317 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
 ;* Copyright (c) 2010 Loren Merritt
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -268,6 +268,43 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
     jg .loop
     REP_RET
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+    sub                       dstq, strideq
+    pmovzxbw                    m0, [dstq]
+    vpbroadcastb               xm1, [r0-1]
+    pmovzxbw                    m1, xm1
+    psubw                       m0, m1
+    mov                 iterationd, 4
+    lea                   stride3q, [strideq*3]
+.loop:
+    vpbroadcastb               xm1, [dstq+strideq*1-1]
+    vpbroadcastb               xm2, [dstq+strideq*2-1]
+    vpbroadcastb               xm3, [dstq+stride3q-1]
+    vpbroadcastb               xm4, [dstq+strideq*4-1]
+    pmovzxbw                    m1, xm1
+    pmovzxbw                    m2, xm2
+    pmovzxbw                    m3, xm3
+    pmovzxbw                    m4, xm4
+    paddw                       m1, m0
+    paddw                       m2, m0
+    paddw                       m3, m0
+    paddw                       m4, m0
+    vpackuswb                   m1, m1, m2
+    vpackuswb                   m3, m3, m4
+    vpermq                      m1, m1, q3120
+    vpermq                      m3, m3, q3120
+    movdqa        [dstq+strideq*1], xm1
+    vextracti128  [dstq+strideq*2], m1, 1
+    movdqa       [dstq+stride3q*1], xm3
+    vextracti128  [dstq+strideq*4], m3, 1
+    lea                       dstq, [dstq+strideq*4]
+    dec                 iterationd
+    jg .loop
+    REP_RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
@@ -2498,10 +2535,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
     pshufb     mm3, mm6
     pshufb     mm4, mm6
     pshufb     mm5, mm6
-    psubw      mm2, mm7
-    psubw      mm3, mm7
-    psubw      mm4, mm7
-    psubw      mm5, mm7
+    psubw      mm0, mm7
     paddw      mm2, mm0
     paddw      mm3, mm0
     paddw      mm4, mm0
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 7ba9828..629e0a7 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,18 +26,19 @@
 
 SECTION_RODATA
 
+cextern pw_1023
+%define pw_pixel_max pw_1023
 cextern pw_512
 cextern pw_16
 cextern pw_8
 cextern pw_4
 cextern pw_2
 cextern pw_1
+cextern pd_16
 
 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3:        times 8 dw -3
-pw_pixel_max: times 8 dw ((1 << 10)-1)
 pd_17:        times 4 dd 17
-pd_16:        times 4 dd 16
 
 SECTION .text
 
@@ -83,8 +84,10 @@ INIT_XMM sse2
 PRED4x4_DR
 INIT_XMM ssse3
 PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DR
+%endif
 
 ;------------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
@@ -121,8 +124,10 @@ INIT_XMM sse2
 PRED4x4_VR
 INIT_XMM ssse3
 PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VR
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
@@ -162,28 +167,14 @@ INIT_XMM sse2
 PRED4x4_HD
 INIT_XMM ssse3
 PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_HD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
-    movhlps %2, %1
-    paddd   %1, %2
-    pshuflw %2, %1, 0xE
-    paddd   %1, %2
-%else
-    pshufw  %2, %1, 0xE
-    paddd   %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
-    pmaddwd %1, [pw_1]
-    HADDD   %1, %2
-%endmacro
 
 INIT_MMX mmxext
 cglobal pred4x4_dc_10, 3, 3
@@ -232,8 +223,10 @@ cglobal pred4x4_down_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
@@ -260,8 +253,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
@@ -571,8 +566,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_TOP_DC
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
@@ -629,8 +626,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
 
 INIT_XMM sse2
 PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DC
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
@@ -663,8 +662,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
@@ -718,8 +719,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
@@ -785,8 +788,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_LEFT
 INIT_XMM ssse3
 PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_LEFT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
@@ -858,8 +863,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_RIGHT
 INIT_XMM ssse3
 PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
@@ -927,8 +934,10 @@ INIT_XMM sse2
 PRED8x8L_VERTICAL_RIGHT
 INIT_XMM ssse3
 PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
@@ -987,8 +996,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL_UP
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL_UP
+%endif
 
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 0e572b1..bdd5125 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
 PRED16x16(tm_vp8, 8, mmx)
 PRED16x16(tm_vp8, 8, mmxext)
 PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
 
 PRED8x8(top_dc, 8, mmxext)
 PRED8x8(dc_rv40, 8, mmxext)
@@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                 }
             }
         }
+
+        if(EXTERNAL_AVX2(cpu_flags)){
+            if (codec_id == AV_CODEC_ID_VP8) {
+                h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_8_avx2;
+            }
+        }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 4062e96..0d3dbba 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  * Copyright (c) 2011 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,10 +29,6 @@
 #include "fpel.h"
 
 #if HAVE_X86ASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
-void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
 void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
 void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -49,9 +45,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -284,7 +280,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin
 #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
 }\
@@ -296,7 +292,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uin
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
 }\
@@ -304,74 +300,74 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index 4557e5e..8722683 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,12 +26,13 @@
 
 SECTION_RODATA 32
 
+cextern pd_65535
+cextern pw_1023
+%define pw_pixel_max pw_1023
 cextern pw_16
 cextern pw_1
 cextern pb_0
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
 pad10: times 8 dw 10*1023
 pad20: times 8 dw 20*1023
 pad30: times 8 dw 30*1023
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
 tap1: times 4 dw  1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5,  1
-pd_0f: times 4 dd 0xffff
 
 SECTION .text
 
@@ -708,7 +708,7 @@ h%1_loop_op:
     psrad      m1, 10
     psrad      m2, 10
     pslld      m2, 16
-    pand       m1, [pd_0f]
+    pand       m1, [pd_65535]
     por        m1, m2
 %if num_mmregs <= 8
     pxor       m0, m0
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c725..2d287ba 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index f9da05b..0975d74 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8c..f924e55 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,11 +26,12 @@
 
 SECTION_RODATA 32
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
 sq_1: dq 1
       dq 0
 
 cextern pw_1
+cextern pw_1023
+%define pw_pixel_max pw_1023
 
 SECTION .text
 
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 0d5ff3d..36bf29d 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 134d594..08eb7ea 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,12 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                        int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, sse2)
+IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, sse2)
+IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
 IDCT_ADD_FUNC(8_dc, 10, sse2)
@@ -78,6 +82,11 @@ IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, avx)
 
+IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
+
+IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
+IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
+
 void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
 
@@ -92,39 +101,48 @@ void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
 
 #define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
-                                                               int stride,    \
+                                                               ptrdiff_t stride, \
                                                                int alpha,     \
                                                                int beta,      \
                                                                int8_t *tc0);
 #define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
-                                                               int stride,    \
+                                                               ptrdiff_t stride, \
                                                                int alpha,     \
                                                                int beta);
 
 #define LF_FUNCS(type, depth)                   \
-LF_FUNC(h,  chroma,       depth, mmxext)        \
-LF_IFUNC(h, chroma_intra, depth, mmxext)        \
-LF_FUNC(v,  chroma,       depth, mmxext)        \
-LF_IFUNC(v, chroma_intra, depth, mmxext)        \
-LF_FUNC(h,  luma,         depth, mmxext)        \
-LF_IFUNC(h, luma_intra,   depth, mmxext)        \
-LF_FUNC(h,  luma,         depth, sse2)          \
-LF_IFUNC(h, luma_intra,   depth, sse2)          \
-LF_FUNC(v,  luma,         depth, sse2)          \
-LF_IFUNC(v, luma_intra,   depth, sse2)          \
-LF_FUNC(h,  chroma,       depth, sse2)          \
-LF_IFUNC(h, chroma_intra, depth, sse2)          \
-LF_FUNC(v,  chroma,       depth, sse2)          \
-LF_IFUNC(v, chroma_intra, depth, sse2)          \
-LF_FUNC(h,  luma,         depth, avx)           \
-LF_IFUNC(h, luma_intra,   depth, avx)           \
-LF_FUNC(v,  luma,         depth, avx)           \
-LF_IFUNC(v, luma_intra,   depth, avx)           \
-LF_FUNC(h,  chroma,       depth, avx)           \
-LF_IFUNC(h, chroma_intra, depth, avx)           \
-LF_FUNC(v,  chroma,       depth, avx)           \
-LF_IFUNC(v, chroma_intra, depth, avx)
+LF_FUNC(h,  chroma,          depth, mmxext)     \
+LF_IFUNC(h, chroma_intra,    depth, mmxext)     \
+LF_FUNC(h,  chroma422,       depth, mmxext)     \
+LF_IFUNC(h, chroma422_intra, depth, mmxext)     \
+LF_FUNC(v,  chroma,          depth, mmxext)     \
+LF_IFUNC(v, chroma_intra,    depth, mmxext)     \
+LF_FUNC(h,  luma,            depth, mmxext)     \
+LF_IFUNC(h, luma_intra,      depth, mmxext)     \
+LF_FUNC(h,  luma,            depth, sse2)       \
+LF_IFUNC(h, luma_intra,      depth, sse2)       \
+LF_FUNC(v,  luma,            depth, sse2)       \
+LF_IFUNC(v, luma_intra,      depth, sse2)       \
+LF_FUNC(h,  chroma,          depth, sse2)       \
+LF_IFUNC(h, chroma_intra,    depth, sse2)       \
+LF_FUNC(h,  chroma422,       depth, sse2)       \
+LF_IFUNC(h, chroma422_intra, depth, sse2)       \
+LF_FUNC(v,  chroma,          depth, sse2)       \
+LF_IFUNC(v, chroma_intra,    depth, sse2)       \
+LF_FUNC(h,  luma,            depth, avx)        \
+LF_IFUNC(h, luma_intra,      depth, avx)        \
+LF_FUNC(v,  luma,            depth, avx)        \
+LF_IFUNC(v, luma_intra,      depth, avx)        \
+LF_FUNC(h,  chroma,          depth, avx)        \
+LF_IFUNC(h, chroma_intra,    depth, avx)        \
+LF_FUNC(h,  chroma422,       depth, avx)        \
+LF_IFUNC(h, chroma422_intra, depth, avx)        \
+LF_FUNC(v,  chroma,          depth, avx)        \
+LF_IFUNC(v, chroma_intra,    depth, avx)
+
+LF_FUNC(h, luma_mbaff, 8, sse2)
+LF_FUNC(h, luma_mbaff, 8, avx)
 
 LF_FUNCS(uint8_t,   8)
 LF_FUNCS(uint16_t, 10)
@@ -155,13 +173,13 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
 /* weighted prediction */
 
 #define H264_WEIGHT(W, OPT)                                             \
-void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride,         \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
                                       int height, int log2_denom,       \
                                       int weight, int offset);
 
 #define H264_BIWEIGHT(W, OPT)                                           \
 void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
-                                        int stride, int height,         \
+                                        ptrdiff_t stride, int height,   \
                                         int log2_denom, int weightd,    \
                                         int weights, int offset);
 
@@ -181,7 +199,7 @@ H264_BIWEIGHT_MMX(4)
 
 #define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
 void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
-                                                    int stride,         \
+                                                    ptrdiff_t stride,   \
                                                     int height,         \
                                                     int log2_denom,     \
                                                     int weight,         \
@@ -190,7 +208,7 @@ void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
 #define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
 void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                       uint8_t *src,     \
-                                                      int stride,       \
+                                                      ptrdiff_t stride, \
                                                       int height,       \
                                                       int log2_denom,   \
                                                       int weightd,      \
@@ -210,6 +228,7 @@ H264_BIWEIGHT_10_SSE(4,  10)
 av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
+#if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
@@ -224,8 +243,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
             c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
-            if (chroma_format_idc <= 1)
+            if (chroma_format_idc <= 1) {
                 c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+            } else {
+                c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
+            }
             c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
             if (cpu_flags & AV_CPU_FLAG_CMOV)
                 c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
@@ -244,6 +266,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             if (chroma_format_idc <= 1) {
                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
             }
 #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
             c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
@@ -279,6 +304,23 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+
+#if ARCH_X86_64
+            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
+#endif
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_sse2;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_sse2;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
+            } else {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_sse2;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
+            }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_sse2;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
@@ -289,12 +331,33 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#if ARCH_X86_64
+            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
+#endif
+
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_avx;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_avx;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
+            } else {
+                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
+                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
+            }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_avx;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
 #if ARCH_X86_32
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
+            }
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
             c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
@@ -307,8 +370,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
 
             c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
-            if (chroma_format_idc <= 1)
+            if (chroma_format_idc <= 1) {
                 c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+            } else {
+                c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
+            }
             c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
 #if HAVE_ALIGNED_STACK
             c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
@@ -325,6 +391,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
+            }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
@@ -347,8 +418,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
 
             c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
-            if (chroma_format_idc <= 1)
+            if (chroma_format_idc <= 1) {
                 c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+            } else {
+                c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
+            }
             c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
 #if HAVE_ALIGNED_STACK
             c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
@@ -357,6 +431,11 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
+            }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;
@@ -365,4 +444,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #endif /* HAVE_ALIGNED_STACK */
         }
     }
+#endif
 }
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
index 4701e9d..36d4d8e 100644
--- a/libavcodec/x86/hevc_add_res.asm
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -2,30 +2,30 @@
 ; * Provide SIMD optimizations for add_residual functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
-; * This file is part of Libav.
+; * This file is part of FFmpeg.
 ; *
-; * Libav is free software; you can redistribute it and/or
+; * FFmpeg is free software; you can redistribute it and/or
 ; * modify it under the terms of the GNU Lesser General Public
 ; * License as published by the Free Software Foundation; either
 ; * version 2.1 of the License, or (at your option) any later version.
 ; *
-; * Libav is distributed in the hope that it will be useful,
+; * FFmpeg is distributed in the hope that it will be useful,
 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ; * Lesser General Public License for more details.
 ; *
 ; * You should have received a copy of the GNU Lesser General Public
-; * License along with Libav; if not, write to the Free Software
+; * License along with FFmpeg; if not, write to the Free Software
 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ; ******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 32
-max_pixels_10:          times 16  dw ((1 << 10)-1)
-
 SECTION .text
 
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
 ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
 %macro ADD_RES_MMX_4_8 0
     mova              m0, [r1]
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 153eaf7..85ee480 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,9 +26,11 @@
 
 SECTION_RODATA
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_m2:        times 8 dw -2
-pd_1 :        times 4 dd  1
+cextern pw_1023
+%define pw_pixel_max_10 pw_1023
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_m2:           times 8 dw -2
+pd_1 :           times 4 dd  1
 
 cextern pw_4
 cextern pw_8
@@ -37,11 +39,6 @@ cextern pw_m1
 SECTION .text
 INIT_XMM sse2
 
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
-    [base], [base+stride], [base+stride*2], [base3], \
-    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
 ; in: 8 rows of 4 bytes in %4..%11
 ; out: 4 rows of 8 words in m0..m3
 %macro TRANSPOSE4x8B_LOAD 8
@@ -57,10 +54,10 @@ INIT_XMM sse2
     movd             m4, %5
     movd             m6, %6
     movd             m5, %7
-    movd             m7, %8
+    movd             m3, %8
 
     punpcklbw        m4, m6
-    punpcklbw        m5, m7
+    punpcklbw        m5, m3
     punpcklwd        m4, m5
 
     punpckhdq        m2, m0, m4
@@ -76,16 +73,10 @@ INIT_XMM sse2
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
 %macro TRANSPOSE8x4B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m6, m0, m2
-    punpcklwd        m0, m2
+    packuswb         m0, m2
+    packuswb         m1, m3
+    SBUTTERFLY bw, 0, 1, 2
+    SBUTTERFLY wd, 0, 1, 2
 
     movd             %1, m0
     pshufd           m0, m0, 0x39
@@ -95,13 +86,13 @@ INIT_XMM sse2
     pshufd           m0, m0, 0x39
     movd             %4, m0
 
-    movd             %5, m6
-    pshufd           m6, m6, 0x39
-    movd             %6, m6
-    pshufd           m6, m6, 0x39
-    movd             %7, m6
-    pshufd           m6, m6, 0x39
-    movd             %8, m6
+    movd             %5, m1
+    pshufd           m1, m1, 0x39
+    movd             %6, m1
+    pshufd           m1, m1, 0x39
+    movd             %7, m1
+    pshufd           m1, m1, 0x39
+    movd             %8, m1
 %endmacro
 
 ; in: 8 rows of 4 words in %4..%11
@@ -120,10 +111,10 @@ INIT_XMM sse2
     movq             m4, %5
     movq             m6, %6
     movq             m5, %7
-    movq             m7, %8
+    movq             m3, %8
 
     punpcklwd        m4, m6
-    punpcklwd        m5, m7
+    punpcklwd        m5, m3
     punpckhdq        m6, m4, m5
     punpckldq        m4, m5
 
@@ -136,32 +127,23 @@ INIT_XMM sse2
 
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 words in %1..%8
-%macro TRANSPOSE8x4W_STORE 8
-    pxor             m5, m5; zeros reg
-    CLIPW            m0, m5, [pw_pixel_max]
-    CLIPW            m1, m5, [pw_pixel_max]
-    CLIPW            m2, m5, [pw_pixel_max]
-    CLIPW            m3, m5, [pw_pixel_max]
+%macro TRANSPOSE8x4W_STORE 9
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
 
-    punpckhwd        m4, m0, m1
-    punpcklwd        m0, m1
-    punpckhwd        m5, m2, m3
-    punpcklwd        m2, m3
-    punpckhdq        m6, m0, m2
-    punpckldq        m0, m2
+    pxor             m5, m5; zeros reg
+    CLIPW            m0, m5, %9
+    CLIPW            m1, m5, %9
+    CLIPW            m2, m5, %9
+    CLIPW            m3, m5, %9
 
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m6
-    movhps           %4, m6
-
-    punpckhdq        m6, m4, m5
-    punpckldq        m4, m5
-
-    movq             %5, m4
-    movhps           %6, m4
-    movq             %7, m6
-    movhps           %8, m6
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 bytes in %1..%8
@@ -212,40 +194,20 @@ INIT_XMM sse2
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 bytes in %1..%8
 %macro TRANSPOSE8x8B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-    packuswb         m4, m4
-    packuswb         m5, m5
-    packuswb         m6, m6
-    packuswb         m7, m7
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m8, m0, m2
-    punpcklwd        m0, m2
+    packuswb         m0, m4
+    packuswb         m1, m5
+    packuswb         m2, m6
+    packuswb         m3, m7
+    TRANSPOSE2x4x4B   0, 1, 2, 3, 4
 
-    punpcklbw        m4, m5
-    punpcklbw        m6, m7
-
-    punpckhwd        m9, m4, m6
-    punpcklwd        m4, m6
-
-    punpckhdq       m10, m0, m4; 2, 3
-    punpckldq        m0, m4;   0, 1
-
-    punpckldq       m11, m8, m9;  4, 5
-    punpckhdq        m8, m9;   6, 7
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m10
-    movhps           %4, m10
-    movq             %5, m11
-    movhps           %6, m11
-    movq             %7, m8
-    movhps           %8, m8
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 words in %1..%8
@@ -264,18 +226,18 @@ INIT_XMM sse2
 
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 words in %1..%8
-%macro TRANSPOSE8x8W_STORE 8
+%macro TRANSPOSE8x8W_STORE 9
     TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
 
     pxor             m8, m8
-    CLIPW            m0, m8, [pw_pixel_max]
-    CLIPW            m1, m8, [pw_pixel_max]
-    CLIPW            m2, m8, [pw_pixel_max]
-    CLIPW            m3, m8, [pw_pixel_max]
-    CLIPW            m4, m8, [pw_pixel_max]
-    CLIPW            m5, m8, [pw_pixel_max]
-    CLIPW            m6, m8, [pw_pixel_max]
-    CLIPW            m7, m8, [pw_pixel_max]
+    CLIPW            m0, m8, %9
+    CLIPW            m1, m8, %9
+    CLIPW            m2, m8, %9
+    CLIPW            m3, m8, %9
+    CLIPW            m4, m8, %9
+    CLIPW            m5, m8, %9
+    CLIPW            m6, m8, %9
+    CLIPW            m7, m8, %9
 
     movdqu           %1, m0
     movdqu           %2, m1
@@ -318,13 +280,14 @@ ALIGN 16
     paddw            m5, m4;
 
     ;tc calculations
-    movd             m6, [r2]; tc0
-    add              r2, 4;
+    movq             m6, [tcq]; tc0
     punpcklwd        m6, m6
-    movd             m7, [r2]; tc1
-    punpcklwd        m7, m7
-    shufps           m6, m7, 0; tc0, tc1
+    pshufd           m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+    psignw           m4, m6, [pw_m1]; -tc0, -tc1
+%else
     pmullw           m4, m6, [pw_m1]; -tc0, -tc1
+%endif
     ;end tc calculations
 
     paddw            m5, [pw_4]; +4
@@ -362,11 +325,11 @@ ALIGN 16
 
     paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
 
-    pshufhw         m14, m9,  q0033 ;0b00001111;  0d3 0d3 0d0 0d0 in high
-    pshuflw         m14, m14, q0033 ;0b00001111;  1d3 1d3 1d0 1d0 in low
+    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
+    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low
 
-    pshufhw          m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
-    pshuflw          m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
+    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
 
     paddw           m14, m9; 0d0+0d3, 1d0+1d3
 
@@ -380,7 +343,7 @@ ALIGN 16
     psraw           m15, m13, 2;   beta >> 2
     psllw            m8, m9, 1;
     pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
-    movmskps        r14, m15;
+    movmskps        r6, m15;
     ;end weak / strong decision
 
     ; weak filter nd_p/q calculation
@@ -388,19 +351,15 @@ ALIGN 16
     psrld            m8, 16
     paddw            m8, m10
     movd            r7d, m8
-    and              r7, 0xffff; 1dp0 + 1dp3
     pshufd           m8, m8, 0x4E
     movd            r8d, m8
-    and              r8, 0xffff; 0dp0 + 0dp3
 
     pshufd           m8, m11, 0x31
     psrld            m8, 16
     paddw            m8, m11
     movd            r9d, m8
-    and              r9, 0xffff; 1dq0 + 1dq3
     pshufd           m8, m8, 0x4E
     movd           r10d, m8
-    and             r10, 0xffff; 0dq0 + 0dq3
     ; end calc for weak filter
 
     ; filtering mask
@@ -422,14 +381,13 @@ ALIGN 16
     shl             r11, %1 - 8
 %endif
     movd             m8, r11d; tc0
-    add             tcq, 4;
-    mov             r3d, [tcq];
+    mov             r3d, [tcq+4];
 %if %1 > 8
     shl              r3, %1 - 8
 %endif
-    movd             m9, r3d; tc1
     add            r11d, r3d; tc0 + tc1
     jz             .bypassluma
+    movd             m9, r3d; tc1
     punpcklwd        m8, m8
     punpcklwd        m9, m9
     shufps           m8, m9, 0; tc0, tc1
@@ -453,7 +411,7 @@ ALIGN 16
     psraw           m13, 3; beta >> 3
     pcmpgtw         m13, m12;
     movmskps        r11, m13;
-    and             r14, r11; strong mask , beta_2 and beta_3 comparisons
+    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
     ;----beta_3 comparison end-----
     ;----tc25 comparison---
     psubw           m12, m3, m4;      p0 - q0
@@ -464,23 +422,23 @@ ALIGN 16
 
     pcmpgtw          m8, m12; tc25 comparisons
     movmskps        r11, m8;
-    and             r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
     ;----tc25 comparison end---
-    mov             r11, r14;
+    mov             r11, r6;
     shr             r11, 1;
-    and             r14, r11; strong mask, bits 2 and 0
+    and             r6, r11; strong mask, bits 2 and 0
 
     pmullw          m14, m9, [pw_m2]; -tc * 2
     paddw            m9, m9
 
-    and             r14, 5; 0b101
-    mov             r11, r14; strong mask
-    shr             r14, 2;
-    movd            m12, r14d; store to xmm for mask generation
-    shl             r14, 1
+    and             r6, 5; 0b101
+    mov             r11, r6; strong mask
+    shr             r6, 2;
+    movd            m12, r6d; store to xmm for mask generation
+    shl             r6, 1
     and             r11, 1
     movd            m10, r11d; store to xmm for mask generation
-    or              r14, r11; final strong mask, bits 1 and 0
+    or              r6, r11; final strong mask, bits 1 and 0
     jz      .weakfilter
 
     shufps          m10, m12, 0
@@ -565,16 +523,16 @@ ALIGN 16
     MASKED_COPY      m3, m12
 
 .weakfilter:
-    not             r14; strong mask -> weak mask
-    and             r14, r13; final weak filtering mask, bits 0 and 1
+    not             r6; strong mask -> weak mask
+    and             r6, r13; final weak filtering mask, bits 0 and 1
     jz             .store
 
     ; weak filtering mask
-    mov             r11, r14
+    mov             r11, r6
     shr             r11, 1
     movd            m12, r11d
-    and             r14, 1
-    movd            m11, r14d
+    and             r6, 1
+    movd            m11, r6d
     shufps          m11, m12, 0
     pcmpeqd         m11, [pd_1]; filtering mask
 
@@ -609,7 +567,11 @@ ALIGN 16
     pminsw          m12, m9;  av_clip(delta0, -tc, tc)
 
     psraw            m9, 1;   tc -> tc / 2
+%if cpuflag(ssse3)
+    psignw          m14, m9, [pw_m1]; -tc / 2
+%else
     pmullw          m14, m9, [pw_m1]; -tc / 2
+%endif
 
     pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
     psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
@@ -658,117 +620,161 @@ ALIGN 16
     MASKED_COPY      m4, m8
 %endmacro
 
-INIT_XMM sse2
 ;-----------------------------------------------------------------------------
-; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
-    sub              r0, 2
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8B_LOAD  PASS8ROWS(r4, r0, r1, r5)
+%macro LOOP_FILTER_CHROMA 0
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 2
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 8
-    TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     RET
 
-cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(r4, r0, r1, r5)
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 10
-    TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+    RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 12
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
     RET
 
 ;-----------------------------------------------------------------------------
-; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
-    mov              r5, r0; pix
-    sub              r5, r1
-    sub              r5, r1
-    movh             m0, [r5];      p1
-    movh             m1, [r5 + r1]; p0
-    movh             m2, [r0];      q0
-    movh             m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+    mov           pix0q, pixq
+    sub           pix0q, strideq
+    sub           pix0q, strideq
+    movq             m0, [pix0q];    p1
+    movq             m1, [pix0q+strideq]; p0
+    movq             m2, [pixq];    q0
+    movq             m3, [pixq+strideq]; q1
     pxor             m5, m5; zeros reg
     punpcklbw        m0, m5
     punpcklbw        m1, m5
     punpcklbw        m2, m5
     punpcklbw        m3, m5
     CHROMA_DEBLOCK_BODY  8
-    packuswb          m1, m2
-    movh       [r5 + r1], m1
-    movhps          [r0], m1
+    packuswb         m1, m2
+    movh[pix0q+strideq], m1
+    movhps       [pixq], m1
     RET
 
-cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
-    mov             r5, r0; pix
-    sub             r5, r1
-    sub             r5, r1
-    movdqu          m0, [r5];      p1
-    movdqu          m1, [r5+r1];   p0
-    movdqu          m2, [r0];      q0
-    movdqu          m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
     CHROMA_DEBLOCK_BODY 10
     pxor            m5, m5; zeros reg
-    CLIPW           m1, m5, [pw_pixel_max]
-    CLIPW           m2, m5, [pw_pixel_max]
-    movdqu   [r5 + r1], m1
-    movdqu        [r0], m2
+    CLIPW           m1, m5, [pw_pixel_max_10]
+    CLIPW           m2, m5, [pw_pixel_max_10]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
+    RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
+    CHROMA_DEBLOCK_BODY 12
+    pxor            m5, m5; zeros reg
+    CLIPW           m1, m5, [pw_pixel_max_12]
+    CLIPW           m2, m5, [pw_pixel_max_12]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
     RET
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_CHROMA
+INIT_XMM avx
+LOOP_FILTER_CHROMA
 
 %if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r6, r0
-    add              r0, r5
-    TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, r0, r1, r5)
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 4
+    lea           pix0q, [3 * r1]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
     LUMA_DEBLOCK_BODY 8, v
 .store:
-    TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     sub            pixq, 8
-    lea              r5, [3 * strideq]
-    mov              r6, pixq
-    add            pixq, r5
-    TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
     LUMA_DEBLOCK_BODY 10, v
 .store:
-    TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+    RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 8
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+    LUMA_DEBLOCK_BODY 12, v
+.store:
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
 .bypassluma:
     RET
 
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea     src3strideq, [3 * strideq]
     mov           pix0q, pixq
     sub           pix0q, src3strideq
     sub           pix0q, strideq
-    movdqu           m0, [pix0q];               p3
-    movdqu           m1, [pix0q +     strideq]; p2
-    movdqu           m2, [pix0q + 2 * strideq]; p1
-    movdqu           m3, [pix0q + src3strideq]; p0
-    movdqu           m4, [pixq];                q0
-    movdqu           m5, [pixq +     strideq];  q1
-    movdqu           m6, [pixq + 2 * strideq];  q2
-    movdqu           m7, [pixq + src3strideq];  q3
+    movq             m0, [pix0q];               p3
+    movq             m1, [pix0q +     strideq]; p2
+    movq             m2, [pix0q + 2 * strideq]; p1
+    movq             m3, [pix0q + src3strideq]; p0
+    movq             m4, [pixq];                q0
+    movq             m5, [pixq +     strideq];  q1
+    movq             m6, [pixq + 2 * strideq];  q2
+    movq             m7, [pixq + src3strideq];  q3
     pxor             m8, m8
     punpcklbw        m0, m8
     punpcklbw        m1, m8
@@ -783,16 +789,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
     packuswb          m1, m2
     packuswb          m3, m4
     packuswb          m5, m6
-    movh   [r5 +     r1], m1
-    movhps [r5 + 2 * r1], m1
-    movh   [r5 +     r6], m3
-    movhps [r0         ], m3
-    movh   [r0 +     r1], m5
-    movhps [r0 + 2 * r1], m5
+    movh   [pix0q +     strideq], m1
+    movhps [pix0q + 2 * strideq], m1
+    movh   [pix0q + src3strideq], m3
+    movhps [pixq               ], m3
+    movh   [pixq  +     strideq], m5
+    movhps [pixq  + 2 * strideq], m5
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea                  src3strideq, [3 * strideq]
     mov                        pix0q, pixq
     sub                        pix0q, src3strideq
@@ -808,12 +814,12 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     LUMA_DEBLOCK_BODY             10, h
 .store:
     pxor                          m8, m8; zeros reg
-    CLIPW                         m1, m8, [pw_pixel_max]
-    CLIPW                         m2, m8, [pw_pixel_max]
-    CLIPW                         m3, m8, [pw_pixel_max]
-    CLIPW                         m4, m8, [pw_pixel_max]
-    CLIPW                         m5, m8, [pw_pixel_max]
-    CLIPW                         m6, m8, [pw_pixel_max]
+    CLIPW                         m1, m8, [pw_pixel_max_10]
+    CLIPW                         m2, m8, [pw_pixel_max_10]
+    CLIPW                         m3, m8, [pw_pixel_max_10]
+    CLIPW                         m4, m8, [pw_pixel_max_10]
+    CLIPW                         m5, m8, [pw_pixel_max_10]
+    CLIPW                         m6, m8, [pw_pixel_max_10]
     movdqu     [pix0q +     strideq], m1;  p2
     movdqu     [pix0q + 2 * strideq], m2;  p1
     movdqu     [pix0q + src3strideq], m3;  p0
@@ -822,4 +828,44 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     movdqu     [pixq  + 2 * strideq], m6;  q2
 .bypassluma:
     RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea                  src3strideq, [3 * strideq]
+    mov                        pix0q, pixq
+    sub                        pix0q, src3strideq
+    sub                        pix0q, strideq
+    movdqu                        m0, [pix0q];               p3
+    movdqu                        m1, [pix0q +     strideq]; p2
+    movdqu                        m2, [pix0q + 2 * strideq]; p1
+    movdqu                        m3, [pix0q + src3strideq]; p0
+    movdqu                        m4, [pixq];                q0
+    movdqu                        m5, [pixq  +     strideq]; q1
+    movdqu                        m6, [pixq  + 2 * strideq]; q2
+    movdqu                        m7, [pixq  + src3strideq]; q3
+    LUMA_DEBLOCK_BODY             12, h
+.store:
+    pxor                          m8, m8; zeros reg
+    CLIPW                         m1, m8, [pw_pixel_max_12]
+    CLIPW                         m2, m8, [pw_pixel_max_12]
+    CLIPW                         m3, m8, [pw_pixel_max_12]
+    CLIPW                         m4, m8, [pw_pixel_max_12]
+    CLIPW                         m5, m8, [pw_pixel_max_12]
+    CLIPW                         m6, m8, [pw_pixel_max_12]
+    movdqu     [pix0q +     strideq], m1;  p2
+    movdqu     [pix0q + 2 * strideq], m2;  p1
+    movdqu     [pix0q + src3strideq], m3;  p0
+    movdqu     [pixq               ], m4;  q0
+    movdqu     [pixq  +     strideq], m5;  q1
+    movdqu     [pixq  + 2 * strideq], m6;  q2
+.bypassluma:
+    RET
+
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
 %endif
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index a36fa53..1eb1973 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2014 James Almer
 ;* Copyright (c) 2016 Alexandra Hájková
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -236,7 +236,7 @@ times 4 dw 90, -90
 
 SECTION .text
 
-; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
 ; %1 = HxW
 ; %2 = number of loops
 ; %3 = bitdepth
@@ -844,7 +844,10 @@ IDCT_4x4 %1
 
 INIT_IDCT_DC 8
 INIT_IDCT_DC 10
+INIT_IDCT_DC 12
 INIT_IDCT 8, sse2
 INIT_IDCT 8, avx
 INIT_IDCT 10, sse2
 INIT_IDCT 10, avx
+;INIT_IDCT 12, sse2
+;INIT_IDCT 12, avx
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 16e5eef..ff6ed07 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -1,851 +1,1672 @@
-;*****************************************************************************
-;* x86-optimized HEVC MC
-;* Copyright 2015 Anton Khirnov
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_8192
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+%define pw_8 pw_512
+%define pw_10 pw_2048
+%define pw_12 pw_8192
+%define pw_bi_10 pw_1024
+%define pw_bi_12 pw_4096
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+pw_bi_8:                times 16 dw  (1 <<  8)
+max_pixels_12:          times 16 dw ((1 << 12)-1)
+cextern pd_1
+cextern pb_0
+
+%macro EPEL_TABLE 4
+hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+                        times %2 d%3 10, -2
+                        times %2 d%3 -4, 54
+                        times %2 d%3 16, -2
+                        times %2 d%3 -6, 46
+                        times %2 d%3 28, -4
+                        times %2 d%3 -4, 36
+                        times %2 d%3 36, -4
+                        times %2 d%3 -4, 28
+                        times %2 d%3 46, -6
+                        times %2 d%3 -2, 16
+                        times %2 d%3 54, -4
+                        times %2 d%3 -2, 10
+                        times %2 d%3 58, -2
+%endmacro
 
-pw_1023: times 8 dw 1023
 
-cextern hevc_qpel_coeffs
-cextern hevc_qpel_coeffs8
+EPEL_TABLE  8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
+
+EPEL_TABLE  8, 8, b, sse4
+EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE 12, 4, w, sse4
+
+%macro QPEL_TABLE 4
+hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
+                        times %2 d%3 -10, 58
+                        times %2 d%3  17, -5
+                        times %2 d%3   1,  0
+                        times %2 d%3  -1,  4
+                        times %2 d%3 -11, 40
+                        times %2 d%3  40,-11
+                        times %2 d%3   4, -1
+                        times %2 d%3   0,  1
+                        times %2 d%3  -5, 17
+                        times %2 d%3  58,-10
+                        times %2 d%3   4, -1
+%endmacro
 
-cextern hevc_epel_coeffs
-cextern hevc_epel_coeffs8
+QPEL_TABLE  8, 8, b, sse4
+QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE 12, 4, w, sse4
 
-cextern pw_8
-cextern pw_16
-cextern pw_32
-cextern pw_64
+QPEL_TABLE  8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
 
 SECTION .text
 
-; %1: width
-; %2: bit depth
-%macro COMMON_DEFS 2
-    %assign blocksize            8
-    %assign nb_blocks            ((%1 + blocksize - 1) / blocksize)
-    %define last_block_truncated (blocksize * nb_blocks > %1)
-    %if %2 > 8
-        %define LOAD_BLOCK     movu
-        %define LOAD_HALFBLOCK movq
-        %assign pixelsize      2
-    %else
-        %define LOAD_BLOCK     movq
-        %define LOAD_HALFBLOCK movd
-        %assign pixelsize      1
-    %endif
-    %define STORE_BLOCK        mova
-    %define STORE_HALFBLOCK    movq
-%endmacro
-
-; %1: block index
-%macro BLOCK_DEFS 1
-    %if last_block_truncated && %1 == nb_blocks - 1
-        %define block_truncated 1
-        %define LOAD            LOAD_HALFBLOCK
-        %define STORE           STORE_HALFBLOCK
-    %else
-        %define block_truncated 0
-        %define LOAD            LOAD_BLOCK
-        %define STORE           STORE_BLOCK
-    %endif
-%endmacro
-
-
-; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
-;                         pixel   *src, ptrdiff_t srcstride,
-;                         int height, int mx, int my, int *mcbuffer)
-
-; %1: block width
-; %2: bit depth
-; %3: log2 of height unroll
-%macro GET_PIXELS 3
-cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
-
-    %assign shift 14 - %2
-    COMMON_DEFS %1, %2
-
-%if pixelsize == 1
-    pxor      m0, m0
-%endif
-
-    shr       heightd, %3
-
-.loop:
+%define MAX_PB_SIZE  64
 
-%assign i 0
-%rep (1 << %3)
+%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
 
-%assign j 0
-%rep nb_blocks
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
 
-    BLOCK_DEFS j
+%if ARCH_X86_64
 
-    LOAD       m1, [srcq + j * pixelsize * blocksize]
-%if pixelsize == 1
-    punpcklbw  m1, m0
+%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
+%if %1 <= 4
+    movq              %3, [%2]                                              ; load data from source2
+%elif %1 <= 8
+    movdqa            %3, [%2]                                              ; load data from source2
+%elif %1 <= 12
+%if cpuflag(avx2)
+    mova              %3, [%2]
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movq              %4, [%2+16]                                           ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if cpuflag(avx2)
+    mova              %3, [%2]
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movdqa            %4, [%2+16]                                           ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+    mova              %3, [%2]
+    mova              %4, [%2+32]
 %endif
-    psllw      m1, shift
-    STORE      [dstq + j * 2 * blocksize], m1
-
-%assign j (j + 1)
-%endrep
+%endmacro
 
-    add       dstq, dststrideq
-    add       srcq, srcstrideq
+%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+    movd              %4, [%3]                                               ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+    movq              %4, [%3]                                               ; load data from source
+%elif notcpuflag(avx)
+    movu              %4, [%3]                                               ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+    movdqu           %4, [%3]
+%else
+    movu              %4, [%3]
+%endif
+%endmacro
 
-%assign i (i + 1)
-%endrep
 
-    dec heightd
-    jg .loop
-    RET
+%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
+%if cpuflag(avx2)
+%assign %%offset 32
+%ifdef PIC
+    lea              %5q, [hevc_epel_filters_avx2_%1]
+    %define FILTER %5q
+%else
+    %define FILTER hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
+%ifdef PIC
+    lea              %5q, [hevc_epel_filters_sse4_%1]
+    %define FILTER %5q
+%else
+    %define FILTER hevc_epel_filters_sse4_%1
+%endif
+%endif ;cpuflag(avx2)
+    sub              %2q, 1
+%if cpuflag(avx2)
+    shl              %2q, 6                      ; multiply by 64
+  %else
+    shl              %2q, 5                      ; multiply by 32
+%endif
+    mova           %3, [FILTER + %2q]        ; get 2 first values of filters
+    mova           %4, [FILTER + %2q+%%offset]     ; get 2 last values of filters
 %endmacro
 
-INIT_XMM sse2
-GET_PIXELS 4,  8, 1
-GET_PIXELS 8,  8, 1
-GET_PIXELS 12, 8, 3
-GET_PIXELS 16, 8, 2
-GET_PIXELS 24, 8, 3
-GET_PIXELS 32, 8, 3
-GET_PIXELS 48, 8, 3
-GET_PIXELS 64, 8, 3
-
-GET_PIXELS 4,  10, 1
-GET_PIXELS 8,  10, 1
-GET_PIXELS 12, 10, 3
-GET_PIXELS 16, 10, 2
-GET_PIXELS 24, 10, 3
-GET_PIXELS 32, 10, 3
-GET_PIXELS 48, 10, 3
-GET_PIXELS 64, 10, 3
-
-; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-;                     uint8_t *src, ptrdiff_t srcstride,
-;                     int height, int mx, int my, int *mcbuffer)
-
-; 8-bit qpel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro QPEL_8 2
-%if %2
-    %define postfix    v
-    %define mvfrac     myq
-    %define coeffsaddr r5q
-    %define pixstride  srcstrideq
-    %define pixstride3 r5q
-    %define src_m3     r6q
-%else
-    %define postfix    h
-    %define mvfrac     mxq
-    %define coeffsaddr r6q
-    %define pixstride  1
-    %define pixstride3 3
-    %define src_m3     (srcq - 3)
-%endif
-
-    COMMON_DEFS %1, 8
-
-cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
-    and       mvfrac, 0x3
-    dec       mvfrac
-    shl       mvfrac, 4
-    lea       coeffsaddr, [hevc_qpel_coeffs8]
-    mova      m0,         [coeffsaddr + mvfrac]
-
-    SPLATW    m1, m0, 1
-    SPLATW    m2, m0, 2
-    SPLATW    m3, m0, 3
-    SPLATW    m0, m0, 0
-
-%if %2
-    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
-    mov       src_m3, srcq
-    sub       src_m3, pixstride3
+%macro EPEL_HV_FILTER 1
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift  6
+%define %%table  hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  5
+%define %%table  hevc_epel_filters_sse4_%1
 %endif
 
-.loop:
-
-%assign i 0
-%rep nb_blocks
+%ifdef PIC
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
+%else
+    %define FILTER %%table
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    mova             m14, [FILTER + mxq]        ; get 2 first values of filters
+    mova             m15, [FILTER + mxq+%%offset]     ; get 2 last values of filters
+
+%if cpuflag(avx2)
+%define %%table  hevc_epel_filters_avx2_10
+%else
+%define %%table  hevc_epel_filters_sse4_10
+%endif
+%ifdef PIC
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
+%else
+    %define FILTER %%table
+%endif
+    mova             m12, [FILTER + myq]        ; get 2 first values of filters
+    mova             m13, [FILTER + myq+%%offset]     ; get 2 last values of filters
+    lea           r3srcq, [srcstrideq*3]
+%endmacro
 
-    BLOCK_DEFS i
+%macro QPEL_FILTER 2
 
-    LOAD m4, [src_m3 + i * blocksize]
-    LOAD m5, [src_m3 + i * blocksize + 1 * pixstride]
-    punpcklbw m4, m5
-    pmaddubsw m4, m0
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift  7
+%define %%table  hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  6
+%define %%table  hevc_qpel_filters_sse4_%1
+%endif
 
-    LOAD m5, [src_m3 + i * blocksize + 2 * pixstride]
-    LOAD m6, [srcq   + i * blocksize]
-    punpcklbw m5, m6
-    pmaddubsw m5, m1
-    paddsw    m4, m5
+%ifdef PIC
+    lea         rfilterq, [%%table]
+%else
+    %define rfilterq %%table
+%endif
+    sub              %2q, 1
+    shl              %2q, %%shift                        ; multiply by 32
+    mova             m12, [rfilterq + %2q]               ; get 4 first values of filters
+    mova             m13, [rfilterq + %2q +   %%offset]  ; get 4 first values of filters
+    mova             m14, [rfilterq + %2q + 2*%%offset]  ; get 4 first values of filters
+    mova             m15, [rfilterq + %2q + 3*%%offset]  ; get 4 first values of filters
+%endmacro
 
-    LOAD m5, [srcq + i * blocksize + 1 * pixstride]
-    LOAD m6, [srcq + i * blocksize + 2 * pixstride]
-    punpcklbw m5, m6
-    pmaddubsw m5, m2
-    paddsw    m4, m5
+%macro EPEL_LOAD 4
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
 
-    LOAD m5, [srcq + i * blocksize +     pixstride3]
-    LOAD m6, [srcq + i * blocksize + 4 * pixstride]
-    punpcklbw m5, m6
-    pmaddubsw m5, m3
-    paddsw    m4, m5
+    %%load            m0, [%2q ]
+%ifnum %3
+    %%load            m1, [%2q+  %3]
+    %%load            m2, [%2q+2*%3]
+    %%load            m3, [%2q+3*%3]
+%else
+    %%load            m1, [%2q+  %3q]
+    %%load            m2, [%2q+2*%3q]
+    %%load            m3, [%2q+r3srcq]
+%endif
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 7
+    SBUTTERFLY        bw, 2, 3, 7
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 7
+    SBUTTERFLY        wd, 2, 3, 7
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+%endif
+%endif
+%endmacro
 
-    STORE [dstq + i * 2 * blocksize], m4
 
-%assign i (i + 1)
-%endrep
+%macro QPEL_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%endif
+    %%load            m0, [%2-3*%%stride]        ;load data from source
+    %%load            m1, [%2-2*%%stride]
+    %%load            m2, [%2-%%stride  ]
+    %%load            m3, [%2           ]
+    %%load            m4, [%2+%%stride  ]
+    %%load            m5, [%2+2*%%stride]
+    %%load            m6, [%2+3*%%stride]
+    %%load            m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+    SBUTTERFLY        wd, 0, 1, %4
+    SBUTTERFLY        wd, 2, 3, %4
+    SBUTTERFLY        wd, 4, 5, %4
+    SBUTTERFLY        wd, 6, 7, %4
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
+%endif
+%else
+%if %3 > 4
+    SBUTTERFLY        dq, 0, 1, %4
+    SBUTTERFLY        dq, 2, 3, %4
+    SBUTTERFLY        dq, 4, 5, %4
+    SBUTTERFLY        dq, 6, 7, %4
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
 
-    add       dstq,   dststrideq
-    add       srcq,   srcstrideq
-%if %2
-    add       src_m3, srcstrideq
+%macro QPEL_V_LOAD 5
+    lea              %5q, [%2]
+    sub              %5q, r3srcq
+    movu              m0, [%5q            ]      ;load x- 3*srcstride
+    movu              m1, [%5q+   %3q     ]      ;load x- 2*srcstride
+    movu              m2, [%5q+ 2*%3q     ]      ;load x-srcstride
+    movu              m3, [%2       ]      ;load x
+    movu              m4, [%2+   %3q]      ;load x+stride
+    movu              m5, [%2+ 2*%3q]      ;load x+2*stride
+    movu              m6, [%2+r3srcq]      ;load x+3*stride
+    movu              m7, [%2+ 4*%3q]      ;load x+4*stride
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 8
+    SBUTTERFLY        bw, 2, 3, 8
+    SBUTTERFLY        bw, 4, 5, 8
+    SBUTTERFLY        bw, 6, 7, 8
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
 %endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 8
+    SBUTTERFLY        wd, 2, 3, 8
+    SBUTTERFLY        wd, 4, 5, 8
+    SBUTTERFLY        wd, 6, 7, 8
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
 
-    dec heightd
-    jg .loop
-    RET
+%macro PEL_12STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+    PEL_12STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
 %endmacro
 
-INIT_XMM ssse3
-QPEL_8 4,  0
-QPEL_8 8,  0
-QPEL_8 12, 0
-QPEL_8 16, 0
-QPEL_8 24, 0
-QPEL_8 32, 0
-QPEL_8 48, 0
-QPEL_8 64, 0
-
-QPEL_8 4,  1
-QPEL_8 8,  1
-QPEL_8 12, 1
-QPEL_8 16, 1
-QPEL_8 24, 1
-QPEL_8 32, 1
-QPEL_8 48, 1
-QPEL_8 64, 1
-
-; 16-bit qpel interpolation
-; %1: block width
-; %2: shift applied to the result
-; %3: 0 - horizontal; 1 - vertical
-%macro QPEL_16 3
-%if %3
-    %define mvfrac     myq
-    %define pixstride  srcstrideq
-    %define pixstride3 sstride3q
-    %define src_m3     srcm3q
-%else
-    %define mvfrac     mxq
-    %define pixstride  2
-    %define pixstride3 6
-    %define src_m3     (srcq - 6)
-%endif
-
-    COMMON_DEFS %1, 16
-
-    and       mvfrac, 0x3
-    dec       mvfrac
-    shl       mvfrac, 4
-    lea       coeffsregq, [hevc_qpel_coeffs]
-    mova      m0,         [coeffsregq + mvfrac]
-
-    pshufd    m1, m0, 0x55
-    pshufd    m2, m0, 0xaa
-    pshufd    m3, m0, 0xff
-    pshufd    m0, m0, 0x00
-
-%if %3
-    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
-    mov       srcm3q, srcq
-    sub       srcm3q, sstride3q
+%macro PEL_10STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+    movu            [%1], %2
+%else
+    PEL_10STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
 %endif
+%endmacro
 
-.loop:
+%macro PEL_10STORE32 3
+    PEL_10STORE16     %1, %2, %3
+    movu         [%1+32], %3
+%endmacro
 
-%assign i 0
-%rep nb_blocks
+%macro PEL_8STORE2 3
+    pextrw          [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+    movd            [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+    movd            [%1], %2
+    pextrw        [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+    movq            [%1], %2
+    psrldq            %2, 8
+    movd          [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+%if cpuflag(avx2)
+    movdqu        [%1], %2
+%else
+    mova          [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+    movu          [%1], %2
+%endmacro
 
-    BLOCK_DEFS i
+%macro LOOP_END 3
+    add              %1q, 2*MAX_PB_SIZE          ; dst += dststride
+    add              %2q, %3q                    ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+%endmacro
 
-    LOAD m4,  [src_m3 + i * 2 * blocksize]
-    LOAD m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
-    LOAD m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
-    LOAD m7,  [srcq   + i * 2 * blocksize + 0 * pixstride]
-    LOAD m8,  [srcq   + i * 2 * blocksize + 1 * pixstride]
-    LOAD m9,  [srcq   + i * 2 * blocksize + 2 * pixstride]
-    LOAD m10, [srcq   + i * 2 * blocksize +     pixstride3]
-    LOAD m11, [srcq   + i * 2 * blocksize + 4 * pixstride]
 
-    punpcklwd m12, m4, m5
-    pmaddwd   m12, m0
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && %0 ==3
+%if %1 > 16
+    vextracti128 xm1, m0, 1
+    pmovzxbw      m1, xm1
+    psllw         m1, 14-%2
+%endif
+    pmovzxbw      m0, xm0
+%else ; not avx
+%if %1 > 8
+    punpckhbw     m1, m0, m2
+    psllw         m1, 14-%2
+%endif
+    punpcklbw     m0, m2
+%endif
+%endif ;avx
+    psllw         m0, 14-%2
+%endmacro
+
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
+%if %1 == 8
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+    vperm2i128    m10, m0, m1, q0301
+%endif
+    vinserti128    m0, m0, xm1, 1
+    mova           m1, m10
+%if %2 > 16
+    vperm2i128    m10, m2, m3, q0301
+%endif
+    vinserti128    m2, m2, xm3, 1
+    mova           m3, m10
+%endif
+    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
+    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
+    paddw          %%reg0, %%reg2
+%if %2 > 8
+    pmaddubsw      %%reg1, %3
+    pmaddubsw      %%reg3, %4
+    paddw          %%reg1, %%reg3
+%endif
+%else
+    pmaddwd        %%reg0, %3
+    pmaddwd        %%reg2, %4
+    paddd          %%reg0, %%reg2
+%if %2 > 4
+    pmaddwd        %%reg1, %3
+    pmaddwd        %%reg3, %4
+    paddd          %%reg1, %%reg3
+%if %1 != 8
+    psrad          %%reg1, %1-8
+%endif
+%endif
+%if %1 != 8
+    psrad          %%reg0, %1-8
+%endif
+    packssdw       %%reg0, %%reg1
+%endif
+%endmacro
 
-    punpcklwd m13, m6, m7
-    pmaddwd   m13, m1
-    paddd     m12, m13
+%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
 
-    punpcklwd m13, m8, m9
-    pmaddwd   m13, m2
-    paddd     m12, m13
+%if cpuflag(avx2)
+%assign %%offset 32
+%define %%table  hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table  hevc_qpel_filters_sse4_%2
+%endif
 
-    punpcklwd m13, m10, m11
-    pmaddwd   m13, m3
-    paddd     m12, m13
-    psrad     m12, %2
+%ifdef PIC
+    lea         rfilterq, [%%table]
+%else
+    %define rfilterq %%table
+%endif
 
-    %if block_truncated == 0
-        punpckhwd m4, m5
-        pmaddwd   m4, m0
+%if %2 == 8
+    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
+    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
+    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ;x5*c5+x6*c6
+    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%else
+    pmaddwd           m0, [rfilterq + %3q*8   ]
+    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
+    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, [rfilterq + %3q*8   ]
+    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
+    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+    p%4               m0, m1
+%endif
+%endmacro
 
-        punpckhwd m6, m7
-        pmaddwd   m6, m1
-        paddd     m4, m6
+%macro QPEL_COMPUTE 2-3     ; width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
 
-        punpckhwd m8, m9
-        pmaddwd   m8, m2
-        paddd     m4, m8
+    vperm2i128 m10, m0,  m1, q0301
+    vinserti128 m0, m0, xm1, 1
+    SWAP 1, 10
 
-        punpckhwd m10, m11
-        pmaddwd   m10, m3
-        paddd     m4, m10
+    vperm2i128 m10, m2,  m3, q0301
+    vinserti128 m2, m2, xm3, 1
+    SWAP 3, 10
 
-        psrad     m4, %2
-    %endif
-    packssdw  m12, m4
-    STORE [dstq + i * 2 * blocksize], m12
 
-%assign i (i + 1)
-%endrep
+    vperm2i128 m10, m4,  m5, q0301
+    vinserti128 m4, m4, xm5, 1
+    SWAP 5, 10
 
-    add       dstq,   dststrideq
-    add       srcq,   srcstrideq
-%if %3
-    add       srcm3q, srcstrideq
+    vperm2i128 m10, m6,  m7, q0301
+    vinserti128 m6, m6, xm7, 1
+    SWAP 7, 10
 %endif
 
-    dec heightd
-    jg .loop
-    RET
+    pmaddubsw         m0, m12   ;x1*c1+x2*c2
+    pmaddubsw         m2, m13   ;x3*c3+x4*c4
+    pmaddubsw         m4, m14   ;x5*c5+x6*c6
+    pmaddubsw         m6, m15   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%if %1 > 8
+    pmaddubsw         m1, m12
+    pmaddubsw         m3, m13
+    pmaddubsw         m5, m14
+    pmaddubsw         m7, m15
+    paddw             m1, m3
+    paddw             m5, m7
+    paddw             m1, m5
+%endif
+%else
+    pmaddwd           m0, m12
+    pmaddwd           m2, m13
+    pmaddwd           m4, m14
+    pmaddwd           m6, m15
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, m12
+    pmaddwd           m3, m13
+    pmaddwd           m5, m14
+    pmaddwd           m7, m15
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+%endif
 %endmacro
 
-%if ARCH_X86_64
-
-%macro QPEL_H_10 1
-cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
-QPEL_16 %1, 2, 0
-%endmacro
-
-INIT_XMM avx
-QPEL_H_10 4
-QPEL_H_10 8
-QPEL_H_10 12
-QPEL_H_10 16
-QPEL_H_10 24
-QPEL_H_10 32
-QPEL_H_10 48
-QPEL_H_10 64
-
-%macro QPEL_V_10 1
-cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 2, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_V_10 4
-QPEL_V_10 8
-QPEL_V_10 12
-QPEL_V_10 16
-QPEL_V_10 24
-QPEL_V_10 32
-QPEL_V_10 48
-QPEL_V_10 64
-
-; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
-;                  uint8_t *src, ptrdiff_t srcstride,
-;                  int height, int mx, int my, int *mcbuffer)
-
-%macro QPEL_HV 1
-cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 6, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_HV 4
-QPEL_HV 8
-QPEL_HV 12
-QPEL_HV 16
-QPEL_HV 24
-QPEL_HV 32
-QPEL_HV 48
-QPEL_HV 64
-
-%endif ; ARCH_X86_64
-
-; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-;                     uint8_t *src, ptrdiff_t srcstride,
-;                     int height, int mx, int my, int *mcbuffer)
+%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+    paddsw            %3, %5
+%if %1 > 8
+    paddsw            %4, %6
+%endif
+    UNI_COMPUTE       %1, %2, %3, %4, %7
+%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
+    vpermq            %3, %3, 216
+    vpermq            %4, %4, 216
+%endif
+%endmacro
 
-; 8-bit epel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro EPEL_8 2
-%if %2
-    %define postfix    v
-    %define mvfrac     myq
-    %define coeffsaddr r5q
-    %define pixstride  srcstrideq
-    %define pixstride3 r5q
+%macro UNI_COMPUTE 5
+    pmulhrsw          %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+    pmulhrsw          %4, %5
+%endif
+%if %2 == 8
+    packuswb          %3, %4
 %else
-    %define postfix    h
-    %define mvfrac     mxq
-    %define coeffsaddr r6q
-    %define pixstride  1
-    %define pixstride3 3
+    CLIPW             %3, [pb_0], [max_pixels_%2]
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
+    CLIPW             %4, [pb_0], [max_pixels_%2]
+%endif
 %endif
+%endmacro
+
 
-    COMMON_DEFS %1, 8
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+;                         uint8_t *_src, ptrdiff_t _srcstride,
+;                         int height, int mx, int my)
+; ******************************
 
-cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
-    and       mvfrac, 0x7
-    dec       mvfrac
-    shl       mvfrac, 4
-    lea       coeffsaddr, [hevc_epel_coeffs8]
-    movq      m0,         [coeffsaddr + mvfrac]
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS     %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS  %1, %2
+%endmacro
 
-    SPLATW    m1, m0, 1
-    SPLATW    m0, m0, 0
+%macro HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+    pxor               m2, m2
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+ %endmacro
 
-%if %2
-    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
-%endif
-    sub       srcq, pixstride
+%macro HEVC_UNI_PEL_PIXELS 2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
 
+%macro HEVC_BI_PEL_PIXELS 2
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+    pxor              m2, m2
+    movdqa            m5, [pw_bi_%2]
 .loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    SIMPLE_BILOAD     %1, src2q, m3, m4
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
 
-%assign i 0
-%rep nb_blocks
 
-    BLOCK_DEFS i
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width);
+; ******************************
 
-    LOAD m2, [srcq + i * blocksize + 0 * pixstride]
-    LOAD m3, [srcq + i * blocksize + 1 * pixstride]
-    LOAD m4, [srcq + i * blocksize + 2 * pixstride]
-    LOAD m5, [srcq + i * blocksize +     pixstride3]
 
-    punpcklbw m2, m3
-    punpcklbw m4, m5
+%macro HEVC_PUT_HEVC_EPEL 2
+%if cpuflag(avx2)
+%define XMM_REGS  11
+%else
+%define XMM_REGS  8
+%endif
+
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    PEL_10STORE%1      dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
 
-    pmaddubsw m2, m0
-    pmaddubsw m4, m1
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    movdqa            m6, [pw_%2]
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-    paddsw    m2, m4
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m6, [pw_bi_%2]
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-    STORE [dstq + i * 2 * blocksize], m2
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+;                      uint8_t *_src, ptrdiff_t _srcstride,
+;                      int height, int mx, int my, int width)
+; ******************************
+
+cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
+    movifnidn        myd, mym
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
 
-%assign i (i + 1)
-%endrep
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
+    movifnidn        myd, mym
+    movdqa            m6, [pw_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-    add       dstq, dststrideq
-    add       srcq, srcstrideq
 
-    dec heightd
-    jg .loop
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
+    movifnidn        myd, mym
+    movdqa            m6, [pw_bi_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
     RET
 %endmacro
 
-INIT_XMM ssse3
-EPEL_8 4,  0
-EPEL_8 8,  0
-EPEL_8 12, 0
-EPEL_8 16, 0
-EPEL_8 24, 0
-EPEL_8 32, 0
-
-EPEL_8 4,  1
-EPEL_8 8,  1
-EPEL_8 12, 1
-EPEL_8 16, 1
-EPEL_8 24, 1
-EPEL_8 32, 1
 
-%macro EPEL_16 3
-%if %3
-    %define mvfrac     myq
-    %define pixstride  srcstrideq
-    %define pixstride3 sstride3q
+; ******************************
+; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
+
+%macro HEVC_PUT_HEVC_EPEL_HV 2
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+    vinserti128       m2, m0, xm4, 1
+    vperm2i128        m3, m0, m4, q0301
+    PEL_10STORE%1     dstq, m2, m3
 %else
-    %define mvfrac     mxq
-    %define pixstride  2
-    %define pixstride3 6
+    PEL_10STORE%1     dstq, m0, m4
 %endif
+%else
+    PEL_10STORE%1     dstq, m0, m1
+%endif
+    movdqa            m4, m5
+    movdqa            m5, m6
+    movdqa            m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
 
-    COMMON_DEFS %1, 16
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    mova              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
+%else
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-    and       mvfrac, 0x7
-    dec       mvfrac
-    shl       mvfrac, 5
-    lea       coeffsregq, [hevc_epel_coeffs]
-    mova      m0, [coeffsregq + mvfrac]
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    SIMPLE_BILOAD     %1, src2q, m8, m3
+%if cpuflag(avx2)
+    vinserti128       m1, m8, xm3, 1
+    vperm2i128        m2, m8, m3, q0301
+    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
+    SIMPLE_BILOAD     %1, src2q, m8, m9
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m4
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
 
-    pshufd    m1, m0, 0x55
-    pshufd    m0, m0, 0x00
+; ******************************
+; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
 
-%if %3
-    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+%macro HEVC_PUT_HEVC_QPEL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
 %endif
-    sub       srcq, pixstride
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
 
+cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
+    mova              m9, [pw_%2]
+    QPEL_FILTER       %2, mx
 .loop:
-
-%assign i 0
-%rep nb_blocks
-
-    BLOCK_DEFS i
-
-    LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
-    LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
-    LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
-    LOAD m5, [srcq + i * 2 * blocksize +     pixstride3]
-
-    punpcklwd m6, m2, m3
-    punpcklwd m7, m4, m5
-    pmaddwd   m6, m0
-    pmaddwd   m7, m1
-    paddd     m6, m7
-    psrad     m6, %2
-
-    %if block_truncated == 0
-        punpckhwd m2, m3
-        punpckhwd m4, m5
-        pmaddwd   m2, m0
-        pmaddwd   m4, m1
-        paddd     m2, m4
-        psrad     m2, %2
-    %endif
-    packssdw  m6, m2
-    STORE [dstq + i * 2 * blocksize], m6
-
-%assign i (i + 1)
-%endrep
-
-    add       dstq,   dststrideq
-    add       srcq,   srcstrideq
-
-    dec heightd
-    jg .loop
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
     RET
-%endmacro
 
-%if ARCH_X86_64
-
-%macro EPEL_H_10 1
-cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 0
-%endmacro
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m9, [pw_bi_%2]
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-INIT_XMM avx
-EPEL_H_10 4
-EPEL_H_10 8
-EPEL_H_10 12
-EPEL_H_10 16
-EPEL_H_10 24
-EPEL_H_10 32
 
-%macro EPEL_V_10 1
-cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 1
-%endmacro
+; ******************************
+; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
 
-INIT_XMM avx
-EPEL_V_10 4
-EPEL_V_10 8
-EPEL_V_10 12
-EPEL_V_10 16
-EPEL_V_10 24
-EPEL_V_10 32
+cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
 
-; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
-;                    int16_t *src, ptrdiff_t srcstride,
-;                    int height, int mx, int my, int *mcbuffer)
+cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    movdqa            m9, [pw_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%macro EPEL_HV 1
-cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 6, 1
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    movdqa            m9, [pw_bi_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 %endmacro
 
-INIT_XMM avx
-EPEL_HV 4
-EPEL_HV 8
-EPEL_HV 12
-EPEL_HV 16
-EPEL_HV 24
-EPEL_HV 32
 
-%endif ; ARCH_X86_64
+; ******************************
+; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my)
+; ******************************
+%macro HEVC_PUT_HEVC_QPEL_HV 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    PEL_10STORE%1     dstq, m0, m1
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
 
-; hevc_put_unweighted_pred_<w>_<d>(pixel   *dst, ptrdiff_t dststride,
-;                                  int16_t *src, ptrdiff_t srcstride,
-;                                  int height)
+cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    mova            m8, m9
+    mova            m9, m10
+    mova           m10, m11
+    mova           m11, m12
+    mova           m12, m13
+    mova           m13, m14
+    mova           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%macro AVG 5
-    %if %3
-        %if %4 == 4
-            movq %5, %2
-            paddsw %1, %5
-        %else
-            paddsw %1, %2
-        %endif
-    %endif
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 %endmacro
 
-; %1: 0 - one source; 1 - two sources
-; %2: width
-; %3: bit depth
-%macro PUT_PRED 3
-%if %1
-cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
+%macro WEIGHTING_FUNCS 2
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
+    mov             r4d, denomm
+%define SHIFT  r4d
 %else
-cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
+%define SHIFT  denomd
 %endif
+    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
+%if %1 <= 4
+    pxor             m1, m1
+%endif
+    movd             m2, wxm        ; WX
+    movd             m4, SHIFT      ; shift
+%if %1 <= 4
+    punpcklwd        m2, m1
+%else
+    punpcklwd        m2, m2
+%endif
+    dec           SHIFT
+    movdqu           m5, [pd_1]
+    movd             m6, SHIFT
+    pshufd           m2, m2, 0
+    mov           SHIFT, oxm
+    pslld            m5, m6
+%if %2 != 8
+    shl           SHIFT, %2-8       ; ox << (bitd - 8)
+%endif
+    movd             m3, SHIFT      ; OX
+    pshufd           m3, m3, 0
+%if WIN64 || ARCH_X86_32
+    mov           SHIFT, heightm
+%endif
+.loop:
+   SIMPLE_LOAD        %1, 10, srcq, m0
+%if %1 <= 4
+    punpcklwd         m0, m1
+    pmaddwd           m0, m2
+    paddd             m0, m5
+    psrad             m0, m4
+    paddd             m0, m3
+%else
+    pmulhw            m6, m0, m2
+    pmullw            m0, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    paddd             m0, m5
+    paddd             m1, m5
+    psrad             m0, m4
+    psrad             m1, m4
+    paddd             m0, m3
+    paddd             m1, m3
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+    CLIPW             m0, [pb_0], [max_pixels_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%assign shift       14 + %1 - %3
-%assign offset      (1 << (shift - 1))
-%define offset_data pw_ %+ offset
-
-    mova        m0, [offset_data]
-
-%if %3 > 8
-    %define STORE_BLOCK movu
-    %define STORE_HALF  movq
-
-    %assign pixel_max ((1 << %3) - 1)
-    %define pw_pixel_max pw_ %+ pixel_max
-    pxor    m1, m1
-    mova    m2, [pw_pixel_max]
+cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
+    movifnidn        r5d, denomm
+%if %1 <= 4
+    pxor              m1, m1
+%endif
+    movd              m2, wx0m         ; WX0
+    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
+    movd              m3, wx1m         ; WX1
+    movd              m0, r5d          ; shift
+%if %1 <= 4
+    punpcklwd         m2, m1
+    punpcklwd         m3, m1
+%else
+    punpcklwd         m2, m2
+    punpcklwd         m3, m3
+%endif
+    inc              r5d
+    movd              m5, r5d          ; shift+1
+    pshufd            m2, m2, 0
+    mov              r5d, ox0m
+    pshufd            m3, m3, 0
+    add              r5d, ox1m
+%if %2 != 8
+    shl              r5d, %2-8         ; ox << (bitd - 8)
+%endif
+    inc              r5d
+    movd              m4, r5d          ; offset
+    pshufd            m4, m4, 0
+%if UNIX64
+%define h heightd
 %else
-    %define STORE_BLOCK movq
-    %define STORE_HALF  movd
+    mov              r5d, heightm
+%define h r5d
 %endif
+    pslld             m4, m0
 
 .loop:
-%assign i 0
-%rep (%2 + 7) / 8
-
-    %if (i + 1) * 8 > %2
-        %define LOAD movq
-        %define STORE STORE_HALF
-    %else
-        %define LOAD mova
-        %define STORE STORE_BLOCK
-    %endif
-
-    LOAD m3, [srcq  + 16 * i]
-    AVG  m3, [src2q + 16 * i], %1, %3 - i * 8, m4
-
-    paddsw m3, m0
-    psraw  m3, shift
-
-    %if %3 == 8
-        packuswb m3, m3
-        STORE [dstq + 8 * i], m3
-    %else
-        CLIPW m3, m1, m2
-        STORE [dstq + 16 * i], m3
-    %endif
-%assign i (i + 1)
-%endrep
-
-    add dstq,  dststrideq
-    add srcq,  srcstrideq
-%if %1
-    add src2q, srcstrideq
-%endif
-
-    dec         heightd
-    jg          .loop
+   SIMPLE_LOAD        %1, 10, srcq,  m0
+   SIMPLE_LOAD        %1, 10, src2q, m8
+%if %1 <= 4
+    punpcklwd         m0, m1
+    punpcklwd         m8, m1
+    pmaddwd           m0, m3
+    pmaddwd           m8, m2
+    paddd             m0, m4
+    paddd             m0, m8
+    psrad             m0, m5
+%else
+    pmulhw            m6, m0, m3
+    pmullw            m0, m3
+    pmulhw            m7, m8, m2
+    pmullw            m8, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    punpckhwd         m9, m8, m7
+    punpcklwd         m8, m7
+    paddd             m0, m8
+    paddd             m1, m9
+    paddd             m0, m4
+    paddd             m1, m4
+    psrad             m0, m5
+    psrad             m1, m5
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+     CLIPW            m0, [pb_0], [max_pixels_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
+    dec                h                         ; cmp height
+    jnz               .loop                      ; height loop
     RET
 %endmacro
 
-INIT_XMM sse2
-PUT_PRED 0, 4,  8
-PUT_PRED 1, 4,  8
-PUT_PRED 0, 8,  8
-PUT_PRED 1, 8,  8
-PUT_PRED 0, 12, 8
-PUT_PRED 1, 12, 8
-PUT_PRED 0, 16, 8
-PUT_PRED 1, 16, 8
-PUT_PRED 0, 24, 8
-PUT_PRED 1, 24, 8
-PUT_PRED 0, 32, 8
-PUT_PRED 1, 32, 8
-PUT_PRED 0, 48, 8
-PUT_PRED 1, 48, 8
-PUT_PRED 0, 64, 8
-PUT_PRED 1, 64, 8
-
-PUT_PRED 0, 4,  10
-PUT_PRED 1, 4,  10
-PUT_PRED 0, 8,  10
-PUT_PRED 1, 8,  10
-PUT_PRED 0, 12, 10
-PUT_PRED 1, 12, 10
-PUT_PRED 0, 16, 10
-PUT_PRED 1, 16, 10
-PUT_PRED 0, 24, 10
-PUT_PRED 1, 24, 10
-PUT_PRED 0, 32, 10
-PUT_PRED 1, 32, 10
-PUT_PRED 0, 48, 10
-PUT_PRED 1, 48, 10
-PUT_PRED 0, 64, 10
-PUT_PRED 1, 64, 10
-
-%macro PUT_WEIGHTED_PRED 3
-%if %1
-cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
-%else
-cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
-%endif
-
-    and         denomd, 0xff
-    movsx       weight0d, weight0w
-    movsx       offset0d, offset0w
-%if %1
-    movsx       weight1d, weight1w
-    movsx       offset1d, offset1w
-%endif
-
-    add         denomd, 14 + %1 - %3
-    movd        m0, denomd
+INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
+
+WEIGHTING_FUNCS 2, 8
+WEIGHTING_FUNCS 4, 8
+WEIGHTING_FUNCS 6, 8
+WEIGHTING_FUNCS 8, 8
+
+WEIGHTING_FUNCS 2, 10
+WEIGHTING_FUNCS 4, 10
+WEIGHTING_FUNCS 6, 10
+WEIGHTING_FUNCS 8, 10
 
-%if %3 > 8
-    %assign     pixel_max ((1 << %3) - 1)
-    %define     pw_pixel_max pw_ %+ pixel_max
-    pxor        m4, m4
-    mova        m5, [pw_pixel_max]
+WEIGHTING_FUNCS 2, 12
+WEIGHTING_FUNCS 4, 12
+WEIGHTING_FUNCS 6, 12
+WEIGHTING_FUNCS 8, 12
 
-    shl         offset0d, %3 - 8
-%if %1
-    shl         offset1d, %3 - 8
-%endif
-%endif
+HEVC_PUT_HEVC_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_PEL_PIXELS  6, 8
+HEVC_PUT_HEVC_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
 
-%if %1
-    lea         offset0d, [offset0d + offset1d + 1]
-%else
-    lea         offset0d, [2 * offset0d + 1]
-%endif
-    movd        m1, offset0d
-    SPLATD      m1
-    pslld       m1, m0
-    psrad       m1, 1
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
 
-    movd        m2, weight0d
-    SPLATD      m2
-%if %1
-    movd        m3, weight1d
-    SPLATD      m3
-%endif
+HEVC_PUT_HEVC_PEL_PIXELS 2, 12
+HEVC_PUT_HEVC_PEL_PIXELS 4, 12
+HEVC_PUT_HEVC_PEL_PIXELS 6, 12
+HEVC_PUT_HEVC_PEL_PIXELS 8, 12
 
-.loop:
-%assign i 0
-%rep (%2 + 3) / 4
+HEVC_PUT_HEVC_EPEL 2,  8
+HEVC_PUT_HEVC_EPEL 4,  8
+HEVC_PUT_HEVC_EPEL 6,  8
+HEVC_PUT_HEVC_EPEL 8,  8
+HEVC_PUT_HEVC_EPEL 12, 8
+HEVC_PUT_HEVC_EPEL 16, 8
 
-    pmovsxwd   m6, [src0q + 8 * i]
-    pmulld     m6, m2
 
-%if %1
-    pmovsxwd   m7, [src1q + 8 * i]
-    pmulld     m7, m3
-    paddd      m6, m7
-%endif
+HEVC_PUT_HEVC_EPEL 2, 10
+HEVC_PUT_HEVC_EPEL 4, 10
+HEVC_PUT_HEVC_EPEL 6, 10
+HEVC_PUT_HEVC_EPEL 8, 10
 
-    paddd      m6, m1
-    psrad      m6, m0
+HEVC_PUT_HEVC_EPEL 2, 12
+HEVC_PUT_HEVC_EPEL 4, 12
+HEVC_PUT_HEVC_EPEL 6, 12
+HEVC_PUT_HEVC_EPEL 8, 12
 
-    packssdw   m6, m6
+HEVC_PUT_HEVC_EPEL_HV 2,  8
+HEVC_PUT_HEVC_EPEL_HV 4,  8
+HEVC_PUT_HEVC_EPEL_HV 6,  8
+HEVC_PUT_HEVC_EPEL_HV 8,  8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
 
-%if %3 > 8
-    CLIPW      m6, m4, m5
-    movq       [dstq + 8 * i], m6
-%else
-    packuswb   m6, m6
-    movd [dstq + 4 * i], m6
-%endif
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
 
-%assign i (i + 1)
-%endrep
+HEVC_PUT_HEVC_EPEL_HV 2, 12
+HEVC_PUT_HEVC_EPEL_HV 4, 12
+HEVC_PUT_HEVC_EPEL_HV 6, 12
+HEVC_PUT_HEVC_EPEL_HV 8, 12
 
-    add dstq,  dststrideq
-    add src0q, srcstrideq
-%if %1
-    add src1q, srcstrideq
-%endif
+HEVC_PUT_HEVC_QPEL 4,  8
+HEVC_PUT_HEVC_QPEL 8,  8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
 
-    dec         heightd
-    jg          .loop
-    RET
-%endmacro
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
 
-%if ARCH_X86_64
-INIT_XMM sse4
-PUT_WEIGHTED_PRED 0, 4,  8
-PUT_WEIGHTED_PRED 1, 4,  8
-PUT_WEIGHTED_PRED 0, 8,  8
-PUT_WEIGHTED_PRED 1, 8,  8
-PUT_WEIGHTED_PRED 0, 12, 8
-PUT_WEIGHTED_PRED 1, 12, 8
-PUT_WEIGHTED_PRED 0, 16, 8
-PUT_WEIGHTED_PRED 1, 16, 8
-PUT_WEIGHTED_PRED 0, 24, 8
-PUT_WEIGHTED_PRED 1, 24, 8
-PUT_WEIGHTED_PRED 0, 32, 8
-PUT_WEIGHTED_PRED 1, 32, 8
-PUT_WEIGHTED_PRED 0, 48, 8
-PUT_WEIGHTED_PRED 1, 48, 8
-PUT_WEIGHTED_PRED 0, 64, 8
-PUT_WEIGHTED_PRED 1, 64, 8
-
-PUT_WEIGHTED_PRED 0, 4,  10
-PUT_WEIGHTED_PRED 1, 4,  10
-PUT_WEIGHTED_PRED 0, 8,  10
-PUT_WEIGHTED_PRED 1, 8,  10
-PUT_WEIGHTED_PRED 0, 12, 10
-PUT_WEIGHTED_PRED 1, 12, 10
-PUT_WEIGHTED_PRED 0, 16, 10
-PUT_WEIGHTED_PRED 1, 16, 10
-PUT_WEIGHTED_PRED 0, 24, 10
-PUT_WEIGHTED_PRED 1, 24, 10
-PUT_WEIGHTED_PRED 0, 32, 10
-PUT_WEIGHTED_PRED 1, 32, 10
-PUT_WEIGHTED_PRED 0, 48, 10
-PUT_WEIGHTED_PRED 1, 48, 10
-PUT_WEIGHTED_PRED 0, 64, 10
-PUT_WEIGHTED_PRED 1, 64, 10
+HEVC_PUT_HEVC_QPEL 4, 12
+HEVC_PUT_HEVC_QPEL 8, 12
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 12
+HEVC_PUT_HEVC_QPEL_HV 4, 12
+HEVC_PUT_HEVC_QPEL_HV 6, 12
+HEVC_PUT_HEVC_QPEL_HV 8, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
 
+%endif ;AVX2
 %endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
new file mode 100644
index 0000000..756adfe
--- /dev/null
+++ b/libavcodec/x86/hevc_sao.asm
@@ -0,0 +1,340 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 0
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    pxor              m0, m0
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
+    psraw             %1, %2, 3
+%if ARCH_X86_64
+    pcmpeqw          m10, %1, m0
+    pcmpeqw          m11, %1, m1
+    pcmpeqw          m12, %1, m2
+    pcmpeqw           %1, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand              %1, m7
+    por              m10, m11
+    por              m12, %1
+    por              m10, m12
+    paddw             %2, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
+    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
+    pcmpeqw           %1, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %1, m7
+    por               m4, m5
+    por               m6, %1
+    por               m4, m6
+    paddw             %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 2
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %1 == 8
+    movq              m8, [srcq]
+    punpcklbw         m8, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+    packuswb          m8, m14
+    movq          [dstq], m8
+%endif ; %1 == 8
+
+%assign i 0
+%rep %2
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    REP_RET
+%endmacro
+
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER  8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+HEVC_SAO_BAND_FILTER 32, 2
+HEVC_SAO_BAND_FILTER 48, 2
+HEVC_SAO_BAND_FILTER 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER  8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 32, 1
+HEVC_SAO_BAND_FILTER 48, 1
+HEVC_SAO_BAND_FILTER 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE
+    imul       b_strideq, EDGE_SRCSTRIDE
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
+    pminub            m4, m1, m2
+    pminub            m5, m1, m3
+    pcmpeqb           m2, m4
+    pcmpeqb           m3, m5
+    pcmpeqb           m4, m1
+    pcmpeqb           m5, m1
+    psubb             m4, m2
+    psubb             m5, m3
+    paddb             m4, m6
+    paddb             m4, m5
+
+    pshufb            m2, m0, m4
+%if %1 > 8
+    punpckhbw         m5, m7, m1
+    punpckhbw         m4, m2, m7
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m5, m4
+    pmaddubsw         m3, m2
+    packuswb          m3, m5
+%else
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m3, m2
+    packuswb          m3, m3
+%endif
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                             int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 2-3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+    vbroadcasti128    m0, [offsetq]
+%else
+    movu              m0, [offsetq]
+%endif
+    mova              m1, [pb_edge_shuffle]
+    packsswb          m0, m0
+    mova              m7, [pb_1]
+    pshufb            m0, m1
+    mova              m6, [pb_2]
+%if ARCH_X86_32
+    mov          heightd, r6m
+%endif
+
+align 16
+.loop:
+
+%if %1 == 8
+    movq              m1, [srcq]
+    movq              m2, [srcq + a_strideq]
+    movq              m3, [srcq + b_strideq]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    movq          [dstq], m3
+%endif
+
+%assign i 0
+%rep %2
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mov%3     [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mova      [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+HEVC_SAO_EDGE_FILTER  8, 0
+HEVC_SAO_EDGE_FILTER 16, 1, a
+HEVC_SAO_EDGE_FILTER 32, 2, a
+HEVC_SAO_EDGE_FILTER 48, 2, a
+HEVC_SAO_EDGE_FILTER 64, 4, a
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 32, 1, a
+HEVC_SAO_EDGE_FILTER 48, 1, u
+HEVC_SAO_EDGE_FILTER 64, 2, a
+%endif
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
new file mode 100644
index 0000000..b30583d
--- /dev/null
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -0,0 +1,370 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m2:     times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
+pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 1
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    mova             m13, [pw_mask %+ %1]
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    mova              m1, [pw_mask %+ %1]
+    pxor              m0, m0
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 3
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT %1
+
+align 16
+.loop:
+
+%assign i 0
+%assign j 0
+%rep %3
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+    mova          m %+ k, [srcq + i]
+    psraw         m %+ l, m %+ k, %1-5
+%if ARCH_X86_64
+    pcmpeqw          m10, m %+ l, m0
+    pcmpeqw          m11, m %+ l, m1
+    pcmpeqw          m12, m %+ l, m2
+    pcmpeqw       m %+ l, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand          m %+ l, m7
+    por              m10, m11
+    por              m12, m %+ l
+    por              m10, m12
+    paddw         m %+ k, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
+    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
+    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
+    pcmpeqw       m %+ l, [rsp+mmsize*3]
+    pand              m4, [rsp+mmsize*4]
+    pand              m5, [rsp+mmsize*5]
+    pand              m6, [rsp+mmsize*6]
+    pand          m %+ l, m7
+    por               m4, m5
+    por               m6, m %+ l
+    por               m4, m6
+    paddw         m %+ k, m4
+%endif ; ARCH
+    CLIPW             m %+ k, m14, m13
+    mova      [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, srcstrideq
+    dec          heightd
+    jg .loop
+    REP_RET
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 10,  8, 1
+HEVC_SAO_BAND_FILTER 10, 16, 2
+HEVC_SAO_BAND_FILTER 10, 32, 4
+HEVC_SAO_BAND_FILTER 10, 48, 6
+HEVC_SAO_BAND_FILTER 10, 64, 8
+
+HEVC_SAO_BAND_FILTER 12,  8, 1
+HEVC_SAO_BAND_FILTER 12, 16, 2
+HEVC_SAO_BAND_FILTER 12, 32, 4
+HEVC_SAO_BAND_FILTER 12, 48, 6
+HEVC_SAO_BAND_FILTER 12, 64, 8
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 10,  8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 3
+HEVC_SAO_BAND_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 12,  8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 3
+HEVC_SAO_BAND_FILTER 12, 64, 4
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro PMINUW 4
+%if cpuflag(sse4)
+    pminuw            %1, %2, %3
+%else
+    psubusw           %4, %2, %3
+    psubw             %1, %2, %4
+%endif
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE >> 1
+    imul       b_strideq, EDGE_SRCSTRIDE >> 1
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                                   int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if mmsize > 16
+    SPLATW            m8, [offsetq+2]
+    SPLATW            m9, [offsetq+4]
+    SPLATW           m10, [offsetq+0]
+    SPLATW           m11, [offsetq+6]
+    SPLATW           m12, [offsetq+8]
+%else
+    movq             m10, [offsetq+0]
+    movd             m12, [offsetq+6]
+    SPLATW            m8, xm10, 1
+    SPLATW            m9, xm10, 2
+    SPLATW           m10, xm10, 0
+    SPLATW           m11, xm12, 0
+    SPLATW           m12, xm12, 1
+%endif
+    pxor              m0, m0
+%if ARCH_X86_64
+    mova             m13, [pw_m1]
+    mova             m14, [pw_1]
+    mova             m15, [pw_2]
+%else
+    mov          heightd, r6m
+    mova  [rsp+mmsize*0], m8
+    mova  [rsp+mmsize*1], m9
+    mova  [rsp+mmsize*2], m10
+    mova  [rsp+mmsize*3], m11
+    mova  [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%assign i 0
+%rep %3
+    mova              m1, [srcq + i]
+    movu              m2, [srcq+a_strideq + i]
+    movu              m3, [srcq+b_strideq + i]
+    PMINUW            m4, m1, m2, m6
+    PMINUW            m5, m1, m3, m7
+    pcmpeqw           m2, m4
+    pcmpeqw           m3, m5
+    pcmpeqw           m4, m1
+    pcmpeqw           m5, m1
+    psubw             m4, m2
+    psubw             m5, m3
+
+    paddw             m4, m5
+    pcmpeqw           m2, m4, [pw_m2]
+%if ARCH_X86_64
+    pcmpeqw           m3, m4, m13
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, m14
+    pcmpeqw           m7, m4, m15
+    pand              m2, m8
+    pand              m3, m9
+    pand              m5, m10
+    pand              m6, m11
+    pand              m7, m12
+%else
+    pcmpeqw           m3, m4, [pw_m1]
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, [pw_1]
+    pcmpeqw           m7, m4, [pw_2]
+    pand              m2, [rsp+mmsize*0]
+    pand              m3, [rsp+mmsize*1]
+    pand              m5, [rsp+mmsize*2]
+    pand              m6, [rsp+mmsize*3]
+    pand              m7, [rsp+mmsize*4]
+%endif
+    paddw             m2, m3
+    paddw             m5, m6
+    paddw             m2, m7
+    paddw             m2, m1
+    paddw             m2, m5
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova      [dstq + i], m2
+%assign i i+mmsize
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_EDGE_FILTER 10,  8, 1
+HEVC_SAO_EDGE_FILTER 10, 16, 2
+HEVC_SAO_EDGE_FILTER 10, 32, 4
+HEVC_SAO_EDGE_FILTER 10, 48, 6
+HEVC_SAO_EDGE_FILTER 10, 64, 8
+
+HEVC_SAO_EDGE_FILTER 12,  8, 1
+HEVC_SAO_EDGE_FILTER 12, 16, 2
+HEVC_SAO_EDGE_FILTER 12, 32, 4
+HEVC_SAO_EDGE_FILTER 12, 48, 6
+HEVC_SAO_EDGE_FILTER 12, 64, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 10,  8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 32, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 3
+HEVC_SAO_EDGE_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 12,  8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 12, 16, 1
+HEVC_SAO_EDGE_FILTER 12, 32, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 3
+HEVC_SAO_EDGE_FILTER 12, 64, 4
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
new file mode 100644
index 0000000..67be0a9
--- /dev/null
+++ b/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,259 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##6,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+        WEIGHTING_PROTOTYPE(2, bitd, opt); \
+        WEIGHTING_PROTOTYPE(4, bitd, opt); \
+        WEIGHTING_PROTOTYPE(6, bitd, opt); \
+        WEIGHTING_PROTOTYPE(8, bitd, opt); \
+        WEIGHTING_PROTOTYPE(12, bitd, opt); \
+        WEIGHTING_PROTOTYPE(16, bitd, opt); \
+        WEIGHTING_PROTOTYPE(24, bitd, opt); \
+        WEIGHTING_PROTOTYPE(32, bitd, opt); \
+        WEIGHTING_PROTOTYPE(48, bitd, opt); \
+        WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h ,  8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v ,  8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv ,  8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h ,  8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v,  8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv,  8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index a95fa30..17cd233 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -2,29 +2,31 @@
  * Copyright (c) 2013 Seppo Tomperi
  * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
 #include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-
+#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
 #include "libavcodec/hevcdsp.h"
+#include "libavcodec/x86/hevcdsp.h"
 
 #define LFC_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
@@ -32,43 +34,34 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix,
 #define LFL_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
 
-#define LFC_FUNCS(type, depth) \
-    LFC_FUNC(h, depth, sse2)   \
-    LFC_FUNC(v, depth, sse2)
-
-#define LFL_FUNCS(type, depth) \
-    LFL_FUNC(h, depth, ssse3)  \
-    LFL_FUNC(v, depth, ssse3)
-
-LFC_FUNCS(uint8_t, 8)
-LFC_FUNCS(uint8_t, 10)
-LFL_FUNCS(uint8_t, 8)
-LFL_FUNCS(uint8_t, 10)
-
-#define idct_dc_proto(size, bitd, opt) \
-                void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-
-idct_dc_proto(4, 8,mmxext);
-idct_dc_proto(8, 8,mmxext);
-idct_dc_proto(16,8,  sse2);
-idct_dc_proto(32,8,  sse2);
-
-idct_dc_proto(32,8,  avx2);
+#define LFC_FUNCS(type, depth, opt) \
+    LFC_FUNC(h, depth, opt)  \
+    LFC_FUNC(v, depth, opt)
 
-idct_dc_proto(4, 10,mmxext);
-idct_dc_proto(8, 10,  sse2);
-idct_dc_proto(16,10,  sse2);
-idct_dc_proto(32,10,  sse2);
-idct_dc_proto(8, 10,   avx);
-idct_dc_proto(16,10,   avx);
-idct_dc_proto(32,10,   avx);
+#define LFL_FUNCS(type, depth, opt) \
+    LFL_FUNC(h, depth, opt)  \
+    LFL_FUNC(v, depth, opt)
 
-idct_dc_proto(16,10,  avx2);
-idct_dc_proto(32,10,  avx2);
+LFC_FUNCS(uint8_t,   8, sse2)
+LFC_FUNCS(uint8_t,  10, sse2)
+LFC_FUNCS(uint8_t,  12, sse2)
+LFC_FUNCS(uint8_t,   8, avx)
+LFC_FUNCS(uint8_t,  10, avx)
+LFC_FUNCS(uint8_t,  12, avx)
+LFL_FUNCS(uint8_t,   8, sse2)
+LFL_FUNCS(uint8_t,  10, sse2)
+LFL_FUNCS(uint8_t,  12, sse2)
+LFL_FUNCS(uint8_t,   8, ssse3)
+LFL_FUNCS(uint8_t,  10, ssse3)
+LFL_FUNCS(uint8_t,  12, ssse3)
+LFL_FUNCS(uint8_t,   8, avx)
+LFL_FUNCS(uint8_t,  10, avx)
+LFL_FUNCS(uint8_t,  12, avx)
 
 #define IDCT_DC_FUNCS(W, opt) \
 void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
-void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
+void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
+void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
 
 IDCT_DC_FUNCS(4x4,   mmxext);
 IDCT_DC_FUNCS(8x8,   mmxext);
@@ -91,208 +84,631 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
 IDCT_FUNCS(sse2)
 IDCT_FUNCS(avx)
 
-void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-
-#define GET_PIXELS(width, depth, cf)                                                                      \
-void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride,             \
-                                                           uint8_t *src, ptrdiff_t srcstride,             \
-                                                           int height, int mx, int my, int16_t *mcbuffer);
-
-GET_PIXELS(4,  8, sse2)
-GET_PIXELS(8,  8, sse2)
-GET_PIXELS(12, 8, sse2)
-GET_PIXELS(16, 8, sse2)
-GET_PIXELS(24, 8, sse2)
-GET_PIXELS(32, 8, sse2)
-GET_PIXELS(48, 8, sse2)
-GET_PIXELS(64, 8, sse2)
-
-GET_PIXELS(4,  10, sse2)
-GET_PIXELS(8,  10, sse2)
-GET_PIXELS(12, 10, sse2)
-GET_PIXELS(16, 10, sse2)
-GET_PIXELS(24, 10, sse2)
-GET_PIXELS(32, 10, sse2)
-GET_PIXELS(48, 10, sse2)
-GET_PIXELS(64, 10, sse2)
-
-/* those are independent of the bit depth, so declared separately */
-#define INTERP_HV_FUNC(width, cf)                                                         \
-void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
-                                          int16_t *src, ptrdiff_t srcstride,              \
-                                          int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
-                                          int16_t *src, ptrdiff_t srcstride,              \
-                                          int height, int mx, int my, int16_t *mcbuffer);
-
-INTERP_HV_FUNC(4,  avx)
-INTERP_HV_FUNC(8,  avx)
-INTERP_HV_FUNC(12, avx)
-INTERP_HV_FUNC(16, avx)
-INTERP_HV_FUNC(24, avx)
-INTERP_HV_FUNC(32, avx)
-INTERP_HV_FUNC(48, avx)
-INTERP_HV_FUNC(64, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
-static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
-                                                               uint8_t *src, ptrdiff_t srcstride,             \
-                                                               int height, int mx, int my, int16_t *mcbuffer) \
+#define mc_rep_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
+                                                uint8_t *_src, ptrdiff_t _srcstride, int height,                \
+                                                intptr_t mx, intptr_t my, int width)                            \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    int16_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) {                                                                             \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst = _dst + i;                                                                                         \
+        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);            \
+    }                                                                                                           \
+}
+#define mc_rep_uni_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *_src, ptrdiff_t _srcstride, int height,            \
+                                                    intptr_t mx, intptr_t my, int width)                        \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    uint8_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) {                                                                             \
+        src = _src + (i * ((bitd + 7) / 8));                                                                    \
+        dst = _dst + (i * ((bitd + 7) / 8));                                                                    \
+        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                     \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+#define mc_rep_bi_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src,          \
+                                                   ptrdiff_t _srcstride, int16_t* _src2,                        \
+                                                   int height, intptr_t mx, intptr_t my, int width)             \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t  *src;                                                                                              \
+    uint8_t  *dst;                                                                                              \
+    int16_t  *src2;                                                                                             \
+    for (i = 0; i < W ; i += step) {                                                                            \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst  = _dst + (i * ((bitd + 7) / 8));                                                                   \
+        src2 = _src2 + i;                                                                                       \
+        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt)        \
+    mc_rep_func(name, bitd, step, W, opt)            \
+    mc_rep_uni_func(name, bitd, step, W, opt)        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                                                  \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,                \
+                                                 intptr_t mx, intptr_t my, int width)                           \
+{                                                                                                               \
+    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);               \
+    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)),              \
+                                                    _srcstride, height, mx, my, width);                         \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,                         \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,            \
+                                                     intptr_t mx, intptr_t my, int width)                       \
+{                                                                                                               \
+    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,            \
+                                                        src + (step1 * ((bitd + 7) / 8)), _srcstride,           \
+                                                        height, mx, my, width);                                 \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,            \
+                                                    ptrdiff_t _srcstride, int16_t* src2,                        \
+                                                    int height, intptr_t mx, intptr_t my, int width)            \
+{                                                                                                               \
+    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,             \
+                                                       src + (step1 * ((bitd + 7) / 8)), _srcstride,            \
+                                                       src2 + step1, height, mx, my, width);                    \
+}
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
+    mc_rep_func2(name, bitd, step1, step2, W, opt)      \
+    mc_rep_uni_func2(name, bitd, step1, step2, W, opt)  \
+    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                       \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,            \
+                                                 int height, intptr_t mx, intptr_t my, int width)             \
+                                                                                                              \
+{                                                                                                             \
+    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width);                 \
+    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                    \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,          \
+                                                    ptrdiff_t _srcstride, int16_t *src2,                      \
+                                                    int height, intptr_t mx, intptr_t my, int width)          \
 {                                                                                                             \
-    const ptrdiff_t stride = FFALIGN(width + 7, 8);                                                           \
-    ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
-                                                        height + 7, mx, my, mcbuffer);                        \
-    ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride,                \
-                                            height, mx, my, mcbuffer);                                        \
+    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2,                     \
+                                                   height, mx, my, width);                                    \
+    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
+                                                   height, mx, my, width);                                    \
 }
-#else
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
-void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
-QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
-QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
-
-QPEL_FUNCS(4,  10, avx, avx, avx)
-QPEL_FUNCS(8,  10, avx, avx, avx)
-QPEL_FUNCS(12, 10, avx, avx, avx)
-QPEL_FUNCS(16, 10, avx, avx, avx)
-QPEL_FUNCS(24, 10, avx, avx, avx)
-QPEL_FUNCS(32, 10, avx, avx, avx)
-QPEL_FUNCS(48, 10, avx, avx, avx)
-QPEL_FUNCS(64, 10, avx, avx, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
-static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
-                                                               uint8_t *src, ptrdiff_t srcstride,             \
-                                                               int height, int mx, int my, int16_t *mcbuffer) \
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                   \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride,                       \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,          \
+                                                     intptr_t mx, intptr_t my, int width)                     \
 {                                                                                                             \
-    const ptrdiff_t stride = FFALIGN(width + 3, 8);                                                           \
-    ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride,     \
-                                                        height + 3, mx, my, mcbuffer);                        \
-    ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride,                    \
-                                            height, mx, my, mcbuffer);                                        \
+    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride,                          \
+                                                      height, mx, my, width);                                 \
+    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride,            \
+                                                      height, mx, my, width);                                 \
 }
-#else
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
-void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
-EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
-EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-
-EPEL_FUNCS(4,  10, avx, avx, avx)
-EPEL_FUNCS(8,  10, avx, avx, avx)
-EPEL_FUNCS(12, 10, avx, avx, avx)
-EPEL_FUNCS(16, 10, avx, avx, avx)
-EPEL_FUNCS(24, 10, avx, avx, avx)
-EPEL_FUNCS(32, 10, avx, avx, avx)
-
-#define PUT_PRED(width, depth, cf_uw, cf_w) \
-void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,                   \
-                                                                       int16_t *src, ptrdiff_t srcstride,                   \
-                                                                       int height);                                         \
-void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,               \
-                                                                           int16_t *src1, int16_t *src2,                    \
-                                                                           ptrdiff_t srcstride, int height);                \
-void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset,          \
-                                                                    uint8_t *dst, ptrdiff_t dststride,                      \
-                                                                    int16_t *src, ptrdiff_t srcstride,                      \
-                                                                    int height);                                            \
-void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1,    \
-                                                                        int16_t offset0, int16_t offset1,                   \
-                                                                        uint8_t *dst, ptrdiff_t dststride,                  \
-                                                                        int16_t *src0, int16_t *src1, ptrdiff_t srcstride,  \
-                                                                        int height);
-
-PUT_PRED(4,  8, sse2, sse4)
-PUT_PRED(8,  8, sse2, sse4)
-PUT_PRED(12, 8, sse2, sse4)
-PUT_PRED(16, 8, sse2, sse4)
-PUT_PRED(24, 8, sse2, sse4)
-PUT_PRED(32, 8, sse2, sse4)
-PUT_PRED(48, 8, sse2, sse4)
-PUT_PRED(64, 8, sse2, sse4)
-
-PUT_PRED(4,  10, sse2, sse4)
-PUT_PRED(8,  10, sse2, sse4)
-PUT_PRED(12, 10, sse2, sse4)
-PUT_PRED(16, 10, sse2, sse4)
-PUT_PRED(24, 10, sse2, sse4)
-PUT_PRED(32, 10, sse2, sse4)
-PUT_PRED(48, 10, sse2, sse4)
-PUT_PRED(64, 10, sse2, sse4)
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)   \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)            \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)         \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                                \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,             \
+                                                int height, intptr_t mx, intptr_t my, int width)              \
+                                                                                                              \
+{                                                                                                             \
+    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width);                  \
+    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width);  \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                             \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,           \
+                                                   ptrdiff_t _srcstride, int16_t* src2,                       \
+                                                   int height, intptr_t mx, intptr_t my, int width)           \
+{                                                                                                             \
+    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                            \
+                                                  src2, height, mx, my, width);                               \
+    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,              \
+                                                  src2+width2, height, mx, my, width);                        \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                            \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *src, ptrdiff_t _srcstride, int height,           \
+                                                    intptr_t mx, intptr_t my, int width)                      \
+{                                                                                                             \
+    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                           \
+                                                   height, mx, my, width);                                    \
+    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,             \
+                                                   height, mx, my, width);                                    \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)   \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)            \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)         \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_h ,    48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_v ,    48, 32, 16, avx2, sse4)
+
+mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
+mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_hv,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_h ,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_v ,   24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_mixs_10(qpel_h ,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_v ,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_hv,   24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)//used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) //used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2)
+mc_rep_funcs(epel_h, 10, 16, 48, avx2)
+mc_rep_funcs(epel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2)
+mc_rep_funcs(epel_v, 10, 16, 48, avx2)
+mc_rep_funcs(epel_v, 10, 32, 64, avx2)
+
+
+mc_rep_funcs(epel_hv,  8, 32, 64, avx2)
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_h ,  48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_v,  48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
+
+#endif //AVX2
+
+mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
+mc_rep_funcs(pel_pixels, 8,  8, 24, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 64, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 48, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 32, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 24, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 16, sse4)
+mc_rep_funcs(pel_pixels,10,  4, 12, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 64, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 48, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 32, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 24, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 16, sse4)
+mc_rep_funcs(pel_pixels,12,  4, 12, sse4)
+
+mc_rep_funcs(epel_h, 8, 16, 64, sse4)
+mc_rep_funcs(epel_h, 8, 16, 48, sse4)
+mc_rep_funcs(epel_h, 8, 16, 32, sse4)
+mc_rep_funcs(epel_h, 8,  8, 24, sse4)
+mc_rep_funcs(epel_h,10,  8, 64, sse4)
+mc_rep_funcs(epel_h,10,  8, 48, sse4)
+mc_rep_funcs(epel_h,10,  8, 32, sse4)
+mc_rep_funcs(epel_h,10,  8, 24, sse4)
+mc_rep_funcs(epel_h,10,  8, 16, sse4)
+mc_rep_funcs(epel_h,10,  4, 12, sse4)
+mc_rep_funcs(epel_h,12,  8, 64, sse4)
+mc_rep_funcs(epel_h,12,  8, 48, sse4)
+mc_rep_funcs(epel_h,12,  8, 32, sse4)
+mc_rep_funcs(epel_h,12,  8, 24, sse4)
+mc_rep_funcs(epel_h,12,  8, 16, sse4)
+mc_rep_funcs(epel_h,12,  4, 12, sse4)
+mc_rep_funcs(epel_v, 8, 16, 64, sse4)
+mc_rep_funcs(epel_v, 8, 16, 48, sse4)
+mc_rep_funcs(epel_v, 8, 16, 32, sse4)
+mc_rep_funcs(epel_v, 8,  8, 24, sse4)
+mc_rep_funcs(epel_v,10,  8, 64, sse4)
+mc_rep_funcs(epel_v,10,  8, 48, sse4)
+mc_rep_funcs(epel_v,10,  8, 32, sse4)
+mc_rep_funcs(epel_v,10,  8, 24, sse4)
+mc_rep_funcs(epel_v,10,  8, 16, sse4)
+mc_rep_funcs(epel_v,10,  4, 12, sse4)
+mc_rep_funcs(epel_v,12,  8, 64, sse4)
+mc_rep_funcs(epel_v,12,  8, 48, sse4)
+mc_rep_funcs(epel_v,12,  8, 32, sse4)
+mc_rep_funcs(epel_v,12,  8, 24, sse4)
+mc_rep_funcs(epel_v,12,  8, 16, sse4)
+mc_rep_funcs(epel_v,12,  4, 12, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
+mc_rep_funcs(epel_hv, 8,  8, 24, sse4)
+mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4)
+mc_rep_funcs(epel_hv,10,  8, 64, sse4)
+mc_rep_funcs(epel_hv,10,  8, 48, sse4)
+mc_rep_funcs(epel_hv,10,  8, 32, sse4)
+mc_rep_funcs(epel_hv,10,  8, 24, sse4)
+mc_rep_funcs(epel_hv,10,  8, 16, sse4)
+mc_rep_funcs(epel_hv,10,  4, 12, sse4)
+mc_rep_funcs(epel_hv,12,  8, 64, sse4)
+mc_rep_funcs(epel_hv,12,  8, 48, sse4)
+mc_rep_funcs(epel_hv,12,  8, 32, sse4)
+mc_rep_funcs(epel_hv,12,  8, 24, sse4)
+mc_rep_funcs(epel_hv,12,  8, 16, sse4)
+mc_rep_funcs(epel_hv,12,  4, 12, sse4)
+
+mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_h, 8,  8, 24, sse4)
+mc_rep_funcs(qpel_h,10,  8, 64, sse4)
+mc_rep_funcs(qpel_h,10,  8, 48, sse4)
+mc_rep_funcs(qpel_h,10,  8, 32, sse4)
+mc_rep_funcs(qpel_h,10,  8, 24, sse4)
+mc_rep_funcs(qpel_h,10,  8, 16, sse4)
+mc_rep_funcs(qpel_h,10,  4, 12, sse4)
+mc_rep_funcs(qpel_h,12,  8, 64, sse4)
+mc_rep_funcs(qpel_h,12,  8, 48, sse4)
+mc_rep_funcs(qpel_h,12,  8, 32, sse4)
+mc_rep_funcs(qpel_h,12,  8, 24, sse4)
+mc_rep_funcs(qpel_h,12,  8, 16, sse4)
+mc_rep_funcs(qpel_h,12,  4, 12, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_v, 8,  8, 24, sse4)
+mc_rep_funcs(qpel_v,10,  8, 64, sse4)
+mc_rep_funcs(qpel_v,10,  8, 48, sse4)
+mc_rep_funcs(qpel_v,10,  8, 32, sse4)
+mc_rep_funcs(qpel_v,10,  8, 24, sse4)
+mc_rep_funcs(qpel_v,10,  8, 16, sse4)
+mc_rep_funcs(qpel_v,10,  4, 12, sse4)
+mc_rep_funcs(qpel_v,12,  8, 64, sse4)
+mc_rep_funcs(qpel_v,12,  8, 48, sse4)
+mc_rep_funcs(qpel_v,12,  8, 32, sse4)
+mc_rep_funcs(qpel_v,12,  8, 24, sse4)
+mc_rep_funcs(qpel_v,12,  8, 16, sse4)
+mc_rep_funcs(qpel_v,12,  4, 12, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 64, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 48, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 32, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 24, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 16, sse4)
+mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 64, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 48, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 32, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 24, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 16, sse4)
+mc_rep_funcs(qpel_hv,10,  4, 12, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 64, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 48, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 32, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 24, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 16, sse4)
+mc_rep_funcs(qpel_hv,12,  4, 12, sse4)
+
+#define mc_rep_uni_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+                                               int height, int denom,  int _wx, int _ox)                                \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) {                                                                                     \
+        src= _src + i;                                                                                                  \
+        dst= _dst + (i * ((bitd + 7) / 8));                                                                             \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src,                                   \
+                                                     height, denom, _wx, _ox);                                          \
+    }                                                                                                                   \
+}
+
+mc_rep_uni_w(8, 6, 12, sse4)
+mc_rep_uni_w(8, 8, 16, sse4)
+mc_rep_uni_w(8, 8, 24, sse4)
+mc_rep_uni_w(8, 8, 32, sse4)
+mc_rep_uni_w(8, 8, 48, sse4)
+mc_rep_uni_w(8, 8, 64, sse4)
+
+mc_rep_uni_w(10, 6, 12, sse4)
+mc_rep_uni_w(10, 8, 16, sse4)
+mc_rep_uni_w(10, 8, 24, sse4)
+mc_rep_uni_w(10, 8, 32, sse4)
+mc_rep_uni_w(10, 8, 48, sse4)
+mc_rep_uni_w(10, 8, 64, sse4)
+
+mc_rep_uni_w(12, 6, 12, sse4)
+mc_rep_uni_w(12, 8, 16, sse4)
+mc_rep_uni_w(12, 8, 24, sse4)
+mc_rep_uni_w(12, 8, 32, sse4)
+mc_rep_uni_w(12, 8, 48, sse4)
+mc_rep_uni_w(12, 8, 64, sse4)
+
+#define mc_rep_bi_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+                                              int16_t *_src2, int height,                                               \
+                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)                      \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    int16_t *src2;                                                                                                      \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) {                                                                                     \
+        src  = _src  + i;                                                                                               \
+        src2 = _src2 + i;                                                                                               \
+        dst  = _dst  + (i * ((bitd + 7) / 8));                                                                          \
+        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2,                             \
+                                                     height, denom, _wx0, _wx1, _ox0, _ox1);                             \
+    }                                                                                                                   \
+}
+
+mc_rep_bi_w(8, 6, 12, sse4)
+mc_rep_bi_w(8, 8, 16, sse4)
+mc_rep_bi_w(8, 8, 24, sse4)
+mc_rep_bi_w(8, 8, 32, sse4)
+mc_rep_bi_w(8, 8, 48, sse4)
+mc_rep_bi_w(8, 8, 64, sse4)
+
+mc_rep_bi_w(10, 6, 12, sse4)
+mc_rep_bi_w(10, 8, 16, sse4)
+mc_rep_bi_w(10, 8, 24, sse4)
+mc_rep_bi_w(10, 8, 32, sse4)
+mc_rep_bi_w(10, 8, 48, sse4)
+mc_rep_bi_w(10, 8, 64, sse4)
+
+mc_rep_bi_w(12, 6, 12, sse4)
+mc_rep_bi_w(12, 8, 16, sse4)
+mc_rep_bi_w(12, 8, 24, sse4)
+mc_rep_bi_w(12, 8, 32, sse4)
+mc_rep_bi_w(12, 8, 48, sse4)
+mc_rep_bi_w(12, 8, 64, sse4)
+
+#define mc_uni_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
+                                                      uint8_t *_src, ptrdiff_t _srcstride,          \
+                                                      int height, int denom,                        \
+                                                      int _wx, int _ox,                             \
+                                                      intptr_t mx, intptr_t my, int width)          \
+{                                                                                                   \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                            \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);     \
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
+}
+
+#define mc_uni_w_funcs(name, bitd, opt)      \
+        mc_uni_w_func(name, bitd, 4, opt)    \
+        mc_uni_w_func(name, bitd, 8, opt)    \
+        mc_uni_w_func(name, bitd, 12, opt)   \
+        mc_uni_w_func(name, bitd, 16, opt)   \
+        mc_uni_w_func(name, bitd, 24, opt)   \
+        mc_uni_w_func(name, bitd, 32, opt)   \
+        mc_uni_w_func(name, bitd, 48, opt)   \
+        mc_uni_w_func(name, bitd, 64, opt)
+
+mc_uni_w_funcs(pel_pixels, 8, sse4)
+mc_uni_w_func(pel_pixels, 8, 6, sse4)
+mc_uni_w_funcs(epel_h, 8, sse4)
+mc_uni_w_func(epel_h, 8, 6, sse4)
+mc_uni_w_funcs(epel_v, 8, sse4)
+mc_uni_w_func(epel_v, 8, 6, sse4)
+mc_uni_w_funcs(epel_hv, 8, sse4)
+mc_uni_w_func(epel_hv, 8, 6, sse4)
+mc_uni_w_funcs(qpel_h, 8, sse4)
+mc_uni_w_funcs(qpel_v, 8, sse4)
+mc_uni_w_funcs(qpel_hv, 8, sse4)
+
+mc_uni_w_funcs(pel_pixels, 10, sse4)
+mc_uni_w_func(pel_pixels, 10, 6, sse4)
+mc_uni_w_funcs(epel_h, 10, sse4)
+mc_uni_w_func(epel_h, 10, 6, sse4)
+mc_uni_w_funcs(epel_v, 10, sse4)
+mc_uni_w_func(epel_v, 10, 6, sse4)
+mc_uni_w_funcs(epel_hv, 10, sse4)
+mc_uni_w_func(epel_hv, 10, 6, sse4)
+mc_uni_w_funcs(qpel_h, 10, sse4)
+mc_uni_w_funcs(qpel_v, 10, sse4)
+mc_uni_w_funcs(qpel_hv, 10, sse4)
+
+mc_uni_w_funcs(pel_pixels, 12, sse4)
+mc_uni_w_func(pel_pixels, 12, 6, sse4)
+mc_uni_w_funcs(epel_h, 12, sse4)
+mc_uni_w_func(epel_h, 12, 6, sse4)
+mc_uni_w_funcs(epel_v, 12, sse4)
+mc_uni_w_func(epel_v, 12, 6, sse4)
+mc_uni_w_funcs(epel_hv, 12, sse4)
+mc_uni_w_func(epel_hv, 12, 6, sse4)
+mc_uni_w_funcs(qpel_h, 12, sse4)
+mc_uni_w_funcs(qpel_v, 12, sse4)
+mc_uni_w_funcs(qpel_hv, 12, sse4)
+
+#define mc_bi_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
+                                                     uint8_t *_src, ptrdiff_t _srcstride,            \
+                                                     int16_t *_src2,                                 \
+                                                     int height, int denom,                          \
+                                                     int _wx0, int _wx1, int _ox0, int _ox1,         \
+                                                     intptr_t mx, intptr_t my, int width)            \
+{                                                                                                    \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                             \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
+    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2,                         \
+                                              height, denom, _wx0, _wx1, _ox0, _ox1);                \
+}
+
+#define mc_bi_w_funcs(name, bitd, opt)      \
+        mc_bi_w_func(name, bitd, 4, opt)    \
+        mc_bi_w_func(name, bitd, 8, opt)    \
+        mc_bi_w_func(name, bitd, 12, opt)   \
+        mc_bi_w_func(name, bitd, 16, opt)   \
+        mc_bi_w_func(name, bitd, 24, opt)   \
+        mc_bi_w_func(name, bitd, 32, opt)   \
+        mc_bi_w_func(name, bitd, 48, opt)   \
+        mc_bi_w_func(name, bitd, 64, opt)
+
+mc_bi_w_funcs(pel_pixels, 8, sse4)
+mc_bi_w_func(pel_pixels, 8, 6, sse4)
+mc_bi_w_funcs(epel_h, 8, sse4)
+mc_bi_w_func(epel_h, 8, 6, sse4)
+mc_bi_w_funcs(epel_v, 8, sse4)
+mc_bi_w_func(epel_v, 8, 6, sse4)
+mc_bi_w_funcs(epel_hv, 8, sse4)
+mc_bi_w_func(epel_hv, 8, 6, sse4)
+mc_bi_w_funcs(qpel_h, 8, sse4)
+mc_bi_w_funcs(qpel_v, 8, sse4)
+mc_bi_w_funcs(qpel_hv, 8, sse4)
+
+mc_bi_w_funcs(pel_pixels, 10, sse4)
+mc_bi_w_func(pel_pixels, 10, 6, sse4)
+mc_bi_w_funcs(epel_h, 10, sse4)
+mc_bi_w_func(epel_h, 10, 6, sse4)
+mc_bi_w_funcs(epel_v, 10, sse4)
+mc_bi_w_func(epel_v, 10, 6, sse4)
+mc_bi_w_funcs(epel_hv, 10, sse4)
+mc_bi_w_func(epel_hv, 10, 6, sse4)
+mc_bi_w_funcs(qpel_h, 10, sse4)
+mc_bi_w_funcs(qpel_v, 10, sse4)
+mc_bi_w_funcs(qpel_hv, 10, sse4)
+
+mc_bi_w_funcs(pel_pixels, 12, sse4)
+mc_bi_w_func(pel_pixels, 12, 6, sse4)
+mc_bi_w_funcs(epel_h, 12, sse4)
+mc_bi_w_func(epel_h, 12, 6, sse4)
+mc_bi_w_funcs(epel_v, 12, sse4)
+mc_bi_w_func(epel_v, 12, 6, sse4)
+mc_bi_w_funcs(epel_hv, 12, sse4)
+mc_bi_w_func(epel_hv, 12, 6, sse4)
+mc_bi_w_funcs(qpel_h, 12, sse4)
+mc_bi_w_funcs(qpel_v, 12, sse4)
+mc_bi_w_funcs(qpel_hv, 12, sse4)
+#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define SAO_BAND_FILTER_FUNCS(bitd, opt)                                                                                   \
+void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,  \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+SAO_BAND_FILTER_FUNCS(8,  sse2)
+SAO_BAND_FILTER_FUNCS(10, sse2)
+SAO_BAND_FILTER_FUNCS(12, sse2)
+SAO_BAND_FILTER_FUNCS(8,   avx)
+SAO_BAND_FILTER_FUNCS(10,  avx)
+SAO_BAND_FILTER_FUNCS(12,  avx)
+SAO_BAND_FILTER_FUNCS(8,  avx2)
+SAO_BAND_FILTER_FUNCS(10, avx2)
+SAO_BAND_FILTER_FUNCS(12, avx2)
+
+#define SAO_BAND_INIT(bitd, opt) do {                                       \
+    c->sao_band_filter[0]      = ff_hevc_sao_band_filter_8_##bitd##_##opt;  \
+    c->sao_band_filter[1]      = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
+    c->sao_band_filter[2]      = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
+    c->sao_band_filter[3]      = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
+    c->sao_band_filter[4]      = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define SAO_EDGE_FILTER_FUNCS(bitd, opt)                                                                                    \
+void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,  \
+                                              int eo, int width, int height);                                               \
+void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+
+SAO_EDGE_FILTER_FUNCS(8, ssse3)
+SAO_EDGE_FILTER_FUNCS(8, avx2)
+SAO_EDGE_FILTER_FUNCS(10, sse2)
+SAO_EDGE_FILTER_FUNCS(10, avx2)
+SAO_EDGE_FILTER_FUNCS(12, sse2)
+SAO_EDGE_FILTER_FUNCS(12, avx2)
+
+#define SAO_EDGE_INIT(bitd, opt) do {                                       \
+    c->sao_edge_filter[0]      = ff_hevc_sao_edge_filter_8_##bitd##_##opt;  \
+    c->sao_edge_filter[1]      = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
+    c->sao_edge_filter[2]      = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
+    c->sao_edge_filter[3]      = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
+    c->sao_edge_filter[4]      = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )           \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
+#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)           \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
-    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
-    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
-    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
-    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
-    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
-    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
-    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
-    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
-
-#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
-    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
-    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
-    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
-    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
-    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
-    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
-
-#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
-#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
-
     if (bit_depth == 8) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
@@ -303,10 +719,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 
-            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
-            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
-            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
+                c->idct[2] = ff_hevc_idct_16x16_8_sse2;
+                c->idct[3] = ff_hevc_idct_32x32_8_sse2;
+            }
+            SAO_BAND_INIT(8, sse2);
 
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
@@ -315,41 +735,166 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0]    = ff_hevc_idct_4x4_8_sse2;
             c->idct[1]    = ff_hevc_idct_8x8_8_sse2;
 
-            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
-            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
-
-            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
-            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
+            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
-            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
-            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
-            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
-            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+            if(ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+            }
+            SAO_EDGE_INIT(8, ssse3);
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
         }
         if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+
+                c->idct[2] = ff_hevc_idct_16x16_8_avx;
+                c->idct[3] = ff_hevc_idct_32x32_8_avx;
+            }
+            SAO_BAND_INIT(8, avx);
+
             c->idct[0] = ff_hevc_idct_4x4_8_avx;
             c->idct[1] = ff_hevc_idct_8x8_8_avx;
+
             c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
             c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
             c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
+            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
+            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
+        }
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+            if (ARCH_X86_64) {
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+            }
+            SAO_BAND_INIT(8, avx2);
+
+            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
             c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
-
-            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+
+                c->idct[2] = ff_hevc_idct_16x16_10_sse2;
+                c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+            }
+            SAO_BAND_INIT(10, sse2);
+            SAO_EDGE_INIT(10, sse2);
 
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
@@ -357,89 +902,250 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 
             c->idct[0]    = ff_hevc_idct_4x4_10_sse2;
             c->idct[1]    = ff_hevc_idct_8x8_10_sse2;
-            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
-            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
-
-            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
-            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
 
             c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
             c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
             c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
         }
+        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+        }
         if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+
+                c->idct[2] = ff_hevc_idct_16x16_10_avx;
+                c->idct[3] = ff_hevc_idct_32x32_10_avx;
+            }
+
             c->idct[0] = ff_hevc_idct_4x4_10_avx;
             c->idct[1] = ff_hevc_idct_8x8_10_avx;
+
+            SAO_BAND_INIT(10, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
-            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
-            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
+            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
         }
-    }
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+            if (ARCH_X86_64) {
+                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 
-#if ARCH_X86_64
-    if (bit_depth == 8) {
-        if (EXTERNAL_SSE2(cpu_flags)) {
-            c->idct[2] = ff_hevc_idct_16x16_8_sse2;
-            c->idct[3] = ff_hevc_idct_32x32_8_sse2;
-        }
-        if (EXTERNAL_SSSE3(cpu_flags)) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
-        }
+                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 
-        if (EXTERNAL_SSE4(cpu_flags)) {
-            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
-            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
-        }
+                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 
-        if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
-            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
-            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
-            c->idct[2] = ff_hevc_idct_16x16_8_avx;
-            c->idct[3] = ff_hevc_idct_32x32_8_avx;
+                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+            }
+            SAO_BAND_INIT(10, avx2);
+            SAO_EDGE_INIT(10, avx2);
+
+            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
         }
-        if (EXTERNAL_AVX2(cpu_flags)) {
-            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
-            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+    } else if (bit_depth == 12) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
         }
-    } else if (bit_depth == 10) {
         if (EXTERNAL_SSE2(cpu_flags)) {
-            c->idct[2] = ff_hevc_idct_16x16_10_sse2;
-            c->idct[3] = ff_hevc_idct_32x32_10_sse2;
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+            }
+            SAO_BAND_INIT(12, sse2);
+            SAO_EDGE_INIT(12, sse2);
+
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
         }
-        if (EXTERNAL_SSSE3(cpu_flags)) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
         }
-        if (EXTERNAL_SSE4(cpu_flags)) {
-            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
-            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
         }
         if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
-            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
-            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
-            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
-            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
-            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
-            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
-            c->idct[2] = ff_hevc_idct_16x16_10_avx;
-            c->idct[3] = ff_hevc_idct_32x32_10_avx;
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+            }
+            SAO_BAND_INIT(12, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
-            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
-            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
+        }
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
+
+            SAO_BAND_INIT(12, avx2);
+            SAO_EDGE_INIT(12, avx2);
         }
     }
-#endif /* ARCH_X86_64 */
 }
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 8e21114..ce5d7a4 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
 ;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
 ;* SIMD-optimized halfpel functions
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -22,26 +29,49 @@
 
 SECTION_RODATA
 cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
 
 SECTION .text
 
 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
 cglobal put_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     add          r1, r4
     add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -99,6 +129,9 @@ INIT_MMX mmxext
 PUT_PIXELS_16
 INIT_MMX 3dnow
 PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
 
 
 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -144,20 +177,24 @@ PUT_NO_RND_PIXELS8_X2
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
 cglobal put_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
     PAVGB        m2, m1
@@ -174,6 +211,9 @@ INIT_MMX mmxext
 PUT_PIXELS8_Y2
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
 
 
 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -245,26 +285,48 @@ AVG_PIXELS8
 
 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
 cglobal avg_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
+%if notcpuflag(mmxext)
+    pcmpeqd      m5, m5
+    paddb        m5, m5
+%endif
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
     add          r0, r4
     add          r1, r4
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     mova       [r0], m0
     mova    [r0+r2], m2
     add          r0, r4
@@ -273,40 +335,45 @@ cglobal avg_pixels8_x2, 4,5
     REP_RET
 %endmacro
 
+INIT_MMX mmx
+AVG_PIXELS8_X2
 INIT_MMX mmxext
 AVG_PIXELS8_X2
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
 
 
 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
 cglobal avg_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m0, m3
-    PAVGB        m1, m4
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m2, m3
-    PAVGB        m1, m4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -319,11 +386,16 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
+; Note this is not correctly rounded, and is therefore used for
+; not-bitexact output
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
     mova         m6, [pb_1]
     lea          r4, [r2*2]
     mova         m0, [r1]
@@ -360,6 +432,160 @@ cglobal avg_pixels8_xy2, 4,5
 %endmacro
 
 INIT_MMX mmxext
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+    pxor        m7, m7
+    mova        m6, [pw_2]
+    movu        m0, [r1]
+    movu        m4, [r1+1]
+    mova        m1, m0
+    mova        m5, m4
+    punpcklbw   m0, m7
+    punpcklbw   m4, m7
+    punpckhbw   m1, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m0
+    paddusw     m5, m1
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m0, [r1+r4]
+    movu        m2, [r1+r4+1]
+    mova        m1, m0
+    mova        m3, m2
+    punpcklbw   m0, m7
+    punpcklbw   m2, m7
+    punpckhbw   m1, m7
+    punpckhbw   m3, m7
+    paddusw     m0, m2
+    paddusw     m1, m3
+    paddusw     m4, m6
+    paddusw     m5, m6
+    paddusw     m4, m0
+    paddusw     m5, m1
+    psrlw       m4, 2
+    psrlw       m5, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m4, m5
+    PAVGB       m4, m3
+%else
+    packuswb    m4, m5
+%endif
+    mova   [r0+r4], m4
+    add         r4, r2
+
+    movu        m2, [r1+r4]
+    movu        m4, [r1+r4+1]
+    mova        m3, m2
+    mova        m5, m4
+    punpcklbw   m2, m7
+    punpcklbw   m4, m7
+    punpckhbw   m3, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m2
+    paddusw     m5, m3
+    paddusw     m0, m6
+    paddusw     m1, m6
+    paddusw     m0, m4
+    paddusw     m1, m5
+    psrlw       m0, 2
+    psrlw       m1, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m0, m1
+    PAVGB       m0, m3
+%else
+    packuswb    m0, m1
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
 INIT_MMX 3dnow
-AVG_PIXELS8_XY2
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; sse2
+cglobal %1_pixels16_xy2, 4,5,%2
+    mova        m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+    mova        m4, [pb_interleave8]
+%endif
+    mova        m5, [pb_1]
+    movu        m0, [r1]
+    movu        m1, [r1+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m2, [r1+r4]
+    movu        m3, [r1+r4+1]
+    pmaddubsw   m2, m5
+    pmaddubsw   m3, m5
+    paddusw     m0, m2
+    paddusw     m1, m3
+    pmulhrsw    m0, [pw_8192]
+    pmulhrsw    m1, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m0, m1
+    pshufb      m0, m4
+    pavgb       m0, m6
+%else
+    packuswb    m0, m1
+    pshufb      m0, m4
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+
+    movu        m0, [r1+r4]
+    movu        m1, [r1+r4+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    paddusw     m2, m0
+    paddusw     m3, m1
+    pmulhrsw    m2, [pw_8192]
+    pmulhrsw    m3, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m2, m3
+    pshufb      m2, m4
+    pavgb       m2, m6
+%else
+    packuswb    m2, m3
+    pshufb      m2, m4
+%endif
+    mova   [r0+r4], m2
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index 566e518..bf97029 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,14 +29,29 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
 
 void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
 void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
 void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
-void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags);
+void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
 
 #endif /* AVCODEC_X86_HPELDSP_H */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index d47e788..d89928c 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -39,6 +39,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -61,10 +69,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 
 #define avg_pixels8_mmx         ff_avg_pixels8_mmx
 #define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
@@ -98,11 +108,13 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
 #undef PAVGB
 #undef STATIC
 
+#if HAVE_MMX
 CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
 
 CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
+#endif
 
 /***********************************/
 /* MMX rounding */
@@ -125,11 +137,13 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #undef PAVGBP
 #undef PAVGB
 
+#if HAVE_MMX
 CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
 
 CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
+#endif
 
 #endif /* HAVE_INLINE_ASM */
 
@@ -143,32 +157,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
     CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8)
+    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
 
 HPELDSP_AVG_PIXELS16(_3dnow)
 HPELDSP_AVG_PIXELS16(_mmxext)
 
 #endif /* HAVE_X86ASM */
 
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
+    if (HAVE_MMX_EXTERNAL)                                                  \
+    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;
+
+#if HAVE_MMX_INLINE
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
     do {                                                                        \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
         c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
     } while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
+    } while (0)
+#endif
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
-#if HAVE_MMX_INLINE
     SET_HPEL_FUNCS(put,        [0], 16, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
     SET_HPEL_FUNCS(avg,        [0], 16, mmx);
     SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
     SET_HPEL_FUNCS(put,        [1],  8, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
-    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
-#endif /* HAVE_MMX_INLINE */
+    if (HAVE_MMX_EXTERNAL) {
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+    }
+#if HAVE_MMX_INLINE
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
 }
 
 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
@@ -180,6 +211,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -187,6 +219,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -194,8 +227,8 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
     }
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
@@ -209,6 +242,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -216,6 +250,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)){
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
@@ -223,8 +258,8 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
     }
 #endif /* HAVE_AMD3DNOW_EXTERNAL */
 }
@@ -234,10 +269,26 @@ static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
 #if HAVE_SSE2_EXTERNAL
     c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
     c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
+    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
+    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
     c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
+    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
+    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+    c->put_pixels_tab[0][3]            = ff_put_pixels16_xy2_ssse3;
+    c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
+    c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
+    c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
 av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -254,6 +305,9 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
     if (EXTERNAL_SSE2_FAST(cpu_flags))
         hpeldsp_init_sse2_fast(c, flags);
 
+    if (EXTERNAL_SSSE3(cpu_flags))
+        hpeldsp_init_ssse3(c, flags);
+
     if (CONFIG_VP3_DECODER)
-        ff_hpeldsp_vp3_init_x86(c, cpu_flags);
+        ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
 }
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
deleted file mode 100644
index c93c78e..0000000
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "hpeldsp.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index 82231ad..2bff2d2 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 #include <stdint.h>
 
 // put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -60,7 +60,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         :FF_REG_a, "memory");
 }
 
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -106,7 +106,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff
         :FF_REG_a, "memory");
 }
 
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -115,14 +115,14 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         ".p2align 3                     \n\t"
         "1:                             \n\t"
         "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"FF_REG_a"),%%mm2 \n\t"
+        "movq   (%1, %%"FF_REG_a"),%%mm2\n\t"
         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
         "movq   %%mm4, (%2)             \n\t"
         "movq   %%mm5, (%2, %3)         \n\t"
         "add    %%"FF_REG_a", %1        \n\t"
         "add    %%"FF_REG_a", %2        \n\t"
         "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"FF_REG_a"),%%mm0 \n\t"
+        "movq   (%1, %%"FF_REG_a"),%%mm0\n\t"
         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
         "movq   %%mm4, (%2)             \n\t"
         "movq   %%mm5, (%2, %3)         \n\t"
@@ -135,33 +135,34 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         :FF_REG_a, "memory");
 }
 
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
         __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
+            ".p2align 3                 \n\t"
+            "1:                         \n\t"
+            "movq  (%1), %%mm0          \n\t"
+            "movq  1(%1), %%mm1         \n\t"
+            "movq  (%2), %%mm3          \n\t"
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  9%1, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
+            "movq  %%mm0, (%2)          \n\t"
+            "movq  8(%1), %%mm0         \n\t"
+            "movq  9(%1), %%mm1         \n\t"
+            "movq  8(%2), %%mm3         \n\t"
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
+            "movq  %%mm0, 8(%2)         \n\t"
+            "add    %3, %1              \n\t"
+            "add    %3, %2              \n\t"
+            "subl   $1, %0              \n\t"
+            "jnz    1b                  \n\t"
+            :"+g"(h), "+S"(pixels), "+D"(block)
+            :"r"((x86_reg)line_size)
             :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
 }
 
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -170,7 +171,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         ".p2align 3                     \n\t"
         "1:                             \n\t"
         "movq   (%1, %3), %%mm1         \n\t"
-        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm2 \n\t"
         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
         "movq   (%2), %%mm3             \n\t"
         PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
@@ -182,7 +183,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         "add    %%"FF_REG_a", %2        \n\t"
 
         "movq   (%1, %3), %%mm1         \n\t"
-        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm0 \n\t"
         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
         "movq   (%2), %%mm3             \n\t"
         PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
diff --git a/libavcodec/x86/hpeldsp_vp3.asm b/libavcodec/x86/hpeldsp_vp3.asm
index 513f14e..cba96d0 100644
--- a/libavcodec/x86/hpeldsp_vp3.asm
+++ b/libavcodec/x86/hpeldsp_vp3.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* SIMD-optimized halfpel functions for VP3
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/hpeldsp_vp3_init.c b/libavcodec/x86/hpeldsp_vp3_init.c
index cc1f5e4..5979f41 100644
--- a/libavcodec/x86/hpeldsp_vp3_init.c
+++ b/libavcodec/x86/hpeldsp_vp3_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,15 +38,19 @@ void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
 
-av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags)
+av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
 {
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+        if (flags & AV_CODEC_FLAG_BITEXACT) {
+            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+        }
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+        if (flags & AV_CODEC_FLAG_BITEXACT) {
+            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+        }
     }
 }
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 692162b..a1231f1 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,48 +1,117 @@
 ;******************************************************************************
 ;* SIMD-optimized HuffYUV functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
-
 SECTION .text
 
-; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-;                                     const uint8_t *diff, int w,
-;                                     int *left, int *left_top)
+%include "libavcodec/x86/huffyuvdsp_template.asm"
+
+;------------------------------------------------------------------------------
+; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+;------------------------------------------------------------------------------
+
+%macro ADD_INT16 0
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+%if mmsize > 8
+    test srcq, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+%endif
+    INT16_LOOP a, add
+%if mmsize > 8
+.unaligned:
+    INT16_LOOP u, add
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_INT16
+%endif
+
+INIT_XMM sse2
+ADD_INT16
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_INT16
+%endif
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+    shl           wq, 2
+    movd          m0, [leftq]
+    lea         dstq, [dstq + wq]
+    lea         srcq, [srcq + wq]
+    LSHIFT        m0, mmsize-4
+    neg           wq
+.loop:
+    movu          m1, [srcq+wq]
+    mova          m2, m1
+%if mmsize == 8
+    punpckhdq     m0, m0
+%endif
+    LSHIFT        m1, 4
+    paddb         m1, m2
+%if mmsize == 16
+    pshufd        m0, m0, q3333
+    mova          m2, m1
+    LSHIFT        m1, 8
+    paddb         m1, m2
+%endif
+    paddb         m0, m1
+    movu   [dstq+wq], m0
+    add           wq, mmsize
+    jl         .loop
+    movd          m0, [dstq-4]
+    movd     [leftq], m0
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
+
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
 INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add      wd, wd
+    movd    mm6, maskd
+    SPLATW  mm6, mm6
     movq    mm0, [topq]
     movq    mm2, mm0
     movd    mm4, [left_topq]
-    psllq   mm2, 8
+    psllq   mm2, 16
     movq    mm1, mm0
     por     mm4, mm2
     movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
+    psubw   mm0, mm4 ; t-tl
     add    dstq, wq
     add    topq, wq
     add   diffq, wq
@@ -51,115 +120,45 @@ cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
 .loop:
     movq    mm4, [topq+wq]
     movq    mm0, mm4
-    psllq   mm4, 8
+    psllq   mm4, 16
     por     mm4, mm1
     movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
+    psubw   mm0, mm4 ; t-tl
 .skip:
     movq    mm2, [diffq+wq]
 %assign i 0
-%rep 8
+%rep 4
     movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
+    paddw   mm4, mm3 ; t-tl+l
+    pand    mm4, mm6
     movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
+    pmaxsw  mm3, mm1
+    pminsw  mm5, mm1
+    pminsw  mm3, mm4
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6
 %if i==0
     movq    mm7, mm3
-    psllq   mm7, 56
+    psllq   mm7, 48
 %else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
+    movq    mm4, mm3
+    psrlq   mm7, 16
+    psllq   mm4, 48
+    por     mm7, mm4
 %endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
+%if i<3
+    psrlq   mm0, 16
+    psrlq   mm1, 16
+    psrlq   mm2, 16
 %endif
 %assign i i+1
 %endrep
     movq [dstq+wq], mm7
     add      wq, 8
     jl .loop
-    movzx   r2d, byte [dstq-1]
+    movzx   r2d, word [dstq-2]
     mov [leftq], r2d
-    movzx   r2d, byte [topq-1]
+    movzx   r2d, word [topq-2]
     mov [left_topq], r2d
     RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
-    add     srcq, wq
-    add     dstq, wq
-    neg     wq
-%%.loop:
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
-    mova    m2, m1
-    psllw   m1, 8
-    paddb   m1, m2
-    mova    m2, m1
-    pshufb  m1, m3
-    paddb   m1, m2
-    pshufb  m0, m5
-    mova    m2, m1
-    pshufb  m1, m4
-    paddb   m1, m2
-%if mmsize == 16
-    mova    m2, m1
-    pshufb  m1, m6
-    paddb   m1, m2
-%endif
-    paddb   m0, m1
-%if %1
-    mova    [dstq+wq], m0
-%else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
-%endif
-    add     wq, mmsize
-    jl %%.loop
-    mov     eax, mmsize-1
-    sub     eax, wd
-    movd    m1, eax
-    pshufb  m0, m1
-    movd    eax, m0
-    RET
-%endmacro
-
-; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
-    mova    m5, [pb_7]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    psllq   m0, 56
-    ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM ssse3
-cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left
-    mova    m5, [pb_f]
-    mova    m6, [pb_zzzzzzzz77777777]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    pslldq  m0, 15
-    test    srcq, 15
-    jnz .src_unaligned
-    test    dstq, 15
-    jnz .dst_unaligned
-    ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 0
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 80e6cfb..eb10de3 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -1,132 +1,61 @@
 /*
  * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
-void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-                                    const uint8_t *diff, int w,
-                                    int *left, int *left_top);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
 
-int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
-                                 int w, int left);
-int  ff_add_hfyu_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
-                                           int w, int left);
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+                                     intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+                                      intptr_t w, uint8_t *left);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
 
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
-static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
-                                      const uint8_t *diff, int w,
-                                      int *left, int *left_top)
-{
-    x86_reg w2 = -w;
-    x86_reg x;
-    int l  = *left     & 0xff;
-    int tl = *left_top & 0xff;
-    int t;
-    __asm__ volatile (
-        "mov          %7, %3            \n"
-        "1:                             \n"
-        "movzbl (%3, %4), %2            \n"
-        "mov          %2, %k3           \n"
-        "sub         %b1, %b3           \n"
-        "add         %b0, %b3           \n"
-        "mov          %2, %1            \n"
-        "cmp          %0, %2            \n"
-        "cmovg        %0, %2            \n"
-        "cmovg        %1, %0            \n"
-        "cmp         %k3, %0            \n"
-        "cmovg       %k3, %0            \n"
-        "mov          %7, %3            \n"
-        "cmp          %2, %0            \n"
-        "cmovl        %2, %0            \n"
-        "add    (%6, %4), %b0           \n"
-        "mov         %b0, (%5, %4)      \n"
-        "inc          %4                \n"
-        "jl           1b                \n"
-        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
-        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
-    );
-    *left     = l;
-    *left_top = tl;
-}
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
+av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
 {
     int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
 
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
-    if (cpu_flags & AV_CPU_FLAG_CMOV)
-        c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
-
-    if (INLINE_MMX(cpu_flags))
-        c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+        c->add_int16 = ff_add_int16_mmx;
+    }
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        /* slower than cmov version on AMD */
-        if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
-            c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
     }
 
-    if (EXTERNAL_SSSE3(cpu_flags)) {
-        c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_int16 = ff_add_int16_sse2;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
     }
 
-    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
-        c->add_hfyu_left_pred = ff_add_hfyu_left_pred_unaligned_ssse3;
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->add_int16 = ff_add_int16_avx2;
     }
 }
diff --git a/libavcodec/x86/huffyuvdsp_template.asm b/libavcodec/x86/huffyuvdsp_template.asm
new file mode 100644
index 0000000..89721f4
--- /dev/null
+++ b/libavcodec/x86/huffyuvdsp_template.asm
@@ -0,0 +1,76 @@
+;******************************************************************************
+;* SIMD-optimized HuffYUV functions
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+    movd    xm4, maskd
+    SPLATW  m4, xm4
+    add     wd, wd
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop
+    push  tmpq
+%%.wordloop:
+    sub     wq, 2
+%ifidn %2, add
+    mov   tmpw, [srcq+wq]
+    add   tmpw, [dstq+wq]
+%else
+    mov   tmpw, [src1q+wq]
+    sub   tmpw, [src2q+wq]
+%endif
+    and   tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+    pop   tmpq
+%%.tomainloop:
+%ifidn %2, add
+    add     srcq, wq
+%else
+    add     src1q, wq
+    add     src2q, wq
+%endif
+    add     dstq, wq
+    neg     wq
+    jz      %%.end
+%%.loop:
+%ifidn %2, add
+    mov%1   m0, [srcq+wq]
+    mov%1   m1, [dstq+wq]
+    mov%1   m2, [srcq+wq+mmsize]
+    mov%1   m3, [dstq+wq+mmsize]
+%else
+    mov%1   m0, [src1q+wq]
+    mov%1   m1, [src2q+wq]
+    mov%1   m2, [src1q+wq+mmsize]
+    mov%1   m3, [src2q+wq+mmsize]
+%endif
+    p%2w    m0, m1
+    p%2w    m2, m3
+    pand    m0, m4
+    pand    m2, m4
+    mov%1   [dstq+wq]       , m0
+    mov%1   [dstq+wq+mmsize], m2
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
new file mode 100644
index 0000000..d994fd0
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -0,0 +1,105 @@
+;************************************************************************
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%include "libavcodec/x86/huffyuvdsp_template.asm"
+
+;------------------------------------------------------------------------------
+; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                    unsigned mask, int w);
+;------------------------------------------------------------------------------
+
+%macro DIFF_INT16 0
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+%if mmsize > 8
+    test src1q, mmsize-1
+    jnz .unaligned
+    test src2q, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+%endif
+    INT16_LOOP a, sub
+%if mmsize > 8
+.unaligned:
+    INT16_LOOP u, sub
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_INT16
+%endif
+
+INIT_XMM sse2
+DIFF_INT16
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_INT16
+%endif
+
+INIT_MMX mmxext
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+    add      wd, wd
+    movd    mm7, maskd
+    SPLATW  mm7, mm7
+    movq    mm0, [src1q]
+    movq    mm2, [src2q]
+    psllq   mm0, 16
+    psllq   mm2, 16
+    movd    mm6, [left_topq]
+    por     mm0, mm6
+    movd    mm6, [leftq]
+    por     mm2, mm6
+    xor     maskq, maskq
+.loop:
+    movq    mm1, [src1q + maskq]
+    movq    mm3, [src2q + maskq]
+    movq    mm4, mm2
+    psubw   mm2, mm0
+    paddw   mm2, mm1
+    pand    mm2, mm7
+    movq    mm5, mm4
+    pmaxsw  mm4, mm1
+    pminsw  mm1, mm5
+    pminsw  mm4, mm2
+    pmaxsw  mm4, mm1
+    psubw   mm3, mm4
+    pand    mm3, mm7
+    movq    [dstq + maskq], mm3
+    add     maskq, 8
+    movq    mm0, [src1q + maskq - 2]
+    movq    mm2, [src2q + maskq - 2]
+    cmp     maskq, wq
+        jb .loop
+    movzx maskd, word [src1q + wq - 2]
+    mov [left_topq], maskd
+    movzx maskd, word [src2q + wq - 2]
+    mov [leftq], maskd
+    RET
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
new file mode 100644
index 0000000..6c6e068
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * SIMD-optimized HuffYUV encoding functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvencdsp.h"
+
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                                          unsigned mask, int w, int *left, int *left_top);
+
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
+{
+    av_unused int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->diff_int16 = ff_diff_int16_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_int16 = ff_diff_int16_sse2;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->diff_int16 = ff_diff_int16_avx2;
+    }
+}
diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
new file mode 100644
index 0000000..089425a
--- /dev/null
+++ b/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION .text
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                                  ptrdiff_t line_size)
+;--------------------------------------------------------------------------
+
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+    mova     m1, [blockq+mmsize*0+%1]
+    mova     m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m3, [blockq+mmsize*4+%1]
+    mova     m4, [blockq+mmsize*6+%1]
+%endif
+    packsswb m1, [blockq+mmsize*1+%1]
+    packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packsswb m3, [blockq+mmsize*5+%1]
+    packsswb m4, [blockq+mmsize*7+%1]
+%endif
+    paddb    m1, m0
+    paddb    m2, m0
+%if mmsize == 8
+    paddb    m3, m0
+    paddb    m4, m0
+    movq     [pixelsq+lsizeq*0], m1
+    movq     [pixelsq+lsizeq*1], m2
+    movq     [pixelsq+lsizeq*2], m3
+    movq     [pixelsq+lsize3q ], m4
+%else
+    movq     [pixelsq+lsizeq*0], m1
+    movhps   [pixelsq+lsizeq*1], m1
+    movq     [pixelsq+lsizeq*2], m2
+    movhps   [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+    mova     m0, [pb_80]
+    lea      lsize3q, [lsizeq*3]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+    lea      pixelsq, [pixelsq+lsizeq*4]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0
+    movhps  [lsizeq+pixelsq], m0
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro ADD_PIXELS_CLAMPED 1
+    mova       m0, [blockq+mmsize*0+%1]
+    mova       m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+    mova       m5, [blockq+mmsize*2+%1]
+    mova       m6, [blockq+mmsize*3+%1]
+%endif
+    movq       m2, [pixelsq]
+    movq       m3, [pixelsq+lsizeq]
+%if mmsize == 8
+    mova       m7, m2
+    punpcklbw  m2, m4
+    punpckhbw  m7, m4
+    paddsw     m0, m2
+    paddsw     m1, m7
+    mova       m7, m3
+    punpcklbw  m3, m4
+    punpckhbw  m7, m4
+    paddsw     m5, m3
+    paddsw     m6, m7
+%else
+    punpcklbw  m2, m4
+    punpcklbw  m3, m4
+    paddsw     m0, m2
+    paddsw     m1, m3
+%endif
+    packuswb   m0, m1
+%if mmsize == 8
+    packuswb   m5, m6
+    movq       [pixelsq], m0
+    movq       [pixelsq+lsizeq], m5
+%else
+    movq       [pixelsq], m0
+    movhps     [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+    pxor       m4, m4
+    ADD_PIXELS_CLAMPED 0
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 32
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96
+    RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index 6e6c688..0d0bdb5 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,16 @@
 
 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                       ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                       ptrdiff_t line_size);
+
 
 #endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 853c6a3..9103b92 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,18 +63,100 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags)) {
-        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+    if (EXTERNAL_MMX(cpu_flags)) {
         c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
         c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
 
         if (!high_bit_depth &&
+            avctx->lowres == 0 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
-             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct_put  = ff_simple_idct_put_mmx;
                 c->idct_add  = ff_simple_idct_add_mmx;
                 c->idct      = ff_simple_idct_mmx;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
     }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
+
+        if (!high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct_put  = ff_simple_idct_put_sse2;
+                c->idct_add  = ff_simple_idct_add_sse2;
+                c->perm_type = FF_IDCT_PERM_SIMPLE;
+        }
+
+        if (ARCH_X86_64 &&
+            !high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
+                avctx->idct_algo == FF_IDCT_SIMPLE)) {
+                c->idct      = ff_simple_idct8_sse2;
+                c->idct_put  = ff_simple_idct8_put_sse2;
+                c->idct_add  = ff_simple_idct8_add_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+    }
+
+    if (ARCH_X86_64 && avctx->lowres == 0) {
+        if (EXTERNAL_AVX(cpu_flags) &&
+            !high_bit_depth &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
+                avctx->idct_algo == FF_IDCT_SIMPLE)) {
+                c->idct      = ff_simple_idct8_avx;
+                c->idct_put  = ff_simple_idct8_put_avx;
+                c->idct_add  = ff_simple_idct8_add_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+
+        if (avctx->bits_per_raw_sample == 10 &&
+            avctx->codec_id != AV_CODEC_ID_MPEG4 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLE)) {
+            if (EXTERNAL_SSE2(cpu_flags)) {
+                c->idct_put  = ff_simple_idct10_put_sse2;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct10_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+
+            }
+            if (EXTERNAL_AVX(cpu_flags)) {
+                c->idct_put  = ff_simple_idct10_put_avx;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct10_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+        }
+
+        if (avctx->bits_per_raw_sample == 12 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+            if (EXTERNAL_SSE2(cpu_flags)) {
+                c->idct_put  = ff_simple_idct12_put_sse2;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct12_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+            if (EXTERNAL_AVX(cpu_flags)) {
+                c->idct_put  = ff_simple_idct12_put_avx;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct12_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+        }
+    }
 }
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
deleted file mode 100644
index 523f368..0000000
--- a/libavcodec/x86/idctdsp_mmx.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * SIMD-optimized IDCT-related routines
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "idctdsp.h"
-#include "inline_asm.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               ptrdiff_t line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      ptrdiff_t line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-        : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               ptrdiff_t line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile (
-            "movq        (%2), %%mm0    \n\t"
-            "movq       8(%2), %%mm1    \n\t"
-            "movq      16(%2), %%mm2    \n\t"
-            "movq      24(%2), %%mm3    \n\t"
-            "movq          %0, %%mm4    \n\t"
-            "movq          %1, %%mm6    \n\t"
-            "movq       %%mm4, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm4    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm4, %%mm0    \n\t"
-            "paddsw     %%mm5, %%mm1    \n\t"
-            "movq       %%mm6, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm6    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm6, %%mm2    \n\t"
-            "paddsw     %%mm5, %%mm3    \n\t"
-            "packuswb   %%mm1, %%mm0    \n\t"
-            "packuswb   %%mm3, %%mm2    \n\t"
-            "movq       %%mm0, %0       \n\t"
-            "movq       %%mm2, %1       \n\t"
-            : "+m" (*pix), "+m" (*(pix + line_size))
-            : "r" (p)
-            : "memory");
-        pix += line_size * 2;
-        p   += 16;
-    } while (--i);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index ddb7174..b386ab9 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,26 +2,26 @@
 ;* 36 point SSE-optimized IMDCT transform
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 16
+SECTION_RODATA
 
 ps_mask:  dd 0, ~0, ~0, ~0
 ps_mask2: dd 0, ~0,  0, ~0
@@ -49,7 +49,7 @@ ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
                dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
                dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
                dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
-               dd 1.0,  0.70710678118654752439,  0.0,  0.0
+               dd 1.0, -0.70710678118654752439,  0.0,  0.0
 
 costabs:  times 4 dd  0.98480773
           times 4 dd  0.93969262
@@ -128,7 +128,26 @@ SECTION .text
 %endif
 %endmacro
 
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+    mulps    %1, %1, [ps_cosh_sse3 + %3]
+    PSHUFD   %2, %1, 0xe1
+    addsubps %1, %1, %2
+%else
+    mulps    %1, [ps_cosh + %3]
+    PSHUFD   %2, %1, 0xe1
+    xorps    %1, [ps_p1m1p1m1]
+    addps    %1, %2
+%endif
+%endmacro
+
 %macro STORE 4
+%if cpuflag(sse4)
+    movss     [%3       ], %1
+    extractps dword [%3 +   %4], %1, 1
+    extractps dword [%3 + 2*%4], %1, 2
+    extractps dword [%3 + 3*%4], %1, 3
+%else
     movhlps %2, %1
     movss   [%3       ], %1
     movss   [%3 + 2*%4], %2
@@ -136,6 +155,7 @@ SECTION .text
     movss   [%3 +   %4], %1
     movhlps %2, %1
     movss   [%3 + 3*%4], %2
+%endif
 %endmacro
 
 %macro LOAD 4
@@ -278,11 +298,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     BUTTERF  m7, m2, 16
     BUTTERF  m3, m6, 32
     BUTTERF  m4, m1, 48
-
-    mulps   m5, m5, [ps_cosh + 64]
-    PSHUFD  m1, m5, 0xe1
-    xorps   m5, m5, [ps_p1m1p1m1]
-    addps   m5, m5, m1
+    BUTTERF2 m5, m1, 64
 
     ; permutates:
     ; m0    0  1  2  3     =>     2  6 10 14   m1
@@ -357,8 +373,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse2
 DEFINE_IMDCT
@@ -369,8 +387,10 @@ DEFINE_IMDCT
 INIT_XMM ssse3
 DEFINE_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse
 
@@ -715,5 +735,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
 INIT_XMM sse
 DEFINE_FOUR_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
index fc554bf..0198746 100644
--- a/libavcodec/x86/inline_asm.h
+++ b/libavcodec/x86/inline_asm.h
@@ -1,20 +1,20 @@
 /*
  * inline assembly helper macros
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@
         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
 
 #ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
 #else
 // for shared library it's better to use this way for accessing constants
 // pcmpeqd -> -1
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
new file mode 100644
index 0000000..61dfdd4
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -0,0 +1,164 @@
+;******************************************************************************
+;* SIMD-optimized JPEG2000 DSP functions
+;* Copyright (c) 2014 Nicolas Bertrand
+;* Copyright (c) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+;***********************************************************************
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+;***********************************************************************
+%macro ICT_FLOAT 1
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+    shl  csized, 2
+    add   src0q, csizeq
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq
+    movaps   m6, [pf_ict0]
+    movaps   m7, [pf_ict1]
+    %define ICT0 m6
+    %define ICT1 m7
+
+%if ARCH_X86_64
+    movaps   m8, [pf_ict2]
+    %define ICT2 m8
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    movaps   m9, [pf_ict3]
+    %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+    %define ICT2 [pf_ict2]
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    %define ICT3 [pf_ict3]
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+    movaps   m0, [src0q+csizeq]
+    movaps   m1, [src1q+csizeq]
+    movaps   m2, [src2q+csizeq]
+
+%if cpuflag(fma4) || cpuflag(fma3)
+%if cpuflag(fma4)
+    fnmaddps  m5, m1, ICT1, m0
+    fmaddps   m4, m2, ICT0, m0
+%else ; fma3
+    movaps    m5, m1
+    movaps    m4, m2
+    fnmaddps  m5, m5, ICT1, m0
+    fmaddps   m4, m4, ICT0, m0
+%endif
+    fmaddps   m0, m1, ICT3, m0
+    fnmaddps  m5, m2, ICT2, m5
+%else ; non FMA
+%if cpuflag(avx)
+    mulps    m5, m1, ICT1
+    mulps    m4, m2, ICT0
+    mulps    m1, m1, ICT3
+    mulps    m2, m2, ICT2
+    subps    m5, m0, m5
+%else ; sse
+    movaps   m3, m1
+    movaps   m4, m2
+    movaps   m5, m0
+    mulps    m3, ICT1
+    mulps    m4, ICT0
+    mulps    m1, ICT3
+    mulps    m2, ICT2
+    subps    m5, m3
+%endif
+    addps    m4, m4, m0
+    addps    m0, m0, m1
+    subps    m5, m5, m2
+%endif
+
+    movaps   [src0q+csizeq], m4
+    movaps   [src2q+csizeq], m0
+    movaps   [src1q+csizeq], m5
+    add  csizeq, mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10
+INIT_YMM avx
+ICT_FLOAT 9
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+ICT_FLOAT 9
+%endif
+INIT_YMM fma3
+ICT_FLOAT 9
+
+;***************************************************************************
+; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
+;***************************************************************************
+%macro RCT_INT 0
+cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
+    shl  csized, 2
+    add   src0q, csizeq
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq
+
+align 16
+.loop:
+    mova   m1, [src1q+csizeq]
+    mova   m2, [src2q+csizeq]
+    mova   m0, [src0q+csizeq]
+    paddd  m3, m1, m2
+    psrad  m3, 2
+    psubd  m0, m3
+    paddd  m1, m0
+    paddd  m2, m0
+    mova   [src1q+csizeq], m0
+    mova   [src2q+csizeq], m1
+    mova   [src0q+csizeq], m2
+    add  csizeq, mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+RCT_INT
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RCT_INT
+%endif
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
new file mode 100644
index 0000000..7310a1d
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * SIMD optimized JPEG 2000 DSP functions
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/jpeg2000dsp.h"
+
+void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+
+av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
+    }
+
+    if (EXTERNAL_FMA4(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
+    }
+
+    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_fma3;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
+    }
+}
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index d6abd98..063d7b4 100644
--- a/libavcodec/x86/apedsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     mova    [v1q + orderq + mmsize], m3
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
 %endmacro
@@ -75,6 +68,39 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
+INIT_XMM sse4
+; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    SPLATW  m7, m7
+    pxor    m6, m6
+    add v1q, orderq
+    lea v2q, [v2q + 2*orderq]
+    add v3q, orderq
+    neg orderq
+.loop:
+    mova    m3, [v1q + orderq]
+    movu    m0, [v2q + 2*orderq]
+    pmovsxwd m4, m3
+    movu    m1, [v2q + 2*orderq + mmsize]
+    movhlps m5, m3
+    movu    m2, [v3q + orderq]
+    pmovsxwd m5, m5
+    pmullw  m2, m7
+    pmulld  m0, m4
+    pmulld  m1, m5
+    paddw   m2, m3
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    add     orderq, 16
+    jl .loop
+    HADDD   m6, m0
+    movd   eax, m6
+    RET
+
 %macro SCALARPRODUCT_LOOP 1
 align 16
 .loop%1:
@@ -159,9 +185,6 @@ SCALARPRODUCT_LOOP 4
 SCALARPRODUCT_LOOP 2
 SCALARPRODUCT_LOOP 0
 .end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index f692c2b..f74c7e4 100644
--- a/libavcodec/x86/apedsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -1,25 +1,25 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
@@ -31,8 +31,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
 
-av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
 {
+#if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags))
@@ -44,4 +49,8 @@ av_cold void ff_apedsp_init_x86(APEDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags) &&
         !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
+#endif
 }
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000..0a1b709
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,406 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;* Copyright (c) 2017 Jokyo Images
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_ef: times 8 db 14,15
+pb_67: times 8 db  6, 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+;                                const uint8_t *diff, int w,
+;                                int *left, int *left_top)
+;------------------------------------------------------------------------------
+%macro MEDIAN_PRED 0
+cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+    movu    m0, [topq]
+    mova    m2, m0
+    movd    m4, [left_topq]
+    LSHIFT  m2, 1
+    mova    m1, m0
+    por     m4, m2
+    movd    m3, [leftq]
+    psubb   m0, m4 ; t-tl
+    add    dstq, wq
+    add    topq, wq
+    add   diffq, wq
+    neg      wq
+    jmp .skip
+.loop:
+    movu    m4, [topq+wq]
+    mova    m0, m4
+    LSHIFT  m4, 1
+    por     m4, m1
+    mova    m1, m0 ; t
+    psubb   m0, m4 ; t-tl
+.skip:
+    movu    m2, [diffq+wq]
+%assign i 0
+%rep mmsize
+    mova    m4, m0
+    paddb   m4, m3 ; t-tl+l
+    mova    m5, m3
+    pmaxub  m3, m1
+    pminub  m5, m1
+    pminub  m3, m4
+    pmaxub  m3, m5 ; median
+    paddb   m3, m2 ; +residual
+%if i==0
+    mova    m7, m3
+    LSHIFT  m7, mmsize-1
+%else
+    mova    m6, m3
+    RSHIFT  m7, 1
+    LSHIFT  m6, mmsize-1
+    por     m7, m6
+%endif
+%if i<mmsize-1
+    RSHIFT  m0, 1
+    RSHIFT  m1, 1
+    RSHIFT  m2, 1
+%endif
+%assign i i+1
+%endrep
+    movu [dstq+wq], m7
+    add      wq, mmsize
+    jl .loop
+    movzx   r2d, byte [dstq-1]
+    mov [leftq], r2d
+    movzx   r2d, byte [topq-1]
+    mov [left_topq], r2d
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+MEDIAN_PRED
+%endif
+INIT_XMM sse2
+MEDIAN_PRED
+
+
+%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    pshufb  xm0, xm5
+%if %2
+    mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
+    psllw   m2, m1, 8
+    paddb   m1, m2
+    pshufb  m2, m1, m3
+    paddb   m1, m2
+    pshufb  m2, m1, m4
+    paddb   m1, m2
+%if mmsize >= 16
+    pshufb  m2, m1, m6
+    paddb   m1, m2
+%endif
+    paddb   xm0, xm1
+%if %1
+    mova    [dstq+wq], xm0
+%else
+    movq    [dstq+wq], xm0
+    movhps  [dstq+wq+8], xm0
+%endif
+
+%if mmsize == 32
+    vextracti128    xm2, m1, 1 ; get second lane of the ymm
+    pshufb          xm0, xm5   ; set alls val to last val of the first lane
+    paddb           xm0, xm2
+;store val
+%if %1
+    mova    [dstq+wq+16], xm0
+%else;
+    movq    [dstq+wq+16], xm0
+    movhps  [dstq+wq+16+8], xm0
+%endif
+%endif
+    add     wq, mmsize
+    jl %%.loop
+%if mmsize == 32
+    movzx   eax, byte [dstq - 1]
+%else;
+    mov     eax, mmsize-1
+    sub     eax, wd
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+%endif
+    RET
+%endmacro
+
+;------------------------------------------------------------------------------
+; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
+;------------------------------------------------------------------------------
+INIT_MMX ssse3
+cglobal add_left_pred, 3,3,7, dst, src, w, left
+.skip_prologue:
+    mova    m5, [pb_7]
+    mova    m4, [pb_zzzz3333zzzzbbbb]
+    mova    m3, [pb_zz11zz55zz99zzdd]
+    movd    m0, leftm
+    psllq   m0, 56
+    ADD_LEFT_LOOP 1, 1
+
+%macro ADD_LEFT_PRED_UNALIGNED 0
+cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
+    mova    xm5, [pb_15]
+    VBROADCASTI128    m6, [pb_zzzzzzzz77777777]
+    VBROADCASTI128    m4, [pb_zzzz3333zzzzbbbb]
+    VBROADCASTI128    m3, [pb_zz11zz55zz99zzdd]
+    movd    xm0, leftm
+    pslldq  xm0, 15
+    test    srcq, mmsize - 1
+    jnz .src_unaligned
+    test    dstq, mmsize - 1
+    jnz .dst_unaligned
+    ADD_LEFT_LOOP 1, 1
+.dst_unaligned:
+    ADD_LEFT_LOOP 0, 1
+.src_unaligned:
+    ADD_LEFT_LOOP 0, 0
+%endmacro
+
+INIT_XMM ssse3
+ADD_LEFT_PRED_UNALIGNED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_LEFT_PRED_UNALIGNED
+%endif
+
+;------------------------------------------------------------------------------
+; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+;------------------------------------------------------------------------------
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz  .2
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3:
+    mov  sizeb, [srcq + wq]
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_BYTES
+%endif
+
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+    add     wd, wd
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    mov%2   m1, [srcq+wq]
+    mova    m2, m1
+    pslld   m1, 16
+    paddw   m1, m2
+    mova    m2, m1
+
+    pshufb  m1, m3
+    paddw   m1, m2
+    pshufb  m0, m5
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m4
+    paddw   m1, m2
+%endif
+    paddw   m0, m1
+    pand    m0, m7
+%ifidn %1, a
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1
+    sub     eax, wd
+    mov     wd, eax
+    shl     wd, 8
+    lea     eax, [wd+eax-1]
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+    RET
+%endmacro
+
+;---------------------------------------------------------------------------------------------
+; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+;---------------------------------------------------------------------------------------------
+INIT_MMX ssse3
+cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+    mova    m5, [pb_67]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    psllq   m0, 48
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM ssse3
+cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
+    mova    m5, [pb_ef]
+    mova    m4, [pb_zzzzzzzz67676767]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    pslldq  m0, 14
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    test    srcq, 15
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+
+;---------------------------------------------------------------------------------------------
+; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
+;---------------------------------------------------------------------------------------------
+%macro ADD_GRADIENT_PRED 0
+cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
+    mova         xm0, [pb_15]
+
+;load src - 1 in xm1
+    movd         xm1, [srcq-1]
+%if cpuflag(avx2)
+    vpbroadcastb xm1, xm1
+%else
+    pxor         xm2, xm2
+    pshufb       xm1, xm2
+%endif
+
+    add    srcq, widthq
+    neg  widthq
+    neg strideq
+
+.loop:
+    lea    tmpq, [srcq + strideq]
+    mova     m2, [tmpq + widthq] ; A = src[x-stride]
+    movu     m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
+    mova     m4, [srcq + widthq] ; current val (src[x])
+
+    psubb    m2, m3; A - B
+
+; prefix sum A-B
+    pslldq   m3, m2, 1
+    paddb    m2, m3
+    pslldq   m3, m2, 2
+    paddb    m2, m3
+    pslldq   m3, m2, 4
+    paddb    m2, m3
+    pslldq   m3, m2, 8
+    paddb    m2, m3
+
+; prefix sum current val
+    pslldq   m3, m4, 1
+    paddb    m4, m3
+    pslldq   m3, m4, 2
+    paddb    m4, m3
+    pslldq   m3, m4, 4
+    paddb    m4, m3
+    pslldq   m3, m4, 8
+    paddb    m4, m3
+
+; last sum
+    paddb                    m2, m4 ; current + (A - B)
+
+    paddb                   xm1, xm2 ; += C
+    mova        [srcq + widthq], xm1 ; store
+
+    pshufb                  xm1, xm0 ; put last val in all val of xm1
+
+%if mmsize == 32
+    vextracti128            xm2, m2, 1 ; get second lane of the ymm
+    paddb                   xm1, xm2; += C
+
+    mova   [srcq + widthq + 16], xm1 ; store
+    pshufb                  xm1, xm0 ; put last val in all val of m1
+%endif
+
+    add         widthq, mmsize
+    jl .loop
+    RET
+
+%endmacro
+
+INIT_XMM ssse3
+ADD_GRADIENT_PRED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_GRADIENT_PRED
+%endif
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000..6d71f14
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,128 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/x86/asm.h"
+#include "../lossless_videodsp.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
+
+void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+                               const uint8_t *diff, ptrdiff_t w,
+                               int *left, int *left_top);
+void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+                             const uint8_t *diff, ptrdiff_t w,
+                             int *left, int *left_top);
+
+int  ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
+                            ptrdiff_t w, int left);
+int  ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t w, int left);
+int  ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t w, int left);
+
+int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
+
+void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                 const uint8_t *diff, ptrdiff_t w,
+                                 int *left, int *left_top)
+{
+    x86_reg w2 = -w;
+    x86_reg x;
+    int l  = *left     & 0xff;
+    int tl = *left_top & 0xff;
+    int t;
+    __asm__ volatile (
+        "mov          %7, %3            \n"
+        "1:                             \n"
+        "movzbl (%3, %4), %2            \n"
+        "mov          %2, %k3           \n"
+        "sub         %b1, %b3           \n"
+        "add         %b0, %b3           \n"
+        "mov          %2, %1            \n"
+        "cmp          %0, %2            \n"
+        "cmovg        %0, %2            \n"
+        "cmovg        %1, %0            \n"
+        "cmp         %k3, %0            \n"
+        "cmovg       %k3, %0            \n"
+        "mov          %7, %3            \n"
+        "cmp          %2, %0            \n"
+        "cmovl        %2, %0            \n"
+        "add    (%6, %4), %b0           \n"
+        "mov         %b0, (%5, %4)      \n"
+        "inc          %4                \n"
+        "jl           1b                \n"
+        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
+        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
+    );
+    *left     = l;
+    *left_top = tl;
+}
+#endif
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
+    if (cpu_flags & AV_CPU_FLAG_CMOV)
+        c->add_median_pred = add_median_pred_cmov;
+#endif
+
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_bytes = ff_add_bytes_mmx;
+    }
+
+    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
+        /* slower than cmov version on AMD */
+        if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
+            c->add_median_pred = ff_add_median_pred_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes       = ff_add_bytes_sse2;
+        c->add_median_pred = ff_add_median_pred_sse2;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->add_left_pred = ff_add_left_pred_ssse3;
+        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
+    }
+
+    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
+        c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+        c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->add_bytes       = ff_add_bytes_avx2;
+        c->add_left_pred   = ff_add_left_pred_unaligned_avx2;
+        c->add_gradient_pred = ff_add_gradient_pred_avx2;
+    }
+}
diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm
new file mode 100644
index 0000000..fb1204f
--- /dev/null
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@ -0,0 +1,194 @@
+;************************************************************************
+;* SIMD-optimized lossless video encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pb_80
+
+SECTION .text
+
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                    intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+    DECLARE_REG_TMP 3
+    mov               wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+    DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+%endmacro
+
+; labels to jump to if w < regsize and w < 0
+%macro DIFF_BYTES_LOOP_PREP 2
+    mov                i, wq
+    and                i, -2 * regsize
+        js            %2
+        jz            %1
+    add             dstq, i
+    add            src1q, i
+    add            src2q, i
+    neg                i
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if mmsize != 16
+    mov%1             %3, [src1q + i]
+    mov%1             %4, [src1q + i + regsize]
+    psubb             %3, [src2q + i]
+    psubb             %4, [src2q + i + regsize]
+    mov%2           [dstq + i], %3
+    mov%2 [regsize + dstq + i], %4
+%else
+    ; SSE enforces alignment of psubb operand
+    mov%1             %3, [src1q + i]
+    movu              %4, [src2q + i]
+    psubb             %3, %4
+    mov%2     [dstq + i], %3
+    mov%1             %3, [src1q + i + regsize]
+    movu              %4, [src2q + i + regsize]
+    psubb             %3, %4
+    mov%2 [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+    %define regsize mmsize
+.loop_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+    add                i, 2 * regsize
+        jl    .loop_%1%2
+.skip_main_%1%2:
+    and               wq, 2 * regsize - 1
+        jz     .end_%1%2
+%if mmsize > 16
+    ; fall back to narrower xmm
+    %define regsize (mmsize / 2)
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa, .end_aa
+.loop2_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+    add                i, 2 * regsize
+        jl   .loop2_%1%2
+.setup_loop_gpr_%1%2:
+    and               wq, 2 * regsize - 1
+        jz     .end_%1%2
+%endif
+    add             dstq, wq
+    add            src1q, wq
+    add            src2q, wq
+    neg               wq
+.loop_gpr_%1%2:
+    mov              t0b, [src1q + wq]
+    sub              t0b, [src2q + wq]
+    mov      [dstq + wq], t0b
+    inc               wq
+        jl .loop_gpr_%1%2
+.end_%1%2:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
+    DIFF_BYTES_BODY    a, a
+%undef i
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
+    test            dstq, regsize - 1
+        jnz     .loop_uu
+    test           src1q, regsize - 1
+        jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%undef i
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    ; Directly using unaligned SSE2 version is marginally faster than
+    ; branching based on arguments.
+    DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
+    test            dstq, regsize - 1
+        jnz     .loop_uu
+    test           src1q, regsize - 1
+        jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%undef i
+%endif
+
+
+;--------------------------------------------------------------------------------------------------
+;void sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height)
+;--------------------------------------------------------------------------------------------------
+
+INIT_XMM avx
+cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
+    mova             m1, [pb_80] ; prev initial
+    add            dstq, widthq
+    add            srcq, widthq
+    lea              xd, [widthq-1]
+    neg          widthq
+    and              xd, 15
+    pinsrb           m4, m1, xd, 15
+    mov              xq, widthq
+
+    .loop:
+        movu                     m0, [srcq + widthq]
+        palignr                  m2, m0, m1, 15
+        movu                     m1, [srcq + widthq + 16]
+        palignr                  m3, m1, m0, 15
+        psubb                    m2, m0, m2
+        psubb                    m3, m1, m3
+        movu        [dstq + widthq], m2
+        movu   [dstq + widthq + 16], m3
+        add                  widthq, 2 * 16
+        jl .loop
+
+    add   srcq, strideq
+    sub   dstq, xq ; dst + width
+    test    xd, 16
+    jz .mod32
+    mova    m1, m0
+
+.mod32:
+    pshufb    m1, m4
+    mov   widthq, xq
+    dec  heightd
+    jg .loop
+    RET
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/lossless_videoencdsp_init.c
index 8ffaced..40407ad 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/lossless_videoencdsp_init.c
@@ -1,24 +1,24 @@
 /*
- * SIMD-optimized HuffYUV encoding functions
+ * SIMD-optimized lossless video encoding functions
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,38 +26,24 @@
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/huffyuvencdsp.h"
+#include "libavcodec/lossless_videoencdsp.h"
 #include "libavcodec/mathops.h"
 
-#if HAVE_INLINE_ASM
+void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                       intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movq  (%2, %0), %%mm0          \n\t"
-        "movq  (%1, %0), %%mm1          \n\t"
-        "psubb %%mm0, %%mm1             \n\t"
-        "movq %%mm1, (%3, %0)           \n\t"
-        "movq 8(%2, %0), %%mm0          \n\t"
-        "movq 8(%1, %0), %%mm1          \n\t"
-        "psubb %%mm0, %%mm1             \n\t"
-        "movq %%mm1, 8(%3, %0)          \n\t"
-        "add $16, %0                    \n\t"
-        "cmp %4, %0                     \n\t"
-        " jb 1b                         \n\t"
-        : "+r" (i)
-        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w - 15));
+void ff_sub_left_predict_avx(uint8_t *dst, uint8_t *src,
+                            ptrdiff_t stride, ptrdiff_t width, int height);
 
-    for (; i < w; i++)
-        dst[i + 0] = src1[i + 0] - src2[i + 0];
-}
+#if HAVE_INLINE_ASM
 
-static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
-                                        const uint8_t *src2, int w,
-                                        int *left, int *left_top)
+static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
+                                   const uint8_t *src2, intptr_t w,
+                                   int *left, int *left_top)
 {
     x86_reg i = 0;
     uint8_t l, lt;
@@ -97,17 +83,29 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
 
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
+av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
 {
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags)) {
-        c->diff_bytes = diff_bytes_mmx;
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_mmx;
     }
 
+#if HAVE_INLINE_ASM
     if (INLINE_MMXEXT(cpu_flags)) {
-        c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
+        c->sub_median_pred = sub_median_pred_mmxext;
     }
 #endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_sse2;
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        c->sub_left_predict = ff_sub_left_predict_avx;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_avx2;
+    }
 }
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index e8cce42..6c72e21 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
  * SIMD-optimized LPC functions
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
         "3:                                    \n\t"
         :"+&r"(i), "+&r"(j)
         :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+         NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                     "%xmm5", "%xmm6", "%xmm7")
     );
@@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm2, 16(%1)           \n\t"
                 :"+&r"(i)
                 :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
                 :"memory"
             );
         } else {
@@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm1, %2               \n\t"
                 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                 :"r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
             );
         }
     }
@@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
 #if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_SSE2_SLOW(cpu_flags)) {
+    if (INLINE_SSE2(cpu_flags) || INLINE_SSE2_SLOW(cpu_flags)) {
         c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
         c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
     }
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index 2c04d9d..6298f5e 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ __asm__ volatile(\
 #endif /* HAVE_I686 */
 
 #define MASK_ABS(mask, level)                   \
-    __asm__ ("cltd                   \n\t"      \
+    __asm__ ("cdq                    \n\t"      \
              "xorl %1, %0            \n\t"      \
              "subl %1, %0            \n\t"      \
              : "+a"(level), "=&d"(mask))
diff --git a/libavcodec/x86/mdct.h b/libavcodec/x86/mdct.h
deleted file mode 100644
index cc107cb..0000000
--- a/libavcodec/x86/mdct.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_MDCT_H
-#define AVCODEC_X86_MDCT_H
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-#endif /* AVCODEC_X86_MDCT_H */
diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
new file mode 100644
index 0000000..2a2cdbd
--- /dev/null
+++ b/libavcodec/x86/mdct15.asm
@@ -0,0 +1,221 @@
+;******************************************************************************
+;* SIMD optimized non-power-of-two MDCT functions
+;*
+;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
+perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
+sign_adjust_r: times 4 dd 0x80000000, 0x00000000
+
+sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
+
+SECTION .text
+
+%if ARCH_X86_64
+
+;*****************************************************************************************
+;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
+;*****************************************************************************************
+%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
+    VBROADCASTSD m0, [inq + %1]         ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
+    movsd   xm1, [inq + 1*16 +  8 + %1] ; in[ 3].re, in[ 3].im,         0,         0
+    movsd   xm4, [inq + 6*16 +  0 + %1] ; in[12].re, in[12].im,         0,         0
+    movhps  xm1, [inq + 3*16 +  0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
+    movhps  xm4, [inq + 4*16 +  8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
+
+    subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
+    addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im
+
+    movhlps     %2,   xm1               ; t[0].re, t[1].re, t[0].im, t[1].im
+    addps       %2,   xm1
+    addps       %2,   xm0               ; DC[0].re, DC[0].im, junk...
+    movlhps     %2,   %2                ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
+
+    shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
+    shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im
+
+    mulps       xm%3, xm1, xm5
+    mulps       xm4,  xm3, xm6
+    mulps       xm1,  xm6
+
+    xorps       xm1,  xm7
+    mulps       xm3,  xm5
+    addsubps    xm3,  xm1               ; t[0].re, t[0].im, t[2].re, t[2].im
+    subps       xm%3, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im
+
+    movhlps     xm2, xm%3, xm3          ; t[2].re, t[2].im, t[5].re, t[5].im
+    movlhps     xm3, xm%3               ; t[0].re, t[0].im, t[4].re, t[4].im
+
+    xorps       xm2,  xm7
+    addps       xm%3, xm2, xm3
+    subps       xm3,  xm2
+
+    shufps      xm3,  xm3, q1032
+    vinsertf128 m%3,  m%3, xm3, 1       ; All ACs (tmp[1] through to tmp[4])
+    addps       m%3,  m%3,  m0          ; Finally offset with DCs
+%endmacro
+
+%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
+    mulps xm0,  xm9, [exptabq + %1 + 16*0]
+    mulps xm1, xm10, [exptabq + %1 + 16*1]
+
+    haddps  xm0,  xm1
+    movhlps xm1,  xm0                   ; t[0].re, t[1].re, t[0].im, t[1].im
+
+    addps   xm0,  xm1
+    addps   xm0,  xm8
+
+    movsd [outq], xm0
+%endmacro
+
+%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
+    mulps  m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
+    mulps  m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
+    mulps  m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
+    mulps  m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
+
+    addps  m0, m0, m2
+    addps  m1, m1, m3
+    addps  m0, m0, m11
+
+    shufps m1, m1, m1, q2301
+    addps  m0, m0, m1
+
+    vextractf128 xm1, m0, 1
+
+    movlps [outq + strideq*1], xm0
+    movhps [outq + strideq*2], xm0
+    movlps [outq +  stride3q], xm1
+    movhps [outq + strideq*4], xm1
+%endmacro
+
+INIT_YMM avx
+cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
+    shl strideq, 3
+
+    movaps xm5, [exptabq + 480 + 16*0]
+    movaps xm6, [exptabq + 480 + 16*1]
+    movaps xm7, [sign_adjust_5]
+
+    FFT5  0,  xm8, 11
+    FFT5  8,  xm9, 12
+    FFT5 16, xm10, 13
+
+%define stride3q inq
+    lea stride3q, [strideq + strideq*2]
+    lea stride5q, [strideq + strideq*4]
+
+    BUTTERFLIES_DC (8*6 + 4*0)*2*4
+    BUTTERFLIES_AC (8*0 + 0*0)*2*4
+
+    add outq, stride5q
+    BUTTERFLIES_DC (8*6 + 4*1)*2*4
+    BUTTERFLIES_AC (8*2 + 0*0)*2*4
+
+    add outq, stride5q
+    BUTTERFLIES_DC (8*6 + 4*2)*2*4
+    BUTTERFLIES_AC (8*4 + 0*0)*2*4
+
+    RET
+
+%endif ; ARCH_X86_64
+
+;*******************************************************************************************************
+;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
+;*******************************************************************************************************
+%macro LUT_LOAD_4D 3
+    mov      r4d, [lutq + %3q*4 +  0]
+    movsd  xmm%1, [inq +  r4q*8]
+    mov      r4d, [lutq + %3q*4 +  4]
+    movhps xmm%1, [inq +  r4q*8]
+%if cpuflag(avx2)
+    mov      r4d, [lutq + %3q*4 +  8]
+    movsd     %2, [inq +  r4q*8]
+    mov      r4d, [lutq + %3q*4 + 12]
+    movhps    %2, [inq +  r4q*8]
+    vinsertf128 %1, %1, %2, 1
+%endif
+%endmacro
+
+%macro POSTROTATE_FN 1
+cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
+
+    xor offset_nq, offset_nq
+    lea offset_pq, [len8q*2 - %1]
+
+    movaps m7,  [sign_adjust_r]
+
+%if cpuflag(avx2)
+    movaps   m8, [perm_pos]
+    movaps   m9, [perm_neg]
+%endif
+
+.loop:
+    movups m0, [expq + offset_pq*8]     ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
+    movups m1, [expq + offset_nq*8]     ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
+
+    LUT_LOAD_4D m3, xm4, offset_p       ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
+    LUT_LOAD_4D m4, xm5, offset_n       ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
+
+    mulps  m5, m3, m0                   ; in[p].reim * exp[p].reim
+    mulps  m6, m4, m1                   ; in[n].reim * exp[n].reim
+
+    xorps  m5, m7                       ; in[p].re *= -1, in[p].im *= 1
+    xorps  m6, m7                       ; in[n].re *= -1, in[n].im *= 1
+
+    shufps m3, m3, m3, q2301            ; in[p].imre
+    shufps m4, m4, m4, q2301            ; in[n].imre
+
+    mulps  m3, m0                       ; in[p].imre * exp[p].reim
+    mulps  m4, m1                       ; in[n].imre * exp[n].reim
+
+    haddps m3, m6                       ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
+    haddps m5, m4                       ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
+
+%if cpuflag(avx2)
+    vpermps m3, m9, m3                  ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
+    vpermps m5, m8, m5                  ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
+%else
+    shufps m3, m3, m3, q0312
+    shufps m5, m5, m5, q2130
+%endif
+
+    movups [outq + offset_nq*8], m3
+    movups [outq + offset_pq*8], m5
+
+    sub offset_pq, %1
+    add offset_nq, %1
+    cmp offset_nq, offset_pq
+    jle .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM sse3
+POSTROTATE_FN 2
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+POSTROTATE_FN 4
+%endif
diff --git a/libavcodec/x86/mdct15_init.c b/libavcodec/x86/mdct15_init.c
new file mode 100644
index 0000000..444801d
--- /dev/null
+++ b/libavcodec/x86/mdct15_init.c
@@ -0,0 +1,99 @@
+/*
+ * SIMD optimized non-power-of-two MDCT functions
+ *
+ * Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/mdct15.h"
+
+void ff_mdct15_postreindex_sse3(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
+void ff_mdct15_postreindex_avx2(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
+
+void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
+
+static void perm_twiddles(MDCT15Context *s)
+{
+    int k;
+    FFTComplex tmp[30];
+
+    /* 5-point FFT twiddles */
+    s->exptab[60].re = s->exptab[60].im = s->exptab[19].re;
+    s->exptab[61].re = s->exptab[61].im = s->exptab[19].im;
+    s->exptab[62].re = s->exptab[62].im = s->exptab[20].re;
+    s->exptab[63].re = s->exptab[63].im = s->exptab[20].im;
+
+    /* 15-point FFT twiddles */
+    for (k = 0; k < 5; k++) {
+        tmp[6*k + 0] = s->exptab[k +  0];
+        tmp[6*k + 2] = s->exptab[k +  5];
+        tmp[6*k + 4] = s->exptab[k + 10];
+
+        tmp[6*k + 1] = s->exptab[2 * (k + 0)];
+        tmp[6*k + 3] = s->exptab[2 * (k + 5)];
+        tmp[6*k + 5] = s->exptab[2 *  k + 5 ];
+    }
+
+    for (k = 0; k < 6; k++) {
+        FFTComplex ac_exp[] = {
+            { tmp[6*1 + k].re,  tmp[6*1 + k].re },
+            { tmp[6*2 + k].re,  tmp[6*2 + k].re },
+            { tmp[6*3 + k].re,  tmp[6*3 + k].re },
+            { tmp[6*4 + k].re,  tmp[6*4 + k].re },
+            { tmp[6*1 + k].im, -tmp[6*1 + k].im },
+            { tmp[6*2 + k].im, -tmp[6*2 + k].im },
+            { tmp[6*3 + k].im, -tmp[6*3 + k].im },
+            { tmp[6*4 + k].im, -tmp[6*4 + k].im },
+        };
+        memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
+    }
+
+    /* Specialcase when k = 0 */
+    for (k = 0; k < 3; k++) {
+        FFTComplex dc_exp[] = {
+            { tmp[2*k + 0].re, -tmp[2*k + 0].im },
+            { tmp[2*k + 0].im,  tmp[2*k + 0].re },
+            { tmp[2*k + 1].re, -tmp[2*k + 1].im },
+            { tmp[2*k + 1].im,  tmp[2*k + 1].re },
+        };
+        memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
+    }
+}
+
+av_cold void ff_mdct15_init_x86(MDCT15Context *s)
+{
+    int adjust_twiddles = 0;
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE3(cpu_flags))
+        s->postreindex = ff_mdct15_postreindex_sse3;
+
+    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
+        s->fft15 = ff_fft15_avx;
+        adjust_twiddles = 1;
+    }
+
+    if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
+        s->postreindex = ff_mdct15_postreindex_avx2;
+
+    if (adjust_twiddles)
+        perm_twiddles(s);
+}
diff --git a/libavcodec/x86/mdct_init.c b/libavcodec/x86/mdct_init.c
deleted file mode 100644
index db642d8..0000000
--- a/libavcodec/x86/mdct_init.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-
-#include "mdct.h"
-
-av_cold void ff_mdct_init_x86(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
-    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
-        s->imdct_calc = ff_imdct_calc_3dnow;
-        s->imdct_half = ff_imdct_half_3dnow;
-    }
-
-    if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
-        s->imdct_calc = ff_imdct_calc_3dnowext;
-        s->imdct_half = ff_imdct_half_3dnowext;
-    }
-#endif /* ARCH_X86_32 */
-
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->imdct_calc  = ff_imdct_calc_sse;
-        s->imdct_half  = ff_imdct_half_sse;
-    }
-
-    if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
-        s->imdct_half      = ff_imdct_half_avx;
-    }
-}
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 1a87f37..ad06d48 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -4,25 +4,30 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+cextern pb_1
+cextern pb_80
+
 SECTION .text
 
 %macro DIFF_PIXELS_1 4
@@ -210,7 +215,7 @@ hadamard8_16_wrapper %1, 3
 %elif cpuflag(mmx)
 ALIGN 16
 ; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
-;                               uint8_t *src2, int stride, int h)
+;                               uint8_t *src2, ptrdiff_t stride, int h)
 ; r0 = void *s = unused, int h = unused (always 8)
 ; note how r1, r2 and r3 are not clobbered in this function, so 16x16
 ; can simply call this 2x2x (and that's why we access rsp+gprsize
@@ -274,19 +279,27 @@ INIT_XMM ssse3
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 HADAMARD8_DIFF 9
 
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-;                   int line_size, int h);
-cglobal sse16, 5, 5, 8
-    shr      r4d, 1
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;               ptrdiff_t line_size, int h)
+
+%macro SUM_SQUARED_ERRORS 1
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
+%if %1 == mmsize
+    shr       hd, 1
+%endif
     pxor      m0, m0         ; mm0 = 0
     pxor      m7, m7         ; mm7 holds the sum
 
 .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
-    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
-    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
-    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
-    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
+    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
+    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
+%if %1 == mmsize
+    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
+    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
+%else  ; %1 / 2 == mmsize; mmx only
+    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
+    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
+%endif
 
     ; todo: mm1-mm2, mm3-mm4
     ; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -315,22 +328,607 @@ cglobal sse16, 5, 5, 8
     pmaddwd   m1, m1
     pmaddwd   m3, m3
 
-    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
-    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
-
     paddd     m1, m2
     paddd     m3, m4
     paddd     m7, m1
     paddd     m7, m3
 
-    dec       r4
+%if %1 == mmsize
+    lea    pix1q, [pix1q + 2*lsizeq]
+    lea    pix2q, [pix2q + 2*lsizeq]
+%else
+    add    pix1q, lsizeq
+    add    pix2q, lsizeq
+%endif
+    dec       hd
     jnz .next2lines
 
-    mova      m1, m7
-    psrldq    m7, 8          ; shift hi qword to lo
-    paddd     m7, m1
-    mova      m1, m7
-    psrldq    m7, 4          ; shift hi dword to lo
-    paddd     m7, m1
+    HADDD     m7, m1
     movd     eax, m7         ; return value
     RET
+%endmacro
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 8
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 16
+
+INIT_XMM sse2
+SUM_SQUARED_ERRORS 16
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline loops
+
+%macro SUM_ABS_DCTELEM 2
+cglobal sum_abs_dctelem, 1, 1, %1, block
+    pxor    m0, m0
+    pxor    m1, m1
+%assign %%i 0
+%rep %2
+    mova      m2, [blockq+mmsize*(0+%%i)]
+    mova      m3, [blockq+mmsize*(1+%%i)]
+    mova      m4, [blockq+mmsize*(2+%%i)]
+    mova      m5, [blockq+mmsize*(3+%%i)]
+    ABS1_SUM  m2, m6, m0
+    ABS1_SUM  m3, m6, m1
+    ABS1_SUM  m4, m6, m0
+    ABS1_SUM  m5, m6, m1
+%assign %%i %%i+4
+%endrep
+    paddusw m0, m1
+    HSUM    m0, m1, eax
+    and     eax, 0xFFFF
+    RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0, 4
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0, 4
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7, 2
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6, 2
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+    mova      m%2, [pix1q]
+%if %1 == 8
+    mova      m%3, m%2
+    psllq     m%2, 8
+    psrlq     m%3, 8
+    psrlq     m%2, 8
+%else
+    mova      m%3, [pix1q+1]
+%endif
+    mova      m%4, m%2
+    mova      m%5, m%3
+    punpcklbw m%2, m7
+    punpcklbw m%3, m7
+    punpckhbw m%4, m7
+    punpckhbw m%5, m7
+    psubw     m%2, m%3
+    psubw     m%4, m%5
+%endmacro
+
+; %1-2 = m#
+%macro HF_NOISE_PART2 4
+    psubw     m%1, m%3
+    psubw     m%2, m%4
+    pxor       m3, m3
+    pxor       m1, m1
+    pcmpgtw    m3, m%1
+    pcmpgtw    m1, m%2
+    pxor      m%1, m3
+    pxor      m%2, m1
+    psubw     m%1, m3
+    psubw     m%2, m1
+    paddw     m%2, m%1
+    paddw      m6, m%2
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+    sub        hd, 2
+    pxor       m7, m7
+    pxor       m6, m6
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    add     pix1q, lsizeq
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+.loop:
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    HF_NOISE_PART2     4, 5, 0, 2
+    add     pix1q, lsizeq
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+    sub        hd, 2
+        jne .loop
+
+    mova       m0, m6
+    punpcklwd  m0, m7
+    punpckhwd  m6, m7
+    paddd      m6, m0
+    mova       m0, m6
+    psrlq      m6, 32
+    paddd      m0, m6
+    movd      eax, m0   ; eax = result of hf_noise8;
+    REP_RET                 ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
+
+;---------------------------------------------------------------------------------------
+;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;---------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+    movu      m2, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m2, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+strideq*2]
+    lea    pix2q, [pix2q+strideq*2]
+    movu      m0, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m0, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m0
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m0, m2
+    paddw     m2, m0
+%endif
+    movd     eax, m2
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
+    movu      m0, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m0, m3
+    pavgb     m2, m4
+%else
+    pavgb     m0, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m0, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m2
+%if %1 != mmsize
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m1, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m1, m3
+    pavgb     m2, m4
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+%if %1 != mmsize
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
+    movu      m1, [pix2q]
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+    pavgb     m1, m0
+    pavgb     m0, m3
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m4, [pix2q+8]
+    movu      m5, [pix2q+strideq+8]
+    movu      m6, [pix2q+2*strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    add    pix2q, strideq
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;-------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
+    mova      m4, [pb_1]
+    movu      m1, [pix2q]
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+%if mmsize == 16
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    movu      m2, [pix2q+2*strideq+1]
+    pavgb     m1, m5
+    pavgb     m0, m6
+    pavgb     m3, m2
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m0, [pix2q+strideq+1]
+    pavgb     m3, [pix2q+2*strideq+1]
+%endif
+    psubusb   m0, m4
+    pavgb     m1, m0
+    pavgb     m0, m3
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    movu      m7, [pix2q+2*strideq+8]
+    pavgb     m5, [pix2q+1+8]
+    pavgb     m6, [pix2q+strideq+1+8]
+    pavgb     m7, [pix2q+2*strideq+1+8]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    add    pix2q, strideq
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+%if mmsize == 16
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    pavgb     m2, m5
+    pavgb     m3, m6
+%else
+    pavgb     m2, [pix2q+1]
+    pavgb     m3, [pix2q+strideq+1]
+%endif
+    psubusb   m2, m4
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m6, [pix2q+8]
+    movu      m7, [pix2q+strideq+8]
+    pavgb     m6, [pix2q+8+1]
+    pavgb     m7, [pix2q+strideq+8+1]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
+INIT_XMM sse2
+SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                  ptrdiff_t line_size, int h);
+;--------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+    mova      m0, [pix1q]
+%if %1 == mmsize
+    mova      m2, [pix1q+lsizeq]
+    psadbw    m0, m2
+%else
+    mova      m2, [pix1q+lsizeq]
+    mova      m3, [pix1q+8]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m0, m2
+    psadbw    m3, m4
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+
+.loop:
+    lea    pix1q, [pix1q + 2*lsizeq]
+%if %1 == mmsize
+    mova      m1, [pix1q]
+    psadbw    m2, m1
+    paddw     m0, m2
+    mova      m2, [pix1q+lsizeq]
+    psadbw    m1, m2
+    paddw     m0, m1
+%else
+    mova      m1, [pix1q]
+    mova      m3, [pix1q+8]
+    psadbw    m2, m1
+    psadbw    m4, m3
+    paddw     m0, m2
+    paddw     m0, m4
+    mova      m2, [pix1q+lsizeq]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m1, m2
+    psadbw    m3, m4
+    paddw     m0, m1
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+    jg     .loop
+
+%if mmsize == 16
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
+
+;---------------------------------------------------------------------
+;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                   ptrdiff_t line_size, int h);
+;---------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+    mova   m1, [pb_80]
+    mova   m0, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+    mova   m4, [pix1q+lsizeq]
+%if mmsize == 16
+    movu   m3, [pix2q]
+    movu   m2, [pix2q+lsizeq]
+    psubb  m0, m3
+    psubb  m4, m2
+%else
+    psubb  m0, [pix2q]
+    psubb  m4, [pix2q+lsizeq]
+%endif
+    pxor   m0, m1
+    pxor   m4, m1
+    psadbw m0, m4
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m0, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m0, m1
+    pxor   m3, m1
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m0, m4
+    psadbw m3, m5
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+
+.loop:
+    lea pix1q, [pix1q + 2*lsizeq]
+    lea pix2q, [pix2q + 2*lsizeq]
+    mova   m2, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+    movu   m3, [pix2q]
+    psubb  m2, m3
+%else
+    psubb  m2, [pix2q]
+%endif
+    pxor   m2, m1
+    psadbw m4, m2
+    paddw  m0, m4
+    mova   m4, [pix1q+lsizeq]
+    movu   m3, [pix2q+lsizeq]
+    psubb  m4, m3
+    pxor   m4, m1
+    psadbw m2, m4
+    paddw  m0, m2
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m2, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m2, m1
+    pxor   m3, m1
+    psadbw m4, m2
+    psadbw m5, m3
+    paddw  m0, m4
+    paddw  m0, m5
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m2, m4
+    psadbw m3, m5
+    paddw  m0, m2
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+    jg  .loop
+
+%if mmsize == 16
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd  eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index ee5f559..6aec93e 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,382 +29,67 @@
 #include "libavcodec/me_cmp.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                    ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl         %4, %%ecx          \n"
-        "shr          $1, %%ecx          \n"
-        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
-        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
-        "1:                              \n"
-        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
-        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
-        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
-        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq      %%mm1, %%mm5          \n"
-        "movq      %%mm3, %%mm6          \n"
-        "psubusb   %%mm2, %%mm1          \n"
-        "psubusb   %%mm4, %%mm3          \n"
-        "psubusb   %%mm5, %%mm2          \n"
-        "psubusb   %%mm6, %%mm4          \n"
-
-        "por       %%mm1, %%mm2          \n"
-        "por       %%mm3, %%mm4          \n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq      %%mm2, %%mm1          \n"
-        "movq      %%mm4, %%mm3          \n"
-
-        "punpckhbw %%mm0, %%mm2          \n"
-        "punpckhbw %%mm0, %%mm4          \n"
-        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd   %%mm2, %%mm2          \n"
-        "pmaddwd   %%mm4, %%mm4          \n"
-        "pmaddwd   %%mm1, %%mm1          \n"
-        "pmaddwd   %%mm3, %%mm3          \n"
-
-        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * stride */
-        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * stride */
-
-        "paddd     %%mm2, %%mm1          \n"
-        "paddd     %%mm4, %%mm3          \n"
-        "paddd     %%mm1, %%mm7          \n"
-        "paddd     %%mm3, %%mm7          \n"
-
-        "decl      %%ecx                 \n"
-        "jnz       1b                    \n"
-
-        "movq      %%mm7, %%mm1          \n"
-        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
-        "paddd     %%mm7, %%mm1          \n"
-        "movd      %%mm1, %2             \n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                     ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
-        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
-        "1:\n"
-        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
-        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
-        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
-        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq %%mm1, %%mm5\n"
-        "movq %%mm3, %%mm6\n"
-        "psubusb %%mm2, %%mm1\n"
-        "psubusb %%mm4, %%mm3\n"
-        "psubusb %%mm5, %%mm2\n"
-        "psubusb %%mm6, %%mm4\n"
-
-        "por %%mm1, %%mm2\n"
-        "por %%mm3, %%mm4\n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq %%mm2, %%mm1\n"
-        "movq %%mm4, %%mm3\n"
-
-        "punpckhbw %%mm0, %%mm2\n"
-        "punpckhbw %%mm0, %%mm4\n"
-        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd %%mm2, %%mm2\n"
-        "pmaddwd %%mm4, %%mm4\n"
-        "pmaddwd %%mm1, %%mm1\n"
-        "pmaddwd %%mm3, %%mm3\n"
-
-        "add %3, %0\n"
-        "add %3, %1\n"
-
-        "paddd %%mm2, %%mm1\n"
-        "paddd %%mm4, %%mm3\n"
-        "paddd %%mm1, %%mm7\n"
-        "paddd %%mm3, %%mm7\n"
-
-        "decl %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm7, %%mm1\n"
-        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
-        "paddd %%mm7, %%mm1\n"
-        "movd %%mm1, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor  %%mm3, %%mm3\n"
-        "pxor  %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor  %%mm3, %%mm4\n"
-        "pxor  %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq      %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq      %%mm4, %%mm5\n"
-        "movq      %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw     %%mm1, %%mm4\n"
-        "psubw     %%mm3, %%mm5\n"
-        "psubw     %%mm4, %%mm0\n"
-        "psubw     %%mm5, %%mm2\n"
-        "pxor      %%mm3, %%mm3\n"
-        "pxor      %%mm1, %%mm1\n"
-        "pcmpgtw   %%mm0, %%mm3\n\t"
-        "pcmpgtw   %%mm2, %%mm1\n\t"
-        "pxor      %%mm3, %%mm0\n"
-        "pxor      %%mm1, %%mm2\n"
-        "psubw     %%mm3, %%mm0\n"
-        "psubw     %%mm1, %%mm2\n"
-        "paddw     %%mm0, %%mm2\n"
-        "paddw     %%mm2, %%mm6\n"
-
-        "add  %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq      %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd     %%mm0, %%mm6\n"
-
-        "movq  %%mm6, %%mm0\n"
-        "psrlq $32,   %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd  %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" (stride), "g" (h - 2)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
-    int tmp;
-    uint8_t *pix = pix1;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor %%mm3, %%mm4\n"
-        "pxor %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd %%mm0, %%mm6\n"
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                ptrdiff_t stride, int h);
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                 ptrdiff_t stride, int h);
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   ptrdiff_t stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                              ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                               ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             ptrdiff_t stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                          ptrdiff_t stride, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                           ptrdiff_t stride, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   ptrdiff_t stride, int h);
 
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" (stride), "g" (h - 2)
-        : "%ecx");
+#define hadamard_func(cpu)                                                    \
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
+                                  uint8_t *src2, ptrdiff_t stride, int h);    \
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
+                                    uint8_t *src2, ptrdiff_t stride, int h);
 
-    return tmp + hf_noise8_mmx(pix + 8, stride, h);
-}
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
 
+#if HAVE_X86ASM
 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
 {
@@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
     if (c)
         score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
     else
-        score1 = sse16_mmx(c, pix1, pix2, stride, h);
-    score2 = hf_noise16_mmx(pix1, stride, h) -
-             hf_noise16_mmx(pix2, stride, h);
+        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
+    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
+           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
 {
-    int score1 = sse8_mmx(c, pix1, pix2, stride, h);
-    int score2 = hf_noise8_mmx(pix1, stride, h) -
-                 hf_noise8_mmx(pix2, stride, h);
+    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
+    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
+                 ff_hf_noise8_mmx(pix2, stride, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -436,13 +121,17 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
         return score1 + FFABS(score2) * 8;
 }
 
+#endif /* HAVE_X86ASM */
+
+#if HAVE_INLINE_ASM
+
 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                             ptrdiff_t stride, int h)
 {
     int tmp;
 
-    assert((((int) pix) & 7) == 0);
-    assert((stride & 7) == 0);
+    av_assert2((((int) pix) & 7) == 0);
+    av_assert2((stride & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)               \
     "movq (%0), %%mm2\n"                        \
@@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 }
 #undef SUM
 
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                               ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    assert((((int) pix) & 7) == 0);
-    assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq 8(%0), " #out1 "\n"                   \
-    "add %2, %0\n"                              \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n"
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
 {
     int tmp;
 
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((stride & 7) == 0);
+    av_assert2((((int) pix1) & 7) == 0);
+    av_assert2((((int) pix2) & 7) == 0);
+    av_assert2((stride & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)       \
     "movq (%0), %%mm2\n"                \
@@ -624,191 +270,16 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 }
 #undef SUM
 
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq (%1), %%mm2\n"                        \
-    "movq 8(%0), " #out1 "\n"                   \
-    "movq 8(%1), %%mm3\n"                       \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb %%mm2, " #out0 "\n"                  \
-    "psubb %%mm3, " #out1 "\n"                  \
-    "pxor %%mm7, " #out0 "\n"                   \
-    "pxor %%mm7, " #out1 "\n"                   \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n    "
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
-#define MMABS_MMX(a,z)                          \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "pcmpgtw " #a ", " #z "             \n\t"   \
-    "pxor "    #z ", " #a "             \n\t"   \
-    "psubw "   #z ", " #a "             \n\t"
-
-#define MMABS_MMXEXT(a, z)                      \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "psubw "   #a ", " #z "             \n\t"   \
-    "pmaxsw "  #z ", " #a "             \n\t"
-
-#define MMABS_SSSE3(a,z)                        \
-    "pabsw "   #a ", " #a "             \n\t"
-
-#define MMABS_SUM(a,z, sum)                     \
-    MMABS(a,z)                                  \
-    "paddusw " #a ", " #sum "           \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst)                     \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $32, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $16, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_MMXEXT(a, t, dst)                  \
-    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_SSE2(a, t, dst)                    \
-    "movhlps " #a ", " #t "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define DCT_SAD4(m, mm, o)                      \
-    "mov"#m" "#o" +  0(%1), " #mm "2    \n\t"   \
-    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
-    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
-    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
-
-#define DCT_SAD_MMX                             \
-    "pxor    %%mm0, %%mm0               \n\t"   \
-    "pxor    %%mm1, %%mm1               \n\t"   \
-    DCT_SAD4(q, %%mm, 0)                        \
-    DCT_SAD4(q, %%mm, 8)                        \
-    DCT_SAD4(q, %%mm, 64)                       \
-    DCT_SAD4(q, %%mm, 72)                       \
-    "paddusw %%mm1, %%mm0               \n\t"   \
-    HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2                            \
-    "pxor    %%xmm0, %%xmm0             \n\t"   \
-    "pxor    %%xmm1, %%xmm1             \n\t"   \
-    DCT_SAD4(dqa, %%xmm, 0)                     \
-    DCT_SAD4(dqa, %%xmm, 64)                    \
-    "paddusw %%xmm1, %%xmm0             \n\t"   \
-    HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu)                           \
-static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
-{                                                   \
-    int sum;                                        \
-    __asm__ volatile (                              \
-        DCT_SAD                                     \
-        :"=r"(sum)                                  \
-        :"r"(block));                               \
-    return sum & 0xFFFF;                            \
-}
-
-#define DCT_SAD         DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z)     MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD         DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
     0x0000000000000000ULL,
     0x0001000100010001ULL,
     0x0002000200020002ULL,
 };
 
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -841,133 +312,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
         : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
 }
 
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
-                      ptrdiff_t stride, int h)
-{
-    int ret;
-    __asm__ volatile (
-        "pxor %%xmm2, %%xmm2            \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movdqu (%1), %%xmm0            \n\t"
-        "movdqu (%1, %4), %%xmm1        \n\t"
-        "psadbw (%2), %%xmm0            \n\t"
-        "psadbw (%2, %4), %%xmm1        \n\t"
-        "paddw %%xmm0, %%xmm2           \n\t"
-        "paddw %%xmm1, %%xmm2           \n\t"
-        "lea (%1,%4,2), %1              \n\t"
-        "lea (%2,%4,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        "movhlps %%xmm2, %%xmm0         \n\t"
-        "paddw   %%xmm0, %%xmm2         \n\t"
-        "movd    %%xmm2, %3             \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
-        : "r" (stride));
-    return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "pavgb 1(%1, %3), %%mm1         \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        "movq (%1), %%mm0               \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        "movq "MANGLE(bone)", %%mm5     \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1,%3), %%mm2            \n\t"
-        "pavgb 1(%1), %%mm1             \n\t"
-        "pavgb 1(%1,%3), %%mm2          \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2,%3), %%mm1          \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -1006,7 +354,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
         "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
@@ -1030,7 +378,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
         "punpckhbw %%mm7, %%mm5         \n\t"
         "paddw %%mm4, %%mm2             \n\t"
         "paddw %%mm5, %%mm3             \n\t"
-        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+        "movq %5, %%mm5                 \n\t"
         "paddw %%mm2, %%mm0             \n\t"
         "paddw %%mm3, %%mm1             \n\t"
         "paddw %%mm5, %%mm0             \n\t"
@@ -1054,7 +402,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
         " js 1b                         \n\t"
         : "+a" (len)
         : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
-          "r" (stride));
+          "r" (stride), "m" (round_tab[2]));
 }
 
 static inline int sum_mmx(void)
@@ -1072,15 +420,6 @@ static inline int sum_mmx(void)
     return ret & 0xFFFF;
 }
 
-static inline int sum_mmxext(void)
-{
-    int ret;
-    __asm__ volatile (
-        "movd %%mm6, %0                 \n\t"
-        : "=r" (ret));
-    return ret;
-}
-
 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
 {
@@ -1097,7 +436,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                         uint8_t *blk1, ptrdiff_t stride, int h)         \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1111,7 +450,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1126,7 +465,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1141,7 +480,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1211,32 +550,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
 }                                                                       \
 
 PIX_SAD(mmx)
-PIX_SAD(mmxext)
 
 #endif /* HAVE_INLINE_ASM */
 
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  ptrdiff_t stride, int h);
-
-#define hadamard_func(cpu)                                                    \
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
-                                  uint8_t *src2, ptrdiff_t stride, int h);    \
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
-                                    uint8_t *src2, ptrdiff_t stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
     if (INLINE_MMX(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
         c->pix_abs[0][0] = sad16_mmx;
         c->pix_abs[0][1] = sad16_x2_mmx;
         c->pix_abs[0][2] = sad16_y2_mmx;
@@ -1249,77 +571,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->sad[0] = sad16_mmx;
         c->sad[1] = sad8_mmx;
 
-        c->sse[0]  = sse16_mmx;
-        c->sse[1]  = sse8_mmx;
         c->vsad[4] = vsad_intra16_mmx;
 
-        c->nsse[0] = nsse16_mmx;
-        c->nsse[1] = nsse8_mmx;
-
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
             c->vsad[0] = vsad16_mmx;
         }
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
-        c->vsad[4] = vsad_intra16_mmxext;
-
-        c->pix_abs[0][0] = sad16_mmxext;
-        c->pix_abs[1][0] = sad8_mmxext;
-
-        c->sad[0] = sad16_mmxext;
-        c->sad[1] = sad8_mmxext;
-
-        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->pix_abs[0][1] = sad16_x2_mmxext;
-            c->pix_abs[0][2] = sad16_y2_mmxext;
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
-            c->pix_abs[1][1] = sad8_x2_mmxext;
-            c->pix_abs[1][2] = sad8_y2_mmxext;
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
-
-            c->vsad[0] = vsad16_mmxext;
-        }
-    }
-
-    if (INLINE_SSE2(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
-    }
-
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
-        c->sad[0] = sad16_sse2;
-    }
-
-#if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
-    }
-#endif
 #endif /* HAVE_INLINE_ASM */
 
     if (EXTERNAL_MMX(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
+        c->sse[0]            = ff_sse16_mmx;
+        c->sse[1]            = ff_sse8_mmx;
+#if HAVE_X86ASM
+        c->nsse[0]           = nsse16_mmx;
+        c->nsse[1]           = nsse8_mmx;
+#endif
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
+
+        c->sad[0] = ff_sad16_mmxext;
+        c->sad[1] = ff_sad8_mmxext;
+
+        c->pix_abs[0][0] = ff_sad16_mmxext;
+        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+        c->pix_abs[1][0] = ff_sad8_mmxext;
+        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+        c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+        c->vsad[4] = ff_vsad_intra16_mmxext;
+        c->vsad[5] = ff_vsad_intra8_mmxext;
+
+        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
+            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+            c->vsad[0] = ff_vsad16_approx_mmxext;
+            c->vsad[1] = ff_vsad8_approx_mmxext;
+        }
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
 #endif
+        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+            c->sad[0]        = ff_sad16_sse2;
+            c->pix_abs[0][0] = ff_sad16_sse2;
+            c->pix_abs[0][1] = ff_sad16_x2_sse2;
+            c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+            c->vsad[4]       = ff_vsad_intra16_sse2;
+            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+                c->vsad[0]       = ff_vsad16_approx_sse2;
+            }
+        }
     }
 
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
     }
 }
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000..3dc641e
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%if ARCH_X86_64
+
+%macro SHLX 2
+%if cpuflag(bmi2)
+   shlx %1, %1, %2q
+%else
+   shl  %1, %2b
+%endif
+%endmacro
+
+%macro REMATRIX 0
+    movdqa        m0, [samplesq]
+    movdqa        m1, [coeffsq ]
+    pshufd        m2, m0, q2301
+    pshufd        m3, m1, q2301
+    pmuldq        m0, m1
+    pmuldq        m3, m2
+    paddq         m0, m3
+%if notcpuflag(avx2)
+    movdqa        m1, [samplesq + 16]
+    movdqa        m2, [coeffsq  + 16]
+    pshufd        m3, m1, q2301
+    pshufd        m4, m2, q2301
+    pmuldq        m1, m2
+    pmuldq        m4, m3
+    paddq         m0, m1
+    paddq         m0, m4
+%else
+    vextracti128 xm1, m0, 1
+    paddq        xm0, xm1
+%endif
+%endmacro
+
+%macro LOOP_END 0
+    pshufd       xm1, xm0, q0032
+    paddq        xm0, xm1
+    movq      accumq, xm0
+    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq
+%endmacro
+
+%macro LOOP_SHIFT_END 0
+    pshufd       xm1, xm0, q0032
+    paddq        xm0, xm1
+    movq      accumq, xm0
+    and       indexd, auspd                         ; index &= access_unit_size_pow2;
+    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
+    add       indexd, index2d                       ; index += index2
+    SHLX      noiseq, mns                           ; noise_buffer[index] <<= matrix_noise_shift
+    add       accumq, noiseq                        ; accum += noise_buffer[index]
+    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, noised                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+;                             int index, unsigned int dest_ch, uint16_t blockpos,
+;                             unsigned int maxchan, int matrix_noise_shift,
+;                             int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+                                        index, dest_ch, blockpos, maxchan, mns, \
+                                        accum, mask, cnt
+    mov         mnsd, mnsm                          ; load matrix_noise_shift
+    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
+    mov     maxchand, maxchanm                      ; load maxchan
+    mov        maskd, maskm                         ; load mask
+%if WIN64
+    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
+%endif
+    shl     dest_chd, 2
+    lea         cntq, [blsbs_ptrq + blockposq*8]
+    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
+    jne .shift                                      ; jump if true
+    cmp     maxchand, 4                             ; is maxchan < 4?
+    jl .loop4                                       ; jump if true
+
+align 16
+.loop8:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_END
+    jne .loop8
+    RET
+
+align 16
+.loop4:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, q2301
+    pshufd       xm3, xm1, q2301
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_END
+    jne .loop4
+    RET
+
+.shift:
+%if WIN64
+    mov       indexd, indexm         ; load index (not needed on UNIX64)
+%endif
+    mov          r9d, r9m            ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+                index, dest_ch, accum, index2, mns, \
+                ausp, mask, cnt, noise
+    add         mnsd, 7              ; matrix_noise_shift += 7
+%else ; sse4
+    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+    ; r0 = rcx
+    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
+                index2, accum, ausp, mask, cnt, noise
+%else ; UNIX64
+    ; r3 = rcx
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
+                index2, accum, ausp, mask, cnt, noise
+%endif
+    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
+%endif ; cpuflag
+    sub        auspd, 1              ; access_unit_size_pow2 -= 1
+    cmp          r7d, 4              ; is maxchan < 4?
+    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
+    jl .loop4_shift                  ; jump if maxchan < 4
+
+align 16
+.loop8_shift:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_SHIFT_END
+    jne .loop8_shift
+    RET
+
+align 16
+.loop4_shift:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, q2301
+    pshufd       xm3, xm1, q2301
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_SHIFT_END
+    jne .loop4_shift
+    RET
+%endmacro
+
+INIT_XMM sse4
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
index 157ba7c..cb90ca2 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -2,32 +2,47 @@
  * MLP DSP functions x86-optimized
  * Copyright (c) 2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/internal.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mlpdsp.h"
 #include "libavcodec/mlp.h"
 
-#if HAVE_7REGS && HAVE_INLINE_ASM
+#define REMATRIX_CHANNEL_FUNC(opt) \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+                                   const int32_t *coeffs, \
+                                   const uint8_t *bypassed_lsbs, \
+                                   const int8_t *noise_buffer, \
+                                   int index, \
+                                   unsigned int dest_ch, \
+                                   uint16_t blockpos, \
+                                   unsigned int maxchan, \
+                                   int matrix_noise_shift, \
+                                   int access_unit_size_pow2, \
+                                   int32_t mask);
+
+REMATRIX_CHANNEL_FUNC(sse4)
+REMATRIX_CHANNEL_FUNC(avx2_bmi2)
+
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
 
 extern char ff_mlp_firorder_8;
 extern char ff_mlp_firorder_7;
@@ -133,8 +148,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
         FIRMUL   (ff_mlp_firorder_6, 0x14   )
         FIRMUL   (ff_mlp_firorder_5, 0x10   )
         FIRMUL   (ff_mlp_firorder_4, 0x0c   )
-        FIRMULREG(ff_mlp_firorder_3, 0x08,10)
-        FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+        FIRMUL   (ff_mlp_firorder_3, 0x08   )
+        FIRMUL   (ff_mlp_firorder_2, 0x04   )
         FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
         LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
         "jmp  *%6                     \n\t"
@@ -163,8 +178,6 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
         : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
           /* 6*/"r"(iirjump)      , /* 7*/"c"(filter_shift)
         , /* 8*/"r"((int64_t)coeff[0])
-        , /* 9*/"r"((int64_t)coeff[1])
-        , /*10*/"r"((int64_t)coeff[2])
         : "rax", "rdx", "rsi"
 #else /* ARCH_X86_32 */
           /* 3*/"+m"(blocksize)
@@ -179,9 +192,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
 
 av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
 {
-#if HAVE_7REGS && HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
     if (INLINE_MMX(cpu_flags))
         c->mlp_filter_channel = mlp_filter_channel_x86;
 #endif
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+    if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
 }
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index ffdcf1f..f46a5c4 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
  * SIMD-optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,11 +26,20 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if HAVE_X86ASM
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+#endif /* HAVE_X86ASM */
+
 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                                float *tmpbuf);
 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -38,7 +47,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
 
 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
 
-#if HAVE_SSE2_INLINE
+#if HAVE_6REGS && HAVE_SSE_INLINE
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -182,7 +191,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     *out = sum;
 }
 
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 
 #if HAVE_X86ASM
 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
@@ -217,16 +226,22 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
     }                                                                   \
 }
 
+#if HAVE_SSE
+#if ARCH_X86_32
 DECL_IMDCT_BLOCKS(sse,sse)
+#endif
 DECL_IMDCT_BLOCKS(sse2,sse)
 DECL_IMDCT_BLOCKS(sse3,sse)
 DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
 DECL_IMDCT_BLOCKS(avx,avx)
+#endif
 #endif /* HAVE_X86ASM */
 
 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 {
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
     int i, j;
     for (j = 0; j < 4; j++) {
@@ -242,16 +257,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
         }
     }
 
-#if HAVE_SSE2_INLINE
-    if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS && HAVE_SSE_INLINE
+    if (INLINE_SSE(cpu_flags)) {
         s->apply_window_float = apply_window_mp3;
     }
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_SSE_INLINE */
 
 #if HAVE_X86ASM
+#if HAVE_SSE
+#if ARCH_X86_32
     if (EXTERNAL_SSE(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse;
     }
+#endif
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse2;
     }
@@ -261,8 +279,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_ssse3;
     }
+#endif
+#if HAVE_AVX_EXTERNAL
     if (EXTERNAL_AVX(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_avx;
     }
+#endif
 #endif /* HAVE_X86ASM */
 }
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 6c0493e..73967ca 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
  * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 
 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                   int16_t *block, int n, int qscale)
@@ -35,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
 
     qmul = qscale << 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     if (!s->h263_aic) {
         if (n < 4)
@@ -50,7 +51,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
     if(s->ac_pred)
         nCoeffs=63;
     else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
 __asm__ volatile(
                 "movd %1, %%mm6                 \n\t" //qmul
@@ -111,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
@@ -171,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -190,9 +191,9 @@ __asm__ volatile(
                 "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq  (%0, %%"FF_REG_a"), %%mm0\n\t"
+                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                 "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq  (%1, %%"FF_REG_a"), %%mm4\n\t"
+                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
                 "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
@@ -208,7 +209,7 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw  (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
                 "psraw $3, %%mm0                \n\t"
                 "psraw $3, %%mm1                \n\t"
@@ -222,7 +223,7 @@ __asm__ volatile(
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4,  (%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
                 "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
 
                 "add $16, %%"FF_REG_a"          \n\t"
@@ -239,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -253,9 +254,9 @@ __asm__ volatile(
                 "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq  (%0, %%"FF_REG_a"), %%mm0\n\t"
+                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                 "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq  (%1, %%"FF_REG_a"), %%mm4\n\t"
+                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
                 "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
@@ -275,7 +276,7 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw  (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
                 "psraw $4, %%mm0                \n\t"
                 "psraw $4, %%mm1                \n\t"
@@ -289,7 +290,7 @@ __asm__ volatile(
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4,  (%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
                 "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
 
                 "add $16, %%"FF_REG_a"          \n\t"
@@ -306,7 +307,10 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
+
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -325,9 +329,9 @@ __asm__ volatile(
                 "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq  (%0, %%"FF_REG_a"), %%mm0\n\t"
+                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                 "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq  (%1, %%"FF_REG_a"), %%mm4\n\t"
+                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
                 "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
@@ -343,17 +347,17 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw  (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
-                "psraw $3, %%mm0                \n\t"
-                "psraw $3, %%mm1                \n\t"
+                "psraw $4, %%mm0                \n\t"
+                "psraw $4, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
                 "pxor %%mm3, %%mm1              \n\t"
                 "psubw %%mm2, %%mm0             \n\t"
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4,  (%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
                 "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
 
                 "add $16, %%"FF_REG_a"          \n\t"
@@ -371,7 +375,10 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
+
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -386,9 +393,9 @@ __asm__ volatile(
                 "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq  (%0, %%"FF_REG_a"), %%mm0\n\t"
+                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                 "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq  (%1, %%"FF_REG_a"), %%mm4\n\t"
+                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
                 "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
@@ -408,10 +415,10 @@ __asm__ volatile(
                 "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw  (%0, %%"FF_REG_a"), %%mm4\n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
-                "psrlw $4, %%mm0                \n\t"
-                "psrlw $4, %%mm1                \n\t"
+                "psrlw $5, %%mm0                \n\t"
+                "psrlw $5, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
                 "pxor %%mm3, %%mm1              \n\t"
                 "psubw %%mm2, %%mm0             \n\t"
@@ -420,7 +427,7 @@ __asm__ volatile(
                 "pandn %%mm1, %%mm5             \n\t"
                 "pxor %%mm4, %%mm7              \n\t"
                 "pxor %%mm5, %%mm7              \n\t"
-                "movq %%mm4,  (%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
                 "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
 
                 "add $16, %%"FF_REG_a"          \n\t"
@@ -442,11 +449,11 @@ __asm__ volatile(
         );
 }
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 
 av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
@@ -458,5 +465,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 }
diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c
index b701ef8..6009b64 100644
--- a/libavcodec/x86/mpegvideodsp.c
+++ b/libavcodec/x86/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegvideodsp.h"
+#include "libavcodec/videodsp.h"
 
 #if HAVE_INLINE_ASM
 
@@ -42,20 +43,25 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
     const uint64_t shift2  = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
     int x, y;
 
     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
     const int dxh = dxy * (h - 1);
     const int dyw = dyx * (w - 1);
+    int need_emu  =  (unsigned) ix >= width  - w || width < w ||
+                     (unsigned) iy >= height - h || height< h
+                     ;
 
     if ( // non-constant fullpel offset (3% of blocks)
         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
         // uses more than 16 bits of subpel mv (only at huge resolution)
         (dxx | dxy | dyx | dyy) & 15 ||
-        (unsigned) ix >= width  - w ||
-        (unsigned) iy >= height - h) {
+        (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
         // FIXME could still use mmx for some of the rows
         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                  shift, r, width, height);
@@ -63,6 +69,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     }
 
     src += ix + iy * stride;
+    if (need_emu) {
+        ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+        src = edge_buf;
+    }
 
     __asm__ volatile (
         "movd         %0, %%mm6         \n\t"
@@ -149,4 +159,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
         c->gmc = gmc_mmx;
 #endif /* HAVE_INLINE_ASM */
 }
-
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index ead2ed1..c884cf1 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
  * The simplest mpeg encoder (well, it was the simplest!)
  * Copyright (c) 2000,2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,12 +29,18 @@
 
 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
 DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
-    1,  2,  6,  7,  15, 16, 28, 29, 3,  5,  8,  14, 17, 27, 30, 43, 4,  9,  13,
-    18, 26, 31, 42, 44, 10, 12, 19, 25, 32, 41, 45, 54, 11, 20, 24, 33, 40, 46,
-    53, 55, 21, 23, 34, 39, 47, 52, 56, 61, 22, 35, 38, 48, 51, 57, 60, 62, 36,
-    37, 49, 50, 58, 59, 63, 64,
+    1,  2,  6,  7,  15, 16, 28, 29,
+    3,  5,  8,  14, 17, 27, 30, 43,
+    4,  9,  13, 18, 26, 31, 42, 44,
+    10, 12, 19, 25, 32, 41, 45, 54,
+    11, 20, 24, 33, 40, 46, 53, 55,
+    21, 23, 34, 39, 47, 52, 56, 61,
+    22, 35, 38, 48, 51, 57, 60, 62,
+    36, 37, 49, 50, 58, 59, 63, 64,
 };
 
+#if HAVE_6REGS
+
 #if HAVE_MMX_INLINE
 #define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2   0
@@ -86,7 +92,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_SSSE3_INLINE */
 
+#endif /* HAVE_6REGS */
+
 #if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -140,7 +149,9 @@ static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
         : "r"(block+64)
     );
 }
+#endif /* HAVE_MMX_INLINE */
 
+#if HAVE_SSE2_INLINE
 static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -196,9 +207,10 @@ static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
                             "%xmm4", "%xmm5", "%xmm6", "%xmm7")
     );
 }
+#endif /* HAVE_SSE2_INLINE */
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
 {
     const int dct_algo = s->avctx->dct_algo;
 
@@ -206,21 +218,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
 #if HAVE_MMX_INLINE
         int cpu_flags = av_get_cpu_flags();
         if (INLINE_MMX(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_mmx;
+#endif
             s->denoise_dct  = denoise_dct_mmx;
         }
 #endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_6REGS && HAVE_MMXEXT_INLINE
         if (INLINE_MMXEXT(cpu_flags))
             s->dct_quantize = dct_quantize_mmxext;
 #endif
 #if HAVE_SSE2_INLINE
         if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
+#endif
             s->denoise_dct  = denoise_dct_sse2;
         }
 #endif
-#if HAVE_SSSE3_INLINE
+#if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
 #endif
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
index 8d8d687..882d486 100644
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ b/libavcodec/x86/mpegvideoenc_qns_template.c
@@ -5,26 +5,26 @@
  * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
  * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
 #include <stdint.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/x86/asm.h"
 
@@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
 {
     x86_reg i=0;
 
-    assert(FFABS(scale) < MAX_ABS);
+    av_assert2(FFABS(scale) < MAX_ABS);
     scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     SET_RND(mm6);
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 72df76b..1201be5 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -108,7 +108,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     const uint16_t *qmat, *bias;
     LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
-    assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+    av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
 
     //s->fdct (block);
     RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
@@ -118,10 +118,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 
     if (s->mb_intra) {
         int dummy;
-        if (n < 4)
+        if (n < 4){
             q = s->y_dc_scale;
-        else
+            bias = s->q_intra_matrix16[qscale][1];
+            qmat = s->q_intra_matrix16[qscale][0];
+        }else{
             q = s->c_dc_scale;
+            bias = s->q_chroma_intra_matrix16[qscale][1];
+            qmat = s->q_chroma_intra_matrix16[qscale][0];
+        }
         /* note: block[0] is assumed to be positive */
         if (!s->h263_aic) {
         __asm__ volatile (
@@ -136,8 +141,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
         last_non_zero_p1 = 1;
-        bias = s->q_intra_matrix16[qscale][1];
-        qmat = s->q_intra_matrix16[qscale][0];
     } else {
         last_non_zero_p1 = 0;
         bias = s->q_inter_matrix16[qscale][1];
@@ -173,7 +176,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"FF_REG_a"           \n\t"
-            "movzb %%al, %%"FF_REG_a"           \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -207,7 +210,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"FF_REG_a"           \n\t"
-            "movzb %%al, %%"FF_REG_a"           \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -221,7 +224,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         "psubusw "MM"1, "MM"4               \n\t"
         "packuswb "MM"4, "MM"4              \n\t"
 #if COMPILE_TEMPLATE_SSE2
-        "packuswb "MM"4, "MM"4              \n\t"
+        "packsswb "MM"4, "MM"4              \n\t"
 #endif
         "movd "MM"4, %0                     \n\t" // *overflow
         : "=g" (*overflow)
@@ -275,7 +278,51 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
         block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
         block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
-    }else{
+    }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x04] = temp_block[0x01];
+        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+        block[0x05] = temp_block[0x03];
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x1C] = temp_block[0x19];
+        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+            block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    } else if (s->idsp.perm_type == FF_IDCT_PERM_NONE) {
         if(last_non_zero_p1 <= 1) goto end;
         block[0x01] = temp_block[0x01];
         block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
@@ -319,6 +366,57 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
         block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
         block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    } else if (s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE) {
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x08] = temp_block[0x01];
+        block[0x01] = temp_block[0x08]; block[0x02] = temp_block[0x10];
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x09] = temp_block[0x09]; block[0x10] = temp_block[0x02];
+        block[0x18] = temp_block[0x03];
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x11] = temp_block[0x0A]; block[0x0A] = temp_block[0x11];
+        block[0x03] = temp_block[0x18]; block[0x04] = temp_block[0x20];
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x0B] = temp_block[0x19];
+        block[0x12] = temp_block[0x12]; block[0x19] = temp_block[0x0B];
+        block[0x20] = temp_block[0x04]; block[0x28] = temp_block[0x05];
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x21] = temp_block[0x0C]; block[0x1A] = temp_block[0x13];
+        block[0x13] = temp_block[0x1A]; block[0x0C] = temp_block[0x21];
+        block[0x05] = temp_block[0x28]; block[0x06] = temp_block[0x30];
+        block[0x0D] = temp_block[0x29]; block[0x14] = temp_block[0x22];
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1B] = temp_block[0x1B]; block[0x22] = temp_block[0x14];
+        block[0x29] = temp_block[0x0D]; block[0x30] = temp_block[0x06];
+        block[0x38] = temp_block[0x07]; block[0x31] = temp_block[0x0E];
+        block[0x2A] = temp_block[0x15]; block[0x23] = temp_block[0x1C];
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x1C] = temp_block[0x23]; block[0x15] = temp_block[0x2A];
+        block[0x0E] = temp_block[0x31]; block[0x07] = temp_block[0x38];
+        block[0x0F] = temp_block[0x39]; block[0x16] = temp_block[0x32];
+        block[0x1D] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x2B] = temp_block[0x1D]; block[0x32] = temp_block[0x16];
+        block[0x39] = temp_block[0x0F]; block[0x3A] = temp_block[0x17];
+        block[0x33] = temp_block[0x1E]; block[0x2C] = temp_block[0x25];
+        block[0x25] = temp_block[0x2C]; block[0x1E] = temp_block[0x33];
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x17] = temp_block[0x3A]; block[0x1F] = temp_block[0x3B];
+        block[0x26] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
+        block[0x34] = temp_block[0x26]; block[0x3B] = temp_block[0x1F];
+        block[0x3C] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x2E] = temp_block[0x35]; block[0x27] = temp_block[0x3C];
+        block[0x2F] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
+        block[0x3D] = temp_block[0x2F]; block[0x3E] = temp_block[0x37];
+        block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    } else {
+        av_log(s, AV_LOG_DEBUG, "s->idsp.perm_type: %d\n",
+                (int)s->idsp.perm_type);
+        av_assert0(s->idsp.perm_type == FF_IDCT_PERM_NONE ||
+                s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2 ||
+                s->idsp.perm_type == FF_IDCT_PERM_SIMPLE ||
+                s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE);
     }
     end:
     return last_non_zero_p1 - 1;
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee7..aec73f8 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION .text
+SECTION_RODATA
 
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
+    mov          r2, %1
+%if mmsize == 16
+    lea          r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+    pxor         m5, m5
+%endif
+    pxor         m4, m4
 .loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+%if cpuflag(xop)
+    vphaddubq    m0, [r0]
+    vphaddubq    m1, [r0+r1]
+    vphaddubq    m2, [r0+r1*2]
+    vphaddubq    m3, [r0+r3]
+%else
+    mova         m0, [r0]
+%if mmsize == 8
+    mova         m1, [r0+8]
+%if cpuflag(mmxext)
+    mova         m2, [r0+r1]
+    mova         m3, [r0+r1+8]
+%endif
+%else ; sse2
+    mova         m1, [r0+r1]
+    mova         m2, [r0+r1*2]
+    mova         m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+    psadbw       m0, m5
+    psadbw       m1, m5
+    psadbw       m2, m5
+    psadbw       m3, m5
+%else ; mmx
+    punpckhbw    m2, m0, m5
+    punpcklbw    m0, m5
+    punpckhbw    m3, m1, m5
+    punpcklbw    m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd        eax, m6
-    and         eax, 0xffff
+    paddw        m4, m3
+%if cpuflag(mmxext)
+    lea          r0, [r0+r1*%3]
+%else
+    add          r0, r1
+%endif
+    dec r2
+    jne .loop
+%if mmsize == 16
+    pshufd       m0, m4, q0032
+    paddd        m4, m0
+%elif notcpuflag(mmxext)
+    HADDW        m4, m5
+%endif
+    movd        eax, m4
     RET
+%endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16  8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16  4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16  4, 4, 4
+%endif
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, 16
+    mov          r2, %2
     pxor         m0, m0
-    pxor         m7, m7
+    pxor         m5, m5
 .loop:
     mova         m2, [r0+0]
+%if mmsize == 8
     mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
+%else
+    mova         m3, [r0+r1]
+%endif
+    punpckhbw    m1, m2, m0
     punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
+    punpckhbw    m4, m3, m0
+    punpcklbw    m3, m0
     pmaddwd      m1, m1
     pmaddwd      m2, m2
     pmaddwd      m3, m3
     pmaddwd      m4, m4
     paddd        m2, m1
     paddd        m4, m3
-    paddd        m7, m2
+    paddd        m5, m2
+    paddd        m5, m4
+%if mmsize == 8
     add          r0, r1
-    paddd        m7, m4
+%else
+    lea          r0, [r0+r1*2]
+%endif
     dec r2
     jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd        eax, m1
+    HADDD        m5, m1
+    movd        eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
 
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 71fbf28..532836c 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -1,29 +1,34 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 
@@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
-    } else {
+    } else if (w == 16) {
         __asm__ volatile (
             "1:                                 \n\t"
             "movd            (%0), %%mm0        \n\t"
@@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             "add               %1, %0           \n\t"
             "cmp               %3, %0           \n\t"
             "jb                1b               \n\t"
+            : "+r"(ptr)
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+            );
+    } else {
+        av_assert1(w == 4);
+        __asm__ volatile (
+            "1:                             \n\t"
+            "movd            (%0), %%mm0    \n\t"
+            "punpcklbw      %%mm0, %%mm0    \n\t"
+            "punpcklwd      %%mm0, %%mm0    \n\t"
+            "movd           %%mm0, -4(%0)   \n\t"
+            "movd      -4(%0, %2), %%mm1    \n\t"
+            "punpcklbw      %%mm1, %%mm1    \n\t"
+            "punpckhwd      %%mm1, %%mm1    \n\t"
+            "punpckhdq      %%mm1, %%mm1    \n\t"
+            "movd           %%mm1, (%0, %2) \n\t"
+            "add               %1, %0       \n\t"
+            "cmp               %3, %0       \n\t"
+            "jb                1b           \n\t"
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
@@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->pix_sum   = ff_pix_sum16_mmx;
         c->pix_norm1 = ff_pix_norm1_mmx;
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_mmxext;
+    }
+#endif
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_sse2;
+        c->pix_norm1   = ff_pix_norm1_sse2;
+    }
+
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_xop;
+    }
+
 #if HAVE_INLINE_ASM
 
     if (INLINE_MMX(cpu_flags)) {
diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c
new file mode 100644
index 0000000..a9f8a96
--- /dev/null
+++ b/libavcodec/x86/opus_dsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * Opus encoder assembly optimizations
+ * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/opus_pvq.h"
+
+extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
+extern float ff_pvq_search_exact_avx  (float *X, int *y, int K, int N);
+
+av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_OPUS_ENCODER
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->pvq_search = ff_pvq_search_approx_sse2;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        s->pvq_search = ff_pvq_search_approx_sse4;
+
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->pvq_search = ff_pvq_search_exact_avx;
+#endif
+}
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm
new file mode 100644
index 0000000..5c1e6d6
--- /dev/null
+++ b/libavcodec/x86/opus_pvq_search.asm
@@ -0,0 +1,385 @@
+;******************************************************************************
+;* SIMD optimized Opus encoder DSP function
+;*
+;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "config.asm"
+%include "libavutil/x86/x86util.asm"
+
+%ifdef __NASM_VER__
+%use "smartalign"
+ALIGNMODE p6
+%endif
+
+SECTION_RODATA 64
+
+const_float_abs_mask:   times 8 dd 0x7fffffff
+const_align_abs_edge:   times 8 dd 0
+
+const_float_0_5:        times 8 dd 0.5
+const_float_1:          times 8 dd 1.0
+const_float_sign_mask:  times 8 dd 0x80000000
+
+const_int32_offsets:
+                        %rep 8
+                                dd $-const_int32_offsets
+                        %endrep
+SECTION .text
+
+;
+;   Setup High Register to be used
+;   for holding memory constants
+;
+; %1 - the register to be used, assmues it is >= mm8
+; %2 - name of the constant.
+;
+; Subsequent opcodes are going to use the constant in the form
+; "addps m0, mm_const_name" and it would be turned into:
+; "addps m0, [const_name]" on 32 bit arch or
+; "addps m0, m8" on 64 bit arch
+%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
+%if num_mmregs > 8
+    %define  mm_%3   %2
+    %{1}        %2, [%3]    ; movaps m8, [const_name]
+%else
+    %define  mm_%3  [%3]
+%endif
+%endmacro
+
+;
+;   Set Position Independent Code
+;       Base address of a constant
+; %1 - the register to be used, if PIC is set
+; %2 - name of the constant.
+;
+; Subsequent opcode are going to use the base address in the form
+; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
+; "movaps m0, [r5 + r4]" if PIC is enabled
+; "movaps m0, [constant_name + r4]" if texrel are used
+%macro SET_PIC_BASE 3; reg, const_label
+%ifdef PIC
+    %{1}     %2, [%3]      ; lea r5, [rip+const]
+    %define  pic_base_%3 %2
+%else
+    %define  pic_base_%3 %3
+%endif
+%endmacro
+
+%macro PULSES_SEARCH 1
+; m6 Syy_norm
+; m7 Sxy_norm
+    addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
+    pxor           m1, m1                   ; max_idx
+    xorps          m3, m3                   ; p_max
+    xor           r4d, r4d
+align 16
+%%distortion_search:
+    movd          xm2, dword r4d    ; movd zero extends
+%ifidn %1,add
+    movaps         m4, [tmpY + r4]  ; y[i]
+    movaps         m5, [tmpX + r4]  ; X[i]
+
+  %if USE_APPROXIMATION == 1
+    xorps          m0, m0
+    cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
+  %endif
+
+    addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
+    addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm
+
+  %if USE_APPROXIMATION == 1
+    andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
+  %endif
+
+%else
+    movaps         m5, [tmpY + r4]      ; m5 = y[i]
+
+    xorps          m0, m0               ; m0 = 0;
+    cmpps          m0, m0, m5, 1        ; m0 = (0<y)
+
+    subps          m4, m6, m5           ; m4 = Syy_new = Syy_norm - y[i]
+    subps          m5, m7, [tmpX + r4]  ; m5 = Sxy_new = Sxy_norm - X[i]
+    andps          m5, m0               ; (0<y)?m5:0
+%endif
+
+%if USE_APPROXIMATION == 1
+    rsqrtps        m4, m4
+    mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
+%else
+    mulps          m5, m5
+    divps          m5, m4           ; m5 = p = Sxy_new*Sxy_new/Syy
+%endif
+    VPBROADCASTD   m2, xm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)
+
+    cmpps          m0, m3, m5, 1    ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
+    maxps          m3, m5           ; m3=max(p_max,p)
+                                    ; maxps here is faster than blendvps, despite blend having lower latency.
+
+    pand           m2, m0           ; This version seems faster than sse41 pblendvb
+    pmaxsw         m1, m2           ; SSE2 signed word, so it would work for N < 32768/4
+
+    add           r4d, mmsize
+    cmp           r4d, Nd
+    jb   %%distortion_search
+
+    por            m1, mm_const_int32_offsets  ; max_idx offsets per individual lane (skipped in the inner loop)
+    movdqa         m4, m1                      ; needed for the aligned y[max_idx]+=1; processing
+
+%if mmsize >= 32
+; Merge parallel maximums round 8 (4 vs 4)
+
+    vextractf128  xm5, ym3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
+    cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
+
+    vextracti128  xm2, ym1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
+    BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
+    PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[1x128]       : p[0x128]
+%endif
+
+; Merge parallel maximums round 4 (2 vs 2)
+                                    ; m3=p[3210]
+    movhlps       xm5, xm3          ; m5=p[xx32]
+    cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
+
+    pshufd        xm2, xm1, q3232
+    BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
+    PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[3,2]       : p[1,0]
+
+; Merge parallel maximums final round (1 vs 1)
+    shufps        xm0, xm3, xm3, q1111  ; m0 = m3[1] = p[1]
+    cmpss         xm0, xm3, 5           ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
+
+    pshufd        xm2, xm1, q1111
+    PBLENDVB      xm1, xm2, xm0
+
+    movd    dword r4d, xm1          ; zero extends to the rest of r4q
+
+    VBROADCASTSS   m3, [tmpX + r4]
+    %{1}ps         m7, m3           ; Sxy += X[max_idx]
+
+    VBROADCASTSS   m5, [tmpY + r4]
+    %{1}ps         m6, m5           ; Syy += Y[max_idx]
+
+    ; We have to update a single element in Y[i]
+    ; However writing 4 bytes and then doing 16 byte load in the inner loop
+    ; could cause a stall due to breaking write forwarding.
+    VPBROADCASTD   m1, xm1
+    pcmpeqd        m1, m1, m4           ; exactly 1 element matches max_idx and this finds it
+
+    and           r4d, ~(mmsize-1)      ; align address down, so the value pointed by max_idx is inside a mmsize load
+    movaps         m5, [tmpY + r4]      ; m5 = Y[y3...ym...y0]
+    andps          m1, mm_const_float_1 ; m1 =  [ 0...1.0...0]
+    %{1}ps         m5, m1               ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
+    movaps [tmpY + r4], m5              ; Y[max_idx] +-= 1.0;
+%endmacro
+
+;
+; We need one more register for
+; PIC relative addressing. Use this
+; to count it in cglobal
+;
+%ifdef PIC
+  %define num_pic_regs 1
+%else
+  %define num_pic_regs 0
+%endif
+
+;
+; Pyramid Vector Quantization Search implementation
+;
+; float * inX   - Unaligned (SIMD) access, it will be overread,
+;                 but extra data is masked away.
+; int32 * outY  - Should be aligned and padded buffer.
+;                 It is used as temp buffer.
+; uint32 K      - Number of pulses to have after quantizations.
+; uint32 N      - Number of vector elements. Must be 0 < N < 256
+;
+%macro PVQ_FAST_SEARCH 1
+cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%define tmpX rsp
+%define tmpY outYq
+
+    movaps     m0, [const_float_abs_mask]
+    shl        Nd, 2    ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
+    mov       r4d, Nd
+
+    neg       r4d
+    and       r4d, mmsize-1
+
+    SET_PIC_BASE lea, r5, const_align_abs_edge  ; rip+const
+    movups     m2, [pic_base_const_align_abs_edge + r4 - mmsize]
+
+    add        Nd, r4d              ; N = align(N, mmsize)
+
+    lea       r4d, [Nd - mmsize]    ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
+    movups     m1, [inXq + r4]
+    andps      m1, m2
+    movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] )
+
+align 16
+%%loop_abs_sum:
+    sub       r4d, mmsize
+    jc   %%end_loop_abs_sum
+
+    movups     m2, [inXq + r4]
+    andps      m2, m0
+
+    movaps  [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
+    addps      m1, m2       ; Sx += abs(X[i])
+    jmp  %%loop_abs_sum
+
+align 16
+%%end_loop_abs_sum:
+
+    HSUMPS     m1, m2       ; m1  = Sx
+
+    xorps      m0, m0
+    comiss    xm0, xm1      ;
+    jz   %%zero_input       ; if (Sx==0) goto zero_input
+
+    cvtsi2ss  xm0, dword Kd ; m0 = K
+%if USE_APPROXIMATION == 1
+    rcpss     xm1, xm1      ; m1 = approx(1/Sx)
+    mulss     xm0, xm1      ; m0 = K*(1/Sx)
+%else
+    divss     xm0, xm1      ; b = K/Sx
+                            ; b = K/max_x
+%endif
+
+    VBROADCASTSS  m0, xm0
+
+    lea       r4d, [Nd - mmsize]
+    pxor       m5, m5             ; Sy    ( Sum of abs( y[i]) )
+    xorps      m6, m6             ; Syy   ( Sum of y[i]*y[i]  )
+    xorps      m7, m7             ; Sxy   ( Sum of X[i]*y[i]  )
+align 16
+%%loop_guess:
+    movaps     m1, [tmpX + r4]    ; m1   = X[i]
+    mulps      m2, m0, m1         ; m2   = res*X[i]
+    cvtps2dq   m2, m2             ; yt   = (int)lrintf( res*X[i] )
+    paddd      m5, m2             ; Sy  += yt
+    cvtdq2ps   m2, m2             ; yt   = (float)yt
+    mulps      m1, m2             ; m1   = X[i]*yt
+    movaps  [tmpY + r4], m2       ; y[i] = m2
+    addps      m7, m1             ; Sxy += m1;
+    mulps      m2, m2             ; m2   = yt*yt
+    addps      m6, m2             ; Syy += m2
+
+    sub       r4d, mmsize
+    jnc  %%loop_guess
+
+    HSUMPS     m6, m1       ; Syy_norm
+    HADDD      m5, m4       ; pulses
+
+    movd  dword r4d, xm5    ; zero extends to the rest of r4q
+
+    sub        Kd, r4d      ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
+    jz   %%finish           ; K - pulses == 0
+
+    SET_HI_REG_MM_CONSTANT movaps,  m8, const_float_0_5
+    SET_HI_REG_MM_CONSTANT movaps,  m9, const_float_1
+    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
+    ; Use Syy/2 in distortion parameter calculations.
+    ; Saves pre and post-caclulation to correct Y[] values.
+    ; Same precision, since float mantisa is normalized.
+    ; The SQRT approximation does differ.
+    HSUMPS     m7, m0         ; Sxy_norm
+    mulps      m6, mm_const_float_0_5
+
+    jc   %%remove_pulses_loop   ; K - pulses < 0
+
+align 16                        ; K - pulses > 0
+%%add_pulses_loop:
+
+    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm
+
+    sub        Kd, 1
+    jnz  %%add_pulses_loop
+
+    addps      m6, m6 ; Syy*=2
+
+    jmp  %%finish
+
+align 16
+%%remove_pulses_loop:
+
+    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm
+
+    add        Kd, 1
+    jnz  %%remove_pulses_loop
+
+    addps      m6, m6 ; Syy*=2
+
+align 16
+%%finish:
+    lea       r4d, [Nd - mmsize]
+    movaps     m2, [const_float_sign_mask]
+
+align 16
+%%restore_sign_loop:
+    movaps     m0, [tmpY + r4]    ; m0 = Y[i]
+    movups     m1, [inXq + r4]    ; m1 = X[i]
+    andps      m1, m2             ; m1 = sign(X[i])
+    orps       m0, m1             ; m0 = Y[i]*sign
+    cvtps2dq   m3, m0             ; m3 = (int)m0
+    movaps  [outYq + r4], m3
+
+    sub       r4d, mmsize
+    jnc  %%restore_sign_loop
+%%return:
+
+%if ARCH_X86_64 == 0    ; sbrdsp
+    movss     r0m, xm6  ; return (float)Syy_norm
+    fld dword r0m
+%else
+    movaps     m0, m6   ; return (float)Syy_norm
+%endif
+
+    RET
+
+align 16
+%%zero_input:
+    lea       r4d, [Nd - mmsize]
+    xorps      m0, m0
+%%zero_loop:
+    movaps  [outYq + r4], m0
+    sub       r4d, mmsize
+    jnc  %%zero_loop
+
+    movaps     m6, [const_float_1]
+    jmp  %%return
+%endmacro
+
+; if 1, use a float op that give half precision but execute for around 3 cycles.
+; On Skylake & Ryzen the division is much faster (around 11c/3),
+; that makes the full precision code about 2% slower.
+; Opus also does use rsqrt approximation in their intrinsics code.
+%define USE_APPROXIMATION   1
+
+INIT_XMM sse2
+PVQ_FAST_SEARCH _approx
+
+INIT_XMM sse4
+PVQ_FAST_SEARCH _approx
+
+%define USE_APPROXIMATION   0
+
+INIT_XMM avx
+PVQ_FAST_SEARCH _exact
diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm
index 8712442..440fe29 100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
@@ -50,7 +50,7 @@ cglobal get_pixels, 3,4
     REP_RET
 
 INIT_XMM sse2
-cglobal get_pixels, 3, 4
+cglobal get_pixels, 3, 4, 5
     lea          r3, [r2*3]
     pxor         m4, m4
     movh         m0, [r1]
@@ -80,28 +80,49 @@ cglobal get_pixels, 3, 4
     mova  [r0+0x70], m3
     RET
 
-INIT_MMX mmx
 ; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 ;                         ptrdiff_t stride);
-cglobal diff_pixels, 4,5
-    pxor         m7, m7
+%macro DIFF_PIXELS 0
+cglobal diff_pixels, 4,5,5
+    pxor         m4, m4
     add          r0,  128
     mov          r4, -128
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r2]
-    mova         m1, m0
-    mova         m3, m2
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+    movq         m0, [r1]
+    movq         m2, [r2]
+%if mmsize == 8
+    movq         m1, m0
+    movq         m3, m2
+    punpcklbw    m0, m4
+    punpckhbw    m1, m4
+    punpcklbw    m2, m4
+    punpckhbw    m3, m4
+%else
+    movq         m1, [r1+r3]
+    movq         m3, [r2+r3]
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+%endif
     psubw        m0, m2
     psubw        m1, m3
     mova  [r0+r4+0], m0
-    mova  [r0+r4+8], m1
+    mova  [r0+r4+mmsize], m1
+%if mmsize == 8
     add          r1, r3
     add          r2, r3
-    add          r4, 16
+%else
+    lea          r1, [r1+r3*2]
+    lea          r2, [r2+r3*2]
+%endif
+    add          r4, 2 * mmsize
     jne .loop
-    REP_RET
+    RET
+%endmacro
+
+INIT_MMX mmx
+DIFF_PIXELS
+
+INIT_XMM sse2
+DIFF_PIXELS
diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c
index faa5141..ade55e0 100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * SIMD-optimized pixel operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         ptrdiff_t stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                         ptrdiff_t stride);
 
 av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
@@ -37,11 +39,14 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
     if (EXTERNAL_MMX(cpu_flags)) {
         if (!high_bit_depth)
             c->get_pixels = ff_get_pixels_mmx;
+        c->diff_pixels_unaligned =
         c->diff_pixels = ff_diff_pixels_mmx;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         if (!high_bit_depth)
             c->get_pixels = ff_get_pixels_sse2;
+        c->diff_pixels_unaligned =
+        c->diff_pixels = ff_diff_pixels_sse2;
     }
 }
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 722caf0..50e4255 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
     and                waq, ~(mmsize*2-1)
     jmp .end_v
 .loop_v:
-    mova                m0, [src1q+iq]
-    mova                m1, [src1q+iq+mmsize]
-    paddb               m0, [src2q+iq]
-    paddb               m1, [src2q+iq+mmsize]
-    mova  [dstq+iq       ], m0
-    mova  [dstq+iq+mmsize], m1
+    movu                m0, [src2q+iq]
+    movu                m1, [src2q+iq+mmsize]
+    paddb               m0, [src1q+iq]
+    paddb               m1, [src1q+iq+mmsize]
+    movu  [dstq+iq       ], m0
+    movu  [dstq+iq+mmsize], m1
     add                 iq, mmsize*2
 .end_v:
     cmp                 iq, waq
@@ -157,7 +157,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
     movh            [dstq], m3
     add               dstq, bppq
     cmp               dstq, endq
-    jle .loop
+    jl .loop
 
     mov               dstq, [rsp]
     dec              cntrq
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da3..7dca62c 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
  * x86 PNG optimizations.
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 9613fa1..65c9fad 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,427 +1,68 @@
 ;******************************************************************************
 ;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
 ;*
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
-%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
-%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
-%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
-%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
-%define W6sh2  8867 ; W6 = 35468 =  8867<<2
-%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
-
 %if ARCH_X86_64
 
 SECTION_RODATA
 
-w4_plus_w2: times 4 dw W4sh2, +W2sh2
-w4_min_w2:  times 4 dw W4sh2, -W2sh2
-w4_plus_w6: times 4 dw W4sh2, +W6sh2
-w4_min_w6:  times 4 dw W4sh2, -W6sh2
-w1_plus_w3: times 4 dw W1sh2, +W3sh2
-w3_min_w1:  times 4 dw W3sh2, -W1sh2
-w7_plus_w3: times 4 dw W7sh2, +W3sh2
-w3_min_w7:  times 4 dw W3sh2, -W7sh2
-w1_plus_w5: times 4 dw W1sh2, +W5sh2
-w5_min_w1:  times 4 dw W5sh2, -W1sh2
-w5_plus_w7: times 4 dw W5sh2, +W7sh2
-w7_min_w5:  times 4 dw W7sh2, -W5sh2
-row_round:  times 8 dw (1<<14)
-
+pw_88:      times 8 dw 0x2008
+cextern pw_1
 cextern pw_4
-cextern pw_8
-cextern pw_512
 cextern pw_1019
+; Below are defined in simple_idct10.asm built from selecting idctdsp
+cextern w4_plus_w2_hi
+cextern w4_min_w2_hi
+cextern w4_plus_w6_hi
+cextern w4_min_w6_hi
+cextern w1_plus_w3_hi
+cextern w3_min_w1_hi
+cextern w7_plus_w3_hi
+cextern w3_min_w7_hi
+cextern w1_plus_w5
+cextern w5_min_w1
+cextern w5_plus_w7
+cextern w7_min_w5
+
+%include "libavcodec/x86/simple_idct10_template.asm"
 
 SECTION .text
 
-; interleave data while maintaining source
-; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
-%macro SBUTTERFLY3 5
-    punpckl%1   m%2, m%4, m%5
-    punpckh%1   m%3, m%4, m%5
-%endmacro
-
-; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
-; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
-;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
-%macro SUMSUB_SHPK 7
-    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
-    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
-    paddd       %1,  %5            ; { a0 + b0 }[0-3]
-    paddd       %2,  %6            ; { a0 + b0 }[4-7]
-    psrad       %1,  %7
-    psrad       %2,  %7
-    psrad       %3,  %7
-    psrad       %4,  %7
-    packssdw    %1,  %2            ; row[0]
-    packssdw    %3,  %4            ; row[7]
-%endmacro
-
-; %1 = row or col (for rounding variable)
-; %2 = number of bits to shift at the end
-%macro IDCT_1D 2
-    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
-    ; a1 = a0;
-    ; a2 = a0;
-    ; a3 = a0;
-    ; a0 += W2 * row[2];
-    ; a1 += W6 * row[2];
-    ; a2 -= W6 * row[2];
-    ; a3 -= W2 * row[2];
-%ifidn %1, col
-    paddw       m10,[pw_8]
-%endif
-    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
-%ifidn %1, row
-    psubw       m10,[row_round]
-%endif
-    SIGNEXTEND  m8,  m9,  m14      ; { row[2] }[0-3] / [4-7]
-    SIGNEXTEND  m10, m11, m14      ; { row[0] }[0-3] / [4-7]
-    pmaddwd     m2,  m0, [w4_plus_w6]
-    pmaddwd     m3,  m1, [w4_plus_w6]
-    pmaddwd     m4,  m0, [w4_min_w6]
-    pmaddwd     m5,  m1, [w4_min_w6]
-    pmaddwd     m6,  m0, [w4_min_w2]
-    pmaddwd     m7,  m1, [w4_min_w2]
-    pmaddwd     m0, [w4_plus_w2]
-    pmaddwd     m1, [w4_plus_w2]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
-
-    ; a0: -1*row[0]-1*row[2]
-    ; a1: -1*row[0]
-    ; a2: -1*row[0]
-    ; a3: -1*row[0]+1*row[2]
-    psubd       m2,  m10           ; a1[0-3]
-    psubd       m3,  m11           ; a1[4-7]
-    psubd       m4,  m10           ; a2[0-3]
-    psubd       m5,  m11           ; a2[4-7]
-    psubd       m0,  m10
-    psubd       m1,  m11
-    psubd       m6,  m10
-    psubd       m7,  m11
-    psubd       m0,  m8            ; a0[0-3]
-    psubd       m1,  m9            ; a0[4-7]
-    paddd       m6,  m8            ; a3[0-3]
-    paddd       m7,  m9            ; a3[4-7]
-
-    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
-    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
-    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
-    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
-    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m14, m10      ; { row[4] }[0-3] / [4-7]
-    pmaddwd     m10, m8, [w4_plus_w6]
-    pmaddwd     m11, m9, [w4_plus_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10,  m13
-    psubd       m11,  m14
-    paddd       m0,  m10            ; a0[0-3]
-    paddd       m1,  m11            ; a0[4-7]
-    pmaddwd     m10, m8, [w4_min_w6]
-    pmaddwd     m11, m9, [w4_min_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10, m13
-    psubd       m11, m14
-    paddd       m6,  m10           ; a3[0-3]
-    paddd       m7,  m11           ; a3[4-7]
-    pmaddwd     m10, m8, [w4_min_w2]
-    pmaddwd     m11, m9, [w4_min_w2]
-    pmaddwd     m8, [w4_plus_w2]
-    pmaddwd     m9, [w4_plus_w2]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    psubd       m10, m13
-    psubd       m11, m14
-    psubd       m8,  m13
-    psubd       m9,  m14
-    psubd       m4,  m10           ; a2[0-3] intermediate
-    psubd       m5,  m11           ; a2[4-7] intermediate
-    psubd       m2,  m8            ; a1[0-3] intermediate
-    psubd       m3,  m9            ; a1[4-7] intermediate
-    SIGNEXTEND  m12, m13, m10      ; { row[6] }[0-3] / [4-7]
-    psubd       m4,  m12           ; a2[0-3]
-    psubd       m5,  m13           ; a2[4-7]
-    paddd       m2,  m12           ; a1[0-3]
-    paddd       m3,  m13           ; a1[4-7]
-
-    ; load/store
-    mova   [r2+  0], m0
-    mova   [r2+ 32], m2
-    mova   [r2+ 64], m4
-    mova   [r2+ 96], m6
-    mova        m10,[r2+ 16]       ; { row[1] }[0-7]
-    mova        m8, [r2+ 48]       ; { row[3] }[0-7]
-    mova        m13,[r2+ 80]       ; { row[5] }[0-7]
-    mova        m14,[r2+112]       ; { row[7] }[0-7]
-    mova   [r2+ 16], m1
-    mova   [r2+ 48], m3
-    mova   [r2+ 80], m5
-    mova   [r2+112], m7
-%ifidn %1, row
-    pmullw      m10,[r3+ 16]
-    pmullw      m8, [r3+ 48]
-    pmullw      m13,[r3+ 80]
-    pmullw      m14,[r3+112]
-%endif
-
-    ; b0 = MUL(W1, row[1]);
-    ; MAC(b0, W3, row[3]);
-    ; b1 = MUL(W3, row[1]);
-    ; MAC(b1, -W7, row[3]);
-    ; b2 = MUL(W5, row[1]);
-    ; MAC(b2, -W1, row[3]);
-    ; b3 = MUL(W7, row[1]);
-    ; MAC(b3, -W5, row[3]);
-    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
-    SIGNEXTEND  m10, m11, m12      ; { row[1] }[0-3] / [4-7]
-    SIGNEXTEND  m8,  m9,  m12      ; { row[3] }[0-3] / [4-7]
-    pmaddwd     m2,  m0, [w3_min_w7]
-    pmaddwd     m3,  m1, [w3_min_w7]
-    pmaddwd     m4,  m0, [w5_min_w1]
-    pmaddwd     m5,  m1, [w5_min_w1]
-    pmaddwd     m6,  m0, [w7_min_w5]
-    pmaddwd     m7,  m1, [w7_min_w5]
-    pmaddwd     m0, [w1_plus_w3]
-    pmaddwd     m1, [w1_plus_w3]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
-
-    ; b0: +1*row[1]+2*row[3]
-    ; b1: +2*row[1]-1*row[3]
-    ; b2: -1*row[1]-1*row[3]
-    ; b3: +1*row[1]+1*row[3]
-    psubd       m2,  m8
-    psubd       m3,  m9
-    paddd       m0,  m8
-    paddd       m1,  m9
-    paddd       m8,  m10           ; { row[1] + row[3] }[0-3]
-    paddd       m9,  m11           ; { row[1] + row[3] }[4-7]
-    paddd       m10, m10
-    paddd       m11, m11
-    paddd       m0,  m8            ; b0[0-3]
-    paddd       m1,  m9            ; b0[4-7]
-    paddd       m2,  m10           ; b1[0-3]
-    paddd       m3,  m11           ; b2[4-7]
-    psubd       m4,  m8            ; b2[0-3]
-    psubd       m5,  m9            ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
-
-    ; MAC(b0,  W5, row[5]);
-    ; MAC(b0,  W7, row[7]);
-    ; MAC(b1, -W1, row[5]);
-    ; MAC(b1, -W5, row[7]);
-    ; MAC(b2,  W7, row[5]);
-    ; MAC(b2,  W3, row[7]);
-    ; MAC(b3,  W3, row[5]);
-    ; MAC(b3, -W1, row[7]);
-    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m12, m11      ; { row[5] }[0-3] / [4-7]
-    SIGNEXTEND  m14, m11, m10      ; { row[7] }[0-3] / [4-7]
+define_constants _hi
 
-    ; b0: -1*row[5]+1*row[7]
-    ; b1: -1*row[5]+1*row[7]
-    ; b2: +1*row[5]+2*row[7]
-    ; b3: +2*row[5]-1*row[7]
-    paddd       m4,  m13
-    paddd       m5,  m12
-    paddd       m6,  m13
-    paddd       m7,  m12
-    psubd       m13, m14           ; { row[5] - row[7] }[0-3]
-    psubd       m12, m11           ; { row[5] - row[7] }[4-7]
-    paddd       m14, m14
-    paddd       m11, m11
-    psubd       m0,  m13
-    psubd       m1,  m12
-    psubd       m2,  m13
-    psubd       m3,  m12
-    paddd       m4,  m14
-    paddd       m5,  m11
-    paddd       m6,  m13
-    paddd       m7,  m12
-
-    pmaddwd     m10, m8, [w1_plus_w5]
-    pmaddwd     m11, m9, [w1_plus_w5]
-    pmaddwd     m12, m8, [w5_plus_w7]
-    pmaddwd     m13, m9, [w5_plus_w7]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m12,  2
-    pslld       m13,  2
-    psubd       m2,  m10           ; b1[0-3]
-    psubd       m3,  m11           ; b1[4-7]
-    paddd       m0,  m12            ; b0[0-3]
-    paddd       m1,  m13            ; b0[4-7]
-    pmaddwd     m12, m8, [w7_plus_w3]
-    pmaddwd     m13, m9, [w7_plus_w3]
-    pmaddwd     m8, [w3_min_w1]
-    pmaddwd     m9, [w3_min_w1]
-    pslld       m12, 2
-    pslld       m13, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    paddd       m4,  m12           ; b2[0-3]
-    paddd       m5,  m13           ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
-
-    ; row[0] = (a0 + b0) >> 15;
-    ; row[7] = (a0 - b0) >> 15;
-    ; row[1] = (a1 + b1) >> 15;
-    ; row[6] = (a1 - b1) >> 15;
-    ; row[2] = (a2 + b2) >> 15;
-    ; row[5] = (a2 - b2) >> 15;
-    ; row[3] = (a3 + b3) >> 15;
-    ; row[4] = (a3 - b3) >> 15;
-    mova        m8, [r2+ 0]        ; a0[0-3]
-    mova        m9, [r2+16]        ; a0[4-7]
-    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
-    mova        m0, [r2+32]        ; a1[0-3]
-    mova        m1, [r2+48]        ; a1[4-7]
-    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
-    mova        m1, [r2+64]        ; a2[0-3]
-    mova        m2, [r2+80]        ; a2[4-7]
-    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
-    mova        m2, [r2+96]        ; a3[0-3]
-    mova        m3, [r2+112]       ; a3[4-7]
-    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
-%endmacro
-
-; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t linesize,
-;                                  int16_t *block, const int16_t *qmat);
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
-    pxor        m15, m15           ; zero
-
-    ; for (i = 0; i < 8; i++)
-    ;     idctRowCondDC(block + i*8);
-    mova        m10,[r2+ 0]        ; { row[0] }[0-7]
-    mova        m8, [r2+32]        ; { row[2] }[0-7]
-    mova        m13,[r2+64]        ; { row[4] }[0-7]
-    mova        m12,[r2+96]        ; { row[6] }[0-7]
-
-    pmullw      m10,[r3+ 0]
-    pmullw      m8, [r3+32]
-    pmullw      m13,[r3+64]
-    pmullw      m12,[r3+96]
-
-    IDCT_1D     row, 17
-
-    ; transpose for second part of IDCT
-    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
-    mova   [r2+ 16], m0
-    mova   [r2+ 48], m2
-    mova   [r2+ 80], m11
-    mova   [r2+112], m10
-    SWAP         8,  10
-    SWAP         1,   8
-    SWAP         4,  13
-    SWAP         9,  12
-
-    ; for (i = 0; i < 8; i++)
-    ;     idctSparseColAdd(dest + i, line_size, block + i);
-    IDCT_1D     col, 20
-
-    ; clip/store
-    mova        m6, [pw_512]
-    mova        m3, [pw_4]
-    mova        m5, [pw_1019]
-    paddw       m8,  m6
-    paddw       m0,  m6
-    paddw       m1,  m6
-    paddw       m2,  m6
-    paddw       m4,  m6
-    paddw       m11, m6
-    paddw       m9,  m6
-    paddw       m10, m6
-    pmaxsw      m8,  m3
-    pmaxsw      m0,  m3
-    pmaxsw      m1,  m3
-    pmaxsw      m2,  m3
-    pmaxsw      m4,  m3
-    pmaxsw      m11, m3
-    pmaxsw      m9,  m3
-    pmaxsw      m10, m3
-    pminsw      m8,  m5
-    pminsw      m0,  m5
-    pminsw      m1,  m5
-    pminsw      m2,  m5
-    pminsw      m4,  m5
-    pminsw      m11, m5
-    pminsw      m9,  m5
-    pminsw      m10, m5
-
-    lea         r2, [r1*3]
-    mova  [r0     ], m8
-    mova  [r0+r1  ], m0
-    mova  [r0+r1*2], m1
-    mova  [r0+r2  ], m2
-    lea         r0, [r0+r1*4]
-    mova  [r0     ], m4
-    mova  [r0+r1  ], m11
-    mova  [r0+r1*2], m9
-    mova  [r0+r2  ], m10
+%macro idct_fn 0
+cglobal prores_idct_put_10, 4, 4, 15, pixels, lsize, block, qmat
+    IDCT_FN    pw_1, 15, pw_88, 18, "put", pw_4, pw_1019, r3
     RET
 %endmacro
 
-%macro SIGNEXTEND 2-3
-%if cpuflag(sse4) ; dstlow, dsthigh
-    movhlps     %2,  %1
-    pmovsxwd    %1,  %1
-    pmovsxwd    %2,  %2
-%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
-    pxor        %3,  %3
-    pcmpgtw     %3,  %1
-    mova        %2,  %1
-    punpcklwd   %1,  %3
-    punpckhwd   %2,  %3
-%endif
-%endmacro
-
 INIT_XMM sse2
-idct_put_fn 16
-INIT_XMM sse4
-idct_put_fn 16
+idct_fn
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
-idct_put_fn 16
+idct_fn
+%endif
 
 %endif
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index ff4d398..bde79ab 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,29 +27,24 @@
 
 void ff_prores_idct_put_10_sse2(uint16_t *dst, ptrdiff_t linesize,
                                 int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_sse4(uint16_t *dst, ptrdiff_t linesize,
-                                int16_t *block, const int16_t *qmat);
 void ff_prores_idct_put_10_avx (uint16_t *dst, ptrdiff_t linesize,
                                 int16_t *block, const int16_t *qmat);
 
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
-        dsp->idct_put = ff_prores_idct_put_10_sse2;
-    }
-
-    if (EXTERNAL_SSE4(cpu_flags)) {
-        dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
-        dsp->idct_put = ff_prores_idct_put_10_sse4;
-    }
+    if (avctx->bits_per_raw_sample == 10){
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
+            dsp->idct_put = ff_prores_idct_put_10_sse2;
+        }
 
-    if (EXTERNAL_AVX(cpu_flags)) {
-        dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
-        dsp->idct_put = ff_prores_idct_put_10_avx;
+        if (EXTERNAL_AVX(cpu_flags)) {
+            dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
+            dsp->idct_put = ff_prores_idct_put_10_avx;
+        }
     }
 #endif /* ARCH_X86_64 */
 }
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63..4e72d50 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index ef5f1d8..282faed 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -1,22 +1,23 @@
 ;******************************************************************************
-;* quarterpel DSP functions
-;*
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index e280c82..3b05e15 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
 
 #if HAVE_X86ASM
 
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx
 
 #define QPEL_OP(OPNAME, RND, MMX)                                       \
 static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst,                  \
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index 0c76d91..09946bd 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "inline_asm.h"
 
 // put_pixels
-STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
@@ -46,12 +46,12 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
         "punpckhbw %%mm7, %%mm5         \n\t"
         "paddusw %%mm0, %%mm4           \n\t"
         "paddusw %%mm1, %%mm5           \n\t"
-        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
         "add    %3, %1                  \n\t"
         ".p2align 3                     \n\t"
         "1:                             \n\t"
-        "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
-        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
+        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
         "movq   %%mm0, %%mm1            \n\t"
         "movq   %%mm2, %%mm3            \n\t"
         "punpcklbw %%mm7, %%mm0         \n\t"
@@ -67,11 +67,11 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
         "psrlw  $2, %%mm4               \n\t"
         "psrlw  $2, %%mm5               \n\t"
         "packuswb  %%mm5, %%mm4         \n\t"
-        "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
-        "add  %3, %%"FF_REG_a"          \n\t"
+        "movq   %%mm4, (%2, %%"FF_REG_a")  \n\t"
+        "add    %3, %%"FF_REG_a"           \n\t"
 
-        "movq  (%1, %%"FF_REG_a"), %%mm2\n\t" // 0 <-> 2   1 <-> 3
-        "movq 1(%1, %%"FF_REG_a"), %%mm4\n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
+        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
         "movq   %%mm2, %%mm3            \n\t"
         "movq   %%mm4, %%mm5            \n\t"
         "punpcklbw %%mm7, %%mm2         \n\t"
@@ -87,8 +87,8 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
         "psrlw  $2, %%mm0               \n\t"
         "psrlw  $2, %%mm1               \n\t"
         "packuswb  %%mm1, %%mm0         \n\t"
-        "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
-        "add  %3, %%"FF_REG_a"          \n\t"
+        "movq   %%mm0, (%2, %%"FF_REG_a")  \n\t"
+        "add    %3, %%"FF_REG_a"        \n\t"
 
         "subl   $2, %0                  \n\t"
         "jnz    1b                      \n\t"
@@ -99,7 +99,7 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
 
 // avg_pixels
 // this routine is 'slightly' suboptimal but mostly unused
-STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
@@ -115,12 +115,12 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
         "punpckhbw %%mm7, %%mm5         \n\t"
         "paddusw %%mm0, %%mm4           \n\t"
         "paddusw %%mm1, %%mm5           \n\t"
-        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
         "add    %3, %1                  \n\t"
         ".p2align 3                     \n\t"
         "1:                             \n\t"
-        "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
-        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
+        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
         "movq   %%mm0, %%mm1            \n\t"
         "movq   %%mm2, %%mm3            \n\t"
         "punpcklbw %%mm7, %%mm0         \n\t"
@@ -135,16 +135,16 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
         "paddusw %%mm1, %%mm5           \n\t"
         "psrlw  $2, %%mm4               \n\t"
         "psrlw  $2, %%mm5               \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
         "packuswb  %%mm5, %%mm4         \n\t"
                 "pcmpeqd %%mm2, %%mm2   \n\t"
                 "paddb %%mm2, %%mm2     \n\t"
                 PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
-        "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
-        "add %3, %%"FF_REG_a"           \n\t"
+                "movq   %%mm5, (%2, %%"FF_REG_a")  \n\t"
+        "add    %3, %%"FF_REG_a"        \n\t"
 
-        "movq  (%1, %%"FF_REG_a"), %%mm2\n\t" // 0 <-> 2   1 <-> 3
-        "movq 1(%1, %%"FF_REG_a"), %%mm4\n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
+        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
         "movq   %%mm2, %%mm3            \n\t"
         "movq   %%mm4, %%mm5            \n\t"
         "punpcklbw %%mm7, %%mm2         \n\t"
@@ -159,13 +159,13 @@ STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
         "paddusw %%mm5, %%mm1           \n\t"
         "psrlw  $2, %%mm0               \n\t"
         "psrlw  $2, %%mm1               \n\t"
-        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
         "packuswb  %%mm1, %%mm0         \n\t"
                 "pcmpeqd %%mm2, %%mm2   \n\t"
                 "paddb %%mm2, %%mm2     \n\t"
                 PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
-        "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
-        "add  %3, %%"FF_REG_a"          \n\t"
+                "movq   %%mm1, (%2, %%"FF_REG_a")  \n\t"
+        "add    %3, %%"FF_REG_a"           \n\t"
 
         "subl   $2, %0                  \n\t"
         "jnz    1b                      \n\t"
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b..692b4ac 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -64,6 +64,7 @@ rv34_idct dc
 rv34_idct dc_noround
 
 ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+%if ARCH_X86_32
 INIT_MMX mmx
 cglobal rv34_idct_dc_add, 3, 3
     ; calculate DC
@@ -97,6 +98,7 @@ cglobal rv34_idct_dc_add, 3, 3
     movh       [r2], m4
     movh       [r2+r1], m5
     RET
+%endif
 
 ; Load coeffs and perform row transform
 ; Output: coeffs in mm[0467], rounder in mm5
@@ -167,7 +169,7 @@ cglobal rv34_idct_add, 3,3,0, d, s, b
     ret
 
 ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
-INIT_XMM sse4
+%macro RV34_IDCT_DC_ADD 0
 cglobal rv34_idct_dc_add, 3, 3, 6
     ; load data
     IDCT_DC_ROUND r2
@@ -190,7 +192,22 @@ cglobal rv34_idct_dc_add, 3, 3, 6
     paddw      m4, m0
     packuswb   m2, m4
     movd      [r0], m2
+%if cpuflag(sse4)
     pextrd [r0+r1], m2, 1
     pextrd    [r2], m2, 2
     pextrd [r2+r1], m2, 3
+%else
+    psrldq     m2, 4
+    movd   [r0+r1], m2
+    psrldq     m2, 4
+    movd      [r2], m2
+    psrldq     m2, 4
+    movd   [r2+r1], m2
+%endif
     RET
+%endmacro
+
+INIT_XMM sse2
+RV34_IDCT_DC_ADD
+INIT_XMM sse4
+RV34_IDCT_DC_ADD
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 32d4c1a..7310122 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
  * RV30/40 MMX/SSE2 optimizations
  * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 void ff_rv34_idct_dc_mmxext(int16_t *block);
 void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
 void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
+void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc);
 void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
 void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
 
@@ -34,12 +35,14 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags))
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
         c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
         c->rv34_idct_add         = ff_rv34_idct_add_mmxext;
     }
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2;
     if (EXTERNAL_SSE4(cpu_flags))
         c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
 }
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 4949842..bcad1ae 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,26 +4,26 @@
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 16
+SECTION_RODATA
 
 pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024
 
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 5ba0aa5..b57a3fc 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions x86-optimised
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,13 @@
 #include "libavutil/x86/cpu.h"
 #include "hpeldsp.h"
 
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+                                               ptrdiff_t stride) \
+{ \
+    ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
 #if HAVE_X86ASM
 void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);
@@ -75,7 +82,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
 {                                                                       \
     int i;                                                              \
     if (PH && PV) {                                                     \
-        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
+        LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]);           \
         uint8_t *tmpptr = tmp + SIZE * 2;                               \
         src -= stride * 2;                                              \
                                                                         \
@@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT)
 /** @} */
 
 #define LOOPSIZE  8
-#define HCOFF(x)  (32 * (x - 1))
-#define VCOFF(x)  (32 * (x - 1))
+#define HCOFF(x)  (32 * ((x) - 1))
+#define VCOFF(x)  (32 * ((x) - 1))
 QPEL_MC_DECL(put_, _ssse3)
 QPEL_MC_DECL(avg_, _ssse3)
 
@@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  8
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 QPEL_MC_DECL(put_, _sse2)
 QPEL_MC_DECL(avg_, _sse2)
 
@@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  4
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 
 QPEL_MC_DECL(put_, _mmx)
 
@@ -186,34 +193,28 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
 QPEL_FUNCS_SET (OP, 3, 2, OPT)
 /** @} */
 
+DEFINE_FN(put, 8, ssse3)
+
+DEFINE_FN(put, 16, sse2)
+DEFINE_FN(put, 16, ssse3)
+
+DEFINE_FN(avg, 8, mmxext)
+DEFINE_FN(avg, 8, ssse3)
+
+DEFINE_FN(avg, 16, sse2)
+DEFINE_FN(avg, 16, ssse3)
 #endif /* HAVE_X86ASM */
 
 #if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
+DEFINE_FN(put, 8, mmx)
+DEFINE_FN(avg, 8, mmx)
+DEFINE_FN(put, 16, mmx)
+DEFINE_FN(avg, 16, mmx)
+#endif
 
 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 {
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_MMX_INLINE
     if (INLINE_MMX(cpu_flags)) {
@@ -240,6 +241,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_mmxext;
         c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
         c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
@@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_sse2;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_sse2;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
@@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
         QPEL_MC_SET(avg_, _sse2)
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
+        c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_ssse3;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
new file mode 100644
index 0000000..d68d3a9
--- /dev/null
+++ b/libavcodec/x86/sbcdsp.asm
@@ -0,0 +1,168 @@
+;******************************************************************************
+;* SIMD optimized SBC encoder DSP functions
+;*
+;* Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+;* Copyright (C) 2008-2010  Nokia Corporation
+;* Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+;* Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+;* Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
+
+SECTION .text
+
+%macro NIDN 3
+%ifnidn %2, %3
+    %1            %2, %3
+%endif
+%endmacro
+
+%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
+    NIDN movq,    %5, %3
+    NIDN movq,    %6, %4
+    pmaddwd       %5, [constsq+%9]
+    pmaddwd       %6, [constsq+%9+8]
+    NIDN paddd,   %1, %7
+    NIDN paddd,   %2, %8
+%endmacro
+
+%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
+    ANALYZE_MAC   %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
+%endmacro
+
+%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
+%ifidn %7, pack
+    psrad         %3, 16    ; SBC_PROTO_FIXED_SCALE
+    packssdw      %3, %3
+%endif
+    ANALYZE_MAC   %1, %2, %3, %3, %4, %5, %4, %5, %6
+%endmacro
+
+;*******************************************************************
+;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
+    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask], 0
+    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 16
+    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 32
+    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 48
+    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 64
+
+    ANALYZE_MAC_REG  m0, m2, m0, m0, m2, 80, pack
+    ANALYZE_MAC_REG  m0, m2, m1, m1, m3, 96, pack
+
+    movq          [outq  ], m0
+    movq          [outq+8], m2
+
+    RET
+
+
+;*******************************************************************
+;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
+    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask],  0
+    ANALYZE_MAC_IN   m2, m3, m2, m3, [scale_mask], [scale_mask], 16
+    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  32
+    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7,  48
+    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  64
+    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7,  80
+    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  96
+    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, 112
+    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5, 128
+    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, 144
+
+    ANALYZE_MAC_REG  m4, m5, m0, m4, m5, 160, pack
+    ANALYZE_MAC_REG  m4, m5, m1, m6, m7, 192, pack
+    ANALYZE_MAC_REG  m4, m5, m2, m6, m7, 224, pack
+    ANALYZE_MAC_REG  m4, m5, m3, m6, m7, 256, pack
+
+    movq          [outq  ], m4
+    movq          [outq+8], m5
+
+    ANALYZE_MAC_REG  m0, m5, m0, m0, m5, 176, no
+    ANALYZE_MAC_REG  m0, m5, m1, m1, m7, 208, no
+    ANALYZE_MAC_REG  m0, m5, m2, m2, m7, 240, no
+    ANALYZE_MAC_REG  m0, m5, m3, m3, m7, 272, no
+
+    movq          [outq+16], m0
+    movq          [outq+24], m5
+
+    RET
+
+
+;*******************************************************************
+;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
+;                              uint32_t scale_factor[2][8],
+;                              int blocks, int channels, int subbands)
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
+    ; subbands = 4 * subbands * channels
+    movq          m3, [scale_mask]
+    shl           subbandsd, 2
+    cmp           channelsd, 2
+    jl            .loop_1
+    shl           subbandsd, 1
+
+.loop_1:
+    sub           subbandsq, 8
+    lea           ptrq, [sb_sample_fq + subbandsq]
+
+    ; blk = (blocks - 1) * 64;
+    lea           blkq, [blocksq - 1]
+    shl           blkd, 6
+
+    movq          m0, m3
+.loop_2:
+    movq          m1, [ptrq+blkq]
+    pxor          m2, m2
+    pcmpgtd       m1, m2
+    paddd         m1, [ptrq+blkq]
+    pcmpgtd       m2, m1
+    pxor          m1, m2
+
+    por           m0, m1
+
+    sub           blkq, 64
+    jns           .loop_2
+
+    movd          blkd, m0
+    psrlq         m0,   32
+    bsr           blkd, blkd
+    sub           blkd, 15    ; SCALE_OUT_BITS
+    mov           [scale_factorq + subbandsq], blkd
+
+    movd          blkd, m0
+    bsr           blkd, blkd
+    sub           blkd, 15    ; SCALE_OUT_BITS
+    mov           [scale_factorq + subbandsq + 4], blkd
+
+    cmp           subbandsq, 0
+    jg            .loop_1
+
+    emms
+    RET
diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c
new file mode 100644
index 0000000..86effec
--- /dev/null
+++ b/libavcodec/x86/sbcdsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC MMX optimization for some basic "building bricks"
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/sbcdsp.h"
+
+void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
+                                  uint32_t scale_factor[2][8],
+                                  int blocks, int channels, int subbands);
+
+av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
+        s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
+        s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
+    }
+}
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index b449de5..62bbe51 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
 ;* AAC Spectral Band Replication decoding functions
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,13 +25,20 @@ SECTION_RODATA
 ; mask equivalent for multiply by -1.0 1.0
 ps_mask         times 2 dd 1<<31, 0
 ps_mask2        times 2 dd 0, 1<<31
-ps_neg          times 4 dd 1<<31
+ps_mask3        dd  0, 0, 0, 1<<31
+ps_noise0       times 2 dd  1.0,  0.0,
+ps_noise2       times 2 dd -1.0,  0.0
+ps_noise13      dd  0.0,  1.0, 0.0, -1.0
+                dd  0.0, -1.0, 0.0,  1.0
+                dd  0.0,  1.0, 0.0, -1.0
+cextern         sbr_noise_table
+cextern         ps_neg
 
 SECTION .text
 
 INIT_XMM sse
 cglobal sbr_sum_square, 2, 3, 6
-    mov         r2, r1
+    mov        r2d, r1d
     xorps       m0, m0
     xorps       m1, m1
     sar         r2, 3
@@ -136,51 +143,48 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
     mova       m3, m1
     mova       m4, m2
-    mova       m7, [ps_mask]
 
     ; Set pointers
 %if ARCH_X86_64 == 0 || WIN64
     ; start and end 6th and 7th args on stack
     mov        r2d, Sm
     mov        r3d, Em
-%define  start r2q
-%define  end   r3q
+    DEFINE_ARGS X_high, X_low, start, end
 %else
 ; BW does not actually occupy a register, so shift by 1
-%define  start BWq
-%define  end   Sq
+    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
+    movsxd  startq, startd
+    movsxd    endq, endd
 %endif
-    sub      start, end          ; neg num of loops
-    lea    X_highq, [X_highq + end*2*4]
-    lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
-    shl      start, 3            ; offset from num loops
+    sub     startq, endq         ; neg num of loops
+    lea    X_highq, [X_highq + endq*2*4]
+    lea     X_lowq, [X_lowq  + endq*2*4 - 2*2*4]
+    shl     startq, 3            ; offset from num loops
 
-    mova        m0, [X_lowq + start]
-    movlhps     m1, m1           ; (a2 a3 a2 a3)
-    movlhps     m2, m2           ; (a0 a1 a0 a1)
-    shufps      m3, m3, q0101    ; (a3 a2 a3 a2)
-    shufps      m4, m4, q0101    ; (a1 a0 a1 a0)
-    xorps       m3, m7           ; (-a3 a2 -a3 a2)
-    xorps       m4, m7           ; (-a1 a0 -a1 a0)
+    mova        m0, [X_lowq + startq]
+    shufps      m3, m3, q1111
+    shufps      m4, m4, q1111
+    xorps       m3, [ps_mask]
+    shufps      m1, m1, q0000
+    shufps      m2, m2, q0000
+    xorps       m4, [ps_mask]
 .loop2:
-    mova        m5, m0
+    movu        m7, [X_lowq + startq + 8]       ; BbCc
     mova        m6, m0
-    shufps      m0, m0, q2200    ; {Xl[-2][0],",Xl[-1][0],"}
-    shufps      m5, m5, q3311    ; {Xl[-2][1],",Xl[-1][1],"}
-    mulps       m0, m2
-    mulps       m5, m4
-    mova        m7, m6
-    addps       m5, m0
-    mova        m0, [X_lowq + start + 2*2*4]
-    shufps      m6, m0, q0022    ; {Xl[-1][0],",Xl[0][0],"}
-    shufps      m7, m0, q1133    ; {Xl[-1][1],",Xl[1][1],"}
-    mulps       m6, m1
+    mova        m5, m7
+    shufps      m0, m0, q2301                   ; aAbB
+    shufps      m7, m7, q2301                   ; bBcC
+    mulps       m0, m4
     mulps       m7, m3
-    addps       m5, m6
+    mulps       m6, m2
+    mulps       m5, m1
+    addps       m7, m0
+    mova        m0, [X_lowq + startq + 16]      ; CcDd
     addps       m7, m0
-    addps       m5, m7
-    mova  [X_highq + start], m5
-    add     start, 16
+    addps       m6, m5
+    addps       m7, m6
+    mova  [X_highq + startq], m7
+    add     startq, 16
     jnz         .loop2
     RET
 
@@ -246,33 +250,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
     jne      .loop
     REP_RET
 
-INIT_XMM sse2
 ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY  0
 cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
     mov               cq, 64*4-2*mmsize
     lea            vrevq, [vq + 64*4]
 .loop:
     mova              m0, [src0q+cq]
     mova              m1, [src1q]
-    mova              m2, [src0q+cq+mmsize]
-    mova              m3, [src1q+mmsize]
-    pshufd            m4, m0, q0123
-    pshufd            m5, m1, q0123
-    pshufd            m6, m2, q0123
-    pshufd            m7, m3, q0123
-    addps             m3, m4
+    mova              m4, [src0q+cq+mmsize]
+    mova              m5, [src1q+mmsize]
+%if cpuflag(sse2)
+    pshufd            m2, m0, q0123
+    pshufd            m3, m1, q0123
+    pshufd            m6, m4, q0123
+    pshufd            m7, m5, q0123
+%else
+    shufps            m2, m0, m0, q0123
+    shufps            m3, m1, m1, q0123
+    shufps            m6, m4, m4, q0123
+    shufps            m7, m5, m5, q0123
+%endif
+    addps             m5, m2
     subps             m0, m7
     addps             m1, m6
-    subps             m2, m5
+    subps             m4, m3
     mova         [vrevq], m1
-    mova  [vrevq+mmsize], m3
+    mova  [vrevq+mmsize], m5
     mova         [vq+cq], m0
-    mova  [vq+cq+mmsize], m2
+    mova  [vq+cq+mmsize], m4
     add            src1q, 2*mmsize
     add            vrevq, 2*mmsize
     sub               cq, 2*mmsize
     jge            .loop
     REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
 
 INIT_XMM sse2
 cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +321,228 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
     movq       m2, [zq]
     movq    [r2q], m2
     REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST  1
+%ifdef PIC
+    lea  NOISE_TABLE, [%1]
+    mova          m0, [kxq + NOISE_TABLE]
+%else
+    mova          m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise0]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise2]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+    mov       kxd, m_maxm
+    DEFINE_ARGS Y, s_m, q_filt, noise, count
+%else
+    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
+%endif
+    movsxdifnidn    noiseq, noised
+    dec    noiseq
+    shl    countd, 2
+%ifdef PIC
+    lea NOISE_TABLE, [sbr_noise_table]
+%endif
+    lea        Yq, [Yq + 2*countq]
+    add      s_mq, countq
+    add   q_filtq, countq
+    shl    noiseq, 3
+    pxor       m5, m5
+    neg    countq
+.loop:
+    mova       m1, [q_filtq + countq]
+    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
+    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
+    add    noiseq, 2*mmsize
+    and    noiseq, 0x1ff<<3
+    punpckhdq  m2, m1, m1
+    punpckldq  m1, m1
+    mulps      m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mova       m3, [s_mq + countq]
+    ; TODO: replace by a vpermd in AVX2
+    punpckhdq  m4, m3, m3
+    punpckldq  m3, m3
+    pcmpeqd    m6, m3, m5 ; m6 == 0
+    pcmpeqd    m7, m4, m5 ; m7 == 0
+    mulps      m3, m0 ; s_m[m] * phi_sign
+    mulps      m4, m0 ; s_m[m] * phi_sign
+    pand       m1, m6
+    pand       m2, m7
+    movu       m6, [Yq + 2*countq]
+    movu       m7, [Yq + 2*countq + mmsize]
+    addps      m3, m1
+    addps      m4, m2
+    addps      m6, m3
+    addps      m7, m4
+    movu    [Yq + 2*countq], m6
+    movu    [Yq + 2*countq + mmsize], m7
+    add    countq, mmsize
+    jl      .loop
+    RET
+
+INIT_XMM sse
+cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
+%define COUNT  32*4
+%define OFFSET 32*4
+    mov        cq, -COUNT
+    lea     vrevq, [vq + OFFSET + COUNT]
+    add        vq, OFFSET-mmsize
+    add      srcq, 2*COUNT
+    mova       m3, [ps_neg]
+.loop:
+    mova       m0, [srcq + 2*cq + 0*mmsize]
+    mova       m1, [srcq + 2*cq + 1*mmsize]
+    shufps     m2, m0, m1, q2020
+    shufps     m1, m0, q1313
+    xorps      m2, m3
+    mova     [vq], m1
+    mova  [vrevq + cq], m2
+    sub        vq, mmsize
+    add        cq, mmsize
+    jl      .loop
+    REP_RET
+
+%macro SBR_AUTOCORRELATE 0
+cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+    mov   cntq, 37*8
+    add     xq, cntq
+    neg   cntq
+
+%if cpuflag(sse3)
+%define   MOVH  movsd
+    movddup m5, [xq+cntq]
+%else
+%define   MOVH  movlps
+    movlps  m5, [xq+cntq]
+    movlhps m5, m5
+%endif
+    MOVH    m7, [xq+cntq+8 ]
+    MOVH    m1, [xq+cntq+16]
+    shufps  m7, m7, q0110
+    shufps  m1, m1, q0110
+    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
+    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
+    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
+    movaps  [rsp   ], m3
+    movaps  [rsp+16], m4
+    add   cntq, 8
+
+    MOVH    m2, [xq+cntq+16]
+    movlhps m7, m7
+    shufps  m2, m2, q0110
+    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
+    mulps   m4, m7, m2
+    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
+    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
+
+align 16
+.loop:
+    add   cntq, 8
+    MOVH    m0, [xq+cntq+16]
+    movlhps m1, m1
+    shufps  m0, m0, q0110
+    mulps   m3, m1, m2
+    mulps   m4, m1, m0
+    mulps   m1, m1
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    MOVH    m1, [xq+cntq+16]
+    movlhps m2, m2
+    shufps  m1, m1, q0110
+    mulps   m3, m2, m0
+    mulps   m4, m2, m1
+    mulps   m2, m2
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    MOVH    m2, [xq+cntq+16]
+    movlhps m0, m0
+    shufps  m2, m2, q0110
+    mulps   m3, m0, m1
+    mulps   m4, m0, m2
+    mulps   m0, m0
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    jl .loop
+
+    movlhps m1, m1
+    mulps   m2, m1
+    mulps   m1, m1
+    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
+    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
+    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
+    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
+
+    xorps   m2, [ps_mask3]
+    xorps   m5, [ps_mask3]
+    xorps   m6, [ps_mask3]
+    HADDPS  m2, m5, m3
+    HADDPS  m7, m6, m4
+%if cpuflag(sse3)
+    movshdup m0, m1
+%else
+    movss   m0, m1
+    shufps  m1, m1, q0001
+%endif
+    addss   m1, m0
+    movaps  [phiq     ], m2
+    movhps  [phiq+0x18], m7
+    movss   [phiq+0x28], m7
+    movss   [phiq+0x10], m1
+    RET
+%endmacro
+
+INIT_XMM sse
+SBR_AUTOCORRELATE
+INIT_XMM sse3
+SBR_AUTOCORRELATE
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852..6911a1a 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding functions
  * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,28 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
                        float bw, int start, int end);
 void ff_sbr_neg_odd_64_sse(float *z);
 void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_pre_shuffle_sse2(float *z);
 
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+
+void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
+
+void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
+void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
+
 av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,10 +67,21 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->hf_g_filt  = ff_sbr_hf_g_filt_sse;
         s->hf_gen     = ff_sbr_hf_gen_sse;
         s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+        s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse;
+        s->qmf_deint_neg    = ff_sbr_qmf_deint_neg_sse;
+        s->autocorrelate    = ff_sbr_autocorrelate_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse2;
         s->qmf_pre_shuffle  = ff_sbr_qmf_pre_shuffle_sse2;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
+    }
+
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->autocorrelate = ff_sbr_autocorrelate_sse3;
     }
 }
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
new file mode 100644
index 0000000..6fedbb5
--- /dev/null
+++ b/libavcodec/x86/simple_idct.asm
@@ -0,0 +1,889 @@
+;
+; Simple IDCT MMX
+;
+; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
+;
+; Conversion from gcc syntax to x264asm syntax with minimal modifications
+; by James Darnley <jdarnley@obe.tv>.
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public
+; License along with FFmpeg; if not, write to the Free Software
+; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;/
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+wm1010: dw 0, 0xffff, 0, 0xffff
+d40000: dd 4 << 16, 0
+
+; 23170.475006
+; 22725.260826
+; 21406.727617
+; 19265.545870
+; 16384.000000
+; 12872.826198
+; 8866.956905
+; 4520.335430
+
+%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+%define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+%define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+%define ROW_SHIFT 11
+%define COL_SHIFT 20 ; 6
+
+coeffs:
+    dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 1
+    dw 1 << (ROW_SHIFT - 1), 0
+
+    dw C4,  C4,  C4,  C4
+    dw C4, -C4,  C4, -C4
+
+    dw C2,  C6,  C2,  C6
+    dw C6, -C2,  C6, -C2
+
+    dw C1,  C3,  C1,  C3
+    dw C5,  C7,  C5,  C7
+
+    dw C3, -C7,  C3, -C7
+    dw -C1, -C5, -C1, -C5
+
+    dw C5, -C1,  C5, -C1
+    dw C7,  C3,  C7,  C3
+
+    dw C7, -C5,  C7, -C5
+    dw C3, -C1,  C3, -C1
+
+SECTION .text
+
+%macro DC_COND_IDCT 7
+    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
+    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
+    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
+    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
+    movq            mm4, [wm1010]
+    pand            mm4, mm0
+    por             mm4, mm1
+    por             mm4, mm2
+    por             mm4, mm3
+    packssdw        mm4, mm4
+    movd            t0d, mm4
+    or              t0d, t0d
+    jz              %%1
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
+    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
+    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
+    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
+    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
+    paddd           mm4, [coeffs + 8]
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    paddd           mm4, mm5            ; A0             a0
+    psubd           mm6, mm5            ; A3             a3
+    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
+    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
+    paddd           mm0, [coeffs + 8]
+    paddd           mm1, mm0            ; A1             a1
+    paddd           mm0, mm0
+    psubd           mm0, mm1            ; A2             a2
+    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
+    paddd           mm7, mm5            ; B0             b0
+    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
+    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
+    paddd           mm7, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm7            ; A0-B0          a0-b0
+    paddd           mm5, mm2            ; B1             b1
+    psrad           mm7, %7
+    psrad           mm4, %7
+    movq            mm2, mm1            ; A1             a1
+    paddd           mm1, mm5            ; A1+B1          a1+b1
+    psubd           mm2, mm5            ; A1-B1          a1-b1
+    psrad           mm1, %7
+    psrad           mm2, %7
+    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
+    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
+    movq           [%5], mm7
+    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
+    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
+    movq      [24 + %5], mm2
+    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
+    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
+    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
+    movq            mm2, mm0            ; A2             a2
+    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd           mm4, mm7            ; B2             b2
+    paddd           mm2, mm4            ; A2+B2          a2+b2
+    psubd           mm0, mm4            ; a2-B2          a2-b2
+    psrad           mm2, %7
+    psrad           mm0, %7
+    movq            mm4, mm6            ; A3             a3
+    paddd           mm3, mm1            ; B3             b3
+    paddd           mm6, mm3            ; A3+B3          a3+b3
+    psubd           mm4, mm3            ; a3-B3          a3-b3
+    psrad           mm6, %7
+    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
+    movq       [8 + %5], mm2
+    psrad           mm4, %7
+    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
+    movq      [16 + %5], mm4
+    jmp             %%2
+%%1:
+    pslld           mm0, 16
+    paddd           mm0, [d40000]
+    psrad           mm0, 13
+    packssdw        mm0, mm0
+    movq           [%5], mm0
+    movq       [8 + %5], mm0
+    movq      [16 + %5], mm0
+    movq      [24 + %5], mm0
+%%2:
+%endmacro
+
+%macro Z_COND_IDCT 8
+    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
+    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
+    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
+    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
+    movq            mm4, mm0
+    por             mm4, mm1
+    por             mm4, mm2
+    por             mm4, mm3
+    packssdw        mm4, mm4
+    movd            t0d, mm4
+    or              t0d, t0d
+    jz               %8
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
+    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
+    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
+    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
+    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
+    paddd           mm4, [coeffs]
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    paddd           mm4, mm5            ; A0             a0
+    psubd           mm6, mm5            ; A3             a3
+    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
+    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
+    paddd           mm0, [coeffs]
+    paddd           mm1, mm0            ; A1             a1
+    paddd           mm0, mm0
+    psubd           mm0, mm1            ; A2             a2
+    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
+    paddd           mm7, mm5            ; B0             b0
+    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
+    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
+    paddd           mm7, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm7            ; A0-B0          a0-b0
+    paddd           mm5, mm2            ; B1             b1
+    psrad           mm7, %7
+    psrad           mm4, %7
+    movq            mm2, mm1            ; A1             a1
+    paddd           mm1, mm5            ; A1+B1          a1+b1
+    psubd           mm2, mm5            ; A1-B1          a1-b1
+    psrad           mm1, %7
+    psrad           mm2, %7
+    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
+    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
+    movq           [%5], mm7
+    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
+    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
+    movq      [24 + %5], mm2
+    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
+    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
+    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
+    movq            mm2, mm0            ; A2             a2
+    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd           mm4, mm7            ; B2             b2
+    paddd           mm2, mm4            ; A2+B2          a2+b2
+    psubd           mm0, mm4            ; a2-B2          a2-b2
+    psrad           mm2, %7
+    psrad           mm0, %7
+    movq            mm4, mm6            ; A3             a3
+    paddd           mm3, mm1            ; B3             b3
+    paddd           mm6, mm3            ; A3+B3          a3+b3
+    psubd           mm4, mm3            ; a3-B3          a3-b3
+    psrad           mm6, %7
+    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
+    movq       [8 + %5], mm2
+    psrad           mm4, %7
+    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
+    movq      [16 + %5], mm4
+%endmacro
+
+%macro IDCT1 6
+    movq            mm0, %1             ; R4     R0      r4      r0
+    movq            mm1, %2             ; R6     R2      r6      r2
+    movq            mm2, %3             ; R3     R1      r3      r1
+    movq            mm3, %4             ; R7     R5      r7      r5
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
+    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
+    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
+    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
+    paddd           mm4, mm5            ; A0             a0
+    psubd           mm6, mm5            ; A3             a3
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    paddd           mm0, mm1            ; A1             a1
+    psubd           mm5, mm1            ; A2             a2
+    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
+    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
+    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
+    paddd           mm7, mm1            ; B0             b0
+    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
+    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
+    paddd           mm7, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm7            ; A0-B0          a0-b0
+    paddd           mm1, mm2            ; B1             b1
+    psrad           mm7, %6
+    psrad           mm4, %6
+    movq            mm2, mm0            ; A1             a1
+    paddd           mm0, mm1            ; A1+B1          a1+b1
+    psubd           mm2, mm1            ; A1-B1          a1-b1
+    psrad           mm0, %6
+    psrad           mm2, %6
+    packssdw        mm7, mm7            ; A0+B0  a0+b0
+    movd           [%5], mm7
+    packssdw        mm0, mm0            ; A1+B1  a1+b1
+    movd      [16 + %5], mm0
+    packssdw        mm2, mm2            ; A1-B1  a1-b1
+    movd      [96 + %5], mm2
+    packssdw        mm4, mm4            ; A0-B0  a0-b0
+    movd     [112 + %5], mm4
+    movq            mm0, %3             ; R3     R1      r3      r1
+    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
+    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
+    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
+    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
+    movq            mm2, mm5            ; A2             a2
+    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd           mm4, mm7            ; B2             b2
+    paddd           mm2, mm4            ; A2+B2          a2+b2
+    psubd           mm5, mm4            ; a2-B2          a2-b2
+    psrad           mm2, %6
+    psrad           mm5, %6
+    movq            mm4, mm6            ; A3             a3
+    paddd           mm3, mm0            ; B3             b3
+    paddd           mm6, mm3            ; A3+B3          a3+b3
+    psubd           mm4, mm3            ; a3-B3          a3-b3
+    psrad           mm6, %6
+    psrad           mm4, %6
+    packssdw        mm2, mm2            ; A2+B2  a2+b2
+    packssdw        mm6, mm6            ; A3+B3  a3+b3
+    movd      [32 + %5], mm2
+    packssdw        mm4, mm4            ; A3-B3  a3-b3
+    packssdw        mm5, mm5            ; A2-B2  a2-b2
+    movd      [48 + %5], mm6
+    movd      [64 + %5], mm4
+    movd      [80 + %5], mm5
+%endmacro
+
+%macro IDCT2 6
+    movq            mm0, %1             ; R4     R0      r4      r0
+    movq            mm1, %2             ; R6     R2      r6      r2
+    movq            mm3, %4             ; R7     R5      r7      r5
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
+    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
+    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    paddd           mm4, mm5            ; A0             a0
+    psubd           mm6, mm5            ; A3             a3
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    paddd           mm0, mm1            ; A1             a1
+    psubd           mm5, mm1            ; A2             a2
+    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
+    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
+    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
+    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
+    paddd           mm1, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm1            ; A0-B0          a0-b0
+    psrad           mm1, %6
+    psrad           mm4, %6
+    movq            mm2, mm0            ; A1             a1
+    paddd           mm0, mm7            ; A1+B1          a1+b1
+    psubd           mm2, mm7            ; A1-B1          a1-b1
+    psrad           mm0, %6
+    psrad           mm2, %6
+    packssdw        mm1, mm1            ; A0+B0  a0+b0
+    movd           [%5], mm1
+    packssdw        mm0, mm0            ; A1+B1  a1+b1
+    movd      [16 + %5], mm0
+    packssdw        mm2, mm2            ; A1-B1  a1-b1
+    movd      [96 + %5], mm2
+    packssdw        mm4, mm4            ; A0-B0  a0-b0
+    movd     [112 + %5], mm4
+    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
+    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
+    movq            mm2, mm5            ; A2             a2
+    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd           mm2, mm1            ; A2+B2          a2+b2
+    psubd           mm5, mm1            ; a2-B2          a2-b2
+    psrad           mm2, %6
+    psrad           mm5, %6
+    movq            mm1, mm6            ; A3             a3
+    paddd           mm6, mm3            ; A3+B3          a3+b3
+    psubd           mm1, mm3            ; a3-B3          a3-b3
+    psrad           mm6, %6
+    psrad           mm1, %6
+    packssdw        mm2, mm2            ; A2+B2  a2+b2
+    packssdw        mm6, mm6            ; A3+B3  a3+b3
+    movd      [32 + %5], mm2
+    packssdw        mm1, mm1            ; A3-B3  a3-b3
+    packssdw        mm5, mm5            ; A2-B2  a2-b2
+    movd      [48 + %5], mm6
+    movd      [64 + %5], mm1
+    movd      [80 + %5], mm5
+%endmacro
+
+%macro IDCT3 6
+    movq            mm0, %1             ; R4     R0      r4      r0
+    movq            mm3, %4             ; R7     R5      r7      r5
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
+    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
+    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
+    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
+    paddd           mm1, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm1            ; A0-B0          a0-b0
+    psrad           mm1, %6
+    psrad           mm4, %6
+    movq            mm2, mm0            ; A1             a1
+    paddd           mm0, mm7            ; A1+B1          a1+b1
+    psubd           mm2, mm7            ; A1-B1          a1-b1
+    psrad           mm0, %6
+    psrad           mm2, %6
+    packssdw        mm1, mm1            ; A0+B0  a0+b0
+    movd           [%5], mm1
+    packssdw        mm0, mm0            ; A1+B1  a1+b1
+    movd      [16 + %5], mm0
+    packssdw        mm2, mm2            ; A1-B1  a1-b1
+    movd      [96 + %5], mm2
+    packssdw        mm4, mm4            ; A0-B0  a0-b0
+    movd     [112 + %5], mm4
+    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
+    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
+    movq            mm2, mm5            ; A2             a2
+    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd           mm2, mm1            ; A2+B2          a2+b2
+    psubd           mm5, mm1            ; a2-B2          a2-b2
+    psrad           mm2, %6
+    psrad           mm5, %6
+    movq            mm1, mm6            ; A3             a3
+    paddd           mm6, mm3            ; A3+B3          a3+b3
+    psubd           mm1, mm3            ; a3-B3          a3-b3
+    psrad           mm6, %6
+    psrad           mm1, %6
+    packssdw        mm2, mm2            ; A2+B2  a2+b2
+    packssdw        mm6, mm6            ; A3+B3  a3+b3
+    movd      [32 + %5], mm2
+    packssdw        mm1, mm1            ; A3-B3  a3-b3
+    packssdw        mm5, mm5            ; A2-B2  a2-b2
+    movd      [48 + %5], mm6
+    movd      [64 + %5], mm1
+    movd      [80 + %5], mm5
+%endmacro
+
+%macro IDCT4 6
+    movq            mm0, %1             ; R4     R0      r4      r0
+    movq            mm2, %3             ; R3     R1      r3      r1
+    movq            mm3, %4             ; R7     R5      r7      r5
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
+    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
+    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
+    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
+    paddd           mm7, mm1            ; B0             b0
+    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
+    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
+    paddd           mm7, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm7            ; A0-B0          a0-b0
+    paddd           mm1, mm2            ; B1             b1
+    psrad           mm7, %6
+    psrad           mm4, %6
+    movq            mm2, mm0            ; A1             a1
+    paddd           mm0, mm1            ; A1+B1          a1+b1
+    psubd           mm2, mm1            ; A1-B1          a1-b1
+    psrad           mm0, %6
+    psrad           mm2, %6
+    packssdw        mm7, mm7            ; A0+B0  a0+b0
+    movd           [%5], mm7
+    packssdw        mm0, mm0            ; A1+B1  a1+b1
+    movd      [16 + %5], mm0
+    packssdw        mm2, mm2            ; A1-B1  a1-b1
+    movd      [96 + %5], mm2
+    packssdw        mm4, mm4            ; A0-B0  a0-b0
+    movd     [112 + %5], mm4
+    movq            mm0, %3             ; R3     R1      r3      r1
+    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
+    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
+    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
+    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
+    movq            mm2, mm5            ; A2             a2
+    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd           mm4, mm7            ; B2             b2
+    paddd           mm2, mm4            ; A2+B2          a2+b2
+    psubd           mm5, mm4            ; a2-B2          a2-b2
+    psrad           mm2, %6
+    psrad           mm5, %6
+    movq            mm4, mm6            ; A3             a3
+    paddd           mm3, mm0            ; B3             b3
+    paddd           mm6, mm3            ; A3+B3          a3+b3
+    psubd           mm4, mm3            ; a3-B3          a3-b3
+    psrad           mm6, %6
+    psrad           mm4, %6
+    packssdw        mm2, mm2            ; A2+B2  a2+b2
+    packssdw        mm6, mm6            ; A3+B3  a3+b3
+    movd      [32 + %5], mm2
+    packssdw        mm4, mm4            ; A3-B3  a3-b3
+    packssdw        mm5, mm5            ; A2-B2  a2-b2
+    movd      [48 + %5], mm6
+    movd      [64 + %5], mm4
+    movd      [80 + %5], mm5
+%endmacro
+
+%macro IDCT5 6
+    movq            mm0, %1             ; R4     R0      r4      r0
+    movq            mm2, %3             ; R3     R1      r3      r1
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
+    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm3, [coeffs + 64]
+    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
+    paddd           mm7, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm7            ; A0-B0          a0-b0
+    psrad           mm7, %6
+    psrad           mm4, %6
+    movq            mm1, mm0            ; A1             a1
+    paddd           mm0, mm3            ; A1+B1          a1+b1
+    psubd           mm1, mm3            ; A1-B1          a1-b1
+    psrad           mm0, %6
+    psrad           mm1, %6
+    packssdw        mm7, mm7            ; A0+B0  a0+b0
+    movd           [%5], mm7
+    packssdw        mm0, mm0            ; A1+B1  a1+b1
+    movd      [16 + %5], mm0
+    packssdw        mm1, mm1            ; A1-B1  a1-b1
+    movd      [96 + %5], mm1
+    packssdw        mm4, mm4            ; A0-B0  a0-b0
+    movd     [112 + %5], mm4
+    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
+    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
+    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
+    movq            mm1, mm5            ; A2             a2
+    paddd           mm1, mm4            ; A2+B2          a2+b2
+    psubd           mm5, mm4            ; a2-B2          a2-b2
+    psrad           mm1, %6
+    psrad           mm5, %6
+    movq            mm4, mm6            ; A3             a3
+    paddd           mm6, mm2            ; A3+B3          a3+b3
+    psubd           mm4, mm2            ; a3-B3          a3-b3
+    psrad           mm6, %6
+    psrad           mm4, %6
+    packssdw        mm1, mm1            ; A2+B2  a2+b2
+    packssdw        mm6, mm6            ; A3+B3  a3+b3
+    movd      [32 + %5], mm1
+    packssdw        mm4, mm4            ; A3-B3  a3-b3
+    packssdw        mm5, mm5            ; A2-B2  a2-b2
+    movd      [48 + %5], mm6
+    movd      [64 + %5], mm4
+    movd      [80 + %5], mm5
+%endmacro
+
+%macro IDCT6 6
+    movq            mm0, [%1]           ; R4     R0      r4      r0
+    movq            mm1, [%2]           ; R6     R2      r6      r2
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
+    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
+    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    paddd           mm4, mm5            ; A0             a0
+    psubd           mm6, mm5            ; A3             a3
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    paddd           mm0, mm1            ; A1             a1
+    psubd           mm5, mm1            ; A2             a2
+    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
+    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
+    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
+    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
+    paddd           mm7, mm1            ; A0             a0
+    paddd           mm1, mm1            ; 2C0            2c0
+    psubd           mm1, mm7            ; A3             a3
+    paddd           mm3, mm2            ; A1             a1
+    paddd           mm2, mm2            ; 2C1            2c1
+    psubd           mm2, mm3            ; A2             a2
+    psrad           mm4, %6
+    psrad           mm7, %6
+    psrad           mm3, %6
+    packssdw        mm4, mm7            ; A0     a0
+    movq           [%5], mm4
+    psrad           mm0, %6
+    packssdw        mm0, mm3            ; A1     a1
+    movq      [16 + %5], mm0
+    movq      [96 + %5], mm0
+    movq     [112 + %5], mm4
+    psrad           mm5, %6
+    psrad           mm6, %6
+    psrad           mm2, %6
+    packssdw        mm5, mm2            ; A2-B2  a2-b2
+    movq      [32 + %5], mm5
+    psrad           mm1, %6
+    packssdw        mm6, mm1            ; A3+B3  a3+b3
+    movq      [48 + %5], mm6
+    movq      [64 + %5], mm6
+    movq      [80 + %5], mm5
+%endmacro
+
+%macro IDCT7 6
+    movq            mm0, %1             ; R4     R0      r4      r0
+    movq            mm1, %2             ; R6     R2      r6      r2
+    movq            mm2, %3             ; R3     R1      r3      r1
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
+    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
+    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
+    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
+    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
+    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
+    paddd           mm4, mm5            ; A0             a0
+    psubd           mm6, mm5            ; A3             a3
+    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
+    paddd           mm0, mm1            ; A1             a1
+    psubd           mm5, mm1            ; A2             a2
+    movq            mm1, [coeffs + 64]
+    pmaddwd         mm1, mm2            ; -C7R3+C3R1     -C7r3+C3r1
+    paddd           mm7, mm4            ; A0+B0          a0+b0
+    paddd           mm4, mm4            ; 2A0            2a0
+    psubd           mm4, mm7            ; A0-B0          a0-b0
+    psrad           mm7, %6
+    psrad           mm4, %6
+    movq            mm3, mm0            ; A1             a1
+    paddd           mm0, mm1            ; A1+B1          a1+b1
+    psubd           mm3, mm1            ; A1-B1          a1-b1
+    psrad           mm0, %6
+    psrad           mm3, %6
+    packssdw        mm7, mm7            ; A0+B0  a0+b0
+    movd           [%5], mm7
+    packssdw        mm0, mm0            ; A1+B1  a1+b1
+    movd      [16 + %5], mm0
+    packssdw        mm3, mm3            ; A1-B1  a1-b1
+    movd      [96 + %5], mm3
+    packssdw        mm4, mm4            ; A0-B0  a0-b0
+    movd     [112 + %5], mm4
+    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
+    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
+    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
+    movq            mm3, mm5            ; A2             a2
+    paddd           mm3, mm4            ; A2+B2          a2+b2
+    psubd           mm5, mm4            ; a2-B2          a2-b2
+    psrad           mm3, %6
+    psrad           mm5, %6
+    movq            mm4, mm6            ; A3             a3
+    paddd           mm6, mm2            ; A3+B3          a3+b3
+    psubd           mm4, mm2            ; a3-B3          a3-b3
+    psrad           mm6, %6
+    packssdw        mm3, mm3            ; A2+B2  a2+b2
+    movd      [32 + %5], mm3
+    psrad           mm4, %6
+    packssdw        mm6, mm6            ; A3+B3  a3+b3
+    movd      [48 + %5], mm6
+    packssdw        mm4, mm4            ; A3-B3  a3-b3
+    packssdw        mm5, mm5            ; A2-B2  a2-b2
+    movd      [64 + %5], mm4
+    movd      [80 + %5], mm5
+%endmacro
+
+%macro IDCT8 6
+    movq            mm0, [%1]           ; R4     R0      r4      r0
+    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
+    psrad           mm4, %6
+    psrad           mm0, %6
+    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
+    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
+    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
+    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
+    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
+    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
+    psrad           mm1, %6
+    packssdw        mm4, mm1            ; A0     a0
+    movq           [%5], mm4
+    psrad           mm2, %6
+    packssdw        mm0, mm2            ; A1     a1
+    movq      [16 + %5], mm0
+    movq      [96 + %5], mm0
+    movq     [112 + %5], mm4
+    movq      [32 + %5], mm0
+    movq      [48 + %5], mm4
+    movq      [64 + %5], mm4
+    movq      [80 + %5], mm0
+%endmacro
+
+%macro IDCT 0
+    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
+    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%4
+    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%2
+    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1
+
+    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
+    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
+    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
+    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
+    jmp %%9
+
+    ALIGN 16
+    %%4:
+    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%6
+    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
+
+    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
+    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
+    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
+    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
+    jmp %%9
+
+    ALIGN 16
+    %%6:
+    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
+
+    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
+    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
+    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
+    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
+    jmp %%9
+
+    ALIGN 16
+    %%2:
+    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
+
+    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
+    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
+    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
+    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
+    jmp %%9
+
+    ALIGN 16
+    %%3:
+
+    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
+    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
+    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
+    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
+    jmp %%9
+
+    ALIGN 16
+    %%5:
+
+    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
+    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
+    jmp %%9
+
+    ALIGN 16
+    %%1:
+
+    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
+    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
+    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
+    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
+    jmp %%9
+
+    ALIGN 16
+    %%7:
+
+    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
+    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
+
+    %%9:
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0
+    movhps  [lsizeq+pixelsq], m0
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 1
+    mova       m0, [blockq+mmsize*0+%1]
+    mova       m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+    mova       m5, [blockq+mmsize*2+%1]
+    mova       m6, [blockq+mmsize*3+%1]
+%endif
+    movq       m2, [pixelsq]
+    movq       m3, [pixelsq+lsizeq]
+%if mmsize == 8
+    mova       m7, m2
+    punpcklbw  m2, m4
+    punpckhbw  m7, m4
+    paddsw     m0, m2
+    paddsw     m1, m7
+    mova       m7, m3
+    punpcklbw  m3, m4
+    punpckhbw  m7, m4
+    paddsw     m5, m3
+    paddsw     m6, m7
+%else
+    punpcklbw  m2, m4
+    punpcklbw  m3, m4
+    paddsw     m0, m2
+    paddsw     m1, m3
+%endif
+    packuswb   m0, m1
+%if mmsize == 8
+    packuswb   m5, m6
+    movq       [pixelsq], m0
+    movq       [pixelsq+lsizeq], m5
+%else
+    movq       [pixelsq], m0
+    movhps     [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+INIT_MMX mmx
+
+cglobal simple_idct, 1, 2, 8, 128, block, t0
+    IDCT
+RET
+
+cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
+    IDCT
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+RET
+
+cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
+    IDCT
+    pxor       m4, m4
+    ADD_PIXELS_CLAMPED 0
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 32
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96
+RET
+
+INIT_XMM sse2
+
+cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
+    IDCT
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+RET
+
+cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
+    IDCT
+    pxor       m4, m4
+    ADD_PIXELS_CLAMPED 0
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 32
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96
+RET
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
deleted file mode 100644
index 0939a49..0000000
--- a/libavcodec/x86/simple_idct.c
+++ /dev/null
@@ -1,918 +0,0 @@
-/*
- * Simple IDCT MMX
- *
- * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "libavcodec/idctdsp.h"
-
-#include "idctdsp.h"
-#include "simple_idct.h"
-
-#if HAVE_INLINE_ASM
-
-/*
-23170.475006
-22725.260826
-21406.727617
-19265.545870
-16384.000000
-12872.826198
-8866.956905
-4520.335430
-*/
-#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
-#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C6 8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C7 4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 20 // 6
-
-DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
-
-DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
-        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
-//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
-//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
-        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
-        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
-//        0, 0, 0, 0,
-//        0, 0, 0, 0,
-
- C4,  C4,  C4,  C4,
- C4, -C4,  C4, -C4,
-
- C2,  C6,  C2,  C6,
- C6, -C2,  C6, -C2,
-
- C1,  C3,  C1,  C3,
- C5,  C7,  C5,  C7,
-
- C3, -C7,  C3, -C7,
--C1, -C5, -C1, -C5,
-
- C5, -C1,  C5, -C1,
- C7,  C3,  C7,  C3,
-
- C7, -C5,  C7, -C5,
- C3, -C1,  C3, -C1
-};
-
-static inline void idct(int16_t *block)
-{
-        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
-        int16_t * const temp= (int16_t*)align_tmp;
-
-        __asm__ volatile(
-#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
-        "pand %%mm0, %%mm4              \n\t"\
-        "por %%mm1, %%mm4               \n\t"\
-        "por %%mm2, %%mm4               \n\t"\
-        "por %%mm3, %%mm4               \n\t"\
-        "packssdw %%mm4,%%mm4           \n\t"\
-        "movd %%mm4, %%eax              \n\t"\
-        "orl %%eax, %%eax               \n\t"\
-        "jz 1f                          \n\t"\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        #rounder ", %%mm4               \n\t"\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        #rounder ", %%mm0               \n\t"\
-        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
-        "paddd %%mm0, %%mm0             \n\t" \
-        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
-        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
-        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
-        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
-        "movq %%mm7, " #dst "           \n\t"\
-        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "movq %%mm2, 24+" #dst "        \n\t"\
-        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
-        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
-        "movq %%mm2, 8+" #dst "         \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
-        "movq %%mm4, 16+" #dst "        \n\t"\
-        "jmp 2f                         \n\t"\
-        "1:                             \n\t"\
-        "pslld $16, %%mm0               \n\t"\
-        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
-        "psrad $13, %%mm0               \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t"\
-        "movq %%mm0, " #dst "           \n\t"\
-        "movq %%mm0, 8+" #dst "         \n\t"\
-        "movq %%mm0, 16+" #dst "        \n\t"\
-        "movq %%mm0, 24+" #dst "        \n\t"\
-        "2:                             \n\t"
-
-#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq %%mm0, %%mm4              \n\t"\
-        "por %%mm1, %%mm4               \n\t"\
-        "por %%mm2, %%mm4               \n\t"\
-        "por %%mm3, %%mm4               \n\t"\
-        "packssdw %%mm4,%%mm4           \n\t"\
-        "movd %%mm4, %%eax              \n\t"\
-        "orl %%eax, %%eax               \n\t"\
-        "jz " #bt "                     \n\t"\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        #rounder ", %%mm4               \n\t"\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        #rounder ", %%mm0               \n\t"\
-        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
-        "paddd %%mm0, %%mm0             \n\t" \
-        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
-        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
-        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
-        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
-        "movq %%mm7, " #dst "           \n\t"\
-        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "movq %%mm2, 24+" #dst "        \n\t"\
-        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
-        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
-        "movq %%mm2, 8+" #dst "         \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
-        "movq %%mm4, 16+" #dst "        \n\t"\
-
-#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        #rounder ", %%mm4               \n\t"\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        #rounder ", %%mm0               \n\t"\
-        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
-        "paddd %%mm0, %%mm0             \n\t" \
-        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
-        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
-        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
-        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
-        "movq %%mm7, " #dst "           \n\t"\
-        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "movq %%mm2, 24+" #dst "        \n\t"\
-        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
-        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
-        "movq %%mm2, 8+" #dst "         \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
-        "movq %%mm4, 16+" #dst "        \n\t"\
-
-//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
-DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
-Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
-Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
-        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
-        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
-        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
-        "movd %%mm7, " #dst "           \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
-        "movd %%mm0, 16+" #dst "        \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
-        "movd %%mm2, 96+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
-        "movd %%mm4, 112+" #dst "       \n\t"\
-        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
-        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
-        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movd %%mm2, 32+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
-        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movd %%mm6, 48+" #dst "        \n\t"\
-        "movd %%mm4, 64+" #dst "        \n\t"\
-        "movd %%mm5, 80+" #dst "        \n\t"
-
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-        "# .p2align 4                   \n\t"\
-        "4:                             \n\t"
-Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
-Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
-        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
-        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
-        "movd %%mm1, " #dst "           \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
-        "movd %%mm0, 16+" #dst "        \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
-        "movd %%mm2, 96+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
-        "movd %%mm4, 112+" #dst "       \n\t"\
-        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm1, %%mm5             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm1             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
-        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movd %%mm2, 32+" #dst "        \n\t"\
-        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
-        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movd %%mm6, 48+" #dst "        \n\t"\
-        "movd %%mm1, 64+" #dst "        \n\t"\
-        "movd %%mm5, 80+" #dst "        \n\t"
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-        "# .p2align 4                   \n\t"\
-        "6:                             \n\t"
-Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
-        "movd %%mm1, " #dst "           \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
-        "movd %%mm0, 16+" #dst "        \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
-        "movd %%mm2, 96+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
-        "movd %%mm4, 112+" #dst "       \n\t"\
-        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm1, %%mm5             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm1             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
-        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movd %%mm2, 32+" #dst "        \n\t"\
-        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
-        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movd %%mm6, 48+" #dst "        \n\t"\
-        "movd %%mm1, 64+" #dst "        \n\t"\
-        "movd %%mm5, 80+" #dst "        \n\t"
-
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-        "# .p2align 4                   \n\t"\
-        "2:                             \n\t"
-Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
-        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
-        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
-        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
-        "movd %%mm7, " #dst "           \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
-        "movd %%mm0, 16+" #dst "        \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
-        "movd %%mm2, 96+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
-        "movd %%mm4, 112+" #dst "       \n\t"\
-        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
-        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
-        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
-        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
-        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
-        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
-        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
-        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movd %%mm2, 32+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
-        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movd %%mm6, 48+" #dst "        \n\t"\
-        "movd %%mm4, 64+" #dst "        \n\t"\
-        "movd %%mm5, 80+" #dst "        \n\t"
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-        "# .p2align 4                   \n\t"\
-        "3:                             \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 64(%2), %%mm3             \n\t"\
-        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
-        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
-        "movd %%mm7, " #dst "           \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
-        "movd %%mm0, 16+" #dst "        \n\t"\
-        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
-        "movd %%mm1, 96+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
-        "movd %%mm4, 112+" #dst "       \n\t"\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
-        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
-        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movd %%mm1, 32+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
-        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movd %%mm6, 48+" #dst "        \n\t"\
-        "movd %%mm4, 64+" #dst "        \n\t"\
-        "movd %%mm5, 80+" #dst "        \n\t"
-
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-        "# .p2align 4                   \n\t"\
-        "5:                             \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
-        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
-        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
-        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
-        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
-        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
-        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
-        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
-        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
-        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm3       \n\t"\
-        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
-        "movq %%mm4, " #dst "           \n\t"\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
-        "movq %%mm0, 16+" #dst "        \n\t"\
-        "movq %%mm0, 96+" #dst "        \n\t"\
-        "movq %%mm4, 112+" #dst "       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movq %%mm5, 32+" #dst "        \n\t"\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movq %%mm6, 48+" #dst "        \n\t"\
-        "movq %%mm6, 64+" #dst "        \n\t"\
-        "movq %%mm5, 80+" #dst "        \n\t"
-
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-
-        "# .p2align 4                   \n\t"\
-        "1:                             \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
-        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
-        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
-        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
-        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
-        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
-        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
-        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
-        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
-        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
-        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
-        "movq 64(%2), %%mm1             \n\t"\
-        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
-        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
-        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
-        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
-        "psrad $" #shift ", %%mm7       \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
-        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
-        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "psrad $" #shift ", %%mm3       \n\t"\
-        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
-        "movd %%mm7, " #dst "           \n\t"\
-        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
-        "movd %%mm0, 16+" #dst "        \n\t"\
-        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
-        "movd %%mm3, 96+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
-        "movd %%mm4, 112+" #dst "       \n\t"\
-        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
-        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
-        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
-        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
-        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
-        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
-        "psrad $" #shift ", %%mm3       \n\t"\
-        "psrad $" #shift ", %%mm5       \n\t"\
-        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
-        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
-        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
-        "psrad $" #shift ", %%mm6       \n\t"\
-        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
-        "movd %%mm3, 32+" #dst "        \n\t"\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
-        "movd %%mm6, 48+" #dst "        \n\t"\
-        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
-        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
-        "movd %%mm4, 64+" #dst "        \n\t"\
-        "movd %%mm5, 80+" #dst "        \n\t"
-
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-        "jmp 9f                         \n\t"
-
-
-        "# .p2align 4                   \n\t"
-        "7:                             \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
-        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
-        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "psrad $" #shift ", %%mm4       \n\t"\
-        "psrad $" #shift ", %%mm0       \n\t"\
-        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
-        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
-        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
-        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
-        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
-        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
-        "psrad $" #shift ", %%mm1       \n\t"\
-        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
-        "movq %%mm4, " #dst "           \n\t"\
-        "psrad $" #shift ", %%mm2       \n\t"\
-        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
-        "movq %%mm0, 16+" #dst "        \n\t"\
-        "movq %%mm0, 96+" #dst "        \n\t"\
-        "movq %%mm4, 112+" #dst "       \n\t"\
-        "movq %%mm0, 32+" #dst "        \n\t"\
-        "movq %%mm4, 48+" #dst "        \n\t"\
-        "movq %%mm4, 64+" #dst "        \n\t"\
-        "movq %%mm0, 80+" #dst "        \n\t"
-
-//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
-IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
-//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
-IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
-//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-
-/*
-Input
- 00 40 04 44 20 60 24 64
- 10 30 14 34 50 70 54 74
- 01 41 03 43 21 61 23 63
- 11 31 13 33 51 71 53 73
- 02 42 06 46 22 62 26 66
- 12 32 16 36 52 72 56 76
- 05 45 07 47 25 65 27 67
- 15 35 17 37 55 75 57 77
-
-Temp
- 00 04 10 14 20 24 30 34
- 40 44 50 54 60 64 70 74
- 01 03 11 13 21 23 31 33
- 41 43 51 53 61 63 71 73
- 02 06 12 16 22 26 32 36
- 42 46 52 56 62 66 72 76
- 05 07 15 17 25 27 35 37
- 45 47 55 57 65 67 75 77
-*/
-
-"9: \n\t"
-                :: "r" (block), "r" (temp), "r" (coeffs)
-                : "%eax"
-        );
-}
-
-void ff_simple_idct_mmx(int16_t *block)
-{
-    idct(block);
-}
-
-//FIXME merge add/put into the idct
-
-void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
-    idct(block);
-    ff_put_pixels_clamped(block, dest, line_size);
-}
-void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
-    idct(block);
-    ff_add_pixels_clamped(block, dest, line_size);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 15784a9..9b64cfe 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,4 +26,28 @@ void ff_simple_idct_mmx(int16_t *block);
 void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct8_sse2(int16_t *block);
+void ff_simple_idct8_avx(int16_t *block);
+
+void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct10_sse2(int16_t *block);
+void ff_simple_idct10_avx(int16_t *block);
+
+void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+void ff_simple_idct12_sse2(int16_t *block);
+void ff_simple_idct12_avx(int16_t *block);
+
+void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
 #endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
new file mode 100644
index 0000000..069bb61
--- /dev/null
+++ b/libavcodec/x86/simple_idct10.asm
@@ -0,0 +1,205 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2015 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+cextern pw_2
+cextern pw_16
+cextern pw_32
+cextern pw_1023
+cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
+pd_round_12: times 4 dd 1<<(12-1)
+pd_round_15: times 4 dd 1<<(15-1)
+pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
+
+%macro CONST_DEC  3
+const %1
+times 4 dw %2, %3
+%endmacro
+
+%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
+%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
+%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
+%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
+%define W3sh2_lo 19266
+%define W4sh2_lo 16383
+%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
+%define W6sh2  8867 ; W6 = 35468 =  8867<<2
+%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
+
+CONST_DEC  w4_plus_w2_hi,   W4sh2, +W2sh2
+CONST_DEC  w4_min_w2_hi,    W4sh2, -W2sh2
+CONST_DEC  w4_plus_w6_hi,   W4sh2, +W6sh2
+CONST_DEC  w4_min_w6_hi,    W4sh2, -W6sh2
+CONST_DEC  w1_plus_w3_hi,   W1sh2, +W3sh2
+CONST_DEC  w3_min_w1_hi,    W3sh2, -W1sh2
+CONST_DEC  w7_plus_w3_hi,   W7sh2, +W3sh2
+CONST_DEC  w3_min_w7_hi,    W3sh2, -W7sh2
+CONST_DEC  w1_plus_w5,   W1sh2, +W5sh2
+CONST_DEC  w5_min_w1,    W5sh2, -W1sh2
+CONST_DEC  w5_plus_w7,   W5sh2, +W7sh2
+CONST_DEC  w7_min_w5,    W7sh2, -W5sh2
+CONST_DEC  w4_plus_w2_lo,   W4sh2_lo, +W2sh2
+CONST_DEC  w4_min_w2_lo,    W4sh2_lo, -W2sh2
+CONST_DEC  w4_plus_w6_lo,   W4sh2_lo, +W6sh2
+CONST_DEC  w4_min_w6_lo,    W4sh2_lo, -W6sh2
+CONST_DEC  w1_plus_w3_lo,   W1sh2,    +W3sh2_lo
+CONST_DEC  w3_min_w1_lo,    W3sh2_lo, -W1sh2
+CONST_DEC  w7_plus_w3_lo,   W7sh2,    +W3sh2_lo
+CONST_DEC  w3_min_w7_lo,    W3sh2_lo, -W7sh2
+
+%include "libavcodec/x86/simple_idct10_template.asm"
+
+SECTION .text
+
+%macro STORE_HI_LO 12
+    movq   %1, %9
+    movq   %3, %10
+    movq   %5, %11
+    movq   %7, %12
+    movhps %2, %9
+    movhps %4, %10
+    movhps %6, %11
+    movhps %8, %12
+%endmacro
+
+%macro LOAD_ZXBW_8 16
+    pmovzxbw %1, %9
+    pmovzxbw %2, %10
+    pmovzxbw %3, %11
+    pmovzxbw %4, %12
+    pmovzxbw %5, %13
+    pmovzxbw %6, %14
+    pmovzxbw %7, %15
+    pmovzxbw %8, %16
+%endmacro
+
+%macro LOAD_ZXBW_4 9
+    movh %1, %5
+    movh %2, %6
+    movh %3, %7
+    movh %4, %8
+    punpcklbw %1, %9
+    punpcklbw %2, %9
+    punpcklbw %3, %9
+    punpcklbw %4, %9
+%endmacro
+
+%define PASS4ROWS(base, stride, stride3) \
+    [base], [base + stride], [base + 2*stride], [base + stride3]
+
+%macro idct_fn 0
+
+define_constants _lo
+
+cglobal simple_idct8, 1, 1, 16, 32, block
+    IDCT_FN    "", 11, pw_32, 20, "store"
+RET
+
+cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN    "", 11, pw_32, 20
+    lea       r3, [3*lsizeq]
+    lea       r2, [pixelsq + r3]
+    packuswb  m8, m0
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
+RET
+
+cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block
+    IDCT_FN    "", 11, pw_32, 20
+    lea r2, [3*lsizeq]
+    %if cpuflag(sse4)
+        lea r3, [pixelsq + r2]
+        LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
+        paddsw m8, m3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        paddsw m4, m12
+        paddsw m11, m13
+        paddsw m9, m14
+        paddsw m10, m15
+    %else
+        pxor m12, m12
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
+        paddsw m8, m3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        lea r3, [pixelsq + 4*lsizeq]
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12
+        paddsw m4, m3
+        paddsw m11, m5
+        paddsw m9, m6
+        paddsw m10, m7
+        lea r3, [pixelsq + r2]
+    %endif
+    packuswb  m8, m0
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
+RET
+
+define_constants _hi
+
+cglobal simple_idct10, 1, 1, 16, block
+    IDCT_FN    "", 12, "", 19, "store"
+    RET
+
+cglobal simple_idct10_put, 3, 3, 16, pixels, lsize, block
+    IDCT_FN    "", 12, "", 19, "put", 0, pw_1023
+    RET
+
+cglobal simple_idct12, 1, 1, 16, block
+    ; coeffs are already 15bits, adding the offset would cause
+    ; overflow in the input
+    IDCT_FN    "", 15, pw_2, 16, "store"
+    RET
+
+cglobal simple_idct12_put, 3, 3, 16, pixels, lsize, block
+    ; range isn't known, so the C simple_idct range is used
+    ; Also, using a bias on input overflows, so use the bias
+    ; on output of the first butterfly instead
+    IDCT_FN    "", 15, pw_2, 16, "put", 0, pw_4095
+    RET
+%endmacro
+
+INIT_XMM sse2
+idct_fn
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+idct_fn
+%endif
+
+%endif
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
new file mode 100644
index 0000000..0d04a98
--- /dev/null
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -0,0 +1,369 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; add SECTION_RODATA and proper include before including this file!
+
+%if ARCH_X86_64
+
+%macro define_constants 1
+    %undef w4_plus_w2
+    %undef w4_min_w2
+    %undef w4_plus_w6
+    %undef w4_min_w6
+    %undef w1_plus_w3
+    %undef w3_min_w1
+    %undef w7_plus_w3
+    %undef w3_min_w7
+    %define w4_plus_w2 w4_plus_w2%1
+    %define w4_min_w2  w4_min_w2%1
+    %define w4_plus_w6 w4_plus_w6%1
+    %define w4_min_w6  w4_min_w6%1
+    %define w1_plus_w3 w1_plus_w3%1
+    %define w3_min_w1  w3_min_w1%1
+    %define w7_plus_w3 w7_plus_w3%1
+    %define w3_min_w7  w3_min_w7%1
+%endmacro
+
+; interleave data while maintaining source
+; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
+%macro SBUTTERFLY3 5
+    punpckl%1   m%2, m%4, m%5
+    punpckh%1   m%3, m%4, m%5
+%endmacro
+
+; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
+; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
+;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
+%macro SUMSUB_SHPK 7
+    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
+    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
+    paddd       %1,  %5            ; { a0 + b0 }[0-3]
+    paddd       %2,  %6            ; { a0 + b0 }[4-7]
+    psrad       %1,  %7
+    psrad       %2,  %7
+    psrad       %3,  %7
+    psrad       %4,  %7
+    packssdw    %1,  %2            ; row[0]
+    packssdw    %3,  %4            ; row[7]
+%endmacro
+
+; %1 = initial bias ("" if nop)
+; %2 = number of bits to shift at the end
+; %3 = qmat (for prores)
+%macro IDCT_1D 2-3
+    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
+    ; a1 = a0;
+    ; a2 = a0;
+    ; a3 = a0;
+    ; a0 += W2 * row[2];
+    ; a1 += W6 * row[2];
+    ; a2 -= W6 * row[2];
+    ; a3 -= W2 * row[2];
+%ifstr %1
+    mova        m15, [pd_round_ %+ %2]
+%else
+    paddw       m10, [%1]
+%endif
+    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
+    pmaddwd     m2,  m0, [w4_plus_w6]
+    pmaddwd     m3,  m1, [w4_plus_w6]
+    pmaddwd     m4,  m0, [w4_min_w6]
+    pmaddwd     m5,  m1, [w4_min_w6]
+    pmaddwd     m6,  m0, [w4_min_w2]
+    pmaddwd     m7,  m1, [w4_min_w2]
+    pmaddwd     m0, [w4_plus_w2]
+    pmaddwd     m1, [w4_plus_w2]
+%ifstr %1
+    ; Adding 1<<(%2-1) for >=15 bits values
+    paddd       m2, m15
+    paddd       m3, m15
+    paddd       m4, m15
+    paddd       m5, m15
+    paddd       m6, m15
+    paddd       m7, m15
+    paddd       m0, m15
+    paddd       m1, m15
+%endif
+
+    ; a0: -1*row[0]-1*row[2]
+    ; a1: -1*row[0]
+    ; a2: -1*row[0]
+    ; a3: -1*row[0]+1*row[2]
+
+    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
+    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
+    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
+    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
+    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
+    pmaddwd     m10, m8, [w4_plus_w6]
+    pmaddwd     m11, m9, [w4_plus_w6]
+    paddd       m0,  m10            ; a0[0-3]
+    paddd       m1,  m11            ; a0[4-7]
+    pmaddwd     m10, m8, [w4_min_w6]
+    pmaddwd     m11, m9, [w4_min_w6]
+    paddd       m6,  m10           ; a3[0-3]
+    paddd       m7,  m11           ; a3[4-7]
+    pmaddwd     m10, m8, [w4_min_w2]
+    pmaddwd     m11, m9, [w4_min_w2]
+    pmaddwd     m8, [w4_plus_w2]
+    pmaddwd     m9, [w4_plus_w2]
+    psubd       m4,  m10           ; a2[0-3] intermediate
+    psubd       m5,  m11           ; a2[4-7] intermediate
+    psubd       m2,  m8            ; a1[0-3] intermediate
+    psubd       m3,  m9            ; a1[4-7] intermediate
+
+    ; load/store
+    mova   [blockq+  0], m0
+    mova   [blockq+ 32], m2
+    mova   [blockq+ 64], m4
+    mova   [blockq+ 96], m6
+    mova        m10,[blockq+ 16]       ; { row[1] }[0-7]
+    mova        m8, [blockq+ 48]       ; { row[3] }[0-7]
+    mova        m13,[blockq+ 80]       ; { row[5] }[0-7]
+    mova        m14,[blockq+112]       ; { row[7] }[0-7]
+    mova   [blockq+ 16], m1
+    mova   [blockq+ 48], m3
+    mova   [blockq+ 80], m5
+    mova   [blockq+112], m7
+%if %0 == 3
+    pmullw      m10,[%3+ 16]
+    pmullw      m8, [%3+ 48]
+    pmullw      m13,[%3+ 80]
+    pmullw      m14,[%3+112]
+%endif
+
+    ; b0 = MUL(W1, row[1]);
+    ; MAC(b0, W3, row[3]);
+    ; b1 = MUL(W3, row[1]);
+    ; MAC(b1, -W7, row[3]);
+    ; b2 = MUL(W5, row[1]);
+    ; MAC(b2, -W1, row[3]);
+    ; b3 = MUL(W7, row[1]);
+    ; MAC(b3, -W5, row[3]);
+    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
+    pmaddwd     m2,  m0, [w3_min_w7]
+    pmaddwd     m3,  m1, [w3_min_w7]
+    pmaddwd     m4,  m0, [w5_min_w1]
+    pmaddwd     m5,  m1, [w5_min_w1]
+    pmaddwd     m6,  m0, [w7_min_w5]
+    pmaddwd     m7,  m1, [w7_min_w5]
+    pmaddwd     m0, [w1_plus_w3]
+    pmaddwd     m1, [w1_plus_w3]
+
+    ; b0: +1*row[1]+2*row[3]
+    ; b1: +2*row[1]-1*row[3]
+    ; b2: -1*row[1]-1*row[3]
+    ; b3: +1*row[1]+1*row[3]
+
+    ; MAC(b0,  W5, row[5]);
+    ; MAC(b0,  W7, row[7]);
+    ; MAC(b1, -W1, row[5]);
+    ; MAC(b1, -W5, row[7]);
+    ; MAC(b2,  W7, row[5]);
+    ; MAC(b2,  W3, row[7]);
+    ; MAC(b3,  W3, row[5]);
+    ; MAC(b3, -W1, row[7]);
+    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
+
+    ; b0: -1*row[5]+1*row[7]
+    ; b1: -1*row[5]+1*row[7]
+    ; b2: +1*row[5]+2*row[7]
+    ; b3: +2*row[5]-1*row[7]
+
+    pmaddwd     m10, m8, [w1_plus_w5]
+    pmaddwd     m11, m9, [w1_plus_w5]
+    pmaddwd     m12, m8, [w5_plus_w7]
+    pmaddwd     m13, m9, [w5_plus_w7]
+    psubd       m2,  m10           ; b1[0-3]
+    psubd       m3,  m11           ; b1[4-7]
+    paddd       m0,  m12            ; b0[0-3]
+    paddd       m1,  m13            ; b0[4-7]
+    pmaddwd     m12, m8, [w7_plus_w3]
+    pmaddwd     m13, m9, [w7_plus_w3]
+    pmaddwd     m8, [w3_min_w1]
+    pmaddwd     m9, [w3_min_w1]
+    paddd       m4,  m12           ; b2[0-3]
+    paddd       m5,  m13           ; b2[4-7]
+    paddd       m6,  m8            ; b3[0-3]
+    paddd       m7,  m9            ; b3[4-7]
+
+    ; row[0] = (a0 + b0) >> 15;
+    ; row[7] = (a0 - b0) >> 15;
+    ; row[1] = (a1 + b1) >> 15;
+    ; row[6] = (a1 - b1) >> 15;
+    ; row[2] = (a2 + b2) >> 15;
+    ; row[5] = (a2 - b2) >> 15;
+    ; row[3] = (a3 + b3) >> 15;
+    ; row[4] = (a3 - b3) >> 15;
+    mova        m8, [blockq+ 0]        ; a0[0-3]
+    mova        m9, [blockq+16]        ; a0[4-7]
+    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
+    mova        m0, [blockq+32]        ; a1[0-3]
+    mova        m1, [blockq+48]        ; a1[4-7]
+    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
+    mova        m1, [blockq+64]        ; a2[0-3]
+    mova        m2, [blockq+80]        ; a2[4-7]
+    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
+    mova        m2, [blockq+96]        ; a3[0-3]
+    mova        m3, [blockq+112]       ; a3[4-7]
+    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
+%endmacro
+
+; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
+;                                  int16_t *block, const int16_t *qmat);
+
+; %1 = row shift
+; %2 = row bias macro
+; %3 = column shift
+; %4 = column bias macro
+; %5 = final action (nothing, "store", "put", "add")
+; %6 = min pixel value
+; %7 = max pixel value
+; %8 = qmat (for prores)
+
+%macro IDCT_FN 4-8
+    ; for (i = 0; i < 8; i++)
+    ;     idctRowCondDC(block + i*8);
+    mova        m10,[blockq+ 0]        ; { row[0] }[0-7]
+    mova        m8, [blockq+32]        ; { row[2] }[0-7]
+    mova        m13,[blockq+64]        ; { row[4] }[0-7]
+    mova        m12,[blockq+96]        ; { row[6] }[0-7]
+
+%if %0 == 8
+    pmullw      m10,[%8+ 0]
+    pmullw      m8, [%8+32]
+    pmullw      m13,[%8+64]
+    pmullw      m12,[%8+96]
+
+    IDCT_1D     %1, %2, %8
+%elif %2 == 11
+    ; This copies the DC-only shortcut.  When there is only a DC coefficient the
+    ; C shifts the value and splats it to all coeffs rather than multiplying and
+    ; doing the full IDCT.  This causes a difference on 8-bit because the
+    ; coefficient is 16383 rather than 16384 (which you can get with shifting).
+    por      m1,  m8, m13
+    por      m1,  m12
+    por      m1, [blockq+ 16]       ; { row[1] }[0-7]
+    por      m1, [blockq+ 48]       ; { row[3] }[0-7]
+    por      m1, [blockq+ 80]       ; { row[5] }[0-7]
+    por      m1, [blockq+112]       ; { row[7] }[0-7]
+    pxor     m2,  m2
+    pcmpeqw  m1,  m2
+    psllw    m2,  m10, 3
+    pand     m2,  m1
+    pcmpeqb  m3,  m3
+    pxor     m1,  m3
+    mova    [rsp],    m1
+    mova    [rsp+16], m2
+
+    IDCT_1D  %1,  %2
+
+    mova     m5, [rsp]
+    mova     m6, [rsp+16]
+    pand     m8,  m5
+    por      m8,  m6
+    pand     m0,  m5
+    por      m0,  m6
+    pand     m1,  m5
+    por      m1,  m6
+    pand     m2,  m5
+    por      m2,  m6
+    pand     m4,  m5
+    por      m4,  m6
+    pand     m11, m5
+    por      m11, m6
+    pand     m9,  m5
+    por      m9,  m6
+    pand     m10, m5
+    por      m10, m6
+%else
+    IDCT_1D     %1, %2
+%endif
+
+    ; transpose for second part of IDCT
+    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
+    mova   [blockq+ 16], m0
+    mova   [blockq+ 48], m2
+    mova   [blockq+ 80], m11
+    mova   [blockq+112], m10
+    SWAP         8,  10
+    SWAP         1,   8
+    SWAP         4,  13
+    SWAP         9,  12
+
+    ; for (i = 0; i < 8; i++)
+    ;     idctSparseColAdd(dest + i, line_size, block + i);
+    IDCT_1D     %3, %4
+
+    ; clip/store
+%if %0 >= 5
+%ifidn %5,"store"
+    ; No clamping, means pure idct
+    mova  [blockq+  0], m8
+    mova  [blockq+ 16], m0
+    mova  [blockq+ 32], m1
+    mova  [blockq+ 48], m2
+    mova  [blockq+ 64], m4
+    mova  [blockq+ 80], m11
+    mova  [blockq+ 96], m9
+    mova  [blockq+112], m10
+%elifidn %5,"put"
+%ifidn %6, 0
+    pxor        m3, m3
+%else
+    mova        m3, [%6]
+%endif ; ifidn %6, 0
+    mova        m5, [%7]
+    pmaxsw      m8,  m3
+    pmaxsw      m0,  m3
+    pmaxsw      m1,  m3
+    pmaxsw      m2,  m3
+    pmaxsw      m4,  m3
+    pmaxsw      m11, m3
+    pmaxsw      m9,  m3
+    pmaxsw      m10, m3
+    pminsw      m8,  m5
+    pminsw      m0,  m5
+    pminsw      m1,  m5
+    pminsw      m2,  m5
+    pminsw      m4,  m5
+    pminsw      m11, m5
+    pminsw      m9,  m5
+    pminsw      m10, m5
+
+    lea         r2, [r1*3]
+    mova  [r0     ], m8
+    mova  [r0+r1  ], m0
+    mova  [r0+r1*2], m1
+    mova  [r0+r2  ], m2
+    lea         r0, [r0+r1*4]
+    mova  [r0     ], m4
+    mova  [r0+r1  ], m11
+    mova  [r0+r1*2], m9
+    mova  [r0+r2  ], m10
+%endif ; %5 action
+%endif; if %0 >= 5
+%endmacro
+
+%endif
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000..218e686
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,908 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+        // The savings in code and time are well worth having to store this value and
+        // calculate b[0] correctly afterwards.
+
+        i = 0;
+        __asm__ volatile(
+            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
+            "pcmpeqd   %%xmm3, %%xmm3         \n\t"
+            "psllw         $1, %%xmm3         \n\t"
+            "paddw     %%xmm7, %%xmm3         \n\t"
+            "psllw        $13, %%xmm3         \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "paddw  %%xmm7, %%xmm2        \n\t"
+                "paddw  %%xmm7, %%xmm6        \n\t"
+                "pmulhw %%xmm3, %%xmm2        \n\t"
+                "pmulhw %%xmm3, %%xmm6        \n\t"
+                "paddw    (%0), %%xmm2        \n\t"
+                "paddw  16(%0), %%xmm6        \n\t"
+                "movdqa %%xmm2, (%0)          \n\t"
+                "movdqa %%xmm6, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+            dst[i] = dst[i] - (b[i] + b[i + 1]);
+        }
+        for(; i<w_r-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubw  %%xmm2, %%xmm0        \n\t"
+                "psubw  %%xmm6, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+        IDWTELEM b_0 = b[0];
+
+        i = 0;
+        __asm__ volatile(
+            "psllw         $15, %%xmm7        \n\t"
+            "pcmpeqw    %%xmm6, %%xmm6        \n\t"
+            "psrlw         $13, %%xmm6        \n\t"
+            "paddw      %%xmm7, %%xmm6        \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm0        \n\t"
+                "movdqu 16(%1), %%xmm4        \n\t"
+                "movdqu  2(%1), %%xmm1        \n\t"
+                "movdqu 18(%1), %%xmm5        \n\t" //FIXME try aligned reads and shifts
+                "paddw  %%xmm6, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "paddw  %%xmm7, %%xmm1        \n\t"
+                "paddw  %%xmm7, %%xmm5        \n\t"
+                "pavgw  %%xmm1, %%xmm0        \n\t"
+                "pavgw  %%xmm5, %%xmm4        \n\t"
+                "psubw  %%xmm7, %%xmm0        \n\t"
+                "psubw  %%xmm7, %%xmm4        \n\t"
+                "psraw      $1, %%xmm0        \n\t"
+                "psraw      $1, %%xmm4        \n\t"
+                "movdqa   (%0), %%xmm1        \n\t"
+                "movdqa 16(%0), %%xmm5        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "psraw      $2, %%xmm0        \n\t"
+                "psraw      $2, %%xmm4        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+        }
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw    (%1), %%xmm2        \n\t"
+                "paddw  16(%1), %%xmm6        \n\t"
+                "movdqu   (%0), %%xmm0        \n\t"
+                "movdqu 16(%0), %%xmm4        \n\t"
+                "paddw  %%xmm2, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "psraw      $1, %%xmm2        \n\t"
+                "psraw      $1, %%xmm6        \n\t"
+                "paddw  %%xmm0, %%xmm2        \n\t"
+                "paddw  %%xmm4, %%xmm6        \n\t"
+                "movdqa %%xmm2, (%2)          \n\t"
+                "movdqa %%xmm6, 16(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x3E) != 0x3E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=62; i>=0; i-=64){
+            __asm__ volatile(
+                "movdqa      (%1), %%xmm0       \n\t"
+                "movdqa    16(%1), %%xmm2       \n\t"
+                "movdqa    32(%1), %%xmm4       \n\t"
+                "movdqa    48(%1), %%xmm6       \n\t"
+                "movdqa      (%1), %%xmm1       \n\t"
+                "movdqa    16(%1), %%xmm3       \n\t"
+                "movdqa    32(%1), %%xmm5       \n\t"
+                "movdqa    48(%1), %%xmm7       \n\t"
+                "punpcklwd   (%2), %%xmm0       \n\t"
+                "punpcklwd 16(%2), %%xmm2       \n\t"
+                "punpcklwd 32(%2), %%xmm4       \n\t"
+                "punpcklwd 48(%2), %%xmm6       \n\t"
+                "movdqa    %%xmm0, (%0)         \n\t"
+                "movdqa    %%xmm2, 32(%0)       \n\t"
+                "movdqa    %%xmm4, 64(%0)       \n\t"
+                "movdqa    %%xmm6, 96(%0)       \n\t"
+                "punpckhwd   (%2), %%xmm1       \n\t"
+                "punpckhwd 16(%2), %%xmm3       \n\t"
+                "punpckhwd 32(%2), %%xmm5       \n\t"
+                "punpckhwd 48(%2), %%xmm7       \n\t"
+                "movdqa    %%xmm1, 16(%0)       \n\t"
+                "movdqa    %%xmm3, 48(%0)       \n\t"
+                "movdqa    %%xmm5, 80(%0)       \n\t"
+                "movdqa    %%xmm7, 112(%0)      \n\t"
+                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+
+        i = 1;
+        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+        __asm__ volatile(
+            "pcmpeqw    %%mm7, %%mm7         \n\t"
+            "pcmpeqw    %%mm3, %%mm3         \n\t"
+            "psllw         $1, %%mm3         \n\t"
+            "paddw      %%mm7, %%mm3         \n\t"
+            "psllw        $13, %%mm3         \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "paddw   %%mm7, %%mm2        \n\t"
+                "paddw   %%mm7, %%mm6        \n\t"
+                "pmulhw  %%mm3, %%mm2        \n\t"
+                "pmulhw  %%mm3, %%mm6        \n\t"
+                "paddw    (%0), %%mm2        \n\t"
+                "paddw   8(%0), %%mm6        \n\t"
+                "movq    %%mm2, (%0)         \n\t"
+                "movq    %%mm6, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubw   %%mm2, %%mm0        \n\t"
+                "psubw   %%mm6, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+
+        i = 1;
+        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+        __asm__ volatile(
+            "psllw         $15, %%mm7        \n\t"
+            "pcmpeqw     %%mm6, %%mm6        \n\t"
+            "psrlw         $13, %%mm6        \n\t"
+            "paddw       %%mm7, %%mm6        \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm0        \n\t"
+                "movq    8(%1), %%mm4        \n\t"
+                "movq    2(%1), %%mm1        \n\t"
+                "movq   10(%1), %%mm5        \n\t"
+                "paddw   %%mm6, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "paddw   %%mm7, %%mm1        \n\t"
+                "paddw   %%mm7, %%mm5        \n\t"
+                "pavgw   %%mm1, %%mm0        \n\t"
+                "pavgw   %%mm5, %%mm4        \n\t"
+                "psubw   %%mm7, %%mm0        \n\t"
+                "psubw   %%mm7, %%mm4        \n\t"
+                "psraw      $1, %%mm0        \n\t"
+                "psraw      $1, %%mm4        \n\t"
+                "movq     (%0), %%mm1        \n\t"
+                "movq    8(%0), %%mm5        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "psraw      $2, %%mm0        \n\t"
+                "psraw      $2, %%mm4        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+        i = 0;
+
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq    2(%1), %%mm2        \n\t"
+                "movq   10(%1), %%mm6        \n\t"
+                "paddw    (%1), %%mm2        \n\t"
+                "paddw   8(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "paddw   %%mm2, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "psraw      $1, %%mm2        \n\t"
+                "psraw      $1, %%mm6        \n\t"
+                "paddw   %%mm0, %%mm2        \n\t"
+                "paddw   %%mm4, %%mm6        \n\t"
+                "movq    %%mm2, (%2)         \n\t"
+                "movq    %%mm6, 8(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x1E) != 0x1E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=30; i>=0; i-=32){
+            __asm__ volatile(
+                "movq        (%1), %%mm0       \n\t"
+                "movq       8(%1), %%mm2       \n\t"
+                "movq      16(%1), %%mm4       \n\t"
+                "movq      24(%1), %%mm6       \n\t"
+                "movq        (%1), %%mm1       \n\t"
+                "movq       8(%1), %%mm3       \n\t"
+                "movq      16(%1), %%mm5       \n\t"
+                "movq      24(%1), %%mm7       \n\t"
+                "punpcklwd   (%2), %%mm0       \n\t"
+                "punpcklwd  8(%2), %%mm2       \n\t"
+                "punpcklwd 16(%2), %%mm4       \n\t"
+                "punpcklwd 24(%2), %%mm6       \n\t"
+                "movq       %%mm0, (%0)        \n\t"
+                "movq       %%mm2, 16(%0)      \n\t"
+                "movq       %%mm4, 32(%0)      \n\t"
+                "movq       %%mm6, 48(%0)      \n\t"
+                "punpckhwd   (%2), %%mm1       \n\t"
+                "punpckhwd  8(%2), %%mm3       \n\t"
+                "punpckhwd 16(%2), %%mm5       \n\t"
+                "punpckhwd 24(%2), %%mm7       \n\t"
+                "movq       %%mm1, 8(%0)       \n\t"
+                "movq       %%mm3, 24(%0)      \n\t"
+                "movq       %%mm5, 40(%0)      \n\t"
+                "movq       %%mm7, 56(%0)      \n\t"
+                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"FF_REG_d"), %%"t0"      \n\t"\
+        ""op" 16("r",%%"FF_REG_d"), %%"t1"    \n\t"\
+        ""op" 32("r",%%"FF_REG_d"), %%"t2"    \n\t"\
+        ""op" 48("r",%%"FF_REG_d"), %%"t3"    \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "psubw %%"s0", %%"t0" \n\t"\
+        "psubw %%"s1", %%"t1" \n\t"\
+        "psubw %%"s2", %%"t2" \n\t"\
+        "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+        "movdqa %%"s0", ("w",%%"FF_REG_d")    \n\t"\
+        "movdqa %%"s1", 16("w",%%"FF_REG_d")  \n\t"\
+        "movdqa %%"s2", 32("w",%%"FF_REG_d")  \n\t"\
+        "movdqa %%"s3", 48("w",%%"FF_REG_d")  \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+        "psraw $"n", %%"t0" \n\t"\
+        "psraw $"n", %%"t1" \n\t"\
+        "psraw $"n", %%"t2" \n\t"\
+        "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "paddw %%"s0", %%"t0" \n\t"\
+        "paddw %%"s1", %%"t1" \n\t"\
+        "paddw %%"s2", %%"t2" \n\t"\
+        "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "pmulhw %%"s0", %%"t0" \n\t"\
+        "pmulhw %%"s1", %%"t1" \n\t"\
+        "pmulhw %%"s2", %%"t2" \n\t"\
+        "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movdqa %%"s0", %%"t0" \n\t"\
+        "movdqa %%"s1", %%"t1" \n\t"\
+        "movdqa %%"s2", %%"t2" \n\t"\
+        "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+
+    while(i & 0x1F)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i;
+
+         __asm__ volatile (
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+        "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
+        "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm0, %%xmm2                   \n\t"
+        "psllw         $13, %%xmm2                   \n\t"
+        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7                      \n\t"
+        "pcmpeqw %%xmm5, %%xmm5                      \n\t"
+        "psllw $15, %%xmm7                           \n\t"
+        "psrlw $13, %%xmm5                           \n\t"
+        "paddw %%xmm7, %%xmm5                        \n\t"
+        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq   (%2,%%"FF_REG_d"), %%xmm1            \n\t"
+        "movq  8(%2,%%"FF_REG_d"), %%xmm3            \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm0                        \n\t"
+        "pavgw %%xmm3, %%xmm2                        \n\t"
+        "movq 16(%2,%%"FF_REG_d"), %%xmm1            \n\t"
+        "movq 24(%2,%%"FF_REG_d"), %%xmm3            \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm4                        \n\t"
+        "pavgw %%xmm3, %%xmm6                        \n\t"
+        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+        "2:                                          \n\t"
+        "sub $64, %%"FF_REG_d"                       \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"FF_REG_d"), %%"t0"   \n\t"\
+        ""op" 8("r",%%"FF_REG_d"), %%"t1"  \n\t"\
+        ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
+        ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+        "movq %%"s0", ("w",%%"FF_REG_d")   \n\t"\
+        "movq %%"s1", 8("w",%%"FF_REG_d")  \n\t"\
+        "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
+        "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movq %%"s0", %%"t0" \n\t"\
+        "movq %%"s1", %%"t1" \n\t"\
+        "movq %%"s2", %%"t2" \n\t"\
+        "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+    while(i & 15)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i;
+    __asm__ volatile(
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+
+        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+        "pcmpeqw    %%mm0, %%mm0                     \n\t"
+        "pcmpeqw    %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm0, %%mm2                     \n\t"
+        "psllw        $13, %%mm2                     \n\t"
+        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7                        \n\t"
+        "pcmpeqw %%mm5, %%mm5                        \n\t"
+        "psllw $15, %%mm7                            \n\t"
+        "psrlw $13, %%mm5                            \n\t"
+        "paddw %%mm7, %%mm5                          \n\t"
+        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq   (%2,%%"FF_REG_d"), %%mm1             \n\t"
+        "movq  8(%2,%%"FF_REG_d"), %%mm3             \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm0                          \n\t"
+        "pavgw %%mm3, %%mm2                          \n\t"
+        "movq 16(%2,%%"FF_REG_d"), %%mm1             \n\t"
+        "movq 24(%2,%%"FF_REG_d"), %%mm3             \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm4                          \n\t"
+        "pavgw %%mm3, %%mm6                          \n\t"
+        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+        "2:                                          \n\t"
+        "sub $32, %%"FF_REG_d"                       \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#if HAVE_6REGS
+#define snow_inner_add_yblock_sse2_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"FF_REG_c"          \n\t"\
+             "mov  %6, %2                    \n\t"\
+             "mov  %4, %%"FF_REG_S"          \n\t"\
+             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
+             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
+             "psllw $15, %%xmm3              \n\t"\
+             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"FF_REG_D"           \n\t"\
+             "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
+             "add %3, %%"FF_REG_D"           \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+             "movq (%%"FF_REG_d"), %%"out_reg1"                           \n\t"\
+             "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2"             \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"FF_REG_S"), %%xmm0    \n\t"\
+             "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+             "movq (%%"FF_REG_d"), %%"out_reg1"                           \n\t"\
+             "movq 8(%%"FF_REG_d"), %%"out_reg2"                          \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"FF_REG_S"), %%xmm0   \n\t"\
+             "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+             "add $32, %%"FF_REG_S"                            \n\t"\
+             "add %%"FF_REG_c", %0                             \n\t"\
+             "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
+             "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
+             "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
+             "add %%"FF_REG_c", (%%"FF_REG_a")                 \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+             XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
+             "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+             "sal $1, %%"FF_REG_c"                \n\t"\
+             "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "sar $1, %%"FF_REG_c"           \n\t"\
+             "sub $2, %2                     \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+             "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "dec %2                         \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+             "mov %0, %%"FF_REG_d"           \n\t"
+             "movdqa (%%"FF_REG_D"), %%xmm0  \n\t"
+             "movdqa %%xmm1, %%xmm2          \n\t"
+
+             "punpckhwd %%xmm7, %%xmm1       \n\t"
+             "punpcklwd %%xmm7, %%xmm2       \n\t"
+             "paddd %%xmm2, %%xmm0           \n\t"
+             "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
+             "paddd %%xmm1, %%xmm2           \n\t"
+             "paddd %%xmm3, %%xmm0           \n\t"
+             "paddd %%xmm3, %%xmm2           \n\t"
+
+             "mov %1, %%"FF_REG_D"           \n\t"
+             "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
+             "add %3, %%"FF_REG_D"           \n\t"
+
+             "movdqa (%%"FF_REG_D"), %%xmm4  \n\t"
+             "movdqa %%xmm5, %%xmm6          \n\t"
+             "punpckhwd %%xmm7, %%xmm5       \n\t"
+             "punpcklwd %%xmm7, %%xmm6       \n\t"
+             "paddd %%xmm6, %%xmm4           \n\t"
+             "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
+             "paddd %%xmm5, %%xmm6           \n\t"
+             "paddd %%xmm3, %%xmm4           \n\t"
+             "paddd %%xmm3, %%xmm6           \n\t"
+
+             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm2, %%xmm0        \n\t"
+             "packuswb %%xmm7, %%xmm0        \n\t"
+             "movq %%xmm0, (%%"FF_REG_d")    \n\t"
+
+             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm6, %%xmm4        \n\t"
+             "packuswb %%xmm7, %%xmm4        \n\t"
+             "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+             "mov %0, %%"FF_REG_d"           \n\t"
+             "psrlw $4, %%xmm1               \n\t"
+             "psrlw $4, %%xmm5               \n\t"
+             "paddw   (%%"FF_REG_D"), %%xmm1 \n\t"
+             "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
+             "paddw %%xmm3, %%xmm1           \n\t"
+             "paddw %%xmm3, %%xmm5           \n\t"
+             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
+             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
+             "packuswb %%xmm5, %%xmm1        \n\t"
+
+             "movdqu %%xmm1, (%%"FF_REG_d")  \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"FF_REG_c"          \n\t"\
+             "mov  %6, %2                    \n\t"\
+             "mov  %4, %%"FF_REG_S"          \n\t"\
+             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
+             "pcmpeqd %%mm3, %%mm3           \n\t"\
+             "psllw $15, %%mm3               \n\t"\
+             "psrlw $12, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"FF_REG_D"           \n\t"\
+             "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
+             "add %3, %%"FF_REG_D"           \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+             "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
+             "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1"                 \n\t"\
+             "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2"               \n\t"\
+             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+             "movd "s_offset"(%%"FF_REG_S"), %%mm0   \n\t"\
+             "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
+             "punpcklbw %%mm7, %%mm0       \n\t"\
+             "punpcklbw %%mm7, %%mm4       \n\t"\
+             "pmullw %%mm0, %%"out_reg1"   \n\t"\
+             "pmullw %%mm4, %%"out_reg2"   \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+             "paddusw %%mm2, %%mm1         \n\t"\
+             "paddusw %%mm6, %%mm5         \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+             "mov %0, %%"FF_REG_d"           \n\t"\
+             "psrlw $4, %%mm1                \n\t"\
+             "psrlw $4, %%mm5                \n\t"\
+             "paddw "read_offset"(%%"FF_REG_D"), %%mm1   \n\t"\
+             "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
+             "paddw %%mm3, %%mm1             \n\t"\
+             "paddw %%mm3, %%mm5             \n\t"\
+             "psraw $4, %%mm1                \n\t"\
+             "psraw $4, %%mm5                \n\t"\
+             "packuswb %%mm5, %%mm1          \n\t"\
+             "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+             "add $"s_step", %%"FF_REG_S"                      \n\t"\
+             "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
+             "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
+             "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
+             "add %%"FF_REG_c", (%%"FF_REG_a")                 \n\t"\
+             "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1             \n\t"\
+             "add %%"FF_REG_c", %0                             \n\t"\
+             "dec %2                         \n\t"\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+             "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16) {
+        if (!(b_h & 1))
+            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+        else
+            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else
+         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16)
+        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else
+        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+    int mm_flags = av_get_cpu_flags();
+
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+        if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+#if HAVE_6REGS
+            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+#endif
+        }
+        else{
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+            }
+#if HAVE_6REGS
+            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+#endif
+        }
+    }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm
new file mode 100644
index 0000000..a876328
--- /dev/null
+++ b/libavcodec/x86/svq1enc.asm
@@ -0,0 +1,61 @@
+;******************************************************************************
+;* SIMD-optimized SVQ1 encoder functions
+;* Copyright (c) 2007 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSD_INT8_VS_INT16 0
+cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size
+    pxor m0, m0
+.loop:
+    sub       sizeq, 8
+    movq      m1, [pix1q + sizeq]
+    mova      m2, [pix2q + sizeq*2]
+%if mmsize == 8
+    movq      m3, [pix2q + sizeq*2 + mmsize]
+    punpckhbw m4, m1
+    punpcklbw m1, m1
+    psraw     m4, 8
+    psraw     m1, 8
+    psubw     m3, m4
+    psubw     m2, m1
+    pmaddwd   m3, m3
+    pmaddwd   m2, m2
+    paddd     m0, m3
+    paddd     m0, m2
+%else
+    punpcklbw m1, m1
+    psraw     m1, 8
+    psubw     m2, m1
+    pmaddwd   m2, m2
+    paddd     m0, m2
+%endif
+    jg .loop
+    HADDD     m0, m1
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmx
+SSD_INT8_VS_INT16
+INIT_XMM sse2
+SSD_INT8_VS_INT16
diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c
deleted file mode 100644
index 02b0a84..0000000
--- a/libavcodec/x86/svq1enc.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/svq1enc.h"
-
-#if HAVE_INLINE_ASM
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
-                                 int size)
-{
-    int sum;
-    x86_reg i = size;
-
-    __asm__ volatile (
-        "pxor %%mm4, %%mm4 \n"
-        "1: \n"
-        "sub $8, %0 \n"
-        "movq (%2, %0), %%mm2 \n"
-        "movq (%3, %0, 2), %%mm0 \n"
-        "movq 8(%3, %0, 2), %%mm1 \n"
-        "punpckhbw %%mm2, %%mm3 \n"
-        "punpcklbw %%mm2, %%mm2 \n"
-        "psraw $8, %%mm3 \n"
-        "psraw $8, %%mm2 \n"
-        "psubw %%mm3, %%mm1 \n"
-        "psubw %%mm2, %%mm0 \n"
-        "pmaddwd %%mm1, %%mm1 \n"
-        "pmaddwd %%mm0, %%mm0 \n"
-        "paddd %%mm1, %%mm4 \n"
-        "paddd %%mm0, %%mm4 \n"
-        "jg 1b \n"
-        "movq %%mm4, %%mm3 \n"
-        "psrlq $32, %%mm3 \n"
-        "paddd %%mm3, %%mm4 \n"
-        "movd %%mm4, %1 \n"
-        : "+r" (i), "=r" (sum)
-        : "r" (pix1), "r" (pix2));
-
-    return sum;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (INLINE_MMX(cpu_flags)) {
-        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c
new file mode 100644
index 0000000..40b4b0e
--- /dev/null
+++ b/libavcodec/x86/svq1enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/svq1enc.h"
+
+int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+                             intptr_t size);
+int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
+                              intptr_t size);
+
+av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
+    }
+}
diff --git a/libavcodec/x86/synth_filter.asm b/libavcodec/x86/synth_filter.asm
new file mode 100644
index 0000000..bc1a48f
--- /dev/null
+++ b/libavcodec/x86/synth_filter.asm
@@ -0,0 +1,246 @@
+;******************************************************************************
+;* SSE-optimized functions for the DCA decoder
+;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+    pxor          %1, %1
+%else
+    xorps         %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+    mova          %3, [%2 - 16]
+    vperm2f128    %1, %3, %3, 1
+    vshufps       %1, %1, %1, q0123
+%elif cpuflag(sse2)
+    pshufd        %1, [%2], q0123
+%else
+    mova          %1, [%2]
+    shufps        %1, %1, q0123
+%endif
+%endmacro
+
+%macro INNER_LOOP   1
+    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
+    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
+    ;~ b += window[i + j + 16] * (synth_buf[i + j])
+    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
+    mova          m6, [ptr1 + j]
+%if ARCH_X86_64
+    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
+    mova         m12, [ptr1 + j + mmsize]
+%endif
+%if cpuflag(fma3)
+    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
+    fnmaddps      m1, m5,  [win + %1 + j], m1
+%if ARCH_X86_64
+    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
+    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
+%endif
+%else ; non-FMA
+    mulps         m6, m6,  [win + %1 + j + 16 * 4]
+    mulps         m5, m5,  [win + %1 + j]
+%if ARCH_X86_64
+    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
+    mulps        m11, m11, [win + %1 + j + mmsize]
+%endif
+    addps         m2, m2, m6
+    subps         m1, m1, m5
+%if ARCH_X86_64
+    addps         m8, m8, m12
+    subps         m7, m7, m11
+%endif
+%endif ; cpuflag(fma3)
+    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
+    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
+    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
+    mova          m5, [ptr1 + j + 16 * 4]
+%if ARCH_X86_64
+    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
+    mova         m11, [ptr1 + j + mmsize + 16 * 4]
+%endif
+%if cpuflag(fma3)
+    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
+    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
+%if ARCH_X86_64
+    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
+    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
+%endif
+%else ; non-FMA
+    mulps         m5, m5,  [win + %1 + j + 32 * 4]
+    mulps         m6, m6,  [win + %1 + j + 48 * 4]
+%if ARCH_X86_64
+    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
+    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
+%endif
+    addps         m3, m3, m5
+    addps         m4, m4, m6
+%if ARCH_X86_64
+    addps         m9, m9, m11
+    addps        m10, m10, m12
+%endif
+%endif ; cpuflag(fma3)
+    sub            j, 64 * 4
+%endmacro
+
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+;                                  const float window[512], float out[32],
+;                                  intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
+cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
+                              synth_buf, synth_buf2, window, out, off, scale
+%define scale m0
+%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+    movd       scale, scalem
+    SPLATD        m0
+%else
+    VBROADCASTSS  m0, scalem
+%endif
+; Make sure offset is in a register and not on the stack
+%define OFFQ  r4q
+%else
+    SPLATD      xmm0
+%if cpuflag(avx)
+    vinsertf128   m0, m0, xmm0, 1
+%endif
+%define OFFQ  offq
+%endif
+    ; prepare inner counter limit 1
+    mov          r5q, 480
+    sub          r5q, offmp
+    and          r5q, -64
+    shl          r5q, 2
+%if ARCH_X86_32 || notcpuflag(avx)
+    mov         OFFQ, r5q
+%define i        r5q
+    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
+%else
+%define i 0
+%define OFFQ  r5q
+%endif
+
+%define buf2     synth_buf2q
+%if ARCH_X86_32
+    mov         buf2, synth_buf2mp
+%endif
+.mainloop:
+    ; m1 = a  m2 = b  m3 = c  m4 = d
+    SETZERO       m3
+    SETZERO       m4
+    mova          m1, [buf2 + i]
+    mova          m2, [buf2 + i + 16 * 4]
+%if ARCH_X86_32
+%define ptr1     r0q
+%define ptr2     r1q
+%define win      r2q
+%define j        r3q
+    mov          win, windowm
+    mov         ptr1, synth_bufm
+%if ARCH_X86_32 || notcpuflag(avx)
+    add          win, i
+    add         ptr1, i
+%endif
+%else ; ARCH_X86_64
+%define ptr1     r6q
+%define ptr2     r7q ; must be loaded
+%define win      r8q
+%define j        r9q
+    SETZERO       m9
+    SETZERO      m10
+    mova          m7, [buf2 + i + mmsize]
+    mova          m8, [buf2 + i + mmsize + 16 * 4]
+    lea          win, [windowq + i]
+    lea         ptr1, [synth_bufq + i]
+%endif
+    mov         ptr2, synth_bufmp
+    ; prepare the inner loop counter
+    mov            j, OFFQ
+%if ARCH_X86_32 || notcpuflag(avx)
+    sub         ptr2, i
+%endif
+.loop1:
+    INNER_LOOP  0
+    jge       .loop1
+
+    mov            j, 448 * 4
+    sub            j, OFFQ
+    jz          .end
+    sub         ptr1, j
+    sub         ptr2, j
+    add          win, OFFQ ; now at j-64, so define OFFSET
+    sub            j, 64 * 4
+.loop2:
+    INNER_LOOP  64 * 4
+    jge       .loop2
+
+.end:
+%if ARCH_X86_32
+    mov         buf2, synth_buf2m ; needed for next iteration anyway
+    mov         outq, outmp       ; j, which will be set again during it
+%endif
+    ;~ out[i]      = a * scale;
+    ;~ out[i + 16] = b * scale;
+    mulps         m1, m1, scale
+    mulps         m2, m2, scale
+%if ARCH_X86_64
+    mulps         m7, m7, scale
+    mulps         m8, m8, scale
+%endif
+    ;~ synth_buf2[i]      = c;
+    ;~ synth_buf2[i + 16] = d;
+    mova   [buf2 + i +  0 * 4], m3
+    mova   [buf2 + i + 16 * 4], m4
+%if ARCH_X86_64
+    mova   [buf2 + i +  0 * 4 + mmsize], m9
+    mova   [buf2 + i + 16 * 4 + mmsize], m10
+%endif
+    ;~ out[i]      = a;
+    ;~ out[i + 16] = a;
+    mova   [outq + i +  0 * 4], m1
+    mova   [outq + i + 16 * 4], m2
+%if ARCH_X86_64
+    mova   [outq + i +  0 * 4 + mmsize], m7
+    mova   [outq + i + 16 * 4 + mmsize], m8
+%endif
+%if ARCH_X86_32 || notcpuflag(avx)
+    sub            i, (ARCH_X86_64 + 1) * mmsize
+    jge    .mainloop
+%endif
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+INIT_YMM avx
+SYNTH_FILTER
+INIT_YMM fma3
+SYNTH_FILTER
diff --git a/libavcodec/x86/synth_filter_init.c b/libavcodec/x86/synth_filter_init.c
new file mode 100644
index 0000000..35e2b47
--- /dev/null
+++ b/libavcodec/x86/synth_filter_init.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/synth_filter.h"
+
+#define SYNTH_FILTER_FUNC(opt)                                                 \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
+                                 const float window[512],                      \
+                                 float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct,                              \
+                               float *synth_buf_ptr, int *synth_buf_offset,    \
+                               float synth_buf2[32], const float window[512],  \
+                               float out[32], const float in[32], float scale) \
+{                                                                              \
+    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
+                                                                               \
+    imdct->imdct_half(imdct, synth_buf, in);                                   \
+                                                                               \
+    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
+                                out, *synth_buf_offset, scale);                \
+                                                                               \
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
+}                                                                              \
+
+#if HAVE_X86ASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
+#endif /* HAVE_X86ASM */
+
+av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->synth_filter_float = synth_filter_sse;
+    }
+#endif
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->synth_filter_float = synth_filter_sse2;
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        s->synth_filter_float = synth_filter_avx;
+    }
+    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+        s->synth_filter_float = synth_filter_fma3;
+    }
+#endif /* HAVE_X86ASM */
+}
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
new file mode 100644
index 0000000..5f3ded3
--- /dev/null
+++ b/libavcodec/x86/takdsp.asm
@@ -0,0 +1,116 @@
+;******************************************************************************
+;* TAK DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_128: times 4 dd 128
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
+    shl                     lengthd, 2
+    add                         p1q, lengthq
+    add                         p2q, lengthq
+    neg                     lengthq
+.loop:
+    mova                         m0, [p1q+lengthq+mmsize*0]
+    mova                         m1, [p1q+lengthq+mmsize*1]
+    paddd                        m0, [p2q+lengthq+mmsize*0]
+    paddd                        m1, [p2q+lengthq+mmsize*1]
+    mova     [p2q+lengthq+mmsize*0], m0
+    mova     [p2q+lengthq+mmsize*1], m1
+    add                     lengthq, mmsize*2
+    jl .loop
+    REP_RET
+
+cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
+    shl                     lengthd, 2
+    add                         p1q, lengthq
+    add                         p2q, lengthq
+    neg                     lengthq
+
+.loop:
+    mova                         m0, [p2q+lengthq+mmsize*0]
+    mova                         m1, [p2q+lengthq+mmsize*1]
+    psubd                        m0, [p1q+lengthq+mmsize*0]
+    psubd                        m1, [p1q+lengthq+mmsize*1]
+    mova     [p1q+lengthq+mmsize*0], m0
+    mova     [p1q+lengthq+mmsize*1], m1
+    add                     lengthq, mmsize*2
+    jl .loop
+    REP_RET
+
+cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
+    shl                     lengthd, 2
+    add                         p1q, lengthq
+    add                         p2q, lengthq
+    neg                     lengthq
+
+.loop:
+    mova                         m0, [p1q+lengthq]
+    mova                         m1, [p2q+lengthq]
+    mova                         m3, [p1q+lengthq+mmsize]
+    mova                         m4, [p2q+lengthq+mmsize]
+    mova                         m2, m1
+    mova                         m5, m4
+    psrad                        m2, 1
+    psrad                        m5, 1
+    psubd                        m0, m2
+    psubd                        m3, m5
+    paddd                        m1, m0
+    paddd                        m4, m3
+    mova              [p1q+lengthq], m0
+    mova              [p2q+lengthq], m1
+    mova       [p1q+lengthq+mmsize], m3
+    mova       [p2q+lengthq+mmsize], m4
+    add                     lengthq, mmsize*2
+    jl .loop
+    REP_RET
+
+INIT_XMM sse4
+cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
+    shl             lengthd, 2
+    add                 p1q, lengthq
+    add                 p2q, lengthq
+    neg             lengthq
+
+    movd                 m2, dshiftm
+    movd                 m3, dfactorm
+    pshufd               m3, m3, 0
+    mova                 m4, [pd_128]
+
+.loop:
+    mova                 m0, [p1q+lengthq]
+    mova                 m1, [p2q+lengthq]
+    psrad                m1, m2
+    pmulld               m1, m3
+    paddd                m1, m4
+    psrad                m1, 8
+    pslld                m1, m2
+    psubd                m1, m0
+    mova      [p1q+lengthq], m1
+    add             lengthq, mmsize
+    jl .loop
+    REP_RET
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
new file mode 100644
index 0000000..fe0c846
--- /dev/null
+++ b/libavcodec/x86/takdsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/takdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+
+av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
+        c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
+        c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
+    }
+#endif
+}
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000..db12a32
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224:  dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal tta_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
+    mova       m2, [qmq       ]
+    mova       m3, [qmq + 0x10]
+    mova       m4, [dxq       ]
+    mova       m5, [dxq + 0x10]
+
+    movd       m6, [errorq]         ; if (filter->error < 0) {
+    SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
+    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
+    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
+    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
+    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
+    mova       [qmq       ], m2     ; }
+    mova       [qmq + 0x10], m3     ;
+
+    mova       m0, [dlq       ]
+    mova       m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+    pmulld     m2, m0
+    pmulld     m3, m1
+%else
+    pshufd     m6, m0, 0xb1
+    pshufd     m7, m2, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m2, m0
+    pshufd     m2, m2, 0xd8
+    punpckldq  m2, m6
+
+    pshufd     m6, m1, 0xb1
+    pshufd     m7, m3, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m3, m1
+    pshufd     m3, m3, 0xd8
+    punpckldq  m3, m6
+%endif
+    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+    paddd      m2, m3               ; int sum = filter->round +
+                                    ;           filter->dl[0] * filter->qm[0] +
+    pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
+    paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
+                                    ;           filter->dl[3] * filter->qm[3] +
+    movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
+    paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
+    pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
+    paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];
+
+    palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+                                    ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+    palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+                                    ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+    psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+    por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+    pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+                                    ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+    mova       [dlq       ], m2
+    mova       [dxq       ], m5
+    mova       [dxq + 0x10], m4
+    movd       m0, [inq]            ; filter->error = *in;
+    movd       [errorq], m0         ;
+
+    movd       m2, shiftm           ; *in += (sum >> filter->shift);
+    psrad      m6, m2               ;
+    paddd      m0, m6               ;
+    movd       [inq], m0            ;
+
+    psrldq     m1, 4                ;
+    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
+    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
+    psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
+    psrldq     m1, m0, 4            ; filter->dl[7] = *in;
+    pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
+    paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
+    psrldq     m1, 4                ;
+    paddd      m0, m1               ;
+    mova       [dlq + 0x10], m0     ;
+    RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4,  7
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000..7441c97
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tta_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+                                 int32_t *error, int32_t *in, int32_t shift,
+                                 int32_t round);
+void ff_tta_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+                                int32_t *error, int32_t *in, int32_t shift,
+                                int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->filter_process = ff_tta_filter_process_ssse3;
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->filter_process = ff_tta_filter_process_sse4;
+#endif
+}
diff --git a/libavcodec/x86/ttaencdsp.asm b/libavcodec/x86/ttaencdsp.asm
new file mode 100644
index 0000000..c9cbd49
--- /dev/null
+++ b/libavcodec/x86/ttaencdsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA Encoder DSP SIMD optimizations
+;*
+;* Copyright (C) 2014-2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224:  dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTAENC_FILTER 2
+INIT_XMM %1
+cglobal ttaenc_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
+    mova       m2, [qmq       ]
+    mova       m3, [qmq + 0x10]
+    mova       m4, [dxq       ]
+    mova       m5, [dxq + 0x10]
+
+    movd       m6, [errorq]         ; if (filter->error < 0) {
+    SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
+    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
+    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
+    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
+    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
+    mova       [qmq       ], m2     ; }
+    mova       [qmq + 0x10], m3     ;
+
+    mova       m0, [dlq       ]
+    mova       m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+    pmulld     m2, m0
+    pmulld     m3, m1
+%else
+    pshufd     m6, m0, 0xb1
+    pshufd     m7, m2, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m2, m0
+    pshufd     m2, m2, 0xd8
+    punpckldq  m2, m6
+
+    pshufd     m6, m1, 0xb1
+    pshufd     m7, m3, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m3, m1
+    pshufd     m3, m3, 0xd8
+    punpckldq  m3, m6
+%endif
+    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+    paddd      m2, m3               ; int sum = filter->round +
+                                    ;           filter->dl[0] * filter->qm[0] +
+    pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
+    paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
+                                    ;           filter->dl[3] * filter->qm[3] +
+    movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
+    paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
+    pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
+    paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];
+
+    palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+                                    ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+    palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+                                    ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+    psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+    por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+    pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+                                    ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+    mova       [dlq       ], m2
+    mova       [dxq       ], m5
+    mova       [dxq + 0x10], m4
+
+    movd       m2, shiftm           ;
+    movd       m0, [inq]            ;
+    psrad      m6, m2               ;
+    psubd      m3, m0, m6           ;
+    movd       [inq], m3            ; *in -= (sum >> filter->shift);
+    movd       [errorq], m3         ; filter->error = *in;
+
+    psrldq     m1, 4                ;
+    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
+    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
+    psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
+    psrldq     m1, m0, 4            ; filter->dl[7] = *in;
+    pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
+    paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
+    psrldq     m1, 4                ;
+    paddd      m0, m1               ;
+    mova       [dlq + 0x10], m0     ;
+    RET
+%endmacro
+
+TTAENC_FILTER ssse3, 8
+TTAENC_FILTER sse4,  7
diff --git a/libavcodec/x86/ttaencdsp_init.c b/libavcodec/x86/ttaencdsp_init.c
new file mode 100644
index 0000000..61971a4
--- /dev/null
+++ b/libavcodec/x86/ttaencdsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014-2016 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttaencdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttaenc_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round);
+void ff_ttaenc_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+                                   int32_t *error, int32_t *in, int32_t shift,
+                                   int32_t round);
+
+av_cold void ff_ttaencdsp_init_x86(TTAEncDSPContext *c)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->filter_process = ff_ttaenc_filter_process_ssse3;
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->filter_process = ff_ttaenc_filter_process_sse4;
+#endif
+}
diff --git a/libavcodec/x86/utvideodsp.asm b/libavcodec/x86/utvideodsp.asm
new file mode 100644
index 0000000..b799c44
--- /dev/null
+++ b/libavcodec/x86/utvideodsp.asm
@@ -0,0 +1,137 @@
+;******************************************************************************
+;* SIMD-optimized UTVideo functions
+;* Copyright (c) 2017 Paul B Mahol
+;* Copyright (c) 2017 Jokyo Images
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+cextern pw_512
+cextern pw_1023
+
+SECTION .text
+
+;-------------------------------------------------------------------------------------------
+; void restore_rgb_planes(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
+;                         ptrdiff_t linesize_r, ptrdiff_t linesize_g, ptrdiff_t linesize_b,
+;                         int width, int height)
+;-------------------------------------------------------------------------------------------
+%macro RESTORE_RGB_PLANES 0
+cglobal restore_rgb_planes, 7 + ARCH_X86_64, 7 + ARCH_X86_64 * 2, 4, src_r, src_g, src_b, linesize_r, linesize_g, linesize_b, w, h, x
+    movsxdifnidn wq, wd
+    add      src_rq, wq
+    add      src_gq, wq
+    add      src_bq, wq
+    neg          wq
+%if ARCH_X86_64 == 0
+    mov          wm, wq
+DEFINE_ARGS src_r, src_g, src_b, linesize_r, linesize_g, linesize_b, x
+%define wq r6m
+%define hd r7mp
+%endif
+    mova         m3, [pb_80]
+.nextrow:
+    mov          xq, wq
+
+    .loop:
+        mova           m0, [src_rq + xq]
+        mova           m1, [src_gq + xq]
+        mova           m2, [src_bq + xq]
+        psubb          m1, m3
+        paddb          m0, m1
+        paddb          m2, m1
+        mova  [src_rq+xq], m0
+        mova  [src_bq+xq], m2
+        add            xq, mmsize
+    jl .loop
+
+    add        src_rq, linesize_rq
+    add        src_gq, linesize_gq
+    add        src_bq, linesize_bq
+    sub        hd, 1
+    jg .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+RESTORE_RGB_PLANES
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RESTORE_RGB_PLANES
+%endif
+
+;-------------------------------------------------------------------------------------------
+; void restore_rgb_planes10(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
+;                         ptrdiff_t linesize_r, ptrdiff_t linesize_g, ptrdiff_t linesize_b,
+;                         int width, int height)
+;-------------------------------------------------------------------------------------------
+%macro RESTORE_RGB_PLANES10 0
+cglobal restore_rgb_planes10, 7 + ARCH_X86_64, 7 + ARCH_X86_64 * 2, 5, src_r, src_g, src_b, linesize_r, linesize_g, linesize_b, w, h, x
+    shl          wd, 1
+    shl linesize_rq, 1
+    shl linesize_gq, 1
+    shl linesize_bq, 1
+    add      src_rq, wq
+    add      src_gq, wq
+    add      src_bq, wq
+    mova         m3, [pw_512]
+    mova         m4, [pw_1023]
+    neg          wq
+%if ARCH_X86_64 == 0
+    mov          wm, wq
+DEFINE_ARGS src_r, src_g, src_b, linesize_r, linesize_g, linesize_b, x
+%define wq r6m
+%define hd r7mp
+%endif
+.nextrow:
+    mov          xq, wq
+
+    .loop:
+        mova           m0, [src_rq + xq]
+        mova           m1, [src_gq + xq]
+        mova           m2, [src_bq + xq]
+        psubw          m1, m3
+        paddw          m0, m1
+        paddw          m2, m1
+        pand           m0, m4
+        pand           m2, m4
+        mova  [src_rq+xq], m0
+        mova  [src_bq+xq], m2
+        add            xq, mmsize
+    jl .loop
+
+    add        src_rq, linesize_rq
+    add        src_gq, linesize_gq
+    add        src_bq, linesize_bq
+    sub        hd, 1
+    jg .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+RESTORE_RGB_PLANES10
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RESTORE_RGB_PLANES10
+%endif
diff --git a/libavcodec/x86/utvideodsp_init.c b/libavcodec/x86/utvideodsp_init.c
new file mode 100644
index 0000000..2b436c6
--- /dev/null
+++ b/libavcodec/x86/utvideodsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/utvideodsp.h"
+
+void ff_restore_rgb_planes_sse2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
+                                ptrdiff_t linesize_r, ptrdiff_t linesize_g,
+                                ptrdiff_t linesize_b, int width, int height);
+void ff_restore_rgb_planes_avx2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
+                                ptrdiff_t linesize_r, ptrdiff_t linesize_g,
+                                ptrdiff_t linesize_b, int width, int height);
+
+void ff_restore_rgb_planes10_sse2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
+                                  ptrdiff_t linesize_r, ptrdiff_t linesize_g,
+                                  ptrdiff_t linesize_b, int width, int height);
+void ff_restore_rgb_planes10_avx2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
+                                  ptrdiff_t linesize_r, ptrdiff_t linesize_g,
+                                  ptrdiff_t linesize_b, int width, int height);
+
+av_cold void ff_utvideodsp_init_x86(UTVideoDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->restore_rgb_planes   = ff_restore_rgb_planes_sse2;
+        c->restore_rgb_planes10 = ff_restore_rgb_planes10_sse2;
+    }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->restore_rgb_planes   = ff_restore_rgb_planes_avx2;
+        c->restore_rgb_planes10 = ff_restore_rgb_planes10_avx2;
+    }
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000..d64dbca
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void ff_v210_x86_init(V210DecContext *s)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000..c24c765
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,90 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 1
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1, 5, 5, 7
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop:
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_unpack unaligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack unaligned
+%endif
+
+INIT_XMM ssse3
+v210_planar_unpack aligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack aligned
+%endif
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 0db0196..965f2be 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -2,20 +2,20 @@
 ;* V210 SIMD pack
 ;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -23,8 +23,9 @@
 
 SECTION_RODATA 32
 
-v210_enc_min_10: times 32 dw 0x4
-v210_enc_max_10: times 32 dw 0x3fb
+cextern pw_4
+%define v210_enc_min_10 pw_4
+v210_enc_max_10: times 16 dw 0x3fb
 
 v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
 v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
@@ -32,16 +33,19 @@ v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
 v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
 v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
 
-v210_enc_min_8: times 32 db 0x1
-v210_enc_max_8: times 32 db 0xfe
+cextern pb_1
+%define v210_enc_min_8 pb_1
+cextern pb_FE
+%define v210_enc_max_8 pb_FE
 
-v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
 v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
 
-v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
 v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
 v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
 
+v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -59,16 +63,16 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
 .loop:
     movu        xm0, [yq+2*widthq]
 %if cpuflag(avx2)
-    vinserti128 m0, m0, [yq+2*widthq+12], 1
+    vinserti128 m0,   m0, [yq+widthq*2+12], 1
 %endif
     CLIPW   m0, m2, m3
 
-    movq    xm1, [uq+widthq]
-    movhps  xm1, [vq+widthq]
+    movq         xm1, [uq+widthq]
+    movhps       xm1, [vq+widthq]
 %if cpuflag(avx2)
     movq         xm4, [uq+widthq+6]
     movhps       xm4, [vq+widthq+6]
-    vinserti128  m1, m1, xm4, 1
+    vinserti128  m1,   m1, xm4, 1
 %endif
     CLIPW   m1, m2, m3
 
@@ -93,6 +97,7 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
 INIT_XMM ssse3
 v210_planar_pack_10
 %endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 v210_planar_pack_10
@@ -113,9 +118,9 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     pxor    m6, m6
 
 .loop:
-    movu        xm1, [yq+2*widthq]
+    movu        xm1, [yq+widthq*2]
 %if cpuflag(avx2)
-    vinserti128 m1, m1, [yq+2*widthq+12], 1
+    vinserti128 m1,   m1, [yq+widthq*2+12], 1
 %endif
     CLIPUB  m1, m4, m5
 
@@ -172,6 +177,7 @@ v210_planar_pack_8
 INIT_XMM avx
 v210_planar_pack_8
 %endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 v210_planar_pack_8
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index c4d2745..e997b4b 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ad..fdd4de1 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
 /*
  * VC-1 and WMV3 decoder - X86 DSP init functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 46bb906..8e0c284 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -27,6 +27,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavcodec/vc1dsp.h"
 #include "fpel.h"
 #include "vc1dsp.h"
@@ -63,11 +64,22 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
     ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
 }
 
-static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
-                                      ptrdiff_t stride, int rnd)
-{
-    ff_avg_pixels8_mmxext(dst, src, stride, 8);
-}
+#define DECLARE_FUNCTION(OP, DEPTH, INSN)                       \
+    static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst,          \
+                             const uint8_t *src, ptrdiff_t stride, int rnd) \
+    {                                                                       \
+        ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH);     \
+    }
+
+DECLARE_FUNCTION(put_,  8, _mmx)
+DECLARE_FUNCTION(put_, 16, _mmx)
+DECLARE_FUNCTION(avg_,  8, _mmx)
+DECLARE_FUNCTION(avg_, 16, _mmx)
+DECLARE_FUNCTION(avg_,  8, _mmxext)
+DECLARE_FUNCTION(avg_, 16, _mmxext)
+DECLARE_FUNCTION(put_, 16, _sse2)
+DECLARE_FUNCTION(avg_, 16, _sse2)
+
 #endif /* HAVE_X86ASM */
 
 void ff_put_vc1_chroma_mc8_nornd_mmx  (uint8_t *dst, uint8_t *src,
@@ -80,16 +92,26 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+                                    int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+                                    int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+                                    int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
+                                    int16_t *block);
 
 
 av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMX(cpu_flags))
+        if (EXTERNAL_MMX(cpu_flags))
         ff_vc1dsp_init_mmx(dsp);
 
-    if (INLINE_MMXEXT(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags))
+        if (EXTERNAL_MMXEXT(cpu_flags))
         ff_vc1dsp_init_mmxext(dsp);
 
 #define ASSIGN_LF(EXT) \
@@ -103,6 +125,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 #if HAVE_X86ASM
     if (EXTERNAL_MMX(cpu_flags)) {
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
+
+        dsp->put_vc1_mspel_pixels_tab[1][0]      = put_vc1_mspel_mc00_8_mmx;
+        dsp->put_vc1_mspel_pixels_tab[0][0]      = put_vc1_mspel_mc00_16_mmx;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_8_mmx;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_mmx;
     }
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
@@ -111,13 +138,22 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         ASSIGN_LF(mmxext);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
-        dsp->avg_vc1_mspel_pixels_tab[0]         = avg_vc1_mspel_mc00_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_8_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_mmxext;
+
+        dsp->vc1_inv_trans_8x8_dc                = ff_vc1_inv_trans_8x8_dc_mmxext;
+        dsp->vc1_inv_trans_4x8_dc                = ff_vc1_inv_trans_4x8_dc_mmxext;
+        dsp->vc1_inv_trans_8x4_dc                = ff_vc1_inv_trans_8x4_dc_mmxext;
+        dsp->vc1_inv_trans_4x4_dc                = ff_vc1_inv_trans_4x4_dc_mmxext;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
         dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
         dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+
+        dsp->put_vc1_mspel_pixels_tab[0][0]      = put_vc1_mspel_mc00_16_sse2;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_sse2;
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         ASSIGN_LF(ssse3);
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp_loopfilter.asm
index b9a770e..fd33bd1 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@ -1,21 +1,21 @@
 ;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 loopfilter optimizations
 ;* Copyright (c) 2009 David Conrad
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
new file mode 100644
index 0000000..0e6d87d
--- /dev/null
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -0,0 +1,292 @@
+;******************************************************************************
+;* VC1 motion compensation optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_9
+cextern pw_128
+
+SECTION .text
+
+%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will in the future
+;     when more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+    pavgb           %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+    paddw           m3, m7 ; +bias-r
+    paddw           m4, m7 ; +bias-r
+    psraw           m3, %1
+    psraw           m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+    packuswb        m3, m4
+    %1              m3, [%2]
+    mova          [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+    %1              m3, [%2]
+    %1              m3, [%2 + mmsize]
+    mova          [%2], m3
+    mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+    punpcklbw       %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpacks it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+    movd      m7, %1
+    punpcklwd m7, m7
+    punpckldq m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+    paddw          m%3, m%4
+    movh           m%2, [srcq + stride_neg2]
+    pmullw         m%3, m6
+    punpcklbw      m%2, m0
+    movh           m%5, [srcq + strideq]
+    psubw          m%3, m%2
+    punpcklbw      m%5, m0
+    paddw          m%3, m7
+    psubw          m%3, m%5
+    psraw          m%3, shift
+    movu   [dstq + %1], m%3
+    add           srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+;                                    x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+    DECLARE_REG_TMP     3, 4, 5
+    %define rnd r3mp
+    %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+    DECLARE_REG_TMP     4, 5, 6
+    %define   rnd r3d
+    ; We need shift either in memory or in a mm reg as it's used in psraw
+    ; On WIN64, the arg is already on the stack
+    ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+    %define shift r4mp
+%else ; UNIX64
+    %define shift m5
+    mova shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+    mov       stride_neg2, strideq
+    neg       stride_neg2
+    add       stride_neg2, stride_neg2
+    lea    stride_9minus4, [strideq * 9 - 4]
+    mov                 i, 3
+    LOAD_ROUNDER_MMX  rnd
+    mova               m6, [pw_9]
+    pxor               m0, m0
+.loop:
+    movh               m2, [srcq]
+    add              srcq, strideq
+    movh               m3, [srcq]
+    punpcklbw          m2, m0
+    punpcklbw          m3, m0
+    SHIFT2_LINE         0, 1, 2, 3, 4
+    SHIFT2_LINE        24, 2, 3, 4, 1
+    SHIFT2_LINE        48, 3, 4, 1, 2
+    SHIFT2_LINE        72, 4, 1, 2, 3
+    SHIFT2_LINE        96, 1, 2, 3, 4
+    SHIFT2_LINE       120, 2, 3, 4, 1
+    SHIFT2_LINE       144, 3, 4, 1, 2
+    SHIFT2_LINE       168, 4, 1, 2, 3
+    sub              srcq, stride_9minus4
+    add              dstq, 8
+    dec                 i
+        jnz         .loop
+    REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+;                                  const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can directly be made from
+; memory.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+    mov                hq, 8
+    sub              srcq, 2
+    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+    LOAD_ROUNDER_MMX rndd
+    mova               m5, [pw_9]
+    mova               m6, [pw_128]
+    pxor               m0, m0
+
+.loop:
+    mova               m1, [srcq + 2 * 0]
+    mova               m2, [srcq + 2 * 0 + mmsize]
+    mova               m3, [srcq + 2 * 1]
+    mova               m4, [srcq + 2 * 1 + mmsize]
+    paddw              m3, [srcq + 2 * 2]
+    paddw              m4, [srcq + 2 * 2 + mmsize]
+    paddw              m1, [srcq + 2 * 3]
+    paddw              m2, [srcq + 2 * 3 + mmsize]
+    pmullw             m3, m5
+    pmullw             m4, m5
+    psubw              m3, m1
+    psubw              m4, m2
+    NORMALIZE_MMX      7
+    ; remove bias
+    paddw              m3, m6
+    paddw              m4, m6
+    TRANSFER_DO_PACK   %1, dstq
+    add              srcq, 24
+    add              dstq, strideq
+    dec                hq
+        jnz         .loop
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
+%endif ; HAVE_MMX_INLINE
+
+%macro INV_TRANS_INIT 0
+    movsxdifnidn linesizeq, linesized
+    movd       m0, blockd
+    SPLATW     m0, m0
+    pxor       m1, m1
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+
+    DEFINE_ARGS dest, linesize, linesize3
+    lea    linesize3q, [linesizeq*3]
+%endmacro
+
+%macro INV_TRANS_PROCESS 1
+    mov%1                  m2, [destq+linesizeq*0]
+    mov%1                  m3, [destq+linesizeq*1]
+    mov%1                  m4, [destq+linesizeq*2]
+    mov%1                  m5, [destq+linesize3q]
+    paddusb                m2, m0
+    paddusb                m3, m0
+    paddusb                m4, m0
+    paddusb                m5, m0
+    psubusb                m2, m1
+    psubusb                m3, m1
+    psubusb                m4, m1
+    psubusb                m5, m1
+    mov%1 [linesizeq*0+destq], m2
+    mov%1 [linesizeq*1+destq], m3
+    mov%1 [linesizeq*2+destq], m4
+    mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
+    movsx         r3d, WORD [blockq]
+    mov        blockd, r3d             ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
+    sar        blockd, 3               ; >> 3
+    mov           r3d, blockd          ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS h
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
+    movsx         r3d, WORD [blockq]
+    mov        blockd, r3d             ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
+    sar        blockd, 3               ; >> 3
+    shl        blockd, 2               ;  4 * dc
+    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS h
+    lea         destq, [destq+linesizeq*4]
+    INV_TRANS_PROCESS h
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
+    movsx      blockd, WORD [blockq]   ; dc
+    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
+    sar        blockd, 1               ; >> 1
+    mov           r3d, blockd          ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS a
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
+    movsx      blockd, WORD [blockq]   ; dc
+    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
+    sar        blockd, 1               ; >> 1
+    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
+    sar        blockd, 5               ; >> 5
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS a
+    lea         destq, [destq+linesizeq*4]
+    INV_TRANS_PROCESS a
+    RET
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index d64ddf0..45c8a68 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -34,7 +33,15 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+                                   const uint8_t *src, x86_reg stride,
+                                   int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+                                   const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+                                      const int16_t *src, int rnd);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -67,102 +74,6 @@
      "punpcklwd %%mm7, %%mm7           \n\t"    \
      "punpckldq %%mm7, %%mm7           \n\t"
 
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
-    "paddw     %%mm"#R2", %%mm"#R1"    \n\t"    \
-    "movd      (%0,%3), %%mm"#R0"      \n\t"    \
-    "pmullw    %%mm6, %%mm"#R1"        \n\t"    \
-    "punpcklbw %%mm0, %%mm"#R0"        \n\t"    \
-    "movd      (%0,%2), %%mm"#R3"      \n\t"    \
-    "psubw     %%mm"#R0", %%mm"#R1"    \n\t"    \
-    "punpcklbw %%mm0, %%mm"#R3"        \n\t"    \
-    "paddw     %%mm7, %%mm"#R1"        \n\t"    \
-    "psubw     %%mm"#R3", %%mm"#R1"    \n\t"    \
-    "psraw     %4, %%mm"#R1"           \n\t"    \
-    "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
-    "add       %2, %0                  \n\t"
-
-/** Sacrificing mm6 allows to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
-                                       const uint8_t *src, x86_reg stride,
-                                       int rnd, int64_t shift)
-{
-    __asm__ volatile(
-        "mov       $3, %%"FF_REG_c"        \n\t"
-        LOAD_ROUNDER_MMX("%5")
-        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
-        "1:                                \n\t"
-        "movd      (%0), %%mm2             \n\t"
-        "add       %2, %0                  \n\t"
-        "movd      (%0), %%mm3             \n\t"
-        "punpcklbw %%mm0, %%mm2            \n\t"
-        "punpcklbw %%mm0, %%mm3            \n\t"
-        SHIFT2_LINE(  0, 1, 2, 3, 4)
-        SHIFT2_LINE( 24, 2, 3, 4, 1)
-        SHIFT2_LINE( 48, 3, 4, 1, 2)
-        SHIFT2_LINE( 72, 4, 1, 2, 3)
-        SHIFT2_LINE( 96, 1, 2, 3, 4)
-        SHIFT2_LINE(120, 2, 3, 4, 1)
-        SHIFT2_LINE(144, 3, 4, 1, 2)
-        SHIFT2_LINE(168, 4, 1, 2, 3)
-        "sub       %6, %0                  \n\t"
-        "add       $8, %1                  \n\t"
-        "dec       %%"FF_REG_c"            \n\t"
-        "jnz 1b                            \n\t"
-        : "+r"(src), "+r"(dst)
-        : "r"(stride), "r"(-2*stride),
-          "m"(shift), "m"(rnd), "r"(9*stride-4)
-        : "%"FF_REG_c, "memory"
-    );
-}
-
-/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
-                                             const int16_t *src, int rnd)\
-{\
-    int h = 8;\
-\
-    src -= 1;\
-    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
-    __asm__ volatile(\
-        LOAD_ROUNDER_MMX("%4")\
-        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
-        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
-        "1:                                \n\t"\
-        "movq      2*0+0(%1), %%mm1        \n\t"\
-        "movq      2*0+8(%1), %%mm2        \n\t"\
-        "movq      2*1+0(%1), %%mm3        \n\t"\
-        "movq      2*1+8(%1), %%mm4        \n\t"\
-        "paddw     2*3+0(%1), %%mm1        \n\t"\
-        "paddw     2*3+8(%1), %%mm2        \n\t"\
-        "paddw     2*2+0(%1), %%mm3        \n\t"\
-        "paddw     2*2+8(%1), %%mm4        \n\t"\
-        "pmullw    %%mm5, %%mm3            \n\t"\
-        "pmullw    %%mm5, %%mm4            \n\t"\
-        "psubw     %%mm1, %%mm3            \n\t"\
-        "psubw     %%mm2, %%mm4            \n\t"\
-        NORMALIZE_MMX("$7")\
-        /* Remove bias */\
-        "paddw     %%mm6, %%mm3            \n\t"\
-        "paddw     %%mm6, %%mm4            \n\t"\
-        TRANSFER_DO_PACK(OP)\
-        "add       $24, %1                 \n\t"\
-        "add       %3, %2                  \n\t"\
-        "decl      %0                      \n\t"\
-        "jnz 1b                            \n\t"\
-        : "+r"(h), "+r" (src),  "+r" (dst)\
-        : "r"(stride), "m"(rnd)\
-        : "memory"\
-    );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
 /**
  * Purely vertical or horizontal 1/2 shift interpolation.
  * Sacrifice mm6 for *9 factor.
@@ -213,6 +124,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
         : "+r"(src),  "+r"(dst)\
         : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
           "g"(stride-offset)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
         : "%"FF_REG_c, "memory"\
     );\
 }
@@ -315,6 +227,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(src_stride), "r"(3*src_stride),                           \
           "m"(rnd), "m"(shift)                                          \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -352,6 +265,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(stride), "m"(rnd)                                         \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128)    \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -387,6 +301,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
+          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -420,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
  * @param  hmode   Vertical filter.
  * @param  rnd     Rounding bias.
  */
-#define VC1_MSPEL_MC(OP)\
+#define VC1_MSPEL_MC(OP, INSTR)\
 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
-         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+         { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
-         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+         { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
          { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
 \
@@ -441,7 +356,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
             static const int shift_value[] = { 0, 5, 1, 5 };\
             int              shift = (shift_value[hmode]+shift_value[vmode])>>1;\
             int              r;\
-            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
+            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
 \
             r = (1<<(shift-1)) + rnd-1;\
             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
@@ -457,10 +372,19 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
 \
     /* Horizontal mode with no vertical mode */\
     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+} \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
+                                  int stride, int hmode, int vmode, int rnd)\
+{ \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
+    dst += 8*stride; src += 8*stride; \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
 }
 
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
+VC1_MSPEL_MC(put_, mmx)
+VC1_MSPEL_MC(avg_, mmxext)
 
 /** Macro to ease bicubic filter interpolation functions declarations */
 #define DECLARE_FUNCTION(a, b)                                          \
@@ -477,6 +401,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,         \
                                                   int rnd)              \
 {                                                                       \
      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
+}\
+static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,         \
+                                                  const uint8_t *src,   \
+                                                  ptrdiff_t stride,     \
+                                                  int rnd)              \
+{                                                                       \
+     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,      \
+                                                     const uint8_t *src,\
+                                                     ptrdiff_t stride,  \
+                                                     int rnd)           \
+{                                                                       \
+     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
 }
 
 DECLARE_FUNCTION(0, 1)
@@ -498,261 +436,51 @@ DECLARE_FUNCTION(3, 1)
 DECLARE_FUNCTION(3, 2)
 DECLARE_FUNCTION(3, 3)
 
-static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = (17 * dc +  4) >> 3;
-    dc = (17 * dc + 64) >> 7;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t *)(dest + 0 * stride)),
-         "+m"(*(uint32_t *)(dest + 1 * stride)),
-         "+m"(*(uint32_t *)(dest + 2 * stride)),
-         "+m"(*(uint32_t *)(dest + 3 * stride))
-    );
-}
-
-static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = (17 * dc +  4) >> 3;
-    dc = (12 * dc + 64) >> 7;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t *)(dest + 0 * stride)),
-         "+m"(*(uint32_t *)(dest + 1 * stride)),
-         "+m"(*(uint32_t *)(dest + 2 * stride)),
-         "+m"(*(uint32_t *)(dest + 3 * stride))
-    );
-    dest += 4 * stride;
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t *)(dest + 0 * stride)),
-         "+m"(*(uint32_t *)(dest + 1 * stride)),
-         "+m"(*(uint32_t *)(dest + 2 * stride)),
-         "+m"(*(uint32_t *)(dest + 3 * stride))
-    );
-}
-
-static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = ( 3 * dc +  1) >> 1;
-    dc = (17 * dc + 64) >> 7;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t *)(dest + 0 * stride)),
-         "+m"(*(uint32_t *)(dest + 1 * stride)),
-         "+m"(*(uint32_t *)(dest + 2 * stride)),
-         "+m"(*(uint32_t *)(dest + 3 * stride))
-    );
-}
-
-static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t stride,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = (3 * dc +  1) >> 1;
-    dc = (3 * dc + 16) >> 5;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t *)(dest + 0 * stride)),
-         "+m"(*(uint32_t *)(dest + 1 * stride)),
-         "+m"(*(uint32_t *)(dest + 2 * stride)),
-         "+m"(*(uint32_t *)(dest + 3 * stride))
-    );
-    dest += 4 * stride;
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t *)(dest + 0 * stride)),
-         "+m"(*(uint32_t *)(dest + 1 * stride)),
-         "+m"(*(uint32_t *)(dest + 2 * stride)),
-         "+m"(*(uint32_t *)(dest + 3 * stride))
-    );
-}
-
-static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
-                                   ptrdiff_t stride, int rnd)
-{
-    ff_put_pixels8_mmx(dst, src, stride, 8);
-}
+#define FN_ASSIGN(OP, X, Y, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
+    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
 
 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
-    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+    FN_ASSIGN(put_, 0, 1, _mmx);
+    FN_ASSIGN(put_, 0, 2, _mmx);
+    FN_ASSIGN(put_, 0, 3, _mmx);
+
+    FN_ASSIGN(put_, 1, 0, _mmx);
+    FN_ASSIGN(put_, 1, 1, _mmx);
+    FN_ASSIGN(put_, 1, 2, _mmx);
+    FN_ASSIGN(put_, 1, 3, _mmx);
+
+    FN_ASSIGN(put_, 2, 0, _mmx);
+    FN_ASSIGN(put_, 2, 1, _mmx);
+    FN_ASSIGN(put_, 2, 2, _mmx);
+    FN_ASSIGN(put_, 2, 3, _mmx);
+
+    FN_ASSIGN(put_, 3, 0, _mmx);
+    FN_ASSIGN(put_, 3, 1, _mmx);
+    FN_ASSIGN(put_, 3, 2, _mmx);
+    FN_ASSIGN(put_, 3, 3, _mmx);
 }
 
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
-    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
-
-    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
-
-    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
-
-    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
-
-    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
-    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
-    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
-    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
+    FN_ASSIGN(avg_, 0, 1, _mmxext);
+    FN_ASSIGN(avg_, 0, 2, _mmxext);
+    FN_ASSIGN(avg_, 0, 3, _mmxext);
+
+    FN_ASSIGN(avg_, 1, 0, _mmxext);
+    FN_ASSIGN(avg_, 1, 1, _mmxext);
+    FN_ASSIGN(avg_, 1, 2, _mmxext);
+    FN_ASSIGN(avg_, 1, 3, _mmxext);
+
+    FN_ASSIGN(avg_, 2, 0, _mmxext);
+    FN_ASSIGN(avg_, 2, 1, _mmxext);
+    FN_ASSIGN(avg_, 2, 2, _mmxext);
+    FN_ASSIGN(avg_, 2, 3, _mmxext);
+
+    FN_ASSIGN(avg_, 3, 0, _mmxext);
+    FN_ASSIGN(avg_, 3, 1, _mmxext);
+    FN_ASSIGN(avg_, 3, 2, _mmxext);
+    FN_ASSIGN(avg_, 3, 3, _mmxext);
 }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index b22e0fe..e237860 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
 ;* Core video DSP functions
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -54,13 +54,13 @@ SECTION .text
 ; |    |    <- bottom is copied from last line in body of source
 ; '----' <- bh
 %if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                 start_y, end_y, bh, w
 %else ; x86-32
 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
 %define src_strideq r3mp
-%define dst_strideq r2mp
-    mov            srcq, r1mp
+%define dst_strideq r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     neg        n_wordsq
     lea        start_xq, [start_xq+n_wordsq*2]
 .y_loop:                                        ; do {
-    ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+    vpbroadcastb     m0, [dstq+start_xq]
+    mov              wq, n_wordsq               ;   initialize w
+%else
     movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
     imul             wd, 0x01010101             ;   w *= 0x01010101
     movd             m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
 %else ; mmx
     punpckldq        m0, m0                     ;   splat
 %endif ; mmx/sse
+%endif ; avx2
 .x_loop:                                        ;   do {
     movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
     add              wq, mmsize/2               ;     w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
 INIT_XMM sse2
 hvar_fn
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
 ; macro to read/write a horizontal number of pixels (%2) to/from registers
 ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
@@ -184,10 +193,10 @@ hvar_fn
     mov            valb, [srcq+%2-1]
 %elif (%2-%%off) == 2
     mov            valw, [srcq+%2-2]
-%elifidn %1, body
-    mov            vald, [srcq+%2-3]
 %else
-    movd mm %+ %%mmx_idx, [srcq+%2-3]
+    mov            valb, [srcq+%2-1]
+    ror            vald, 16
+    mov            valw, [srcq+%2-3]
 %endif
 %endif ; (%2-%%off) >= 1
 %endmacro ; READ_NUM_BYTES
@@ -240,15 +249,13 @@ hvar_fn
     mov     [dstq+%2-1], valb
 %elif (%2-%%off) == 2
     mov     [dstq+%2-2], valw
-%elifidn %1, body
-    mov     [dstq+%2-3], valw
-    shr            vald, 16
-    mov     [dstq+%2-1], valb
 %else
-    movd           vald, mm %+ %%mmx_idx
     mov     [dstq+%2-3], valw
-    shr            vald, 16
+    ror            vald, 16
     mov     [dstq+%2-1], valb
+%ifnidn %1, body
+    ror            vald, 16
+%endif
 %endif
 %endif ; (%2-%%off) >= 1
 %endmacro ; WRITE_NUM_BYTES
@@ -262,30 +269,30 @@ hvar_fn
 %rep 1+%2-%1
 %if %%n <= 3
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, val, bh
     mov             bhq, r6mp                   ; r6mp = bhmp
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
     mov            dstq, r0mp
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %else
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, bh
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %endif
@@ -344,9 +351,8 @@ VERTICAL_EXTEND 16, 22
 ; obviously not the same on both sides.
 
 %macro READ_V_PIXEL 2
-%if %1 == 2
-    movzx          valw, byte %2
-    imul           valw, 0x0101
+%if cpuflag(avx2)
+    vpbroadcastb     m0, %2
 %else
     movzx          vald, byte %2
     imul           vald, 0x01010101
@@ -356,13 +362,16 @@ VERTICAL_EXTEND 16, 22
     pshufd           m0, m0, q0000
 %else
     punpckldq        m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
 %endmacro ; READ_V_PIXEL
 
 %macro WRITE_V_PIXEL 2
 %assign %%off 0
+
+%if %1 >= 8
+
 %rep %1/mmsize
     movu     [%2+%%off], m0
 %assign %%off %%off+mmsize
@@ -378,34 +387,44 @@ VERTICAL_EXTEND 16, 22
 %assign %%off %%off+8
 %endif
 %endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
 
 %if %1-%%off >= 4
 %if %1 > 8 && %1-%%off > 4
     movq      [%2+%1-8], m0
 %assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
-    movd     [%2+%%off], m0
-%assign %%off %%off+4
 %else
-    mov      [%2+%%off], vald
+    movd     [%2+%%off], m0
 %assign %%off %%off+4
 %endif
 %endif ; %1-%%off >= 4
 
-%if %1-%%off >= 2
-%if %1 >= 8
-    movd      [%2+%1-4], m0
+%else ; %1 < 8
+
+%rep %1/4
+    mov      [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+    movd     [%2+%%off-2], m0
 %else
     mov      [%2+%%off], valw
-%endif
+%endif ; avx2
 %endif ; (%1-%%off)/2
 %endmacro ; WRITE_V_PIXEL
 
 %macro H_EXTEND 2
 %assign %%n %1
 %rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
 cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
 .loop_y:                                        ; do {
     READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
     WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
@@ -426,6 +445,11 @@ H_EXTEND 16, 22
 INIT_XMM sse2
 H_EXTEND 16, 22
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
 %macro PREFETCH_FN 1
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop:
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index e7f24a9..eeebb41 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
 /*
+ * Copyright (C) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
@@ -28,11 +30,11 @@
 #include "libavcodec/videodsp.h"
 
 #if HAVE_X86ASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh,
                                 x86_reg w);
 
@@ -126,6 +128,23 @@ static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
     ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
 };
 extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
+    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
+    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
 
 static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t dst_stride,
@@ -141,14 +160,18 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
     x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
 
     if (!w || !h)
-         return;
+        return;
+
+    av_assert2(block_w <= FFABS(dst_stride));
 
     if (src_y >= h) {
-        src  -= src_y * src_stride;
-        src_y = src_y_add = h - 1;
+        src -= src_y*src_stride;
+        src_y_add = h - 1;
+        src_y     = h - 1;
     } else if (src_y <= -block_h) {
-        src  -= src_y*src_stride;
-        src_y = src_y_add = 1 - block_h;
+        src -= src_y*src_stride;
+        src_y_add = 1 - block_h;
+        src_y     = 1 - block_h;
     }
     if (src_x >= w) {
         src   += w - 1 - src_x;
@@ -162,18 +185,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y   = FFMIN(block_h, h-src_y);
     end_x   = FFMIN(block_w, w-src_x);
-    assert(start_x < end_x && block_w > 0);
-    assert(start_y < end_y && block_h > 0);
+    av_assert2(start_x < end_x && block_w > 0);
+    av_assert2(start_y < end_y && block_h > 0);
 
     // fill in the to-be-copied part plus all above/below
     src += (src_y_add + start_y) * src_stride + start_x;
     w = end_x - start_x;
     if (w <= 22) {
-        vfix_tbl[w - 1](dst + start_x, src,
-                        dst_stride, src_stride,
+        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                         start_y, end_y, block_h);
     } else {
-        v_extend_var(dst + start_x, src, dst_stride, src_stride,
+        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                      start_y, end_y, block_h, w);
     }
 
@@ -212,7 +234,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                      hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
 }
 
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
@@ -231,10 +253,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                               int src_x, int src_y, int w,
                                               int h)
 {
-    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
-                     src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                      hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
 }
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+                                              ptrdiff_t buf_stride,
+                                              ptrdiff_t src_stride,
+                                              int block_w, int block_h,
+                                              int src_x, int src_y, int w,
+                                              int h)
+{
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
 #endif /* HAVE_X86ASM */
 
 void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
@@ -264,5 +300,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
     if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
         ctx->emulated_edge_mc = emulated_edge_mc_sse2;
     }
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+    }
+#endif
 #endif /* HAVE_X86ASM */
 }
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650e..d952296 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
 ;* Vorbis x86 optimizations
 ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -57,13 +57,17 @@ cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
 %endif
 
 INIT_XMM sse
-cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
+cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
     mova                     m5, [pdw_80000000]
-    xor                   cntrq, cntrq
+    shl             block_sized, 2
+    add                    magq, block_sizeq
+    add                    angq, block_sizeq
+    neg             block_sizeq
+
 align 16
 .loop:
-    mova                     m0, [magq+cntrq*4]
-    mova                     m1, [angq+cntrq*4]
+    mova                     m0, [magq+block_sizeq]
+    mova                     m1, [angq+block_sizeq]
     xorps                    m2, m2
     xorps                    m3, m3
     cmpleps                  m2, m0     ; m <= 0.0
@@ -75,9 +79,8 @@ align 16
     andnps                   m4, m1
     addps                    m3, m0     ; a = m + ((a < 0) & (a ^ sign(m)))
     subps                    m0, m4     ; m = m + ((a > 0) & (a ^ sign(m)))
-    mova         [angq+cntrq*4], m3
-    mova         [magq+cntrq*4], m0
-    add                   cntrq, 4
-    cmp                   cntrq, block_sizeq
+    mova     [angq+block_sizeq], m3
+    mova     [magq+block_sizeq], m0
+    add             block_sizeq, mmsize
     jl .loop
     RET
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index bbd8319..bc1cc43 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 8587741..d88d5a1 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,6 +40,7 @@ pb_81: times 8 db 0x81
 cextern pb_1
 cextern pb_3
 cextern pb_80
+cextern pb_FE
 
 cextern pw_8
 
@@ -141,6 +142,49 @@ cglobal vp3_h_loop_filter, 3, 4
     STORE_4_WORDS m3
     RET
 
+%macro PAVGB_NO_RND 0
+    mova   m4, m0
+    mova   m5, m2
+    pand   m4, m1
+    pand   m5, m3
+    pxor   m1, m0
+    pxor   m3, m2
+    pand   m1, m6
+    pand   m3, m6
+    psrlq  m1, 1
+    psrlq  m3, 1
+    paddb  m4, m1
+    paddb  m5, m3
+%endmacro
+
+INIT_MMX mmx
+cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
+    mova   m6, [pb_FE]
+    lea    stride3q,[strideq+strideq*2]
+.loop:
+    mova   m0, [src1q]
+    mova   m1, [src2q]
+    mova   m2, [src1q+strideq]
+    mova   m3, [src2q+strideq]
+    PAVGB_NO_RND
+    mova   [dstq], m4
+    mova   [dstq+strideq], m5
+
+    mova   m0, [src1q+strideq*2]
+    mova   m1, [src2q+strideq*2]
+    mova   m2, [src1q+stride3q]
+    mova   m3, [src2q+stride3q]
+    PAVGB_NO_RND
+    mova   [dstq+strideq*2], m4
+    mova   [dstq+stride3q],  m5
+
+    lea    src1q, [src1q+strideq*4]
+    lea    src2q, [src2q+strideq*4]
+    lea    dstq,  [dstq+strideq*4]
+    sub    hd, 4
+    jnz .loop
+    RET
+
 ; from original comments: The Macro does IDct on 4 1-D Dcts
 %macro BeginIDCT 0
     movq          m2, I(3)
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 043e10f..1ba9576 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,7 +25,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
-#include "config.h"
 
 void ff_vp3_idct_put_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -38,16 +39,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
 void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
                                  int *bounding_values);
 
+void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
+                                     const uint8_t *b, ptrdiff_t stride,
+                                     int h);
+
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
+#if ARCH_X86_32
         c->idct_put  = ff_vp3_idct_put_mmx;
         c->idct_add  = ff_vp3_idct_add_mmx;
-    }
 #endif
+    }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a69368..810cc8d 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,49 +4,46 @@
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (C) 2010  Eli Friedman
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_X86_VP56_ARITH_H
 #define AVCODEC_X86_VP56_ARITH_H
 
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
 #define vp56_rac_get_prob vp56_rac_get_prob
 static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
 {
     unsigned int code_word = vp56_rac_renorm(c);
-    unsigned int high = c->high;
-    unsigned int low = 1 + (((high - 1) * prob) >> 8);
+    unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
     unsigned int low_shift = low << 16;
     int bit = 0;
+    c->code_word = code_word;
 
     __asm__(
         "subl  %4, %1      \n\t"
         "subl  %3, %2      \n\t"
-        "leal (%2, %3), %3 \n\t"
         "setae %b0         \n\t"
         "cmovb %4, %1      \n\t"
-        "cmovb %3, %2      \n\t"
-        : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
-        : "r"(low)
+        "cmovb %5, %2      \n\t"
+        : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
+        : "r"(low_shift), "r"(low), "r"(code_word)
     );
 
-    c->high      = high;
-    c->code_word = code_word;
     return bit;
 }
 #endif
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index b667d38..0be531e 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
 ;* Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index 6d98db1..ce49893 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
  * Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4270cdd..75de569 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -895,6 +895,7 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
     %4 [dst2q+strideq+%3], m5
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
     ; load data
@@ -918,8 +919,9 @@ cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
     lea     dst2q, [dst1q+strideq*2]
     ADD_DC     m0, m1, 0, movh
     RET
+%endif
 
-INIT_XMM sse4
+%macro VP8_IDCT_DC_ADD 0
 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
     ; load data
     movd       m0, [blockq]
@@ -945,10 +947,25 @@ cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
     paddw      m4, m0
     packuswb   m2, m4
     movd   [dst1q], m2
+%if cpuflag(sse4)
     pextrd [dst1q+strideq], m2, 1
     pextrd [dst2q], m2, 2
     pextrd [dst2q+strideq], m2, 3
+%else
+    psrldq     m2, 4
+    movd [dst1q+strideq], m2
+    psrldq     m2, 4
+    movd [dst2q], m2
+    psrldq     m2, 4
+    movd [dst2q+strideq], m2
+%endif
     RET
+%endmacro
+
+INIT_XMM sse2
+VP8_IDCT_DC_ADD
+INIT_XMM sse4
+VP8_IDCT_DC_ADD
 
 ;-----------------------------------------------------------------------------
 ; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 8702c59..397b251 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -168,7 +168,7 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
     ptrdiff_t srcstride, int height, int mx, int my) \
 { \
-    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
+    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
     uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
     src -= srcstride * (TAPNUMY / 2 - 1); \
     ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
@@ -213,7 +213,7 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
     ptrdiff_t srcstride, int height, int mx, int my) \
 { \
-    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
+    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
     ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
         tmp, SIZE,      src, srcstride, height + 1, mx, my); \
     ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
@@ -233,6 +233,8 @@ HVBILIN(ssse3, 8, 16, 16)
 
 void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
                             ptrdiff_t stride);
+void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
+                             ptrdiff_t stride);
 void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
                              ptrdiff_t stride);
 void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
@@ -346,7 +348,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
-    if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+    if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -370,9 +372,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
         c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
 #if ARCH_X86_32
+        c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
         c->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmx;
         c->vp8_idct_add       = ff_vp8_idct_add_mmx;
         c->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmx;
@@ -416,7 +418,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
         c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
     }
 
-    if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+    if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
 
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -427,9 +429,10 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse2;
         c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
 
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
 
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -454,7 +457,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     }
 
     if (EXTERNAL_SSE4(cpu_flags)) {
-        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;
+        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;
 
         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 9ffd83a..caeb405 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index fbd68db..837cce8 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,221 +1,124 @@
 /*
  * VP9 SIMD optimizations
  *
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
 
 #if HAVE_X86ASM
 
-#define fpel_func(avg, sz, opt)                                                 \
-void ff_vp9_ ## avg ## sz ## _ ## opt(uint8_t *dst, ptrdiff_t dst_stride,       \
-                                      const uint8_t *src, ptrdiff_t src_stride, \
-                                      int h, int mx, int my)
-
-fpel_func(put,  4, mmx);
-fpel_func(put,  8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg,  4, mmxext);
-fpel_func(avg,  8, mmxext);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-fpel_func(put, 32, avx);
-fpel_func(put, 64, avx);
-fpel_func(avg, 32, avx2);
-fpel_func(avg, 64, avx2);
-#undef fpel_func
-
-#define mc_func(avg, sz, dir, opt, type, f_sz)                                  \
-void                                                                            \
-ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                          ptrdiff_t dst_stride, \
-                                                          const uint8_t *src,   \
-                                                          ptrdiff_t src_stride, \
-                                                          int h,                \
-                                                          const type (*filter)[f_sz])
-
-#define mc_funcs(sz, opt, type, f_sz)     \
-    mc_func(put, sz, h, opt, type, f_sz); \
-    mc_func(avg, sz, h, opt, type, f_sz); \
-    mc_func(put, sz, v, opt, type, f_sz); \
-    mc_func(avg, sz, v, opt, type, f_sz)
-
-mc_funcs(4, mmxext, int16_t,  8);
-mc_funcs(8, sse2,   int16_t,  8);
-mc_funcs(4, ssse3,  int8_t,  32);
-mc_funcs(8, ssse3,  int8_t,  32);
+decl_fpel_func(put,  4,   , mmx);
+decl_fpel_func(put,  8,   , mmx);
+decl_fpel_func(put, 16,   , sse);
+decl_fpel_func(put, 32,   , sse);
+decl_fpel_func(put, 64,   , sse);
+decl_fpel_func(avg,  4, _8, mmxext);
+decl_fpel_func(avg,  8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32,   , avx);
+decl_fpel_func(put, 64,   , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
+
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t,  8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
 #if ARCH_X86_64
-mc_funcs(16, ssse3, int8_t,  32);
-mc_funcs(32, avx2,  int8_t,  32);
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
 #endif
 
-#undef mc_funcs
-#undef mc_func
-
-#define mc_rep_func(avg, sz, hsz, dir, opt, type, f_sz)                     \
-static av_always_inline void                                                \
-ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,     \
-                                                      ptrdiff_t dst_stride, \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const type (*filter)[f_sz]) \
-{                                                                           \
-    ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst,         \
-                                                           dst_stride,      \
-                                                           src,             \
-                                                           src_stride,      \
-                                                           h,               \
-                                                           filter);         \
-    ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz,   \
-                                                           dst_stride,      \
-                                                           src + hsz,       \
-                                                           src_stride,      \
-                                                           h, filter);      \
-}
-
-#define mc_rep_funcs(sz, hsz, opt, type, f_sz)     \
-    mc_rep_func(put, sz, hsz, h, opt, type, f_sz)  \
-    mc_rep_func(avg, sz, hsz, h, opt, type, f_sz)  \
-    mc_rep_func(put, sz, hsz, v, opt, type, f_sz)  \
-    mc_rep_func(avg, sz, hsz, v, opt, type, f_sz)
-
-mc_rep_funcs(16, 8,  sse2,  int16_t,  8)
+mc_rep_funcs(16,  8,  8,  sse2, int16_t,  8, 8)
 #if ARCH_X86_32
-mc_rep_funcs(16, 8,  ssse3, int8_t,  32)
+mc_rep_funcs(16,  8,  8, ssse3, int8_t,  32, 8)
 #endif
-mc_rep_funcs(32, 16, sse2,  int16_t,  8)
-mc_rep_funcs(32, 16, ssse3, int8_t,  32)
-mc_rep_funcs(64, 32, sse2,  int16_t,  8)
-mc_rep_funcs(64, 32, ssse3, int8_t,  32)
+mc_rep_funcs(32, 16, 16, sse2,  int16_t,  8, 8)
+mc_rep_funcs(32, 16, 16, ssse3, int8_t,  32, 8)
+mc_rep_funcs(64, 32, 32, sse2,  int16_t,  8, 8)
+mc_rep_funcs(64, 32, 32, ssse3, int8_t,  32, 8)
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-mc_rep_funcs(64, 32, avx2,  int8_t,  32)
+mc_rep_funcs(64, 32, 32, avx2,  int8_t,  32, 8)
 #endif
 
-#undef mc_rep_funcs
-#undef mc_rep_func
-
 extern const int8_t ff_filters_ssse3[3][15][4][32];
 extern const int16_t ff_filters_sse2[3][15][8][8];
 
-#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, opt)                   \
-static void                                                                      \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ ## opt(uint8_t *dst,                     \
-                                               ptrdiff_t dst_stride,             \
-                                               const uint8_t *src,               \
-                                               ptrdiff_t src_stride,             \
-                                               int h, int mx, int my)            \
-{                                                                                \
-    LOCAL_ALIGNED_ ## align(uint8_t, temp, [71 * 64]);                           \
-    ff_vp9_put_8tap_1d_h_ ## sz ## _ ## opt(temp, 64,                            \
-                                            src - 3 * src_stride,                \
-                                            src_stride, h + 7,                   \
-                                            ff_filters_ ## f_opt[f][mx - 1]);    \
-    ff_vp9_ ## op ## _8tap_1d_v_ ## sz ## _ ## opt(dst, dst_stride,              \
-                                                   temp + 3 * 64, 64, h,         \
-                                                   ff_filters_ ## f_opt[f][my - 1]); \
-}
-
-#define filters_8tap_2d_fn(op, sz, align, opt, f_opt)                          \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, opt) \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   align, opt) \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  align, opt)
-
-#define filters_8tap_2d_fn2(op, align, opt4, opt8, f_opt) \
-    filters_8tap_2d_fn(op, 64, align, opt8, f_opt)  \
-    filters_8tap_2d_fn(op, 32, align, opt8, f_opt)  \
-    filters_8tap_2d_fn(op, 16, align, opt8, f_opt)  \
-    filters_8tap_2d_fn(op, 8,  align, opt8, f_opt)  \
-    filters_8tap_2d_fn(op, 4,  align, opt4, f_opt)
-
-
-filters_8tap_2d_fn2(put, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(avg, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(put, 16, ssse3, ssse3, ssse3)
-filters_8tap_2d_fn2(avg, 16, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_2d_fn(put, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(put, 32, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 32, 32, avx2, ssse3)
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
 #endif
 
-#undef filters_8tap_2d_fn2
-#undef filters_8tap_2d_fn
-#undef filter_8tap_2d_fn
-
-#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, opt)         \
-static void                                                                \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ ## opt(uint8_t *dst,          \
-                                                    ptrdiff_t dst_stride,  \
-                                                    const uint8_t *src,    \
-                                                    ptrdiff_t src_stride,  \
-                                                    int h, int mx,         \
-                                                    int my)                \
-{                                                                          \
-    ff_vp9_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(dst,          \
-                                                             dst_stride,   \
-                                                             src,          \
-                                                             src_stride, h,\
-                                                             ff_filters_ ## f_opt[f][dvar - 1]); \
-}
-
-#define filters_8tap_1d_fn(op, sz, dir, dvar, opt, f_opt)                          \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, opt) \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   dir, dvar, opt) \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  dir, dvar, opt)
-
-#define filters_8tap_1d_fn2(op, sz, opt, f_opt)        \
-    filters_8tap_1d_fn(op, sz, h, mx, opt, f_opt)      \
-    filters_8tap_1d_fn(op, sz, v, my, opt, f_opt)
-
-#define filters_8tap_1d_fn3(op, opt4, opt8, f_opt) \
-    filters_8tap_1d_fn2(op, 64, opt8, f_opt) \
-    filters_8tap_1d_fn2(op, 32, opt8, f_opt) \
-    filters_8tap_1d_fn2(op, 16, opt8, f_opt) \
-    filters_8tap_1d_fn2(op,  8, opt8, f_opt) \
-    filters_8tap_1d_fn2(op,  4, opt4, f_opt)
-
-filters_8tap_1d_fn3(put, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(avg, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(put, ssse3, ssse3, ssse3)
-filters_8tap_1d_fn3(avg, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_1d_fn2(put, 64, avx2, ssse3)
-filters_8tap_1d_fn2(put, 32, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 64, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 32, avx2, ssse3)
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
 #endif
 
-#undef filters_8tap_1d_fn
-#undef filters_8tap_1d_fn2
-#undef filters_8tap_1d_fn3
-#undef filter_8tap_1d_fn
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                            int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct,  idct,  size, opt); \
+itxfm_func(iadst, idct,  size, opt); \
+itxfm_func(idct,  iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct,  idct,  4, mmxext);
+itxfm_func(idct,  iadst, 4, sse2);
+itxfm_func(iadst, idct,  4, sse2);
+itxfm_func(iadst, iadst, 4, sse2);
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, sse2);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, sse2);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, sse2);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+itxfm_funcs(16, avx2);
+itxfm_func(idct, idct, 32, avx2);
+
+#undef itxfm_func
+#undef itxfm_funcs
 
 #define lpf_funcs(size1, size2, opt) \
 void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
@@ -223,6 +126,8 @@ void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
 void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                     int E, int I, int H)
 
+lpf_funcs(4, 8, mmxext);
+lpf_funcs(8, 8, mmxext);
 lpf_funcs(16, 16, sse2);
 lpf_funcs(16, 16, ssse3);
 lpf_funcs(16, 16, avx);
@@ -241,42 +146,88 @@ lpf_funcs(88, 16, avx);
 
 #undef lpf_funcs
 
-#endif /* HAVE_X86ASM */
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                   const uint8_t *l, const uint8_t *a)
 
-av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
-{
-#if HAVE_X86ASM
-    int cpu_flags = av_get_cpu_flags();
+ipred_func(8, v, mmx);
+
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
+
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
+
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
 
-#define init_fpel(idx1, idx2, sz, type, opt)                            \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_ ## type ## sz ## _ ## opt
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
 
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
 
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _ ## opt
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
 
-#define init_subpel2(idx1, idx2, sz, type, opt) \
-    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, opt); \
-    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, opt); \
-    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, opt)
+ipred_func(4, h, sse2);
 
-#define init_subpel3_32_64(idx, type, opt) \
-    init_subpel2(0, idx, 64, type, opt); \
-    init_subpel2(1, idx, 32, type, opt)
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
 
-#define init_subpel3_8to64(idx, type, opt) \
-    init_subpel3_32_64(idx, type, opt); \
-    init_subpel2(2, idx, 16, type, opt); \
-    init_subpel2(3, idx,  8, type, opt)
+// FIXME hd/vl_4x4_ssse3 does not exist
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
 
-#define init_subpel3(idx, type, opt) \
-    init_subpel3_8to64(idx, type, opt); \
-    init_subpel2(4, idx,  4, type, opt)
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
+
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
+
+#endif /* HAVE_X86ASM */
+
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
+{
+#if HAVE_X86ASM
+    int cpu_flags;
+
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
+        return;
+    }
+
+    cpu_flags = av_get_cpu_flags();
 
 #define init_lpf(opt) do { \
     dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
@@ -291,53 +242,169 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
     dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
 } while (0)
 
+#define init_ipred(sz, opt, t, e) \
+    dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+    init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+    init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+    init_ipred(sz, opt, hd, HOR_DOWN); \
+    init_ipred(sz, opt, vl, VERT_LEFT); \
+    init_ipred(sz, opt, hu, HOR_UP); \
+    init_ipred(sz, opt, tm, TM_VP8); \
+    init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+    init_dir_tm_ipred(sz, opt); \
+    init_ipred(sz, opt, h,  HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+    init_ipred(sz, opt, dc,      DC); \
+    init_ipred(sz, opt, dc_left, LEFT_DC); \
+    init_ipred(sz, opt, dc_top,  TOP_DC); \
+} while (0)
+#define init_all_ipred(sz, opt) do { \
+    init_dc_ipred(sz, opt); \
+    init_dir_tm_h_ipred(sz, opt); \
+} while (0)
+
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel(4, 0,  4, put, mmx);
-        init_fpel(3, 0,  8, put, mmx);
+        init_fpel_func(4, 0,  4, put, , mmx);
+        init_fpel_func(3, 0,  8, put, , mmx);
+        if (!bitexact) {
+            dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+            dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+            dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+            dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+        }
+        init_ipred(8, mmx, v, VERT);
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-        init_subpel2(4, 0, 4, put, mmxext);
-        init_subpel2(4, 1, 4, avg, mmxext);
-        init_fpel(4, 1,  4, avg, mmxext);
-        init_fpel(3, 1,  8, avg, mmxext);
+        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
+        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
+        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
+        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
+        init_subpel2(4, 0, 4, put, 8, mmxext);
+        init_subpel2(4, 1, 4, avg, 8, mmxext);
+        init_fpel_func(4, 1,  4, avg, _8, mmxext);
+        init_fpel_func(3, 1,  8, avg, _8, mmxext);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+        init_dc_ipred(4, mmxext);
+        init_dc_ipred(8, mmxext);
+        init_dir_tm_ipred(4, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel(2, 0, 16, put, sse);
-        init_fpel(1, 0, 32, put, sse);
-        init_fpel(0, 0, 64, put, sse);
+        init_fpel_func(2, 0, 16, put, , sse);
+        init_fpel_func(1, 0, 32, put, , sse);
+        init_fpel_func(0, 0, 64, put, , sse);
+        init_ipred(16, sse, v, VERT);
+        init_ipred(32, sse, v, VERT);
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-        init_subpel3_8to64(0, put, sse2);
-        init_subpel3_8to64(1, avg, sse2);
-        init_fpel(2, 1, 16, avg, sse2);
-        init_fpel(1, 1, 32, avg, sse2);
-        init_fpel(0, 1, 64, avg, sse2);
+        init_subpel3_8to64(0, put, 8, sse2);
+        init_subpel3_8to64(1, avg, 8, sse2);
+        init_fpel_func(2, 1, 16, avg,  _8, sse2);
+        init_fpel_func(1, 1, 32, avg,  _8, sse2);
+        init_fpel_func(0, 1, 64, avg,  _8, sse2);
         init_lpf(sse2);
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_sse2;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_sse2;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
+        init_dc_ipred(16, sse2);
+        init_dc_ipred(32, sse2);
+        init_dir_tm_h_ipred(8, sse2);
+        init_dir_tm_h_ipred(16, sse2);
+        init_dir_tm_h_ipred(32, sse2);
+        init_ipred(4, sse2, h, HOR);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
-        init_subpel3(0, put, ssse3);
-        init_subpel3(1, avg, ssse3);
+        init_subpel3(0, put, 8, ssse3);
+        init_subpel3(1, avg, 8, ssse3);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
         init_lpf(ssse3);
+        init_all_ipred(4, ssse3);
+        init_all_ipred(8, ssse3);
+        init_all_ipred(16, ssse3);
+        init_all_ipred(32, ssse3);
     }
 
     if (EXTERNAL_AVX(cpu_flags)) {
-        init_fpel(1, 0, 32, put, avx);
-        init_fpel(0, 0, 64, put, avx);
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
         init_lpf(avx);
+        init_dir_tm_h_ipred(8, avx);
+        init_dir_tm_h_ipred(16, avx);
+        init_dir_tm_h_ipred(32, avx);
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(1, 0, 32, put, , avx);
+        init_fpel_func(0, 0, 64, put, , avx);
+        init_ipred(32, avx, v, VERT);
     }
 
-    if (EXTERNAL_AVX2(cpu_flags)) {
-        init_fpel(1, 1, 32, avg, avx2);
-        init_fpel(0, 1, 64, avg, avx2);
-
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        init_fpel_func(1, 1, 32, avg, _8, avx2);
+        init_fpel_func(0, 1, 64, avg, _8, avx2);
+        if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-        init_subpel3_32_64(0, put, avx2);
-        init_subpel3_32_64(1, avg, avx2);
-#endif /* ARCH_X86_64 && HAVE_AVX2_EXTERNAL */
+            dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
+            dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx2;
+            dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx2;
+            dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
+            dsp->itxfm_add[TX_32X32][ADST_ADST] =
+            dsp->itxfm_add[TX_32X32][ADST_DCT] =
+            dsp->itxfm_add[TX_32X32][DCT_ADST] =
+            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
+            init_subpel3_32_64(0, put, 8, avx2);
+            init_subpel3_32_64(1, avg, 8, avx2);
+#endif
+        }
+        init_dc_ipred(32, avx2);
+        init_ipred(32, avx2, h,  HOR);
+        init_ipred(32, avx2, tm, TM_VP8);
     }
 
 #undef init_fpel
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000..e410cab
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,189 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+// hack to force-expand BPC
+#define cat(a, bpp, b) a##bpp##b
+
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my)
+
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                         const uint8_t *src, ptrdiff_t src_stride, \
+                                                         int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+                                                       ptrdiff_t stride, \
+                                                       const uint8_t *l, \
+                                                       const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type,  4, bpp, opt4); \
+decl_ipred_fn(type,  8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
+#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
+void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
+                                                                         ptrdiff_t stride, \
+                                                                         int16_t *block, \
+                                                                         int eob)
+
+#define decl_itxfm_funcs(size, bpp, opt) \
+decl_itxfm_func(idct,  idct,  size, bpp, opt); \
+decl_itxfm_func(iadst, idct,  size, bpp, opt); \
+decl_itxfm_func(idct,  iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                    const uint8_t *src, ptrdiff_t src_stride, \
+                                                    int h, const type (*filter)[f_sz]) \
+{ \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst,        dst_stride, src, \
+                                                         src_stride, h, filter); \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+                                                         src_stride, h, filter); \
+}
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                          const uint8_t *src, ptrdiff_t src_stride, \
+                                                          int h, int mx, int my) \
+{ \
+    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+                                                       h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my) \
+{ \
+    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+    ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+                                              src_stride,  h + 7, \
+                                              ff_filters_##f_opt[f][mx - 1]); \
+    ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+                                                 64 * bytes, h, \
+                                                 ff_filters_##f_opt[f][my - 1]); \
+}
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+        type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+        type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = \
+        type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, bpp, opt); \
+    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+    init_subpel2(0, idx, 64, type, bpp, opt); \
+    init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+    init_subpel3_32_64(idx, type, bpp, opt); \
+    init_subpel2(2, idx, 16, type, bpp, opt); \
+    init_subpel2(3, idx,  8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+    init_subpel3_8to64(idx, type, bpp, opt); \
+    init_subpel2(4, idx,  4, type, bpp, opt)
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  8, bpp, opt); \
+    init_ipred_func(type, enum, 16, bpp, opt); \
+    init_ipred_func(type, enum, 32, bpp, opt)
+
+#define init_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  4, bpp, opt); \
+    init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_10bpp.c b/libavcodec/x86/vp9dsp_init_10bpp.c
new file mode 100644
index 0000000..2694c06
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_10bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_12bpp.c b/libavcodec/x86/vp9dsp_init_12bpp.c
new file mode 100644
index 0000000..5da3bc1
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_12bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000..60d10a1
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,149 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_X86ASM
+
+decl_fpel_func(put,   8,    , mmx);
+decl_fpel_func(avg,   8, _16, mmxext);
+decl_fpel_func(put,  16,    , sse);
+decl_fpel_func(put,  32,    , sse);
+decl_fpel_func(put,  64,    , sse);
+decl_fpel_func(put, 128,    , sse);
+decl_fpel_func(avg,  16, _16, sse2);
+decl_fpel_func(avg,  32, _16, sse2);
+decl_fpel_func(avg,  64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put,  32,    , avx);
+decl_fpel_func(put,  64,    , avx);
+decl_fpel_func(put, 128,    , avx);
+decl_fpel_func(avg,  32, _16, avx2);
+decl_fpel_func(avg,  64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
+
+decl_ipred_fns(v,       16, mmx,    sse);
+decl_ipred_fns(h,       16, mmxext, sse2);
+decl_ipred_fns(dc,      16, mmxext, sse2);
+decl_ipred_fns(dc_top,  16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl,       16,     16, avx2);
+decl_ipred_fn(dl,       32,     16, avx2);
+decl_ipred_fn(dr,       16,     16, avx2);
+decl_ipred_fn(dr,       32,     16, avx2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+#endif /* HAVE_X86ASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        init_fpel_func(4, 0,   8, put, , mmx);
+        init_ipred_func(v, VERT, 4, 16, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_fpel_func(4, 1,   8, avg, _16, mmxext);
+        init_ipred_func(h, HOR, 4, 16, mmxext);
+        init_ipred_func(dc, DC, 4, 16, mmxext);
+        init_ipred_func(dc_top,  TOP_DC,  4, 16, mmxext);
+        init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
+    }
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        init_fpel_func(3, 0,  16, put, , sse);
+        init_fpel_func(2, 0,  32, put, , sse);
+        init_fpel_func(1, 0,  64, put, , sse);
+        init_fpel_func(0, 0, 128, put, , sse);
+        init_8_16_32_ipred_funcs(v, VERT, 16, sse);
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel_func(3, 1,  16, avg, _16, sse2);
+        init_fpel_func(2, 1,  32, avg, _16, sse2);
+        init_fpel_func(1, 1,  64, avg, _16, sse2);
+        init_fpel_func(0, 1, 128, avg, _16, sse2);
+        init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+        init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+        init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
+        init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+        init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+        init_ipred_funcs(hu, HOR_UP, 16, sse2);
+        init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+        init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+        init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+        init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(2, 0,  32, put, , avx);
+        init_fpel_func(1, 0,  64, put, , avx);
+        init_fpel_func(0, 0, 128, put, , avx);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+        init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+        init_ipred_funcs(hu, HOR_UP, 16, avx);
+        init_ipred_funcs(hd, HOR_DOWN, 16, avx);
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        init_fpel_func(2, 1,  32, avg, _16, avx2);
+        init_fpel_func(1, 1,  64, avg, _16, avx2);
+        init_fpel_func(0, 1, 128, avg, _16, avx2);
+        init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+        init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
+        init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
+#if ARCH_X86_64
+        init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
+#endif
+    }
+
+#endif /* HAVE_X86ASM */
+}
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
new file mode 100644
index 0000000..b56afc7
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -0,0 +1,240 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_X86ASM
+
+extern const int16_t ff_filters_16bpp[3][15][4][16];
+
+decl_mc_funcs(4, sse2, int16_t, 16, BPC);
+decl_mc_funcs(8, sse2, int16_t, 16, BPC);
+decl_mc_funcs(16, avx2, int16_t, 16, BPC);
+
+mc_rep_funcs(16,  8, 16, sse2, int16_t, 16, BPC)
+mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC)
+#if HAVE_AVX2_EXTERNAL
+mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC)
+#endif
+
+filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
+filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
+#endif
+
+filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
+filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
+#endif
+
+#define decl_lpf_func(dir, wd, bpp, opt) \
+void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                     int E, int I, int H)
+
+#define decl_lpf_funcs(dir, wd, bpp) \
+decl_lpf_func(dir, wd, bpp, sse2); \
+decl_lpf_func(dir, wd, bpp, ssse3); \
+decl_lpf_func(dir, wd, bpp, avx)
+
+#define decl_lpf_funcs_wd(dir) \
+decl_lpf_funcs(dir,  4, BPC); \
+decl_lpf_funcs(dir,  8, BPC); \
+decl_lpf_funcs(dir, 16, BPC)
+
+decl_lpf_funcs_wd(h);
+decl_lpf_funcs_wd(v);
+
+#define lpf_16_wrapper(dir, off, bpp, opt) \
+static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                 int E, int I, int H) \
+{ \
+    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst,       stride, E, I, H); \
+    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
+}
+
+#define lpf_16_wrappers(bpp, opt) \
+lpf_16_wrapper(h, 8 * stride, bpp, opt) \
+lpf_16_wrapper(v, 16,         bpp, opt)
+
+lpf_16_wrappers(BPC, sse2)
+lpf_16_wrappers(BPC, ssse3)
+lpf_16_wrappers(BPC, avx)
+
+#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
+static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                           int E, int I, int H) \
+{ \
+    ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst,       stride, \
+                                                     E & 0xff, I & 0xff, H & 0xff); \
+    ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
+                                                     E >> 8,   I >> 8,   H >> 8); \
+}
+
+#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(v, 16,         wd1, wd2, bpp, opt)
+
+#define lpf_mix2_wrappers_set(bpp, opt) \
+lpf_mix2_wrappers(4, 4, bpp, opt) \
+lpf_mix2_wrappers(4, 8, bpp, opt) \
+lpf_mix2_wrappers(8, 4, bpp, opt) \
+lpf_mix2_wrappers(8, 8, bpp, opt) \
+
+lpf_mix2_wrappers_set(BPC, sse2)
+lpf_mix2_wrappers_set(BPC, ssse3)
+lpf_mix2_wrappers_set(BPC, avx)
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+
+decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+decl_itxfm_func(idct,  idct,  4, BPC, mmxext);
+decl_itxfm_funcs(4, BPC, ssse3);
+#else
+decl_itxfm_func(idct,  idct,  4, BPC, sse2);
+#endif
+decl_itxfm_func(idct,  iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct,  4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(8, BPC, sse2);
+decl_itxfm_funcs(16, BPC, sse2);
+decl_itxfm_func(idct,  idct, 32, BPC, sse2);
+#endif /* HAVE_X86ASM */
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+
+#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
+#define init_lpf_16_func(idx, dir, bpp, opt) \
+    dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
+#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
+
+#define init_lpf_funcs(bpp, opt) \
+    init_lpf_8_func(0, 0, h,  4, bpp, opt); \
+    init_lpf_8_func(0, 1, v,  4, bpp, opt); \
+    init_lpf_8_func(1, 0, h,  8, bpp, opt); \
+    init_lpf_8_func(1, 1, v,  8, bpp, opt); \
+    init_lpf_8_func(2, 0, h, 16, bpp, opt); \
+    init_lpf_8_func(2, 1, v, 16, bpp, opt); \
+    init_lpf_16_func(0, h, bpp, opt); \
+    init_lpf_16_func(1, v, bpp, opt); \
+    init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
+    init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
+    init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
+    init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
+    init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
+    init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
+    init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
+    init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+
+#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
+    dsp->itxfm_add[idxa][idxb] = \
+        cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt);
+#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
+    init_itx_func(idx, DCT_DCT,   typea, typeb, size, bpp, opt); \
+    init_itx_func(idx, ADST_DCT,  typea, typeb, size, bpp, opt); \
+    init_itx_func(idx, DCT_ADST,  typea, typeb, size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) \
+    init_itx_func(idx, DCT_DCT,   idct,  idct,  size, bpp, opt); \
+    init_itx_func(idx, ADST_DCT,  idct,  iadst, size, bpp, opt); \
+    init_itx_func(idx, DCT_ADST,  iadst, idct,  size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+        if (!bitexact) {
+            init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
+        }
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_subpel3(0, put, BPC, sse2);
+        init_subpel3(1, avg, BPC, sse2);
+        init_lpf_funcs(BPC, sse2);
+        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_func(TX_4X4, ADST_DCT,  idct,  iadst, 4, 10, sse2);
+            init_itx_func(TX_4X4, DCT_ADST,  iadst, idct,  4, 10, sse2);
+            init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+        }
+#else
+        init_itx_funcs(TX_4X4, 4, 12, sse2);
+#endif
+        init_itx_funcs(TX_8X8, 8, BPC, sse2);
+        init_itx_funcs(TX_16X16, 16, BPC, sse2);
+        init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_funcs(TX_4X4, 4, BPC, ssse3);
+        }
+#endif
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        init_lpf_funcs(BPC, avx);
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+        init_subpel3_32_64(0,  put, BPC, avx2);
+        init_subpel3_32_64(1,  avg, BPC, avx2);
+        init_subpel2(2, 0, 16, put, BPC, avx2);
+        init_subpel2(2, 1, 16, avg, BPC, avx2);
+#endif
+    }
+
+#endif /* HAVE_X86ASM */
+
+    ff_vp9dsp_init_16bpp_x86(dsp);
+}
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000..31f7d44
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+                    times 4 db 2
+                    times 4 db 1
+                    times 4 db 0
+pb_8x1_8x0:   times 8 db 1
+              times 8 db 0
+pb_8x3_8x2:   times 8 db 3
+              times 8 db 2
+pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
+              times 8 db -1
+pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6
+              times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+              times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+              times 11 db 7
+pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+               times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+              times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_15x0_1xm1: times 15 db 0
+              db -1
+pb_0to2_5x3: db 0, 1, 2
+             times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+             times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+             times 2 db -1
+
+cextern pb_1
+cextern pb_2
+cextern pb_3
+cextern pb_15
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_4to8_FUNCS 0
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_4096]
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_4]
+    psraw                   m0, 3
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [lq]
+    movq                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_2048]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_8]
+    psraw                   m0, 4
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_1024]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_16]
+    psraw                   m0, 5
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    mova                    m2, [aq]
+    mova                    m3, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m4, m4
+    psadbw                  m0, m4
+    psadbw                  m1, m4
+    psadbw                  m2, m4
+    psadbw                  m3, m4
+    paddw                   m0, m1
+    paddw                   m2, m3
+    paddw                   m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_512]
+    pshufb                  m0, m4
+%else
+    paddw                   m0, [pw_32]
+    psraw                   m0, 6
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, [pw_512]
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [%2q]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_8192]
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_2]
+    psraw                   m0, 2
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_4096]
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_4]
+    psraw                   m0, 3
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top,  a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top,  a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_2048]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_8]
+    psraw                   m0, 4
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    mova                    m1, [%2q+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_1024]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_16]
+    psraw                   m0, 5
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top,  a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top,  a
+DC_1D_16to32_FUNCS left, l
+
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
+%if HAVE_AVX2_EXTERNAL
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, [pw_1024]
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top,  a
+DC_1D_AVX2_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m1, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_YMM avx
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+; h
+
+%macro H_XMM_FUNCS 2
+%if notcpuflag(avx)
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+    movd                    m0, [lq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]
+%else
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0123
+    punpcklwd               m0, m0
+%endif
+    lea               stride3q, [strideq*3]
+    movd      [dstq+strideq*0], m0
+    psrldq                  m0, 4
+    movd      [dstq+strideq*1], m0
+    psrldq                  m0, 4
+    movd      [dstq+strideq*2], m0
+    psrldq                  m0, 4
+    movd      [dstq+stride3q ], m0
+    RET
+%endif
+
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+    mova                    m2, [pb_8x1_8x0]
+    mova                    m3, [pb_8x3_8x2]
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 1
+.loop:
+    movd                    m0, [lq+cntq*4]
+%if cpuflag(ssse3)
+    pshufb                  m1, m0, m3
+    pshufb                  m0, m2
+%else
+    punpcklbw               m0, m0
+    punpcklwd               m0, m0
+    pshufd                  m1, m0, q2233
+    pshufd                  m0, m0, q0011
+%endif
+    movq      [dstq+strideq*0], m1
+    movhps    [dstq+strideq*1], m1
+    movq      [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 3
+.loop:
+    movd                    m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+%else
+    punpcklbw               m3, m3
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+%endif
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+%else
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+%endif
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7
+.loop:
+    movd                    m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+%else
+    punpcklbw               m3, m3
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+%endif
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+%else
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+%endif
+    mova   [dstq+strideq*2+ 0], m2
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+stride3q + 0], m3
+    mova   [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8
+INIT_XMM avx
+H_XMM_FUNCS 4, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7
+.loop:
+    movd                   xm3, [lq+cntq*4]
+    vinserti128             m3, m3, xm3, 1
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+
+; tm
+
+%macro TM_MMX_FUNCS 0
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+    pxor                    m1, m1
+    movd                    m0, [aq]
+    pinsrw                  m2, [aq-1], 0
+    punpcklbw               m0, m1
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+    mova                    m3, [pw_m256]
+    mova                    m1, [pw_m255]
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    pshufw                  m2, m2, q0000
+%endif
+    psubw                   m0, m2
+    mov                   cntq, 1
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m4, m2, m1
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    pshufw                  m4, m2, q1111
+    pshufw                  m2, m2, q0000
+%endif
+    paddw                   m4, m0
+    paddw                   m2, m0
+    packuswb                m4, m4
+    packuswb                m2, m2
+    movd      [dstq+strideq*0], m4
+    movd      [dstq+strideq*1], m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS
+INIT_MMX ssse3
+TM_MMX_FUNCS
+
+%macro TM_XMM_FUNCS 0
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+    pxor                    m1, m1
+    movh                    m0, [aq]
+    pinsrw                  m2, [aq-1], 0
+    punpcklbw               m0, m1
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+    mova                    m3, [pw_m256]
+    mova                    m1, [pw_m255]
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    punpcklwd               m2, m2
+    pshufd                  m2, m2, q0000
+%endif
+    psubw                   m0, m2
+    mov                   cntq, 3
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m4, m2, m1
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    punpcklwd               m2, m2
+    pshufd                  m4, m2, q1111
+    pshufd                  m2, m2, q0000
+%endif
+    paddw                   m4, m0
+    paddw                   m2, m0
+    packuswb                m4, m2
+    movh      [dstq+strideq*0], m4
+    movhps    [dstq+strideq*1], m4
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3
+    mova                    m0, [aq]
+    pinsrw                  m2, [aq-1], 0
+    punpckhbw               m1, m0, m3
+    punpcklbw               m0, m3
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+    mova                    m4, [pw_m256]
+    mova                    m3, [pw_m255]
+    pshufb                  m2, m4
+%else
+    punpcklbw               m2, m3
+    punpcklwd               m2, m2
+    pshufd                  m2, m2, q0000
+%endif
+    psubw                   m1, m2
+    psubw                   m0, m2
+    mov                   cntq, 7
+.loop:
+    pinsrw                  m7, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m5, m7, m3
+    pshufb                  m7, m4
+%else
+    punpcklbw               m7, m3
+    punpcklwd               m7, m7
+    pshufd                  m5, m7, q1111
+    pshufd                  m7, m7, q0000
+%endif
+    paddw                   m2, m5, m0
+    paddw                   m5, m1
+    paddw                   m6, m7, m0
+    paddw                   m7, m1
+    packuswb                m2, m5
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+%if ARCH_X86_64
+%define mem 0
+%else
+%define mem 64
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
+    pxor                    m5, m5
+    pinsrw                  m4, [aq-1], 0
+    mova                    m0, [aq]
+    mova                    m2, [aq+16]
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+    mova                   m12, [pw_m256]
+    mova                   m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+    pshufb                  m4, pw_m256_reg
+%else
+    punpcklbw               m4, m5
+    punpcklwd               m4, m4
+    pshufd                  m4, m4, q0000
+%endif
+    punpckhbw               m1, m0,  m5
+    punpckhbw               m3, m2,  m5
+    punpcklbw               m0, m5
+    punpcklbw               m2, m5
+    psubw                   m1, m4
+    psubw                   m0, m4
+    psubw                   m3, m4
+    psubw                   m2, m4
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     1, 9
+    SWAP                     2, 10
+    SWAP                     3, 11
+%else
+    mova            [rsp+0*16], m0
+    mova            [rsp+1*16], m1
+    mova            [rsp+2*16], m2
+    mova            [rsp+3*16], m3
+%endif
+    mov                   cntq, 15
+.loop:
+    pinsrw                  m3, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m7, m3, pw_m255_reg
+    pshufb                  m3, pw_m256_reg
+%else
+    pxor                    m7, m7
+    punpcklbw               m3, m7
+    punpcklwd               m3, m3
+    pshufd                  m7, m3, q1111
+    pshufd                  m3, m3, q0000
+%endif
+%if ARCH_X86_64
+    paddw                   m4, m7, m8
+    paddw                   m5, m7, m9
+    paddw                   m6, m7, m10
+    paddw                   m7, m11
+    paddw                   m0, m3, m8
+    paddw                   m1, m3, m9
+    paddw                   m2, m3, m10
+    paddw                   m3, m11
+%else
+    paddw                   m4, m7, [rsp+0*16]
+    paddw                   m5, m7, [rsp+1*16]
+    paddw                   m6, m7, [rsp+2*16]
+    paddw                   m7, [rsp+3*16]
+    paddw                   m0, m3, [rsp+0*16]
+    paddw                   m1, m3, [rsp+1*16]
+    paddw                   m2, m3, [rsp+2*16]
+    paddw                   m3, [rsp+3*16]
+%endif
+    packuswb                m4, m5
+    packuswb                m6, m7
+    packuswb                m0, m1
+    packuswb                m2, m3
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m6
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
+%endmacro
+
+INIT_XMM sse2
+TM_XMM_FUNCS
+INIT_XMM ssse3
+TM_XMM_FUNCS
+INIT_XMM avx
+TM_XMM_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3
+    pinsrw                 xm2, [aq-1], 0
+    vinserti128             m2, m2, xm2, 1
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m4, [pw_m256]
+    mova                    m5, [pw_m255]
+    pshufb                  m2, m4
+    punpckhbw               m1, m0, m3
+    punpcklbw               m0, m3
+    psubw                   m1, m2
+    psubw                   m0, m2
+    mov                   cntq, 15
+.loop:
+    pinsrw                 xm7, [lq+cntq*2], 0
+    vinserti128             m7, m7, xm7, 1
+    pshufb                  m3, m7, m5
+    pshufb                  m7, m4
+    paddw                   m2, m3, m0
+    paddw                   m3, m1
+    paddw                   m6, m7, m0
+    paddw                   m7, m1
+    packuswb                m2, m3
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+    pxor                   m%4, m%1, m%3
+    pand                   m%4, [pb_1]
+    pavgb                  m%1, m%3
+    psubusb                m%1, m%4
+    pavgb                  m%1, m%2
+%endmacro
+
+%macro DL_MMX_FUNCS 0
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m1, [aq]
+%if cpuflag(ssse3)
+    pshufb                  m0, m1, [pb_0to5_2x7]
+    pshufb                  m2, m1, [pb_2to6_3x7]
+%else
+    punpckhbw               m3, m1, m1              ; 44556677
+    pand                    m0, m1, [pb_6xm1_2x0]   ; 012345__
+    pand                    m3, [pb_6x0_2xm1]       ; ______77
+    psrlq                   m2, m1, 16              ; 234567__
+    por                     m0, m3                  ; 01234577
+    por                     m2, m3                  ; 23456777
+%endif
+    psrlq                   m1, 8
+    LOWPASS                  0, 1, 2, 3
+
+    pshufw                  m1, m0, q3321
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*2], m1
+    psrlq                   m0, 8
+    psrlq                   m1, 8
+    add                   dstq, strideq
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*2], m1
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS
+INIT_MMX ssse3
+DL_MMX_FUNCS
+
+%macro DL_XMM_FUNCS 0
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+    movq                    m0, [aq]
+    lea               stride5q, [strideq*5]
+%if cpuflag(ssse3)
+    pshufb                  m1, m0, [pb_1to6_10x7]
+%else
+    punpcklbw               m1, m0, m0              ; 0011223344556677
+    punpckhwd               m1, m1                  ; 4x4,4x5,4x6,4x7
+%endif
+    shufps                  m0, m1, q3310
+%if notcpuflag(ssse3)
+    psrldq                  m1, m0, 1
+    shufps                  m1, m0, q3210
+%endif
+    psrldq                  m2, m1, 1
+    LOWPASS                  0, 1, 2, 3
+
+    pshufd                  m1, m0, q3321
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*4], m1
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+stride5q ], m1
+    lea                   dstq, [dstq+strideq*2]
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*4], m1
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+stride5q ], m1
+    RET
+
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m0, [aq]
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1toE_2xF]
+    pshufb                  m1, m0, m5
+    pshufb                  m2, m1, m5
+    pshufb                  m4, m0, [pb_15]
+%else
+    pand                    m5, m0, [pb_15x0_1xm1]      ; _______________F
+    psrldq                  m1, m0, 1                   ; 123456789ABCDEF_
+    por                     m1, m5                      ; 123456789ABCDEFF
+    psrldq                  m2, m1, 1                   ; 23456789ABCDEFF_
+    por                     m2, m5                      ; 23456789ABCDEFFF
+    pshufhw                 m4, m1, q3333               ; xxxxxxxxFFFFFFFF
+%endif
+    LOWPASS                  0, 1, 2, 3
+    DEFINE_ARGS dst, stride, cnt, stride9
+    lea               stride9q, [strideq+strideq*8]
+    mov                   cntd, 4
+
+.loop:
+    movhlps                 m4, m0
+    mova      [dstq+strideq*0], m0
+%if cpuflag(ssse3)
+    pshufb                  m0, m5
+%else
+    psrldq                  m0, 1
+    por                     m0, m5
+%endif
+    mova      [dstq+strideq*8], m4
+    movhlps                 m4, m0
+    mova      [dstq+strideq*1], m0
+%if cpuflag(ssse3)
+    pshufb                  m0, m5
+%else
+    psrldq                  m0, 1
+    por                     m0, m5
+%endif
+    mova      [dstq+stride9q ], m4
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+    mova                    m0, [aq]
+    mova                    m1, [aq+16]
+    PALIGNR                 m2, m1, m0, 1, m4
+    PALIGNR                 m3, m1, m0, 2, m4
+    LOWPASS                  0, 2, 3, 4
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1toE_2xF]
+    pshufb                  m2, m1, m5
+    pshufb                  m3, m2, m5
+    pshufb                  m6, m1, [pb_15]
+    mova                    m7, m6
+%else
+    pand                    m5, m1, [pb_15x0_1xm1]      ; _______________F
+    psrldq                  m2, m1, 1                   ; 123456789ABCDEF_
+    por                     m2, m5                      ; 123456789ABCDEFF
+    psrldq                  m3, m2, 1                   ; 23456789ABCDEFF_
+    por                     m3, m5                      ; 23456789ABCDEFFF
+    pshufhw                 m7, m2, q3333               ; xxxxxxxxFFFFFFFF
+    pshufd                  m6, m7, q3333
+%endif
+    LOWPASS                  1, 2, 3, 4
+    lea                 dst16q, [dstq  +strideq*8]
+    mov                   cntd, 8
+    lea                 dst16q, [dst16q+strideq*8]
+.loop:
+    movhlps                 m7, m1
+    mova [dstq  +strideq*0+ 0], m0
+    mova [dstq  +strideq*0+16], m1
+    movhps [dstq+strideq*8+ 0], m0
+    movq [dstq  +strideq*8+ 8], m1
+    mova [dstq  +strideq*8+16], m7
+    mova [dst16q+strideq*0+ 0], m1
+    mova [dst16q+strideq*0+16], m6
+    mova [dst16q+strideq*8+ 0], m7
+    mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 1
+    pshufb                  m1, m5
+%elif cpuflag(ssse3)
+    palignr                 m2, m1, m0, 1
+    pshufb                  m1, m5
+    mova                    m0, m2
+%else
+    mova                    m4, m1
+    psrldq                  m0, 1
+    pslldq                  m4, 15
+    psrldq                  m1, 1
+    por                     m0, m4
+    por                     m1, m5
+%endif
+    add                   dstq, strideq
+    add                 dst16q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_XMM_FUNCS
+INIT_XMM ssse3
+DL_XMM_FUNCS
+INIT_XMM avx
+DL_XMM_FUNCS
+
+; dr
+
+%macro DR_MMX_FUNCS 0
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq-1]
+    movd                    m1, [aq+3]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    PALIGNR                 m1, m0, 1, m3
+    psrlq                   m2, m1, 8
+    LOWPASS                  0, 1, 2, 3
+
+    movd      [dstq+stride3q ], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*2], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*1], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*0], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS
+
+%macro DR_XMM_FUNCS 0
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m1, [lq]
+    movhps                  m1, [aq-1]
+    movd                    m2, [aq+7]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pslldq                  m0, m1, 1
+    PALIGNR                 m2, m1, 1, m3
+    LOWPASS                  0, 1, 2, 3
+
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*4]
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    RET
+
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m1, [lq]
+    movu                    m2, [aq-1]
+    movd                    m4, [aq+15]
+    DEFINE_ARGS dst, stride, stride9, cnt
+    lea               stride9q, [strideq *3]
+    mov                   cntd, 4
+    lea               stride9q, [stride9q*3]
+    PALIGNR                 m4, m2, 1, m5
+    PALIGNR                 m3, m2, m1, 15, m5
+    LOWPASS                  3,  2, 4, 5
+    pslldq                  m0, m1, 1
+    PALIGNR                 m2, m1, 1, m4
+    LOWPASS                  0,  1, 2, 4
+
+.loop:
+    mova    [dstq+strideq*0  ], m3
+    movhps  [dstq+strideq*8+0], m0
+    movq    [dstq+strideq*8+8], m3
+    PALIGNR                 m3, m0, 15, m1
+    pslldq                  m0, 1
+    mova    [dstq+strideq*1  ], m3
+    movhps  [dstq+stride9q +0], m0
+    movq    [dstq+stride9q +8], m3
+    PALIGNR                 m3, m0, 15, m1
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+    mova                    m1, [lq]
+    mova                    m2, [lq+16]
+    movu                    m3, [aq-1]
+    movu                    m4, [aq+15]
+    movd                    m5, [aq+31]
+    DEFINE_ARGS dst, stride, stride8, cnt
+    lea               stride8q, [strideq*8]
+    PALIGNR                 m5, m4, 1, m7
+    PALIGNR                 m6, m4, m3, 15, m7
+    LOWPASS                  5,  4,  6,  7
+    PALIGNR                 m4, m3, 1, m7
+    PALIGNR                 m6, m3, m2, 15, m7
+    LOWPASS                  4,  3,  6,  7
+    PALIGNR                 m3, m2, 1, m7
+    PALIGNR                 m6, m2, m1, 15, m7
+    LOWPASS                  3,  2,  6,  7
+    PALIGNR                 m2, m1, 1, m6
+    pslldq                  m0, m1, 1
+    LOWPASS                  2,  1,  0,  6
+    mov                   cntd, 16
+
+    ; out=m2/m3/m4/m5
+.loop:
+    mova  [dstq+stride8q*0+ 0], m4
+    mova  [dstq+stride8q*0+16], m5
+    mova  [dstq+stride8q*2+ 0], m3
+    mova  [dstq+stride8q*2+16], m4
+    PALIGNR                 m5, m4, 15, m6
+    PALIGNR                 m4, m3, 15, m6
+    PALIGNR                 m3, m2, 15, m6
+    pslldq                  m2, 1
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS
+
+; vl
+
+INIT_MMX mmxext
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    lea                   dstq, [dstq+strideq*2]
+    psrlq                   m1, 8
+    psrlq                   m2, 8
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    RET
+
+%macro VL_XMM_FUNCS 0
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m0, [aq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to6_9x7]
+%else
+    punpcklbw               m1, m0, m0
+    punpckhwd               m1, m1
+    shufps                  m0, m1, q3310
+%endif
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m0, 2
+    LOWPASS                  2,  1,  0,  3
+    pavgb                   m1, m0
+
+    movq      [dstq+strideq*0], m1
+    movq      [dstq+strideq*1], m2
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1
+    movq      [dstq+stride3q ], m2
+    lea                   dstq, [dstq+strideq*4]
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*0], m1
+    movq      [dstq+strideq*1], m2
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1
+    movq      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+%if cpuflag(ssse3)
+    mova                    m4, [pb_1toE_2xF]
+    pshufb                  m1, m0, m4
+    pshufb                  m2, m1, m4
+%else
+    pand                    m4, m0, [pb_15x0_1xm1]  ; _______________F
+    psrldq                  m1, m0, 1               ; 123456789ABCDEF_
+    por                     m1, m4                  ; 123456789ABCDEFF
+    psrldq                  m2, m1, 1               ; 23456789ABCDEFF_
+    por                     m2, m4                  ; 23456789ABCDEFFF
+%endif
+    LOWPASS                  2,  1,  0, 3
+    pavgb                   m1, m0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+%else
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    por                     m1, m4
+    por                     m2, m4
+%endif
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+%if cpuflag(ssse3)
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+%else
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    por                     m1, m4
+    por                     m2, m4
+%endif
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m5, [aq+16]
+    DEFINE_ARGS dst, stride, dst16, cnt
+    PALIGNR                 m2, m5, m0, 1, m4
+    PALIGNR                 m3, m5, m0, 2, m4
+    lea                 dst16q, [dstq  +strideq*8]
+    LOWPASS                  3,  2,  0, 6
+    pavgb                   m2, m0
+%if cpuflag(ssse3)
+    mova                    m4, [pb_1toE_2xF]
+    pshufb                  m0, m5, m4
+    pshufb                  m1, m0, m4
+%else
+    pand                    m4, m5, [pb_15x0_1xm1]  ; _______________F
+    psrldq                  m0, m5, 1               ; 123456789ABCDEF_
+    por                     m0, m4                  ; 123456789ABCDEFF
+    psrldq                  m1, m0, 1               ; 23456789ABCDEFF_
+    por                     m1, m4                  ; 23456789ABCDEFFF
+%endif
+    lea                 dst16q, [dst16q+strideq*8]
+    LOWPASS                  1,  0,  5, 6
+    pavgb                   m0, m5
+%if cpuflag(ssse3)
+    pshufb                  m5, [pb_15]
+%else
+    punpckhbw               m5, m4, m4
+    pshufhw                 m5, m5, q3333
+    punpckhqdq              m5, m5
+%endif
+    mov                   cntd, 8
+
+.loop:
+%macro %%write 3
+    mova    [dstq+stride%1+ 0], %2
+    mova    [dstq+stride%1+16], %3
+    movhps  [dst16q+stride%1 ], %2
+    movu  [dst16q+stride%1+ 8], %3
+    movq  [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+    palignr                 %2, %3, %2, 1
+    pshufb                  %3, m4
+%elif cpuflag(ssse3)
+    palignr                 m6, %3, %2, 1
+    pshufb                  %3, m4
+    mova                    %2, m6
+%else
+    pslldq                  m6, %3, 15
+    psrldq                  %3, 1
+    psrldq                  %2, 1
+    por                     %3, m4
+    por                     %2, m6
+%endif
+%endmacro
+
+    %%write                q*0, m2, m0
+    %%write                q*1, m3, m1
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_XMM_FUNCS
+INIT_XMM ssse3
+VL_XMM_FUNCS
+INIT_XMM avx
+VL_XMM_FUNCS
+
+; vr
+
+%macro VR_MMX_FUNCS 0
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m1, [aq-1]
+    punpckldq               m2, [lq]
+    movd                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1
+    PALIGNR                 m1, m2, 5, m3
+    psrlq                   m2, m1, 8
+    psllq                   m3, m1, 8
+    LOWPASS                  2,  1, 3, 4
+
+    ; ABCD <- for the following predictor:
+    ; EFGH
+    ; IABC  | m0 contains ABCDxxxx
+    ; JEFG  | m2 contains xJIEFGHx
+
+%if cpuflag(ssse3)
+    punpckldq               m0, m2
+    pshufb                  m2, [pb_13456_3xm1]
+    movd      [dstq+strideq*0], m0
+    pshufb                  m0, [pb_6012_4xm1]
+    movd      [dstq+stride3q ], m2
+    psrlq                   m2, 8
+    movd      [dstq+strideq*2], m0
+    movd      [dstq+strideq*1], m2
+%else
+    psllq                   m1, m2, 40
+    psrlq                   m2, 24
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m2
+    PALIGNR                 m0, m1, 7, m3
+    psllq                   m1, 8
+    PALIGNR                 m2, m1, 7, m3
+    movd      [dstq+strideq*2], m0
+    movd      [dstq+stride3q ], m2
+%endif
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS
+INIT_MMX ssse3
+VR_MMX_FUNCS
+
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+    movu                    m1, [aq-1]
+    movhps                  m2, [lq]
+    movq                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1
+    PALIGNR                 m1, m2, 9, m3
+    pslldq                  m2, m1, 1
+    pslldq                  m3, m1, 2
+    LOWPASS                  1,  2, 3, 4
+
+    ; ABCDEFGH <- for the following predictor:
+    ; IJKLMNOP
+    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
+    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
+    ; SQABCDEF
+    ; TRIJKLMN
+    ; USQABCDE
+    ; VTRIJKLM
+
+%if cpuflag(ssse3)
+    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
+    movq      [dstq+strideq*0], m0
+    movhps    [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_6xm1_BDF_0to6]  ; xxxxxxUSQABCDEFG
+    pshufb                  m1, [pb_6xm1_246_8toE]  ; xxxxxxVTRIJKLMNO
+%else
+    psrlw                   m2, m1, 8               ; x_U_S_Q_xxxxxxxx
+    pand                    m3, m1, [pw_255]        ; x_V_T_R_xxxxxxxx
+    packuswb                m3, m2                  ; xVTRxxxxxUSQxxxx
+    pslldq                  m3, 4                   ; xxxxxVTRxxxxxUSQ
+    PALIGNR                 m0, m3, 7, m4           ; xxxxxxUSQABCDEFG
+    psrldq                  m1, 8
+    pslldq                  m3, 8
+    PALIGNR                 m1, m3, 7, m4           ; xxxxxxVTRIJKLMNO
+%endif
+    movhps    [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m1
+    lea                   dstq, [dstq+strideq*4]
+    pslldq                  m0, 1
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*0], m0
+    movhps    [dstq+strideq*1], m1
+    pslldq                  m0, 1
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m1
+    RET
+
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
+    mova                    m0, [aq]
+    movu                    m1, [aq-1]
+    mova                    m2, [lq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    PALIGNR                 m3, m1, m2, 15, m6
+    LOWPASS                  3,  1,  0,  4
+    pavgb                   m0, m1
+    PALIGNR                 m1, m2,  1, m6
+    pslldq                  m4, m2,  1
+    LOWPASS                  1,  2,  4,  5
+%if cpuflag(ssse3)
+    pshufb                  m1, [pb_02468ACE_13579BDF]
+%else
+    psrlw                   m5, m1, 8
+    pand                    m1, [pw_255]
+    packuswb                m1, m5
+%endif
+    mov                   cntd, 4
+
+.loop:
+    movlhps                 m2, m1
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    PALIGNR                 m4, m0, m1, 15, m6
+    PALIGNR                 m5, m3, m2, 15, m6
+    mova      [dstq+strideq*2], m4
+    mova      [dstq+stride3q ], m5
+    lea                   dstq, [dstq+strideq*4]
+    PALIGNR                 m0, m1, 14, m6
+    PALIGNR                 m3, m2, 14, m6
+    pslldq                  m1, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m2, [aq+16]
+    movu                    m1, [aq-1]
+    PALIGNR                 m3, m2, m0, 15, m6
+    PALIGNR                 m4, m2, m0, 14, m6
+    LOWPASS                  4,  3,  2,  5
+    pavgb                   m3, m2
+    mova                    m2, [lq+16]
+    PALIGNR                 m5, m1, m2, 15, m6
+    LOWPASS                  5,  1,  0,  6
+    pavgb                   m0, m1
+    mova                    m6, [lq]
+%if ARCH_X86_64
+    SWAP                     0, 8
+%else
+    mova                [dstq], m0
+%endif
+    PALIGNR                 m1, m2,  1, m0
+    PALIGNR                 m7, m2, m6, 15, m0
+    LOWPASS                  1,  2,  7,  0
+    PALIGNR                 m2, m6,  1, m0
+    pslldq                  m7, m6,  1
+    LOWPASS                  2,  6,  7,  0
+%if cpuflag(ssse3)
+    pshufb                  m1, [pb_02468ACE_13579BDF]
+    pshufb                  m2, [pb_02468ACE_13579BDF]
+%else
+    psrlw                   m0, m1, 8
+    psrlw                   m6, m2, 8
+    pand                    m1, [pw_255]
+    pand                    m2, [pw_255]
+    packuswb                m1, m0
+    packuswb                m2, m6
+%endif
+    DEFINE_ARGS dst, stride, dst16, cnt
+    lea                 dst16q, [dstq  +strideq*8]
+    lea                 dst16q, [dst16q+strideq*8]
+    SBUTTERFLY             qdq,  2,  1,  6
+%if ARCH_X86_64
+    SWAP                     0, 8
+%else
+    mova                    m0, [dstq]
+%endif
+    mov                   cntd, 8
+
+.loop:
+    ; even lines (0, 2, 4, ...): m1 | m0, m3
+    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+    mova    [dstq+stride%1+ 0], %3
+    mova    [dstq+stride%1+16], %4
+    movhps  [dst16q+stride%1 ], %2
+    movu  [dst16q+stride%1+ 8], %3
+    movq  [dst16q+stride%1+24], %4
+    PALIGNR                 %4, %3, 15, m6
+    PALIGNR                 %3, %2, 15, m6
+    pslldq                  %2,  1
+%endmacro
+
+    %%write                q*0, m1, m0, m3
+    %%write                q*1, m2, m5, m4
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+VR_XMM_FUNCS 7
+INIT_XMM ssse3
+VR_XMM_FUNCS 6
+INIT_XMM avx
+VR_XMM_FUNCS 6
+
+; hd
+
+INIT_MMX mmxext
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq-1]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0,  3
+    pavgb                   m1, m0
+
+    ; DHIJ <- for the following predictor:
+    ; CGDH
+    ; BFCG  | m1 contains ABCDxxxx
+    ; AEBF  | m2 contains EFGHIJxx
+
+    punpcklbw               m1, m2
+    punpckhdq               m0, m1, m2
+
+    ; m1 contains AEBFCGDH
+    ; m0 contains CGDHIJxx
+
+    movd      [dstq+stride3q ], m1
+    movd      [dstq+strideq*1], m0
+    psrlq                   m1, 16
+    psrlq                   m0, 16
+    movd      [dstq+strideq*2], m1
+    movd      [dstq+strideq*0], m0
+    RET
+
+%macro HD_XMM_FUNCS 0
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
+    movq                    m0, [lq]
+    movhps                  m0, [aq-1]
+    DEFINE_ARGS dst, stride, stride3, dst4
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m1, 1
+    LOWPASS                  2,  1,  0,  3
+    pavgb                   m1, m0
+
+    ; HPQRSTUV <- for the following predictor
+    ; GOHPQRST
+    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
+    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
+    ; DLEMFNGO
+    ; CKDLEMFN
+    ; BJCKDLEM
+    ; AIBJCKDL
+
+    punpcklbw               m1, m2
+    movhlps                 m2, m2
+
+    ; m1 contains AIBJCKDLEMFNGOHP
+    ; m2 contains QRSTUVxxxxxxxxxx
+
+    movhps   [dstq +stride3q ], m1
+    movq     [dst4q+stride3q ], m1
+    PALIGNR                 m3, m2, m1, 2, m4
+    movhps   [dstq +strideq*2], m3
+    movq     [dst4q+strideq*2], m3
+    PALIGNR                 m3, m2, m1, 4, m4
+    movhps   [dstq +strideq*1], m3
+    movq     [dst4q+strideq*1], m3
+    PALIGNR                 m2, m1, 6, m4
+    movhps   [dstq +strideq*0], m2
+    movq     [dst4q+strideq*0], m2
+    RET
+
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+    mova                    m0, [lq]
+    movu                    m3, [aq-1]
+    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+    lea               stride4q, [strideq*4]
+    lea                  dst4q, [dstq +stride4q]
+    lea                  dst8q, [dst4q+stride4q]
+    lea                 dst12q, [dst8q+stride4q]
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6
+    PALIGNR                 m1, m3, m0,  1, m6
+    PALIGNR                 m2, m3, m0,  2, m6
+    LOWPASS                  2,  1,  0,  6
+    pavgb                   m1, m0
+    SBUTTERFLY              bw,  1,  2,  6
+
+    ; I PROBABLY INVERTED L0 ad L16 here
+    ; m1, m2, m5
+.loop:
+    sub               stride4q, strideq
+    movhps [dstq +stride4q +0], m2
+    movq   [dstq +stride4q +8], m5
+    mova   [dst4q+stride4q   ], m2
+    movhps [dst8q+stride4q +0], m1
+    movq   [dst8q+stride4q +8], m2
+    mova  [dst12q+stride4q   ], m1
+%if cpuflag(avx)
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m5, m2, 2
+%elif cpuflag(ssse3)
+    palignr                 m3, m2, m1, 2
+    palignr                 m0, m5, m2, 2
+    mova                    m1, m3
+    mova                    m2, m0
+%else
+    ; slightly modified version of PALIGNR
+    mova                    m6, m2
+    mova                    m4, m5
+    pslldq                  m6, 14
+    pslldq                  m4, 14
+    psrldq                  m1, 2
+    psrldq                  m2, 2
+    por                     m1, m6
+    por                     m2, m4
+%endif
+    psrldq                  m5, 2
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    movu                    m2, [aq-1]
+    movu                    m3, [aq+15]
+    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+    lea               stride8q, [strideq*8]
+    lea                  dst8q, [dstq  +stride8q]
+    lea                 dst16q, [dst8q +stride8q]
+    lea                 dst24q, [dst16q+stride8q]
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6
+    PALIGNR                 m4, m3, m2,  2, m6
+    PALIGNR                 m3, m2,  1, m6
+    LOWPASS                  4,  3,  2,  6
+    PALIGNR                 m3, m2, m1,  2, m6
+    PALIGNR                 m2, m1,  1, m6
+    LOWPASS                  3,  2,  1,  6
+    pavgb                   m2, m1
+    PALIGNR                 m6, m1, m0,  1, m7
+    PALIGNR                 m1, m0,  2, m7
+    LOWPASS                  1,  6,  0,  7
+    pavgb                   m0, m6
+    SBUTTERFLY              bw,  2,  3,  6
+    SBUTTERFLY              bw,  0,  1,  6
+
+    ; m0, m1, m2, m3, m4, m5
+.loop:
+    sub               stride8q, strideq
+    mova  [dstq  +stride8q+ 0], m3
+    mova  [dstq  +stride8q+16], m4
+    mova  [dst8q +stride8q+ 0], m2
+    mova  [dst8q +stride8q+16], m3
+    mova  [dst16q+stride8q+ 0], m1
+    mova  [dst16q+stride8q+16], m2
+    mova  [dst24q+stride8q+ 0], m0
+    mova  [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+    palignr                 m0, m1, m0, 2
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m4, m3, 2
+    palignr                 m4, m5, m4, 2
+    psrldq                  m5, 2
+%elif cpuflag(ssse3)
+    psrldq                  m6, m5, 2
+    palignr                 m5, m4, 2
+    palignr                 m4, m3, 2
+    palignr                 m3, m2, 2
+    palignr                 m2, m1, 2
+    palignr                 m1, m0, 2
+    mova                    m0, m1
+    mova                    m1, m2
+    mova                    m2, m3
+    mova                    m3, m4
+    mova                    m4, m5
+    mova                    m5, m6
+%else
+    ; sort of a half-integrated version of PALIGNR
+    pslldq                  m7, m4, 14
+    pslldq                  m6, m5, 14
+    psrldq                  m4, 2
+    psrldq                  m5, 2
+    por                     m4, m6
+    pslldq                  m6, m3, 14
+    psrldq                  m3, 2
+    por                     m3, m7
+    pslldq                  m7, m2, 14
+    psrldq                  m2, 2
+    por                     m2, m6
+    pslldq                  m6, m1, 14
+    psrldq                  m1, 2
+    por                     m1, m7
+    psrldq                  m0, 2
+    por                     m0, m6
+%endif
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HD_XMM_FUNCS
+INIT_XMM ssse3
+HD_XMM_FUNCS
+INIT_XMM avx
+HD_XMM_FUNCS
+
+%macro HU_MMX_FUNCS 0
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+    movd                    m0, [lq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to2_5x3]
+%else
+    punpcklbw               m1, m0, m0          ; 00112233
+    pshufw                  m1, m1, q3333       ; 33333333
+    punpckldq               m0, m1              ; 01233333
+%endif
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    SBUTTERFLY              bw,  1, 2, 0
+    PALIGNR                 m2, m1, 2, m0
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    punpckhdq               m1, m1
+    punpckhdq               m2, m2
+    movd      [dstq+strideq*2], m1
+    movd      [dstq+stride3q ], m2
+    RET
+%endmacro
+
+INIT_MMX mmxext
+HU_MMX_FUNCS
+INIT_MMX ssse3
+HU_MMX_FUNCS
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+    movq                    m0, [lq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to6_9x7]
+%else
+    punpcklbw               m1, m0, m0          ; 0011223344556677
+    punpckhwd               m1, m1              ; 4444555566667777
+    shufps                  m0, m1, q3310       ; 0123456777777777
+%endif
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m1, 1
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride3, dst4
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+    SBUTTERFLY              bw,  1, 2, 0
+    movq     [dstq +strideq*0], m1
+    movhps   [dst4q+strideq*0], m1
+    PALIGNR                 m0, m2, m1, 2, m3
+    movq     [dstq +strideq*1], m0
+    movhps   [dst4q+strideq*1], m0
+    PALIGNR                 m0, m2, m1, 4, m3
+    movq     [dstq +strideq*2], m0
+    movhps   [dst4q+strideq*2], m0
+    PALIGNR                 m2, m1, 6, m3
+    movq     [dstq +stride3q ], m2
+    movhps   [dst4q+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+    mova                    m0, [lq]
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2toE_3xF]
+    pshufb                  m1, m0, [pb_1toE_2xF]
+    pshufb                  m2, m0, m3
+%else
+    pand                    m3, m0, [pb_15x0_1xm1]
+    psrldq                  m1, m0, 1
+    por                     m1, m3
+    punpckhbw               m3, m3
+    psrldq                  m2, m0, 2
+    por                     m2, m3
+%endif
+    LOWPASS                  2,  1,  0,  4
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride9, cnt
+    lea                stride9q, [strideq*8+strideq]
+    mov                   cntd,  4
+    SBUTTERFLY              bw,  1,  2,  0
+
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*8], m2
+    PALIGNR                 m0, m2, m1, 2, m4
+%if cpuflag(ssse3)
+    pshufb                  m2, m3
+%else
+    psrldq                  m2, 2
+    por                     m2, m3
+%endif
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+stride9q ], m2
+    PALIGNR                 m1, m2, m0, 2, m4
+%if cpuflag(ssse3)
+    pshufb                  m2, m3
+%else
+    psrldq                  m2, 2
+    por                     m2, m3
+%endif
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
+    mova                    m1, [lq]
+    mova                    m0, [lq+16]
+    PALIGNR                 m2, m0, m1,  1, m5
+    PALIGNR                 m3, m0, m1,  2, m5
+    LOWPASS                  3,  2,  1,  5
+    pavgb                   m2, m1
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2toE_3xF]
+    pshufb                  m5, m0, [pb_1toE_2xF]
+    pshufb                  m1, m0, m4
+%else
+    pand                    m4, m0, [pb_15x0_1xm1]
+    psrldq                  m5, m0, 1
+    por                     m5, m4
+    punpckhbw               m4, m4
+    psrldq                  m1, m0, 2
+    por                     m1, m4
+%endif
+    LOWPASS                  1,  5,  0,  6
+    pavgb                   m0, m5
+    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+    mov                   cntd,  8
+    xor               stride0q, stride0q
+    lea                  dst8q, [dstq  +strideq*8]
+    lea                 dst16q, [dst8q +strideq*8]
+    lea                 dst24q, [dst16q+strideq*8]
+    SBUTTERFLY              bw,  0,  1,  5
+    SBUTTERFLY              bw,  2,  3,  5
+%if cpuflag(ssse3)
+    pshufb                  m6, m1, [pb_15]
+%else
+    pshufhw                 m6, m4, q3333
+    punpckhqdq              m6, m6
+%endif
+
+.loop:
+    mova  [dstq  +stride0q+ 0], m2
+    mova  [dstq  +stride0q+16], m3
+    mova  [dst8q +stride0q+ 0], m3
+    mova  [dst8q +stride0q+16], m0
+    mova  [dst16q+stride0q+ 0], m0
+    mova  [dst16q+stride0q+16], m1
+    mova  [dst24q+stride0q+ 0], m1
+    mova  [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m0, m3, 2
+    palignr                 m0, m1, m0, 2
+    pshufb                  m1, m4
+%elif cpuflag(ssse3)
+    pshufb                  m5, m1, m4
+    palignr                 m1, m0, 2
+    palignr                 m0, m3, 2
+    palignr                 m3, m2, 2
+    mova                    m2, m3
+    mova                    m3, m0
+    mova                    m0, m1
+    mova                    m1, m5
+%else
+    ; half-integrated version of PALIGNR
+    pslldq                  m5, m1, 14
+    pslldq                  m7, m0, 14
+    psrldq                  m1, 2
+    psrldq                  m0, 2
+    por                     m1, m4
+    por                     m0, m5
+    pslldq                  m5, m3, 14
+    psrldq                  m3, 2
+    por                     m3, m7
+    psrldq                  m2, 2
+    por                     m2, m5
+%endif
+    add               stride0q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HU_XMM_FUNCS 8
+INIT_XMM ssse3
+HU_XMM_FUNCS 7
+INIT_XMM avx
+HU_XMM_FUNCS 7
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000..32b6982
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,2392 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+cextern pd_65535;
+
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
+SECTION .text
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova               m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    mova                    m1, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    mova                    m3, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 16
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m2
+    mova   [dstq+strideq*1+48], m3
+    lea                   dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+    mova                    m3, [lq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
+    mova                    m2, [lq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    punpckhwd               m3, m2, m2
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufd                  m0, m3, q1111
+    pshufd                  m1, m3, q0000
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m1
+    lea                   dstq, [dstq+strideq*4]
+    punpcklwd               m2, m2
+    pshufd                  m0, m2, q3333
+    pshufd                  m1, m2, q2222
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufd                  m0, m2, q1111
+    pshufd                  m1, m2, q0000
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m1
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
+    mov                   cntd, 3
+    lea               stride3q, [strideq*3]
+.loop:
+    movh                    m3, [lq+cntq*8]
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    mova    [dstq+strideq*0+ 0], m0
+    mova    [dstq+strideq*0+16], m0
+    mova    [dstq+strideq*1+ 0], m1
+    mova    [dstq+strideq*1+16], m1
+    mova    [dstq+strideq*2+ 0], m2
+    mova    [dstq+strideq*2+16], m2
+    mova    [dstq+stride3q + 0], m3
+    mova    [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
+    mov                   cntd, 7
+    lea               stride3q, [strideq*3]
+.loop:
+    movh                    m3, [lq+cntq*8]
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m1
+    mova   [dstq+strideq*1+48], m1
+    mova   [dstq+strideq*2+ 0], m2
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+strideq*2+32], m2
+    mova   [dstq+strideq*2+48], m2
+    mova   [dstq+stride3q + 0], m3
+    mova   [dstq+stride3q +16], m3
+    mova   [dstq+stride3q +32], m3
+    mova   [dstq+stride3q +48], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop
+    RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufw                  m1, m0, q3232
+    paddd                   m0, [pd_4]
+    paddd                   m0, m1
+    psrad                   m0, 3
+    pshufw                  m0, m0, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_8]
+    paddd                   m0, m1
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [lq+mmsize]
+    paddw                   m0, [aq]
+    paddw                   m0, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_16]
+    paddd                   m0, m1
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq+mmsize*0]
+    paddw                   m0, [lq+mmsize*1]
+    paddw                   m0, [lq+mmsize*2]
+    paddw                   m0, [lq+mmsize*3]
+    paddw                   m0, [aq+mmsize*0]
+    paddw                   m0, [aq+mmsize*1]
+    paddw                   m0, [aq+mmsize*2]
+    paddw                   m0, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 16
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_32]
+    paddd                   m0, m1
+    psrad                   m0, 6
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufw                  m1, m0, q3232
+    paddd                   m0, [pd_2]
+    paddd                   m0, m1
+    psrad                   m0, 2
+    pshufw                  m0, m0, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_4]
+    paddd                   m0, m1
+    psrad                   m0, 3
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    paddw                   m0, [%2+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_8]
+    paddd                   m0, m1
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2+mmsize*0]
+    paddw                   m0, [%2+mmsize*1]
+    paddw                   m0, [%2+mmsize*2]
+    paddw                   m0, [%2+mmsize*3]
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 16
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_16]
+    paddd                   m0, m1
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+DC_1D_FNS top,  aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pw_1023]
+.body:
+    mova                    m4, [aq]
+    mova                    m3, [lq]
+    movd                    m0, [aq-4]
+    pshufw                  m0, m0, q1111
+    psubw                   m4, m0
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000
+    paddw                   m0, m4
+    paddw                   m1, m4
+    paddw                   m2, m4
+    paddw                   m3, m4
+    pxor                    m4, m4
+    pmaxsw                  m0, m4
+    pmaxsw                  m1, m4
+    pmaxsw                  m2, m4
+    pmaxsw                  m3, m4
+    pminsw                  m0, m5
+    pminsw                  m1, m5
+    pminsw                  m2, m5
+    pminsw                  m3, m5
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
+    mova                    m4, [pw_1023]
+.body:
+    pxor                    m6, m6
+    mova                    m5, [aq]
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111
+    punpcklqdq              m0, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, l, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 1
+.loop:
+    movh                    m3, [lq+cntq*8]
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m5
+    paddw                   m1, m5
+    paddw                   m2, m5
+    paddw                   m3, m5
+    pmaxsw                  m0, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m4
+    pminsw                  m1, m4
+    pminsw                  m2, m4
+    pminsw                  m3, m4
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
+    mova                    m4, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_1023]
+.body:
+    pxor                    m6, m6
+    mova                    m4, [aq]
+    mova                    m5, [aq+mmsize]
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111
+    punpcklqdq              m0, m0
+    psubw                   m4, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, l, cnt
+    mov                   cntd, 7
+.loop:
+    movd                    m3, [lq+cntq*4]
+    punpcklwd               m3, m3
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m2, m4
+    paddw                   m2, m5
+    paddw                   m1, m3, m4
+    paddw                   m3, m5
+    pmaxsw                  m0, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m7
+    pminsw                  m2, m7
+    pminsw                  m1, m7
+    pminsw                  m3, m7
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m2
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m3
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+    mova                    m0, [pw_1023]
+.body:
+    pxor                    m1, m1
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+    mova              [rsp+ 0], m0
+    mova              [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+    mova                    m4, [aq+mmsize*0]
+    mova                    m5, [aq+mmsize*1]
+    mova                    m6, [aq+mmsize*2]
+    mova                    m7, [aq+mmsize*3]
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111
+    punpcklqdq              m0, m0
+    psubw                   m4, m0
+    psubw                   m5, m0
+    psubw                   m6, m0
+    psubw                   m7, m0
+    DEFINE_ARGS dst, stride, l, cnt
+    mov                   cntd, 31
+.loop:
+    pinsrw                  m3, [lq+cntq*2], 0
+    punpcklwd               m3, m3
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m3, m4
+    paddw                   m1, m3, m5
+    paddw                   m2, m3, m6
+    paddw                   m3, m7
+    pmaxsw                  m0, reg_min
+    pmaxsw                  m1, reg_min
+    pmaxsw                  m2, reg_min
+    pmaxsw                  m3, reg_min
+    pminsw                  m0, reg_max
+    pminsw                  m1, reg_max
+    pminsw                  m2, reg_max
+    pminsw                  m3, reg_max
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    add                   dstq, strideq
+    dec                   cntd
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
+    mova                    m0, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra predicion functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
+
+; left=(left+2*center+right+2)>>2
+%macro LOWPASS 3 ; left [dst], center, right
+    paddw                  m%1, m%3
+    psraw                  m%1, 1
+    pavgw                  m%1, m%2
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst)
+; dst/src can be the same register
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh
+%else
+    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
+    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
+    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh
+%else
+    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
+    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
+    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
+    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
+
+%macro DL_FUNCS 0
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn               aq, amp
+    movu                    m1, [aq]                ; abcdefgh
+    pshufhw                 m0, m1, q3310           ; abcdefhh
+    SHIFT_RIGHT             m1, m1                  ; bcdefghh
+    psrldq                  m2, m1, 2               ; cdefghh.
+    LOWPASS                  0,  1,  2              ; BCDEFGh.
+    pshufd                  m1, m0, q3321           ; DEFGh...
+    movh      [dstq+strideq*0], m0
+    movh      [dstq+strideq*2], m1
+    add                   dstq, strideq
+    psrldq                  m0, 2                   ; CDEFGh..
+    psrldq                  m1, 2                   ; EFGh....
+    movh      [dstq+strideq*0], m0
+    movh      [dstq+strideq*2], m1
+    RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
+    LOWPASS                  0,  1,  2              ; BCDEFGHh
+    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
+    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
+    DEFINE_ARGS dst, stride, stride5
+    lea               stride5q, [strideq*5]
+
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*4], m1
+    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
+    pshuflw                 m1, m1, q3321           ; GHhhhhhh
+    pshufd                  m2, m0, q3321           ; EFGHhhhh
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+stride5q ], m1
+    lea                   dstq, [dstq+strideq*2]
+    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
+    mova      [dstq+strideq*0], m3
+    mova      [dstq+strideq*4], m1
+    pshuflw                 m1, m1, q3321           ; hhhhhhhh
+    mova      [dstq+strideq*1], m2
+    mova      [dstq+stride5q ], m1
+    RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+    mova                    m3, [aq+mmsize]         ; ijklmnop
+    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
+    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
+    LOWPASS                  0,  1,  2              ; BCDEFGHI
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
+    LOWPASS                  1,  2,  3              ; JKLMNOPp
+    pshufd                  m2, m2, q3333           ; pppppppp
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*8+ 0], m1
+    mova   [dstq+strideq*8+16], m2
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 2
+%else
+    PALIGNR                 m3, m1, m0, 2, m4
+    mova                    m0, m3
+%endif
+    SHIFT_RIGHT             m1, m1, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]       ; abcdefgh
+    mova                    m1, [aq+mmsize*1]       ; ijklmnop
+    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
+    mova                    m3, [aq+mmsize*3]       ; yz012345
+    PALIGNR                 m4, m1, m0, 2, m6
+    PALIGNR                 m5, m1, m0, 4, m6
+    LOWPASS                  0,  4,  5              ; BCDEFGHI
+    PALIGNR                 m4, m2, m1, 2, m6
+    PALIGNR                 m5, m2, m1, 4, m6
+    LOWPASS                  1,  4,  5              ; JKLMNOPQ
+    PALIGNR                 m4, m3, m2, 2, m6
+    PALIGNR                 m5, m3, m2, 4, m6
+    LOWPASS                  2,  4,  5              ; RSTUVWXY
+%if cpuflag(ssse3)
+    mova                    m6, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m4, m5, m3, m6
+    LOWPASS                  3,  4,  5              ; Z0123455
+    pshufd                  m4, m4, q3333           ; 55555555
+    DEFINE_ARGS dst, stride, stride8, stride24, cnt
+    mov                   cntd, 8
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+
+.loop:
+    mova  [dstq+stride8q*0+ 0], m0
+    mova  [dstq+stride8q*0+16], m1
+    mova  [dstq+stride8q*0+32], m2
+    mova  [dstq+stride8q*0+48], m3
+    mova  [dstq+stride8q*1+ 0], m1
+    mova  [dstq+stride8q*1+16], m2
+    mova  [dstq+stride8q*1+32], m3
+    mova  [dstq+stride8q*1+48], m4
+    mova  [dstq+stride8q*2+ 0], m2
+    mova  [dstq+stride8q*2+16], m3
+    mova  [dstq+stride8q*2+32], m4
+    mova  [dstq+stride8q*2+48], m4
+    mova  [dstq+stride24q + 0], m3
+    mova  [dstq+stride24q +16], m4
+    mova  [dstq+stride24q +32], m4
+    mova  [dstq+stride24q +48], m4
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 2
+    vpalignr                m1, m2, m1, 2
+    vpalignr                m2, m3, m2, 2
+%else
+    PALIGNR                 m5, m1, m0, 2, m6
+    mova                    m0, m5
+    PALIGNR                 m5, m2, m1, 2, m6
+    mova                    m1, m5
+    PALIGNR                 m5, m3, m2, 2, m6
+    mova                    m2, m5
+%endif
+    SHIFT_RIGHT             m3, m3, m6
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                   ; abcdefghijklmnop
+    vpbroadcastw           xm1, [aq+30]                ; pppppppp
+    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
+    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
+    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
+    LOWPASS                  0,  3,  4                 ; BCDEFGHIJKLMNOPp
+    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
+    DEFINE_ARGS dst, stride, stride3, cnt
+    mov                   cntd, 2
+    lea               stride3q, [strideq*3]
+
+.loop:
+    mova      [dstq+strideq*0], m0
+    vpalignr                m3, m2, m0, 2
+    vpalignr                m4, m2, m0, 4
+    mova      [dstq+strideq*1], m3
+    mova      [dstq+strideq*2], m4
+    vpalignr                m3, m2, m0, 6
+    vpalignr                m4, m2, m0, 8
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m4
+    vpalignr                m3, m2, m0, 10
+    vpalignr                m4, m2, m0, 12
+    mova      [dstq+strideq*1], m3
+    mova      [dstq+strideq*2], m4
+    vpalignr                m3, m2, m0, 14
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    mova                    m0, m2
+    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
+    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
+    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
+    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
+    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
+    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
+    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
+    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
+    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
+    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
+    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
+    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
+    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+
+.loop:
+    mova   [dstq+strideq*0 + 0], m0
+    mova   [dstq+strideq*0 +32], m1
+    vpalignr                 m3, m5, m0, 2
+    vpalignr                 m4, m2, m1, 2
+    mova   [dstq+strideq*1 + 0], m3
+    mova   [dstq+strideq*1 +32], m4
+    vpalignr                 m3, m5, m0, 4
+    vpalignr                 m4, m2, m1, 4
+    mova   [dstq+strideq*2 + 0], m3
+    mova   [dstq+strideq*2 +32], m4
+    vpalignr                 m3, m5, m0, 6
+    vpalignr                 m4, m2, m1, 6
+    mova   [dstq+stride3q*1+ 0], m3
+    mova   [dstq+stride3q*1+32], m4
+    lea                    dstq, [dstq+strideq*4]
+    vpalignr                 m3, m5, m0, 8
+    vpalignr                 m4, m2, m1, 8
+    mova   [dstq+strideq*0 + 0], m3
+    mova   [dstq+strideq*0 +32], m4
+    vpalignr                 m3, m5, m0, 10
+    vpalignr                 m4, m2, m1, 10
+    mova   [dstq+strideq*1 + 0], m3
+    mova   [dstq+strideq*1 +32], m4
+    vpalignr                 m3, m5, m0, 12
+    vpalignr                 m4, m2, m1, 12
+    mova   [dstq+strideq*2+ 0], m3
+    mova   [dstq+strideq*2+32], m4
+    vpalignr                 m3, m5, m0, 14
+    vpalignr                 m4, m2, m1, 14
+    mova   [dstq+stride3q+  0], m3
+    mova   [dstq+stride3q+ 32], m4
+    vpalignr                 m3, m5, m0, 16
+    vpalignr                 m4, m2, m1, 16
+    vperm2i128               m5, m3, m4, q0201
+    vperm2i128               m2, m4, m4, q0101
+    mova                     m0, m3
+    mova                     m1, m4
+    lea                    dstq, [dstq+strideq*4]
+    dec                    cntd
+    jg .loop
+    RET
+%endif
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
+    movh                    m0, [lq]                ; wxyz....
+    movhps                  m0, [aq-2]              ; wxyz*abc
+    movd                    m1, [aq+6]              ; d.......
+    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
+    psrldq                  m2, m1, 2               ; yz*abcd.
+    LOWPASS                  0, 1, 2                ; XYZ#ABC.
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+stride3q ], m0
+    psrldq                  m0, 2                   ; YZ#ABC..
+    movh      [dstq+strideq*2], m0
+    psrldq                  m0, 2                   ; Z#ABC...
+    movh      [dstq+strideq*1], m0
+    psrldq                  m0, 2                   ; #ABC....
+    movh      [dstq+strideq*0], m0
+    RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]                ; stuvwxyz
+    movu                    m1, [aq-2]              ; *abcdefg
+    mova                    m2, [aq]                ; abcdefgh
+    psrldq                  m3, m2, 2               ; bcdefgh.
+    LOWPASS                  3,  2, 1               ; ABCDEFG.
+    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
+    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
+    LOWPASS                  2,  1, 0               ; TUVWXYZ#
+    DEFINE_ARGS dst, stride, dst4, stride3
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+
+    movhps [dstq +stride3q +0], m2
+    movh   [dstq+ stride3q +8], m3
+    mova   [dst4q+stride3q +0], m2
+    PALIGNR                 m1, m3, m2, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*2+0], m1
+    movh   [dstq+ strideq*2+8], m3
+    mova   [dst4q+strideq*2+0], m1
+    PALIGNR                 m2, m3, m1, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*1+0], m2
+    movh   [dstq+ strideq*1+8], m3
+    mova   [dst4q+strideq*1+0], m2
+    PALIGNR                 m1, m3, m2, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*0+0], m1
+    movh   [dstq+ strideq*0+8], m3
+    mova   [dst4q+strideq*0+0], m1
+    RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
+    mova                    m0, [lq]                ; klmnopqr
+    mova                    m1, [lq+mmsize]         ; stuvwxyz
+    movu                    m2, [aq-2]              ; *abcdefg
+    movu                    m3, [aq+mmsize-2]       ; hijklmno
+    mova                    m4, [aq]                ; abcdefgh
+    mova                    m5, [aq+mmsize]         ; ijklmnop
+    psrldq                  m6, m5, 2               ; jklmnop.
+    LOWPASS                  6,  5, 3               ; IJKLMNO.
+    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
+    LOWPASS                  5,  4, 2               ; ABCDEFGH
+    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
+    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
+    LOWPASS                  4,  2, 1               ; TUVWXYZ#
+    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
+    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
+    LOWPASS                  2, 1, 0                ; LMNOPQRS
+    DEFINE_ARGS dst, stride, dst8, cnt
+    lea                  dst8q, [dstq+strideq*8]
+    mov                   cntd, 8
+
+.loop:
+    sub                  dst8q, strideq
+    mova  [dst8q+strideq*0+ 0], m4
+    mova  [dst8q+strideq*0+16], m5
+    mova  [dst8q+strideq*8+ 0], m2
+    mova  [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+    vpalignr                m2, m4, m2, 2
+    vpalignr                m4, m5, m4, 2
+    vpalignr                m5, m6, m5, 2
+%else
+    PALIGNR                 m0, m4, m2, 2, m1
+    mova                    m2, m0
+    PALIGNR                 m0, m5, m4, 2, m1
+    mova                    m4, m0
+    PALIGNR                 m0, m6, m5, 2, m1
+    mova                    m5, m0
+%endif
+    psrldq                  m6, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
+    mova                    m0, [aq+mmsize*3]       ; a[24-31]
+    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
+    psrldq                  m2, m0, 2               ; a[25-31].
+    LOWPASS                  2,  0, 1               ; A[24-30].
+    mova                    m1, [aq+mmsize*2]       ; a[16-23]
+    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
+    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
+    LOWPASS                  0,  1, 3               ; A[16-23]
+    mova                    m3, [aq+mmsize*1]       ; a[8-15]
+    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
+    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
+    LOWPASS                  1,  3, 4               ; A[8-15]
+    mova                    m4, [aq+mmsize*0]       ; a[0-7]
+    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
+    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
+    LOWPASS                  3,  4, 5               ; A[0-7]
+    SCRATCH                  1,  8, rsp+0*mmsize
+    SCRATCH                  3,  9, rsp+1*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  0, 10, rsp+2*mmsize
+%endif
+    mova                    m6, [lq+mmsize*3]       ; l[24-31]
+    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
+    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
+    LOWPASS                  4,  5, 6               ; L[25-31]#
+    mova                    m7, [lq+mmsize*2]       ; l[16-23]
+    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
+    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
+    LOWPASS                  5,  6, 7               ; L[17-24]
+    mova                    m1, [lq+mmsize*1]       ; l[8-15]
+    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
+    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
+    LOWPASS                  6,  7, 1               ; L[9-16]
+    mova                    m3, [lq+mmsize*0]       ; l[0-7]
+    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
+    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
+    LOWPASS                  7,  1, 3               ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+    UNSCRATCH                1,  8, rsp+0*mmsize
+%endif
+    UNSCRATCH                3,  9, rsp+1*mmsize
+%else
+    UNSCRATCH                0, 10, rsp+2*mmsize
+%endif
+    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+    lea                  dst8q, [dst8q+strideq*8]
+    mov                   cntd, 8
+
+.loop:
+    sub                  dst8q, strideq
+%if notcpuflag(avx)
+    UNSCRATCH                1,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                3,  9, rsp+1*mmsize
+%endif
+%endif
+    mova [dst8q+stride8q*0+ 0], m4
+    mova [dst8q+stride8q*0+16], m3
+    mova [dst8q+stride8q*0+32], m1
+    mova [dst8q+stride8q*0+48], m0
+    mova [dst8q+stride8q*1+ 0], m5
+    mova [dst8q+stride8q*1+16], m4
+    mova [dst8q+stride8q*1+32], m3
+    mova [dst8q+stride8q*1+48], m1
+    mova [dst8q+stride8q*2+ 0], m6
+    mova [dst8q+stride8q*2+16], m5
+    mova [dst8q+stride8q*2+32], m4
+    mova [dst8q+stride8q*2+48], m3
+    mova [dst8q+stride24q + 0], m7
+    mova [dst8q+stride24q +16], m6
+    mova [dst8q+stride24q +32], m5
+    mova [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+    vpalignr                m7, m6, m7, 2
+    vpalignr                m6, m5, m6, 2
+    vpalignr                m5, m4, m5, 2
+    vpalignr                m4, m3, m4, 2
+    vpalignr                m3, m1, m3, 2
+    vpalignr                m1, m0, m1, 2
+    vpalignr                m0, m2, m0, 2
+%else
+    SCRATCH                  2,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  0,  9, rsp+1*mmsize
+%endif
+    PALIGNR                 m2, m6, m7, 2, m0
+    mova                    m7, m2
+    PALIGNR                 m2, m5, m6, 2, m0
+    mova                    m6, m2
+    PALIGNR                 m2, m4, m5, 2, m0
+    mova                    m5, m2
+    PALIGNR                 m2, m3, m4, 2, m0
+    mova                    m4, m2
+    PALIGNR                 m2, m1, m3, 2, m0
+    mova                    m3, m2
+%if notcpuflag(ssse3)
+    UNSCRATCH                0,  9, rsp+1*mmsize
+    SCRATCH                  3,  9, rsp+1*mmsize
+%endif
+    PALIGNR                 m2, m0, m1, 2, m3
+    mova                    m1, m2
+    UNSCRATCH                2,  8, rsp+0*mmsize
+    SCRATCH                  1,  8, rsp+0*mmsize
+    PALIGNR                 m1, m2, m0, 2, m3
+    mova                    m0, m1
+%endif
+    psrldq                  m2, 2
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
+    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
+    movu                    m1, [aq-2]                 ; *abcdefghijklmno
+    mova                    m2, [aq]                   ; abcdefghijklmnop
+    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
+    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
+    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
+    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
+    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
+    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
+    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
+    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
+    DEFINE_ARGS dst, stride, stride3, stride5, dst3
+    lea                  dst3q, [dstq+strideq*4]
+    lea               stride3q, [strideq*3]
+    lea               stride5q, [stride3q+strideq*2]
+
+    vpalignr                m3, m5, m0, 2
+    vpalignr                m4, m1, m5, 2
+    mova    [dst3q+stride5q*2], m3                     ; 14
+    mova    [ dstq+stride3q*2], m4                     ; 6
+    vpalignr                m3, m5, m0, 4
+    vpalignr                m4, m1, m5, 4
+    sub                  dst3q, strideq
+    mova    [dst3q+stride5q*2], m3                     ; 13
+    mova    [dst3q+strideq*2 ], m4                     ; 5
+    mova    [dst3q+stride3q*4], m0                     ; 15
+    vpalignr                m3, m5, m0, 6
+    vpalignr                m4, m1, m5, 6
+    mova     [dstq+stride3q*4], m3                     ; 12
+    mova     [dst3q+strideq*1], m4                     ; 4
+    vpalignr                m3, m5, m0, 8
+    vpalignr                m4, m1, m5, 8
+    mova     [dst3q+strideq*8], m3                     ; 11
+    mova     [dst3q+strideq*0], m4                     ; 3
+    vpalignr                m3, m5, m0, 10
+    vpalignr                m4, m1, m5, 10
+    mova     [dstq+stride5q*2], m3                     ; 10
+    mova     [dstq+strideq*2 ], m4                     ; 2
+    vpalignr                m3, m5, m0, 12
+    vpalignr                m4, m1, m5, 12
+    mova    [dst3q+stride3q*2], m3                     ; 9
+    mova     [dstq+strideq*1 ], m4                     ; 1
+    vpalignr                m3, m5, m0, 14
+    vpalignr                m4, m1, m5, 14
+    mova      [dstq+strideq*8], m3                     ; 8
+    mova      [dstq+strideq*0], m4                     ; 0
+    mova     [dst3q+strideq*4], m5                     ; 7
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
+    mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
+    mova                    m1, [lq+mmsize*1+0]        ; l[16-31]
+    movu                    m2, [aq+mmsize*0-2]        ; *abcdefghijklmno
+    mova                    m3, [aq+mmsize*0+0]        ; abcdefghijklmnop
+    mova                    m4, [aq+mmsize*1+0]        ; qrstuvwxyz012345
+    vperm2i128              m5, m0, m1, q0201          ; lmnopqrstuvwxyz0
+    vpalignr                m6, m5, m0, 2              ; mnopqrstuvwxyz01
+    vpalignr                m7, m5, m0, 4              ; nopqrstuvwxyz012
+    LOWPASS                  0,  6,  7                 ; L[0-15]
+    vperm2i128              m7, m1, m2, q0201          ; stuvwxyz*abcdefg
+    vpalignr                m5, m7, m1, 2              ; lmnopqrstuvwxyz*
+    vpalignr                m6, m7, m1, 4              ; mnopqrstuvwxyz*a
+    LOWPASS                  1,  5,  6                 ; L[16-31]#
+    vperm2i128              m5, m3, m4, q0201          ; ijklmnopqrstuvwx
+    vpalignr                m6, m5, m3, 2              ; bcdefghijklmnopq
+    LOWPASS                  2,  3,  6                 ; A[0-15]
+    movu                    m3, [aq+mmsize*1-2]        ; pqrstuvwxyz01234
+    vperm2i128              m6, m4, m4, q2001          ; yz012345........
+    vpalignr                m7, m6, m4, 2              ; rstuvwxyz012345.
+    LOWPASS                  3,  4,  7                 ; A[16-31].
+    vperm2i128              m4, m1, m2, q0201          ; TUVWXYZ#ABCDEFGH
+    vperm2i128              m5, m0, m1, q0201          ; L[7-15]L[16-23]
+    vperm2i128              m8, m2, m3, q0201          ; IJKLMNOPQRSTUVWX
+    DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
+    lea               stride3q, [strideq*3]
+    lea               stride5q, [stride3q+strideq*2]
+    lea               stride7q, [strideq*4+stride3q]
+    lea                 dst24q, [dst8q+stride3q*8]
+    lea                  dst8q, [dst8q+strideq*8]
+    mov                   cntd, 2
+
+.loop:
+    mova  [dst24q+stride7q+0 ], m0                     ; 31 23 15 7
+    mova  [dst24q+stride7q+32], m1
+    mova    [dst8q+stride7q+0], m1
+    mova   [dst8q+stride7q+32], m2
+    vpalignr                m6, m4, m1, 2
+    vpalignr                m7, m5, m0, 2
+    vpalignr                m9, m8, m2, 2
+    mova [dst24q+stride3q*2+0], m7                     ; 30 22 14 6
+    mova [dst24q+stride3q*2+32], m6
+    mova  [dst8q+stride3q*2+0], m6
+    mova [dst8q+stride3q*2+32], m9
+    vpalignr                m6, m4, m1, 4
+    vpalignr                m7, m5, m0, 4
+    vpalignr                m9, m8, m2, 4
+    mova   [dst24q+stride5q+0], m7                     ; 29 21 13 5
+    mova  [dst24q+stride5q+32], m6
+    mova    [dst8q+stride5q+0], m6
+    mova   [dst8q+stride5q+32], m9
+    vpalignr                m6, m4, m1, 6
+    vpalignr                m7, m5, m0, 6
+    vpalignr                m9, m8, m2, 6
+    mova [dst24q+strideq*4+0 ], m7                     ; 28 20 12 4
+    mova [dst24q+strideq*4+32], m6
+    mova   [dst8q+strideq*4+0], m6
+    mova  [dst8q+strideq*4+32], m9
+    vpalignr                m6, m4, m1, 8
+    vpalignr                m7, m5, m0, 8
+    vpalignr                m9, m8, m2, 8
+    mova  [dst24q+stride3q+0 ], m7                     ; 27 19 11 3
+    mova  [dst24q+stride3q+32], m6
+    mova    [dst8q+stride3q+0], m6
+    mova   [dst8q+stride3q+32], m9
+    vpalignr                m6, m4, m1, 10
+    vpalignr                m7, m5, m0, 10
+    vpalignr                m9, m8, m2, 10
+    mova [dst24q+strideq*2+0 ], m7                     ; 26 18 10 2
+    mova [dst24q+strideq*2+32], m6
+    mova   [dst8q+strideq*2+0], m6
+    mova  [dst8q+strideq*2+32], m9
+    vpalignr                m6, m4, m1, 12
+    vpalignr                m7, m5, m0, 12
+    vpalignr                m9, m8, m2, 12
+    mova   [dst24q+strideq+0 ], m7                     ; 25 17 9 1
+    mova   [dst24q+strideq+32], m6
+    mova     [dst8q+strideq+0], m6
+    mova    [dst8q+strideq+32], m9
+    vpalignr                m6, m4, m1, 14
+    vpalignr                m7, m5, m0, 14
+    vpalignr                m9, m8, m2, 14
+    mova [dst24q+strideq*0+0 ], m7                     ; 24 16 8 0
+    mova [dst24q+strideq*0+32], m6
+    mova   [dst8q+strideq*0+0], m6
+    mova  [dst8q+strideq*0+32], m9
+    mova                    m0, m5
+    mova                    m5, m1
+    mova                    m1, m4
+    mova                    m4, m2
+    mova                    m2, m8
+    mova                    m8, m3
+    sub                 dst24q, stride7q
+    sub                 dst24q, strideq
+    sub                  dst8q, stride7q
+    sub                  dst8q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+%endif
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn               aq, amp
+    movu                    m0, [aq]                ; abcdefgh
+    psrldq                  m1, m0, 2               ; bcdefgh.
+    psrldq                  m2, m0, 4               ; cdefgh..
+    LOWPASS                  2,  1, 0               ; BCDEFGH.
+    pavgw                   m1, m0                  ; ABCDEFG.
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+strideq*0], m1
+    movh      [dstq+strideq*1], m2
+    psrldq                  m1, 2
+    psrldq                  m2, 2
+    movh      [dstq+strideq*2], m1
+    movh      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
+    LOWPASS                  2,  1, 0               ; BCDEFGHh
+    pavgw                   m1, m0                  ; ABCDEFGh
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    lea                   dstq, [dstq+strideq*4]
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    mova                    m1, [aq+mmsize]
+    PALIGNR                 m2, m1, m0, 2, m3
+    PALIGNR                 m3, m1, m0, 4, m4
+    LOWPASS                  3,  2,  0
+    pavgw                   m2, m0
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m5, m0, m1, m4
+    LOWPASS                  0,  5,  1
+    pavgw                   m1, m5
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m2
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m3
+    mova   [dstq+strideq*1+16], m0
+    lea                   dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr                m2, m1, m2, 2
+    vpalignr                m3, m0, m3, 2
+%else
+    PALIGNR                 m5, m1, m2, 2, m4
+    mova                    m2, m5
+    PALIGNR                 m5, m0, m3, 2, m4
+    mova                    m3, m5
+%endif
+    SHIFT_RIGHT             m1, m1, m4
+    SHIFT_RIGHT             m0, m0, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    PALIGNR                 m6, m1, m0, 2, m5
+    PALIGNR                 m7, m1, m0, 4, m5
+    LOWPASS                  7,  6,  0
+    pavgw                   m6, m0
+    SCRATCH                  6,  8, rsp+0*mmsize
+    PALIGNR                 m4, m2, m1, 2, m0
+    PALIGNR                 m5, m2, m1, 4, m0
+    LOWPASS                  5,  4,  1
+    pavgw                   m4, m1
+    mova                    m0, [aq+mmsize*3]
+    PALIGNR                 m1, m0, m2, 2, m6
+    PALIGNR                 m3, m0, m2, 4, m6
+    LOWPASS                  3,  1,  2
+    pavgw                   m2, m1
+%if cpuflag(ssse3)
+    PRELOAD                 10, pb_2to15_14_15, shuf
+%endif
+    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
+    LOWPASS                  1,  6,  0
+    pavgw                   m0, m6
+%if ARCH_X86_64
+    pshufd                  m9, m6, q3333
+%endif
+%if cpuflag(avx)
+    UNSCRATCH                6,  8, rsp+0*mmsize
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride16, stride17
+    mov              stride16q, strideq
+    mov                   cntd, 8
+    shl              stride16q, 4
+    lea              stride17q, [stride16q+strideq]
+
+    ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+    UNSCRATCH                6,  8, rsp+0*mmsize
+%endif
+    mova   [dstq+strideq*0+ 0], m6
+    mova   [dstq+strideq*0+16], m4
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m7
+    mova   [dstq+strideq*1+16], m5
+    mova   [dstq+strideq*1+32], m3
+    mova   [dstq+strideq*1+48], m1
+    mova   [dstq+stride16q+ 0], m4
+    mova   [dstq+stride16q+16], m2
+    mova   [dstq+stride16q+32], m0
+%if ARCH_X86_64
+    mova   [dstq+stride16q+48], m9
+%endif
+    mova   [dstq+stride17q+ 0], m5
+    mova   [dstq+stride17q+16], m3
+    mova   [dstq+stride17q+32], m1
+%if ARCH_X86_64
+    mova   [dstq+stride17q+48], m9
+%endif
+    lea                   dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr                m6, m4, m6, 2
+    vpalignr                m4, m2, m4, 2
+    vpalignr                m2, m0, m2, 2
+    vpalignr                m7, m5, m7, 2
+    vpalignr                m5, m3, m5, 2
+    vpalignr                m3, m1, m3, 2
+%else
+    SCRATCH                  3,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  1, 10, rsp+1*mmsize
+%endif
+    PALIGNR                 m3, m4, m6, 2, m1
+    mova                    m6, m3
+    PALIGNR                 m3, m2, m4, 2, m1
+    mova                    m4, m3
+    PALIGNR                 m3, m0, m2, 2, m1
+    mova                    m2, m3
+    PALIGNR                 m3, m5, m7, 2, m1
+    mova                    m7, m3
+    UNSCRATCH                3,  8, rsp+0*mmsize
+    SCRATCH                  6,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                1, 10, rsp+1*mmsize
+    SCRATCH                  7, 10, rsp+1*mmsize
+%endif
+    PALIGNR                 m6, m3, m5, 2, m7
+    mova                    m5, m6
+    PALIGNR                 m6, m1, m3, 2, m7
+    mova                    m3, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 10, rsp+1*mmsize
+%endif
+%endif
+    SHIFT_RIGHT             m1, m1, reg_shuf
+    SHIFT_RIGHT             m0, m0, reg_shuf
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+48], m0
+    mova   [dstq+strideq*2+48], m0
+    mova   [dstq+stride3q +48], m0
+%if %%n < 3
+    lea                   dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
+    movu                    m0, [aq-2]
+    movhps                  m1, [lq]
+    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
+    pslldq                  m1, m0, 2               ; .xyz*abc
+    pslldq                  m2, m0, 4               ; ..xyz*ab
+    LOWPASS                  2,  1, 0               ; ..YZ#ABC
+    pavgw                   m1, m0                  ; ....#ABC
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movhps    [dstq+strideq*0], m1
+    movhps    [dstq+strideq*1], m2
+    shufps                  m0, m2, m1, q3210
+%if cpuflag(ssse3)
+    pshufb                  m2, [pb_4_5_8to13_8x0]
+%else
+    pshuflw                 m2, m2, q2222
+    psrldq                  m2, 6
+%endif
+    psrldq                  m0, 6
+    movh      [dstq+strideq*2], m0
+    movh      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
+    movu                    m1, [aq-2]              ; *abcdefg
+    movu                    m2, [lq]                ; stuvwxyz
+    mova                    m0, [aq]                ; abcdefgh
+    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
+    LOWPASS                  3,  1,  0
+    pavgw                   m0, m1
+    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
+    pslldq                  m4, m2,  2              ; .stuvwxy
+    LOWPASS                  4,  2,  1
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m1
+    pslldq                  m4, 2
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m1
+    pslldq                  m4, 2
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m4
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
+    movu                    m1, [aq-2]              ; *abcdefg
+    movu                    m2, [aq+mmsize-2]       ; hijklmno
+    mova                    m3, [aq]                ; abcdefgh
+    mova                    m4, [aq+mmsize]         ; ijklmnop
+    mova                    m5, [lq+mmsize]         ; stuvwxyz
+    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
+    movu                    m6, [aq+mmsize-4]       ; ghijklmn
+    LOWPASS                  6,  2,  4
+    pavgw                   m2, m4
+    LOWPASS                  0,  1,  3
+    pavgw                   m3, m1
+    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
+    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
+    LOWPASS                  1,  5,  7
+    movu                    m5, [lq+2]              ; lmnopqrs
+    pslldq                  m4, m5,  2              ; .lmnopqr
+    pslldq                  m7, m5,  4              ; ..lmnopq
+    LOWPASS                  5,  4,  7
+    psrld                   m4, m1, 16
+    psrld                   m7, m5, 16
+    pand                    m1, [pd_65535]
+    pand                    m5, [pd_65535]
+    packssdw                m7, m4
+    packssdw                m5, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m3
+    mova   [dstq+strideq*0+16], m2
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m6
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m2, m3, 14, m4
+    PALIGNR                 m3, m7, 14, m4
+    pslldq                  m7, 2
+    PALIGNR                 m6, m0, 14, m4
+    PALIGNR                 m0, m5, 14, m4
+    pslldq                  m5, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
+    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
+    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
+    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
+    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
+    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
+    LOWPASS                  5,  3,  4              ; A[23-30]
+    SCRATCH                  5,  8, rsp+0*mmsize
+    pavgw                   m3, m4
+    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
+    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
+    LOWPASS                  6,  2,  4              ; A[15-22]
+    SCRATCH                  6,  9, rsp+1*mmsize
+    pavgw                   m2, m4
+    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
+    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
+    LOWPASS                  7,  1,  4              ; A[7-14]
+    SCRATCH                  7, 10, rsp+2*mmsize
+    pavgw                   m1, m4
+    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
+    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
+    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
+    LOWPASS                  6,  0,  4              ; #A[0-6]
+    SCRATCH                  6, 11, rsp+3*mmsize
+    pavgw                   m4, m0
+    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
+    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
+    LOWPASS                  0,  5,  7              ; L[24-31]
+    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
+    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
+    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
+    LOWPASS                  5,  7,  6              ; L[16-23]
+    psrld                   m7, m0, 16
+    psrld                   m6, m5, 16
+    pand                    m0, [pd_65535]
+    pand                    m5, [pd_65535]
+    packssdw                m6, m7
+    packssdw                m5, m0
+    SCRATCH                  5, 12, rsp+4*mmsize
+    SCRATCH                  6, 13, rsp+5*mmsize
+    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
+    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
+    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
+    LOWPASS                  6,  0,  5              ; L[8-15]
+    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
+    pslldq                  m5, m0,  2              ; .l[1-7]
+    pslldq                  m7, m0,  4              ; ..l[1-6]
+    LOWPASS                  0,  5,  7
+    psrld                   m5, m6, 16
+    psrld                   m7, m0, 16
+    pand                    m6, [pd_65535]
+    pand                    m0, [pd_65535]
+    packssdw                m7, m5
+    packssdw                m0, m6
+    UNSCRATCH                6, 13, rsp+5*mmsize
+    DEFINE_ARGS dst, stride, stride16, cnt, stride17
+    mov              stride16q, strideq
+    mov                   cntd, 8
+    shl              stride16q, 4
+%if ARCH_X86_64
+    lea              stride17q, [stride16q+strideq]
+%endif
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+%if ARCH_X86_64
+    mova   [dstq+strideq*1+ 0], m11
+    mova   [dstq+strideq*1+16], m10
+    mova   [dstq+strideq*1+32], m9
+    mova   [dstq+strideq*1+48], m8
+%endif
+    mova   [dstq+stride16q+ 0], m6
+    mova   [dstq+stride16q+16], m4
+    mova   [dstq+stride16q+32], m1
+    mova   [dstq+stride16q+48], m2
+%if ARCH_X86_64
+    mova   [dstq+stride17q+ 0], m12
+    mova   [dstq+stride17q+16], m11
+    mova   [dstq+stride17q+32], m10
+    mova   [dstq+stride17q+48], m9
+%endif
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m3, m2,  14, m5
+    PALIGNR                 m2, m1,  14, m5
+    PALIGNR                 m1, m4,  14, m5
+    PALIGNR                 m4, m6,  14, m5
+    PALIGNR                 m6, m7,  14, m5
+    pslldq                  m7, 2
+%if ARCH_X86_64
+    PALIGNR                 m8, m9,  14, m5
+    PALIGNR                 m9, m10, 14, m5
+    PALIGNR                m10, m11, 14, m5
+    PALIGNR                m11, m12, 14, m5
+    PALIGNR                m12, m0,  14, m5
+    pslldq                  m0, 2
+%endif
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    UNSCRATCH                5, 12, rsp+4*mmsize
+    UNSCRATCH                4, 11, rsp+3*mmsize
+    UNSCRATCH                3, 10, rsp+2*mmsize
+    UNSCRATCH                2,  9, rsp+1*mmsize
+    UNSCRATCH                1,  8, rsp+0*mmsize
+    mov                   dstq, dstm
+    mov                   cntd, 8
+    add                   dstq, strideq
+.loop2:
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m3
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m1
+    mova   [dstq+stride16q+ 0], m5
+    mova   [dstq+stride16q+16], m4
+    mova   [dstq+stride16q+32], m3
+    mova   [dstq+stride16q+48], m2
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m1, m2,  14, m6
+    PALIGNR                 m2, m3,  14, m6
+    PALIGNR                 m3, m4,  14, m6
+    PALIGNR                 m4, m5,  14, m6
+    PALIGNR                 m5, m0,  14, m6
+    pslldq                  m0, 2
+    dec                   cntd
+    jg .loop2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS
+INIT_XMM ssse3
+VR_FUNCS
+INIT_XMM avx
+VR_FUNCS
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
+    movh                    m0, [lq]                ; abcd
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
+%else
+    punpcklqdq              m0, m0
+    pshufhw                 m0, m0, q3333           ; abcddddd
+%endif
+    psrldq                  m1, m0,  2              ; bcddddd.
+    psrldq                  m2, m0,  4              ; cddddd..
+    LOWPASS                  2,  1,  0              ; BCDddd..
+    pavgw                   m1, m0                  ; abcddddd
+    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
+    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+strideq*0], m1                  ; aBbC
+    movh      [dstq+strideq*1], m2                  ; bCcD
+    movhps    [dstq+strideq*2], m1                  ; cDdd
+    movhps    [dstq+stride3q ], m2                  ; dddd
+    RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
+    mova                    m0, [lq]
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m3
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    SBUTTERFLY          wd,  1,  2,  0
+    shufps                  m0, m1, m2, q1032
+    pshufd                  m3, m2, q3332
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova     [dstq+strideq *0], m1
+    mova     [dstq+strideq *2], m0
+    mova     [dstq+strideq *4], m2
+    mova     [dstq+stride3q*2], m3
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m2, m1, 4
+%else
+    PALIGNR                 m0, m2, m1, 4, m3
+    mova                    m1, m0
+%endif
+    pshufd                  m2, m2, q3321
+    shufps                  m0, m1, m2, q1032
+    pshufd                  m3, m2, q3332
+    mova     [dstq+strideq *0], m1
+    mova     [dstq+strideq *2], m0
+    mova     [dstq+strideq *4], m2
+    mova     [dstq+stride3q*2], m3
+    RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m3, [lq+mmsize]
+    movu                    m1, [lq+2]
+    movu                    m2, [lq+4]
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    SBUTTERFLY           wd, 1,  2,  0
+%if cpuflag(ssse3)
+    mova                    m5, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m0, m4, m3, m5
+    LOWPASS                  4,  0,  3
+    pavgw                   m3, m0
+    SBUTTERFLY           wd, 3,  4,  5
+    pshufd                  m0, m0, q3333
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+
+.loop:
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m2
+    mova  [dstq+strideq *4+ 0], m2
+    mova  [dstq+strideq *4+16], m3
+    mova  [dstq+strideq *8+ 0], m3
+    mova  [dstq+strideq *8+16], m4
+    mova  [dstq+stride3q*4+ 0], m4
+    mova  [dstq+stride3q*4+16], m0
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m2, m1, 4
+    vpalignr                m2, m3, m2, 4
+    vpalignr                m3, m4, m3, 4
+    vpalignr                m4, m0, m4, 4
+%else
+    PALIGNR                 m5, m2, m1, 4, m6
+    mova                    m1, m5
+    PALIGNR                 m5, m3, m2, 4, m6
+    mova                    m2, m5
+    PALIGNR                 m5, m4, m3, 4, m6
+    mova                    m3, m5
+    PALIGNR                 m5, m0, m4, 4, m6
+    mova                    m4, m5
+%endif
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
+    mova                    m2, [lq+mmsize*0+0]
+    movu                    m1, [lq+mmsize*0+2]
+    movu                    m0, [lq+mmsize*0+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    SBUTTERFLY           wd, 1,  0,  2
+    SCRATCH                  1,  8, rsp+0*mmsize
+    mova                    m4, [lq+mmsize*1+0]
+    movu                    m3, [lq+mmsize*1+2]
+    movu                    m2, [lq+mmsize*1+4]
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 3,  2,  4
+    mova                    m6, [lq+mmsize*2+0]
+    movu                    m5, [lq+mmsize*2+2]
+    movu                    m4, [lq+mmsize*2+4]
+    LOWPASS                  4,  5,  6
+    pavgw                   m5, m6
+    SBUTTERFLY           wd, 5,  4,  6
+    mova                    m7, [lq+mmsize*3+0]
+    SCRATCH                  0,  9, rsp+1*mmsize
+%if cpuflag(ssse3)
+    mova                    m0, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m6, m7, m0
+    LOWPASS                  6,  1,  7
+    pavgw                   m7, m1
+    SBUTTERFLY           wd, 7,  6,  0
+    pshufd                  m1, m1, q3333
+    UNSCRATCH                0,  9, rsp+1*mmsize
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    lea               stride3q, [strideq*3]
+    lea               stride4q, [strideq*4]
+    lea              stride28q, [stride4q*8]
+    lea              stride20q, [stride4q*5]
+    sub              stride28q, stride4q
+    mov                   cntd, 4
+
+.loop:
+%if ARCH_X86_64
+    SWAP                     1,  8
+%else
+    mova        [rsp+1*mmsize], m1
+    mova                    m1, [rsp+0*mmsize]
+%endif
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m0
+    mova  [dstq+strideq *0+32], m3
+    mova  [dstq+strideq *0+48], m2
+    mova  [dstq+stride4q*1+ 0], m0
+    mova  [dstq+stride4q*1+16], m3
+    mova  [dstq+stride4q*1+32], m2
+    mova  [dstq+stride4q*1+48], m5
+    mova  [dstq+stride4q*2+ 0], m3
+    mova  [dstq+stride4q*2+16], m2
+    mova  [dstq+stride4q*2+32], m5
+    mova  [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+    vpalignr                m1, m0, m1, 4
+    vpalignr                m0, m3, m0, 4
+    vpalignr                m3, m2, m3, 4
+%else
+    SCRATCH                  6,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  7, 10, rsp+3*mmsize
+%endif
+    PALIGNR                 m6, m0, m1, 4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3, m0, 4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2, m3, 4, m7
+    mova                    m3, m6
+    UNSCRATCH                6,  9, rsp+2*mmsize
+    SCRATCH                  0,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 10, rsp+3*mmsize
+    SCRATCH                  3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+    SWAP                     1,  8
+%else
+    mova        [rsp+0*mmsize], m1
+    mova                    m1, [rsp+1*mmsize]
+%endif
+    mova  [dstq+stride3q*4+ 0], m2
+    mova  [dstq+stride3q*4+16], m5
+    mova  [dstq+stride3q*4+32], m4
+    mova  [dstq+stride3q*4+48], m7
+    mova  [dstq+stride4q*4+ 0], m5
+    mova  [dstq+stride4q*4+16], m4
+    mova  [dstq+stride4q*4+32], m7
+    mova  [dstq+stride4q*4+48], m6
+    mova  [dstq+stride20q + 0], m4
+    mova  [dstq+stride20q +16], m7
+    mova  [dstq+stride20q +32], m6
+    mova  [dstq+stride20q +48], m1
+    mova  [dstq+stride3q*8+ 0], m7
+    mova  [dstq+stride3q*8+16], m6
+    mova  [dstq+stride3q*8+32], m1
+    mova  [dstq+stride3q*8+48], m1
+    mova  [dstq+stride28q + 0], m6
+    mova  [dstq+stride28q +16], m1
+    mova  [dstq+stride28q +32], m1
+    mova  [dstq+stride28q +48], m1
+%if cpuflag(avx)
+    vpalignr                m2, m5, m2, 4
+    vpalignr                m5, m4, m5, 4
+    vpalignr                m4, m7, m4, 4
+    vpalignr                m7, m6, m7, 4
+    vpalignr                m6, m1, m6, 4
+%else
+    PALIGNR                 m0, m5, m2, 4, m3
+    mova                    m2, m0
+    PALIGNR                 m0, m4, m5, 4, m3
+    mova                    m5, m0
+    PALIGNR                 m0, m7, m4, 4, m3
+    mova                    m4, m0
+    PALIGNR                 m0, m6, m7, 4, m3
+    mova                    m7, m0
+    PALIGNR                 m0, m1, m6, 4, m3
+    mova                    m6, m0
+    UNSCRATCH                0,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                3, 10, rsp+3*mmsize
+%endif
+%endif
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4
+INIT_XMM ssse3
+HU_FUNCS 3
+INIT_XMM avx
+HU_FUNCS 2
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
+    movh                    m0, [lq]
+    movhps                  m0, [aq-2]
+    psrldq                  m1, m0, 2
+    psrldq                  m2, m0, 4
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    punpcklwd               m1, m2
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+stride3q ], m1
+    movhps    [dstq+strideq*1], m1
+    movhlps                 m2, m2
+    PALIGNR                 m2, m1, 4, m0
+    movh      [dstq+strideq*2], m2
+    movhps    [dstq+strideq*0], m2
+    RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]
+    movu                    m1, [aq-2]
+    PALIGNR                 m2, m1, m0, 2, m3
+    PALIGNR                 m3, m1, m0, 4, m4
+    LOWPASS                  3,  2,  0
+    pavgw                   m2, m0
+    SBUTTERFLY           wd, 2,  3,  0
+    psrldq                  m0, m1,  2
+    psrldq                  m4, m1,  4
+    LOWPASS                  1,  0,  4
+    DEFINE_ARGS dst8, mstride, cnt
+    lea                  dst8q, [dst8q+mstrideq*8]
+    neg               mstrideq
+    mov                   cntd, 4
+
+.loop:
+    add                  dst8q, mstrideq
+    mova    [dst8q+mstrideq*0], m2
+    mova    [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+    vpalignr                m2, m3, m2, 4
+    vpalignr                m3, m1, m3, 4
+%else
+    PALIGNR                 m0, m3, m2, 4, m4
+    mova                    m2, m0
+    PALIGNR                 m0, m1, m3, 4, m4
+    mova                    m3, m0
+%endif
+    psrldq                  m1, 4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
+    mova                    m2, [lq]
+    movu                    m1, [lq+2]
+    movu                    m0, [lq+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    mova                    m4, [lq+mmsize]
+    movu                    m5, [aq-2]
+    PALIGNR                 m3, m5, m4, 2, m6
+    PALIGNR                 m2, m5, m4, 4, m6
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 1,  0,  4
+    SBUTTERFLY           wd, 3,  2,  4
+    mova                    m6, [aq]
+    movu                    m4, [aq+2]
+    LOWPASS                  4,  6,  5
+    movu                    m5, [aq+mmsize-2]
+    psrldq                  m6, m5,  2
+    psrldq                  m7, m5,  4
+    LOWPASS                  5,  6,  7
+    DEFINE_ARGS dst, mstride, mstride3, cnt
+    lea                   dstq, [dstq+mstrideq*8]
+    lea                   dstq, [dstq+mstrideq*8]
+    neg               mstrideq
+    lea              mstride3q, [mstrideq*3]
+    mov                   cntd, 4
+
+.loop:
+    add                  dstq, mstrideq
+    mova [dstq+mstride3q*4+ 0], m2
+    mova [dstq+mstride3q*4+16], m4
+    mova [dstq+mstrideq *8+ 0], m3
+    mova [dstq+mstrideq *8+16], m2
+    mova [dstq+mstrideq *4+ 0], m0
+    mova [dstq+mstrideq *4+16], m3
+    mova [dstq+mstrideq *0+ 0], m1
+    mova [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+    vpalignr                m1, m0, m1, 4
+    vpalignr                m0, m3, m0, 4
+    vpalignr                m3, m2, m3, 4
+    vpalignr                m2, m4, m2, 4
+    vpalignr                m4, m5, m4, 4
+%else
+    PALIGNR                 m6, m0, m1, 4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3, m0, 4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2, m3, 4, m7
+    mova                    m3, m6
+    PALIGNR                 m6, m4, m2, 4, m7
+    mova                    m2, m6
+    PALIGNR                 m6, m5, m4, 4, m7
+    mova                    m4, m6
+%endif
+    psrldq                  m5, 4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
+    mova                    m2, [lq+mmsize*0+0]
+    movu                    m1, [lq+mmsize*0+2]
+    movu                    m0, [lq+mmsize*0+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    SBUTTERFLY           wd, 1,  0,  2
+    mova                    m4, [lq+mmsize*1+0]
+    movu                    m3, [lq+mmsize*1+2]
+    movu                    m2, [lq+mmsize*1+4]
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 3,  2,  4
+    SCRATCH                  0,  8, rsp+0*mmsize
+    SCRATCH                  1,  9, rsp+1*mmsize
+    SCRATCH                  2, 10, rsp+2*mmsize
+    SCRATCH                  3, 11, rsp+3*mmsize
+    mova                    m6, [lq+mmsize*2+0]
+    movu                    m5, [lq+mmsize*2+2]
+    movu                    m4, [lq+mmsize*2+4]
+    LOWPASS                  4,  5,  6
+    pavgw                   m5, m6
+    SBUTTERFLY           wd, 5,  4,  6
+    mova                    m0, [lq+mmsize*3+0]
+    movu                    m1, [aq+mmsize*0-2]
+    PALIGNR                 m7, m1, m0, 2, m2
+    PALIGNR                 m6, m1, m0, 4, m2
+    LOWPASS                  6,  7,  0
+    pavgw                   m7, m0
+    SBUTTERFLY           wd, 7,  6,  0
+    mova                    m2, [aq+mmsize*0+0]
+    movu                    m0, [aq+mmsize*0+2]
+    LOWPASS                  0,  2,  1
+    movu                    m1, [aq+mmsize*1-2]
+    mova                    m2, [aq+mmsize*1+0]
+    movu                    m3, [aq+mmsize*1+2]
+    LOWPASS                  1,  2,  3
+    SCRATCH                  6, 12, rsp+6*mmsize
+    SCRATCH                  7, 13, rsp+7*mmsize
+    movu                    m2, [aq+mmsize*2-2]
+    mova                    m3, [aq+mmsize*2+0]
+    movu                    m6, [aq+mmsize*2+2]
+    LOWPASS                  2,  3,  6
+    movu                    m3, [aq+mmsize*3-2]
+    psrldq                  m6, m3,  2
+    psrldq                  m7, m3,  4
+    LOWPASS                  3,  6,  7
+    UNSCRATCH                6, 12, rsp+6*mmsize
+    UNSCRATCH                7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+    mova        [rsp+4*mmsize], m4
+    mova        [rsp+5*mmsize], m5
+    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+    ; to do it again here
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    mov                   cntd, 4
+    lea               stride3q, [strideq*3]
+%if ARCH_X86_64
+    lea               stride4q, [strideq*4]
+    lea              stride28q, [stride4q*8]
+    lea              stride20q, [stride4q*5]
+    sub              stride28q, stride4q
+%endif
+    add                   dstq, stride3q
+
+    ; x86-32 doesn't have enough registers, so on that platform, we split
+    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:
+%if ARCH_X86_64
+    mova  [dstq+stride28q + 0], m9
+    mova  [dstq+stride28q +16], m8
+    mova  [dstq+stride28q +32], m11
+    mova  [dstq+stride28q +48], m10
+    mova  [dstq+stride3q*8+ 0], m8
+    mova  [dstq+stride3q*8+16], m11
+    mova  [dstq+stride3q*8+32], m10
+    mova  [dstq+stride3q*8+48], m5
+    mova  [dstq+stride20q + 0], m11
+    mova  [dstq+stride20q +16], m10
+    mova  [dstq+stride20q +32], m5
+    mova  [dstq+stride20q +48], m4
+    mova  [dstq+stride4q*4+ 0], m10
+    mova  [dstq+stride4q*4+16], m5
+    mova  [dstq+stride4q*4+32], m4
+    mova  [dstq+stride4q*4+48], m7
+%endif
+    mova  [dstq+stride3q*4+ 0], m5
+    mova  [dstq+stride3q*4+16], m4
+    mova  [dstq+stride3q*4+32], m7
+    mova  [dstq+stride3q*4+48], m6
+    mova  [dstq+strideq* 8+ 0], m4
+    mova  [dstq+strideq* 8+16], m7
+    mova  [dstq+strideq* 8+32], m6
+    mova  [dstq+strideq* 8+48], m0
+    mova  [dstq+strideq* 4+ 0], m7
+    mova  [dstq+strideq* 4+16], m6
+    mova  [dstq+strideq* 4+32], m0
+    mova  [dstq+strideq* 4+48], m1
+    mova  [dstq+strideq* 0+ 0], m6
+    mova  [dstq+strideq* 0+16], m0
+    mova  [dstq+strideq* 0+32], m1
+    mova  [dstq+strideq* 0+48], m2
+    sub                   dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+    vpalignr                m9, m8,  m9,  4
+    vpalignr                m8, m11, m8,  4
+    vpalignr               m11, m10, m11, 4
+    vpalignr               m10, m5,  m10, 4
+%endif
+    vpalignr                m5, m4,  m5,  4
+    vpalignr                m4, m7,  m4,  4
+    vpalignr                m7, m6,  m7,  4
+    vpalignr                m6, m0,  m6,  4
+    vpalignr                m0, m1,  m0,  4
+    vpalignr                m1, m2,  m1,  4
+    vpalignr                m2, m3,  m2,  4
+%else
+%if ARCH_X86_64
+    PALIGNR                m12, m8,  m9,  4, m13
+    mova                    m9, m12
+    PALIGNR                m12, m11, m8,  4, m13
+    mova                    m8, m12
+    PALIGNR                m12, m10, m11, 4, m13
+    mova                   m11, m12
+    PALIGNR                m12, m5,  m10, 4, m13
+    mova                   m10, m12
+%endif
+    SCRATCH                  3, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+    SCRATCH                  2, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m3, m4,  m5,  4, m2
+    mova                    m5, m3
+    PALIGNR                 m3, m7,  m4,  4, m2
+    mova                    m4, m3
+    PALIGNR                 m3, m6,  m7,  4, m2
+    mova                    m7, m3
+    PALIGNR                 m3, m0,  m6,  4, m2
+    mova                    m6, m3
+    PALIGNR                 m3, m1,  m0,  4, m2
+    mova                    m0, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH                2, 13, rsp+9*mmsize
+    SCRATCH                  0, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m3, m2,  m1,  4, m0
+    mova                    m1, m3
+    PALIGNR                 m3, reg_sh,  m2,  4, m0
+    mova                    m2, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH                0, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH                3, 12, rsp+8*mmsize, sh
+%endif
+    psrldq                  m3, 4
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    UNSCRATCH                0,  8, rsp+0*mmsize
+    UNSCRATCH                1,  9, rsp+1*mmsize
+    UNSCRATCH                2, 10, rsp+2*mmsize
+    UNSCRATCH                3, 11, rsp+3*mmsize
+    mova                    m4, [rsp+4*mmsize]
+    mova                    m5, [rsp+5*mmsize]
+    mova                    m6, [rsp+6*mmsize]
+    mova                    m7, [rsp+7*mmsize]
+    DEFINE_ARGS dst, stride, stride5, stride3
+    lea               stride5q, [strideq*5]
+    lea                   dstq, [dstq+stride5q*4]
+    DEFINE_ARGS dst, stride, cnt, stride3
+    mov                   cntd, 4
+.loop_2:
+    mova  [dstq+stride3q*4+ 0], m1
+    mova  [dstq+stride3q*4+16], m0
+    mova  [dstq+stride3q*4+32], m3
+    mova  [dstq+stride3q*4+48], m2
+    mova  [dstq+strideq* 8+ 0], m0
+    mova  [dstq+strideq* 8+16], m3
+    mova  [dstq+strideq* 8+32], m2
+    mova  [dstq+strideq* 8+48], m5
+    mova  [dstq+strideq* 4+ 0], m3
+    mova  [dstq+strideq* 4+16], m2
+    mova  [dstq+strideq* 4+32], m5
+    mova  [dstq+strideq* 4+48], m4
+    mova  [dstq+strideq* 0+ 0], m2
+    mova  [dstq+strideq* 0+16], m5
+    mova  [dstq+strideq* 0+32], m4
+    mova  [dstq+strideq* 0+48], m7
+    sub                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m0,  m1,  4
+    vpalignr                m0, m3,  m0,  4
+    vpalignr                m3, m2,  m3,  4
+    vpalignr                m2, m5,  m2,  4
+    vpalignr                m5, m4,  m5,  4
+    vpalignr                m4, m7,  m4,  4
+    vpalignr                m7, m6,  m7,  4
+%else
+    SCRATCH                  6, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+    SCRATCH                  7, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m6, m0,  m1,  4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3,  m0,  4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2,  m3,  4, m7
+    mova                    m3, m6
+    PALIGNR                 m6, m5,  m2,  4, m7
+    mova                    m2, m6
+    PALIGNR                 m6, m4,  m5,  4, m7
+    mova                    m5, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 13, rsp+9*mmsize
+    SCRATCH                  5, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m6, m7,  m4,  4, m5
+    mova                    m4, m6
+    PALIGNR                 m6, reg_sh,  m7,  4, m5
+    mova                    m7, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                5, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH                6, 12, rsp+8*mmsize, sh
+%endif
+    psrldq                  m6, 4
+    dec                   cntd
+    jg .loop_2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS
+INIT_XMM ssse3
+HD_FUNCS
+INIT_XMM avx
+HD_FUNCS
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000..2c63fe5
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,3197 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA 32
+
+%macro VP9_IDCT_COEFFS 2-3 0
+const pw_m%1_%2
+times 8 dw -%1,  %2
+const pw_%2_%1
+times 8 dw  %2,  %1
+
+%if %3 == 1
+const pw_m%2_m%1
+times 8 dw -%2, -%1
+%if %1 != %2
+const pw_m%2_%1
+times 8 dw -%2,  %1
+const pw_%1_%2
+times 8 dw  %1,  %2
+%endif
+%endif
+
+%if %1 < 11585
+pw_m%1x2:   times 16 dw -%1*2
+%elif %1 > 11585
+pw_%1x2:    times 16 dw  %1*2
+%else
+const pw_%1x2
+times 16 dw %1*2
+%endif
+
+%if %2 != %1
+pw_%2x2:    times 16 dw  %2*2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 16364,   804
+VP9_IDCT_COEFFS 16305,  1606
+VP9_IDCT_COEFFS 16069,  3196, 1
+VP9_IDCT_COEFFS 15893,  3981
+VP9_IDCT_COEFFS 15137,  6270, 1
+VP9_IDCT_COEFFS 14811,  7005
+VP9_IDCT_COEFFS 14449,  7723
+VP9_IDCT_COEFFS 13160,  9760
+VP9_IDCT_COEFFS 11585, 11585, 1
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS  9102, 13623, 1
+VP9_IDCT_COEFFS  8423, 14053
+VP9_IDCT_COEFFS  5520, 15426
+VP9_IDCT_COEFFS  4756, 15679
+VP9_IDCT_COEFFS  2404, 16207
+
+const pw_5283_13377
+times 4 dw 5283, 13377
+const pw_9929_13377
+times 4 dw 9929, 13377
+const pw_15212_m13377
+times 4 dw 15212, -13377
+const pw_15212_9929
+times 4 dw 15212, 9929
+const pw_m5283_m15212
+times 4 dw -5283, -15212
+const pw_13377x2
+times 8 dw 13377*2
+const pw_m13377_13377
+times 4 dw -13377, 13377
+const pw_13377_0
+times 4 dw 13377, 0
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+cextern pd_8192
+
+SECTION .text
+
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+    punpckhwd          m%4, m%2, m%1
+    punpcklwd          m%2, m%1
+    pmaddwd            m%3, m%4, [pw_m%5_%6]
+    pmaddwd            m%4, [pw_%6_%5]
+    pmaddwd            m%1, m%2, [pw_m%5_%6]
+    pmaddwd            m%2, [pw_%6_%5]
+%endmacro
+
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+    SUMSUB_BA            d, %1, %2, %5
+    SUMSUB_BA            d, %3, %4, %5
+    paddd              m%1, %6
+    paddd              m%2, %6
+    paddd              m%3, %6
+    paddd              m%4, %6
+    psrad              m%1, 14
+    psrad              m%2, 14
+    psrad              m%3, 14
+    psrad              m%4, 14
+    packssdw           m%1, m%3
+    packssdw           m%2, m%4
+%endmacro
+
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+%if mmsize == 32
+    pmovzxbw           m%3, [%6]
+    pmovzxbw           m%4, [%6+strideq]
+%else
+    movh               m%3, [%6]
+    movh               m%4, [%6+strideq]
+    punpcklbw          m%3, m%5
+    punpcklbw          m%4, m%5
+%endif
+    paddw              m%3, m%1
+    paddw              m%4, m%2
+%if mmsize == 32
+    packuswb           m%3, m%4
+    ; Intel...
+    vpermq             m%3, m%3, q3120
+    mova              [%6], xm%3
+    vextracti128 [%6+strideq], m%3, 1
+%elif mmsize == 16
+    packuswb           m%3, m%4
+    movh              [%6], m%3
+    movhps    [%6+strideq], m%3
+%else
+    packuswb           m%3, m%5
+    packuswb           m%4, m%5
+    movh              [%6], m%3
+    movh      [%6+strideq], m%4
+%endif
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+    mova      [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+    mova                m0, [blockq+0*8]
+    mova                m1, [blockq+1*8]
+    mova                m2, [blockq+2*8]
+    mova                m3, [blockq+3*8]
+    psraw               m0, 2
+    psraw               m1, 2
+    psraw               m2, 2
+    psraw               m3, 2
+
+    VP9_IWHT4_1D
+    TRANSPOSE4x4W        0, 1, 2, 3, 4
+    VP9_IWHT4_1D
+
+    pxor                m4, m4
+    VP9_STORE_2X         0, 1, 5, 6, 4
+    lea               dstq, [dstq+strideq*2]
+    VP9_STORE_2X         2, 3, 5, 6, 4
+    ZERO_BLOCK      blockq, 8, 4, m4
+    RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+    pmulhrsw            m0, m5                              ; m0=t1
+    mova                m2, m0                              ; m2=t0
+    mova                m3, m1
+    pmulhrsw            m1, m6                              ; m1=t2
+    pmulhrsw            m3, m7                              ; m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    pmulhrsw            m1, m5
+%else
+    mova                m5, [pw_8]
+    paddw               m0, m5
+    paddw               m1, m5
+    psraw               m0, 4
+    psraw               m1, 4
+%endif
+    VP9_STORE_2X         0,  1,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+%if cpuflag(ssse3)
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    VP9_STORE_2X         2,  3,  6,  7,  4
+%endmacro
+
+%macro IDCT_4x4_FN 1
+INIT_MMX %1
+cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+    cmp eobd, 4 ; 2x2 or smaller
+    jg .idctfull
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct2x2
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5
+    pmulhrsw            m0, m5
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (8 << 14) + 8192
+    sar              coefd, 14 + 4
+    movd                m0, coefd
+%endif
+    pshufw              m0, m0, 0
+    pxor                m4, m4
+    movh          [blockq], m4
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+%if cpuflag(ssse3)
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+8]
+    mova                m5, [pw_11585x2]
+    mova                m6, [pw_6270x2]
+    mova                m7, [pw_15137x2]
+    VP9_IDCT4_2x2_1D
+    ; partial 2x4 transpose
+    punpcklwd           m0, m1
+    punpcklwd           m2, m3
+    SBUTTERFLY          dq, 0, 2, 1
+    SWAP                1, 2
+    VP9_IDCT4_2x2_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movh       [blockq+ 0], m4
+    movh       [blockq+ 8], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endif
+
+.idctfull: ; generic full 4x4 idct/idct
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IDCT_4x4_FN mmxext
+IDCT_4x4_FN ssse3
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%endif
+    movdqa            xmm5, [pd_8192]
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+    movdq2q             m7, xmm5
+%endif
+    VP9_%2_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_%4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
+IADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
+IADST4_FN iadst, IADST4, iadst, IADST4, sse2
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
+%macro SCRATCH 3
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova              [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+    SUMSUB_BA            w,  3,  6, 5                       ; m3=t0+t7, m6=t0-t7
+    SUMSUB_BA            w,  1,  2, 5                       ; m1=t1+t6, m2=t1-t6
+    SUMSUB_BA            w,  7,  0, 5                       ; m7=t2+t5, m0=t2-t5
+
+    UNSCRATCH            5, 8, blockq+ 0
+    SCRATCH              2, 8, blockq+ 0
+
+    SUMSUB_BA            w,  5,  4, 2                       ; m5=t3+t4, m4=t3-t4
+    SWAP                 7,  6,  2
+    SWAP                 3,  5,  0
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+%endmacro
+
+; x86-32
+; - in: m0/m4 is in mem
+; - out: m6 is in mem
+; x86-64:
+; - everything is in registers (m0-7)
+%macro VP9_IDCT8_1D 0
+%if ARCH_X86_64
+    SWAP                 0, 8
+    SWAP                 4, 9
+%endif
+
+    VP9_UNPACK_MULSUB_2W_4X 5,  3,  9102, 13623, D_8192_REG, 0, 4  ; m5=t5a, m3=t6a
+    VP9_UNPACK_MULSUB_2W_4X 1,  7, 16069,  3196, D_8192_REG, 0, 4  ; m1=t4a, m7=t7a
+    SUMSUB_BA            w,  5,  1, 0                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3,  7, 0                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+%if cpuflag(ssse3)
+    SUMSUB_BA            w,  1,  7, 0                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+%else
+    VP9_UNPACK_MULSUB_2W_4X 7,  1, 11585, 11585, D_8192_REG, 0, 4
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 2,  6, 15137,  6270, D_8192_REG, 0, 4  ; m2=t2a, m6=t3a
+
+    UNSCRATCH            0, 8, blockq+ 0    ; IN(0)
+    UNSCRATCH            4, 9, blockq+64    ; IN(4)
+    SCRATCH              5, 8, blockq+ 0
+
+%if cpuflag(ssse3)
+    SUMSUB_BA            w, 4, 0, 5                         ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
+    pmulhrsw            m4, W_11585x2_REG                   ; m4=t0a
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a
+%else
+    SCRATCH              7, 9, blockq+64
+    VP9_UNPACK_MULSUB_2W_4X 0,  4, 11585, 11585, D_8192_REG, 5, 7
+    UNSCRATCH            7, 9, blockq+64
+%endif
+    SUMSUB_BA            w,  6,  4, 5                       ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
+    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a/t0a
+    pmulhrsw            m6, m2, [pw_15137x2]                ; m6=t3a
+    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
+    pmulhrsw            m7, m1, [pw_16069x2]                ; m7=t7a
+    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
+    pmulhrsw            m5, m3, [pw_m9102x2]                ; m5=t5a
+    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
+    SUMSUB_BA            w,  5,  1, 4                       ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3,  7, 4                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+    SUMSUB_BA            w,  1,  7, 4                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+    psubw               m4, m0, m6                          ; m4=t0a-t3a (t3)
+    paddw               m6, m0                              ; m6=t0a+t3a (t0)
+    SCRATCH              5,  8, blockq+ 0
+    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_2x2_1D 1
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t0
+    pmulhrsw            m3, m1, W_16069x2_REG               ; m3=t7
+    pmulhrsw            m1, W_3196x2_REG                    ; m1=t4
+    psubw               m7, m3, m1                          ; t5 = t7a - t4a
+    paddw               m5, m3, m1                          ; t6 = t7a + t4a
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
+    SWAP                 5,  1
+    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
+    psubw               m6, m0, m3                          ; m6=t0-t7
+    paddw               m3, m0                              ; m3=t0+t7
+    psubw               m2, m0, m1                          ; m2=t1-t6
+    paddw               m1, m0                              ; m1=t1+t6
+%if %1 == 1
+    punpcklwd           m3, m1
+%define SCRATCH_REG 1
+%elif ARCH_X86_32
+    mova       [blockq+ 0], m2
+%define SCRATCH_REG 2
+%else
+%define SCRATCH_REG 8
+%endif
+    psubw               m4, m0, m5                          ; m4=t3-t4
+    paddw               m5, m0                              ; m5=t3+t4
+    SUMSUB_BA            w,  7,  0, SCRATCH_REG             ; m7=t2+t5, m0=t2-t5
+    SWAP                 7,  6,  2
+    SWAP                 3,  5,  0
+%undef SCRATCH_REG
+%endmacro
+
+%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
+%if cpuflag(ssse3)
+    pmulhrsw           m%1, %6              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+    pmulhrsw           m%2, %6
+%else
+    paddw              m%1, %6
+    paddw              m%2, %6
+    psraw              m%1, %7
+    psraw              m%2, %7
+%endif
+%if %0 <= 7
+    VP9_STORE_2X        %1, %2, %3, %4, %5
+%else
+    VP9_STORE_2X        %1, %2, %3, %4, %5, %8
+%endif
+%endmacro
+
+; x86-32:
+; - m6 is in mem
+; x86-64:
+; - m8 holds m6 (SWAP)
+; m6 holds zero
+%macro VP9_IDCT8_WRITEOUT 0
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+    mova                m9, [pw_1024]
+%else
+    mova                m9, [pw_16]
+%endif
+%define ROUND_REG m9
+%else
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_1024]
+%else
+%define ROUND_REG [pw_16]
+%endif
+%endif
+    SCRATCH              5, 10, blockq+16
+    SCRATCH              7, 11, blockq+32
+    VP9_IDCT8_WRITEx2    0,  1, 5, 7, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    2,  3, 5, 7, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    UNSCRATCH            5, 10, blockq+16
+    UNSCRATCH            7, 11, blockq+32
+    VP9_IDCT8_WRITEx2    4,  5, 0, 1, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    UNSCRATCH            5, 8, blockq+ 0
+    VP9_IDCT8_WRITEx2    5,  7, 0, 1, 6, ROUND_REG
+
+%undef ROUND_REG
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+    mova               m12, [pw_11585x2]    ; often used
+%define W_11585x2_REG m12
+%else
+%define W_11585x2_REG [pw_11585x2]
+%endif
+
+    cmp eobd, 12 ; top left half or less
+    jg .idctfull
+
+    cmp eobd, 3  ; top left corner or less
+    jg .idcthalf
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idcttopleftcorner
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    pmulhrsw            m0, W_11585x2_REG
+    pmulhrsw            m0, W_11585x2_REG
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (16 << 14) + 8192
+    sar              coefd, 14 + 5
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, 0
+    pxor                m4, m4
+    movd          [blockq], m4
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_1024]       ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+%endif
+%rep 3
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+%if cpuflag(ssse3)
+; faster path for when only left corner is set (3 input: DC, right to DC, below
+; to DC). Note: also working with a 2x2 block
+.idcttopleftcorner:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+16]
+%if ARCH_X86_64
+    mova               m10, [pw_3196x2]
+    mova               m11, [pw_16069x2]
+%define W_3196x2_REG m10
+%define W_16069x2_REG m11
+%else
+%define W_3196x2_REG [pw_3196x2]
+%define W_16069x2_REG [pw_16069x2]
+%endif
+    VP9_IDCT8_2x2_1D 1
+    ; partial 2x8 transpose
+    ; punpcklwd m0, m1 already done inside idct
+    punpcklwd           m2, m3
+    punpcklwd           m4, m5
+    punpcklwd           m6, m7
+    punpckldq           m0, m2
+    punpckldq           m4, m6
+    SBUTTERFLY         qdq, 0, 4, 1
+    SWAP                 1, 4
+    VP9_IDCT8_2x2_1D 2
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+    movd       [blockq+ 0], m6
+    movd       [blockq+16], m6
+%else
+    mova       [blockq+ 0], m6
+    mova       [blockq+16], m6
+    mova       [blockq+32], m6
+%endif
+    RET
+
+.idcthalf:
+    movh                m0, [blockq + 0]
+    movh                m1, [blockq +16]
+    movh                m2, [blockq +32]
+    movh                m3, [blockq +48]
+    VP9_IDCT8_4x4_1D
+    ; partial 4x8 transpose
+%if ARCH_X86_32
+    mova                m6, [blockq+ 0]
+%endif
+    punpcklwd           m0, m1
+    punpcklwd           m2, m3
+    punpcklwd           m4, m5
+    punpcklwd           m6, m7
+    SBUTTERFLY          dq, 0, 2, 1
+    SBUTTERFLY          dq, 4, 6, 5
+    SBUTTERFLY         qdq, 0, 4, 1
+    SBUTTERFLY         qdq, 2, 6, 5
+    SWAP                 1, 4
+    SWAP                 3, 6
+    VP9_IDCT8_4x4_1D
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6
+    VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+    movh       [blockq+ 0], m6
+    movh       [blockq+16], m6
+    movh       [blockq+32], m6
+%else
+    mova       [blockq+ 0], m6
+    mova       [blockq+16], m6
+    mova       [blockq+32], m6
+%endif
+    movh       [blockq+48], m6
+    RET
+%endif
+
+.idctfull: ; generic full 8x8 idct/idct
+%if ARCH_X86_64
+    mova                m0, [blockq+  0]    ; IN(0)
+%endif
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+    mova                m3, [blockq+ 48]    ; IN(3)
+%if ARCH_X86_64
+    mova                m4, [blockq+ 64]    ; IN(4)
+%endif
+    mova                m5, [blockq+ 80]    ; IN(5)
+    mova                m6, [blockq+ 96]    ; IN(6)
+    mova                m7, [blockq+112]    ; IN(7)
+%if ARCH_X86_64
+    mova               m11, [pd_8192]       ; rounding
+%define D_8192_REG m11
+%else
+%define D_8192_REG [pd_8192]
+%endif
+    VP9_IDCT8_1D
+%if ARCH_X86_64
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+    mova        [blockq+0], m0
+%endif
+    VP9_IDCT8_1D
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+    ZERO_BLOCK      blockq, 16, 8, m6
+    RET
+%undef W_11585x2_REG
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
+VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-32:
+; - in: m0/3/4/7 are in mem [blockq+N*16]
+; - out: m6 is in mem [blockq+0]
+; x86-64:
+; - everything is in registers
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     3, 9
+    SWAP                     4, 10
+    SWAP                     7, 11
+%endif
+
+    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/2=t3[d], m2/4=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
+    SCRATCH                  4, 12, blockq+1*16
+    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3, 4, D_8192_REG  ; m6=t2[w], m2=t6[w]
+    UNSCRATCH                4, 12, blockq+1*16
+    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0, 3, D_8192_REG  ; m1=t3[w], m5=t7[w]
+
+    UNSCRATCH                0,  8, blockq+16*0
+    UNSCRATCH                3,  9, blockq+16*3
+    UNSCRATCH                4, 10, blockq+16*4
+    UNSCRATCH                7, 11, blockq+16*7
+    SCRATCH                  1,  8, blockq+16*1
+    SCRATCH                  2,  9, blockq+16*2
+    SCRATCH                  5, 10, blockq+16*5
+    SCRATCH                  6, 11, blockq+16*6
+
+    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
+    SCRATCH                  1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2, 1, D_8192_REG  ; m4=t0[w], m0=t4[w]
+    UNSCRATCH                1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1, 2, D_8192_REG  ; m3=t1[w], m7=t5[w]
+
+    UNSCRATCH                2,  9, blockq+16*2
+    UNSCRATCH                5, 10, blockq+16*5
+    SCRATCH                  3,  9, blockq+16*3
+    SCRATCH                  4, 10, blockq+16*4
+
+    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
+
+    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
+    SCRATCH                  1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3, 1, D_8192_REG
+    UNSCRATCH                1, 12, blockq+ 0*16
+    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
+    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1, 3, D_8192_REG   ; m2=out6[w], m0=t7[w]
+
+    UNSCRATCH                1,  8, blockq+16*1
+    UNSCRATCH                3,  9, blockq+16*3
+    UNSCRATCH                4, 10, blockq+16*4
+    UNSCRATCH                6, 11, blockq+16*6
+    SCRATCH                  2,  8, blockq+16*0
+
+    SUMSUB_BA                w,  6,  4, 2                   ; m6=out0[w], m4=t2[w]
+    SUMSUB_BA                w,  1,  3, 2
+    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]
+
+    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
+
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  3,  4,  2
+    SUMSUB_BA                w,  0,  7,  2
+    pmulhrsw                m3, W_11585x2_REG
+    pmulhrsw                m7, W_11585x2_REG
+    pmulhrsw                m4, W_11585x2_REG               ; out4
+    pmulhrsw                m0, W_11585x2_REG               ; out2
+%else
+    SCRATCH                  5,  9, blockq+16*1
+    VP9_UNPACK_MULSUB_2W_4X  4, 3, 11585, 11585, D_8192_REG, 2, 5
+    VP9_UNPACK_MULSUB_2W_4X  7, 0, 11585, 11585, D_8192_REG, 2, 5
+    UNSCRATCH                5,  9, blockq+16*1
+%endif
+    PSIGNW                  m3, W_M1_REG                    ; out3
+    PSIGNW                  m7, W_M1_REG                    ; out5
+
+    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
+
+%if ARCH_X86_64
+    SWAP                     2, 8
+%endif
+    SWAP                     0, 6, 2
+    SWAP                     7, 1, 5
+%endmacro
+
+%macro IADST8_FN 6
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
+
+%ifidn %1, idct
+%define first_is_idct 1
+%else
+%define first_is_idct 0
+%endif
+
+%ifidn %3, idct
+%define second_is_idct 1
+%else
+%define second_is_idct 0
+%endif
+
+%if ARCH_X86_64
+    mova                m0, [blockq+  0]    ; IN(0)
+%endif
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+%if ARCH_X86_64 || first_is_idct
+    mova                m3, [blockq+ 48]    ; IN(3)
+%endif
+%if ARCH_X86_64
+    mova                m4, [blockq+ 64]    ; IN(4)
+%endif
+    mova                m5, [blockq+ 80]    ; IN(5)
+    mova                m6, [blockq+ 96]    ; IN(6)
+%if ARCH_X86_64 || first_is_idct
+    mova                m7, [blockq+112]    ; IN(7)
+%endif
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+    mova               m15, [pw_11585x2]    ; often used
+%endif
+    mova               m13, [pd_8192]       ; rounding
+    mova               m14, [pw_m1]
+%define W_11585x2_REG m15
+%define D_8192_REG m13
+%define W_M1_REG m14
+%else
+%define W_11585x2_REG [pw_11585x2]
+%define D_8192_REG [pd_8192]
+%define W_M1_REG [pw_m1]
+%endif
+
+    ; note different calling conventions for idct8 vs. iadst8 on x86-32
+    VP9_%2_1D
+%if ARCH_X86_64
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+    mova      [blockq+  0], m0
+%if second_is_idct == 0
+    mova      [blockq+ 48], m3
+    mova      [blockq+112], m7
+%endif
+%endif
+    VP9_%4_1D
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+    ZERO_BLOCK      blockq, 16, 8, m6
+    RET
+
+%undef W_11585x2_REG
+%undef first_is_idct
+%undef second_is_idct
+
+%endmacro
+
+IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
+IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
+IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
+IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
+IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
+IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
+IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-64:
+; at the end of this macro, m7 is stored in [%4+15*%5]
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
+; and the following simsubs have not been done yet:
+;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
+;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8
+
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
+%if %2 <= 4
+    mova                m3, [%1+ 1*%3]      ; IN(1)
+    mova                m0, [%1+ 3*%3]      ; IN(3)
+
+    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
+    pmulhrsw            m3, [pw_1606x2]             ; t8-9
+    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
+    pmulhrsw            m0, [pw_15679x2]            ; t12-13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
+    SCRATCH              4, 10, %4+ 1*%5
+    SCRATCH              5, 11, %4+ 7*%5
+    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+    UNSCRATCH            5, 11, %4+ 7*%5
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+    mova                m5, [%1+ 1*%3]      ; IN(1)
+    mova                m4, [%1+ 7*%3]      ; IN(7)
+%if %2 <= 8
+    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
+    pmulhrsw            m5, [pw_1606x2]             ; t8
+    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
+    pmulhrsw            m4, [pw_12665x2]            ; t14
+%else
+    mova                m3, [%1+ 9*%3]      ; IN(9)
+    mova                m2, [%1+15*%3]      ; IN(15)
+
+    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
+    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
+%endif
+
+    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
+    SUMSUB_BA            w,  4,  2, 0       ; t15, t14
+
+    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14
+
+    SCRATCH              4, 10, %4+ 1*%5
+    SCRATCH              5, 11, %4+ 7*%5
+
+    mova                m6, [%1+ 3*%3]      ; IN(3)
+    mova                m7, [%1+ 5*%3]      ; IN(5)
+%if %2 <= 8
+    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
+    pmulhrsw            m7, [pw_7723x2]             ; t10
+    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
+    pmulhrsw            m6, [pw_15679x2]            ; t12
+%else
+    mova                m0, [%1+11*%3]      ; IN(11)
+    mova                m1, [%1+13*%3]      ; IN(13)
+
+    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
+%endif
+
+    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
+    SUMSUB_BA            w,  0,  6, 4       ; t12, t13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+
+    UNSCRATCH            5, 11, %4+ 7*%5
+%endif
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11
+
+    ; backup first register
+    mova        [%4+15*%5], m7
+
+    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
+    UNSCRATCH            4, 10, %4+ 1*%5
+    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
+    SUMSUB_BA            w,  1,  5, 7       ; t14. t13
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  2,  5, 7
+    SUMSUB_BA            w,  3,  4, 7
+    pmulhrsw            m5, [pw_11585x2]    ; t10
+    pmulhrsw            m4, [pw_11585x2]    ; t11
+    pmulhrsw            m3, [pw_11585x2]    ; t12
+    pmulhrsw            m2, [pw_11585x2]    ; t13
+%else
+    SCRATCH              6, 10, %4+ 1*%5
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
+    UNSCRATCH            6, 10, %4+ 1*%5
+%endif
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+    SCRATCH              0,  8, %4+ 1*%5
+    SCRATCH              1,  9, %4+ 3*%5
+    SCRATCH              2, 10, %4+ 5*%5
+    SCRATCH              3, 11, %4+ 7*%5
+    SCRATCH              4, 12, %4+ 9*%5
+    SCRATCH              5, 13, %4+11*%5
+    SCRATCH              6, 14, %4+13*%5
+
+    ; even (tx8x8)
+%if %2 <= 4
+    mova                m3, [%1+ 0*%3]      ; IN(0)
+    mova                m4, [%1+ 2*%3]      ; IN(2)
+
+    pmulhrsw            m3, [pw_11585x2]    ; t0-t3
+    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
+    pmulhrsw            m4, [pw_3196x2]             ; t4-5
+
+%if 0 ; overflows :(
+    paddw               m6, m7, m4
+    psubw               m5, m7, m4
+    pmulhrsw            m5, [pw_11585x2]            ; t5
+    pmulhrsw            m6, [pw_11585x2]            ; t6
+%else
+    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
+%endif
+
+    psubw               m0, m3, m7
+    paddw               m7, m3
+    psubw               m1, m3, m6
+    paddw               m6, m3
+    psubw               m2, m3, m5
+    paddw               m5, m3
+
+%if ARCH_X86_32
+    SWAP                 0, 7
+%endif
+    SCRATCH              7, 15, %4+12*%5
+%else
+    mova                m6, [%1+ 2*%3]      ; IN(2)
+    mova                m1, [%1+ 4*%3]      ; IN(4)
+    mova                m7, [%1+ 6*%3]      ; IN(6)
+%if %2 <= 8
+    pmulhrsw            m0, m1,  [pw_15137x2]       ; t3
+    pmulhrsw            m1, [pw_6270x2]             ; t2
+    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
+    pmulhrsw            m6, [pw_3196x2]             ; t4
+    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
+    pmulhrsw            m7, [pw_13623x2]            ; t6
+%else
+    mova                m4, [%1+10*%3]      ; IN(10)
+    mova                m0, [%1+12*%3]      ; IN(12)
+    mova                m5, [%1+14*%3]      ; IN(14)
+
+    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
+    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
+    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
+%endif
+
+    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
+    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6
+
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  6,  5, 2
+    pmulhrsw            m5, [pw_11585x2]                              ; t5
+    pmulhrsw            m6, [pw_11585x2]                              ; t6
+%else
+    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
+%endif
+
+    SCRATCH              5, 15, %4+10*%5
+    mova                m2, [%1+ 0*%3]      ; IN(0)
+%if %2 <= 8
+    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
+    psubw               m3, m2, m0
+    paddw               m0, m2
+
+    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
+%else
+    mova                m3, [%1+ 8*%3]      ; IN(8)
+
+    ; from 3 stages back
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  3,  2, 5
+    pmulhrsw            m3, [pw_11585x2]    ; t0
+    pmulhrsw            m2, [pw_11585x2]    ; t1
+%else
+    mova        [%1+ 0*%3], m0
+    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585,  11585, [pd_8192], 5, 0 ; t0, t1
+    mova                m0, [%1+ 0*%3]
+%endif
+
+    ; from 2 stages back
+    SUMSUB_BA            w,  0,  3, 5      ; t0,  t3
+
+    SUMSUB_BA            w,  7,  0, 5      ; t0,  t7
+%endif
+    UNSCRATCH            5, 15, %4+10*%5
+%if ARCH_X86_32
+    SWAP                 0, 7
+%endif
+    SCRATCH              7, 15, %4+12*%5
+    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2
+
+    ; from 1 stage back
+    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
+    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
+%endif
+    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4
+
+%if ARCH_X86_64
+    SWAP                 0, 8
+    SWAP                 1, 9
+    SWAP                 2, 10
+    SWAP                 3, 11
+    SWAP                 4, 12
+    SWAP                 5, 13
+    SWAP                 6, 14
+
+    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
+    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
+    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
+    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
+    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
+    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
+%else
+    SWAP                 1, 6
+    SWAP                 2, 5
+    SWAP                 3, 4
+    mova        [%4+14*%5], m6
+
+%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
+    mova                m6, [%4+%2*%5]
+    SUMSUB_BA            w,  6, %1, 7
+    SWAP                %1, 6
+    mova        [%4+%3*%5], m6
+%endmacro
+
+    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
+    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
+    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
+    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
+    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
+    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
+%endif
+%endmacro
+
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
+%if %2 == 1
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
+
+%if ARCH_X86_64
+    ; backup a different register
+    mova                m7, [tmpq+15*16]
+    mova      [tmpq+ 1*16], m15
+
+    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
+    mova        [tmpq+  0], m0
+    mova        [tmpq+ 32], m1
+    mova        [tmpq+ 64], m2
+    mova        [tmpq+ 96], m3
+    mova        [tmpq+128], m4
+    mova        [tmpq+160], m5
+    mova        [tmpq+192], m6
+    mova        [tmpq+224], m7
+
+    mova               m15, [tmpq+ 1*16]
+    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova        [tmpq+ 16], m8
+    mova        [tmpq+ 48], m9
+    mova        [tmpq+ 80], m10
+    mova        [tmpq+112], m11
+    mova        [tmpq+144], m12
+    mova        [tmpq+176], m13
+    mova        [tmpq+208], m14
+    mova        [tmpq+240], m15
+%else
+    mova                m6, [tmpq+13*16]
+    mova                m7, [tmpq+14*16]
+    SUMSUB_BA            w, 6, 7                ; t6, t9
+    mova      [tmpq+14*16], m6
+    mova      [tmpq+13*16], m7
+    mova                m7, [tmpq+15*16]
+    mova                m6, [tmpq+12*16]
+    SUMSUB_BA            w, 7, 6                ; t7, t8
+    mova      [tmpq+15*16], m6
+
+    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
+    mova     [tmpq+ 0*16], m0
+    mova     [tmpq+ 2*16], m1
+    mova     [tmpq+ 4*16], m2
+    mova     [tmpq+ 6*16], m3
+    mova     [tmpq+10*16], m5
+    mova     [tmpq+12*16], m6
+    mova     [tmpq+14*16], m7
+
+    mova                m0, [tmpq+15*16]
+    mova                m1, [tmpq+13*16]
+    mova                m2, [tmpq+11*16]
+    mova                m3, [tmpq+ 9*16]
+    mova                m4, [tmpq+ 7*16]
+    mova                m5, [tmpq+ 5*16]
+    mova                m7, [tmpq+ 1*16]
+    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
+    mova     [tmpq+ 1*16], m0
+    mova     [tmpq+ 3*16], m1
+    mova     [tmpq+ 5*16], m2
+    mova     [tmpq+ 7*16], m3
+    mova     [tmpq+11*16], m5
+    mova     [tmpq+13*16], m6
+    mova     [tmpq+15*16], m7
+%endif
+%else ; %2 == 2
+    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+    pxor                m7, m7
+%if ARCH_X86_64
+    ; backup more registers
+    mova        [%1+ 2*32], m8
+    mova        [%1+ 3*32], m9
+
+    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    ; restore from cache
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m7, [%1+15*32]
+    mova                m8, [%1+ 2*32]
+    mova                m9, [%1+ 3*32]
+
+    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
+    SUMSUB_BA            w,  7,  8, 3       ; t7, t8
+
+    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
+%else
+    mova      [tmpq+ 0*32], m5
+
+    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m5, [tmpq+ 0*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+13*32]
+    mova                m7, [tmpq+14*32]
+    mova                m5, [tmpq+15*32]
+    mova                m6, [tmpq+12*32]
+    SUMSUB_BADC w, 4, 7, 5, 6, 1
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+11*32]
+    mova                m5, [tmpq+ 9*32]
+    mova                m6, [tmpq+ 7*32]
+    mova                m7, [tmpq+ 5*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+ 3*32]
+    mova                m5, [tmpq+ 1*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+%endif
+
+%undef ROUND_REG
+%endif ; %2 == 1/2
+%endmacro
+
+%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+    mova               m%3, [dstq]
+    mova               m%5, [dstq+%7]
+    punpcklbw          m%2, m%3, m%6
+    punpckhbw          m%3, m%6
+    punpcklbw          m%4, m%5, m%6
+    punpckhbw          m%5, m%6
+    paddw              m%2, m%1
+    paddw              m%3, m%1
+    paddw              m%4, m%1
+    paddw              m%5, m%1
+    packuswb           m%2, m%3
+    packuswb           m%4, m%5
+    mova            [dstq], m%2
+    mova         [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+%if cpuflag(ssse3)
+    ; 2x2=eob=3, 4x4=eob=10
+    cmp eobd, 38
+    jg .idctfull
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct8x8
+%else
+    cmp eobd, 1 ; faster path for when only DC is set
+    jg .idctfull
+%endif
+
+    ; dc-only
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (32 << 14) + 8192
+    sar              coefd, 14 + 6
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, q0000
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_512]
+%endif
+    pxor                m5, m5
+    movd          [blockq], m5
+%rep 7
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    RET
+
+    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+%if cpuflag(ssse3)
+.idct8x8:
+    mov               tmpq, rsp
+    VP9_IDCT16_1D   blockq, 1, 8, 0
+
+    mov               cntd, 2
+    mov           dst_bakq, dstq
+.loop2_8x8:
+    VP9_IDCT16_1D     tmpq, 2, 8, 0
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 8, m0
+    RET
+%endif
+
+.idctfull:
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT16_1D   blockq, 1, 16, 0
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_IDCT16_1D     tmpq, 2, 16, 0
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM sse2
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
+%macro VP9_IDCT16_YMM_1D 0
+    VP9_UNPACK_MULSUB_2W_4X  1,  15, 16305,  1606, [pd_8192], 0, 4 ; t8,  t15
+    VP9_UNPACK_MULSUB_2W_4X  9,   7, 10394, 12665, [pd_8192], 0, 4 ; t9,  t14
+
+    SUMSUB_BA            w,  9,   1, 0      ; t8,  t9
+    SUMSUB_BA            w,  7,  15, 0      ; t15, t14
+
+    VP9_UNPACK_MULSUB_2W_4X 15,   1, 15137,  6270, [pd_8192], 0, 4 ; t9,  t14
+
+    VP9_UNPACK_MULSUB_2W_4X  5,  11, 14449,  7723, [pd_8192], 0, 4 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X 13,   3,  4756, 15679, [pd_8192], 0, 4 ; t11, t12
+
+    SUMSUB_BA            w,  5,  13, 0      ; t11, t10
+    SUMSUB_BA            w, 11,   3, 0      ; t12, t13
+
+    VP9_UNPACK_MULSUB_2W_4X  3,  13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
+
+    SUMSUB_BA            w,  5,   9, 0      ; t8,  t11
+    SUMSUB_BA            w,  3,  15, 0      ; t9,  t10
+    SUMSUB_BA            w, 11,   7, 0      ; t15, t12
+    SUMSUB_BA            w, 13,   1, 0      ; t14, t13
+
+    SUMSUB_BA            w, 15,   1, 0
+    SUMSUB_BA            w,  9,   7, 0
+    pmulhrsw            m1, [pw_11585x2]    ; t10
+    pmulhrsw            m7, [pw_11585x2]    ; t11
+    pmulhrsw            m9, [pw_11585x2]    ; t12
+    pmulhrsw           m15, [pw_11585x2]    ; t13
+
+    ; even (tx8x8)
+    mova                m4, [blockq+128]
+    mova      [blockq+128], m5
+    VP9_UNPACK_MULSUB_2W_4X   4,  12, 15137,  6270, [pd_8192], 0, 5 ; t2,  t3
+    VP9_UNPACK_MULSUB_2W_4X   2,  14, 16069,  3196, [pd_8192], 0, 5 ; t4,  t7
+    VP9_UNPACK_MULSUB_2W_4X  10,   6,  9102, 13623, [pd_8192], 0, 5 ; t5,  t6
+    mova                m0, [blockq+  0]
+    SUMSUB_BA            w,   8,   0, 5
+    pmulhrsw            m8, [pw_11585x2]    ; t0
+    pmulhrsw            m0, [pw_11585x2]    ; t1
+
+    SUMSUB_BA            w,  10,   2, 5     ; t4,  t5
+    SUMSUB_BA            w,   6,  14, 5     ; t7,  t6
+    SUMSUB_BA            w,  12,   8, 5     ; t0,  t3
+    SUMSUB_BA            w,   4,   0, 5     ; t1,  t2
+
+    SUMSUB_BA            w,   2,  14, 5
+    pmulhrsw           m14, [pw_11585x2]    ; t5
+    pmulhrsw            m2, [pw_11585x2]    ; t6
+
+    SUMSUB_BA            w,   6,  12, 5     ; t0,  t7
+    SUMSUB_BA            w,   2,   4, 5     ; t1,  t6
+    SUMSUB_BA            w,  14,   0, 5     ; t2,  t5
+    SUMSUB_BA            w,  10,   8, 5     ; t3,  t4
+
+    ; final stage
+    SUMSUB_BA            w, 11,  6,  5      ; out0, out15
+    SUMSUB_BA            w, 13,  2,  5      ; out1, out14
+    SUMSUB_BA            w, 15, 14,  5      ; out2, out13
+    SUMSUB_BA            w,  9, 10,  5      ; out3, out12
+    SUMSUB_BA            w,  7,  8,  5      ; out4, out11
+    SUMSUB_BA            w,  1,  0,  5      ; out5, out10
+    SUMSUB_BA            w,  3,  4,  5      ; out6, out9
+    mova                m5, [blockq+128]
+    mova      [blockq+192], m3
+    SUMSUB_BA            w,  5, 12,  3      ; out7, out8
+
+    SWAP  0, 11,  8, 12, 10
+    SWAP  1, 13, 14,  2, 15,  6,  3,  9,  4,  7,  5
+%endmacro
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+    mova              xm%2, [dstq]
+    mova              xm%4, [dstq+strideq*2]
+    vinserti128        m%2, m%2, [dstq+strideq], 1
+    vinserti128        m%4, m%4, [dstq+stride3q], 1
+    punpckhbw          m%3, m%2, m%6
+    punpcklbw          m%2, m%6
+    punpckhbw          m%5, m%4, m%6
+    punpcklbw          m%4, m%6
+    paddw              m%3, m%1
+    paddw              m%2, m%1
+    paddw              m%5, m%1
+    paddw              m%4, m%1
+    packuswb           m%2, m%3
+    packuswb           m%4, m%5
+    mova            [dstq], xm%2
+    mova        [dstq+strideq*2], xm%4
+    vextracti128  [dstq+strideq], m%2, 1
+    vextracti128 [dstq+stride3q], m%4, 1
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
+    cmp eobd, 1 ; faster path for when only DC is set
+    jg .idctfull
+
+    ; dc-only
+    mova                m1, [pw_11585x2]
+    vpbroadcastw        m0, [blockq]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+    pxor                m5, m5
+    pmulhrsw            m0, [pw_512]
+    movd          [blockq], xm5
+
+    DEFINE_ARGS dst, stride, stride3, cnt
+    mov               cntd, 4
+    lea           stride3q, [strideq*3]
+.loop_dc:
+    VP9_STORE_YMM_DC_4X  0, 1, 2, 3, 4, 5
+    lea               dstq, [dstq+4*strideq]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+    DEFINE_ARGS dst, stride, block, eob
+.idctfull:
+    mova                m1, [blockq+ 32]
+    mova                m2, [blockq+ 64]
+    mova                m3, [blockq+ 96]
+    mova                m5, [blockq+160]
+    mova                m6, [blockq+192]
+    mova                m7, [blockq+224]
+    mova                m8, [blockq+256]
+    mova                m9, [blockq+288]
+    mova               m10, [blockq+320]
+    mova               m11, [blockq+352]
+    mova               m12, [blockq+384]
+    mova               m13, [blockq+416]
+    mova               m14, [blockq+448]
+    mova               m15, [blockq+480]
+
+    VP9_IDCT16_YMM_1D
+    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+                         [blockq+192], [blockq+128], 1
+    mova      [blockq+  0], m0
+    VP9_IDCT16_YMM_1D
+
+    mova      [blockq+224], m7
+
+    ; store
+    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    mova                m6, [blockq+192]
+    mova                m7, [blockq+224]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    pxor                m0, m0
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+    mova                m0, [%1+ 0*32]  ; in0
+    mova                m1, [%1+15*32]  ; in15
+    mova                m2, [%1+ 7*32]  ; in7
+    mova                m3, [%1+ 8*32]  ; in8
+
+    VP9_UNPACK_MULSUB_2D_4X  1,  0,  4,  5, 16364,   804    ; m1/4=t1[d], m0/5=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  6, 11003, 12140    ; m2/7=t9[d], m3/6=t8[d]
+    SCRATCH              4, 8, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  0,  6,  5,  4, [pd_8192]   ; m3=t0[w], m0=t8[w]
+    UNSCRATCH            4, 8, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  1,  7,  4,  5, [pd_8192]   ; m2=t1[w], m1=t9[w]
+
+    SCRATCH              0, 10, tmpq+ 0*%%str
+    SCRATCH              1, 11, tmpq+15*%%str
+    mova   [tmpq+ 7*%%str], m2
+    mova   [tmpq+ 8*%%str], m3
+
+    mova                m1, [%1+ 2*32]  ; in2
+    mova                m0, [%1+13*32]  ; in13
+    mova                m3, [%1+ 5*32]  ; in5
+    mova                m2, [%1+10*32]  ; in10
+
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 15893,  3981    ; m0/6=t3[d], m1/7=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
+    SCRATCH              4, 12, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  1,  5,  7,  4, [pd_8192]   ; m2=t2[w], m1=t10[w]
+    UNSCRATCH            4, 12, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  0,  4,  6,  5, [pd_8192]   ; m3=t3[w], m0=t11[w]
+
+    SCRATCH              0, 12, tmpq+ 2*%%str
+    SCRATCH              1, 13, tmpq+13*%%str
+    mova   [tmpq+ 5*%%str], m2
+    mova   [tmpq+10*%%str], m3
+
+    mova                m2, [%1+ 4*32]  ; in4
+    mova                m3, [%1+11*32]  ; in11
+    mova                m0, [%1+ 3*32]  ; in3
+    mova                m1, [%1+12*32]  ; in12
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 14811,  7005    ; m3/7=t5[d], m2/6=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  5520, 15426    ; m0/4=t13[d], m1/5=t12[d]
+    SCRATCH              4, 9, tmpq+ 4*%%str
+    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t4[w], m2=t12[w]
+    UNSCRATCH            4, 9, tmpq+ 4*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t5[w], m3=t13[w]
+
+    SCRATCH              0,  8, tmpq+ 4*%%str
+    mova   [tmpq+11*%%str], m1          ; t4:m1->r11
+    UNSCRATCH            0, 10, tmpq+ 0*%%str
+    UNSCRATCH            1, 11, tmpq+15*%%str
+
+    ; round 2 interleaved part 1
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  5,  4,  3196, 16069    ; m3/5=t12[d], m2/4=t13[d]
+    SCRATCH              4, 9, tmpq+ 3*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  1,  5,  7,  4, [pd_8192]   ; m3=t8[w], m1=t12[w]
+    UNSCRATCH            4, 9, tmpq+ 3*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  0,  4,  6,  5, [pd_8192]   ; m2=t9[w], m0=t13[w]
+
+    SCRATCH              0, 10, tmpq+ 0*%%str
+    SCRATCH              1, 11, tmpq+15*%%str
+    SCRATCH              2, 14, tmpq+ 3*%%str
+    SCRATCH              3, 15, tmpq+12*%%str
+
+    mova                m2, [%1+ 6*32]  ; in6
+    mova                m3, [%1+ 9*32]  ; in9
+    mova                m0, [%1+ 1*32]  ; in1
+    mova                m1, [%1+14*32]  ; in14
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  2404, 16207    ; m0/4=t15[d], m1/5=t14[d]
+    SCRATCH              4, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t6[w], m2=t14[w]
+    UNSCRATCH            4, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t7[w], m3=t15[w]
+
+    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
+    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
+
+    UNSCRATCH            4, 12, tmpq+ 2*%%str
+    UNSCRATCH            5, 13, tmpq+13*%%str
+    SCRATCH              0, 12, tmpq+ 1*%%str
+    SCRATCH              1, 13, tmpq+14*%%str
+
+    ; remainder of round 2 (rest of t8-15)
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  6,  7,  9102, 13623    ; m5/6=t11[d], m4/7=t10[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  1,  0, 13623,  9102    ; m3/1=t14[d], m2/0=t15[d]
+    SCRATCH              0, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  4,  1,  7,  0, [pd_8192]   ; m3=t10[w], m4=t14[w]
+    UNSCRATCH            0, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  5,  0,  6,  1, [pd_8192]   ; m2=t11[w], m5=t15[w]
+
+    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
+
+    UNSCRATCH            6, 14, tmpq+ 3*%%str
+    UNSCRATCH            7, 15, tmpq+12*%%str
+
+    SUMSUB_BA                w,  3,  7,  1
+    PSIGNW                  m3, [pw_m1]                     ; m3=out1[w], m7=t10[w]
+    SUMSUB_BA                w,  2,  6,  1                  ; m2=out14[w], m6=t11[w]
+
+    ; unfortunately, the code below overflows in some cases, e.g.
+    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  7,  6,  1
+    pmulhrsw                m7, [pw_11585x2]                ; m7=out6[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out9[w]
+%else
+    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+    mova       [tmpq+ 3*%%str], m6
+    mova       [tmpq+ 6*%%str], m7
+    UNSCRATCH                6, 10, tmpq+ 0*%%str
+    UNSCRATCH                7, 11, tmpq+15*%%str
+    mova       [tmpq+13*%%str], m2
+    SCRATCH                  3, 11, tmpq+ 9*%%str
+
+    VP9_UNPACK_MULSUB_2D_4X  7,  6,  2,  3, 15137,  6270    ; m6/3=t13[d], m7/2=t12[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  1,  0,  6270, 15137    ; m5/1=t14[d], m4/0=t15[d]
+    SCRATCH              0, 9, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     5,  6,  1,  3,  0, [pd_8192]   ; m5=out2[w], m6=t14[w]
+    UNSCRATCH            0, 9, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
+    PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
+
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  7,  6,  1
+    pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
+%else
+    PSIGNW                  m7, [pw_m1]
+    VP9_UNPACK_MULSUB_2W_4X  7,  6, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
+
+    mova                    m2, [tmpq+ 8*%%str]
+    mova                    m3, [tmpq+ 7*%%str]
+    mova                    m1, [tmpq+11*%%str]
+    mova       [tmpq+ 7*%%str], m6
+    mova       [tmpq+11*%%str], m4
+    mova                    m4, [tmpq+ 5*%%str]
+    SCRATCH                  5, 14, tmpq+ 5*%%str
+    SCRATCH                  7, 15, tmpq+ 8*%%str
+    UNSCRATCH                6,  8, tmpq+ 4*%%str
+    UNSCRATCH                5, 12, tmpq+ 1*%%str
+    UNSCRATCH                7, 13, tmpq+14*%%str
+
+    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+    SUMSUB_BA                w,  1,  2, 0                   ; m1=t0[w], m2=t4[w]
+    mova                    m0, [tmpq+10*%%str]
+    SCRATCH                  1, 12, tmpq+ 1*%%str
+    SUMSUB_BA                w,  6,  3, 1                   ; m8=t1[w], m3=t5[w]
+    SCRATCH                  6, 13, tmpq+ 4*%%str
+    SUMSUB_BA                w,  7,  4, 1                   ; m13=t2[w], m9=t6[w]
+    SCRATCH                  7,  8, tmpq+10*%%str
+    SUMSUB_BA                w,  5,  0, 1                   ; m12=t3[w], m0=t7[w]
+    SCRATCH                  5,  9, tmpq+14*%%str
+
+    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  5, 15137,  6270    ; m2/6=t5[d], m3/10=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  4,  1,  6,  6270, 15137    ; m0/14=t6[d], m9/15=t7[d]
+    SCRATCH                  6, 10, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  1,  5,  6, [pd_8192]
+    UNSCRATCH                6, 10, tmpq+ 0*%%str
+    PSIGNW                  m0, [pw_m1]                     ; m0=out3[w], m3=t6[w]
+    VP9_RND_SH_SUMSUB_BA     4,  2,  6,  7,  5, [pd_8192]   ; m9=out12[w], m2=t7[w]
+
+    UNSCRATCH                1,  8, tmpq+10*%%str
+    UNSCRATCH                5,  9, tmpq+14*%%str
+    UNSCRATCH                6, 12, tmpq+ 1*%%str
+    UNSCRATCH                7, 13, tmpq+ 4*%%str
+    SCRATCH                  4,  9, tmpq+14*%%str
+
+    SUMSUB_BA                w,  1,  6,  4                  ; m13=out0[w], m1=t2[w]
+    SUMSUB_BA                w,  5,  7,  4
+    PSIGNW                  m5, [pw_m1]                     ; m12=out15[w], m8=t3[w]
+
+    ; unfortunately, the code below overflows in some cases, e.g.
+    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+%if 0 ; cpuflag(ssse3)
+    SUMSUB_BA               w,   7,  6,  4
+    pmulhrsw                m7, [pw_m11585x2]               ; m8=out7[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m1=out8[w]
+    SWAP                     6,  7
+    SUMSUB_BA                w,  3,  2,  4
+    pmulhrsw                m3, [pw_11585x2]                ; m3=out4[w]
+    pmulhrsw                m2, [pw_11585x2]                ; m2=out11[w]
+%else
+    SCRATCH                  5,  8, tmpq+10*%%str
+    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, m11585, [pd_8192],  5,  4
+    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192],  5,  4
+    UNSCRATCH                5,  8, tmpq+10*%%str
+%endif
+
+    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+%if %2 == 1
+%if ARCH_X86_64
+    mova                   m13, [tmpq+ 6*%%str]
+    TRANSPOSE8x8W            1, 11, 14, 0, 3, 15, 13, 6, 10
+    mova          [tmpq+ 0*16], m1
+    mova          [tmpq+ 2*16], m11
+    mova          [tmpq+ 4*16], m14
+    mova          [tmpq+ 6*16], m0
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                   m11, [tmpq+ 7*%%str]
+    mova                   m14, [tmpq+11*%%str]
+    mova                    m0, [tmpq+13*%%str]
+    mova          [tmpq+ 8*16], m3
+    mova          [tmpq+10*16], m15
+    mova          [tmpq+12*16], m13
+    mova          [tmpq+14*16], m6
+
+    TRANSPOSE8x8W            7, 1, 11, 2, 9, 14, 0, 5, 10
+    mova          [tmpq+ 1*16], m7
+    mova          [tmpq+ 3*16], m1
+    mova          [tmpq+ 5*16], m11
+    mova          [tmpq+ 7*16], m2
+    mova          [tmpq+ 9*16], m9
+    mova          [tmpq+11*16], m14
+    mova          [tmpq+13*16], m0
+    mova          [tmpq+15*16], m5
+%else
+    mova       [tmpq+12*%%str], m2
+    mova       [tmpq+ 1*%%str], m5
+    mova       [tmpq+15*%%str], m7
+    mova                    m2, [tmpq+ 9*%%str]
+    mova                    m5, [tmpq+ 5*%%str]
+    mova                    m7, [tmpq+ 8*%%str]
+    TRANSPOSE8x8W            1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+    mova          [tmpq+ 0*16], m1
+    mova          [tmpq+ 2*16], m2
+    mova          [tmpq+ 4*16], m5
+    mova          [tmpq+ 6*16], m0
+    mova          [tmpq+10*16], m7
+    mova                    m3, [tmpq+12*%%str]
+    mova          [tmpq+12*16], m4
+    mova                    m4, [tmpq+14*%%str]
+    mova          [tmpq+14*16], m6
+
+    mova                    m0, [tmpq+15*%%str]
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                    m2, [tmpq+ 7*%%str]
+    mova                    m5, [tmpq+11*%%str]
+    mova                    m7, [tmpq+ 1*%%str]
+    TRANSPOSE8x8W            0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
+    mova          [tmpq+ 1*16], m0
+    mova          [tmpq+ 3*16], m1
+    mova          [tmpq+ 5*16], m2
+    mova          [tmpq+ 7*16], m3
+    mova          [tmpq+11*16], m5
+    mova          [tmpq+13*16], m6
+    mova          [tmpq+15*16], m7
+%endif
+%else
+    pxor                    m4, m4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%if ARCH_X86_64
+    mova                   m12, [tmpq+ 6*%%str]
+    VP9_IDCT8_WRITEx2        1, 11, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       14,  0, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        3, 15, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       12,  6, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                   m11, [tmpq+ 7*%%str]
+    mova                   m14, [tmpq+11*%%str]
+    mova                    m0, [tmpq+13*%%str]
+
+    VP9_IDCT8_WRITEx2        7,  1, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       11,  2, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        9, 14, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        0,  5, 10,  8,  4, ROUND_REG, 6
+%else
+    mova       [tmpq+ 0*%%str], m2
+    mova       [tmpq+ 1*%%str], m5
+    mova       [tmpq+ 2*%%str], m7
+    mova                    m2, [tmpq+ 9*%%str]
+    VP9_IDCT8_WRITEx2        1,  2,  5,  7,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 5*%%str]
+    VP9_IDCT8_WRITEx2        5,  0,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 8*%%str]
+    VP9_IDCT8_WRITEx2        3,  5,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 6*%%str]
+    VP9_IDCT8_WRITEx2        5,  6,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m0, [tmpq+ 2*%%str]
+    mova                    m3, [tmpq+ 3*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+ 7*%%str]
+    mova                    m3, [tmpq+ 0*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+14*%%str]
+    mova                    m3, [tmpq+11*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+13*%%str]
+    mova                    m3, [tmpq+ 1*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+%endif
+
+    SWAP                     0,  4 ; zero
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_%2_1D       blockq, 1
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_%4_1D         tmpq, 2
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+IADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
+IADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
+IADST16_FN iadst, IADST16, iadst, IADST16, sse2
+IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
+; out: m[0-15] except m6, which is in [blockq+192]
+; uses blockq as scratch space
+%macro VP9_IADST16_YMM_1D 0
+    mova          [blockq+ 32], m3
+    mova          [blockq+ 64], m7
+    mova          [blockq+ 96], m8
+
+    ; first half of round 1
+    VP9_UNPACK_MULSUB_2D_4X  9,  6,  0,  3, 13160,  9760    ; m9/x=t7[d], m6/x=t6[d]
+    VP9_UNPACK_MULSUB_2D_4X  1, 14,  4,  7,  2404, 16207    ; m1/x=t15[d], m14/x=t14[d]
+    VP9_RND_SH_SUMSUB_BA    14,  6,  7,  3,  8, [pd_8192]   ; m14=t6[w], m6=t14[w]
+    VP9_RND_SH_SUMSUB_BA     1,  9,  4,  0,  8, [pd_8192]   ; m1=t7[w], m9=t15[w]
+
+    VP9_UNPACK_MULSUB_2D_4X 13,  2,  4,  7, 15893,  3981    ; m13/x=t3[d], m2/x=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  5, 10,  0,  3,  8423, 14053    ; m5/x=t11[d], m10/x=t10[d]
+    VP9_RND_SH_SUMSUB_BA    10,  2,  3,  7,  8, [pd_8192]   ; m10=t2[w], m2=t10[w]
+    VP9_RND_SH_SUMSUB_BA     5, 13,  0,  4,  8, [pd_8192]   ; m5=t3[w], m13=t11[w]
+
+    ; half of round 2 t8-15
+    VP9_UNPACK_MULSUB_2D_4X  2, 13,  4,  7,  9102, 13623    ; m2/x=t11[d], m13/x=t10[d]
+    VP9_UNPACK_MULSUB_2D_4X  9,  6,  3,  0, 13623,  9102    ; m9/x=t14[d], m6/x=t15[d]
+    VP9_RND_SH_SUMSUB_BA     9, 13,  3,  7,  8, [pd_8192]   ; m9=t10[w], m13=t14[w]
+    VP9_RND_SH_SUMSUB_BA     6,  2,  0,  4,  8, [pd_8192]   ; m6=t11[w], m2=t15[w]
+
+    SUMSUB_BA            w, 14, 10,  8                      ; m14=t2, m10=t6
+    SUMSUB_BA            w,  1,  5,  8                      ; m1=t3, m5=t7
+
+    mova                    m0, [blockq+  0]
+    mova                    m4, [blockq+128]
+    mova                    m3, [blockq+ 32]
+    mova                    m7, [blockq+ 64]
+    mova                    m8, [blockq+ 96]
+    mova          [blockq+  0], m1
+    mova          [blockq+128], m14
+    mova          [blockq+ 32], m6
+    mova          [blockq+ 64], m9
+    mova          [blockq+ 96], m10
+
+    ; second half of round 1
+    VP9_UNPACK_MULSUB_2D_4X 15,  0,  1,  9, 16364,   804    ; m15/x=t1[d], m0/x=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  7,  8, 10,  6, 11003, 12140    ; m7/x=t9[d], m8/x=t8[d]
+    VP9_RND_SH_SUMSUB_BA     8,  0,  6,  9, 14, [pd_8192]   ; m8=t0[w], m0=t8[w]
+    VP9_RND_SH_SUMSUB_BA     7, 15, 10,  1, 14, [pd_8192]   ; m7=t1[w], m15=t9[w]
+
+    VP9_UNPACK_MULSUB_2D_4X 11,  4, 10,  6, 14811,  7005    ; m11/x=t5[d], m4/x=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  3, 12,  1,  9,  5520, 15426    ; m3/x=t13[d], m12/x=t12[d]
+    VP9_RND_SH_SUMSUB_BA    12,  4,  9,  6, 14, [pd_8192]   ; m12=t4[w], m4=t12[w]
+    VP9_RND_SH_SUMSUB_BA     3, 11,  1, 10, 14, [pd_8192]   ; m3=t5[w], m11=t13[w]
+
+    ; second half of round 2 t8-15
+    VP9_UNPACK_MULSUB_2D_4X  0, 15,  6, 10, 16069,  3196    ; m15/x=t8[d], m0/x=t9[d]
+    VP9_UNPACK_MULSUB_2D_4X 11,  4,  9,  1,  3196, 16069    ; m11/x=t12[d], m4/x=t13[d]
+    VP9_RND_SH_SUMSUB_BA    11, 15,  9, 10, 14, [pd_8192]   ; m11=t8[w], m15=t12[w]
+    VP9_RND_SH_SUMSUB_BA     4,  0,  1,  6, 14, [pd_8192]   ; m4=t9[w], m0=t13[w]
+
+    SUMSUB_BA            w, 12,  8, 14                      ; m12=t0, m8=t4
+    SUMSUB_BA            w,  3,  7, 14                      ; m3=t1, m7=t5
+
+    mova                   m10, [blockq+ 96]
+    mova          [blockq+ 96], m12
+
+    ; round 3
+    VP9_UNPACK_MULSUB_2D_4X 15,  0,  9, 12, 15137,  6270    ; m15/x=t13[d], m0/x=t12[d]
+    VP9_UNPACK_MULSUB_2D_4X  2, 13,  1,  6,  6270, 15137    ; m2/x=t14[d], m13/x=t15[d]
+    VP9_RND_SH_SUMSUB_BA     2,  0,  1, 12, 14, [pd_8192]   ; m2=out2[w], m0=t14a[w]
+    VP9_RND_SH_SUMSUB_BA    13, 15,  6,  9, 14, [pd_8192]
+    PSIGNW                 m13, [pw_m1]                     ; m13=out13[w], m15=t15a[w]
+
+    VP9_UNPACK_MULSUB_2D_4X  8,  7, 12,  9, 15137,  6270    ; m8/x=t5[d], m7/x=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  5, 10,  1,  6,  6270, 15137    ; m5/x=t6[d], m10/x=t7[d]
+    VP9_RND_SH_SUMSUB_BA     5,  7,  1,  9, 14, [pd_8192]
+    PSIGNW                  m5, [pw_m1]                     ; m5=out3[w], m7=t6[w]
+    VP9_RND_SH_SUMSUB_BA    10,  8,  6, 12, 14, [pd_8192]   ; m10=out12[w], m8=t7[w]
+
+    mova                    m1, [blockq+  0]
+    mova                   m14, [blockq+128]
+    mova                    m6, [blockq+ 32]
+    mova                    m9, [blockq+ 64]
+    mova                   m12, [blockq+ 96]
+    mova          [blockq+  0], m10
+    mova          [blockq+128], m5
+
+    SUMSUB_BA            w, 14, 12,  5                      ; m14=out0, m12=t2a
+    SUMSUB_BA            w,  1,  3,  5
+    PSIGNW                  m1, [pw_m1]                     ; m1=out15, m3=t3a
+
+    SUMSUB_BA            w,  9, 11,  5
+    PSIGNW                  m9, [pw_m1]                     ; m9=out1, m11=t10
+    SUMSUB_BA            w,  6,  4,  5                      ; m6=out14, m4=t11
+
+    VP9_UNPACK_MULSUB_2W_4X  4, 11, 11585, 11585, [pd_8192],  5, 10 ; m4=out9, m11=out6
+    mova                    m5, [blockq+128]
+    mova          [blockq+192], m11
+    PSIGNW                 m15, [pw_m1]
+    VP9_UNPACK_MULSUB_2W_4X 15,  0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
+
+    PSIGNW                  m3, [pw_m1]
+    VP9_UNPACK_MULSUB_2W_4X  3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
+    VP9_UNPACK_MULSUB_2W_4X  8,  7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
+
+    mova                   m10, [blockq+  0]
+
+    SWAP                     0, 14,  6, 11,  8, 12, 10
+    SWAP                     1,  9, 15,  4,  7,  3,  5
+    SWAP                     5,  9, 15
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+%macro IADST16_YMM_FN 4
+INIT_YMM avx2
+cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
+    mova                m1, [blockq+ 32]
+    mova                m2, [blockq+ 64]
+    mova                m3, [blockq+ 96]
+    mova                m5, [blockq+160]
+    mova                m6, [blockq+192]
+    mova                m7, [blockq+224]
+    mova                m8, [blockq+256]
+    mova                m9, [blockq+288]
+    mova               m10, [blockq+320]
+    mova               m11, [blockq+352]
+    mova               m12, [blockq+384]
+    mova               m13, [blockq+416]
+    mova               m14, [blockq+448]
+    mova               m15, [blockq+480]
+
+    VP9_%2_YMM_1D
+    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+                         [blockq+192], [blockq+128], 1
+    mova      [blockq+  0], m0
+    VP9_%4_YMM_1D
+
+    mova      [blockq+224], m7
+
+    ; store
+    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    mova                m6, [blockq+192]
+    mova                m7, [blockq+224]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
+    lea               dstq, [dstq+2*strideq]
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    pxor                m0, m0
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+IADST16_YMM_FN idct,  IDCT16,  iadst, IADST16
+IADST16_YMM_FN iadst, IADST16, idct,  IDCT16
+IADST16_YMM_FN iadst, IADST16, iadst, IADST16
+%endif
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%if %2 == 1
+%assign %%str mmsize
+%else
+%assign %%str 64
+%endif
+
+    ; first do t0-15, this can be done identical to idct16x16
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
+
+    ; store everything on stack to make space available for t16-31
+    ; we store interleaved with the output of the second half (t16-31)
+    ; so we don't need to allocate extra stack space
+    mova    [tmpq+ 0*%%str], m0     ; t0
+    mova    [tmpq+ 4*%%str], m1     ; t1
+    mova    [tmpq+ 8*%%str], m2     ; t2
+    mova    [tmpq+12*%%str], m3     ; t3
+    mova    [tmpq+16*%%str], m4     ; t4
+    mova    [tmpq+20*%%str], m5     ; t5
+%if ARCH_X86_64
+    mova    [tmpq+22*%%str], m10    ; t10
+    mova    [tmpq+18*%%str], m11    ; t11
+    mova    [tmpq+14*%%str], m12    ; t12
+    mova    [tmpq+10*%%str], m13    ; t13
+    mova    [tmpq+ 6*%%str], m14    ; t14
+    mova    [tmpq+ 2*%%str], m15    ; t15
+%endif
+
+    mova                m0, [tmpq+ 30*%%str]
+    UNSCRATCH            1,  6, tmpq+26*%%str
+    UNSCRATCH            2,  8, tmpq+24*%%str
+    UNSCRATCH            3,  9, tmpq+28*%%str
+    SUMSUB_BA            w,  1,  3, 4       ; t6, t9
+    SUMSUB_BA            w,  0,  2, 4       ; t7, t8
+
+    mova    [tmpq+24*%%str], m1     ; t6
+    mova    [tmpq+28*%%str], m0     ; t7
+    mova    [tmpq+30*%%str], m2     ; t8
+    mova    [tmpq+26*%%str], m3     ; t9
+
+    ; then, secondly, do t16-31
+%if %3 <= 8
+    mova                 m4, [%1+ 1*64]
+    mova                 m7, [%1+ 7*64]
+
+    pmulhrsw             m1,  m4, [pw_16364x2] ;t31
+    pmulhrsw             m4, [pw_804x2] ;t16
+
+    VP9_UNPACK_MULSUB_2W_4X   5,  0,  1,  4, 16069,  3196, [pd_8192], 6,  2 ; t17, t30
+
+    pmulhrsw             m3,  m7, [pw_m5520x2] ;t19
+    pmulhrsw             m7, [pw_15426x2] ;t28
+
+    SCRATCH               4, 13, tmpq+ 1*%%str
+    SCRATCH               5, 12, tmpq+15*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   2,  6,  7,  3, 3196, m16069, [pd_8192], 4,  5 ; t18, t29
+%else
+    mova                 m0, [%1+ 1*64]
+    mova                 m1, [%1+15*64]
+%if %3 <= 16
+    pmulhrsw             m5, m0, [pw_16364x2]
+    pmulhrsw             m0, [pw_804x2]
+    pmulhrsw             m4, m1, [pw_m11003x2]
+    pmulhrsw             m1, [pw_12140x2]
+%else
+    mova                 m4, [%1+17*64]
+    mova                 m5, [%1+31*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   0,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
+    VP9_UNPACK_MULSUB_2W_4X   4,  1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+%endif
+    SUMSUB_BA             w,  4,  0,  2
+    SUMSUB_BA             w,  1,  5,  2
+
+    VP9_UNPACK_MULSUB_2W_4X   5,  0, 16069,  3196, [pd_8192], 2, 3 ; t17, t30
+
+    SCRATCH               4, 13, tmpq+ 1*%%str
+    SCRATCH               5, 12, tmpq+15*%%str
+
+    mova                 m2, [%1+ 7*64]
+    mova                 m3, [%1+ 9*64]
+%if %3 <= 16
+    pmulhrsw             m7,  m3, [pw_14811x2]
+    pmulhrsw             m3, [pw_7005x2]
+    pmulhrsw             m6,  m2, [pw_m5520x2]
+    pmulhrsw             m2, [pw_15426x2]
+%else
+    mova                 m7, [%1+23*64]
+    mova                 m6, [%1+25*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   3,  7, 14811,  7005, [pd_8192], 4, 5 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   6,  2,  5520, 15426, [pd_8192], 4, 5 ; t19, t28
+%endif
+    SUMSUB_BA             w,  3,  6,  4
+    SUMSUB_BA             w,  7,  2,  4
+
+    VP9_UNPACK_MULSUB_2W_4X   2,  6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%endif
+
+    UNSCRATCH             5, 12, tmpq+15*%%str
+    SUMSUB_BA             w,  6,  0,  4
+    mova    [tmpq+25*%%str], m6             ; t19
+    UNSCRATCH             4, 13, tmpq+ 1*%%str
+    SUMSUB_BA             w,  7,  1,  6
+    SUMSUB_BA             w,  3,  4,  6
+    mova    [tmpq+23*%%str], m3             ; t16
+    SUMSUB_BA             w,  2,  5,  6
+
+    VP9_UNPACK_MULSUB_2W_4X   0,  5, 15137,  6270, [pd_8192], 6, 3 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   1,  4, 15137,  6270, [pd_8192], 6, 3 ; t19, t28
+
+    SCRATCH               0, 10, tmpq+ 1*%%str
+    SCRATCH               1, 11, tmpq+ 7*%%str
+    SCRATCH               2,  9, tmpq+ 9*%%str
+    SCRATCH               4, 14, tmpq+15*%%str
+    SCRATCH               5, 15, tmpq+17*%%str
+    SCRATCH               7, 13, tmpq+31*%%str
+
+%if %3 <= 8
+    mova                 m0, [%1+ 5*64]
+    mova                 m3, [%1+ 3*64]
+
+    pmulhrsw             m5,  m0, [pw_15893x2] ;t27
+    pmulhrsw             m0, [pw_3981x2] ;t20
+
+    VP9_UNPACK_MULSUB_2W_4X   1,  4,  5,  0,  9102, 13623, [pd_8192], 7,  2 ; t21, t26
+
+    pmulhrsw             m6,  m3, [pw_m2404x2] ;t23
+    pmulhrsw             m3, [pw_16207x2] ;t24
+
+    SCRATCH               5,  8, tmpq+ 5*%%str
+    SCRATCH               4, 12, tmpq+11*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   7,  2,  3,  6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%else
+    mova                 m4, [%1+ 5*64]
+    mova                 m5, [%1+11*64]
+%if %3 <= 16
+    pmulhrsw             m1, m4, [pw_15893x2]
+    pmulhrsw             m4, [pw_3981x2]
+    pmulhrsw             m0, m5, [pw_m8423x2]
+    pmulhrsw             m5, [pw_14053x2]
+%else
+    mova                 m0, [%1+21*64]
+    mova                 m1, [%1+27*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   4,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   0,  5,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+    SUMSUB_BA             w,  0,  4,  2
+    SUMSUB_BA             w,  5,  1,  2
+
+    VP9_UNPACK_MULSUB_2W_4X   1,  4,  9102, 13623, [pd_8192], 2, 3 ; t21, t26
+
+    SCRATCH               5,  8, tmpq+ 5*%%str
+    SCRATCH               4, 12, tmpq+11*%%str
+
+    mova                 m7, [%1+ 3*64]
+    mova                 m6, [%1+13*64]
+%if %3 <= 16
+    pmulhrsw             m3, m6, [pw_13160x2]
+    pmulhrsw             m6, [pw_9760x2]
+    pmulhrsw             m2, m7, [pw_m2404x2]
+    pmulhrsw             m7, [pw_16207x2]
+%else
+    mova                 m2, [%1+29*64]
+    mova                 m3, [%1+19*64]
+    VP9_UNPACK_MULSUB_2W_4X   6,  3, 13160,  9760, [pd_8192], 4, 5 ; t22, t25
+    VP9_UNPACK_MULSUB_2W_4X   2,  7,  2404, 16207, [pd_8192], 4, 5 ; t23, t24
+%endif
+    SUMSUB_BA             w,  6,  2,  4
+    SUMSUB_BA             w,  3,  7,  4
+
+    VP9_UNPACK_MULSUB_2W_4X   7,  2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%endif
+
+    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+    UNSCRATCH             4, 12, tmpq+11*%%str
+    SUMSUB_BA             w,  0,  6, 5
+    SUMSUB_BA             w,  4,  2, 5
+    UNSCRATCH             5,  8, tmpq+ 5*%%str
+    SCRATCH               4,  8, tmpq+11*%%str
+    SUMSUB_BA             w,  1,  7, 4
+    SUMSUB_BA             w,  5,  3, 4
+    SCRATCH               5, 12, tmpq+ 5*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   3,  6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   2,  7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
+
+    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+    UNSCRATCH             5,  9, tmpq+ 9*%%str
+    mova                 m4, [tmpq+23*%%str] ; t16
+%if ARCH_X86_64
+    SUMSUB_BA             w,  1,  5,  9
+    SUMSUB_BA             w,  0,  4,  9
+%else
+    SUMSUB_BADC           w,  1,  5,  0,  4
+%endif
+    mova    [tmpq+29*%%str], m1     ; t17
+    mova    [tmpq+21*%%str], m0     ; t16
+    UNSCRATCH             0, 10, tmpq+ 1*%%str
+    UNSCRATCH             1, 11, tmpq+ 7*%%str
+%if ARCH_X86_64
+    SUMSUB_BA             w,  2,  0,  9
+    SUMSUB_BA             w,  3,  1,  9
+%else
+    SUMSUB_BADC           w,  2,  0,  3,  1
+%endif
+    mova    [tmpq+ 9*%%str], m2     ; t18
+    mova    [tmpq+13*%%str], m3     ; t19
+    SCRATCH               0, 10, tmpq+23*%%str
+    SCRATCH               1, 11, tmpq+27*%%str
+
+    UNSCRATCH             2, 14, tmpq+15*%%str
+    UNSCRATCH             3, 15, tmpq+17*%%str
+    SUMSUB_BA             w,  6,  2, 0
+    SUMSUB_BA             w,  7,  3, 0
+    SCRATCH               6, 14, tmpq+ 3*%%str
+    SCRATCH               7, 15, tmpq+ 7*%%str
+
+    UNSCRATCH             0,  8, tmpq+11*%%str
+    mova                 m1, [tmpq+25*%%str] ; t19
+    UNSCRATCH             6, 12, tmpq+ 5*%%str
+    UNSCRATCH             7, 13, tmpq+31*%%str
+%if ARCH_X86_64
+    SUMSUB_BA             w,  0,  1,  9
+    SUMSUB_BA             w,  6,  7,  9
+%else
+    SUMSUB_BADC           w,  0,  1,  6,  7
+%endif
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+%if 0; cpuflag(ssse3)
+%if ARCH_X86_64
+    SUMSUB_BA             w,  4,  7,  8
+    SUMSUB_BA             w,  5,  1,  8
+%else
+    SUMSUB_BADC           w,  4,  7,  5,  1
+%endif
+
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m4, [pw_11585x2]
+    pmulhrsw             m1, [pw_11585x2]
+    pmulhrsw             m5, [pw_11585x2]
+
+    mova    [tmpq+ 5*%%str], m7     ; t23
+    SCRATCH               1, 13, tmpq+25*%%str
+    UNSCRATCH             7, 10, tmpq+23*%%str
+    UNSCRATCH             1, 11, tmpq+27*%%str
+
+%if ARCH_X86_64
+    SUMSUB_BA             w,  7,  3, 10
+    SUMSUB_BA             w,  1,  2, 10
+%else
+    SUMSUB_BADC           w,  7,  3,  1,  2
+%endif
+
+    pmulhrsw             m3, [pw_11585x2]
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m2, [pw_11585x2]
+    pmulhrsw             m1, [pw_11585x2]
+%else
+    SCRATCH               0,  8, tmpq+15*%%str
+    SCRATCH               6,  9, tmpq+17*%%str
+    VP9_UNPACK_MULSUB_2W_4X  7,  4, 11585, 11585, [pd_8192], 0, 6
+    mova    [tmpq+ 5*%%str], m7     ; t23
+    UNSCRATCH             7, 10, tmpq+23*%%str
+    VP9_UNPACK_MULSUB_2W_4X  1,  5, 11585, 11585, [pd_8192], 0, 6
+    SCRATCH               1, 13, tmpq+25*%%str
+    UNSCRATCH             1, 11, tmpq+27*%%str
+    VP9_UNPACK_MULSUB_2W_4X  3,  7, 11585, 11585, [pd_8192], 0, 6
+    VP9_UNPACK_MULSUB_2W_4X  2,  1, 11585, 11585, [pd_8192], 0, 6
+    UNSCRATCH             0,  8, tmpq+15*%%str
+    UNSCRATCH             6,  9, tmpq+17*%%str
+%endif
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+    ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+    mova    [tmpq+17*%%str], m2     ; t20
+    mova    [tmpq+ 1*%%str], m3     ; t21
+%if ARCH_X86_64
+    mova    [tmpq+25*%%str], m13    ; t22
+
+    mova                 m8, [tmpq+ 0*%%str] ; t0
+    mova                 m9, [tmpq+ 4*%%str] ; t1
+    mova                m12, [tmpq+ 8*%%str] ; t2
+    mova                m11, [tmpq+12*%%str] ; t3
+    mova                 m2, [tmpq+16*%%str] ; t4
+    mova                 m3, [tmpq+20*%%str] ; t5
+    mova                m13, [tmpq+24*%%str] ; t6
+
+    SUMSUB_BA             w,  6,  8, 10
+    mova    [tmpq+ 3*%%str], m8              ; t15
+    SUMSUB_BA             w,  0,  9,  8
+    SUMSUB_BA             w, 15, 12,  8
+    SUMSUB_BA             w, 14, 11,  8
+    SUMSUB_BA             w,  1,  2,  8
+    SUMSUB_BA             w,  7,  3,  8
+    SUMSUB_BA             w,  5, 13,  8
+    mova                m10, [tmpq+28*%%str] ; t7
+    SUMSUB_BA             w,  4, 10,  8
+%if cpuflag(avx2)
+    ; the "shitty" about this idct is that the final pass does the outermost
+    ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need
+    ; to be sequential, which means I need to load/store half of the sumsub
+    ; intermediates back to/from memory to get a 16x16 transpose going...
+    ; This would be easier if we had more (e.g. 32) YMM regs here.
+    mova    [tmpq+ 7*%%str], m9
+    mova    [tmpq+11*%%str], m12
+    mova    [tmpq+15*%%str], m11
+    mova    [tmpq+19*%%str], m2
+    mova    [tmpq+23*%%str], m3
+    mova    [tmpq+27*%%str], m13
+    mova    [tmpq+31*%%str], m10
+    mova    [tmpq+12*%%str], m5
+
+    mova                m13, [tmpq+30*%%str] ; t8
+    mova                m12, [tmpq+26*%%str] ; t9
+    mova                m11, [tmpq+22*%%str] ; t10
+    mova                m10, [tmpq+18*%%str] ; t11
+    mova                 m9, [tmpq+17*%%str] ; t20
+    mova                 m8, [tmpq+ 1*%%str] ; t21
+    mova                 m3, [tmpq+25*%%str] ; t22
+    mova                 m2, [tmpq+ 5*%%str] ; t23
+
+    SUMSUB_BA             w,  9, 10, 5
+    SUMSUB_BA             w,  8, 11, 5
+    SUMSUB_BA             w,  3, 12, 5
+    SUMSUB_BA             w,  2, 13, 5
+    mova    [tmpq+ 1*%%str], m10
+    mova    [tmpq+ 5*%%str], m11
+    mova    [tmpq+17*%%str], m12
+    mova    [tmpq+25*%%str], m13
+
+    mova                m13, [tmpq+14*%%str] ; t12
+    mova                m12, [tmpq+10*%%str] ; t13
+    mova                m11, [tmpq+ 9*%%str] ; t18
+    mova                m10, [tmpq+13*%%str] ; t19
+
+    SUMSUB_BA             w, 11, 12, 5
+    SUMSUB_BA             w, 10, 13, 5
+    mova    [tmpq+ 9*%%str], m13
+    mova    [tmpq+13*%%str], m12
+    mova    [tmpq+10*%%str], m10
+    mova    [tmpq+14*%%str], m11
+
+    mova                m13, [tmpq+ 6*%%str] ; t14
+    mova                m12, [tmpq+ 2*%%str] ; t15
+    mova                m11, [tmpq+21*%%str] ; t16
+    mova                m10, [tmpq+29*%%str] ; t17
+    SUMSUB_BA             w, 11, 12, 5
+    SUMSUB_BA             w, 10, 13, 5
+    mova    [tmpq+21*%%str], m12
+    mova    [tmpq+29*%%str], m13
+    mova                m12, [tmpq+10*%%str]
+    mova                m13, [tmpq+14*%%str]
+
+    TRANSPOSE16x16W       6,  0, 15, 14,  1,  7,  5,  4, \
+                          2,  3,  8,  9, 12, 13, 10, 11, \
+            [tmpq+12*%%str], [tmpq+ 8*%%str], 1
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+ 4*%%str], m15
+    mova    [tmpq+ 6*%%str], m14
+    mova    [tmpq+10*%%str], m7
+    mova    [tmpq+12*%%str], m5
+    mova    [tmpq+14*%%str], m4
+    mova    [tmpq+16*%%str], m2
+    mova    [tmpq+18*%%str], m3
+    mova    [tmpq+20*%%str], m8
+    mova    [tmpq+22*%%str], m9
+    mova    [tmpq+24*%%str], m12
+    mova    [tmpq+26*%%str], m13
+    mova    [tmpq+28*%%str], m10
+    mova    [tmpq+30*%%str], m11
+
+    mova                 m0, [tmpq+21*%%str]
+    mova                 m1, [tmpq+29*%%str]
+    mova                 m2, [tmpq+13*%%str]
+    mova                 m3, [tmpq+ 9*%%str]
+    mova                 m4, [tmpq+ 1*%%str]
+    mova                 m5, [tmpq+ 5*%%str]
+    mova                 m7, [tmpq+25*%%str]
+    mova                 m8, [tmpq+31*%%str]
+    mova                 m9, [tmpq+27*%%str]
+    mova                m10, [tmpq+23*%%str]
+    mova                m11, [tmpq+19*%%str]
+    mova                m12, [tmpq+15*%%str]
+    mova                m13, [tmpq+11*%%str]
+    mova                m14, [tmpq+ 7*%%str]
+    mova                m15, [tmpq+ 3*%%str]
+    TRANSPOSE16x16W       0,  1,  2,  3,  4,  5,  6,  7, \
+                          8,  9, 10, 11, 12, 13, 14, 15, \
+            [tmpq+17*%%str], [tmpq+ 9*%%str], 1
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 3*%%str], m1
+    mova    [tmpq+ 5*%%str], m2
+    mova    [tmpq+ 7*%%str], m3
+    mova    [tmpq+11*%%str], m5
+    mova    [tmpq+13*%%str], m6
+    mova    [tmpq+15*%%str], m7
+    mova    [tmpq+17*%%str], m8
+    mova    [tmpq+19*%%str], m9
+    mova    [tmpq+21*%%str], m10
+    mova    [tmpq+23*%%str], m11
+    mova    [tmpq+25*%%str], m12
+    mova    [tmpq+27*%%str], m13
+    mova    [tmpq+29*%%str], m14
+    mova    [tmpq+31*%%str], m15
+%else ; !avx2
+    TRANSPOSE8x8W         6, 0, 15, 14, 1, 7, 5, 4, 8
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 4*%%str], m0
+    mova    [tmpq+ 8*%%str], m15
+    mova    [tmpq+12*%%str], m14
+    mova    [tmpq+16*%%str], m1
+    mova    [tmpq+20*%%str], m7
+    mova    [tmpq+24*%%str], m5
+    mova    [tmpq+28*%%str], m4
+
+    mova                  m8, [tmpq+ 3*%%str] ; t15
+    TRANSPOSE8x8W         10, 13, 3, 2, 11, 12, 9, 8, 0
+    mova    [tmpq+ 3*%%str], m10
+    mova    [tmpq+ 7*%%str], m13
+    mova    [tmpq+11*%%str], m3
+    mova    [tmpq+15*%%str], m2
+    mova    [tmpq+19*%%str], m11
+    mova    [tmpq+23*%%str], m12
+    mova    [tmpq+27*%%str], m9
+    mova    [tmpq+31*%%str], m8
+
+    mova                m15, [tmpq+30*%%str] ; t8
+    mova                m14, [tmpq+26*%%str] ; t9
+    mova                m13, [tmpq+22*%%str] ; t10
+    mova                m12, [tmpq+18*%%str] ; t11
+    mova                m11, [tmpq+14*%%str] ; t12
+    mova                m10, [tmpq+10*%%str] ; t13
+    mova                 m9, [tmpq+ 6*%%str] ; t14
+    mova                 m8, [tmpq+ 2*%%str] ; t15
+    mova                 m7, [tmpq+21*%%str] ; t16
+    mova                 m6, [tmpq+29*%%str] ; t17
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+ 1*%%str] ; t21
+    mova                 m1, [tmpq+25*%%str] ; t22
+
+    SUMSUB_BA             w,  7,  8, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova                 m0, [tmpq+ 5*%%str] ; t23
+    SUMSUB_BA             w,  6,  9, 8
+    SUMSUB_BA             w,  5, 10, 8
+    SUMSUB_BA             w,  4, 11, 8
+    SUMSUB_BA             w,  3, 12, 8
+    SUMSUB_BA             w,  2, 13, 8
+    SUMSUB_BA             w,  1, 14, 8
+    SUMSUB_BA             w,  0, 15, 8
+
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+17*%%str], m4
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m8, [tmpq+ 2*%%str]
+    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova    [tmpq+ 6*%%str], m9
+    mova    [tmpq+10*%%str], m10
+    mova    [tmpq+14*%%str], m11
+    mova    [tmpq+18*%%str], m12
+    mova    [tmpq+22*%%str], m13
+    mova    [tmpq+26*%%str], m14
+    mova    [tmpq+30*%%str], m15
+%endif ; avx2
+%else
+    mova                 m2, [tmpq+24*%%str] ; t6
+    mova                 m3, [tmpq+28*%%str] ; t7
+    SUMSUB_BADC           w,  5,  2,  4,  3
+    mova    [tmpq+24*%%str], m5
+    mova    [tmpq+23*%%str], m2
+    mova    [tmpq+28*%%str], m4
+    mova    [tmpq+19*%%str], m3
+
+    mova                 m2, [tmpq+16*%%str] ; t4
+    mova                 m3, [tmpq+20*%%str] ; t5
+    SUMSUB_BA             w,  1,  2,  5
+    SUMSUB_BA             w,  7,  3,  5
+    mova    [tmpq+15*%%str], m2
+    mova    [tmpq+11*%%str], m3
+
+    mova                 m2, [tmpq+ 0*%%str] ; t0
+    mova                 m3, [tmpq+ 4*%%str] ; t1
+    SUMSUB_BA             w,  6,  2,  5
+    SUMSUB_BA             w,  0,  3,  5
+    mova    [tmpq+31*%%str], m2
+    mova    [tmpq+27*%%str], m3
+
+    mova                 m2, [tmpq+ 8*%%str] ; t2
+    mova                 m3, [tmpq+12*%%str] ; t3
+    mova                 m5, [tmpq+ 7*%%str]
+    mova                 m4, [tmpq+ 3*%%str]
+    SUMSUB_BADC           w,  5,  2,  4,  3
+    mova    [tmpq+ 7*%%str], m2
+    mova    [tmpq+ 3*%%str], m3
+
+    mova                 m3, [tmpq+28*%%str]
+    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 4*%%str], m0
+    mova    [tmpq+ 8*%%str], m5
+    mova    [tmpq+12*%%str], m4
+    mova    [tmpq+20*%%str], m7
+    mova    [tmpq+24*%%str], m2
+    mova    [tmpq+28*%%str], m3
+
+    mova                 m6, [tmpq+19*%%str]
+    mova                 m0, [tmpq+23*%%str]
+    mova                 m5, [tmpq+11*%%str]
+    mova                 m4, [tmpq+15*%%str]
+    mova                 m1, [tmpq+ 3*%%str]
+    mova                 m7, [tmpq+ 7*%%str]
+    mova                 m3, [tmpq+31*%%str]
+    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
+    mova    [tmpq+ 3*%%str], m6
+    mova    [tmpq+ 7*%%str], m0
+    mova    [tmpq+11*%%str], m5
+    mova    [tmpq+15*%%str], m4
+    mova    [tmpq+23*%%str], m7
+    mova    [tmpq+27*%%str], m2
+    mova    [tmpq+31*%%str], m3
+
+    mova                 m1, [tmpq+ 6*%%str] ; t14
+    mova                 m0, [tmpq+ 2*%%str] ; t15
+    mova                 m7, [tmpq+21*%%str] ; t16
+    mova                 m6, [tmpq+29*%%str] ; t17
+    SUMSUB_BA             w,  7,  0,  2
+    SUMSUB_BA             w,  6,  1,  2
+    mova    [tmpq+29*%%str], m7
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+21*%%str], m6
+    mova    [tmpq+ 6*%%str], m1
+
+    mova                 m1, [tmpq+14*%%str] ; t12
+    mova                 m0, [tmpq+10*%%str] ; t13
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    SUMSUB_BA             w,  5,  0,  2
+    SUMSUB_BA             w,  4,  1,  2
+    mova     [tmpq+10*%%str], m0
+    mova     [tmpq+14*%%str], m1
+
+    mova                 m1, [tmpq+22*%%str] ; t10
+    mova                 m0, [tmpq+18*%%str] ; t11
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+ 1*%%str] ; t21
+    SUMSUB_BA             w,  3,  0,  6
+    SUMSUB_BA             w,  2,  1,  6
+    mova     [tmpq+18*%%str], m0
+    mova     [tmpq+22*%%str], m1
+
+    mova                 m7, [tmpq+30*%%str] ; t8
+    mova                 m6, [tmpq+26*%%str] ; t9
+    mova                 m1, [tmpq+25*%%str] ; t22
+    mova                 m0, [tmpq+ 5*%%str] ; t23
+    SUMSUB_BADC           w,  1,  6,  0,  7
+    mova     [tmpq+26*%%str], m6
+    mova     [tmpq+30*%%str], m7
+
+    mova                 m7, [tmpq+29*%%str]
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m0, [tmpq+ 2*%%str]
+    mova                 m1, [tmpq+ 6*%%str]
+    mova                 m2, [tmpq+10*%%str]
+    mova                 m3, [tmpq+14*%%str]
+    mova                 m4, [tmpq+18*%%str]
+    mova                 m5, [tmpq+22*%%str]
+    mova                 m7, [tmpq+30*%%str]
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+ 6*%%str], m1
+    mova    [tmpq+10*%%str], m2
+    mova    [tmpq+14*%%str], m3
+    mova    [tmpq+22*%%str], m5
+    mova    [tmpq+26*%%str], m6
+    mova    [tmpq+30*%%str], m7
+%endif
+%else
+    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
+    ; t20-22 is in m4-6
+    ; t24-31 is in m8-15
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+    SUMSUB_BA            w, %4, %1, %5
+    SUMSUB_BA            w, %3, %2, %5
+    VP9_IDCT8_WRITEx2   %4, %3, %5, %6, %7, ROUND_REG, 6
+%if %8 == 1
+    add               dstq, stride2q
+%endif
+    VP9_IDCT8_WRITEx2   %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
+%if %8 == 1
+    sub           dst_endq, stride2q
+%endif
+%endmacro
+
+%if ARCH_X86_64
+    pxor               m10, m10
+
+    ; store t0-1 and t30-31
+    mova                m8, [tmpq+ 0*%%str]
+    mova                m9, [tmpq+ 4*%%str]
+    %%STORE_2X2          8,  9,  0,  6, 12, 11, 10
+
+    ; store t2-3 and t28-29
+    mova                m8, [tmpq+ 8*%%str]
+    mova                m9, [tmpq+12*%%str]
+    %%STORE_2X2          8,  9, 14, 15, 12, 11, 10
+
+    ; store t4-5 and t26-27
+    mova                m8, [tmpq+16*%%str]
+    mova                m9, [tmpq+20*%%str]
+    %%STORE_2X2          8,  9,  7,  1, 12, 11, 10
+
+    ; store t6-7 and t24-25
+    mova                m8, [tmpq+24*%%str]
+    mova                m9, [tmpq+28*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
+
+    ; store t8-9 and t22-23
+    mova                m8, [tmpq+30*%%str]
+    mova                m9, [tmpq+26*%%str]
+    mova                m0, [tmpq+ 5*%%str]
+    %%STORE_2X2          8,  9, 13,  0, 12, 11, 10
+
+    ; store t10-11 and t20-21
+    mova                m8, [tmpq+22*%%str]
+    mova                m9, [tmpq+18*%%str]
+    %%STORE_2X2          8,  9,  2,  3, 12, 11, 10
+
+    ; store t12-13 and t18-19
+    mova                m8, [tmpq+14*%%str]
+    mova                m9, [tmpq+10*%%str]
+    mova                m5, [tmpq+13*%%str]
+    mova                m4, [tmpq+ 9*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
+
+    ; store t14-17
+    mova                m8, [tmpq+ 6*%%str]
+    mova                m9, [tmpq+ 2*%%str]
+    mova                m5, [tmpq+29*%%str]
+    mova                m4, [tmpq+21*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10, 0
+
+    SWAP                 1, 10 ; zero
+%else
+    mova   [tmpq+ 1*%%str], m1
+    mova   [tmpq+11*%%str], m2
+    mova   [tmpq+15*%%str], m3
+    mova   [tmpq+17*%%str], m4
+    mova   [tmpq+19*%%str], m5
+    pxor                m1, m1
+
+    ; store t0-1 and t30-31
+    mova                m2, [tmpq+ 0*%%str]
+    mova                m3, [tmpq+ 4*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t2-3 and t28-29
+    mova                m2, [tmpq+ 8*%%str]
+    mova                m3, [tmpq+12*%%str]
+    mova                m0, [tmpq+ 3*%%str]
+    mova                m6, [tmpq+ 7*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t4-5 and t26-27
+    mova                m2, [tmpq+16*%%str]
+    mova                m3, [tmpq+20*%%str]
+    mova                m0, [tmpq+ 1*%%str]
+    %%STORE_2X2          2,  3,  7,  0, 4, 5, 1
+
+    ; store t6-7 and t24-25
+    mova                m2, [tmpq+24*%%str]
+    mova                m3, [tmpq+28*%%str]
+    mova                m0, [tmpq+17*%%str]
+    mova                m6, [tmpq+19*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t8-9 and t22-23
+    mova                m2, [tmpq+30*%%str]
+    mova                m3, [tmpq+26*%%str]
+    mova                m0, [tmpq+25*%%str]
+    mova                m6, [tmpq+ 5*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t10-11 and t20-21
+    mova                m2, [tmpq+22*%%str]
+    mova                m3, [tmpq+18*%%str]
+    mova                m0, [tmpq+11*%%str]
+    mova                m6, [tmpq+15*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t12-13 and t18-19
+    mova                m2, [tmpq+14*%%str]
+    mova                m3, [tmpq+10*%%str]
+    mova                m6, [tmpq+13*%%str]
+    mova                m0, [tmpq+ 9*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t14-17
+    mova                m2, [tmpq+ 6*%%str]
+    mova                m3, [tmpq+ 2*%%str]
+    mova                m6, [tmpq+29*%%str]
+    mova                m0, [tmpq+21*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+    movifnidn         eobd, dword eobm
+%if cpuflag(ssse3)
+    cmp eobd, 135
+    jg .idctfull
+    cmp eobd, 34
+    jg .idct16x16
+    cmp eobd, 1
+    jg .idct8x8
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+    ; dc-only case
+    movifnidn       blockq, blockmp
+    movifnidn         dstq, dstmp
+    movifnidn      strideq, stridemp
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (32 << 14) + 8192
+    sar              coefd, 14 + 6
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, q0000
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_512]
+%endif
+    pxor                m5, m5
+    movd          [blockq], m5
+%rep 31
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    add               dstq, strideq
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    RET
+
+%if ARCH_X86_64
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+    DEFINE_ARGS block, u1, u2, u3, u4, tmp
+    mov             blockq, r2mp
+%endif
+    mov               tmpq, rsp
+    VP9_IDCT32_1D   blockq, 1, 8
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    sub          stride30q, stride2q        ; stride*30
+.loop2_8x8:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 8
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64,  8, m1
+    RET
+
+.idct16x16:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_16x16:
+    VP9_IDCT32_1D   blockq, 1, 16
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_16x16
+
+%if ARCH_X86_64
+    sub             blockq, 32
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_16x16:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 16
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_16x16
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64, 16, m1
+    RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
+    mov               cntd, 4
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT32_1D   blockq, 1
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_full
+
+%if ARCH_X86_64
+    sub             blockq, 64
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_full:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64, 32, m1
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
+
+; this is almost identical to VP9_STORE_2X, but it does two rows
+; for slightly improved interleaving, and it omits vpermq since the
+; input is DC so all values are identical
+%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
+    mova               m%2, [dstq]
+    mova               m%4, [dstq+strideq]
+    punpckhbw          m%3, m%2, m%6
+    punpcklbw          m%2, m%6
+    punpckhbw          m%5, m%4, m%6
+    punpcklbw          m%4, m%6
+    paddw              m%3, m%1
+    paddw              m%2, m%1
+    paddw              m%5, m%1
+    paddw              m%4, m%1
+    packuswb           m%2, m%3
+    packuswb           m%4, m%5
+    mova  [dstq+strideq*0], m%2
+    mova  [dstq+strideq*1], m%4
+%endmacro
+
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
+    cmp eobd, 135
+    jg .idctfull
+    cmp eobd, 1
+    jg .idct16x16
+
+    ; dc-only case
+    mova                m1, [pw_11585x2]
+    vpbroadcastw        m0, [blockq]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+    pxor                m5, m5
+    pmulhrsw            m0, [pw_512]
+    movd          [blockq], xm5
+
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 16
+.loop_dc:
+    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
+    lea               dstq, [dstq+2*strideq]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+.idct16x16:
+    mov               tmpq, rsp
+    VP9_IDCT32_1D   blockq, 1, 16
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 2
+    sub          stride30q, stride2q        ; stride*30
+.loop2_16x16:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 16
+    add           dst_bakq, 16
+    add               tmpq, 32
+    dec               cntd
+    jg .loop2_16x16
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 64, 16, m1
+    RET
+
+.idctfull:
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT32_1D   blockq, 1
+    add             blockq, 32
+    add               tmpq, 1024
+    dec               cntd
+    jg .loop1_full
+
+    sub             blockq, 64
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 2
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_full:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2
+    add           dst_bakq, 16
+    add               tmpq, 32
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 64, 32, m1
+    RET
+%endif
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
new file mode 100644
index 0000000..902685e
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 inverse transform x86 SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA
+
+cextern pw_8
+cextern pw_1023
+cextern pw_2048
+cextern pw_4095
+cextern pw_m1
+cextern pd_1
+cextern pd_16
+cextern pd_32
+cextern pd_8192
+
+pd_8: times 4 dd 8
+pd_3fff: times 4 dd 0x3fff
+
+cextern pw_11585x2
+
+cextern pw_5283_13377
+cextern pw_9929_13377
+cextern pw_15212_m13377
+cextern pw_15212_9929
+cextern pw_m5283_m15212
+cextern pw_13377x2
+cextern pw_m13377_13377
+cextern pw_13377_0
+
+pw_9929_m5283: times 4 dw 9929, -5283
+
+%macro COEF_PAIR 2-3
+cextern pw_m%1_%2
+cextern pw_%2_%1
+%if %0 == 3
+cextern pw_m%1_m%2
+%if %1 != %2
+cextern pw_m%2_%1
+cextern pw_%1_%2
+%endif
+%endif
+%endmacro
+
+COEF_PAIR  2404, 16207
+COEF_PAIR  3196, 16069, 1
+COEF_PAIR  4756, 15679
+COEF_PAIR  5520, 15426
+COEF_PAIR  6270, 15137, 1
+COEF_PAIR  8423, 14053
+COEF_PAIR 10394, 12665
+COEF_PAIR 11003, 12140
+COEF_PAIR 11585, 11585, 1
+COEF_PAIR 13160,  9760
+COEF_PAIR 13623,  9102, 1
+COEF_PAIR 14449,  7723
+COEF_PAIR 14811,  7005
+COEF_PAIR 15893,  3981
+COEF_PAIR 16305,  1606
+COEF_PAIR 16364,   804
+
+default_8x8:
+times 12 db 1
+times 52 db 2
+row_8x8:
+times 18 db 1
+times 46 db 2
+col_8x8:
+times 6 db 1
+times 58 db 2
+default_16x16:
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16:
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16:
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32:
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
+SECTION .text
+
+%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
+    mova               m%3, [%7]
+    mova               m%4, [%7+strideq]
+    paddw              m%3, m%1
+    paddw              m%4, m%2
+    pmaxsw             m%3, m%5
+    pmaxsw             m%4, m%5
+    pminsw             m%3, m%6
+    pminsw             m%4, m%6
+    mova              [%7], m%3
+    mova      [%7+strideq], m%4
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*4/mmsize
+    mova      [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+; the input coefficients are scaled up by 2 bit (which we downscale immediately
+; in the iwht), and is otherwise orthonormally increased by 1 bit per iwht_1d.
+; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling,
+; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
+; add 2 bits, we need to scale before converting to word in 12bpp, since the
+; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp
+; we can scale after converting to words (which is half the instructions),
+; since the input is only 14+sign bit, which fits in 15+sign words directly.
+
+%macro IWHT4_FN 2 ; bpp, max
+cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
+    mova                m7, [pw_%2]
+    mova                m0, [blockq+0*16+0]
+    mova                m1, [blockq+1*16+0]
+%if %1 >= 12
+    mova                m4, [blockq+0*16+8]
+    mova                m5, [blockq+1*16+8]
+    psrad               m0, 2
+    psrad               m1, 2
+    psrad               m4, 2
+    psrad               m5, 2
+    packssdw            m0, m4
+    packssdw            m1, m5
+%else
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    psraw               m0, 2
+    psraw               m1, 2
+%endif
+    mova                m2, [blockq+2*16+0]
+    mova                m3, [blockq+3*16+0]
+%if %1 >= 12
+    mova                m4, [blockq+2*16+8]
+    mova                m5, [blockq+3*16+8]
+    psrad               m2, 2
+    psrad               m3, 2
+    psrad               m4, 2
+    psrad               m5, 2
+    packssdw            m2, m4
+    packssdw            m3, m5
+%else
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+    psraw               m2, 2
+    psraw               m3, 2
+%endif
+
+    VP9_IWHT4_1D
+    TRANSPOSE4x4W        0, 1, 2, 3, 4
+    VP9_IWHT4_1D
+
+    pxor                m6, m6
+    VP9_STORE_2X         0, 1, 4, 5, 6, 7
+    lea               dstq, [dstq+strideq*2]
+    VP9_STORE_2X         2, 3, 4, 5, 6, 7
+    ZERO_BLOCK      blockq, 16, 4, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+IWHT4_FN 10, 1023
+INIT_MMX mmxext
+IWHT4_FN 12, 4095
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]
+    pmulhrsw            m0, m5
+    pmulhrsw            m1, m5
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else
+    mova                m5, [pw_8]
+    paddw               m0, m5
+    paddw               m1, m5
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m0, 4
+    psraw               m1, 4
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    mova                m5, [pw_1023]
+    VP9_STORE_2X         0,  1,  6,  7,  4,  5
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         2,  3,  6,  7,  4,  5
+%endmacro
+
+%macro DC_ONLY 2 ; shift, zero
+    mov              coefd, dword [blockq]
+    movd          [blockq], %2
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, ((1 << (%1 - 1)) << 14) + 8192
+    sar              coefd, 14 + %1
+%endmacro
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+
+%macro IDCT4_10_FN 0
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+    cmp               eobd, 1
+    jg .idctfull
+
+    ; dc-only
+    pxor                m4, m4
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    movd          [blockq], m4
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5
+    pmulhrsw            m0, m5
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    DC_ONLY              4, m4
+    movd                m0, coefd
+%endif
+    pshufw              m0, m0, 0
+    mova                m5, [pw_1023]
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+    VP9_STORE_2X         0,  0,  6,  7,  4,  5
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4,  5
+    RET
+
+.idctfull:
+    mova                m0, [blockq+0*16+0]
+    mova                m1, [blockq+1*16+0]
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    mova                m2, [blockq+2*16+0]
+    mova                m3, [blockq+3*16+0]
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_1D
+
+    pxor                m4, m4
+    ZERO_BLOCK      blockq, 16, 4, m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
+
+%macro IADST4_FN 4
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%endif
+    movdqa            xmm5, [pd_8192]
+    mova                m0, [blockq+0*16+0]
+    mova                m1, [blockq+1*16+0]
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    mova                m2, [blockq+2*16+0]
+    mova                m3, [blockq+3*16+0]
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+    movdq2q             m7, xmm5
+%endif
+    VP9_%2_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_%4_1D
+
+    pxor                m4, m4
+    ZERO_BLOCK      blockq, 16, 4, m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct,  IDCT4,  iadst, IADST4
+IADST4_FN iadst, IADST4, idct,  IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct,  IDCT4,  iadst, IADST4
+IADST4_FN iadst, IADST4, idct,  IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
+; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+    pand               m%3, m%1, %8
+    pand               m%4, m%2, %8
+    psrad              m%1, 14
+    psrad              m%2, 14
+    packssdw           m%4, m%2
+    packssdw           m%3, m%1
+    punpckhwd          m%2, m%4, m%3
+    punpcklwd          m%4, m%3
+    pmaddwd            m%3, m%4, [pw_%6_%5]
+    pmaddwd            m%1, m%2, [pw_%6_%5]
+    pmaddwd            m%4, [pw_m%5_%6]
+    pmaddwd            m%2, [pw_m%5_%6]
+    paddd              m%3, %7
+    paddd              m%4, %7
+    psrad              m%3, 14
+    psrad              m%4, 14
+    paddd              m%1, m%3
+    paddd              m%2, m%4
+%endmacro
+
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+    SUMSUB_MUL          %3, %5, %7, %8, 11585, 11585, %1, %2
+    SUMSUB_MUL          %4, %6, %7, %8, 15137,  6270, %1, %2
+    SUMSUB_BA        d, %4, %3, %7
+    SUMSUB_BA        d, %6, %5, %7
+    SWAP                %4, %6, %3
+%endmacro
+
+%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
+    movh               m%1, [dstq+strideq*0]
+    movh               m%2, [dstq+strideq*2]
+    movhps             m%1, [dstq+strideq*1]
+    movhps             m%2, [dstq+stride3q ]
+    paddw              m%1, m%3
+    paddw              m%2, m%4
+    pmaxsw             m%1, %5
+    pmaxsw             m%2, %5
+    pminsw             m%1, %6
+    pminsw             m%2, %6
+    movh   [dstq+strideq*0], m%1
+    movhps [dstq+strideq*1], m%1
+    movh   [dstq+strideq*2], m%2
+    movhps [dstq+stride3q ], m%2
+%endmacro
+
+%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
+    paddd              m%1, %7
+    paddd              m%2, %7
+    paddd              m%3, %7
+    paddd              m%4, %7
+    psrad              m%1, %8
+    psrad              m%2, %8
+    psrad              m%3, %8
+    psrad              m%4, %8
+    packssdw           m%1, m%2
+    packssdw           m%3, m%4
+    STORE_4x4           %2, %4, %1, %3, %5, %6
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
+    cmp               eobd, 1
+    jg .idctfull
+
+    ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
+    ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
+    ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
+    ; dword. After the final shift (4), the result is 13+sign bits, so we
+    ; don't need any additional processing to fit it in a word
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m4, m4
+    DC_ONLY              4, m4
+    movd                m0, coefd
+    pshuflw             m0, m0, q0000
+    punpcklqdq          m0, m0
+    mova                m5, [pw_4095]
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    STORE_4x4            1, 3, 0, 0, m4, m5
+    RET
+
+.idctfull:
+    DEFINE_ARGS dst, stride, block, eob
+    mova                m0, [blockq+0*16]
+    mova                m1, [blockq+1*16]
+    mova                m2, [blockq+2*16]
+    mova                m3, [blockq+3*16]
+    mova                m6, [pd_8192]
+    mova                m7, [pd_3fff]
+
+    IDCT4_12BPP_1D      m6, m7
+    TRANSPOSE4x4D        0, 1, 2, 3, 4
+    IDCT4_12BPP_1D      m6, m7
+
+    pxor                m4, m4
+    ZERO_BLOCK      blockq, 16, 4, m4
+
+    ; writeout
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova                m5, [pw_4095]
+    mova                m6, [pd_8]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
+    RET
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova               m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; out0 =  5283 * in0 + 13377 + in1 + 15212 * in2 +  9929 * in3 + rnd >> 14
+; out1 =  9929 * in0 + 13377 * in1 -  5283 * in2 - 15282 * in3 + rnd >> 14
+; out2 = 13377 * in0               - 13377 * in2 + 13377 * in3 + rnd >> 14
+; out3 = 15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd >> 14
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+    pand                m4, m0, %2
+    pand                m5, m1, %2
+    psrad               m0, 14
+    psrad               m1, 14
+    packssdw            m5, m1
+    packssdw            m4, m0
+    punpckhwd           m1, m4, m5
+    punpcklwd           m4, m5
+    pand                m5, m2, %2
+    pand                m6, m3, %2
+    psrad               m2, 14
+    psrad               m3, 14
+    packssdw            m6, m3
+    packssdw            m5, m2
+    punpckhwd           m3, m5, m6
+    punpcklwd           m5, m6
+    SCRATCH              1,  8, rsp+0*mmsize, a
+    SCRATCH              5,  9, rsp+1*mmsize, b
+
+    ; m1/3 have the high bits of 0,1,2,3
+    ; m4/5 have the low bits of 0,1,2,3
+    ; m0/2/6/7 are free
+
+    mova                m2, [pw_15212_9929]
+    mova                m0, [pw_5283_13377]
+    pmaddwd             m7, m2, reg_b
+    pmaddwd             m6, m4, m0
+    pmaddwd             m2, m3
+    pmaddwd             m0, reg_a
+    paddd               m6, m7
+    paddd               m0, m2
+    mova                m1, [pw_m13377_13377]
+    mova                m5, [pw_13377_0]
+    pmaddwd             m7, m1, reg_b
+    pmaddwd             m2, m4, m5
+    pmaddwd             m1, m3
+    pmaddwd             m5, reg_a
+    paddd               m2, m7
+    paddd               m1, m5
+    paddd               m6, %1
+    paddd               m2, %1
+    psrad               m6, 14
+    psrad               m2, 14
+    paddd               m0, m6                      ; t0
+    paddd               m2, m1                      ; t2
+
+    mova                m7, [pw_m5283_m15212]
+    mova                m5, [pw_9929_13377]
+    pmaddwd             m1, m7, reg_b
+    pmaddwd             m6, m4, m5
+    pmaddwd             m7, m3
+    pmaddwd             m5, reg_a
+    paddd               m6, m1
+    paddd               m7, m5
+    UNSCRATCH            5,  9, rsp+1*mmsize, b
+    pmaddwd             m5, [pw_9929_m5283]
+    pmaddwd             m4, [pw_15212_m13377]
+    pmaddwd             m3, [pw_9929_m5283]
+    UNSCRATCH            1,  8, rsp+0*mmsize, a
+    pmaddwd             m1, [pw_15212_m13377]
+    paddd               m4, m5
+    paddd               m3, m1
+    paddd               m6, %1
+    paddd               m4, %1
+    psrad               m6, 14
+    psrad               m4, 14
+    paddd               m7, m6                      ; t1
+    paddd               m3, m4                      ; t3
+
+    SWAP                 1, 7
+%endmacro
+
+%macro IADST4_12BPP_FN 4
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+    mova                m0, [blockq+0*16]
+    mova                m1, [blockq+1*16]
+    mova                m2, [blockq+2*16]
+    mova                m3, [blockq+3*16]
+
+    PRELOAD             10, pd_8192, rnd
+    PRELOAD             11, pd_3fff, mask
+    %2_12BPP_1D    reg_rnd, reg_mask
+    TRANSPOSE4x4D        0, 1, 2, 3, 4
+    %4_12BPP_1D    reg_rnd, reg_mask
+
+    pxor                m4, m4
+    ZERO_BLOCK      blockq, 16, 4, m4
+
+    ; writeout
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova                m5, [pw_4095]
+    mova                m6, [pd_8]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
+    RET
+%endmacro
+
+INIT_XMM sse2
+IADST4_12BPP_FN idct,  IDCT4,  iadst, IADST4
+IADST4_12BPP_FN iadst, IADST4, idct,  IDCT4
+IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH            6, 8, rsp+%3*mmsize
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+    mova                m0, [%1+0*%4]
+    mova                m2, [%1+2*%4]
+    mova                m4, [%1+4*%4]
+    mova                m6, [%1+6*%4]
+    IDCT4_12BPP_1D      %2, %3, 0, 2, 4, 6, 1, 3            ; m0/2/4/6 have t0/1/2/3
+    SCRATCH              4, 8, rsp+(%5+0)*mmsize
+    SCRATCH              6, 9, rsp+(%5+1)*mmsize
+    mova                m1, [%1+1*%4]
+    mova                m3, [%1+3*%4]
+    mova                m5, [%1+5*%4]
+    mova                m7, [%1+7*%4]
+    SUMSUB_MUL           1, 7, 4, 6, 16069,  3196, %2, %3   ; m1=t7a, m7=t4a
+    SUMSUB_MUL           5, 3, 4, 6,  9102, 13623, %2, %3   ; m5=t6a, m3=t5a
+    SUMSUB_BA         d, 3, 7, 4                            ; m3=t4, m7=t5a
+    SUMSUB_BA         d, 5, 1, 4                            ; m5=t7, m1=t6a
+    SUMSUB_MUL           1, 7, 4, 6, 11585, 11585, %2, %3   ; m1=t6, m7=t5
+    SUMSUB_BA         d, 5, 0, 4                            ; m5=out0, m0=out7
+    SUMSUB_BA         d, 1, 2, 4                            ; m1=out1, m2=out6
+    UNSCRATCH            4, 8, rsp+(%5+0)*mmsize
+    UNSCRATCH            6, 9, rsp+(%5+1)*mmsize
+    SCRATCH              2, 8, rsp+(%5+0)*mmsize
+    SUMSUB_BA         d, 7, 4, 2                            ; m7=out2, m4=out5
+    SUMSUB_BA         d, 3, 6, 2                            ; m3=out3, m6=out4
+    SWAP                 0, 5, 4, 6, 2, 7
+%endmacro
+
+%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
+    mova               m%1, [%6+%7*0]
+    mova               m%2, [%6+%7*1]
+    paddw              m%1, m%3
+    paddw              m%2, m%3
+    pmaxsw             m%1, %4
+    pmaxsw             m%2, %4
+    pminsw             m%1, %5
+    pminsw             m%2, %5
+    mova         [%6+%7*0], m%1
+    mova         [%6+%7*1], m%2
+%endmacro
+
+; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
+; storage also instead of allocating two more stack spaces. This doesn't
+; matter much but it's something...
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+                                  dst, stride, block, eob
+    mova                m0, [pw_1023]
+    cmp               eobd, 1
+    jg .idctfull
+
+    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+    ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
+    ; fits in 32bit
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m2, m2
+    DC_ONLY              5, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 4
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+.idctfull:
+    SCRATCH              0, 12, rsp+16*mmsize, max
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [default_8x8]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [default_8x8+cntq-1]
+%endif
+    mov              skipd, 2
+    sub              skipd, cntd
+    mov               ptrq, rsp
+    PRELOAD             10, pd_8192, rnd
+    PRELOAD             11, pd_3fff, mask
+    PRELOAD             13, pd_16, srnd
+.loop_1:
+    IDCT8_1D        blockq, reg_rnd, reg_mask
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 6
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 2*mmsize], m1
+    mova  [ptrq+ 4*mmsize], m2
+    mova  [ptrq+ 6*mmsize], m3
+    UNSCRATCH            6, 8, rsp+17*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 1*mmsize], m4
+    mova  [ptrq+ 3*mmsize], m5
+    mova  [ptrq+ 5*mmsize], m6
+    mova  [ptrq+ 7*mmsize], m7
+    add               ptrq, 8 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd
+    lea             blockq, [blockq+skipq*(mmsize/2)]
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    add               ptrq, 4 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 2
+    mov               ptrq, rsp
+.loop_2:
+    IDCT8_1D          ptrq, reg_rnd, reg_mask
+
+    pxor                m6, m6
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+    lea               dstq, [dstq+strideq*4]
+    UNSCRATCH            0, 8, rsp+17*mmsize
+    UNSCRATCH            1, 12, rsp+16*mmsize, max
+    UNSCRATCH            2, 13, pd_16, srnd
+    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
+    add               ptrq, 16
+%if ARCH_X86_64
+    lea               dstq, [dstbakq+8]
+%else
+    mov               dstq, dstm
+    add               dstq, 8
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m6 is still zero
+    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+    RET
+
+%macro DC_ONLY_64BIT 2 ; shift, zero
+%if ARCH_X86_64
+    movsxd           coefq, dword [blockq]
+    movd          [blockq], %2
+    imul             coefq, 11585
+    add              coefq, 8192
+    sar              coefq, 14
+    imul             coefq, 11585
+    add              coefq, ((1 << (%1 - 1)) << 14) + 8192
+    sar              coefq, 14 + %1
+%else
+    mov              coefd, dword [blockq]
+    movd          [blockq], %2
+    DEFINE_ARGS dst, stride, cnt, coef, coefl
+    mov               cntd, 2
+.loop_dc_calc:
+    mov             coefld, coefd
+    sar              coefd, 14
+    and             coefld, 0x3fff
+    imul             coefd, 11585
+    imul            coefld, 11585
+    add             coefld, 8192
+    sar             coefld, 14
+    add              coefd, coefld
+    dec               cntd
+    jg .loop_dc_calc
+    add              coefd, 1 << (%1 - 1)
+    sar              coefd, %1
+%endif
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+                                  dst, stride, block, eob
+    mova                m0, [pw_4095]
+    cmp               eobd, 1
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
+
+    ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
+    ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
+    DEFINE_ARGS dst, stride, block, coef, coefl
+    pxor                m2, m2
+    DC_ONLY_64BIT        5, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 4
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
+; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+    pand               m%3, m%1, %7
+    pand               m%4, m%2, %7
+    psrad              m%1, 14
+    psrad              m%2, 14
+    packssdw           m%4, m%2
+    packssdw           m%3, m%1
+    punpckhwd          m%2, m%4, m%3
+    punpcklwd          m%4, m%3
+    pmaddwd            m%3, m%4, [pw_%6_%5]
+    pmaddwd            m%1, m%2, [pw_%6_%5]
+    pmaddwd            m%4, [pw_m%5_%6]
+    pmaddwd            m%2, [pw_m%5_%6]
+%endmacro
+
+; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
+; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
+    SUMSUB_BA        d, %1, %2, %5
+    SUMSUB_BA        d, %3, %4, %5
+    paddd              m%3, %6
+    paddd              m%4, %6
+    psrad              m%3, 14
+    psrad              m%4, 14
+    paddd              m%1, m%3
+    paddd              m%2, m%4
+%endmacro
+
+%macro NEGD 1
+%if cpuflag(ssse3)
+    psignd              %1, [pw_m1]
+%else
+    pxor                %1, [pw_m1]
+    paddd               %1, [pd_1]
+%endif
+%endmacro
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH            6, 8, rsp+17*mmsize
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
+    mova                m0, [%1+ 0*mmsize]
+    mova                m3, [%1+ 6*mmsize]
+    mova                m4, [%1+ 8*mmsize]
+    mova                m7, [%1+14*mmsize]
+    SUMSUB_MUL_D         7, 0, 1, 2, 16305,  1606, %3   ; m7/1=t0a, m0/2=t1a
+    SUMSUB_MUL_D         3, 4, 5, 6, 10394, 12665, %3   ; m3/5=t4a, m4/6=t5a
+    SCRATCH              0, 8, rsp+17*mmsize
+    SUMSUB_PACK_D        3, 7, 5, 1, 0, %2              ; m3=t0, m7=t4
+    UNSCRATCH            0, 8, rsp+17*mmsize
+    SUMSUB_PACK_D        4, 0, 6, 2, 1, %2              ; m4=t1, m0=t5
+
+    SCRATCH              3, 8, rsp+17*mmsize
+    SCRATCH              4, 9, rsp+18*mmsize
+    SCRATCH              7, 10, rsp+19*mmsize
+    SCRATCH              0, 11, rsp+20*mmsize
+
+    mova                m1, [%1+ 2*mmsize]
+    mova                m2, [%1+ 4*mmsize]
+    mova                m5, [%1+10*mmsize]
+    mova                m6, [%1+12*mmsize]
+    SUMSUB_MUL_D         5, 2, 3, 4, 14449,  7723, %3   ; m5/8=t2a, m2/9=t3a
+    SUMSUB_MUL_D         1, 6, 7, 0,  4756, 15679, %3   ; m1/10=t6a, m6/11=t7a
+    SCRATCH              2, 12, rsp+21*mmsize
+    SUMSUB_PACK_D        1, 5, 7, 3, 2, %2              ; m1=t2, m5=t6
+    UNSCRATCH            2, 12, rsp+21*mmsize
+    SUMSUB_PACK_D        6, 2, 0, 4, 3, %2              ; m6=t3, m2=t7
+
+    UNSCRATCH            7, 10, rsp+19*mmsize
+    UNSCRATCH            0, 11, rsp+20*mmsize
+    SCRATCH              1, 10, rsp+19*mmsize
+    SCRATCH              6, 11, rsp+20*mmsize
+
+    SUMSUB_MUL_D         7, 0, 3, 4, 15137,  6270, %3   ; m7/8=t4a, m0/9=t5a
+    SUMSUB_MUL_D         2, 5, 1, 6,  6270, 15137, %3   ; m2/10=t7a, m5/11=t6a
+    SCRATCH              2, 12, rsp+21*mmsize
+    SUMSUB_PACK_D        5, 7, 6, 3, 2, %2              ; m5=-out1, m7=t6
+    UNSCRATCH            2, 12, rsp+21*mmsize
+    NEGD                m5                              ; m5=out1
+    SUMSUB_PACK_D        2, 0, 1, 4, 3, %2              ; m2=out6, m0=t7
+    SUMSUB_MUL           7, 0, 3, 4, 11585, 11585, %2, %3   ; m7=out2, m0=-out5
+    NEGD                m0                              ; m0=out5
+
+    UNSCRATCH            3, 8, rsp+17*mmsize
+    UNSCRATCH            4, 9, rsp+18*mmsize
+    UNSCRATCH            1, 10, rsp+19*mmsize
+    UNSCRATCH            6, 11, rsp+20*mmsize
+    SCRATCH              2, 8, rsp+17*mmsize
+    SCRATCH              0, 9, rsp+18*mmsize
+
+    SUMSUB_BA         d, 1, 3,  2                       ; m1=out0, m3=t2
+    SUMSUB_BA         d, 6, 4,  2                       ; m6=-out7, m4=t3
+    NEGD                m6                              ; m6=out7
+    SUMSUB_MUL           3, 4,  2,  0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
+    NEGD                m3                              ; m3=out3
+
+    UNSCRATCH            0, 9, rsp+18*mmsize
+
+    SWAP                 0, 1, 5
+    SWAP                 2, 7, 6
+%endmacro
+
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+                              dst, stride, block, eob
+    mova                m0, [pw_1023]
+
+.body:
+    SCRATCH              0, 13, rsp+16*mmsize, max
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [%5_8x8]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [%5_8x8+cntq-1]
+%endif
+    mov              skipd, 2
+    sub              skipd, cntd
+    mov               ptrq, rsp
+    PRELOAD             14, pd_8192, rnd
+    PRELOAD             15, pd_3fff, mask
+.loop_1:
+    %2_1D           blockq, reg_rnd, reg_mask
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 6
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 2*mmsize], m1
+    mova  [ptrq+ 4*mmsize], m2
+    mova  [ptrq+ 6*mmsize], m3
+    UNSCRATCH            6, 8, rsp+17*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 1*mmsize], m4
+    mova  [ptrq+ 3*mmsize], m5
+    mova  [ptrq+ 5*mmsize], m6
+    mova  [ptrq+ 7*mmsize], m7
+    add               ptrq, 8 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd
+    lea             blockq, [blockq+skipq*(mmsize/2)]
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    add               ptrq, 4 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 2
+    mov               ptrq, rsp
+.loop_2:
+    %4_1D             ptrq, reg_rnd, reg_mask
+
+    pxor                m6, m6
+    PRELOAD              9, pd_16, srnd
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+    lea               dstq, [dstq+strideq*4]
+    UNSCRATCH            0, 8, rsp+17*mmsize
+    UNSCRATCH            1, 13, rsp+16*mmsize, max
+    UNSCRATCH            2, 9, pd_16, srnd
+    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
+    add               ptrq, 16
+%if ARCH_X86_64
+    lea               dstq, [dstbakq+8]
+%else
+    mov               dstq, dstm
+    add               dstq, 8
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m6 is still zero
+    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+    RET
+
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+                              dst, stride, block, eob
+    mova                m0, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST8_FN idct,  IDCT8,  iadst, IADST8, row
+IADST8_FN iadst, IADST8, idct,  IDCT8,  col
+IADST8_FN iadst, IADST8, iadst, IADST8, default
+
+%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
+    IDCT8_1D            %1, [pd_8192], [pd_3fff], %2 * 2, %4    ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+    ; SCRATCH            6, 8, rsp+(%4+0)*mmsize    ; t6
+    SCRATCH              0, 15, rsp+(%4+7)*mmsize   ; t0a
+    SCRATCH              1, 14, rsp+(%4+6)*mmsize   ; t1a
+    SCRATCH              2, 13, rsp+(%4+5)*mmsize   ; t2a
+    SCRATCH              3, 12, rsp+(%4+4)*mmsize   ; t3a
+    SCRATCH              4, 11, rsp+(%4+3)*mmsize   ; t4
+    mova [rsp+(%3+0)*mmsize], m5                    ; t5
+    mova [rsp+(%3+1)*mmsize], m7                    ; t7
+
+    mova                m0, [%1+ 1*%2]              ; in1
+    mova                m3, [%1+ 7*%2]              ; in7
+    mova                m4, [%1+ 9*%2]              ; in9
+    mova                m7, [%1+15*%2]              ; in15
+
+    SUMSUB_MUL           0, 7, 1, 2, 16305,  1606   ; m0=t15a, m7=t8a
+    SUMSUB_MUL           4, 3, 1, 2, 10394, 12665   ; m4=t14a, m3=t9a
+    SUMSUB_BA         d, 3, 7, 1                    ; m3=t8, m7=t9
+    SUMSUB_BA         d, 4, 0, 1                    ; m4=t15,m0=t14
+    SUMSUB_MUL           0, 7, 1, 2, 15137,  6270   ; m0=t14a, m7=t9a
+
+    mova                m1, [%1+ 3*%2]              ; in3
+    mova                m2, [%1+ 5*%2]              ; in5
+    mova                m5, [%1+11*%2]              ; in11
+    mova                m6, [%1+13*%2]              ; in13
+
+    SCRATCH              0,  9, rsp+(%4+1)*mmsize
+    SCRATCH              7, 10, rsp+(%4+2)*mmsize
+
+    SUMSUB_MUL           2, 5, 0, 7, 14449,  7723   ; m2=t13a, m5=t10a
+    SUMSUB_MUL           6, 1, 0, 7,  4756, 15679   ; m6=t12a, m1=t11a
+    SUMSUB_BA         d, 5, 1, 0                    ; m5=t11,m1=t10
+    SUMSUB_BA         d, 2, 6, 0                    ; m2=t12,m6=t13
+    NEGD                m1                          ; m1=-t10
+    SUMSUB_MUL           1, 6, 0, 7, 15137,  6270   ; m1=t13a, m6=t10a
+
+    UNSCRATCH            7, 10, rsp+(%4+2)*mmsize
+    SUMSUB_BA         d, 5, 3, 0                    ; m5=t8a, m3=t11a
+    SUMSUB_BA         d, 6, 7, 0                    ; m6=t9,  m7=t10
+    SUMSUB_BA         d, 2, 4, 0                    ; m2=t15a,m4=t12a
+    SCRATCH              5, 10, rsp+(%4+2)*mmsize
+    SUMSUB_MUL           4, 3, 0, 5, 11585, 11585   ; m4=t12, m3=t11
+    UNSCRATCH            0, 9, rsp+(%4+1)*mmsize
+    SUMSUB_BA         d, 1, 0, 5                    ; m1=t14, m0=t13
+    SCRATCH              6, 9, rsp+(%4+1)*mmsize
+    SUMSUB_MUL           0, 7, 6, 5, 11585, 11585   ; m0=t13a,m7=t10a
+
+    ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
+    ; free: 6,5
+
+    UNSCRATCH            5, 15, rsp+(%4+7)*mmsize
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=out0, m5=out15
+    SCRATCH              5, 15, rsp+(%4+7)*mmsize
+    UNSCRATCH            5, 14, rsp+(%4+6)*mmsize
+    SUMSUB_BA         d, 1, 5, 6                    ; m1=out1, m5=out14
+    SCRATCH              5, 14, rsp+(%4+6)*mmsize
+    UNSCRATCH            5, 13, rsp+(%4+5)*mmsize
+    SUMSUB_BA         d, 0, 5, 6                    ; m0=out2, m5=out13
+    SCRATCH              5, 13, rsp+(%4+5)*mmsize
+    UNSCRATCH            5, 12, rsp+(%4+4)*mmsize
+    SUMSUB_BA         d, 4, 5, 6                    ; m4=out3, m5=out12
+    SCRATCH              5, 12, rsp+(%4+4)*mmsize
+    UNSCRATCH            5, 11, rsp+(%4+3)*mmsize
+    SUMSUB_BA         d, 3, 5, 6                    ; m3=out4, m5=out11
+    SCRATCH              4, 11, rsp+(%4+3)*mmsize
+    mova                m4, [rsp+(%3+0)*mmsize]
+    SUMSUB_BA         d, 7, 4, 6                    ; m7=out5, m4=out10
+    mova [rsp+(%3+0)*mmsize], m5
+    UNSCRATCH            5, 8, rsp+(%4+0)*mmsize
+    UNSCRATCH            6, 9, rsp+(%4+1)*mmsize
+    SCRATCH              2, 8, rsp+(%4+0)*mmsize
+    SCRATCH              1, 9, rsp+(%4+1)*mmsize
+    UNSCRATCH            1, 10, rsp+(%4+2)*mmsize
+    SCRATCH              0, 10, rsp+(%4+2)*mmsize
+    mova                m0, [rsp+(%3+1)*mmsize]
+    SUMSUB_BA         d, 6, 5, 2                    ; m6=out6, m5=out9
+    SUMSUB_BA         d, 1, 0, 2                    ; m1=out7, m0=out8
+
+    SWAP                 0, 3, 1, 7, 2, 6, 4
+
+    ; output order: 8-11|r67-70=out0-3
+    ;               0-6,r65=out4-11
+    ;               12-15|r71-74=out12-15
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_1023]
+    cmp               eobd, 1
+    jg .idctfull
+
+    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
+    ; fits in 32bit
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m2, m2
+    DC_ONLY              6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+.idctfull:
+    mova   [rsp+64*mmsize], m0
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [default_16x16]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [default_16x16+cntq-1]
+%endif
+    mov              skipd, 4
+    sub              skipd, cntd
+    mov               ptrq, rsp
+.loop_1:
+    IDCT16_1D       blockq
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+ 9*mmsize], m2
+    mova  [ptrq+13*mmsize], m3
+    mova                m7, [rsp+65*mmsize]
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m4
+    mova  [ptrq+ 6*mmsize], m5
+    mova  [ptrq+10*mmsize], m6
+    mova  [ptrq+14*mmsize], m7
+    UNSCRATCH               0, 8, rsp+67*mmsize
+    UNSCRATCH               1, 9, rsp+68*mmsize
+    UNSCRATCH               2, 10, rsp+69*mmsize
+    UNSCRATCH               3, 11, rsp+70*mmsize
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 4*mmsize], m1
+    mova  [ptrq+ 8*mmsize], m2
+    mova  [ptrq+12*mmsize], m3
+    UNSCRATCH               4, 12, rsp+71*mmsize
+    UNSCRATCH               5, 13, rsp+72*mmsize
+    UNSCRATCH               6, 14, rsp+73*mmsize
+    UNSCRATCH               7, 15, rsp+74*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 3*mmsize], m4
+    mova  [ptrq+ 7*mmsize], m5
+    mova  [ptrq+11*mmsize], m6
+    mova  [ptrq+15*mmsize], m7
+    add               ptrq, 16 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd
+    lea             blockq, [blockq+skipq*(mmsize/2)]
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    mova   [ptrq+mmsize*4], m0
+    mova   [ptrq+mmsize*5], m0
+    mova   [ptrq+mmsize*6], m0
+    mova   [ptrq+mmsize*7], m0
+    add               ptrq, 8 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 4
+    mov               ptrq, rsp
+.loop_2:
+    IDCT16_1D         ptrq
+
+    pxor               m7, m7
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    lea               dstq, [dstq+strideq*4]
+    mova                m0, [rsp+65*mmsize]
+    mova                m1, [rsp+64*mmsize]
+    mova                m2, [pd_32]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+    mov               dstq, dstm
+%endif
+    UNSCRATCH               0, 8, rsp+67*mmsize
+    UNSCRATCH               4, 9, rsp+68*mmsize
+    UNSCRATCH               5, 10, rsp+69*mmsize
+    UNSCRATCH               3, 11, rsp+70*mmsize
+    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea               dstq, [dstbakq+stride3q*4]
+%else
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    UNSCRATCH               4, 12, rsp+71*mmsize
+    UNSCRATCH               5, 13, rsp+72*mmsize
+    UNSCRATCH               6, 14, rsp+73*mmsize
+    UNSCRATCH               0, 15, rsp+74*mmsize
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+    RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_4095]
+    cmp               eobd, 1
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
+
+    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
+    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+    DEFINE_ARGS dst, stride, block, coef, coefl
+    pxor                m2, m2
+    DC_ONLY_64BIT        6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+; r65-69 are available for spills
+; r70-77 are available on x86-32 only (x86-64 should use m8-15)
+; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
+%macro IADST16_1D 1 ; src
+    mova                m0, [%1+ 0*4*mmsize]        ; in0
+    mova                m1, [%1+ 7*4*mmsize]        ; in7
+    mova                m2, [%1+ 8*4*mmsize]        ; in8
+    mova                m3, [%1+15*4*mmsize]        ; in15
+    SUMSUB_MUL_D         3, 0, 4, 5, 16364,  804    ; m3/4=t0, m0/5=t1
+    SUMSUB_MUL_D         1, 2, 6, 7, 11003, 12140   ; m1/6=t8, m2/7=t9
+    SCRATCH              0, 8, rsp+70*mmsize
+    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t0a, m3=t8a
+    UNSCRATCH            0, 8, rsp+70*mmsize
+    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t1a, m0=t9a
+    mova   [rsp+67*mmsize], m1
+    SCRATCH              2, 9, rsp+71*mmsize
+    SCRATCH              3, 12, rsp+74*mmsize
+    SCRATCH              0, 13, rsp+75*mmsize
+
+    mova                m0, [%1+ 3*4*mmsize]        ; in3
+    mova                m1, [%1+ 4*4*mmsize]        ; in4
+    mova                m2, [%1+11*4*mmsize]        ; in11
+    mova                m3, [%1+12*4*mmsize]        ; in12
+    SUMSUB_MUL_D         2, 1, 4, 5, 14811,  7005   ; m2/4=t4, m1/5=t5
+    SUMSUB_MUL_D         0, 3, 6, 7,  5520, 15426   ; m0/6=t12, m3/7=t13
+    SCRATCH              1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t4a, m2=t12a
+    UNSCRATCH            1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t5a, m1=t13a
+    SCRATCH              0, 15, rsp+77*mmsize
+    SCRATCH              3, 11, rsp+73*mmsize
+
+    UNSCRATCH            0, 12, rsp+74*mmsize       ; t8a
+    UNSCRATCH            3, 13, rsp+75*mmsize       ; t9a
+    SUMSUB_MUL_D         0, 3, 4, 5, 16069,  3196   ; m0/4=t8, m3/5=t9
+    SUMSUB_MUL_D         1, 2, 6, 7,  3196, 16069   ; m1/6=t13, m2/7=t12
+    SCRATCH              1, 12, rsp+74*mmsize
+    SUMSUB_PACK_D        2, 0, 7, 4, 1              ; m2=t8a, m0=t12a
+    UNSCRATCH            1, 12, rsp+74*mmsize
+    SUMSUB_PACK_D        1, 3, 6, 5, 4              ; m1=t9a, m3=t13a
+    mova   [rsp+65*mmsize], m2
+    mova   [rsp+66*mmsize], m1
+    SCRATCH              0, 8, rsp+70*mmsize
+    SCRATCH              3, 12, rsp+74*mmsize
+
+    mova                m0, [%1+ 2*4*mmsize]        ; in2
+    mova                m1, [%1+ 5*4*mmsize]        ; in5
+    mova                m2, [%1+10*4*mmsize]        ; in10
+    mova                m3, [%1+13*4*mmsize]        ; in13
+    SUMSUB_MUL_D         3, 0, 4, 5, 15893,  3981   ; m3/4=t2, m0/5=t3
+    SUMSUB_MUL_D         1, 2, 6, 7,  8423, 14053   ; m1/6=t10, m2/7=t11
+    SCRATCH              0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t2a, m3=t10a
+    UNSCRATCH            0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t3a, m0=t11a
+    mova   [rsp+68*mmsize], m1
+    mova   [rsp+69*mmsize], m2
+    SCRATCH              3, 13, rsp+75*mmsize
+    SCRATCH              0, 14, rsp+76*mmsize
+
+    mova                m0, [%1+ 1*4*mmsize]        ; in1
+    mova                m1, [%1+ 6*4*mmsize]        ; in6
+    mova                m2, [%1+ 9*4*mmsize]        ; in9
+    mova                m3, [%1+14*4*mmsize]        ; in14
+    SUMSUB_MUL_D         2, 1, 4, 5, 13160,  9760   ; m2/4=t6, m1/5=t7
+    SUMSUB_MUL_D         0, 3, 6, 7,  2404, 16207   ; m0/6=t14, m3/7=t15
+    SCRATCH              1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t6a, m2=t14a
+    UNSCRATCH            1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t7a, m1=t15a
+
+    UNSCRATCH            4, 13, rsp+75*mmsize       ; t10a
+    UNSCRATCH            5, 14, rsp+76*mmsize       ; t11a
+    SCRATCH              0, 13, rsp+75*mmsize
+    SCRATCH              3, 14, rsp+76*mmsize
+    SUMSUB_MUL_D         4, 5, 6, 7,  9102, 13623   ; m4/6=t10, m5/7=t11
+    SUMSUB_MUL_D         1, 2, 0, 3, 13623,  9102   ; m1/0=t15, m2/3=t14
+    SCRATCH              0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        2, 4, 3, 6, 0              ; m2=t10a, m4=t14a
+    UNSCRATCH            0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        1, 5, 0, 7, 6              ; m1=t11a, m5=t15a
+
+    UNSCRATCH            0, 8, rsp+70*mmsize        ; t12a
+    UNSCRATCH            3, 12, rsp+74*mmsize       ; t13a
+    SCRATCH              2, 8, rsp+70*mmsize
+    SCRATCH              1, 12, rsp+74*mmsize
+    SUMSUB_MUL_D         0, 3, 1, 2, 15137,  6270   ; m0/1=t12, m3/2=t13
+    SUMSUB_MUL_D         5, 4, 7, 6,  6270, 15137   ; m5/7=t15, m4/6=t14
+    SCRATCH              2, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        4, 0, 6, 1, 2              ; m4=out2, m0=t14a
+    UNSCRATCH            2, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        5, 3, 7, 2, 1              ; m5=-out13, m3=t15a
+    NEGD                m5                          ; m5=out13
+
+    UNSCRATCH            1, 9, rsp+71*mmsize        ; t1a
+    mova                m2, [rsp+68*mmsize]         ; t2a
+    UNSCRATCH            6, 13, rsp+75*mmsize       ; t6a
+    UNSCRATCH            7, 14, rsp+76*mmsize       ; t7a
+    SCRATCH              4, 10, rsp+72*mmsize
+    SCRATCH              5, 13, rsp+75*mmsize
+    UNSCRATCH            4, 15, rsp+77*mmsize       ; t4a
+    UNSCRATCH            5, 11, rsp+73*mmsize       ; t5a
+    SCRATCH              0, 14, rsp+76*mmsize
+    SCRATCH              3, 15, rsp+77*mmsize
+    mova                m0, [rsp+67*mmsize]         ; t0a
+    SUMSUB_BA         d, 4, 0, 3                    ; m4=t0, m0=t4
+    SUMSUB_BA         d, 5, 1, 3                    ; m5=t1, m1=t5
+    SUMSUB_BA         d, 6, 2, 3                    ; m6=t2, m2=t6
+    SCRATCH              4, 9, rsp+71*mmsize
+    mova                m3, [rsp+69*mmsize]         ; t3a
+    SUMSUB_BA         d, 7, 3, 4                    ; m7=t3, m3=t7
+
+    mova   [rsp+67*mmsize], m5
+    mova   [rsp+68*mmsize], m6
+    mova   [rsp+69*mmsize], m7
+    SUMSUB_MUL_D         0, 1, 4, 5, 15137,  6270   ; m0/4=t4a, m1/5=t5a
+    SUMSUB_MUL_D         3, 2, 7, 6,  6270, 15137   ; m3/7=t7a, m2/6=t6a
+    SCRATCH              1, 11, rsp+73*mmsize
+    SUMSUB_PACK_D        2, 0, 6, 4, 1              ; m2=-out3, m0=t6
+    NEGD                m2                          ; m2=out3
+    UNSCRATCH            1, 11, rsp+73*mmsize
+    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=out12, m1=t7
+    SCRATCH              2, 11, rsp+73*mmsize
+    UNSCRATCH            2, 12, rsp+74*mmsize       ; t11a
+    SCRATCH              3, 12, rsp+74*mmsize
+
+    UNSCRATCH            3, 8, rsp+70*mmsize        ; t10a
+    mova                m4, [rsp+65*mmsize]         ; t8a
+    mova                m5, [rsp+66*mmsize]         ; t9a
+    SUMSUB_BA         d, 3, 4, 6                    ; m3=-out1, m4=t10
+    NEGD                m3                          ; m3=out1
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=out14, m5=t11
+    UNSCRATCH            6, 9, rsp+71*mmsize        ; t0
+    UNSCRATCH            7, 14, rsp+76*mmsize       ; t14a
+    SCRATCH              3, 9, rsp+71*mmsize
+    SCRATCH              2, 14, rsp+76*mmsize
+
+    SUMSUB_MUL           1, 0, 2, 3, 11585, 11585   ; m1=out4, m0=out11
+    mova   [rsp+65*mmsize], m0
+    SUMSUB_MUL           5, 4, 2, 3, 11585, 11585   ; m5=out6, m4=out9
+    UNSCRATCH            0, 15, rsp+77*mmsize       ; t15a
+    SUMSUB_MUL           7, 0, 2, 3, 11585, m11585  ; m7=out10, m0=out5
+
+    mova                m2, [rsp+68*mmsize]         ; t2
+    SUMSUB_BA         d, 2, 6, 3                    ; m2=out0, m6=t2a
+    SCRATCH              2, 8, rsp+70*mmsize
+    mova                m2, [rsp+67*mmsize]         ; t1
+    mova                m3, [rsp+69*mmsize]         ; t3
+    mova   [rsp+67*mmsize], m7
+    SUMSUB_BA         d, 3, 2, 7                    ; m3=-out15, m2=t3a
+    NEGD                m3                          ; m3=out15
+    SCRATCH              3, 15, rsp+77*mmsize
+    SUMSUB_MUL           6, 2, 7, 3, 11585, m11585  ; m6=out8, m2=out7
+    mova                m7, [rsp+67*mmsize]
+
+    SWAP                 0, 1
+    SWAP                 2, 5, 4, 6, 7, 3
+%endmacro
+
+%macro IADST16_FN 7
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                dst, stride, block, eob
+    mova                m0, [pw_1023]
+
+.body:
+    mova   [rsp+64*mmsize], m0
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [%7_16x16]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [%7_16x16+cntq-1]
+%endif
+    mov              skipd, 4
+    sub              skipd, cntd
+    mov               ptrq, rsp
+.loop_1:
+    %2_1D           blockq
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+ 9*mmsize], m2
+    mova  [ptrq+13*mmsize], m3
+    mova                m7, [rsp+65*mmsize]
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m4
+    mova  [ptrq+ 6*mmsize], m5
+    mova  [ptrq+10*mmsize], m6
+    mova  [ptrq+14*mmsize], m7
+    UNSCRATCH               0, 8, rsp+(%3+0)*mmsize
+    UNSCRATCH               1, 9, rsp+(%3+1)*mmsize
+    UNSCRATCH               2, 10, rsp+(%3+2)*mmsize
+    UNSCRATCH               3, 11, rsp+(%3+3)*mmsize
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 4*mmsize], m1
+    mova  [ptrq+ 8*mmsize], m2
+    mova  [ptrq+12*mmsize], m3
+    UNSCRATCH               4, 12, rsp+(%3+4)*mmsize
+    UNSCRATCH               5, 13, rsp+(%3+5)*mmsize
+    UNSCRATCH               6, 14, rsp+(%3+6)*mmsize
+    UNSCRATCH               7, 15, rsp+(%3+7)*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 3*mmsize], m4
+    mova  [ptrq+ 7*mmsize], m5
+    mova  [ptrq+11*mmsize], m6
+    mova  [ptrq+15*mmsize], m7
+    add               ptrq, 16 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd
+    lea             blockq, [blockq+skipq*(mmsize/2)]
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    mova   [ptrq+mmsize*4], m0
+    mova   [ptrq+mmsize*5], m0
+    mova   [ptrq+mmsize*6], m0
+    mova   [ptrq+mmsize*7], m0
+    add               ptrq, 8 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 4
+    mov               ptrq, rsp
+.loop_2:
+    %5_1D             ptrq
+
+    pxor                m7, m7
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    lea               dstq, [dstq+strideq*4]
+    mova                m0, [rsp+65*mmsize]
+    mova                m1, [rsp+64*mmsize]
+    mova                m2, [pd_32]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
+
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+    mov               dstq, dstm
+%endif
+    UNSCRATCH               0, 8, rsp+(%6+0)*mmsize
+    UNSCRATCH               4, 9, rsp+(%6+1)*mmsize
+    UNSCRATCH               5, 10, rsp+(%6+2)*mmsize
+    UNSCRATCH               3, 11, rsp+(%6+3)*mmsize
+    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea               dstq, [dstbakq+stride3q*4]
+%else
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    UNSCRATCH               4, 12, rsp+(%6+4)*mmsize
+    UNSCRATCH               5, 13, rsp+(%6+5)*mmsize
+    UNSCRATCH               6, 14, rsp+(%6+6)*mmsize
+    UNSCRATCH               0, 15, rsp+(%6+7)*mmsize
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+    RET
+
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                dst, stride, block, eob
+    mova                m0, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+IADST16_FN idct,  IDCT16,  67, iadst, IADST16, 70, row
+IADST16_FN iadst, IADST16, 70, idct,  IDCT16,  67, col
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
+
+%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
+    IDCT16_1D %2, 2 * %3, 272, 257
+%if ARCH_X86_64
+    mova  [rsp+257*mmsize], m8
+    mova  [rsp+258*mmsize], m9
+    mova  [rsp+259*mmsize], m10
+    mova  [rsp+260*mmsize], m11
+    mova  [rsp+261*mmsize], m12
+    mova  [rsp+262*mmsize], m13
+    mova  [rsp+263*mmsize], m14
+    mova  [rsp+264*mmsize], m15
+%endif
+    mova  [rsp+265*mmsize], m0
+    mova  [rsp+266*mmsize], m1
+    mova  [rsp+267*mmsize], m2
+    mova  [rsp+268*mmsize], m3
+    mova  [rsp+269*mmsize], m4
+    mova  [rsp+270*mmsize], m5
+    mova  [rsp+271*mmsize], m6
+
+    ; r257-260: t0-3
+    ; r265-272: t4/5a/6a/7/8/9a/10/11a
+    ; r261-264: t12a/13/14a/15
+    ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit
+
+    mova                m0, [%2+ 1*%3]              ; in1
+    mova                m1, [%2+15*%3]              ; in15
+    mova                m2, [%2+17*%3]              ; in17
+    mova                m3, [%2+31*%3]              ; in31
+    SUMSUB_MUL           0, 3, 4, 5, 16364,  804    ; m0=t31a, m3=t16a
+    SUMSUB_MUL           2, 1, 4, 5, 11003, 12140   ; m2=t30a, m1=t17a
+    SUMSUB_BA         d, 1, 3, 4                    ; m1=t16, m3=t17
+    SUMSUB_BA         d, 2, 0, 4                    ; m2=t31, m0=t30
+    SUMSUB_MUL           0, 3, 4, 5, 16069,  3196   ; m0=t30a, m3=t17a
+    SCRATCH              0, 8, rsp+275*mmsize
+    SCRATCH              2, 9, rsp+276*mmsize
+
+    ; end of stage 1-3 first quart
+
+    mova                m0, [%2+ 7*%3]              ; in7
+    mova                m2, [%2+ 9*%3]              ; in9
+    mova                m4, [%2+23*%3]              ; in23
+    mova                m5, [%2+25*%3]              ; in25
+    SUMSUB_MUL           2, 4, 6, 7, 14811,  7005   ; m2=t29a, m4=t18a
+    SUMSUB_MUL           5, 0, 6, 7,  5520, 15426   ; m5=t28a, m0=t19a
+    SUMSUB_BA         d, 4, 0, 6                    ; m4=t19, m0=t18
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=t28, m5=t29
+    SUMSUB_MUL           5, 0, 6, 7,  3196, m16069  ; m5=t29a, m0=t18a
+
+    ; end of stage 1-3 second quart
+
+    SUMSUB_BA         d, 4, 1, 6                    ; m4=t16a, m1=t19a
+    SUMSUB_BA         d, 0, 3, 6                    ; m0=t17, m3=t18
+    UNSCRATCH            6, 8, rsp+275*mmsize       ; t30a
+    UNSCRATCH            7, 9, rsp+276*mmsize       ; t31
+    mova  [rsp+273*mmsize], m4
+    mova  [rsp+274*mmsize], m0
+    SUMSUB_BA         d, 2, 7, 0                    ; m2=t31a, m7=t28a
+    SUMSUB_BA         d, 5, 6, 0                    ; m5=t30, m6=t29
+    SUMSUB_MUL           6, 3, 0, 4, 15137,  6270   ; m6=t29a, m3=t18a
+    SUMSUB_MUL           7, 1, 0, 4, 15137,  6270   ; m7=t28, m1=t19
+    SCRATCH              3, 10, rsp+277*mmsize
+    SCRATCH              1, 11, rsp+278*mmsize
+    SCRATCH              7, 12, rsp+279*mmsize
+    SCRATCH              6, 13, rsp+280*mmsize
+    SCRATCH              5, 14, rsp+281*mmsize
+    SCRATCH              2, 15, rsp+282*mmsize
+
+    ; end of stage 4-5 first half
+
+    mova                m0, [%2+ 5*%3]              ; in5
+    mova                m1, [%2+11*%3]              ; in11
+    mova                m2, [%2+21*%3]              ; in21
+    mova                m3, [%2+27*%3]              ; in27
+    SUMSUB_MUL           0, 3, 4, 5, 15893,  3981   ; m0=t27a, m3=t20a
+    SUMSUB_MUL           2, 1, 4, 5,  8423, 14053   ; m2=t26a, m1=t21a
+    SUMSUB_BA         d, 1, 3, 4                    ; m1=t20, m3=t21
+    SUMSUB_BA         d, 2, 0, 4                    ; m2=t27, m0=t26
+    SUMSUB_MUL           0, 3, 4, 5,  9102, 13623   ; m0=t26a, m3=t21a
+    SCRATCH              0, 8, rsp+275*mmsize
+    SCRATCH              2, 9, rsp+276*mmsize
+
+    ; end of stage 1-3 third quart
+
+    mova                m0, [%2+ 3*%3]              ; in3
+    mova                m2, [%2+13*%3]              ; in13
+    mova                m4, [%2+19*%3]              ; in19
+    mova                m5, [%2+29*%3]              ; in29
+    SUMSUB_MUL           2, 4, 6, 7, 13160,  9760   ; m2=t25a, m4=t22a
+    SUMSUB_MUL           5, 0, 6, 7,  2404, 16207   ; m5=t24a, m0=t23a
+    SUMSUB_BA         d, 4, 0, 6                    ; m4=t23, m0=t22
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=t24, m5=t25
+    SUMSUB_MUL           5, 0, 6, 7, 13623, m9102   ; m5=t25a, m0=t22a
+
+    ; end of stage 1-3 fourth quart
+
+    SUMSUB_BA         d, 1, 4, 6                    ; m1=t23a, m4=t20a
+    SUMSUB_BA         d, 3, 0, 6                    ; m3=t22, m0=t21
+    UNSCRATCH            6, 8, rsp+275*mmsize       ; t26a
+    UNSCRATCH            7, 9, rsp+276*mmsize       ; t27
+    SCRATCH              3, 8, rsp+275*mmsize
+    SCRATCH              1, 9, rsp+276*mmsize
+    SUMSUB_BA         d, 7, 2, 1                    ; m7=t24a, m2=t27a
+    SUMSUB_BA         d, 6, 5, 1                    ; m6=t25, m5=t26
+    SUMSUB_MUL           2, 4, 1, 3,  6270, m15137  ; m2=t27, m4=t20
+    SUMSUB_MUL           5, 0, 1, 3,  6270, m15137  ; m5=t26a, m0=t21a
+
+    ; end of stage 4-5 second half
+
+    UNSCRATCH            1, 12, rsp+279*mmsize      ; t28
+    UNSCRATCH            3, 13, rsp+280*mmsize      ; t29a
+    SCRATCH              4, 12, rsp+279*mmsize
+    SCRATCH              0, 13, rsp+280*mmsize
+    SUMSUB_BA         d, 5, 3, 0                    ; m5=t29, m3=t26
+    SUMSUB_BA         d, 2, 1, 0                    ; m2=t28a, m1=t27a
+    UNSCRATCH            0, 14, rsp+281*mmsize      ; t30
+    UNSCRATCH            4, 15, rsp+282*mmsize      ; t31a
+    SCRATCH              2, 14, rsp+281*mmsize
+    SCRATCH              5, 15, rsp+282*mmsize
+    SUMSUB_BA         d, 6, 0, 2                    ; m6=t30a, m0=t25a
+    SUMSUB_BA         d, 7, 4, 2                    ; m7=t31, m4=t24
+
+    mova                m2, [rsp+273*mmsize]        ; t16a
+    mova                m5, [rsp+274*mmsize]        ; t17
+    mova  [rsp+273*mmsize], m6
+    mova  [rsp+274*mmsize], m7
+    UNSCRATCH            6, 10, rsp+277*mmsize      ; t18a
+    UNSCRATCH            7, 11, rsp+278*mmsize      ; t19
+    SCRATCH              4, 10, rsp+277*mmsize
+    SCRATCH              0, 11, rsp+278*mmsize
+    UNSCRATCH            4, 12, rsp+279*mmsize      ; t20
+    UNSCRATCH            0, 13, rsp+280*mmsize      ; t21a
+    SCRATCH              3, 12, rsp+279*mmsize
+    SCRATCH              1, 13, rsp+280*mmsize
+    SUMSUB_BA         d, 0, 6, 1                    ; m0=t18, m6=t21
+    SUMSUB_BA         d, 4, 7, 1                    ; m4=t19a, m7=t20a
+    UNSCRATCH            3, 8, rsp+275*mmsize       ; t22
+    UNSCRATCH            1, 9, rsp+276*mmsize       ; t23a
+    SCRATCH              0, 8, rsp+275*mmsize
+    SCRATCH              4, 9, rsp+276*mmsize
+    SUMSUB_BA         d, 3, 5, 0                    ; m3=t17a, m5=t22a
+    SUMSUB_BA         d, 1, 2, 0                    ; m1=t16, m2=t23
+
+    ; end of stage 6
+
+    UNSCRATCH            0, 10, rsp+277*mmsize      ; t24
+    UNSCRATCH            4, 11, rsp+278*mmsize      ; t25a
+    SCRATCH              1, 10, rsp+277*mmsize
+    SCRATCH              3, 11, rsp+278*mmsize
+    SUMSUB_MUL           0, 2, 1, 3, 11585, 11585   ; m0=t24a, m2=t23a
+    SUMSUB_MUL           4, 5, 1, 3, 11585, 11585   ; m4=t25, m5=t22
+    UNSCRATCH            1, 12, rsp+279*mmsize      ; t26
+    UNSCRATCH            3, 13, rsp+280*mmsize      ; t27a
+    SCRATCH              0, 12, rsp+279*mmsize
+    SCRATCH              4, 13, rsp+280*mmsize
+    SUMSUB_MUL           3, 7, 0, 4, 11585, 11585   ; m3=t27, m7=t20
+    SUMSUB_MUL           1, 6, 0, 4, 11585, 11585   ; m1=t26a, m6=t21a
+
+    ; end of stage 7
+
+    mova                m0, [rsp+269*mmsize]        ; t8
+    mova                m4, [rsp+270*mmsize]        ; t9a
+    mova  [rsp+269*mmsize], m1                      ; t26a
+    mova  [rsp+270*mmsize], m3                      ; t27
+    mova                m3, [rsp+271*mmsize]        ; t10
+    SUMSUB_BA         d, 2, 0, 1                    ; m2=out8, m0=out23
+    SUMSUB_BA         d, 5, 4, 1                    ; m5=out9, m4=out22
+    SUMSUB_BA         d, 6, 3, 1                    ; m6=out10, m3=out21
+    mova                m1, [rsp+272*mmsize]        ; t11a
+    mova  [rsp+271*mmsize], m0
+    SUMSUB_BA         d, 7, 1, 0                    ; m7=out11, m1=out20
+
+%if %1 == 1
+    TRANSPOSE4x4D        2, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m2
+    mova  [ptrq+10*mmsize], m5
+    mova  [ptrq+18*mmsize], m6
+    mova  [ptrq+26*mmsize], m7
+%else ; %1 == 2
+    pxor                m0, m0
+    lea               dstq, [dstq+strideq*8]
+    ROUND_AND_STORE_4x4  2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+    mova                m2, [rsp+271*mmsize]
+%if %1 == 1
+    TRANSPOSE4x4D        1, 3, 4, 2, 0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+13*mmsize], m3
+    mova  [ptrq+21*mmsize], m4
+    mova  [ptrq+29*mmsize], m2
+%else ; %1 == 2
+    lea               dstq, [dstq+stride3q*4]
+    ROUND_AND_STORE_4x4  1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+    ; end of last stage + store for out8-11 and out20-23
+
+    UNSCRATCH            0, 9, rsp+276*mmsize       ; t19a
+    UNSCRATCH            1, 8, rsp+275*mmsize       ; t18
+    UNSCRATCH            2, 11, rsp+278*mmsize      ; t17a
+    UNSCRATCH            3, 10, rsp+277*mmsize      ; t16
+    mova                m7, [rsp+261*mmsize]        ; t12a
+    mova                m6, [rsp+262*mmsize]        ; t13
+    mova                m5, [rsp+263*mmsize]        ; t14a
+    SUMSUB_BA         d, 0, 7, 4                    ; m0=out12, m7=out19
+    SUMSUB_BA         d, 1, 6, 4                    ; m1=out13, m6=out18
+    SUMSUB_BA         d, 2, 5, 4                    ; m2=out14, m5=out17
+    mova                m4, [rsp+264*mmsize]        ; t15
+    SCRATCH              7, 8, rsp+275*mmsize
+    SUMSUB_BA         d, 3, 4, 7                    ; m3=out15, m4=out16
+
+%if %1 == 1
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 3*mmsize], m0
+    mova  [ptrq+11*mmsize], m1
+    mova  [ptrq+19*mmsize], m2
+    mova  [ptrq+27*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+    SWAP                 7, 9
+    lea               dstq, [dstbakq+stride3q*4]
+%else ; x86-32
+    pxor                m7, m7
+    mov               dstq, dstm
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+    UNSCRATCH            0, 8, rsp+275*mmsize       ; out19
+%if %1 == 1
+    TRANSPOSE4x4D        4, 5, 6, 0, 7
+    mova  [ptrq+ 4*mmsize], m4
+    mova  [ptrq+12*mmsize], m5
+    mova  [ptrq+20*mmsize], m6
+    mova  [ptrq+28*mmsize], m0
+%else ; %1 == 2
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+    ; end of last stage + store for out12-19
+
+%if ARCH_X86_64
+    SWAP                 7, 8
+%endif
+    mova                m7, [rsp+257*mmsize]        ; t0
+    mova                m6, [rsp+258*mmsize]        ; t1
+    mova                m5, [rsp+259*mmsize]        ; t2
+    mova                m4, [rsp+260*mmsize]        ; t3
+    mova                m0, [rsp+274*mmsize]        ; t31
+    mova                m1, [rsp+273*mmsize]        ; t30a
+    UNSCRATCH            2, 15, rsp+282*mmsize      ; t29
+    SUMSUB_BA         d, 0, 7, 3                    ; m0=out0, m7=out31
+    SUMSUB_BA         d, 1, 6, 3                    ; m1=out1, m6=out30
+    SUMSUB_BA         d, 2, 5, 3                    ; m2=out2, m5=out29
+    SCRATCH              0, 9, rsp+276*mmsize
+    UNSCRATCH            3, 14, rsp+281*mmsize      ; t28a
+    SUMSUB_BA         d, 3, 4, 0                    ; m3=out3, m4=out28
+
+%if %1 == 1
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 7*mmsize], m4
+    mova  [ptrq+15*mmsize], m5
+    mova  [ptrq+23*mmsize], m6
+    mova  [ptrq+31*mmsize], m7
+%else ; %1 == 2
+%if ARCH_X86_64
+    SWAP                 0, 8
+%else ; x86-32
+    pxor                m0, m0
+%endif
+    lea               dstq, [dstq+stride3q*4]
+    ROUND_AND_STORE_4x4  4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+    UNSCRATCH            7, 9, rsp+276*mmsize       ; out0
+%if %1 == 1
+    TRANSPOSE4x4D        7, 1, 2, 3, 0
+    mova  [ptrq+ 0*mmsize], m7
+    mova  [ptrq+ 8*mmsize], m1
+    mova  [ptrq+16*mmsize], m2
+    mova  [ptrq+24*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else ; x86-32
+    mov               dstq, dstm
+%endif
+    ROUND_AND_STORE_4x4  7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+%endif
+%endif
+
+    ; end of last stage + store for out0-3 and out28-31
+
+%if ARCH_X86_64
+    SWAP                 0, 8
+%endif
+    mova                m7, [rsp+265*mmsize]        ; t4
+    mova                m6, [rsp+266*mmsize]        ; t5a
+    mova                m5, [rsp+267*mmsize]        ; t6a
+    mova                m4, [rsp+268*mmsize]        ; t7
+    mova                m0, [rsp+270*mmsize]        ; t27
+    mova                m1, [rsp+269*mmsize]        ; t26a
+    UNSCRATCH            2, 13, rsp+280*mmsize      ; t25
+    SUMSUB_BA         d, 0, 7, 3                    ; m0=out4, m7=out27
+    SUMSUB_BA         d, 1, 6, 3                    ; m1=out5, m6=out26
+    SUMSUB_BA         d, 2, 5, 3                    ; m2=out6, m5=out25
+    UNSCRATCH            3, 12, rsp+279*mmsize      ; t24a
+    SCRATCH              7, 9, rsp+276*mmsize
+    SUMSUB_BA         d, 3, 4, 7                    ; m3=out7, m4=out24
+
+%if %1 == 1
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 9*mmsize], m1
+    mova  [ptrq+17*mmsize], m2
+    mova  [ptrq+25*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+    SWAP                 7, 8
+    lea               dstq, [dstbakq+strideq*4]
+%else ; x86-32
+    pxor                m7, m7
+    lea               dstq, [dstq+strideq*4]
+%endif
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+    UNSCRATCH            0, 9, rsp+276*mmsize       ; out27
+%if %1 == 1
+    TRANSPOSE4x4D        4, 5, 6, 0, 7
+    mova  [ptrq+ 6*mmsize], m4
+    mova  [ptrq+14*mmsize], m5
+    mova  [ptrq+22*mmsize], m6
+    mova  [ptrq+30*mmsize], m0
+%else ; %1 == 2
+%if ARCH_X86_64
+    lea               dstq, [dstbakq+stride3q*8]
+%else
+    mov               dstq, dstm
+    lea               dstq, [dstq+stride3q*8]
+%endif
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+    ; end of last stage + store for out4-7 and out24-27
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_1023]
+    cmp               eobd, 1
+    jg .idctfull
+
+    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
+    ; fits in 32bit
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m2, m2
+    DC_ONLY              6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 32
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+    add               dstq, strideq
+    dec               cntd
+    jg .loop_dc
+    RET
+
+.idctfull:
+    mova  [rsp+256*mmsize], m0
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [default_32x32]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [default_32x32+cntq-1]
+%endif
+    mov              skipd, 8
+    sub              skipd, cntd
+    mov               ptrq, rsp
+.loop_1:
+    IDCT32_1D            1, blockq
+
+    add               ptrq, 32 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    shl              skipd, 2
+    lea             blockq, [blockq+skipq*(mmsize/4)]
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    mova   [ptrq+mmsize*4], m0
+    mova   [ptrq+mmsize*5], m0
+    mova   [ptrq+mmsize*6], m0
+    mova   [ptrq+mmsize*7], m0
+    add               ptrq, 8 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 8
+    mov               ptrq, rsp
+.loop_2:
+    IDCT32_1D            2, ptrq
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
+    RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_4095]
+    cmp               eobd, 1
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
+
+    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
+    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+    DEFINE_ARGS dst, stride, block, coef, coefl
+    pxor                m2, m2
+    DC_ONLY_64BIT        6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 32
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+    add               dstq, strideq
+    dec               cntd
+    jg .loop_dc
+    RET
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
new file mode 100644
index 0000000..d2f2257
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -0,0 +1,142 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro VP9_IWHT4_1D 0
+    SWAP                 1, 2, 3
+    paddw               m0, m2
+    psubw               m3, m1
+    psubw               m4, m0, m3
+    psraw               m4, 1
+    psubw               m5, m4, m1
+    SWAP                 5, 1
+    psubw               m4, m2
+    SWAP                 4, 2
+    psubw               m0, m1
+    paddw               m3, m2
+    SWAP                 3, 2, 1
+%endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+    pmaddwd            m%1, m%2, %4
+    pmaddwd            m%2,  %5
+    paddd              m%1,  %3
+    paddd              m%2,  %3
+    psrad              m%1,  14
+    psrad              m%2,  14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+    VP9_MULSUB_2W_2X    %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3]
+    VP9_MULSUB_2W_2X    %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3]
+    packssdw           m%1, m%7
+    packssdw           m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+    punpckhwd          m%6, m%2, m%1
+    punpcklwd          m%2, m%1
+    VP9_MULSUB_2W_4X   %1, %2, %3, %4, %5, %6, %7
+%else
+    punpckhwd          m%8, m%4, m%3
+    punpcklwd          m%2, m%4, m%3
+    VP9_MULSUB_2W_4X   %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0, 3, 2                            ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+    SUMSUB_BA            w, 2, 0, 4                         ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+    pmulhrsw            m2, m6                              ; m2=t0
+    pmulhrsw            m0, m6                              ; m0=t1
+%else ; <= sse2
+    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5    ; m0=t1, m1=t0
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5     ; m1=t2, m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IADST4_1D 0
+    movq2dq           xmm0, m0
+    movq2dq           xmm1, m1
+    movq2dq           xmm2, m2
+    movq2dq           xmm3, m3
+%if cpuflag(ssse3)
+    paddw               m3, m0
+%endif
+    punpcklwd         xmm0, xmm1
+    punpcklwd         xmm2, xmm3
+    pmaddwd           xmm1, xmm0, [pw_5283_13377]
+    pmaddwd           xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+    pmaddwd           xmm6, xmm0, [pw_13377_0]
+%endif
+    pmaddwd           xmm0, [pw_15212_m13377]
+    pmaddwd           xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+    pmaddwd           xmm7, xmm2, [pw_m13377_13377]
+%endif
+    pmaddwd           xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+    psubw               m3, m2
+%else
+    paddd             xmm6, xmm7
+%endif
+    paddd             xmm0, xmm2
+    paddd             xmm3, xmm5
+    paddd             xmm2, xmm5
+%if notcpuflag(ssse3)
+    paddd             xmm6, xmm5
+%endif
+    paddd             xmm1, xmm3
+    paddd             xmm0, xmm3
+    paddd             xmm4, xmm2
+    psrad             xmm1, 14
+    psrad             xmm0, 14
+    psrad             xmm4, 14
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_13377x2]        ; out2
+%else
+    psrad             xmm6, 14
+%endif
+    packssdw          xmm0, xmm0
+    packssdw          xmm1, xmm1
+    packssdw          xmm4, xmm4
+%if notcpuflag(ssse3)
+    packssdw          xmm6, xmm6
+%endif
+    movdq2q             m0, xmm0                ; out3
+    movdq2q             m1, xmm1                ; out0
+    movdq2q             m2, xmm4                ; out1
+%if notcpuflag(ssse3)
+    movdq2q             m3, xmm6                ; out2
+%endif
+    SWAP                 0, 1, 2, 3
+%endmacro
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 54f20fe..4e7ede22 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -4,24 +4,23 @@
 ;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
 ;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
@@ -37,8 +36,8 @@ pb_f8:  times 16 db 0xf8
 pb_fe:  times 16 db 0xfe
 pb_ff:  times 16 db 0xff
 
-pw_4:   times  8 dw 4
-pw_8:   times  8 dw 8
+cextern pw_4
+cextern pw_8
 
 ; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
 ; the following mask is used to splat both in the same register
@@ -53,7 +52,7 @@ mask_mix48: times 8 db 0x00
 SECTION .text
 
 %macro SCRATCH 3
-%if ARCH_X86_64
+%ifdef m8
     SWAP                %1, %2
 %else
     mova              [%3], m%1
@@ -61,7 +60,7 @@ SECTION .text
 %endmacro
 
 %macro UNSCRATCH 3
-%if ARCH_X86_64
+%ifdef m8
     SWAP                %1, %2
 %else
     mova               m%1, [%3]
@@ -70,7 +69,7 @@ SECTION .text
 
 ; %1 = abs(%2-%3)
 %macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
-%if ARCH_X86_64
+%ifdef m8
     psubusb             %1, %3, %2
     psubusb             %4, %2, %3
 %else
@@ -103,7 +102,7 @@ SECTION .text
 %endmacro
 
 %macro UNPACK 4
-%if ARCH_X86_64
+%ifdef m8
     punpck%1bw          %2, %3, %4
 %else
     mova                %2, %3
@@ -113,27 +112,27 @@ SECTION .text
 
 %macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
                              ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
-    psubw               %3, [rsp+%4+%5*32]
-    psubw               %3, [rsp+%4+%6*32]
-    paddw               %3, [rsp+%4+%7*32]
+    psubw               %3, [rsp+%4+%5*mmsize*2]
+    psubw               %3, [rsp+%4+%6*mmsize*2]
+    paddw               %3, [rsp+%4+%7*mmsize*2]
 %ifnidn %10, ""
 %if %11 == 0
     punpck%2bw          %1, %10, m0
 %else
     UNPACK          %2, %1, %10, m0
 %endif
-    mova    [rsp+%4+%8*32], %1
+    mova [rsp+%4+%8*mmsize*2], %1
     paddw               %3, %1
 %else
-    paddw               %3, [rsp+%4+%8*32]
+    paddw               %3, [rsp+%4+%8*mmsize*2]
 %endif
     psraw               %1, %3, %9
 %endmacro
 
 ; FIXME interleave l/h better (for instruction pairing)
 %macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
-    FILTER%7_INIT       %1, l, %3, %6 +  0
-    FILTER%7_INIT       %2, h, %4, %6 + 16
+    FILTER%7_INIT       %1, l, %3, %6 +      0
+    FILTER%7_INIT       %2, h, %4, %6 + mmsize
     packuswb            %1, %2
     MASK_APPLY          %1, %9, %8, %2
     mova                %5, %1
@@ -148,8 +147,8 @@ SECTION .text
     mova               %14, %15
 %endif
 %endif
-    FILTER_SUBx2_ADDx2  %1, l, %3, %6 +  0, %7, %8, %9, %10, %11, %14, %16
-    FILTER_SUBx2_ADDx2  %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16
+    FILTER_SUBx2_ADDx2  %1, l, %3, %6 +      0, %7, %8, %9, %10, %11, %14, %16
+    FILTER_SUBx2_ADDx2  %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16
     packuswb            %1, %2
 %ifnidn %13, ""
     MASK_APPLY          %1, %13, %12, %2
@@ -196,21 +195,21 @@ SECTION .text
 
 %macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
     UNPACK          %2, %1, rp3, m0                     ; p3: B->W
-    mova     [rsp+%4+0*32], %1
+    mova [rsp+%4+0*mmsize*2], %1
     paddw               %3, %1, %1                      ; p3*2
     paddw               %3, %1                          ; p3*3
     punpck%2bw          %1, m1,  m0                     ; p2: B->W
-    mova     [rsp+%4+1*32], %1
+    mova [rsp+%4+1*mmsize*2], %1
     paddw               %3, %1                          ; p3*3 + p2
     paddw               %3, %1                          ; p3*3 + p2*2
     UNPACK          %2, %1, rp1, m0                     ; p1: B->W
-    mova     [rsp+%4+2*32], %1
+    mova [rsp+%4+2*mmsize*2], %1
     paddw               %3, %1                          ; p3*3 + p2*2 + p1
     UNPACK          %2, %1, rp0, m0                     ; p0: B->W
-    mova     [rsp+%4+3*32], %1
+    mova [rsp+%4+3*mmsize*2], %1
     paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0
     UNPACK          %2, %1, rq0, m0                     ; q0: B->W
-    mova     [rsp+%4+4*32], %1
+    mova [rsp+%4+4*mmsize*2], %1
     paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0 + q0
     paddw               %3, [pw_4]                      ; p3*3 + p2*2 + p1 + p0 + q0 + 4
     psraw               %1, %3, 3                       ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
@@ -218,24 +217,24 @@ SECTION .text
 
 %macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
     punpck%2bw          %1, m2, m0                      ; p7: B->W
-    mova    [rsp+%4+ 8*32], %1
+    mova [rsp+%4+ 8*mmsize*2], %1
     psllw               %3, %1, 3                       ; p7*8
     psubw               %3, %1                          ; p7*7
     punpck%2bw          %1, m3, m0                      ; p6: B->W
-    mova    [rsp+%4+ 9*32], %1
+    mova [rsp+%4+ 9*mmsize*2], %1
     paddw               %3, %1                          ; p7*7 + p6
     paddw               %3, %1                          ; p7*7 + p6*2
     UNPACK          %2, %1, rp5, m0                     ; p5: B->W
-    mova    [rsp+%4+10*32], %1
+    mova [rsp+%4+10*mmsize*2], %1
     paddw               %3, %1                          ; p7*7 + p6*2 + p5
     UNPACK          %2, %1, rp4, m0                     ; p4: B->W
-    mova    [rsp+%4+11*32], %1
+    mova [rsp+%4+11*mmsize*2], %1
     paddw               %3, %1                          ; p7*7 + p6*2 + p5 + p4
-    paddw               %3, [rsp+%4+ 0*32]              ; p7*7 + p6*2 + p5 + p4 + p3
-    paddw               %3, [rsp+%4+ 1*32]              ; p7*7 + p6*2 + p5 + .. + p2
-    paddw               %3, [rsp+%4+ 2*32]              ; p7*7 + p6*2 + p5 + .. + p1
-    paddw               %3, [rsp+%4+ 3*32]              ; p7*7 + p6*2 + p5 + .. + p0
-    paddw               %3, [rsp+%4+ 4*32]              ; p7*7 + p6*2 + p5 + .. + p0 + q0
+    paddw               %3, [rsp+%4+ 0*mmsize*2]        ; p7*7 + p6*2 + p5 + p4 + p3
+    paddw               %3, [rsp+%4+ 1*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p2
+    paddw               %3, [rsp+%4+ 2*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p1
+    paddw               %3, [rsp+%4+ 3*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p0
+    paddw               %3, [rsp+%4+ 4*mmsize*2]        ; p7*7 + p6*2 + p5 + .. + p0 + q0
     paddw               %3, [pw_8]                      ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
     psraw               %1, %3, 4                       ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
 %endmacro
@@ -335,22 +334,24 @@ SECTION .text
 %endmacro
 
 %macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
-%define P3 rsp +   0 + %1
-%define P2 rsp +  16 + %1
-%define P1 rsp +  32 + %1
-%define P0 rsp +  48 + %1
-%define Q0 rsp +  64 + %1
-%define Q1 rsp +  80 + %1
-%define Q2 rsp +  96 + %1
-%define Q3 rsp + 112 + %1
-%define P7 rsp + 128 + %1
-%define P6 rsp + 144 + %1
-%define P5 rsp + 160 + %1
-%define P4 rsp + 176 + %1
-%define Q4 rsp + 192 + %1
-%define Q5 rsp + 208 + %1
-%define Q6 rsp + 224 + %1
-%define Q7 rsp + 240 + %1
+%define P3 rsp +  0*mmsize + %1
+%define P2 rsp +  1*mmsize + %1
+%define P1 rsp +  2*mmsize + %1
+%define P0 rsp +  3*mmsize + %1
+%define Q0 rsp +  4*mmsize + %1
+%define Q1 rsp +  5*mmsize + %1
+%define Q2 rsp +  6*mmsize + %1
+%define Q3 rsp +  7*mmsize + %1
+%if mmsize == 16
+%define P7 rsp +  8*mmsize + %1
+%define P6 rsp +  9*mmsize + %1
+%define P5 rsp + 10*mmsize + %1
+%define P4 rsp + 11*mmsize + %1
+%define Q4 rsp + 12*mmsize + %1
+%define Q5 rsp + 13*mmsize + %1
+%define Q6 rsp + 14*mmsize + %1
+%define Q7 rsp + 15*mmsize + %1
+%endif
 %endmacro
 
 ; ..............AB -> AAAAAAAABBBBBBBB
@@ -364,14 +365,19 @@ SECTION .text
 %endif
 %endmacro
 
-%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only
+%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only
+%assign %%ext 0
+%if ARCH_X86_32 || mmsize == 8
+%assign %%ext %5
+%endif
+
 %if UNIX64
-cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
 %else
 %if WIN64
-cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3
 %else
-cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3
+cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3
 %define Ed dword r2m
 %define Id dword r3m
 %endif
@@ -385,18 +391,22 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     lea              mstride3q, [mstrideq*3]
 
 %ifidn %1, h
-%if %2 > 16
+%if %2 != 16
+%if mmsize == 16
 %define movx movh
+%else
+%define movx mova
+%endif
     lea                   dstq, [dstq + 4*strideq - 4]
 %else
 %define movx movu
     lea                   dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
 %endif
-    lea                  dst2q, [dstq + 8*strideq]
 %else
     lea                   dstq, [dstq + 4*mstrideq]
-    lea                  dst2q, [dstq + 8*strideq]
 %endif
+    ; FIXME we shouldn't need two dts registers if mmsize == 8
+    lea                  dst2q, [dstq + 8*strideq]
 
     DEFINE_REAL_P7_TO_Q7
 
@@ -407,11 +417,11 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     movx                    m3, [P4]
     movx                    m4, [P3]
     movx                    m5, [P2]
-%if ARCH_X86_64 || %2 != 16
+%if (ARCH_X86_64 && mmsize == 16) || %2 > 16
     movx                    m6, [P1]
 %endif
     movx                    m7, [P0]
-%if ARCH_X86_64
+%ifdef m8
     movx                    m8, [Q0]
     movx                    m9, [Q1]
     movx                   m10, [Q2]
@@ -503,7 +513,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     movhps        [Q5], m6
     movhps        [Q7], m7
     DEFINE_TRANSPOSED_P7_TO_Q7
-%else ; %2 == 44/48/84/88
+%elif %2 > 16 ; %2 == 44/48/84/88
     punpcklbw        m0, m1
     punpcklbw        m2, m3
     punpcklbw        m4, m5
@@ -530,12 +540,31 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     mova           [Q1],  m5
     mova           [Q2],  m7
     mova           [Q3],  m3
+%else ; %2 == 4 || %2 == 8
+    SBUTTERFLY       bw, 0, 1, 6
+    SBUTTERFLY       bw, 2, 3, 6
+    SBUTTERFLY       bw, 4, 5, 6
+    mova [rsp+4*mmsize], m5
+    mova             m6, [P1]
+    SBUTTERFLY       bw, 6, 7, 5
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    TRANSPOSE4x4W     0, 2, 4, 6, 5
+    mova           [P3], m0
+    mova           [P2], m2
+    mova           [P1], m4
+    mova           [P0], m6
+    mova             m5, [rsp+4*mmsize]
+    TRANSPOSE4x4W     1, 3, 5, 7, 0
+    mova           [Q0], m1
+    mova           [Q1], m3
+    mova           [Q2], m5
+    mova           [Q3], m7
 %endif ; %2
 %endif ; x86-32/64
 %endif ; %1 == h
 
     ; calc fm mask
-%if %2 == 16
+%if %2 == 16 || mmsize == 8
 %if cpuflag(ssse3)
     pxor                m0, m0
 %endif
@@ -553,7 +582,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     mova                m0, [pb_80]
     pxor                m2, m0
     pxor                m3, m0
-%if ARCH_X86_64
+%ifdef m8
 %ifidn %1, v
     mova                m8, [P3]
     mova                m9, [P2]
@@ -614,10 +643,10 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
 
     ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
     ; calc flat8in (if not 44_16) and hev masks
-%if %2 != 44
+%if %2 != 44 && %2 != 4
     mova                m6, [pb_81]                     ; [1 1 1 1 ...] ^ 0x80
     ABSSUB_GT           m2, rp3, rp0, m6, m5            ; abs(p3 - p0) <= 1
-%if ARCH_X86_64
+%ifdef m8
     mova                m8, [pb_80]
 %define rb80 m8
 %else
@@ -626,7 +655,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     ABSSUB_GT           m1, rp2, rp0, m6, m5, rb80      ; abs(p2 - p0) <= 1
     por                 m2, m1
     ABSSUB              m4, rp1, rp0, m5                ; abs(p1 - p0)
-%if %2 == 16
+%if %2 <= 16
 %if cpuflag(ssse3)
     pxor                m0, m0
 %endif
@@ -656,8 +685,15 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
 %endif
 %else
     mova                m6, [pb_80]
+%if %2 == 44
     movd                m7, Hd
     SPLATB_MIX          m7
+%else
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m7, H, m0                       ; H H H H ...
+%endif
     pxor                m7, m6
     ABSSUB              m4, rp1, rp0, m1                ; abs(p1 - p0)
     pxor                m4, m6
@@ -671,7 +707,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
 %if %2 == 16
     ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
     ; calc flat8out mask
-%if ARCH_X86_64
+%ifdef m8
     mova                m8, [P7]
     mova                m9, [P6]
 %define rp7 m8
@@ -683,7 +719,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     ABSSUB_GT           m1, rp7, rp0, m6, m5            ; abs(p7 - p0) <= 1
     ABSSUB_GT           m7, rp6, rp0, m6, m5            ; abs(p6 - p0) <= 1
     por                 m1, m7
-%if ARCH_X86_64
+%ifdef m8
     mova                m8, [P5]
     mova                m9, [P4]
 %define rp5 m8
@@ -696,7 +732,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     por                 m1, m7
     ABSSUB_GT           m7, rp4, rp0, m6, m5            ; abs(p4 - p0) <= 1
     por                 m1, m7
-%if ARCH_X86_64
+%ifdef m8
     mova                m14, [Q4]
     mova                m15, [Q5]
 %define rq4 m14
@@ -709,7 +745,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     por                 m1, m7
     ABSSUB_GT           m7, rq5, rq0, m6, m5            ; abs(q5 - q0) <= 1
     por                 m1, m7
-%if ARCH_X86_64
+%ifdef m8
     mova                m14, [Q6]
     mova                m15, [Q7]
 %define rq6 m14
@@ -739,7 +775,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
 
     ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
     ; filter2()
-%if %2 != 44
+%if %2 != 44 && %2 != 4
     mova                m6, [pb_80]                     ; already in m6 if 44_16
     SCRATCH              2, 15, rsp+%3+%4
 %if %2 == 16
@@ -757,7 +793,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     paddsb              m4, m2                          ; 3*(q0 - p0) + (p1 - q1)
     paddsb              m6, m4, [pb_4]                  ; m6: f1 = clip(f + 4, 127)
     paddsb              m4, [pb_3]                      ; m4: f2 = clip(f + 3, 127)
-%if ARCH_X86_64
+%ifdef m8
     mova                m14, [pb_10]                    ; will be reused in filter4()
 %define rb10 m14
 %else
@@ -766,8 +802,8 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     SRSHIFT3B_2X        m6, m4, rb10, m7                ; f1 and f2 sign byte shift by 3
     SIGN_SUB            m7, rq0, m6, m5                 ; m7 = q0 - f1
     SIGN_ADD            m1, rp0, m4, m5                 ; m1 = p0 + f2
-%if %2 != 44
-%if ARCH_X86_64
+%if %2 != 44 && %2 != 4
+%ifdef m8
     pandn               m6, m15, m3                     ;  ~mask(in) & mask(fm)
 %else
     mova                m6, [rsp+%3+%4]
@@ -788,8 +824,8 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     paddsb              m6, m2, [pb_4]                  ; m6:  f1 = clip(f + 4, 127)
     paddsb              m2, [pb_3]                      ; m2: f2 = clip(f + 3, 127)
     SRSHIFT3B_2X        m6, m2, rb10, m4                ; f1 and f2 sign byte shift by 3
-%if %2 != 44
-%if ARCH_X86_64
+%if %2 != 44 && %2 != 4
+%ifdef m8
     pandn               m5, m15, m3                     ;               ~mask(in) & mask(fm)
 %else
     mova                m5, [rsp+%3+%4]
@@ -816,26 +852,26 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     mova                [P1], m1
     mova                [Q1], m4
 
-%if %2 != 44
+%if %2 != 44 && %2 != 4
     UNSCRATCH            2, 15, rsp+%3+%4
 %endif
 
     ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
     ; filter6()
-%if %2 != 44
+%if %2 != 44 && %2 != 4
     pxor                m0, m0
-%if %2 > 16
+%if %2 != 16
     pand                m3, m2
 %else
     pand                m2, m3                          ;               mask(fm) & mask(in)
-%if ARCH_X86_64
+%ifdef m8
     pandn               m3, m8, m2                      ; ~mask(out) & (mask(fm) & mask(in))
 %else
     mova                m3, [rsp+%3+%4+16]
     pandn               m3, m2
 %endif
 %endif
-%if ARCH_X86_64
+%ifdef m8
     mova               m14, [P3]
     mova                m9, [Q3]
 %define rp3 m14
@@ -883,7 +919,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     pand            m1, m2                                                              ; mask(out) & (mask(fm) & mask(in))
     mova            m2, [P7]
     mova            m3, [P6]
-%if ARCH_X86_64
+%ifdef m8
     mova            m8, [P5]
     mova            m9, [P4]
 %define rp5 m8
@@ -1009,7 +1045,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     movhps [Q5],  m6
     movhps [Q7],  m7
 %endif
-%elif %2 == 44
+%elif %2 == 44 || %2 == 4
     SWAP 0, 1   ; m0 = p1
     SWAP 1, 7   ; m1 = p0
     SWAP 2, 5   ; m2 = q0
@@ -1019,6 +1055,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     SBUTTERFLY  bw, 2, 3, 4
     SBUTTERFLY  wd, 0, 2, 4
     SBUTTERFLY  wd, 1, 3, 4
+%if mmsize == 16
     movd  [P7], m0
     movd  [P3], m2
     movd  [Q0], m1
@@ -1048,6 +1085,20 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     movd  [Q3], m1
     movd  [Q7], m3
 %else
+    movd  [P7], m0
+    movd  [P5], m2
+    movd  [P3], m1
+    movd  [P1], m3
+    psrlq   m0, 32
+    psrlq   m2, 32
+    psrlq   m1, 32
+    psrlq   m3, 32
+    movd  [P6], m0
+    movd  [P4], m2
+    movd  [P2], m1
+    movd  [P0], m3
+%endif
+%else
     ; the following code do a transpose of 8 full lines to 16 half
     ; lines (high part). It is inlined to avoid the need of a staging area
     mova                    m0, [P3]
@@ -1056,12 +1107,12 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     mova                    m3, [P0]
     mova                    m4, [Q0]
     mova                    m5, [Q1]
-%if ARCH_X86_64
+%ifdef m8
     mova                    m6, [Q2]
 %endif
     mova                    m7, [Q3]
     DEFINE_REAL_P7_TO_Q7
-%if ARCH_X86_64
+%ifdef m8
     SBUTTERFLY  bw,  0,  1, 8
     SBUTTERFLY  bw,  2,  3, 8
     SBUTTERFLY  bw,  4,  5, 8
@@ -1076,27 +1127,32 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     SBUTTERFLY  dq,  3,  7, 8
 %else
     SBUTTERFLY  bw,  0,  1, 6
-    mova  [rsp+64], m1
-    mova        m6, [rsp+96]
+    mova [rsp+mmsize*4], m1
+    mova        m6, [rsp+mmsize*6]
     SBUTTERFLY  bw,  2,  3, 1
     SBUTTERFLY  bw,  4,  5, 1
     SBUTTERFLY  bw,  6,  7, 1
     SBUTTERFLY  wd,  0,  2, 1
-    mova  [rsp+96], m2
-    mova        m1, [rsp+64]
+    mova [rsp+mmsize*6], m2
+    mova        m1, [rsp+mmsize*4]
     SBUTTERFLY  wd,  1,  3, 2
     SBUTTERFLY  wd,  4,  6, 2
     SBUTTERFLY  wd,  5,  7, 2
     SBUTTERFLY  dq,  0,  4, 2
     SBUTTERFLY  dq,  1,  5, 2
+%if mmsize == 16
     movh      [Q0], m1
     movhps    [Q1], m1
-    mova        m2, [rsp+96]
+%else
+    mova      [P3], m1
+%endif
+    mova        m2, [rsp+mmsize*6]
     SBUTTERFLY  dq,  2,  6, 1
     SBUTTERFLY  dq,  3,  7, 1
 %endif
     SWAP         3, 6
     SWAP         1, 4
+%if mmsize == 16
     movh      [P7], m0
     movhps    [P6], m0
     movh      [P5], m1
@@ -1105,7 +1161,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     movhps    [P2], m2
     movh      [P1], m3
     movhps    [P0], m3
-%if ARCH_X86_64
+%ifdef m8
     movh      [Q0], m4
     movhps    [Q1], m4
 %endif
@@ -1115,6 +1171,15 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     movhps    [Q5], m6
     movh      [Q6], m7
     movhps    [Q7], m7
+%else
+    mova      [P7], m0
+    mova      [P6], m1
+    mova      [P5], m2
+    mova      [P4], m3
+    mova      [P2], m5
+    mova      [P1], m6
+    mova      [P0], m7
+%endif
 %endif
 %endif
 
@@ -1138,3 +1203,9 @@ LPF_16_VH_ALL_OPTS 44,   0, 128,  0
 LPF_16_VH_ALL_OPTS 48, 256, 128, 16
 LPF_16_VH_ALL_OPTS 84, 256, 128, 16
 LPF_16_VH_ALL_OPTS 88, 256, 128, 16
+
+INIT_MMX mmxext
+LOOPFILTER v, 4,   0,  0, 0
+LOOPFILTER h, 4,   0, 64, 0
+LOOPFILTER v, 8, 128,  0, 8
+LOOPFILTER h, 8, 128, 64, 8
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
new file mode 100644
index 0000000..c088817
--- /dev/null
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_511: times 16 dw 511
+pw_2047: times 16 dw 2047
+pw_16384: times 16 dw 16384
+pw_m512: times 16 dw -512
+pw_m2048: times 16 dw -2048
+
+cextern pw_1
+cextern pw_3
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_256
+cextern pw_1023
+cextern pw_4095
+cextern pw_m1
+
+SECTION .text
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova               m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; calculate p or q portion of flat8out
+%macro FLAT8OUT_HALF 0
+    psubw               m4, m0                      ; q4-q0
+    psubw               m5, m0                      ; q5-q0
+    psubw               m6, m0                      ; q6-q0
+    psubw               m7, m0                      ; q7-q0
+    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
+    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
+    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
+    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
+    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
+    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
+    por                 m5, m4
+    por                 m7, m6
+    por                 m7, m5                      ; !flat8out, q portion
+%endmacro
+
+; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
+%macro FLAT8IN_HALF 1
+%if %1 > 4
+    psubw               m4, m3, m0                  ; q3-q0
+    psubw               m5, m2, m0                  ; q2-q0
+    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
+    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
+    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
+%endif
+    psubw               m3, m2                      ; q3-q2
+    psubw               m2, m1                      ; q2-q1
+    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
+    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
+    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
+%if %1 > 4
+    por                 m4, m5
+%endif
+    por                 m2, m3
+    psubw               m3, m1, m0                  ; q1-q0
+    ABS1                m3, m5                      ; abs(q1-q0)
+%if %1 > 4
+    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
+%endif
+    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
+    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
+%if %1 > 4
+    por                 m4, m6
+%endif
+    por                 m2, m3
+%endmacro
+
+; one step in filter_14/filter_6
+;
+; take sum $reg, downshift, apply mask and write into dst
+;
+; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
+; step's sum $reg. This is omitted for the last row in each filter.
+;
+; if dont_store is set, don't write the result into memory, instead keep the
+; values in register so we can write it out later
+%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
+                                      ; src/sub1, sub2, add1, add2, dont_store
+    psrlw               %1, %2, %4
+    psubw               %1, %6                      ; abs->delta
+%ifnidn %7, ""
+    psubw               %2, %6
+    psubw               %2, %7
+    paddw               %2, %8
+    paddw               %2, %9
+%endif
+    pand                %1, reg_%3                  ; apply mask
+%if %10 == 1
+    paddw               %6, %1                      ; delta->abs
+%else
+    paddw               %1, %6                      ; delta->abs
+    mova              [%5], %1
+%endif
+%endmacro
+
+; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
+
+%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
+
+%if ARCH_X86_64
+%if %2 == 16
+%assign %%num_xmm_regs 16
+%elif %2 == 8
+%assign %%num_xmm_regs 15
+%else ; %2 == 4
+%assign %%num_xmm_regs 14
+%endif ; %2
+%assign %%bak_mem 0
+%else ; ARCH_X86_32
+%assign %%num_xmm_regs 8
+%if %2 == 16
+%assign %%bak_mem 7
+%elif %2 == 8
+%assign %%bak_mem 6
+%else ; %2 == 4
+%assign %%bak_mem 5
+%endif ; %2
+%endif ; ARCH_X86_64/32
+
+%if %2 == 16
+%ifidn %1, v
+%assign %%num_gpr_regs 6
+%else ; %1 == h
+%assign %%num_gpr_regs 5
+%endif ; %1
+%assign %%wd_mem 6
+%else ; %2 == 8/4
+%assign %%num_gpr_regs 5
+%if ARCH_X86_32 && %2 == 8
+%assign %%wd_mem 2
+%else ; ARCH_X86_64 || %2 == 4
+%assign %%wd_mem 0
+%endif ; ARCH_X86_64/32 etc.
+%endif ; %2
+
+%ifidn %1, v
+%assign %%tsp_mem 0
+%elif %2 == 16 ; && %1 == h
+%assign %%tsp_mem 16
+%else ; %1 == h && %1 == 8/4
+%assign %%tsp_mem 8
+%endif ; %1/%2
+
+%assign %%off %%wd_mem
+%assign %%tspoff %%bak_mem+%%wd_mem
+%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
+
+%if %3 == 10
+%define %%maxsgn 511
+%define %%minsgn m512
+%define %%maxusgn 1023
+%define %%maxf 4
+%else ; %3 == 12
+%define %%maxsgn 2047
+%define %%minsgn m2048
+%define %%maxusgn 4095
+%define %%maxf 16
+%endif ; %3
+
+cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
+    ; prepare E, I and H masks
+    shl                 Ed, %3-8
+    shl                 Id, %3-8
+    shl                 Hd, %3-8
+%if cpuflag(ssse3)
+    mova                m0, [pw_256]
+%endif
+    movd                m1, Ed
+    movd                m2, Id
+    movd                m3, Hd
+%if cpuflag(ssse3)
+    pshufb              m1, m0                      ; E << (bit_depth - 8)
+    pshufb              m2, m0                      ; I << (bit_depth - 8)
+    pshufb              m3, m0                      ; H << (bit_depth - 8)
+%else
+    punpcklwd           m1, m1
+    punpcklwd           m2, m2
+    punpcklwd           m3, m3
+    pshufd              m1, m1, q0000
+    pshufd              m2, m2, q0000
+    pshufd              m3, m3, q0000
+%endif
+    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
+    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
+    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
+%if %2 > 4
+    PRELOAD                 11, pw_ %+ %%maxf, F
+%endif
+
+    ; set up variables to load data
+%ifidn %1, v
+    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
+    lea           stride3q, [strideq*3]
+    neg            strideq
+%if %2 == 16
+    lea              dst0q, [dst8q+strideq*8]
+%else
+    lea              dst4q, [dst8q+strideq*4]
+%endif
+    neg            strideq
+%if %2 == 16
+    lea             dst12q, [dst8q+strideq*4]
+    lea              dst4q, [dst0q+strideq*4]
+%endif
+
+%if %2 == 16
+%define %%p7 dst0q
+%define %%p6 dst0q+strideq
+%define %%p5 dst0q+strideq*2
+%define %%p4 dst0q+stride3q
+%endif
+%define %%p3 dst4q
+%define %%p2 dst4q+strideq
+%define %%p1 dst4q+strideq*2
+%define %%p0 dst4q+stride3q
+%define %%q0 dst8q
+%define %%q1 dst8q+strideq
+%define %%q2 dst8q+strideq*2
+%define %%q3 dst8q+stride3q
+%if %2 == 16
+%define %%q4 dst12q
+%define %%q5 dst12q+strideq
+%define %%q6 dst12q+strideq*2
+%define %%q7 dst12q+stride3q
+%endif
+%else ; %1 == h
+    DEFINE_ARGS dst0, stride, stride3, dst4
+    lea           stride3q, [strideq*3]
+    lea              dst4q, [dst0q+strideq*4]
+
+%define %%p3 rsp+(%%tspoff+0)*mmsize
+%define %%p2 rsp+(%%tspoff+1)*mmsize
+%define %%p1 rsp+(%%tspoff+2)*mmsize
+%define %%p0 rsp+(%%tspoff+3)*mmsize
+%define %%q0 rsp+(%%tspoff+4)*mmsize
+%define %%q1 rsp+(%%tspoff+5)*mmsize
+%define %%q2 rsp+(%%tspoff+6)*mmsize
+%define %%q3 rsp+(%%tspoff+7)*mmsize
+
+%if %2 < 16
+    movu                m0, [dst0q+strideq*0-8]
+    movu                m1, [dst0q+strideq*1-8]
+    movu                m2, [dst0q+strideq*2-8]
+    movu                m3, [dst0q+stride3q -8]
+    movu                m4, [dst4q+strideq*0-8]
+    movu                m5, [dst4q+strideq*1-8]
+    movu                m6, [dst4q+strideq*2-8]
+    movu                m7, [dst4q+stride3q -8]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
+%endif
+
+    mova            [%%p3], m0
+    mova            [%%p2], m1
+    mova            [%%p1], m2
+    mova            [%%p0], m3
+%if ARCH_X86_64
+    mova            [%%q0], m4
+%endif
+    mova            [%%q1], m5
+    mova            [%%q2], m6
+    mova            [%%q3], m7
+
+    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
+    ; order here accordingly
+%else ; %2 == 16
+
+%define %%p7 rsp+(%%tspoff+ 8)*mmsize
+%define %%p6 rsp+(%%tspoff+ 9)*mmsize
+%define %%p5 rsp+(%%tspoff+10)*mmsize
+%define %%p4 rsp+(%%tspoff+11)*mmsize
+%define %%q4 rsp+(%%tspoff+12)*mmsize
+%define %%q5 rsp+(%%tspoff+13)*mmsize
+%define %%q6 rsp+(%%tspoff+14)*mmsize
+%define %%q7 rsp+(%%tspoff+15)*mmsize
+
+    mova                m0, [dst0q+strideq*0-16]
+    mova                m1, [dst0q+strideq*1-16]
+    mova                m2, [dst0q+strideq*2-16]
+    mova                m3, [dst0q+stride3q -16]
+    mova                m4, [dst4q+strideq*0-16]
+    mova                m5, [dst4q+strideq*1-16]
+%if ARCH_X86_64
+    mova                m6, [dst4q+strideq*2-16]
+%endif
+    mova                m7, [dst4q+stride3q -16]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
+%endif
+
+    mova            [%%p7], m0
+    mova            [%%p6], m1
+    mova            [%%p5], m2
+    mova            [%%p4], m3
+%if ARCH_X86_64
+    mova            [%%p3], m4
+%endif
+    mova            [%%p2], m5
+    mova            [%%p1], m6
+    mova            [%%p0], m7
+
+    mova                m0, [dst0q+strideq*0]
+    mova                m1, [dst0q+strideq*1]
+    mova                m2, [dst0q+strideq*2]
+    mova                m3, [dst0q+stride3q ]
+    mova                m4, [dst4q+strideq*0]
+    mova                m5, [dst4q+strideq*1]
+%if ARCH_X86_64
+    mova                m6, [dst4q+strideq*2]
+%endif
+    mova                m7, [dst4q+stride3q ]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
+%endif
+
+    mova            [%%q0], m0
+    mova            [%%q1], m1
+    mova            [%%q2], m2
+    mova            [%%q3], m3
+%if ARCH_X86_64
+    mova            [%%q4], m4
+%endif
+    mova            [%%q5], m5
+    mova            [%%q6], m6
+    mova            [%%q7], m7
+
+    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
+    ; order here accordingly
+%endif ; %2
+%endif ; %1
+
+    ; load q0|q4-7 data
+    mova                m0, [%%q0]
+%if %2 == 16
+    mova                m4, [%%q4]
+    mova                m5, [%%q5]
+    mova                m6, [%%q6]
+    mova                m7, [%%q7]
+
+    ; flat8out q portion
+    FLAT8OUT_HALF
+    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+    ; load q1-3 data
+    mova                m1, [%%q1]
+    mova                m2, [%%q2]
+    mova                m3, [%%q3]
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flatout[q]
+    ; m12-14=free
+    ; m0-3=q0-q3
+    ; m4-7=free
+
+    ; flat8in|fm|hev q portion
+    FLAT8IN_HALF        %2
+    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
+%if %2 > 4
+    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
+%endif
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out[q]
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; m2=!fm[q]
+    ; m0,1=q0-q1
+    ; m2-7=free
+    ; m12=free
+
+    ; load p0-1
+    mova                m3, [%%p0]
+    mova                m4, [%%p1]
+
+    ; fm mb_edge portion
+    psubw               m5, m3, m0                  ; q0-p0
+    psubw               m6, m4, m1                  ; q1-p1
+%if ARCH_X86_64
+    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
+%else
+    ABS1                m5, m7                      ; abs(q0-p0)
+    ABS1                m6, m7                      ; abs(q1-p1)
+%endif
+    paddw               m5, m5
+    psraw               m6, 1
+    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
+    pcmpgtw             m6, reg_E
+    por                 m2, m6
+    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out[q]
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; r12[m12]=!fm[q]
+    ; m3-4=q0-1
+    ; m0-2/5-7=free
+
+    ; load p4-7 data
+    SWAP                 3, 0                       ; p0
+    SWAP                 4, 1                       ; p1
+%if %2 == 16
+    mova                m7, [%%p7]
+    mova                m6, [%%p6]
+    mova                m5, [%%p5]
+    mova                m4, [%%p4]
+
+    ; flat8out p portion
+    FLAT8OUT_HALF
+    por                 m7, reg_F8O
+    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; r12[m12]=!fm[q]
+    ; m0=p0
+    ; m1-7=free
+
+    ; load p2-3 data
+    mova                m2, [%%p2]
+    mova                m3, [%%p3]
+
+    ; flat8in|fm|hev p portion
+    FLAT8IN_HALF        %2
+    por                 m7, reg_HEV
+%if %2 > 4
+    por                 m4, reg_F8I
+%endif
+    por                 m2, reg_FM
+%if %2 > 4
+    por                 m4, m2                      ; !flat8|!fm
+%if %2 == 16
+    por                 m5, m4, reg_F8O             ; !flat16|!fm
+    pandn               m2, m4                      ; filter4_mask
+    pandn               m4, m5                      ; filter8_mask
+    pxor                m5, [pw_m1]                 ; filter16_mask
+    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
+%else
+    pandn               m2, m4                      ; filter4_mask
+    pxor                m4, [pw_m1]                 ; filter8_mask
+%endif
+    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
+%else
+    pxor                m2, [pw_m1]                 ; filter4_mask
+%endif
+    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
+    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M
+
+    ; r9[m15]=filter16_mask
+    ; r10[m13]=hev
+    ; r11[m14]=filter8_mask
+    ; r12[m12]=filter4_mask
+    ; m0,1=p0-p1
+    ; m2-7=free
+    ; m8-11=free
+
+%if %2 > 4
+%if %2 == 16
+    ; filter_14
+    mova                m2, [%%p7]
+    mova                m3, [%%p6]
+    mova                m6, [%%p5]
+    mova                m7, [%%p4]
+    PRELOAD              8, %%p3, P3
+    PRELOAD              9, %%p2, P2
+%endif
+    PRELOAD             10, %%q0, Q0
+    PRELOAD             11, %%q1, Q1
+%if %2 == 16
+    psllw               m4, m2, 3
+    paddw               m5, m3, m3
+    paddw               m4, m6
+    paddw               m5, m7
+    paddw               m4, reg_P3
+    paddw               m5, reg_P2
+    paddw               m4, m1
+    paddw               m5, m0
+    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
+    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
+    paddw               m4, [pw_8]
+    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
+
+    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
+    ; at the end of the filter
+
+    mova    [rsp+0*mmsize], m3
+    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
+%endif
+    mova                m3, [%%q2]
+%if %2 == 16
+    mova    [rsp+1*mmsize], m6
+    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
+%endif
+    mova                m6, [%%q3]
+%if %2 == 16
+    mova    [rsp+2*mmsize], m7
+    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
+    mova                m7, [%%q4]
+%if ARCH_X86_64
+    mova    [rsp+3*mmsize], reg_P3
+%else
+    mova                m4, reg_P3
+    mova    [rsp+3*mmsize], m4
+%endif
+    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
+    PRELOAD              8, %%q5, Q5
+%if ARCH_X86_64
+    mova    [rsp+4*mmsize], reg_P2
+%else
+    mova                m4, reg_P2
+    mova    [rsp+4*mmsize], m4
+%endif
+    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
+    PRELOAD              9, %%q6, Q6
+    mova    [rsp+5*mmsize], m1
+    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
+    mova                m1, [%%q7]
+    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1,     1
+    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1,     ARCH_X86_64
+    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1,     ARCH_X86_64
+    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1,     1
+    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6
+
+    mova                m7, [%%p1]
+%else
+    SWAP                 1, 7
+%endif
+
+    mova                m2, [%%p3]
+    mova                m1, [%%p2]
+
+    ; reg_Q0-1 (m10-m11)
+    ; m0=p0
+    ; m1=p2
+    ; m2=p3
+    ; m3=q2
+    ; m4-5=free
+    ; m6=q3
+    ; m7=p1
+    ; m8-9 unused
+
+    ; filter_6
+    psllw               m4, m2, 2
+    paddw               m5, m1, m1
+    paddw               m4, m7
+    psubw               m5, m2
+    paddw               m4, m0
+    paddw               m5, reg_Q0
+    paddw               m4, [pw_4]
+    paddw               m5, m4
+
+%if ARCH_X86_64
+    mova                m8, m1
+    mova                m9, m7
+%else
+    mova    [rsp+0*mmsize], m1
+    mova    [rsp+1*mmsize], m7
+%endif
+%ifidn %1, v
+    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
+%else
+    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
+%endif
+    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
+    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
+%if ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
+%else
+    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
+%endif
+    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3
+
+    UNSCRATCH            2, 10, %%q0
+    UNSCRATCH            6, 11, %%q1
+%else
+    SWAP                 1, 7
+    mova                m2, [%%q0]
+    mova                m6, [%%q1]
+%endif
+    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV
+
+    ; m0=p0
+    ; m1=p2
+    ; m2=q0
+    ; m3=hev_mask
+    ; m4-5=free
+    ; m6=q1
+    ; m7=p1
+
+    ; filter_4
+    psubw               m4, m7, m6              ; p1-q1
+    psubw               m5, m2, m0              ; q0-p0
+    pand                m4, m3
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(p1-q1, 9) -> f
+    paddw               m4, m5
+    paddw               m5, m5
+    paddw               m4, m5                  ; 3*(q0-p0)+f
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(3*(q0-p0)+f, 9) -> f
+    pand                m4, reg_F4M
+    paddw               m5, m4, [pw_4]
+    paddw               m4, [pw_3]
+    pminsw              m5, [pw_ %+ %%maxsgn]
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    psraw               m5, 3                   ; min_intp2(f+4, 9)>>3 -> f1
+    psraw               m4, 3                   ; min_intp2(f+3, 9)>>3 -> f2
+    psubw               m2, m5                  ; q0-f1
+    paddw               m0, m4                  ; p0+f2
+    pandn               m3, m5                  ; f1 & !hev (for p1/q1 adj)
+    pxor                m4, m4
+    mova                m5, [pw_ %+ %%maxusgn]
+    pmaxsw              m2, m4
+    pmaxsw              m0, m4
+    pminsw              m2, m5
+    pminsw              m0, m5
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_16384]          ; (f1+1)>>1
+%else
+    paddw               m3, [pw_1]
+    psraw               m3, 1
+%endif
+    paddw               m7, m3                  ; p1+f
+    psubw               m6, m3                  ; q1-f
+    pmaxsw              m7, m4
+    pmaxsw              m6, m4
+    pminsw              m7, m5
+    pminsw              m6, m5
+
+    ; store
+%ifidn %1, v
+    mova            [%%p1], m7
+    mova            [%%p0], m0
+    mova            [%%q0], m2
+    mova            [%%q1], m6
+%else ; %1 == h
+%if %2 == 4
+    TRANSPOSE4x4W        7, 0, 2, 6, 1
+    movh   [dst0q+strideq*0-4], m7
+    movhps [dst0q+strideq*1-4], m7
+    movh   [dst0q+strideq*2-4], m0
+    movhps [dst0q+stride3q -4], m0
+    movh   [dst4q+strideq*0-4], m2
+    movhps [dst4q+strideq*1-4], m2
+    movh   [dst4q+strideq*2-4], m6
+    movhps [dst4q+stride3q -4], m6
+%elif %2 == 8
+    mova                m3, [%%p3]
+    mova                m4, [%%q2]
+    mova                m5, [%%q3]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
+%else
+    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
+    mova                m2, [%%q0]
+%endif
+
+    movu [dst0q+strideq*0-8], m3
+    movu [dst0q+strideq*1-8], m1
+    movu [dst0q+strideq*2-8], m7
+    movu [dst0q+stride3q -8], m0
+    movu [dst4q+strideq*0-8], m2
+    movu [dst4q+strideq*1-8], m6
+    movu [dst4q+strideq*2-8], m4
+    movu [dst4q+stride3q -8], m5
+%else ; %2 == 16
+    SCRATCH              2, 8, %%q0
+    SCRATCH              6, 9, %%q1
+    mova                m2, [%%p7]
+    mova                m3, [%%p6]
+    mova                m4, [%%p5]
+    mova                m5, [%%p4]
+    mova                m6, [%%p3]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
+%else
+    mova            [%%p1], m7
+    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
+%endif
+
+    mova [dst0q+strideq*0-16], m2
+    mova [dst0q+strideq*1-16], m3
+    mova [dst0q+strideq*2-16], m4
+    mova [dst0q+stride3q -16], m5
+%if ARCH_X86_64
+    mova [dst4q+strideq*0-16], m6
+%endif
+    mova [dst4q+strideq*1-16], m1
+    mova [dst4q+strideq*2-16], m7
+    mova [dst4q+stride3q -16], m0
+
+    UNSCRATCH            2, 8, %%q0
+    UNSCRATCH            6, 9, %%q1
+    mova                m0, [%%q2]
+    mova                m1, [%%q3]
+    mova                m3, [%%q4]
+    mova                m4, [%%q5]
+%if ARCH_X86_64
+    mova                m5, [%%q6]
+%endif
+    mova                m7, [%%q7]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
+%else
+    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
+%endif
+
+    mova [dst0q+strideq*0], m2
+    mova [dst0q+strideq*1], m6
+    mova [dst0q+strideq*2], m0
+    mova [dst0q+stride3q ], m1
+%if ARCH_X86_64
+    mova [dst4q+strideq*0], m3
+%endif
+    mova [dst4q+strideq*1], m4
+    mova [dst4q+strideq*2], m5
+    mova [dst4q+stride3q ], m7
+%endif ; %2
+%endif ; %1
+    RET
+%endmacro
+
+%macro LOOP_FILTER_CPUSETS 3
+INIT_XMM sse2
+LOOP_FILTER %1, %2, %3
+INIT_XMM ssse3
+LOOP_FILTER %1, %2, %3
+INIT_XMM avx
+LOOP_FILTER %1, %2, %3
+%endmacro
+
+%macro LOOP_FILTER_WDSETS 2
+LOOP_FILTER_CPUSETS %1,  4, %2
+LOOP_FILTER_CPUSETS %1,  8, %2
+LOOP_FILTER_CPUSETS %1, 16, %2
+%endmacro
+
+LOOP_FILTER_WDSETS h, 10
+LOOP_FILTER_WDSETS v, 10
+LOOP_FILTER_WDSETS h, 12
+LOOP_FILTER_WDSETS v, 12
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index c9701ae..f64161b 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -3,20 +3,20 @@
 ;*
 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -45,6 +45,13 @@ times 8 dw %7
 times 8 dw %8
 %endmacro
 
+%macro F8_16BPP_TAPS 8
+times 8 dw %1, %2
+times 8 dw %3, %4
+times 8 dw %5, %6
+times 8 dw %7, %8
+%endmacro
+
 %macro FILTER 1
 const filters_%1 ; smooth
                     F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
@@ -102,12 +109,15 @@ FILTER ssse3
 %define F8_TAPS F8_SSE2_TAPS
 ; int16_t ff_filters_sse2[3][15][8][8]
 FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
 
 SECTION .text
 
 %macro filter_sse2_h_fn 1
 %assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
     pxor        m5, m5
     mova        m6, [pw_64]
     mova        m7, [filteryq+  0]
@@ -192,7 +202,7 @@ filter_sse2_h_fn avg
 
 %macro filter_h_fn 1
 %assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
     mova        m6, [pw_256]
     mova        m7, [filteryq+ 0]
 %if ARCH_X86_64 && mmsize > 8
@@ -253,7 +263,7 @@ filter_h_fn avg
 %if ARCH_X86_64
 %macro filter_hx2_fn 1
 %assign %%px mmsize
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
     mova       m13, [pw_256]
     mova        m8, [filteryq+ 0]
     mova        m9, [filteryq+32]
@@ -315,9 +325,9 @@ filter_hx2_fn avg
 %macro filter_sse2_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
 %else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
     mov   filteryq, r5mp
 %define hd r4mp
 %endif
@@ -413,9 +423,9 @@ filter_sse2_v_fn avg
 %macro filter_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
 %else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
     mov   filteryq, r5mp
 %define hd r4mp
 %endif
@@ -486,7 +496,7 @@ filter_v_fn avg
 
 %macro filter_vx2_fn 1
 %assign %%px mmsize
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
     mova       m13, [pw_256]
     lea  sstride3q, [sstrideq*3]
     lea      src4q, [srcq+sstrideq]
@@ -552,7 +562,7 @@ filter_vx2_fn avg
 
 %endif ; ARCH_X86_64
 
-%macro fpel_fn 6
+%macro fpel_fn 6-8 0, 4
 %if %2 == 4
 %define %%srcfn movh
 %define %%dstfn movh
@@ -561,29 +571,57 @@ filter_vx2_fn avg
 %define %%dstfn mova
 %endif
 
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
 %if %2 <= mmsize
-cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
     lea  sstride3q, [sstrideq*3]
     lea  dstride3q, [dstrideq*3]
 %else
-cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
 %endif
 .loop:
     %%srcfn     m0, [srcq]
     %%srcfn     m1, [srcq+s%3]
     %%srcfn     m2, [srcq+s%4]
     %%srcfn     m3, [srcq+s%5]
+%if %2/mmsize == 8
+    %%srcfn     m4, [srcq+mmsize*4]
+    %%srcfn     m5, [srcq+mmsize*5]
+    %%srcfn     m6, [srcq+mmsize*6]
+    %%srcfn     m7, [srcq+mmsize*7]
+%endif
     lea       srcq, [srcq+sstrideq*%6]
 %ifidn %1, avg
-    pavgb       m0, [dstq]
-    pavgb       m1, [dstq+d%3]
-    pavgb       m2, [dstq+d%4]
-    pavgb       m3, [dstq+d%5]
+    %%pavg      m0, [dstq]
+    %%pavg      m1, [dstq+d%3]
+    %%pavg      m2, [dstq+d%4]
+    %%pavg      m3, [dstq+d%5]
+%if %2/mmsize == 8
+    %%pavg      m4, [dstq+mmsize*4]
+    %%pavg      m5, [dstq+mmsize*5]
+    %%pavg      m6, [dstq+mmsize*6]
+    %%pavg      m7, [dstq+mmsize*7]
+%endif
 %endif
     %%dstfn [dstq], m0
     %%dstfn [dstq+d%3], m1
     %%dstfn [dstq+d%4], m2
     %%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+    %%dstfn [dstq+mmsize*4], m4
+    %%dstfn [dstq+mmsize*5], m5
+    %%dstfn [dstq+mmsize*6], m6
+    %%dstfn [dstq+mmsize*7], m7
+%endif
     lea       dstq, [dstq+dstrideq*%6]
     sub         hd, %6
     jnz .loop
@@ -598,23 +636,38 @@ INIT_MMX mmx
 fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
 fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
 INIT_MMX mmxext
-fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
+fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
 INIT_XMM sse
 fpel_fn put, 16, strideq, strideq*2, stride3q, 4
 fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
 fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
 INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
-fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
 INIT_YMM avx
 fpel_fn put, 32, strideq, strideq*2, stride3q, 4
 fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2,     mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
+fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
 %endif
 %undef s16
 %undef d16
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
new file mode 100644
index 0000000..9a462ea
--- /dev/null
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_64: times 8 dd 64
+
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+%macro filter_h4_fn 1-2 12
+cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11
+%endif
+    mova        m6, [pd_64]
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    movh        m0, [srcq-6]
+    movh        m1, [srcq-4]
+    movh        m2, [srcq-2]
+    movh        m3, [srcq+0]
+    movh        m4, [srcq+2]
+    punpcklwd   m0, m1
+    punpcklwd   m2, m3
+    pmaddwd     m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+%else
+    pmaddwd     m2, [filteryq+32]
+%endif
+    movu        m1, [srcq+4]
+    movu        m3, [srcq+6]
+    paddd       m0, m2
+    movu        m2, [srcq+8]
+    add       srcq, sstrideq
+    punpcklwd   m4, m1
+    punpcklwd   m3, m2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m3, m10
+%else
+    pmaddwd     m4, [filteryq+64]
+    pmaddwd     m3, [filteryq+96]
+%endif
+    paddd       m0, m4
+    paddd       m0, m3
+    paddd       m0, m6
+    psrad       m0, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0
+%else
+    packssdw    m0, m0
+%endif
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h4_fn put
+filter_h4_fn avg
+
+%macro filter_h_fn 1-2 12
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11
+%endif
+    mova        m6, [pd_64]
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    movu        m0, [srcq-6]
+    movu        m1, [srcq-4]
+    movu        m2, [srcq-2]
+    movu        m3, [srcq+0]
+    movu        m4, [srcq+2]
+    pmaddwd     m0, m7
+    pmaddwd     m1, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+    pmaddwd     m3, m8
+    pmaddwd     m4, m9
+%else
+    pmaddwd     m2, [filteryq+32]
+    pmaddwd     m3, [filteryq+32]
+    pmaddwd     m4, [filteryq+64]
+%endif
+    paddd       m0, m2
+    paddd       m1, m3
+    paddd       m0, m4
+    movu        m2, [srcq+4]
+    movu        m3, [srcq+6]
+    movu        m4, [srcq+8]
+    add       srcq, sstrideq
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m9
+    pmaddwd     m3, m10
+    pmaddwd     m4, m10
+%else
+    pmaddwd     m2, [filteryq+64]
+    pmaddwd     m3, [filteryq+96]
+    pmaddwd     m4, [filteryq+96]
+%endif
+    paddd       m1, m2
+    paddd       m0, m3
+    paddd       m1, m4
+    paddd       m0, m6
+    paddd       m1, m6
+    psrad       m0, 7
+    psrad       m1, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0
+    packusdw    m1, m1
+%else
+    packssdw    m0, m0
+    packssdw    m1, m1
+%endif
+    punpcklwd   m0, m1
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h_fn put
+filter_h_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_h_fn put
+filter_h_fn avg
+%endif
+
+%macro filter_v4_fn 1-2 12
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11
+%endif
+    mova        m6, [pd_64]
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m7, [filteryq+  0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 32]
+    mova        m9, [filteryq+ 64]
+    mova       m10, [filteryq+ 96]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq]
+    movh        m1, [srcq+sstrideq]
+    movh        m2, [srcq+sstrideq*2]
+    movh        m3, [srcq+sstride3q]
+    add       srcq, sstrideq
+    movh        m4, [src4q]
+    punpcklwd   m0, m1
+    punpcklwd   m2, m3
+    pmaddwd     m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+%else
+    pmaddwd     m2, [filteryq+ 32]
+%endif
+    movh        m1, [src4q+sstrideq]
+    movh        m3, [src4q+sstrideq*2]
+    paddd       m0, m2
+    movh        m2, [src4q+sstride3q]
+    add      src4q, sstrideq
+    punpcklwd   m4, m1
+    punpcklwd   m3, m2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m3, m10
+%else
+    pmaddwd     m4, [filteryq+ 64]
+    pmaddwd     m3, [filteryq+ 96]
+%endif
+    paddd       m0, m4
+    paddd       m0, m3
+    paddd       m0, m6
+    psrad       m0, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0
+%else
+    packssdw    m0, m0
+%endif
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%endif
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v4_fn put
+filter_v4_fn avg
+
+%macro filter_v_fn 1-2 13
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m12, m12
+%endif
+%if ARCH_X86_64
+    mova       m11, [pd_64]
+%endif
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m7, [filteryq+  0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 32]
+    mova        m9, [filteryq+ 64]
+    mova       m10, [filteryq+ 96]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu        m0, [srcq]
+    movu        m1, [srcq+sstrideq]
+    movu        m2, [srcq+sstrideq*2]
+    movu        m3, [srcq+sstride3q]
+    add       srcq, sstrideq
+    movu        m4, [src4q]
+    SBUTTERFLY  wd, 0, 1, 6
+    SBUTTERFLY  wd, 2, 3, 6
+    pmaddwd     m0, m7
+    pmaddwd     m1, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+    pmaddwd     m3, m8
+%else
+    pmaddwd     m2, [filteryq+ 32]
+    pmaddwd     m3, [filteryq+ 32]
+%endif
+    paddd       m0, m2
+    paddd       m1, m3
+    movu        m2, [src4q+sstrideq]
+    movu        m3, [src4q+sstrideq*2]
+    SBUTTERFLY  wd, 4, 2, 6
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m2, m9
+%else
+    pmaddwd     m4, [filteryq+ 64]
+    pmaddwd     m2, [filteryq+ 64]
+%endif
+    paddd       m0, m4
+    paddd       m1, m2
+    movu        m4, [src4q+sstride3q]
+    add      src4q, sstrideq
+    SBUTTERFLY  wd, 3, 4, 6
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m3, m10
+    pmaddwd     m4, m10
+%else
+    pmaddwd     m3, [filteryq+ 96]
+    pmaddwd     m4, [filteryq+ 96]
+%endif
+    paddd       m0, m3
+    paddd       m1, m4
+%if ARCH_X86_64
+    paddd       m0, m11
+    paddd       m1, m11
+%else
+    paddd       m0, [pd_64]
+    paddd       m1, [pd_64]
+%endif
+    psrad       m0, 7
+    psrad       m1, 7
+%if cpuflag(sse4)
+    packusdw    m0, m1
+%else
+    packssdw    m0, m1
+%endif
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m12
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%endif
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v_fn put
+filter_v_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_v_fn put
+filter_v_fn avg
+%endif
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 8925573..8f2b8a6 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
  * check XMM registers for clobbers on Win64
  * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000..0220885
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,983 @@
+; XVID MPEG-4 VIDEO CODEC
+;
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Christophe Gisquet <christophe.gisquet@gmail.com>
+;
+; ===========     SSE2 inverse discrete cosine transform     ===========
+;
+; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astrange@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; Vertical pass is an implementation of the scheme:
+;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
+;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+;  Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+;  Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; =======     MMX and XMM forward discrete cosine transform     =======
+;
+; Copyright(C) 2001 Peter Ross <pross@xvid.org>
+;
+; Originally provided by Intel at AP-922
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
+; but in a limited edition.
+; New macro implements a column part for precise iDCT
+; The routine precision now satisfies IEEE standard 1180-1990.
+;
+; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+;
+; http://www.elecard.com/peter/idct.html
+; http://www.linuxvideo.org/mpeg2dec/
+;
+; These examples contain code fragments for first stage iDCT 8x8
+; (for rows) and first stage DCT 8x8 (for columns)
+;
+; conversion to gcc syntax by Michael Niedermayer
+;
+; ======================================================================
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; Similar to tg_1_16 in MMX code
+tan1:   times 8 dw 13036
+tan2:   times 8 dw 27146
+tan3:   times 8 dw 43790
+sqrt2:  times 8 dw 23170
+
+; SSE2 tables
+iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
+        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
+        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
+        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
+        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
+        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
+        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
+        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
+        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
+        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
+        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
+        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
+        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+
+%if ARCH_X86_32
+; -----------------------------------------------------------------------------
+;
+; The first stage iDCT 8x8 - inverse DCTs of rows
+;
+; -----------------------------------------------------------------------------
+; The 8-point inverse DCT direct algorithm
+; -----------------------------------------------------------------------------
+;
+; static const short w[32] = {
+;     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
+;     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+;     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
+;     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
+;     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
+;     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+;     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
+;     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
+;
+; #define DCT_8_INV_ROW(x, y)
+; {
+;     int a0, a1, a2, a3, b0, b1, b2, b3;
+;
+;     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
+;     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
+;     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
+;     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+;     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+;     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+;     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+;     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+;
+;     y[0] = SHIFT_ROUND(a0 + b0);
+;     y[1] = SHIFT_ROUND(a1 + b1);
+;     y[2] = SHIFT_ROUND(a2 + b2);
+;     y[3] = SHIFT_ROUND(a3 + b3);
+;     y[4] = SHIFT_ROUND(a3 - b3);
+;     y[5] = SHIFT_ROUND(a2 - b2);
+;     y[6] = SHIFT_ROUND(a1 - b1);
+;     y[7] = SHIFT_ROUND(a0 - b0);
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; In this implementation the outputs of the iDCT-1D are multiplied
+;     for rows 0,4 - by cos_4_16,
+;     for rows 1,7 - by cos_1_16,
+;     for rows 2,6 - by cos_2_16,
+;     for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy.
+;
+; For the constants used,
+;     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; Tables for mmx processors
+; -----------------------------------------------------------------------------
+
+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_mmx: dw  16384,  16384,  16384, -16384
+              dw  21407,   8867,   8867, -21407 ; w07 w05 w03 w01
+              dw  16384, -16384,  16384,  16384 ; w14 w12 w10 w08
+              dw  -8867,  21407, -21407,  -8867 ; w15 w13 w11 w09
+              dw  22725,  12873,  19266, -22725 ; w22 w20 w18 w16
+              dw  19266,   4520,  -4520, -12873 ; w23 w21 w19 w17
+              dw  12873,   4520,   4520,  19266 ; w30 w28 w26 w24
+              dw -22725,  19266, -12873, -22725 ; w31 w29 w27 w25
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+              dw  22725,  22725,  22725, -22725 ; movq-> w06 w04 w02 w00
+              dw  29692,  12299,  12299, -29692 ; w07 w05 w03 w01
+              dw  22725, -22725,  22725,  22725 ; w14 w12 w10 w08
+              dw -12299,  29692, -29692, -12299 ; w15 w13 w11 w09
+              dw  31521,  17855,  26722, -31521 ; w22 w20 w18 w16
+              dw  26722,   6270,  -6270, -17855 ; w23 w21 w19 w17
+              dw  17855,   6270,   6270,  26722 ; w30 w28 w26 w24
+              dw -31521,  26722, -17855, -31521 ; w31 w29 w27 w25
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+              dw  21407,  21407,  21407, -21407 ; movq-> w06 w04 w02 w00
+              dw  27969,  11585,  11585, -27969 ; w07 w05 w03 w01
+              dw  21407, -21407,  21407,  21407 ; w14 w12 w10 w08
+              dw -11585,  27969, -27969, -11585 ; w15 w13 w11 w09
+              dw  29692,  16819,  25172, -29692 ; w22 w20 w18 w16
+              dw  25172,   5906,  -5906, -16819 ; w23 w21 w19 w17
+              dw  16819,   5906,   5906,  25172 ; w30 w28 w26 w24
+              dw -29692,  25172, -16819, -29692 ; w31 w29 w27 w25
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+              dw  19266,  19266,  19266, -19266 ; movq-> w06 w04 w02 w00
+              dw  25172,  10426,  10426, -25172 ; w07 w05 w03 w01
+              dw  19266, -19266,  19266,  19266 ; w14 w12 w10 w08
+              dw -10426,  25172, -25172, -10426 ; w15 w13 w11 w09
+              dw  26722,  15137,  22654, -26722 ; w22 w20 w18 w16
+              dw  22654,   5315,  -5315, -15137 ; w23 w21 w19 w17
+              dw  15137,   5315,   5315,  22654 ; w30 w28 w26 w24
+              dw -26722,  22654, -15137, -26722 ; w31 w29 w27 w25
+
+; -----------------------------------------------------------------------------
+; Tables for xmm processors
+; -----------------------------------------------------------------------------
+
+; %3 for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_xmm: dw  16384,  21407,  16384,   8867 ; movq-> w05 w04 w01 w00
+              dw  16384,   8867, -16384, -21407 ; w07 w06 w03 w02
+              dw  16384,  -8867,  16384, -21407 ; w13 w12 w09 w08
+              dw -16384,  21407,  16384,  -8867 ; w15 w14 w11 w10
+              dw  22725,  19266,  19266,  -4520 ; w21 w20 w17 w16
+              dw  12873,   4520, -22725, -12873 ; w23 w22 w19 w18
+              dw  12873, -22725,   4520, -12873 ; w29 w28 w25 w24
+              dw   4520,  19266,  19266, -22725 ; w31 w30 w27 w26
+; %3 for rows 1,7 - constants are multiplied by cos_1_16
+              dw  22725,  29692,  22725,  12299 ; movq-> w05 w04 w01 w00
+              dw  22725,  12299, -22725, -29692 ; w07 w06 w03 w02
+              dw  22725, -12299,  22725, -29692 ; w13 w12 w09 w08
+              dw -22725,  29692,  22725, -12299 ; w15 w14 w11 w10
+              dw  31521,  26722,  26722,  -6270 ; w21 w20 w17 w16
+              dw  17855,   6270, -31521, -17855 ; w23 w22 w19 w18
+              dw  17855, -31521,   6270, -17855 ; w29 w28 w25 w24
+              dw   6270,  26722,  26722, -31521 ; w31 w30 w27 w26
+; %3 for rows 2,6 - constants are multiplied by cos_2_16
+              dw  21407,  27969,  21407,  11585 ; movq-> w05 w04 w01 w00
+              dw  21407,  11585, -21407, -27969 ; w07 w06 w03 w02
+              dw  21407, -11585,  21407, -27969 ; w13 w12 w09 w08
+              dw -21407,  27969,  21407, -11585 ; w15 w14 w11 w10
+              dw  29692,  25172,  25172,  -5906 ; w21 w20 w17 w16
+              dw  16819,   5906, -29692, -16819 ; w23 w22 w19 w18
+              dw  16819, -29692,   5906, -16819 ; w29 w28 w25 w24
+              dw   5906,  25172,  25172, -29692 ; w31 w30 w27 w26
+; %3 for rows 3,5 - constants are multiplied by cos_3_16
+              dw  19266,  25172,  19266,  10426 ; movq-> w05 w04 w01 w00
+              dw  19266,  10426, -19266, -25172 ; w07 w06 w03 w02
+              dw  19266, -10426,  19266, -25172 ; w13 w12 w09 w08
+              dw -19266,  25172,  19266, -10426 ; w15 w14 w11 w10
+              dw  26722,  22654,  22654,  -5315 ; w21 w20 w17 w16
+              dw  15137,   5315, -26722, -15137 ; w23 w22 w19 w18
+              dw  15137, -26722,   5315, -15137 ; w29 w28 w25 w24
+              dw   5315,  22654,  22654, -26722 ; w31 w30 w27 w26
+%endif ; ~ARCH_X86_32
+
+; Similar to rounder_0 in MMX code
+; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
+walkenIdctRounders: times 4 dd 65536
+                    times 4 dd  3597
+                    times 4 dd  2260
+                    times 4 dd  1203
+                    times 4 dd   120
+                    times 4 dd   512
+                    times 2 dd     0
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1
+    pxor      %1, %1
+%endmacro
+%macro PUT_ODD 1
+    pshufhw   %1, xmm2, 0x1B
+%endmacro
+
+%macro MOV32 2
+%if ARCH_X86_32
+    movdqa    %2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1
+%if ARCH_X86_64
+    CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1
+%if ARCH_X86_64
+    PUT_ODD   %1
+%else
+    pshufhw xmm2, xmm2, 0x1B
+    movdqa    %1, xmm2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+%define ROW0  xmm8
+%define REG0  ROW0
+%define ROW2  xmm9
+%define REG2  ROW2
+%define ROW4  xmm10
+%define REG4  ROW4
+%define ROW6  xmm11
+%define REG6  ROW6
+%define XMMS  xmm12
+%define SREG2 REG2
+%define TAN3  xmm13
+%define TAN1  xmm14
+%else
+%define ROW0  [BLOCK + 0*16]
+%define REG0  xmm4
+%define ROW2  [BLOCK + 2*16]
+%define REG2  xmm4
+%define ROW4  [BLOCK + 4*16]
+%define REG4  xmm6
+%define ROW6  [BLOCK + 6*16]
+%define REG6  xmm6
+%define XMMS  xmm2
+%define SREG2 xmm7
+%define TAN3  xmm0
+%define TAN1  xmm2
+%endif
+
+%macro JZ  2
+    test      %1, %1
+    jz       .%2
+%endmacro
+
+%macro JNZ  2
+    test      %1, %1
+    jnz      .%2
+%endmacro
+
+%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
+    %3        %4
+    movq     mm1, [%1]
+    por      mm1, [%1 + 8]
+    paddusb  mm1, mm0
+    pmovmskb  %2, mm1
+%endmacro
+
+;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
+%macro  TEST_TWO_ROWS  8
+    %5         %6
+    %7         %8
+    movq      mm1, [%1 + 0]
+    por       mm1, [%1 + 8]
+    movq      mm2, [%2 + 0]
+    por       mm2, [%2 + 8]
+    paddusb   mm1, mm0
+    paddusb   mm2, mm0
+    pmovmskb   %3, mm1
+    pmovmskb   %4, mm2
+%endmacro
+
+; IDCT pass on rows.
+%macro iMTX_MULT   4-5 ; src, table, put, arg, rounder
+    movdqa       xmm3, [%1]
+    movdqa       xmm0, xmm3
+    pshufd       xmm1, xmm3, 0x11 ; 4602
+    punpcklqdq   xmm0, xmm0       ; 0246
+    pmaddwd      xmm0, [%2]
+    pmaddwd      xmm1, [%2+16]
+    pshufd       xmm2, xmm3, 0xBB ; 5713
+    punpckhqdq   xmm3, xmm3       ; 1357
+    pmaddwd      xmm2, [%2+32]
+    pmaddwd      xmm3, [%2+48]
+    paddd        xmm0, xmm1
+    paddd        xmm2, xmm3
+%if %0 == 5
+    paddd        xmm0, [walkenIdctRounders+%5]
+%endif
+    movdqa       xmm3, xmm2
+    paddd        xmm2, xmm0
+    psubd        xmm0, xmm3
+    psrad        xmm2, 11
+    psrad        xmm0, 11
+    packssdw     xmm2, xmm0
+    %3           %4
+%endmacro
+
+%macro iLLM_HEAD 0
+    movdqa   TAN3, [tan3]
+    movdqa   TAN1, [tan1]
+%endmacro
+
+%macro FIRST_HALF 2  ; %1=dct  %2=type(normal,add,put)
+    psraw    xmm5, 6
+    psraw    REG0, 6
+    psraw    TAN3, 6
+    psraw    xmm3, 6
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa   [%1+1*16], TAN3
+    movdqa   [%1+2*16], xmm3
+    movdqa   [%1+5*16], REG0
+    movdqa   [%1+6*16], xmm5
+%else
+    ; Must now load args as gprs are no longer used for masks
+    ; DEST is set to where address of dest was loaded
+    %if ARCH_X86_32
+        %if %2 == 2 ; Not enough xmms, store
+    movdqa   [%1+1*16], TAN3
+    movdqa   [%1+2*16], xmm3
+    movdqa   [%1+5*16], REG0
+    movdqa   [%1+6*16], xmm5
+        %endif
+    %xdefine DEST r2q ; BLOCK is r0, stride r1
+    movifnidn DEST, destm
+    movifnidn strideq, stridem
+    %else
+    %xdefine DEST r0q
+    %endif
+    lea      r3q, [3*strideq]
+    %if %2 == 1
+    packuswb TAN3, xmm3
+    packuswb xmm5, REG0
+    movq     [DEST + strideq], TAN3
+    movhps   [DEST + 2*strideq], TAN3
+    ; REG0 and TAN3 are now available (and likely used in second half)
+    %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct  %2=type(normal,add,put) 3-6: xmms
+    psraw    %3, 6
+    psraw    %4, 6
+    psraw    %5, 6
+    psraw    %6, 6
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa   [%1+0*16], %3
+    movdqa   [%1+3*16], %5
+    movdqa   [%1+4*16], %6
+    movdqa   [%1+7*16], %4
+%elif %2 == 1
+    packuswb %3, %5
+    packuswb %6, %4
+    ; address of dest may have been loaded
+    movq     [DEST], %3
+    movhps   [DEST + r3q], %3
+    lea      DEST, [DEST + 4*strideq]
+    movq     [DEST], %6
+    movhps   [DEST + r3q], %6
+    ; and now write remainder of first half
+    movq     [DEST + 2*strideq], xmm5
+    movhps   [DEST + strideq], xmm5
+%elif %2 == 2
+    pxor        xmm0, xmm0
+    %if ARCH_X86_32
+    ; free: m3 REG0=m4 m5
+    ; input: m1, m7, m2, m6
+    movq        xmm3, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    punpcklbw   xmm3, xmm0
+    punpcklbw   xmm4, xmm0
+    paddsw      xmm3, %3
+    paddsw      xmm4, [%1 + 1*16]
+    movq          %3, [DEST+2*strideq]
+    movq        xmm5, [DEST+      r3q]
+    punpcklbw     %3, xmm0
+    punpcklbw   xmm5, xmm0
+    paddsw        %3, [%1 + 2*16]
+    paddsw      xmm5, %5
+    packuswb    xmm3, xmm4
+    packuswb      %3, xmm5
+    movq    [DEST+0*strideq], xmm3
+    movhps  [DEST+1*strideq], xmm3
+    movq    [DEST+2*strideq], %3
+    movhps  [DEST+      r3q], %3
+    lea         DEST, [DEST+4*strideq]
+    movq        xmm3, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq          %3, [DEST+2*strideq]
+    movq        xmm5, [DEST+      r3q]
+    punpcklbw   xmm3, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw     %3, xmm0
+    punpcklbw   xmm5, xmm0
+    paddsw      xmm3, %6
+    paddsw      xmm4, [%1 + 5*16]
+    paddsw        %3, [%1 + 6*16]
+    paddsw      xmm5, %4
+    packuswb    xmm3, xmm4
+    packuswb      %3, xmm5
+    movq    [DEST+0*strideq], xmm3
+    movhps  [DEST+1*strideq], xmm3
+    movq    [DEST+2*strideq], %3
+    movhps  [DEST+      r3q], %3
+    %else
+    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
+    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+    movq        xmm2, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq       xmm12, [DEST+2*strideq]
+    movq       xmm11, [DEST+      r3q]
+    punpcklbw   xmm2, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw  xmm12, xmm0
+    punpcklbw  xmm11, xmm0
+    paddsw      xmm2, %3
+    paddsw      xmm4, TAN3
+    paddsw     xmm12, xmm3
+    paddsw     xmm11, %5
+    packuswb    xmm2, xmm4
+    packuswb   xmm12, xmm11
+    movq    [DEST+0*strideq], xmm2
+    movhps  [DEST+1*strideq], xmm2
+    movq    [DEST+2*strideq], xmm12
+    movhps  [DEST+      r3q], xmm12
+    lea         DEST, [DEST+4*strideq]
+    movq        xmm2, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq       xmm12, [DEST+2*strideq]
+    movq       xmm11, [DEST+      r3q]
+    punpcklbw   xmm2, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw  xmm12, xmm0
+    punpcklbw  xmm11, xmm0
+    paddsw      xmm2, %6
+    paddsw      xmm4, REG0
+    paddsw     xmm12, xmm5
+    paddsw     xmm11, %4
+    packuswb    xmm2, xmm4
+    packuswb   xmm12, xmm11
+    movq    [DEST+0*strideq], xmm2
+    movhps  [DEST+1*strideq], xmm2
+    movq    [DEST+2*strideq], xmm12
+    movhps  [DEST+      r3q], xmm12
+    %endif
+%endif
+%endmacro
+
+
+; IDCT pass on columns.
+%macro iLLM_PASS  2  ; %1=dct  %2=type(normal,add,put)
+    movdqa   xmm1, TAN3
+    movdqa   xmm3, TAN1
+    pmulhw   TAN3, xmm4
+    pmulhw   xmm1, xmm5
+    paddsw   TAN3, xmm4
+    paddsw   xmm1, xmm5
+    psubsw   TAN3, xmm5
+    paddsw   xmm1, xmm4
+    pmulhw   xmm3, xmm7
+    pmulhw   TAN1, xmm6
+    paddsw   xmm3, xmm6
+    psubsw   TAN1, xmm7
+    movdqa   xmm7, xmm3
+    movdqa   xmm6, TAN1
+    psubsw   xmm3, xmm1
+    psubsw   TAN1, TAN3
+    paddsw   xmm1, xmm7
+    paddsw   TAN3, xmm6
+    movdqa   xmm6, xmm3
+    psubsw   xmm3, TAN3
+    paddsw   TAN3, xmm6
+    movdqa   xmm4, [sqrt2]
+    pmulhw   xmm3, xmm4
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, TAN3
+    paddsw   xmm3, xmm3
+    movdqa   xmm7, [tan2]
+    MOV32    ROW2, REG2
+    MOV32    ROW6, REG6
+    movdqa   xmm5, xmm7
+    pmulhw   xmm7, REG6
+    pmulhw   xmm5, REG2
+    paddsw   xmm7, REG2
+    psubsw   xmm5, REG6
+    MOV32    ROW0, REG0
+    MOV32    ROW4, REG4
+    MOV32    TAN1, [BLOCK]
+    movdqa   XMMS, REG0
+    psubsw   REG0, REG4
+    paddsw   REG4, XMMS
+    movdqa   XMMS, REG4
+    psubsw   REG4, xmm7
+    paddsw   xmm7, XMMS
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm5
+    paddsw   xmm5, XMMS
+    movdqa   XMMS, xmm5
+    psubsw   xmm5, TAN3
+    paddsw   TAN3, XMMS
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm3
+    paddsw   xmm3, XMMS
+    MOV32    [BLOCK], TAN1
+
+    FIRST_HALF %1, %2
+
+    movdqa   xmm0, xmm7
+    movdqa   xmm4, REG4
+    psubsw   xmm7, xmm1
+    psubsw   REG4, TAN1
+    paddsw   xmm1, xmm0
+    paddsw   TAN1, xmm4
+
+    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
+%endmacro
+
+; IDCT pass on columns, assuming rows 4-7 are zero
+%macro iLLM_PASS_SPARSE   2 ; %1=dct   %2=type(normal,put,add)
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, xmm4
+    movdqa   xmm3, xmm6
+    pmulhw   TAN1, xmm6
+    movdqa   xmm1, xmm4
+    psubsw   xmm3, xmm1
+    paddsw   xmm1, xmm6
+    movdqa   xmm6, TAN1
+    psubsw   TAN1, TAN3
+    paddsw   TAN3, xmm6
+    movdqa   xmm6, xmm3
+    psubsw   xmm3, TAN3
+    paddsw   TAN3, xmm6
+    movdqa   xmm4, [sqrt2]
+    pmulhw   xmm3, xmm4
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, TAN3
+    paddsw   xmm3, xmm3
+    movdqa   xmm5, [tan2]
+    MOV32    ROW2, SREG2
+    pmulhw   xmm5, SREG2
+    MOV32    ROW0, REG0
+    movdqa   xmm6, REG0
+    psubsw   xmm6, SREG2
+    paddsw  SREG2, REG0
+    MOV32    TAN1, [BLOCK]
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm5
+    paddsw   xmm5, XMMS
+    movdqa   XMMS, xmm5
+    psubsw   xmm5, TAN3
+    paddsw   TAN3, XMMS
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm3
+    paddsw   xmm3, XMMS
+    MOV32    [BLOCK], TAN1
+
+    FIRST_HALF %1, %2
+
+    movdqa   xmm0, SREG2
+    movdqa   xmm4, xmm6
+    psubsw  SREG2, xmm1
+    psubsw   xmm6, TAN1
+    paddsw   xmm1, xmm0
+    paddsw   TAN1, xmm4
+
+    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
+%endmacro
+
+%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
+%if %1 == 0 || ARCH_X86_32
+    %define GPR0  r1d
+    %define GPR1  r2d
+    %define GPR2  r3d
+    %define GPR3  r4d
+    %define NUM_GPRS 5
+%else
+    %define GPR0  r3d
+    %define GPR1  r4d
+    %define GPR2  r5d
+    %define GPR3  r6d
+    %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+    %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %endif
+    %if ARCH_X86_64
+    %xdefine BLOCK blockq
+    %else
+    mov    r0q, blockm
+    %xdefine BLOCK r0q
+    %endif
+%endif
+    movq           mm0, [pb_127]
+    iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+    iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
+    iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+
+    TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
+    JZ   GPR0, col1
+    iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
+.col1:
+    TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
+    TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
+
+    iLLM_HEAD
+    JNZ  GPR1, 2
+    JNZ  GPR0, 3
+    JNZ  GPR2, 4
+    JNZ  GPR3, 5
+    iLLM_PASS_SPARSE BLOCK, %1
+    jmp .6
+.2:
+    iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
+.3:
+    iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
+    JZ   GPR2, col2
+.4:
+    iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+.col2:
+    JZ   GPR3, col3
+.5:
+    iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
+.col3:
+%if ARCH_X86_32
+    iLLM_HEAD
+%endif
+    iLLM_PASS     BLOCK, %1
+.6:
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0
+IDCT_SSE2 1
+IDCT_SSE2 2
+
+%if ARCH_X86_32
+
+; %1=offset  %2=tab_offset
+; %3=rnd_offset where 4*8->6*16  5*8->4*16  6/7*8->5*16
+%macro DCT_8_INV_ROW  3
+    movq       mm0, [r0+16*%1+0]  ; 0 ; x3 x2 x1 x0
+    movq       mm1, [r0+16*%1+8]  ; 1 ; x7 x6 x5 x4
+    movq       mm2, mm0       ; 2 ; x3 x2 x1 x0
+    movq       mm3, [%2+ 0]   ; 3 ; w06 w04 w02 w00
+%if cpuflag(mmxext)
+    pshufw     mm0, mm0, 0x88 ; x2 x0 x2 x0
+    movq       mm4, [%2+ 8]   ; 4 ; w07 w06 w03 w02
+    movq       mm5, mm1       ; 5 ; x7 x6 x5 x4
+    pmaddwd    mm3, mm0       ; x2*w05+x0*w04 x2*w01+x0*w00
+    movq       mm6, [%2+32]   ; 6 ; w21 w20 w17 w16
+    pshufw     mm1, mm1, 0x88 ; x6 x4 x6 x4
+    pmaddwd    mm4, mm1       ; x6*w07+x4*w06 x6*w03+x4*w02
+    movq       mm7, [%2+40]   ; 7; w23 w22 w19 w18
+    pshufw     mm2, mm2, 0xdd ; x3 x1 x3 x1
+    pmaddwd    mm6, mm2       ; x3*w21+x1*w20 x3*w17+x1*w16
+    pshufw     mm5, mm5, 0xdd ; x7 x5 x7 x5
+    pmaddwd    mm7, mm5       ; x7*w23+x5*w22 x7*w19+x5*w18
+    paddd      mm3, [walkenIdctRounders + %3]      ; +%3
+    pmaddwd    mm0, [%2+16]   ; x2*w13+x0*w12 x2*w09+x0*w08
+    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
+    pmaddwd    mm1, [%2+24]   ; x6*w15+x4*w14 x6*w11+x4*w10
+    movq       mm4, mm3       ; 4 ; a1 a0
+    pmaddwd    mm2, [%2+48]   ; x3*w29+x1*w28 x3*w25+x1*w24
+    paddd      mm6, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
+    pmaddwd    mm5, [%2+56]   ; x7*w31+x5*w30 x7*w27+x5*w26
+    paddd      mm3, mm6       ; a1+b1 a0+b0
+    paddd      mm0, [walkenIdctRounders + %3]      ; +%3
+    psrad      mm3, 11        ; y1=a1+b1 y0=a0+b0
+    paddd      mm0, mm1       ; 1 ; a3=sum(even3) a2=sum(even2)
+    psubd      mm4, mm6       ; 6 ; a1-b1 a0-b0
+    movq       mm7, mm0       ; 7 ; a3 a2
+    paddd      mm2, mm5       ; 5 ; b3=sum(odd3) b2=sum(odd2)
+    paddd      mm0, mm2       ; a3+b3 a2+b2
+    psrad      mm4, 11        ; y6=a1-b1 y7=a0-b0
+    psubd      mm7, mm2       ; 2 ; a3-b3 a2-b2
+    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
+    psrad      mm7, 11        ; y4=a3-b3 y5=a2-b2
+    packssdw   mm3, mm0       ; 0 ; y3 y2 y1 y0
+    packssdw   mm7, mm4       ; 4 ; y6 y7 y4 y5
+    movq  [r0+16*%1+0], mm3       ; 3 ; save y3 y2 y1 y0
+    pshufw     mm7, mm7, 0xb1 ; y7 y6 y5 y4
+%else
+    punpcklwd  mm0, mm1       ; x5 x1 x4 x0
+    movq       mm5, mm0       ; 5 ; x5 x1 x4 x0
+    punpckldq  mm0, mm0       ; x4 x0 x4 x0
+    movq       mm4, [%2+ 8]   ; 4 ; w07 w05 w03 w01
+    punpckhwd  mm2, mm1       ; 1 ; x7 x3 x6 x2
+    pmaddwd    mm3, mm0       ; x4*w06+x0*w04 x4*w02+x0*w00
+    movq       mm6, mm2       ; 6 ; x7 x3 x6 x2
+    movq       mm1, [%2+32]   ; 1 ; w22 w20 w18 w16
+    punpckldq  mm2, mm2       ; x6 x2 x6 x2
+    pmaddwd    mm4, mm2       ; x6*w07+x2*w05 x6*w03+x2*w01
+    punpckhdq  mm5, mm5       ; x5 x1 x5 x1
+    pmaddwd    mm0, [%2+16]   ; x4*w14+x0*w12 x4*w10+x0*w08
+    punpckhdq  mm6, mm6       ; x7 x3 x7 x3
+    movq       mm7, [%2+40]   ; 7 ; w23 w21 w19 w17
+    pmaddwd    mm1, mm5       ; x5*w22+x1*w20 x5*w18+x1*w16
+    paddd      mm3, [walkenIdctRounders + %3]     ; +%3
+    pmaddwd    mm7, mm6       ; x7*w23+x3*w21 x7*w19+x3*w17
+    pmaddwd    mm2, [%2+24]   ; x6*w15+x2*w13 x6*w11+x2*w09
+    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
+    pmaddwd    mm5, [%2+48]   ; x5*w30+x1*w28 x5*w26+x1*w24
+    movq       mm4, mm3       ; 4 ; a1 a0
+    pmaddwd    mm6, [%2+56]   ; x7*w31+x3*w29 x7*w27+x3*w25
+    paddd      mm1, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
+    paddd      mm0, [walkenIdctRounders + %3]     ; +%3
+    psubd      mm3, mm1       ; a1-b1 a0-b0
+    psrad      mm3, 11        ; y6=a1-b1 y7=a0-b0
+    paddd      mm1, mm4       ; 4 ; a1+b1 a0+b0
+    paddd      mm0, mm2       ; 2 ; a3=sum(even3) a2=sum(even2)
+    psrad      mm1, 11        ; y1=a1+b1 y0=a0+b0
+    paddd      mm5, mm6       ; 6 ; b3=sum(odd3) b2=sum(odd2)
+    movq       mm4, mm0       ; 4 ; a3 a2
+    paddd      mm0, mm5       ; a3+b3 a2+b2
+    psubd      mm4, mm5       ; 5 ; a3-b3 a2-b2
+    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
+    psrad      mm4, 11        ; y4=a3-b3 y5=a2-b2
+    packssdw   mm1, mm0       ; 0 ; y3 y2 y1 y0
+    packssdw   mm4, mm3       ; 3 ; y6 y7 y4 y5
+    movq       mm7, mm4       ; 7 ; y6 y7 y4 y5
+    psrld      mm4, 16        ; 0 y6 0 y4
+    pslld      mm7, 16        ; y7 0 y5 0
+    movq  [r0+16*%1+0], mm1   ; 1 ; save y3 y2 y1 y0
+    por        mm7, mm4       ; 4 ; y7 y6 y5 y4
+%endif
+    movq  [r0+16*%1+8], mm7   ; 7 ; save y7 y6 y5 y4
+%endmacro
+
+; -----------------------------------------------------------------------------
+;
+; The first stage DCT 8x8 - forward DCTs of columns
+;
+; The %2puts are multiplied
+; for rows 0,4 - on cos_4_16,
+; for rows 1,7 - on cos_1_16,
+; for rows 2,6 - on cos_2_16,
+; for rows 3,5 - on cos_3_16
+; and are shifted to the left for rise of accuracy
+;
+; -----------------------------------------------------------------------------
+;
+; The 8-point scaled forward DCT algorithm (26a8m)
+;
+; -----------------------------------------------------------------------------
+;
+;#define DCT_8_FRW_COL(x, y)
+; {
+;     short t0, t1, t2, t3, t4, t5, t6, t7;
+;     short tp03, tm03, tp12, tm12, tp65, tm65;
+;     short tp465, tm465, tp765, tm765;
+;
+;     t0 = LEFT_SHIFT(x[0] + x[7]);
+;     t1 = LEFT_SHIFT(x[1] + x[6]);
+;     t2 = LEFT_SHIFT(x[2] + x[5]);
+;     t3 = LEFT_SHIFT(x[3] + x[4]);
+;     t4 = LEFT_SHIFT(x[3] - x[4]);
+;     t5 = LEFT_SHIFT(x[2] - x[5]);
+;     t6 = LEFT_SHIFT(x[1] - x[6]);
+;     t7 = LEFT_SHIFT(x[0] - x[7]);
+;
+;     tp03 = t0 + t3;
+;     tm03 = t0 - t3;
+;     tp12 = t1 + t2;
+;     tm12 = t1 - t2;
+;
+;     y[0] = tp03 + tp12;
+;     y[4] = tp03 - tp12;
+;
+;     y[2] = tm03 + tm12 * tg_2_16;
+;     y[6] = tm03 * tg_2_16 - tm12;
+;
+;     tp65 = (t6 + t5) * cos_4_16;
+;     tm65 = (t6 - t5) * cos_4_16;
+;
+;     tp765 = t7 + tp65;
+;     tm765 = t7 - tp65;
+;     tp465 = t4 + tm65;
+;     tm465 = t4 - tm65;
+;
+;     y[1] = tp765 + tp465 * tg_1_16;
+;     y[7] = tp765 * tg_1_16 - tp465;
+;     y[5] = tm765 * tg_3_16 + tm465;
+;     y[3] = tm765 - tm465 * tg_3_16;
+; }
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; DCT_8_INV_COL_4  INP,OUT
+; -----------------------------------------------------------------------------
+%macro DCT_8_INV_COL 1
+    movq        mm0, [tan3]
+    movq        mm3, [%1+16*3]
+    movq        mm1, mm0 ; tg_3_16
+    movq        mm5, [%1+16*5]
+    pmulhw      mm0, mm3 ; x3*(tg_3_16-1)
+    movq        mm4, [tan1]
+    pmulhw      mm1, mm5 ; x5*(tg_3_16-1)
+    movq        mm7, [%1+16*7]
+    movq        mm2, mm4 ; tg_1_16
+    movq        mm6, [%1+16*1]
+    pmulhw      mm4, mm7 ; x7*tg_1_16
+    paddsw      mm0, mm3 ; x3*tg_3_16
+    pmulhw      mm2, mm6 ; x1*tg_1_16
+    paddsw      mm1, mm3 ; x3+x5*(tg_3_16-1)
+    psubsw      mm0, mm5 ; x3*tg_3_16-x5 = tm35
+    movq        mm3, [sqrt2]
+    paddsw      mm1, mm5 ; x3+x5*tg_3_16 = tp35
+    paddsw      mm4, mm6 ; x1+tg_1_16*x7 = tp17
+    psubsw      mm2, mm7 ; x1*tg_1_16-x7 = tm17
+    movq        mm5, mm4 ; tp17
+    movq        mm6, mm2 ; tm17
+    paddsw      mm5, mm1 ; tp17+tp35 = b0
+    psubsw      mm6, mm0 ; tm17-tm35 = b3
+    psubsw      mm4, mm1 ; tp17-tp35 = t1
+    paddsw      mm2, mm0 ; tm17+tm35 = t2
+    movq        mm7, [tan2]
+    movq        mm1, mm4 ; t1
+    movq  [%1+3*16], mm5 ; save b0
+    paddsw      mm1, mm2 ; t1+t2
+    movq  [%1+5*16], mm6 ; save b3
+    psubsw      mm4, mm2 ; t1-t2
+    movq        mm5, [%1+2*16]
+    movq        mm0, mm7 ; tg_2_16
+    movq        mm6, [%1+6*16]
+    pmulhw      mm0, mm5 ; x2*tg_2_16
+    pmulhw      mm7, mm6 ; x6*tg_2_16
+    pmulhw      mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
+    movq        mm2, [%1+0*16]
+    pmulhw      mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
+    psubsw      mm0, mm6 ; t2*tg_2_16-x6 = tm26
+    movq        mm3, mm2 ; x0
+    movq        mm6, [%1+4*16]
+    paddsw      mm7, mm5 ; x2+x6*tg_2_16 = tp26
+    paddsw      mm2, mm6 ; x0+x4 = tp04
+    psubsw      mm3, mm6 ; x0-x4 = tm04
+    movq        mm5, mm2 ; tp04
+    movq        mm6, mm3 ; tm04
+    psubsw      mm2, mm7 ; tp04-tp26 = a3
+    paddsw      mm3, mm0 ; tm04+tm26 = a1
+    paddsw      mm1, mm1 ; b1
+    paddsw      mm4, mm4 ; b2
+    paddsw      mm5, mm7 ; tp04+tp26 = a0
+    psubsw      mm6, mm0 ; tm04-tm26 = a2
+    movq        mm7, mm3 ; a1
+    movq        mm0, mm6 ; a2
+    paddsw      mm3, mm1 ; a1+b1
+    paddsw      mm6, mm4 ; a2+b2
+    psraw       mm3, 6   ; dst1
+    psubsw      mm7, mm1 ; a1-b1
+    psraw       mm6, 6   ; dst2
+    psubsw      mm0, mm4 ; a2-b2
+    movq        mm1, [%1+3*16] ; load b0
+    psraw       mm7, 6   ; dst6
+    movq        mm4, mm5 ; a0
+    psraw       mm0, 6   ; dst5
+    movq  [%1+1*16], mm3
+    paddsw      mm5, mm1 ; a0+b0
+    movq  [%1+2*16], mm6
+    psubsw      mm4, mm1 ; a0-b0
+    movq        mm3, [%1+5*16] ; load b3
+    psraw       mm5, 6   ; dst0
+    movq        mm6, mm2 ; a3
+    psraw       mm4, 6   ; dst7
+    movq  [%1+5*16], mm0
+    paddsw      mm2, mm3 ; a3+b3
+    movq  [%1+6*16], mm7
+    psubsw      mm6, mm3 ; a3-b3
+    movq  [%1+0*16], mm5
+    psraw       mm2, 6   ; dst3
+    movq  [%1+7*16], mm4
+    psraw       mm6, 6   ; dst4
+    movq  [%1+3*16], mm2
+    movq  [%1+4*16], mm6
+%endmacro
+
+%macro XVID_IDCT_MMX 0
+cglobal xvid_idct, 1, 1, 0, block
+%if cpuflag(mmxext)
+%define TAB tab_i_04_xmm
+%else
+%define TAB tab_i_04_mmx
+%endif
+    ; Process each row - beware of rounder offset
+    DCT_8_INV_ROW  0, TAB + 64 * 0, 0*16
+    DCT_8_INV_ROW  1, TAB + 64 * 1, 1*16
+    DCT_8_INV_ROW  2, TAB + 64 * 2, 2*16
+    DCT_8_INV_ROW  3, TAB + 64 * 3, 3*16
+    DCT_8_INV_ROW  4, TAB + 64 * 0, 6*16
+    DCT_8_INV_ROW  5, TAB + 64 * 3, 4*16
+    DCT_8_INV_ROW  6, TAB + 64 * 2, 5*16
+    DCT_8_INV_ROW  7, TAB + 64 * 1, 5*16
+
+    ; Process the columns (4 at a time)
+    DCT_8_INV_COL  r0+0
+    DCT_8_INV_COL  r0+8
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+XVID_IDCT_MMX
+INIT_MMX mmxext
+XVID_IDCT_MMX
+
+%endif ; ~ARCH_X86_32
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 6640b6b..edb5ebf 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -1,20 +1,20 @@
 /*
  * XVID MPEG-4 VIDEO CODEC
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@ void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
 void ff_xvid_idct_sse2(short *block);
-void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block);
-void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
+void ff_xvid_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, short *block);
 
 #endif /* AVCODEC_X86_XVIDIDCT_H */
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index e4f7345..a91b416 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,36 @@
 #include "idctdsp.h"
 #include "xvididct.h"
 
+#if ARCH_X86_32 && HAVE_X86ASM
+static void xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+    ff_xvid_idct_mmx(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+    ff_xvid_idct_mmx(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+    ff_xvid_idct_mmxext(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, short *block)
+{
+    ff_xvid_idct_mmxext(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
+
 av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
+#if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
     if (high_bit_depth ||
@@ -36,24 +63,27 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
           avctx->idct_algo == FF_IDCT_XVID))
         return;
 
-    if (INLINE_MMX(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_mmx_put;
-        c->idct_add  = ff_xvid_idct_mmx_add;
+#if ARCH_X86_32
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->idct_put  = xvid_idct_mmx_put;
+        c->idct_add  = xvid_idct_mmx_add;
         c->idct      = ff_xvid_idct_mmx;
         c->perm_type = FF_IDCT_PERM_NONE;
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_mmxext_put;
-        c->idct_add  = ff_xvid_idct_mmxext_add;
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->idct_put  = xvid_idct_mmxext_put;
+        c->idct_add  = xvid_idct_mmxext_add;
         c->idct      = ff_xvid_idct_mmxext;
         c->perm_type = FF_IDCT_PERM_NONE;
     }
+#endif
 
-    if (INLINE_SSE2(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_sse2_put;
-        c->idct_add  = ff_xvid_idct_sse2_add;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->idct_put  = ff_xvid_idct_put_sse2;
+        c->idct_add  = ff_xvid_idct_add_sse2;
         c->idct      = ff_xvid_idct_sse2;
         c->perm_type = FF_IDCT_PERM_SSE2;
     }
+#endif /* HAVE_X86ASM */
 }
diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c
deleted file mode 100644
index 9bb407c..0000000
--- a/libavcodec/x86/xvididct_mmx.c
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - MMX and XMM forward discrete cosine transform -
- *
- * Copyright(C) 2001 Peter Ross <pross@xvid.org>
- *
- * Originally provided by Intel at AP-922
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
- * but in a limited edition.
- * New macro implements a column part for precise iDCT
- * The routine precision now satisfies IEEE standard 1180-1990.
- *
- * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
- * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
- *
- * http://www.elecard.com/peter/idct.html
- * http://www.linuxvideo.org/mpeg2dec/
- *
- * These examples contain code fragments for first stage iDCT 8x8
- * (for rows) and first stage DCT 8x8 (for columns)
- *
- * conversion to gcc syntax by Michael Niedermayer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#include "libavcodec/avcodec.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_MMX_INLINE
-
-// -----------------------------------------------------------------------------
-// Various memory constants (trigonometric values or rounding values)
-// -----------------------------------------------------------------------------
-
-DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4 * 4] = {
-     13036,  13036,  13036,  13036, // tg * (2 << 16) + 0.5
-     27146,  27146,  27146,  27146, // tg * (2 << 16) + 0.5
-    -21746, -21746, -21746, -21746, // tg * (2 << 16) + 0.5
-     23170,  23170,  23170,  23170
-};                                  // cos * (2 << 15) + 0.5
-
-DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2 * 8] = {
-    65536, 65536,
-    3597,   3597,
-    2260,   2260,
-    1203,   1203,
-    0,         0,
-    120,     120,
-    512,     512,
-    512, 512
-};
-
-// -----------------------------------------------------------------------------
-//
-// The first stage iDCT 8x8 - inverse DCTs of rows
-//
-// -----------------------------------------------------------------------------
-// The 8-point inverse DCT direct algorithm
-// -----------------------------------------------------------------------------
-//
-// static const short w[32] = {
-//     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
-//     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
-//     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
-//     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
-//     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
-//     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
-//     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
-//     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
-//
-// #define DCT_8_INV_ROW(x, y)
-// {
-//     int a0, a1, a2, a3, b0, b1, b2, b3;
-//
-//     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
-//     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
-//     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
-//     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
-//     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
-//     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
-//     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
-//     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
-//
-//     y[0] = SHIFT_ROUND(a0 + b0);
-//     y[1] = SHIFT_ROUND(a1 + b1);
-//     y[2] = SHIFT_ROUND(a2 + b2);
-//     y[3] = SHIFT_ROUND(a3 + b3);
-//     y[4] = SHIFT_ROUND(a3 - b3);
-//     y[5] = SHIFT_ROUND(a2 - b2);
-//     y[6] = SHIFT_ROUND(a1 - b1);
-//     y[7] = SHIFT_ROUND(a0 - b0);
-// }
-//
-// -----------------------------------------------------------------------------
-//
-// In this implementation the outputs of the iDCT-1D are multiplied
-//     for rows 0,4 - by cos_4_16,
-//     for rows 1,7 - by cos_1_16,
-//     for rows 2,6 - by cos_2_16,
-//     for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy.
-//
-// For the constants used,
-//     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// Tables for mmx processors
-// -----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32 * 4] = {
-     16384,  16384,  16384, -16384, // movq-> w06 w04 w02 w00
-     21407,   8867,   8867, -21407, // w07 w05 w03 w01
-     16384, -16384,  16384,  16384, // w14 w12 w10 w08
-     -8867,  21407, -21407,  -8867, // w15 w13 w11 w09
-     22725,  12873,  19266, -22725, // w22 w20 w18 w16
-     19266,   4520,  -4520, -12873, // w23 w21 w19 w17
-     12873,   4520,   4520,  19266, // w30 w28 w26 w24
-    -22725,  19266, -12873, -22725, // w31 w29 w27 w25
-// Table for rows 1,7 - constants are multiplied by cos_1_16
-     22725,  22725,  22725, -22725, // movq-> w06 w04 w02 w00
-     29692,  12299,  12299, -29692, // w07 w05 w03 w01
-     22725, -22725,  22725,  22725, // w14 w12 w10 w08
-    -12299,  29692, -29692, -12299, // w15 w13 w11 w09
-     31521,  17855,  26722, -31521, // w22 w20 w18 w16
-     26722,   6270,  -6270, -17855, // w23 w21 w19 w17
-     17855,   6270,   6270,  26722, // w30 w28 w26 w24
-    -31521,  26722, -17855, -31521, // w31 w29 w27 w25
-// Table for rows 2,6 - constants are multiplied by cos_2_16
-     21407,  21407,  21407, -21407, // movq-> w06 w04 w02 w00
-     27969,  11585,  11585, -27969, // w07 w05 w03 w01
-     21407, -21407,  21407,  21407, // w14 w12 w10 w08
-    -11585,  27969, -27969, -11585, // w15 w13 w11 w09
-     29692,  16819,  25172, -29692, // w22 w20 w18 w16
-     25172,   5906,  -5906, -16819, // w23 w21 w19 w17
-     16819,   5906,   5906,  25172, // w30 w28 w26 w24
-    -29692,  25172, -16819, -29692, // w31 w29 w27 w25
-// Table for rows 3,5 - constants are multiplied by cos_3_16
-     19266,  19266,  19266, -19266, // movq-> w06 w04 w02 w00
-     25172,  10426,  10426, -25172, // w07 w05 w03 w01
-     19266, -19266,  19266,  19266, // w14 w12 w10 w08
-    -10426,  25172, -25172, -10426, // w15 w13 w11 w09
-     26722,  15137,  22654, -26722, // w22 w20 w18 w16
-     22654,   5315,  -5315, -15137, // w23 w21 w19 w17
-     15137,   5315,   5315,  22654, // w30 w28 w26 w24
-    -26722,  22654, -15137, -26722, // w31 w29 w27 w25
-};
-// -----------------------------------------------------------------------------
-// Tables for xmm processors
-// -----------------------------------------------------------------------------
-
-// %3 for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32 * 4] = {
-     16384,  21407,  16384,   8867, // movq-> w05 w04 w01 w00
-     16384,   8867, -16384, -21407, // w07 w06 w03 w02
-     16384,  -8867,  16384, -21407, // w13 w12 w09 w08
-    -16384,  21407,  16384,  -8867, // w15 w14 w11 w10
-     22725,  19266,  19266,  -4520, // w21 w20 w17 w16
-     12873,   4520, -22725, -12873, // w23 w22 w19 w18
-     12873, -22725,   4520, -12873, // w29 w28 w25 w24
-      4520,  19266,  19266, -22725, // w31 w30 w27 w26
-// %3 for rows 1,7 - constants are multiplied by cos_1_16
-     22725,  29692,  22725,  12299, // movq-> w05 w04 w01 w00
-     22725,  12299, -22725, -29692, // w07 w06 w03 w02
-     22725, -12299,  22725, -29692, // w13 w12 w09 w08
-    -22725,  29692,  22725, -12299, // w15 w14 w11 w10
-     31521,  26722,  26722,  -6270, // w21 w20 w17 w16
-     17855,   6270, -31521, -17855, // w23 w22 w19 w18
-     17855, -31521,   6270, -17855, // w29 w28 w25 w24
-      6270,  26722,  26722, -31521, // w31 w30 w27 w26
-// %3 for rows 2,6 - constants are multiplied by cos_2_16
-     21407,  27969,  21407,  11585, // movq-> w05 w04 w01 w00
-     21407,  11585, -21407, -27969, // w07 w06 w03 w02
-     21407, -11585,  21407, -27969, // w13 w12 w09 w08
-    -21407,  27969,  21407, -11585, // w15 w14 w11 w10
-     29692,  25172,  25172,  -5906, // w21 w20 w17 w16
-     16819,   5906, -29692, -16819, // w23 w22 w19 w18
-     16819, -29692,   5906, -16819, // w29 w28 w25 w24
-      5906,  25172,  25172, -29692, // w31 w30 w27 w26
-// %3 for rows 3,5 - constants are multiplied by cos_3_16
-     19266,  25172,  19266,  10426, // movq-> w05 w04 w01 w00
-     19266,  10426, -19266, -25172, // w07 w06 w03 w02
-     19266, -10426,  19266, -25172, // w13 w12 w09 w08
-    -19266,  25172,  19266, -10426, // w15 w14 w11 w10
-     26722,  22654,  22654,  -5315, // w21 w20 w17 w16
-     15137,   5315, -26722, -15137, // w23 w22 w19 w18
-     15137, -26722,   5315, -15137, // w29 w28 w25 w24
-      5315,  22654,  22654, -26722, // w31 w30 w27 w26
-};
-// =============================================================================
-// Helper macros for the code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_MMX(A1, A2, A3, A4)                                       \
-    "movq       "#A1", %%mm0    \n\t" /* 0 ; x3 x2 x1 x0 */                     \
-    "movq     8+"#A1", %%mm1    \n\t" /* 1 ; x7 x6 x5 x4 */                     \
-    "movq       %%mm0, %%mm2    \n\t" /* 2 ; x3 x2 x1 x0 */                     \
-    "movq       "#A3", %%mm3    \n\t" /* 3 ; w06 w04 w02 w00 */                 \
-    "punpcklwd  %%mm1, %%mm0    \n\t" /* x5 x1 x4 x0 */                         \
-    "movq       %%mm0, %%mm5    \n\t" /* 5 ; x5 x1 x4 x0 */                     \
-    "punpckldq  %%mm0, %%mm0    \n\t" /* x4 x0 x4 x0 */                         \
-    "movq     8+"#A3", %%mm4    \n\t" /* 4 ; w07 w05 w03 w01 */                 \
-    "punpckhwd  %%mm1, %%mm2    \n\t" /* 1 ; x7 x3 x6 x2 */                     \
-    "pmaddwd    %%mm0, %%mm3    \n\t" /* x4*w06+x0*w04 x4*w02+x0*w00 */         \
-    "movq       %%mm2, %%mm6    \n\t" /* 6 ; x7 x3 x6 x2 */                     \
-    "movq    32+"#A3", %%mm1    \n\t" /* 1 ; w22 w20 w18 w16 */                 \
-    "punpckldq  %%mm2, %%mm2    \n\t" /* x6 x2 x6 x2 */                         \
-    "pmaddwd    %%mm2, %%mm4    \n\t" /* x6*w07+x2*w05 x6*w03+x2*w01 */         \
-    "punpckhdq  %%mm5, %%mm5    \n\t" /* x5 x1 x5 x1 */                         \
-    "pmaddwd 16+"#A3", %%mm0    \n\t" /* x4*w14+x0*w12 x4*w10+x0*w08 */         \
-    "punpckhdq  %%mm6, %%mm6    \n\t" /* x7 x3 x7 x3 */                         \
-    "movq 40+   "#A3", %%mm7    \n\t" /* 7 ; w23 w21 w19 w17 */                 \
-    "pmaddwd    %%mm5, %%mm1    \n\t" /* x5*w22+x1*w20 x5*w18+x1*w16 */         \
-    "paddd      "#A4", %%mm3    \n\t" /* +%4 */                                 \
-    "pmaddwd    %%mm6, %%mm7    \n\t" /* x7*w23+x3*w21 x7*w19+x3*w17 */         \
-    "pmaddwd 24+"#A3", %%mm2    \n\t" /* x6*w15+x2*w13 x6*w11+x2*w09 */         \
-    "paddd      %%mm4, %%mm3    \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */     \
-    "pmaddwd 48+"#A3", %%mm5    \n\t" /* x5*w30+x1*w28 x5*w26+x1*w24 */         \
-    "movq       %%mm3, %%mm4    \n\t" /* 4 ; a1 a0 */                           \
-    "pmaddwd 56+"#A3", %%mm6    \n\t" /* x7*w31+x3*w29 x7*w27+x3*w25 */         \
-    "paddd      %%mm7, %%mm1    \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */       \
-    "paddd      "#A4", %%mm0    \n\t" /* +%4 */                                 \
-    "psubd      %%mm1, %%mm3    \n\t" /* a1-b1 a0-b0 */                         \
-    "psrad        $11, %%mm3    \n\t" /* y6=a1-b1 y7=a0-b0 */                   \
-    "paddd      %%mm4, %%mm1    \n\t" /* 4 ; a1+b1 a0+b0 */                     \
-    "paddd      %%mm2, %%mm0    \n\t" /* 2 ; a3=sum(even3) a2=sum(even2) */     \
-    "psrad        $11, %%mm1    \n\t" /* y1=a1+b1 y0=a0+b0 */                   \
-    "paddd      %%mm6, %%mm5    \n\t" /* 6 ; b3=sum(odd3) b2=sum(odd2) */       \
-    "movq       %%mm0, %%mm4    \n\t" /* 4 ; a3 a2 */                           \
-    "paddd      %%mm5, %%mm0    \n\t" /* a3+b3 a2+b2 */                         \
-    "psubd      %%mm5, %%mm4    \n\t" /* 5 ; a3-b3 a2-b2 */                     \
-    "psrad        $11, %%mm0    \n\t" /* y3=a3+b3 y2=a2+b2 */                   \
-    "psrad        $11, %%mm4    \n\t" /* y4=a3-b3 y5=a2-b2 */                   \
-    "packssdw   %%mm0, %%mm1    \n\t" /* 0 ; y3 y2 y1 y0 */                     \
-    "packssdw   %%mm3, %%mm4    \n\t" /* 3 ; y6 y7 y4 y5 */                     \
-    "movq       %%mm4, %%mm7    \n\t" /* 7 ; y6 y7 y4 y5 */                     \
-    "psrld        $16, %%mm4    \n\t" /* 0 y6 0 y4 */                           \
-    "pslld        $16, %%mm7    \n\t" /* y7 0 y5 0 */                           \
-    "movq       %%mm1, "#A2"    \n\t" /* 1 ; save y3 y2 y1 y0 */                \
-    "por        %%mm4, %%mm7    \n\t" /* 4 ; y7 y6 y5 y4 */                     \
-    "movq       %%mm7, 8+"#A2"  \n\t" /* 7 ; save y7 y6 y5 y4 */                \
-
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_XMM(A1, A2, A3, A4)                                       \
-    "movq       "#A1", %%mm0        \n\t" /* 0 ; x3 x2 x1 x0 */                 \
-    "movq     8+"#A1", %%mm1        \n\t" /* 1 ; x7 x6 x5 x4 */                 \
-    "movq       %%mm0, %%mm2        \n\t" /* 2 ; x3 x2 x1 x0 */                 \
-    "movq       "#A3", %%mm3        \n\t" /* 3 ; w05 w04 w01 w00 */             \
-    "pshufw     $0x88, %%mm0, %%mm0 \n\t" /* x2 x0 x2 x0 */                     \
-    "movq     8+"#A3", %%mm4        \n\t" /* 4 ; w07 w06 w03 w02 */             \
-    "movq       %%mm1, %%mm5        \n\t" /* 5 ; x7 x6 x5 x4 */                 \
-    "pmaddwd    %%mm0, %%mm3        \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */     \
-    "movq    32+"#A3", %%mm6        \n\t" /* 6 ; w21 w20 w17 w16 */             \
-    "pshufw     $0x88, %%mm1, %%mm1 \n\t" /* x6 x4 x6 x4 */                     \
-    "pmaddwd    %%mm1, %%mm4        \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */     \
-    "movq    40+"#A3", %%mm7        \n\t" /* 7; w23 w22 w19 w18 */              \
-    "pshufw     $0xdd, %%mm2, %%mm2 \n\t" /* x3 x1 x3 x1 */                     \
-    "pmaddwd    %%mm2, %%mm6        \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */     \
-    "pshufw     $0xdd, %%mm5, %%mm5 \n\t" /* x7 x5 x7 x5 */                     \
-    "pmaddwd    %%mm5, %%mm7        \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */     \
-    "paddd      "#A4", %%mm3        \n\t" /* +%4 */                             \
-    "pmaddwd 16+"#A3", %%mm0        \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */     \
-    "paddd      %%mm4, %%mm3        \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
-    "pmaddwd 24+"#A3", %%mm1        \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */     \
-    "movq       %%mm3, %%mm4        \n\t" /* 4 ; a1 a0 */                       \
-    "pmaddwd 48+"#A3", %%mm2        \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */     \
-    "paddd      %%mm7, %%mm6        \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */   \
-    "pmaddwd 56+"#A3", %%mm5        \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */     \
-    "paddd      %%mm6, %%mm3        \n\t" /* a1+b1 a0+b0 */                     \
-    "paddd      "#A4", %%mm0        \n\t" /* +%4 */                             \
-    "psrad        $11, %%mm3        \n\t" /* y1=a1+b1 y0=a0+b0 */               \
-    "paddd      %%mm1, %%mm0        \n\t" /* 1 ; a3=sum(even3) a2=sum(even2) */ \
-    "psubd      %%mm6, %%mm4        \n\t" /* 6 ; a1-b1 a0-b0 */                 \
-    "movq       %%mm0, %%mm7        \n\t" /* 7 ; a3 a2 */                       \
-    "paddd      %%mm5, %%mm2        \n\t" /* 5 ; b3=sum(odd3) b2=sum(odd2) */   \
-    "paddd      %%mm2, %%mm0        \n\t" /* a3+b3 a2+b2 */                     \
-    "psrad        $11, %%mm4        \n\t" /* y6=a1-b1 y7=a0-b0 */               \
-    "psubd      %%mm2, %%mm7        \n\t" /* 2 ; a3-b3 a2-b2 */                 \
-    "psrad        $11, %%mm0        \n\t" /* y3=a3+b3 y2=a2+b2 */               \
-    "psrad        $11, %%mm7        \n\t" /* y4=a3-b3 y5=a2-b2 */               \
-    "packssdw   %%mm0, %%mm3        \n\t" /* 0 ; y3 y2 y1 y0 */                 \
-    "packssdw   %%mm4, %%mm7        \n\t" /* 4 ; y6 y7 y4 y5 */                 \
-    "movq       %%mm3, "#A2"        \n\t" /* 3 ; save y3 y2 y1 y0 */            \
-    "pshufw     $0xb1, %%mm7, %%mm7 \n\t" /* y7 y6 y5 y4 */                     \
-    "movq       %%mm7, 8+"#A2"      \n\t" /* 7 ; save y7 y6 y5 y4 */            \
-
-
-// -----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The %2puts are multiplied
-// for rows 0,4 - on cos_4_16,
-// for rows 1,7 - on cos_1_16,
-// for rows 2,6 - on cos_2_16,
-// for rows 3,5 - on cos_3_16
-// and are shifted to the left for rise of accuracy
-//
-// -----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-// -----------------------------------------------------------------------------
-//
-//#define DCT_8_FRW_COL(x, y)
-// {
-//     short t0, t1, t2, t3, t4, t5, t6, t7;
-//     short tp03, tm03, tp12, tm12, tp65, tm65;
-//     short tp465, tm465, tp765, tm765;
-//
-//     t0 = LEFT_SHIFT(x[0] + x[7]);
-//     t1 = LEFT_SHIFT(x[1] + x[6]);
-//     t2 = LEFT_SHIFT(x[2] + x[5]);
-//     t3 = LEFT_SHIFT(x[3] + x[4]);
-//     t4 = LEFT_SHIFT(x[3] - x[4]);
-//     t5 = LEFT_SHIFT(x[2] - x[5]);
-//     t6 = LEFT_SHIFT(x[1] - x[6]);
-//     t7 = LEFT_SHIFT(x[0] - x[7]);
-//
-//     tp03 = t0 + t3;
-//     tm03 = t0 - t3;
-//     tp12 = t1 + t2;
-//     tm12 = t1 - t2;
-//
-//     y[0] = tp03 + tp12;
-//     y[4] = tp03 - tp12;
-//
-//     y[2] = tm03 + tm12 * tg_2_16;
-//     y[6] = tm03 * tg_2_16 - tm12;
-//
-//     tp65 = (t6 + t5) * cos_4_16;
-//     tm65 = (t6 - t5) * cos_4_16;
-//
-//     tp765 = t7 + tp65;
-//     tm765 = t7 - tp65;
-//     tp465 = t4 + tm65;
-//     tm465 = t4 - tm65;
-//
-//     y[1] = tp765 + tp465 * tg_1_16;
-//     y[7] = tp765 * tg_1_16 - tp465;
-//     y[5] = tm765 * tg_3_16 + tm465;
-//     y[3] = tm765 - tm465 * tg_3_16;
-// }
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_COL_4  INP,OUT
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_COL(A1, A2)                                                   \
-    "movq    2*8(%3), %%mm0         \n\t"                                       \
-    "movq 16*3+"#A1", %%mm3         \n\t"                                       \
-    "movq      %%mm0, %%mm1         \n\t" /* tg_3_16 */                         \
-    "movq 16*5+"#A1", %%mm5         \n\t"                                       \
-    "pmulhw    %%mm3, %%mm0         \n\t" /* x3*(tg_3_16-1) */                  \
-    "movq       (%3), %%mm4         \n\t"                                       \
-    "pmulhw    %%mm5, %%mm1         \n\t" /* x5*(tg_3_16-1) */                  \
-    "movq 16*7+"#A1", %%mm7         \n\t"                                       \
-    "movq      %%mm4, %%mm2         \n\t" /* tg_1_16 */                         \
-    "movq 16*1+"#A1", %%mm6         \n\t"                                       \
-    "pmulhw    %%mm7, %%mm4         \n\t" /* x7*tg_1_16 */                      \
-    "paddsw    %%mm3, %%mm0         \n\t" /* x3*tg_3_16 */                      \
-    "pmulhw    %%mm6, %%mm2         \n\t" /* x1*tg_1_16 */                      \
-    "paddsw    %%mm3, %%mm1         \n\t" /* x3+x5*(tg_3_16-1) */               \
-    "psubsw    %%mm5, %%mm0         \n\t" /* x3*tg_3_16-x5 = tm35 */            \
-    "movq    3*8(%3), %%mm3         \n\t"                                       \
-    "paddsw    %%mm5, %%mm1         \n\t" /* x3+x5*tg_3_16 = tp35 */            \
-    "paddsw    %%mm6, %%mm4         \n\t" /* x1+tg_1_16*x7 = tp17 */            \
-    "psubsw    %%mm7, %%mm2         \n\t" /* x1*tg_1_16-x7 = tm17 */            \
-    "movq      %%mm4, %%mm5         \n\t" /* tp17 */                            \
-    "movq      %%mm2, %%mm6         \n\t" /* tm17 */                            \
-    "paddsw    %%mm1, %%mm5         \n\t" /* tp17+tp35 = b0 */                  \
-    "psubsw    %%mm0, %%mm6         \n\t" /* tm17-tm35 = b3 */                  \
-    "psubsw    %%mm1, %%mm4         \n\t" /* tp17-tp35 = t1 */                  \
-    "paddsw    %%mm0, %%mm2         \n\t" /* tm17+tm35 = t2 */                  \
-    "movq    1*8(%3), %%mm7         \n\t"                                       \
-    "movq      %%mm4, %%mm1         \n\t" /* t1 */                              \
-    "movq      %%mm5, 3*16+"#A2"    \n\t" /* save b0 */                         \
-    "paddsw    %%mm2, %%mm1         \n\t" /* t1+t2 */                           \
-    "movq      %%mm6, 5*16+"#A2"    \n\t" /* save b3 */                         \
-    "psubsw    %%mm2, %%mm4         \n\t" /* t1-t2 */                           \
-    "movq 2*16+"#A1", %%mm5         \n\t"                                       \
-    "movq      %%mm7, %%mm0         \n\t" /* tg_2_16 */                         \
-    "movq 6*16+"#A1", %%mm6         \n\t"                                       \
-    "pmulhw    %%mm5, %%mm0         \n\t" /* x2*tg_2_16 */                      \
-    "pmulhw    %%mm6, %%mm7         \n\t" /* x6*tg_2_16 */                      \
-    "pmulhw    %%mm3, %%mm1         \n\t" /* ocos_4_16*(t1+t2) = b1/2 */        \
-    "movq 0*16+"#A1", %%mm2         \n\t"                                       \
-    "pmulhw    %%mm3, %%mm4         \n\t" /* ocos_4_16*(t1-t2) = b2/2 */        \
-    "psubsw    %%mm6, %%mm0         \n\t" /* t2*tg_2_16-x6 = tm26 */            \
-    "movq      %%mm2, %%mm3         \n\t" /* x0 */                              \
-    "movq 4*16+"#A1", %%mm6         \n\t"                                       \
-    "paddsw    %%mm5, %%mm7         \n\t" /* x2+x6*tg_2_16 = tp26 */            \
-    "paddsw    %%mm6, %%mm2         \n\t" /* x0+x4 = tp04 */                    \
-    "psubsw    %%mm6, %%mm3         \n\t" /* x0-x4 = tm04 */                    \
-    "movq      %%mm2, %%mm5         \n\t" /* tp04 */                            \
-    "movq      %%mm3, %%mm6         \n\t" /* tm04 */                            \
-    "psubsw    %%mm7, %%mm2         \n\t" /* tp04-tp26 = a3 */                  \
-    "paddsw    %%mm0, %%mm3         \n\t" /* tm04+tm26 = a1 */                  \
-    "paddsw    %%mm1, %%mm1         \n\t" /* b1 */                              \
-    "paddsw    %%mm4, %%mm4         \n\t" /* b2 */                              \
-    "paddsw    %%mm7, %%mm5         \n\t" /* tp04+tp26 = a0 */                  \
-    "psubsw    %%mm0, %%mm6         \n\t" /* tm04-tm26 = a2 */                  \
-    "movq      %%mm3, %%mm7         \n\t" /* a1 */                              \
-    "movq      %%mm6, %%mm0         \n\t" /* a2 */                              \
-    "paddsw    %%mm1, %%mm3         \n\t" /* a1+b1 */                           \
-    "paddsw    %%mm4, %%mm6         \n\t" /* a2+b2 */                           \
-    "psraw        $6, %%mm3         \n\t" /* dst1 */                            \
-    "psubsw    %%mm1, %%mm7         \n\t" /* a1-b1 */                           \
-    "psraw        $6, %%mm6         \n\t" /* dst2 */                            \
-    "psubsw    %%mm4, %%mm0         \n\t" /* a2-b2 */                           \
-    "movq 3*16+"#A2", %%mm1         \n\t" /* load b0 */                         \
-    "psraw        $6, %%mm7         \n\t" /* dst6 */                            \
-    "movq      %%mm5, %%mm4         \n\t" /* a0 */                              \
-    "psraw        $6, %%mm0         \n\t" /* dst5 */                            \
-    "movq      %%mm3, 1*16+"#A2"    \n\t"                                       \
-    "paddsw    %%mm1, %%mm5         \n\t" /* a0+b0 */                           \
-    "movq      %%mm6, 2*16+"#A2"    \n\t"                                       \
-    "psubsw    %%mm1, %%mm4         \n\t" /* a0-b0 */                           \
-    "movq 5*16+"#A2", %%mm3         \n\t" /* load b3 */                         \
-    "psraw        $6, %%mm5         \n\t" /* dst0 */                            \
-    "movq      %%mm2, %%mm6         \n\t" /* a3 */                              \
-    "psraw        $6, %%mm4         \n\t" /* dst7 */                            \
-    "movq      %%mm0, 5*16+"#A2"    \n\t"                                       \
-    "paddsw    %%mm3, %%mm2         \n\t" /* a3+b3 */                           \
-    "movq      %%mm7, 6*16+"#A2"    \n\t"                                       \
-    "psubsw    %%mm3, %%mm6         \n\t" /* a3-b3 */                           \
-    "movq      %%mm5, 0*16+"#A2"    \n\t"                                       \
-    "psraw        $6, %%mm2         \n\t" /* dst3 */                            \
-    "movq      %%mm4, 7*16+"#A2"    \n\t"                                       \
-    "psraw        $6, %%mm6         \n\t" /* dst4 */                            \
-    "movq      %%mm2, 3*16+"#A2"    \n\t"                                       \
-    "movq      %%mm6, 4*16+"#A2"    \n\t"                                       \
-
-// =============================================================================
-// Code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// void idct_mmx(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmx(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmx_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmx_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-// -----------------------------------------------------------------------------
-// void idct_xmm(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmxext(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_XMM(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_XMM(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_XMM(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_xmm), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmxext_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmxext_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
deleted file mode 100644
index 0de59a5..0000000
--- a/libavcodec/x86/xvididct_sse2.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of Libav.
- *
- * Vertical pass is an implementation of the scheme:
- *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
- *  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- *  Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- *  Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 IDCT compatible with the Xvid IDCT
- */
-
-#define X8(x) x, x, x, x, x, x, x, x
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[]  = { X8(13036) }; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[]  = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[]  = { X8(43790) }; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8,  uint8_t, m127)[]  = { X8(127) };
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
-    0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
-    0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
-    0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
-    0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
-    0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
-    0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
-    0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
-    0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
-    0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
-    0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
-    0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
-    0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
-    0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
-    0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
-    0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
-    0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
-    65536, 65536, 65536, 65536,
-     3597,  3597,  3597,  3597,
-     2260,  2260,  2260,  2260,
-     1203,  1203,  1203,  1203,
-      120,   120,   120,   120,
-      512,   512,   512,   512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor  "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
-    "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \
-    "movdqa          %%xmm2, "dst"    \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd   "MANGLE(x)
-
-#define JZ(reg, to)                         \
-    "testl     "reg","reg"            \n\t" \
-    "jz        "to"                   \n\t"
-
-#define JNZ(reg, to)                        \
-    "testl     "reg","reg"            \n\t" \
-    "jnz       "to"                   \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear)       \
-    clear                                   \
-    "movq     "src", %%mm1            \n\t" \
-    "por    8+"src", %%mm1            \n\t" \
-    "paddusb  %%mm0, %%mm1            \n\t" \
-    "pmovmskb %%mm1, "reg"            \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
-    clear1                                                    \
-    clear2                                                    \
-    "movq     "row1", %%mm1           \n\t"                   \
-    "por    8+"row1", %%mm1           \n\t"                   \
-    "movq     "row2", %%mm2           \n\t"                   \
-    "por    8+"row2", %%mm2           \n\t"                   \
-    "paddusb   %%mm0, %%mm1           \n\t"                   \
-    "paddusb   %%mm0, %%mm2           \n\t"                   \
-    "pmovmskb  %%mm1, "reg1"          \n\t"                   \
-    "pmovmskb  %%mm2, "reg2"          \n\t"
-
-/// IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put)            \
-    "movdqa        "src", %%xmm3      \n\t"            \
-    "movdqa       %%xmm3, %%xmm0      \n\t"            \
-    "pshufd   $0x11, %%xmm3, %%xmm1   \n\t" /* 4602 */ \
-    "punpcklqdq   %%xmm0, %%xmm0      \n\t" /* 0246 */ \
-    "pmaddwd     "table", %%xmm0      \n\t"            \
-    "pmaddwd  16+"table", %%xmm1      \n\t"            \
-    "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t" /* 5713 */ \
-    "punpckhqdq   %%xmm3, %%xmm3      \n\t" /* 1357 */ \
-    "pmaddwd  32+"table", %%xmm2      \n\t"            \
-    "pmaddwd  48+"table", %%xmm3      \n\t"            \
-    "paddd        %%xmm1, %%xmm0      \n\t"            \
-    "paddd        %%xmm3, %%xmm2      \n\t"            \
-    rounder",     %%xmm0              \n\t"            \
-    "movdqa       %%xmm2, %%xmm3      \n\t"            \
-    "paddd        %%xmm0, %%xmm2      \n\t"            \
-    "psubd        %%xmm3, %%xmm0      \n\t"            \
-    "psrad           $11, %%xmm2      \n\t"            \
-    "psrad           $11, %%xmm0      \n\t"            \
-    "packssdw     %%xmm0, %%xmm2      \n\t"            \
-    put                                                \
-    "1:                               \n\t"
-
-#define iLLM_HEAD                           \
-    "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \
-    "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \
-
-/// IDCT pass on columns.
-#define iLLM_PASS(dct)                      \
-    "movdqa   "TAN3", %%xmm1          \n\t" \
-    "movdqa   "TAN1", %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "pmulhw   %%xmm5, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   %%xmm5, %%xmm1          \n\t" \
-    "psubsw   %%xmm5, "TAN3"          \n\t" \
-    "paddsw   %%xmm4, %%xmm1          \n\t" \
-    "pmulhw   %%xmm7, %%xmm3          \n\t" \
-    "pmulhw   %%xmm6, "TAN1"          \n\t" \
-    "paddsw   %%xmm6, %%xmm3          \n\t" \
-    "psubsw   %%xmm7, "TAN1"          \n\t" \
-    "movdqa   %%xmm3, %%xmm7          \n\t" \
-    "movdqa   "TAN1", %%xmm6          \n\t" \
-    "psubsw   %%xmm1, %%xmm3          \n\t" \
-    "psubsw   "TAN3", "TAN1"          \n\t" \
-    "paddsw   %%xmm7, %%xmm1          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   %%xmm3, %%xmm6          \n\t" \
-    "psubsw   "TAN3", %%xmm3          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw   %%xmm4, %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   "TAN3", "TAN3"          \n\t" \
-    "paddsw   %%xmm3, %%xmm3          \n\t" \
-    "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \
-    MOV_32_ONLY ROW2", "REG2"         \n\t" \
-    MOV_32_ONLY ROW6", "REG6"         \n\t" \
-    "movdqa   %%xmm7, %%xmm5          \n\t" \
-    "pmulhw   "REG6", %%xmm7          \n\t" \
-    "pmulhw   "REG2", %%xmm5          \n\t" \
-    "paddsw   "REG2", %%xmm7          \n\t" \
-    "psubsw   "REG6", %%xmm5          \n\t" \
-    MOV_32_ONLY ROW0", "REG0"         \n\t" \
-    MOV_32_ONLY ROW4", "REG4"         \n\t" \
-    MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   "REG4", "REG0"          \n\t" \
-    "paddsw   "XMMS", "REG4"          \n\t" \
-    "movdqa   "REG4", "XMMS"          \n\t" \
-    "psubsw   %%xmm7, "REG4"          \n\t" \
-    "paddsw   "XMMS", %%xmm7          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm5, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm5          \n\t" \
-    "movdqa   %%xmm5, "XMMS"          \n\t" \
-    "psubsw   "TAN3", %%xmm5          \n\t" \
-    "paddsw   "XMMS", "TAN3"          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm3, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm3          \n\t" \
-    MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
-    "psraw        $6, %%xmm5          \n\t" \
-    "psraw        $6, "REG0"          \n\t" \
-    "psraw        $6, "TAN3"          \n\t" \
-    "psraw        $6, %%xmm3          \n\t" \
-    "movdqa   "TAN3", 1*16("dct")     \n\t" \
-    "movdqa   %%xmm3, 2*16("dct")     \n\t" \
-    "movdqa   "REG0", 5*16("dct")     \n\t" \
-    "movdqa   %%xmm5, 6*16("dct")     \n\t" \
-    "movdqa   %%xmm7, %%xmm0          \n\t" \
-    "movdqa   "REG4", %%xmm4          \n\t" \
-    "psubsw   %%xmm1, %%xmm7          \n\t" \
-    "psubsw   "TAN1", "REG4"          \n\t" \
-    "paddsw   %%xmm0, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm1          \n\t" \
-    "psraw        $6, %%xmm7          \n\t" \
-    "psraw        $6, "TAN1"          \n\t" \
-    "psraw        $6, "REG4"          \n\t" \
-    "movdqa   %%xmm1, ("dct")         \n\t" \
-    "movdqa   "TAN1", 3*16("dct")     \n\t" \
-    "movdqa   "REG4", 4*16("dct")     \n\t" \
-    "movdqa   %%xmm7, 7*16("dct")     \n\t"
-
-/// IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct)               \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   %%xmm4, "TAN3"          \n\t" \
-    "movdqa   %%xmm6, %%xmm3          \n\t" \
-    "pmulhw   %%xmm6, "TAN1"          \n\t" \
-    "movdqa   %%xmm4, %%xmm1          \n\t" \
-    "psubsw   %%xmm1, %%xmm3          \n\t" \
-    "paddsw   %%xmm6, %%xmm1          \n\t" \
-    "movdqa   "TAN1", %%xmm6          \n\t" \
-    "psubsw   "TAN3", "TAN1"          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   %%xmm3, %%xmm6          \n\t" \
-    "psubsw   "TAN3", %%xmm3          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw   %%xmm4, %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   "TAN3", "TAN3"          \n\t" \
-    "paddsw   %%xmm3, %%xmm3          \n\t" \
-    "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \
-    MOV_32_ONLY ROW2", "SREG2"        \n\t" \
-    "pmulhw   "SREG2", %%xmm5         \n\t" \
-    MOV_32_ONLY ROW0", "REG0"         \n\t" \
-    "movdqa   "REG0", %%xmm6          \n\t" \
-    "psubsw   "SREG2", %%xmm6         \n\t" \
-    "paddsw   "REG0", "SREG2"         \n\t" \
-    MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm5, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm5          \n\t" \
-    "movdqa   %%xmm5, "XMMS"          \n\t" \
-    "psubsw   "TAN3", %%xmm5          \n\t" \
-    "paddsw   "XMMS", "TAN3"          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm3, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm3          \n\t" \
-    MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
-    "psraw        $6, %%xmm5          \n\t" \
-    "psraw        $6, "REG0"          \n\t" \
-    "psraw        $6, "TAN3"          \n\t" \
-    "psraw        $6, %%xmm3          \n\t" \
-    "movdqa   "TAN3", 1*16("dct")     \n\t" \
-    "movdqa   %%xmm3, 2*16("dct")     \n\t" \
-    "movdqa   "REG0", 5*16("dct")     \n\t" \
-    "movdqa   %%xmm5, 6*16("dct")     \n\t" \
-    "movdqa   "SREG2", %%xmm0         \n\t" \
-    "movdqa   %%xmm6, %%xmm4          \n\t" \
-    "psubsw   %%xmm1, "SREG2"         \n\t" \
-    "psubsw   "TAN1", %%xmm6          \n\t" \
-    "paddsw   %%xmm0, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm1          \n\t" \
-    "psraw        $6, "SREG2"         \n\t" \
-    "psraw        $6, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm6          \n\t" \
-    "movdqa   %%xmm1, ("dct")         \n\t" \
-    "movdqa   "TAN1", 3*16("dct")     \n\t" \
-    "movdqa   %%xmm6, 4*16("dct")     \n\t" \
-    "movdqa   "SREG2", 7*16("dct")    \n\t"
-
-void ff_xvid_idct_sse2(short *block)
-{
-    __asm__ volatile (
-        "movq     "MANGLE (m127) ", %%mm0                              \n\t"
-        iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),          PUT_EVEN(ROW0))
-        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
-        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
-
-        TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
-        JZ("%%eax", "1f")
-        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
-
-        TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
-        TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
-        iLLM_HEAD
-        ".p2align 4 \n\t"
-        JNZ("%%ecx", "2f")
-        JNZ("%%eax", "3f")
-        JNZ("%%edx", "4f")
-        JNZ("%%esi", "5f")
-        iLLM_PASS_SPARSE("%0")
-        "jmp 6f                                                      \n\t"
-        "2:                                                          \n\t"
-        iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
-        "3:                                                          \n\t"
-        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
-        JZ("%%edx", "1f")
-        "4:                                                          \n\t"
-        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
-        JZ("%%esi", "1f")
-        "5:                                                          \n\t"
-        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
-#if ARCH_X86_32
-        iLLM_HEAD
-#endif
-        iLLM_PASS("%0")
-        "6:                                                          \n\t"
-        : "+r" (block)
-        :
-        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                       "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
-#if ARCH_X86_64
-          XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
-                       "%xmm12", "%xmm13", "%xmm14", )
-#endif
-          "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_xvid_idct_sse2_put(uint8_t *dest, ptrdiff_t line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_sse2_add(uint8_t *dest, ptrdiff_t line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */
diff --git a/libavcodec/xan.c b/libavcodec/xan.c
index 5bb4f16..1ccf164 100644
--- a/libavcodec/xan.c
+++ b/libavcodec/xan.c
@@ -2,20 +2,20 @@
  * Wing Commander/Xan Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,8 +37,8 @@
 
 #define BITSTREAM_READER_LE
 #include "avcodec.h"
-#include "bitstream.h"
 #include "bytestream.h"
+#include "get_bits.h"
 #include "internal.h"
 
 #define RUNTIME_GAMMA 0
@@ -55,13 +55,13 @@ typedef struct XanContext {
     AVCodecContext *avctx;
     AVFrame *last_frame;
 
-    const unsigned char *buf;
+    const uint8_t *buf;
     int size;
 
     /* scratch space */
-    unsigned char *buffer1;
+    uint8_t *buffer1;
     int buffer1_size;
-    unsigned char *buffer2;
+    uint8_t *buffer2;
     int buffer2_size;
 
     unsigned *palettes;
@@ -114,25 +114,27 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int xan_huffman_decode(unsigned char *dest, int dest_len,
-                              const unsigned char *src, int src_len)
+static int xan_huffman_decode(uint8_t *dest, int dest_len,
+                              const uint8_t *src, int src_len)
 {
-    unsigned char byte = *src++;
-    unsigned char ival = byte + 0x16;
-    const unsigned char * ptr = src + byte*2;
+    uint8_t byte = *src++;
+    uint8_t ival = byte + 0x16;
+    const uint8_t * ptr = src + byte*2;
     int ptr_len = src_len - 1 - byte*2;
-    unsigned char val = ival;
-    unsigned char *dest_end = dest + dest_len;
-    unsigned char *dest_start = dest;
-    BitstreamContext bc;
+    uint8_t val = ival;
+    uint8_t *dest_end = dest + dest_len;
+    uint8_t *dest_start = dest;
+    int ret;
+    GetBitContext gb;
 
-    if (ptr_len < 0)
-        return AVERROR_INVALIDDATA;
-
-    bitstream_init8(&bc, ptr, ptr_len);
+    if ((ret = init_get_bits8(&gb, ptr, ptr_len)) < 0)
+        return ret;
 
     while (val != 0x16) {
-        unsigned idx = val - 0x17 + bitstream_read_bit(&bc) * byte;
+        unsigned idx;
+        if (get_bits_left(&gb) < 1)
+            return AVERROR_INVALIDDATA;
+        idx = val - 0x17 + get_bits1(&gb) * byte;
         if (idx >= 2 * byte)
             return AVERROR_INVALIDDATA;
         val = src[idx];
@@ -153,13 +155,13 @@ static int xan_huffman_decode(unsigned char *dest, int dest_len,
  *
  * @param dest destination buffer of dest_len, must be padded with at least 130 bytes
  */
-static void xan_unpack(unsigned char *dest, int dest_len,
-                       const unsigned char *src, int src_len)
+static void xan_unpack(uint8_t *dest, int dest_len,
+                       const uint8_t *src, int src_len)
 {
-    unsigned char opcode;
+    uint8_t opcode;
     int size;
-    unsigned char *dest_org = dest;
-    unsigned char *dest_end = dest + dest_len;
+    uint8_t *dest_org = dest;
+    uint8_t *dest_end = dest + dest_len;
     GetByteContext ctx;
 
     bytestream2_init(&ctx, src, src_len);
@@ -208,14 +210,14 @@ static void xan_unpack(unsigned char *dest, int dest_len,
 }
 
 static inline void xan_wc3_output_pixel_run(XanContext *s, AVFrame *frame,
-    const unsigned char *pixel_buffer, int x, int y, int pixel_count)
+    const uint8_t *pixel_buffer, int x, int y, int pixel_count)
 {
     int stride;
     int line_inc;
     int index;
     int current_x;
     int width = s->avctx->width;
-    unsigned char *palette_plane;
+    uint8_t *palette_plane;
 
     palette_plane = frame->data[0];
     stride = frame->linesize[0];
@@ -247,7 +249,7 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, AVFrame *frame,
     int curframe_index, prevframe_index;
     int curframe_x, prevframe_x;
     int width = s->avctx->width;
-    unsigned char *palette_plane, *prev_palette_plane;
+    uint8_t *palette_plane, *prev_palette_plane;
 
     if (y + motion_y < 0 || y + motion_y >= s->avctx->height ||
         x + motion_x < 0 || x + motion_x >= s->avctx->width)
@@ -263,6 +265,12 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, AVFrame *frame,
     curframe_x = x;
     prevframe_index = (y + motion_y) * stride + x + motion_x;
     prevframe_x = x + motion_x;
+
+    if (prev_palette_plane == palette_plane && FFABS(motion_x + width*motion_y) < pixel_count) {
+         avpriv_request_sample(s->avctx, "Overlapping copy");
+         return ;
+    }
+
     while (pixel_count &&
            curframe_index  < s->frame_size &&
            prevframe_index < s->frame_size) {
@@ -295,22 +303,22 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
     int width  = s->avctx->width;
     int height = s->avctx->height;
     int total_pixels = width * height;
-    unsigned char opcode;
-    unsigned char flag = 0;
+    uint8_t opcode;
+    uint8_t flag = 0;
     int size = 0;
     int motion_x, motion_y;
     int x, y, ret;
 
-    unsigned char *opcode_buffer = s->buffer1;
-    unsigned char *opcode_buffer_end = s->buffer1 + s->buffer1_size;
+    uint8_t *opcode_buffer = s->buffer1;
+    uint8_t *opcode_buffer_end = s->buffer1 + s->buffer1_size;
     int opcode_buffer_size = s->buffer1_size;
-    const unsigned char *imagedata_buffer = s->buffer2;
+    const uint8_t *imagedata_buffer = s->buffer2;
 
     /* pointers to segments inside the compressed chunk */
-    const unsigned char *huffman_segment;
+    const uint8_t *huffman_segment;
     GetByteContext       size_segment;
     GetByteContext       vector_segment;
-    const unsigned char *imagedata_segment;
+    const uint8_t *imagedata_segment;
     int huffman_offset, size_offset, vector_offset, imagedata_offset,
         imagedata_size;
 
@@ -383,16 +391,28 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
 
         case 9:
         case 19:
+            if (bytestream2_get_bytes_left(&size_segment) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_byte(&size_segment);
             break;
 
         case 10:
         case 20:
+            if (bytestream2_get_bytes_left(&size_segment) < 2) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_be16(&size_segment);
             break;
 
         case 11:
         case 21:
+            if (bytestream2_get_bytes_left(&size_segment) < 3) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_be24(&size_segment);
             break;
         }
@@ -414,8 +434,13 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
                 imagedata_size -= size;
             }
         } else {
+            uint8_t vector;
+            if (bytestream2_get_bytes_left(&vector_segment) <= 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "vector_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             /* run-based motion compensation from last frame */
-            uint8_t vector = bytestream2_get_byte(&vector_segment);
+            vector = bytestream2_get_byte(&vector_segment);
             motion_x = sign_extend(vector >> 4,  4);
             motion_y = sign_extend(vector & 0xF, 4);
 
@@ -535,6 +560,10 @@ static int xan_decode_frame(AVCodecContext *avctx,
         int i;
         tag  = bytestream2_get_le32(&ctx);
         size = bytestream2_get_be32(&ctx);
+        if (size < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid tag size %d\n", size);
+            return AVERROR_INVALIDDATA;
+        }
         size = FFMIN(size, bytestream2_get_bytes_left(&ctx));
         switch (tag) {
         case PALT_TAG:
@@ -542,8 +571,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             if (s->palettes_count >= PALETTES_MAX)
                 return AVERROR_INVALIDDATA;
-            tmpptr = av_realloc(s->palettes,
-                                (s->palettes_count + 1) * AVPALETTE_SIZE);
+            tmpptr = av_realloc_array(s->palettes,
+                                      s->palettes_count + 1, AVPALETTE_SIZE);
             if (!tmpptr)
                 return AVERROR(ENOMEM);
             s->palettes = tmpptr;
@@ -558,7 +587,7 @@ static int xan_decode_frame(AVCodecContext *avctx,
                 int g = gamma_lookup[bytestream2_get_byteu(&ctx)];
                 int b = gamma_lookup[bytestream2_get_byteu(&ctx)];
 #endif
-                *tmpptr++ = (r << 16) | (g << 8) | b;
+                *tmpptr++ = (0xFFU << 24) | (r << 16) | (g << 8) | b;
             }
             s->palettes_count++;
             break;
@@ -585,10 +614,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF))) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (!s->frame_size)
         s->frame_size = frame->linesize[0] * s->avctx->height;
diff --git a/libavcodec/xbmdec.c b/libavcodec/xbmdec.c
index 2ce1465..d19bdae 100644
--- a/libavcodec/xbmdec.c
+++ b/libavcodec/xbmdec.c
@@ -1,20 +1,22 @@
 /*
  * XBM image format
  *
- * This file is part of Libav.
+ * Copyright (c) 2012 Paul B Mahol
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,43 +26,54 @@
 #include "internal.h"
 #include "mathops.h"
 
+static int convert(uint8_t x)
+{
+    if (x >= 'a')
+        x -= 87;
+    else if (x >= 'A')
+        x -= 55;
+    else
+        x -= '0';
+    return x;
+}
+
+static int parse_str_int(const uint8_t *p, int len, const uint8_t *key)
+{
+    const uint8_t *end = p + len;
+
+    for(; p<end - strlen(key); p++) {
+        if (!memcmp(p, key, strlen(key)))
+            break;
+    }
+    p += strlen(key);
+    if (p >= end)
+        return INT_MIN;
+
+    for(; p<end; p++) {
+        char *eptr;
+        int64_t ret = strtol(p, &eptr, 10);
+        if ((const uint8_t *)eptr != p)
+            return ret;
+    }
+    return INT_MIN;
+}
+
 static int xbm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
 {
     AVFrame *p = data;
-    int ret, linesize, i;
+    int ret, linesize, i, j;
     int width  = 0;
     int height = 0;
-    const uint8_t *ptr = avpkt->data;
+    const uint8_t *end, *ptr = avpkt->data;
+    const uint8_t *next;
     uint8_t *dst;
 
     avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-    while (!width || !height) {
-        ptr += strcspn(ptr, "#");
-        if (ptr >= avpkt->data + avpkt->size) {
-            av_log(avctx, AV_LOG_ERROR, "End of file reached.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        if (strncmp(ptr, "#define", 7) != 0) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Unexpected preprocessor directive.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        // skip the name
-        ptr += strcspn(ptr, "_") + 1;
-        // get width or height
-        if (strncmp(ptr, "width", 5) == 0) {
-            ptr += strcspn(ptr, " ");
-            width = strtol(ptr, NULL, 10);
-        } else if (strncmp(ptr, "height", 6) == 0) {
-            ptr += strcspn(ptr, " ");
-            height = strtol(ptr, NULL, 10);
-        } else {
-            // skip offset and unknown variables
-            av_log(avctx, AV_LOG_VERBOSE,
-                   "Ignoring preprocessor directive.\n");
-        }
-    }
+    end = avpkt->data + avpkt->size;
+
+    width  = parse_str_int(avpkt->data, avpkt->size, "_width");
+    height = parse_str_int(avpkt->data, avpkt->size, "_height");
 
     if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
         return ret;
@@ -68,46 +81,48 @@ static int xbm_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
 
-    // go to start of image data
-    ptr += strcspn(ptr, "{");
+    // goto start of image data
+    next = memchr(ptr, '{', avpkt->size);
+    if (!next)
+        next = memchr(ptr, '(', avpkt->size);
+    if (!next)
+        return AVERROR_INVALIDDATA;
+    ptr = next + 1;
 
     linesize = (avctx->width + 7) / 8;
     for (i = 0; i < avctx->height; i++) {
-        int eol = 0, e = 0;
         dst = p->data[0] + i * p->linesize[0];
-        if (ptr >= avpkt->data + avpkt->size) {
-            av_log(avctx, AV_LOG_ERROR, "End of file reached.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        do {
-            int val;
-            uint8_t *endptr;
+        for (j = 0; j < linesize; j++) {
+            uint8_t val;
 
-            ptr += strcspn(ptr, "x") - 1; // -1 to get 0x
-            val = strtol(ptr, (char **)&endptr, 16);
+            while (ptr < end && *ptr != 'x' && *ptr != '$')
+                ptr++;
 
-            if (endptr - ptr == 4) {
-                // XBM X11 format
+            ptr ++;
+            if (ptr < end && av_isxdigit(*ptr)) {
+                val = convert(*ptr++);
+                if (av_isxdigit(*ptr))
+                    val = (val << 4) + convert(*ptr++);
                 *dst++ = ff_reverse[val];
-                eol = linesize;
-            } else if (endptr - ptr == 6) {
-                // XBM X10 format
-                *dst++ = ff_reverse[val >> 8];
-                *dst++ = ff_reverse[val & 0xFF];
-                eol = linesize / 2; // 2 bytes read
+                if (av_isxdigit(*ptr) && j+1 < linesize) {
+                    j++;
+                    val = convert(*ptr++);
+                    if (av_isxdigit(*ptr))
+                        val = (val << 4) + convert(*ptr++);
+                    *dst++ = ff_reverse[val];
+                }
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "Unexpected data at %.8s.\n", ptr);
                 return AVERROR_INVALIDDATA;
             }
-            ptr = endptr;
-        } while (++e < eol);
+        }
     }
 
     p->key_frame = 1;
     p->pict_type = AV_PICTURE_TYPE_I;
 
-    *got_frame = 1;
+    *got_frame       = 1;
 
     return avpkt->size;
 }
diff --git a/libavcodec/xbmenc.c b/libavcodec/xbmenc.c
index 4840050..b25615f 100644
--- a/libavcodec/xbmenc.c
+++ b/libavcodec/xbmenc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,17 +24,6 @@
 #include "internal.h"
 #include "mathops.h"
 
-static av_cold int xbm_encode_init(AVCodecContext *avctx)
-{
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    return 0;
-}
-
 static int xbm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             const AVFrame *p, int *got_packet)
 {
@@ -43,10 +32,8 @@ static int xbm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     linesize = (avctx->width + 7) / 8;
     size     = avctx->height * (linesize * 7 + 2) + 110;
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0)
         return ret;
-    }
 
     buf = pkt->data;
     ptr = p->data[0];
@@ -73,8 +60,7 @@ AVCodec ff_xbm_encoder = {
     .long_name    = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
     .type         = AVMEDIA_TYPE_VIDEO,
     .id           = AV_CODEC_ID_XBM,
-    .init         = xbm_encode_init,
     .encode2      = xbm_encode_frame,
     .pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE,
-                                                 AV_PIX_FMT_NONE },
+                                                   AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/xface.c b/libavcodec/xface.c
new file mode 100644
index 0000000..184c174
--- /dev/null
+++ b/libavcodec/xface.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face common data and utilities definition.
+ */
+
+#include "libavutil/avassert.h"
+
+#include "xface.h"
+
+void ff_big_add(BigInt *b, uint8_t a)
+{
+    int i;
+    uint8_t *w;
+    uint16_t c;
+
+    a &= XFACE_WORDMASK;
+    if (a == 0)
+        return;
+    w = b->words;
+    c = a;
+    for (i = 0; i < b->nb_words && c; i++) {
+        c += *w;
+        *w++ = c & XFACE_WORDMASK;
+        c >>= XFACE_BITSPERWORD;
+    }
+    if (i == b->nb_words && c) {
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        b->nb_words++;
+        *w = c & XFACE_WORDMASK;
+    }
+}
+
+void ff_big_div(BigInt *b, uint8_t a, uint8_t *r)
+{
+    int i;
+    uint8_t *w;
+    uint16_t c, d;
+
+    a &= XFACE_WORDMASK;
+    if (a == 1 || b->nb_words == 0) {
+        *r = 0;
+        return;
+    }
+
+    /* treat this as a == WORDCARRY and just shift everything right a WORD */
+    if (a == 0) {
+        i = --b->nb_words;
+        w = b->words;
+        *r = *w;
+        while (i--) {
+            *w = *(w + 1);
+            w++;
+        }
+        *w = 0;
+        return;
+    }
+    i = b->nb_words;
+    w = b->words + i;
+    c = 0;
+    while (i--) {
+        c <<= XFACE_BITSPERWORD;
+        c += *--w;
+        d = c / (uint16_t)a;
+        c = c % (uint16_t)a;
+        *w = d & XFACE_WORDMASK;
+    }
+    *r = c;
+    if (b->words[b->nb_words - 1] == 0)
+        b->nb_words--;
+}
+
+void ff_big_mul(BigInt *b, uint8_t a)
+{
+    int i;
+    uint8_t *w;
+    uint16_t c;
+
+    a &= XFACE_WORDMASK;
+    if (a == 1 || b->nb_words == 0)
+        return;
+    if (a == 0) {
+        /* treat this as a == WORDCARRY and just shift everything left a WORD */
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        i = b->nb_words++;
+        w = b->words + i;
+        while (i--) {
+            *w = *(w - 1);
+            w--;
+        }
+        *w = 0;
+        return;
+    }
+    i = b->nb_words;
+    w = b->words;
+    c = 0;
+    while (i--) {
+        c += (uint16_t)*w * (uint16_t)a;
+        *(w++) = c & XFACE_WORDMASK;
+        c >>= XFACE_BITSPERWORD;
+    }
+    if (c) {
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        b->nb_words++;
+        *w = c & XFACE_WORDMASK;
+    }
+}
+
+const ProbRange ff_xface_probranges_per_level[4][3] = {
+    //  black      grey       white
+    { {  1, 255}, {251, 0}, {  4, 251} }, /* Top of tree almost always grey */
+    { {  1, 255}, {200, 0}, { 55, 200} },
+    { { 33, 223}, {159, 0}, { 64, 159} },
+    { {131,   0}, {  0, 0}, {125, 131} }, /* Grey disallowed at bottom */
+};
+
+const ProbRange ff_xface_probranges_2x2[16] = {
+    { 0,   0},  {38,   0}, {38,  38},  {13, 152},
+    {38,  76},  {13, 165}, {13, 178},  { 6, 230},
+    {38, 114},  {13, 191}, {13, 204},  { 6, 236},
+    {13, 217},  { 6, 242}, { 5, 248},  { 3, 253},
+};
+
+/*
+ * The "guess the next pixel" tables follow. Normally there are 12
+ * neighbour pixels used to give 1<<12 cases as we get closer to the
+ * upper left corner lesser numbers of neighbours are available.
+ *
+ * Each byte in the tables represents 8 boolean values starting from
+ * the most significant bit.
+ */
+
+static const uint8_t g_00[] = {
+    0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0xe3, 0xdf, 0x05, 0x17,
+    0x05, 0x0f, 0x00, 0x1b, 0x0f, 0xdf, 0x00, 0x04, 0x00, 0x00,
+    0x0d, 0x0f, 0x03, 0x7f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x1d,
+    0x45, 0x2f, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x0a, 0xff, 0xff,
+    0x00, 0x04, 0x00, 0x05, 0x01, 0x3f, 0xcf, 0xff, 0x10, 0x01,
+    0x80, 0xc9, 0x0f, 0x0f, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+    0x1b, 0x1f, 0xff, 0xff, 0x4f, 0x54, 0x07, 0x1f, 0x57, 0x47,
+    0xd7, 0x3d, 0xff, 0xff, 0x5f, 0x1f, 0x7f, 0xff, 0x7f, 0x7f,
+    0x05, 0x0f, 0x01, 0x0f, 0x0f, 0x5f, 0x9b, 0xdf, 0x7f, 0xff,
+    0x5f, 0x1d, 0x5f, 0xff, 0x0f, 0x1f, 0x0f, 0x5f, 0x03, 0x1f,
+    0x4f, 0x5f, 0xf7, 0x7f, 0x7f, 0xff, 0x0d, 0x0f, 0xfb, 0xff,
+    0xf7, 0xbf, 0x0f, 0x4f, 0xd7, 0x3f, 0x4f, 0x7f, 0xff, 0xff,
+    0x67, 0xbf, 0x56, 0x25, 0x1f, 0x7f, 0x9f, 0xff, 0x00, 0x00,
+    0x00, 0x05, 0x5f, 0x7f, 0x01, 0xdf, 0x14, 0x00, 0x05, 0x0f,
+    0x07, 0xa2, 0x09, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x5f,
+    0x18, 0xd7, 0x94, 0x71, 0x00, 0x05, 0x1f, 0xb7, 0x0c, 0x07,
+    0x0f, 0x0f, 0x00, 0x0f, 0x0f, 0x1f, 0x84, 0x8f, 0x05, 0x15,
+    0x05, 0x0f, 0x4f, 0xff, 0x87, 0xdf, 0x05, 0x01, 0x10, 0x00,
+    0x0f, 0x0f, 0x00, 0x08, 0x05, 0x04, 0x04, 0x01, 0x4f, 0xff,
+    0x9f, 0x8f, 0x4a, 0x40, 0x5f, 0x5f, 0xff, 0xfe, 0xdf, 0xff,
+    0x7f, 0xf7, 0xff, 0x7f, 0xff, 0xff, 0x7b, 0xff, 0x0f, 0xfd,
+    0xd7, 0x5f, 0x4f, 0x7f, 0x7f, 0xdf, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0x77, 0xdf, 0x7f, 0x4f, 0xef, 0xff, 0xff, 0x77, 0xff,
+    0xff, 0xff, 0x6f, 0xff, 0x0f, 0x4f, 0xff, 0xff, 0x9d, 0xff,
+    0x0f, 0xef, 0xff, 0xdf, 0x6f, 0xff, 0xff, 0xff, 0x4f, 0xff,
+    0xcd, 0x0f, 0x4f, 0xff, 0xff, 0xdf, 0x00, 0x00, 0x00, 0x0b,
+    0x05, 0x02, 0x02, 0x0f, 0x04, 0x00, 0x00, 0x0c, 0x01, 0x06,
+    0x00, 0x0f, 0x20, 0x03, 0x00, 0x00, 0x05, 0x0f, 0x40, 0x08,
+    0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x0c, 0x0f, 0x01, 0x00,
+    0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x14, 0x01, 0x05,
+    0x01, 0x15, 0xaf, 0x0f, 0x00, 0x01, 0x10, 0x00, 0x08, 0x00,
+    0x46, 0x0c, 0x20, 0x00, 0x88, 0x00, 0x0f, 0x15, 0xff, 0xdf,
+    0x02, 0x00, 0x00, 0x0f, 0x7f, 0x5f, 0xdb, 0xff, 0x4f, 0x3e,
+    0x05, 0x0f, 0x7f, 0xf7, 0x95, 0x4f, 0x0d, 0x0f, 0x01, 0x0f,
+    0x4f, 0x5f, 0x9f, 0xdf, 0x25, 0x0e, 0x0d, 0x0d, 0x4f, 0x7f,
+    0x8f, 0x0f, 0x0f, 0xfa, 0x04, 0x4f, 0x4f, 0xff, 0xf7, 0x77,
+    0x47, 0xed, 0x05, 0x0f, 0xff, 0xff, 0xdf, 0xff, 0x4f, 0x6f,
+    0xd8, 0x5f, 0x0f, 0x7f, 0xdf, 0x5f, 0x07, 0x0f, 0x94, 0x0d,
+    0x1f, 0xff, 0xff, 0xff, 0x00, 0x02, 0x00, 0x03, 0x46, 0x57,
+    0x01, 0x0d, 0x01, 0x08, 0x01, 0x0f, 0x47, 0x6c, 0x0d, 0x0f,
+    0x02, 0x00, 0x00, 0x00, 0x0b, 0x4f, 0x00, 0x08, 0x05, 0x00,
+    0x95, 0x01, 0x0f, 0x7f, 0x0c, 0x0f, 0x01, 0x0e, 0x00, 0x00,
+    0x0f, 0x41, 0x00, 0x00, 0x04, 0x24, 0x0d, 0x0f, 0x0f, 0x7f,
+    0xcf, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00,
+    0x06, 0x26, 0xcf, 0x05, 0xcf, 0x7f, 0xdf, 0xdf, 0x00, 0x00,
+    0x17, 0x5f, 0xff, 0xfd, 0xff, 0xff, 0x46, 0x09, 0x4f, 0x5f,
+    0x7f, 0xfd, 0xdf, 0xff, 0x0a, 0x88, 0xa7, 0x7f, 0x7f, 0xff,
+    0xff, 0xff, 0x0f, 0x04, 0xdf, 0x7f, 0x4f, 0xff, 0x9f, 0xff,
+    0x0e, 0xe6, 0xdf, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x0f, 0xec,
+    0x8f, 0x4f, 0x7f, 0xff, 0xdf, 0xff, 0x0f, 0xcf, 0xdf, 0xff,
+    0x6f, 0x7f, 0xff, 0xff, 0x03, 0x0c, 0x9d, 0x0f, 0x7f, 0xff,
+    0xff, 0xff,
+};
+
+static const uint8_t g_01[] = {
+    0x37, 0x73, 0x00, 0x19, 0x57, 0x7f, 0xf5, 0xfb, 0x70, 0x33,
+    0xf0, 0xf9, 0x7f, 0xff, 0xff, 0xff,
+};
+
+static const uint8_t g_02[] = {
+    0x50,
+};
+
+static const uint8_t g_10[] = {
+    0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0xf3, 0x5f, 0x84, 0x04,
+    0x17, 0x9f, 0x04, 0x23, 0x05, 0xff, 0x00, 0x00, 0x00, 0x02,
+    0x03, 0x03, 0x33, 0xd7, 0x05, 0x03, 0x5f, 0x3f, 0x17, 0x33,
+    0xff, 0xff, 0x00, 0x80, 0x02, 0x04, 0x12, 0x00, 0x11, 0x57,
+    0x05, 0x25, 0x05, 0x03, 0x35, 0xbf, 0x9f, 0xff, 0x07, 0x6f,
+    0x20, 0x40, 0x17, 0x06, 0xfa, 0xe8, 0x01, 0x07, 0x1f, 0x9f,
+    0x1f, 0xff, 0xff, 0xff,
+};
+
+static const uint8_t g_20[] = {
+    0x04, 0x00, 0x01, 0x01, 0x43, 0x2e, 0xff, 0x3f,
+};
+
+static const uint8_t g_30[] = {
+    0x11, 0x11, 0x11, 0x11, 0x51, 0x11, 0x13, 0x11, 0x11, 0x11,
+    0x13, 0x11, 0x11, 0x11, 0x33, 0x11, 0x13, 0x11, 0x13, 0x13,
+    0x13, 0x13, 0x31, 0x31, 0x11, 0x01, 0x11, 0x11, 0x71, 0x11,
+    0x11, 0x75,
+};
+
+static const uint8_t g_40[] = {
+    0x00, 0x0f, 0x00, 0x09, 0x00, 0x0d, 0x00, 0x0d, 0x00, 0x0f,
+    0x00, 0x4e, 0xe4, 0x0d, 0x10, 0x0f, 0x00, 0x0f, 0x44, 0x4f,
+    0x00, 0x1e, 0x0f, 0x0f, 0xae, 0xaf, 0x45, 0x7f, 0xef, 0xff,
+    0x0f, 0xff, 0x00, 0x09, 0x01, 0x11, 0x00, 0x01, 0x1c, 0xdd,
+    0x00, 0x15, 0x00, 0xff, 0x00, 0x10, 0x00, 0xfd, 0x00, 0x0f,
+    0x4f, 0x5f, 0x3d, 0xff, 0xff, 0xff, 0x4f, 0xff, 0x1c, 0xff,
+    0xdf, 0xff, 0x8f, 0xff, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x15,
+    0x01, 0x07, 0x00, 0x01, 0x02, 0x1f, 0x01, 0x11, 0x05, 0x7f,
+    0x00, 0x1f, 0x41, 0x57, 0x1f, 0xff, 0x05, 0x77, 0x0d, 0x5f,
+    0x4d, 0xff, 0x4f, 0xff, 0x0f, 0xff, 0x00, 0x00, 0x02, 0x05,
+    0x00, 0x11, 0x05, 0x7d, 0x10, 0x15, 0x2f, 0xff, 0x40, 0x50,
+    0x0d, 0xfd, 0x04, 0x0f, 0x07, 0x1f, 0x07, 0x7f, 0x0f, 0xbf,
+    0x0d, 0x7f, 0x0f, 0xff, 0x4d, 0x7d, 0x0f, 0xff,
+};
+
+static const uint8_t g_11[] = {
+    0x01, 0x13, 0x03, 0x7f,
+};
+
+static const uint8_t g_21[] = {
+    0x17,
+};
+
+static const uint8_t g_31[] = {
+    0x55, 0x57, 0x57, 0x7f,
+};
+
+static const uint8_t g_41[] = {
+    0x01, 0x01, 0x01, 0x1f, 0x03, 0x1f, 0x3f, 0xff,
+};
+
+static const uint8_t g_12[] = {
+    0x40,
+};
+
+static const uint8_t g_22[] = {
+    0x00,
+};
+
+static const uint8_t g_32[] = {
+    0x10,
+};
+
+static const uint8_t g_42[] = {
+    0x10,
+};
+
+void ff_xface_generate_face(uint8_t *dst, uint8_t * const src)
+{
+    int h, i, j, k, l, m;
+
+    for (j = 0; j < XFACE_HEIGHT; j++) {
+        for (i = 0; i < XFACE_WIDTH; i++) {
+            h = i + j * XFACE_WIDTH;
+            k = 0;
+
+            /*
+               Compute k, encoding the bits *before* the current one, contained in the
+               image buffer. That is, given the grid:
+
+                l      i
+                |      |
+                v      v
+               +--+--+--+--+--+
+          m -> | 1| 2| 3| 4| 5|
+               +--+--+--+--+--+
+               | 6| 7| 8| 9|10|
+               +--+--+--+--+--+
+          j -> |11|12| *|  |  |
+               +--+--+--+--+--+
+
+               the value k for the pixel marked as "*" will contain the bit encoding of
+               the values in the matrix marked from "1" to "12". In case the pixel is
+               near the border of the grid, the number of values contained within the
+               grid will be lesser than 12.
+             */
+
+            for (l = i - 2; l <= i + 2; l++) {
+                for (m = j - 2; m <= j; m++) {
+                    if (l <= 0 || l >= i && m == j)
+                        continue;
+                    if (l <= XFACE_WIDTH && m > 0)
+                        k = 2*k + src[l + m * XFACE_WIDTH];
+                }
+            }
+
+            /*
+              Use the guess for the given position and the computed value of k.
+
+              The following table shows the number of digits in k, depending on
+              the position of the pixel, and shows the corresponding guess table
+              to use:
+
+                 i=1  i=2  i=3       i=w-1 i=w
+               +----+----+----+ ... +----+----+
+           j=1 |  0 |  1 |  2 |     |  2 |  2 |
+               |g22 |g12 |g02 |     |g42 |g32 |
+               +----+----+----+ ... +----+----+
+           j=2 |  3 |  5 |  7 |     |  6 |  5 |
+               |g21 |g11 |g01 |     |g41 |g31 |
+               +----+----+----+ ... +----+----+
+           j=3 |  5 |  9 | 12 |     | 10 |  8 |
+               |g20 |g10 |g00 |     |g40 |g30 |
+               +----+----+----+ ... +----+----+
+            */
+
+#define GEN(table) dst[h] ^= (table[k>>3]>>(7-(k&7)))&1
+
+            switch (i) {
+            case 1:
+                switch (j) {
+                case 1:  GEN(g_22); break;
+                case 2:  GEN(g_21); break;
+                default: GEN(g_20); break;
+                }
+                break;
+            case 2:
+                switch (j) {
+                case 1:  GEN(g_12); break;
+                case 2:  GEN(g_11); break;
+                default: GEN(g_10); break;
+                }
+                break;
+            case XFACE_WIDTH - 1:
+                switch (j) {
+                case 1:  GEN(g_42); break;
+                case 2:  GEN(g_41); break;
+                default: GEN(g_40); break;
+                }
+                break;
+            case XFACE_WIDTH:
+                switch (j) {
+                case 1:  GEN(g_32); break;
+                case 2:  GEN(g_31); break;
+                default: GEN(g_30); break;
+                }
+                break;
+            default:
+                switch (j) {
+                case 1:  GEN(g_02); break;
+                case 2:  GEN(g_01); break;
+                default: GEN(g_00); break;
+                }
+                break;
+            }
+        }
+    }
+}
diff --git a/libavcodec/xface.h b/libavcodec/xface.h
new file mode 100644
index 0000000..d366fdb
--- /dev/null
+++ b/libavcodec/xface.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face common definitions.
+ */
+
+#ifndef AVCODEC_XFACE_H
+#define AVCODEC_XFACE_H
+
+#include <stdint.h>
+
+/* define the face size - 48x48x1 */
+#define XFACE_WIDTH  48
+#define XFACE_HEIGHT 48
+#define XFACE_PIXELS (XFACE_WIDTH * XFACE_HEIGHT)
+
+/* compressed output uses the full range of printable characters.
+ * In ASCII these are in a contiguous block so we just need to know
+ * the first and last. The total number of printables is needed too. */
+#define XFACE_FIRST_PRINT '!'
+#define XFACE_LAST_PRINT '~'
+#define XFACE_PRINTS (XFACE_LAST_PRINT - XFACE_FIRST_PRINT + 1)
+
+/*
+ * Image is encoded as a big integer, using characters from '~' to
+ * '!', for a total of 94 symbols. In order to express
+ * 48x48 pixels with the worst case encoding 666 symbols should
+ * be sufficient.
+ */
+#define XFACE_MAX_DIGITS 666
+
+#define XFACE_BITSPERWORD 8
+#define XFACE_WORDCARRY (1 << XFACE_BITSPERWORD)
+#define XFACE_WORDMASK (XFACE_WORDCARRY - 1)
+
+// This must be larger or equal to log256(94^XFACE_MAX_DIGITS)
+#define XFACE_MAX_WORDS 546
+
+/* Portable, very large unsigned integer arithmetic is needed.
+ * Implementation uses arrays of WORDs. */
+typedef struct {
+    int nb_words;
+    uint8_t words[XFACE_MAX_WORDS];
+} BigInt;
+
+/**
+ * Add a to b storing the result in b.
+ */
+void ff_big_add(BigInt *b, uint8_t a);
+
+/**
+ * Divide b by a storing the result in b and the remainder in the word
+ * pointed to by r.
+ */
+void ff_big_div(BigInt *b, uint8_t a, uint8_t *r);
+
+/**
+ * Multiply a by b storing the result in b.
+ */
+void ff_big_mul(BigInt *b, uint8_t a);
+
+/* Each face is encoded using 9 octrees of 16x16 each. Each level of the
+ * trees has varying probabilities of being white, grey or black.
+ * The table below is based on sampling many faces */
+enum XFaceColor { XFACE_COLOR_BLACK = 0, XFACE_COLOR_GREY, XFACE_COLOR_WHITE };
+
+/* Data of varying probabilities are encoded by a value in the range 0 - 255.
+ * The probability of the data determines the range of possible encodings.
+ * Offset gives the first possible encoding of the range. */
+typedef struct {
+    uint8_t range;
+    uint8_t offset;
+} ProbRange;
+
+extern const ProbRange ff_xface_probranges_per_level[4][3];
+
+extern const ProbRange ff_xface_probranges_2x2[16];
+
+void ff_xface_generate_face(uint8_t *dst, uint8_t * const src);
+
+#endif /* AVCODEC_XFACE_H */
diff --git a/libavcodec/xfacedec.c b/libavcodec/xfacedec.c
new file mode 100644
index 0000000..ab4c082
--- /dev/null
+++ b/libavcodec/xfacedec.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face decoder, based on libcompface, by James Ashton.
+ */
+
+#include "libavutil/pixdesc.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "xface.h"
+
+static int pop_integer(BigInt *b, const ProbRange *pranges)
+{
+    uint8_t r;
+    int i;
+
+    /* extract the last byte into r, and shift right b by 8 bits */
+    ff_big_div(b, 0, &r);
+
+    i = 0;
+    while (r < pranges->offset || r >= pranges->range + pranges->offset) {
+        pranges++;
+        i++;
+    }
+    ff_big_mul(b, pranges->range);
+    ff_big_add(b, r - pranges->offset);
+    return i;
+}
+
+static void pop_greys(BigInt *b, char *bitmap, int w, int h)
+{
+    if (w > 3) {
+        w /= 2;
+        h /= 2;
+        pop_greys(b, bitmap,                       w, h);
+        pop_greys(b, bitmap + w,                   w, h);
+        pop_greys(b, bitmap + XFACE_WIDTH * h,     w, h);
+        pop_greys(b, bitmap + XFACE_WIDTH * h + w, w, h);
+    } else {
+        w = pop_integer(b, ff_xface_probranges_2x2);
+        if (w & 1) bitmap[0]               = 1;
+        if (w & 2) bitmap[1]               = 1;
+        if (w & 4) bitmap[XFACE_WIDTH]     = 1;
+        if (w & 8) bitmap[XFACE_WIDTH + 1] = 1;
+    }
+}
+
+static void decode_block(BigInt *b, char *bitmap, int w, int h, int level)
+{
+    switch (pop_integer(b, &ff_xface_probranges_per_level[level][0])) {
+    case XFACE_COLOR_WHITE:
+        return;
+    case XFACE_COLOR_BLACK:
+        pop_greys(b, bitmap, w, h);
+        return;
+    default:
+        w /= 2;
+        h /= 2;
+        level++;
+        decode_block(b, bitmap,                       w, h, level);
+        decode_block(b, bitmap + w,                   w, h, level);
+        decode_block(b, bitmap + h * XFACE_WIDTH,     w, h, level);
+        decode_block(b, bitmap + w + h * XFACE_WIDTH, w, h, level);
+        return;
+    }
+}
+
+typedef struct XFaceContext {
+    uint8_t bitmap[XFACE_PIXELS]; ///< image used internally for decoding
+} XFaceContext;
+
+static av_cold int xface_decode_init(AVCodecContext *avctx)
+{
+    if (avctx->width || avctx->height) {
+        if (avctx->width != XFACE_WIDTH || avctx->height != XFACE_HEIGHT) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Size value %dx%d not supported, only accepts a size of %dx%d\n",
+                   avctx->width, avctx->height, XFACE_WIDTH, XFACE_HEIGHT);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    avctx->width   = XFACE_WIDTH;
+    avctx->height  = XFACE_HEIGHT;
+    avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+
+    return 0;
+}
+
+static int xface_decode_frame(AVCodecContext *avctx,
+                              void *data, int *got_frame,
+                              AVPacket *avpkt)
+{
+    XFaceContext *xface = avctx->priv_data;
+    int ret, i, j, k;
+    uint8_t byte;
+    BigInt b = {0};
+    char *buf;
+    int64_t c;
+    AVFrame *frame = data;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (i = 0, k = 0; i < avpkt->size && avpkt->data[i]; i++) {
+        c = avpkt->data[i];
+
+        /* ignore invalid digits */
+        if (c < XFACE_FIRST_PRINT || c > XFACE_LAST_PRINT)
+            continue;
+
+        if (++k > XFACE_MAX_DIGITS) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Buffer is longer than expected, truncating at byte %d\n", i);
+            break;
+        }
+        ff_big_mul(&b, XFACE_PRINTS);
+        ff_big_add(&b, c - XFACE_FIRST_PRINT);
+    }
+
+    /* decode image and put it in bitmap */
+    memset(xface->bitmap, 0, XFACE_PIXELS);
+    buf = xface->bitmap;
+    decode_block(&b, buf,                         16, 16, 0);
+    decode_block(&b, buf + 16,                    16, 16, 0);
+    decode_block(&b, buf + 32,                    16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16,      16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16 + 16, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16 + 32, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32     , 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32 + 16, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32 + 32, 16, 16, 0);
+
+    ff_xface_generate_face(xface->bitmap, xface->bitmap);
+
+    /* convert image from 1=black 0=white bitmap to MONOWHITE */
+    buf = frame->data[0];
+    for (i = 0, j = 0, k = 0, byte = 0; i < XFACE_PIXELS; i++) {
+        byte += xface->bitmap[i];
+        if (k == 7) {
+            buf[j++] = byte;
+            byte = k = 0;
+        } else {
+            k++;
+            byte <<= 1;
+        }
+        if (j == XFACE_WIDTH/8) {
+            j = 0;
+            buf += frame->linesize[0];
+        }
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_xface_decoder = {
+    .name           = "xface",
+    .long_name      = NULL_IF_CONFIG_SMALL("X-face image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XFACE,
+    .priv_data_size = sizeof(XFaceContext),
+    .init           = xface_decode_init,
+    .decode         = xface_decode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/xfaceenc.c b/libavcodec/xfaceenc.c
new file mode 100644
index 0000000..bfb9fb9
--- /dev/null
+++ b/libavcodec/xfaceenc.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face encoder, based on libcompface, by James Ashton.
+ */
+
+#include "xface.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
+
+typedef struct XFaceContext {
+    AVClass *class;
+    uint8_t bitmap[XFACE_PIXELS]; ///< image used internally for decoding
+    int max_line_len;             ///< max line length for compressed data
+    int set_header;               ///< set X-Face header in the output
+} XFaceContext;
+
+static int all_same(char *bitmap, int w, int h)
+{
+    char val, *row;
+    int x;
+
+    val = *bitmap;
+    while (h--) {
+        row = bitmap;
+        x = w;
+        while (x--)
+            if (*(row++) != val)
+                return 0;
+        bitmap += XFACE_WIDTH;
+    }
+    return 1;
+}
+
+static int all_black(char *bitmap, int w, int h)
+{
+    if (w > 3) {
+        w /= 2;
+        h /= 2;
+        return (all_black(bitmap, w, h) && all_black(bitmap + w, w, h) &&
+                all_black(bitmap + XFACE_WIDTH * h, w, h) &&
+                all_black(bitmap + XFACE_WIDTH * h + w, w, h));
+    } else {
+        /* at least one pixel in the 2x2 grid is non-zero */
+        return *bitmap || *(bitmap + 1) ||
+               *(bitmap + XFACE_WIDTH) || *(bitmap + XFACE_WIDTH + 1);
+    }
+}
+
+static int all_white(char *bitmap, int w, int h)
+{
+    return *bitmap == 0 && all_same(bitmap, w, h);
+}
+
+typedef struct {
+    ProbRange prob_ranges[XFACE_PIXELS*2];
+    int prob_ranges_idx;
+} ProbRangesQueue;
+
+static inline int pq_push(ProbRangesQueue *pq, const ProbRange *p)
+{
+    if (pq->prob_ranges_idx >= XFACE_PIXELS * 2 - 1)
+        return -1;
+    pq->prob_ranges[pq->prob_ranges_idx++] = *p;
+    return 0;
+}
+
+static void push_greys(ProbRangesQueue *pq, char *bitmap, int w, int h)
+{
+    if (w > 3) {
+        w /= 2;
+        h /= 2;
+        push_greys(pq, bitmap,                       w, h);
+        push_greys(pq, bitmap + w,                   w, h);
+        push_greys(pq, bitmap + XFACE_WIDTH * h,     w, h);
+        push_greys(pq, bitmap + XFACE_WIDTH * h + w, w, h);
+    } else {
+        const ProbRange *p = ff_xface_probranges_2x2 +
+                 *bitmap +
+            2 * *(bitmap + 1) +
+            4 * *(bitmap + XFACE_WIDTH) +
+            8 * *(bitmap + XFACE_WIDTH + 1);
+        pq_push(pq, p);
+    }
+}
+
+static void encode_block(char *bitmap, int w, int h, int level, ProbRangesQueue *pq)
+{
+    if (all_white(bitmap, w, h)) {
+        pq_push(pq, &ff_xface_probranges_per_level[level][XFACE_COLOR_WHITE]);
+    } else if (all_black(bitmap, w, h)) {
+        pq_push(pq, &ff_xface_probranges_per_level[level][XFACE_COLOR_BLACK]);
+        push_greys(pq, bitmap, w, h);
+    } else {
+        pq_push(pq, &ff_xface_probranges_per_level[level][XFACE_COLOR_GREY]);
+        w /= 2;
+        h /= 2;
+        level++;
+        encode_block(bitmap,                       w, h, level, pq);
+        encode_block(bitmap + w,                   w, h, level, pq);
+        encode_block(bitmap + h * XFACE_WIDTH,     w, h, level, pq);
+        encode_block(bitmap + w + h * XFACE_WIDTH, w, h, level, pq);
+    }
+}
+
+static void push_integer(BigInt *b, const ProbRange *prange)
+{
+    uint8_t r;
+
+    ff_big_div(b, prange->range, &r);
+    ff_big_mul(b, 0);
+    ff_big_add(b, r + prange->offset);
+}
+
+static int xface_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *frame, int *got_packet)
+{
+    XFaceContext *xface = avctx->priv_data;
+    ProbRangesQueue pq = {{{ 0 }}, 0};
+    uint8_t bitmap_copy[XFACE_PIXELS];
+    BigInt b = {0};
+    int i, j, k, ret = 0;
+    const uint8_t *buf;
+    uint8_t *p;
+    char intbuf[XFACE_MAX_DIGITS];
+
+    if (avctx->width || avctx->height) {
+        if (avctx->width != XFACE_WIDTH || avctx->height != XFACE_HEIGHT) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Size value %dx%d not supported, only accepts a size of %dx%d\n",
+                   avctx->width, avctx->height, XFACE_WIDTH, XFACE_HEIGHT);
+            return AVERROR(EINVAL);
+        }
+    }
+    avctx->width  = XFACE_WIDTH;
+    avctx->height = XFACE_HEIGHT;
+
+    /* convert image from MONOWHITE to 1=black 0=white bitmap */
+    buf = frame->data[0];
+    i = j = 0;
+    do {
+        for (k = 0; k < 8; k++)
+            xface->bitmap[i++] = (buf[j]>>(7-k))&1;
+        if (++j == XFACE_WIDTH/8) {
+            buf += frame->linesize[0];
+            j = 0;
+        }
+    } while (i < XFACE_PIXELS);
+
+    /* create a copy of bitmap */
+    memcpy(bitmap_copy, xface->bitmap, XFACE_PIXELS);
+    ff_xface_generate_face(xface->bitmap, bitmap_copy);
+
+    encode_block(xface->bitmap,                         16, 16, 0, &pq);
+    encode_block(xface->bitmap + 16,                    16, 16, 0, &pq);
+    encode_block(xface->bitmap + 32,                    16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16,      16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16 + 16, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16 + 32, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32,      16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32 + 16, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32 + 32, 16, 16, 0, &pq);
+
+    while (pq.prob_ranges_idx > 0)
+        push_integer(&b, &pq.prob_ranges[--pq.prob_ranges_idx]);
+
+    /* write the inverted big integer in b to intbuf */
+    i = 0;
+    av_assert0(b.nb_words < XFACE_MAX_WORDS);
+    while (b.nb_words) {
+        uint8_t r;
+        ff_big_div(&b, XFACE_PRINTS, &r);
+        av_assert0(i < sizeof(intbuf));
+        intbuf[i++] = r + XFACE_FIRST_PRINT;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, i+2, 0)) < 0)
+        return ret;
+
+    /* revert the number, and close the buffer */
+    p = pkt->data;
+    while (--i >= 0)
+        *(p++) = intbuf[i];
+    *(p++) = '\n';
+    *(p++) = 0;
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+AVCodec ff_xface_encoder = {
+    .name           = "xface",
+    .long_name      = NULL_IF_CONFIG_SMALL("X-face image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XFACE,
+    .priv_data_size = sizeof(XFaceContext),
+    .encode2        = xface_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/xiph.c b/libavcodec/xiph.c
index 7c3c710..d072224 100644
--- a/libavcodec/xiph.c
+++ b/libavcodec/xiph.c
@@ -1,28 +1,28 @@
 /*
- * Copyright (C) 2007  Libav Project
+ * Copyright (C) 2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
 #include "xiph.h"
 
-int avpriv_split_xiph_headers(uint8_t *extradata, int extradata_size,
-                          int first_header_size, uint8_t *header_start[3],
+int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size,
+                          int first_header_size, const uint8_t *header_start[3],
                           int header_len[3])
 {
     int i;
diff --git a/libavcodec/xiph.h b/libavcodec/xiph.h
index afaece7..1741a51 100644
--- a/libavcodec/xiph.h
+++ b/libavcodec/xiph.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2007  Libav Project
+ * Copyright (C) 2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,8 @@
  * @param[out] header_len The sizes of each of the three headers.
  * @return On error a negative value is returned, on success zero.
  */
-int avpriv_split_xiph_headers(uint8_t *extradata, int extradata_size,
-                              int first_header_size, uint8_t *header_start[3],
+int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size,
+                              int first_header_size, const uint8_t *header_start[3],
                               int header_len[3]);
 
 #endif /* AVCODEC_XIPH_H */
diff --git a/libavcodec/xl.c b/libavcodec/xl.c
index 7286c14..37ab46e 100644
--- a/libavcodec/xl.c
+++ b/libavcodec/xl.c
@@ -2,20 +2,20 @@
  * Miro VideoXL codec
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,19 +50,16 @@ static int decode_frame(AVCodecContext *avctx,
     int y0, y1, y2, y3 = 0, c0 = 0, c1 = 0;
 
     if (avctx->width % 4) {
-        av_log(avctx, AV_LOG_ERROR, "Width not a multiple of 4.\n");
+        av_log(avctx, AV_LOG_ERROR, "width is not a multiple of 4\n");
         return AVERROR_INVALIDDATA;
     }
-
     if (buf_size < avctx->width * avctx->height) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
diff --git a/libavcodec/xma_parser.c b/libavcodec/xma_parser.c
new file mode 100644
index 0000000..0513679
--- /dev/null
+++ b/libavcodec/xma_parser.c
@@ -0,0 +1,62 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * XMA2 audio parser
+ */
+
+#include "parser.h"
+
+typedef struct XMAParserContext{
+    int skip_packets;
+} XMAParserContext;
+
+static int xma_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                     const uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size)
+{
+    XMAParserContext *s = s1->priv_data;
+
+    if (buf_size % 2048 == 0) {
+        int duration = 0, packet, nb_packets = buf_size / 2048;
+
+        for (packet = 0; packet < nb_packets; packet++) {
+            if (s->skip_packets == 0) {
+                duration += buf[packet * 2048] * 128;
+                s->skip_packets = buf[packet * 2048 + 3] + 1;
+            }
+            s->skip_packets--;
+        }
+
+        s1->duration = duration;
+        s1->key_frame = !!duration;
+    }
+
+    /* always return the full packet. this parser isn't doing any splitting or
+       combining, only packet analysis */
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return buf_size;
+}
+
+AVCodecParser ff_xma_parser = {
+    .codec_ids      = { AV_CODEC_ID_XMA2 },
+    .priv_data_size = sizeof(XMAParserContext),
+    .parser_parse   = xma_parse,
+};
diff --git a/libavcodec/xpmdec.c b/libavcodec/xpmdec.c
new file mode 100644
index 0000000..43dd9bc
--- /dev/null
+++ b/libavcodec/xpmdec.c
@@ -0,0 +1,447 @@
+/*
+ * XPM image format
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ * Copyright (c) 2017 Paras Chadha
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/parseutils.h"
+#include "libavutil/avstring.h"
+#include "avcodec.h"
+#include "internal.h"
+
+#define MIN_ELEMENT ' '
+#define MAX_ELEMENT 0xfe
+#define NB_ELEMENTS (MAX_ELEMENT - MIN_ELEMENT + 1)
+
+typedef struct XPMContext {
+    uint32_t  *pixels;
+    int        pixels_size;
+    uint8_t   *buf;
+    int        buf_size;
+} XPMDecContext;
+
+typedef struct ColorEntry {
+    const char *name;         ///< a string representing the name of the color
+    uint32_t    rgb_color;    ///< RGB values for the color
+} ColorEntry;
+
+static int color_table_compare(const void *lhs, const void *rhs)
+{
+    return av_strcasecmp(lhs, ((const ColorEntry *)rhs)->name);
+}
+
+static const ColorEntry color_table[] = {
+    { "AliceBlue",            0xFFF0F8FF },
+    { "AntiqueWhite",         0xFFFAEBD7 },
+    { "Aqua",                 0xFF00FFFF },
+    { "Aquamarine",           0xFF7FFFD4 },
+    { "Azure",                0xFFF0FFFF },
+    { "Beige",                0xFFF5F5DC },
+    { "Bisque",               0xFFFFE4C4 },
+    { "Black",                0xFF000000 },
+    { "BlanchedAlmond",       0xFFFFEBCD },
+    { "Blue",                 0xFF0000FF },
+    { "BlueViolet",           0xFF8A2BE2 },
+    { "Brown",                0xFFA52A2A },
+    { "BurlyWood",            0xFFDEB887 },
+    { "CadetBlue",            0xFF5F9EA0 },
+    { "Chartreuse",           0xFF7FFF00 },
+    { "Chocolate",            0xFFD2691E },
+    { "Coral",                0xFFFF7F50 },
+    { "CornflowerBlue",       0xFF6495ED },
+    { "Cornsilk",             0xFFFFF8DC },
+    { "Crimson",              0xFFDC143C },
+    { "Cyan",                 0xFF00FFFF },
+    { "DarkBlue",             0xFF00008B },
+    { "DarkCyan",             0xFF008B8B },
+    { "DarkGoldenRod",        0xFFB8860B },
+    { "DarkGray",             0xFFA9A9A9 },
+    { "DarkGreen",            0xFF006400 },
+    { "DarkKhaki",            0xFFBDB76B },
+    { "DarkMagenta",          0xFF8B008B },
+    { "DarkOliveGreen",       0xFF556B2F },
+    { "Darkorange",           0xFFFF8C00 },
+    { "DarkOrchid",           0xFF9932CC },
+    { "DarkRed",              0xFF8B0000 },
+    { "DarkSalmon",           0xFFE9967A },
+    { "DarkSeaGreen",         0xFF8FBC8F },
+    { "DarkSlateBlue",        0xFF483D8B },
+    { "DarkSlateGray",        0xFF2F4F4F },
+    { "DarkTurquoise",        0xFF00CED1 },
+    { "DarkViolet",           0xFF9400D3 },
+    { "DeepPink",             0xFFFF1493 },
+    { "DeepSkyBlue",          0xFF00BFFF },
+    { "DimGray",              0xFF696969 },
+    { "DodgerBlue",           0xFF1E90FF },
+    { "FireBrick",            0xFFB22222 },
+    { "FloralWhite",          0xFFFFFAF0 },
+    { "ForestGreen",          0xFF228B22 },
+    { "Fuchsia",              0xFFFF00FF },
+    { "Gainsboro",            0xFFDCDCDC },
+    { "GhostWhite",           0xFFF8F8FF },
+    { "Gold",                 0xFFFFD700 },
+    { "GoldenRod",            0xFFDAA520 },
+    { "Gray",                 0xFFBEBEBE },
+    { "Green",                0xFF00FF00 },
+    { "GreenYellow",          0xFFADFF2F },
+    { "HoneyDew",             0xFFF0FFF0 },
+    { "HotPink",              0xFFFF69B4 },
+    { "IndianRed",            0xFFCD5C5C },
+    { "Indigo",               0xFF4B0082 },
+    { "Ivory",                0xFFFFFFF0 },
+    { "Khaki",                0xFFF0E68C },
+    { "Lavender",             0xFFE6E6FA },
+    { "LavenderBlush",        0xFFFFF0F5 },
+    { "LawnGreen",            0xFF7CFC00 },
+    { "LemonChiffon",         0xFFFFFACD },
+    { "LightBlue",            0xFFADD8E6 },
+    { "LightCoral",           0xFFF08080 },
+    { "LightCyan",            0xFFE0FFFF },
+    { "LightGoldenRodYellow", 0xFFFAFAD2 },
+    { "LightGreen",           0xFF90EE90 },
+    { "LightGrey",            0xFFD3D3D3 },
+    { "LightPink",            0xFFFFB6C1 },
+    { "LightSalmon",          0xFFFFA07A },
+    { "LightSeaGreen",        0xFF20B2AA },
+    { "LightSkyBlue",         0xFF87CEFA },
+    { "LightSlateGray",       0xFF778899 },
+    { "LightSteelBlue",       0xFFB0C4DE },
+    { "LightYellow",          0xFFFFFFE0 },
+    { "Lime",                 0xFF00FF00 },
+    { "LimeGreen",            0xFF32CD32 },
+    { "Linen",                0xFFFAF0E6 },
+    { "Magenta",              0xFFFF00FF },
+    { "Maroon",               0xFFB03060 },
+    { "MediumAquaMarine",     0xFF66CDAA },
+    { "MediumBlue",           0xFF0000CD },
+    { "MediumOrchid",         0xFFBA55D3 },
+    { "MediumPurple",         0xFF9370D8 },
+    { "MediumSeaGreen",       0xFF3CB371 },
+    { "MediumSlateBlue",      0xFF7B68EE },
+    { "MediumSpringGreen",    0xFF00FA9A },
+    { "MediumTurquoise",      0xFF48D1CC },
+    { "MediumVioletRed",      0xFFC71585 },
+    { "MidnightBlue",         0xFF191970 },
+    { "MintCream",            0xFFF5FFFA },
+    { "MistyRose",            0xFFFFE4E1 },
+    { "Moccasin",             0xFFFFE4B5 },
+    { "NavajoWhite",          0xFFFFDEAD },
+    { "Navy",                 0xFF000080 },
+    { "None",                 0x00000000 },
+    { "OldLace",              0xFFFDF5E6 },
+    { "Olive",                0xFF808000 },
+    { "OliveDrab",            0xFF6B8E23 },
+    { "Orange",               0xFFFFA500 },
+    { "OrangeRed",            0xFFFF4500 },
+    { "Orchid",               0xFFDA70D6 },
+    { "PaleGoldenRod",        0xFFEEE8AA },
+    { "PaleGreen",            0xFF98FB98 },
+    { "PaleTurquoise",        0xFFAFEEEE },
+    { "PaleVioletRed",        0xFFD87093 },
+    { "PapayaWhip",           0xFFFFEFD5 },
+    { "PeachPuff",            0xFFFFDAB9 },
+    { "Peru",                 0xFFCD853F },
+    { "Pink",                 0xFFFFC0CB },
+    { "Plum",                 0xFFDDA0DD },
+    { "PowderBlue",           0xFFB0E0E6 },
+    { "Purple",               0xFFA020F0 },
+    { "Red",                  0xFFFF0000 },
+    { "RosyBrown",            0xFFBC8F8F },
+    { "RoyalBlue",            0xFF4169E1 },
+    { "SaddleBrown",          0xFF8B4513 },
+    { "Salmon",               0xFFFA8072 },
+    { "SandyBrown",           0xFFF4A460 },
+    { "SeaGreen",             0xFF2E8B57 },
+    { "SeaShell",             0xFFFFF5EE },
+    { "Sienna",               0xFFA0522D },
+    { "Silver",               0xFFC0C0C0 },
+    { "SkyBlue",              0xFF87CEEB },
+    { "SlateBlue",            0xFF6A5ACD },
+    { "SlateGray",            0xFF708090 },
+    { "Snow",                 0xFFFFFAFA },
+    { "SpringGreen",          0xFF00FF7F },
+    { "SteelBlue",            0xFF4682B4 },
+    { "Tan",                  0xFFD2B48C },
+    { "Teal",                 0xFF008080 },
+    { "Thistle",              0xFFD8BFD8 },
+    { "Tomato",               0xFFFF6347 },
+    { "Turquoise",            0xFF40E0D0 },
+    { "Violet",               0xFFEE82EE },
+    { "Wheat",                0xFFF5DEB3 },
+    { "White",                0xFFFFFFFF },
+    { "WhiteSmoke",           0xFFF5F5F5 },
+    { "Yellow",               0xFFFFFF00 },
+    { "YellowGreen",          0xFF9ACD32 }
+};
+
+static unsigned hex_char_to_number(uint8_t x)
+{
+    if (x >= 'a' && x <= 'f')
+        x -= 'a' - 10;
+    else if (x >= 'A' && x <= 'F')
+        x -= 'A' - 10;
+    else if (x >= '0' && x <= '9')
+        x -= '0';
+    else
+        x = 0;
+    return x;
+}
+
+/*
+ * Function same as strcspn but ignores characters if they are inside a C style comments
+ */
+static size_t mod_strcspn(const char *string, const char *reject)
+{
+    int i, j;
+
+    for (i = 0; string && string[i]; i++) {
+        if (string[i] == '/' && string[i+1] == '*') {
+            i += 2;
+            while ( string && string[i] && (string[i] != '*' || string[i+1] != '/') )
+                i++;
+            i++;
+        } else if (string[i] == '/' && string[i+1] == '/') {
+            i += 2;
+            while ( string && string[i] && string[i] != '\n' )
+                i++;
+        } else {
+            for (j = 0; reject && reject[j]; j++) {
+                if (string[i] == reject[j])
+                    break;
+            }
+            if (reject && reject[j])
+                break;
+        }
+    }
+    return i;
+}
+
+static uint32_t color_string_to_rgba(const char *p, int len)
+{
+    uint32_t ret = 0xFF000000;
+    const ColorEntry *entry;
+    char color_name[100];
+
+    len = FFMIN(FFMAX(len, 0), sizeof(color_name) - 1);
+
+    if (*p == '#') {
+        p++;
+        len--;
+        if (len == 3) {
+            ret |= (hex_char_to_number(p[2]) <<  4) |
+                   (hex_char_to_number(p[1]) << 12) |
+                   (hex_char_to_number(p[0]) << 20);
+        } else if (len == 4) {
+            ret  = (hex_char_to_number(p[3]) <<  4) |
+                   (hex_char_to_number(p[2]) << 12) |
+                   (hex_char_to_number(p[1]) << 20) |
+                   (hex_char_to_number(p[0]) << 28);
+        } else if (len == 6) {
+            ret |=  hex_char_to_number(p[5])        |
+                   (hex_char_to_number(p[4]) <<  4) |
+                   (hex_char_to_number(p[3]) <<  8) |
+                   (hex_char_to_number(p[2]) << 12) |
+                   (hex_char_to_number(p[1]) << 16) |
+                   (hex_char_to_number(p[0]) << 20);
+        } else if (len == 8) {
+            ret  =  hex_char_to_number(p[7])        |
+                   (hex_char_to_number(p[6]) <<  4) |
+                   (hex_char_to_number(p[5]) <<  8) |
+                   (hex_char_to_number(p[4]) << 12) |
+                   (hex_char_to_number(p[3]) << 16) |
+                   (hex_char_to_number(p[2]) << 20) |
+                   (hex_char_to_number(p[1]) << 24) |
+                   (hex_char_to_number(p[0]) << 28);
+        }
+    } else {
+        strncpy(color_name, p, len);
+        color_name[len] = '\0';
+
+        entry = bsearch(color_name,
+                        color_table,
+                        FF_ARRAY_ELEMS(color_table),
+                        sizeof(ColorEntry),
+                        color_table_compare);
+
+        if (!entry)
+            return ret;
+
+        ret = entry->rgb_color;
+    }
+    return ret;
+}
+
+static int ascii2index(const uint8_t *cpixel, int cpp)
+{
+    const uint8_t *p = cpixel;
+    int n = 0, m = 1, i;
+
+    for (i = 0; i < cpp; i++) {
+        if (*p < MIN_ELEMENT || *p > MAX_ELEMENT)
+            return AVERROR_INVALIDDATA;
+        n += (*p++ - MIN_ELEMENT) * m;
+        m *= NB_ELEMENTS;
+    }
+    return n;
+}
+
+static int xpm_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame, AVPacket *avpkt)
+{
+    XPMDecContext *x = avctx->priv_data;
+    AVFrame *p=data;
+    const uint8_t *end, *ptr;
+    int ncolors, cpp, ret, i, j;
+    int64_t size;
+    uint32_t *dst;
+
+    avctx->pix_fmt = AV_PIX_FMT_BGRA;
+
+    av_fast_padded_malloc(&x->buf, &x->buf_size, avpkt->size);
+    if (!x->buf)
+        return AVERROR(ENOMEM);
+    memcpy(x->buf, avpkt->data, avpkt->size);
+    x->buf[avpkt->size] = 0;
+
+    ptr = x->buf;
+    end = x->buf + avpkt->size;
+    while (end - ptr > 9 && memcmp(ptr, "/* XPM */", 9))
+        ptr++;
+
+    if (end - ptr <= 9) {
+        av_log(avctx, AV_LOG_ERROR, "missing signature\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ptr += mod_strcspn(ptr, "\"");
+    if (sscanf(ptr, "\"%u %u %u %u\",",
+               &avctx->width, &avctx->height, &ncolors, &cpp) != 4) {
+        av_log(avctx, AV_LOG_ERROR, "missing image parameters\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_set_dimensions(avctx, avctx->width, avctx->height)) < 0)
+        return ret;
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
+
+    if (cpp <= 0 || cpp >= 5) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported/invalid number of chars per pixel: %d\n", cpp);
+        return AVERROR_INVALIDDATA;
+    }
+
+    size = 1;
+    for (i = 0; i < cpp; i++)
+        size *= NB_ELEMENTS;
+
+    if (ncolors <= 0 || ncolors > size) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of colors: %d\n", ncolors);
+        return AVERROR_INVALIDDATA;
+    }
+
+    size *= 4;
+
+    av_fast_padded_malloc(&x->pixels, &x->pixels_size, size);
+    if (!x->pixels)
+        return AVERROR(ENOMEM);
+
+    ptr += mod_strcspn(ptr, ",") + 1;
+    if (end - ptr < 1)
+        return AVERROR_INVALIDDATA;
+
+    for (i = 0; i < ncolors; i++) {
+        const uint8_t *index;
+        int len;
+
+        ptr += mod_strcspn(ptr, "\"") + 1;
+        if (end - ptr < cpp)
+            return AVERROR_INVALIDDATA;
+        index = ptr;
+        ptr += cpp;
+
+        ptr = strstr(ptr, "c ");
+        if (ptr) {
+            ptr += 2;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+
+        len = strcspn(ptr, "\" ");
+
+        if ((ret = ascii2index(index, cpp)) < 0)
+            return ret;
+
+        x->pixels[ret] = color_string_to_rgba(ptr, len);
+        ptr += mod_strcspn(ptr, ",") + 1;
+        if (end - ptr < 1)
+            return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i < avctx->height; i++) {
+        dst = (uint32_t *)(p->data[0] + i * p->linesize[0]);
+        if (end - ptr < 1)
+            return AVERROR_INVALIDDATA;
+        ptr += mod_strcspn(ptr, "\"") + 1;
+        if (end - ptr < 1)
+            return AVERROR_INVALIDDATA;
+
+        for (j = 0; j < avctx->width; j++) {
+            if (end - ptr < cpp)
+                return AVERROR_INVALIDDATA;
+
+            if ((ret = ascii2index(ptr, cpp)) < 0)
+                return ret;
+
+            *dst++ = x->pixels[ret];
+            ptr += cpp;
+        }
+        ptr += mod_strcspn(ptr, ",") + 1;
+    }
+
+    p->key_frame = 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int xpm_decode_close(AVCodecContext *avctx)
+{
+    XPMDecContext *x = avctx->priv_data;
+    av_freep(&x->pixels);
+
+    av_freep(&x->buf);
+    x->buf_size = 0;
+
+    return 0;
+}
+
+AVCodec ff_xpm_decoder = {
+    .name           = "xpm",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XPM,
+    .priv_data_size = sizeof(XPMDecContext),
+    .close          = xpm_decode_close,
+    .decode         = xpm_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("XPM (X PixMap) image")
+};
diff --git a/libavcodec/xsubdec.c b/libavcodec/xsubdec.c
index 635067c..93fd0f4 100644
--- a/libavcodec/xsubdec.c
+++ b/libavcodec/xsubdec.c
@@ -2,28 +2,27 @@
  * XSUB subtitle decoder
  * Copyright (c) 2007 Reimar Döffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/mathematics.h"
 #include "libavutil/imgutils.h"
-
 #include "avcodec.h"
-#include "bitstream.h"
+#include "get_bits.h"
 #include "bytestream.h"
 
 static av_cold int decode_init(AVCodecContext *avctx) {
@@ -54,16 +53,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     AVSubtitle *sub = data;
     const uint8_t *buf_end = buf + buf_size;
     uint8_t *bitmap;
-    int w, h, x, y, i;
+    int w, h, x, y, i, ret;
     int64_t packet_time = 0;
-    BitstreamContext bc;
+    GetBitContext gb;
     int has_alpha = avctx->codec_tag == MKTAG('D','X','S','A');
 
-    memset(sub, 0, sizeof(*sub));
-
     // check that at least header fits
     if (buf_size < 27 + 7 * 2 + 4 * (3 + has_alpha)) {
-        av_log(avctx, AV_LOG_ERROR, "coded frame too small\n");
+        av_log(avctx, AV_LOG_ERROR, "coded frame size %d too small\n", buf_size);
         return -1;
     }
 
@@ -94,16 +91,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     // we just ignore it
     bytestream_get_le16(&buf);
 
+    if (buf_end - buf < h + 3*4)
+        return AVERROR_INVALIDDATA;
+
     // allocate sub and set values
     sub->rects =  av_mallocz(sizeof(*sub->rects));
     if (!sub->rects)
         return AVERROR(ENOMEM);
+
     sub->rects[0] = av_mallocz(sizeof(*sub->rects[0]));
     if (!sub->rects[0]) {
         av_freep(&sub->rects);
         return AVERROR(ENOMEM);
     }
-    sub->num_rects = 1;
     sub->rects[0]->x = x; sub->rects[0]->y = y;
     sub->rects[0]->w = w; sub->rects[0]->h = h;
     sub->rects[0]->type = SUBTITLE_BITMAP;
@@ -118,6 +118,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
         av_freep(&sub->rects);
         return AVERROR(ENOMEM);
     }
+    sub->num_rects = 1;
 
     // read palette
     for (i = 0; i < sub->rects[0]->nb_colors; i++)
@@ -147,15 +148,16 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     // process RLE-compressed data
-    bitstream_init8(&bc, buf, buf_end - buf);
+    if ((ret = init_get_bits8(&gb, buf, buf_end - buf)) < 0)
+        return ret;
     bitmap = sub->rects[0]->data[0];
     for (y = 0; y < h; y++) {
         // interlaced: do odd lines
         if (y == (h + 1) / 2) bitmap = sub->rects[0]->data[0] + w;
         for (x = 0; x < w; ) {
-            int log2 = ff_log2_tab[bitstream_peek(&bc, 8)];
-            int run = bitstream_read(&bc, 14 - 4 * (log2 >> 1));
-            int color = bitstream_read(&bc, 2);
+            int log2 = ff_log2_tab[show_bits(&gb, 8)];
+            int run = get_bits(&gb, 14 - 4 * (log2 >> 1));
+            int color = get_bits(&gb, 2);
             run = FFMIN(run, w - x);
             // run length 0 means till end of row
             if (!run) run = w - x;
@@ -165,7 +167,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
         // interlaced, skip every second line
         bitmap += w;
-        bitstream_align(&bc);
+        align_get_bits(&gb);
     }
     *data_size = 1;
     return buf_size;
diff --git a/libavcodec/xsubenc.c b/libavcodec/xsubenc.c
index 5b7e135..b3da909 100644
--- a/libavcodec/xsubenc.c
+++ b/libavcodec/xsubenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 DivX, Inc.
  * Copyright (c) 2009 Bjorn Axelsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static int xsub_encode(AVCodecContext *avctx, unsigned char *buf,
     }
 
     // TODO: support multiple rects
-    if (h->num_rects > 1)
+    if (h->num_rects != 1)
         av_log(avctx, AV_LOG_WARNING, "Only single rects supported (%d in subtitle.)\n", h->num_rects);
 
 #if FF_API_AVPICTURE
@@ -155,7 +155,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         av_log(avctx, AV_LOG_WARNING, "No more than 4 subtitle colors supported (%d found.)\n", h->rects[0]->nb_colors);
 
     // TODO: Palette swapping if color zero is not transparent
-    if (((uint32_t *)h->rects[0]->data[1])[0] & 0xff)
+    if (((uint32_t *)h->rects[0]->data[1])[0] & 0xff000000)
         av_log(avctx, AV_LOG_WARNING, "Color index 0 is not transparent. Transparency will be messed up.\n");
 
     if (make_tc(startTime, start_tc) || make_tc(endTime, end_tc)) {
@@ -179,8 +179,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     bytestream_put_le16(&hdr, height);
     bytestream_put_le16(&hdr, h->rects[0]->x);
     bytestream_put_le16(&hdr, h->rects[0]->y);
-    bytestream_put_le16(&hdr, h->rects[0]->x + width);
-    bytestream_put_le16(&hdr, h->rects[0]->y + height);
+    bytestream_put_le16(&hdr, h->rects[0]->x + width -1);
+    bytestream_put_le16(&hdr, h->rects[0]->y + height -1);
 
     rlelenptr = hdr; // Will store length of first field here later.
     hdr+=2;
@@ -203,7 +203,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                         h->rects[0]->w, h->rects[0]->h >> 1))
         return -1;
 
-    // Enforce total height to be be multiple of 2
+    // Enforce total height to be a multiple of 2
     if (h->rects[0]->h & 1) {
         put_xsub_rle(&pb, h->rects[0]->w, PADDING_COLOR);
         avpriv_align_put_bits(&pb);
@@ -219,6 +219,8 @@ static av_cold int xsub_encoder_init(AVCodecContext *avctx)
     if (!avctx->codec_tag)
         avctx->codec_tag = MKTAG('D','X','S','B');
 
+    avctx->bits_per_coded_sample = 4;
+
     return 0;
 }
 
diff --git a/libavcodec/xvididct.c b/libavcodec/xvididct.c
index 40bd61a..d8f3dd7 100644
--- a/libavcodec/xvididct.c
+++ b/libavcodec/xvididct.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2006-2011 Xvid Solutions GmbH
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -321,22 +321,25 @@ void ff_xvid_idct(int16_t *const in)
 static void xvid_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_xvid_idct(block);
-    ff_put_pixels_clamped(block, dest, line_size);
+    ff_put_pixels_clamped_c(block, dest, line_size);
 }
 
 static void xvid_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 {
     ff_xvid_idct(block);
-    ff_add_pixels_clamped(block, dest, line_size);
+    ff_add_pixels_clamped_c(block, dest, line_size);
 }
 
 av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (!high_bit_depth &&
-        (avctx->idct_algo == FF_IDCT_AUTO ||
-         avctx->idct_algo == FF_IDCT_XVID)) {
+    if (high_bit_depth || avctx->lowres ||
+        !(avctx->idct_algo == FF_IDCT_AUTO ||
+          avctx->idct_algo == FF_IDCT_XVID))
+        return;
+
+    if (avctx->idct_algo == FF_IDCT_XVID) {
         c->idct_put  = xvid_idct_put;
         c->idct_add  = xvid_idct_add;
         c->idct      = ff_xvid_idct;
@@ -345,6 +348,8 @@ av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
 
     if (ARCH_X86)
         ff_xvid_idct_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_xvid_idct_init_mips(c, avctx, high_bit_depth);
 
     ff_init_scantable_permutation(c->idct_permutation, c->perm_type);
 }
diff --git a/libavcodec/xvididct.h b/libavcodec/xvididct.h
index 499f819..e0bc1a2 100644
--- a/libavcodec/xvididct.h
+++ b/libavcodec/xvididct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,5 +30,7 @@ void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx);
 
 void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                            unsigned high_bit_depth);
+void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                            unsigned high_bit_depth);
 
 #endif /* AVCODEC_XVIDIDCT_H */
diff --git a/libavcodec/xvmc.h b/libavcodec/xvmc.h
new file mode 100644
index 0000000..465ee78
--- /dev/null
+++ b/libavcodec/xvmc.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2003 Ivan Kalvachev
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_XVMC_H
+#define AVCODEC_XVMC_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_xvmc
+ * Public libavcodec XvMC header.
+ */
+
+#include <X11/extensions/XvMC.h>
+
+#include "libavutil/attributes.h"
+#include "version.h"
+#include "avcodec.h"
+
+/**
+ * @defgroup lavc_codec_hwaccel_xvmc XvMC
+ * @ingroup lavc_codec_hwaccel
+ *
+ * @{
+ */
+
+#define AV_XVMC_ID                    0x1DC711C0  /**< special value to ensure that regular pixel routines haven't corrupted the struct
+                                                       the number is 1337 speak for the letters IDCT MCo (motion compensation) */
+
+struct attribute_deprecated xvmc_pix_fmt {
+    /** The field contains the special constant value AV_XVMC_ID.
+        It is used as a test that the application correctly uses the API,
+        and that there is no corruption caused by pixel routines.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             xvmc_id;
+
+    /** Pointer to the block array allocated by XvMCCreateBlocks().
+        The array has to be freed by XvMCDestroyBlocks().
+        Each group of 64 values represents one data block of differential
+        pixel information (in MoCo mode) or coefficients for IDCT.
+        - application - set the pointer during initialization
+        - libavcodec  - fills coefficients/pixel data into the array
+    */
+    short*          data_blocks;
+
+    /** Pointer to the macroblock description array allocated by
+        XvMCCreateMacroBlocks() and freed by XvMCDestroyMacroBlocks().
+        - application - set the pointer during initialization
+        - libavcodec  - fills description data into the array
+    */
+    XvMCMacroBlock* mv_blocks;
+
+    /** Number of macroblock descriptions that can be stored in the mv_blocks
+        array.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             allocated_mv_blocks;
+
+    /** Number of blocks that can be stored at once in the data_blocks array.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             allocated_data_blocks;
+
+    /** Indicate that the hardware would interpret data_blocks as IDCT
+        coefficients and perform IDCT on them.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             idct;
+
+    /** In MoCo mode it indicates that intra macroblocks are assumed to be in
+        unsigned format; same as the XVMC_INTRA_UNSIGNED flag.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             unsigned_intra;
+
+    /** Pointer to the surface allocated by XvMCCreateSurface().
+        It has to be freed by XvMCDestroySurface() on application exit.
+        It identifies the frame and its state on the video hardware.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    XvMCSurface*    p_surface;
+
+/** Set by the decoder before calling ff_draw_horiz_band(),
+    needed by the XvMCRenderSurface function. */
+//@{
+    /** Pointer to the surface used as past reference
+        - application - unchanged
+        - libavcodec  - set
+    */
+    XvMCSurface*    p_past_surface;
+
+    /** Pointer to the surface used as future reference
+        - application - unchanged
+        - libavcodec  - set
+    */
+    XvMCSurface*    p_future_surface;
+
+    /** top/bottom field or frame
+        - application - unchanged
+        - libavcodec  - set
+    */
+    unsigned int    picture_structure;
+
+    /** XVMC_SECOND_FIELD - 1st or 2nd field in the sequence
+        - application - unchanged
+        - libavcodec  - set
+    */
+    unsigned int    flags;
+//}@
+
+    /** Number of macroblock descriptions in the mv_blocks array
+        that have already been passed to the hardware.
+        - application - zeroes it on get_buffer().
+                        A successful ff_draw_horiz_band() may increment it
+                        with filled_mb_block_num or zero both.
+        - libavcodec  - unchanged
+    */
+    int             start_mv_blocks_num;
+
+    /** Number of new macroblock descriptions in the mv_blocks array (after
+        start_mv_blocks_num) that are filled by libavcodec and have to be
+        passed to the hardware.
+        - application - zeroes it on get_buffer() or after successful
+                        ff_draw_horiz_band().
+        - libavcodec  - increment with one of each stored MB
+    */
+    int             filled_mv_blocks_num;
+
+    /** Number of the next free data block; one data block consists of
+        64 short values in the data_blocks array.
+        All blocks before this one have already been claimed by placing their
+        position into the corresponding block description structure field,
+        that are part of the mv_blocks array.
+        - application - zeroes it on get_buffer().
+                        A successful ff_draw_horiz_band() may zero it together
+                        with start_mb_blocks_num.
+        - libavcodec  - each decoded macroblock increases it by the number
+                        of coded blocks it contains.
+    */
+    int             next_free_data_block_num;
+};
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_XVMC_H */
diff --git a/libavcodec/xvmc_internal.h b/libavcodec/xvmc_internal.h
new file mode 100644
index 0000000..d365ef0
--- /dev/null
+++ b/libavcodec/xvmc_internal.h
@@ -0,0 +1,31 @@
+/*
+ * XVideo Motion Compensation internal functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_XVMC_INTERNAL_H
+#define AVCODEC_XVMC_INTERNAL_H
+
+#include "avcodec.h"
+#include "mpegvideo.h"
+#include "version.h"
+
+void ff_xvmc_init_block(MpegEncContext *s);
+void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp);
+
+#endif /* AVCODEC_XVMC_INTERNAL_H */
diff --git a/libavcodec/xwd.h b/libavcodec/xwd.h
index f41e2cd..d046046 100644
--- a/libavcodec/xwd.h
+++ b/libavcodec/xwd.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/xwddec.c b/libavcodec/xwddec.c
index c43724e..8c4358f 100644
--- a/libavcodec/xwddec.c
+++ b/libavcodec/xwddec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,7 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     uint32_t pixformat, pixdepth, bunit, bitorder, bpad;
     uint32_t rgb[3];
     uint8_t *ptr;
+    int width, height;
     GetByteContext gb;
 
     if (buf_size < XWD_HEADER_SIZE)
@@ -60,8 +61,8 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
 
     pixformat     = bytestream2_get_be32u(&gb);
     pixdepth      = bytestream2_get_be32u(&gb);
-    avctx->width  = bytestream2_get_be32u(&gb);
-    avctx->height = bytestream2_get_be32u(&gb);
+    width         = bytestream2_get_be32u(&gb);
+    height        = bytestream2_get_be32u(&gb);
     xoffset       = bytestream2_get_be32u(&gb);
     be            = bytestream2_get_be32u(&gb);
     bunit         = bytestream2_get_be32u(&gb);
@@ -77,6 +78,9 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     ncolors       = bytestream2_get_be32u(&gb);
     bytestream2_skipu(&gb, header_size - (XWD_HEADER_SIZE - 20));
 
+    if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
+        return ret;
+
     av_log(avctx, AV_LOG_DEBUG,
            "pixformat %"PRIu32", pixdepth %"PRIu32", bunit %"PRIu32", bitorder %"PRIu32", bpad %"PRIu32"\n",
            pixformat, pixdepth, bunit, bitorder, bpad);
@@ -155,12 +159,13 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     switch (vclass) {
     case XWD_STATIC_GRAY:
     case XWD_GRAY_SCALE:
-        if (bpp != 1 && bpp != 8 || bpp != pixdepth)
+        if (bpp != 1 && bpp != 8)
             return AVERROR_INVALIDDATA;
-        if (pixdepth == 1)
+        if (bpp == 1 && pixdepth == 1) {
             avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-        else if (pixdepth == 8)
+        } else if (bpp == 8 && pixdepth == 8) {
             avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        }
         break;
     case XWD_STATIC_COLOR:
     case XWD_PSEUDO_COLOR:
@@ -206,10 +211,8 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_PATCHWELCOME;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->key_frame = 1;
     p->pict_type = AV_PICTURE_TYPE_I;
@@ -228,7 +231,7 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
             blue   = bytestream2_get_byteu(&gb);
             bytestream2_skipu(&gb, 3); // skip bitmask flag and padding
 
-            dst[i] = red << 16 | green << 8 | blue;
+            dst[i] = 0xFFU << 24 | red << 16 | green << 8 | blue;
         }
     }
 
diff --git a/libavcodec/xwdenc.c b/libavcodec/xwdenc.c
index e346b5c..81cca6c 100644
--- a/libavcodec/xwdenc.c
+++ b/libavcodec/xwdenc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 #define WINDOW_NAME_SIZE    11
 
 static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *p, int *got_packet)
+                            const AVFrame *pict, int *got_packet)
 {
     enum AVPixelFormat pix_fmt = avctx->pix_fmt;
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -40,6 +40,8 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     uint32_t header_size;
     int i, out_size, ret;
     uint8_t *ptr, *buf;
+    AVFrame * const p = (AVFrame *)pict;
+    uint32_t pal[256];
 
     pixdepth = av_get_bits_per_pixel(desc);
     if (desc->flags & AV_PIX_FMT_FLAG_BE)
@@ -124,6 +126,11 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         bpad     = 8;
         ncolors  = 256;
         break;
+    case AV_PIX_FMT_GRAY8:
+        bpp      = 8;
+        bpad     = 8;
+        vclass   = XWD_STATIC_GRAY;
+        break;
     case AV_PIX_FMT_MONOWHITE:
         be       = 1;
         bitorder = 1;
@@ -132,7 +139,7 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         vclass   = XWD_STATIC_GRAY;
         break;
     default:
-        av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
+        av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
         return AVERROR(EINVAL);
     }
 
@@ -140,18 +147,12 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     header_size = XWD_HEADER_SIZE + WINDOW_NAME_SIZE;
     out_size    = header_size + ncolors * XWD_CMAP_SIZE + avctx->height * lsize;
 
-    if ((ret = ff_alloc_packet(pkt, out_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "output buffer too small\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, out_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = 1;
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    p->key_frame = 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
 
     bytestream_put_be32(&buf, header_size);
     bytestream_put_be32(&buf, XWD_VERSION);   // file version
@@ -180,11 +181,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
     bytestream_put_be32(&buf, 0);             // window border width
     bytestream_put_buffer(&buf, WINDOW_NAME, WINDOW_NAME_SIZE);
 
+    if (pix_fmt == AV_PIX_FMT_PAL8) {
+        memcpy(pal, p->data[1], sizeof(pal));
+    } else {
+        avpriv_set_systematic_pal2(pal, pix_fmt);
+    }
+
     for (i = 0; i < ncolors; i++) {
         uint32_t val;
         uint8_t red, green, blue;
 
-        val   = AV_RN32A(p->data[1] + i * 4);
+        val   = pal[i];
         red   = (val >> 16) & 0xFF;
         green = (val >>  8) & 0xFF;
         blue  =  val        & 0xFF;
@@ -233,6 +240,7 @@ AVCodec ff_xwd_encoder = {
                                                  AV_PIX_FMT_RGB4_BYTE,
                                                  AV_PIX_FMT_BGR4_BYTE,
                                                  AV_PIX_FMT_PAL8,
+                                                 AV_PIX_FMT_GRAY8,
                                                  AV_PIX_FMT_MONOWHITE,
                                                  AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c
index 5495bec..8bb7087 100644
--- a/libavcodec/xxan.c
+++ b/libavcodec/xxan.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2011 Konstantin Shishkov
  * based on work by Mike Melanson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,7 +72,7 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
     s->scratch_buffer = av_malloc(s->buffer_size + 130);
     if (!s->scratch_buffer) {
-        av_freep(&s->y_buffer);
+        xan_decode_end(avctx);
         return AVERROR(ENOMEM);
     }
 
@@ -223,16 +223,18 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off)
     if (mode) {
         for (j = 0; j < avctx->height >> 1; j++) {
             for (i = 0; i < avctx->width >> 1; i++) {
+                if (src_end - src < 1)
+                    return 0;
                 val = *src++;
-                if (val && val < table_size) {
+                if (val) {
+                    if (val >= table_size)
+                        return AVERROR_INVALIDDATA;
                     val  = AV_RL16(table + (val << 1));
                     uval = (val >> 3) & 0xF8;
                     vval = (val >> 8) & 0xF8;
                     U[i] = uval | (uval >> 5);
                     V[i] = vval | (vval >> 5);
                 }
-                if (src == src_end)
-                    return 0;
             }
             U += s->pic->linesize[1];
             V += s->pic->linesize[2];
@@ -247,8 +249,12 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off)
 
         for (j = 0; j < avctx->height >> 2; j++) {
             for (i = 0; i < avctx->width >> 1; i += 2) {
+                if (src_end - src < 1)
+                    return 0;
                 val = *src++;
-                if (val && val < table_size) {
+                if (val) {
+                    if (val >= table_size)
+                        return AVERROR_INVALIDDATA;
                     val  = AV_RL16(table + (val << 1));
                     uval = (val >> 3) & 0xF8;
                     vval = (val >> 8) & 0xF8;
@@ -287,7 +293,7 @@ static int xan_decode_frame_type0(AVCodecContext *avctx)
     if ((ret = xan_decode_chroma(avctx, chroma_off)) != 0)
         return ret;
 
-    if (corr_off >= (s->gb.buffer_end - s->gb.buffer_start)) {
+    if (corr_off >= bytestream2_size(&s->gb)) {
         av_log(avctx, AV_LOG_WARNING, "Ignoring invalid correction block position\n");
         corr_off = 0;
     }
@@ -332,6 +338,9 @@ static int xan_decode_frame_type0(AVCodecContext *avctx)
         dec_size = xan_unpack(s, s->scratch_buffer, s->buffer_size / 2);
         if (dec_size < 0)
             dec_size = 0;
+        else
+            dec_size = FFMIN(dec_size, s->buffer_size/2 - 1);
+
         for (i = 0; i < dec_size; i++)
             s->y_buffer[i*2+1] = (s->y_buffer[i*2+1] + (s->scratch_buffer[i] << 1)) & 0x3F;
     }
@@ -401,10 +410,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
     int ftype;
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->pic))) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->pic)) < 0)
         return ret;
-    }
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
     ftype = bytestream2_get_le32(&s->gb);
diff --git a/libavcodec/y41pdec.c b/libavcodec/y41pdec.c
new file mode 100644
index 0000000..85a39e4
--- /dev/null
+++ b/libavcodec/y41pdec.c
@@ -0,0 +1,92 @@
+/*
+ * y41p decoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y41p_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt             = AV_PIX_FMT_YUV411P;
+    avctx->bits_per_raw_sample = 12;
+
+    if (avctx->width & 7) {
+        av_log(avctx, AV_LOG_WARNING, "y41p requires width to be divisible by 8.\n");
+    }
+
+    return 0;
+}
+
+static int y41p_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 3LL * avctx->height * FFALIGN(avctx->width, 8) / 2) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    for (i = avctx->height - 1; i >= 0 ; i--) {
+        y = &pic->data[0][i * pic->linesize[0]];
+        u = &pic->data[1][i * pic->linesize[1]];
+        v = &pic->data[2][i * pic->linesize[2]];
+        for (j = 0; j < avctx->width; j += 8) {
+            *(u++) = *src++;
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(u++) = *src++;
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+        }
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_y41p_decoder = {
+    .name         = "y41p",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_Y41P,
+    .init         = y41p_decode_init,
+    .decode       = y41p_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/y41penc.c b/libavcodec/y41penc.c
new file mode 100644
index 0000000..ca94a3c
--- /dev/null
+++ b/libavcodec/y41penc.c
@@ -0,0 +1,94 @@
+/*
+ * y41p encoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y41p_encode_init(AVCodecContext *avctx)
+{
+    if (avctx->width & 7) {
+        av_log(avctx, AV_LOG_ERROR, "y41p requires width to be divisible by 8.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    avctx->bits_per_coded_sample = 12;
+    avctx->bit_rate = ff_guess_coded_bitrate(avctx);
+
+    return 0;
+}
+
+static int y41p_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 1.5, 0)) < 0)
+        return ret;
+
+    dst = pkt->data;
+
+    for (i = avctx->height - 1; i >= 0; i--) {
+        y = &pic->data[0][i * pic->linesize[0]];
+        u = &pic->data[1][i * pic->linesize[1]];
+        v = &pic->data[2][i * pic->linesize[2]];
+        for (j = 0; j < avctx->width; j += 8) {
+            *(dst++) = *(u++);
+            *(dst++) = *(y++);
+            *(dst++) = *(v++);
+            *(dst++) = *(y++);
+
+            *(dst++) = *(u++);
+            *(dst++) = *(y++);
+            *(dst++) = *(v++);
+            *(dst++) = *(y++);
+
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+        }
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int y41p_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_y41p_encoder = {
+    .name         = "y41p",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_Y41P,
+    .init         = y41p_encode_init,
+    .encode2      = y41p_encode_frame,
+    .close        = y41p_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV411P,
+                                                 AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/ylc.c b/libavcodec/ylc.c
new file mode 100644
index 0000000..1133322
--- /dev/null
+++ b/libavcodec/ylc.c
@@ -0,0 +1,502 @@
+/*
+ * YUY2 Lossless Codec
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/imgutils.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "avcodec.h"
+#include "bswapdsp.h"
+#include "get_bits.h"
+#include "huffyuvdsp.h"
+#include "internal.h"
+#include "thread.h"
+#include "unary.h"
+
+typedef struct YLCContext {
+    VLC vlc[4];
+    uint32_t table[1024];
+    uint8_t *table_bits;
+    uint8_t *bitstream_bits;
+    int table_bits_size;
+    int bitstream_bits_size;
+    BswapDSPContext bdsp;
+} YLCContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    YLCContext *s = avctx->priv_data;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+    ff_bswapdsp_init(&s->bdsp);
+
+    return 0;
+}
+
+typedef struct Node {
+    int16_t  sym;
+    int16_t  n0;
+    uint32_t count;
+    int16_t  l, r;
+} Node;
+
+static void get_tree_codes(uint32_t *bits, int16_t *lens, uint8_t *xlat,
+                           Node *nodes, int node,
+                           uint32_t pfx, int pl, int *pos)
+{
+    int s;
+
+    s = nodes[node].sym;
+    if (s != -1) {
+        bits[*pos] = (~pfx) & ((1ULL << FFMAX(pl, 1)) - 1);
+        lens[*pos] = FFMAX(pl, 1);
+        xlat[*pos] = s + (pl == 0);
+        (*pos)++;
+    } else {
+        pfx <<= 1;
+        pl++;
+        get_tree_codes(bits, lens, xlat, nodes, nodes[node].l, pfx, pl,
+                       pos);
+        pfx |= 1;
+        get_tree_codes(bits, lens, xlat, nodes, nodes[node].r, pfx, pl,
+                       pos);
+    }
+}
+
+static int build_vlc(AVCodecContext *avctx, VLC *vlc, const uint32_t *table)
+{
+    Node nodes[512];
+    uint32_t bits[256];
+    int16_t lens[256];
+    uint8_t xlat[256];
+    int cur_node, i, j, pos = 0;
+
+    ff_free_vlc(vlc);
+
+    for (i = 0; i < 256; i++) {
+        nodes[i].count = table[i];
+        nodes[i].sym   = i;
+        nodes[i].n0    = -2;
+        nodes[i].l     = i;
+        nodes[i].r     = i;
+    }
+
+    cur_node = 256;
+    j = 0;
+    do {
+        for (i = 0; ; i++) {
+            int new_node = j;
+            int first_node = cur_node;
+            int second_node = cur_node;
+            unsigned nd, st;
+
+            nodes[cur_node].count = -1;
+
+            do {
+                int val = nodes[new_node].count;
+                if (val && (val < nodes[first_node].count)) {
+                    if (val >= nodes[second_node].count) {
+                        first_node = new_node;
+                    } else {
+                        first_node = second_node;
+                        second_node = new_node;
+                    }
+                }
+                new_node += 1;
+            } while (new_node != cur_node);
+
+            if (first_node == cur_node)
+                break;
+
+            nd = nodes[second_node].count;
+            st = nodes[first_node].count;
+            nodes[second_node].count = 0;
+            nodes[first_node].count  = 0;
+            if (nd >= UINT32_MAX - st) {
+                av_log(avctx, AV_LOG_ERROR, "count overflow\n");
+                return AVERROR_INVALIDDATA;
+            }
+            nodes[cur_node].count = nd + st;
+            nodes[cur_node].sym = -1;
+            nodes[cur_node].n0 = cur_node;
+            nodes[cur_node].l = first_node;
+            nodes[cur_node].r = second_node;
+            cur_node++;
+        }
+        j++;
+    } while (cur_node - 256 == j);
+
+    get_tree_codes(bits, lens, xlat, nodes, cur_node - 1, 0, 0, &pos);
+
+    return ff_init_vlc_sparse(vlc, 10, pos, lens, 2, 2, bits, 4, 4, xlat, 1, 1, 0);
+}
+
+static const uint8_t table_y1[] = {
+    0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
+    0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
+    0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
+    0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
+    0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
+    0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02,
+    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+    0x02, 0x00,
+};
+
+static const uint8_t table_u[] = {
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    0x01, 0x00,
+};
+
+static const uint8_t table_y2[] = {
+    0xFC, 0xFC, 0xFC, 0xFD, 0xFD, 0xFD, 0xFE, 0xFE,
+    0xFE, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFC,
+    0xFC, 0xFC, 0xFD, 0xFD, 0xFD, 0xFE, 0xFE, 0xFE,
+    0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFC, 0xFC,
+    0xFC, 0xFD, 0xFD, 0xFD, 0xFE, 0xFE, 0xFE, 0xFF,
+    0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFD, 0xFD, 0xFD,
+    0xFE, 0xFE, 0xFE, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+    0x00, 0x01, 0x01, 0x01, 0xFD, 0xFD, 0xFD, 0xFE,
+    0xFE, 0xFE, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+    0x01, 0x01, 0x01, 0xFD, 0xFD, 0xFD, 0xFE, 0xFE,
+    0xFE, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01,
+    0x01, 0x01, 0xFE, 0xFE, 0xFE, 0xFF, 0xFF, 0xFF,
+    0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02,
+    0x02, 0xFE, 0xFE, 0xFE, 0xFF, 0xFF, 0xFF, 0x00,
+    0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+    0xFE, 0xFE, 0xFE, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+    0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0xFF,
+    0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+    0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0xFF, 0xFF,
+    0xFF, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02,
+    0x02, 0x02, 0x03, 0x03, 0x03, 0xFF, 0xFF, 0xFF,
+    0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x02, 0x02,
+    0x02, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x01,
+    0x01, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03,
+    0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x01, 0x01,
+    0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x04,
+    0x04, 0x04, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+    0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x04, 0x04,
+    0x04, 0x00,
+};
+
+static const uint8_t table_v[] = {
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF,
+    0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01,
+    0xFF, 0x00, 0x01, 0xFF, 0x00, 0x01, 0xFF, 0x00,
+    0x01, 0x00,
+};
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    int TL[4] = { 128, 128, 128, 128 };
+    int L[4]  = { 128, 128, 128, 128 };
+    YLCContext *s = avctx->priv_data;
+    ThreadFrame frame = { .f = data };
+    const uint8_t *buf = avpkt->data;
+    int ret, x, y, toffset, boffset;
+    AVFrame * const p = data;
+    GetBitContext gb;
+    uint8_t *dst;
+
+    if (avpkt->size <= 16)
+        return AVERROR_INVALIDDATA;
+
+    if (AV_RL32(buf) != MKTAG('Y', 'L', 'C', '0') ||
+        AV_RL32(buf + 4) != 0)
+        return AVERROR_INVALIDDATA;
+
+    toffset = AV_RL32(buf + 8);
+    if (toffset < 16 || toffset >= avpkt->size)
+        return AVERROR_INVALIDDATA;
+
+    boffset = AV_RL32(buf + 12);
+    if (toffset >= boffset || boffset >= avpkt->size)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    av_fast_malloc(&s->table_bits, &s->table_bits_size,
+                   boffset - toffset + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!s->table_bits)
+        return AVERROR(ENOMEM);
+
+    memcpy(s->table_bits, avpkt->data + toffset, boffset - toffset);
+    memset(s->table_bits + boffset - toffset, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    s->bdsp.bswap_buf((uint32_t *) s->table_bits,
+                      (uint32_t *) s->table_bits,
+                      (boffset - toffset + 3) >> 2);
+    if ((ret = init_get_bits8(&gb, s->table_bits, boffset - toffset)) < 0)
+        return ret;
+
+    for (x = 0; x < 1024; x++) {
+        unsigned len = get_unary(&gb, 1, 31);
+        uint32_t val = ((1U << len) - 1) + get_bits_long(&gb, len);
+
+        s->table[x] = val;
+    }
+
+    ret = build_vlc(avctx, &s->vlc[0], &s->table[0  ]);
+    if (ret < 0)
+        return ret;
+    ret = build_vlc(avctx, &s->vlc[1], &s->table[256]);
+    if (ret < 0)
+        return ret;
+    ret = build_vlc(avctx, &s->vlc[2], &s->table[512]);
+    if (ret < 0)
+        return ret;
+    ret = build_vlc(avctx, &s->vlc[3], &s->table[768]);
+    if (ret < 0)
+        return ret;
+
+    av_fast_malloc(&s->bitstream_bits, &s->bitstream_bits_size,
+                   avpkt->size - boffset + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!s->bitstream_bits)
+        return AVERROR(ENOMEM);
+
+    memcpy(s->bitstream_bits, avpkt->data + boffset, avpkt->size - boffset);
+    memset(s->bitstream_bits + avpkt->size - boffset, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    s->bdsp.bswap_buf((uint32_t *) s->bitstream_bits,
+                      (uint32_t *) s->bitstream_bits,
+                      (avpkt->size - boffset) >> 2);
+    if ((ret = init_get_bits8(&gb, s->bitstream_bits, avpkt->size - boffset)) < 0)
+        return ret;
+
+    dst = p->data[0];
+    for (y = 0; y < avctx->height; y++) {
+        memset(dst, 0, avctx->width * 2);
+        dst += p->linesize[0];
+    }
+
+    dst = p->data[0];
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width * 2 && y < avctx->height;) {
+            if (get_bits_left(&gb) <= 0)
+                return AVERROR_INVALIDDATA;
+
+            if (get_bits1(&gb)) {
+                int val = get_vlc2(&gb, s->vlc[0].table, s->vlc[0].bits, 3);
+                if (val < 0) {
+                    return AVERROR_INVALIDDATA;
+                } else if (val < 0xE1) {
+                    dst[x    ] = table_y1[val];
+                    dst[x + 1] = table_u[val];
+                    dst[x + 2] = table_y2[val];
+                    dst[x + 3] = table_v[val];
+                    x += 4;
+                } else {
+                    int incr = (val - 0xDF) * 4;
+                    if (x + incr >= avctx->width * 2) {
+                        int iy = ((x + incr) / (avctx->width * 2));
+                        x  = (x + incr) % (avctx->width * 2);
+                        y += iy;
+                        dst += iy * p->linesize[0];
+                    } else {
+                        x += incr;
+                    }
+                }
+            } else {
+                int y1, y2, u, v;
+
+                y1 = get_vlc2(&gb, s->vlc[1].table, s->vlc[1].bits, 3);
+                u  = get_vlc2(&gb, s->vlc[2].table, s->vlc[2].bits, 3);
+                y2 = get_vlc2(&gb, s->vlc[1].table, s->vlc[1].bits, 3);
+                v  = get_vlc2(&gb, s->vlc[3].table, s->vlc[3].bits, 3);
+                if (y1 < 0 || y2 < 0 || u < 0 || v < 0)
+                    return AVERROR_INVALIDDATA;
+                dst[x    ] = y1;
+                dst[x + 1] = u;
+                dst[x + 2] = y1 + y2;
+                dst[x + 3] = v;
+                x += 4;
+            }
+        }
+        dst += p->linesize[0];
+    }
+
+    dst = p->data[0];
+    for (x = 0; x < avctx->width * 2; x += 4) {
+        dst[x    ] =        dst[x    ] + L[0];
+        dst[x + 2] = L[0] = dst[x + 2] + L[0];
+        L[1] = dst[x + 1] + L[1];
+        dst[x + 1] = L[1];
+        L[2] = dst[x + 3] + L[2];
+        dst[x + 3] = L[2];
+    }
+    dst += p->linesize[0];
+
+    for (y = 1; y < avctx->height; y++) {
+        x = 0;
+        dst[x    ] =        dst[x    ] + L[0] + dst[x + 0 - p->linesize[0]] - TL[0];
+        dst[x + 2] = L[0] = dst[x + 2] + L[0] + dst[x + 2 - p->linesize[0]] - TL[0];
+        TL[0] = dst[x + 2 - p->linesize[0]];
+        L[1] = dst[x + 1] + L[1] + dst[x + 1 - p->linesize[0]] - TL[1];
+        dst[x + 1] = L[1];
+        TL[1] = dst[x + 1 - p->linesize[0]];
+        L[2] = dst[x + 3] + L[2] + dst[x + 3 - p->linesize[0]] - TL[2];
+        dst[x + 3] = L[2];
+        TL[2] = dst[x + 3 - p->linesize[0]];
+        for (x = 4; x < avctx->width * 2; x += 4) {
+            dst[x    ] =        dst[x    ] + L[0] + dst[x + 0 - p->linesize[0]] - TL[0];
+            dst[x + 2] = L[0] = dst[x + 2] + L[0] + dst[x + 2 - p->linesize[0]] - TL[0];
+            TL[0] = dst[x + 2 - p->linesize[0]];
+            L[1] = dst[x + 1] + L[1] + dst[x + 1 - p->linesize[0]] - TL[1];
+            dst[x + 1] = L[1];
+            TL[1] = dst[x + 1 - p->linesize[0]];
+            L[2] = dst[x + 3] + L[2] + dst[x + 3 - p->linesize[0]] - TL[2];
+            dst[x + 3] = L[2];
+            TL[2] = dst[x + 3 - p->linesize[0]];
+        }
+        dst += p->linesize[0];
+    }
+
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
+    *got_frame   = 1;
+
+    return avpkt->size;
+}
+
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    YLCContext *s = avctx->priv_data;
+
+    memset(&s->vlc[0], 0, sizeof(VLC));
+    memset(&s->vlc[1], 0, sizeof(VLC));
+    memset(&s->vlc[2], 0, sizeof(VLC));
+    memset(&s->vlc[3], 0, sizeof(VLC));
+    s->table_bits = NULL;
+    s->table_bits_size = 0;
+    s->bitstream_bits = NULL;
+    s->bitstream_bits_size = 0;
+
+    return 0;
+}
+#endif
+
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    YLCContext *s = avctx->priv_data;
+
+    ff_free_vlc(&s->vlc[0]);
+    ff_free_vlc(&s->vlc[1]);
+    ff_free_vlc(&s->vlc[2]);
+    ff_free_vlc(&s->vlc[3]);
+    av_freep(&s->table_bits);
+    s->table_bits_size = 0;
+    av_freep(&s->bitstream_bits);
+    s->bitstream_bits_size = 0;
+
+    return 0;
+}
+
+AVCodec ff_ylc_decoder = {
+    .name           = "ylc",
+    .long_name      = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_YLC,
+    .priv_data_size = sizeof(YLCContext),
+    .init           = decode_init,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+};
diff --git a/libavcodec/yop.c b/libavcodec/yop.c
index 5de4ac8..32cfea2 100644
--- a/libavcodec/yop.c
+++ b/libavcodec/yop.c
@@ -5,20 +5,20 @@
  * derived from the code by
  * Copyright (C) 2009 Thomas P. Higdon <thomas.p.higdon@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 
 typedef struct YopDecContext {
     AVCodecContext *avctx;
+    AVFrame *frame;
 
     int num_pal_colors;
     int first_color[2];
@@ -80,6 +81,15 @@ static const int8_t motion_vector[16][2] =
      { 4, -2}, {-2,  0},
     };
 
+static av_cold int yop_decode_close(AVCodecContext *avctx)
+{
+    YopDecContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
 static av_cold int yop_decode_init(AVCodecContext *avctx)
 {
     YopDecContext *s = avctx->priv_data;
@@ -105,10 +115,14 @@ static av_cold int yop_decode_init(AVCodecContext *avctx)
     if (s->num_pal_colors + s->first_color[0] > 256 ||
         s->num_pal_colors + s->first_color[1] > 256) {
         av_log(avctx, AV_LOG_ERROR,
-               "YOP: palette parameters invalid, header probably corrupt\n");
+               "Palette parameters invalid, header probably corrupt\n");
         return AVERROR_INVALIDDATA;
     }
 
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
@@ -146,8 +160,7 @@ static int yop_copy_previous_block(YopDecContext *s, int linesize, int copy_tag)
     bufptr = s->dstptr + motion_vector[copy_tag][0] +
              linesize * motion_vector[copy_tag][1];
     if (bufptr < s->dstbuf) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "YOP: cannot decode, file probably corrupt\n");
+        av_log(s->avctx, AV_LOG_ERROR, "File probably corrupt\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -181,7 +194,7 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
     YopDecContext *s = avctx->priv_data;
-    AVFrame *frame = data;
+    AVFrame *frame = s->frame;
     int tag, firstcolor, is_odd_frame;
     int ret, i, x, y;
     uint32_t *palette;
@@ -191,11 +204,8 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    ret = ff_get_buffer(avctx, frame, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
         return ret;
-    }
 
     if (!avctx->frame_number)
         memset(frame->data[1], 0, AVPALETTE_SIZE);
@@ -207,13 +217,20 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     s->low_nibble = NULL;
 
     is_odd_frame = avpkt->data[0];
+    if(is_odd_frame>1){
+        av_log(avctx, AV_LOG_ERROR, "frame is too odd %d\n", is_odd_frame);
+        return AVERROR_INVALIDDATA;
+    }
     firstcolor   = s->first_color[is_odd_frame];
     palette      = (uint32_t *)frame->data[1];
 
-    for (i = 0; i < s->num_pal_colors; i++, s->srcptr += 3)
+    for (i = 0; i < s->num_pal_colors; i++, s->srcptr += 3) {
         palette[i + firstcolor] = (s->srcptr[0] << 18) |
                                   (s->srcptr[1] << 10) |
                                   (s->srcptr[2] << 2);
+        palette[i + firstcolor] |= 0xFFU << 24 |
+                                   (palette[i + firstcolor] >> 6) & 0x30303;
+    }
 
     frame->palette_has_changed = 1;
 
@@ -241,6 +258,9 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         s->dstptr += 2*frame->linesize[0] - x;
     }
 
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
+
     *got_frame = 1;
     return avpkt->size;
 }
@@ -252,6 +272,6 @@ AVCodec ff_yop_decoder = {
     .id             = AV_CODEC_ID_YOP,
     .priv_data_size = sizeof(YopDecContext),
     .init           = yop_decode_init,
+    .close          = yop_decode_close,
     .decode         = yop_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/yuv4dec.c b/libavcodec/yuv4dec.c
new file mode 100644
index 0000000..f89f62d
--- /dev/null
+++ b/libavcodec/yuv4dec.c
@@ -0,0 +1,84 @@
+/*
+ * libquicktime yuv4 decoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int yuv4_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+
+    return 0;
+}
+
+static int yuv4_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 6 * (avctx->width + 1 >> 1) * (avctx->height + 1 >> 1)) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < (avctx->height + 1) >> 1; i++) {
+        for (j = 0; j < (avctx->width + 1) >> 1; j++) {
+            u[j] = *src++ ^ 0x80;
+            v[j] = *src++ ^ 0x80;
+            y[                   2 * j    ] = *src++;
+            y[                   2 * j + 1] = *src++;
+            y[pic->linesize[0] + 2 * j    ] = *src++;
+            y[pic->linesize[0] + 2 * j + 1] = *src++;
+        }
+
+        y += 2 * pic->linesize[0];
+        u +=     pic->linesize[1];
+        v +=     pic->linesize[2];
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_yuv4_decoder = {
+    .name         = "yuv4",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_YUV4,
+    .init         = yuv4_decode_init,
+    .decode       = yuv4_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/yuv4enc.c b/libavcodec/yuv4enc.c
new file mode 100644
index 0000000..cc8846d
--- /dev/null
+++ b/libavcodec/yuv4enc.c
@@ -0,0 +1,80 @@
+/*
+ * libquicktime yuv4 encoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int yuv4_encode_init(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+static int yuv4_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 6 * (avctx->width + 1 >> 1) * (avctx->height + 1 >> 1), 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < avctx->height + 1 >> 1; i++) {
+        for (j = 0; j < avctx->width + 1 >> 1; j++) {
+            *dst++ = u[j] ^ 0x80;
+            *dst++ = v[j] ^ 0x80;
+            *dst++ = y[                   2 * j    ];
+            *dst++ = y[                   2 * j + 1];
+            *dst++ = y[pic->linesize[0] + 2 * j    ];
+            *dst++ = y[pic->linesize[0] + 2 * j + 1];
+        }
+        y += 2 * pic->linesize[0];
+        u +=     pic->linesize[1];
+        v +=     pic->linesize[2];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int yuv4_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_yuv4_encoder = {
+    .name         = "yuv4",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_YUV4,
+    .init         = yuv4_encode_init,
+    .encode2      = yuv4_encode_frame,
+    .close        = yuv4_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/zerocodec.c b/libavcodec/zerocodec.c
index 4448e85..e67eee4 100644
--- a/libavcodec/zerocodec.c
+++ b/libavcodec/zerocodec.c
@@ -59,10 +59,8 @@ static int zerocodec_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (ff_get_buffer(avctx, pic, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
-    }
+    if ((ret = ff_get_buffer(avctx, pic, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
 
     zstream->next_in  = avpkt->data;
     zstream->avail_in = avpkt->size;
diff --git a/libavcodec/zmbv.c b/libavcodec/zmbv.c
index f945aea..e07009d 100644
--- a/libavcodec/zmbv.c
+++ b/libavcodec/zmbv.c
@@ -2,20 +2,20 @@
  * Zip Motion Blocks Video (ZMBV) decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -56,6 +57,7 @@ typedef struct ZmbvContext {
     AVCodecContext *avctx;
 
     int bpp;
+    int alloc_bpp;
     unsigned int decomp_size;
     uint8_t* decomp_buf;
     uint8_t pal[768];
@@ -64,6 +66,7 @@ typedef struct ZmbvContext {
     int fmt;
     int comp;
     int flags;
+    int stride;
     int bw, bh, bx, by;
     int decomp_len;
     z_stream zstream;
@@ -143,7 +146,7 @@ static int zmbv_decode_xor_8(ZmbvContext *c)
         prev += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -217,7 +220,7 @@ static int zmbv_decode_xor_16(ZmbvContext *c)
         prev += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -375,7 +378,7 @@ static int zmbv_decode_xor_32(ZmbvContext *c)
         prev   += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -406,17 +409,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int zret = Z_OK; // Zlib return code
     int len = buf_size;
     int hi_ver, lo_ver, ret;
-    uint8_t *tmp;
-
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
+    int expected_size;
 
     /* parse header */
+    if (len < 1)
+        return AVERROR_INVALIDDATA;
     c->flags = buf[0];
     buf++; len--;
     if (c->flags & ZMBV_KEYFRAME) {
+        void *decode_intra = NULL;
+        c->decode_intra= NULL;
+
+        if (len < 6)
+            return AVERROR_INVALIDDATA;
         hi_ver = buf[0];
         lo_ver = buf[1];
         c->comp = buf[2];
@@ -447,29 +452,39 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         switch (c->fmt) {
         case ZMBV_FMT_8BPP:
             c->bpp = 8;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_8;
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            c->stride = c->width;
             break;
         case ZMBV_FMT_15BPP:
         case ZMBV_FMT_16BPP:
             c->bpp = 16;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_16;
+            if (c->fmt == ZMBV_FMT_15BPP)
+                avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_RGB565LE;
+            c->stride = c->width * 2;
             break;
 #ifdef ZMBV_ENABLE_24BPP
         case ZMBV_FMT_24BPP:
             c->bpp = 24;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_24;
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            c->stride = c->width * 3;
             break;
 #endif //ZMBV_ENABLE_24BPP
         case ZMBV_FMT_32BPP:
             c->bpp = 32;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_32;
+            avctx->pix_fmt = AV_PIX_FMT_BGR0;
+            c->stride = c->width * 4;
             break;
         default:
-            c->decode_intra = NULL;
             c->decode_xor = NULL;
             avpriv_request_sample(avctx, "Format %i", c->fmt);
             return AVERROR_PATCHWELCOME;
@@ -481,17 +496,29 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
             return AVERROR_UNKNOWN;
         }
 
-        tmp = av_realloc(c->cur,  avctx->width * avctx->height * (c->bpp / 8));
-        if (!tmp)
-            return AVERROR(ENOMEM);
-        c->cur = tmp;
-        tmp = av_realloc(c->prev, avctx->width * avctx->height * (c->bpp / 8));
-        if (!tmp)
+        if (c->alloc_bpp < c->bpp) {
+            c->cur  = av_realloc_f(c->cur, avctx->width * avctx->height,  (c->bpp / 8));
+            c->prev = av_realloc_f(c->prev, avctx->width * avctx->height,  (c->bpp / 8));
+            c->alloc_bpp = c->bpp;
+        }
+        c->bx = (c->width + c->bw - 1) / c->bw;
+        c->by = (c->height+ c->bh - 1) / c->bh;
+        if (!c->cur || !c->prev) {
+            c->alloc_bpp = 0;
             return AVERROR(ENOMEM);
-        c->prev = tmp;
-        c->bx   = (c->width  + c->bw - 1) / c->bw;
-        c->by   = (c->height + c->bh - 1) / c->bh;
+        }
+        memset(c->cur, 0, avctx->width * avctx->height * (c->bpp / 8));
+        memset(c->prev, 0, avctx->width * avctx->height * (c->bpp / 8));
+        c->decode_intra= decode_intra;
     }
+    if (c->flags & ZMBV_KEYFRAME) {
+        expected_size = avctx->width * avctx->height * (c->bpp / 8);
+    } else {
+        expected_size = (c->bx * c->by * 2 + 3) & ~3;
+    }
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8 &&
+        (c->flags & (ZMBV_DELTAPAL | ZMBV_KEYFRAME)))
+        expected_size += 768;
 
     if (!c->decode_intra) {
         av_log(avctx, AV_LOG_ERROR, "Error! Got no format or no keyframe!\n");
@@ -504,9 +531,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
             return AVERROR_INVALIDDATA;
         }
         memcpy(c->decomp_buf, buf, len);
+        c->decomp_len = len;
     } else { // ZLIB-compressed data
         c->zstream.total_in = c->zstream.total_out = 0;
-        c->zstream.next_in = buf;
+        c->zstream.next_in = (uint8_t*)buf;
         c->zstream.avail_in = len;
         c->zstream.next_out = c->decomp_buf;
         c->zstream.avail_out = c->decomp_size;
@@ -517,6 +545,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         }
         c->decomp_len = c->zstream.total_out;
     }
+    if (expected_size > c->decomp_len ||
+        (c->flags & ZMBV_KEYFRAME) && expected_size < c->decomp_len) {
+        av_log(avctx, AV_LOG_ERROR, "decompressed size %d is incorrect, expected %d\n", c->decomp_len, expected_size);
+        return AVERROR_INVALIDDATA;
+    }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
     if (c->flags & ZMBV_KEYFRAME) {
         frame->key_frame = 1;
         frame->pict_type = AV_PICTURE_TYPE_I;
@@ -524,6 +560,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     } else {
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
+        if (c->decomp_len < 2LL * ((c->width + c->bw - 1) / c->bw) * ((c->height + c->bh - 1) / c->bh))
+            return AVERROR_INVALIDDATA;
         if (c->decomp_len)
             c->decode_xor(c);
     }
@@ -531,64 +569,22 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     /* update frames */
     {
         uint8_t *out, *src;
-        int i, j;
+        int j;
 
         out = frame->data[0];
         src = c->cur;
         switch (c->fmt) {
         case ZMBV_FMT_8BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    out[i * 3 + 0] = c->pal[(*src) * 3 + 0];
-                    out[i * 3 + 1] = c->pal[(*src) * 3 + 1];
-                    out[i * 3 + 2] = c->pal[(*src) * 3 + 2];
-                    src++;
-                }
-                out += frame->linesize[0];
-            }
-            break;
+            for (j = 0; j < 256; j++)
+                AV_WN32(&frame->data[1][j * 4], 0xFFU << 24 | AV_RB24(&c->pal[j * 3]));
         case ZMBV_FMT_15BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint16_t tmp = AV_RL16(src);
-                    src += 2;
-                    out[i * 3 + 0] = (tmp & 0x7C00) >> 7;
-                    out[i * 3 + 1] = (tmp & 0x03E0) >> 2;
-                    out[i * 3 + 2] = (tmp & 0x001F) << 3;
-                }
-                out += frame->linesize[0];
-            }
-            break;
         case ZMBV_FMT_16BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint16_t tmp = AV_RL16(src);
-                    src += 2;
-                    out[i * 3 + 0] = (tmp & 0xF800) >> 8;
-                    out[i * 3 + 1] = (tmp & 0x07E0) >> 3;
-                    out[i * 3 + 2] = (tmp & 0x001F) << 3;
-                }
-                out += frame->linesize[0];
-            }
-            break;
 #ifdef ZMBV_ENABLE_24BPP
         case ZMBV_FMT_24BPP:
-            for (j = 0; j < c->height; j++) {
-                memcpy(out, src, c->width * 3);
-                src += c->width * 3;
-                out += frame->linesize[0];
-            }
-            break;
-#endif //ZMBV_ENABLE_24BPP
+#endif
         case ZMBV_FMT_32BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint32_t tmp = AV_RL32(src);
-                    src += 4;
-                    AV_WB24(out+(i*3), tmp);
-                }
-                out += frame->linesize[0];
-            }
+            av_image_copy_plane(out, frame->linesize[0], src, c->stride,
+                                c->stride, c->height);
             break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Cannot handle format %i\n", c->fmt);
@@ -616,16 +612,19 @@ static av_cold int decode_init(AVCodecContext *avctx)
     // Needed if zlib unused or init aborted before inflateInit
     memset(&c->zstream, 0, sizeof(z_stream));
 
-    avctx->pix_fmt = AV_PIX_FMT_RGB24;
+    if ((avctx->width + 255ULL) * (avctx->height + 64ULL) > FFMIN(avctx->max_pixels, INT_MAX / 4) ) {
+        av_log(avctx, AV_LOG_ERROR, "Internal buffer (decomp_size) larger than max_pixels or too large\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     c->decomp_size = (avctx->width + 255) * 4 * (avctx->height + 64);
 
     /* Allocate decompression buffer */
-    if (c->decomp_size) {
-        if (!(c->decomp_buf = av_malloc(c->decomp_size))) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Can't allocate decompression buffer.\n");
-            return AVERROR(ENOMEM);
-        }
+    c->decomp_buf = av_mallocz(c->decomp_size);
+    if (!c->decomp_buf) {
+        av_log(avctx, AV_LOG_ERROR,
+                "Can't allocate decompression buffer.\n");
+        return AVERROR(ENOMEM);
     }
 
     c->zstream.zalloc = Z_NULL;
@@ -663,4 +662,5 @@ AVCodec ff_zmbv_decoder = {
     .close          = decode_end,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
diff --git a/libavcodec/zmbvenc.c b/libavcodec/zmbvenc.c
index e7b39f4..98029de 100644
--- a/libavcodec/zmbvenc.c
+++ b/libavcodec/zmbvenc.c
@@ -2,20 +2,20 @@
  * Zip Motion Blocks Video (ZMBV) encoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,34 +34,53 @@
 
 #include <zlib.h>
 
+/* Frame header flags */
 #define ZMBV_KEYFRAME 1
 #define ZMBV_DELTAPAL 2
 
+/* Motion block width/height (maximum allowed value is 255)
+ * Note: histogram datatype in block_cmp() must be big enough to hold values
+ * up to (4 * ZMBV_BLOCK * ZMBV_BLOCK)
+ */
 #define ZMBV_BLOCK 16
 
+/* Keyframe header format values */
+enum ZmbvFormat {
+    ZMBV_FMT_NONE  = 0,
+    ZMBV_FMT_1BPP  = 1,
+    ZMBV_FMT_2BPP  = 2,
+    ZMBV_FMT_4BPP  = 3,
+    ZMBV_FMT_8BPP  = 4,
+    ZMBV_FMT_15BPP = 5,
+    ZMBV_FMT_16BPP = 6,
+    ZMBV_FMT_24BPP = 7,
+    ZMBV_FMT_32BPP = 8
+};
+
 /**
  * Encoder context
  */
 typedef struct ZmbvEncContext {
     AVCodecContext *avctx;
 
-    int range;
+    int lrange, urange;
     uint8_t *comp_buf, *work_buf;
     uint8_t pal[768];
     uint32_t pal2[256]; //for quick comparisons
-    uint8_t *prev;
+    uint8_t *prev, *prev_buf;
     int pstride;
     int comp_size;
     int keyint, curfrm;
+    int bypp;
+    enum ZmbvFormat fmt;
     z_stream zstream;
 
-    int score_tab[256];
+    int score_tab[ZMBV_BLOCK * ZMBV_BLOCK * 4 + 1];
 } ZmbvEncContext;
 
 
 /** Block comparing function
  * XXX should be optimized and moved to DSPContext
- * TODO handle out of edge ME
  */
 static inline int block_cmp(ZmbvEncContext *c, uint8_t *src, int stride,
                             uint8_t *src2, int stride2, int bw, int bh,
@@ -69,20 +88,27 @@ static inline int block_cmp(ZmbvEncContext *c, uint8_t *src, int stride,
 {
     int sum = 0;
     int i, j;
-    uint8_t histogram[256] = {0};
+    uint16_t histogram[256] = {0};
+    int bw_bytes = bw * c->bypp;
 
-    *xored = 0;
+    /* Build frequency histogram of byte values for src[] ^ src2[] */
     for(j = 0; j < bh; j++){
-        for(i = 0; i < bw; i++){
+        for(i = 0; i < bw_bytes; i++){
             int t = src[i] ^ src2[i];
             histogram[t]++;
-            *xored |= t;
         }
         src += stride;
         src2 += stride2;
     }
 
-    for(i = 1; i < 256; i++)
+    /* If not all the xored values were 0, then the blocks are different */
+    *xored = (histogram[0] < bw_bytes * bh);
+
+    /* Exit early if blocks are equal */
+    if (!*xored) return 0;
+
+    /* Sum the entropy of all values */
+    for(i = 0; i < 256; i++)
         sum += c->score_tab[histogram[i]];
 
     return sum;
@@ -94,23 +120,42 @@ static inline int block_cmp(ZmbvEncContext *c, uint8_t *src, int stride,
 static int zmbv_me(ZmbvEncContext *c, uint8_t *src, int sstride, uint8_t *prev,
                    int pstride, int x, int y, int *mx, int *my, int *xored)
 {
-    int dx, dy, tx, ty, tv, bv, bw, bh;
+    int dx, dy, txored, tv, bv, bw, bh;
+    int mx0, my0;
 
-    *mx = *my = 0;
+    mx0 = *mx;
+    my0 = *my;
     bw = FFMIN(ZMBV_BLOCK, c->avctx->width - x);
     bh = FFMIN(ZMBV_BLOCK, c->avctx->height - y);
+
+    /* Try (0,0) */
     bv = block_cmp(c, src, sstride, prev, pstride, bw, bh, xored);
+    *mx = *my = 0;
     if(!bv) return 0;
-    for(ty = FFMAX(y - c->range, 0); ty < FFMIN(y + c->range, c->avctx->height - bh); ty++){
-        for(tx = FFMAX(x - c->range, 0); tx < FFMIN(x + c->range, c->avctx->width - bw); tx++){
-            if(tx == x && ty == y) continue; // we already tested this block
-            dx = tx - x;
-            dy = ty - y;
-            tv = block_cmp(c, src, sstride, prev + dx + dy * pstride, pstride, bw, bh, xored);
+
+    /* Try previous block's MV (if not 0,0) */
+    if (mx0 || my0){
+        tv = block_cmp(c, src, sstride, prev + mx0 * c->bypp + my0 * pstride, pstride, bw, bh, &txored);
+        if(tv < bv){
+            bv = tv;
+            *mx = mx0;
+            *my = my0;
+            *xored = txored;
+            if(!bv) return 0;
+        }
+    }
+
+    /* Try other MVs from top-to-bottom, left-to-right */
+    for(dy = -c->lrange; dy <= c->urange; dy++){
+        for(dx = -c->lrange; dx <= c->urange; dx++){
+            if(!dx && !dy) continue; // we already tested this block
+            if(dx == mx0 && dy == my0) continue; // this one too
+            tv = block_cmp(c, src, sstride, prev + dx * c->bypp + dy * pstride, pstride, bw, bh, &txored);
             if(tv < bv){
                  bv = tv;
                  *mx = dx;
                  *my = dy;
+                 *xored = txored;
                  if(!bv) return 0;
              }
          }
@@ -141,9 +186,10 @@ FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->key_frame = keyframe;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-    chpal = !keyframe && memcmp(p->data[1], c->pal2, 1024);
 
-    palptr = (uint32_t*)p->data[1];
+    palptr = (avctx->pix_fmt == AV_PIX_FMT_PAL8) ? (uint32_t *)p->data[1] : NULL;
+    chpal = !keyframe && palptr && memcmp(palptr, c->pal2, 1024);
+
     src = p->data[0];
     prev = c->prev;
     if(chpal){
@@ -157,25 +203,27 @@ FF_ENABLE_DEPRECATION_WARNINGS
             c->pal[i * 3 + 1] = tpal[1];
             c->pal[i * 3 + 2] = tpal[2];
         }
-        memcpy(c->pal2, p->data[1], 1024);
+        memcpy(c->pal2, palptr, 1024);
     }
     if(keyframe){
-        for(i = 0; i < 256; i++){
-            AV_WB24(c->pal+(i*3), palptr[i]);
+        if (palptr){
+            for(i = 0; i < 256; i++){
+                AV_WB24(c->pal+(i*3), palptr[i]);
+            }
+            memcpy(c->work_buf, c->pal, 768);
+            memcpy(c->pal2, palptr, 1024);
+            work_size = 768;
         }
-        memcpy(c->work_buf, c->pal, 768);
-        memcpy(c->pal2, p->data[1], 1024);
-        work_size = 768;
         for(i = 0; i < avctx->height; i++){
-            memcpy(c->work_buf + work_size, src, avctx->width);
+            memcpy(c->work_buf + work_size, src, avctx->width * c->bypp);
             src += p->linesize[0];
-            work_size += avctx->width;
+            work_size += avctx->width * c->bypp;
         }
     }else{
         int x, y, bh2, bw2, xored;
         uint8_t *tsrc, *tprev;
         uint8_t *mv;
-        int mx, my;
+        int mx = 0, my = 0;
 
         bw = (avctx->width + ZMBV_BLOCK - 1) / ZMBV_BLOCK;
         bh = (avctx->height + ZMBV_BLOCK - 1) / ZMBV_BLOCK;
@@ -188,16 +236,16 @@ FF_ENABLE_DEPRECATION_WARNINGS
             for(x = 0; x < avctx->width; x += ZMBV_BLOCK, mv += 2) {
                 bw2 = FFMIN(avctx->width - x, ZMBV_BLOCK);
 
-                tsrc = src + x;
-                tprev = prev + x;
+                tsrc = src + x * c->bypp;
+                tprev = prev + x * c->bypp;
 
                 zmbv_me(c, tsrc, p->linesize[0], tprev, c->pstride, x, y, &mx, &my, &xored);
                 mv[0] = (mx << 1) | !!xored;
                 mv[1] = my << 1;
-                tprev += mx + my * c->pstride;
+                tprev += mx * c->bypp + my * c->pstride;
                 if(xored){
                     for(j = 0; j < bh2; j++){
-                        for(i = 0; i < bw2; i++)
+                        for(i = 0; i < bw2 * c->bypp; i++)
                             c->work_buf[work_size++] = tsrc[i] ^ tprev[i];
                         tsrc += p->linesize[0];
                         tprev += c->pstride;
@@ -212,7 +260,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     src = p->data[0];
     prev = c->prev;
     for(i = 0; i < avctx->height; i++){
-        memcpy(prev, src, avctx->width);
+        memcpy(prev, src, avctx->width * c->bypp);
         prev += c->pstride;
         src += p->linesize[0];
     }
@@ -233,10 +281,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     pkt_size = c->zstream.total_out + 1 + 6*keyframe;
-    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting packet of size %d.\n", pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
     fl = (keyframe ? ZMBV_KEYFRAME : 0) | (chpal ? ZMBV_DELTAPAL : 0);
@@ -245,7 +291,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         *buf++ = 0; // hi ver
         *buf++ = 1; // lo ver
         *buf++ = 1; // comp
-        *buf++ = 4; // format - 8bpp
+        *buf++ = c->fmt; // format
         *buf++ = ZMBV_BLOCK; // block width
         *buf++ = ZMBV_BLOCK; // block height
     }
@@ -265,7 +311,7 @@ static av_cold int encode_end(AVCodecContext *avctx)
     av_freep(&c->work_buf);
 
     deflateEnd(&c->zstream);
-    av_freep(&c->prev);
+    av_freep(&c->prev_buf);
 
     return 0;
 }
@@ -279,17 +325,48 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int zret; // Zlib return code
     int i;
     int lvl = 9;
+    int prev_size, prev_offset;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_PAL8:
+        c->fmt = ZMBV_FMT_8BPP;
+        c->bypp = 1;
+        break;
+    case AV_PIX_FMT_RGB555LE:
+        c->fmt = ZMBV_FMT_15BPP;
+        c->bypp = 2;
+        break;
+    case AV_PIX_FMT_RGB565LE:
+        c->fmt = ZMBV_FMT_16BPP;
+        c->bypp = 2;
+        break;
+    case AV_PIX_FMT_BGR0:
+        c->fmt = ZMBV_FMT_32BPP;
+        c->bypp = 4;
+        break;
+    default:
+        av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
+        return AVERROR(EINVAL);
+    }
 
-    for(i=1; i<256; i++)
-        c->score_tab[i] = -i * log(i / (double)(ZMBV_BLOCK * ZMBV_BLOCK)) * (256 / M_LN2);
+    /* Entropy-based score tables for comparing blocks.
+     * Suitable for blocks up to (ZMBV_BLOCK * ZMBV_BLOCK) bytes.
+     * Scores are nonnegative, lower is better.
+     */
+    for(i = 1; i <= ZMBV_BLOCK * ZMBV_BLOCK * c->bypp; i++)
+        c->score_tab[i] = -i * log2(i / (double)(ZMBV_BLOCK * ZMBV_BLOCK * c->bypp)) * 256;
 
     c->avctx = avctx;
 
     c->curfrm = 0;
     c->keyint = avctx->keyint_min;
-    c->range = 8;
-    if(avctx->me_range > 0)
-        c->range = FFMIN(avctx->me_range, 127);
+
+    /* Motion estimation range: maximum distance is -64..63 */
+    c->lrange = c->urange = 8;
+    if(avctx->me_range > 0){
+        c->lrange = FFMIN(avctx->me_range, 64);
+        c->urange = FFMIN(avctx->me_range, 63);
+    }
 
     if(avctx->compression_level >= 0)
         lvl = avctx->compression_level;
@@ -300,7 +377,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
 
     // Needed if zlib unused or init aborted before deflateInit
     memset(&c->zstream, 0, sizeof(z_stream));
-    c->comp_size = avctx->width * avctx->height + 1024 +
+    c->comp_size = avctx->width * c->bypp * avctx->height + 1024 +
         ((avctx->width + ZMBV_BLOCK - 1) / ZMBV_BLOCK) * ((avctx->height + ZMBV_BLOCK - 1) / ZMBV_BLOCK) * 2 + 4;
     if (!(c->work_buf = av_malloc(c->comp_size))) {
         av_log(avctx, AV_LOG_ERROR, "Can't allocate work buffer.\n");
@@ -315,11 +392,23 @@ static av_cold int encode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Can't allocate compression buffer.\n");
         return AVERROR(ENOMEM);
     }
-    c->pstride = FFALIGN(avctx->width, 16);
-    if (!(c->prev = av_malloc(c->pstride * avctx->height))) {
+
+    /* Allocate prev buffer - pad around the image to allow out-of-edge ME:
+     * - The image should be padded with `lrange` rows before and `urange` rows
+     *   after.
+     * - The stride should be padded with `lrange` pixels, then rounded up to a
+     *   multiple of 16 bytes.
+     * - The first row should also be padded with `lrange` pixels before, then
+     *   aligned up to a multiple of 16 bytes.
+     */
+    c->pstride = FFALIGN((avctx->width + c->lrange) * c->bypp, 16);
+    prev_size = FFALIGN(c->lrange * c->bypp, 16) + c->pstride * (c->lrange + avctx->height + c->urange);
+    prev_offset = FFALIGN(c->lrange, 16) + c->pstride * c->lrange;
+    if (!(c->prev_buf = av_mallocz(prev_size))) {
         av_log(avctx, AV_LOG_ERROR, "Can't allocate picture.\n");
         return AVERROR(ENOMEM);
     }
+    c->prev = c->prev_buf + prev_offset;
 
     c->zstream.zalloc = Z_NULL;
     c->zstream.zfree = Z_NULL;
@@ -342,5 +431,9 @@ AVCodec ff_zmbv_encoder = {
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_PAL8, AV_PIX_FMT_NONE },
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_PAL8,
+                                                     AV_PIX_FMT_RGB555LE,
+                                                     AV_PIX_FMT_RGB565LE,
+                                                     AV_PIX_FMT_BGR0,
+                                                     AV_PIX_FMT_NONE },
 };